1278 files changed, 754706 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
new file mode 100644
index 000000000000..fd106a8d9b0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -0,0 +1,66 @@
+//==-- AArch64.h - Top-level interface for AArch64  --------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AArch64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64RedundantCopyEliminationPass();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+                                 CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64VectorByElementOptPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A53Fix835769();
+
+FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createAArch64CollectLOHPass();
+
+void initializeAArch64A53Fix835769Pass(PassRegistry&);
+void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
+void initializeAArch64AddressTypePromotionPass(PassRegistry&);
+void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
+void initializeAArch64CollectLOHPass(PassRegistry&);
+void initializeAArch64ConditionalComparesPass(PassRegistry&);
+void initializeAArch64ConditionOptimizerPass(PassRegistry&);
+void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
+void initializeAArch64ExpandPseudoPass(PassRegistry&);
+void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64VectorByElementOptPass(PassRegistry&);
+void initializeAArch64PromoteConstantPass(PassRegistry&);
+void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
+void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeLDTLSCleanupPass(PassRegistry&);
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
new file mode 100644
index 000000000000..c40391d5ad9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -0,0 +1,349 @@
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing.
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Subtarget features.
+//
+
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+                                       "Enable ARMv8 FP">;
+
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+  "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+  "Enable cryptographic instructions">;
+
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+  "Enable ARMv8 CRC-32 checksum instructions">;
+
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+  "Enable ARMv8 Reliability, Availability and Serviceability Extensions">;
+
+def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
+  "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+  "Enable ARMv8 PMUv3 Performance Monitors extension">;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+  "Full FP16", [FeatureFPARMv8]>;
+
+def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
+  "Enable Statistical Profiling extension">;
+
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+                                        "Has zero-cycle register moves">;
+
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+                                          "StrictAlign", "true",
+                                          "Disallow all unaligned memory "
+                                          "access">;
+
+def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
+                                         "Reserve X18, making it unavailable "
+                                         "as a GPR">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureArithmeticBccFusion : SubtargetFeature<
+    "arith-bcc-fusion", "HasArithmeticBccFusion", "true",
+    "CPU fuses arithmetic+bcc operations">;
+
+def FeatureArithmeticCbzFusion : SubtargetFeature<
+    "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
+    "CPU fuses arithmetic + cbz/cbnz operations">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reciprocal-square-root", "UseRSqrt", "true",
+    "Use the reciprocal square root approximation">;
+
+//===----------------------------------------------------------------------===//
+// Architectures.
+//
+
+def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
+  "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+  "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AArch64RegisterInfo.td"
+include "AArch64CallingConvention.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AArch64Schedule.td"
+include "AArch64InstrInfo.td"
+
+def AArch64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Named operands for MRS/MSR/TLBI/...
+//===----------------------------------------------------------------------===//
+
+include "AArch64SystemOperands.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Processors supported.
+//
+include "AArch64SchedA53.td"
+include "AArch64SchedA57.td"
+include "AArch64SchedCyclone.td"
+include "AArch64SchedFalkor.td"
+include "AArch64SchedKryo.td"
+include "AArch64SchedM1.td"
+include "AArch64SchedVulcan.td"
+
+def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+                                   "Cortex-A35 ARM processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon
+                                   ]>;
+
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors", [
+                                   FeatureBalanceFPOps,
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeatureUseAA
+                                   ]>;
+
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors", [
+                                   FeatureBalanceFPOps,
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeaturePredictableSelectIsExpensive
+                                   ]>;
+
+def ProcA72     : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+                                   "Cortex-A72 ARM processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon
+                                   ]>;
+
+def ProcA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+                                   "Cortex-A73 ARM processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon
+                                   ]>;
+
+def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
+                                   "Cyclone", [
+                                   FeatureAlternateSExtLoadCVTF32Pattern,
+                                   FeatureCrypto,
+                                   FeatureDisableLatencySchedHeuristic,
+                                   FeatureFPARMv8,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureArithmeticCbzFusion,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeatureSlowMisaligned128Store,
+                                   FeatureZCRegMove,
+                                   FeatureZCZeroing
+                                   ]>;
+
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+                                    "Samsung Exynos-M1 processors",
+                                    [FeatureAvoidQuadLdStPairs,
+                                     FeatureCRC,
+                                     FeatureCrypto,
+                                     FeatureCustomCheapAsMoveHandling,
+                                     FeatureFPARMv8,
+                                     FeatureNEON,
+                                     FeaturePerfMon,
+                                     FeaturePostRAScheduler,
+                                     FeatureSlowMisaligned128Store,
+                                     FeatureUseRSqrt,
+                                     FeatureZCZeroing]>;
+
+def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
+                                    "Samsung Exynos-M2/M3 processors",
+                                    [FeatureAvoidQuadLdStPairs,
+                                     FeatureCRC,
+                                     FeatureCrypto,
+                                     FeatureCustomCheapAsMoveHandling,
+                                     FeatureFPARMv8,
+                                     FeatureNEON,
+                                     FeaturePerfMon,
+                                     FeaturePostRAScheduler,
+                                     FeatureSlowMisaligned128Store,
+                                     FeatureZCZeroing]>;
+
+def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+                                   "Qualcomm Kryo processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeaturePredictableSelectIsExpensive,
+                                   FeatureZCZeroing
+                                   ]>;
+
+def ProcFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
+                                   "Qualcomm Falkor processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon
+                                   ]>;
+
+def ProcVulcan  : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
+                                   "Broadcom Vulcan processors", [
+                                   FeatureCRC,
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureNEON,
+                                   FeaturePostRAScheduler,
+                                   FeaturePredictableSelectIsExpensive,
+                                   HasV8_1aOps]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [
+                     FeatureCRC,
+                     FeatureFPARMv8,
+                     FeatureNEON,
+                     FeaturePerfMon,
+                     FeaturePostRAScheduler
+                     ]>;
+
+// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
+def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
+// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57.
+def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
+def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
+def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
+def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
+def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
+def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
+def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def GenericAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string Name = "generic";
+  string BreakCharacters = ".";
+}
+
+def AppleAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+  string Name = "apple-neon";
+  string BreakCharacters = ".";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// AArch64 Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def GenericAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int PassSubtarget = 1;
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+
+def AppleAsmWriter : AsmWriter {
+  let AsmWriterClassName = "AppleInstPrinter";
+  int PassSubtarget = 1;
+  int Variant = 1;
+  int isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def AArch64 : Target {
+  let InstructionSet = AArch64InstrInfo;
+  let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+  let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
new file mode 100644
index 000000000000..e6afb42440a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -0,0 +1,244 @@
+//===-- AArch64A53Fix835769.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass changes code to work around Cortex-A53 erratum 835769.
+// It works around it by inserting a nop instruction in code sequences that
+// in some circumstances may trigger the erratum.
+// It inserts a nop instruction between a sequence of the following 2 classes
+// of instructions:
+// instr 1: mem-instr (including loads, stores and prefetches).
+// instr 2: non-SIMD integer multiply-accumulate writing 64-bit X registers.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a match for the instruction that comes first in the
+// sequence of instructions that can trigger the erratum?
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+  // Must return true if this instruction is a load, a store or a prefetch.
+  switch (MI->getOpcode()) {
+  case AArch64::PRFMl:
+  case AArch64::PRFMroW:
+  case AArch64::PRFMroX:
+  case AArch64::PRFMui:
+  case AArch64::PRFUMi:
+    return true;
+  default:
+    return MI->mayLoadOrStore();
+  }
+}
+
+// Is the instruction a match for the instruction that comes second in the
+// sequence that can trigger the erratum?
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+  // Must return true for non-SIMD integer multiply-accumulates, writing
+  // to a 64-bit register.
+  switch (MI->getOpcode()) {
+  // Erratum cannot be triggered when the destination register is 32 bits,
+  // therefore only include the following.
+  case AArch64::MSUBXrrr:
+  case AArch64::MADDXrrr:
+  case AArch64::SMADDLrrr:
+  case AArch64::SMSUBLrrr:
+  case AArch64::UMADDLrrr:
+  case AArch64::UMSUBLrrr:
+    // Erratum can only be triggered by multiply-adds, not by regular
+    // non-accumulating multiplies, i.e. when Ra=XZR='11111'
+    return MI->getOperand(3).getReg() != AArch64::XZR;
+  default:
+    return false;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64A53Fix835769 : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+
+public:
+  static char ID;
+  explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {
+    initializeAArch64A53Fix835769Pass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override {
+    return "Workaround A53 erratum 835769 pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+char AArch64A53Fix835769::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass",
+                "AArch64 fix for A53 erratum 835769", false, false)
+
+//===----------------------------------------------------------------------===//
+
+bool
+AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
+  DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+  bool Changed = false;
+  TII = F.getSubtarget().getInstrInfo();
+
+  for (auto &MBB : F) {
+    Changed |= runOnBasicBlock(MBB);
+  }
+  return Changed;
+}
+
+// Return the block that was fallen through to get to MBB, if any,
+// otherwise nullptr.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
+                                             const TargetInstrInfo *TII) {
+  // Get the previous machine basic block in the function.
+  MachineFunction::iterator MBBI(MBB);
+
+  // Can't go off top of function.
+  if (MBBI == MBB->getParent()->begin())
+    return nullptr;
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 2> Cond;
+
+  MachineBasicBlock *PrevBB = &*std::prev(MBBI);
+  for (MachineBasicBlock *S : MBB->predecessors())
+    if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB &&
+        !FBB)
+      return S;
+
+  return nullptr;
+}
+
+// Iterate through fallen through blocks trying to find a previous non-pseudo if
+// there is one, otherwise return nullptr. Only look for instructions in
+// previous blocks, not the current block, since we only use this to look at
+// previous blocks.
+static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
+                                      const TargetInstrInfo *TII) {
+  MachineBasicBlock *FMBB = &MBB;
+
+  // If there is no non-pseudo in the current block, loop back around and try
+  // the previous block (if there is one).
+  while ((FMBB = getBBFallenThrough(FMBB, TII))) {
+    for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend()))
+      if (!I.isPseudo())
+        return &I;
+  }
+
+  // There was no previous non-pseudo in the fallen through blocks
+  return nullptr;
+}
+
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+                                       const TargetInstrInfo *TII) {
+  // If we are the first instruction of the block, put the NOP at the end of
+  // the previous fallthrough block
+  if (MI == &MBB.front()) {
+    MachineInstr *I = getLastNonPseudo(MBB, TII);
+    assert(I && "Expected instruction");
+    DebugLoc DL = I->getDebugLoc();
+    BuildMI(I->getParent(), DL, TII->get(AArch64::HINT)).addImm(0);
+  }
+  else {
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0);
+  }
+
+  ++NumNopsAdded;
+}
+
+bool
+AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // First, scan the basic block, looking for a sequence of 2 instructions
+  // that match the conditions under which the erratum may trigger.
+
+  // List of terminating instructions in matching sequences
+  std::vector<MachineInstr*> Sequences;
+  unsigned Idx = 0;
+  MachineInstr *PrevInstr = nullptr;
+
+  // Try and find the last non-pseudo instruction in any fallen through blocks,
+  // if there isn't one, then we use nullptr to represent that.
+  PrevInstr = getLastNonPseudo(MBB, TII);
+
+  for (auto &MI : MBB) {
+    MachineInstr *CurrInstr = &MI;
+    DEBUG(dbgs() << "  Examining: " << MI);
+    if (PrevInstr) {
+      DEBUG(dbgs() << "    PrevInstr: " << *PrevInstr
+                   << "    CurrInstr: " << *CurrInstr
+                   << "    isFirstInstructionInSequence(PrevInstr): "
+                   << isFirstInstructionInSequence(PrevInstr) << "\n"
+                   << "    isSecondInstructionInSequence(CurrInstr): "
+                   << isSecondInstructionInSequence(CurrInstr) << "\n");
+      if (isFirstInstructionInSequence(PrevInstr) &&
+          isSecondInstructionInSequence(CurrInstr)) {
+        DEBUG(dbgs() << "   ** pattern found at Idx " << Idx << "!\n");
+        Sequences.push_back(CurrInstr);
+      }
+    }
+    if (!CurrInstr->isPseudo())
+      PrevInstr = CurrInstr;
+    ++Idx;
+  }
+
+  DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+               << " occurrences of pattern found.\n");
+
+  // Then update the basic block, inserting nops between the detected sequences.
+  for (auto &MI : Sequences) {
+    Changed = true;
+    insertNopBeforeInstruction(MBB, MI, TII);
+  }
+
+  return Changed;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the passmanager.
+FunctionPass *llvm::createAArch64A53Fix835769() {
+  return new AArch64A53Fix835769();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
new file mode 100644
index 000000000000..0aa597bcdc56
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -0,0 +1,733 @@
+//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For best-case performance on Cortex-A57, we should try to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply or
+// multiply-accumulate operations.
+//
+// This pass attempts to detect situations where the register allocation may
+// adversely affect this load balancing and to change the registers used so as
+// to better utilize the CPU.
+//
+// Ideally we'd just take each multiply or multiply-accumulate in turn and
+// allocate it alternating even or odd registers. However, multiply-accumulates
+// are most efficiently performed in the same functional unit as their
+// accumulation operand. Therefore this pass tries to find maximal sequences
+// ("Chains") of multiply-accumulates linked via their accumulation operand,
+// and assign them all the same "color" (oddness/evenness).
+//
+// This optimization affects S-register and D-register floating point
+// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
+// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
+// not affected.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
+
+// Enforce the algorithm to use the scavenged register even when the original
+// destination register is the correct color. Used for testing.
+static cl::opt<bool>
+TransformAll("aarch64-a57-fp-load-balancing-force-all",
+             cl::desc("Always modify dest registers regardless of color"),
+             cl::init(false), cl::Hidden);
+
+// Never use the balance information obtained from chains - return a specific
+// color always. Used for testing.
+static cl::opt<unsigned>
+OverrideBalance("aarch64-a57-fp-load-balancing-override",
+              cl::desc("Ignore balance information, always return "
+                       "(1: Even, 2: Odd)."),
+              cl::init(0), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
+static bool isMul(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AArch64::FMULSrr:
+  case AArch64::FNMULSrr:
+  case AArch64::FMULDrr:
+  case AArch64::FNMULDrr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
+static bool isMla(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AArch64::FMSUBSrrr:
+  case AArch64::FMADDSrrr:
+  case AArch64::FNMSUBSrrr:
+  case AArch64::FNMADDSrrr:
+  case AArch64::FMSUBDrrr:
+  case AArch64::FMADDDrrr:
+  case AArch64::FNMSUBDrrr:
+  case AArch64::FNMADDDrrr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A "color", which is either even or odd. Yes, these aren't really colors
+/// but the algorithm is conceptually doing two-color graph coloring.
+enum class Color { Even, Odd };
+#ifndef NDEBUG
+static const char *ColorNames[2] = { "Even", "Odd" };
+#endif
+
+class Chain;
+
+class AArch64A57FPLoadBalancing : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+  RegisterClassInfo RCI;
+
+public:
+  static char ID;
+  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
+    initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override {
+    return "A57 FP Anti-dependency breaker";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool runOnBasicBlock(MachineBasicBlock &MBB);
+  bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
+                     int &Balance);
+  bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
+  int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
+  void scanInstruction(MachineInstr *MI, unsigned Idx,
+                       std::map<unsigned, Chain*> &Active,
+                       std::vector<std::unique_ptr<Chain>> &AllChains);
+  void maybeKillChain(MachineOperand &MO, unsigned Idx,
+                      std::map<unsigned, Chain*> &RegChains);
+  Color getColor(unsigned Register);
+  Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
+};
+}
+
+char AArch64A57FPLoadBalancing::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+                      "AArch64 A57 FP Load-Balancing", false, false)
+INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
+                    "AArch64 A57 FP Load-Balancing", false, false)
+
+namespace {
+/// A Chain is a sequence of instructions that are linked together by
+/// an accumulation operand. For example:
+///
+///   fmul d0<def>, ?
+///   fmla d1<def>, ?, ?, d0<kill>
+///   fmla d2<def>, ?, ?, d1<kill>
+///
+/// There may be other instructions interleaved in the sequence that
+/// do not belong to the chain. These other instructions must not use
+/// the "chain" register at any point.
+///
+/// We currently only support chains where the "chain" operand is killed
+/// at each link in the chain for simplicity.
+/// A chain has three important instructions - Start, Last and Kill.
+///   * The start instruction is the first instruction in the chain.
+///   * Last is the final instruction in the chain.
+///   * Kill may or may not be defined. If defined, Kill is the instruction
+///     where the outgoing value of the Last instruction is killed.
+///     This information is important as if we know the outgoing value is
+///     killed with no intervening uses, we can safely change its register.
+///
+/// Without a kill instruction, we must assume the outgoing value escapes
+/// beyond our model and either must not change its register or must
+/// create a fixup FMOV to keep the old register value consistent.
+///
+class Chain {
+public:
+  /// The important (marker) instructions.
+  MachineInstr *StartInst, *LastInst, *KillInst;
+  /// The index, from the start of the basic block, that each marker
+  /// appears. These are stored so we can do quick interval tests.
+  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
+  /// All instructions in the chain.
+  std::set<MachineInstr*> Insts;
+  /// True if KillInst cannot be modified. If this is true,
+  /// we cannot change LastInst's outgoing register.
+  /// This will be true for tied values and regmasks.
+  bool KillIsImmutable;
+  /// The "color" of LastInst. This will be the preferred chain color,
+  /// as changing intermediate nodes is easy but changing the last
+  /// instruction can be more tricky.
+  Color LastColor;
+
+  Chain(MachineInstr *MI, unsigned Idx, Color C)
+      : StartInst(MI), LastInst(MI), KillInst(nullptr),
+        StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
+        LastColor(C) {
+    Insts.insert(MI);
+  }
+
+  /// Add a new instruction into the chain. The instruction's dest operand
+  /// has the given color.
+  void add(MachineInstr *MI, unsigned Idx, Color C) {
+    LastInst = MI;
+    LastInstIdx = Idx;
+    LastColor = C;
+    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+           "Chain: broken invariant. A Chain can only be killed after its last "
+           "def");
+
+    Insts.insert(MI);
+  }
+
+  /// Return true if MI is a member of the chain.
+  bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; }
+
+  /// Return the number of instructions in the chain.
+  unsigned size() const {
+    return Insts.size();
+  }
+
+  /// Inform the chain that its last active register (the dest register of
+  /// LastInst) is killed by MI with no intervening uses or defs.
+  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
+    KillInst = MI;
+    KillInstIdx = Idx;
+    KillIsImmutable = Immutable;
+    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+           "Chain: broken invariant. A Chain can only be killed after its last "
+           "def");
+  }
+
+  /// Return the first instruction in the chain.
+  MachineInstr *getStart() const { return StartInst; }
+  /// Return the last instruction in the chain.
+  MachineInstr *getLast() const { return LastInst; }
+  /// Return the "kill" instruction (as set with setKill()) or NULL.
+  MachineInstr *getKill() const { return KillInst; }
+  /// Return an instruction that can be used as an iterator for the end
+  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
+  MachineBasicBlock::iterator end() const {
+    return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
+  }
+  MachineBasicBlock::iterator begin() const { return getStart(); }
+
+  /// Can the Kill instruction (assuming one exists) be modified?
+  bool isKillImmutable() const { return KillIsImmutable; }
+
+  /// Return the preferred color of this chain.
+  Color getPreferredColor() {
+    if (OverrideBalance != 0)
+      return OverrideBalance == 1 ? Color::Even : Color::Odd;
+    return LastColor;
+  }
+
+  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
+  bool rangeOverlapsWith(const Chain &Other) const {
+    unsigned End = KillInst ? KillInstIdx : LastInstIdx;
+    unsigned OtherEnd = Other.KillInst ?
+      Other.KillInstIdx : Other.LastInstIdx;
+
+    return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
+  }
+
+  /// Return true if this chain starts before Other.
+  bool startsBefore(const Chain *Other) const {
+    return StartInstIdx < Other->StartInstIdx;
+  }
+
+  /// Return true if the group will require a fixup MOV at the end.
+  bool requiresFixup() const {
+    return (getKill() && isKillImmutable()) || !getKill();
+  }
+
+  /// Return a simple string representation of the chain.
+  std::string str() const {
+    std::string S;
+    raw_string_ostream OS(S);
+
+    OS << "{";
+    StartInst->print(OS, /* SkipOpers= */true);
+    OS << " -> ";
+    LastInst->print(OS, /* SkipOpers= */true);
+    if (KillInst) {
+      OS << " (kill @ ";
+      KillInst->print(OS, /* SkipOpers= */true);
+      OS << ")";
+    }
+    OS << "}";
+
+    return OS.str();
+  }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
+    return false;
+
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+
+  MRI = &F.getRegInfo();
+  TRI = F.getRegInfo().getTargetRegisterInfo();
+  RCI.runOnMachineFunction(F);
+
+  for (auto &MBB : F) {
+    Changed |= runOnBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // First, scan the basic block producing a set of chains.
+
+  // The currently "active" chains - chains that can be added to and haven't
+  // been killed yet. This is keyed by register - all chains can only have one
+  // "link" register between each inst in the chain.
+  std::map<unsigned, Chain*> ActiveChains;
+  std::vector<std::unique_ptr<Chain>> AllChains;
+  unsigned Idx = 0;
+  for (auto &MI : MBB)
+    scanInstruction(&MI, Idx++, ActiveChains, AllChains);
+
+  DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+
+  // Group the chains into disjoint sets based on their liveness range. This is
+  // a poor-man's version of graph coloring. Ideally we'd create an interference
+  // graph and perform full-on graph coloring on that, but;
+  //   (a) That's rather heavyweight for only two colors.
+  //   (b) We expect multiple disjoint interference regions - in practice the live
+  //       range of chains is quite small and they are clustered between loads
+  //       and stores.
+  EquivalenceClasses<Chain*> EC;
+  for (auto &I : AllChains)
+    EC.insert(I.get());
+
+  for (auto &I : AllChains)
+    for (auto &J : AllChains)
+      if (I != J && I->rangeOverlapsWith(*J))
+        EC.unionSets(I.get(), J.get());
+  DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+
+  // Now we assume that every member of an equivalence class interferes
+  // with every other member of that class, and with no members of other classes.
+
+  // Convert the EquivalenceClasses to a simpler set of sets.
+  std::vector<std::vector<Chain*> > V;
+  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+    std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
+    if (Cs.empty()) continue;
+    V.push_back(std::move(Cs));
+  }
+
+  // Now we have a set of sets, order them by start address so
+  // we can iterate over them sequentially.
+  std::sort(V.begin(), V.end(),
+            [](const std::vector<Chain*> &A,
+               const std::vector<Chain*> &B) {
+      return A.front()->startsBefore(B.front());
+    });
+
+  // As we only have two colors, we can track the global (BB-level) balance of
+  // odds versus evens. We aim to keep this near zero to keep both execution
+  // units fed.
+  // Positive means we're even-heavy, negative we're odd-heavy.
+  //
+  // FIXME: If chains have interdependencies, for example:
+  //   mul r0, r1, r2
+  //   mul r3, r0, r1
+  // We do not model this and may color each one differently, assuming we'll
+  // get ILP when we obviously can't. This hasn't been seen to be a problem
+  // in practice so far, so we simplify the algorithm by ignoring it.
+  int Parity = 0;
+
+  for (auto &I : V)
+    Changed |= colorChainSet(std::move(I), MBB, Parity);
+
+  return Changed;
+}
+
+Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
+                                                  std::vector<Chain*> &L) {
+  if (L.empty())
+    return nullptr;
+
+  // We try and get the best candidate from L to color next, given that our
+  // preferred color is "PreferredColor". L is ordered from larger to smaller
+  // chains. It is beneficial to color the large chains before the small chains,
+  // but if we can't find a chain of the maximum length with the preferred color,
+  // we fuzz the size and look for slightly smaller chains before giving up and
+  // returning a chain that must be recolored.
+
+  // FIXME: Does this need to be configurable?
+  const unsigned SizeFuzz = 1;
+  unsigned MinSize = L.front()->size() - SizeFuzz;
+  for (auto I = L.begin(), E = L.end(); I != E; ++I) {
+    if ((*I)->size() <= MinSize) {
+      // We've gone past the size limit. Return the previous item.
+      Chain *Ch = *--I;
+      L.erase(I);
+      return Ch;
+    }
+
+    if ((*I)->getPreferredColor() == PreferredColor) {
+      Chain *Ch = *I;
+      L.erase(I);
+      return Ch;
+    }
+  }
+
+  // Bailout case - just return the first item.
+  Chain *Ch = L.front();
+  L.erase(L.begin());
+  return Ch;
+}
+
+bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
+                                              MachineBasicBlock &MBB,
+                                              int &Parity) {
+  bool Changed = false;
+  DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+
+  // Sort by descending size order so that we allocate the most important
+  // sets first.
+  // Tie-break equivalent sizes by sorting chains requiring fixups before
+  // those without fixups. The logic here is that we should look at the
+  // chains that we cannot change before we look at those we can,
+  // so the parity counter is updated and we know what color we should
+  // change them to!
+  // Final tie-break with instruction order so pass output is stable (i.e. not
+  // dependent on malloc'd pointer values).
+  std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+      if (G1->size() != G2->size())
+        return G1->size() > G2->size();
+      if (G1->requiresFixup() != G2->requiresFixup())
+        return G1->requiresFixup() > G2->requiresFixup();
+      // Make sure startsBefore() produces a stable final order.
+      assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
+             "Starts before not total order!");
+      return G1->startsBefore(G2);
+    });
+
+  Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
+    // Start off by assuming we'll color to our own preferred color.
+    Color C = PreferredColor;
+    if (Parity == 0)
+      // But if we really don't care, use the chain's preferred color.
+      C = G->getPreferredColor();
+
+    DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
+          << ColorNames[(int)C] << "\n");
+
+    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
+    // happens infrequently and when it does it has at least a 50% chance of
+    // slowing code down instead of speeding it up.
+    if (G->requiresFixup() && C != G->getPreferredColor()) {
+      C = G->getPreferredColor();
+      DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
+            "color remains " << ColorNames[(int)C] << "\n");
+    }
+
+    Changed |= colorChain(G, C, MBB);
+
+    Parity += (C == Color::Even) ? G->size() : -G->size();
+    PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+  }
+
+  return Changed;
+}
+
+int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
+                                                MachineBasicBlock &MBB) {
+  RegScavenger RS;
+  RS.enterBasicBlock(MBB);
+  RS.forward(MachineBasicBlock::iterator(G->getStart()));
+
+  // Can we find an appropriate register that is available throughout the life
+  // of the chain?
+  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
+  BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+  for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
+    RS.forward(I);
+    AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+
+    // Remove any registers clobbered by a regmask or any def register that is
+    // immediately dead.
+    for (auto J : I->operands()) {
+      if (J.isRegMask())
+        AvailableRegs.clearBitsNotInMask(J.getRegMask());
+
+      if (J.isReg() && J.isDef()) {
+        MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
+        if (J.isDead())
+          for (; AI.isValid(); ++AI)
+            AvailableRegs.reset(*AI);
+#ifndef NDEBUG
+        else
+          for (; AI.isValid(); ++AI)
+            assert(!AvailableRegs[*AI] &&
+                   "Non-dead def should have been removed by now!");
+#endif
+      }
+    }
+  }
+
+  // Make sure we allocate in-order, to get the cheapest registers first.
+  auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
+  for (auto Reg : Ord) {
+    if (!AvailableRegs[Reg])
+      continue;
+    if (C == getColor(Reg))
+      return Reg;
+  }
+
+  return -1;
+}
+
+bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
+                                           MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+        << ColorNames[(int)C] << ")\n");
+
+  // Try and obtain a free register of the right class. Without a register
+  // to play with we cannot continue.
+  int Reg = scavengeRegister(G, C, MBB);
+  if (Reg == -1) {
+    DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+    return false;
+  }
+  DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
+
+  std::map<unsigned, unsigned> Substs;
+  for (MachineInstr &I : *G) {
+    if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable()))
+      continue;
+
+    // I is a member of G, or I is a mutable instruction that kills G.
+
+    std::vector<unsigned> ToErase;
+    for (auto &U : I.operands()) {
+      if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
+        unsigned OrigReg = U.getReg();
+        U.setReg(Substs[OrigReg]);
+        if (U.isKill())
+          // Don't erase straight away, because there may be other operands
+          // that also reference this substitution!
+          ToErase.push_back(OrigReg);
+      } else if (U.isRegMask()) {
+        for (auto J : Substs) {
+          if (U.clobbersPhysReg(J.first))
+            ToErase.push_back(J.first);
+        }
+      }
+    }
+    // Now it's safe to remove the substs identified earlier.
+    for (auto J : ToErase)
+      Substs.erase(J);
+
+    // Only change the def if this isn't the last instruction.
+    if (&I != G->getKill()) {
+      MachineOperand &MO = I.getOperand(0);
+
+      bool Change = TransformAll || getColor(MO.getReg()) != C;
+      if (G->requiresFixup() && &I == G->getLast())
+        Change = false;
+
+      if (Change) {
+        Substs[MO.getReg()] = Reg;
+        MO.setReg(Reg);
+
+        Changed = true;
+      }
+    }
+  }
+  assert(Substs.size() == 0 && "No substitutions should be left active!");
+
+  if (G->getKill()) {
+    DEBUG(dbgs() << " - Kill instruction seen.\n");
+  } else {
+    // We didn't have a kill instruction, but we didn't seem to need to change
+    // the destination register anyway.
+    DEBUG(dbgs() << " - Destination register not changed.\n");
+  }
+  return Changed;
+}
+
+void AArch64A57FPLoadBalancing::scanInstruction(
+    MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains,
+    std::vector<std::unique_ptr<Chain>> &AllChains) {
+  // Inspect "MI", updating ActiveChains and AllChains.
+
+  if (isMul(MI)) {
+
+    for (auto &I : MI->uses())
+      maybeKillChain(I, Idx, ActiveChains);
+    for (auto &I : MI->defs())
+      maybeKillChain(I, Idx, ActiveChains);
+
+    // Create a new chain. Multiplies don't require forwarding so can go on any
+    // unit.
+    unsigned DestReg = MI->getOperand(0).getReg();
+
+    DEBUG(dbgs() << "New chain started for register "
+          << TRI->getName(DestReg) << " at " << *MI);
+
+    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+    ActiveChains[DestReg] = G.get();
+    AllChains.push_back(std::move(G));
+
+  } else if (isMla(MI)) {
+
+    // It is beneficial to keep MLAs on the same functional unit as their
+    // accumulator operand.
+    unsigned DestReg  = MI->getOperand(0).getReg();
+    unsigned AccumReg = MI->getOperand(3).getReg();
+
+    maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
+    maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
+    if (DestReg != AccumReg)
+      maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
+
+    if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
+      DEBUG(dbgs() << "Chain found for accumulator register "
+            << TRI->getName(AccumReg) << " in MI " << *MI);
+
+      // For simplicity we only chain together sequences of MULs/MLAs where the
+      // accumulator register is killed on each instruction. This means we don't
+      // need to track other uses of the registers we want to rewrite.
+      //
+      // FIXME: We could extend to handle the non-kill cases for more coverage.
+      if (MI->getOperand(3).isKill()) {
+        // Add to chain.
+        DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+        ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
+        // Handle cases where the destination is not the same as the accumulator.
+        if (DestReg != AccumReg) {
+          ActiveChains[DestReg] = ActiveChains[AccumReg];
+          ActiveChains.erase(AccumReg);
+        }
+        return;
+      }
+
+      DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
+            << "marked <kill>!\n");
+      maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
+    }
+
+    DEBUG(dbgs() << "Creating new chain for dest register "
+          << TRI->getName(DestReg) << "\n");
+    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+    ActiveChains[DestReg] = G.get();
+    AllChains.push_back(std::move(G));
+
+  } else {
+
+    // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
+    // lists.
+    for (auto &I : MI->uses())
+      maybeKillChain(I, Idx, ActiveChains);
+    for (auto &I : MI->defs())
+      maybeKillChain(I, Idx, ActiveChains);
+
+  }
+}
+
+void AArch64A57FPLoadBalancing::
+maybeKillChain(MachineOperand &MO, unsigned Idx,
+               std::map<unsigned, Chain*> &ActiveChains) {
+  // Given an operand and the set of active chains (keyed by register),
+  // determine if a chain should be ended and remove from ActiveChains.
+  MachineInstr *MI = MO.getParent();
+
+  if (MO.isReg()) {
+
+    // If this is a KILL of a current chain, record it.
+    if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
+      DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
+            << "\n");
+      ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
+    }
+    ActiveChains.erase(MO.getReg());
+
+  } else if (MO.isRegMask()) {
+
+    for (auto I = ActiveChains.begin(), E = ActiveChains.end();
+         I != E;) {
+      if (MO.clobbersPhysReg(I->first)) {
+        DEBUG(dbgs() << "Kill (regmask) seen for chain "
+              << TRI->getName(I->first) << "\n");
+        I->second->setKill(MI, Idx, /*Immutable=*/true);
+        ActiveChains.erase(I++);
+      } else
+        ++I;
+    }
+
+  }
+}
+
+Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
+  if ((TRI->getEncodingValue(Reg) % 2) == 0)
+    return Color::Even;
+  else
+    return Color::Odd;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the passmanager.
+FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
+  return new AArch64A57FPLoadBalancing();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
new file mode 100644
index 000000000000..0cbb2db1134a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -0,0 +1,484 @@
+//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to promote the computations use to obtained a sign extended
+// value used into memory accesses.
+// E.g.
+// a = add nsw i32 b, 3
+// d = sext i32 a to i64
+// e = getelementptr ..., i64 d
+//
+// =>
+// f = sext i32 b to i64
+// a = add nsw i64 f, 3
+// e = getelementptr ..., i64 a
+//
+// This is legal to do if the computations are marked with either nsw or nuw
+// markers. Moreover, the current heuristic is simple: it does not create new
+// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
+// = add i32 b, c, two sexts are required to promote the computation).
+//
+// FIXME: This pass may be useful for other targets too.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-type-promotion"
+
+static cl::opt<bool>
+EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
+            cl::desc("Enable merging of redundant sexts when one is dominating"
+                     " the other."),
+            cl::init(true));
+
+#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
+
+//===----------------------------------------------------------------------===//
+//                       AArch64AddressTypePromotion
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64AddressTypePromotion : public FunctionPass {
+
+public:
+  static char ID;
+  AArch64AddressTypePromotion()
+      : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+    initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return AARCH64_TYPE_PROMO_NAME; }
+
+  /// Iterate over the functions and promote the computation of interesting
+  // sext instructions.
+  bool runOnFunction(Function &F) override;
+
+private:
+  /// The current function.
+  Function *Func;
+  /// Filter out all sexts that does not have this type.
+  /// Currently initialized with Int64Ty.
+  Type *ConsideredSExtType;
+
+  // This transformation requires dominator info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
+  typedef SmallVector<Instruction *, 16> Instructions;
+  typedef DenseMap<Value *, Instructions> ValueToInsts;
+
+  /// Check if it is profitable to move a sext through this instruction.
+  /// Currently, we consider it is profitable if:
+  /// - Inst is used only once (no need to insert truncate).
+  /// - Inst has only one operand that will require a sext operation (we do
+  ///   do not create new sext operation).
+  bool shouldGetThrough(const Instruction *Inst);
+
+  /// Check if it is possible and legal to move a sext through this
+  /// instruction.
+  /// Current heuristic considers that we can get through:
+  /// - Arithmetic operation marked with the nsw or nuw flag.
+  /// - Other sext operation.
+  /// - Truncate operation if it was just dropping sign extended bits.
+  bool canGetThrough(const Instruction *Inst);
+
+  /// Move sext operations through safe to sext instructions.
+  bool propagateSignExtension(Instructions &SExtInsts);
+
+  /// Is this sext should be considered for code motion.
+  /// We look for sext with ConsideredSExtType and uses in at least one
+  // GetElementPtrInst.
+  bool shouldConsiderSExt(const Instruction *SExt) const;
+
+  /// Collect all interesting sext operations, i.e., the ones with the right
+  /// type and used in memory accesses.
+  /// More precisely, a sext instruction is considered as interesting if it
+  /// is used in a "complex" getelementptr or it exits at least another
+  /// sext instruction that sign extended the same initial value.
+  /// A getelementptr is considered as "complex" if it has more than 2
+  // operands.
+  void analyzeSExtension(Instructions &SExtInsts);
+
+  /// Merge redundant sign extension operations in common dominator.
+  void mergeSExts(ValueToInsts &ValToSExtendedUses,
+                  SetOfInstructions &ToRemove);
+};
+} // end anonymous namespace.
+
+char AArch64AddressTypePromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                      AARCH64_TYPE_PROMO_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                    AARCH64_TYPE_PROMO_NAME, false, false)
+
+FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
+  return new AArch64AddressTypePromotion();
+}
+
+bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+  if (isa<SExtInst>(Inst))
+    return true;
+
+  const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+  if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+      (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+    return true;
+
+  // sext(trunc(sext)) --> sext
+  if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
+    const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
+    // Check that the truncate just drop sign extended bits.
+    if (Inst->getType()->getIntegerBitWidth() >=
+            Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
+        Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
+            ConsideredSExtType->getIntegerBitWidth())
+      return true;
+  }
+
+  return false;
+}
+
+bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+  // If the type of the sext is the same as the considered one, this sext
+  // will become useless.
+  // Otherwise, we will have to do something to preserve the original value,
+  // unless it is used once.
+  if (isa<SExtInst>(Inst) &&
+      (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
+    return true;
+
+  // If the Inst is used more that once, we may need to insert truncate
+  // operations and we don't do that at the moment.
+  if (!Inst->hasOneUse())
+    return false;
+
+  // This truncate is used only once, thus if we can get thourgh, it will become
+  // useless.
+  if (isa<TruncInst>(Inst))
+    return true;
+
+  // If both operands are not constant, a new sext will be created here.
+  // Current heuristic is: each step should be profitable.
+  // Therefore we don't allow to increase the number of sext even if it may
+  // be profitable later on.
+  if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
+    return true;
+
+  return false;
+}
+
+static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+  return !(isa<SelectInst>(Inst) && OpIdx == 0);
+}
+
+bool
+AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+  if (SExt->getType() != ConsideredSExtType)
+    return false;
+
+  for (const User *U : SExt->users()) {
+    if (isa<GetElementPtrInst>(U))
+      return true;
+  }
+
+  return false;
+}
+
+// Input:
+// - SExtInsts contains all the sext instructions that are used directly in
+//   GetElementPtrInst, i.e., access to memory.
+// Algorithm:
+// - For each sext operation in SExtInsts:
+//   Let var be the operand of sext.
+//   while it is profitable (see shouldGetThrough), legal, and safe
+//   (see canGetThrough) to move sext through var's definition:
+//   * promote the type of var's definition.
+//   * fold var into sext uses.
+//   * move sext above var's definition.
+//   * update sext operand to use the operand of var that should be sign
+//     extended (by construction there is only one).
+//
+//   E.g.,
+//   a = ... i32 c, 3
+//   b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
+//   ...
+//   = b
+// => Yes, update the code
+//   b = sext i32 c to i64
+//   a = ... i64 b, 3
+//   ...
+//   = a
+// Iterate on 'c'.
+bool
+AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
+
+  bool LocalChange = false;
+  SetOfInstructions ToRemove;
+  ValueToInsts ValToSExtendedUses;
+  while (!SExtInsts.empty()) {
+    // Get through simple chain.
+    Instruction *SExt = SExtInsts.pop_back_val();
+
+    DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
+
+    // If this SExt has already been merged continue.
+    if (SExt->use_empty() && ToRemove.count(SExt)) {
+      DEBUG(dbgs() << "No uses => marked as delete\n");
+      continue;
+    }
+
+    // Now try to get through the chain of definitions.
+    while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
+      DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
+      if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
+        // We cannot get through something that is not an Instruction
+        // or not safe to SExt.
+        DEBUG(dbgs() << "Cannot get through\n");
+        break;
+      }
+
+      LocalChange = true;
+      // If this is a sign extend, it becomes useless.
+      if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
+        DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
+        // We cannot use replaceAllUsesWith here because we may trigger some
+        // assertion on the type as all involved sext operation may have not
+        // been moved yet.
+        while (!Inst->use_empty()) {
+          Use &U = *Inst->use_begin();
+          Instruction *User = dyn_cast<Instruction>(U.getUser());
+          assert(User && "User of sext is not an Instruction!");
+          User->setOperand(U.getOperandNo(), SExt);
+        }
+        ToRemove.insert(Inst);
+        SExt->setOperand(0, Inst->getOperand(0));
+        SExt->moveBefore(Inst);
+        continue;
+      }
+
+      // Get through the Instruction:
+      // 1. Update its type.
+      // 2. Replace the uses of SExt by Inst.
+      // 3. Sign extend each operand that needs to be sign extended.
+
+      // Step #1.
+      Inst->mutateType(SExt->getType());
+      // Step #2.
+      SExt->replaceAllUsesWith(Inst);
+      // Step #3.
+      Instruction *SExtForOpnd = SExt;
+
+      DEBUG(dbgs() << "Propagate SExt to operands\n");
+      for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+           ++OpIdx) {
+        DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+        if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+            !shouldSExtOperand(Inst, OpIdx)) {
+          DEBUG(dbgs() << "No need to propagate\n");
+          continue;
+        }
+        // Check if we can statically sign extend the operand.
+        Value *Opnd = Inst->getOperand(OpIdx);
+        if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+                                                         Cst->getSExtValue()));
+          continue;
+        }
+        // UndefValue are typed, so we have to statically sign extend them.
+        if (isa<UndefValue>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+          continue;
+        }
+
+        // Otherwise we have to explicity sign extend it.
+        assert(SExtForOpnd &&
+               "Only one operand should have been sign extended");
+
+        SExtForOpnd->setOperand(0, Opnd);
+
+        DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+        // Move the sign extension before the insertion point.
+        SExtForOpnd->moveBefore(Inst);
+        Inst->setOperand(OpIdx, SExtForOpnd);
+        // If more sext are required, new instructions will have to be created.
+        SExtForOpnd = nullptr;
+      }
+      if (SExtForOpnd == SExt) {
+        DEBUG(dbgs() << "Sign extension is useless now\n");
+        ToRemove.insert(SExt);
+        break;
+      }
+    }
+
+    // If the use is already of the right type, connect its uses to its argument
+    // and delete it.
+    // This can happen for an Instruction all uses of which are sign extended.
+    if (!ToRemove.count(SExt) &&
+        SExt->getType() == SExt->getOperand(0)->getType()) {
+      DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+                      "its argument\n");
+      SExt->replaceAllUsesWith(SExt->getOperand(0));
+      ToRemove.insert(SExt);
+    } else
+      ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+  }
+
+  if (EnableMerge)
+    mergeSExts(ValToSExtendedUses, ToRemove);
+
+  // Remove all instructions marked as ToRemove.
+  for (Instruction *I: ToRemove)
+    I->eraseFromParent();
+  return LocalChange;
+}
+
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+                                             SetOfInstructions &ToRemove) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  for (auto &Entry : ValToSExtendedUses) {
+    Instructions &Insts = Entry.second;
+    Instructions CurPts;
+    for (Instruction *Inst : Insts) {
+      if (ToRemove.count(Inst))
+        continue;
+      bool inserted = false;
+      for (auto &Pt : CurPts) {
+        if (DT.dominates(Inst, Pt)) {
+          DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
+                       << *Inst << '\n');
+          Pt->replaceAllUsesWith(Inst);
+          ToRemove.insert(Pt);
+          Pt = Inst;
+          inserted = true;
+          break;
+        }
+        if (!DT.dominates(Pt, Inst))
+          // Give up if we need to merge in a common dominator as the
+          // expermients show it is not profitable.
+          continue;
+
+        DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
+                     << *Pt << '\n');
+        Inst->replaceAllUsesWith(Pt);
+        ToRemove.insert(Inst);
+        inserted = true;
+        break;
+      }
+      if (!inserted)
+        CurPts.push_back(Inst);
+    }
+  }
+}
+
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+  DenseMap<Value *, Instruction *> SeenChains;
+
+  for (auto &BB : *Func) {
+    for (auto &II : BB) {
+      Instruction *SExt = &II;
+
+      // Collect all sext operation per type.
+      if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+        continue;
+
+      DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+      // Cases where we actually perform the optimization:
+      // 1. SExt is used in a getelementptr with more than 2 operand =>
+      //    likely we can merge some computation if they are done on 64 bits.
+      // 2. The beginning of the SExt chain is SExt several time. =>
+      //    code sharing is possible.
+
+      bool insert = false;
+      // #1.
+      for (const User *U : SExt->users()) {
+        const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
+        if (Inst && Inst->getNumOperands() > 2) {
+          DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
+                       << '\n');
+          insert = true;
+          break;
+        }
+      }
+
+      // #2.
+      // Check the head of the chain.
+      Instruction *Inst = SExt;
+      Value *Last;
+      do {
+        int OpdIdx = 0;
+        const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+        if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
+          OpdIdx = 1;
+        Last = Inst->getOperand(OpdIdx);
+        Inst = dyn_cast<Instruction>(Last);
+      } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
+
+      DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
+      DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+          SeenChains.find(Last);
+      if (insert || AlreadySeen != SeenChains.end()) {
+        DEBUG(dbgs() << "Insert\n");
+        SExtInsts.push_back(SExt);
+        if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
+          DEBUG(dbgs() << "Insert chain member\n");
+          SExtInsts.push_back(AlreadySeen->second);
+          SeenChains[Last] = nullptr;
+        }
+      } else {
+        DEBUG(dbgs() << "Record its chain membership\n");
+        SeenChains[Last] = SExt;
+      }
+    }
+  }
+}
+
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  if (F.isDeclaration())
+    return false;
+  Func = &F;
+  ConsideredSExtType = Type::getInt64Ty(Func->getContext());
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+  Instructions SExtInsts;
+  analyzeSExtension(SExtInsts);
+  return propagateSignExtension(SExtInsts);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
new file mode 100644
index 000000000000..bc2320dd20b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -0,0 +1,414 @@
+//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// wholistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
+// Allow forcing all i64 operations with equivalent SIMD instructions to use
+// them. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("aarch64-simd-scalar-force-all",
+             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+             cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"
+
+namespace {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const TargetInstrInfo *TII;
+
+private:
+  // isProfitableToTransform - Predicate function to determine whether an
+  // instruction should be transformed to its equivalent AdvSIMD scalar
+  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+  bool isProfitableToTransform(const MachineInstr &MI) const;
+
+  // transformInstruction - Perform the transformation of an instruction
+  // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+  // to be the correct register class, minimizing cross-class copies.
+  void transformInstruction(MachineInstr &MI);
+
+  // processMachineBasicBlock - Main optimzation loop.
+  bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
+    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64AdvSIMDScalar::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
+                AARCH64_ADVSIMD_NAME, false, false)
+
+static bool isGPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  if (SubReg)
+    return false;
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
+  return AArch64::GPR64RegClass.contains(Reg);
+}
+
+static bool isFPR64(unsigned Reg, unsigned SubReg,
+                    const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
+            SubReg == 0) ||
+           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
+            SubReg == AArch64::dsub);
+  // Physical register references just check the register class directly.
+  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
+         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
+}
+
+// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
+// copy instruction. Return zero_reg if the instruction is not a copy.
+static MachineOperand *getSrcFromCopy(MachineInstr *MI,
+                                      const MachineRegisterInfo *MRI,
+                                      unsigned &SubReg) {
+  SubReg = 0;
+  // The "FMOV Xd, Dn" instruction is the typical form.
+  if (MI->getOpcode() == AArch64::FMOVDXr ||
+      MI->getOpcode() == AArch64::FMOVXDr)
+    return &MI->getOperand(1);
+  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
+  // these at this stage, but it's easy to check for.
+  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
+    SubReg = AArch64::dsub;
+    return &MI->getOperand(1);
+  }
+  // Or just a plain COPY instruction. This can be directly to/from FPR64,
+  // or it can be a dsub subreg reference to an FPR128.
+  if (MI->getOpcode() == AArch64::COPY) {
+    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+                MRI) &&
+        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
+      return &MI->getOperand(1);
+    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
+                MRI) &&
+        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
+                MRI)) {
+      SubReg = MI->getOperand(1).getSubReg();
+      return &MI->getOperand(1);
+    }
+  }
+
+  // Otherwise, this is some other kind of instruction.
+  return nullptr;
+}
+
+// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
+// that we're considering transforming to, return that AdvSIMD opcode. For all
+// others, return the original opcode.
+static unsigned getTransformOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    break;
+  // FIXME: Lots more possibilities.
+  case AArch64::ADDXrr:
+    return AArch64::ADDv1i64;
+  case AArch64::SUBXrr:
+    return AArch64::SUBv1i64;
+  case AArch64::ANDXrr:
+    return AArch64::ANDv8i8;
+  case AArch64::EORXrr:
+    return AArch64::EORv8i8;
+  case AArch64::ORRXrr:
+    return AArch64::ORRv8i8;
+  }
+  // No AdvSIMD equivalent, so just return the original opcode.
+  return Opc;
+}
+
+static bool isTransformable(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return Opc != getTransformOpcode(Opc);
+}
+
+// isProfitableToTransform - Predicate function to determine whether an
+// instruction should be transformed to its equivalent AdvSIMD scalar
+// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+bool AArch64AdvSIMDScalar::isProfitableToTransform(
+    const MachineInstr &MI) const {
+  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
+  // early exit since that's the common case.
+  if (!isTransformable(MI))
+    return false;
+
+  // Count the number of copies we'll need to add and approximate the number
+  // of copies that a transform will enable us to remove.
+  unsigned NumNewCopies = 3;
+  unsigned NumRemovableCopies = 0;
+
+  unsigned OrigSrc0 = MI.getOperand(1).getReg();
+  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  unsigned SubReg0;
+  unsigned SubReg1;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If the source was from a copy, we don't need to insert a new copy.
+    if (MOSrc0)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
+      ++NumRemovableCopies;
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    if (MOSrc1)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
+      ++NumRemovableCopies;
+  }
+
+  // If any of the uses of the original instructions is a cross class copy,
+  // that's a copy that will be removable if we transform. Likewise, if
+  // any of the uses is a transformable instruction, it's likely the tranforms
+  // will chain, enabling us to save a copy there, too. This is an aggressive
+  // heuristic that approximates the graph based cost analysis described above.
+  unsigned Dst = MI.getOperand(0).getReg();
+  bool AllUsesAreCopies = true;
+  for (MachineRegisterInfo::use_instr_nodbg_iterator
+           Use = MRI->use_instr_nodbg_begin(Dst),
+           E = MRI->use_instr_nodbg_end();
+       Use != E; ++Use) {
+    unsigned SubReg;
+    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
+      ++NumRemovableCopies;
+    // If the use is an INSERT_SUBREG, that's still something that can
+    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
+    // preferable to have it use the FPR64 in most cases, as if the source
+    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
+    // Ditto for a lane insert.
+    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
+             Use->getOpcode() == AArch64::INSvi64gpr)
+      ;
+    else
+      AllUsesAreCopies = false;
+  }
+  // If all of the uses of the original destination register are copies to
+  // FPR64, then we won't end up having a new copy back to GPR64 either.
+  if (AllUsesAreCopies)
+    --NumNewCopies;
+
+  // If a transform will not increase the number of cross-class copies required,
+  // return true.
+  if (NumNewCopies <= NumRemovableCopies)
+    return true;
+
+  // Finally, even if we otherwise wouldn't transform, check if we're forcing
+  // transformation of everything.
+  return TransformAll;
+}
+
+static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
+                                unsigned Dst, unsigned Src, bool IsKill) {
+  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                                    TII->get(AArch64::COPY), Dst)
+                                .addReg(Src, getKillRegState(IsKill));
+  DEBUG(dbgs() << "    adding copy: " << *MIB);
+  ++NumCopiesInserted;
+  return MIB;
+}
+
+// transformInstruction - Perform the transformation of an instruction
+// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
+// to be the correct register class, minimizing cross-class copies.
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
+  DEBUG(dbgs() << "Scalar transform: " << MI);
+
+  MachineBasicBlock *MBB = MI.getParent();
+  unsigned OldOpc = MI.getOpcode();
+  unsigned NewOpc = getTransformOpcode(OldOpc);
+  assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+  // Check if we need a copy for the source registers.
+  unsigned OrigSrc0 = MI.getOperand(1).getReg();
+  unsigned OrigSrc1 = MI.getOperand(2).getReg();
+  unsigned Src0 = 0, SubReg0;
+  unsigned Src1 = 0, SubReg1;
+  bool KillSrc0 = false, KillSrc1 = false;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (MOSrc0) {
+      Src0 = MOSrc0->getReg();
+      KillSrc0 = MOSrc0->isKill();
+      // Src0 is going to be reused, thus, it cannot be killed anymore.
+      MOSrc0->setIsKill(false);
+      if (MRI->hasOneNonDBGUse(OrigSrc0)) {
+        assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
+        Def->eraseFromParent();
+        ++NumCopiesDeleted;
+      }
+    }
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (MOSrc1) {
+      Src1 = MOSrc1->getReg();
+      KillSrc1 = MOSrc1->isKill();
+      // Src0 is going to be reused, thus, it cannot be killed anymore.
+      MOSrc1->setIsKill(false);
+      if (MRI->hasOneNonDBGUse(OrigSrc1)) {
+        assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
+        Def->eraseFromParent();
+        ++NumCopiesDeleted;
+      }
+    }
+  }
+  // If we weren't able to reference the original source directly, create a
+  // copy.
+  if (!Src0) {
+    SubReg0 = 0;
+    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
+    KillSrc0 = true;
+  }
+  if (!Src1) {
+    SubReg1 = 0;
+    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
+    KillSrc1 = true;
+  }
+
+  // Create a vreg for the destination.
+  // FIXME: No need to do this if the ultimate user expects an FPR64.
+  // Check for that and avoid the copy if possible.
+  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+
+  // For now, all of the new instructions have the same simple three-register
+  // form, so no need to special case based on what instruction we're
+  // building.
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
+      .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
+      .addReg(Src1, getKillRegState(KillSrc1), SubReg1);
+
+  // Now copy the result back out to a GPR.
+  // FIXME: Try to avoid this if all uses could actually just use the FPR64
+  // directly.
+  insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);
+
+  // Erase the old instruction.
+  MI.eraseFromParent();
+
+  ++NumScalarInsnsUsed;
+}
+
+// processMachineBasicBlock - Main optimzation loop.
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr &MI = *I++;
+    if (isProfitableToTransform(MI)) {
+      transformInstruction(MI);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+// runOnMachineFunction - Pass entry point from PassManager.
+bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+
+  if (skipFunction(*mf.getFunction()))
+    return false;
+
+  MRI = &mf.getRegInfo();
+  TII = mf.getSubtarget().getInstrInfo();
+
+  // Just check things on a one-block-at-a-time basis.
+  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
+    if (processMachineBasicBlock(&*I))
+      Changed = true;
+  return Changed;
+}
+
+// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
+// to add the pass to the PassManager.
+FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+  return new AArch64AdvSIMDScalar();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
new file mode 100644
index 000000000000..b2d96a32fd3a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -0,0 +1,716 @@
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the AArch64 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class AArch64AsmPrinter : public AsmPrinter {
+  AArch64MCInstLower MCInstLowering;
+  StackMaps SM;
+  const AArch64Subtarget *STI;
+
+public:
+  AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this),
+        SM(*this), AArch64FI(nullptr) {}
+
+  StringRef getPassName() const override { return "AArch64 Assembly Printer"; }
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+  /// tblgen'erated pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+    return MCInstLowering.lowerOperand(MO, MCOp);
+  }
+
+  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                     const MachineInstr &MI);
+  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                       const MachineInstr &MI);
+
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+  void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+
+  void EmitXRayTable();
+  void EmitSled(const MachineInstr &MI, SledKind Kind);
+
+  /// \brief tblgen'erated driver function for lowering simple MI->MC
+  /// pseudo instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AsmPrinter::getAnalysisUsage(AU);
+    AU.setPreservesAll();
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    AArch64FI = F.getInfo<AArch64FunctionInfo>();
+    STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
+    bool Result = AsmPrinter::runOnMachineFunction(F);
+    EmitXRayTable();
+    return Result;
+  }
+
+private:
+  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+  bool printAsmRegInClass(const MachineOperand &MO,
+                          const TargetRegisterClass *RC, bool isVector,
+                          raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  void EmitFunctionBodyEnd() override;
+
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+  void EmitEndOfAsmFile(Module &M) override;
+  AArch64FunctionInfo *AArch64FI;
+
+  /// \brief Emit the LOHs contained in AArch64FI.
+  void EmitLOHs();
+
+  /// Emit instruction to set float register to zero.
+  void EmitFMov0(const MachineInstr &MI);
+
+  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+  MInstToMCSymbol LOHInstToLabel;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::TAIL_CALL);
+}
+
+void AArch64AsmPrinter::EmitXRayTable()
+{
+  //TODO: merge the logic for ELF XRay sleds at a higher level, so to avoid
+  // code duplication as it is now for x86_64, ARM32 and AArch64.
+  if (Sleds.empty())
+    return;
+
+  auto PrevSection = OutStreamer->getCurrentSectionOnly();
+  auto Fn = MF->getFunction();
+  MCSection *Section;
+
+  if (STI->isTargetELF()) {
+    if (Fn->hasComdat())
+      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+        ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+        Fn->getComdat()->getName());
+    else
+      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+        ELF::SHF_ALLOC);
+  } else if (STI->isTargetMachO()) {
+    Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+                                         SectionKind::getReadOnlyWithRel());
+  } else {
+    llvm_unreachable("Unsupported target");
+  }
+
+  // Before we switch over, we force a reference to a label inside the
+  // xray_instr_map section. Since EmitXRayTable() is always called just
+  // before the function's end, we assume that this is happening after the
+  // last return instruction.
+  //
+  // We then align the reference to 16 byte boundaries, which we determined
+  // experimentally to be beneficial to avoid causing decoder stalls.
+  MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+  OutStreamer->EmitCodeAlignment(16);
+  OutStreamer->EmitSymbolValue(Tmp, 8, false);
+  OutStreamer->SwitchSection(Section);
+  OutStreamer->EmitLabel(Tmp);
+  for (const auto &Sled : Sleds) {
+    OutStreamer->EmitSymbolValue(Sled.Sled, 8);
+    OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
+    auto Kind = static_cast<uint8_t>(Sled.Kind);
+    OutStreamer->EmitBytes(
+      StringRef(reinterpret_cast<const char *>(&Kind), 1));
+    OutStreamer->EmitBytes(
+      StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+    OutStreamer->EmitZeros(14);
+  }
+  OutStreamer->SwitchSection(PrevSection);
+
+  Sleds.clear();
+}
+
+void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
+{
+  static const int8_t NoopsInSledCount = 7;
+  // We want to emit the following pattern:
+  //
+  // .Lxray_sled_N:
+  //   ALIGN
+  //   B #32
+  //   ; 7 NOP instructions (28 bytes)
+  // .tmpN
+  //
+  // We need the 28 bytes (7 instructions) because at runtime, we'd be patching
+  // over the full 32 bytes (8 instructions) with the following pattern:
+  //
+  //   STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack
+  //   LDR W0, #12 ; W0 := function ID
+  //   LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit
+  //   BLR X16 ; call the tracing trampoline
+  //   ;DATA: 32 bits of function ID
+  //   ;DATA: lower 32 bits of the address of the trampoline
+  //   ;DATA: higher 32 bits of the address of the trampoline
+  //   LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
+  //
+  OutStreamer->EmitCodeAlignment(4);
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitLabel(CurSled);
+  auto Target = OutContext.createTempSymbol();
+
+  // Emit "B #32" instruction, which jumps over the next 28 bytes.
+  // The operand has to be the number of 4-byte instructions to jump over,
+  // including the current instruction.
+  EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8));
+
+  for (int8_t I = 0; I < NoopsInSledCount; I++)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+
+  OutStreamer->EmitLabel(Target);
+  recordSled(CurSled, MI, Kind);
+}
+
+void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+  if (TT.isOSBinFormatMachO()) {
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+    SM.serializeToStackMapSection();
+  }
+}
+
+void AArch64AsmPrinter::EmitLOHs() {
+  SmallVector<MCSymbol *, 3> MCArgs;
+
+  for (const auto &D : AArch64FI->getLOHContainer()) {
+    for (const MachineInstr *MI : D.getArgs()) {
+      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+      assert(LabelIt != LOHInstToLabel.end() &&
+             "Label hasn't been inserted for LOH related instruction");
+      MCArgs.push_back(LabelIt->second);
+    }
+    OutStreamer->EmitLOHDirective(D.getKind(), MCArgs);
+    MCArgs.clear();
+  }
+}
+
+void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+  if (!AArch64FI->getLOHRelated().empty())
+    EmitLOHs();
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  // Darwin uses a linker-private symbol name for constant-pools (to
+  // avoid addends on the relocation?), ELF has no such concept and
+  // uses a normal private symbol.
+  if (!getDataLayout().getLinkerPrivateGlobalPrefix().empty())
+    return OutContext.getOrCreateSymbol(
+        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+        Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+  return OutContext.getOrCreateSymbol(
+      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+      Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    O << AArch64InstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#' << Imm;
+    break;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    MCSymbol *Sym = getSymbol(GV);
+
+    // FIXME: Can we get anything other than a plain symbol here?
+    assert(!MO.getTargetFlags() && "Unknown operand target flag!");
+
+    Sym->print(O, MAI);
+    printOffset(MO.getOffset(), O);
+    break;
+  }
+  }
+}
+
+bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                          raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default:
+    return true; // Unknown mode.
+  case 'w':
+    Reg = getWRegFromXReg(Reg);
+    break;
+  case 'x':
+    Reg = getXRegFromWReg(Reg);
+    break;
+  }
+
+  O << AArch64InstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+// Prints the register in MO using class RC using the offset in the
+// new register class. This should not be used for cross class
+// printing.
+bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+                                           const TargetRegisterClass *RC,
+                                           bool isVector, raw_ostream &O) {
+  assert(MO.isReg() && "Should only get here with a register!");
+  const TargetRegisterInfo *RI = STI->getRegisterInfo();
+  unsigned Reg = MO.getReg();
+  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+  assert(RI->regsOverlap(RegToPrint, Reg));
+  O << AArch64InstPrinter::getRegisterName(
+           RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
+  return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode, raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+
+  // First try the generic code, which knows about modifiers like 'c' and 'n'.
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+    return false;
+
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      return true; // Unknown modifier.
+    case 'w':      // Print W register
+    case 'x':      // Print X register
+      if (MO.isReg())
+        return printAsmMRegister(MO, ExtraCode[0], O);
+      if (MO.isImm() && MO.getImm() == 0) {
+        unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
+        O << AArch64InstPrinter::getRegisterName(Reg);
+        return false;
+      }
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'b': // Print B register.
+    case 'h': // Print H register.
+    case 's': // Print S register.
+    case 'd': // Print D register.
+    case 'q': // Print Q register.
+      if (MO.isReg()) {
+        const TargetRegisterClass *RC;
+        switch (ExtraCode[0]) {
+        case 'b':
+          RC = &AArch64::FPR8RegClass;
+          break;
+        case 'h':
+          RC = &AArch64::FPR16RegClass;
+          break;
+        case 's':
+          RC = &AArch64::FPR32RegClass;
+          break;
+        case 'd':
+          RC = &AArch64::FPR64RegClass;
+          break;
+        case 'q':
+          RC = &AArch64::FPR128RegClass;
+          break;
+        default:
+          return true;
+        }
+        return printAsmRegInClass(MO, RC, false /* vector */, O);
+      }
+      printOperand(MI, OpNum, O);
+      return false;
+    }
+  }
+
+  // According to ARM, we should emit x and v registers unless we have a
+  // modifier.
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+
+    // If this is a w or x register, print an x register.
+    if (AArch64::GPR32allRegClass.contains(Reg) ||
+        AArch64::GPR64allRegClass.contains(Reg))
+      return printAsmMRegister(MO, 'x', O);
+
+    // If this is a b, h, s, d, or q register, print it as a v register.
+    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+                              O);
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNum,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                               raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata())
+            ->getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '[';
+  printOperand(MI, 0, OS);
+  OS << '+';
+  printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps - 2, OS);
+}
+
+void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                      const MachineInstr &MI) {
+  unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
+
+  SM.recordStackMap(MI);
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+  // Scan ahead to trim the shadow.
+  const MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::const_iterator MII(MI);
+  ++MII;
+  while (NumNOPBytes > 0) {
+    if (MII == MBB.end() || MII->isCall() ||
+        MII->getOpcode() == AArch64::DBG_VALUE ||
+        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+        MII->getOpcode() == TargetOpcode::STACKMAP)
+      break;
+    ++MII;
+    NumNOPBytes -= 4;
+  }
+
+  // Emit nops.
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                        const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers Opers(&MI);
+
+  int64_t CallTarget = Opers.getCallTarget().getImm();
+  unsigned EncodedBytes = 0;
+  if (CallTarget) {
+    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+           "High 16 bits of call target should be zero.");
+    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    EncodedBytes = 16;
+    // Materialize the jump address:
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 32) & 0xFFFF)
+                                    .addImm(32));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 16) & 0xFFFF)
+                                    .addImm(16));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(CallTarget & 0xFFFF)
+                                    .addImm(0));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+  }
+  // Emit padding.
+  unsigned NumBytes = Opers.getNumPatchBytes();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  if (STI->hasZeroCycleZeroing()) {
+    // Convert S/D register to corresponding Q register
+    if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) {
+      DestReg = AArch64::Q0 + (DestReg - AArch64::S0);
+    } else {
+      assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
+      DestReg = AArch64::Q0 + (DestReg - AArch64::D0);
+    }
+    MCInst MOVI;
+    MOVI.setOpcode(AArch64::MOVIv2d_ns);
+    MOVI.addOperand(MCOperand::createReg(DestReg));
+    MOVI.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, MOVI);
+  } else {
+    MCInst FMov;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode");
+    case AArch64::FMOVS0:
+      FMov.setOpcode(AArch64::FMOVWSr);
+      FMov.addOperand(MCOperand::createReg(DestReg));
+      FMov.addOperand(MCOperand::createReg(AArch64::WZR));
+      break;
+    case AArch64::FMOVD0:
+      FMov.setOpcode(AArch64::FMOVXDr);
+      FMov.addOperand(MCOperand::createReg(DestReg));
+      FMov.addOperand(MCOperand::createReg(AArch64::XZR));
+      break;
+    }
+    EmitToStreamer(*OutStreamer, FMov);
+  }
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "AArch64GenMCPseudoLowering.inc"
+
+void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+    return;
+
+  if (AArch64FI->getLOHRelated().count(MI)) {
+    // Generate a label for LOH related instruction
+    MCSymbol *LOHLabel = createTempSymbol("loh");
+    // Associate the instruction with the label
+    LOHInstToLabel[MI] = LOHLabel;
+    OutStreamer->EmitLabel(LOHLabel);
+  }
+
+  // Do any manual lowerings.
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::DBG_VALUE: {
+    if (isVerbose() && OutStreamer->hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer->EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+
+  // Tail calls use pseudo instructions so they have the proper code-gen
+  // attributes (isCall, isReturn, etc.). We lower them to the real
+  // instruction here.
+  case AArch64::TCRETURNri: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::BR);
+    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TCRETURNdi: {
+    MCOperand Dest;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::B);
+    TmpInst.addOperand(Dest);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TLSDESC_CALLSEQ: {
+    /// lower this to:
+    ///    adrp  x0, :tlsdesc:var
+    ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
+    ///    add   x0, x0, #:tlsdesc_lo12:var
+    ///    .tlsdesccall var
+    ///    blr   x1
+    ///    (TPIDR_EL0 offset now in x0)
+    const MachineOperand &MO_Sym = MI->getOperand(0);
+    MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
+    MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
+    MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
+                                   AArch64II::MO_NC);
+    MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    MCInstLowering.lowerOperand(MO_Sym, Sym);
+    MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+    MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
+
+    MCInst Adrp;
+    Adrp.setOpcode(AArch64::ADRP);
+    Adrp.addOperand(MCOperand::createReg(AArch64::X0));
+    Adrp.addOperand(SymTLSDesc);
+    EmitToStreamer(*OutStreamer, Adrp);
+
+    MCInst Ldr;
+    Ldr.setOpcode(AArch64::LDRXui);
+    Ldr.addOperand(MCOperand::createReg(AArch64::X1));
+    Ldr.addOperand(MCOperand::createReg(AArch64::X0));
+    Ldr.addOperand(SymTLSDescLo12);
+    Ldr.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, Ldr);
+
+    MCInst Add;
+    Add.setOpcode(AArch64::ADDXri);
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(MCOperand::createReg(AArch64::X0));
+    Add.addOperand(SymTLSDescLo12);
+    Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0)));
+    EmitToStreamer(*OutStreamer, Add);
+
+    // Emit a relocation-annotation. This expands to no code, but requests
+    // the following instruction gets an R_AARCH64_TLSDESC_CALL.
+    MCInst TLSDescCall;
+    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+    TLSDescCall.addOperand(Sym);
+    EmitToStreamer(*OutStreamer, TLSDescCall);
+
+    MCInst Blr;
+    Blr.setOpcode(AArch64::BLR);
+    Blr.addOperand(MCOperand::createReg(AArch64::X1));
+    EmitToStreamer(*OutStreamer, Blr);
+
+    return;
+  }
+
+  case AArch64::FMOVS0:
+  case AArch64::FMOVD0:
+    EmitFMov0(*MI);
+    return;
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(*OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(*OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+    LowerPATCHABLE_FUNCTION_ENTER(*MI);
+    return;
+
+  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
+    LowerPATCHABLE_FUNCTION_EXIT(*MI);
+    return;
+
+  case TargetOpcode::PATCHABLE_TAIL_CALL:
+    LowerPATCHABLE_TAIL_CALL(*MI);
+    return;
+  }
+
+  // Finally, do the automated lowerings for everything else.
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmPrinter() {
+  RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
+  RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
+  RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
new file mode 100644
index 000000000000..a4950af32097
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -0,0 +1,317 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64CallLowering.h"
+#include "AArch64ISelLowering.h"
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
+  : CallLowering(&TLI) {
+}
+
+struct IncomingArgHandler : public CallLowering::ValueHandler {
+  IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+    : ValueHandler(MIRBuilder, MRI) {}
+
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+    int FI = MFI.CreateFixedObject(Size, Offset, true);
+    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+    unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
+    MIRBuilder.buildFrameIndex(AddrReg, FI);
+    return AddrReg;
+  }
+
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    markPhysRegUsed(PhysReg);
+    MIRBuilder.buildCopy(ValVReg, PhysReg);
+    // FIXME: assert extension
+  }
+
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+        0);
+    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+  }
+
+  /// How the physical register gets marked varies between formal
+  /// parameters (it's a basic-block live-in), and a call instruction
+  /// (it's an implicit-def of the BL).
+  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+struct FormalArgHandler : public IncomingArgHandler {
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+      : IncomingArgHandler(MIRBuilder, MRI) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMBB().addLiveIn(PhysReg);
+  }
+};
+
+struct CallReturnHandler : public IncomingArgHandler {
+  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                       MachineInstrBuilder MIB)
+    : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIB.addDef(PhysReg, RegState::Implicit);
+  }
+
+  MachineInstrBuilder MIB;
+};
+
+struct OutgoingArgHandler : public CallLowering::ValueHandler {
+  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                     MachineInstrBuilder MIB)
+      : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    LLT p0 = LLT::pointer(0, 64);
+    LLT s64 = LLT::scalar(64);
+    unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildCopy(SPReg, AArch64::SP);
+
+    unsigned OffsetReg = MRI.createGenericVirtualRegister(s64);
+    MIRBuilder.buildConstant(OffsetReg, Offset);
+
+    unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    return AddrReg;
+  }
+
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    MIB.addUse(PhysReg, RegState::Implicit);
+    unsigned ExtReg = extendRegister(ValVReg, VA);
+    MIRBuilder.buildCopy(PhysReg, ExtReg);
+  }
+
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOStore, Size, 0);
+    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
+  }
+
+  MachineInstrBuilder MIB;
+};
+
+void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+                                            SmallVectorImpl<ArgInfo> &SplitArgs,
+                                            const DataLayout &DL,
+                                            MachineRegisterInfo &MRI,
+                                            SplitArgTy PerformArgSplit) const {
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+  SmallVector<EVT, 4> SplitVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+  if (SplitVTs.size() == 1) {
+    // No splitting to do, but we want to replace the original type (e.g. [1 x
+    // double] -> double).
+    SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
+                           OrigArg.Flags);
+    return;
+  }
+
+  unsigned FirstRegIdx = SplitArgs.size();
+  for (auto SplitVT : SplitVTs) {
+    // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
+    Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
+    SplitArgs.push_back(
+        ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
+                OrigArg.Flags});
+  }
+
+  SmallVector<uint64_t, 4> BitOffsets;
+  for (auto Offset : Offsets)
+    BitOffsets.push_back(Offset * 8);
+
+  SmallVector<unsigned, 8> SplitRegs;
+  for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
+    SplitRegs.push_back(I->Reg);
+
+  PerformArgSplit(SplitRegs, BitOffsets);
+}
+
+bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                      const Value *Val, unsigned VReg) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = *MF.getFunction();
+
+  auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR);
+  assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+  bool Success = true;
+  if (VReg) {
+    const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+    CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    auto &DL = F.getParent()->getDataLayout();
+
+    ArgInfo OrigArg{VReg, Val->getType()};
+    setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+
+    SmallVector<ArgInfo, 8> SplitArgs;
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        MIRBuilder.buildExtract(Regs, Offsets, VReg);
+                      });
+
+    OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
+    Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+  }
+
+  MIRBuilder.insertInstr(MIB);
+  return Success;
+}
+
+bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                               const Function &F,
+                                               ArrayRef<unsigned> VRegs) const {
+  auto &Args = F.getArgumentList();
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &DL = F.getParent()->getDataLayout();
+
+  SmallVector<ArgInfo, 8> SplitArgs;
+  unsigned i = 0;
+  for (auto &Arg : Args) {
+    ArgInfo OrigArg{VRegs[i], Arg.getType()};
+    setArgFlags(OrigArg, i + 1, DL, F);
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+                      });
+    ++i;
+  }
+
+  if (!MBB.empty())
+    MIRBuilder.setInstr(*MBB.begin());
+
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *AssignFn =
+      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+
+  FormalArgHandler Handler(MIRBuilder, MRI);
+  if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler))
+    return false;
+
+  // Move back to the end of the basic block.
+  MIRBuilder.setMBB(MBB);
+
+  return true;
+}
+
+bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                    const MachineOperand &Callee,
+                                    const ArgInfo &OrigRet,
+                                    ArrayRef<ArgInfo> OrigArgs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = *MF.getFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &DL = F.getParent()->getDataLayout();
+
+  SmallVector<ArgInfo, 8> SplitArgs;
+  for (auto &OrigArg : OrigArgs) {
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+                      });
+  }
+
+  // Find out which ABI gets to decide where things go.
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  CCAssignFn *CallAssignFn =
+      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+
+  // Create a temporarily-floating call instruction so we can add the implicit
+  // uses of arg registers.
+  auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
+                                                          : AArch64::BL);
+  MIB.addOperand(Callee);
+
+  // Tell the call which registers are clobbered.
+  auto TRI = MF.getSubtarget().getRegisterInfo();
+  MIB.addRegMask(TRI->getCallPreservedMask(MF, F.getCallingConv()));
+
+  // Do the actual argument marshalling.
+  SmallVector<unsigned, 8> PhysRegs;
+  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
+  if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+    return false;
+
+  // Now we can add the actual call instruction to the correct basic block.
+  MIRBuilder.insertInstr(MIB);
+
+  // If Callee is a reg, since it is used by a target specific
+  // instruction, it must have a register class matching the
+  // constraint of that instruction.
+  if (Callee.isReg())
+    MIB->getOperand(0).setReg(constrainOperandRegClass(
+        MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+        *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
+        Callee.getReg(), 0));
+
+  // Finally we can copy the returned value back into its virtual-register. In
+  // symmetry with the arugments, the physical register must be an
+  // implicit-define of the call instruction.
+  CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
+  if (OrigRet.Reg) {
+    SplitArgs.clear();
+
+    SmallVector<uint64_t, 8> RegOffsets;
+    SmallVector<unsigned, 8> SplitRegs;
+    splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
+                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
+                        std::copy(Offsets.begin(), Offsets.end(),
+                                  std::back_inserter(RegOffsets));
+                        std::copy(Regs.begin(), Regs.end(),
+                                  std::back_inserter(SplitRegs));
+                      });
+
+    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
+    if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+      return false;
+
+    if (!RegOffsets.empty())
+      MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
new file mode 100644
index 000000000000..ce6676249df6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -0,0 +1,56 @@
+//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class AArch64TargetLowering;
+
+class AArch64CallLowering: public CallLowering {
+ public:
+  AArch64CallLowering(const AArch64TargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+                   unsigned VReg) const override;
+
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+
+  bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
+                 const ArgInfo &OrigRet,
+                 ArrayRef<ArgInfo> OrigArgs) const override;
+
+private:
+  typedef std::function<void(MachineIRBuilder &, Type *, unsigned,
+                             CCValAssign &)>
+      RegHandler;
+
+  typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
+      MemHandler;
+
+  typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
+      SplitArgTy;
+
+  void splitToValueTypes(const ArgInfo &OrigArgInfo,
+                         SmallVectorImpl<ArgInfo> &SplitArgs,
+                         const DataLayout &DL, MachineRegisterInfo &MRI,
+                         SplitArgTy SplitArg) const;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
new file mode 100644
index 000000000000..bc44bc5f2461
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,139 @@
+//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+                                     AArch64::X3, AArch64::X4, AArch64::X5,
+                                     AArch64::X6, AArch64::X7};
+static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+                                     AArch64::H3, AArch64::H4, AArch64::H5,
+                                     AArch64::H6, AArch64::H7};
+static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+                                     AArch64::S3, AArch64::S4, AArch64::S5,
+                                     AArch64::S6, AArch64::S7};
+static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+                                     AArch64::D3, AArch64::D4, AArch64::D5,
+                                     AArch64::D6, AArch64::D7};
+static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+                                     AArch64::Q3, AArch64::Q4, AArch64::Q5,
+                                     AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+                             MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+                             CCState &State, unsigned SlotAlign) {
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  unsigned StackAlign =
+      State.getMachineFunction().getDataLayout().getStackAlignment();
+  unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+    State.addLoc(It);
+    SlotAlign = 1;
+  }
+
+  // All pending members have now been allocated
+  PendingMembers.clear();
+  return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
+/// [N x Ty] type must still be contiguous in memory though.
+static bool CC_AArch64_Custom_Stack_Block(
+      unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+      ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // block.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  ArrayRef<MCPhysReg> RegList;
+  if (LocVT.SimpleTy == MVT::i64)
+    RegList = XRegList;
+  else if (LocVT.SimpleTy == MVT::f16)
+    RegList = HRegList;
+  else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
+    RegList = SRegList;
+  else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
+    RegList = DRegList;
+  else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
+    RegList = QRegList;
+  else {
+    // Not an array we want to split up after all.
+    return false;
+  }
+
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // block.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (auto &It : PendingMembers) {
+      It.convertToReg(RegResult);
+      State.addLoc(It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
+
+  // Mark all regs in the class as unavailable
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+      State.getMachineFunction().getSubtarget());
+  unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
new file mode 100644
index 000000000000..9058617768dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -0,0 +1,337 @@
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for AArch64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A> :
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if we're in big endian mode.
+class CCIfBigEndian<CCAction A> :
+  CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_AArch64_AAPCS : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
+
+  // An SRet is passed in X8, not X0 like a normal pointer parameter.
+  CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // The 'nest' parameter, if any, is passed in X18.
+  // Darwin uses X18 as the platform register and hence 'nest' isn't currently
+  // supported there.
+  CCIfNest<CCAssignToReg<[X18]>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
+  // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+  // up to eight each of GPR and FPR.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  // i128 is split to two i64s, we can't fit half to register X7.
+  CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+                                                    [X0, X1, X3, X5]>>>,
+
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                   [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  // If more than will fit in registers, pass them on the stack instead.
+  CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+  CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
+]>;
+
+def RetCC_AArch64_AAPCS : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
+
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+      CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                              [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+      CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+
+// Darwin uses a calling convention which differs in only two ways
+// from the standard one at this level:
+//     + i128s (i.e. split i64s) don't need even registers.
+//     + Stack slots are sized as needed rather than being at least 64-bit.
+def CC_AArch64_DarwinPCS : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  // An SRet is passed in X8, not X0 like a normal pointer parameter.
+  CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
+
+  // A SwiftError is passed in X19.
+  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
+  // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+  // up to eight each of GPR and FPR.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  // i128 is split to two i64s, we can't fit half to register X7.
+  CCIfType<[i64],
+           CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6],
+                                             [W0, W1, W2, W3, W4, W5, W6]>>>,
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                   [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  // If more than will fit in registers, pass them on the stack instead.
+  CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
+  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
+]>;
+
+def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+  CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
+  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
+  // Handle all scalar types as either i64 or f64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+  CCIfType<[f16, f32],     CCPromoteToType<f64>>,
+
+  // Everything is on the stack.
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
+]>;
+
+// The WebKit_JS calling convention only passes the first argument (the callee)
+// in register and the remaining arguments on stack. We allow 32bit stack slots,
+// so that WebKit can write partial values in the stack and define the other
+// 32bit quantity as undef.
+def CC_AArch64_WebKit_JS : CallingConv<[
+  // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
+
+  // Pass the remaining arguments on the stack instead.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_AArch64_WebKit_JS : CallingConv<[
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
+                                          [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+  CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM64 Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+// This calling convention is specific to the Glasgow Haskell Compiler.
+// The only documentation is the GHC source code, specifically the C header
+// file:
+//
+//     https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h
+//
+// which defines the registers for the Spineless Tagless G-Machine (STG) that
+// GHC uses to implement lazy evaluation. The generic STG machine has a set of
+// registers which are mapped to appropriate set of architecture specific
+// registers for each CPU architecture.
+//
+// The STG Machine is documented here:
+//
+//    https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode
+//
+// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI
+// register mapping".
+
+def CC_AArch64_GHC : CallingConv<[
+  CCIfType<[iPTR], CCBitConvertToType<i64>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>,
+  CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>,
+
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>>
+]>;
+
+// FIXME: LR is only callee-saved in the sense that *we* preserve it and are
+// presumably a callee to someone. External functions may not do so, but this
+// is currently safe since BL has LR as an implicit-def and what happens after a
+// tail call doesn't matter.
+//
+// It would be better to model its preservation semantics properly (create a
+// vreg on entry, use it in RET & tail call generation; make that vreg def if we
+// end up saving LR as part of a call frame). Watch this space...
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+                                           X23, X24, X25, X26, X27, X28,
+                                           D8,  D9,  D10, D11,
+                                           D12, D13, D14, D15)>;
+
+// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
+// 'this' and the pointer return value are both passed in X0 in these cases,
+// this can be partially modelled by treating X0 as a callee-saved register;
+// only the resulting RegMask is used; the SaveList is ignored
+//
+// (For generic ARM 64-bit ABI code, clang will not generate constructors or
+// destructors with 'this' returns, so this RegMask will not be used in that
+// case)
+def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
+
+def CSR_AArch64_AAPCS_SwiftError
+    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_AArch64_TLS_Darwin
+    : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+                           FP,
+                           (sequence "Q%u", 0, 31))>;
+
+// We can only handle a register pair with adjacent registers, the register pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
+// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
+def CSR_AArch64_CXX_TLS_Darwin
+    : CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+                           (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+                           (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_AArch64_CXX_TLS_Darwin_PE
+    : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
+    : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+
+// The ELF stub used for TLS-descriptor access saves every feasible
+// register. Only X0 and LR are clobbered.
+def CSR_AArch64_TLS_ELF
+    : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
+                           (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_AllRegs
+    : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
+                           (sequence "X%u", 0, 28), FP, LR, SP,
+                           (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
+                           (sequence "S%u", 0, 31), (sequence "D%u", 0, 31),
+                           (sequence "Q%u", 0, 31))>;
+
+def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_AArch64_RT_MostRegs :  CalleeSavedRegs<(add CSR_AArch64_AAPCS,
+                                                (sequence "X%u", 9, 15))>;
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
new file mode 100644
index 000000000000..6f8dd3e3ac0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -0,0 +1,151 @@
+//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Local-dynamic access to thread-local variables proceeds in three stages.
+//
+// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated
+//    in much the same way as a general-dynamic TLS-descriptor access against
+//    the special symbol _TLS_MODULE_BASE.
+// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
+//    instructions with "dtprel" modifiers.
+// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
+//    true address.
+//
+// This is only better than general-dynamic access to the variable if two or
+// more of the first stage TLS-descriptor calculations can be combined. This
+// pass looks through a function and performs such combinations.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define TLSCLEANUP_PASS_NAME "AArch64 Local Dynamic TLS Access Clean-up"
+
+namespace {
+struct LDTLSCleanup : public MachineFunctionPass {
+  static char ID;
+  LDTLSCleanup() : MachineFunctionPass(ID) {
+    initializeLDTLSCleanupPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
+      // No point folding accesses if there isn't at least two.
+      return false;
+    }
+
+    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    return VisitNode(DT->getRootNode(), 0);
+  }
+
+  // Visit the dominator subtree rooted at Node in pre-order.
+  // If TLSBaseAddrReg is non-null, then use that to replace any
+  // TLS_base_addr instructions. Otherwise, create the register
+  // when the first such instruction is seen, and then use it
+  // as we encounter more instructions.
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+    MachineBasicBlock *BB = Node->getBlock();
+    bool Changed = false;
+
+    // Traverse the current block.
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      switch (I->getOpcode()) {
+      case AArch64::TLSDESC_CALLSEQ:
+        // Make sure it's a local dynamic access.
+        if (!I->getOperand(0).isSymbol() ||
+            strcmp(I->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_"))
+          break;
+
+        if (TLSBaseAddrReg)
+          I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
+        else
+          I = setRegister(*I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+      }
+    }
+
+    // Visit the children of this block in the dominator tree.
+    for (MachineDomTreeNode *N : *Node) {
+      Changed |= VisitNode(N, TLSBaseAddrReg);
+    }
+
+    return Changed;
+  }
+
+  // Replace the TLS_base_addr instruction I with a copy from
+  // TLSBaseAddrReg, returning the new instruction.
+  MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I,
+                                       unsigned TLSBaseAddrReg) {
+    MachineFunction *MF = I.getParent()->getParent();
+    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+    // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+    // code sequence assumes the address will be.
+    MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY), AArch64::X0)
+                             .addReg(TLSBaseAddrReg);
+
+    // Erase the TLS_base_addr instruction.
+    I.eraseFromParent();
+
+    return Copy;
+  }
+
+  // Create a virtal register in *TLSBaseAddrReg, and populate it by
+  // inserting a copy instruction after I. Returns the new instruction.
+  MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+    MachineFunction *MF = I.getParent()->getParent();
+    const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+    // Create a virtual register for the TLS base address.
+    MachineRegisterInfo &RegInfo = MF->getRegInfo();
+    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+
+    // Insert a copy from X0 to TLSBaseAddrReg for later.
+    MachineInstr *Copy =
+        BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(),
+                TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+            .addReg(AArch64::X0);
+
+    return Copy;
+  }
+
+  StringRef getPassName() const override { return TLSCLEANUP_PASS_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+INITIALIZE_PASS(LDTLSCleanup, "aarch64-local-dynamic-tls-cleanup",
+                TLSCLEANUP_PASS_NAME, false, false)
+
+char LDTLSCleanup::ID = 0;
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
+  return new LDTLSCleanup();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
new file mode 100644
index 000000000000..7666011f75b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -0,0 +1,1108 @@
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collect the Linker Optimization Hint (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will be known at link time.
+// For instance, consider the following sequence:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+//     L3: ldr xC, sym+#imm
+// It may also be turned into either the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xB, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+//     L1: adr xA, sym
+//     L3: ldr xC, [xB, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+//   respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+//   only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+//   @PAGE/@PAGEOFF with no additional constants
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+//   - .loh AdrpAddLdr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdrGotLdr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdr L1, L3:
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xA, sym@PAGEOFF]
+//   - .loh AdrpAddStr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpLdrGotStr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpAdd L1, L2:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//   For all these LOHs, L1, L2, L3 form a simple chain:
+//   L1 result is used only by L2 and L2 result by L3.
+//   L3 LOH-related argument is defined only by L2 and L2 LOH-related argument
+//   by L1.
+// All these LOHs aim at using more efficient load/store patterns by folding
+// some instructions used to compute the address directly into the load/store.
+//
+// * So called ADRP-related:
+//  - .loh AdrpAdrp L2, L1:
+//    L2: ADRP xA, sym1@PAGE
+//    L1: ADRP xA, sym2@PAGE
+//    L2 dominates L1 and xA is not redifined between L2 and L1
+// This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
+//     1. Associates them a label.
+//     2. Emits them in a MCStreamer (EmitLOHDirective).
+//         - The MCMachOStreamer records them into the MCAssembler.
+//         - The MCAsmStreamer prints them.
+//         - Other MCStreamers ignore them.
+//     3. Closes the MCStreamer:
+//         - The MachObjectWriter gets them from the MCAssembler and writes
+//           them in the object file.
+//         - Other ObjectWriters ignore them.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-collect-loh"
+
+static cl::opt<bool>
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
+                   cl::desc("Restrict analysis to registers invovled"
+                            " in LOHs"),
+                   cl::init(true));
+
+static cl::opt<bool>
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
+                    cl::desc("Restrict analysis at basic block scope"),
+                    cl::init(true));
+
+STATISTIC(NumADRPSimpleCandidate,
+          "Number of simplifiable ADRP dominate by another");
+#ifndef NDEBUG
+STATISTIC(NumADRPComplexCandidate2,
+          "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+          "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+          "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+#endif // NDEBUG
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+#ifndef NDEBUG
+STATISTIC(NumCplxLvl1, "Number of complex case of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex case of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2");
+#endif // NDEBUG
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)"
+
+namespace {
+struct AArch64CollectLOH : public MachineFunctionPass {
+  static char ID;
+  AArch64CollectLOH() : MachineFunctionPass(ID) {
+    initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override { return AARCH64_COLLECT_LOH_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+
+private:
+};
+
+/// A set of MachineInstruction.
+typedef SetVector<const MachineInstr *> SetOfMachineInstr;
+/// Map a basic block to a set of instructions per register.
+/// This is used to represent the exposed uses of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *,
+                  std::unique_ptr<SetOfMachineInstr[]>>
+BlockToSetOfInstrsPerColor;
+/// Map a basic block to an instruction per register.
+/// This is used to represent the live-out definitions of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *,
+                  std::unique_ptr<const MachineInstr *[]>>
+BlockToInstrPerColor;
+/// Map an instruction to a set of instructions. Used to represent the
+/// mapping def to reachable uses or use to definitions.
+typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
+/// Map a basic block to a BitVector.
+/// This is used to record the kill registers per basic block.
+typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
+
+/// Map a register to a dense id.
+typedef DenseMap<unsigned, unsigned> MapRegToId;
+/// Map a dense id to a register. Used for debug purposes.
+typedef SmallVector<unsigned, 32> MapIdToReg;
+} // end anonymous namespace.
+
+char AArch64CollectLOH::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+                      AARCH64_COLLECT_LOH_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+                    AARCH64_COLLECT_LOH_NAME, false, false)
+
+/// Given a couple (MBB, reg) get the corresponding set of instruction from
+/// the given "sets".
+/// If this couple does not reference any set, an empty set is added to "sets"
+/// for this couple and returned.
+/// \param nbRegs is used internally allocate some memory. It must be consistent
+/// with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+                                 const MachineBasicBlock &MBB, unsigned reg,
+                                 unsigned nbRegs) {
+  SetOfMachineInstr *result;
+  BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
+  if (it != sets.end())
+    result = it->second.get();
+  else
+    result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get();
+
+  return result[reg];
+}
+
+/// Given a couple (reg, MI) get the corresponding set of instructions from the
+/// the given "sets".
+/// This is used to get the uses record in sets of a definition identified by
+/// MI and reg, i.e., MI defines reg.
+/// If the couple does not reference anything, an empty set is added to
+/// "sets[reg]".
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+                                  const MachineInstr &MI) {
+  return sets[reg][&MI];
+}
+
+/// Same as getUses but does not modify the input map: sets.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+                                        const MachineInstr &MI) {
+  InstrToInstrs::const_iterator Res = sets[reg].find(&MI);
+  if (Res != sets[reg].end())
+    return &(Res->second);
+  return nullptr;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - its the generated definitions.
+/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
+/// the list of uses of exposed defintions.
+/// \param ADRPMode specifies to only consider ADRP instructions for generated
+/// definition. It also consider definitions of ADRP instructions as uses and
+/// ignore other uses. The ADRPMode is used to collect the information for LHO
+/// that involve ADRP operation only.
+static void initReachingDef(const MachineFunction &MF,
+                            InstrToInstrs *ColorOpToReachedUses,
+                            BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                            BlockToSetOfInstrsPerColor &ReachableUses,
+                            const MapRegToId &RegToId,
+                            const MachineInstr *DummyOp, bool ADRPMode) {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  unsigned NbReg = RegToId.size();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    auto &BBGen = Gen[&MBB];
+    BBGen = make_unique<const MachineInstr *[]>(NbReg);
+    std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr);
+
+    BitVector &BBKillSet = Kill[&MBB];
+    BBKillSet.resize(NbReg);
+    for (const MachineInstr &MI : MBB) {
+      bool IsADRP = MI.getOpcode() == AArch64::ADRP;
+
+      // Process uses first.
+      if (IsADRP || !ADRPMode)
+        for (const MachineOperand &MO : MI.operands()) {
+          // Treat ADRP def as use, as the goal of the analysis is to find
+          // ADRP defs reached by other ADRP defs.
+          if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+              (ADRPMode && (!IsADRP || !MO.isDef())))
+            continue;
+          unsigned CurReg = MO.getReg();
+          MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+          if (ItCurRegId == RegToId.end())
+            continue;
+          CurReg = ItCurRegId->second;
+
+          // if CurReg has not been defined, this use is reachable.
+          if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+            getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
+          // current basic block definition for this color, if any, is in Gen.
+          if (BBGen[CurReg])
+            getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
+        }
+
+      // Process clobbers.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isRegMask())
+          continue;
+        // Clobbers kill the related colors.
+        const uint32_t *PreservedRegs = MO.getRegMask();
+
+        // Set generated regs.
+        for (const auto &Entry : RegToId) {
+          unsigned Reg = Entry.second;
+          // Use the global register ID when querying APIs external to this
+          // pass.
+          if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+            // Do not register clobbered definition for no ADRP.
+            // This definition is not used anyway (otherwise register
+            // allocation is wrong).
+            BBGen[Reg] = ADRPMode ? &MI : nullptr;
+            BBKillSet.set(Reg);
+          }
+        }
+      }
+
+      // Process register defs.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        unsigned CurReg = MO.getReg();
+        MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+        if (ItCurRegId == RegToId.end())
+          continue;
+
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+          MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+          // If this alias has not been recorded, then it is not interesting
+          // for the current analysis.
+          // We can end up in this situation because of tuple registers.
+          // E.g., Let say we are interested in S1. When we register
+          // S1, we will also register its aliases and in particular
+          // the tuple Q1_Q2.
+          // Now, when we encounter Q1_Q2, we will look through its aliases
+          // and will find that S2 is not registered.
+          if (ItRegId == RegToId.end())
+            continue;
+
+          BBKillSet.set(ItRegId->second);
+          BBGen[ItRegId->second] = &MI;
+        }
+        BBGen[ItCurRegId->second] = &MI;
+      }
+    }
+
+    // If we restrict our analysis to basic block scope, conservatively add a
+    // dummy
+    // use for each generated value.
+    if (!ADRPMode && DummyOp && !MBB.succ_empty())
+      for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+        if (BBGen[CurReg])
+          getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
+  }
+}
+
+/// Reaching def core algorithm:
+/// while an Out has changed
+///    for each bb
+///       for each color
+///           In[bb][color] = U Out[bb.predecessors][color]
+///           insert reachableUses[bb][color] in each in[bb][color]
+///                 op.reachedUses
+///
+///           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(const MachineFunction &MF,
+                                 InstrToInstrs *ColorOpToReachedUses,
+                                 BlockToSetOfInstrsPerColor &In,
+                                 BlockToSetOfInstrsPerColor &Out,
+                                 BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                                 BlockToSetOfInstrsPerColor &ReachableUses,
+                                 unsigned NbReg) {
+  bool HasChanged;
+  do {
+    HasChanged = false;
+    for (const MachineBasicBlock &MBB : MF) {
+      unsigned CurReg;
+      for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+        SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBReachableUses =
+            getSet(ReachableUses, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
+        unsigned Size = BBOutSet.size();
+        //   In[bb][color] = U Out[bb.predecessors][color]
+        for (const MachineBasicBlock *PredMBB : MBB.predecessors()) {
+          SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
+          BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
+        }
+        //   insert reachableUses[bb][color] in each in[bb][color] op.reachedses
+        for (const MachineInstr *MI : BBInSet) {
+          SetOfMachineInstr &OpReachedUses =
+              getUses(ColorOpToReachedUses, CurReg, *MI);
+          OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
+        }
+        //           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+        if (!Kill[&MBB].test(CurReg))
+          BBOutSet.insert(BBInSet.begin(), BBInSet.end());
+        if (Gen[&MBB][CurReg])
+          BBOutSet.insert(Gen[&MBB][CurReg]);
+        HasChanged |= BBOutSet.size() != Size;
+      }
+    }
+  } while (HasChanged);
+}
+
+/// Reaching definition algorithm.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param ADRPMode specify whether the reaching def algorithm should be tuned
+/// for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will set for every exposed definition a use to
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(const MachineFunction &MF,
+                        InstrToInstrs *ColorOpToReachedUses,
+                        const MapRegToId &RegToId, bool ADRPMode = false,
+                        const MachineInstr *DummyOp = nullptr) {
+  // structures:
+  // For each basic block.
+  // Out: a set per color of definitions that reach the
+  //      out boundary of this block.
+  // In: Same as Out but for in boundary.
+  // Gen: generated color in this block (one operation per color).
+  // Kill: register set of killed color in this block.
+  // ReachableUses: a set per color of uses (operation) reachable
+  //                for "In" definitions.
+  BlockToSetOfInstrsPerColor Out, In, ReachableUses;
+  BlockToInstrPerColor Gen;
+  BlockToRegSet Kill;
+
+  // Initialize Gen, kill and reachableUses.
+  initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId,
+                  DummyOp, ADRPMode);
+
+  // Algo.
+  if (!DummyOp)
+    reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
+                         ReachableUses, RegToId.size());
+}
+
+#ifndef NDEBUG
+/// print the result of the reaching definition algorithm.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+                             unsigned NbReg, const TargetRegisterInfo *TRI,
+                             const MapIdToReg &IdToReg) {
+  unsigned CurReg;
+  for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+    DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      DEBUG(dbgs() << "Def:\n");
+      DEBUG(DefsIt.first->print(dbgs()));
+      DEBUG(dbgs() << "Reachable uses:\n");
+      for (const MachineInstr *MI : DefsIt.second) {
+        DEBUG(MI->print(dbgs()));
+      }
+    }
+  }
+}
+#endif // NDEBUG
+
+/// Answer the following question: Can Def be one of the definition
+/// involved in a part of a LOH?
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+  unsigned Opc = Def->getOpcode();
+  // Accept ADRP, ADDLow and LOADGot.
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADRP:
+    return true;
+  case AArch64::ADDXri:
+    // Check immediate to see if the immediate is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return true;
+    }
+  case AArch64::LDRXui:
+    // Check immediate to see if the immediate is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+      return true;
+    }
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can the end of a LOH chain involving a
+/// store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRBBui:
+  case AArch64::STRHHui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    // In case we have str xA, [xA, #imm], this is two different uses
+    // of xA and we cannot fold, otherwise the xA stored may be wrong,
+    // even if #imm == 0.
+    if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
+      return true;
+  }
+  return false;
+}
+
+/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
+/// Build the Use to Defs information and filter out obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definition,
+/// i.e., no simple chain.
+/// \param ADRPMode -- \see initReachingDef.
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+                              const InstrToInstrs *ColorOpToReachedUses,
+                              const MapRegToId &RegToId,
+                              bool ADRPMode = false) {
+
+  SetOfMachineInstr NotCandidate;
+  unsigned NbReg = RegToId.size();
+  MapRegToId::const_iterator EndIt = RegToId.end();
+  for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
+    // If this color is never defined, continue.
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      for (const MachineInstr *MI : DefsIt.second) {
+        const MachineInstr *Def = DefsIt.first;
+        MapRegToId::const_iterator It;
+        // if all the reaching defs are not adrp, this use will not be
+        // simplifiable.
+        if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) ||
+            (!ADRPMode && !canDefBePartOfLOH(Def)) ||
+            (!ADRPMode && isCandidateStore(MI) &&
+             // store are LOH candidate iff the end of the chain is used as
+             // base.
+             ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt ||
+              It->second != CurReg))) {
+          NotCandidate.insert(MI);
+          continue;
+        }
+        // Do not consider self reaching as a simplifiable case for ADRP.
+        if (!ADRPMode || MI != DefsIt.first) {
+          UseToReachingDefs[MI].insert(DefsIt.first);
+          // If UsesIt has several reaching definitions, it is not
+          // candidate for simplificaton in non-ADRPMode.
+          if (!ADRPMode && UseToReachingDefs[MI].size() > 1)
+            NotCandidate.insert(MI);
+        }
+      }
+    }
+  }
+  for (const MachineInstr *Elem : NotCandidate) {
+    DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+    // It would have been better if we could just remove the entry
+    // from the map.  Because of that, we have to filter the garbage
+    // (second.empty) in the subsequence analysis.
+    UseToReachingDefs[Elem].clear();
+  }
+}
+
+/// Based on the use to defs information (in ADRPMode), compute the
+/// opportunities of LOH ADRP-related.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+                        AArch64FunctionInfo &AArch64FI,
+                        const MachineDominatorTree *MDT) {
+  DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+  for (const auto &Entry : UseToDefs) {
+    unsigned Size = Entry.second.size();
+    if (Size == 0)
+      continue;
+    if (Size == 1) {
+      const MachineInstr *L2 = *Entry.second.begin();
+      const MachineInstr *L1 = Entry.first;
+      if (!MDT->dominates(L2, L1)) {
+        DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+                     << '\n');
+        continue;
+      }
+      DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+      AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1});
+      ++NumADRPSimpleCandidate;
+    }
+#ifndef NDEBUG
+    else if (Size == 2)
+      ++NumADRPComplexCandidate2;
+    else if (Size == 3)
+      ++NumADRPComplexCandidate3;
+    else
+      ++NumADRPComplexCandidateOther;
+#endif
+    // if Size < 1, the use should have been removed from the candidates
+    assert(Size >= 1 && "No reaching defs for that use!");
+  }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)
+      return false;
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can load a litteral.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSWui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction is a LOH candidate.
+/// \param UseToDefs is used to check that Instr is at the end of LOH supported
+/// chain.
+/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are
+/// already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+                        const InstrToInstrs &UseToDefs,
+                        const MachineDominatorTree *MDT) {
+  if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+    return false;
+
+  const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+  if (Def->getOpcode() != AArch64::ADRP) {
+    // At this point, Def is ADDXri or LDRXui of the right type of
+    // symbol, because we filtered out the uses that were not defined
+    // by these kind of instructions (+ ADRP).
+
+    // Check if this forms a simple chain: each intermediate node must
+    // dominates the next one.
+    if (!MDT->dominates(Def, Instr))
+      return false;
+    // Move one node up in the simple chain.
+    if (UseToDefs.find(Def) ==
+            UseToDefs.end()
+            // The map may contain garbage we have to ignore.
+        ||
+        UseToDefs.find(Def)->second.empty())
+      return false;
+    Instr = Def;
+    Def = *UseToDefs.find(Def)->second.begin();
+  }
+  // Check if we reached the top of the simple chain:
+  // - top is ADRP.
+  // - check the simple chain property: each intermediate node must
+  // dominates the next one.
+  if (Def->getOpcode() == AArch64::ADRP)
+    return MDT->dominates(Def, Instr);
+  return false;
+}
+
+static bool registerADRCandidate(const MachineInstr &Use,
+                                 const InstrToInstrs &UseToDefs,
+                                 const InstrToInstrs *DefsPerColorToUses,
+                                 AArch64FunctionInfo &AArch64FI,
+                                 SetOfMachineInstr *InvolvedInLOHs,
+                                 const MapRegToId &RegToId) {
+  // Look for opportunities to turn ADRP -> ADD or
+  // ADRP -> LDR GOTPAGEOFF into ADR.
+  // If ADRP has more than one use. Give up.
+  if (Use.getOpcode() != AArch64::ADDXri &&
+      (Use.getOpcode() != AArch64::LDRXui ||
+       !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT)))
+    return false;
+  InstrToInstrs::const_iterator It = UseToDefs.find(&Use);
+  // The map may contain garbage that we need to ignore.
+  if (It == UseToDefs.end() || It->second.empty())
+    return false;
+  const MachineInstr &Def = **It->second.begin();
+  if (Def.getOpcode() != AArch64::ADRP)
+    return false;
+  // Check the number of users of ADRP.
+  const SetOfMachineInstr *Users =
+      getUses(DefsPerColorToUses,
+              RegToId.find(Def.getOperand(0).getReg())->second, Def);
+  if (Users->size() > 1) {
+    ++NumADRComplexCandidate;
+    return false;
+  }
+  ++NumADRSimpleCandidate;
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
+         "ADRP already involved in LOH.");
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
+         "ADD already involved in LOH.");
+  DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
+
+  AArch64FI.addLOHDirective(
+      Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot,
+      {&Def, &Use});
+  return true;
+}
+
+/// Based on the use to defs information (in non-ADRPMode), compute the
+/// opportunities of LOH non-ADRP-related
+static void computeOthers(const InstrToInstrs &UseToDefs,
+                          const InstrToInstrs *DefsPerColorToUses,
+                          AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId,
+                          const MachineDominatorTree *MDT) {
+  SetOfMachineInstr *InvolvedInLOHs = nullptr;
+#ifndef NDEBUG
+  SetOfMachineInstr InvolvedInLOHsStorage;
+  InvolvedInLOHs = &InvolvedInLOHsStorage;
+#endif // NDEBUG
+  DEBUG(dbgs() << "*** Compute LOH for Others\n");
+  // ADRP -> ADD/LDR -> LDR/STR pattern.
+  // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern.
+
+  // FIXME: When the statistics are not important,
+  // This initial filtering loop can be merged into the next loop.
+  // Currently, we didn't do it to have the same code for both DEBUG and
+  // NDEBUG builds. Indeed, the iterator of the second loop would need
+  // to be changed.
+  SetOfMachineInstr PotentialCandidates;
+  SetOfMachineInstr PotentialADROpportunities;
+  for (auto &Use : UseToDefs) {
+    // If no definition is available, this is a non candidate.
+    if (Use.second.empty())
+      continue;
+    // Keep only instructions that are load or store and at the end of
+    // a ADRP -> ADD/LDR/Nothing chain.
+    // We already filtered out the no-chain cases.
+    if (!isCandidate(Use.first, UseToDefs, MDT)) {
+      PotentialADROpportunities.insert(Use.first);
+      continue;
+    }
+    PotentialCandidates.insert(Use.first);
+  }
+
+  // Make the following distinctions for statistics as the linker does
+  // know how to decode instructions:
+  // - ADD/LDR/Nothing make there different patterns.
+  // - LDR/STR make two different patterns.
+  // Hence, 6 - 1 base patterns.
+  // (because ADRP-> Nothing -> STR is not simplifiable)
+
+  // The linker is only able to have a simple semantic, i.e., if pattern A
+  // do B.
+  // However, we want to see the opportunity we may miss if we were able to
+  // catch more complex cases.
+
+  // PotentialCandidates are result of a chain ADRP -> ADD/LDR ->
+  // A potential candidate becomes a candidate, if its current immediate
+  // operand is zero and all nodes of the chain have respectively only one user
+#ifndef NDEBUG
+  SetOfMachineInstr DefsOfPotentialCandidates;
+#endif
+  for (const MachineInstr *Candidate : PotentialCandidates) {
+    // Get the definition of the candidate i.e., ADD or LDR.
+    const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+    // Record the elements of the chain.
+    const MachineInstr *L1 = Def;
+    const MachineInstr *L2 = nullptr;
+    unsigned ImmediateDefOpc = Def->getOpcode();
+    if (Def->getOpcode() != AArch64::ADRP) {
+      // Check the number of users of this node.
+      const SetOfMachineInstr *Users =
+          getUses(DefsPerColorToUses,
+                  RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+      if (Users->size() > 1) {
+#ifndef NDEBUG
+        // if all the uses of this def are in potential candidate, this is
+        // a complex candidate of level 2.
+        bool IsLevel2 = true;
+        for (const MachineInstr *MI : *Users) {
+          if (!PotentialCandidates.count(MI)) {
+            ++NumTooCplxLvl2;
+            IsLevel2 = false;
+            break;
+          }
+        }
+        if (IsLevel2)
+          ++NumCplxLvl2;
+#endif // NDEBUG
+        PotentialADROpportunities.insert(Def);
+        continue;
+      }
+      L2 = Def;
+      Def = *UseToDefs.find(Def)->second.begin();
+      L1 = Def;
+    } // else the element in the middle of the chain is nothing, thus
+      // Def already contains the first element of the chain.
+
+    // Check the number of users of the first node in the chain, i.e., ADRP
+    const SetOfMachineInstr *Users =
+        getUses(DefsPerColorToUses,
+                RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+    if (Users->size() > 1) {
+#ifndef NDEBUG
+      // if all the uses of this def are in the defs of the potential candidate,
+      // this is a complex candidate of level 1
+      if (DefsOfPotentialCandidates.empty()) {
+        // lazy init
+        DefsOfPotentialCandidates = PotentialCandidates;
+        for (const MachineInstr *Candidate : PotentialCandidates) {
+          if (!UseToDefs.find(Candidate)->second.empty())
+            DefsOfPotentialCandidates.insert(
+                *UseToDefs.find(Candidate)->second.begin());
+        }
+      }
+      bool Found = false;
+      for (auto &Use : *Users) {
+        if (!DefsOfPotentialCandidates.count(Use)) {
+          ++NumTooCplxLvl1;
+          Found = true;
+          break;
+        }
+      }
+      if (!Found)
+        ++NumCplxLvl1;
+#endif // NDEBUG
+      continue;
+    }
+
+    bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
+    // If the chain is three instructions long and ldr is the second element,
+    // then this ldr must load form GOT, otherwise this is not a correct chain.
+    if (L2 && !IsL2Add &&
+        !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT))
+      continue;
+    SmallVector<const MachineInstr *, 3> Args;
+    MCLOHType Kind;
+    if (isCandidateLoad(Candidate)) {
+      if (!L2) {
+        // At this point, the candidate LOH indicates that the ldr instruction
+        // may use a direct access to the symbol. There is not such encoding
+        // for loads of byte and half.
+        if (!supportLoadFromLiteral(Candidate))
+          continue;
+
+        DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+                     << '\n');
+        Kind = MCLOH_AdrpLdr;
+        Args.push_back(L1);
+        Args.push_back(Candidate);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+        ++NumADRPToLDR;
+      } else {
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifndef NDEBUG
+        // get the immediate of the load
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToLDR;
+          else
+            ++NumLDRToLDR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToLDRWithImm;
+        else
+          ++NumLDRToLDRWithImm;
+#endif // NDEBUG
+      }
+    } else {
+      if (ImmediateDefOpc == AArch64::ADRP)
+        continue;
+      else {
+
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifndef NDEBUG
+        // get the immediate of the store
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToSTR;
+          else
+            ++NumLDRToSTR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToSTRWithImm;
+        else
+          ++NumLDRToSTRWithImm;
+#endif // DEBUG
+      }
+    }
+    AArch64FI.addLOHDirective(Kind, Args);
+  }
+
+  // Now, we grabbed all the big patterns, check ADR opportunities.
+  for (const MachineInstr *Candidate : PotentialADROpportunities)
+    registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
+                         InvolvedInLOHs, RegToId);
+}
+
+/// Look for every register defined by potential LOHs candidates.
+/// Map these registers with dense id in @p RegToId and vice-versa in
+/// @p IdToReg. @p IdToReg is populated only in DEBUG mode.
+static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId,
+                               MapIdToReg &IdToReg,
+                               const TargetRegisterInfo *TRI) {
+  unsigned CurRegId = 0;
+  if (!PreCollectRegister) {
+    unsigned NbReg = TRI->getNumRegs();
+    for (; CurRegId < NbReg; ++CurRegId) {
+      RegToId[CurRegId] = CurRegId;
+      DEBUG(IdToReg.push_back(CurRegId));
+      DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches"));
+    }
+    return;
+  }
+
+  DEBUG(dbgs() << "** Collect Involved Register\n");
+  for (const auto &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (!canDefBePartOfLOH(&MI) &&
+          !isCandidateLoad(&MI) && !isCandidateStore(&MI))
+        continue;
+
+      // Process defs
+      for (MachineInstr::const_mop_iterator IO = MI.operands_begin(),
+                                            IOEnd = MI.operands_end();
+           IO != IOEnd; ++IO) {
+        if (!IO->isReg() || !IO->isDef())
+          continue;
+        unsigned CurReg = IO->getReg();
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI)
+          if (RegToId.find(*AI) == RegToId.end()) {
+            DEBUG(IdToReg.push_back(*AI);
+                  assert(IdToReg[CurRegId] == *AI &&
+                         "Reg index mismatches insertion index."));
+            RegToId[*AI] = CurRegId++;
+            DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n');
+          }
+      }
+    }
+  }
+}
+
+bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+  MapRegToId RegToId;
+  MapIdToReg IdToReg;
+  AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+  assert(AArch64FI && "No MachineFunctionInfo for this function!");
+
+  DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
+
+  collectInvolvedReg(MF, RegToId, IdToReg, TRI);
+  if (RegToId.empty())
+    return false;
+
+  MachineInstr *DummyOp = nullptr;
+  if (BasicBlockScopeOnly) {
+    const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+    // For local analysis, create a dummy operation to record uses that are not
+    // local.
+    DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
+  }
+
+  unsigned NbReg = RegToId.size();
+  bool Modified = false;
+
+  // Start with ADRP.
+  InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // Compute the reaching def in ADRP mode, meaning ADRP definitions
+  // are first considered as uses.
+  reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp);
+  DEBUG(dbgs() << "ADRP reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Translate the definition to uses map into a use to definitions map to ease
+  // statistic computation.
+  InstrToInstrs ADRPToReachingDefs;
+  reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
+
+  // Compute LOH for ADRP.
+  computeADRP(ADRPToReachingDefs, *AArch64FI, MDT);
+  delete[] ColorOpToReachedUses;
+
+  // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern.
+  ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // first perform a regular reaching def analysis.
+  reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp);
+  DEBUG(dbgs() << "All reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Turn that into a use to defs to ease statistic computation.
+  InstrToInstrs UsesToReachingDefs;
+  reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
+
+  // Compute other than AdrpAdrp LOH.
+  computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId,
+                MDT);
+  delete[] ColorOpToReachedUses;
+
+  if (BasicBlockScopeOnly)
+    MF.DeleteMachineInstr(DummyOp);
+
+  return Modified;
+}
+
+/// createAArch64CollectLOHPass - returns an instance of the Statistic for
+/// linker optimization pass.
+FunctionPass *llvm::createAArch64CollectLOHPass() {
+  return new AArch64CollectLOH();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
new file mode 100644
index 000000000000..8b186328d125
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -0,0 +1,438 @@
+//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to make consecutive compares of values use same operands to
+// allow CSE pass to remove duplicated instructions.  For this it analyzes
+// branches and adjusts comparisons with immediate values by converting:
+//  * GE -> GT
+//  * GT -> GE
+//  * LT -> LE
+//  * LE -> LT
+// and adjusting immediate values appropriately.  It basically corrects two
+// immediate values towards each other to make them equal.
+//
+// Consider the following example in C:
+//
+//   if ((a < 5 && ...) || (a > 5 && ...)) {
+//        ~~~~~             ~~~~~
+//          ^                 ^
+//          x                 y
+//
+// Here both "x" and "y" expressions compare "a" with "5".  When "x" evaluates
+// to "false", "y" can just check flags set by the first comparison.  As a
+// result of the canonicalization employed by
+// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific
+// code, assembly ends up in the form that is not CSE friendly:
+//
+//     ...
+//     cmp      w8, #4
+//     b.gt     .LBB0_3
+//     ...
+//   .LBB0_3:
+//     cmp      w8, #6
+//     b.lt     .LBB0_6
+//     ...
+//
+// Same assembly after the pass:
+//
+//     ...
+//     cmp      w8, #5
+//     b.ge     .LBB0_3
+//     ...
+//   .LBB0_3:
+//     cmp      w8, #5     // <-- CSE pass removes this instruction
+//     b.le     .LBB0_6
+//     ...
+//
+// Currently only SUBS and ADDS followed by b.?? are supported.
+//
+// TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0"
+// TODO: handle other conditional instructions (e.g. CSET)
+// TODO: allow second branching to be anything if it doesn't require adjusting
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-condopt"
+
+STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted");
+
+namespace {
+class AArch64ConditionOptimizer : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineDominatorTree *DomTree;
+  const MachineRegisterInfo *MRI;
+
+public:
+  // Stores immediate, compare instruction opcode and branch condition (in this
+  // order) of adjusted comparison.
+  typedef std::tuple<int, unsigned, AArch64CC::CondCode> CmpInfo;
+
+  static char ID;
+  AArch64ConditionOptimizer() : MachineFunctionPass(ID) {
+    initializeAArch64ConditionOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  MachineInstr *findSuitableCompare(MachineBasicBlock *MBB);
+  CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
+  void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+  bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
+                int ToImm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "AArch64 Condition Optimizer";
+  }
+};
+} // end anonymous namespace
+
+char AArch64ConditionOptimizer::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
+                      "AArch64 CondOpt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
+                    "AArch64 CondOpt Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
+  return new AArch64ConditionOptimizer();
+}
+
+void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+// Finds compare instruction that corresponds to supported types of branching.
+// Returns the instruction or nullptr on failures or detecting unsupported
+// instructions.
+MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end())
+    return nullptr;
+
+  if (I->getOpcode() != AArch64::Bcc)
+    return nullptr;
+
+  // Since we may modify cmp of this MBB, make sure NZCV does not live out.
+  for (auto SuccBB : MBB->successors())
+    if (SuccBB->isLiveIn(AArch64::NZCV))
+      return nullptr;
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    --I;
+    assert(!I->isTerminator() && "Spurious terminator");
+    // Check if there is any use of NZCV between CMP and Bcc.
+    if (I->readsRegister(AArch64::NZCV))
+      return nullptr;
+    switch (I->getOpcode()) {
+    // cmp is an alias for subs with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
+    // cmn is an alias for adds with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri: {
+      unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
+      if (!I->getOperand(2).isImm()) {
+        DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+        return nullptr;
+      } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
+        DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n');
+        return nullptr;
+      } else if (!MRI->use_empty(I->getOperand(0).getReg())) {
+        DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+        return nullptr;
+      }
+      return &*I;
+    }
+    // Prevent false positive case like:
+    // cmp      w19, #0
+    // cinc     w0, w19, gt
+    // ...
+    // fcmp     d8, #0.0
+    // b.gt     .LBB0_5
+    case AArch64::FCMPDri:
+    case AArch64::FCMPSri:
+    case AArch64::FCMPESri:
+    case AArch64::FCMPEDri:
+
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
+      // Skip comparison instructions without immediate operands.
+      return nullptr;
+    }
+  }
+  DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+  return nullptr;
+}
+
+// Changes opcode adds <-> subs considering register operand width.
+static int getComplementOpc(int Opc) {
+  switch (Opc) {
+  case AArch64::ADDSWri: return AArch64::SUBSWri;
+  case AArch64::ADDSXri: return AArch64::SUBSXri;
+  case AArch64::SUBSWri: return AArch64::ADDSWri;
+  case AArch64::SUBSXri: return AArch64::ADDSXri;
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+// Changes form of comparison inclusive <-> exclusive.
+static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) {
+  switch (Cmp) {
+  case AArch64CC::GT: return AArch64CC::GE;
+  case AArch64CC::GE: return AArch64CC::GT;
+  case AArch64CC::LT: return AArch64CC::LE;
+  case AArch64CC::LE: return AArch64CC::LT;
+  default:
+    llvm_unreachable("Unexpected condition code");
+  }
+}
+
+// Transforms GT -> GE, GE -> GT, LT -> LE, LE -> LT by updating comparison
+// operator and condition code.
+AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp(
+    MachineInstr *CmpMI, AArch64CC::CondCode Cmp) {
+  unsigned Opc = CmpMI->getOpcode();
+
+  // CMN (compare with negative immediate) is an alias to ADDS (as
+  // "operand - negative" == "operand + positive")
+  bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri);
+
+  int Correction = (Cmp == AArch64CC::GT) ? 1 : -1;
+  // Negate Correction value for comparison with negative immediate (CMN).
+  if (Negative) {
+    Correction = -Correction;
+  }
+
+  const int OldImm = (int)CmpMI->getOperand(2).getImm();
+  const int NewImm = std::abs(OldImm + Correction);
+
+  // Handle +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions by
+  // adjusting compare instruction opcode.
+  if (OldImm == 0 && ((Negative && Correction == 1) ||
+                      (!Negative && Correction == -1))) {
+    Opc = getComplementOpc(Opc);
+  }
+
+  return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp));
+}
+
+// Applies changes to comparison instruction suggested by adjustCmp().
+void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
+    const CmpInfo &Info) {
+  int Imm;
+  unsigned Opc;
+  AArch64CC::CondCode Cmp;
+  std::tie(Imm, Opc, Cmp) = Info;
+
+  MachineBasicBlock *const MBB = CmpMI->getParent();
+
+  // Change immediate in comparison instruction (ADDS or SUBS).
+  BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
+      .addOperand(CmpMI->getOperand(0))
+      .addOperand(CmpMI->getOperand(1))
+      .addImm(Imm)
+      .addOperand(CmpMI->getOperand(3));
+  CmpMI->eraseFromParent();
+
+  // The fact that this comparison was picked ensures that it's related to the
+  // first terminator instruction.
+  MachineInstr &BrMI = *MBB->getFirstTerminator();
+
+  // Change condition in branch instruction.
+  BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
+      .addImm(Cmp)
+      .addOperand(BrMI.getOperand(1));
+  BrMI.eraseFromParent();
+
+  MBB->updateTerminator();
+
+  ++NumConditionsAdjusted;
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Returns true if parsing was successful, otherwise false is returned.
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+  // A normal br.cond simply has the condition code.
+  if (Cond[0].getImm() != -1) {
+    assert(Cond.size() == 1 && "Unknown Cond array format");
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    return true;
+  }
+  return false;
+}
+
+// Adjusts one cmp instruction to another one if result of adjustment will allow
+// CSE.  Returns true if compare instruction was changed, otherwise false is
+// returned.
+bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
+  AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm)
+{
+  CmpInfo Info = adjustCmp(CmpMI, Cmp);
+  if (std::get<0>(Info) == ToImm && std::get<1>(Info) == To->getOpcode()) {
+    modifyCmp(CmpMI, Info);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget().getInstrInfo();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  MRI = &MF.getRegInfo();
+
+  bool Changed = false;
+
+  // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+  // cmp-conversions from the same head block.
+  // Note that updateDomTree() modifies the children of the DomTree node
+  // currently being visited. The df_iterator supports that; it doesn't look at
+  // child_begin() / child_end() until after a node has been visited.
+  for (MachineDomTreeNode *I : depth_first(DomTree)) {
+    MachineBasicBlock *HBB = I->getBlock();
+
+    SmallVector<MachineOperand, 4> HeadCond;
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+      continue;
+    }
+
+    // Equivalence check is to skip loops.
+    if (!TBB || TBB == HBB) {
+      continue;
+    }
+
+    SmallVector<MachineOperand, 4> TrueCond;
+    MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
+    if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+      continue;
+    }
+
+    MachineInstr *HeadCmpMI = findSuitableCompare(HBB);
+    if (!HeadCmpMI) {
+      continue;
+    }
+
+    MachineInstr *TrueCmpMI = findSuitableCompare(TBB);
+    if (!TrueCmpMI) {
+      continue;
+    }
+
+    AArch64CC::CondCode HeadCmp;
+    if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
+      continue;
+    }
+
+    AArch64CC::CondCode TrueCmp;
+    if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+      continue;
+    }
+
+    const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
+    const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
+
+    DEBUG(dbgs() << "Head branch:\n");
+    DEBUG(dbgs() << "\tcondition: "
+          << AArch64CC::getCondCodeName(HeadCmp) << '\n');
+    DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+
+    DEBUG(dbgs() << "True branch:\n");
+    DEBUG(dbgs() << "\tcondition: "
+          << AArch64CC::getCondCodeName(TrueCmp) << '\n');
+    DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+
+    if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
+         (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
+        std::abs(TrueImm - HeadImm) == 2) {
+      // This branch transforms machine instructions that correspond to
+      //
+      // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...)
+      // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...)
+      //
+      // into
+      //
+      // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...)
+      // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp);
+      CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp);
+      if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) &&
+          std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) {
+        modifyCmp(HeadCmpMI, HeadCmpInfo);
+        modifyCmp(TrueCmpMI, TrueCmpInfo);
+        Changed = true;
+      }
+    } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) ||
+                (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) &&
+                std::abs(TrueImm - HeadImm) == 1) {
+      // This branch transforms machine instructions that correspond to
+      //
+      // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...)
+      // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...)
+      //
+      // into
+      //
+      // 1) (a <= {NewImm} && ...) || (a >  {NewImm} && ...)
+      // 2) (a <  {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      // GT -> GE transformation increases immediate value, so picking the
+      // smaller one; LT -> LE decreases immediate value so invert the choice.
+      bool adjustHeadCond = (HeadImm < TrueImm);
+      if (HeadCmp == AArch64CC::LT) {
+          adjustHeadCond = !adjustHeadCond;
+      }
+
+      if (adjustHeadCond) {
+        Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm);
+      } else {
+        Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm);
+      }
+    }
+    // Other transformation cases almost never occur due to generation of < or >
+    // comparisons instead of <= and >=.
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
new file mode 100644
index 000000000000..da09b36cac9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -0,0 +1,913 @@
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+    "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
+    cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
+                            cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+//                                 SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+//   From:         Head            To:         Head
+//                 / |                         CmpBB
+//                /  |                         / |
+//               |  CmpBB                     /  |
+//               |  / |                    Tail  |
+//               | /  |                      |   |
+//              Tail  |                      |   |
+//                |   |                      |   |
+//               ... ...                    ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The AArch64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+//    if (a == 5 || b == 17)
+//      foo();
+//
+//    Head:
+//       cmp  w0, #5
+//       b.eq Tail
+//    CmpBB:
+//       cmp  w1, #17
+//       b.eq Tail
+//    ...
+//    Tail:
+//      bl _foo
+//
+//  Becomes:
+//
+//    Head:
+//       cmp  w0, #5
+//       ccmp w1, #17, 4, ne  ; 4 = nZcv
+//       b.eq Tail
+//    ...
+//    Tail:
+//      bl _foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+public:
+  /// The first block containing a conditional branch, dominating everything
+  /// else.
+  MachineBasicBlock *Head;
+
+  /// The block containing cmp+br.cond with a successor shared with Head.
+  MachineBasicBlock *CmpBB;
+
+  /// The common successor for Head and CmpBB.
+  MachineBasicBlock *Tail;
+
+  /// The compare instruction in CmpBB that can be converted to a ccmp.
+  MachineInstr *CmpMI;
+
+private:
+  /// The branch condition in Head as determined by AnalyzeBranch.
+  SmallVector<MachineOperand, 4> HeadCond;
+
+  /// The condition code that makes Head branch to CmpBB.
+  AArch64CC::CondCode HeadCmpBBCC;
+
+  /// The branch condition in CmpBB.
+  SmallVector<MachineOperand, 4> CmpBBCond;
+
+  /// The condition code that makes CmpBB branch to Tail.
+  AArch64CC::CondCode CmpBBTailCC;
+
+  /// Check if the Tail PHIs are trivially convertible.
+  bool trivialTailPHIs();
+
+  /// Remove CmpBB from the Tail PHIs.
+  void updateTailPHIs();
+
+  /// Check if an operand defining DstReg is dead.
+  bool isDeadDef(unsigned DstReg);
+
+  /// Find the compare instruction in MBB that controls the conditional branch.
+  /// Return NULL if a convertible instruction can't be found.
+  MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+  /// Return true if all non-terminator instructions in MBB can be safely
+  /// speculated.
+  bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+  /// runOnMachineFunction - Initialize per-function data structures.
+  void runOnMachineFunction(MachineFunction &MF) {
+    this->MF = &MF;
+    TII = MF.getSubtarget().getInstrInfo();
+    TRI = MF.getSubtarget().getRegisterInfo();
+    MRI = &MF.getRegInfo();
+  }
+
+  /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+  /// internal state, and return true.
+  bool canConvert(MachineBasicBlock *MBB);
+
+  /// Cmo-convert the last block passed to canConvertCmp(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+  void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+  /// Return the expected code size delta if the conversion into a
+  /// conditional compare is performed.
+  int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
+// This means that no if-conversion is required when merging CmpBB into Head.
+bool SSACCmpConv::trivialTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    unsigned HeadReg = 0, CmpBBReg = 0;
+    // PHI operands come in (VReg, MBB) pairs.
+    for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+      MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+      unsigned Reg = I.getOperand(oi).getReg();
+      if (MBB == Head) {
+        assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+        HeadReg = Reg;
+      }
+      if (MBB == CmpBB) {
+        assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+        CmpBBReg = Reg;
+      }
+    }
+    if (HeadReg != CmpBBReg)
+      return false;
+  }
+  return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    // I is a PHI. It can have multiple entries for CmpBB.
+    for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
+      // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+      if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+        I.RemoveOperand(oi - 1);
+        I.RemoveOperand(oi - 2);
+      }
+    }
+  }
+}
+
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+  // Writes to the zero register are dead.
+  if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
+    return true;
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+  // A virtual register def without any uses will be marked dead later, and
+  // eventually replaced by the zero register.
+  return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+  // A normal br.cond simply has the condition code.
+  if (Cond[0].getImm() != -1) {
+    assert(Cond.size() == 1 && "Unknown Cond array format");
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    return true;
+  }
+  // For tbz and cbz instruction, the opcode is next.
+  switch (Cond[1].getImm()) {
+  default:
+    // This includes tbz / tbnz branches which can't be converted to
+    // ccmp + br.cond.
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::EQ;
+    return true;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::NE;
+    return true;
+  }
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end())
+    return nullptr;
+  // The terminator must be controlled by the flags.
+  if (!I->readsRegister(AArch64::NZCV)) {
+    switch (I->getOpcode()) {
+    case AArch64::CBZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZW:
+    case AArch64::CBNZX:
+      // These can be converted into a ccmp against #0.
+      return &*I;
+    }
+    ++NumCmpTermRejs;
+    DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+    return nullptr;
+  }
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    --I;
+    assert(!I->isTerminator() && "Spurious terminator");
+    switch (I->getOpcode()) {
+    // cmp is an alias for subs with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
+    // cmn is an alias for adds with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
+      // Check that the immediate operand is within range, ccmp wants a uimm5.
+      // Rd = SUBSri Rn, imm, shift
+      if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+        DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+        ++NumImmRangeRejs;
+        return nullptr;
+      }
+      LLVM_FALLTHROUGH;
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+      if (isDeadDef(I->getOperand(0).getReg()))
+        return &*I;
+      DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+      ++NumLiveDstRejs;
+      return nullptr;
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
+      return &*I;
+    }
+
+    // Check for flag reads and clobbers.
+    MIOperands::PhysRegInfo PRI =
+        MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI);
+
+    if (PRI.Read) {
+      // The ccmp doesn't produce exactly the same flags as the original
+      // compare, so reject the transform if there are uses of the flags
+      // besides the terminators.
+      DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
+      ++NumMultNZCVUses;
+      return nullptr;
+    }
+
+    if (PRI.Defined || PRI.Clobbered) {
+      DEBUG(dbgs() << "Not convertible compare: " << *I);
+      ++NumUnknNZCVDefs;
+      return nullptr;
+    }
+  }
+  DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+  return nullptr;
+}
+
+/// Determine if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// Only CmpMI is allowed to clobber the flags.
+///
+bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
+                                     const MachineInstr *CmpMI) {
+  // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
+  // get right.
+  if (!MBB->livein_empty()) {
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+    return false;
+  }
+
+  unsigned InstrCount = 0;
+
+  // Check all instructions, except the terminators. It is assumed that
+  // terminators never have side effects or define any used register values.
+  for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+    if (I.isDebugValue())
+      continue;
+
+    if (++InstrCount > BlockInstrLimit && !Stress) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+                   << BlockInstrLimit << " instructions.\n");
+      return false;
+    }
+
+    // There shouldn't normally be any phis in a single-predecessor block.
+    if (I.isPHI()) {
+      DEBUG(dbgs() << "Can't hoist: " << I);
+      return false;
+    }
+
+    // Don't speculate loads. Note that it may be possible and desirable to
+    // speculate GOT or constant pool loads that are guaranteed not to trap,
+    // but we don't support that for now.
+    if (I.mayLoad()) {
+      DEBUG(dbgs() << "Won't speculate load: " << I);
+      return false;
+    }
+
+    // We never speculate stores, so an AA pointer isn't necessary.
+    bool DontMoveAcrossStore = true;
+    if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+      DEBUG(dbgs() << "Can't speculate: " << I);
+      return false;
+    }
+
+    // Only CmpMI is allowed to clobber the flags.
+    if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
+      DEBUG(dbgs() << "Clobbers flags: " << I);
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
+/// candidate for cmp-conversion. Fill out the internal state.
+///
+bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
+  Head = MBB;
+  Tail = CmpBB = nullptr;
+
+  if (Head->succ_size() != 2)
+    return false;
+  MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+  MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+  // CmpBB can only have a single predecessor. Tail is allowed many.
+  if (Succ0->pred_size() != 1)
+    std::swap(Succ0, Succ1);
+
+  // Succ0 is our candidate for CmpBB.
+  if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
+    return false;
+
+  CmpBB = Succ0;
+  Tail = Succ1;
+
+  if (!CmpBB->isSuccessor(Tail))
+    return false;
+
+  // The CFG topology checks out.
+  DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
+               << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
+  ++NumConsidered;
+
+  // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
+  //
+  // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
+  // defined before The CmpBB cmp clobbers the flags. Alternatively, it should
+  // always be safe to sink the ccmp down to immediately before the CmpBB
+  // terminators.
+  if (!trivialTailPHIs()) {
+    DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+    ++NumPhiRejs;
+    return false;
+  }
+
+  if (!Tail->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // CmpBB should never have PHIs since Head is its only predecessor.
+  // FIXME: Clean them up if it happens.
+  if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+    DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+    ++NumPhi2Rejs;
+    return false;
+  }
+
+  if (!CmpBB->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // The branch we're looking to eliminate must be analyzable.
+  HeadCond.clear();
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) {
+    DEBUG(dbgs() << "Head branch not analyzable.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // This is weird, probably some sort of degenerate CFG, or an edge to a
+  // landing pad.
+  if (!TBB || HeadCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(HeadCond, HeadCmpBBCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on Head\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // Make sure the branch direction is right.
+  if (TBB != CmpBB) {
+    assert(TBB == Tail && "Unexpected TBB");
+    HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
+  }
+
+  CmpBBCond.clear();
+  TBB = FBB = nullptr;
+  if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+    DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!TBB || CmpBBCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (TBB != Tail)
+    CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
+
+  DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+               << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
+               << '\n');
+
+  CmpMI = findConvertibleCompare(CmpBB);
+  if (!CmpMI)
+    return false;
+
+  if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+    ++NumSpeculateRejs;
+    return false;
+  }
+  return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+  DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+               << Head->getNumber() << ":\n" << *CmpBB);
+
+  // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+  // Update the CFG first.
+  updateTailPHIs();
+  Head->removeSuccessor(CmpBB, true);
+  CmpBB->removeSuccessor(Tail, true);
+  Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+  DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+  TII->removeBranch(*Head);
+
+  // If the Head terminator was one of the cbz / tbz branches with built-in
+  // compare, we need to insert an explicit compare instruction in its place.
+  if (HeadCond[0].getImm() == -1) {
+    ++NumCompBranches;
+    unsigned Opc = 0;
+    switch (HeadCond[1].getImm()) {
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+      Opc = AArch64::SUBSWri;
+      break;
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      Opc = AArch64::SUBSXri;
+      break;
+    default:
+      llvm_unreachable("Cannot convert Head branch");
+    }
+    const MCInstrDesc &MCID = TII->get(Opc);
+    // Create a dummy virtual register for the SUBS def.
+    unsigned DestReg =
+        MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+    // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+    BuildMI(*Head, Head->end(), TermDL, MCID)
+        .addReg(DestReg, RegState::Define | RegState::Dead)
+        .addOperand(HeadCond[2])
+        .addImm(0)
+        .addImm(0);
+    // SUBS uses the GPR*sp register classes.
+    MRI->constrainRegClass(HeadCond[2].getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  }
+
+  Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+  // Now replace CmpMI with a ccmp instruction that also considers the incoming
+  // flags.
+  unsigned Opc = 0;
+  unsigned FirstOp = 1;   // First CmpMI operand to copy.
+  bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+  switch (CmpMI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown compare opcode");
+  case AArch64::SUBSWri:    Opc = AArch64::CCMPWi; break;
+  case AArch64::SUBSWrr:    Opc = AArch64::CCMPWr; break;
+  case AArch64::SUBSXri:    Opc = AArch64::CCMPXi; break;
+  case AArch64::SUBSXrr:    Opc = AArch64::CCMPXr; break;
+  case AArch64::ADDSWri:    Opc = AArch64::CCMNWi; break;
+  case AArch64::ADDSWrr:    Opc = AArch64::CCMNWr; break;
+  case AArch64::ADDSXri:    Opc = AArch64::CCMNXi; break;
+  case AArch64::ADDSXrr:    Opc = AArch64::CCMNXr; break;
+  case AArch64::FCMPSrr:    Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+  case AArch64::FCMPDrr:    Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+  case AArch64::FCMPESrr:   Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+  case AArch64::FCMPEDrr:   Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+    Opc = AArch64::CCMPWi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    Opc = AArch64::CCMPXi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  }
+
+  // The ccmp instruction should set the flags according to the comparison when
+  // Head would have branched to CmpBB.
+  // The NZCV immediate operand should provide flags for the case where Head
+  // would have branched to Tail. These flags should cause the new Head
+  // terminator to branch to tail.
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+  const MCInstrDesc &MCID = TII->get(Opc);
+  MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+                         TII->getRegClass(MCID, 0, TRI, *MF));
+  if (CmpMI->getOperand(FirstOp + 1).isReg())
+    MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  MachineInstrBuilder MIB =
+      BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+          .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+  if (isZBranch)
+    MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+  else
+    MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+  MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+  // If CmpMI was a terminator, we need a new conditional branch to replace it.
+  // This now becomes a Head terminator.
+  if (isZBranch) {
+    bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+                CmpMI->getOpcode() == AArch64::CBNZX;
+    BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+        .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
+        .addOperand(CmpMI->getOperand(1)); // Branch target.
+  }
+  CmpMI->eraseFromParent();
+  Head->updateTerminator();
+
+  RemovedBlocks.push_back(CmpBB);
+  CmpBB->eraseFromParent();
+  DEBUG(dbgs() << "Result:\n" << *Head);
+  ++NumConverted;
+}
+
+int SSACCmpConv::expectedCodeSizeDelta() const {
+  int delta = 0;
+  // If the Head terminator was one of the cbz / tbz branches with built-in
+  // compare, we need to insert an explicit compare instruction in its place
+  // plus a branch instruction.
+  if (HeadCond[0].getImm() == -1) {
+    switch (HeadCond[1].getImm()) {
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      // Therefore delta += 1
+      delta = 1;
+      break;
+    default:
+      llvm_unreachable("Cannot convert Head branch");
+    }
+  }
+  // If the Cmp terminator was one of the cbz / tbz branches with
+  // built-in compare, it will be turned into a compare instruction
+  // into Head, but we do not save any instruction.
+  // Otherwise, we save the branch instruction.
+  switch (CmpMI->getOpcode()) {
+  default:
+    --delta;
+    break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    break;
+  }
+  return delta;
+}
+
+//===----------------------------------------------------------------------===//
+//                       AArch64ConditionalCompares Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64ConditionalCompares : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MCSchedModel SchedModel;
+  // Does the proceeded function has Oz attribute.
+  bool MinSize;
+  MachineRegisterInfo *MRI;
+  MachineDominatorTree *DomTree;
+  MachineLoopInfo *Loops;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+  SSACCmpConv CmpConv;
+
+public:
+  static char ID;
+  AArch64ConditionalCompares() : MachineFunctionPass(ID) {
+    initializeAArch64ConditionalComparesPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  StringRef getPassName() const override {
+    return "AArch64 Conditional Compares";
+  }
+
+private:
+  bool tryConvert(MachineBasicBlock *);
+  void updateDomTree(ArrayRef<MachineBasicBlock *> Removed);
+  void updateLoops(ArrayRef<MachineBasicBlock *> Removed);
+  void invalidateTraces();
+  bool shouldConvert();
+};
+} // end anonymous namespace
+
+char AArch64ConditionalCompares::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
+                      "AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
+                    "AArch64 CCMP Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionalCompares() {
+  return new AArch64ConditionalCompares();
+}
+
+void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<MachineTraceMetrics>();
+  AU.addPreserved<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void AArch64ConditionalCompares::updateDomTree(
+    ArrayRef<MachineBasicBlock *> Removed) {
+  // convert() removes CmpBB which was previously dominated by Head.
+  // CmpBB children should be transferred to Head.
+  MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
+  for (MachineBasicBlock *RemovedMBB : Removed) {
+    MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB);
+    assert(Node != HeadNode && "Cannot erase the head node");
+    assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
+    while (Node->getNumChildren())
+      DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+    DomTree->eraseNode(RemovedMBB);
+  }
+}
+
+/// Update LoopInfo after if-conversion.
+void
+AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+  if (!Loops)
+    return;
+  for (MachineBasicBlock *RemovedMBB : Removed)
+    Loops->removeBlock(RemovedMBB);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void AArch64ConditionalCompares::invalidateTraces() {
+  Traces->invalidate(CmpConv.Head);
+  Traces->invalidate(CmpConv.CmpBB);
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool AArch64ConditionalCompares::shouldConvert() {
+  // Stress testing mode disables all cost considerations.
+  if (Stress)
+    return true;
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  // Head dominates CmpBB, so it is always included in its trace.
+  MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB);
+
+  // If code size is the main concern
+  if (MinSize) {
+    int CodeSizeDelta = CmpConv.expectedCodeSizeDelta();
+    DEBUG(dbgs() << "Code size delta:  " << CodeSizeDelta << '\n');
+    // If we are minimizing the code size, do the conversion whatever
+    // the cost is.
+    if (CodeSizeDelta < 0)
+      return true;
+    if (CodeSizeDelta > 0) {
+      DEBUG(dbgs() << "Code size is increasing, give up on this one.\n");
+      return false;
+    }
+    // CodeSizeDelta == 0, continue with the regular heuristics
+  }
+
+  // Heuristic: The compare conversion delays the execution of the branch
+  // instruction because we must wait for the inputs to the second compare as
+  // well. The branch has no dependent instructions, but delaying it increases
+  // the cost of a misprediction.
+  //
+  // Set a limit on the delay we will accept.
+  unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4;
+
+  // Instruction depths can be computed for all trace instructions above CmpBB.
+  unsigned HeadDepth =
+      Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth;
+  unsigned CmpBBDepth =
+      Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth;
+  DEBUG(dbgs() << "Head depth:  " << HeadDepth
+               << "\nCmpBB depth: " << CmpBBDepth << '\n');
+  if (CmpBBDepth > HeadDepth + DelayLimit) {
+    DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit
+                 << " cycles.\n");
+    return false;
+  }
+
+  // Check the resource depth at the bottom of CmpBB - these instructions will
+  // be speculated.
+  unsigned ResDepth = Trace.getResourceDepth(true);
+  DEBUG(dbgs() << "Resources:   " << ResDepth << '\n');
+
+  // Heuristic: The speculatively executed instructions must all be able to
+  // merge into the Head block. The Head critical path should dominate the
+  // resource cost of the speculated instructions.
+  if (ResDepth > HeadDepth) {
+    DEBUG(dbgs() << "Too many instructions to speculate.\n");
+    return false;
+  }
+  return true;
+}
+
+bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  while (CmpConv.canConvert(MBB) && shouldConvert()) {
+    invalidateTraces();
+    SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+    CmpConv.convert(RemovedBlocks);
+    Changed = true;
+    updateDomTree(RemovedBlocks);
+    updateLoops(RemovedBlocks);
+  }
+  return Changed;
+}
+
+bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  SchedModel = MF.getSubtarget().getSchedModel();
+  MRI = &MF.getRegInfo();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+  MinSize = MF.getFunction()->optForMinSize();
+
+  bool Changed = false;
+  CmpConv.runOnMachineFunction(MF);
+
+  // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+  // cmp-conversions from the same head block.
+  // Note that updateDomTree() modifies the children of the DomTree node
+  // currently being visited. The df_iterator supports that; it doesn't look at
+  // child_begin() / child_end() until after a node has been visited.
+  for (auto *I : depth_first(DomTree))
+    if (tryConvert(I->getBlock()))
+      Changed = true;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 000000000000..30e2b2310456
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,149 @@
+//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file When allowed by the instruction, replace a dead definition of a GPR
+/// with the zero register. This makes the code a bit friendlier towards the
+/// hardware's register renamer.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-dead-defs"
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions"
+
+namespace {
+class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+  const TargetRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  const TargetInstrInfo *TII;
+  bool Changed;
+  void processMachineBasicBlock(MachineBasicBlock &MBB);
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {
+    initializeAArch64DeadRegisterDefinitionsPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  StringRef getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs",
+                AARCH64_DEAD_REG_DEF_NAME, false, false)
+
+static bool usesFrameIndex(const MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.uses())
+    if (MO.isFI())
+      return true;
+  return false;
+}
+
+void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
+    MachineBasicBlock &MBB) {
+  const MachineFunction &MF = *MBB.getParent();
+  for (MachineInstr &MI : MBB) {
+    if (usesFrameIndex(MI)) {
+      // We need to skip this instruction because while it appears to have a
+      // dead def it uses a frame index which might expand into a multi
+      // instruction sequence during EPI.
+      DEBUG(dbgs() << "    Ignoring, operand is frame index\n");
+      continue;
+    }
+    if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) {
+      // It is not allowed to write to the same register (not even the zero
+      // register) twice in a single instruction.
+      DEBUG(dbgs() << "    Ignoring, XZR or WZR already used by the instruction\n");
+      continue;
+    }
+    const MCInstrDesc &Desc = MI.getDesc();
+    for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
+      MachineOperand &MO = MI.getOperand(I);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      // We should not have any relevant physreg defs that are replacable by
+      // zero before register allocation. So we just check for dead vreg defs.
+      unsigned Reg = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+          (!MO.isDead() && !MRI->use_nodbg_empty(Reg)))
+        continue;
+      assert(!MO.isImplicit() && "Unexpected implicit def!");
+      DEBUG(dbgs() << "  Dead def operand #" << I << " in:\n    ";
+            MI.print(dbgs()));
+      // Be careful not to change the register if it's a tied operand.
+      if (MI.isRegTiedToUseOperand(I)) {
+        DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
+        continue;
+      }
+      const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF);
+      unsigned NewReg;
+      if (RC == nullptr) {
+        DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
+        continue;
+      } else if (RC->contains(AArch64::WZR))
+        NewReg = AArch64::WZR;
+      else if (RC->contains(AArch64::XZR))
+        NewReg = AArch64::XZR;
+      else {
+        DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
+        continue;
+      }
+      DEBUG(dbgs() << "    Replacing with zero register. New:\n      ");
+      MO.setReg(NewReg);
+      MO.setIsDead();
+      DEBUG(MI.print(dbgs()));
+      ++NumDeadDefsReplaced;
+      Changed = true;
+      // Only replace one dead register, see check for zero register above.
+      break;
+    }
+  }
+}
+
+// Scan the function for instructions that have a dead definition of a
+// register. Replace that register with the zero register when possible.
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TRI = MF.getSubtarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+  DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+  Changed = false;
+  for (auto &MBB : MF)
+    processMachineBasicBlock(MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64DeadRegisterDefinitions() {
+  return new AArch64DeadRegisterDefinitions();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..fe1c0beee0eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -0,0 +1,965 @@
+//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations.  This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
+
+namespace {
+class AArch64ExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+  AArch64ExpandPseudo() : MachineFunctionPass(ID) {
+    initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
+  }
+
+  const AArch64InstrInfo *TII;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; }
+
+private:
+  bool expandMBB(MachineBasicBlock &MBB);
+  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                MachineBasicBlock::iterator &NextMBBI);
+  bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned BitSize);
+
+  bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                      unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
+                      unsigned ExtendImm, unsigned ZeroReg,
+                      MachineBasicBlock::iterator &NextMBBI);
+  bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          MachineBasicBlock::iterator &NextMBBI);
+};
+char AArch64ExpandPseudo::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
+                AARCH64_EXPAND_PSEUDO_NAME, false, false)
+
+/// \brief Transfer implicit operands on the pseudo instruction to the
+/// instructions created from the expansion.
+static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
+                           MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
+       ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+/// \brief Helper function which extracts the specified 16-bit chunk from a
+/// 64-bit value.
+static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+
+  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
+}
+
+/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
+/// value. Indices correspond to element numbers in a v4i16.
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
+  assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
+  const unsigned ShiftAmt = ToIdx * 16;
+
+  // Replicate the source chunk to the destination position.
+  const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
+  // Clear the destination chunk.
+  Imm &= ~(0xFFFFLL << ShiftAmt);
+  // Insert the replicated chunk.
+  return Imm | Chunk;
+}
+
+/// \brief Helper function which tries to materialize a 64-bit value with an
+/// ORR + MOVK instruction sequence.
+static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
+                       MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator &MBBI,
+                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
+  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
+  const unsigned ShiftAmt = ChunkIdx * 16;
+
+  uint64_t Encoding;
+  if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
+    // Create the ORR-immediate instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(AArch64::XZR)
+            .addImm(Encoding);
+
+    // Create the MOVK instruction.
+    const unsigned Imm16 = getChunk(UImm, ChunkIdx);
+    const unsigned DstReg = MI.getOperand(0).getReg();
+    const bool DstIsDead = MI.getOperand(0).isDead();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    transferImpOps(MI, MIB, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
+/// can be materialized with an ORR instruction.
+static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
+  Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
+
+  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+}
+
+/// \brief Check for identical 16-bit chunks within the constant and if so
+/// materialize them with a single ORR instruction. The remaining one or two
+/// 16-bit chunks will be materialized with MOVK instructions.
+///
+/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
+/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
+/// an ORR instruction.
+///
+static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
+                                 MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI,
+                                 const AArch64InstrInfo *TII) {
+  typedef DenseMap<uint64_t, unsigned> CountMap;
+  CountMap Counts;
+
+  // Scan the constant and count how often every chunk occurs.
+  for (unsigned Idx = 0; Idx < 4; ++Idx)
+    ++Counts[getChunk(UImm, Idx)];
+
+  // Traverse the chunks to find one which occurs more than once.
+  for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
+       Chunk != End; ++Chunk) {
+    const uint64_t ChunkVal = Chunk->first;
+    const unsigned Count = Chunk->second;
+
+    uint64_t Encoding = 0;
+
+    // We are looking for chunks which have two or three instances and can be
+    // materialized with an ORR instruction.
+    if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
+      continue;
+
+    const bool CountThree = Count == 3;
+    // Create the ORR-immediate instruction.
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(AArch64::XZR)
+            .addImm(Encoding);
+
+    const unsigned DstReg = MI.getOperand(0).getReg();
+    const bool DstIsDead = MI.getOperand(0).isDead();
+
+    unsigned ShiftAmt = 0;
+    uint64_t Imm16 = 0;
+    // Find the first chunk not materialized with the ORR instruction.
+    for (; ShiftAmt < 64; ShiftAmt += 16) {
+      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+      if (Imm16 != ChunkVal)
+        break;
+    }
+
+    // Create the first MOVK instruction.
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg,
+                    RegState::Define | getDeadRegState(DstIsDead && CountThree))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    // In case we have three instances the whole constant is now materialized
+    // and we can exit.
+    if (CountThree) {
+      transferImpOps(MI, MIB, MIB1);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    // Find the remaining chunk which needs to be materialized.
+    for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
+      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
+
+      if (Imm16 != ChunkVal)
+        break;
+    }
+
+    // Create the second MOVK instruction.
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(Imm16)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
+
+    transferImpOps(MI, MIB, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
+/// starts a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isStartChunk(uint64_t Chunk) {
+  if (Chunk == 0 || Chunk == UINT64_MAX)
+    return false;
+
+  return isMask_64(~Chunk);
+}
+
+/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
+/// ends a contiguous sequence of ones if we look at the bits from the LSB
+/// towards the MSB.
+static bool isEndChunk(uint64_t Chunk) {
+  if (Chunk == 0 || Chunk == UINT64_MAX)
+    return false;
+
+  return isMask_64(Chunk);
+}
+
+/// \brief Clear or set all bits in the chunk at the given index.
+static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
+  const uint64_t Mask = 0xFFFF;
+
+  if (Clear)
+    // Clear chunk in the immediate.
+    Imm &= ~(Mask << (Idx * 16));
+  else
+    // Set all bits in the immediate for the particular chunk.
+    Imm |= Mask << (Idx * 16);
+
+  return Imm;
+}
+
+/// \brief Check whether the constant contains a sequence of contiguous ones,
+/// which might be interrupted by one or two chunks. If so, materialize the
+/// sequence of contiguous ones with an ORR instruction.
+/// Materialize the chunks which are either interrupting the sequence or outside
+/// of the sequence with a MOVK instruction.
+///
+/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
+/// which ends the sequence (0...1...). Then we are looking for constants which
+/// contain at least one S and E chunk.
+/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
+///
+/// We are also looking for constants like |S|A|B|E| where the contiguous
+/// sequence of ones wraps around the MSB into the LSB.
+///
+static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              const AArch64InstrInfo *TII) {
+  const int NotSet = -1;
+  const uint64_t Mask = 0xFFFF;
+
+  int StartIdx = NotSet;
+  int EndIdx = NotSet;
+  // Try to find the chunks which start/end a contiguous sequence of ones.
+  for (int Idx = 0; Idx < 4; ++Idx) {
+    int64_t Chunk = getChunk(UImm, Idx);
+    // Sign extend the 16-bit chunk to 64-bit.
+    Chunk = (Chunk << 48) >> 48;
+
+    if (isStartChunk(Chunk))
+      StartIdx = Idx;
+    else if (isEndChunk(Chunk))
+      EndIdx = Idx;
+  }
+
+  // Early exit in case we can't find a start/end chunk.
+  if (StartIdx == NotSet || EndIdx == NotSet)
+    return false;
+
+  // Outside of the contiguous sequence of ones everything needs to be zero.
+  uint64_t Outside = 0;
+  // Chunks between the start and end chunk need to have all their bits set.
+  uint64_t Inside = Mask;
+
+  // If our contiguous sequence of ones wraps around from the MSB into the LSB,
+  // just swap indices and pretend we are materializing a contiguous sequence
+  // of zeros surrounded by a contiguous sequence of ones.
+  if (StartIdx > EndIdx) {
+    std::swap(StartIdx, EndIdx);
+    std::swap(Outside, Inside);
+  }
+
+  uint64_t OrrImm = UImm;
+  int FirstMovkIdx = NotSet;
+  int SecondMovkIdx = NotSet;
+
+  // Find out which chunks we need to patch up to obtain a contiguous sequence
+  // of ones.
+  for (int Idx = 0; Idx < 4; ++Idx) {
+    const uint64_t Chunk = getChunk(UImm, Idx);
+
+    // Check whether we are looking at a chunk which is not part of the
+    // contiguous sequence of ones.
+    if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
+      OrrImm = updateImm(OrrImm, Idx, Outside == 0);
+
+      // Remember the index we need to patch.
+      if (FirstMovkIdx == NotSet)
+        FirstMovkIdx = Idx;
+      else
+        SecondMovkIdx = Idx;
+
+      // Check whether we are looking a chunk which is part of the contiguous
+      // sequence of ones.
+    } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
+      OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
+
+      // Remember the index we need to patch.
+      if (FirstMovkIdx == NotSet)
+        FirstMovkIdx = Idx;
+      else
+        SecondMovkIdx = Idx;
+    }
+  }
+  assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
+
+  // Create the ORR-immediate instruction.
+  uint64_t Encoding = 0;
+  AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
+          .addOperand(MI.getOperand(0))
+          .addReg(AArch64::XZR)
+          .addImm(Encoding);
+
+  const unsigned DstReg = MI.getOperand(0).getReg();
+  const bool DstIsDead = MI.getOperand(0).isDead();
+
+  const bool SingleMovk = SecondMovkIdx == NotSet;
+  // Create the first MOVK instruction.
+  MachineInstrBuilder MIB1 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+          .addReg(DstReg,
+                  RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
+          .addReg(DstReg)
+          .addImm(getChunk(UImm, FirstMovkIdx))
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
+
+  // Early exit in case we only need to emit a single MOVK instruction.
+  if (SingleMovk) {
+    transferImpOps(MI, MIB, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Create the second MOVK instruction.
+  MachineInstrBuilder MIB2 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
+          .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+          .addReg(DstReg)
+          .addImm(getChunk(UImm, SecondMovkIdx))
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
+
+  transferImpOps(MI, MIB, MIB2);
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
+/// real move-immediate instructions to synthesize the immediate.
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned BitSize) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  uint64_t Imm = MI.getOperand(1).getImm();
+  const unsigned Mask = 0xFFFF;
+
+  if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
+    // Useless def, and we don't want to risk creating an invalid ORR (which
+    // would really write to sp).
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Try a MOVI instruction (aka ORR-immediate with the zero register).
+  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+  uint64_t Encoding;
+  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+            .addOperand(MI.getOperand(0))
+            .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
+            .addImm(Encoding);
+    transferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Scan the immediate and count the number of 16-bit chunks which are either
+  // all ones or all zeros.
+  unsigned OneChunks = 0;
+  unsigned ZeroChunks = 0;
+  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
+    const unsigned Chunk = (Imm >> Shift) & Mask;
+    if (Chunk == Mask)
+      OneChunks++;
+    else if (Chunk == 0)
+      ZeroChunks++;
+  }
+
+  // Since we can't materialize the constant with a single ORR instruction,
+  // let's see whether we can materialize 3/4 of the constant with an ORR
+  // instruction and use an additional MOVK instruction to materialize the
+  // remaining 1/4.
+  //
+  // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
+  //
+  // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
+  // we would create the following instruction sequence:
+  //
+  // ORR x0, xzr, |A|X|A|X|
+  // MOVK x0, |B|, LSL #16
+  //
+  // Only look at 64-bit constants which can't be materialized with a single
+  // instruction e.g. which have less than either three all zero or all one
+  // chunks.
+  //
+  // Ignore 32-bit constants here, they always can be materialized with a
+  // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
+  // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
+  // Thus we fall back to the default code below which in the best case creates
+  // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
+  //
+  if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
+    // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
+    // identical?
+    if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 3 into element 1.
+      uint64_t OrrImm = replicateChunk(UImm, 3, 1);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
+        return true;
+
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 1 into element 3.
+      OrrImm = replicateChunk(UImm, 1, 3);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
+        return true;
+
+      // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
+      // identical?
+    } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 2 into element 0.
+      uint64_t OrrImm = replicateChunk(UImm, 2, 0);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
+        return true;
+
+      // See if we can come up with a constant which can be materialized with
+      // ORR-immediate by replicating element 1 into element 3.
+      OrrImm = replicateChunk(UImm, 0, 2);
+      if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
+        return true;
+    }
+  }
+
+  // Check for identical 16-bit chunks within the constant and if so materialize
+  // them with a single ORR instruction. The remaining one or two 16-bit chunks
+  // will be materialized with MOVK instructions.
+  if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
+    return true;
+
+  // Check whether the constant contains a sequence of contiguous ones, which
+  // might be interrupted by one or two chunks. If so, materialize the sequence
+  // of contiguous ones with an ORR instruction. Materialize the chunks which
+  // are either interrupting the sequence or outside of the sequence with a
+  // MOVK instruction.
+  if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
+    return true;
+
+  // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
+  // more MOVK instructions to insert additional 16-bit portions into the
+  // lower bits.
+  bool isNeg = false;
+
+  // Use MOVN to materialize the high bits if we have more all one chunks
+  // than all zero chunks.
+  if (OneChunks > ZeroChunks) {
+    isNeg = true;
+    Imm = ~Imm;
+  }
+
+  unsigned FirstOpc;
+  if (BitSize == 32) {
+    Imm &= (1LL << 32) - 1;
+    FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
+  } else {
+    FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
+  }
+  unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
+  unsigned LastShift = 0; // LSL amount for last MOVK
+  if (Imm != 0) {
+    unsigned LZ = countLeadingZeros(Imm);
+    unsigned TZ = countTrailingZeros(Imm);
+    Shift = ((63 - LZ) / 16) * 16;
+    LastShift = (TZ / 16) * 16;
+  }
+  unsigned Imm16 = (Imm >> Shift) & Mask;
+  bool DstIsDead = MI.getOperand(0).isDead();
+  MachineInstrBuilder MIB1 =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
+          .addReg(DstReg, RegState::Define |
+                              getDeadRegState(DstIsDead && Shift == LastShift))
+          .addImm(Imm16)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+
+  // If a MOVN was used for the high bits of a negative value, flip the rest
+  // of the bits back for use with MOVK.
+  if (isNeg)
+    Imm = ~Imm;
+
+  if (Shift == LastShift) {
+    transferImpOps(MI, MIB1, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  MachineInstrBuilder MIB2;
+  unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  while (Shift != LastShift) {
+    Shift -= 16;
+    Imm16 = (Imm >> Shift) & Mask;
+    if (Imm16 == (isNeg ? Mask : 0))
+      continue; // This 16-bit portion is already set correctly.
+    MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+               .addReg(DstReg,
+                       RegState::Define |
+                           getDeadRegState(DstIsDead && Shift == LastShift))
+               .addReg(DstReg)
+               .addImm(Imm16)
+               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
+  }
+
+  transferImpOps(MI, MIB1, MIB2);
+  MI.eraseFromParent();
+  return true;
+}
+
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+    MBB->addLiveIn(*I);
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
+    unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
+    MachineBasicBlock::iterator &NextMBBI) {
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  unsigned StatusReg = MI.getOperand(1).getReg();
+  MachineOperand &Addr = MI.getOperand(2);
+  MachineOperand &Desired = MI.getOperand(3);
+  MachineOperand &New = MI.getOperand(4);
+
+  LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+    LiveRegs.stepBackward(*I);
+
+  MachineFunction *MF = MBB.getParent();
+  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoadCmpBB);
+  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+  MF->insert(++StoreBB->getIterator(), DoneBB);
+
+  // .Lloadcmp:
+  //     ldaxr xDest, [xAddr]
+  //     cmp xDest, xDesired
+  //     b.ne .Ldone
+  LoadCmpBB->addLiveIn(Addr.getReg());
+  LoadCmpBB->addLiveIn(Dest.getReg());
+  LoadCmpBB->addLiveIn(Desired.getReg());
+  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+  BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
+      .addReg(Addr.getReg());
+  BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
+      .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+      .addOperand(Desired)
+      .addImm(ExtendImm);
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(DoneBB)
+      .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
+  LoadCmpBB->addSuccessor(DoneBB);
+  LoadCmpBB->addSuccessor(StoreBB);
+
+  // .Lstore:
+  //     stlxr wStatus, xNew, [xAddr]
+  //     cbnz wStatus, .Lloadcmp
+  StoreBB->addLiveIn(Addr.getReg());
+  StoreBB->addLiveIn(New.getReg());
+  addPostLoopLiveIns(StoreBB, LiveRegs);
+
+  BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
+      .addOperand(New)
+      .addOperand(Addr);
+  BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+      .addReg(StatusReg, RegState::Kill)
+      .addMBB(LoadCmpBB);
+  StoreBB->addSuccessor(LoadCmpBB);
+  StoreBB->addSuccessor(DoneBB);
+
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+  addPostLoopLiveIns(DoneBB, LiveRegs);
+
+  MBB.addSuccessor(LoadCmpBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AArch64ExpandPseudo::expandCMP_SWAP_128(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand &DestLo = MI.getOperand(0);
+  MachineOperand &DestHi = MI.getOperand(1);
+  unsigned StatusReg = MI.getOperand(2).getReg();
+  MachineOperand &Addr = MI.getOperand(3);
+  MachineOperand &DesiredLo = MI.getOperand(4);
+  MachineOperand &DesiredHi = MI.getOperand(5);
+  MachineOperand &NewLo = MI.getOperand(6);
+  MachineOperand &NewHi = MI.getOperand(7);
+
+  LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+    LiveRegs.stepBackward(*I);
+
+  MachineFunction *MF = MBB.getParent();
+  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoadCmpBB);
+  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+  MF->insert(++StoreBB->getIterator(), DoneBB);
+
+  // .Lloadcmp:
+  //     ldaxp xDestLo, xDestHi, [xAddr]
+  //     cmp xDestLo, xDesiredLo
+  //     sbcs xDestHi, xDesiredHi
+  //     b.ne .Ldone
+  LoadCmpBB->addLiveIn(Addr.getReg());
+  LoadCmpBB->addLiveIn(DestLo.getReg());
+  LoadCmpBB->addLiveIn(DestHi.getReg());
+  LoadCmpBB->addLiveIn(DesiredLo.getReg());
+  LoadCmpBB->addLiveIn(DesiredHi.getReg());
+  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
+      .addReg(DestLo.getReg(), RegState::Define)
+      .addReg(DestHi.getReg(), RegState::Define)
+      .addReg(Addr.getReg());
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+      .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
+      .addOperand(DesiredLo)
+      .addImm(0);
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
+    .addUse(AArch64::WZR)
+    .addUse(AArch64::WZR)
+    .addImm(AArch64CC::EQ);
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+      .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
+      .addOperand(DesiredHi)
+      .addImm(0);
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
+      .addUse(StatusReg, RegState::Kill)
+      .addUse(StatusReg, RegState::Kill)
+      .addImm(AArch64CC::EQ);
+  BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
+      .addUse(StatusReg, RegState::Kill)
+      .addMBB(DoneBB);
+  LoadCmpBB->addSuccessor(DoneBB);
+  LoadCmpBB->addSuccessor(StoreBB);
+
+  // .Lstore:
+  //     stlxp wStatus, xNewLo, xNewHi, [xAddr]
+  //     cbnz wStatus, .Lloadcmp
+  StoreBB->addLiveIn(Addr.getReg());
+  StoreBB->addLiveIn(NewLo.getReg());
+  StoreBB->addLiveIn(NewHi.getReg());
+  addPostLoopLiveIns(StoreBB, LiveRegs);
+  BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
+      .addOperand(NewLo)
+      .addOperand(NewHi)
+      .addOperand(Addr);
+  BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
+      .addReg(StatusReg, RegState::Kill)
+      .addMBB(LoadCmpBB);
+  StoreBB->addSuccessor(LoadCmpBB);
+  StoreBB->addSuccessor(DoneBB);
+
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+  addPostLoopLiveIns(DoneBB, LiveRegs);
+
+  MBB.addSuccessor(LoadCmpBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true.  Otherwise return false.
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  default:
+    break;
+
+  case AArch64::ADDWrr:
+  case AArch64::SUBWrr:
+  case AArch64::ADDXrr:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSWrr:
+  case AArch64::SUBSWrr:
+  case AArch64::ADDSXrr:
+  case AArch64::SUBSXrr:
+  case AArch64::ANDWrr:
+  case AArch64::ANDXrr:
+  case AArch64::BICWrr:
+  case AArch64::BICXrr:
+  case AArch64::ANDSWrr:
+  case AArch64::ANDSXrr:
+  case AArch64::BICSWrr:
+  case AArch64::BICSXrr:
+  case AArch64::EONWrr:
+  case AArch64::EONXrr:
+  case AArch64::EORWrr:
+  case AArch64::EORXrr:
+  case AArch64::ORNWrr:
+  case AArch64::ORNXrr:
+  case AArch64::ORRWrr:
+  case AArch64::ORRXrr: {
+    unsigned Opcode;
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    case AArch64::ADDWrr:      Opcode = AArch64::ADDWrs; break;
+    case AArch64::SUBWrr:      Opcode = AArch64::SUBWrs; break;
+    case AArch64::ADDXrr:      Opcode = AArch64::ADDXrs; break;
+    case AArch64::SUBXrr:      Opcode = AArch64::SUBXrs; break;
+    case AArch64::ADDSWrr:     Opcode = AArch64::ADDSWrs; break;
+    case AArch64::SUBSWrr:     Opcode = AArch64::SUBSWrs; break;
+    case AArch64::ADDSXrr:     Opcode = AArch64::ADDSXrs; break;
+    case AArch64::SUBSXrr:     Opcode = AArch64::SUBSXrs; break;
+    case AArch64::ANDWrr:      Opcode = AArch64::ANDWrs; break;
+    case AArch64::ANDXrr:      Opcode = AArch64::ANDXrs; break;
+    case AArch64::BICWrr:      Opcode = AArch64::BICWrs; break;
+    case AArch64::BICXrr:      Opcode = AArch64::BICXrs; break;
+    case AArch64::ANDSWrr:     Opcode = AArch64::ANDSWrs; break;
+    case AArch64::ANDSXrr:     Opcode = AArch64::ANDSXrs; break;
+    case AArch64::BICSWrr:     Opcode = AArch64::BICSWrs; break;
+    case AArch64::BICSXrr:     Opcode = AArch64::BICSXrs; break;
+    case AArch64::EONWrr:      Opcode = AArch64::EONWrs; break;
+    case AArch64::EONXrr:      Opcode = AArch64::EONXrs; break;
+    case AArch64::EORWrr:      Opcode = AArch64::EORWrs; break;
+    case AArch64::EORXrr:      Opcode = AArch64::EORXrs; break;
+    case AArch64::ORNWrr:      Opcode = AArch64::ORNWrs; break;
+    case AArch64::ORNXrr:      Opcode = AArch64::ORNXrs; break;
+    case AArch64::ORRWrr:      Opcode = AArch64::ORRWrs; break;
+    case AArch64::ORRXrr:      Opcode = AArch64::ORRXrs; break;
+    }
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
+                MI.getOperand(0).getReg())
+            .addOperand(MI.getOperand(1))
+            .addOperand(MI.getOperand(2))
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    transferImpOps(MI, MIB1, MIB1);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::LOADgot: {
+    // Expand into ADRP + LDR.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    const MachineOperand &MO1 = MI.getOperand(1);
+    unsigned Flags = MO1.getTargetFlags();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
+            .addOperand(MI.getOperand(0))
+            .addReg(DstReg);
+
+    if (MO1.isGlobal()) {
+      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
+      MIB2.addGlobalAddress(MO1.getGlobal(), 0,
+                            Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else if (MO1.isSymbol()) {
+      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
+      MIB2.addExternalSymbol(MO1.getSymbolName(),
+                             Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else {
+      assert(MO1.isCPI() &&
+             "Only expect globals, externalsymbols, or constant pools");
+      MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+                                Flags | AArch64II::MO_PAGE);
+      MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
+                                Flags | AArch64II::MO_PAGEOFF |
+                                    AArch64II::MO_NC);
+    }
+
+    transferImpOps(MI, MIB1, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::MOVaddr:
+  case AArch64::MOVaddrJT:
+  case AArch64::MOVaddrCP:
+  case AArch64::MOVaddrBA:
+  case AArch64::MOVaddrTLS:
+  case AArch64::MOVaddrEXT: {
+    // Expand into ADRP + ADD.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    MachineInstrBuilder MIB1 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
+            .addOperand(MI.getOperand(1));
+
+    MachineInstrBuilder MIB2 =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+            .addOperand(MI.getOperand(0))
+            .addReg(DstReg)
+            .addOperand(MI.getOperand(2))
+            .addImm(0);
+
+    transferImpOps(MI, MIB1, MIB2);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  case AArch64::MOVi32imm:
+    return expandMOVImm(MBB, MBBI, 32);
+  case AArch64::MOVi64imm:
+    return expandMOVImm(MBB, MBBI, 64);
+  case AArch64::RET_ReallyLR: {
+    // Hiding the LR use with RET_ReallyLR may lead to extra kills in the
+    // function and missing live-ins. We are fine in practice because callee
+    // saved register handling ensures the register value is restored before
+    // RET, but we need the undef flag here to appease the MachineVerifier
+    // liveness checks.
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
+          .addReg(AArch64::LR, RegState::Undef);
+    transferImpOps(MI, MIB, MIB);
+    MI.eraseFromParent();
+    return true;
+  }
+  case AArch64::CMP_SWAP_8:
+    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
+                          AArch64::SUBSWrx,
+                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
+                          AArch64::WZR, NextMBBI);
+  case AArch64::CMP_SWAP_16:
+    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
+                          AArch64::SUBSWrx,
+                          AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
+                          AArch64::WZR, NextMBBI);
+  case AArch64::CMP_SWAP_32:
+    return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
+                          AArch64::SUBSWrs,
+                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+                          AArch64::WZR, NextMBBI);
+  case AArch64::CMP_SWAP_64:
+    return expandCMP_SWAP(MBB, MBBI,
+                          AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
+                          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
+                          AArch64::XZR, NextMBBI);
+  case AArch64::CMP_SWAP_128:
+    return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
+  }
+  return false;
+}
+
+/// \brief Iterate over the instructions in basic block MBB and expand any
+/// pseudo instructions.  Return true if anything was modified.
+bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= expandMBB(MBB);
+  return Modified;
+}
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createAArch64ExpandPseudoPass() {
+  return new AArch64ExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
new file mode 100644
index 000000000000..fe2c2d4550a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -0,0 +1,5099 @@
+//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// AArch64GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64FastISel final : public FastISel {
+  class Address {
+  public:
+    typedef enum {
+      RegBase,
+      FrameIndexBase
+    } BaseKind;
+
+  private:
+    BaseKind Kind;
+    AArch64_AM::ShiftExtendType ExtType;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+    unsigned OffsetReg;
+    unsigned Shift;
+    int64_t Offset;
+    const GlobalValue *GV;
+
+  public:
+    Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
+      OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
+    AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setOffsetReg(unsigned Reg) {
+      OffsetReg = Reg;
+    }
+    unsigned getOffsetReg() const {
+      return OffsetReg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index  access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+    void setOffset(int64_t O) { Offset = O; }
+    int64_t getOffset() { return Offset; }
+    void setShift(unsigned S) { Shift = S; }
+    unsigned getShift() { return Shift; }
+
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() { return GV; }
+  };
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+  LLVMContext *Context;
+
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+private:
+  // Selection routines.
+  bool selectAddSub(const Instruction *I);
+  bool selectLogicalOp(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectIndirectBr(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool Signed);
+  bool selectIntToFP(const Instruction *I, bool Signed);
+  bool selectRem(const Instruction *I, unsigned ISDOpcode);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
+  bool selectMul(const Instruction *I);
+  bool selectShift(const Instruction *I);
+  bool selectBitCast(const Instruction *I);
+  bool selectFRem(const Instruction *I);
+  bool selectSDiv(const Instruction *I);
+  bool selectGetElementPtr(const Instruction *I);
+  bool selectAtomicCmpXchg(const AtomicCmpXchgInst *I);
+
+  // Utility helper routines.
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false);
+  bool isValueAvailable(const Value *V) const;
+  bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr);
+  bool computeCallAddress(const Value *V, Address &Addr);
+  bool simplifyAddress(Address &Addr, MVT VT);
+  void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+                            MachineMemOperand::Flags Flags,
+                            unsigned ScaleFactor, MachineMemOperand *MMO);
+  bool isMemCpySmall(uint64_t Len, unsigned Alignment);
+  bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                          unsigned Alignment);
+  bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I,
+                         const Value *Cond);
+  bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
+  bool optimizeSelect(const SelectInst *SI);
+  std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
+
+  // Emit helper routines.
+  unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                      const Value *RHS, bool SetFlags = false,
+                      bool WantResult = true,  bool IsZExt = false);
+  unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         bool SetFlags = false, bool WantResult = true);
+  unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         AArch64_AM::ShiftExtendType ShiftType,
+                         uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                          AArch64_AM::ShiftExtendType ExtType,
+                          uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+
+  // Emit functions.
+  bool emitCompareAndBranch(const BranchInst *BI);
+  bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
+  unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
+                    MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  bool emitStoreRelease(MVT VT, unsigned SrcReg, unsigned AddrReg,
+                        MachineMemOperand *MMO = nullptr);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+  unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
+  unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill,
+                       AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
+                       bool WantResult = true);
+  unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+                         const Value *RHS);
+  unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, uint64_t Imm);
+  unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                            uint64_t ShiftImm);
+  unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                      unsigned Op1, bool Op1IsKill);
+  unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = false);
+
+  unsigned materializeInt(const ConstantInt *CI, MVT VT);
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV);
+
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
+                       unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+
+public:
+  // Backend specific FastISel code.
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
+
+  explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
+                           const TargetLibraryInfo *LibInfo)
+      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
+    Subtarget =
+        &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget());
+    Context = &FuncInfo.Fn->getContext();
+  }
+
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "AArch64GenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+#include "AArch64GenCallingConv.inc"
+
+/// \brief Check if the sign-/zero-extend will be a noop.
+static bool isIntExtFree(const Instruction *I) {
+  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+         "Unexpected integer extend instruction.");
+  assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+         "Unexpected value type.");
+  bool IsZExt = isa<ZExtInst>(I);
+
+  if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
+    if (LI->hasOneUse())
+      return true;
+
+  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0)))
+    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr()))
+      return true;
+
+  return false;
+}
+
+/// \brief Determine the implicit scale factor that is applied by a memory
+/// operation for a given value type.
+static unsigned getImplicitScaleFactor(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return 0;    // invalid
+  case MVT::i1:  // fall-through
+  case MVT::i8:
+    return 1;
+  case MVT::i16:
+    return 2;
+  case MVT::i32: // fall-through
+  case MVT::f32:
+    return 4;
+  case MVT::i64: // fall-through
+  case MVT::f64:
+    return 8;
+  }
+}
+
+CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+  if (CC == CallingConv::WebKit_JS)
+    return CC_AArch64_WebKit_JS;
+  if (CC == CallingConv::GHC)
+    return CC_AArch64_GHC;
+  return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
+}
+
+unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 &&
+         "Alloca should always return a pointer.");
+
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI))
+    return 0;
+
+  DenseMap<const AllocaInst *, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+        .addFrameIndex(SI->second)
+        .addImm(0)
+        .addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
+  if (VT > MVT::i64)
+    return 0;
+
+  if (!CI->isZero())
+    return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+  // Create a copy from the zero register to materialize a "0" value.
+  const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
+                                                   : &AArch64::GPR32RegClass;
+  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+          ResultReg).addReg(ZeroReg, getKillRegState(true));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  // Positive zero (+0.0) has to be materialized with a fmov from the zero
+  // register, because the immediate version of fmov cannot encode zero.
+  if (CFP->isNullValue())
+    return fastMaterializeFloatZero(CFP);
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  const APFloat Val = CFP->getValueAPF();
+  bool Is64Bit = (VT == MVT::f64);
+  // This checks to see if we can use FMOV instructions to materialize
+  // a constant, otherwise we have to materialize via the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm =
+        Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
+    assert((Imm != -1) && "Cannot encode floating-point constant.");
+    unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi;
+    return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+  }
+
+  // For the MachO large code model materialize the FP constant in code.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+    const TargetRegisterClass *RC = Is64Bit ?
+        &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg)
+        .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(TmpReg, getKillRegState(true));
+
+    return ResultReg;
+  }
+
+  // Materialize via constant pool.  MachineConstantPool wants an explicit
+  // alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0)
+    Align = DL.getTypeAllocSize(CFP->getType());
+
+  unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+          ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
+
+  unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui;
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(ADRPReg)
+      .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
+  // We can't handle thread-local variables quickly yet.
+  if (GV->isThreadLocal())
+    return 0;
+
+  // MachO still uses GOT for large code-model accesses, but ELF requires
+  // movz/movk sequences, which FastISel doesn't handle yet.
+  if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+    return 0;
+
+  unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+  EVT DestEVT = TLI.getValueType(DL, GV->getType(), true);
+  if (!DestEVT.isSimple())
+    return 0;
+
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  unsigned ResultReg;
+
+  if (OpFlags & AArch64II::MO_GOT) {
+    // ADRP + LDRX
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+            ResultReg)
+      .addReg(ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                        AArch64II::MO_NC);
+  } else {
+    // ADRP + ADDX
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+      .addReg(ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+      .addImm(0);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const auto *CI = dyn_cast<ConstantInt>(C))
+    return materializeInt(CI, VT);
+  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return materializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return materializeGV(GV);
+
+  return 0;
+}
+
+unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
+  assert(CFP->isNullValue() &&
+         "Floating-point constant is not a positive zero.");
+  MVT VT;
+  if (!isTypeLegal(CFP->getType(), VT))
+    return 0;
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  bool Is64Bit = (VT == MVT::f64);
+  unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr;
+  return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
+}
+
+/// \brief Check if the multiply is by a power-of-2 constant.
+static bool isMulPowOf2(const Value *I) {
+  if (const auto *MI = dyn_cast<MulOperator>(I)) {
+    if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
+      if (C->getValue().isPowerOf2())
+        return true;
+    if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1)))
+      if (C->getValue().isPowerOf2())
+        return true;
+  }
+  return false;
+}
+
+// Computes the address to get to an object.
+bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
+{
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast: {
+    // Look through bitcasts.
+    return computeAddress(U->getOperand(0), Addr, Ty);
+  }
+  case Instruction::IntToPtr: {
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+        TLI.getPointerTy(DL))
+      return computeAddress(U->getOperand(0), Addr, Ty);
+    break;
+  }
+  case Instruction::PtrToInt: {
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return computeAddress(U->getOperand(0), Addr, Ty);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    Address SavedAddr = Addr;
+    uint64_t TmpOffset = Addr.getOffset();
+
+    // Iterate through the GEP folding the constants into offsets where
+    // we can.
+    for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+         GTI != E; ++GTI) {
+      const Value *Op = GTI.getOperand();
+      if (StructType *STy = GTI.getStructTypeOrNull()) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        TmpOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+        for (;;) {
+          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+            // Constant-offset addressing.
+            TmpOffset += CI->getSExtValue() * S;
+            break;
+          }
+          if (canFoldAddIntoGEP(U, Op)) {
+            // A compatible add with a constant operand. Fold the constant.
+            ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+            TmpOffset += CI->getSExtValue() * S;
+            // Iterate on the other operand.
+            Op = cast<AddOperator>(Op)->getOperand(0);
+            continue;
+          }
+          // Unsupported
+          goto unsupported_gep;
+        }
+      }
+    }
+
+    // Try to grab the base operand now.
+    Addr.setOffset(TmpOffset);
+    if (computeAddress(U->getOperand(0), Addr, Ty))
+      return true;
+
+    // We failed, restore everything and try the other options.
+    Addr = SavedAddr;
+
+  unsupported_gep:
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst *AI = cast<AllocaInst>(Obj);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      Addr.setKind(Address::FrameIndexBase);
+      Addr.setFI(SI->second);
+      return true;
+    }
+    break;
+  }
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (isa<ConstantInt>(LHS))
+      std::swap(LHS, RHS);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+      return computeAddress(LHS, Addr, Ty);
+    }
+
+    Address Backup = Addr;
+    if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
+      return true;
+    Addr = Backup;
+
+    break;
+  }
+  case Instruction::Sub: {
+    // Subs of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+      return computeAddress(LHS, Addr, Ty);
+    }
+    break;
+  }
+  case Instruction::Shl: {
+    if (Addr.getOffsetReg())
+      break;
+
+    const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1));
+    if (!CI)
+      break;
+
+    unsigned Val = CI->getZExtValue();
+    if (Val < 1 || Val > 3)
+      break;
+
+    uint64_t NumBytes = 0;
+    if (Ty && Ty->isSized()) {
+      uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+      NumBytes = NumBits / 8;
+      if (!isPowerOf2_64(NumBits))
+        NumBytes = 0;
+    }
+
+    if (NumBytes != (1ULL << Val))
+      break;
+
+    Addr.setShift(Val);
+    Addr.setExtendType(AArch64_AM::LSL);
+
+    const Value *Src = U->getOperand(0);
+    if (const auto *I = dyn_cast<Instruction>(Src)) {
+      if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+        // Fold the zext or sext when it won't become a noop.
+        if (const auto *ZE = dyn_cast<ZExtInst>(I)) {
+          if (!isIntExtFree(ZE) &&
+              ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+            Addr.setExtendType(AArch64_AM::UXTW);
+            Src = ZE->getOperand(0);
+          }
+        } else if (const auto *SE = dyn_cast<SExtInst>(I)) {
+          if (!isIntExtFree(SE) &&
+              SE->getOperand(0)->getType()->isIntegerTy(32)) {
+            Addr.setExtendType(AArch64_AM::SXTW);
+            Src = SE->getOperand(0);
+          }
+        }
+      }
+    }
+
+    if (const auto *AI = dyn_cast<BinaryOperator>(Src))
+      if (AI->getOpcode() == Instruction::And) {
+        const Value *LHS = AI->getOperand(0);
+        const Value *RHS = AI->getOperand(1);
+
+        if (const auto *C = dyn_cast<ConstantInt>(LHS))
+          if (C->getValue() == 0xffffffff)
+            std::swap(LHS, RHS);
+
+        if (const auto *C = dyn_cast<ConstantInt>(RHS))
+          if (C->getValue() == 0xffffffff) {
+            Addr.setExtendType(AArch64_AM::UXTW);
+            unsigned Reg = getRegForValue(LHS);
+            if (!Reg)
+              return false;
+            bool RegIsKill = hasTrivialKill(LHS);
+            Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+                                             AArch64::sub_32);
+            Addr.setOffsetReg(Reg);
+            return true;
+          }
+      }
+
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  case Instruction::Mul: {
+    if (Addr.getOffsetReg())
+      break;
+
+    if (!isMulPowOf2(U))
+      break;
+
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    // Canonicalize power-of-2 value to the RHS.
+    if (const auto *C = dyn_cast<ConstantInt>(LHS))
+      if (C->getValue().isPowerOf2())
+        std::swap(LHS, RHS);
+
+    assert(isa<ConstantInt>(RHS) && "Expected an ConstantInt.");
+    const auto *C = cast<ConstantInt>(RHS);
+    unsigned Val = C->getValue().logBase2();
+    if (Val < 1 || Val > 3)
+      break;
+
+    uint64_t NumBytes = 0;
+    if (Ty && Ty->isSized()) {
+      uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+      NumBytes = NumBits / 8;
+      if (!isPowerOf2_64(NumBits))
+        NumBytes = 0;
+    }
+
+    if (NumBytes != (1ULL << Val))
+      break;
+
+    Addr.setShift(Val);
+    Addr.setExtendType(AArch64_AM::LSL);
+
+    const Value *Src = LHS;
+    if (const auto *I = dyn_cast<Instruction>(Src)) {
+      if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+        // Fold the zext or sext when it won't become a noop.
+        if (const auto *ZE = dyn_cast<ZExtInst>(I)) {
+          if (!isIntExtFree(ZE) &&
+              ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+            Addr.setExtendType(AArch64_AM::UXTW);
+            Src = ZE->getOperand(0);
+          }
+        } else if (const auto *SE = dyn_cast<SExtInst>(I)) {
+          if (!isIntExtFree(SE) &&
+              SE->getOperand(0)->getType()->isIntegerTy(32)) {
+            Addr.setExtendType(AArch64_AM::SXTW);
+            Src = SE->getOperand(0);
+          }
+        }
+      }
+    }
+
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  case Instruction::And: {
+    if (Addr.getOffsetReg())
+      break;
+
+    if (!Ty || DL.getTypeSizeInBits(Ty) != 8)
+      break;
+
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (const auto *C = dyn_cast<ConstantInt>(LHS))
+      if (C->getValue() == 0xffffffff)
+        std::swap(LHS, RHS);
+
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 0xffffffff) {
+        Addr.setShift(0);
+        Addr.setExtendType(AArch64_AM::LSL);
+        Addr.setExtendType(AArch64_AM::UXTW);
+
+        unsigned Reg = getRegForValue(LHS);
+        if (!Reg)
+          return false;
+        bool RegIsKill = hasTrivialKill(LHS);
+        Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+                                         AArch64::sub_32);
+        Addr.setOffsetReg(Reg);
+        return true;
+      }
+    break;
+  }
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    if (!Addr.getReg() || Addr.getOffsetReg())
+      break;
+
+    const Value *Src = nullptr;
+    // Fold the zext or sext when it won't become a noop.
+    if (const auto *ZE = dyn_cast<ZExtInst>(U)) {
+      if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::UXTW);
+        Src = ZE->getOperand(0);
+      }
+    } else if (const auto *SE = dyn_cast<SExtInst>(U)) {
+      if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::SXTW);
+        Src = SE->getOperand(0);
+      }
+    }
+
+    if (!Src)
+      break;
+
+    Addr.setShift(0);
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  } // end switch
+
+  if (Addr.isRegBase() && !Addr.getReg()) {
+    unsigned Reg = getRegForValue(Obj);
+    if (!Reg)
+      return false;
+    Addr.setReg(Reg);
+    return true;
+  }
+
+  if (!Addr.getOffsetReg()) {
+    unsigned Reg = getRegForValue(Obj);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+
+  return false;
+}
+
+bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  bool InMBB = true;
+
+  if (const auto *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+            TLI.getPointerTy(DL))
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  }
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    Addr.setGlobalValue(GV);
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!Addr.getGlobalValue()) {
+    Addr.setReg(getRegForValue(V));
+    return Addr.getReg() != 0;
+  }
+
+  return false;
+}
+
+
+bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // This is a legal type, but it's not something we handle in fast-isel.
+  if (VT == MVT::f128)
+    return false;
+
+  // Handle all other legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+/// \brief Determine if the value type is supported by FastISel.
+///
+/// FastISel for AArch64 can handle more value types than are legal. This adds
+/// simple value type such as i1, i8, and i16.
+bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
+  if (Ty->isVectorTy() && !IsVectorAllowed)
+    return false;
+
+  if (isTypeLegal(Ty, VT))
+    return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+bool AArch64FastISel::isValueAvailable(const Value *V) const {
+  if (!isa<Instruction>(V))
+    return true;
+
+  const auto *I = cast<Instruction>(V);
+  return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
+}
+
+bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    return false;
+
+  bool ImmediateOffsetNeedsLowering = false;
+  bool RegisterOffsetNeedsLowering = false;
+  int64_t Offset = Addr.getOffset();
+  if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset))
+    ImmediateOffsetNeedsLowering = true;
+  else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) &&
+           !isUInt<12>(Offset / ScaleFactor))
+    ImmediateOffsetNeedsLowering = true;
+
+  // Cannot encode an offset register and an immediate offset in the same
+  // instruction. Fold the immediate offset into the load/store instruction and
+  // emit an additional add to take care of the offset register.
+  if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
+    RegisterOffsetNeedsLowering = true;
+
+  // Cannot encode zero register as base.
+  if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg())
+    RegisterOffsetNeedsLowering = true;
+
+  // If this is a stack pointer and the offset needs to be simplified then put
+  // the alloca address into a register, set the base type back to register and
+  // continue. This should almost never happen.
+  if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase())
+  {
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
+            ResultReg)
+      .addFrameIndex(Addr.getFI())
+      .addImm(0)
+      .addImm(0);
+    Addr.setKind(Address::RegBase);
+    Addr.setReg(ResultReg);
+  }
+
+  if (RegisterOffsetNeedsLowering) {
+    unsigned ResultReg = 0;
+    if (Addr.getReg()) {
+      if (Addr.getExtendType() == AArch64_AM::SXTW ||
+          Addr.getExtendType() == AArch64_AM::UXTW   )
+        ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                                  /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                                  /*TODO:IsKill=*/false, Addr.getExtendType(),
+                                  Addr.getShift());
+      else
+        ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                                  /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                                  /*TODO:IsKill=*/false, AArch64_AM::LSL,
+                                  Addr.getShift());
+    } else {
+      if (Addr.getExtendType() == AArch64_AM::UXTW)
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift(),
+                               /*IsZExt=*/true);
+      else if (Addr.getExtendType() == AArch64_AM::SXTW)
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift(),
+                               /*IsZExt=*/false);
+      else
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift());
+    }
+    if (!ResultReg)
+      return false;
+
+    Addr.setReg(ResultReg);
+    Addr.setOffsetReg(0);
+    Addr.setShift(0);
+    Addr.setExtendType(AArch64_AM::InvalidShiftExtend);
+  }
+
+  // Since the offset is too large for the load/store instruction get the
+  // reg+offset into a register.
+  if (ImmediateOffsetNeedsLowering) {
+    unsigned ResultReg;
+    if (Addr.getReg())
+      // Try to fold the immediate into the add instruction.
+      ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset);
+    else
+      ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
+
+    if (!ResultReg)
+      return false;
+    Addr.setReg(ResultReg);
+    Addr.setOffset(0);
+  }
+  return true;
+}
+
+void AArch64FastISel::addLoadStoreOperands(Address &Addr,
+                                           const MachineInstrBuilder &MIB,
+                                           MachineMemOperand::Flags Flags,
+                                           unsigned ScaleFactor,
+                                           MachineMemOperand *MMO) {
+  int64_t Offset = Addr.getOffset() / ScaleFactor;
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.isFIBase()) {
+    int FI = Addr.getFI();
+    // FIXME: We shouldn't be using getObjectSize/getObjectAlignment.  The size
+    // and alignment should be based on the VT.
+    MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI).addImm(Offset);
+  } else {
+    assert(Addr.isRegBase() && "Unexpected address kind.");
+    const MCInstrDesc &II = MIB->getDesc();
+    unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 1 : 0;
+    Addr.setReg(
+      constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx));
+    Addr.setOffsetReg(
+      constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1));
+    if (Addr.getOffsetReg()) {
+      assert(Addr.getOffset() == 0 && "Unexpected offset");
+      bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW ||
+                      Addr.getExtendType() == AArch64_AM::SXTX;
+      MIB.addReg(Addr.getReg());
+      MIB.addReg(Addr.getOffsetReg());
+      MIB.addImm(IsSigned);
+      MIB.addImm(Addr.getShift() != 0);
+    } else
+      MIB.addReg(Addr.getReg()).addImm(Offset);
+  }
+
+  if (MMO)
+    MIB.addMemOperand(MMO);
+}
+
+unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                                     const Value *RHS, bool SetFlags,
+                                     bool WantResult,  bool IsZExt) {
+  AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend;
+  bool NeedExtend = false;
+  switch (RetVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+    NeedExtend = true;
+    break;
+  case MVT::i8:
+    NeedExtend = true;
+    ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB;
+    break;
+  case MVT::i16:
+    NeedExtend = true;
+    ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH;
+    break;
+  case MVT::i32:  // fall-through
+  case MVT::i64:
+    break;
+  }
+  MVT SrcVT = RetVT;
+  RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32);
+
+  // Canonicalize immediates to the RHS first.
+  if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS))
+    std::swap(LHS, RHS);
+
+  // Canonicalize mul by power of 2 to the RHS.
+  if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+    if (isMulPowOf2(LHS))
+      std::swap(LHS, RHS);
+
+  // Canonicalize shift immediate to the RHS.
+  if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+    if (const auto *SI = dyn_cast<BinaryOperator>(LHS))
+      if (isa<ConstantInt>(SI->getOperand(1)))
+        if (SI->getOpcode() == Instruction::Shl  ||
+            SI->getOpcode() == Instruction::LShr ||
+            SI->getOpcode() == Instruction::AShr   )
+          std::swap(LHS, RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return 0;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  if (NeedExtend)
+    LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
+
+  unsigned ResultReg = 0;
+  if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+    uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue();
+    if (C->isNegative())
+      ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm,
+                                SetFlags, WantResult);
+    else
+      ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags,
+                                WantResult);
+  } else if (const auto *C = dyn_cast<Constant>(RHS))
+    if (C->isNullValue())
+      ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
+                                WantResult);
+
+  if (ResultReg)
+    return ResultReg;
+
+  // Only extend the RHS within the instruction if there is a valid extend type.
+  if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
+      isValueAvailable(RHS)) {
+    if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
+        if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
+          unsigned RHSReg = getRegForValue(SI->getOperand(0));
+          if (!RHSReg)
+            return 0;
+          bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+          return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+                               RHSIsKill, ExtendType, C->getZExtValue(),
+                               SetFlags, WantResult);
+        }
+    unsigned RHSReg = getRegForValue(RHS);
+    if (!RHSReg)
+      return 0;
+    bool RHSIsKill = hasTrivialKill(RHS);
+    return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                         ExtendType, 0, SetFlags, WantResult);
+  }
+
+  // Check if the mul can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+    if (isMulPowOf2(RHS)) {
+      const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+      const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+      if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+        if (C->getValue().isPowerOf2())
+          std::swap(MulLHS, MulRHS);
+
+      assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+      uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+      unsigned RHSReg = getRegForValue(MulLHS);
+      if (!RHSReg)
+        return 0;
+      bool RHSIsKill = hasTrivialKill(MulLHS);
+      ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+                                RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags,
+                                WantResult);
+      if (ResultReg)
+        return ResultReg;
+    }
+  }
+
+  // Check if the shift can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+    if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+        AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
+        switch (SI->getOpcode()) {
+        default: break;
+        case Instruction::Shl:  ShiftType = AArch64_AM::LSL; break;
+        case Instruction::LShr: ShiftType = AArch64_AM::LSR; break;
+        case Instruction::AShr: ShiftType = AArch64_AM::ASR; break;
+        }
+        uint64_t ShiftVal = C->getZExtValue();
+        if (ShiftType != AArch64_AM::InvalidShiftExtend) {
+          unsigned RHSReg = getRegForValue(SI->getOperand(0));
+          if (!RHSReg)
+            return 0;
+          bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+          ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+                                    RHSIsKill, ShiftType, ShiftVal, SetFlags,
+                                    WantResult);
+          if (ResultReg)
+            return ResultReg;
+        }
+      }
+    }
+  }
+
+  unsigned RHSReg = getRegForValue(RHS);
+  if (!RHSReg)
+    return 0;
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  if (NeedExtend)
+    RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
+
+  return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill, bool SetFlags,
+                                        bool WantResult) {
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return 0;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrr,  AArch64::SUBXrr  },
+      { AArch64::ADDWrr,  AArch64::ADDXrr  }  },
+    { { AArch64::SUBSWrr, AArch64::SUBSXrr },
+      { AArch64::ADDSWrr, AArch64::ADDSXrr }  }
+  };
+  bool Is64Bit = RetVT == MVT::i64;
+  unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  unsigned ResultReg;
+  if (WantResult)
+    ResultReg = createResultReg(RC);
+  else
+    ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+  const MCInstrDesc &II = TII.get(Opc);
+  LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+  RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, uint64_t Imm,
+                                        bool SetFlags, bool WantResult) {
+  assert(LHSReg && "Invalid register number.");
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return 0;
+
+  unsigned ShiftImm;
+  if (isUInt<12>(Imm))
+    ShiftImm = 0;
+  else if ((Imm & 0xfff000) == Imm) {
+    ShiftImm = 12;
+    Imm >>= 12;
+  } else
+    return 0;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWri,  AArch64::SUBXri  },
+      { AArch64::ADDWri,  AArch64::ADDXri  }  },
+    { { AArch64::SUBSWri, AArch64::SUBSXri },
+      { AArch64::ADDSWri, AArch64::ADDSXri }  }
+  };
+  bool Is64Bit = RetVT == MVT::i64;
+  unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+  const TargetRegisterClass *RC;
+  if (SetFlags)
+    RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  else
+    RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+  unsigned ResultReg;
+  if (WantResult)
+    ResultReg = createResultReg(RC);
+  else
+    ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+  const MCInstrDesc &II = TII.get(Opc);
+  LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addImm(Imm)
+      .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill,
+                                        AArch64_AM::ShiftExtendType ShiftType,
+                                        uint64_t ShiftImm, bool SetFlags,
+                                        bool WantResult) {
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return 0;
+
+  // Don't deal with undefined shifts.
+  if (ShiftImm >= RetVT.getSizeInBits())
+    return 0;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrs,  AArch64::SUBXrs  },
+      { AArch64::ADDWrs,  AArch64::ADDXrs  }  },
+    { { AArch64::SUBSWrs, AArch64::SUBSXrs },
+      { AArch64::ADDSWrs, AArch64::ADDSXrs }  }
+  };
+  bool Is64Bit = RetVT == MVT::i64;
+  unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  unsigned ResultReg;
+  if (WantResult)
+    ResultReg = createResultReg(RC);
+  else
+    ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+  const MCInstrDesc &II = TII.get(Opc);
+  LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+  RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill))
+      .addImm(getShifterImm(ShiftType, ShiftImm));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill,
+                                        AArch64_AM::ShiftExtendType ExtType,
+                                        uint64_t ShiftImm, bool SetFlags,
+                                        bool WantResult) {
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return 0;
+
+  if (ShiftImm >= 4)
+    return 0;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrx,  AArch64::SUBXrx  },
+      { AArch64::ADDWrx,  AArch64::ADDXrx  }  },
+    { { AArch64::SUBSWrx, AArch64::SUBSXrx },
+      { AArch64::ADDSWrx, AArch64::ADDSXrx }  }
+  };
+  bool Is64Bit = RetVT == MVT::i64;
+  unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+  const TargetRegisterClass *RC = nullptr;
+  if (SetFlags)
+    RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  else
+    RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+  unsigned ResultReg;
+  if (WantResult)
+    ResultReg = createResultReg(RC);
+  else
+    ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+  const MCInstrDesc &II = TII.get(Opc);
+  LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+  RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill))
+      .addImm(getArithExtendImm(ExtType, ShiftImm));
+  return ResultReg;
+}
+
+bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
+  Type *Ty = LHS->getType();
+  EVT EVT = TLI.getValueType(DL, Ty, true);
+  if (!EVT.isSimple())
+    return false;
+  MVT VT = EVT.getSimpleVT();
+
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    return emitICmp(VT, LHS, RHS, IsZExt);
+  case MVT::f32:
+  case MVT::f64:
+    return emitFCmp(VT, LHS, RHS);
+  }
+}
+
+bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS,
+                               bool IsZExt) {
+  return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false,
+                 IsZExt) != 0;
+}
+
+bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                  uint64_t Imm) {
+  return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm,
+                       /*SetFlags=*/true, /*WantResult=*/false) != 0;
+}
+
+bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
+  if (RetVT != MVT::f32 && RetVT != MVT::f64)
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  bool UseImm = false;
+  if (const auto *CFP = dyn_cast<ConstantFP>(RHS))
+    if (CFP->isZero() && !CFP->isNegative())
+      UseImm = true;
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return false;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  if (UseImm) {
+    unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(LHSReg, getKillRegState(LHSIsKill));
+    return true;
+  }
+
+  unsigned RHSReg = getRegForValue(RHS);
+  if (!RHSReg)
+    return false;
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill));
+  return true;
+}
+
+unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                                  bool SetFlags, bool WantResult, bool IsZExt) {
+  return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult,
+                    IsZExt);
+}
+
+/// \brief This method is a wrapper to simplify add emission.
+///
+/// First try to emit an add with an immediate operand using emitAddSub_ri. If
+/// that fails, then try to materialize the immediate into a register and use
+/// emitAddSub_rr instead.
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
+                                      int64_t Imm) {
+  unsigned ResultReg;
+  if (Imm < 0)
+    ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm);
+  else
+    ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm);
+
+  if (ResultReg)
+    return ResultReg;
+
+  unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm);
+  if (!CReg)
+    return 0;
+
+  ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                                  bool SetFlags, bool WantResult, bool IsZExt) {
+  return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult,
+                    IsZExt);
+}
+
+unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
+                                      bool LHSIsKill, unsigned RHSReg,
+                                      bool RHSIsKill, bool WantResult) {
+  return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+                       RHSIsKill, /*SetFlags=*/true, WantResult);
+}
+
+unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
+                                      bool LHSIsKill, unsigned RHSReg,
+                                      bool RHSIsKill,
+                                      AArch64_AM::ShiftExtendType ShiftType,
+                                      uint64_t ShiftImm, bool WantResult) {
+  return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+                       RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true,
+                       WantResult);
+}
+
+unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+                                        const Value *LHS, const Value *RHS) {
+  // Canonicalize immediates to the RHS first.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+    std::swap(LHS, RHS);
+
+  // Canonicalize mul by power-of-2 to the RHS.
+  if (LHS->hasOneUse() && isValueAvailable(LHS))
+    if (isMulPowOf2(LHS))
+      std::swap(LHS, RHS);
+
+  // Canonicalize shift immediate to the RHS.
+  if (LHS->hasOneUse() && isValueAvailable(LHS))
+    if (const auto *SI = dyn_cast<ShlOperator>(LHS))
+      if (isa<ConstantInt>(SI->getOperand(1)))
+        std::swap(LHS, RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return 0;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned ResultReg = 0;
+  if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+    uint64_t Imm = C->getZExtValue();
+    ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+  }
+  if (ResultReg)
+    return ResultReg;
+
+  // Check if the mul can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+    if (isMulPowOf2(RHS)) {
+      const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+      const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+      if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+        if (C->getValue().isPowerOf2())
+          std::swap(MulLHS, MulRHS);
+
+      assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+      uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+
+      unsigned RHSReg = getRegForValue(MulLHS);
+      if (!RHSReg)
+        return 0;
+      bool RHSIsKill = hasTrivialKill(MulLHS);
+      ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+                                   RHSIsKill, ShiftVal);
+      if (ResultReg)
+        return ResultReg;
+    }
+  }
+
+  // Check if the shift can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS)) {
+    if (const auto *SI = dyn_cast<ShlOperator>(RHS))
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+        uint64_t ShiftVal = C->getZExtValue();
+        unsigned RHSReg = getRegForValue(SI->getOperand(0));
+        if (!RHSReg)
+          return 0;
+        bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+        ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+                                     RHSIsKill, ShiftVal);
+        if (ResultReg)
+          return ResultReg;
+      }
+  }
+
+  unsigned RHSReg = getRegForValue(RHS);
+  if (!RHSReg)
+    return 0;
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  MVT VT = std::max(MVT::i32, RetVT.SimpleTy);
+  ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
+                                           unsigned LHSReg, bool LHSIsKill,
+                                           uint64_t Imm) {
+  static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+                "ISD nodes are not consecutive!");
+  static const unsigned OpcTable[3][2] = {
+    { AArch64::ANDWri, AArch64::ANDXri },
+    { AArch64::ORRWri, AArch64::ORRXri },
+    { AArch64::EORWri, AArch64::EORXri }
+  };
+  const TargetRegisterClass *RC;
+  unsigned Opc;
+  unsigned RegSize;
+  switch (RetVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32: {
+    unsigned Idx = ISDOpc - ISD::AND;
+    Opc = OpcTable[Idx][0];
+    RC = &AArch64::GPR32spRegClass;
+    RegSize = 32;
+    break;
+  }
+  case MVT::i64:
+    Opc = OpcTable[ISDOpc - ISD::AND][1];
+    RC = &AArch64::GPR64spRegClass;
+    RegSize = 64;
+    break;
+  }
+
+  if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
+    return 0;
+
+  unsigned ResultReg =
+      fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill,
+                      AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
+                                           unsigned LHSReg, bool LHSIsKill,
+                                           unsigned RHSReg, bool RHSIsKill,
+                                           uint64_t ShiftImm) {
+  static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR),
+                "ISD nodes are not consecutive!");
+  static const unsigned OpcTable[3][2] = {
+    { AArch64::ANDWrs, AArch64::ANDXrs },
+    { AArch64::ORRWrs, AArch64::ORRXrs },
+    { AArch64::EORWrs, AArch64::EORXrs }
+  };
+
+  // Don't deal with undefined shifts.
+  if (ShiftImm >= RetVT.getSizeInBits())
+    return 0;
+
+  const TargetRegisterClass *RC;
+  unsigned Opc;
+  switch (RetVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    Opc = OpcTable[ISDOpc - ISD::AND][0];
+    RC = &AArch64::GPR32RegClass;
+    break;
+  case MVT::i64:
+    Opc = OpcTable[ISDOpc - ISD::AND][1];
+    RC = &AArch64::GPR64RegClass;
+    break;
+  }
+  unsigned ResultReg =
+      fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                     uint64_t Imm) {
+  return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm);
+}
+
+unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
+                                   bool WantZExt, MachineMemOperand *MMO) {
+  if (!TLI.allowsMisalignedMemoryAccesses(VT))
+    return 0;
+
+  // Simplify this down to something we can handle.
+  if (!simplifyAddress(Addr, VT))
+    return 0;
+
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    llvm_unreachable("Unexpected value type.");
+
+  // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+  // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+  bool UseScaled = true;
+  if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+    UseScaled = false;
+    ScaleFactor = 1;
+  }
+
+  static const unsigned GPOpcTable[2][8][4] = {
+    // Sign-extend.
+    { { AArch64::LDURSBWi,  AArch64::LDURSHWi,  AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDURSBXi,  AArch64::LDURSHXi,  AArch64::LDURSWi,
+        AArch64::LDURXi  },
+      { AArch64::LDRSBWui,  AArch64::LDRSHWui,  AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRSBXui,  AArch64::LDRSHXui,  AArch64::LDRSWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW,
+        AArch64::LDRXroW },
+      { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW,
+        AArch64::LDRXroW }
+    },
+    // Zero-extend.
+    { { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
+        AArch64::LDRXroW },
+      { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
+        AArch64::LDRXroW }
+    }
+  };
+
+  static const unsigned FPOpcTable[4][2] = {
+    { AArch64::LDURSi,  AArch64::LDURDi  },
+    { AArch64::LDRSui,  AArch64::LDRDui  },
+    { AArch64::LDRSroX, AArch64::LDRDroX },
+    { AArch64::LDRSroW, AArch64::LDRDroW }
+  };
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+                      Addr.getOffsetReg();
+  unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+  if (Addr.getExtendType() == AArch64_AM::UXTW ||
+      Addr.getExtendType() == AArch64_AM::SXTW)
+    Idx++;
+
+  bool IsRet64Bit = RetVT == MVT::i64;
+  switch (VT.SimpleTy) {
+  default:
+    llvm_unreachable("Unexpected value type.");
+  case MVT::i1: // Intentional fall-through.
+  case MVT::i8:
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+    break;
+  case MVT::i16:
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+    break;
+  case MVT::i32:
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
+    break;
+  case MVT::i64:
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
+    RC = &AArch64::GPR64RegClass;
+    break;
+  case MVT::f32:
+    Opc = FPOpcTable[Idx][0];
+    RC = &AArch64::FPR32RegClass;
+    break;
+  case MVT::f64:
+    Opc = FPOpcTable[Idx][1];
+    RC = &AArch64::FPR64RegClass;
+    break;
+  }
+
+  // Create the base instruction, then add the operands.
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
+
+  // Loading an i1 requires special handling.
+  if (VT == MVT::i1) {
+    unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+    assert(ANDReg && "Unexpected AND instruction emission failure.");
+    ResultReg = ANDReg;
+  }
+
+  // For zero-extending loads to 64bit we emit a 32bit load and then convert
+  // the 32bit reg to a 64bit reg.
+  if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Reg64)
+        .addImm(0)
+        .addReg(ResultReg, getKillRegState(true))
+        .addImm(AArch64::sub_32);
+    ResultReg = Reg64;
+  }
+  return ResultReg;
+}
+
+bool AArch64FastISel::selectAddSub(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (VT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  unsigned ResultReg;
+  switch (I->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  case Instruction::Add:
+    ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Sub:
+    ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1));
+    break;
+  }
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectLogicalOp(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (VT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  unsigned ResultReg;
+  switch (I->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  case Instruction::And:
+    ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Or:
+    ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Xor:
+    ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  }
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectLoad(const Instruction *I) {
+  MVT VT;
+  // Verify we have a legal type before going any further.  Currently, we handle
+  // simple types that will directly fit in a register (i32/f32/i64/f64) or
+  // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) ||
+      cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  const Value *SV = I->getOperand(0);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!computeAddress(I->getOperand(0), Addr, I->getType()))
+    return false;
+
+  // Fold the following sign-/zero-extend into the load instruction.
+  bool WantZExt = true;
+  MVT RetVT = VT;
+  const Value *IntExtVal = nullptr;
+  if (I->hasOneUse()) {
+    if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+      if (isTypeSupported(ZE->getType(), RetVT))
+        IntExtVal = ZE;
+      else
+        RetVT = VT;
+    } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+      if (isTypeSupported(SE->getType(), RetVT))
+        IntExtVal = SE;
+      else
+        RetVT = VT;
+      WantZExt = false;
+    }
+  }
+
+  unsigned ResultReg =
+      emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I));
+  if (!ResultReg)
+    return false;
+
+  // There are a few different cases we have to handle, because the load or the
+  // sign-/zero-extend might not be selected by FastISel if we fall-back to
+  // SelectionDAG. There is also an ordering issue when both instructions are in
+  // different basic blocks.
+  // 1.) The load instruction is selected by FastISel, but the integer extend
+  //     not. This usually happens when the integer extend is in a different
+  //     basic block and SelectionDAG took over for that basic block.
+  // 2.) The load instruction is selected before the integer extend. This only
+  //     happens when the integer extend is in a different basic block.
+  // 3.) The load instruction is selected by SelectionDAG and the integer extend
+  //     by FastISel. This happens if there are instructions between the load
+  //     and the integer extend that couldn't be selected by FastISel.
+  if (IntExtVal) {
+    // The integer extend hasn't been emitted yet. FastISel or SelectionDAG
+    // could select it. Emit a copy to subreg if necessary. FastISel will remove
+    // it when it selects the integer extend.
+    unsigned Reg = lookUpRegForValue(IntExtVal);
+    auto *MI = MRI.getUniqueVRegDef(Reg);
+    if (!MI) {
+      if (RetVT == MVT::i64 && VT <= MVT::i32) {
+        if (WantZExt) {
+          // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
+          std::prev(FuncInfo.InsertPt)->eraseFromParent();
+          ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg();
+        } else
+          ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
+                                                 /*IsKill=*/true,
+                                                 AArch64::sub_32);
+      }
+      updateValueMap(I, ResultReg);
+      return true;
+    }
+
+    // The integer extend has already been emitted - delete all the instructions
+    // that have been emitted by the integer extend lowering code and use the
+    // result from the load instruction directly.
+    while (MI) {
+      Reg = 0;
+      for (auto &Opnd : MI->uses()) {
+        if (Opnd.isReg()) {
+          Reg = Opnd.getReg();
+          break;
+        }
+      }
+      MI->eraseFromParent();
+      MI = nullptr;
+      if (Reg)
+        MI = MRI.getUniqueVRegDef(Reg);
+    }
+    updateValueMap(IntExtVal, ResultReg);
+    return true;
+  }
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::emitStoreRelease(MVT VT, unsigned SrcReg,
+                                       unsigned AddrReg,
+                                       MachineMemOperand *MMO) {
+  unsigned Opc;
+  switch (VT.SimpleTy) {
+  default: return false;
+  case MVT::i8:  Opc = AArch64::STLRB; break;
+  case MVT::i16: Opc = AArch64::STLRH; break;
+  case MVT::i32: Opc = AArch64::STLRW; break;
+  case MVT::i64: Opc = AArch64::STLRX; break;
+  }
+
+  const MCInstrDesc &II = TII.get(Opc);
+  SrcReg = constrainOperandRegClass(II, SrcReg, 0);
+  AddrReg = constrainOperandRegClass(II, AddrReg, 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addReg(SrcReg)
+      .addReg(AddrReg)
+      .addMemOperand(MMO);
+  return true;
+}
+
+bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                                MachineMemOperand *MMO) {
+  if (!TLI.allowsMisalignedMemoryAccesses(VT))
+    return false;
+
+  // Simplify this down to something we can handle.
+  if (!simplifyAddress(Addr, VT))
+    return false;
+
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    llvm_unreachable("Unexpected value type.");
+
+  // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+  // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+  bool UseScaled = true;
+  if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+    UseScaled = false;
+    ScaleFactor = 1;
+  }
+
+  static const unsigned OpcTable[4][6] = {
+    { AArch64::STURBBi,  AArch64::STURHHi,  AArch64::STURWi,  AArch64::STURXi,
+      AArch64::STURSi,   AArch64::STURDi },
+    { AArch64::STRBBui,  AArch64::STRHHui,  AArch64::STRWui,  AArch64::STRXui,
+      AArch64::STRSui,   AArch64::STRDui },
+    { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX,
+      AArch64::STRSroX,  AArch64::STRDroX },
+    { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW,
+      AArch64::STRSroW,  AArch64::STRDroW }
+  };
+
+  unsigned Opc;
+  bool VTIsi1 = false;
+  bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+                      Addr.getOffsetReg();
+  unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+  if (Addr.getExtendType() == AArch64_AM::UXTW ||
+      Addr.getExtendType() == AArch64_AM::SXTW)
+    Idx++;
+
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type.");
+  case MVT::i1:  VTIsi1 = true;
+  case MVT::i8:  Opc = OpcTable[Idx][0]; break;
+  case MVT::i16: Opc = OpcTable[Idx][1]; break;
+  case MVT::i32: Opc = OpcTable[Idx][2]; break;
+  case MVT::i64: Opc = OpcTable[Idx][3]; break;
+  case MVT::f32: Opc = OpcTable[Idx][4]; break;
+  case MVT::f64: Opc = OpcTable[Idx][5]; break;
+  }
+
+  // Storing an i1 requires special handling.
+  if (VTIsi1 && SrcReg != AArch64::WZR) {
+    unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+    assert(ANDReg && "Unexpected AND instruction emission failure.");
+    SrcReg = ANDReg;
+  }
+  // Create the base instruction, then add the operands.
+  const MCInstrDesc &II = TII.get(Opc);
+  SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg);
+  addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO);
+
+  return true;
+}
+
+bool AArch64FastISel::selectStore(const Instruction *I) {
+  MVT VT;
+  const Value *Op0 = I->getOperand(0);
+  // Verify we have a legal type before going any further.  Currently, we handle
+  // simple types that will directly fit in a register (i32/f32/i64/f64) or
+  // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
+  if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  const Value *PtrV = I->getOperand(1);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  // Get the value to be stored into a register. Use the zero register directly
+  // when possible to avoid an unnecessary copy and a wasted register.
+  unsigned SrcReg = 0;
+  if (const auto *CI = dyn_cast<ConstantInt>(Op0)) {
+    if (CI->isZero())
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) {
+    if (CF->isZero() && !CF->isNegative()) {
+      VT = MVT::getIntegerVT(VT.getSizeInBits());
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+    }
+  }
+
+  if (!SrcReg)
+    SrcReg = getRegForValue(Op0);
+
+  if (!SrcReg)
+    return false;
+
+  auto *SI = cast<StoreInst>(I);
+
+  // Try to emit a STLR for seq_cst/release.
+  if (SI->isAtomic()) {
+    AtomicOrdering Ord = SI->getOrdering();
+    // The non-atomic instructions are sufficient for relaxed stores.
+    if (isReleaseOrStronger(Ord)) {
+      // The STLR addressing mode only supports a base reg; pass that directly.
+      unsigned AddrReg = getRegForValue(PtrV);
+      return emitStoreRelease(VT, SrcReg, AddrReg,
+                              createMachineMemOperandFor(I));
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!computeAddress(PtrV, Addr, Op0->getType()))
+    return false;
+
+  if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I)))
+    return false;
+  return true;
+}
+
+static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+  switch (Pred) {
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UEQ:
+  default:
+    // AL is our "false" for now. The other two need more compares.
+    return AArch64CC::AL;
+  case CmpInst::ICMP_EQ:
+  case CmpInst::FCMP_OEQ:
+    return AArch64CC::EQ;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::FCMP_OGT:
+    return AArch64CC::GT;
+  case CmpInst::ICMP_SGE:
+  case CmpInst::FCMP_OGE:
+    return AArch64CC::GE;
+  case CmpInst::ICMP_UGT:
+  case CmpInst::FCMP_UGT:
+    return AArch64CC::HI;
+  case CmpInst::FCMP_OLT:
+    return AArch64CC::MI;
+  case CmpInst::ICMP_ULE:
+  case CmpInst::FCMP_OLE:
+    return AArch64CC::LS;
+  case CmpInst::FCMP_ORD:
+    return AArch64CC::VC;
+  case CmpInst::FCMP_UNO:
+    return AArch64CC::VS;
+  case CmpInst::FCMP_UGE:
+    return AArch64CC::PL;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::FCMP_ULT:
+    return AArch64CC::LT;
+  case CmpInst::ICMP_SLE:
+  case CmpInst::FCMP_ULE:
+    return AArch64CC::LE;
+  case CmpInst::FCMP_UNE:
+  case CmpInst::ICMP_NE:
+    return AArch64CC::NE;
+  case CmpInst::ICMP_UGE:
+    return AArch64CC::HS;
+  case CmpInst::ICMP_ULT:
+    return AArch64CC::LO;
+  }
+}
+
+/// \brief Try to emit a combined compare-and-branch instruction.
+bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
+  assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
+  const CmpInst *CI = cast<CmpInst>(BI->getCondition());
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+  const Value *LHS = CI->getOperand(0);
+  const Value *RHS = CI->getOperand(1);
+
+  MVT VT;
+  if (!isTypeSupported(LHS->getType(), VT))
+    return false;
+
+  unsigned BW = VT.getSizeInBits();
+  if (BW > 64)
+    return false;
+
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Try to take advantage of fallthrough opportunities.
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    Predicate = CmpInst::getInversePredicate(Predicate);
+  }
+
+  int TestBit = -1;
+  bool IsCmpNE;
+  switch (Predicate) {
+  default:
+    return false;
+  case CmpInst::ICMP_EQ:
+  case CmpInst::ICMP_NE:
+    if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue())
+      std::swap(LHS, RHS);
+
+    if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+      return false;
+
+    if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
+      if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) {
+        const Value *AndLHS = AI->getOperand(0);
+        const Value *AndRHS = AI->getOperand(1);
+
+        if (const auto *C = dyn_cast<ConstantInt>(AndLHS))
+          if (C->getValue().isPowerOf2())
+            std::swap(AndLHS, AndRHS);
+
+        if (const auto *C = dyn_cast<ConstantInt>(AndRHS))
+          if (C->getValue().isPowerOf2()) {
+            TestBit = C->getValue().logBase2();
+            LHS = AndLHS;
+          }
+      }
+
+    if (VT == MVT::i1)
+      TestBit = 0;
+
+    IsCmpNE = Predicate == CmpInst::ICMP_NE;
+    break;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::ICMP_SGE:
+    if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+      return false;
+
+    TestBit = BW - 1;
+    IsCmpNE = Predicate == CmpInst::ICMP_SLT;
+    break;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::ICMP_SLE:
+    if (!isa<ConstantInt>(RHS))
+      return false;
+
+    if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true))
+      return false;
+
+    TestBit = BW - 1;
+    IsCmpNE = Predicate == CmpInst::ICMP_SLE;
+    break;
+  } // end switch
+
+  static const unsigned OpcTable[2][2][2] = {
+    { {AArch64::CBZW,  AArch64::CBZX },
+      {AArch64::CBNZW, AArch64::CBNZX} },
+    { {AArch64::TBZW,  AArch64::TBZX },
+      {AArch64::TBNZW, AArch64::TBNZX} }
+  };
+
+  bool IsBitTest = TestBit != -1;
+  bool Is64Bit = BW == 64;
+  if (TestBit < 32 && TestBit >= 0)
+    Is64Bit = false;
+
+  unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
+  const MCInstrDesc &II = TII.get(Opc);
+
+  unsigned SrcReg = getRegForValue(LHS);
+  if (!SrcReg)
+    return false;
+  bool SrcIsKill = hasTrivialKill(LHS);
+
+  if (BW == 64 && !Is64Bit)
+    SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+                                        AArch64::sub_32);
+
+  if ((BW < 32) && !IsBitTest)
+    SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true);
+
+  // Emit the combined compare and branch instruction.
+  SrcReg = constrainOperandRegClass(II, SrcReg,  II.getNumDefs());
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+          .addReg(SrcReg, getKillRegState(SrcIsKill));
+  if (IsBitTest)
+    MIB.addImm(TestBit);
+  MIB.addMBB(TBB);
+
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
+}
+
+bool AArch64FastISel::selectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  if (BI->isUnconditional()) {
+    MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)];
+    fastEmitBranch(MSucc, BI->getDebugLoc());
+    return true;
+  }
+
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && isValueAvailable(CI)) {
+      // Try to optimize or fold the cmp.
+      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+      switch (Predicate) {
+      default:
+        break;
+      case CmpInst::FCMP_FALSE:
+        fastEmitBranch(FBB, DbgLoc);
+        return true;
+      case CmpInst::FCMP_TRUE:
+        fastEmitBranch(TBB, DbgLoc);
+        return true;
+      }
+
+      // Try to emit a combined compare-and-branch first.
+      if (emitCompareAndBranch(BI))
+        return true;
+
+      // Try to take advantage of fallthrough opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      // Emit the cmp.
+      if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
+      // instruction.
+      AArch64CC::CondCode CC = getCompareCC(Predicate);
+      AArch64CC::CondCode ExtraCC = AArch64CC::AL;
+      switch (Predicate) {
+      default:
+        break;
+      case CmpInst::FCMP_UEQ:
+        ExtraCC = AArch64CC::EQ;
+        CC = AArch64CC::VS;
+        break;
+      case CmpInst::FCMP_ONE:
+        ExtraCC = AArch64CC::MI;
+        CC = AArch64CC::GT;
+        break;
+      }
+      assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+
+      // Emit the extra branch for FCMP_UEQ and FCMP_ONE.
+      if (ExtraCC != AArch64CC::AL) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+            .addImm(ExtraCC)
+            .addMBB(TBB);
+      }
+
+      // Emit the branch.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+          .addImm(CC)
+          .addMBB(TBB);
+
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
+        .addMBB(Target);
+
+    // Obtain the branch probability and add the target to the successor list.
+    if (FuncInfo.BPI) {
+      auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+          BI->getParent(), Target->getBasicBlock());
+      FuncInfo.MBB->addSuccessor(Target, BranchProbability);
+    } else
+      FuncInfo.MBB->addSuccessorWithoutProb(Target);
+    return true;
+  } else {
+    AArch64CC::CondCode CC = AArch64CC::NE;
+    if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+      // Fake request the condition, otherwise the intrinsic might be completely
+      // optimized away.
+      unsigned CondReg = getRegForValue(BI->getCondition());
+      if (!CondReg)
+        return false;
+
+      // Emit the branch.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+        .addImm(CC)
+        .addMBB(TBB);
+
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  }
+
+  unsigned CondReg = getRegForValue(BI->getCondition());
+  if (CondReg == 0)
+    return false;
+  bool CondRegIsKill = hasTrivialKill(BI->getCondition());
+
+  // i1 conditions come as i32 values, test the lowest bit with tb(n)z.
+  unsigned Opcode = AArch64::TBNZW;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    Opcode = AArch64::TBZW;
+  }
+
+  const MCInstrDesc &II = TII.get(Opcode);
+  unsigned ConstrainedCondReg
+    = constrainOperandRegClass(II, CondReg, II.getNumDefs());
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill))
+      .addImm(0)
+      .addMBB(TBB);
+
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
+}
+
+bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
+  const IndirectBrInst *BI = cast<IndirectBrInst>(I);
+  unsigned AddrReg = getRegForValue(BI->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  // Emit the indirect branch.
+  const MCInstrDesc &II = TII.get(AArch64::BR);
+  AddrReg = constrainOperandRegClass(II, AddrReg,  II.getNumDefs());
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
+
+  // Make sure the CFG is up-to-date.
+  for (auto *Succ : BI->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]);
+
+  return true;
+}
+
+bool AArch64FastISel::selectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // Vectors of i1 are weird: bail out.
+  if (CI->getType()->isVectorTy())
+    return false;
+
+  // Try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+  unsigned ResultReg = 0;
+  switch (Predicate) {
+  default:
+    break;
+  case CmpInst::FCMP_FALSE:
+    ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(AArch64::WZR, getKillRegState(true));
+    break;
+  case CmpInst::FCMP_TRUE:
+    ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1);
+    break;
+  }
+
+  if (ResultReg) {
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Emit the cmp.
+  if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  ResultReg = createResultReg(&AArch64::GPR32RegClass);
+
+  // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These
+  // condition codes are inverted, because they are used by CSINC.
+  static unsigned CondCodeTable[2][2] = {
+    { AArch64CC::NE, AArch64CC::VC },
+    { AArch64CC::PL, AArch64CC::LE }
+  };
+  unsigned *CondCodes = nullptr;
+  switch (Predicate) {
+  default:
+    break;
+  case CmpInst::FCMP_UEQ:
+    CondCodes = &CondCodeTable[0][0];
+    break;
+  case CmpInst::FCMP_ONE:
+    CondCodes = &CondCodeTable[1][0];
+    break;
+  }
+
+  if (CondCodes) {
+    unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+            TmpReg1)
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addImm(CondCodes[0]);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+            ResultReg)
+        .addReg(TmpReg1, getKillRegState(true))
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addImm(CondCodes[1]);
+
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Now set a register based on the comparison.
+  AArch64CC::CondCode CC = getCompareCC(Predicate);
+  assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+  AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+          ResultReg)
+      .addReg(AArch64::WZR, getKillRegState(true))
+      .addReg(AArch64::WZR, getKillRegState(true))
+      .addImm(invertedCC);
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// value.
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
+  if (!SI->getType()->isIntegerTy(1))
+    return false;
+
+  const Value *Src1Val, *Src2Val;
+  unsigned Opc = 0;
+  bool NeedExtraOp = false;
+  if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) {
+    if (CI->isOne()) {
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getFalseValue();
+      Opc = AArch64::ORRWrr;
+    } else {
+      assert(CI->isZero());
+      Src1Val = SI->getFalseValue();
+      Src2Val = SI->getCondition();
+      Opc = AArch64::BICWrr;
+    }
+  } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) {
+    if (CI->isOne()) {
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getTrueValue();
+      Opc = AArch64::ORRWrr;
+      NeedExtraOp = true;
+    } else {
+      assert(CI->isZero());
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getTrueValue();
+      Opc = AArch64::ANDWrr;
+    }
+  }
+
+  if (!Opc)
+    return false;
+
+  unsigned Src1Reg = getRegForValue(Src1Val);
+  if (!Src1Reg)
+    return false;
+  bool Src1IsKill = hasTrivialKill(Src1Val);
+
+  unsigned Src2Reg = getRegForValue(Src2Val);
+  if (!Src2Reg)
+    return false;
+  bool Src2IsKill = hasTrivialKill(Src2Val);
+
+  if (NeedExtraOp) {
+    Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1);
+    Src1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg,
+                                       Src1IsKill, Src2Reg, Src2IsKill);
+  updateValueMap(SI, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectSelect(const Instruction *I) {
+  assert(isa<SelectInst>(I) && "Expected a select instruction.");
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT))
+    return false;
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    Opc = AArch64::CSELWr;
+    RC = &AArch64::GPR32RegClass;
+    break;
+  case MVT::i64:
+    Opc = AArch64::CSELXr;
+    RC = &AArch64::GPR64RegClass;
+    break;
+  case MVT::f32:
+    Opc = AArch64::FCSELSrrr;
+    RC = &AArch64::FPR32RegClass;
+    break;
+  case MVT::f64:
+    Opc = AArch64::FCSELDrrr;
+    RC = &AArch64::FPR64RegClass;
+    break;
+  }
+
+  const SelectInst *SI = cast<SelectInst>(I);
+  const Value *Cond = SI->getCondition();
+  AArch64CC::CondCode CC = AArch64CC::NE;
+  AArch64CC::CondCode ExtraCC = AArch64CC::AL;
+
+  if (optimizeSelect(SI))
+    return true;
+
+  // Try to pickup the flags, so we don't have to emit another compare.
+  if (foldXALUIntrinsic(CC, I, Cond)) {
+    // Fake request the condition to force emission of the XALU intrinsic.
+    unsigned CondReg = getRegForValue(Cond);
+    if (!CondReg)
+      return false;
+  } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
+             isValueAvailable(Cond)) {
+    const auto *Cmp = cast<CmpInst>(Cond);
+    // Try to optimize or fold the cmp.
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp);
+    const Value *FoldSelect = nullptr;
+    switch (Predicate) {
+    default:
+      break;
+    case CmpInst::FCMP_FALSE:
+      FoldSelect = SI->getFalseValue();
+      break;
+    case CmpInst::FCMP_TRUE:
+      FoldSelect = SI->getTrueValue();
+      break;
+    }
+
+    if (FoldSelect) {
+      unsigned SrcReg = getRegForValue(FoldSelect);
+      if (!SrcReg)
+        return false;
+      unsigned UseReg = lookUpRegForValue(SI);
+      if (UseReg)
+        MRI.clearKillFlags(UseReg);
+
+      updateValueMap(I, SrcReg);
+      return true;
+    }
+
+    // Emit the cmp.
+    if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+      return false;
+
+    // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
+    CC = getCompareCC(Predicate);
+    switch (Predicate) {
+    default:
+      break;
+    case CmpInst::FCMP_UEQ:
+      ExtraCC = AArch64CC::EQ;
+      CC = AArch64CC::VS;
+      break;
+    case CmpInst::FCMP_ONE:
+      ExtraCC = AArch64CC::MI;
+      CC = AArch64CC::GT;
+      break;
+    }
+    assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+  } else {
+    unsigned CondReg = getRegForValue(Cond);
+    if (!CondReg)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
+
+    const MCInstrDesc &II = TII.get(AArch64::ANDSWri);
+    CondReg = constrainOperandRegClass(II, CondReg, 1);
+
+    // Emit a TST instruction (ANDS wzr, reg, #imm).
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+            AArch64::WZR)
+        .addReg(CondReg, getKillRegState(CondIsKill))
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+  }
+
+  unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+  bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
+
+  unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+  bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
+
+  if (!Src1Reg || !Src2Reg)
+    return false;
+
+  if (ExtraCC != AArch64CC::AL) {
+    Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+                               Src2IsKill, ExtraCC);
+    Src2IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+                                        Src2IsKill, CC);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectFPExt(const Instruction *I) {
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
+    return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
+          ResultReg).addReg(Op);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
+    return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
+          ResultReg).addReg(Op);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// FPToUI and FPToSI
+bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+    return false;
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (SrcReg == 0)
+    return false;
+
+  EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
+  if (SrcVT == MVT::f128)
+    return false;
+
+  unsigned Opc;
+  if (SrcVT == MVT::f64) {
+    if (Signed)
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr;
+    else
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr;
+  } else {
+    if (Signed)
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr;
+    else
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr;
+  }
+  unsigned ResultReg = createResultReg(
+      DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
+  MVT DestVT;
+  if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
+    return false;
+  assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+          "Unexpected value type.");
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (!SrcReg)
+    return false;
+  bool SrcIsKill = hasTrivialKill(I->getOperand(0));
+
+  EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
+
+  // Handle sign-extension.
+  if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+    SrcReg =
+        emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+    if (!SrcReg)
+      return false;
+    SrcIsKill = true;
+  }
+
+  unsigned Opc;
+  if (SrcVT == MVT::i64) {
+    if (Signed)
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri;
+    else
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri;
+  } else {
+    if (Signed)
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri;
+    else
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
+  }
+
+  unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg,
+                                      SrcIsKill);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C && CC != CallingConv::Swift)
+    return false;
+
+  // Only handle simple cases of up to 8 GPR and FPR each.
+  unsigned GPRCnt = 0;
+  unsigned FPRCnt = 0;
+  unsigned Idx = 0;
+  for (auto const &Arg : F->args()) {
+    // The first argument is at index 1.
+    ++Idx;
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(DL, ArgTy);
+    if (!ArgVT.isSimple())
+      return false;
+
+    MVT VT = ArgVT.getSimpleVT().SimpleTy;
+    if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8())
+      return false;
+
+    if (VT.isVector() &&
+        (!Subtarget->hasNEON() || !Subtarget->isLittleEndian()))
+      return false;
+
+    if (VT >= MVT::i1 && VT <= MVT::i64)
+      ++GPRCnt;
+    else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() ||
+             VT.is128BitVector())
+      ++FPRCnt;
+    else
+      return false;
+
+    if (GPRCnt > 8 || FPRCnt > 8)
+      return false;
+  }
+
+  static const MCPhysReg Registers[6][8] = {
+    { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+      AArch64::W5, AArch64::W6, AArch64::W7 },
+    { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+      AArch64::X5, AArch64::X6, AArch64::X7 },
+    { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+      AArch64::H5, AArch64::H6, AArch64::H7 },
+    { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+      AArch64::S5, AArch64::S6, AArch64::S7 },
+    { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+      AArch64::D5, AArch64::D6, AArch64::D7 },
+    { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+      AArch64::Q5, AArch64::Q6, AArch64::Q7 }
+  };
+
+  unsigned GPRIdx = 0;
+  unsigned FPRIdx = 0;
+  for (auto const &Arg : F->args()) {
+    MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+    unsigned SrcReg;
+    const TargetRegisterClass *RC;
+    if (VT >= MVT::i1 && VT <= MVT::i32) {
+      SrcReg = Registers[0][GPRIdx++];
+      RC = &AArch64::GPR32RegClass;
+      VT = MVT::i32;
+    } else if (VT == MVT::i64) {
+      SrcReg = Registers[1][GPRIdx++];
+      RC = &AArch64::GPR64RegClass;
+    } else if (VT == MVT::f16) {
+      SrcReg = Registers[2][FPRIdx++];
+      RC = &AArch64::FPR16RegClass;
+    } else if (VT ==  MVT::f32) {
+      SrcReg = Registers[3][FPRIdx++];
+      RC = &AArch64::FPR32RegClass;
+    } else if ((VT == MVT::f64) || VT.is64BitVector()) {
+      SrcReg = Registers[4][FPRIdx++];
+      RC = &AArch64::FPR64RegClass;
+    } else if (VT.is128BitVector()) {
+      SrcReg = Registers[5][FPRIdx++];
+      RC = &AArch64::FPR128RegClass;
+    } else
+      llvm_unreachable("Unexpected value type.");
+
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(DstReg, getKillRegState(true));
+    updateValueMap(&Arg, ResultReg);
+  }
+  return true;
+}
+
+bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
+                                      SmallVectorImpl<MVT> &OutVTs,
+                                      unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+    .addImm(NumBytes);
+
+  // Process the args.
+  for (CCValAssign &VA : ArgLocs) {
+    const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
+
+    // Handle arg promotion: SExt, ZExt, AExt.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    case CCValAssign::AExt:
+    // Intentional fall-through.
+    case CCValAssign::ZExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      CLI.OutRegs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // FIXME: Handle custom args.
+      return false;
+    } else {
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
+      // Need to store on the stack.
+      unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+          MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+          MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+
+      if (!emitStore(ArgVT, ArgReg, Addr, MMO))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                                 unsigned NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+    .addImm(NumBytes).addImm(0);
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+
+    // TODO: Handle big-endian results
+    if (CopyVT.isVector() && !Subtarget->isLittleEndian())
+      return false;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(RVLocs[0].getLocReg());
+    CLI.InRegs.push_back(RVLocs[0].getLocReg());
+
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC  = CLI.CallConv;
+  bool IsTailCall     = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  MCSymbol *Symbol = CLI.Symbol;
+
+  if (!Callee && !Symbol)
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  CodeModel::Model CM = TM.getCodeModel();
+  // Only support the small and large code model.
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
+    return false;
+
+  // FIXME: Add large code model support for ELF.
+  if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (IsVarArg)
+    return false;
+
+  // FIXME: Only handle *simple* calls for now.
+  MVT RetVT;
+  if (CLI.RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(CLI.RetTy, RetVT))
+    return false;
+
+  for (auto Flag : CLI.OutFlags)
+    if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() ||
+        Flag.isSwiftSelf() || Flag.isSwiftError())
+      return false;
+
+  // Set up the argument vectors.
+  SmallVector<MVT, 16> OutVTs;
+  OutVTs.reserve(CLI.OutVals.size());
+
+  for (auto *Val : CLI.OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT) &&
+        !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
+      return false;
+
+    // We don't handle vector parameters yet.
+    if (VT.isVector() || VT.getSizeInBits() > 64)
+      return false;
+
+    OutVTs.push_back(VT);
+  }
+
+  Address Addr;
+  if (Callee && !computeCallAddress(Callee, Addr))
+    return false;
+
+  // Handle the arguments now that we've gotten them.
+  unsigned NumBytes;
+  if (!processCallArgs(CLI, OutVTs, NumBytes))
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (CM == CodeModel::Small) {
+    const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
+    if (Symbol)
+      MIB.addSym(Symbol, 0);
+    else if (Addr.getGlobalValue())
+      MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
+    else if (Addr.getReg()) {
+      unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
+      MIB.addReg(Reg);
+    } else
+      return false;
+  } else {
+    unsigned CallReg = 0;
+    if (Symbol) {
+      unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+              ADRPReg)
+          .addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+      CallReg = createResultReg(&AArch64::GPR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::LDRXui), CallReg)
+          .addReg(ADRPReg)
+          .addSym(Symbol,
+                  AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    } else if (Addr.getGlobalValue())
+      CallReg = materializeGV(Addr.getGlobalValue());
+    else if (Addr.getReg())
+      CallReg = Addr.getReg();
+
+    if (!CallReg)
+      return false;
+
+    const MCInstrDesc &II = TII.get(AArch64::BLR);
+    CallReg = constrainOperandRegClass(II, CallReg, 0);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
+  }
+
+  // Add implicit physical register uses to the call.
+  for (auto Reg : CLI.OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(CLI, RetVT, NumBytes);
+}
+
+bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
+  if (Alignment)
+    return Len / Alignment <= 4;
+  else
+    return Len < 32;
+}
+
+bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
+                                         uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!isMemCpySmall(Len, Alignment))
+    return false;
+
+  int64_t UnscaledOffset = 0;
+  Address OrigDest = Dest;
+  Address OrigSrc = Src;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 8) {
+      if (Len >= 8)
+        VT = MVT::i64;
+      else if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 4 && Alignment == 4)
+        VT = MVT::i32;
+      else if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    unsigned ResultReg = emitLoad(VT, VT, Src);
+    if (!ResultReg)
+      return false;
+
+    if (!emitStore(VT, ResultReg, Dest))
+      return false;
+
+    int64_t Size = VT.getSizeInBits() / 8;
+    Len -= Size;
+    UnscaledOffset += Size;
+
+    // We need to recompute the unscaled offset for each iteration.
+    Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
+    Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
+  }
+
+  return true;
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
+                                        const Instruction *I,
+                                        const Value *Cond) {
+  if (!isa<ExtractValueInst>(Cond))
+    return false;
+
+  const auto *EV = cast<ExtractValueInst>(Cond);
+  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+    return false;
+
+  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+  MVT RetVT;
+  const Function *Callee = II->getCalledFunction();
+  Type *RetTy =
+  cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+  if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return false;
+
+  const Value *LHS = II->getArgOperand(0);
+  const Value *RHS = II->getArgOperand(1);
+
+  // Canonicalize immediate to the RHS.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+      isCommutativeIntrinsic(II))
+    std::swap(LHS, RHS);
+
+  // Simplify multiplies.
+  Intrinsic::ID IID = II->getIntrinsicID();
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::smul_with_overflow:
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 2)
+        IID = Intrinsic::sadd_with_overflow;
+    break;
+  case Intrinsic::umul_with_overflow:
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 2)
+        IID = Intrinsic::uadd_with_overflow;
+    break;
+  }
+
+  AArch64CC::CondCode TmpCC;
+  switch (IID) {
+  default:
+    return false;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+    TmpCC = AArch64CC::VS;
+    break;
+  case Intrinsic::uadd_with_overflow:
+    TmpCC = AArch64CC::HS;
+    break;
+  case Intrinsic::usub_with_overflow:
+    TmpCC = AArch64CC::LO;
+    break;
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    TmpCC = AArch64CC::NE;
+    break;
+  }
+
+  // Check if both instructions are in the same basic block.
+  if (!isValueAvailable(II))
+    return false;
+
+  // Make sure nothing is in the way
+  BasicBlock::const_iterator Start(I);
+  BasicBlock::const_iterator End(II);
+  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+    // We only expect extractvalue instructions between the intrinsic and the
+    // instruction to be selected.
+    if (!isa<ExtractValueInst>(Itr))
+      return false;
+
+    // Check that the extractvalue operand comes from the intrinsic.
+    const auto *EVI = cast<ExtractValueInst>(Itr);
+    if (EVI->getAggregateOperand() != II)
+      return false;
+  }
+
+  CC = TmpCC;
+  return true;
+}
+
+bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  // FIXME: Handle more intrinsics.
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+    MFI.setFrameAddressIsTaken(true);
+
+    const AArch64RegisterInfo *RegInfo =
+        static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
+    // Recursively load frame address
+    // ldr x0, [fp]
+    // ldr x0, [x0]
+    // ldr x0, [x0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
+                                SrcReg, /*IsKill=*/true, 0);
+      assert(DestReg && "Unexpected LDR instruction emission failure.");
+      SrcReg = DestReg;
+    }
+
+    updateValueMap(II, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const auto *MTI = cast<MemTransferInst>(II);
+    // Don't handle volatile.
+    if (MTI->isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+      unsigned Alignment = MTI->getAlignment();
+      if (isMemCpySmall(Len, Alignment)) {
+        Address Dest, Src;
+        if (!computeAddress(MTI->getRawDest(), Dest) ||
+            !computeAddress(MTI->getRawSource(), Src))
+          return false;
+        if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI->getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+    return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+    // Don't handle volatile.
+    if (MSI->isVolatile())
+      return false;
+
+    if (!MSI->getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MSI->getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow: {
+    MVT RetVT;
+    if (!isTypeLegal(II->getType(), RetVT))
+      return false;
+
+    if (RetVT != MVT::f32 && RetVT != MVT::f64)
+      return false;
+
+    static const RTLIB::Libcall LibCallTable[3][2] = {
+      { RTLIB::SIN_F32, RTLIB::SIN_F64 },
+      { RTLIB::COS_F32, RTLIB::COS_F64 },
+      { RTLIB::POW_F32, RTLIB::POW_F64 }
+    };
+    RTLIB::Libcall LC;
+    bool Is64Bit = RetVT == MVT::f64;
+    switch (II->getIntrinsicID()) {
+    default:
+      llvm_unreachable("Unexpected intrinsic.");
+    case Intrinsic::sin:
+      LC = LibCallTable[0][Is64Bit];
+      break;
+    case Intrinsic::cos:
+      LC = LibCallTable[1][Is64Bit];
+      break;
+    case Intrinsic::pow:
+      LC = LibCallTable[2][Is64Bit];
+      break;
+    }
+
+    ArgListTy Args;
+    Args.reserve(II->getNumArgOperands());
+
+    // Populate the argument list.
+    for (auto &Arg : II->arg_operands()) {
+      ArgListEntry Entry;
+      Entry.Val = Arg;
+      Entry.Ty = Arg->getType();
+      Args.push_back(Entry);
+    }
+
+    CallLoweringInfo CLI;
+    MCContext &Ctx = MF->getContext();
+    CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), II->getType(),
+                  TLI.getLibcallName(LC), std::move(Args));
+    if (!lowerCallTo(CLI))
+      return false;
+    updateValueMap(II, CLI.ResultReg);
+    return true;
+  }
+  case Intrinsic::fabs: {
+    MVT VT;
+    if (!isTypeLegal(II->getType(), VT))
+      return false;
+
+    unsigned Opc;
+    switch (VT.SimpleTy) {
+    default:
+      return false;
+    case MVT::f32:
+      Opc = AArch64::FABSSr;
+      break;
+    case MVT::f64:
+      Opc = AArch64::FABSDr;
+      break;
+    }
+    unsigned SrcReg = getRegForValue(II->getOperand(0));
+    if (!SrcReg)
+      return false;
+    bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+        .addImm(1);
+    return true;
+  }
+  case Intrinsic::sqrt: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    unsigned Op0Reg = getRegForValue(II->getOperand(0));
+    if (!Op0Reg)
+      return false;
+    bool Op0IsKill = hasTrivialKill(II->getOperand(0));
+
+    unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+    if (!ResultReg)
+      return false;
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics.
+    const Function *Callee = II->getCalledFunction();
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    if (VT != MVT::i32 && VT != MVT::i64)
+      return false;
+
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
+    // Canonicalize immediate to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(II))
+      std::swap(LHS, RHS);
+
+    // Simplify multiplies.
+    Intrinsic::ID IID = II->getIntrinsicID();
+    switch (IID) {
+    default:
+      break;
+    case Intrinsic::smul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::sadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    case Intrinsic::umul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::uadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    }
+
+    unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
+    AArch64CC::CondCode CC = AArch64CC::Invalid;
+    switch (IID) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::uadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::HS;
+      break;
+    case Intrinsic::ssub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::usub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::LO;
+      break;
+    case Intrinsic::smul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
+                                       /*IsKill=*/false, 32);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+        ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
+                                              AArch64::sub_32);
+        emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 31, /*WantResult=*/false);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        // LHSReg and RHSReg cannot be killed by this Mul, since they are
+        // reused in the next instruction.
+        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
+                            /*IsKill=*/false);
+        unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 63, /*WantResult=*/false);
+      }
+      break;
+    }
+    case Intrinsic::umul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
+                    /*IsKill=*/false, AArch64_AM::LSR, 32,
+                    /*WantResult=*/false);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        // LHSReg and RHSReg cannot be killed by this Mul, since they are
+        // reused in the next instruction.
+        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
+                            /*IsKill=*/false);
+        unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
+                    /*IsKill=*/false, /*WantResult=*/false);
+      }
+      break;
+    }
+    }
+
+    if (MulReg) {
+      ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
+    }
+
+    ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
+                                  AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
+                                  /*IsKill=*/true, getInvertedCondCode(CC));
+    (void)ResultReg2;
+    assert((ResultReg1 + 1) == ResultReg2 &&
+           "Nonconsecutive result registers.");
+    updateValueMap(II, ResultReg1, 2);
+    return true;
+  }
+  }
+  return false;
+}
+
+bool AArch64FastISel::selectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  if (TLI.supportSwiftError() &&
+      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return false;
+
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                                     : RetCC_AArch64_AAPCS;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if ((VA.getLocInfo() != CCValAssign::Full) &&
+        (VA.getLocInfo() != CCValAssign::BCvt))
+      return false;
+
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(DL, RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+        !Subtarget->isLittleEndian())
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      bool IsZExt = Outs[0].Flags.isZExt();
+      SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
+      if (SrcReg == 0)
+        return false;
+    }
+
+    // Make the copy.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(AArch64::RET_ReallyLR));
+  for (unsigned RetReg : RetRegs)
+    MIB.addReg(RetReg, RegState::Implicit);
+  return true;
+}
+
+bool AArch64FastISel::selectTrunc(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Op = I->getOperand(0);
+  Type *SrcTy = Op->getType();
+
+  EVT SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  EVT DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 &&
+      DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg)
+    return false;
+  bool SrcIsKill = hasTrivialKill(Op);
+
+  // If we're truncating from i64 to a smaller non-legal type then generate an
+  // AND. Otherwise, we know the high bits are undefined and a truncate only
+  // generate a COPY. We cannot mark the source register also as result
+  // register, because this can incorrectly transfer the kill flag onto the
+  // source register.
+  unsigned ResultReg;
+  if (SrcVT == MVT::i64) {
+    uint64_t Mask = 0;
+    switch (DestVT.SimpleTy) {
+    default:
+      // Trunc i64 to i32 is handled by the target-independent fast-isel.
+      return false;
+    case MVT::i1:
+      Mask = 0x1;
+      break;
+    case MVT::i8:
+      Mask = 0xff;
+      break;
+    case MVT::i16:
+      Mask = 0xffff;
+      break;
+    }
+    // Issue an extract_subreg to get the lower 32-bits.
+    unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+                                                AArch64::sub_32);
+    // Create the AND instruction which performs the actual truncation.
+    ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
+    assert(ResultReg && "Unexpected AND instruction emission failure.");
+  } else {
+    ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SrcReg, getKillRegState(SrcIsKill));
+  }
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
+  assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
+          DestVT == MVT::i64) &&
+         "Unexpected value type.");
+  // Handle i8 and i16 as i32.
+  if (DestVT == MVT::i8 || DestVT == MVT::i16)
+    DestVT = MVT::i32;
+
+  if (IsZExt) {
+    unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+    assert(ResultReg && "Unexpected AND instruction emission failure.");
+    if (DestVT == MVT::i64) {
+      // We're ZExt i1 to i64.  The ANDWri Wd, Ws, #1 implicitly clears the
+      // upper 32 bits.  Emit a SUBREG_TO_REG to extend from Wd to Xd.
+      unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::SUBREG_TO_REG), Reg64)
+          .addImm(0)
+          .addReg(ResultReg)
+          .addImm(AArch64::sub_32);
+      ResultReg = Reg64;
+    }
+    return ResultReg;
+  } else {
+    if (DestVT == MVT::i64) {
+      // FIXME: We're SExt i1 to i64.
+      return 0;
+    }
+    return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
+                            /*TODO:IsKill=*/false, 0, 0);
+  }
+}
+
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned Opc, ZReg;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    RetVT = MVT::i32;
+    Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break;
+  case MVT::i64:
+    Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
+                          /*IsKill=*/ZReg, true);
+}
+
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill) {
+  if (RetVT != MVT::i64)
+    return 0;
+
+  return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
+                          Op0, Op0IsKill, Op1, Op1IsKill,
+                          AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill) {
+  if (RetVT != MVT::i64)
+    return 0;
+
+  return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
+                          Op0, Op0IsKill, Op1, Op1IsKill,
+                          AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  unsigned Opc = 0;
+  bool NeedTrunc = false;
+  uint64_t Mask = 0;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:  Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff;   break;
+  case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break;
+  case MVT::i32: Opc = AArch64::LSLVWr;                                  break;
+  case MVT::i64: Opc = AArch64::LSLVXr;                                  break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  if (NeedTrunc) {
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+    Op1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+                                       Op1IsKill);
+  if (NeedTrunc)
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<32+s-r,32-r> = Wn<s:0> when r > s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 4
+  // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
+  // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
+  // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
+  // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 8
+  // Wd<32+7-24,32-24> = Wn<7:0>
+  // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
+  // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
+  // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 12
+  // Wd<32+3-20,32-20> = Wn<3:0>
+  // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
+  // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext
+
+  unsigned ImmR = RegSize - Shift;
+  // Limit the width to the length of the source type.
+  unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  unsigned Opc = 0;
+  bool NeedTrunc = false;
+  uint64_t Mask = 0;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:  Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff;   break;
+  case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break;
+  case MVT::i32: Opc = AArch64::LSRVWr; break;
+  case MVT::i64: Opc = AArch64::LSRVXr; break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  if (NeedTrunc) {
+    Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+    Op0IsKill = Op1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+                                       Op1IsKill);
+  if (NeedTrunc)
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<s-r:0> = Wn<s:r> when r <= s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 4
+  // Wd<7-4:0> = Wn<7:4>
+  // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 8
+  // Wd<7-7,0> = Wn<7:7>
+  // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 12
+  // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+  // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  if (Shift >= SrcBits && IsZExt)
+    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+  // It is not possible to fold a sign-extend into the LShr instruction. In this
+  // case emit a sign-extend.
+  if (!IsZExt) {
+    Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+    if (!Op0)
+      return 0;
+    Op0IsKill = true;
+    SrcVT = RetVT;
+    SrcBits = SrcVT.getSizeInBits();
+    IsZExt = true;
+  }
+
+  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+  unsigned ImmS = SrcBits - 1;
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  unsigned Opc = 0;
+  bool NeedTrunc = false;
+  uint64_t Mask = 0;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:  Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff;   break;
+  case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break;
+  case MVT::i32: Opc = AArch64::ASRVWr;                                  break;
+  case MVT::i64: Opc = AArch64::ASRVXr;                                  break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  if (NeedTrunc) {
+    Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+    Op0IsKill = Op1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+                                       Op1IsKill);
+  if (NeedTrunc)
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<s-r:0> = Wn<s:r> when r <= s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 4
+  // Wd<7-4:0> = Wn<7:4>
+  // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 8
+  // Wd<7-7,0> = Wn<7:7>
+  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 12
+  // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  if (Shift >= SrcBits && IsZExt)
+    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+  unsigned ImmS = SrcBits - 1;
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                     bool IsZExt) {
+  assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
+
+  // FastISel does not have plumbing to deal with extensions where the SrcVT or
+  // DestVT are odd things, so test to make sure that they are both types we can
+  // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
+  // bail out to SelectionDAG.
+  if (((DestVT != MVT::i8) && (DestVT != MVT::i16) &&
+       (DestVT != MVT::i32) && (DestVT != MVT::i64)) ||
+      ((SrcVT !=  MVT::i1) && (SrcVT !=  MVT::i8) &&
+       (SrcVT !=  MVT::i16) && (SrcVT !=  MVT::i32)))
+    return 0;
+
+  unsigned Opc;
+  unsigned Imm = 0;
+
+  switch (SrcVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+    return emiti1Ext(SrcReg, DestVT, IsZExt);
+  case MVT::i8:
+    if (DestVT == MVT::i64)
+      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    else
+      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+    Imm = 7;
+    break;
+  case MVT::i16:
+    if (DestVT == MVT::i64)
+      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    else
+      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+    Imm = 15;
+    break;
+  case MVT::i32:
+    assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
+    Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    Imm = 31;
+    break;
+  }
+
+  // Handle i8 and i16 as i32.
+  if (DestVT == MVT::i8 || DestVT == MVT::i16)
+    DestVT = MVT::i32;
+  else if (DestVT == MVT::i64) {
+    unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Src64)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addImm(AArch64::sub_32);
+    SrcReg = Src64;
+  }
+
+  const TargetRegisterClass *RC =
+      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
+}
+
+static bool isZExtLoad(const MachineInstr *LI) {
+  switch (LI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDURBBi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURWi:
+  case AArch64::LDRBBui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRWroX:
+  case AArch64::LDRBBroW:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRWroW:
+    return true;
+  }
+}
+
+static bool isSExtLoad(const MachineInstr *LI) {
+  switch (LI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSWroW:
+    return true;
+  }
+}
+
+bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
+                                         MVT SrcVT) {
+  const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
+  if (!LI || !LI->hasOneUse())
+    return false;
+
+  // Check if the load instruction has already been selected.
+  unsigned Reg = lookUpRegForValue(LI);
+  if (!Reg)
+    return false;
+
+  MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+  if (!MI)
+    return false;
+
+  // Check if the correct load instruction has been emitted - SelectionDAG might
+  // have emitted a zero-extending load, but we need a sign-extending load.
+  bool IsZExt = isa<ZExtInst>(I);
+  const auto *LoadMI = MI;
+  if (LoadMI->getOpcode() == TargetOpcode::COPY &&
+      LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
+    unsigned LoadReg = MI->getOperand(1).getReg();
+    LoadMI = MRI.getUniqueVRegDef(LoadReg);
+    assert(LoadMI && "Expected valid instruction");
+  }
+  if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
+    return false;
+
+  // Nothing to be done.
+  if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
+    updateValueMap(I, Reg);
+    return true;
+  }
+
+  if (IsZExt) {
+    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Reg64)
+        .addImm(0)
+        .addReg(Reg, getKillRegState(true))
+        .addImm(AArch64::sub_32);
+    Reg = Reg64;
+  } else {
+    assert((MI->getOpcode() == TargetOpcode::COPY &&
+            MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
+           "Expected copy instruction");
+    Reg = MI->getOperand(1).getReg();
+    MI->eraseFromParent();
+  }
+  updateValueMap(I, Reg);
+  return true;
+}
+
+bool AArch64FastISel::selectIntExt(const Instruction *I) {
+  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+         "Unexpected integer extend instruction.");
+  MVT RetVT;
+  MVT SrcVT;
+  if (!isTypeSupported(I->getType(), RetVT))
+    return false;
+
+  if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
+    return false;
+
+  // Try to optimize already sign-/zero-extended values from load instructions.
+  if (optimizeIntExtLoad(I, RetVT, SrcVT))
+    return true;
+
+  unsigned SrcReg = getRegForValue(I->getOperand(0));
+  if (!SrcReg)
+    return false;
+  bool SrcIsKill = hasTrivialKill(I->getOperand(0));
+
+  // Try to optimize already sign-/zero-extended values from function arguments.
+  bool IsZExt = isa<ZExtInst>(I);
+  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
+    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
+      if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+        unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(AArch64::SUBREG_TO_REG), ResultReg)
+            .addImm(0)
+            .addReg(SrcReg, getKillRegState(SrcIsKill))
+            .addImm(AArch64::sub_32);
+        SrcReg = ResultReg;
+      }
+      // Conservatively clear all kill flags from all uses, because we are
+      // replacing a sign-/zero-extend instruction at IR level with a nop at MI
+      // level. The result of the instruction at IR level might have been
+      // trivially dead, which is now not longer true.
+      unsigned UseReg = lookUpRegForValue(I);
+      if (UseReg)
+        MRI.clearKillFlags(UseReg);
+
+      updateValueMap(I, SrcReg);
+      return true;
+    }
+  }
+
+  unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT DestVT = DestEVT.getSimpleVT();
+  if (DestVT != MVT::i64 && DestVT != MVT::i32)
+    return false;
+
+  unsigned DivOpc;
+  bool Is64bit = (DestVT == MVT::i64);
+  switch (ISDOpcode) {
+  default:
+    return false;
+  case ISD::SREM:
+    DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+    break;
+  case ISD::UREM:
+    DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+    break;
+  }
+  unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src1Reg)
+    return false;
+  bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+  const TargetRegisterClass *RC =
+      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
+                                     Src1Reg, /*IsKill=*/false);
+  assert(QuotReg && "Unexpected DIV instruction emission failure.");
+  // The remainder is computed as numerator - (quotient * denominator) using the
+  // MSUB instruction.
+  unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
+                                        Src1Reg, Src1IsKill, Src0Reg,
+                                        Src0IsKill);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectMul(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (VT.isVector())
+    return selectBinaryOp(I, ISD::MUL);
+
+  const Value *Src0 = I->getOperand(0);
+  const Value *Src1 = I->getOperand(1);
+  if (const auto *C = dyn_cast<ConstantInt>(Src0))
+    if (C->getValue().isPowerOf2())
+      std::swap(Src0, Src1);
+
+  // Try to simplify to a shift instruction.
+  if (const auto *C = dyn_cast<ConstantInt>(Src1))
+    if (C->getValue().isPowerOf2()) {
+      uint64_t ShiftVal = C->getValue().logBase2();
+      MVT SrcVT = VT;
+      bool IsZExt = true;
+      if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+        if (!isIntExtFree(ZExt)) {
+          MVT VT;
+          if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) {
+            SrcVT = VT;
+            IsZExt = true;
+            Src0 = ZExt->getOperand(0);
+          }
+        }
+      } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+        if (!isIntExtFree(SExt)) {
+          MVT VT;
+          if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) {
+            SrcVT = VT;
+            IsZExt = false;
+            Src0 = SExt->getOperand(0);
+          }
+        }
+      }
+
+      unsigned Src0Reg = getRegForValue(Src0);
+      if (!Src0Reg)
+        return false;
+      bool Src0IsKill = hasTrivialKill(Src0);
+
+      unsigned ResultReg =
+          emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+      if (ResultReg) {
+        updateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src1Reg)
+    return false;
+  bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+  unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectShift(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (RetVT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    unsigned ResultReg = 0;
+    uint64_t ShiftVal = C->getZExtValue();
+    MVT SrcVT = RetVT;
+    bool IsZExt = I->getOpcode() != Instruction::AShr;
+    const Value *Op0 = I->getOperand(0);
+    if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) {
+      if (!isIntExtFree(ZExt)) {
+        MVT TmpVT;
+        if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+          SrcVT = TmpVT;
+          IsZExt = true;
+          Op0 = ZExt->getOperand(0);
+        }
+      }
+    } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) {
+      if (!isIntExtFree(SExt)) {
+        MVT TmpVT;
+        if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+          SrcVT = TmpVT;
+          IsZExt = false;
+          Op0 = SExt->getOperand(0);
+        }
+      }
+    }
+
+    unsigned Op0Reg = getRegForValue(Op0);
+    if (!Op0Reg)
+      return false;
+    bool Op0IsKill = hasTrivialKill(Op0);
+
+    switch (I->getOpcode()) {
+    default: llvm_unreachable("Unexpected instruction.");
+    case Instruction::Shl:
+      ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+      break;
+    case Instruction::AShr:
+      ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+      break;
+    case Instruction::LShr:
+      ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+      break;
+    }
+    if (!ResultReg)
+      return false;
+
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (!Op0Reg)
+    return false;
+  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (!Op1Reg)
+    return false;
+  bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+  unsigned ResultReg = 0;
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unexpected instruction.");
+  case Instruction::Shl:
+    ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+    break;
+  case Instruction::AShr:
+    ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+    break;
+  case Instruction::LShr:
+    ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+    break;
+  }
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectBitCast(const Instruction *I) {
+  MVT RetVT, SrcVT;
+
+  if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT))
+    return false;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  unsigned Opc;
+  if (RetVT == MVT::f32 && SrcVT == MVT::i32)
+    Opc = AArch64::FMOVWSr;
+  else if (RetVT == MVT::f64 && SrcVT == MVT::i64)
+    Opc = AArch64::FMOVXDr;
+  else if (RetVT == MVT::i32 && SrcVT == MVT::f32)
+    Opc = AArch64::FMOVSWr;
+  else if (RetVT == MVT::i64 && SrcVT == MVT::f64)
+    Opc = AArch64::FMOVDXr;
+  else
+    return false;
+
+  const TargetRegisterClass *RC = nullptr;
+  switch (RetVT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type.");
+  case MVT::i32: RC = &AArch64::GPR32RegClass; break;
+  case MVT::i64: RC = &AArch64::GPR64RegClass; break;
+  case MVT::f32: RC = &AArch64::FPR32RegClass; break;
+  case MVT::f64: RC = &AArch64::FPR64RegClass; break;
+  }
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (!Op0Reg)
+    return false;
+  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+  unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectFRem(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  RTLIB::Libcall LC;
+  switch (RetVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::f32:
+    LC = RTLIB::REM_F32;
+    break;
+  case MVT::f64:
+    LC = RTLIB::REM_F64;
+    break;
+  }
+
+  ArgListTy Args;
+  Args.reserve(I->getNumOperands());
+
+  // Populate the argument list.
+  for (auto &Arg : I->operands()) {
+    ArgListEntry Entry;
+    Entry.Val = Arg;
+    Entry.Ty = Arg->getType();
+    Args.push_back(Entry);
+  }
+
+  CallLoweringInfo CLI;
+  MCContext &Ctx = MF->getContext();
+  CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), I->getType(),
+                TLI.getLibcallName(LC), std::move(Args));
+  if (!lowerCallTo(CLI))
+    return false;
+  updateValueMap(I, CLI.ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectSDiv(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  if (!isa<ConstantInt>(I->getOperand(1)))
+    return selectBinaryOp(I, ISD::SDIV);
+
+  const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
+  if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
+      !(C.isPowerOf2() || (-C).isPowerOf2()))
+    return selectBinaryOp(I, ISD::SDIV);
+
+  unsigned Lg2 = C.countTrailingZeros();
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+  if (cast<BinaryOperator>(I)->isExact()) {
+    unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
+    if (!ResultReg)
+      return false;
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
+  unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+  if (!AddReg)
+    return false;
+
+  // (Src0 < 0) ? Pow2 - 1 : 0;
+  if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+    return false;
+
+  unsigned SelectOpc;
+  const TargetRegisterClass *RC;
+  if (VT == MVT::i64) {
+    SelectOpc = AArch64::CSELXr;
+    RC = &AArch64::GPR64RegClass;
+  } else {
+    SelectOpc = AArch64::CSELWr;
+    RC = &AArch64::GPR32RegClass;
+  }
+  unsigned SelectReg =
+      fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
+                       Src0IsKill, AArch64CC::LT);
+  if (!SelectReg)
+    return false;
+
+  // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also
+  // negate the result.
+  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  unsigned ResultReg;
+  if (C.isNegative())
+    ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
+                              SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
+  else
+    ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We
+/// have to duplicate it for AArch64, because otherwise we would fail during the
+/// sign-extend emission.
+std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+  unsigned IdxN = getRegForValue(Idx);
+  if (IdxN == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return std::pair<unsigned, bool>(0, false);
+
+  bool IdxNIsKill = hasTrivialKill(Idx);
+
+  // If the index is smaller or larger than intptr_t, truncate or extend it.
+  MVT PtrVT = TLI.getPointerTy(DL);
+  EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+  if (IdxVT.bitsLT(PtrVT)) {
+    IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
+    IdxNIsKill = true;
+  } else if (IdxVT.bitsGT(PtrVT))
+    llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
+  return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+}
+
+/// This is mostly a copy of the existing FastISel GEP code, but we have to
+/// duplicate it for AArch64, because otherwise we would bail out even for
+/// simple cases. This is because the standard fastEmit functions don't cover
+/// MUL at all and ADD is lowered very inefficientily.
+bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+  unsigned N = getRegForValue(I->getOperand(0));
+  if (!N)
+    return false;
+  bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+  // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+  // into a single N = N + TotalOffset.
+  uint64_t TotalOffs = 0;
+  MVT VT = TLI.getPointerTy(DL);
+  for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+       GTI != E; ++GTI) {
+    const Value *Idx = GTI.getOperand();
+    if (auto *StTy = GTI.getStructTypeOrNull()) {
+      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+      // N = N + Offset
+      if (Field)
+        TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+    } else {
+      Type *Ty = GTI.getIndexedType();
+
+      // If this is a constant subscript, handle it quickly.
+      if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero())
+          continue;
+        // N = N + Offset
+        TotalOffs +=
+            DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
+        continue;
+      }
+      if (TotalOffs) {
+        N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+        if (!N)
+          return false;
+        NIsKill = true;
+        TotalOffs = 0;
+      }
+
+      // N = N + Idx * ElementSize;
+      uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+      std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+      unsigned IdxN = Pair.first;
+      bool IdxNIsKill = Pair.second;
+      if (!IdxN)
+        return false;
+
+      if (ElementSize != 1) {
+        unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
+        if (!C)
+          return false;
+        IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+        if (!IdxN)
+          return false;
+        IdxNIsKill = true;
+      }
+      N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+      if (!N)
+        return false;
+    }
+  }
+  if (TotalOffs) {
+    N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+    if (!N)
+      return false;
+  }
+  updateValueMap(I, N);
+  return true;
+}
+
+bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
+  assert(TM.getOptLevel() == CodeGenOpt::None &&
+         "cmpxchg survived AtomicExpand at optlevel > -O0");
+
+  auto *RetPairTy = cast<StructType>(I->getType());
+  Type *RetTy = RetPairTy->getTypeAtIndex(0U);
+  assert(RetPairTy->getTypeAtIndex(1U)->isIntegerTy(1) &&
+         "cmpxchg has a non-i1 status result");
+
+  MVT VT;
+  if (!isTypeLegal(RetTy, VT))
+    return false;
+
+  const TargetRegisterClass *ResRC;
+  unsigned Opc, CmpOpc;
+  // This only supports i32/i64, because i8/i16 aren't legal, and the generic
+  // extractvalue selection doesn't support that.
+  if (VT == MVT::i32) {
+    Opc = AArch64::CMP_SWAP_32;
+    CmpOpc = AArch64::SUBSWrs;
+    ResRC = &AArch64::GPR32RegClass;
+  } else if (VT == MVT::i64) {
+    Opc = AArch64::CMP_SWAP_64;
+    CmpOpc = AArch64::SUBSXrs;
+    ResRC = &AArch64::GPR64RegClass;
+  } else {
+    return false;
+  }
+
+  const MCInstrDesc &II = TII.get(Opc);
+
+  const unsigned AddrReg = constrainOperandRegClass(
+      II, getRegForValue(I->getPointerOperand()), II.getNumDefs());
+  const unsigned DesiredReg = constrainOperandRegClass(
+      II, getRegForValue(I->getCompareOperand()), II.getNumDefs() + 1);
+  const unsigned NewReg = constrainOperandRegClass(
+      II, getRegForValue(I->getNewValOperand()), II.getNumDefs() + 2);
+
+  const unsigned ResultReg1 = createResultReg(ResRC);
+  const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
+  const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass);
+
+  // FIXME: MachineMemOperand doesn't support cmpxchg yet.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addDef(ResultReg1)
+      .addDef(ScratchReg)
+      .addUse(AddrReg)
+      .addUse(DesiredReg)
+      .addUse(NewReg);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
+      .addDef(VT == MVT::i32 ? AArch64::WZR : AArch64::XZR)
+      .addUse(ResultReg1)
+      .addUse(DesiredReg)
+      .addImm(0);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr))
+      .addDef(ResultReg2)
+      .addUse(AArch64::WZR)
+      .addUse(AArch64::WZR)
+      .addImm(AArch64CC::NE);
+
+  assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers.");
+  updateValueMap(I, ResultReg1, 2);
+  return true;
+}
+
+bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+    return selectAddSub(I);
+  case Instruction::Mul:
+    return selectMul(I);
+  case Instruction::SDiv:
+    return selectSDiv(I);
+  case Instruction::SRem:
+    if (!selectBinaryOp(I, ISD::SREM))
+      return selectRem(I, ISD::SREM);
+    return true;
+  case Instruction::URem:
+    if (!selectBinaryOp(I, ISD::UREM))
+      return selectRem(I, ISD::UREM);
+    return true;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return selectShift(I);
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return selectLogicalOp(I);
+  case Instruction::Br:
+    return selectBranch(I);
+  case Instruction::IndirectBr:
+    return selectIndirectBr(I);
+  case Instruction::BitCast:
+    if (!FastISel::selectBitCast(I))
+      return selectBitCast(I);
+    return true;
+  case Instruction::FPToSI:
+    if (!selectCast(I, ISD::FP_TO_SINT))
+      return selectFPToInt(I, /*Signed=*/true);
+    return true;
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*Signed=*/false);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return selectIntExt(I);
+  case Instruction::Trunc:
+    if (!selectCast(I, ISD::TRUNCATE))
+      return selectTrunc(I);
+    return true;
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::SIToFP:
+    if (!selectCast(I, ISD::SINT_TO_FP))
+      return selectIntToFP(I, /*Signed=*/true);
+    return true;
+  case Instruction::UIToFP:
+    return selectIntToFP(I, /*Signed=*/false);
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
+  case Instruction::Ret:
+    return selectRet(I);
+  case Instruction::FRem:
+    return selectFRem(I);
+  case Instruction::GetElementPtr:
+    return selectGetElementPtr(I);
+  case Instruction::AtomicCmpXchg:
+    return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
+  }
+
+  // fall-back to target-independent instruction selection.
+  return selectOperator(I, I->getOpcode());
+  // Silence warnings.
+  (void)&CC_AArch64_DarwinPCS_VarArg;
+}
+
+namespace llvm {
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                        const TargetLibraryInfo *LibInfo) {
+  return new AArch64FastISel(FuncInfo, LibInfo);
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
new file mode 100644
index 000000000000..f5b8c35375f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -0,0 +1,1200 @@
+//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of TargetFrameLowering class.
+//
+// On AArch64, stack frames are structured as follows:
+//
+// The stack grows downward.
+//
+// All of the individual frame areas on the frame below are optional, i.e. it's
+// possible to create a function so that the particular area isn't present
+// in the frame.
+//
+// At function entry, the "frame" looks as follows:
+//
+// |                                   | Higher address
+// |-----------------------------------|
+// |                                   |
+// | arguments passed on the stack     |
+// |                                   |
+// |-----------------------------------| <- sp
+// |                                   | Lower address
+//
+//
+// After the prologue has run, the frame has the following general structure.
+// Note that this doesn't depict the case where a red-zone is used. Also,
+// technically the last frame area (VLAs) doesn't get created until in the
+// main function body, after the prologue is run. However, it's depicted here
+// for completeness.
+//
+// |                                   | Higher address
+// |-----------------------------------|
+// |                                   |
+// | arguments passed on the stack     |
+// |                                   |
+// |-----------------------------------|
+// |                                   |
+// | prev_fp, prev_lr                  |
+// | (a.k.a. "frame record")           |
+// |-----------------------------------| <- fp(=x29)
+// |                                   |
+// | other callee-saved registers      |
+// |                                   |
+// |-----------------------------------|
+// |.empty.space.to.make.part.below....|
+// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
+// |.the.standard.16-byte.alignment....|  compile time; if present)
+// |-----------------------------------|
+// |                                   |
+// | local variables of fixed size     |
+// | including spill slots             |
+// |-----------------------------------| <- bp(not defined by ABI,
+// |.variable-sized.local.variables....|       LLVM chooses X19)
+// |.(VLAs)............................| (size of this area is unknown at
+// |...................................|  compile time)
+// |-----------------------------------| <- sp
+// |                                   | Lower address
+//
+//
+// To access the data in a frame, at-compile time, a constant offset must be
+// computable from one of the pointers (fp, bp, sp) to access it. The size
+// of the areas with a dotted background cannot be computed at compile-time
+// if they are present, making it required to have all three of fp, bp and
+// sp to be set up to be able to access all contents in the frame areas,
+// assuming all of the frame areas are non-empty.
+//
+// For most functions, some of the frame areas are empty. For those functions,
+// it may not be necessary to set up fp or bp:
+// * A base pointer is definitely needed when there are both VLAs and local
+//   variables with more-than-default alignment requirements.
+// * A frame pointer is definitely needed when there are local variables with
+//   more-than-default alignment requirements.
+//
+// In some cases when a base pointer is not strictly needed, it is generated
+// anyway when offsets from the frame pointer to access local variables become
+// so large that the offset can't be encoded in the immediate fields of loads
+// or stores.
+//
+// FIXME: also explain the redzone concept.
+// FIXME: also explain the concept of reserved call frames.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "frame-info"
+
+static cl::opt<bool> EnableRedZone("aarch64-redzone",
+                                   cl::desc("enable use of redzone on AArch64"),
+                                   cl::init(false), cl::Hidden);
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+  if (!EnableRedZone)
+    return false;
+  // Don't use the red zone if the function explicitly asks us not to.
+  // This is typically used for kernel code.
+  if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
+    return false;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned NumBytes = AFI->getLocalStackSize();
+
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  // Retain behavior of always omitting the FP for leaf functions when possible.
+  return (MFI.hasCalls() &&
+          MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+         MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+         MFI.hasStackMap() || MFI.hasPatchPoint() ||
+         RegInfo->needsStackRealignment(MF);
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  unsigned Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = alignTo(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+      // LSL #0, and the other uses LSL #12.
+      //
+      // Most call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xffffff && "call frame too large");
+    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
+                    TII);
+  }
+  return MBB.erase(I);
+}
+
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const MCRegisterInfo *MRI = STI.getRegisterInfo();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  if (CSI.empty())
+    return;
+
+  for (const auto &Info : CSI) {
+    unsigned Reg = Info.getReg();
+    int64_t Offset =
+        MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+}
+
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer.  We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+
+  // If MBB is an entry block, use X9 as the scratch register
+  if (&MF->front() == MBB)
+    return AArch64::X9;
+
+  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+  LivePhysRegs LiveRegs(&TRI);
+  LiveRegs.addLiveIns(*MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  // Prefer X9 since it was historically used for the prologue scratch reg.
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  if (LiveRegs.available(MRI, AArch64::X9))
+    return AArch64::X9;
+
+  for (unsigned Reg : AArch64::GPR64RegClass) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+  return AArch64::NoRegister;
+}
+
+bool AArch64FrameLowering::canUseAsPrologue(
+    const MachineBasicBlock &MBB) const {
+  const MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  // Don't need a scratch register if we're not going to re-align the stack.
+  if (!RegInfo->needsStackRealignment(*MF))
+    return true;
+  // Otherwise, we can use any block as long as it has a scratch register
+  // available.
+  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
+}
+
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+    MachineFunction &MF, unsigned StackBumpBytes) const {
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  if (AFI->getLocalStackSize() == 0)
+    return false;
+
+  // 512 is the maximum immediate for stp/ldp that will be used for
+  // callee-save save/restores
+  if (StackBumpBytes >= 512)
+    return false;
+
+  if (MFI.hasVarSizedObjects())
+    return false;
+
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // This isn't strictly necessary, but it simplifies things a bit since the
+  // current RedZone handling code assumes the SP is adjusted by the
+  // callee-save save/restore code.
+  if (canUseRedZone(MF))
+    return false;
+
+  return true;
+}
+
+// Convert callee-save register save/restore instruction to do stack pointer
+// decrement/increment to allocate/deallocate the callee-save stack area by
+// converting store/load to use pre/post increment version.
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+
+  unsigned NewOpc;
+  bool NewIsUnscaled = false;
+  switch (MBBI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected callee-save save/restore opcode!");
+  case AArch64::STPXi:
+    NewOpc = AArch64::STPXpre;
+    break;
+  case AArch64::STPDi:
+    NewOpc = AArch64::STPDpre;
+    break;
+  case AArch64::STRXui:
+    NewOpc = AArch64::STRXpre;
+    NewIsUnscaled = true;
+    break;
+  case AArch64::STRDui:
+    NewOpc = AArch64::STRDpre;
+    NewIsUnscaled = true;
+    break;
+  case AArch64::LDPXi:
+    NewOpc = AArch64::LDPXpost;
+    break;
+  case AArch64::LDPDi:
+    NewOpc = AArch64::LDPDpost;
+    break;
+  case AArch64::LDRXui:
+    NewOpc = AArch64::LDRXpost;
+    NewIsUnscaled = true;
+    break;
+  case AArch64::LDRDui:
+    NewOpc = AArch64::LDRDpost;
+    NewIsUnscaled = true;
+    break;
+  }
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+  MIB.addReg(AArch64::SP, RegState::Define);
+
+  // Copy all operands other than the immediate offset.
+  unsigned OpndIdx = 0;
+  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
+       ++OpndIdx)
+    MIB.addOperand(MBBI->getOperand(OpndIdx));
+
+  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
+         "Unexpected immediate offset in first/last callee-save save/restore "
+         "instruction!");
+  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
+         "Unexpected base register in callee-save save/restore instruction!");
+  // Last operand is immediate offset that needs fixing.
+  assert(CSStackSizeInc % 8 == 0);
+  int64_t CSStackSizeIncImm = CSStackSizeInc;
+  if (!NewIsUnscaled)
+    CSStackSizeIncImm /= 8;
+  MIB.addImm(CSStackSizeIncImm);
+
+  MIB.setMIFlags(MBBI->getFlags());
+  MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+  return std::prev(MBB.erase(MBBI));
+}
+
+// Fixup callee-save register save/restore instructions to take into account
+// combined SP bump by adding the local stack size to the stack offsets.
+static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
+                                              unsigned LocalStackSize) {
+  unsigned Opc = MI.getOpcode();
+  (void)Opc;
+  assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
+          Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
+          Opc == AArch64::LDPXi || Opc == AArch64::LDPDi ||
+          Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) &&
+         "Unexpected callee-save save/restore opcode!");
+
+  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
+  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
+         "Unexpected base register in callee-save save/restore instruction!");
+  // Last operand is immediate offset that needs fixing.
+  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
+  // All generated opcodes have scaled offsets.
+  assert(LocalStackSize % 8 == 0);
+  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
+}
+
+void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+  bool HasFP = hasFP(MF);
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  int NumBytes = (int)MFI.getStackSize();
+  if (!AFI->hasStackFrame()) {
+    assert(!HasFP && "unexpected function without stack frame but with FP");
+
+    // All of the stack allocation is for locals.
+    AFI->setLocalStackSize(NumBytes);
+
+    if (!NumBytes)
+      return;
+    // REDZONE: If the stack size is less than 128 bytes, we don't need
+    // to actually allocate.
+    if (canUseRedZone(MF))
+      ++NumRedZoneFunctions;
+    else {
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
+
+      // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+      MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  auto CSStackSize = AFI->getCalleeSavedStackSize();
+  // All of the remaining stack allocations are for locals.
+  AFI->setLocalStackSize(NumBytes - CSStackSize);
+
+  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  if (CombineSPBump) {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                    MachineInstr::FrameSetup);
+    NumBytes = 0;
+  } else if (CSStackSize != 0) {
+    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
+                                                     -CSStackSize);
+    NumBytes -= CSStackSize;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  // Move past the saves of the callee-saved registers, fixing up the offsets
+  // and pre-inc if we decided to combine the callee-save and local stack
+  // pointer bump above.
+  MachineBasicBlock::iterator End = MBB.end();
+  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+    if (CombineSPBump)
+      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+    ++MBBI;
+  }
+  if (HasFP) {
+    // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
+    int FPOffset = CSStackSize - 16;
+    if (CombineSPBump)
+      FPOffset += AFI->getLocalStackSize();
+
+    // Issue    sub fp, sp, FPOffset or
+    //          mov fp,sp          when FPOffset is zero.
+    // Note: All stores of callee-saved registers are marked as "FrameSetup".
+    // This code marks the instruction(s) that set the FP also.
+    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
+                    MachineInstr::FrameSetup);
+  }
+
+  // Allocate space for the rest of the frame.
+  if (NumBytes) {
+    const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+    unsigned scratchSPReg = AArch64::SP;
+
+    if (NeedsRealignment) {
+      scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+      assert(scratchSPReg != AArch64::NoRegister);
+    }
+
+    // If we're a leaf function, try using the red zone.
+    if (!canUseRedZone(MF))
+      // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
+      // the correct value here, as NumBytes also includes padding bytes,
+      // which shouldn't be counted here.
+      emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
+
+    if (NeedsRealignment) {
+      const unsigned Alignment = MFI.getMaxAlignment();
+      const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+      assert(NrBitsToZero > 1);
+      assert(scratchSPReg != AArch64::SP);
+
+      // SUB X9, SP, NumBytes
+      //   -- X9 is temporary register, so shouldn't contain any live data here,
+      //   -- free to use. This is already produced by emitFrameOffset above.
+      // AND SP, X9, 0b11111...0000
+      // The logical immediates have a non-trivial encoding. The following
+      // formula computes the encoded immediate with all ones but
+      // NrBitsToZero zero bits as least significant bits.
+      uint32_t andMaskEncoded = (1 << 12)                         // = N
+                                | ((64 - NrBitsToZero) << 6)      // immr
+                                | ((64 - NrBitsToZero - 1) << 0); // imms
+
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+          .addReg(scratchSPReg, RegState::Kill)
+          .addImm(andMaskEncoded);
+      AFI->setStackRealigned(true);
+    }
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value of the
+  // stack pointer is at this point. Any variable size objects will be allocated
+  // after this, so we can still use the base pointer to reference locals.
+  //
+  // FIXME: Clarify FrameSetup flags here.
+  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+  // needed.
+  if (RegInfo->hasBasePointer(MF)) {
+    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
+                     false);
+  }
+
+  if (needsFrameMoves) {
+    const DataLayout &TD = MF.getDataLayout();
+    const int StackGrowth = -TD.getPointerSize(0);
+    unsigned FramePtr = RegInfo->getFrameRegister(MF);
+    // An example of the prologue:
+    //
+    //     .globl __foo
+    //     .align 2
+    //  __foo:
+    // Ltmp0:
+    //     .cfi_startproc
+    //     .cfi_personality 155, ___gxx_personality_v0
+    // Leh_func_begin:
+    //     .cfi_lsda 16, Lexception33
+    //
+    //     stp  xa,bx, [sp, -#offset]!
+    //     ...
+    //     stp  x28, x27, [sp, #offset-32]
+    //     stp  fp, lr, [sp, #offset-16]
+    //     add  fp, sp, #offset - 16
+    //     sub  sp, sp, #1360
+    //
+    // The Stack:
+    //       +-------------------------------------------+
+    // 10000 | ........ | ........ | ........ | ........ |
+    // 10004 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10008 | ........ | ........ | ........ | ........ |
+    // 1000c | ........ | ........ | ........ | ........ |
+    //       +===========================================+
+    // 10010 |                X28 Register               |
+    // 10014 |                X28 Register               |
+    //       +-------------------------------------------+
+    // 10018 |                X27 Register               |
+    // 1001c |                X27 Register               |
+    //       +===========================================+
+    // 10020 |                Frame Pointer              |
+    // 10024 |                Frame Pointer              |
+    //       +-------------------------------------------+
+    // 10028 |                Link Register              |
+    // 1002c |                Link Register              |
+    //       +===========================================+
+    // 10030 | ........ | ........ | ........ | ........ |
+    // 10034 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10038 | ........ | ........ | ........ | ........ |
+    // 1003c | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    //
+    //     [sp] = 10030        ::    >>initial value<<
+    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
+    //     fp = sp == 10020    ::  mov fp, sp
+    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
+    //     sp == 10010         ::    >>final value<<
+    //
+    // The frame pointer (w29) points to address 10020. If we use an offset of
+    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+    // for w27, and -32 for w28:
+    //
+    //  Ltmp1:
+    //     .cfi_def_cfa w29, 16
+    //  Ltmp2:
+    //     .cfi_offset w30, -8
+    //  Ltmp3:
+    //     .cfi_offset w29, -16
+    //  Ltmp4:
+    //     .cfi_offset w27, -24
+    //  Ltmp5:
+    //     .cfi_offset w28, -32
+
+    if (HasFP) {
+      // Define the current CFA rule to use the provided FP.
+      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    } else {
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
+    // Now emit the moves for whatever callee saved regs we have (including FP,
+    // LR if those are saved).
+    emitCalleeSavedFrameMoves(MBB, MBBI);
+  }
+}
+
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL;
+  bool IsTailCallReturn = false;
+  if (MBB.end() != MBBI) {
+    DL = MBBI->getDebugLoc();
+    unsigned RetOpcode = MBBI->getOpcode();
+    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
+      RetOpcode == AArch64::TCRETURNri;
+  }
+  int NumBytes = MFI.getStackSize();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  // Initial and residual are named for consistency with the prologue. Note that
+  // in the epilogue, the residual adjustment is executed first.
+  uint64_t ArgumentPopSize = 0;
+  if (IsTailCallReturn) {
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments, this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
+  }
+
+  // The stack frame should be like below,
+  //
+  //      ----------------------                     ---
+  //      |                    |                      |
+  //      | BytesInStackArgArea|              CalleeArgStackSize
+  //      | (NumReusableBytes) |                (of tail call)
+  //      |                    |                     ---
+  //      |                    |                      |
+  //      ---------------------|        ---           |
+  //      |                    |         |            |
+  //      |   CalleeSavedReg   |         |            |
+  //      | (CalleeSavedStackSize)|      |            |
+  //      |                    |         |            |
+  //      ---------------------|         |         NumBytes
+  //      |                    |     StackSize  (StackAdjustUp)
+  //      |   LocalStackSize   |         |            |
+  //      | (covering callee   |         |            |
+  //      |       args)        |         |            |
+  //      |                    |         |            |
+  //      ----------------------        ---          ---
+  //
+  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+  //             = StackSize + ArgumentPopSize
+  //
+  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+  // it as the 2nd argument of AArch64ISD::TC_RETURN.
+
+  auto CSStackSize = AFI->getCalleeSavedStackSize();
+  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+
+  if (!CombineSPBump && CSStackSize != 0)
+    convertCalleeSaveRestoreToSPPrePostIncDec(
+        MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+
+  // Move past the restores of the callee-saved registers.
+  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator Begin = MBB.begin();
+  while (LastPopI != Begin) {
+    --LastPopI;
+    if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+      ++LastPopI;
+      break;
+    } else if (CombineSPBump)
+      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+  }
+
+  // If there is a single SP update, insert it before the ret and we're done.
+  if (CombineSPBump) {
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+                    NumBytes + ArgumentPopSize, TII,
+                    MachineInstr::FrameDestroy);
+    return;
+  }
+
+  NumBytes -= CSStackSize;
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  if (!hasFP(MF)) {
+    bool RedZone = canUseRedZone(MF);
+    // If this was a redzone leaf function, we don't need to restore the
+    // stack pointer (but we may need to pop stack args for fastcc).
+    if (RedZone && ArgumentPopSize == 0)
+      return;
+
+    bool NoCalleeSaveRestore = CSStackSize == 0;
+    int StackRestoreBytes = RedZone ? 0 : NumBytes;
+    if (NoCalleeSaveRestore)
+      StackRestoreBytes += ArgumentPopSize;
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+    // If we were able to combine the local stack pop with the argument pop,
+    // then we're done.
+    if (NoCalleeSaveRestore || ArgumentPopSize == 0)
+      return;
+    NumBytes = 0;
+  }
+
+  // Restore the original stack pointer.
+  // FIXME: Rather than doing the math here, we should instead just use
+  // non-post-indexed loads for the restores if we aren't actually going to
+  // be able to save any instructions.
+  if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+                    -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+  else if (NumBytes)
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
+                    MachineInstr::FrameDestroy);
+
+  // This must be placed after the callee-save restore code because that code
+  // assumes the SP is at the same location as it was after the callee-save save
+  // code in the prologue.
+  if (ArgumentPopSize)
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+                    ArgumentPopSize, TII, MachineInstr::FrameDestroy);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  return resolveFrameIndexReference(MF, FI, FrameReg);
+}
+
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+                                                     int FI, unsigned &FrameReg,
+                                                     bool PreferFP) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  int FPOffset = MFI.getObjectOffset(FI) + 16;
+  int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
+  bool isFixed = MFI.isFixedObjectIndex(FI);
+
+  // Use frame pointer to reference fixed objects. Use it for locals if
+  // there are VLAs or a dynamically realigned SP (and thus the SP isn't
+  // reliable as a base). Make sure useFPForScavengingIndex() does the
+  // right thing for the emergency spill slot.
+  bool UseFP = false;
+  if (AFI->hasStackFrame()) {
+    // Note: Keeping the following as multiple 'if' statements rather than
+    // merging to a single expression for readability.
+    //
+    // Argument access should always use the FP.
+    if (isFixed) {
+      UseFP = hasFP(MF);
+    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) &&
+               !RegInfo->needsStackRealignment(MF)) {
+      // Use SP or FP, whichever gives us the best chance of the offset
+      // being in range for direct access. If the FPOffset is positive,
+      // that'll always be best, as the SP will be even further away.
+      // If the FPOffset is negative, we have to keep in mind that the
+      // available offset range for negative offsets is smaller than for
+      // positive ones. If we have variable sized objects, we're stuck with
+      // using the FP regardless, though, as the SP offset is unknown
+      // and we don't have a base pointer available. If an offset is
+      // available via the FP and the SP, use whichever is closest.
+      if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
+          (FPOffset >= -256 && Offset > -FPOffset))
+        UseFP = true;
+    }
+  }
+
+  assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
+         "In the presence of dynamic stack pointer realignment, "
+         "non-argument objects cannot be accessed through the frame pointer");
+
+  if (UseFP) {
+    FrameReg = RegInfo->getFrameRegister(MF);
+    return FPOffset;
+  }
+
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else {
+    FrameReg = AArch64::SP;
+    // If we're using the red zone for this function, the SP won't actually
+    // be adjusted, so the offsets will be negative. They're also all
+    // within range of the signed 9-bit immediate instructions.
+    if (canUseRedZone(MF))
+      Offset -= AFI->getLocalStackSize();
+  }
+
+  return Offset;
+}
+
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+  // Do not set a kill flag on values that are also marked as live-in. This
+  // happens with the @llvm-returnaddress intrinsic and with arguments passed in
+  // callee saved registers.
+  // Omitting the kill flags is conservatively correct even if the live-in
+  // is not used after all.
+  bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
+  return getKillRegState(!IsLiveIn);
+}
+
+static bool produceCompactUnwindFrame(MachineFunction &MF) {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  AttributeSet Attrs = MF.getFunction()->getAttributes();
+  return Subtarget.isTargetMachO() &&
+         !(Subtarget.getTargetLowering()->supportSwiftError() &&
+           Attrs.hasAttrSomewhere(Attribute::SwiftError));
+}
+
+namespace {
+struct RegPairInfo {
+  RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
+  unsigned Reg1;
+  unsigned Reg2;
+  int FrameIdx;
+  int Offset;
+  bool IsGPR;
+  bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+};
+} // end anonymous namespace
+
+static void computeCalleeSaveRegisterPairs(
+    MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {
+
+  if (CSI.empty())
+    return;
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  CallingConv::ID CC = MF.getFunction()->getCallingConv();
+  unsigned Count = CSI.size();
+  (void)CC;
+  // MachO's compact unwind format relies on all registers being stored in
+  // pairs.
+  assert((!produceCompactUnwindFrame(MF) ||
+          CC == CallingConv::PreserveMost ||
+          (Count & 1) == 0) &&
+         "Odd number of callee-saved regs to spill!");
+  unsigned Offset = AFI->getCalleeSavedStackSize();
+
+  for (unsigned i = 0; i < Count; ++i) {
+    RegPairInfo RPI;
+    RPI.Reg1 = CSI[i].getReg();
+
+    assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
+           AArch64::FPR64RegClass.contains(RPI.Reg1));
+    RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);
+
+    // Add the next reg to the pair if it is in the same register class.
+    if (i + 1 < Count) {
+      unsigned NextReg = CSI[i + 1].getReg();
+      if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
+          (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
+        RPI.Reg2 = NextReg;
+    }
+
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    //
+    // The order of the registers in the list is controlled by
+    // getCalleeSavedRegs(), so they will always be in-order, as well.
+    assert((!RPI.isPaired() ||
+            (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
+           "Out of order callee saved regs!");
+
+    // MachO's compact unwind format relies on all registers being stored in
+    // adjacent register pairs.
+    assert((!produceCompactUnwindFrame(MF) ||
+            CC == CallingConv::PreserveMost ||
+            (RPI.isPaired() &&
+             ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
+              RPI.Reg1 + 1 == RPI.Reg2))) &&
+           "Callee-save registers not saved as adjacent register pair!");
+
+    RPI.FrameIdx = CSI[i].getFrameIdx();
+
+    if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
+      // Round up size of non-pair to pair size if we need to pad the
+      // callee-save area to ensure 16-byte alignment.
+      Offset -= 16;
+      assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
+      MFI.setObjectAlignment(RPI.FrameIdx, 16);
+      AFI->setCalleeSaveStackHasFreeSpace(true);
+    } else
+      Offset -= RPI.isPaired() ? 16 : 8;
+    assert(Offset % 8 == 0);
+    RPI.Offset = Offset / 8;
+    assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+           "Offset out of bounds for LDP/STP immediate");
+
+    RegPairs.push_back(RPI);
+    if (RPI.isPaired())
+      ++i;
+  }
+}
+
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  DebugLoc DL;
+  SmallVector<RegPairInfo, 8> RegPairs;
+
+  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+  for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
+       ++RPII) {
+    RegPairInfo RPI = *RPII;
+    unsigned Reg1 = RPI.Reg1;
+    unsigned Reg2 = RPI.Reg2;
+    unsigned StrOpc;
+
+    // Issue sequence of spills for cs regs.  The first spill may be converted
+    // to a pre-decrement store later by emitPrologue if the callee-save stack
+    // area allocation can't be combined with the local stack area allocation.
+    // For example:
+    //    stp     x22, x21, [sp, #0]     // addImm(+0)
+    //    stp     x20, x19, [sp, #16]    // addImm(+2)
+    //    stp     fp, lr, [sp, #32]      // addImm(+4)
+    // Rationale: This sequence saves uop updates compared to a sequence of
+    // pre-increment spills like stp xi,xj,[sp,#-16]!
+    // Note: Similar rationale and sequence for restores in epilog.
+    if (RPI.IsGPR)
+      StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
+    else
+      StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
+    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1);
+          if (RPI.isPaired())
+            dbgs() << ", " << TRI->getName(Reg2);
+          dbgs() << ") -> fi#(" << RPI.FrameIdx;
+          if (RPI.isPaired())
+            dbgs() << ", " << RPI.FrameIdx+1;
+          dbgs() << ")\n");
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+    MBB.addLiveIn(Reg1);
+    if (RPI.isPaired()) {
+      MBB.addLiveIn(Reg2);
+      MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachineMemOperand::MOStore, 8, 8));
+    }
+    MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
+        .addReg(AArch64::SP)
+        .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit
+        .setMIFlag(MachineInstr::FrameSetup);
+    MIB.addMemOperand(MF.getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachineMemOperand::MOStore, 8, 8));
+  }
+  return true;
+}
+
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  DebugLoc DL;
+  SmallVector<RegPairInfo, 8> RegPairs;
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+
+  for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
+       ++RPII) {
+    RegPairInfo RPI = *RPII;
+    unsigned Reg1 = RPI.Reg1;
+    unsigned Reg2 = RPI.Reg2;
+
+    // Issue sequence of restores for cs regs. The last restore may be converted
+    // to a post-increment load later by emitEpilogue if the callee-save stack
+    // area allocation can't be combined with the local stack area allocation.
+    // For example:
+    //    ldp     fp, lr, [sp, #32]       // addImm(+4)
+    //    ldp     x20, x19, [sp, #16]     // addImm(+2)
+    //    ldp     x22, x21, [sp, #0]      // addImm(+0)
+    // Note: see comment in spillCalleeSavedRegisters()
+    unsigned LdrOpc;
+    if (RPI.IsGPR)
+      LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
+    else
+      LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
+    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1);
+          if (RPI.isPaired())
+            dbgs() << ", " << TRI->getName(Reg2);
+          dbgs() << ") -> fi#(" << RPI.FrameIdx;
+          if (RPI.isPaired())
+            dbgs() << ", " << RPI.FrameIdx+1;
+          dbgs() << ")\n");
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    if (RPI.isPaired()) {
+      MIB.addReg(Reg2, getDefRegState(true));
+      MIB.addMemOperand(MF.getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachineMemOperand::MOLoad, 8, 8));
+    }
+    MIB.addReg(Reg1, getDefRegState(true))
+        .addReg(AArch64::SP)
+        .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit
+        .setMIFlag(MachineInstr::FrameDestroy);
+    MIB.addMemOperand(MF.getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachineMemOperand::MOLoad, 8, 8));
+  }
+  return true;
+}
+
+void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                BitVector &SavedRegs,
+                                                RegScavenger *RS) const {
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned UnspilledCSGPR = AArch64::NoRegister;
+  unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
+
+  // The frame record needs to be created by saving the appropriate registers
+  if (hasFP(MF)) {
+    SavedRegs.set(AArch64::FP);
+    SavedRegs.set(AArch64::LR);
+  }
+
+  unsigned BasePointerReg = AArch64::NoRegister;
+  if (RegInfo->hasBasePointer(MF))
+    BasePointerReg = RegInfo->getBaseRegister();
+
+  bool ExtraCSSpill = false;
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  // Figure out which callee-saved registers to save/restore.
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    const unsigned Reg = CSRegs[i];
+
+    // Add the base pointer register to SavedRegs if it is callee-save.
+    if (Reg == BasePointerReg)
+      SavedRegs.set(Reg);
+
+    bool RegUsed = SavedRegs.test(Reg);
+    unsigned PairedReg = CSRegs[i ^ 1];
+    if (!RegUsed) {
+      if (AArch64::GPR64RegClass.contains(Reg) &&
+          !RegInfo->isReservedReg(MF, Reg)) {
+        UnspilledCSGPR = Reg;
+        UnspilledCSGPRPaired = PairedReg;
+      }
+      continue;
+    }
+
+    // MachO's compact unwind format relies on all registers being stored in
+    // pairs.
+    // FIXME: the usual format is actually better if unwinding isn't needed.
+    if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
+      SavedRegs.set(PairedReg);
+      if (AArch64::GPR64RegClass.contains(PairedReg) &&
+          !RegInfo->isReservedReg(MF, PairedReg))
+        ExtraCSSpill = true;
+    }
+  }
+
+  DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
+        for (int Reg = SavedRegs.find_first(); Reg != -1;
+             Reg = SavedRegs.find_next(Reg))
+          dbgs() << ' ' << PrintReg(Reg, RegInfo);
+        dbgs() << "\n";);
+
+  // If any callee-saved registers are used, the frame cannot be eliminated.
+  unsigned NumRegsSpilled = SavedRegs.count();
+  bool CanEliminateFrame = NumRegsSpilled == 0;
+
+  // FIXME: Set BigStack if any stack slot references may be out of range.
+  // For now, just conservatively guestimate based on unscaled indexing
+  // range. We'll end up allocating an unnecessary spill slot a lot, but
+  // realistically that's not a big deal at this stage of the game.
+  // The CSR spill slots have not been allocated yet, so estimateStackSize
+  // won't include them.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
+  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+  bool BigStack = (CFSize >= 256);
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+    AFI->setHasStackFrame(true);
+
+  // Estimate if we might need to scavenge a register at some point in order
+  // to materialize a stack offset. If so, either spill one additional
+  // callee-saved register or reserve a special spill slot to facilitate
+  // register scavenging. If we already spilled an extra callee-saved register
+  // above to keep the number of spills even, we don't need to do anything else
+  // here.
+  if (BigStack && !ExtraCSSpill) {
+    if (UnspilledCSGPR != AArch64::NoRegister) {
+      DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
+            << " to get a scratch register.\n");
+      SavedRegs.set(UnspilledCSGPR);
+      // MachO's compact unwind format relies on all registers being stored in
+      // pairs, so if we need to spill one extra for BigStack, then we need to
+      // store the pair.
+      if (produceCompactUnwindFrame(MF))
+        SavedRegs.set(UnspilledCSGPRPaired);
+      ExtraCSSpill = true;
+      NumRegsSpilled = SavedRegs.count();
+    }
+
+    // If we didn't find an extra callee-saved register to spill, create
+    // an emergency spill slot.
+    if (!ExtraCSSpill) {
+      const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+      int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+      RS->addScavengingFrameIndex(FI);
+      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+                   << " as the emergency spill slot.\n");
+    }
+  }
+
+  // Round up to register pair alignment to avoid additional SP adjustment
+  // instructions.
+  AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+}
+
+bool AArch64FrameLowering::enableStackSlotScavenging(
+    const MachineFunction &MF) const {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  return AFI->hasCalleeSaveStackFreeSpace();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
new file mode 100644
index 000000000000..f254ea9b70aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -0,0 +1,79 @@
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class AArch64FrameLowering : public TargetFrameLowering {
+public:
+  explicit AArch64FrameLowering()
+      : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+                            true /*StackRealignable*/) {}
+
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI) const;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg,
+                                 bool PreferFP = false) const;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  /// \brief Can this function use the red zone for local allocations.
+  bool canUseRedZone(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+
+  /// Returns true if the target will correctly handle shrink wrapping.
+  bool enableShrinkWrapping(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+
+private:
+  bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+                                      unsigned StackBumpBytes) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
new file mode 100644
index 000000000000..e927d58ad612
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -0,0 +1,173 @@
+//===- AArch64GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by AArch64RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace llvm {
+namespace AArch64 {
+
+RegisterBank GPRRegBank;
+RegisterBank FPRRegBank;
+RegisterBank CCRRegBank;
+
+RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
+
+// PartialMappings.
+enum PartialMappingIdx {
+  PMI_None = -1,
+  PMI_GPR32 = 1,
+  PMI_GPR64,
+  PMI_FPR32,
+  PMI_FPR64,
+  PMI_FPR128,
+  PMI_FPR256,
+  PMI_FPR512,
+  PMI_FirstGPR = PMI_GPR32,
+  PMI_LastGPR = PMI_GPR64,
+  PMI_FirstFPR = PMI_FPR32,
+  PMI_LastFPR = PMI_FPR512,
+  PMI_Min = PMI_FirstGPR,
+};
+
+static unsigned getRegBankBaseIdxOffset(unsigned Size) {
+  assert(Size && "0-sized type!!");
+  // Make anything smaller than 32 gets 32
+  Size = ((Size + 31) / 32) * 32;
+  // 32 is 0, 64 is 1, 128 is 2, and so on.
+  return Log2_32(Size) - /*Log2_32(32)=*/ 5;
+}
+
+RegisterBankInfo::PartialMapping PartMappings[] {
+  /* StartIdx, Length, RegBank */
+  // 0: GPR 32-bit value.
+  {0, 32, GPRRegBank},
+  // 1: GPR 64-bit value.
+  {0, 64, GPRRegBank},
+  // 2: FPR 32-bit value.
+  {0, 32, FPRRegBank},
+  // 3: FPR 64-bit value.
+  {0, 64, FPRRegBank},
+  // 4: FPR 128-bit value.
+  {0, 128, FPRRegBank},
+  // 5: FPR 256-bit value.
+  {0, 256, FPRRegBank},
+  // 6: FPR 512-bit value.
+  {0, 512, FPRRegBank}
+};
+
+enum ValueMappingIdx {
+  First3OpsIdx = 0,
+  Last3OpsIdx = 18,
+  DistanceBetweenRegBanks = 3,
+  FirstCrossRegCpyIdx = 21,
+  LastCrossRegCpyIdx = 27,
+  DistanceBetweenCrossRegCpy = 2
+};
+
+// ValueMappings.
+RegisterBankInfo::ValueMapping ValMappings[]{
+    /* BreakDown, NumBreakDowns */
+    // 3-operands instructions (all binary operations should end up with one of
+    // those mapping).
+    // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 3: GPR 64-bit value.
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    // 6: FPR 32-bit value.
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 9: FPR 64-bit value.
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 12: FPR 128-bit value.
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR128 - PMI_Min], 1},
+    // 15: FPR 256-bit value.
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR256 - PMI_Min], 1},
+    // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+    // Cross register bank copies.
+    // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+    //                                               FirstCrossRegCpyIdx.
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    // 23: GPR 64-bit value to FPR 64-bit value.
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    // 25: FPR 32-bit value to GPR 32-bit value.
+    {&PartMappings[PMI_FPR32 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR32 - PMI_Min], 1},
+    // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+    //                                               LastCrossRegCpyIdx.
+    {&PartMappings[PMI_FPR64 - PMI_Min], 1},
+    {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+};
+
+/// Get the pointer to the ValueMapping representing the RegisterBank
+/// at \p RBIdx with a size of \p Size.
+///
+/// The returned mapping works for instructions with the same kind of
+/// operands for up to 3 operands.
+///
+/// \pre \p RBIdx != PartialMappingIdx::None
+const RegisterBankInfo::ValueMapping *
+getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+  assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
+  unsigned ValMappingIdx = First3OpsIdx +
+                           (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
+                            getRegBankBaseIdxOffset(Size)) *
+                               ValueMappingIdx::DistanceBetweenRegBanks;
+  assert(ValMappingIdx >= AArch64::First3OpsIdx &&
+         ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+
+  return &ValMappings[ValMappingIdx];
+}
+
+/// Get the pointer to the ValueMapping of the operands of a copy
+/// instruction from a GPR or FPR register to a GPR or FPR register
+/// with a size of \p Size.
+///
+/// If \p DstIsGPR is true, the destination of the copy is on GPR,
+/// otherwise it is on FPR. Same thing for \p SrcIsGPR.
+const RegisterBankInfo::ValueMapping *
+getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
+  PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+  PartialMappingIdx SrcRBIdx = SrcIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+  if (DstRBIdx == SrcRBIdx)
+    return getValueMapping(DstRBIdx, Size);
+  assert(Size <= 64 && "GPR cannot handle that size");
+  unsigned ValMappingIdx =
+      FirstCrossRegCpyIdx +
+      (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) *
+          ValueMappingIdx::DistanceBetweenCrossRegCpy;
+  assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
+         ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
+         "Mapping out of bound");
+  return &ValMappings[ValMappingIdx];
+}
+
+} // End AArch64 namespace.
+} // End llvm namespace.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..3099383e5b32
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -0,0 +1,3966 @@
+//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-isel"
+
+//===--------------------------------------------------------------------===//
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+class AArch64DAGToDAGISel : public SelectionDAGISel {
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  bool ForCodeSize;
+
+public:
+  explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
+                               CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
+        ForCodeSize(false) {}
+
+  StringRef getPassName() const override {
+    return "AArch64 Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    ForCodeSize = MF.getFunction()->optForSize();
+    Subtarget = &MF.getSubtarget<AArch64Subtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  void Select(SDNode *Node) override;
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  bool tryMLAV64LaneV128(SDNode *N);
+  bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
+  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, false, Reg, Shift);
+  }
+  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, true, Reg, Shift);
+  }
+  bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 16, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
+  }
+
+  template<int Width>
+  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
+
+  template<int Width>
+  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
+
+
+  /// Form sequences of consecutive 64/128-bit registers for use in NEON
+  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+  /// between 1 and 4 elements. If it contains a single element that is returned
+  /// unchanged; otherwise a REG_SEQUENCE value is returned.
+  SDValue createDTuple(ArrayRef<SDValue> Vecs);
+  SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+  /// Generic helper for the createDTuple/createQTuple
+  /// functions. Those should almost always be called instead.
+  SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
+                      const unsigned SubRegs[]);
+
+  void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+  bool tryIndexedLoad(SDNode *N);
+
+  void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                     unsigned SubRegIdx);
+  void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                         unsigned SubRegIdx);
+  void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  bool tryBitfieldExtractOp(SDNode *N);
+  bool tryBitfieldExtractOpFromSExt(SDNode *N);
+  bool tryBitfieldInsertOp(SDNode *N);
+  bool tryBitfieldInsertInZeroOp(SDNode *N);
+
+  bool tryReadRegister(SDNode *N);
+  bool tryWriteRegister(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+                             SDValue &Shift);
+  bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
+                               SDValue &OffImm);
+  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+                             SDValue &OffImm);
+  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+                              SDValue &OffImm);
+  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool isWorthFolding(SDValue V) const;
+  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+                         SDValue &Offset, SDValue &SignExtend);
+
+  template<unsigned RegWidth>
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+  }
+
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
+
+  void SelectCMP_SWAP(SDNode *N);
+
+};
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isIntImmediate - This method tests to see if a constant operand.
+// If so Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+  return isIntImmediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_Q:
+    // Require the address to be in a register.  That is safe for all AArch64
+    // variants and it is hard to do anything much smarter without knowing
+    // how the operand is used.
+    OutOps.push_back(Op);
+    return false;
+  }
+  return true;
+}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+                                           SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+  unsigned ShiftAmt;
+
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  SDLoc dl(N);
+  Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
+  return true;
+}
+
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  // The immediate operand must be a 24-bit zero-extended immediate.
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+  // have the opposite effect on the C flag, so this pattern mustn't match under
+  // those circumstances.
+  if (Immed == 0)
+    return false;
+
+  if (N.getValueType() == MVT::i32)
+    Immed = ~((uint32_t)Immed) + 1;
+  else
+    Immed = ~Immed + 1ULL;
+  if (Immed & 0xFFFFFFFFFF000000ULL)
+    return false;
+
+  Immed &= 0xFFFFFFULL;
+  return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
+                          Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+  switch (N.getOpcode()) {
+  default:
+    return AArch64_AM::InvalidShiftExtend;
+  case ISD::SHL:
+    return AArch64_AM::LSL;
+  case ISD::SRL:
+    return AArch64_AM::LSR;
+  case ISD::SRA:
+    return AArch64_AM::ASR;
+  case ISD::ROTR:
+    return AArch64_AM::ROR;
+  }
+}
+
+/// \brief Determine whether it is worth to fold V into an extended register.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // it hurts if the value is used at least twice, unless we are optimizing
+  // for code size.
+  return ForCodeSize || V.hasOneUse();
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
+/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not.  The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+                                                SDValue &Reg, SDValue &Shift) {
+  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+  if (ShType == AArch64_AM::InvalidShiftExtend)
+    return false;
+  if (!AllowROR && ShType == AArch64_AM::ROR)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    unsigned BitSize = N.getValueSizeInBits();
+    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+    Reg = N.getOperand(0);
+    Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
+    return isWorthFolding(N);
+  }
+
+  return false;
+}
+
+/// getExtendTypeForNode - Translate an extend node to the corresponding
+/// ExtendType value.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
+  if (N.getOpcode() == ISD::SIGN_EXTEND ||
+      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT SrcVT;
+    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
+      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
+    else
+      SrcVT = N.getOperand(0).getValueType();
+
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::SXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::SXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::SXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
+             N.getOpcode() == ISD::ANY_EXTEND) {
+    EVT SrcVT = N.getOperand(0).getValueType();
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::UXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::UXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::UXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::AND) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return AArch64_AM::InvalidShiftExtend;
+    uint64_t AndMask = CSD->getZExtValue();
+
+    switch (AndMask) {
+    default:
+      return AArch64_AM::InvalidShiftExtend;
+    case 0xFF:
+      return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFF:
+      return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFFFFFF:
+      return AArch64_AM::UXTW;
+    }
+  }
+
+  return AArch64_AM::InvalidShiftExtend;
+}
+
+// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+  if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
+      DL->getOpcode() != AArch64ISD::DUPLANE32)
+    return false;
+
+  SDValue SV = DL->getOperand(0);
+  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return false;
+
+  SDValue EV = SV.getOperand(1);
+  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+
+  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
+  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
+  LaneOp = EV.getOperand(0);
+
+  return true;
+}
+
+// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
+// high lane extract.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+                             SDValue &LaneOp, int &LaneIdx) {
+
+  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
+      return false;
+  }
+  StdOp = Op1;
+  return true;
+}
+
+/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
+/// is a lane in the upper half of a 128-bit vector.  Recognize and select this
+/// so that we don't emit unnecessary lane extracts.
+bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
+  SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
+  int LaneIdx = -1; // Will hold the lane index.
+
+  if (Op1.getOpcode() != ISD::MUL ||
+      !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                        LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (Op1.getOpcode() != ISD::MUL ||
+        !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                          LaneIdx))
+      return false;
+  }
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
+
+  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
+
+  unsigned MLAOpc = ~0U;
+
+  switch (N->getSimpleValueType(0).SimpleTy) {
+  default:
+    llvm_unreachable("Unrecognized MLA.");
+  case MVT::v4i16:
+    MLAOpc = AArch64::MLAv4i16_indexed;
+    break;
+  case MVT::v8i16:
+    MLAOpc = AArch64::MLAv8i16_indexed;
+    break;
+  case MVT::v2i32:
+    MLAOpc = AArch64::MLAv2i32_indexed;
+    break;
+  case MVT::v4i32:
+    MLAOpc = AArch64::MLAv4i32_indexed;
+    break;
+  }
+
+  ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
+  return true;
+}
+
+bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+  SDLoc dl(N);
+  SDValue SMULLOp0;
+  SDValue SMULLOp1;
+  int LaneIdx;
+
+  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+                        LaneIdx))
+    return false;
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
+
+  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+  unsigned SMULLOpc = ~0U;
+
+  if (IntNo == Intrinsic::aarch64_neon_smull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::SMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::SMULLv2i32_indexed;
+      break;
+    }
+  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::UMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::UMULLv2i32_indexed;
+      break;
+    }
+  } else
+    llvm_unreachable("Unrecognized intrinsic.");
+
+  ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
+  return true;
+}
+
+/// Instructions that accept extend modifiers like UXTW expect the register
+/// being extended to be a GPR32, but the incoming DAG might be acting on a
+/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
+/// this is the case.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+  if (N.getValueType() == MVT::i32)
+    return N;
+
+  SDLoc dl(N);
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                               dl, MVT::i32, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+
+/// SelectArithExtendedRegister - Select a "extended register" operand.  This
+/// operand folds in an extend followed by an optional left shift.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+                                                      SDValue &Shift) {
+  unsigned ShiftVal = 0;
+  AArch64_AM::ShiftExtendType Ext;
+
+  if (N.getOpcode() == ISD::SHL) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return false;
+    ShiftVal = CSD->getZExtValue();
+    if (ShiftVal > 4)
+      return false;
+
+    Ext = getExtendTypeForNode(N.getOperand(0));
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0).getOperand(0);
+  } else {
+    Ext = getExtendTypeForNode(N);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0);
+
+    // Don't match if free 32-bit -> 64-bit zext can be used instead.
+    if (Ext == AArch64_AM::UXTW &&
+        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+      return false;
+  }
+
+  // AArch64 mandates that the RHS of the operation must use the smallest
+  // register class that could contain the size being extended from.  Thus,
+  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+  // there might not be an actual 32-bit value in the program.  We can
+  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
+  assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+  Reg = narrowIfNeeded(CurDAG, Reg);
+  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
+                                    MVT::i32);
+  return isWorthFolding(N);
+}
+
+/// If there's a use of this ADDlow that's not itself a load/store then we'll
+/// need to create a real ADD instruction from it anyway and there's no point in
+/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
+/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
+/// leads to duplicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+  for (auto Use : N->uses()) {
+    if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+        Use->getOpcode() != ISD::ATOMIC_LOAD &&
+        Use->getOpcode() != ISD::ATOMIC_STORE)
+      return false;
+
+    // ldar and stlr have much more restrictive addressing modes (just a
+    // register).
+    if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
+      return false;
+  }
+
+  return true;
+}
+
+/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit
+/// immediate" address.  The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size,
+                                                  SDValue &Base,
+                                                  SDValue &OffImm) {
+  SDLoc dl(N);
+  const DataLayout &DL = CurDAG->getDataLayout();
+  const TargetLowering *TLI = getTargetLowering();
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+    return true;
+  }
+
+  // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed
+  // selected here doesn't support labels/immediates, only base+offset.
+
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int64_t RHSC = RHS->getSExtValue();
+      unsigned Scale = Log2_32(Size);
+      if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) &&
+          RHSC < (0x40 << Scale)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+        return true;
+      }
+    }
+  }
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //    add x0, Xbase, #offset
+  //    stp x1, x2, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+  return true;
+}
+
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address.  The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+                                              SDValue &Base, SDValue &OffImm) {
+  SDLoc dl(N);
+  const DataLayout &DL = CurDAG->getDataLayout();
+  const TargetLowering *TLI = getTargetLowering();
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+    OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+    return true;
+  }
+
+  if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
+    GlobalAddressSDNode *GAN =
+        dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+    Base = N.getOperand(0);
+    OffImm = N.getOperand(1);
+    if (!GAN)
+      return true;
+
+    const GlobalValue *GV = GAN->getGlobal();
+    unsigned Alignment = GV->getAlignment();
+    Type *Ty = GV->getValueType();
+    if (Alignment == 0 && Ty->isSized())
+      Alignment = DL.getABITypeAlignment(Ty);
+
+    if (Alignment >= Size)
+      return true;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int64_t RHSC = (int64_t)RHS->getZExtValue();
+      unsigned Scale = Log2_32(Size);
+      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
+        return true;
+      }
+    }
+  }
+
+  // Before falling back to our general case, check if the unscaled
+  // instructions can handle this. If so, that's preferable.
+  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+    return false;
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //    add x0, Xbase, #offset
+  //    ldr x0, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+  return true;
+}
+
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address.  This should only match when there is an offset that
+/// is not valid for a scaled immediate addressing mode.  The "Size" argument
+/// is the size in bytes of the memory reference, which is needed here to know
+/// what is valid for a scaled immediate.
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+                                                 SDValue &Base,
+                                                 SDValue &OffImm) {
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int64_t RHSC = RHS->getSExtValue();
+    // If the offset is valid as a scaled immediate, don't match here.
+    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+        RHSC < (0x1000 << Log2_32(Size)))
+      return false;
+    if (RHSC >= -256 && RHSC < 256) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        const TargetLowering *TLI = getTargetLowering();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
+      return true;
+    }
+  }
+  return false;
+}
+
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+  SDLoc dl(N);
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+  SDValue ImpDef = SDValue(
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
+  MachineSDNode *Node = CurDAG->getMachineNode(
+      TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+/// \brief Check if the given SHL node (\p N), can be used to form an
+/// extended register for an addressing mode.
+bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+                                            bool WantExtend, SDValue &Offset,
+                                            SDValue &SignExtend) {
+  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
+    return false;
+
+  SDLoc dl(N);
+  if (WantExtend) {
+    AArch64_AM::ShiftExtendType Ext =
+        getExtendTypeForNode(N.getOperand(0), true);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
+  } else {
+    Offset = N.getOperand(0);
+    SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
+  }
+
+  unsigned LegalShiftVal = Log2_32(Size);
+  unsigned ShiftVal = CSD->getZExtValue();
+
+  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+    return false;
+
+  return isWorthFolding(N);
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
+  SDLoc dl(N);
+
+  // We don't want to match immediate adds here, because they are better lowered
+  // to the register-immediate addressing modes.
+  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
+    return true;
+  }
+
+  // There was no shift, whatever else we find.
+  DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
+
+  AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
+  // Try to match an unshifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(LHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = RHS;
+    Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
+    if (isWorthFolding(LHS))
+      return true;
+  }
+
+  // Try to match an unshifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(RHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = LHS;
+    Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
+                                           MVT::i32);
+    if (isWorthFolding(RHS))
+      return true;
+  }
+
+  return false;
+}
+
+// Check if the given immediate is preferred by ADD. If an immediate can be
+// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
+// encoded by one MOVZ, return true.
+static bool isPreferredADD(int64_t ImmOff) {
+  // Constant in [0x0, 0xfff] can be encoded in ADD.
+  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+    return true;
+  // Check if it can be encoded in an "ADD LSL #12".
+  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
+    // As a single MOVZ is faster than a "ADD of LSL #12", ignore such constant.
+    return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+           (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+  return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
+  SDLoc DL(N);
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Watch out if RHS is a wide immediate, it can not be selected into
+  // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
+  // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
+  // instructions like:
+  //     MOV  X0, WideImmediate
+  //     ADD  X1, BaseReg, X0
+  //     LDR  X2, [X1, 0]
+  // For such situation, using [BaseReg, XReg] addressing mode can save one
+  // ADD/SUB:
+  //     MOV  X0, WideImmediate
+  //     LDR  X2, [BaseReg, X0]
+  if (isa<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
+    unsigned Scale = Log2_32(Size);
+    // Skip the immediate can be selected by load/store addressing mode.
+    // Also skip the immediate can be encoded by a single ADD (SUB is also
+    // checked by using -ImmOff).
+    if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+        isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+      return false;
+
+    SDValue Ops[] = { RHS };
+    SDNode *MOVI =
+        CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+    SDValue MOVIV = SDValue(MOVI, 0);
+    // This ADD of two X register will be selected into [Reg+Reg] mode.
+    N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
+    return true;
+  }
+
+  // Match any non-shifted, non-extend, non-immediate add expression.
+  Base = LHS;
+  Offset = RHS;
+  SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
+  DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
+  // Reg1 + Reg2 is free: no check needed.
+  return true;
+}
+
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+  static const unsigned RegClassIDs[] = {
+      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
+                                     AArch64::dsub2, AArch64::dsub3};
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+  static const unsigned RegClassIDs[] = {
+      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
+                                     AArch64::qsub2, AArch64::qsub3};
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+                                         const unsigned RegClassIDs[],
+                                         const unsigned SubRegs[]) {
+  // There's no special register-class for a vector-list of 1 element: it's just
+  // a vector.
+  if (Regs.size() == 1)
+    return Regs[0];
+
+  assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+  SDLoc DL(Regs[0]);
+
+  SmallVector<SDValue, 4> Ops;
+
+  // First operand of REG_SEQUENCE is the desired RegClass.
+  Ops.push_back(
+      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
+
+  // Then we get pairs of source & subregister-position for the components.
+  for (unsigned i = 0; i < Regs.size(); ++i) {
+    Ops.push_back(Regs[i]);
+    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
+  }
+
+  SDNode *N =
+      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+  return SDValue(N, 0);
+}
+
+void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
+                                      bool isExt) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  unsigned ExtOff = isExt;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  unsigned Vec0Off = ExtOff + 1;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+                               N->op_begin() + Vec0Off + NumVecs);
+  SDValue RegSeq = createQTuple(Regs);
+
+  SmallVector<SDValue, 6> Ops;
+  if (isExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+}
+
+bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isUnindexed())
+    return false;
+  EVT VT = LD->getMemoryVT();
+  EVT DstVT = N->getValueType(0);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+
+  // We're not doing validity checking here. That was done when checking
+  // if we should mark the load as indexed or not. We're just selecting
+  // the right instruction.
+  unsigned Opcode = 0;
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  bool InsertTo64 = false;
+  if (VT == MVT::i64)
+    Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
+  else if (VT == MVT::i32) {
+    if (ExtType == ISD::NON_EXTLOAD)
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+    else if (ExtType == ISD::SEXTLOAD)
+      Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
+    else {
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+      InsertTo64 = true;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i16) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i8) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::f16) {
+    Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+  } else if (VT == MVT::f32) {
+    Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+  } else if (VT == MVT::f64 || VT.is64BitVector()) {
+    Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
+  } else if (VT.is128BitVector()) {
+    Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+  } else
+    return false;
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+  int OffsetVal = (int)OffsetOp->getZExtValue();
+  SDLoc dl(N);
+  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
+  SDValue Ops[] = { Base, Offset, Chain };
+  SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
+                                       MVT::Other, Ops);
+  // Either way, we're replacing the node, so tell the caller that.
+  SDValue LoadedVal = SDValue(Res, 1);
+  if (InsertTo64) {
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+    LoadedVal =
+        SDValue(CurDAG->getMachineNode(
+                    AArch64::SUBREG_TO_REG, dl, MVT::i64,
+                    CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
+                    SubReg),
+                0);
+  }
+
+  ReplaceUses(SDValue(N, 0), LoadedVal);
+  ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
+  ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+  CurDAG->RemoveDeadNode(N);
+  return true;
+}
+
+void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                                     unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(2), // Mem operand;
+                   Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i),
+        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc, unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(1), // Mem operand
+                   N->getOperand(2), // Incremental
+                   Chain};
+
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        MVT::Untyped, MVT::Other};
+
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Update uses of write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of vector list
+  SDValue SuperReg = SDValue(Ld, 1);
+  if (NumVecs == 1)
+    ReplaceUses(SDValue(N, 0), SuperReg);
+  else
+    for (unsigned i = 0; i < NumVecs; ++i)
+      ReplaceUses(SDValue(N, i),
+          CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+  // Update the chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+                                      unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+  SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceNode(N, St);
+}
+
+void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+                                          unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  const EVT ResTys[] = {MVT::i64,    // Type of the write back register
+                        MVT::Other}; // Type for the Chain
+
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
+  SDValue Ops[] = {RegSeq,
+                   N->getOperand(NumVecs + 1), // base register
+                   N->getOperand(NumVecs + 2), // Incremental
+                   N->getOperand(0)};          // Chain
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  ReplaceNode(N, St);
+}
+
+namespace {
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+  SelectionDAG &DAG;
+
+public:
+  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+  SDValue operator()(SDValue V64Reg) {
+    EVT VT = V64Reg.getValueType();
+    unsigned NarrowSize = VT.getVectorNumElements();
+    MVT EltTy = VT.getVectorElementType().getSimpleVT();
+    MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+    SDLoc DL(V64Reg);
+
+    SDValue Undef =
+        SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+    return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
+  }
+};
+} // namespace
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
+                                    V128Reg);
+}
+
+void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    transform(Regs, Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+
+  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+  static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+                                    AArch64::qsub2, AArch64::qsub3 };
+  for (unsigned i = 0; i < NumVecs; ++i) {
+    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
+    if (Narrow)
+      NV = NarrowVector(NV, *CurDAG);
+    ReplaceUses(SDValue(N, i), NV);
+  }
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+                                             unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    transform(Regs, Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        RegSeq->getValueType(0), MVT::Other};
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SDValue Ops[] = {RegSeq,
+                   CurDAG->getTargetConstant(LaneNo, dl,
+                                             MVT::i64),         // Lane Number
+                   N->getOperand(NumVecs + 2),                  // Base register
+                   N->getOperand(NumVecs + 3),                  // Incremental
+                   N->getOperand(0)};
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Update uses of the write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of the vector list
+  SDValue SuperReg = SDValue(Ld, 1);
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0),
+                Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+  } else {
+    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+    static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
+                                      AArch64::qsub2, AArch64::qsub3 };
+    for (unsigned i = 0; i < NumVecs; ++i) {
+      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+                                                  SuperReg);
+      if (Narrow)
+        NV = NarrowVector(NV, *CurDAG);
+      ReplaceUses(SDValue(N, i), NV);
+    }
+  }
+
+  // Update the Chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+                                          unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    transform(Regs, Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 3), N->getOperand(0)};
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceNode(N, St);
+}
+
+void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+                                              unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    transform(Regs, Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  const EVT ResTys[] = {MVT::i64, // Type of the write back register
+                        MVT::Other};
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
+                   N->getOperand(NumVecs + 2), // Base Register
+                   N->getOperand(NumVecs + 3), // Incremental
+                   N->getOperand(0)};
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceNode(N, St);
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+                                       unsigned &Opc, SDValue &Opd0,
+                                       unsigned &LSB, unsigned &MSB,
+                                       unsigned NumberOfIgnoredLowBits,
+                                       bool BiggerPattern) {
+  assert(N->getOpcode() == ISD::AND &&
+         "N must be a AND operation to call this function");
+
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // FIXME: simplify-demanded-bits in DAGCombine will probably have
+  // changed the AND node to a 32-bit mask operation. We'll have to
+  // undo that as part of the transform here if we want to catch all
+  // the opportunities.
+  // Currently the NumberOfIgnoredLowBits argument helps to recover
+  // form these situations when matching bigger pattern (bitfield insert).
+
+  // For unsigned extracts, check for a shift right and mask
+  uint64_t AndImm = 0;
+  if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
+    return false;
+
+  const SDNode *Op0 = N->getOperand(0).getNode();
+
+  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+  // simplified. Try to undo that
+  AndImm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+  if (AndImm & (AndImm + 1))
+    return false;
+
+  bool ClampMSB = false;
+  uint64_t SrlImm = 0;
+  // Handle the SRL + ANY_EXTEND case.
+  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
+    // Extend the incoming operand of the SRL to 64-bit.
+    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+    // Make sure to clamp the MSB so that we preserve the semantics of the
+    // original operations.
+    ClampMSB = true;
+  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+                                   SrlImm)) {
+    // If the shift result was truncated, we can still combine them.
+    Opd0 = Op0->getOperand(0).getOperand(0);
+
+    // Use the type of SRL node.
+    VT = Opd0->getValueType(0);
+  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
+    Opd0 = Op0->getOperand(0);
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift right has been performed.
+    // The resulting code will be at least as good as the original one
+    // plus it may expose more opportunities for bitfield insert pattern.
+    // FIXME: Currently we limit this to the bigger pattern, because
+    // some optimizations expect AND and not UBFM.
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  // Bail out on large immediates. This happens when no proper
+  // combining/constant folding was performed.
+  if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
+
+  LSB = SrlImm;
+  MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
+                                 : countTrailingOnes<uint64_t>(AndImm)) -
+        1;
+  if (ClampMSB)
+    // Since we're moving the extend before the right shift operation, we need
+    // to clamp the MSB to make sure we don't shift in undefined bits instead of
+    // the zeros which would get shifted in with the original right shift
+    // operation.
+    MSB = MSB > 31 ? 31 : MSB;
+
+  Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+  return true;
+}
+
+static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
+                                             SDValue &Opd0, unsigned &Immr,
+                                             unsigned &Imms) {
+  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
+
+  EVT VT = N->getValueType(0);
+  unsigned BitWidth = VT.getSizeInBits();
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  SDValue Op = N->getOperand(0);
+  if (Op->getOpcode() == ISD::TRUNCATE) {
+    Op = Op->getOperand(0);
+    VT = Op->getValueType(0);
+    BitWidth = VT.getSizeInBits();
+  }
+
+  uint64_t ShiftImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
+      !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+    return false;
+
+  unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+  if (ShiftImm + Width > BitWidth)
+    return false;
+
+  Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
+  Opd0 = Op.getOperand(0);
+  Immr = ShiftImm;
+  Imms = ShiftImm + Width - 1;
+  return true;
+}
+
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+                                          SDValue &Opd0, unsigned &LSB,
+                                          unsigned &MSB) {
+  // We are looking for the following pattern which basically extracts several
+  // continuous bits from the source value and places it from the LSB of the
+  // destination value, all other bits of the destination value or set to zero:
+  //
+  // Value2 = AND Value, MaskImm
+  // SRL Value2, ShiftImm
+  //
+  // with MaskImm >> ShiftImm to search for the bit width.
+  //
+  // This gets selected into a single UBFM:
+  //
+  // UBFM Value, ShiftImm, BitWide + SrlImm -1
+  //
+
+  if (N->getOpcode() != ISD::SRL)
+    return false;
+
+  uint64_t AndMask = 0;
+  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
+    return false;
+
+  Opd0 = N->getOperand(0).getOperand(0);
+
+  uint64_t SrlImm = 0;
+  if (!isIntImmediate(N->getOperand(1), SrlImm))
+    return false;
+
+  // Check whether we really have several bits extract here.
+  unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
+  if (BitWide && isMask_64(AndMask >> SrlImm)) {
+    if (N->getValueType(0) == MVT::i32)
+      Opc = AArch64::UBFMWri;
+    else
+      Opc = AArch64::UBFMXri;
+
+    LSB = SrlImm;
+    MSB = BitWide + SrlImm - 1;
+    return true;
+  }
+
+  return false;
+}
+
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+                                       unsigned &Immr, unsigned &Imms,
+                                       bool BiggerPattern) {
+  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+         "N must be a SHR/SRA operation to call this function");
+
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // Check for AND + SRL doing several bits extract.
+  if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
+    return true;
+
+  // We're looking for a shift of a shift.
+  uint64_t ShlImm = 0;
+  uint64_t TruncBits = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
+    Opd0 = N->getOperand(0).getOperand(0);
+  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+    // We are looking for a shift of truncate. Truncate from i64 to i32 could
+    // be considered as setting high 32 bits as zero. Our strategy here is to
+    // always generate 64bit UBFM. This consistency will help the CSE pass
+    // later find more redundancy.
+    Opd0 = N->getOperand(0).getOperand(0);
+    TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+    VT = Opd0->getValueType(0);
+    assert(VT == MVT::i64 && "the promoted type should be i64");
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift left has been performed.
+    // FIXME: Currently we limit this to the bigger pattern case,
+    // because some optimizations expect AND and not UBFM
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  // Missing combines/constant folding may have left us with strange
+  // constants.
+  if (ShlImm >= VT.getSizeInBits()) {
+    DEBUG((dbgs() << N
+           << ": Found large shift immediate, this should not happen\n"));
+    return false;
+  }
+
+  uint64_t SrlImm = 0;
+  if (!isIntImmediate(N->getOperand(1), SrlImm))
+    return false;
+
+  assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
+         "bad amount in shift node!");
+  int immr = SrlImm - ShlImm;
+  Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
+  Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
+  // SRA requires a signed extraction
+  if (VT == MVT::i32)
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
+  else
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
+  return true;
+}
+
+bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
+  assert(N->getOpcode() == ISD::SIGN_EXTEND);
+
+  EVT VT = N->getValueType(0);
+  EVT NarrowVT = N->getOperand(0)->getValueType(0);
+  if (VT != MVT::i64 || NarrowVT != MVT::i32)
+    return false;
+
+  uint64_t ShiftImm;
+  SDValue Op = N->getOperand(0);
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
+    return false;
+
+  SDLoc dl(N);
+  // Extend the incoming operand of the shift to 64-bits.
+  SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
+  unsigned Immr = ShiftImm;
+  unsigned Imms = NarrowVT.getSizeInBits() - 1;
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+                   CurDAG->getTargetConstant(Imms, dl, VT)};
+  CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
+  return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+                                SDValue &Opd0, unsigned &Immr, unsigned &Imms,
+                                unsigned NumberOfIgnoredLowBits = 0,
+                                bool BiggerPattern = false) {
+  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+    if (!N->isMachineOpcode())
+      return false;
+    break;
+  case ISD::AND:
+    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
+                                      NumberOfIgnoredLowBits, BiggerPattern);
+  case ISD::SRL:
+  case ISD::SRA:
+    return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
+
+  case ISD::SIGN_EXTEND_INREG:
+    return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
+  }
+
+  unsigned NOpc = N->getMachineOpcode();
+  switch (NOpc) {
+  default:
+    return false;
+  case AArch64::SBFMWri:
+  case AArch64::UBFMWri:
+  case AArch64::SBFMXri:
+  case AArch64::UBFMXri:
+    Opc = NOpc;
+    Opd0 = N->getOperand(0);
+    Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+    Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+    return true;
+  }
+  // Unreachable
+  return false;
+}
+
+bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
+  unsigned Opc, Immr, Imms;
+  SDValue Opd0;
+  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
+    return false;
+
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  // If the bit extract operation is 64bit but the original type is 32bit, we
+  // need to add one EXTRACT_SUBREG.
+  if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
+                       CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
+
+    SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
+    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+                                          MVT::i32, SDValue(BFM, 0), SubReg));
+    return true;
+  }
+
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
+                   CurDAG->getTargetConstant(Imms, dl, VT)};
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
+}
+
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
+                              unsigned NumberOfIgnoredHighBits, EVT VT) {
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "i32 or i64 mask type expected!");
+  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+
+  APInt SignificantDstMask = APInt(BitWidth, DstMask);
+  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
+
+  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+         (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is consider useless as soon as it is dropped and never used
+// before it as been dropped.
+// E.g., looking for useful bit of x
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, x useful bits are 0x7, then the useful bits of x, live through
+// y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used on an unpredicatable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
+  getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+                                             uint64_t Imm, uint64_t MSB,
+                                             unsigned Depth) {
+  // inherit the bitwidth value
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  if (MSB >= Imm) {
+    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+    --OpUsefulBits;
+    // The interesting part will be in the lower part of the result
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was starting at Imm in the argument
+    OpUsefulBits = OpUsefulBits.shl(Imm);
+  } else {
+    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+    --OpUsefulBits;
+    // The interesting part will be shifted in the result
+    OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was at zero in the argument
+    OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+  }
+
+  UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+                                  unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t ShiftTypeAndValue =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  APInt Mask(UsefulBits);
+  Mask.clearAllBits();
+  Mask.flipAllBits();
+
+  if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
+    // Shift Left
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.shl(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.lshr(ShiftAmt);
+  } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
+    // Shift Right
+    // We do not handle AArch64_AM::ASR, because the sign will change the
+    // number of useful bits
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.lshr(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.shl(ShiftAmt);
+  } else
+    return;
+
+  UsefulBits &= Mask;
+}
+
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+                                 unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
+  ResultUsefulBits.flipAllBits();
+  APInt Mask(UsefulBits.getBitWidth(), 0);
+
+  getUsefulBits(Op, ResultUsefulBits, Depth + 1);
+
+  if (MSB >= Imm) {
+    // The instruction is a BFXIL.
+    uint64_t Width = MSB - Imm + 1;
+    uint64_t LSB = Imm;
+
+    OpUsefulBits = OpUsefulBits.shl(Width);
+    --OpUsefulBits;
+
+    if (Op.getOperand(1) == Orig) {
+      // Copy the low bits from the result to bits starting from LSB.
+      Mask = ResultUsefulBits & OpUsefulBits;
+      Mask = Mask.shl(LSB);
+    }
+
+    if (Op.getOperand(0) == Orig)
+      // Bits starting from LSB in the input contribute to the result.
+      Mask |= (ResultUsefulBits & ~OpUsefulBits);
+  } else {
+    // The instruction is a BFI.
+    uint64_t Width = MSB + 1;
+    uint64_t LSB = UsefulBits.getBitWidth() - Imm;
+
+    OpUsefulBits = OpUsefulBits.shl(Width);
+    --OpUsefulBits;
+    OpUsefulBits = OpUsefulBits.shl(LSB);
+
+    if (Op.getOperand(1) == Orig) {
+      // Copy the bits from the result to the zero bits.
+      Mask = ResultUsefulBits & OpUsefulBits;
+      Mask = Mask.lshr(LSB);
+    }
+
+    if (Op.getOperand(0) == Orig)
+      Mask |= (ResultUsefulBits & ~OpUsefulBits);
+  }
+
+  UsefulBits &= Mask;
+}
+
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+                                SDValue Orig, unsigned Depth) {
+
+  // Users of this node should have already been instruction selected
+  // FIXME: Can we turn that into an assert?
+  if (!UserNode->isMachineOpcode())
+    return;
+
+  switch (UserNode->getMachineOpcode()) {
+  default:
+    return;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+  case AArch64::ANDWri:
+  case AArch64::ANDXri:
+    // We increment Depth only when we call the getUsefulBits
+    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::UBFMWri:
+  case AArch64::UBFMXri:
+    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
+
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    if (UserNode->getOperand(1) != Orig)
+      return;
+    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::BFMWri:
+  case AArch64::BFMXri:
+    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    if (UserNode->getOperand(0) != Orig)
+      return;
+    UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
+    return;
+
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    if (UserNode->getOperand(0) != Orig)
+      return;
+    UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
+    return;
+  }
+}
+
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+  if (Depth >= 6)
+    return;
+  // Initialize UsefulBits
+  if (!Depth) {
+    unsigned Bitwidth = Op.getScalarValueSizeInBits();
+    // At the beginning, assume every produced bits is useful
+    UsefulBits = APInt(Bitwidth, 0);
+    UsefulBits.flipAllBits();
+  }
+  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
+
+  for (SDNode *Node : Op.getNode()->uses()) {
+    // A use cannot produce useful bits
+    APInt UsefulBitsForUse = APInt(UsefulBits);
+    getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
+    UsersUsefulBits |= UsefulBitsForUse;
+  }
+  // UsefulBits contains the produced bits that are meaningful for the
+  // current definition, thus a user cannot make a bit meaningful at
+  // this point
+  UsefulBits &= UsersUsefulBits;
+}
+
+/// Create a machine node performing a notional SHL of Op by ShlAmount. If
+/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
+/// 0, return Op unchanged.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+  if (ShlAmount == 0)
+    return Op;
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned BitWidth = VT.getSizeInBits();
+  unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+
+  SDNode *ShiftNode;
+  if (ShlAmount > 0) {
+    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, dl, VT, Op,
+        CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
+        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
+  } else {
+    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+    assert(ShlAmount < 0 && "expected right shift");
+    int ShrAmount = -ShlAmount;
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
+  }
+
+  return SDValue(ShiftNode, 0);
+}
+
+/// Does this tree qualify as an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)".
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    bool BiggerPattern,
+                                    SDValue &Src, int &ShiftAmount,
+                                    int &MaskWidth) {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  (void)BitWidth;
+  assert(BitWidth == 32 || BitWidth == 64);
+
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value
+  uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+  // Discard a constant AND mask if present. It's safe because the node will
+  // already have been factored into the computeKnownBits calculation above.
+  uint64_t AndImm;
+  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+    assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+    Op = Op.getOperand(0);
+  }
+
+  // Don't match if the SHL has more than one use, since then we'll end up
+  // generating SHL+UBFIZ instead of just keeping SHL+AND.
+  if (!BiggerPattern && !Op.hasOneUse())
+    return false;
+
+  uint64_t ShlImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+    return false;
+  Op = Op.getOperand(0);
+
+  if (!isShiftedMask_64(NonZeroBits))
+    return false;
+
+  ShiftAmount = countTrailingZeros(NonZeroBits);
+  MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
+
+  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
+  // amount.  BiggerPattern is true when this pattern is being matched for BFI,
+  // BiggerPattern is false when this pattern is being matched for UBFIZ, in
+  // which case it is not profitable to insert an extra shift.
+  if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
+    return false;
+  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
+
+  return true;
+}
+
+static bool isShiftedMask(uint64_t Mask, EVT VT) {
+  assert(VT == MVT::i32 || VT == MVT::i64);
+  if (VT == MVT::i32)
+    return isShiftedMask_32(Mask);
+  return isShiftedMask_64(Mask);
+}
+
+// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
+// inserted only sets known zero bits.
+static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return false;
+
+  unsigned BitWidth = VT.getSizeInBits();
+
+  uint64_t OrImm;
+  if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
+    return false;
+
+  // Skip this transformation if the ORR immediate can be encoded in the ORR.
+  // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
+  // performance neutral.
+  if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
+    return false;
+
+  uint64_t MaskImm;
+  SDValue And = N->getOperand(0);
+  // Must be a single use AND with an immediate operand.
+  if (!And.hasOneUse() ||
+      !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
+    return false;
+
+  // Compute the Known Zero for the AND as this allows us to catch more general
+  // cases than just looking for AND with imm.
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value.
+  uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+
+  // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
+  if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+    return false;
+
+  // The bits being inserted must only set those bits that are known to be zero.
+  if ((OrImm & NotKnownZero) != 0) {
+    // FIXME:  It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
+    // currently handle this case.
+    return false;
+  }
+
+  // BFI/BFXIL dst, src, #lsb, #width.
+  int LSB = countTrailingOnes(NotKnownZero);
+  int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
+
+  // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
+  unsigned ImmR = (BitWidth - LSB) % BitWidth;
+  unsigned ImmS = Width - 1;
+
+  // If we're creating a BFI instruction avoid cases where we need more
+  // instructions to materialize the BFI constant as compared to the original
+  // ORR.  A BFXIL will use the same constant as the original ORR, so the code
+  // should be no worse in this case.
+  bool IsBFI = LSB != 0;
+  uint64_t BFIImm = OrImm >> LSB;
+  if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
+    // We have a BFI instruction and we know the constant can't be materialized
+    // with a ORR-immediate with the zero register.
+    unsigned OrChunks = 0, BFIChunks = 0;
+    for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
+      if (((OrImm >> Shift) & 0xFFFF) != 0)
+        ++OrChunks;
+      if (((BFIImm >> Shift) & 0xFFFF) != 0)
+        ++BFIChunks;
+    }
+    if (BFIChunks > OrChunks)
+      return false;
+  }
+
+  // Materialize the constant to be inserted.
+  SDLoc DL(N);
+  unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+  SDNode *MOVI = CurDAG->getMachineNode(
+      MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
+
+  // Create the BFI/BFXIL instruction.
+  SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
+                   CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
+}
+
+static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
+                                      SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return false;
+
+  unsigned BitWidth = VT.getSizeInBits();
+
+  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+  // have the expected shape. Try to undo that.
+
+  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+  // Given a OR operation, check if we have the following pattern
+  // ubfm c, b, imm, imm2 (or something that does the same jobs, see
+  //                       isBitfieldExtractOp)
+  // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
+  //                 countTrailingZeros(mask2) == imm2 - imm + 1
+  // f = d | c
+  // if yes, replace the OR instruction with:
+  // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
+
+  // OR is commutative, check all combinations of operand order and values of
+  // BiggerPattern, i.e.
+  //     Opd0, Opd1, BiggerPattern=false
+  //     Opd1, Opd0, BiggerPattern=false
+  //     Opd0, Opd1, BiggerPattern=true
+  //     Opd1, Opd0, BiggerPattern=true
+  // Several of these combinations may match, so check with BiggerPattern=false
+  // first since that will produce better results by matching more instructions
+  // and/or inserting fewer extra instructions.
+  for (int I = 0; I < 4; ++I) {
+
+    SDValue Dst, Src;
+    unsigned ImmR, ImmS;
+    bool BiggerPattern = I / 2;
+    SDValue OrOpd0Val = N->getOperand(I % 2);
+    SDNode *OrOpd0 = OrOpd0Val.getNode();
+    SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
+    SDNode *OrOpd1 = OrOpd1Val.getNode();
+
+    unsigned BFXOpc;
+    int DstLSB, Width;
+    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+                            NumberOfIgnoredLowBits, BiggerPattern)) {
+      // Check that the returned opcode is compatible with the pattern,
+      // i.e., same type and zero extended (U and not S)
+      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
+          (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
+        continue;
+
+      // Compute the width of the bitfield insertion
+      DstLSB = 0;
+      Width = ImmS - ImmR + 1;
+      // FIXME: This constraint is to catch bitfield insertion we may
+      // want to widen the pattern if we want to grab general bitfied
+      // move case
+      if (Width <= 0)
+        continue;
+
+      // If the mask on the insertee is correct, we have a BFXIL operation. We
+      // can share the ImmR and ImmS values from the already-computed UBFM.
+    } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
+                                       BiggerPattern,
+                                       Src, DstLSB, Width)) {
+      ImmR = (BitWidth - DstLSB) % BitWidth;
+      ImmS = Width - 1;
+    } else
+      continue;
+
+    // Check the second part of the pattern
+    EVT VT = OrOpd1->getValueType(0);
+    assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
+
+    // Compute the Known Zero for the candidate of the first operand.
+    // This allows to catch more general case than just looking for
+    // AND with imm. Indeed, simplify-demanded-bits may have removed
+    // the AND instruction because it proves it was useless.
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+
+    // Check if there is enough room for the second operand to appear
+    // in the first one
+    APInt BitsToBeInserted =
+        APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+    if ((BitsToBeInserted & ~KnownZero) != 0)
+      continue;
+
+    // Set the first operand
+    uint64_t Imm;
+    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
+      // In that case, we can eliminate the AND
+      Dst = OrOpd1->getOperand(0);
+    else
+      // Maybe the AND has been removed by simplify-demanded-bits
+      // or is useful because it discards more bits
+      Dst = OrOpd1Val;
+
+    // both parts match
+    SDLoc DL(N);
+    SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
+                     CurDAG->getTargetConstant(ImmS, DL, VT)};
+    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
+  // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
+  // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
+  // mask (e.g., 0x000ffff0).
+  uint64_t Mask0Imm, Mask1Imm;
+  SDValue And0 = N->getOperand(0);
+  SDValue And1 = N->getOperand(1);
+  if (And0.hasOneUse() && And1.hasOneUse() &&
+      isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
+      isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
+      APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
+      (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
+
+    // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
+    // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
+    // bits to be inserted.
+    if (isShiftedMask(Mask0Imm, VT)) {
+      std::swap(And0, And1);
+      std::swap(Mask0Imm, Mask1Imm);
+    }
+
+    SDValue Src = And1->getOperand(0);
+    SDValue Dst = And0->getOperand(0);
+    unsigned LSB = countTrailingZeros(Mask1Imm);
+    int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
+
+    // The BFXIL inserts the low-order bits from a source register, so right
+    // shift the needed bits into place.
+    SDLoc DL(N);
+    unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+    SDNode *LSR = CurDAG->getMachineNode(
+        ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
+
+    // BFXIL is an alias of BFM, so translate to BFM operands.
+    unsigned ImmR = (BitWidth - LSB) % BitWidth;
+    unsigned ImmS = Width - 1;
+
+    // Create the BFXIL instruction.
+    SDValue Ops[] = {Dst, SDValue(LSR, 0),
+                     CurDAG->getTargetConstant(ImmR, DL, VT),
+                     CurDAG->getTargetConstant(ImmS, DL, VT)};
+    unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
+    CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+    return true;
+  }
+
+  return false;
+}
+
+bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return false;
+
+  APInt NUsefulBits;
+  getUsefulBits(SDValue(N, 0), NUsefulBits);
+
+  // If all bits are not useful, just return UNDEF.
+  if (!NUsefulBits) {
+    CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+    return true;
+  }
+
+  if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
+    return true;
+
+  return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
+}
+
+/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
+/// equivalent of a left shift by a constant amount followed by an and masking
+/// out a contiguous set of bits.
+bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
+  if (N->getOpcode() != ISD::AND)
+    return false;
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return false;
+
+  SDValue Op0;
+  int DstLSB, Width;
+  if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
+                               Op0, DstLSB, Width))
+    return false;
+
+  // ImmR is the rotate right amount.
+  unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+  // ImmS is the most significant bit of the source to be moved.
+  unsigned ImmS = Width - 1;
+
+  SDLoc DL(N);
+  SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
+                   CurDAG->getTargetConstant(ImmS, DL, VT)};
+  unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
+  CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+  return true;
+}
+
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                              unsigned RegWidth) {
+  APFloat FVal(0.0);
+  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+    FVal = CN->getValueAPF();
+  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
+    // Some otherwise illegal constants are allowed in this case.
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+      return false;
+
+    ConstantPoolSDNode *CN =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+  } else
+    return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
+  return true;
+}
+
+// Inspects a register string of the form o0:op1:CRn:CRm:op2 gets the fields
+// of the string and obtains the integer values from them and combines these
+// into a single value to be used in the MRS/MSR instruction.
+static int getIntOperandFromRegisterString(StringRef RegString) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ':');
+
+  if (Fields.size() == 1)
+    return -1;
+
+  assert(Fields.size() == 5
+            && "Invalid number of fields in read register string");
+
+  SmallVector<int, 5> Ops;
+  bool AllIntFields = true;
+
+  for (StringRef Field : Fields) {
+    unsigned IntField;
+    AllIntFields &= !Field.getAsInteger(10, IntField);
+    Ops.push_back(IntField);
+  }
+
+  assert(AllIntFields &&
+          "Unexpected non-integer value in special register string.");
+
+  // Need to combine the integer fields of the string into a single value
+  // based on the bit encoding of MRS/MSR instruction.
+  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
+         (Ops[3] << 3) | (Ops[4]);
+}
+
+// Lower the read_register intrinsic to an MRS instruction node if the special
+// register string argument is either of the form detailed in the ALCE (the
+// form described in getIntOperandsFromRegsterString) or is a named register
+// known by the MRS SysReg mapper.
+bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1) {
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                       N->getOperand(0)));
+    return true;
+  }
+
+  // Use the sysreg mapper to map the remaining possible strings to the
+  // value for the register to be used for the instruction operand.
+  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+  if (TheReg && TheReg->Readable &&
+      TheReg->haveFeatures(Subtarget->getFeatureBits()))
+    Reg = TheReg->Encoding;
+  else
+    Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+
+  if (Reg != -1) {
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
+                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                       N->getOperand(0)));
+    return true;
+  }
+
+  return false;
+}
+
+// Lower the write_register intrinsic to an MSR instruction node if the special
+// register string argument is either of the form detailed in the ALCE (the
+// form described in getIntOperandsFromRegsterString) or is a named register
+// known by the MSR SysReg mapper.
+bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1) {
+    ReplaceNode(
+        N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0)));
+    return true;
+  }
+
+  // Check if the register was one of those allowed as the pstatefield value in
+  // the MSR (immediate) instruction. To accept the values allowed in the
+  // pstatefield for the MSR (immediate) instruction, we also require that an
+  // immediate value has been provided as an argument, we know that this is
+  // the case as it has been ensured by semantic checking.
+  auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());;
+  if (PMapper) {
+    assert (isa<ConstantSDNode>(N->getOperand(2))
+              && "Expected a constant integer expression.");
+    unsigned Reg = PMapper->Encoding;
+    uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned State;
+    if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) {
+      assert(Immed < 2 && "Bad imm");
+      State = AArch64::MSRpstateImm1;
+    } else {
+      assert(Immed < 16 && "Bad imm");
+      State = AArch64::MSRpstateImm4;
+    }
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       State, DL, MVT::Other,
+                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                       CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+                       N->getOperand(0)));
+    return true;
+  }
+
+  // Use the sysreg mapper to attempt to map the remaining possible strings
+  // to the value for the register to be used for the MSR (register)
+  // instruction operand.
+  auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
+  if (TheReg && TheReg->Writeable &&
+      TheReg->haveFeatures(Subtarget->getFeatureBits()))
+    Reg = TheReg->Encoding;
+  else
+    Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
+  if (Reg != -1) {
+    ReplaceNode(N, CurDAG->getMachineNode(
+                       AArch64::MSR, DL, MVT::Other,
+                       CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                       N->getOperand(2), N->getOperand(0)));
+    return true;
+  }
+
+  return false;
+}
+
+/// We've got special pseudo-instructions for these
+void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+  unsigned Opcode;
+  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+  if (MemTy == MVT::i8)
+    Opcode = AArch64::CMP_SWAP_8;
+  else if (MemTy == MVT::i16)
+    Opcode = AArch64::CMP_SWAP_16;
+  else if (MemTy == MVT::i32)
+    Opcode = AArch64::CMP_SWAP_32;
+  else if (MemTy == MVT::i64)
+    Opcode = AArch64::CMP_SWAP_64;
+  else
+    llvm_unreachable("Unknown AtomicCmpSwap type");
+
+  MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = CurDAG->getMachineNode(
+      Opcode, SDLoc(N),
+      CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AArch64DAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return;
+  }
+
+  // Few custom selection stuff.
+  EVT VT = Node->getValueType(0);
+
+  switch (Node->getOpcode()) {
+  default:
+    break;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    SelectCMP_SWAP(Node);
+    return;
+
+  case ISD::READ_REGISTER:
+    if (tryReadRegister(Node))
+      return;
+    break;
+
+  case ISD::WRITE_REGISTER:
+    if (tryWriteRegister(Node))
+      return;
+    break;
+
+  case ISD::ADD:
+    if (tryMLAV64LaneV128(Node))
+      return;
+    break;
+
+  case ISD::LOAD: {
+    // Try to select as an indexed load. Fall through to normal processing
+    // if we can't.
+    if (tryIndexedLoad(Node))
+      return;
+    break;
+  }
+
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::SRA:
+  case ISD::SIGN_EXTEND_INREG:
+    if (tryBitfieldExtractOp(Node))
+      return;
+    if (tryBitfieldInsertInZeroOp(Node))
+      return;
+    break;
+
+  case ISD::SIGN_EXTEND:
+    if (tryBitfieldExtractOpFromSExt(Node))
+      return;
+    break;
+
+  case ISD::OR:
+    if (tryBitfieldInsertOp(Node))
+      return;
+    break;
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    // Extracting lane zero is a special case where we can just use a plain
+    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copyi
+    // propagation, to reason about, so is preferred when it's possible to
+    // use it.
+    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+    // Bail and use the default Select() for non-zero lanes.
+    if (LaneNode->getZExtValue() != 0)
+      break;
+    // If the element type is not the same as the result type, likewise
+    // bail and use the default Select(), as there's more to do than just
+    // a cross-class COPY. This catches extracts of i8 and i16 elements
+    // since they will need an explicit zext.
+    if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+      break;
+    unsigned SubReg;
+    switch (Node->getOperand(0)
+                .getValueType()
+                .getVectorElementType()
+                .getSizeInBits()) {
+    default:
+      llvm_unreachable("Unexpected vector element type!");
+    case 64:
+      SubReg = AArch64::dsub;
+      break;
+    case 32:
+      SubReg = AArch64::ssub;
+      break;
+    case 16:
+      SubReg = AArch64::hsub;
+      break;
+    case 8:
+      llvm_unreachable("unexpected zext-requiring extract element!");
+    }
+    SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+                                                     Node->getOperand(0));
+    DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+    DEBUG(Extract->dumpr(CurDAG));
+    DEBUG(dbgs() << "\n");
+    ReplaceNode(Node, Extract.getNode());
+    return;
+  }
+  case ISD::Constant: {
+    // Materialize zero constants as copies from WZR/XZR.  This allows
+    // the coalescer to propagate these into other instructions.
+    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+    if (ConstNode->isNullValue()) {
+      if (VT == MVT::i32) {
+        SDValue New = CurDAG->getCopyFromReg(
+            CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
+        ReplaceNode(Node, New.getNode());
+        return;
+      } else if (VT == MVT::i64) {
+        SDValue New = CurDAG->getCopyFromReg(
+            CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
+        ReplaceNode(Node, New.getNode());
+        return;
+      }
+    }
+    break;
+  }
+
+  case ISD::FrameIndex: {
+    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+    const TargetLowering *TLI = getTargetLowering();
+    SDValue TFI = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    SDLoc DL(Node);
+    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
+                      CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
+    CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+    return;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_ldaxp:
+    case Intrinsic::aarch64_ldxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
+      SDValue MemAddr = Node->getOperand(2);
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+
+      SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
+                                          MVT::Other, MemAddr, Chain);
+
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+      ReplaceNode(Node, Ld);
+      return;
+    }
+    case Intrinsic::aarch64_stlxp:
+    case Intrinsic::aarch64_stxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+      SDValue ValLo = Node->getOperand(2);
+      SDValue ValHi = Node->getOperand(3);
+      SDValue MemAddr = Node->getOperand(4);
+
+      // Place arguments in the right order.
+      SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
+
+      SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      ReplaceNode(Node, St);
+      return;
+    }
+    case Intrinsic::aarch64_neon_ld1x2:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld1x3:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld1x4:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld2:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld3:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld4:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld2r:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld3r:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld4r:
+      if (VT == MVT::v8i8) {
+        SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld2lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectLoadLane(Node, 2, AArch64::LD2i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectLoadLane(Node, 2, AArch64::LD2i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectLoadLane(Node, 2, AArch64::LD2i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectLoadLane(Node, 2, AArch64::LD2i64);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld3lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectLoadLane(Node, 3, AArch64::LD3i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectLoadLane(Node, 3, AArch64::LD3i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectLoadLane(Node, 3, AArch64::LD3i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectLoadLane(Node, 3, AArch64::LD3i64);
+        return;
+      }
+      break;
+    case Intrinsic::aarch64_neon_ld4lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectLoadLane(Node, 4, AArch64::LD4i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectLoadLane(Node, 4, AArch64::LD4i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectLoadLane(Node, 4, AArch64::LD4i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectLoadLane(Node, 4, AArch64::LD4i64);
+        return;
+      }
+      break;
+    }
+  } break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_tbl2:
+      SelectTable(Node, 2,
+                  VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
+                  false);
+      return;
+    case Intrinsic::aarch64_neon_tbl3:
+      SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+                                           : AArch64::TBLv16i8Three,
+                  false);
+      return;
+    case Intrinsic::aarch64_neon_tbl4:
+      SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+                                           : AArch64::TBLv16i8Four,
+                  false);
+      return;
+    case Intrinsic::aarch64_neon_tbx2:
+      SelectTable(Node, 2,
+                  VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
+                  true);
+      return;
+    case Intrinsic::aarch64_neon_tbx3:
+      SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+                                           : AArch64::TBXv16i8Three,
+                  true);
+      return;
+    case Intrinsic::aarch64_neon_tbx4:
+      SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+                                           : AArch64::TBXv16i8Four,
+                  true);
+      return;
+    case Intrinsic::aarch64_neon_smull:
+    case Intrinsic::aarch64_neon_umull:
+      if (tryMULLV64LaneV128(IntNo, Node))
+        return;
+      break;
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    if (Node->getNumOperands() >= 3)
+      VT = Node->getOperand(2)->getValueType(0);
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_st1x2: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 2, AArch64::ST1Twov8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 2, AArch64::ST1Twov16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 2, AArch64::ST1Twov4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 2, AArch64::ST1Twov8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 2, AArch64::ST1Twov2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 2, AArch64::ST1Twov4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 2, AArch64::ST1Twov2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 2, AArch64::ST1Twov1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st1x3: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 3, AArch64::ST1Threev8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 3, AArch64::ST1Threev16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 3, AArch64::ST1Threev4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 3, AArch64::ST1Threev8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 3, AArch64::ST1Threev2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 3, AArch64::ST1Threev4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 3, AArch64::ST1Threev2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 3, AArch64::ST1Threev1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st1x4: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 4, AArch64::ST1Fourv8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 4, AArch64::ST1Fourv16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 4, AArch64::ST1Fourv4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 4, AArch64::ST1Fourv8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 4, AArch64::ST1Fourv2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 4, AArch64::ST1Fourv4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 4, AArch64::ST1Fourv2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 4, AArch64::ST1Fourv1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st2: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 2, AArch64::ST2Twov8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 2, AArch64::ST2Twov16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 2, AArch64::ST2Twov4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 2, AArch64::ST2Twov8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 2, AArch64::ST2Twov2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 2, AArch64::ST2Twov4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 2, AArch64::ST2Twov2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 2, AArch64::ST1Twov1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st3: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 3, AArch64::ST3Threev8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 3, AArch64::ST3Threev16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 3, AArch64::ST3Threev4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 3, AArch64::ST3Threev8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 3, AArch64::ST3Threev2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 3, AArch64::ST3Threev4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 3, AArch64::ST3Threev2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 3, AArch64::ST1Threev1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st4: {
+      if (VT == MVT::v8i8) {
+        SelectStore(Node, 4, AArch64::ST4Fourv8b);
+        return;
+      } else if (VT == MVT::v16i8) {
+        SelectStore(Node, 4, AArch64::ST4Fourv16b);
+        return;
+      } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+        SelectStore(Node, 4, AArch64::ST4Fourv4h);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+        SelectStore(Node, 4, AArch64::ST4Fourv8h);
+        return;
+      } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+        SelectStore(Node, 4, AArch64::ST4Fourv2s);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+        SelectStore(Node, 4, AArch64::ST4Fourv4s);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+        SelectStore(Node, 4, AArch64::ST4Fourv2d);
+        return;
+      } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+        SelectStore(Node, 4, AArch64::ST1Fourv1d);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st2lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectStoreLane(Node, 2, AArch64::ST2i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectStoreLane(Node, 2, AArch64::ST2i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectStoreLane(Node, 2, AArch64::ST2i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectStoreLane(Node, 2, AArch64::ST2i64);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st3lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectStoreLane(Node, 3, AArch64::ST3i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectStoreLane(Node, 3, AArch64::ST3i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectStoreLane(Node, 3, AArch64::ST3i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectStoreLane(Node, 3, AArch64::ST3i64);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_neon_st4lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+        SelectStoreLane(Node, 4, AArch64::ST4i8);
+        return;
+      } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+                 VT == MVT::v8f16) {
+        SelectStoreLane(Node, 4, AArch64::ST4i16);
+        return;
+      } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+                 VT == MVT::v2f32) {
+        SelectStoreLane(Node, 4, AArch64::ST4i32);
+        return;
+      } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+                 VT == MVT::v1f64) {
+        SelectStoreLane(Node, 4, AArch64::ST4i64);
+        return;
+      }
+      break;
+    }
+    }
+    break;
+  }
+  case AArch64ISD::LD2post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD3post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD4post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD1x2post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD1x3post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD1x4post: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD1DUPpost: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD2DUPpost: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD3DUPpost: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD4DUPpost: {
+    if (VT == MVT::v8i8) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD1LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD2LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD3LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::LD4LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST1x2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST1x3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST1x4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+      return;
+    } else if (VT == MVT::v16i8) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+      return;
+    } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+      return;
+    } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+      return;
+    } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+      SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST2LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST3LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::ST4LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+      return;
+    } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+      return;
+    } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+      return;
+    } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64) {
+      SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
+      return;
+    }
+    break;
+  }
+  }
+
+  // Select the default instruction
+  SelectCode(Node);
+}
+
+/// createAArch64ISelDag - This pass converts a legalized DAG into a
+/// AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new AArch64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
new file mode 100644
index 000000000000..4c98253878e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -0,0 +1,10666 @@
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+                           cl::desc("Allow AArch64 SLI/SRI formation"),
+                           cl::init(false));
+
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+    "aarch64-elf-ldtls-generation", cl::Hidden,
+    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+    cl::init(false));
+
+/// Value type used for condition codes.
+static const MVT MVT_CC = MVT::i32;
+
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
+                                             const AArch64Subtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+
+  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+  // we have to make something up. Arbitrarily, choose ZeroOrOne.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // When comparing vectors the result sets the different elements in the
+  // vector to all-one or all-zero.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
+
+  if (Subtarget->hasFPARMv8()) {
+    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+  }
+
+  if (Subtarget->hasNEON()) {
+    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+    // Someone set us up the NEON.
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+    addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
+  }
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(Subtarget->getRegisterInfo());
+
+  // Provide all sorts of operation actions
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::f64, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
+
+  // Custom lowering hooks are needed for XOR
+  // to fold it into CSINC/CSINV.
+  setOperationAction(ISD::XOR, MVT::i32, Custom);
+  setOperationAction(ISD::XOR, MVT::i64, Custom);
+
+  // Virtually no operation on f128 is legal, but LLVM can't expand them when
+  // there's a valid register class, so we need custom operations in most cases.
+  setOperationAction(ISD::FABS, MVT::f128, Expand);
+  setOperationAction(ISD::FADD, MVT::f128, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FDIV, MVT::f128, Custom);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+  setOperationAction(ISD::FMUL, MVT::f128, Custom);
+  setOperationAction(ISD::FNEG, MVT::f128, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
+  setOperationAction(ISD::FRINT, MVT::f128, Expand);
+  setOperationAction(ISD::FSIN, MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+  setOperationAction(ISD::FSUB, MVT::f128, Custom);
+  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
+  // Lowering for many of the conversions is actually specified by the non-f128
+  // type. The LowerXXX function will be trivial when f128 isn't involved.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+
+  // Variable arguments.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+  // Variable-sized objects.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // Constant pool entries
+  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
+
+  // BlockAddress
+  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+
+  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
+  setOperationAction(ISD::ADDC, MVT::i32, Custom);
+  setOperationAction(ISD::ADDE, MVT::i32, Custom);
+  setOperationAction(ISD::SUBC, MVT::i32, Custom);
+  setOperationAction(ISD::SUBE, MVT::i32, Custom);
+  setOperationAction(ISD::ADDC, MVT::i64, Custom);
+  setOperationAction(ISD::ADDE, MVT::i64, Custom);
+  setOperationAction(ISD::SUBC, MVT::i64, Custom);
+  setOperationAction(ISD::SUBE, MVT::i64, Custom);
+
+  // AArch64 lacks both left-rotate and popcount instructions.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+  }
+
+  // AArch64 doesn't have {U|S}MUL_LOHI.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+  }
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Custom lower Add/Sub/Mul with overflow.
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SADDO, MVT::i64, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i64, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i64, Custom);
+  setOperationAction(ISD::SMULO, MVT::i32, Custom);
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+  setOperationAction(ISD::UMULO, MVT::i32, Custom);
+  setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+  // f16 is a storage-only type, always promote it to f32.
+  setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
+  setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
+  setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
+  setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
+  setOperationAction(ISD::FADD,        MVT::f16,  Promote);
+  setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
+  setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
+  setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
+  setOperationAction(ISD::FREM,        MVT::f16,  Promote);
+  setOperationAction(ISD::FMA,         MVT::f16,  Promote);
+  setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
+  setOperationAction(ISD::FABS,        MVT::f16,  Promote);
+  setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
+  setOperationAction(ISD::FCOPYSIGN,   MVT::f16,  Promote);
+  setOperationAction(ISD::FCOS,        MVT::f16,  Promote);
+  setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
+  setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
+  setOperationAction(ISD::FPOW,        MVT::f16,  Promote);
+  setOperationAction(ISD::FPOWI,       MVT::f16,  Promote);
+  setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
+  setOperationAction(ISD::FSIN,        MVT::f16,  Promote);
+  setOperationAction(ISD::FSINCOS,     MVT::f16,  Promote);
+  setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
+  setOperationAction(ISD::FEXP,        MVT::f16,  Promote);
+  setOperationAction(ISD::FEXP2,       MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG,        MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG2,       MVT::f16,  Promote);
+  setOperationAction(ISD::FLOG10,      MVT::f16,  Promote);
+  setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
+  setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
+  setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
+  setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
+  setOperationAction(ISD::FMINNAN,     MVT::f16,  Promote);
+  setOperationAction(ISD::FMAXNAN,     MVT::f16,  Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
+  // AArch64 has implementations of a lot of rounding-like FP operations.
+  for (MVT Ty : {MVT::f32, MVT::f64}) {
+    setOperationAction(ISD::FFLOOR, Ty, Legal);
+    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+    setOperationAction(ISD::FCEIL, Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FTRUNC, Ty, Legal);
+    setOperationAction(ISD::FROUND, Ty, Legal);
+    setOperationAction(ISD::FMINNUM, Ty, Legal);
+    setOperationAction(ISD::FMAXNUM, Ty, Legal);
+    setOperationAction(ISD::FMINNAN, Ty, Legal);
+    setOperationAction(ISD::FMAXNAN, Ty, Legal);
+  }
+
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+
+  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
+  // This requires the Performance Monitors extension.
+  if (Subtarget->hasPerfMon())
+    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+
+  if (Subtarget->isTargetMachO()) {
+    // For iOS, we don't want to the normal expansion of a libcall to
+    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+    // traffic.
+    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+  } else {
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  }
+
+  // Make floating-point constants legal for the large code model, so they don't
+  // become loads from the constant pool.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  }
+
+  // AArch64 does not have floating-point extending loads, i1 sign-extending
+  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+  }
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+  // Indexed loads and stores are supported.
+  for (unsigned im = (unsigned)ISD::PRE_INC;
+       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+    setIndexedLoadAction(im, MVT::i8, Legal);
+    setIndexedLoadAction(im, MVT::i16, Legal);
+    setIndexedLoadAction(im, MVT::i32, Legal);
+    setIndexedLoadAction(im, MVT::i64, Legal);
+    setIndexedLoadAction(im, MVT::f64, Legal);
+    setIndexedLoadAction(im, MVT::f32, Legal);
+    setIndexedLoadAction(im, MVT::f16, Legal);
+    setIndexedStoreAction(im, MVT::i8, Legal);
+    setIndexedStoreAction(im, MVT::i16, Legal);
+    setIndexedStoreAction(im, MVT::i32, Legal);
+    setIndexedStoreAction(im, MVT::i64, Legal);
+    setIndexedStoreAction(im, MVT::f64, Legal);
+    setIndexedStoreAction(im, MVT::f32, Legal);
+    setIndexedStoreAction(im, MVT::f16, Legal);
+  }
+
+  // Trap.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // We combine OR nodes for bitfield operations.
+  setTargetDAGCombine(ISD::OR);
+
+  // Vector add and sub nodes may conceal a high-half opportunity.
+  // Also, try to fold ADD into CSINC/CSINV..
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::FP_TO_UINT);
+  setTargetDAGCombine(ISD::FDIV);
+
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::BITCAST);
+  setTargetDAGCombine(ISD::CONCAT_VECTORS);
+  setTargetDAGCombine(ISD::STORE);
+  if (Subtarget->supportsAddressTopByteIgnored())
+    setTargetDAGCombine(ISD::LOAD);
+
+  setTargetDAGCombine(ISD::MUL);
+
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::VSELECT);
+
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+
+  setStackPointerRegisterToSaveRestore(AArch64::SP);
+
+  setSchedulingPreference(Sched::Hybrid);
+
+  // Enable TBZ/TBNZ
+  MaskAndBranchFoldingIsLegal = true;
+  EnableExtLdPromotion = true;
+
+  // Set required alignment.
+  setMinFunctionAlignment(2);
+  // Set preferred alignments.
+  setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
+  setPrefLoopAlignment(STI.getPrefLoopAlignment());
+
+  // Only change the limit for entries in a jump table if specified by
+  // the subtarget, but not at the command line.
+  unsigned MaxJT = STI.getMaximumJumpTableSize();
+  if (MaxJT && getMaximumJumpTableSize() == 0)
+    setMaximumJumpTableSize(MaxJT);
+
+  setHasExtractBitsInsn(true);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  if (Subtarget->hasNEON()) {
+    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+    // silliness like this:
+    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+
+    // AArch64 doesn't have a direct vector ->f32 conversion instructions for
+    // elements smaller than i32, so promote the input to i32 first.
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+    // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
+    // -> v8f16 conversions.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
+    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+    // Or, direct i32 -> f16 vector conversion.  Set it so custom, so the
+    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
+    setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
+
+    setOperationAction(ISD::CTTZ,       MVT::v2i8,  Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v4i16, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v2i32, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v1i64, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v16i8, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v8i16, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v4i32, Expand);
+    setOperationAction(ISD::CTTZ,       MVT::v2i64, Expand);
+
+    // AArch64 doesn't have MUL.2d:
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
+    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+    // Likewise, narrowing and extending vector loads/stores aren't handled
+    // directly.
+    for (MVT VT : MVT::vector_valuetypes()) {
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+      setOperationAction(ISD::BSWAP, VT, Expand);
+
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
+
+    // AArch64 has implementations of a lot of rounding-like FP operations.
+    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
+      setOperationAction(ISD::FFLOOR, Ty, Legal);
+      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+      setOperationAction(ISD::FCEIL, Ty, Legal);
+      setOperationAction(ISD::FRINT, Ty, Legal);
+      setOperationAction(ISD::FTRUNC, Ty, Legal);
+      setOperationAction(ISD::FROUND, Ty, Legal);
+    }
+  }
+
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
+}
+
+void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
+  }
+
+  // Mark vector float intrinsics as expand.
+  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FPOWI, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::FLOG, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FLOG10, VT, Expand);
+    setOperationAction(ISD::FEXP, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);
+
+    // But we do support custom-lowering for FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+  }
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+  setOperationAction(ISD::SRA, VT, Custom);
+  setOperationAction(ISD::SRL, VT, Custom);
+  setOperationAction(ISD::SHL, VT, Custom);
+  setOperationAction(ISD::AND, VT, Custom);
+  setOperationAction(ISD::OR, VT, Custom);
+  setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
+
+  setOperationAction(ISD::SELECT, VT, Expand);
+  setOperationAction(ISD::SELECT_CC, VT, Expand);
+  setOperationAction(ISD::VSELECT, VT, Expand);
+  for (MVT InnerVT : MVT::all_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+  // CNT supports only B element sizes.
+  if (VT != MVT::v8i8 && VT != MVT::v16i8)
+    setOperationAction(ISD::CTPOP, VT, Expand);
+
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
+
+  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+  setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+
+  // [SU][MIN|MAX] are available for all NEON types apart from i64.
+  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
+    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+      setOperationAction(Opcode, VT, Legal);
+
+  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
+  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
+    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
+                            ISD::FMINNUM, ISD::FMAXNUM})
+      setOperationAction(Opcode, VT, Legal);
+
+  if (Subtarget->isLittleEndian()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im, VT, Legal);
+      setIndexedStoreAction(im, VT, Legal);
+    }
+  }
+}
+
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR64RegClass);
+  addTypeForNEON(VT, MVT::v2i32);
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR128RegClass);
+  addTypeForNEON(VT, MVT::v4i32);
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                              EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larget doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+                                                  EVT) const {
+  return MVT::i64;
+}
+
+bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned AddrSpace,
+                                                           unsigned Align,
+                                                           bool *Fast) const {
+  if (Subtarget->requiresStrictAlign())
+    return false;
+
+  if (Fast) {
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
+            // See comments in performSTORECombine() for more details about
+            // these conditions.
+
+            // Code that uses clang vector extensions can mark that it
+            // wants unaligned accesses to be treated as fast by
+            // underspecifying alignment to be 1 or 2.
+            Align <= 2 ||
+
+            // Disregard v2i64. Memcpy lowering produces those and splitting
+            // them regresses performance on micro-benchmarks and olden/bh.
+            VT == MVT::v2i64;
+  }
+  return true;
+}
+
+FastISel *
+AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                      const TargetLibraryInfo *libInfo) const {
+  return AArch64::createFastISel(funcInfo, libInfo);
+}
+
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((AArch64ISD::NodeType)Opcode) {
+  case AArch64ISD::FIRST_NUMBER:      break;
+  case AArch64ISD::CALL:              return "AArch64ISD::CALL";
+  case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
+  case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
+  case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
+  case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
+  case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
+  case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
+  case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
+  case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
+  case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
+  case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
+  case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
+  case AArch64ISD::TLSDESC_CALLSEQ:   return "AArch64ISD::TLSDESC_CALLSEQ";
+  case AArch64ISD::ADC:               return "AArch64ISD::ADC";
+  case AArch64ISD::SBC:               return "AArch64ISD::SBC";
+  case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
+  case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
+  case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
+  case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
+  case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
+  case AArch64ISD::CCMP:              return "AArch64ISD::CCMP";
+  case AArch64ISD::CCMN:              return "AArch64ISD::CCMN";
+  case AArch64ISD::FCCMP:             return "AArch64ISD::FCCMP";
+  case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
+  case AArch64ISD::DUP:               return "AArch64ISD::DUP";
+  case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
+  case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
+  case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
+  case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
+  case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
+  case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
+  case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
+  case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
+  case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
+  case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
+  case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
+  case AArch64ISD::BICi:              return "AArch64ISD::BICi";
+  case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
+  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
+  case AArch64ISD::NEG:               return "AArch64ISD::NEG";
+  case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
+  case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
+  case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
+  case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
+  case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
+  case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
+  case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
+  case AArch64ISD::REV16:             return "AArch64ISD::REV16";
+  case AArch64ISD::REV32:             return "AArch64ISD::REV32";
+  case AArch64ISD::REV64:             return "AArch64ISD::REV64";
+  case AArch64ISD::EXT:               return "AArch64ISD::EXT";
+  case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
+  case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
+  case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
+  case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
+  case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
+  case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
+  case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
+  case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
+  case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
+  case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
+  case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
+  case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
+  case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
+  case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
+  case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
+  case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
+  case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
+  case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
+  case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
+  case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
+  case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
+  case AArch64ISD::SADDV:             return "AArch64ISD::SADDV";
+  case AArch64ISD::UADDV:             return "AArch64ISD::UADDV";
+  case AArch64ISD::SMINV:             return "AArch64ISD::SMINV";
+  case AArch64ISD::UMINV:             return "AArch64ISD::UMINV";
+  case AArch64ISD::SMAXV:             return "AArch64ISD::SMAXV";
+  case AArch64ISD::UMAXV:             return "AArch64ISD::UMAXV";
+  case AArch64ISD::NOT:               return "AArch64ISD::NOT";
+  case AArch64ISD::BIT:               return "AArch64ISD::BIT";
+  case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
+  case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
+  case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
+  case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
+  case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
+  case AArch64ISD::PREFETCH:          return "AArch64ISD::PREFETCH";
+  case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
+  case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
+  case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
+  case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
+  case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
+  case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
+  case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
+  case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
+  case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
+  case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
+  case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
+  case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
+  case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
+  case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
+  case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
+  case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
+  case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
+  case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
+  case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
+  case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
+  case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
+  case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
+  case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
+  case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
+  case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
+  case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
+  case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
+  case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
+  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
+  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
+  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
+  case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
+  case AArch64ISD::FRECPS:            return "AArch64ISD::FRECPS";
+  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRSQRTS:           return "AArch64ISD::FRSQRTS";
+  }
+  return nullptr;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
+                                    MachineBasicBlock *MBB) const {
+  // We materialise the F128CSEL pseudo-instruction as some control flow and a
+  // phi node:
+
+  // OrigBB:
+  //     [... previous instrs leading to comparison ...]
+  //     b.ne TrueBB
+  //     b EndBB
+  // TrueBB:
+  //     ; Fallthrough
+  // EndBB:
+  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction::iterator It = ++MBB->getIterator();
+
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned IfTrueReg = MI.getOperand(1).getReg();
+  unsigned IfFalseReg = MI.getOperand(2).getReg();
+  unsigned CondCode = MI.getOperand(3).getImm();
+  bool NZCVKilled = MI.getOperand(4).isKill();
+
+  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, TrueBB);
+  MF->insert(It, EndBB);
+
+  // Transfer rest of current basic-block to EndBB
+  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
+                MBB->end());
+  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
+  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
+  MBB->addSuccessor(TrueBB);
+  MBB->addSuccessor(EndBB);
+
+  // TrueBB falls through to the end.
+  TrueBB->addSuccessor(EndBB);
+
+  if (!NZCVKilled) {
+    TrueBB->addLiveIn(AArch64::NZCV);
+    EndBB->addLiveIn(AArch64::NZCV);
+  }
+
+  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
+      .addReg(IfTrueReg)
+      .addMBB(TrueBB)
+      .addReg(IfFalseReg)
+      .addMBB(MBB);
+
+  MI.eraseFromParent();
+  return EndBB;
+}
+
+MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  default:
+#ifndef NDEBUG
+    MI.dump();
+#endif
+    llvm_unreachable("Unexpected instruction for custom inserter!");
+
+  case AArch64::F128CSEL:
+    return EmitF128CSEL(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
+/// CC
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:
+    return AArch64CC::NE;
+  case ISD::SETEQ:
+    return AArch64CC::EQ;
+  case ISD::SETGT:
+    return AArch64CC::GT;
+  case ISD::SETGE:
+    return AArch64CC::GE;
+  case ISD::SETLT:
+    return AArch64CC::LT;
+  case ISD::SETLE:
+    return AArch64CC::LE;
+  case ISD::SETUGT:
+    return AArch64CC::HI;
+  case ISD::SETUGE:
+    return AArch64CC::HS;
+  case ISD::SETULT:
+    return AArch64CC::LO;
+  case ISD::SETULE:
+    return AArch64CC::LS;
+  }
+}
+
+/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
+static void changeFPCCToAArch64CC(ISD::CondCode CC,
+                                  AArch64CC::CondCode &CondCode,
+                                  AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    CondCode = AArch64CC::EQ;
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    CondCode = AArch64CC::GT;
+    break;
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    CondCode = AArch64CC::GE;
+    break;
+  case ISD::SETOLT:
+    CondCode = AArch64CC::MI;
+    break;
+  case ISD::SETOLE:
+    CondCode = AArch64CC::LS;
+    break;
+  case ISD::SETONE:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case ISD::SETO:
+    CondCode = AArch64CC::VC;
+    break;
+  case ISD::SETUO:
+    CondCode = AArch64CC::VS;
+    break;
+  case ISD::SETUEQ:
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case ISD::SETUGT:
+    CondCode = AArch64CC::HI;
+    break;
+  case ISD::SETUGE:
+    CondCode = AArch64CC::PL;
+    break;
+  case ISD::SETLT:
+  case ISD::SETULT:
+    CondCode = AArch64CC::LT;
+    break;
+  case ISD::SETLE:
+  case ISD::SETULE:
+    CondCode = AArch64CC::LE;
+    break;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    CondCode = AArch64CC::NE;
+    break;
+  }
+}
+
+/// Convert a DAG fp condition code to an AArch64 CC.
+/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
+/// should be AND'ed instead of OR'ed.
+static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
+                                     AArch64CC::CondCode &CondCode,
+                                     AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+    assert(CondCode2 == AArch64CC::AL);
+    break;
+  case ISD::SETONE:
+    // (a one b)
+    // == ((a olt b) || (a ogt b))
+    // == ((a ord b) && (a une b))
+    CondCode = AArch64CC::VC;
+    CondCode2 = AArch64CC::NE;
+    break;
+  case ISD::SETUEQ:
+    // (a ueq b)
+    // == ((a uno b) || (a oeq b))
+    // == ((a ule b) && (a uge b))
+    CondCode = AArch64CC::PL;
+    CondCode2 = AArch64CC::LE;
+    break;
+  }
+}
+
+/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
+/// CC usable with the vector instructions. Fewer operations are available
+/// without a real NZCV register, so we have to use less efficient combinations
+/// to get the same effect.
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
+                                        AArch64CC::CondCode &CondCode,
+                                        AArch64CC::CondCode &CondCode2,
+                                        bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  default:
+    // Mostly the scalar mappings work fine.
+    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+    break;
+  case ISD::SETUO:
+    Invert = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SETO:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GE;
+    break;
+  case ISD::SETUEQ:
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // All of the compare-mask comparisons are ordered, but we can switch
+    // between the two by a double inversion. E.g. ULE == !OGT.
+    Invert = true;
+    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+    break;
+  }
+}
+
+static bool isLegalArithImmed(uint64_t C) {
+  // Matches AArch64DAGToDAGISel::SelectArithImmed().
+  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
+
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                              const SDLoc &dl, SelectionDAG &DAG) {
+  EVT VT = LHS.getValueType();
+
+  if (VT.isFloatingPoint()) {
+    assert(VT != MVT::f128);
+    if (VT == MVT::f16) {
+      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+      VT = MVT::f32;
+    }
+    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+  }
+
+  // The CMP instruction is just an alias for SUBS, and representing it as
+  // SUBS means that it's possible to get CSE with subtract operations.
+  // A later phase can perform the optimization of setting the destination
+  // register to WZR/XZR if it ends up being unused.
+  unsigned Opcode = AArch64ISD::SUBS;
+
+  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
+    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+    // can be set differently by this operation. It comes down to whether
+    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+    // everything is fine. If not then the optimization is wrong. Thus general
+    // comparisons are only valid if op2 != 0.
+
+    // So, finally, the only LLVM-native comparisons that don't mention C and V
+    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+    // the absence of information about op2.
+    Opcode = AArch64ISD::ADDS;
+    RHS = RHS.getOperand(1);
+  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
+             !isUnsignedIntSetCC(CC)) {
+    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+    // of the signed comparisons.
+    Opcode = AArch64ISD::ANDS;
+    RHS = LHS.getOperand(1);
+    LHS = LHS.getOperand(0);
+  }
+
+  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
+      .getValue(1);
+}
+
+/// \defgroup AArch64CCMP CMP;CCMP matching
+///
+/// These functions deal with the formation of CMP;CCMP;... sequences.
+/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
+/// a comparison. They set the NZCV flags to a predefined value if their
+/// predicate is false. This allows to express arbitrary conjunctions, for
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))"
+/// expressed as:
+///   cmp A
+///   ccmp B, inv(CB), CA
+///   check for CB flags
+///
+/// In general we can create code for arbitrary "... (and (and A B) C)"
+/// sequences. We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; We can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leafs of the subtree
+/// on the other side (see also the comments in code). As complete example:
+/// "or (or (setCA (cmp A)) (setCB (cmp B)))
+///     (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
+///           (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
+/// and implemented as:
+///   cmp C
+///   ccmp D, inv(CD), CC
+///   ccmp A, CA, inv(CD)
+///   ccmp B, CB, inv(CA)
+///   check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
+/// @{
+
+/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+                                         ISD::CondCode CC, SDValue CCOp,
+                                         AArch64CC::CondCode Predicate,
+                                         AArch64CC::CondCode OutCC,
+                                         const SDLoc &DL, SelectionDAG &DAG) {
+  unsigned Opcode = 0;
+  if (LHS.getValueType().isFloatingPoint()) {
+    assert(LHS.getValueType() != MVT::f128);
+    if (LHS.getValueType() == MVT::f16) {
+      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
+      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
+    }
+    Opcode = AArch64ISD::FCCMP;
+  } else if (RHS.getOpcode() == ISD::SUB) {
+    SDValue SubOp0 = RHS.getOperand(0);
+    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+      // See emitComparison() on why we can only do this for SETEQ and SETNE.
+      Opcode = AArch64ISD::CCMN;
+      RHS = RHS.getOperand(1);
+    }
+  }
+  if (Opcode == 0)
+    Opcode = AArch64ISD::CCMP;
+
+  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
+  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// CanPushNegate is set to true if we can push a negate operation through
+/// the tree in a was that we are left with AND operations and negate operations
+/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
+                                         unsigned Depth = 0) {
+  if (!Val.hasOneUse())
+    return false;
+  unsigned Opcode = Val->getOpcode();
+  if (Opcode == ISD::SETCC) {
+    if (Val->getOperand(0).getValueType() == MVT::f128)
+      return false;
+    CanNegate = true;
+    return true;
+  }
+  // Protect against exponential runtime and stack overflow.
+  if (Depth > 6)
+    return false;
+  if (Opcode == ISD::AND || Opcode == ISD::OR) {
+    SDValue O0 = Val->getOperand(0);
+    SDValue O1 = Val->getOperand(1);
+    bool CanNegateL;
+    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
+      return false;
+    bool CanNegateR;
+    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
+      return false;
+
+    if (Opcode == ISD::OR) {
+      // For an OR expression we need to be able to negate at least one side or
+      // we cannot do the transformation at all.
+      if (!CanNegateL && !CanNegateR)
+        return false;
+      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
+      // can negate the x and y subtrees.
+      CanNegate = CanNegateL && CanNegateR;
+    } else {
+      // If the operands are OR expressions then we finally need to negate their
+      // outputs, we can only do that for the operand with emitted last by
+      // negating OutCC, not for both operands.
+      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
+      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
+      if (NeedsNegOutL && NeedsNegOutR)
+        return false;
+      // We cannot negate an AND operation (it would become an OR),
+      CanNegate = false;
+    }
+    return true;
+  }
+  return false;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1 producing node @p Val to a series compare
+/// and conditional compare operations. @returns an NZCV flags producing node
+/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
+/// transformation was not possible.
+/// On recursive invocations @p PushNegate may be set to true to have negation
+/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p Depth limits the search
+/// depth to avoid stack overflow.
+static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
+    AArch64CC::CondCode Predicate) {
+  // We're at a tree leaf, produce a conditional comparison operation.
+  unsigned Opcode = Val->getOpcode();
+  if (Opcode == ISD::SETCC) {
+    SDValue LHS = Val->getOperand(0);
+    SDValue RHS = Val->getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+    bool isInteger = LHS.getValueType().isInteger();
+    if (Negate)
+      CC = getSetCCInverse(CC, isInteger);
+    SDLoc DL(Val);
+    // Determine OutCC and handle FP special case.
+    if (isInteger) {
+      OutCC = changeIntCCToAArch64CC(CC);
+    } else {
+      assert(LHS.getValueType().isFloatingPoint());
+      AArch64CC::CondCode ExtraCC;
+      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
+      // Some floating point conditions can't be tested with a single condition
+      // code. Construct an additional comparison in this case.
+      if (ExtraCC != AArch64CC::AL) {
+        SDValue ExtraCmp;
+        if (!CCOp.getNode())
+          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+        else
+          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
+                                               ExtraCC, DL, DAG);
+        CCOp = ExtraCmp;
+        Predicate = ExtraCC;
+      }
+    }
+
+    // Produce a normal comparison if we are first in the chain
+    if (!CCOp)
+      return emitComparison(LHS, RHS, CC, DL, DAG);
+    // Otherwise produce a ccmp.
+    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
+                                     DAG);
+  }
+  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
+         "Valid conjunction/disjunction tree");
+
+  // Check if both sides can be transformed.
+  SDValue LHS = Val->getOperand(0);
+  SDValue RHS = Val->getOperand(1);
+
+  // In case of an OR we need to negate our operands and the result.
+  // (A v B) <=> not(not(A) ^ not(B))
+  bool NegateOpsAndResult = Opcode == ISD::OR;
+  // We can negate the results of all previous operations by inverting the
+  // predicate flags giving us a free negation for one side. The other side
+  // must be negatable by itself.
+  if (NegateOpsAndResult) {
+    // See which side we can negate.
+    bool CanNegateL;
+    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+    assert(isValidL && "Valid conjunction/disjunction tree");
+    (void)isValidL;
+
+#ifndef NDEBUG
+    bool CanNegateR;
+    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+    assert(isValidR && "Valid conjunction/disjunction tree");
+    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
+#endif
+
+    // Order the side which we cannot negate to RHS so we can emit it first.
+    if (!CanNegateL)
+      std::swap(LHS, RHS);
+  } else {
+    bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
+    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
+           "Valid conjunction/disjunction tree");
+    // Order the side where we need to negate the output flags to RHS so it
+    // gets emitted first.
+    if (NeedsNegOutL)
+      std::swap(LHS, RHS);
+  }
+
+  // Emit RHS. If we want to negate the tree we only need to push a negate
+  // through if we are already in a PushNegate case, otherwise we can negate
+  // the "flags to test" afterwards.
+  AArch64CC::CondCode RHSCC;
+  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+                                                   CCOp, Predicate);
+  if (NegateOpsAndResult && !Negate)
+    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+  // Emit LHS. We may need to negate it.
+  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+                                                   NegateOpsAndResult, CmpR,
+                                                   RHSCC);
+  // If we transformed an OR to and AND then we have to negate the result
+  // (or absorb the Negate parameter).
+  if (NegateOpsAndResult && !Negate)
+    OutCC = AArch64CC::getInvertedCondCode(OutCC);
+  return CmpL;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
+/// \see emitConjunctionDisjunctionTreeRec().
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+                                              AArch64CC::CondCode &OutCC) {
+  bool CanNegate;
+  if (!isConjunctionDisjunctionTree(Val, CanNegate))
+    return SDValue();
+
+  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
+                                           AArch64CC::AL);
+}
+
+/// @}
+
+static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &AArch64cc, SelectionDAG &DAG,
+                             const SDLoc &dl) {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    EVT VT = RHS.getValueType();
+    uint64_t C = RHSC->getZExtValue();
+    if (!isLegalArithImmed(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if ((VT == MVT::i32 && C != 0x80000000 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0x80000000ULL &&
+             isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, dl, VT);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if ((VT == MVT::i32 && C != 0 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, dl, VT);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if ((VT == MVT::i32 && C != INT32_MAX &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != INT64_MAX &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, dl, VT);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if ((VT == MVT::i32 && C != UINT32_MAX &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != UINT64_MAX &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, dl, VT);
+        }
+        break;
+      }
+    }
+  }
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+    // For the i8 operand, the largest immediate is 255, so this can be easily
+    // encoded in the compare instruction. For the i16 operand, however, the
+    // largest immediate cannot be encoded in the compare.
+    // Therefore, use a sign extending load and cmn to avoid materializing the
+    // -1 constant. For example,
+    // movz w1, #65535
+    // ldrh w0, [x0, #0]
+    // cmp w0, w1
+    // >
+    // ldrsh w0, [x0, #0]
+    // cmn w0, #1
+    // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
+    // if and only if (sext LHS) == (sext RHS). The checks are in place to
+    // ensure both the LHS and RHS are truly zero extended and to make sure the
+    // transformation is profitable.
+    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+        LHS.getNode()->hasNUsesOfValue(1, 0)) {
+      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+        SDValue SExt =
+            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                        DAG.getValueType(MVT::i16));
+        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+                                                   RHS.getValueType()),
+                             CC, dl, DAG);
+        AArch64CC = changeIntCCToAArch64CC(CC);
+      }
+    }
+
+    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
+      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
+          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
+      }
+    }
+  }
+
+  if (!Cmp) {
+    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+    AArch64CC = changeIntCCToAArch64CC(CC);
+  }
+  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
+  return Cmp;
+}
+
+static std::pair<SDValue, SDValue>
+getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+         "Unsupported value type");
+  SDValue Value, Overflow;
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  unsigned Opc = 0;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::UADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::HS;
+    break;
+  case ISD::SSUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::USUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::LO;
+    break;
+  // Multiply needs a little bit extra work.
+  case ISD::SMULO:
+  case ISD::UMULO: {
+    CC = AArch64CC::NE;
+    bool IsSigned = Op.getOpcode() == ISD::SMULO;
+    if (Op.getValueType() == MVT::i32) {
+      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      // For a 32 bit multiply with overflow check we want the instruction
+      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+      // need to generate the following pattern:
+      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
+      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+                                DAG.getConstant(0, DL, MVT::i64));
+      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
+      // operation. We need to clear out the upper 32 bits, because we used a
+      // widening multiply that wrote all 64 bits. In the end this should be a
+      // noop.
+      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+      if (IsSigned) {
+        // The signed overflow check requires more than just a simple check for
+        // any bit set in the upper 32 bits of the result. These bits could be
+        // just the sign bits of a negative number. To perform the overflow
+        // check we have to arithmetic shift right the 32nd bit of the result by
+        // 31 bits. Then we compare the result to the upper 32 bits.
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+                                        DAG.getConstant(32, DL, MVT::i64));
+        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+                                        DAG.getConstant(31, DL, MVT::i64));
+        // It is important that LowerBits is last, otherwise the arithmetic
+        // shift will not be folded into the compare (SUBS).
+        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                       .getValue(1);
+      } else {
+        // The overflow check for unsigned multiply is easy. We only need to
+        // check if any of the upper 32 bits are set. This can be done with a
+        // CMP (shifted register). For that we need to generate the following
+        // pattern:
+        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+                                        DAG.getConstant(32, DL, MVT::i64));
+        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+        Overflow =
+            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+                        DAG.getConstant(0, DL, MVT::i64),
+                        UpperBits).getValue(1);
+      }
+      break;
+    }
+    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+    // For the 64 bit multiply
+    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+    if (IsSigned) {
+      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+                                      DAG.getConstant(63, DL, MVT::i64));
+      // It is important that LowerBits is last, otherwise the arithmetic
+      // shift will not be folded into the compare (SUBS).
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                     .getValue(1);
+    } else {
+      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow =
+          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+                      DAG.getConstant(0, DL, MVT::i64),
+                      UpperBits).getValue(1);
+    }
+    break;
+  }
+  } // switch (...)
+
+  if (Opc) {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+    // Emit the AArch64 operation with overflow check.
+    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+    Overflow = Value.getValue(1);
+  }
+  return std::make_pair(Value, Overflow);
+}
+
+SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                                             RTLIB::Libcall Call) const {
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
+}
+
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
+  SDValue Sel = Op.getOperand(0);
+  SDValue Other = Op.getOperand(1);
+
+  // If neither operand is a SELECT_CC, give up.
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    std::swap(Sel, Other);
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    return Op;
+
+  // The folding we want to perform is:
+  // (xor x, (select_cc a, b, cc, 0, -1) )
+  //   -->
+  // (csel x, (xor x, -1), cc ...)
+  //
+  // The latter will get matched to a CSINV instruction.
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+  SDValue LHS = Sel.getOperand(0);
+  SDValue RHS = Sel.getOperand(1);
+  SDValue TVal = Sel.getOperand(2);
+  SDValue FVal = Sel.getOperand(3);
+  SDLoc dl(Sel);
+
+  // FIXME: This could be generalized to non-integer comparisons.
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return Op;
+
+  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+  // The values aren't constants, this isn't the pattern we're looking for.
+  if (!CFVal || !CTVal)
+    return Op;
+
+  // We can commute the SELECT_CC by inverting the condition.  This
+  // might be needed to make this fit into a CSINV pattern.
+  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+    std::swap(TVal, FVal);
+    std::swap(CTVal, CFVal);
+    CC = ISD::getSetCCInverse(CC, true);
+  }
+
+  // If the constants line up, perform the transform!
+  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+    FVal = Other;
+    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+                       DAG.getConstant(-1ULL, dl, Other.getValueType()));
+
+    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+                       CCVal, Cmp);
+  }
+
+  return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Invalid code");
+  case ISD::ADDC:
+    Opc = AArch64ISD::ADDS;
+    break;
+  case ISD::SUBC:
+    Opc = AArch64ISD::SUBS;
+    break;
+  case ISD::ADDE:
+    Opc = AArch64ISD::ADCS;
+    ExtraOp = true;
+    break;
+  case ISD::SUBE:
+    Opc = AArch64ISD::SBCS;
+    ExtraOp = true;
+    break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+                     Op.getOperand(2));
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  SDLoc dl(Op);
+  AArch64CC::CondCode CC;
+  // The actual operation that sets the overflow or carry flag.
+  SDValue Value, Overflow;
+  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
+
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
+
+  // We use an inverted condition, because the conditional select is inverted
+  // too. This will allow it to be selected to a single instruction:
+  // CSINC Wd, WZR, WZR, invert(cond).
+  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
+  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
+                         CCVal, Overflow);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
+}
+
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+  bool IsStream = !Locality;
+  // When the locality number is set
+  if (Locality) {
+    // The front-end should have filtered out the out-of-range values
+    assert(Locality <= 3 && "Prefetch locality out-of-range");
+    // The locality degree is the opposite of the cache speed.
+    // Put the number the other way around.
+    // The encoding starts at 0 for level 1
+    Locality = 3 - Locality;
+  }
+
+  // built the mask value encoding the expected behavior.
+  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (!IsData << 3) |     // IsDataCache bit
+                   (Locality << 1) |    // Cache level bits
+                   (unsigned)IsStream;  // Stream bit
+  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  // FP_ROUND node has a second operand indicating whether it is known to be
+  // precise. That doesn't take part in the LibCall so we can't directly use
+  // LowerF128Call.
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT InVT = Op.getOperand(0).getValueType();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = InVT.getVectorNumElements();
+
+  // f16 vectors are promoted to f32 before a conversion.
+  if (InVT.getVectorElementType() == MVT::f16) {
+    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
+    SDLoc dl(Op);
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+  }
+
+  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+    SDLoc dl(Op);
+    SDValue Cv =
+        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+                    Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+  }
+
+  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+    SDLoc dl(Op);
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+  }
+
+  // Type changing conversions are illegal.
+  return Op;
+}
+
+SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType().isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+
+  // f16 conversions are promoted to f32.
+  if (Op.getOperand(0).getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+  }
+
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::FP_TO_SINT)
+    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  SDValue In = Op.getOperand(0);
+  EVT InVT = In.getValueType();
+
+  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
+    MVT CastVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
+                         InVT.getVectorNumElements());
+    In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
+    return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
+  }
+
+  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
+    unsigned CastOpc =
+        Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    EVT CastVT = VT.changeVectorElementTypeToInteger();
+    In = DAG.getNode(CastOpc, dl, CastVT, In);
+    return DAG.getNode(Op.getOpcode(), dl, VT, In);
+  }
+
+  return Op;
+}
+
+SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
+  // f16 conversions are promoted to f32.
+  if (Op.getValueType() == MVT::f16) {
+    SDLoc dl(Op);
+    return DAG.getNode(
+        ISD::FP_ROUND, dl, MVT::f16,
+        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+        DAG.getIntPtrConstant(0, dl));
+  }
+
+  // i128 conversions are libcalls.
+  if (Op.getOperand(0).getValueType() == MVT::i128)
+    return SDValue();
+
+  // Other conversions are legal, unless it's to the completely software-based
+  // fp128.
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::SINT_TO_FP)
+    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128Call(Op, DAG, LC);
+}
+
+SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values in two S / D registers.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName =
+      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee =
+      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
+
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() != MVT::f16)
+    return SDValue();
+
+  assert(Op.getOperand(0).getValueType() == MVT::i16);
+  SDLoc DL(Op);
+
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
+      0);
+}
+
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (const SDValue &Elt : N->op_values()) {
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getScalarSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+  unsigned EltSize = VT.getScalarSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
+  }
+  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
+SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::thread_pointer: {
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
+  }
+  case Intrinsic::aarch64_neon_smax:
+    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::aarch64_neon_umax:
+    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::aarch64_neon_smin:
+    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::aarch64_neon_umin:
+    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+}
+
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unimplemented operand");
+    return SDValue();
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::SELECT:
+    return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    return LowerXALUO(Op, DAG);
+  case ISD::FADD:
+    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB:
+    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL:
+    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV:
+    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND:
+    return LowerFP_EXTEND(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+    return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:
+    return LowerVectorSRA_SRL_SHL(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return LowerFCOPYSIGN(Op, DAG);
+  case ISD::AND:
+    return LowerVectorAND(Op, DAG);
+  case ISD::OR:
+    return LowerVectorOR(Op, DAG);
+  case ISD::XOR:
+    return LowerXOR(Op, DAG);
+  case ISD::PREFETCH:
+    return LowerPREFETCH(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FSINCOS:
+    return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "AArch64GenCallingConv.inc"
+
+/// Selects the correct CCAssignFn for a given CallingConvention value.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                     bool IsVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention.");
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::GHC:
+    return CC_AArch64_GHC;
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::PreserveMost:
+  case CallingConv::CXX_FAST_TLS:
+  case CallingConv::Swift:
+    if (!Subtarget->isTargetDarwin())
+      return CC_AArch64_AAPCS;
+    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+  }
+}
+
+CCAssignFn *
+AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
+  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                      : RetCC_AArch64_AAPCS;
+}
+
+SDValue AArch64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // At this point, Ins[].VT may already be promoted to i32. To correctly
+  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+  // LocVT.
+  unsigned NumArgs = Ins.size();
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ValVT = Ins[i].VT;
+    if (Ins[i].isOrigArg()) {
+      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
+                                  /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+    }
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+  assert(ArgLocs.size() == Ins.size());
+  SmallVector<SDValue, 16> ArgValues;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (Ins[i].Flags.isByVal()) {
+      // Byval is used for HFAs in the PCS, but the system should work in a
+      // non-compliant manner for larger structs.
+      EVT PtrVT = getPointerTy(DAG.getDataLayout());
+      int Size = Ins[i].Flags.getByValSize();
+      unsigned NumRegs = (Size + 7) / 8;
+
+      // FIXME: This works on big-endian for composite byvals, which are the common
+      // case. It should also work for fundamental types too.
+      unsigned FrameIdx =
+        MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
+      InVals.push_back(FrameIdxN);
+
+      continue;
+    }
+
+    if (VA.isRegLoc()) {
+      // Arguments stored in registers.
+      EVT RegVT = VA.getLocVT();
+
+      SDValue ArgValue;
+      const TargetRegisterClass *RC;
+
+      if (RegVT == MVT::i32)
+        RC = &AArch64::GPR32RegClass;
+      else if (RegVT == MVT::i64)
+        RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f16)
+        RC = &AArch64::FPR16RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &AArch64::FPR32RegClass;
+      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
+        RC = &AArch64::FPR64RegClass;
+      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
+        RC = &AArch64::FPR128RegClass;
+      else
+        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+      // Transform the arguments in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      // If this is an 8, 16 or 32-bit value, it is really passed promoted
+      // to 64 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::AExt:
+      case CCValAssign::SExt:
+      case CCValAssign::ZExt:
+        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
+        // nodes after our lowering.
+        assert(RegVT == Ins[i].VT && "incorrect register location selected");
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+      unsigned ArgOffset = VA.getLocMemOffset();
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+
+      uint32_t BEAlign = 0;
+      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+          !Ins[i].Flags.isInConsecutiveRegs())
+        BEAlign = 8 - ArgSize;
+
+      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      SDValue ArgValue;
+
+      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
+      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+      MVT MemVT = VA.getValVT();
+
+      switch (VA.getLocInfo()) {
+      default:
+        break;
+      case CCValAssign::BCvt:
+        MemVT = VA.getLocVT();
+        break;
+      case CCValAssign::SExt:
+        ExtType = ISD::SEXTLOAD;
+        break;
+      case CCValAssign::ZExt:
+        ExtType = ISD::ZEXTLOAD;
+        break;
+      case CCValAssign::AExt:
+        ExtType = ISD::EXTLOAD;
+        break;
+      }
+
+      ArgValue = DAG.getExtLoad(
+          ExtType, DL, VA.getLocVT(), Chain, FIN,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+          MemVT);
+
+      InVals.push_back(ArgValue);
+    }
+  }
+
+  // varargs
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  if (isVarArg) {
+    if (!Subtarget->isTargetDarwin()) {
+      // The AAPCS variadic function ABI is identical to the non-variadic
+      // one. As a result there may be more arguments in registers and we should
+      // save them for future reference.
+      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+    }
+
+    // This will point to the next argument passed via stack.
+    unsigned StackOffset = CCInfo.getNextStackOffset();
+    // We currently pass all varargs at 8-byte alignment.
+    StackOffset = ((StackOffset + 7) & ~7);
+    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+  }
+
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes in
+    // any case:
+    StackArgSize = alignTo(StackArgSize, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+    // a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
+  }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
+
+  return Chain;
+}
+
+void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &DL,
+                                                SDValue &Chain) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SmallVector<SDValue, 8> MemOps;
+
+  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
+                                          AArch64::X3, AArch64::X4, AArch64::X5,
+                                          AArch64::X6, AArch64::X7 };
+  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
+
+  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
+  int GPRIdx = 0;
+  if (GPRSaveSize != 0) {
+    GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+
+    SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
+
+    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      SDValue Store = DAG.getStore(
+          Val.getValue(1), DL, Val, FIN,
+          MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
+      MemOps.push_back(Store);
+      FIN =
+          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
+    }
+  }
+  FuncInfo->setVarArgsGPRIndex(GPRIdx);
+  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
+
+  if (Subtarget->hasFPARMv8()) {
+    static const MCPhysReg FPRArgRegs[] = {
+        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
+    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
+
+    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+    int FPRIdx = 0;
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
+
+      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
+
+      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+
+        SDValue Store = DAG.getStore(
+            Val.getValue(1), DL, Val, FIN,
+            MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
+                          DAG.getConstant(16, DL, PtrVT));
+      }
+    }
+    FuncInfo->setVarArgsFPRIndex(FPRIdx);
+    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
+  }
+
+  if (!MemOps.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue AArch64TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    SDValue ThisVal) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::PreserveMost:
+  case CallingConv::Swift:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+bool AArch64TargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86) but less efficient and uglier in LowerCall.
+  for (Function::const_arg_iterator i = CallerF->arg_begin(),
+                                    e = CallerF->arg_end();
+       i != e; ++i)
+    if (i->hasByValAttr())
+      return false;
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt)
+    return canGuaranteeTCO(CalleeCC) && CCMatch;
+
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    const Triple &TT = getTargetMachine().getTargetTriple();
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+      return false;
+  }
+
+  // Now we search for cases where we can use a tail call without changing the
+  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+  // concept.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
+
+  LLVMContext &C = *DAG.getContext();
+  if (isVarArg && !Outs.empty()) {
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards). If
+    // caller is C then we could potentially use its argument area.
+
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
+    for (const CCValAssign &ArgLoc : ArgLocs)
+      if (!ArgLoc.isRegLoc())
+        return false;
+  }
+
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+                                  CCAssignFnForCall(CalleeCC, isVarArg),
+                                  CCAssignFnForCall(CallerCC, isVarArg)))
+    return false;
+  // The callee has to preserve all registers the caller needs to preserve.
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  if (!CCMatch) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // Nothing more to check if the callee is taking no arguments
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+
+  // If the stack arguments for this call do not fit into our own save area then
+  // the call cannot be made tail.
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+    return false;
+
+  return true;
+}
+
+SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
+                                                   SelectionDAG &DAG,
+                                                   MachineFrameInfo &MFI,
+                                                   int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument corresponding
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                   bool TailCallOpt) const {
+  return CallCC == CallingConv::Fast && TailCallOpt;
+}
+
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
+SDValue
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsThisReturn = false;
+
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  bool IsSibCall = false;
+
+  if (IsTailCall) {
+    // Check if it's really possible to do a tail call.
+    IsTailCall = isEligibleForTailCallOptimization(
+        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
+    if (IsTailCall)
+      ++NumTailCalls;
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  if (IsVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+                                               /*IsVarArg=*/ !Outs[i].IsFixed);
+      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
+    }
+  } else {
+    // At this point, Outs[].VT may already be promoted to i32. To correctly
+    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
+    // we use a special version of AnalyzeCallOperands to pass in ValVT and
+    // LocVT.
+    unsigned NumArgs = Outs.size();
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ValVT = Outs[i].VT;
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(DAG.getDataLayout(),
+                                  CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+                                  /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
+    }
+  }
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (IsSibCall) {
+    // Since we're not changing the ABI to make this a tail call, the memory
+    // operands are already available in the caller's incoming argument space.
+    NumBytes = 0;
+  }
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0. Completely unused for non-tail calls.
+  int FPDiff = 0;
+
+  if (IsTailCall && !IsSibCall) {
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+    // Since callee will pop argument stack as a tail call, we must keep the
+    // popped size 16-byte aligned.
+    NumBytes = alignTo(NumBytes, 16);
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started at
+    // a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
+                                                              true),
+                                 DL);
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
+                                        getPointerTy(DAG.getDataLayout()));
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      if (Outs[realArgIdx].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+      }
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::FPExt:
+      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+        assert(VA.getLocVT() == MVT::i64 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+               "unexpected use of 'returned'");
+        IsThisReturn = true;
+      }
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
+
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
+      uint32_t BEAlign = 0;
+      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                        : VA.getValVT().getSizeInBits();
+      OpSize = (OpSize + 7) / 8;
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+          !Flags.isInConsecutiveRegs()) {
+        if (OpSize < 8)
+          BEAlign = 8 - OpSize;
+      }
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      int32_t Offset = LocMemOffset + BEAlign;
+      SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+      PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+
+      if (IsTailCall) {
+        Offset = Offset + FPDiff;
+        int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstInfo =
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
+
+        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
+                                               LocMemOffset);
+      }
+
+      if (Outs[i].Flags.isByVal()) {
+        SDValue SizeNode =
+            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
+        SDValue Cpy = DAG.getMemcpy(
+            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+            /*isVol = */ false, /*AlwaysInline = */ false,
+            /*isTailCall = */ false,
+            DstInfo, MachinePointerInfo());
+
+        MemOpChains.push_back(Cpy);
+      } else {
+        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+        // promoted to a legal register type i32, we should truncate Arg back to
+        // i1/i8/i16.
+        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
+            VA.getValVT() == MVT::i16)
+          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+        MemOpChains.push_back(Store);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (auto &RegToPass : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+                             RegToPass.second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      Subtarget->isTargetMachO()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      bool InternalLinkage = GV->hasInternalLinkage();
+      if (InternalLinkage)
+        Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+      else {
+        Callee =
+            DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+      }
+    } else if (ExternalSymbolSDNode *S =
+                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+      Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
+      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
+  }
+
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                               DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+  }
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (auto &RegToPass : RegsToPass)
+    Ops.push_back(DAG.getRegister(RegToPass.first,
+                                  RegToPass.second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (IsThisReturn) {
+    // For 'this' returns, use the X0-preserving mask if applicable
+    Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
+    if (!Mask) {
+      IsThisReturn = false;
+      Mask = TRI->getCallPreservedMask(MF, CallConv);
+    }
+  } else
+    Mask = TRI->getCallPreservedMask(MF, CallConv);
+
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // If we're doing a tall call, use a TC_RETURN here rather than an
+  // actual call instruction.
+  if (IsTailCall) {
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+  }
+
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  uint64_t CalleePopBytes =
+      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                             DAG.getIntPtrConstant(CalleePopBytes, DL, true),
+                             InFlag, DL);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals, IsThisReturn,
+                         IsThisReturn ? OutVals[0] : SDValue());
+}
+
+bool AArch64TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC);
+}
+
+SDValue
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   const SDLoc &DL, SelectionDAG &DAG) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC);
+
+  // Copy the result values into the output registers.
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (Outs[i].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
+        // value. This is strictly redundant on Darwin (which uses "zeroext
+        // i1"), but will be optimised out before ISel.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      }
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (AArch64::GPR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else if (AArch64::FPR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+         "unexpected offset in global node");
+
+  // This also catched the large code model case for Darwin.
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes instead of using a wrapper node.
+    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+  }
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+  } else {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+    // the only correct model on Darwin.
+    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                            OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
+
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+///     + "extern __thread" declaration.
+///     + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i64] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first xword, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "x0".
+///
+/// Since this descriptor may be in a different unit, in general even the
+/// descriptor must be accessed via an indirect load. The "ideal" code sequence
+/// is:
+///     adrp x0, _var@TLVPPAGE
+///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
+///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
+///                                      ; the function pointer
+///     blr x1                           ; Uses descriptor address in x0
+///     ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
+SDValue
+AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+
+  SDLoc DL(Op);
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  SDValue TLVPAddr =
+      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
+
+  // The first entry in the descriptor is a function pointer that we must call
+  // to obtain the address of the variable.
+  SDValue Chain = DAG.getEntryNode();
+  SDValue FuncTLVGet = DAG.getLoad(
+      MVT::i64, DL, Chain, DescAddr,
+      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+      /* Alignment = */ 8,
+      MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
+          MachineMemOperand::MODereferenceable);
+  Chain = FuncTLVGet.getValue(1);
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const uint32_t *Mask =
+      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
+
+  // Finally, we can make the call. This is just a degenerate version of a
+  // normal AArch64 call node: x0 takes the address of the descriptor, and
+  // returns the address of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
+  Chain =
+      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
+}
+
+/// When accessing thread-local variables under either the general-dynamic or
+/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
+/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
+/// is a function pointer to carry out the resolution.
+///
+/// The sequence is:
+///    adrp  x0, :tlsdesc:var
+///    ldr   x1, [x0, #:tlsdesc_lo12:var]
+///    add   x0, x0, #:tlsdesc_lo12:var
+///    .tlsdesccall var
+///    blr   x1
+///    (TPIDR_EL0 offset now in x0)
+///
+///  The above sequence must be produced unscheduled, to enable the linker to
+///  optimize/relax this sequence.
+///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
+///  above sequence, and expanded really late in the compilation flow, to ensure
+///  the sequence is produced as per above.
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
+                                                      const SDLoc &DL,
+                                                      SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Chain = DAG.getEntryNode();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  Chain =
+      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
+  SDValue Glue = Chain.getValue(1);
+
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
+}
+
+SDValue
+AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+         "ELF TLS only supported in small memory model");
+  // Different choices can be made for the maximum size of the TLS area for a
+  // module. For the small address model, the default TLS size is 16MiB and the
+  // maximum TLS size is 4GiB.
+  // FIXME: add -mtls-size command line option and make it control the 16MiB
+  // vs. 4GiB code sequence generation.
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
+    if (Model == TLSModel::LocalDynamic)
+      Model = TLSModel::GeneralDynamic;
+  }
+
+  SDValue TPOff;
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    SDValue TPWithOff_lo =
+        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+                                   HiVar,
+                                   DAG.getTargetConstant(0, DL, MVT::i32)),
+                0);
+    SDValue TPWithOff =
+        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
+                                   LoVar,
+                                   DAG.getTargetConstant(0, DL, MVT::i32)),
+                0);
+    return TPWithOff;
+  } else if (Model == TLSModel::InitialExec) {
+    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
+  } else if (Model == TLSModel::LocalDynamic) {
+    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
+    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
+    // the beginning of the module's TLS region, followed by a DTPREL offset
+    // calculation.
+
+    // These accesses will need deduplicating if there's more than one.
+    AArch64FunctionInfo *MFI =
+        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+    MFI->incNumLocalDynamicTLSAccesses();
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                  AArch64II::MO_TLS);
+
+    // Now we can calculate the offset from TPIDR_EL0 to this module's
+    // thread-local area.
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
+
+    // Now use :dtprel_whatever: operations to calculate this variable's offset
+    // in its thread-storage area.
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
+                                       DAG.getTargetConstant(0, DL, MVT::i32)),
+                    0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
+                                       DAG.getTargetConstant(0, DL, MVT::i32)),
+                    0);
+  } else if (Model == TLSModel::GeneralDynamic) {
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr =
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+
+    // Finally we can make a call to calculate the offset from tpidr_el0.
+    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
+  } else
+    llvm_unreachable("Unsupported ELF TLS access model");
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+}
+
+SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerDarwinGlobalTLSAddress(Op, DAG);
+  else if (Subtarget->isTargetELF())
+    return LowerELFGlobalTLSAddress(Op, DAG);
+
+  llvm_unreachable("Unexpected platform trying to use TLS");
+}
+SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  // Handle f128 first, since lowering it will result in comparing the return
+  // value of a libcall against zero, which is just what the rest of LowerBR_CC
+  // is expecting to deal with.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+  // instruction.
+  unsigned Opc = LHS.getOpcode();
+  if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+           "Unexpected condition code.");
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+      return SDValue();
+
+    // The actual operation with overflow check.
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
+
+    if (CC == ISD::SETNE)
+      OFCC = getInvertedCondCode(OFCC);
+    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Overflow);
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    // If the RHS of the comparison is zero, we can potentially fold this
+    // to a specialized branch.
+    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if (RHSC && RHSC->getZExtValue() == 0) {
+      if (CC == ISD::SETEQ) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
+                             Dest);
+        }
+
+        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETNE) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
+                             Dest);
+        }
+
+        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+        // Don't combine AND since emitComparison converts the AND to an ANDS
+        // (a.k.a. TST) and the test in the test bit and branch instruction
+        // becomes redundant.  This would also increase register pressure.
+        uint64_t Mask = LHS.getValueSizeInBits() - 1;
+        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+                           DAG.getConstant(Mask, dl, MVT::i64), Dest);
+      }
+    }
+    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+        LHS.getOpcode() != ISD::AND) {
+      // Don't combine AND since emitComparison converts the AND to an ANDS
+      // (a.k.a. TST) and the test in the test bit and branch instruction
+      // becomes redundant.  This would also increase register pressure.
+      uint64_t Mask = LHS.getValueSizeInBits() - 1;
+      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+                         DAG.getConstant(Mask, dl, MVT::i64), Dest);
+    }
+
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
+  SDValue BR1 =
+      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+                       Cmp);
+  }
+
+  return BR1;
+}
+
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+  EVT SrcVT = In2.getValueType();
+
+  if (SrcVT.bitsLT(VT))
+    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+  else if (SrcVT.bitsGT(VT))
+    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
+
+  EVT VecVT;
+  EVT EltVT;
+  uint64_t EltMask;
+  SDValue VecVal1, VecVal2;
+  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+    EltVT = MVT::i32;
+    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
+    EltMask = 0x80000000ULL;
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+    EltVT = MVT::i64;
+    VecVT = MVT::v2i64;
+
+    // We want to materialize a mask with the high bit set, but the AdvSIMD
+    // immediate moves cannot materialize that in a single instruction for
+    // 64-bit elements. Instead, materialize zero and then negate it.
+    EltMask = 0;
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else {
+    llvm_unreachable("Invalid type for copysign!");
+  }
+
+  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
+
+  // If we couldn't materialize the mask above, then the mask vector will be
+  // the zero vector, and we need to negate it here.
+  if (VT == MVT::f64 || VT == MVT::v2f64) {
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
+  }
+
+  SDValue Sel =
+      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
+
+  if (VT == MVT::f32)
+    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
+  else if (VT == MVT::f64)
+    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
+  else
+    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+}
+
+SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::NoImplicitFloat))
+    return SDValue();
+
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // While there is no integer popcount instruction, it can
+  // be more efficiently lowered to the following sequence that uses
+  // AdvSIMD registers/instructions as long as the copies to/from
+  // the AdvSIMD registers are cheap.
+  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
+  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
+  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
+  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::i32)
+    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+
+  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
+  SDValue UaddLV = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+
+  if (VT == MVT::i64)
+    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+  return UaddLV;
+}
+
+SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  if (Op.getValueType().isVector())
+    return LowerVSETCC(Op, DAG);
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc dl(Op);
+
+  // We chose ZeroOrOneBooleanContents, so use zero and one.
+  EVT VT = Op.getValueType();
+  SDValue TVal = DAG.getConstant(1, dl, VT);
+  SDValue FVal = DAG.getConstant(0, dl, VT);
+
+  // Handle f128 first, since one possible outcome is a normal integer
+  // comparison which gets picked up by the next if statement.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, use it.
+    if (!RHS.getNode()) {
+      assert(LHS.getValueType() == Op.getValueType() &&
+             "Unexpected setcc expansion!");
+      return LHS;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue CCVal;
+    SDValue Cmp =
+        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  if (CC2 == AArch64CC::AL) {
+    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
+
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+  } else {
+    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+    // totally clean.  Some of them require two CSELs to implement.  As is in
+    // this case, we emit the first CSEL and then emit a second using the output
+    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
+
+    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
+    SDValue CS1 =
+        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
+                                              SDValue RHS, SDValue TVal,
+                                              SDValue FVal, const SDLoc &dl,
+                                              SelectionDAG &DAG) const {
+  // Handle f128 first, because it will result in a comparison of some RTLIB
+  // call result against zero.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  // Also handle f16, for which we need to do a f32 comparison.
+  if (LHS.getValueType() == MVT::f16) {
+    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+  }
+
+  // Next, handle integers.
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    unsigned Opcode = AArch64ISD::CSEL;
+
+    // If both the TVal and the FVal are constants, see if we can swap them in
+    // order to for a CSINV or CSINC out of them.
+    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (TVal.getOpcode() == ISD::XOR) {
+      // If TVal is a NOT we want to swap TVal and FVal so that we can match
+      // with a CSINV rather than a CSEL.
+      if (isAllOnesConstant(TVal.getOperand(1))) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (TVal.getOpcode() == ISD::SUB) {
+      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+      // that we can match with a CSNEG rather than a CSEL.
+      if (isNullConstant(TVal.getOperand(0))) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (CTVal && CFVal) {
+      const int64_t TrueVal = CTVal->getSExtValue();
+      const int64_t FalseVal = CFVal->getSExtValue();
+      bool Swap = false;
+
+      // If both TVal and FVal are constants, see if FVal is the
+      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+      // instead of a CSEL in that case.
+      if (TrueVal == ~FalseVal) {
+        Opcode = AArch64ISD::CSINV;
+      } else if (TrueVal == -FalseVal) {
+        Opcode = AArch64ISD::CSNEG;
+      } else if (TVal.getValueType() == MVT::i32) {
+        // If our operands are only 32-bit wide, make sure we use 32-bit
+        // arithmetic for the check whether we can use CSINC. This ensures that
+        // the addition in the check will wrap around properly in case there is
+        // an overflow (which would not be the case if we do the check with
+        // 64-bit arithmetic).
+        const uint32_t TrueVal32 = CTVal->getZExtValue();
+        const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+          Opcode = AArch64ISD::CSINC;
+
+          if (TrueVal32 > FalseVal32) {
+            Swap = true;
+          }
+        }
+        // 64-bit check whether we can use CSINC.
+      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+        Opcode = AArch64ISD::CSINC;
+
+        if (TrueVal > FalseVal) {
+          Swap = true;
+        }
+      }
+
+      // Swap TVal and FVal if necessary.
+      if (Swap) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+
+      if (Opcode != AArch64ISD::CSEL) {
+        // Drop FVal since we can get its value by simply inverting/negating
+        // TVal.
+        FVal = TVal;
+      }
+    }
+
+    // Avoid materializing a constant when possible by reusing a known value in
+    // a register.  However, don't perform this optimization if the known value
+    // is one, zero or negative one in the case of a CSEL.  We can always
+    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
+    // FVal, respectively.
+    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
+    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
+        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
+      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
+      // "a != C ? x : a" to avoid materializing C.
+      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
+        TVal = LHS;
+      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
+        FVal = LHS;
+    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
+      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
+      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
+      // avoid materializing C.
+      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
+        Opcode = AArch64ISD::CSINV;
+        TVal = LHS;
+        FVal = DAG.getConstant(0, dl, FVal.getValueType());
+      }
+    }
+
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+
+    EVT VT = TVal.getValueType();
+    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+  assert(LHS.getValueType() == RHS.getValueType());
+  EVT VT = TVal.getValueType();
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two CSELs to implement.
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+
+  if (DAG.getTarget().Options.UnsafeFPMath) {
+    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
+    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
+    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
+    if (RHSVal && RHSVal->isZero()) {
+      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
+      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
+
+      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
+          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
+        TVal = LHS;
+      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
+               CFVal && CFVal->isZero() &&
+               FVal.getValueType() == LHS.getValueType())
+        FVal = LHS;
+    }
+  }
+
+  // Emit first, and possibly only, CSEL.
+  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
+  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+  // If we need a second CSEL, emit it, using the output of the first as the
+  // RHS.  We're effectively OR'ing the two CC's together.
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+  }
+
+  // Otherwise, return the output of the first CSEL.
+  return CS1;
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  SDLoc DL(Op);
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue CCVal = Op->getOperand(0);
+  SDValue TVal = Op->getOperand(1);
+  SDValue FVal = Op->getOperand(2);
+  SDLoc DL(Op);
+
+  unsigned Opc = CCVal.getOpcode();
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+  // instruction.
+  if (CCVal.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
+      return SDValue();
+
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
+    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+                       CCVal, Overflow);
+  }
+
+  // Lower it the same way as we would lower a SELECT_CC node.
+  ISD::CondCode CC;
+  SDValue LHS, RHS;
+  if (CCVal.getOpcode() == ISD::SETCC) {
+    LHS = CCVal.getOperand(0);
+    RHS = CCVal.getOperand(1);
+    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
+  } else {
+    LHS = CCVal;
+    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
+    CC = ISD::SETNE;
+  }
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                               AArch64II::MO_G0 | MO_NC));
+  }
+
+  SDValue Hi =
+      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
+  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+}
+
+SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    // Use the GOT for the large code model on iOS.
+    if (Subtarget->isTargetMachO()) {
+      SDValue GotAddr = DAG.getTargetConstantPool(
+          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+          AArch64II::MO_GOT);
+      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+    }
+
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G3),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+  } else {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
+    // ELF, the only valid one on Darwin.
+    SDValue Hi =
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetConstantPool(
+        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+        AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+  } else {
+    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
+                                                             AArch64II::MO_NC);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AArch64FunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
+                                 getPointerTy(DAG.getDataLayout()));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  // The layout of the va_list struct is specified in the AArch64 Procedure Call
+  // Standard, section B.3.
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAList = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SmallVector<SDValue, 4> MemOps;
+
+  // void *__stack at offset 0
+  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
+  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
+                                MachinePointerInfo(SV), /* Alignment = */ 8));
+
+  // void *__gr_top at offset 8
+  int GPRSize = FuncInfo->getVarArgsGPRSize();
+  if (GPRSize > 0) {
+    SDValue GRTop, GRTopAddr;
+
+    GRTopAddr =
+        DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
+
+    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
+    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
+                        DAG.getConstant(GPRSize, DL, PtrVT));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
+                                  MachinePointerInfo(SV, 8),
+                                  /* Alignment = */ 8));
+  }
+
+  // void *__vr_top at offset 16
+  int FPRSize = FuncInfo->getVarArgsFPRSize();
+  if (FPRSize > 0) {
+    SDValue VRTop, VRTopAddr;
+    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                            DAG.getConstant(16, DL, PtrVT));
+
+    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
+    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
+                        DAG.getConstant(FPRSize, DL, PtrVT));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
+                                  MachinePointerInfo(SV, 16),
+                                  /* Alignment = */ 8));
+  }
+
+  // int __gr_offs at offset 24
+  SDValue GROffsAddr =
+      DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
+  MemOps.push_back(DAG.getStore(
+      Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
+      MachinePointerInfo(SV, 24), /* Alignment = */ 4));
+
+  // int __vr_offs at offset 28
+  SDValue VROffsAddr =
+      DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
+  MemOps.push_back(DAG.getStore(
+      Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
+      MachinePointerInfo(SV, 28), /* Alignment = */ 4));
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
+                                     : LowerAAPCS_VASTART(Op, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
+  // pointer.
+  SDLoc DL(Op);
+  unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
+                       Op.getOperand(2),
+                       DAG.getConstant(VaListSize, DL, MVT::i32),
+                       8, false, false, false, MachinePointerInfo(DestSV),
+                       MachinePointerInfo(SrcSV));
+}
+
+SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() &&
+         "automatic va_arg instruction only works on Darwin");
+
+  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  unsigned Align = Op.getConstantOperandVal(3);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
+  Chain = VAList.getValue(1);
+
+  if (Align > 8) {
+    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                         DAG.getConstant(Align - 1, DL, PtrVT));
+    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
+                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
+  }
+
+  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+  uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+
+  // Scalar integer and FP values smaller than 64 bits are implicitly extended
+  // up to 64 bits.  At the very least, we have to increase the striding of the
+  // vaargs list to match this, and for FP values we need to introduce
+  // FP_ROUND nodes as well.
+  if (VT.isInteger() && !VT.isVector())
+    ArgSize = 8;
+  bool NeedFPTrunc = false;
+  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
+    ArgSize = 8;
+    NeedFPTrunc = true;
+  }
+
+  // Increment the pointer, VAList, to the next vaarg
+  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                               DAG.getConstant(ArgSize, DL, PtrVT));
+  // Store the incremented VAList to the legalized pointer
+  SDValue APStore =
+      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
+
+  // Load the actual argument out of the pointer VAList
+  if (NeedFPTrunc) {
+    // Load the value as an f64.
+    SDValue WideFP =
+        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
+    // Round the value down to an f32.
+    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
+                                   DAG.getIntPtrConstant(1, DL));
+    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+    // Merge the rounded value with the chain output of the load.
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
+}
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                                  SelectionDAG &DAG) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", AArch64::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error(Twine("Invalid register name \""
+                              + StringRef(RegName)  + "\"."));
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
+  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  HiBitsForLo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  HiBitsForLo, CCVal, Cmp);
+
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, dl, MVT::i64));
+
+  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue LoForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
+
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForBigShift =
+      Opc == ISD::SRA
+          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
+          : DAG.getConstant(0, dl, VT);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
+  SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  LoBitsForHi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  LoBitsForHi, CCVal, Cmp);
+
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, dl, MVT::i64));
+  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
+
+  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
+
+  // AArch64 shifts of larger than register sizes are wrapped rather than
+  // clamped, so we can't just emit "lo << a" if a is too big.
+  SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
+  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+bool AArch64TargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  // The AArch64 target doesn't support folding offsets into global addresses.
+  return false;
+}
+
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+  // FIXME: We should be able to handle f128 as well with a clever lowering.
+  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
+    return true;
+
+  if (VT == MVT::f64)
+    return AArch64_AM::getFP64Imm(Imm) != -1;
+  else if (VT == MVT::f32)
+    return AArch64_AM::getFP32Imm(Imm) != -1;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
+                           SDValue Operand, SelectionDAG &DAG,
+                           int &ExtraSteps) {
+  EVT VT = Operand.getValueType();
+  if (ST->hasNEON() &&
+      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
+       VT == MVT::f32 || VT == MVT::v1f32 ||
+       VT == MVT::v2f32 || VT == MVT::v4f32)) {
+    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
+      // For the reciprocal estimates, convergence is quadratic, so the number
+      // of digits is doubled after each iteration.  In ARMv8, the accuracy of
+      // the initial estimate is 2^-8.  Thus the number of extra steps to refine
+      // the result for float (23 mantissa bits) is 2 and for double (52
+      // mantissa bits) is 3.
+      ExtraSteps = VT == MVT::f64 ? 3 : 2;
+
+    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+  }
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
+                                               SelectionDAG &DAG, int Enabled,
+                                               int &ExtraSteps,
+                                               bool &UseOneConst,
+                                               bool Reciprocal) const {
+  if (Enabled == ReciprocalEstimate::Enabled ||
+      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
+    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
+                                       DAG, ExtraSteps)) {
+      SDLoc DL(Operand);
+      EVT VT = Operand.getValueType();
+
+      SDNodeFlags Flags;
+      Flags.setUnsafeAlgebra(true);
+
+      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
+      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
+      for (int i = ExtraSteps; i > 0; --i) {
+        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
+                                   &Flags);
+        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+      }
+
+      if (!Reciprocal) {
+        EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                      VT);
+        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+        SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
+
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+        // Correct the result if the operand is 0.0.
+        Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
+                               VT, Eq, Operand, Estimate);
+      }
+
+      ExtraSteps = 0;
+      return Estimate;
+    }
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+                                                SelectionDAG &DAG, int Enabled,
+                                                int &ExtraSteps) const {
+  if (Enabled == ReciprocalEstimate::Enabled)
+    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
+                                       DAG, ExtraSteps)) {
+      SDLoc DL(Operand);
+      EVT VT = Operand.getValueType();
+
+      SDNodeFlags Flags;
+      Flags.setUnsafeAlgebra(true);
+
+      // Newton reciprocal iteration: E * (2 - X * E)
+      // AArch64 reciprocal iteration instruction: (2 - M * N)
+      for (int i = ExtraSteps; i > 0; --i) {
+        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
+                                   Estimate, &Flags);
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+      }
+
+      ExtraSteps = 0;
+      return Estimate;
+    }
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler, not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+//   Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+  // At this point, we have to lower this constraint to something else, so we
+  // lower it to an "r" or "w". However, by doing this we will force the result
+  // to be in register, while the X constraint is much more permissive.
+  //
+  // Although we are correct (we are free to emit anything, without
+  // constraints), we might break use cases that would expect us to be more
+  // efficient and emit something else.
+  if (!Subtarget->hasFPARMv8())
+    return "r";
+
+  if (ConstraintVT.isFloatingPoint())
+    return "w";
+
+  if (ConstraintVT.isVector() &&
+     (ConstraintVT.getSizeInBits() == 64 ||
+      ConstraintVT.getSizeInBits() == 128))
+    return "w";
+
+  return "r";
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'z':
+      return C_Other;
+    case 'x':
+    case 'w':
+      return C_RegisterClass;
+    // An address with a single base register. Due to the way we
+    // currently handle addresses it is the same as 'r'.
+    case 'Q':
+      return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'x':
+  case 'w':
+    if (type->isFloatingPointTy() || type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'z':
+    weight = CW_Constant;
+    break;
+  }
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
+      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
+    case 'w':
+      if (VT.getSizeInBits() == 16)
+        return std::make_pair(0U, &AArch64::FPR16RegClass);
+      if (VT.getSizeInBits() == 32)
+        return std::make_pair(0U, &AArch64::FPR32RegClass);
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
+      break;
+    // The instructions that this constraint is designed for can
+    // only take 128-bit registers so just use that regclass.
+    case 'x':
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass *> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  // Not found as a standard register?
+  if (!Res.second) {
+    unsigned Size = Constraint.size();
+    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
+      int RegNo;
+      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
+      if (!Failed && RegNo >= 0 && RegNo <= 31) {
+        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
+        // By default we'll emit v0-v31 for this unless there's a modifier where
+        // we'll emit the correct register as well.
+        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
+          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
+          Res.second = &AArch64::FPR64RegClass;
+        } else {
+          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+          Res.second = &AArch64::FPR128RegClass;
+        }
+      }
+    }
+  }
+
+  return Res;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void AArch64TargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default:
+    break;
+
+  // This set of constraints deal with valid constants for various instructions.
+  // Validate and return a target constant for them if we can.
+  case 'z': {
+    // 'z' maps to xzr or wzr so it needs an input of 0.
+    if (!isNullConstant(Op))
+      return;
+
+    if (Op.getValueType() == MVT::i64)
+      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
+    else
+      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
+    break;
+  }
+
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    // Grab the value and do some validation.
+    uint64_t CVal = C->getZExtValue();
+    switch (ConstraintLetter) {
+    // The I constraint applies only to simple ADD or SUB immediate operands:
+    // i.e. 0 to 4095 with optional shift by 12
+    // The J constraint applies only to ADD or SUB immediates that would be
+    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+    // instruction [or vice versa], in other words -1 to -4095 with optional
+    // left shift by 12.
+    case 'I':
+      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
+        break;
+      return;
+    case 'J': {
+      uint64_t NVal = -C->getSExtValue();
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+        CVal = C->getSExtValue();
+        break;
+      }
+      return;
+    }
+    // The K and L constraints apply *only* to logical immediates, including
+    // what used to be the MOVI alias for ORR (though the MOVI alias has now
+    // been removed and MOV should be used). So these constraints have to
+    // distinguish between bit patterns that are valid 32-bit or 64-bit
+    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+    // versa.
+    case 'K':
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      return;
+    case 'L':
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      return;
+    // The M and N constraints are a superset of K and L respectively, for use
+    // with the MOV (immediate) alias. As well as the logical immediates they
+    // also match 32 or 64-bit immediates that can be loaded either using a
+    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
+    // (M) or 64-bit 0x1234000000000000 (N) etc.
+    // As a note some of this code is liberally stolen from the asm parser.
+    case 'M': {
+      if (!isUInt<32>(CVal))
+        return;
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      if ((CVal & 0xFFFF) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~(uint32_t)CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      return;
+    }
+    case 'N': {
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      if ((CVal & 0xFFFFULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF00000000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF000000000000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
+        break;
+      return;
+    }
+    default:
+      return;
+    }
+
+    // All assembler immediates are 64-bit integers.
+    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+//                     AArch64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+  EVT VT = V64Reg.getValueType();
+  unsigned NarrowSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+  SDLoc DL(V64Reg);
+
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+                     V64Reg, DAG.getConstant(0, DL, MVT::i32));
+}
+
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+  EVT EltType = V.getValueType().getVectorElementType();
+  return EltType.getSizeInBits() / 8;
+}
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+  SDLoc DL(V128Reg);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  struct ShuffleSourceInfo {
+    SDValue Vec;
+    unsigned MinElt;
+    unsigned MaxElt;
+
+    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+    // be compatible with the shuffle we intend to construct. As a result
+    // ShuffleVec will be some sliding window into the original Vec.
+    SDValue ShuffleVec;
+
+    // Code should guarantee that element i in Vec starts at element "WindowBase
+    // + i * WindowScale in ShuffleVec".
+    int WindowBase;
+    int WindowScale;
+
+    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+    ShuffleSourceInfo(SDValue Vec)
+        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+          WindowScale(1) {}
+  };
+
+  // First gather all vectors used as an immediate source for this BUILD_VECTOR
+  // node.
+  SmallVector<ShuffleSourceInfo, 2> Sources;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.isUndef())
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+             !isa<ConstantSDNode>(V.getOperand(1))) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors, provided their indices are constant.
+      return SDValue();
+    }
+
+    // Add this element source to the list if it's not already there.
+    SDValue SourceVec = V.getOperand(0);
+    auto Source = find(Sources, SourceVec);
+    if (Source == Sources.end())
+      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
+
+    // Update the minimum and maximum lane number seen.
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    Source->MinElt = std::min(Source->MinElt, EltNo);
+    Source->MaxElt = std::max(Source->MaxElt, EltNo);
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // are involved.
+  if (Sources.size() > 2)
+    return SDValue();
+
+  // Find out the smallest element size among result and two sources, and use
+  // it as element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (auto &Source : Sources) {
+    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy)) {
+      SmallestEltTy = SrcEltTy;
+    }
+  }
+  unsigned ResMultiplier =
+      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
+  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+  // If the source vector is too wide or too narrow, we may nevertheless be able
+  // to construct a compatible shuffle either by concatenating it with UNDEF or
+  // extracting a suitable range of elements.
+  for (auto &Src : Sources) {
+    EVT SrcVT = Src.ShuffleVec.getValueType();
+
+    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+      continue;
+
+    // This stage of the search produces a source with the same element type as
+    // the original, but with a total width matching the BUILD_VECTOR output.
+    EVT EltVT = SrcVT.getVectorElementType();
+    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
+      // We can pad out the smaller vector for free, so if it's part of a
+      // shuffle...
+      Src.ShuffleVec =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
+      continue;
+    }
+
+    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
+
+    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (Src.MinElt >= NumSrcElts) {
+      // The extraction can just take the second half
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
+      Src.WindowBase = -NumSrcElts;
+    } else if (Src.MaxElt < NumSrcElts) {
+      // The extraction can just take the first half
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, dl, MVT::i64));
+    } else {
+      // An actual VEXT is needed
+      SDValue VEXTSrc1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, dl, MVT::i64));
+      SDValue VEXTSrc2 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
+      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+                                   VEXTSrc2,
+                                   DAG.getConstant(Imm, dl, MVT::i32));
+      Src.WindowBase = -Src.MinElt;
+    }
+  }
+
+  // Another possible incompatibility occurs from the vector element types. We
+  // can fix this by bitcasting the source vectors to the same type we intend
+  // for the shuffle.
+  for (auto &Src : Sources) {
+    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+    if (SrcEltTy == SmallestEltTy)
+      continue;
+    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+    Src.WindowBase *= Src.WindowScale;
+  }
+
+  // Final sanity check before we try to actually produce a shuffle.
+  DEBUG(
+    for (auto Src : Sources)
+      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+  );
+
+  // The stars all align, our next step is to produce the mask for the shuffle.
+  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.isUndef())
+      continue;
+
+    auto Src = find(Sources, Entry.getOperand(0));
+    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+    // segment.
+    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+    int BitsDefined =
+        std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
+    int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+    // This source is expected to fill ResMultiplier lanes of the final shuffle,
+    // starting at the appropriate offset.
+    int *LaneMask = &Mask[i * ResMultiplier];
+
+    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+    ExtractBase += NumElts * (Src - Sources.begin());
+    for (int j = 0; j < LanesDefined; ++j)
+      LaneMask[j] = ExtractBase + j;
+  }
+
+  // Final check before we try to produce nonsense...
+  if (!isShuffleMaskLegal(Mask, ShuffleVT))
+    return SDValue();
+
+  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+  for (unsigned i = 0; i < Sources.size(); ++i)
+    ShuffleOps[i] = Sources[i].ShuffleVec;
+
+  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                         ShuffleOps[1], Mask);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+                      unsigned &Imm) {
+  // Look for the first non-undef element.
+  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+
+  // Benefit form APInt to handle overflow when calculating expected element.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+  // The following shuffle indices must be the successive elements after the
+  // first real element.
+  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+  if (FirstWrongElt != M.end())
+    return false;
+
+  // The index of an EXT is the first element if it is not UNDEF.
+  // Watch out for the beginning UNDEFs. The EXT index should be the expected
+  // value of the first element.  E.g.
+  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+  // ExpectedElt is the last mask index plus 1.
+  Imm = ExpectedElt.getZExtValue();
+
+  // There are two difference cases requiring to reverse input vectors.
+  // For example, for vector <4 x i32> we have the following cases,
+  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+  // to reverse two input vectors.
+  if (Imm < NumElts)
+    ReverseEXT = true;
+  else
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  return true;
+}
+
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      int MIdx = M[i + j * Half];
+      if (MIdx >= 0 && (unsigned)MIdx != Idx)
+        return false;
+      Idx += 2;
+    }
+  }
+
+  return true;
+}
+
+/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+                      bool &DstIsLeft, int &Anomaly) {
+  if (M.size() != static_cast<size_t>(NumInputElements))
+    return false;
+
+  int NumLHSMatch = 0, NumRHSMatch = 0;
+  int LastLHSMismatch = -1, LastRHSMismatch = -1;
+
+  for (int i = 0; i < NumInputElements; ++i) {
+    if (M[i] == -1) {
+      ++NumLHSMatch;
+      ++NumRHSMatch;
+      continue;
+    }
+
+    if (M[i] == i)
+      ++NumLHSMatch;
+    else
+      LastLHSMismatch = i;
+
+    if (M[i] == i + NumInputElements)
+      ++NumRHSMatch;
+    else
+      LastRHSMismatch = i;
+  }
+
+  if (NumLHSMatch == NumInputElements - 1) {
+    DstIsLeft = true;
+    Anomaly = LastLHSMismatch;
+    return true;
+  } else if (NumRHSMatch == NumInputElements - 1) {
+    DstIsLeft = false;
+    Anomaly = LastRHSMismatch;
+    return true;
+  }
+
+  return false;
+}
+
+static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+  if (VT.getSizeInBits() != 128)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (int I = 0, E = NumElts / 2; I != E; I++) {
+    if (Mask[I] != I)
+      return false;
+  }
+
+  int Offset = NumElts / 2;
+  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+    if (Mask[I] != I + SplitLHS * Offset)
+      return false;
+  }
+
+  return true;
+}
+
+static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
+    return SDValue();
+
+  bool SplitV0 = V0.getValueSizeInBits() == 128;
+
+  if (!isConcatMask(Mask, VT, SplitV0))
+    return SDValue();
+
+  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+  if (SplitV0) {
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+                     DAG.getConstant(0, DL, MVT::i64));
+  }
+  if (V1.getValueSizeInBits() == 128) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+                     DAG.getConstant(0, DL, MVT::i64));
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      const SDLoc &dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1 * 9 + 2) * 9 + 3)
+      return LHS;
+    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default:
+    llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> REV32
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
+      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> REV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3: {
+    EVT EltTy = VT.getVectorElementType();
+    unsigned Opcode;
+    if (EltTy == MVT::i8)
+      Opcode = AArch64ISD::DUPLANE8;
+    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+      Opcode = AArch64ISD::DUPLANE16;
+    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+      Opcode = AArch64ISD::DUPLANE32;
+    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+      Opcode = AArch64ISD::DUPLANE64;
+    else
+      llvm_unreachable("Invalid vector element type?");
+
+    if (VT.getSizeInBits() == 64)
+      OpLHS = WidenVector(OpLHS, DAG);
+    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
+    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+  }
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3: {
+    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
+                       DAG.getConstant(Imm, dl, MVT::i32));
+  }
+  case OP_VUZPL:
+    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VUZPR:
+    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPL:
+    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPR:
+    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNL:
+    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNR:
+    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  }
+}
+
+static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+                           SelectionDAG &DAG) {
+  // Check to see if we can use the TBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
+  SmallVector<SDValue, 8> TBLMask;
+  for (int Val : ShuffleMask) {
+    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+      unsigned Offset = Byte + Val * BytesPerElt;
+      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
+    }
+  }
+
+  MVT IndexVT = MVT::v8i8;
+  unsigned IndexLen = 8;
+  if (Op.getValueSizeInBits() == 128) {
+    IndexVT = MVT::v16i8;
+    IndexLen = 16;
+  }
+
+  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+  SDValue Shuffle;
+  if (V2.getNode()->isUndef()) {
+    if (IndexLen == 8)
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
+    Shuffle = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
+        DAG.getBuildVector(IndexVT, DL,
+                           makeArrayRef(TBLMask.data(), IndexLen)));
+  } else {
+    if (IndexLen == 8) {
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
+          DAG.getBuildVector(IndexVT, DL,
+                             makeArrayRef(TBLMask.data(), IndexLen)));
+    } else {
+      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+      // cannot currently represent the register constraints on the input
+      // table registers.
+      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+      //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
+      //                   IndexLen));
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
+          V2Cst, DAG.getBuildVector(IndexVT, DL,
+                                    makeArrayRef(TBLMask.data(), IndexLen)));
+    }
+  }
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
+static unsigned getDUPLANEOp(EVT EltType) {
+  if (EltType == MVT::i8)
+    return AArch64ISD::DUPLANE8;
+  if (EltType == MVT::i16 || EltType == MVT::f16)
+    return AArch64ISD::DUPLANE16;
+  if (EltType == MVT::i32 || EltType == MVT::f32)
+    return AArch64ISD::DUPLANE32;
+  if (EltType == MVT::i64 || EltType == MVT::f64)
+    return AArch64ISD::DUPLANE64;
+
+  llvm_unreachable("Invalid vector element type?");
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  if (SVN->isSplat()) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1)
+      Lane = 0;
+
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+                         V1.getOperand(0));
+    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+    // constant. If so, we can just reference the lane's definition directly.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(Lane)))
+      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+    // Otherwise, duplicate from the lane of the input vector.
+    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
+    // to make a vector of the same size as this SHUFFLE. We can ignore the
+    // extract entirely, and canonicalise the concat using WidenVector.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      V1 = V1.getOperand(0);
+    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+      Lane -= Idx * VT.getVectorNumElements() / 2;
+      V1 = WidenVector(V1.getOperand(Idx), DAG);
+    } else if (VT.getSizeInBits() == 64)
+      V1 = WidenVector(V1, DAG);
+
+    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
+  }
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+  bool ReverseEXT = false;
+  unsigned Imm;
+  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+    if (ReverseEXT)
+      std::swap(V1, V2);
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+                       DAG.getConstant(Imm, dl, MVT::i32));
+  } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+                       DAG.getConstant(Imm, dl, MVT::i32));
+  }
+
+  unsigned WhichResult;
+  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+
+  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+
+  if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
+    return Concat;
+
+  bool DstIsLeft;
+  int Anomaly;
+  int NumInputElements = V1.getValueType().getVectorNumElements();
+  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+    SDValue DstVec = DstIsLeft ? V1 : V2;
+    SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
+
+    SDValue SrcVec = V1;
+    int SrcLane = ShuffleMask[Anomaly];
+    if (SrcLane >= NumInputElements) {
+      SrcVec = V2;
+      SrcLane -= VT.getVectorNumElements();
+    }
+    SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
+
+    EVT ScalarVT = VT.getVectorElementType();
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
+      ScalarVT = MVT::i32;
+
+    return DAG.getNode(
+        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
+        DstLaneV);
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  return GenerateTBL(Op, ShuffleMask, DAG);
+}
+
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+                               APInt &UndefBits) {
+  EVT VT = BVN->getValueType(0);
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
+
+    for (unsigned i = 0; i < NumSplats; ++i) {
+      CnstBits <<= SplatBitSize;
+      UndefBits <<= SplatBitSize;
+      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
+      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  SDValue LHS = Op.getOperand(0);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if (!BVN)
+    return Op;
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We only have BIC vector immediate instruction, which is and-not.
+    CnstBits = ~CnstBits;
+
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(16, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(24, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = ~UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+  return Op;
+}
+
+// Specialized code to quickly find if PotentialBVec is a BuildVector that
+// consists of only the same constant int value, returned in reference arg
+// ConstVal
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+                                     uint64_t &ConstVal) {
+  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+  if (!Bvec)
+    return false;
+  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+  if (!FirstElt)
+    return false;
+  EVT VT = Bvec->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 1; i < NumElts; ++i)
+    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
+      return false;
+  ConstVal = FirstElt->getZExtValue();
+  return true;
+}
+
+static unsigned getIntrinsicID(const SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  default:
+    return Intrinsic::not_intrinsic;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return IID;
+    return Intrinsic::not_intrinsic;
+  }
+  }
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
+// Also, logical shift right -> sri, with the same structure.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Is the first op an AND?
+  const SDValue And = N->getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // Is the second op an shl or lshr?
+  SDValue Shift = N->getOperand(1);
+  // This will have been turned into: AArch64ISD::VSHL vector, #shift
+  // or AArch64ISD::VLSHR vector, #shift
+  unsigned ShiftOpc = Shift.getOpcode();
+  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
+    return SDValue();
+  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+
+  // Is the shift amount constant?
+  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!C2node)
+    return SDValue();
+
+  // Is the and mask vector all constant?
+  uint64_t C1;
+  if (!isAllConstantBuildVector(And.getOperand(1), C1))
+    return SDValue();
+
+  // Is C1 == ~C2, taking into account how much one can shift elements of a
+  // particular size?
+  uint64_t C2 = C2node->getZExtValue();
+  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+  if (C2 > ElemSizeInBits)
+    return SDValue();
+  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+  if ((C1 & ElemMask) != (~C2 & ElemMask))
+    return SDValue();
+
+  SDValue X = And.getOperand(0);
+  SDValue Y = Shift.getOperand(0);
+
+  unsigned Intrin =
+      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+  SDValue ResultSLI =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
+                  Shift.getOperand(1));
+
+  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  DEBUG(N->dump(&DAG));
+  DEBUG(dbgs() << "into: \n");
+  DEBUG(ResultSLI->dump(&DAG));
+
+  ++NumShiftInserts;
+  return ResultSLI;
+}
+
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+  if (EnableAArch64SlrGeneration) {
+    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+      return Res;
+  }
+
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+  SDValue LHS = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // OR commutes, so try swapping the operands.
+  if (!BVN) {
+    LHS = Op.getOperand(0);
+    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  }
+  if (!BVN)
+    return Op;
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(16, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(24, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+  return Op;
+}
+
+// Normalize the operands of BUILD_VECTOR. The value of constant operands will
+// be truncated to fit element width.
+static SDValue NormalizeBuildVector(SDValue Op,
+                                    SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  EVT EltTy= VT.getVectorElementType();
+
+  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
+    return Op;
+
+  SmallVector<SDValue, 16> Ops;
+  for (SDValue Lane : Op->ops()) {
+    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
+      APInt LowBits(EltTy.getSizeInBits(),
+                    CstLane->getZExtValue());
+      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
+    }
+    Ops.push_back(Lane);
+  }
+  return DAG.getBuildVector(VT, dl, Ops);
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  Op = NormalizeBuildVector(Op, DAG);
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      // Certain magic vector constants (used to express things like NOT
+      // and NEG) are passed through unmodified.  This allows codegen patterns
+      // for these operations to match.  Special-purpose patterns will lower
+      // these immediates to MOVIs if it proves necessary.
+      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+        return Op;
+
+      // The many faces of MOVI...
+      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
+        if (VT.getSizeInBits() == 128) {
+          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
+                                    DAG.getConstant(CnstVal, dl, MVT::i32));
+          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+        }
+
+        // Support the V64 version via subregister insertion.
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(16, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(24, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(264, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(272, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      // The few faces of FMOV...
+      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
+          VT.getSizeInBits() == 128) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      // The many faces of MVNI...
+      CnstVal = ~CnstVal;
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(16, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(24, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(8, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(264, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, dl, MVT::i32),
+                                  DAG.getConstant(272, dl, MVT::i32));
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+FailedModImm:
+
+  // Scan through the operands to find some interesting properties we can
+  // exploit:
+  //   1) If only one value is used, we can use a DUP, or
+  //   2) if only the low element is not undef, we can just insert that, or
+  //   3) if only one constant value is used (w/ some non-constant lanes),
+  //      we can splat the constant value into the whole vector then fill
+  //      in the non-constant lanes.
+  //   4) FIXME: If different constant values are used, but we can intelligently
+  //             select the values we'll be overwriting for the non-constant
+  //             lanes such that we can directly materialize the vector
+  //             some other way (MOVI, e.g.), we can be sneaky.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool usesOnlyOneConstantValue = true;
+  bool isConstant = true;
+  unsigned NumConstantLanes = 0;
+  SDValue Value;
+  SDValue ConstantValue;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.isUndef())
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+      ++NumConstantLanes;
+      if (!ConstantValue.getNode())
+        ConstantValue = V;
+      else if (ConstantValue != V)
+        usesOnlyOneConstantValue = false;
+    }
+
+    if (!Value.getNode())
+      Value = V;
+    else if (V != Value)
+      usesOnlyOneValue = false;
+  }
+
+  if (!Value.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  // Use DUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (usesOnlyOneValue) {
+    if (!isConstant) {
+      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+          Value.getValueType() != VT)
+        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
+
+      // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+      // DUPLANE works on 128-bit vectors, widen it if necessary.
+      SDValue Lane = Value.getOperand(1);
+      Value = Value.getOperand(0);
+      if (Value.getValueSizeInBits() == 64)
+        Value = WidenVector(Value, DAG);
+
+      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+      return DAG.getNode(Opcode, dl, VT, Value, Lane);
+    }
+
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      EVT EltTy = VT.getVectorElementType();
+      assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
+              "Unsupported floating-point vector type");
+      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
+      Val = LowerBUILD_VECTOR(Val, DAG);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+  }
+
+  // If there was only one constant value used and for more than one lane,
+  // start by splatting that value, then replace the non-constant lanes. This
+  // is better than the default, which will perform a separate initialization
+  // for each lane.
+  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+    // Now insert the non-constant lanes.
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
+      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+        // Note that type legalization likely mucked about with the VT of the
+        // source operand, so we may have to convert it here before inserting.
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+      }
+    }
+    return Val;
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
+      return shuffle;
+  }
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    SDValue Op0 = Op.getOperand(0);
+    unsigned ElemSize = VT.getScalarSizeInBits();
+    unsigned i = 0;
+    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+    // a) Avoid a RMW dependency on the full vector register, and
+    // b) Allow the register coalescer to fold away the copy if the
+    //    value is already in an S or D register.
+    // Do not do this for UNDEF/LOAD nodes because we have better patterns
+    // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
+    if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
+        (ElemSize == 32 || ElemSize == 64)) {
+      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
+      MachineSDNode *N =
+          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+                             DAG.getTargetConstant(SubIdx, dl, MVT::i32));
+      Vec = SDValue(N, 0);
+      ++i;
+    }
+    for (; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.isUndef())
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
+  // Just use the default expansion. We failed to find a better alternative.
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+    return SDValue();
+
+  // For V64 types, we perform insertion by expanding the value
+  // to a V128 type and perform the insertion on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
+
+  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
+                             Op.getOperand(1), Op.getOperand(2));
+  // Re-narrow the resultant vector.
+  return NarrowVector(Node, DAG);
+}
+
+SDValue
+AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
+    return SDValue();
+
+
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+    return SDValue();
+
+  // For V64 types, we perform extraction by expanding the value
+  // to a V128 type and perform the extraction on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
+
+  EVT ExtrTy = WideTy.getVectorElementType();
+  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
+    ExtrTy = MVT::i32;
+
+  // For extractions, we just return the result directly.
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
+                     Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+  // Just in case...
+  if (!VT.isVector())
+    return SDValue();
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Cst)
+    return SDValue();
+  unsigned Val = Cst->getZExtValue();
+
+  unsigned Size = Op.getValueSizeInBits();
+
+  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+  if (Val == 0)
+    return Op;
+
+  // If this is extracting the upper 64-bits of a 128-bit vector, we match
+  // that directly.
+  if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
+    return Op;
+
+  return SDValue();
+}
+
+bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                               EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool DummyBool;
+  int DummyInt;
+  unsigned DummyUnsigned;
+
+  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
+          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
+          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+          isZIPMask(M, VT, DummyUnsigned) ||
+          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+          isConcatMask(M, VT, VT.getSizeInBits() == 128));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                    HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation. The value must be in the range:
+///   1 <= Value <= ElementBits for a right shift; or
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+}
+
+SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  int64_t Cnt;
+
+  if (!Op.getOperand(1).getValueType().isVector())
+    return Op;
+  unsigned EltSize = VT.getScalarSizeInBits();
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, DL, MVT::i32));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
+                                       MVT::i32),
+                       Op.getOperand(0), Op.getOperand(1));
+  case ISD::SRA:
+  case ISD::SRL:
+    // Right shift immediate
+    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
+      unsigned Opc =
+          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, DL, MVT::i32));
+    }
+
+    // Right shift register.  Note, there is not a shift right register
+    // instruction, but the shift left register instruction takes a signed
+    // value, where negative numbers specify a right shift.
+    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
+                                                : Intrinsic::aarch64_neon_ushl;
+    // negate the shift amount
+    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+    SDValue NegShiftLeft =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
+                    NegShift);
+    return NegShiftLeft;
+  }
+
+  return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    const SDLoc &dl, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+  bool IsZero = IsCnst && (CnstBits == 0);
+
+  if (SrcVT.getVectorElementType().isFloatingPoint()) {
+    switch (CC) {
+    default:
+      return SDValue();
+    case AArch64CC::NE: {
+      SDValue Fcmeq;
+      if (IsZero)
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      else
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+    }
+    case AArch64CC::EQ:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+    case AArch64CC::GE:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+    case AArch64CC::GT:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+    case AArch64CC::LS:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+    case AArch64CC::LT:
+      if (!NoNans)
+        return SDValue();
+      // If we ignore NaNs then we can use to the MI implementation.
+      LLVM_FALLTHROUGH;
+    case AArch64CC::MI:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
+    }
+  }
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Cmeq;
+    if (IsZero)
+      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    else
+      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+  }
+  case AArch64CC::EQ:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+  case AArch64CC::GE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
+  case AArch64CC::GT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
+  case AArch64CC::LO:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
+  case AArch64CC::HI:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
+  case AArch64CC::HS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
+  SDLoc dl(Op);
+
+  if (LHS.getValueType().getVectorElementType().isInteger()) {
+    assert(LHS.getValueType() == RHS.getValueType());
+    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+    SDValue Cmp =
+        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
+    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
+  }
+
+  if (LHS.getValueType().getVectorElementType() == MVT::f16)
+    return SDValue();
+
+  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
+         LHS.getValueType().getVectorElementType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  AArch64CC::CondCode CC1, CC2;
+  bool ShouldInvert;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+
+  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+  SDValue Cmp =
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
+  if (!Cmp.getNode())
+    return SDValue();
+
+  if (CC2 != AArch64CC::AL) {
+    SDValue Cmp2 =
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
+    if (!Cmp2.getNode())
+      return SDValue();
+
+    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
+  }
+
+  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
+
+  if (ShouldInvert)
+    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+
+  return Cmp;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                               const CallInst &I,
+                                               unsigned Intrinsic) const {
+  auto &DL = I.getModule()->getDataLayout();
+  switch (Intrinsic) {
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_ld1x2:
+  case Intrinsic::aarch64_neon_ld1x3:
+  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld2lane:
+  case Intrinsic::aarch64_neon_ld3lane:
+  case Intrinsic::aarch64_neon_ld4lane:
+  case Intrinsic::aarch64_neon_ld2r:
+  case Intrinsic::aarch64_neon_ld3r:
+  case Intrinsic::aarch64_neon_ld4r: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+  case Intrinsic::aarch64_neon_st1x2:
+  case Intrinsic::aarch64_neon_st1x3:
+  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st2lane:
+  case Intrinsic::aarch64_neon_st3lane:
+  case Intrinsic::aarch64_neon_st4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxr:
+  case Intrinsic::aarch64_ldxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxr:
+  case Intrinsic::aarch64_stxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxp:
+  case Intrinsic::aarch64_ldxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxp:
+  case Intrinsic::aarch64_stxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// Truncations from 64-bit GPR to 32-bit GPR is free.
+bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 > NumBits2;
+}
+bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+/// Check if it is profitable to hoist instruction in then/else to if.
+/// Not profitable if I and it's user can form a FMA instruction
+/// because we prefer FMSUB/FMADD.
+bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
+  if (I->getOpcode() != Instruction::FMul)
+    return true;
+
+  if (I->getNumUses() != 1)
+    return true;
+
+  Instruction *User = I->user_back();
+
+  if (User &&
+      !(User->getOpcode() == Instruction::FSub ||
+        User->getOpcode() == Instruction::FAdd))
+    return true;
+
+  const TargetOptions &Options = getTargetMachine().Options;
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  EVT VT = getValueType(DL, User->getOperand(0)->getType());
+
+  return !(isFMAFasterThanFMulAndFAdd(VT) &&
+           isOperationLegalOrCustom(ISD::FMA, VT) &&
+           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+            Options.UnsafeFPMath));
+}
+
+// All 32-bit GPR operations implicitly zero the high-half of the corresponding
+// 64-bit GPR.
+bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 == 32 && NumBits2 == 64;
+}
+bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 == 32 && NumBits2 == 64;
+}
+
+bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2)) {
+    return true;
+  }
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
+          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
+          VT1.getSizeInBits() <= 32);
+}
+
+bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
+  if (isa<FPExtInst>(Ext))
+    return false;
+
+  // Vector types are next free.
+  if (Ext->getType()->isVectorTy())
+    return false;
+
+  for (const Use &U : Ext->uses()) {
+    // The extension is free if we can fold it with a left shift in an
+    // addressing mode or an arithmetic operation: add, sub, and cmp.
+
+    // Is there a shift?
+    const Instruction *Instr = cast<Instruction>(U.getUser());
+
+    // Is this a constant shift?
+    switch (Instr->getOpcode()) {
+    case Instruction::Shl:
+      if (!isa<ConstantInt>(Instr->getOperand(1)))
+        return false;
+      break;
+    case Instruction::GetElementPtr: {
+      gep_type_iterator GTI = gep_type_begin(Instr);
+      auto &DL = Ext->getModule()->getDataLayout();
+      std::advance(GTI, U.getOperandNo()-1);
+      Type *IdxTy = GTI.getIndexedType();
+      // This extension will end up with a shift because of the scaling factor.
+      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
+      // Get the shift amount based on the scaling factor:
+      // log2(sizeof(IdxTy)) - log2(8).
+      uint64_t ShiftAmt =
+          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
+      // Is the constant foldable in the shift of the addressing mode?
+      // I.e., shift amount is between 1 and 4 inclusive.
+      if (ShiftAmt == 0 || ShiftAmt > 4)
+        return false;
+      break;
+    }
+    case Instruction::Trunc:
+      // Check if this is a noop.
+      // trunc(sext ty1 to ty2) to ty1.
+      if (Instr->getType() == Ext->getOperand(0)->getType())
+        continue;
+      LLVM_FALLTHROUGH;
+    default:
+      return false;
+    }
+
+    // At this point we can use the bfm family, so this extension is free
+    // for that use.
+  }
+  return true;
+}
+
+bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
+                                          unsigned &RequiredAligment) const {
+  if (!LoadedType.isSimple() ||
+      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
+    return false;
+  // Cyclone supports unaligned accesses.
+  RequiredAligment = 0;
+  unsigned NumBits = LoadedType.getSizeInBits();
+  return NumBits == 32 || NumBits == 64;
+}
+
+/// \brief Lower an interleaved load into a ldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+///      Into:
+///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
+///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool AArch64TargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+
+  VectorType *VecTy = Shuffles[0]->getType();
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+
+  // Skip if we do not have NEON and skip illegal vector types.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+    return false;
+
+  // A pointer vector can not be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  Type *EltTy = VecTy->getVectorElementType();
+  if (EltTy->isPointerTy())
+    VecTy =
+        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+
+  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *Tys[2] = {VecTy, PtrTy};
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
+                                            Intrinsic::aarch64_neon_ld3,
+                                            Intrinsic::aarch64_neon_ld4};
+  Function *LdNFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+  IRBuilder<> Builder(LI);
+  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+
+  // Replace uses of each shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SVI = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+    SVI->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a stN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+///      Into:
+///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// st3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+///      Into:
+///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                  ShuffleVectorInst *SVI,
+                                                  unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
+  VectorType *VecTy = SVI->getType();
+  assert(VecTy->getVectorNumElements() % Factor == 0 &&
+         "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+  // Skip if we do not have NEON and skip illegal vector types.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+    return false;
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL.getIntPtrType(EltTy);
+    unsigned NumOpElts =
+        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+    // Convert to the corresponding integer vector.
+    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+    SubVecTy = VectorType::get(IntTy, LaneLen);
+  }
+
+  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+  Type *Tys[2] = {SubVecTy, PtrTy};
+  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
+                                             Intrinsic::aarch64_neon_st3,
+                                             Intrinsic::aarch64_neon_st4};
+  Function *StNFunc =
+      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+  SmallVector<Value *, 5> Ops;
+
+  // Split the shufflevector operands into sub vectors for the new stN call.
+  auto Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0) {
+      Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+    } else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < LaneLen; j++) {
+        if (Mask[j*Factor + i] >= 0) {
+          StartMask = Mask[j*Factor + i] - j;
+          break;
+        }
+      }
+      // Note: If all elements in a chunk are undefs, StartMask=0!
+      // Note: Filling undef gaps with random elements is ok, since
+      // those elements were being written anyway (with undefs).
+      // In the case of all undefs we're defaulting to using elems from 0
+      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+      Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+    }
+  }
+
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+  Builder.CreateCall(StNFunc, Ops);
+  return true;
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                               unsigned SrcAlign, bool IsMemset,
+                                               bool ZeroMemset,
+                                               bool MemcpyStrSrc,
+                                               MachineFunction &MF) const {
+  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+  // instruction to materialize the v2i64 zero and one store (with restrictive
+  // addressing mode). Just do two i64 store of zero-registers.
+  bool Fast;
+  const Function *F = MF.getFunction();
+  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      (memOpAlign(SrcAlign, DstAlign, 16) ||
+       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
+    return MVT::f128;
+
+  if (Size >= 8 &&
+      (memOpAlign(SrcAlign, DstAlign, 8) ||
+       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
+    return MVT::i64;
+
+  if (Size >= 4 &&
+      (memOpAlign(SrcAlign, DstAlign, 4) ||
+       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
+    return MVT::i32;
+
+  return MVT::Other;
+}
+
+// 12-bit optionally shifted immediates are legal for adds.
+bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+  // Avoid UB for INT64_MIN.
+  if (Immed == std::numeric_limits<int64_t>::min())
+    return false;
+  // Same encoding for add/sub, just flip the sign.
+  Immed = std::abs(Immed);
+  return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
+}
+
+// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
+// immediates is the same as for an add or a sub.
+bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+  return isLegalAddImmediate(Immed);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
+  // AArch64 has five basic addressing modes:
+  //  reg
+  //  reg + 9-bit signed offset
+  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
+  //  reg1 + reg2
+  //  reg + SIZE_IN_BYTES * reg
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // No reg+reg+imm addressing.
+  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+    return false;
+
+  // check reg + imm case:
+  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
+  uint64_t NumBytes = 0;
+  if (Ty->isSized()) {
+    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+    NumBytes = NumBits / 8;
+    if (!isPowerOf2_64(NumBits))
+      NumBytes = 0;
+  }
+
+  if (!AM.Scale) {
+    int64_t Offset = AM.BaseOffs;
+
+    // 9-bit signed offset
+    if (isInt<9>(Offset))
+      return true;
+
+    // 12-bit unsigned offset
+    unsigned shift = Log2_64(NumBytes);
+    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+        // Must be a multiple of NumBytes (NumBytes is a power of 2)
+        (Offset >> shift) << shift == Offset)
+      return true;
+    return false;
+  }
+
+  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+
+  return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
+}
+
+int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
+  // Scaling factors are not free at all.
+  // Operands                     | Rt Latency
+  // -------------------------------------------
+  // Rt, [Xn, Xm]                 | 4
+  // -------------------------------------------
+  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
+  // Rt, [Xn, Wm, <extend> #imm]  |
+  if (isLegalAddressingMode(DL, AM, Ty, AS))
+    // Scale represents reg2 * scale, thus account for 1 if
+    // it is not equal to 0 or 1.
+    return AM.Scale != 0 && AM.Scale != 1;
+  return -1;
+}
+
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+const MCPhysReg *
+AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints.
+  static const MCPhysReg ScratchRegs[] = {
+    AArch64::X16, AArch64::X17, AArch64::LR, 0
+  };
+  return ScratchRegs;
+}
+
+bool
+AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+  EVT VT = N->getValueType(0);
+    // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
+    // it with shift to let it be lowered to UBFX.
+  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+      isa<ConstantSDNode>(N->getOperand(1))) {
+    uint64_t TruncMask = N->getConstantOperandVal(1);
+    if (isMask_64(TruncMask) &&
+      N->getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+      return false;
+  }
+  return true;
+}
+
+bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                              Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return false;
+
+  int64_t Val = Imm.getSExtValue();
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
+    return true;
+
+  if ((int64_t)Val < 0)
+    Val = ~Val;
+  if (BitSize == 32)
+    Val &= (1LL << 32) - 1;
+
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  unsigned Shift = (63 - LZ) / 16;
+  // MOVZ is free so return true for one or fewer MOVK.
+  return Shift < 3;
+}
+
+/// Turn vector tests of the signbit in the form of:
+///   xor (sra X, elt_size(X)-1), -1
+/// into:
+///   cmge X, X, #0
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+                                         const AArch64Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!Subtarget->hasNEON() || !VT.isVector())
+    return SDValue();
+
+  // There must be a shift right algebraic before the xor, and the xor must be a
+  // 'not' operation.
+  SDValue Shift = N->getOperand(0);
+  SDValue Ones = N->getOperand(1);
+  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
+      !ISD::isBuildVectorAllOnes(Ones.getNode()))
+    return SDValue();
+
+  // The shift should be smearing the sign bit across each vector element.
+  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+    return SDValue();
+
+  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CSEL.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+                                  N0.getOperand(0));
+        // Generate SUBS & CSEL.
+        SDValue Cmp =
+            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+                        N0.getOperand(0), DAG.getConstant(0, DL, VT));
+        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+                           DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
+                           SDValue(Cmp.getNode(), 1));
+      }
+  return SDValue();
+}
+
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+    return Cmp;
+
+  return performIntegerAbsCombine(N, DAG);
+}
+
+SDValue
+AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                     SelectionDAG &DAG,
+                                     std::vector<SDNode *> *Created) const {
+  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  if (isIntDivCheap(N->getValueType(0), Attr))
+    return SDValue(N,0); // Lower SDIV as SDIV
+
+  // fold (sdiv X, pow2)
+  EVT VT = N->getValueType(0);
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  unsigned Lg2 = Divisor.countTrailingZeros();
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
+
+  // Add (N0 < 0) ? Pow2 - 1 : 0;
+  SDValue CCVal;
+  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
+
+  if (Created) {
+    Created->push_back(Cmp.getNode());
+    Created->push_back(Add.getNode());
+    Created->push_back(CSel.getNode());
+  }
+
+  // Divide by pow2.
+  SDValue SRA =
+      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
+
+  // If we're dividing by a positive value, we're done.  Otherwise, we must
+  // negate the result.
+  if (Divisor.isNonNegative())
+    return SRA;
+
+  if (Created)
+    Created->push_back(SRA.getNode());
+  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+}
+
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // The below optimizations require a constant RHS.
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    return SDValue();
+
+  ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
+  const APInt &ConstValue = C->getAPIntValue();
+
+  // Multiplication of a power of two plus/minus one can be done more
+  // cheaply as as shift+add/sub. For now, this is true unilaterally. If
+  // future CPUs have a cheaper MADD instruction, this may need to be
+  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+  // 64-bit is 5 cycles, so this is always a win.
+  // More aggressively, some multiplications N0 * C can be lowered to
+  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
+  // e.g. 6=3*2=(2+1)*2.
+  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
+  // which equals to (1+2)*16-(1+2).
+  SDValue N0 = N->getOperand(0);
+  // TrailingZeroes is used to test if the mul can be lowered to
+  // shift+add+shift.
+  unsigned TrailingZeroes = ConstValue.countTrailingZeros();
+  if (TrailingZeroes) {
+    // Conservatively do not lower to shift+add+shift if the mul might be
+    // folded into smul or umul.
+    if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
+                            isZeroExtended(N0.getNode(), DAG)))
+      return SDValue();
+    // Conservatively do not lower to shift+add+shift if the mul might be
+    // folded into madd or msub.
+    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
+                           N->use_begin()->getOpcode() == ISD::SUB))
+      return SDValue();
+  }
+  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
+  // and shift+add+shift.
+  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
+
+  unsigned ShiftAmt, AddSubOpc;
+  // Is the shifted value the LHS operand of the add/sub?
+  bool ShiftValUseIsN0 = true;
+  // Do we need to negate the result?
+  bool NegateResult = false;
+
+  if (ConstValue.isNonNegative()) {
+    // (mul x, 2^N + 1) => (add (shl x, N), x)
+    // (mul x, 2^N - 1) => (sub (shl x, N), x)
+    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
+    APInt SCVMinus1 = ShiftedConstValue - 1;
+    APInt CVPlus1 = ConstValue + 1;
+    if (SCVMinus1.isPowerOf2()) {
+      ShiftAmt = SCVMinus1.logBase2();
+      AddSubOpc = ISD::ADD;
+    } else if (CVPlus1.isPowerOf2()) {
+      ShiftAmt = CVPlus1.logBase2();
+      AddSubOpc = ISD::SUB;
+    } else
+      return SDValue();
+  } else {
+    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+    APInt CVNegPlus1 = -ConstValue + 1;
+    APInt CVNegMinus1 = -ConstValue - 1;
+    if (CVNegPlus1.isPowerOf2()) {
+      ShiftAmt = CVNegPlus1.logBase2();
+      AddSubOpc = ISD::SUB;
+      ShiftValUseIsN0 = false;
+    } else if (CVNegMinus1.isPowerOf2()) {
+      ShiftAmt = CVNegMinus1.logBase2();
+      AddSubOpc = ISD::ADD;
+      NegateResult = true;
+    } else
+      return SDValue();
+  }
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
+                                   DAG.getConstant(ShiftAmt, DL, MVT::i64));
+
+  SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
+  SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
+  SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
+  assert(!(NegateResult && TrailingZeroes) &&
+         "NegateResult and TrailingZeroes cannot both be true for now.");
+  // Negate the result.
+  if (NegateResult)
+    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
+  // Shift the result.
+  if (TrailingZeroes)
+    return DAG.getNode(ISD::SHL, DL, VT, Res,
+                       DAG.getConstant(TrailingZeroes, DL, MVT::i64));
+  return Res;
+}
+
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  // First try to optimize away the conversion when it's conditionally from
+  // a constant. Vectors only.
+  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
+    return Res;
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // Only optimize when the source and destination types have the same width.
+  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+    return SDValue();
+
+  // If the result of an integer load is only used by an integer-to-float
+  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
+  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
+  SDValue N0 = N->getOperand(0);
+  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+                               LN0->getPointerInfo(), LN0->getAlignment(),
+                               LN0->getMemOperand()->getFlags());
+
+    // Make sure successors of the original load stay after it by updating them
+    // to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+    unsigned Opcode =
+        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+  }
+
+  return SDValue();
+}
+
+/// Fold a floating-point multiply by power of two into floating-point to
+/// fixed-point conversion.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t Bits = IntBits == 64 ? 64 : 32;
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+  if (C == -1 || C == 0 || C > Bits)
+    return SDValue();
+
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
+    break;
+  }
+
+  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
+         "Illegal vector type after legalization");
+
+  SDLoc DL(N);
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
+  SDValue FixConv =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
+  // We can handle smaller integers by generating an extra trunc.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// Fold a floating-point divide by power of two into fixed-point to
+/// floating-point conversion.
+static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  unsigned Opc = Op->getOpcode();
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      !Op.getOperand(0).getValueType().isSimple() ||
+      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
+    return SDValue();
+
+  SDValue ConstVec = N->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  int32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  int32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+  if (C == -1 || C == 0 || C > FloatBits)
+    return SDValue();
+
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
+    break;
+  }
+
+  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ConvInput = Op.getOperand(0);
+  bool IsSigned = Opc == ISD::SINT_TO_FP;
+  if (IntBits < FloatBits)
+    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+                            ResTy, ConvInput);
+
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
+                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+                     DAG.getConstant(C, DL, MVT::i32));
+}
+
+/// An EXTR instruction is made up of two shifts, ORed together. This helper
+/// searches for and classifies those shifts.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+                         bool &FromHi) {
+  if (N.getOpcode() == ISD::SHL)
+    FromHi = false;
+  else if (N.getOpcode() == ISD::SRL)
+    FromHi = true;
+  else
+    return false;
+
+  if (!isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  ShiftAmount = N->getConstantOperandVal(1);
+  Src = N->getOperand(0);
+  return true;
+}
+
+/// EXTR instruction extracts a contiguous chunk of bits from two existing
+/// registers viewed as a high/low pair. This function looks for the pattern:
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
+/// EXTR. Can't quite be done in TableGen because the two immediates aren't
+/// independent.
+static SDValue tryCombineToEXTR(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS;
+  uint32_t ShiftLHS = 0;
+  bool LHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
+    return SDValue();
+
+  SDValue RHS;
+  uint32_t ShiftRHS = 0;
+  bool RHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
+    return SDValue();
+
+  // If they're both trying to come from the high part of the register, they're
+  // not really an EXTR.
+  if (LHSFromHi == RHSFromHi)
+    return SDValue();
+
+  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
+    return SDValue();
+
+  if (LHSFromHi) {
+    std::swap(LHS, RHS);
+    std::swap(ShiftLHS, ShiftRHS);
+  }
+
+  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
+                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
+}
+
+static SDValue tryCombineToBSL(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // We only have to look for constant vectors here since the general, variable
+  // case can be handled in TableGen.
+  unsigned Bits = VT.getScalarSizeInBits();
+  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+  for (int i = 1; i >= 0; --i)
+    for (int j = 1; j >= 0; --j) {
+      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+      if (!BVN0 || !BVN1)
+        continue;
+
+      bool FoundMatch = true;
+      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+        if (!CN0 || !CN1 ||
+            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+          FoundMatch = false;
+          break;
+        }
+      }
+
+      if (FoundMatch)
+        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+                           N0->getOperand(1 - i), N1->getOperand(1 - j));
+    }
+
+  return SDValue();
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                                const AArch64Subtarget *Subtarget) {
+  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  if (SDValue Res = tryCombineToEXTR(N, DCI))
+    return Res;
+
+  if (SDValue Res = tryCombineToBSL(N, DCI))
+    return Res;
+
+  return SDValue();
+}
+
+static SDValue performSRLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
+  // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
+  // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() == ISD::BSWAP) {
+    SDLoc DL(N);
+    SDValue N1 = N->getOperand(1);
+    SDValue N00 = N0.getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      uint64_t ShiftAmt = C->getZExtValue();
+      if (VT == MVT::i32 && ShiftAmt == 16 &&
+          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
+        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+      if (VT == MVT::i64 && ShiftAmt == 32 &&
+          DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
+        return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
+    }
+  }
+  return SDValue();
+}
+
+static SDValue performBitcastCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Remove extraneous bitcasts around an extract_subvector.
+  // For example,
+  //    (v4i16 (bitconvert
+  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
+  //  becomes
+  //    (extract_subvector ((v8i16 ...), (i64 4)))
+
+  // Only interested in 64-bit vectors as the ultimate result.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+  if (VT.getSimpleVT().getSizeInBits() != 64)
+    return SDValue();
+  // Is the operand an extract_subvector starting at the beginning or halfway
+  // point of the vector? A low half may also come through as an
+  // EXTRACT_SUBREG, so look for that, too.
+  SDValue Op0 = N->getOperand(0);
+  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+      !(Op0->isMachineOpcode() &&
+        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
+    return SDValue();
+  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+      return SDValue();
+  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
+    if (idx != AArch64::dsub)
+      return SDValue();
+    // The dsub reference is equivalent to a lane zero subvector reference.
+    idx = 0;
+  }
+  // Look through the bitcast of the input to the extract.
+  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue Source = Op0->getOperand(0)->getOperand(0);
+  // If the source type has twice the number of elements as our destination
+  // type, we know this is an extract of the high or low half of the vector.
+  EVT SVT = Source->getValueType(0);
+  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+
+  // Create the simplified form to just extract the low or high half of the
+  // vector directly rather than bothering with the bitcasts.
+  SDLoc dl(N);
+  unsigned NumElements = VT.getVectorNumElements();
+  if (idx) {
+    SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+  } else {
+    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
+    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+                                      Source, SubReg),
+                   0);
+  }
+}
+
+static SDValue performConcatVectorsCombine(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+  // Optimize concat_vectors of truncated vectors, where the intermediate
+  // type is illegal, to avoid said illegality,  e.g.,
+  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
+  //                          (v2i16 (truncate (v2i64)))))
+  // ->
+  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
+  //                                    (v4i32 (bitcast (v2i64))),
+  //                                    <0, 2, 4, 6>)))
+  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
+  // on both input and result type, so we might generate worse code.
+  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
+  if (N->getNumOperands() == 2 &&
+      N0->getOpcode() == ISD::TRUNCATE &&
+      N1->getOpcode() == ISD::TRUNCATE) {
+    SDValue N00 = N0->getOperand(0);
+    SDValue N10 = N1->getOperand(0);
+    EVT N00VT = N00.getValueType();
+
+    if (N00VT == N10.getValueType() &&
+        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
+        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
+      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
+      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
+      for (size_t i = 0; i < Mask.size(); ++i)
+        Mask[i] = i * 2;
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getVectorShuffle(
+                             MidVT, dl,
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
+                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
+    }
+  }
+
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
+  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
+  // canonicalise to that.
+  if (N0 == N1 && VT.getVectorNumElements() == 2) {
+    assert(VT.getScalarSizeInBits() == 64);
+    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
+                       DAG.getConstant(0, dl, MVT::i64));
+  }
+
+  // Canonicalise concat_vectors so that the right-hand vector has as few
+  // bit-casts as possible before its real operation. The primary matching
+  // destination for these operations will be the narrowing "2" instructions,
+  // which depend on the operation being performed on this right-hand vector.
+  // For example,
+  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
+  // becomes
+  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+
+  if (N1->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue RHS = N1->getOperand(0);
+  MVT RHSTy = RHS.getValueType().getSimpleVT();
+  // If the RHS is not a vector, this is not the pattern we're looking for.
+  if (!RHSTy.isVector())
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+
+  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
+                                  RHSTy.getVectorNumElements() * 2);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
+                                 RHS));
+}
+
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  // Transform a scalar conversion of a value from a lane extract into a
+  // lane extract of a vector conversion. E.g., from foo1 to foo2:
+  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+  //
+  // The second form interacts better with instruction selection and the
+  // register allocator to avoid cross-class register copies that aren't
+  // coalescable due to a lane reference.
+
+  // Check the operand and see if it originates from a lane extract.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    // Yep, no additional predication needed. Perform the transform.
+    SDValue IID = N->getOperand(0);
+    SDValue Shift = N->getOperand(2);
+    SDValue Vec = Op1.getOperand(0);
+    SDValue Lane = Op1.getOperand(1);
+    EVT ResTy = N->getValueType(0);
+    EVT VecResTy;
+    SDLoc DL(N);
+
+    // The vector width should be 128 bits by the time we get here, even
+    // if it started as 64 bits (the extract_vector handling will have
+    // done so).
+    assert(Vec.getValueSizeInBits() == 128 &&
+           "unexpected vector size on extract_vector_elt!");
+    if (Vec.getValueType() == MVT::v4i32)
+      VecResTy = MVT::v4f32;
+    else if (Vec.getValueType() == MVT::v2i64)
+      VecResTy = MVT::v2f64;
+    else
+      llvm_unreachable("unexpected vector type!");
+
+    SDValue Convert =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+  }
+  return SDValue();
+}
+
+// AArch64 high-vector "long" operations are formed by performing the non-high
+// version on an extract_subvector of each operand which gets the high half:
+//
+//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// However, there are cases which don't have an extract_high explicitly, but
+// have another operation that can be made compatible with one for free. For
+// example:
+//
+//  (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine does the actual conversion of such DUPs, once outer routines
+// have determined that everything else is in order.
+// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
+// similarly here.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  switch (N.getOpcode()) {
+  case AArch64ISD::DUP:
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
+  case AArch64ISD::MOVI:
+  case AArch64ISD::MOVIshift:
+  case AArch64ISD::MOVIedit:
+  case AArch64ISD::MOVImsl:
+  case AArch64ISD::MVNIshift:
+  case AArch64ISD::MVNImsl:
+    break;
+  default:
+    // FMOV could be supported, but isn't very useful, as it would only occur
+    // if you passed a bitcast' floating point immediate to an eligible long
+    // integer op (addl, smull, ...).
+    return SDValue();
+  }
+
+  MVT NarrowTy = N.getSimpleValueType();
+  if (!NarrowTy.is64BitVector())
+    return SDValue();
+
+  MVT ElementTy = NarrowTy.getVectorElementType();
+  unsigned NumElems = NarrowTy.getVectorNumElements();
+  MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+
+  SDLoc dl(N);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
+                     DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
+                     DAG.getConstant(NumElems, dl, MVT::i64));
+}
+
+static bool isEssentiallyExtractSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    return true;
+
+  return N.getOpcode() == ISD::BITCAST &&
+         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+  const SDValue *Opnd0;
+  const SDValue *Opnd1;
+  ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+struct AArch64SetCCInfo {
+  const SDValue *Cmp;
+  AArch64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+  GenericSetCCInfo Generic;
+  AArch64SetCCInfo AArch64;
+};
+
+/// \brief Helper structure to be able to read SetCC information.  If set to
+/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
+/// GenericSetCCInfo.
+struct SetCCInfoAndKind {
+  SetCCInfo Info;
+  bool IsAArch64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// an
+/// AArch64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meanginfull only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+  // If this is a setcc, this is straight forward.
+  if (Op.getOpcode() == ISD::SETCC) {
+    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SetCCInfo.IsAArch64 = false;
+    return true;
+  }
+  // Otherwise, check if this is a matching csel instruction.
+  // In other words:
+  // - csel 1, 0, cc
+  // - csel 0, 1, !cc
+  if (Op.getOpcode() != AArch64ISD::CSEL)
+    return false;
+  // Set the information about the operands.
+  // TODO: we want the operands of the Cmp not the csel
+  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
+  SetCCInfo.IsAArch64 = true;
+  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Check that the operands matches the constraints:
+  // (1) Both operands must be constants.
+  // (2) One must be 1 and the other must be 0.
+  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+  // Check (1).
+  if (!TValue || !FValue)
+    return false;
+
+  // Check (2).
+  if (!TValue->isOne()) {
+    // Update the comparison when we are interested in !cc.
+    std::swap(TValue, FValue);
+    SetCCInfo.Info.AArch64.CC =
+        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
+  }
+  return TValue->isOne() && FValue->isNullValue();
+}
+
+// Returns true if Op is setcc or zext of setcc.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+  if (isSetCC(Op, Info))
+    return true;
+  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
+    isSetCC(Op->getOperand(0), Info));
+}
+
+// The folding we want to perform is:
+// (add x, [zext] (setcc cc ...) )
+//   -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+  SDValue LHS = Op->getOperand(0);
+  SDValue RHS = Op->getOperand(1);
+  SetCCInfoAndKind InfoAndKind;
+
+  // If neither operand is a SET_CC, give up.
+  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
+    std::swap(LHS, RHS);
+    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
+      return SDValue();
+  }
+
+  // FIXME: This could be generatized to work for FP comparisons.
+  EVT CmpVT = InfoAndKind.IsAArch64
+                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
+                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
+  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+    return SDValue();
+
+  SDValue CCVal;
+  SDValue Cmp;
+  SDLoc dl(Op);
+  if (InfoAndKind.IsAArch64) {
+    CCVal = DAG.getConstant(
+        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
+        MVT::i32);
+    Cmp = *InfoAndKind.Info.AArch64.Cmp;
+  } else
+    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+                      *InfoAndKind.Info.Generic.Opnd1,
+                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+                      CCVal, DAG, dl);
+
+  EVT VT = Op->getValueType(0);
+  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+//      (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector()) {
+    if (N->getOpcode() == ISD::ADD)
+      return performSetccAddFolding(N, DAG);
+    return SDValue();
+  }
+
+  // Make sure both branches are extended in the same way.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+      LHS.getOpcode() != RHS.getOpcode())
+    return SDValue();
+
+  unsigned ExtType = LHS.getOpcode();
+
+  // It's not worth doing if at least one of the inputs isn't already an
+  // extract, but we don't know which it'll be so we have to try both.
+  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+    if (!RHS.getNode())
+      return SDValue();
+
+    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+    if (!LHS.getNode())
+      return SDValue();
+
+    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+  }
+
+  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+//   (aarch64_neon_umull (extract_high (v2i64 vec)))
+//                     (extract_high (v2i64 (dup128 scalar)))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  assert(LHS.getValueType().is64BitVector() &&
+         RHS.getValueType().is64BitVector() &&
+         "unexpected shape for long operation");
+
+  // Either node could be a DUP, but it's not worth doing both of them (you'd
+  // just as well use the non-high version) so look for a corresponding extract
+  // operation on the other "wing".
+  if (isEssentiallyExtractSubvector(LHS)) {
+    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+    if (!RHS.getNode())
+      return SDValue();
+  } else if (isEssentiallyExtractSubvector(RHS)) {
+    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+    if (!LHS.getNode())
+      return SDValue();
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), LHS, RHS);
+}
+
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
+  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+  unsigned ElemBits = ElemTy.getSizeInBits();
+
+  int64_t ShiftAmount;
+  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, ElemBits) ||
+        SplatBitSize != ElemBits)
+      return SDValue();
+
+    ShiftAmount = SplatValue.getSExtValue();
+  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+    ShiftAmount = CVN->getSExtValue();
+  } else
+    return SDValue();
+
+  unsigned Opcode;
+  bool IsRightShift;
+  switch (IID) {
+  default:
+    llvm_unreachable("Unknown shift intrinsic");
+  case Intrinsic::aarch64_neon_sqshl:
+    Opcode = AArch64ISD::SQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_uqshl:
+    Opcode = AArch64ISD::UQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_srshl:
+    Opcode = AArch64ISD::SRSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_urshl:
+    Opcode = AArch64ISD::URSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_sqshlu:
+    Opcode = AArch64ISD::SQSHLU_I;
+    IsRightShift = false;
+    break;
+  }
+
+  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
+    SDLoc dl(N);
+    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
+  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
+    SDLoc dl(N);
+    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
+  SDValue AndN = N->getOperand(2);
+  if (AndN.getOpcode() != ISD::AND)
+    return SDValue();
+
+  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
+  if (!CMask || CMask->getZExtValue() != Mask)
+    return SDValue();
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
+}
+
+static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
+                                           SelectionDAG &DAG) {
+  SDLoc dl(N);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
+                     DAG.getNode(Opc, dl,
+                                 N->getOperand(1).getSimpleValueType(),
+                                 N->getOperand(1)),
+                     DAG.getConstant(0, dl, MVT::i64));
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned IID = getIntrinsicID(N);
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_neon_vcvtfxs2fp:
+  case Intrinsic::aarch64_neon_vcvtfxu2fp:
+    return tryCombineFixedPointConvert(N, DCI, DAG);
+  case Intrinsic::aarch64_neon_saddv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
+  case Intrinsic::aarch64_neon_uaddv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
+  case Intrinsic::aarch64_neon_sminv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
+  case Intrinsic::aarch64_neon_uminv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
+  case Intrinsic::aarch64_neon_smaxv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
+  case Intrinsic::aarch64_neon_umaxv:
+    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
+  case Intrinsic::aarch64_neon_fmax:
+    return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fmin:
+    return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fmaxnm:
+    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fminnm:
+    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull:
+  case Intrinsic::aarch64_neon_pmull:
+  case Intrinsic::aarch64_neon_sqdmull:
+    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+  case Intrinsic::aarch64_neon_sqshl:
+  case Intrinsic::aarch64_neon_uqshl:
+  case Intrinsic::aarch64_neon_sqshlu:
+  case Intrinsic::aarch64_neon_srshl:
+  case Intrinsic::aarch64_neon_urshl:
+    return tryCombineShiftImm(IID, N, DAG);
+  case Intrinsic::aarch64_crc32b:
+  case Intrinsic::aarch64_crc32cb:
+    return tryCombineCRC32(0xff, N, DAG);
+  case Intrinsic::aarch64_crc32h:
+  case Intrinsic::aarch64_crc32ch:
+    return tryCombineCRC32(0xffff, N, DAG);
+  }
+  return SDValue();
+}
+
+static SDValue performExtendCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+  // we can convert that DUP into another extract_high (of a bigger DUP), which
+  // helps the backend to decide that an sabdl2 would be useful, saving a real
+  // extract_high operation.
+  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    SDNode *ABDNode = N->getOperand(0).getNode();
+    unsigned IID = getIntrinsicID(ABDNode);
+    if (IID == Intrinsic::aarch64_neon_sabd ||
+        IID == Intrinsic::aarch64_neon_uabd) {
+      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+      if (!NewABD.getNode())
+        return SDValue();
+
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+                         NewABD);
+    }
+  }
+
+  // This is effectively a custom type legalization for AArch64.
+  //
+  // Type legalization will split an extend of a small, legal, type to a larger
+  // illegal type by first splitting the destination type, often creating
+  // illegal source types, which then get legalized in isel-confusing ways,
+  // leading to really terrible codegen. E.g.,
+  //   %result = v8i32 sext v8i8 %value
+  // becomes
+  //   %losrc = extract_subreg %value, ...
+  //   %hisrc = extract_subreg %value, ...
+  //   %lo = v4i32 sext v4i8 %losrc
+  //   %hi = v4i32 sext v4i8 %hisrc
+  // Things go rapidly downhill from there.
+  //
+  // For AArch64, the [sz]ext vector instructions can only go up one element
+  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
+  // take two instructions.
+  //
+  // This implies that the most efficient way to do the extend from v8i8
+  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
+  // the normal splitting to happen for the v8i16->v8i32.
+
+  // This is pre-legalization to catch some cases where the default
+  // type legalization will create ill-tempered code.
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // We're only interested in cleaning things up for non-legal vector types
+  // here. If both the source and destination are legal, things will just
+  // work naturally without any fiddling.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT ResVT = N->getValueType(0);
+  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+    return SDValue();
+  // If the vector type isn't a simple VT, it's beyond the scope of what
+  // we're  worried about here. Let legalization do its thing and hope for
+  // the best.
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
+    return SDValue();
+
+  // If the source VT is a 64-bit vector, we can play games and get the
+  // better results we want.
+  if (SrcVT.getSizeInBits() != 64)
+    return SDValue();
+
+  unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
+  unsigned ElementCount = SrcVT.getVectorNumElements();
+  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+  SDLoc DL(N);
+  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
+
+  // Now split the rest of the operation into two halves, each with a 64
+  // bit source.
+  EVT LoVT, HiVT;
+  SDValue Lo, Hi;
+  unsigned NumElements = ResVT.getVectorNumElements();
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+                                 ResVT.getVectorElementType(), NumElements / 2);
+
+  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+                               LoVT.getVectorNumElements());
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getConstant(0, DL, MVT::i64));
+  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
+
+  // Now combine the parts back together so we still have a single result
+  // like the combiner expects.
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
+                               SDValue SplatVal, unsigned NumVecElts) {
+  unsigned OrigAlignment = St.getAlignment();
+  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
+
+  // Create scalar stores. This is at least as good as the code sequence for a
+  // split unaligned store which is a dup.s, ext.b, and two stores.
+  // Most of the time the three stores should be replaced by store pair
+  // instructions (stp).
+  SDLoc DL(&St);
+  SDValue BasePtr = St.getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
+                   OrigAlignment, St.getMemOperand()->getFlags());
+
+  unsigned Offset = EltOffset;
+  while (--NumVecElts) {
+    unsigned Alignment = MinAlign(OrigAlignment, Offset);
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                    DAG.getConstant(Offset, DL, MVT::i64));
+    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+                          St.getPointerInfo(), Alignment,
+                          St.getMemOperand()->getFlags());
+    Offset += EltOffset;
+  }
+  return NewST1;
+}
+
+/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.  The
+/// load store optimizer pass will merge them to store pair stores.  This should
+/// be better than a movi to create the vector zero followed by a vector store
+/// if the zero constant is not re-used, since one instructions and one register
+/// live range will be removed.
+///
+/// For example, the final generated code should be:
+///
+///   stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+///   movi v0.2d, #0
+///   str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
+  SDValue StVal = St.getValue();
+  EVT VT = StVal.getValueType();
+
+  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
+  // 2, 3 or 4 i32 elements.
+  int NumVecElts = VT.getVectorNumElements();
+  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
+         VT.getVectorElementType().getSizeInBits() == 64) ||
+        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
+         VT.getVectorElementType().getSizeInBits() == 32)))
+    return SDValue();
+
+  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // If the zero constant has more than one use then the vector store could be
+  // better since the constant mov will be amortized and stp q instructions
+  // should be able to be formed.
+  if (!StVal.hasOneUse())
+    return SDValue();
+
+  // If the immediate offset of the address operand is too large for the stp
+  // instruction, then bail out.
+  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
+    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
+    if (Offset < -512 || Offset > 504)
+      return SDValue();
+  }
+
+  for (int I = 0; I < NumVecElts; ++I) {
+    SDValue EltVal = StVal.getOperand(I);
+    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
+      return SDValue();
+  }
+
+  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+  // undoing this transformation.
+  SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
+                         ? DAG.getRegister(AArch64::WZR, MVT::i32)
+                         : DAG.getRegister(AArch64::XZR, MVT::i64);
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
+}
+
+/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
+/// value. The load store optimizer pass will merge them to store pair stores.
+/// This has better performance than a splat of the scalar followed by a split
+/// vector store. Even if the stores are not merged it is four stores vs a dup,
+/// followed by an ext.b and two stores.
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
+  SDValue StVal = St.getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't replace floating point stores, they possibly won't be transformed to
+  // stp because of the store pair suppress pass.
+  if (VT.isFloatingPoint())
+    return SDValue();
+
+  // We can express a splat as store pair(s) for 2 or 4 elements.
+  unsigned NumVecElts = VT.getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 2)
+    return SDValue();
+
+  // Check that this is a splat.
+  // Make sure that each of the relevant vector element locations are inserted
+  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
+  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
+  SDValue SplatVal;
+  for (unsigned I = 0; I < NumVecElts; ++I) {
+    // Check for insert vector elements.
+    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+      return SDValue();
+
+    // Check that same value is inserted at each vector element.
+    if (I == 0)
+      SplatVal = StVal.getOperand(1);
+    else if (StVal.getOperand(1) != SplatVal)
+      return SDValue();
+
+    // Check insert element index.
+    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
+    if (!CIndex)
+      return SDValue();
+    uint64_t IndexVal = CIndex->getZExtValue();
+    if (IndexVal >= NumVecElts)
+      return SDValue();
+    IndexNotInserted.reset(IndexVal);
+
+    StVal = StVal.getOperand(0);
+  }
+  // Check that all vector element locations were inserted to.
+  if (IndexNotInserted.any())
+      return SDValue();
+
+  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
+}
+
+static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG,
+                           const AArch64Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *S = cast<StoreSDNode>(N);
+  if (S->isVolatile())
+    return SDValue();
+
+  SDValue StVal = S->getValue();
+  EVT VT = StVal.getValueType();
+  if (!VT.isVector())
+    return SDValue();
+
+  // If we get a splat of zeros, convert this vector store to a store of
+  // scalars. They will be merged into store pairs of xzr thereby removing one
+  // instruction and one register.
+  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
+    return ReplacedZeroSplat;
+
+  // FIXME: The logic for deciding if an unaligned store should be split should
+  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+  // a call to that function here.
+
+  if (!Subtarget->isMisaligned128StoreSlow())
+    return SDValue();
+
+  // Don't split at -Oz.
+  if (DAG.getMachineFunction().getFunction()->optForMinSize())
+    return SDValue();
+
+  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+  // those up regresses performance on micro-benchmarks and olden/bh.
+  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+    return SDValue();
+
+  // Split unaligned 16B stores. They are terrible for performance.
+  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+  // extensions can use this to mark that it does not want splitting to happen
+  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+      S->getAlignment() <= 2)
+    return SDValue();
+
+  // If we get a splat of a scalar convert this vector store to a store of
+  // scalars. They will be merged into store pairs thereby removing two
+  // instructions.
+  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
+    return ReplacedSplat;
+
+  SDLoc DL(S);
+  unsigned NumElts = VT.getVectorNumElements() / 2;
+  // Split VT into two.
+  EVT HalfVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getConstant(0, DL, MVT::i64));
+  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getConstant(NumElts, DL, MVT::i64));
+  SDValue BasePtr = S->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+                   S->getAlignment(), S->getMemOperand()->getFlags());
+  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                  DAG.getConstant(8, DL, MVT::i64));
+  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+                      S->getPointerInfo(), S->getAlignment(),
+                      S->getMemOperand()->getFlags());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     bool IsLaneOp) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  unsigned LoadIdx = IsLaneOp ? 1 : 0;
+  SDNode *LD = N->getOperand(LoadIdx).getNode();
+  // If it is not LOAD, can not do such combine.
+  if (LD->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+  EVT MemVT = LoadSDN->getMemoryVT();
+  // Check if memory operand is the same type as the vector element.
+  if (MemVT != VT.getVectorElementType())
+    return SDValue();
+
+  // Check if there are other uses. If so, do not combine as it will introduce
+  // an extra load.
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+       ++UI) {
+    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+      continue;
+    if (*UI != N)
+      return SDValue();
+  }
+
+  SDValue Addr = LD->getOperand(1);
+  SDValue Vector = N->getOperand(0);
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+       Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD
+        || UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load.  Otherwise, folding it
+    // would create a cycle.
+    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+      continue;
+    // Also check that add is not used in the vector operand.  This would also
+    // create a cycle.
+    if (User->isPredecessorOf(Vector.getNode()))
+      continue;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+
+    // Finally, check that the vector doesn't depend on the load.
+    // Again, this would create a cycle.
+    // The load depending on the vector is fine, as that's the case for the
+    // LD1*post we'll eventually generate anyway.
+    if (LoadSDN->isPredecessorOf(Vector.getNode()))
+      continue;
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(LD->getOperand(0));  // Chain
+    if (IsLaneOp) {
+      Ops.push_back(Vector);           // The vector to be inserted
+      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+    }
+    Ops.push_back(Addr);
+    Ops.push_back(Inc);
+
+    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+    SDVTList SDTys = DAG.getVTList(Tys);
+    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+                                           MemVT,
+                                           LoadSDN->getMemOperand());
+
+    // Update the uses.
+    SDValue NewResults[] = {
+        SDValue(LD, 0),            // The result of load
+        SDValue(UpdN.getNode(), 2) // Chain
+    };
+    DCI.CombineTo(LD, NewResults);
+    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
+
+    break;
+  }
+  return SDValue();
+}
+
+/// Simplify \Addr given that the top byte of it is ignored by HW during
+/// address translation.
+static bool performTBISimplification(SDValue Addr,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                        DCI.isBeforeLegalizeOps());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+    DCI.CommitTargetLoweringOpt(TLO);
+    return true;
+  }
+  return false;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
+    return Split;
+
+  if (Subtarget->supportsAddressTopByteIgnored() &&
+      performTBISimplification(N->getOperand(2), DCI, DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+  /// This function handles the log2-shuffle pattern produced by the
+/// LoopVectorizer for the across vector reduction. It consists of
+/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
+/// are reduced, where s is an induction variable from 0 to
+/// log2(NumVectorElements).
+static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
+                                                     unsigned Op,
+                                                     SelectionDAG &DAG) {
+  EVT VTy = OpV->getOperand(0).getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  int NumVecElts = VTy.getVectorNumElements();
+  if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+    if (NumVecElts != 4)
+      return SDValue();
+  } else {
+    if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+      return SDValue();
+  }
+
+  int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+  SDValue PreOp = OpV;
+  // Iterate over each step of the across vector reduction.
+  for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+    SDValue CurOp = PreOp.getOperand(0);
+    SDValue Shuffle = PreOp.getOperand(1);
+    if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      // Try to swap the 1st and 2nd operand as add and min/max instructions
+      // are commutative.
+      CurOp = PreOp.getOperand(1);
+      Shuffle = PreOp.getOperand(0);
+      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+
+    // Check if the input vector is fed by the operator we want to handle,
+    // except the last step; the very first input vector is not necessarily
+    // the same operator we are handling.
+    if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
+      return SDValue();
+
+    // Check if it forms one step of the across vector reduction.
+    // E.g.,
+    //   %cur = add %1, %0
+    //   %shuffle = vector_shuffle %cur, <2, 3, u, u>
+    //   %pre = add %cur, %shuffle
+    if (Shuffle.getOperand(0) != CurOp)
+      return SDValue();
+
+    int NumMaskElts = 1 << CurStep;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+    // Check mask values in each step.
+    // We expect the shuffle mask in each step follows a specific pattern
+    // denoted here by the <M, U> form, where M is a sequence of integers
+    // starting from NumMaskElts, increasing by 1, and the number integers
+    // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
+    // of undef in U should be NumVecElts - NumMaskElts.
+    // E.g., for <8 x i16>, mask values in each step should be :
+    //   step 0 : <1,u,u,u,u,u,u,u>
+    //   step 1 : <2,3,u,u,u,u,u,u>
+    //   step 2 : <4,5,6,7,u,u,u,u>
+    for (int i = 0; i < NumVecElts; ++i)
+      if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+          (i >= NumMaskElts && !(Mask[i] < 0)))
+        return SDValue();
+
+    PreOp = CurOp;
+  }
+  unsigned Opcode;
+  bool IsIntrinsic = false;
+
+  switch (Op) {
+  default:
+    llvm_unreachable("Unexpected operator for across vector reduction");
+  case ISD::ADD:
+    Opcode = AArch64ISD::UADDV;
+    break;
+  case ISD::SMAX:
+    Opcode = AArch64ISD::SMAXV;
+    break;
+  case ISD::UMAX:
+    Opcode = AArch64ISD::UMAXV;
+    break;
+  case ISD::SMIN:
+    Opcode = AArch64ISD::SMINV;
+    break;
+  case ISD::UMIN:
+    Opcode = AArch64ISD::UMINV;
+    break;
+  case ISD::FMAXNUM:
+    Opcode = Intrinsic::aarch64_neon_fmaxnmv;
+    IsIntrinsic = true;
+    break;
+  case ISD::FMINNUM:
+    Opcode = Intrinsic::aarch64_neon_fminnmv;
+    IsIntrinsic = true;
+    break;
+  }
+  SDLoc DL(N);
+
+  return IsIntrinsic
+             ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+                           DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
+             : DAG.getNode(
+                   ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                   DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+                   DAG.getConstant(0, DL, MVT::i64));
+}
+
+/// Target-specific DAG combine for the across vector min/max reductions.
+/// This function specifically handles the final clean-up step of the vector
+/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which narrows down and finds the final min/max value from all
+/// elements of the vector.
+/// For example, for a <16 x i8> vector :
+///   svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
+///   %smax0 = smax %arr, svn0
+///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax1 = smax %smax0, %svn1
+///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %smax2 = smax %smax1, svn2
+///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+///   %sc = setcc %smax2, %svn3, gt
+///   %n0 = extract_vector_elt %sc, #0
+///   %n1 = extract_vector_elt %smax2, #0
+///   %n2 = extract_vector_elt $smax2, #1
+///   %result = select %n0, %n1, n2
+///     becomes :
+///   %1 = smaxv %0
+///   %result = extract_vector_elt %1, 0
+static SDValue
+performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+
+  // Check if the SELECT merges up the final result of the min/max
+  // from a vector.
+  if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Expect N0 is fed by SETCC.
+  SDValue SetCC = N0.getOperand(0);
+  EVT SetCCVT = SetCC.getValueType();
+  if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
+      SetCCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue VectorOp = SetCC.getOperand(0);
+  unsigned Op = VectorOp->getOpcode();
+  // Check if the input vector is fed by the operator we want to handle.
+  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
+      Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
+    return SDValue();
+
+  EVT VTy = VectorOp.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  if (VTy.getSizeInBits() < 64)
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+    if (EltTy != MVT::f32)
+      return SDValue();
+  } else {
+    if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+      return SDValue();
+  }
+
+  // Check if extracting from the same vector.
+  // For example,
+  //   %sc = setcc %vector, %svn1, gt
+  //   %n0 = extract_vector_elt %sc, #0
+  //   %n1 = extract_vector_elt %vector, #0
+  //   %n2 = extract_vector_elt $vector, #1
+  if (!(VectorOp == IfTrue->getOperand(0) &&
+        VectorOp == IfFalse->getOperand(0)))
+    return SDValue();
+
+  // Check if the condition code is matched with the operator type.
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
+      (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
+      (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
+      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
+      (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
+       CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
+       CC != ISD::SETGE) ||
+      (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
+       CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
+       CC != ISD::SETLE))
+    return SDValue();
+
+  // Expect to check only lane 0 from the vector SETCC.
+  if (!isNullConstant(N0.getOperand(1)))
+    return SDValue();
+
+  // Expect to extract the true value from lane 0.
+  if (!isNullConstant(IfTrue.getOperand(1)))
+    return SDValue();
+
+  // Expect to extract the false value from lane 1.
+  if (!isOneConstant(IfFalse.getOperand(1)))
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
+}
+
+/// Target-specific DAG combine for the across vector add reduction.
+/// This function specifically handles the final clean-up step of the vector
+/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
+/// pattern, which adds all elements of a vector together.
+/// For example, for a <4 x i32> vector :
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %result = extract_vector_elt %4, 0
+/// becomes :
+///   %0 = uaddv %0
+///   %result = extract_vector_elt %0, 0
+static SDValue
+performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the ADD.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract idx must constant zero because we only expect the final
+  // result of the reduction is placed in lane 0.
+  if (!isNullConstant(N1))
+    return SDValue();
+
+  EVT VTy = N0.getValueType();
+  if (!VTy.isVector())
+    return SDValue();
+
+  EVT EltTy = VTy.getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  if (VTy.getSizeInBits() < 64)
+    return SDValue();
+
+  return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
+}
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
+      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
+      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+    }
+
+    EVT VecTy;
+    if (IsStore)
+      VecTy = N->getOperand(2).getValueType();
+    else
+      VecTy = N->getValueType(0);
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+      if (IsLaneOp || IsDupOp)
+        NumBytes /= VecTy.getVectorNumElements();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // Incoming chain
+    // Load lane and store have vector list as input.
+    if (IsLaneOp || IsStore)
+      for (unsigned i = 2; i < AddrOpIdx; ++i)
+        Ops.push_back(N->getOperand(i));
+    Ops.push_back(Addr); // Base register
+    Ops.push_back(Inc);
+
+    // Return Types.
+    EVT Tys[6];
+    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;  // Type of write back register
+    Tys[n] = MVT::Other;  // Type of the chain
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+// Checks to see if the value is the prescribed width and returns information
+// about its extension mode.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+  switch(V.getNode()->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+      ExtType = LoadNode->getExtensionType();
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::SEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::ZEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+           1LL << (width - 1);
+  }
+  }
+
+  return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//  |    Input    | | AddConstant | | CompConstant| |     CC      |
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//           |           |           |               |
+//           V           V           |    +----------+
+//          +-------------+  +----+  |    |
+//          |     ADD     |  |0xff|  |    |
+//          +-------------+  +----+  |    |
+//                  |           |    |    |
+//                  V           V    |    |
+//                 +-------------+   |    |
+//                 |     AND     |   |    |
+//                 +-------------+   |    |
+//                      |            |    |
+//                      +-----+      |    |
+//                            |      |    |
+//                            V      V    V
+//                           +-------------+
+//                           |     CMP     |
+//                           +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for any width inputs, the above graph is
+// specific to 8 bits.
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstants bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true for
+// all 16 distinct bit patterns of for the current extension type of Input (w0).
+//
+//   sub      w8, w0, w1
+//   and      w10, w8, #0x0f
+//   cmp      w8, w2
+//   cset     w9, AArch64CC
+//   cmp      w10, w2
+//   cset     w11, AArch64CC
+//   cmp      w9, w11
+//   cset     w0, eq
+//   ret
+//
+// Since the above function shows when the outputs are equivalent it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// for all inputs function, so they can be used determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+
+static bool isEquivalentMaskless(unsigned CC, unsigned width,
+                                 ISD::LoadExtType ExtType, int AddConstant,
+                                 int CompConstant) {
+  // By being careful about our equations and only writing the in term
+  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
+  // make them generally applicable to all bit widths.
+  int MaxUInt = (1 << width);
+
+  // For the purposes of these comparisons sign extending the type is
+  // equivalent to zero extending the add and displacing it by half the integer
+  // width. Provided we are careful and make sure our equations are valid over
+  // the whole range we can just adjust the input and avoid writing equations
+  // for sign extended inputs.
+  if (ExtType == ISD::SEXTLOAD)
+    AddConstant -= (1 << (width-1));
+
+  switch(CC) {
+  case AArch64CC::LE:
+  case AArch64CC::GT: {
+    if ((AddConstant == 0) ||
+        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+        (AddConstant >= 0 && CompConstant < 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LT:
+  case AArch64CC::GE: {
+    if ((AddConstant == 0) ||
+        (AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::HI:
+  case AArch64CC::LS: {
+    if ((AddConstant >= 0 && CompConstant < 0) ||
+       (AddConstant <= 0 && CompConstant >= -1 &&
+        CompConstant < AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::PL:
+  case AArch64CC::MI: {
+    if ((AddConstant == 0) ||
+        (AddConstant > 0 && CompConstant <= 0) ||
+        (AddConstant < 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LO:
+  case AArch64CC::HS: {
+    if ((AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant >= 0 &&
+         CompConstant <= AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::EQ:
+  case AArch64CC::NE: {
+    if ((AddConstant > 0 && CompConstant < 0) ||
+        (AddConstant < 0 && CompConstant >= 0 &&
+         CompConstant < AddConstant + MaxUInt) ||
+        (AddConstant >= 0 && CompConstant >= 0 &&
+         CompConstant >= AddConstant) ||
+        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+      return true;
+  } break;
+  case AArch64CC::VS:
+  case AArch64CC::VC:
+  case AArch64CC::AL:
+  case AArch64CC::NV:
+    return true;
+  case AArch64CC::Invalid:
+    break;
+  }
+
+  return false;
+}
+
+static
+SDValue performCONDCombine(SDNode *N,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG, unsigned CCIndex,
+                           unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // There is a SUBS feeding this condition. Is it fed by a mask we can
+  // use?
+
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    uint32_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // lets see if the mask is superfluous.
+
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  ISD::LoadExtType ExtType;
+
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary, remove it.
+
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
+// Optimize compare with zero and branch.
+static SDValue performBRCONDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
+    N = NV.getNode();
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue CCVal = N->getOperand(2);
+  SDValue Cmp = N->getOperand(3);
+
+  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+    return SDValue();
+
+  unsigned CmpOpc = Cmp.getOpcode();
+  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
+    return SDValue();
+
+  // Only attempt folding if there is only one use of the flag and no use of the
+  // value.
+  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected the value type to be the same for both operands!");
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return SDValue();
+
+  if (isNullConstant(LHS))
+    std::swap(LHS, RHS);
+
+  if (!isNullConstant(RHS))
+    return SDValue();
+
+  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
+      LHS.getOpcode() == ISD::SRL)
+    return SDValue();
+
+  // Fold the compare into the branch instruction.
+  SDValue BR;
+  if (CC == AArch64CC::EQ)
+    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+  else
+    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, BR, false);
+
+  return SDValue();
+}
+
+// Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
+// as well as whether the test should be inverted.  This code is required to
+// catch these cases (as opposed to standard dag combines) because
+// AArch64ISD::TBZ is matched during legalization.
+static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
+                                 SelectionDAG &DAG) {
+
+  if (!Op->hasOneUse())
+    return Op;
+
+  // We don't handle undef/constant-fold cases below, as they should have
+  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
+  // etc.)
+
+  // (tbz (trunc x), b) -> (tbz x, b)
+  // This case is just here to enable more of the below cases to be caught.
+  if (Op->getOpcode() == ISD::TRUNCATE &&
+      Bit < Op->getValueType(0).getSizeInBits()) {
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+  }
+
+  if (Op->getNumOperands() != 2)
+    return Op;
+
+  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!C)
+    return Op;
+
+  switch (Op->getOpcode()) {
+  default:
+    return Op;
+
+  // (tbz (and x, m), b) -> (tbz x, b)
+  case ISD::AND:
+    if ((C->getZExtValue() >> Bit) & 1)
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    return Op;
+
+  // (tbz (shl x, c), b) -> (tbz x, b-c)
+  case ISD::SHL:
+    if (C->getZExtValue() <= Bit &&
+        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+      Bit = Bit - C->getZExtValue();
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    }
+    return Op;
+
+  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
+  case ISD::SRA:
+    Bit = Bit + C->getZExtValue();
+    if (Bit >= Op->getValueType(0).getSizeInBits())
+      Bit = Op->getValueType(0).getSizeInBits() - 1;
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+
+  // (tbz (srl x, c), b) -> (tbz x, b+c)
+  case ISD::SRL:
+    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+      Bit = Bit + C->getZExtValue();
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    }
+    return Op;
+
+  // (tbz (xor x, -1), b) -> (tbnz x, b)
+  case ISD::XOR:
+    if ((C->getZExtValue() >> Bit) & 1)
+      Invert = !Invert;
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+  }
+}
+
+// Optimize test single bit zero/non-zero and branch.
+static SDValue performTBZCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  bool Invert = false;
+  SDValue TestSrc = N->getOperand(1);
+  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
+
+  if (TestSrc == NewTestSrc)
+    return SDValue();
+
+  unsigned NewOpc = N->getOpcode();
+  if (Invert) {
+    if (NewOpc == AArch64ISD::TBZ)
+      NewOpc = AArch64ISD::TBNZ;
+    else {
+      assert(NewOpc == AArch64ISD::TBNZ);
+      NewOpc = AArch64ISD::TBZ;
+    }
+  }
+
+  SDLoc DL(N);
+  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
+                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
+}
+
+// vselect (v1i1 setcc) ->
+//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
+// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
+// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
+// such VSELECT.
+static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT CCVT = N0.getValueType();
+
+  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
+      CCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  EVT ResVT = N->getValueType(0);
+  EVT CmpVT = N0.getOperand(0).getValueType();
+  // Only combine when the result type is of the same size as the compared
+  // operands.
+  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
+    return SDValue();
+
+  SDValue IfTrue = N->getOperand(1);
+  SDValue IfFalse = N->getOperand(2);
+  SDValue SetCC =
+      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
+                   N0.getOperand(0), N0.getOperand(1),
+                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
+                     IfTrue, IfFalse);
+}
+
+/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
+/// the compare-mask instructions rather than going via NZCV, even if LHS and
+/// RHS are really scalar. This replaces any scalar setcc in the above pattern
+/// with a vector one followed by a DUP shuffle on the result.
+static SDValue performSelectCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  EVT ResVT = N->getValueType(0);
+
+  if (N0.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
+  // scalar SetCCResultType. We also don't expect vectors, because we assume
+  // that selects fed by vector SETCCs are canonicalized to VSELECT.
+  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
+         "Scalar-SETCC feeding SELECT has unexpected result type!");
+
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
+  EVT SrcVT = N0.getOperand(0).getValueType();
+
+  // Don't try to do this optimization when the setcc itself has i1 operands.
+  // There are no legal vectors of i1, so this would be pointless.
+  if (SrcVT == MVT::i1)
+    return SDValue();
+
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
+
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
+  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+  // Also bail out if the vector CCVT isn't the same size as ResVT.
+  // This can happen if the SETCC operand size doesn't divide the ResVT size
+  // (e.g., f64 vs v3f32).
+  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
+    return SDValue();
+
+  // Make sure we didn't create illegal types, if we're not supposed to.
+  assert(DCI.isBeforeLegalize() ||
+         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
+
+  // First perform a vector comparison, where lane 0 is the one we're interested
+  // in.
+  SDLoc DL(N0);
+  SDValue LHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+  SDValue RHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+  // Now duplicate the comparison mask we want across all other lanes.
+  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
+
+  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
+}
+
+/// Get rid of unnecessary NVCASTs (that don't change the type).
+static SDValue performNVCASTCombine(SDNode *N) {
+  if (N->getValueType(0) == N->getOperand(0).getValueType())
+    return N->getOperand(0);
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::SUB:
+    return performAddSubLongCombine(N, DCI, DAG);
+  case ISD::XOR:
+    return performXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMulCombine(N, DAG, DCI, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performIntToFpCombine(N, DAG, Subtarget);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return performFpToIntCombine(N, DAG, DCI, Subtarget);
+  case ISD::FDIV:
+    return performFDivCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DCI, Subtarget);
+  case ISD::SRL:
+    return performSRLCombine(N, DCI);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return performIntrinsicCombine(N, DCI, Subtarget);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+    return performExtendCombine(N, DCI, DAG);
+  case ISD::BITCAST:
+    return performBitcastCombine(N, DCI, DAG);
+  case ISD::CONCAT_VECTORS:
+    return performConcatVectorsCombine(N, DCI, DAG);
+  case ISD::SELECT: {
+    SDValue RV = performSelectCombine(N, DCI);
+    if (!RV.getNode())
+      RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
+    return RV;
+  }
+  case ISD::VSELECT:
+    return performVSelectCombine(N, DCI.DAG);
+  case ISD::LOAD:
+    if (performTBISimplification(N->getOperand(1), DCI, DAG))
+      return SDValue(N, 0);
+    break;
+  case ISD::STORE:
+    return performSTORECombine(N, DCI, DAG, Subtarget);
+  case AArch64ISD::BRCOND:
+    return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::TBNZ:
+  case AArch64ISD::TBZ:
+    return performTBZCombine(N, DCI, DAG);
+  case AArch64ISD::CSEL:
+    return performCONDCombine(N, DCI, DAG, 2, 3);
+  case AArch64ISD::DUP:
+    return performPostLD1Combine(N, DCI, false);
+  case AArch64ISD::NVCAST:
+    return performNVCASTCombine(N);
+  case ISD::INSERT_VECTOR_ELT:
+    return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::aarch64_neon_ld2:
+    case Intrinsic::aarch64_neon_ld3:
+    case Intrinsic::aarch64_neon_ld4:
+    case Intrinsic::aarch64_neon_ld1x2:
+    case Intrinsic::aarch64_neon_ld1x3:
+    case Intrinsic::aarch64_neon_ld1x4:
+    case Intrinsic::aarch64_neon_ld2lane:
+    case Intrinsic::aarch64_neon_ld3lane:
+    case Intrinsic::aarch64_neon_ld4lane:
+    case Intrinsic::aarch64_neon_ld2r:
+    case Intrinsic::aarch64_neon_ld3r:
+    case Intrinsic::aarch64_neon_ld4r:
+    case Intrinsic::aarch64_neon_st2:
+    case Intrinsic::aarch64_neon_st3:
+    case Intrinsic::aarch64_neon_st4:
+    case Intrinsic::aarch64_neon_st1x2:
+    case Intrinsic::aarch64_neon_st1x3:
+    case Intrinsic::aarch64_neon_st1x4:
+    case Intrinsic::aarch64_neon_st2lane:
+    case Intrinsic::aarch64_neon_st3lane:
+    case Intrinsic::aarch64_neon_st4lane:
+      return performNEONPostLDSTCombine(N, DCI, DAG);
+    default:
+      break;
+    }
+  }
+  return SDValue();
+}
+
+// Check if the return value is used as only a return value, as otherwise
+// we can't perform a tail-call. In particular, we need to check for
+// target ISD nodes that are returns and any other "odd" constructs
+// that the generic analysis code won't necessarily catch.
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
+                                               SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+        MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode *Node : Copy->uses()) {
+    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+// Return whether the an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  return CI->isTailCall();
+}
+
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   bool &IsInc,
+                                                   SelectionDAG &DAG) const {
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Op->getOperand(0);
+  // All of the indexed addressing mode instructions take a signed
+  // 9 bit immediate offset.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int64_t RHSC = RHS->getSExtValue();
+    if (Op->getOpcode() == ISD::SUB)
+      RHSC = -(uint64_t)RHSC;
+    if (!isInt<9>(RHSC))
+      return false;
+    IsInc = (Op->getOpcode() == ISD::ADD);
+    Offset = Op->getOperand(1);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+    return false;
+  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool IsInc;
+  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+    return false;
+  // Post-indexing updates the base, so it's not a valid transform
+  // if that's not the same as the load's pointer.
+  if (Ptr != Base)
+    return false;
+  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+
+  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+    return;
+
+  Op = SDValue(
+      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
+                         DAG.getUNDEF(MVT::i32), Op,
+                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
+      0);
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
+  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
+}
+
+static void ReplaceReductionResults(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG, unsigned InterOp,
+                                    unsigned AcrossOp) {
+  EVT LoVT, HiVT;
+  SDValue Lo, Hi;
+  SDLoc dl(N);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
+  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
+  Results.push_back(SplitVal);
+}
+
+static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
+  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
+                           DAG.getNode(ISD::SRL, DL, MVT::i128, N,
+                                       DAG.getConstant(64, DL, MVT::i64)));
+  return std::make_pair(Lo, Hi);
+}
+
+static void ReplaceCMP_SWAP_128Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> & Results,
+                                       SelectionDAG &DAG) {
+  assert(N->getValueType(0) == MVT::i128 &&
+         "AtomicCmpSwap on types less than 128 should be legal");
+  auto Desired = splitInt128(N->getOperand(2), DAG);
+  auto New = splitInt128(N->getOperand(3), DAG);
+  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
+                   New.first,        New.second,    N->getOperand(0)};
+  SDNode *CmpSwap = DAG.getMachineNode(
+      AArch64::CMP_SWAP_128, SDLoc(N),
+      DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  Results.push_back(SDValue(CmpSwap, 0));
+  Results.push_back(SDValue(CmpSwap, 1));
+  Results.push_back(SDValue(CmpSwap, 3));
+}
+
+void AArch64TargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this");
+  case ISD::BITCAST:
+    ReplaceBITCASTResults(N, Results, DAG);
+    return;
+  case AArch64ISD::SADDV:
+    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
+    return;
+  case AArch64ISD::UADDV:
+    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
+    return;
+  case AArch64ISD::SMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
+    return;
+  case AArch64ISD::UMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
+    return;
+  case AArch64ISD::SMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
+    return;
+  case AArch64ISD::UMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
+    return;
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+    // Let normal code take care of it by not adding anything to Results.
+    return;
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceCMP_SWAP_128Results(N, Results, DAG);
+    return;
+  }
+}
+
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+  if (!Subtarget->isTargetAndroid())
+    return true;
+  return TargetLowering::useLoadStackGuardNode();
+}
+
+unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are three or more FDIVs.
+  return 3;
+}
+
+TargetLoweringBase::LegalizeTypeAction
+AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
+  MVT SVT = VT.getSimpleVT();
+  // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
+  // v4i16, v2i32 instead of to promote.
+  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
+      || SVT == MVT::v1f32)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+// Loads and stores less than 128-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+  return Size == 128;
+}
+
+// Loads and stores less than 128-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+TargetLowering::AtomicExpansionKind
+AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+}
+
+// For the real atomic operations, we have ldxr/stxr up to 128 bits,
+TargetLowering::AtomicExpansionKind
+AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+}
+
+bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
+    AtomicCmpXchgInst *AI) const {
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement cmpxchg without spilling. If the address being exchanged is also
+  // on the stack and close enough to the spill slot, this can lead to a
+  // situation where the monitor always gets cleared and the atomic operation
+  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  return getTargetMachine().getOptLevel() != 0;
+}
+
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                             AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire = isAcquireOrStronger(Ord);
+
+  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i64, i64} and we have to recombine them into a
+  // single i128 here.
+  if (ValTy->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
+    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int =
+      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
+  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldxr, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+    IRBuilder<> &Builder) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Builder.CreateCall(
+      llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+}
+
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+                                                   Value *Val, Value *Addr,
+                                                   AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease = isReleaseOrStronger(Ord);
+
+  // Since the intrinsics must have legal type, the i128 intrinsics take two
+  // parameters: "i64, i64". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
+    Function *Stxr = Intrinsic::getDeclaration(M, Int);
+    Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
+  }
+
+  Intrinsic::ID Int =
+      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
+  Type *Tys[] = { Addr->getType() };
+  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateCall(Stxr,
+                            {Builder.CreateZExtOrBitCast(
+                                 Val, Stxr->getFunctionType()->getParamType(0)),
+                             Addr});
+}
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  return Ty->isArrayTy();
+}
+
+bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+                                                            EVT) const {
+  return false;
+}
+
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  if (!Subtarget->isTargetAndroid())
+    return TargetLowering::getIRStackGuard(IRB);
+
+  // Android provides a fixed TLS slot for the stack cookie. See the definition
+  // of TLS_SLOT_STACK_GUARD in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  const unsigned TlsOffset = 0x28;
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  Function *ThreadPointerFunc =
+      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+  return IRB.CreatePointerCast(
+      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (!Subtarget->isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  const unsigned TlsOffset = 0x48;
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  Function *ThreadPointerFunc =
+      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+  return IRB.CreatePointerCast(
+      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
+void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  // Update IsSplitCSR in AArch64unctionInfo.
+  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}
+
+void AArch64TargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (AArch64::GPR64RegClass.contains(*I))
+      RC = &AArch64::GPR64RegClass;
+    else if (AArch64::FPR64RegClass.contains(*I))
+      RC = &AArch64::FPR64RegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
+  }
+}
+
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  // Integer division on AArch64 is expensive. However, when aggressively
+  // optimizing for code size, we prefer to use a div instruction, as it is
+  // usually smaller than the alternative sequence.
+  // The exception to this is vector division. Since AArch64 doesn't have vector
+  // integer division, leaving the division as-is is a loss even in terms of
+  // size, because it will have to be scalarized, while the alternative code
+  // sequence can be performed in vector form.
+  bool OptSize =
+      Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+  return OptSize && !VT.isVector();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
new file mode 100644
index 000000000000..054ccc31674f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -0,0 +1,607 @@
+//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+
+#include "AArch64.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace AArch64ISD {
+
+enum NodeType : unsigned {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+  CALL,         // Function call.
+
+  // Produces the full sequence of instructions for getting the thread pointer
+  // offset of a variable into X0, using the TLSDesc model.
+  TLSDESC_CALLSEQ,
+  ADRP,     // Page address of a TargetGlobalAddress operand.
+  ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
+  LOADgot,  // Load from automatically generated descriptor (e.g. Global
+            // Offset Table, TLS record).
+  RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+  BRCOND,   // Conditional branch instruction; "b.cond".
+  CSEL,
+  FCSEL, // Conditional move instruction.
+  CSINV, // Conditional select invert.
+  CSNEG, // Conditional select negate.
+  CSINC, // Conditional select increment.
+
+  // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+  // ELF.
+  THREAD_POINTER,
+  ADC,
+  SBC, // adc, sbc instructions
+
+  // Arithmetic instructions which write flags.
+  ADDS,
+  SUBS,
+  ADCS,
+  SBCS,
+  ANDS,
+
+  // Conditional compares. Operands: left,right,falsecc,cc,flags
+  CCMP,
+  CCMN,
+  FCCMP,
+
+  // Floating point comparison
+  FCMP,
+
+  // Scalar extract
+  EXTR,
+
+  // Scalar-to-vector duplication
+  DUP,
+  DUPLANE8,
+  DUPLANE16,
+  DUPLANE32,
+  DUPLANE64,
+
+  // Vector immedate moves
+  MOVI,
+  MOVIshift,
+  MOVIedit,
+  MOVImsl,
+  FMOV,
+  MVNIshift,
+  MVNImsl,
+
+  // Vector immediate ops
+  BICi,
+  ORRi,
+
+  // Vector bit select: similar to ISD::VSELECT but not all bits within an
+  // element must be identical.
+  BSL,
+
+  // Vector arithmetic negation
+  NEG,
+
+  // Vector shuffles
+  ZIP1,
+  ZIP2,
+  UZP1,
+  UZP2,
+  TRN1,
+  TRN2,
+  REV16,
+  REV32,
+  REV64,
+  EXT,
+
+  // Vector shift by scalar
+  VSHL,
+  VLSHR,
+  VASHR,
+
+  // Vector shift by scalar (again)
+  SQSHL_I,
+  UQSHL_I,
+  SQSHLU_I,
+  SRSHR_I,
+  URSHR_I,
+
+  // Vector comparisons
+  CMEQ,
+  CMGE,
+  CMGT,
+  CMHI,
+  CMHS,
+  FCMEQ,
+  FCMGE,
+  FCMGT,
+
+  // Vector zero comparisons
+  CMEQz,
+  CMGEz,
+  CMGTz,
+  CMLEz,
+  CMLTz,
+  FCMEQz,
+  FCMGEz,
+  FCMGTz,
+  FCMLEz,
+  FCMLTz,
+
+  // Vector across-lanes addition
+  // Only the lower result lane is defined.
+  SADDV,
+  UADDV,
+
+  // Vector across-lanes min/max
+  // Only the lower result lane is defined.
+  SMINV,
+  UMINV,
+  SMAXV,
+  UMAXV,
+
+  // Vector bitwise negation
+  NOT,
+
+  // Vector bitwise selection
+  BIT,
+
+  // Compare-and-branch
+  CBZ,
+  CBNZ,
+  TBZ,
+  TBNZ,
+
+  // Tail calls
+  TC_RETURN,
+
+  // Custom prefetch handling
+  PREFETCH,
+
+  // {s|u}int to FP within a FP register.
+  SITOF,
+  UITOF,
+
+  /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
+  /// world w.r.t vectors; which causes additional REV instructions to be
+  /// generated to compensate for the byte-swapping. But sometimes we do
+  /// need to re-interpret the data in SIMD vector registers in big-endian
+  /// mode without emitting such REV instructions.
+  NVCAST,
+
+  SMULL,
+  UMULL,
+
+  // Reciprocal estimates and steps.
+  FRECPE, FRECPS,
+  FRSQRTE, FRSQRTS,
+
+  // NEON Load/Store with post-increment base updates
+  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LD3post,
+  LD4post,
+  ST2post,
+  ST3post,
+  ST4post,
+  LD1x2post,
+  LD1x3post,
+  LD1x4post,
+  ST1x2post,
+  ST1x3post,
+  ST1x4post,
+  LD1DUPpost,
+  LD2DUPpost,
+  LD3DUPpost,
+  LD4DUPpost,
+  LD1LANEpost,
+  LD2LANEpost,
+  LD3LANEpost,
+  LD4LANEpost,
+  ST2LANEpost,
+  ST3LANEpost,
+  ST4LANEpost
+};
+
+} // end namespace AArch64ISD
+
+namespace {
+
+// Any instruction that defines a 32-bit result zeros out the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+static inline bool isDef32(const SDNode &N) {
+  unsigned Opc = N.getOpcode();
+  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+         Opc != ISD::CopyFromReg;
+}
+
+} // end anonymous namespace
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64TargetLowering : public TargetLowering {
+public:
+  explicit AArch64TargetLowering(const TargetMachine &TM,
+                                 const AArch64Subtarget &STI);
+
+  /// Selects the correct CCAssignFn for a given CallingConvention value.
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+
+  /// Selects the correct CCAssignFn for a given CallingConvention value.
+  CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
+
+  /// Determine which of the bits specified in Mask are known to be either zero
+  /// or one and return them in the KnownZero/KnownOne bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                     APInt &KnownOne, const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
+
+  /// Returns true if the target allows unaligned memory accesses of the
+  /// specified type.
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+                                      unsigned Align = 1,
+                                      bool *Fast = nullptr) const override;
+
+  /// Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  /// Returns true if a cast between SrcAS and DestAS is a noop.
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+    // Addrspacecasts are always noops.
+    return true;
+  }
+
+  /// This method returns a target specific FastISel object, or null if the
+  /// target does not support "fast" ISel.
+  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                           const TargetLibraryInfo *libInfo) const override;
+
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+  /// Return true if the given shuffle mask can be codegen'd directly, or if it
+  /// should be stack expanded.
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+
+  /// Return the ISD::SETCC ValueType.
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+
+  SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+  MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
+                                  MachineBasicBlock *BB) const;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *MBB) const override;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const override;
+
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+  bool isProfitableToHoist(Instruction *I) const override;
+
+  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+  bool isZExtFree(EVT VT1, EVT VT2) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+  bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+
+  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+  bool lowerInterleavedLoad(LoadInst *LI,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices,
+                            unsigned Factor) const override;
+  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                             unsigned Factor) const override;
+
+  bool isLegalAddImmediate(int64_t) const override;
+  bool isLegalICmpImmediate(int64_t) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                          MachineFunction &MF) const override;
+
+  /// Return true if the addressing mode represented by AM is legal for this
+  /// target, for a load/store of the specified type.
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+  /// \brief Return the cost of the scaling factor used in the addressing
+  /// mode represented by AM for this target, for a load/store
+  /// of the specified type.
+  /// If the AM is supported, the return value must be >= 0.
+  /// If the AM is not supported, it returns a negative value.
+  int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                           unsigned AS) const override;
+
+  /// Return true if an FMA operation is faster than a pair of fmul and fadd
+  /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
+  /// returns true, otherwise fmuladd is expanded to fmul + fadd.
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+  /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+  bool isDesirableToCommuteWithShift(const SDNode *N) const override;
+
+  /// \brief Returns true if it is beneficial to convert a load of a constant
+  /// to just the constant itself.
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override;
+
+  Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                        AtomicOrdering Ord) const override;
+  Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                              Value *Addr, AtomicOrdering Ord) const override;
+
+  void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+  bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+  bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+  bool useLoadStackGuardNode() const override;
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  /// If the target has a standard location for the stack protector cookie,
+  /// returns the address of that location. Otherwise, returns nullptr.
+  Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+  /// If the target has a standard location for the unsafe stack pointer,
+  /// returns the address of that location. Otherwise, returns nullptr.
+  Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+  /// If a physical register, this returns the register that receives the
+  /// exception address on entry to an EH pad.
+  unsigned
+  getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+    // FIXME: This is a guess. Has this been defined yet?
+    return AArch64::X0;
+  }
+
+  /// If a physical register, this returns the register that receives the
+  /// exception typeid on entry to a landing pad.
+  unsigned
+  getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+    // FIXME: This is a guess. Has this been defined yet?
+    return AArch64::X1;
+  }
+
+  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+  bool isCheapToSpeculateCttz() const override {
+    return true;
+  }
+
+  bool isCheapToSpeculateCtlz() const override {
+    return true;
+  }
+
+  bool hasAndNotCompare(SDValue) const override {
+    // 'bics'
+    return true;
+  }
+
+  bool hasBitPreservingFPLogic(EVT VT) const override {
+    // FIXME: Is this always true? It should be true for vectors at least.
+    return VT == MVT::f32 || VT == MVT::f64;
+  }
+
+  bool supportSplitCSR(MachineFunction *MF) const override {
+    return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+           MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+  }
+  void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+  void insertCopiesSplitCSR(
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+  bool supportSwiftError() const override {
+    return true;
+  }
+
+private:
+  bool isExtFreeImpl(const Instruction *Ext) const override;
+
+  /// Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+  void addDRTypeForNEON(MVT VT);
+  void addQRTypeForNEON(MVT VT);
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool isVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          const SDLoc &DL, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+                          SDValue ThisVal) const;
+
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
+  bool isEligibleForTailCallOptimization(
+      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+      const SmallVectorImpl<ISD::OutputArg> &Outs,
+      const SmallVectorImpl<SDValue> &OutVals,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo &MFI, int ClobberedFI) const;
+
+  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+  void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
+                           SDValue &Chain) const;
+
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
+                                 SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
+                         SDValue TVal, SDValue FVal, const SDLoc &dl,
+                         SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                        RTLIB::Libcall Call) const;
+  SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                        std::vector<SDNode *> *Created) const override;
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                          int &ExtraSteps, bool &UseOneConst,
+                          bool Reciprocal) const override;
+  SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &ExtraSteps) const override;
+  unsigned combineRepeatedFPDivisors() const override;
+
+  ConstraintType getConstraintType(StringRef Constraint) const override;
+  unsigned getRegisterByName(const char* RegName, EVT VT,
+                             SelectionDAG &DAG) const override;
+
+  /// Examine constraint string and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                 const char *constraint) const override;
+
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+
+  const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+    if (ConstraintCode == "Q")
+      return InlineAsm::Constraint_Q;
+    // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are
+    //        followed by llvm_unreachable so we'll leave them unimplemented in
+    //        the backend for now.
+    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+  }
+
+  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+  bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+  bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+                              ISD::MemIndexedMode &AM, bool &IsInc,
+                              SelectionDAG &DAG) const;
+  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                 ISD::MemIndexedMode &AM,
+                                 SelectionDAG &DAG) const override;
+  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                  SDValue &Offset, ISD::MemIndexedMode &AM,
+                                  SelectionDAG &DAG) const override;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+                                                 CallingConv::ID CallConv,
+                                                 bool isVarArg) const override;
+
+  bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
+};
+
+namespace AArch64 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo);
+} // end namespace AArch64
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
new file mode 100644
index 000000000000..867074c3c374
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -0,0 +1,404 @@
+//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return isAcquireOrStronger(Ordering);
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return !isAcquireOrStronger(Ordering);
+}]>;
+
+// 8-bit loads
+def : Pat<(acquiring_load<atomic_load_8>  GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                                     ro_Wextend8:$offset)),
+          (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend8:$offset)),
+          (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend16:$extend)),
+          (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend16:$extend)),
+          (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_load<atomic_load_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend32:$extend)),
+          (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend32:$extend)),
+          (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)),
+          (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_load<atomic_load_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+          (LDURWi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend64:$extend)),
+          (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend64:$extend)),
+          (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset)),
+          (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (LDURXi GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  assert(Ordering != AtomicOrdering::AcquireRelease &&
+         "unexpected store ordering");
+  return isReleaseOrStronger(Ordering);
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on AArch64.
+class relaxed_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return !isReleaseOrStronger(Ordering);
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+          (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+               GPR32:$val),
+          (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+               GPR32:$val),
+          (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val),
+          (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+          (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16>
+              (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val),
+          (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_store<atomic_store_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+          (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend32:$extend),
+                                          GPR32:$val),
+          (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend32:$extend),
+                                          GPR32:$val),
+          (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+              (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val),
+          (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit stores
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+          (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR64:$val),
+          (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR64:$val),
+          (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+              (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val),
+          (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
+          (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+
+// Load-exclusives.
+
+def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldaxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr),
+          (STXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+// Store-release-exclusives.
+
+def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr),
+          (STLXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STLXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STLXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STLXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDXR and an
+// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $scratch",
+    mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+                        (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+                 Sched<[WriteAtomic]>;
+
+def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+                         (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+                  Sched<[WriteAtomic]>;
+
+def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$scratch),
+                         (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>,
+                  Sched<[WriteAtomic]>;
+
+def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
+                         (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>,
+                  Sched<[WriteAtomic]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
+    mayLoad = 1, mayStore = 1 in
+def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
+                          (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
+                               GPR64:$newLo, GPR64:$newHi), []>,
+                   Sched<[WriteAtomic]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
new file mode 100644
index 000000000000..cefdf51b50d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -0,0 +1,9516 @@
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe AArch64 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def NormalFrm   : Format<1>; // Do we need any others?
+
+// AArch64 Instruction Format
+class AArch64Inst<Format f, string cstr> : Instruction {
+  field bits<32> Inst; // Instruction encoding.
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+  let Namespace   = "AArch64";
+  Format F        = f;
+  bits<2> Form    = F.Value;
+  let Pattern     = [];
+  let Constraints = cstr;
+}
+
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+    : AArch64Inst<PseudoFrm, cstr> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let Pattern        = pattern;
+  let isCodeGenOnly  = 1;
+}
+
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
+  let Pattern = pattern;
+  let Size = 4;
+}
+
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+        list<dag> pattern>
+    : EncodedI<cstr, pattern> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let AsmString      = !strconcat(asm, operands);
+}
+
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res>  : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+   UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+   UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+   UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+   UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
+
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+  let Name = "Shifter";
+}
+
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm32Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm32Shift";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm64Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm64Shift";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+class ArithmeticShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "ArithmeticShifter" # width;
+  let PredicateMethod = "isArithmeticShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
+def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
+
+// Shifter operand for logical register shifted encodings.
+class LogicalShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalShifter" # width;
+  let PredicateMethod = "isLogicalShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
+}
+
+def LogicalShifterOperand32 : LogicalShifterOperand<32>;
+def LogicalShifterOperand64 : LogicalShifterOperand<64>;
+
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalVecShifter";
+  let RenderMethod = "addShifterOperands";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+  let SuperClasses = [LogicalVecShifterOperand];
+  let Name = "LogicalVecHalfWordShifter";
+  let RenderMethod = "addShifterOperands";
+}
+
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MoveVecShifter";
+  let RenderMethod = "addShifterOperands";
+}
+
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass {
+  let Name = "Extend";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+def ExtendOperand64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "Extend64";
+  let DiagnosticType = "AddSubRegExtendSmall";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "ExtendLSL64";
+  let RenderMethod = "addExtend64Operands";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "tryParseFPImm";
+  let DiagnosticType = "InvalidFPImm";
+}
+
+def CondCode : AsmOperandClass {
+  let Name = "CondCode";
+  let DiagnosticType = "InvalidCondCode";
+}
+
+// A 32-bit register pasrsed as 64-bit
+def GPR32as64Operand : AsmOperandClass {
+  let Name = "GPR32as64";
+}
+def GPR32as64 : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32as64Operand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+  let Name = "AdrpLabel";
+  let ParserMethod = "tryParseAdrpLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrplabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let PrintMethod = "printAdrpLabel";
+  let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+  let Name = "AdrLabel";
+  let ParserMethod = "tryParseAdrLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrlabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+  let Name = "SImm9";
+  let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+  let ParserMatchClass = SImm9Operand;
+}
+
+// simm7sN predicate - True if the immediate is a multiple of N in the range
+// [-64 * N, 63 * N].
+class SImm7Scaled<int Scale> : AsmOperandClass {
+  let Name = "SImm7s" # Scale;
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
+}
+
+def SImm7s4Operand : SImm7Scaled<4>;
+def SImm7s8Operand : SImm7Scaled<8>;
+def SImm7s16Operand : SImm7Scaled<16>;
+
+def simm7s4 : Operand<i32> {
+  let ParserMatchClass = SImm7s4Operand;
+  let PrintMethod = "printImmScale<4>";
+}
+
+def simm7s8 : Operand<i32> {
+  let ParserMatchClass = SImm7s8Operand;
+  let PrintMethod = "printImmScale<8>";
+}
+
+def simm7s16 : Operand<i32> {
+  let ParserMatchClass = SImm7s16Operand;
+  let PrintMethod = "printImmScale<16>";
+}
+
+def am_indexed7s8   : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
+def am_indexed7s16  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
+def am_indexed7s32  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
+def am_indexed7s64  : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>;
+def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+  let Name = "Imm" # Low # "_" # High;
+  let DiagnosticType = "InvalidImm" # Low # "_" # High;
+}
+
+def Imm1_8Operand : AsmImmRange<1, 8>;
+def Imm1_16Operand : AsmImmRange<1, 16>;
+def Imm1_32Operand : AsmImmRange<1, 32>;
+def Imm1_64Operand : AsmImmRange<1, 64>;
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG3AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+class fixedpoint_i32<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm32";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+class fixedpoint_i64<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm64";
+  let ParserMatchClass = Imm1_64Operand;
+}
+
+def fixedpoint_f16_i32 : fixedpoint_i32<f16>;
+def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
+def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+
+def fixedpoint_f16_i64 : fixedpoint_i64<f16>;
+def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
+def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR8OpValue";
+  let DecoderMethod = "DecodeVecShiftR8Imm";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16Imm";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32Imm";
+  let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64Imm";
+  let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_1Operand : AsmImmRange<0, 1>;
+def Imm0_7Operand : AsmImmRange<0, 7>;
+def Imm0_15Operand : AsmImmRange<0, 15>;
+def Imm0_31Operand : AsmImmRange<0, 31>;
+def Imm0_63Operand : AsmImmRange<0, 63>;
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 8);
+}]> {
+  let EncoderMethod = "getVecShiftL8OpValue";
+  let DecoderMethod = "DecodeVecShiftL8Imm";
+  let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 16);
+}]> {
+  let EncoderMethod = "getVecShiftL16OpValue";
+  let DecoderMethod = "DecodeVecShiftL16Imm";
+  let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let EncoderMethod = "getVecShiftL32OpValue";
+  let DecoderMethod = "DecodeVecShiftL32Imm";
+  let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 64);
+}]> {
+  let EncoderMethod = "getVecShiftL64OpValue";
+  let DecoderMethod = "DecodeVecShiftL64Imm";
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+}]>;
+
+let DiagnosticType = "LogicalSecondSource" in {
+  def LogicalImm32Operand : AsmOperandClass {
+    let Name = "LogicalImm32";
+  }
+  def LogicalImm64Operand : AsmOperandClass {
+    let Name = "LogicalImm64";
+  }
+  def LogicalImm32NotOperand : AsmOperandClass {
+    let Name = "LogicalImm32Not";
+  }
+  def LogicalImm64NotOperand : AsmOperandClass {
+    let Name = "LogicalImm64Not";
+  }
+}
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
+}], logical_imm32_XFORM> {
+  let PrintMethod = "printLogicalImm32";
+  let ParserMatchClass = LogicalImm32Operand;
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64);
+}], logical_imm64_XFORM> {
+  let PrintMethod = "printLogicalImm64";
+  let ParserMatchClass = LogicalImm64Operand;
+}
+def logical_imm32_not : Operand<i32> {
+  let ParserMatchClass = LogicalImm32NotOperand;
+}
+def logical_imm64_not : Operand<i64> {
+  let ParserMatchClass = LogicalImm64NotOperand;
+}
+
+// imm0_65535 predicate - True if the immediate is in the range [0,65535].
+def Imm0_65535Operand : AsmImmRange<0, 65535>;
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let PrintMethod = "printImmHex";
+}
+
+// imm0_255 predicate - True if the immediate is in the range [0,255].
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 256;
+}]> {
+  let ParserMatchClass = Imm0_255Operand;
+  let PrintMethod = "printImm";
+}
+
+// imm0_127 predicate - True if the immediate is in the range [0,127]
+def Imm0_127Operand : AsmImmRange<0, 127>;
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 128;
+}]> {
+  let ParserMatchClass = Imm0_127Operand;
+  let PrintMethod = "printImm";
+}
+
+// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
+// for all shift-amounts.
+
+// imm0_63 predicate - True if the immediate is in the range [0,63]
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 64;
+}]> {
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 predicate - True if the immediate is in the range [0,31]
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 32;
+}]> {
+  let ParserMatchClass = Imm0_31Operand;
+}
+
+// True if the 32-bit immediate is in the range [0,31]
+def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 32;
+}]> {
+  let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_1 predicate - True if the immediate is in the range [0,1]
+def imm0_1 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = Imm0_1Operand;
+}
+
+// imm0_15 predicate - True if the immediate is in the range [0,15]
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 predicate - True if the immediate is in the range [0,7]
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = Imm0_7Operand;
+}
+
+// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+//  {5-0} - imm6
+class arith_shift<ValueType Ty, int width> : Operand<Ty> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "ArithmeticShifterOperand" # width);
+}
+
+def arith_shift32 : arith_shift<i32, 32>;
+def arith_shift64 : arith_shift<i64, 64>;
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+//  {5-0} - imm6
+class logical_shift<int width> : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "LogicalShifterOperand" # width);
+}
+
+def logical_shift32 : logical_shift<32>;
+def logical_shift64 : logical_shift<64>;
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, shiftop);
+}
+
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+
+// A logical vector shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+//  {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getMoveVecShifterOpValue";
+  let ParserMatchClass = MoveVecShifterOperand;
+}
+
+let DiagnosticType = "AddSubSecondSource" in {
+  def AddSubImmOperand : AsmOperandClass {
+    let Name = "AddSubImm";
+    let ParserMethod = "tryParseAddSubImm";
+  }
+  def AddSubImmNegOperand : AsmOperandClass {
+    let Name = "AddSubImmNeg";
+    let ParserMethod = "tryParseAddSubImm";
+  }
+}
+// An ADD/SUB immediate shifter operand:
+//  second operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #12
+class addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+class addsub_shifted_imm_neg<ValueType Ty>
+    : Operand<Ty> {
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmNegOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
+def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand:
+//  {5-3} - extend type
+//  {2-0} - imm3
+def arith_extend : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediate.
+def fpimm16 : Operand<f16>,
+              PatLeaf<(f16 fpimm), [{
+      return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP16Imm(InVal);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+def fpimm32 : Operand<f32>,
+              PatLeaf<(f32 fpimm), [{
+      return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+def fpimm64 : Operand<f64>,
+              PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm8 : Operand<i32> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// Vector lane operands
+class AsmVectorIndex<string Suffix> : AsmOperandClass {
+  let Name = "VectorIndex" # Suffix;
+  let DiagnosticType = "InvalidIndex" # Suffix;
+}
+def VectorIndex1Operand : AsmVectorIndex<"1">;
+def VectorIndexBOperand : AsmVectorIndex<"B">;
+def VectorIndexHOperand : AsmVectorIndex<"H">;
+def VectorIndexSOperand : AsmVectorIndex<"S">;
+def VectorIndexDOperand : AsmVectorIndex<"D">;
+
+def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) == 1;
+}]> {
+  let ParserMatchClass = VectorIndex1Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = VectorIndexBOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndexHOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndexSOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndexDOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def simdimmtype10 : Operand<i32>,
+                    PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+                                               .bitcastToAPInt()
+                                               .getZExtValue());
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+                                                           .bitcastToAPInt()
+                                                           .getZExtValue());
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let ParserMatchClass = SIMDImmType10Operand;
+  let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+                  list<dag> pattern = []>
+    : I<oops, iops, asm, operands, "", pattern> {
+  let Inst{31-22} = 0b1101010100;
+  let Inst{21}    = L;
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+                    list<dag> pattern = []>
+    : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
+  let Inst{4-0} = 0b11111;
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+    : BaseSystemI<L, oops, iops, asm, operands>,
+      Sched<[WriteSys]> {
+  bits<5> Rt;
+  let Inst{4-0} = Rt;
+}
+
+// Hint instructions that take both a CRm and a 3-bit immediate.
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+  class HintI<string mnemonic>
+      : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "",
+                      [(int_aarch64_hint imm0_127:$imm)]>,
+        Sched<[WriteHint]> {
+    bits <7> imm;
+    let Inst{20-12} = 0b000110010;
+    let Inst{11-5} = imm;
+  }
+
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+  let Name = "Barrier";
+  let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+  let PrintMethod = "printBarrierOption";
+  let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
+                 list<dag> pattern = []>
+    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
+      Sched<[WriteBarrier]> {
+  bits<4> CRm;
+  let Inst{20-12} = 0b000110011;
+  let Inst{11-8} = CRm;
+  let Inst{7-5} = opc;
+}
+
+// MRS/MSR system instructions. These have different operand classes because
+// a different subset of registers can be accessed through each instruction.
+def MRSSystemRegisterOperand : AsmOperandClass {
+  let Name = "MRSSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MRS";
+}
+// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
+def mrs_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MRSSystemRegisterOperand;
+  let DecoderMethod = "DecodeMRSSystemRegister";
+  let PrintMethod = "printMRSSystemRegister";
+}
+
+def MSRSystemRegisterOperand : AsmOperandClass {
+  let Name = "MSRSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MSR";
+}
+def msr_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MSRSystemRegisterOperand;
+  let DecoderMethod = "DecodeMSRSystemRegister";
+  let PrintMethod = "printMSRSystemRegister";
+}
+
+def PSBHintOperand : AsmOperandClass {
+  let Name = "PSBHint";
+  let ParserMethod = "tryParsePSBHint";
+}
+def psbhint_op : Operand<i32> {
+  let ParserMatchClass = PSBHintOperand;
+  let PrintMethod = "printPSBHintOp";
+  let MCOperandPredicate = [{
+    // Check, if operand is valid, to fix exhaustive aliasing in disassembly.
+    // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields.
+    if (!MCOp.isImm())
+      return false;
+    return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr;
+  }];
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
+                       "mrs", "\t$Rt, $systemreg"> {
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
+}
+
+// FIXME: Some of these def NZCV, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
+                       "msr", "\t$systemreg, $Rt"> {
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
+}
+
+def SystemPStateFieldWithImm0_15Operand : AsmOperandClass {
+  let Name = "SystemPStateFieldWithImm0_15";
+  let ParserMethod = "tryParseSysReg";
+}
+def pstatefield4_op : Operand<i32> {
+  let ParserMatchClass = SystemPStateFieldWithImm0_15Operand;
+  let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_15
+  : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm),
+                  "msr", "\t$pstatefield, $imm">,
+    Sched<[WriteSys]> {
+  bits<6> pstatefield;
+  bits<4> imm;
+  let Inst{20-19} = 0b00;
+  let Inst{18-16} = pstatefield{5-3};
+  let Inst{15-12} = 0b0100;
+  let Inst{11-8} = imm;
+  let Inst{7-5} = pstatefield{2-0};
+
+  let DecoderMethod = "DecodeSystemPStateInstruction";
+  // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+  // Fail the decoder should attempt to decode the instruction as MSRI.
+  let hasCompleteDecoder = 0;
+}
+
+def SystemPStateFieldWithImm0_1Operand : AsmOperandClass {
+  let Name = "SystemPStateFieldWithImm0_1";
+  let ParserMethod = "tryParseSysReg";
+}
+def pstatefield1_op : Operand<i32> {
+  let ParserMatchClass = SystemPStateFieldWithImm0_1Operand;
+  let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateImm0_1
+  : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm),
+                  "msr", "\t$pstatefield, $imm">,
+    Sched<[WriteSys]> {
+  bits<6> pstatefield;
+  bit imm;
+  let Inst{20-19} = 0b00;
+  let Inst{18-16} = pstatefield{5-3};
+  let Inst{15-9} = 0b0100000;
+  let Inst{8} = imm;
+  let Inst{7-5} = pstatefield{2-0};
+
+  let DecoderMethod = "DecodeSystemPStateInstruction";
+  // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns
+  // Fail the decoder should attempt to decode the instruction as MSRI.
+  let hasCompleteDecoder = 0;
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+  let Name = "SysCR";
+  let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> {
+  let PrintMethod = "printSysCROperand";
+  let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+       asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+class SystemLXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+       asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+
+// Branch (register) instructions:
+//
+//  case opc of
+//    0001 blr
+//    0000 br
+//    0101 dret
+//    0100 eret
+//    0010 ret
+//    otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+                    string operands, list<dag> pattern>
+    : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+  let Inst{31-25} = 0b1101011;
+  let Inst{24-21} = opc;
+  let Inst{20-16} = 0b11111;
+  let Inst{15-10} = 0b000000;
+  let Inst{4-0}   = 0b00000;
+}
+
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+    : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
+  bits<5> Rn;
+  let Inst{9-5} = Rn;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm>
+    : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+  let Inst{9-5} = 0b11111;
+}
+
+//---
+// Conditional branch instruction.
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+  let PrintMethod = "printCondCode";
+  let ParserMatchClass = CondCode;
+}
+def inv_ccode : Operand<i32> {
+  // AL and NV are invalid in the aliases which use inv_ccode
+  let PrintMethod = "printInverseCondCode";
+  let ParserMatchClass = CondCode;
+  let MCOperandPredicate = [{
+    return MCOp.isImm() &&
+           MCOp.getImm() != AArch64CC::AL &&
+           MCOp.getImm() != AArch64CC::NV;
+  }];
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def PCRelLabel19Operand : AsmOperandClass {
+  let Name = "PCRelLabel19";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_brcond : Operand<OtherVT> {
+  let EncoderMethod = "getCondBranchTargetOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
+
+class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
+                     "b", ".$cond\t$target", "",
+                     [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
+                   Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let Uses = [NZCV];
+
+  bits<4> cond;
+  bits<19> target;
+  let Inst{31-24} = 0b01010100;
+  let Inst{23-5} = target;
+  let Inst{4} = 0;
+  let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+         asm, "\t$Rt, $target", "",
+         [(node regtype:$Rt, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<19> target;
+  let Inst{30-25} = 0b011010;
+  let Inst{24}    = op;
+  let Inst{23-5}  = target;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> {
+  def W : BaseCmpBranch<GPR32, op, asm, node> {
+    let Inst{31} = 0;
+  }
+  def X : BaseCmpBranch<GPR64, op, asm, node> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+  let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+  let EncoderMethod = "getTestBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget14Operand;
+}
+
+// AsmOperand classes to emit (or not) special diagnostics
+def TBZImm0_31Operand : AsmOperandClass {
+  let Name = "TBZImm0_31";
+  let PredicateMethod = "isImm0_31";
+  let RenderMethod = "addImm0_31Operands";
+}
+def TBZImm32_63Operand : AsmOperandClass {
+  let Name = "Imm32_63";
+  let DiagnosticType = "InvalidImm0_63";
+}
+
+class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let ParserMatchClass = matcher;
+}
+
+def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
+def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
+
+def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
+}]> {
+  let ParserMatchClass = TBZImm32_63Operand;
+}
+
+class BaseTestBranch<RegisterClass regtype, Operand immtype,
+                     bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
+       asm, "\t$Rt, $bit_off, $target", "",
+       [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<6> bit_off;
+  bits<14> target;
+
+  let Inst{30-25} = 0b011011;
+  let Inst{24}    = op;
+  let Inst{23-19} = bit_off{4-0};
+  let Inst{18-5}  = target;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeTestAndBranch";
+}
+
+multiclass TestBranch<bit op, string asm, SDNode node> {
+  def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
+    let Inst{31} = 0;
+  }
+
+  def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
+    let Inst{31} = 1;
+  }
+
+  // Alias X-reg with 0-31 imm to W-Reg.
+  def : InstAlias<asm # "\t$Rd, $imm, $target",
+                  (!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
+                  tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
+  def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
+            (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
+            tbz_imm0_31_diag:$imm, bb:$target)>;
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+  let Name = "BranchTarget26";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_b_target : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern>
+    : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+  bits<26> addr;
+  let Inst{31}    = op;
+  let Inst{30-26} = 0b00101;
+  let Inst{25-0}  = addr;
+
+  let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+                         SDPatternOperator node>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+      [(set regtype:$Rd, (node regtype:$Rn))]>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+
+  let Inst{30-13} = 0b101101011000000000;
+  let Inst{12-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass OneOperandData<bits<3> opc, string asm,
+                          SDPatternOperator node = null_frag> {
+  def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+    let Inst{31} = 0;
+  }
+
+  def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+    let Inst{31} = 1;
+  }
+}
+
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR32, asm, node> {
+  let Inst{31} = 0;
+}
+
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR64, asm, node> {
+  let Inst{31} = 1;
+}
+
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                          list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{30}    = isSub;
+  let Inst{28-21} = 0b11010000;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                      SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+                              SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
+         (implicit NZCV)]> {
+  let Defs = [NZCV];
+}
+
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
+                       SDNode OpNode, SDNode OpNode_setflags> {
+  def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+    let Inst{31} = 0;
+    let Inst{29} = 0;
+  }
+  def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+    let Inst{29} = 0;
+  }
+
+  // Sets flags.
+  def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 0;
+    let Inst{29} = 1;
+  }
+  def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 1;
+    let Inst{29} = 1;
+  }
+}
+
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
+                     SDPatternOperator OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{30-21} = 0b0011010110;
+  let Inst{20-16} = Rm;
+  let Inst{15-14} = 0b00;
+  let Inst{13-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+              SDPatternOperator OpNode>
+    : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+  let Inst{10}    = isSigned;
+}
+
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
+  def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+           Sched<[WriteID32, ReadID, ReadID]> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+           Sched<[WriteID64, ReadID, ReadID]> {
+    let Inst{31} = 1;
+  }
+}
+
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+                SDPatternOperator OpNode = null_frag>
+  : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+    Sched<[WriteIS, ReadI]> {
+  let Inst{11-10} = shift_type;
+}
+
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+  def Wr : BaseShift<shift_type, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+                                             (EXTRACT_SUBREG i64:$Rm, sub_32))>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+}
+
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
+                       RegisterClass addtype, string asm,
+                       list<dag> pattern>
+  : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+      asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{30-24} = 0b0011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  // MADD/MSUB generation is decided by MachineCombiner.cpp
+  def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+      [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
+      Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+    let Inst{31} = 0;
+  }
+
+  def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+      [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
+      Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
+    let Inst{31} = 1;
+  }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm,
+                   SDNode AccNode, SDNode ExtNode>
+  : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+    [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+                            (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+    Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
+  let Inst{31} = 1;
+}
+
+class MulHi<bits<3> opc, string asm, SDNode OpNode>
+  : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+    Sched<[WriteIM64, ReadIM, ReadIM]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-24} = 0b10011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  let PostEncoderMethod = "fixMulHigh";
+}
+
+class MulAccumWAlias<string asm, Instruction inst>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
+              SDPatternOperator OpNode, string asm>
+  : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+    Sched<[WriteISReg, ReadI, ReadISReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31} = sf;
+  let Inst{30-21} = 0b0011010110;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = 0b010;
+  let Inst{12} = C;
+  let Inst{11-10} = sz;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+  let Predicates = [HasCRC];
+}
+
+//---
+// Address generation.
+//---
+
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+    : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+        pattern>,
+      Sched<[WriteI]> {
+  bits<5>  Xd;
+  bits<21> label;
+  let Inst{31}    = page;
+  let Inst{30-29} = label{1-0};
+  let Inst{28-24} = 0b10000;
+  let Inst{23-5}  = label{20-2};
+  let Inst{4-0}   = Xd;
+
+  let DecoderMethod = "DecodeAdrInstruction";
+}
+
+//---
+// Move immediate.
+//---
+
+def movimm32_imm : Operand<i32> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let EncoderMethod = "getMoveWideImmOpValue";
+  let PrintMethod = "printImm";
+}
+def movimm32_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm32ShifterOperand;
+}
+def movimm64_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm64ShifterOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                        string asm>
+  : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "", []>,
+    Sched<[WriteImm]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass MoveImmediate<bits<2> opc, string asm> {
+  def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                          string asm>
+  : I<(outs regtype:$Rd),
+      (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeMoveImmInstruction";
+}
+
+multiclass InsertImmediate<bits<2> opc, string asm> {
+  def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Add/Subtract
+//---
+
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                    RegisterClass srcRegtype, addsub_shifted_imm immtype,
+                    string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+        asm, "\t$Rd, $Rn, $imm", "",
+        [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+      Sched<[WriteI, ReadI]>  {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<14> imm;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b10001;
+  let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+  let Inst{21-10} = imm{11-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+  let DecoderMethod = "DecodeBaseAddSubImm";
+}
+
+class BaseAddSubRegPseudo<RegisterClass regtype,
+                          SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+                     arith_shifted_reg shifted_regtype, string asm,
+                     SDPatternOperator OpNode>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "",
+        [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = 0;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, Operand src2Regtype,
+                     string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$R1),
+        (ins src1Regtype:$R2, src2Regtype:$R3),
+        asm, "\t$R1, $R2, $R3", "",
+        [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = ext{5-3};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                       RegisterClass src1Regtype, RegisterClass src2Regtype,
+                       Operand ext_op, string asm>
+    : I<(outs dstRegtype:$Rd),
+        (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+        asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = ext{5};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Aliases for register+register add/subtract.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, RegisterClass src2Regtype,
+                     int shiftExt>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+                      shiftExt)>;
+
+multiclass AddSub<bit isSub, string mnemonic, string alias,
+                  SDPatternOperator OpNode = null_frag> {
+  let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  // Add/Subtract immediate
+  // Increase the weight of the immediate variant to try to match it before
+  // the extended register variant.
+  // We used to match the register variant before the immediate when the
+  // register argument could be implicitly zero-extended.
+  let AddedComplexity = 6 in
+  def Wri  : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  let AddedComplexity = 6 in
+  def Xri  : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register - Only used for CodeGen
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1, hasSideEffects = 0 in {
+  def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+                           arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+
+  // add Rd, Rb, -imm -> sub Rd, Rn, imm
+  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
+                      addsub_shifted_imm32_neg:$imm), 0>;
+  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
+                       addsub_shifted_imm64_neg:$imm), 0>;
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when either the destination or
+  // first source register is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
+                   string alias, string cmpAlias> {
+  let isCompare = 1, Defs = [NZCV] in {
+  // Add/Subtract immediate
+  def Wri  : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xri  : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1 in {
+  def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+                           arith_extended_reg32<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
+  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
+                      addsub_shifted_imm32_neg:$imm), 0>;
+  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
+                       addsub_shifted_imm64_neg:$imm), 0>;
+
+  // Compare aliases
+  def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+                  WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
+  def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+                  XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
+  def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+                  WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+                  XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+                  XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
+  def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+  def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+
+  // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
+  def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+                  WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
+  def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+                  XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
+
+  // Compare shorthands
+  def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+  def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+  def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx")
+                  WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>;
+  def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64")
+                  XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>;
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when the first source register
+  // is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                      SDTCisPtrTy<3>]>;
+def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+                     list<dag> patterns>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+      Sched<[WriteExtr, ReadExtrHi]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> imm;
+
+  let Inst{30-23} = 0b00100111;
+  let Inst{21}    = 0;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = imm;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass ExtractImm<string asm> {
+  def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+                      [(set GPR32:$Rd,
+                        (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imm<5> must be zero.
+    let imm{5}   = 0;
+  }
+  def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+                      [(set GPR64:$Rd,
+                        (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+//---
+// Bitfield
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass BitfieldImm<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+                             imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+                     RegisterClass sregtype, Operand imm_type, string asm,
+                     list<dag> pattern>
+    : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $imm", "", pattern>,
+      Sched<[WriteI, ReadI]> {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<13> imm;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100100;
+  let Inst{22}    = imm{12};
+  let Inst{21-16} = imm{11-6};
+  let Inst{15-10} = imm{5-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+                      logical_shifted_reg shifted_regtype, string asm,
+                      list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-24} = 0b01010;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = N;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#"\t$dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
+                      string Alias> {
+  let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+  def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+                           [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+                                               logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+  def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+                           [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+                                               logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
+}
+
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
+                       string Alias> {
+  let isCompare = 1, Defs = [NZCV] in {
+  def Wri  : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  def Xri  : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+  } // end Defs = [NZCV]
+
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
+                      logical_imm32_not:$imm), 0>;
+  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+                  (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
+                       logical_imm64_not:$imm), 0>;
+}
+
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+// Split from LogicalImm as not all instructions have both.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+                      SDPatternOperator OpNode> {
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+  }
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+                            [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+                                                 logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+                            [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+                                                 logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting NZCV Defs
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
+                       SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+            [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+            [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
+                            string mnemonic, SDNode OpNode>
+    : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
+         [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
+                             (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> imm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = imm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
+                            SDNode OpNode>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
+         [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
+                             (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
+  // immediate operand variants
+  def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  // register operand variants
+  def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Conditional select
+//---
+
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+  def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+                       PatFrag frag>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+               (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+  AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+  def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+
+  def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+}
+
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
+def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
+def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
+def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
+def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+
+class UImm12OffsetOperand<int Scale> : AsmOperandClass {
+  let Name = "UImm12Offset" # Scale;
+  let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
+  let PredicateMethod = "isUImm12Offset<" # Scale # ">";
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale;
+}
+
+def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
+def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
+def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
+def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
+def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
+
+class uimm12_scaled<int Scale> : Operand<i64> {
+  let ParserMatchClass
+   = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
+  let EncoderMethod
+   = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
+  let PrintMethod = "printUImm12Offset<" # Scale # ">";
+}
+
+def uimm12s1 : uimm12_scaled<1>;
+def uimm12s2 : uimm12_scaled<2>;
+def uimm12s4 : uimm12_scaled<4>;
+def uimm12s8 : uimm12_scaled<8>;
+def uimm12s16 : uimm12_scaled<16>;
+
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                      string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+
+  bits<5> Rn;
+  bits<12> offset;
+
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b01;
+  let Inst{23-22} = opc;
+  let Inst{21-10} = offset;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
+                           (ins GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteLD]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs),
+                           (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteST]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+def PrefetchOperand : AsmOperandClass {
+  let Name = "Prefetch";
+  let ParserMethod = "tryParsePrefetch";
+}
+def prfop : Operand<i32> {
+  let PrintMethod = "printPrefetchOp";
+  let ParserMatchClass = PrefetchOperand;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+    : BaseLoadStoreUI<sz, V, opc,
+                      (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
+                      asm, pat>,
+      Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+// Load literal address: 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def am_ldrlit : Operand<OtherVT> {
+  let EncoderMethod = "getLoadLiteralOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rt), (ins am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", []>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+    : I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
+}
+
+//---
+// Load/store register offset
+//---
+
+def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
+def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
+def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
+def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
+def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+
+def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
+def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
+def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
+def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
+def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+
+class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
+  let Name = "Mem" # Reg # "Extend" # Width;
+  let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
+  let RenderMethod = "addMemExtendOperands";
+  let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
+}
+
+def MemWExtend8Operand : MemExtendOperand<"W", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemWExtend16Operand : MemExtendOperand<"W", 16>;
+def MemWExtend32Operand : MemExtendOperand<"W", 32>;
+def MemWExtend64Operand : MemExtendOperand<"W", 64>;
+def MemWExtend128Operand : MemExtendOperand<"W", 128>;
+
+def MemXExtend8Operand : MemExtendOperand<"X", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemXExtend16Operand : MemExtendOperand<"X", 16>;
+def MemXExtend32Operand : MemExtendOperand<"X", 32>;
+def MemXExtend64Operand : MemExtendOperand<"X", 64>;
+def MemXExtend128Operand : MemExtendOperand<"X", 128>;
+
+class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
+        : Operand<i32> {
+  let ParserMatchClass = ParserClass;
+  let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
+  let DecoderMethod = "DecodeMemExtend";
+  let EncoderMethod = "getMemExtendOpValue";
+  let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
+}
+
+def ro_Wextend8   : ro_extend<MemWExtend8Operand,   "w", 8>;
+def ro_Wextend16  : ro_extend<MemWExtend16Operand,  "w", 16>;
+def ro_Wextend32  : ro_extend<MemWExtend32Operand,  "w", 32>;
+def ro_Wextend64  : ro_extend<MemWExtend64Operand,  "w", 64>;
+def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
+
+def ro_Xextend8   : ro_extend<MemXExtend8Operand,   "x", 8>;
+def ro_Xextend16  : ro_extend<MemXExtend16Operand,  "x", 16>;
+def ro_Xextend32  : ro_extend<MemXExtend32Operand,  "x", 32>;
+def ro_Xextend64  : ro_extend<MemXExtend64Operand,  "x", 64>;
+def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
+
+class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
+                  Operand wextend, Operand xextend>  {
+  // CodeGen-level pattern covering the entire addressing mode.
+  ComplexPattern Wpat = windex;
+  ComplexPattern Xpat = xindex;
+
+  // Asm-level Operand covering the valid "uxtw #3" style syntax.
+  Operand Wext = wextend;
+  Operand Xext = xextend;
+}
+
+def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
+def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
+def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
+def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
+def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
+                       ro_Xextend128>;
+
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+  : InstAlias<asm # "\t$Rt, [$Rn, $Rm]",
+              (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                        ro_Xextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(set (Ty regtype:$Rt),
+                      (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                               ro_Wextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                               ro_Xextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                          ro_Wextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                          ro_Xextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
+                     string asm, list<dag> pat>
+    : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
+  def roW : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                asm, [(AArch64Prefetch imm:$Rt,
+                                     (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                    ro_Wextend64:$extend))]> {
+    let Inst{13} = 0b0;
+  }
+
+  def roX : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                asm,  [(AArch64Prefetch imm:$Rt,
+                                      (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend64:$extend))]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
+               (!cast<Instruction>(NAME # "roX") prfop:$Rt,
+                                                 GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+}
+
+//---
+// Load/store unscaled immediate
+//---
+
+def am_unscaled8 :  ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                           string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before LoadUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+                               (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                         string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before StoreUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pattern>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
+                            list<dag> pat> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pat>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                                dag oops, dag iops, string asm>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                            RegisterClass regtype, string asm> {
+  let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
+                                    (ins GPR64sp:$Rn, simm9:$offset), asm>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                             RegisterClass regtype, string asm> {
+  let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
+                                 (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                                 asm>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # "\t$Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store pre-indexed
+//---
+
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                     (outs GPR64sp:$wback, regtype:$Rt),
+                     (ins GPR64sp:$Rn, simm9:$offset), asm,
+                     "$Rn = $wback,@earlyclobber $wback", []>,
+      Sched<[WriteLD, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback,@earlyclobber $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+      Sched<[WriteAdr, WriteST]>;
+} // hasSideEffects = 0
+
+//---
+// Load/store post-indexed
+//---
+
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0b0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback, regtype:$Rt),
+                      (ins GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback,@earlyclobber $wback", []>,
+      Sched<[WriteLD, WriteI]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                       asm, "$Rn = $wback,@earlyclobber $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+    Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+} // hasSideEffects = 0
+
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b010;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                          Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 1,
+                                  (outs regtype:$Rt, regtype:$Rt2),
+                                  (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
+                                  (ins regtype:$Rt, regtype:$Rt2,
+                                       GPR64sp:$Rn, indextype:$offset),
+                                  asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+// (pre-indexed)
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b011;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                     Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, indextype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
+                             (ins regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, indextype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (post-indexed)
+
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b001;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, idxtype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                       Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback),
+                             (ins regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, idxtype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+//  (no-allocate)
+
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b000;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 1,
+                                   (outs regtype:$Rt, regtype:$Rt2),
+                                   (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
+
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+                                   (ins regtype:$Rt, regtype:$Rt2,
+                                        GPR64sp:$Rn, indextype:$offset),
+                                   asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+//
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+//
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                             dag oops, dag iops, string asm, string operands>
+    : I<oops, iops, asm, operands, "", []> {
+  let Inst{31-30} = sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23}    = o2;
+  let Inst{22}    = L;
+  let Inst{21}    = o1;
+  let Inst{15}    = o0;
+
+  let DecoderMethod = "DecodeExclusiveLdStInstruction";
+}
+
+// Neither Rs nor Rt2 operands.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                               dag oops, dag iops, string asm, string operands>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
+  bits<5> Rt;
+  bits<5> Rn;
+  let Inst{20-16} = 0b11111;
+  let Unpredictable{20-16} = 0b11111;
+  let Inst{14-10} = 0b11111;
+  let Unpredictable{14-10} = 0b11111;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+// Simple load acquires don't set the exclusive monitor
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                  RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                    RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                       RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs regtype:$Rt, regtype:$Rt2),
+                             (ins GPR64sp0:$Rn), asm,
+                             "\t$Rt, $Rt2, [$Rn]">,
+      Sched<[WriteLD, WriteLDHi]> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+// Simple store release operations do not check the exclusive monitor.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                   RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+                               (ins regtype:$Rt, GPR64sp0:$Rn),
+                               asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteST]>;
+
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                     RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+                             (ins regtype:$Rt, GPR64sp0:$Rn),
+                             asm, "\t$Ws, $Rt, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let Constraints = "@earlyclobber $Ws";
+  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                         RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs GPR32:$Ws),
+                             (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
+                              asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let Constraints = "@earlyclobber $Ws";
+}
+
+//---
+// Exception generation
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+    : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+      Sched<[WriteSys]> {
+  bits<16> imm;
+  let Inst{31-24} = 0b11010100;
+  let Inst{23-21} = op1;
+  let Inst{20-5}  = imm;
+  let Inst{4-2}   = 0b000;
+  let Inst{1-0}   = ll;
+}
+
+let Predicates = [HasFPARMv8] in {
+
+//---
+// Floating point to integer conversion
+//---
+
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 0;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = scale;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
+           SDPatternOperator OpN> {
+  // Unscaled half-precision to 32-bit
+  def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  // Unscaled half-precision to 64-bit
+  def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  // Unscaled single-precision to 32-bit
+  def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled single-precision to 64-bit
+  def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Unscaled double-precision to 32-bit
+  def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled double-precision to 64-bit
+  def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
+                             SDPatternOperator OpN> {
+  // Scaled half-precision to 32-bit
+  def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
+                              fixedpoint_f16_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+                                          fixedpoint_f16_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+    let Predicates = [HasFullFP16];
+  }
+
+  // Scaled half-precision to 64-bit
+  def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
+                              fixedpoint_f16_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+                                          fixedpoint_f16_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  // Scaled single-precision to 32-bit
+  def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+                              fixedpoint_f32_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled single-precision to 64-bit
+  def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+                              fixedpoint_f32_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Scaled double-precision to 32-bit
+  def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+                              fixedpoint_f64_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled double-precision to 64-bit
+  def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+                              fixedpoint_f64_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+//---
+// Integer to floating point conversion
+//---
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-24} = 0b0011110;
+  let Inst{21-17} = 0b00001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = scale;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      ValueType dvt, string asm, SDNode node>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-24} = 0b0011110;
+  let Inst{21-17} = 0b10001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+  // Unscaled
+  def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+  }
+
+  def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+  }
+
+  def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+  }
+
+  def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+  }
+
+  // Scaled
+  def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
+                             [(set FPR16:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f16_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let scale{5} = 1;
+    let Predicates = [HasFullFP16];
+  }
+
+  def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f32_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f64_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
+                             [(set FPR16:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f16_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f32_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+  }
+
+  def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f64_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+  }
+}
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+        // We use COPY_TO_REGCLASS for these bitconvert operations.
+        // copyPhysReg() expands the resultant COPY instructions after
+        // regalloc is done. This gives greater freedom for the allocator
+        // and related passes (coalescing, copy propagation, et. al.) to
+        // be more effective.
+        [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-24} = 0b0011110;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterClass srcType, RegisterOperand dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterOperand srcType, RegisterClass dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+
+multiclass UnscaledConversion<string asm> {
+  def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+  }
+
+  def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+  }
+
+  def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b11; // 16-bit FPR flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{23-22} = 0b00; // 32-bit FPR flag
+  }
+
+  def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{23-22} = 0b01; // 64-bit FPR flag
+  }
+
+  def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+                                             asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+
+  def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+                                               asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+}
+
+//---
+// Floating point conversion
+//---
+
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+                       RegisterClass srcType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{23-22} = type;
+  let Inst{21-17} = 0b10001;
+  let Inst{16-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPConversion<string asm> {
+  // Double-precision to Half-precision
+  def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+                             [(set FPR16:$Rd, (fpround FPR64:$Rn))]>;
+
+  // Double-precision to Single-precision
+  def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+                             [(set FPR32:$Rd, (fpround FPR64:$Rn))]>;
+
+  // Half-precision to Double-precision
+  def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+                             [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+
+  // Half-precision to Single-precision
+  def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+                             [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+
+  // Single-precision to Double-precision
+  def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+                             [(set FPR64:$Rd, (fpextend FPR32:$Rn))]>;
+
+  // Single-precision to Half-precision
+  def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+                             [(set FPR16:$Rd, (fpround FPR32:$Rn))]>;
+}
+
+//---
+// Single operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+                              ValueType vt, string asm, SDPatternOperator node>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+         [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-19} = 0b100;
+  let Inst{18-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+                               SDPatternOperator node = null_frag> {
+  def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> {
+    let Inst{23-22} = 0b11; // 16-bit size flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+    let Inst{23-22} = 0b00; // 32-bit size flag
+  }
+
+  def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+    let Inst{23-22} = 0b01; // 64-bit size flag
+  }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+                           string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+         asm, "\t$Rd, $Rn, $Rm", "", pat>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+                            SDPatternOperator node = null_frag> {
+  def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+                         [(set (f16 FPR16:$Rd),
+                               (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> {
+    let Inst{23-22} = 0b11; // 16-bit size flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                         [(set (f32 FPR32:$Rd),
+                               (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+    let Inst{23-22} = 0b00; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                         [(set (f64 FPR64:$Rd),
+                               (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+    let Inst{23-22} = 0b01; // 64-bit size flag
+  }
+}
+
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+  def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
+                  [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+    let Inst{23-22} = 0b11; // 16-bit size flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                  [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+    let Inst{23-22} = 0b00; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                  [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+    let Inst{23-22} = 0b01; // 64-bit size flag
+  }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+                             RegisterClass regtype, string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+         asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+      Sched<[WriteFMul]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{31-24} = 0b00011111;
+  let Inst{21}    = isNegated;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+                              SDPatternOperator node> {
+  def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
+            [(set FPR16:$Rd,
+                  (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
+    let Inst{23-22} = 0b11; // 16-bit size flag
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+            [(set FPR32:$Rd,
+                  (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+    let Inst{23-22} = 0b00; // 32-bit size flag
+  }
+
+  def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+            [(set FPR64:$Rd,
+                  (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+    let Inst{23-22} = 0b01; // 64-bit size flag
+  }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+                                 RegisterClass regtype, string asm,
+                                 list<dag> pat>
+    : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b1000;
+
+  // Rm should be 0b00000 canonically, but we need to accept any value.
+  let PostEncoderMethod = "fixOneOperandFPComparison";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+                                string asm, list<dag> pat>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rm;
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b0000;
+}
+
+multiclass FPComparison<bit signalAllNans, string asm,
+                        SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV] in {
+  def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
+      [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+    let Inst{23-22} = 0b11;
+    let Predicates = [HasFullFP16];
+  }
+
+  def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm,
+      [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{23-22} = 0b11;
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
+    let Inst{23-22} = 0b00;
+  }
+
+  def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{23-22} = 0b00;
+  }
+
+  def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
+    let Inst{23-22} = 0b01;
+  }
+
+  def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{23-22} = 0b01;
+  }
+  } // Defs = [NZCV]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
+                           string mnemonic, list<dag> pat>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
+      Sched<[WriteFCmp]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass FPCondComparison<bit signalAllNans, string mnemonic,
+                            SDPatternOperator OpNode = null_frag> {
+  def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> {
+    let Inst{23-22} = 0b11;
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
+      [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
+                          (i32 imm:$cond), NZCV))]> {
+    let Inst{23-22} = 0b00;
+  }
+
+  def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
+      [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
+                          (i32 imm:$cond), NZCV))]> {
+    let Inst{23-22} = 0b01;
+  }
+}
+
+//---
+// Floating point conditional select
+//---
+
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel (vt regtype:$Rn), regtype:$Rm,
+                          (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPCondSelect<string asm> {
+  let Uses = [NZCV] in {
+  def Hrrr : BaseFPCondSelect<FPR16, f16, asm> {
+    let Inst{23-22} = 0b11;
+    let Predicates = [HasFullFP16];
+  }
+
+  def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+    let Inst{23-22} = 0b00;
+  }
+
+  def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+    let Inst{23-22} = 0b01;
+  }
+  } // Uses = [NZCV]
+}
+
+//---
+// Floating move immediate
+//---
+
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+  : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+      [(set regtype:$Rd, fpimmtype:$imm)]>,
+    Sched<[WriteFImm]> {
+  bits<5> Rd;
+  bits<8> imm;
+  let Inst{31-24} = 0b00011110;
+  let Inst{21}    = 1;
+  let Inst{20-13} = imm;
+  let Inst{12-5}  = 0b10000000;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPMoveImmediate<string asm> {
+  def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> {
+    let Inst{23-22} = 0b11;
+    let Predicates = [HasFullFP16];
+  }
+
+  def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+    let Inst{23-22} = 0b00;
+  }
+
+  def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+    let Inst{23-22} = 0b01;
+  }
+}
+} // end of 'let Predicates = [HasFPARMv8]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON] in {
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-21} = size;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-21} = size;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+                                      asm, ".8b",
+         [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+                                      asm, ".16b",
+         [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+                                      asm, ".4h",
+         [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+                                      asm, ".8h",
+         [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+                                      asm, ".2s",
+         [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+                                      asm, ".4s",
+         [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128,
+                                      asm, ".2d",
+         [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
+// As above, but D sized elements unsupported.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+                                      asm, ".8b",
+        [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+                                      asm, ".16b",
+        [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+                                      asm, ".4h",
+        [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+                                      asm, ".8h",
+        [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+                                      asm, ".2s",
+        [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+                                      asm, ".4s",
+        [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64,
+                                      asm, ".8b",
+      [(set (v8i8 V64:$dst),
+            (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128,
+                                      asm, ".16b",
+      [(set (v16i8 V128:$dst),
+            (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64,
+                                      asm, ".4h",
+      [(set (v4i16 V64:$dst),
+            (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128,
+                                      asm, ".8h",
+      [(set (v8i16 V128:$dst),
+            (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64,
+                                      asm, ".2s",
+      [(set (v2i32 V64:$dst),
+            (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128,
+                                      asm, ".4s",
+      [(set (v4i32 V128:$dst),
+            (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64,
+                                      asm, ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128,
+                                      asm, ".16b",
+    [(set (v16i8 V128:$Rd),
+          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
+                                    string asm,
+                                    SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+     [(set (v4f16 V64:$dst),
+           (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+     [(set (v8f16 V128:$dst),
+           (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+     [(set (v2f32 V64:$dst),
+           (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+     [(set (v4f32 V128:$dst),
+           (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+     [(set (v2f64 V128:$dst),
+           (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64,
+                                      asm, ".4h",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128,
+                                      asm, ".8h",
+        [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64,
+                                     asm, ".8b",
+                         [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+  def v16i8  : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128,
+                                     asm, ".16b",
+                         [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+                                  string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
+                                     asm, ".8b",
+             [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128,
+                                     asm, ".16b",
+             [(set (v16i8 V128:$dst),
+                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                           (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                            bits<2> size2, RegisterOperand regtype, string asm,
+                            string dstkind, string srckind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = size2;
+  let Inst{18-17} = 0b00;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                                bits<2> size2, RegisterOperand regtype,
+                                string asm, string dstkind, string srckind,
+                                list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = size2;
+  let Inst{18-17} = 0b00;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Supports B, H, and S element sizes.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+                                      asm, ".8b", ".8b",
+                          [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+                                      asm, ".16b", ".16b",
+                          [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+                                      asm, ".4h", ".4h",
+                          [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+                                      asm, ".8h", ".8h",
+                          [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+                                      asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+                                      asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+                            RegisterOperand regtype, string asm, string dstkind,
+                            string srckind, string amount>
+  : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+      "|" # dstkind # "\t$Rd, $Rn, #" #  amount # "}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-24} = 0b101110;
+  let Inst{23-22} = size;
+  let Inst{21-10} = 0b100001001110;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorLShiftLongBySizeBHS {
+  let hasSideEffects = 0 in {
+  def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+                                             "shll", ".8h",  ".8b", "8">;
+  def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+                                             "shll2", ".8h", ".16b", "8">;
+  def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+                                             "shll", ".4s",  ".4h", "16">;
+  def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+                                             "shll2", ".4s", ".8h", "16">;
+  def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+                                             "shll", ".2d",  ".2s", "32">;
+  def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+                                             "shll2", ".2d", ".4s", "32">;
+  }
+}
+
+// Supports all element sizes.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+                                      asm, ".4h", ".8b",
+               [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+                                      asm, ".8h", ".16b",
+               [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+                                      asm, ".2s", ".4h",
+               [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+                                      asm, ".4s", ".8h",
+               [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+                                      asm, ".1d", ".2s",
+               [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+                                      asm, ".2d", ".4s",
+               [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
+                                          asm, ".4h", ".8b",
+      [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+                                      (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
+                                          asm, ".8h", ".16b",
+      [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+                                      (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
+                                          asm, ".2s", ".4h",
+      [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+                                      (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
+                                          asm, ".4s", ".8h",
+      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                      (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
+                                          asm, ".1d", ".2s",
+      [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+                                      (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
+                                          asm, ".2d", ".4s",
+      [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+                                      (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64,
+                                    asm, ".8b", ".8b",
+    [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128,
+                                    asm, ".16b", ".16b",
+    [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64,
+                                    asm, ".4h", ".4h",
+    [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128,
+                                    asm, ".8h", ".8h",
+    [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64,
+                                    asm, ".2s", ".2s",
+    [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128,
+                                    asm, ".4s", ".4s",
+    [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128,
+                                    asm, ".2d", ".2d",
+    [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+                                asm, ".8b", ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+                                asm, ".16b", ".16b",
+    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+                                asm, ".4h", ".4h",
+    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+                                asm, ".8h", ".8h",
+    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64,
+                                asm, ".2s", ".2s",
+    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128,
+                                asm, ".4s", ".4s",
+    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128,
+                                asm, ".2d", ".2d",
+    [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+                          SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+}
+
+// Supports only B and H element sizes.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64,
+                                asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128,
+                                asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports only S and D element sizes, uses high bit of the size field
+// as an extra opcode bit.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+                                asm, ".4h", ".4h",
+                          [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+                                asm, ".8h", ".8h",
+                          [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Supports only S element size.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+                                asm, ".4h", ".4h",
+                          [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>;
+  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+                                asm, ".8h", ".8h",
+                          [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64,
+                                asm, ".4h", ".4h",
+                          [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128,
+                                asm, ".8h", ".8h",
+                          [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+                                      asm, ".8b", ".8h",
+        [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+                                      asm#"2", ".16b", ".8h", []>;
+  def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+                                      asm, ".4h", ".4s",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+                                      asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+                                      asm, ".2s", ".2d",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+  def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+                                      asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
+                           bits<5> opcode, RegisterOperand regtype, string asm,
+                           string kind, string zero, ValueType dty,
+                           ValueType sty, SDNode OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
+      "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
+      [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = size2;
+  let Inst{18-17} = 0b00;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+                            SDNode OpNode> {
+  def v8i8rz  : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64,
+                                     asm, ".8b", "0",
+                                     v8i8, v8i8, OpNode>;
+  def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128,
+                                     asm, ".16b", "0",
+                                     v16i8, v16i8, OpNode>;
+  def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64,
+                                     asm, ".4h", "0",
+                                     v4i16, v4i16, OpNode>;
+  def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128,
+                                     asm, ".8h", "0",
+                                     v8i16, v8i16, OpNode>;
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64,
+                                     asm, ".2s", "0",
+                                     v2i32, v2i32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128,
+                                     asm, ".4s", "0",
+                                     v4i32, v4i32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128,
+                                     asm, ".2d", "0",
+                                     v2i64, v2i64, OpNode>;
+}
+
+// FP Comparisons support only S and D element sizes (and H for v8.2a).
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+                              string asm, SDNode OpNode> {
+
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64,
+                                     asm, ".4h", "0.0",
+                                     v4i16, v4f16, OpNode>;
+  def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128,
+                                     asm, ".8h", "0.0",
+                                     v8i16, v8f16, OpNode>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64,
+                                     asm, ".2s", "0.0",
+                                     v2i32, v2f32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128,
+                                     asm, ".4s", "0.0",
+                                     v4i32, v4f32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128,
+                                     asm, ".2d", "0.0",
+                                     v2i64, v2f64, OpNode>;
+
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0",
+                  (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0",
+                  (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+  }
+  def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def : InstAlias<asm # ".4h\t$Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # ".8h\t$Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>;
+  }
+  def : InstAlias<asm # ".2s\t$Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # ".4s\t$Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # ".2d\t$Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+                                    asm, ".4s", ".4h", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".4s", ".8h", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+                                    asm, ".2d", ".2s", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".2d", ".4s", []>;
+}
+
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+                                    asm, ".4h", ".4s", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                    asm, ".2s", ".2d", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+}
+
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+                                     Intrinsic OpNode> {
+  def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                     asm, ".2s", ".2d",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+  def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4f32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+//        change the element count (in this case, placing the results in
+//        the high elements of the result register rather than the low
+//        elements). Until that's fixed, we can't code-gen those.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                    Intrinsic IntOp> {
+  def v8i16_v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".8b", ".8h", ".8h",
+     [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v8i16_v16i8  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".16b", ".8h", ".8h",
+     []>;
+  def v4i32_v4i16  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".4h", ".4s", ".4s",
+     [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v4i32_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".4s", ".4s",
+     []>;
+  def v2i64_v2i32  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".2s", ".2d", ".2d",
+     [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+  def v2i64_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".2d", ".2d",
+     []>;
+
+
+  // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
+  // a version attached to an instruction.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v8i16_v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+                                                    (v4i32 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v4i32_v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+                                                    (v2i64 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v2i64_v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+                                      Intrinsic IntOp> {
+  def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                            V128, V64, V64,
+                                            asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                            V128, V128, V128,
+                                            asm#"2", ".8h", ".16b", ".16b", []>;
+  let Predicates = [HasCrypto] in {
+    def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+                                              V128, V64, V64,
+                                              asm, ".1q", ".1d", ".1d", []>;
+    def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+                                              V128, V128, V128,
+                                              asm#"2", ".1q", ".2d", ".2d", []>;
+  }
+
+  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+                          (v8i8 (extract_high_v16i8 V128:$Rm)))),
+      (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                (extract_high_v16i8 V128:$Rm)))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                  (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                 (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+                                          string asm,
+                                          SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                   (extract_high_v16i8 V128:$Rm))))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                    (extract_high_v8i16 V128:$Rm))))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                    (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+                                      (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
+                                      string asm,
+                                      SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd),
+                  (extract_high_v16i8 V128:$Rn),
+                  (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
+                                           SDPatternOperator Accum> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                                                (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+                                            (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
+                                                (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+                                            (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".8h", ".8h", ".8b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".8h", ".16b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                                       (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".4s", ".4s", ".4h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".4s", ".8h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                                       (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".2d", ".2d", ".2s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".2d", ".4s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                                       (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
+                             string asm, string kind>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
+      "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+      [(set (vty regtype:$Rd),
+            (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> imm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size;
+  let Inst{29-21} = 0b101110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-11} = imm;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> {
+  def v8i8  : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
+    let imm{3} = 0;
+  }
+  def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
+                        string asm, string kind, SDNode OpNode, ValueType valty>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+      [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 0;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDZipVector<bits<3>opc, string asm,
+                         SDNode OpNode> {
+  def v8i8   : BaseSIMDZipVector<0b000, opc, V64,
+      asm, ".8b", OpNode, v8i8>;
+  def v16i8  : BaseSIMDZipVector<0b001, opc, V128,
+      asm, ".16b", OpNode, v16i8>;
+  def v4i16  : BaseSIMDZipVector<0b010, opc, V64,
+      asm, ".4h", OpNode, v4i16>;
+  def v8i16  : BaseSIMDZipVector<0b011, opc, V128,
+      asm, ".8h", OpNode, v8i16>;
+  def v2i32  : BaseSIMDZipVector<0b100, opc, V64,
+      asm, ".2s", OpNode, v2i32>;
+  def v4i32  : BaseSIMDZipVector<0b101, opc, V128,
+      asm, ".4s", OpNode, v4i32>;
+  def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
+      asm, ".2d", OpNode, v2i64>;
+
+  def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode,
+                        RegisterClass regtype, string asm,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "\t$Rd, $Rn, $Rm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-21} = size;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
+                        dag oops, dag iops, string asm,
+            list<dag> pattern>
+  : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = R;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
+
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+  def v1i32  : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+  def v1i8   : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+  def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+            (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i32  : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
+                             [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+}
+
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+                                     (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm), 
+                                     asm, []>;
+  def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+                                     (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm), 
+                                     asm, []>;
+}
+
+multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+      [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+      [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+    let Predicates = [HasNEON, HasFullFP16] in {
+    def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+      [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+    } // Predicates = [HasNEON, HasFullFP16]
+  }
+
+  def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
+                                SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+      [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+      [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+    let Predicates = [HasNEON, HasFullFP16] in {
+    def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+      []>;
+    } // Predicates = [HasNEON, HasFullFP16]
+  }
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
+              dag oops, dag iops, string asm, string cstr, list<dag> pat>
+  : I<oops, iops, asm,
+      "\t$Rd, $Rn, $Rm", cstr, pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$Rd),
+                                      (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$Rd),
+                                      (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+            [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$dst),
+                                      (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+                                      asm, "$Rd = $dst", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$dst),
+                                      (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+                                      asm, "$Rd = $dst",
+            [(set (i64 FPR64:$dst),
+                  (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = size2;
+  let Inst{18-17} = 0b00;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "$Rd = $dst", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
+                        RegisterClass regtype, string asm, string zero>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "\t$Rd, $Rn, #" # zero, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = size2;
+  let Inst{18-17} = 0b00;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
+  : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+     [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-17} = 0b011111100110000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">;
+
+  def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">;
+  def v1i32rz  : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v1i16rz  : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">;
+  }
+
+  def : InstAlias<asm # "\t$Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
+  def : InstAlias<asm # "\t$Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def : InstAlias<asm # "\t$Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
+                          SDPatternOperator OpNode = null_frag> {
+  def v1i64       : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
+  def v1i64       : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>;
+  def v1i32       : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v1f16       : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>;
+  }
+}
+
+multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
+                                [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
+  def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
+                                [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+                                [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+  }
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm,
+           [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm,
+           [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
+                                 Intrinsic OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+        [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+        [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
+}
+
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32  : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm,
+        [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+  def v1i16  : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>;
+  def v1i8   : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, RegisterOperand vectype,
+                        string asm, string kind>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
+  def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+                                      asm, ".2d">;
+}
+
+multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64,
+                                      asm, ".2h">;
+  }
+  def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64,
+                                      asm, ".2s">;
+  def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128,
+                                      asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
+                          RegisterClass regtype, RegisterOperand vectype,
+                          string asm, string kind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
+                              string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8,  V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8,  V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
+                            Intrinsic intOp> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
+                                   asm, ".4h",
+        [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+  def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
+                                   asm, ".8h",
+        [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
+                                   asm, ".4s",
+        [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
+                     string operands, string constraints, list<dag> pattern>
+  : I<outs, ins, asm, operands, constraints, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{15} = 0;
+  let Inst{10} = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
+                      RegisterOperand vecreg, RegisterClass regtype>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+                   "{\t$Rd" # size # ", $Rn" #
+                   "|" # size # "\t$Rd, $Rn}", "",
+                   [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
+  let Inst{20-16} = imm5;
+  let Inst{14-11} = 0b0001;
+}
+
+class SIMDDupFromElement<bit Q, string dstkind, string srckind,
+                         ValueType vectype, ValueType insreg,
+                         RegisterOperand vecreg, Operand idxtype,
+                         ValueType elttype, SDNode OpNode>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+                   "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+                   "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+                 [(set (vectype vecreg:$Rd),
+                       (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+  let Inst{14-11} = 0b0000;
+}
+
+class SIMDDup64FromElement
+  : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+                       VectorIndexD, i64, AArch64duplane64> {
+  bits<1> idx;
+  let Inst{20} = idx;
+  let Inst{19-16} = 0b1000;
+}
+
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+                       VectorIndexS, i64, AArch64duplane32> {
+  bits<2> idx;
+  let Inst{20-19} = idx;
+  let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+                       VectorIndexH, i64, AArch64duplane16> {
+  bits<3> idx;
+  let Inst{20-18} = idx;
+  let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
+                          RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+                       VectorIndexB, i64, AArch64duplane8> {
+  bits<4> idx;
+  let Inst{20-17} = idx;
+  let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
+                  Operand idxtype, string asm, list<dag> pattern>
+  : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+                   "{\t$Rd, $Rn" # size # "$idx" #
+                   "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
+  let Inst{14-11} = imm4;
+}
+
+class SIMDSMov<bit Q, string size, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+      [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst,
+                   RegisterClass regtype, Operand idxtype>
+    : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+                    "|" # size # "\t$dst, $src$idx}",
+                (inst regtype:$dst, V128:$src, idxtype:$idx)>;
+
+multiclass SMov {
+  def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+}
+
+multiclass UMov {
+  def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+  def : SIMDMovAlias<"mov", ".s",
+                     !cast<Instruction>(NAME#"vi32"),
+                     GPR32, VectorIndexS>;
+  def : SIMDMovAlias<"mov", ".d",
+                     !cast<Instruction>(NAME#"vi64"),
+                     GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype,
+                      RegisterClass regtype, Operand idxtype>
+  : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" #
+                   "|" # size # "\t$Rd$idx, $Rn}",
+                   "$Rd = $dst",
+            [(set V128:$dst,
+              (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+  let Inst{14-11} = 0b0011;
+}
+
+class SIMDInsFromElement<string size, ValueType vectype,
+                         ValueType elttype, Operand idxtype>
+  : BaseSIMDInsDup<1, 1, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+                   "|" # size # "\t$Rd$idx, $Rn$idx2}",
+                   "$Rd = $dst",
+         [(set V128:$dst,
+               (vector_insert
+                 (vectype V128:$Rd),
+                 (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+                 idxtype:$idx))]>;
+
+class SIMDInsMainMovAlias<string size, Instruction inst,
+                          RegisterClass regtype, Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+                        "|" # size #"\t$dst$idx, $src}",
+                (inst V128:$dst, idxtype:$idx, regtype:$src)>;
+class SIMDInsElementMovAlias<string size, Instruction inst,
+                             Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+                      # "|" # size #"\t$dst$idx, $src$idx2}",
+                (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
+
+
+multiclass SIMDIns {
+  def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+    bits<4> idx;
+    bits<4> idx2;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+    let Inst{14-11} = idx2;
+  }
+  def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+    bits<3> idx;
+    bits<3> idx2;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+    let Inst{14-12} = idx2;
+    let Inst{11} = {?};
+  }
+  def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+    bits<2> idx;
+    bits<2> idx2;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+    let Inst{14-13} = idx2;
+    let Inst{12-11} = {?,?};
+  }
+  def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+    bits<1> idx;
+    bits<1> idx2;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+    let Inst{14} = idx2;
+    let Inst{13-11} = {?,?,?};
+  }
+
+  // For all forms of the INS instruction, the "mov" mnemonic is the
+  // preferred alias. Why they didn't just call the instruction "mov" in
+  // the first place is a very good question indeed...
+  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+                         GPR32, VectorIndexB>;
+  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+                         GPR32, VectorIndexH>;
+  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+                         GPR32, VectorIndexS>;
+  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+                         GPR64, VectorIndexD>;
+
+  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+                         VectorIndexB>;
+  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+                         VectorIndexH>;
+  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+                         VectorIndexS>;
+  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+                         VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+                          RegisterOperand vectype, RegisterOperand listtype>
+    : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+                (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+                        string kind, Operand idxtype>
+  : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+       "{\t$dst, $src" # kind # "$idx" #
+       "|\t$dst, $src$idx}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> dst;
+  bits<5> src;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{15-10} = 0b000001;
+  let Inst{9-5}   = src;
+  let Inst{4-0}   = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+      RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+    : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+                    # "|\t$dst, $src$index}",
+                (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+  def i8  : BaseSIMDScalarCPY<FPR8,  V128, ".b", VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+                                                          VectorIndexD:$idx)))),
+            (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
+
+  // 'DUP' mnemonic aliases.
+  def : SIMDScalarCPYAlias<"dup", ".b",
+                           !cast<Instruction>(NAME#"i8"),
+                           FPR8, V128, VectorIndexB>;
+  def : SIMDScalarCPYAlias<"dup", ".h",
+                           !cast<Instruction>(NAME#"i16"),
+                           FPR16, V128, VectorIndexH>;
+  def : SIMDScalarCPYAlias<"dup", ".s",
+                           !cast<Instruction>(NAME#"i32"),
+                           FPR32, V128, VectorIndexS>;
+  def : SIMDScalarCPYAlias<"dup", ".d",
+                           !cast<Instruction>(NAME#"i64"),
+                           FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops,
+                          string asm, string op_string,
+                          string cstr, list<dag> pattern>
+  : I<oops, iops, asm, op_string, cstr, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<8> imm8;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = op;
+  let Inst{28-19} = 0b0111100000;
+  let Inst{18-16} = imm8{7-5};
+  let Inst{11} = op2;
+  let Inst{10} = 1;
+  let Inst{9-5}   = imm8{4-0};
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd),
+                        !con((ins immtype:$imm8), opt_shift_iop), asm,
+                        "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                        "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "", pattern> {
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst),
+                        !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+                        asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                             "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "$Rd = $dst", pattern> {
+  let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+                                      string asm> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+                                                 asm, ".4h", []>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+                                                 asm, ".8h", []>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+                                             asm, ".2s", []>;
+  def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+                                             asm, ".4s", []>;
+}
+
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+                                      bits<2> w_cmode, string asm,
+                                      SDNode OpNode> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+                                                 asm, ".4h",
+             [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+                                                 asm, ".8h",
+             [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+                                             asm, ".2s",
+             [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+                                             asm, ".4s",
+             [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+}
+
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+                             RegisterOperand vectype, string asm,
+                             string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255,
+                              (ins move_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<1> shift;
+  let Inst{15-13} = cmode{3-1};
+  let Inst{12}    = shift;
+}
+
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode,
+                                   RegisterOperand vectype,
+                                   Operand imm_type, string asm,
+                                   string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "",
+                              asm, kind, pattern> {
+  let Inst{15-12} = cmode;
+}
+
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+                                   list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+                        "\t$Rd, $imm8", "", pattern> {
+  let Inst{15-12} = cmode;
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+      asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$dst),
+      (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
+                         SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
+                                      V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4f16 V64:$Rd),
+        (OpNode (v4f16 V64:$Rn),
+         (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8f16 V128:$Rd),
+        (OpNode (v8f16 V128:$Rn),
+         (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2f32 V64:$Rd),
+        (OpNode (v2f32 V64:$Rn),
+         (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4f32 V128:$Rd),
+        (OpNode (v4f32 V128:$Rn),
+         (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d",
+    [(set (v2f64 V128:$Rd),
+        (OpNode (v2f64 V128:$Rn),
+         (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc,
+                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h",
+    [(set (f16 FPR16Op:$Rd),
+          (OpNode (f16 FPR16Op:$Rn),
+                  (f16 (vector_extract (v8f16 V128_lo:$Rm),
+                                       VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (f32 FPR32Op:$Rd),
+          (OpNode (f32 FPR32Op:$Rn),
+                  (f32 (vector_extract (v4f32 V128:$Rm),
+                                       VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d",
+    [(set (f64 FPR64Op:$Rd),
+          (OpNode (f64 FPR64Op:$Rn),
+                  (f64 (vector_extract (v2f64 V128:$Rm),
+                                       VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+  // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # v2i32_indexed)
+                V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+  // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v4i32_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 V128:$Rm),
+                                           VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v2i64_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 FPR64Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexD:$idx)>;
+}
+
+multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64,
+                                          V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc,
+                                          V128, V128,
+                                          V128_lo, VectorIndexH,
+                                          asm, ".8h", ".8h", ".8h", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+                                          V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc,
+                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+                         SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s",  ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+      [(set (i32 FPR32Op:$Rd),
+            (OpNode FPR32Op:$Rn,
+                    (i32 (vector_extract (v4i32 V128:$Rm),
+                                         VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+                                          V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+        (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+       (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+       (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+       (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                             (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+  // intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                (i32 (vector_extract (v4i32
+                         (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx)))),
+                         (i64 0))))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME # v4i16_indexed)
+                    (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+                    V128_lo:$Rm, VectorIndexH:$idx),
+                ssub)>;
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                            (extract_high_v8i16 V128:$Rn),
+                            (extract_high_v8i16
+                                (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (Accum (v2i64 V128:$Rd),
+               (v2i64 (int_aarch64_neon_sqdmull
+                          (v2i32 V64:$Rn),
+                          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull
+                            (extract_high_v4i32 V128:$Rn),
+                            (extract_high_v4i32
+                                (AArch64duplane32 (v4i32 V128:$Rm),
+                                                VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (i64 FPR64Op:$dst),
+          (Accum (i64 FPR64Op:$Rd),
+                 (i64 (int_aarch64_neon_sqdmulls_scalar
+                            (i32 FPR32Op:$Rn),
+                            (i32 (vector_extract (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+        (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$Rd),
+     (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+                                                   (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                           (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+                                            vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (v1i64 FPR64:$Rd),
+       (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode = null_frag> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR16, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR32, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR64, vecshiftR32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftL8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftL16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftL32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector x indexed element
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand dst_reg, RegisterOperand src_reg,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand vectype1, RegisterOperand vectype2,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
+                              Intrinsic OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16,
+                                  asm, ".4h", ".4h",
+      [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16,
+                                  asm, ".8h", ".8h",
+      [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm,
+                                  Intrinsic OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16,
+                                  asm, ".4h", ".4h",
+      [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16,
+                                  asm, ".8h", ".8h",
+      [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+  } // Predicates = [HasNEON, HasFullFP16]
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+                                     SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V128, vecshiftR16Narrow,
+                                  asm, ".8b", ".8h",
+      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR16Narrow,
+                                  asm#"2", ".16b", ".8h", []> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V128, vecshiftR32Narrow,
+                                  asm, ".4h", ".4s",
+      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR32Narrow,
+                                  asm#"2", ".8h", ".4s", []> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V128, vecshiftR64Narrow,
+                                  asm, ".2s", ".2d",
+      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR64Narrow,
+                                  asm#"2", ".4s", ".2d", []> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions
+  // themselves, so put them here instead.
+
+  // Patterns involving what's effectively an insert high and a normal
+  // intrinsic, represented by CONCAT_VECTORS.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+                                                   vecshiftR16Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v16i8_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR16Narrow:$imm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+                                                     vecshiftR32Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v8i16_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR32Narrow:$imm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+                                                     vecshiftR64Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v4i32_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8, asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                           (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8, asm, ".16b", ".16b",
+             [(set (v16i8 V128:$dst),
+               (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16, asm, ".4h", ".4h",
+              [(set (v4i16 V64:$dst),
+                (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                        (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16, asm, ".8h", ".8h",
+            [(set (v8i16 V128:$dst),
+              (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                      (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32, asm, ".2s", ".2s",
+              [(set (v2i32 V64:$dst),
+                (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                        (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32, asm, ".4s", ".4s",
+            [(set (v4i32 V128:$dst),
+              (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                      (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+              (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                      (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$dst),
+                          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$dst),
+                          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$dst),
+                           (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                                   (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$dst),
+                          (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+                    [(set (v2i32 V64:$dst),
+                          (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+                    [(set (v4i32 V128:$dst),
+                          (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+                    [(set (v2i64 V128:$dst),
+                          (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V64, vecshiftL8, asm, ".8h", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm#"2", ".8h", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V64, vecshiftL16, asm, ".4s", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm#"2", ".4s", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V64, vecshiftL32, asm, ".2d", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm#"2", ".2d", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general GPR64sp handling.
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
+                   string asm, dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-23} = 0b0011000;
+  let Inst{22} = L;
+  let Inst{21-16} = 0b000000;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
+                       string asm, dag oops, dag iops>
+  : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
+  bits<5> Vt;
+  bits<5> Rn;
+  bits<5> Xm;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-23} = 0b0011001;
+  let Inst{22} = L;
+  let Inst{21} = 0;
+  let Inst{20-16} = Xm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,
+                           int Offset, int Size> {
+  // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+  //      "ld1\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+  //      "ld1.8b\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1]"
+  //      "ld1\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # Count # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+  //      "ld1\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+
+    def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 has a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
+    def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
+                            (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                                 GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // LD1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // ST1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+    def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass SIMDLd1Multiple<string asm> {
+  defm One   : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDSt1Multiple<string asm> {
+  defm One   : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDLd2Multiple<string asm> {
+  defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDSt2Multiple<string asm> {
+  defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDLd3Multiple<string asm> {
+  defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDSt3Multiple<string asm> {
+  defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDLd4Multiple<string asm> {
+  defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+multiclass SIMDSt4Multiple<string asm> {
+  defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
+  let Inst{15-13} = opcode;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
+  let Inst{15-13} = opcode;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+                  Operand listtype>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
+                       (outs listtype:$Vt), (ins GPR64sp:$Rn),
+                       []> {
+  let Inst{30} = Q;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = S;
+  let Inst{11-10} = size;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+                      string asm, Operand listtype, Operand GPR64pi>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
+                       "$Rn = $wback",
+                       (outs GPR64sp:$wback, listtype:$Vt),
+                       (ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
+  bits<5> Xm;
+  let Inst{30} = Q;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = S;
+  let Inst{11-10} = size;
+}
+
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+                          int Offset, int Size> {
+  // E.g. "ld1r { v0.8b }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1]"
+  //      "ld1r.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], x2"
+  //      "ld1r.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+  int Offset1, int Offset2, int Offset4, int Offset8> {
+  def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count # "8b")>;
+  def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count #"16b")>;
+  def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"4h")>;
+  def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"8h")>;
+  def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"2s")>;
+  def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"4s")>;
+  def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"1d")>;
+  def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"2d")>;
+
+  def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "8b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "16b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "4h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "8h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "2s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "4s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "1d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+  def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "2d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+
+  defm : SIMDLdrAliases<asm, "8b",  Count, Offset1,  64>;
+  defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+  defm : SIMDLdrAliases<asm, "4h",  Count, Offset2,  64>;
+  defm : SIMDLdrAliases<asm, "8h",  Count, Offset2, 128>;
+  defm : SIMDLdrAliases<asm, "2s",  Count, Offset4,  64>;
+  defm : SIMDLdrAliases<asm, "4s",  Count, Offset4, 128>;
+  defm : SIMDLdrAliases<asm, "1d",  Count, Offset8,  64>;
+  defm : SIMDLdrAliases<asm, "2d",  Count, Offset8, 128>;
+}
+
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
+                           (outs listtype:$dst),
+                           (ins listtype:$Vt, VectorIndexB:$idx,
+                                GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexB:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleB<0, R, opcode, asm,
+                           (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+                                        GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+                                    (outs GPR64sp:$wback),
+                                    (ins listtype:$Vt, VectorIndexB:$idx,
+                                         GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+                                 string Count, int Offset, Operand idxtype> {
+  // E.g. "ld1 { v0.8b }[0], [x1], #1"
+  //      "ld1\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type  # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      idxtype:$idx, XZR), 1>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], #1"
+  //      "ld1.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                      idxtype:$idx, XZR), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1]"
+  //      "ld1.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
+                      (!cast<Instruction>(NAME # Type)
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx, GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], x2"
+  //      "ld1.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
+                      (!cast<Instruction>(NAME # Type # "_POST")
+                         GPR64sp:$Rn,
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx,
+                         !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdSt1SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "One", 1, VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
+}
+
+multiclass SIMDLdSt2SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Two", 2,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8,  VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
+}
+
+multiclass SIMDLdSt3SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Three", 3,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Four", 4,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+} // end of 'let Predicates = [HasNEON]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON, HasV8_1a] in {
+
+class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
+                                    RegisterOperand regtype, string asm, 
+                                    string kind, list<dag> pattern>
+  : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, 
+                                pattern> {
+}
+multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
+                                             SDPatternOperator Accum> {
+  def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
+                                                   (v4i16 V64:$Rm)))))]>;         
+  def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm)))))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
+    [(set (v2i32 V64:$dst),
+          (Accum (v2i32 V64:$Rd),
+                 (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
+                                                   (v2i32 V64:$Rm)))))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
+                                                   (v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
+                                     SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                          V64, V64, V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh
+                          (v4i16 V64:$Rn),
+                          (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                          V128, V128, V128_lo, VectorIndexH,
+                                          asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh
+                          (v8i16 V128:$Rn),
+                          (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                   VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                          V64, V64, V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+        (Accum (v2i32 V64:$Rd),
+               (v2i32 (int_aarch64_neon_sqrdmulh
+                        (v2i32 V64:$Rn),
+                        (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but 
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we 
+  // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                       (i32 (vector_extract 
+                               (v4i32 (insert_subvector
+                                       (undef), 
+                                        (v2i32 (int_aarch64_neon_sqrdmulh 
+                                                 (v2i32 V64:$Rn),
+                                                 (v2i32 (AArch64duplane32 
+                                                          (v4i32 V128:$Rm),
+                                                          VectorIndexS:$idx)))),
+                                      (i32 0))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
+                          (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), 
+                                                FPR32Op:$Rd, 
+                                                ssub)), 
+                          V64:$Rn,
+                          V128:$Rm, 
+                          VectorIndexS:$idx)),
+                ssub)>;
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                          V128, V128, V128, VectorIndexS,
+                                          asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh
+                          (v4i32 V128:$Rn),
+                          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                   VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract 
+                               (v4i32 (int_aarch64_neon_sqrdmulh 
+                                        (v4i32 V128:$Rn),
+                                        (v4i32 (AArch64duplane32 
+                                                 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx)))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
+                         (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), 
+                                               FPR32Op:$Rd, 
+                                               ssub)), 
+                         V128:$Rn,
+                         V128:$Rm, 
+                         VectorIndexS:$idx)),
+                ssub)>;
+
+  def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                        FPR16Op, FPR16Op, V128_lo,
+                                        VectorIndexH, asm, ".h", "", "", ".h", 
+                                        []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                        FPR32Op, FPR32Op, V128, VectorIndexS,
+                                        asm, ".s", "", "", ".s",
+    [(set (i32 FPR32Op:$dst),
+          (Accum (i32 FPR32Op:$Rd),
+                 (i32 (int_aarch64_neon_sqrdmulh
+                        (i32 FPR32Op:$Rn),
+                        (i32 (vector_extract (v4i32 V128:$Rm),
+                                             VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+} // let Predicates = [HasNeon, HasV8_1a]
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+let Predicates = [HasCrypto] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+              list<dag> pat>
+  : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0100111000101000;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+            [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+            "$Rd = $dst",
+            [(set (v16i8 V128:$dst),
+                  (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
+                     dag oops, dag iops, list<dag> pat>
+  : I<oops, iops, asm,
+      "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
+      "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
+                   (ins V128:$Rd, V128:$Rn, V128:$Rm),
+                   [(set (v4i32 V128:$dst),
+                         (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+                 string cstr, dag oops, dag iops,
+                 list<dag> pat>
+  : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+                       "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0101111000101000;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+               (ins V128:$Rd, V128:$Rn),
+               [(set (v4i32 V128:$dst),
+                     (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+               [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+} // end of 'let Predicates = [HasCrypto]'
+
+//----------------------------------------------------------------------------
+// v8.1 atomic instructions extension:
+// * CAS
+// * CASP
+// * SWP
+// * LDOPregister<OP>, and aliases STOPregister<OP>
+
+// Instruction encodings:
+//
+//      31 30|29  24|23|22|21|20 16|15|14  10|9 5|4 0
+// CAS  SZ   |001000|1 |A |1 |Rs   |R |11111 |Rn |Rt
+// CASP  0|SZ|001000|0 |A |1 |Rs   |R |11111 |Rn |Rt
+// SWP  SZ   |111000|A |R |1 |Rs   |1 |OPC|00|Rn |Rt
+// LD   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |Rt
+// ST   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |11111
+
+// Instruction syntax:
+//
+// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
+// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
+// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
+// ST<OP>{<order>} <Xs>, [<Xn|SP>]
+
+let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
+                      string cstr, list<dag> pattern>
+      : I<oops, iops, asm, operands, cstr, pattern> {
+  bits<2> Sz;
+  bit NP;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23} = NP;
+  let Inst{22} = Acq;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = Rel;
+  let Inst{14-10} = 0b11111;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+  let Predicates = [HasLSE];
+}
+
+class BaseCAS<string order, string size, RegisterClass RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "cas" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]>,
+        Sched<[WriteAtomic]> {
+  let NP = 1;
+}
+
+multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>;
+}
+
+class BaseCASP<string order, string size, RegisterOperand RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "casp" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]>,
+        Sched<[WriteAtomic]> {
+  let NP = 0;
+}
+
+multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in 
+    def s : BaseCASP<order, "", WSeqPairClassOperand>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in 
+    def d : BaseCASP<order, "", XSeqPairClassOperand>;
+}
+
+let Predicates = [HasLSE] in
+class BaseSWP<string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]>,
+        Sched<[WriteAtomic]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc = 0b000;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b1;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+  let Predicates = [HasLSE];
+}
+
+multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>;
+}
+
+let Predicates = [HasLSE], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]>,
+        Sched<[WriteAtomic]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+  let Predicates = [HasLSE];
+}
+
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, 
+                        string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in 
+    def b : BaseLDOPregister<op, order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in 
+    def h : BaseLDOPregister<op, order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in 
+    def s : BaseLDOPregister<op, order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in 
+    def d : BaseLDOPregister<op, order, "", GPR64>;
+}
+
+let Predicates = [HasLSE] in
+class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
+                        Instruction inst> :
+      InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
+
+multiclass STOPregister<string asm, string instr> {
+  def : BaseSTOPregister<asm # "lb", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lb")>;
+  def : BaseSTOPregister<asm # "lh", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lh")>;
+  def : BaseSTOPregister<asm # "l",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "Ls")>;
+  def : BaseSTOPregister<asm # "l",  GPR64, XZR, 
+                    !cast<Instruction>(instr # "Ld")>;
+  def : BaseSTOPregister<asm # "b",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "b")>;
+  def : BaseSTOPregister<asm # "h",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "h")>;
+  def : BaseSTOPregister<asm,        GPR32, WZR, 
+                    !cast<Instruction>(instr # "s")>;
+  def : BaseSTOPregister<asm,        GPR64, XZR, 
+                    !cast<Instruction>(instr # "d")>;
+}
+
+//----------------------------------------------------------------------------
+// Allow the size specifier tokens to be upper case, not just lower.
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".2H", ".2h">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 000000000000..b50749a29b89
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,4173 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AArch64GenInstrInfo.inc"
+
+static const MachineMemOperand::Flags MOSuppressPair =
+    MachineMemOperand::MOTargetFlag1;
+
+static cl::opt<unsigned>
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
+                    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+      RI(STI.getTargetTriple()), Subtarget(STI) {}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  if (MI.getOpcode() == AArch64::INLINEASM)
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+
+  // FIXME: We currently only handle pseudoinstructions that don't get expanded
+  //        before the assembly printer.
+  unsigned NumBytes = 0;
+  const MCInstrDesc &Desc = MI.getDesc();
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a normal 4-byte insn.
+    NumBytes = 4;
+    break;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    NumBytes = 0;
+    break;
+  case TargetOpcode::STACKMAP:
+    // The upper bound for a stackmap intrinsic is the full length of its shadow
+    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
+    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+    break;
+  case TargetOpcode::PATCHPOINT:
+    // The size of the patchpoint intrinsic is the number of bytes requested
+    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
+    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+    break;
+  case AArch64::TLSDESC_CALLSEQ:
+    // This gets lowered to an instruction sequence which takes 16 bytes
+    NumBytes = 16;
+    break;
+  case AArch64::TLSDESC_CALLSEQ:
+    // This gets lowered to an instruction sequence which takes 16 bytes
+    return 16;
+  }
+
+  return NumBytes;
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  // Block ends with fall-through condbranch.
+  switch (LastInst->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    Target = LastInst->getOperand(2).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    Cond.push_back(LastInst->getOperand(1));
+  }
+}
+
+static unsigned getBranchDisplacementBits(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("unexpected opcode!");
+  case AArch64::B:
+    return 64;
+  case AArch64::TBNZW:
+  case AArch64::TBZW:
+  case AArch64::TBNZX:
+  case AArch64::TBZX:
+    return TBZDisplacementBits;
+  case AArch64::CBNZW:
+  case AArch64::CBZW:
+  case AArch64::CBNZX:
+  case AArch64::CBZX:
+    return CBZDisplacementBits;
+  case AArch64::Bcc:
+    return BCCDisplacementBits;
+  }
+}
+
+bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                             int64_t BrOffset) const {
+  unsigned Bits = getBranchDisplacementBits(BranchOp);
+  assert(Bits >= 3 && "max branch displacement must be enough to jump"
+                      "over conditional branch expansion");
+  return isIntN(Bits, BrOffset / 4);
+}
+
+MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("unexpected opcode!");
+  case AArch64::B:
+    return MI.getOperand(0).getMBB();
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
+    return MI.getOperand(2).getMBB();
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+  case AArch64::Bcc:
+    return MI.getOperand(1).getMBB();
+  }
+}
+
+// Branch analysis.
+bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false;
+
+  if (!isUnpredicatedTerminator(*I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      parseCondBranch(LastInst, TBB, Cond);
+      return false;
+    }
+    return true; // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = &*I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = &*I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    parseCondBranch(SecondLastInst, TBB, Cond);
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // ...likewise if it ends with an indirect branch followed by an unconditional
+  // branch.
+  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+bool AArch64InstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
+  } else {
+    // Folded compare-and-branch
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown conditional branch!");
+    case AArch64::CBZW:
+      Cond[1].setImm(AArch64::CBNZW);
+      break;
+    case AArch64::CBNZW:
+      Cond[1].setImm(AArch64::CBZW);
+      break;
+    case AArch64::CBZX:
+      Cond[1].setImm(AArch64::CBNZX);
+      break;
+    case AArch64::CBNZX:
+      Cond[1].setImm(AArch64::CBZX);
+      break;
+    case AArch64::TBZW:
+      Cond[1].setImm(AArch64::TBNZW);
+      break;
+    case AArch64::TBNZW:
+      Cond[1].setImm(AArch64::TBZW);
+      break;
+    case AArch64::TBZX:
+      Cond[1].setImm(AArch64::TBNZX);
+      break;
+    case AArch64::TBNZX:
+      Cond[1].setImm(AArch64::TBZX);
+      break;
+    }
+  }
+
+  return false;
+}
+
+unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return 0;
+
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    if (BytesRemoved)
+      *BytesRemoved = 4;
+    return 1;
+  }
+  --I;
+  if (!isCondBranchOpcode(I->getOpcode())) {
+    if (BytesRemoved)
+      *BytesRemoved = 4;
+    return 1;
+  }
+
+  // Remove the branch.
+  I->eraseFromParent();
+  if (BytesRemoved)
+    *BytesRemoved = 8;
+
+  return 2;
+}
+
+void AArch64InstrInfo::instantiateCondBranch(
+    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
+    ArrayRef<MachineOperand> Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+  } else {
+    // Folded compare-and-branch
+    // Note that we use addOperand instead of addReg to keep the flags.
+    const MachineInstrBuilder MIB =
+        BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
+    if (Cond.size() > 3)
+      MIB.addImm(Cond[3].getImm());
+    MIB.addMBB(TBB);
+  }
+}
+
+unsigned AArch64InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch?
+      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
+    else
+      instantiateCondBranch(MBB, DL, TBB, Cond);
+
+    if (BytesAdded)
+      *BytesAdded = 4;
+
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  instantiateCondBranch(MBB, DL, TBB, Cond);
+  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
+
+  if (BytesAdded)
+    *BytesAdded = 8;
+
+  return 2;
+}
+
+// Find the original register that VReg is copied from.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+    if (!DefMI->isFullCopy())
+      return VReg;
+    VReg = DefMI->getOperand(1).getReg();
+  }
+  return VReg;
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+                                unsigned *NewVReg = nullptr) {
+  VReg = removeCopies(MRI, VReg);
+  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+    return 0;
+
+  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+  unsigned Opc = 0;
+  unsigned SrcOpNum = 0;
+  switch (DefMI->getOpcode()) {
+  case AArch64::ADDSXri:
+  case AArch64::ADDSWri:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+    // fall-through to ADDXri and ADDWri.
+    LLVM_FALLTHROUGH;
+  case AArch64::ADDXri:
+  case AArch64::ADDWri:
+    // add x, 1 -> csinc.
+    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+        DefMI->getOperand(3).getImm() != 0)
+      return 0;
+    SrcOpNum = 1;
+    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+    break;
+
+  case AArch64::ORNXrr:
+  case AArch64::ORNWrr: {
+    // not x -> csinv, represented as orn dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
+    break;
+  }
+
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSWrr:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+    // fall-through to SUBXrr and SUBWrr.
+    LLVM_FALLTHROUGH;
+  case AArch64::SUBXrr:
+  case AArch64::SUBWrr: {
+    // neg x -> csneg, represented as sub dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
+    break;
+  }
+  default:
+    return 0;
+  }
+  assert(Opc && SrcOpNum && "Missing parameters");
+
+  if (NewVReg)
+    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+  return Opc;
+}
+
+bool AArch64InstrInfo::canInsertSelect(
+    const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond,
+    unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+    int &FalseCycles) const {
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
+  unsigned ExtraCondLat = Cond.size() != 1;
+
+  // GPRs are handled by csel.
+  // FIXME: Fold in x+1, -x, and ~x when applicable.
+  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
+      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+    // Single-cycle csel, csinc, csinv, and csneg.
+    CondCycles = 1 + ExtraCondLat;
+    TrueCycles = FalseCycles = 1;
+    if (canFoldIntoCSel(MRI, TrueReg))
+      TrueCycles = 0;
+    else if (canFoldIntoCSel(MRI, FalseReg))
+      FalseCycles = 0;
+    return true;
+  }
+
+  // Scalar floating point is handled by fcsel.
+  // FIXME: Form fabs, fmin, and fmax when applicable.
+  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
+      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
+    CondCycles = 5 + ExtraCondLat;
+    TrueCycles = FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const DebugLoc &DL, unsigned DstReg,
+                                    ArrayRef<MachineOperand> Cond,
+                                    unsigned TrueReg, unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Parse the condition code, see parseCondBranch() above.
+  AArch64CC::CondCode CC;
+  switch (Cond.size()) {
+  default:
+    llvm_unreachable("Unknown condition opcode in Cond");
+  case 1: // b.cc
+    CC = AArch64CC::CondCode(Cond[0].getImm());
+    break;
+  case 3: { // cbz/cbnz
+    // We must insert a compare against 0.
+    bool Is64Bit;
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::CBZW:
+      Is64Bit = 0;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBZX:
+      Is64Bit = 1;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBNZW:
+      Is64Bit = 0;
+      CC = AArch64CC::NE;
+      break;
+    case AArch64::CBNZX:
+      Is64Bit = 1;
+      CC = AArch64CC::NE;
+      break;
+    }
+    unsigned SrcReg = Cond[2].getReg();
+    if (Is64Bit) {
+      // cmp reg, #0 is actually subs xzr, reg, #0.
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    } else {
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    }
+    break;
+  }
+  case 4: { // tbz/tbnz
+    // We must insert a tst instruction.
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::TBZW:
+    case AArch64::TBZX:
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::TBNZW:
+    case AArch64::TBNZX:
+      CC = AArch64CC::NE;
+      break;
+    }
+    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
+    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
+      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
+    else
+      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+    break;
+  }
+  }
+
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  bool TryFold = false;
+  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
+    RC = &AArch64::GPR64RegClass;
+    Opc = AArch64::CSELXr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
+    RC = &AArch64::GPR32RegClass;
+    Opc = AArch64::CSELWr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FCSELDrrr;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
+    RC = &AArch64::FPR32RegClass;
+    Opc = AArch64::FCSELSrrr;
+  }
+  assert(RC && "Unsupported regclass");
+
+  // Try folding simple instructions into the csel.
+  if (TryFold) {
+    unsigned NewVReg = 0;
+    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+    if (FoldedOpc) {
+      // The folded opcodes csinc, csinc and csneg apply the operation to
+      // FalseReg, so we need to invert the condition.
+      CC = AArch64CC::getInvertedCondCode(CC);
+      TrueReg = FalseReg;
+    } else
+      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+    // Fold the operation. Leave any dead instructions for DCE to clean up.
+    if (FoldedOpc) {
+      FalseReg = NewVReg;
+      Opc = FoldedOpc;
+      // The extends the live range of NewVReg.
+      MRI.clearKillFlags(NewVReg);
+    }
+  }
+
+  // Pull all virtual register into the appropriate class.
+  MRI.constrainRegClass(TrueReg, RC);
+  MRI.constrainRegClass(FalseReg, RC);
+
+  // Insert the csel.
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+      CC);
+}
+
+/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an  ORRxx.
+static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
+  uint64_t Imm = MI.getOperand(1).getImm();
+  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+  uint64_t Encoding;
+  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+}
+
+// FIXME: this implementation should be micro-architecture dependent, so a
+// micro-architecture target hook should be introduced here in future.
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
+    return MI.isAsCheapAsAMove();
+
+  unsigned Imm;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // add/sub on register without shift
+  case AArch64::ADDWri:
+  case AArch64::ADDXri:
+  case AArch64::SUBWri:
+  case AArch64::SUBXri:
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
+            MI.getOperand(3).getImm() == 0);
+
+  // add/sub on register with shift
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    Imm = MI.getOperand(3).getImm();
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+            AArch64_AM::getArithShiftValue(Imm) < 4);
+
+  // logical ops on immediate
+  case AArch64::ANDWri:
+  case AArch64::ANDXri:
+  case AArch64::EORWri:
+  case AArch64::EORXri:
+  case AArch64::ORRWri:
+  case AArch64::ORRXri:
+    return true;
+
+  // logical ops on register without shift
+  case AArch64::ANDWrr:
+  case AArch64::ANDXrr:
+  case AArch64::BICWrr:
+  case AArch64::BICXrr:
+  case AArch64::EONWrr:
+  case AArch64::EONXrr:
+  case AArch64::EORWrr:
+  case AArch64::EORXrr:
+  case AArch64::ORNWrr:
+  case AArch64::ORNXrr:
+  case AArch64::ORRWrr:
+  case AArch64::ORRXrr:
+    return true;
+
+  // logical ops on register with shift
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    Imm = MI.getOperand(3).getImm();
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
+            AArch64_AM::getShiftValue(Imm) < 4 &&
+            AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
+
+  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
+  // ORRXri, it is as cheap as MOV
+  case AArch64::MOVi32imm:
+    return canBeExpandedToORR(MI, 32);
+  case AArch64::MOVi64imm:
+    return canBeExpandedToORR(MI, 64);
+
+  // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing
+  // feature.
+  case AArch64::FMOVS0:
+  case AArch64::FMOVD0:
+    return Subtarget.hasZeroCycleZeroing();
+  case TargetOpcode::COPY:
+    return (Subtarget.hasZeroCycleZeroing() &&
+            (MI.getOperand(1).getReg() == AArch64::WZR ||
+             MI.getOperand(1).getReg() == AArch64::XZR));
+  }
+
+  llvm_unreachable("Unknown opcode to check as cheap as a move!");
+}
+
+bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                             unsigned &SrcReg, unsigned &DstReg,
+                                             unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::SBFMXri: // aka sxtw
+  case AArch64::UBFMXri: // aka uxtw
+    // Check for the 32 -> 64 bit extension case, these instructions can do
+    // much more.
+    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+      return false;
+    // This is a signed or unsigned 32 -> 64 bit extension.
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = AArch64::sub_32;
+    return true;
+  }
+}
+
+bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned BaseRegA = 0, BaseRegB = 0;
+  int64_t OffsetA = 0, OffsetB = 0;
+  unsigned WidthA = 0, WidthB = 0;
+
+  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Retrieve the base register, offset from the base register and width. Width
+  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
+  // base registers are identical, and the offset of a lower memory access +
+  // the width doesn't overlap the offset of a higher memory access,
+  // then the memory accesses are different.
+  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+    if (BaseRegA == BaseRegB) {
+      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+      if (LowOffset + LowWidth <= HighOffset)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+/// Return true if the comparison instruction can be analyzed.
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &CmpMask,
+                                      int &CmpValue) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+    // Replace SUBSWrr with SUBWrr if NZCV is not used.
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = MI.getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    // FIXME: In order to convert CmpValue to 0 or 1
+    CmpValue = MI.getOperand(2).getImm() != 0;
+    return true;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+    // ANDS does not use the same encoding scheme as the others xxxS
+    // instructions.
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
+    // while the type of CmpValue is int. When converting uint64_t to int,
+    // the high 32 bits of uint64_t will be lost.
+    // In fact it causes a bug in spec2006-483.xalancbmk
+    // CmpValue is only used to compare with zero in OptimizeCompareInstr
+    CmpValue = AArch64_AM::decodeLogicalImmediate(
+                   MI.getOperand(2).getImm(),
+                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
+    return true;
+  }
+
+  return false;
+}
+
+static bool UpdateOperandRegClass(MachineInstr &Instr) {
+  MachineBasicBlock *MBB = Instr.getParent();
+  assert(MBB && "Can't get MachineBasicBlock here");
+  MachineFunction *MF = MBB->getParent();
+  assert(MF && "Can't get MachineFunction here");
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
+       ++OpIdx) {
+    MachineOperand &MO = Instr.getOperand(OpIdx);
+    const TargetRegisterClass *OpRegCstraints =
+        Instr.getRegClassConstraint(OpIdx, TII, TRI);
+
+    // If there's no constraint, there's nothing to do.
+    if (!OpRegCstraints)
+      continue;
+    // If the operand is a frame index, there's nothing to do here.
+    // A frame index operand will resolve correctly during PEI.
+    if (MO.isFI())
+      continue;
+
+    assert(MO.isReg() &&
+           "Operand has register constraints without being a register!");
+
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (!OpRegCstraints->contains(Reg))
+        return false;
+    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
+               !MRI->constrainRegClass(Reg, OpRegCstraints))
+      return false;
+  }
+
+  return true;
+}
+
+/// \brief Return the opcode that does not set flags when possible - otherwise
+/// return the original opcode. The caller is responsible to do the actual
+/// substitution and legality checking.
+static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
+  // Don't convert all compare instructions, because for some the zero register
+  // encoding becomes the sp register.
+  bool MIDefinesZeroReg = false;
+  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
+    MIDefinesZeroReg = true;
+
+  switch (MI.getOpcode()) {
+  default:
+    return MI.getOpcode();
+  case AArch64::ADDSWrr:
+    return AArch64::ADDWrr;
+  case AArch64::ADDSWri:
+    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
+  case AArch64::ADDSWrs:
+    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
+  case AArch64::ADDSWrx:
+    return AArch64::ADDWrx;
+  case AArch64::ADDSXrr:
+    return AArch64::ADDXrr;
+  case AArch64::ADDSXri:
+    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
+  case AArch64::ADDSXrs:
+    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
+  case AArch64::ADDSXrx:
+    return AArch64::ADDXrx;
+  case AArch64::SUBSWrr:
+    return AArch64::SUBWrr;
+  case AArch64::SUBSWri:
+    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
+  case AArch64::SUBSWrs:
+    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
+  case AArch64::SUBSWrx:
+    return AArch64::SUBWrx;
+  case AArch64::SUBSXrr:
+    return AArch64::SUBXrr;
+  case AArch64::SUBSXri:
+    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
+  case AArch64::SUBSXrs:
+    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
+  case AArch64::SUBSXrx:
+    return AArch64::SUBXrx;
+  }
+}
+
+enum AccessKind {
+  AK_Write = 0x01,
+  AK_Read  = 0x10,
+  AK_All   = 0x11
+};
+
+/// True when condition flags are accessed (either by writing or reading)
+/// on the instruction trace starting at From and ending at To.
+///
+/// Note: If From and To are from different blocks it's assumed CC are accessed
+///       on the path.
+static bool areCFlagsAccessedBetweenInstrs(
+    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
+  // Early exit if To is at the beginning of the BB.
+  if (To == To->getParent()->begin())
+    return true;
+
+  // Check whether the instructions are in the same basic block
+  // If not, assume the condition flags might get modified somewhere.
+  if (To->getParent() != From->getParent())
+    return true;
+
+  // From must be above To.
+  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
+                      [From](MachineInstr &MI) {
+                        return MI.getIterator() == From;
+                      }) != To->getParent()->rend());
+
+  // We iterate backward starting \p To until we hit \p From.
+  for (--To; To != From; --To) {
+    const MachineInstr &Instr = *To;
+
+    if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
+         ((AccessToCheck & AK_Read)  && Instr.readsRegister(AArch64::NZCV, TRI)))
+      return true;
+  }
+  return false;
+}
+
+/// Try to optimize a compare instruction. A compare instruction is an
+/// instruction which produces AArch64::NZCV. It can be truly compare instruction
+/// when there are no uses of its destination register.
+///
+/// The following steps are tried in order:
+/// 1. Convert CmpInstr into an unconditional version.
+/// 2. Remove CmpInstr if above there is an instruction producing a needed
+///    condition code or an instruction which can be converted into such an instruction.
+///    Only comparison with zero is supported.
+bool AArch64InstrInfo::optimizeCompareInstr(
+    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+  assert(CmpInstr.getParent());
+  assert(MRI);
+
+  // Replace SUBSWrr with SUBWrr if NZCV is not used.
+  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
+  if (DeadNZCVIdx != -1) {
+    if (CmpInstr.definesRegister(AArch64::WZR) ||
+        CmpInstr.definesRegister(AArch64::XZR)) {
+      CmpInstr.eraseFromParent();
+      return true;
+    }
+    unsigned Opc = CmpInstr.getOpcode();
+    unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+    if (NewOpc == Opc)
+      return false;
+    const MCInstrDesc &MCID = get(NewOpc);
+    CmpInstr.setDesc(MCID);
+    CmpInstr.RemoveOperand(DeadNZCVIdx);
+    bool succeeded = UpdateOperandRegClass(CmpInstr);
+    (void)succeeded;
+    assert(succeeded && "Some operands reg class are incompatible!");
+    return true;
+  }
+
+  // Continue only if we have a "ri" where immediate is zero.
+  // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
+  // function.
+  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
+  if (CmpValue != 0 || SrcReg2 != 0)
+    return false;
+
+  // CmpInstr is a Compare instruction if destination register is not used.
+  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+    return false;
+
+  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
+}
+
+/// Get opcode of S version of Instr.
+/// If Instr is S version its opcode is returned.
+/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
+/// or we are not interested in it.
+static unsigned sForm(MachineInstr &Instr) {
+  switch (Instr.getOpcode()) {
+  default:
+    return AArch64::INSTRUCTION_LIST_END;
+
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXri:
+    return Instr.getOpcode();;
+
+  case AArch64::ADDWrr:    return AArch64::ADDSWrr;
+  case AArch64::ADDWri:    return AArch64::ADDSWri;
+  case AArch64::ADDXrr:    return AArch64::ADDSXrr;
+  case AArch64::ADDXri:    return AArch64::ADDSXri;
+  case AArch64::ADCWr:     return AArch64::ADCSWr;
+  case AArch64::ADCXr:     return AArch64::ADCSXr;
+  case AArch64::SUBWrr:    return AArch64::SUBSWrr;
+  case AArch64::SUBWri:    return AArch64::SUBSWri;
+  case AArch64::SUBXrr:    return AArch64::SUBSXrr;
+  case AArch64::SUBXri:    return AArch64::SUBSXri;
+  case AArch64::SBCWr:     return AArch64::SBCSWr;
+  case AArch64::SBCXr:     return AArch64::SBCSXr;
+  case AArch64::ANDWri:    return AArch64::ANDSWri;
+  case AArch64::ANDXri:    return AArch64::ANDSXri;
+  }
+}
+
+/// Check if AArch64::NZCV should be alive in successors of MBB.
+static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
+  for (auto *BB : MBB->successors())
+    if (BB->isLiveIn(AArch64::NZCV))
+      return true;
+  return false;
+}
+
+namespace {
+struct UsedNZCV {
+  bool N;
+  bool Z;
+  bool C;
+  bool V;
+  UsedNZCV(): N(false), Z(false), C(false), V(false) {}
+  UsedNZCV& operator |=(const UsedNZCV& UsedFlags) {
+    this->N |= UsedFlags.N;
+    this->Z |= UsedFlags.Z;
+    this->C |= UsedFlags.C;
+    this->V |= UsedFlags.V;
+    return *this;
+  }
+};
+} // end anonymous namespace
+
+/// Find a condition code used by the instruction.
+/// Returns AArch64CC::Invalid if either the instruction does not use condition
+/// codes or we don't optimize CmpInstr in the presence of such instructions.
+static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
+  switch (Instr.getOpcode()) {
+    default:
+      return AArch64CC::Invalid;
+
+    case AArch64::Bcc: {
+      int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+      assert(Idx >= 2);
+      return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
+    }
+
+    case AArch64::CSINVWr:
+    case AArch64::CSINVXr:
+    case AArch64::CSINCWr:
+    case AArch64::CSINCXr:
+    case AArch64::CSELWr:
+    case AArch64::CSELXr:
+    case AArch64::CSNEGWr:
+    case AArch64::CSNEGXr:
+    case AArch64::FCSELSrrr:
+    case AArch64::FCSELDrrr: {
+      int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
+      assert(Idx >= 1);
+      return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
+    }
+  }
+}
+
+static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
+  assert(CC != AArch64CC::Invalid);
+  UsedNZCV UsedFlags;
+  switch (CC) {
+    default:
+      break;
+
+    case AArch64CC::EQ: // Z set
+    case AArch64CC::NE: // Z clear
+      UsedFlags.Z = true;
+      break;
+
+    case AArch64CC::HI: // Z clear and C set
+    case AArch64CC::LS: // Z set   or  C clear
+      UsedFlags.Z = true;
+    case AArch64CC::HS: // C set
+    case AArch64CC::LO: // C clear
+      UsedFlags.C = true;
+      break;
+
+    case AArch64CC::MI: // N set
+    case AArch64CC::PL: // N clear
+      UsedFlags.N = true;
+      break;
+
+    case AArch64CC::VS: // V set
+    case AArch64CC::VC: // V clear
+      UsedFlags.V = true;
+      break;
+
+    case AArch64CC::GT: // Z clear, N and V the same
+    case AArch64CC::LE: // Z set,   N and V differ
+      UsedFlags.Z = true;
+    case AArch64CC::GE: // N and V the same
+    case AArch64CC::LT: // N and V differ 
+      UsedFlags.N = true;
+      UsedFlags.V = true;
+      break;
+  }
+  return UsedFlags;
+}
+
+static bool isADDSRegImm(unsigned Opcode) {
+  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
+}
+
+static bool isSUBSRegImm(unsigned Opcode) {
+  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
+}
+
+/// Check if CmpInstr can be substituted by MI.
+///
+/// CmpInstr can be substituted:
+/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
+/// - and, MI and CmpInstr are from the same MachineBB
+/// - and, condition flags are not alive in successors of the CmpInstr parent
+/// - and, if MI opcode is the S form there must be no defs of flags between
+///        MI and CmpInstr
+///        or if MI opcode is not the S form there must be neither defs of flags
+///        nor uses of flags between MI and CmpInstr.
+/// - and  C/V flags are not used after CmpInstr
+static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
+    const TargetRegisterInfo *TRI) {
+  assert(MI);
+  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
+  assert(CmpInstr);
+
+  const unsigned CmpOpcode = CmpInstr->getOpcode();
+  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
+    return false;
+
+  if (MI->getParent() != CmpInstr->getParent())
+    return false;
+
+  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
+    return false;
+
+  AccessKind AccessToCheck = AK_Write;
+  if (sForm(*MI) != MI->getOpcode())
+    AccessToCheck = AK_All;
+  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
+    return false;
+
+  UsedNZCV NZCVUsedAfterCmp;
+  for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end();
+       I != E; ++I) {
+    const MachineInstr &Instr = *I;
+    if (Instr.readsRegister(AArch64::NZCV, TRI)) {
+      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
+      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
+        return false;
+      NZCVUsedAfterCmp |= getUsedNZCV(CC);
+    }
+
+    if (Instr.modifiesRegister(AArch64::NZCV, TRI))
+      break;
+  }
+  
+  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
+}
+
+/// Substitute an instruction comparing to zero with another instruction
+/// which produces needed condition flags.
+///
+/// Return true on success.
+bool AArch64InstrInfo::substituteCmpToZero(
+    MachineInstr &CmpInstr, unsigned SrcReg,
+    const MachineRegisterInfo *MRI) const {
+  assert(MRI);
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI)
+    return false;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  unsigned NewOpc = sForm(*MI);
+  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
+    return false;
+
+  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
+    return false;
+
+  // Update the instruction to set NZCV.
+  MI->setDesc(get(NewOpc));
+  CmpInstr.eraseFromParent();
+  bool succeeded = UpdateOperandRegClass(*MI);
+  (void)succeeded;
+  assert(succeeded && "Some operands reg class are incompatible!");
+  MI->addRegisterDefined(AArch64::NZCV, TRI);
+  return true;
+}
+
+bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+    return false;
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI.memoperands_begin())->getValue());
+  const TargetMachine &TM = MBB.getParent()->getTarget();
+  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+  const unsigned char MO_NC = AArch64II::MO_NC;
+
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT);
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addImm(0)
+        .addMemOperand(*MI.memoperands_begin());
+  } else if (TM.getCodeModel() == CodeModel::Large) {
+    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addImm(0)
+        .addMemOperand(*MI.memoperands_begin());
+  } else {
+    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, LoFlags)
+        .addMemOperand(*MI.memoperands_begin());
+  }
+
+  MBB.erase(MI);
+
+  return true;
+}
+
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXrs:
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::CRC32Brr:
+  case AArch64::CRC32CBrr:
+  case AArch64::CRC32CHrr:
+  case AArch64::CRC32CWrr:
+  case AArch64::CRC32CXrr:
+  case AArch64::CRC32Hrr:
+  case AArch64::CRC32Wrr:
+  case AArch64::CRC32Xrr:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    if (MI.getOperand(3).isImm()) {
+      unsigned val = MI.getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrx:
+  case AArch64::ADDSXrx64:
+  case AArch64::ADDWrx:
+  case AArch64::ADDXrx:
+  case AArch64::ADDXrx64:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrx:
+  case AArch64::SUBSXrx64:
+  case AArch64::SUBWrx:
+  case AArch64::SUBXrx:
+  case AArch64::SUBXrx64:
+    if (MI.getOperand(3).isImm()) {
+      unsigned val = MI.getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+
+  return false;
+}
+
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVZWi:
+  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
+    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
+      assert(MI.getDesc().getNumOperands() == 3 &&
+             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+      return true;
+    }
+    break;
+  case AArch64::ANDWri: // and Rd, Rzr, #imm
+    return MI.getOperand(1).getReg() == AArch64::WZR;
+  case AArch64::ANDXri:
+    return MI.getOperand(1).getReg() == AArch64::XZR;
+  case TargetOpcode::COPY:
+    return MI.getOperand(1).getReg() == AArch64::WZR;
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // GPR32 copies will by lowered to ORRXrs
+    unsigned DstReg = MI.getOperand(0).getReg();
+    return (AArch64::GPR32RegClass.contains(DstReg) ||
+            AArch64::GPR64RegClass.contains(DstReg));
+  }
+  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+    if (MI.getOperand(1).getReg() == AArch64::XZR) {
+      assert(MI.getDesc().getNumOperands() == 4 &&
+             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+      return true;
+    }
+    break;
+  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+    if (MI.getOperand(2).getImm() == 0) {
+      assert(MI.getDesc().getNumOperands() == 4 &&
+             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+      return true;
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // FPR64 copies will by lowered to ORR.16b
+    unsigned DstReg = MI.getOperand(0).getReg();
+    return (AArch64::FPR64RegClass.contains(DstReg) ||
+            AArch64::FPR128RegClass.contains(DstReg));
+  }
+  case AArch64::ORRv16i8:
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
+             "invalid ORRv16i8 operands");
+      return true;
+    }
+    break;
+  }
+  return false;
+}
+
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
+        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// Return true if this is load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRBBroW:
+  case AArch64::LDRBroW:
+  case AArch64::LDRDroW:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRHroW:
+  case AArch64::LDRQroW:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSWroW:
+  case AArch64::LDRSroW:
+  case AArch64::LDRWroW:
+  case AArch64::LDRXroW:
+  case AArch64::STRBBroW:
+  case AArch64::STRBroW:
+  case AArch64::STRDroW:
+  case AArch64::STRHHroW:
+  case AArch64::STRHroW:
+  case AArch64::STRQroW:
+  case AArch64::STRSroW:
+  case AArch64::STRWroW:
+  case AArch64::STRXroW:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRBroX:
+  case AArch64::LDRDroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRHroX:
+  case AArch64::LDRQroX:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRSroX:
+  case AArch64::LDRWroX:
+  case AArch64::LDRXroX:
+  case AArch64::STRBBroX:
+  case AArch64::STRBroX:
+  case AArch64::STRDroX:
+  case AArch64::STRHHroX:
+  case AArch64::STRHroX:
+  case AArch64::STRQroX:
+  case AArch64::STRSroX:
+  case AArch64::STRWroX:
+  case AArch64::STRXroX:
+
+    unsigned Val = MI.getOperand(3).getImm();
+    AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
+    return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
+  }
+  return false;
+}
+
+/// Check all MachineMemOperands for a hint to suppress pairing.
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const {
+  return any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+    return MMO->getFlags() & MOSuppressPair;
+  });
+}
+
+/// Set a flag on the first MachineMemOperand to suppress pairing.
+void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
+  if (MI.memoperands_empty())
+    return;
+  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
+}
+
+bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURBBi:
+  case AArch64::STURHHi:
+  case AArch64::STURWi:
+  case AArch64::STURXi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
+  case AArch64::LDURSWi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSHWi:
+    return true;
+  }
+}
+
+bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
+  return isUnscaledLdSt(MI.getOpcode());
+}
+
+// Is this a candidate for ld/st merging or pairing?  For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+  // If this is a volatile load/store, don't mess with it.
+  if (MI.hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+  if (!MI.getOperand(2).isImm())
+    return false;
+
+  // Can't merge/pair if the instruction modifies the base register.
+  // e.g., ldr x0, [x0]
+  unsigned BaseReg = MI.getOperand(1).getReg();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (MI.modifiesRegister(BaseReg, TRI))
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (isLdStPairSuppressed(MI))
+    return false;
+
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
+    switch (MI.getOpcode()) {
+    default:
+      break;
+    case AArch64::LDURQi:
+    case AArch64::STURQi:
+    case AArch64::LDRQui:
+    case AArch64::STRQui:
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+    const TargetRegisterInfo *TRI) const {
+  unsigned Width;
+  return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+}
+
+bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
+  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+  // Handle only loads/stores with base register followed by immediate offset.
+  if (LdSt.getNumExplicitOperands() == 3) {
+    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
+    if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm())
+      return false;
+  } else if (LdSt.getNumExplicitOperands() == 4) {
+    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
+    if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() ||
+        !LdSt.getOperand(3).isImm())
+      return false;
+  } else
+    return false;
+
+  // Offset is calculated as the immediate operand multiplied by the scaling factor.
+  // Unscaled instructions have scaling factor set to 1.
+  unsigned Scale = 0;
+  switch (LdSt.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+    Width = 16;
+    Scale = 1;
+    break;
+  case AArch64::LDURXi:
+  case AArch64::LDURDi:
+  case AArch64::STURXi:
+  case AArch64::STURDi:
+    Width = 8;
+    Scale = 1;
+    break;
+  case AArch64::LDURWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURSWi:
+  case AArch64::STURWi:
+  case AArch64::STURSi:
+    Width = 4;
+    Scale = 1;
+    break;
+  case AArch64::LDURHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURHi:
+  case AArch64::STURHHi:
+    Width = 2;
+    Scale = 1;
+    break;
+  case AArch64::LDURBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURBi:
+  case AArch64::STURBBi:
+    Width = 1;
+    Scale = 1;
+    break;
+  case AArch64::LDPQi:
+  case AArch64::LDNPQi:
+  case AArch64::STPQi:
+  case AArch64::STNPQi:
+    Scale = 16;
+    Width = 32;
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    Scale = Width = 16;
+    break;
+  case AArch64::LDPXi:
+  case AArch64::LDPDi:
+  case AArch64::LDNPXi:
+  case AArch64::LDNPDi:
+  case AArch64::STPXi:
+  case AArch64::STPDi:
+  case AArch64::STNPXi:
+  case AArch64::STNPDi:
+    Scale = 8;
+    Width = 16;
+    break;
+  case AArch64::LDRXui:
+  case AArch64::LDRDui:
+  case AArch64::STRXui:
+  case AArch64::STRDui:
+    Scale = Width = 8;
+    break;
+  case AArch64::LDPWi:
+  case AArch64::LDPSi:
+  case AArch64::LDNPWi:
+  case AArch64::LDNPSi:
+  case AArch64::STPWi:
+  case AArch64::STPSi:
+  case AArch64::STNPWi:
+  case AArch64::STNPSi:
+    Scale = 4;
+    Width = 8;
+    break;
+  case AArch64::LDRWui:
+  case AArch64::LDRSui:
+  case AArch64::LDRSWui:
+  case AArch64::STRWui:
+  case AArch64::STRSui:
+    Scale = Width = 4;
+    break;
+  case AArch64::LDRHui:
+  case AArch64::LDRHHui:
+  case AArch64::STRHui:
+  case AArch64::STRHHui:
+    Scale = Width = 2;
+    break;
+  case AArch64::LDRBui:
+  case AArch64::LDRBBui:
+  case AArch64::STRBui:
+  case AArch64::STRBBui:
+    Scale = Width = 1;
+    break;
+  }
+
+  if (LdSt.getNumExplicitOperands() == 3) {
+    BaseReg = LdSt.getOperand(1).getReg();
+    Offset = LdSt.getOperand(2).getImm() * Scale;
+  } else {
+    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+    BaseReg = LdSt.getOperand(2).getReg();
+    Offset = LdSt.getOperand(3).getImm() * Scale;
+  }
+  return true;
+}
+
+// Scale the unscaled offsets.  Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+  unsigned OffsetStride = 1;
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+    OffsetStride = 16;
+    break;
+  case AArch64::LDURXi:
+  case AArch64::LDURDi:
+  case AArch64::STURXi:
+  case AArch64::STURDi:
+    OffsetStride = 8;
+    break;
+  case AArch64::LDURWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURSWi:
+  case AArch64::STURWi:
+  case AArch64::STURSi:
+    OffsetStride = 4;
+    break;
+  }
+  // If the byte-offset isn't a multiple of the stride, we can't scale this
+  // offset.
+  if (Offset % OffsetStride != 0)
+    return false;
+
+  // Convert the byte-offset used by unscaled into an "element" offset used
+  // by the scaled pair load/store instructions.
+  Offset /= OffsetStride;
+  return true;
+}
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+  if (FirstOpc == SecondOpc)
+    return true;
+  // We can also pair sign-ext and zero-ext instructions.
+  switch (FirstOpc) {
+  default:
+    return false;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+  }
+  // These instructions can't be paired based on their opcodes.
+  return false;
+}
+
+/// Detect opportunities for ldp/stp formation.
+///
+/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
+bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                           MachineInstr &SecondLdSt,
+                                           unsigned NumLoads) const {
+  // Only cluster up to a single pair.
+  if (NumLoads > 1)
+    return false;
+
+  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
+    return false;
+
+  // Can we pair these instructions based on their opcodes?
+  unsigned FirstOpc = FirstLdSt.getOpcode();
+  unsigned SecondOpc = SecondLdSt.getOpcode();
+  if (!canPairLdStOpc(FirstOpc, SecondOpc))
+    return false;
+
+  // Can't merge volatiles or load/stores that have a hint to avoid pair
+  // formation, for example.
+  if (!isCandidateToMergeOrPair(FirstLdSt) ||
+      !isCandidateToMergeOrPair(SecondLdSt))
+    return false;
+
+  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
+  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
+  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+    return false;
+
+  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
+  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
+    return false;
+
+  // Pairwise instructions have a 7-bit signed offset field.
+  if (Offset1 > 63 || Offset1 < -64)
+    return false;
+
+  // The caller should already have ordered First/SecondLdSt by offset.
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + 1 == Offset2;
+}
+
+bool AArch64InstrInfo::shouldScheduleAdjacent(
+    const MachineInstr &First, const MachineInstr &Second) const {
+  if (Subtarget.hasArithmeticBccFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
+    unsigned SecondOpcode = Second.getOpcode();
+    if (SecondOpcode == AArch64::Bcc) {
+      switch (First.getOpcode()) {
+      default:
+        return false;
+      case AArch64::ADDSWri:
+      case AArch64::ADDSWrr:
+      case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
+      case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
+        return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
+      }
+    }
+  }
+  if (Subtarget.hasArithmeticCbzFusion()) {
+    // Fuse ALU operations followed by CBZ/CBNZ.
+    unsigned SecondOpcode = Second.getOpcode();
+    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
+      switch (First.getOpcode()) {
+      default:
+        return false;
+      case AArch64::ADDWri:
+      case AArch64::ADDWrr:
+      case AArch64::ADDXri:
+      case AArch64::ADDXrr:
+      case AArch64::ANDWri:
+      case AArch64::ANDWrr:
+      case AArch64::ANDXri:
+      case AArch64::ANDXrr:
+      case AArch64::EORWri:
+      case AArch64::EORWrr:
+      case AArch64::EORXri:
+      case AArch64::EORXrr:
+      case AArch64::ORRWri:
+      case AArch64::ORRWrr:
+      case AArch64::ORRXri:
+      case AArch64::ORRXrr:
+      case AArch64::SUBWri:
+      case AArch64::SUBWrr:
+      case AArch64::SUBXri:
+      case AArch64::SUBXrr:
+        return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
+      }
+    }
+  }
+  return false;
+}
+
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
+    MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
+    const MDNode *Expr, const DebugLoc &DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+                                .addFrameIndex(FrameIx)
+                                .addImm(0)
+                                .addImm(Offset)
+                                .addMetadata(Var)
+                                .addMetadata(Expr);
+  return &*MIB;
+}
+
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg, unsigned SubIdx,
+                                            unsigned State,
+                                            const TargetRegisterInfo *TRI) {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
+
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+                                        unsigned NumRegs) {
+  // We really want the positive remainder mod 32 here, that happens to be
+  // easily obtainable with a mask.
+  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
+
+void AArch64InstrInfo::copyPhysRegTuple(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
+    unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
+    llvm::ArrayRef<unsigned> Indices) const {
+  assert(Subtarget.hasNEON() &&
+         "Unexpected register copy without NEON");
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+  unsigned NumRegs = Indices.size();
+
+  int SubReg = 0, End = NumRegs, Incr = 1;
+  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
+    SubReg = NumRegs - 1;
+    End = -1;
+    Incr = -1;
+  }
+
+  for (; SubReg != End; SubReg += Incr) {
+    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
+    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+  }
+}
+
+void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  if (AArch64::GPR32spRegClass.contains(DestReg) &&
+      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+      // If either operand is WSP, expand to ADD #0.
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
+            .addReg(SrcRegX, RegState::Undef)
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc))
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+      }
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
+            .addReg(AArch64::XZR)
+            .addReg(SrcRegX, RegState::Undef)
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        // Otherwise, expand to ORR WZR.
+        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+            .addReg(AArch64::WZR)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+      }
+    }
+    return;
+  }
+
+  if (AArch64::GPR64spRegClass.contains(DestReg) &&
+      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
+    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+      // If either operand is SP, expand to ADD #0.
+      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      // Otherwise, expand to ORR XZR.
+      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+          .addReg(AArch64::XZR)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  // Copy a DDDD register quad by copying the individual sub-registers.
+  if (AArch64::DDDDRegClass.contains(DestReg) &&
+      AArch64::DDDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2, AArch64::dsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a DDD register triple by copying the individual sub-registers.
+  if (AArch64::DDDRegClass.contains(DestReg) &&
+      AArch64::DDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a DD register pair by copying the individual sub-registers.
+  if (AArch64::DDRegClass.contains(DestReg) &&
+      AArch64::DDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQQ register quad by copying the individual sub-registers.
+  if (AArch64::QQQQRegClass.contains(DestReg) &&
+      AArch64::QQQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2, AArch64::qsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQ register triple by copying the individual sub-registers.
+  if (AArch64::QQQRegClass.contains(DestReg) &&
+      AArch64::QQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQ register pair by copying the individual sub-registers.
+  if (AArch64::QQRegClass.contains(DestReg) &&
+      AArch64::QQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  if (AArch64::FPR128RegClass.contains(DestReg) &&
+      AArch64::FPR128RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .addReg(AArch64::SP)
+        .addImm(-16);
+      BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(DestReg, RegState::Define)
+        .addReg(AArch64::SP)
+        .addImm(16);
+    }
+    return;
+  }
+
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR16RegClass.contains(DestReg) &&
+      AArch64::FPR16RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  if (AArch64::FPR8RegClass.contains(DestReg) &&
+      AArch64::FPR8RegClass.contains(SrcReg)) {
+    if(Subtarget.hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
+
+  // Copies between GPR64 and FPR64.
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::GPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  // Copies between GPR32 and FPR32.
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::GPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (DestReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MSR))
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
+    return;
+  }
+
+  if (SrcReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
+    return;
+  }
+
+  llvm_unreachable("unimplemented reg-to-reg copy");
+}
+
+void AArch64InstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+    bool isKill, int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRWui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+      else
+        assert(SrcReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRXui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      else
+        assert(SrcReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov1d;
+      Offset = false;
+    }
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev1d;
+      Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv1d;
+      Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov2d;
+      Offset = false;
+    }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev2d;
+      Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv2d;
+      Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(SrcReg, getKillRegState(isKill))
+                                      .addFrameIndex(FI);
+
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void AArch64InstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+    int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRWui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
+      else
+        assert(DestReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRXui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
+      else
+        assert(DestReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov1d;
+      Offset = false;
+    }
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev1d;
+      Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv1d;
+      Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov2d;
+      Offset = false;
+    }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev2d;
+      Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv2d;
+      Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(DestReg, getDefRegState(true))
+                                      .addFrameIndex(FI);
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                           unsigned DestReg, unsigned SrcReg, int Offset,
+                           const TargetInstrInfo *TII,
+                           MachineInstr::MIFlag Flag, bool SetNZCV) {
+  if (DestReg == SrcReg && Offset == 0)
+    return;
+
+  assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
+         "SP increment/decrement not 16-byte aligned");
+
+  bool isSub = Offset < 0;
+  if (isSub)
+    Offset = -Offset;
+
+  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+  // scratch register.  If DestReg is a virtual register, use it as the
+  // scratch register; otherwise, create a new virtual register (to be
+  // replaced by the scavenger at the end of PEI).  That case can be optimized
+  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+  // register can be loaded with offset%8 and the add/sub can use an extending
+  // instruction with LSL#3.
+  // Currently the function handles any offsets but generates a poor sequence
+  // of code.
+  //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+  unsigned Opc;
+  if (SetNZCV)
+    Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
+  else
+    Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
+  const unsigned MaxEncoding = 0xfff;
+  const unsigned ShiftSize = 12;
+  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+  while (((unsigned)Offset) >= (1 << ShiftSize)) {
+    unsigned ThisVal;
+    if (((unsigned)Offset) > MaxEncodableValue) {
+      ThisVal = MaxEncodableValue;
+    } else {
+      ThisVal = Offset & MaxEncodableValue;
+    }
+    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
+           "Encoding cannot handle value that big");
+    BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+        .addReg(SrcReg)
+        .addImm(ThisVal >> ShiftSize)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
+        .setMIFlag(Flag);
+
+    SrcReg = DestReg;
+    Offset -= ThisVal;
+    if (Offset == 0)
+      return;
+  }
+  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+      .addReg(SrcReg)
+      .addImm(Offset)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlag(Flag);
+}
+
+MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    LiveIntervals *LIS) const {
+  // This is a bit of a hack. Consider this instruction:
+  //
+  //   %vreg0<def> = COPY %SP; GPR64all:%vreg0
+  //
+  // We explicitly chose GPR64all for the virtual register so such a copy might
+  // be eliminated by RegisterCoalescer. However, that may not be possible, and
+  // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+  //
+  // To prevent that, we are going to constrain the %vreg0 register class here.
+  //
+  // <rdar://problem/11522048>
+  //
+  if (MI.isCopy()) {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = MI.getOperand(1).getReg();
+    if (SrcReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(DstReg)) {
+      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+    if (DstReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+  }
+
+  // Handle the case where a copy is being spilled or refilled but the source
+  // and destination register class don't match.  For example:
+  //
+  //   %vreg0<def> = COPY %XZR; GPR64common:%vreg0
+  //
+  // In this case we can still safely fold away the COPY and generate the
+  // following spill code:
+  //
+  //   STRXui %XZR, <fi#0>
+  //
+  // This also eliminates spilled cross register class COPYs (e.g. between x and
+  // d regs) of the same size.  For example:
+  //
+  //   %vreg0<def> = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1
+  //
+  // will be refilled as
+  //
+  //   LDRDui %vreg0, fi<#0>
+  //
+  // instead of
+  //
+  //   LDRXui %vregTemp, fi<#0>
+  //   %vreg0 = FMOV %vregTemp
+  //
+  if (MI.isFullCopy() && Ops.size() == 1 &&
+      // Make sure we're only folding the explicit COPY defs/uses.
+      (Ops[0] == 0 || Ops[0] == 1)) {
+    const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    MachineBasicBlock &MBB = *MI.getParent();
+    const MachineOperand &DstMO = MI.getOperand(0);
+    const MachineOperand &SrcMO = MI.getOperand(1);
+    unsigned DstReg = DstMO.getReg();
+    unsigned SrcReg = SrcMO.getReg();
+    auto getRegClass = [&](unsigned Reg) {
+      return TargetRegisterInfo::isVirtualRegister(Reg)
+                 ? MRI.getRegClass(Reg)
+                 : TRI.getMinimalPhysRegClass(Reg);
+    };
+    const TargetRegisterClass &DstRC = *getRegClass(DstReg);
+    const TargetRegisterClass &SrcRC = *getRegClass(SrcReg);
+    if (DstRC.getSize() == SrcRC.getSize()) {
+      if (Ops[0] == 0)
+        storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
+                            &SrcRC, &TRI);
+      else
+        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, &DstRC, &TRI);
+      return &*--InsertPt;
+    }
+  }
+
+  // Cannot fold.
+  return nullptr;
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                                    bool *OutUseUnscaledOp,
+                                    unsigned *OutUnscaledOp,
+                                    int *EmittableOffset) {
+  int Scale = 1;
+  bool IsSigned = false;
+  // The ImmIdx should be changed case by case if it is not 2.
+  unsigned ImmIdx = 2;
+  unsigned UnscaledOp = 0;
+  // Set output values in case of early exit.
+  if (EmittableOffset)
+    *EmittableOffset = 0;
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = false;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = 0;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
+  // Vector spills/fills can't take an immediate offset.
+  case AArch64::LD1Twov2d:
+  case AArch64::LD1Threev2d:
+  case AArch64::LD1Fourv2d:
+  case AArch64::LD1Twov1d:
+  case AArch64::LD1Threev1d:
+  case AArch64::LD1Fourv1d:
+  case AArch64::ST1Twov2d:
+  case AArch64::ST1Threev2d:
+  case AArch64::ST1Fourv2d:
+  case AArch64::ST1Twov1d:
+  case AArch64::ST1Threev1d:
+  case AArch64::ST1Fourv1d:
+    return AArch64FrameOffsetCannotUpdate;
+  case AArch64::PRFMui:
+    Scale = 8;
+    UnscaledOp = AArch64::PRFUMi;
+    break;
+  case AArch64::LDRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURXi;
+    break;
+  case AArch64::LDRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURWi;
+    break;
+  case AArch64::LDRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBi;
+    break;
+  case AArch64::LDRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHi;
+    break;
+  case AArch64::LDRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSi;
+    break;
+  case AArch64::LDRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURDi;
+    break;
+  case AArch64::LDRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::LDURQi;
+    break;
+  case AArch64::LDRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBBi;
+    break;
+  case AArch64::LDRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHHi;
+    break;
+  case AArch64::LDRSBXui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBXi;
+    break;
+  case AArch64::LDRSBWui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBWi;
+    break;
+  case AArch64::LDRSHXui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHXi;
+    break;
+  case AArch64::LDRSHWui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHWi;
+    break;
+  case AArch64::LDRSWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSWi;
+    break;
+
+  case AArch64::STRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURXi;
+    break;
+  case AArch64::STRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURWi;
+    break;
+  case AArch64::STRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBi;
+    break;
+  case AArch64::STRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHi;
+    break;
+  case AArch64::STRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURSi;
+    break;
+  case AArch64::STRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURDi;
+    break;
+  case AArch64::STRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::STURQi;
+    break;
+  case AArch64::STRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBBi;
+    break;
+  case AArch64::STRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHHi;
+    break;
+
+  case AArch64::LDPXi:
+  case AArch64::LDPDi:
+  case AArch64::STPXi:
+  case AArch64::STPDi:
+  case AArch64::LDNPXi:
+  case AArch64::LDNPDi:
+  case AArch64::STNPXi:
+  case AArch64::STNPDi:
+    ImmIdx = 3;
+    IsSigned = true;
+    Scale = 8;
+    break;
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::LDNPQi:
+  case AArch64::STNPQi:
+    ImmIdx = 3;
+    IsSigned = true;
+    Scale = 16;
+    break;
+  case AArch64::LDPWi:
+  case AArch64::LDPSi:
+  case AArch64::STPWi:
+  case AArch64::STPSi:
+  case AArch64::LDNPWi:
+  case AArch64::LDNPSi:
+  case AArch64::STNPWi:
+  case AArch64::STNPSi:
+    ImmIdx = 3;
+    IsSigned = true;
+    Scale = 4;
+    break;
+
+  case AArch64::LDURXi:
+  case AArch64::LDURWi:
+  case AArch64::LDURBi:
+  case AArch64::LDURHi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::STURWi:
+  case AArch64::STURBi:
+  case AArch64::STURHi:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURBBi:
+  case AArch64::STURHHi:
+    Scale = 1;
+    break;
+  }
+
+  Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+  bool useUnscaledOp = false;
+  // If the offset doesn't match the scale, we rewrite the instruction to
+  // use the unscaled instruction instead. Likewise, if we have a negative
+  // offset (and have an unscaled op to use).
+  if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+    useUnscaledOp = true;
+
+  // Use an unscaled addressing mode if the instruction has a negative offset
+  // (or if the instruction is already using an unscaled addressing mode).
+  unsigned MaskBits;
+  if (IsSigned) {
+    // ldp/stp instructions.
+    MaskBits = 7;
+    Offset /= Scale;
+  } else if (UnscaledOp == 0 || useUnscaledOp) {
+    MaskBits = 9;
+    IsSigned = true;
+    Scale = 1;
+  } else {
+    MaskBits = 12;
+    IsSigned = false;
+    Offset /= Scale;
+  }
+
+  // Attempt to fold address computation.
+  int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+  int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+  if (Offset >= MinOff && Offset <= MaxOff) {
+    if (EmittableOffset)
+      *EmittableOffset = Offset;
+    Offset = 0;
+  } else {
+    int NewOff = Offset < 0 ? MinOff : MaxOff;
+    if (EmittableOffset)
+      *EmittableOffset = NewOff;
+    Offset = (Offset - NewOff) * Scale;
+  }
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = useUnscaledOp;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = UnscaledOp;
+  return AArch64FrameOffsetCanUpdate |
+         (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+}
+
+bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                    unsigned FrameReg, int &Offset,
+                                    const AArch64InstrInfo *TII) {
+  unsigned Opcode = MI.getOpcode();
+  unsigned ImmIdx = FrameRegIdx + 1;
+
+  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
+    Offset += MI.getOperand(ImmIdx).getImm();
+    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
+                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
+                    MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
+    MI.eraseFromParent();
+    Offset = 0;
+    return true;
+  }
+
+  int NewOffset;
+  unsigned UnscaledOp;
+  bool UseUnscaledOp;
+  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
+                                         &UnscaledOp, &NewOffset);
+  if (Status & AArch64FrameOffsetCanUpdate) {
+    if (Status & AArch64FrameOffsetIsLegal)
+      // Replace the FrameIndex with FrameReg.
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+    if (UseUnscaledOp)
+      MI.setDesc(TII->get(UnscaledOp));
+
+    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+    return Offset == 0;
+  }
+
+  return false;
+}
+
+void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(AArch64::HINT);
+  NopInst.addOperand(MCOperand::createImm(0));
+}
+
+// AArch64 supports MachineCombiner.
+bool AArch64InstrInfo::useMachineCombiner() const {
+
+  return true;
+}
+//
+// True when Opc sets flag
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// 32b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate32(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDWrr:
+  case AArch64::ADDWri:
+  case AArch64::SUBWrr:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSWrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBWri:
+  case AArch64::SUBSWri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// 64b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate64(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::ADDXrr:
+  case AArch64::ADDXri:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBXri:
+  case AArch64::SUBSXri:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+//
+// FP Opcodes that can be combined with a FMUL
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::FADDSrr:
+  case AArch64::FADDDrr:
+  case AArch64::FADDv2f32:
+  case AArch64::FADDv2f64:
+  case AArch64::FADDv4f32:
+  case AArch64::FSUBSrr:
+  case AArch64::FSUBDrr:
+  case AArch64::FSUBv2f32:
+  case AArch64::FSUBv2f64:
+  case AArch64::FSUBv4f32:
+		TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 
+    return (Options.UnsafeFPMath || 
+				    Options.AllowFPOpFusion == FPOpFusion::Fast);
+  }
+  return false;
+}
+//
+// Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate(unsigned Opc) {
+  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
+}
+
+//
+// Utility routine that checks if \param MO is defined by an
+// \param CombineOpc instruction in the basic block \param MBB
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+                       unsigned CombineOpc, unsigned ZeroReg = 0,
+                       bool CheckZeroReg = false) {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineInstr *MI = nullptr;
+
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    MI = MRI.getUniqueVRegDef(MO.getReg());
+  // And it needs to be in the trace (otherwise, it won't have a depth).
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
+    return false;
+  // Must only used by the user we combine with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  if (CheckZeroReg) {
+    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+           MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+    // The third input reg must be zero.
+    if (MI->getOperand(3).getReg() != ZeroReg)
+      return false;
+  }
+
+  return true;
+}
+
+//
+// Is \param MO defined by an integer multiply and can be combined?
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                              unsigned MulOpc, unsigned ZeroReg) {
+  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
+}
+
+//
+// Is \param MO defined by a floating-point multiply and can be combined?
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                               unsigned MulOpc) {
+  return canCombine(MBB, MO, MulOpc);
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+//       1. Other data types (integer, vectors)
+//       2. Other math / logic operations (xor, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  case AArch64::FADDDrr:
+  case AArch64::FADDSrr:
+  case AArch64::FADDv2f32:
+  case AArch64::FADDv2f64:
+  case AArch64::FADDv4f32:
+  case AArch64::FMULDrr:
+  case AArch64::FMULSrr:
+  case AArch64::FMULX32:
+  case AArch64::FMULX64:
+  case AArch64::FMULXv2f32:
+  case AArch64::FMULXv2f64:
+  case AArch64::FMULXv4f32:
+  case AArch64::FMULv2f32:
+  case AArch64::FMULv2f64:
+  case AArch64::FMULv4f32:
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+  default:
+    return false;
+  }
+}
+
+/// Find instructions that can be turned into madd.
+static bool getMaddPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  if (!isCombineInstrCandidate(Opc))
+    return false;
+  if (isCombineInstrSettingFlag(Opc)) {
+    int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
+    // When NZCV is live bail out.
+    if (Cmp_NZCV == -1)
+      return false;
+    unsigned NewOpc = convertFlagSettingOpcode(Root);
+    // When opcode can't change bail out.
+    // CHECKME: do we miss any cases for opcode conversion?
+    if (NewOpc == Opc)
+      return false;
+    Opc = NewOpc;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  case AArch64::ADDWrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "ADDWrr does not have register operands");
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDW_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDW_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDXrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDX_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDX_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBWrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBXrr:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1);
+      Found = true;
+    }
+    if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDWri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::ADDXri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBWri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+                          AArch64::WZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1);
+      Found = true;
+    }
+    break;
+  case AArch64::SUBXri:
+    if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+                          AArch64::XZR)) {
+      Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1);
+      Found = true;
+    }
+    break;
+  }
+  return Found;
+}
+/// Floating-Point Support
+
+/// Find instructions that can be turned into madd.
+static bool getFMAPatterns(MachineInstr &Root,
+                           SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+
+  if (!isCombineInstrCandidateFP(Root))
+    return 0;
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  switch (Root.getOpcode()) {
+  default:
+    assert(false && "Unsupported FP instruction in combiner\n");
+    break;
+  case AArch64::FADDSrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "FADDWrr does not have register operands");
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDDrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv2f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv2f64:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv4f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
+      Found = true;
+    }
+    break;
+
+  case AArch64::FSUBSrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBDrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv2f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv2f64:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv4f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
+      Found = true;
+    }
+    break;
+  }
+  return Found;
+}
+
+/// Return true when a code sequence can improve throughput. It
+/// should be called only for instructions in loops.
+/// \param Pattern - combiner pattern
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+  switch (Pattern) {
+  default:
+    break;
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP1:
+  case MachineCombinerPattern::FMULADDD_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP2:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+    return true;
+  } // end switch (Pattern)
+  return false;
+}
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Pattern vector. Pattern should be sorted in priority order since the
+/// pattern evaluator stops checking as soon as it finds a faster sequence.
+
+bool AArch64InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // Integer patterns
+  if (getMaddPatterns(Root, Patterns))
+    return true;
+  // Floating point patterns
+  if (getFMAPatterns(Root, Patterns))
+    return true;
+
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+///  F|MUL I=A,B,0
+///  F|ADD R,I,C
+///  ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the f|madd instruction
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+                 const TargetInstrInfo *TII, MachineInstr &Root,
+                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+                 unsigned MaddOpc, const TargetRegisterClass *RC,
+                 FMAInstKind kind = FMAInstKind::Default) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned SrcReg0 = MUL->getOperand(1).getReg();
+  bool Src0IsKill = MUL->getOperand(1).isKill();
+  unsigned SrcReg1 = MUL->getOperand(2).getReg();
+  bool Src1IsKill = MUL->getOperand(2).isKill();
+  unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
+  bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
+
+  if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+    MRI.constrainRegClass(ResultReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+    MRI.constrainRegClass(SrcReg0, RC);
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+    MRI.constrainRegClass(SrcReg1, RC);
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
+    MRI.constrainRegClass(SrcReg2, RC);
+
+  MachineInstrBuilder MIB;
+  if (kind == FMAInstKind::Default)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill))
+              .addReg(SrcReg2, getKillRegState(Src2IsKill));
+  else if (kind == FMAInstKind::Indexed)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg2, getKillRegState(Src2IsKill))
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill))
+              .addImm(MUL->getOperand(3).getImm());
+  else if (kind == FMAInstKind::Accumulator)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg2, getKillRegState(Src2IsKill))
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill));
+  else
+    assert(false && "Invalid FMA instruction kind \n");
+  // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// genMaddR - Generate madd instruction and combine mul and add using
+/// an extra virtual register
+/// Example - an ADD intermediate needs to be stored in a register:
+///   MUL I=A,B,0
+///   ADD R,I,Imm
+///   ==> ORR  V, ZR, Imm
+///   ==> MADD R,A,B,V
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+                              const TargetInstrInfo *TII, MachineInstr &Root,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs,
+                              unsigned IdxMulOpd, unsigned MaddOpc,
+                              unsigned VR, const TargetRegisterClass *RC) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned SrcReg0 = MUL->getOperand(1).getReg();
+  bool Src0IsKill = MUL->getOperand(1).isKill();
+  unsigned SrcReg1 = MUL->getOperand(2).getReg();
+  bool Src1IsKill = MUL->getOperand(2).isKill();
+
+  if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+    MRI.constrainRegClass(ResultReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+    MRI.constrainRegClass(SrcReg0, RC);
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+    MRI.constrainRegClass(SrcReg1, RC);
+  if (TargetRegisterInfo::isVirtualRegister(VR))
+    MRI.constrainRegClass(VR, RC);
+
+  MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
+                                    ResultReg)
+                                .addReg(SrcReg0, getKillRegState(Src0IsKill))
+                                .addReg(SrcReg1, getKillRegState(Src1IsKill))
+                                .addReg(VR);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence
+void AArch64InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  MachineInstr *MUL;
+  const TargetRegisterClass *RC;
+  unsigned Opc;
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::MULADDW_OP1:
+  case MachineCombinerPattern::MULADDX_OP1:
+    // MUL I=A,B,0
+    // ADD R,I,C
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::MULADDW_OP2:
+  case MachineCombinerPattern::MULADDX_OP2:
+    // MUL I=A,B,0
+    // ADD R,C,I
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::MULADDWI_OP1:
+  case MachineCombinerPattern::MULADDXI_OP1: {
+    // MUL I=A,B,0
+    // ADD R,I,Imm
+    // ==> ORR  V, ZR, Imm
+    // ==> MADD R,A,B,V
+    // --- Create(MADD);
+    const TargetRegisterClass *OrrRC;
+    unsigned BitSize, OrrOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
+      OrrOpc = AArch64::ORRWri;
+      OrrRC = &AArch64::GPR32spRegClass;
+      BitSize = 32;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      OrrOpc = AArch64::ORRXri;
+      OrrRC = &AArch64::GPR64spRegClass;
+      BitSize = 64;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+    uint64_t Imm = Root.getOperand(2).getImm();
+
+    if (Root.getOperand(3).isImm()) {
+      unsigned Val = Root.getOperand(3).getImm();
+      Imm = Imm << Val;
+    }
+    uint64_t UImm = SignExtend64(Imm, BitSize);
+    uint64_t Encoding;
+    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+      MachineInstrBuilder MIB1 =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+              .addReg(ZeroReg)
+              .addImm(Encoding);
+      InsInstrs.push_back(MIB1);
+      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    }
+    break;
+  }
+  case MachineCombinerPattern::MULSUBW_OP1:
+  case MachineCombinerPattern::MULSUBX_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, C
+    // ==> SUB  V, 0, C
+    // ==> MADD R,A,B,V // = -C + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *SubRC;
+    unsigned SubOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
+      SubOpc = AArch64::SUBWrr;
+      SubRC = &AArch64::GPR32spRegClass;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      SubOpc = AArch64::SUBXrr;
+      SubRC = &AArch64::GPR64spRegClass;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(SubRC);
+    // SUB NewVR, 0, C
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
+            .addReg(ZeroReg)
+            .addOperand(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    break;
+  }
+  case MachineCombinerPattern::MULSUBW_OP2:
+  case MachineCombinerPattern::MULSUBX_OP2:
+    // MUL I=A,B,0
+    // SUB R,C,I
+    // ==> MSUB R,A,B,C (computes C - A*B)
+    // --- Create(MSUB);
+    if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
+      Opc = AArch64::MSUBWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MSUBXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::MULSUBWI_OP1:
+  case MachineCombinerPattern::MULSUBXI_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, Imm
+    // ==> ORR  V, ZR, -Imm
+    // ==> MADD R,A,B,V // = -Imm + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *OrrRC;
+    unsigned BitSize, OrrOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
+      OrrOpc = AArch64::ORRWri;
+      OrrRC = &AArch64::GPR32spRegClass;
+      BitSize = 32;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      OrrOpc = AArch64::ORRXri;
+      OrrRC = &AArch64::GPR64spRegClass;
+      BitSize = 64;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+    uint64_t Imm = Root.getOperand(2).getImm();
+    if (Root.getOperand(3).isImm()) {
+      unsigned Val = Root.getOperand(3).getImm();
+      Imm = Imm << Val;
+    }
+    uint64_t UImm = SignExtend64(-Imm, BitSize);
+    uint64_t Encoding;
+    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+      MachineInstrBuilder MIB1 =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+              .addReg(ZeroReg)
+              .addImm(Encoding);
+      InsInstrs.push_back(MIB1);
+      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    }
+    break;
+  }
+  // Floating Point Support
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDD_OP1:
+    // MUL I=A,B,0
+    // ADD R,I,C
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+      Opc = AArch64::FMADDSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMADDDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP2:
+    // FMUL I=A,B,0
+    // FADD R,C,I
+    // ==> FMADD R,A,B,C
+    // --- Create(FMADD);
+    if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+      Opc = AArch64::FMADDSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMADDDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+    Opc = AArch64::FMLAv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+    Opc = AArch64::FMLAv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+    Opc = AArch64::FMLAv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+    Opc = AArch64::FMLAv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+      Opc = AArch64::FMLAv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+      Opc = AArch64::FMLAv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+      Opc = AArch64::FMLAv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+      Opc = AArch64::FMLAv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+      Opc = AArch64::FMLAv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+      Opc = AArch64::FMLAv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP1: {
+    // FMUL I=A,B,0
+    // FSUB R,I,C
+    // ==> FNMSUB R,A,B,C // = -C + A*B
+    // --- Create(FNMSUB);
+    if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+      Opc = AArch64::FNMSUBSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FNMSUBDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  }
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP2: {
+    // FMUL I=A,B,0
+    // FSUB R,C,I
+    // ==> FMSUB R,A,B,C (computes C - A*B)
+    // --- Create(FMSUB);
+    if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+      Opc = AArch64::FMSUBSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMSUBDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+    Opc = AArch64::FMLSv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+    Opc = AArch64::FMLSv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+      Opc = AArch64::FMLSv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+      Opc = AArch64::FMLSv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+      Opc = AArch64::FMLSv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  }
+  } // end switch (Pattern)
+  // Record MUL and ADD/SUB for deletion
+  DelInstrs.push_back(MUL);
+  DelInstrs.push_back(&Root);
+
+  return;
+}
+
+/// \brief Replace csincr-branch sequence by simple conditional branch
+///
+/// Examples:
+/// 1.
+///   csinc  w9, wzr, wzr, <condition code>
+///   tbnz   w9, #0, 0x44
+/// to
+///   b.<inverted condition code>
+///
+/// 2.
+///   csinc w9, wzr, wzr, <condition code>
+///   tbz   w9, #0, 0x44
+/// to
+///   b.<condition code>
+///
+/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
+/// compare's constant operand is power of 2.
+///
+/// Examples:
+///   and  w8, w8, #0x400
+///   cbnz w8, L1
+/// to
+///   tbnz w8, #10, L1
+///
+/// \param  MI Conditional Branch
+/// \return True when the simple conditional branch is generated
+///
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
+  bool IsNegativeBranch = false;
+  bool IsTestAndBranch = false;
+  unsigned TargetBBInMI = 0;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    TargetBBInMI = 1;
+    break;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    TargetBBInMI = 1;
+    IsNegativeBranch = true;
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+    TargetBBInMI = 2;
+    IsTestAndBranch = true;
+    break;
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    TargetBBInMI = 2;
+    IsNegativeBranch = true;
+    IsTestAndBranch = true;
+    break;
+  }
+  // So we increment a zero register and test for bits other
+  // than bit 0? Conservatively bail out in case the verifier
+  // missed this case.
+  if (IsTestAndBranch && MI.getOperand(1).getImm())
+    return false;
+
+  // Find Definition.
+  assert(MI.getParent() && "Incomplete machine instruciton\n");
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  unsigned VReg = MI.getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+    return false;
+
+  MachineInstr *DefMI = MRI->getVRegDef(VReg);
+
+  // Look through COPY instructions to find definition.
+  while (DefMI->isCopy()) {
+    unsigned CopyVReg = DefMI->getOperand(1).getReg();
+    if (!MRI->hasOneNonDBGUse(CopyVReg))
+      return false;
+    if (!MRI->hasOneDef(CopyVReg))
+      return false;
+    DefMI = MRI->getVRegDef(CopyVReg);
+  }
+
+  switch (DefMI->getOpcode()) {
+  default:
+    return false;
+  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
+  case AArch64::ANDWri:
+  case AArch64::ANDXri: {
+    if (IsTestAndBranch)
+      return false;
+    if (DefMI->getParent() != MBB)
+      return false;
+    if (!MRI->hasOneNonDBGUse(VReg))
+      return false;
+
+    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
+    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
+        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
+    if (!isPowerOf2_64(Mask))
+      return false;
+
+    MachineOperand &MO = DefMI->getOperand(1);
+    unsigned NewReg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(NewReg))
+      return false;
+
+    assert(!MRI->def_empty(NewReg) && "Register must be defined.");
+
+    MachineBasicBlock &RefToMBB = *MBB;
+    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
+    DebugLoc DL = MI.getDebugLoc();
+    unsigned Imm = Log2_64(Mask);
+    unsigned Opc = (Imm < 32)
+                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
+                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
+    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
+                              .addReg(NewReg)
+                              .addImm(Imm)
+                              .addMBB(TBB);
+    // Register lives on to the CBZ now.
+    MO.setIsKill(false);
+
+    // For immediate smaller than 32, we need to use the 32-bit
+    // variant (W) in all cases. Indeed the 64-bit variant does not
+    // allow to encode them.
+    // Therefore, if the input register is 64-bit, we need to take the
+    // 32-bit sub-part.
+    if (!Is32Bit && Imm < 32)
+      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
+    MI.eraseFromParent();
+    return true;
+  }
+  // Look for CSINC
+  case AArch64::CSINCWr:
+  case AArch64::CSINCXr: {
+    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
+          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
+        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
+          DefMI->getOperand(2).getReg() == AArch64::XZR))
+      return false;
+
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+      return false;
+
+    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+    // Convert only when the condition code is not modified between
+    // the CSINC and the branch. The CC may be used by other
+    // instructions in between.
+    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
+      return false;
+    MachineBasicBlock &RefToMBB = *MBB;
+    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
+    DebugLoc DL = MI.getDebugLoc();
+    if (IsNegativeBranch)
+      CC = AArch64CC::getInvertedCondCode(CC);
+    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+    MI.eraseFromParent();
+    return true;
+  }
+  }
+}
+
+std::pair<unsigned, unsigned>
+AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  const unsigned Mask = AArch64II::MO_FRAGMENT;
+  return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  using namespace AArch64II;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_PAGE, "aarch64-page"},
+      {MO_PAGEOFF, "aarch64-pageoff"},
+      {MO_G3, "aarch64-g3"},
+      {MO_G2, "aarch64-g2"},
+      {MO_G1, "aarch64-g1"},
+      {MO_G0, "aarch64-g0"},
+      {MO_HI12, "aarch64-hi12"}};
+  return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+  using namespace AArch64II;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_GOT, "aarch64-got"},
+      {MO_NC, "aarch64-nc"},
+      {MO_TLS, "aarch64-tls"}};
+  return makeArrayRef(TargetFlags);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
new file mode 100644
index 000000000000..90b2c0896872
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -0,0 +1,318 @@
+//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AArch64GenInstrInfo.inc"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64InstrInfo final : public AArch64GenInstrInfo {
+  const AArch64RegisterInfo RI;
+  const AArch64Subtarget &Subtarget;
+
+public:
+  explicit AArch64InstrInfo(const AArch64Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const override;
+
+  bool
+  areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                  AliasAnalysis *AA = nullptr) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  /// Returns true if there is a shiftable register and that the shift value
+  /// is non-zero.
+  bool hasShiftedReg(const MachineInstr &MI) const;
+
+  /// Returns true if there is an extendable register and that the extending
+  /// value is non-zero.
+  bool hasExtendedReg(const MachineInstr &MI) const;
+
+  /// \brief Does this instruction set its full destination register to zero?
+  bool isGPRZero(const MachineInstr &MI) const;
+
+  /// \brief Does this instruction rename a GPR without modifying bits?
+  bool isGPRCopy(const MachineInstr &MI) const;
+
+  /// \brief Does this instruction rename an FPR without modifying bits?
+  bool isFPRCopy(const MachineInstr &MI) const;
+
+  /// Return true if this is load/store scales or extends its register offset.
+  /// This refers to scaling a dynamic index as opposed to scaled immediates.
+  /// MI should be a memory op that allows scaled addressing.
+  bool isScaledAddr(const MachineInstr &MI) const;
+
+  /// Return true if pairing the given load or store is hinted to be
+  /// unprofitable.
+  bool isLdStPairSuppressed(const MachineInstr &MI) const;
+
+  /// Return true if this is an unscaled load/store.
+  bool isUnscaledLdSt(unsigned Opc) const;
+
+  /// Return true if this is an unscaled load/store.
+  bool isUnscaledLdSt(MachineInstr &MI) const;
+
+  static bool isPairableLdStInst(const MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    // Scaled instructions.
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
+    case AArch64::LDRSWui:
+    // Unscaled instructions.
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi:
+    case AArch64::LDURSWi:
+      return true;
+    }
+  }
+
+  /// Return true if this is a load/store that can be potentially paired/merged.
+  bool isCandidateToMergeOrPair(MachineInstr &MI) const;
+
+  /// Hint that pairing the given load or store is unprofitable.
+  void suppressLdStPair(MachineInstr &MI) const;
+
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+
+  bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+                                  int64_t &Offset, unsigned &Width,
+                                  const TargetRegisterInfo *TRI) const;
+
+  bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+                           unsigned NumLoads) const override;
+
+  bool shouldScheduleAdjacent(const MachineInstr &First,
+                              const MachineInstr &Second) const override;
+
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *Var,
+                                         const MDNode *Expr,
+                                         const DebugLoc &DL) const;
+  void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                        const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                        bool KillSrc, unsigned Opcode,
+                        llvm::ArrayRef<unsigned> Indices) const;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  using TargetInstrInfo::foldMemoryOperandImpl;
+  MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                        ArrayRef<unsigned> Ops,
+                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        LiveIntervals *LIS = nullptr) const override;
+
+  /// \returns true if a branch from an instruction with opcode \p BranchOpc
+  ///  bytes is capable of jumping to a position \p BrOffset bytes away.
+  bool isBranchOffsetInRange(unsigned BranchOpc,
+                             int64_t BrOffset) const override;
+
+  MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+                       unsigned, unsigned, int &, int &, int &) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const DebugLoc &DL, unsigned DstReg,
+                    ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                    unsigned FalseReg) const override;
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+  /// Return true if the comparison instruction can be analyzed.
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+  /// optimizeCompareInstr - Convert the instruction supplying the argument to
+  /// the comparison into one that sets the zero bit in the flags register.
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+  bool optimizeCondBranch(MachineInstr &MI) const override;
+
+  /// Return true when a code sequence can improve throughput. It
+  /// should be called only for instructions in loops.
+  /// \param Pattern - combiner pattern
+  bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed in the <Patterns> array.
+  bool getMachineCombinerPatterns(MachineInstr &Root,
+                  SmallVectorImpl<MachineCombinerPattern> &Patterns)
+      const override;
+  /// Return true when Inst is associative and commutative so that it can be
+  /// reassociated.
+  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+  /// AArch64 supports MachineCombiner.
+  bool useMachineCombiner() const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableBitmaskMachineOperandTargetFlags() const override;
+
+private:
+  void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
+                             MachineBasicBlock *TBB,
+                             ArrayRef<MachineOperand> Cond) const;
+  bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg,
+                           const MachineRegisterInfo *MRI) const;
+};
+
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset.  This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                     int Offset, const TargetInstrInfo *TII,
+                     MachineInstr::MIFlag = MachineInstr::NoFlags,
+                     bool SetNZCV = false);
+
+/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                            unsigned FrameReg, int &Offset,
+                            const AArch64InstrInfo *TII);
+
+/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+enum AArch64FrameOffsetStatus {
+  AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+  AArch64FrameOffsetIsLegal = 0x1,      ///< Offset is legal.
+  AArch64FrameOffsetCanUpdate = 0x2     ///< Offset can apply, at least partly.
+};
+
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.eq
+/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
+/// rewriten in @p MI.
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that is off the limit of the legal offset.
+/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be
+/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                            bool *OutUseUnscaledOp = nullptr,
+                            unsigned *OutUnscaledOp = nullptr,
+                            int *EmittableOffset = nullptr);
+
+static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+
+static inline bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case AArch64::Bcc:
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; }
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
new file mode 100644
index 000000000000..c5b95f282ea8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -0,0 +1,6132 @@
+//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV8_1a         : Predicate<"Subtarget->hasV8_1aOps()">,
+                                 AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a         : Predicate<"Subtarget->hasV8_2aOps()">,
+                                 AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                               AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
+def HasLSE           : Predicate<"Subtarget->hasLSE()">,
+                                 AssemblerPredicate<"FeatureLSE", "lse">;
+def HasRAS           : Predicate<"Subtarget->hasRAS()">,
+                                 AssemblerPredicate<"FeatureRAS", "ras">;
+def HasPerfMon       : Predicate<"Subtarget->hasPerfMon()">;
+def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
+                                 AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+def HasSPE           : Predicate<"Subtarget->hasSPE()">,
+                                 AssemblerPredicate<"FeatureSPE", "spe">;
+
+def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
+def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+
+//===----------------------------------------------------------------------===//
+// AArch64-specific DAG Nodes.
+//
+
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+                                              [SDTCisSameAs<0, 2>,
+                                               SDTCisSameAs<0, 3>,
+                                               SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+def SDT_AArch64Brcond  : SDTypeProfile<0, 3,
+                                     [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>]>;
+def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                        SDTCisVT<2, OtherVT>]>;
+
+
+def SDT_AArch64CSel  : SDTypeProfile<1, 4,
+                                   [SDTCisSameAs<0, 1>,
+                                    SDTCisSameAs<0, 2>,
+                                    SDTCisInt<3>,
+                                    SDTCisVT<4, i32>]>;
+def SDT_AArch64CCMP : SDTypeProfile<1, 5,
+                                    [SDTCisVT<0, i32>,
+                                     SDTCisInt<1>,
+                                     SDTCisSameAs<1, 2>,
+                                     SDTCisInt<3>,
+                                     SDTCisInt<4>,
+                                     SDTCisVT<5, i32>]>;
+def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
+                                     [SDTCisVT<0, i32>,
+                                      SDTCisFP<1>,
+                                      SDTCisSameAs<1, 2>,
+                                      SDTCisInt<3>,
+                                      SDTCisInt<4>,
+                                      SDTCisVT<5, i32>]>;
+def SDT_AArch64FCmp   : SDTypeProfile<0, 2,
+                                   [SDTCisFP<0>,
+                                    SDTCisSameAs<0, 1>]>;
+def SDT_AArch64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_AArch64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Zip   : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<0, 2>]>;
+def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                          SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_AArch64fcmp  : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>]>;
+def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>,
+                                           SDTCisSameAs<0,3>]>;
+def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_AArch64ITOF  : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+                                                 SDTCisPtrTy<1>]>;
+
+// Generates the general dynamic sequences, i.e.
+//  adrp  x0, :tlsdesc:var
+//  ldr   x1, [x0, #:tlsdesc_lo12:var]
+//  add   x0, x0, #:tlsdesc_lo12:var
+//  .tlsdesccall var
+//  blr   x1
+
+// (the TPIDR_EL0 offset is put directly in X0, hence no "result" here)
+// number of operands (the variable)
+def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
+                                          [SDTCisPtrTy<0>]>;
+
+def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
+                                        [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+                                         SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+                                         SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64addlow        : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
+def AArch64LOADgot       : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
+                                SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+                                [SDNPHasChain, SDNPOutGlue]>;
+def AArch64callseq_end   : SDNode<"ISD::CALLSEQ_END",
+                                SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                               SDTCisVT<1, i32> ]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64call          : SDNode<"AArch64ISD::CALL",
+                                SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                 SDNPVariadic]>;
+def AArch64brcond        : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
+                                [SDNPHasChain]>;
+def AArch64cbz           : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64cbnz           : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64tbz           : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+def AArch64tbnz           : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+
+
+def AArch64csel          : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
+def AArch64csinv         : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
+def AArch64csneg         : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
+def AArch64csinc         : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
+def AArch64retflag       : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AArch64adc       : SDNode<"AArch64ISD::ADC",  SDTBinaryArithWithFlagsIn >;
+def AArch64sbc       : SDNode<"AArch64ISD::SBC",  SDTBinaryArithWithFlagsIn>;
+def AArch64add_flag  : SDNode<"AArch64ISD::ADDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64sub_flag  : SDNode<"AArch64ISD::SUBS",  SDTBinaryArithWithFlagsOut>;
+def AArch64and_flag  : SDNode<"AArch64ISD::ANDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64adc_flag  : SDNode<"AArch64ISD::ADCS",  SDTBinaryArithWithFlagsInOut>;
+def AArch64sbc_flag  : SDNode<"AArch64ISD::SBCS",  SDTBinaryArithWithFlagsInOut>;
+
+def AArch64ccmp      : SDNode<"AArch64ISD::CCMP",  SDT_AArch64CCMP>;
+def AArch64ccmn      : SDNode<"AArch64ISD::CCMN",  SDT_AArch64CCMP>;
+def AArch64fccmp     : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
+
+def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def AArch64fcmp      : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+
+def AArch64dup       : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
+def AArch64duplane8  : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
+def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
+def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
+def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+
+def AArch64zip1      : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
+def AArch64zip2      : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
+def AArch64uzp1      : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
+def AArch64uzp2      : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
+def AArch64trn1      : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
+def AArch64trn2      : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
+
+def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
+def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
+def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
+def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
+def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
+def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
+def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
+
+def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
+def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
+def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
+def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
+def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
+def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
+def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+
+def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
+def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
+def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+
+def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
+def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
+def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
+def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
+def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
+
+def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
+def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
+def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
+
+def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
+def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
+def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
+def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
+def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
+def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
+                        (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
+
+def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
+def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
+def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
+def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
+def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
+
+def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
+def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
+
+def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
+
+def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
+                  [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def AArch64Prefetch        : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
+                               [SDNPHasChain, SDNPSideEffect]>;
+
+def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
+def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+
+def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
+                                    SDT_AArch64TLSDescCallSeq,
+                                    [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+                                     SDNPVariadic]>;
+
+
+def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
+                                 SDT_AArch64WrapperLarge>;
+
+def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
+
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
+
+def AArch64frecpe   : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frecps   : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
+def AArch64frsqrte  : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+def AArch64frsqrts  : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
+
+def AArch64saddv    : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
+def AArch64uaddv    : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
+def AArch64sminv    : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
+def AArch64uminv    : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
+def AArch64smaxv    : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
+def AArch64umaxv    : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// AArch64 Instruction Predicate Definitions.
+def IsDarwin  : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize   : Predicate<"ForCodeSize">;
+def NotForCodeSize   : Predicate<"!ForCodeSize">;
+
+include "AArch64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous instructions.
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+// We set Sched to empty list because we expect these instructions to simply get
+// removed in most cases.
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
+                            Sched<[]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, they can be
+// removed, along with the AArch64Wrapper node.
+
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+                     [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+              Sched<[WriteLDAdr]>;
+
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+                                            tglobaladdr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+                                             tjumptable:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+                                             tconstpool:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+                                             tblockaddress:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+                                            tglobaltlsaddr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+                                            texternalsym:$low))]>,
+      Sched<[WriteAdrAdr]>;
+
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
+          (LOADgot tglobaltlsaddr:$addr)>;
+
+def : Pat<(AArch64LOADgot texternalsym:$addr),
+          (LOADgot texternalsym:$addr)>;
+
+def : Pat<(AArch64LOADgot tconstpool:$addr),
+          (LOADgot tconstpool:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// System instructions.
+//===----------------------------------------------------------------------===//
+
+def HINT : HintI<"hint">;
+def : InstAlias<"nop",  (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe",  (HINT 0b010)>;
+def : InstAlias<"wfi",  (HINT 0b011)>;
+def : InstAlias<"sev",  (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"esb",  (HINT 0b10000)>, Requires<[HasRAS]>;
+
+// v8.2a Statistical Profiling extension
+def : InstAlias<"psb $op",  (HINT psbhint_op:$op)>, Requires<[HasSPE]>;
+
+// As far as LLVM is concerned this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
+
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity.
+let mayLoad = ?, mayStore = ? in {
+def DMB   : CRmSystemI<barrier_op, 0b101, "dmb",
+                       [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
+
+def DSB   : CRmSystemI<barrier_op, 0b100, "dsb",
+                       [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
+
+def ISB   : CRmSystemI<barrier_op, 0b110, "isb",
+                       [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+}
+
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
+
+def MRS    : MRSI;
+def MSR    : MSRI;
+def MSRpstateImm1 : MSRpstateImm0_1;
+def MSRpstateImm4 : MSRpstateImm0_15;
+
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+
+// The cycle counter PMC register is PMCCNTR_EL0.
+let Predicates = [HasPerfMon] in
+def : Pat<(readcyclecounter), (MRS 0xdce8)>;
+
+// Generic system instructions
+def SYSxt  : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
+
+def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
+                (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
+                 sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
+
+//===----------------------------------------------------------------------===//
+// Move immediate instructions.
+//===----------------------------------------------------------------------===//
+
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
+
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
+
+// First group of aliases covers an implicit "lsl #0".
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
+
+// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
+
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
+
+// Final group of aliases covers true "mov $Rd, $imm" cases.
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
+                          int width, int shift> {
+  def _asmoperand : AsmOperandClass {
+    let Name = basename # width # "_lsl" # shift # "MovAlias";
+    let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
+                               # shift # ">";
+    let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
+  }
+
+  def _movimm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
+  }
+
+  def : InstAlias<"mov $Rd, $imm",
+                  (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
+}
+
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
+
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
+
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+    isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
+
+def MOVi32imm
+    : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+             [(set GPR32:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+def MOVi64imm
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+             [(set GPR64:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
+
+// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
+// eventual expansion code fewer bits to worry about getting right. Marshalling
+// the types is a little tricky though:
+def i64imm_32bit : ImmLeaf<i64, [{
+  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
+def s64imm_32bit : ImmLeaf<i64, [{
+  int64_t Imm64 = static_cast<int64_t>(Imm);
+  return Imm64 >= std::numeric_limits<int32_t>::min() &&
+         Imm64 <= std::numeric_limits<int32_t>::max();
+}]>;
+
+def trunc_imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(i64 i64imm_32bit:$src),
+          (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+
+// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+
+def : Pat<(f32 fpimm:$in),
+  (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
+def : Pat<(f64 fpimm:$in),
+  (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
+
+
+// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
+// sequences.
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
+                             tglobaladdr:$g1, tglobaladdr:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
+                                  tglobaladdr:$g2, 32),
+                          tglobaladdr:$g1, 16),
+                  tglobaladdr:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
+                             tblockaddress:$g1, tblockaddress:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+                                  tblockaddress:$g2, 32),
+                          tblockaddress:$g1, 16),
+                  tblockaddress:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
+                             tconstpool:$g1, tconstpool:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+                                  tconstpool:$g2, 32),
+                          tconstpool:$g1, 16),
+                  tconstpool:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
+                             tjumptable:$g1, tjumptable:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
+                                  tjumptable:$g2, 32),
+                          tjumptable:$g1, 16),
+                  tjumptable:$g0, 0)>;
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic instructions.
+//===----------------------------------------------------------------------===//
+
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src",  (SBCWr  GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngc $dst, $src",  (SBCXr  GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", "sub", add>;
+defm SUB : AddSub<1, "sub", "add">;
+
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+
+defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
+defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+          (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+          (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+          (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+          (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+let AddedComplexity = 1 in {
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+          (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+          (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+}
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
+
+// Variable shift
+defm ASRV : Shift<0b10, "asr", sra>;
+defm LSLV : Shift<0b00, "lsl", shl>;
+defm LSRV : Shift<0b01, "lsr", srl>;
+defm RORV : Shift<0b11, "ror", rotr>;
+
+def : ShiftAlias<"asrv", ASRVWr, GPR32>;
+def : ShiftAlias<"asrv", ASRVXr, GPR64>;
+def : ShiftAlias<"lslv", LSLVWr, GPR32>;
+def : ShiftAlias<"lslv", LSLVXr, GPR64>;
+def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
+def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
+def : ShiftAlias<"rorv", RORVWr, GPR32>;
+def : ShiftAlias<"rorv", RORVXr, GPR64>;
+
+// Multiply-add
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
+          (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+          (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
+          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
+          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
+          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+          (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+          (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+          (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+          (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
+          (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
+          (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
+          (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+                     (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+          (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+          (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
+          (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+                     (MOVi32imm (trunc_imm imm:$C)), XZR)>;
+
+def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
+          (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
+          (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
+                    GPR64:$Ra)),
+          (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+                     (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
+          (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
+          (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
+                                    (s64imm_32bit:$C)))),
+          (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
+                     (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
+
+// v8.1 atomic CAS
+defm CAS   : CompareAndSwap<0, 0, "">;
+defm CASA  : CompareAndSwap<1, 0, "a">;
+defm CASL  : CompareAndSwap<0, 1, "l">;
+defm CASAL : CompareAndSwap<1, 1, "al">;
+
+// v8.1 atomic CASP
+defm CASP   : CompareAndSwapPair<0, 0, "">;
+defm CASPA  : CompareAndSwapPair<1, 0, "a">;
+defm CASPL  : CompareAndSwapPair<0, 1, "l">;
+defm CASPAL : CompareAndSwapPair<1, 1, "al">;
+
+// v8.1 atomic SWP
+defm SWP   : Swap<0, 0, "">;
+defm SWPA  : Swap<1, 0, "a">;
+defm SWPL  : Swap<0, 1, "l">;
+defm SWPAL : Swap<1, 1, "al">;
+
+// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
+defm LDADD   : LDOPregister<0b000, "add", 0, 0, "">;
+defm LDADDA  : LDOPregister<0b000, "add", 1, 0, "a">;
+defm LDADDL  : LDOPregister<0b000, "add", 0, 1, "l">;
+defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;
+
+defm LDCLR   : LDOPregister<0b001, "clr", 0, 0, "">;
+defm LDCLRA  : LDOPregister<0b001, "clr", 1, 0, "a">;
+defm LDCLRL  : LDOPregister<0b001, "clr", 0, 1, "l">;
+defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;
+
+defm LDEOR   : LDOPregister<0b010, "eor", 0, 0, "">;
+defm LDEORA  : LDOPregister<0b010, "eor", 1, 0, "a">;
+defm LDEORL  : LDOPregister<0b010, "eor", 0, 1, "l">;
+defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;
+
+defm LDSET   : LDOPregister<0b011, "set", 0, 0, "">;
+defm LDSETA  : LDOPregister<0b011, "set", 1, 0, "a">;
+defm LDSETL  : LDOPregister<0b011, "set", 0, 1, "l">;
+defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;
+
+defm LDSMAX   : LDOPregister<0b100, "smax", 0, 0, "">;
+defm LDSMAXA  : LDOPregister<0b100, "smax", 1, 0, "a">;
+defm LDSMAXL  : LDOPregister<0b100, "smax", 0, 1, "l">;
+defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;
+
+defm LDSMIN   : LDOPregister<0b101, "smin", 0, 0, "">;
+defm LDSMINA  : LDOPregister<0b101, "smin", 1, 0, "a">;
+defm LDSMINL  : LDOPregister<0b101, "smin", 0, 1, "l">;
+defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;
+
+defm LDUMAX   : LDOPregister<0b110, "umax", 0, 0, "">;
+defm LDUMAXA  : LDOPregister<0b110, "umax", 1, 0, "a">;
+defm LDUMAXL  : LDOPregister<0b110, "umax", 0, 1, "l">;
+defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;
+
+defm LDUMIN   : LDOPregister<0b111, "umin", 0, 0, "">;
+defm LDUMINA  : LDOPregister<0b111, "umin", 1, 0, "a">;
+defm LDUMINL  : LDOPregister<0b111, "umin", 0, 1, "l">;
+defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;
+
+// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register) when Rt=xZR"
+defm : STOPregister<"stadd","LDADD">; // STADDx
+defm : STOPregister<"stclr","LDCLR">; // STCLRx
+defm : STOPregister<"steor","LDEOR">; // STEORx
+defm : STOPregister<"stset","LDSET">; // STSETx
+defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
+defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
+defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
+defm : STOPregister<"stumin","LDUMIN">;// STUMINx
+
+//===----------------------------------------------------------------------===//
+// Logical instructions.
+//===----------------------------------------------------------------------===//
+
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
+defm AND  : LogicalImm<0b00, "and", and, "bic">;
+defm EOR  : LogicalImm<0b10, "eor", xor, "eon">;
+defm ORR  : LogicalImm<0b01, "orr", or, "orn">;
+
+// FIXME: these aliases *are* canonical sometimes (when movz can't be
+// used). Actually, it seems to be working right now, but putting logical_immXX
+// here is a bit dodgy on the AsmParser side too.
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+                                          logical_imm32:$imm), 0>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+                                          logical_imm64:$imm), 0>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
+defm BICS : LogicalRegS<0b11, 1, "bics",
+                        BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
+defm AND  : LogicalReg<0b00, 0, "and", and>;
+defm BIC  : LogicalReg<0b00, 1, "bic",
+                       BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON  : LogicalReg<0b10, 1, "eon",
+                       BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
+defm EOR  : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN  : LogicalReg<0b01, 1, "orn",
+                       BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR  : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
+def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+
+def : InstAlias<"mvn $Wd, $Wm$sh",
+                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+def : InstAlias<"mvn $Xd, $Xm$sh",
+                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
+def : InstAlias<"tst $src1, $src2",
+                (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
+
+
+//===----------------------------------------------------------------------===//
+// One operand data processing instructions.
+//===----------------------------------------------------------------------===//
+
+defm CLS    : OneOperandData<0b101, "cls">;
+defm CLZ    : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT   : OneOperandData<0b000, "rbit">;
+
+def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
+def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;
+
+def  REV16Wr : OneWRegData<0b001, "rev16",
+                                  UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def  REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
+
+def : Pat<(cttz GPR32:$Rn),
+          (CLZWr (RBITWr GPR32:$Rn))>;
+def : Pat<(cttz GPR64:$Rn),
+          (CLZXr (RBITXr GPR64:$Rn))>;
+def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
+                (i32 1))),
+          (CLSWr GPR32:$Rn)>;
+def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
+                (i64 1))),
+          (CLSXr GPR64:$Rn)>;
+
+// Unlike the other one operand instructions, the instructions with the "rev"
+// mnemonic do *not* just different in the size bit, but actually use different
+// opcode bits for the different sizes.
+def REVWr   : OneWRegData<0b010, "rev", bswap>;
+def REVXr   : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+                                 UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;
+
+// The bswap commutes with the rotr so we want a pattern for both possible
+// orders.
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
+def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield immediate extraction instruction.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+defm EXTR : ExtractImm<"extr">;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Other bitfield immediate instructions.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in {
+defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
+}
+
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 31 ? 31 : enc;
+  return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
+}]>;
+
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_b imm0_63:$imm)))>;
+
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+//===----------------------------------------------------------------------===//
+// Conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
+defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL  : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser all ready inverts the condition code
+// for these aliases.
+def : InstAlias<"cset $dst, $cc",
+                (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+                (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in {
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def ADR  : ADRI<0, "adr", adrlabel, []>;
+} // hasSideEffects = 0
+
+def ADRP : ADRI<1, "adrp", adrplabel,
+                [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
+
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET  : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
+
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR  : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
+
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
+                   Sched<[WriteBrReg]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
+
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
+  let AsmString = ".tlsdesccall $sym";
+}
+
+// FIXME: maybe the scratch register used shouldn't be fixed to X1?
+// FIXME: can "hasSideEffects be dropped?
+let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
+    isCodeGenOnly = 1 in
+def TLSDESC_CALLSEQ
+    : Pseudo<(outs), (ins i64imm:$sym),
+             [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
+      Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
+def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
+          (TLSDESC_CALLSEQ texternalsym:$sym)>;
+
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instruction.
+//===----------------------------------------------------------------------===//
+def Bcc : BranchCond;
+
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ  : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
+
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm TBZ  : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions.
+//===----------------------------------------------------------------------===//
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B  : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
+
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
+
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+def BRK   : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+defm LDRBB : Load8RO<0b00,  0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW  : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX  : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
+
+// Floating-point
+defm LDRB : Load8RO<0b00,   1, 0b01, FPR8,   "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01,  1, 0b01, FPR16,  "ldr", f16, load>;
+defm LDRS : Load32RO<0b10,  1, 0b01, FPR32,  "ldr", f32, load>;
+defm LDRD : Load64RO<0b11,  1, 0b01, FPR64,  "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+                              ValueType ScalTy, ValueType VecTy,
+                              Instruction LOADW, Instruction LOADX,
+                              SubRegIndex sub> {
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+                           sub)>;
+
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+                           sub)>;
+}
+
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
+
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro16, load,       i32, v4f16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load,       i32, v8f16, LDRHroW, LDRHroX, hsub>;
+
+defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
+
+defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
+
+defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
+
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                           ro_Wextend64:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                           ro_Xextend64:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+}
+
+// Match all load 64 bits width whose type is compatible with FPR64
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+                        Instruction LOADW, Instruction LOADX> {
+
+  def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v8i8,  LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
+}
+
+defm : VecROLoadPat<ro64, v1i64,  LDRDroW, LDRDroX>;
+defm : VecROLoadPat<ro64, v1f64,  LDRDroW, LDRDroX>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro128, v2i64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v2f64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4i32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4f32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v8i16,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v8f16,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v16i8,  LDRQroW, LDRQroX>;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                           sub_32)>;
+
+  def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                           sub_32)>;
+}
+
+let AddedComplexity = 10 in {
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi8,  LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW,  LDRWroX>;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi1,  LDRBBroW, LDRBBroX>;
+
+  // extload -> zextload
+  defm : ExtLoadTo64ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
+
+  // extloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  extloadi1,   LDRBBroW, LDRBBroX>;
+}
+
+
+// zextload -> i64
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+
+}
+
+let AddedComplexity = 10 in {
+  // extload -> zextload
+  defm : ExtLoadTo32ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo32ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo32ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+}
+
+//---
+// (unsigned immediate)
+//---
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+                   [(set GPR64:$Rt,
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+                   [(set GPR32:$Rt,
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+                   [(set FPR8:$Rt,
+                         (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+                   [(set (f16 FPR16:$Rt),
+                         (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+                   [(set (f32 FPR32:$Rt),
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+                   [(set (f64 FPR64:$Rt),
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+                 [(set (f128 FPR128:$Rt),
+                       (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128  (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+                    [(set GPR32:$Rt,
+                          (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                     uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+                    [(set GPR32:$Rt,
+                          (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                   uimm12s1:$offset)))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+                     [(set GPR32:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+                     [(set GPR64:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+                     [(set GPR32:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+                     [(set GPR64:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+                     [(set GPR64:$Rt,
+                           (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+      (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+                        [(AArch64Prefetch imm:$Rt,
+                                        (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset))]>;
+
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+//                   [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+                    [(set GPR64:$Rt,
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+                    [(set GPR32:$Rt,
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+                    [(set FPR8:$Rt,
+                          (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+                    [(set FPR16:$Rt,
+                          (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+                    [(set (f32 FPR32:$Rt),
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+                    [(set (f64 FPR64:$Rt),
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+                    [(set (f128 FPR128:$Rt),
+                          (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+defm LDURHH
+    : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+             [(set GPR32:$Rt,
+                    (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+    : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+             [(set GPR32:$Rt,
+                    (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+//  anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// the don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+  let Name = "SImm9OffsetFB" # Width;
+  let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+  let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+               (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+               [(set GPR32:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+              [(set GPR64:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+                [(set GPR32:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+                [(set GPR64:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+              [(set GPR64:$Rt,
+                    (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
+                (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
+                (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
+                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+
+// Pre-fetch.
+defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+                  [(AArch64Prefetch imm:$Rt,
+                                  (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+defm LDTRSW  : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//---
+// (immediate post-indexed)
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//===----------------------------------------------------------------------===//
+// Store instructions.
+//===----------------------------------------------------------------------===//
+
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW  : Store32RO<0b10, 0, 0b00, GPR32, "str",  i32, store>;
+defm STRX  : Store64RO<0b11, 0, 0b00, GPR64, "str",  i64, store>;
+
+
+// Floating-point
+defm STRB : Store8RO< 0b00,  1, 0b00, FPR8,   "str", untyped, store>;
+defm STRH : Store16RO<0b01,  1, 0b00, FPR16,  "str", f16,     store>;
+defm STRS : Store32RO<0b10,  1, 0b00, FPR32,  "str", f32,     store>;
+defm STRD : Store64RO<0b11,  1, 0b00, FPR64,  "str", f64,     store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128,    store>;
+
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+                                 Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+  // truncstore i64
+  defm : TruncStoreFrom64ROPat<ro8,  truncstorei8,  STRBBroW, STRBBroX>;
+  defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
+  defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW,  STRWroX>;
+}
+
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+                         Instruction STRW, Instruction STRX> {
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
+}
+
+defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
+defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
+                              ValueType VecTy, ValueType STy,
+                              SubRegIndex SubRegIdx,
+                              Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
+let AddedComplexity = 19 in {
+  defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro16,      store   , v8i16, i16, hsub, STRHroW, STRHroX>;
+  defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro32,      store   , v4i32, i32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro32,      store   , v4f32, f32, ssub, STRSroW, STRSroX>;
+  defm : VecROStoreLane0Pat<ro64,      store   , v2i64, i64, dsub, STRDroW, STRDroX>;
+  defm : VecROStoreLane0Pat<ro64,      store   , v2f64, f64, dsub, STRDroW, STRDroX>;
+}
+
+//---
+// (unsigned immediate)
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+                   [(store GPR64:$Rt,
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+                    [(store GPR32:$Rt,
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+                    [(store FPR8:$Rt,
+                            (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+                    [(store (f16 FPR16:$Rt),
+                            (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+                    [(store (f32 FPR32:$Rt),
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+                    [(store (f64 FPR64:$Rt),
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+                     [(truncstorei16 GPR32:$Rt,
+                                     (am_indexed16 GPR64sp:$Rn,
+                                                   uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1,  "strb",
+                     [(truncstorei8 GPR32:$Rt,
+                                    (am_indexed8 GPR64sp:$Rn,
+                                                 uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(store (f128  FPR128:$Rt),
+                 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+          (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// truncstore i64
+def : Pat<(truncstorei32 GPR64:$Rt,
+                         (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+  (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt,
+                         (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+  (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+  (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
+                         [(store GPR64:$Rt,
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
+                         [(store GPR32:$Rt,
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
+                         [(store FPR8:$Rt,
+                                 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
+                         [(store (f16 FPR16:$Rt),
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
+                         [(store (f32 FPR32:$Rt),
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
+                         [(store (f64 FPR64:$Rt),
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
+                         [(store (f128 FPR128:$Rt),
+                                 (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
+                         [(truncstorei16 GPR32:$Rt,
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
+                         [(truncstorei8 GPR32:$Rt,
+                                  (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+                (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+                (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str",  pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str",  pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8,  "str",  pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str",  pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str",  pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str",  pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8,  i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+           simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32,  "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64,  "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8,   "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16,  "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32,  "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64,  "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+
+def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//===----------------------------------------------------------------------===//
+// Load/store exclusive instructions.
+//===----------------------------------------------------------------------===//
+
+def LDARW  : LoadAcquire   <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX  : LoadAcquire   <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB  : LoadAcquire   <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH  : LoadAcquire   <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
+
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
+
+def LDXRW  : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX  : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB  : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH  : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
+
+def STLRW  : StoreRelease  <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX  : StoreRelease  <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB  : StoreRelease  <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH  : StoreRelease  <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
+
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
+
+def STXRW  : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX  : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB  : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH  : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
+
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
+
+def LDXPW  : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX  : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
+
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
+
+def STXPW  : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX  : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
+
+let Predicates = [HasV8_1a] in {
+  // v8.1a "Limited Order Region" extension load-acquire instructions
+  def LDLARW  : LoadAcquire   <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
+  def LDLARX  : LoadAcquire   <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
+  def LDLARB  : LoadAcquire   <0b00, 1, 1, 0, 0, GPR32, "ldlarb">;
+  def LDLARH  : LoadAcquire   <0b01, 1, 1, 0, 0, GPR32, "ldlarh">;
+
+  // v8.1a "Limited Order Region" extension store-release instructions
+  def STLLRW  : StoreRelease   <0b10, 1, 0, 0, 0, GPR32, "stllr">;
+  def STLLRX  : StoreRelease   <0b11, 1, 0, 0, 0, GPR64, "stllr">;
+  def STLLRB  : StoreRelease   <0b00, 1, 0, 0, 0, GPR32, "stllrb">;
+  def STLLRH  : StoreRelease   <0b01, 1, 0, 0, 0, GPR32, "stllrh">;
+}
+
+//===----------------------------------------------------------------------===//
+// Scaled floating point to integer conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+
+multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
+  def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
+  def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
+  def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
+  def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
+  def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
+  def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+
+  def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+  def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+  def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+  def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+  def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+  def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
+}
+
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
+defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
+
+multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
+  def : Pat<(i32 (to_int (round f32:$Rn))),
+            (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+  def : Pat<(i64 (to_int (round f32:$Rn))),
+            (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+  def : Pat<(i32 (to_int (round f64:$Rn))),
+            (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+  def : Pat<(i64 (to_int (round f64:$Rn))),
+            (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+}
+
+defm : FPToIntegerPats<fp_to_sint, fceil,  "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fceil,  "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;
+
+//===----------------------------------------------------------------------===//
+// Scaled integer to floating point conversion instructions.
+//===----------------------------------------------------------------------===//
+
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Unscaled integer to floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FMOV : UnscaledConversion<"fmov">;
+
+// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
+let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
+def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
+    Sched<[WriteF]>;
+def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
+    Sched<[WriteF]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point conversion instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCVT : FPConversion<"fcvt">;
+
+//===----------------------------------------------------------------------===//
+// Floating point single operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FABS   : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV   : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG   : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
+
+def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
+          (FRINTNDr FPR64:$Rn)>;
+
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
+
+let SchedRW = [WriteFDiv] in {
+defm FSQRT  : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating point two operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FADD   : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
+defm FMAX   : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
+defm FMIN   : TwoOperandFPData<0b0101, "fmin", fminnan>;
+let SchedRW = [WriteFMul] in {
+defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
+defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point three operand instructions.
+//===----------------------------------------------------------------------===//
+
+defm FMADD  : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB  : ThreeOperandFPData<0, 1, "fmsub",
+     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+
+// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
+// the NEON variant.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
+          (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
+          (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
+// "(-a) + b*(-c)".
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+//===----------------------------------------------------------------------===//
+// Floating point comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP  : FPComparison<0, "fcmp", AArch64fcmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional comparison instructions.
+//===----------------------------------------------------------------------===//
+
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP  : FPCondComparison<0, "fccmp", AArch64fccmp>;
+
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
+
+defm FCSEL : FPCondSelect<"fcsel">;
+
+// CSEL instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+                      (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+                      [(set (f128 FPR128:$Rd),
+                            (AArch64csel FPR128:$Rn, FPR128:$Rm,
+                                       (i32 imm:$cond), NZCV))]> {
+  let Uses = [NZCV];
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Floating point immediate move.
+//===----------------------------------------------------------------------===//
+
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
+}
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+                                          int_aarch64_neon_uabd>;
+// Match UABDL in log2-shuffle patterns.
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+               (v8i16 (add (sub (zext (v8i8 V64:$opA)),
+                                (zext (v8i8 V64:$opB))),
+                           (AArch64vashr v8i16:$src, (i32 15))))),
+          (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
+               (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
+                                (zext (extract_high_v16i8 V128:$opB))),
+                           (AArch64vashr v8i16:$src, (i32 15))))),
+          (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+               (v4i32 (add (sub (zext (v4i16 V64:$opA)),
+                                (zext (v4i16 V64:$opB))),
+                           (AArch64vashr v4i32:$src, (i32 31))))),
+          (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
+               (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
+                                (zext (extract_high_v8i16 V128:$opB))),
+                          (AArch64vashr v4i32:$src, (i32 31))))),
+          (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+               (v2i64 (add (sub (zext (v2i32 V64:$opA)),
+                                (zext (v2i32 V64:$opB))),
+                           (AArch64vashr v2i64:$src, (i32 63))))),
+          (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
+def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
+               (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
+                                (zext (extract_high_v4i32 V128:$opB))),
+                          (AArch64vashr v2i64:$src, (i32 63))))),
+          (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
+
+defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
+               (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
+          (ABSv8i8 V64:$src)>;
+def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
+               (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
+          (ABSv4i16 V64:$src)>;
+def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
+               (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
+          (ABSv2i32 V64:$src)>;
+def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
+               (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
+          (ABSv16i8 V128:$src)>;
+def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
+               (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
+          (ABSv8i16 V128:$src)>;
+def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
+               (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
+          (ABSv4i32 V128:$src)>;
+def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
+               (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
+          (ABSv2i64 V128:$src)>;
+
+defm CLS    : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
+defm CLZ    : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ   : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
+defm CNT    : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS   : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+defm FCMEQ  : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
+defm FCVTL  : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+          (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+                                                              (i64 4)))),
+          (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+                                                    (i64 2))))),
+          (FCVTLv4i32 V128:$Rn)>;
+
+def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+                                                    (i64 4))))),
+          (FCVTLv8i16 V128:$Rn)>;
+
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
+defm FCVTN  : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+          (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd,
+                          (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+          (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
+          (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+                                        int_aarch64_neon_fcvtxn>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;
+
+def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
+def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
+def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
+def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
+def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;
+
+defm FNEG   : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
+defm FSQRT  : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm NEG    : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+                               UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT    : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
+                (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
+                (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+def : Pat<(AArch64neg (v8i8  V64:$Rn)),  (NEGv8i8  V64:$Rn)>;
+def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(AArch64neg (v4i16 V64:$Rn)),  (NEGv4i16 V64:$Rn)>;
+def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i32 V64:$Rn)),  (NEGv2i32 V64:$Rn)>;
+def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+def : Pat<(AArch64not (v8i8 V64:$Rn)),   (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v1i64 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+def : Pat<(vnot (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+defm RBIT   : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm REV16  : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
+defm REV32  : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
+defm REV64  : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+defm SCVTF  : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL   : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS  : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+                    int_aarch64_neon_uaddlp>;
+defm UCVTF  : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN  : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+defm XTN    : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD     : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP    : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ    : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE    : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT    : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI    : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS    : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST   : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD    : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
+defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP   : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
+defm FADD    : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
+defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT   : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FDIV    : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
+defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX    : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
+defm FMINP   : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN    : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA     : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
+            TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS     : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
+            TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+          (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+defm FMULX    : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL     : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
+defm FRECPS   : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB     : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
+defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+                      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+                      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL      : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL     : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
+defm SABA     : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD     : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
+defm SHADD    : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+defm SHSUB    : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
+defm SMAXP    : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
+defm SMAX     : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
+defm SMINP    : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
+defm SMIN     : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
+defm SQADD    : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SRHADD   : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRSHL    : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+defm UABA     : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD     : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
+defm UHADD    : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+defm UHSUB    : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
+defm UMAXP    : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
+defm UMAX     : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
+defm UMINP    : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
+defm UMIN     : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
+defm UQADD    : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm URHADD   : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URSHL    : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
+                                                  int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
+                                                    int_aarch64_neon_sqsub>;
+
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+                                  BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+
+def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
+def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
+def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmls.8b\t$dst, $src1, $src2}",
+                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmls.16b\t$dst, $src1, $src2}",
+                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmls.4h\t$dst, $src1, $src2}",
+                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmls.8h\t$dst, $src1, $src2}",
+                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmls.2s\t$dst, $src1, $src2}",
+                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmls.4s\t$dst, $src1, $src2}",
+                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmls.2d\t$dst, $src1, $src2}",
+                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlo.8b\t$dst, $src1, $src2}",
+                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlo.16b\t$dst, $src1, $src2}",
+                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlo.4h\t$dst, $src1, $src2}",
+                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlo.8h\t$dst, $src1, $src2}",
+                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlo.2s\t$dst, $src1, $src2}",
+                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlo.4s\t$dst, $src1, $src2}",
+                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlo.2d\t$dst, $src1, $src2}",
+                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmle.8b\t$dst, $src1, $src2}",
+                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmle.16b\t$dst, $src1, $src2}",
+                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmle.4h\t$dst, $src1, $src2}",
+                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmle.8h\t$dst, $src1, $src2}",
+                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmle.2s\t$dst, $src1, $src2}",
+                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmle.4s\t$dst, $src1, $src2}",
+                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmle.2d\t$dst, $src1, $src2}",
+                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlt.8b\t$dst, $src1, $src2}",
+                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlt.16b\t$dst, $src1, $src2}",
+                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlt.4h\t$dst, $src1, $src2}",
+                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlt.8h\t$dst, $src1, $src2}",
+                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlt.2s\t$dst, $src1, $src2}",
+                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlt.4s\t$dst, $src1, $src2}",
+                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlt.2d\t$dst, $src1, $src2}",
+                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
+                "|fcmle.4h\t$dst, $src1, $src2}",
+                (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
+                "|fcmle.8h\t$dst, $src1, $src2}",
+                (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmle.2s\t$dst, $src1, $src2}",
+                (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmle.4s\t$dst, $src1, $src2}",
+                (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmle.2d\t$dst, $src1, $src2}",
+                (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
+                "|fcmlt.4h\t$dst, $src1, $src2}",
+                (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
+                "|fcmlt.8h\t$dst, $src1, $src2}",
+                (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmlt.2s\t$dst, $src1, $src2}",
+                (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmlt.4s\t$dst, $src1, $src2}",
+                (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmlt.2d\t$dst, $src1, $src2}",
+                (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
+                "|facle.4h\t$dst, $src1, $src2}",
+                (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
+                "|facle.8h\t$dst, $src1, $src2}",
+                (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|facle.2s\t$dst, $src1, $src2}",
+                (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|facle.4s\t$dst, $src1, $src2}",
+                (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|facle.2d\t$dst, $src1, $src2}",
+                (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
+                "|faclt.4h\t$dst, $src1, $src2}",
+                (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
+                "|faclt.8h\t$dst, $src1, $src2}",
+                (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
+}
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|faclt.2s\t$dst, $src1, $src2}",
+                (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|faclt.4s\t$dst, $src1, $src2}",
+                (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|faclt.2d\t$dst, $src1, $src2}",
+                (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD      : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ     : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE     : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT     : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI     : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS     : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST    : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD     : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
+def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE    : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
+                                     int_aarch64_neon_facge>;
+defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
+                                     int_aarch64_neon_facgt>;
+defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
+defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
+defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
+defm FMULX    : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS   : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
+defm SRSHL    : SIMDThreeScalarD<   0, 0b01010, "srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeScalarD<   0, 0b01000, "sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeScalarD<   1, 0b10000, "sub", sub>;
+defm UQADD    : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
+defm URSHL    : SIMDThreeScalarD<   1, 0b01010, "urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeScalarD<   1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+let Predicates = [HasV8_1a] in {
+  defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
+  defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
+  def : Pat<(i32 (int_aarch64_neon_sqadd
+                   (i32 FPR32:$Rd),
+                   (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+                                                   (i32 FPR32:$Rm))))),
+            (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(i32 (int_aarch64_neon_sqsub
+                   (i32 FPR32:$Rd),
+                   (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+                                                   (i32 FPR32:$Rm))))),
+            (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+}
+
+def : InstAlias<"cmls $dst, $src1, $src2",
+                (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+                (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+                (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+                (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three scalar instructions (mixed operands).
+//===----------------------------------------------------------------------===//
+defm SQDMULL  : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+                                       int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL  : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL  : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
+
+defm ABS    : SIMDTwoScalarD<    0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ   : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ  : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDFPTwoScalar<   0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDFPTwoScalar<   1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDFPTwoScalar<   0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDFPTwoScalar<   1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDFPTwoScalar<   0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDFPTwoScalar<   1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDFPTwoScalar<   0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu">;
+def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte">;
+defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
+                                 UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF  : SIMDFPTwoScalarCVT<   0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SQABS  : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+                                     int_aarch64_neon_suqadd>;
+defm UCVTF  : SIMDFPTwoScalarCVT<   1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UQXTN  : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+                                    int_aarch64_neon_usqadd>;
+
+def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
+          (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
+          (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
+          (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+          (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
+          (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+          (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
+          (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+          (FCVTPUv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+          (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+          (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+          (FRECPEv2f64 FPR128:$Rn)>;
+
+def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
+          (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
+          (FRECPXv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+          (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+          (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+          (FRSQRTEv2f64 FPR128:$Rn)>;
+
+def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+          (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+          (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
+          (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
+def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+          (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
+          (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// Here are the patterns for 8 and 16-bits to float.
+// 8-bits -> float.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+                             SDPatternOperator loadop, Instruction UCVTF,
+                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+                             SubRegIndex sub> {
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                                 sub))>;
+
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                                 sub))>;
+}
+
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
+                         UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                     (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
+                         UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
+                         UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                    (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
+                         UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load,
+                         UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three different-sized vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+                                             int_aarch64_neon_sabd>;
+defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+                                          int_aarch64_neon_sabd>;
+defm SADDL   : SIMDLongThreeVectorBHS<   0, 0b0000, "saddl",
+            BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW   : SIMDWideThreeVectorBHS<   0, 0b0001, "saddw",
+                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+                                               int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+                                               int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+                                     int_aarch64_neon_sqdmull>;
+defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+                                              int_aarch64_neon_uabd>;
+defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
+                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+  SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+  UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S  V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
+                                    (extractelt (v2i64 V128:$Rm), (i64 1))),
+          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
+
+// ADDHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
+def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup<    0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
+                  (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
+                   (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP    : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP   : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
+defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
+defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
+def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
+          (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
+          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
+          (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
+          (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
+          (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
+          (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
+          (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
+          (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
+          (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
+          (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
+          (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+def DUPv8i8gpr  : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d", v2i64, V128, GPR64>;
+
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v2f32 (DUPv2i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v4f32 (DUPv4i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
+          (v2f64 (DUPv2i64lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+            (i64 0)))>;
+def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
+          (v4f16 (DUPv4i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
+def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
+          (v8f16 (DUPv8i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
+
+def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+
+def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+         (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
+// instruction even if the types don't match: we just have to remap the lane
+// carefully. N.b. this trick only applies to truncations.
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(4 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
+                            ValueType Src128VT, ValueType ScalVT,
+                            Instruction DUP, SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+                                                     imm:$idx)))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+                                                     imm:$idx)))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8,   v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8,   v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16,  v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+                               SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
+                                                         imm:$idx))))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
+                                                       imm:$idx))))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,   VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane,  VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane,  VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
+            VectorIndexB:$idx)))), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
+            VectorIndexH:$idx)))), i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+               (i32 0xff)),
+          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+               (i32 0xffff)),
+          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+            (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                  (i64 FPR64:$Rn), dsub))>;
+
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+          (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
+            (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi16lane
+              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+              (i64 0)),
+            dsub)>;
+
+def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
+            (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+          (INSvi16lane
+            V128:$Rn, VectorIndexH:$imm,
+            (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+            (i64 0))>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi32lane
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+              (i64 0)),
+            dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (INSvi32lane
+            V128:$Rn, VectorIndexS:$imm,
+            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+            (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+            (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+          (INSvi64lane
+            V128:$Rn, VectorIndexD:$imm,
+            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+            (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME refactor to a shared class/dev parameterized on vector type, vector
+// index type and INS extension
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+                   VectorIndexB:$idx2)),
+          (v16i8 (INSvi8lane
+                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+          )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+                   VectorIndexH:$idx2)),
+          (v8i16 (INSvi16lane
+                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+          )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+                   VectorIndexS:$idx2)),
+          (v4i32 (INSvi32lane
+                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+          )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+                   VectorIndexD:$idx2)),
+          (v2i64 (INSvi64lane
+                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+          )>;
+
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+                                ValueType VTScal, Instruction INS> {
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd,
+                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                 imm:$Immd, V128:$Rn, imm:$Immn),
+                            dsub)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG
+                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+                     (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+                dsub)>;
+}
+
+defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
+// the lane number is anything other than zero.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+          (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+          (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+          (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
+          (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which had just as well be
+// INS.
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v8f16, v4f16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV   : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV   : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+
+// Patterns for across-vector intrinsics, that have a node equivalent, that
+// returns a vector (with only the low lane defined) instead of a scalar.
+// In effect, opNode is the same as (scalar_to_vector (IntNode)).
+multiclass SIMDAcrossLanesIntrinsic<string baseOpc,
+                                    SDPatternOperator opNode> {
+// If a lane instruction caught the vector_extract around opNode, we can
+// directly match the latter to the instruction.
+def : Pat<(v8i8 (opNode V64:$Rn)),
+          (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>;
+def : Pat<(v16i8 (opNode V128:$Rn)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>;
+def : Pat<(v4i16 (opNode V64:$Rn)),
+          (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>;
+def : Pat<(v8i16 (opNode V128:$Rn)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>;
+def : Pat<(v4i32 (opNode V128:$Rn)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>;
+
+
+// If none did, fallback to the explicit patterns, consuming the vector_extract.
+def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
+            (i32 0)), (i64 0))),
+          (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn),
+            bsub), ssub)>;
+def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))),
+          (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn),
+            bsub), ssub)>;
+def : Pat<(i32 (vector_extract (insert_subvector undef,
+            (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
+          (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn),
+            hsub), ssub)>;
+def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))),
+          (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn),
+            hsub), ssub)>;
+def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))),
+          (EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn),
+            ssub), ssub)>;
+
+}
+
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
+                                          SDPatternOperator opNode>
+    : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
+            (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
+          (i32 (SMOVvi8to32
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+            (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract
+            (opNode (v16i8 V128:$Rn)), (i64 0))), i8)),
+          (i32 (SMOVvi8to32
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+             (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+            (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
+            (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
+          (i32 (SMOVvi16to32
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+           (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract
+            (opNode (v8i16 V128:$Rn)), (i64 0))), i16)),
+          (i32 (SMOVvi16to32
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+             (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+            (i64 0)))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
+                                            SDPatternOperator opNode>
+    : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
+            (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
+      (i32 (EXTRACT_SUBREG
+        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+          (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+        ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))),
+            maski8_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
+            (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+            ssub))>;
+def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
+            maski16_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          ssub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV",  AArch64saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))),
+          (ADDPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))),
+          (ADDPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>;
+def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))),
+          (SMAXPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>;
+def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))),
+          (SMINPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>;
+def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))),
+          (UMAXPv2i32 V64:$Rn, V64:$Rn)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>;
+def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))),
+          (UMINPv2i32 V64:$Rn, V64:$Rn)>;
+
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+           ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+                                                Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          ssub))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+            ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
+
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
+
+def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// AdvSIMD FMOV
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
+                                              "fmov", ".2d",
+                       [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64,  fpimm8,
+                                              "fmov", ".2s",
+                       [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
+                                              "fmov", ".4s",
+                       [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64,  fpimm8,
+                                              "fmov", ".4h",
+                       [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
+                                              "fmov", ".8h",
+                       [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+} // Predicates = [HasNEON, HasFullFP16]
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+                    [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
+          (MOVID imm0_255:$shift)>;
+
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
+
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns   : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
+                                                simdimmtype10,
+                                                "movi", ".2d",
+                   [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
+
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
+def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl  : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl  : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns   : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64,  imm0_255,
+                                                 "movi", ".8b",
+                       [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns  : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
+                                                 "movi", ".16b",
+                       [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI      : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+
+def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl   : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let hasSideEffects = 0 in {
+  defm FMLA  : SIMDFPIndexedTied<0, 0b0001, "fmla">;
+  defm FMLS  : SIMDFPIndexedTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
+// instruction expects the addend first, while the intrinsic expects it last.
+
+// On the other hand, there are quite a few valid combinatorial options due to
+// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (v2f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (v4f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+  // (DUPLANE from 64-bit would be trivial).
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
+                                           VectorIndexD:$idx))),
+            (FMLSv2i64_indexed
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
+            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (insert_subvector undef,
+                                                    (v2f32 (fneg V64:$Rm)),
+                                                    (i32 0))),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+}
+
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL  : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;
+
+def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv2i32_indexed V64:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv4i32_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+          (FMULv2i64_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+            (i64 0))>;
+
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
+              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
+              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+                int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+                                           int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+                                           int_aarch64_neon_sqsub>;
+defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
+                                          int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
+                                          int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
+defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+                int_aarch64_neon_umull>;
+
+// A scalar sqdmull with the second operand being a vector lane can be
+// handled directly with the indexed instruction encoding.
+def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                          (vector_extract (v4i32 V128:$Vm),
+                                                           VectorIndexS:$idx)),
+          (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
+defm SCVTF  : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
+defm UCVTF  : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
+// Codegen patterns for the above. We don't put these directly on the
+// instructions because TableGen's type inference can't handle the truth.
+// Having the same base pattern for fp <--> int totally freaks it out.
+def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+          (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+          (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+defm SHL      : SIMDScalarLShiftD<   0, 0b01010, "shl", AArch64vshl>;
+defm SLI      : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN  : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
+                                     int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
+                                     int_aarch64_neon_sqrshrun>;
+defm SQSHLU   : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL    : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN   : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
+                                     int_aarch64_neon_sqshrn>;
+defm SQSHRUN  : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
+                                     int_aarch64_neon_sqshrun>;
+defm SRI      : SIMDScalarRShiftDTied<   1, 0b01000, "sri">;
+defm SRSHR    : SIMDScalarRShiftD<   0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA    : SIMDScalarRShiftDTied<   0, 0b00110, "srsra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64srshri node:$MHS, node:$RHS))>>;
+defm SSHR     : SIMDScalarRShiftD<   0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA     : SIMDScalarRShiftDTied<   0, 0b00010, "ssra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN  : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
+                                     int_aarch64_neon_uqrshrn>;
+defm UQSHL    : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN   : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
+                                     int_aarch64_neon_uqshrn>;
+defm URSHR    : SIMDScalarRShiftD<   1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA    : SIMDScalarRShiftDTied<   1, 0b00110, "ursra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64urshri node:$MHS, node:$RHS))>>;
+defm USHR     : SIMDScalarRShiftD<   1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA     : SIMDScalarRShiftDTied<   1, 0b00010, "usra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
+defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
+defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
+                                   int_aarch64_neon_vcvtfxs2fp>;
+defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+                                         int_aarch64_neon_rshrn>;
+defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+defm SHRN    : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
+                          BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
+defm SLI     : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
+def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftL64:$imm))),
+          (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+                                         int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+                                         int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL  : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN  : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+                                         int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+                                         int_aarch64_neon_sqshrun>;
+defm SRI     : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
+def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftR64:$imm))),
+          (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR   : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA   : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
+                 TriOpFrag<(add node:$LHS,
+                                (AArch64srshri node:$MHS, node:$RHS))> >;
+defm SSHLL   : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
+                BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR    : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA    : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
+                TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF   : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
+                        int_aarch64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+                                         int_aarch64_neon_uqrshrn>;
+defm UQSHL   : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN  : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+                                         int_aarch64_neon_uqshrn>;
+defm URSHR   : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA   : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
+                TriOpFrag<(add node:$LHS,
+                               (AArch64urshri node:$MHS, node:$RHS))> >;
+defm USHLL   : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+                BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                                    vecshiftR16Narrow:$imm)))),
+          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                                    vecshiftR32Narrow:$imm)))),
+          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                                    vecshiftR64Narrow:$imm)))),
+          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USSHLL.
+// Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128 bit source register.
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases
+def : InstAlias<"sxtl.8h $dst, $src1",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases
+def : InstAlias<"sxtl2.8h $dst, $src1",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases
+def : InstAlias<"uxtl.8h $dst, $src1",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases
+def : InstAlias<"uxtl2.8h $dst, $src1",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cyclces.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still being faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                               0),
+                             ssub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW  GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX  GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW   GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX   GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                 (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                     0),
+                                   dsub)),
+                               0),
+                             dsub)))>,
+    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
+ 
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
+class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  ssub),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
+                           (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
+                           (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
+                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
+                           (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+class Ld1Pat<ValueType ty, Instruction INST>
+  : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;
+
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+def : Ld1Pat<v8i8,  LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+class St1Pat<ValueType ty, Instruction INST>
+  : Pat<(store ty:$Vt, GPR64sp:$Rn),
+        (INST ty:$Vt, GPR64sp:$Rn)>;
+
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+def : St1Pat<v8i8,  ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+defm LD1R          : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
+defm LD2R          : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
+defm LD3R          : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R          : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
+let mayLoad = 1, hasSideEffects = 0 in {
+defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
+defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
+defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
+defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned,   GPR64pi8>;
+defm LD2 : SIMDLdSingleBTied<1, 0b000,       "ld2", VecListTwob,   GPR64pi2>;
+defm LD2 : SIMDLdSingleHTied<1, 0b010, 0,    "ld2", VecListTwoh,   GPR64pi4>;
+defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos,   GPR64pi8>;
+defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod,   GPR64pi16>;
+defm LD3 : SIMDLdSingleBTied<0, 0b001,       "ld3", VecListThreeb, GPR64pi3>;
+defm LD3 : SIMDLdSingleHTied<0, 0b011, 0,    "ld3", VecListThreeh, GPR64pi6>;
+defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+defm LD4 : SIMDLdSingleBTied<1, 0b001,       "ld4", VecListFourb,  GPR64pi4>;
+defm LD4 : SIMDLdSingleHTied<1, 0b011, 0,    "ld4", VecListFourh,  GPR64pi8>;
+defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours,  GPR64pi16>;
+defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd,  GPR64pi32>;
+}
+
+def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+// Grab the floating point version too
+def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexH, v8f16, f16, LD1i16>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                          VecIndex:$idx, GPR64sp:$Rn),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexH, v4f16, f16, LD1i16>;
+
+
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 19 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexH, v8f16, f16, ST1i16>;
+
+let AddedComplexity = 19 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexH, v4f16, f16, ST1i16>;
+
+multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
+defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
+                        2>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
+
+multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
+                         1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
+                         2>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
+
+let mayStore = 1, hasSideEffects = 0 in {
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",    int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",  int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return isDef32(*N);
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREF into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
+// then assert the extension has happened.
+def : Pat<(i64 (zext GPR32:$src)),
+          (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+   (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a       imm0_31:$imm)),
+                              (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a        imm0_31:$imm)),
+                              (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a        imm0_63:$imm)),
+                              (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a        imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on AArch64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
+//
+// Consider a simple memory load followed by a bitconvert then a store.
+//   v0 = load v2i32
+//   v1 = BITCAST v2i32 v0 to v4i16
+//        store v4i16 v2
+//
+// In big endian mode every memory access has an implicit byte swap. LDR and
+// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
+// is, they treat the vector as a sequence of elements to be byte-swapped.
+// The two pairs of instructions are fundamentally incompatible. We've decided
+// to use LD1/ST1 only to simplify compiler implementation.
+//
+// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
+// the original code sequence:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = BITCAST v2i32 v1 to v4i16
+//   v3 = REV v4i16 v2               (implicit)
+//        store v4i16 v3
+//
+// But this is now broken - the value stored is different to the value loaded
+// due to lane reordering. To fix this, on every BITCAST we must perform two
+// other REVs:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = REV v2i32
+//   v3 = BITCAST v2i32 v2 to v4i16
+//   v4 = REV v4i16
+//   v5 = REV v4i16 v4               (implicit)
+//        store v4i16 v5
+//
+// This means an extra two instructions, but actually in most cases the two REV
+// instructions can be combined into one. For example:
+//   (REV64_2s (REV64_4h X)) === (REV32_4h X)
+//
+// There is also no 128-bit REV instruction. This must be synthesized with an
+// EXT instruction.
+//
+// Most bitconverts require some sort of conversion. The only exceptions are:
+//   a) Identity conversions -  vNfX <-> vNiX
+//   b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
+//
+
+// Natural vector casts (64 bit)
+def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Natural vector casts (128 bit)
+def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)),
+                 (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
+                 (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
+                 (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+}
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2i32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
+                             (v2i32 (REV64v4i16 FPR64:$src))>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4i16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4f16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+}
+
+
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))), (v8i8  FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))), (f64   FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))),
+                             (f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
+}
+def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
+}
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2f32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
+                             (v2f32 (REV64v4i16 FPR64:$src))>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                            (REV64v16i8 FPR128:$src), (i32 8)))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))),
+                             (v2f64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2f64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))),
+                             (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                    (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4f32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))),
+                             (v2i64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2i64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))),
+                             (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                              (REV64v4i32 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4i32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))),
+                             (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8i16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))),
+                             (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8f16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
+                             (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8f16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8f16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))),
+                             (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                              (REV64v16i8 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
+}
+
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
+           (EXTRACT_SUBREG V128:$Rn, dsub)>;
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// A 64-bit subvector insert to the first 128-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
+// or v2f32.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+                    (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+           (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+                     (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+           (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+    // vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
+    // so we match on v4f32 here, not v2f32. This will also catch adding
+    // the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+                (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+          (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
+
+// Scalar 64-bit shifts in FPR64 registers.
+def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Patterns for nontemporal/no-allocate stores.
+// We have to resort to tricks to turn a single-input store into a store pair,
+// because there is no single-input nontemporal store, only STNP.
+let Predicates = [IsLE] in {
+let AddedComplexity = 15 in {
+class NTStore128Pat<ValueType VT> :
+  Pat<(nontemporalstore (VT FPR128:$Rt),
+        (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+      (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
+              (CPYi64 FPR128:$Rt, (i64 1)),
+              GPR64sp:$Rn, simm7s8:$offset)>;
+
+def : NTStore128Pat<v2i64>;
+def : NTStore128Pat<v4i32>;
+def : NTStore128Pat<v8i16>;
+def : NTStore128Pat<v16i8>;
+
+class NTStore64Pat<ValueType VT> :
+  Pat<(nontemporalstore (VT FPR64:$Rt),
+        (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+      (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
+              (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+              GPR64sp:$Rn, simm7s4:$offset)>;
+
+// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
+def : NTStore64Pat<v1f64>;
+def : NTStore64Pat<v1i64>;
+def : NTStore64Pat<v2i32>;
+def : NTStore64Pat<v4i16>;
+def : NTStore64Pat<v8i8>;
+
+def : Pat<(nontemporalstore GPR64:$Rt,
+            (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
+          (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
+                  GPR64sp:$Rn, simm7s4:$offset)>;
+} // AddedComplexity=10
+} // Predicates = [IsLE]
+
+// Tail call return handling. These are all compiler pseudo-instructions,
+// so no encoding information or anything like that.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
+                   Sched<[WriteBrReg]>;
+  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
+                   Sched<[WriteBrReg]>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
new file mode 100644
index 000000000000..20de07424c53
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -0,0 +1,1161 @@
+//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstructionSelector.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "aarch64-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+#include "AArch64GenGlobalISel.inc"
+
+AArch64InstructionSelector::AArch64InstructionSelector(
+    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
+    const AArch64RegisterBankInfo &RBI)
+  : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+      TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+
+// FIXME: This should be target-independent, inferred from the types declared
+// for each class in the bank.
+static const TargetRegisterClass *
+getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
+                         const RegisterBankInfo &RBI) {
+  if (RB.getID() == AArch64::GPRRegBankID) {
+    if (Ty.getSizeInBits() <= 32)
+      return &AArch64::GPR32RegClass;
+    if (Ty.getSizeInBits() == 64)
+      return &AArch64::GPR64RegClass;
+    return nullptr;
+  }
+
+  if (RB.getID() == AArch64::FPRRegBankID) {
+    if (Ty.getSizeInBits() == 32)
+      return &AArch64::FPR32RegClass;
+    if (Ty.getSizeInBits() == 64)
+      return &AArch64::FPR64RegClass;
+    if (Ty.getSizeInBits() == 128)
+      return &AArch64::FPR128RegClass;
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+/// Check whether \p I is a currently unsupported binary operation:
+/// - it has an unsized type
+/// - an operand is not a vreg
+/// - all operands are not in the same bank
+/// These are checks that should someday live in the verifier, but right now,
+/// these are mostly limitations of the aarch64 selector.
+static bool unsupportedBinOp(const MachineInstr &I,
+                             const AArch64RegisterBankInfo &RBI,
+                             const MachineRegisterInfo &MRI,
+                             const AArch64RegisterInfo &TRI) {
+  LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  if (!Ty.isValid()) {
+    DEBUG(dbgs() << "Generic binop register should be typed\n");
+    return true;
+  }
+
+  const RegisterBank *PrevOpBank = nullptr;
+  for (auto &MO : I.operands()) {
+    // FIXME: Support non-register operands.
+    if (!MO.isReg()) {
+      DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
+      return true;
+    }
+
+    // FIXME: Can generic operations have physical registers operands? If
+    // so, this will need to be taught about that, and we'll need to get the
+    // bank out of the minimal class for the register.
+    // Either way, this needs to be documented (and possibly verified).
+    if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+      DEBUG(dbgs() << "Generic inst has physical register operand\n");
+      return true;
+    }
+
+    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
+    if (!OpBank) {
+      DEBUG(dbgs() << "Generic register has no bank or class\n");
+      return true;
+    }
+
+    if (PrevOpBank && OpBank != PrevOpBank) {
+      DEBUG(dbgs() << "Generic inst operands have different banks\n");
+      return true;
+    }
+    PrevOpBank = OpBank;
+  }
+  return false;
+}
+
+/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
+/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID
+/// and of size \p OpSize.
+/// \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
+                               unsigned OpSize) {
+  switch (RegBankID) {
+  case AArch64::GPRRegBankID:
+    if (OpSize <= 32) {
+      assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV &&
+                               GenericOpc != TargetOpcode::G_UDIV &&
+                               GenericOpc != TargetOpcode::G_LSHR &&
+                               GenericOpc != TargetOpcode::G_ASHR)) &&
+             "operation should have been legalized before now");
+
+      switch (GenericOpc) {
+      case TargetOpcode::G_OR:
+        return AArch64::ORRWrr;
+      case TargetOpcode::G_XOR:
+        return AArch64::EORWrr;
+      case TargetOpcode::G_AND:
+        return AArch64::ANDWrr;
+      case TargetOpcode::G_ADD:
+        assert(OpSize != 32 && "s32 G_ADD should have been selected");
+        return AArch64::ADDWrr;
+      case TargetOpcode::G_SUB:
+        return AArch64::SUBWrr;
+      case TargetOpcode::G_SHL:
+        return AArch64::LSLVWr;
+      case TargetOpcode::G_LSHR:
+        return AArch64::LSRVWr;
+      case TargetOpcode::G_ASHR:
+        return AArch64::ASRVWr;
+      case TargetOpcode::G_SDIV:
+        return AArch64::SDIVWr;
+      case TargetOpcode::G_UDIV:
+        return AArch64::UDIVWr;
+      default:
+        return GenericOpc;
+      }
+    } else if (OpSize == 64) {
+      switch (GenericOpc) {
+      case TargetOpcode::G_OR:
+        return AArch64::ORRXrr;
+      case TargetOpcode::G_XOR:
+        return AArch64::EORXrr;
+      case TargetOpcode::G_AND:
+        return AArch64::ANDXrr;
+      case TargetOpcode::G_GEP:
+        return AArch64::ADDXrr;
+      case TargetOpcode::G_SUB:
+        return AArch64::SUBXrr;
+      case TargetOpcode::G_SHL:
+        return AArch64::LSLVXr;
+      case TargetOpcode::G_LSHR:
+        return AArch64::LSRVXr;
+      case TargetOpcode::G_ASHR:
+        return AArch64::ASRVXr;
+      case TargetOpcode::G_SDIV:
+        return AArch64::SDIVXr;
+      case TargetOpcode::G_UDIV:
+        return AArch64::UDIVXr;
+      default:
+        return GenericOpc;
+      }
+    }
+  case AArch64::FPRRegBankID:
+    switch (OpSize) {
+    case 32:
+      switch (GenericOpc) {
+      case TargetOpcode::G_FADD:
+        return AArch64::FADDSrr;
+      case TargetOpcode::G_FSUB:
+        return AArch64::FSUBSrr;
+      case TargetOpcode::G_FMUL:
+        return AArch64::FMULSrr;
+      case TargetOpcode::G_FDIV:
+        return AArch64::FDIVSrr;
+      default:
+        return GenericOpc;
+      }
+    case 64:
+      switch (GenericOpc) {
+      case TargetOpcode::G_FADD:
+        return AArch64::FADDDrr;
+      case TargetOpcode::G_FSUB:
+        return AArch64::FSUBDrr;
+      case TargetOpcode::G_FMUL:
+        return AArch64::FMULDrr;
+      case TargetOpcode::G_FDIV:
+        return AArch64::FDIVDrr;
+      case TargetOpcode::G_OR:
+        return AArch64::ORRv8i8;
+      default:
+        return GenericOpc;
+      }
+    }
+  };
+  return GenericOpc;
+}
+
+/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
+/// appropriate for the (value) register bank \p RegBankID and of memory access
+/// size \p OpSize.  This returns the variant with the base+unsigned-immediate
+/// addressing mode (e.g., LDRXui).
+/// \returns \p GenericOpc if the combination is unsupported.
+static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
+                                    unsigned OpSize) {
+  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
+  switch (RegBankID) {
+  case AArch64::GPRRegBankID:
+    switch (OpSize) {
+    case 8:
+      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
+    case 16:
+      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
+    case 32:
+      return isStore ? AArch64::STRWui : AArch64::LDRWui;
+    case 64:
+      return isStore ? AArch64::STRXui : AArch64::LDRXui;
+    }
+  case AArch64::FPRRegBankID:
+    switch (OpSize) {
+    case 8:
+      return isStore ? AArch64::STRBui : AArch64::LDRBui;
+    case 16:
+      return isStore ? AArch64::STRHui : AArch64::LDRHui;
+    case 32:
+      return isStore ? AArch64::STRSui : AArch64::LDRSui;
+    case 64:
+      return isStore ? AArch64::STRDui : AArch64::LDRDui;
+    }
+  };
+  return GenericOpc;
+}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                       const RegisterBankInfo &RBI) {
+
+  unsigned DstReg = I.getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+    assert(I.isCopy() && "Generic operators do not allow physical registers");
+    return true;
+  }
+
+  const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+  unsigned SrcReg = I.getOperand(1).getReg();
+  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+  (void)SrcSize;
+  assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+         "No phys reg on generic operators");
+  assert(
+      (DstSize == SrcSize ||
+       // Copies are a mean to setup initial types, the number of
+       // bits may not exactly match.
+       (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+        DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) ||
+       // Copies are a mean to copy bits around, as long as we are
+       // on the same register class, that's fine. Otherwise, that
+       // means we need some SUBREG_TO_REG or AND & co.
+       (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
+      "Copy with different width?!");
+  assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) &&
+         "GPRs cannot get more than 64-bit width values");
+  const TargetRegisterClass *RC = nullptr;
+
+  if (RegBank.getID() == AArch64::FPRRegBankID) {
+    if (DstSize <= 32)
+      RC = &AArch64::FPR32RegClass;
+    else if (DstSize <= 64)
+      RC = &AArch64::FPR64RegClass;
+    else if (DstSize <= 128)
+      RC = &AArch64::FPR128RegClass;
+    else {
+      DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
+      return false;
+    }
+  } else {
+    assert(RegBank.getID() == AArch64::GPRRegBankID &&
+           "Bitcast for the flags?");
+    RC =
+        DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass;
+  }
+
+  // No need to constrain SrcReg. It will get constrained when
+  // we hit another of its use or its defs.
+  // Copies do not have constraints.
+  if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+                 << " operand\n");
+    return false;
+  }
+  I.setDesc(TII.get(AArch64::COPY));
+  return true;
+}
+
+static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
+  if (!DstTy.isScalar() || !SrcTy.isScalar())
+    return GenericOpc;
+
+  const unsigned DstSize = DstTy.getSizeInBits();
+  const unsigned SrcSize = SrcTy.getSizeInBits();
+
+  switch (DstSize) {
+  case 32:
+    switch (SrcSize) {
+    case 32:
+      switch (GenericOpc) {
+      case TargetOpcode::G_SITOFP:
+        return AArch64::SCVTFUWSri;
+      case TargetOpcode::G_UITOFP:
+        return AArch64::UCVTFUWSri;
+      case TargetOpcode::G_FPTOSI:
+        return AArch64::FCVTZSUWSr;
+      case TargetOpcode::G_FPTOUI:
+        return AArch64::FCVTZUUWSr;
+      default:
+        return GenericOpc;
+      }
+    case 64:
+      switch (GenericOpc) {
+      case TargetOpcode::G_SITOFP:
+        return AArch64::SCVTFUXSri;
+      case TargetOpcode::G_UITOFP:
+        return AArch64::UCVTFUXSri;
+      case TargetOpcode::G_FPTOSI:
+        return AArch64::FCVTZSUWDr;
+      case TargetOpcode::G_FPTOUI:
+        return AArch64::FCVTZUUWDr;
+      default:
+        return GenericOpc;
+      }
+    default:
+      return GenericOpc;
+    }
+  case 64:
+    switch (SrcSize) {
+    case 32:
+      switch (GenericOpc) {
+      case TargetOpcode::G_SITOFP:
+        return AArch64::SCVTFUWDri;
+      case TargetOpcode::G_UITOFP:
+        return AArch64::UCVTFUWDri;
+      case TargetOpcode::G_FPTOSI:
+        return AArch64::FCVTZSUXSr;
+      case TargetOpcode::G_FPTOUI:
+        return AArch64::FCVTZUUXSr;
+      default:
+        return GenericOpc;
+      }
+    case 64:
+      switch (GenericOpc) {
+      case TargetOpcode::G_SITOFP:
+        return AArch64::SCVTFUXDri;
+      case TargetOpcode::G_UITOFP:
+        return AArch64::UCVTFUXDri;
+      case TargetOpcode::G_FPTOSI:
+        return AArch64::FCVTZSUXDr;
+      case TargetOpcode::G_FPTOUI:
+        return AArch64::FCVTZUUXDr;
+      default:
+        return GenericOpc;
+      }
+    default:
+      return GenericOpc;
+    }
+  default:
+    return GenericOpc;
+  };
+  return GenericOpc;
+}
+
+static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
+  switch (P) {
+  default:
+    llvm_unreachable("Unknown condition code!");
+  case CmpInst::ICMP_NE:
+    return AArch64CC::NE;
+  case CmpInst::ICMP_EQ:
+    return AArch64CC::EQ;
+  case CmpInst::ICMP_SGT:
+    return AArch64CC::GT;
+  case CmpInst::ICMP_SGE:
+    return AArch64CC::GE;
+  case CmpInst::ICMP_SLT:
+    return AArch64CC::LT;
+  case CmpInst::ICMP_SLE:
+    return AArch64CC::LE;
+  case CmpInst::ICMP_UGT:
+    return AArch64CC::HI;
+  case CmpInst::ICMP_UGE:
+    return AArch64CC::HS;
+  case CmpInst::ICMP_ULT:
+    return AArch64CC::LO;
+  case CmpInst::ICMP_ULE:
+    return AArch64CC::LS;
+  }
+}
+
+static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
+                                      AArch64CC::CondCode &CondCode,
+                                      AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (P) {
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  case CmpInst::FCMP_OEQ:
+    CondCode = AArch64CC::EQ;
+    break;
+  case CmpInst::FCMP_OGT:
+    CondCode = AArch64CC::GT;
+    break;
+  case CmpInst::FCMP_OGE:
+    CondCode = AArch64CC::GE;
+    break;
+  case CmpInst::FCMP_OLT:
+    CondCode = AArch64CC::MI;
+    break;
+  case CmpInst::FCMP_OLE:
+    CondCode = AArch64CC::LS;
+    break;
+  case CmpInst::FCMP_ONE:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case CmpInst::FCMP_ORD:
+    CondCode = AArch64CC::VC;
+    break;
+  case CmpInst::FCMP_UNO:
+    CondCode = AArch64CC::VS;
+    break;
+  case CmpInst::FCMP_UEQ:
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case CmpInst::FCMP_UGT:
+    CondCode = AArch64CC::HI;
+    break;
+  case CmpInst::FCMP_UGE:
+    CondCode = AArch64CC::PL;
+    break;
+  case CmpInst::FCMP_ULT:
+    CondCode = AArch64CC::LT;
+    break;
+  case CmpInst::FCMP_ULE:
+    CondCode = AArch64CC::LE;
+    break;
+  case CmpInst::FCMP_UNE:
+    CondCode = AArch64CC::NE;
+    break;
+  }
+}
+
+bool AArch64InstructionSelector::select(MachineInstr &I) const {
+  assert(I.getParent() && "Instruction should be in a basic block!");
+  assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+  MachineBasicBlock &MBB = *I.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Opcode = I.getOpcode();
+  if (!isPreISelGenericOpcode(I.getOpcode())) {
+    // Certain non-generic instructions also need some special handling.
+
+    if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
+      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+    if (Opcode == TargetOpcode::PHI) {
+      const unsigned DefReg = I.getOperand(0).getReg();
+      const LLT DefTy = MRI.getType(DefReg);
+
+      const TargetRegisterClass *DefRC = nullptr;
+      if (TargetRegisterInfo::isPhysicalRegister(DefReg)) {
+        DefRC = TRI.getRegClass(DefReg);
+      } else {
+        const RegClassOrRegBank &RegClassOrBank =
+            MRI.getRegClassOrRegBank(DefReg);
+
+        DefRC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+        if (!DefRC) {
+          if (!DefTy.isValid()) {
+            DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
+            return false;
+          }
+          const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+          DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
+          if (!DefRC) {
+            DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
+            return false;
+          }
+        }
+      }
+
+      return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+    }
+
+    if (I.isCopy())
+      return selectCopy(I, TII, MRI, TRI, RBI);
+
+    return true;
+  }
+
+
+  if (I.getNumOperands() != I.getNumExplicitOperands()) {
+    DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n");
+    return false;
+  }
+
+  if (selectImpl(I))
+    return true;
+
+  LLT Ty =
+      I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
+
+  switch (Opcode) {
+  case TargetOpcode::G_BRCOND: {
+    if (Ty.getSizeInBits() > 32) {
+      // We shouldn't need this on AArch64, but it would be implemented as an
+      // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
+      // bit being tested is < 32.
+      DEBUG(dbgs() << "G_BRCOND has type: " << Ty
+                   << ", expected at most 32-bits");
+      return false;
+    }
+
+    const unsigned CondReg = I.getOperand(0).getReg();
+    MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+
+    auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
+                   .addUse(CondReg)
+                   .addImm(/*bit offset=*/0)
+                   .addMBB(DestMBB);
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_FCONSTANT:
+  case TargetOpcode::G_CONSTANT: {
+    const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
+
+    const LLT s32 = LLT::scalar(32);
+    const LLT s64 = LLT::scalar(64);
+    const LLT p0 = LLT::pointer(0, 64);
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const LLT DefTy = MRI.getType(DefReg);
+    const unsigned DefSize = DefTy.getSizeInBits();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    // FIXME: Redundant check, but even less readable when factored out.
+    if (isFP) {
+      if (Ty != s32 && Ty != s64) {
+        DEBUG(dbgs() << "Unable to materialize FP " << Ty
+                     << " constant, expected: " << s32 << " or " << s64
+                     << '\n');
+        return false;
+      }
+
+      if (RB.getID() != AArch64::FPRRegBankID) {
+        DEBUG(dbgs() << "Unable to materialize FP " << Ty
+                     << " constant on bank: " << RB << ", expected: FPR\n");
+        return false;
+      }
+    } else {
+      if (Ty != s32 && Ty != s64 && Ty != p0) {
+        DEBUG(dbgs() << "Unable to materialize integer " << Ty
+                     << " constant, expected: " << s32 << ", " << s64 << ", or "
+                     << p0 << '\n');
+        return false;
+      }
+
+      if (RB.getID() != AArch64::GPRRegBankID) {
+        DEBUG(dbgs() << "Unable to materialize integer " << Ty
+                     << " constant on bank: " << RB << ", expected: GPR\n");
+        return false;
+      }
+    }
+
+    const unsigned MovOpc =
+        DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+
+    I.setDesc(TII.get(MovOpc));
+
+    if (isFP) {
+      const TargetRegisterClass &GPRRC =
+          DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
+      const TargetRegisterClass &FPRRC =
+          DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
+
+      const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC);
+      MachineOperand &RegOp = I.getOperand(0);
+      RegOp.setReg(DefGPRReg);
+
+      BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(),
+              TII.get(AArch64::COPY))
+          .addDef(DefReg)
+          .addUse(DefGPRReg);
+
+      if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
+        DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
+        return false;
+      }
+
+      MachineOperand &ImmOp = I.getOperand(1);
+      // FIXME: Is going through int64_t always correct?
+      ImmOp.ChangeToImmediate(
+          ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
+    } else {
+      uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
+      I.getOperand(1).ChangeToImmediate(Val);
+    }
+
+    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+    return true;
+  }
+
+  case TargetOpcode::G_FRAME_INDEX: {
+    // allocas and G_FRAME_INDEX are only supported in addrspace(0).
+    if (Ty != LLT::pointer(0, 64)) {
+      DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
+            << ", expected: " << LLT::pointer(0, 64) << '\n');
+      return false;
+    }
+
+    I.setDesc(TII.get(AArch64::ADDXri));
+
+    // MOs for a #0 shifted immediate.
+    I.addOperand(MachineOperand::CreateImm(0));
+    I.addOperand(MachineOperand::CreateImm(0));
+
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_GLOBAL_VALUE: {
+    auto GV = I.getOperand(1).getGlobal();
+    if (GV->isThreadLocal()) {
+      // FIXME: we don't support TLS yet.
+      return false;
+    }
+    unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM);
+    if (OpFlags & AArch64II::MO_GOT) {
+      I.setDesc(TII.get(AArch64::LOADgot));
+      I.getOperand(1).setTargetFlags(OpFlags);
+    } else {
+      I.setDesc(TII.get(AArch64::MOVaddr));
+      I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
+      MachineInstrBuilder MIB(MF, I);
+      MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
+                           OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+    }
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_STORE: {
+    LLT MemTy = Ty;
+    LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
+
+    if (PtrTy != LLT::pointer(0, 64)) {
+      DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
+                   << ", expected: " << LLT::pointer(0, 64) << '\n');
+      return false;
+    }
+
+#ifndef NDEBUG
+    // Sanity-check the pointer register.
+    const unsigned PtrReg = I.getOperand(1).getReg();
+    const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+    assert(PtrRB.getID() == AArch64::GPRRegBankID &&
+           "Load/Store pointer operand isn't a GPR");
+    assert(MRI.getType(PtrReg).isPointer() &&
+           "Load/Store pointer operand isn't a pointer");
+#endif
+
+    const unsigned ValReg = I.getOperand(0).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
+
+    const unsigned NewOpc =
+        selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits());
+    if (NewOpc == I.getOpcode())
+      return false;
+
+    I.setDesc(TII.get(NewOpc));
+
+    I.addOperand(MachineOperand::CreateImm(0));
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_MUL: {
+    // Reject the various things we don't support yet.
+    if (unsupportedBinOp(I, RBI, MRI, TRI))
+      return false;
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    if (RB.getID() != AArch64::GPRRegBankID) {
+      DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+      return false;
+    }
+
+    unsigned ZeroReg;
+    unsigned NewOpc;
+    if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
+      NewOpc = AArch64::MADDWrrr;
+      ZeroReg = AArch64::WZR;
+    } else if (Ty == LLT::scalar(64)) {
+      NewOpc = AArch64::MADDXrrr;
+      ZeroReg = AArch64::XZR;
+    } else {
+      DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
+                   << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+      return false;
+    }
+
+    I.setDesc(TII.get(NewOpc));
+
+    I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
+
+    // Now that we selected an opcode, we need to constrain the register
+    // operands to use appropriate classes.
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR:
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR:
+  case TargetOpcode::G_SDIV:
+  case TargetOpcode::G_UDIV:
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_GEP: {
+    // Reject the various things we don't support yet.
+    if (unsupportedBinOp(I, RBI, MRI, TRI))
+      return false;
+
+    const unsigned OpSize = Ty.getSizeInBits();
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
+    if (NewOpc == I.getOpcode())
+      return false;
+
+    I.setDesc(TII.get(NewOpc));
+    // FIXME: Should the type be always reset in setDesc?
+
+    // Now that we selected an opcode, we need to constrain the register
+    // operands to use appropriate classes.
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  case TargetOpcode::G_PTRTOINT:
+  case TargetOpcode::G_TRUNC: {
+    const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+    const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
+
+    const unsigned DstReg = I.getOperand(0).getReg();
+    const unsigned SrcReg = I.getOperand(1).getReg();
+
+    const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+    const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+    if (DstRB.getID() != SrcRB.getID()) {
+      DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+      return false;
+    }
+
+    if (DstRB.getID() == AArch64::GPRRegBankID) {
+      const TargetRegisterClass *DstRC =
+          getRegClassForTypeOnBank(DstTy, DstRB, RBI);
+      if (!DstRC)
+        return false;
+
+      const TargetRegisterClass *SrcRC =
+          getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
+      if (!SrcRC)
+        return false;
+
+      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+          !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+        DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+        return false;
+      }
+
+      if (DstRC == SrcRC) {
+        // Nothing to be done
+      } else if (DstRC == &AArch64::GPR32RegClass &&
+                 SrcRC == &AArch64::GPR64RegClass) {
+        I.getOperand(1).setSubReg(AArch64::sub_32);
+      } else {
+        return false;
+      }
+
+      I.setDesc(TII.get(TargetOpcode::COPY));
+      return true;
+    } else if (DstRB.getID() == AArch64::FPRRegBankID) {
+      if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
+        I.setDesc(TII.get(AArch64::XTNv4i16));
+        constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  case TargetOpcode::G_ANYEXT: {
+    const unsigned DstReg = I.getOperand(0).getReg();
+    const unsigned SrcReg = I.getOperand(1).getReg();
+
+    const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
+    if (RBDst.getID() != AArch64::GPRRegBankID) {
+      DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n");
+      return false;
+    }
+
+    const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
+    if (RBSrc.getID() != AArch64::GPRRegBankID) {
+      DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n");
+      return false;
+    }
+
+    const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+
+    if (DstSize == 0) {
+      DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
+      return false;
+    }
+
+    if (DstSize != 64 && DstSize > 32) {
+      DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
+                   << ", expected: 32 or 64\n");
+      return false;
+    }
+    // At this point G_ANYEXT is just like a plain COPY, but we need
+    // to explicitly form the 64-bit value if any.
+    if (DstSize > 32) {
+      unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
+      BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
+          .addDef(ExtSrc)
+          .addImm(0)
+          .addUse(SrcReg)
+          .addImm(AArch64::sub_32);
+      I.getOperand(1).setReg(ExtSrc);
+    }
+    return selectCopy(I, TII, MRI, TRI, RBI);
+  }
+
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_SEXT: {
+    unsigned Opcode = I.getOpcode();
+    const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
+              SrcTy = MRI.getType(I.getOperand(1).getReg());
+    const bool isSigned = Opcode == TargetOpcode::G_SEXT;
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const unsigned SrcReg = I.getOperand(1).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    if (RB.getID() != AArch64::GPRRegBankID) {
+      DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB
+                   << ", expected: GPR\n");
+      return false;
+    }
+
+    MachineInstr *ExtI;
+    if (DstTy == LLT::scalar(64)) {
+      // FIXME: Can we avoid manually doing this?
+      if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
+        DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+                     << " operand\n");
+        return false;
+      }
+
+      const unsigned SrcXReg =
+          MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+      BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
+          .addDef(SrcXReg)
+          .addImm(0)
+          .addUse(SrcReg)
+          .addImm(AArch64::sub_32);
+
+      const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri;
+      ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+                 .addDef(DefReg)
+                 .addUse(SrcXReg)
+                 .addImm(0)
+                 .addImm(SrcTy.getSizeInBits() - 1);
+    } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) {
+      const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri;
+      ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
+                 .addDef(DefReg)
+                 .addUse(SrcReg)
+                 .addImm(0)
+                 .addImm(SrcTy.getSizeInBits() - 1);
+    } else {
+      return false;
+    }
+
+    constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
+
+    I.eraseFromParent();
+    return true;
+  }
+
+  case TargetOpcode::G_SITOFP:
+  case TargetOpcode::G_UITOFP:
+  case TargetOpcode::G_FPTOSI:
+  case TargetOpcode::G_FPTOUI: {
+    const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
+              SrcTy = MRI.getType(I.getOperand(1).getReg());
+    const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
+    if (NewOpc == Opcode)
+      return false;
+
+    I.setDesc(TII.get(NewOpc));
+    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+    return true;
+  }
+
+
+  case TargetOpcode::G_INTTOPTR:
+  case TargetOpcode::G_BITCAST:
+    return selectCopy(I, TII, MRI, TRI, RBI);
+
+  case TargetOpcode::G_FPEXT: {
+    if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(64)) {
+      DEBUG(dbgs() << "G_FPEXT to type " << Ty
+                   << ", expected: " << LLT::scalar(64) << '\n');
+      return false;
+    }
+
+    if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(32)) {
+      DEBUG(dbgs() << "G_FPEXT from type " << Ty
+                   << ", expected: " << LLT::scalar(32) << '\n');
+      return false;
+    }
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    if (RB.getID() != AArch64::FPRRegBankID) {
+      DEBUG(dbgs() << "G_FPEXT on bank: " << RB << ", expected: FPR\n");
+      return false;
+    }
+
+    I.setDesc(TII.get(AArch64::FCVTDSr));
+    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+    return true;
+  }
+
+  case TargetOpcode::G_FPTRUNC: {
+    if (MRI.getType(I.getOperand(0).getReg()) != LLT::scalar(32)) {
+      DEBUG(dbgs() << "G_FPTRUNC to type " << Ty
+                   << ", expected: " << LLT::scalar(32) << '\n');
+      return false;
+    }
+
+    if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(64)) {
+      DEBUG(dbgs() << "G_FPTRUNC from type " << Ty
+                   << ", expected: " << LLT::scalar(64) << '\n');
+      return false;
+    }
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+    if (RB.getID() != AArch64::FPRRegBankID) {
+      DEBUG(dbgs() << "G_FPTRUNC on bank: " << RB << ", expected: FPR\n");
+      return false;
+    }
+
+    I.setDesc(TII.get(AArch64::FCVTSDr));
+    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+
+    return true;
+  }
+
+  case TargetOpcode::G_SELECT: {
+    if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
+      DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
+                   << ", expected: " << LLT::scalar(1) << '\n');
+      return false;
+    }
+
+    const unsigned CondReg = I.getOperand(1).getReg();
+    const unsigned TReg = I.getOperand(2).getReg();
+    const unsigned FReg = I.getOperand(3).getReg();
+
+    unsigned CSelOpc = 0;
+
+    if (Ty == LLT::scalar(32)) {
+      CSelOpc = AArch64::CSELWr;
+    } else if (Ty == LLT::scalar(64)) {
+      CSelOpc = AArch64::CSELXr;
+    } else {
+      return false;
+    }
+
+    MachineInstr &TstMI =
+        *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
+             .addDef(AArch64::WZR)
+             .addUse(CondReg)
+             .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+
+    MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
+                                .addDef(I.getOperand(0).getReg())
+                                .addUse(TReg)
+                                .addUse(FReg)
+                                .addImm(AArch64CC::NE);
+
+    constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
+    constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);
+
+    I.eraseFromParent();
+    return true;
+  }
+  case TargetOpcode::G_ICMP: {
+    if (Ty != LLT::scalar(1)) {
+      DEBUG(dbgs() << "G_ICMP result has type: " << Ty
+                   << ", expected: " << LLT::scalar(1) << '\n');
+      return false;
+    }
+
+    unsigned CmpOpc = 0;
+    unsigned ZReg = 0;
+
+    LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
+    if (CmpTy == LLT::scalar(32)) {
+      CmpOpc = AArch64::SUBSWrr;
+      ZReg = AArch64::WZR;
+    } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
+      CmpOpc = AArch64::SUBSXrr;
+      ZReg = AArch64::XZR;
+    } else {
+      return false;
+    }
+
+    const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
+        (CmpInst::Predicate)I.getOperand(1).getPredicate());
+
+    MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+                               .addDef(ZReg)
+                               .addUse(I.getOperand(2).getReg())
+                               .addUse(I.getOperand(3).getReg());
+
+    MachineInstr &CSetMI =
+        *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+             .addDef(I.getOperand(0).getReg())
+             .addUse(AArch64::WZR)
+             .addUse(AArch64::WZR)
+             .addImm(CC);
+
+    constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
+    constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
+
+    I.eraseFromParent();
+    return true;
+  }
+
+  case TargetOpcode::G_FCMP: {
+    if (Ty != LLT::scalar(1)) {
+      DEBUG(dbgs() << "G_FCMP result has type: " << Ty
+                   << ", expected: " << LLT::scalar(1) << '\n');
+      return false;
+    }
+
+    unsigned CmpOpc = 0;
+    LLT CmpTy = MRI.getType(I.getOperand(2).getReg());
+    if (CmpTy == LLT::scalar(32)) {
+      CmpOpc = AArch64::FCMPSrr;
+    } else if (CmpTy == LLT::scalar(64)) {
+      CmpOpc = AArch64::FCMPDrr;
+    } else {
+      return false;
+    }
+
+    // FIXME: regbank
+
+    AArch64CC::CondCode CC1, CC2;
+    changeFCMPPredToAArch64CC(
+        (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
+
+    MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
+                               .addUse(I.getOperand(2).getReg())
+                               .addUse(I.getOperand(3).getReg());
+
+    const unsigned DefReg = I.getOperand(0).getReg();
+    unsigned Def1Reg = DefReg;
+    if (CC2 != AArch64CC::AL)
+      Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+
+    MachineInstr &CSetMI =
+        *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+             .addDef(Def1Reg)
+             .addUse(AArch64::WZR)
+             .addUse(AArch64::WZR)
+             .addImm(CC1);
+
+    if (CC2 != AArch64CC::AL) {
+      unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+      MachineInstr &CSet2MI =
+          *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
+               .addDef(Def2Reg)
+               .addUse(AArch64::WZR)
+               .addUse(AArch64::WZR)
+               .addImm(CC2);
+      MachineInstr &OrMI =
+          *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
+               .addDef(DefReg)
+               .addUse(Def1Reg)
+               .addUse(Def2Reg);
+      constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
+      constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
+    }
+
+    constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI);
+    constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
+
+    I.eraseFromParent();
+    return true;
+  }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
new file mode 100644
index 000000000000..0d44e696ac20
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
@@ -0,0 +1,47 @@
+//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for
+/// AArch64.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+class AArch64InstrInfo;
+class AArch64RegisterBankInfo;
+class AArch64RegisterInfo;
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+  AArch64InstructionSelector(const AArch64TargetMachine &TM,
+                             const AArch64Subtarget &STI,
+                             const AArch64RegisterBankInfo &RBI);
+
+  virtual bool select(MachineInstr &I) const override;
+
+private:
+  /// tblgen-erated 'select' implementation, used as the initial selector for
+  /// the patterns that don't require complex C++.
+  bool selectImpl(MachineInstr &I) const;
+
+  const AArch64TargetMachine &TM;
+  const AArch64Subtarget &STI;
+  const AArch64InstrInfo &TII;
+  const AArch64RegisterInfo &TRI;
+  const AArch64RegisterBankInfo &RBI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
new file mode 100644
index 000000000000..83f276a8161b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -0,0 +1,204 @@
+//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the Machinelegalizer class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64LegalizerInfo::AArch64LegalizerInfo() {
+  using namespace TargetOpcode;
+  const LLT p0 = LLT::pointer(0, 64);
+  const LLT s1 = LLT::scalar(1);
+  const LLT s8 = LLT::scalar(8);
+  const LLT s16 = LLT::scalar(16);
+  const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
+  const LLT v2s32 = LLT::vector(2, 32);
+  const LLT v4s32 = LLT::vector(4, 32);
+  const LLT v2s64 = LLT::vector(2, 64);
+
+  for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
+    // These operations naturally get the right answer when used on
+    // GPR32, even if the actual type is narrower.
+    for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
+      setAction({BinOp, Ty}, Legal);
+  }
+
+  setAction({G_GEP, p0}, Legal);
+  setAction({G_GEP, 1, s64}, Legal);
+
+  for (auto Ty : {s1, s8, s16, s32})
+    setAction({G_GEP, 1, Ty}, WidenScalar);
+
+  for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
+    for (auto Ty : {s32, s64})
+      setAction({BinOp, Ty}, Legal);
+
+    for (auto Ty : {s1, s8, s16})
+      setAction({BinOp, Ty}, WidenScalar);
+  }
+
+  for (auto BinOp : { G_SREM, G_UREM })
+    for (auto Ty : { s1, s8, s16, s32, s64 })
+      setAction({BinOp, Ty}, Lower);
+
+  for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) {
+    for (auto Ty : { s32, s64 })
+      setAction({Op, Ty}, Legal);
+
+    setAction({Op, 1, s1}, Legal);
+  }
+
+  for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+    for (auto Ty : {s32, s64})
+      setAction({BinOp, Ty}, Legal);
+
+  setAction({G_FREM, s32}, Libcall);
+  setAction({G_FREM, s64}, Libcall);
+
+  for (auto MemOp : {G_LOAD, G_STORE}) {
+    for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
+      setAction({MemOp, Ty}, Legal);
+
+    setAction({MemOp, s1}, WidenScalar);
+
+    // And everything's fine in addrspace 0.
+    setAction({MemOp, 1, p0}, Legal);
+  }
+
+  // Constants
+  for (auto Ty : {s32, s64}) {
+    setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+    setAction({TargetOpcode::G_FCONSTANT, Ty}, Legal);
+  }
+
+  setAction({G_CONSTANT, p0}, Legal);
+
+  for (auto Ty : {s1, s8, s16})
+    setAction({TargetOpcode::G_CONSTANT, Ty}, WidenScalar);
+
+  setAction({TargetOpcode::G_FCONSTANT, s16}, WidenScalar);
+
+  setAction({G_ICMP, s1}, Legal);
+  setAction({G_ICMP, 1, s32}, Legal);
+  setAction({G_ICMP, 1, s64}, Legal);
+  setAction({G_ICMP, 1, p0}, Legal);
+
+  for (auto Ty : {s1, s8, s16}) {
+    setAction({G_ICMP, 1, Ty}, WidenScalar);
+  }
+
+  setAction({G_FCMP, s1}, Legal);
+  setAction({G_FCMP, 1, s32}, Legal);
+  setAction({G_FCMP, 1, s64}, Legal);
+
+  // Extensions
+  for (auto Ty : { s1, s8, s16, s32, s64 }) {
+    setAction({G_ZEXT, Ty}, Legal);
+    setAction({G_SEXT, Ty}, Legal);
+    setAction({G_ANYEXT, Ty}, Legal);
+  }
+
+  for (auto Ty : { s1, s8, s16, s32 }) {
+    setAction({G_ZEXT, 1, Ty}, Legal);
+    setAction({G_SEXT, 1, Ty}, Legal);
+    setAction({G_ANYEXT, 1, Ty}, Legal);
+  }
+
+  setAction({G_FPEXT, s64}, Legal);
+  setAction({G_FPEXT, 1, s32}, Legal);
+
+  // Truncations
+  for (auto Ty : { s16, s32 })
+    setAction({G_FPTRUNC, Ty}, Legal);
+
+  for (auto Ty : { s32, s64 })
+    setAction({G_FPTRUNC, 1, Ty}, Legal);
+
+  for (auto Ty : { s1, s8, s16, s32 })
+    setAction({G_TRUNC, Ty}, Legal);
+
+  for (auto Ty : { s8, s16, s32, s64 })
+    setAction({G_TRUNC, 1, Ty}, Legal);
+
+  // Conversions
+  for (auto Ty : { s1, s8, s16, s32, s64 }) {
+    setAction({G_FPTOSI, 0, Ty}, Legal);
+    setAction({G_FPTOUI, 0, Ty}, Legal);
+    setAction({G_SITOFP, 1, Ty}, Legal);
+    setAction({G_UITOFP, 1, Ty}, Legal);
+  }
+
+  for (auto Ty : { s32, s64 }) {
+    setAction({G_FPTOSI, 1, Ty}, Legal);
+    setAction({G_FPTOUI, 1, Ty}, Legal);
+    setAction({G_SITOFP, 0, Ty}, Legal);
+    setAction({G_UITOFP, 0, Ty}, Legal);
+  }
+
+  // Control-flow
+  for (auto Ty : {s1, s8, s16, s32})
+    setAction({G_BRCOND, Ty}, Legal);
+
+  // Select
+  for (auto Ty : {s1, s8, s16, s32, s64})
+    setAction({G_SELECT, Ty}, Legal);
+
+  setAction({G_SELECT, 1, s1}, Legal);
+
+  // Pointer-handling
+  setAction({G_FRAME_INDEX, p0}, Legal);
+  setAction({G_GLOBAL_VALUE, p0}, Legal);
+
+  for (auto Ty : {s1, s8, s16, s32, s64})
+    setAction({G_PTRTOINT, 0, Ty}, Legal);
+
+  setAction({G_PTRTOINT, 1, p0}, Legal);
+
+  setAction({G_INTTOPTR, 0, p0}, Legal);
+  setAction({G_INTTOPTR, 1, s64}, Legal);
+
+  // Casts for 32 and 64-bit width type are just copies.
+  for (auto Ty : {s1, s8, s16, s32, s64}) {
+    setAction({G_BITCAST, 0, Ty}, Legal);
+    setAction({G_BITCAST, 1, Ty}, Legal);
+  }
+
+  // For the sake of copying bits around, the type does not really
+  // matter as long as it fits a register.
+  for (int EltSize = 8; EltSize <= 64; EltSize *= 2) {
+    setAction({G_BITCAST, 0, LLT::vector(128/EltSize, EltSize)}, Legal);
+    setAction({G_BITCAST, 1, LLT::vector(128/EltSize, EltSize)}, Legal);
+    if (EltSize >= 64)
+      continue;
+
+    setAction({G_BITCAST, 0, LLT::vector(64/EltSize, EltSize)}, Legal);
+    setAction({G_BITCAST, 1, LLT::vector(64/EltSize, EltSize)}, Legal);
+    if (EltSize >= 32)
+      continue;
+
+    setAction({G_BITCAST, 0, LLT::vector(32/EltSize, EltSize)}, Legal);
+    setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
+  }
+
+  computeTables();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
new file mode 100644
index 000000000000..feacbef9f147
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -0,0 +1,30 @@
+//===- AArch64LegalizerInfo --------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the Machinelegalizer class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the information for the target register banks.
+class AArch64LegalizerInfo : public LegalizerInfo {
+public:
+  AArch64LegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..dcb05601e5f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,1728 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+          "Number of load/store from unscaled generated");
+STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+
+// The LdStLimit limits how far we search for load/store pairs.
+static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
+                                   cl::init(20), cl::Hidden);
+
+// The UpdateLimit limits how far we search for update instructions when we form
+// pre-/post-index instructions.
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
+                                     cl::Hidden);
+
+#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
+
+namespace {
+
+typedef struct LdStPairFlags {
+  // If a matching instruction is found, MergeForward is set to true if the
+  // merge is to remove the first instruction and replace the second with
+  // a pair-wise insn, and false if the reverse is true.
+  bool MergeForward;
+
+  // SExtIdx gives the index of the result of the load pair that must be
+  // extended. The value of SExtIdx assumes that the paired load produces the
+  // value in this order: (I, returned iterator), i.e., -1 means no value has
+  // to be extended, 0 means I, and 1 means the returned iterator.
+  int SExtIdx;
+
+  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+
+  void setMergeForward(bool V = true) { MergeForward = V; }
+  bool getMergeForward() const { return MergeForward; }
+
+  void setSExtIdx(int V) { SExtIdx = V; }
+  int getSExtIdx() const { return SExtIdx; }
+
+} LdStPairFlags;
+
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
+    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const AArch64Subtarget *Subtarget;
+
+  // Track which registers have been modified and used.
+  BitVector ModifiedRegs, UsedRegs;
+
+  // Scan the instructions looking for a load/store that can be combined
+  // with the current instruction into a load/store pair.
+  // Return the matching instruction if one is found, else MBB->end().
+  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+                                               LdStPairFlags &Flags,
+                                               unsigned Limit,
+                                               bool FindNarrowMerge);
+
+  // Scan the instructions looking for a store that writes to the address from
+  // which the current load instruction reads. Return true if one is found.
+  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+                         MachineBasicBlock::iterator &StoreI);
+
+  // Merge the two instructions indicated into a wider narrow store instruction.
+  MachineBasicBlock::iterator
+  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator MergeMI,
+                        const LdStPairFlags &Flags);
+
+  // Merge the two instructions indicated into a single pair-wise instruction.
+  MachineBasicBlock::iterator
+  mergePairedInsns(MachineBasicBlock::iterator I,
+                   MachineBasicBlock::iterator Paired,
+                   const LdStPairFlags &Flags);
+
+  // Promote the load that reads directly from the address stored to.
+  MachineBasicBlock::iterator
+  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+                       MachineBasicBlock::iterator StoreI);
+
+  // Scan the instruction list to find a base register update that can
+  // be combined with the current instruction (a load or store) using
+  // pre or post indexed addressing with writeback. Scan forwards.
+  MachineBasicBlock::iterator
+  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
+                                int UnscaledOffset, unsigned Limit);
+
+  // Scan the instruction list to find a base register update that can
+  // be combined with the current instruction (a load or store) using
+  // pre or post indexed addressing with writeback. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+
+  // Find an instruction that updates the base register of the ld/st
+  // instruction.
+  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
+                            unsigned BaseReg, int Offset);
+
+  // Merge a pre- or post-index base register update into a ld/st instruction.
+  MachineBasicBlock::iterator
+  mergeUpdateInsn(MachineBasicBlock::iterator I,
+                  MachineBasicBlock::iterator Update, bool IsPreIdx);
+
+  // Find and merge zero store instructions.
+  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
+
+  // Find and pair ldr/str instructions.
+  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
+
+  // Find and promote load instructions which read directly from store.
+  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
+  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
+};
+char AArch64LoadStoreOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
+                AARCH64_LOAD_STORE_OPT_NAME, false, false)
+
+static bool isNarrowStore(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return true;
+  }
+}
+
+// Scaling factor for unscaled load or store.
+static int getMemScale(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return 2;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
+    return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+    return 16;
+  }
+}
+
+static unsigned getMatchingNonSExtOpcode(unsigned Opc,
+                                         bool *IsValidLdStrOpc = nullptr) {
+  if (IsValidLdStrOpc)
+    *IsValidLdStrOpc = true;
+  switch (Opc) {
+  default:
+    if (IsValidLdStrOpc)
+      *IsValidLdStrOpc = false;
+    return UINT_MAX;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return Opc;
+  case AArch64::LDRSWui:
+    return AArch64::LDRWui;
+  case AArch64::LDURSWi:
+    return AArch64::LDURWi;
+  }
+}
+
+static unsigned getMatchingWideOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no wide equivalent!");
+  case AArch64::STRBBui:
+    return AArch64::STRHHui;
+  case AArch64::STRHHui:
+    return AArch64::STRWui;
+  case AArch64::STURBBi:
+    return AArch64::STURHHi;
+  case AArch64::STURHHi:
+    return AArch64::STURWi;
+  case AArch64::STURWi:
+    return AArch64::STURXi;
+  case AArch64::STRWui:
+    return AArch64::STRXui;
+  }
+}
+
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pairwise equivalent!");
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return AArch64::STPSi;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return AArch64::STPDi;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return AArch64::STPQi;
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return AArch64::STPWi;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return AArch64::STPXi;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return AArch64::LDPSi;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return AArch64::LDPDi;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return AArch64::LDPQi;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return AArch64::LDPWi;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return AArch64::LDPXi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return AArch64::LDPSWi;
+  }
+}
+
+static unsigned isMatchingStore(MachineInstr &LoadInst,
+                                MachineInstr &StoreInst) {
+  unsigned LdOpc = LoadInst.getOpcode();
+  unsigned StOpc = StoreInst.getOpcode();
+  switch (LdOpc) {
+  default:
+    llvm_unreachable("Unsupported load instruction!");
+  case AArch64::LDRBBui:
+    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURBBi:
+    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRHHui:
+    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+           StOpc == AArch64::STRXui;
+  case AArch64::LDURHHi:
+    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+           StOpc == AArch64::STURXi;
+  case AArch64::LDRWui:
+    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+  case AArch64::LDURWi:
+    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+  case AArch64::LDRXui:
+    return StOpc == AArch64::STRXui;
+  case AArch64::LDURXi:
+    return StOpc == AArch64::STURXi;
+  }
+}
+
+static unsigned getPreIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pre-indexed equivalent!");
+  case AArch64::STRSui:
+    return AArch64::STRSpre;
+  case AArch64::STRDui:
+    return AArch64::STRDpre;
+  case AArch64::STRQui:
+    return AArch64::STRQpre;
+  case AArch64::STRBBui:
+    return AArch64::STRBBpre;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpre;
+  case AArch64::STRWui:
+    return AArch64::STRWpre;
+  case AArch64::STRXui:
+    return AArch64::STRXpre;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpre;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpre;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpre;
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpre;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpre;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpre;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpre;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpre;
+  case AArch64::LDPSi:
+    return AArch64::LDPSpre;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpre;
+  case AArch64::LDPDi:
+    return AArch64::LDPDpre;
+  case AArch64::LDPQi:
+    return AArch64::LDPQpre;
+  case AArch64::LDPWi:
+    return AArch64::LDPWpre;
+  case AArch64::LDPXi:
+    return AArch64::LDPXpre;
+  case AArch64::STPSi:
+    return AArch64::STPSpre;
+  case AArch64::STPDi:
+    return AArch64::STPDpre;
+  case AArch64::STPQi:
+    return AArch64::STPQpre;
+  case AArch64::STPWi:
+    return AArch64::STPWpre;
+  case AArch64::STPXi:
+    return AArch64::STPXpre;
+  }
+}
+
+static unsigned getPostIndexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
+  case AArch64::STRSui:
+    return AArch64::STRSpost;
+  case AArch64::STRDui:
+    return AArch64::STRDpost;
+  case AArch64::STRQui:
+    return AArch64::STRQpost;
+  case AArch64::STRBBui:
+    return AArch64::STRBBpost;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpost;
+  case AArch64::STRWui:
+    return AArch64::STRWpost;
+  case AArch64::STRXui:
+    return AArch64::STRXpost;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpost;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpost;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpost;
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpost;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpost;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpost;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpost;
+  case AArch64::LDRSWui:
+    return AArch64::LDRSWpost;
+  case AArch64::LDPSi:
+    return AArch64::LDPSpost;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpost;
+  case AArch64::LDPDi:
+    return AArch64::LDPDpost;
+  case AArch64::LDPQi:
+    return AArch64::LDPQpost;
+  case AArch64::LDPWi:
+    return AArch64::LDPWpost;
+  case AArch64::LDPXi:
+    return AArch64::LDPXpost;
+  case AArch64::STPSi:
+    return AArch64::STPSpost;
+  case AArch64::STPDi:
+    return AArch64::STPDpost;
+  case AArch64::STPQi:
+    return AArch64::STPQpost;
+  case AArch64::STPWi:
+    return AArch64::STPWpost;
+  case AArch64::STPXi:
+    return AArch64::STPXpost;
+  }
+}
+
+static bool isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+    return true;
+  }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
+                                          unsigned PairedRegOp = 0) {
+  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
+  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
+  return MI.getOperand(Idx);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
+  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
+  return MI.getOperand(Idx);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
+  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
+  return MI.getOperand(Idx);
+}
+
+static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
+                                  MachineInstr &StoreInst,
+                                  const AArch64InstrInfo *TII) {
+  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+  int LoadSize = getMemScale(LoadInst);
+  int StoreSize = getMemScale(StoreInst);
+  int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
+                             ? getLdStOffsetOp(StoreInst).getImm()
+                             : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+  int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
+                             ? getLdStOffsetOp(LoadInst).getImm()
+                             : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+  return (UnscaledStOffset <= UnscaledLdOffset) &&
+         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
+}
+
+static bool isPromotableZeroStoreInst(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
+          isNarrowStore(Opc)) &&
+         getLdStRegOp(MI).getReg() == AArch64::WZR;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                                           MachineBasicBlock::iterator MergeMI,
+                                           const LdStPairFlags &Flags) {
+  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+         "Expected promotable zero stores.");
+
+  MachineBasicBlock::iterator NextI = I;
+  ++NextI;
+  // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way we merge will invalidate the iterator,
+  // and we don't need to scan the new instruction, as it's a pairwise
+  // instruction, which we're not considering for further action anyway.
+  if (NextI == MergeMI)
+    ++NextI;
+
+  unsigned Opc = I->getOpcode();
+  bool IsScaled = !TII->isUnscaledLdSt(Opc);
+  int OffsetStride = IsScaled ? 1 : getMemScale(*I);
+
+  bool MergeForward = Flags.getMergeForward();
+  // Insert our new paired instruction after whichever of the paired
+  // instructions MergeForward indicates.
+  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
+  // Also based on MergeForward is from where we copy the base register operand
+  // so we get the flags compatible with the input code.
+  const MachineOperand &BaseRegOp =
+      MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
+
+  // Which register is Rt and which is Rt2 depends on the offset order.
+  MachineInstr *RtMI;
+  if (getLdStOffsetOp(*I).getImm() ==
+      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
+    RtMI = &*MergeMI;
+  else
+    RtMI = &*I;
+
+  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  // Change the scaled offset from small to large type.
+  if (IsScaled) {
+    assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
+    OffsetImm /= 2;
+  }
+
+  // Construct the new instruction.
+  DebugLoc DL = I->getDebugLoc();
+  MachineBasicBlock *MBB = I->getParent();
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
+            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
+            .addOperand(BaseRegOp)
+            .addImm(OffsetImm)
+            .setMemRefs(I->mergeMemRefsWith(*MergeMI));
+  (void)MIB;
+
+  DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(MergeMI->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  I->eraseFromParent();
+  MergeMI->eraseFromParent();
+  return NextI;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator Paired,
+                                      const LdStPairFlags &Flags) {
+  MachineBasicBlock::iterator NextI = I;
+  ++NextI;
+  // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way we merge will invalidate the iterator,
+  // and we don't need to scan the new instruction, as it's a pairwise
+  // instruction, which we're not considering for further action anyway.
+  if (NextI == Paired)
+    ++NextI;
+
+  int SExtIdx = Flags.getSExtIdx();
+  unsigned Opc =
+      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
+  bool IsUnscaled = TII->isUnscaledLdSt(Opc);
+  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+
+  bool MergeForward = Flags.getMergeForward();
+  // Insert our new paired instruction after whichever of the paired
+  // instructions MergeForward indicates.
+  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+  // Also based on MergeForward is from where we copy the base register operand
+  // so we get the flags compatible with the input code.
+  const MachineOperand &BaseRegOp =
+      MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
+
+  int Offset = getLdStOffsetOp(*I).getImm();
+  int PairedOffset = getLdStOffsetOp(*Paired).getImm();
+  bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
+  if (IsUnscaled != PairedIsUnscaled) {
+    // We're trying to pair instructions that differ in how they are scaled.  If
+    // I is scaled then scale the offset of Paired accordingly.  Otherwise, do
+    // the opposite (i.e., make Paired's offset unscaled).
+    int MemSize = getMemScale(*Paired);
+    if (PairedIsUnscaled) {
+      // If the unscaled offset isn't a multiple of the MemSize, we can't
+      // pair the operations together.
+      assert(!(PairedOffset % getMemScale(*Paired)) &&
+             "Offset should be a multiple of the stride!");
+      PairedOffset /= MemSize;
+    } else {
+      PairedOffset *= MemSize;
+    }
+  }
+
+  // Which register is Rt and which is Rt2 depends on the offset order.
+  MachineInstr *RtMI, *Rt2MI;
+  if (Offset == PairedOffset + OffsetStride) {
+    RtMI = &*Paired;
+    Rt2MI = &*I;
+    // Here we swapped the assumption made for SExtIdx.
+    // I.e., we turn ldp I, Paired into ldp Paired, I.
+    // Update the index accordingly.
+    if (SExtIdx != -1)
+      SExtIdx = (SExtIdx + 1) % 2;
+  } else {
+    RtMI = &*I;
+    Rt2MI = &*Paired;
+  }
+  int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
+  // Scale the immediate offset, if necessary.
+  if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
+    assert(!(OffsetImm % getMemScale(*RtMI)) &&
+           "Unscaled offset cannot be scaled.");
+    OffsetImm /= getMemScale(*RtMI);
+  }
+
+  // Construct the new instruction.
+  MachineInstrBuilder MIB;
+  DebugLoc DL = I->getDebugLoc();
+  MachineBasicBlock *MBB = I->getParent();
+  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
+            .addOperand(getLdStRegOp(*RtMI))
+            .addOperand(getLdStRegOp(*Rt2MI))
+            .addOperand(BaseRegOp)
+            .addImm(OffsetImm)
+            .setMemRefs(I->mergeMemRefsWith(*Paired));
+
+  (void)MIB;
+
+  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Paired->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  if (SExtIdx != -1) {
+    // Generate the sign extension for the proper result of the ldp.
+    // I.e., with X1, that would be:
+    // %W1<def> = KILL %W1, %X1<imp-def>
+    // %X1<def> = SBFMXri %X1<kill>, 0, 31
+    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
+    // Right now, DstMO has the extended register, since it comes from an
+    // extended opcode.
+    unsigned DstRegX = DstMO.getReg();
+    // Get the W variant of that register.
+    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
+    // Update the result of LDP to use the W instead of the X variant.
+    DstMO.setReg(DstRegW);
+    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+    // Make the machine verifier happy by providing a definition for
+    // the X register.
+    // Insert this definition right after the generated LDP, i.e., before
+    // InsertionPoint.
+    MachineInstrBuilder MIBKill =
+        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
+            .addReg(DstRegW)
+            .addReg(DstRegX, RegState::Define);
+    MIBKill->getOperand(2).setImplicit();
+    // Create the sign extension.
+    MachineInstrBuilder MIBSXTW =
+        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
+            .addReg(DstRegX)
+            .addImm(0)
+            .addImm(31);
+    (void)MIBSXTW;
+    DEBUG(dbgs() << "  Extend operand:\n    ");
+    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+  } else {
+    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  }
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  return NextI;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+                                          MachineBasicBlock::iterator StoreI) {
+  MachineBasicBlock::iterator NextI = LoadI;
+  ++NextI;
+
+  int LoadSize = getMemScale(*LoadI);
+  int StoreSize = getMemScale(*StoreI);
+  unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+  unsigned StRt = getLdStRegOp(*StoreI).getReg();
+  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+  assert((IsStoreXReg ||
+          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+         "Unexpected RegClass");
+
+  MachineInstr *BitExtMI;
+  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
+    // Remove the load, if the destination register of the loads is the same
+    // register for stored value.
+    if (StRt == LdRt && LoadSize == 8) {
+      StoreI->clearRegisterKills(StRt, TRI);
+      DEBUG(dbgs() << "Remove load instruction:\n    ");
+      DEBUG(LoadI->print(dbgs()));
+      DEBUG(dbgs() << "\n");
+      LoadI->eraseFromParent();
+      return NextI;
+    }
+    // Replace the load with a mov if the load and store are in the same size.
+    BitExtMI =
+        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+            .addReg(StRt)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  } else {
+    // FIXME: Currently we disable this transformation in big-endian targets as
+    // performance and correctness are verified only in little-endian.
+    if (!Subtarget->isLittleEndian())
+      return NextI;
+    bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
+    assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
+           "Unsupported ld/st match");
+    assert(LoadSize <= StoreSize && "Invalid load size");
+    int UnscaledLdOffset = IsUnscaled
+                               ? getLdStOffsetOp(*LoadI).getImm()
+                               : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
+    int UnscaledStOffset = IsUnscaled
+                               ? getLdStOffsetOp(*StoreI).getImm()
+                               : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
+    int Width = LoadSize * 8;
+    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+    int Imms = Immr + Width - 1;
+    unsigned DestReg = IsStoreXReg
+                           ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+                                                      &AArch64::GPR64RegClass)
+                           : LdRt;
+
+    assert((UnscaledLdOffset >= UnscaledStOffset &&
+            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+           "Invalid offset");
+
+    Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+    Imms = Immr + Width - 1;
+    if (UnscaledLdOffset == UnscaledStOffset) {
+      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+                                | ((Immr) << 6)               // immr
+                                | ((Imms) << 0)               // imms
+          ;
+
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(AndMaskEncoded);
+    } else {
+      BitExtMI =
+          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+                  DestReg)
+              .addReg(StRt)
+              .addImm(Immr)
+              .addImm(Imms);
+    }
+  }
+  StoreI->clearRegisterKills(StRt, TRI);
+
+  (void)BitExtMI;
+
+  DEBUG(dbgs() << "Promoting load by replacing :\n    ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(LoadI->print(dbgs()));
+  DEBUG(dbgs() << "  with instructions:\n    ");
+  DEBUG(StoreI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG((BitExtMI)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  LoadI->eraseFromParent();
+  return NextI;
+}
+
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
+                             BitVector &UsedRegs,
+                             const TargetRegisterInfo *TRI) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isRegMask())
+      ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isDef()) {
+      // WZR/XZR are not modified even when used as a destination register.
+      if (Reg != AArch64::WZR && Reg != AArch64::XZR)
+        for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+          ModifiedRegs.set(*AI);
+    } else {
+      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        UsedRegs.set(*AI);
+    }
+  }
+}
+
+static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
+  // Convert the byte-offset used by unscaled into an "element" offset used
+  // by the scaled pair load/store instructions.
+  if (IsUnscaled) {
+    // If the byte-offset isn't a multiple of the stride, there's no point
+    // trying to match it.
+    if (Offset % OffsetStride)
+      return false;
+    Offset /= OffsetStride;
+  }
+  return Offset <= 63 && Offset >= -64;
+}
+
+// Do alignment, specialized to power of 2 and for signed ints,
+// avoiding having to do a C-style cast from uint_64t to int when
+// using alignTo from include/llvm/Support/MathExtras.h.
+// FIXME: Move this function to include/MathExtras.h?
+static int alignTo(int Num, int PowOf2) {
+  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
+}
+
+static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
+                     const AArch64InstrInfo *TII) {
+  // One of the instructions must modify memory.
+  if (!MIa.mayStore() && !MIb.mayStore())
+    return false;
+
+  // Both instructions must be memory operations.
+  if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
+    return false;
+
+  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+}
+
+static bool mayAlias(MachineInstr &MIa,
+                     SmallVectorImpl<MachineInstr *> &MemInsns,
+                     const AArch64InstrInfo *TII) {
+  for (MachineInstr *MIb : MemInsns)
+    if (mayAlias(MIa, *MIb, TII))
+      return true;
+
+  return false;
+}
+
+bool AArch64LoadStoreOpt::findMatchingStore(
+    MachineBasicBlock::iterator I, unsigned Limit,
+    MachineBasicBlock::iterator &StoreI) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr &LoadMI = *I;
+  unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
+
+  // If the load is the first instruction in the block, there's obviously
+  // not any matching store.
+  if (MBBI == B)
+    return false;
+
+  // Track which registers have been modified and used between the first insn
+  // and the second insn.
+  ModifiedRegs.reset();
+  UsedRegs.reset();
+
+  unsigned Count = 0;
+  do {
+    --MBBI;
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If the load instruction reads directly from the address to which the
+    // store instruction writes and the stored value is not modified, we can
+    // promote the load. Since we do not handle stores with pre-/post-index,
+    // it's unnecessary to check if BaseReg is modified by the store itself.
+    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
+        BaseReg == getLdStBaseOp(MI).getReg() &&
+        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
+        !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+      StoreI = MBBI;
+      return true;
+    }
+
+    if (MI.isCall())
+      return false;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return false;
+
+    // If we encounter a store aliased with the load, return early.
+    if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+      return false;
+  } while (MBBI != B && Count < Limit);
+  return false;
+}
+
+// Returns true if FirstMI and MI are candidates for merging or pairing.
+// Otherwise, returns false.
+static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
+                                       LdStPairFlags &Flags,
+                                       const AArch64InstrInfo *TII) {
+  // If this is volatile or if pairing is suppressed, not a candidate.
+  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // We should have already checked FirstMI for pair suppression and volatility.
+  assert(!FirstMI.hasOrderedMemoryRef() &&
+         !TII->isLdStPairSuppressed(FirstMI) &&
+         "FirstMI shouldn't get here if either of these checks are true.");
+
+  unsigned OpcA = FirstMI.getOpcode();
+  unsigned OpcB = MI.getOpcode();
+
+  // Opcodes match: nothing more to check.
+  if (OpcA == OpcB)
+    return true;
+
+  // Try to match a sign-extended load/store with a zero-extended load/store.
+  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
+  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
+  assert(IsValidLdStrOpc &&
+         "Given Opc should be a Load or Store with an immediate");
+  // OpcA will be the first instruction in the pair.
+  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
+    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
+    return true;
+  }
+
+  // If the second instruction isn't even a mergable/pairable load/store, bail
+  // out.
+  if (!PairIsValidLdStrOpc)
+    return false;
+
+  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
+  // offsets.
+  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
+    return false;
+
+  // Try to match an unscaled load/store with a scaled load/store.
+  return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
+         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
+
+  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
+}
+
+/// Scan the instructions looking for a load/store that can be combined with the
+/// current instruction into a wider equivalent or a load/store pair.
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+                                      LdStPairFlags &Flags, unsigned Limit,
+                                      bool FindNarrowMerge) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr &FirstMI = *I;
+  ++MBBI;
+
+  bool MayLoad = FirstMI.mayLoad();
+  bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
+  unsigned Reg = getLdStRegOp(FirstMI).getReg();
+  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+  int Offset = getLdStOffsetOp(FirstMI).getImm();
+  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  ModifiedRegs.reset();
+  UsedRegs.reset();
+
+  // Remember any instructions that read/write memory between FirstMI and MI.
+  SmallVector<MachineInstr *, 4> MemInsns;
+
+  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    Flags.setSExtIdx(-1);
+    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
+        getLdStOffsetOp(MI).isImm()) {
+      assert(MI.mayLoadOrStore() && "Expected memory operation.");
+      // If we've found another instruction with the same opcode, check to see
+      // if the base and offset are compatible with our starting instruction.
+      // These instructions all have scaled immediate operands, so we just
+      // check for +1/-1. Make sure to check the new instruction offset is
+      // actually an immediate and not a symbolic reference destined for
+      // a relocation.
+      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
+      int MIOffset = getLdStOffsetOp(MI).getImm();
+      bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
+      if (IsUnscaled != MIIsUnscaled) {
+        // We're trying to pair instructions that differ in how they are scaled.
+        // If FirstMI is scaled then scale the offset of MI accordingly.
+        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
+        int MemSize = getMemScale(MI);
+        if (MIIsUnscaled) {
+          // If the unscaled offset isn't a multiple of the MemSize, we can't
+          // pair the operations together: bail and keep looking.
+          if (MIOffset % MemSize) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(&MI);
+            continue;
+          }
+          MIOffset /= MemSize;
+        } else {
+          MIOffset *= MemSize;
+        }
+      }
+
+      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+                                   (Offset + OffsetStride == MIOffset))) {
+        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+        if (FindNarrowMerge) {
+          // If the alignment requirements of the scaled wide load/store
+          // instruction can't express the offset of the scaled narrow input,
+          // bail and keep looking. For promotable zero stores, allow only when
+          // the stored value is the same (i.e., WZR).
+          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
+              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(&MI);
+            continue;
+          }
+        } else {
+          // Pairwise instructions have a 7-bit signed offset field. Single
+          // insns have a 12-bit unsigned offset field.  If the resultant
+          // immediate offset of merging these instructions is out of range for
+          // a pairwise instruction, bail and keep looking.
+          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(&MI);
+            continue;
+          }
+          // If the alignment requirements of the paired (scaled) instruction
+          // can't express the offset of the unscaled input, bail and keep
+          // looking.
+          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(&MI);
+            continue;
+          }
+        }
+        // If the destination register of the loads is the same register, bail
+        // and keep looking. A load-pair instruction with both destination
+        // registers the same is UNPREDICTABLE and will result in an exception.
+        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          MemInsns.push_back(&MI);
+          continue;
+        }
+
+        // If the Rt of the second instruction was not modified or used between
+        // the two instructions and none of the instructions between the second
+        // and first alias with the second, we can combine the second into the
+        // first.
+        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
+            !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
+            !mayAlias(MI, MemInsns, TII)) {
+          Flags.setMergeForward(false);
+          return MBBI;
+        }
+
+        // Likewise, if the Rt of the first instruction is not modified or used
+        // between the two instructions and none of the instructions between the
+        // first and the second alias with the first, we can combine the first
+        // into the second.
+        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
+            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
+            !mayAlias(FirstMI, MemInsns, TII)) {
+          Flags.setMergeForward(true);
+          return MBBI;
+        }
+        // Unable to combine these instructions due to interference in between.
+        // Keep looking.
+      }
+    }
+
+    // If the instruction wasn't a matching load or store.  Stop searching if we
+    // encounter a call instruction that might modify memory.
+    if (MI.isCall())
+      return E;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return E;
+
+    // Update list of instructions that read/write memory.
+    if (MI.mayLoadOrStore())
+      MemInsns.push_back(&MI);
+  }
+  return E;
+}
+
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
+                                     MachineBasicBlock::iterator Update,
+                                     bool IsPreIdx) {
+  assert((Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) &&
+         "Unexpected base register update instruction to merge!");
+  MachineBasicBlock::iterator NextI = I;
+  // Return the instruction following the merged instruction, which is
+  // the instruction following our unmerged load. Unless that's the add/sub
+  // instruction we're merging, in which case it's the one after that.
+  if (++NextI == Update)
+    ++NextI;
+
+  int Value = Update->getOperand(2).getImm();
+  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
+  if (Update->getOpcode() == AArch64::SUBXri)
+    Value = -Value;
+
+  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
+                             : getPostIndexedOpcode(I->getOpcode());
+  MachineInstrBuilder MIB;
+  if (!isPairedLdSt(*I)) {
+    // Non-paired instruction.
+    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+              .addOperand(getLdStRegOp(*Update))
+              .addOperand(getLdStRegOp(*I))
+              .addOperand(getLdStBaseOp(*I))
+              .addImm(Value)
+              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+  } else {
+    // Paired instruction.
+    int Scale = getMemScale(*I);
+    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+              .addOperand(getLdStRegOp(*Update))
+              .addOperand(getLdStRegOp(*I, 0))
+              .addOperand(getLdStRegOp(*I, 1))
+              .addOperand(getLdStBaseOp(*I))
+              .addImm(Value / Scale)
+              .setMemRefs(I->memoperands_begin(), I->memoperands_end());
+  }
+  (void)MIB;
+
+  if (IsPreIdx)
+    DEBUG(dbgs() << "Creating pre-indexed load/store.");
+  else
+    DEBUG(dbgs() << "Creating post-indexed load/store.");
+  DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Update->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+                                               MachineInstr &MI,
+                                               unsigned BaseReg, int Offset) {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::SUBXri:
+  case AArch64::ADDXri:
+    // Make sure it's a vanilla immediate operand, not a relocation or
+    // anything else we can't handle.
+    if (!MI.getOperand(2).isImm())
+      break;
+    // Watch out for 1 << 12 shifted value.
+    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
+      break;
+
+    // The update instruction source and destination register must be the
+    // same as the load/store base register.
+    if (MI.getOperand(0).getReg() != BaseReg ||
+        MI.getOperand(1).getReg() != BaseReg)
+      break;
+
+    bool IsPairedInsn = isPairedLdSt(MemMI);
+    int UpdateOffset = MI.getOperand(2).getImm();
+    if (MI.getOpcode() == AArch64::SUBXri)
+      UpdateOffset = -UpdateOffset;
+
+    // For non-paired load/store instructions, the immediate must fit in a
+    // signed 9-bit integer.
+    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
+      break;
+
+    // For paired load/store instructions, the immediate must be a multiple of
+    // the scaling factor.  The scaled offset must also fit into a signed 7-bit
+    // integer.
+    if (IsPairedInsn) {
+      int Scale = getMemScale(MemMI);
+      if (UpdateOffset % Scale != 0)
+        break;
+
+      int ScaledOffset = UpdateOffset / Scale;
+      if (ScaledOffset > 63 || ScaledOffset < -64)
+        break;
+    }
+
+    // If we have a non-zero Offset, we check that it matches the amount
+    // we're adding to the register.
+    if (!Offset || Offset == UpdateOffset)
+      return true;
+    break;
+  }
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
+    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+
+  // Scan forward looking for post-index opportunities.  Updating instructions
+  // can't be formed if the memory instruction doesn't have the offset we're
+  // looking for.
+  if (MIUnscaledOffset != UnscaledOffset)
+    return E;
+
+  // If the base register overlaps a destination register, we can't
+  // merge the update.
+  bool IsPairedInsn = isPairedLdSt(MemMI);
+  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+      return E;
+  }
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  ModifiedRegs.reset();
+  UsedRegs.reset();
+  ++MBBI;
+  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
+      return MBBI;
+
+    // Update the status of what the instruction clobbered and used.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is used or modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
+    MachineBasicBlock::iterator I, unsigned Limit) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
+  int Offset = getLdStOffsetOp(MemMI).getImm();
+
+  // If the load/store is the first instruction in the block, there's obviously
+  // not any matching update. Ditto if the memory offset isn't zero.
+  if (MBBI == B || Offset != 0)
+    return E;
+  // If the base register overlaps a destination register, we can't
+  // merge the update.
+  bool IsPairedInsn = isPairedLdSt(MemMI);
+  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
+    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
+    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+      return E;
+  }
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  ModifiedRegs.reset();
+  UsedRegs.reset();
+  unsigned Count = 0;
+  do {
+    --MBBI;
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
+      return MBBI;
+
+    // Update the status of what the instruction clobbered and used.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is used or modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
+      return E;
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  // If this is a volatile load, don't mess with it.
+  if (MI.hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm.
+  // FIXME: It is possible to extend it to handle reg+reg cases.
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Look backward up to LdStLimit instructions.
+  MachineBasicBlock::iterator StoreI;
+  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
+    ++NumLoadsFromStoresPromoted;
+    // Promote the load. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = promoteLoadFromStore(MBBI, StoreI);
+    return true;
+  }
+  return false;
+}
+
+// Merge adjacent zero stores into a wider store.
+bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+
+  if (!TII->isCandidateToMergeOrPair(MI))
+    return false;
+
+  // Look ahead up to LdStLimit instructions for a mergable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator MergeMI =
+      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
+  if (MergeMI != E) {
+    ++NumZeroStoresPromoted;
+
+    // Keeping the iterator straight is a pain, so we let the merge routine tell
+    // us what the next instruction is after it's done mucking about.
+    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
+    return true;
+  }
+  return false;
+}
+
+// Find loads and stores that can be merged into a single load or store pair
+// instruction.
+bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+
+  if (!TII->isCandidateToMergeOrPair(MI))
+    return false;
+
+  // Early exit if the offset is not possible to match. (6 bits of positive
+  // range, plus allow an extra one in case we find a later insn that matches
+  // with Offset-1)
+  bool IsUnscaled = TII->isUnscaledLdSt(MI);
+  int Offset = getLdStOffsetOp(MI).getImm();
+  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+    return false;
+
+  // Look ahead up to LdStLimit instructions for a pairable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Paired =
+      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
+  if (Paired != E) {
+    ++NumPairCreated;
+    if (TII->isUnscaledLdSt(MI))
+      ++NumUnscaledPairCreated;
+    // Keeping the iterator straight is a pain, so we let the merge routine tell
+    // us what the next instruction is after it's done mucking about.
+    MBBI = mergePairedInsns(MBBI, Paired, Flags);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+                                        bool EnableNarrowZeroStOpt) {
+  bool Modified = false;
+  // Four tranformations to do here:
+  // 1) Find loads that directly read from stores and promote them by
+  //    replacing with mov instructions. If the store is wider than the load,
+  //    the load will be replaced with a bitfield extract.
+  //      e.g.,
+  //        str w1, [x0, #4]
+  //        ldrh w2, [x0, #6]
+  //        ; becomes
+  //        str w1, [x0, #4]
+  //        lsr w2, w1, #16
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr &MI = *MBBI;
+    switch (MI.getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRBBui:
+    case AArch64::LDRHHui:
+    case AArch64::LDRWui:
+    case AArch64::LDRXui:
+    // Unscaled instructions.
+    case AArch64::LDURBBi:
+    case AArch64::LDURHHi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi: {
+      if (tryToPromoteLoadFromStore(MBBI)) {
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+    }
+  }
+  // 2) Merge adjacent zero stores into a wider store.
+  //      e.g.,
+  //        strh wzr, [x0]
+  //        strh wzr, [x0, #2]
+  //        ; becomes
+  //        str wzr, [x0]
+  //      e.g.,
+  //        str wzr, [x0]
+  //        str wzr, [x0, #4]
+  //        ; becomes
+  //        str xzr, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       EnableNarrowZeroStOpt && MBBI != E;) {
+    if (isPromotableZeroStoreInst(*MBBI)) {
+      if (tryToMergeZeroStInst(MBBI)) {
+        Modified = true;
+      } else
+        ++MBBI;
+    } else
+      ++MBBI;
+  }
+
+  // 3) Find loads and stores that can be merged into a single load or store
+  //    pair instruction.
+  //      e.g.,
+  //        ldr x0, [x2]
+  //        ldr x1, [x2, #8]
+  //        ; becomes
+  //        ldp x0, x1, [x2]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+  // 4) Find base register updates that can be merged into the load or store
+  //    as a base-reg writeback.
+  //      e.g.,
+  //        ldr x0, [x2]
+  //        add x2, x2, #4
+  //        ; becomes
+  //        ldr x0, [x2], #4
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr &MI = *MBBI;
+    // Do update merging. It's simpler to keep this separate from the above
+    // switchs, though not strictly necessary.
+    unsigned Opc = MI.getOpcode();
+    switch (Opc) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::STRHHui:
+    case AArch64::STRBBui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
+    case AArch64::LDRHHui:
+    case AArch64::LDRBBui:
+    // Unscaled instructions.
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi:
+    // Paired instructions.
+    case AArch64::LDPSi:
+    case AArch64::LDPSWi:
+    case AArch64::LDPDi:
+    case AArch64::LDPQi:
+    case AArch64::LDPWi:
+    case AArch64::LDPXi:
+    case AArch64::STPSi:
+    case AArch64::STPDi:
+    case AArch64::STPQi:
+    case AArch64::STPWi:
+    case AArch64::STPXi: {
+      // Make sure this is a reg+imm (as opposed to an address reloc).
+      if (!getLdStOffsetOp(MI).isImm()) {
+        ++MBBI;
+        break;
+      }
+      // Look forward to try to form a post-index instruction. For example,
+      // ldr x0, [x20]
+      // add x20, x20, #32
+      //   merged into:
+      // ldr x0, [x20], #32
+      MachineBasicBlock::iterator Update =
+          findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
+        Modified = true;
+        ++NumPostFolded;
+        break;
+      }
+      // Don't know how to handle pre/post-index versions, so move to the next
+      // instruction.
+      if (TII->isUnscaledLdSt(Opc)) {
+        ++MBBI;
+        break;
+      }
+
+      // Look back to try to find a pre-index instruction. For example,
+      // add x0, x0, #8
+      // ldr x1, [x0]
+      //   merged into:
+      // ldr x1, [x0, #8]!
+      Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+      // The immediate in the load/store is scaled by the size of the memory
+      // operation. The immediate in the add we're looking for,
+      // however, is not, so adjust here.
+      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+
+      // Look forward to try to find a post-index instruction. For example,
+      // ldr x1, [x0, #64]
+      // add x0, x0, #64
+      //   merged into:
+      // ldr x1, [x0, #64]!
+      Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+
+      // Nothing found. Just move to the next instruction.
+      ++MBBI;
+      break;
+    }
+    }
+  }
+
+  return Modified;
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  TRI = Subtarget->getRegisterInfo();
+
+  // Resize the modified and used register bitfield trackers.  We do this once
+  // per function and then clear the bitfield each time we optimize a load or
+  // store.
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+
+  bool Modified = false;
+  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
+  for (auto &MBB : Fn)
+    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
+
+  return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
+// stores near one another?  Note: The pre-RA instruction scheduler already has
+// hooks to try and schedule pairable loads/stores together to improve pairing
+// opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
+
+// FIXME: When pairing store instructions it's very possible for this pass to
+// hoist a store with a KILL marker above another use (without a KILL marker).
+// The resulting IR is invalid, but nothing uses the KILL markers after this
+// pass, so it's never caused a problem in practice.
+
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 000000000000..45083df7ab45
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,216 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCInstLower.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;
+
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer) {}
+
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+  if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_GOT on GV operand");
+  } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+  } else {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_PAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_PAGEOFF;
+  }
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+  return MCOperand::createExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+                                                    MCSymbol *Sym) const {
+  uint32_t RefFlags = 0;
+
+  if (MO.getTargetFlags() & AArch64II::MO_GOT)
+    RefFlags |= AArch64MCExpr::VK_GOT;
+  else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+    TLSModel::Model Model;
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      Model = Printer.TM.getTLSModel(GV);
+      if (!EnableAArch64ELFLocalDynamicTLSGeneration &&
+          Model == TLSModel::LocalDynamic)
+        Model = TLSModel::GeneralDynamic;
+
+    } else {
+      assert(MO.isSymbol() &&
+             StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+             "unexpected external TLS symbol");
+      // The general dynamic access sequence is used to get the
+      // address of _TLS_MODULE_BASE_.
+      Model = TLSModel::GeneralDynamic;
+    }
+    switch (Model) {
+    case TLSModel::InitialExec:
+      RefFlags |= AArch64MCExpr::VK_GOTTPREL;
+      break;
+    case TLSModel::LocalExec:
+      RefFlags |= AArch64MCExpr::VK_TPREL;
+      break;
+    case TLSModel::LocalDynamic:
+      RefFlags |= AArch64MCExpr::VK_DTPREL;
+      break;
+    case TLSModel::GeneralDynamic:
+      RefFlags |= AArch64MCExpr::VK_TLSDESC;
+      break;
+    }
+  } else {
+    // No modifier means this is a generic reference, classified as absolute for
+    // the cases where it matters (:abs_g0: etc).
+    RefFlags |= AArch64MCExpr::VK_ABS;
+  }
+
+  if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+    RefFlags |= AArch64MCExpr::VK_PAGE;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+           AArch64II::MO_PAGEOFF)
+    RefFlags |= AArch64MCExpr::VK_PAGEOFF;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3)
+    RefFlags |= AArch64MCExpr::VK_G3;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2)
+    RefFlags |= AArch64MCExpr::VK_G2;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1)
+    RefFlags |= AArch64MCExpr::VK_G1;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0)
+    RefFlags |= AArch64MCExpr::VK_G0;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_HI12)
+    RefFlags |= AArch64MCExpr::VK_HI12;
+
+  if (MO.getTargetFlags() & AArch64II::MO_NC)
+    RefFlags |= AArch64MCExpr::VK_NC;
+
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+
+  AArch64MCExpr::VariantKind RefKind;
+  RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
+  Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
+
+  return MCOperand::createExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                                 MCSymbol *Sym) const {
+  if (Printer.TM.getTargetTriple().isOSDarwin())
+    return lowerSymbolOperandDarwin(MO, Sym);
+
+  assert(Printer.TM.getTargetTriple().isOSBinFormatELF() &&
+         "Expect Darwin or ELF target");
+  return lowerSymbolOperandELF(MO, Sym);
+}
+
+bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+                                      MCOperand &MCOp) const {
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return false;
+    MCOp = MCOperand::createReg(MO.getReg());
+    break;
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+    break;
+  case MachineOperand::MO_MCSymbol:
+    MCOp = LowerSymbolOperand(MO, MO.getMCSymbol());
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = LowerSymbolOperand(
+        MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  }
+  return true;
+}
+
+void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &MO : MI->operands()) {
+    MCOperand MCOp;
+    if (lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
new file mode 100644
index 000000000000..1e29b80c2d62
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -0,0 +1,52 @@
+//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCAsmInfo;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+/// AArch64MCInstLower - This class is used to lower an MachineInstr
+/// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+  Triple TargetTriple;
+
+public:
+  AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer);
+
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                     MCSymbol *Sym) const;
+  MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
+                                  MCSymbol *Sym) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
new file mode 100644
index 000000000000..ca2860afe13d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -0,0 +1,198 @@
+//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AArch64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+
+namespace llvm {
+
+/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private AArch64-specific information for each MachineFunction.
+class AArch64FunctionInfo final : public MachineFunctionInfo {
+
+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea;
+
+  /// The number of bytes to restore to deallocate space for incoming
+  /// arguments. Canonically 0 in the C calling convention, but non-zero when
+  /// callee is expected to pop the args.
+  unsigned ArgumentStackToRestore;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// determineCalleeSaves().
+  bool HasStackFrame;
+
+  /// \brief Amount of stack frame size, not including callee-saved registers.
+  unsigned LocalStackSize;
+
+  /// \brief Amount of stack frame size used for saving callee-saved registers.
+  unsigned CalleeSavedStackSize;
+
+  /// \brief Number of TLS accesses using the special (combinable)
+  /// _TLS_MODULE_BASE_ symbol.
+  unsigned NumLocalDynamicTLSAccesses;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed on the
+  /// stack.
+  int VarArgsStackIndex;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// general purpose registers.
+  int VarArgsGPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in general purpose
+  /// registers.
+  unsigned VarArgsGPRSize;
+
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// floating-point registers.
+  int VarArgsFPRIndex;
+
+  /// \brief Size of the varargs area for arguments passed in floating-point
+  /// registers.
+  unsigned VarArgsFPRSize;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies.
+  bool IsSplitCSR;
+
+  /// True when the stack gets realigned dynamically because the size of stack
+  /// frame is unknown at compile time. e.g., in case of VLAs.
+  bool StackRealigned;
+
+  /// True when the callee-save stack area has unused gaps that may be used for
+  /// other stack allocations.
+  bool CalleeSaveStackHasFreeSpace;
+
+public:
+  AArch64FunctionInfo()
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveStackHasFreeSpace(false) {}
+
+  explicit AArch64FunctionInfo(MachineFunction &MF)
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveStackHasFreeSpace(false) {
+    (void)MF;
+  }
+
+  unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+  void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
+
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned bytes) {
+    ArgumentStackToRestore = bytes;
+  }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool isStackRealigned() const { return StackRealigned; }
+  void setStackRealigned(bool s) { StackRealigned = s; }
+
+  bool hasCalleeSaveStackFreeSpace() const {
+    return CalleeSaveStackHasFreeSpace;
+  }
+  void setCalleeSaveStackHasFreeSpace(bool s) {
+    CalleeSaveStackHasFreeSpace = s;
+  }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+  unsigned getLocalStackSize() const { return LocalStackSize; }
+
+  void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
+  unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+  unsigned getNumLocalDynamicTLSAccesses() const {
+    return NumLocalDynamicTLSAccesses;
+  }
+
+  int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+  void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
+
+  int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+  void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
+
+  unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+  void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
+
+  int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+  void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
+
+  unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+  void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
+
+  typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+  const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+  // Shortcuts for LOH related types.
+  class MILOHDirective {
+    MCLOHType Kind;
+
+    /// Arguments of this directive. Order matters.
+    SmallVector<const MachineInstr *, 3> Args;
+
+  public:
+    typedef ArrayRef<const MachineInstr *> LOHArgs;
+
+    MILOHDirective(MCLOHType Kind, LOHArgs Args)
+        : Kind(Kind), Args(Args.begin(), Args.end()) {
+      assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+    }
+
+    MCLOHType getKind() const { return Kind; }
+    LOHArgs getArgs() const { return Args; }
+  };
+
+  typedef MILOHDirective::LOHArgs MILOHArgs;
+  typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+  const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+  /// Add a LOH directive of this @p Kind and this @p Args.
+  void addLOHDirective(MCLOHType Kind, MILOHArgs Args) {
+    LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+    LOHRelated.insert(Args.begin(), Args.end());
+  }
+
+private:
+  // Hold the lists of LOHs.
+  MILOHContainer LOHContainerSet;
+  SetOfInstructions LOHRelated;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644
index 000000000000..038162c6f54a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -0,0 +1,383 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG
+bool isFPReg(unsigned reg) {
+  return AArch64::FPR32RegClass.contains(reg) ||
+         AArch64::FPR64RegClass.contains(reg) ||
+         AArch64::FPR128RegClass.contains(reg);
+}
+#endif
+
+bool isOdd(unsigned reg) {
+  switch (reg) {
+  default:
+    llvm_unreachable("Register is not from the expected class !");
+  case AArch64::S1:
+  case AArch64::S3:
+  case AArch64::S5:
+  case AArch64::S7:
+  case AArch64::S9:
+  case AArch64::S11:
+  case AArch64::S13:
+  case AArch64::S15:
+  case AArch64::S17:
+  case AArch64::S19:
+  case AArch64::S21:
+  case AArch64::S23:
+  case AArch64::S25:
+  case AArch64::S27:
+  case AArch64::S29:
+  case AArch64::S31:
+  case AArch64::D1:
+  case AArch64::D3:
+  case AArch64::D5:
+  case AArch64::D7:
+  case AArch64::D9:
+  case AArch64::D11:
+  case AArch64::D13:
+  case AArch64::D15:
+  case AArch64::D17:
+  case AArch64::D19:
+  case AArch64::D21:
+  case AArch64::D23:
+  case AArch64::D25:
+  case AArch64::D27:
+  case AArch64::D29:
+  case AArch64::D31:
+  case AArch64::Q1:
+  case AArch64::Q3:
+  case AArch64::Q5:
+  case AArch64::Q7:
+  case AArch64::Q9:
+  case AArch64::Q11:
+  case AArch64::Q13:
+  case AArch64::Q15:
+  case AArch64::Q17:
+  case AArch64::Q19:
+  case AArch64::Q21:
+  case AArch64::Q23:
+  case AArch64::Q25:
+  case AArch64::Q27:
+  case AArch64::Q29:
+  case AArch64::Q31:
+    return true;
+  case AArch64::S0:
+  case AArch64::S2:
+  case AArch64::S4:
+  case AArch64::S6:
+  case AArch64::S8:
+  case AArch64::S10:
+  case AArch64::S12:
+  case AArch64::S14:
+  case AArch64::S16:
+  case AArch64::S18:
+  case AArch64::S20:
+  case AArch64::S22:
+  case AArch64::S24:
+  case AArch64::S26:
+  case AArch64::S28:
+  case AArch64::S30:
+  case AArch64::D0:
+  case AArch64::D2:
+  case AArch64::D4:
+  case AArch64::D6:
+  case AArch64::D8:
+  case AArch64::D10:
+  case AArch64::D12:
+  case AArch64::D14:
+  case AArch64::D16:
+  case AArch64::D18:
+  case AArch64::D20:
+  case AArch64::D22:
+  case AArch64::D24:
+  case AArch64::D26:
+  case AArch64::D28:
+  case AArch64::D30:
+  case AArch64::Q0:
+  case AArch64::Q2:
+  case AArch64::Q4:
+  case AArch64::Q6:
+  case AArch64::Q8:
+  case AArch64::Q10:
+  case AArch64::Q12:
+  case AArch64::Q14:
+  case AArch64::Q16:
+  case AArch64::Q18:
+  case AArch64::Q20:
+  case AArch64::Q22:
+  case AArch64::Q24:
+  case AArch64::Q26:
+  case AArch64::Q28:
+  case AArch64::Q30:
+    return false;
+
+  }
+}
+
+bool haveSameParity(unsigned reg1, unsigned reg2) {
+  assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+  assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+  return isOdd(reg1) == isOdd(reg2);
+}
+
+}
+
+bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
+                                                 unsigned Ra) {
+  if (Rd == Ra)
+    return false;
+
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
+    DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+          << '\n');
+    DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+          << '\n');
+    return false;
+  }
+
+  PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+  PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra);
+
+  const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+    &G.getNodeMetadata(node1).getAllowedRegs();
+  const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed =
+    &G.getNodeMetadata(node2).getAllowedRegs();
+
+  PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+
+  // The edge does not exist. Create one with the appropriate interference
+  // costs.
+  if (edge == G.invalidEdgeId()) {
+    const LiveInterval &ld = LIs.getInterval(Rd);
+    const LiveInterval &la = LIs.getInterval(Ra);
+    bool livesOverlap = ld.overlaps(la);
+
+    PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1,
+                                 vRaAllowed->size() + 1, 0);
+    for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+      unsigned pRd = (*vRdAllowed)[i];
+      for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+        unsigned pRa = (*vRaAllowed)[j];
+        if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+          costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+        else
+          costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+      }
+    }
+    G.addEdge(node1, node2, std::move(costs));
+    return true;
+  }
+
+  if (G.getEdgeNode1Id(edge) == node2) {
+    std::swap(node1, node2);
+    std::swap(vRdAllowed, vRaAllowed);
+  }
+
+  // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass))
+  PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge));
+  for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+    unsigned pRd = (*vRdAllowed)[i];
+
+    // Get the maximum cost (excluding unallocatable reg) for same parity
+    // registers
+    PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+    for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (haveSameParity(pRd, pRa))
+        if (costs[i + 1][j + 1] !=
+                std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+            costs[i + 1][j + 1] > sameParityMax)
+          sameParityMax = costs[i + 1][j + 1];
+    }
+
+    // Ensure all registers with a different parity have a higher cost
+    // than sameParityMax
+    for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (!haveSameParity(pRd, pRa))
+        if (sameParityMax > costs[i + 1][j + 1])
+          costs[i + 1][j + 1] = sameParityMax + 1.0;
+    }
+  }
+  G.updateEdgeCosts(edge, std::move(costs));
+
+  return true;
+}
+
+void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
+                                                 unsigned Ra) {
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  // Do some Chain management
+  if (Chains.count(Ra)) {
+    if (Rd != Ra) {
+      DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+                   << PrintReg(Rd, TRI) << '\n';);
+      Chains.remove(Ra);
+      Chains.insert(Rd);
+    }
+  } else {
+    DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+                 << '\n';);
+    Chains.insert(Rd);
+  }
+
+  PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+
+  const LiveInterval &ld = LIs.getInterval(Rd);
+  for (auto r : Chains) {
+    // Skip self
+    if (r == Rd)
+      continue;
+
+    const LiveInterval &lr = LIs.getInterval(r);
+    if (ld.overlaps(lr)) {
+      const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+        &G.getNodeMetadata(node1).getAllowedRegs();
+
+      PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r);
+      const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed =
+        &G.getNodeMetadata(node2).getAllowedRegs();
+
+      PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+      assert(edge != G.invalidEdgeId() &&
+             "PBQP error ! The edge should exist !");
+
+      DEBUG(dbgs() << "Refining constraint !\n";);
+
+      if (G.getEdgeNode1Id(edge) == node2) {
+        std::swap(node1, node2);
+        std::swap(vRdAllowed, vRrAllowed);
+      }
+
+      // Enforce that cost is higher with all other Chains of the same parity
+      PBQP::Matrix costs(G.getEdgeCosts(edge));
+      for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+        unsigned pRd = (*vRdAllowed)[i];
+
+        // Get the maximum cost (excluding unallocatable reg) for all other
+        // parity registers
+        PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+        for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (!haveSameParity(pRd, pRa))
+            if (costs[i + 1][j + 1] !=
+                    std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+                costs[i + 1][j + 1] > sameParityMax)
+              sameParityMax = costs[i + 1][j + 1];
+        }
+
+        // Ensure all registers with same parity have a higher cost
+        // than sameParityMax
+        for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (haveSameParity(pRd, pRa))
+            if (sameParityMax > costs[i + 1][j + 1])
+              costs[i + 1][j + 1] = sameParityMax + 1.0;
+        }
+      }
+      G.updateEdgeCosts(edge, std::move(costs));
+    }
+  }
+}
+
+static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
+                                const MachineInstr &MI) {
+  const LiveInterval &LI = LIs.getInterval(reg);
+  SlotIndex SI = LIs.getInstructionIndex(MI);
+  return LI.expiredAt(SI);
+}
+
+void A57ChainingConstraint::apply(PBQPRAGraph &G) {
+  const MachineFunction &MF = G.getMetadata().MF;
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  TRI = MF.getSubtarget().getRegisterInfo();
+  DEBUG(MF.dump());
+
+  for (const auto &MBB: MF) {
+    Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
+
+    for (const auto &MI: MBB) {
+
+      // Forget Chains which have expired
+      for (auto r : Chains) {
+        SmallVector<unsigned, 8> toDel;
+        if(regJustKilledBefore(LIs, r, MI)) {
+          DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+                MI.print(dbgs()););
+          toDel.push_back(r);
+        }
+
+        while (!toDel.empty()) {
+          Chains.remove(toDel.back());
+          toDel.pop_back();
+        }
+      }
+
+      switch (MI.getOpcode()) {
+      case AArch64::FMSUBSrrr:
+      case AArch64::FMADDSrrr:
+      case AArch64::FNMSUBSrrr:
+      case AArch64::FNMADDSrrr:
+      case AArch64::FMSUBDrrr:
+      case AArch64::FMADDDrrr:
+      case AArch64::FNMSUBDrrr:
+      case AArch64::FNMADDDrrr: {
+        unsigned Rd = MI.getOperand(0).getReg();
+        unsigned Ra = MI.getOperand(3).getReg();
+
+        if (addIntraChainConstraint(G, Rd, Ra))
+          addInterChainConstraint(G, Rd, Ra);
+        break;
+      }
+
+      case AArch64::FMLAv2f32:
+      case AArch64::FMLSv2f32: {
+        unsigned Rd = MI.getOperand(0).getReg();
+        addInterChainConstraint(G, Rd, Rd);
+        break;
+      }
+
+      default:
+        break;
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
new file mode 100644
index 000000000000..4f656f94ea12
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -0,0 +1,38 @@
+//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+
+namespace llvm {
+
+/// Add the accumulator chaining constraint to a PBQP graph
+class A57ChainingConstraint : public PBQPRAConstraint {
+public:
+  // Add A57 specific constraints to the PBQP graph.
+  void apply(PBQPRAGraph &G) override;
+
+private:
+  SmallSetVector<unsigned, 32> Chains;
+  const TargetRegisterInfo *TRI;
+
+  // Add the accumulator chaining constraint, inside the chain, i.e. so that
+  // parity(Rd) == parity(Ra).
+  // \return true if a constraint was added
+  bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+  // Add constraints between existing chains
+  void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+};
+}
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
new file mode 100644
index 000000000000..9e9eec48c555
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
+  0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
new file mode 100644
index 000000000000..8693f76d7c32
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -0,0 +1,566 @@
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vector (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+                            cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+//                       AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constant into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+///   41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+///   31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+///   25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+///   21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint8x16x4_t LoadStatic(void) {
+///   uint8x16x4_t ret;
+///   ret.val[0] = vld1q_u16(TableA +  0);
+///   ret.val[1] = vld1q_u16(TableA +  8);
+///   ret.val[2] = vld1q_u16(TableA + 16);
+///   ret.val[3] = vld1q_u16(TableA + 24);
+///   return ret;
+/// }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector the cheapest way to create them is to load them
+/// for the memory.
+///
+/// Therefore the final assembly final has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+  struct PromotedConstant {
+    bool ShouldConvert = false;
+    GlobalVariable *GV = nullptr;
+  };
+  typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy;
+
+  struct UpdateRecord {
+    Constant *C;
+    Instruction *User;
+    unsigned Op;
+
+    UpdateRecord(Constant *C, Instruction *User, unsigned Op)
+        : C(C), User(User), Op(Op) {}
+  };
+
+  static char ID;
+  AArch64PromoteConstant() : ModulePass(ID) {
+    initializeAArch64PromoteConstantPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "AArch64 Promote Constant"; }
+
+  /// Iterate over the functions and promote the interesting constants into
+  /// global variables with module scope.
+  bool runOnModule(Module &M) override {
+    DEBUG(dbgs() << getPassName() << '\n');
+    if (skipModule(M))
+      return false;
+    bool Changed = false;
+    PromotionCacheTy PromotionCache;
+    for (auto &MF : M) {
+      Changed |= runOnFunction(MF, PromotionCache);
+    }
+    return Changed;
+  }
+
+private:
+  /// Look for interesting constants used within the given function.
+  /// Promote them into global variables, load these global variables within
+  /// the related function, so that the number of inserted load is minimal.
+  bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache);
+
+  // This transformation requires dominator info
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  /// Type to store a list of Uses.
+  typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses;
+  /// Map an insertion point to all the uses it dominates.
+  typedef DenseMap<Instruction *, Uses> InsertionPoints;
+
+  /// Find the closest point that dominates the given Use.
+  Instruction *findInsertionPoint(Instruction &User, unsigned OpNo);
+
+  /// Check if the given insertion point is dominated by an existing
+  /// insertion point.
+  /// If true, the given use is added to the list of dominated uses for
+  /// the related existing point.
+  /// \param NewPt the insertion point to be checked
+  /// \param User the user of the constant
+  /// \param OpNo the operand number of the use
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instruction in InsertPts belong to the same function
+  /// \return true if one of the insertion point in InsertPts dominates NewPt,
+  ///         false otherwise
+  bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo,
+                   InsertionPoints &InsertPts);
+
+  /// Check if the given insertion point can be merged with an existing
+  /// insertion point in a common dominator.
+  /// If true, the given use is added to the list of the created insertion
+  /// point.
+  /// \param NewPt the insertion point to be checked
+  /// \param User the user of the constant
+  /// \param OpNo the operand number of the use
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instruction in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if it exists an insertion point in InsertPts that could
+  ///         have been merged with NewPt in a common dominator,
+  ///         false otherwise
+  bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo,
+                   InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominates all the interesting
+  /// uses of value.
+  /// Insertion points are group per function and each insertion point
+  /// contains a list of all the uses it dominates within the related function
+  /// \param User the user of the constant
+  /// \param OpNo the operand number of the constant
+  /// \param[out] InsertPts output storage of the analysis
+  void computeInsertionPoint(Instruction *User, unsigned OpNo,
+                             InsertionPoints &InsertPts);
+
+  /// Insert a definition of a new global variable at each point contained in
+  /// InsPtsPerFunc and update the related uses (also contained in
+  /// InsPtsPerFunc).
+  void insertDefinitions(Function &F, GlobalVariable &GV,
+                         InsertionPoints &InsertPts);
+
+  /// Do the constant promotion indicated by the Updates records, keeping track
+  /// of globals in PromotionCache.
+  void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+                        PromotionCacheTy &PromotionCache);
+
+  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+  /// Append Use to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt,
+                                             Instruction *User, unsigned OpNo,
+                                             InsertionPoints::iterator &IPI,
+                                             InsertionPoints &InsertPts) {
+    // Record the dominated use.
+    IPI->second.emplace_back(User, OpNo);
+    // Transfer the dominated uses of IPI to NewPt
+    // Inserting into the DenseMap may invalidate existing iterator.
+    // Keep a copy of the key to find the iterator to erase.  Keep a copy of the
+    // value so that we don't have to dereference IPI->second.
+    Instruction *OldInstr = IPI->first;
+    Uses OldUses = std::move(IPI->second);
+    InsertPts[NewPt] = std::move(OldUses);
+    // Erase IPI.
+    InsertPts.erase(OldInstr);
+  }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+  if (CstTy->isVectorTy())
+    return true;
+  if (CstTy->isStructTy()) {
+    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+         EltIdx < EndEltIdx; ++EltIdx)
+      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+        return true;
+  } else if (CstTy->isArrayTy())
+    return isConstantUsingVectorTy(CstTy->getArrayElementType());
+  return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+                             unsigned OpIdx) {
+  // shufflevector instruction expects a const for the mask argument, i.e., the
+  // third argument. Do not promote this use in that case.
+  if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+    return false;
+
+  if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const LoadInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const StoreInst>(Instr) && OpIdx > 1)
+    return false;
+
+  // Index must be constant.
+  if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Personality function and filters must be constant.
+  // Give up on that instruction.
+  if (isa<const LandingPadInst>(Instr))
+    return false;
+
+  // Switch instruction expects constants to compare to.
+  if (isa<const SwitchInst>(Instr))
+    return false;
+
+  // Expected address must be a constant.
+  if (isa<const IndirectBrInst>(Instr))
+    return false;
+
+  // Do not mess with intrinsics.
+  if (isa<const IntrinsicInst>(Instr))
+    return false;
+
+  // Do not mess with inline asm.
+  const CallInst *CI = dyn_cast<const CallInst>(Instr);
+  return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector type to keep the existing
+/// behavior. Otherwise, we should push here all the check of the lowering of
+/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
+/// constant via global merge and the fact that the same constant is stored
+/// only once with this method (versus, as many function that uses the constant
+/// for the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvertImpl(const Constant *Cst) {
+  if (isa<const UndefValue>(Cst))
+    return false;
+
+  // FIXME: In some cases, it may be interesting to promote in memory
+  // a zero initialized constant.
+  // E.g., when the type of Cst require more instructions than the
+  // adrp/add/load sequence or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the constant
+  // when it was a bad idea.
+  if (Cst->isZeroValue())
+    return false;
+
+  if (Stress)
+    return true;
+
+  // FIXME: see function \todo
+  if (Cst->getType()->isVectorTy())
+    return false;
+  return isConstantUsingVectorTy(Cst->getType());
+}
+
+static bool
+shouldConvert(Constant &C,
+              AArch64PromoteConstant::PromotionCacheTy &PromotionCache) {
+  auto Converted = PromotionCache.insert(
+      std::make_pair(&C, AArch64PromoteConstant::PromotedConstant()));
+  if (Converted.second)
+    Converted.first->second.ShouldConvert = shouldConvertImpl(&C);
+  return Converted.first->second.ShouldConvert;
+}
+
+Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User,
+                                                        unsigned OpNo) {
+  // If this user is a phi, the insertion point is in the related
+  // incoming basic block.
+  if (PHINode *PhiInst = dyn_cast<PHINode>(&User))
+    return PhiInst->getIncomingBlock(OpNo)->getTerminator();
+
+  return &User;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User,
+                                         unsigned OpNo,
+                                         InsertionPoints &InsertPts) {
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
+        // the result is defined on the edge.
+        // Here we are testing the insertion point, not the definition.
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
+      DEBUG(dbgs() << "Insertion point dominated by:\n");
+      DEBUG(IPI.first->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      IPI.second.emplace_back(User, OpNo);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User,
+                                         unsigned OpNo,
+                                         InsertionPoints &InsertPts) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+  BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion point and check if one is dominated by
+  // NewPt and thus useless or can be combined with NewPt into a common
+  // dominator.
+  for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                 EndIPI = InsertPts.end();
+       IPI != EndIPI; ++IPI) {
+    BasicBlock *CurBB = IPI->first->getParent();
+    if (NewBB == CurBB) {
+      // Instructions are in the same block.
+      // By construction, NewPt is dominating the other.
+      // Indeed, isDominated returned false with the exact same arguments.
+      DEBUG(dbgs() << "Merge insertion point with:\n");
+      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(dbgs() << "\nat considered insertion point.\n");
+      appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
+      return true;
+    }
+
+    // Look for a common dominator
+    BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+    // If none exists, we cannot merge these two points.
+    if (!CommonDominator)
+      continue;
+
+    if (CommonDominator != NewBB) {
+      // By construction, the CommonDominator cannot be CurBB.
+      assert(CommonDominator != CurBB &&
+             "Instruction has not been rejected during isDominated check!");
+      // Take the last instruction of the CommonDominator as insertion point
+      NewPt = CommonDominator->getTerminator();
+    }
+    // else, CommonDominator is the block of NewBB, hence NewBB is the last
+    // possible insertion point in that block.
+    DEBUG(dbgs() << "Merge insertion point with:\n");
+    DEBUG(IPI->first->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    DEBUG(NewPt->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts);
+    return true;
+  }
+  return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoint(
+    Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) {
+  DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n");
+  DEBUG(User->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  Instruction *InsertionPoint = findInsertionPoint(*User, OpNo);
+
+  DEBUG(dbgs() << "Considered insertion point:\n");
+  DEBUG(InsertionPoint->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  if (isDominated(InsertionPoint, User, OpNo, InsertPts))
+    return;
+  // This insertion point is useful, check if we can merge some insertion
+  // point in a common dominator or if NewPt dominates an existing one.
+  if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts))
+    return;
+
+  DEBUG(dbgs() << "Keep considered insertion point\n");
+
+  // It is definitely useful by its own
+  InsertPts[InsertionPoint].emplace_back(User, OpNo);
+}
+
+static void ensurePromotedGV(Function &F, Constant &C,
+                             AArch64PromoteConstant::PromotedConstant &PC) {
+  assert(PC.ShouldConvert &&
+         "Expected that we should convert this to a global");
+  if (PC.GV)
+    return;
+  PC.GV = new GlobalVariable(
+      *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr,
+      "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+  PC.GV->setInitializer(&C);
+  DEBUG(dbgs() << "Global replacement: ");
+  DEBUG(PC.GV->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+  ++NumPromoted;
+}
+
+void AArch64PromoteConstant::insertDefinitions(Function &F,
+                                               GlobalVariable &PromotedGV,
+                                               InsertionPoints &InsertPts) {
+#ifndef NDEBUG
+  // Do more checking for debug purposes.
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+#endif
+  assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+  for (const auto &IPI : InsertPts) {
+    // Create the load of the global variable.
+    IRBuilder<> Builder(IPI.first);
+    LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV);
+    DEBUG(dbgs() << "**********\n");
+    DEBUG(dbgs() << "New def: ");
+    DEBUG(LoadedCst->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    // Update the dominated uses.
+    for (auto Use : IPI.second) {
+#ifndef NDEBUG
+      assert(DT.dominates(LoadedCst,
+                          findInsertionPoint(*Use.first, Use.second)) &&
+             "Inserted definition does not dominate all its uses!");
+#endif
+      DEBUG({
+            dbgs() << "Use to update " << Use.second << ":";
+            Use.first->print(dbgs());
+            dbgs() << '\n';
+            });
+      Use.first->setOperand(Use.second, LoadedCst);
+      ++NumPromotedUses;
+    }
+  }
+}
+
+void AArch64PromoteConstant::promoteConstants(
+    Function &F, SmallVectorImpl<UpdateRecord> &Updates,
+    PromotionCacheTy &PromotionCache) {
+  // Promote the constants.
+  for (auto U = Updates.begin(), E = Updates.end(); U != E;) {
+    DEBUG(dbgs() << "** Compute insertion points **\n");
+    auto First = U;
+    Constant *C = First->C;
+    InsertionPoints InsertPts;
+    do {
+      computeInsertionPoint(U->User, U->Op, InsertPts);
+    } while (++U != E && U->C == C);
+
+    auto &Promotion = PromotionCache[C];
+    ensurePromotedGV(F, *C, Promotion);
+    insertDefinitions(F, *Promotion.GV, InsertPts);
+  }
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F,
+                                           PromotionCacheTy &PromotionCache) {
+  // Look for instructions using constant vector. Promote that constant to a
+  // global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
+  SmallVector<UpdateRecord, 64> Updates;
+  for (Instruction &I : instructions(&F)) {
+    // Traverse the operand, looking for constant vectors. Replace them by a
+    // load of a global variable of constant vector type.
+    for (Use &U : I.operands()) {
+      Constant *Cst = dyn_cast<Constant>(U);
+      // There is no point in promoting global values as they are already
+      // global. Do not promote constant expressions either, as they may
+      // require some code expansion.
+      if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+        continue;
+
+      // Check if this constant is worth promoting.
+      if (!shouldConvert(*Cst, PromotionCache))
+        continue;
+
+      // Check if this use should be promoted.
+      unsigned OpNo = &U - I.op_begin();
+      if (!shouldConvertUse(Cst, &I, OpNo))
+        continue;
+
+      Updates.emplace_back(Cst, &I, OpNo);
+    }
+  }
+
+  if (Updates.empty())
+    return false;
+
+  promoteConstants(F, Updates, PromotionCache);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
new file mode 100644
index 000000000000..8f45e6a80a36
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -0,0 +1,179 @@
+//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass removes unnecessary zero copies in BBs that are targets of
+// cbz/cbnz instructions. For instance, the copy instruction in the code below
+// can be removed because the CBZW jumps to BB#2 when W0 is zero.
+//  BB#1:
+//    CBZW %W0, <BB#2>
+//  BB#2:
+//    %W0 = COPY %WZR
+// This pass should be run after register allocation.
+//
+// FIXME: This should be extended to handle any constant other than zero. E.g.,
+//   cmp w0, #1
+//     b.eq .BB1
+//   BB1:
+//     mov w0, #1
+//
+// FIXME: This could also be extended to check the whole dominance subtree below
+// the comparison if the compile time regression is acceptable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-copyelim"
+
+STATISTIC(NumCopiesRemoved, "Number of copies removed.");
+
+namespace {
+class AArch64RedundantCopyElimination : public MachineFunctionPass {
+  const MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {
+    initializeAArch64RedundantCopyEliminationPass(
+        *PassRegistry::getPassRegistry());
+  }
+  bool optimizeCopy(MachineBasicBlock *MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+  StringRef getPassName() const override {
+    return "AArch64 Redundant Copy Elimination";
+  }
+};
+char AArch64RedundantCopyElimination::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
+                "AArch64 redundant copy elimination pass", false, false)
+
+static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
+  unsigned Opc = MI.getOpcode();
+  // Check if the current basic block is the target block to which the
+  // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
+  if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+      MBB == MI.getOperand(1).getMBB())
+    return true;
+  else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+           MBB != MI.getOperand(1).getMBB())
+    return true;
+
+  return false;
+}
+
+bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
+  // Check if the current basic block has a single predecessor.
+  if (MBB->pred_size() != 1)
+    return false;
+
+  MachineBasicBlock *PredMBB = *MBB->pred_begin();
+  MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
+  if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+    return false;
+
+  ++CompBr;
+  do {
+    --CompBr;
+    if (guaranteesZeroRegInBlock(*CompBr, MBB))
+      break;
+  } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+
+  // We've not found a CBZ/CBNZ, time to bail out.
+  if (!guaranteesZeroRegInBlock(*CompBr, MBB))
+    return false;
+
+  unsigned TargetReg = CompBr->getOperand(0).getReg();
+  if (!TargetReg)
+    return false;
+  assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
+         "Expect physical register");
+
+  // Remember all registers aliasing with TargetReg.
+  SmallSetVector<unsigned, 8> TargetRegs;
+  for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
+    TargetRegs.insert(*AI);
+
+  bool Changed = false;
+  MachineBasicBlock::iterator LastChange = MBB->begin();
+  unsigned SmallestDef = TargetReg;
+  // Remove redundant Copy instructions unless TargetReg is modified.
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr *MI = &*I;
+    ++I;
+    if (MI->isCopy() && MI->getOperand(0).isReg() &&
+        MI->getOperand(1).isReg()) {
+
+      unsigned DefReg = MI->getOperand(0).getReg();
+      unsigned SrcReg = MI->getOperand(1).getReg();
+
+      if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
+          !MRI->isReserved(DefReg) &&
+          (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
+        DEBUG(dbgs() << "Remove redundant Copy : ");
+        DEBUG((MI)->print(dbgs()));
+
+        MI->eraseFromParent();
+        Changed = true;
+        LastChange = I;
+        NumCopiesRemoved++;
+        SmallestDef =
+            TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
+        continue;
+      }
+    }
+
+    if (MI->modifiesRegister(TargetReg, TRI))
+      break;
+  }
+
+  if (!Changed)
+    return false;
+
+  // Otherwise, we have to fixup the use-def chain, starting with the
+  // CBZ/CBNZ. Conservatively mark as much as we can live.
+  CompBr->clearRegisterKills(SmallestDef, TRI);
+
+  if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
+    MBB->addLiveIn(TargetReg);
+
+  // Clear any kills of TargetReg between CompBr and the last removed COPY.
+  for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
+    MMI.clearRegisterKills(SmallestDef, TRI);
+
+  return true;
+}
+
+bool AArch64RedundantCopyElimination::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeCopy(&MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() {
+  return new AArch64RedundantCopyElimination();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
new file mode 100644
index 000000000000..a5fd2fbdde19
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -0,0 +1,577 @@
+//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+// This file will be TableGen'ed at some point.
+#include "AArch64GenRegisterBankInfo.def"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
+    : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) {
+  static bool AlreadyInit = false;
+  // We have only one set of register banks, whatever the subtarget
+  // is. Therefore, the initialization of the RegBanks table should be
+  // done only once. Indeed the table of all register banks
+  // (AArch64::RegBanks) is unique in the compiler. At some point, it
+  // will get tablegen'ed and the whole constructor becomes empty.
+  if (AlreadyInit)
+    return;
+  AlreadyInit = true;
+  // Initialize the GPR bank.
+  createRegisterBank(AArch64::GPRRegBankID, "GPR");
+  // The GPR register bank is fully defined by all the registers in
+  // GR64all + its subclasses.
+  addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI);
+  const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+  (void)RBGPR;
+  assert(&AArch64::GPRRegBank == &RBGPR &&
+         "The order in RegBanks is messed up");
+  assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+  // Initialize the FPR bank.
+  createRegisterBank(AArch64::FPRRegBankID, "FPR");
+  // The FPR register bank is fully defined by all the registers in
+  // GR64all + its subclasses.
+  addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI);
+  const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+  (void)RBFPR;
+  assert(&AArch64::FPRRegBank == &RBFPR &&
+         "The order in RegBanks is messed up");
+  assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+         "Subclass not added?");
+  assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+         "Subclass not added?");
+  assert(RBFPR.getSize() == 512 &&
+         "FPRs should hold up to 512-bit via QQQQ sequence");
+
+  // Initialize the CCR bank.
+  createRegisterBank(AArch64::CCRRegBankID, "CCR");
+  addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI);
+  const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID);
+  (void)RBCCR;
+  assert(&AArch64::CCRRegBank == &RBCCR &&
+         "The order in RegBanks is messed up");
+  assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+         "Class not added?");
+  assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+  // Check that the TableGen'ed like file is in sync we our expectations.
+  // First, the Idx.
+  assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
+             AArch64::PartialMappingIdx::PMI_FirstGPR &&
+         "GPR32 index not first in the GPR list");
+  assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
+             AArch64::PartialMappingIdx::PMI_LastGPR &&
+         "GPR64 index not last in the GPR list");
+  assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
+             AArch64::PartialMappingIdx::PMI_LastGPR &&
+         "GPR list is backward");
+  assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
+             AArch64::PartialMappingIdx::PMI_FirstFPR &&
+         "FPR32 index not first in the FPR list");
+  assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
+             AArch64::PartialMappingIdx::PMI_LastFPR &&
+         "FPR512 index not last in the FPR list");
+  assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
+             AArch64::PartialMappingIdx::PMI_LastFPR &&
+         "FPR list is backward");
+  assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR64 &&
+         AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR128 &&
+         AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR256 &&
+         AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
+             AArch64::PartialMappingIdx::PMI_FPR512 &&
+         "FPR indices not properly ordered");
+// Now, the content.
+// Check partial mapping.
+#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB)                      \
+  do {                                                                         \
+    const PartialMapping &Map =                                                \
+        AArch64::PartMappings[AArch64::PartialMappingIdx::Idx -                \
+                              AArch64::PartialMappingIdx::PMI_Min];            \
+    (void)Map;                                                                 \
+    assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength &&           \
+           Map.RegBank == &RB && #Idx " is incorrectly initialized");          \
+  } while (0)
+
+  CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
+  CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+  CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
+  CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
+
+// Check value mapping.
+#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset)                              \
+  do {                                                                         \
+    unsigned PartialMapBaseIdx =                                               \
+        AArch64::PartialMappingIdx::PMI_##RBName##Size -                       \
+        AArch64::PartialMappingIdx::PMI_Min;                                   \
+    (void)PartialMapBaseIdx;                                                   \
+    const ValueMapping &Map = AArch64::getValueMapping(                        \
+        AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset];          \
+    (void)Map;                                                                 \
+    assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] &&       \
+           Map.NumBreakDowns == 1 && #RBName #Size                             \
+           " " #Offset " is incorrectly initialized");                         \
+  } while (0)
+
+#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
+
+  CHECK_VALUEMAP(GPR, 32);
+  CHECK_VALUEMAP(GPR, 64);
+  CHECK_VALUEMAP(FPR, 32);
+  CHECK_VALUEMAP(FPR, 64);
+  CHECK_VALUEMAP(FPR, 128);
+  CHECK_VALUEMAP(FPR, 256);
+  CHECK_VALUEMAP(FPR, 512);
+
+// Check the value mapping for 3-operands instructions where all the operands
+// map to the same value mapping.
+#define CHECK_VALUEMAP_3OPS(RBName, Size)                                      \
+  do {                                                                         \
+    CHECK_VALUEMAP_IMPL(RBName, Size, 0);                                      \
+    CHECK_VALUEMAP_IMPL(RBName, Size, 1);                                      \
+    CHECK_VALUEMAP_IMPL(RBName, Size, 2);                                      \
+  } while (0)
+
+  CHECK_VALUEMAP_3OPS(GPR, 32);
+  CHECK_VALUEMAP_3OPS(GPR, 64);
+  CHECK_VALUEMAP_3OPS(FPR, 32);
+  CHECK_VALUEMAP_3OPS(FPR, 64);
+  CHECK_VALUEMAP_3OPS(FPR, 128);
+  CHECK_VALUEMAP_3OPS(FPR, 256);
+  CHECK_VALUEMAP_3OPS(FPR, 512);
+
+#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size)                 \
+  do {                                                                         \
+    unsigned PartialMapDstIdx =                                                \
+        AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min;                     \
+    unsigned PartialMapSrcIdx =                                                \
+        AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min;                     \
+    (void) PartialMapDstIdx;                                                   \
+    (void) PartialMapSrcIdx;                                                   \
+    const ValueMapping *Map = AArch64::getCopyMapping(                         \
+        AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR,                \
+        AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size);         \
+    (void) Map;                                                                \
+    assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] &&     \
+           Map[0].NumBreakDowns == 1 && #RBNameDst #Size                       \
+           " Dst is incorrectly initialized");                                 \
+    assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] &&     \
+           Map[1].NumBreakDowns == 1 && #RBNameSrc #Size                       \
+           " Src is incorrectly initialized");                                 \
+                                                                               \
+  } while (0)
+
+  CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
+  CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
+  CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
+  CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
+  CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
+  CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
+  CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
+  CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
+
+  assert(verify(TRI) && "Invalid register bank information");
+}
+
+unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
+                                           const RegisterBank &B,
+                                           unsigned Size) const {
+  // What do we do with different size?
+  // copy are same size.
+  // Will introduce other hooks for different size:
+  // * extract cost.
+  // * build_sequence cost.
+
+  // Copy from (resp. to) GPR to (resp. from) FPR involves FMOV.
+  // FIXME: This should be deduced from the scheduling model.
+  if (&A == &AArch64::GPRRegBank && &B == &AArch64::FPRRegBank)
+    // FMOVXDr or FMOVWSr.
+    return 5;
+  if (&A == &AArch64::FPRRegBank && &B == &AArch64::GPRRegBank)
+    // FMOVDXr or FMOVSWr.
+    return 4;
+
+  return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+  switch (RC.getID()) {
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+  case AArch64::FPR128_loRegClassID:
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return getRegBank(AArch64::FPRRegBankID);
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32sponlyRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64commonRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64sponlyRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::tcGPR64RegClassID:
+  case AArch64::WSeqPairsClassRegClassID:
+  case AArch64::XSeqPairsClassRegClassID:
+    return getRegBank(AArch64::GPRRegBankID);
+  case AArch64::CCRRegClassID:
+    return getRegBank(AArch64::CCRRegBankID);
+  default:
+    llvm_unreachable("Register class not supported");
+  }
+}
+
+RegisterBankInfo::InstructionMappings
+AArch64RegisterBankInfo::getInstrAlternativeMappings(
+    const MachineInstr &MI) const {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_OR: {
+    // 32 and 64-bit or can be mapped on either FPR or
+    // GPR for the same cost.
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 3)
+      break;
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+        /*NumOperands*/ 3);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+        /*NumOperands*/ 3);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    return AltMappings;
+  }
+  case TargetOpcode::G_BITCAST: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 2)
+      break;
+
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
+        /*NumOperands*/ 2);
+    InstructionMapping GPRToFPRMapping(
+        /*ID*/ 3,
+        /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRToGPRMapping(
+        /*ID*/ 3,
+        /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
+        AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
+        /*NumOperands*/ 2);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    AltMappings.emplace_back(std::move(GPRToFPRMapping));
+    AltMappings.emplace_back(std::move(FPRToGPRMapping));
+    return AltMappings;
+  }
+  case TargetOpcode::G_LOAD: {
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 64)
+      break;
+
+    // If the instruction has any implicit-defs or uses,
+    // do not mess with it.
+    if (MI.getNumOperands() != 2)
+      break;
+
+    InstructionMappings AltMappings;
+    InstructionMapping GPRMapping(
+        /*ID*/ 1, /*Cost*/ 1,
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+        /*NumOperands*/ 2);
+    InstructionMapping FPRMapping(
+        /*ID*/ 2, /*Cost*/ 1,
+        getOperandsMapping(
+            {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+             // Addresses are GPR 64-bit.
+             AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+        /*NumOperands*/ 2);
+
+    AltMappings.emplace_back(std::move(GPRMapping));
+    AltMappings.emplace_back(std::move(FPRMapping));
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AArch64RegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  switch (OpdMapper.getMI().getOpcode()) {
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_BITCAST:
+  case TargetOpcode::G_LOAD: {
+    // Those ID must match getInstrAlternativeMappings.
+    assert((OpdMapper.getInstrMapping().getID() >= 1 &&
+            OpdMapper.getInstrMapping().getID() <= 4) &&
+           "Don't know how to handle that ID");
+    return applyDefaultMapping(OpdMapper);
+  }
+  default:
+    llvm_unreachable("Don't know how to handle that operation");
+  }
+}
+
+/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode,
+/// having only floating-point operands.
+static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
+  switch (Opc) {
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+  case TargetOpcode::G_FCONSTANT:
+  case TargetOpcode::G_FPEXT:
+  case TargetOpcode::G_FPTRUNC:
+    return true;
+  }
+  return false;
+}
+
+RegisterBankInfo::InstructionMapping
+AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned NumOperands = MI.getNumOperands();
+  assert(NumOperands <= 3 &&
+         "This code is for instructions with 3 or less operands");
+
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+  unsigned Size = Ty.getSizeInBits();
+  bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+
+#ifndef NDEBUG
+  // Make sure all the operands are using similar size and type.
+  // Should probably be checked by the machine verifier.
+  // This code won't catch cases where the number of lanes is
+  // different between the operands.
+  // If we want to go to that level of details, it is probably
+  // best to check that the types are the same, period.
+  // Currently, we just check that the register banks are the same
+  // for each types.
+  for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
+    LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
+    assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) ==
+               AArch64::getRegBankBaseIdxOffset(Size) &&
+           "Operand has incompatible size");
+    bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+    (void)OpIsFPR;
+    assert(IsFPR == OpIsFPR && "Operand has incompatible type");
+  }
+#endif // End NDEBUG.
+
+  AArch64::PartialMappingIdx RBIdx =
+      IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR;
+
+  return InstructionMapping{DefaultMappingID, 1,
+                            AArch64::getValueMapping(RBIdx, Size), NumOperands};
+}
+
+RegisterBankInfo::InstructionMapping
+AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Try the default logic for non-generic instructions that are either copies
+  // or already have some operands assigned to banks.
+  if (!isPreISelGenericOpcode(Opc)) {
+    RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+    if (Mapping.isValid())
+      return Mapping;
+  }
+
+  switch (Opc) {
+    // G_{F|S|U}REM are not listed because they are not legal.
+    // Arithmetic ops.
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_GEP:
+  case TargetOpcode::G_MUL:
+  case TargetOpcode::G_SDIV:
+  case TargetOpcode::G_UDIV:
+    // Bitwise ops.
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR:
+    // Shifts.
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR:
+    // Floating point ops.
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+    return getSameKindOfOperandsMapping(MI);
+  case TargetOpcode::G_BITCAST: {
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    unsigned Size = DstTy.getSizeInBits();
+    bool DstIsGPR = !DstTy.isVector();
+    bool SrcIsGPR = !SrcTy.isVector();
+    const RegisterBank &DstRB =
+        DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
+    const RegisterBank &SrcRB =
+        SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
+    return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size),
+                              AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
+                              /*NumOperands*/ 2};
+  }
+  case TargetOpcode::G_SEQUENCE:
+    // FIXME: support this, but the generic code is really not going to do
+    // anything sane.
+    return InstructionMapping();
+  default:
+    break;
+  }
+
+  unsigned NumOperands = MI.getNumOperands();
+
+  // Track the size and bank of each register.  We don't do partial mappings.
+  SmallVector<unsigned, 4> OpSize(NumOperands);
+  SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    auto &MO = MI.getOperand(Idx);
+    if (!MO.isReg())
+      continue;
+
+    LLT Ty = MRI.getType(MO.getReg());
+    OpSize[Idx] = Ty.getSizeInBits();
+
+    // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
+    // For floating-point instructions, scalars go in FPRs.
+    if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
+    else
+      OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
+  }
+
+  unsigned Cost = 1;
+  // Some of the floating-point instructions have mixed GPR and FPR operands:
+  // fine-tune the computed mapping.
+  switch (Opc) {
+  case TargetOpcode::G_SITOFP:
+  case TargetOpcode::G_UITOFP: {
+    OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
+    break;
+  }
+  case TargetOpcode::G_FPTOSI:
+  case TargetOpcode::G_FPTOUI: {
+    OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
+    break;
+  }
+  case TargetOpcode::G_FCMP: {
+    OpRegBankIdx = {AArch64::PMI_FirstGPR,
+                    /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
+                    AArch64::PMI_FirstFPR};
+    break;
+  }
+  case TargetOpcode::G_BITCAST: {
+    // This is going to be a cross register bank copy and this is expensive.
+    if (OpRegBankIdx[0] != OpRegBankIdx[1])
+      Cost =
+          copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank,
+                   *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]);
+    break;
+  }
+  case TargetOpcode::G_LOAD: {
+    // Loading in vector unit is slightly more expensive.
+    // This is actually only true for the LD1R and co instructions,
+    // but anyway for the fast mode this number does not matter and
+    // for the greedy mode the cost of the cross bank copy will
+    // offset this number.
+    // FIXME: Should be derived from the scheduling model.
+    if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
+      Cost = 2;
+  }
+  }
+
+  // Finally construct the computed mapping.
+  RegisterBankInfo::InstructionMapping Mapping =
+      InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
+  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
+    if (MI.getOperand(Idx).isReg())
+      OpdsMapping[Idx] =
+          AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+
+  Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+  return Mapping;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
new file mode 100644
index 000000000000..f763235049d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -0,0 +1,66 @@
+//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AArch64.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace AArch64 {
+enum {
+  GPRRegBankID = 0, /// General Purpose Registers: W, X.
+  FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
+  CCRRegBankID = 2, /// Conditional register: NZCV.
+  NumRegisterBanks
+};
+
+extern RegisterBank GPRRegBank;
+extern RegisterBank FPRRegBank;
+extern RegisterBank CCRRegBank;
+} // End AArch64 namespace.
+
+/// This class provides the information for the target register banks.
+class AArch64RegisterBankInfo final : public RegisterBankInfo {
+  /// See RegisterBankInfo::applyMapping.
+  void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+  /// Get an instruction mapping where all the operands map to
+  /// the same register bank and have similar size.
+  ///
+  /// \pre MI.getNumOperands() <= 3
+  ///
+  /// \return An InstructionMappings with a statically allocated
+  /// OperandsMapping.
+  static InstructionMapping
+  getSameKindOfOperandsMapping(const MachineInstr &MI);
+
+public:
+  AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+                    unsigned Size) const override;
+
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 000000000000..98fad71aa18a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,450 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64RegisterInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
+    : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {}
+
+const MCPhysReg *
+AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::GHC)
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_AArch64_NoRegs_SaveList;
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_SaveList;
+  if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS)
+    return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
+           CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
+           CSR_AArch64_CXX_TLS_Darwin_SaveList;
+  if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      MF->getFunction()->getAttributes().hasAttrSomewhere(
+          Attribute::SwiftError))
+    return CSR_AArch64_AAPCS_SwiftError_SaveList;
+  if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost)
+    return CSR_AArch64_RT_MostRegs_SaveList;
+  else
+    return CSR_AArch64_AAPCS_SaveList;
+}
+
+const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+      MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
+    return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+  return nullptr;
+}
+
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_AArch64_NoRegs_RegMask;
+  if (CC == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_RegMask;
+  if (CC == CallingConv::CXX_FAST_TLS)
+    return CSR_AArch64_CXX_TLS_Darwin_RegMask;
+  if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
+          ->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_AArch64_AAPCS_SwiftError_RegMask;
+  if (CC == CallingConv::PreserveMost)
+    return CSR_AArch64_RT_MostRegs_RegMask;
+  else
+    return CSR_AArch64_AAPCS_RegMask;
+}
+
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+  if (TT.isOSDarwin())
+    return CSR_AArch64_TLS_Darwin_RegMask;
+
+  assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS");
+  return CSR_AArch64_TLS_ELF_RegMask;
+}
+
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
+                                                CallingConv::ID CC) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i64 argument (which must also be the register used to return a
+  // single i64 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both, the function should return NULL (does not currently apply)
+  assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
+  return CSR_AArch64_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector
+AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  markSuperRegs(Reserved, AArch64::SP);
+  markSuperRegs(Reserved, AArch64::XZR);
+  markSuperRegs(Reserved, AArch64::WSP);
+  markSuperRegs(Reserved, AArch64::WZR);
+
+  if (TFI->hasFP(MF) || TT.isOSDarwin()) {
+    markSuperRegs(Reserved, AArch64::FP);
+    markSuperRegs(Reserved, AArch64::W29);
+  }
+
+  if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
+    markSuperRegs(Reserved, AArch64::X18); // Platform register
+    markSuperRegs(Reserved, AArch64::W18);
+  }
+
+  if (hasBasePointer(MF)) {
+    markSuperRegs(Reserved, AArch64::X19);
+    markSuperRegs(Reserved, AArch64::W19);
+  }
+
+  assert(checkAllSuperRegsMarked(Reserved));
+  return Reserved;
+}
+
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+                                      unsigned Reg) const {
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+
+  switch (Reg) {
+  default:
+    break;
+  case AArch64::SP:
+  case AArch64::XZR:
+  case AArch64::WSP:
+  case AArch64::WZR:
+    return true;
+  case AArch64::X18:
+  case AArch64::W18:
+    return MF.getSubtarget<AArch64Subtarget>().isX18Reserved();
+  case AArch64::FP:
+  case AArch64::W29:
+    return TFI->hasFP(MF) || TT.isOSDarwin();
+  case AArch64::W19:
+  case AArch64::X19:
+    return hasBasePointer(MF);
+  }
+
+  return false;
+}
+
+bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+  return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  return &AArch64::GPR64RegClass;
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &AArch64::CCRRegClass)
+    return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
+  return RC;
+}
+
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
+
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // In the presence of variable sized objects, if the fixed stack size is
+  // large enough that referencing from the FP won't result in things being
+  // in range relatively often, we can use a base pointer to allow access
+  // from the other direction like the SP normally works.
+  // Furthermore, if both variable sized objects are present, and the
+  // stack needs to be dynamically re-aligned, the base pointer is the only
+  // reliable way to reference the locals.
+  if (MFI.hasVarSizedObjects()) {
+    if (needsStackRealignment(MF))
+      return true;
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, we'll materialize the constant and still get to the
+    // object; it's just suboptimal. Negative offsets use the unscaled
+    // load/store instructions, which have a 9-bit signed immediate.
+    return MFI.getLocalFrameSize() >= 256;
+  }
+
+  return false;
+}
+
+unsigned
+AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+  // to the stack pointer, so only put the emergency spill slot next to the
+  // FP when there's no better way to access it (SP or base pointer).
+  return MFI.hasVarSizedObjects() && !hasBasePointer(MF);
+}
+
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
+    return true;
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+                                            int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+    assert(i < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  if (!MI->mayLoad() && !MI->mayStore())
+    return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all GPR callee-saved registers get pushed.
+  // FP, LR, X19-X28, D8-D15. 64-bits each.
+  int64_t FPOffset = Offset - 16 * 20;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI.getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset))
+    return false;
+
+  // If we can reference via the stack pointer or base pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             unsigned BaseReg,
+                                             int64_t Offset) const {
+  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+  assert(MI && "Unable to get the legal offset for nil instruction.");
+  int SaveOffset = Offset;
+  return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+  const MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+  unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset)
+      .addImm(Shifter);
+}
+
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const AArch64InstrInfo *TII =
+      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+  bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
+  assert(Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+  int Offset;
+
+  // Special handling of dbg_value, stackmap and patchpoint instructions.
+  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+    Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                             /*PreferFP=*/true);
+    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+         "Emergency spill slot is out of reach");
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above.  Handle the rest, providing a register that is
+  // SP+LargeImm.
+  unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                                  MachineFunction &MF) const {
+  const AArch64FrameLowering *TFI = getFrameLowering(MF);
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR64commonRegClassID:
+    return 32 - 1                                   // XZR/SP
+              - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP
+              - MF.getSubtarget<AArch64Subtarget>()
+                    .isX18Reserved() // X18 reserved as platform register
+              - hasBasePointer(MF);  // X19
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+    return 32;
+
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return 32;
+
+  case AArch64::FPR128_loRegClassID:
+    return 16;
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
new file mode 100644
index 000000000000..8ce893516fe2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -0,0 +1,106 @@
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "AArch64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+class Triple;
+
+class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
+  const Triple &TT;
+
+public:
+  AArch64RegisterInfo(const Triple &TT);
+
+  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const MCPhysReg *
+  getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+
+  unsigned getCSRFirstUseCost() const override {
+    // The cost will be compared against BlockFrequency where entry has the
+    // value of 1 << 14. A value of 5 will choose to spill or split really
+    // cold path instead of using a callee-saved register.
+    return 5;
+  }
+
+  // Calls involved in thread-local variable lookup save more registers than
+  // normal calls, so they need a different mask to represent this.
+  const uint32_t *getTLSCallPreservedMask() const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i64 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i64 argument and an i64 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
+                                             CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  bool isConstantPhysReg(unsigned PhysReg) const override;
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                          int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+                                    int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  unsigned getBaseRegister() const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction&) const override {
+    return true;
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
new file mode 100644
index 000000000000..7e29ee5e9baf
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -0,0 +1,635 @@
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+
+class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
+               list<string> altNames = []>
+        : Register<n, altNames> {
+  let HWEncoding = enc;
+  let Namespace = "AArch64";
+  let SubRegs = subregs;
+}
+
+let Namespace = "AArch64" in {
+  def sub_32 : SubRegIndex<32>;
+
+  def bsub : SubRegIndex<8>;
+  def hsub : SubRegIndex<16>;
+  def ssub : SubRegIndex<32>;
+  def dsub : SubRegIndex<32>;
+  def sube32 : SubRegIndex<32>;
+  def subo32 : SubRegIndex<32>;
+  def qhisub : SubRegIndex<64>;
+  def qsub : SubRegIndex<64>;
+  def sube64 : SubRegIndex<64>;
+  def subo64 : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def dsub0 : SubRegIndex<64>;
+  def dsub1 : SubRegIndex<64>;
+  def dsub2 : SubRegIndex<64>;
+  def dsub3 : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def qsub0 : SubRegIndex<128>;
+  def qsub1 : SubRegIndex<128>;
+  def qsub2 : SubRegIndex<128>;
+  def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "AArch64" in {
+  def vreg : RegAltNameIndex;
+  def vlist1 : RegAltNameIndex;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+def W0    : AArch64Reg<0,   "w0" >, DwarfRegNum<[0]>;
+def W1    : AArch64Reg<1,   "w1" >, DwarfRegNum<[1]>;
+def W2    : AArch64Reg<2,   "w2" >, DwarfRegNum<[2]>;
+def W3    : AArch64Reg<3,   "w3" >, DwarfRegNum<[3]>;
+def W4    : AArch64Reg<4,   "w4" >, DwarfRegNum<[4]>;
+def W5    : AArch64Reg<5,   "w5" >, DwarfRegNum<[5]>;
+def W6    : AArch64Reg<6,   "w6" >, DwarfRegNum<[6]>;
+def W7    : AArch64Reg<7,   "w7" >, DwarfRegNum<[7]>;
+def W8    : AArch64Reg<8,   "w8" >, DwarfRegNum<[8]>;
+def W9    : AArch64Reg<9,   "w9" >, DwarfRegNum<[9]>;
+def W10   : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11   : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12   : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13   : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14   : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15   : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16   : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17   : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18   : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19   : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20   : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21   : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22   : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23   : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24   : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25   : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26   : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27   : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28   : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29   : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30   : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP   : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR   : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>;
+
+let SubRegIndices = [sub_32] in {
+def X0    : AArch64Reg<0,   "x0",  [W0]>, DwarfRegAlias<W0>;
+def X1    : AArch64Reg<1,   "x1",  [W1]>, DwarfRegAlias<W1>;
+def X2    : AArch64Reg<2,   "x2",  [W2]>, DwarfRegAlias<W2>;
+def X3    : AArch64Reg<3,   "x3",  [W3]>, DwarfRegAlias<W3>;
+def X4    : AArch64Reg<4,   "x4",  [W4]>, DwarfRegAlias<W4>;
+def X5    : AArch64Reg<5,   "x5",  [W5]>, DwarfRegAlias<W5>;
+def X6    : AArch64Reg<6,   "x6",  [W6]>, DwarfRegAlias<W6>;
+def X7    : AArch64Reg<7,   "x7",  [W7]>, DwarfRegAlias<W7>;
+def X8    : AArch64Reg<8,   "x8",  [W8]>, DwarfRegAlias<W8>;
+def X9    : AArch64Reg<9,   "x9",  [W9]>, DwarfRegAlias<W9>;
+def X10   : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11   : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12   : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13   : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14   : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15   : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16   : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17   : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18   : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19   : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20   : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21   : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22   : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23   : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24   : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25   : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26   : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27   : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28   : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP    : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>;
+def LR    : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>;
+def SP    : AArch64Reg<31, "sp",  [WSP]>, DwarfRegAlias<WSP>;
+def XZR   : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
+}
+
+// Condition code register.
+def NZCV  : AArch64Reg<0, "nzcv">;
+
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer.
+def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
+  let AltOrders = [(rotl GPR32common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64common : RegisterClass<"AArch64", [i64], 64,
+                                (add (sequence "X%u", 0, 28), FP, LR)> {
+  let AltOrders = [(rotl GPR64common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP.
+def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
+  let AltOrders = [(rotl GPR32, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
+  let AltOrders = [(rotl GPR64, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// GPR register classes which include SP/WSP.
+def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
+  let AltOrders = [(rotl GPR32sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
+  let AltOrders = [(rotl GPR64sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
+def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
+
+def GPR64spPlus0Operand : AsmOperandClass {
+  let Name = "GPR64sp0";
+  let RenderMethod = "addRegOperands";
+  let ParserMethod = "tryParseGPR64sp0Operand";
+}
+
+def GPR64sp0 : RegisterOperand<GPR64sp> {
+  let ParserMatchClass = GPR64spPlus0Operand;
+}
+
+// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
+// constraint used by any instructions, it is used as a common super-class.
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This is for indirect tail calls to store the address of the destination.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
+                                                     X22, X23, X24, X25, X26,
+                                                     X27, X28, FP, LR)>;
+
+// GPR register classes for post increment amount of vector load/store that
+// has alternate printing when Rm=31 and prints a constant immediate value
+// equal to the total number of bytes transferred.
+
+// FIXME: TableGen *should* be able to do these itself now. There appears to be
+// a bug in counting how many operands a Post-indexed MCInst should have which
+// means the aliases don't trigger.
+def GPR64pi1  : RegisterOperand<GPR64, "printPostIncOperand<1>">;
+def GPR64pi2  : RegisterOperand<GPR64, "printPostIncOperand<2>">;
+def GPR64pi3  : RegisterOperand<GPR64, "printPostIncOperand<3>">;
+def GPR64pi4  : RegisterOperand<GPR64, "printPostIncOperand<4>">;
+def GPR64pi6  : RegisterOperand<GPR64, "printPostIncOperand<6>">;
+def GPR64pi8  : RegisterOperand<GPR64, "printPostIncOperand<8>">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
+
+// Condition code regclass.
+def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+
+  // CCR is not allocatable.
+  let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Scalar Registers
+//===----------------------------------------------------------------------===//
+
+def B0    : AArch64Reg<0,   "b0">, DwarfRegNum<[64]>;
+def B1    : AArch64Reg<1,   "b1">, DwarfRegNum<[65]>;
+def B2    : AArch64Reg<2,   "b2">, DwarfRegNum<[66]>;
+def B3    : AArch64Reg<3,   "b3">, DwarfRegNum<[67]>;
+def B4    : AArch64Reg<4,   "b4">, DwarfRegNum<[68]>;
+def B5    : AArch64Reg<5,   "b5">, DwarfRegNum<[69]>;
+def B6    : AArch64Reg<6,   "b6">, DwarfRegNum<[70]>;
+def B7    : AArch64Reg<7,   "b7">, DwarfRegNum<[71]>;
+def B8    : AArch64Reg<8,   "b8">, DwarfRegNum<[72]>;
+def B9    : AArch64Reg<9,   "b9">, DwarfRegNum<[73]>;
+def B10   : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11   : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12   : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13   : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14   : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15   : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16   : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17   : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18   : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19   : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20   : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21   : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22   : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23   : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24   : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25   : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26   : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27   : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28   : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29   : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30   : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31   : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;
+
+let SubRegIndices = [bsub] in {
+def H0    : AArch64Reg<0,   "h0", [B0]>, DwarfRegAlias<B0>;
+def H1    : AArch64Reg<1,   "h1", [B1]>, DwarfRegAlias<B1>;
+def H2    : AArch64Reg<2,   "h2", [B2]>, DwarfRegAlias<B2>;
+def H3    : AArch64Reg<3,   "h3", [B3]>, DwarfRegAlias<B3>;
+def H4    : AArch64Reg<4,   "h4", [B4]>, DwarfRegAlias<B4>;
+def H5    : AArch64Reg<5,   "h5", [B5]>, DwarfRegAlias<B5>;
+def H6    : AArch64Reg<6,   "h6", [B6]>, DwarfRegAlias<B6>;
+def H7    : AArch64Reg<7,   "h7", [B7]>, DwarfRegAlias<B7>;
+def H8    : AArch64Reg<8,   "h8", [B8]>, DwarfRegAlias<B8>;
+def H9    : AArch64Reg<9,   "h9", [B9]>, DwarfRegAlias<B9>;
+def H10   : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11   : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12   : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13   : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14   : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15   : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16   : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17   : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18   : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19   : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20   : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21   : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22   : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23   : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24   : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25   : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26   : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27   : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28   : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29   : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30   : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31   : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [hsub] in {
+def S0    : AArch64Reg<0,   "s0", [H0]>, DwarfRegAlias<B0>;
+def S1    : AArch64Reg<1,   "s1", [H1]>, DwarfRegAlias<B1>;
+def S2    : AArch64Reg<2,   "s2", [H2]>, DwarfRegAlias<B2>;
+def S3    : AArch64Reg<3,   "s3", [H3]>, DwarfRegAlias<B3>;
+def S4    : AArch64Reg<4,   "s4", [H4]>, DwarfRegAlias<B4>;
+def S5    : AArch64Reg<5,   "s5", [H5]>, DwarfRegAlias<B5>;
+def S6    : AArch64Reg<6,   "s6", [H6]>, DwarfRegAlias<B6>;
+def S7    : AArch64Reg<7,   "s7", [H7]>, DwarfRegAlias<B7>;
+def S8    : AArch64Reg<8,   "s8", [H8]>, DwarfRegAlias<B8>;
+def S9    : AArch64Reg<9,   "s9", [H9]>, DwarfRegAlias<B9>;
+def S10   : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11   : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12   : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13   : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14   : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15   : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16   : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17   : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18   : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19   : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20   : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21   : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22   : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23   : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24   : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25   : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26   : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27   : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28   : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29   : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30   : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31   : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
+def D0    : AArch64Reg<0,   "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1    : AArch64Reg<1,   "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2    : AArch64Reg<2,   "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3    : AArch64Reg<3,   "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4    : AArch64Reg<4,   "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5    : AArch64Reg<5,   "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6    : AArch64Reg<6,   "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7    : AArch64Reg<7,   "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8    : AArch64Reg<8,   "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9    : AArch64Reg<9,   "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10   : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11   : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12   : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13   : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14   : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15   : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16   : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17   : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18   : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19   : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20   : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21   : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22   : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23   : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24   : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25   : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26   : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27   : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28   : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29   : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30   : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31   : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
+def Q0    : AArch64Reg<0,   "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1    : AArch64Reg<1,   "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2    : AArch64Reg<2,   "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3    : AArch64Reg<3,   "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4    : AArch64Reg<4,   "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5    : AArch64Reg<5,   "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6    : AArch64Reg<6,   "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7    : AArch64Reg<7,   "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8    : AArch64Reg<8,   "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9    : AArch64Reg<9,   "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10   : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11   : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12   : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13   : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14   : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15   : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16   : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17   : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18   : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19   : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20   : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21   : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22   : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23   : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24   : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25   : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26   : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27   : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28   : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29   : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30   : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31   : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
+}
+
+def FPR8  : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+  let Size = 8;
+}
+def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+  let Size = 16;
+}
+def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+                                    v1i64, v4f16],
+                                    64, (sequence "D%u", 0, 31)>;
+// We don't (yet) have an f128 legal type, so don't use that here. We
+// normalize 128-bit vectors to v2f64 for arg passing and such, so use
+// that here.
+def FPR128 : RegisterClass<"AArch64",
+                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
+                            v8f16],
+                           128, (sequence "Q%u", 0, 31)>;
+
+// The lower 16 vector registers.  Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"AArch64",
+                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+                              128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+                                 [(rotl FPR64, 0), (rotl FPR64, 1),
+                                  (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+                               [(rotl FPR64, 0), (rotl FPR64, 1),
+                                (rotl FPR64, 2), (rotl FPR64, 3)]>;
+def DD   : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
+  let Size = 128;
+}
+def DDD  : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
+  let Size = 192;
+}
+def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
+  let Size = 256;
+}
+
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+                                 [(rotl FPR128, 0), (rotl FPR128, 1),
+                                  (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+                               [(rotl FPR128, 0), (rotl FPR128, 1),
+                                (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ   : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
+  let Size = 256;
+}
+def QQQ  : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
+  let Size = 384;
+}
+def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
+  let Size = 512;
+}
+
+
+// Vector operand versions of the FP registers. Alternate name printing and
+// assmebler matching.
+def VectorReg64AsmOperand : AsmOperandClass {
+  let Name = "VectorReg64";
+  let PredicateMethod = "isVectorReg";
+}
+def VectorReg128AsmOperand : AsmOperandClass {
+  let Name = "VectorReg128";
+  let PredicateMethod = "isVectorReg";
+}
+
+def V64  : RegisterOperand<FPR64, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg64AsmOperand;
+}
+
+def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg128AsmOperand;
+}
+
+def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
+}
+
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+    : AsmOperandClass {
+  let Name = "TypedVectorList" # count # "_" # lanes # kind;
+
+  let PredicateMethod
+      = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+  let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
+}
+
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+    : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+                                                   # kind # "'>">;
+
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+  // With implicit types (probably on instruction instead). E.g. { v0, v1 }
+  def _64AsmOperand : AsmOperandClass {
+    let Name = NAME # "64";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList64Operands<" # count # ">";
+  }
+
+  def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
+  }
+
+  def _128AsmOperand : AsmOperandClass {
+    let Name = NAME # "128";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList128Operands<" # count # ">";
+  }
+
+  def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+  }
+
+  // 64-bit register lists with explicit type.
+
+  // { v0.8b, v1.8b }
+  def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+  def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+  }
+
+  // { v0.4h, v1.4h }
+  def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+  def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+  }
+
+  // { v0.2s, v1.2s }
+  def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+  def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+  }
+
+  // { v0.1d, v1.1d }
+  def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+  def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+  }
+
+  // 128-bit register lists with explicit type
+
+  // { v0.16b, v1.16b }
+  def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+  def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+  }
+
+  // { v0.8h, v1.8h }
+  def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+  def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+  }
+
+  // { v0.4s, v1.4s }
+  def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+  def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+  }
+
+  // { v0.2d, v1.2d }
+  def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+  def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+  }
+
+  // { v0.b, v1.b }
+  def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+  def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+  }
+
+  // { v0.h, v1.h }
+  def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+  def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
+  }
+
+  // { v0.s, v1.s }
+  def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+  def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
+  }
+
+  // { v0.d, v1.d }
+  def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+  def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+  }
+
+
+}
+
+defm VecListOne   : VectorList<1, FPR64, FPR128>;
+defm VecListTwo   : VectorList<2, DD,    QQ>;
+defm VecListThree : VectorList<3, DDD,   QQQ>;
+defm VecListFour  : VectorList<4, DDDD,  QQQQ>;
+
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1a atomic CASP register operands
+
+
+def WSeqPairs : RegisterTuples<[sube32, subo32], 
+                               [(rotl GPR32, 0), (rotl GPR32, 1)]>;
+def XSeqPairs : RegisterTuples<[sube64, subo64], 
+                               [(rotl GPR64, 0), (rotl GPR64, 1)]>;
+
+def WSeqPairsClass   : RegisterClass<"AArch64", [untyped], 32, 
+                                     (add WSeqPairs)>{
+  let Size = 64;
+}
+def XSeqPairsClass   : RegisterClass<"AArch64", [untyped], 64, 
+                                     (add XSeqPairs)>{
+  let Size = 128;
+}
+
+
+let RenderMethod = "addRegOperands", ParserMethod="tryParseGPRSeqPair" in {
+  def WSeqPairsAsmOperandClass : AsmOperandClass { let Name = "WSeqPair"; }
+  def XSeqPairsAsmOperandClass : AsmOperandClass { let Name = "XSeqPair"; }
+}
+
+def WSeqPairClassOperand :
+    RegisterOperand<WSeqPairsClass, "printGPRSeqPairsClassOperand<32>"> {
+  let ParserMatchClass = WSeqPairsAsmOperandClass;
+}
+def XSeqPairClassOperand :
+    RegisterOperand<XSeqPairsClass, "printGPRSeqPairsClassOperand<64>"> {
+  let ParserMatchClass = XSeqPairsAsmOperandClass;
+}
+
+
+//===----- END: v8.1a atomic CASP register operands -----------------------===//
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
new file mode 100644
index 000000000000..93ca079275c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -0,0 +1,293 @@
+//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+  let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
+  let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
+  let LoadLatency = 3;       // Optimistic load latency assuming bypass.
+                             // This is overriden by OperandCycles if the
+                             // Itineraries are queried instead.
+  let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+                             // Specification - Instruction Timings"
+                             // v 1.0 Spreadsheet
+  let CompleteModel = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-A53 is in-order.
+
+def A53UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def A53UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def A53UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division
+def A53UnitLdSt   : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def A53UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
+def A53UnitFPALU  : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def A53UnitFPMDS  : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+let SchedModel = CortexA53Model in {
+
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+//       forward a cycle earlier and then two cycles earlier in the case of a
+//       shift-only instruction. These latencies will be incorrect when the
+//       result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
+
+// MAC
+def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
+def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+//               below, choosing the median of 3 which makes the latency 6.
+//               May model this more carefully in the future. The remaining
+//               A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+                                          let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+                                                  let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+                                                  let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+//                     accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+                                          let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrReg, [A53UnitB]>;
+def : WriteRes<WriteSys, [A53UnitB]>;
+def : WriteRes<WriteBarrier, [A53UnitB]>;
+def : WriteRes<WriteHint, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
+                                            let ResourceCycles = [29]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+                                                     let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+                                                     let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+                                                      let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+                                                      let ResourceCycles = [28]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding for these reads.
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycles later. This same
+//       ReadAdvance applies to Extended registers as well, even though there is
+//       a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+                             WriteISReg, WriteIEReg,WriteIS,
+                             WriteID32,WriteID64,
+                             WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                          WriteISReg, WriteIEReg,WriteIS,
+                                          WriteID32,WriteID64,
+                                          WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                             WriteISReg, WriteIEReg,WriteIS,
+                                             WriteID32,WriteID64,
+                                             WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+	SchedVar<RegShiftedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+	SchedVar<RegExtendedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+                               WriteISReg, WriteIEReg,WriteIS,
+                               WriteID32,WriteID64,
+                               WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
new file mode 100644
index 000000000000..99c48d0146e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -0,0 +1,664 @@
+//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Cortex-A57 is a traditional superscaler microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
+def CortexA57Model : SchedMachineModel {
+  let IssueWidth        =   3; // 3-way decode and dispatch
+  let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+  let LoadLatency       =   4; // Optimistic load latency
+  let MispredictPenalty =  14; // Fetch + Decode/Rename/Dispatch + Branch
+
+  // Enable partial & runtime unrolling. The magic number is chosen based on
+  // experiments and benchmarking data.
+  let LoopMicroOpBufferSize = 16;
+  let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1>;  // Type B micro-ops
+def A57UnitI : ProcResource<2>;  // Type I micro-ops
+def A57UnitM : ProcResource<1>;  // Type M micro-ops
+def A57UnitL : ProcResource<1>;  // Type L micro-ops
+def A57UnitS : ProcResource<1>;  // Type S micro-ops
+def A57UnitX : ProcResource<1>;  // Type X micro-ops
+def A57UnitW : ProcResource<1>;  // Type W micro-ops
+let SchedModel = CortexA57Model in {
+  def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
+}
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "AArch64SchedA57WriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Cortex-A57. The Cortex-A57 types are directly associated with resources, so
+// defining the aliases precludes the need for mapping them using WriteRes. The
+// aliases are sufficient for creating a coarse, working model. As the model
+// evolves, InstRWs will be used to override some of these SchedAliases.
+//
+// WARNING: Using SchedAliases is convenient and works well for latency and
+//          resource lookup for instructions. However, this creates an entry in
+//          AArch64WriteLatencyTable with a WriteResourceID of 0, breaking
+//          any SchedReadAdvance since the lookup will fail.
+
+def : SchedAlias<WriteImm,   A57Write_1cyc_1I>;
+def : SchedAlias<WriteI,     A57Write_1cyc_1I>;
+def : SchedAlias<WriteISReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>;
+def : SchedAlias<WriteExtr,  A57Write_1cyc_1I>;
+def : SchedAlias<WriteIS,    A57Write_1cyc_1I>;
+def : SchedAlias<WriteID32,  A57Write_19cyc_1M>;
+def : SchedAlias<WriteID64,  A57Write_35cyc_1M>;
+def : WriteRes<WriteIM32, [A57UnitM]> { let Latency = 3; }
+def : WriteRes<WriteIM64, [A57UnitM]> { let Latency = 5; }
+def : SchedAlias<WriteBr,    A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>;
+def : SchedAlias<WriteLD,    A57Write_4cyc_1L>;
+def : SchedAlias<WriteST,    A57Write_1cyc_1S>;
+def : SchedAlias<WriteSTP,   A57Write_1cyc_1S>;
+def : SchedAlias<WriteAdr,   A57Write_1cyc_1I>;
+def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>;
+def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
+def : SchedAlias<WriteF,     A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCmp,  A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCvt,  A57Write_5cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
+def : SchedAlias<WriteFImm,  A57Write_3cyc_1V>;
+def : SchedAlias<WriteFMul,  A57Write_5cyc_1V>;
+def : SchedAlias<WriteFDiv,  A57Write_17cyc_1W>;
+def : SchedAlias<WriteV,     A57Write_3cyc_1V>;
+def : SchedAlias<WriteVLD,   A57Write_5cyc_1L>;
+def : SchedAlias<WriteVST,   A57Write_1cyc_1S>;
+
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+// Forwarding logic is only modeled for multiply and accumulate
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     2, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the modeled is refined, this will override most
+// of the above ShchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>;
+
+
+// Shifted Register with Shift == 0
+// ----------------------------------------------------------------------------
+
+def A57WriteISReg : SchedWriteVariant<[
+       SchedVar<RegShiftedPred, [WriteISReg]>,
+       SchedVar<NoSchedPred, [WriteI]>]>;
+def : InstRW<[A57WriteISReg], (instregex ".*rs$")>;
+
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+
+// Multiply high
+def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>;
+
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I],    (instrs EXTRWrri)>;
+def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>;
+def : InstRW<[A57Write_2cyc_1M],    (instregex "BFM")>;
+
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
+
+
+// Vector Load
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1i(64)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_7cyc_3L],            (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_7cyc_3L, WriteAdr],  (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],           (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2i(8|16)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],            (instregex "LD2i(32)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],  (instregex "LD2i(32)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2i(64)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_5cyc_1L],             (instregex "LD2Rv(1d)$")>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr],   (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_1V],             (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],   (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_2V],           (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD2Twov(2d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_1L_3V],           (instregex "LD3i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],            (instregex "LD3i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],  (instregex "LD3i(32)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],             (instregex "LD3i(64)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],              (instregex "LD3Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],    (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_1L_3V],            (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr],  (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD3Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V],               (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],     (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_10cyc_3L_4V],           (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],               (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr],     (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(8|16)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD4i(32)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD4i(32)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(64)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[A57Write_8cyc_1L_2V],              (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],    (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_2L],               (instregex "LD4Rv(1d)$")>;
+def : InstRW<[A57Write_6cyc_2L, WriteAdr],     (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_3V],            (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr],  (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_9cyc_2L_4V],           (instregex "LD4Rv(2d)$")>;
+def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[A57Write_9cyc_2L_2V],                (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],      (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_11cyc_4L_4V],           (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_4L],                (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_4L, WriteAdr],      (instregex "LD4Fourv(2d)_POST$")>;
+
+// Vector Store
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1S],            (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr],  (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST1i(64)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[A57Write_1cyc_1S],                  (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_1cyc_1S, WriteAdr],        (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S],                (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr],      (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S],             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr],   (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[A57Write_2cyc_2S],           (instregex "ST2i(64)$")>;
+def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_2S_1V],              (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],    (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S_2V],           (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],             (instregex "ST2Twov(2d)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr],   (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],            (instregex "ST3i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],  (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[A57Write_3cyc_3S],           (instregex "ST3i(32)$")>;
+def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V],           (instregex "ST3i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_3S_2V],                 (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr],       (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S_4V],           (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_6cyc_6S],                (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A57Write_6cyc_6S, WriteAdr],      (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A57Write_3cyc_1S_1V],             (instregex "ST4i(8|16)$")>;
+def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],   (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[A57Write_4cyc_4S],           (instregex "ST4i(32)$")>;
+def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[A57Write_3cyc_2S_1V],            (instregex "ST4i(64)$")>;
+def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],  (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[A57Write_4cyc_4S_2V],                  (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr],        (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S_4V],           (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[A57Write_8cyc_8S],                (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A57Write_8cyc_8S, WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
+
+// Vector - Integer
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v8i8, v4i16, v2i32
+//   Q form - v16i8, v8i16, v4i32
+//   D form - v1i8, v1i16, v1i32, v1i64
+//   Q form - v16i8, v8i16, v4i32, v2i64
+//   D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+//   Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+// ASIMD multiply, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+def A57WriteIVMA   : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
+def A57ReadIVMA4   : SchedReadAdvance<4, [A57WriteIVMA]>;
+def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+
+// ASIMD multiply long
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+def A57WriteIVA    : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
+def A57ReadIVA3    : SchedReadAdvance<3, [A57WriteIVA]>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v2f32
+//   Q form - v4f32, v2f64
+//   D form - 32, 64
+//   D form - v1i32, v1i64
+//   D form - v2i32
+//   Q form - v4i32, v2i64
+
+// ASIMD FP arith, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>;
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>;
+
+// ASIMD FP arith, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>;
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>;
+
+// ASIMD FP compare, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>;
+// ASIMD FP compare, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A57Write_17cyc_1W], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A57Write_34cyc_2W], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A57Write_64cyc_2W], (instregex "FDIVv2f64")>;
+
+// Note: These were simply duplicated from ASIMD FDIV because of missing documentation
+// ASIMD FP square root, D-form, F32
+def : InstRW<[A57Write_17cyc_1W], (instregex "FSQRTv2f32")>;
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[A57Write_34cyc_2W], (instregex "FSQRTv4f32")>;
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[A57Write_64cyc_2W], (instregex "FSQRTv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>;
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>;
+// ASIMD FP max/min, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>;
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>;
+// ASIMD FP max/min, reduce
+def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
+
+// ASIMD FP multiply, D-form, FZ
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD FP multiply, Q-form, FZ
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10;  }
+def A57ReadFPVMA5  : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+
+// Vector - Miscellaneous
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v8i8, v4i16, v2i32
+//   Q form - v16i8, v8i16, v4i32
+//   D form - v1i8, v1i16, v1i32, v1i64
+//   Q form - v16i8, v8i16, v4i32, v2i64
+
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+
+// ASIMD duplicate, gen reg, D-form and Q-form
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>;
+
+// ASIMD reciprocal estimate, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>;
+
+// ASIMD reciprocal step, D-form, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD reciprocal step, Q-form, FZ
+def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Remainder
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+
+def A57WriteFPMA  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57ReadFPMA5  : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPM    : SchedReadAdvance<0>;
+def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>;
+
+def : InstRW<[A57Write_32cyc_1W], (instrs FDIVDrr)>;
+def : InstRW<[A57Write_17cyc_1W], (instrs FDIVSrr)>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>;
+
+def : InstRW<[A57Write_32cyc_1W], (instrs FSQRTDr)>;
+def : InstRW<[A57Write_17cyc_1W], (instrs FSQRTSr)>;
+
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>;
+
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>;
+
+} // SchedModel = CortexA57Model
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
new file mode 100644
index 000000000000..55005e1d9ed1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -0,0 +1,544 @@
+//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming conventions is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+//   Prefix: A57Write
+//   Latency: #cyc
+//   MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+//      11 micro-ops to be issued down one I pipe, six S pipes and four V pipes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 5;  }
+def A57Write_5cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 5;  }
+def A57Write_5cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
+def A57Write_5cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+                                                    let ResourceCycles = [17]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+                                                    let ResourceCycles = [19]; }
+def A57Write_1cyc_1B  : SchedWriteRes<[A57UnitB]> { let Latency = 1;  }
+def A57Write_1cyc_1I  : SchedWriteRes<[A57UnitI]> { let Latency = 1;  }
+def A57Write_1cyc_1S  : SchedWriteRes<[A57UnitS]> { let Latency = 1;  }
+def A57Write_2cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 2;  }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+                                                    let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+                                                    let ResourceCycles = [35]; }
+def A57Write_3cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 3;  }
+def A57Write_3cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 3;  }
+def A57Write_3cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 3;  }
+def A57Write_3cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 3;  }
+def A57Write_4cyc_1L  : SchedWriteRes<[A57UnitL]> { let Latency = 4;  }
+def A57Write_4cyc_1X  : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
+def A57Write_9cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57Write_6cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 6;  }
+def A57Write_6cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 6;  }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2W    : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency     = 64;
+  let NumMicroOps = 2;
+  let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X  : SchedWriteRes<[A57UnitV,
+                                          A57UnitX]> {
+  let Latency     = 7;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V  : SchedWriteRes<[A57UnitL,
+                                          A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 8;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L     : SchedWriteRes<[A57UnitL, A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W     : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+                                          A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V    : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I  : SchedWriteRes<[A57UnitB,
+                                          A57UnitI]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS]> {
+  let Latency     = 1;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I  : SchedWriteRes<[A57UnitB,
+                                          A57UnitI]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S     : SchedWriteRes<[A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 2;
+  let NumMicroOps = 2;
+}
+def A57Write_34cyc_2W    : SchedWriteRes<[A57UnitW, A57UnitW]> {
+  let Latency     = 34;
+  let NumMicroOps = 2;
+  let ResourceCycles = [17, 17];
+}
+def A57Write_3cyc_1I_1M  : SchedWriteRes<[A57UnitI,
+                                          A57UnitM]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1I_1S  : SchedWriteRes<[A57UnitI,
+                                          A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V  : SchedWriteRes<[A57UnitS,
+                                          A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_3cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1L  : SchedWriteRes<[A57UnitI,
+                                          A57UnitL]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+def A57Write_4cyc_2X     : SchedWriteRes<[A57UnitX, A57UnitX]> {
+  let Latency     = 4;
+  let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S     : SchedWriteRes<[A57UnitI,
+                                             A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitS,
+                                             A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_1M_2S     : SchedWriteRes<[A57UnitM,
+                                             A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_3S        : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_3cyc_2S_1V     : SchedWriteRes<[A57UnitS, A57UnitS,
+                                             A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 3;
+}
+def A57Write_5cyc_1I_2L     : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 3;
+}
+def A57Write_6cyc_1I_2L     : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+def A57Write_6cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 3;
+}
+def A57Write_7cyc_3L        : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_1I_1L_1V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitL,
+                                             A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_2V     : SchedWriteRes<[A57UnitL,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_8cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 3;
+}
+def A57Write_9cyc_3V        : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 3;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 4 micro-op types
+
+def A57Write_2cyc_2I_2S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 2;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_2I_2S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_3S    : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 4;
+}
+def A57Write_4cyc_4S       : SchedWriteRes<[A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 4;
+}
+def A57Write_7cyc_1I_3L    : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL, A57UnitL]> {
+  let Latency     = 7;
+  let NumMicroOps = 4;
+}
+def A57Write_5cyc_2I_2L    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 5;
+  let NumMicroOps = 4;
+}
+def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+def A57Write_8cyc_4L       : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 8;
+  let NumMicroOps = 4;
+}
+def A57Write_9cyc_2L_2V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 4;
+}
+def A57Write_9cyc_1L_3V    : SchedWriteRes<[A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 4;
+}
+def A57Write_12cyc_4V      : SchedWriteRes<[A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 12;
+  let NumMicroOps = 4;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 5 micro-op types
+
+def A57Write_3cyc_3S_2V    : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 5;
+}
+def A57Write_8cyc_1I_4L    : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitL, A57UnitL]> {
+  let Latency     = 8;
+  let NumMicroOps = 5;
+}
+def A57Write_4cyc_1I_4S    : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_2L_3V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+def A57Write_9cyc_5V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 6 micro-op types
+
+def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 6;
+}
+def A57Write_4cyc_2I_4S    : SchedWriteRes<[A57UnitI, A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+def A57Write_4cyc_4S_2V    : SchedWriteRes<[A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 6;
+}
+def A57Write_6cyc_6S       : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 6;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitL,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+def A57Write_9cyc_2L_4V    : SchedWriteRes<[A57UnitL, A57UnitL,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 6;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 7 micro-op types
+
+def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL,
+                                          A57UnitV, A57UnitV,
+                                          A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 7;
+}
+def A57Write_4cyc_1I_4S_2V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitS, A57UnitS,
+                                             A57UnitS, A57UnitS,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 4;
+  let NumMicroOps = 7;
+}
+def A57Write_6cyc_1I_6S     : SchedWriteRes<[A57UnitI,
+                                          A57UnitS, A57UnitS, A57UnitS,
+                                          A57UnitS, A57UnitS, A57UnitS]> {
+  let Latency     = 6;
+  let NumMicroOps = 7;
+}
+def A57Write_9cyc_1I_2L_4V  : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 7;
+}
+def A57Write_12cyc_7V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 12;
+  let NumMicroOps = 7;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 8 micro-op types
+
+def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 10;
+  let NumMicroOps = 8;
+}
+def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL,
+                                          A57UnitL, A57UnitL,
+                                          A57UnitV, A57UnitV,
+                                          A57UnitV, A57UnitV]> {
+  let Latency     = 11;
+  let NumMicroOps = 8;
+}
+def A57Write_8cyc_8S  : SchedWriteRes<[A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS,
+                                       A57UnitS, A57UnitS]> {
+  let Latency     = 8;
+  let NumMicroOps = 8;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 9 micro-op types
+
+def A57Write_8cyc_1I_8S     : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
+  let Latency     = 8;
+  let NumMicroOps = 9;
+}
+def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitL, A57UnitL,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 11;
+  let NumMicroOps = 9;
+}
+def A57Write_15cyc_9V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 15;
+  let NumMicroOps = 9;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 10 micro-op types
+
+def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitV, A57UnitV,
+                                         A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 10;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 11 micro-op types
+
+def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 6;
+  let NumMicroOps = 11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 12 micro-op types
+
+def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+                                         A57UnitV, A57UnitV,
+                                         A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 12;
+}
+
+//===----------------------------------------------------------------------===//
+// Define Generic 13 micro-op types
+
+def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 8;
+  let NumMicroOps = 13;
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
new file mode 100644
index 000000000000..9fd3ae6818e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -0,0 +1,869 @@
+//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AArch64 Cyclone to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def CycloneModel : SchedMachineModel {
+  let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 192; // Based on the reorder buffer.
+  let LoadLatency = 4; // Optimistic load latency.
+  let MispredictPenalty = 16; // 14-19 cycles are typical.
+  let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cyclone.
+
+// 4 integer pipes
+def CyUnitI : ProcResource<4> {
+  let BufferSize = 48;
+}
+
+// 2 branch units: I[0..1]
+def CyUnitB : ProcResource<2> {
+  let Super  = CyUnitI;
+  let BufferSize = 24;
+}
+
+// 1 indirect-branch unit: I[0]
+def CyUnitBR : ProcResource<1> {
+  let Super  = CyUnitB;
+}
+
+// 2 shifter pipes: I[2..3]
+// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
+def CyUnitIS : ProcResource<2> {
+  let Super = CyUnitI;
+  let BufferSize = 24;
+}
+
+// 1 mul pipe: I[0]
+def CyUnitIM : ProcResource<1> {
+  let Super = CyUnitBR;
+  let BufferSize = 32;
+}
+
+// 1 div pipe: I[1]
+def CyUnitID : ProcResource<1> {
+  let Super = CyUnitB;
+  let BufferSize = 16;
+}
+
+// 1 integer division unit. This is driven by the ID pipe, but only
+// consumes the pipe for one cycle at issue and another cycle at writeback.
+def CyUnitIntDiv : ProcResource<1>;
+
+// 2 ld/st pipes.
+def CyUnitLS : ProcResource<2> {
+  let BufferSize = 28;
+}
+
+// 3 fp/vector pipes.
+def CyUnitV : ProcResource<3> {
+  let BufferSize = 48;
+}
+// 2 fp/vector arithmetic and multiply pipes: V[0-1]
+def CyUnitVM : ProcResource<2> {
+  let Super = CyUnitV;
+  let BufferSize = 32;
+}
+// 1 fp/vector division/sqrt pipe: V[2]
+def CyUnitVD : ProcResource<1> {
+  let Super = CyUnitV;
+  let BufferSize = 16;
+}
+// 1 fp compare pipe: V[0]
+def CyUnitVC : ProcResource<1> {
+  let Super = CyUnitVM;
+  let BufferSize = 16;
+}
+
+// 2 fp division/square-root units.  These are driven by the VD pipe,
+// but only consume the pipe for one cycle at issue and a cycle at writeback.
+def CyUnitFloatDiv : ProcResource<2>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write resources and latency on Cyclone.
+// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
+
+let SchedModel = CycloneModel in {
+
+//---
+// 7.8.1. Moves
+//---
+
+// A single nop micro-op (uX).
+def WriteX : SchedWriteRes<[]> { let Latency = 0; }
+
+// Move zero is a register rename (to machine register zero).
+// The move is replaced by a single nop micro-op.
+// MOVZ Rd, #0
+// AND Rd, Rzr, #imm
+def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def WriteImmZ  : SchedWriteVariant<[
+                   SchedVar<WriteZPred, [WriteX]>,
+                   SchedVar<NoSchedPred, [WriteImm]>]>;
+def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
+
+// Move GPR is a register rename and single nop micro-op.
+// ORR Xd, XZR, Xm
+// ADD Xd, Xn, #0
+def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
+def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
+def WriteMov      : SchedWriteVariant<[
+                      SchedVar<WriteIMovPred, [WriteX]>,
+                      SchedVar<WriteVMovPred, [WriteX]>,
+                      SchedVar<NoSchedPred,   [WriteI]>]>;
+def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
+
+// Move non-zero immediate is an integer ALU op.
+// MOVN,MOVZ,MOVK
+def : WriteRes<WriteImm, [CyUnitI]>;
+
+//---
+// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
+//              Shifts and Bitfield Operations
+//---
+
+// ADR,ADRP
+// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
+// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
+// ADC(S),SBC(S)
+// Aliases: CMN, CMP, TST
+//
+// Conditional operations.
+// CCMNi,CCMPi,CCMNr,CCMPr,
+// CSEL,CSINC,CSINV,CSNEG
+//
+// Bit counting and reversal operations.
+// CLS,CLZ,RBIT,REV,REV16,REV32
+def : WriteRes<WriteI, [CyUnitI]>;
+
+// ADD with shifted register operand is a single micro-op that
+// consumes a shift pipeline for two cycles.
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
+// EXAMPLE: ADDrs Xn, Xm LSL #imm
+def : WriteRes<WriteISReg, [CyUnitIS]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+// ADD with extended register operand is the same as shifted reg operand.
+// ADD(S)re,SUB(S)re
+// EXAMPLE: ADDXre Xn, Xm, UXTB #1
+def : WriteRes<WriteIEReg, [CyUnitIS]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+// Variable shift and bitfield operations.
+// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
+def : WriteRes<WriteIS, [CyUnitIS]>;
+
+// EXTR Shifts a pair of registers and requires two micro-ops.
+// The second micro-op is delayed, as modeled by ReadExtrHi.
+// EXTR Xn, Xm, #imm
+def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+// EXTR's first register read is delayed by one cycle, effectively
+// shortening its writer's latency.
+// EXTR Xn, Xm, #imm
+def : ReadAdvance<ReadExtrHi, 1>;
+
+//---
+// 7.8.6. Multiplies
+//---
+
+// MUL/MNEG are aliases for MADD/MSUB.
+// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
+def : WriteRes<WriteIM32, [CyUnitIM]> {
+  let Latency = 4;
+}
+// MADDX,MSUBX,SMULH,UMULH
+def : WriteRes<WriteIM64, [CyUnitIM]> {
+  let Latency = 5;
+}
+
+//---
+// 7.8.7. Divide
+//---
+
+// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVW,UDIVW
+def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 10];
+}
+// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
+// The ID pipe is consumed for 2 cycles: issue and writeback.
+// SDIVX,UDIVX
+def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
+  let Latency = 13;
+  let ResourceCycles = [2, 13];
+}
+
+//---
+// 7.8.8,7.8.10. Load/Store, single element
+//---
+
+// Integer loads take 4 cycles and use one LS unit for one cycle.
+def : WriteRes<WriteLD, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// Store-load forwarding is 4 cycles.
+//
+// Note: The store-exclusive sequence incorporates this
+// latency. However, general heuristics should not model the
+// dependence between a store and subsequent may-alias load because
+// hardware speculation works.
+def : WriteRes<WriteST, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// Load from base address plus an optionally scaled register offset.
+// Rt latency is latency WriteIS + WriteLD.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def CyWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
+  SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
+
+// EXAMPLE: STR Xn, Xm [, lsl 3]
+def CyWriteSTIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
+  SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
+
+// Read the (unshifted) base register Xn in the second micro-op one cycle later.
+// EXAMPLE: LDR Xn, Xm [, lsl 3]
+def ReadBaseRS : SchedReadAdvance<1>;
+def CyReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+
+//---
+// 7.8.9,7.8.11. Load/Store, paired
+//---
+
+// Address pre/post increment is a simple ALU op with one cycle latency.
+def : WriteRes<WriteAdr, [CyUnitI]>;
+
+// LDP high register write is fused with the load, but a nop micro-op remains.
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 4;
+}
+
+// STP is a vector op and store, except for QQ, which is just two stores.
+def : SchedAlias<WriteSTP, WriteVSTShuffle>;
+def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
+
+//---
+// 7.8.13. Branches
+//---
+
+// Branches take a single micro-op.
+// The misprediction penalty is defined as a SchedMachineModel property.
+def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
+def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
+
+//---
+// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
+//---
+
+// NOP,SEV,SEVL,WFE,WFI,YIELD
+def : WriteRes<WriteHint, []> {let Latency = 0;}
+// ISB
+def : InstRW<[WriteI], (instrs ISB)>;
+// SLREX,DMB,DSB
+def : WriteRes<WriteBarrier, [CyUnitLS]>;
+
+// System instructions get an invalid latency because the latency of
+// other operations across them is meaningless.
+def : WriteRes<WriteSys, []> {let Latency = -1;}
+
+//===----------------------------------------------------------------------===//
+// 7.9 Vector Unit Instructions
+
+// Simple vector operations take 2 cycles.
+def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+
+// Define some longer latency vector op types for Cyclone.
+def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
+def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
+def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
+
+// Simple floating-point operations take 2 cycles.
+def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
+
+//---
+// 7.9.1 Vector Moves
+//---
+
+// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
+// generates expensive int-float conversion instead:
+// FMOVDi Dd, #0.0
+// FMOVv2f64ns Vd.2d, #0.0
+
+// FMOVSi,FMOVDi
+def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
+
+// MOVI,MVNI are WriteV
+// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
+
+// Move FPR is a register rename and single nop micro-op.
+// ORR.16b Vd,Vn,Vn
+// COPY is handled above in the WriteMov Variant.
+def WriteVMov    : SchedWriteVariant<[
+                     SchedVar<WriteVMovPred, [WriteX]>,
+                     SchedVar<NoSchedPred,   [WriteV]>]>;
+def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
+
+// FMOVSr,FMOVDr are WriteF.
+
+// MOV V,V is a WriteV.
+
+// CPY D,V[x] is a WriteV
+
+// INS V[x],V[y] is a WriteV.
+
+// FMOVWSr,FMOVXDr,FMOVXDHighr
+def : WriteRes<WriteFCopy, [CyUnitLS]> {
+  let Latency = 5;
+}
+
+// FMOVSWr,FMOVDXr
+def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
+
+// INS V[x],R
+def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
+
+// SMOV,UMOV R,V[x]
+def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
+def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
+
+// DUP V,R
+def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
+
+// DUP V,V[x] is a WriteV.
+
+//---
+// 7.9.2 Integer Arithmetic, Logical, and Comparisons
+//---
+
+// BIC,ORR V,#imm are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "ABSv")>;
+
+// MVN,NEG,NOT are WriteV
+
+def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
+
+// ADDP is a WriteV.
+def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
+
+def : InstRW<[CyWriteV3],
+             (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
+
+// ADD,SUB are WriteV
+
+// Forward declare.
+def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+
+// Add/Diff and accumulate uses the vector multiply unit.
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVAccum  : SchedReadAdvance<1,
+                    [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+             (instregex "SADALP","UADALP")>;
+
+def : InstRW<[CyWriteVAccum, CyReadVAccum],
+             (instregex "SABAv","UABAv","SABALv","UABALv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
+
+def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
+
+// WriteV includes:
+// AND,BIC,CMTST,EOR,ORN,ORR
+// ADDP
+// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
+// SADDL,SSUBL,UADDL,USUBL
+// SADDW,SSUBW,UADDW,USUBW
+
+def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
+                                     "CMLEv","CMLTv",
+                                     "CMHIv","CMHSv")>;
+
+def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
+                                     "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
+
+def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
+                                       "SABDLv","UABDLv")>;
+
+//---
+// 7.9.3 Floating Point Arithmetic and Comparisons
+//---
+
+// FABS,FNEG are WriteF
+
+def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
+def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
+
+def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
+                                     "FMINPv2i","FMINNMPv2i")>;
+
+def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
+
+def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
+                                  FSUBSrr,FSUBv2f32,FSUBv4f32,
+                                  FADDPv2f32,FADDPv4f32,
+                                  FABD32,FABDv2f32,FABDv4f32)>;
+def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
+                                  FSUBDrr,FSUBv2f64,
+                                  FADDPv2f64,
+                                  FABD64,FABDv2f64)>;
+
+def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
+
+def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
+                                     "FMAXS","FMAXD","FMAXv",
+                                     "FMINS","FMIND","FMINv",
+                                     "FMAXNMS","FMAXNMD","FMAXNMv",
+                                     "FMINNMS","FMINNMD","FMINNMv",
+                                     "FMAXPv2f","FMAXPv4f",
+                                     "FMINPv2f","FMINPv4f",
+                                     "FMAXNMPv2f","FMAXNMPv4f",
+                                     "FMINNMPv2f","FMINNMPv4f")>;
+
+// FCMP,FCMPE,FCCMP,FCCMPE
+def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
+
+// FCSEL is a WriteF.
+
+//---
+// 7.9.4 Shifts and Bitfield Operations
+//---
+
+// SHL is a WriteV
+
+def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
+def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
+
+def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
+def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
+
+// Shift and accumulate uses the vector multiply unit.
+def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
+def CyReadVShiftAcc  : SchedReadAdvance<1,
+                        [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
+def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
+             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// SSHL,USHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
+
+// SQSHL,SQSHLU,UQSHL are WriteV.
+
+def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
+
+// WriteV includes:
+// SHLL,SSHLL,USHLL
+// SLI,SRI
+// BIF,BIT,BSL
+// EXT
+// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
+// XTN2
+
+def : InstRW<[CyWriteV4],
+             (instregex "RSHRNv","SHRNv",
+                        "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
+                        "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+
+//---
+// 7.9.5 Multiplication
+//---
+
+def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
+def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
+                             "SQDMULLv","SQDMULHv","SQRDMULHv")>;
+
+// FMUL,FMULX,FNMUL default to WriteFMul.
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
+
+def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
+def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
+                               FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
+
+def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
+def : InstRW<[CyWriteVMul, CyReadVMulAcc],
+             (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
+              "SQDMLAL","SQDMLSL")>;
+
+def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
+def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
+def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
+def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
+
+def : InstRW<[CyWriteSMul, CyReadSMul],
+             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
+              FMLAv2f32,FMLAv4f32,
+              FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
+def : InstRW<[CyWriteDMul, CyReadDMul],
+             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
+              FMLAv2f64,FMLAv2i64_indexed,
+              FMLSv2f64,FMLSv2i64_indexed)>;
+
+def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
+def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
+
+//---
+// 7.9.6 Divide and Square Root
+//---
+
+// FDIV,FSQRT
+// TODO: Add 64-bit variant with 19 cycle latency.
+// TODO: Specialize FSQRT for longer latency.
+def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+  let Latency = 17;
+  let ResourceCycles = [2, 17];
+}
+
+def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
+
+def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
+def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
+
+def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
+def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
+def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
+def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
+
+//---
+// 7.9.7 Integer-FP Conversions
+//---
+
+// FCVT lengthen f16/s32
+def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
+
+// FCVT,FCVTN,FCVTXN
+// SCVTF,UCVTF V,V
+// FRINT(AIMNPXZ) V,V
+def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
+
+// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
+def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
+def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
+
+// FCVT Rd, S/D = V6+LD4: 10 cycles
+def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
+def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
+
+// FCVTL is a WriteV
+
+//---
+// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
+//---
+
+def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
+def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
+                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
+                                       SHA1SU0rrr)>;
+
+def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
+def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
+
+def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
+def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
+                                       SHA256Hrrr,SHA256H2rrr)>;
+
+// TRN,UZP,ZUP are WriteV.
+
+// TBL,TBX are WriteV.
+
+//---
+// 7.9.11-7.9.14 Load/Store, single element and paired
+//---
+
+// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
+def : WriteRes<WriteVLD, [CyUnitLS]> {
+  let Latency = 5;
+}
+
+// Store-load forwarding is 4 cycles.
+def : WriteRes<WriteVST, [CyUnitLS]> {
+  let Latency = 4;
+}
+
+// WriteVLDPair/VSTPair sequences are expanded by the target description.
+
+//---
+// 7.9.15 Load, element operations
+//---
+
+// Only the first WriteVLD and WriteAdr for writeback matches def operands.
+// Subsequent WriteVLDs consume resources. Since all loaded values have the
+// same latency, this is acceptable.
+
+// Vd is read 5 cycles after issuing the vector load.
+def : ReadAdvance<ReadVLD, 5>;
+
+def : InstRW<[WriteVLD],
+             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+// Register writes from the load's high half are fused micro-ops.
+def : InstRW<[WriteVLD],
+             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr],
+             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
+             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
+             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
+             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],
+             (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
+             (instregex "LD1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle],
+             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr],
+             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+             (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
+             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+             (instregex "LD2i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+             (instregex "LD2i(8|16|32)_POST")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
+             (instregex "LD2i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
+             (instregex "LD2i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV],
+             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
+             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+             (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+             (instregex "LD3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
+             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
+             (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
+             (instregex "LD3i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)_POST")>;
+
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operands.
+// Subsequent WriteVLDs only consume resources.
+
+def : InstRW<[WriteVST],
+             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST],
+             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],
+             (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST],
+             (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST],
+             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVST],
+             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
+             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
+             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle],
+             (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle],
+             (instregex "ST2Twov(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
+def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
+             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
+
+def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
+def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
+
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
+            (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
+            (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
+def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
+              WriteVSTPairShuffle, WriteVSTPairShuffle],
+             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
+              WriteVSTPairShuffle, WriteVSTPairShuffle],
+             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
+def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
+
+def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
+def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
+
+// Atomic operations are not supported.
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+
+} // SchedModel = CycloneModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
new file mode 100644
index 000000000000..19a6d6f2a1ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -0,0 +1,26 @@
+//==- AArch64SchedFalkor.td - Falkor Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Falkor to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define the SchedMachineModel and provide basic properties for coarse grained
+// instruction cost model.
+
+def FalkorModel : SchedMachineModel {
+  let IssueWidth = 4;          // 4-wide issue for expanded uops.
+  let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
+  let LoopMicroOpBufferSize = 16;
+  let LoadLatency = 3;         // Optimistic load latency.
+  let MispredictPenalty = 11;  // Minimum branch misprediction penalty.
+  let CompleteModel = 0;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
new file mode 100644
index 000000000000..4e491a04c78d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -0,0 +1,133 @@
+//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Qualcomm Kryo to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The issue width is set to five, matching the five issue queues for expanded
+// uops. Now, the latency spreadsheet has information based on fragmented uops,
+// but these do not actually take up an issue queue.
+
+def KryoModel : SchedMachineModel {
+  let IssueWidth        =   5; // 5-wide issue for expanded uops
+  let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer
+  let LoadLatency       =   4; // Optimistic load latency
+  let MispredictPenalty =  14; // Fetch + Decode/Rename/Dispatch + Branch
+
+  // Enable partial & runtime unrolling. The magic number is chosen based on
+  // experiments and benchmarking data.
+  let LoopMicroOpBufferSize = 16;
+  let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Kryo.
+
+let SchedModel = KryoModel in {
+  def KryoUnitXA : ProcResource<1>;                   // Type X(A) micro-ops
+  def KryoUnitXB : ProcResource<1>;                   // Type X(B) micro-ops
+  def KryoUnitYA : ProcResource<1>;                   // Type Y(A) micro-ops
+  def KryoUnitYB : ProcResource<1>;                   // Type Y(B) micro-ops
+  def KryoUnitX : ProcResGroup<[KryoUnitXA,          // Type X micro-ops
+                                KryoUnitXB]>;
+  def KryoUnitY : ProcResGroup<[KryoUnitYA,          // Type Y micro-ops
+                                KryoUnitYB]>;
+  def KryoUnitXY : ProcResGroup<[KryoUnitXA,         // Type XY micro-ops
+                                 KryoUnitXB,
+                                 KryoUnitYA,
+                                 KryoUnitYB]>;
+  def KryoUnitLSA : ProcResource<1>;                  // Type LS(A) micro-ops
+  def KryoUnitLSB : ProcResource<1>;                  // Type LS(B) micro-ops
+  def KryoUnitLS : ProcResGroup<[KryoUnitLSA,        // Type LS micro-ops
+                                 KryoUnitLSB]>;
+}
+
+let SchedModel = KryoModel in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Kryo.
+
+def : WriteRes<WriteImm,   [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteI,     [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr,  [KryoUnitXY, KryoUnitX]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS,    [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteID32,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 8; let NumMicroOps = 1; } // Fragent -1
+def : WriteRes<WriteID64,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 8; let NumMicroOps = 1; } // Fragent -1
+def : WriteRes<WriteIM32,  [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteIM64,  [KryoUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr,    [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; }
+def : WriteRes<WriteLD,    [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteST,    [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTP,   [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteAdr,   [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteF,     [KryoUnitXY, KryoUnitXY]>
+      { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp,  [KryoUnitXY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt,  [KryoUnitX]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFImm,  [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteFMul,  [KryoUnitX, KryoUnitX]>
+      { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv,  [KryoUnitXA, KryoUnitY]>
+      { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1
+def : WriteRes<WriteV,     [KryoUnitXY]> { let Latency = 6; }
+def : WriteRes<WriteVLD,   [KryoUnitLS]> { let Latency = 4; }
+def : WriteRes<WriteVST,   [KryoUnitLS]> { let Latency = 4; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the modeled is refined, this will override most
+// of the above SchedWriteRes and SchedAlias mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+
+// Detailed Refinedments
+// -----------------------------------------------------------------------------
+include "AArch64SchedKryoDetails.td"
+
+
+} // SchedModel = KryoModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
new file mode 100644
index 000000000000..426ae6103e4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -0,0 +1,2358 @@
+//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Kryo subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+def KryoWrite_3cyc_X_noRSV_138ln :
+	SchedWriteRes<[KryoUnitX]> {
+    let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln],
+    (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>;
+
+def KryoWrite_3cyc_X_X_139ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+    let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_139ln],
+    (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>;
+
+def KryoWrite_4cyc_XY_XY_noRSV_172ln :
+    SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+    let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln],
+	(instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_XY_XY_XY_XY_178ln :
+    SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+    let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln],
+	(instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_XY_XY_XY_XY_177ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+    let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln],
+	(instregex "(S|U)ABALv.*")>;
+def KryoWrite_3cyc_XY_XY_166ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+    let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_166ln],
+	(instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_159ln :
+	SchedWriteRes<[KryoUnitXY]> {
+    let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln],
+	(instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_165ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+    let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_165ln],
+	(instregex "(S|U)ABDLv.*")>;
+def KryoWrite_3cyc_X_noRSV_154ln :
+	SchedWriteRes<[KryoUnitX]> {
+let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln],
+	(instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>;
+def KryoWrite_3cyc_X_X_155ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_155ln],
+	(instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>;
+def KryoWrite_2cyc_XY_XY_151ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_151ln],
+	(instregex "(S|U)(ADD|SUB)Lv.*")>;
+def KryoWrite_2cyc_XY_noRSV_148ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln],
+	(instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>;
+def KryoWrite_2cyc_XY_XY_150ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_150ln],
+	(instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>;
+def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln],
+	(instrs SADDLVv4i32v, UADDLVv4i32v)>;
+def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+	let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln],
+	(instrs SADDLVv8i16v, UADDLVv8i16v)>;
+def KryoWrite_6cyc_XY_XY_X_noRSV_181ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln],
+	(instrs SADDLVv16i8v, UADDLVv16i8v)>;
+def KryoWrite_3cyc_XY_noRSV_158ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln],
+	(instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>;
+def KryoWrite_4cyc_X_noRSV_169ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln],
+	(instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>;
+def KryoWrite_2cyc_XY_XY_XY_XY_176ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln],
+	(instregex "(S|U)(ADDW|SUBW)v.*")>;
+def KryoWrite_4cyc_X_noRSV_40ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln],
+	(instregex "(S|U)CVTFS(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_97ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln],
+	(instregex "(S|U)CVTFU(W|X)(D|S)ri")>;
+def KryoWrite_4cyc_X_noRSV_110ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln],
+	(instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+def KryoWrite_4cyc_X_X_114ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_114ln],
+	(instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+def KryoWrite_1cyc_XA_Y_98ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_98ln],
+	(instregex "(S|U)DIV(_Int)?(W|X)r")>;
+def KryoWrite_2cyc_XY_XY_152ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_152ln],
+	(instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_149ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln],
+	(instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_4cyc_X_70ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_70ln],
+	(instregex "(S|U)(MADDL|MSUBL)rrr")>;
+def KryoWrite_4cyc_X_X_191ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_191ln],
+	(instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def KryoWrite_1cyc_XY_195ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_195ln],
+	(instregex "(S|U)MOVv.*")>;
+def KryoWrite_5cyc_X_71ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_71ln],
+	(instrs SMULHrr, UMULHrr)>;
+def KryoWrite_3cyc_XY_noRSV_186ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln],
+	(instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_187ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_187ln],
+	(instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_noRSV_69ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln],
+	(instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_248ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln],
+	(instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_3cyc_XY_XY_250ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_250ln],
+	(instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_246ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln],
+	(instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>;
+def KryoWrite_3cyc_XY_XY_251ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_251ln],
+	(instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_6cyc_XY_X_238ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_238ln],
+	(instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>;
+def KryoWrite_3cyc_XY_noRSV_249ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln],
+	(instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>;
+def KryoWrite_6cyc_XY_X_noRSV_252ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln],
+	(instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>;
+def KryoWrite_3cyc_XY_noRSV_161ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln],
+	(instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>;
+def KryoWrite_3cyc_XY_noRSV_163ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln],
+	(instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_162ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln],
+	(instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>;
+def KryoWrite_3cyc_XY_noRSV_247ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln],
+	(instregex "(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_noRSV_239ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln],
+	(instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_243ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_243ln],
+	(instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_XY_241ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_241ln],
+	(instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def KryoWrite_2cyc_XY_noRSV_240ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln],
+	(instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def KryoWrite_2cyc_XY_XY_242ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_242ln],
+	(instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def KryoWrite_2cyc_XY_XY_183ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_183ln],
+	(instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>;
+def KryoWrite_2cyc_XY_noRSV_182ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln],
+	(instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_noRSV_184ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln],
+	(instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>;
+def KryoWrite_4cyc_X_noRSV_185ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln],
+	(instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>;
+def KryoWrite_2cyc_XY_noRSV_67ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln],
+	(instrs ABSv1i64)>;
+def KryoWrite_1cyc_XY_63ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI],
+	(instregex "ADC.*")>;
+def KryoWrite_1cyc_XY_63_1ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_63_1ln],
+	(instregex "ADR.*")>;
+def KryoWrite_1cyc_XY_62ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI],
+	(instregex "ADDS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_64ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI],
+	(instregex "ADDS?(W|X)r(r|s|x)(64)?")>;
+def KryoWrite_1cyc_XY_noRSV_65ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln],
+	(instrs ADDv1i64)>;
+def KryoWrite_1cyc_XY_noRSV_144ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln],
+	(instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_146ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_146ln],
+	(instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_XY_X_noRSV_171ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln],
+	(instregex "(ADD|SUB)HNv.*")>;
+def KryoWrite_1cyc_XY_noRSV_66ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln],
+	(instrs ADDPv2i64p)>;
+def KryoWrite_2cyc_XY_XY_153ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_153ln],
+	(instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_3cyc_XY_XY_noRSV_170ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln],
+	(instrs ADDVv4i32v)>;
+def KryoWrite_4cyc_XY_XY_noRSV_173ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln],
+	(instrs ADDVv8i16v)>;
+def KryoWrite_5cyc_XY_X_noRSV_174ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln],
+	(instrs ADDVv16i8v)>;
+def KryoWrite_3cyc_XY_XY_X_X_27ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln],
+	(instrs AESDrr, AESErr)>;
+def KryoWrite_2cyc_X_X_22ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_22ln],
+	(instrs AESIMCrr, AESMCrr)>;
+def KryoWrite_1cyc_XY_noRSV_76ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln],
+	(instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>;
+def KryoWrite_1cyc_XY_XY_79ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_79ln],
+	(instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def KryoWrite_1cyc_X_72ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_72ln],
+	(instregex "(S|U)?BFM.*")>;
+def KryoWrite_1cyc_XY_noRSV_77ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln],
+	(instregex "(BIC|ORR)S?Wri")>;
+def KryoWrite_1cyc_XY_XY_78ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_78ln],
+	(instregex "(BIC|ORR)S?Xri")>;
+def KryoWrite_1cyc_X_noRSV_74ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
+	(instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+def KryoWrite_1cyc_X_X_75ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_75ln],
+	(instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+def KryoWrite_0cyc_noRSV_11ln :
+	SchedWriteRes<[]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_noRSV_11ln],
+	(instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>;
+def KryoWrite_0cyc_XY_16ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI],
+	(instregex "(CCMN|CCMP)(W|X)i")>;
+def KryoWrite_0cyc_XY_16_1ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI],
+	(instregex "(CCMN|CCMP)(W|X)r")>;
+def KryoWrite_2cyc_XY_3ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI],
+	(instregex "(CLS|CLZ)(W|X)r")>;
+def KryoWrite_2cyc_XY_noRSV_7ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln],
+	(instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_2cyc_XY_XY_8ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_8ln],
+	(instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_2cyc_XY_noRSV_80ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln],
+	(instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>;
+def KryoWrite_2cyc_XY_XY_83ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_83ln],
+	(instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>;
+def KryoWrite_2cyc_XY_noRSV_81ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln],
+	(instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>;
+def KryoWrite_2cyc_XY_XY_82ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_82ln],
+	(instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>;
+def KryoWrite_3cyc_XY_4ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg],
+	(instregex "CRC32.*")>;
+def KryoWrite_1cyc_XY_20ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI],
+	(instregex "CSEL(W|X)r")>;
+def KryoWrite_1cyc_X_17ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI],
+	(instregex "(CSINC|CSNEG)(W|X)r")>;
+def KryoWrite_1cyc_XY_18ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI],
+	(instregex "(CSINV)(W|X)r")>;
+def KryoWrite_3cyc_LS_X_13ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_X_13ln],
+	(instrs DRPS)>;
+def KryoWrite_0cyc_LS_10ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_10ln],
+	(instrs DSB, DMB, CLREX)>;
+def KryoWrite_1cyc_X_noRSV_196ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln],
+	(instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>;
+def KryoWrite_1cyc_X_X_197ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_197ln],
+	(instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>;
+def KryoWrite_3cyc_LS_LS_X_15ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln],
+	(instrs ERET)>;
+def KryoWrite_1cyc_X_noRSV_207ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_207ln],
+	(instrs EXTv8i8)>;
+def KryoWrite_1cyc_X_X_212ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_212ln],
+	(instrs EXTv16i8)>;
+def KryoWrite_2cyc_XY_X_136ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_X_136ln],
+	(instrs EXTRWrri, EXTRXrri)>;
+def KryoWrite_2cyc_XY_noRSV_35ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln],
+	(instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>;
+def KryoWrite_2cyc_XY_XY_106ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_106ln],
+	(instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_104ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln],
+	(instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>;
+def KryoWrite_3cyc_XY_noRSV_107ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln],
+	(instregex "F(MAX|MIN)(NM)?Vv4i32v")>;
+def KryoWrite_3cyc_XY_noRSV_101ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln],
+	(instregex "FABD(32|64|v2f32)")>;
+def KryoWrite_3cyc_XY_XY_103ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_103ln],
+	(instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>;
+def KryoWrite_1cyc_XY_noRSV_48ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln],
+	(instregex "F(ABS|NEG)(D|S)r")>;
+def KryoWrite_1cyc_XY_noRSV_124ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln],
+	(instregex "F(ABS|NEG)v2f32")>;
+def KryoWrite_1cyc_XY_XY_125ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_125ln],
+	(instregex "F(ABS|NEG)(v2f64|v4f32)")>;
+def KryoWrite_2cyc_XY_noRSV_33ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln],
+	(instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>;
+def KryoWrite_3cyc_XY_noRSV_30ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln],
+	(instregex "(FADD|FSUB)(D|S)rr")>;
+def KryoWrite_3cyc_XY_noRSV_100ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln],
+	(instregex "(FADD|FSUB|FADDP)v2f32")>;
+def KryoWrite_3cyc_XY_noRSV_29ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln],
+	(instregex "FADDP(v2i32p|v2i64p)")>;
+def KryoWrite_0cyc_XY_31ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_31ln],
+	(instregex "FCCMPE?(D|S)rr")>;
+def KryoWrite_2cyc_XY_noRSV_34ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln],
+	(instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>;
+def KryoWrite_2cyc_XY_XY_36ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_36ln],
+	(instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>;
+def KryoWrite_2cyc_XY_noRSV_105ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln],
+	(instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>;
+def KryoWrite_0cyc_XY_32ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_32ln],
+	(instregex "FCMPE?(D|S)r(r|i)")>;
+def KryoWrite_1cyc_XY_noRSV_49ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln],
+	(instrs FCSELDrrr, FCSELSrrr)>;
+def KryoWrite_4cyc_X_noRSV_41ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln],
+	(instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>;
+def KryoWrite_4cyc_X_38ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_38ln],
+	(instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>;
+def KryoWrite_4cyc_X_noRSV_113ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln],
+	(instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>;
+def KryoWrite_4cyc_X_X_117ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_117ln],
+	(instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>;
+def KryoWrite_5cyc_X_X_XY_noRSV_119ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> {
+	let Latency = 5; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln],
+	(instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_X_116ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_116ln],
+	(instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>;
+def KryoWrite_4cyc_X_noRSV_112ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln],
+	(instrs FCVTXNv1i64)>;
+def KryoWrite_4cyc_X_37ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_37ln],
+	(instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def KryoWrite_4cyc_X_noRSV_111ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln],
+	(instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>;
+def KryoWrite_4cyc_X_X_115ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_115ln],
+	(instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
+def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
+	(instrs FDIVDrr, FDIVSrr)>;
+def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+	(instrs FDIVv2f32)>;
+def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+	(instrs FDIVv2f64, FDIVv4f32)>;
+def KryoWrite_5cyc_X_noRSV_55ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln],
+	(instregex "FN?M(ADD|SUB)Srrr")>;
+def KryoWrite_6cyc_X_noRSV_57ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln],
+	(instregex "FN?M(ADD|SUB)Drrr")>;
+def KryoWrite_5cyc_X_noRSV_51ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln],
+	(instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>;
+def KryoWrite_5cyc_X_X_56ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_56ln],
+	(instrs FMLAv4f32, FMLSv4f32)>;
+def KryoWrite_6cyc_X_X_61ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_61ln],
+	(instrs FMLAv2f64, FMLSv2f64)>;
+def KryoWrite_5cyc_X_noRSV_128ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln],
+	(instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_131ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_131ln],
+	(instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_134ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_134ln],
+	(instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>;
+def KryoWrite_6cyc_X_noRSV_60ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln],
+	(instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>;
+def KryoWrite_1cyc_XY_45ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_45ln],
+	(instregex "FMOV(XDHigh|DXHigh|DX)r")>;
+def KryoWrite_1cyc_XY_noRSV_47ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln],
+	(instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>;
+def KryoWrite_5cyc_X_noRSV_53ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln],
+	(instrs FMULv1i32_indexed, FMULXv1i32_indexed)>;
+def KryoWrite_5cyc_X_noRSV_127ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln],
+	(instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>;
+def KryoWrite_5cyc_X_X_130ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_130ln],
+	(instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>;
+def KryoWrite_6cyc_X_X_133ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_133ln],
+	(instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>;
+def KryoWrite_5cyc_X_noRSV_54ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln],
+	(instrs FMULSrr, FNMULSrr, FMULX32)>;
+def KryoWrite_6cyc_X_noRSV_59ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln],
+	(instrs FMULDrr, FNMULDrr, FMULX64)>;
+def KryoWrite_3cyc_XY_noRSV_28ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln],
+	(instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64 )>;
+def KryoWrite_3cyc_XY_noRSV_99ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln],
+	(instrs FRECPEv2f32, FRSQRTEv2f32)>;
+def KryoWrite_3cyc_XY_XY_102ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_102ln],
+	(instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def KryoWrite_5cyc_X_noRSV_52ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln],
+	(instrs FRECPS32, FRSQRTS32)>;
+def KryoWrite_6cyc_X_noRSV_58ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln],
+	(instrs FRECPS64, FRSQRTS64)>;
+def KryoWrite_5cyc_X_noRSV_126ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln],
+	(instrs FRECPSv2f32, FRSQRTSv2f32)>;
+def KryoWrite_5cyc_X_X_129ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_129ln],
+	(instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def KryoWrite_6cyc_X_X_132ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_6cyc_X_X_132ln],
+	(instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def KryoWrite_3cyc_XY_noRSV_50ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln],
+	(instrs FRECPXv1i32, FRECPXv1i64)>;
+def KryoWrite_2cyc_XY_noRSV_39ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln],
+	(instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>;
+def KryoWrite_2cyc_XY_noRSV_108ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln],
+	(instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>;
+def KryoWrite_2cyc_XY_XY_109ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
+	(instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
+def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
+	(instregex "FSQRT(S|D)r")>;
+def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
+	(instregex "FSQRTv2f32")>;
+def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
+	(instregex "FSQRT(v2f64|v4f32)")>;
+def KryoWrite_1cyc_X_201ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_201ln],
+	(instregex "INSv.*")>;
+def KryoWrite_3cyc_LS_255ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_255ln],
+	(instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>;
+def KryoWrite_4cyc_LS_X_270ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_270ln],
+	(instregex "LD1(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_noRSV_285ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln],
+	(instregex "LD1One(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_289ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr],
+	(instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_298ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr],
+	(instregex "LD1(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_308ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln],
+	(instregex "LD1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_317ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr],
+	(instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_328ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr],
+	(instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_332ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr],
+	(instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln],
+	(instregex "LD1Three(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln],
+	(instregex "LD1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln],
+	(instregex "LD1Four(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr],
+	(instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr],
+	(instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_281ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_281ln],
+	(instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_311ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln],
+	(instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_313ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr],
+	(instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr],
+	(instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_256ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_256ln],
+	(instregex "LD1R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_286ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln],
+	(instregex "LD1R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_290ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr],
+	(instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_318ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr],
+	(instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_257ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_257ln],
+	(instregex "LD2i64$")>;
+def KryoWrite_3cyc_LS_XY_291ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr],
+	(instregex "LD2i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_296ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln],
+	(instregex "LD2(i8|i16|i32)$")>;
+def KryoWrite_4cyc_LS_XY_X_X_321ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr],
+	(instregex "LD2(i8|i16|i32)_POST$")>;
+def KryoWrite_3cyc_LS_LS_282ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_282ln],
+	(instregex "LD2R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_312ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln],
+	(instregex "LD2R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_314ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr],
+	(instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr],
+	(instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_283ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_283ln],
+	(instregex "LD3i64$")>;
+def KryoWrite_3cyc_LS_LS_LS_309ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln],
+	(instregex "LD3Threev2d$")>;
+def KryoWrite_3cyc_LS_XY_LS_315ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr],
+	(instregex "LD3i64_POST$")>;
+def KryoWrite_4cyc_LS_X_X_X_320ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln],
+	(instregex "LD3(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_331ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr],
+	(instregex "LD3Threev2d_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_338ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr],
+	(instregex "LD3(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln],
+	(instregex "LD3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr],
+	(instregex "LD3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln],
+	(instregex "LD3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr],
+	(instregex "LD3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_310ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln],
+	(instregex "LD3R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_333ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr],
+	(instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln],
+	(instregex "LD3R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr],
+	(instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_284ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_284ln],
+	(instregex "LD4i64$")>;
+def KryoWrite_3cyc_LS_XY_LS_316ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr],
+	(instregex "LD4i64_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_329ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln],
+	(instregex "LD4Four(v2d)$")>;
+def KryoWrite_4cyc_LS_X_X_X_X_337ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln],
+	(instregex "LD4(i8|i16|i32)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr],
+	(instregex "LD4Four(v2d)_POST$")>;
+def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr],
+	(instregex "LD4(i8|i16|i32)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 10;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln],
+	(instregex "LD4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr],
+	(instregex "LD4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln],
+	(instregex "LD4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr],
+	(instregex "LD4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_3cyc_LS_LS_LS_LS_330ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln],
+	(instregex "LD4R(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr],
+	(instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln],
+	(instregex "LD4R(v8b|v4h|v2s|v1d)$")>;
+def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr],
+	(instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>;
+def KryoWrite_3cyc_LS_LS_400ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
+	(instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+def KryoWrite_3cyc_LS_LS_401ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi],
+	(instrs LDNPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_408ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi],
+	(instrs LDNPDi, LDNPSi)>;
+def KryoWrite_3cyc_LS_394ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi],
+	(instrs LDNPWi, LDNPXi)>;
+def KryoWrite_3cyc_LS_LS_402ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi],
+	(instrs LDPQi)>;
+def KryoWrite_3cyc_LS_noRSV_noRSV_409ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi],
+	(instrs LDPDi, LDPSi)>;
+def KryoWrite_3cyc_LS_XY_LS_410ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr],
+	(instregex "LDPQ(post|pre)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr],
+	(instregex "LDP(D|S)(post|pre)")>;
+def KryoWrite_3cyc_LS_393ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi],
+	(instrs LDPWi, LDPXi)>;
+def KryoWrite_3cyc_LS_XY_403ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr],
+	(instregex "LDP(W|X)(post|pre)")>;
+def KryoWrite_4cyc_LS_395ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi],
+	(instrs LDPSWi)>;
+def KryoWrite_4cyc_LS_XY_405ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr],
+	(instrs LDPSWpost, LDPSWpre)>;
+def KryoWrite_3cyc_LS_264ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_264ln],
+	(instrs LDRQui, LDRQl)>;
+def KryoWrite_4cyc_X_LS_271ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_271ln],
+	(instrs LDRQroW, LDRQroX)>;
+def KryoWrite_3cyc_LS_noRSV_287ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln],
+	(instregex "LDR((D|S)l|(D|S|H|B)ui)")>;
+def KryoWrite_3cyc_LS_XY_293ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr],
+	(instrs LDRQpost, LDRQpre)>;
+def KryoWrite_4cyc_X_LS_noRSV_297ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln],
+	(instregex "LDR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_3cyc_LS_XY_noRSV_319ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr],
+	(instregex "LDR(D|S|H|B)(post|pre)")>;
+def KryoWrite_3cyc_LS_261ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_261ln],
+	(instregex "LDR(BB|HH|W|X)ui")>;
+def KryoWrite_3cyc_LS_XY_292ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr],
+	(instregex "LDR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_4cyc_X_LS_272ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_LS_272ln],
+	(instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>;
+def KryoWrite_3cyc_LS_262ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_262ln],
+	(instrs LDRWl, LDRXl)>;
+def KryoWrite_4cyc_LS_268ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_268ln],
+	(instregex "LDRS(BW|BX|HW|HX|W)ui")>;
+def KryoWrite_5cyc_X_LS_273ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS]> {
+	let Latency = 5; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_5cyc_X_LS_273ln],
+	(instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>;
+def KryoWrite_4cyc_LS_XY_294ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitXY]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr],
+	(instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>;
+def KryoWrite_4cyc_LS_269ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_269ln],
+	(instrs LDRSWl)>;
+def KryoWrite_3cyc_LS_260ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_260ln],
+	(instregex "LDTR(B|H|W|X)i")>;
+def KryoWrite_4cyc_LS_267ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_267ln],
+	(instregex "LDTRS(BW|BX|HW|HX|W)i")>;
+def KryoWrite_3cyc_LS_263ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_263ln],
+	(instrs LDURQi)>;
+def KryoWrite_3cyc_LS_noRSV_288ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln],
+	(instregex "LDUR(D|S|H|B)i")>;
+def KryoWrite_3cyc_LS_259ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_259ln],
+	(instregex "LDUR(BB|HH|W|X)i")>;
+def KryoWrite_4cyc_LS_266ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_LS_266ln],
+	(instregex "LDURS(B|H)?(W|X)i")>;
+def KryoWrite_3cyc_LS_258ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258ln],
+	(instregex "LDXP(W|X)")>;
+def KryoWrite_3cyc_LS_258_1ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 3; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_3cyc_LS_258_1ln],
+	(instregex "LDXR(B|H|W|X)")>;
+def KryoWrite_2cyc_XY_XY_137ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_137ln],
+	(instrs LSLVWr, LSLVXr)>;
+def KryoWrite_1cyc_XY_135ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_135ln],
+	(instregex "(LS|AS|RO)RV(W|X)r")>;
+def KryoWrite_4cyc_X_84ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_4cyc_X_84ln],
+	(instrs MADDWrrr, MSUBWrrr)>;
+def KryoWrite_5cyc_X_85ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_5cyc_X_85ln],
+	(instrs MADDXrrr, MSUBXrrr)>;
+def KryoWrite_4cyc_X_noRSV_188ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln],
+	(instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>;
+def KryoWrite_4cyc_X_X_192ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_192ln],
+	(instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>;
+def KryoWrite_1cyc_XY_noRSV_198ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln],
+	(instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>;
+def KryoWrite_1cyc_XY_XY_199ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_199ln],
+	(instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>;
+def KryoWrite_1cyc_X_89ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_89ln],
+	(instrs MOVKWi, MOVKXi)>;
+def KryoWrite_1cyc_XY_91ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_91ln],
+	(instrs MOVNWi, MOVNXi)>;
+def KryoWrite_1cyc_XY_90ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_90ln],
+	(instrs MOVZWi, MOVZXi)>;
+def KryoWrite_2cyc_XY_93ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_93ln],
+	(instrs MRS)>;
+def KryoWrite_0cyc_X_87ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_X_87ln],
+	(instrs MSRpstateImm4)>;
+def : InstRW<[KryoWrite_0cyc_X_87ln],
+	(instrs MSRpstateImm1)>;
+def KryoWrite_0cyc_XY_88ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_XY_88ln],
+	(instrs MSR)>;
+def KryoWrite_1cyc_XY_noRSV_143ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln],
+	(instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>;
+def KryoWrite_1cyc_XY_XY_145ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_145ln],
+	(instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_noRSV_193ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln],
+	(instrs NOTv8i8)>;
+def KryoWrite_1cyc_XY_XY_194ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_194ln],
+	(instrs NOTv16i8)>;
+def KryoWrite_2cyc_XY_noRSV_234ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln],
+	(instrs PMULv8i8)>;
+def KryoWrite_2cyc_XY_XY_236ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_236ln],
+	(instrs PMULv16i8)>;
+def KryoWrite_2cyc_XY_XY_235ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_235ln],
+	(instrs PMULLv8i8, PMULLv16i8)>;
+def KryoWrite_3cyc_XY_XY_237ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_237ln],
+	(instrs PMULLv1i64, PMULLv2i64)>;
+def KryoWrite_0cyc_LS_254ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_254ln],
+	(instrs PRFMl, PRFMui)>;
+def KryoWrite_0cyc_LS_253ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_253ln],
+	(instrs PRFUMi)>;
+def KryoWrite_6cyc_XY_X_noRSV_175ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitX]> {
+	let Latency = 6; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln],
+	(instregex "R(ADD|SUB)HNv.*")>;
+def KryoWrite_2cyc_XY_204ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_2cyc_XY_204ln],
+	(instrs RBITWr, RBITXr)>;
+def KryoWrite_2cyc_XY_noRSV_218ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln],
+	(instrs RBITv8i8)>;
+def KryoWrite_2cyc_XY_XY_219ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_219ln],
+	(instrs RBITv16i8)>;
+def KryoWrite_1cyc_X_202ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_X_202ln],
+	(instregex "REV(16|32)?(W|X)r")>;
+def KryoWrite_1cyc_XY_noRSV_214ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln],
+	(instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_1cyc_XY_XY_216ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_216ln],
+	(instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>;
+def KryoWrite_3cyc_X_noRSV_244ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln],
+	(instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>;
+def KryoWrite_3cyc_X_X_245ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_245ln],
+	(instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>;
+def KryoWrite_1cyc_XY_2ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI],
+	(instregex "SBCS?(W|X)r")>;
+def KryoWrite_2cyc_XA_XA_XA_24ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+	let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln],
+	(instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>;
+def KryoWrite_1cyc_XY_noRSV_21ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln],
+	(instrs SHA1Hrr)>;
+def KryoWrite_2cyc_X_X_23ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_23ln],
+	(instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def KryoWrite_4cyc_XA_XA_XA_25ln :
+	SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> {
+	let Latency = 4; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln],
+	(instrs SHA256Hrrr, SHA256H2rrr)>;
+def KryoWrite_3cyc_XY_XY_X_X_26ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln],
+	(instrs SHA256SU1rrr)>;
+def KryoWrite_4cyc_X_noRSV_189ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln],
+	(instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>;
+def KryoWrite_3cyc_XY_noRSV_68ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln],
+	(instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>;
+def KryoWrite_3cyc_XY_noRSV_157ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln],
+	(instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_164ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_164ln],
+	(instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_4cyc_X_noRSV_190ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln],
+	(instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>;
+def KryoWrite_0cyc_LS_Y_274ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_274ln],
+	(instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_1cyc_LS_Y_X_301ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln],
+	(instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_305ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln],
+	(instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_323ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln],
+	(instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln],
+	(instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln],
+	(instregex "ST1Three(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+                   KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln],
+	(instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln],
+	(instregex "ST1Four(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln],
+	(instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_275ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_275ln],
+	(instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_306ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln],
+	(instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_322ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln],
+	(instregex "ST2Two(v16b|v8h|v4s|v2d)$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln],
+	(instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_324ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln],
+	(instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln],
+	(instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln],
+	(instregex "ST3Three(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln],
+	(instregex "ST3Threev2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln],
+	(instregex "ST3Three(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY,
+                   KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 7;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln],
+	(instregex "ST3Threev2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+                   KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 12;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln],
+	(instregex "ST3Three(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY,
+                   KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 13;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln],
+	(instregex "ST3Three(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_325ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln],
+	(instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>;
+def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln],
+	(instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln],
+	(instregex "ST4Four(v8b|v4h|v2s)$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS,
+                   KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln],
+	(instregex "ST4Fourv2d$")>;
+def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln],
+	(instregex "ST4Four(v8b|v4h|v2s)_POST$")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY,
+                   KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 9;
+}
+def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln],
+	(instregex "ST4Fourv2d_POST$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS,
+                   KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 16;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln],
+	(instregex "ST4Four(v16b|v8h|v4s)$")>;
+def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX,
+                   KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX,
+                   KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 17;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln],
+	(instregex "ST4Four(v16b|v8h|v4s)_POST$")>;
+def KryoWrite_0cyc_LS_LS_Y_299ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln],
+	(instregex "STLR(B|H|W|X)")>;
+def KryoWrite_3cyc_LS_LS_Y_307ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> {
+	let Latency = 3; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln],
+	(instregex "STLX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_0cyc_LS_Y_276ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_276ln],
+	(instrs STNPDi, STNPSi)>;
+def KryoWrite_0cyc_LS_Y_LS_Y_326ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln],
+	(instrs STNPQi)>;
+def KryoWrite_0cyc_LS_Y_280ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_280ln],
+	(instrs STNPWi, STNPXi)>;
+def KryoWrite_0cyc_LS_Y_277ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_277ln],
+	(instregex "STP(D|S)i")>;
+def KryoWrite_1cyc_LS_Y_X_303ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln],
+	(instregex "STP(D|S)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_LS_Y_327ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln],
+	(instrs STPQi)>;
+def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 5;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln],
+	(instrs STPQpost, STPQpre)>;
+def KryoWrite_0cyc_LS_Y_279ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_279ln],
+	(instregex "STP(W|X)i")>;
+def KryoWrite_1cyc_LS_X_Y_300ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln],
+	(instregex "STP(W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_278ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_278ln],
+	(instregex "STR(Q|D|S|H|B)ui")>;
+def KryoWrite_1cyc_X_LS_Y_295ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln],
+	(instregex "STR(D|S|H|B)ro(W|X)")>;
+def KryoWrite_1cyc_LS_Y_X_304ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln],
+	(instregex "STR(Q|D|S|H|B)(post|pre)")>;
+def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS,
+                   KryoUnitY]> {
+	let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln],
+	(instregex "STRQro(W|X)")>;
+def KryoWrite_0cyc_LS_Y_399ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_399ln],
+	(instregex "STR(BB|HH|W|X)ui")>;
+def KryoWrite_1cyc_X_LS_Y_406ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln],
+	(instregex "STR(BB|HH|W|X)ro(W|X)")>;
+def KryoWrite_1cyc_LS_X_Y_407ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> {
+	let Latency = 1; let NumMicroOps = 3;
+}
+def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln],
+	(instregex "STR(BB|HH|W|X)(post|pre)")>;
+def KryoWrite_0cyc_LS_Y_398ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_398ln],
+	(instregex "STTR(B|H|W|X)i")>;
+def KryoWrite_0cyc_LS_Y_396ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_396ln],
+	(instregex "STUR(Q|D|S|H|B)i")>;
+def KryoWrite_0cyc_LS_Y_397ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 0; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_0cyc_LS_Y_397ln],
+	(instregex "STUR(BB|HH|W|X)i")>;
+def KryoWrite_3cyc_LS_Y_404ln :
+	SchedWriteRes<[KryoUnitLS, KryoUnitY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_LS_Y_404ln],
+	(instregex "STX(P(W|X)|R(B|H|W|X))")>;
+def KryoWrite_3cyc_XY_noRSV_160ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln],
+	(instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>;
+def KryoWrite_3cyc_XY_XY_167ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_167ln],
+	(instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>;
+def KryoWrite_1cyc_XY_1ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI],
+	(instregex "SUBS?(W|X)ri")>;
+def KryoWrite_2cyc_XY_XY_5ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg],
+	(instregex "SUBS?(W|X)rx")>;
+def KryoWrite_2cyc_XY_XY_5_1ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 2; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg],
+	(instregex "SUBS?(W|X)rs")>;
+def KryoWrite_1cyc_XY_noRSV_6ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI],
+	(instregex "SUBS?(W|X)rr")>;
+def KryoWrite_0cyc_LS_9ln :
+	SchedWriteRes<[KryoUnitLS]> {
+	let Latency = 0; let NumMicroOps = 1;
+}
+def : InstRW<[KryoWrite_0cyc_LS_9ln],
+	(instregex "SYSL?xt")>;
+def KryoWrite_1cyc_X_noRSV_205ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln],
+	(instrs TBLv8i8One)>;
+def KryoWrite_1cyc_X_X_208ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_208ln],
+	(instrs TBLv16i8One)>;
+def KryoWrite_2cyc_X_X_X_noRSV_222ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln],
+	(instrs TBLv8i8Two)>;
+def KryoWrite_2cyc_X_X_X_X_X_X_224ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln],
+	(instrs TBLv16i8Two)>;
+def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 6;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln],
+	(instrs TBLv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln],
+	(instrs TBLv8i8Four)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 11;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln],
+	(instrs TBLv16i8Three)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 15;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln],
+	(instrs TBLv16i8Four)>;
+def KryoWrite_2cyc_X_X_noRSV_220ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln],
+	(instrs TBXv8i8One)>;
+def KryoWrite_2cyc_X_X_X_X_221ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 2; let NumMicroOps = 4;
+}
+def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln],
+	(instrs TBXv16i8One)>;
+def KryoWrite_3cyc_X_X_X_X_noRSV_223ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 5;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln],
+	(instrs TBXv8i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 7;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln],
+	(instrs TBXv8i8Three)>;
+def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 3; let NumMicroOps = 8;
+}
+def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln],
+	(instrs TBXv16i8Two)>;
+def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 4; let NumMicroOps = 9;
+}
+def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln],
+	(instrs TBXv8i8Four)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY,
+                   KryoUnitX, KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 13;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln],
+	(instrs TBXv16i8Three)>;
+def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln :
+    SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX,
+                   KryoUnitX, KryoUnitX]> {
+	let Latency = 5; let NumMicroOps = 17;
+}
+def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln],
+	(instrs TBXv16i8Four)>;
+def KryoWrite_1cyc_XY_XY_217ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_XY_217ln],
+	(instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>;
+def KryoWrite_1cyc_X_X_211ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_211ln],
+	(instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_XY_213ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_XY_213ln],
+	(instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_3cyc_XY_noRSV_156ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln],
+	(instrs URECPEv2i32, URSQRTEv2i32)>;
+def KryoWrite_3cyc_XY_XY_168ln :
+	SchedWriteRes<[KryoUnitXY, KryoUnitXY]> {
+	let Latency = 3; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_3cyc_XY_XY_168ln],
+	(instrs URECPEv4i32, URSQRTEv4i32)>;
+def KryoWrite_1cyc_X_X_210ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_210ln],
+	(instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>;
+def KryoWrite_1cyc_X_noRSV_206ln :
+	SchedWriteRes<[KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln],
+	(instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>;
+def KryoWrite_1cyc_XY_noRSV_215ln :
+	SchedWriteRes<[KryoUnitXY]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln],
+	(instregex "XTNv.*")>;
+def KryoWrite_1cyc_X_X_209ln :
+	SchedWriteRes<[KryoUnitX, KryoUnitX]> {
+	let Latency = 1; let NumMicroOps = 2;
+}
+def : InstRW<[KryoWrite_1cyc_X_X_209ln],
+	(instregex "ZIP1(v4i32|v8i16|v16i8)")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
new file mode 100644
index 000000000000..14d6891253fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
@@ -0,0 +1,380 @@
+//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Samsung Exynos-M1 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M1 is a traditional superscalar microprocessor with a
+// 4-wide in-order stage for decode and dispatch and a wider issue stage.
+// The execution units and loads and stores are out-of-order.
+
+def ExynosM1Model : SchedMachineModel {
+  let IssueWidth            =  4; // Up to 4 uops per cycle.
+  let MicroOpBufferSize     = 96; // ROB size.
+  let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
+  let LoadLatency           =  4; // Optimistic load cases.
+  let MispredictPenalty     = 14; // Minimum branch misprediction penalty.
+  let CompleteModel         =  0; // Use the default model otherwise.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M1,
+// which has 9 pipelines, each with its own queue with out-of-order dispatch.
+
+def M1UnitA  : ProcResource<2>; // Simple integer
+def M1UnitC  : ProcResource<1>; // Simple and complex integer
+def M1UnitD  : ProcResource<1>; // Integer division (inside C, serialized)
+def M1UnitB  : ProcResource<2>; // Branch
+def M1UnitL  : ProcResource<1>; // Load
+def M1UnitS  : ProcResource<1>; // Store
+def M1PipeF0 : ProcResource<1>; // FP #0
+let Super = M1PipeF0 in {
+  def M1UnitFMAC   : ProcResource<1>; // FP multiplication
+  def M1UnitNAL0   : ProcResource<1>; // Simple vector
+  def M1UnitNMISC  : ProcResource<1>; // Miscellanea
+  def M1UnitFCVT   : ProcResource<1>; // FP conversion
+  def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
+}
+def M1PipeF1 : ProcResource<1>; // FP #1
+let Super = M1PipeF1 in {
+  def M1UnitFADD : ProcResource<1>; // Simple FP
+  def M1UnitNAL1 : ProcResource<1>; // Simple vector
+  def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
+  def M1UnitFST  : ProcResource<1>; // FP store
+}
+
+let SchedModel = ExynosM1Model in {
+  def M1UnitALU  : ProcResGroup<[M1UnitA,
+                                 M1UnitC]>;    // All integer
+  def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
+                                 M1UnitNAL1]>; // All simple vector
+}
+
+let SchedModel = ExynosM1Model in {
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model for the Exynos-M1.
+
+def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
+def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; }
+def M1WriteC1 : SchedWriteRes<[M1UnitC]>   { let Latency = 1; }
+def M1WriteC2 : SchedWriteRes<[M1UnitC]>   { let Latency = 2; }
+
+def M1WriteB1 : SchedWriteRes<[M1UnitB]>      { let Latency = 1; }
+
+def M1WriteL5 : SchedWriteRes<[M1UnitL]>   { let Latency = 5; }
+def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
+                                                            M1WriteA1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteL5]>]>;
+
+def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
+def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
+def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
+def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
+                                                            M1WriteA1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteS1]>]>;
+
+def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
+                                      SchedVar<NoSchedPred,   [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, M1ReadAdrBase>;
+
+// Branch instructions.
+// NOTE: Unconditional direct branches actually take neither cycles nor units.
+def : WriteRes<WriteBr,    [M1UnitB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
+
+// Arithmetic and logical integer instructions.
+def : WriteRes<WriteI,     [M1UnitALU]> { let Latency = 1; }
+// TODO: Shift over 3 and some extensions take 2 cycles.
+def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIS,    [M1UnitALU]> { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
+
+// Divide and multiply instructions.
+def : WriteRes<WriteID32, [M1UnitC,
+                           M1UnitD]> { let Latency = 13;
+                                       let ResourceCycles = [1, 13]; }
+def : WriteRes<WriteID64, [M1UnitC,
+                           M1UnitD]> { let Latency = 21;
+                                       let ResourceCycles = [1, 21]; }
+// TODO: Long multiplication take 5 cycles and also the ALU.
+// TODO: Multiplication with accumulation can be advanced.
+def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
+// TODO: 64-bit multiplication has a throughput of 1/2.
+def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; }
+
+// Miscellaneous instructions.
+def : WriteRes<WriteExtr, [M1UnitALU,
+                           M1UnitALU]> { let Latency = 2; }
+
+// TODO: The latency for the post or pre register is 1 cycle.
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Load instructions.
+def : WriteRes<WriteLD,    [M1UnitL]>   { let Latency = 4; }
+def : WriteRes<WriteLDHi,  [M1UnitALU]> { let Latency = 4; }
+def : SchedAlias<WriteLDIdx, M1WriteLA>;
+
+// Store instructions.
+def : WriteRes<WriteST,    [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTP,   [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTX,   [M1UnitS]> { let Latency = 1; }
+def : SchedAlias<WriteSTIdx, M1WriteSA>;
+
+// FP data instructions.
+def : WriteRes<WriteF,    [M1UnitFADD]>  { let Latency = 3; }
+// TODO: FCCMP is much different.
+def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
+def : WriteRes<WriteFDiv, [M1UnitFVAR]>  { let Latency = 15;
+                                           let ResourceCycles = [15]; }
+def : WriteRes<WriteFMul, [M1UnitFMAC]>  { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files is much different.
+def : WriteRes<WriteFCvt,  [M1UnitFCVT]> { let Latency = 3; }
+def : WriteRes<WriteFImm,  [M1UnitNALU]> { let Latency = 1; }
+def : WriteRes<WriteFCopy, [M1UnitS]>    { let Latency = 4; }
+
+// FP load instructions.
+// TODO: ASIMD loads are much different.
+def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
+
+// FP store instructions.
+// TODO: ASIMD stores are much different.
+def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
+
+// ASIMD FP instructions.
+def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+// Integer multiply-accumulate.
+// TODO: The forwarding for WriteIM64 saves actually 3 cycles.
+def : ReadAdvance<ReadIMA,     2, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model for the Exynos-M1.
+
+def M1WriteNEONA   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitFADD]>   { let Latency = 9; }
+def M1WriteNEONB   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 5; }
+def M1WriteNEONC   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 6; }
+def M1WriteNEOND   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST,
+                                    M1UnitL]>      { let Latency = 10; }
+def M1WriteNEONE   : SchedWriteRes<[M1UnitFCVT,
+                                    M1UnitFST]>    { let Latency = 8; }
+def M1WriteNEONF   : SchedWriteRes<[M1UnitFCVT,
+                                    M1UnitFST,
+                                    M1UnitL]>      { let Latency = 13; }
+def M1WriteNEONG   : SchedWriteRes<[M1UnitNMISC,
+                                    M1UnitFST]>    { let Latency = 6; }
+def M1WriteNEONH   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 3; }
+def M1WriteNEONI   : SchedWriteRes<[M1UnitFST,
+                                    M1UnitL]>      { let Latency = 9; }
+def M1WriteNEONJ   : SchedWriteRes<[M1UnitNMISC,
+                                    M1UnitFMAC]>   { let Latency = 6; }
+def M1WriteNEONK   : SchedWriteRes<[M1UnitNMISC,
+                                    M1UnitFMAC]>   { let Latency = 7; }
+def M1WriteFADD3   : SchedWriteRes<[M1UnitFADD]>   { let Latency = 3; }
+def M1WriteFCVT3   : SchedWriteRes<[M1UnitFCVT]>   { let Latency = 3; }
+def M1WriteFCVT4   : SchedWriteRes<[M1UnitFCVT]>   { let Latency = 4; }
+def M1WriteFMAC4   : SchedWriteRes<[M1UnitFMAC]>   { let Latency = 4; }
+def M1WriteFMAC5   : SchedWriteRes<[M1UnitFMAC]>   { let Latency = 5; }
+def M1WriteFVAR15  : SchedWriteRes<[M1UnitFVAR]>   { let Latency = 15;
+                                                     let ResourceCycles = [15]; }
+def M1WriteFVAR23  : SchedWriteRes<[M1UnitFVAR]>   { let Latency = 23;
+                                                     let ResourceCycles = [23]; }
+def M1WriteNALU1   : SchedWriteRes<[M1UnitNALU]>   { let Latency = 1; }
+def M1WriteNALU2   : SchedWriteRes<[M1UnitNALU]>   { let Latency = 2; }
+def M1WriteNAL11   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 1; }
+def M1WriteNAL12   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 2; }
+def M1WriteNAL13   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 3; }
+def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
+def M1WriteNMISC1  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 1; }
+def M1WriteNMISC2  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 2; }
+def M1WriteNMISC3  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 3; }
+def M1WriteNMISC4  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 4; }
+def M1WriteTB      : SchedWriteRes<[M1UnitC,
+                                    M1UnitALU]>    { let Latency = 2; }
+
+// Branch instructions
+def : InstRW<[M1WriteB1], (instrs Bcc)>;
+// NOTE: Conditional branch and link adds a B uop.
+def : InstRW<[M1WriteA1], (instrs BL)>;
+// NOTE: Indirect branch and link with LR adds an ALU uop.
+def : InstRW<[M1WriteA1,
+              M1WriteC1], (instrs BLR)>;
+def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M1WriteC1,
+              M1WriteA2], (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M1WriteA1], (instrs COPY)>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+
+// Load instructions.
+
+// Store instructions.
+
+// FP data instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)[DS]r")>;
+def : InstRW<[M1WriteFADD3],  (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M1WriteNEONG],  (instregex "^FCCMPE?[DS]rr")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
+def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
+def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M1WriteFMAC4],  (instregex "^FN?MUL[DS]rr")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT.+r")>;
+def : InstRW<[M1WriteNEONH],  (instregex "^FCSEL[DS]rrr")>;
+def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
+def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
+def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
+def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
+def : InstRW<[M1WriteS4],    (instregex "^FMOV[WX][DS](High)?r")>;
+def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
+
+// FP load instructions.
+
+// FP store instructions.
+
+// ASIMD instructions.
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^CMTSTv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^(S|SR|U|UR)SRAv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^[SU]?SH(L|LL|R)2?v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^S[LR]Iv")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU](Q|QR|R)SHLU?v")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
+def : InstRW<[M1WriteNEONA],  (instregex "^FADDP")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M1WriteNEONJ],  (instregex "^FMULX?v.i")>;
+def : InstRW<[M1WriteFMAC4],  (instregex "^FMULX?v.f")>;
+def : InstRW<[M1WriteNEONK],  (instregex "^FML[AS]v.i")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^FML[AS]v.f")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT[AIMNPXZ]v")>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^RBITv")>;
+def : InstRW<[M1WriteNAL11],  (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^CPY")>;
+def : InstRW<[M1WriteNEONB],  (instregex "^DUPv.+gpr")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^DUPv.+lane")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?Q?XTU?Nv")>;
+def : InstRW<[M1WriteNEONC],  (instregex "^INSv.+gpr")>;
+def : InstRW<[M1WriteFCVT4],  (instregex "^[FU](RECP|RSQRT)Ev")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^F(RECP|RSQRT)Sv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M1WriteNAL11],  (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
+                              (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
+                              (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
+                              (instregex "^TB[LX]v8i8Four")>;
+def : InstRW<[M1WriteNAL12],  (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
+                              (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
+                              (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
+                              (instregex "^TB[LX]v16i8Four")>;
+def : InstRW<[M1WriteNEOND],  (instregex "^[SU]MOVv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^INSv.+lane")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>;
+def : InstRW<[M1WriteNALU2],  (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^ZIP[12]v")>;
+
+// ASIMD load instructions.
+
+// ASIMD store instructions.
+
+// Cryptography instructions.
+def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1ReadAES  : SchedReadAdvance<1, [M1WriteAES]>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+
+def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;
+
+// CRC instructions.
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>;
+
+} // SchedModel = ExynosM1Model
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
new file mode 100644
index 000000000000..35a40c314bf4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
@@ -0,0 +1,852 @@
+//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// 1. Introduction
+//
+// This file defines the machine model for Broadcom Vulcan to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def VulcanModel : SchedMachineModel {
+  let IssueWidth            =   4; // 4 micro-ops dispatched at a time.
+  let MicroOpBufferSize     = 180; // 180 entries in micro-op re-order buffer.
+  let LoadLatency           =   4; // Optimistic load latency.
+  let MispredictPenalty     =  12; // Extra cycles for mispredicted branch.
+  // Determined via a mix of micro-arch details and experimentation.
+  let LoopMicroOpBufferSize =  32; 
+  let PostRAScheduler       =   1; // Using PostRA sched.
+  let CompleteModel         =   1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def VulcanP0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def VulcanP1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def VulcanP2 : ProcResource<1>;
+
+// Port 3: Store data.
+def VulcanP3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def VulcanP4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def VulcanP5 : ProcResource<1>;
+
+let SchedModel = VulcanModel in {
+
+// Define groups for the functional units on each issue port.  Each group
+// created will be used by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member.  This is a way to create names for
+// the various functional units that share a single issue port.  For example,
+// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def VulcanI1 : ProcResGroup<[VulcanP1]>;
+
+// Branch micro-ops only on port 2.
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60 entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+                              VulcanP3, VulcanP4, VulcanP5]> {
+  let BufferSize=60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 16;
+  let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 23;
+  let ResourceCycles = [11];
+}
+
+// 1 cycles on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 : 
+  SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+// 1 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = VulcanModel in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr,      [VulcanI2]> { let Latency = 1; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg,   [VulcanI2]> { let Latency = 1; }
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI,       [VulcanI012]> { let Latency = 1; }
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg,   [VulcanI012]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+def : WriteRes<WriteIEReg,   [VulcanI012]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+
+// Move immed
+def : WriteRes<WriteImm,     [VulcanI012]> { let Latency = 1; }
+
+// Variable shift
+def : WriteRes<WriteIS,      [VulcanI012]> { let Latency = 1; }
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23.  Take the average.
+def : WriteRes<WriteID32,    [VulcanI1]> {
+  let Latency = 18;
+  let ResourceCycles = [18];
+}
+
+// Divide, X-form
+// Latency range of 13-39.  Take the average.
+def : WriteRes<WriteID64,    [VulcanI1]> {
+  let Latency = 26;
+  let ResourceCycles = [26];
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32,    [VulcanI012]> { let Latency = 5; }
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64,    [VulcanI012]> { let Latency = 5; }
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr,    [VulcanI012]> { let Latency = 1; }
+
+// Bitfield move, basic
+// Bitfield move, insert
+// NOTE: Handled by WriteIS.
+
+// Count leading
+def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+                                               "^CLZ(W|X)r$")>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions 
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD,      [VulcanLS01]> { let Latency = 4; }
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr,     [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+  SchedVar<NoSchedPred,   [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadDefault]>,
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op.  Thus
+// the resources are handling by WriteLD.
+def : WriteRes<WriteLDHi,    []> {
+  let Latency = 5;
+}
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions 
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST,      [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP,     [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF,       [VulcanF01]> { let Latency = 5; }
+
+// FP arithmetic
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp,    [VulcanF01]> { let Latency = 5; }
+
+// FP divide, S-form
+// FP square root, S-form
+def : WriteRes<WriteFDiv,    [VulcanF01]> {
+  let Latency = 16;
+  let ResourceCycles = [8];
+}
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+
+// FP round to integral
+def : InstRW<[VulcanWrite_7Cyc_F01],
+            (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+def : InstRW<[VulcanWrite_5Cyc_F01], 
+            (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith,pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+                                                "^FCMGTv", "^FCMLEv",
+                                                "^FCMLTv")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+                                                "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+                                                "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+                                                "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+
+// ASIMD FP multiply accumulate, Dform, FZ
+// ASIMD FP multiply accumulate, Dform, no FZ
+// ASIMD FP multiply accumulate, Qform, FZ
+// ASIMD FP multiply accumulate, Qform, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+
+// ASIMD FP negate
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+
+// ASIMD FP round, D-form
+// ASIMD FP round, Q-form
+// NOTE: Handled by WriteV.
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+
+// ASIMD extract
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+// ASIMD extract narrow, saturating
+// NOTE: Handled by WriteV.
+
+// ASIMD insert, element to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[VulcanWrite_5Cyc_F01], 
+            (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+                                   "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[VulcanWrite_5Cyc_F01], 
+            (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or word
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+                                                "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions 
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01], 
+            (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], 
+            (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_4Cyc_LS01], 
+            (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], 
+            (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01], 
+            (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], 
+            (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_6Cyc_LS01], 
+            (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], 
+            (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+            (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+            (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01], 
+            (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lone, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01], 
+            (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01], 
+            (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01], 
+            (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], 
+            (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01], 
+            (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+            (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01], 
+            (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+            (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01], 
+            (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+            (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[VulcanWrite_1Cyc_LS01], 
+            (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+            (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+            (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+            (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+            (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+            (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+            (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+            (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+//--
+// 3.17 Cryptography Extensions
+//--
+
+// Crypto AES ops
+def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+
+// Crypto polynomial (64x64) multiply long
+def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+
+// Crypto SHA1 xor ops
+// Crypto SHA1 schedule acceleration ops
+// Crypto SHA256 schedule acceleration op (1 u-op)
+// Crypto SHA256 schedule acceleration op (2 u-ops)
+// Crypto SHA256 hash acceleration ops
+def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+
+//--
+// 3.18 CRC
+//--
+
+// CRC checksum ops
+def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+
+} // SchedModel = VulcanModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
new file mode 100644
index 000000000000..ce81f48acf71
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
@@ -0,0 +1,106 @@
+//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+  const AArch64InstrInfo *TII =
+    static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+// AArch64 Scheduler Definitions
+
+def WriteImm       : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
+
+def WriteI         : SchedWrite; // ALU
+def WriteISReg     : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg     : SchedWrite; // ALU of Extended-Reg
+def ReadI          : SchedRead;  // ALU
+def ReadISReg      : SchedRead;  // ALU of Shifted-Reg
+def ReadIEReg      : SchedRead;  // ALU of Extended-Reg
+def WriteExtr      : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi     : SchedRead;  // Read the high reg of the EXTR pair
+def WriteIS        : SchedWrite; // Shift/Scale
+def WriteID32      : SchedWrite; // 32-bit Divide
+def WriteID64      : SchedWrite; // 64-bit Divide
+def ReadID         : SchedRead;  // 32/64-bit Divide
+def WriteIM32      : SchedWrite; // 32-bit Multiply
+def WriteIM64      : SchedWrite; // 64-bit Multiply
+def ReadIM         : SchedRead;  // 32/64-bit Multiply
+def ReadIMA        : SchedRead;  // 32/64-bit Multiply Accumulate
+def WriteBr        : SchedWrite; // Branch
+def WriteBrReg     : SchedWrite; // Indirect Branch
+
+def WriteLD        : SchedWrite; // Load from base addr plus immediate offset
+def WriteST        : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP       : SchedWrite; // Store a register pair.
+def WriteAdr       : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
+
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>;
+
+// Predicate for determining when a extendedable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>;
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
+
+def WriteSys     : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint    : SchedWrite; // Hint instruction.
+
+def WriteF       : SchedWrite; // General floating-point ops.
+def WriteFCmp    : SchedWrite; // Floating-point compare.
+def WriteFCvt    : SchedWrite; // Float conversion.
+def WriteFCopy   : SchedWrite; // Float-int register copy.
+def WriteFImm    : SchedWrite; // Floating-point immediate.
+def WriteFMul    : SchedWrite; // Floating-point multiply.
+def WriteFDiv    : SchedWrite; // Floating-point division.
+
+def WriteV   : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP)
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle     : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..66a8f332513a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -0,0 +1,59 @@
+//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-selectiondag-info"
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  // Check to see if there is a specialized entry-point for memory zeroing.
+  ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+  ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+  const char *bzeroEntry =
+      (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr;
+  // For small size (< 256), it is not beneficial to use bzero
+  // instead of memset.
+  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+    const AArch64TargetLowering &TLI = *STI.getTargetLowering();
+
+    EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+    Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Dst;
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    Entry.Node = Size;
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl).setChain(Chain)
+      .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                 DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
+      .setDiscardResult();
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+  return SDValue();
+}
+bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
+    CodeGenOpt::Level OptLevel) const {
+  if (OptLevel >= CodeGenOpt::Aggressive)
+    return true;
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
new file mode 100644
index 000000000000..7e4f11091226
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+  bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
new file mode 100644
index 000000000000..fe984ccbaf1d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -0,0 +1,171 @@
+//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies floating point stores that should not be combined into
+// store pairs. Later we may do the same for floating point loads.
+// ===---------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-stp-suppress"
+
+#define STPSUPPRESS_PASS_NAME "AArch64 Store Pair Suppression"
+
+namespace {
+class AArch64StorePairSuppress : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  TargetSchedModel SchedModel;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+
+public:
+  static char ID;
+  AArch64StorePairSuppress() : MachineFunctionPass(ID) {
+    initializeAArch64StorePairSuppressPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return STPSUPPRESS_PASS_NAME; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+private:
+  bool shouldAddSTPToBlock(const MachineBasicBlock *BB);
+
+  bool isNarrowFPStore(const MachineInstr &MI);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineTraceMetrics>();
+    AU.addPreserved<MachineTraceMetrics>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64StorePairSuppress::ID = 0;
+} // anonymous
+
+INITIALIZE_PASS(AArch64StorePairSuppress, "aarch64-stp-suppress",
+                STPSUPPRESS_PASS_NAME, false, false)
+
+FunctionPass *llvm::createAArch64StorePairSuppressPass() {
+  return new AArch64StorePairSuppress();
+}
+
+/// Return true if an STP can be added to this block without increasing the
+/// critical resource height. STP is good to form in Ld/St limited blocks and
+/// bad to form in float-point limited blocks. This is true independent of the
+/// critical path. If the critical path is longer than the resource height, the
+/// extra vector ops can limit physreg renaming. Otherwise, it could simply
+/// oversaturate the vector units.
+bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB);
+  unsigned ResLength = BBTrace.getResourceLength();
+
+  // Get the machine model's scheduling class for STPQi.
+  // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
+  unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+
+  // If a subtarget does not define resources for STPQi, bail here.
+  if (SCDesc->isValid() && !SCDesc->isVariant()) {
+    unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
+    if (ResLenWithSTP > ResLength) {
+      DEBUG(dbgs() << "  Suppress STP in BB: " << BB->getNumber()
+                   << " resources " << ResLength << " -> " << ResLenWithSTP
+                   << "\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Return true if this is a floating-point store smaller than the V reg. On
+/// cyclone, these require a vector shuffle before storing a pair.
+/// Ideally we would call getMatchingPairOpcode() and have the machine model
+/// tell us if it's profitable with no cpu knowledge here.
+///
+/// FIXME: We plan to develop a decent Target abstraction for simple loads and
+/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer.
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+    return true;
+  }
+}
+
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  TRI = ST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n');
+
+  if (!SchedModel.hasInstrSchedModel()) {
+    DEBUG(dbgs() << "  Skipping pass: no machine model present.\n");
+    return false;
+  }
+
+  // Check for a sequence of stores to the same base address. We don't need to
+  // precisely determine whether a store pair can be formed. But we do want to
+  // filter out most situations where we can't form store pairs to avoid
+  // computing trace metrics in those cases.
+  for (auto &MBB : MF) {
+    bool SuppressSTP = false;
+    unsigned PrevBaseReg = 0;
+    for (auto &MI : MBB) {
+      if (!isNarrowFPStore(MI))
+        continue;
+      unsigned BaseReg;
+      int64_t Offset;
+      if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) {
+        if (PrevBaseReg == BaseReg) {
+          // If this block can take STPs, skip ahead to the next block.
+          if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent()))
+            break;
+          // Otherwise, continue unpairing the stores in this block.
+          DEBUG(dbgs() << "Unpairing store " << MI << "\n");
+          SuppressSTP = true;
+          TII->suppressLdStPair(MI);
+        }
+        PrevBaseReg = BaseReg;
+      } else
+        PrevBaseReg = 0;
+    }
+  }
+  // This pass just sets some internal MachineMemOperand flags. It can't really
+  // invalidate anything.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
new file mode 100644
index 000000000000..f58bbbd26132
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -0,0 +1,188 @@
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Subtarget.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
+                     "converter pass"), cl::init(true), cl::Hidden);
+
+// If OS supports TBI, use this flag to enable it.
+static cl::opt<bool>
+UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
+                         "an address is ignored"), cl::init(false), cl::Hidden);
+
+AArch64Subtarget &
+AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
+                                                  StringRef CPUString) {
+  // Determine default and user-specified characteristics
+
+  if (CPUString.empty())
+    CPUString = "generic";
+
+  ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
+
+  return *this;
+}
+
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case ExynosM1:
+    MaxInterleaveFactor = 4;
+    MaxJumpTableSize = 8;
+    PrefFunctionAlignment = 4;
+    PrefLoopAlignment = 3;
+    break;
+  case Falkor:
+    MaxInterleaveFactor = 4;
+    break;
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    CacheLineSize = 128;
+    PrefetchDistance = 740;
+    MinPrefetchStride = 1024;
+    MaxPrefetchIterationsAhead = 11;
+    break;
+  case Vulcan:
+    MaxInterleaveFactor = 4;
+    break;
+  case CortexA35: break;
+  case CortexA53: break;
+  case CortexA72: break;
+  case CortexA73: break;
+  case Others: break;
+  }
+}
+
+AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
+                                   const std::string &FS,
+                                   const TargetMachine &TM, bool LittleEndian)
+    : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+      IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
+      TLInfo(TM, *this), GISel() {}
+
+const CallLowering *AArch64Subtarget::getCallLowering() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getCallLowering();
+}
+
+const InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getInstructionSelector();
+}
+
+const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getLegalizerInfo();
+}
+
+const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getRegBankInfo();
+}
+
+/// Find the target operand flags that describe how a global value should be
+/// referenced for the current subtarget.
+unsigned char
+AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+                                          const TargetMachine &TM) const {
+  // MachO large model always goes via a GOT, simply to get a single 8-byte
+  // absolute relocation on all global addresses.
+  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
+    return AArch64II::MO_GOT;
+
+  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+    return AArch64II::MO_GOT;
+
+  // The small code mode's direct accesses use ADRP, which cannot necessarily
+  // produce the value 0 (if the code is above 4GB).
+  if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+    return AArch64II::MO_GOT;
+
+  return AArch64II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *AArch64Subtarget::getBZeroEntry() const {
+  // Prefer bzero on Darwin only.
+  if(isTargetDarwin())
+    return "bzero";
+
+  return nullptr;
+}
+
+void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                           unsigned NumRegionInstrs) const {
+  // LNT run (at least on Cyclone) showed reasonably significant gains for
+  // bi-directional scheduling. 253.perlbmk.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+  // Enabling or Disabling the latency heuristic is a close call: It seems to
+  // help nearly no benchmark on out-of-order architectures, on the other hand
+  // it regresses register pressure on a few benchmarking.
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
+}
+
+bool AArch64Subtarget::enableEarlyIfConversion() const {
+  return EnableEarlyIfConvert;
+}
+
+bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
+  if (!UseAddressTopByteIgnored)
+    return false;
+
+  if (TargetTriple.isiOS()) {
+    unsigned Major, Minor, Micro;
+    TargetTriple.getiOSVersion(Major, Minor, Micro);
+    return Major >= 8;
+  }
+
+  return false;
+}
+
+std::unique_ptr<PBQPRAConstraint>
+AArch64Subtarget::getCustomPBQPConstraints() const {
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
new file mode 100644
index 000000000000..73f63b8b9f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -0,0 +1,265 @@
+//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+
+#include "AArch64FrameLowering.h"
+#include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64SelectionDAGInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AArch64GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class Triple;
+
+class AArch64Subtarget final : public AArch64GenSubtargetInfo {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
+    Others,
+    CortexA35,
+    CortexA53,
+    CortexA57,
+    CortexA72,
+    CortexA73,
+    Cyclone,
+    ExynosM1,
+    Falkor,
+    Kryo,
+    Vulcan
+  };
+
+protected:
+  /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
+  ARMProcFamilyEnum ARMProcFamily = Others;
+
+  bool HasV8_1aOps = false;
+  bool HasV8_2aOps = false;
+
+  bool HasFPARMv8 = false;
+  bool HasNEON = false;
+  bool HasCrypto = false;
+  bool HasCRC = false;
+  bool HasLSE = false;
+  bool HasRAS = false;
+  bool HasPerfMon = false;
+  bool HasFullFP16 = false;
+  bool HasSPE = false;
+
+  // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+  bool HasZeroCycleRegMove = false;
+
+  // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+  bool HasZeroCycleZeroing = false;
+
+  // StrictAlign - Disallow unaligned memory accesses.
+  bool StrictAlign = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasArithmeticBccFusion = false;
+  bool HasArithmeticCbzFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
+  unsigned PrefFunctionAlignment = 0;
+  unsigned PrefLoopAlignment = 0;
+  unsigned MaxJumpTableSize = 0;
+
+  // ReserveX18 - X18 is not available as a general purpose register.
+  bool ReserveX18;
+
+  bool IsLittle;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  AArch64FrameLowering FrameLowering;
+  AArch64InstrInfo InstrInfo;
+  AArch64SelectionDAGInfo TSInfo;
+  AArch64TargetLowering TLInfo;
+  /// Gather the accessor points to GlobalISel-related APIs.
+  /// This is used to avoid ifndefs spreading around while GISel is
+  /// an optional library.
+  std::unique_ptr<GISelAccessor> GISel;
+
+private:
+  /// initializeSubtargetDependencies - Initializes using CPUString and the
+  /// passed in feature string so that we can use initializer lists for
+  /// subtarget initialization.
+  AArch64Subtarget &initializeSubtargetDependencies(StringRef FS,
+                                                    StringRef CPUString);
+
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  AArch64Subtarget(const Triple &TT, const std::string &CPU,
+                   const std::string &FS, const TargetMachine &TM,
+                   bool LittleEndian);
+
+  /// This object will take onwership of \p GISelAccessor.
+  void setGISelAccessor(GISelAccessor &GISel) {
+    this->GISel.reset(&GISel);
+  }
+
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const AArch64FrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const AArch64TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const AArch64RegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  const CallLowering *getCallLowering() const override;
+  const InstructionSelector *getInstructionSelector() const override;
+  const LegalizerInfo *getLegalizerInfo() const override;
+  const RegisterBankInfo *getRegBankInfo() const override;
+  const Triple &getTargetTriple() const { return TargetTriple; }
+  bool enableMachineScheduler() const override { return true; }
+  bool enablePostRAScheduler() const override {
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
+  }
+
+  bool hasV8_1aOps() const { return HasV8_1aOps; }
+  bool hasV8_2aOps() const { return HasV8_2aOps; }
+
+  bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
+
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+
+  bool requiresStrictAlign() const { return StrictAlign; }
+
+  bool isXRaySupported() const override { return true; }
+
+  bool isX18Reserved() const { return ReserveX18; }
+  bool hasFPARMv8() const { return HasFPARMv8; }
+  bool hasNEON() const { return HasNEON; }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+  bool hasLSE() const { return HasLSE; }
+  bool hasRAS() const { return HasRAS; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
+  bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
+  unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
+  unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; }
+
+  unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+
+  /// CPU has TBI (top byte of addresses is ignored during HW address
+  /// translation) and OS enables it.
+  bool supportsAddressTopByteIgnored() const;
+
+  bool hasPerfMon() const { return HasPerfMon; }
+  bool hasFullFP16() const { return HasFullFP16; }
+  bool hasSPE() const { return HasSPE; }
+
+  bool isLittleEndian() const { return IsLittle; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+  bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool useAA() const override { return UseAA; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return 64; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// ClassifyGlobalReference - Find the target operand flags that describe
+  /// how a global value should be referenced for the current subtarget.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           unsigned NumRegionInstrs) const override;
+
+  bool enableEarlyIfConversion() const override;
+
+  std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
new file mode 100644
index 000000000000..a3736c0868fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -0,0 +1,1018 @@
+//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the symbolic operands permitted for various kinds of
+// AArch64 system instruction.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/TableGen/SearchableTable.td"
+
+//===----------------------------------------------------------------------===//
+// AT (address translate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+         bits<3> op2> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<16> Encoding;
+  let Encoding{15-14} = op0;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+}
+
+def : AT<"S1E1R",  0b01, 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R",  0b01, 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R",  0b01, 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W",  0b01, 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W",  0b01, 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W",  0b01, 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R",  0b01, 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W",  0b01, 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
+def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
+
+
+//===----------------------------------------------------------------------===//
+// DMB/DSB (data barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DB<string name, bits<4> encoding> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<4> Encoding = encoding;
+}
+
+def : DB<"oshld", 0x1>;
+def : DB<"oshst", 0x2>;
+def : DB<"osh",   0x3>;
+def : DB<"nshld", 0x5>;
+def : DB<"nshst", 0x6>;
+def : DB<"nsh",   0x7>;
+def : DB<"ishld", 0x9>;
+def : DB<"ishst", 0xa>;
+def : DB<"ish",   0xb>;
+def : DB<"ld",    0xd>;
+def : DB<"st",    0xe>;
+def : DB<"sy",    0xf>;
+
+//===----------------------------------------------------------------------===//
+// DC (data cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+         bits<3> op2> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<16> Encoding;
+  let Encoding{15-14} = op0;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+}
+
+def : DC<"ZVA",   0b01, 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC",  0b01, 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW",   0b01, 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC",  0b01, 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW",   0b01, 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU",  0b01, 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW",  0b01, 0b000, 0b0111, 0b1110, 0b010>;
+
+//===----------------------------------------------------------------------===//
+// IC (instruction cache maintenance) instruction options.
+//===----------------------------------------------------------------------===//
+
+class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
+         bit needsreg> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<14> Encoding;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit NeedsReg = needsreg;
+}
+
+def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
+def : IC<"IALLU",   0b000, 0b0111, 0b0101, 0b000, 0>;
+def : IC<"IVAU",    0b000, 0b0111, 0b0001, 0b000, 1>;
+
+//===----------------------------------------------------------------------===//
+// ISB (instruction-fetch barrier) instruction options.
+//===----------------------------------------------------------------------===//
+
+class ISB<string name, bits<4> encoding> : SearchableTable{
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<4> Encoding;
+  let Encoding = encoding;
+}
+
+def : ISB<"sy", 0xf>;
+
+//===----------------------------------------------------------------------===//
+// PRFM (prefetch) instruction options.
+//===----------------------------------------------------------------------===//
+
+class PRFM<string name, bits<5> encoding> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<5> Encoding;
+  let Encoding = encoding;
+}
+
+def : PRFM<"pldl1keep", 0x00>;
+def : PRFM<"pldl1strm", 0x01>;
+def : PRFM<"pldl2keep", 0x02>;
+def : PRFM<"pldl2strm", 0x03>;
+def : PRFM<"pldl3keep", 0x04>;
+def : PRFM<"pldl3strm", 0x05>;
+def : PRFM<"plil1keep", 0x08>;
+def : PRFM<"plil1strm", 0x09>;
+def : PRFM<"plil2keep", 0x0a>;
+def : PRFM<"plil2strm", 0x0b>;
+def : PRFM<"plil3keep", 0x0c>;
+def : PRFM<"plil3strm", 0x0d>;
+def : PRFM<"pstl1keep", 0x10>;
+def : PRFM<"pstl1strm", 0x11>;
+def : PRFM<"pstl2keep", 0x12>;
+def : PRFM<"pstl2strm", 0x13>;
+def : PRFM<"pstl3keep", 0x14>;
+def : PRFM<"pstl3strm", 0x15>;
+
+//===----------------------------------------------------------------------===//
+// PState instruction options.
+//===----------------------------------------------------------------------===//
+
+class PState<string name, bits<5> encoding> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<5> Encoding;
+  let Encoding = encoding;
+  code Requires = [{ {} }];
+}
+
+def : PState<"SPSel",   0b00101>;
+def : PState<"DAIFSet", 0b11110>;
+def : PState<"DAIFClr", 0b11111>;
+// v8.1a "Privileged Access Never" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : PState<"PAN",     0b00100>;
+// v8.2a "User Access Override" extension-specific PStates
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : PState<"UAO",     0b00011>;
+
+
+//===----------------------------------------------------------------------===//
+// PSB instruction options.
+//===----------------------------------------------------------------------===//
+
+class PSB<string name, bits<5> encoding> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<5> Encoding;
+  let Encoding = encoding;
+}
+
+def : PSB<"csync", 0x11>;
+
+//===----------------------------------------------------------------------===//
+// TLBI (translation lookaside buffer invalidate) instruction options.
+//===----------------------------------------------------------------------===//
+
+class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+             bits<3> op2, bit needsreg = 1> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<16> Encoding;
+  let Encoding{15-14} = op0;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit NeedsReg = needsreg;
+}
+
+def : TLBI<"IPAS2E1IS",    0b01, 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS",   0b01, 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS",    0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS",      0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS",      0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS",       0b01, 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS",       0b01, 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS",       0b01, 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS",     0b01, 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS",      0b01, 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS",      0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS",      0b01, 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS",      0b01, 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS",      0b01, 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS",     0b01, 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1",      0b01, 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1",     0b01, 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1",      0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2",        0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3",        0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1",         0b01, 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2",         0b01, 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3",         0b01, 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1",       0b01, 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1",        0b01, 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1",        0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1",        0b01, 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2",        0b01, 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3",        0b01, 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1",   0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1",       0b01, 0b000, 0b1000, 0b0111, 0b111>;
+
+
+//===----------------------------------------------------------------------===//
+// MRS/MSR (system register read/write) instruction options.
+//===----------------------------------------------------------------------===//
+
+class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+             bits<3> op2> : SearchableTable {
+  let SearchableFields = ["Name", "Encoding"];
+  let EnumValueField = "Encoding";
+
+  string Name = name;
+  bits<16> Encoding;
+  let Encoding{15-14} = op0;
+  let Encoding{13-11} = op1;
+  let Encoding{10-7} = crn;
+  let Encoding{6-3} = crm;
+  let Encoding{2-0} = op2;
+  bit Readable = ?;
+  bit Writeable = ?;
+  code Requires = [{ {} }];
+}
+
+class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+    : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 1;
+  let Writeable = 1;
+}
+
+class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+    : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 1;
+  let Writeable = 0;
+}
+
+class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+               bits<3> op2>
+    : SysReg<name, op0, op1, crn, crm, op2> {
+  let Readable = 0;
+  let Writeable = 1;
+}
+
+//===----------------------
+// Read-only regs
+//===----------------------
+
+//                                    Op0    Op1     CRn     CRm    Op2
+def : ROSysReg<"MDCCSR_EL0",         0b10, 0b011, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"DBGDTRRX_EL0",       0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"MDRAR_EL1",          0b10, 0b000, 0b0001, 0b0000, 0b000>;
+def : ROSysReg<"OSLSR_EL1",          0b10, 0b000, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"DBGAUTHSTATUS_EL1",  0b10, 0b000, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"PMCEID0_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b110>;
+def : ROSysReg<"PMCEID1_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b111>;
+def : ROSysReg<"MIDR_EL1",           0b11, 0b000, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CCSIDR_EL1",         0b11, 0b001, 0b0000, 0b0000, 0b000>;
+def : ROSysReg<"CLIDR_EL1",          0b11, 0b001, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"CTR_EL0",            0b11, 0b011, 0b0000, 0b0000, 0b001>;
+def : ROSysReg<"MPIDR_EL1",          0b11, 0b000, 0b0000, 0b0000, 0b101>;
+def : ROSysReg<"REVIDR_EL1",         0b11, 0b000, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"AIDR_EL1",           0b11, 0b001, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"DCZID_EL0",          0b11, 0b011, 0b0000, 0b0000, 0b111>;
+def : ROSysReg<"ID_PFR0_EL1",        0b11, 0b000, 0b0000, 0b0001, 0b000>;
+def : ROSysReg<"ID_PFR1_EL1",        0b11, 0b000, 0b0000, 0b0001, 0b001>;
+def : ROSysReg<"ID_DFR0_EL1",        0b11, 0b000, 0b0000, 0b0001, 0b010>;
+def : ROSysReg<"ID_AFR0_EL1",        0b11, 0b000, 0b0000, 0b0001, 0b011>;
+def : ROSysReg<"ID_MMFR0_EL1",       0b11, 0b000, 0b0000, 0b0001, 0b100>;
+def : ROSysReg<"ID_MMFR1_EL1",       0b11, 0b000, 0b0000, 0b0001, 0b101>;
+def : ROSysReg<"ID_MMFR2_EL1",       0b11, 0b000, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"ID_MMFR3_EL1",       0b11, 0b000, 0b0000, 0b0001, 0b111>;
+def : ROSysReg<"ID_ISAR0_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b000>;
+def : ROSysReg<"ID_ISAR1_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b001>;
+def : ROSysReg<"ID_ISAR2_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b010>;
+def : ROSysReg<"ID_ISAR3_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b011>;
+def : ROSysReg<"ID_ISAR4_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b100>;
+def : ROSysReg<"ID_ISAR5_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b101>;
+def : ROSysReg<"ID_AA64PFR0_EL1",     0b11, 0b000, 0b0000, 0b0100, 0b000>;
+def : ROSysReg<"ID_AA64PFR1_EL1",     0b11, 0b000, 0b0000, 0b0100, 0b001>;
+def : ROSysReg<"ID_AA64DFR0_EL1",     0b11, 0b000, 0b0000, 0b0101, 0b000>;
+def : ROSysReg<"ID_AA64DFR1_EL1",     0b11, 0b000, 0b0000, 0b0101, 0b001>;
+def : ROSysReg<"ID_AA64AFR0_EL1",     0b11, 0b000, 0b0000, 0b0101, 0b100>;
+def : ROSysReg<"ID_AA64AFR1_EL1",     0b11, 0b000, 0b0000, 0b0101, 0b101>;
+def : ROSysReg<"ID_AA64ISAR0_EL1",    0b11, 0b000, 0b0000, 0b0110, 0b000>;
+def : ROSysReg<"ID_AA64ISAR1_EL1",    0b11, 0b000, 0b0000, 0b0110, 0b001>;
+def : ROSysReg<"ID_AA64MMFR0_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b000>;
+def : ROSysReg<"ID_AA64MMFR1_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b001>;
+def : ROSysReg<"ID_AA64MMFR2_EL1",    0b11, 0b000, 0b0000, 0b0111, 0b010> {
+  let Requires = [{ {AArch64::HasV8_2aOps} }];
+}
+def : ROSysReg<"MVFR0_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"MVFR1_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b001>;
+def : ROSysReg<"MVFR2_EL1",          0b11, 0b000, 0b0000, 0b0011, 0b010>;
+def : ROSysReg<"RVBAR_EL1",          0b11, 0b000, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL2",          0b11, 0b100, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"RVBAR_EL3",          0b11, 0b110, 0b1100, 0b0000, 0b001>;
+def : ROSysReg<"ISR_EL1",            0b11, 0b000, 0b1100, 0b0001, 0b000>;
+def : ROSysReg<"CNTPCT_EL0",         0b11, 0b011, 0b1110, 0b0000, 0b001>;
+def : ROSysReg<"CNTVCT_EL0",         0b11, 0b011, 0b1110, 0b0000, 0b010>;
+def : ROSysReg<"ID_MMFR4_EL1",       0b11, 0b000, 0b0000, 0b0010, 0b110>;
+
+// Trace registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : ROSysReg<"TRCSTATR",           0b10, 0b001, 0b0000, 0b0011, 0b000>;
+def : ROSysReg<"TRCIDR8",            0b10, 0b001, 0b0000, 0b0000, 0b110>;
+def : ROSysReg<"TRCIDR9",            0b10, 0b001, 0b0000, 0b0001, 0b110>;
+def : ROSysReg<"TRCIDR10",           0b10, 0b001, 0b0000, 0b0010, 0b110>;
+def : ROSysReg<"TRCIDR11",           0b10, 0b001, 0b0000, 0b0011, 0b110>;
+def : ROSysReg<"TRCIDR12",           0b10, 0b001, 0b0000, 0b0100, 0b110>;
+def : ROSysReg<"TRCIDR13",           0b10, 0b001, 0b0000, 0b0101, 0b110>;
+def : ROSysReg<"TRCIDR0",            0b10, 0b001, 0b0000, 0b1000, 0b111>;
+def : ROSysReg<"TRCIDR1",            0b10, 0b001, 0b0000, 0b1001, 0b111>;
+def : ROSysReg<"TRCIDR2",            0b10, 0b001, 0b0000, 0b1010, 0b111>;
+def : ROSysReg<"TRCIDR3",            0b10, 0b001, 0b0000, 0b1011, 0b111>;
+def : ROSysReg<"TRCIDR4",            0b10, 0b001, 0b0000, 0b1100, 0b111>;
+def : ROSysReg<"TRCIDR5",            0b10, 0b001, 0b0000, 0b1101, 0b111>;
+def : ROSysReg<"TRCIDR6",            0b10, 0b001, 0b0000, 0b1110, 0b111>;
+def : ROSysReg<"TRCIDR7",            0b10, 0b001, 0b0000, 0b1111, 0b111>;
+def : ROSysReg<"TRCOSLSR",           0b10, 0b001, 0b0001, 0b0001, 0b100>;
+def : ROSysReg<"TRCPDSR",            0b10, 0b001, 0b0001, 0b0101, 0b100>;
+def : ROSysReg<"TRCDEVAFF0",         0b10, 0b001, 0b0111, 0b1010, 0b110>;
+def : ROSysReg<"TRCDEVAFF1",         0b10, 0b001, 0b0111, 0b1011, 0b110>;
+def : ROSysReg<"TRCLSR",             0b10, 0b001, 0b0111, 0b1101, 0b110>;
+def : ROSysReg<"TRCAUTHSTATUS",      0b10, 0b001, 0b0111, 0b1110, 0b110>;
+def : ROSysReg<"TRCDEVARCH",         0b10, 0b001, 0b0111, 0b1111, 0b110>;
+def : ROSysReg<"TRCDEVID",           0b10, 0b001, 0b0111, 0b0010, 0b111>;
+def : ROSysReg<"TRCDEVTYPE",         0b10, 0b001, 0b0111, 0b0011, 0b111>;
+def : ROSysReg<"TRCPIDR4",           0b10, 0b001, 0b0111, 0b0100, 0b111>;
+def : ROSysReg<"TRCPIDR5",           0b10, 0b001, 0b0111, 0b0101, 0b111>;
+def : ROSysReg<"TRCPIDR6",           0b10, 0b001, 0b0111, 0b0110, 0b111>;
+def : ROSysReg<"TRCPIDR7",           0b10, 0b001, 0b0111, 0b0111, 0b111>;
+def : ROSysReg<"TRCPIDR0",           0b10, 0b001, 0b0111, 0b1000, 0b111>;
+def : ROSysReg<"TRCPIDR1",           0b10, 0b001, 0b0111, 0b1001, 0b111>;
+def : ROSysReg<"TRCPIDR2",           0b10, 0b001, 0b0111, 0b1010, 0b111>;
+def : ROSysReg<"TRCPIDR3",           0b10, 0b001, 0b0111, 0b1011, 0b111>;
+def : ROSysReg<"TRCCIDR0",           0b10, 0b001, 0b0111, 0b1100, 0b111>;
+def : ROSysReg<"TRCCIDR1",           0b10, 0b001, 0b0111, 0b1101, 0b111>;
+def : ROSysReg<"TRCCIDR2",           0b10, 0b001, 0b0111, 0b1110, 0b111>;
+def : ROSysReg<"TRCCIDR3",           0b10, 0b001, 0b0111, 0b1111, 0b111>;
+
+// GICv3 registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : ROSysReg<"ICC_IAR1_EL1",       0b11, 0b000, 0b1100, 0b1100, 0b000>;
+def : ROSysReg<"ICC_IAR0_EL1",       0b11, 0b000, 0b1100, 0b1000, 0b000>;
+def : ROSysReg<"ICC_HPPIR1_EL1",     0b11, 0b000, 0b1100, 0b1100, 0b010>;
+def : ROSysReg<"ICC_HPPIR0_EL1",     0b11, 0b000, 0b1100, 0b1000, 0b010>;
+def : ROSysReg<"ICC_RPR_EL1",        0b11, 0b000, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_VTR_EL2",        0b11, 0b100, 0b1100, 0b1011, 0b001>;
+def : ROSysReg<"ICH_EISR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b011>;
+def : ROSysReg<"ICH_ELSR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b101>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system register
+//                         Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : ROSysReg<"LORID_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b111>;
+
+// v8.2a "RAS extension" registers
+//                         Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>;
+def : ROSysReg<"ERXFR_EL1",  0b11, 0b000, 0b0101, 0b0100, 0b000>;
+}
+
+//===----------------------
+// Write-only regs
+//===----------------------
+
+//                                 Op0    Op1     CRn     CRm    Op2
+def : WOSysReg<"DBGDTRTX_EL0",       0b10, 0b011, 0b0000, 0b0101, 0b000>;
+def : WOSysReg<"OSLAR_EL1",          0b10, 0b000, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"PMSWINC_EL0",        0b11, 0b011, 0b1001, 0b1100, 0b100>;
+
+// Trace Registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : WOSysReg<"TRCOSLAR",           0b10, 0b001, 0b0001, 0b0000, 0b100>;
+def : WOSysReg<"TRCLAR",             0b10, 0b001, 0b0111, 0b1100, 0b110>;
+
+// GICv3 registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : WOSysReg<"ICC_EOIR1_EL1",      0b11, 0b000, 0b1100, 0b1100, 0b001>;
+def : WOSysReg<"ICC_EOIR0_EL1",      0b11, 0b000, 0b1100, 0b1000, 0b001>;
+def : WOSysReg<"ICC_DIR_EL1",        0b11, 0b000, 0b1100, 0b1011, 0b001>;
+def : WOSysReg<"ICC_SGI1R_EL1",      0b11, 0b000, 0b1100, 0b1011, 0b101>;
+def : WOSysReg<"ICC_ASGI1R_EL1",     0b11, 0b000, 0b1100, 0b1011, 0b110>;
+def : WOSysReg<"ICC_SGI0R_EL1",      0b11, 0b000, 0b1100, 0b1011, 0b111>;
+
+//===----------------------
+// Read-write regs
+//===----------------------
+
+//                                 Op0    Op1     CRn     CRm    Op2
+def : RWSysReg<"OSDTRRX_EL1",        0b10, 0b000, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"OSDTRTX_EL1",        0b10, 0b000, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TEECR32_EL1",        0b10, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"MDCCINT_EL1",        0b10, 0b000, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"MDSCR_EL1",          0b10, 0b000, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"DBGDTR_EL0",         0b10, 0b011, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"OSECCR_EL1",         0b10, 0b000, 0b0000, 0b0110, 0b010>;
+def : RWSysReg<"DBGVCR32_EL2",       0b10, 0b100, 0b0000, 0b0111, 0b000>;
+def : RWSysReg<"DBGBVR0_EL1",        0b10, 0b000, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"DBGBVR1_EL1",        0b10, 0b000, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"DBGBVR2_EL1",        0b10, 0b000, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"DBGBVR3_EL1",        0b10, 0b000, 0b0000, 0b0011, 0b100>;
+def : RWSysReg<"DBGBVR4_EL1",        0b10, 0b000, 0b0000, 0b0100, 0b100>;
+def : RWSysReg<"DBGBVR5_EL1",        0b10, 0b000, 0b0000, 0b0101, 0b100>;
+def : RWSysReg<"DBGBVR6_EL1",        0b10, 0b000, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"DBGBVR7_EL1",        0b10, 0b000, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"DBGBVR8_EL1",        0b10, 0b000, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"DBGBVR9_EL1",        0b10, 0b000, 0b0000, 0b1001, 0b100>;
+def : RWSysReg<"DBGBVR10_EL1",       0b10, 0b000, 0b0000, 0b1010, 0b100>;
+def : RWSysReg<"DBGBVR11_EL1",       0b10, 0b000, 0b0000, 0b1011, 0b100>;
+def : RWSysReg<"DBGBVR12_EL1",       0b10, 0b000, 0b0000, 0b1100, 0b100>;
+def : RWSysReg<"DBGBVR13_EL1",       0b10, 0b000, 0b0000, 0b1101, 0b100>;
+def : RWSysReg<"DBGBVR14_EL1",       0b10, 0b000, 0b0000, 0b1110, 0b100>;
+def : RWSysReg<"DBGBVR15_EL1",       0b10, 0b000, 0b0000, 0b1111, 0b100>;
+def : RWSysReg<"DBGBCR0_EL1",        0b10, 0b000, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"DBGBCR1_EL1",        0b10, 0b000, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"DBGBCR2_EL1",        0b10, 0b000, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"DBGBCR3_EL1",        0b10, 0b000, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"DBGBCR4_EL1",        0b10, 0b000, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"DBGBCR5_EL1",        0b10, 0b000, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"DBGBCR6_EL1",        0b10, 0b000, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"DBGBCR7_EL1",        0b10, 0b000, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"DBGBCR8_EL1",        0b10, 0b000, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"DBGBCR9_EL1",        0b10, 0b000, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"DBGBCR10_EL1",       0b10, 0b000, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"DBGBCR11_EL1",       0b10, 0b000, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"DBGBCR12_EL1",       0b10, 0b000, 0b0000, 0b1100, 0b101>;
+def : RWSysReg<"DBGBCR13_EL1",       0b10, 0b000, 0b0000, 0b1101, 0b101>;
+def : RWSysReg<"DBGBCR14_EL1",       0b10, 0b000, 0b0000, 0b1110, 0b101>;
+def : RWSysReg<"DBGBCR15_EL1",       0b10, 0b000, 0b0000, 0b1111, 0b101>;
+def : RWSysReg<"DBGWVR0_EL1",        0b10, 0b000, 0b0000, 0b0000, 0b110>;
+def : RWSysReg<"DBGWVR1_EL1",        0b10, 0b000, 0b0000, 0b0001, 0b110>;
+def : RWSysReg<"DBGWVR2_EL1",        0b10, 0b000, 0b0000, 0b0010, 0b110>;
+def : RWSysReg<"DBGWVR3_EL1",        0b10, 0b000, 0b0000, 0b0011, 0b110>;
+def : RWSysReg<"DBGWVR4_EL1",        0b10, 0b000, 0b0000, 0b0100, 0b110>;
+def : RWSysReg<"DBGWVR5_EL1",        0b10, 0b000, 0b0000, 0b0101, 0b110>;
+def : RWSysReg<"DBGWVR6_EL1",        0b10, 0b000, 0b0000, 0b0110, 0b110>;
+def : RWSysReg<"DBGWVR7_EL1",        0b10, 0b000, 0b0000, 0b0111, 0b110>;
+def : RWSysReg<"DBGWVR8_EL1",        0b10, 0b000, 0b0000, 0b1000, 0b110>;
+def : RWSysReg<"DBGWVR9_EL1",        0b10, 0b000, 0b0000, 0b1001, 0b110>;
+def : RWSysReg<"DBGWVR10_EL1",       0b10, 0b000, 0b0000, 0b1010, 0b110>;
+def : RWSysReg<"DBGWVR11_EL1",       0b10, 0b000, 0b0000, 0b1011, 0b110>;
+def : RWSysReg<"DBGWVR12_EL1",       0b10, 0b000, 0b0000, 0b1100, 0b110>;
+def : RWSysReg<"DBGWVR13_EL1",       0b10, 0b000, 0b0000, 0b1101, 0b110>;
+def : RWSysReg<"DBGWVR14_EL1",       0b10, 0b000, 0b0000, 0b1110, 0b110>;
+def : RWSysReg<"DBGWVR15_EL1",       0b10, 0b000, 0b0000, 0b1111, 0b110>;
+def : RWSysReg<"DBGWCR0_EL1",        0b10, 0b000, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"DBGWCR1_EL1",        0b10, 0b000, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"DBGWCR2_EL1",        0b10, 0b000, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"DBGWCR3_EL1",        0b10, 0b000, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"DBGWCR4_EL1",        0b10, 0b000, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"DBGWCR5_EL1",        0b10, 0b000, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"DBGWCR6_EL1",        0b10, 0b000, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"DBGWCR7_EL1",        0b10, 0b000, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"DBGWCR8_EL1",        0b10, 0b000, 0b0000, 0b1000, 0b111>;
+def : RWSysReg<"DBGWCR9_EL1",        0b10, 0b000, 0b0000, 0b1001, 0b111>;
+def : RWSysReg<"DBGWCR10_EL1",       0b10, 0b000, 0b0000, 0b1010, 0b111>;
+def : RWSysReg<"DBGWCR11_EL1",       0b10, 0b000, 0b0000, 0b1011, 0b111>;
+def : RWSysReg<"DBGWCR12_EL1",       0b10, 0b000, 0b0000, 0b1100, 0b111>;
+def : RWSysReg<"DBGWCR13_EL1",       0b10, 0b000, 0b0000, 0b1101, 0b111>;
+def : RWSysReg<"DBGWCR14_EL1",       0b10, 0b000, 0b0000, 0b1110, 0b111>;
+def : RWSysReg<"DBGWCR15_EL1",       0b10, 0b000, 0b0000, 0b1111, 0b111>;
+def : RWSysReg<"TEEHBR32_EL1",       0b10, 0b010, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"OSDLR_EL1",          0b10, 0b000, 0b0001, 0b0011, 0b100>;
+def : RWSysReg<"DBGPRCR_EL1",        0b10, 0b000, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"DBGCLAIMSET_EL1",    0b10, 0b000, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"DBGCLAIMCLR_EL1",    0b10, 0b000, 0b0111, 0b1001, 0b110>;
+def : RWSysReg<"CSSELR_EL1",         0b11, 0b010, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VPIDR_EL2",          0b11, 0b100, 0b0000, 0b0000, 0b000>;
+def : RWSysReg<"VMPIDR_EL2",         0b11, 0b100, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"CPACR_EL1",          0b11, 0b000, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"SCTLR_EL1",          0b11, 0b000, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL2",          0b11, 0b100, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"SCTLR_EL3",          0b11, 0b110, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"ACTLR_EL1",          0b11, 0b000, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL2",          0b11, 0b100, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"ACTLR_EL3",          0b11, 0b110, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"HCR_EL2",            0b11, 0b100, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"SCR_EL3",            0b11, 0b110, 0b0001, 0b0001, 0b000>;
+def : RWSysReg<"MDCR_EL2",           0b11, 0b100, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"SDER32_EL3",         0b11, 0b110, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"CPTR_EL2",           0b11, 0b100, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"CPTR_EL3",           0b11, 0b110, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"HSTR_EL2",           0b11, 0b100, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"HACR_EL2",           0b11, 0b100, 0b0001, 0b0001, 0b111>;
+def : RWSysReg<"MDCR_EL3",           0b11, 0b110, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TTBR0_EL1",          0b11, 0b000, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL2",          0b11, 0b100, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR0_EL3",          0b11, 0b110, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL1",          0b11, 0b000, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL1",            0b11, 0b000, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL2",            0b11, 0b100, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TCR_EL3",            0b11, 0b110, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"VTTBR_EL2",          0b11, 0b100, 0b0010, 0b0001, 0b000>;
+def : RWSysReg<"VTCR_EL2",           0b11, 0b100, 0b0010, 0b0001, 0b010>;
+def : RWSysReg<"DACR32_EL2",         0b11, 0b100, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL1",           0b11, 0b000, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL2",           0b11, 0b100, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"SPSR_EL3",           0b11, 0b110, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL1",            0b11, 0b000, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL2",            0b11, 0b100, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"ELR_EL3",            0b11, 0b110, 0b0100, 0b0000, 0b001>;
+def : RWSysReg<"SP_EL0",             0b11, 0b000, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL1",             0b11, 0b100, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SP_EL2",             0b11, 0b110, 0b0100, 0b0001, 0b000>;
+def : RWSysReg<"SPSel",              0b11, 0b000, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"NZCV",               0b11, 0b011, 0b0100, 0b0010, 0b000>;
+def : RWSysReg<"DAIF",               0b11, 0b011, 0b0100, 0b0010, 0b001>;
+def : RWSysReg<"CurrentEL",          0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : RWSysReg<"SPSR_irq",           0b11, 0b100, 0b0100, 0b0011, 0b000>;
+def : RWSysReg<"SPSR_abt",           0b11, 0b100, 0b0100, 0b0011, 0b001>;
+def : RWSysReg<"SPSR_und",           0b11, 0b100, 0b0100, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_fiq",           0b11, 0b100, 0b0100, 0b0011, 0b011>;
+def : RWSysReg<"FPCR",               0b11, 0b011, 0b0100, 0b0100, 0b000>;
+def : RWSysReg<"FPSR",               0b11, 0b011, 0b0100, 0b0100, 0b001>;
+def : RWSysReg<"DSPSR_EL0",          0b11, 0b011, 0b0100, 0b0101, 0b000>;
+def : RWSysReg<"DLR_EL0",            0b11, 0b011, 0b0100, 0b0101, 0b001>;
+def : RWSysReg<"IFSR32_EL2",         0b11, 0b100, 0b0101, 0b0000, 0b001>;
+def : RWSysReg<"AFSR0_EL1",          0b11, 0b000, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL2",          0b11, 0b100, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR0_EL3",          0b11, 0b110, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL1",          0b11, 0b000, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL2",          0b11, 0b100, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"AFSR1_EL3",          0b11, 0b110, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL1",            0b11, 0b000, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL2",            0b11, 0b100, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"ESR_EL3",            0b11, 0b110, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FPEXC32_EL2",        0b11, 0b100, 0b0101, 0b0011, 0b000>;
+def : RWSysReg<"FAR_EL1",            0b11, 0b000, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL2",            0b11, 0b100, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"FAR_EL3",            0b11, 0b110, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"HPFAR_EL2",          0b11, 0b100, 0b0110, 0b0000, 0b100>;
+def : RWSysReg<"PAR_EL1",            0b11, 0b000, 0b0111, 0b0100, 0b000>;
+def : RWSysReg<"PMCR_EL0",           0b11, 0b011, 0b1001, 0b1100, 0b000>;
+def : RWSysReg<"PMCNTENSET_EL0",     0b11, 0b011, 0b1001, 0b1100, 0b001>;
+def : RWSysReg<"PMCNTENCLR_EL0",     0b11, 0b011, 0b1001, 0b1100, 0b010>;
+def : RWSysReg<"PMOVSCLR_EL0",       0b11, 0b011, 0b1001, 0b1100, 0b011>;
+def : RWSysReg<"PMSELR_EL0",         0b11, 0b011, 0b1001, 0b1100, 0b101>;
+def : RWSysReg<"PMCCNTR_EL0",        0b11, 0b011, 0b1001, 0b1101, 0b000>;
+def : RWSysReg<"PMXEVTYPER_EL0",     0b11, 0b011, 0b1001, 0b1101, 0b001>;
+def : RWSysReg<"PMXEVCNTR_EL0",      0b11, 0b011, 0b1001, 0b1101, 0b010>;
+def : RWSysReg<"PMUSERENR_EL0",      0b11, 0b011, 0b1001, 0b1110, 0b000>;
+def : RWSysReg<"PMINTENSET_EL1",     0b11, 0b000, 0b1001, 0b1110, 0b001>;
+def : RWSysReg<"PMINTENCLR_EL1",     0b11, 0b000, 0b1001, 0b1110, 0b010>;
+def : RWSysReg<"PMOVSSET_EL0",       0b11, 0b011, 0b1001, 0b1110, 0b011>;
+def : RWSysReg<"MAIR_EL1",           0b11, 0b000, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL2",           0b11, 0b100, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"MAIR_EL3",           0b11, 0b110, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL1",          0b11, 0b000, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL2",          0b11, 0b100, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"AMAIR_EL3",          0b11, 0b110, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL1",           0b11, 0b000, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL2",           0b11, 0b100, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"VBAR_EL3",           0b11, 0b110, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"RMR_EL1",            0b11, 0b000, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL2",            0b11, 0b100, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"RMR_EL3",            0b11, 0b110, 0b1100, 0b0000, 0b010>;
+def : RWSysReg<"CONTEXTIDR_EL1",     0b11, 0b000, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"TPIDR_EL0",          0b11, 0b011, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL2",          0b11, 0b100, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDR_EL3",          0b11, 0b110, 0b1101, 0b0000, 0b010>;
+def : RWSysReg<"TPIDRRO_EL0",        0b11, 0b011, 0b1101, 0b0000, 0b011>;
+def : RWSysReg<"TPIDR_EL1",          0b11, 0b000, 0b1101, 0b0000, 0b100>;
+def : RWSysReg<"CNTFRQ_EL0",         0b11, 0b011, 0b1110, 0b0000, 0b000>;
+def : RWSysReg<"CNTVOFF_EL2",        0b11, 0b100, 0b1110, 0b0000, 0b011>;
+def : RWSysReg<"CNTKCTL_EL1",        0b11, 0b000, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTHCTL_EL2",        0b11, 0b100, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL0",      0b11, 0b011, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTHP_TVAL_EL2",     0b11, 0b100, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTPS_TVAL_EL1",     0b11, 0b111, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL0",       0b11, 0b011, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTHP_CTL_EL2",      0b11, 0b100, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTPS_CTL_EL1",      0b11, 0b111, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL0",      0b11, 0b011, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTHP_CVAL_EL2",     0b11, 0b100, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTPS_CVAL_EL1",     0b11, 0b111, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL0",      0b11, 0b011, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL0",       0b11, 0b011, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL0",      0b11, 0b011, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"PMEVCNTR0_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b000>;
+def : RWSysReg<"PMEVCNTR1_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b001>;
+def : RWSysReg<"PMEVCNTR2_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b010>;
+def : RWSysReg<"PMEVCNTR3_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b011>;
+def : RWSysReg<"PMEVCNTR4_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b100>;
+def : RWSysReg<"PMEVCNTR5_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b101>;
+def : RWSysReg<"PMEVCNTR6_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b110>;
+def : RWSysReg<"PMEVCNTR7_EL0",      0b11, 0b011, 0b1110, 0b1000, 0b111>;
+def : RWSysReg<"PMEVCNTR8_EL0",      0b11, 0b011, 0b1110, 0b1001, 0b000>;
+def : RWSysReg<"PMEVCNTR9_EL0",      0b11, 0b011, 0b1110, 0b1001, 0b001>;
+def : RWSysReg<"PMEVCNTR10_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b010>;
+def : RWSysReg<"PMEVCNTR11_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b011>;
+def : RWSysReg<"PMEVCNTR12_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b100>;
+def : RWSysReg<"PMEVCNTR13_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b101>;
+def : RWSysReg<"PMEVCNTR14_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b110>;
+def : RWSysReg<"PMEVCNTR15_EL0",     0b11, 0b011, 0b1110, 0b1001, 0b111>;
+def : RWSysReg<"PMEVCNTR16_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b000>;
+def : RWSysReg<"PMEVCNTR17_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b001>;
+def : RWSysReg<"PMEVCNTR18_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b010>;
+def : RWSysReg<"PMEVCNTR19_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b011>;
+def : RWSysReg<"PMEVCNTR20_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b100>;
+def : RWSysReg<"PMEVCNTR21_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b101>;
+def : RWSysReg<"PMEVCNTR22_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b110>;
+def : RWSysReg<"PMEVCNTR23_EL0",     0b11, 0b011, 0b1110, 0b1010, 0b111>;
+def : RWSysReg<"PMEVCNTR24_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b000>;
+def : RWSysReg<"PMEVCNTR25_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b001>;
+def : RWSysReg<"PMEVCNTR26_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b010>;
+def : RWSysReg<"PMEVCNTR27_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b011>;
+def : RWSysReg<"PMEVCNTR28_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b100>;
+def : RWSysReg<"PMEVCNTR29_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b101>;
+def : RWSysReg<"PMEVCNTR30_EL0",     0b11, 0b011, 0b1110, 0b1011, 0b110>;
+def : RWSysReg<"PMCCFILTR_EL0",      0b11, 0b011, 0b1110, 0b1111, 0b111>;
+def : RWSysReg<"PMEVTYPER0_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b000>;
+def : RWSysReg<"PMEVTYPER1_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b001>;
+def : RWSysReg<"PMEVTYPER2_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b010>;
+def : RWSysReg<"PMEVTYPER3_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b011>;
+def : RWSysReg<"PMEVTYPER4_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b100>;
+def : RWSysReg<"PMEVTYPER5_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b101>;
+def : RWSysReg<"PMEVTYPER6_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b110>;
+def : RWSysReg<"PMEVTYPER7_EL0",     0b11, 0b011, 0b1110, 0b1100, 0b111>;
+def : RWSysReg<"PMEVTYPER8_EL0",     0b11, 0b011, 0b1110, 0b1101, 0b000>;
+def : RWSysReg<"PMEVTYPER9_EL0",     0b11, 0b011, 0b1110, 0b1101, 0b001>;
+def : RWSysReg<"PMEVTYPER10_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b010>;
+def : RWSysReg<"PMEVTYPER11_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b011>;
+def : RWSysReg<"PMEVTYPER12_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b100>;
+def : RWSysReg<"PMEVTYPER13_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b101>;
+def : RWSysReg<"PMEVTYPER14_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b110>;
+def : RWSysReg<"PMEVTYPER15_EL0",    0b11, 0b011, 0b1110, 0b1101, 0b111>;
+def : RWSysReg<"PMEVTYPER16_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b000>;
+def : RWSysReg<"PMEVTYPER17_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b001>;
+def : RWSysReg<"PMEVTYPER18_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b010>;
+def : RWSysReg<"PMEVTYPER19_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b011>;
+def : RWSysReg<"PMEVTYPER20_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b100>;
+def : RWSysReg<"PMEVTYPER21_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b101>;
+def : RWSysReg<"PMEVTYPER22_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b110>;
+def : RWSysReg<"PMEVTYPER23_EL0",    0b11, 0b011, 0b1110, 0b1110, 0b111>;
+def : RWSysReg<"PMEVTYPER24_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b000>;
+def : RWSysReg<"PMEVTYPER25_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b001>;
+def : RWSysReg<"PMEVTYPER26_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b010>;
+def : RWSysReg<"PMEVTYPER27_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b011>;
+def : RWSysReg<"PMEVTYPER28_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b100>;
+def : RWSysReg<"PMEVTYPER29_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b101>;
+def : RWSysReg<"PMEVTYPER30_EL0",    0b11, 0b011, 0b1110, 0b1111, 0b110>;
+
+// Trace registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : RWSysReg<"TRCPRGCTLR",         0b10, 0b001, 0b0000, 0b0001, 0b000>;
+def : RWSysReg<"TRCPROCSELR",        0b10, 0b001, 0b0000, 0b0010, 0b000>;
+def : RWSysReg<"TRCCONFIGR",         0b10, 0b001, 0b0000, 0b0100, 0b000>;
+def : RWSysReg<"TRCAUXCTLR",         0b10, 0b001, 0b0000, 0b0110, 0b000>;
+def : RWSysReg<"TRCEVENTCTL0R",      0b10, 0b001, 0b0000, 0b1000, 0b000>;
+def : RWSysReg<"TRCEVENTCTL1R",      0b10, 0b001, 0b0000, 0b1001, 0b000>;
+def : RWSysReg<"TRCSTALLCTLR",       0b10, 0b001, 0b0000, 0b1011, 0b000>;
+def : RWSysReg<"TRCTSCTLR",          0b10, 0b001, 0b0000, 0b1100, 0b000>;
+def : RWSysReg<"TRCSYNCPR",          0b10, 0b001, 0b0000, 0b1101, 0b000>;
+def : RWSysReg<"TRCCCCTLR",          0b10, 0b001, 0b0000, 0b1110, 0b000>;
+def : RWSysReg<"TRCBBCTLR",          0b10, 0b001, 0b0000, 0b1111, 0b000>;
+def : RWSysReg<"TRCTRACEIDR",        0b10, 0b001, 0b0000, 0b0000, 0b001>;
+def : RWSysReg<"TRCQCTLR",           0b10, 0b001, 0b0000, 0b0001, 0b001>;
+def : RWSysReg<"TRCVICTLR",          0b10, 0b001, 0b0000, 0b0000, 0b010>;
+def : RWSysReg<"TRCVIIECTLR",        0b10, 0b001, 0b0000, 0b0001, 0b010>;
+def : RWSysReg<"TRCVISSCTLR",        0b10, 0b001, 0b0000, 0b0010, 0b010>;
+def : RWSysReg<"TRCVIPCSSCTLR",      0b10, 0b001, 0b0000, 0b0011, 0b010>;
+def : RWSysReg<"TRCVDCTLR",          0b10, 0b001, 0b0000, 0b1000, 0b010>;
+def : RWSysReg<"TRCVDSACCTLR",       0b10, 0b001, 0b0000, 0b1001, 0b010>;
+def : RWSysReg<"TRCVDARCCTLR",       0b10, 0b001, 0b0000, 0b1010, 0b010>;
+def : RWSysReg<"TRCSEQEVR0",         0b10, 0b001, 0b0000, 0b0000, 0b100>;
+def : RWSysReg<"TRCSEQEVR1",         0b10, 0b001, 0b0000, 0b0001, 0b100>;
+def : RWSysReg<"TRCSEQEVR2",         0b10, 0b001, 0b0000, 0b0010, 0b100>;
+def : RWSysReg<"TRCSEQRSTEVR",       0b10, 0b001, 0b0000, 0b0110, 0b100>;
+def : RWSysReg<"TRCSEQSTR",          0b10, 0b001, 0b0000, 0b0111, 0b100>;
+def : RWSysReg<"TRCEXTINSELR",       0b10, 0b001, 0b0000, 0b1000, 0b100>;
+def : RWSysReg<"TRCCNTRLDVR0",       0b10, 0b001, 0b0000, 0b0000, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR1",       0b10, 0b001, 0b0000, 0b0001, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR2",       0b10, 0b001, 0b0000, 0b0010, 0b101>;
+def : RWSysReg<"TRCCNTRLDVR3",       0b10, 0b001, 0b0000, 0b0011, 0b101>;
+def : RWSysReg<"TRCCNTCTLR0",        0b10, 0b001, 0b0000, 0b0100, 0b101>;
+def : RWSysReg<"TRCCNTCTLR1",        0b10, 0b001, 0b0000, 0b0101, 0b101>;
+def : RWSysReg<"TRCCNTCTLR2",        0b10, 0b001, 0b0000, 0b0110, 0b101>;
+def : RWSysReg<"TRCCNTCTLR3",        0b10, 0b001, 0b0000, 0b0111, 0b101>;
+def : RWSysReg<"TRCCNTVR0",          0b10, 0b001, 0b0000, 0b1000, 0b101>;
+def : RWSysReg<"TRCCNTVR1",          0b10, 0b001, 0b0000, 0b1001, 0b101>;
+def : RWSysReg<"TRCCNTVR2",          0b10, 0b001, 0b0000, 0b1010, 0b101>;
+def : RWSysReg<"TRCCNTVR3",          0b10, 0b001, 0b0000, 0b1011, 0b101>;
+def : RWSysReg<"TRCIMSPEC0",         0b10, 0b001, 0b0000, 0b0000, 0b111>;
+def : RWSysReg<"TRCIMSPEC1",         0b10, 0b001, 0b0000, 0b0001, 0b111>;
+def : RWSysReg<"TRCIMSPEC2",         0b10, 0b001, 0b0000, 0b0010, 0b111>;
+def : RWSysReg<"TRCIMSPEC3",         0b10, 0b001, 0b0000, 0b0011, 0b111>;
+def : RWSysReg<"TRCIMSPEC4",         0b10, 0b001, 0b0000, 0b0100, 0b111>;
+def : RWSysReg<"TRCIMSPEC5",         0b10, 0b001, 0b0000, 0b0101, 0b111>;
+def : RWSysReg<"TRCIMSPEC6",         0b10, 0b001, 0b0000, 0b0110, 0b111>;
+def : RWSysReg<"TRCIMSPEC7",         0b10, 0b001, 0b0000, 0b0111, 0b111>;
+def : RWSysReg<"TRCRSCTLR2",         0b10, 0b001, 0b0001, 0b0010, 0b000>;
+def : RWSysReg<"TRCRSCTLR3",         0b10, 0b001, 0b0001, 0b0011, 0b000>;
+def : RWSysReg<"TRCRSCTLR4",         0b10, 0b001, 0b0001, 0b0100, 0b000>;
+def : RWSysReg<"TRCRSCTLR5",         0b10, 0b001, 0b0001, 0b0101, 0b000>;
+def : RWSysReg<"TRCRSCTLR6",         0b10, 0b001, 0b0001, 0b0110, 0b000>;
+def : RWSysReg<"TRCRSCTLR7",         0b10, 0b001, 0b0001, 0b0111, 0b000>;
+def : RWSysReg<"TRCRSCTLR8",         0b10, 0b001, 0b0001, 0b1000, 0b000>;
+def : RWSysReg<"TRCRSCTLR9",         0b10, 0b001, 0b0001, 0b1001, 0b000>;
+def : RWSysReg<"TRCRSCTLR10",        0b10, 0b001, 0b0001, 0b1010, 0b000>;
+def : RWSysReg<"TRCRSCTLR11",        0b10, 0b001, 0b0001, 0b1011, 0b000>;
+def : RWSysReg<"TRCRSCTLR12",        0b10, 0b001, 0b0001, 0b1100, 0b000>;
+def : RWSysReg<"TRCRSCTLR13",        0b10, 0b001, 0b0001, 0b1101, 0b000>;
+def : RWSysReg<"TRCRSCTLR14",        0b10, 0b001, 0b0001, 0b1110, 0b000>;
+def : RWSysReg<"TRCRSCTLR15",        0b10, 0b001, 0b0001, 0b1111, 0b000>;
+def : RWSysReg<"TRCRSCTLR16",        0b10, 0b001, 0b0001, 0b0000, 0b001>;
+def : RWSysReg<"TRCRSCTLR17",        0b10, 0b001, 0b0001, 0b0001, 0b001>;
+def : RWSysReg<"TRCRSCTLR18",        0b10, 0b001, 0b0001, 0b0010, 0b001>;
+def : RWSysReg<"TRCRSCTLR19",        0b10, 0b001, 0b0001, 0b0011, 0b001>;
+def : RWSysReg<"TRCRSCTLR20",        0b10, 0b001, 0b0001, 0b0100, 0b001>;
+def : RWSysReg<"TRCRSCTLR21",        0b10, 0b001, 0b0001, 0b0101, 0b001>;
+def : RWSysReg<"TRCRSCTLR22",        0b10, 0b001, 0b0001, 0b0110, 0b001>;
+def : RWSysReg<"TRCRSCTLR23",        0b10, 0b001, 0b0001, 0b0111, 0b001>;
+def : RWSysReg<"TRCRSCTLR24",        0b10, 0b001, 0b0001, 0b1000, 0b001>;
+def : RWSysReg<"TRCRSCTLR25",        0b10, 0b001, 0b0001, 0b1001, 0b001>;
+def : RWSysReg<"TRCRSCTLR26",        0b10, 0b001, 0b0001, 0b1010, 0b001>;
+def : RWSysReg<"TRCRSCTLR27",        0b10, 0b001, 0b0001, 0b1011, 0b001>;
+def : RWSysReg<"TRCRSCTLR28",        0b10, 0b001, 0b0001, 0b1100, 0b001>;
+def : RWSysReg<"TRCRSCTLR29",        0b10, 0b001, 0b0001, 0b1101, 0b001>;
+def : RWSysReg<"TRCRSCTLR30",        0b10, 0b001, 0b0001, 0b1110, 0b001>;
+def : RWSysReg<"TRCRSCTLR31",        0b10, 0b001, 0b0001, 0b1111, 0b001>;
+def : RWSysReg<"TRCSSCCR0",          0b10, 0b001, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TRCSSCCR1",          0b10, 0b001, 0b0001, 0b0001, 0b010>;
+def : RWSysReg<"TRCSSCCR2",          0b10, 0b001, 0b0001, 0b0010, 0b010>;
+def : RWSysReg<"TRCSSCCR3",          0b10, 0b001, 0b0001, 0b0011, 0b010>;
+def : RWSysReg<"TRCSSCCR4",          0b10, 0b001, 0b0001, 0b0100, 0b010>;
+def : RWSysReg<"TRCSSCCR5",          0b10, 0b001, 0b0001, 0b0101, 0b010>;
+def : RWSysReg<"TRCSSCCR6",          0b10, 0b001, 0b0001, 0b0110, 0b010>;
+def : RWSysReg<"TRCSSCCR7",          0b10, 0b001, 0b0001, 0b0111, 0b010>;
+def : RWSysReg<"TRCSSCSR0",          0b10, 0b001, 0b0001, 0b1000, 0b010>;
+def : RWSysReg<"TRCSSCSR1",          0b10, 0b001, 0b0001, 0b1001, 0b010>;
+def : RWSysReg<"TRCSSCSR2",          0b10, 0b001, 0b0001, 0b1010, 0b010>;
+def : RWSysReg<"TRCSSCSR3",          0b10, 0b001, 0b0001, 0b1011, 0b010>;
+def : RWSysReg<"TRCSSCSR4",          0b10, 0b001, 0b0001, 0b1100, 0b010>;
+def : RWSysReg<"TRCSSCSR5",          0b10, 0b001, 0b0001, 0b1101, 0b010>;
+def : RWSysReg<"TRCSSCSR6",          0b10, 0b001, 0b0001, 0b1110, 0b010>;
+def : RWSysReg<"TRCSSCSR7",          0b10, 0b001, 0b0001, 0b1111, 0b010>;
+def : RWSysReg<"TRCSSPCICR0",        0b10, 0b001, 0b0001, 0b0000, 0b011>;
+def : RWSysReg<"TRCSSPCICR1",        0b10, 0b001, 0b0001, 0b0001, 0b011>;
+def : RWSysReg<"TRCSSPCICR2",        0b10, 0b001, 0b0001, 0b0010, 0b011>;
+def : RWSysReg<"TRCSSPCICR3",        0b10, 0b001, 0b0001, 0b0011, 0b011>;
+def : RWSysReg<"TRCSSPCICR4",        0b10, 0b001, 0b0001, 0b0100, 0b011>;
+def : RWSysReg<"TRCSSPCICR5",        0b10, 0b001, 0b0001, 0b0101, 0b011>;
+def : RWSysReg<"TRCSSPCICR6",        0b10, 0b001, 0b0001, 0b0110, 0b011>;
+def : RWSysReg<"TRCSSPCICR7",        0b10, 0b001, 0b0001, 0b0111, 0b011>;
+def : RWSysReg<"TRCPDCR",            0b10, 0b001, 0b0001, 0b0100, 0b100>;
+def : RWSysReg<"TRCACVR0",           0b10, 0b001, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TRCACVR1",           0b10, 0b001, 0b0010, 0b0010, 0b000>;
+def : RWSysReg<"TRCACVR2",           0b10, 0b001, 0b0010, 0b0100, 0b000>;
+def : RWSysReg<"TRCACVR3",           0b10, 0b001, 0b0010, 0b0110, 0b000>;
+def : RWSysReg<"TRCACVR4",           0b10, 0b001, 0b0010, 0b1000, 0b000>;
+def : RWSysReg<"TRCACVR5",           0b10, 0b001, 0b0010, 0b1010, 0b000>;
+def : RWSysReg<"TRCACVR6",           0b10, 0b001, 0b0010, 0b1100, 0b000>;
+def : RWSysReg<"TRCACVR7",           0b10, 0b001, 0b0010, 0b1110, 0b000>;
+def : RWSysReg<"TRCACVR8",           0b10, 0b001, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TRCACVR9",           0b10, 0b001, 0b0010, 0b0010, 0b001>;
+def : RWSysReg<"TRCACVR10",          0b10, 0b001, 0b0010, 0b0100, 0b001>;
+def : RWSysReg<"TRCACVR11",          0b10, 0b001, 0b0010, 0b0110, 0b001>;
+def : RWSysReg<"TRCACVR12",          0b10, 0b001, 0b0010, 0b1000, 0b001>;
+def : RWSysReg<"TRCACVR13",          0b10, 0b001, 0b0010, 0b1010, 0b001>;
+def : RWSysReg<"TRCACVR14",          0b10, 0b001, 0b0010, 0b1100, 0b001>;
+def : RWSysReg<"TRCACVR15",          0b10, 0b001, 0b0010, 0b1110, 0b001>;
+def : RWSysReg<"TRCACATR0",          0b10, 0b001, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"TRCACATR1",          0b10, 0b001, 0b0010, 0b0010, 0b010>;
+def : RWSysReg<"TRCACATR2",          0b10, 0b001, 0b0010, 0b0100, 0b010>;
+def : RWSysReg<"TRCACATR3",          0b10, 0b001, 0b0010, 0b0110, 0b010>;
+def : RWSysReg<"TRCACATR4",          0b10, 0b001, 0b0010, 0b1000, 0b010>;
+def : RWSysReg<"TRCACATR5",          0b10, 0b001, 0b0010, 0b1010, 0b010>;
+def : RWSysReg<"TRCACATR6",          0b10, 0b001, 0b0010, 0b1100, 0b010>;
+def : RWSysReg<"TRCACATR7",          0b10, 0b001, 0b0010, 0b1110, 0b010>;
+def : RWSysReg<"TRCACATR8",          0b10, 0b001, 0b0010, 0b0000, 0b011>;
+def : RWSysReg<"TRCACATR9",          0b10, 0b001, 0b0010, 0b0010, 0b011>;
+def : RWSysReg<"TRCACATR10",         0b10, 0b001, 0b0010, 0b0100, 0b011>;
+def : RWSysReg<"TRCACATR11",         0b10, 0b001, 0b0010, 0b0110, 0b011>;
+def : RWSysReg<"TRCACATR12",         0b10, 0b001, 0b0010, 0b1000, 0b011>;
+def : RWSysReg<"TRCACATR13",         0b10, 0b001, 0b0010, 0b1010, 0b011>;
+def : RWSysReg<"TRCACATR14",         0b10, 0b001, 0b0010, 0b1100, 0b011>;
+def : RWSysReg<"TRCACATR15",         0b10, 0b001, 0b0010, 0b1110, 0b011>;
+def : RWSysReg<"TRCDVCVR0",          0b10, 0b001, 0b0010, 0b0000, 0b100>;
+def : RWSysReg<"TRCDVCVR1",          0b10, 0b001, 0b0010, 0b0100, 0b100>;
+def : RWSysReg<"TRCDVCVR2",          0b10, 0b001, 0b0010, 0b1000, 0b100>;
+def : RWSysReg<"TRCDVCVR3",          0b10, 0b001, 0b0010, 0b1100, 0b100>;
+def : RWSysReg<"TRCDVCVR4",          0b10, 0b001, 0b0010, 0b0000, 0b101>;
+def : RWSysReg<"TRCDVCVR5",          0b10, 0b001, 0b0010, 0b0100, 0b101>;
+def : RWSysReg<"TRCDVCVR6",          0b10, 0b001, 0b0010, 0b1000, 0b101>;
+def : RWSysReg<"TRCDVCVR7",          0b10, 0b001, 0b0010, 0b1100, 0b101>;
+def : RWSysReg<"TRCDVCMR0",          0b10, 0b001, 0b0010, 0b0000, 0b110>;
+def : RWSysReg<"TRCDVCMR1",          0b10, 0b001, 0b0010, 0b0100, 0b110>;
+def : RWSysReg<"TRCDVCMR2",          0b10, 0b001, 0b0010, 0b1000, 0b110>;
+def : RWSysReg<"TRCDVCMR3",          0b10, 0b001, 0b0010, 0b1100, 0b110>;
+def : RWSysReg<"TRCDVCMR4",          0b10, 0b001, 0b0010, 0b0000, 0b111>;
+def : RWSysReg<"TRCDVCMR5",          0b10, 0b001, 0b0010, 0b0100, 0b111>;
+def : RWSysReg<"TRCDVCMR6",          0b10, 0b001, 0b0010, 0b1000, 0b111>;
+def : RWSysReg<"TRCDVCMR7",          0b10, 0b001, 0b0010, 0b1100, 0b111>;
+def : RWSysReg<"TRCCIDCVR0",         0b10, 0b001, 0b0011, 0b0000, 0b000>;
+def : RWSysReg<"TRCCIDCVR1",         0b10, 0b001, 0b0011, 0b0010, 0b000>;
+def : RWSysReg<"TRCCIDCVR2",         0b10, 0b001, 0b0011, 0b0100, 0b000>;
+def : RWSysReg<"TRCCIDCVR3",         0b10, 0b001, 0b0011, 0b0110, 0b000>;
+def : RWSysReg<"TRCCIDCVR4",         0b10, 0b001, 0b0011, 0b1000, 0b000>;
+def : RWSysReg<"TRCCIDCVR5",         0b10, 0b001, 0b0011, 0b1010, 0b000>;
+def : RWSysReg<"TRCCIDCVR6",         0b10, 0b001, 0b0011, 0b1100, 0b000>;
+def : RWSysReg<"TRCCIDCVR7",         0b10, 0b001, 0b0011, 0b1110, 0b000>;
+def : RWSysReg<"TRCVMIDCVR0",        0b10, 0b001, 0b0011, 0b0000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR1",        0b10, 0b001, 0b0011, 0b0010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR2",        0b10, 0b001, 0b0011, 0b0100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR3",        0b10, 0b001, 0b0011, 0b0110, 0b001>;
+def : RWSysReg<"TRCVMIDCVR4",        0b10, 0b001, 0b0011, 0b1000, 0b001>;
+def : RWSysReg<"TRCVMIDCVR5",        0b10, 0b001, 0b0011, 0b1010, 0b001>;
+def : RWSysReg<"TRCVMIDCVR6",        0b10, 0b001, 0b0011, 0b1100, 0b001>;
+def : RWSysReg<"TRCVMIDCVR7",        0b10, 0b001, 0b0011, 0b1110, 0b001>;
+def : RWSysReg<"TRCCIDCCTLR0",       0b10, 0b001, 0b0011, 0b0000, 0b010>;
+def : RWSysReg<"TRCCIDCCTLR1",       0b10, 0b001, 0b0011, 0b0001, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR0",      0b10, 0b001, 0b0011, 0b0010, 0b010>;
+def : RWSysReg<"TRCVMIDCCTLR1",      0b10, 0b001, 0b0011, 0b0011, 0b010>;
+def : RWSysReg<"TRCITCTRL",          0b10, 0b001, 0b0111, 0b0000, 0b100>;
+def : RWSysReg<"TRCCLAIMSET",        0b10, 0b001, 0b0111, 0b1000, 0b110>;
+def : RWSysReg<"TRCCLAIMCLR",        0b10, 0b001, 0b0111, 0b1001, 0b110>;
+
+// GICv3 registers
+//                                 Op0    Op1     CRn     CRm    Op2
+def : RWSysReg<"ICC_BPR1_EL1",       0b11, 0b000, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICC_BPR0_EL1",       0b11, 0b000, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICC_PMR_EL1",        0b11, 0b000, 0b0100, 0b0110, 0b000>;
+def : RWSysReg<"ICC_CTLR_EL1",       0b11, 0b000, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_CTLR_EL3",       0b11, 0b110, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICC_SRE_EL1",        0b11, 0b000, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_SRE_EL2",        0b11, 0b100, 0b1100, 0b1001, 0b101>;
+def : RWSysReg<"ICC_SRE_EL3",        0b11, 0b110, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICC_IGRPEN0_EL1",    0b11, 0b000, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICC_IGRPEN1_EL1",    0b11, 0b000, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_IGRPEN1_EL3",    0b11, 0b110, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICC_SEIEN_EL1",      0b11, 0b000, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICC_AP0R0_EL1",      0b11, 0b000, 0b1100, 0b1000, 0b100>;
+def : RWSysReg<"ICC_AP0R1_EL1",      0b11, 0b000, 0b1100, 0b1000, 0b101>;
+def : RWSysReg<"ICC_AP0R2_EL1",      0b11, 0b000, 0b1100, 0b1000, 0b110>;
+def : RWSysReg<"ICC_AP0R3_EL1",      0b11, 0b000, 0b1100, 0b1000, 0b111>;
+def : RWSysReg<"ICC_AP1R0_EL1",      0b11, 0b000, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICC_AP1R1_EL1",      0b11, 0b000, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICC_AP1R2_EL1",      0b11, 0b000, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICC_AP1R3_EL1",      0b11, 0b000, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_AP0R0_EL2",      0b11, 0b100, 0b1100, 0b1000, 0b000>;
+def : RWSysReg<"ICH_AP0R1_EL2",      0b11, 0b100, 0b1100, 0b1000, 0b001>;
+def : RWSysReg<"ICH_AP0R2_EL2",      0b11, 0b100, 0b1100, 0b1000, 0b010>;
+def : RWSysReg<"ICH_AP0R3_EL2",      0b11, 0b100, 0b1100, 0b1000, 0b011>;
+def : RWSysReg<"ICH_AP1R0_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b000>;
+def : RWSysReg<"ICH_AP1R1_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b001>;
+def : RWSysReg<"ICH_AP1R2_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b010>;
+def : RWSysReg<"ICH_AP1R3_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b011>;
+def : RWSysReg<"ICH_HCR_EL2",        0b11, 0b100, 0b1100, 0b1011, 0b000>;
+def : RWSysReg<"ICH_MISR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : RWSysReg<"ICH_VMCR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b111>;
+def : RWSysReg<"ICH_VSEIR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b100>;
+def : RWSysReg<"ICH_LR0_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b000>;
+def : RWSysReg<"ICH_LR1_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b001>;
+def : RWSysReg<"ICH_LR2_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b010>;
+def : RWSysReg<"ICH_LR3_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b011>;
+def : RWSysReg<"ICH_LR4_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b100>;
+def : RWSysReg<"ICH_LR5_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b101>;
+def : RWSysReg<"ICH_LR6_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b110>;
+def : RWSysReg<"ICH_LR7_EL2",        0b11, 0b100, 0b1100, 0b1100, 0b111>;
+def : RWSysReg<"ICH_LR8_EL2",        0b11, 0b100, 0b1100, 0b1101, 0b000>;
+def : RWSysReg<"ICH_LR9_EL2",        0b11, 0b100, 0b1100, 0b1101, 0b001>;
+def : RWSysReg<"ICH_LR10_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b010>;
+def : RWSysReg<"ICH_LR11_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b011>;
+def : RWSysReg<"ICH_LR12_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b100>;
+def : RWSysReg<"ICH_LR13_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b101>;
+def : RWSysReg<"ICH_LR14_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b110>;
+def : RWSysReg<"ICH_LR15_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b111>;
+
+// v8.1a "Privileged Access Never" extension-specific system registers
+let Requires = [{ {AArch64::HasV8_1aOps} }] in
+def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>;
+
+// v8.1a "Limited Ordering Regions" extension-specific system registers
+//                         Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"LORSA_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b000>;
+def : RWSysReg<"LOREA_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b001>;
+def : RWSysReg<"LORN_EL1",   0b11, 0b000, 0b1010, 0b0100, 0b010>;
+def : RWSysReg<"LORC_EL1",   0b11, 0b000, 0b1010, 0b0100, 0b011>;
+}
+
+// v8.1a "Virtualization hos extensions" system registers
+//                              Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::HasV8_1aOps} }] in {
+def : RWSysReg<"TTBR1_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"CONTEXTIDR_EL2",  0b11, 0b100, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTHV_TVAL_EL2",  0b11, 0b100, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTHV_CVAL_EL2",  0b11, 0b100, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"CNTHV_CTL_EL2",   0b11, 0b100, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"SCTLR_EL12",      0b11, 0b101, 0b0001, 0b0000, 0b000>;
+def : RWSysReg<"CPACR_EL12",      0b11, 0b101, 0b0001, 0b0000, 0b010>;
+def : RWSysReg<"TTBR0_EL12",      0b11, 0b101, 0b0010, 0b0000, 0b000>;
+def : RWSysReg<"TTBR1_EL12",      0b11, 0b101, 0b0010, 0b0000, 0b001>;
+def : RWSysReg<"TCR_EL12",        0b11, 0b101, 0b0010, 0b0000, 0b010>;
+def : RWSysReg<"AFSR0_EL12",      0b11, 0b101, 0b0101, 0b0001, 0b000>;
+def : RWSysReg<"AFSR1_EL12",      0b11, 0b101, 0b0101, 0b0001, 0b001>;
+def : RWSysReg<"ESR_EL12",        0b11, 0b101, 0b0101, 0b0010, 0b000>;
+def : RWSysReg<"FAR_EL12",        0b11, 0b101, 0b0110, 0b0000, 0b000>;
+def : RWSysReg<"MAIR_EL12",       0b11, 0b101, 0b1010, 0b0010, 0b000>;
+def : RWSysReg<"AMAIR_EL12",      0b11, 0b101, 0b1010, 0b0011, 0b000>;
+def : RWSysReg<"VBAR_EL12",       0b11, 0b101, 0b1100, 0b0000, 0b000>;
+def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>;
+def : RWSysReg<"CNTKCTL_EL12",    0b11, 0b101, 0b1110, 0b0001, 0b000>;
+def : RWSysReg<"CNTP_TVAL_EL02",  0b11, 0b101, 0b1110, 0b0010, 0b000>;
+def : RWSysReg<"CNTP_CTL_EL02",   0b11, 0b101, 0b1110, 0b0010, 0b001>;
+def : RWSysReg<"CNTP_CVAL_EL02",  0b11, 0b101, 0b1110, 0b0010, 0b010>;
+def : RWSysReg<"CNTV_TVAL_EL02",  0b11, 0b101, 0b1110, 0b0011, 0b000>;
+def : RWSysReg<"CNTV_CTL_EL02",   0b11, 0b101, 0b1110, 0b0011, 0b001>;
+def : RWSysReg<"CNTV_CVAL_EL02",  0b11, 0b101, 0b1110, 0b0011, 0b010>;
+def : RWSysReg<"SPSR_EL12",       0b11, 0b101, 0b0100, 0b0000, 0b000>;
+def : RWSysReg<"ELR_EL12",        0b11, 0b101, 0b0100, 0b0000, 0b001>;
+}
+// v8.2a registers
+//                  Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>;
+
+// v8.2a "Statistical Profiling extension" registers
+//                            Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::FeatureSPE} }] in {
+def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
+def : RWSysReg<"PMBPTR_EL1",    0b11, 0b000, 0b1001, 0b1010, 0b001>;
+def : RWSysReg<"PMBSR_EL1",     0b11, 0b000, 0b1001, 0b1010, 0b011>;
+def : RWSysReg<"PMBIDR_EL1",    0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : RWSysReg<"PMSCR_EL2",     0b11, 0b100, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL12",    0b11, 0b101, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSCR_EL1",     0b11, 0b000, 0b1001, 0b1001, 0b000>;
+def : RWSysReg<"PMSICR_EL1",    0b11, 0b000, 0b1001, 0b1001, 0b010>;
+def : RWSysReg<"PMSIRR_EL1",    0b11, 0b000, 0b1001, 0b1001, 0b011>;
+def : RWSysReg<"PMSFCR_EL1",    0b11, 0b000, 0b1001, 0b1001, 0b100>;
+def : RWSysReg<"PMSEVFR_EL1",   0b11, 0b000, 0b1001, 0b1001, 0b101>;
+def : RWSysReg<"PMSLATFR_EL1",  0b11, 0b000, 0b1001, 0b1001, 0b110>;
+def : RWSysReg<"PMSIDR_EL1",    0b11, 0b000, 0b1001, 0b1001, 0b111>;
+}
+
+// v8.2a "RAS extension" registers
+//                         Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::FeatureRAS} }] in {
+def : RWSysReg<"ERRSELR_EL1",   0b11, 0b000, 0b0101, 0b0011, 0b001>;
+def : RWSysReg<"ERXCTLR_EL1",   0b11, 0b000, 0b0101, 0b0100, 0b001>;
+def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>;
+def : RWSysReg<"ERXADDR_EL1",   0b11, 0b000, 0b0101, 0b0100, 0b011>;
+def : RWSysReg<"ERXMISC0_EL1",  0b11, 0b000, 0b0101, 0b0101, 0b000>;
+def : RWSysReg<"ERXMISC1_EL1",  0b11, 0b000, 0b0101, 0b0101, 0b001>;
+def : RWSysReg<"DISR_EL1",      0b11, 0b000, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VDISR_EL2",     0b11, 0b100, 0b1100, 0b0001, 0b001>;
+def : RWSysReg<"VSESR_EL2",     0b11, 0b100, 0b0101, 0b0010, 0b011>;
+}
+
+// Cyclone specific system registers
+//                                 Op0    Op1     CRn     CRm    Op2
+let Requires = [{ {AArch64::ProcCyclone} }] in
+def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
new file mode 100644
index 000000000000..e4ef0d4bb8db
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -0,0 +1,489 @@
+//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64CallLowering.h"
+#include "AArch64InstructionSelector.h"
+#include "AArch64LegalizerInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "AArch64TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp",
+                                cl::desc("Enable the CCMP formation pass"),
+                                cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
+                               cl::desc("Enable the machine combiner pass"),
+                               cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableStPairSuppress("aarch64-enable-stp-suppress",
+                                          cl::desc("Suppress STP for AArch64"),
+                                          cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableAdvSIMDScalar(
+    "aarch64-enable-simd-scalar",
+    cl::desc("Enable use of AdvSIMD scalar integer instructions"),
+    cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+    EnablePromoteConstant("aarch64-enable-promote-const",
+                          cl::desc("Enable the promote constant pass"),
+                          cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableCollectLOH(
+    "aarch64-enable-collect-loh",
+    cl::desc("Enable the pass that emits the linker optimization hints (LOH)"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+    EnableDeadRegisterElimination("aarch64-enable-dead-defs", cl::Hidden,
+                                  cl::desc("Enable the pass that removes dead"
+                                           " definitons and replaces stores to"
+                                           " them with stores to the zero"
+                                           " register"),
+                                  cl::init(true));
+
+static cl::opt<bool> EnableRedundantCopyElimination(
+    "aarch64-enable-copyelim",
+    cl::desc("Enable the redundant copy elimination pass"), cl::init(true),
+    cl::Hidden);
+
+static cl::opt<bool> EnableLoadStoreOpt("aarch64-enable-ldst-opt",
+                                        cl::desc("Enable the load/store pair"
+                                                 " optimization pass"),
+                                        cl::init(true), cl::Hidden);
+
+static cl::opt<bool> EnableAtomicTidy(
+    "aarch64-enable-atomic-cfg-tidy", cl::Hidden,
+    cl::desc("Run SimplifyCFG after expanding atomic operations"
+             " to make use of cmpxchg flow-based information"),
+    cl::init(true));
+
+static cl::opt<bool>
+EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
+                        cl::desc("Run early if-conversion"),
+                        cl::init(true));
+
+static cl::opt<bool>
+    EnableCondOpt("aarch64-enable-condopt",
+                  cl::desc("Enable the condition optimizer pass"),
+                  cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
+                cl::desc("Work around Cortex-A53 erratum 835769"),
+                cl::init(false));
+
+static cl::opt<bool>
+    EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
+                               cl::desc("Enable the type promotion pass"),
+                               cl::init(true));
+
+static cl::opt<bool>
+    EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
+                 cl::desc("Enable optimizations on complex GEPs"),
+                 cl::init(false));
+
+static cl::opt<bool>
+    BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
+                     cl::desc("Relax out of range conditional branches"));
+
+// FIXME: Unify control over GlobalMerge.
+static cl::opt<cl::boolOrDefault>
+    EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden,
+                      cl::desc("Enable the global merge pass"));
+
+static cl::opt<bool>
+    EnableLoopDataPrefetch("aarch64-enable-loop-data-prefetch", cl::Hidden,
+                           cl::desc("Enable the loop data prefetch pass"),
+                           cl::init(true));
+
+extern "C" void LLVMInitializeAArch64Target() {
+  // Register the target.
+  RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
+  RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget());
+  RegisterTargetMachine<AArch64leTargetMachine> Z(getTheARM64Target());
+  auto PR = PassRegistry::getPassRegistry();
+  initializeGlobalISel(*PR);
+  initializeAArch64A53Fix835769Pass(*PR);
+  initializeAArch64A57FPLoadBalancingPass(*PR);
+  initializeAArch64AddressTypePromotionPass(*PR);
+  initializeAArch64AdvSIMDScalarPass(*PR);
+  initializeAArch64CollectLOHPass(*PR);
+  initializeAArch64ConditionalComparesPass(*PR);
+  initializeAArch64ConditionOptimizerPass(*PR);
+  initializeAArch64DeadRegisterDefinitionsPass(*PR);
+  initializeAArch64ExpandPseudoPass(*PR);
+  initializeAArch64LoadStoreOptPass(*PR);
+  initializeAArch64VectorByElementOptPass(*PR);
+  initializeAArch64PromoteConstantPass(*PR);
+  initializeAArch64RedundantCopyEliminationPass(*PR);
+  initializeAArch64StorePairSuppressPass(*PR);
+  initializeLDTLSCleanupPass(*PR);
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return make_unique<AArch64_MachoTargetObjectFile>();
+
+  return make_unique<AArch64_ELFTargetObjectFile>();
+}
+
+// Helper function to build a DataLayout string
+static std::string computeDataLayout(const Triple &TT,
+                                     const MCTargetOptions &Options,
+                                     bool LittleEndian) {
+  if (Options.getABIName() == "ilp32")
+    return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
+  if (TT.isOSBinFormatMachO())
+    return "e-m:o-i64:64-i128:128-n32:64-S128";
+  if (LittleEndian)
+    return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+  return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  // AArch64 Darwin is always PIC.
+  if (TT.isOSDarwin())
+    return Reloc::PIC_;
+  // On ELF platforms the default static relocation model has a smart enough
+  // linker to cope with referencing external symbols defined in a shared
+  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+  if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+    return Reloc::Static;
+  return *RM;
+}
+
+/// Create an AArch64 architecture model.
+///
+AArch64TargetMachine::AArch64TargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian)
+    // This nested ternary is horrible, but DL needs to be properly
+    // initialized before TLInfo is constructed.
+    : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions,
+                                             LittleEndian),
+                        TT, CPU, FS, Options,
+			getEffectiveRelocModel(TT, RM), CM, OL),
+      TLOF(createTLOF(getTargetTriple())),
+      isLittle(LittleEndian) {
+  initAsmInfo();
+}
+
+AArch64TargetMachine::~AArch64TargetMachine() {}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct AArch64GISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<CallLowering> CallLoweringInfo;
+  std::unique_ptr<InstructionSelector> InstSelector;
+  std::unique_ptr<LegalizerInfo> Legalizer;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
+  const CallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+  const InstructionSelector *getInstructionSelector() const override {
+    return InstSelector.get();
+  }
+  const LegalizerInfo *getLegalizerInfo() const override {
+    return Legalizer.get();
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    return RegBankInfo.get();
+  }
+};
+} // End anonymous namespace.
+#endif
+
+const AArch64Subtarget *
+AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
+                                            isLittle);
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    AArch64GISelActualAccessor *GISel =
+        new AArch64GISelActualAccessor();
+    GISel->CallLoweringInfo.reset(
+        new AArch64CallLowering(*I->getTargetLowering()));
+    GISel->Legalizer.reset(new AArch64LegalizerInfo());
+
+    auto *RBI = new AArch64RegisterBankInfo(*I->getRegisterInfo());
+
+    // FIXME: At this point, we can't rely on Subtarget having RBI.
+    // It's awkward to mix passing RBI and the Subtarget; should we pass
+    // TII/TRI as well?
+    GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
+
+    GISel->RegBankInfo.reset(RBI);
+#endif
+    I->setGISelAccessor(*GISel);
+  }
+  return I.get();
+}
+
+void AArch64leTargetMachine::anchor() { }
+
+AArch64leTargetMachine::AArch64leTargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void AArch64beTargetMachine::anchor() { }
+
+AArch64beTargetMachine::AArch64beTargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// AArch64 Code Generator Pass Configuration Options.
+class AArch64PassConfig : public TargetPassConfig {
+public:
+  AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {
+    if (TM->getOptLevel() != CodeGenOpt::None)
+      substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+  }
+
+  AArch64TargetMachine &getAArch64TargetMachine() const {
+    return getTM<AArch64TargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    return DAG;
+  }
+
+  void addIRPasses()  override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+#endif
+  bool addILPOpts() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(AArch64TTIImpl(this, F));
+  });
+}
+
+TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AArch64PassConfig(this, PM);
+}
+
+void AArch64PassConfig::addIRPasses() {
+  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+  // ourselves.
+  addPass(createAtomicExpandPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether it succeeded. We can exploit existing control-flow in
+  // ldrex/strex loops to simplify this, but it needs tidying up.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+    addPass(createCFGSimplificationPass());
+
+  // Run LoopDataPrefetch
+  //
+  // Run this before LSR to remove the multiplies involved in computing the
+  // pointer values N iterations ahead.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
+    addPass(createLoopDataPrefetchPass());
+
+  TargetPassConfig::addIRPasses();
+
+  // Match interleaved memory accesses to ldN/stN intrinsics.
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass(TM));
+
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+    // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+    // and lower a GEP with multiple indices to either arithmetic operations or
+    // multiple GEPs with single index.
+    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+    // Call EarlyCSE pass to find and remove subexpressions in the lowered
+    // result.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case part of the lowered result is
+    // invariant.
+    addPass(createLICMPass());
+  }
+}
+
+// Pass Pipeline Configuration
+bool AArch64PassConfig::addPreISel() {
+  // Run promote constant before global merge, so that the promoted constants
+  // get a chance to be merged
+  if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
+    addPass(createAArch64PromoteConstantPass());
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
+  // and the offset has to be a multiple of the related size in bytes.
+  if ((TM->getOptLevel() != CodeGenOpt::None &&
+       EnableGlobalMerge == cl::BOU_UNSET) ||
+      EnableGlobalMerge == cl::BOU_TRUE) {
+    bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
+                               (EnableGlobalMerge == cl::BOU_UNSET);
+    addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
+  }
+
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAddressTypePromotion)
+    addPass(createAArch64AddressTypePromotionPass());
+
+  return false;
+}
+
+bool AArch64PassConfig::addInstSelector() {
+  addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
+  // references to _TLS_MODULE_BASE_ as possible.
+  if (TM->getTargetTriple().isOSBinFormatELF() &&
+      getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64CleanupLocalDynamicTLSPass());
+
+  return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool AArch64PassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
+bool AArch64PassConfig::addLegalizeMachineIR() {
+  addPass(new Legalizer());
+  return false;
+}
+bool AArch64PassConfig::addRegBankSelect() {
+  addPass(new RegBankSelect());
+  return false;
+}
+bool AArch64PassConfig::addGlobalInstructionSelect() {
+  addPass(new InstructionSelect());
+  return false;
+}
+#endif
+
+bool AArch64PassConfig::addILPOpts() {
+  if (EnableCondOpt)
+    addPass(createAArch64ConditionOptimizerPass());
+  if (EnableCCMP)
+    addPass(createAArch64ConditionalCompares());
+  if (EnableMCR)
+    addPass(&MachineCombinerID);
+  if (EnableEarlyIfConversion)
+    addPass(&EarlyIfConverterID);
+  if (EnableStPairSuppress)
+    addPass(createAArch64StorePairSuppressPass());
+  addPass(createAArch64VectorByElementOptPass());
+  return true;
+}
+
+void AArch64PassConfig::addPreRegAlloc() {
+  // Change dead register definitions to refer to the zero register.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
+    addPass(createAArch64DeadRegisterDefinitions());
+
+  // Use AdvSIMD scalar instructions whenever profitable.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
+    addPass(createAArch64AdvSIMDScalar());
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coaleascer friendly.
+    addPass(&PeepholeOptimizerID);
+  }
+}
+
+void AArch64PassConfig::addPostRegAlloc() {
+  // Remove redundant copy instructions.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination)
+    addPass(createAArch64RedundantCopyEliminationPass());
+
+  if (TM->getOptLevel() != CodeGenOpt::None && usingDefaultRegAlloc())
+    // Improve performance for some FP/SIMD code for A57.
+    addPass(createAArch64A57FPLoadBalancing());
+}
+
+void AArch64PassConfig::addPreSched2() {
+  // Expand some pseudo instructions to allow proper scheduling.
+  addPass(createAArch64ExpandPseudoPass());
+  // Use load/store pair instructions when possible.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
+    addPass(createAArch64LoadStoreOptimizationPass());
+}
+
+void AArch64PassConfig::addPreEmitPass() {
+  if (EnableA53Fix835769)
+    addPass(createAArch64A53Fix835769());
+  // Relax conditional branch instructions if they're otherwise out of
+  // range of their destination.
+  if (BranchRelaxation)
+    addPass(&BranchRelaxationPassID);
+
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
+      TM->getTargetTriple().isOSBinFormatMachO())
+    addPass(createAArch64CollectLOHPass());
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
new file mode 100644
index 000000000000..6fa5e83957e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -0,0 +1,76 @@
+//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine : public LLVMTargetMachine {
+protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
+
+public:
+  AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL, bool IsLittleEndian);
+
+  ~AArch64TargetMachine() override;
+  const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  /// \brief Get the TargetIRAnalysis for this target.
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  TargetLoweringObjectFile* getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+private:
+  bool isLittle;
+};
+
+// AArch64 little endian target machine.
+//
+class AArch64leTargetMachine : public AArch64TargetMachine {
+  virtual void anchor();
+public:
+  AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
+                         Optional<Reloc::Model> RM, CodeModel::Model CM,
+                         CodeGenOpt::Level OL);
+};
+
+// AArch64 big endian target machine.
+//
+class AArch64beTargetMachine : public AArch64TargetMachine {
+  virtual void anchor();
+public:
+  AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
+                         Optional<Reloc::Model> RM, CodeModel::Model CM,
+                         CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
new file mode 100644
index 000000000000..8875f9b72647
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -0,0 +1,72 @@
+//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetObjectFile.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile()
+  : TargetLoweringObjectFileMachO() {
+  SupportGOTPCRelWithOffset = false;
+}
+
+const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = TM.getSymbol(GV);
+    const MCExpr *Res =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+    MCSymbol *PCSym = getContext().createTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+    return MCBinaryExpr::createSub(Res, PC, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+      GV, Encoding, TM, MMI, Streamer);
+}
+
+MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV);
+}
+
+const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+    const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  assert((Offset+MV.getConstant() == 0) &&
+         "Arch64 does not support GOT PC rel with extra offset");
+  // On ARM64 Darwin, we can reference symbols with foo@GOT-., which
+  // is an indirect pc-relative reference.
+  const MCExpr *Res =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+  MCSymbol *PCSym = getContext().createTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+  return MCBinaryExpr::createSub(Res, PC, getContext());
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
new file mode 100644
index 000000000000..05e1dfa9e6c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -0,0 +1,47 @@
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class AArch64TargetMachine;
+
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+  AArch64_MachoTargetObjectFile();
+
+  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                        unsigned Encoding,
+                                        const TargetMachine &TM,
+                                        MachineModuleInfo *MMI,
+                                        MCStreamer &Streamer) const override;
+
+  MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
+                                    const TargetMachine &TM,
+                                    MachineModuleInfo *MMI) const override;
+
+  const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                          const MCValue &MV, int64_t Offset,
+                                          MachineModuleInfo *MMI,
+                                          MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
new file mode 100644
index 000000000000..88c98865bbc6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -0,0 +1,643 @@
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetTransformInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64tti"
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
+  // Check if the immediate can be encoded within an instruction.
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+    return 0;
+
+  if (Val < 0)
+    Val = ~Val;
+
+  // Calculate how many moves we will need to materialize this constant.
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  int Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialze the constant.
+  return std::max(1, Cost);
+}
+
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TTI::TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr.
+    if (Idx == 0)
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    int NumConstants = (BitSize + 63) / 64;
+    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<int>(TTI::TCC_Free)
+               : Cost;
+  }
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+
+  switch (IID) {
+  default:
+    return TTI::TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if (Idx == 1) {
+      int NumConstants = (BitSize + 63) / 64;
+      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TTI::TCC_Basic)
+                 ? static_cast<int>(TTI::TCC_Free)
+                 : Cost;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  }
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+TargetTransformInfo::PopcntSupportKind
+AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (TyWidth == 32 || TyWidth == 64)
+    return TTI::PSK_FastHardware;
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  return TTI::PSK_Software;
+}
+
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry
+  ConversionTbl[] = {
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
+    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+    // The number of shll instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+    // LowerVectorINT_TO_FP:
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+
+    // Complex: to v2f32
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
+
+    // Complex: to v4f32
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
+
+    // Complex: to v8f32
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+    // Complex: to v16f32
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
+    // Complex: to v2f64
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
+
+
+    // LowerVectorFP_TO_INT
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+
+    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
+
+    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
+    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
+
+    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
+  };
+
+  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+                                                 DstTy.getSimpleVT(),
+                                                 SrcTy.getSimpleVT()))
+    return Entry->Cost;
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
+                                             VectorType *VecTy,
+                                             unsigned Index) {
+
+  // Make sure we were given a valid extend opcode.
+  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+         "Invalid opcode");
+
+  // We are extending an element we extract from a vector, so the source type
+  // of the extend is the element type of the vector.
+  auto *Src = VecTy->getElementType();
+
+  // Sign- and zero-extends are for integer types only.
+  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
+
+  // Get the cost for the extract. We compute the cost (if any) for the extend
+  // below.
+  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
+
+  // Legalize the types.
+  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
+  auto DstVT = TLI->getValueType(DL, Dst);
+  auto SrcVT = TLI->getValueType(DL, Src);
+
+  // If the resulting type is still a vector and the destination type is legal,
+  // we may get the extension for free. If not, get the default cost for the
+  // extend.
+  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
+    return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+  // The destination type should be larger than the element type. If not, get
+  // the default cost for the extend.
+  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
+    return Cost + getCastInstrCost(Opcode, Dst, Src);
+
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Opcode should be either SExt or ZExt");
+
+  // For sign-extends, we only need a smov, which performs the extension
+  // automatically.
+  case Instruction::SExt:
+    return Cost;
+
+  // For zero-extends, the extend is performed automatically by a umov unless
+  // the destination type is i64 and the element type is i8 or i16.
+  case Instruction::ZExt:
+    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
+      return Cost;
+  }
+
+  // If we are unable to perform the extend for free, get the default cost.
+  return Cost + getCastInstrCost(Opcode, Dst, Src);
+}
+
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // The element at index zero is already inside the vector.
+    if (Index == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return ST->getVectorInsertExtractBaseCost();
+}
+
+int AArch64TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  if (ISD == ISD::SDIV &&
+      Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On AArch64, scalar signed division by constants power-of-two are
+    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+    // The OperandValue properties many not be same as that of previous
+    // operation; conservatively assume OP_None.
+    int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                                      TargetTransformInfo::OP_None,
+                                      TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    return Cost;
+  }
+
+  switch (ISD) {
+  default:
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // These nodes are marked as 'custom' for combining purposes only.
+    // We know that they are legal. See LowerAdd in ISelLowering.
+    return 1 * LT.first;
+  }
+}
+
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy) {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We don't lower some vector selects well that are wider than the register
+  // width.
+  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // We would need this many instructions to hide the scalarization happening.
+    const int AmortizationCost = 20;
+    static const TypeConversionCostTblEntry
+    VectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+    };
+
+    EVT SelCondTy = TLI->getValueType(DL, CondTy);
+    EVT SelValTy = TLI->getValueType(DL, ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+                                                     SelCondTy.getSimpleVT(),
+                                                     SelValTy.getSimpleVT()))
+        return Entry->Cost;
+    }
+  }
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace) {
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+
+  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
+      Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isIntegerTy(64)) {
+    // Unaligned stores are extremely inefficient. We don't split
+    // unaligned v2i64 stores because the negative impact that has shown in
+    // practice on inlined memcpy code.
+    // We make v2i64 stores expensive so that we will only vectorize if there
+    // are 6 other instructions getting vectorized.
+    int AmortizationCost = 6;
+
+    return LT.first * 2 * AmortizationCost;
+  }
+
+  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+      Src->getVectorNumElements() < 8) {
+    // We scalarize the loads/stores because there is not v.4b register and we
+    // have to promote the elements to v.4h.
+    unsigned NumVecElts = Src->getVectorNumElements();
+    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+    // We generate 2 instructions per vector element.
+    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  }
+
+  return LT.first;
+}
+
+int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+    // ldN/stN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
+int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+  int Cost = 0;
+  for (auto *I : Tys) {
+    if (!I->isVectorTy())
+      continue;
+    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
+      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
+        getMemoryOpCost(Instruction::Load, I, 128, 0);
+  }
+  return Cost;
+}
+
+unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  return ST->getMaxInterleaveFactor();
+}
+
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  // Enable partial unrolling and runtime unrolling.
+  BaseT::getUnrollingPreferences(L, UP);
+
+  // For inner loop, it is more likely to be a hot one, and the runtime check
+  // can be promoted out from LICM pass, so the overhead is less, let's try
+  // a larger threshold to unroll more loops.
+  if (L->getLoopDepth() > 1)
+    UP.PartialThreshold *= 2;
+
+  // Disable partial & runtime unrolling on -Os.
+  UP.PartialOptSizeThreshold = 0;
+}
+
+Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                         Type *ExpectedType) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return nullptr;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4: {
+    // Create a struct type
+    StructType *ST = dyn_cast<StructType>(ExpectedType);
+    if (!ST)
+      return nullptr;
+    unsigned NumElts = Inst->getNumArgOperands() - 1;
+    if (ST->getNumElements() != NumElts)
+      return nullptr;
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
+        return nullptr;
+    }
+    Value *Res = UndefValue::get(ExpectedType);
+    IRBuilder<> Builder(Inst);
+    for (unsigned i = 0, e = NumElts; i != e; ++i) {
+      Value *L = Inst->getArgOperand(i);
+      Res = Builder.CreateInsertValue(Res, L, i);
+    }
+    return Res;
+  }
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    if (Inst->getType() == ExpectedType)
+      return Inst;
+    return nullptr;
+  }
+}
+
+bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                        MemIntrinsicInfo &Info) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.IsSimple = true;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(0);
+    break;
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+    Info.ReadMem = false;
+    Info.WriteMem = true;
+    Info.IsSimple = true;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
+    break;
+  }
+
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_st2:
+    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_st3:
+    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
+    break;
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_st4:
+    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
+    break;
+  }
+  return true;
+}
+
+unsigned AArch64TTIImpl::getCacheLineSize() {
+  return ST->getCacheLineSize();
+}
+
+unsigned AArch64TTIImpl::getPrefetchDistance() {
+  return ST->getPrefetchDistance();
+}
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+  return ST->getMinPrefetchStride();
+}
+
+unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
+  return ST->getMaxPrefetchIterationsAhead();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
new file mode 100644
index 000000000000..24642cb1698e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -0,0 +1,139 @@
+//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// AArch64 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+
+namespace llvm {
+
+class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
+  typedef BasicTTIImplBase<AArch64TTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const AArch64Subtarget *ST;
+  const AArch64TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const AArch64Subtarget *getST() const { return ST; }
+  const AArch64TargetLowering *getTLI() const { return TLI; }
+
+  enum MemIntrinsicType {
+    VECTOR_LDST_TWO_ELEMENTS,
+    VECTOR_LDST_THREE_ELEMENTS,
+    VECTOR_LDST_FOUR_ELEMENTS
+  };
+
+public:
+  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  int getIntImmCost(int64_t Val);
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  bool enableInterleavedAccessVectorization() { return true; }
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 32;
+      return 0;
+    }
+    return 31;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+    return 64;
+  }
+
+  unsigned getMaxInterleaveFactor(unsigned VF);
+
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
+                               unsigned Index);
+
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  int getAddressComputationCost(Type *Ty, bool IsComplex);
+
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+
+  int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                                 ArrayRef<unsigned> Indices, unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  unsigned getCacheLineSize();
+
+  unsigned getPrefetchDistance();
+
+  unsigned getMinPrefetchStride();
+
+  unsigned getMaxPrefetchIterationsAhead();
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
new file mode 100644
index 000000000000..e3b1d7cea48d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -0,0 +1,371 @@
+//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs optimization for vector by element
+// SIMD instructions.
+//
+// Certain SIMD instructions with vector element operand are not efficient.
+// Rewrite them into SIMD instructions with vector operands. This rewrite
+// is driven by the latency of the instructions.
+//
+// Example:
+//    fmla v0.4s, v1.4s, v2.s[1]
+//    is rewritten into
+//    dup v3.4s, v2.s[1]
+//    fmla v0.4s, v1.4s, v3.4s
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+
+STATISTIC(NumModifiedInstr,
+          "Number of vector by element instructions modified");
+
+#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
+  "AArch64 vector by element instruction optimization pass"
+
+namespace {
+
+struct AArch64VectorByElementOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  TargetSchedModel SchedModel;
+
+  /// Based only on latency of instructions, determine if it is cost efficient
+  /// to replace the instruction InstDesc by the two instructions InstDescRep1
+  /// and InstDescRep2.
+  /// Return true if replacement is recommended.
+  bool
+  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
+                           const MCInstrDesc *InstDescRep1,
+                           const MCInstrDesc *InstDescRep2,
+                           std::map<unsigned, bool> &VecInstElemTable) const;
+
+  /// Determine if we need to exit the vector by element instruction
+  /// optimization pass early. This makes sure that Targets with no need
+  /// for this optimization do not spent any compile time on this pass.
+  /// This check is done by comparing the latency of an indexed FMLA
+  /// instruction to the latency of the DUP + the latency of a vector
+  /// FMLA instruction. We do not check on other related instructions such
+  /// as FMLS as we assume that if the situation shows up for one
+  /// instruction, then it is likely to show up for the related ones.
+  /// Return true if early exit of the pass is recommended.
+  bool earlyExitVectElement(MachineFunction *MF);
+
+  /// Check whether an equivalent DUP instruction has already been
+  /// created or not.
+  /// Return true when the dup instruction already exists. In this case,
+  /// DestReg will point to the destination of the already created DUP.
+  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
+                unsigned LaneNumber, unsigned *DestReg) const;
+
+  /// Certain SIMD instructions with vector element operand are not efficient.
+  /// Rewrite them into SIMD instructions with vector operands. This rewrite
+  /// is driven by the latency of the instructions.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeVectElement(MachineInstr &MI,
+                           std::map<unsigned, bool> *VecInstElemTable) const;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override {
+    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
+  }
+};
+char AArch64VectorByElementOpt::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
+
+/// Based only on latency of instructions, determine if it is cost efficient
+/// to replace the instruction InstDesc by the two instructions InstDescRep1
+/// and InstDescRep2. Note that it is assumed in this fuction that an
+/// instruction of type InstDesc is always replaced by the same two
+/// instructions as results are cached here.
+/// Return true if replacement is recommended.
+bool AArch64VectorByElementOpt::shouldReplaceInstruction(
+    MachineFunction *MF, const MCInstrDesc *InstDesc,
+    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    std::map<unsigned, bool> &VecInstElemTable) const {
+  // Check if replacment decision is alredy available in the cached table.
+  // if so, return it.
+  if (!VecInstElemTable.empty() &&
+      VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
+    return VecInstElemTable[InstDesc->getOpcode()];
+
+  unsigned SCIdx = InstDesc->getSchedClass();
+  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
+  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
+  const MCSchedClassDesc *SCDescRep1 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
+  const MCSchedClassDesc *SCDescRep2 =
+      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+
+  // If a subtarget does not define resources for any of the instructions
+  // of interest, then return false for no replacement.
+  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
+      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
+      SCDescRep2->isVariant()) {
+    VecInstElemTable[InstDesc->getOpcode()] = false;
+    return false;
+  }
+
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
+      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
+          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+    VecInstElemTable[InstDesc->getOpcode()] = true;
+    return true;
+  }
+  VecInstElemTable[InstDesc->getOpcode()] = false;
+  return false;
+}
+
+/// Determine if we need to exit the vector by element instruction
+/// optimization pass early. This makes sure that Targets with no need
+/// for this optimization do not spent any compile time on this pass.
+/// This check is done by comparing the latency of an indexed FMLA
+/// instruction to the latency of the DUP + the latency of a vector
+/// FMLA instruction. We do not check on other related instructions such
+/// as FMLS as we assume that if the situation shows up for one
+/// instruction, then it is likely to show up for the related ones.
+/// Return true if early exit of the pass is recommended.
+bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+  std::map<unsigned, bool> VecInstElemTable;
+  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
+  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
+
+  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
+                                VecInstElemTable))
+    return true;
+  return false;
+}
+
+/// Check whether an equivalent DUP instruction has already been
+/// created or not.
+/// Return true when the dup instruction already exists. In this case,
+/// DestReg will point to the destination of the already created DUP.
+bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+                                         unsigned SrcReg, unsigned LaneNumber,
+                                         unsigned *DestReg) const {
+  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
+       MII != MIE;) {
+    MII--;
+    MachineInstr *CurrentMI = &*MII;
+
+    if (CurrentMI->getOpcode() == DupOpcode &&
+        CurrentMI->getNumOperands() == 3 &&
+        CurrentMI->getOperand(1).getReg() == SrcReg &&
+        CurrentMI->getOperand(2).getImm() == LaneNumber) {
+      *DestReg = CurrentMI->getOperand(0).getReg();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Certain SIMD instructions with vector element operand are not efficient.
+/// Rewrite them into SIMD instructions with vector operands. This rewrite
+/// is driven by the latency of the instructions.
+/// The instruction of concerns are for the time being fmla, fmls, fmul,
+/// and fmulx and hence they are hardcoded.
+///
+/// Example:
+///    fmla v0.4s, v1.4s, v2.s[1]
+///    is rewritten into
+///    dup v3.4s, v2.s[1]           // dup not necessary if redundant
+///    fmla v0.4s, v1.4s, v3.4s
+/// Return true if the SIMD instruction is modified.
+bool AArch64VectorByElementOpt::optimizeVectElement(
+    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+  const MCInstrDesc *MulMCID, *DupMCID;
+  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // 4X32 instructions
+  case AArch64::FMLAv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv4f32);
+    break;
+  case AArch64::FMLSv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv4f32);
+    break;
+  case AArch64::FMULXv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv4f32);
+    break;
+  case AArch64::FMULv4i32_indexed:
+    DupMCID = &TII->get(AArch64::DUPv4i32lane);
+    MulMCID = &TII->get(AArch64::FMULv4f32);
+    break;
+
+  // 2X64 instructions
+  case AArch64::FMLAv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f64);
+    break;
+  case AArch64::FMLSv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f64);
+    break;
+  case AArch64::FMULXv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f64);
+    break;
+  case AArch64::FMULv2i64_indexed:
+    DupMCID = &TII->get(AArch64::DUPv2i64lane);
+    MulMCID = &TII->get(AArch64::FMULv2f64);
+    break;
+
+  // 2X32 instructions
+  case AArch64::FMLAv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLAv2f32);
+    break;
+  case AArch64::FMLSv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMLSv2f32);
+    break;
+  case AArch64::FMULXv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULXv2f32);
+    break;
+  case AArch64::FMULv2i32_indexed:
+    RC = &AArch64::FPR64RegClass;
+    DupMCID = &TII->get(AArch64::DUPv2i32lane);
+    MulMCID = &TII->get(AArch64::FMULv2f32);
+    break;
+  }
+
+  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+                                *VecInstElemTable))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // get the operands of the current SIMD arithmetic instruction.
+  unsigned MulDest = MI.getOperand(0).getReg();
+  unsigned SrcReg0 = MI.getOperand(1).getReg();
+  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
+  unsigned SrcReg1 = MI.getOperand(2).getReg();
+  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
+  unsigned DupDest;
+
+  // Instructions of interest have either 4 or 5 operands.
+  if (MI.getNumOperands() == 5) {
+    unsigned SrcReg2 = MI.getOperand(3).getReg();
+    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
+    unsigned LaneNumber = MI.getOperand(4).getImm();
+
+    // Create a new DUP instruction. Note that if an equivalent DUP instruction
+    // has already been created before, then use that one instread of creating
+    // a new one.
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg2, Src2IsKill)
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, Src0IsKill)
+        .addReg(SrcReg1, Src1IsKill)
+        .addReg(DupDest, Src2IsKill);
+  } else if (MI.getNumOperands() == 4) {
+    unsigned LaneNumber = MI.getOperand(3).getImm();
+    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
+      DupDest = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
+          .addReg(SrcReg1, Src1IsKill)
+          .addImm(LaneNumber);
+    }
+    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
+        .addReg(SrcReg0, Src0IsKill)
+        .addReg(DupDest, Src1IsKill);
+  } else {
+    return false;
+  }
+
+  ++NumModifiedInstr;
+  return true;
+}
+
+bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  const AArch64InstrInfo *AAII =
+      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+  if (!AAII)
+    return false;
+  SchedModel.init(ST.getSchedModel(), &ST, AAII);
+  if (!SchedModel.hasInstrSchedModel())
+    return false;
+
+  // A simple check to exit this pass early for targets that do not need it.
+  if (earlyExitVectElement(&MF))
+    return false;
+
+  bool Changed = false;
+  std::map<unsigned, bool> VecInstElemTable;
+  SmallVector<MachineInstr *, 8> RemoveMIs;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII;
+      if (optimizeVectElement(MI, &VecInstElemTable)) {
+        // Add MI to the list of instructions to be removed given that it has
+        // been replaced.
+        RemoveMIs.push_back(&MI);
+        Changed = true;
+      }
+      ++MII;
+    }
+  }
+
+  for (MachineInstr *MI : RemoveMIs)
+    MI->eraseFromParent();
+
+  return Changed;
+}
+
+/// createAArch64VectorByElementOptPass - returns an instance of the
+/// vector by element optimization pass.
+FunctionPass *llvm::createAArch64VectorByElementOptPass() {
+  return new AArch64VectorByElementOpt();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 000000000000..db84afacf30e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,4625 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64TargetStreamer.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+
+class AArch64Operand;
+
+class AArch64AsmParser : public MCTargetAsmParser {
+private:
+  StringRef Mnemonic; ///< Instruction mnemonic.
+
+  // Map of register aliases registers via the .req directive.
+  StringMap<std::pair<bool, unsigned> > RegisterReqs;
+
+  AArch64TargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<AArch64TargetStreamer &>(TS);
+  }
+
+  SMLoc getLoc() const { return getParser().getTok().getLoc(); }
+
+  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+  AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+  unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
+  int tryParseRegister();
+  int tryMatchVectorRegister(StringRef &Kind, bool expected);
+  bool parseRegister(OperandVector &Operands);
+  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+  bool parseVectorList(OperandVector &Operands);
+  bool parseOperand(OperandVector &Operands, bool isCondCode,
+                    bool invertCondCode);
+
+  bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+  bool parseDirectiveArch(SMLoc L);
+  bool parseDirectiveCPU(SMLoc L);
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveInst(SMLoc L);
+
+  bool parseDirectiveTLSDescCall(SMLoc L);
+
+  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+  bool parseDirectiveLtorg(SMLoc L);
+
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);
+
+  bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AArch64GenAsmMatcher.inc"
+
+  /// }
+
+  OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
+  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+  OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+  OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
+  bool tryParseVectorRegister(OperandVector &Operands);
+  OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
+
+public:
+  enum AArch64MatchResultTy {
+    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "AArch64GenAsmMatcher.inc"
+  };
+  bool IsILP32;
+  AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                   const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, STI) {
+    IsILP32 = Options.getABIName() == "ilp32";
+    MCAsmParserExtension::Initialize(Parser);
+    MCStreamer &S = getParser().getStreamer();
+    if (S.getTargetStreamer() == nullptr)
+      new AArch64TargetStreamer(S);
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  static bool classifySymbolRef(const MCExpr *Expr,
+                                AArch64MCExpr::VariantKind &ELFRefKind,
+                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                int64_t &Addend);
+};
+} // end anonymous namespace
+
+namespace {
+
+/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
+/// instruction.
+class AArch64Operand : public MCParsedAsmOperand {
+private:
+  enum KindTy {
+    k_Immediate,
+    k_ShiftedImm,
+    k_CondCode,
+    k_Register,
+    k_VectorList,
+    k_VectorIndex,
+    k_Token,
+    k_SysReg,
+    k_SysCR,
+    k_Prefetch,
+    k_ShiftExtend,
+    k_FPImm,
+    k_Barrier,
+    k_PSBHint,
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+    bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    bool isVector;
+  };
+
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned NumElements;
+    unsigned ElementKind;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct ShiftedImmOp {
+    const MCExpr *Val;
+    unsigned ShiftAmount;
+  };
+
+  struct CondCodeOp {
+    AArch64CC::CondCode Code;
+  };
+
+  struct FPImmOp {
+    unsigned Val; // Encoded 8-bit representation.
+  };
+
+  struct BarrierOp {
+    const char *Data;
+    unsigned Length;
+    unsigned Val; // Not the enum since not all values have names.
+  };
+
+  struct SysRegOp {
+    const char *Data;
+    unsigned Length;
+    uint32_t MRSReg;
+    uint32_t MSRReg;
+    uint32_t PStateField;
+  };
+
+  struct SysCRImmOp {
+    unsigned Val;
+  };
+
+  struct PrefetchOp {
+    const char *Data;
+    unsigned Length;
+    unsigned Val;
+  };
+
+  struct PSBHintOp {
+    const char *Data;
+    unsigned Length;
+    unsigned Val;
+  };
+
+  struct ShiftExtendOp {
+    AArch64_AM::ShiftExtendType Type;
+    unsigned Amount;
+    bool HasExplicitAmount;
+  };
+
+  struct ExtendOp {
+    unsigned Val;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct ShiftedImmOp ShiftedImm;
+    struct CondCodeOp CondCode;
+    struct FPImmOp FPImm;
+    struct BarrierOp Barrier;
+    struct SysRegOp SysReg;
+    struct SysCRImmOp SysCRImm;
+    struct PrefetchOp Prefetch;
+    struct PSBHintOp PSBHint;
+    struct ShiftExtendOp ShiftExtend;
+  };
+
+  // Keep the MCContext around as the MCExprs may need manipulated during
+  // the add<>Operands() calls.
+  MCContext &Ctx;
+
+public:
+  AArch64Operand(KindTy K, MCContext &Ctx) : Kind(K), Ctx(Ctx) {}
+
+  AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case k_Token:
+      Tok = o.Tok;
+      break;
+    case k_Immediate:
+      Imm = o.Imm;
+      break;
+    case k_ShiftedImm:
+      ShiftedImm = o.ShiftedImm;
+      break;
+    case k_CondCode:
+      CondCode = o.CondCode;
+      break;
+    case k_FPImm:
+      FPImm = o.FPImm;
+      break;
+    case k_Barrier:
+      Barrier = o.Barrier;
+      break;
+    case k_Register:
+      Reg = o.Reg;
+      break;
+    case k_VectorList:
+      VectorList = o.VectorList;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
+      break;
+    case k_SysReg:
+      SysReg = o.SysReg;
+      break;
+    case k_SysCR:
+      SysCRImm = o.SysCRImm;
+      break;
+    case k_Prefetch:
+      Prefetch = o.Prefetch;
+      break;
+    case k_PSBHint:
+      PSBHint = o.PSBHint;
+      break;
+    case k_ShiftExtend:
+      ShiftExtend = o.ShiftExtend;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  bool isTokenSuffix() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return Tok.IsSuffix;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == k_Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getShiftedImmVal() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.Val;
+  }
+
+  unsigned getShiftedImmShift() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.ShiftAmount;
+  }
+
+  AArch64CC::CondCode getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CondCode.Code;
+  }
+
+  unsigned getFPImm() const {
+    assert(Kind == k_FPImm && "Invalid access!");
+    return FPImm.Val;
+  }
+
+  unsigned getBarrier() const {
+    assert(Kind == k_Barrier && "Invalid access!");
+    return Barrier.Val;
+  }
+
+  StringRef getBarrierName() const {
+    assert(Kind == k_Barrier && "Invalid access!");
+    return StringRef(Barrier.Data, Barrier.Length);
+  }
+
+  unsigned getReg() const override {
+    assert(Kind == k_Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  unsigned getVectorListStart() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.RegNum;
+  }
+
+  unsigned getVectorListCount() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.Count;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  StringRef getSysReg() const {
+    assert(Kind == k_SysReg && "Invalid access!");
+    return StringRef(SysReg.Data, SysReg.Length);
+  }
+
+  unsigned getSysCR() const {
+    assert(Kind == k_SysCR && "Invalid access!");
+    return SysCRImm.Val;
+  }
+
+  unsigned getPrefetch() const {
+    assert(Kind == k_Prefetch && "Invalid access!");
+    return Prefetch.Val;
+  }
+
+  unsigned getPSBHint() const {
+    assert(Kind == k_PSBHint && "Invalid access!");
+    return PSBHint.Val;
+  }
+
+  StringRef getPSBHintName() const {
+    assert(Kind == k_PSBHint && "Invalid access!");
+    return StringRef(PSBHint.Data, PSBHint.Length);
+  }
+
+  StringRef getPrefetchName() const {
+    assert(Kind == k_Prefetch && "Invalid access!");
+    return StringRef(Prefetch.Data, Prefetch.Length);
+  }
+
+  AArch64_AM::ShiftExtendType getShiftExtendType() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Type;
+  }
+
+  unsigned getShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Amount;
+  }
+
+  bool hasShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.HasExplicitAmount;
+  }
+
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return false; }
+  bool isSImm9() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val < 256);
+  }
+  bool isSImm7s4() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+  }
+  bool isSImm7s8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+  }
+  bool isSImm7s16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+  }
+
+  bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+                                           Addend)) {
+      // If we don't understand the expression, assume the best and
+      // let the fixup and relocation code deal with it.
+      return true;
+    }
+
+    if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+        ELFRefKind == AArch64MCExpr::VK_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+      // Note that we don't range-check the addend. It's adjusted modulo page
+      // size when converted, so there is no "out of range" condition when using
+      // @pageoff.
+      return Addend >= 0 && (Addend % Scale) == 0;
+    } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+               DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
+      // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+      return Addend == 0;
+    }
+
+    return false;
+  }
+
+  template <int Scale> bool isUImm12Offset() const {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return isSymbolicUImm12Offset(getImm(), Scale);
+
+    int64_t Val = MCE->getValue();
+    return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
+  }
+
+  bool isImm0_1() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 2);
+  }
+  bool isImm0_7() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 8);
+  }
+  bool isImm1_8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 9);
+  }
+  bool isImm0_15() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 16);
+  }
+  bool isImm1_16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 17);
+  }
+  bool isImm0_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 32);
+  }
+  bool isImm1_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 32);
+  }
+  bool isImm1_32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 33);
+  }
+  bool isImm0_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 64);
+  }
+  bool isImm1_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 64);
+  }
+  bool isImm1_64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 65);
+  }
+  bool isImm0_127() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 128);
+  }
+  bool isImm0_255() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 256);
+  }
+  bool isImm0_65535() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 65536);
+  }
+  bool isImm32_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 32 && Val < 64);
+  }
+  bool isLogicalImm32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    if (Val >> 32 != 0 && Val >> 32 != ~0LL)
+      return false;
+    Val &= 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
+  }
+  bool isLogicalImm64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+  }
+  bool isLogicalImm32Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+    return AArch64_AM::isLogicalImmediate(Val, 32);
+  }
+  bool isLogicalImm64Not() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64);
+  }
+  bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+  bool isAddSubImm() const {
+    if (!isShiftedImm() && !isImm())
+      return false;
+
+    const MCExpr *Expr;
+
+    // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+    if (isShiftedImm()) {
+      unsigned Shift = ShiftedImm.ShiftAmount;
+      Expr = ShiftedImm.Val;
+      if (Shift != 0 && Shift != 12)
+        return false;
+    } else {
+      Expr = getImm();
+    }
+
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
+                                          DarwinRefKind, Addend)) {
+      return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
+          || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
+          || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
+          || ELFRefKind == AArch64MCExpr::VK_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+    }
+
+    // If it's a constant, it should be a real immediate in range:
+    if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
+      return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+
+    // If it's an expression, we hope for the best and let the fixup/relocation
+    // code deal with it.
+    return true;
+  }
+  bool isAddSubImmNeg() const {
+    if (!isShiftedImm() && !isImm())
+      return false;
+
+    const MCExpr *Expr;
+
+    // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+    if (isShiftedImm()) {
+      unsigned Shift = ShiftedImm.ShiftAmount;
+      Expr = ShiftedImm.Val;
+      if (Shift != 0 && Shift != 12)
+        return false;
+    } else
+      Expr = getImm();
+
+    // Otherwise it should be a real negative immediate in range:
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff;
+  }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isSIMDImmType10() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
+  }
+  bool isBranchTarget26() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
+  }
+  bool isPCRelLabel19() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
+  }
+  bool isBranchTarget14() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+  }
+
+  bool
+  isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
+    if (!isImm())
+      return false;
+
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind,
+                                             DarwinRefKind, Addend)) {
+      return false;
+    }
+    if (DarwinRefKind != MCSymbolRefExpr::VK_None)
+      return false;
+
+    for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
+      if (ELFRefKind == AllowedModifiers[i])
+        return Addend == 0;
+    }
+
+    return false;
+  }
+
+  bool isMovZSymbolG3() const {
+    return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+  }
+
+  bool isMovZSymbolG2() const {
+    return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+                         AArch64MCExpr::VK_TPREL_G2,
+                         AArch64MCExpr::VK_DTPREL_G2});
+  }
+
+  bool isMovZSymbolG1() const {
+    return isMovWSymbol({
+        AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
+        AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
+        AArch64MCExpr::VK_DTPREL_G1,
+    });
+  }
+
+  bool isMovZSymbolG0() const {
+    return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+                         AArch64MCExpr::VK_TPREL_G0,
+                         AArch64MCExpr::VK_DTPREL_G0});
+  }
+
+  bool isMovKSymbolG3() const {
+    return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
+  }
+
+  bool isMovKSymbolG2() const {
+    return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
+  }
+
+  bool isMovKSymbolG1() const {
+    return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
+                         AArch64MCExpr::VK_TPREL_G1_NC,
+                         AArch64MCExpr::VK_DTPREL_G1_NC});
+  }
+
+  bool isMovKSymbolG0() const {
+    return isMovWSymbol(
+        {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
+         AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
+  }
+
+  template<int RegWidth, int Shift>
+  bool isMOVZMovAlias() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+
+    return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
+  }
+
+  template<int RegWidth, int Shift>
+  bool isMOVNMovAlias() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+
+    return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
+  }
+
+  bool isFPImm() const { return Kind == k_FPImm; }
+  bool isBarrier() const { return Kind == k_Barrier; }
+  bool isSysReg() const { return Kind == k_SysReg; }
+  bool isMRSSystemRegister() const {
+    if (!isSysReg()) return false;
+
+    return SysReg.MRSReg != -1U;
+  }
+  bool isMSRSystemRegister() const {
+    if (!isSysReg()) return false;
+    return SysReg.MSRReg != -1U;
+  }
+  bool isSystemPStateFieldWithImm0_1() const {
+    if (!isSysReg()) return false;
+    return (SysReg.PStateField == AArch64PState::PAN ||
+            SysReg.PStateField == AArch64PState::UAO);
+  }
+  bool isSystemPStateFieldWithImm0_15() const {
+    if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false;
+    return SysReg.PStateField != -1U;
+  }
+  bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
+  bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+  bool isVectorRegLo() const {
+    return Kind == k_Register && Reg.isVector &&
+           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+               Reg.RegNum);
+  }
+  bool isGPR32as64() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
+  }
+  bool isWSeqPair() const {
+    return Kind == k_Register && !Reg.isVector &&
+           AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains(
+               Reg.RegNum);
+  }
+  bool isXSeqPair() const {
+    return Kind == k_Register && !Reg.isVector &&
+           AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains(
+               Reg.RegNum);
+  }
+
+  bool isGPR64sp0() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
+  }
+
+  /// Is this a vector list with the type implicit (presumably attached to the
+  /// instruction itself)?
+  template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+    return Kind == k_VectorList && VectorList.Count == NumRegs &&
+           !VectorList.ElementKind;
+  }
+
+  template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+  bool isTypedVectorList() const {
+    if (Kind != k_VectorList)
+      return false;
+    if (VectorList.Count != NumRegs)
+      return false;
+    if (VectorList.ElementKind != ElementKind)
+      return false;
+    return VectorList.NumElements == NumElements;
+  }
+
+  bool isVectorIndex1() const {
+    return Kind == k_VectorIndex && VectorIndex.Val == 1;
+  }
+  bool isVectorIndexB() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 16;
+  }
+  bool isVectorIndexH() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 8;
+  }
+  bool isVectorIndexS() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 4;
+  }
+  bool isVectorIndexD() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 2;
+  }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isTokenEqual(StringRef Str) const {
+    return Kind == k_Token && getToken() == Str;
+  }
+  bool isSysCR() const { return Kind == k_SysCR; }
+  bool isPrefetch() const { return Kind == k_Prefetch; }
+  bool isPSBHint() const { return Kind == k_PSBHint; }
+  bool isShiftExtend() const { return Kind == k_ShiftExtend; }
+  bool isShifter() const {
+    if (!isShiftExtend())
+      return false;
+
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
+            ST == AArch64_AM::MSL);
+  }
+  bool isExtend() const {
+    if (!isShiftExtend())
+      return false;
+
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
+            ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
+            ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW ||
+            ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
+  }
+
+  bool isExtend64() const {
+    if (!isExtend())
+      return false;
+    // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+  }
+  bool isExtendLSL64() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
+  }
+
+  template<int Width> bool isMemXExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
+  }
+
+  template<int Width> bool isMemWExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
+  }
+
+  template <unsigned width>
+  bool isArithmeticShifter() const {
+    if (!isShifter())
+      return false;
+
+    // An arithmetic shifter is LSL, LSR, or ASR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
+  }
+
+  template <unsigned width>
+  bool isLogicalShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical shifter is LSL, LSR, ASR or ROR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
+           getShiftExtendAmount() < width;
+  }
+
+  bool isMovImm32Shifter() const {
+    if (!isShifter())
+      return false;
+
+    // A MOVi shifter is LSL of 0, 16, 32, or 48.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16);
+  }
+
+  bool isMovImm64Shifter() const {
+    if (!isShifter())
+      return false;
+
+    // A MOVi shifter is LSL of 0 or 16.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
+  }
+
+  bool isLogicalVecShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
+  }
+
+  bool isLogicalVecHalfWordShifter() const {
+    if (!isLogicalVecShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0 or 8.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8);
+  }
+
+  bool isMoveVecShifter() const {
+    if (!isShiftExtend())
+      return false;
+
+    // A logical vector shifter is a left shift by 8 or 16.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::MSL &&
+           (Shift == 8 || Shift == 16);
+  }
+
+  // Fallback unscaled operands are for aliases of LDR/STR that fall back
+  // to LDUR/STUR when the offset is not legal for the former but is for
+  // the latter. As such, in addition to checking for being a legal unscaled
+  // address, also check that it is not a legal scaled address. This avoids
+  // ambiguity in the matcher.
+  template<int Width>
+  bool isSImm9OffsetFB() const {
+    return isSImm9() && !isUImm12Offset<Width / 8>();
+  }
+
+  bool isAdrpLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (4096 * (1LL << (21 - 1)));
+      int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
+      return (Val % 4096) == 0 && Val >= Min && Val <= Max;
+    }
+
+    return true;
+  }
+
+  bool isAdrLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (1LL << (21 - 1));
+      int64_t Max = ((1LL << (21 - 1)) - 1);
+      return Val >= Min && Val <= Max;
+    }
+
+    return true;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));
+
+    const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+    uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister(
+        RI->getEncodingValue(getReg()));
+
+    Inst.addOperand(MCOperand::createReg(Reg));
+  }
+
+  void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::createReg(AArch64::D0 + getReg() - AArch64::Q0));
+  }
+
+  void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  template <unsigned NumRegs>
+  void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static const unsigned FirstRegs[] = { AArch64::D0,
+                                          AArch64::D0_D1,
+                                          AArch64::D0_D1_D2,
+                                          AArch64::D0_D1_D2_D3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
+  }
+
+  template <unsigned NumRegs>
+  void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static const unsigned FirstRegs[] = { AArch64::Q0,
+                                          AArch64::Q0_Q1,
+                                          AArch64::Q0_Q1_Q2,
+                                          AArch64::Q0_Q1_Q2_Q3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0));
+  }
+
+  void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // If this is a pageoff symrefexpr with an addend, adjust the addend
+    // to be only the page-offset portion. Otherwise, just add the expr
+    // as-is.
+    addExpr(Inst, getImm());
+  }
+
+  void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (isShiftedImm()) {
+      addExpr(Inst, getShiftedImmVal());
+      Inst.addOperand(MCOperand::createImm(getShiftedImmShift()));
+    } else {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::createImm(0));
+    }
+  }
+
+  void addAddSubImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    const MCExpr *MCE = isShiftedImm() ? getShiftedImmVal() : getImm();
+    const MCConstantExpr *CE = cast<MCConstantExpr>(MCE);
+    int64_t Val = -CE->getValue();
+    unsigned ShiftAmt = isShiftedImm() ? ShiftedImm.ShiftAmount : 0;
+
+    Inst.addOperand(MCOperand::createImm(Val));
+    Inst.addOperand(MCOperand::createImm(ShiftAmt));
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getCondCode()));
+  }
+
+  void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      addExpr(Inst, getImm());
+    else
+      Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 12));
+  }
+
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  template<int Scale>
+  void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+
+    if (!MCE) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      return;
+    }
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale));
+  }
+
+  void addSImm9Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() / 4));
+  }
+
+  void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8));
+  }
+
+  void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16));
+  }
+
+  void addImm0_1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_7Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm1_64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_255Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addImm32_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(MCE->getValue()));
+  }
+
+  void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding =
+        AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
+    Inst.addOperand(MCOperand::createImm(encoding));
+  }
+
+  void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
+    Inst.addOperand(MCOperand::createImm(encoding));
+  }
+
+  void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    int64_t Val = ~MCE->getValue() & 0xFFFFFFFF;
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32);
+    Inst.addOperand(MCOperand::createImm(encoding));
+  }
+
+  void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding =
+        AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64);
+    Inst.addOperand(MCOperand::createImm(encoding));
+  }
+
+  void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
+    uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
+    Inst.addOperand(MCOperand::createImm(encoding));
+  }
+
+  void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+  }
+
+  void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+  }
+
+  void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getFPImm()));
+  }
+
+  void addBarrierOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getBarrier()));
+  }
+
+  void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(SysReg.MRSReg));
+  }
+
+  void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(SysReg.MSRReg));
+  }
+
+  void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
+  }
+
+  void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createImm(SysReg.PStateField));
+  }
+
+  void addSysCROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getSysCR()));
+  }
+
+  void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getPrefetch()));
+  }
+
+  void addPSBHintOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getPSBHint()));
+  }
+
+  void addShifterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned Imm =
+        AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount());
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addExtend64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addMemExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::createImm(IsSigned));
+    Inst.addOperand(MCOperand::createImm(getShiftExtendAmount() != 0));
+  }
+
+  // For 8-bit load/store instructions with a register offset, both the
+  // "DoShift" and "NoShift" variants have a shift of 0. Because of this,
+  // they're disambiguated by whether the shift was explicit or implicit rather
+  // than its size.
+  void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::createImm(IsSigned));
+    Inst.addOperand(MCOperand::createImm(hasShiftExtendAmount()));
+  }
+
+  template<int Shift>
+  void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff));
+  }
+
+  template<int Shift>
+  void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::createImm((~Value >> Shift) & 0xffff));
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<AArch64Operand>
+  CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Token, Ctx);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->Tok.IsSuffix = IsSuffix;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Register, Ctx);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.isVector = isVector;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
+                   char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.NumElements = NumElements;
+    Op->VectorList.ElementKind = ElementKind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                   SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
+                                                          unsigned ShiftAmount,
+                                                          SMLoc S, SMLoc E,
+                                                          MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
+    Op->ShiftedImm .Val = Val;
+    Op->ShiftedImm.ShiftAmount = ShiftAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx);
+    Op->CondCode.Code = Code;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S,
+                                                     MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx);
+    Op->FPImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val,
+                                                       StringRef Str,
+                                                       SMLoc S,
+                                                       MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Barrier, Ctx);
+    Op->Barrier.Val = Val;
+    Op->Barrier.Data = Str.data();
+    Op->Barrier.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S,
+                                                      uint32_t MRSReg,
+                                                      uint32_t MSRReg,
+                                                      uint32_t PStateField,
+                                                      MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx);
+    Op->SysReg.Data = Str.data();
+    Op->SysReg.Length = Str.size();
+    Op->SysReg.MRSReg = MRSReg;
+    Op->SysReg.MSRReg = MSRReg;
+    Op->SysReg.PStateField = PStateField;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
+                                                     SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx);
+    Op->SysCRImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val,
+                                                        StringRef Str,
+                                                        SMLoc S,
+                                                        MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
+    Op->Prefetch.Val = Val;
+    Op->Barrier.Data = Str.data();
+    Op->Barrier.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
+                                                       StringRef Str,
+                                                       SMLoc S,
+                                                       MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx);
+    Op->PSBHint.Val = Val;
+    Op->PSBHint.Data = Str.data();
+    Op->PSBHint.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<AArch64Operand>
+  CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
+                    bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
+    Op->ShiftExtend.Type = ShOp;
+    Op->ShiftExtend.Amount = Val;
+    Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    StringRef Name = getBarrierName();
+    if (!Name.empty())
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    OS << *getImm();
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm ";
+    OS << *getShiftedImmVal();
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    StringRef Name = getPrefetchName();
+    if (!Name.empty())
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_PSBHint: {
+    OS << getPSBHintName();
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name.lower())
+      .Case("v0", AArch64::Q0)
+      .Case("v1", AArch64::Q1)
+      .Case("v2", AArch64::Q2)
+      .Case("v3", AArch64::Q3)
+      .Case("v4", AArch64::Q4)
+      .Case("v5", AArch64::Q5)
+      .Case("v6", AArch64::Q6)
+      .Case("v7", AArch64::Q7)
+      .Case("v8", AArch64::Q8)
+      .Case("v9", AArch64::Q9)
+      .Case("v10", AArch64::Q10)
+      .Case("v11", AArch64::Q11)
+      .Case("v12", AArch64::Q12)
+      .Case("v13", AArch64::Q13)
+      .Case("v14", AArch64::Q14)
+      .Case("v15", AArch64::Q15)
+      .Case("v16", AArch64::Q16)
+      .Case("v17", AArch64::Q17)
+      .Case("v18", AArch64::Q18)
+      .Case("v19", AArch64::Q19)
+      .Case("v20", AArch64::Q20)
+      .Case("v21", AArch64::Q21)
+      .Case("v22", AArch64::Q22)
+      .Case("v23", AArch64::Q23)
+      .Case("v24", AArch64::Q24)
+      .Case("v25", AArch64::Q25)
+      .Case("v26", AArch64::Q26)
+      .Case("v27", AArch64::Q27)
+      .Case("v28", AArch64::Q28)
+      .Case("v29", AArch64::Q29)
+      .Case("v30", AArch64::Q30)
+      .Case("v31", AArch64::Q31)
+      .Default(0);
+}
+
+static bool isValidVectorKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".8b", true)
+      .Case(".16b", true)
+      .Case(".4h", true)
+      .Case(".8h", true)
+      .Case(".2s", true)
+      .Case(".4s", true)
+      .Case(".1d", true)
+      .Case(".2d", true)
+      .Case(".1q", true)
+      // Accept the width neutral ones, too, for verbose syntax. If those
+      // aren't used in the right places, the token operand won't match so
+      // all will work out.
+      .Case(".b", true)
+      .Case(".h", true)
+      .Case(".s", true)
+      .Case(".d", true)
+      // Needed for fp16 scalar pairwise reductions
+      .Case(".2h", true)
+      .Default(false);
+}
+
+static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+                                 char &ElementKind) {
+  assert(isValidVectorKind(Name));
+
+  ElementKind = Name.lower()[Name.size() - 1];
+  NumElements = 0;
+
+  if (Name.size() == 2)
+    return;
+
+  // Parse the lane count
+  Name = Name.drop_front();
+  while (isdigit(Name.front())) {
+    NumElements = 10 * NumElements + (Name.front() - '0');
+    Name = Name.drop_front();
+  }
+}
+
+bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                     SMLoc &EndLoc) {
+  StartLoc = getLoc();
+  RegNo = tryParseRegister();
+  EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  return (RegNo == (unsigned)-1);
+}
+
+// Matches a register name or register alias previously defined by '.req'
+unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
+                                                  bool isVector) {
+  unsigned RegNum = isVector ? matchVectorRegName(Name)
+                             : MatchRegisterName(Name);
+
+  if (RegNum == 0) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    auto Entry = RegisterReqs.find(Name.lower());
+    if (Entry == RegisterReqs.end())
+      return 0;
+    // set RegNum if the match is the right kind of register
+    if (isVector == Entry->getValue().first)
+      RegNum = Entry->getValue().second;
+  }
+  return RegNum;
+}
+
+/// tryParseRegister - Try to parse a register name. The token must be an
+/// Identifier when called, and if it is a register name the token is eaten and
+/// the register is added to the operand list.
+int AArch64AsmParser::tryParseRegister() {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return -1;
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = matchRegisterNameAlias(lowerCase, false);
+  // Also handle a few aliases of registers.
+  if (RegNum == 0)
+    RegNum = StringSwitch<unsigned>(lowerCase)
+                 .Case("fp",  AArch64::FP)
+                 .Case("lr",  AArch64::LR)
+                 .Case("x31", AArch64::XZR)
+                 .Case("w31", AArch64::WZR)
+                 .Default(0);
+
+  if (RegNum == 0)
+    return -1;
+
+  Parser.Lex(); // Eat identifier token.
+  return RegNum;
+}
+
+/// tryMatchVectorRegister - Try to parse a vector register name with optional
+/// kind specifier. If it is a register specifier, eat the token and return it.
+int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    TokError("vector register expected");
+    return -1;
+  }
+
+  StringRef Name = Parser.getTok().getString();
+  // If there is a kind specifier, it's separated from the register name by
+  // a '.'.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+  unsigned RegNum = matchRegisterNameAlias(Head, true);
+
+  if (RegNum) {
+    if (Next != StringRef::npos) {
+      Kind = Name.slice(Next, StringRef::npos);
+      if (!isValidVectorKind(Kind)) {
+        TokError("invalid vector kind qualifier");
+        return -1;
+      }
+    }
+    Parser.Lex(); // Eat the register token.
+    return RegNum;
+  }
+
+  if (expected)
+    TokError("vector register expected");
+  return -1;
+}
+
+/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
+OperandMatchResultTy
+AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  StringRef Tok = Parser.getTok().getIdentifier();
+  if (Tok[0] != 'c' && Tok[0] != 'C') {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  uint32_t CRNum;
+  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
+  if (BadNum || CRNum > 15) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParsePrefetch - Try to parse a prefetch operand.
+OperandMatchResultTy
+AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  // Either an identifier for named values or a 5-bit immediate.
+  // Eat optional hash.
+  if (parseOptionalToken(AsmToken::Hash) ||
+      Tok.is(AsmToken::Integer)) {
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for prefetch operand");
+      return MatchOperand_ParseFail;
+    }
+    unsigned prfop = MCE->getValue();
+    if (prfop > 31) {
+      TokError("prefetch operand out of range, [0,31] expected");
+      return MatchOperand_ParseFail;
+    }
+
+    auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue());
+    Operands.push_back(AArch64Operand::CreatePrefetch(
+        prfop, PRFM ? PRFM->Name : "", S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("pre-fetch hint expected");
+    return MatchOperand_ParseFail;
+  }
+
+  auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString());
+  if (!PRFM) {
+    TokError("pre-fetch hint expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePrefetch(
+      PRFM->Encoding, Tok.getString(), S, getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command
+OperandMatchResultTy
+AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString());
+  if (!PSB) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePSBHint(
+      PSB->Encoding, Tok.getString(), S, getContext()));
+  return MatchOperand_Success;
+}
+
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
+OperandMatchResultTy
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
+
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
+  }
+
+  if (parseSymbolicImmVal(Expr))
+    return MatchOperand_ParseFail;
+
+  AArch64MCExpr::VariantKind ELFRefKind;
+  MCSymbolRefExpr::VariantKind DarwinRefKind;
+  int64_t Addend;
+  if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+    if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+        ELFRefKind == AArch64MCExpr::VK_INVALID) {
+      // No modifier was specified at all; this is the syntax for an ELF basic
+      // ADRP relocation (unfortunately).
+      Expr =
+          AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+    } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+                DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+               Addend != 0) {
+      Error(S, "gotpage label reference not allowed an addend");
+      return MatchOperand_ParseFail;
+    } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+      // The operand must be an @page or @gotpage qualified symbolref.
+      Error(S, "page or gotpage label reference expected");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  // We have either a label reference possibly with addend or an immediate. The
+  // addend is a raw value here. The linker will adjust it to only reference the
+  // page.
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+  return MatchOperand_Success;
+}
+
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
+OperandMatchResultTy
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
+
+  parseOptionalToken(AsmToken::Hash);
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
+
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
+  return MatchOperand_Success;
+}
+
+/// tryParseFPImm - A floating point immediate expression operand.
+OperandMatchResultTy
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+
+  bool Hash = parseOptionalToken(AsmToken::Hash);
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = parseOptionalToken(AsmToken::Minus);
+
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+    if (isNegative)
+      RealVal.changeSign();
+
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    Parser.Lex(); // Eat the token.
+    // Check for out of range values. As an exception, we let Zero through,
+    // as we handle that special case in post-processing before matching in
+    // order to use the zero register for it.
+    if (Val == -1 && !RealVal.isPosZero()) {
+      TokError("expected compatible register or floating-point constant");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val;
+    if (!isNegative && Tok.getString().startswith("0x")) {
+      Val = Tok.getIntVal();
+      if (Val > 255 || Val < 0) {
+        TokError("encoded floating point value out of range");
+        return MatchOperand_ParseFail;
+      }
+    } else {
+      APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      // If we had a '-' in front, toggle the sign bit.
+      IntVal ^= (uint64_t)isNegative << 63;
+      Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    }
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (!Hash)
+    return MatchOperand_NoMatch;
+
+  TokError("invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
+OperandMatchResultTy
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat '#'
+  else if (Parser.getTok().isNot(AsmToken::Integer))
+    // Operand should start from # or should be integer, emit error otherwise.
+    return MatchOperand_NoMatch;
+
+  const MCExpr *Imm;
+  if (parseSymbolicImmVal(Imm))
+    return MatchOperand_ParseFail;
+  else if (Parser.getTok().isNot(AsmToken::Comma)) {
+    uint64_t ShiftAmount = 0;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+    if (MCE) {
+      int64_t Val = MCE->getValue();
+      if (Val > 0xfff && (Val & 0xfff) == 0) {
+        Imm = MCConstantExpr::create(Val >> 12, getContext());
+        ShiftAmount = 12;
+      }
+    }
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+                                                        getContext()));
+    return MatchOperand_Success;
+  }
+
+  // Eat ','
+  Parser.Lex();
+
+  // The optional operand must be "lsl #N" where N is non-negative.
+  if (!Parser.getTok().is(AsmToken::Identifier) ||
+      !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  // Eat 'lsl'
+  Parser.Lex();
+
+  parseOptionalToken(AsmToken::Hash);
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+  if (ShiftAmount < 0) {
+    Error(Parser.getTok().getLoc(), "positive shift amount required");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the number
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+                                                      S, E, getContext()));
+  return MatchOperand_Success;
+}
+
+/// parseCondCodeString - Parse a Condition Code string.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+  AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+                    .Case("eq", AArch64CC::EQ)
+                    .Case("ne", AArch64CC::NE)
+                    .Case("cs", AArch64CC::HS)
+                    .Case("hs", AArch64CC::HS)
+                    .Case("cc", AArch64CC::LO)
+                    .Case("lo", AArch64CC::LO)
+                    .Case("mi", AArch64CC::MI)
+                    .Case("pl", AArch64CC::PL)
+                    .Case("vs", AArch64CC::VS)
+                    .Case("vc", AArch64CC::VC)
+                    .Case("hi", AArch64CC::HI)
+                    .Case("ls", AArch64CC::LS)
+                    .Case("ge", AArch64CC::GE)
+                    .Case("lt", AArch64CC::LT)
+                    .Case("gt", AArch64CC::GT)
+                    .Case("le", AArch64CC::LE)
+                    .Case("al", AArch64CC::AL)
+                    .Case("nv", AArch64CC::NV)
+                    .Default(AArch64CC::Invalid);
+  return CC;
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+                                     bool invertCondCode) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  StringRef Cond = Tok.getString();
+  AArch64CC::CondCode CC = parseCondCodeString(Cond);
+  if (CC == AArch64CC::Invalid)
+    return TokError("invalid condition code");
+  Parser.Lex(); // Eat identifier token.
+
+  if (invertCondCode) {
+    if (CC == AArch64CC::AL || CC == AArch64CC::NV)
+      return TokError("condition codes AL and NV are invalid for this instruction");
+    CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
+  return false;
+}
+
+/// tryParseOptionalShift - Some operands take an optional shift argument. Parse
+/// them if present.
+OperandMatchResultTy
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  std::string LowerID = Tok.getString().lower();
+  AArch64_AM::ShiftExtendType ShOp =
+      StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+          .Case("lsl", AArch64_AM::LSL)
+          .Case("lsr", AArch64_AM::LSR)
+          .Case("asr", AArch64_AM::ASR)
+          .Case("ror", AArch64_AM::ROR)
+          .Case("msl", AArch64_AM::MSL)
+          .Case("uxtb", AArch64_AM::UXTB)
+          .Case("uxth", AArch64_AM::UXTH)
+          .Case("uxtw", AArch64_AM::UXTW)
+          .Case("uxtx", AArch64_AM::UXTX)
+          .Case("sxtb", AArch64_AM::SXTB)
+          .Case("sxth", AArch64_AM::SXTH)
+          .Case("sxtw", AArch64_AM::SXTW)
+          .Case("sxtx", AArch64_AM::SXTX)
+          .Default(AArch64_AM::InvalidShiftExtend);
+
+  if (ShOp == AArch64_AM::InvalidShiftExtend)
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Tok.getLoc();
+  Parser.Lex();
+
+  bool Hash = parseOptionalToken(AsmToken::Hash);
+
+  if (!Hash && getLexer().isNot(AsmToken::Integer)) {
+    if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+        ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+        ShOp == AArch64_AM::MSL) {
+      // We expect a number here.
+      TokError("expected #imm after shift specifier");
+      return MatchOperand_ParseFail;
+    }
+
+    // "extend" type operations don't need an immediate, #0 is implicit.
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(
+        AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+    return MatchOperand_Success;
+  }
+
+  // Make sure we do actually have a number, identifier or a parenthesized
+  // expression.
+  SMLoc E = Parser.getTok().getLoc();
+  if (!Parser.getTok().is(AsmToken::Integer) &&
+      !Parser.getTok().is(AsmToken::LParen) &&
+      !Parser.getTok().is(AsmToken::Identifier)) {
+    Error(E, "expected integer shift amount");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *ImmVal;
+  if (getParser().parseExpression(ImmVal))
+    return MatchOperand_ParseFail;
+
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!MCE) {
+    Error(E, "expected constant '#imm' after shift specifier");
+    return MatchOperand_ParseFail;
+  }
+
+  E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateShiftExtend(
+      ShOp, MCE->getValue(), true, S, E, getContext()));
+  return MatchOperand_Success;
+}
+
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+                                   OperandVector &Operands) {
+  if (Name.find('.') != StringRef::npos)
+    return TokError("invalid operand");
+
+  Mnemonic = Name;
+  Operands.push_back(
+      AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  StringRef Op = Tok.getString();
+  SMLoc S = Tok.getLoc();
+
+  const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
+  do {                                                                         \
+    Expr = MCConstantExpr::create(op1, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
+    Expr = MCConstantExpr::create(op2, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+  } while (0)
+
+  if (Mnemonic == "ic") {
+    if (!Op.compare_lower("ialluis")) {
+      // SYS #0, C7, C1, #0
+      SYS_ALIAS(0, 7, 1, 0);
+    } else if (!Op.compare_lower("iallu")) {
+      // SYS #0, C7, C5, #0
+      SYS_ALIAS(0, 7, 5, 0);
+    } else if (!Op.compare_lower("ivau")) {
+      // SYS #3, C7, C5, #1
+      SYS_ALIAS(3, 7, 5, 1);
+    } else {
+      return TokError("invalid operand for IC instruction");
+    }
+  } else if (Mnemonic == "dc") {
+    if (!Op.compare_lower("zva")) {
+      // SYS #3, C7, C4, #1
+      SYS_ALIAS(3, 7, 4, 1);
+    } else if (!Op.compare_lower("ivac")) {
+      // SYS #3, C7, C6, #1
+      SYS_ALIAS(0, 7, 6, 1);
+    } else if (!Op.compare_lower("isw")) {
+      // SYS #0, C7, C6, #2
+      SYS_ALIAS(0, 7, 6, 2);
+    } else if (!Op.compare_lower("cvac")) {
+      // SYS #3, C7, C10, #1
+      SYS_ALIAS(3, 7, 10, 1);
+    } else if (!Op.compare_lower("csw")) {
+      // SYS #0, C7, C10, #2
+      SYS_ALIAS(0, 7, 10, 2);
+    } else if (!Op.compare_lower("cvau")) {
+      // SYS #3, C7, C11, #1
+      SYS_ALIAS(3, 7, 11, 1);
+    } else if (!Op.compare_lower("civac")) {
+      // SYS #3, C7, C14, #1
+      SYS_ALIAS(3, 7, 14, 1);
+    } else if (!Op.compare_lower("cisw")) {
+      // SYS #0, C7, C14, #2
+      SYS_ALIAS(0, 7, 14, 2);
+    } else if (!Op.compare_lower("cvap")) {
+      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+        // SYS #3, C7, C12, #1
+        SYS_ALIAS(3, 7, 12, 1);
+      } else {
+        return TokError("DC CVAP requires ARMv8.2a");
+      }
+    } else {
+      return TokError("invalid operand for DC instruction");
+    }
+  } else if (Mnemonic == "at") {
+    if (!Op.compare_lower("s1e1r")) {
+      // SYS #0, C7, C8, #0
+      SYS_ALIAS(0, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e2r")) {
+      // SYS #4, C7, C8, #0
+      SYS_ALIAS(4, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e3r")) {
+      // SYS #6, C7, C8, #0
+      SYS_ALIAS(6, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e1w")) {
+      // SYS #0, C7, C8, #1
+      SYS_ALIAS(0, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e2w")) {
+      // SYS #4, C7, C8, #1
+      SYS_ALIAS(4, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e3w")) {
+      // SYS #6, C7, C8, #1
+      SYS_ALIAS(6, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e0r")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 2);
+    } else if (!Op.compare_lower("s1e0w")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 3);
+    } else if (!Op.compare_lower("s12e1r")) {
+      // SYS #4, C7, C8, #4
+      SYS_ALIAS(4, 7, 8, 4);
+    } else if (!Op.compare_lower("s12e1w")) {
+      // SYS #4, C7, C8, #5
+      SYS_ALIAS(4, 7, 8, 5);
+    } else if (!Op.compare_lower("s12e0r")) {
+      // SYS #4, C7, C8, #6
+      SYS_ALIAS(4, 7, 8, 6);
+    } else if (!Op.compare_lower("s12e0w")) {
+      // SYS #4, C7, C8, #7
+      SYS_ALIAS(4, 7, 8, 7);
+    } else if (!Op.compare_lower("s1e1rp")) {
+      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+        // SYS #0, C7, C9, #0
+        SYS_ALIAS(0, 7, 9, 0);
+      } else {
+        return TokError("AT S1E1RP requires ARMv8.2a");
+      }
+    } else if (!Op.compare_lower("s1e1wp")) {
+      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
+        // SYS #0, C7, C9, #1
+        SYS_ALIAS(0, 7, 9, 1);
+      } else {
+        return TokError("AT S1E1WP requires ARMv8.2a");
+      }
+    } else {
+      return TokError("invalid operand for AT instruction");
+    }
+  } else if (Mnemonic == "tlbi") {
+    if (!Op.compare_lower("vmalle1is")) {
+      // SYS #0, C8, C3, #0
+      SYS_ALIAS(0, 8, 3, 0);
+    } else if (!Op.compare_lower("alle2is")) {
+      // SYS #4, C8, C3, #0
+      SYS_ALIAS(4, 8, 3, 0);
+    } else if (!Op.compare_lower("alle3is")) {
+      // SYS #6, C8, C3, #0
+      SYS_ALIAS(6, 8, 3, 0);
+    } else if (!Op.compare_lower("vae1is")) {
+      // SYS #0, C8, C3, #1
+      SYS_ALIAS(0, 8, 3, 1);
+    } else if (!Op.compare_lower("vae2is")) {
+      // SYS #4, C8, C3, #1
+      SYS_ALIAS(4, 8, 3, 1);
+    } else if (!Op.compare_lower("vae3is")) {
+      // SYS #6, C8, C3, #1
+      SYS_ALIAS(6, 8, 3, 1);
+    } else if (!Op.compare_lower("aside1is")) {
+      // SYS #0, C8, C3, #2
+      SYS_ALIAS(0, 8, 3, 2);
+    } else if (!Op.compare_lower("vaae1is")) {
+      // SYS #0, C8, C3, #3
+      SYS_ALIAS(0, 8, 3, 3);
+    } else if (!Op.compare_lower("alle1is")) {
+      // SYS #4, C8, C3, #4
+      SYS_ALIAS(4, 8, 3, 4);
+    } else if (!Op.compare_lower("vale1is")) {
+      // SYS #0, C8, C3, #5
+      SYS_ALIAS(0, 8, 3, 5);
+    } else if (!Op.compare_lower("vaale1is")) {
+      // SYS #0, C8, C3, #7
+      SYS_ALIAS(0, 8, 3, 7);
+    } else if (!Op.compare_lower("vmalle1")) {
+      // SYS #0, C8, C7, #0
+      SYS_ALIAS(0, 8, 7, 0);
+    } else if (!Op.compare_lower("alle2")) {
+      // SYS #4, C8, C7, #0
+      SYS_ALIAS(4, 8, 7, 0);
+    } else if (!Op.compare_lower("vale2is")) {
+      // SYS #4, C8, C3, #5
+      SYS_ALIAS(4, 8, 3, 5);
+    } else if (!Op.compare_lower("vale3is")) {
+      // SYS #6, C8, C3, #5
+      SYS_ALIAS(6, 8, 3, 5);
+    } else if (!Op.compare_lower("alle3")) {
+      // SYS #6, C8, C7, #0
+      SYS_ALIAS(6, 8, 7, 0);
+    } else if (!Op.compare_lower("vae1")) {
+      // SYS #0, C8, C7, #1
+      SYS_ALIAS(0, 8, 7, 1);
+    } else if (!Op.compare_lower("vae2")) {
+      // SYS #4, C8, C7, #1
+      SYS_ALIAS(4, 8, 7, 1);
+    } else if (!Op.compare_lower("vae3")) {
+      // SYS #6, C8, C7, #1
+      SYS_ALIAS(6, 8, 7, 1);
+    } else if (!Op.compare_lower("aside1")) {
+      // SYS #0, C8, C7, #2
+      SYS_ALIAS(0, 8, 7, 2);
+    } else if (!Op.compare_lower("vaae1")) {
+      // SYS #0, C8, C7, #3
+      SYS_ALIAS(0, 8, 7, 3);
+    } else if (!Op.compare_lower("alle1")) {
+      // SYS #4, C8, C7, #4
+      SYS_ALIAS(4, 8, 7, 4);
+    } else if (!Op.compare_lower("vale1")) {
+      // SYS #0, C8, C7, #5
+      SYS_ALIAS(0, 8, 7, 5);
+    } else if (!Op.compare_lower("vale2")) {
+      // SYS #4, C8, C7, #5
+      SYS_ALIAS(4, 8, 7, 5);
+    } else if (!Op.compare_lower("vale3")) {
+      // SYS #6, C8, C7, #5
+      SYS_ALIAS(6, 8, 7, 5);
+    } else if (!Op.compare_lower("vaale1")) {
+      // SYS #0, C8, C7, #7
+      SYS_ALIAS(0, 8, 7, 7);
+    } else if (!Op.compare_lower("ipas2e1")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 4, 1);
+    } else if (!Op.compare_lower("ipas2le1")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 4, 5);
+    } else if (!Op.compare_lower("ipas2e1is")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 0, 1);
+    } else if (!Op.compare_lower("ipas2le1is")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 0, 5);
+    } else if (!Op.compare_lower("vmalls12e1")) {
+      // SYS #4, C8, C7, #6
+      SYS_ALIAS(4, 8, 7, 6);
+    } else if (!Op.compare_lower("vmalls12e1is")) {
+      // SYS #4, C8, C3, #6
+      SYS_ALIAS(4, 8, 3, 6);
+    } else {
+      return TokError("invalid operand for TLBI instruction");
+    }
+  }
+
+#undef SYS_ALIAS
+
+  Parser.Lex(); // Eat operand.
+
+  bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
+  bool HasRegister = false;
+
+  // Check for the optional register operand.
+  if (parseOptionalToken(AsmToken::Comma)) {
+    if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+      return TokError("expected register operand");
+    HasRegister = true;
+  }
+
+  if (ExpectRegister && !HasRegister) {
+    return TokError("specified " + Mnemonic + " op requires a register");
+  }
+  else if (!ExpectRegister && HasRegister) {
+    return TokError("specified " + Mnemonic + " op does not use a register");
+  }
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+    return true;
+
+  return false;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+
+  // Can be either a #imm style literal or an option name
+  if (parseOptionalToken(AsmToken::Hash) ||
+      Tok.is(AsmToken::Integer)) {
+    // Immediate operand.
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = getLoc();
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      Error(ExprLoc, "immediate value expected for barrier operand");
+      return MatchOperand_ParseFail;
+    }
+    if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+      Error(ExprLoc, "barrier operand out of range");
+      return MatchOperand_ParseFail;
+    }
+    auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue());
+    Operands.push_back(AArch64Operand::CreateBarrier(
+        MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
+
+  auto DB = AArch64DB::lookupDBByName(Tok.getString());
+  if (!DB) {
+    TokError("invalid barrier option name");
+    return MatchOperand_ParseFail;
+  }
+
+  // The only valid named option for ISB is 'sy'
+  if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
+    TokError("'sy' or #imm operand expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AArch64Operand::CreateBarrier(
+      DB->Encoding, Tok.getString(), getLoc(), getContext()));
+  Parser.Lex(); // Consume the option
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int MRSReg, MSRReg;
+  auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
+  if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) {
+    MRSReg = SysReg->Readable ? SysReg->Encoding : -1;
+    MSRReg = SysReg->Writeable ? SysReg->Encoding : -1;
+  } else
+    MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString());
+
+  auto PState = AArch64PState::lookupPStateByName(Tok.getString());
+  unsigned PStateImm = -1;
+  if (PState && PState->haveFeatures(getSTI().getFeatureBits()))
+    PStateImm = PState->Encoding;
+
+  Operands.push_back(
+      AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg,
+                                   PStateImm, getContext()));
+  Parser.Lex(); // Eat identifier
+
+  return MatchOperand_Success;
+}
+
+/// tryParseVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return true;
+
+  SMLoc S = getLoc();
+  // Check for a vector register specifier first.
+  StringRef Kind;
+  int64_t Reg = tryMatchVectorRegister(Kind, false);
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+  // If there was an explicit qualifier, that goes on as a literal text
+  // operand.
+  if (!Kind.empty())
+    Operands.push_back(
+        AArch64Operand::CreateToken(Kind, false, S, getContext()));
+
+  // If there is an index specifier following the register, parse that too.
+  SMLoc SIdx = getLoc();
+  if (parseOptionalToken(AsmToken::LBrac)) {
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
+
+    SMLoc E = getLoc();
+
+    if (parseToken(AsmToken::RBrac, "']' expected"))
+      return false;
+
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+
+  return false;
+}
+
+/// parseRegister - Parse a non-vector register operand.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = getLoc();
+  // Try for a vector register.
+  if (!tryParseVectorRegister(Operands))
+    return false;
+
+  // Try for a scalar register.
+  int64_t Reg = tryParseRegister();
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
+
+  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+  // as a string token in the instruction itself.
+  SMLoc LBracS = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (parseOptionalToken(AsmToken::LBrac)) {
+    if (Tok.is(AsmToken::Integer)) {
+      SMLoc IntS = getLoc();
+      int64_t Val = Tok.getIntVal();
+      if (Val == 1) {
+        Parser.Lex();
+        SMLoc RBracS = getLoc();
+        if (parseOptionalToken(AsmToken::RBrac)) {
+          Operands.push_back(
+              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("1", false, IntS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+          return false;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+  MCAsmParser &Parser = getParser();
+  bool HasELFModifier = false;
+  AArch64MCExpr::VariantKind RefKind;
+
+  if (parseOptionalToken(AsmToken::Colon)) {
+    HasELFModifier = true;
+
+    if (Parser.getTok().isNot(AsmToken::Identifier))
+      return TokError("expect relocation specifier in operand after ':'");
+
+    std::string LowerCase = Parser.getTok().getIdentifier().lower();
+    RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+                  .Case("lo12", AArch64MCExpr::VK_LO12)
+                  .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+                  .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+                  .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+                  .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+                  .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+                  .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+                  .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+                  .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+                  .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+                  .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+                  .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+                  .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+                  .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+                  .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+                  .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+                  .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+                  .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+                  .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+                  .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+                  .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+                  .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+                  .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+                  .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+                  .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+                  .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+                  .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+                  .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+                  .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+                  .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+                  .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+                  .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+                  .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+                  .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+                  .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+                  .Default(AArch64MCExpr::VK_INVALID);
+
+    if (RefKind == AArch64MCExpr::VK_INVALID)
+      return TokError("expect relocation specifier in operand after ':'");
+
+    Parser.Lex(); // Eat identifier
+
+    if (parseToken(AsmToken::Colon, "expect ':' after relocation specifier"))
+      return true;
+  }
+
+  if (getParser().parseExpression(ImmVal))
+    return true;
+
+  if (HasELFModifier)
+    ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());
+
+  return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat left bracket token.
+  StringRef Kind;
+  int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+  if (FirstReg == -1)
+    return true;
+  int64_t PrevReg = FirstReg;
+  unsigned Count = 1;
+
+  if (parseOptionalToken(AsmToken::Minus)) {
+    SMLoc Loc = getLoc();
+    StringRef NextKind;
+    int64_t Reg = tryMatchVectorRegister(NextKind, true);
+    if (Reg == -1)
+      return true;
+    // Any Kind suffices must match on all regs in the list.
+    if (Kind != NextKind)
+      return Error(Loc, "mismatched register size suffix");
+
+    unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
+
+    if (Space == 0 || Space > 3) {
+      return Error(Loc, "invalid number of vectors");
+    }
+
+    Count += Space;
+  }
+  else {
+    while (parseOptionalToken(AsmToken::Comma)) {
+      SMLoc Loc = getLoc();
+      StringRef NextKind;
+      int64_t Reg = tryMatchVectorRegister(NextKind, true);
+      if (Reg == -1)
+        return true;
+      // Any Kind suffices must match on all regs in the list.
+      if (Kind != NextKind)
+        return Error(Loc, "mismatched register size suffix");
+
+      // Registers must be incremental (with wraparound at 31)
+      if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+          (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+       return Error(Loc, "registers must be sequential");
+
+      PrevReg = Reg;
+      ++Count;
+    }
+  }
+
+  if (parseToken(AsmToken::RCurly, "'}' expected"))
+    return true;
+
+  if (Count > 4)
+    return Error(S, "invalid number of vectors");
+
+  unsigned NumElements = 0;
+  char ElementKind = 0;
+  if (!Kind.empty())
+    parseValidVectorKind(Kind, NumElements, ElementKind);
+
+  Operands.push_back(AArch64Operand::CreateVectorList(
+      FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+  // If there is an index specifier following the list, parse that too.
+  SMLoc SIdx = getLoc();
+  if (parseOptionalToken(AsmToken::LBrac)) { // Eat left bracket token.
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
+
+    SMLoc E = getLoc();
+    if (parseToken(AsmToken::RBrac, "']' expected"))
+      return false;
+
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+  return false;
+}
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false);
+
+  MCContext &Ctx = getContext();
+  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+  if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
+    return MatchOperand_NoMatch;
+
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat register
+
+  if (!parseOptionalToken(AsmToken::Comma)) {
+    Operands.push_back(
+        AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+    return MatchOperand_Success;
+  }
+
+  parseOptionalToken(AsmToken::Hash);
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *ImmVal;
+  if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+      cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+  return MatchOperand_Success;
+}
+
+/// parseOperand - Parse a arm instruction operand.  For now this parses the
+/// operand regardless of the mnemonic.
+bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+                                  bool invertCondCode) {
+  MCAsmParser &Parser = getParser();
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  // Nothing custom, so do general case parsing.
+  SMLoc S, E;
+  switch (getLexer().getKind()) {
+  default: {
+    SMLoc S = getLoc();
+    const MCExpr *Expr;
+    if (parseSymbolicImmVal(Expr))
+      return Error(S, "invalid operand");
+
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::LBrac: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+                                                   getContext()));
+    Parser.Lex(); // Eat '['
+
+    // There's no comma after a '[', so we can parse the next operand
+    // immediately.
+    return parseOperand(Operands, false, false);
+  }
+  case AsmToken::LCurly:
+    return parseVectorList(Operands);
+  case AsmToken::Identifier: {
+    // If we're expecting a Condition Code operand, then just parse that.
+    if (isCondCode)
+      return parseCondCode(Operands, invertCondCode);
+
+    // If it's a register name, parse it.
+    if (!parseRegister(Operands))
+      return false;
+
+    // This could be an optional "shift" or "extend" operand.
+    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+    // We can only continue if no tokens were eaten.
+    if (GotShift != MatchOperand_NoMatch)
+      return GotShift;
+
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::Integer:
+  case AsmToken::Real:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = getLoc();
+
+    parseOptionalToken(AsmToken::Hash);
+
+    // Parse a negative sign
+    bool isNegative = false;
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      isNegative = true;
+      // We need to consume this token only when we have a Real, otherwise
+      // we let parseSymbolicImmVal take care of it
+      if (Parser.getLexer().peekTok().is(AsmToken::Real))
+        Parser.Lex();
+    }
+
+    // The only Real that should come through here is a literal #0.0 for
+    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+    // so convert the value.
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Real)) {
+      APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+          Mnemonic != "fcmlt")
+        return TokError("unexpected floating point literal");
+      else if (IntVal != 0 || isNegative)
+        return TokError("expected floating-point constant #0.0");
+      Parser.Lex(); // Eat the token.
+
+      Operands.push_back(
+          AArch64Operand::CreateToken("#0", false, S, getContext()));
+      Operands.push_back(
+          AArch64Operand::CreateToken(".0", false, S, getContext()));
+      return false;
+    }
+
+    const MCExpr *ImmVal;
+    if (parseSymbolicImmVal(ImmVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
+    return false;
+  }
+  case AsmToken::Equal: {
+    SMLoc Loc = getLoc();
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return TokError("unexpected token in operand");
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    if (Operands.size() < 2 ||
+        !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+      return Error(Loc, "Only valid when first operand is register");
+
+    bool IsXReg =
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Operands[1]->getReg());
+
+    MCContext& Ctx = getContext();
+    E = SMLoc::getFromPointer(Loc.getPointer() - 1);
+    // If the op is an imm and can be fit into a mov, then replace ldr with mov.
+    if (isa<MCConstantExpr>(SubExprVal)) {
+      uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
+      uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
+      while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
+        ShiftAmt += 16;
+        Imm >>= 16;
+      }
+      if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
+          Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
+          Operands.push_back(AArch64Operand::CreateImm(
+                     MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
+        if (ShiftAmt)
+          Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
+                     ShiftAmt, true, S, E, Ctx));
+        return false;
+      }
+      APInt Simm = APInt(64, Imm << ShiftAmt);
+      // check if the immediate is an unsigned or signed 32-bit int for W regs
+      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
+        return Error(Loc, "Immediate too large for register");
+    }
+    // If it is a label or an imm that cannot fit in a movz, put it into CP.
+    const MCExpr *CPLoc =
+        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
+    Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
+    return false;
+  }
+  }
+}
+
+/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+/// operands.
+bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name, SMLoc NameLoc,
+                                        OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  Name = StringSwitch<StringRef>(Name.lower())
+             .Case("beq", "b.eq")
+             .Case("bne", "b.ne")
+             .Case("bhs", "b.hs")
+             .Case("bcs", "b.cs")
+             .Case("blo", "b.lo")
+             .Case("bcc", "b.cc")
+             .Case("bmi", "b.mi")
+             .Case("bpl", "b.pl")
+             .Case("bvs", "b.vs")
+             .Case("bvc", "b.vc")
+             .Case("bhi", "b.hi")
+             .Case("bls", "b.ls")
+             .Case("bge", "b.ge")
+             .Case("blt", "b.lt")
+             .Case("bgt", "b.gt")
+             .Case("ble", "b.le")
+             .Case("bal", "b.al")
+             .Case("bnv", "b.nv")
+             .Default(Name);
+
+  // First check for the AArch64-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+
+  // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+  if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi")
+    return parseSysAlias(Head, NameLoc, Operands);
+
+  Operands.push_back(
+      AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+  Mnemonic = Head;
+
+  // Handle condition codes for a branch mnemonic
+  if (Head == "b" && Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start + 1, Next);
+
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()));
+    AArch64CC::CondCode CC = parseCondCodeString(Head);
+    if (CC == AArch64CC::Invalid)
+      return Error(SuffixLoc, "invalid condition code");
+    Operands.push_back(
+        AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
+    Operands.push_back(
+        AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start, Next);
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()) + 1);
+    Operands.push_back(
+        AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+  }
+
+  // Conditional compare instructions have a Condition Code operand, which needs
+  // to be parsed and an immediate operand created.
+  bool condCodeFourthOperand =
+      (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+       Head == "csinc" || Head == "csinv" || Head == "csneg");
+
+  // These instructions are aliases to some of the conditional select
+  // instructions. However, the condition code is inverted in the aliased
+  // instruction.
+  //
+  // FIXME: Is this the correct way to handle these? Or should the parser
+  //        generate the aliased instructions directly?
+  bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
+  bool condCodeThirdOperand =
+      (Head == "cinc" || Head == "cinv" || Head == "cneg");
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, false, false)) {
+      return true;
+    }
+
+    unsigned N = 2;
+    while (parseOptionalToken(AsmToken::Comma)) {
+      // Parse and remember the operand.
+      if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
+                                     (N == 3 && condCodeThirdOperand) ||
+                                     (N == 2 && condCodeSecondOperand),
+                       condCodeSecondOperand || condCodeThirdOperand)) {
+        return true;
+      }
+
+      // After successfully parsing some operands there are two special cases to
+      // consider (i.e. notional operands not separated by commas). Both are due
+      // to memory specifiers:
+      //  + An RBrac will end an address for load/store/prefetch
+      //  + An '!' will indicate a pre-indexed operation.
+      //
+      // It's someone else's responsibility to make sure these tokens are sane
+      // in the given context!
+
+      SMLoc RLoc = Parser.getTok().getLoc();
+      if (parseOptionalToken(AsmToken::RBrac))
+        Operands.push_back(
+            AArch64Operand::CreateToken("]", false, RLoc, getContext()));
+      SMLoc ELoc = Parser.getTok().getLoc();
+      if (parseOptionalToken(AsmToken::Exclaim))
+        Operands.push_back(
+            AArch64Operand::CreateToken("!", false, ELoc, getContext()));
+
+      ++N;
+    }
+  }
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+    return true;
+
+  return false;
+}
+
+// FIXME: This entire function is a giant hack to provide us with decent
+// operand range validation/diagnostics until TableGen/MC can be extended
+// to support autogeneration of this kind of validation.
+bool AArch64AsmParser::validateInstruction(MCInst &Inst,
+                                         SmallVectorImpl<SMLoc> &Loc) {
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  // Check for indexed addressing modes w/ the base register being the
+  // same as a destination/source register or pair load where
+  // the Rt == Rt2. All of those are undefined behaviour.
+  switch (Inst.getOpcode()) {
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::LDPXpost:
+  case AArch64::LDPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    LLVM_FALLTHROUGH;
+  }
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi: {
+    unsigned Rt = Inst.getOperand(0).getReg();
+    unsigned Rt2 = Inst.getOperand(1).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::LDPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::LDPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::LDPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::LDPSWpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::STPDpost:
+  case AArch64::STPDpre:
+  case AArch64::STPQpost:
+  case AArch64::STPQpre:
+  case AArch64::STPSpost:
+  case AArch64::STPSpre:
+  case AArch64::STPWpost:
+  case AArch64::STPWpre:
+  case AArch64::STPXpost:
+  case AArch64::STPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::LDRBBpre:
+  case AArch64::LDRBpre:
+  case AArch64::LDRHHpre:
+  case AArch64::LDRHpre:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRBBpost:
+  case AArch64::LDRBpost:
+  case AArch64::LDRHHpost:
+  case AArch64::LDRHpost:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRXpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::STRBBpost:
+  case AArch64::STRBpost:
+  case AArch64::STRHHpost:
+  case AArch64::STRHpost:
+  case AArch64::STRWpost:
+  case AArch64::STRXpost:
+  case AArch64::STRBBpre:
+  case AArch64::STRBpre:
+  case AArch64::STRHHpre:
+  case AArch64::STRHpre:
+  case AArch64::STRWpre:
+  case AArch64::STRXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  }
+
+  // Now check immediate ranges. Separate from the above as there is overlap
+  // in the instructions being checked and this keeps the nested conditionals
+  // to a minimum.
+  switch (Inst.getOpcode()) {
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXri:
+  case AArch64::ADDWri:
+  case AArch64::ADDXri:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+  case AArch64::SUBWri:
+  case AArch64::SUBXri: {
+    // Annoyingly we can't do this in the isAddSubImm predicate, so there is
+    // some slight duplication here.
+    if (Inst.getOperand(2).isExpr()) {
+      const MCExpr *Expr = Inst.getOperand(2).getExpr();
+      AArch64MCExpr::VariantKind ELFRefKind;
+      MCSymbolRefExpr::VariantKind DarwinRefKind;
+      int64_t Addend;
+      if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+
+        // Only allow these with ADDXri.
+        if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+             DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
+            Inst.getOpcode() == AArch64::ADDXri)
+          return false;
+
+        // Only allow these with ADDXri/ADDWri
+        if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+             ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+             ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+             ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+             ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+             ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+             ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+             ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+            (Inst.getOpcode() == AArch64::ADDXri ||
+             Inst.getOpcode() == AArch64::ADDWri))
+          return false;
+
+        // Don't allow symbol refs in the immediate field otherwise
+        // Note: Loc.back() may be Loc[1] or Loc[2] depending on the number of
+        // operands of the original instruction (i.e. 'add w0, w1, borked' vs
+        // 'cmp w0, 'borked')
+        return Error(Loc.back(), "invalid immediate expression");
+      }
+      // We don't validate more complex expressions here
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+  switch (ErrCode) {
+  case Match_MissingFeature:
+    return Error(Loc,
+                 "instruction requires a CPU feature not currently enabled");
+  case Match_InvalidOperand:
+    return Error(Loc, "invalid operand for instruction");
+  case Match_InvalidSuffix:
+    return Error(Loc, "invalid type suffix for instruction");
+  case Match_InvalidCondCode:
+    return Error(Loc, "expected AArch64 condition code");
+  case Match_AddSubRegExtendSmall:
+    return Error(Loc,
+      "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubRegExtendLarge:
+    return Error(Loc,
+      "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubSecondSource:
+    return Error(Loc,
+      "expected compatible register, symbol or integer in range [0, 4095]");
+  case Match_LogicalSecondSource:
+    return Error(Loc, "expected compatible register or logical immediate");
+  case Match_InvalidMovImm32Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0 or 16");
+  case Match_InvalidMovImm64Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48");
+  case Match_AddSubRegShift32:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
+  case Match_AddSubRegShift64:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
+  case Match_InvalidFPImm:
+    return Error(Loc,
+                 "expected compatible register or floating-point constant");
+  case Match_InvalidMemoryIndexedSImm9:
+    return Error(Loc, "index must be an integer in range [-256, 255].");
+  case Match_InvalidMemoryIndexed4SImm7:
+    return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
+  case Match_InvalidMemoryIndexed8SImm7:
+    return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
+  case Match_InvalidMemoryIndexed16SImm7:
+    return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+  case Match_InvalidMemoryWExtend8:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0");
+  case Match_InvalidMemoryWExtend16:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
+  case Match_InvalidMemoryWExtend32:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
+  case Match_InvalidMemoryWExtend64:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
+  case Match_InvalidMemoryWExtend128:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4");
+  case Match_InvalidMemoryXExtend8:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0");
+  case Match_InvalidMemoryXExtend16:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
+  case Match_InvalidMemoryXExtend32:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
+  case Match_InvalidMemoryXExtend64:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
+  case Match_InvalidMemoryXExtend128:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
+  case Match_InvalidMemoryIndexed1:
+    return Error(Loc, "index must be an integer in range [0, 4095].");
+  case Match_InvalidMemoryIndexed2:
+    return Error(Loc, "index must be a multiple of 2 in range [0, 8190].");
+  case Match_InvalidMemoryIndexed4:
+    return Error(Loc, "index must be a multiple of 4 in range [0, 16380].");
+  case Match_InvalidMemoryIndexed8:
+    return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
+  case Match_InvalidMemoryIndexed16:
+    return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+  case Match_InvalidImm0_1:
+    return Error(Loc, "immediate must be an integer in range [0, 1].");
+  case Match_InvalidImm0_7:
+    return Error(Loc, "immediate must be an integer in range [0, 7].");
+  case Match_InvalidImm0_15:
+    return Error(Loc, "immediate must be an integer in range [0, 15].");
+  case Match_InvalidImm0_31:
+    return Error(Loc, "immediate must be an integer in range [0, 31].");
+  case Match_InvalidImm0_63:
+    return Error(Loc, "immediate must be an integer in range [0, 63].");
+  case Match_InvalidImm0_127:
+    return Error(Loc, "immediate must be an integer in range [0, 127].");
+  case Match_InvalidImm0_65535:
+    return Error(Loc, "immediate must be an integer in range [0, 65535].");
+  case Match_InvalidImm1_8:
+    return Error(Loc, "immediate must be an integer in range [1, 8].");
+  case Match_InvalidImm1_16:
+    return Error(Loc, "immediate must be an integer in range [1, 16].");
+  case Match_InvalidImm1_32:
+    return Error(Loc, "immediate must be an integer in range [1, 32].");
+  case Match_InvalidImm1_64:
+    return Error(Loc, "immediate must be an integer in range [1, 64].");
+  case Match_InvalidIndex1:
+    return Error(Loc, "expected lane specifier '[1]'");
+  case Match_InvalidIndexB:
+    return Error(Loc, "vector lane must be an integer in range [0, 15].");
+  case Match_InvalidIndexH:
+    return Error(Loc, "vector lane must be an integer in range [0, 7].");
+  case Match_InvalidIndexS:
+    return Error(Loc, "vector lane must be an integer in range [0, 3].");
+  case Match_InvalidIndexD:
+    return Error(Loc, "vector lane must be an integer in range [0, 1].");
+  case Match_InvalidLabel:
+    return Error(Loc, "expected label or encodable integer pc offset");
+  case Match_MRS:
+    return Error(Loc, "expected readable system register");
+  case Match_MSR:
+    return Error(Loc, "expected writable system register or pstate");
+  case Match_MnemonicFail:
+    return Error(Loc, "unrecognized instruction mnemonic");
+  default:
+    llvm_unreachable("unexpected error code!");
+  }
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               uint64_t &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+
+  StringRef Tok = Op.getToken();
+  unsigned NumOperands = Operands.size();
+
+  if (NumOperands == 4 && Tok == "lsl") {
+    AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
+    AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+    if (Op2.isReg() && Op3.isImm()) {
+      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+      if (Op3CE) {
+        uint64_t Op3Val = Op3CE->getValue();
+        uint64_t NewOp3Val = 0;
+        uint64_t NewOp4Val = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                Op2.getReg())) {
+          NewOp3Val = (32 - Op3Val) & 0x1f;
+          NewOp4Val = 31 - Op3Val;
+        } else {
+          NewOp3Val = (64 - Op3Val) & 0x3f;
+          NewOp4Val = 63 - Op3Val;
+        }
+
+        const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());
+
+        Operands[0] = AArch64Operand::CreateToken(
+            "ubfm", false, Op.getStartLoc(), getContext());
+        Operands.push_back(AArch64Operand::CreateImm(
+            NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
+        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
+                                                Op3.getEndLoc(), getContext());
+      }
+    }
+  } else if (NumOperands == 4 && Tok == "bfc") {
+    // FIXME: Horrible hack to handle BFC->BFM alias.
+    AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+    AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]);
+    AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]);
+
+    if (Op1.isReg() && LSBOp.isImm() && WidthOp.isImm()) {
+      const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm());
+      const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm());
+
+      if (LSBCE && WidthCE) {
+        uint64_t LSB = LSBCE->getValue();
+        uint64_t Width = WidthCE->getValue();
+
+        uint64_t RegWidth = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                Op1.getReg()))
+          RegWidth = 64;
+        else
+          RegWidth = 32;
+
+        if (LSB >= RegWidth)
+          return Error(LSBOp.getStartLoc(),
+                       "expected integer in range [0, 31]");
+        if (Width < 1 || Width > RegWidth)
+          return Error(WidthOp.getStartLoc(),
+                       "expected integer in range [1, 32]");
+
+        uint64_t ImmR = 0;
+        if (RegWidth == 32)
+          ImmR = (32 - LSB) & 0x1f;
+        else
+          ImmR = (64 - LSB) & 0x3f;
+
+        uint64_t ImmS = Width - 1;
+
+        if (ImmR != 0 && ImmS >= ImmR)
+          return Error(WidthOp.getStartLoc(),
+                       "requested insert overflows register");
+
+        const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
+        const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
+        Operands[0] = AArch64Operand::CreateToken(
+              "bfm", false, Op.getStartLoc(), getContext());
+        Operands[2] = AArch64Operand::CreateReg(
+            RegWidth == 32 ? AArch64::WZR : AArch64::XZR, false, SMLoc(),
+            SMLoc(), getContext());
+        Operands[3] = AArch64Operand::CreateImm(
+            ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
+        Operands.emplace_back(
+            AArch64Operand::CreateImm(ImmSExpr, WidthOp.getStartLoc(),
+                                      WidthOp.getEndLoc(), getContext()));
+      }
+    }
+  } else if (NumOperands == 5) {
+    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+    // UBFIZ -> UBFM aliases.
+    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+      if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1.getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3.getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4.getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp3Val = 0;
+          if (RegWidth == 32)
+            NewOp3Val = (32 - Op3Val) & 0x1f;
+          else
+            NewOp3Val = (64 - Op3Val) & 0x3f;
+
+          uint64_t NewOp4Val = Op4Val - 1;
+
+          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+            return Error(Op4.getStartLoc(),
+                         "requested insert overflows register");
+
+          const MCExpr *NewOp3 =
+              MCConstantExpr::create(NewOp3Val, getContext());
+          const MCExpr *NewOp4 =
+              MCConstantExpr::create(NewOp4Val, getContext());
+          Operands[3] = AArch64Operand::CreateImm(
+              NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+          if (Tok == "bfi")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "sbfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "ubfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op.getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+        }
+      }
+
+      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+      // UBFX -> UBFM aliases.
+    } else if (NumOperands == 5 &&
+               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
+      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
+      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);
+
+      if (Op1.isReg() && Op3.isImm() && Op4.isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1.getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3.getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4.getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+            return Error(Op4.getStartLoc(),
+                         "requested extract overflows register");
+
+          const MCExpr *NewOp4 =
+              MCConstantExpr::create(NewOp4Val, getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
+          if (Tok == "bfxil")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "sbfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op.getStartLoc(), getContext());
+          else if (Tok == "ubfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op.getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+        }
+      }
+    }
+  }
+  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+  //        InstAlias can't quite handle this since the reg classes aren't
+  //        subclasses.
+  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+    // The source register can be Wn here, but the matcher expects a
+    // GPR64. Twiddle it here if necessary.
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+    if (Op.isReg()) {
+      unsigned Reg = getXRegFromWReg(Op.getReg());
+      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                              Op.getEndLoc(), getContext());
+    }
+  }
+  // FIXME: Likewise for sxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+    if (Op.isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op.getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR64. Twiddle it here if necessary.
+      AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]);
+      if (Op.isReg()) {
+        unsigned Reg = getXRegFromWReg(Op.getReg());
+        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                                Op.getEndLoc(), getContext());
+      }
+    }
+  }
+  // FIXME: Likewise for uxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+    AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+    if (Op.isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op.getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR32. Twiddle it here if necessary.
+      AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]);
+      if (Op.isReg()) {
+        unsigned Reg = getWRegFromXReg(Op.getReg());
+        Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(),
+                                                Op.getEndLoc(), getContext());
+      }
+    }
+  }
+
+  // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+  if (NumOperands == 3 && Tok == "fmov") {
+    AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
+    AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
+    if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
+      unsigned zreg =
+          !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
+              RegOp.getReg())
+              ? AArch64::WZR
+              : AArch64::XZR;
+      Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
+                                              Op.getEndLoc(), getContext());
+    }
+  }
+
+  MCInst Inst;
+  // First try to match against the secondary set of tables containing the
+  // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+  // If that fails, try against the alternate table containing long-form NEON:
+  // "fadd v0.2s, v1.2s, v2.2s"
+  if (MatchResult != Match_Success) {
+    // But first, save the short-form match result: we can use it in case the
+    // long-form match also fails.
+    auto ShortFormNEONErrorInfo = ErrorInfo;
+    auto ShortFormNEONMatchResult = MatchResult;
+
+    MatchResult =
+        MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+    // Now, both matches failed, and the long-form match failed on the mnemonic
+    // suffix token operand.  The short-form match failure is probably more
+    // relevant: use it instead.
+    if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
+        Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() &&
+        ((AArch64Operand &)*Operands[1]).isTokenSuffix()) {
+      MatchResult = ShortFormNEONMatchResult;
+      ErrorInfo = ShortFormNEONErrorInfo;
+    }
+  }
+
+
+  switch (MatchResult) {
+  case Match_Success: {
+    // Perform range checking and other semantic validations
+    SmallVector<SMLoc, 8> OperandLocs;
+    NumOperands = Operands.size();
+    for (unsigned i = 1; i < NumOperands; ++i)
+      OperandLocs.push_back(Operands[i]->getStartLoc());
+    if (validateInstruction(Inst, OperandLocs))
+      return true;
+
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+  }
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (neon, e.g.).
+    std::string Msg = "instruction requires:";
+    uint64_t Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_MnemonicFail:
+    return showMatchError(IDLoc, MatchResult);
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction",
+                     SMRange(IDLoc, getTok().getLoc()));
+
+      ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    // If the match failed on a suffix token operand, tweak the diagnostic
+    // accordingly.
+    if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() &&
+        ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
+      MatchResult = Match_InvalidSuffix;
+
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  case Match_InvalidMemoryIndexed1:
+  case Match_InvalidMemoryIndexed2:
+  case Match_InvalidMemoryIndexed4:
+  case Match_InvalidMemoryIndexed8:
+  case Match_InvalidMemoryIndexed16:
+  case Match_InvalidCondCode:
+  case Match_AddSubRegExtendSmall:
+  case Match_AddSubRegExtendLarge:
+  case Match_AddSubSecondSource:
+  case Match_LogicalSecondSource:
+  case Match_AddSubRegShift32:
+  case Match_AddSubRegShift64:
+  case Match_InvalidMovImm32Shift:
+  case Match_InvalidMovImm64Shift:
+  case Match_InvalidFPImm:
+  case Match_InvalidMemoryWExtend8:
+  case Match_InvalidMemoryWExtend16:
+  case Match_InvalidMemoryWExtend32:
+  case Match_InvalidMemoryWExtend64:
+  case Match_InvalidMemoryWExtend128:
+  case Match_InvalidMemoryXExtend8:
+  case Match_InvalidMemoryXExtend16:
+  case Match_InvalidMemoryXExtend32:
+  case Match_InvalidMemoryXExtend64:
+  case Match_InvalidMemoryXExtend128:
+  case Match_InvalidMemoryIndexed4SImm7:
+  case Match_InvalidMemoryIndexed8SImm7:
+  case Match_InvalidMemoryIndexed16SImm7:
+  case Match_InvalidMemoryIndexedSImm9:
+  case Match_InvalidImm0_1:
+  case Match_InvalidImm0_7:
+  case Match_InvalidImm0_15:
+  case Match_InvalidImm0_31:
+  case Match_InvalidImm0_63:
+  case Match_InvalidImm0_127:
+  case Match_InvalidImm0_65535:
+  case Match_InvalidImm1_8:
+  case Match_InvalidImm1_16:
+  case Match_InvalidImm1_32:
+  case Match_InvalidImm1_64:
+  case Match_InvalidIndex1:
+  case Match_InvalidIndexB:
+  case Match_InvalidIndexH:
+  case Match_InvalidIndexS:
+  case Match_InvalidIndexD:
+  case Match_InvalidLabel:
+  case Match_MSR:
+  case Match_MRS: {
+    if (ErrorInfo >= Operands.size())
+      return Error(IDLoc, "too few operands for instruction", SMRange(IDLoc, (*Operands.back()).getEndLoc()));
+    // Any time we get here, there's nothing fancy to do. Just get the
+    // operand SMLoc and display the diagnostic.
+    SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc())
+      ErrorLoc = IDLoc;
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// ParseDirective parses the arm specific directives
+bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+  const MCObjectFileInfo::Environment Format =
+    getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+  bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+
+  StringRef IDVal = DirectiveID.getIdentifier();
+  SMLoc Loc = DirectiveID.getLoc();
+  if (IDVal == ".arch")
+    parseDirectiveArch(Loc);
+  else if (IDVal == ".cpu")
+    parseDirectiveCPU(Loc);
+  else if (IDVal == ".hword")
+    parseDirectiveWord(2, Loc);
+  else if (IDVal == ".word")
+    parseDirectiveWord(4, Loc);
+  else if (IDVal == ".xword")
+    parseDirectiveWord(8, Loc);
+  else if (IDVal == ".tlsdesccall")
+    parseDirectiveTLSDescCall(Loc);
+  else if (IDVal == ".ltorg" || IDVal == ".pool")
+    parseDirectiveLtorg(Loc);
+  else if (IDVal == ".unreq")
+    parseDirectiveUnreq(Loc);
+  else if (!IsMachO && !IsCOFF) {
+    if (IDVal == ".inst")
+      parseDirectiveInst(Loc);
+    else
+      return true;
+  } else if (IDVal == MCLOHDirectiveName())
+    parseDirectiveLOH(IDVal, Loc);
+  else
+    return true;
+  return false;
+}
+
+static const struct {
+  const char *Name;
+  const FeatureBitset Features;
+} ExtensionMap[] = {
+  { "crc", {AArch64::FeatureCRC} },
+  { "crypto", {AArch64::FeatureCrypto} },
+  { "fp", {AArch64::FeatureFPARMv8} },
+  { "simd", {AArch64::FeatureNEON} },
+  { "ras", {AArch64::FeatureRAS} },
+  { "lse", {AArch64::FeatureLSE} },
+
+  // FIXME: Unsupported extensions
+  { "pan", {} },
+  { "lor", {} },
+  { "rdma", {} },
+  { "profile", {} },
+};
+
+/// parseDirectiveArch
+///   ::= .arch token
+bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
+  SMLoc ArchLoc = getLoc();
+
+  StringRef Arch, ExtensionString;
+  std::tie(Arch, ExtensionString) =
+      getParser().parseStringToEndOfStatement().trim().split('+');
+
+  unsigned ID = AArch64::parseArch(Arch);
+  if (ID == static_cast<unsigned>(AArch64::ArchKind::AK_INVALID))
+    return Error(ArchLoc, "unknown arch name");
+
+  if (parseToken(AsmToken::EndOfStatement))
+    return true;
+
+  // Get the architecture and extension features.
+  std::vector<StringRef> AArch64Features;
+  AArch64::getArchFeatures(ID, AArch64Features);
+  AArch64::getExtensionFeatures(AArch64::getDefaultExtensions("generic", ID),
+                                AArch64Features);
+
+  MCSubtargetInfo &STI = copySTI();
+  std::vector<std::string> ArchFeatures(AArch64Features.begin(), AArch64Features.end());
+  STI.setDefaultFeatures("generic", join(ArchFeatures.begin(), ArchFeatures.end(), ","));
+
+  SmallVector<StringRef, 4> RequestedExtensions;
+  if (!ExtensionString.empty())
+    ExtensionString.split(RequestedExtensions, '+');
+
+  FeatureBitset Features = STI.getFeatureBits();
+  for (auto Name : RequestedExtensions) {
+    bool EnableFeature = true;
+
+    if (Name.startswith_lower("no")) {
+      EnableFeature = false;
+      Name = Name.substr(2);
+    }
+
+    for (const auto &Extension : ExtensionMap) {
+      if (Extension.Name != Name)
+        continue;
+
+      if (Extension.Features.none())
+        report_fatal_error("unsupported architectural extension: " + Name);
+
+      FeatureBitset ToggleFeatures = EnableFeature
+                                         ? (~Features & Extension.Features)
+                                         : ( Features & Extension.Features);
+      uint64_t Features =
+          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+      setAvailableFeatures(Features);
+      break;
+    }
+  }
+  return false;
+}
+
+/// parseDirectiveCPU
+///   ::= .cpu id
+bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
+  SMLoc CPULoc = getLoc();
+
+  StringRef CPU, ExtensionString;
+  std::tie(CPU, ExtensionString) =
+      getParser().parseStringToEndOfStatement().trim().split('+');
+
+  if (parseToken(AsmToken::EndOfStatement))
+    return true;
+
+  SmallVector<StringRef, 4> RequestedExtensions;
+  if (!ExtensionString.empty())
+    ExtensionString.split(RequestedExtensions, '+');
+
+  // FIXME This is using tablegen data, but should be moved to ARMTargetParser
+  // once that is tablegen'ed
+  if (!getSTI().isCPUStringValid(CPU)) {
+    Error(CPULoc, "unknown CPU name");
+    return false;
+  }
+
+  MCSubtargetInfo &STI = copySTI();
+  STI.setDefaultFeatures(CPU, "");
+
+  FeatureBitset Features = STI.getFeatureBits();
+  for (auto Name : RequestedExtensions) {
+    bool EnableFeature = true;
+
+    if (Name.startswith_lower("no")) {
+      EnableFeature = false;
+      Name = Name.substr(2);
+    }
+
+    for (const auto &Extension : ExtensionMap) {
+      if (Extension.Name != Name)
+        continue;
+
+      if (Extension.Features.none())
+        report_fatal_error("unsupported architectural extension: " + Name);
+
+      FeatureBitset ToggleFeatures = EnableFeature
+                                         ? (~Features & Extension.Features)
+                                         : ( Features & Extension.Features);
+      uint64_t Features =
+          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+      setAvailableFeatures(Features);
+
+      break;
+    }
+  }
+  return false;
+}
+
+/// parseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  auto parseOp = [&]() -> bool {
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+    getParser().getStreamer().EmitValue(Value, Size, L);
+    return false;
+  };
+
+  if (parseMany(parseOp))
+    return true;
+  return false;
+}
+
+/// parseDirectiveInst
+///  ::= .inst opcode [, ...]
+bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return Error(Loc, "expected expression following '.inst' directive");
+
+  auto parseOp = [&]() -> bool {
+    SMLoc L = getLoc();
+    const MCExpr *Expr;
+    if (check(getParser().parseExpression(Expr), L, "expected expression"))
+      return true;
+    const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+    if (check(!Value, L, "expected constant expression"))
+      return true;
+    getTargetStreamer().emitInst(Value->getValue());
+    return false;
+  };
+
+  if (parseMany(parseOp))
+    return addErrorSuffix(" in '.inst' directive");
+  return false;
+}
+
+// parseDirectiveTLSDescCall:
+//   ::= .tlsdesccall symbol
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+  StringRef Name;
+  if (check(getParser().parseIdentifier(Name), L,
+            "expected symbol after directive") ||
+      parseToken(AsmToken::EndOfStatement))
+    return true;
+
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
+  Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+
+  MCInst Inst;
+  Inst.setOpcode(AArch64::TLSDESCCALL);
+  Inst.addOperand(MCOperand::createExpr(Expr));
+
+  getParser().getStreamer().EmitInstruction(Inst, getSTI());
+  return false;
+}
+
+/// ::= .loh <lohName | lohId> label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+  MCLOHType Kind;
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    if (getParser().getTok().isNot(AsmToken::Integer))
+      return TokError("expected an identifier or a number in directive");
+    // We successfully get a numeric value for the identifier.
+    // Check if it is valid.
+    int64_t Id = getParser().getTok().getIntVal();
+    if (Id <= -1U && !isValidMCLOHType(Id))
+      return TokError("invalid numeric identifier in directive");
+    Kind = (MCLOHType)Id;
+  } else {
+    StringRef Name = getTok().getIdentifier();
+    // We successfully parse an identifier.
+    // Check if it is a recognized one.
+    int Id = MCLOHNameToId(Name);
+
+    if (Id == -1)
+      return TokError("invalid identifier in directive");
+    Kind = (MCLOHType)Id;
+  }
+  // Consume the identifier.
+  Lex();
+  // Get the number of arguments of this LOH.
+  int NbArgs = MCLOHIdToNbArgs(Kind);
+
+  assert(NbArgs != -1 && "Invalid number of arguments");
+
+  SmallVector<MCSymbol *, 3> Args;
+  for (int Idx = 0; Idx < NbArgs; ++Idx) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    Args.push_back(getContext().getOrCreateSymbol(Name));
+
+    if (Idx + 1 == NbArgs)
+      break;
+    if (parseToken(AsmToken::Comma,
+                   "unexpected token in '" + Twine(IDVal) + "' directive"))
+      return true;
+  }
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '" + Twine(IDVal) + "' directive"))
+    return true;
+
+  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+  return false;
+}
+
+/// parseDirectiveLtorg
+///  ::= .ltorg | .pool
+bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+  getTargetStreamer().emitCurrentConstantPool();
+  return false;
+}
+
+/// parseDirectiveReq
+///  ::= name .req registername
+bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the '.req' token.
+  SMLoc SRegLoc = getLoc();
+  unsigned RegNum = tryParseRegister();
+  bool IsVector = false;
+
+  if (RegNum == static_cast<unsigned>(-1)) {
+    StringRef Kind;
+    RegNum = tryMatchVectorRegister(Kind, false);
+    if (!Kind.empty())
+      return Error(SRegLoc, "vector register without type specifier expected");
+    IsVector = true;
+  }
+
+  if (RegNum == static_cast<unsigned>(-1))
+    return Error(SRegLoc, "register name or alias expected");
+
+  // Shouldn't be anything else.
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected input in .req directive"))
+    return true;
+
+  auto pair = std::make_pair(IsVector, RegNum);
+  if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair)
+    Warning(L, "ignoring redefinition of register alias '" + Name + "'");
+
+  return false;
+}
+
+/// parseDirectiveUneq
+///  ::= .unreq registername
+bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getTok().isNot(AsmToken::Identifier))
+    return TokError("unexpected input in .unreq directive.");
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
+  if (parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix("in '.unreq' directive");
+  return false;
+}
+
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                    AArch64MCExpr::VariantKind &ELFRefKind,
+                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                    int64_t &Addend) {
+  ELFRefKind = AArch64MCExpr::VK_INVALID;
+  DarwinRefKind = MCSymbolRefExpr::VK_None;
+  Addend = 0;
+
+  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+    ELFRefKind = AE->getKind();
+    Expr = AE->getSubExpr();
+  }
+
+  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+  if (SE) {
+    // It's a simple symbol reference with no addend.
+    DarwinRefKind = SE->getKind();
+    return true;
+  }
+
+  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+  if (!BE)
+    return false;
+
+  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  if (!SE)
+    return false;
+  DarwinRefKind = SE->getKind();
+
+  if (BE->getOpcode() != MCBinaryExpr::Add &&
+      BE->getOpcode() != MCBinaryExpr::Sub)
+    return false;
+
+  // See if the addend is is a constant, otherwise there's more going
+  // on here than we can deal with.
+  auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+  if (!AddendExpr)
+    return false;
+
+  Addend = AddendExpr->getValue();
+  if (BE->getOpcode() == MCBinaryExpr::Sub)
+    Addend = -Addend;
+
+  // It's some symbol reference + a constant addend, but really
+  // shouldn't use both Darwin and ELF syntax.
+  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+         DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
+  RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
+  RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                      unsigned Kind) {
+  AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ExpectedVal;
+  switch (Kind) {
+  default:
+    return Match_InvalidOperand;
+  case MCK__35_0:
+    ExpectedVal = 0;
+    break;
+  case MCK__35_1:
+    ExpectedVal = 1;
+    break;
+  case MCK__35_12:
+    ExpectedVal = 12;
+    break;
+  case MCK__35_16:
+    ExpectedVal = 16;
+    break;
+  case MCK__35_2:
+    ExpectedVal = 2;
+    break;
+  case MCK__35_24:
+    ExpectedVal = 24;
+    break;
+  case MCK__35_3:
+    ExpectedVal = 3;
+    break;
+  case MCK__35_32:
+    ExpectedVal = 32;
+    break;
+  case MCK__35_4:
+    ExpectedVal = 4;
+    break;
+  case MCK__35_48:
+    ExpectedVal = 48;
+    break;
+  case MCK__35_6:
+    ExpectedVal = 6;
+    break;
+  case MCK__35_64:
+    ExpectedVal = 64;
+    break;
+  case MCK__35_8:
+    ExpectedVal = 8;
+    break;
+  }
+  if (!Op.isImm())
+    return Match_InvalidOperand;
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+  if (!CE)
+    return Match_InvalidOperand;
+  if (CE->getValue() == ExpectedVal)
+    return Match_Success;
+  return Match_InvalidOperand;
+}
+
+
+OperandMatchResultTy
+AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
+
+  SMLoc S = getLoc();
+
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    Error(S, "expected register");
+    return MatchOperand_ParseFail;
+  }
+
+  int FirstReg = tryParseRegister();
+  if (FirstReg == -1) {
+    return MatchOperand_ParseFail;
+  }
+  const MCRegisterClass &WRegClass =
+      AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
+  const MCRegisterClass &XRegClass =
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID];
+
+  bool isXReg = XRegClass.contains(FirstReg),
+       isWReg = WRegClass.contains(FirstReg);
+  if (!isXReg && !isWReg) {
+    Error(S, "expected first even register of a "
+             "consecutive same-size even/odd register pair");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  unsigned FirstEncoding = RI->getEncodingValue(FirstReg);
+
+  if (FirstEncoding & 0x1) {
+    Error(S, "expected first even register of a "
+             "consecutive same-size even/odd register pair");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc M = getLoc();
+  if (getParser().getTok().isNot(AsmToken::Comma)) {
+    Error(M, "expected comma");
+    return MatchOperand_ParseFail;
+  }
+  // Eat the comma
+  getParser().Lex();
+
+  SMLoc E = getLoc();
+  int SecondReg = tryParseRegister();
+  if (SecondReg ==-1) {
+    return MatchOperand_ParseFail;
+  }
+
+ if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
+      (isXReg && !XRegClass.contains(SecondReg)) ||
+      (isWReg && !WRegClass.contains(SecondReg))) {
+    Error(E,"expected second odd register of a "
+             "consecutive same-size even/odd register pair");
+    return MatchOperand_ParseFail;
+  }
+
+  unsigned Pair = 0;
+  if(isXReg) {
+    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64,
+           &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]);
+  } else {
+    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube32,
+           &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
+  }
+
+  Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(),
+      getContext()));
+
+  return MatchOperand_Success;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
new file mode 100644
index 000000000000..0d860a7eef79
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -0,0 +1,1588 @@
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Disassembler.h"
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
+                                              unsigned RegNo, uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder);
+static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder);
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new AArch64Disassembler(STI, Ctx);
+}
+
+DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &OS,
+                                                 raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  Size = 0;
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4)
+    return Fail;
+  Size = 4;
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t Insn =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+
+  // Calling the auto-generated decoder function.
+  return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
+}
+
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
+                                LLVMSymbolLookupCallback SymbolLookUp,
+                                void *DisInfo, MCContext *Ctx,
+                                std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+  return new llvm::AArch64ExternalSymbolizer(*Ctx, move(RelInfo), GetOpInfo,
+                                             SymbolLookUp, DisInfo);
+}
+
+extern "C" void LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheAArch64leTarget(),
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(getTheAArch64beTarget(),
+                                       createAArch64ExternalSymbolizer);
+
+  TargetRegistry::RegisterMCDisassembler(getTheARM64Target(),
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheARM64Target(),
+                                       createAArch64ExternalSymbolizer);
+}
+
+static const unsigned FPR128DecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
+
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR128DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  if (RegNo > 15)
+    return Fail;
+  return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
+}
+
+static const unsigned FPR64DecoderTable[] = {
+    AArch64::D0,  AArch64::D1,  AArch64::D2,  AArch64::D3,  AArch64::D4,
+    AArch64::D5,  AArch64::D6,  AArch64::D7,  AArch64::D8,  AArch64::D9,
+    AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+    AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+    AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+    AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+    AArch64::D30, AArch64::D31
+};
+
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned FPR32DecoderTable[] = {
+    AArch64::S0,  AArch64::S1,  AArch64::S2,  AArch64::S3,  AArch64::S4,
+    AArch64::S5,  AArch64::S6,  AArch64::S7,  AArch64::S8,  AArch64::S9,
+    AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+    AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+    AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+    AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+    AArch64::S30, AArch64::S31
+};
+
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned FPR16DecoderTable[] = {
+    AArch64::H0,  AArch64::H1,  AArch64::H2,  AArch64::H3,  AArch64::H4,
+    AArch64::H5,  AArch64::H6,  AArch64::H7,  AArch64::H8,  AArch64::H9,
+    AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+    AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+    AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+    AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+    AArch64::H30, AArch64::H31
+};
+
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR16DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned FPR8DecoderTable[] = {
+    AArch64::B0,  AArch64::B1,  AArch64::B2,  AArch64::B3,  AArch64::B4,
+    AArch64::B5,  AArch64::B6,  AArch64::B7,  AArch64::B8,  AArch64::B9,
+    AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+    AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+    AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+    AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+    AArch64::B30, AArch64::B31
+};
+
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = FPR8DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned GPR64DecoderTable[] = {
+    AArch64::X0,  AArch64::X1,  AArch64::X2,  AArch64::X3,  AArch64::X4,
+    AArch64::X5,  AArch64::X6,  AArch64::X7,  AArch64::X8,  AArch64::X9,
+    AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+    AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+    AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+    AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+    AArch64::LR,  AArch64::XZR
+};
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = GPR64DecoderTable[RegNo];
+  if (Register == AArch64::XZR)
+    Register = AArch64::SP;
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned GPR32DecoderTable[] = {
+    AArch64::W0,  AArch64::W1,  AArch64::W2,  AArch64::W3,  AArch64::W4,
+    AArch64::W5,  AArch64::W6,  AArch64::W7,  AArch64::W8,  AArch64::W9,
+    AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+    AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+    AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+    AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+    AArch64::W30, AArch64::WZR
+};
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = GPR32DecoderTable[RegNo];
+  if (Register == AArch64::WZR)
+    Register = AArch64::WSP;
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned VectorDecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
+
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+
+  unsigned Register = VectorDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned QQDecoderTable[] = {
+  AArch64::Q0_Q1,   AArch64::Q1_Q2,   AArch64::Q2_Q3,   AArch64::Q3_Q4,
+  AArch64::Q4_Q5,   AArch64::Q5_Q6,   AArch64::Q6_Q7,   AArch64::Q7_Q8,
+  AArch64::Q8_Q9,   AArch64::Q9_Q10,  AArch64::Q10_Q11, AArch64::Q11_Q12,
+  AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+  AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+  AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+  AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+  AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
+
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned QQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2,    AArch64::Q1_Q2_Q3,    AArch64::Q2_Q3_Q4,
+  AArch64::Q3_Q4_Q5,    AArch64::Q4_Q5_Q6,    AArch64::Q5_Q6_Q7,
+  AArch64::Q6_Q7_Q8,    AArch64::Q7_Q8_Q9,    AArch64::Q8_Q9_Q10,
+  AArch64::Q9_Q10_Q11,  AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+  AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+  AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+  AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+  AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+  AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+  AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+  AArch64::Q30_Q31_Q0,  AArch64::Q31_Q0_Q1
+};
+
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned QQQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2_Q3,     AArch64::Q1_Q2_Q3_Q4,     AArch64::Q2_Q3_Q4_Q5,
+  AArch64::Q3_Q4_Q5_Q6,     AArch64::Q4_Q5_Q6_Q7,     AArch64::Q5_Q6_Q7_Q8,
+  AArch64::Q6_Q7_Q8_Q9,     AArch64::Q7_Q8_Q9_Q10,    AArch64::Q8_Q9_Q10_Q11,
+  AArch64::Q9_Q10_Q11_Q12,  AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+  AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+  AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+  AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+  AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+  AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+  AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+  AArch64::Q30_Q31_Q0_Q1,   AArch64::Q31_Q0_Q1_Q2
+};
+
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned DDDecoderTable[] = {
+  AArch64::D0_D1,   AArch64::D1_D2,   AArch64::D2_D3,   AArch64::D3_D4,
+  AArch64::D4_D5,   AArch64::D5_D6,   AArch64::D6_D7,   AArch64::D7_D8,
+  AArch64::D8_D9,   AArch64::D9_D10,  AArch64::D10_D11, AArch64::D11_D12,
+  AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+  AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+  AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+  AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+  AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
+
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned DDDDecoderTable[] = {
+  AArch64::D0_D1_D2,    AArch64::D1_D2_D3,    AArch64::D2_D3_D4,
+  AArch64::D3_D4_D5,    AArch64::D4_D5_D6,    AArch64::D5_D6_D7,
+  AArch64::D6_D7_D8,    AArch64::D7_D8_D9,    AArch64::D8_D9_D10,
+  AArch64::D9_D10_D11,  AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+  AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+  AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+  AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+  AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+  AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+  AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+  AArch64::D30_D31_D0,  AArch64::D31_D0_D1
+};
+
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static const unsigned DDDDDecoderTable[] = {
+  AArch64::D0_D1_D2_D3,     AArch64::D1_D2_D3_D4,     AArch64::D2_D3_D4_D5,
+  AArch64::D3_D4_D5_D6,     AArch64::D4_D5_D6_D7,     AArch64::D5_D6_D7_D8,
+  AArch64::D6_D7_D8_D9,     AArch64::D7_D8_D9_D10,    AArch64::D8_D9_D10_D11,
+  AArch64::D9_D10_D11_D12,  AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+  AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+  AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+  AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+  AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+  AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+  AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+  AArch64::D30_D31_D0_D1,   AArch64::D31_D0_D1_D2
+};
+
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  // scale{5} is asserted as 1 in tblgen.
+  Imm |= 0x20;
+  Inst.addOperand(MCOperand::createImm(64 - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(64 - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  int64_t ImmVal = Imm;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 19-bit immediate.
+  if (ImmVal & (1 << (19 - 1)))
+    ImmVal |= ~((1LL << 19) - 1);
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal *  4, Addr,
+                                     Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+    Inst.addOperand(MCOperand::createImm(ImmVal));
+  return Success;
+}
+
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm((Imm  >> 1) & 1));
+  Inst.addOperand(MCOperand::createImm(Imm & 1));
+  return Success;
+}
+
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(Imm));
+
+  // Every system register in the encoding space is valid with the syntax
+  // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds.
+  return Success;
+}
+
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(Imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // This decoder exists to add the dummy Lane operand to the MCInst, which must
+  // be 1 in assembly but has no other real manifestation.
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
+
+  if (IsToVec) {
+    DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
+  } else {
+    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // Add the lane
+  Inst.addOperand(MCOperand::createImm(1));
+
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::createImm(Add - Imm));
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::createImm((Imm + Add) & (Add - 1)));
+  return Success;
+}
+
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
+
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
+}
+
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
+}
+
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 64);
+}
+
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 32);
+}
+
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 16);
+}
+
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 8);
+}
+
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+  unsigned shift = (shiftHi << 6) | shiftLo;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrs:
+  case AArch64::ADDSWrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBSWrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDWrs:
+  case AArch64::ANDSWrs:
+  case AArch64::BICWrs:
+  case AArch64::BICSWrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORNWrs:
+  case AArch64::EORWrs:
+  case AArch64::EONWrs: {
+    // if sf == '0' and imm6<5> == '1' then ReservedValue()
+    if (shiftLo >> 5 == 1)
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+  case AArch64::ADDXrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBXrs:
+  case AArch64::SUBSXrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDXrs:
+  case AArch64::ANDSXrs:
+  case AArch64::BICXrs:
+  case AArch64::BICSXrs:
+  case AArch64::ORRXrs:
+  case AArch64::ORNXrs:
+  case AArch64::EORXrs:
+  case AArch64::EONXrs:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(shift));
+  return Success;
+}
+
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned imm = fieldFromInstruction(insn, 5, 16);
+  unsigned shift = fieldFromInstruction(insn, 21, 2);
+  shift <<= 4;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::MOVZWi:
+  case AArch64::MOVNWi:
+  case AArch64::MOVKWi:
+    if (shift & (1U << 5))
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  case AArch64::MOVZXi:
+  case AArch64::MOVNXi:
+  case AArch64::MOVKXi:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  }
+
+  if (Inst.getOpcode() == AArch64::MOVKWi ||
+      Inst.getOpcode() == AArch64::MOVKXi)
+    Inst.addOperand(Inst.getOperand(0));
+
+  Inst.addOperand(MCOperand::createImm(imm));
+  Inst.addOperand(MCOperand::createImm(shift));
+  return Success;
+}
+
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned offset = fieldFromInstruction(insn, 10, 12);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFMui:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::createImm(Rt));
+    break;
+  case AArch64::STRBBui:
+  case AArch64::LDRBBui:
+  case AArch64::LDRSBWui:
+  case AArch64::STRHHui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRSHWui:
+  case AArch64::STRWui:
+  case AArch64::LDRWui:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::STRXui:
+  case AArch64::LDRXui:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::createImm(offset));
+  return Success;
+}
+
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  int64_t offset = fieldFromInstruction(insn, 12, 9);
+
+  // offset is a 9-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (9 - 1)))
+    offset |= ~((1LL << 9) - 1);
+
+  // First operand is always the writeback to the address register, if needed.
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  }
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFUMi:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::createImm(Rt));
+    break;
+  case AArch64::STURBBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURHHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURWi:
+  case AArch64::LDURWi:
+  case AArch64::LDTRSBWi:
+  case AArch64::LDTRSHWi:
+  case AArch64::STTRWi:
+  case AArch64::LDTRWi:
+  case AArch64::STTRHi:
+  case AArch64::LDTRHi:
+  case AArch64::LDTRBi:
+  case AArch64::STTRBi:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::LDURXi:
+  case AArch64::LDTRSBXi:
+  case AArch64::LDTRSHXi:
+  case AArch64::LDTRSWi:
+  case AArch64::STTRXi:
+  case AArch64::LDTRXi:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURDi:
+  case AArch64::STURDi:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSi:
+  case AArch64::STURSi:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURHi:
+  case AArch64::STURHi:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURBi:
+  case AArch64::STURBi:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(offset));
+
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+  bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+  bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+  // Cannot write back to a transfer register (but xzr != sp).
+  if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn)
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
+                                                   const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  unsigned Rs = fieldFromInstruction(insn, 16, 5);
+
+  unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::STLXRW:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDARW:
+  case AArch64::LDARB:
+  case AArch64::LDARH:
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::STLRW:
+  case AArch64::STLRB:
+  case AArch64::STLRH:
+  case AArch64::STLLRW:
+  case AArch64::STLLRB:
+  case AArch64::STLLRH:
+  case AArch64::LDLARW:
+  case AArch64::LDLARB:
+  case AArch64::LDLARH:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::STLXRX:
+  case AArch64::STXRX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDARX:
+  case AArch64::LDAXRX:
+  case AArch64::LDXRX:
+  case AArch64::STLRX:
+  case AArch64::LDLARX:
+  case AArch64::STLLRX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::STLXPW:
+  case AArch64::STXPW:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDAXPW:
+  case AArch64::LDXPW:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::STLXPX:
+  case AArch64::STXPX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    LLVM_FALLTHROUGH;
+  case AArch64::LDAXPX:
+  case AArch64::LDXPX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+
+  // You shouldn't load to the same register twice in an instruction...
+  if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW ||
+       Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) &&
+      Rt == Rt2)
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  int64_t offset = fieldFromInstruction(insn, 15, 7);
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+
+  // offset is a 7-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (7 - 1)))
+    offset |= ~((1LL << 7) - 1);
+
+  unsigned Opcode = Inst.getOpcode();
+  bool NeedsDisjointWritebackTransfer = false;
+
+  // First operand is always writeback of base register.
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
+  }
+
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+    NeedsDisjointWritebackTransfer = true;
+    LLVM_FALLTHROUGH;
+  case AArch64::LDNPXi:
+  case AArch64::STNPXi:
+  case AArch64::LDPXi:
+  case AArch64::STPXi:
+  case AArch64::LDPSWi:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+    NeedsDisjointWritebackTransfer = true;
+    LLVM_FALLTHROUGH;
+  case AArch64::LDNPWi:
+  case AArch64::STNPWi:
+  case AArch64::LDPWi:
+  case AArch64::STPWi:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPQi:
+  case AArch64::STNPQi:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPDi:
+  case AArch64::STNPDi:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDi:
+  case AArch64::STPDi:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPSi:
+  case AArch64::STNPSi:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSi:
+  case AArch64::STPSi:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(offset));
+
+  // You shouldn't load to the same register twice in an instruction...
+  if (IsLoad && Rt == Rt2)
+    return SoftFail;
+
+  // ... or do any operation that writes-back to a transfer register. But note
+  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
+  if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn))
+    return SoftFail;
+
+  return Success;
+}
+
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned extend = fieldFromInstruction(insn, 10, 6);
+
+  unsigned shift = extend & 0x7;
+  if (shift > 4)
+    return Fail;
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrx:
+  case AArch64::SUBWrx:
+    DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::SUBSWrx:
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx:
+  case AArch64::SUBXrx:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDSXrx:
+  case AArch64::SUBSXrx:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::ADDXrx64:
+  case AArch64::SUBXrx64:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  case AArch64::SUBSXrx64:
+  case AArch64::ADDSXrx64:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(extend));
+  return Success;
+}
+
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+  unsigned imm;
+
+  if (Datasize) {
+    if (Inst.getOpcode() == AArch64::ANDSXri)
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 13);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+      return Fail;
+  } else {
+    if (Inst.getOpcode() == AArch64::ANDSWri)
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 12);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32))
+      return Fail;
+  }
+  Inst.addOperand(MCOperand::createImm(imm));
+  return Success;
+}
+
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
+
+  if (Inst.getOpcode() == AArch64::MOVID)
+    DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
+  else
+    DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVIv4i16:
+  case AArch64::MOVIv8i16:
+  case AArch64::MVNIv4i16:
+  case AArch64::MVNIv8i16:
+  case AArch64::MOVIv2i32:
+  case AArch64::MOVIv4i32:
+  case AArch64::MVNIv2i32:
+  case AArch64::MVNIv4i32:
+    Inst.addOperand(MCOperand::createImm((cmode & 6) << 2));
+    break;
+  case AArch64::MOVIv2s_msl:
+  case AArch64::MOVIv4s_msl:
+  case AArch64::MVNIv2s_msl:
+  case AArch64::MVNIv4s_msl:
+    Inst.addOperand(MCOperand::createImm(cmode & 1 ? 0x110 : 0x108));
+    break;
+  }
+
+  return Success;
+}
+
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
+
+  // Tied operands added twice.
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::createImm(imm));
+  Inst.addOperand(MCOperand::createImm((cmode & 6) << 2));
+
+  return Success;
+}
+
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+  imm |= fieldFromInstruction(insn, 29, 2);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 21-bit immediate.
+  if (imm & (1 << (21 - 1)))
+    imm |= ~((1LL << 21) - 1);
+
+  DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::createImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Imm = fieldFromInstruction(insn, 10, 14);
+  unsigned S = fieldFromInstruction(insn, 29, 1);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+  unsigned ShifterVal = (Imm >> 12) & 3;
+  unsigned ImmVal = Imm & 0xFFF;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  if (ShifterVal != 0 && ShifterVal != 1)
+    return Fail;
+
+  if (Datasize) {
+    if (Rd == 31 && !S)
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  } else {
+    if (Rd == 31 && !S)
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+  }
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::createImm(ImmVal));
+  Inst.addOperand(MCOperand::createImm(12 * ShifterVal));
+  return Success;
+}
+
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  int64_t imm = fieldFromInstruction(insn, 0, 26);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 26-bit immediate.
+  if (imm & (1 << (26 - 1)))
+    imm |= ~((1LL << 26) - 1);
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::createImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  uint64_t op1 = fieldFromInstruction(insn, 16, 3);
+  uint64_t op2 = fieldFromInstruction(insn, 5, 3);
+  uint64_t crm = fieldFromInstruction(insn, 8, 4);
+
+  uint64_t pstate_field = (op1 << 3) | op2;
+
+  if ((pstate_field == AArch64PState::PAN  ||
+       pstate_field == AArch64PState::UAO) && crm > 1)
+    return Fail;
+
+  Inst.addOperand(MCOperand::createImm(pstate_field));
+  Inst.addOperand(MCOperand::createImm(crm));
+
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  auto PState = AArch64PState::lookupPStateByEncoding(pstate_field);
+  if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits()))
+    return Success;
+  return Fail;
+}
+
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+  uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
+  bit |= fieldFromInstruction(insn, 19, 5);
+  int64_t dst = fieldFromInstruction(insn, 5, 14);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 14-bit immediate.
+  if (dst & (1 << (14 - 1)))
+    dst |= ~((1LL << 14) - 1);
+
+  if (fieldFromInstruction(insn, 31, 1) == 0)
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+  else
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+  Inst.addOperand(MCOperand::createImm(bit));
+  if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::createImm(dst));
+
+  return Success;
+}
+
+static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst,
+                                                        unsigned RegClassID,
+                                                        unsigned RegNo,
+                                                        uint64_t Addr,
+                                                        const void *Decoder) {
+  // Register number must be even (see CASP instruction)
+  if (RegNo & 0x1)
+    return Fail;
+
+  unsigned Register = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo);
+  Inst.addOperand(MCOperand::createReg(Register));
+  return Success;
+}
+
+static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder) {
+  return DecodeGPRSeqPairsClassRegisterClass(Inst,
+                                             AArch64::WSeqPairsClassRegClassID,
+                                             RegNo, Addr, Decoder);
+}
+
+static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder) {
+  return DecodeGPRSeqPairsClassRegisterClass(Inst,
+                                             AArch64::XSeqPairsClassRegClassID,
+                                             RegNo, Addr, Decoder);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
new file mode 100644
index 000000000000..24e353cf4b96
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -0,0 +1,38 @@
+//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class raw_ostream;
+
+class AArch64Disassembler : public MCDisassembler {
+public:
+  AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+
+  ~AArch64Disassembler() {}
+
+  MCDisassembler::DecodeStatus
+  getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                 uint64_t Address, raw_ostream &VStream,
+                 raw_ostream &CStream) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
new file mode 100644
index 000000000000..19d0ba2e1c41
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -0,0 +1,222 @@
+//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+static MCSymbolRefExpr::VariantKind
+getVariant(uint64_t LLVMDisassembler_VariantKind) {
+  switch (LLVMDisassembler_VariantKind) {
+  case LLVMDisassembler_VariantKind_None:
+    return MCSymbolRefExpr::VK_None;
+  case LLVMDisassembler_VariantKind_ARM64_PAGE:
+    return MCSymbolRefExpr::VK_PAGE;
+  case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
+    return MCSymbolRefExpr::VK_PAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
+    return MCSymbolRefExpr::VK_GOTPAGE;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
+    return MCSymbolRefExpr::VK_GOTPAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_TLVP:
+  case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+  default:
+    llvm_unreachable("bad LLVMDisassembler_VariantKind");
+  }
+}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// operand in place of the immediate Value in the MCInst.  The immediate
+/// Value has not had any PC adjustment made by the caller. If the instruction
+/// is a branch that adds the PC to the immediate Value then isBranch is
+/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any
+/// symbolic information at the Address for this instrution.  If that returns
+/// non-zero then the symbolic information it returns is used to create an
+/// MCExpr and that is added as an operand to the MCInst.  If GetOpInfo()
+/// returns zero and isBranch is Success then a symbol look up for
+/// Address + Value is done and if a symbol is found an MCExpr is created with
+/// that, else an MCExpr with Address + Value is created.  If GetOpInfo()
+/// returns zero and isBranch is Fail then the Opcode of the MCInst is
+/// tested and for ADRP an other instructions that help to load of pointers
+/// a symbol look up is done to see it is returns a specific reference type
+/// to add to the comment stream.  This function returns Success if it adds
+/// an operand to the MCInst and Fail otherwise.
+bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
+    MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
+    bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+  // FIXME: This method shares a lot of code with
+  //        MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible
+  //        refactor the MCExternalSymbolizer interface to allow more of this
+  //        implementation to be shared.
+  //
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  uint64_t ReferenceType;
+  const char *ReferenceName;
+  if (!GetOpInfo ||
+      !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+    if (IsBranch) {
+      ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+      const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
+                                      Address, &ReferenceName);
+      if (Name) {
+        SymbolicOp.AddSymbol.Name = Name;
+        SymbolicOp.AddSymbol.Present = true;
+        SymbolicOp.Value = 0;
+      } else {
+        SymbolicOp.Value = Address + Value;
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+        CommentStream << "symbol stub for: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+    } else if (MI.getOpcode() == AArch64::ADRP) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+        // otool expects the fully encoded ADRP instruction to be passed in as
+        // the value here, so reconstruct it:
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        uint32_t EncodedInst = 0x90000000;
+        EncodedInst |= (Value & 0x3) << 29; // immlo
+        EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+        CommentStream << format("0x%llx",
+                                0xfffffffffffff000LL & (Address + Value));
+    } else if (MI.getOpcode() == AArch64::ADDXri ||
+               MI.getOpcode() == AArch64::LDRXui ||
+               MI.getOpcode() == AArch64::LDRXl ||
+               MI.getOpcode() == AArch64::ADR) {
+      if (MI.getOpcode() == AArch64::ADDXri)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+      else if (MI.getOpcode() == AArch64::LDRXui)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+      if (MI.getOpcode() == AArch64::LDRXl) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                     &ReferenceName);
+      } else if (MI.getOpcode() == AArch64::ADR) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                            &ReferenceName);
+      } else {
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        // otool expects the fully encoded ADD/LDR instruction to be passed in
+        // as the value here, so reconstruct it:
+        unsigned EncodedInst =
+          MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000;
+        EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD]
+        EncodedInst |=
+          MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd
+
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+        CommentStream << "literal pool symbol address: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
+        CommentStream << "literal pool for: \"";
+        CommentStream.write_escaped(ReferenceName);
+        CommentStream << "\"";
+      } else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+        CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+        CommentStream << "Objc message ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+        CommentStream << "Objc selector ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+        CommentStream << "Objc class ref: " << ReferenceName;
+      // For these instructions, the SymbolLookUp() above is just to get the
+      // ReferenceType and ReferenceName.  We want to make sure not to
+      // fall through so we don't build an MCExpr to leave the disassembly
+      // of the immediate values of these instructions to the InstPrinter.
+      return false;
+    } else {
+      return false;
+    }
+  }
+
+  const MCExpr *Add = nullptr;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+      MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+      if (Variant != MCSymbolRefExpr::VK_None)
+        Add = MCSymbolRefExpr::create(Sym, Variant, Ctx);
+      else
+        Add = MCSymbolRefExpr::create(Sym, Ctx);
+    } else {
+      Add = MCConstantExpr::create(SymbolicOp.AddSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Sub = nullptr;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::create(Sym, Ctx);
+    } else {
+      Sub = MCConstantExpr::create(SymbolicOp.SubtractSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Off = nullptr;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::create(SymbolicOp.Value, Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::createSub(Add, Sub, Ctx);
+    else
+      LHS = MCUnaryExpr::createMinus(Sub, Ctx);
+    if (Off)
+      Expr = MCBinaryExpr::createAdd(LHS, Off, Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off)
+      Expr = MCBinaryExpr::createAdd(Add, Off, Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::create(0, Ctx);
+  }
+
+  MI.addOperand(MCOperand::createExpr(Expr));
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
new file mode 100644
index 000000000000..49e844963797
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -0,0 +1,38 @@
+//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbolize AArch64 assembly code during disassembly using callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+
+#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
+
+namespace llvm {
+
+class AArch64ExternalSymbolizer : public MCExternalSymbolizer {
+public:
+  AArch64ExternalSymbolizer(MCContext &Ctx,
+                            std::unique_ptr<MCRelocationInfo> RelInfo,
+                            LLVMOpInfoCallback GetOpInfo,
+                            LLVMSymbolLookupCallback SymbolLookUp,
+                            void *DisInfo)
+      : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp,
+                             DisInfo) {}
+
+  bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
+                                int64_t Value, uint64_t Address, bool IsBranch,
+                                uint64_t Offset, uint64_t InstSize) override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
new file mode 100644
index 000000000000..b4f85204714f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -0,0 +1,1475 @@
+//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter.inc"
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter1.inc"
+
+AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
+                                       const MCInstrInfo &MII,
+                                       const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI)
+    : AArch64InstPrinter(MAI, MII, MRI) {}
+
+void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // This is for .cfi directives.
+  OS << getRegisterName(RegNo);
+}
+
+void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot,
+                                   const MCSubtargetInfo &STI) {
+  // Check for special encodings and print the canonical alias instead.
+
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == AArch64::SYSxt)
+    if (printSysAlias(MI, STI, O)) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+  // SBFM/UBFM should print to a nicer aliased form if possible.
+  if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri ||
+      Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0);
+    const MCOperand &Op1 = MI->getOperand(1);
+    const MCOperand &Op2 = MI->getOperand(2);
+    const MCOperand &Op3 = MI->getOperand(3);
+
+    bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri);
+    bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri);
+    if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+
+      switch (Op3.getImm()) {
+      default:
+        break;
+      case 7:
+        if (IsSigned)
+          AsmMnemonic = "sxtb";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxtb";
+        break;
+      case 15:
+        if (IsSigned)
+          AsmMnemonic = "sxth";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxth";
+        break;
+      case 31:
+        // *xtw is only valid for signed 64-bit operations.
+        if (Is64Bit && IsSigned)
+          AsmMnemonic = "sxtw";
+        break;
+      }
+
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(getWRegFromXReg(Op1.getReg()));
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
+
+    // All immediate shifts are aliases, implemented using the Bitfield
+    // instruction. In all cases the immediate shift amount shift must be in
+    // the range 0 to (reg.size -1).
+    if (Op2.isImm() && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+      int shift = 0;
+      int64_t immr = Op2.getImm();
+      int64_t imms = Op3.getImm();
+      if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+        AsmMnemonic = "lsl";
+        shift = 31 - imms;
+      } else if (Opcode == AArch64::UBFMXri && imms != 0x3f &&
+                 ((imms + 1 == immr))) {
+        AsmMnemonic = "lsl";
+        shift = 63 - imms;
+      } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      }
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
+
+    // SBFIZ/UBFIZ aliases
+    if (Op2.getImm() > Op3.getImm()) {
+      O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t'
+        << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+        << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1;
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    // Otherwise SBFX/UBFX is the preferred form
+    O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t'
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+      << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1;
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0
+    const MCOperand &Op2 = MI->getOperand(2);
+    int ImmR = MI->getOperand(3).getImm();
+    int ImmS = MI->getOperand(4).getImm();
+
+    if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) &&
+        (ImmR == 0 || ImmS < ImmR)) {
+      // BFC takes precedence over its entire range, sligtly differently to BFI.
+      int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+      int LSB = (BitWidth - ImmR) % BitWidth;
+      int Width = ImmS + 1;
+
+      O << "\tbfc\t" << getRegisterName(Op0.getReg())
+        << ", #" << LSB << ", #" << Width;
+      printAnnotation(O, Annot);
+      return;
+    } else if (ImmS < ImmR) {
+      // BFI alias
+      int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+      int LSB = (BitWidth - ImmR) % BitWidth;
+      int Width = ImmS + 1;
+
+      O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", "
+        << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width;
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    int LSB = ImmR;
+    int Width = ImmS - ImmR + 1;
+    // Otherwise BFXIL the preferred form
+    O << "\tbfxil\t"
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg())
+      << ", #" << LSB << ", #" << Width;
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+  // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+  // printed.
+  if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi ||
+       Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+      MI->getOperand(1).isExpr()) {
+    if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi)
+      O << "\tmovz\t";
+    else
+      O << "\tmovn\t";
+
+    O << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+    MI->getOperand(1).getExpr()->print(O, &MAI);
+    return;
+  }
+
+  if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
+      MI->getOperand(2).isExpr()) {
+    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+    MI->getOperand(2).getExpr()->print(O, &MAI);
+    return;
+  }
+
+  // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their
+  // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 >
+  // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction
+  // that can represent the move is the MOV alias, and the rest get printed
+  // normally.
+  if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) &&
+      MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+    int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32;
+    int Shift = MI->getOperand(2).getImm();
+    uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift;
+
+    if (AArch64_AM::isMOVZMovAlias(Value, Shift,
+                                   Opcode == AArch64::MOVZXi ? 64 : 32)) {
+      O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+        << formatImm(SignExtend64(Value, RegWidth));
+      return;
+    }
+  }
+
+  if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+      MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) {
+    int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32;
+    int Shift = MI->getOperand(2).getImm();
+    uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift);
+    if (RegWidth == 32)
+      Value = Value & 0xffffffff;
+
+    if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) {
+      O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+        << formatImm(SignExtend64(Value, RegWidth));
+      return;
+    }
+  }
+
+  if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) &&
+      (MI->getOperand(1).getReg() == AArch64::XZR ||
+       MI->getOperand(1).getReg() == AArch64::WZR) &&
+      MI->getOperand(2).isImm()) {
+    int RegWidth = Opcode == AArch64::ORRXri ? 64 : 32;
+    uint64_t Value = AArch64_AM::decodeLogicalImmediate(
+        MI->getOperand(2).getImm(), RegWidth);
+    if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) {
+      O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+        << formatImm(SignExtend64(Value, RegWidth));
+      return;
+    }
+  }
+
+  if (!printAliasInstr(MI, STI, O))
+    printInstruction(MI, STI, O);
+
+  printAnnotation(O, Annot);
+}
+
+static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+                                bool &IsTbx) {
+  switch (Opcode) {
+  case AArch64::TBXv8i8One:
+  case AArch64::TBXv8i8Two:
+  case AArch64::TBXv8i8Three:
+  case AArch64::TBXv8i8Four:
+    IsTbx = true;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBLv8i8One:
+  case AArch64::TBLv8i8Two:
+  case AArch64::TBLv8i8Three:
+  case AArch64::TBLv8i8Four:
+    IsTbx = false;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBXv16i8One:
+  case AArch64::TBXv16i8Two:
+  case AArch64::TBXv16i8Three:
+  case AArch64::TBXv16i8Four:
+    IsTbx = true;
+    Layout = ".16b";
+    return true;
+  case AArch64::TBLv16i8One:
+  case AArch64::TBLv16i8Two:
+  case AArch64::TBLv16i8Three:
+  case AArch64::TBLv16i8Four:
+    IsTbx = false;
+    Layout = ".16b";
+    return true;
+  default:
+    return false;
+  }
+}
+
+struct LdStNInstrDesc {
+  unsigned Opcode;
+  const char *Mnemonic;
+  const char *Layout;
+  int ListOperand;
+  bool HasLane;
+  int NaturalOffset;
+};
+
+static const LdStNInstrDesc LdStNInstInfo[] = {
+  { AArch64::LD1i8,             "ld1",  ".b",     1, true,  0  },
+  { AArch64::LD1i16,            "ld1",  ".h",     1, true,  0  },
+  { AArch64::LD1i32,            "ld1",  ".s",     1, true,  0  },
+  { AArch64::LD1i64,            "ld1",  ".d",     1, true,  0  },
+  { AArch64::LD1i8_POST,        "ld1",  ".b",     2, true,  1  },
+  { AArch64::LD1i16_POST,       "ld1",  ".h",     2, true,  2  },
+  { AArch64::LD1i32_POST,       "ld1",  ".s",     2, true,  4  },
+  { AArch64::LD1i64_POST,       "ld1",  ".d",     2, true,  8  },
+  { AArch64::LD1Rv16b,          "ld1r", ".16b",   0, false, 0  },
+  { AArch64::LD1Rv8h,           "ld1r", ".8h",    0, false, 0  },
+  { AArch64::LD1Rv4s,           "ld1r", ".4s",    0, false, 0  },
+  { AArch64::LD1Rv2d,           "ld1r", ".2d",    0, false, 0  },
+  { AArch64::LD1Rv8b,           "ld1r", ".8b",    0, false, 0  },
+  { AArch64::LD1Rv4h,           "ld1r", ".4h",    0, false, 0  },
+  { AArch64::LD1Rv2s,           "ld1r", ".2s",    0, false, 0  },
+  { AArch64::LD1Rv1d,           "ld1r", ".1d",    0, false, 0  },
+  { AArch64::LD1Rv16b_POST,     "ld1r", ".16b",   1, false, 1  },
+  { AArch64::LD1Rv8h_POST,      "ld1r", ".8h",    1, false, 2  },
+  { AArch64::LD1Rv4s_POST,      "ld1r", ".4s",    1, false, 4  },
+  { AArch64::LD1Rv2d_POST,      "ld1r", ".2d",    1, false, 8  },
+  { AArch64::LD1Rv8b_POST,      "ld1r", ".8b",    1, false, 1  },
+  { AArch64::LD1Rv4h_POST,      "ld1r", ".4h",    1, false, 2  },
+  { AArch64::LD1Rv2s_POST,      "ld1r", ".2s",    1, false, 4  },
+  { AArch64::LD1Rv1d_POST,      "ld1r", ".1d",    1, false, 8  },
+  { AArch64::LD1Onev16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Onev8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Onev4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Onev2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Onev8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Onev4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Onev2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Onev1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Onev16b_POST,   "ld1",  ".16b",   1, false, 16 },
+  { AArch64::LD1Onev8h_POST,    "ld1",  ".8h",    1, false, 16 },
+  { AArch64::LD1Onev4s_POST,    "ld1",  ".4s",    1, false, 16 },
+  { AArch64::LD1Onev2d_POST,    "ld1",  ".2d",    1, false, 16 },
+  { AArch64::LD1Onev8b_POST,    "ld1",  ".8b",    1, false, 8  },
+  { AArch64::LD1Onev4h_POST,    "ld1",  ".4h",    1, false, 8  },
+  { AArch64::LD1Onev2s_POST,    "ld1",  ".2s",    1, false, 8  },
+  { AArch64::LD1Onev1d_POST,    "ld1",  ".1d",    1, false, 8  },
+  { AArch64::LD1Twov16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Twov8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Twov4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Twov2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Twov8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Twov4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Twov2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Twov1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Twov16b_POST,   "ld1",  ".16b",   1, false, 32 },
+  { AArch64::LD1Twov8h_POST,    "ld1",  ".8h",    1, false, 32 },
+  { AArch64::LD1Twov4s_POST,    "ld1",  ".4s",    1, false, 32 },
+  { AArch64::LD1Twov2d_POST,    "ld1",  ".2d",    1, false, 32 },
+  { AArch64::LD1Twov8b_POST,    "ld1",  ".8b",    1, false, 16 },
+  { AArch64::LD1Twov4h_POST,    "ld1",  ".4h",    1, false, 16 },
+  { AArch64::LD1Twov2s_POST,    "ld1",  ".2s",    1, false, 16 },
+  { AArch64::LD1Twov1d_POST,    "ld1",  ".1d",    1, false, 16 },
+  { AArch64::LD1Threev16b,      "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Threev8h,       "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Threev4s,       "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Threev2d,       "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Threev8b,       "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Threev4h,       "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Threev2s,       "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Threev1d,       "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Threev16b_POST, "ld1",  ".16b",   1, false, 48 },
+  { AArch64::LD1Threev8h_POST,  "ld1",  ".8h",    1, false, 48 },
+  { AArch64::LD1Threev4s_POST,  "ld1",  ".4s",    1, false, 48 },
+  { AArch64::LD1Threev2d_POST,  "ld1",  ".2d",    1, false, 48 },
+  { AArch64::LD1Threev8b_POST,  "ld1",  ".8b",    1, false, 24 },
+  { AArch64::LD1Threev4h_POST,  "ld1",  ".4h",    1, false, 24 },
+  { AArch64::LD1Threev2s_POST,  "ld1",  ".2s",    1, false, 24 },
+  { AArch64::LD1Threev1d_POST,  "ld1",  ".1d",    1, false, 24 },
+  { AArch64::LD1Fourv16b,       "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Fourv8h,        "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Fourv4s,        "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Fourv2d,        "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Fourv8b,        "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Fourv4h,        "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Fourv2s,        "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Fourv1d,        "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Fourv16b_POST,  "ld1",  ".16b",   1, false, 64 },
+  { AArch64::LD1Fourv8h_POST,   "ld1",  ".8h",    1, false, 64 },
+  { AArch64::LD1Fourv4s_POST,   "ld1",  ".4s",    1, false, 64 },
+  { AArch64::LD1Fourv2d_POST,   "ld1",  ".2d",    1, false, 64 },
+  { AArch64::LD1Fourv8b_POST,   "ld1",  ".8b",    1, false, 32 },
+  { AArch64::LD1Fourv4h_POST,   "ld1",  ".4h",    1, false, 32 },
+  { AArch64::LD1Fourv2s_POST,   "ld1",  ".2s",    1, false, 32 },
+  { AArch64::LD1Fourv1d_POST,   "ld1",  ".1d",    1, false, 32 },
+  { AArch64::LD2i8,             "ld2",  ".b",     1, true,  0  },
+  { AArch64::LD2i16,            "ld2",  ".h",     1, true,  0  },
+  { AArch64::LD2i32,            "ld2",  ".s",     1, true,  0  },
+  { AArch64::LD2i64,            "ld2",  ".d",     1, true,  0  },
+  { AArch64::LD2i8_POST,        "ld2",  ".b",     2, true,  2  },
+  { AArch64::LD2i16_POST,       "ld2",  ".h",     2, true,  4  },
+  { AArch64::LD2i32_POST,       "ld2",  ".s",     2, true,  8  },
+  { AArch64::LD2i64_POST,       "ld2",  ".d",     2, true,  16  },
+  { AArch64::LD2Rv16b,          "ld2r", ".16b",   0, false, 0  },
+  { AArch64::LD2Rv8h,           "ld2r", ".8h",    0, false, 0  },
+  { AArch64::LD2Rv4s,           "ld2r", ".4s",    0, false, 0  },
+  { AArch64::LD2Rv2d,           "ld2r", ".2d",    0, false, 0  },
+  { AArch64::LD2Rv8b,           "ld2r", ".8b",    0, false, 0  },
+  { AArch64::LD2Rv4h,           "ld2r", ".4h",    0, false, 0  },
+  { AArch64::LD2Rv2s,           "ld2r", ".2s",    0, false, 0  },
+  { AArch64::LD2Rv1d,           "ld2r", ".1d",    0, false, 0  },
+  { AArch64::LD2Rv16b_POST,     "ld2r", ".16b",   1, false, 2  },
+  { AArch64::LD2Rv8h_POST,      "ld2r", ".8h",    1, false, 4  },
+  { AArch64::LD2Rv4s_POST,      "ld2r", ".4s",    1, false, 8  },
+  { AArch64::LD2Rv2d_POST,      "ld2r", ".2d",    1, false, 16 },
+  { AArch64::LD2Rv8b_POST,      "ld2r", ".8b",    1, false, 2  },
+  { AArch64::LD2Rv4h_POST,      "ld2r", ".4h",    1, false, 4  },
+  { AArch64::LD2Rv2s_POST,      "ld2r", ".2s",    1, false, 8  },
+  { AArch64::LD2Rv1d_POST,      "ld2r", ".1d",    1, false, 16 },
+  { AArch64::LD2Twov16b,        "ld2",  ".16b",   0, false, 0  },
+  { AArch64::LD2Twov8h,         "ld2",  ".8h",    0, false, 0  },
+  { AArch64::LD2Twov4s,         "ld2",  ".4s",    0, false, 0  },
+  { AArch64::LD2Twov2d,         "ld2",  ".2d",    0, false, 0  },
+  { AArch64::LD2Twov8b,         "ld2",  ".8b",    0, false, 0  },
+  { AArch64::LD2Twov4h,         "ld2",  ".4h",    0, false, 0  },
+  { AArch64::LD2Twov2s,         "ld2",  ".2s",    0, false, 0  },
+  { AArch64::LD2Twov16b_POST,   "ld2",  ".16b",   1, false, 32 },
+  { AArch64::LD2Twov8h_POST,    "ld2",  ".8h",    1, false, 32 },
+  { AArch64::LD2Twov4s_POST,    "ld2",  ".4s",    1, false, 32 },
+  { AArch64::LD2Twov2d_POST,    "ld2",  ".2d",    1, false, 32 },
+  { AArch64::LD2Twov8b_POST,    "ld2",  ".8b",    1, false, 16 },
+  { AArch64::LD2Twov4h_POST,    "ld2",  ".4h",    1, false, 16 },
+  { AArch64::LD2Twov2s_POST,    "ld2",  ".2s",    1, false, 16 },
+  { AArch64::LD3i8,             "ld3",  ".b",     1, true,  0  },
+  { AArch64::LD3i16,            "ld3",  ".h",     1, true,  0  },
+  { AArch64::LD3i32,            "ld3",  ".s",     1, true,  0  },
+  { AArch64::LD3i64,            "ld3",  ".d",     1, true,  0  },
+  { AArch64::LD3i8_POST,        "ld3",  ".b",     2, true,  3  },
+  { AArch64::LD3i16_POST,       "ld3",  ".h",     2, true,  6  },
+  { AArch64::LD3i32_POST,       "ld3",  ".s",     2, true,  12  },
+  { AArch64::LD3i64_POST,       "ld3",  ".d",     2, true,  24  },
+  { AArch64::LD3Rv16b,          "ld3r", ".16b",   0, false, 0  },
+  { AArch64::LD3Rv8h,           "ld3r", ".8h",    0, false, 0  },
+  { AArch64::LD3Rv4s,           "ld3r", ".4s",    0, false, 0  },
+  { AArch64::LD3Rv2d,           "ld3r", ".2d",    0, false, 0  },
+  { AArch64::LD3Rv8b,           "ld3r", ".8b",    0, false, 0  },
+  { AArch64::LD3Rv4h,           "ld3r", ".4h",    0, false, 0  },
+  { AArch64::LD3Rv2s,           "ld3r", ".2s",    0, false, 0  },
+  { AArch64::LD3Rv1d,           "ld3r", ".1d",    0, false, 0  },
+  { AArch64::LD3Rv16b_POST,     "ld3r", ".16b",   1, false, 3  },
+  { AArch64::LD3Rv8h_POST,      "ld3r", ".8h",    1, false, 6  },
+  { AArch64::LD3Rv4s_POST,      "ld3r", ".4s",    1, false, 12 },
+  { AArch64::LD3Rv2d_POST,      "ld3r", ".2d",    1, false, 24 },
+  { AArch64::LD3Rv8b_POST,      "ld3r", ".8b",    1, false, 3  },
+  { AArch64::LD3Rv4h_POST,      "ld3r", ".4h",    1, false, 6  },
+  { AArch64::LD3Rv2s_POST,      "ld3r", ".2s",    1, false, 12 },
+  { AArch64::LD3Rv1d_POST,      "ld3r", ".1d",    1, false, 24 },
+  { AArch64::LD3Threev16b,      "ld3",  ".16b",   0, false, 0  },
+  { AArch64::LD3Threev8h,       "ld3",  ".8h",    0, false, 0  },
+  { AArch64::LD3Threev4s,       "ld3",  ".4s",    0, false, 0  },
+  { AArch64::LD3Threev2d,       "ld3",  ".2d",    0, false, 0  },
+  { AArch64::LD3Threev8b,       "ld3",  ".8b",    0, false, 0  },
+  { AArch64::LD3Threev4h,       "ld3",  ".4h",    0, false, 0  },
+  { AArch64::LD3Threev2s,       "ld3",  ".2s",    0, false, 0  },
+  { AArch64::LD3Threev16b_POST, "ld3",  ".16b",   1, false, 48 },
+  { AArch64::LD3Threev8h_POST,  "ld3",  ".8h",    1, false, 48 },
+  { AArch64::LD3Threev4s_POST,  "ld3",  ".4s",    1, false, 48 },
+  { AArch64::LD3Threev2d_POST,  "ld3",  ".2d",    1, false, 48 },
+  { AArch64::LD3Threev8b_POST,  "ld3",  ".8b",    1, false, 24 },
+  { AArch64::LD3Threev4h_POST,  "ld3",  ".4h",    1, false, 24 },
+  { AArch64::LD3Threev2s_POST,  "ld3",  ".2s",    1, false, 24 },
+  { AArch64::LD4i8,             "ld4",  ".b",     1, true,  0  },
+  { AArch64::LD4i16,            "ld4",  ".h",     1, true,  0  },
+  { AArch64::LD4i32,            "ld4",  ".s",     1, true,  0  },
+  { AArch64::LD4i64,            "ld4",  ".d",     1, true,  0  },
+  { AArch64::LD4i8_POST,        "ld4",  ".b",     2, true,  4  },
+  { AArch64::LD4i16_POST,       "ld4",  ".h",     2, true,  8  },
+  { AArch64::LD4i32_POST,       "ld4",  ".s",     2, true,  16 },
+  { AArch64::LD4i64_POST,       "ld4",  ".d",     2, true,  32 },
+  { AArch64::LD4Rv16b,          "ld4r", ".16b",   0, false, 0  },
+  { AArch64::LD4Rv8h,           "ld4r", ".8h",    0, false, 0  },
+  { AArch64::LD4Rv4s,           "ld4r", ".4s",    0, false, 0  },
+  { AArch64::LD4Rv2d,           "ld4r", ".2d",    0, false, 0  },
+  { AArch64::LD4Rv8b,           "ld4r", ".8b",    0, false, 0  },
+  { AArch64::LD4Rv4h,           "ld4r", ".4h",    0, false, 0  },
+  { AArch64::LD4Rv2s,           "ld4r", ".2s",    0, false, 0  },
+  { AArch64::LD4Rv1d,           "ld4r", ".1d",    0, false, 0  },
+  { AArch64::LD4Rv16b_POST,     "ld4r", ".16b",   1, false, 4  },
+  { AArch64::LD4Rv8h_POST,      "ld4r", ".8h",    1, false, 8  },
+  { AArch64::LD4Rv4s_POST,      "ld4r", ".4s",    1, false, 16 },
+  { AArch64::LD4Rv2d_POST,      "ld4r", ".2d",    1, false, 32 },
+  { AArch64::LD4Rv8b_POST,      "ld4r", ".8b",    1, false, 4  },
+  { AArch64::LD4Rv4h_POST,      "ld4r", ".4h",    1, false, 8  },
+  { AArch64::LD4Rv2s_POST,      "ld4r", ".2s",    1, false, 16 },
+  { AArch64::LD4Rv1d_POST,      "ld4r", ".1d",    1, false, 32 },
+  { AArch64::LD4Fourv16b,       "ld4",  ".16b",   0, false, 0  },
+  { AArch64::LD4Fourv8h,        "ld4",  ".8h",    0, false, 0  },
+  { AArch64::LD4Fourv4s,        "ld4",  ".4s",    0, false, 0  },
+  { AArch64::LD4Fourv2d,        "ld4",  ".2d",    0, false, 0  },
+  { AArch64::LD4Fourv8b,        "ld4",  ".8b",    0, false, 0  },
+  { AArch64::LD4Fourv4h,        "ld4",  ".4h",    0, false, 0  },
+  { AArch64::LD4Fourv2s,        "ld4",  ".2s",    0, false, 0  },
+  { AArch64::LD4Fourv16b_POST,  "ld4",  ".16b",   1, false, 64 },
+  { AArch64::LD4Fourv8h_POST,   "ld4",  ".8h",    1, false, 64 },
+  { AArch64::LD4Fourv4s_POST,   "ld4",  ".4s",    1, false, 64 },
+  { AArch64::LD4Fourv2d_POST,   "ld4",  ".2d",    1, false, 64 },
+  { AArch64::LD4Fourv8b_POST,   "ld4",  ".8b",    1, false, 32 },
+  { AArch64::LD4Fourv4h_POST,   "ld4",  ".4h",    1, false, 32 },
+  { AArch64::LD4Fourv2s_POST,   "ld4",  ".2s",    1, false, 32 },
+  { AArch64::ST1i8,             "st1",  ".b",     0, true,  0  },
+  { AArch64::ST1i16,            "st1",  ".h",     0, true,  0  },
+  { AArch64::ST1i32,            "st1",  ".s",     0, true,  0  },
+  { AArch64::ST1i64,            "st1",  ".d",     0, true,  0  },
+  { AArch64::ST1i8_POST,        "st1",  ".b",     1, true,  1  },
+  { AArch64::ST1i16_POST,       "st1",  ".h",     1, true,  2  },
+  { AArch64::ST1i32_POST,       "st1",  ".s",     1, true,  4  },
+  { AArch64::ST1i64_POST,       "st1",  ".d",     1, true,  8  },
+  { AArch64::ST1Onev16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Onev8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Onev4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Onev2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Onev8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Onev4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Onev2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Onev1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Onev16b_POST,   "st1",  ".16b",   1, false, 16 },
+  { AArch64::ST1Onev8h_POST,    "st1",  ".8h",    1, false, 16 },
+  { AArch64::ST1Onev4s_POST,    "st1",  ".4s",    1, false, 16 },
+  { AArch64::ST1Onev2d_POST,    "st1",  ".2d",    1, false, 16 },
+  { AArch64::ST1Onev8b_POST,    "st1",  ".8b",    1, false, 8  },
+  { AArch64::ST1Onev4h_POST,    "st1",  ".4h",    1, false, 8  },
+  { AArch64::ST1Onev2s_POST,    "st1",  ".2s",    1, false, 8  },
+  { AArch64::ST1Onev1d_POST,    "st1",  ".1d",    1, false, 8  },
+  { AArch64::ST1Twov16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Twov8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Twov4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Twov2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Twov8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Twov4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Twov2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Twov1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Twov16b_POST,   "st1",  ".16b",   1, false, 32 },
+  { AArch64::ST1Twov8h_POST,    "st1",  ".8h",    1, false, 32 },
+  { AArch64::ST1Twov4s_POST,    "st1",  ".4s",    1, false, 32 },
+  { AArch64::ST1Twov2d_POST,    "st1",  ".2d",    1, false, 32 },
+  { AArch64::ST1Twov8b_POST,    "st1",  ".8b",    1, false, 16 },
+  { AArch64::ST1Twov4h_POST,    "st1",  ".4h",    1, false, 16 },
+  { AArch64::ST1Twov2s_POST,    "st1",  ".2s",    1, false, 16 },
+  { AArch64::ST1Twov1d_POST,    "st1",  ".1d",    1, false, 16 },
+  { AArch64::ST1Threev16b,      "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Threev8h,       "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Threev4s,       "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Threev2d,       "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Threev8b,       "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Threev4h,       "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Threev2s,       "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Threev1d,       "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Threev16b_POST, "st1",  ".16b",   1, false, 48 },
+  { AArch64::ST1Threev8h_POST,  "st1",  ".8h",    1, false, 48 },
+  { AArch64::ST1Threev4s_POST,  "st1",  ".4s",    1, false, 48 },
+  { AArch64::ST1Threev2d_POST,  "st1",  ".2d",    1, false, 48 },
+  { AArch64::ST1Threev8b_POST,  "st1",  ".8b",    1, false, 24 },
+  { AArch64::ST1Threev4h_POST,  "st1",  ".4h",    1, false, 24 },
+  { AArch64::ST1Threev2s_POST,  "st1",  ".2s",    1, false, 24 },
+  { AArch64::ST1Threev1d_POST,  "st1",  ".1d",    1, false, 24 },
+  { AArch64::ST1Fourv16b,       "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Fourv8h,        "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Fourv4s,        "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Fourv2d,        "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Fourv8b,        "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Fourv4h,        "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Fourv2s,        "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Fourv1d,        "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Fourv16b_POST,  "st1",  ".16b",   1, false, 64 },
+  { AArch64::ST1Fourv8h_POST,   "st1",  ".8h",    1, false, 64 },
+  { AArch64::ST1Fourv4s_POST,   "st1",  ".4s",    1, false, 64 },
+  { AArch64::ST1Fourv2d_POST,   "st1",  ".2d",    1, false, 64 },
+  { AArch64::ST1Fourv8b_POST,   "st1",  ".8b",    1, false, 32 },
+  { AArch64::ST1Fourv4h_POST,   "st1",  ".4h",    1, false, 32 },
+  { AArch64::ST1Fourv2s_POST,   "st1",  ".2s",    1, false, 32 },
+  { AArch64::ST1Fourv1d_POST,   "st1",  ".1d",    1, false, 32 },
+  { AArch64::ST2i8,             "st2",  ".b",     0, true,  0  },
+  { AArch64::ST2i16,            "st2",  ".h",     0, true,  0  },
+  { AArch64::ST2i32,            "st2",  ".s",     0, true,  0  },
+  { AArch64::ST2i64,            "st2",  ".d",     0, true,  0  },
+  { AArch64::ST2i8_POST,        "st2",  ".b",     1, true,  2  },
+  { AArch64::ST2i16_POST,       "st2",  ".h",     1, true,  4  },
+  { AArch64::ST2i32_POST,       "st2",  ".s",     1, true,  8  },
+  { AArch64::ST2i64_POST,       "st2",  ".d",     1, true,  16 },
+  { AArch64::ST2Twov16b,        "st2",  ".16b",   0, false, 0  },
+  { AArch64::ST2Twov8h,         "st2",  ".8h",    0, false, 0  },
+  { AArch64::ST2Twov4s,         "st2",  ".4s",    0, false, 0  },
+  { AArch64::ST2Twov2d,         "st2",  ".2d",    0, false, 0  },
+  { AArch64::ST2Twov8b,         "st2",  ".8b",    0, false, 0  },
+  { AArch64::ST2Twov4h,         "st2",  ".4h",    0, false, 0  },
+  { AArch64::ST2Twov2s,         "st2",  ".2s",    0, false, 0  },
+  { AArch64::ST2Twov16b_POST,   "st2",  ".16b",   1, false, 32 },
+  { AArch64::ST2Twov8h_POST,    "st2",  ".8h",    1, false, 32 },
+  { AArch64::ST2Twov4s_POST,    "st2",  ".4s",    1, false, 32 },
+  { AArch64::ST2Twov2d_POST,    "st2",  ".2d",    1, false, 32 },
+  { AArch64::ST2Twov8b_POST,    "st2",  ".8b",    1, false, 16 },
+  { AArch64::ST2Twov4h_POST,    "st2",  ".4h",    1, false, 16 },
+  { AArch64::ST2Twov2s_POST,    "st2",  ".2s",    1, false, 16 },
+  { AArch64::ST3i8,             "st3",  ".b",     0, true,  0  },
+  { AArch64::ST3i16,            "st3",  ".h",     0, true,  0  },
+  { AArch64::ST3i32,            "st3",  ".s",     0, true,  0  },
+  { AArch64::ST3i64,            "st3",  ".d",     0, true,  0  },
+  { AArch64::ST3i8_POST,        "st3",  ".b",     1, true,  3  },
+  { AArch64::ST3i16_POST,       "st3",  ".h",     1, true,  6  },
+  { AArch64::ST3i32_POST,       "st3",  ".s",     1, true,  12 },
+  { AArch64::ST3i64_POST,       "st3",  ".d",     1, true,  24 },
+  { AArch64::ST3Threev16b,      "st3",  ".16b",   0, false, 0  },
+  { AArch64::ST3Threev8h,       "st3",  ".8h",    0, false, 0  },
+  { AArch64::ST3Threev4s,       "st3",  ".4s",    0, false, 0  },
+  { AArch64::ST3Threev2d,       "st3",  ".2d",    0, false, 0  },
+  { AArch64::ST3Threev8b,       "st3",  ".8b",    0, false, 0  },
+  { AArch64::ST3Threev4h,       "st3",  ".4h",    0, false, 0  },
+  { AArch64::ST3Threev2s,       "st3",  ".2s",    0, false, 0  },
+  { AArch64::ST3Threev16b_POST, "st3",  ".16b",   1, false, 48 },
+  { AArch64::ST3Threev8h_POST,  "st3",  ".8h",    1, false, 48 },
+  { AArch64::ST3Threev4s_POST,  "st3",  ".4s",    1, false, 48 },
+  { AArch64::ST3Threev2d_POST,  "st3",  ".2d",    1, false, 48 },
+  { AArch64::ST3Threev8b_POST,  "st3",  ".8b",    1, false, 24 },
+  { AArch64::ST3Threev4h_POST,  "st3",  ".4h",    1, false, 24 },
+  { AArch64::ST3Threev2s_POST,  "st3",  ".2s",    1, false, 24 },
+  { AArch64::ST4i8,             "st4",  ".b",     0, true,  0  },
+  { AArch64::ST4i16,            "st4",  ".h",     0, true,  0  },
+  { AArch64::ST4i32,            "st4",  ".s",     0, true,  0  },
+  { AArch64::ST4i64,            "st4",  ".d",     0, true,  0  },
+  { AArch64::ST4i8_POST,        "st4",  ".b",     1, true,  4  },
+  { AArch64::ST4i16_POST,       "st4",  ".h",     1, true,  8  },
+  { AArch64::ST4i32_POST,       "st4",  ".s",     1, true,  16 },
+  { AArch64::ST4i64_POST,       "st4",  ".d",     1, true,  32 },
+  { AArch64::ST4Fourv16b,       "st4",  ".16b",   0, false, 0  },
+  { AArch64::ST4Fourv8h,        "st4",  ".8h",    0, false, 0  },
+  { AArch64::ST4Fourv4s,        "st4",  ".4s",    0, false, 0  },
+  { AArch64::ST4Fourv2d,        "st4",  ".2d",    0, false, 0  },
+  { AArch64::ST4Fourv8b,        "st4",  ".8b",    0, false, 0  },
+  { AArch64::ST4Fourv4h,        "st4",  ".4h",    0, false, 0  },
+  { AArch64::ST4Fourv2s,        "st4",  ".2s",    0, false, 0  },
+  { AArch64::ST4Fourv16b_POST,  "st4",  ".16b",   1, false, 64 },
+  { AArch64::ST4Fourv8h_POST,   "st4",  ".8h",    1, false, 64 },
+  { AArch64::ST4Fourv4s_POST,   "st4",  ".4s",    1, false, 64 },
+  { AArch64::ST4Fourv2d_POST,   "st4",  ".2d",    1, false, 64 },
+  { AArch64::ST4Fourv8b_POST,   "st4",  ".8b",    1, false, 32 },
+  { AArch64::ST4Fourv4h_POST,   "st4",  ".4h",    1, false, 32 },
+  { AArch64::ST4Fourv2s_POST,   "st4",  ".2s",    1, false, 32 },
+};
+
+static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+  unsigned Idx;
+  for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
+    if (LdStNInstInfo[Idx].Opcode == Opcode)
+      return &LdStNInstInfo[Idx];
+
+  return nullptr;
+}
+
+void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                        StringRef Annot,
+                                        const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+  StringRef Layout, Mnemonic;
+
+  bool IsTbx;
+  if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
+    O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+      << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", ";
+
+    unsigned ListOpNum = IsTbx ? 2 : 1;
+    printVectorList(MI, ListOpNum, STI, O, "");
+
+    O << ", "
+      << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+    O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+    // Now onto the operands: first a vector list with possible lane
+    // specifier. E.g. { v0 }[2]
+    int OpNum = LdStDesc->ListOperand;
+    printVectorList(MI, OpNum++, STI, O, "");
+
+    if (LdStDesc->HasLane)
+      O << '[' << MI->getOperand(OpNum++).getImm() << ']';
+
+    // Next the address: [xN]
+    unsigned AddrReg = MI->getOperand(OpNum++).getReg();
+    O << ", [" << getRegisterName(AddrReg) << ']';
+
+    // Finally, there might be a post-indexed offset.
+    if (LdStDesc->NaturalOffset != 0) {
+      unsigned Reg = MI->getOperand(OpNum++).getReg();
+      if (Reg != AArch64::XZR)
+        O << ", " << getRegisterName(Reg);
+      else {
+        assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
+        O << ", #" << LdStDesc->NaturalOffset;
+      }
+    }
+
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  AArch64InstPrinter::printInst(MI, O, Annot, STI);
+}
+
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+#ifndef NDEBUG
+  unsigned Opcode = MI->getOpcode();
+  assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
+#endif
+
+  const char *Asm = nullptr;
+  const MCOperand &Op1 = MI->getOperand(0);
+  const MCOperand &Cn = MI->getOperand(1);
+  const MCOperand &Cm = MI->getOperand(2);
+  const MCOperand &Op2 = MI->getOperand(3);
+
+  unsigned Op1Val = Op1.getImm();
+  unsigned CnVal = Cn.getImm();
+  unsigned CmVal = Cm.getImm();
+  unsigned Op2Val = Op2.getImm();
+
+  if (CnVal == 7) {
+    switch (CmVal) {
+    default:
+      break;
+
+    // IC aliases
+    case 1:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tialluis";
+      break;
+    case 5:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tiallu";
+      else if (Op1Val == 3 && Op2Val == 1)
+        Asm = "ic\tivau";
+      break;
+
+    // DC aliases
+    case 4:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tzva";
+      break;
+    case 6:
+      if (Op1Val == 0 && Op2Val == 1)
+        Asm = "dc\tivac";
+      if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tisw";
+      break;
+    case 10:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcsw";
+      break;
+    case 11:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvau";
+      break;
+    case 12:
+      if (Op1Val == 3 && Op2Val == 1 &&
+          (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
+        Asm = "dc\tcvap";
+      break;
+    case 14:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcivac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcisw";
+      break;
+
+    // AT aliases
+    case 8:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e1r"; break;
+        case 1: Asm = "at\ts1e1w"; break;
+        case 2: Asm = "at\ts1e0r"; break;
+        case 3: Asm = "at\ts1e0w"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e2r"; break;
+        case 1: Asm = "at\ts1e2w"; break;
+        case 4: Asm = "at\ts12e1r"; break;
+        case 5: Asm = "at\ts12e1w"; break;
+        case 6: Asm = "at\ts12e0r"; break;
+        case 7: Asm = "at\ts12e0w"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e3r"; break;
+        case 1: Asm = "at\ts1e3w"; break;
+        }
+        break;
+      }
+      break;
+    case 9:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
+          switch (Op2Val) {
+          default:
+            break;
+          case 0: Asm = "at\ts1e1rp"; break;
+          case 1: Asm = "at\ts1e1wp"; break;
+          }
+        }
+        break;
+      }
+    }
+  } else if (CnVal == 8) {
+    // TLBI aliases
+    switch (CmVal) {
+    default:
+      break;
+    case 3:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1is"; break;
+        case 1: Asm = "tlbi\tvae1is"; break;
+        case 2: Asm = "tlbi\taside1is"; break;
+        case 3: Asm = "tlbi\tvaae1is"; break;
+        case 5: Asm = "tlbi\tvale1is"; break;
+        case 7: Asm = "tlbi\tvaale1is"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2is"; break;
+        case 1: Asm = "tlbi\tvae2is"; break;
+        case 4: Asm = "tlbi\talle1is"; break;
+        case 5: Asm = "tlbi\tvale2is"; break;
+        case 6: Asm = "tlbi\tvmalls12e1is"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3is"; break;
+        case 1: Asm = "tlbi\tvae3is"; break;
+        case 5: Asm = "tlbi\tvale3is"; break;
+        }
+        break;
+      }
+      break;
+    case 0:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1is"; break;
+        case 5: Asm = "tlbi\tipas2le1is"; break;
+        }
+        break;
+      }
+      break;
+    case 4:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1"; break;
+        case 5: Asm = "tlbi\tipas2le1"; break;
+        }
+        break;
+      }
+      break;
+    case 7:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1"; break;
+        case 1: Asm = "tlbi\tvae1"; break;
+        case 2: Asm = "tlbi\taside1"; break;
+        case 3: Asm = "tlbi\tvaae1"; break;
+        case 5: Asm = "tlbi\tvale1"; break;
+        case 7: Asm = "tlbi\tvaale1"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2"; break;
+        case 1: Asm = "tlbi\tvae2"; break;
+        case 4: Asm = "tlbi\talle1"; break;
+        case 5: Asm = "tlbi\tvale2"; break;
+        case 6: Asm = "tlbi\tvmalls12e1"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3"; break;
+        case 1: Asm = "tlbi\tvae3";  break;
+        case 5: Asm = "tlbi\tvale3"; break;
+        }
+        break;
+      }
+      break;
+    }
+  }
+
+  if (Asm) {
+    unsigned Reg = MI->getOperand(4).getReg();
+
+    O << '\t' << Asm;
+    if (StringRef(Asm).lower().find("all") == StringRef::npos)
+      O << ", " << getRegisterName(Reg);
+  }
+
+  return Asm != nullptr;
+}
+
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    O << getRegisterName(Reg);
+  } else if (Op.isImm()) {
+    printImm(MI, OpNo, STI, O);
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << "#" << formatImm(Op.getImm());
+}
+
+void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << format("#%#llx", Op.getImm());
+}
+
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                                             unsigned Imm, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Reg == AArch64::XZR)
+      O << "#" << Imm;
+    else
+      O << getRegisterName(Reg);
+  } else
+    llvm_unreachable("unknown operand kind in printPostIncOperand64");
+}
+
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isReg() && "Non-register vreg operand!");
+  unsigned Reg = Op.getReg();
+  O << getRegisterName(Reg, AArch64::vreg);
+}
+
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+  O << "c" << Op.getImm();
+}
+
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    unsigned Val = (MO.getImm() & 0xfff);
+    assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+    unsigned Shift =
+        AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+    O << '#' << formatImm(Val);
+    if (Shift != 0)
+      printShifter(MI, OpNum + 1, STI, O);
+
+    if (CommentStream)
+      *CommentStream << '=' << formatImm(Val << Shift) << '\n';
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    MO.getExpr()->print(O, &MAI);
+    printShifter(MI, OpNum + 1, STI, O);
+  }
+}
+
+void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
+}
+
+void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
+}
+
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  // LSL #0 should not be printed.
+  if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL &&
+      AArch64_AM::getShiftValue(Val) == 0)
+    return;
+  O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val))
+    << " #" << AArch64_AM::getShiftValue(Val);
+}
+
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printShifter(MI, OpNum + 1, STI, O);
+}
+
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printArithExtend(MI, OpNum + 1, STI, O);
+}
+
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val);
+  unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val);
+
+  // If the destination or first source register operand is [W]SP, print
+  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+  // all.
+  if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) {
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) &&
+          ExtType == AArch64_AM::UXTX) ||
+         ((Dest == AArch64::WSP || Src1 == AArch64::WSP) &&
+          ExtType == AArch64_AM::UXTW) ) {
+      if (ShiftVal != 0)
+        O << ", lsl #" << ShiftVal;
+      return;
+    }
+  }
+  O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+  if (ShiftVal != 0)
+    O << " #" << ShiftVal;
+}
+
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O, char SrcRegKind,
+                                        unsigned Width) {
+  unsigned SignExtend = MI->getOperand(OpNum).getImm();
+  unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
+
+  // sxtw, sxtx, uxtw or lsl (== uxtx)
+  bool IsLSL = !SignExtend && SrcRegKind == 'x';
+  if (IsLSL)
+    O << "lsl";
+  else
+    O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
+
+  if (DoShift || IsLSL)
+    O << " #" << Log2_32(Width / 8);
+}
+
+void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(CC);
+}
+
+void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC));
+}
+
+void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
+}
+
+template<int Scale>
+void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm());
+}
+
+void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                                           unsigned Scale, raw_ostream &O) {
+  const MCOperand MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "#" << formatImm(MO.getImm() * Scale);
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    MO.getExpr()->print(O, &MAI);
+  }
+}
+
+void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                                          unsigned Scale, raw_ostream &O) {
+  const MCOperand MO1 = MI->getOperand(OpNum + 1);
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+  if (MO1.isImm()) {
+      O << ", #" << formatImm(MO1.getImm() * Scale);
+  } else {
+    assert(MO1.isExpr() && "Unexpected operand type!");
+    O << ", ";
+    MO1.getExpr()->print(O, &MAI);
+  }
+  O << ']';
+}
+
+void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned prfop = MI->getOperand(OpNum).getImm();
+  auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop);
+  if (PRFM)
+    O << PRFM->Name;
+  else
+    O << '#' << formatImm(prfop);
+}
+
+void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned psbhintop = MI->getOperand(OpNum).getImm();
+  auto PSB = AArch64PSBHint::lookupPSBByEncoding(psbhintop);
+  if (PSB)
+    O << PSB->Name;
+  else
+    O << '#' << formatImm(psbhintop);
+}
+
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  float FPImm =
+      MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm());
+
+  // 8 decimal places are enough to perfectly represent permitted floats.
+  O << format("#%.8f", FPImm);
+}
+
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+  while (Stride--) {
+    switch (Reg) {
+    default:
+      llvm_unreachable("Vector register expected!");
+    case AArch64::Q0:  Reg = AArch64::Q1;  break;
+    case AArch64::Q1:  Reg = AArch64::Q2;  break;
+    case AArch64::Q2:  Reg = AArch64::Q3;  break;
+    case AArch64::Q3:  Reg = AArch64::Q4;  break;
+    case AArch64::Q4:  Reg = AArch64::Q5;  break;
+    case AArch64::Q5:  Reg = AArch64::Q6;  break;
+    case AArch64::Q6:  Reg = AArch64::Q7;  break;
+    case AArch64::Q7:  Reg = AArch64::Q8;  break;
+    case AArch64::Q8:  Reg = AArch64::Q9;  break;
+    case AArch64::Q9:  Reg = AArch64::Q10; break;
+    case AArch64::Q10: Reg = AArch64::Q11; break;
+    case AArch64::Q11: Reg = AArch64::Q12; break;
+    case AArch64::Q12: Reg = AArch64::Q13; break;
+    case AArch64::Q13: Reg = AArch64::Q14; break;
+    case AArch64::Q14: Reg = AArch64::Q15; break;
+    case AArch64::Q15: Reg = AArch64::Q16; break;
+    case AArch64::Q16: Reg = AArch64::Q17; break;
+    case AArch64::Q17: Reg = AArch64::Q18; break;
+    case AArch64::Q18: Reg = AArch64::Q19; break;
+    case AArch64::Q19: Reg = AArch64::Q20; break;
+    case AArch64::Q20: Reg = AArch64::Q21; break;
+    case AArch64::Q21: Reg = AArch64::Q22; break;
+    case AArch64::Q22: Reg = AArch64::Q23; break;
+    case AArch64::Q23: Reg = AArch64::Q24; break;
+    case AArch64::Q24: Reg = AArch64::Q25; break;
+    case AArch64::Q25: Reg = AArch64::Q26; break;
+    case AArch64::Q26: Reg = AArch64::Q27; break;
+    case AArch64::Q27: Reg = AArch64::Q28; break;
+    case AArch64::Q28: Reg = AArch64::Q29; break;
+    case AArch64::Q29: Reg = AArch64::Q30; break;
+    case AArch64::Q30: Reg = AArch64::Q31; break;
+    // Vector lists can wrap around.
+    case AArch64::Q31:
+      Reg = AArch64::Q0;
+      break;
+    }
+  }
+  return Reg;
+}
+
+template<unsigned size>
+void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
+                                                   unsigned OpNum,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  static_assert(size == 64 || size == 32,
+                "Template parameter must be either 32 or 64");
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64;
+  unsigned Subo = (size == 32) ? AArch64::subo32 : AArch64::subo64;
+
+  unsigned Even = MRI.getSubReg(Reg,  Sube);
+  unsigned Odd = MRI.getSubReg(Reg,  Subo);
+  O << getRegisterName(Even) << ", " << getRegisterName(Odd);
+}
+
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O,
+                                         StringRef LayoutSuffix) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  O << "{ ";
+
+  // Work out how many registers there are in the list (if there is an actual
+  // list).
+  unsigned NumRegs = 1;
+  if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+      MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
+    NumRegs = 2;
+  else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
+    NumRegs = 3;
+  else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
+    NumRegs = 4;
+
+  // Now forget about the list and find out what the first register is.
+  if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0))
+    Reg = FirstReg;
+  else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
+    Reg = FirstReg;
+
+  // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+  // printing (otherwise getRegisterName fails).
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+  }
+
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+    if (i + 1 != NumRegs)
+      O << ", ";
+  }
+
+  O << " }";
+}
+
+void
+AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                   unsigned OpNum,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  printVectorList(MI, OpNum, STI, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
+
+  printVectorList(MI, OpNum, STI, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << formatImm(Op.getImm() * 4);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
+  } else {
+    // Otherwise, just print the expression.
+    MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+  }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << formatImm(Op.getImm() * (1 << 12));
+    return;
+  }
+
+  // Otherwise, just print the expression.
+  MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+}
+
+void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  unsigned Opcode = MI->getOpcode();
+
+  StringRef Name;
+  if (Opcode == AArch64::ISB) {
+    auto ISB = AArch64ISB::lookupISBByEncoding(Val);
+    Name = ISB ? ISB->Name : "";
+  } else {
+    auto DB = AArch64DB::lookupDBByEncoding(Val);
+    Name = DB ? DB->Name : "";
+  }
+  if (!Name.empty())
+    O << Name;
+  else
+    O << "#" << Val;
+}
+
+void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  // Horrible hack for the one register that has identical encodings but
+  // different names in MSR and MRS. Because of this, one of MRS and MSR is
+  // going to get the wrong entry
+  if (Val == AArch64SysReg::DBGDTRRX_EL0) {
+    O << "DBGDTRRX_EL0";
+    return;
+  }
+
+  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+  if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
+    O << Reg->Name;
+  else
+    O << AArch64SysReg::genericRegisterString(Val);
+}
+
+void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  // Horrible hack for the one register that has identical encodings but
+  // different names in MSR and MRS. Because of this, one of MRS and MSR is
+  // going to get the wrong entry
+  if (Val == AArch64SysReg::DBGDTRTX_EL0) {
+    O << "DBGDTRTX_EL0";
+    return;
+  }
+
+  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
+  if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
+    O << Reg->Name;
+  else
+    O << AArch64SysReg::genericRegisterString(Val);
+}
+
+void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+
+  auto PState = AArch64PState::lookupPStateByEncoding(Val);
+  if (PState && PState->haveFeatures(STI.getFeatureBits()))
+    O << PState->Name;
+  else
+    O << "#" << formatImm(Val);
+}
+
+void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned RawVal = MI->getOperand(OpNo).getImm();
+  uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+  O << format("#%#016llx", Val);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
new file mode 100644
index 000000000000..65dca99ed04e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -0,0 +1,188 @@
+//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class AArch64InstPrinter : public MCInstPrinter {
+public:
+  AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+  // Autogenerated by tblgen.
+  virtual void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                                raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                               raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+
+protected:
+  bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,
+                     raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template <int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
+  }
+
+  void printVRegOperand(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  void printLogicalImm32(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printLogicalImm64(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+  }
+
+  void printCondCode(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
+
+  template <int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
+  }
+
+  template <int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+  }
+
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
+  template <int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printPSBHintOp(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVectorList(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O,
+                       StringRef LayoutSuffix);
+
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+  void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O);
+
+  template <unsigned NumLanes, char LaneKind>
+  void printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVectorIndex(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBarrierOption(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMSRSystemRegister(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMRSSystemRegister(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSystemPStateField(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSIMDType10Operand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  template<unsigned size>
+  void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O);
+};
+
+class AArch64AppleInstPrinter : public AArch64InstPrinter {
+public:
+  AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                          const MCRegisterInfo &MRI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                        raw_ostream &O) override;
+  bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                       raw_ostream &O) override;
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx,
+                               const MCSubtargetInfo &STI,
+                               raw_ostream &O) override;
+  StringRef getRegName(unsigned RegNo) const override {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
new file mode 100644
index 000000000000..3e5ef4df4706
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -0,0 +1,803 @@
+//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// AArch64_AM - AArch64 Addressing Mode Stuff
+namespace AArch64_AM {
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//
+
+enum ShiftExtendType {
+  InvalidShiftExtend = -1,
+  LSL = 0,
+  LSR,
+  ASR,
+  ROR,
+  MSL,
+
+  UXTB,
+  UXTH,
+  UXTW,
+  UXTX,
+
+  SXTB,
+  SXTH,
+  SXTW,
+  SXTX,
+};
+
+/// getShiftName - Get the string encoding for the shift type.
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
+  switch (ST) {
+  default: llvm_unreachable("unhandled shift type!");
+  case AArch64_AM::LSL: return "lsl";
+  case AArch64_AM::LSR: return "lsr";
+  case AArch64_AM::ASR: return "asr";
+  case AArch64_AM::ROR: return "ror";
+  case AArch64_AM::MSL: return "msl";
+  case AArch64_AM::UXTB: return "uxtb";
+  case AArch64_AM::UXTH: return "uxth";
+  case AArch64_AM::UXTW: return "uxtw";
+  case AArch64_AM::UXTX: return "uxtx";
+  case AArch64_AM::SXTB: return "sxtb";
+  case AArch64_AM::SXTH: return "sxth";
+  case AArch64_AM::SXTW: return "sxtw";
+  case AArch64_AM::SXTX: return "sxtx";
+  }
+  return nullptr;
+}
+
+/// getShiftType - Extract the shift type.
+static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+  switch ((Imm >> 6) & 0x7) {
+  default: return AArch64_AM::InvalidShiftExtend;
+  case 0: return AArch64_AM::LSL;
+  case 1: return AArch64_AM::LSR;
+  case 2: return AArch64_AM::ASR;
+  case 3: return AArch64_AM::ROR;
+  case 4: return AArch64_AM::MSL;
+  }
+}
+
+/// getShiftValue - Extract the shift value.
+static inline unsigned getShiftValue(unsigned Imm) {
+  return Imm & 0x3f;
+}
+
+/// getShifterImm - Encode the shift type and amount:
+///   imm:     6-bit shift amount
+///   shifter: 000 ==> lsl
+///            001 ==> lsr
+///            010 ==> asr
+///            011 ==> ror
+///            100 ==> msl
+///   {8-6}  = shifter
+///   {5-0}  = imm
+static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+                                     unsigned Imm) {
+  assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+  unsigned STEnc = 0;
+  switch (ST) {
+  default:  llvm_unreachable("Invalid shift requested");
+  case AArch64_AM::LSL: STEnc = 0; break;
+  case AArch64_AM::LSR: STEnc = 1; break;
+  case AArch64_AM::ASR: STEnc = 2; break;
+  case AArch64_AM::ROR: STEnc = 3; break;
+  case AArch64_AM::MSL: STEnc = 4; break;
+  }
+  return (STEnc << 6) | (Imm & 0x3f);
+}
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+/// getArithShiftValue - get the arithmetic shift value.
+static inline unsigned getArithShiftValue(unsigned Imm) {
+  return Imm & 0x7;
+}
+
+/// getExtendType - Extract the extend type for operands of arithmetic ops.
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "invalid immediate!");
+  switch (Imm) {
+  default: llvm_unreachable("Compiler bug!");
+  case 0: return AArch64_AM::UXTB;
+  case 1: return AArch64_AM::UXTH;
+  case 2: return AArch64_AM::UXTW;
+  case 3: return AArch64_AM::UXTX;
+  case 4: return AArch64_AM::SXTB;
+  case 5: return AArch64_AM::SXTH;
+  case 6: return AArch64_AM::SXTW;
+  case 7: return AArch64_AM::SXTX;
+  }
+}
+
+static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 3) & 0x7);
+}
+
+/// Mapping from extend bits to required operation:
+///   shifter: 000 ==> uxtb
+///            001 ==> uxth
+///            010 ==> uxtw
+///            011 ==> uxtx
+///            100 ==> sxtb
+///            101 ==> sxth
+///            110 ==> sxtw
+///            111 ==> sxtx
+inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
+  switch (ET) {
+  default: llvm_unreachable("Invalid extend type requested");
+  case AArch64_AM::UXTB: return 0; break;
+  case AArch64_AM::UXTH: return 1; break;
+  case AArch64_AM::UXTW: return 2; break;
+  case AArch64_AM::UXTX: return 3; break;
+  case AArch64_AM::SXTB: return 4; break;
+  case AArch64_AM::SXTH: return 5; break;
+  case AArch64_AM::SXTW: return 6; break;
+  case AArch64_AM::SXTX: return 7; break;
+  }
+}
+
+/// getArithExtendImm - Encode the extend type and shift amount for an
+///                     arithmetic instruction:
+///   imm:     3-bit extend amount
+///   {5-3}  = shifter
+///   {2-0}  = imm3
+static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
+                                         unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
+  return (getExtendEncoding(ET) << 3) | (Imm & 0x7);
+}
+
+/// getMemDoShift - Extract the "do shift" flag value for load/store
+/// instructions.
+static inline bool getMemDoShift(unsigned Imm) {
+  return (Imm & 0x1) != 0;
+}
+
+/// getExtendType - Extract the extend type for the offset operand of
+/// loads/stores.
+static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 1) & 0x7);
+}
+
+/// getExtendImm - Encode the extend type and amount for a load/store inst:
+///   doshift:     should the offset be scaled by the access size
+///   shifter: 000 ==> uxtb
+///            001 ==> uxth
+///            010 ==> uxtw
+///            011 ==> uxtx
+///            100 ==> sxtb
+///            101 ==> sxth
+///            110 ==> sxtw
+///            111 ==> sxtx
+///   {3-1}  = shifter
+///   {0}  = doshift
+static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET,
+                                       bool DoShift) {
+  return (getExtendEncoding(ET) << 1) | unsigned(DoShift);
+}
+
+static inline uint64_t ror(uint64_t elt, unsigned size) {
+  return ((elt & 1) << (size-1)) | (elt >> 1);
+}
+
+/// processLogicalImmediate - Determine if an immediate value can be encoded
+/// as the immediate operand of a logical instruction for the given register
+/// size.  If so, return true with "encoding" set to the encoded value in
+/// the form N:immr:imms.
+static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
+                                           uint64_t &Encoding) {
+  if (Imm == 0ULL || Imm == ~0ULL ||
+      (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
+    return false;
+
+  // First, determine the element size.
+  unsigned Size = RegSize;
+
+  do {
+    Size /= 2;
+    uint64_t Mask = (1ULL << Size) - 1;
+
+    if ((Imm & Mask) != ((Imm >> Size) & Mask)) {
+      Size *= 2;
+      break;
+    }
+  } while (Size > 2);
+
+  // Second, determine the rotation to make the element be: 0^m 1^n.
+  uint32_t CTO, I;
+  uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size);
+  Imm &= Mask;
+
+  if (isShiftedMask_64(Imm)) {
+    I = countTrailingZeros(Imm);
+    assert(I < 64 && "undefined behavior");
+    CTO = countTrailingOnes(Imm >> I);
+  } else {
+    Imm |= ~Mask;
+    if (!isShiftedMask_64(~Imm))
+      return false;
+
+    unsigned CLO = countLeadingOnes(Imm);
+    I = 64 - CLO;
+    CTO = CLO + countTrailingOnes(Imm) - (64 - Size);
+  }
+
+  // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
+  // to our target value, where I is the number of RORs to go the opposite
+  // direction.
+  assert(Size > I && "I should be smaller than element size");
+  unsigned Immr = (Size - I) & (Size - 1);
+
+  // If size has a 1 in the n'th bit, create a value that has zeroes in
+  // bits [0, n] and ones above that.
+  uint64_t NImms = ~(Size-1) << 1;
+
+  // Or the CTO value into the low bits, which must be below the Nth bit
+  // bit mentioned above.
+  NImms |= (CTO-1);
+
+  // Extract the seventh bit and toggle it to create the N field.
+  unsigned N = ((NImms >> 6) & 1) ^ 1;
+
+  Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f);
+  return true;
+}
+
+/// isLogicalImmediate - Return true if the immediate is valid for a logical
+/// immediate instruction of the given register size. Return false otherwise.
+static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) {
+  uint64_t encoding;
+  return processLogicalImmediate(imm, regSize, encoding);
+}
+
+/// encodeLogicalImmediate - Return the encoded immediate value for a logical
+/// immediate instruction of the given register size.
+static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) {
+  uint64_t encoding = 0;
+  bool res = processLogicalImmediate(imm, regSize, encoding);
+  assert(res && "invalid logical immediate");
+  (void)res;
+  return encoding;
+}
+
+/// decodeLogicalImmediate - Decode a logical immediate value in the form
+/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the
+/// integer value it represents with regSize bits.
+static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) {
+  // Extract the N, imms, and immr fields.
+  unsigned N = (val >> 12) & 1;
+  unsigned immr = (val >> 6) & 0x3f;
+  unsigned imms = val & 0x3f;
+
+  assert((regSize == 64 || N == 0) && "undefined logical immediate encoding");
+  int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+  assert(len >= 0 && "undefined logical immediate encoding");
+  unsigned size = (1 << len);
+  unsigned R = immr & (size - 1);
+  unsigned S = imms & (size - 1);
+  assert(S != size - 1 && "undefined logical immediate encoding");
+  uint64_t pattern = (1ULL << (S + 1)) - 1;
+  for (unsigned i = 0; i < R; ++i)
+    pattern = ror(pattern, size);
+
+  // Replicate the pattern to fill the regSize.
+  while (size != regSize) {
+    pattern |= (pattern << size);
+    size *= 2;
+  }
+  return pattern;
+}
+
+/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value
+/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits)
+/// is a valid encoding for an integer value with regSize bits.
+static inline bool isValidDecodeLogicalImmediate(uint64_t val,
+                                                 unsigned regSize) {
+  // Extract the N and imms fields needed for checking.
+  unsigned N = (val >> 12) & 1;
+  unsigned imms = val & 0x3f;
+
+  if (regSize == 32 && N != 0) // undefined logical immediate encoding
+    return false;
+  int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f));
+  if (len < 0) // undefined logical immediate encoding
+    return false;
+  unsigned size = (1 << len);
+  unsigned S = imms & (size - 1);
+  if (S == size - 1) // undefined logical immediate encoding
+    return false;
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point Immediates
+//
+static inline float getFPImmFloat(unsigned Imm) {
+  // We expect an 8-bit binary encoding of a floating-point number here.
+  union {
+    uint32_t I;
+    float F;
+  } FPUnion;
+
+  uint8_t Sign = (Imm >> 7) & 0x1;
+  uint8_t Exp = (Imm >> 4) & 0x7;
+  uint8_t Mantissa = Imm & 0xf;
+
+  //   8-bit FP    iEEEE Float Encoding
+  //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+  //
+  // where B = NOT(b);
+
+  FPUnion.I = 0;
+  FPUnion.I |= Sign << 31;
+  FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+  FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+  FPUnion.I |= (Exp & 0x3) << 23;
+  FPUnion.I |= Mantissa << 19;
+  return FPUnion.F;
+}
+
+/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP16Imm(const APInt &Imm) {
+  uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+  int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15;  // -14 to 15
+  int32_t Mantissa = Imm.getZExtValue() & 0x3ff;  // 10 bits
+
+  // We can handle 4 bits of mantissa.
+  // mantissa = (16+UInt(e:f:g:h))/16.
+  if (Mantissa & 0x3f)
+    return -1;
+  Mantissa >>= 6;
+
+  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+  if (Exp < -3 || Exp > 4)
+    return -1;
+  Exp = ((Exp+3) & 0x7) ^ 4;
+
+  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP16Imm(const APFloat &FPImm) {
+  return getFP16Imm(FPImm.bitcastToAPInt());
+}
+
+/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP32Imm(const APInt &Imm) {
+  uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+  int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+  int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+  // We can handle 4 bits of mantissa.
+  // mantissa = (16+UInt(e:f:g:h))/16.
+  if (Mantissa & 0x7ffff)
+    return -1;
+  Mantissa >>= 19;
+  if ((Mantissa & 0xf) != Mantissa)
+    return -1;
+
+  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+  if (Exp < -3 || Exp > 4)
+    return -1;
+  Exp = ((Exp+3) & 0x7) ^ 4;
+
+  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP32Imm(const APFloat &FPImm) {
+  return getFP32Imm(FPImm.bitcastToAPInt());
+}
+
+/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+/// floating-point value. If the value cannot be represented as an 8-bit
+/// floating-point value, then return -1.
+static inline int getFP64Imm(const APInt &Imm) {
+  uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+  int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023;   // -1022 to 1023
+  uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+  // We can handle 4 bits of mantissa.
+  // mantissa = (16+UInt(e:f:g:h))/16.
+  if (Mantissa & 0xffffffffffffULL)
+    return -1;
+  Mantissa >>= 48;
+  if ((Mantissa & 0xf) != Mantissa)
+    return -1;
+
+  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+  if (Exp < -3 || Exp > 4)
+    return -1;
+  Exp = ((Exp+3) & 0x7) ^ 4;
+
+  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+}
+
+static inline int getFP64Imm(const APFloat &FPImm) {
+  return getFP64Imm(FPImm.bitcastToAPInt());
+}
+
+//===--------------------------------------------------------------------===//
+// AdvSIMD Modified Immediates
+//===--------------------------------------------------------------------===//
+
+// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType1(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffffff00ffffff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 32) | EncVal;
+}
+
+// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType2(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffff00ffffff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 40) | (EncVal << 8);
+}
+
+// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00
+static inline bool isAdvSIMDModImmType3(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xff00ffffff00ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) {
+  return (Imm & 0xff0000ULL) >> 16;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 16);
+}
+
+// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType4(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0x00ffffff00ffffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) {
+  return (Imm & 0xff000000ULL) >> 24;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 56) | (EncVal << 24);
+}
+
+// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh
+static inline bool isAdvSIMDModImmType5(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) &&
+         ((Imm & 0xff00ff00ff00ff00ULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal;
+}
+
+// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00
+static inline bool isAdvSIMDModImmType6(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) &&
+         ((Imm & 0x00ff00ff00ff00ffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8);
+}
+
+// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF
+static inline bool isAdvSIMDModImmType7(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) {
+  return (Imm & 0xff00ULL) >> 8;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL;
+}
+
+// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF
+static inline bool isAdvSIMDModImmType8(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL;
+}
+
+static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) {
+  return (Imm & 0x00ff0000ULL) >> 16;
+}
+
+// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+static inline bool isAdvSIMDModImmType9(uint64_t Imm) {
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         ((Imm >> 48) == (Imm & 0x0000ffffULL)) &&
+         ((Imm >> 56) == (Imm & 0x000000ffULL));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) {
+  return (Imm & 0xffULL);
+}
+
+static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) {
+  uint64_t EncVal = Imm;
+  EncVal |= (EncVal << 8);
+  EncVal |= (EncVal << 16);
+  EncVal |= (EncVal << 32);
+  return EncVal;
+}
+
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// cmode: 1110, op: 1
+static inline bool isAdvSIMDModImmType10(uint64_t Imm) {
+  uint64_t ByteA = Imm & 0xff00000000000000ULL;
+  uint64_t ByteB = Imm & 0x00ff000000000000ULL;
+  uint64_t ByteC = Imm & 0x0000ff0000000000ULL;
+  uint64_t ByteD = Imm & 0x000000ff00000000ULL;
+  uint64_t ByteE = Imm & 0x00000000ff000000ULL;
+  uint64_t ByteF = Imm & 0x0000000000ff0000ULL;
+  uint64_t ByteG = Imm & 0x000000000000ff00ULL;
+  uint64_t ByteH = Imm & 0x00000000000000ffULL;
+
+  return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) &&
+         (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) &&
+         (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) &&
+         (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) &&
+         (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) &&
+         (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) &&
+         (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) &&
+         (ByteH == 0ULL || ByteH == 0x00000000000000ffULL);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0;
+  uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0;
+  uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0;
+  uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0;
+  uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0xff00000000000000ULL;
+  if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL;
+  if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL;
+  if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL;
+  if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL;
+  if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL;
+  if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL;
+  if (Imm & 0x01) EncVal |= 0x00000000000000ffULL;
+  return EncVal;
+}
+
+// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00
+static inline bool isAdvSIMDModImmType11(uint64_t Imm) {
+  uint64_t BString = (Imm & 0x7E000000ULL) >> 25;
+  return ((Imm >> 32) == (Imm & 0xffffffffULL)) &&
+         (BString == 0x1f || BString == 0x20) &&
+         ((Imm & 0x0007ffff0007ffffULL) == 0);
+}
+
+static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0x80000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x20000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x01000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x00800000ULL) != 0;
+  uint8_t BitE = (Imm & 0x00400000ULL) != 0;
+  uint8_t BitF = (Imm & 0x00200000ULL) != 0;
+  uint8_t BitG = (Imm & 0x00100000ULL) != 0;
+  uint8_t BitH = (Imm & 0x00080000ULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0x80000000ULL;
+  if (Imm & 0x40) EncVal |= 0x3e000000ULL;
+  else            EncVal |= 0x40000000ULL;
+  if (Imm & 0x20) EncVal |= 0x01000000ULL;
+  if (Imm & 0x10) EncVal |= 0x00800000ULL;
+  if (Imm & 0x08) EncVal |= 0x00400000ULL;
+  if (Imm & 0x04) EncVal |= 0x00200000ULL;
+  if (Imm & 0x02) EncVal |= 0x00100000ULL;
+  if (Imm & 0x01) EncVal |= 0x00080000ULL;
+  return (EncVal << 32) | EncVal;
+}
+
+// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00
+static inline bool isAdvSIMDModImmType12(uint64_t Imm) {
+  uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54;
+  return ((BString == 0xff || BString == 0x100) &&
+         ((Imm & 0x0000ffffffffffffULL) == 0));
+}
+
+static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) {
+  uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0;
+  uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0;
+  uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0;
+  uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0;
+  uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0;
+  uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0;
+  uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0;
+  uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0;
+
+  uint8_t EncVal = BitA;
+  EncVal <<= 1;
+  EncVal |= BitB;
+  EncVal <<= 1;
+  EncVal |= BitC;
+  EncVal <<= 1;
+  EncVal |= BitD;
+  EncVal <<= 1;
+  EncVal |= BitE;
+  EncVal <<= 1;
+  EncVal |= BitF;
+  EncVal <<= 1;
+  EncVal |= BitG;
+  EncVal <<= 1;
+  EncVal |= BitH;
+  return EncVal;
+}
+
+static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
+  uint64_t EncVal = 0;
+  if (Imm & 0x80) EncVal |= 0x8000000000000000ULL;
+  if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL;
+  else            EncVal |= 0x4000000000000000ULL;
+  if (Imm & 0x20) EncVal |= 0x0020000000000000ULL;
+  if (Imm & 0x10) EncVal |= 0x0010000000000000ULL;
+  if (Imm & 0x08) EncVal |= 0x0008000000000000ULL;
+  if (Imm & 0x04) EncVal |= 0x0004000000000000ULL;
+  if (Imm & 0x02) EncVal |= 0x0002000000000000ULL;
+  if (Imm & 0x01) EncVal |= 0x0001000000000000ULL;
+  return (EncVal << 32) | EncVal;
+}
+
+inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) {
+  for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
+    if ((Value & ~(0xffffULL << Shift)) == 0)
+      return true;
+
+  return false;
+}
+
+inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) {
+  if (RegWidth == 32)
+    Value &= 0xffffffffULL;
+
+  // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+  if (Value == 0 && Shift != 0)
+    return false;
+
+  return (Value & ~(0xffffULL << Shift)) == 0;
+}
+
+inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) {
+  // MOVZ takes precedence over MOVN.
+  if (isAnyMOVZMovAlias(Value, RegWidth))
+    return false;
+
+  Value = ~Value;
+  if (RegWidth == 32)
+    Value &= 0xffffffffULL;
+
+  return isMOVZMovAlias(Value, Shift, RegWidth);
+}
+
+inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
+  if (isAnyMOVZMovAlias(Value, RegWidth))
+    return true;
+
+  // It's not a MOVZ, but it might be a MOVN.
+  Value = ~Value;
+  if (RegWidth == 32)
+    Value &= 0xffffffffULL;
+
+  return isAnyMOVZMovAlias(Value, RegWidth);
+}
+
+} // end namespace AArch64_AM
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
new file mode 100644
index 000000000000..14c0327f5fa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -0,0 +1,612 @@
+//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+
+class AArch64AsmBackend : public MCAsmBackend {
+  static const unsigned PCRelFlagVal =
+      MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+public:
+  bool IsLittleEndian;
+
+public:
+  AArch64AsmBackend(const Target &T, bool IsLittleEndian)
+     : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+
+  unsigned getNumFixupKinds() const override {
+    return AArch64::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // AArch64FixupKinds.h.
+      //
+      // Name                           Offset (bits) Size (bits)     Flags
+      { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_add_imm12", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
+      { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_movw", 5, 16, 0 },
+      { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override;
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
+
+  unsigned getPointerSize() const { return 8; }
+
+  unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const;
+};
+
+} // end anonymous namespace
+
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case AArch64::fixup_aarch64_tlsdesc_call:
+    return 0;
+
+  case FK_Data_1:
+    return 1;
+
+  case FK_Data_2:
+  case AArch64::fixup_aarch64_movw:
+    return 2;
+
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    return 3;
+
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+  case FK_Data_4:
+    return 4;
+
+  case FK_Data_8:
+    return 8;
+  }
+}
+
+static unsigned AdrImmBits(unsigned Value) {
+  unsigned lo2 = Value & 0x3;
+  unsigned hi19 = (Value & 0x1ffffc) >> 2;
+  return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx) {
+  unsigned Kind = Fixup.getKind();
+  int64_t SignedValue = static_cast<int64_t>(Value);
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+    if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    return AdrImmBits(Value & 0x1fffffULL);
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    // Signed 21-bit immediate
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Ctx && (Value & 0x3))
+      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    // Low two bits are not encoded.
+    return (Value >> 2) & 0x7ffff;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+    // Unsigned 12-bit immediate
+    if (Ctx && Value >= 0x1000)
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    return Value;
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+    // Unsigned 12-bit immediate which gets multiplied by 2
+    if (Ctx && (Value >= 0x2000))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Ctx && (Value & 0x1))
+      Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+    return Value >> 1;
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+    // Unsigned 12-bit immediate which gets multiplied by 4
+    if (Ctx && (Value >= 0x4000))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Ctx && (Value & 0x3))
+      Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+    return Value >> 2;
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+    // Unsigned 12-bit immediate which gets multiplied by 8
+    if (Ctx && (Value >= 0x8000))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Ctx && (Value & 0x7))
+      Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+    return Value >> 3;
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    // Unsigned 12-bit immediate which gets multiplied by 16
+    if (Ctx && (Value >= 0x10000))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Ctx && (Value & 0xf))
+      Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+    return Value >> 4;
+  case AArch64::fixup_aarch64_movw:
+    if (Ctx)
+      Ctx->reportError(Fixup.getLoc(),
+                       "no resolvable MOVZ/MOVK fixups supported yet");
+    return Value;
+  case AArch64::fixup_aarch64_pcrel_branch14:
+    // Signed 16-bit immediate
+    if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Ctx && (Value & 0x3))
+      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3fff;
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    // Signed 28-bit immediate
+    if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
+      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Ctx && (Value & 0x3))
+      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3ffffff;
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    return Value;
+  }
+}
+
+/// getFixupKindContainereSizeInBytes - The number of bytes of the
+/// container involved in big endian or 0 if the item is little endian
+unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const {
+  if (IsLittleEndian)
+    return 0;
+
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+    return 2;
+  case FK_Data_4:
+    return 4;
+  case FK_Data_8:
+    return 8;
+
+  case AArch64::fixup_aarch64_tlsdesc_call:
+  case AArch64::fixup_aarch64_movw:
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    // Instructions are always little endian
+    return 0;
+  }
+}
+
+void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                   unsigned DataSize, uint64_t Value,
+                                   bool IsPCRel) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  if (!Value)
+    return; // Doesn't change encoding.
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+  // Apply any target-specific value adjustments.
+  Value = adjustFixupValue(Fixup, Value, nullptr);
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // Used to point to big endian bytes.
+  unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  if (FulleSizeInBytes == 0) {
+    // Handle as little-endian
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+    }
+  } else {
+    // Handle as big-endian
+    assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+    assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = FulleSizeInBytes - 1 - i;
+      Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+    }
+  }
+}
+
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  return false;
+}
+
+bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                             uint64_t Value,
+                                             const MCRelaxableFragment *DF,
+                                             const MCAsmLayout &Layout) const {
+  // FIXME:  This isn't correct for AArch64. Just moving the "generic" logic
+  // into the targets for now.
+  //
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+                                         const MCSubtargetInfo &STI,
+                                         MCInst &Res) const {
+  llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
+}
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  OW->WriteZeros(Count % 4);
+
+  // We are properly aligned, so write NOPs as requested.
+  Count /= 4;
+  for (uint64_t i = 0; i != Count; ++i)
+    OW->write32(0xd503201f);
+  return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+  /// \brief A "frameless" leaf function, where no non-volatile registers are
+  /// saved. The return remains in LR throughout the function.
+  UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
+
+  /// \brief No compact unwind encoding available. Instead the low 23-bits of
+  /// the compact unwind encoding is the offset of the DWARF FDE in the
+  /// __eh_frame section. This mode is never used in object files. It is only
+  /// generated by the linker in final linked images, which have only DWARF info
+  /// for a function.
+  UNWIND_ARM64_MODE_DWARF = 0x03000000,
+
+  /// \brief This is a standard arm64 prologue where FP/LR are immediately
+  /// pushed on the stack, then SP is copied to FP. If there are any
+  /// non-volatile register saved, they are copied into the stack fame in pairs
+  /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
+  /// five X pairs and four D pairs can be saved, but the memory layout must be
+  /// in register number order.
+  UNWIND_ARM64_MODE_FRAME = 0x04000000,
+
+  /// \brief Frame register pair encodings.
+  UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
+  UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
+  UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
+  UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
+  UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
+  UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
+  UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
+  UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
+  UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinAArch64AsmBackend : public AArch64AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Encode compact unwind stack adjustment for frameless functions.
+  /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+  /// The stack size always needs to be 16 byte aligned.
+  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+    return (StackSize / 16) << 12;
+  }
+
+public:
+  DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+      : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+                                         MachO::CPU_SUBTYPE_ARM64_ALL);
+  }
+
+  /// \brief Generate the compact unwind encoding from the CFI directives.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    if (Instrs.empty())
+      return CU::UNWIND_ARM64_MODE_FRAMELESS;
+
+    bool HasFP = false;
+    unsigned StackSize = 0;
+
+    uint32_t CompactUnwindEncoding = 0;
+    for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Cannot handle this directive:  bail out.
+        return CU::UNWIND_ARM64_MODE_DWARF;
+      case MCCFIInstruction::OpDefCfa: {
+        // Defines a frame pointer.
+        assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+                   AArch64::FP &&
+               "Invalid frame pointer!");
+        assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+        const MCCFIInstruction &LRPush = Instrs[++i];
+        assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Link register not pushed!");
+        const MCCFIInstruction &FPPush = Instrs[++i];
+        assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Frame pointer not pushed!");
+
+        unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+        unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+        LRReg = getXRegFromWReg(LRReg);
+        FPReg = getXRegFromWReg(FPReg);
+
+        assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
+               "Pushing invalid registers for frame!");
+
+        // Indicate that the function has a frame.
+        CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
+        HasFP = true;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        assert(StackSize == 0 && "We already have the CFA offset!");
+        StackSize = std::abs(Inst.getOffset());
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Registers are saved in pairs. We expect there to be two consecutive
+        // `.cfi_offset' instructions with the appropriate registers specified.
+        unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        if (i + 1 == e)
+          return CU::UNWIND_ARM64_MODE_DWARF;
+
+        const MCCFIInstruction &Inst2 = Instrs[++i];
+        if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+          return CU::UNWIND_ARM64_MODE_DWARF;
+        unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+        // N.B. The encodings must be in register number order, and the X
+        // registers before the D registers.
+
+        // X19/X20 pair = 0x00000001,
+        // X21/X22 pair = 0x00000002,
+        // X23/X24 pair = 0x00000004,
+        // X25/X26 pair = 0x00000008,
+        // X27/X28 pair = 0x00000010
+        Reg1 = getXRegFromWReg(Reg1);
+        Reg2 = getXRegFromWReg(Reg2);
+
+        if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
+            (CompactUnwindEncoding & 0xF1E) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
+        else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
+                 (CompactUnwindEncoding & 0xF1C) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
+        else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
+                 (CompactUnwindEncoding & 0xF18) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
+        else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
+                 (CompactUnwindEncoding & 0xF10) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
+        else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
+                 (CompactUnwindEncoding & 0xF00) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
+        else {
+          Reg1 = getDRegFromBReg(Reg1);
+          Reg2 = getDRegFromBReg(Reg2);
+
+          // D8/D9 pair   = 0x00000100,
+          // D10/D11 pair = 0x00000200,
+          // D12/D13 pair = 0x00000400,
+          // D14/D15 pair = 0x00000800
+          if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
+              (CompactUnwindEncoding & 0xE00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
+          else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
+                   (CompactUnwindEncoding & 0xC00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
+          else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
+                   (CompactUnwindEncoding & 0x800) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
+          else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
+            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
+          else
+            // A pair was pushed which we cannot handle.
+            return CU::UNWIND_ARM64_MODE_DWARF;
+        }
+
+        break;
+      }
+      }
+    }
+
+    if (!HasFP) {
+      // With compact unwind info we can only represent stack adjustments of up
+      // to 65520 bytes.
+      if (StackSize > 65520)
+        return CU::UNWIND_ARM64_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
+      CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+    }
+
+    return CompactUnwindEncoding;
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    // Try to get the encoded value for the fixup as-if we're mapping it into
+    // the instruction. This allows adjustFixupValue() to issue a diagnostic
+    // if the value is invalid.
+    if (IsResolved)
+      (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+  uint8_t OSABI;
+  bool IsILP32;
+
+  ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian,
+                       bool IsILP32)
+    : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI), IsILP32(IsILP32) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+};
+
+void ELFAArch64AsmBackend::processFixupValue(
+    const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+    const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+    bool &IsResolved) {
+  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+  // ~0xfff. This means that the required offset to reach a symbol can vary by
+  // up to one step depending on where the ADRP is in memory. For example:
+  //
+  //     ADRP x0, there
+  //  there:
+  //
+  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+  // we'll need that as an offset. At any other address "there" will be in the
+  // same page as the ADRP and the instruction should encode 0x0. Assuming the
+  // section isn't 0x1000-aligned, we therefore need to delegate this decision
+  // to the linker -- a relocation!
+  if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    IsResolved = false;
+
+  // Try to get the encoded value for the fixup as-if we're mapping it into
+  // the instruction. This allows adjustFixupValue() to issue a diagnostic
+  // if the value is invalid.
+  if (IsResolved)
+    (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+}
+
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              const Triple &TheTriple,
+                                              StringRef CPU,
+                                              const MCTargetOptions &Options) {
+  if (TheTriple.isOSBinFormatMachO())
+    return new DarwinAArch64AsmBackend(T, MRI);
+
+  assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  bool IsILP32 = Options.getABIName() == "ilp32";
+  return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32);
+}
+
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              const Triple &TheTriple,
+                                              StringRef CPU,
+                                              const MCTargetOptions &Options) {
+  assert(TheTriple.isOSBinFormatELF() &&
+         "Big endian is only supported for ELF targets!");
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  bool IsILP32 = Options.getABIName() == "ilp32";
+  return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/false, IsILP32);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
new file mode 100644
index 000000000000..a1edb3cef46a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -0,0 +1,346 @@
+//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file handles ELF-specific object emission, converting LLVM's internal
+// fixups into the appropriate relocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian, bool IsILP32);
+
+  ~AArch64ELFObjectWriter() override;
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+  bool IsILP32;
+private:
+};
+}
+
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+                                               bool IsLittleEndian,
+                                               bool IsILP32)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+                              /*HasRelocationAddend*/ true),
+      IsILP32(IsILP32) {}
+
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
+
+#define R_CLS(rtype) \
+        IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
+#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\
+        "supported (LP64 eqv: " #lp64rtype ")"
+
+// assumes IsILP32 is true
+static bool isNonILP32reloc(const MCFixup &Fixup,
+                            AArch64MCExpr::VariantKind RefKind,
+                            MCContext &Ctx) {
+  if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
+    return false;
+  switch(RefKind) {
+    case AArch64MCExpr::VK_ABS_G3:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
+      return true;
+    case AArch64MCExpr::VK_ABS_G2:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
+      return true;
+    case AArch64MCExpr::VK_ABS_G2_S:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_ABS_G2_NC:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_ABS_G1_S:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_ABS_G1_NC:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_DTPREL_G2:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_DTPREL_G1_NC:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_TPREL_G2:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_TPREL_G1_NC:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_GOTTPREL_G1:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
+      return ELF::R_AARCH64_NONE;
+    case AArch64MCExpr::VK_GOTTPREL_G0_NC:
+      Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
+      return ELF::R_AARCH64_NONE;
+    default: return false;
+  }
+  return false;
+}
+
+unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
+                                              const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsPCRel) const {
+  AArch64MCExpr::VariantKind RefKind =
+      static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+  AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+  bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+  assert((!Target.getSymA() ||
+          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  assert((!Target.getSymB() ||
+          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_1:
+      Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+      return ELF::R_AARCH64_NONE;
+    case FK_Data_2:
+      return R_CLS(PREL16);
+    case FK_Data_4:
+      return R_CLS(PREL32);
+    case FK_Data_8:
+      if (IsILP32) {
+        Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data "
+                        "relocation not supported (LP64 eqv: PREL64)");
+        return ELF::R_AARCH64_NONE;
+      } else
+        return ELF::R_AARCH64_PREL64;
+    case AArch64::fixup_aarch64_pcrel_adr_imm21:
+      assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation");
+      return R_CLS(ADR_PREL_LO21);
+    case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+      if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
+        return R_CLS(ADR_PREL_PG_HI21);
+      if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
+        return R_CLS(ADR_GOT_PAGE);
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
+        return R_CLS(TLSIE_ADR_GOTTPREL_PAGE21);
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
+        return R_CLS(TLSDESC_ADR_PAGE21);
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid symbol kind for ADRP relocation");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_pcrel_branch26:
+      return R_CLS(JUMP26);
+    case AArch64::fixup_aarch64_pcrel_call26:
+      return R_CLS(CALL26);
+    case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL)
+        return R_CLS(TLSIE_LD_GOTTPREL_PREL19);
+      return R_CLS(LD_PREL_LO19);
+    case AArch64::fixup_aarch64_pcrel_branch14:
+      return R_CLS(TSTBR14);
+    case AArch64::fixup_aarch64_pcrel_branch19:
+      return R_CLS(CONDBR19);
+    default:
+      Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind");
+      return ELF::R_AARCH64_NONE;
+    }
+  } else {
+    if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
+        return ELF::R_AARCH64_NONE;
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_1:
+      Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
+      return ELF::R_AARCH64_NONE;
+    case FK_Data_2:
+      return R_CLS(ABS16);
+    case FK_Data_4:
+      return R_CLS(ABS32);
+    case FK_Data_8:
+      if (IsILP32) {
+        Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(ABS64));
+        return ELF::R_AARCH64_NONE;
+      } else
+        return ELF::R_AARCH64_ABS64;
+    case AArch64::fixup_aarch64_add_imm12:
+      if (RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+        return R_CLS(TLSLD_ADD_DTPREL_HI12);
+      if (RefKind == AArch64MCExpr::VK_TPREL_HI12)
+        return R_CLS(TLSLE_ADD_TPREL_HI12);
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC)
+        return R_CLS(TLSLD_ADD_DTPREL_LO12_NC);
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12)
+        return R_CLS(TLSLD_ADD_DTPREL_LO12);
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC)
+        return R_CLS(TLSLE_ADD_TPREL_LO12_NC);
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
+        return R_CLS(TLSLE_ADD_TPREL_LO12);
+      if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
+        return R_CLS(TLSDESC_ADD_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(ADD_ABS_LO12_NC);
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for add (uimm12) instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_ldst_imm12_scale1:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(LDST8_ABS_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return R_CLS(TLSLD_LDST8_DTPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return R_CLS(TLSLD_LDST8_DTPREL_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return R_CLS(TLSLE_LDST8_TPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return R_CLS(TLSLE_LDST8_TPREL_LO12_NC);
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for 8-bit load/store instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_ldst_imm12_scale2:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(LDST16_ABS_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return R_CLS(TLSLD_LDST16_DTPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return R_CLS(TLSLD_LDST16_DTPREL_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return R_CLS(TLSLE_LDST16_TPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return R_CLS(TLSLE_LDST16_TPREL_LO12_NC);
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for 16-bit load/store instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_ldst_imm12_scale4:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(LDST32_ABS_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return R_CLS(TLSLD_LDST32_DTPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return R_CLS(TLSLD_LDST32_DTPREL_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return R_CLS(TLSLE_LDST32_TPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return R_CLS(TLSLE_LDST32_TPREL_LO12_NC);
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for 32-bit load/store instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_ldst_imm12_scale8:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(LDST64_ABS_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
+        return R_CLS(LD64_GOT_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return R_CLS(TLSLD_LDST64_DTPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return R_CLS(TLSLD_LDST64_DTPREL_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return R_CLS(TLSLE_LDST64_TPREL_LO12);
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return R_CLS(TLSLE_LDST64_TPREL_LO12_NC);
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
+        return IsILP32 ? ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC
+                       : ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
+        return IsILP32 ? ELF::R_AARCH64_P32_TLSDESC_LD32_LO12_NC
+                       : ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for 64-bit load/store instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_ldst_imm12_scale16:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return R_CLS(LDST128_ABS_LO12_NC);
+
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for 128-bit load/store instruction");
+      return ELF::R_AARCH64_NONE;
+    // ILP32 case not reached here, tested with isNonILP32reloc
+    case AArch64::fixup_aarch64_movw:
+      if (RefKind == AArch64MCExpr::VK_ABS_G3)
+        return ELF::R_AARCH64_MOVW_UABS_G3;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2)
+        return ELF::R_AARCH64_MOVW_UABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_S)
+        return ELF::R_AARCH64_MOVW_SABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1)
+        return R_CLS(MOVW_UABS_G1);
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_S)
+        return ELF::R_AARCH64_MOVW_SABS_G1;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0)
+        return R_CLS(MOVW_UABS_G0);
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_S)
+        return R_CLS(MOVW_SABS_G0);
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
+        return R_CLS(MOVW_UABS_G0_NC);
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
+        return R_CLS(TLSLD_MOVW_DTPREL_G1);
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0)
+        return R_CLS(TLSLD_MOVW_DTPREL_G0);
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC)
+        return R_CLS(TLSLD_MOVW_DTPREL_G0_NC);
+      if (RefKind == AArch64MCExpr::VK_TPREL_G2)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1)
+        return R_CLS(TLSLE_MOVW_TPREL_G1);
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0)
+        return R_CLS(TLSLE_MOVW_TPREL_G0);
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC)
+        return R_CLS(TLSLE_MOVW_TPREL_G0_NC);
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+      Ctx.reportError(Fixup.getLoc(),
+                      "invalid fixup for movz/movk instruction");
+      return ELF::R_AARCH64_NONE;
+    case AArch64::fixup_aarch64_tlsdesc_call:
+      return R_CLS(TLSDESC_CALL);
+    default:
+      Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type");
+      return ELF::R_AARCH64_NONE;
+    }
+  }
+
+  llvm_unreachable("Unimplemented fixup -> relocation");
+}
+
+MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS,
+                                                   uint8_t OSABI,
+                                                   bool IsLittleEndian,
+                                                   bool IsILP32) {
+  MCELFObjectTargetWriter *MOTW =
+      new AArch64ELFObjectWriter(OSABI, IsLittleEndian, IsILP32);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
new file mode 100644
index 000000000000..685907a2178e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -0,0 +1,217 @@
+//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits AArch64 ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
+// regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64ELFStreamer;
+
+class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
+  formatted_raw_ostream &OS;
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+};
+
+AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS)
+  : AArch64TargetStreamer(S), OS(OS) {}
+
+void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
+  OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n";
+}
+
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+  AArch64ELFStreamer &getStreamer();
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// AArch64 ELF ABI:
+///    infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
+///
+/// In brief: $x or $d should be emitted at the start of each contiguous region
+/// of A64 code or data in a section. In practice, this emission does not rely
+/// on explicit assembler directives but on inherent properties of the
+/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
+/// instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class AArch64ELFStreamer : public MCELFStreamer {
+public:
+  friend class AArch64TargetELFStreamer;
+
+  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                     raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
+        LastEMS(EMS_None) {}
+
+  void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection().first] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section, Subsection);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
+    EmitA64MappingSymbol();
+    MCELFStreamer::EmitInstruction(Inst, STI);
+  }
+
+  /// Emit a 32-bit value as an instruction. This is only used for the .inst
+  /// directive, EmitInstruction should be used in other cases.
+  void emitInst(uint32_t Inst) {
+    char Buffer[4];
+
+    // We can't just use EmitIntValue here, as that will emit a data mapping
+    // symbol, and swap the endianness on big-endian systems (instructions are
+    // always little-endian).
+    for (unsigned I = 0; I < 4; ++I) {
+      Buffer[I] = uint8_t(Inst);
+      Inst >>= 8;
+    }
+
+    EmitA64MappingSymbol();
+    MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  void EmitBytes(StringRef Data) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_A64,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data)
+      return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitA64MappingSymbol() {
+    if (LastEMS == EMS_A64)
+      return;
+    EmitMappingSymbol("$x");
+    LastEMS = EMS_A64;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
+    EmitLabel(Symbol);
+    Symbol->setType(ELF::STT_NOTYPE);
+    Symbol->setBinding(ELF::STB_LOCAL);
+    Symbol->setExternal(false);
+  }
+
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+};
+} // end anonymous namespace
+
+AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
+  return static_cast<AArch64ELFStreamer &>(Streamer);
+}
+
+void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
+  getStreamer().emitInst(Inst);
+}
+
+namespace llvm {
+MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new AArch64TargetAsmStreamer(S, OS);
+}
+
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_pwrite_stream &OS,
+                                        MCCodeEmitter *Emitter, bool RelaxAll) {
+  AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  return S;
+}
+
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new AArch64TargetELFStreamer(S);
+  return nullptr;
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
new file mode 100644
index 000000000000..ef48203c8bc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -0,0 +1,26 @@
+//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_pwrite_stream &OS,
+                                        MCCodeEmitter *Emitter, bool RelaxAll);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
new file mode 100644
index 000000000000..0f5b765c7697
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -0,0 +1,76 @@
+//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AArch64 {
+
+enum Fixups {
+  // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADR instruction.
+  fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
+
+  // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADRP instruction.
+  fixup_aarch64_pcrel_adrp_imm21,
+
+  // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
+  //     No alignment adjustment. All value bits are encoded.
+  fixup_aarch64_add_imm12,
+
+  // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
+  // store instructions.
+  fixup_aarch64_ldst_imm12_scale1,
+  fixup_aarch64_ldst_imm12_scale2,
+  fixup_aarch64_ldst_imm12_scale4,
+  fixup_aarch64_ldst_imm12_scale8,
+  fixup_aarch64_ldst_imm12_scale16,
+
+  // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+  // pc-relative loads and generates relocations directly when necessary.
+  fixup_aarch64_ldr_pcrel_imm19,
+
+  // FIXME: comment
+  fixup_aarch64_movw,
+
+  // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch14,
+
+  // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
+  // b.cc and generates relocations directly when necessary.
+  fixup_aarch64_pcrel_branch19,
+
+  // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch26,
+
+  // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+  // immediate. Distinguished from branch26 only on ELF.
+  fixup_aarch64_pcrel_call26,
+
+  // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+  // R_AARCH64_TLSDESC_CALL relocation.
+  fixup_aarch64_tlsdesc_call,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
new file mode 100644
index 000000000000..8fc822329595
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -0,0 +1,100 @@
+//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AArch64MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterVariantTy {
+  Default = -1,
+  Generic = 0,
+  Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+    "aarch64-neon-syntax", cl::init(Default),
+    cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+    cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+               clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
+
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+  PrivateGlobalPrefix = "L";
+  PrivateLabelPrefix = "L";
+  SeparatorString = "%%";
+  CommentString = ";";
+  PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+  MCSymbol *PCSym = Context.createTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context);
+  return MCBinaryExpr::createSub(Res, PC, Context);
+}
+
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
+  if (T.getArch() == Triple::aarch64_be)
+    IsLittleEndian = false;
+
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
+  PointerSize = 8;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "//";
+  PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
+  Code32Directive = ".code\t32";
+
+  Data16bitsDirective = "\t.hword\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = "\t.xword\t";
+
+  UseDataRegionDirectives = false;
+
+  WeakRefDirective = "\t.weak\t";
+
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  UseIntegratedAssembler = true;
+
+  HasIdentDirective = true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
new file mode 100644
index 000000000000..253cd30f26ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -0,0 +1,38 @@
+//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AArch64MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class MCStreamer;
+class Target;
+class Triple;
+
+struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+  explicit AArch64MCAsmInfoDarwin();
+  const MCExpr *
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+};
+
+struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
+  explicit AArch64MCAsmInfoELF(const Triple &T);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
new file mode 100644
index 000000000000..f7058cdf2373
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -0,0 +1,603 @@
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+
+class AArch64MCCodeEmitter : public MCCodeEmitter {
+  MCContext &Ctx;
+  const MCInstrInfo &MCII;
+
+  AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const AArch64MCCodeEmitter &);     // DO NOT IMPLEMENT
+public:
+  AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : Ctx(ctx), MCII(mcii) {}
+
+  ~AArch64MCCodeEmitter() override {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate
+  /// attached to a load, store or prfm instruction. If operand requires a
+  /// relocation, record it and return zero in that part of the encoding.
+  template <uint32_t FixupKind>
+  uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+  /// target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+  /// the 2-bit shift field.
+  uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+  /// branch target.
+  uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getLoadLiteralOpValue - Return the encoded value for a load-literal
+  /// pc-relative address.
+  uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store
+  /// instruction: bit 0 is whether a shift is present, bit 1 is whether the
+  /// operation is a sign extend (as opposed to a zero extend).
+  uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+  /// branch target.
+  uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  /// getBranchTargetOpValue - Return the encoded value for an unconditional
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+  /// of a MOVZ or MOVK instruction.
+  uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+  uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+  /// shifter (MSL).
+  uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getFixedPointScaleOpValue - Return the encoded value for the
+  // FP-to-fixed-point scale factor.
+  uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                   const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
+                      const MCSubtargetInfo &STI) const;
+
+  template<int hasRs, int hasRt2> unsigned
+  fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
+                        const MCSubtargetInfo &STI) const;
+
+  unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                     const MCSubtargetInfo &STI) const;
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                MCContext &Ctx) {
+  return new AArch64MCCodeEmitter(MCII, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  assert(MO.isImm() && "did not expect relocated expression");
+  return static_cast<unsigned>(MO.getImm());
+}
+
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
+  }
+
+  return ImmVal;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+  const MCExpr *Expr = MO.getExpr();
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+  MCNumFixups += 1;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field.  The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
+  if (MO.isImm())
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << ShiftVal));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
+
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+  Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // Set the shift bit of the add instruction for relocation types
+  // R_AARCH64_TLSLE_ADD_TPREL_HI12 and R_AARCH64_TLSLD_ADD_DTPREL_HI12.
+  if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
+    AArch64MCExpr::VariantKind RefKind = A64E->getKind();
+    if (RefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+        RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+      ShiftVal = 12;
+  }
+  return ShiftVal == 0 ? 0 : (1 << ShiftVal);
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getLoadLiteralOpValue - Return the encoded value for a load-literal
+/// pc-relative address.
+uint32_t
+AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19);
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned SignExtend = MI.getOperand(OpIdx).getImm();
+  unsigned DoShift = MI.getOperand(OpIdx + 1).getImm();
+  return (SignExtend << 1) | DoShift;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+  Fixups.push_back(MCFixup::create(
+      0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14);
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::BL
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26);
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+///   00 -> 0
+///   01 -> 8
+///   10 -> 16
+///   11 -> 24
+uint32_t
+AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+
+  switch (MO.getImm()) {
+  default:
+    break;
+  case 0:
+    return 0;
+  case 8:
+    return 1;
+  case 16:
+    return 2;
+  case 24:
+    return 3;
+  }
+
+  llvm_unreachable("Invalid value for vector shift amount!");
+}
+
+/// getFixedPointScaleOpValue - Return the encoded value for the
+// FP-to-fixed-point scale factor.
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 32 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 16 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 8 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 64;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 32;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 16;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() &&
+         "Expected an immediate value for the move shift amount!");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm());
+  assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+  return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                                       const MCSubtargetInfo &STI) const {
+  // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+  // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+  // job to ensure that any bits possibly affected by this are 0. This means we
+  // must zero out bit 30 (essentially emitting a MOVN).
+  MCOperand UImm16MO = MI.getOperand(1);
+
+  // Nothing to do if there's no fixup.
+  if (UImm16MO.isImm())
+    return EncodedValue;
+
+  const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
+  switch (A64E->getKind()) {
+  case AArch64MCExpr::VK_DTPREL_G2:
+  case AArch64MCExpr::VK_DTPREL_G1:
+  case AArch64MCExpr::VK_DTPREL_G0:
+  case AArch64MCExpr::VK_GOTTPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G2:
+  case AArch64MCExpr::VK_TPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G0:
+    return EncodedValue & ~(1u << 30);
+  default:
+    // Nothing to do for an unsigned fixup.
+    return EncodedValue;
+  }
+
+
+  return EncodedValue & ~(1u << 30);
+}
+
+void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+    // following (BLR) instruction. It doesn't emit any code itself so it
+    // doesn't go through the normal TableGenerated channels.
+    MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
+    Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup));
+    return;
+  }
+
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
+                                 unsigned EncodedValue,
+                                 const MCSubtargetInfo &STI) const {
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  EncodedValue |= 0x1f << 10;
+  return EncodedValue;
+}
+
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+                                            unsigned EncodedValue,
+                                            const MCSubtargetInfo &STI) const {
+  if (!hasRs) EncodedValue |= 0x001F0000;
+  if (!hasRt2) EncodedValue |= 0x00007C00;
+
+  return EncodedValue;
+}
+
+unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
+    const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const {
+  // The Rm field of FCMP and friends is unused - it should be assembled
+  // as 0, but is ignored by the processor.
+  EncodedValue &= ~(0x1f << 16);
+  return EncodedValue;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AArch64GenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
new file mode 100644
index 000000000000..a540f49866a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -0,0 +1,145 @@
+//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
+const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind,
+                                       MCContext &Ctx) {
+  return new (Ctx) AArch64MCExpr(Expr, Kind);
+}
+
+StringRef AArch64MCExpr::getVariantKindName() const {
+  switch (static_cast<uint32_t>(getKind())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_S:            return ":abs_g2_s:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_S:            return ":abs_g1_s:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_S:            return ":abs_g0_s:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_HI12:          return ":tprel_hi12:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
+}
+
+void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (getKind() != VK_NONE)
+    OS << getVariantKindName();
+  Expr->print(OS, MAI);
+}
+
+void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
+
+MCFragment *AArch64MCExpr::findAssociatedFragment() const {
+  llvm_unreachable("FIXME: what goes here?");
+}
+
+bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                              const MCAsmLayout *Layout,
+                                              const MCFixup *Fixup) const {
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+    return false;
+
+  Res =
+      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expression");
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getSymbolLoc(Kind)) {
+  default:
+    return;
+  case VK_DTPREL:
+  case VK_GOTTPREL:
+  case VK_TPREL:
+  case VK_TLSDESC:
+    break;
+  }
+
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
new file mode 100644
index 000000000000..db36a65564ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -0,0 +1,167 @@
+//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes AArch64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class AArch64MCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NONE     = 0x000,
+
+    // Symbol locations specifying (roughly speaking) what calculation should be
+    // performed to construct the final address for the relocated
+    // symbol. E.g. direct, via the GOT, ...
+    VK_ABS      = 0x001,
+    VK_SABS     = 0x002,
+    VK_GOT      = 0x003,
+    VK_DTPREL   = 0x004,
+    VK_GOTTPREL = 0x005,
+    VK_TPREL    = 0x006,
+    VK_TLSDESC  = 0x007,
+    VK_SymLocBits = 0x00f,
+
+    // Variants specifying which part of the final address calculation is
+    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+    // MOVZ/MOVK.
+    VK_PAGE     = 0x010,
+    VK_PAGEOFF  = 0x020,
+    VK_HI12     = 0x030,
+    VK_G0       = 0x040,
+    VK_G1       = 0x050,
+    VK_G2       = 0x060,
+    VK_G3       = 0x070,
+    VK_AddressFragBits = 0x0f0,
+
+    // Whether the final relocation is a checked one (where a linker should
+    // perform a range-check on the final address) or not. Note that this field
+    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+    // on its own is a non-checked relocation. We side with ELF on being
+    // explicit about this!
+    VK_NC       = 0x100,
+
+    // Convenience definitions for referring to specific textual representations
+    // of relocation specifiers. Note that this means the "_NC" is sometimes
+    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+    // since a user would write ":lo12:").
+    VK_CALL              = VK_ABS,
+    VK_ABS_PAGE          = VK_ABS      | VK_PAGE,
+    VK_ABS_G3            = VK_ABS      | VK_G3,
+    VK_ABS_G2            = VK_ABS      | VK_G2,
+    VK_ABS_G2_S          = VK_SABS     | VK_G2,
+    VK_ABS_G2_NC         = VK_ABS      | VK_G2      | VK_NC,
+    VK_ABS_G1            = VK_ABS      | VK_G1,
+    VK_ABS_G1_S          = VK_SABS     | VK_G1,
+    VK_ABS_G1_NC         = VK_ABS      | VK_G1      | VK_NC,
+    VK_ABS_G0            = VK_ABS      | VK_G0,
+    VK_ABS_G0_S          = VK_SABS     | VK_G0,
+    VK_ABS_G0_NC         = VK_ABS      | VK_G0      | VK_NC,
+    VK_LO12              = VK_ABS      | VK_PAGEOFF | VK_NC,
+    VK_GOT_LO12          = VK_GOT      | VK_PAGEOFF | VK_NC,
+    VK_GOT_PAGE          = VK_GOT      | VK_PAGE,
+    VK_DTPREL_G2         = VK_DTPREL   | VK_G2,
+    VK_DTPREL_G1         = VK_DTPREL   | VK_G1,
+    VK_DTPREL_G1_NC      = VK_DTPREL   | VK_G1      | VK_NC,
+    VK_DTPREL_G0         = VK_DTPREL   | VK_G0,
+    VK_DTPREL_G0_NC      = VK_DTPREL   | VK_G0      | VK_NC,
+    VK_DTPREL_HI12       = VK_DTPREL   | VK_HI12,
+    VK_DTPREL_LO12       = VK_DTPREL   | VK_PAGEOFF,
+    VK_DTPREL_LO12_NC    = VK_DTPREL   | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_PAGE     = VK_GOTTPREL | VK_PAGE,
+    VK_GOTTPREL_LO12_NC  = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_G1       = VK_GOTTPREL | VK_G1,
+    VK_GOTTPREL_G0_NC    = VK_GOTTPREL | VK_G0      | VK_NC,
+    VK_TPREL_G2          = VK_TPREL    | VK_G2,
+    VK_TPREL_G1          = VK_TPREL    | VK_G1,
+    VK_TPREL_G1_NC       = VK_TPREL    | VK_G1      | VK_NC,
+    VK_TPREL_G0          = VK_TPREL    | VK_G0,
+    VK_TPREL_G0_NC       = VK_TPREL    | VK_G0      | VK_NC,
+    VK_TPREL_HI12        = VK_TPREL    | VK_HI12,
+    VK_TPREL_LO12        = VK_TPREL    | VK_PAGEOFF,
+    VK_TPREL_LO12_NC     = VK_TPREL    | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_LO12      = VK_TLSDESC  | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_PAGE      = VK_TLSDESC  | VK_PAGE,
+
+    VK_INVALID  = 0xfff
+  };
+
+private:
+  const MCExpr *Expr;
+  const VariantKind Kind;
+
+  explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind)
+    : Expr(Expr), Kind(Kind) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const AArch64MCExpr *create(const MCExpr *Expr, VariantKind Kind,
+                                   MCContext &Ctx);
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// Get the expression this modifier applies to.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+  /// @name VariantKind information extractors.
+  /// @{
+
+  static VariantKind getSymbolLoc(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_SymLocBits);
+  }
+
+  static VariantKind getAddressFrag(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
+  }
+
+  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
+
+  /// @}
+
+  /// Convert the variant kind into an ELF-appropriate modifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getVariantKindName() const;
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+
+  MCFragment *findAssociatedFragment() const override;
+
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const AArch64MCExpr *) { return true; }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
new file mode 100644
index 000000000000..e9d38d3dcf10
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -0,0 +1,169 @@
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCTargetDesc.h"
+#include "AArch64ELFStreamer.h"
+#include "AArch64MCAsmInfo.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+static MCInstrInfo *createAArch64MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAArch64MCInstrInfo(X);
+  return X;
+}
+
+static MCSubtargetInfo *
+createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  if (CPU.empty())
+    CPU = "generic";
+
+  return createAArch64MCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAArch64MCRegisterInfo(X, AArch64::LR);
+  return X;
+}
+
+static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TheTriple) {
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSBinFormatMachO())
+    MAI = new AArch64MCAsmInfoDarwin();
+  else {
+    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+    MAI = new AArch64MCAsmInfoELF(TheTriple);
+  }
+
+  // Initial state of the frame pointer is SP.
+  unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
+         "Only expect Darwin and ELF targets");
+
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  // The default MCJIT memory managers make no guarantees about where they can
+  // find an executable page; JITed code needs to be able to refer to globals
+  // no matter how far away they are.
+  else if (CM == CodeModel::JITDefault)
+    CM = CodeModel::Large;
+  else if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error(
+        "Only small and large code models are allowed on AArch64");
+}
+
+static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new AArch64InstPrinter(MAI, MII, MRI);
+  if (SyntaxVariant == 1)
+    return new AArch64AppleInstPrinter(MAI, MII, MRI);
+
+  return nullptr;
+}
+
+static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
+                                     MCAsmBackend &TAB, raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll) {
+  return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
+}
+
+static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                                       raw_pwrite_stream &OS,
+                                       MCCodeEmitter *Emitter, bool RelaxAll,
+                                       bool DWARFMustBeAtTheEnd) {
+  return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+                             DWARFMustBeAtTheEnd,
+                             /*LabelSections*/ true);
+}
+
+static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
+  return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64TargetMC() {
+  for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(),
+                    &getTheARM64Target()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo);
+
+    // Register the MC codegen info.
+    TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createAArch64MCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createAArch64MCSubtargetInfo);
+
+    // Register the MC instruction analyzer.
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createAArch64InstrAnalysis);
+
+    // Register the MC Code Emitter
+    TargetRegistry::RegisterMCCodeEmitter(*T, createAArch64MCCodeEmitter);
+
+    // Register the obj streamers.
+    TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
+    TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer);
+
+    // Register the obj target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(
+        *T, createAArch64ObjectTargetStreamer);
+
+    // Register the asm streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T,
+                                              createAArch64AsmTargetStreamer);
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createAArch64MCInstPrinter);
+  }
+
+  // Register the asm backend.
+  for (Target *T : {&getTheAArch64leTarget(), &getTheARM64Target()})
+    TargetRegistry::RegisterMCAsmBackend(*T, createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheAArch64beTarget(),
+                                       createAArch64beAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
new file mode 100644
index 000000000000..615d7dab2c51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -0,0 +1,87 @@
+//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCRegisterInfo;
+class MCObjectWriter;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class MCTargetStreamer;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheAArch64leTarget();
+Target &getTheAArch64beTarget();
+Target &getTheARM64Target();
+
+MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                          const MCRegisterInfo &MRI,
+                                          MCContext &Ctx);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions &Options);
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions &Options);
+
+MCObjectWriter *createAArch64ELFObjectWriter(raw_pwrite_stream &OS,
+                                             uint8_t OSABI,
+                                             bool IsLittleEndian,
+                                             bool IsILP32);
+
+MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS,
+                                              uint32_t CPUType,
+                                              uint32_t CPUSubtype);
+
+MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm);
+
+MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S,
+                                                    const MCSubtargetInfo &STI);
+
+} // End llvm namespace
+
+// Defines symbolic names for AArch64 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "AArch64GenRegisterInfo.inc"
+
+// Defines symbolic names for the AArch64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AArch64GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
new file mode 100644
index 000000000000..53a68527ee8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -0,0 +1,431 @@
+//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
+  bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+                                  const MCSymbolRefExpr *Sym,
+                                  unsigned &Log2Size, const MCAssembler &Asm);
+
+public:
+  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype) {}
+
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+};
+}
+
+bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
+    const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
+    unsigned &Log2Size, const MCAssembler &Asm) {
+  RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
+  Log2Size = ~0U;
+
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
+      RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
+    return true;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    Log2Size = llvm::Log2_32(4);
+    switch (Sym->getKind()) {
+    default:
+      return false;
+    case MCSymbolRefExpr::VK_PAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGEOFF:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
+      return true;
+    }
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    Log2Size = llvm::Log2_32(4);
+    // This encompasses the relocation for the whole 21-bit value.
+    switch (Sym->getKind()) {
+    default: {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "ADR/ADRP relocations must be GOT relative");
+      return false;
+    }
+    case MCSymbolRefExpr::VK_PAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_GOTPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21);
+      return true;
+    case MCSymbolRefExpr::VK_TLVPPAGE:
+      RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21);
+      return true;
+    }
+    return true;
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    Log2Size = llvm::Log2_32(4);
+    RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
+    return true;
+  }
+}
+
+static bool canUseLocalRelocation(const MCSectionMachO &Section,
+                                  const MCSymbol &Symbol, unsigned Log2Size) {
+  // Debug info sections can use local relocations.
+  if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+    return true;
+
+  // Otherwise, only pointer sized relocations are supported.
+  if (Log2Size != 3)
+    return false;
+
+  // But only if they don't point to a few forbidden sections.
+  if (!Symbol.isInSection())
+    return true;
+  const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+  if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
+    return false;
+
+  if (RefSec.getSegmentName() == "__DATA" &&
+      RefSec.getSectionName() == "__objc_classrefs")
+    return false;
+
+  // FIXME: ld64 currently handles internal pointer-sized relocations
+  // incorrectly (applying the addend twice). We should be able to return true
+  // unconditionally by this point when that's fixed.
+  return false;
+}
+
+void AArch64MachObjectWriter::recordRelocation(
+    MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment);
+  unsigned Log2Size = 0;
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned Type = 0;
+  unsigned Kind = Fixup.getKind();
+  const MCSymbol *RelSymbol = nullptr;
+
+  FixupOffset += Fixup.getOffset();
+
+  // AArch64 pcrel relocation addends do not include the section offset.
+  if (IsPCRel)
+    FixedValue += FixupOffset;
+
+  // ADRP fixups use relocations for the whole symbol value and only
+  // put the addend in the instruction itself. Clear out any value the
+  // generic code figured out from the sybmol definition.
+  if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    FixedValue = 0;
+
+  // imm19 relocations are for conditional branches, which require
+  // assembler local symbols. If we got here, that's not what we have,
+  // so complain loudly.
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                                 "conditional branch requires assembler-local"
+                                 " label. '" +
+                                     Target.getSymA()->getSymbol().getName() +
+                                     "' is external.");
+    return;
+  }
+
+  // 14-bit branch relocations should only target internal labels, and so
+  // should never get here.
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                                 "Invalid relocation on conditional branch!");
+    return;
+  }
+
+  if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+                                    Asm)) {
+    Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
+    return;
+  }
+
+  Value = Target.getConstant();
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME: Should this always be extern?
+    // SymbolNum of 0 indicates the absolute section.
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+
+    if (IsPCRel) {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "PC relative absolute relocation!");
+      return;
+
+      // FIXME: x86_64 sets the type to a branch reloc here. Should we do
+      // something similar?
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    const MCSymbol *A_Base = Asm.getAtom(*A);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    const MCSymbol *B_Base = Asm.getAtom(*B);
+
+    // Check for "_foo@got - .", which comes through here as:
+    // Ltmp0:
+    //    ... _foo@got - Ltmp0
+    if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT &&
+        Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None &&
+        Layout.getSymbolOffset(*B) ==
+            Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
+      // SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
+      Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
+      IsPCRel = 1;
+      MachO::any_relocation_info MRE;
+      MRE.r_word0 = FixupOffset;
+      MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+      Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+      return;
+    } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+               Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+      // Otherwise, neither symbol can be modified.
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "unsupported relocation of modified symbol");
+      return;
+    }
+
+    // We don't support PCrel relocations of differences.
+    if (IsPCRel) {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "unsupported pc-relative relocation of "
+                                   "difference");
+      return;
+    }
+
+    // AArch64 always uses external relocations. If there is no symbol to use as
+    // a base address (a local symbol with no preceding non-local symbol),
+    // error out.
+    //
+    // FIXME: We should probably just synthesize an external symbol and use
+    // that.
+    if (!A_Base) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + A->getName() +
+              "'. Must have non-local symbol earlier in section.");
+      return;
+    }
+    if (!B_Base) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(),
+          "unsupported relocation of local symbol '" + B->getName() +
+              "'. Must have non-local symbol earlier in section.");
+      return;
+    }
+
+    if (A_Base == B_Base && A_Base) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(), "unsupported relocation with identical base");
+      return;
+    }
+
+    Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
+             (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+                                                          *A_Base, Layout));
+    Value -= (!B->getFragment() ? 0 : Writer->getSymbolAddress(*B, Layout)) -
+             (!B_Base || !B_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+                                                          *B_Base, Layout));
+
+    Type = MachO::ARM64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+    RelSymbol = B_Base;
+    Type = MachO::ARM64_RELOC_SUBTRACTOR;
+  } else { // A + constant
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    const MCSectionMachO &Section =
+        static_cast<const MCSectionMachO &>(*Fragment->getParent());
+
+    bool CanUseLocalRelocation =
+        canUseLocalRelocation(Section, *Symbol, Log2Size);
+    if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+      const MCSection &Sec = Symbol->getSection();
+      if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+        Symbol->setUsedInReloc();
+    }
+
+    const MCSymbol *Base = Asm.getAtom(*Symbol);
+
+    // If the symbol is a variable and we weren't able to get a Base for it
+    // (i.e., it's not in the symbol table associated with a section) resolve
+    // the relocation based its expansion instead.
+    if (Symbol->isVariable() && !Base) {
+      // If the evaluation is an absolute value, just use that directly
+      // to keep things easy.
+      int64_t Res;
+      if (Symbol->getVariableValue()->evaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+
+      // FIXME: Will the Target we already have ever have any data in it
+      // we need to preserve and merge with the new Target? How about
+      // the FixedValue?
+      if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
+                                                             &Fixup)) {
+        Asm.getContext().reportError(Fixup.getLoc(),
+                                     "unable to resolve variable '" +
+                                         Symbol->getName() + "'");
+        return;
+      }
+      return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              FixedValue);
+    }
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand relocation entries and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+        Base = nullptr;
+    }
+
+    // AArch64 uses external relocations as much as possible. For debug
+    // sections, and for pointer-sized relocations (.quad), we allow section
+    // relocations.  It's code sections that run into trouble.
+    if (Base) {
+      RelSymbol = Base;
+
+      // Add the local offset, if needed.
+      if (Base != Symbol)
+        Value +=
+            Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base);
+    } else if (Symbol->isInSection()) {
+      if (!CanUseLocalRelocation) {
+        Asm.getContext().reportError(
+            Fixup.getLoc(),
+            "unsupported relocation of local symbol '" + Symbol->getName() +
+                "'. Must have non-local symbol earlier in section.");
+        return;
+      }
+      // Adjust the relocation to be section-relative.
+      // The index is the section ordinal (1-based).
+      const MCSection &Sec = Symbol->getSection();
+      Index = Sec.getOrdinal() + 1;
+      Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+      if (IsPCRel)
+        Value -= Writer->getFragmentAddress(Fragment, Layout) +
+                 Fixup.getOffset() + (1ULL << Log2Size);
+    } else {
+      // Resolve constant variables.
+      if (Symbol->isVariable()) {
+        int64_t Res;
+        if (Symbol->getVariableValue()->evaluateAsAbsolute(
+                Res, Layout, Writer->getSectionAddressMap())) {
+          FixedValue = Res;
+          return;
+        }
+      }
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                  "unsupported relocation of variable '" +
+                                      Symbol->getName() + "'");
+      return;
+    }
+  }
+
+  // If the relocation kind is Branch26, Page21, or Pageoff12, any addend
+  // is represented via an Addend relocation, not encoded directly into
+  // the instruction.
+  if ((Type == MachO::ARM64_RELOC_BRANCH26 ||
+       Type == MachO::ARM64_RELOC_PAGE21 ||
+       Type == MachO::ARM64_RELOC_PAGEOFF12) &&
+      Value) {
+    assert((Value & 0xff000000) == 0 && "Added relocation out of range!");
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 =
+        (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+
+    // Now set up the Addend relocation.
+    Type = MachO::ARM64_RELOC_ADDEND;
+    Index = Value;
+    RelSymbol = nullptr;
+    IsPCRel = 0;
+    Log2Size = 2;
+
+    // Put zero into the instruction itself. The addend is in the relocation.
+    Value = 0;
+  }
+
+  // If there's any addend left to handle, encode it in the instruction.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS,
+                                                    uint32_t CPUType,
+                                                    uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new AArch64MachObjectWriter(CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
new file mode 100644
index 000000000000..3e86a42d5be6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -0,0 +1,41 @@
+//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64TargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/ConstantPools.h"
+using namespace llvm;
+
+//
+// AArch64TargetStreamer Implemenation
+//
+AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+AArch64TargetStreamer::~AArch64TargetStreamer() {}
+
+// The constant pool handling is shared by all AArch64TargetStreamer
+// implementations.
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
+                                                          unsigned Size,
+                                                          SMLoc Loc) {
+  return ConstantPools->addEntry(Streamer, Expr, Size, Loc);
+}
+
+void AArch64TargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+void AArch64TargetStreamer::emitInst(uint32_t Inst) {}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
new file mode 100644
index 000000000000..51432830f795
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -0,0 +1,42 @@
+//===-- AArch64TargetStreamer.h - AArch64 Target Streamer ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class AArch64TargetStreamer : public MCTargetStreamer {
+public:
+  AArch64TargetStreamer(MCStreamer &S);
+  ~AArch64TargetStreamer() override;
+
+  void finish() override;
+
+  /// Callback used to implement the ldr= pseudo.
+  /// Add a new entry to the constant pool for the current section and return an
+  /// MCExpr that can be used to refer to the constant pool location.
+  const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc);
+
+  /// Callback used to implemnt the .ltorg directive.
+  /// Emit contents of constant pool for the current section.
+  void emitCurrentConstantPool();
+
+  /// Callback used to implement the .inst directive.
+  virtual void emitInst(uint32_t Inst);
+
+private:
+  std::unique_ptr<AssemblerConstantPools> ConstantPools;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
new file mode 100644
index 000000000000..7ac9a5a08484
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -0,0 +1,39 @@
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+namespace llvm {
+Target &getTheAArch64leTarget() {
+  static Target TheAArch64leTarget;
+  return TheAArch64leTarget;
+}
+Target &getTheAArch64beTarget() {
+  static Target TheAArch64beTarget;
+  return TheAArch64beTarget;
+}
+Target &getTheARM64Target() {
+  static Target TheARM64Target;
+  return TheARM64Target;
+}
+} // namespace llvm
+
+extern "C" void LLVMInitializeAArch64TargetInfo() {
+  // Now register the "arm64" name for use with "-march". We don't want it to
+  // take possession of the Triple::aarch64 tag though.
+  TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
+                                 "ARM64 (little endian)",
+                                 [](Triple::ArchType) { return false; }, true);
+
+  RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
+      getTheAArch64leTarget(), "aarch64", "AArch64 (little endian)");
+  RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
+      getTheAArch64beTarget(), "aarch64_be", "AArch64 (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
new file mode 100644
index 000000000000..e65ba1f2401d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -0,0 +1,122 @@
+//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides basic encoding and assembly information for AArch64.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64BaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+
+namespace llvm {
+  namespace AArch64AT {
+#define GET_AT_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+
+namespace llvm {
+  namespace AArch64DB {
+#define GET_DB_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64DC {
+#define GET_DC_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64IC {
+#define GET_IC_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64ISB {
+#define GET_ISB_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+namespace llvm {
+  namespace AArch64PRFM {
+#define GET_PRFM_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64PState {
+#define GET_PSTATE_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64PSBHint {
+#define GET_PSB_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+namespace llvm {
+  namespace AArch64SysReg {
+#define GET_SYSREG_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
+
+uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) {
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
+  Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$");
+
+  std::string UpperName = Name.upper();
+  SmallVector<StringRef, 5> Ops;
+  if (!GenericRegPattern.match(UpperName, &Ops))
+    return -1;
+
+  uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+  uint32_t Bits;
+  Ops[1].getAsInteger(10, Op0);
+  Ops[2].getAsInteger(10, Op1);
+  Ops[3].getAsInteger(10, CRn);
+  Ops[4].getAsInteger(10, CRm);
+  Ops[5].getAsInteger(10, Op2);
+  Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
+
+  return Bits;
+}
+
+std::string AArch64SysReg::genericRegisterString(uint32_t Bits) {
+  assert(Bits < 0x10000);
+  uint32_t Op0 = (Bits >> 14) & 0x3;
+  uint32_t Op1 = (Bits >> 11) & 0x7;
+  uint32_t CRn = (Bits >> 7) & 0xf;
+  uint32_t CRm = (Bits >> 3) & 0xf;
+  uint32_t Op2 = Bits & 0x7;
+
+  return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" +
+         utostr(CRm) + "_" + utostr(Op2);
+}
+
+namespace llvm {
+  namespace AArch64TLBI {
+#define GET_TLBI_IMPL
+#include "AArch64GenSystemOperands.inc"
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
new file mode 100644
index 000000000000..dcc39176031c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -0,0 +1,524 @@
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the AArch64 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+
+// FIXME: Is it easiest to fix this layering violation by moving the .inc
+// #includes from AArch64MCTargetDesc.h to here?
+#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends.
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+inline static unsigned getWRegFromXReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::X0: return AArch64::W0;
+  case AArch64::X1: return AArch64::W1;
+  case AArch64::X2: return AArch64::W2;
+  case AArch64::X3: return AArch64::W3;
+  case AArch64::X4: return AArch64::W4;
+  case AArch64::X5: return AArch64::W5;
+  case AArch64::X6: return AArch64::W6;
+  case AArch64::X7: return AArch64::W7;
+  case AArch64::X8: return AArch64::W8;
+  case AArch64::X9: return AArch64::W9;
+  case AArch64::X10: return AArch64::W10;
+  case AArch64::X11: return AArch64::W11;
+  case AArch64::X12: return AArch64::W12;
+  case AArch64::X13: return AArch64::W13;
+  case AArch64::X14: return AArch64::W14;
+  case AArch64::X15: return AArch64::W15;
+  case AArch64::X16: return AArch64::W16;
+  case AArch64::X17: return AArch64::W17;
+  case AArch64::X18: return AArch64::W18;
+  case AArch64::X19: return AArch64::W19;
+  case AArch64::X20: return AArch64::W20;
+  case AArch64::X21: return AArch64::W21;
+  case AArch64::X22: return AArch64::W22;
+  case AArch64::X23: return AArch64::W23;
+  case AArch64::X24: return AArch64::W24;
+  case AArch64::X25: return AArch64::W25;
+  case AArch64::X26: return AArch64::W26;
+  case AArch64::X27: return AArch64::W27;
+  case AArch64::X28: return AArch64::W28;
+  case AArch64::FP: return AArch64::W29;
+  case AArch64::LR: return AArch64::W30;
+  case AArch64::SP: return AArch64::WSP;
+  case AArch64::XZR: return AArch64::WZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+inline static unsigned getXRegFromWReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::W0: return AArch64::X0;
+  case AArch64::W1: return AArch64::X1;
+  case AArch64::W2: return AArch64::X2;
+  case AArch64::W3: return AArch64::X3;
+  case AArch64::W4: return AArch64::X4;
+  case AArch64::W5: return AArch64::X5;
+  case AArch64::W6: return AArch64::X6;
+  case AArch64::W7: return AArch64::X7;
+  case AArch64::W8: return AArch64::X8;
+  case AArch64::W9: return AArch64::X9;
+  case AArch64::W10: return AArch64::X10;
+  case AArch64::W11: return AArch64::X11;
+  case AArch64::W12: return AArch64::X12;
+  case AArch64::W13: return AArch64::X13;
+  case AArch64::W14: return AArch64::X14;
+  case AArch64::W15: return AArch64::X15;
+  case AArch64::W16: return AArch64::X16;
+  case AArch64::W17: return AArch64::X17;
+  case AArch64::W18: return AArch64::X18;
+  case AArch64::W19: return AArch64::X19;
+  case AArch64::W20: return AArch64::X20;
+  case AArch64::W21: return AArch64::X21;
+  case AArch64::W22: return AArch64::X22;
+  case AArch64::W23: return AArch64::X23;
+  case AArch64::W24: return AArch64::X24;
+  case AArch64::W25: return AArch64::X25;
+  case AArch64::W26: return AArch64::X26;
+  case AArch64::W27: return AArch64::X27;
+  case AArch64::W28: return AArch64::X28;
+  case AArch64::W29: return AArch64::FP;
+  case AArch64::W30: return AArch64::LR;
+  case AArch64::WSP: return AArch64::SP;
+  case AArch64::WZR: return AArch64::XZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+static inline unsigned getBRegFromDReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::D0:  return AArch64::B0;
+  case AArch64::D1:  return AArch64::B1;
+  case AArch64::D2:  return AArch64::B2;
+  case AArch64::D3:  return AArch64::B3;
+  case AArch64::D4:  return AArch64::B4;
+  case AArch64::D5:  return AArch64::B5;
+  case AArch64::D6:  return AArch64::B6;
+  case AArch64::D7:  return AArch64::B7;
+  case AArch64::D8:  return AArch64::B8;
+  case AArch64::D9:  return AArch64::B9;
+  case AArch64::D10: return AArch64::B10;
+  case AArch64::D11: return AArch64::B11;
+  case AArch64::D12: return AArch64::B12;
+  case AArch64::D13: return AArch64::B13;
+  case AArch64::D14: return AArch64::B14;
+  case AArch64::D15: return AArch64::B15;
+  case AArch64::D16: return AArch64::B16;
+  case AArch64::D17: return AArch64::B17;
+  case AArch64::D18: return AArch64::B18;
+  case AArch64::D19: return AArch64::B19;
+  case AArch64::D20: return AArch64::B20;
+  case AArch64::D21: return AArch64::B21;
+  case AArch64::D22: return AArch64::B22;
+  case AArch64::D23: return AArch64::B23;
+  case AArch64::D24: return AArch64::B24;
+  case AArch64::D25: return AArch64::B25;
+  case AArch64::D26: return AArch64::B26;
+  case AArch64::D27: return AArch64::B27;
+  case AArch64::D28: return AArch64::B28;
+  case AArch64::D29: return AArch64::B29;
+  case AArch64::D30: return AArch64::B30;
+  case AArch64::D31: return AArch64::B31;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+
+static inline unsigned getDRegFromBReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::B0:  return AArch64::D0;
+  case AArch64::B1:  return AArch64::D1;
+  case AArch64::B2:  return AArch64::D2;
+  case AArch64::B3:  return AArch64::D3;
+  case AArch64::B4:  return AArch64::D4;
+  case AArch64::B5:  return AArch64::D5;
+  case AArch64::B6:  return AArch64::D6;
+  case AArch64::B7:  return AArch64::D7;
+  case AArch64::B8:  return AArch64::D8;
+  case AArch64::B9:  return AArch64::D9;
+  case AArch64::B10: return AArch64::D10;
+  case AArch64::B11: return AArch64::D11;
+  case AArch64::B12: return AArch64::D12;
+  case AArch64::B13: return AArch64::D13;
+  case AArch64::B14: return AArch64::D14;
+  case AArch64::B15: return AArch64::D15;
+  case AArch64::B16: return AArch64::D16;
+  case AArch64::B17: return AArch64::D17;
+  case AArch64::B18: return AArch64::D18;
+  case AArch64::B19: return AArch64::D19;
+  case AArch64::B20: return AArch64::D20;
+  case AArch64::B21: return AArch64::D21;
+  case AArch64::B22: return AArch64::D22;
+  case AArch64::B23: return AArch64::D23;
+  case AArch64::B24: return AArch64::D24;
+  case AArch64::B25: return AArch64::D25;
+  case AArch64::B26: return AArch64::D26;
+  case AArch64::B27: return AArch64::D27;
+  case AArch64::B28: return AArch64::D28;
+  case AArch64::B29: return AArch64::D29;
+  case AArch64::B30: return AArch64::D30;
+  case AArch64::B31: return AArch64::D31;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
+
+namespace AArch64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode {  // Meaning (integer)          Meaning (floating-point)
+  EQ = 0x0,      // Equal                      Equal
+  NE = 0x1,      // Not equal                  Not equal, or unordered
+  HS = 0x2,      // Unsigned higher or same    >, ==, or unordered
+  LO = 0x3,      // Unsigned lower             Less than
+  MI = 0x4,      // Minus, negative            Less than
+  PL = 0x5,      // Plus, positive or zero     >, ==, or unordered
+  VS = 0x6,      // Overflow                   Unordered
+  VC = 0x7,      // No overflow                Not unordered
+  HI = 0x8,      // Unsigned higher            Greater than, or unordered
+  LS = 0x9,      // Unsigned lower or same     Less than or equal
+  GE = 0xa,      // Greater than or equal      Greater than or equal
+  LT = 0xb,      // Less than                  Less than, or unordered
+  GT = 0xc,      // Greater than               Greater than
+  LE = 0xd,      // Less than or equal         <, ==, or unordered
+  AL = 0xe,      // Always (unconditional)     Always (unconditional)
+  NV = 0xf,      // Always (unconditional)     Always (unconditional)
+  // Note the NV exists purely to disassemble 0b1111. Execution is "always".
+  Invalid
+};
+
+inline static const char *getCondCodeName(CondCode Code) {
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ:  return "eq";
+  case NE:  return "ne";
+  case HS:  return "hs";
+  case LO:  return "lo";
+  case MI:  return "mi";
+  case PL:  return "pl";
+  case VS:  return "vs";
+  case VC:  return "vc";
+  case HI:  return "hi";
+  case LS:  return "ls";
+  case GE:  return "ge";
+  case LT:  return "lt";
+  case GT:  return "gt";
+  case LE:  return "le";
+  case AL:  return "al";
+  case NV:  return "nv";
+  }
+}
+
+inline static CondCode getInvertedCondCode(CondCode Code) {
+  // To reverse a condition it's necessary to only invert the low bit:
+
+  return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1);
+}
+
+/// Given a condition code, return NZCV flags that would satisfy that condition.
+/// The flag bits are in the format expected by the ccmp instructions.
+/// Note that many different flag settings can satisfy a given condition code,
+/// this function just returns one of them.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+  // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+  enum { N = 8, Z = 4, C = 2, V = 1 };
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ: return Z; // Z == 1
+  case NE: return 0; // Z == 0
+  case HS: return C; // C == 1
+  case LO: return 0; // C == 0
+  case MI: return N; // N == 1
+  case PL: return 0; // N == 0
+  case VS: return V; // V == 1
+  case VC: return 0; // V == 0
+  case HI: return C; // C == 1 && Z == 0
+  case LS: return 0; // C == 0 || Z == 1
+  case GE: return 0; // N == V
+  case LT: return N; // N != V
+  case GT: return 0; // Z == 0 && N == V
+  case LE: return Z; // Z == 1 || N != V
+  }
+}
+} // end namespace AArch64CC
+
+namespace AArch64AT{
+  struct AT {
+    const char *Name;
+    uint16_t Encoding;
+  };
+
+  #define GET_AT_DECL
+  #include "AArch64GenSystemOperands.inc"
+
+}
+namespace AArch64DB {
+  struct DB {
+    const char *Name;
+    uint16_t Encoding;
+  };
+
+  #define GET_DB_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace  AArch64DC {
+  struct DC {
+    const char *Name;
+    uint16_t Encoding;
+  };
+
+  #define GET_DC_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace  AArch64IC {
+  struct IC {
+    const char *Name;
+    uint16_t Encoding;
+    bool NeedsReg;
+  };
+  #define GET_IC_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace  AArch64ISB {
+  struct ISB {
+    const char *Name;
+    uint16_t Encoding;
+  };
+  #define GET_ISB_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64PRFM {
+  struct PRFM {
+    const char *Name;
+    uint16_t Encoding;
+  };
+  #define GET_PRFM_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64PState {
+  struct PState {
+    const char *Name;
+    uint16_t Encoding;
+    FeatureBitset FeaturesRequired;
+
+    bool haveFeatures(FeatureBitset ActiveFeatures) const {
+      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+    }
+  };
+  #define GET_PSTATE_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64PSBHint {
+  struct PSB {
+    const char *Name;
+    uint16_t Encoding;
+  };
+  #define GET_PSB_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64SE {
+    enum ShiftExtSpecifiers {
+        Invalid = -1,
+        LSL,
+        MSL,
+        LSR,
+        ASR,
+        ROR,
+
+        UXTB,
+        UXTH,
+        UXTW,
+        UXTX,
+
+        SXTB,
+        SXTH,
+        SXTW,
+        SXTX
+    };
+}
+
+namespace AArch64Layout {
+    enum VectorLayout {
+        Invalid = -1,
+        VL_8B,
+        VL_4H,
+        VL_2S,
+        VL_1D,
+
+        VL_16B,
+        VL_8H,
+        VL_4S,
+        VL_2D,
+
+        // Bare layout for the 128-bit vector
+        // (only show ".b", ".h", ".s", ".d" without vector number)
+        VL_B,
+        VL_H,
+        VL_S,
+        VL_D
+    };
+}
+
+inline static const char *
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) {
+  switch (Layout) {
+  case AArch64Layout::VL_8B:  return ".8b";
+  case AArch64Layout::VL_4H:  return ".4h";
+  case AArch64Layout::VL_2S:  return ".2s";
+  case AArch64Layout::VL_1D:  return ".1d";
+  case AArch64Layout::VL_16B:  return ".16b";
+  case AArch64Layout::VL_8H:  return ".8h";
+  case AArch64Layout::VL_4S:  return ".4s";
+  case AArch64Layout::VL_2D:  return ".2d";
+  case AArch64Layout::VL_B:  return ".b";
+  case AArch64Layout::VL_H:  return ".h";
+  case AArch64Layout::VL_S:  return ".s";
+  case AArch64Layout::VL_D:  return ".d";
+  default: llvm_unreachable("Unknown Vector Layout");
+  }
+}
+
+inline static AArch64Layout::VectorLayout
+AArch64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", AArch64Layout::VL_8B)
+             .Case(".4h", AArch64Layout::VL_4H)
+             .Case(".2s", AArch64Layout::VL_2S)
+             .Case(".1d", AArch64Layout::VL_1D)
+             .Case(".16b", AArch64Layout::VL_16B)
+             .Case(".8h", AArch64Layout::VL_8H)
+             .Case(".4s", AArch64Layout::VL_4S)
+             .Case(".2d", AArch64Layout::VL_2D)
+             .Case(".b", AArch64Layout::VL_B)
+             .Case(".h", AArch64Layout::VL_H)
+             .Case(".s", AArch64Layout::VL_S)
+             .Case(".d", AArch64Layout::VL_D)
+             .Default(AArch64Layout::Invalid);
+}
+
+namespace AArch64SysReg {
+  struct SysReg {
+    const char *Name;
+    unsigned Encoding;
+    bool Readable;
+    bool Writeable;
+    FeatureBitset FeaturesRequired;
+
+    bool haveFeatures(FeatureBitset ActiveFeatures) const {
+      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+    }
+  };
+
+  #define GET_SYSREG_DECL
+  #include "AArch64GenSystemOperands.inc"
+
+  const SysReg *lookupSysRegByName(StringRef);
+  const SysReg *lookupSysRegByEncoding(uint16_t);
+
+  uint32_t parseGenericRegister(StringRef Name);
+  std::string genericRegisterString(uint32_t Bits);
+}
+
+namespace AArch64TLBI {
+  struct TLBI {
+    const char *Name;
+    uint16_t Encoding;
+    bool NeedsReg;
+  };
+  #define GET_TLBI_DECL
+  #include "AArch64GenSystemOperands.inc"
+}
+
+namespace AArch64II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // AArch64 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    MO_FRAGMENT = 0xf,
+
+    /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+    /// offset of the 4K page containing the symbol.  This is used with the
+    /// ADRP instruction.
+    MO_PAGE = 1,
+
+    /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+    /// that symbol within a 4K page.  This offset is added to the page address
+    /// to produce the complete address.
+    MO_PAGEOFF = 2,
+
+    /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+    /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G3 = 3,
+
+    /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+    /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G2 = 4,
+
+    /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+    /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G1 = 5,
+
+    /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+    /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G0 = 6,
+
+    /// MO_HI12 - This flag indicates that a symbol operand represents the bits
+    /// 13-24 of a 64-bit address, used in a arithmetic immediate-shifted-left-
+    /// by-12-bits instruction.
+    MO_HI12 = 7,
+
+    /// MO_GOT - This flag indicates that a symbol operand represents the
+    /// address of the GOT entry for the symbol, rather than the address of
+    /// the symbol itself.
+    MO_GOT = 0x10,
+
+    /// MO_NC - Indicates whether the linker is expected to check the symbol
+    /// reference for overflow. For example in an ADRP/ADD pair of relocations
+    /// the ADRP usually does check, but not the ADD.
+    MO_NC = 0x20,
+
+    /// MO_TLS - Indicates that the operand being accessed is some kind of
+    /// thread-local symbol. On Darwin, only one type of thread-local access
+    /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+    /// referee will affect interpretation.
+    MO_TLS = 0x40
+  };
+} // end namespace AArch64II
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
new file mode 100644
index 000000000000..7b0a7f4b6058
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -0,0 +1,175 @@
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class FunctionPass;
+class GCNTargetMachine;
+class ModulePass;
+class Pass;
+class Target;
+class TargetMachine;
+class PassRegistry;
+
+// R600 Passes
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers();
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
+FunctionPass *createR600Packetizer(TargetMachine &tm);
+FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass();
+
+// SI Passes
+FunctionPass *createSITypeRewriter();
+FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
+FunctionPass *createSIFixControlFlowLiveIntervalsPass();
+FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createSIDebuggerInsertNopsPass();
+FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
+
+void initializeSIFoldOperandsPass(PassRegistry &);
+extern char &SIFoldOperandsID;
+
+void initializeSIShrinkInstructionsPass(PassRegistry&);
+extern char &SIShrinkInstructionsID;
+
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
+void initializeSILoadStoreOptimizerPass(PassRegistry &);
+extern char &SILoadStoreOptimizerID;
+
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowID;
+
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;
+
+void initializeSIOptimizeExecMaskingPass(PassRegistry &);
+extern char &SIOptimizeExecMaskingID;
+
+// Passes common to R600 and SI
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
+Pass *createAMDGPUStructurizeCFGPass();
+FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
+                                  CodeGenOpt::Level OptLevel);
+ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
+
+FunctionPass* createAMDGPUUnifyMetadataPass();
+void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
+extern char &AMDGPUUnifyMetadataID;
+
+void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
+extern char &SIFixControlFlowLiveIntervalsID;
+
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
+
+void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
+extern char &AMDGPUCodeGenPrepareID;
+
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void initializeSIDebuggerInsertNopsPass(PassRegistry&);
+extern char &SIDebuggerInsertNopsID;
+
+void initializeSIInsertWaitsPass(PassRegistry&);
+extern char &SIInsertWaitsID;
+
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
+
+namespace AMDGPU {
+enum TargetIndex {
+  TI_CONSTDATA_START,
+  TI_SCRATCH_RSRC_DWORD0,
+  TI_SCRATCH_RSRC_DWORD1,
+  TI_SCRATCH_RSRC_DWORD2,
+  TI_SCRATCH_RSRC_DWORD3
+};
+}
+
+} // End namespace llvm
+
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces : unsigned {
+  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
+  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
+  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
+  FLAT_ADDRESS     = 4, ///< Address space for flat memory.
+  REGION_ADDRESS   = 5, ///< Address space for region memory.
+  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+
+  // Do not re-order the CONSTANT_BUFFER_* enums.  Several places depend on this
+  // order to be able to dynamically index a constant buffer, for example:
+  //
+  // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+
+  // Some places use this if the address space can't be determined.
+  UNKNOWN_ADDRESS_SPACE = ~0u
+};
+
+} // namespace AMDGPUAS
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
new file mode 100644
index 000000000000..0b2badff7ccf
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -0,0 +1,530 @@
+//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===------------------------------------------------------------===//
+// Subtarget Features (device properties)
+//===------------------------------------------------------------===//
+
+def FeatureFP64 : SubtargetFeature<"fp64",
+  "FP64",
+  "true",
+  "Enable double precision operations"
+>;
+
+def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
+  "FastFMAF32",
+  "true",
+  "Assuming f32 fma is at least as fast as mul + add"
+>;
+
+def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops",
+  "HalfRate64Ops",
+  "true",
+  "Most fp64 instructions are half rate instead of quarter"
+>;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+  "R600ALUInst",
+  "false",
+  "Older version of ALU instructions encoding"
+>;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+  "HasVertexCache",
+  "true",
+  "Specify use of dedicated vertex cache"
+>;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+  "CaymanISA",
+  "true",
+  "Use Cayman ISA"
+>;
+
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+  "CFALUBug",
+  "true",
+  "GPU has CF_ALU bug"
+>;
+
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+  "FlatAddressSpace",
+  "true",
+  "Support flat address space"
+>;
+
+def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
+  "UnalignedBufferAccess",
+  "true",
+  "Support unaligned global loads and stores"
+>;
+
+def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
+  "UnalignedScratchAccess",
+  "true",
+  "Support unaligned scratch loads and stores"
+>;
+
+// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
+// XNACK. The current default kernel driver setting is:
+// - graphics ring: XNACK disabled
+// - compute ring: XNACK enabled
+//
+// If XNACK is enabled, the VMEM latency can be worse.
+// If XNACK is disabled, the 2 SGPRs can be used for general purposes.
+def FeatureXNACK : SubtargetFeature<"xnack",
+  "EnableXNACK",
+  "true",
+  "Enable XNACK support"
+>;
+
+def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
+  "SGPRInitBug",
+  "true",
+  "VI SGPR initilization bug requiring a fixed SGPR allocation size"
+>;
+
+class SubtargetFeatureFetchLimit <string Value> :
+                          SubtargetFeature <"fetch"#Value,
+  "TexVTXClauseSize",
+  Value,
+  "Limit the maximum number of fetches in a clause to "#Value
+>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+  "wavefrontsize"#Value,
+  "WavefrontSize",
+  !cast<string>(Value),
+  "The number of threads per wavefront"
+>;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
+class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
+  "ldsbankcount"#Value,
+  "LDSBankCount",
+  !cast<string>(Value),
+  "The number of LDS banks per compute unit."
+>;
+
+def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
+def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
+
+class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
+  "localmemorysize"#Value,
+  "LocalMemorySize",
+  !cast<string>(Value),
+  "The size of local memory in bytes"
+>;
+
+def FeatureGCN : SubtargetFeature<"gcn",
+  "IsGCN",
+  "true",
+  "GCN or newer GPU"
+>;
+
+def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
+  "GCN1Encoding",
+  "true",
+  "Encoding format for SI and CI"
+>;
+
+def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
+  "GCN3Encoding",
+  "true",
+  "Encoding format for VI"
+>;
+
+def FeatureCIInsts : SubtargetFeature<"ci-insts",
+  "CIInsts",
+  "true",
+  "Additional intstructions for CI+"
+>;
+
+def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
+  "HasSMemRealTime",
+  "true",
+  "Has s_memrealtime instruction"
+>;
+
+def FeatureInv2PiInlineImm : SubtargetFeature<"inv-2pi-inline-imm",
+  "HasInv2PiInlineImm",
+  "true",
+  "Has 1 / (2 * pi) as inline immediate"
+>;
+
+def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
+  "Has16BitInsts",
+  "true",
+  "Has i16/f16 instructions"
+>;
+
+def FeatureMovrel : SubtargetFeature<"movrel",
+  "HasMovrel",
+  "true",
+  "Has v_movrel*_b32 instructions"
+>;
+
+def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
+  "HasVGPRIndexMode",
+  "true",
+  "Has VGPR mode register indexing"
+>;
+
+def FeatureScalarStores : SubtargetFeature<"scalar-stores",
+  "HasScalarStores",
+  "true",
+  "Has store scalar memory instructions"
+>;
+
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
+def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
+  "FP16Denormals",
+  "true",
+  "Enable half precision denormal handling"
+>;
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+  "FP32Denormals",
+  "true",
+  "Enable single precision denormal handling"
+>;
+
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+  "FP64Denormals",
+  "true",
+  "Enable double precision denormal handling",
+  [FeatureFP64]
+>;
+
+def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
+  "FPExceptions",
+  "true",
+  "Enable floating point exceptions"
+>;
+
+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+  "max-private-element-size-"#size,
+  "MaxPrivateElementSize",
+  !cast<string>(size),
+  "Maximum private access size may be "#size
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
+def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
+  "EnableVGPRSpilling",
+  "true",
+  "Enable spilling of VGPRs to scratch memory"
+>;
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+  "DumpCode",
+  "true",
+  "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
+  "DumpCode",
+  "true",
+  "Dump MachineInstrs in the CodeEmitter"
+>;
+
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+  "EnablePromoteAlloca",
+  "true",
+  "Enable promote alloca pass"
+>;
+
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+  "EnableLoadStoreOpt",
+  "true",
+  "Enable SI load/store optimizer pass"
+>;
+
+// Performance debugging feature. Allow using DS instruction immediate
+// offsets even if the base pointer can't be proven to be base. On SI,
+// base pointer values that won't give the same result as a 16-bit add
+// are not safe to fold, but this will override the conservative test
+// for the base pointer.
+def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <
+  "unsafe-ds-offset-folding",
+  "EnableUnsafeDSOffsetFolding",
+  "true",
+  "Force using DS instruction immediate offsets on SI"
+>;
+
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+  "EnableSIScheduler",
+  "true",
+  "Enable SI Machine Scheduler"
+>;
+
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+  "FlatForGlobal",
+  "true",
+  "Force to generate flat instruction for global"
+>;
+
+// Dummy feature used to disable assembler instructions.
+def FeatureDisable : SubtargetFeature<"",
+  "FeatureDisable","true",
+  "Dummy feature to disable assembler instructions"
+>;
+
+class SubtargetFeatureGeneration <string Value,
+                                  list<SubtargetFeature> Implies> :
+        SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+                          Value#" GPU generation", Implies>;
+
+def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
+def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
+def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+  [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]
+>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+  [FeatureFetchLimit16, FeatureLocalMemorySize0]
+>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+  [FeatureFetchLimit16, FeatureLocalMemorySize32768]
+>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+  [FeatureFetchLimit16, FeatureWavefrontSize64,
+   FeatureLocalMemorySize32768]
+>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+  [FeatureFP64, FeatureLocalMemorySize32768,
+  FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
+  FeatureLDSBankCount32, FeatureMovrel]
+>;
+
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+  [FeatureFP64, FeatureLocalMemorySize65536,
+  FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
+  FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel]
+>;
+
+def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
+  [FeatureFP64, FeatureLocalMemorySize65536,
+   FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+   FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+   FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
+   FeatureScalarStores, FeatureInv2PiInlineImm
+  ]
+>;
+
+class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
+                                  list<SubtargetFeature> Implies>
+                                 : SubtargetFeature <
+  "isaver"#Major#"."#Minor#"."#Stepping,
+  "IsaVersion",
+  "ISAVersion"#Major#"_"#Minor#"_"#Stepping,
+  "Instruction set version number",
+  Implies
+>;
+
+def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
+  [FeatureSeaIslands,
+   FeatureLDSBankCount32]>;
+
+def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1,
+  [FeatureSeaIslands,
+   HalfRate64Ops,
+   FeatureLDSBankCount32,
+   FeatureFastFMAF32]>;
+
+def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
+  [FeatureSeaIslands,
+   FeatureLDSBankCount16]>;
+
+def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount32,
+   FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount32,
+   FeatureXNACK]>;
+
+def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount32,
+   FeatureSGPRInitBug]>;
+
+def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount32]>;
+
+def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
+  [FeatureVolcanicIslands,
+   FeatureLDSBankCount16,
+   FeatureXNACK]>;
+
+//===----------------------------------------------------------------------===//
+// Debugger related subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureDebuggerInsertNops : SubtargetFeature<
+  "amdgpu-debugger-insert-nops",
+  "DebuggerInsertNops",
+  "true",
+  "Insert one nop instruction for each high level source statement"
+>;
+
+def FeatureDebuggerReserveRegs : SubtargetFeature<
+  "amdgpu-debugger-reserve-regs",
+  "DebuggerReserveRegs",
+  "true",
+  "Reserve registers for debugger usage"
+>;
+
+def FeatureDebuggerEmitPrologue : SubtargetFeature<
+  "amdgpu-debugger-emit-prologue",
+  "DebuggerEmitPrologue",
+  "true",
+  "Emit debugger prologue"
+>;
+
+//===----------------------------------------------------------------------===//
+
+def AMDGPUInstrInfo : InstrInfo {
+  let guessInstructionProperties = 1;
+  let noNamedPositionallyEncodedOperands = 1;
+}
+
+def AMDGPUAsmParser : AsmParser {
+  // Some of the R600 registers have the same name, so this crashes.
+  // For example T0_XYZW and T0_XY both have the asm name T0.
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+def AMDGPUAsmWriter : AsmWriter {
+  int PassSubtarget = 1;
+}
+
+def AMDGPUAsmVariants {
+  string Default = "Default";
+  int Default_ID = 0;
+  string VOP3 = "VOP3";
+  int VOP3_ID = 1;
+  string SDWA = "SDWA";
+  int SDWA_ID = 2;
+  string DPP = "DPP";
+  int DPP_ID = 3;
+  string Disable = "Disable";
+  int Disable_ID = 4;
+}
+
+def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.Default_ID;
+  let Name = AMDGPUAsmVariants.Default;
+}
+
+def VOP3AsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.VOP3_ID;
+  let Name = AMDGPUAsmVariants.VOP3;
+}
+
+def SDWAAsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.SDWA_ID;
+  let Name = AMDGPUAsmVariants.SDWA;
+}
+
+def DPPAsmParserVariant : AsmParserVariant {
+  let Variant = AMDGPUAsmVariants.DPP_ID;
+  let Name = AMDGPUAsmVariants.DPP;
+}
+
+def AMDGPU : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDGPUInstrInfo;
+  let AssemblyParsers = [AMDGPUAsmParser];
+  let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
+                                VOP3AsmParserVariant,
+                                SDWAAsmParserVariant,
+                                DPPAsmParserVariant];
+  let AssemblyWriters = [AMDGPUAsmWriter];
+}
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+def TruePredicate : Predicate<"true">;
+
+def isSICI : Predicate<
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
+>, AssemblerPredicate<"FeatureGCN1Encoding">;
+
+def isVI : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+  AssemblerPredicate<"FeatureGCN3Encoding">;
+
+def isCIVI : Predicate <
+  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
+  "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
+>, AssemblerPredicate<"FeatureCIInsts">;
+
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+
+class PredicateControl {
+  Predicate SubtargetPredicate;
+  Predicate SIAssemblerPredicate = isSICI;
+  Predicate VIAssemblerPredicate = isVI;
+  list<Predicate> AssemblerPredicates = [];
+  Predicate AssemblerPredicate = TruePredicate;
+  list<Predicate> OtherPredicates = [];
+  list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate],
+                                            AssemblerPredicates,
+                                            OtherPredicates);
+}
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
new file mode 100644
index 000000000000..067a16a2af7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks all internal functions as always_inline and creates
+/// duplicates of all other functions a marks the duplicates as always_inline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAlwaysInline : public ModulePass {
+  static char ID;
+
+public:
+  AMDGPUAlwaysInline() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
+};
+
+} // End anonymous namespace
+
+char AMDGPUAlwaysInline::ID = 0;
+
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+  std::vector<GlobalAlias*> AliasesToRemove;
+  std::vector<Function *> FuncsToClone;
+
+  for (GlobalAlias &A : M.aliases()) {
+    if (Function* F = dyn_cast<Function>(A.getAliasee())) {
+      A.replaceAllUsesWith(F);
+      AliasesToRemove.push_back(&A);
+    }
+  }
+
+  for (GlobalAlias* A : AliasesToRemove) {
+    A->eraseFromParent();
+  }
+
+  for (Function &F : M) {
+    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
+        !F.hasFnAttribute(Attribute::NoInline))
+      FuncsToClone.push_back(&F);
+  }
+
+  for (Function *F : FuncsToClone) {
+    ValueToValueMapTy VMap;
+    Function *NewFunc = CloneFunction(F, VMap);
+    NewFunc->setLinkage(GlobalValue::InternalLinkage);
+    F->replaceAllUsesWith(NewFunc);
+  }
+
+  for (Function &F : M) {
+    if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
+      F.addFnAttr(Attribute::AlwaysInline);
+    }
+  }
+  return false;
+}
+
+ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
+  return new AMDGPUAlwaysInline();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 000000000000..c98d25e20185
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,222 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+  static bool hasAddrSpaceCast(const Function &F);
+
+  void addAttrToCallers(Function *Intrin, StringRef AttrName);
+  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+  static char ID;
+
+  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  StringRef getPassName() const override {
+    return "AMDGPU Annotate Kernel Features";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  static bool visitConstantExpr(const ConstantExpr *CE);
+  static bool visitConstantExprsRecursively(
+    const Constant *EntryC,
+    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                "Add AMDGPU function attributes", false, false)
+
+
+// The queue ptr is only needed when casting to flat, not from it.
+static bool castRequiresQueuePtr(unsigned SrcAS) {
+  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
+  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+    return castRequiresQueuePtr(SrcAS);
+  }
+
+  return false;
+}
+
+bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
+  const Constant *EntryC,
+  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+
+  if (!ConstantExprVisited.insert(EntryC).second)
+    return false;
+
+  SmallVector<const Constant *, 16> Stack;
+  Stack.push_back(EntryC);
+
+  while (!Stack.empty()) {
+    const Constant *C = Stack.pop_back_val();
+
+    // Check this constant expression.
+    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
+      if (visitConstantExpr(CE))
+        return true;
+    }
+
+    // Visit all sub-expressions.
+    for (const Use &U : C->operands()) {
+      const auto *OpC = dyn_cast<Constant>(U);
+      if (!OpC)
+        continue;
+
+      if (!ConstantExprVisited.insert(OpC).second)
+        continue;
+
+      Stack.push_back(OpC);
+    }
+  }
+
+  return false;
+}
+
+// Return true if an addrspacecast is used that requires the queue ptr.
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+  SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+
+  for (const BasicBlock &BB : F) {
+    for (const Instruction &I : BB) {
+      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+        if (castRequiresQueuePtr(ASC))
+          return true;
+      }
+
+      for (const Use &U : I.operands()) {
+        const auto *OpC = dyn_cast<Constant>(U);
+        if (!OpC)
+          continue;
+
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+                                                    StringRef AttrName) {
+  SmallPtrSet<Function *, 4> SeenFuncs;
+
+  for (User *U : Intrin->users()) {
+    // CallInst is the only valid user for an intrinsic.
+    CallInst *CI = cast<CallInst>(U);
+
+    Function *CallingFunction = CI->getParent()->getParent();
+    if (SeenFuncs.insert(CallingFunction).second)
+      CallingFunction->addFnAttr(AttrName);
+  }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+  Module &M,
+  ArrayRef<StringRef[2]> IntrinsicToAttr) {
+  bool Changed = false;
+
+  for (const StringRef *Arr  : IntrinsicToAttr) {
+    if (Function *Fn = M.getFunction(Arr[0])) {
+      addAttrToCallers(Fn, Arr[1]);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+  Triple TT(M.getTargetTriple());
+
+  static const StringRef IntrinsicToAttr[][2] = {
+    // .x omitted
+    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
+    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
+
+    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
+    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
+
+    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+    // .x omitted
+    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+  };
+
+  static const StringRef HSAIntrinsicToAttr[][2] = {
+    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
+    { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
+  };
+
+  // TODO: We should not add the attributes if the known compile time workgroup
+  // size is 1 for y/z.
+
+  // TODO: Intrinsics that require queue ptr.
+
+  // We do not need to note the x workitem or workgroup id because they are
+  // always initialized.
+
+  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+  if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
+    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+    for (Function &F : M) {
+      if (F.hasFnAttribute("amdgpu-queue-ptr"))
+        continue;
+
+      if (hasAddrSpaceCast(F))
+        F.addFnAttr("amdgpu-queue-ptr");
+    }
+  }
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+  return new AMDGPUAnnotateKernelFeatures();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 000000000000..c011be6fa169
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,189 @@
+//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+                       public InstVisitor<AMDGPUAnnotateUniformValues> {
+  DivergenceAnalysis *DA;
+  MemoryDependenceResults *MDR;
+  LoopInfo *LI;
+  DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+  bool isKernelFunc;
+
+public:
+  static char ID;
+  AMDGPUAnnotateUniformValues() :
+    FunctionPass(ID) { }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override {
+    return "AMDGPU Annotate Uniform Values";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesAll();
+ }
+
+  void visitBranchInst(BranchInst &I);
+  void visitLoadInst(LoadInst &I);
+  bool isClobberedInFunction(LoadInst * Load);
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                      "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                    "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+static void setUniformMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
+}
+static void setNoClobberMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+  for (auto I : predecessors(Root))
+    if (Set.insert(I))
+      DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+  // 1. get Loop for the Load->getparent();
+  // 2. if it exists, collect all the BBs from the most outer
+  // loop and check for the writes. If NOT - start DFS over all preds.
+  // 3. Start DFS over all preds from the most outer loop header.
+  SetVector<BasicBlock *> Checklist;
+  BasicBlock *Start = Load->getParent();
+  Checklist.insert(Start);
+  const Value *Ptr = Load->getPointerOperand();
+  const Loop *L = LI->getLoopFor(Start);
+  if (L) {
+    const Loop *P = L;
+    do {
+      L = P;
+      P = P->getParentLoop();
+    } while (P);
+    Checklist.insert(L->block_begin(), L->block_end());
+    Start = L->getHeader();
+  }
+
+  DFS(Start, Checklist);
+  for (auto &BB : Checklist) {
+    BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+     BasicBlock::iterator(Load) : BB->end();
+     if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
+       true, StartIt, BB, Load).isClobber())
+       return true;
+  }
+  return false;
+}
+
+void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
+  if (I.isUnconditional())
+    return;
+
+  Value *Cond = I.getCondition();
+  if (!DA->isUniform(Cond))
+    return;
+
+  setUniformMetadata(I.getParent()->getTerminator());
+}
+
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  if (!DA->isUniform(Ptr))
+    return;
+  auto isGlobalLoad = [](LoadInst &Load)->bool {
+    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  };
+  // We're tracking up to the Function boundaries
+  // We cannot go beyond because of FunctionPass restrictions
+  // Thus we can ensure that memory not clobbered for memory
+  // operations that live in kernel only.
+  bool NotClobbered = isKernelFunc &&   !isClobberedInFunction(&I);
+  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+  if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+    if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Lookup for the existing GEP
+      if (noClobberClones.count(Ptr)) {
+        PtrI = noClobberClones[Ptr];
+      } else {
+        // Create GEP of the Value
+        Function *F = I.getParent()->getParent();
+        Value *Idx = Constant::getIntegerValue(
+          Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+        // Insert GEP at the entry to make it dominate all uses
+        PtrI = GetElementPtrInst::Create(
+          Ptr->getType()->getPointerElementType(), Ptr,
+          ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+      }
+      I.replaceUsesOfWith(Ptr, PtrI);
+    }
+  }
+
+  if (PtrI) {
+    setUniformMetadata(PtrI);
+    if (NotClobbered)
+      setNoClobberMetadata(PtrI);
+  }
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+  return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DA  = &getAnalysis<DivergenceAnalysis>();
+  MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+  LI  = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+
+  visit(F);
+  noClobberClones.clear();
+  return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+  return new AMDGPUAnnotateUniformValues();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
new file mode 100644
index 000000000000..a8e6902c252b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -0,0 +1,826 @@
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer  --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
+/// code.  When passed an MCAsmStreamer it prints assembly and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUAsmPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be override to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+//
+// FIXME: It seems some instructions do not support single precision denormals
+// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
+// and sin_f32, cos_f32 on most parts).
+
+// We want to use these instructions, and using fp32 denormals also causes
+// instructions to run at the double precision rate for the device so it's
+// probably best to just report no single precision denormals.
+static uint32_t getFPMode(const MachineFunction &F) {
+  const SISubtarget& ST = F.getSubtarget<SISubtarget>();
+  // TODO: Is there any real use for the flush in only / flush out only modes?
+
+  uint32_t FP32Denormals =
+    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  uint32_t FP64Denormals =
+    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+         FP_DENORM_MODE_SP(FP32Denormals) |
+         FP_DENORM_MODE_DP(FP64Denormals);
+}
+
+static AsmPrinter *
+createAMDGPUAsmPrinterPass(TargetMachine &tm,
+                           std::unique_ptr<MCStreamer> &&Streamer) {
+  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
+}
+
+extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
+  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
+                                     createAMDGPUAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
+                                     createAMDGPUAsmPrinterPass);
+}
+
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
+                                   std::unique_ptr<MCStreamer> Streamer)
+  : AsmPrinter(TM, std::move(Streamer)) {}
+
+StringRef AMDGPUAsmPrinter::getPassName() const {
+  return "AMDGPU Assembly Printer";
+}
+
+void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+    return;
+
+  // Need to construct an MCSubtargetInfo here in case we have no functions
+  // in the module.
+  std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+        TM.getTargetTriple().str(), TM.getTargetCPU(),
+        TM.getTargetFeatureString()));
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+  TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+
+  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
+  TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
+                                    "AMD", "AMDGPU");
+
+  // Emit runtime metadata.
+  TS->EmitRuntimeMetadata(M);
+}
+
+bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
+  const MachineBasicBlock *MBB) const {
+  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+    return false;
+
+  if (MBB->empty())
+    return true;
+
+  // If this is a block implementing a long branch, an expression relative to
+  // the start of the block is needed.  to the start of the block.
+  // XXX - Is there a smarter way to check this?
+  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
+}
+
+
+void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
+  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+  SIProgramInfo KernelInfo;
+  if (STM.isAmdCodeObjectV2()) {
+    getSIProgramInfo(KernelInfo, *MF);
+    EmitAmdKernelCodeT(*MF, KernelInfo);
+  }
+}
+
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+  if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
+    AMDGPUTargetStreamer *TS =
+        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+    SmallString<128> SymbolName;
+    getNameWithPrefix(SymbolName, MF->getFunction()),
+    TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
+  }
+
+  AsmPrinter::EmitFunctionEntryLabel();
+}
+
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+  // Group segment variables aren't emitted in HSA.
+  if (AMDGPU::isGroupSegment(GV))
+    return;
+
+  AsmPrinter::EmitGlobalVariable(GV);
+}
+
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+  // The starting address of all shader programs must be 256 bytes aligned.
+  MF.setAlignment(8);
+
+  SetupMachineFunction(MF);
+
+  MCContext &Context = getObjFileLowering().getContext();
+  MCSectionELF *ConfigSection =
+      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
+  OutStreamer->SwitchSection(ConfigSection);
+
+  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
+  SIProgramInfo KernelInfo;
+  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    getSIProgramInfo(KernelInfo, MF);
+    if (!STM.isAmdHsaOS()) {
+      EmitProgramInfoSI(MF, KernelInfo);
+    }
+  } else {
+    EmitProgramInfoR600(MF);
+  }
+
+  DisasmLines.clear();
+  HexLines.clear();
+  DisasmLineMaxLen = 0;
+
+  EmitFunctionBody();
+
+  if (isVerbose()) {
+    MCSectionELF *CommentSection =
+        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
+    OutStreamer->SwitchSection(CommentSection);
+
+    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      OutStreamer->emitRawComment(" Kernel info:", false);
+      OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+                                  false);
+      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
+                                  false);
+      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
+                                  false);
+      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+                                  false);
+      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+                                  false);
+      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+                                  false);
+      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
+                                  " bytes/workgroup (compile time only)", false);
+
+      OutStreamer->emitRawComment(" SGPRBlocks: " +
+                                  Twine(KernelInfo.SGPRBlocks), false);
+      OutStreamer->emitRawComment(" VGPRBlocks: " +
+                                  Twine(KernelInfo.VGPRBlocks), false);
+
+      OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
+                                  Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
+      OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
+                                  Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
+
+      OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
+                                  false);
+      OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
+                                  false);
+
+      if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
+        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+                                    Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
+                                    Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+      }
+
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+
+    } else {
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      OutStreamer->emitRawComment(
+        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
+    }
+  }
+
+  if (STM.dumpCode()) {
+
+    OutStreamer->SwitchSection(
+        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
+
+    for (size_t i = 0; i < DisasmLines.size(); ++i) {
+      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+      Comment += " ; " + HexLines[i] + "\n";
+
+      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
+      OutStreamer->EmitBytes(StringRef(Comment));
+    }
+  }
+
+  return false;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
+  unsigned MaxGPR = 0;
+  bool killPixel = false;
+  const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
+  const R600RegisterInfo *RI = STM.getRegisterInfo();
+  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::KILLGT)
+        killPixel = true;
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        const MachineOperand &MO = MI.getOperand(op_idx);
+        if (!MO.isReg())
+          continue;
+        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
+
+        // Register with value > 127 aren't GPR
+        if (HWReg > 127)
+          continue;
+        MaxGPR = std::max(MaxGPR, HWReg);
+      }
+    }
+  }
+
+  unsigned RsrcReg;
+  if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
+    // Evergreen / Northern Islands
+    switch (MF.getFunction()->getCallingConv()) {
+    default: LLVM_FALLTHROUGH;
+    case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+    }
+  } else {
+    // R600 / R700
+    switch (MF.getFunction()->getCallingConv()) {
+    default: LLVM_FALLTHROUGH;
+    case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
+    case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
+    case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+    case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+    }
+  }
+
+  OutStreamer->EmitIntValue(RsrcReg, 4);
+  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+                           S_STACK_SIZE(MFI->CFStackSize), 4);
+  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+    OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+  }
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+                                        const MachineFunction &MF) const {
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  uint64_t CodeSize = 0;
+  unsigned MaxSGPR = 0;
+  unsigned MaxVGPR = 0;
+  bool VCCUsed = false;
+  bool FlatUsed = false;
+  const SIRegisterInfo *RI = STM.getRegisterInfo();
+  const SIInstrInfo *TII = STM.getInstrInfo();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      // TODO: CodeSize should account for multiple functions.
+
+      // TODO: Should we count size of debug info?
+      if (MI.isDebugValue())
+        continue;
+
+      if (isVerbose())
+        CodeSize += TII->getInstSizeInBytes(MI);
+
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        const MachineOperand &MO = MI.getOperand(op_idx);
+        unsigned width = 0;
+        bool isSGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        unsigned reg = MO.getReg();
+        switch (reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+          VCCUsed = true;
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+          // instructions aren't used to access the scratch buffer.
+          if (MFI->hasFlatScratchInit())
+            FlatUsed = true;
+          continue;
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SReg_32RegClass.contains(reg)) {
+          assert(!AMDGPU::TTMP_32RegClass.contains(reg) &&
+                 "trap handler registers should not be used");
+          isSGPR = true;
+          width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+          assert(!AMDGPU::TTMP_64RegClass.contains(reg) &&
+                 "trap handler registers should not be used");
+          isSGPR = true;
+          width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 3;
+        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 16;
+        } else {
+          llvm_unreachable("Unknown register class");
+        }
+        unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
+        unsigned maxUsed = hwReg + width - 1;
+        if (isSGPR) {
+          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
+        } else {
+          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
+        }
+      }
+    }
+  }
+
+  unsigned ExtraSGPRs = 0;
+
+  if (VCCUsed)
+    ExtraSGPRs = 2;
+
+  if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+    if (FlatUsed)
+      ExtraSGPRs = 4;
+  } else {
+    if (STM.isXNACKEnabled())
+      ExtraSGPRs = 4;
+
+    if (FlatUsed)
+      ExtraSGPRs = 6;
+  }
+
+  // Record first reserved register and reserved register count fields, and
+  // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
+  // requested.
+  ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
+  ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
+
+  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+  // attribute was requested.
+  if (STM.debuggerEmitPrologue()) {
+    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+      RI->getHWRegIndex(MFI->getScratchRSrcReg());
+  }
+
+  // Check the addressable register limit before we add ExtraSGPRs.
+  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+      !STM.hasSGPRInitBug()) {
+    unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
+    if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
+      // This can happen due to a compiler bug or when using inline asm.
+      LLVMContext &Ctx = MF.getFunction()->getContext();
+      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+                                       "addressable scalar registers",
+                                       MaxSGPR + 1, DS_Error,
+                                       DK_ResourceLimit, MaxAddressableNumSGPRs);
+      Ctx.diagnose(Diag);
+      MaxSGPR = MaxAddressableNumSGPRs - 1;
+    }
+  }
+
+  // Account for extra SGPRs and VGPRs reserved for debugger use.
+  MaxSGPR += ExtraSGPRs;
+  MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
+
+  // We found the maximum register index. They start at 0, so add one to get the
+  // number of registers.
+  ProgInfo.NumVGPR = MaxVGPR + 1;
+  ProgInfo.NumSGPR = MaxSGPR + 1;
+
+  // Adjust number of registers used to meet default/requested minimum/maximum
+  // number of waves per execution unit request.
+  ProgInfo.NumSGPRsForWavesPerEU = std::max(
+    ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+  ProgInfo.NumVGPRsForWavesPerEU = std::max(
+    ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+
+  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+      STM.hasSGPRInitBug()) {
+    unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
+    if (ProgInfo.NumSGPR > MaxNumSGPRs) {
+      // This can happen due to a compiler bug or when using inline asm to use the
+      // registers which are usually reserved for vcc etc.
+
+      LLVMContext &Ctx = MF.getFunction()->getContext();
+      DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
+                                       "scalar registers",
+                                       ProgInfo.NumSGPR, DS_Error,
+                                       DK_ResourceLimit, MaxNumSGPRs);
+      Ctx.diagnose(Diag);
+      ProgInfo.NumSGPR = MaxNumSGPRs;
+      ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
+    }
+  }
+
+  if (STM.hasSGPRInitBug()) {
+    ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+    ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+  }
+
+  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+    LLVMContext &Ctx = MF.getFunction()->getContext();
+    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
+                                     MFI->NumUserSGPRs, DS_Error);
+    Ctx.diagnose(Diag);
+  }
+
+  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
+    LLVMContext &Ctx = MF.getFunction()->getContext();
+    DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
+                                     MFI->getLDSSize(), DS_Error);
+    Ctx.diagnose(Diag);
+  }
+
+  // SGPRBlocks is actual number of SGPR blocks minus 1.
+  ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
+                                RI->getSGPRAllocGranule());
+  ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+
+  // VGPRBlocks is actual number of VGPR blocks minus 1.
+  ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
+                                RI->getVGPRAllocGranule());
+  ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+
+  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+  // register.
+  ProgInfo.FloatMode = getFPMode(MF);
+
+  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+
+  // Make clamp modifier on NaN input returns 0.
+  ProgInfo.DX10Clamp = 1;
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  ProgInfo.ScratchSize = FrameInfo.getStackSize();
+
+  ProgInfo.FlatUsed = FlatUsed;
+  ProgInfo.VCCUsed = VCCUsed;
+  ProgInfo.CodeLen = CodeSize;
+
+  unsigned LDSAlignShift;
+  if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
+    // LDS is allocated in 64 dword blocks.
+    LDSAlignShift = 8;
+  } else {
+    // LDS is allocated in 128 dword blocks.
+    LDSAlignShift = 9;
+  }
+
+  unsigned LDSSpillSize =
+    MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
+
+  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+  ProgInfo.LDSBlocks =
+      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
+
+  // Scratch is allocated in 256 dword blocks.
+  unsigned ScratchAlignShift = 10;
+  // We need to program the hardware with the amount of scratch memory that
+  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
+  // scratch memory used per thread.
+  ProgInfo.ScratchBlocks =
+      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
+              1ULL << ScratchAlignShift) >>
+      ScratchAlignShift;
+
+  ProgInfo.ComputePGMRSrc1 =
+      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
+      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
+      S_00B848_PRIORITY(ProgInfo.Priority) |
+      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+      S_00B848_PRIV(ProgInfo.Priv) |
+      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
+      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+
+  // 0 = X, 1 = XY, 2 = XYZ
+  unsigned TIDIGCompCnt = 0;
+  if (MFI->hasWorkItemIDZ())
+    TIDIGCompCnt = 2;
+  else if (MFI->hasWorkItemIDY())
+    TIDIGCompCnt = 1;
+
+  ProgInfo.ComputePGMRSrc2 =
+      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
+      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+      S_00B84C_EXCP_EN_MSB(0) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+      S_00B84C_EXCP_EN(0);
+}
+
+static unsigned getRsrcReg(CallingConv::ID CallConv) {
+  switch (CallConv) {
+  default: LLVM_FALLTHROUGH;
+  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
+  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+  }
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) {
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
+
+  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+
+    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+
+    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+
+    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+    // 0" comment but I don't see a corresponding field in the register spec.
+  } else {
+    OutStreamer->EmitIntValue(RsrcReg, 4);
+    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
+                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
+      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    }
+  }
+
+  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
+    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+  }
+
+  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
+  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
+  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
+  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
+}
+
+// This is supposed to be log2(Size)
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMD_ELEMENT_4_BYTES;
+  case 8:
+    return AMD_ELEMENT_8_BYTES;
+  case 16:
+    return AMD_ELEMENT_16_BYTES;
+  default:
+    llvm_unreachable("invalid private_element_size");
+  }
+}
+
+void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
+                                         const SIProgramInfo &KernelInfo) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  amd_kernel_code_t header;
+
+  AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
+
+  header.compute_pgm_resource_registers =
+      KernelInfo.ComputePGMRSrc1 |
+      (KernelInfo.ComputePGMRSrc2 << 32);
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+
+  AMD_HSA_BITS_SET(header.code_properties,
+                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (STM.debuggerSupported())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+
+  if (STM.isXNACKEnabled())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+
+  // FIXME: Should use getKernArgSize
+  header.kernarg_segment_byte_size =
+      STM.getKernArgSegmentSize(MFI->getABIArgOffset());
+  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
+  header.workitem_vgpr_count = KernelInfo.NumVGPR;
+  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+  header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+  header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+
+  // These alignment values are specified in powers of two, so alignment =
+  // 2^n.  The minimum alignment is 2^4 = 16.
+  header.kernarg_segment_alignment = std::max((size_t)4,
+      countTrailingZeros(MFI->getMaxKernArgAlign()));
+
+  if (STM.debuggerEmitPrologue()) {
+    header.debug_wavefront_private_segment_offset_sgpr =
+      KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+    header.debug_private_segment_buffer_sgpr =
+      KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+  }
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+  TS->EmitAMDKernelCodeT(header);
+}
+
+bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
+                   *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
new file mode 100644
index 000000000000..9a4bafef3a25
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -0,0 +1,160 @@
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+
+#include "AMDGPUMCInstLower.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include <vector>
+
+namespace llvm {
+class MCOperand;
+
+class AMDGPUAsmPrinter final : public AsmPrinter {
+private:
+  struct SIProgramInfo {
+    SIProgramInfo() :
+      VGPRBlocks(0),
+      SGPRBlocks(0),
+      Priority(0),
+      FloatMode(0),
+      Priv(0),
+      DX10Clamp(0),
+      DebugMode(0),
+      IEEEMode(0),
+      ScratchSize(0),
+      ComputePGMRSrc1(0),
+      LDSBlocks(0),
+      ScratchBlocks(0),
+      ComputePGMRSrc2(0),
+      NumVGPR(0),
+      NumSGPR(0),
+      FlatUsed(false),
+      NumSGPRsForWavesPerEU(0),
+      NumVGPRsForWavesPerEU(0),
+      ReservedVGPRFirst(0),
+      ReservedVGPRCount(0),
+      DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
+      DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
+      VCCUsed(false),
+      CodeLen(0) {}
+
+    // Fields set in PGM_RSRC1 pm4 packet.
+    uint32_t VGPRBlocks;
+    uint32_t SGPRBlocks;
+    uint32_t Priority;
+    uint32_t FloatMode;
+    uint32_t Priv;
+    uint32_t DX10Clamp;
+    uint32_t DebugMode;
+    uint32_t IEEEMode;
+    uint32_t ScratchSize;
+
+    uint64_t ComputePGMRSrc1;
+
+    // Fields set in PGM_RSRC2 pm4 packet.
+    uint32_t LDSBlocks;
+    uint32_t ScratchBlocks;
+
+    uint64_t ComputePGMRSrc2;
+
+    uint32_t NumVGPR;
+    uint32_t NumSGPR;
+    uint32_t LDSSize;
+    bool FlatUsed;
+
+    // Number of SGPRs that meets number of waves per execution unit request.
+    uint32_t NumSGPRsForWavesPerEU;
+
+    // Number of VGPRs that meets number of waves per execution unit request.
+    uint32_t NumVGPRsForWavesPerEU;
+
+    // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
+    // fixed VGPR number reserved.
+    uint16_t ReservedVGPRFirst;
+
+    // The number of consecutive VGPRs reserved.
+    uint16_t ReservedVGPRCount;
+
+    // Fixed SGPR number used to hold wave scratch offset for entire kernel
+    // execution, or uint16_t(-1) if the register is not used or not known.
+    uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+
+    // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
+    // kernel execution, or uint16_t(-1) if the register is not used or not
+    // known.
+    uint16_t DebuggerPrivateSegmentBufferSGPR;
+
+    // Bonus information for debugging.
+    bool VCCUsed;
+    uint64_t CodeLen;
+  };
+
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  void findNumUsedRegistersSI(const MachineFunction &MF,
+                              unsigned &NumSGPR,
+                              unsigned &NumVGPR) const;
+
+  /// \brief Emit register usage information so that the GPU driver
+  /// can correctly setup the GPU state.
+  void EmitProgramInfoR600(const MachineFunction &MF);
+  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void EmitAmdKernelCodeT(const MachineFunction &MF,
+                          const SIProgramInfo &KernelInfo) const;
+
+public:
+  explicit AMDGPUAsmPrinter(TargetMachine &TM,
+                            std::unique_ptr<MCStreamer> Streamer);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override;
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
+  /// pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+  /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
+  /// instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  /// Implemented in AMDGPUMCInstLower.cpp
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void EmitFunctionBodyStart() override;
+
+  void EmitFunctionEntryLabel() override;
+
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+  void EmitStartOfAsmFile(Module &M) override;
+
+  bool isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const override;
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+
+protected:
+  std::vector<std::string> DisasmLines, HexLines;
+  size_t DisasmLineMaxLen;
+};
+
+} // End anonymous llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
new file mode 100644
index 000000000000..d53cc153dc9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
+  : CallLowering(&TLI) {
+}
+
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                        const Value *Val, unsigned VReg) const {
+  return true;
+}
+
+bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                              const Function &F,
+                                              ArrayRef<unsigned> VRegs) const {
+  // TODO: Implement once there are generic loads/stores.
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
new file mode 100644
index 000000000000..9ae87c9397ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -0,0 +1,34 @@
+//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering;
+
+class AMDGPUCallLowering: public CallLowering {
+ public:
+  AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+                   unsigned VReg) const override;
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
new file mode 100644
index 000000000000..47dfa4992068
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -0,0 +1,135 @@
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+  CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+  ]>>>,
+
+  CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+      SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+      SGPR32, SGPR34, SGPR36, SGPR38 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+      SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+      SGPR33, SGPR35, SGPR37, SGPR39 ]
+  >>>,
+
+  // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
+  CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+  ]>>>,
+
+  CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+      SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+      SGPR32, SGPR34, SGPR36, SGPR38 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+      SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+      SGPR33, SGPR35, SGPR37, SGPR39 ]
+  >>>
+
+]>;
+
+def RetCC_SI : CallingConv<[
+  CCIfType<[i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+  ]>>,
+
+  // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
+  CCIfType<[f32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+  ]>>
+]>;
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+  CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+    T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+    T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+    T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+    T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+    T30_XYZW, T31_XYZW, T32_XYZW
+  ]>>>
+]>;
+
+// Calling convention for compute kernels
+def CC_AMDGPU_Kernel : CallingConv<[
+  CCCustom<"allocateKernArg">
+]>;
+
+def CC_AMDGPU : CallingConv<[
+  CCIf<"static_cast<const AMDGPUSubtarget&>"
+        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
+          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+        "!AMDGPU::isShader(State.getCallingConv())",
+       CCDelegateTo<CC_AMDGPU_Kernel>>,
+  CCIf<"static_cast<const AMDGPUSubtarget&>"
+        "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+         "!AMDGPU::isShader(State.getCallingConv())",
+        CCDelegateTo<CC_AMDGPU_Kernel>>,
+   CCIf<"static_cast<const AMDGPUSubtarget&>"
+         "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+           "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+        CCDelegateTo<CC_SI>>,
+   CCIf<"static_cast<const AMDGPUSubtarget&>"
+          "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+            "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+        CCDelegateTo<CC_R600>>
+]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
new file mode 100644
index 000000000000..e6230547a9b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -0,0 +1,480 @@
+//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-codegenprepare"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUCodeGenPrepare : public FunctionPass,
+                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+  const GCNTargetMachine *TM;
+  const SISubtarget *ST;
+  DivergenceAnalysis *DA;
+  Module *Mod;
+  bool HasUnsafeFPMath;
+
+  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
+  /// binary operation \p V.
+  ///
+  /// \returns Binary operation \p V.
+  Value *copyFlags(const BinaryOperator &I, Value *V) const;
+
+  /// \returns \p T's base element bit width.
+  unsigned getBaseElementBitWidth(const Type *T) const;
+
+  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
+  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
+  /// is returned.
+  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
+
+  /// \returns True if binary operation \p I is a signed binary operation, false
+  /// otherwise.
+  bool isSigned(const BinaryOperator &I) const;
+
+  /// \returns True if the condition of 'select' operation \p I comes from a
+  /// signed 'icmp' operation, false otherwise.
+  bool isSigned(const SelectInst &I) const;
+
+  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
+  /// false otherwise.
+  bool needsPromotionToI32(const Type *T) const;
+
+  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
+  /// operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
+  /// truncating the result of 32 bit binary operation back to \p I's original
+  /// type. Division operation is not promoted.
+  ///
+  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
+  /// false otherwise.
+  bool promoteUniformOpToI32(BinaryOperator &I) const;
+
+  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
+  ///
+  /// \returns True.
+  bool promoteUniformOpToI32(ICmpInst &I) const;
+
+  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
+  /// operation.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by sign or zero extending operands to
+  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
+  /// result of 32 bit 'select' operation back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteUniformOpToI32(SelectInst &I) const;
+
+  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
+  /// intrinsic.
+  ///
+  /// \details \p I's base element bit width must be greater than 1 and less
+  /// than or equal 16. Promotion is done by zero extending the operand to 32
+  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
+  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
+  /// shift amount is 32 minus \p I's base element bit width), and truncating
+  /// the result of the shift operation back to \p I's original type.
+  ///
+  /// \returns True.
+  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+public:
+  static char ID;
+  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
+    FunctionPass(ID),
+    TM(static_cast<const GCNTargetMachine *>(TM)),
+    ST(nullptr),
+    DA(nullptr),
+    Mod(nullptr),
+    HasUnsafeFPMath(false) { }
+
+  bool visitFDiv(BinaryOperator &I);
+
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitICmpInst(ICmpInst &I);
+  bool visitSelectInst(SelectInst &I);
+
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.setPreservesAll();
+ }
+};
+
+} // End anonymous namespace
+
+Value *AMDGPUCodeGenPrepare::copyFlags(
+    const BinaryOperator &I, Value *V) const {
+  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+  if (!BinOp) // Possibly constant expression.
+    return V;
+
+  if (isa<OverflowingBinaryOperator>(BinOp)) {
+    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
+    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+  } else if (isa<PossiblyExactOperator>(BinOp))
+    BinOp->setIsExact(I.isExact());
+
+  return V;
+}
+
+unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  if (T->isIntegerTy())
+    return T->getIntegerBitWidth();
+  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
+}
+
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
+
+  if (T->isIntegerTy())
+    return B.getInt32Ty();
+  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+  return I.getOpcode() == Instruction::AShr ||
+      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+  return isa<ICmpInst>(I.getOperand(0)) ?
+      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
+}
+
+bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
+      T->getIntegerBitWidth() <= 16)
+    return true;
+  if (!T->isVectorTy())
+    return false;
+  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  if (I.getOpcode() == Instruction::SDiv ||
+      I.getOpcode() == Instruction::UDiv)
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *NewICmp  = nullptr;
+
+  if (I.isSigned()) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+  I.replaceAllUsesWith(NewICmp);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp1 = nullptr;
+  Value *ExtOp2 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
+  } else {
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
+  }
+  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+    IntrinsicInst &I) const {
+  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
+         "I must be bitreverse intrinsic");
+  assert(needsPromotionToI32(I.getType()) &&
+         "I does not need promotion to i32");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Function *I32 =
+      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
+  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
+  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
+  Value *LShrOp =
+      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
+  Value *TruncRes =
+      Builder.CreateTrunc(LShrOp, I.getType());
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+  if (!CNum)
+    return false;
+
+  // Reciprocal f32 is handled separately without denormals.
+  return UnsafeDiv || CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+  Type *Ty = FDiv.getType();
+
+  if (!Ty->getScalarType()->isFloatTy())
+    return false;
+
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath)
+    return false;
+
+  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+  float ULP = FPOp->getFPAccuracy();
+  if (ULP < 2.5f)
+    return false;
+
+  FastMathFlags FMF = FPOp->getFastMathFlags();
+  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+                                      FMF.allowReciprocal();
+  if (ST->hasFP32Denormals() && !UnsafeDiv)
+    return false;
+
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+  Function *Decl
+    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  Value *NewFDiv = nullptr;
+
+  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    NewFDiv = UndefValue::get(VT);
+
+    // FIXME: Doesn't do the right thing for cases where the vector is partially
+    // constant. This works when the scalarizer pass is run first.
+    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+      Value *NumEltI = Builder.CreateExtractElement(Num, I);
+      Value *DenEltI = Builder.CreateExtractElement(Den, I);
+      Value *NewElt;
+
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+      } else {
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      }
+
+      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+    }
+  } else {
+    if (!shouldKeepFDivF32(Num, UnsafeDiv))
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  }
+
+  if (NewFDiv) {
+    FDiv.replaceAllUsesWith(NewFDiv);
+    NewFDiv->takeName(&FDiv);
+    FDiv.eraseFromParent();
+  }
+
+  return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformOpToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::bitreverse:
+    return visitBitreverseIntrinsicInst(I);
+  default:
+    return false;
+  }
+}
+
+bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+  bool Changed = false;
+
+  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
+      DA->isUniform(&I))
+    Changed |= promoteUniformBitreverseToI32(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
+  return false;
+}
+
+bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
+  if (!TM || skipFunction(F))
+    return false;
+
+  ST = &TM->getSubtarget<SISubtarget>(F);
+  DA = &getAnalysis<DivergenceAnalysis>();
+  HasUnsafeFPMath = hasUnsafeFPMath(F);
+
+  bool MadeChange = false;
+
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Next;
+    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+      Next = std::next(I);
+      MadeChange |= visit(*I);
+    }
+  }
+
+  return MadeChange;
+}
+
+INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+                      "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+                       "AMDGPU IR optimizations", false, false)
+
+char AMDGPUCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
+  return new AMDGPUCodeGenPrepare(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
new file mode 100644
index 000000000000..805fb7102a35
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -0,0 +1,102 @@
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDGPU target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() = default;
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+  // XXX: Hardcoding to 1 for now.
+  //
+  // I think the StackWidth should stored as metadata associated with the
+  // MachineFunction.  This metadata can either be added by a frontend, or
+  // calculated by a R600 specific LLVM IR pass.
+  //
+  // The StackWidth determines how stack objects are laid out in memory.
+  // For a vector stack variable, like: int4 stack[2], the data will be stored
+  // in the following ways depending on the StackWidth.
+  //
+  // StackWidth = 1:
+  //
+  // T0.X = stack[0].x
+  // T1.X = stack[0].y
+  // T2.X = stack[0].z
+  // T3.X = stack[0].w
+  // T4.X = stack[1].x
+  // T5.X = stack[1].y
+  // T6.X = stack[1].z
+  // T7.X = stack[1].w
+  //
+  // StackWidth = 2:
+  //
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T1.X = stack[0].z
+  // T1.Y = stack[0].w
+  // T2.X = stack[1].x
+  // T2.Y = stack[1].y
+  // T3.X = stack[1].z
+  // T3.Y = stack[1].w
+  //
+  // StackWidth = 4:
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T0.Z = stack[0].z
+  // T0.W = stack[0].w
+  // T1.X = stack[1].x
+  // T1.Y = stack[1].y
+  // T1.Z = stack[1].z
+  // T1.W = stack[1].w
+  return 1;
+}
+
+/// \returns The number of registers allocated for \p FI.
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AMDGPURegisterInfo *RI
+    = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
+
+  // Fill in FrameReg output argument.
+  FrameReg = RI->getFrameRegister(MF);
+
+  // Start the offset at 2 so we don't overwrite work group information.
+  // XXX: We should only do this when the shader actually uses this
+  // information.
+  unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+  int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
+
+  for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
+    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+    OffsetBytes += MFI.getObjectSize(i);
+    // Each register holds 4 bytes, so we must always align the offset to at
+    // least 4 bytes, so that 2 frame objects won't share the same register.
+    OffsetBytes = alignTo(OffsetBytes, 4);
+  }
+
+  if (FI != -1)
+    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+
+  return OffsetBytes / (getStackWidth(MF) * 4);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
new file mode 100644
index 000000000000..5d51351a00d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -0,0 +1,47 @@
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+///
+/// It holds the direction of the stack growth, the known stack alignment on
+/// entry to each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+class AMDGPUFrameLowering : public TargetFrameLowering {
+public:
+  AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+                      unsigned TransAl = 1);
+  ~AMDGPUFrameLowering() override;
+
+  /// \returns The number of 32-bit sub-registers that are used when storing
+  /// values to the stack.
+  unsigned getStackWidth(const MachineFunction &MF) const;
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+
+  bool hasFP(const MachineFunction &MF) const override {
+    return false;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ef3b44f7c211
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -0,0 +1,1632 @@
+//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+class R600InstrInfo;
+
+} // end namespace llvm
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDGPUSubtarget *Subtarget;
+
+public:
+  explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel) {}
+  ~AMDGPUDAGToDAGISel() override = default;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void Select(SDNode *N) override;
+  StringRef getPassName() const override;
+  void PostprocessISelDAG() override;
+
+private:
+  SDValue foldFrameIndex(SDValue N) const;
+  bool isInlineImmediate(const SDNode *N) const;
+  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+                   const R600InstrInfo *TII);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+  bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+  bool isConstantLoad(const MemSDNode *N, int cbID) const;
+  bool isUniformBr(const SDNode *N) const;
+
+  SDNode *glueCopyToM0(SDNode *N) const;
+
+  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+                                       SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                       unsigned OffsetBits) const;
+  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                                 SDValue &Offset1) const;
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
+                   SDValue &TFE) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                         SDValue &SOffset, SDValue &Offset, SDValue &GLC,
+                         SDValue &SLC, SDValue &TFE) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                         SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
+                         SDValue &SLC) const;
+  bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+                          SDValue &SOffset, SDValue &ImmOffset) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
+                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
+                         SDValue &TFE) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+                         SDValue &Offset, SDValue &SLC) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+                         SDValue &Offset) const;
+  bool SelectMUBUFConstant(SDValue Constant,
+                           SDValue &SOffset,
+                           SDValue &ImmOffset) const;
+  bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
+                                  SDValue &ImmOffset) const;
+  bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
+                                   SDValue &ImmOffset, SDValue &VOffset) const;
+
+  bool SelectFlat(SDValue Addr, SDValue &VAddr,
+                  SDValue &SLC, SDValue &TFE) const;
+
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+                        bool &Imm) const;
+  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+                  bool &Imm) const;
+  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                       SDValue &Clamp, SDValue &Omod) const;
+  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                         SDValue &Clamp, SDValue &Omod) const;
+
+  bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
+                            SDValue &Omod) const;
+  bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
+                                 SDValue &Clamp,
+                                 SDValue &Omod) const;
+
+  void SelectADD_SUB_I64(SDNode *N);
+  void SelectDIV_SCALE(SDNode *N);
+  void SelectFMA_W_CHAIN(SDNode *N);
+  void SelectFMUL_W_CHAIN(SDNode *N);
+
+  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
+                   uint32_t Offset, uint32_t Width);
+  void SelectS_BFEFromShifts(SDNode *N);
+  void SelectS_BFE(SDNode *N);
+  bool isCBranchSCC(const SDNode *N) const;
+  void SelectBRCOND(SDNode *N);
+  void SelectATOMIC_CMP_SWAP(SDNode *N);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+
+}  // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new AMDGPUDAGToDAGISel(TM, OptLevel);
+}
+
+bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<AMDGPUSubtarget>();
+  return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+  const SIInstrInfo *TII
+    = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
+
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+    return TII->isInlineConstant(C->getAPIntValue());
+
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+
+  return false;
+}
+
+/// \brief Determine the register class for \p OpNo
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \OpNo or NULL if the register class cannot be
+/// determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
+  if (!N->isMachineOpcode()) {
+    if (N->getOpcode() == ISD::CopyToReg) {
+      unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
+        return MRI.getRegClass(Reg);
+      }
+
+      const SIRegisterInfo *TRI
+        = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+      return TRI->getPhysRegClass(Reg);
+    }
+
+    return nullptr;
+  }
+
+  switch (N->getMachineOpcode()) {
+  default: {
+    const MCInstrDesc &Desc =
+        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
+    unsigned OpIdx = Desc.getNumDefs() + OpNo;
+    if (OpIdx >= Desc.getNumOperands())
+      return nullptr;
+    int RegClass = Desc.OpInfo[OpIdx].RegClass;
+    if (RegClass == -1)
+      return nullptr;
+
+    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
+  }
+  case AMDGPU::REG_SEQUENCE: {
+    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    const TargetRegisterClass *SuperRC =
+        Subtarget->getRegisterInfo()->getRegClass(RCID);
+
+    SDValue SubRegOp = N->getOperand(OpNo + 1);
+    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
+    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
+                                                              SubRegIdx);
+  }
+  }
+}
+
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+    return N;
+
+  const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+
+  // Write max value to m0 before each load operation
+
+  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+  SDValue Glue = M0.getValue(1);
+
+  SmallVector <SDValue, 8> Ops;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+     Ops.push_back(N->getOperand(i));
+  }
+  Ops.push_back(Glue);
+  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
+
+  return N;
+}
+
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+  switch (NumVectorElts) {
+  case 1:
+    return AMDGPU::SReg_32_XM0RegClassID;
+  case 2:
+    return AMDGPU::SReg_64RegClassID;
+  case 4:
+    return AMDGPU::SReg_128RegClassID;
+  case 8:
+    return AMDGPU::SReg_256RegClassID;
+  case 16:
+    return AMDGPU::SReg_512RegClassID;
+  }
+
+  llvm_unreachable("invalid vector size");
+}
+
+void AMDGPUDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  if (isa<AtomicSDNode>(N) ||
+      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+    N = glueCopyToM0(N);
+
+  switch (Opc) {
+  default: break;
+  // We are selecting i64 ADD here instead of custom lower it during
+  // DAG legalization, so we can fold some i64 ADDs used for address
+  // calculation into the LOAD and STORE instructions.
+  case ISD::ADD:
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUB:
+  case ISD::SUBC:
+  case ISD::SUBE: {
+    if (N->getValueType(0) != MVT::i64 ||
+        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    SelectADD_SUB_I64(N);
+    return;
+  }
+  case AMDGPUISD::FMUL_W_CHAIN: {
+    SelectFMUL_W_CHAIN(N);
+    return;
+  }
+  case AMDGPUISD::FMA_W_CHAIN: {
+    SelectFMA_W_CHAIN(N);
+    return;
+  }
+
+  case ISD::SCALAR_TO_VECTOR:
+  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
+  case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
+    const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    assert(EltVT.bitsEq(MVT::i32));
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // that adds a 128 bits reg copy when going through TwoAddressInstructions
+      // pass. We want to avoid 128 bits copies as much as possible because they
+      // can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4:
+        if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+          RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
+        else
+          RegClassID = AMDGPU::R600_Reg128RegClassID;
+        break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
+    }
+
+    SDLoc DL(N);
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
+                           RegClass);
+      return;
+    }
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+    bool IsRegSeq = true;
+    unsigned NOps = N->getNumOperands();
+    for (unsigned i = 0; i < NOps; i++) {
+      // XXX: Why is this here?
+      if (isa<RegisterSDNode>(N->getOperand(i))) {
+        IsRegSeq = false;
+        break;
+      }
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL,
+                                        MVT::i32);
+    }
+
+    if (NOps != NumVectorElts) {
+      // Fill in the missing undef elements if this was a scalar_to_vector.
+      assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
+
+      MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                     DL, EltVT);
+      for (unsigned i = NOps; i < NumVectorElts; ++i) {
+        RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
+        RegSeqArgs[1 + (2 * i) + 1] =
+          CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32);
+      }
+    }
+
+    if (!IsRegSeq)
+      break;
+    CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
+    return;
+  }
+  case ISD::BUILD_PAIR: {
+    SDValue RC, SubReg0, SubReg1;
+    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+      break;
+    }
+    SDLoc DL(N);
+    if (N->getValueType(0) == MVT::i128) {
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
+    } else if (N->getValueType(0) == MVT::i64) {
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+    } else {
+      llvm_unreachable("Unhandled value type for BUILD_PAIR");
+    }
+    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+                            N->getOperand(1), SubReg1 };
+    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+                                          N->getValueType(0), Ops));
+    return;
+  }
+
+  case ISD::Constant:
+  case ISD::ConstantFP: {
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+      break;
+
+    uint64_t Imm;
+    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+    else {
+      ConstantSDNode *C = cast<ConstantSDNode>(N);
+      Imm = C->getZExtValue();
+    }
+
+    SDLoc DL(N);
+    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
+                                                    MVT::i32));
+    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+    const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+    };
+
+    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+                                          N->getValueType(0), Ops));
+    return;
+  }
+  case ISD::LOAD:
+  case ISD::STORE: {
+    N = glueCopyToM0(N);
+    break;
+  }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                            SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+    return;
+  }
+  case AMDGPUISD::DIV_SCALE: {
+    SelectDIV_SCALE(N);
+    return;
+  }
+  case ISD::CopyToReg: {
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+    Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+    break;
+  }
+  case ISD::AND:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::SIGN_EXTEND_INREG:
+    if (N->getValueType(0) != MVT::i32 ||
+        Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    SelectS_BFE(N);
+    return;
+  case ISD::BRCOND:
+    SelectBRCOND(N);
+    return;
+
+  case AMDGPUISD::ATOMIC_CMP_SWAP:
+    SelectATOMIC_CMP_SWAP(N);
+    return;
+  }
+
+  SelectCode(N);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
+  if (!N->readMem())
+    return false;
+  if (CbId == -1)
+    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+
+  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+}
+
+bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
+  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+  const Instruction *Term = BB->getTerminator();
+  return Term->getMetadata("amdgpu.uniform") ||
+         Term->getMetadata("structurizecfg.uniform");
+}
+
+StringRef AMDGPUDAGToDAGISel::getPassName() const {
+  return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+                                                         SDValue& IntPtr) {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
+                                       true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+    SDValue& BaseReg, SDValue &Offset) {
+  if (!isa<ConstantSDNode>(Addr)) {
+    BaseReg = Addr;
+    Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) {
+  ConstantSDNode *IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                         MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  SDLoc(CurDAG->getEntryNode()),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
+                                       MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  ConstantSDNode *C;
+  SDLoc DL(Addr);
+
+  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  }
+
+  return true;
+}
+
+void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  unsigned Opcode = N->getOpcode();
+  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
+  bool ProduceCarry =
+      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
+  bool IsAdd =
+      (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE);
+
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, LHS, Sub0);
+  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, LHS, Sub1);
+
+  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, RHS, Sub0);
+  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                       DL, MVT::i32, RHS, Sub1);
+
+  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
+
+  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+
+  SDNode *AddLo;
+  if (!ConsumeCarry) {
+    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
+    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
+  } else {
+    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
+    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
+  }
+  SDValue AddHiArgs[] = {
+    SDValue(Hi0, 0),
+    SDValue(Hi1, 0),
+    SDValue(AddLo, 1)
+  };
+  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
+
+  SDValue RegSequenceArgs[] = {
+    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+    SDValue(AddLo,0),
+    Sub0,
+    SDValue(AddHi,0),
+    Sub1,
+  };
+  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                               MVT::i64, RegSequenceArgs);
+
+  if (ProduceCarry) {
+    // Replace the carry-use
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
+  }
+
+  // Replace the remaining uses.
+  CurDAG->ReplaceAllUsesWith(N, RegSequence);
+  CurDAG->RemoveDeadNode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  SDValue Ops[10];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+  Ops[8] = N->getOperand(0);
+  Ops[9] = N->getOperand(4);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+  SDLoc SL(N);
+  //	src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+  Ops[6] = N->getOperand(0);
+  Ops[7] = N->getOperand(3);
+
+  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(VT == MVT::f32 || VT == MVT::f64);
+
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
+
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod
+  SDValue Ops[8];
+
+  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
+  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
+  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
+  CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+}
+
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                                         unsigned OffsetBits) const {
+  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
+      (OffsetBits == 8 && !isUInt<8>(Offset)))
+    return false;
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS ||
+      Subtarget->unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands instruction with a negative base value and an offset
+  // don't seem to work.
+  return CurDAG->SignBitIsZero(Base);
+}
+
+bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
+                                              SDValue &Offset) const {
+  SDLoc DL(Addr);
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+      // (add n0, c0)
+      Base = N0;
+      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      int64_t ByteOffset = C->getSExtValue();
+      if (isUInt<16>(ByteOffset)) {
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    // If we have a constant address, prefer to put the constant into the
+    // offset. This can save moves to load the constant address since multiple
+    // operations can share the zero base address register, and enables merging
+    // into read2 / write2 instructions.
+
+    SDLoc DL(Addr);
+
+    if (isUInt<16>(CAddr->getZExtValue())) {
+      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                 DL, MVT::i32, Zero);
+      Base = SDValue(MovZero, 0);
+      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+  }
+
+  // default case
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
+  return true;
+}
+
+// TODO: If offset is too big, put low 16-bit into offset.
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+                                                   SDValue &Offset0,
+                                                   SDValue &Offset1) const {
+  SDLoc DL(Addr);
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    unsigned DWordOffset0 = C1->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    // (add n0, c0)
+    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+      Base = N0;
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      return true;
+    }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      unsigned DWordOffset0 = C->getZExtValue() / 4;
+      unsigned DWordOffset1 = DWordOffset0 + 1;
+
+      if (isUInt<8>(DWordOffset0)) {
+        SDLoc DL(Addr);
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      MachineSDNode *MovZero
+        = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                 DL, MVT::i32, Zero);
+      Base = SDValue(MovZero, 0);
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+      return true;
+    }
+  }
+
+  // default case
+
+  // FIXME: This is broken on SI where we still need to check if the base
+  // pointer is positive here.
+  Base = Addr;
+  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
+  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
+  return true;
+}
+
+static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
+  return isUInt<12>(Imm->getZExtValue());
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+                                     SDValue &VAddr, SDValue &SOffset,
+                                     SDValue &Offset, SDValue &Offen,
+                                     SDValue &Idxen, SDValue &Addr64,
+                                     SDValue &GLC, SDValue &SLC,
+                                     SDValue &TFE) const {
+  // Subtarget prefers to use flat instruction
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
+  SDLoc DL(Addr);
+
+  if (!GLC.getNode())
+    GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  if (!SLC.getNode())
+    SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
+  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    if (N0.getOpcode() == ISD::ADD) {
+      // (add (add N2, N3), C1) -> addr64
+      SDValue N2 = N0.getOperand(0);
+      SDValue N3 = N0.getOperand(1);
+      Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+      Ptr = N2;
+      VAddr = N3;
+    } else {
+      // (add N0, C1) -> offset
+      VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
+      Ptr = N0;
+    }
+
+    if (isLegalMUBUFImmOffset(C1)) {
+      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+
+    if (isUInt<32>(C1->getZExtValue())) {
+      // Illegal offset, store it in soffset.
+      Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+      SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+                        0);
+      return true;
+    }
+  }
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    // (add N0, N1) -> addr64
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+    Ptr = N0;
+    VAddr = N1;
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    return true;
+  }
+
+  // default case -> offset
+  VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Ptr = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset, SDValue &GLC,
+                                           SDValue &SLC, SDValue &TFE) const {
+  SDValue Ptr, Offen, Idxen, Addr64;
+
+  // addr64 bit was removed for volcanic islands.
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return false;
+
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+              GLC, SLC, TFE))
+    return false;
+
+  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
+  if (C->getSExtValue()) {
+    SDLoc DL(Addr);
+
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+
+    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
+    return true;
+  }
+
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &SOffset,
+                                           SDValue &Offset,
+                                           SDValue &SLC) const {
+  SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
+  SDValue GLC, TFE;
+
+  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
+}
+
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  return N;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+                                            SDValue &VAddr, SDValue &SOffset,
+                                            SDValue &ImmOffset) const {
+
+  SDLoc DL(Addr);
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
+
+  // (add n0, c1)
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+
+    // Offsets in vaddr must be positive.
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    if (isLegalMUBUFImmOffset(C1)) {
+      VAddr = foldFrameIndex(N0);
+      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+      return true;
+    }
+  }
+
+  // (node)
+  VAddr = foldFrameIndex(Addr);
+  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &SOffset, SDValue &Offset,
+                                           SDValue &GLC, SDValue &SLC,
+                                           SDValue &TFE) const {
+  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+              GLC, SLC, TFE))
+    return false;
+
+  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
+      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
+      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
+    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
+                    APInt::getAllOnesValue(32).getZExtValue(); // Size
+    SDLoc DL(Addr);
+
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+
+    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset, SDValue &Offset
+                                           ) const {
+  SDValue GLC, SLC, TFE;
+
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+}
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset, SDValue &Offset,
+                                           SDValue &SLC) const {
+  SDValue GLC, TFE;
+
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
+                                             SDValue &SOffset,
+                                             SDValue &ImmOffset) const {
+  SDLoc DL(Constant);
+  uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
+  uint32_t Overflow = 0;
+
+  if (Imm >= 4096) {
+    if (Imm <= 4095 + 64) {
+      // Use an SOffset inline constant for 1..64
+      Overflow = Imm - 4095;
+      Imm = 4095;
+    } else {
+      // Try to keep the same value in SOffset for adjacent loads, so that
+      // the corresponding register contents can be re-used.
+      //
+      // Load values with all low-bits set into SOffset, so that a larger
+      // range of values can be covered using s_movk_i32
+      uint32_t High = (Imm + 1) & ~4095;
+      uint32_t Low = (Imm + 1) & 4095;
+      Imm = Low;
+      Overflow = High - 1;
+    }
+  }
+
+  // There is a hardware bug in SI and CI which prevents address clamping in
+  // MUBUF instructions from working correctly with SOffsets. The immediate
+  // offset is unaffected.
+  if (Overflow > 0 &&
+      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
+
+  if (Overflow <= 64)
+    SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
+  else
+    SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                      CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
+                      0);
+
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
+                                                    SDValue &SOffset,
+                                                    SDValue &ImmOffset) const {
+  SDLoc DL(Offset);
+
+  if (!isa<ConstantSDNode>(Offset))
+    return false;
+
+  return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
+                                                     SDValue &SOffset,
+                                                     SDValue &ImmOffset,
+                                                     SDValue &VOffset) const {
+  SDLoc DL(Offset);
+
+  // Don't generate an unnecessary voffset for constant offsets.
+  if (isa<ConstantSDNode>(Offset)) {
+    SDValue Tmp1, Tmp2;
+
+    // When necessary, use a voffset in <= CI anyway to work around a hardware
+    // bug.
+    if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
+        SelectMUBUFConstant(Offset, Tmp1, Tmp2))
+      return false;
+  }
+
+  if (CurDAG->isBaseWithConstantOffset(Offset)) {
+    SDValue N0 = Offset.getOperand(0);
+    SDValue N1 = Offset.getOperand(1);
+    if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
+        SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
+      VOffset = N0;
+      return true;
+    }
+  }
+
+  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  VOffset = Offset;
+
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
+                                    SDValue &VAddr,
+                                    SDValue &SLC,
+                                    SDValue &TFE) const {
+  VAddr = Addr;
+  TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+  return true;
+}
+
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+///        directly into the instruction.  On SI/CI the \p EncodedOffset
+///        will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+                                 int64_t EncodedOffset) {
+  return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+     isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+                                          SDValue &Offset, bool &Imm) const {
+
+  // FIXME: Handle non-constant offsets.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+  if (!C)
+    return false;
+
+  SDLoc SL(ByteOffsetNode);
+  AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+  int64_t ByteOffset = C->getSExtValue();
+  int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+      ByteOffset >> 2 : ByteOffset;
+
+  if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+    Imm = true;
+    return true;
+  }
+
+  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+    return false;
+
+  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+    // 32-bit Immediates are supported on Sea Islands.
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+  } else {
+    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+                                            C32Bit), 0);
+  }
+  Imm = false;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+                                     SDValue &Offset, bool &Imm) const {
+  SDLoc SL(Addr);
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+
+    if (SelectSMRDOffset(N1, Offset, Imm)) {
+      SBase = N0;
+      return true;
+    }
+  }
+  SBase = Addr;
+  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+  Imm = true;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+                                       SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+                                         SDValue &Offset) const {
+
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRD(Addr, SBase, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+                                        SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+                                             SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+                                               SDValue &Offset) const {
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRDOffset(Addr, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+                                              SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+                                            SDValue &Base,
+                                            SDValue &Offset) const {
+  SDLoc DL(Index);
+
+  if (CurDAG->isBaseWithConstantOffset(Index)) {
+    SDValue N0 = Index.getOperand(0);
+    SDValue N1 = Index.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    // (add n0, c0)
+    Base = N0;
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+    return true;
+  }
+
+  if (isa<ConstantSDNode>(Index))
+    return false;
+
+  Base = Index;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
+                                     SDValue Val, uint32_t Offset,
+                                     uint32_t Width) {
+  // Transformation function, pack the offset and width of a BFE into
+  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+  // source, bits [5:0] contain the offset and bits [22:16] the width.
+  uint32_t PackedVal = Offset | (Width << 16);
+  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
+
+  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
+}
+
+void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
+  // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
+  // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
+  // Predicate: 0 < b <= c < 32
+
+  const SDValue &Shl = N->getOperand(0);
+  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+  if (B && C) {
+    uint32_t BVal = B->getZExtValue();
+    uint32_t CVal = C->getZExtValue();
+
+    if (0 < BVal && BVal <= CVal && CVal < 32) {
+      bool Signed = N->getOpcode() == ISD::SRA;
+      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
+
+      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
+                              32 - CVal));
+      return;
+    }
+  }
+  SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
+  switch (N->getOpcode()) {
+  case ISD::AND:
+    if (N->getOperand(0).getOpcode() == ISD::SRL) {
+      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
+      // Predicate: isMask(mask)
+      const SDValue &Srl = N->getOperand(0);
+      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
+      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+
+      if (Shift && Mask) {
+        uint32_t ShiftVal = Shift->getZExtValue();
+        uint32_t MaskVal = Mask->getZExtValue();
+
+        if (isMask_32(MaskVal)) {
+          uint32_t WidthVal = countPopulation(MaskVal);
+
+          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+                                  Srl.getOperand(0), ShiftVal, WidthVal));
+          return;
+        }
+      }
+    }
+    break;
+  case ISD::SRL:
+    if (N->getOperand(0).getOpcode() == ISD::AND) {
+      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
+      // Predicate: isMask(mask >> b)
+      const SDValue &And = N->getOperand(0);
+      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
+      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
+
+      if (Shift && Mask) {
+        uint32_t ShiftVal = Shift->getZExtValue();
+        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
+
+        if (isMask_32(MaskVal)) {
+          uint32_t WidthVal = countPopulation(MaskVal);
+
+          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+                                  And.getOperand(0), ShiftVal, WidthVal));
+          return;
+        }
+      }
+    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
+      SelectS_BFEFromShifts(N);
+      return;
+    }
+    break;
+  case ISD::SRA:
+    if (N->getOperand(0).getOpcode() == ISD::SHL) {
+      SelectS_BFEFromShifts(N);
+      return;
+    }
+    break;
+
+  case ISD::SIGN_EXTEND_INREG: {
+    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
+    SDValue Src = N->getOperand(0);
+    if (Src.getOpcode() != ISD::SRL)
+      break;
+
+    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
+    if (!Amt)
+      break;
+
+    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+                            Amt->getZExtValue(), Width));
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
+  assert(N->getOpcode() == ISD::BRCOND);
+  if (!N->hasOneUse())
+    return false;
+
+  SDValue Cond = N->getOperand(1);
+  if (Cond.getOpcode() == ISD::CopyToReg)
+    Cond = Cond.getOperand(2);
+
+  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
+    return false;
+
+  MVT VT = Cond.getOperand(0).getSimpleValueType();
+  if (VT == MVT::i32)
+    return true;
+
+  if (VT == MVT::i64) {
+    auto ST = static_cast<const SISubtarget *>(Subtarget);
+
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
+  }
+
+  return false;
+}
+
+void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
+  SDValue Cond = N->getOperand(1);
+
+  if (Cond.isUndef()) {
+    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
+                         N->getOperand(2), N->getOperand(0));
+    return;
+  }
+
+  if (isCBranchSCC(N)) {
+    // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+    SelectCode(N);
+    return;
+  }
+
+  SDLoc SL(N);
+
+  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond);
+  CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+                       N->getOperand(2), // Basic Block
+                       VCC.getValue(0));
+}
+
+// This is here because there isn't a way to use the generated sub0_sub1 as the
+// subreg index to EXTRACT_SUBREG in tablegen.
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  unsigned AS = Mem->getAddressSpace();
+  if (AS == AMDGPUAS::FLAT_ADDRESS) {
+    SelectCode(N);
+    return;
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  bool Is32 = (VT == MVT::i32);
+  SDLoc SL(N);
+
+  MachineSDNode *CmpSwap = nullptr;
+  if (Subtarget->hasAddr64()) {
+    SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC;
+
+    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64;
+      SDValue CmpVal = Mem->getOperand(2);
+
+      // XXX - Do we care about glue operands?
+
+      SDValue Ops[] = {
+        CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+      };
+
+      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+    }
+  }
+
+  if (!CmpSwap) {
+    SDValue SRsrc, SOffset, Offset, SLC;
+    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET :
+        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET;
+
+      SDValue CmpVal = Mem->getOperand(2);
+      SDValue Ops[] = {
+        CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain()
+      };
+
+      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
+    }
+  }
+
+  if (!CmpSwap) {
+    SelectCode(N);
+    return;
+  }
+
+  MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1);
+  *MMOs = Mem->getMemOperand();
+  CmpSwap->setMemRefs(MMOs, MMOs + 1);
+
+  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+  SDValue Extract
+    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
+
+  ReplaceUses(SDValue(N, 0), Extract);
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+                                        SDValue &SrcMods) const {
+  unsigned Mods = 0;
+
+  Src = In;
+
+  if (Src.getOpcode() == ISD::FNEG) {
+    Mods |= SISrcMods::NEG;
+    Src = Src.getOperand(0);
+  }
+
+  if (Src.getOpcode() == ISD::FABS) {
+    Mods |= SISrcMods::ABS;
+    Src = Src.getOperand(0);
+  }
+
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods) const {
+  bool Res = SelectVOP3Mods(In, Src, SrcMods);
+  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods, SDValue &Clamp,
+                                         SDValue &Omod) const {
+  SDLoc DL(In);
+  // FIXME: Handle Clamp and Omod
+  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
+                                           SDValue &SrcMods, SDValue &Clamp,
+                                           SDValue &Omod) const {
+  bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
+
+  return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
+                cast<ConstantSDNode>(Clamp)->isNullValue() &&
+                cast<ConstantSDNode>(Omod)->isNullValue();
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
+                                              SDValue &SrcMods,
+                                              SDValue &Omod) const {
+  // FIXME: Handle Omod
+  Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
+                                                   SDValue &SrcMods,
+                                                   SDValue &Clamp,
+                                                   SDValue &Omod) const {
+  Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+  const AMDGPUTargetLowering& Lowering =
+    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
+  bool IsModified = false;
+  do {
+    IsModified = false;
+    // Go over all selected nodes and try to fold them a bit more
+    for (SDNode &Node : CurDAG->allnodes()) {
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+      if (!MachineNode)
+        continue;
+
+      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+      if (ResNode != &Node) {
+        ReplaceUses(&Node, ResNode);
+        IsModified = true;
+      }
+    }
+    CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
new file mode 100644
index 000000000000..a87204d46eae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -0,0 +1,3176 @@
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is the parent TargetLowering class for hardware code gen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600MachineFunctionInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "SIInstrInfo.h"
+using namespace llvm;
+
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  MachineFunction &MF = State.getMachineFunction();
+  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
+  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
+                                         ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
+#include "AMDGPUGenCallingConv.inc"
+
+// Find a larger type to do a load / store of a vector with.
+EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
+  unsigned StoreSize = VT.getStoreSizeInBits();
+  if (StoreSize <= 32)
+    return EVT::getIntegerVT(Ctx, StoreSize);
+
+  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
+  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
+AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
+                                           const AMDGPUSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  // Lower floating point store/load to integer store/load to reduce the number
+  // of patterns in tablegen.
+  setOperationAction(ISD::LOAD, MVT::f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
+  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::LOAD, MVT::i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
+  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
+
+  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+
+  // There are no 64-bit extloads. These should be done as a 32-bit extload and
+  // an extension to 64-bit.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+  }
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    if (VT == MVT::i64)
+      continue;
+
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+  }
+
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+  }
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+
+  setOperationAction(ISD::STORE, MVT::f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
+  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::STORE, MVT::i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
+
+  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+
+  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
+  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
+  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
+
+  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
+
+
+  setOperationAction(ISD::Constant, MVT::i32, Legal);
+  setOperationAction(ISD::Constant, MVT::i64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  // This is totally unsupported, just custom lower to produce an error.
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+  // We need to custom lower some of the intrinsics
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  // Library functions.  These default to Expand, but we have instructions
+  // for them.
+  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
+  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
+  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
+  setOperationAction(ISD::FABS,   MVT::f32, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+  setOperationAction(ISD::FROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+  setOperationAction(ISD::FREM, MVT::f32, Custom);
+  setOperationAction(ISD::FREM, MVT::f64, Custom);
+
+  // v_mad_f32 does not support denormals according to some sources.
+  if (!Subtarget->hasFP32Denormals())
+    setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+  // Expand to fneg + fadd.
+  setOperationAction(ISD::FSUB, MVT::f64, Expand);
+
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
+
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+    setOperationAction(ISD::FRINT, MVT::f64, Custom);
+    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
+  }
+
+  if (!Subtarget->hasBFI()) {
+    // fcopysign can be done in a single instruction with BFI.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  }
+
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
+
+  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+  for (MVT VT : ScalarIntVTs) {
+    // These should use [SU]DIVREM, so set them to expand
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+
+    // GPU does not have divrem function for signed or unsigned.
+    setOperationAction(ISD::SDIVREM, VT, Custom);
+    setOperationAction(ISD::UDIVREM, VT, Custom);
+
+    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  if (!Subtarget->hasBCNT(32))
+    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+  if (!Subtarget->hasBCNT(64))
+    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+  // The hardware supports 32-bit ROTR, but not ROTL.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
+  setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
+  setOperationAction(ISD::SMIN, MVT::i32, Legal);
+  setOperationAction(ISD::UMIN, MVT::i32, Legal);
+  setOperationAction(ISD::SMAX, MVT::i32, Legal);
+  setOperationAction(ISD::UMAX, MVT::i32, Legal);
+
+  if (Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+  if (Subtarget->hasFFBL())
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
+
+  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+
+  // We only really have 32-bit BFE instructions (and 16-bit on VI).
+  //
+  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+  // effort to match them now. We want this to be false for i64 cases when the
+  // extraction isn't restricted to the upper or lower half. Ideally we would
+  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+  // span the midpoint are probably relatively rare, so don't worry about them
+  // for now.
+  if (Subtarget->hasBFE())
+    setHasExtractBitsInsn(true);
+
+  static const MVT::SimpleValueType VectorIntTypes[] = {
+    MVT::v2i32, MVT::v4i32
+  };
+
+  for (MVT VT : VectorIntTypes) {
+    // Expand the following operations for the current type by default.
+    setOperationAction(ISD::ADD,  VT, Expand);
+    setOperationAction(ISD::AND,  VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::MUL,  VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::OR,   VT, Expand);
+    setOperationAction(ISD::SHL,  VT, Expand);
+    setOperationAction(ISD::SRA,  VT, Expand);
+    setOperationAction(ISD::SRL,  VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::SUB,  VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Custom);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::XOR,  VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+  }
+
+  static const MVT::SimpleValueType FloatVectorTypes[] = {
+    MVT::v2f32, MVT::v4f32
+  };
+
+  for (MVT VT : FloatVectorTypes) {
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FMINNUM, VT, Expand);
+    setOperationAction(ISD::FMAXNUM, VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FREM, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+  }
+
+  // This causes using an unrolled select operation rather than expansion with
+  // bit operations. This is in general better, but the alternative using BFI
+  // instructions may be better if the select sources are SGPRs.
+  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
+
+  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
+
+  // There are no libcalls of any kind.
+  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
+    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  setSchedulingPreference(Sched::RegPressure);
+  setJumpIsExpensive(true);
+  setHasMultipleConditionRegisters(true);
+
+  // SI at least has hardware support for floating point exceptions, but no way
+  // of using or handling them is implemented. They are also optional in OpenCL
+  // (Section 7.3)
+  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
+
+  PredictableSelectIsExpensive = false;
+
+  // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors with > 4
+  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are a legal type, even though we have to split the loads
+  // usually. When we can more precisely specify load legality per address
+  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+  // smarter so that they can figure out what to do in 2 iterations without all
+  // N > 4 stores on the same chain.
+  GatherAllAliasesMaxDepth = 16;
+
+  // FIXME: Need to really handle these.
+  MaxStoresPerMemcpy  = 4096;
+  MaxStoresPerMemmove = 4096;
+  MaxStoresPerMemset  = 4096;
+
+  setTargetDAGCombine(ISD::BITCAST);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::MULHU);
+  setTargetDAGCombine(ISD::MULHS);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Information
+//===----------------------------------------------------------------------===//
+
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
+  return MVT::i32;
+}
+
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+  return true;
+}
+
+// The backend supports 32 and 64 bit floating point immediates.
+// FIXME: Why are we reporting vectors of FP immediates as legal?
+bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  EVT ScalarVT = VT.getScalarType();
+  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
+         (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
+}
+
+// We don't want to shrink f64 / f32 constants.
+bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  EVT ScalarVT = VT.getScalarType();
+  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
+}
+
+bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
+                                                 ISD::LoadExtType,
+                                                 EVT NewVT) const {
+
+  unsigned NewSize = NewVT.getStoreSizeInBits();
+
+  // If we are reducing to a 32-bit load, this is always better.
+  if (NewSize == 32)
+    return true;
+
+  EVT OldVT = N->getValueType(0);
+  unsigned OldSize = OldVT.getStoreSizeInBits();
+
+  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
+  // extloads, so doing one requires using a buffer_load. In cases where we
+  // still couldn't use a scalar load, using the wider load shouldn't really
+  // hurt anything.
+
+  // If the old size already had to be an extload, there's no harm in continuing
+  // to reduce the width.
+  return (OldSize < 32);
+}
+
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+
+  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
+
+  if (LoadTy.getScalarType() == MVT::i32)
+    return false;
+
+  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
+
+  return (LScalarSize < CastScalarSize) ||
+         (CastScalarSize >= 32);
+}
+
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+  return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+  return true;
+}
+
+//===---------------------------------------------------------------------===//
+// Target Properties
+//===---------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
+                                              VT == MVT::f16);
+}
+
+bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
+  return isFAbsFree(VT);
+}
+
+bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
+                                                         unsigned NumElem,
+                                                         unsigned AS) const {
+  return true;
+}
+
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+  // There are few operations which truly have vector input operands. Any vector
+  // operation is going to involve operations on each component, and a
+  // build_vector will be a copy per element, so it always makes sense to use a
+  // build_vector input in place of the extracted element to avoid a copy into a
+  // super register.
+  //
+  // We should probably only do this if all users are extracts only, but this
+  // should be the common case.
+  return true;
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
+  // Truncate is just accessing a subregister.
+
+  unsigned SrcSize = Source.getSizeInBits();
+  unsigned DestSize = Dest.getSizeInBits();
+
+  return DestSize < SrcSize && DestSize % 32 == 0 ;
+}
+
+bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
+  // Truncate is just accessing a subregister.
+
+  unsigned SrcSize = Source->getScalarSizeInBits();
+  unsigned DestSize = Dest->getScalarSizeInBits();
+
+  if (DestSize== 16 && Subtarget->has16BitInsts())
+    return SrcSize >= 32;
+
+  return DestSize < SrcSize && DestSize % 32 == 0;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
+  unsigned SrcSize = Src->getScalarSizeInBits();
+  unsigned DestSize = Dest->getScalarSizeInBits();
+
+  if (SrcSize == 16 && Subtarget->has16BitInsts())
+    return DestSize >= 32;
+
+  return SrcSize == 32 && DestSize == 64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
+  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
+  // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
+  // this will enable reducing 64-bit operations the 32-bit, which is always
+  // good.
+
+  if (Src == MVT::i16)
+    return Dest == MVT::i32 ||Dest == MVT::i64 ;
+
+  return Src == MVT::i32 && Dest == MVT::i64;
+}
+
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+  // limited number of native 64-bit operations. Shrinking an operation to fit
+  // in a single 32-bit register should always be helpful. As currently used,
+  // this is much less general than the name suggests, and is only used in
+  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
+  // not profitable, and may actually be harmful.
+  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+}
+
+//===---------------------------------------------------------------------===//
+// TargetLowering Callbacks
+//===---------------------------------------------------------------------===//
+
+/// The SelectionDAGBuilder will automatically promote function arguments
+/// with illegal types.  However, this does not work for the AMDGPU targets
+/// since the function arguments are stored in memory as these illegal types.
+/// In order to handle this properly we need to get the original types sizes
+/// from the LLVM IR Function and fixup the ISD:InputArg values before
+/// passing them to AnalyzeFormalArguments()
+
+/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
+/// input values across multiple registers.  Each item in the Ins array
+/// represents a single value that will be stored in regsters.  Ins[x].VT is
+/// the value type of the value that will be stored in the register, so
+/// whatever SDNode we lower the argument to needs to be this type.
+///
+/// In order to correctly lower the arguments we need to know the size of each
+/// argument.  Since Ins[x].VT gives us the size of the register that will
+/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
+/// for the orignal function argument so that we can deduce the correct memory
+/// type to use for Ins[x].  In most cases the correct memory type will be
+/// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
+/// we have a kernel argument of type v8i8, this argument will be split into
+/// 8 parts and each part will be represented by its own item in the Ins array.
+/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
+/// the argument before it was split.  From this, we deduce that the memory type
+/// for each individual part is i8.  We pass the memory type as LocVT to the
+/// calling convention analysis function and the register type (Ins[x].VT) as
+/// the ValVT.
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    const ISD::InputArg &In = Ins[i];
+    EVT MemVT;
+
+    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+
+    if (!Subtarget->isAmdHsaOS() &&
+        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
+      // The ABI says the caller will extend these values to 32-bits.
+      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+    } else if (NumRegs == 1) {
+      // This argument is not split, so the IR type is the memory type.
+      assert(!In.Flags.isSplit());
+      if (In.ArgVT.isExtended()) {
+        // We have an extended type, like i24, so we should just use the register type
+        MemVT = In.VT;
+      } else {
+        MemVT = In.ArgVT;
+      }
+    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+      // We have a vector value which has been split into a vector with
+      // the same scalar type, but fewer elements.  This should handle
+      // all the floating-point vector types.
+      MemVT = In.VT;
+    } else if (In.ArgVT.isVector() &&
+               In.ArgVT.getVectorNumElements() == NumRegs) {
+      // This arg has been split so that each element is stored in a separate
+      // register.
+      MemVT = In.ArgVT.getScalarType();
+    } else if (In.ArgVT.isExtended()) {
+      // We have an extended type, like i65.
+      MemVT = In.VT;
+    } else {
+      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
+      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+      if (In.VT.isInteger()) {
+        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+      } else if (In.VT.isVector()) {
+        assert(!In.VT.getScalarType().isFloatingPoint());
+        unsigned NumElements = In.VT.getVectorNumElements();
+        assert(MemoryBits % NumElements == 0);
+        // This vector type has been split into another vector type with
+        // a different elements size.
+        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                         MemoryBits / NumElements);
+        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+      } else {
+        llvm_unreachable("cannot deduce memory type.");
+      }
+    }
+
+    // Convert one element vectors to scalar.
+    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+      MemVT = MemVT.getScalarType();
+
+    if (MemVT.isExtended()) {
+      // This should really only happen if we have vec3 arguments
+      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+      MemVT = MemVT.getPow2VectorType(State.getContext());
+    }
+
+    assert(MemVT.isSimple());
+    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
+                    State);
+  }
+}
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
+}
+
+void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
+
+  State.AnalyzeReturn(Outs, RetCC_SI);
+}
+
+SDValue
+AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                  bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  const SDLoc &DL, SelectionDAG &DAG) const {
+  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
+}
+
+//===---------------------------------------------------------------------===//
+// Target specific lowering
+//===---------------------------------------------------------------------===//
+
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const {
+  SDValue Callee = CLI.Callee;
+  SelectionDAG &DAG = CLI.DAG;
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  StringRef FuncName("<unknown>");
+
+  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
+    FuncName = G->getSymbol();
+  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    FuncName = G->getGlobal()->getName();
+
+  DiagnosticInfoUnsupported NoCalls(
+      Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
+  DAG.getContext()->diagnose(NoCalls);
+
+  if (!CLI.IsTailCall) {
+    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+  }
+
+  return DAG.getEntryNode();
+}
+
+SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
+                                            SDLoc(Op).getDebugLoc());
+  DAG.getContext()->diagnose(NoDynamicAlloca);
+  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
+  return DAG.getMergeValues(Ops, SDLoc());
+}
+
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    Op->dump(&DAG);
+    llvm_unreachable("Custom lowering code for this"
+                     "instruction is not implemented yet!");
+    break;
+  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+  case ISD::FREM: return LowerFREM(Op, DAG);
+  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
+  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
+  case ISD::FRINT: return LowerFRINT(Op, DAG);
+  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
+  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
+  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+    return LowerCTLZ(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  }
+  return Op;
+}
+
+void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  case ISD::SIGN_EXTEND_INREG:
+    // Different parts of legalization seem to interpret which type of
+    // sign_extend_inreg is the one to check for custom lowering. The extended
+    // from type is what really matters, but some places check for custom
+    // lowering of the result type. This results in trying to use
+    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
+    // nothing here and let the illegal result integer be handled normally.
+    return;
+  default:
+    return;
+  }
+}
+
+static bool hasDefinedInitializer(const GlobalValue *GV) {
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar || !GVar->hasInitializer())
+    return false;
+
+  return !isa<UndefValue>(GVar->getInitializer());
+}
+
+SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
+                                                 SDValue Op,
+                                                 SelectionDAG &DAG) const {
+
+  const DataLayout &DL = DAG.getDataLayout();
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = G->getGlobal();
+
+  switch (G->getAddressSpace()) {
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    // XXX: What does the value of G->getOffset() mean?
+    assert(G->getOffset() == 0 &&
+         "Do not know what to do with an non-zero offset");
+
+    // TODO: We could emit code to handle the initialization somewhere.
+    if (hasDefinedInitializer(GV))
+      break;
+
+    unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+  }
+  }
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported BadInit(
+      Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
+  DAG.getContext()->diagnose(BadInit);
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+
+  for (const SDUse &U : Op->ops())
+    DAG.ExtractVectorElements(U.get(), Args);
+
+  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  EVT VT = Op.getValueType();
+  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
+                            VT.getVectorNumElements());
+
+  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+}
+
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+    SelectionDAG &DAG) const {
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  switch (IntrinsicID) {
+    default: return Op;
+    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
+      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                         Op.getOperand(1),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+  }
+}
+
+/// \brief Generate Min/Max node
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+                                                   SDValue LHS, SDValue RHS,
+                                                   SDValue True, SDValue False,
+                                                   SDValue CC,
+                                                   DAGCombinerInfo &DCI) const {
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return SDValue();
+
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  switch (CCOpcode) {
+  case ISD::SETOEQ:
+  case ISD::SETONE:
+  case ISD::SETUNE:
+  case ISD::SETNE:
+  case ISD::SETUEQ:
+  case ISD::SETEQ:
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2:
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:
+  case ISD::SETUO:
+  case ISD::SETO:
+    break;
+  case ISD::SETULE:
+  case ISD::SETULT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+  }
+  case ISD::SETOLE:
+  case ISD::SETOLT:
+  case ISD::SETLE:
+  case ISD::SETLT: {
+    // Ordered. Assume ordered for undefined.
+
+    // Only do this after legalization to avoid interfering with other combines
+    // which might occur.
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
+
+    // We need to permute the operands to get the correct NaN behavior. The
+    // selected operand is the second one based on the failing compare with NaN,
+    // so permute it based on the compare type the hardware uses.
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+  }
+  case ISD::SETUGE:
+  case ISD::SETUGT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+  }
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETOGE:
+  case ISD::SETOGT: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
+
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+  }
+  case ISD::SETCC_INVALID:
+    llvm_unreachable("Invalid setcc condcode!");
+  }
+  return SDValue();
+}
+
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+
+  return std::make_pair(Lo, Hi);
+}
+
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+}
+
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
+                                              SelectionDAG &DAG) const {
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  EVT VT = Op.getValueType();
+
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return scalarizeVectorLoad(Load, DAG);
+
+  SDValue BasePtr = Load->getBasePtr();
+  EVT PtrVT = BasePtr.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  SDLoc SL(Op);
+
+  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+  SDValue Lo, Hi;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+
+  unsigned Size = LoMemVT.getStoreSize();
+  unsigned BaseAlign = Load->getAlignment();
+  unsigned HiAlign = MinAlign(BaseAlign, Size);
+
+  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
+                                  BaseAlign, Load->getMemOperand()->getFlags());
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(Size, SL, PtrVT));
+  SDValue HiLoad =
+      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
+                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
+
+  SDValue Ops[] = {
+    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
+    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                LoLoad.getValue(1), HiLoad.getValue(1))
+  };
+
+  return DAG.getMergeValues(Ops, SL);
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Val = Store->getValue();
+  EVT VT = Val.getValueType();
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return scalarizeVectorStore(Store, DAG);
+
+  EVT MemVT = Store->getMemoryVT();
+  SDValue Chain = Store->getChain();
+  SDValue BasePtr = Store->getBasePtr();
+  SDLoc SL(Op);
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+  SDValue Lo, Hi;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+
+  EVT PtrVT = BasePtr.getValueType();
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
+                                              PtrVT));
+
+  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
+  unsigned BaseAlign = Store->getAlignment();
+  unsigned Size = LoMemVT.getStoreSize();
+  unsigned HiAlign = MinAlign(BaseAlign, Size);
+
+  SDValue LoStore =
+      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
+                        Store->getMemOperand()->getFlags());
+  SDValue HiStore =
+      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
+                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
+
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
+}
+
+// This is a shortcut for integer division because we have fast i32<->f32
+// conversions, and fast f32 reciprocal instructions. The fractional part of a
+// float is enough to accurately represent up to a 24-bit signed integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+                                            bool Sign) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT IntVT = MVT::i32;
+  MVT FltVT = MVT::f32;
+
+  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+  if (LHSSignBits < 9)
+    return SDValue();
+
+  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+  if (RHSSignBits < 9)
+    return SDValue();
+
+  unsigned BitSize = VT.getSizeInBits();
+  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+  unsigned DivBits = BitSize - SignBits;
+  if (Sign)
+    ++DivBits;
+
+  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
+  SDValue jq = DAG.getConstant(1, DL, IntVT);
+
+  if (Sign) {
+    // char|short jq = ia ^ ib;
+    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+
+    // jq = jq >> (bitsize - 2)
+    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
+                     DAG.getConstant(BitSize - 2, DL, VT));
+
+    // jq = jq | 0x1
+    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
+  }
+
+  // int ia = (int)LHS;
+  SDValue ia = LHS;
+
+  // int ib, (int)RHS;
+  SDValue ib = RHS;
+
+  // float fa = (float)ia;
+  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
+
+  // float fb = (float)ib;
+  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
+
+  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
+                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
+
+  // fq = trunc(fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
+
+  // float fqneg = -fq;
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
+
+  // float fr = mad(fqneg, fb, fa);
+  SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
+
+  // int iq = (int)fq;
+  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
+
+  // fr = fabs(fr);
+  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
+
+  // fb = fabs(fb);
+  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
+
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+  // int cv = fr >= fb;
+  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
+
+  // jq = (cv ? jq : 0);
+  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
+
+  // dst = iq + jq;
+  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
+
+  // Rem needs compensation, it's easier to recompute it
+  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
+  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
+
+  // Truncate to number of bits this divide really is.
+  if (Sign) {
+    SDValue InRegSize
+      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+  } else {
+    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+  }
+
+  return DAG.getMergeValues({ Div, Rem }, DL);
+}
+
+void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
+                                      SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &Results) const {
+  assert(Op.getValueType() == MVT::i64);
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+  SDValue one = DAG.getConstant(1, DL, HalfVT);
+  SDValue zero = DAG.getConstant(0, DL, HalfVT);
+
+  //HiLo split
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+  if (VT == MVT::i64 &&
+    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                              LHS_Lo, RHS_Lo);
+
+    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
+    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+
+    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
+    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
+    return;
+  }
+
+  // Get Speculative values
+  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
+
+  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+  SDValue DIV_Lo = zero;
+
+  const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+  for (unsigned i = 0; i < halfBitWidth; ++i) {
+    const unsigned bitPos = halfBitWidth - i - 1;
+    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
+    // Get value of high bit
+    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
+
+    // Shift
+    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
+    // Add LHS high bit
+    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
+
+    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
+    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
+
+    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+    // Update REM
+    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
+  }
+
+  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
+  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
+  Results.push_back(DIV);
+  Results.push_back(REM);
+}
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::i64) {
+    SmallVector<SDValue, 2> Results;
+    LowerUDIVREM64(Op, DAG, Results);
+    return DAG.getMergeValues(Results, DL);
+  }
+
+  if (VT == MVT::i32) {
+    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+      return Res;
+  }
+
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  // RCP =  URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+  // RCP_LO = mul(RCP, Den) */
+  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
+
+  // RCP_HI = mulhu (RCP, Den) */
+  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+                                                     RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+                                           NEG_RCP_LO, RCP_LO,
+                                           ISD::SETEQ);
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+  // RCP_S_E = RCP - E
+  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
+  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
+                                     RCP_A_E, RCP_S_E,
+                                     ISD::SETEQ);
+  // Quotient = mulhu(Tmp0, Num)
+  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+                                                 DAG.getConstant(-1, DL, VT),
+                                                 DAG.getConstant(0, DL, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+                                                  Num_S_Remainder,
+                                                  DAG.getConstant(-1, DL, VT),
+                                                  DAG.getConstant(0, DL, VT),
+                                                  ISD::SETUGE);
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+                                               Remainder_GE_Zero);
+
+  // Calculate Division result:
+
+  // Quotient_A_One = Quotient + 1
+  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+                                       DAG.getConstant(1, DL, VT));
+
+  // Quotient_S_One = Quotient - 1
+  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+                                       DAG.getConstant(1, DL, VT));
+
+  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
+                                     Quotient, Quotient_A_One, ISD::SETEQ);
+
+  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
+                            Quotient_S_One, Div, ISD::SETEQ);
+
+  // Calculate Rem result:
+
+  // Remainder_S_Den = Remainder - Den
+  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+  // Remainder_A_Den = Remainder + Den
+  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
+                                    Remainder, Remainder_S_Den, ISD::SETEQ);
+
+  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
+                            Remainder_A_Den, Rem, ISD::SETEQ);
+  SDValue Ops[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue NegOne = DAG.getConstant(-1, DL, VT);
+
+  if (VT == MVT::i32) {
+    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+      return Res;
+  }
+
+  if (VT == MVT::i64 &&
+      DAG.ComputeNumSignBits(LHS) > 32 &&
+      DAG.ComputeNumSignBits(RHS) > 32) {
+    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+    //HiLo split
+    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                                 LHS_Lo, RHS_Lo);
+    SDValue Res[2] = {
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+    };
+    return DAG.getMergeValues(Res, DL);
+  }
+
+  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
+  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
+  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
+  SDValue RSign = LHSign; // Remainder sign is the same as LHS
+
+  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
+  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
+
+  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
+  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
+
+  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
+  SDValue Rem = Div.getValue(1);
+
+  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
+  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
+
+  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
+  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
+
+  SDValue Res[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Res, DL);
+}
+
+// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  EVT VT = Op.getValueType();
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  // TODO: Should this propagate fast-math-flags?
+
+  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
+  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
+
+  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+}
+
+SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src)
+  // if (src > 0.0 && src != result)
+  //   result += 1.0
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
+
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+
+  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+  // TODO: Should this propagate fast-math-flags?
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
+                                  SelectionDAG &DAG) {
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
+                                DAG.getConstant(ExpBits, SL, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, SL, MVT::i32));
+
+  return Exp;
+}
+
+SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  // Extract the upper half, since this is where we will find the sign and
+  // exponent.
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
+
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+  const unsigned FractBits = 52;
+
+  // Extract the sign bit.
+  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
+  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
+
+  // Extend back to to 64-bits.
+  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
+  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
+
+  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
+  const SDValue FractMask
+    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
+
+  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
+  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
+
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+
+  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+
+  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  assert(Op.getValueType() == MVT::f64);
+
+  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
+  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
+  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+
+  // TODO: Should this propagate fast-math-flags?
+
+  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
+  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
+
+  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
+
+  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
+  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
+
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
+
+  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
+}
+
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+  // FNEARBYINT and FRINT are the same, except in their handling of FP
+  // exceptions. Those aren't really meaningful for us, and OpenCL only has
+  // rint, so just treat them as equivalent.
+  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+}
+
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+  // TODO: Should this propagate fast-math-flags?
+
+  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+  const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
+
+  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
+  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
+                                       MVT::i64);
+
+  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
+                                          MVT::i64),
+                          Exp);
+
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
+                              ISD::SETNE);
+
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+                             D, DAG.getConstant(0, SL, MVT::i64));
+  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+                            ExpEqNegOne,
+                            DAG.getConstantFP(1.0, SL, MVT::f64),
+                            DAG.getConstantFP(0.0, SL, MVT::f64));
+
+  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+  return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFROUND32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFROUND64(Op, DAG);
+
+  llvm_unreachable("unhandled type");
+}
+
+SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  // result = trunc(src);
+  // if (src < 0.0 && src != result)
+  //   result += -1.0.
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
+  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
+
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
+
+  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
+  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
+  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
+
+  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+  // TODO: Should this propagate fast-math-flags?
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
+}
+
+SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+  if (ZeroUndef && Src.getValueType() == MVT::i32)
+    return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+                                   *DAG.getContext(), MVT::i32);
+
+  SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+
+  SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
+  SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+
+  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
+  SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
+
+  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
+  SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+
+  if (!ZeroUndef) {
+    // Test if the full 64-bit input is zero.
+
+    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
+    // which we probably don't want.
+    SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
+    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+
+    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
+    // with the same cycles, otherwise it is slower.
+    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
+    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
+
+    const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
+
+    // The instruction returns -1 for 0 input, but the defined intrinsic
+    // behavior is to return the number of bits.
+    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                          SrcIsZero, Bits32, NewCtlz);
+  }
+
+  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+}
+
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
+                                               bool Signed) const {
+  // Unsigned
+  // cul2f(ulong u)
+  //{
+  //  uint lz = clz(u);
+  //  uint e = (u != 0) ? 127U + 63U - lz : 0;
+  //  u = (u << lz) & 0x7fffffffffffffffUL;
+  //  ulong t = u & 0xffffffffffUL;
+  //  uint v = (e << 23) | (uint)(u >> 40);
+  //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
+  //  return as_float(v + r);
+  //}
+  // Signed
+  // cl2f(long l)
+  //{
+  //  long s = l >> 63;
+  //  float r = cul2f((l + s) ^ s);
+  //  return s ? -r : r;
+  //}
+
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue L = Src;
+
+  SDValue S;
+  if (Signed) {
+    const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
+    S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
+
+    SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
+    L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
+  }
+
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+                                   *DAG.getContext(), MVT::f32);
+
+
+  SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
+  SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
+  SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
+  LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
+
+  SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
+  SDValue E = DAG.getSelect(SL, MVT::i32,
+    DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
+    DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
+    ZeroI32);
+
+  SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
+    DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
+    DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
+
+  SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
+                          DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
+
+  SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
+                             U, DAG.getConstant(40, SL, MVT::i64));
+
+  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
+    DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
+    DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
+
+  SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
+  SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
+  SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
+
+  SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+  SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
+
+  SDValue R = DAG.getSelect(SL, MVT::i32,
+    RCmp,
+    One,
+    DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
+  R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
+  R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
+
+  if (!Signed)
+    return R;
+
+  SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
+  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
+}
+
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
+                                               bool Signed) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+                           DAG.getConstant(0, SL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+                           DAG.getConstant(1, SL, MVT::i32));
+
+  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
+                              SL, MVT::f64, Hi);
+
+  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
+
+  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
+                              DAG.getConstant(32, SL, MVT::i32));
+  // TODO: Should this propagate fast-math-flags?
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
+}
+
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+         "operation should be legal");
+
+  // TODO: Factor out code common with LowerSINT_TO_FP.
+
+  EVT DestVT = Op.getValueType();
+  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+    SDLoc DL(Op);
+    SDValue Src = Op.getOperand(0);
+
+    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+    SDValue FPRound =
+        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+    return FPRound;
+  }
+
+  if (DestVT == MVT::f32)
+    return LowerINT_TO_FP32(Op, DAG, false);
+
+  assert(DestVT == MVT::f64);
+  return LowerINT_TO_FP64(Op, DAG, false);
+}
+
+SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+         "operation should be legal");
+
+  // TODO: Factor out code common with LowerUINT_TO_FP.
+
+  EVT DestVT = Op.getValueType();
+  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
+    SDLoc DL(Op);
+    SDValue Src = Op.getOperand(0);
+
+    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
+    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
+    SDValue FPRound =
+        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
+
+    return FPRound;
+  }
+
+  if (DestVT == MVT::f32)
+    return LowerINT_TO_FP32(Op, DAG, true);
+
+  assert(DestVT == MVT::f64);
+  return LowerINT_TO_FP64(Op, DAG, true);
+}
+
+SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                               bool Signed) const {
+  SDLoc SL(Op);
+
+  SDValue Src = Op.getOperand(0);
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
+                                 MVT::f64);
+  SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
+                                 MVT::f64);
+  // TODO: Should this propagate fast-math-flags?
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+
+  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+
+
+  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
+
+  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
+                           MVT::i32, FloorMul);
+  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
+
+  SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+
+  if (getTargetMachine().Options.UnsafeFPMath) {
+    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  SDValue N0 = Op.getOperand(0);
+  assert (N0.getSimpleValueType() == MVT::f64);
+
+  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
+  const unsigned ExpMask = 0x7ff;
+  const unsigned ExpBiasf64 = 1023;
+  const unsigned ExpBiasf16 = 15;
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue One = DAG.getConstant(1, DL, MVT::i32);
+  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
+  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
+                           DAG.getConstant(32, DL, MVT::i64));
+  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
+  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
+  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                          DAG.getConstant(20, DL, MVT::i64));
+  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
+                  DAG.getConstant(ExpMask, DL, MVT::i32));
+  // Subtract the fp64 exponent bias (1023) to get the real exponent and
+  // add the f16 bias (15) to get the biased exponent for the f16 format.
+  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
+                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
+
+  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                          DAG.getConstant(8, DL, MVT::i32));
+  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
+                  DAG.getConstant(0xffe, DL, MVT::i32));
+
+  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
+                                  DAG.getConstant(0x1ff, DL, MVT::i32));
+  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
+
+  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
+  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
+
+  // (M != 0 ? 0x0200 : 0) | 0x7c00;
+  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
+      DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
+                      Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
+
+  // N = M | (E << 12);
+  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+      DAG.getNode(ISD::SHL, DL, MVT::i32, E,
+                  DAG.getConstant(12, DL, MVT::i32)));
+
+  // B = clamp(1-E, 0, 13);
+  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
+                                  One, E);
+  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
+  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
+                  DAG.getConstant(13, DL, MVT::i32));
+
+  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
+                                   DAG.getConstant(0x1000, DL, MVT::i32));
+
+  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
+  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
+  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
+  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
+
+  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
+  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
+                              DAG.getConstant(0x7, DL, MVT::i32));
+  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
+                  DAG.getConstant(2, DL, MVT::i32));
+  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
+                               One, Zero, ISD::SETEQ);
+  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
+                               One, Zero, ISD::SETGT);
+  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
+  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
+
+  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
+                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
+  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
+                      I, V, ISD::SETEQ);
+
+  // Extract the sign bit.
+  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
+                            DAG.getConstant(16, DL, MVT::i32));
+  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
+                     DAG.getConstant(0x8000, DL, MVT::i32));
+
+  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
+  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+
+  // TODO: Factor out code common with LowerFP_TO_UINT.
+
+  EVT SrcVT = Src.getValueType();
+  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+    SDLoc DL(Op);
+
+    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+    SDValue FpToInt32 =
+        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+    return FpToInt32;
+  }
+
+  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+    return LowerFP64_TO_INT(Op, DAG, true);
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+
+  // TODO: Factor out code common with LowerFP_TO_SINT.
+
+  EVT SrcVT = Src.getValueType();
+  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+    SDLoc DL(Op);
+
+    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
+    SDValue FpToInt32 =
+        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
+
+    return FpToInt32;
+  }
+
+  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+    return LowerFP64_TO_INT(Op, DAG, false);
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  assert(VT.isVector());
+
+  SDValue Src = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // TODO: Don't scalarize on Evergreen?
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> Args;
+  DAG.ExtractVectorElements(Src, Args, 0, NElts);
+
+  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+  for (unsigned I = 0; I < NElts; ++I)
+    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+  return DAG.getBuildVector(VT, DL, Args);
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+  APInt KnownZero, KnownOne;
+  EVT VT = Op.getValueType();
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
+
+  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+}
+
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // In order for this to be a signed 24-bit value, bit 23, must
+  // be a sign bit.
+  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
+                                     // as unsigned 24-bit values.
+         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+}
+
+static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
+                        TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = Node24->getOperand(OpIdx);
+  EVT VT = Op.getValueType();
+
+  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+  if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+    return true;
+
+  return false;
+}
+
+template <typename IntTy>
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
+                               uint32_t Width, const SDLoc &DL) {
+  if (Width + Offset < 32) {
+    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
+    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
+    return DAG.getConstant(Result, DL, MVT::i32);
+  }
+
+  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
+}
+
+static bool hasVolatileUser(SDNode *Val) {
+  for (SDNode *U : Val->uses()) {
+    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+      if (M->isVolatile())
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
+  // i32 vectors are the canonical memory type.
+  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+    return false;
+
+  if (!VT.isByteSized())
+    return false;
+
+  unsigned Size = VT.getStoreSize();
+
+  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+    return false;
+
+  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+    return false;
+
+  return true;
+}
+
+// Replace load of an illegal type with a store of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  LoadSDNode *LN = cast<LoadSDNode>(N);
+  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+    return SDValue();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = LN->getMemoryVT();
+
+  unsigned Size = VT.getStoreSize();
+  unsigned Align = LN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = LN->getAddressSpace();
+
+    // Expand unaligned loads earlier than legalization. Due to visitation order
+    // problems during legalization, the emitted instructions to pack and unpack
+    // the bytes again are not eliminated in the case of an unaligned copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+      if (VT.isVector())
+        return scalarizeVectorLoad(LN, DAG);
+
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+      return DAG.getMergeValues(Ops, SDLoc(N));
+    }
+
+    if (!IsFast)
+      return SDValue();
+  }
+
+  if (!shouldCombineMemoryType(VT))
+    return SDValue();
+
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+  SDValue NewLoad
+    = DAG.getLoad(NewVT, SL, LN->getChain(),
+                  LN->getBasePtr(), LN->getMemOperand());
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+  DCI.CombineTo(N, BC, NewLoad.getValue(1));
+  return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  if (SN->isVolatile() || !ISD::isNormalStore(SN))
+    return SDValue();
+
+  EVT VT = SN->getMemoryVT();
+  unsigned Size = VT.getStoreSize();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Align = SN->getAlignment();
+  if (Align < Size && isTypeLegal(VT)) {
+    bool IsFast;
+    unsigned AS = SN->getAddressSpace();
+
+    // Expand unaligned stores earlier than legalization. Due to visitation
+    // order problems during legalization, the emitted instructions to pack and
+    // unpack the bytes again are not eliminated in the case of an unaligned
+    // copy.
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+      if (VT.isVector())
+        return scalarizeVectorStore(SN, DAG);
+
+      return expandUnalignedStore(SN, DAG);
+    }
+
+    if (!IsFast)
+      return SDValue();
+  }
+
+  if (!shouldCombineMemoryType(VT))
+    return SDValue();
+
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+  SDValue Val = SN->getValue();
+
+  //DCI.AddToWorklist(Val.getNode());
+
+  bool OtherUses = !Val.hasOneUse();
+  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+  if (OtherUses) {
+    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+  }
+
+  return DAG.getStore(SN->getChain(), SL, CastVal,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
+/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
+/// binary operation \p Opc to it with the corresponding constant operands.
+SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
+  DAGCombinerInfo &DCI, const SDLoc &SL,
+  unsigned Opc, SDValue LHS,
+  uint32_t ValLo, uint32_t ValHi) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Lo, Hi;
+  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
+
+  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
+  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
+
+  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
+  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
+
+  // Re-visit the ands. It's possible we eliminated one of them and it could
+  // simplify the vector.
+  DCI.AddToWorklist(Lo.getNode());
+  DCI.AddToWorklist(Hi.getNode());
+
+  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
+
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  unsigned RHSVal = RHS->getZExtValue();
+  if (RHSVal < 32)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
+
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  unsigned RHSVal = RHS->getZExtValue();
+
+  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
+  if (RHSVal == 32) {
+    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                   DAG.getConstant(31, SL, MVT::i32));
+
+    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
+    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+  }
+
+  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+  if (RHSVal == 63) {
+    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                   DAG.getConstant(31, SL, MVT::i32));
+    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+  }
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  unsigned ShiftAmt = RHS->getZExtValue();
+  if (ShiftAmt < 32)
+    return SDValue();
+
+  // srl i64:x, C for C >= 32
+  // =>
+  //   build_pair (srl hi_32(x), C - 32), 0
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                           VecOp, One);
+
+  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
+  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+
+  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
+}
+
+// We need to specifically handle i64 mul here to avoid unnecessary conversion
+// instructions. If we only match on the legalized i64 mul expansion,
+// SimplifyDemandedBits will be unable to remove them because there will be
+// multiple uses due to the separate mul + mulh[su].
+static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
+                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
+  if (Size <= 32) {
+    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
+  }
+
+  // Because we want to eliminate extension instructions before the
+  // operation, we need to create a single user here (i.e. not the separate
+  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
+
+  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
+
+  SDValue Mul = DAG.getNode(MulOpc, SL,
+                            DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
+                     Mul.getValue(0), Mul.getValue(1));
+}
+
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  unsigned Size = VT.getSizeInBits();
+  if (VT.isVector() || Size > 64)
+    return SDValue();
+
+  // There are i16 integer mul/mad.
+  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Mul;
+
+  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+    Mul = getMul24(DAG, DL, N0, N1, Size, false);
+  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+    Mul = getMul24(DAG, DL, N0, N1, Size, true);
+  } else {
+    return SDValue();
+  }
+
+  // We need to use sext even for MUL_U24, because MUL_U24 is used
+  // for signed multiply of 8 and 16-bit types.
+  return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (!Subtarget->hasMulI24() || VT.isVector())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (!isI24(N0, DAG) || !isI24(N1, DAG))
+    return SDValue();
+
+  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+
+  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
+  DCI.AddToWorklist(Mulhi.getNode());
+  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (!isU24(N0, DAG) || !isU24(N1, DAG))
+    return SDValue();
+
+  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+
+  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
+  DCI.AddToWorklist(Mulhi.getNode());
+  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
+}
+
+SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Simplify demanded bits before splitting into multiple users.
+  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
+
+  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
+
+  SDLoc SL(N);
+
+  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
+  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
+  return DAG.getMergeValues({ MulLo, MulHi }, SL);
+}
+
+static bool isNegativeOne(SDValue Val) {
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
+    return C->isAllOnesValue();
+  return false;
+}
+
+static bool isCtlzOpc(unsigned Opc) {
+  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
+}
+
+SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
+                                          SDValue Op,
+                                          const SDLoc &DL) const {
+  EVT VT = Op.getValueType();
+  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
+  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
+                              LegalVT != MVT::i16))
+    return SDValue();
+
+  if (VT != MVT::i32)
+    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
+
+  SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
+  if (VT != MVT::i32)
+    FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
+
+  return FFBH;
+}
+
+// The native instructions return -1 on 0 input. Optimize out a select that
+// produces -1 on 0.
+//
+// TODO: If zero is not undef, we could also do this if the output is compared
+// against the bitwidth.
+//
+// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
+SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+                                                 SDValue LHS, SDValue RHS,
+                                                 DAGCombinerInfo &DCI) const {
+  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+  if (!CmpRhs || !CmpRhs->isNullValue())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  SDValue CmpLHS = Cond.getOperand(0);
+
+  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+  if (CCOpcode == ISD::SETEQ &&
+      isCtlzOpc(RHS.getOpcode()) &&
+      RHS.getOperand(0) == CmpLHS &&
+      isNegativeOne(LHS)) {
+    return getFFBH_U32(DAG, CmpLHS, SL);
+  }
+
+  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+  if (CCOpcode == ISD::SETNE &&
+      isCtlzOpc(LHS.getOpcode()) &&
+      LHS.getOperand(0) == CmpLHS &&
+      isNegativeOne(RHS)) {
+    return getFFBH_U32(DAG, CmpLHS, SL);
+  }
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  SDValue Cond = N->getOperand(0);
+  if (Cond.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue LHS = Cond.getOperand(0);
+  SDValue RHS = Cond.getOperand(1);
+  SDValue CC = Cond.getOperand(2);
+
+  SDValue True = N->getOperand(1);
+  SDValue False = N->getOperand(2);
+
+  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
+    SelectionDAG &DAG = DCI.DAG;
+    if ((DAG.isConstantValueOfAnyType(True) ||
+         DAG.isConstantValueOfAnyType(True)) &&
+        (!DAG.isConstantValueOfAnyType(False) &&
+         !DAG.isConstantValueOfAnyType(False))) {
+      // Swap cmp + select pair to move constant to false input.
+      // This will allow using VOPC cndmasks more often.
+      // select (setcc x, y), k, x -> select (setcc y, x) x, x
+
+      SDLoc SL(N);
+      ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                            LHS.getValueType().isInteger());
+
+      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
+      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
+    }
+  }
+
+  if (VT == MVT::f32 && Cond.hasOneUse()) {
+    SDValue MinMax
+      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+    // Revisit this node so we can catch min3/max3/med3 patterns.
+    //DCI.AddToWorklist(MinMax.getNode());
+    return MinMax;
+  }
+
+  // There's no reason to not do this if the condition has other uses.
+  return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+}
+
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch(N->getOpcode()) {
+  default:
+    break;
+  case ISD::BITCAST: {
+    EVT DestVT = N->getValueType(0);
+
+    // Push casts through vector builds. This helps avoid emitting a large
+    // number of copies when materializing floating point vector constants.
+    //
+    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+    if (DestVT.isVector()) {
+      SDValue Src = N->getOperand(0);
+      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcVT = Src.getValueType();
+        unsigned NElts = DestVT.getVectorNumElements();
+
+        if (SrcVT.getVectorNumElements() == NElts) {
+          EVT DestEltVT = DestVT.getVectorElementType();
+
+          SmallVector<SDValue, 8> CastedElts;
+          SDLoc SL(N);
+          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+            SDValue Elt = Src.getOperand(I);
+            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
+          }
+
+          return DAG.getBuildVector(DestVT, SL, CastedElts);
+        }
+      }
+    }
+
+    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+      break;
+
+    // Fold bitcasts of constants.
+    //
+    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+    // TODO: Generalize and move to DAGCombiner
+    SDValue Src = N->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+      assert(Src.getValueType() == MVT::i64);
+      SDLoc SL(N);
+      uint64_t CVal = C->getZExtValue();
+      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+    }
+
+    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+      const APInt &Val = C->getValueAPF().bitcastToAPInt();
+      SDLoc SL(N);
+      uint64_t CVal = Val.getZExtValue();
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+
+      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+    }
+
+    break;
+  }
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
+  case ISD::SRL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performSrlCombine(N, DCI);
+  }
+  case ISD::SRA: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performSraCombine(N, DCI);
+  }
+  case ISD::MUL:
+    return performMulCombine(N, DCI);
+  case ISD::MULHS:
+    return performMulhsCombine(N, DCI);
+  case ISD::MULHU:
+    return performMulhuCombine(N, DCI);
+  case AMDGPUISD::MUL_I24:
+  case AMDGPUISD::MUL_U24:
+  case AMDGPUISD::MULHI_I24:
+  case AMDGPUISD::MULHI_U24: {
+    // If the first call to simplify is successfull, then N may end up being
+    // deleted, so we shouldn't call simplifyI24 again.
+    simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
+    return SDValue();
+  }
+  case AMDGPUISD::MUL_LOHI_I24:
+  case AMDGPUISD::MUL_LOHI_U24:
+    return performMulLoHi24Combine(N, DCI);
+  case ISD::SELECT:
+    return performSelectCombine(N, DCI);
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    assert(!N->getValueType(0).isVector() &&
+           "Vector handling of BFE not implemented");
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
+    if (WidthVal == 0)
+      return DAG.getConstant(0, DL, MVT::i32);
+
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    SDValue BitsFrom = N->getOperand(0);
+    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
+
+    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
+
+    if (OffsetVal == 0) {
+      // This is already sign / zero extended, so try to fold away extra BFEs.
+      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
+
+      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
+      if (OpSignBits >= SignBits)
+        return BitsFrom;
+
+      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+      if (Signed) {
+        // This is a sign_extend_inreg. Replace it to take advantage of existing
+        // DAG Combines. If not eliminated, we will match back to BFE during
+        // selection.
+
+        // TODO: The sext_inreg of extended types ends, although we can could
+        // handle them in a single BFE.
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+                           DAG.getValueType(SmallVT));
+      }
+
+      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
+    }
+
+    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
+      if (Signed) {
+        return constantFoldBFE<int32_t>(DAG,
+                                        CVal->getSExtValue(),
+                                        OffsetVal,
+                                        WidthVal,
+                                        DL);
+      }
+
+      return constantFoldBFE<uint32_t>(DAG,
+                                       CVal->getZExtValue(),
+                                       OffsetVal,
+                                       WidthVal,
+                                       DL);
+    }
+
+    if ((OffsetVal + WidthVal) >= 32) {
+      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
+      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                         BitsFrom, ShiftVal);
+    }
+
+    if (BitsFrom.hasOneUse()) {
+      APInt Demanded = APInt::getBitsSet(32,
+                                         OffsetVal,
+                                         OffsetVal + WidthVal);
+
+      APInt KnownZero, KnownOne;
+      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                            !DCI.isBeforeLegalizeOps());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
+                                   KnownZero, KnownOne, TLO)) {
+        DCI.CommitTargetLoweringOpt(TLO);
+      }
+    }
+
+    break;
+  }
+  case ISD::LOAD:
+    return performLoadCombine(N, DCI);
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                                  const TargetRegisterClass *RC,
+                                                   unsigned Reg, EVT VT) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned VirtualRegister;
+  if (!MRI.isLiveIn(Reg)) {
+    VirtualRegister = MRI.createVirtualRegister(RC);
+    MRI.addLiveIn(Reg, VirtualRegister);
+  } else {
+    VirtualRegister = MRI.getLiveInVirtReg(Reg);
+  }
+  return DAG.getRegister(VirtualRegister, VT);
+}
+
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+  unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
+  uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
+  switch (Param) {
+  case GRID_DIM:
+    return ArgOffset;
+  case GRID_OFFSET:
+    return ArgOffset + 4;
+  }
+  llvm_unreachable("unexpected implicit parameter type");
+}
+
+#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
+
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((AMDGPUISD::NodeType)Opcode) {
+  case AMDGPUISD::FIRST_NUMBER: break;
+  // AMDIL DAG nodes
+  NODE_NAME_CASE(CALL);
+  NODE_NAME_CASE(UMUL);
+  NODE_NAME_CASE(BRANCH_COND);
+
+  // AMDGPU DAG nodes
+  NODE_NAME_CASE(ENDPGM)
+  NODE_NAME_CASE(RETURN)
+  NODE_NAME_CASE(DWORDADDR)
+  NODE_NAME_CASE(FRACT)
+  NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(SETREG)
+  NODE_NAME_CASE(FMA_W_CHAIN)
+  NODE_NAME_CASE(FMUL_W_CHAIN)
+  NODE_NAME_CASE(CLAMP)
+  NODE_NAME_CASE(COS_HW)
+  NODE_NAME_CASE(SIN_HW)
+  NODE_NAME_CASE(FMAX_LEGACY)
+  NODE_NAME_CASE(FMIN_LEGACY)
+  NODE_NAME_CASE(FMAX3)
+  NODE_NAME_CASE(SMAX3)
+  NODE_NAME_CASE(UMAX3)
+  NODE_NAME_CASE(FMIN3)
+  NODE_NAME_CASE(SMIN3)
+  NODE_NAME_CASE(UMIN3)
+  NODE_NAME_CASE(FMED3)
+  NODE_NAME_CASE(SMED3)
+  NODE_NAME_CASE(UMED3)
+  NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(DIV_SCALE)
+  NODE_NAME_CASE(DIV_FMAS)
+  NODE_NAME_CASE(DIV_FIXUP)
+  NODE_NAME_CASE(TRIG_PREOP)
+  NODE_NAME_CASE(RCP)
+  NODE_NAME_CASE(RSQ)
+  NODE_NAME_CASE(RCP_LEGACY)
+  NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(FMUL_LEGACY)
+  NODE_NAME_CASE(RSQ_CLAMP)
+  NODE_NAME_CASE(LDEXP)
+  NODE_NAME_CASE(FP_CLASS)
+  NODE_NAME_CASE(DOT4)
+  NODE_NAME_CASE(CARRY)
+  NODE_NAME_CASE(BORROW)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
+  NODE_NAME_CASE(BFI)
+  NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(FFBH_U32)
+  NODE_NAME_CASE(FFBH_I32)
+  NODE_NAME_CASE(MUL_U24)
+  NODE_NAME_CASE(MUL_I24)
+  NODE_NAME_CASE(MULHI_U24)
+  NODE_NAME_CASE(MULHI_I24)
+  NODE_NAME_CASE(MUL_LOHI_U24)
+  NODE_NAME_CASE(MUL_LOHI_I24)
+  NODE_NAME_CASE(MAD_U24)
+  NODE_NAME_CASE(MAD_I24)
+  NODE_NAME_CASE(TEXTURE_FETCH)
+  NODE_NAME_CASE(EXPORT)
+  NODE_NAME_CASE(EXPORT_DONE)
+  NODE_NAME_CASE(R600_EXPORT)
+  NODE_NAME_CASE(CONST_ADDRESS)
+  NODE_NAME_CASE(REGISTER_LOAD)
+  NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(CVT_F32_UBYTE0)
+  NODE_NAME_CASE(CVT_F32_UBYTE1)
+  NODE_NAME_CASE(CVT_F32_UBYTE2)
+  NODE_NAME_CASE(CVT_F32_UBYTE3)
+  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+  NODE_NAME_CASE(CONST_DATA_PTR)
+  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+  NODE_NAME_CASE(KILL)
+  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
+  NODE_NAME_CASE(SENDMSG)
+  NODE_NAME_CASE(INTERP_MOV)
+  NODE_NAME_CASE(INTERP_P1)
+  NODE_NAME_CASE(INTERP_P2)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
+  NODE_NAME_CASE(ATOMIC_INC)
+  NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(BUFFER_LOAD)
+  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
+  }
+  return nullptr;
+}
+
+SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
+                                              SelectionDAG &DAG, int Enabled,
+                                              int &RefinementSteps,
+                                              bool &UseOneConstNR,
+                                              bool Reciprocal) const {
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also f64 rsq instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               SelectionDAG &DAG, int Enabled,
+                                               int &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    // Reciprocal, < 1 ulp error.
+    //
+    // This reciprocal approximation converges to < 0.5 ulp error with one
+    // newton rhapson performed with two fused multiple adds (FMAs).
+
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
+  const SDValue Op,
+  APInt &KnownZero,
+  APInt &KnownOne,
+  const SelectionDAG &DAG,
+  unsigned Depth) const {
+
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+  APInt KnownZero2;
+  APInt KnownOne2;
+  unsigned Opc = Op.getOpcode();
+
+  switch (Opc) {
+  default:
+    break;
+  case AMDGPUISD::CARRY:
+  case AMDGPUISD::BORROW: {
+    KnownZero = APInt::getHighBitsSet(32, 31);
+    break;
+  }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CWidth)
+      return;
+
+    unsigned BitWidth = 32;
+    uint32_t Width = CWidth->getZExtValue() & 0x1f;
+
+    if (Opc == AMDGPUISD::BFE_U32)
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+
+    break;
+  }
+  }
+}
+
+unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &DAG,
+  unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case AMDGPUISD::BFE_I32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!Width)
+      return 1;
+
+    unsigned SignBits = 32 - Width->getZExtValue() + 1;
+    if (!isNullConstant(Op.getOperand(1)))
+      return SignBits;
+
+    // TODO: Could probably figure something out with non-0 offsets.
+    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    return std::max(SignBits, Op0SignBits);
+  }
+
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
+  }
+
+  case AMDGPUISD::CARRY:
+  case AMDGPUISD::BORROW:
+    return 31;
+
+  default:
+    return 1;
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
new file mode 100644
index 000000000000..5cc5efb331e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -0,0 +1,338 @@
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition of the TargetLowering class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class AMDGPUMachineFunction;
+class AMDGPUSubtarget;
+class MachineRegisterInfo;
+
+class AMDGPUTargetLowering : public TargetLowering {
+private:
+  /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
+  /// legalized from a smaller type VT. Need to match pre-legalized type because
+  /// the generic legalization inserts the add/sub between the select and
+  /// compare.
+  SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+
+protected:
+  const AMDGPUSubtarget *Subtarget;
+
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Split a vector store into multiple scalar stores.
+  /// \returns The resulting chain.
+
+  SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+protected:
+  bool shouldCombineMemoryType(EVT VT) const;
+  SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
+                                       unsigned Opc, SDValue LHS,
+                                       uint32_t ValLo, uint32_t ValHi) const;
+  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
+                             SDValue RHS, DAGCombinerInfo &DCI) const;
+  SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
+
+  virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                                     SelectionDAG &DAG) const;
+
+  /// Return 64-bit value Op as two 32-bit integers.
+  std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
+                                              SelectionDAG &DAG) const;
+  SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Split a vector load into 2 loads of half the vector.
+  SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Split a vector store into 2 stores of half the vector.
+  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
+  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &Results) const;
+  void analyzeFormalArgumentsCompute(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+  void AnalyzeFormalArguments(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+  void AnalyzeReturn(CCState &State,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
+
+public:
+  AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
+
+  bool isFAbsFree(EVT VT) const override;
+  bool isFNegFree(EVT VT) const override;
+  bool isTruncateFree(EVT Src, EVT Dest) const override;
+  bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+  bool isZExtFree(Type *Src, Type *Dest) const override;
+  bool isZExtFree(EVT Src, EVT Dest) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+  MVT getVectorIdxTy(const DataLayout &) const override;
+  bool isSelectSupported(SelectSupportKind) const override;
+
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+  bool ShouldShrinkFPConstant(EVT VT) const override;
+  bool shouldReduceLoadWidth(SDNode *Load,
+                             ISD::LoadExtType ExtType,
+                             EVT ExtVT) const override;
+
+  bool isLoadBitCastBeneficial(EVT, EVT) const final;
+
+  bool storeOfVectorConstantIsCheap(EVT MemVT,
+                                    unsigned NumElem,
+                                    unsigned AS) const override;
+  bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
+  bool isCheapToSpeculateCttz() const override;
+  bool isCheapToSpeculateCtlz() const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                  SelectionDAG &DAG) const;
+
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+                               SDValue RHS, SDValue True, SDValue False,
+                               SDValue CC, DAGCombinerInfo &DCI) const;
+
+  const char* getTargetNodeName(unsigned Opcode) const override;
+
+  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+    return true;
+  }
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &RefinementSteps, bool &UseOneConstNR,
+                           bool Reciprocal) const override;
+  SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &RefinementSteps) const override;
+
+  virtual SDNode *PostISelFolding(MachineSDNode *N,
+                                  SelectionDAG &DAG) const = 0;
+
+  /// \brief Determine which of the bits specified in \p Mask are known to be
+  /// either zero or one and return them in the \p KnownZero and \p KnownOne
+  /// bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op,
+                                     APInt &KnownZero,
+                                     APInt &KnownOne,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+                                           unsigned Depth = 0) const override;
+
+  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+  /// MachineFunction.
+  ///
+  /// \returns a RegisterSDNode representing Reg.
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
+
+  enum ImplicitParameter {
+    FIRST_IMPLICIT,
+    GRID_DIM = FIRST_IMPLICIT,
+    GRID_OFFSET,
+  };
+
+  /// \brief Helper function that returns the byte offset of the given
+  /// type of implicit parameter.
+  uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+                                      const ImplicitParameter Param) const;
+};
+
+namespace AMDGPUISD {
+
+enum NodeType : unsigned {
+  // AMDIL ISD Opcodes
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  CALL,        // Function call based on a single integer
+  UMUL,        // 32bit unsigned multiplication
+  BRANCH_COND,
+  // End AMDIL ISD Opcodes
+  ENDPGM,
+  RETURN,
+  DWORDADDR,
+  FRACT,
+  CLAMP,
+  // This is SETCC with the full mask result which is used for a compare with a
+  // result bit per item in the wavefront.
+  SETCC,
+  SETREG,
+  // FP ops with input and output chain.
+  FMA_W_CHAIN,
+  FMUL_W_CHAIN,
+
+  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
+  // Denormals handled on some parts.
+  COS_HW,
+  SIN_HW,
+  FMAX_LEGACY,
+  FMIN_LEGACY,
+  FMAX3,
+  SMAX3,
+  UMAX3,
+  FMIN3,
+  SMIN3,
+  UMIN3,
+  FMED3,
+  SMED3,
+  UMED3,
+  URECIP,
+  DIV_SCALE,
+  DIV_FMAS,
+  DIV_FIXUP,
+  TRIG_PREOP, // 1 ULP max error for f64
+
+  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
+  //            For f64, max error 2^29 ULP, handles denormals.
+  RCP,
+  RSQ,
+  RCP_LEGACY,
+  RSQ_LEGACY,
+  FMUL_LEGACY,
+  RSQ_CLAMP,
+  LDEXP,
+  FP_CLASS,
+  DOT4,
+  CARRY,
+  BORROW,
+  BFE_U32, // Extract range of bits with zero extension to 32-bits.
+  BFE_I32, // Extract range of bits with sign extension to 32-bits.
+  BFI, // (src0 & src1) | (~src0 & src2)
+  BFM, // Insert a range of bits into a 32-bit word.
+  FFBH_U32, // ctlz with -1 if input is zero.
+  FFBH_I32,
+  MUL_U24,
+  MUL_I24,
+  MULHI_U24,
+  MULHI_I24,
+  MAD_U24,
+  MAD_I24,
+  MUL_LOHI_I24,
+  MUL_LOHI_U24,
+  TEXTURE_FETCH,
+  EXPORT, // exp on SI+
+  EXPORT_DONE, // exp on SI+ with done bit set
+  R600_EXPORT,
+  CONST_ADDRESS,
+  REGISTER_LOAD,
+  REGISTER_STORE,
+  LOAD_INPUT,
+  SAMPLE,
+  SAMPLEB,
+  SAMPLED,
+  SAMPLEL,
+
+  // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
+  CVT_F32_UBYTE0,
+  CVT_F32_UBYTE1,
+  CVT_F32_UBYTE2,
+  CVT_F32_UBYTE3,
+  /// This node is for VLIW targets and it is used to represent a vector
+  /// that is stored in consecutive registers with the same channel.
+  /// For example:
+  ///   |X  |Y|Z|W|
+  /// T0|v.x| | | |
+  /// T1|v.y| | | |
+  /// T2|v.z| | | |
+  /// T3|v.w| | | |
+  BUILD_VERTICAL_VECTOR,
+  /// Pointer to the start of the shader's constant data.
+  CONST_DATA_PTR,
+  SENDMSG,
+  INTERP_MOV,
+  INTERP_P1,
+  INTERP_P2,
+  PC_ADD_REL_OFFSET,
+  KILL,
+  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  STORE_MSKOR,
+  LOAD_CONSTANT,
+  TBUFFER_STORE_FORMAT,
+  ATOMIC_CMP_SWAP,
+  ATOMIC_INC,
+  ATOMIC_DEC,
+  BUFFER_LOAD,
+  BUFFER_LOAD_FORMAT,
+  LAST_AMDGPU_ISD_NUMBER
+};
+
+
+} // End namespace AMDGPUISD
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
new file mode 100644
index 000000000000..e4dc6599e156
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -0,0 +1,115 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implementation of the TargetInstrInfo class that is common to all
+/// AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "AMDGPUGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
+  : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
+
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into 2 16 store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                              int64_t Offset0, int64_t Offset1,
+                                              unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 64
+  // bytes, then schedule together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+  switch (Channels) {
+  default: return Opcode;
+  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+  }
+}
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+  SI = 0,
+  VI = 1
+};
+
+// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned
+// instead.
+namespace llvm {
+namespace AMDGPU {
+static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+  return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
+}
+}
+}
+
+static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
+  switch (ST.getGeneration()) {
+  case AMDGPUSubtarget::SOUTHERN_ISLANDS:
+  case AMDGPUSubtarget::SEA_ISLANDS:
+    return SIEncodingFamily::SI;
+  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+    return SIEncodingFamily::VI;
+
+  // FIXME: This should never be called for r600 GPUs.
+  case AMDGPUSubtarget::R600:
+  case AMDGPUSubtarget::R700:
+  case AMDGPUSubtarget::EVERGREEN:
+  case AMDGPUSubtarget::NORTHERN_ISLANDS:
+    return SIEncodingFamily::SI;
+  }
+
+  llvm_unreachable("Unknown subtarget generation!");
+}
+
+int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
+  int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
+
+  // -1 means that Opcode is already a native instruction.
+  if (MCOp == -1)
+    return Opcode;
+
+  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+  // no encoding in the given subtarget generation.
+  if (MCOp == (uint16_t)-1)
+    return -1;
+
+  return MCOp;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
new file mode 100644
index 000000000000..bd8e389639f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -0,0 +1,57 @@
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+
+class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
+private:
+  const AMDGPUSubtarget &ST;
+
+  virtual void anchor();
+
+public:
+  explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
+
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+  /// Return -1 if the target-specific opcode for the pseudo instruction does
+  /// not exist. If Opcode is not a pseudo instruction, this is identity.
+  int pseudoToMCOpcode(int Opcode) const;
+
+  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+  /// equivalent opcode that writes \p Channels Channels.
+  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
new file mode 100644
index 000000000000..e7b40016e272
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -0,0 +1,330 @@
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node defintions for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Profiles
+//===----------------------------------------------------------------------===//
+
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
+
+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPULdExpOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
+def AMDGPUFPClassOp : SDTypeProfile<1, 2,
+  [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
+>;
+
+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
+  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
+>;
+
+// float, float, float, vcc
+def AMDGPUFmasOp : SDTypeProfile<1, 4,
+  [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
+>;
+
+def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Nodes
+//
+
+def AMDGPUconstdata_ptr : SDNode<
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
+                                                     SDTCisVT<0, iPTR>]>
+>;
+
+// This argument to this node is a dword address.
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+
+def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
+// out = a - floor(a)
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+
+// out = 1.0 / a
+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a)
+def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+
+def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+
+def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+
+// out = max(a, b) a and b are floats, where a nan comparison fails.
+// This is not commutative because this gives the second operand:
+//   x < nan ? x : nan -> nan
+//   nan < x ? nan : x -> x
+def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
+  []
+>;
+
+def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+
+// out = max(a, b) a and b are signed ints
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are unsigned ints
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are floats, where a nan comparison fails.
+def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
+  []
+>;
+
+// FIXME: TableGen doesn't like commutative instructions with more
+// than 2 operands.
+// out = max(a, b, c) a, b and c are floats
+def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b, and c are signed ints
+def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are unsigned ints
+def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are floats
+def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are signed ints
+def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b) a and b are unsigned ints
+def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
+def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
+
+// out = (src1 > src0) ? 1 : 0
+def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+
+def AMDGPUSetCCOp : SDTypeProfile<1, 3, [        // setcc
+  SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
+def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+
+def AMDGPUSetRegOp :  SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+  SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+   SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+  SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
+  SDTIntToFPOp, []>;
+def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
+  SDTIntToFPOp, []>;
+
+
+// urecip - This operation is a helper for integer division, it returns the
+// result of 1 / a as a fractional unsigned integer.
+// out = (2^32 / a) + e
+// e is rounding error
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+
+// Special case divide preop and flags.
+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
+
+//  Special case divide FMA with scale and flags (src0 = Quotient,
+//  src1 = Denominator, src2 = Numerator).
+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
+
+// Single or double precision division fixup.
+// Special case divide fixup and flags(src0 = Quotient, src1 =
+// Denominator, src2 = Numerator).
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+
+// Look Up 2.0 / pi src0 with segment select src1[4:0]
+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
+
+def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
+                          SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                          [SDNPHasChain, SDNPMayLoad]>;
+
+def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
+                           SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values.  The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+                        SDTypeProfile<0, 2, []>,
+                        [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+                            SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+                            [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                             SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+                         SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+
+def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
+def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+
+// Signed and unsigned 24-bit multiply. The highest 8-bits are ignore
+// when performing the mulitply. The result is a 32-bit value.
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
+  []
+>;
+def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
+  []
+>;
+
+def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+
+def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
+                    SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+                    [SDNPHasChain, SDNPInGlue]>;
+
+def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
+                        SDTypeProfile<1, 3, [SDTCisFP<0>]>,
+                        [SDNPInGlue]>;
+
+def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1",
+                      SDTypeProfile<1, 3, [SDTCisFP<0>]>,
+                      [SDNPInGlue, SDNPOutGlue]>;
+
+def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
+                      SDTypeProfile<1, 4, [SDTCisFP<0>]>,
+                      [SDNPInGlue]>;
+
+
+def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+// SI+ export
+def AMDGPUExportOp : SDTypeProfile<0, 8, [
+  SDTCisInt<0>, // i8 en
+  SDTCisInt<1>, // i1 vm
+  // skip done
+  SDTCisInt<2>, // i8 tgt
+  SDTCisSameAs<3, 1>, // i1 compr
+  SDTCisFP<4>,        // f32 src0
+  SDTCisSameAs<5, 4>, // f32 src1
+  SDTCisSameAs<6, 4>, // f32 src2
+  SDTCisSameAs<7, 4>  // f32 src3
+]>;
+
+def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
+  [SDNPHasChain, SDNPMayStore]>;
+
+def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
+  [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+
+def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
new file mode 100644
index 000000000000..513df3a9cdf3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -0,0 +1,677 @@
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction defs that are common to all hw codegen
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+class AMDGPUInst <dag outs, dag ins, string asm = "",
+  list<dag> pattern = []> : Instruction {
+  field bit isRegisterLoad = 0;
+  field bit isRegisterStore = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = NullALU;
+
+  // SoftFail is a field the disassembler can use to provide a way for
+  // instructions to not match without killing the whole decode process. It is
+  // mainly used for ARM, but Tablegen expects this field to exist or it fails
+  // to build the decode table.
+  field bits<64> SoftFail = 0;
+
+  let DecoderNamespace = Namespace;
+
+  let TSFlags{63} = isRegisterLoad;
+  let TSFlags{62} = isRegisterStore;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
+  list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<32> Inst = 0xffffffff;
+}
+
+def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
+def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
+def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
+def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+
+def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def u32imm : Operand<i32> {
+  let PrintMethod = "printU32ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def u8imm : Operand<i8> {
+  let PrintMethod = "printU8ImmOperand";
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget   : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
+>;
+
+def COND_ONE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
+>;
+
+def COND_OGT : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
+>;
+
+def COND_OGE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
+>;
+
+def COND_OLT : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
+>;
+
+def COND_OLE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
+>;
+
+
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned / unordered comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
+def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+// XXX - For some reason R600 version is preferring to use unordered
+// for setne?
+def COND_UNE_NE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
+>;
+
+def COND_NE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
+>;
+
+def COND_NULL : PatLeaf <
+  (cond),
+  [{(void)N; return false;}]
+>;
+
+
+//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0, node:$src1),
+  (op $src0, $src1),
+  [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0, node:$src1, node:$src2),
+  (op $src0, $src1, $src2),
+  [{ return N->hasOneUse(); }]
+>;
+
+//===----------------------------------------------------------------------===//
+// Load/Store Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
+class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+// Global address space loads
+class GlobalLoad <SDPatternOperator op> : GlobalMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+def global_load : GlobalLoad <load>;
+
+// Global address space stores
+class GlobalStore <SDPatternOperator op> : GlobalMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def global_store : GlobalStore <store>;
+def global_store_atomic : GlobalStore<atomic_store>;
+
+
+class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+// Constant address space loads
+class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+def constant_load : ConstantLoad<load>;
+
+class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+// Local address space loads
+class LocalLoad <SDPatternOperator op> : LocalMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class LocalStore <SDPatternOperator op> : LocalMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS;
+}]>;
+
+class FlatLoad <SDPatternOperator op> : FlatMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
+                                              (ld_node node:$ptr), [{
+  LoadSDNode *L = cast<LoadSDNode>(N);
+  return L->getExtensionType() == ISD::ZEXTLOAD ||
+         L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extload : AZExtLoadBase <unindexedload>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi8_global : GlobalLoad <az_extloadi8>;
+def sextloadi8_global : GlobalLoad <sextloadi8>;
+
+def az_extloadi8_constant : ConstantLoad <az_extloadi8>;
+def sextloadi8_constant : ConstantLoad <sextloadi8>;
+
+def az_extloadi8_local : LocalLoad <az_extloadi8>;
+def sextloadi8_local : LocalLoad <sextloadi8>;
+
+def extloadi8_private : PrivateLoad <az_extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi16_global : GlobalLoad <az_extloadi16>;
+def sextloadi16_global : GlobalLoad <sextloadi16>;
+
+def az_extloadi16_constant : ConstantLoad <az_extloadi16>;
+def sextloadi16_constant : ConstantLoad <sextloadi16>;
+
+def az_extloadi16_local : LocalLoad <az_extloadi16>;
+def sextloadi16_local : LocalLoad <sextloadi16>;
+
+def extloadi16_private : PrivateLoad <az_extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def az_extloadi32_global : GlobalLoad <az_extloadi32>;
+
+def az_extloadi32_flat : FlatLoad <az_extloadi32>;
+
+def az_extloadi32_constant : ConstantLoad <az_extloadi32>;
+
+def truncstorei8_global : GlobalStore <truncstorei8>;
+def truncstorei16_global : GlobalStore <truncstorei16>;
+
+def local_store : LocalStore <store>;
+def truncstorei8_local : LocalStore <truncstorei8>;
+def truncstorei16_local : LocalStore <truncstorei16>;
+
+def local_load : LocalLoad <load>;
+
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+    return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+}]>;
+
+def local_load_aligned8bytes : Aligned8Bytes <
+  (ops node:$ptr), (local_load node:$ptr)
+>;
+
+def local_store_aligned8bytes : Aligned8Bytes <
+  (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+>;
+
+class local_binary_atomic_op<SDNode atomic_op> :
+  PatFrag<(ops node:$ptr, node:$value),
+    (atomic_op node:$ptr, node:$value), [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+
+def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
+def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
+def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
+def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
+def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
+def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
+def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
+def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
+def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
+def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
+def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
+
+  def _32_local : PatFrag <
+    (ops node:$ptr, node:$cmp, node:$swap),
+    (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+      AtomicSDNode *AN = cast<AtomicSDNode>(N);
+      return AN->getMemoryVT() == MVT::i32 &&
+             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  }]>;
+
+  def _64_local : PatFrag<
+    (ops node:$ptr, node:$cmp, node:$swap),
+    (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+      AtomicSDNode *AN = cast<AtomicSDNode>(N);
+      return AN->getMemoryVT() == MVT::i64 &&
+             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  }]>;
+}
+
+defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
+
+multiclass global_binary_atomic_op<SDNode atomic_op> {
+  def "" : PatFrag<
+        (ops node:$ptr, node:$value),
+        (atomic_op node:$ptr, node:$value),
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+  def _noret : PatFrag<
+        (ops node:$ptr, node:$value),
+        (atomic_op node:$ptr, node:$value),
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+
+  def _ret : PatFrag<
+        (ops node:$ptr, node:$value),
+        (atomic_op node:$ptr, node:$value),
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+}
+
+defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
+defm atomic_add_global : global_binary_atomic_op<atomic_load_add>;
+defm atomic_and_global : global_binary_atomic_op<atomic_load_and>;
+defm atomic_max_global : global_binary_atomic_op<atomic_load_max>;
+defm atomic_min_global : global_binary_atomic_op<atomic_load_min>;
+defm atomic_or_global : global_binary_atomic_op<atomic_load_or>;
+defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
+defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
+defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
+defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+
+//legacy
+def AMDGPUatomic_cmp_swap_global : PatFrag<
+        (ops node:$ptr, node:$value),
+        (AMDGPUatomic_cmp_swap node:$ptr, node:$value),
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+def atomic_cmp_swap_global : PatFrag<
+      (ops node:$ptr, node:$cmp, node:$value),
+      (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+
+def atomic_cmp_swap_global_noret : PatFrag<
+      (ops node:$ptr, node:$cmp, node:$value),
+      (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+
+def atomic_cmp_swap_global_ret : PatFrag<
+      (ops node:$ptr, node:$cmp, node:$value),
+      (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+
+//===----------------------------------------------------------------------===//
+// Misc Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+class Constants {
+int TWO_PI = 0x40c90fdb;
+int PI = 0x40490fdb;
+int TWO_PI_INV = 0x3e22f983;
+int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
+int FP16_ONE = 0x3C00;
+int FP32_ONE = 0x3f800000;
+int FP32_NEG_ONE = 0xbf800000;
+int FP64_ONE = 0x3ff0000000000000;
+int FP64_NEG_ONE = 0xbff0000000000000;
+}
+def CONST : Constants;
+
+def FP_ZERO : PatLeaf <
+  (fpimm),
+  [{return N->getValueAPF().isZero();}]
+>;
+
+def FP_ONE : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(1.0);}]
+>;
+
+def FP_HALF : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(0.5);}]
+>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let usesCustomInserter = 1  in {
+
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "CLAMP $dst, $src0",
+  [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+>;
+
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FABS $dst, $src0",
+  [(set f32:$dst, (fabs f32:$src0))]
+>;
+
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FNEG $dst, $src0",
+  [(set f32:$dst, (fneg f32:$src0))]
+>;
+
+} // usesCustomInserter = 1
+
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
+                    ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
+  def RegisterLoad : AMDGPUShaderInst <
+    (outs dstClass:$dst),
+    (ins addrClass:$addr, i32imm:$chan),
+    "RegisterLoad $dst, $addr",
+    [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
+  > {
+    let isRegisterLoad = 1;
+  }
+
+  def RegisterStore : AMDGPUShaderInst <
+    (outs),
+    (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
+    "RegisterStore $val, $addr",
+    [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
+  > {
+    let isRegisterStore = 1;
+  }
+}
+}
+
+} // End isCodeGenOnly = 1, isPseudo = 1
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
+  : Pat <
+  (fpow f32:$src0, f32:$src1),
+  (exp_ieee (mul f32:$src1, (log_ieee f32:$src0)))
+>;
+
+/* Other helper patterns */
+/* --------------------- */
+
+/* Extract element pattern */
+class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
+                       SubRegIndex sub_reg>
+  : Pat<
+  (sub_type (extractelt vec_type:$src, sub_idx)),
+  (EXTRACT_SUBREG $src, sub_reg)
+>;
+
+/* Insert element pattern */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      int sub_idx, SubRegIndex sub_reg>
+  : Pat <
+  (insertelt vec_type:$vec, elem_type:$elem, sub_idx),
+  (INSERT_SUBREG $vec, $elem, sub_reg)
+>;
+
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+// bitconvert pattern
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+  (dt (bitconvert (st rc:$src0))),
+  (dt rc:$src0)
+>;
+
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
+  (vt (AMDGPUdwordaddr (vt rc:$addr))),
+  (vt rc:$addr)
+>;
+
+// BFI_INT patterns
+
+multiclass BFIPatterns <Instruction BFI_INT,
+                        Instruction LoadImm32,
+                        RegisterClass RC64> {
+  // Definition from ISA doc:
+  // (y & x) | (z & ~x)
+  def : Pat <
+    (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+  // SHA-256 Ch function
+  // z ^ (x & (y ^ z))
+  def : Pat <
+    (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+  def : Pat <
+    (fcopysign f32:$src0, f32:$src1),
+    (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
+  >;
+
+  def : Pat <
+    (f64 (fcopysign f64:$src0, f64:$src1)),
+    (REG_SEQUENCE RC64,
+      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+      (BFI_INT (LoadImm32 (i32 0x7fffffff)),
+               (i32 (EXTRACT_SUBREG $src0, sub1)),
+               (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
+  >;
+
+  def : Pat <
+    (f64 (fcopysign f64:$src0, f32:$src1)),
+    (REG_SEQUENCE RC64,
+      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+      (BFI_INT (LoadImm32 (i32 0x7fffffff)),
+               (i32 (EXTRACT_SUBREG $src0, sub1)),
+               $src1), sub1)
+  >;
+}
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+  (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+  (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+// Bitfield extract patterns
+
+def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{
+  return isMask_32(N->getZExtValue());
+}]>;
+
+def IMMPopCount : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+class BFEPattern <Instruction BFE, Instruction MOV> : Pat <
+  (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
+  (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
+>;
+
+// rotr pattern
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+  (rotr i32:$src0, i32:$src1),
+  (BIT_ALIGN $src0, $src0, $src1)
+>;
+
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class IntMed3Pat<Instruction med3Inst,
+                 SDPatternOperator max,
+                 SDPatternOperator max_oneuse,
+                 SDPatternOperator min_oneuse> : Pat<
+  (max (min_oneuse i32:$src0, i32:$src1),
+       (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+  (med3Inst $src0, $src1, $src2)
+>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+def sub_oneuse : HasOneUseBinOp<sub>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+def select_oneuse : HasOneUseTernaryOp<select>;
+
+// Special conversion patterns
+
+def cvt_rpi_i32_f32 : PatFrag <
+  (ops node:$src),
+  (fp_to_sint (ffloor (fadd $src, FP_HALF))),
+  [{ (void) N; return TM.Options.NoNaNsFPMath; }]
+>;
+
+def cvt_flr_i32_f32 : PatFrag <
+  (ops node:$src),
+  (fp_to_sint (ffloor $src)),
+  [{ (void)N; return TM.Options.NoNaNsFPMath; }]
+>;
+
+class IMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+class UMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
+  (fdiv FP_ONE, vt:$src),
+  (RcpInst $src)
+>;
+
+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
+  (AMDGPUrcp (fsqrt vt:$src)),
+  (RsqInst $src)
+>;
+
+include "R600Instructions.td"
+include "R700Instructions.td"
+include "EvergreenInstructions.td"
+include "CaymanInstructions.td"
+
+include "SIInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
new file mode 100644
index 000000000000..8e3471bd2083
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -0,0 +1,110 @@
+//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
+    : TargetIntrinsicInfo() {}
+
+static const char *const IntrinsicNameTable[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+};
+
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+}
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+                                       ArrayRef<Type *> Tys) const {
+  if (IntrID < Intrinsic::num_intrinsics)
+    return StringRef();
+
+  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
+         "Invalid intrinsic ID");
+
+  return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned NumTys) const {
+  return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+                                           ArrayRef<Type*> Tys) const {
+  // FIXME: Re-use Intrinsic::getType machinery
+  switch (ID) {
+  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+    Type *F32Ty = Type::getFloatTy(Context);
+    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
+  }
+  default:
+    llvm_unreachable("unhandled intrinsic");
+  }
+}
+
+unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
+                                         unsigned Len) const {
+  StringRef Name(NameData, Len);
+  if (!Name.startswith("llvm."))
+    return 0; // All intrinsics start with 'llvm.'
+
+  // Look for a name match in our table.  If the intrinsic is not overloaded,
+  // require an exact match. If it is overloaded, require a prefix match. The
+  // AMDGPU enum enum starts at Intrinsic::num_intrinsics.
+  int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name);
+  if (Idx >= 0) {
+    bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]);
+    return IsPrefixMatch == isOverloaded(Idx + 1)
+               ? Intrinsic::num_intrinsics + Idx
+               : 0;
+  }
+
+  return 0;
+}
+
+bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
+// Overload Table
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              ArrayRef<Type *> Tys) const {
+  FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
+  Function *F
+    = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
+
+  AttributeSet AS = getAttributes(M->getContext(),
+                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
+  F->setAttributes(AS);
+  return F;
+}
+
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              Type **Tys,
+                                              unsigned NumTys) const {
+  return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
new file mode 100644
index 000000000000..6cb8b9644642
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -0,0 +1,58 @@
+//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H
+
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+class TargetMachine;
+
+namespace AMDGPUIntrinsic {
+enum ID {
+  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+      , num_AMDGPU_intrinsics
+};
+
+} // end namespace AMDGPUIntrinsic
+
+class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
+public:
+  AMDGPUIntrinsicInfo();
+
+  StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+
+  std::string getName(unsigned IntrId, Type **Tys = nullptr,
+                      unsigned NumTys = 0) const override;
+
+  unsigned lookupName(const char *Name, unsigned Len) const override;
+  bool isOverloaded(unsigned IID) const override;
+  Function *getDeclaration(Module *M, unsigned ID,
+                           Type **Tys = nullptr,
+                           unsigned NumTys = 0) const override;
+
+  Function *getDeclaration(Module *M, unsigned ID,
+                           ArrayRef<Type *> = None) const;
+
+  FunctionType *getType(LLVMContext &Context, unsigned ID,
+                        ArrayRef<Type*> Tys = None) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
new file mode 100644
index 000000000000..ceae0b575395
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -0,0 +1,36 @@
+//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines intrinsics that are used by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+  def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+
+  // Deprecated in favor of llvm.amdgcn.sffbh
+  def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+  // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+  // Deprecated in favor of expanded bit operations
+  def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  // Deprecated in favor of llvm.amdgcn.rsq
+  def int_AMDGPU_rsq : Intrinsic<
+    [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
+  >;
+}
+
+include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
new file mode 100644
index 000000000000..7d56355074b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -0,0 +1,242 @@
+//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUAsmPrinter.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#include "AMDGPUGenMCPseudoLowering.inc"
+
+
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
+                                     const AsmPrinter &ap):
+  Ctx(ctx), ST(st), AP(ap) { }
+
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) {
+  switch (MOFlags) {
+  default:
+    return MCSymbolRefExpr::VK_None;
+  case SIInstrInfo::MO_GOTPCREL:
+    return MCSymbolRefExpr::VK_GOTPCREL;
+  case SIInstrInfo::MO_GOTPCREL32_LO:
+    return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO;
+  case SIInstrInfo::MO_GOTPCREL32_HI:
+    return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI;
+  case SIInstrInfo::MO_REL32_LO:
+    return MCSymbolRefExpr::VK_AMDGPU_REL32_LO;
+  case SIInstrInfo::MO_REL32_HI:
+    return MCSymbolRefExpr::VK_AMDGPU_REL32_HI;
+  }
+}
+
+const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr(
+  const MachineBasicBlock &SrcBB,
+  const MachineOperand &MO) const {
+  const MCExpr *DestBBSym
+    = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx);
+  const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx);
+
+  assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 &&
+         ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4);
+
+  // s_getpc_b64 returns the address of next instruction.
+  const MCConstantExpr *One = MCConstantExpr::create(4, Ctx);
+  SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx);
+
+  if (MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_FORWARD)
+    return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx);
+
+  assert(MO.getTargetFlags() == AMDGPU::TF_LONG_BRANCH_BACKWARD);
+  return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx);
+}
+
+bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
+                                     MCOperand &MCOp) const {
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    return true;
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
+    return true;
+  case MachineOperand::MO_MachineBasicBlock: {
+    if (MO.getTargetFlags() != 0) {
+      MCOp = MCOperand::createExpr(
+        getLongBranchBlockExpr(*MO.getParent()->getParent(), MO));
+    } else {
+      MCOp = MCOperand::createExpr(
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+    }
+
+    return true;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    SmallString<128> SymbolName;
+    AP.getNameWithPrefix(SymbolName, GV);
+    MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName);
+    const MCExpr *SymExpr =
+      MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx);
+    const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr,
+      MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+    MCOp = MCOperand::createExpr(Expr);
+    return true;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
+    Sym->setExternal(true);
+    const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+    MCOp = MCOperand::createExpr(Expr);
+    return true;
+  }
+  }
+}
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+
+  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+
+  if (MCOpcode == -1) {
+    LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+    C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
+                "a target-specific version: " + Twine(MI->getOpcode()));
+  }
+
+  OutMI.setOpcode(MCOpcode);
+
+  for (const MachineOperand &MO : MI->explicit_operands()) {
+    MCOperand MCOp;
+    lowerOperand(MO, MCOp);
+    OutMI.addOperand(MCOp);
+  }
+}
+
+bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                    MCOperand &MCOp) const {
+  const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+  AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
+  return MCInstLowering.lowerOperand(MO, MCOp);
+}
+
+void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+    return;
+
+  const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
+  AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
+
+  StringRef Err;
+  if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
+    LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+    C.emitError("Illegal instruction detected: " + Err);
+    MI->dump();
+  }
+
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+    while (I != MBB->instr_end() && I->isInsideBundle()) {
+      EmitInstruction(&*I);
+      ++I;
+    }
+  } else {
+    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+    // terminator instructions and should only be printed as comments.
+    if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      if (isVerbose()) {
+        SmallVector<char, 16> BBStr;
+        raw_svector_ostream Str(BBStr);
+
+        const MachineBasicBlock *MBB = MI->getOperand(0).getMBB();
+        const MCSymbolRefExpr *Expr
+          = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+        Expr->print(Str, MAI);
+        OutStreamer->emitRawComment(" mask branch " + BBStr);
+      }
+
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" return");
+      return;
+    }
+
+    if (MI->getOpcode() == AMDGPU::WAVE_BARRIER) {
+      if (isVerbose())
+        OutStreamer->emitRawComment(" wave barrier");
+      return;
+    }
+
+    MCInst TmpInst;
+    MCInstLowering.lower(MI, TmpInst);
+    EmitToStreamer(*OutStreamer, TmpInst);
+
+    if (STI.dumpCode()) {
+      // Disassemble instruction/operands to text.
+      DisasmLines.resize(DisasmLines.size() + 1);
+      std::string &DisasmLine = DisasmLines.back();
+      raw_string_ostream DisasmStream(DisasmLine);
+
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
+                                    *STI.getInstrInfo(),
+                                    *STI.getRegisterInfo());
+      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI);
+
+      // Disassemble instruction/operands to hex representation.
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      auto &ObjStreamer = static_cast<MCObjectStreamer&>(*OutStreamer);
+      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+      InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
+                                    MF->getSubtarget<MCSubtargetInfo>());
+      HexLines.resize(HexLines.size() + 1);
+      std::string &HexLine = HexLines.back();
+      raw_string_ostream HexStream(HexLine);
+
+      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+        unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
+        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+      }
+
+      DisasmStream.flush();
+      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
new file mode 100644
index 000000000000..57d2d85daecd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -0,0 +1,46 @@
+//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class AsmPrinter;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCOperand;
+
+class AMDGPUMCInstLower {
+  MCContext &Ctx;
+  const AMDGPUSubtarget &ST;
+  const AsmPrinter &AP;
+
+  const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB,
+                                       const MachineOperand &MO) const;
+
+public:
+  AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST,
+                    const AsmPrinter &AP);
+
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+  /// \brief Lower a MachineInstr to an MCInst
+  void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
new file mode 100644
index 000000000000..40c3327a98db
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -0,0 +1,47 @@
+//===-- AMDGPUMachineFunctionInfo.cpp ---------------------------------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMachineFunction.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
+  MachineFunctionInfo(),
+  LocalMemoryObjects(),
+  KernArgSize(0),
+  MaxKernArgAlign(0),
+  LDSSize(0),
+  ABIArgOffset(0),
+  IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+           MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+  // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
+  // except reserved size is not correctly aligned.
+}
+
+unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
+                                                  const GlobalValue &GV) {
+  auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
+  if (!Entry.second)
+    return Entry.first->second;
+
+  unsigned Align = GV.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(GV.getValueType());
+
+  /// TODO: We should sort these to minimize wasted space due to alignment
+  /// padding. Currently the padding is decided by the first encountered use
+  /// during lowering.
+  unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+
+  Entry.first->second = Offset;
+  LDSSize += DL.getTypeAllocSize(GV.getValueType());
+
+  return Offset;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
new file mode 100644
index 000000000000..5d0640b816f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -0,0 +1,77 @@
+//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class AMDGPUMachineFunction : public MachineFunctionInfo {
+  /// A map to keep track of local memory objects and their offsets within the
+  /// local memory space.
+  SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
+
+  uint64_t KernArgSize;
+  unsigned MaxKernArgAlign;
+
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
+
+  // FIXME: This should probably be removed.
+  /// Start of implicit kernel args
+  unsigned ABIArgOffset;
+
+  bool IsKernel;
+
+public:
+  AMDGPUMachineFunction(const MachineFunction &MF);
+
+  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+    assert(isPowerOf2_32(Align));
+    KernArgSize = alignTo(KernArgSize, Align);
+
+    uint64_t Result = KernArgSize;
+    KernArgSize += Size;
+
+    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
+    return Result;
+  }
+
+  uint64_t getKernArgSize() const {
+    return KernArgSize;
+  }
+
+  unsigned getMaxKernArgAlign() const {
+    return MaxKernArgAlign;
+  }
+
+  void setABIArgOffset(unsigned NewOffset) {
+    ABIArgOffset = NewOffset;
+  }
+
+  unsigned getABIArgOffset() const {
+    return ABIArgOffset;
+  }
+
+  unsigned getLDSSize() const {
+    return LDSSize;
+  }
+
+  bool isKernel() const {
+    return IsKernel;
+  }
+
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
new file mode 100644
index 000000000000..410bd52d9c21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -0,0 +1,372 @@
+//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass resolves calls to OpenCL image attribute, image resource ID and
+/// sampler resource ID getter functions.
+///
+/// Image attributes (size and format) are expected to be passed to the kernel
+/// as kernel arguments immediately following the image argument itself,
+/// therefore this pass adds image size and format arguments to the kernel
+/// functions in the module. The kernel functions with image arguments are
+/// re-created using the new signature. The new arguments are added to the
+/// kernel metadata with kernel_arg_type set to "image_size" or "image_format".
+/// Note: this pass may invalidate pointers to functions.
+///
+/// Resource IDs of read-only images, write-only images and samplers are
+/// defined to be their index among the kernel arguments of the same
+/// type and access qualifier.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+StringRef GetImageSizeFunc =         "llvm.OpenCL.image.get.size";
+StringRef GetImageFormatFunc =       "llvm.OpenCL.image.get.format";
+StringRef GetImageResourceIDFunc =   "llvm.OpenCL.image.get.resource.id";
+StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id";
+
+StringRef ImageSizeArgMDType =   "__llvm_image_size";
+StringRef ImageFormatArgMDType = "__llvm_image_format";
+
+StringRef KernelsMDNodeName = "opencl.kernels";
+StringRef KernelArgMDNodeNames[] = {
+  "kernel_arg_addr_space",
+  "kernel_arg_access_qual",
+  "kernel_arg_type",
+  "kernel_arg_base_type",
+  "kernel_arg_type_qual"};
+const unsigned NumKernelArgMDNodes = 5;
+
+typedef SmallVector<Metadata *, 8> MDVector;
+struct KernelArgMD {
+  MDVector ArgVector[NumKernelArgMDNodes];
+};
+
+} // end anonymous namespace
+
+static inline bool
+IsImageType(StringRef TypeString) {
+  return TypeString == "image2d_t" || TypeString == "image3d_t";
+}
+
+static inline bool
+IsSamplerType(StringRef TypeString) {
+  return TypeString == "sampler_t";
+}
+
+static Function *
+GetFunctionFromMDNode(MDNode *Node) {
+  if (!Node)
+    return nullptr;
+
+  size_t NumOps = Node->getNumOperands();
+  if (NumOps != NumKernelArgMDNodes + 1)
+    return nullptr;
+
+  auto F = mdconst::dyn_extract<Function>(Node->getOperand(0));
+  if (!F)
+    return nullptr;
+
+  // Sanity checks.
+  size_t ExpectNumArgNodeOps = F->arg_size() + 1;
+  for (size_t i = 0; i < NumKernelArgMDNodes; ++i) {
+    MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1));
+    if (ArgNode->getNumOperands() != ExpectNumArgNodeOps)
+      return nullptr;
+    if (!ArgNode->getOperand(0))
+      return nullptr;
+
+    // FIXME: It should be possible to do image lowering when some metadata
+    // args missing or not in the expected order.
+    MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0));
+    if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i])
+      return nullptr;
+  }
+
+  return F;
+}
+
+static StringRef
+AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+  MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2));
+  return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static StringRef
+ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+  MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3));
+  return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static MDVector
+GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) {
+  MDVector Res;
+  for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+    MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1));
+    Res.push_back(Node->getOperand(OpIdx));
+  }
+  return Res;
+}
+
+static void
+PushArgMD(KernelArgMD &MD, const MDVector &V) {
+  assert(V.size() == NumKernelArgMDNodes);
+  for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+    MD.ArgVector[i].push_back(V[i]);
+  }
+}
+
+namespace {
+
+class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+  static char ID;
+
+  LLVMContext *Context;
+  Type *Int32Type;
+  Type *ImageSizeType;
+  Type *ImageFormatType;
+  SmallVector<Instruction *, 4> InstsToErase;
+
+  bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID,
+                        Argument &ImageSizeArg,
+                        Argument &ImageFormatArg) {
+    bool Modified = false;
+
+    for (auto &Use : ImageArg.uses()) {
+      auto Inst = dyn_cast<CallInst>(Use.getUser());
+      if (!Inst) {
+        continue;
+      }
+
+      Function *F = Inst->getCalledFunction();
+      if (!F)
+        continue;
+
+      Value *Replacement = nullptr;
+      StringRef Name = F->getName();
+      if (Name.startswith(GetImageResourceIDFunc)) {
+        Replacement = ConstantInt::get(Int32Type, ResourceID);
+      } else if (Name.startswith(GetImageSizeFunc)) {
+        Replacement = &ImageSizeArg;
+      } else if (Name.startswith(GetImageFormatFunc)) {
+        Replacement = &ImageFormatArg;
+      } else {
+        continue;
+      }
+
+      Inst->replaceAllUsesWith(Replacement);
+      InstsToErase.push_back(Inst);
+      Modified = true;
+    }
+
+    return Modified;
+  }
+
+  bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) {
+    bool Modified = false;
+
+    for (const auto &Use : SamplerArg.uses()) {
+      auto Inst = dyn_cast<CallInst>(Use.getUser());
+      if (!Inst) {
+        continue;
+      }
+
+      Function *F = Inst->getCalledFunction();
+      if (!F)
+        continue;
+
+      Value *Replacement = nullptr;
+      StringRef Name = F->getName();
+      if (Name == GetSamplerResourceIDFunc) {
+        Replacement = ConstantInt::get(Int32Type, ResourceID);
+      } else {
+        continue;
+      }
+
+      Inst->replaceAllUsesWith(Replacement);
+      InstsToErase.push_back(Inst);
+      Modified = true;
+    }
+
+    return Modified;
+  }
+
+  bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) {
+    uint32_t NumReadOnlyImageArgs = 0;
+    uint32_t NumWriteOnlyImageArgs = 0;
+    uint32_t NumSamplerArgs = 0;
+
+    bool Modified = false;
+    InstsToErase.clear();
+    for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) {
+      Argument &Arg = *ArgI;
+      StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo());
+
+      // Handle image types.
+      if (IsImageType(Type)) {
+        StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo());
+        uint32_t ResourceID;
+        if (AccessQual == "read_only") {
+          ResourceID = NumReadOnlyImageArgs++;
+        } else if (AccessQual == "write_only") {
+          ResourceID = NumWriteOnlyImageArgs++;
+        } else {
+          llvm_unreachable("Wrong image access qualifier.");
+        }
+
+        Argument &SizeArg = *(++ArgI);
+        Argument &FormatArg = *(++ArgI);
+        Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg);
+
+      // Handle sampler type.
+      } else if (IsSamplerType(Type)) {
+        uint32_t ResourceID = NumSamplerArgs++;
+        Modified |= replaceSamplerUses(Arg, ResourceID);
+      }
+    }
+    for (unsigned i = 0; i < InstsToErase.size(); ++i) {
+      InstsToErase[i]->eraseFromParent();
+    }
+
+    return Modified;
+  }
+
+  std::tuple<Function *, MDNode *>
+  addImplicitArgs(Function *F, MDNode *KernelMDNode) {
+    bool Modified = false;
+
+    FunctionType *FT = F->getFunctionType();
+    SmallVector<Type *, 8> ArgTypes;
+
+    // Metadata operands for new MDNode.
+    KernelArgMD NewArgMDs;
+    PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0));
+
+    // Add implicit arguments to the signature.
+    for (unsigned i = 0; i < FT->getNumParams(); ++i) {
+      ArgTypes.push_back(FT->getParamType(i));
+      MDVector ArgMD = GetArgMD(KernelMDNode, i + 1);
+      PushArgMD(NewArgMDs, ArgMD);
+
+      if (!IsImageType(ArgTypeFromMD(KernelMDNode, i)))
+        continue;
+
+      // Add size implicit argument.
+      ArgTypes.push_back(ImageSizeType);
+      ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType);
+      PushArgMD(NewArgMDs, ArgMD);
+
+      // Add format implicit argument.
+      ArgTypes.push_back(ImageFormatType);
+      ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType);
+      PushArgMD(NewArgMDs, ArgMD);
+
+      Modified = true;
+    }
+    if (!Modified) {
+      return std::make_tuple(nullptr, nullptr);
+    }
+
+    // Create function with new signature and clone the old body into it.
+    auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false);
+    auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName());
+    ValueToValueMapTy VMap;
+    auto NewFArgIt = NewF->arg_begin();
+    for (auto &Arg: F->args()) {
+      auto ArgName = Arg.getName();
+      NewFArgIt->setName(ArgName);
+      VMap[&Arg] = &(*NewFArgIt++);
+      if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) {
+        (NewFArgIt++)->setName(Twine("__size_") + ArgName);
+        (NewFArgIt++)->setName(Twine("__format_") + ArgName);
+      }
+    }
+    SmallVector<ReturnInst*, 8> Returns;
+    CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+    // Build new MDNode.
+    SmallVector<llvm::Metadata *, 6> KernelMDArgs;
+    KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
+    for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
+      KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
+    MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs);
+
+    return std::make_tuple(NewF, NewMDNode);
+  }
+
+  bool transformKernels(Module &M) {
+    NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName);
+    if (!KernelsMDNode)
+      return false;
+
+    bool Modified = false;
+    for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) {
+      MDNode *KernelMDNode = KernelsMDNode->getOperand(i);
+      Function *F = GetFunctionFromMDNode(KernelMDNode);
+      if (!F)
+        continue;
+
+      Function *NewF;
+      MDNode *NewMDNode;
+      std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode);
+      if (NewF) {
+        // Replace old function and metadata with new ones.
+        F->eraseFromParent();
+        M.getFunctionList().push_back(NewF);
+        M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(),
+                              NewF->getAttributes());
+        KernelsMDNode->setOperand(i, NewMDNode);
+
+        F = NewF;
+        KernelMDNode = NewMDNode;
+        Modified = true;
+      }
+
+      Modified |= replaceImageAndSamplerUses(F, KernelMDNode);
+    }
+
+    return Modified;
+  }
+
+ public:
+  AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override {
+    Context = &M.getContext();
+    Int32Type = Type::getInt32Ty(M.getContext());
+    ImageSizeType = ArrayType::get(Int32Type, 3);
+    ImageFormatType = ArrayType::get(Int32Type, 2);
+
+    return transformKernels(M);
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU OpenCL Image Type Pass";
+  }
+};
+
+char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+
+} // end anonymous namespace
+
+ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
+  return new AMDGPUOpenCLImageTypeLoweringPass();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
new file mode 100644
index 000000000000..947d45b66969
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -0,0 +1,42 @@
+//===-- AMDGPUNoteType.h - AMDGPU ELF PT_NOTE section info-------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and constants for AMDGPU PT_NOTE sections.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
+
+namespace AMDGPU {
+
+namespace PT_NOTE {
+
+const char SectionName[] = ".note";
+
+const char NoteName[] = "AMD";
+
+enum NoteType{
+    NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
+    NT_AMDGPU_HSA_HSAIL = 2,
+    NT_AMDGPU_HSA_ISA = 3,
+    NT_AMDGPU_HSA_PRODUCER = 4,
+    NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
+    NT_AMDGPU_HSA_EXTENSION = 6,
+    NT_AMDGPU_HSA_RUNTIME_METADATA = 7,
+    NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
+    NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
+};
+}
+}
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 000000000000..baa28de7a770
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,840 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+// FIXME: This can create globals so should be a module pass.
+class AMDGPUPromoteAlloca : public FunctionPass {
+private:
+  const TargetMachine *TM;
+  Module *Mod;
+  const DataLayout *DL;
+  MDNode *MaxWorkGroupSizeRange;
+
+  // FIXME: This should be per-kernel.
+  uint32_t LocalMemLimit;
+  uint32_t CurrentLocalMemUsage;
+
+  bool IsAMDGCN;
+  bool IsAMDHSA;
+
+  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
+  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
+
+  /// BaseAlloca is the alloca root the search started from.
+  /// Val may be that alloca or a recursive user of it.
+  bool collectUsesWithPtrTypes(Value *BaseAlloca,
+                               Value *Val,
+                               std::vector<Value*> &WorkList) const;
+
+  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
+  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
+  /// Returns true if both operands are derived from the same alloca. Val should
+  /// be the same value as one of the input operands of UseInst.
+  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
+                                       Instruction *UseInst,
+                                       int OpIdx0, int OpIdx1) const;
+
+public:
+  static char ID;
+
+  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
+    FunctionPass(ID),
+    TM(TM_),
+    Mod(nullptr),
+    DL(nullptr),
+    MaxWorkGroupSizeRange(nullptr),
+    LocalMemLimit(0),
+    CurrentLocalMemUsage(0),
+    IsAMDGCN(false),
+    IsAMDHSA(false) { }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+
+  void handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0;
+
+INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                   "AMDGPU promote alloca to vector or LDS", false, false)
+
+char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+
+
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+  if (!TM)
+    return false;
+
+  Mod = &M;
+  DL = &Mod->getDataLayout();
+
+  // The maximum workitem id.
+  //
+  // FIXME: Should get as subtarget property. Usually runtime enforced max is
+  // 256.
+  MDBuilder MDB(Mod->getContext());
+  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
+
+  const Triple &TT = TM->getTargetTriple();
+
+  IsAMDGCN = TT.getArch() == Triple::amdgcn;
+  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+
+  return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+  if (!TM || skipFunction(F))
+    return false;
+
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  FunctionType *FTy = F.getFunctionType();
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  BasicBlock &EntryBB = *F.begin();
+  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(I);
+
+    ++I;
+    if (AI)
+      handleAlloca(*AI);
+  }
+
+  return true;
+}
+
+std::pair<Value *, Value *>
+AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+  if (!IsAMDHSA) {
+    Function *LocalSizeYFn
+      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
+    Function *LocalSizeZFn
+      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
+
+    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
+    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+
+    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+    return std::make_pair(LocalSizeY, LocalSizeZ);
+  }
+
+  // We must read the size out of the dispatch pointer.
+  assert(IsAMDGCN);
+
+  // We are indexing into this struct, and want to extract the workgroup_size_*
+  // fields.
+  //
+  //   typedef struct hsa_kernel_dispatch_packet_s {
+  //     uint16_t header;
+  //     uint16_t setup;
+  //     uint16_t workgroup_size_x ;
+  //     uint16_t workgroup_size_y;
+  //     uint16_t workgroup_size_z;
+  //     uint16_t reserved0;
+  //     uint32_t grid_size_x ;
+  //     uint32_t grid_size_y ;
+  //     uint32_t grid_size_z;
+  //
+  //     uint32_t private_segment_size;
+  //     uint32_t group_segment_size;
+  //     uint64_t kernel_object;
+  //
+  // #ifdef HSA_LARGE_MODEL
+  //     void *kernarg_address;
+  // #elif defined HSA_LITTLE_ENDIAN
+  //     void *kernarg_address;
+  //     uint32_t reserved1;
+  // #else
+  //     uint32_t reserved1;
+  //     void *kernarg_address;
+  // #endif
+  //     uint64_t reserved2;
+  //     hsa_signal_t completion_signal; // uint64_t wrapper
+  //   } hsa_kernel_dispatch_packet_t
+  //
+  Function *DispatchPtrFn
+    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
+  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+
+  // Size of the dispatch packet struct.
+  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+
+  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+  Value *CastDispatchPtr = Builder.CreateBitCast(
+    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+
+  // We could do a single 64-bit load here, but it's likely that the basic
+  // 32-bit and extract sequence is already present, and it is probably easier
+  // to CSE this. The loads should be mergable later anyway.
+  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
+  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
+
+  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
+  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
+
+  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
+  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+  // Extract y component. Upper half of LoadZU should be zero already.
+  Value *Y = Builder.CreateLShr(LoadXY, 16);
+
+  return std::make_pair(Y, LoadZU);
+}
+
+Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
+
+  switch (N) {
+  case 0:
+    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
+      : Intrinsic::r600_read_tidig_x;
+    break;
+  case 1:
+    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
+      : Intrinsic::r600_read_tidig_y;
+    break;
+
+  case 2:
+    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
+      : Intrinsic::r600_read_tidig_z;
+    break;
+  default:
+    llvm_unreachable("invalid dimension");
+  }
+
+  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
+  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+
+  return CI;
+}
+
+static VectorType *arrayTypeToVecType(Type *ArrayTy) {
+  return VectorType::get(ArrayTy->getArrayElementType(),
+                         ArrayTy->getArrayNumElements());
+}
+
+static Value *
+calculateVectorIndex(Value *Ptr,
+                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
+  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+  auto I = GEPIdx.find(GEP);
+  return I == GEPIdx.end() ? nullptr : I->second;
+}
+
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME we only support simple cases
+  if (GEP->getNumOperands() != 3)
+    return nullptr;
+
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return nullptr;
+
+  return GEP->getOperand(2);
+}
+
+// Not an instruction handled below to turn into a vector.
+//
+// TODO: Check isTriviallyVectorizable for calls and handle other
+// instructions.
+static bool canVectorizeInst(Instruction *Inst, User *User) {
+  switch (Inst->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+    return true;
+  case Instruction::Store: {
+    // Must be the stored pointer operand, not a stored value.
+    StoreInst *SI = cast<StoreInst>(Inst);
+    return SI->getPointerOperand() == User;
+  }
+  default:
+    return false;
+  }
+}
+
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+
+  DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+
+  // FIXME: There is no reason why we can't support larger arrays, we
+  // are just being conservative for now.
+  if (!AllocaTy ||
+      AllocaTy->getElementType()->isVectorTy() ||
+      AllocaTy->getNumElements() > 4 ||
+      AllocaTy->getNumElements() < 2) {
+    DEBUG(dbgs() << "  Cannot convert type to vector\n");
+    return false;
+  }
+
+  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+  std::vector<Value*> WorkList;
+  for (User *AllocaUser : Alloca->users()) {
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+    if (!GEP) {
+      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+        return false;
+
+      WorkList.push_back(AllocaUser);
+      continue;
+    }
+
+    Value *Index = GEPToVectorIndex(GEP);
+
+    // If we can't compute a vector index from this GEP, then we can't
+    // promote this alloca to vector.
+    if (!Index) {
+      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
+      return false;
+    }
+
+    GEPVectorIdx[GEP] = Index;
+    for (User *GEPUser : AllocaUser->users()) {
+      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
+        return false;
+
+      WorkList.push_back(GEPUser);
+    }
+  }
+
+  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+  DEBUG(dbgs() << "  Converting alloca to vector "
+        << *AllocaTy << " -> " << *VectorTy << '\n');
+
+  for (Value *V : WorkList) {
+    Instruction *Inst = cast<Instruction>(V);
+    IRBuilder<> Builder(Inst);
+    switch (Inst->getOpcode()) {
+    case Instruction::Load: {
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Value *Ptr = Inst->getOperand(0);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+
+      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      Inst->replaceAllUsesWith(ExtractElement);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::Store: {
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+
+      Value *Ptr = Inst->getOperand(1);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                       Inst->getOperand(0),
+                                                       Index);
+      Builder.CreateStore(NewVecValue, BitCast);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::BitCast:
+    case Instruction::AddrSpaceCast:
+      break;
+
+    default:
+      llvm_unreachable("Inconsistency in instructions promotable to vector");
+    }
+  }
+  return true;
+}
+
+static bool isCallPromotable(CallInst *CI) {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove:
+  case Intrinsic::memset:
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+  case Intrinsic::invariant_start:
+  case Intrinsic::invariant_end:
+  case Intrinsic::invariant_group_barrier:
+  case Intrinsic::objectsize:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
+                                                          Value *Val,
+                                                          Instruction *Inst,
+                                                          int OpIdx0,
+                                                          int OpIdx1) const {
+  // Figure out which operand is the one we might not be promoting.
+  Value *OtherOp = Inst->getOperand(OpIdx0);
+  if (Val == OtherOp)
+    OtherOp = Inst->getOperand(OpIdx1);
+
+  if (isa<ConstantPointerNull>(OtherOp))
+    return true;
+
+  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
+  if (!isa<AllocaInst>(OtherObj))
+    return false;
+
+  // TODO: We should be able to replace undefs with the right pointer type.
+
+  // TODO: If we know the other base object is another promotable
+  // alloca, not necessarily this alloca, we can do this. The
+  // important part is both must have the same address space at
+  // the end.
+  if (OtherObj != BaseAlloca) {
+    DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
+  Value *BaseAlloca,
+  Value *Val,
+  std::vector<Value*> &WorkList) const {
+
+  for (User *User : Val->users()) {
+    if (is_contained(WorkList, User))
+      continue;
+
+    if (CallInst *CI = dyn_cast<CallInst>(User)) {
+      if (!isCallPromotable(CI))
+        return false;
+
+      WorkList.push_back(User);
+      continue;
+    }
+
+    Instruction *UseInst = cast<Instruction>(User);
+    if (UseInst->getOpcode() == Instruction::PtrToInt)
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
+      if (LI->isVolatile())
+        return false;
+
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
+      if (SI->isVolatile())
+        return false;
+
+      // Reject if the stored value is not the pointer operand.
+      if (SI->getPointerOperand() != Val)
+        return false;
+    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
+      if (RMW->isVolatile())
+        return false;
+    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
+      if (CAS->isVolatile())
+        return false;
+    }
+
+    // Only promote a select if we know that the other select operand
+    // is from another pointer that will also be promoted.
+    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
+        return false;
+
+      // May need to rewrite constant operands.
+      WorkList.push_back(ICmp);
+    }
+
+    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
+      // Don't collect the users of this.
+      WorkList.push_back(User);
+      continue;
+    }
+
+    if (!User->getType()->isPointerTy())
+      continue;
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
+      // Be conservative if an address could be computed outside the bounds of
+      // the alloca.
+      if (!GEP->isInBounds())
+        return false;
+    }
+
+    // Only promote a select if we know that the other select operand is from
+    // another pointer that will also be promoted.
+    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
+        return false;
+    }
+
+    // Repeat for phis.
+    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+      // TODO: Handle more complex cases. We should be able to replace loops
+      // over arrays.
+      switch (Phi->getNumIncomingValues()) {
+      case 1:
+        break;
+      case 2:
+        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
+          return false;
+        break;
+      default:
+        return false;
+      }
+    }
+
+    WorkList.push_back(User);
+    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
+      return false;
+  }
+
+  return true;
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return;
+
+  IRBuilder<> Builder(&I);
+
+  // First try to replace the alloca with a vector
+  Type *AllocaTy = I.getAllocatedType();
+
+  DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  if (tryPromoteAllocaToVector(&I)) {
+    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+    return;
+  }
+
+  const Function &ContainingFunction = *I.getParent()->getParent();
+
+  // Don't promote the alloca to LDS for shader calling conventions as the work
+  // item ID intrinsics are not supported for these calling conventions.
+  // Furthermore not all LDS is available for some of the stages.
+  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
+    return;
+
+  const AMDGPUSubtarget &ST =
+    TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
+  // FIXME: We should also try to get this value from the reqd_work_group_size
+  // function attribute if it is available.
+  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  unsigned Align = I.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(I.getAllocatedType());
+
+  // FIXME: This computed padding is likely wrong since it depends on inverse
+  // usage order.
+  //
+  // FIXME: It is also possible that if we're allowed to use all of the memory
+  // could could end up using more than the maximum due to alignment padding.
+
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+  NewSize += AllocSize;
+
+  if (NewSize > LocalMemLimit) {
+    DEBUG(dbgs() << "  " << AllocSize
+          << " bytes of local memory not available to promote\n");
+    return;
+  }
+
+  CurrentLocalMemUsage = NewSize;
+
+  std::vector<Value*> WorkList;
+
+  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
+    DEBUG(dbgs() << " Do not know how to convert all uses\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Promoting alloca to local memory\n");
+
+  Function *F = I.getParent()->getParent();
+
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, GVTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(GVTy),
+      Twine(F->getName()) + Twine('.') + I.getName(),
+      nullptr,
+      GlobalVariable::NotThreadLocal,
+      AMDGPUAS::LOCAL_ADDRESS);
+  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  GV->setAlignment(I.getAlignment());
+
+  Value *TCntY, *TCntZ;
+
+  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
+  Value *TIdX = getWorkitemID(Builder, 0);
+  Value *TIdY = getWorkitemID(Builder, 1);
+  Value *TIdZ = getWorkitemID(Builder, 2);
+
+  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
+  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
+  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+  TID = Builder.CreateAdd(TID, TIdZ);
+
+  Value *Indices[] = {
+    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
+    TID
+  };
+
+  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
+  I.mutateType(Offset->getType());
+  I.replaceAllUsesWith(Offset);
+  I.eraseFromParent();
+
+  for (Value *V : WorkList) {
+    CallInst *Call = dyn_cast<CallInst>(V);
+    if (!Call) {
+      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
+        Value *Src0 = CI->getOperand(0);
+        Type *EltTy = Src0->getType()->getPointerElementType();
+        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+        if (isa<ConstantPointerNull>(CI->getOperand(0)))
+          CI->setOperand(0, ConstantPointerNull::get(NewTy));
+
+        if (isa<ConstantPointerNull>(CI->getOperand(1)))
+          CI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+        continue;
+      }
+
+      // The operand's value should be corrected on its own and we don't want to
+      // touch the users.
+      if (isa<AddrSpaceCastInst>(V))
+        continue;
+
+      Type *EltTy = V->getType()->getPointerElementType();
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+      // FIXME: It doesn't really make sense to try to do this for all
+      // instructions.
+      V->mutateType(NewTy);
+
+      // Adjust the types of any constant operands.
+      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+        if (isa<ConstantPointerNull>(SI->getOperand(1)))
+          SI->setOperand(1, ConstantPointerNull::get(NewTy));
+
+        if (isa<ConstantPointerNull>(SI->getOperand(2)))
+          SI->setOperand(2, ConstantPointerNull::get(NewTy));
+      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
+        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
+            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+        }
+      }
+
+      continue;
+    }
+
+    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
+    Builder.SetInsertPoint(Intr);
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // These intrinsics are for address space 0 only
+      Intr->eraseFromParent();
+      continue;
+    case Intrinsic::memcpy: {
+      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+                           MemCpy->getLength(), MemCpy->getAlignment(),
+                           MemCpy->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memmove: {
+      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
+      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
+                            MemMove->getLength(), MemMove->getAlignment(),
+                            MemMove->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MemSet = cast<MemSetInst>(Intr);
+      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+                           MemSet->getLength(), MemSet->getAlignment(),
+                           MemSet->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::invariant_group_barrier:
+      Intr->eraseFromParent();
+      // FIXME: I think the invariant marker should still theoretically apply,
+      // but the intrinsics need to be changed to accept pointers with any
+      // address space.
+      continue;
+    case Intrinsic::objectsize: {
+      Value *Src = Intr->getOperand(0);
+      Type *SrcTy = Src->getType()->getPointerElementType();
+      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
+        Intrinsic::objectsize,
+        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+      );
+
+      CallInst *NewCall
+        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+      Intr->replaceAllUsesWith(NewCall);
+      Intr->eraseFromParent();
+      continue;
+    }
+    default:
+      Intr->dump();
+      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+    }
+  }
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
+  return new AMDGPUPromoteAlloca(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
new file mode 100644
index 000000000000..941f2d8a468a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -0,0 +1,52 @@
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
+
+//===----------------------------------------------------------------------===//
+// Function handling callbacks - Functions are a seldom used feature of GPUS, so
+// they are not supported at this time.
+//===----------------------------------------------------------------------===//
+
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
+  const MachineFunction *) const {
+  return &CalleeSavedReg;
+}
+
+unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return AMDGPU::NoRegister;
+}
+
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert(Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
new file mode 100644
index 000000000000..ef51aad95dce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -0,0 +1,43 @@
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+class TargetInstrInfo;
+
+struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
+  AMDGPURegisterInfo();
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
+  const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
new file mode 100644
index 000000000000..ba0490abee8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
@@ -0,0 +1,25 @@
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tablegen register definitions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AMDGPU" in {
+
+foreach Index = 0-15 in {
+  def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
+
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
new file mode 100644
index 000000000000..ecd2ac72bf1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
@@ -0,0 +1,193 @@
+//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Enums and structure types used by runtime metadata.
+///
+/// Runtime requests certain information (metadata) about kernels to be able
+/// to execute the kernels and answer the queries about the kernels.
+/// The metadata is represented as a note element in the .note ELF section of a
+/// binary (code object). The desc field of the note element is a YAML string
+/// consisting of key-value pairs. Each key is a string. Each value can be
+/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps.
+/// At the beginning of the YAML string is the module level YAML map. A
+/// kernel-level YAML map is in the amd.Kernels sequence. A
+/// kernel-argument-level map is in the amd.Args sequence.
+///
+/// The format should be kept backward compatible. New enum values and bit
+/// fields should be appended at the end. It is suggested to bump up the
+/// revision number whenever the format changes and document the change
+/// in the revision in this header.
+///
+//
+//===----------------------------------------------------------------------===//
+//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
+
+#include <cstdint>
+#include <vector>
+#include <string>
+
+namespace AMDGPU {
+
+namespace RuntimeMD {
+
+  // Version and revision of runtime metadata
+  const unsigned char MDVersion   = 2;
+  const unsigned char MDRevision  = 0;
+
+  // Name of keys for runtime metadata.
+  namespace KeyName {
+    const char MDVersion[]                = "amd.MDVersion";            // Runtime metadata version
+    const char Language[]                 = "amd.Language";             // Language
+    const char LanguageVersion[]          = "amd.LanguageVersion";      // Language version
+    const char Kernels[]                  = "amd.Kernels";              // Kernels
+    const char KernelName[]               = "amd.KernelName";           // Kernel name
+    const char Args[]                     = "amd.Args";                 // Kernel arguments
+    const char ArgSize[]                  = "amd.ArgSize";              // Kernel arg size
+    const char ArgAlign[]                 = "amd.ArgAlign";             // Kernel arg alignment
+    const char ArgTypeName[]              = "amd.ArgTypeName";          // Kernel type name
+    const char ArgName[]                  = "amd.ArgName";              // Kernel name
+    const char ArgKind[]                  = "amd.ArgKind";              // Kernel argument kind
+    const char ArgValueType[]             = "amd.ArgValueType";         // Kernel argument value type
+    const char ArgAddrQual[]              = "amd.ArgAddrQual";          // Kernel argument address qualifier
+    const char ArgAccQual[]               = "amd.ArgAccQual";           // Kernel argument access qualifier
+    const char ArgIsConst[]               = "amd.ArgIsConst";           // Kernel argument is const qualified
+    const char ArgIsRestrict[]            = "amd.ArgIsRestrict";        // Kernel argument is restrict qualified
+    const char ArgIsVolatile[]            = "amd.ArgIsVolatile";        // Kernel argument is volatile qualified
+    const char ArgIsPipe[]                = "amd.ArgIsPipe";            // Kernel argument is pipe qualified
+    const char ReqdWorkGroupSize[]        = "amd.ReqdWorkGroupSize";    // Required work group size
+    const char WorkGroupSizeHint[]        = "amd.WorkGroupSizeHint";    // Work group size hint
+    const char VecTypeHint[]              = "amd.VecTypeHint";          // Vector type hint
+    const char KernelIndex[]              = "amd.KernelIndex";          // Kernel index for device enqueue
+    const char NoPartialWorkGroups[]      = "amd.NoPartialWorkGroups";  // No partial work groups
+    const char PrintfInfo[]               = "amd.PrintfInfo";           // Prinf function call information
+    const char ArgActualAcc[]             = "amd.ArgActualAcc";         // The actual kernel argument access qualifier
+    const char ArgPointeeAlign[]          = "amd.ArgPointeeAlign";      // Alignment of pointee type
+  }
+
+  namespace KernelArg {
+    enum Kind : uint8_t {
+      ByValue                 = 0,
+      GlobalBuffer            = 1,
+      DynamicSharedPointer    = 2,
+      Sampler                 = 3,
+      Image                   = 4,
+      Pipe                    = 5,
+      Queue                   = 6,
+      HiddenGlobalOffsetX     = 7,
+      HiddenGlobalOffsetY     = 8,
+      HiddenGlobalOffsetZ     = 9,
+      HiddenNone              = 10,
+      HiddenPrintfBuffer      = 11,
+      HiddenDefaultQueue      = 12,
+      HiddenCompletionAction  = 13,
+    };
+
+    enum ValueType : uint16_t {
+      Struct  = 0,
+      I8      = 1,
+      U8      = 2,
+      I16     = 3,
+      U16     = 4,
+      F16     = 5,
+      I32     = 6,
+      U32     = 7,
+      F32     = 8,
+      I64     = 9,
+      U64     = 10,
+      F64     = 11,
+    };
+
+    // Avoid using 'None' since it conflicts with a macro in X11 header file.
+    enum AccessQualifer : uint8_t {
+      AccNone    = 0,
+      ReadOnly   = 1,
+      WriteOnly  = 2,
+      ReadWrite  = 3,
+    };
+
+    enum AddressSpaceQualifer : uint8_t {
+      Private    = 0,
+      Global     = 1,
+      Constant   = 2,
+      Local      = 3,
+      Generic    = 4,
+      Region     = 5,
+    };
+  } // namespace KernelArg
+
+  // Invalid values are used to indicate an optional key should not be emitted.
+  const uint8_t INVALID_ADDR_QUAL     = 0xff;
+  const uint8_t INVALID_ACC_QUAL      = 0xff;
+  const uint32_t INVALID_KERNEL_INDEX = ~0U;
+
+  namespace KernelArg {
+    // In-memory representation of kernel argument information.
+    struct Metadata {
+      uint32_t Size;
+      uint32_t Align;
+      uint32_t PointeeAlign;
+      uint8_t Kind;
+      uint16_t ValueType;
+      std::string TypeName;
+      std::string Name;
+      uint8_t AddrQual;
+      uint8_t AccQual;
+      uint8_t IsVolatile;
+      uint8_t IsConst;
+      uint8_t IsRestrict;
+      uint8_t IsPipe;
+      Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0),
+          AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0),
+          IsConst(0), IsRestrict(0), IsPipe(0) {}
+    };
+  }
+
+  namespace Kernel {
+    // In-memory representation of kernel information.
+    struct Metadata {
+      std::string Name;
+      std::string Language;
+      std::vector<uint8_t> LanguageVersion;
+      std::vector<uint32_t> ReqdWorkGroupSize;
+      std::vector<uint32_t> WorkGroupSizeHint;
+      std::string VecTypeHint;
+      uint32_t KernelIndex;
+      uint8_t NoPartialWorkGroups;
+      std::vector<KernelArg::Metadata> Args;
+      Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {}
+    };
+  }
+
+  namespace Program {
+    // In-memory representation of program information.
+    struct Metadata {
+      std::vector<uint8_t> MDVersionSeq;
+      std::vector<std::string> PrintfInfo;
+      std::vector<Kernel::Metadata> Kernels;
+
+      explicit Metadata(){}
+
+      // Construct from an YAML string.
+      explicit Metadata(const std::string &YAML);
+
+      // Convert to YAML string.
+      std::string toYAML();
+
+      // Convert from YAML string.
+      static Metadata fromYAML(const std::string &S);
+    };
+  }
+} // namespace RuntimeMD
+} // namespace AMDGPU
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
new file mode 100644
index 000000000000..74851aedbb21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -0,0 +1,362 @@
+//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AMDGPUGenSubtargetInfo.inc"
+
+AMDGPUSubtarget::~AMDGPUSubtarget() = default;
+
+AMDGPUSubtarget &
+AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
+                                                 StringRef GPU, StringRef FS) {
+  // Determine default and user-specified characteristics
+  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
+  // enabled, but some instructions do not respect them and they run at the
+  // double precision rate, so don't enable by default.
+  //
+  // We want to be able to turn these off, but making this a subtarget feature
+  // for SI has the unhelpful behavior that it unsets everything else if you
+  // disable it.
+
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
+  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
+    FullFS += "+flat-for-global,+unaligned-buffer-access,";
+  FullFS += FS;
+
+  ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think think Evergreen has any useful support for
+  // denormals, but should be checked. Should we issue a warning somewhere
+  // if someone tries to enable these?
+  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    FP16Denormals = false;
+    FP32Denormals = false;
+    FP64Denormals = false;
+  }
+
+  // Set defaults if needed.
+  if (MaxPrivateElementSize == 0)
+    MaxPrivateElementSize = 4;
+
+  return *this;
+}
+
+AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                                 const TargetMachine &TM)
+  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+    TargetTriple(TT),
+    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
+    IsaVersion(ISAVersion0_0_0),
+    WavefrontSize(64),
+    LocalMemorySize(0),
+    LDSBankCount(0),
+    MaxPrivateElementSize(0),
+
+    FastFMAF32(false),
+    HalfRate64Ops(false),
+
+    FP16Denormals(false),
+    FP32Denormals(false),
+    FP64Denormals(false),
+    FPExceptions(false),
+    FlatForGlobal(false),
+    UnalignedScratchAccess(false),
+    UnalignedBufferAccess(false),
+
+    EnableXNACK(false),
+    DebuggerInsertNops(false),
+    DebuggerReserveRegs(false),
+    DebuggerEmitPrologue(false),
+
+    EnableVGPRSpilling(false),
+    EnablePromoteAlloca(false),
+    EnableLoadStoreOpt(false),
+    EnableUnsafeDSOffsetFolding(false),
+    EnableSIScheduler(false),
+    DumpCode(false),
+
+    FP64(false),
+    IsGCN(false),
+    GCN1Encoding(false),
+    GCN3Encoding(false),
+    CIInsts(false),
+    SGPRInitBug(false),
+    HasSMemRealTime(false),
+    Has16BitInsts(false),
+    HasMovrel(false),
+    HasVGPRIndexMode(false),
+    HasScalarStores(false),
+    HasInv2PiInlineImm(false),
+    FlatAddressSpace(false),
+
+    R600ALUInst(false),
+    CaymanISA(false),
+    CFALUBug(false),
+    HasVertexCache(false),
+    TexVTXClauseSize(0),
+    ScalarizeGlobal(false),
+
+    FeatureDisable(false),
+    InstrItins(getInstrItineraryForCPU(GPU)) {
+  initializeSubtargetDependencies(TT, GPU, FS);
+}
+
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+  switch (NWaves) {
+  case 10:
+    return 1638;
+  case 9:
+    return 1820;
+  case 8:
+    return 2048;
+  case 7:
+    return 2340;
+  case 6:
+    return 2730;
+  case 5:
+    return 3276;
+  case 4:
+    return 4096;
+  case 3:
+    return 5461;
+  case 2:
+    return 8192;
+  default:
+    return getLocalMemorySize();
+  }
+}
+
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+  if (Bytes <= 1638)
+    return 10;
+
+  if (Bytes <= 1820)
+    return 9;
+
+  if (Bytes <= 2048)
+    return 8;
+
+  if (Bytes <= 2340)
+    return 7;
+
+  if (Bytes <= 2730)
+    return 6;
+
+  if (Bytes <= 3276)
+    return 5;
+
+  if (Bytes <= 4096)
+    return 4;
+
+  if (Bytes <= 5461)
+    return 3;
+
+  if (Bytes <= 8192)
+    return 2;
+
+  return 1;
+}
+
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
+  const Function &F) const {
+  // Default minimum/maximum flat work group sizes.
+  std::pair<unsigned, unsigned> Default =
+    AMDGPU::isCompute(F.getCallingConv()) ?
+      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
+                                    getWavefrontSize() * 4) :
+      std::pair<unsigned, unsigned>(1, getWavefrontSize());
+
+  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+  // starts using "amdgpu-flat-work-group-size" attribute.
+  Default.second = AMDGPU::getIntegerAttribute(
+    F, "amdgpu-max-work-group-size", Default.second);
+  Default.first = std::min(Default.first, Default.second);
+
+  // Requested minimum/maximum flat work group sizes.
+  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
+    F, "amdgpu-flat-work-group-size", Default);
+
+  // Make sure requested minimum is less than requested maximum.
+  if (Requested.first > Requested.second)
+    return Default;
+
+  // Make sure requested values do not violate subtarget's specifications.
+  if (Requested.first < getMinFlatWorkGroupSize())
+    return Default;
+  if (Requested.second > getMaxFlatWorkGroupSize())
+    return Default;
+
+  return Requested;
+}
+
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
+  const Function &F) const {
+  // Default minimum/maximum number of waves per execution unit.
+  std::pair<unsigned, unsigned> Default(1, 0);
+
+  // Default/requested minimum/maximum flat work group sizes.
+  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
+
+  // If minimum/maximum flat work group sizes were explicitly requested using
+  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
+  // number of waves per execution unit to values implied by requested
+  // minimum/maximum flat work group sizes.
+  unsigned MinImpliedByFlatWorkGroupSize =
+    getMaxWavesPerEU(FlatWorkGroupSizes.second);
+  bool RequestedFlatWorkGroupSize = false;
+
+  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
+  // starts using "amdgpu-flat-work-group-size" attribute.
+  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
+      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
+    Default.first = MinImpliedByFlatWorkGroupSize;
+    RequestedFlatWorkGroupSize = true;
+  }
+
+  // Requested minimum/maximum number of waves per execution unit.
+  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
+    F, "amdgpu-waves-per-eu", Default, true);
+
+  // Make sure requested minimum is less than requested maximum.
+  if (Requested.second && Requested.first > Requested.second)
+    return Default;
+
+  // Make sure requested values do not violate subtarget's specifications.
+  if (Requested.first < getMinWavesPerEU() ||
+      Requested.first > getMaxWavesPerEU())
+    return Default;
+  if (Requested.second > getMaxWavesPerEU())
+    return Default;
+
+  // Make sure requested values are compatible with values implied by requested
+  // minimum/maximum flat work group sizes.
+  if (RequestedFlatWorkGroupSize &&
+      Requested.first > MinImpliedByFlatWorkGroupSize)
+    return Default;
+
+  return Requested;
+}
+
+R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                             const TargetMachine &TM) :
+  AMDGPUSubtarget(TT, GPU, FS, TM),
+  InstrInfo(*this),
+  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+  TLInfo(TM, *this) {}
+
+SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                         const TargetMachine &TM) :
+  AMDGPUSubtarget(TT, GPU, FS, TM),
+  InstrInfo(*this),
+  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+  TLInfo(TM, *this) {}
+
+void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                      unsigned NumRegionInstrs) const {
+  // Track register pressure so the scheduler can try to decrease
+  // pressure once register usage is above the threshold defined by
+  // SIRegisterInfo::getRegPressureSetLimit()
+  Policy.ShouldTrackPressure = true;
+
+  // Enabling both top down and bottom up scheduling seems to give us less
+  // register spills than just using one of these approaches on its own.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+
+  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
+  if (!enableSIScheduler())
+    Policy.ShouldTrackLaneMasks = true;
+}
+
+bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
+  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
+}
+
+unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
+  unsigned ImplicitBytes = getImplicitArgNumBytes();
+  if (ImplicitBytes == 0)
+    return ExplicitArgBytes;
+
+  unsigned Alignment = getAlignmentForImplicitArgPtr();
+  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+}
+
+unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+    if (SGPRs <= 80)
+      return 10;
+    if (SGPRs <= 88)
+      return 9;
+    if (SGPRs <= 100)
+      return 8;
+    return 7;
+  }
+  if (SGPRs <= 48)
+    return 10;
+  if (SGPRs <= 56)
+    return 9;
+  if (SGPRs <= 64)
+    return 8;
+  if (SGPRs <= 72)
+    return 7;
+  if (SGPRs <= 80)
+    return 6;
+  return 5;
+}
+
+unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+  if (VGPRs <= 24)
+    return 10;
+  if (VGPRs <= 28)
+    return 9;
+  if (VGPRs <= 32)
+    return 8;
+  if (VGPRs <= 36)
+    return 7;
+  if (VGPRs <= 40)
+    return 6;
+  if (VGPRs <= 48)
+    return 5;
+  if (VGPRs <= 64)
+    return 4;
+  if (VGPRs <= 84)
+    return 3;
+  if (VGPRs <= 128)
+    return 2;
+  return 1;
+}
+
+unsigned SISubtarget::getMaxNumSGPRs() const {
+  if (hasSGPRInitBug())
+    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  if (getGeneration() >= VOLCANIC_ISLANDS)
+    return 102;
+
+  return 104;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
new file mode 100644
index 000000000000..51ba501bddd1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -0,0 +1,607 @@
+//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "R600ISelLowering.h"
+#include "R600FrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "SIFrameLowering.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDGPUGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class StringRef;
+
+class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+public:
+  enum Generation {
+    R600 = 0,
+    R700,
+    EVERGREEN,
+    NORTHERN_ISLANDS,
+    SOUTHERN_ISLANDS,
+    SEA_ISLANDS,
+    VOLCANIC_ISLANDS,
+  };
+
+  enum {
+    ISAVersion0_0_0,
+    ISAVersion7_0_0,
+    ISAVersion7_0_1,
+    ISAVersion7_0_2,
+    ISAVersion8_0_0,
+    ISAVersion8_0_1,
+    ISAVersion8_0_2,
+    ISAVersion8_0_3,
+    ISAVersion8_0_4,
+    ISAVersion8_1_0,
+  };
+
+protected:
+  // Basic subtarget description.
+  Triple TargetTriple;
+  Generation Gen;
+  unsigned IsaVersion;
+  unsigned WavefrontSize;
+  int LocalMemorySize;
+  int LDSBankCount;
+  unsigned MaxPrivateElementSize;
+
+  // Possibly statically set by tablegen, but may want to be overridden.
+  bool FastFMAF32;
+  bool HalfRate64Ops;
+
+  // Dynamially set bits that enable features.
+  bool FP16Denormals;
+  bool FP32Denormals;
+  bool FP64Denormals;
+  bool FPExceptions;
+  bool FlatForGlobal;
+  bool UnalignedScratchAccess;
+  bool UnalignedBufferAccess;
+  bool EnableXNACK;
+  bool DebuggerInsertNops;
+  bool DebuggerReserveRegs;
+  bool DebuggerEmitPrologue;
+
+  // Used as options.
+  bool EnableVGPRSpilling;
+  bool EnablePromoteAlloca;
+  bool EnableLoadStoreOpt;
+  bool EnableUnsafeDSOffsetFolding;
+  bool EnableSIScheduler;
+  bool DumpCode;
+
+  // Subtarget statically properties set by tablegen
+  bool FP64;
+  bool IsGCN;
+  bool GCN1Encoding;
+  bool GCN3Encoding;
+  bool CIInsts;
+  bool SGPRInitBug;
+  bool HasSMemRealTime;
+  bool Has16BitInsts;
+  bool HasMovrel;
+  bool HasVGPRIndexMode;
+  bool HasScalarStores;
+  bool HasInv2PiInlineImm;
+  bool FlatAddressSpace;
+  bool R600ALUInst;
+  bool CaymanISA;
+  bool CFALUBug;
+  bool HasVertexCache;
+  short TexVTXClauseSize;
+  bool ScalarizeGlobal;
+
+  // Dummy feature to use for assembler in tablegen.
+  bool FeatureDisable;
+
+  InstrItineraryData InstrItins;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
+                  const TargetMachine &TM);
+  ~AMDGPUSubtarget() override;
+
+  AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT,
+                                                   StringRef GPU, StringRef FS);
+
+  const AMDGPUInstrInfo *getInstrInfo() const override = 0;
+  const AMDGPUFrameLowering *getFrameLowering() const override = 0;
+  const AMDGPUTargetLowering *getTargetLowering() const override = 0;
+  const AMDGPURegisterInfo *getRegisterInfo() const override = 0;
+
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  // Nothing implemented, just prevent crashes on use.
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool isAmdHsaOS() const {
+    return TargetTriple.getOS() == Triple::AMDHSA;
+  }
+
+  bool isMesa3DOS() const {
+    return TargetTriple.getOS() == Triple::Mesa3D;
+  }
+
+  bool isOpenCLEnv() const {
+    return TargetTriple.getEnvironment() == Triple::OpenCL;
+  }
+
+  Generation getGeneration() const {
+    return Gen;
+  }
+
+  unsigned getWavefrontSize() const {
+    return WavefrontSize;
+  }
+
+  int getLocalMemorySize() const {
+    return LocalMemorySize;
+  }
+
+  int getLDSBankCount() const {
+    return LDSBankCount;
+  }
+
+  unsigned getMaxPrivateElementSize() const {
+    return MaxPrivateElementSize;
+  }
+
+  bool has16BitInsts() const {
+    return Has16BitInsts;
+  }
+
+  bool hasHWFP64() const {
+    return FP64;
+  }
+
+  bool hasFastFMAF32() const {
+    return FastFMAF32;
+  }
+
+  bool hasHalfRate64Ops() const {
+    return HalfRate64Ops;
+  }
+
+  bool hasAddr64() const {
+    return (getGeneration() < VOLCANIC_ISLANDS);
+  }
+
+  bool hasBFE() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFI() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFM() const {
+    return hasBFE();
+  }
+
+  bool hasBCNT(unsigned Size) const {
+    if (Size == 32)
+      return (getGeneration() >= EVERGREEN);
+
+    if (Size == 64)
+      return (getGeneration() >= SOUTHERN_ISLANDS);
+
+    return false;
+  }
+
+  bool hasMulU24() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasMulI24() const {
+    return (getGeneration() >= SOUTHERN_ISLANDS ||
+            hasCaymanISA());
+  }
+
+  bool hasFFBL() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasFFBH() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasCARRY() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBORROW() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasCaymanISA() const {
+    return CaymanISA;
+  }
+
+  bool isPromoteAllocaEnabled() const {
+    return EnablePromoteAlloca;
+  }
+
+  bool unsafeDSOffsetFoldingEnabled() const {
+    return EnableUnsafeDSOffsetFolding;
+  }
+
+  bool dumpCode() const {
+    return DumpCode;
+  }
+
+  bool enableIEEEBit(const MachineFunction &MF) const {
+    return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+  }
+
+  /// Return the amount of LDS that can be used that will not restrict the
+  /// occupancy lower than WaveCount.
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+  /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
+  /// the given LDS memory size is the only constraint.
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
+  bool hasFP16Denormals() const {
+    return FP16Denormals;
+  }
+
+  bool hasFP32Denormals() const {
+    return FP32Denormals;
+  }
+
+  bool hasFP64Denormals() const {
+    return FP64Denormals;
+  }
+
+  bool hasFPExceptions() const {
+    return FPExceptions;
+  }
+
+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
+  }
+
+  bool hasUnalignedBufferAccess() const {
+    return UnalignedBufferAccess;
+  }
+
+  bool hasUnalignedScratchAccess() const {
+    return UnalignedScratchAccess;
+  }
+
+  bool isXNACKEnabled() const {
+    return EnableXNACK;
+  }
+
+  bool isAmdCodeObjectV2() const {
+    return isAmdHsaOS() || isMesa3DOS();
+  }
+
+  /// \brief Returns the offset in bytes from the start of the input buffer
+  ///        of the first explicit kernel argument.
+  unsigned getExplicitKernelArgOffset() const {
+    return isAmdCodeObjectV2() ? 0 : 36;
+  }
+
+  unsigned getAlignmentForImplicitArgPtr() const {
+    return isAmdHsaOS() ? 8 : 4;
+  }
+
+  unsigned getImplicitArgNumBytes() const {
+    if (isMesa3DOS())
+      return 16;
+    if (isAmdHsaOS() && isOpenCLEnv())
+      return 32;
+    return 0;
+  }
+
+  unsigned getStackAlignment() const {
+    // Scratch is allocated in 256 dword per wave blocks.
+    return 4 * 256 / getWavefrontSize();
+  }
+
+  bool enableMachineScheduler() const override {
+    return true;
+  }
+
+  bool enableSubRegLiveness() const override {
+    return true;
+  }
+
+  /// \returns Number of execution units per compute unit supported by the
+  /// subtarget.
+  unsigned getEUsPerCU() const {
+    return 4;
+  }
+
+  /// \returns Maximum number of work groups per compute unit supported by the
+  /// subtarget and limited by given flat work group size.
+  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
+    if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 8;
+    return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+  }
+
+  /// \returns Maximum number of waves per compute unit supported by the
+  /// subtarget without any kind of limitation.
+  unsigned getMaxWavesPerCU() const {
+    return getMaxWavesPerEU() * getEUsPerCU();
+  }
+
+  /// \returns Maximum number of waves per compute unit supported by the
+  /// subtarget and limited by given flat work group size.
+  unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
+    return getWavesPerWorkGroup(FlatWorkGroupSize);
+  }
+
+  /// \returns Minimum number of waves per execution unit supported by the
+  /// subtarget.
+  unsigned getMinWavesPerEU() const {
+    return 1;
+  }
+
+  /// \returns Maximum number of waves per execution unit supported by the
+  /// subtarget without any kind of limitation.
+  unsigned getMaxWavesPerEU() const {
+    if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      return 8;
+    // FIXME: Need to take scratch memory into account.
+    return 10;
+  }
+
+  /// \returns Maximum number of waves per execution unit supported by the
+  /// subtarget and limited by given flat work group size.
+  unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
+    return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
+      getEUsPerCU();
+  }
+
+  /// \returns Minimum flat work group size supported by the subtarget.
+  unsigned getMinFlatWorkGroupSize() const {
+    return 1;
+  }
+
+  /// \returns Maximum flat work group size supported by the subtarget.
+  unsigned getMaxFlatWorkGroupSize() const {
+    return 2048;
+  }
+
+  /// \returns Number of waves per work group given the flat work group size.
+  unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
+    return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+  }
+
+  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
+  /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
+  /// for function \p F, or minimum/maximum flat work group sizes explicitly
+  /// requested using "amdgpu-flat-work-group-size" attribute attached to
+  /// function \p F.
+  ///
+  /// \returns Subtarget's default values if explicitly requested values cannot
+  /// be converted to integer, or violate subtarget's specifications.
+  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+
+  /// \returns Subtarget's default pair of minimum/maximum number of waves per
+  /// execution unit for function \p F, or minimum/maximum number of waves per
+  /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
+  /// attached to function \p F.
+  ///
+  /// \returns Subtarget's default values if explicitly requested values cannot
+  /// be converted to integer, violate subtarget's specifications, or are not
+  /// compatible with minimum/maximum number of waves limited by flat work group
+  /// size, register usage, and/or lds usage.
+  std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+};
+
+class R600Subtarget final : public AMDGPUSubtarget {
+private:
+  R600InstrInfo InstrInfo;
+  R600FrameLowering FrameLowering;
+  R600TargetLowering TLInfo;
+
+public:
+  R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+                const TargetMachine &TM);
+
+  const R600InstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
+  }
+
+  const R600FrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+
+  const R600TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+
+  const R600RegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  bool hasCFAluBug() const {
+    return CFALUBug;
+  }
+
+  bool hasVertexCache() const {
+    return HasVertexCache;
+  }
+
+  short getTexVTXClauseSize() const {
+    return TexVTXClauseSize;
+  }
+};
+
+class SISubtarget final : public AMDGPUSubtarget {
+public:
+  enum {
+    // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+    // doesn't spill SGPRs as much as when 80 is set.
+    FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
+  };
+
+private:
+  SIInstrInfo InstrInfo;
+  SIFrameLowering FrameLowering;
+  SITargetLowering TLInfo;
+  std::unique_ptr<GISelAccessor> GISel;
+
+public:
+  SISubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+              const TargetMachine &TM);
+
+  const SIInstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
+  }
+
+  const SIFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+
+  const SITargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+
+  const CallLowering *getCallLowering() const override {
+    assert(GISel && "Access to GlobalISel APIs not set");
+    return GISel->getCallLowering();
+  }
+
+  const SIRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  void setGISelAccessor(GISelAccessor &GISel) {
+    this->GISel.reset(&GISel);
+  }
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           unsigned NumRegionInstrs) const override;
+
+  bool isVGPRSpillingEnabled(const Function& F) const;
+
+  unsigned getMaxNumUserSGPRs() const {
+    return 16;
+  }
+
+  bool hasFlatAddressSpace() const {
+    return FlatAddressSpace;
+  }
+
+  bool hasSMemRealTime() const {
+    return HasSMemRealTime;
+  }
+
+  bool hasMovrel() const {
+    return HasMovrel;
+  }
+
+  bool hasVGPRIndexMode() const {
+    return HasVGPRIndexMode;
+  }
+
+  bool hasScalarCompareEq64() const {
+    return getGeneration() >= VOLCANIC_ISLANDS;
+  }
+
+  bool hasScalarStores() const {
+    return HasScalarStores;
+  }
+
+  bool hasInv2PiInlineImm() const {
+    return HasInv2PiInlineImm;
+  }
+
+  bool enableSIScheduler() const {
+    return EnableSIScheduler;
+  }
+
+  bool debuggerSupported() const {
+    return debuggerInsertNops() && debuggerReserveRegs() &&
+      debuggerEmitPrologue();
+  }
+
+  bool debuggerInsertNops() const {
+    return DebuggerInsertNops;
+  }
+
+  bool debuggerReserveRegs() const {
+    return DebuggerReserveRegs;
+  }
+
+  bool debuggerEmitPrologue() const {
+    return DebuggerEmitPrologue;
+  }
+
+  bool loadStoreOptEnabled() const {
+    return EnableLoadStoreOpt;
+  }
+
+  bool hasSGPRInitBug() const {
+    return SGPRInitBug;
+  }
+
+  bool has12DWordStoreHazard() const {
+    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+  }
+
+  unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
+
+  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+
+  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+
+  /// \returns True if waitcnt instruction is needed before barrier instruction,
+  /// false otherwise.
+  bool needWaitcntBeforeBarrier() const {
+    return true;
+  }
+
+  unsigned getMaxNumSGPRs() const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
new file mode 100644
index 000000000000..d8a0c716279c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -0,0 +1,645 @@
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU target machine contains all of the hardware specific
+/// information  needed to emit code for R600 and SI GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPUTargetTransformInfo.h"
+#include "GCNSchedStrategy.h"
+#include "R600MachineScheduler.h"
+#include "SIMachineScheduler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <memory>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableR600StructurizeCFG(
+  "r600-ir-structurize",
+  cl::desc("Use StructurizeCFG IR pass"),
+  cl::init(true));
+
+static cl::opt<bool> EnableSROA(
+  "amdgpu-sroa",
+  cl::desc("Run SROA after promote alloca pass"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+  "r600-if-convert",
+  cl::desc("Use if conversion pass"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+// Option to disable vectorizer for tests.
+static cl::opt<bool> EnableLoadStoreVectorizer(
+  "amdgpu-load-store-vectorizer",
+  cl::desc("Enable load store vectorizer"),
+  cl::init(true),
+  cl::Hidden);
+
+// Option to to control global loads scalarization
+static cl::opt<bool> ScalarizeGlobal(
+  "amdgpu-scalarize-global-loads",
+  cl::desc("Enable global load scalarization"),
+  cl::init(false),
+  cl::Hidden);
+
+extern "C" void LLVMInitializeAMDGPUTarget() {
+  // Register the target
+  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
+  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
+
+  PassRegistry *PR = PassRegistry::getPassRegistry();
+  initializeSILowerI1CopiesPass(*PR);
+  initializeSIFixSGPRCopiesPass(*PR);
+  initializeSIFoldOperandsPass(*PR);
+  initializeSIShrinkInstructionsPass(*PR);
+  initializeSIFixControlFlowLiveIntervalsPass(*PR);
+  initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
+  initializeAMDGPUAnnotateUniformValuesPass(*PR);
+  initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPUUnifyMetadataPass(*PR);
+  initializeSIAnnotateControlFlowPass(*PR);
+  initializeSIInsertWaitsPass(*PR);
+  initializeSIWholeQuadModePass(*PR);
+  initializeSILowerControlFlowPass(*PR);
+  initializeSIInsertSkipsPass(*PR);
+  initializeSIDebuggerInsertNopsPass(*PR);
+  initializeSIOptimizeExecMaskingPass(*PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  return llvm::make_unique<AMDGPUTargetObjectFile>();
+}
+
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
+}
+
+static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
+  return new SIScheduleDAGMI(C);
+}
+
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+      new ScheduleDAGMILive(C,
+                            llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
+static MachineSchedRegistry
+R600SchedRegistry("r600", "Run R600's custom scheduler",
+                   createR600MachineScheduler);
+
+static MachineSchedRegistry
+SISchedRegistry("si", "Run SI's custom scheduler",
+                createSIMachineScheduler);
+
+static MachineSchedRegistry
+GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+                             "Run GCN scheduler to maximize occupancy",
+                             createGCNMaxOccupancyMachineScheduler);
+
+static StringRef computeDataLayout(const Triple &TT) {
+  if (TT.getArch() == Triple::r600) {
+    // 32-bit pointers.
+    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+  }
+
+  // 32-bit private, local, and region pointers. 64-bit global, constant and
+  // flat.
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+  if (!GPU.empty())
+    return GPU;
+
+  // HSA only supports CI+, so change the default GPU to a CI for HSA.
+  if (TT.getArch() == Triple::amdgcn)
+    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+  return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  // The AMDGPU toolchain only supports generating shared objects, so we
+  // must always use PIC.
+  return Reloc::PIC_;
+}
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         TargetOptions Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OptLevel)
+  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
+    TLOF(createTLOF(getTargetTriple())) {
+  initAsmInfo();
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+  Attribute GPUAttr = F.getFnAttribute("target-cpu");
+  return GPUAttr.hasAttribute(Attribute::None) ?
+    getTargetCPU() : GPUAttr.getValueAsString();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  return FSAttr.hasAttribute(Attribute::None) ?
+    getTargetFeatureString() :
+    FSAttr.getValueAsString();
+}
+
+void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+  PM.add(createAMDGPUUnifyMetadataPass());
+}
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
+                                     StringRef CPU, StringRef FS,
+                                     TargetOptions Options,
+                                     Optional<Reloc::Model> RM,
+                                     CodeModel::Model CM, CodeGenOpt::Level OL)
+  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
+  setRequiresStructuredCFG(true);
+}
+
+const R600Subtarget *R600TargetMachine::getSubtargetImpl(
+  const Function &F) const {
+  StringRef GPU = getGPUName(F);
+  StringRef FS = getFeatureString(F);
+
+  SmallString<128> SubtargetKey(GPU);
+  SubtargetKey.append(FS);
+
+  auto &I = SubtargetMap[SubtargetKey];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+  }
+
+  return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct SIGISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+  const AMDGPUCallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+};
+
+} // end anonymous namespace
+#endif
+
+GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   TargetOptions Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+  StringRef GPU = getGPUName(F);
+  StringRef FS = getFeatureString(F);
+
+  SmallString<128> SubtargetKey(GPU);
+  SubtargetKey.append(FS);
+
+  auto &I = SubtargetMap[SubtargetKey];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+    GISel->CallLoweringInfo.reset(
+      new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+    I->setGISelAccessor(*GISel);
+  }
+
+  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
+  return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Pass Setup
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    // Exceptions and StackMaps are not supported, so these passes will never do
+    // anything.
+    disablePass(&StackMapLivenessID);
+    disablePass(&FuncletLayoutID);
+  }
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+    return DAG;
+  }
+
+  void addEarlyCSEOrGVNPass();
+  void addStraightLineScalarOptimizationPasses();
+  void addIRPasses() override;
+  void addCodeGenPrepare() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addGCPasses() override;
+};
+
+class R600PassConfig final : public AMDGPUPassConfig {
+public:
+  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+    : AMDGPUPassConfig(TM, PM) {}
+
+  ScheduleDAGInstrs *createMachineScheduler(
+    MachineSchedContext *C) const override {
+    return createR600MachineScheduler(C);
+  }
+
+  bool addPreISel() override;
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+
+class GCNPassConfig final : public AMDGPUPassConfig {
+public:
+  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+    : AMDGPUPassConfig(TM, PM) {}
+
+  GCNTargetMachine &getGCNTargetMachine() const {
+    return getTM<GCNTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override;
+
+  void addIRPasses() override;
+  bool addPreISel() override;
+  void addMachineSSAOptimization() override;
+  bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+#endif
+  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+
+} // end anonymous namespace
+
+TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
+  });
+}
+
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  addPass(createSeparateConstOffsetFromGEPPass());
+  addPass(createSpeculativeExecutionPass());
+  // ReassociateGEPs exposes more opportunites for SLSR. See
+  // the example in reassociate-geps-and-slsr.ll.
+  addPass(createStraightLineStrengthReducePass());
+  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
+  // EarlyCSE can reuse.
+  addEarlyCSEOrGVNPass();
+  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+  addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates redundant common expressions, so run
+  // EarlyCSE after it.
+  addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addIRPasses() {
+  // There is no reason to run these.
+  disablePass(&StackMapLivenessID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&PatchableFunctionID);
+
+  // Function calls are not supported, so make sure we inline everything.
+  addPass(createAMDGPUAlwaysInlinePass());
+  addPass(createAlwaysInlinerLegacyPass());
+  // We need to add the barrier noop pass, otherwise adding the function
+  // inlining pass will cause all of the PassConfigs passes to be run
+  // one function at a time, which means if we have a nodule with two
+  // functions, then we will generate code for the first function
+  // without ever running any passes on the second.
+  addPass(createBarrierNoopPass());
+
+  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
+  addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+  if (TM.getOptLevel() > CodeGenOpt::None) {
+    addPass(createAMDGPUPromoteAlloca(&TM));
+
+    if (EnableSROA)
+      addPass(createSROAPass());
+
+    addStraightLineScalarOptimizationPasses();
+  }
+
+  TargetPassConfig::addIRPasses();
+
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  if (getOptLevel() != CodeGenOpt::None)
+    addEarlyCSEOrGVNPass();
+}
+
+void AMDGPUPassConfig::addCodeGenPrepare() {
+  TargetPassConfig::addCodeGenPrepare();
+
+  if (EnableLoadStoreVectorizer)
+    addPass(createLoadStoreVectorizerPass());
+}
+
+bool AMDGPUPassConfig::addPreISel() {
+  addPass(createFlattenCFGPass());
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool AMDGPUPassConfig::addGCPasses() {
+  // Do nothing. GC is not supported.
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// R600 Pass Setup
+//===----------------------------------------------------------------------===//
+
+bool R600PassConfig::addPreISel() {
+  AMDGPUPassConfig::addPreISel();
+
+  if (EnableR600StructurizeCFG)
+    addPass(createStructurizeCFGPass());
+  return false;
+}
+
+void R600PassConfig::addPreRegAlloc() {
+  addPass(createR600VectorRegMerger(*TM));
+}
+
+void R600PassConfig::addPreSched2() {
+  addPass(createR600EmitClauseMarkers(), false);
+  if (EnableR600IfConvert)
+    addPass(&IfConverterID, false);
+  addPass(createR600ClauseMergePass(*TM), false);
+}
+
+void R600PassConfig::addPreEmitPass() {
+  addPass(createAMDGPUCFGStructurizerPass(), false);
+  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+  addPass(&FinalizeMachineBundlesID, false);
+  addPass(createR600Packetizer(*TM), false);
+  addPass(createR600ControlFlowFinalizer(*TM), false);
+}
+
+TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new R600PassConfig(this, PM);
+}
+
+//===----------------------------------------------------------------------===//
+// GCN Pass Setup
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+  MachineSchedContext *C) const {
+  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+  if (ST.enableSIScheduler())
+    return createSIMachineScheduler(C);
+  return createGCNMaxOccupancyMachineScheduler(C);
+}
+
+bool GCNPassConfig::addPreISel() {
+  AMDGPUPassConfig::addPreISel();
+
+  // FIXME: We need to run a pass to propagate the attributes when calls are
+  // supported.
+  addPass(&AMDGPUAnnotateKernelFeaturesID);
+  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+  addPass(createSinkingPass());
+  addPass(createSITypeRewriter());
+  addPass(createAMDGPUAnnotateUniformValues());
+  addPass(createSIAnnotateControlFlowPass());
+
+  return false;
+}
+
+void GCNPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+
+  // We want to fold operands after PeepholeOptimizer has run (or as part of
+  // it), because it will eliminate extra copies making it easier to fold the
+  // real source operand. We want to eliminate dead instructions after, so that
+  // we see fewer uses of the copies. We then need to clean up the dead
+  // instructions leftover after the operands are folded as well.
+  //
+  // XXX - Can we get away without running DeadMachineInstructionElim again?
+  addPass(&SIFoldOperandsID);
+  addPass(&DeadMachineInstructionElimID);
+  addPass(&SILoadStoreOptimizerID);
+}
+
+void GCNPassConfig::addIRPasses() {
+  // TODO: May want to move later or split into an early and late one.
+  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+
+  AMDGPUPassConfig::addIRPasses();
+}
+
+bool GCNPassConfig::addInstSelector() {
+  AMDGPUPassConfig::addInstSelector();
+  addPass(createSILowerI1CopiesPass());
+  addPass(&SIFixSGPRCopiesID);
+  return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
+
+bool GCNPassConfig::addLegalizeMachineIR() {
+  return false;
+}
+
+bool GCNPassConfig::addRegBankSelect() {
+  return false;
+}
+
+bool GCNPassConfig::addGlobalInstructionSelect() {
+  return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() {
+  addPass(createSIShrinkInstructionsPass());
+  addPass(createSIWholeQuadModePass());
+}
+
+void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  // FIXME: We have to disable the verifier here because of PHIElimination +
+  // TwoAddressInstructions disabling it.
+
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+
+  TargetPassConfig::addFastRegAlloc(RegAllocPass);
+}
+
+void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  // This needs to be run directly before register allocation because earlier
+  // passes might recompute live intervals.
+  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
+  // This must be run immediately after phi elimination and before
+  // TwoAddressInstructions, otherwise the processing of the tied operand of
+  // SI_ELSE will introduce a copy of the tied operand source after the else.
+  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+
+  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
+}
+
+void GCNPassConfig::addPostRegAlloc() {
+  addPass(&SIOptimizeExecMaskingID);
+  TargetPassConfig::addPostRegAlloc();
+}
+
+void GCNPassConfig::addPreSched2() {
+}
+
+void GCNPassConfig::addPreEmitPass() {
+  // The hazard recognizer that runs as part of the post-ra scheduler does not
+  // guarantee to be able handle all hazards correctly. This is because if there
+  // are multiple scheduling regions in a basic block, the regions are scheduled
+  // bottom up, so when we begin to schedule a region we don't know what
+  // instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  addPass(&PostRAHazardRecognizerID);
+
+  addPass(createSIInsertWaitsPass());
+  addPass(createSIShrinkInstructionsPass());
+  addPass(&SIInsertSkipsPassID);
+  addPass(createSIDebuggerInsertNopsPass());
+  addPass(&BranchRelaxationPassID);
+}
+
+TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new GCNPassConfig(this, PM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
new file mode 100644
index 000000000000..9496773a073f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -0,0 +1,103 @@
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
+
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Target Machine (R600+)
+//===----------------------------------------------------------------------===//
+
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  AMDGPUIntrinsicInfo IntrinsicInfo;
+
+  StringRef getGPUName(const Function &F) const;
+  StringRef getFeatureString(const Function &F) const;
+
+public:
+  AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, TargetOptions Options,
+                      Optional<Reloc::Model> RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+  ~AMDGPUTargetMachine() override;
+
+  const AMDGPUSubtarget *getSubtargetImpl() const;
+  const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0;
+
+  const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
+    return &IntrinsicInfo;
+  }
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+  void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+};
+
+//===----------------------------------------------------------------------===//
+// R600 Target Machine (R600 -> Cayman)
+//===----------------------------------------------------------------------===//
+
+class R600TargetMachine final : public AMDGPUTargetMachine {
+private:
+  mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
+
+public:
+  R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, TargetOptions Options,
+                    Optional<Reloc::Model> RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  const R600Subtarget *getSubtargetImpl(const Function &) const override;
+};
+
+//===----------------------------------------------------------------------===//
+// GCN Target Machine (SI+)
+//===----------------------------------------------------------------------===//
+
+class GCNTargetMachine final : public AMDGPUTargetMachine {
+private:
+  mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
+
+public:
+  GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, TargetOptions Options,
+                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  const SISubtarget *getSubtargetImpl(const Function &) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
new file mode 100644
index 000000000000..1fddc88a705a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -0,0 +1,30 @@
+//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPU.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Object File
+//===----------------------------------------------------------------------===//
+
+MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+      AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
+    return TextSection;
+
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
new file mode 100644
index 000000000000..de327786dff6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -0,0 +1,32 @@
+//===-- AMDGPUTargetObjectFile.h - AMDGPU  Object Info ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the AMDGPU-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
new file mode 100644
index 000000000000..a1a352642242
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -0,0 +1,340 @@
+//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements a TargetTransformInfo analysis pass specific to the
+// AMDGPU target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "AMDGPUtti"
+
+
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+                                            TTI::UnrollingPreferences &UP) {
+  UP.Threshold = 300; // Twice the default.
+  UP.MaxCount = UINT_MAX;
+  UP.Partial = true;
+
+  // TODO: Do we want runtime unrolling?
+
+  for (const BasicBlock *BB : L->getBlocks()) {
+    const DataLayout &DL = BB->getModule()->getDataLayout();
+    for (const Instruction &I : *BB) {
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
+      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+        continue;
+
+      const Value *Ptr = GEP->getPointerOperand();
+      const AllocaInst *Alloca =
+          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+      if (Alloca) {
+        // We want to do whatever we can to limit the number of alloca
+        // instructions that make it through to the code generator.  allocas
+        // require us to use indirect addressing, which is slow and prone to
+        // compiler bugs.  If this loop does an address calculation on an
+        // alloca ptr, then we want to use a higher than normal loop unroll
+        // threshold. This will give SROA a better chance to eliminate these
+        // allocas.
+        //
+        // Don't use the maximum allowed value here as it will make some
+        // programs way too big.
+        UP.Threshold = 800;
+      }
+    }
+  }
+}
+
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
+  if (Vec)
+    return 0;
+
+  // Number of VGPRs on SI.
+  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 256;
+
+  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
+  return Vector ? 0 : 32;
+}
+
+unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
+  switch (AddrSpace) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+    return 128;
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS:
+    return 64;
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    return 8 * ST->getMaxPrivateElementSize();
+  default:
+    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+        (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
+         AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
+         (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
+          AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
+      return 128;
+    llvm_unreachable("unhandled address space");
+  }
+}
+
+unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // Semi-arbitrary large amount.
+  return 64;
+}
+
+int AMDGPUTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+
+  EVT OrigTy = TLI->getValueType(DL, Ty);
+  if (!OrigTy.isSimple()) {
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
+  }
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  // Because we don't have any legal vector operations, but the legal types, we
+  // need to account for split vectors.
+  unsigned NElts = LT.second.isVector() ?
+    LT.second.getVectorNumElements() : 1;
+
+  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
+
+  switch (ISD) {
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA: {
+    if (SLT == MVT::i64)
+      return get64BitInstrCost() * LT.first * NElts;
+
+    // i32
+    return getFullRateInstrCost() * LT.first * NElts;
+  }
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    if (SLT == MVT::i64){
+      // and, or and xor are typically split into 2 VALU instructions.
+      return 2 * getFullRateInstrCost() * LT.first * NElts;
+    }
+
+    return LT.first * NElts * getFullRateInstrCost();
+  }
+  case ISD::MUL: {
+    const int QuarterRateCost = getQuarterRateInstrCost();
+    if (SLT == MVT::i64) {
+      const int FullRateCost = getFullRateInstrCost();
+      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
+    }
+
+    // i32
+    return QuarterRateCost * NElts * LT.first;
+  }
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+    if (SLT == MVT::f64)
+      return LT.first * NElts * get64BitInstrCost();
+
+    if (SLT == MVT::f32 || SLT == MVT::f16)
+      return LT.first * NElts * getFullRateInstrCost();
+    break;
+
+  case ISD::FDIV:
+  case ISD::FREM:
+    // FIXME: frem should be handled separately. The fdiv in it is most of it,
+    // but the current lowering is also not entirely correct.
+    if (SLT == MVT::f64) {
+      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+
+      // Add cost of workaround.
+      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
+        Cost += 3 * getFullRateInstrCost();
+
+      return LT.first * Cost * NElts;
+    }
+
+    // Assuming no fp32 denormals lowering.
+    if (SLT == MVT::f32 || SLT == MVT::f16) {
+      assert(!ST->hasFP32Denormals() && "will change when supported");
+      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+      return LT.first * NElts * Cost;
+    }
+
+    break;
+  default:
+    break;
+  }
+
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
+}
+
+unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+  // XXX - For some reason this isn't called for switch.
+  switch (Opcode) {
+  case Instruction::Br:
+  case Instruction::Ret:
+    return 10;
+  default:
+    return BaseT::getCFInstrCost(Opcode);
+  }
+}
+
+int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                      unsigned Index) {
+  switch (Opcode) {
+  case Instruction::ExtractElement:
+  case Instruction::InsertElement:
+    // Extracts are just reads of a subregister, so are free. Inserts are
+    // considered free because we don't want to have any cost for scalarizing
+    // operations, and we don't have to copy into a different register class.
+
+    // Dynamic indexing isn't free and is best avoided.
+    return Index == ~0u ? 2 : 0;
+  default:
+    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+  }
+}
+
+static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
+                                          const IntrinsicInst *I) {
+  switch (I->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::not_intrinsic:
+    // This means we have an intrinsic that isn't defined in
+    // IntrinsicsAMDGPU.td
+    break;
+
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::amdgcn_interp_mov:
+  case Intrinsic::amdgcn_interp_p1:
+  case Intrinsic::amdgcn_interp_p2:
+  case Intrinsic::amdgcn_mbcnt_hi:
+  case Intrinsic::amdgcn_mbcnt_lo:
+  case Intrinsic::r600_read_tidig_x:
+  case Intrinsic::r600_read_tidig_y:
+  case Intrinsic::r600_read_tidig_z:
+  case Intrinsic::amdgcn_image_atomic_swap:
+  case Intrinsic::amdgcn_image_atomic_add:
+  case Intrinsic::amdgcn_image_atomic_sub:
+  case Intrinsic::amdgcn_image_atomic_smin:
+  case Intrinsic::amdgcn_image_atomic_umin:
+  case Intrinsic::amdgcn_image_atomic_smax:
+  case Intrinsic::amdgcn_image_atomic_umax:
+  case Intrinsic::amdgcn_image_atomic_and:
+  case Intrinsic::amdgcn_image_atomic_or:
+  case Intrinsic::amdgcn_image_atomic_xor:
+  case Intrinsic::amdgcn_image_atomic_inc:
+  case Intrinsic::amdgcn_image_atomic_dec:
+  case Intrinsic::amdgcn_image_atomic_cmpswap:
+  case Intrinsic::amdgcn_buffer_atomic_swap:
+  case Intrinsic::amdgcn_buffer_atomic_add:
+  case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_buffer_atomic_smin:
+  case Intrinsic::amdgcn_buffer_atomic_umin:
+  case Intrinsic::amdgcn_buffer_atomic_smax:
+  case Intrinsic::amdgcn_buffer_atomic_umax:
+  case Intrinsic::amdgcn_buffer_atomic_and:
+  case Intrinsic::amdgcn_buffer_atomic_or:
+  case Intrinsic::amdgcn_buffer_atomic_xor:
+  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_ps_live:
+    return true;
+  }
+
+  StringRef Name = I->getCalledFunction()->getName();
+  switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
+  default:
+    return false;
+  case AMDGPUIntrinsic::SI_fs_interp:
+  case AMDGPUIntrinsic::SI_fs_constant:
+    return true;
+  }
+}
+
+static bool isArgPassedInSGPR(const Argument *A) {
+  const Function *F = A->getParent();
+
+  // Arguments to compute shaders are never a source of divergence.
+  if (!AMDGPU::isShader(F->getCallingConv()))
+    return true;
+
+  // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
+  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
+      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+    return true;
+
+  // Everything else is in VGPRs.
+  return false;
+}
+
+///
+/// \returns true if the result of the value could potentially be
+/// different across workitems in a wavefront.
+bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+
+  if (const Argument *A = dyn_cast<Argument>(V))
+    return !isArgPassedInSGPR(A);
+
+  // Loads from the private address space are divergent, because threads
+  // can execute the load instruction with the same inputs and get different
+  // results.
+  //
+  // All other loads are not divergent, because if threads issue loads with the
+  // same arguments, they will always get the same result.
+  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
+    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+  // Atomics are divergent because they are executed sequentially: when an
+  // atomic operation refers to the same address in each thread, then each
+  // thread after the first sees the value written by the previous thread as
+  // original value.
+  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
+    return true;
+
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    const TargetMachine &TM = getTLI()->getTargetMachine();
+    return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
+  }
+
+  // Assume all function calls are a source of divergence.
+  if (isa<CallInst>(V) || isa<InvokeInst>(V))
+    return true;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
new file mode 100644
index 000000000000..1177007644ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -0,0 +1,98 @@
+//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// AMDGPU target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+class AMDGPUTargetLowering;
+
+class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
+  typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+
+  const AMDGPUSubtarget *getST() const { return ST; }
+  const AMDGPUTargetLowering *getTLI() const { return TLI; }
+
+
+  static inline int getFullRateInstrCost() {
+    return TargetTransformInfo::TCC_Basic;
+  }
+
+  static inline int getHalfRateInstrCost() {
+    return 2 * TargetTransformInfo::TCC_Basic;
+  }
+
+  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
+  // should be 2 or 4.
+  static inline int getQuarterRateInstrCost() {
+    return 3 * TargetTransformInfo::TCC_Basic;
+  }
+
+   // On some parts, normal fp64 operations are half rate, and others
+   // quarter. This also applies to some integer operations.
+  inline int get64BitInstrCost() const {
+    return ST->hasHalfRate64Ops() ?
+      getHalfRateInstrCost() : getQuarterRateInstrCost();
+  }
+
+public:
+  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      ST(TM->getSubtargetImpl(F)),
+      TLI(ST->getTargetLowering()) {}
+
+  bool hasBranchDivergence() { return true; }
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+    return TTI::PSK_FastHardware;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+  unsigned getMaxInterleaveFactor(unsigned VF);
+
+  int getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty,
+    TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+    TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+    TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+    TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  unsigned getCFInstrCost(unsigned Opcode);
+
+  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+  bool isSourceOfDivergence(const Value *V) const;
+
+  unsigned getVectorSplitCost() { return 0; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
new file mode 100644
index 000000000000..bf501a1e8405
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -0,0 +1,149 @@
+//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This pass that unifies multiple OpenCL metadata due to linking.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+  namespace kOCLMD {
+    const char SpirVer[]            = "opencl.spir.version";
+    const char OCLVer[]             = "opencl.ocl.version";
+    const char UsedExt[]            = "opencl.used.extensions";
+    const char UsedOptCoreFeat[]    = "opencl.used.optional.core.features";
+    const char CompilerOptions[]    = "opencl.compiler.options";
+    const char LLVMIdent[]          = "llvm.ident";
+  }
+
+  /// \brief Unify multiple OpenCL metadata due to linking.
+  class AMDGPUUnifyMetadata : public FunctionPass {
+  public:
+    static char ID;
+    explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {};
+
+  private:
+    // This should really be a module pass but we have to run it as early
+    // as possible, so given function passes are executed first and
+    // TargetMachine::addEarlyAsPossiblePasses() expects only function passes
+    // it has to be a function pass.
+    virtual bool runOnModule(Module &M);
+
+    // \todo: Convert to a module pass.
+    virtual bool runOnFunction(Function &F);
+
+    /// \brief Unify version metadata.
+    /// \return true if changes are made.
+    /// Assume the named metadata has operands each of which is a pair of
+    /// integer constant, e.g.
+    /// !Name = {!n1, !n2}
+    /// !n1 = {i32 1, i32 2}
+    /// !n2 = {i32 2, i32 0}
+    /// Keep the largest version as the sole operand if PickFirst is false.
+    /// Otherwise pick it from the first value, representing kernel module.
+    bool unifyVersionMD(Module &M, StringRef Name, bool PickFirst) {
+      auto NamedMD = M.getNamedMetadata(Name);
+      if (!NamedMD || NamedMD->getNumOperands() <= 1)
+        return false;
+      MDNode *MaxMD = nullptr;
+      auto MaxVer = 0U;
+      for (const auto &VersionMD : NamedMD->operands()) {
+        assert(VersionMD->getNumOperands() == 2);
+        auto CMajor = mdconst::extract<ConstantInt>(VersionMD->getOperand(0));
+        auto VersionMajor = CMajor->getZExtValue();
+        auto CMinor = mdconst::extract<ConstantInt>(VersionMD->getOperand(1));
+        auto VersionMinor = CMinor->getZExtValue();
+        auto Ver = (VersionMajor * 100) + (VersionMinor * 10);
+        if (Ver > MaxVer) {
+          MaxVer = Ver;
+          MaxMD = VersionMD;
+        }
+        if (PickFirst)
+          break;
+      }
+      NamedMD->eraseFromParent();
+      NamedMD = M.getOrInsertNamedMetadata(Name);
+      NamedMD->addOperand(MaxMD);
+      return true;
+    }
+
+  /// \brief Unify version metadata.
+  /// \return true if changes are made.
+  /// Assume the named metadata has operands each of which is a list e.g.
+  /// !Name = {!n1, !n2}
+  /// !n1 = !{!"cl_khr_fp16", {!"cl_khr_fp64"}}
+  /// !n2 = !{!"cl_khr_image"}
+  /// Combine it into a single list with unique operands.
+  bool unifyExtensionMD(Module &M, StringRef Name) {
+    auto NamedMD = M.getNamedMetadata(Name);
+    if (!NamedMD || NamedMD->getNumOperands() == 1)
+      return false;
+
+    SmallVector<Metadata *, 4> All;
+    for (const auto &MD : NamedMD->operands())
+      for (const auto &Op : MD->operands())
+        if (std::find(All.begin(), All.end(), Op.get()) == All.end())
+          All.push_back(Op.get());
+
+    NamedMD->eraseFromParent();
+    NamedMD = M.getOrInsertNamedMetadata(Name);
+    for (const auto &MD : All)
+      NamedMD->addOperand(MDNode::get(M.getContext(), MD));
+
+    return true;
+  }
+};
+
+} // end anonymous namespace
+
+char AMDGPUUnifyMetadata::ID = 0;
+
+char &llvm::AMDGPUUnifyMetadataID = AMDGPUUnifyMetadata::ID;
+
+INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
+                "Unify multiple OpenCL metadata due to linking",
+                false, false)
+
+FunctionPass* llvm::createAMDGPUUnifyMetadataPass() {
+  return new AMDGPUUnifyMetadata();
+}
+
+bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
+  const char* Vers[] = {
+      kOCLMD::SpirVer,
+      kOCLMD::OCLVer
+  };
+  const char* Exts[] = {
+      kOCLMD::UsedExt,
+      kOCLMD::UsedOptCoreFeat,
+      kOCLMD::CompilerOptions,
+      kOCLMD::LLVMIdent
+  };
+
+  bool Changed = false;
+
+  for (auto &I : Vers)
+    Changed |= unifyVersionMD(M, I, true);
+
+  for (auto &I : Exts)
+    Changed |= unifyExtensionMD(M, I);
+
+  return Changed;
+}
+
+bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
+  return runOnModule(*F.getParent());
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
new file mode 100644
index 000000000000..7faeccdc5df3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -0,0 +1,1745 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <deque>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "structcfg"
+
+#define DEFAULT_VEC_SLOTS 8
+
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+namespace llvm {
+  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace {
+#define SHOWNEWINSTR(i) \
+  DEBUG(dbgs() << "New instr: " << *i << "\n");
+
+#define SHOWNEWBLK(b, msg) \
+DEBUG( \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  dbgs() << "\n"; \
+);
+
+#define SHOWBLK_DETAIL(b, msg) \
+DEBUG( \
+  if (b) { \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(dbgs()); \
+  dbgs() << "\n"; \
+  } \
+);
+
+#define INVALIDSCCNUM -1
+
+template<class NodeT>
+void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+
+namespace {
+
+class BlockInformation {
+public:
+  bool IsRetired;
+  int  SccNum;
+  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
+public:
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  enum PathToKind {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  };
+
+  static char ID;
+
+  AMDGPUCFGStructurizer() :
+      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Perform the CFG structurization
+  bool run();
+
+  /// Perform the CFG preparation
+  /// This step will remove every unconditionnal/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = MF.getSubtarget<R600Subtarget>().getInstrInfo();
+    TRI = &TII->getRegisterInfo();
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    Visited.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
+
+protected:
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
+        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      dbgs() << "BB" << (*iterBlk)->getNumber();
+      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+    }
+  }
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop::iterator iter = LoopInfo.begin(),
+         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
+      (*iter)->print(dbgs(), 0);
+    }
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+      bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+      MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I,
+                              MachineBasicBlock &MBB);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originally from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+                      const DebugLoc &DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+                                  const DebugLoc &DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+                              const DebugLoc &DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I, int NewOpcode,
+                              int RegNum, const DebugLoc &DL);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+      MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction.  Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+      MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+      MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+      MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+
+
+private:
+  MBBInfoMap BlockInfoMap;
+  LoopLandInfoMap LLInfoMap;
+  std::map<MachineLoop *, bool> Visited;
+  MachineFunction *FuncRep;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return INVALIDSCCNUM;
+  return (*It).second->SccNum;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
+  if (It == LLInfoMap.end())
+    return nullptr;
+  return (*It).second;
+}
+
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  if (!LoopRep)
+    return false;
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  return MBB->isSuccessor(LoopHeader);
+}
+
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return false;
+  return (*It).second->IsRetired;
+}
+
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  while (LoopRep && LoopRep->getHeader() == MBB) {
+    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
+    if(!LoopLand)
+      return true;
+    if (!isRetiredBlock(LoopLand))
+      return true;
+    LoopRep = LoopRep->getParentLoop();
+  }
+  return false;
+}
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  if (SrcMBB == DstMBB)
+    return SinglePath_InPath;
+  while (SrcMBB && SrcMBB->succ_size() == 1) {
+    SrcMBB = *SrcMBB->succ_begin();
+    if (SrcMBB == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
+      return Not_SinglePath;
+  }
+  if (SrcMBB && SrcMBB->succ_size()==0)
+    return SinglePath_NotInPath;
+  return Not_SinglePath;
+}
+
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Count = 0;
+  while (It != E) {
+    if (!isRetiredBlock(*It))
+      ++Count;
+    ++It;
+  }
+  return Count;
+}
+
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  unsigned BlockSizeThreshold = 30;
+  unsigned CloneInstrThreshold = 100;
+  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
+
+  if(!MultiplePreds)
+    return false;
+  unsigned BlkSize = MBB->size();
+  return ((BlkSize > BlockSizeThreshold) &&
+      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
+}
+
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I, MachineBasicBlock &MBB) {
+  assert(I.isValid() && "Expected valid iterator");
+  for (;; --I) {
+    if (I == MBB.end())
+      continue;
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (I->getOperand(2).getImm()) {
+      case AMDGPU::PRED_SETE_INT:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT);
+        return;
+      case AMDGPU::PRED_SETNE_INT:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT);
+        return;
+      case AMDGPU::PRED_SETE:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETNE);
+        return;
+      case AMDGPU::PRED_SETNE:
+        I->getOperand(2).setImm(AMDGPU::PRED_SETE);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
+
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+                                           int NewOpcode, const DebugLoc &DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(MI);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(MI);
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+                                                       int NewOpcode,
+                                                       const DebugLoc &DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() != MBB->end())
+    MBB->insert(MBB->begin(), MI);
+  else
+    MBB->push_back(MI);
+  SHOWNEWINSTR(MI);
+  return MI;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineInstr *NewMBB =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->insert(I, NewMBB);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(NewMBB);
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->insert(I, NewMI);
+  MachineInstrBuilder MIB(*MF, NewMI);
+  MIB.addReg(OldMI->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(NewMI);
+  //erase later oldInstr->eraseFromParent();
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode,
+    int RegNum, const DebugLoc &DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  //insert before
+  blk->insert(I, NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  default: llvm_unreachable("internal error");
+  };
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+  default: llvm_unreachable("internal error");
+  }
+  return -1;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  return MI->getOperand(0).getMBB();
+}
+
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+    MachineBasicBlock *MBB) {
+  MI->getOperand(0).setMBB(MBB);
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+    MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Next = It;
+  ++Next;
+  return (*It == TrueBranch) ? *Next : *It;
+}
+
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::JUMP:
+  case AMDGPU::BRANCH:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  //get DebugLoc from the first MachineBasicBlock instruction with debug info
+  DebugLoc DL;
+  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
+      ++It) {
+    MachineInstr *instr = &(*It);
+    if (instr->getDebugLoc())
+      DL = instr->getDebugLoc();
+  }
+  return DL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  MachineInstr *MI = &*It;
+  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
+    return MI;
+  return nullptr;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
+      It != E; ++It) {
+    // FIXME: Simplify
+    MachineInstr *MI = &*It;
+    if (MI) {
+      if (isCondBranch(MI) || isUncondBranch(MI))
+        return MI;
+      else if (!TII->isMov(MI->getOpcode()))
+        break;
+    }
+  }
+  return nullptr;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *instr = &(*It);
+    if (instr->getOpcode() == AMDGPU::RETURN)
+      return instr;
+  }
+  return nullptr;
+}
+
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *MI = getReturnInstr(MBB);
+  bool IsReturn = (MBB->succ_size() == 0);
+  if (MI)
+    assert(IsReturn);
+  else if (IsReturn)
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  return  IsReturn;
+}
+
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
+       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
+    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *Func = MBB->getParent();
+  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
+  Func->push_back(NewMBB);  //insert to function
+  for (const MachineInstr &It : *MBB)
+    NewMBB->push_back(Func->CloneMachineInstr(&It));
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
+  if (BranchMI && isCondBranch(BranchMI) &&
+      getTrueBranch(BranchMI) == OldMBB)
+    setTrueBranch(BranchMI, NewBlk);
+}
+
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+   //collect continue right before endloop
+   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
+   MachineBasicBlock::iterator Pre = MBB->begin();
+   MachineBasicBlock::iterator E = MBB->end();
+   MachineBasicBlock::iterator It = Pre;
+   while (It != E) {
+     if (Pre->getOpcode() == AMDGPU::CONTINUE
+         && It->getOpcode() == AMDGPU::ENDLOOP)
+       ContInstr.push_back(&*Pre);
+     Pre = It;
+     ++It;
+   }
+
+   //delete continue right before endloop
+   for (unsigned i = 0; i < ContInstr.size(); ++i)
+      ContInstr[i]->eraseFromParent();
+
+   // TODO to fix up jump table so later phase won't be confused.  if
+   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+   // there isn't such an interface yet.  alternatively, replace all the other
+   // blocks in the jump table with the entryBlk //}
+
+}
+
+
+bool AMDGPUCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
+
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loop that don't have one
+  for (MachineLoopInfo::iterator It = MLI->begin(),
+       E = MLI->end(); It != E; ++It) {
+    MachineLoop *LoopRep = (*It);
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
+       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
+  }
+
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUCFGStructurizer::run() {
+
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
+
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
+
+  do {
+    ++NumIter;
+    DEBUG(
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
+    );
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = nullptr;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int SccNumIter;     // Number of iteration in this SCC.
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        DEBUG(
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+              dbgs() << "\n";
+        );
+      }
+
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
+
+      ++It;
+
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
+        // Just finish one scc.
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
+          DEBUG(
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
+            dbgs() << "doesn't make any progress\n";
+          );
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
+          DEBUG(
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << '\n';
+          );
+        } else {
+          // Finish the current scc.
+          ContNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        ContNextScc = false;
+      }
+
+      if (ContNextScc)
+        SccBeginMBB = nullptr;
+    } //while, "one iteration" over the function.
+
+    MachineBasicBlock *EntryMBB =
+        *GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
+      DEBUG(
+        dbgs() << "Reduce to one block\n";
+      );
+    } else {
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
+      // consider cloned blocks ??
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
+      } else {
+        MakeProgress = false;
+        DEBUG(
+          dbgs() << "No progress\n";
+        );
+      }
+    }
+  } while (!Finish && MakeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+
+  // Detach retired Block, release memory.
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+      It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
+      DEBUG(
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
+      );
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*It).second;
+  }
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
+
+  if (!Finish) {
+    DEBUG(FuncRep->viewCFG());
+    report_fatal_error("IRREDUCIBLE_CFG");
+  }
+
+  return true;
+}
+
+
+
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccNum = 0;
+  MachineBasicBlock *MBB;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
+       ++It, ++SccNum) {
+    const std::vector<MachineBasicBlock *> &SccNext = *It;
+    for (std::vector<MachineBasicBlock *>::const_iterator
+         blockIter = SccNext.begin(), blockEnd = SccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      MBB = *blockIter;
+      OrderedBlks.push_back(MBB);
+      recordSccnum(MBB, SccNum);
+    }
+  }
+
+  //walk through all the block in func to check for unreachable
+  typedef GraphTraits<MachineFunction *> GTM;
+  auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    SccNum = getSCCNum(MBB);
+    if (SccNum == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
+  }
+}
+
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  int CurMatch;
+
+  DEBUG(
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
+  );
+
+  while ((CurMatch = patternMatchGroup(MBB)) > 0)
+    NumMatch += CurMatch;
+
+  DEBUG(
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << NumMatch << "\n";
+  );
+
+  return NumMatch;
+}
+
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  NumMatch += loopendPatternMatch();
+  NumMatch += serialPatternMatch(MBB);
+  NumMatch += ifPatternMatch(MBB);
+  return NumMatch;
+}
+
+
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 1)
+    return 0;
+
+  MachineBasicBlock *childBlk = *MBB->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
+    return 0;
+
+  mergeSerialBlock(MBB, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+}
+
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
+  //two edges
+  if (MBB->succ_size() != 2)
+    return 0;
+  if (hasBackEdge(MBB))
+    return 0;
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  if (!BranchMI)
+    return 0;
+
+  assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
+
+  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
+  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
+  MachineBasicBlock *LandBlk;
+  int Cloned = 0;
+
+  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
+  // TODO: Simplify
+  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+    // Diamond pattern
+    LandBlk = *TrueMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+    // Triangle pattern, false is empty
+    LandBlk = FalseMBB;
+    FalseMBB = nullptr;
+  } else if (FalseMBB->succ_size() == 1
+             && *FalseMBB->succ_begin() == TrueMBB) {
+    // Triangle pattern, true is empty
+    // We reverse the predicate to make a triangle, empty false pattern;
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end(), *MBB);
+    LandBlk = FalseMBB;
+    FalseMBB = nullptr;
+  } else if (FalseMBB->succ_size() == 1
+             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+    LandBlk = *FalseMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1
+    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+    LandBlk = *TrueMBB->succ_begin();
+  } else {
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+  }
+
+  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
+  // new BB created for landBlk==NULL may introduce new challenge to the
+  // reduction process.
+  if (LandBlk &&
+      ((TrueMBB && TrueMBB->pred_size() > 1)
+      || (FalseMBB && FalseMBB->pred_size() > 1))) {
+     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
+  }
+
+  if (TrueMBB && TrueMBB->pred_size() > 1) {
+    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+    ++Cloned;
+  }
+
+  if (FalseMBB && FalseMBB->pred_size() > 1) {
+    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+    ++Cloned;
+  }
+
+  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += Cloned;
+
+  return 1 + Cloned + NumMatch;
+}
+
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  std::deque<MachineLoop *> NestedLoops;
+  for (auto &It: *MLI)
+    for (MachineLoop *ML : depth_first(It))
+      NestedLoops.push_front(ML);
+
+  if (NestedLoops.size() == 0)
+    return 0;
+
+  // Process nested loop outside->inside (we did push_front),
+  // so "continue" to a outside loop won't be mistaken as "break"
+  // of the current loop.
+  int Num = 0;
+  for (MachineLoop *ExaminedLoop : NestedLoops) {
+    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
+      continue;
+    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+    int NumBreak = mergeLoop(ExaminedLoop);
+    if (NumBreak == -1)
+      break;
+    Num += NumBreak;
+  }
+  return Num;
+}
+
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+  // We assume a single ExitBlk
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+    ExitBlkSet.insert(ExitBlks[i]);
+  assert(ExitBlkSet.size() == 1);
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop has several exit block");
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
+      PE = InvMBBTraits::child_end(LoopHeader);
+  for (; PI != PE; PI++) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+    settleLoopcontBlock(LatchBlks[i], LoopHeader);
+  int Match = 0;
+  do {
+    Match = 0;
+    Match += serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+  } while (Match > 0);
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+  MachineLoop *ParentLoop = LoopRep->getParentLoop();
+  if (ParentLoop)
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
+
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() == 0) {
+    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
+      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+      if (TheEntry) {
+        DEBUG(
+          dbgs() << "isLoopContBreakBlock yes src1 = BB"
+                 << Src1MBB->getNumber()
+                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
+        );
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Num == 0) {
+    DEBUG(
+      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    );
+    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
+  }
+  return Num;
+}
+
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = 0;
+  MachineBasicBlock *DownBlk;
+
+  //trueBlk could be the common post dominator
+  DownBlk = TrueMBB;
+
+  DEBUG(
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
+  );
+
+  while (DownBlk) {
+    DEBUG(
+      dbgs() << "check down = BB" << DownBlk->getNumber();
+    );
+
+    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
+      DEBUG(
+        dbgs() << " working\n";
+      );
+
+      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
+
+      numClonedBlock += Num;
+      Num += serialPatternMatch(*HeadMBB->succ_begin());
+      Num += serialPatternMatch(*std::next(HeadMBB->succ_begin()));
+      Num += ifPatternMatch(HeadMBB);
+      assert(Num > 0);
+
+      break;
+    }
+    DEBUG(
+      dbgs() << " not working\n";
+    );
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
+  } // walk down the postDomTree
+
+  return Num;
+}
+
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  if (Detail) {
+    dbgs() << "\n";
+    HeadMBB->print(dbgs());
+    dbgs() << "\n";
+  }
+
+  if (TrueMBB) {
+    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
+           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      TrueMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (FalseMBB) {
+    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
+           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      FalseMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (LandMBB) {
+    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
+           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      LandMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+
+    dbgs() << "\n";
+}
+
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+    MachineBasicBlock **LandMBBPtr) {
+  bool MigrateTrue = false;
+  bool MigrateFalse = false;
+
+  MachineBasicBlock *LandBlk = *LandMBBPtr;
+
+  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+         && (!FalseMBB || FalseMBB->succ_size() <= 1));
+
+  if (TrueMBB == FalseMBB)
+    return 0;
+
+  MigrateTrue = needMigrateBlock(TrueMBB);
+  MigrateFalse = needMigrateBlock(FalseMBB);
+
+  if (!MigrateTrue && !MigrateFalse)
+    return 0;
+
+  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
+  // have more than one predecessors.  without doing this, its predecessor
+  // rather than headBlk will have undefined value in initReg.
+  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+    MigrateTrue = true;
+  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+    MigrateFalse = true;
+
+  DEBUG(
+    dbgs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the about if-else inside
+  //      if (initReg !=2) {...}
+  //
+  // add initReg = initVal to headBlk
+
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    //
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing it.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order insert these MOV instruction, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
+  }
+
+  int NumNewBlk = 0;
+
+  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
+
+  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+  if (LandBlkHasOtherPred) {
+    report_fatal_error("Extra register needed to handle CFG");
+    unsigned CmpResReg =
+      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+    report_fatal_error("Extra compare instruction needed to handle CFG");
+    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+        CmpResReg, DebugLoc());
+  }
+
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+      DebugLoc());
+
+  if (MigrateTrue) {
+    migrateInstruction(TrueMBB, LandBlk, I);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 1).
+    report_fatal_error("Extra register needed to handle CFG");
+  }
+  insertInstrBefore(I, AMDGPU::ELSE);
+
+  if (MigrateFalse) {
+    migrateInstruction(FalseMBB, LandBlk, I);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 0)
+    report_fatal_error("Extra register needed to handle CFG");
+  }
+
+  if (LandBlkHasOtherPred) {
+    // add endif
+    insertInstrBefore(I, AMDGPU::ENDIF);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+         PE = LandBlk->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *MBB = *PI;
+      if (MBB != TrueMBB && MBB != FalseMBB)
+        report_fatal_error("Extra register needed to handle CFG");
+    }
+  }
+  DEBUG(
+    dbgs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
+
+  // update landBlk
+  *LandMBBPtr = LandBlk;
+
+  return NumNewBlk;
+}
+
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  DEBUG(
+    dbgs() << "serialPattern BB" << DstMBB->getNumber()
+           << " <= BB" << SrcMBB->getNumber() << "\n";
+  );
+  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
+
+  DstMBB->removeSuccessor(SrcMBB, true);
+  cloneSuccessorList(DstMBB, SrcMBB);
+
+  removeSuccessor(SrcMBB);
+  MLI->removeBlock(SrcMBB);
+  retireBlock(SrcMBB);
+}
+
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
+  DEBUG(
+    dbgs() << "ifPattern BB" << MBB->getNumber();
+    dbgs() << "{  ";
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
+    }
+    dbgs() << "  } else ";
+    dbgs() << "{  ";
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
+    }
+    dbgs() << "  }\n ";
+    dbgs() << "landBlock: ";
+    if (!LandMBB) {
+      dbgs() << "NULL";
+    } else {
+      dbgs() << "BB" << LandMBB->getNumber();
+    }
+    dbgs() << "\n";
+  );
+
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+      BranchDL);
+
+  if (TrueMBB) {
+    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+    MBB->removeSuccessor(TrueMBB, true);
+    if (LandMBB && TrueMBB->succ_size()!=0)
+      TrueMBB->removeSuccessor(LandMBB, true);
+    retireBlock(TrueMBB);
+    MLI->removeBlock(TrueMBB);
+  }
+
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    MBB->splice(I, FalseMBB, FalseMBB->begin(),
+                   FalseMBB->end());
+    MBB->removeSuccessor(FalseMBB, true);
+    if (LandMBB && FalseMBB->succ_size() != 0)
+      FalseMBB->removeSuccessor(LandMBB, true);
+    retireBlock(FalseMBB);
+    MLI->removeBlock(FalseMBB);
+  }
+  insertInstrBefore(I, AMDGPU::ENDIF);
+
+  BranchMI->eraseFromParent();
+
+  if (LandMBB && TrueMBB && FalseMBB)
+    MBB->addSuccessor(LandMBB);
+
+}
+
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+  DstBlk->replaceSuccessor(DstBlk, LandMBB);
+}
+
+
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DebugLoc DL = BranchMI->getDebugLoc();
+  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+  MachineBasicBlock::iterator I = BranchMI;
+  if (TrueBranch != LandMBB)
+    reversePredicateSetter(I, *I->getParent());
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
+  //now branchInst can be erase safely
+  BranchMI->eraseFromParent();
+  //now take care of successors, retire blocks
+  ExitingMBB->removeSuccessor(LandMBB, true);
+}
+
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (MI) {
+    assert(isCondBranch(MI));
+    MachineBasicBlock::iterator I = MI;
+    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+    int OldOpcode = MI->getOpcode();
+    DebugLoc DL = MI->getDebugLoc();
+
+    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+    if (!UseContinueLogical) {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+          getBranchZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
+    } else {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+          getContinueZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+    }
+
+    MI->eraseFromParent();
+  } else {
+    // if we've arrived here then we've already erased the branch instruction
+    // travel back up the basic block to see the last reference of our debug
+    // location we've just inserted that reference here so it should be
+    // representative insertEnd to ensure phi-moves, if exist, go before the
+    // continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+        getLastDebugLocInBB(ContingMBB));
+  }
+}
+
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  int Cloned = 0;
+  assert(PreMBB->isSuccessor(SrcMBB));
+  while (SrcMBB && SrcMBB != DstMBB) {
+    assert(SrcMBB->succ_size() == 1);
+    if (SrcMBB->pred_size() > 1) {
+      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+      ++Cloned;
+    }
+
+    PreMBB = SrcMBB;
+    SrcMBB = *SrcMBB->succ_begin();
+  }
+
+  return Cloned;
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  assert(PredMBB->isSuccessor(MBB) &&
+         "succBlk is not a prececessor of curBlk");
+
+  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
+  //srcBlk, oldBlk, newBlk
+
+  PredMBB->replaceSuccessor(MBB, CloneMBB);
+
+  // add all successor to cloneBlk
+  cloneSuccessorList(CloneMBB, MBB);
+
+  numClonedInstr += MBB->size();
+
+  DEBUG(
+    dbgs() << "Cloned block: " << "BB"
+           << MBB->getNumber() << "size " << MBB->size() << "\n";
+  );
+
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
+
+  return CloneMBB;
+}
+
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator SpliceEnd;
+  //look for the input branchinstr, not the AMDGPU branchinstr
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+  if (!BranchMI) {
+    DEBUG(
+      dbgs() << "migrateInstruction don't see branch instr\n" ;
+    );
+    SpliceEnd = SrcMBB->end();
+  } else {
+    DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
+    SpliceEnd = BranchMI;
+  }
+  DEBUG(
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
+
+  //splice insert before insertPos
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
+
+  DEBUG(
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << '\n';
+  );
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
+
+  if (!LoopHeader || !LoopLatch)
+    return nullptr;
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  // Is LoopRep an infinite loop ?
+  if (!BranchMI || !isUncondBranch(BranchMI))
+    return nullptr;
+
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+  LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext();
+  Ctx.emitError("Extra register needed to handle CFG");
+  return nullptr;
+}
+
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  MachineInstr *BranchMI;
+
+  // I saw two unconditional branch in one basic block in example
+  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
+  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
+          && isUncondBranch(BranchMI)) {
+    DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
+    BranchMI->eraseFromParent();
+  }
+}
+
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+  MachineBasicBlock *MBB1 = *MBB->succ_begin();
+  MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin());
+  if (MBB1 != MBB2)
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1, true);
+}
+
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
+       E = RetMBB.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    MachineInstr *MI = getReturnInstr(MBB);
+    if (MI)
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
+    DEBUG(
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+             << " successors\n";
+    );
+  }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
+}
+
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (MBB->succ_size())
+    MBB->removeSuccessor(*MBB->succ_begin());
+}
+
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
+  if (!srcBlkInfo)
+    srcBlkInfo = new BlockInformation();
+  srcBlkInfo->SccNum = SccNum;
+}
+
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+  DEBUG(
+        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
+  );
+
+  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
+
+  if (!SrcBlkInfo)
+    SrcBlkInfo = new BlockInformation();
+
+  SrcBlkInfo->IsRetired = true;
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
+         && "can't retire block yet");
+}
+
+char AMDGPUCFGStructurizer::ID = 0;
+
+} // end anonymous namespace
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+  return new AMDGPUCFGStructurizer();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
new file mode 100644
index 000000000000..5d243e949fd3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -0,0 +1,658 @@
+//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeT.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODET_H
+#define AMDKERNELCODET_H
+
+#include "llvm/MC/SubtargetFeature.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "llvm/Support/Debug.h"
+//---------------------------------------------------------------------------//
+// AMD Kernel Code, and its dependencies                                     //
+//---------------------------------------------------------------------------//
+
+typedef uint8_t hsa_powertwo8_t;
+typedef uint32_t hsa_ext_code_kind_t;
+typedef uint8_t hsa_ext_brig_profile8_t;
+typedef uint8_t hsa_ext_brig_machine_model8_t;
+typedef uint64_t hsa_ext_control_directive_present64_t;
+typedef uint16_t hsa_ext_exception_kind16_t;
+typedef uint32_t hsa_ext_code_kind32_t;
+
+typedef struct hsa_dim3_s {
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+} hsa_dim3_t;
+
+/// The version of the amd_*_code_t struct. Minor versions must be
+/// backward compatible.
+typedef uint32_t amd_code_version32_t;
+enum amd_code_version_t {
+  AMD_CODE_VERSION_MAJOR = 0,
+  AMD_CODE_VERSION_MINOR = 1
+};
+
+// Sets val bits for specified mask in specified dst packed instance.
+#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
+  dst &= (~(1 << mask ## _SHIFT) & ~mask);                                     \
+  dst |= (((val) << mask ## _SHIFT) & mask)
+
+// Gets bits for specified mask from specified src packed instance.
+#define AMD_HSA_BITS_GET(src, mask)                                            \
+  ((src & mask) >> mask ## _SHIFT)                                             \
+
+/// The values used to define the number of bytes to use for the
+/// swizzle element size.
+enum amd_element_byte_size_t {
+  AMD_ELEMENT_2_BYTES = 0,
+  AMD_ELEMENT_4_BYTES = 1,
+  AMD_ELEMENT_8_BYTES = 2,
+  AMD_ELEMENT_16_BYTES = 3
+};
+
+/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+/// COMPUTE_PGM_RSRC2 registers.
+typedef uint64_t amd_compute_pgm_resource_register64_t;
+
+/// Every amd_*_code_t has the following properties, which are composed of
+/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
+/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount
+/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
+///
+/// (Note that bit fields cannot be used as their layout is
+/// implementation defined in the C standard and so cannot be used to
+/// specify an ABI)
+typedef uint32_t amd_code_property32_t;
+enum amd_code_property_mask_t {
+
+  /// Enable the setup of the SGPR user data registers
+  /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+  /// for initial register state.
+  ///
+  /// The total number of SGPRuser data registers requested must not
+  /// exceed 16. Any requests beyond 16 will be ignored.
+  ///
+  /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+  /// SGPR user data registers enabled up to 16).
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+  AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
+  AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+  AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
+
+  /// Control wave ID base counter for GDS ordered-append. Used to set
+  /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+  /// ORDERED_APPEND_MODE also needs to be settable)
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+  /// The interleave (swizzle) element size in bytes required by the
+  /// code for private memory. This must be 2, 4, 8 or 16. This value
+  /// is provided to the finalizer when it is invoked and is recorded
+  /// here. The hardware will interleave the memory requests of each
+  /// lane of a wavefront by this element size to ensure each
+  /// work-item gets a distinct memory memory location. Therefore, the
+  /// finalizer ensures that all load and store operations done to
+  /// private memory do not exceed this size. For example, if the
+  /// element size is 4 (32-bits or dword) and a 64-bit value must be
+  /// loaded, the finalizer will generate two 32-bit loads. This
+  /// ensures that the interleaving will get the work-item
+  /// specific dword for both halves of the 64-bit value. If it just
+  /// did a 64-bit load then it would get one dword which belonged to
+  /// its own work-item, but the second dword would belong to the
+  /// adjacent lane work-item since the interleaving is in dwords.
+  ///
+  /// The value used must match the value that the runtime configures
+  /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+  /// is generally DWORD.
+  ///
+  /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+  /// Are global memory addresses 64 bits. Must match
+  /// amd_kernel_code_t.hsail_machine_model ==
+  /// HSA_MACHINE_LARGE. Must also match
+  /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+  /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
+  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+  /// Indicate if the generated ISA is using a dynamically sized call
+  /// stack. This can happen if calls are implemented using a call
+  /// stack and recursion, alloca or calls to indirect functions are
+  /// present. In these cases the Finalizer cannot compute the total
+  /// private segment size at compile time. In this case the
+  /// workitem_private_segment_byte_size only specifies the statically
+  /// know private segment size, and additional space must be added
+  /// for the call stack.
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+  /// Indicate if code generated has support for debugging.
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
+
+  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
+  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
+  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
+
+  AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
+  AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
+  AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
+};
+
+/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
+/// control directives. These control how the finalizer generates code. This
+/// struct is used both as an argument to hsaFinalizeKernel to specify values for
+/// the control directives, and is used in HsaKernelCode to record the values of
+/// the control directives that the finalize used when generating the code which
+/// either came from the finalizer argument or explicit HSAIL control
+/// directives. See the definition of the control directives in HSA Programmer's
+/// Reference Manual which also defines how the values specified as finalizer
+/// arguments have to agree with the control directives in the HSAIL code.
+typedef struct hsa_ext_control_directives_s {
+  /// This is a bit set indicating which control directives have been
+  /// specified. If the value is 0 then there are no control directives specified
+  /// and the rest of the fields can be ignored. The bits are accessed using the
+  /// hsa_ext_control_directives_present_mask_t. Any control directive that is not
+  /// enabled in this bit set must have the value of all 0s.
+  hsa_ext_control_directive_present64_t enabled_control_directives;
+
+  /// If enableBreakExceptions is not enabled then must be 0, otherwise must be
+  /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK
+  /// policy enabled. If this set is not empty then the generated code may have
+  /// lower performance than if the set is empty. If the kernel being finalized
+  /// has any enablebreakexceptions control directives, then the values specified
+  /// by this argument are unioned with the values in these control
+  /// directives. If any of the functions the kernel calls have an
+  /// enablebreakexceptions control directive, then they must be equal or a
+  /// subset of, this union.
+  hsa_ext_exception_kind16_t enable_break_exceptions;
+
+  /// If enableDetectExceptions is not enabled then must be 0, otherwise must be
+  /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT
+  /// policy enabled. If this set is not empty then the generated code may have
+  /// lower performance than if the set is empty. However, an implementation
+  /// should endeavour to make the performance impact small. If the kernel being
+  /// finalized has any enabledetectexceptions control directives, then the
+  /// values specified by this argument are unioned with the values in these
+  /// control directives. If any of the functions the kernel calls have an
+  /// enabledetectexceptions control directive, then they must be equal or a
+  /// subset of, this union.
+  hsa_ext_exception_kind16_t enable_detect_exceptions;
+
+  /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of
+  /// dynamic group segment can be allocated for a dispatch, otherwise the value
+  /// specifies the maximum number of bytes of dynamic group segment that can be
+  /// allocated for a dispatch. If the kernel being finalized has any
+  /// maxdynamicsize control directives, then the values must be the same, and
+  /// must be the same as this argument if it is enabled. This value can be used
+  /// by the finalizer to determine the maximum number of bytes of group memory
+  /// used by each work-group by adding this value to the group memory required
+  /// for all group segment variables used by the kernel and all functions it
+  /// calls, and group memory used to implement other HSAIL features such as
+  /// fbarriers and the detect exception operations. This can allow the finalizer
+  /// to determine the expected number of work-groups that can be executed by a
+  /// compute unit and allow more resources to be allocated to the work-items if
+  /// it is known that fewer work-groups can be executed due to group memory
+  /// limitations.
+  uint32_t max_dynamic_group_size;
+
+  /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater
+  /// than 0. See HSA Programmer's Reference Manual description of
+  /// maxflatgridsize control directive.
+  uint32_t max_flat_grid_size;
+
+  /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be
+  /// greater than 0. See HSA Programmer's Reference Manual description of
+  /// maxflatworkgroupsize control directive.
+  uint32_t max_flat_workgroup_size;
+
+  /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the
+  /// finalizer is free to generate ISA that may result in any number of
+  /// work-groups executing on a single compute unit. Otherwise, the finalizer
+  /// should attempt to generate ISA that will allow the specified number of
+  /// work-groups to execute on a single compute unit. This is only a hint and
+  /// can be ignored by the finalizer. If the kernel being finalized, or any of
+  /// the functions it calls, has a requested control directive, then the values
+  /// must be the same. This can be used to determine the number of resources
+  /// that should be allocated to a single work-group and work-item. For example,
+  /// a low value may allow more resources to be allocated, resulting in higher
+  /// per work-item performance, as it is known there will never be more than the
+  /// specified number of work-groups actually executing on the compute
+  /// unit. Conversely, a high value may allocate fewer resources, resulting in
+  /// lower per work-item performance, which is offset by the fact it allows more
+  /// work-groups to actually execute on the compute unit.
+  uint32_t requested_workgroups_per_cu;
+
+  /// If not enabled then all elements for Dim3 must be 0, otherwise every
+  /// element must be greater than 0. See HSA Programmer's Reference Manual
+  /// description of requiredgridsize control directive.
+  hsa_dim3_t required_grid_size;
+
+  /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be
+  /// 0, and the produced code can be dispatched with any legal work-group range
+  /// consistent with the dispatch dimensions. Otherwise, the code produced must
+  /// always be dispatched with the specified work-group range. No element of the
+  /// specified range must be 0. It must be consistent with required_dimensions
+  /// and max_flat_workgroup_size. If the kernel being finalized, or any of the
+  /// functions it calls, has a requiredworkgroupsize control directive, then the
+  /// values must be the same. Specifying a value can allow the finalizer to
+  /// optimize work-group id operations, and if the number of work-items in the
+  /// work-group is less than the WAVESIZE then barrier operations can be
+  /// optimized to just a memory fence.
+  hsa_dim3_t required_workgroup_size;
+
+  /// If requiredDim is not enabled then must be 0 and the produced kernel code
+  /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is
+  /// 1..3 and the code produced must only be dispatched with a dimension that
+  /// matches. Other values are illegal. If the kernel being finalized, or any of
+  /// the functions it calls, has a requireddimsize control directive, then the
+  /// values must be the same. This can be used to optimize the code generated to
+  /// compute the absolute and flat work-group and work-item id, and the dim
+  /// HSAIL operations.
+  uint8_t required_dim;
+
+  /// Reserved. Must be 0.
+  uint8_t reserved[75];
+} hsa_ext_control_directives_t;
+
+/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
+/// Code Object to set up the hardware to execute the kernel dispatch.
+///
+/// Initial Kernel Register State.
+///
+/// Initial kernel register state will be set up by CP/SPI prior to the start
+/// of execution of every wavefront. This is limited by the constraints of the
+/// current hardware.
+///
+/// The order of the SGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enable_sgpr_* bit fields. The register numbers used for enabled registers
+/// are dense starting at SGPR0: the first enabled register is SGPR0, the next
+/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR
+/// number.
+///
+/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and
+/// apply to all waves of the grid. It is possible to specify more than 16 User
+/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
+/// are actually initialized. These are then immediately followed by the System
+/// SGPRs that are set up by ADC/SPI and can have different values for each wave
+/// of the grid dispatch.
+///
+/// SGPR register initial state is defined as follows:
+///
+/// Private Segment Buffer (enable_sgpr_private_segment_buffer):
+///   Number of User SGPR registers: 4. V# that can be used, together with
+///   Scratch Wave Offset as an offset, to access the Private/Spill/Arg
+///   segments using a segment address. It must be set as follows:
+///     - Base address: of the scratch memory area used by the dispatch. It
+///       does not include the scratch wave offset. It will be the per process
+///       SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
+///       example there may be a per pipe offset, or per AQL Queue offset).
+///     - Stride + data_format: Element Size * Index Stride (???)
+///     - Cache swizzle: ???
+///     - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
+///       scratch)
+///     - Num records: Flat Scratch Work Item Size / Element Size (???)
+///     - Dst_sel_*: ???
+///     - Num_format: ???
+///     - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
+///       agree with amd_kernel_code_t.privateElementSize)
+///     - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
+///       be number of wavefront lanes for scratch, must agree with
+///       amd_kernel_code_t.wavefrontSize)
+///     - Add tid enable: 1
+///     - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
+///     - Hash_enable: ???
+///     - Heap: ???
+///     - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
+///     - Type: 0 (a buffer) (???)
+///
+/// Dispatch Ptr (enable_sgpr_dispatch_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
+///   for kernel actually executing.
+///
+/// Queue Ptr (enable_sgpr_queue_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
+///   AQL queue on which the dispatch packet was queued.
+///
+/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
+///   Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
+///   is directly copied from the kernargPtr in the dispatch packet. Having CP
+///   load it once avoids loading it at the beginning of every wavefront.
+///
+/// Dispatch Id (enable_sgpr_dispatch_id):
+///   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
+///   packet being executed.
+///
+/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
+///   Number of User SGPR registers: 2. This is 2 SGPRs.
+///
+///   For CI/VI:
+///     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
+///     to base of memory for scratch for this dispatch. This is the same offset
+///     used in computing the Scratch Segment Buffer base address. The value of
+///     Scratch Wave Offset must be added by the kernel code and moved to
+///     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
+///
+///     The second SGPR is 32 bit byte size of a single work-item's scratch
+///     memory usage. This is directly loaded from the dispatch packet Private
+///     Segment Byte Size and rounded up to a multiple of DWORD.
+///
+///     \todo [Does CP need to round this to >4 byte alignment?]
+///
+///     The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
+///     flat memory instructions. Having CP load it once avoids loading it at
+///     the beginning of every wavefront.
+///
+///   For PI:
+///     This is the 64 bit base address of the scratch backing memory for
+///     allocated by CP for this dispatch.
+///
+/// Private Segment Size (enable_sgpr_private_segment_size):
+///   Number of User SGPR registers: 1. The 32 bit byte size of a single
+///   work-item's scratch memory allocation. This is the value from the dispatch
+///   packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
+///
+///   \todo [Does CP need to round this to >4 byte alignment?]
+///
+///   Having CP load it once avoids loading it at the beginning of every
+///   wavefront.
+///
+///   \todo [This will not be used for CI/VI since it is the same value as
+///   the second SGPR of Flat Scratch Init. However, it is need for PI which
+///   changes meaning of Flat Scratchg Init..]
+///
+/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the X dimension for the grid being executed. Computed from
+///   the fields in the HsaDispatchPacket as
+///   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
+///
+/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the Y dimension for the grid being executed. Computed from
+///   the fields in the HsaDispatchPacket as
+///   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
+///
+///   Only initialized if <16 previous SGPRs initialized.
+///
+/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
+///   Number of User SGPR registers: 1. 32 bit count of the number of
+///   work-groups in the Z dimension for the grid being executed. Computed
+///   from the fields in the HsaDispatchPacket as
+///   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
+///
+///   Only initialized if <16 previous SGPRs initialized.
+///
+/// Work-Group Id X (enable_sgpr_workgroup_id_x):
+///   Number of System SGPR registers: 1. 32 bit work group id in X dimension
+///   of grid for wavefront. Always present.
+///
+/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
+///   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
+///   of grid for wavefront.
+///
+/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
+///   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
+///   of grid for wavefront. If present then Work-group Id Y will also be
+///   present
+///
+/// Work-Group Info (enable_sgpr_workgroup_info):
+///   Number of System SGPR registers: 1. {first_wave, 14'b0000,
+///   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
+///
+/// Private Segment Wave Byte Offset
+/// (enable_sgpr_private_segment_wave_byte_offset):
+///   Number of System SGPR registers: 1. 32 bit byte offset from base of
+///   dispatch scratch base. Must be used as an offset with Private/Spill/Arg
+///   segment address when using Scratch Segment Buffer. It must be added to
+///   Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
+///
+///
+/// The order of the VGPR registers is defined, but the Finalizer can specify
+/// which ones are actually setup in the amd_kernel_code_t object using the
+/// enableVgpr*  bit fields. The register numbers used for enabled registers
+/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
+/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR
+/// number.
+///
+/// VGPR register initial state is defined as follows:
+///
+/// Work-Item Id X (always initialized):
+///   Number of registers: 1. 32 bit work item id in X dimension of work-group
+///   for wavefront lane.
+///
+/// Work-Item Id X (enable_vgpr_workitem_id > 0):
+///   Number of registers: 1. 32 bit work item id in Y dimension of work-group
+///   for wavefront lane.
+///
+/// Work-Item Id X (enable_vgpr_workitem_id > 0):
+///   Number of registers: 1. 32 bit work item id in Z dimension of work-group
+///   for wavefront lane.
+///
+///
+/// The setting of registers is being done by existing GPU hardware as follows:
+///   1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
+///      registers.
+///   2) Work-group Id registers X, Y, Z are set by SPI which supports any
+///      combination including none.
+///   3) Scratch Wave Offset is also set by SPI which is why its value cannot
+///      be added into the value Flat Scratch Offset which would avoid the
+///      Finalizer generated prolog having to do the add.
+///   4) The VGPRs are set by SPI which only supports specifying either (X),
+///      (X, Y) or (X, Y, Z).
+///
+/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so
+/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
+/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
+///
+/// The global segment can be accessed either using flat operations or buffer
+/// operations. If buffer operations are used then the Global Buffer used to
+/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a
+/// segment address is not passed into the kernel code by CP since its base
+/// address is always 0. Instead the Finalizer generates prolog code to
+/// initialize 4 SGPRs with a V# that has the following properties, and then
+/// uses that in the buffer instructions:
+///   - base address of 0
+///   - no swizzle
+///   - ATC=1
+///   - MTYPE set to support memory coherence specified in
+///     amd_kernel_code_t.globalMemoryCoherence
+///
+/// When the Global Buffer is used to access the Kernarg segment, must add the
+/// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
+/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
+/// the kernarg segment is constant for the duration of the kernel execution.
+///
+
+typedef struct amd_kernel_code_s {
+  uint32_t amd_kernel_code_version_major;
+  uint32_t amd_kernel_code_version_minor;
+  uint16_t amd_machine_kind;
+  uint16_t amd_machine_version_major;
+  uint16_t amd_machine_version_minor;
+  uint16_t amd_machine_version_stepping;
+
+  /// Byte offset (possibly negative) from start of amd_kernel_code_t
+  /// object to kernel's entry point instruction. The actual code for
+  /// the kernel is required to be 256 byte aligned to match hardware
+  /// requirements (SQ cache line is 16). The code must be position
+  /// independent code (PIC) for AMD devices to give runtime the
+  /// option of copying code to discrete GPU memory or APU L2
+  /// cache. The Finalizer should endeavour to allocate all kernel
+  /// machine code in contiguous memory pages so that a device
+  /// pre-fetcher will tend to only pre-fetch Kernel Code objects,
+  /// improving cache performance.
+  int64_t kernel_code_entry_byte_offset;
+
+  /// Range of bytes to consider prefetching expressed as an offset
+  /// and size. The offset is from the start (possibly negative) of
+  /// amd_kernel_code_t object. Set both to 0 if no prefetch
+  /// information is available.
+  int64_t kernel_code_prefetch_byte_offset;
+  uint64_t kernel_code_prefetch_byte_size;
+
+  /// Number of bytes of scratch backing memory required for full
+  /// occupancy of target chip. This takes into account the number of
+  /// bytes of scratch per work-item, the wavefront size, the maximum
+  /// number of wavefronts per CU, and the number of CUs. This is an
+  /// upper limit on scratch. If the grid being dispatched is small it
+  /// may only need less than this. If the kernel uses no scratch, or
+  /// the Finalizer has not computed this value, it must be 0.
+  uint64_t max_scratch_backing_memory_byte_size;
+
+  /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+  /// COMPUTE_PGM_RSRC2 registers.
+  uint64_t compute_pgm_resource_registers;
+
+  /// Code properties. See amd_code_property_mask_t for a full list of
+  /// properties.
+  uint32_t code_properties;
+
+  /// The amount of memory required for the combined private, spill
+  /// and arg segments for a work-item in bytes. If
+  /// is_dynamic_callstack is 1 then additional space must be added to
+  /// this value for the call stack.
+  uint32_t workitem_private_segment_byte_size;
+
+  /// The amount of group segment memory required by a work-group in
+  /// bytes. This does not include any dynamically allocated group
+  /// segment memory that may be added when the kernel is
+  /// dispatched.
+  uint32_t workgroup_group_segment_byte_size;
+
+  /// Number of byte of GDS required by kernel dispatch. Must be 0 if
+  /// not using GDS.
+  uint32_t gds_segment_byte_size;
+
+  /// The size in bytes of the kernarg segment that holds the values
+  /// of the arguments to the kernel. This could be used by CP to
+  /// prefetch the kernarg segment pointed to by the dispatch packet.
+  uint64_t kernarg_segment_byte_size;
+
+  /// Number of fbarrier's used in the kernel and all functions it
+  /// calls. If the implementation uses group memory to allocate the
+  /// fbarriers then that amount must already be included in the
+  /// workgroup_group_segment_byte_size total.
+  uint32_t workgroup_fbarrier_count;
+
+  /// Number of scalar registers used by a wavefront. This includes
+  /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+  /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
+  /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+  uint16_t wavefront_sgpr_count;
+
+  /// Number of vector registers used by each work-item. Used to set
+  /// COMPUTE_PGM_RSRC1.VGPRS.
+  uint16_t workitem_vgpr_count;
+
+  /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed VGPR number reserved.
+  uint16_t reserved_vgpr_first;
+
+  /// The number of consecutive VGPRs reserved by the client. If
+  /// is_debug_supported then this count includes VGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_vgpr_count;
+
+  /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
+  /// first fixed SGPR number reserved.
+  uint16_t reserved_sgpr_first;
+
+  /// The number of consecutive SGPRs reserved by the client. If
+  /// is_debug_supported then this count includes SGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_sgpr_count;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number used to hold the wave scratch offset for the
+  /// entire kernel execution, or uint16_t(-1) if the register is not
+  /// used or not known.
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+  /// If is_debug_supported is 0 then must be 0. Otherwise, this is the
+  /// fixed SGPR number of the first of 4 SGPRs used to hold the
+  /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+  /// if the registers are not used or not known.
+  uint16_t debug_private_segment_buffer_sgpr;
+
+  /// The maximum byte alignment of variables used by the kernel in
+  /// the specified memory segment. Expressed as a power of two. Must
+  /// be at least HSA_POWERTWO_16.
+  uint8_t kernarg_segment_alignment;
+  uint8_t group_segment_alignment;
+  uint8_t private_segment_alignment;
+
+  /// Wavefront size expressed as a power of two. Must be a power of 2
+  /// in range 1..64 inclusive. Used to support runtime query that
+  /// obtains wavefront size, which may be used by application to
+  /// allocated dynamic group memory and set the dispatch work-group
+  /// size.
+  uint8_t wavefront_size;
+
+  int32_t call_convention;
+  uint8_t reserved3[12];
+  uint64_t runtime_loader_kernel_symbol;
+  uint64_t control_directives[16];
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 000000000000..a6c31629e7c4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,3599 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeT.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+namespace {
+
+class AMDGPUAsmParser;
+
+enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL };
+
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
+class AMDGPUOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate,
+    Register,
+    Expression
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  const AMDGPUAsmParser *AsmParser;
+
+public:
+  AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+    : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
+
+  typedef std::unique_ptr<AMDGPUOperand> Ptr;
+
+  struct Modifiers {
+    bool Abs = false;
+    bool Neg = false;
+    bool Sext = false;
+
+    bool hasFPModifiers() const { return Abs || Neg; }
+    bool hasIntModifiers() const { return Sext; }
+    bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); }
+
+    int64_t getFPModifiersOperand() const {
+      int64_t Operand = 0;
+      Operand |= Abs ? SISrcMods::ABS : 0;
+      Operand |= Neg ? SISrcMods::NEG : 0;
+      return Operand;
+    }
+
+    int64_t getIntModifiersOperand() const {
+      int64_t Operand = 0;
+      Operand |= Sext ? SISrcMods::SEXT : 0;
+      return Operand;
+    }
+
+    int64_t getModifiersOperand() const {
+      assert(!(hasFPModifiers() && hasIntModifiers())
+           && "fp and int modifiers should not be used simultaneously");
+      if (hasFPModifiers()) {
+        return getFPModifiersOperand();
+      } else if (hasIntModifiers()) {
+        return getIntModifiersOperand();
+      } else {
+        return 0;
+      }
+    }
+
+    friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods);
+  };
+
+  enum ImmTy {
+    ImmTyNone,
+    ImmTyGDS,
+    ImmTyOffen,
+    ImmTyIdxen,
+    ImmTyAddr64,
+    ImmTyOffset,
+    ImmTyOffset0,
+    ImmTyOffset1,
+    ImmTyGLC,
+    ImmTySLC,
+    ImmTyTFE,
+    ImmTyClampSI,
+    ImmTyOModSI,
+    ImmTyDppCtrl,
+    ImmTyDppRowMask,
+    ImmTyDppBankMask,
+    ImmTyDppBoundCtrl,
+    ImmTySdwaDstSel,
+    ImmTySdwaSrc0Sel,
+    ImmTySdwaSrc1Sel,
+    ImmTySdwaDstUnused,
+    ImmTyDMask,
+    ImmTyUNorm,
+    ImmTyDA,
+    ImmTyR128,
+    ImmTyLWE,
+    ImmTyExpTgt,
+    ImmTyExpCompr,
+    ImmTyExpVM,
+    ImmTyHwreg,
+    ImmTyOff,
+    ImmTySendMsg,
+    ImmTyInterpSlot,
+    ImmTyInterpAttr,
+    ImmTyAttrChan
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+    ImmTy Type;
+    bool IsFPImm;
+    Modifiers Mods;
+  };
+
+  struct RegOp {
+    unsigned RegNo;
+    bool IsForcedVOP3;
+    Modifiers Mods;
+  };
+
+  union {
+    TokOp Tok;
+    ImmOp Imm;
+    RegOp Reg;
+    const MCExpr *Expr;
+  };
+
+  bool isToken() const override {
+    if (Kind == Token)
+      return true;
+
+    if (Kind != Expression || !Expr)
+      return false;
+
+    // When parsing operands, we can't always tell if something was meant to be
+    // a token, like 'gds', or an expression that references a global variable.
+    // In this case, we assume the string is an expression, and if we need to
+    // interpret is a token, then we treat the symbol name as the token.
+    return isa<MCSymbolRefExpr>(Expr);
+  }
+
+  bool isImm() const override {
+    return Kind == Immediate;
+  }
+
+  bool isInlinableImm(MVT type) const;
+  bool isLiteralImm(MVT type) const;
+
+  bool isRegKind() const {
+    return Kind == Register;
+  }
+
+  bool isReg() const override {
+    return isRegKind() && !Reg.Mods.hasModifiers();
+  }
+
+  bool isRegOrImmWithInputMods(MVT type) const {
+    return isRegKind() || isInlinableImm(type);
+  }
+
+  bool isRegOrImmWithInt16InputMods() const {
+    return isRegOrImmWithInputMods(MVT::i16);
+  }
+
+  bool isRegOrImmWithInt32InputMods() const {
+    return isRegOrImmWithInputMods(MVT::i32);
+  }
+
+  bool isRegOrImmWithInt64InputMods() const {
+    return isRegOrImmWithInputMods(MVT::i64);
+  }
+
+  bool isRegOrImmWithFP16InputMods() const {
+    return isRegOrImmWithInputMods(MVT::f16);
+  }
+
+  bool isRegOrImmWithFP32InputMods() const {
+    return isRegOrImmWithInputMods(MVT::f32);
+  }
+
+  bool isRegOrImmWithFP64InputMods() const {
+    return isRegOrImmWithInputMods(MVT::f64);
+  }
+
+  bool isVReg32OrOff() const {
+    return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
+  }
+
+  bool isImmTy(ImmTy ImmT) const {
+    return isImm() && Imm.Type == ImmT;
+  }
+
+  bool isImmModifier() const {
+    return isImm() && Imm.Type != ImmTyNone;
+  }
+
+  bool isClampSI() const { return isImmTy(ImmTyClampSI); }
+  bool isOModSI() const { return isImmTy(ImmTyOModSI); }
+  bool isDMask() const { return isImmTy(ImmTyDMask); }
+  bool isUNorm() const { return isImmTy(ImmTyUNorm); }
+  bool isDA() const { return isImmTy(ImmTyDA); }
+  bool isR128() const { return isImmTy(ImmTyUNorm); }
+  bool isLWE() const { return isImmTy(ImmTyLWE); }
+  bool isOff() const { return isImmTy(ImmTyOff); }
+  bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
+  bool isExpVM() const { return isImmTy(ImmTyExpVM); }
+  bool isExpCompr() const { return isImmTy(ImmTyExpCompr); }
+  bool isOffen() const { return isImmTy(ImmTyOffen); }
+  bool isIdxen() const { return isImmTy(ImmTyIdxen); }
+  bool isAddr64() const { return isImmTy(ImmTyAddr64); }
+  bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
+  bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
+  bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+  bool isGDS() const { return isImmTy(ImmTyGDS); }
+  bool isGLC() const { return isImmTy(ImmTyGLC); }
+  bool isSLC() const { return isImmTy(ImmTySLC); }
+  bool isTFE() const { return isImmTy(ImmTyTFE); }
+  bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
+  bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
+  bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
+  bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
+  bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
+  bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
+  bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
+  bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
+  bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
+  bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+
+  bool isMod() const {
+    return isClampSI() || isOModSI();
+  }
+
+  bool isRegOrImm() const {
+    return isReg() || isImm();
+  }
+
+  bool isRegClass(unsigned RCID) const;
+
+  bool isSCSrcB16() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
+  }
+
+  bool isSCSrcB32() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
+  }
+
+  bool isSCSrcB64() const {
+    return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
+  }
+
+  bool isSCSrcF16() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
+  }
+
+  bool isSCSrcF32() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
+  }
+
+  bool isSCSrcF64() const {
+    return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64);
+  }
+
+  bool isSSrcB32() const {
+    return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
+  }
+
+  bool isSSrcB16() const {
+    return isSCSrcB16() || isLiteralImm(MVT::i16);
+  }
+
+  bool isSSrcB64() const {
+    // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
+    // See isVSrc64().
+    return isSCSrcB64() || isLiteralImm(MVT::i64);
+  }
+
+  bool isSSrcF32() const {
+    return isSCSrcB32() || isLiteralImm(MVT::f32) || isExpr();
+  }
+
+  bool isSSrcF64() const {
+    return isSCSrcB64() || isLiteralImm(MVT::f64);
+  }
+
+  bool isSSrcF16() const {
+    return isSCSrcB16() || isLiteralImm(MVT::f16);
+  }
+
+  bool isVCSrcB32() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
+  }
+
+  bool isVCSrcB64() const {
+    return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
+  }
+
+  bool isVCSrcB16() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
+  }
+
+  bool isVCSrcF32() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
+  }
+
+  bool isVCSrcF64() const {
+    return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
+  }
+
+  bool isVCSrcF16() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
+  }
+
+  bool isVSrcB32() const {
+    return isVCSrcF32() || isLiteralImm(MVT::i32);
+  }
+
+  bool isVSrcB64() const {
+    return isVCSrcF64() || isLiteralImm(MVT::i64);
+  }
+
+  bool isVSrcB16() const {
+    return isVCSrcF16() || isLiteralImm(MVT::i16);
+  }
+
+  bool isVSrcF32() const {
+    return isVCSrcF32() || isLiteralImm(MVT::f32);
+  }
+
+  bool isVSrcF64() const {
+    return isVCSrcF64() || isLiteralImm(MVT::f64);
+  }
+
+  bool isVSrcF16() const {
+    return isVCSrcF16() || isLiteralImm(MVT::f16);
+  }
+
+  bool isKImmFP32() const {
+    return isLiteralImm(MVT::f32);
+  }
+
+  bool isKImmFP16() const {
+    return isLiteralImm(MVT::f16);
+  }
+
+  bool isMem() const override {
+    return false;
+  }
+
+  bool isExpr() const {
+    return Kind == Expression;
+  }
+
+  bool isSoppBrTarget() const {
+    return isExpr() || isImm();
+  }
+
+  bool isSWaitCnt() const;
+  bool isHwreg() const;
+  bool isSendMsg() const;
+  bool isSMRDOffset8() const;
+  bool isSMRDOffset20() const;
+  bool isSMRDLiteralOffset() const;
+  bool isDPPCtrl() const;
+  bool isGPRIdxMode() const;
+
+  StringRef getExpressionAsToken() const {
+    assert(isExpr());
+    const MCSymbolRefExpr *S = cast<MCSymbolRefExpr>(Expr);
+    return S->getSymbol().getName();
+  }
+
+  StringRef getToken() const {
+    assert(isToken());
+
+    if (Kind == Expression)
+      return getExpressionAsToken();
+
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  int64_t getImm() const {
+    assert(isImm());
+    return Imm.Val;
+  }
+
+  enum ImmTy getImmTy() const {
+    assert(isImm());
+    return Imm.Type;
+  }
+
+  unsigned getReg() const override {
+    return Reg.RegNo;
+  }
+
+  SMLoc getStartLoc() const override {
+    return StartLoc;
+  }
+
+  SMLoc getEndLoc() const override {
+    return EndLoc;
+  }
+
+  Modifiers getModifiers() const {
+    assert(isRegKind() || isImmTy(ImmTyNone));
+    return isRegKind() ? Reg.Mods : Imm.Mods;
+  }
+
+  void setModifiers(Modifiers Mods) {
+    assert(isRegKind() || isImmTy(ImmTyNone));
+    if (isRegKind())
+      Reg.Mods = Mods;
+    else
+      Imm.Mods = Mods;
+  }
+
+  bool hasModifiers() const {
+    return getModifiers().hasModifiers();
+  }
+
+  bool hasFPModifiers() const {
+    return getModifiers().hasFPModifiers();
+  }
+
+  bool hasIntModifiers() const {
+    return getModifiers().hasIntModifiers();
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
+
+  void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+
+  template <unsigned Bitwidth>
+  void addKImmFPOperands(MCInst &Inst, unsigned N) const;
+
+  void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
+    addKImmFPOperands<16>(Inst, N);
+  }
+
+  void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
+    addKImmFPOperands<32>(Inst, N);
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const;
+
+  void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
+    if (isRegKind())
+      addRegOperands(Inst, N);
+    else if (isExpr())
+      Inst.addOperand(MCOperand::createExpr(Expr));
+    else
+      addImmOperands(Inst, N);
+  }
+
+  void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const {
+    Modifiers Mods = getModifiers();
+    Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand()));
+    if (isRegKind()) {
+      addRegOperands(Inst, N);
+    } else {
+      addImmOperands(Inst, N, false);
+    }
+  }
+
+  void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const {
+    assert(!hasIntModifiers());
+    addRegOrImmWithInputModsOperands(Inst, N);
+  }
+
+  void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const {
+    assert(!hasFPModifiers());
+    addRegOrImmWithInputModsOperands(Inst, N);
+  }
+
+  void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
+    if (isImm())
+      addImmOperands(Inst, N);
+    else {
+      assert(isExpr());
+      Inst.addOperand(MCOperand::createExpr(Expr));
+    }
+  }
+
+  static void printImmTy(raw_ostream& OS, ImmTy Type) {
+    switch (Type) {
+    case ImmTyNone: OS << "None"; break;
+    case ImmTyGDS: OS << "GDS"; break;
+    case ImmTyOffen: OS << "Offen"; break;
+    case ImmTyIdxen: OS << "Idxen"; break;
+    case ImmTyAddr64: OS << "Addr64"; break;
+    case ImmTyOffset: OS << "Offset"; break;
+    case ImmTyOffset0: OS << "Offset0"; break;
+    case ImmTyOffset1: OS << "Offset1"; break;
+    case ImmTyGLC: OS << "GLC"; break;
+    case ImmTySLC: OS << "SLC"; break;
+    case ImmTyTFE: OS << "TFE"; break;
+    case ImmTyClampSI: OS << "ClampSI"; break;
+    case ImmTyOModSI: OS << "OModSI"; break;
+    case ImmTyDppCtrl: OS << "DppCtrl"; break;
+    case ImmTyDppRowMask: OS << "DppRowMask"; break;
+    case ImmTyDppBankMask: OS << "DppBankMask"; break;
+    case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
+    case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
+    case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
+    case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
+    case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
+    case ImmTyDMask: OS << "DMask"; break;
+    case ImmTyUNorm: OS << "UNorm"; break;
+    case ImmTyDA: OS << "DA"; break;
+    case ImmTyR128: OS << "R128"; break;
+    case ImmTyLWE: OS << "LWE"; break;
+    case ImmTyOff: OS << "Off"; break;
+    case ImmTyExpTgt: OS << "ExpTgt"; break;
+    case ImmTyExpCompr: OS << "ExpCompr"; break;
+    case ImmTyExpVM: OS << "ExpVM"; break;
+    case ImmTyHwreg: OS << "Hwreg"; break;
+    case ImmTySendMsg: OS << "SendMsg"; break;
+    case ImmTyInterpSlot: OS << "InterpSlot"; break;
+    case ImmTyInterpAttr: OS << "InterpAttr"; break;
+    case ImmTyAttrChan: OS << "AttrChan"; break;
+    }
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case Register:
+      OS << "<register " << getReg() << " mods: " << Reg.Mods << '>';
+      break;
+    case Immediate:
+      OS << '<' << getImm();
+      if (getImmTy() != ImmTyNone) {
+        OS << " type: "; printImmTy(OS, getImmTy());
+      }
+      OS << " mods: " << Imm.Mods << '>';
+      break;
+    case Token:
+      OS << '\'' << getToken() << '\'';
+      break;
+    case Expression:
+      OS << "<expr " << *Expr << '>';
+      break;
+    }
+  }
+
+  static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
+                                      int64_t Val, SMLoc Loc,
+                                      enum ImmTy Type = ImmTyNone,
+                                      bool IsFPImm = false) {
+    auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
+    Op->Imm.Val = Val;
+    Op->Imm.IsFPImm = IsFPImm;
+    Op->Imm.Type = Type;
+    Op->Imm.Mods = Modifiers();
+    Op->StartLoc = Loc;
+    Op->EndLoc = Loc;
+    return Op;
+  }
+
+  static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser,
+                                        StringRef Str, SMLoc Loc,
+                                        bool HasExplicitEncodingSize = true) {
+    auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    Res->StartLoc = Loc;
+    Res->EndLoc = Loc;
+    return Res;
+  }
+
+  static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
+                                      unsigned RegNo, SMLoc S,
+                                      SMLoc E,
+                                      bool ForceVOP3) {
+    auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
+    Op->Reg.RegNo = RegNo;
+    Op->Reg.Mods = Modifiers();
+    Op->Reg.IsForcedVOP3 = ForceVOP3;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser,
+                                       const class MCExpr *Expr, SMLoc S) {
+    auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser);
+    Op->Expr = Expr;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+};
+
+raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
+  OS << "abs:" << Mods.Abs << " neg: " << Mods.Neg << " sext:" << Mods.Sext;
+  return OS;
+}
+
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
+// Holds info related to the current kernel, e.g. count of SGPRs used.
+// Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next
+// .amdgpu_hsa_kernel or at EOF.
+class KernelScopeInfo {
+  int SgprIndexUnusedMin;
+  int VgprIndexUnusedMin;
+  MCContext *Ctx;
+
+  void usesSgprAt(int i) {
+    if (i >= SgprIndexUnusedMin) {
+      SgprIndexUnusedMin = ++i;
+      if (Ctx) {
+        MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.sgpr_count"));
+        Sym->setVariableValue(MCConstantExpr::create(SgprIndexUnusedMin, *Ctx));
+      }
+    }
+  }
+  void usesVgprAt(int i) {
+    if (i >= VgprIndexUnusedMin) {
+      VgprIndexUnusedMin = ++i;
+      if (Ctx) {
+        MCSymbol * const Sym = Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+        Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+      }
+    }
+  }
+public:
+  KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr)
+  {}
+  void initialize(MCContext &Context) {
+    Ctx = &Context;
+    usesSgprAt(SgprIndexUnusedMin = -1);
+    usesVgprAt(VgprIndexUnusedMin = -1);
+  }
+  void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
+    switch (RegKind) {
+      case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
+      case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
+      default: break;
+    }
+  }
+};
+
+class AMDGPUAsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  MCAsmParser &Parser;
+
+  unsigned ForcedEncodingSize;
+  bool ForcedDPP;
+  bool ForcedSDWA;
+  KernelScopeInfo KernelScope;
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AMDGPUGenAsmMatcher.inc"
+
+  /// }
+
+private:
+  bool ParseAsAbsoluteExpression(uint32_t &Ret);
+  bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
+  bool ParseDirectiveHSACodeObjectVersion();
+  bool ParseDirectiveHSACodeObjectISA();
+  bool ParseDirectiveRuntimeMetadata();
+  bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
+  bool ParseDirectiveAMDKernelCodeT();
+  bool ParseSectionDirectiveHSAText();
+  bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+  bool ParseDirectiveAMDGPUHsaKernel();
+  bool ParseDirectiveAMDGPUHsaModuleGlobal();
+  bool ParseDirectiveAMDGPUHsaProgramGlobal();
+  bool ParseSectionDirectiveHSADataGlobalAgent();
+  bool ParseSectionDirectiveHSADataGlobalProgram();
+  bool ParseSectionDirectiveHSARodataReadonlyAgent();
+  bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
+  bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
+  void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
+
+public:
+  enum AMDGPUMatchResultTy {
+    Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
+  };
+
+  AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
+               const MCInstrInfo &MII,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
+        ForcedEncodingSize(0),
+        ForcedDPP(false),
+        ForcedSDWA(false) {
+    MCAsmParserExtension::Initialize(Parser);
+
+    if (getSTI().getFeatureBits().none()) {
+      // Set default features.
+      copySTI().ToggleFeature("SOUTHERN_ISLANDS");
+    }
+
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+    {
+      // TODO: make those pre-defined variables read-only.
+      // Currently there is none suitable machinery in the core llvm-mc for this.
+      // MCSymbol::isRedefinable is intended for another purpose, and
+      // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
+      AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+      MCContext &Ctx = getContext();
+      MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+      Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+      Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
+      Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+      Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
+      Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+    }
+    KernelScope.initialize(getContext());
+  }
+
+  bool isSI() const {
+    return AMDGPU::isSI(getSTI());
+  }
+
+  bool isCI() const {
+    return AMDGPU::isCI(getSTI());
+  }
+
+  bool isVI() const {
+    return AMDGPU::isVI(getSTI());
+  }
+
+  bool hasInv2PiInlineImm() const {
+    return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+  }
+
+  bool hasSGPR102_SGPR103() const {
+    return !isVI();
+  }
+
+  AMDGPUTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<AMDGPUTargetStreamer &>(TS);
+  }
+
+  const MCRegisterInfo *getMRI() const {
+    // We need this const_cast because for some reason getContext() is not const
+    // in MCAsmParser.
+    return const_cast<AMDGPUAsmParser*>(this)->getContext().getRegisterInfo();
+  }
+
+  const MCInstrInfo *getMII() const {
+    return &MII;
+  }
+
+  void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
+  void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
+  void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
+
+  unsigned getForcedEncodingSize() const { return ForcedEncodingSize; }
+  bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
+  bool isForcedDPP() const { return ForcedDPP; }
+  bool isForcedSDWA() const { return ForcedSDWA; }
+
+  std::unique_ptr<AMDGPUOperand> parseRegister();
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+  StringRef parseMnemonicSuffix(StringRef Name);
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  //bool ProcessInstruction(MCInst &Inst);
+
+  OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+  OperandMatchResultTy
+  parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
+                     enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+                     bool (*ConvertResult)(int64_t &) = nullptr);
+  OperandMatchResultTy
+  parseNamedBit(const char *Name, OperandVector &Operands,
+                enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+  OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
+                                             StringRef &Value);
+
+  OperandMatchResultTy parseImm(OperandVector &Operands);
+  OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+  OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands);
+  OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands);
+  OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
+
+  void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
+  void cvtDS(MCInst &Inst, const OperandVector &Operands);
+  void cvtExp(MCInst &Inst, const OperandVector &Operands);
+
+  bool parseCnt(int64_t &IntVal);
+  OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+  OperandMatchResultTy parseHwreg(OperandVector &Operands);
+
+private:
+  struct OperandInfoTy {
+    int64_t Id;
+    bool IsSymbolic;
+    OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { }
+  };
+
+  bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId);
+  bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width);
+
+  void errorExpTgt();
+  OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+
+public:
+  OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
+
+  OperandMatchResultTy parseExpTgt(OperandVector &Operands);
+  OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
+  OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
+  OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
+  OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+
+  void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
+  void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
+  void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+  AMDGPUOperand::Ptr defaultGLC() const;
+  AMDGPUOperand::Ptr defaultSLC() const;
+  AMDGPUOperand::Ptr defaultTFE() const;
+
+  AMDGPUOperand::Ptr defaultDMask() const;
+  AMDGPUOperand::Ptr defaultUNorm() const;
+  AMDGPUOperand::Ptr defaultDA() const;
+  AMDGPUOperand::Ptr defaultR128() const;
+  AMDGPUOperand::Ptr defaultLWE() const;
+  AMDGPUOperand::Ptr defaultSMRDOffset8() const;
+  AMDGPUOperand::Ptr defaultSMRDOffset20() const;
+  AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+
+  OperandMatchResultTy parseOModOperand(OperandVector &Operands);
+
+  void cvtId(MCInst &Inst, const OperandVector &Operands);
+  void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
+  void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
+
+  void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+  void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
+
+  OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+  AMDGPUOperand::Ptr defaultRowMask() const;
+  AMDGPUOperand::Ptr defaultBankMask() const;
+  AMDGPUOperand::Ptr defaultBoundCtrl() const;
+  void cvtDPP(MCInst &Inst, const OperandVector &Operands);
+
+  OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
+                                    AMDGPUOperand::ImmTy Type);
+  OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
+  void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
+  void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+  void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
+  void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+               uint64_t BasicInstType);
+};
+
+struct OptionalOperand {
+  const char *Name;
+  AMDGPUOperand::ImmTy Type;
+  bool IsBit;
+  bool (*ConvertResult)(int64_t&);
+};
+
+} // end anonymous namespace
+
+// May be called with integer type with equivalent bitwidth.
+static const fltSemantics *getFltSemantics(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return &APFloat::IEEEsingle();
+  case 8:
+    return &APFloat::IEEEdouble();
+  case 2:
+    return &APFloat::IEEEhalf();
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+}
+
+static const fltSemantics *getFltSemantics(MVT VT) {
+  return getFltSemantics(VT.getSizeInBits() / 8);
+}
+
+//===----------------------------------------------------------------------===//
+// Operand
+//===----------------------------------------------------------------------===//
+
+static bool canLosslesslyConvertToFPType(APFloat &FPLiteral, MVT VT) {
+  bool Lost;
+
+  // Convert literal to single precision
+  APFloat::opStatus Status = FPLiteral.convert(*getFltSemantics(VT),
+                                               APFloat::rmNearestTiesToEven,
+                                               &Lost);
+  // We allow precision lost but not overflow or underflow
+  if (Status != APFloat::opOK &&
+      Lost &&
+      ((Status & APFloat::opOverflow)  != 0 ||
+       (Status & APFloat::opUnderflow) != 0)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool AMDGPUOperand::isInlinableImm(MVT type) const {
+  if (!isImmTy(ImmTyNone)) {
+    // Only plain immediates are inlinable (e.g. "clamp" attribute is not)
+    return false;
+  }
+  // TODO: We should avoid using host float here. It would be better to
+  // check the float bit values which is what a few other places do.
+  // We've had bot failures before due to weird NaN support on mips hosts.
+
+  APInt Literal(64, Imm.Val);
+
+  if (Imm.IsFPImm) { // We got fp literal token
+    if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
+      return AMDGPU::isInlinableLiteral64(Imm.Val,
+                                          AsmParser->hasInv2PiInlineImm());
+    }
+
+    APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+    if (!canLosslesslyConvertToFPType(FPLiteral, type))
+      return false;
+
+    // Check if single precision literal is inlinable
+    return AMDGPU::isInlinableLiteral32(
+      static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+      AsmParser->hasInv2PiInlineImm());
+  }
+
+
+  // We got int literal token.
+  if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
+    return AMDGPU::isInlinableLiteral64(Imm.Val,
+                                        AsmParser->hasInv2PiInlineImm());
+  }
+
+  if (type.getScalarSizeInBits() == 16) {
+    return AMDGPU::isInlinableLiteral16(
+      static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
+      AsmParser->hasInv2PiInlineImm());
+  }
+
+  return AMDGPU::isInlinableLiteral32(
+    static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
+    AsmParser->hasInv2PiInlineImm());
+}
+
+bool AMDGPUOperand::isLiteralImm(MVT type) const {
+  // Check that this imediate can be added as literal
+  if (!isImmTy(ImmTyNone)) {
+    return false;
+  }
+
+  if (!Imm.IsFPImm) {
+    // We got int literal token.
+
+    unsigned Size = type.getSizeInBits();
+    if (Size == 64)
+      Size = 32;
+
+    // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
+    // types.
+    return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
+  }
+
+  // We got fp literal token
+  if (type == MVT::f64) { // Expected 64-bit fp operand
+    // We would set low 64-bits of literal to zeroes but we accept this literals
+    return true;
+  }
+
+  if (type == MVT::i64) { // Expected 64-bit int operand
+    // We don't allow fp literals in 64-bit integer instructions. It is
+    // unclear how we should encode them.
+    return false;
+  }
+
+  APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));
+  return canLosslesslyConvertToFPType(FPLiteral, type);
+}
+
+bool AMDGPUOperand::isRegClass(unsigned RCID) const {
+  return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
+}
+
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+  int64_t Val = Imm.Val;
+  if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) {
+    // Apply modifiers to immediate value. Only negate can get here
+    if (Imm.IsFPImm) {
+      APFloat F(BitsToDouble(Val));
+      F.changeSign();
+      Val = F.bitcastToAPInt().getZExtValue();
+    } else {
+      Val = -Val;
+    }
+  }
+
+  if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
+                             Inst.getNumOperands())) {
+    addLiteralImmOperand(Inst, Val);
+  } else {
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+}
+
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+  const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode());
+  auto OpNum = Inst.getNumOperands();
+  // Check that this operand accepts literals
+  assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
+
+  auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
+
+  if (Imm.IsFPImm) { // We got fp literal token
+    APInt Literal(64, Val);
+
+    switch (OpSize) {
+    case 8: {
+      if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
+                                       AsmParser->hasInv2PiInlineImm())) {
+        Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
+        return;
+      }
+
+      // Non-inlineable
+      if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
+        // For fp operands we check if low 32 bits are zeros
+        if (Literal.getLoBits(32) != 0) {
+          const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
+          "Can't encode literal as exact 64-bit floating-point operand. "
+          "Low 32-bits will be set to zero");
+        }
+
+        Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+        return;
+      }
+
+      // We don't allow fp literals in 64-bit integer instructions. It is
+      // unclear how we should encode them. This case should be checked earlier
+      // in predicate methods (isLiteralImm())
+      llvm_unreachable("fp literal in 64-bit integer instruction.");
+    }
+    case 4:
+    case 2: {
+      bool lost;
+      APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+      // Convert literal to single precision
+      FPLiteral.convert(*getFltSemantics(OpSize),
+                        APFloat::rmNearestTiesToEven, &lost);
+      // We allow precision lost but not overflow or underflow. This should be
+      // checked earlier in isLiteralImm()
+      Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+      return;
+    }
+    default:
+      llvm_unreachable("invalid operand size");
+    }
+
+    return;
+  }
+
+   // We got int literal token.
+  // Only sign extend inline immediates.
+  // FIXME: No errors on truncation
+  switch (OpSize) {
+  case 4: {
+    if (isInt<32>(Val) &&
+        AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
+    return;
+  }
+  case 8: {
+    if (AMDGPU::isInlinableLiteral64(Val,
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+    return;
+  }
+  case 2: {
+    if (isInt<16>(Val) &&
+        AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+    return;
+  }
+  default:
+    llvm_unreachable("invalid operand size");
+  }
+}
+
+template <unsigned Bitwidth>
+void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
+  APInt Literal(64, Imm.Val);
+
+  if (!Imm.IsFPImm) {
+    // We got int literal token.
+    Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
+    return;
+  }
+
+  bool Lost;
+  APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
+  FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
+                    APFloat::rmNearestTiesToEven, &Lost);
+  Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+}
+
+void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
+  Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI())));
+}
+
+//===----------------------------------------------------------------------===//
+// AsmParser
+//===----------------------------------------------------------------------===//
+
+static int getRegClass(RegisterKind Is, unsigned RegWidth) {
+  if (Is == IS_VGPR) {
+    switch (RegWidth) {
+      default: return -1;
+      case 1: return AMDGPU::VGPR_32RegClassID;
+      case 2: return AMDGPU::VReg_64RegClassID;
+      case 3: return AMDGPU::VReg_96RegClassID;
+      case 4: return AMDGPU::VReg_128RegClassID;
+      case 8: return AMDGPU::VReg_256RegClassID;
+      case 16: return AMDGPU::VReg_512RegClassID;
+    }
+  } else if (Is == IS_TTMP) {
+    switch (RegWidth) {
+      default: return -1;
+      case 1: return AMDGPU::TTMP_32RegClassID;
+      case 2: return AMDGPU::TTMP_64RegClassID;
+      case 4: return AMDGPU::TTMP_128RegClassID;
+    }
+  } else if (Is == IS_SGPR) {
+    switch (RegWidth) {
+      default: return -1;
+      case 1: return AMDGPU::SGPR_32RegClassID;
+      case 2: return AMDGPU::SGPR_64RegClassID;
+      case 4: return AMDGPU::SGPR_128RegClassID;
+      case 8: return AMDGPU::SReg_256RegClassID;
+      case 16: return AMDGPU::SReg_512RegClassID;
+    }
+  }
+  return -1;
+}
+
+static unsigned getSpecialRegForName(StringRef RegName) {
+  return StringSwitch<unsigned>(RegName)
+    .Case("exec", AMDGPU::EXEC)
+    .Case("vcc", AMDGPU::VCC)
+    .Case("flat_scratch", AMDGPU::FLAT_SCR)
+    .Case("m0", AMDGPU::M0)
+    .Case("scc", AMDGPU::SCC)
+    .Case("tba", AMDGPU::TBA)
+    .Case("tma", AMDGPU::TMA)
+    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+    .Case("vcc_lo", AMDGPU::VCC_LO)
+    .Case("vcc_hi", AMDGPU::VCC_HI)
+    .Case("exec_lo", AMDGPU::EXEC_LO)
+    .Case("exec_hi", AMDGPU::EXEC_HI)
+    .Case("tma_lo", AMDGPU::TMA_LO)
+    .Case("tma_hi", AMDGPU::TMA_HI)
+    .Case("tba_lo", AMDGPU::TBA_LO)
+    .Case("tba_hi", AMDGPU::TBA_HI)
+    .Default(0);
+}
+
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  auto R = parseRegister();
+  if (!R) return true;
+  assert(R->isReg());
+  RegNo = R->getReg();
+  StartLoc = R->getStartLoc();
+  EndLoc = R->getEndLoc();
+  return false;
+}
+
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
+{
+  switch (RegKind) {
+  case IS_SPECIAL:
+    if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; }
+    if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; }
+    if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; }
+    if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; }
+    if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; }
+    return false;
+  case IS_VGPR:
+  case IS_SGPR:
+  case IS_TTMP:
+    if (Reg1 != Reg + RegWidth) { return false; }
+    RegWidth++;
+    return true;
+  default:
+    llvm_unreachable("unexpected register kind");
+  }
+}
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex)
+{
+  if (DwordRegIndex) { *DwordRegIndex = 0; }
+  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+  if (getLexer().is(AsmToken::Identifier)) {
+    StringRef RegName = Parser.getTok().getString();
+    if ((Reg = getSpecialRegForName(RegName))) {
+      Parser.Lex();
+      RegKind = IS_SPECIAL;
+    } else {
+      unsigned RegNumIndex = 0;
+      if (RegName[0] == 'v') {
+        RegNumIndex = 1;
+        RegKind = IS_VGPR;
+      } else if (RegName[0] == 's') {
+        RegNumIndex = 1;
+        RegKind = IS_SGPR;
+      } else if (RegName.startswith("ttmp")) {
+        RegNumIndex = strlen("ttmp");
+        RegKind = IS_TTMP;
+      } else {
+        return false;
+      }
+      if (RegName.size() > RegNumIndex) {
+        // Single 32-bit register: vXX.
+        if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum))
+          return false;
+        Parser.Lex();
+        RegWidth = 1;
+      } else {
+        // Range of registers: v[XX:YY]. ":YY" is optional.
+        Parser.Lex();
+        int64_t RegLo, RegHi;
+        if (getLexer().isNot(AsmToken::LBrac))
+          return false;
+        Parser.Lex();
+
+        if (getParser().parseAbsoluteExpression(RegLo))
+          return false;
+
+        const bool isRBrace = getLexer().is(AsmToken::RBrac);
+        if (!isRBrace && getLexer().isNot(AsmToken::Colon))
+          return false;
+        Parser.Lex();
+
+        if (isRBrace) {
+          RegHi = RegLo;
+        } else {
+          if (getParser().parseAbsoluteExpression(RegHi))
+            return false;
+
+          if (getLexer().isNot(AsmToken::RBrac))
+            return false;
+          Parser.Lex();
+        }
+        RegNum = (unsigned) RegLo;
+        RegWidth = (RegHi - RegLo) + 1;
+      }
+    }
+  } else if (getLexer().is(AsmToken::LBrac)) {
+    // List of consecutive registers: [s0,s1,s2,s3]
+    Parser.Lex();
+    if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr))
+      return false;
+    if (RegWidth != 1)
+      return false;
+    RegisterKind RegKind1;
+    unsigned Reg1, RegNum1, RegWidth1;
+    do {
+      if (getLexer().is(AsmToken::Comma)) {
+        Parser.Lex();
+      } else if (getLexer().is(AsmToken::RBrac)) {
+        Parser.Lex();
+        break;
+      } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) {
+        if (RegWidth1 != 1) {
+          return false;
+        }
+        if (RegKind1 != RegKind) {
+          return false;
+        }
+        if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    } while (true);
+  } else {
+    return false;
+  }
+  switch (RegKind) {
+  case IS_SPECIAL:
+    RegNum = 0;
+    RegWidth = 1;
+    break;
+  case IS_VGPR:
+  case IS_SGPR:
+  case IS_TTMP:
+  {
+    unsigned Size = 1;
+    if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+      // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords.
+      Size = std::min(RegWidth, 4u);
+    }
+    if (RegNum % Size != 0)
+      return false;
+    if (DwordRegIndex) { *DwordRegIndex = RegNum; }
+    RegNum = RegNum / Size;
+    int RCID = getRegClass(RegKind, RegWidth);
+    if (RCID == -1)
+      return false;
+    const MCRegisterClass RC = TRI->getRegClass(RCID);
+    if (RegNum >= RC.getNumRegs())
+      return false;
+    Reg = RC.getRegister(RegNum);
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected register kind");
+  }
+
+  if (!subtargetHasRegister(*TRI, Reg))
+    return false;
+  return true;
+}
+
+std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+  const auto &Tok = Parser.getTok();
+  SMLoc StartLoc = Tok.getLoc();
+  SMLoc EndLoc = Tok.getEndLoc();
+  RegisterKind RegKind;
+  unsigned Reg, RegNum, RegWidth, DwordRegIndex;
+
+  if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
+    return nullptr;
+  }
+  KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+  return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+  // TODO: add syntactic sugar for 1/(2*PI)
+  bool Minus = false;
+  if (getLexer().getKind() == AsmToken::Minus) {
+    Minus = true;
+    Parser.Lex();
+  }
+
+  SMLoc S = Parser.getTok().getLoc();
+  switch(getLexer().getKind()) {
+  case AsmToken::Integer: {
+    int64_t IntVal;
+    if (getParser().parseAbsoluteExpression(IntVal))
+      return MatchOperand_ParseFail;
+    if (Minus)
+      IntVal *= -1;
+    Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+    return MatchOperand_Success;
+  }
+  case AsmToken::Real: {
+    int64_t IntVal;
+    if (getParser().parseAbsoluteExpression(IntVal))
+      return MatchOperand_ParseFail;
+
+    APFloat F(BitsToDouble(IntVal));
+    if (Minus)
+      F.changeSign();
+    Operands.push_back(
+        AMDGPUOperand::CreateImm(this, F.bitcastToAPInt().getZExtValue(), S,
+                                 AMDGPUOperand::ImmTyNone, true));
+    return MatchOperand_Success;
+  }
+  default:
+    return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch;
+  }
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
+  auto res = parseImm(Operands);
+  if (res != MatchOperand_NoMatch) {
+    return res;
+  }
+
+  if (auto R = parseRegister()) {
+    assert(R->isReg());
+    R->Reg.IsForcedVOP3 = isForcedVOP3();
+    Operands.push_back(std::move(R));
+    return MatchOperand_Success;
+  }
+  return MatchOperand_ParseFail;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) {
+  // XXX: During parsing we can't determine if minus sign means
+  // negate-modifier or negative immediate value.
+  // By default we suppose it is modifier.
+  bool Negate = false, Abs = false, Abs2 = false;
+
+  if (getLexer().getKind()== AsmToken::Minus) {
+    Parser.Lex();
+    Negate = true;
+  }
+
+  if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") {
+    Parser.Lex();
+    Abs2 = true;
+    if (getLexer().isNot(AsmToken::LParen)) {
+      Error(Parser.getTok().getLoc(), "expected left paren after abs");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+  }
+
+  if (getLexer().getKind() == AsmToken::Pipe) {
+    if (Abs2) {
+      Error(Parser.getTok().getLoc(), "expected register or immediate");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Abs = true;
+  }
+
+  auto Res = parseRegOrImm(Operands);
+  if (Res != MatchOperand_Success) {
+    return Res;
+  }
+
+  AMDGPUOperand::Modifiers Mods;
+  if (Negate) {
+    Mods.Neg = true;
+  }
+  if (Abs) {
+    if (getLexer().getKind() != AsmToken::Pipe) {
+      Error(Parser.getTok().getLoc(), "expected vertical bar");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Mods.Abs = true;
+  }
+  if (Abs2) {
+    if (getLexer().isNot(AsmToken::RParen)) {
+      Error(Parser.getTok().getLoc(), "expected closing parentheses");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Mods.Abs = true;
+  }
+
+  if (Mods.hasFPModifiers()) {
+    AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+    Op.setModifiers(Mods);
+  }
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) {
+  bool Sext = false;
+
+  if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
+    Parser.Lex();
+    Sext = true;
+    if (getLexer().isNot(AsmToken::LParen)) {
+      Error(Parser.getTok().getLoc(), "expected left paren after sext");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+  }
+
+  auto Res = parseRegOrImm(Operands);
+  if (Res != MatchOperand_Success) {
+    return Res;
+  }
+
+  AMDGPUOperand::Modifiers Mods;
+  if (Sext) {
+    if (getLexer().isNot(AsmToken::RParen)) {
+      Error(Parser.getTok().getLoc(), "expected closing parentheses");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Mods.Sext = true;
+  }
+
+  if (Mods.hasIntModifiers()) {
+    AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+    Op.setModifiers(Mods);
+  }
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+  std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
+  if (Reg) {
+    Operands.push_back(std::move(Reg));
+    return MatchOperand_Success;
+  }
+
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.getString() == "off") {
+    Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Tok.getLoc(),
+                                                AMDGPUOperand::ImmTyOff, false));
+    Parser.Lex();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+
+  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+
+  if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
+      (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)) ||
+      (isForcedDPP() && !(TSFlags & SIInstrFlags::DPP)) ||
+      (isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)) )
+    return Match_InvalidOperand;
+
+  if ((TSFlags & SIInstrFlags::VOP3) &&
+      (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) &&
+      getForcedEncodingSize() != 64)
+    return Match_PreferE32;
+
+  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
+      Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
+    // v_mac_f32/16 allow only dst_sel == DWORD;
+    auto OpNum =
+        AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel);
+    const auto &Op = Inst.getOperand(OpNum);
+    if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) {
+      return Match_InvalidOperand;
+    }
+  }
+
+  return Match_Success;
+}
+
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  // What asm variants we should check
+  std::vector<unsigned> MatchedVariants;
+  if (getForcedEncodingSize() == 32) {
+    MatchedVariants = {AMDGPUAsmVariants::DEFAULT};
+  } else if (isForcedVOP3()) {
+    MatchedVariants = {AMDGPUAsmVariants::VOP3};
+  } else if (isForcedSDWA()) {
+    MatchedVariants = {AMDGPUAsmVariants::SDWA};
+  } else if (isForcedDPP()) {
+    MatchedVariants = {AMDGPUAsmVariants::DPP};
+  } else {
+    MatchedVariants = {AMDGPUAsmVariants::DEFAULT,
+                       AMDGPUAsmVariants::VOP3,
+                       AMDGPUAsmVariants::SDWA,
+                       AMDGPUAsmVariants::DPP};
+  }
+
+  MCInst Inst;
+  unsigned Result = Match_Success;
+  for (auto Variant : MatchedVariants) {
+    uint64_t EI;
+    auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm,
+                                  Variant);
+    // We order match statuses from least to most specific. We use most specific
+    // status as resulting
+    // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32
+    if ((R == Match_Success) ||
+        (R == Match_PreferE32) ||
+        (R == Match_MissingFeature && Result != Match_PreferE32) ||
+        (R == Match_InvalidOperand && Result != Match_MissingFeature
+                                   && Result != Match_PreferE32) ||
+        (R == Match_MnemonicFail   && Result != Match_InvalidOperand
+                                   && Result != Match_MissingFeature
+                                   && Result != Match_PreferE32)) {
+      Result = R;
+      ErrorInfo = EI;
+    }
+    if (R == Match_Success)
+      break;
+  }
+
+  switch (Result) {
+  default: break;
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction not supported on this GPU");
+
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size()) {
+        return Error(IDLoc, "too few operands for instruction");
+      }
+      ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+
+  case Match_PreferE32:
+    return Error(IDLoc, "internal error: instruction without _e64 suffix "
+                        "should be encoded as e32");
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
+  int64_t Tmp = -1;
+  if (getLexer().isNot(AsmToken::Integer) && getLexer().isNot(AsmToken::Identifier)) {
+    return true;
+  }
+  if (getParser().parseAbsoluteExpression(Tmp)) {
+    return true;
+  }
+  Ret = static_cast<uint32_t>(Tmp);
+  return false;
+}
+
+
+bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
+                                               uint32_t &Minor) {
+  if (ParseAsAbsoluteExpression(Major))
+    return TokError("invalid major version");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("minor version number required, comma expected");
+  Lex();
+
+  if (ParseAsAbsoluteExpression(Minor))
+    return TokError("invalid minor version");
+
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
+
+  uint32_t Major;
+  uint32_t Minor;
+
+  if (ParseDirectiveMajorMinor(Major, Minor))
+    return true;
+
+  getTargetStreamer().EmitDirectiveHSACodeObjectVersion(Major, Minor);
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
+  uint32_t Major;
+  uint32_t Minor;
+  uint32_t Stepping;
+  StringRef VendorName;
+  StringRef ArchName;
+
+  // If this directive has no arguments, then use the ISA version for the
+  // targeted GPU.
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+    getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
+                                                      Isa.Stepping,
+                                                      "AMD", "AMDGPU");
+    return false;
+  }
+
+  if (ParseDirectiveMajorMinor(Major, Minor))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("stepping version number required, comma expected");
+  Lex();
+
+  if (ParseAsAbsoluteExpression(Stepping))
+    return TokError("invalid stepping version");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("vendor name required, comma expected");
+  Lex();
+
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("invalid vendor name");
+
+  VendorName = getLexer().getTok().getStringContents();
+  Lex();
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("arch name required, comma expected");
+  Lex();
+
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("invalid arch name");
+
+  ArchName = getLexer().getTok().getStringContents();
+  Lex();
+
+  getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
+                                                    VendorName, ArchName);
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
+  std::string Metadata;
+  raw_string_ostream MS(Metadata);
+
+  getLexer().setSkipSpace(false);
+
+  bool FoundEnd = false;
+  while (!getLexer().is(AsmToken::Eof)) {
+    while (getLexer().is(AsmToken::Space)) {
+      MS << ' ';
+      Lex();
+    }
+
+    if (getLexer().is(AsmToken::Identifier)) {
+      StringRef ID = getLexer().getTok().getIdentifier();
+      if (ID == ".end_amdgpu_runtime_metadata") {
+        Lex();
+        FoundEnd = true;
+        break;
+      }
+    }
+
+    MS << Parser.parseStringToEndOfStatement()
+       << getContext().getAsmInfo()->getSeparatorString();
+
+    Parser.eatToEndOfStatement();
+  }
+
+  getLexer().setSkipSpace(true);
+
+  if (getLexer().is(AsmToken::Eof) && !FoundEnd)
+    return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+
+  MS.flush();
+
+  getTargetStreamer().EmitRuntimeMetadata(Metadata);
+
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
+                                               amd_kernel_code_t &Header) {
+  SmallString<40> ErrStr;
+  raw_svector_ostream Err(ErrStr);
+  if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
+    return TokError(Err.str());
+  }
+  Lex();
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
+  amd_kernel_code_t Header;
+  AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
+
+  while (true) {
+    // Lex EndOfStatement.  This is in a while loop, because lexing a comment
+    // will set the current token to EndOfStatement.
+    while(getLexer().is(AsmToken::EndOfStatement))
+      Lex();
+
+    if (getLexer().isNot(AsmToken::Identifier))
+      return TokError("expected value identifier or .end_amd_kernel_code_t");
+
+    StringRef ID = getLexer().getTok().getIdentifier();
+    Lex();
+
+    if (ID == ".end_amd_kernel_code_t")
+      break;
+
+    if (ParseAMDKernelCodeTValue(ID, Header))
+      return true;
+  }
+
+  getTargetStreamer().EmitAMDKernelCodeT(Header);
+
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSATextSection(getContext()));
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  StringRef KernelName = Parser.getTok().getString();
+
+  getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
+                                           ELF::STT_AMDGPU_HSA_KERNEL);
+  Lex();
+  KernelScope.initialize(getContext());
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  StringRef GlobalName = Parser.getTok().getIdentifier();
+
+  getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName);
+  Lex();
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected symbol name");
+
+  StringRef GlobalName = Parser.getTok().getIdentifier();
+
+  getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName);
+  Lex();
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSADataGlobalAgentSection(getContext()));
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSADataGlobalProgramSection(getContext()));
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
+  getParser().getStreamer().SwitchSection(
+      AMDGPU::getHSARodataReadonlyAgentSection(getContext()));
+  return false;
+}
+
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getString();
+
+  if (IDVal == ".hsa_code_object_version")
+    return ParseDirectiveHSACodeObjectVersion();
+
+  if (IDVal == ".hsa_code_object_isa")
+    return ParseDirectiveHSACodeObjectISA();
+
+  if (IDVal == ".amdgpu_runtime_metadata")
+    return ParseDirectiveRuntimeMetadata();
+
+  if (IDVal == ".amd_kernel_code_t")
+    return ParseDirectiveAMDKernelCodeT();
+
+  if (IDVal == ".hsatext")
+    return ParseSectionDirectiveHSAText();
+
+  if (IDVal == ".amdgpu_hsa_kernel")
+    return ParseDirectiveAMDGPUHsaKernel();
+
+  if (IDVal == ".amdgpu_hsa_module_global")
+    return ParseDirectiveAMDGPUHsaModuleGlobal();
+
+  if (IDVal == ".amdgpu_hsa_program_global")
+    return ParseDirectiveAMDGPUHsaProgramGlobal();
+
+  if (IDVal == ".hsadata_global_agent")
+    return ParseSectionDirectiveHSADataGlobalAgent();
+
+  if (IDVal == ".hsadata_global_program")
+    return ParseSectionDirectiveHSADataGlobalProgram();
+
+  if (IDVal == ".hsarodata_readonly_agent")
+    return ParseSectionDirectiveHSARodataReadonlyAgent();
+
+  return true;
+}
+
+bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
+                                           unsigned RegNo) const {
+  if (isCI())
+    return true;
+
+  if (isSI()) {
+    // No flat_scr
+    switch (RegNo) {
+    case AMDGPU::FLAT_SCR:
+    case AMDGPU::FLAT_SCR_LO:
+    case AMDGPU::FLAT_SCR_HI:
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+  // SI/CI have.
+  for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+       R.isValid(); ++R) {
+    if (*R == RegNo)
+      return false;
+  }
+
+  return true;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  // Try to parse with a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there as an error parsing,
+  // we are done.
+  //
+  // If we are parsing after we reach EndOfStatement then this means we
+  // are appending default values to the Operands list.  This is only done
+  // by custom parser, so we shouldn't continue on to the generic parsing.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
+      getLexer().is(AsmToken::EndOfStatement))
+    return ResTy;
+
+  ResTy = parseRegOrImm(Operands);
+
+  if (ResTy == MatchOperand_Success)
+    return ResTy;
+
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    // If this identifier is a symbol, we want to create an expression for it.
+    // It is a little difficult to distinguish between a symbol name, and
+    // an instruction flag like 'gds'.  In order to do this, we parse
+    // all tokens as expressions and then treate the symbol name as the token
+    // string when we want to interpret the operand as a token.
+    const auto &Tok = Parser.getTok();
+    SMLoc S = Tok.getLoc();
+    const MCExpr *Expr = nullptr;
+    if (!Parser.parseExpression(Expr)) {
+      Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+      return MatchOperand_Success;
+    }
+
+    Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc()));
+    Parser.Lex();
+    return MatchOperand_Success;
+  }
+  return MatchOperand_NoMatch;
+}
+
+StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
+  // Clear any forced encodings from the previous instruction.
+  setForcedEncodingSize(0);
+  setForcedDPP(false);
+  setForcedSDWA(false);
+
+  if (Name.endswith("_e64")) {
+    setForcedEncodingSize(64);
+    return Name.substr(0, Name.size() - 4);
+  } else if (Name.endswith("_e32")) {
+    setForcedEncodingSize(32);
+    return Name.substr(0, Name.size() - 4);
+  } else if (Name.endswith("_dpp")) {
+    setForcedDPP(true);
+    return Name.substr(0, Name.size() - 4);
+  } else if (Name.endswith("_sdwa")) {
+    setForcedSDWA(true);
+    return Name.substr(0, Name.size() - 5);
+  }
+  return Name;
+}
+
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name,
+                                       SMLoc NameLoc, OperandVector &Operands) {
+  // Add the instruction mnemonic
+  Name = parseMnemonicSuffix(Name);
+  Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
+
+  while (!getLexer().is(AsmToken::EndOfStatement)) {
+    OperandMatchResultTy Res = parseOperand(Operands, Name);
+
+    // Eat the comma or space if there is one.
+    if (getLexer().is(AsmToken::Comma))
+      Parser.Lex();
+
+    switch (Res) {
+      case MatchOperand_Success: break;
+      case MatchOperand_ParseFail:
+        Error(getLexer().getLoc(), "failed parsing operand.");
+        while (!getLexer().is(AsmToken::EndOfStatement)) {
+          Parser.Lex();
+        }
+        return true;
+      case MatchOperand_NoMatch:
+        Error(getLexer().getLoc(), "not a valid operand.");
+        while (!getLexer().is(AsmToken::EndOfStatement)) {
+          Parser.Lex();
+        }
+        return true;
+    }
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
+  switch(getLexer().getKind()) {
+    default: return MatchOperand_NoMatch;
+    case AsmToken::Identifier: {
+      StringRef Name = Parser.getTok().getString();
+      if (!Name.equals(Prefix)) {
+        return MatchOperand_NoMatch;
+      }
+
+      Parser.Lex();
+      if (getLexer().isNot(AsmToken::Colon))
+        return MatchOperand_ParseFail;
+
+      Parser.Lex();
+      if (getLexer().isNot(AsmToken::Integer))
+        return MatchOperand_ParseFail;
+
+      if (getParser().parseAbsoluteExpression(Int))
+        return MatchOperand_ParseFail;
+      break;
+    }
+  }
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
+                                    enum AMDGPUOperand::ImmTy ImmTy,
+                                    bool (*ConvertResult)(int64_t&)) {
+  SMLoc S = Parser.getTok().getLoc();
+  int64_t Value = 0;
+
+  OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  if (ConvertResult && !ConvertResult(Value)) {
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
+                               enum AMDGPUOperand::ImmTy ImmTy) {
+  int64_t Bit = 0;
+  SMLoc S = Parser.getTok().getLoc();
+
+  // We are at the end of the statement, and this is a default argument, so
+  // use a default value.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    switch(getLexer().getKind()) {
+      case AsmToken::Identifier: {
+        StringRef Tok = Parser.getTok().getString();
+        if (Tok == Name) {
+          Bit = 1;
+          Parser.Lex();
+        } else if (Tok.startswith("no") && Tok.endswith(Name)) {
+          Bit = 0;
+          Parser.Lex();
+        } else {
+          return MatchOperand_NoMatch;
+        }
+        break;
+      }
+      default:
+        return MatchOperand_NoMatch;
+    }
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
+  return MatchOperand_Success;
+}
+
+typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+
+void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
+                           OptionalImmIndexMap& OptionalIdx,
+                           enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+  auto i = OptionalIdx.find(ImmT);
+  if (i != OptionalIdx.end()) {
+    unsigned Idx = i->second;
+    ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
+  } else {
+    Inst.addOperand(MCOperand::createImm(Default));
+  }
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) {
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    return MatchOperand_NoMatch;
+  }
+  StringRef Tok = Parser.getTok().getString();
+  if (Tok != Prefix) {
+    return MatchOperand_NoMatch;
+  }
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Colon)) {
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    return MatchOperand_ParseFail;
+  }
+
+  Value = Parser.getTok().getString();
+  return MatchOperand_Success;
+}
+
+//===----------------------------------------------------------------------===//
+// ds
+//===----------------------------------------------------------------------===//
+
+void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      Op.addRegOperands(Inst, 1);
+      continue;
+    }
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
+
+  Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
+}
+
+void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
+  std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
+  bool GDSOnly = false;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      Op.addRegOperands(Inst, 1);
+      continue;
+    }
+
+    if (Op.isToken() && Op.getToken() == "gds") {
+      GDSOnly = true;
+      continue;
+    }
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
+
+  if (!GDSOnly) {
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
+  }
+  Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
+}
+
+void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned EnMask = 0;
+  int SrcIdx = 0;
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      EnMask |= (1 << SrcIdx);
+      Op.addRegOperands(Inst, 1);
+      ++SrcIdx;
+      continue;
+    }
+
+    if (Op.isOff()) {
+      ++SrcIdx;
+      Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+      continue;
+    }
+
+    if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyExpTgt) {
+      Op.addImmOperands(Inst, 1);
+      continue;
+    }
+
+    if (Op.isToken() && Op.getToken() == "done")
+      continue;
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
+
+  Inst.addOperand(MCOperand::createImm(EnMask));
+}
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
+  StringRef CntName = Parser.getTok().getString();
+  int64_t CntVal;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::LParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Integer))
+    return true;
+
+  if (getParser().parseAbsoluteExpression(CntVal))
+    return true;
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
+    Parser.Lex();
+
+  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+  if (CntName == "vmcnt")
+    IntVal = encodeVmcnt(IV, IntVal, CntVal);
+  else if (CntName == "expcnt")
+    IntVal = encodeExpcnt(IV, IntVal, CntVal);
+  else if (CntName == "lgkmcnt")
+    IntVal = encodeLgkmcnt(IV, IntVal, CntVal);
+  else
+    return true;
+
+  return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+  int64_t Waitcnt = getWaitcntBitMask(IV);
+  SMLoc S = Parser.getTok().getLoc();
+
+  switch(getLexer().getKind()) {
+    default: return MatchOperand_ParseFail;
+    case AsmToken::Integer:
+      // The operand can be an integer value.
+      if (getParser().parseAbsoluteExpression(Waitcnt))
+        return MatchOperand_ParseFail;
+      break;
+
+    case AsmToken::Identifier:
+      do {
+        if (parseCnt(Waitcnt))
+          return MatchOperand_ParseFail;
+      } while(getLexer().isNot(AsmToken::EndOfStatement));
+      break;
+  }
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
+  return MatchOperand_Success;
+}
+
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  if (Parser.getTok().getString() != "hwreg")
+    return true;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::LParen))
+    return true;
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::Identifier)) {
+    HwReg.IsSymbolic = true;
+    HwReg.Id = ID_UNKNOWN_;
+    const StringRef tok = Parser.getTok().getString();
+    for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) {
+      if (tok == IdSymbolic[i]) {
+        HwReg.Id = i;
+        break;
+      }
+    }
+    Parser.Lex();
+  } else {
+    HwReg.IsSymbolic = false;
+    if (getLexer().isNot(AsmToken::Integer))
+      return true;
+    if (getParser().parseAbsoluteExpression(HwReg.Id))
+      return true;
+  }
+
+  if (getLexer().is(AsmToken::RParen)) {
+    Parser.Lex();
+    return false;
+  }
+
+  // optional params
+  if (getLexer().isNot(AsmToken::Comma))
+    return true;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Integer))
+    return true;
+  if (getParser().parseAbsoluteExpression(Offset))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return true;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Integer))
+    return true;
+  if (getParser().parseAbsoluteExpression(Width))
+    return true;
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return true;
+  Parser.Lex();
+
+  return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  int64_t Imm16Val = 0;
+  SMLoc S = Parser.getTok().getLoc();
+
+  switch(getLexer().getKind()) {
+    default: return MatchOperand_NoMatch;
+    case AsmToken::Integer:
+      // The operand can be an integer value.
+      if (getParser().parseAbsoluteExpression(Imm16Val))
+        return MatchOperand_NoMatch;
+      if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+        Error(S, "invalid immediate: only 16-bit values are legal");
+        // Do not return error code, but create an imm operand anyway and proceed
+        // to the next operand, if any. That avoids unneccessary error messages.
+      }
+      break;
+
+    case AsmToken::Identifier: {
+        OperandInfoTy HwReg(ID_UNKNOWN_);
+        int64_t Offset = OFFSET_DEFAULT_;
+        int64_t Width = WIDTH_M1_DEFAULT_ + 1;
+        if (parseHwregConstruct(HwReg, Offset, Width))
+          return MatchOperand_ParseFail;
+        if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) {
+          if (HwReg.IsSymbolic)
+            Error(S, "invalid symbolic name of hardware register");
+          else
+            Error(S, "invalid code of hardware register: only 6-bit values are legal");
+        }
+        if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset))
+          Error(S, "invalid bit offset: only 5-bit values are legal");
+        if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1))
+          Error(S, "invalid bitfield width: only values from 1 to 32 are legal");
+        Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_);
+      }
+      break;
+  }
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTyHwreg));
+  return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isSWaitCnt() const {
+  return isImm();
+}
+
+bool AMDGPUOperand::isHwreg() const {
+  return isImmTy(ImmTyHwreg);
+}
+
+bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) {
+  using namespace llvm::AMDGPU::SendMsg;
+
+  if (Parser.getTok().getString() != "sendmsg")
+    return true;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::LParen))
+    return true;
+  Parser.Lex();
+
+  if (getLexer().is(AsmToken::Identifier)) {
+    Msg.IsSymbolic = true;
+    Msg.Id = ID_UNKNOWN_;
+    const std::string tok = Parser.getTok().getString();
+    for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) {
+      switch(i) {
+        default: continue; // Omit gaps.
+        case ID_INTERRUPT: case ID_GS: case ID_GS_DONE:  case ID_SYSMSG: break;
+      }
+      if (tok == IdSymbolic[i]) {
+        Msg.Id = i;
+        break;
+      }
+    }
+    Parser.Lex();
+  } else {
+    Msg.IsSymbolic = false;
+    if (getLexer().isNot(AsmToken::Integer))
+      return true;
+    if (getParser().parseAbsoluteExpression(Msg.Id))
+      return true;
+    if (getLexer().is(AsmToken::Integer))
+      if (getParser().parseAbsoluteExpression(Msg.Id))
+        Msg.Id = ID_UNKNOWN_;
+  }
+  if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest.
+    return false;
+
+  if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) {
+    if (getLexer().isNot(AsmToken::RParen))
+      return true;
+    Parser.Lex();
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return true;
+  Parser.Lex();
+
+  assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG);
+  Operation.Id = ID_UNKNOWN_;
+  if (getLexer().is(AsmToken::Identifier)) {
+    Operation.IsSymbolic = true;
+    const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
+    const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
+    const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
+    const StringRef Tok = Parser.getTok().getString();
+    for (int i = F; i < L; ++i) {
+      if (Tok == S[i]) {
+        Operation.Id = i;
+        break;
+      }
+    }
+    Parser.Lex();
+  } else {
+    Operation.IsSymbolic = false;
+    if (getLexer().isNot(AsmToken::Integer))
+      return true;
+    if (getParser().parseAbsoluteExpression(Operation.Id))
+      return true;
+  }
+
+  if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+    // Stream id is optional.
+    if (getLexer().is(AsmToken::RParen)) {
+      Parser.Lex();
+      return false;
+    }
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return true;
+    Parser.Lex();
+
+    if (getLexer().isNot(AsmToken::Integer))
+      return true;
+    if (getParser().parseAbsoluteExpression(StreamId))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return true;
+  Parser.Lex();
+  return false;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return MatchOperand_NoMatch;
+
+  StringRef Str = Parser.getTok().getString();
+  int Slot = StringSwitch<int>(Str)
+    .Case("p10", 0)
+    .Case("p20", 1)
+    .Case("p0", 2)
+    .Default(-1);
+
+  SMLoc S = Parser.getTok().getLoc();
+  if (Slot == -1)
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S,
+                                              AMDGPUOperand::ImmTyInterpSlot));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return MatchOperand_NoMatch;
+
+  StringRef Str = Parser.getTok().getString();
+  if (!Str.startswith("attr"))
+    return MatchOperand_NoMatch;
+
+  StringRef Chan = Str.take_back(2);
+  int AttrChan = StringSwitch<int>(Chan)
+    .Case(".x", 0)
+    .Case(".y", 1)
+    .Case(".z", 2)
+    .Case(".w", 3)
+    .Default(-1);
+  if (AttrChan == -1)
+    return MatchOperand_ParseFail;
+
+  Str = Str.drop_back(2).drop_front(4);
+
+  uint8_t Attr;
+  if (Str.getAsInteger(10, Attr))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex();
+  if (Attr > 63) {
+    Error(S, "out of bounds attr");
+    return MatchOperand_Success;
+  }
+
+  SMLoc SChan = SMLoc::getFromPointer(Chan.data());
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S,
+                                              AMDGPUOperand::ImmTyInterpAttr));
+  Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan,
+                                              AMDGPUOperand::ImmTyAttrChan));
+  return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::errorExpTgt() {
+  Error(Parser.getTok().getLoc(), "invalid exp target");
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgtImpl(StringRef Str,
+                                                      uint8_t &Val) {
+  if (Str == "null") {
+    Val = 9;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("mrt")) {
+    Str = Str.drop_front(3);
+    if (Str == "z") { // == mrtz
+      Val = 8;
+      return MatchOperand_Success;
+    }
+
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val > 7)
+      errorExpTgt();
+
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("pos")) {
+    Str = Str.drop_front(3);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val > 3)
+      errorExpTgt();
+
+    Val += 12;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("param")) {
+    Str = Str.drop_front(5);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    if (Val >= 32)
+      errorExpTgt();
+
+    Val += 32;
+    return MatchOperand_Success;
+  }
+
+  if (Str.startswith("invalid_target_")) {
+    Str = Str.drop_front(15);
+    if (Str.getAsInteger(10, Val))
+      return MatchOperand_ParseFail;
+
+    errorExpTgt();
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+  uint8_t Val;
+  StringRef Str = Parser.getTok().getString();
+
+  auto Res = parseExpTgtImpl(Str, Val);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex();
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S,
+                                              AMDGPUOperand::ImmTyExpTgt));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+  using namespace llvm::AMDGPU::SendMsg;
+
+  int64_t Imm16Val = 0;
+  SMLoc S = Parser.getTok().getLoc();
+
+  switch(getLexer().getKind()) {
+  default:
+    return MatchOperand_NoMatch;
+  case AsmToken::Integer:
+    // The operand can be an integer value.
+    if (getParser().parseAbsoluteExpression(Imm16Val))
+      return MatchOperand_NoMatch;
+    if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) {
+      Error(S, "invalid immediate: only 16-bit values are legal");
+      // Do not return error code, but create an imm operand anyway and proceed
+      // to the next operand, if any. That avoids unneccessary error messages.
+    }
+    break;
+  case AsmToken::Identifier: {
+      OperandInfoTy Msg(ID_UNKNOWN_);
+      OperandInfoTy Operation(OP_UNKNOWN_);
+      int64_t StreamId = STREAM_ID_DEFAULT_;
+      if (parseSendMsgConstruct(Msg, Operation, StreamId))
+        return MatchOperand_ParseFail;
+      do {
+        // Validate and encode message ID.
+        if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE)
+                || Msg.Id == ID_SYSMSG)) {
+          if (Msg.IsSymbolic)
+            Error(S, "invalid/unsupported symbolic name of message");
+          else
+            Error(S, "invalid/unsupported code of message");
+          break;
+        }
+        Imm16Val = (Msg.Id << ID_SHIFT_);
+        // Validate and encode operation ID.
+        if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) {
+          if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) {
+            if (Operation.IsSymbolic)
+              Error(S, "invalid symbolic name of GS_OP");
+            else
+              Error(S, "invalid code of GS_OP: only 2-bit values are legal");
+            break;
+          }
+          if (Operation.Id == OP_GS_NOP
+              && Msg.Id != ID_GS_DONE) {
+            Error(S, "invalid GS_OP: NOP is for GS_DONE only");
+            break;
+          }
+          Imm16Val |= (Operation.Id << OP_SHIFT_);
+        }
+        if (Msg.Id == ID_SYSMSG) {
+          if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) {
+            if (Operation.IsSymbolic)
+              Error(S, "invalid/unsupported symbolic name of SYSMSG_OP");
+            else
+              Error(S, "invalid/unsupported code of SYSMSG_OP");
+            break;
+          }
+          Imm16Val |= (Operation.Id << OP_SHIFT_);
+        }
+        // Validate and encode stream ID.
+        if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) {
+          if (! (STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) {
+            Error(S, "invalid stream id: only 2-bit values are legal");
+            break;
+          }
+          Imm16Val |= (StreamId << STREAM_ID_SHIFT_);
+        }
+      } while (false);
+    }
+    break;
+  }
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Imm16Val, S, AMDGPUOperand::ImmTySendMsg));
+  return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isSendMsg() const {
+  return isImmTy(ImmTySendMsg);
+}
+
+//===----------------------------------------------------------------------===//
+// sopp branch targets
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  switch (getLexer().getKind()) {
+    default: return MatchOperand_ParseFail;
+    case AsmToken::Integer: {
+      int64_t Imm;
+      if (getParser().parseAbsoluteExpression(Imm))
+        return MatchOperand_ParseFail;
+      Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S));
+      return MatchOperand_Success;
+    }
+
+    case AsmToken::Identifier:
+      Operands.push_back(AMDGPUOperand::CreateExpr(this,
+          MCSymbolRefExpr::create(getContext().getOrCreateSymbol(
+                                  Parser.getTok().getString()), getContext()), S));
+      Parser.Lex();
+      return MatchOperand_Success;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// mubuf
+//===----------------------------------------------------------------------===//
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE);
+}
+
+void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
+                               const OperandVector &Operands,
+                               bool IsAtomic, bool IsAtomicReturn) {
+  OptionalImmIndexMap OptionalIdx;
+  assert(IsAtomicReturn ? IsAtomic : true);
+
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+
+    // Add the register arguments
+    if (Op.isReg()) {
+      Op.addRegOperands(Inst, 1);
+      continue;
+    }
+
+    // Handle the case where soffset is an immediate
+    if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+      Op.addImmOperands(Inst, 1);
+      continue;
+    }
+
+    // Handle tokens like 'offen' which are sometimes hard-coded into the
+    // asm string.  There are no MCInst operands for these.
+    if (Op.isToken()) {
+      continue;
+    }
+    assert(Op.isImm());
+
+    // Handle optional arguments
+    OptionalIdx[Op.getImmTy()] = i;
+  }
+
+  // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns.
+  if (IsAtomicReturn) {
+    MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning.
+    Inst.insert(I, *I);
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+  if (!IsAtomic) { // glc is hard-coded.
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  }
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+}
+
+//===----------------------------------------------------------------------===//
+// mimg
+//===----------------------------------------------------------------------===//
+
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  OptionalImmIndexMap OptionalIdx;
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+
+    // Add the register arguments
+    if (Op.isRegOrImm()) {
+      Op.addRegOrImmOperands(Inst, 1);
+      continue;
+    } else if (Op.isImmModifier()) {
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+}
+
+void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  // Add src, same as dst
+  ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+
+  OptionalImmIndexMap OptionalIdx;
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+
+    // Add the register arguments
+    if (Op.isRegOrImm()) {
+      Op.addRegOrImmOperands(Inst, 1);
+      continue;
+    } else if (Op.isImmModifier()) {
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE);
+}
+
+//===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset8() const {
+  return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDOffset20() const {
+  return isImm() && isUInt<20>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+  // 32-bit literals are only supported on CI and we only want to use them
+  // when the offset is > 8-bits.
+  return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+//===----------------------------------------------------------------------===//
+// vop3
+//===----------------------------------------------------------------------===//
+
+static bool ConvertOmodMul(int64_t &Mul) {
+  if (Mul != 1 && Mul != 2 && Mul != 4)
+    return false;
+
+  Mul >>= 1;
+  return true;
+}
+
+static bool ConvertOmodDiv(int64_t &Div) {
+  if (Div == 1) {
+    Div = 0;
+    return true;
+  }
+
+  if (Div == 2) {
+    Div = 3;
+    return true;
+  }
+
+  return false;
+}
+
+static bool ConvertBoundCtrl(int64_t &BoundCtrl) {
+  if (BoundCtrl == 0) {
+    BoundCtrl = 1;
+    return true;
+  }
+
+  if (BoundCtrl == -1) {
+    BoundCtrl = 0;
+    return true;
+  }
+
+  return false;
+}
+
+// Note: the order in this table matches the order of operands in AsmString.
+static const OptionalOperand AMDGPUOptionalOperandTable[] = {
+  {"offen",   AMDGPUOperand::ImmTyOffen, true, nullptr},
+  {"idxen",   AMDGPUOperand::ImmTyIdxen, true, nullptr},
+  {"addr64",  AMDGPUOperand::ImmTyAddr64, true, nullptr},
+  {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr},
+  {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
+  {"gds",     AMDGPUOperand::ImmTyGDS, true, nullptr},
+  {"offset",  AMDGPUOperand::ImmTyOffset, false, nullptr},
+  {"glc",     AMDGPUOperand::ImmTyGLC, true, nullptr},
+  {"slc",     AMDGPUOperand::ImmTySLC, true, nullptr},
+  {"tfe",     AMDGPUOperand::ImmTyTFE, true, nullptr},
+  {"clamp",   AMDGPUOperand::ImmTyClampSI, true, nullptr},
+  {"omod",    AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul},
+  {"unorm",   AMDGPUOperand::ImmTyUNorm, true, nullptr},
+  {"da",      AMDGPUOperand::ImmTyDA,    true, nullptr},
+  {"r128",    AMDGPUOperand::ImmTyR128,  true, nullptr},
+  {"lwe",     AMDGPUOperand::ImmTyLWE,   true, nullptr},
+  {"dmask",   AMDGPUOperand::ImmTyDMask, false, nullptr},
+  {"row_mask",   AMDGPUOperand::ImmTyDppRowMask, false, nullptr},
+  {"bank_mask",  AMDGPUOperand::ImmTyDppBankMask, false, nullptr},
+  {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl},
+  {"dst_sel",    AMDGPUOperand::ImmTySdwaDstSel, false, nullptr},
+  {"src0_sel",   AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
+  {"src1_sel",   AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
+  {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+  {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
+};
+
+OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
+  OperandMatchResultTy res;
+  for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) {
+    // try to parse any optional operand here
+    if (Op.IsBit) {
+      res = parseNamedBit(Op.Name, Operands, Op.Type);
+    } else if (Op.Type == AMDGPUOperand::ImmTyOModSI) {
+      res = parseOModOperand(Operands);
+    } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstSel ||
+               Op.Type == AMDGPUOperand::ImmTySdwaSrc0Sel ||
+               Op.Type == AMDGPUOperand::ImmTySdwaSrc1Sel) {
+      res = parseSDWASel(Operands, Op.Name, Op.Type);
+    } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
+      res = parseSDWADstUnused(Operands);
+    } else {
+      res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
+    }
+    if (res != MatchOperand_NoMatch) {
+      return res;
+    }
+  }
+  return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) {
+  StringRef Name = Parser.getTok().getString();
+  if (Name == "mul") {
+    return parseIntWithPrefix("mul", Operands,
+                              AMDGPUOperand::ImmTyOModSI, ConvertOmodMul);
+  }
+
+  if (Name == "div") {
+    return parseIntWithPrefix("div", Operands,
+                              AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+  for (unsigned E = Operands.size(); I != E; ++I)
+    ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1);
+}
+
+void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
+  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+  if (TSFlags & SIInstrFlags::VOP3) {
+    cvtVOP3(Inst, Operands);
+  } else {
+    cvtId(Inst, Operands);
+  }
+}
+
+static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
+      // 1. This operand is input modifiers
+  return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
+      // 2. This is not last operand
+      && Desc.NumOperands > (OpNum + 1)
+      // 3. Next operand is register class
+      && Desc.OpInfo[OpNum + 1].RegClass != -1
+      // 4. Next register is not tied to any other operand
+      && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
+}
+
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+    if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+      Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+    } else if (Op.isImm()) {
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("unhandled operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+
+  // special case v_mac_{f16, f32}:
+  // it has src2 register operand that is tied to dst operand
+  // we don't allow modifiers for this operand in assembler so src2_modifiers
+  // should be 0
+  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+      Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+      Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
+    auto it = Inst.begin();
+    std::advance(
+      it,
+      AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
+                                     AMDGPU::V_MAC_F16_e64 :
+                                     AMDGPU::V_MAC_F32_e64,
+                                 AMDGPU::OpName::src2_modifiers));
+    it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
+    ++it;
+    Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// dpp
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isDPPCtrl() const {
+  bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm());
+  if (result) {
+    int64_t Imm = getImm();
+    return ((Imm >= 0x000) && (Imm <= 0x0ff)) ||
+           ((Imm >= 0x101) && (Imm <= 0x10f)) ||
+           ((Imm >= 0x111) && (Imm <= 0x11f)) ||
+           ((Imm >= 0x121) && (Imm <= 0x12f)) ||
+           (Imm == 0x130) ||
+           (Imm == 0x134) ||
+           (Imm == 0x138) ||
+           (Imm == 0x13c) ||
+           (Imm == 0x140) ||
+           (Imm == 0x141) ||
+           (Imm == 0x142) ||
+           (Imm == 0x143);
+  }
+  return false;
+}
+
+bool AMDGPUOperand::isGPRIdxMode() const {
+  return isImm() && isUInt<4>(getImm());
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  StringRef Prefix;
+  int64_t Int;
+
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    Prefix = Parser.getTok().getString();
+  } else {
+    return MatchOperand_NoMatch;
+  }
+
+  if (Prefix == "row_mirror") {
+    Int = 0x140;
+    Parser.Lex();
+  } else if (Prefix == "row_half_mirror") {
+    Int = 0x141;
+    Parser.Lex();
+  } else {
+    // Check to prevent parseDPPCtrlOps from eating invalid tokens
+    if (Prefix != "quad_perm"
+        && Prefix != "row_shl"
+        && Prefix != "row_shr"
+        && Prefix != "row_ror"
+        && Prefix != "wave_shl"
+        && Prefix != "wave_rol"
+        && Prefix != "wave_shr"
+        && Prefix != "wave_ror"
+        && Prefix != "row_bcast") {
+      return MatchOperand_NoMatch;
+    }
+
+    Parser.Lex();
+    if (getLexer().isNot(AsmToken::Colon))
+      return MatchOperand_ParseFail;
+
+    if (Prefix == "quad_perm") {
+      // quad_perm:[%d,%d,%d,%d]
+      Parser.Lex();
+      if (getLexer().isNot(AsmToken::LBrac))
+        return MatchOperand_ParseFail;
+      Parser.Lex();
+
+      if (getParser().parseAbsoluteExpression(Int) || !(0 <= Int && Int <=3))
+        return MatchOperand_ParseFail;
+
+      for (int i = 0; i < 3; ++i) {
+        if (getLexer().isNot(AsmToken::Comma))
+          return MatchOperand_ParseFail;
+        Parser.Lex();
+
+        int64_t Temp;
+        if (getParser().parseAbsoluteExpression(Temp) || !(0 <= Temp && Temp <=3))
+          return MatchOperand_ParseFail;
+        const int shift = i*2 + 2;
+        Int += (Temp << shift);
+      }
+
+      if (getLexer().isNot(AsmToken::RBrac))
+        return MatchOperand_ParseFail;
+      Parser.Lex();
+
+    } else {
+      // sel:%d
+      Parser.Lex();
+      if (getParser().parseAbsoluteExpression(Int))
+        return MatchOperand_ParseFail;
+
+      if (Prefix == "row_shl" && 1 <= Int && Int <= 15) {
+        Int |= 0x100;
+      } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) {
+        Int |= 0x110;
+      } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) {
+        Int |= 0x120;
+      } else if (Prefix == "wave_shl" && 1 == Int) {
+        Int = 0x130;
+      } else if (Prefix == "wave_rol" && 1 == Int) {
+        Int = 0x134;
+      } else if (Prefix == "wave_shr" && 1 == Int) {
+        Int = 0x138;
+      } else if (Prefix == "wave_ror" && 1 == Int) {
+        Int = 0x13C;
+      } else if (Prefix == "row_bcast") {
+        if (Int == 15) {
+          Int = 0x142;
+        } else if (Int == 31) {
+          Int = 0x143;
+        } else {
+          return MatchOperand_ParseFail;
+        }
+      } else {
+        return MatchOperand_ParseFail;
+      }
+    }
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTyDppCtrl));
+  return MatchOperand_Success;
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
+  return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
+  return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const {
+  return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
+}
+
+void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+    // Add the register arguments
+    if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token.
+      // Skip it.
+      continue;
+    } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+      Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+    } else if (Op.isDPPCtrl()) {
+      Op.addImmOperands(Inst, 1);
+    } else if (Op.isImm()) {
+      // Handle optional arguments
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("Invalid operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
+
+  // special case v_mac_{f16, f32}:
+  // it has src2 register operand that is tied to dst operand
+  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
+      Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
+    auto it = Inst.begin();
+    std::advance(
+        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+    Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// sdwa
+//===----------------------------------------------------------------------===//
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
+                              AMDGPUOperand::ImmTy Type) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  SMLoc S = Parser.getTok().getLoc();
+  StringRef Value;
+  OperandMatchResultTy res;
+
+  res = parseStringWithPrefix(Prefix, Value);
+  if (res != MatchOperand_Success) {
+    return res;
+  }
+
+  int64_t Int;
+  Int = StringSwitch<int64_t>(Value)
+        .Case("BYTE_0", SdwaSel::BYTE_0)
+        .Case("BYTE_1", SdwaSel::BYTE_1)
+        .Case("BYTE_2", SdwaSel::BYTE_2)
+        .Case("BYTE_3", SdwaSel::BYTE_3)
+        .Case("WORD_0", SdwaSel::WORD_0)
+        .Case("WORD_1", SdwaSel::WORD_1)
+        .Case("DWORD", SdwaSel::DWORD)
+        .Default(0xffffffff);
+  Parser.Lex(); // eat last token
+
+  if (Int == 0xffffffff) {
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  SMLoc S = Parser.getTok().getLoc();
+  StringRef Value;
+  OperandMatchResultTy res;
+
+  res = parseStringWithPrefix("dst_unused", Value);
+  if (res != MatchOperand_Success) {
+    return res;
+  }
+
+  int64_t Int;
+  Int = StringSwitch<int64_t>(Value)
+        .Case("UNUSED_PAD", DstUnused::UNUSED_PAD)
+        .Case("UNUSED_SEXT", DstUnused::UNUSED_SEXT)
+        .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
+        .Default(0xffffffff);
+  Parser.Lex(); // eat last token
+
+  if (Int == 0xffffffff) {
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySdwaDstUnused));
+  return MatchOperand_Success;
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP1);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
+  cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+}
+
+void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
+                              uint64_t BasicInstType) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+    // Add the register arguments
+    if ((BasicInstType == SIInstrFlags::VOPC || 
+         BasicInstType == SIInstrFlags::VOP2)&&
+        Op.isReg() &&
+        Op.Reg.RegNo == AMDGPU::VCC) {
+      // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
+      // Skip it.
+      continue;
+    } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+      Op.addRegOrImmWithInputModsOperands(Inst, 2);
+    } else if (Op.isImm()) {
+      // Handle optional arguments
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      llvm_unreachable("Invalid operand type");
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+
+  if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+    // V_NOP_sdwa_vi has no optional sdwa arguments
+    switch (BasicInstType) {
+    case SIInstrFlags::VOP1:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+      break;
+
+    case SIInstrFlags::VOP2:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+      break;
+
+    case SIInstrFlags::VOPC:
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+      addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+      break;
+
+    default:
+      llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed");
+    }
+  }
+
+  // special case v_mac_{f16, f32}:
+  // it has src2 register operand that is tied to dst operand
+  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa_vi ||
+      Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi)  {
+    auto it = Inst.begin();
+    std::advance(
+        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+    Inst.insert(it, Inst.getOperand(0)); // src2 = dst
+  }
+
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAMDGPUAsmParser() {
+  RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
+  RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+
+// This fuction should be defined after auto-generated include so that we have
+// MatchClassKind enum defined
+unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                                     unsigned Kind) {
+  // Tokens like "glc" would be parsed as immediate operands in ParseOperand().
+  // But MatchInstructionImpl() expects to meet token and fails to validate
+  // operand. This method checks if we are given immediate operand but expect to
+  // get corresponding token.
+  AMDGPUOperand &Operand = (AMDGPUOperand&)Op;
+  switch (Kind) {
+  case MCK_addr64:
+    return Operand.isAddr64() ? Match_Success : Match_InvalidOperand;
+  case MCK_gds:
+    return Operand.isGDS() ? Match_Success : Match_InvalidOperand;
+  case MCK_glc:
+    return Operand.isGLC() ? Match_Success : Match_InvalidOperand;
+  case MCK_idxen:
+    return Operand.isIdxen() ? Match_Success : Match_InvalidOperand;
+  case MCK_offen:
+    return Operand.isOffen() ? Match_Success : Match_InvalidOperand;
+  case MCK_SSrcB32:
+    // When operands have expression values, they will return true for isToken,
+    // because it is not possible to distinguish between a token and an
+    // expression at parse time. MatchInstructionImpl() will always try to
+    // match an operand as a token, when isToken returns true, and when the
+    // name of the expression is not a valid token, the match will fail,
+    // so we need to handle it here.
+    return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand;
+  case MCK_SSrcF32:
+    return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand;
+  case MCK_SoppBrTarget:
+    return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
+  case MCK_VReg32OrOff:
+    return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand;
+  case MCK_InterpSlot:
+    return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand;
+  case MCK_Attr:
+    return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
+  case MCK_AttrChan:
+    return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+  default:
+    return Match_InvalidOperand;
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
new file mode 100644
index 000000000000..45a7fe6d3439
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -0,0 +1,1350 @@
+//===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
+def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
+
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
+def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
+def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
+def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
+
+class MubufLoad <SDPatternOperator op> : PatFrag <
+  (ops node:$ptr), (op node:$ptr), [{
+  auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+def mubuf_load          : MubufLoad <load>;
+def mubuf_az_extloadi8  : MubufLoad <az_extloadi8>;
+def mubuf_sextloadi8    : MubufLoad <sextloadi8>;
+def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>;
+def mubuf_sextloadi16   : MubufLoad <sextloadi16>;
+def mubuf_load_atomic   : MubufLoad <atomic_load>;
+
+def BUFAddrKind {
+  int Offset = 0;
+  int OffEn  = 1;
+  int IdxEn  = 2;
+  int BothEn = 3;
+  int Addr64 = 4;
+}
+
+class getAddrName<int addrKind> {
+  string ret =
+    !if(!eq(addrKind, BUFAddrKind.Offset), "offset",
+    !if(!eq(addrKind, BUFAddrKind.OffEn),  "offen",
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),  "idxen",
+    !if(!eq(addrKind, BUFAddrKind.BothEn), "bothen",
+    !if(!eq(addrKind, BUFAddrKind.Addr64), "addr64",
+    "")))));
+}
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
+  bit IsAddr64 = is_addr64;
+  string OpName = NAME # suffix;
+}
+
+//===----------------------------------------------------------------------===//
+// MTBUF classes
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Pseudo <string opName, dag outs, dag ins,
+                    string asmOps, list<dag> pattern=[]> :
+  InstSI<outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let Size = 8;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+  let MTBUF = 1;
+  let Uses = [EXEC];
+
+  let hasSideEffects = 0;
+  let SchedRW = [WriteVMEM];
+}
+
+class MTBUF_Real <MTBUF_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  Enc64 {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+
+  bits<8> vdata;
+  bits<12> offset;
+  bits<1> offen;
+  bits<1> idxen;
+  bits<1> glc;
+  bits<1> addr64;
+  bits<4> dfmt;
+  bits<3> nfmt;
+  bits<8> vaddr;
+  bits<7> srsrc;
+  bits<1> slc;
+  bits<1> tfe;
+  bits<8> soffset;
+
+  let Inst{11-0}  = offset;
+  let Inst{12}    = offen;
+  let Inst{13}    = idxen;
+  let Inst{14}    = glc;
+  let Inst{22-19} = dfmt;
+  let Inst{25-23} = nfmt;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{54}    = slc;
+  let Inst{55}    = tfe;
+  let Inst{63-56} = soffset;
+}
+
+class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+  opName, (outs regClass:$dst),
+  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+  " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+  " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
+  opName, (outs),
+  (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+       i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
+       SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
+  " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
+  " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+  let mayLoad = 0;
+  let mayStore = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// MUBUF classes
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Pseudo <string opName, dag outs, dag ins,
+                    string asmOps, list<dag> pattern=[]> :
+  InstSI<outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let Size = 8;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+  let MUBUF = 1;
+  let Uses = [EXEC];
+  let hasSideEffects = 0;
+  let SchedRW = [WriteVMEM];
+
+  let AsmMatchConverter = "cvtMubuf";
+
+  bits<1> offen       = 0;
+  bits<1> idxen       = 0;
+  bits<1> addr64      = 0;
+  bits<1> has_vdata   = 1;
+  bits<1> has_vaddr   = 1;
+  bits<1> has_glc     = 1;
+  bits<1> glc_value   = 0; // the value for glc if no such operand
+  bits<1> has_srsrc   = 1;
+  bits<1> has_soffset = 1;
+  bits<1> has_offset  = 1;
+  bits<1> has_slc     = 1;
+  bits<1> has_tfe     = 1;
+}
+
+class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+
+  bits<12> offset;
+  bits<1>  glc;
+  bits<1>  lds = 0;
+  bits<8>  vaddr;
+  bits<8>  vdata;
+  bits<7>  srsrc;
+  bits<1>  slc;
+  bits<1>  tfe;
+  bits<8>  soffset;
+}
+
+
+// For cache invalidation instructions.
+class MUBUF_Invalidate <string opName, SDPatternOperator node> :
+  MUBUF_Pseudo<opName, (outs), (ins), "", [(node)]> {
+
+  let AsmMatchConverter = "";
+
+  let hasSideEffects = 1;
+  let mayStore = 1;
+
+  // Set everything to 0.
+  let offen       = 0;
+  let idxen       = 0;
+  let addr64      = 0;
+  let has_vdata   = 0;
+  let has_vaddr   = 0;
+  let has_glc     = 0;
+  let glc_value   = 0;
+  let has_srsrc   = 0;
+  let has_soffset = 0;
+  let has_offset  = 0;
+  let has_slc     = 0;
+  let has_tfe     = 0;
+}
+
+class getMUBUFInsDA<list<RegisterClass> vdataList,
+                    list<RegisterClass> vaddrList=[]> {
+  RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+  RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  dag InsNoData = !if(!empty(vaddrList),
+    (ins                    SReg_128:$srsrc, SCSrc_b32:$soffset,
+         offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+    (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+         offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+  );
+  dag InsData = !if(!empty(vaddrList),
+    (ins vdataClass:$vdata,                    SReg_128:$srsrc,
+         SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe),
+    (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+         SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe)
+  );
+  dag ret = !if(!empty(vdataList), InsNoData, InsData);
+}
+
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+  dag ret =
+    !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret,
+    !if(!eq(addrKind, BUFAddrKind.OffEn),  getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),  getMUBUFInsDA<vdataList, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret,
+    (ins))))));
+}
+
+class getMUBUFAsmOps<int addrKind> {
+  string Pfx =
+    !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $soffset",
+    !if(!eq(addrKind, BUFAddrKind.OffEn),  "$vaddr, $srsrc, $soffset offen",
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),  "$vaddr, $srsrc, $soffset idxen",
+    !if(!eq(addrKind, BUFAddrKind.BothEn), "$vaddr, $srsrc, $soffset idxen offen",
+    !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc, $soffset addr64",
+    "")))));
+  string ret = Pfx # "$offset";
+}
+
+class MUBUF_SetupAddr<int addrKind> {
+  bits<1> offen  = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+                   !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+  bits<1> idxen  = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+                   !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+  bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+  bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
+}
+
+class MUBUF_Load_Pseudo <string opName,
+                         int addrKind,
+                         RegisterClass vdataClass,
+                         list<dag> pattern=[],
+                         // Workaround bug bz30254
+                         int addrKindCopy = addrKind>
+  : MUBUF_Pseudo<opName,
+                 (outs vdataClass:$vdata),
+                 getMUBUFIns<addrKindCopy>.ret,
+                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+                 pattern>,
+    MUBUF_SetupAddr<addrKindCopy> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+// FIXME: tfe can't be an operand because it requires a separate
+// opcode because it needs an N+1 register class dest register.
+multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+                              ValueType load_vt = i32,
+                              SDPatternOperator ld = null_frag> {
+
+  def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+    [(set load_vt:$vdata,
+     (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+    MUBUFAddr64Table<0>;
+
+  def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+    [(set load_vt:$vdata,
+     (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>,
+    MUBUFAddr64Table<1>;
+
+  def _OFFEN  : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+  def _IDXEN  : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+  def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+    def _OFFEN_exact  : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+    def _IDXEN_exact  : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+    def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+  }
+}
+
+class MUBUF_Store_Pseudo <string opName,
+                          int addrKind,
+                          RegisterClass vdataClass,
+                          list<dag> pattern=[],
+                          // Workaround bug bz30254
+                          int addrKindCopy = addrKind,
+                          RegisterClass vdataClassCopy = vdataClass>
+  : MUBUF_Pseudo<opName,
+                 (outs),
+                 getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+                 " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+                 pattern>,
+    MUBUF_SetupAddr<addrKindCopy> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+  let mayLoad = 0;
+  let mayStore = 1;
+}
+
+multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+                               ValueType store_vt = i32,
+                               SDPatternOperator st = null_frag> {
+
+  def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+    [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                       i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+    MUBUFAddr64Table<0>;
+
+  def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+    [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                       i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>,
+    MUBUFAddr64Table<1>;
+
+  def _OFFEN  : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+  def _IDXEN  : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+  def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+    def _OFFEN_exact  : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+    def _IDXEN_exact  : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+    def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+  }
+}
+
+
+class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
+                          list<RegisterClass> vaddrList=[]> {
+  RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+  dag ret = !if(vdata_in,
+    !if(!empty(vaddrList),
+      (ins vdataClass:$vdata_in,
+           SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+      (ins vdataClass:$vdata_in, vaddrClass:$vaddr,
+           SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+    ),
+    !if(!empty(vaddrList),
+      (ins vdataClass:$vdata,
+           SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc),
+      (ins vdataClass:$vdata, vaddrClass:$vaddr,
+           SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc)
+  ));
+}
+
+class getMUBUFAtomicIns<int addrKind,
+                        RegisterClass vdataClass,
+                        bit vdata_in,
+                        // Workaround bug bz30254
+                        RegisterClass vdataClassCopy=vdataClass> {
+  dag ret =
+    !if(!eq(addrKind, BUFAddrKind.Offset),
+            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret,
+    !if(!eq(addrKind, BUFAddrKind.OffEn),
+            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),
+            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.BothEn),
+            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+    !if(!eq(addrKind, BUFAddrKind.Addr64),
+            getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+    (ins))))));
+}
+
+class MUBUF_Atomic_Pseudo<string opName,
+                          int addrKind,
+                          dag outs,
+                          dag ins,
+                          string asmOps,
+                          list<dag> pattern=[],
+                          // Workaround bug bz30254
+                          int addrKindCopy = addrKind>
+  : MUBUF_Pseudo<opName, outs, ins, asmOps, pattern>,
+    MUBUF_SetupAddr<addrKindCopy> {
+  let mayStore = 1;
+  let mayLoad = 1;
+  let hasPostISelHook = 1;
+  let hasSideEffects = 1;
+  let DisableWQM = 1;
+  let has_glc = 0;
+  let has_tfe = 0;
+}
+
+class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
+                               RegisterClass vdataClass,
+                               list<dag> pattern=[],
+                               // Workaround bug bz30254
+                               int addrKindCopy = addrKind,
+                               RegisterClass vdataClassCopy = vdataClass>
+  : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+                        (outs),
+                        getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
+                        " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$slc",
+                        pattern>,
+    AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
+  let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
+  let glc_value = 0;
+  let AsmMatchConverter = "cvtMubufAtomic";
+}
+
+class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
+                             RegisterClass vdataClass,
+                             list<dag> pattern=[],
+                             // Workaround bug bz30254
+                             int addrKindCopy = addrKind,
+                             RegisterClass vdataClassCopy = vdataClass>
+  : MUBUF_Atomic_Pseudo<opName, addrKindCopy,
+                        (outs vdataClassCopy:$vdata),
+                        getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
+                        " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # " glc$slc",
+                        pattern>,
+    AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
+  let PseudoInstr = opName # "_rtn_" # getAddrName<addrKindCopy>.ret;
+  let glc_value = 1;
+  let Constraints = "$vdata = $vdata_in";
+  let DisableEncoding = "$vdata_in";
+  let AsmMatchConverter = "cvtMubufAtomicReturn";
+}
+
+multiclass MUBUF_Pseudo_Atomics <string opName,
+                                 RegisterClass vdataClass,
+                                 ValueType vdataType,
+                                 SDPatternOperator atomic> {
+
+  def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
+                MUBUFAddr64Table <0>;
+  def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
+                MUBUFAddr64Table <1>;
+  def _OFFEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass>;
+  def _IDXEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass>;
+  def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+  def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+    [(set vdataType:$vdata,
+     (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
+             vdataType:$vdata_in))]>,
+    MUBUFAddr64Table <0, "_RTN">;
+
+  def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+    [(set vdataType:$vdata,
+     (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc),
+             vdataType:$vdata_in))]>,
+    MUBUFAddr64Table <1, "_RTN">;
+
+  def _RTN_OFFEN  : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass>;
+  def _RTN_IDXEN  : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass>;
+  def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads <
+  "buffer_load_format_x", VGPR_32
+>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
+  "buffer_load_format_xy", VReg_64
+>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
+  "buffer_load_format_xyz", VReg_96
+>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
+  "buffer_load_format_xyzw", VReg_128
+>;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
+  "buffer_store_format_x", VGPR_32
+>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
+  "buffer_store_format_xy", VReg_64
+>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
+  "buffer_store_format_xyz", VReg_96
+>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
+  "buffer_store_format_xyzw", VReg_128
+>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads <
+  "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8
+>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads <
+  "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8
+>;
+defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads <
+  "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16
+>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads <
+  "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16
+>;
+defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads <
+  "buffer_load_dword", VGPR_32, i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
+  "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
+  "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
+  "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
+>;
+defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
+  "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+>;
+defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
+  "buffer_store_short", VGPR_32, i32, truncstorei16_global
+>;
+defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
+  "buffer_store_dword", VGPR_32, i32, global_store
+>;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
+  "buffer_store_dwordx2", VReg_64, v2i32, global_store
+>;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
+  "buffer_store_dwordx3", VReg_96, untyped, global_store
+>;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
+  "buffer_store_dwordx4", VReg_128, v4i32, global_store
+>;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+>;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
+  "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
+
+let SubtargetPredicate = isSI in { // isn't on CI & VI
+/*
+defm BUFFER_ATOMIC_RSUB        : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
+defm BUFFER_ATOMIC_FCMPSWAP    : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
+defm BUFFER_ATOMIC_FMIN        : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
+defm BUFFER_ATOMIC_FMAX        : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
+defm BUFFER_ATOMIC_RSUB_X2     : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
+defm BUFFER_ATOMIC_FMIN_X2     : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
+defm BUFFER_ATOMIC_FMAX_X2     : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
+*/
+
+def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
+                                          int_amdgcn_buffer_wbinvl1_sc>;
+}
+
+def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
+                                       int_amdgcn_buffer_wbinvl1>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
+//def TBUFFER_LOAD_FORMAT_X    : MTBUF_ <0, "tbuffer_load_format_x", []>;
+//def TBUFFER_LOAD_FORMAT_XY   : MTBUF_ <1, "tbuffer_load_format_xy", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ  : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
+def TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Load_Pseudo  <"tbuffer_load_format_xyzw", VReg_128>;
+def TBUFFER_STORE_FORMAT_X    : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
+def TBUFFER_STORE_FORMAT_XY   : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ  : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+
+} // End let SubtargetPredicate = isGCN
+
+let SubtargetPredicate = isCIVI in {
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
+
+def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
+                                           int_amdgcn_buffer_wbinvl1_vol>;
+
+} // End let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+// int_SI_vs_load_input
+def : Pat<
+  (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
+>;
+
+// Offset in an 32-bit VGPR
+def : Pat <
+  (SIload_constant v4i32:$sbase, i32:$voff),
+  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// buffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                  string opcode> {
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
+    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
+
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                   string opcode> {
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+                                    (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
+
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
+      $vdata,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
+
+//===----------------------------------------------------------------------===//
+// buffer_atomic patterns
+//===----------------------------------------------------------------------===//
+
+multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
+  def : Pat<
+    (name i32:$vdata_in, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset,
+                                        (as_i16imm $offset), (as_i1imm $slc))
+  >;
+
+  def : Pat<
+    (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
+                                       (as_i16imm $offset), (as_i1imm $slc))
+  >;
+
+  def : Pat<
+    (name i32:$vdata_in, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
+                                       (as_i16imm $offset), (as_i1imm $slc))
+  >;
+
+  def : Pat<
+    (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$slc),
+    (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN)
+      $vdata_in,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+  >;
+}
+
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">;
+defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">;
+
+def : Pat<
+  (int_amdgcn_buffer_atomic_cmpswap
+      i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+      (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+      imm:$slc),
+  (EXTRACT_SUBREG
+    (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET
+      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+    sub0)
+>;
+
+def : Pat<
+  (int_amdgcn_buffer_atomic_cmpswap
+      i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+      (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+      imm:$slc),
+  (EXTRACT_SUBREG
+    (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN
+      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+      $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+    sub0)
+>;
+
+def : Pat<
+  (int_amdgcn_buffer_atomic_cmpswap
+      i32:$data, i32:$cmp, v4i32:$rsrc, 0,
+      (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+      imm:$slc),
+  (EXTRACT_SUBREG
+    (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN
+      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+      $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+    sub0)
+>;
+
+def : Pat<
+  (int_amdgcn_buffer_atomic_cmpswap
+      i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
+      (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+      imm:$slc),
+  (EXTRACT_SUBREG
+    (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN
+      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+    sub0)
+>;
+
+
+class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
+                              PatFrag constant_ld> : Pat <
+     (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                   i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+     (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+  >;
+
+multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
+                                     ValueType vt, PatFrag atomic_ld> {
+  def : Pat <
+     (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                   i16:$offset, i1:$slc))),
+     (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+  >;
+
+  def : Pat <
+    (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
+    (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+  >;
+}
+
+let Predicates = [isSICI] in {
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
+defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
+} // End Predicates = [isSICI]
+
+multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+                               PatFrag ld> {
+
+  def : Pat <
+    (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                          i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+    (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+  >;
+}
+
+let Predicates = [Has16BitInsts] in {
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+
+} // End Predicates = [Has16BitInsts]
+
+class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
+  (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+                        i32:$soffset, u16imm:$offset))),
+  (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+
+// BUFFER_LOAD_DWORD*, addr64=0
+multiclass MUBUF_Load_Dword <ValueType vt,
+                             MUBUF_Pseudo offset,
+                             MUBUF_Pseudo offen,
+                             MUBUF_Pseudo idxen,
+                             MUBUF_Pseudo bothen> {
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
+                                  imm:$offset, 0, 0, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+            (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+           (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+                                  imm:$offset, 0, 1, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
+                                  imm:$offset, 1, 1, imm:$glc, imm:$slc,
+                                  imm:$tfe)),
+    (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+            (as_i1imm $tfe))
+  >;
+}
+
+defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
+                         BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
+defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
+                         BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
+defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
+                         BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
+
+multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
+                                      ValueType vt, PatFrag atomic_st> {
+  // Store follows atomic op convention so address is forst
+  def : Pat <
+     (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                   i16:$offset, i1:$slc), vt:$val),
+     (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0)
+  >;
+
+  def : Pat <
+    (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+    (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0)
+  >;
+}
+let Predicates = [isSICI] in {
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>;
+defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
+} // End Predicates = [isSICI]
+
+
+multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+                               PatFrag st> {
+
+  def : Pat <
+    (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
+    (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+  >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
+
+class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
+  (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+                               u16imm:$offset)),
+  (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
+  (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+                   i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                   imm:$nfmt, imm:$offen, imm:$idxen,
+                   imm:$glc, imm:$slc, imm:$tfe),
+  (opcode
+    $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+
+} // End let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target instructions, move to the appropriate target TD file
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> :
+  MUBUF_Real<op, ps>,
+  Enc64,
+  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+  let AssemblerPredicate=isSICI;
+  let DecoderNamespace="SICI";
+
+  let Inst{11-0}  = !if(ps.has_offset, offset, ?);
+  let Inst{12}    = ps.offen;
+  let Inst{13}    = ps.idxen;
+  let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
+  let Inst{15}    = ps.addr64;
+  let Inst{16}    = lds;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+  let Inst{54}    = !if(ps.has_slc, slc, ?);
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+  let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_si<bits<7> op> {
+  def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+  def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>;
+  def _OFFEN_si  : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+  def _IDXEN_si  : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+  def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> {
+  def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+  def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>;
+  def _RTN_OFFEN_si  : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+  def _RTN_IDXEN_si  : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+  def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X       : MUBUF_Real_AllAddr_si <0x00>;
+defm BUFFER_LOAD_FORMAT_XY      : MUBUF_Real_AllAddr_si <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ     : MUBUF_Real_AllAddr_si <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW    : MUBUF_Real_AllAddr_si <0x03>;
+defm BUFFER_STORE_FORMAT_X      : MUBUF_Real_AllAddr_si <0x04>;
+defm BUFFER_STORE_FORMAT_XY     : MUBUF_Real_AllAddr_si <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ    : MUBUF_Real_AllAddr_si <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW   : MUBUF_Real_AllAddr_si <0x07>;
+defm BUFFER_LOAD_UBYTE          : MUBUF_Real_AllAddr_si <0x08>;
+defm BUFFER_LOAD_SBYTE          : MUBUF_Real_AllAddr_si <0x09>;
+defm BUFFER_LOAD_USHORT         : MUBUF_Real_AllAddr_si <0x0a>;
+defm BUFFER_LOAD_SSHORT         : MUBUF_Real_AllAddr_si <0x0b>;
+defm BUFFER_LOAD_DWORD          : MUBUF_Real_AllAddr_si <0x0c>;
+defm BUFFER_LOAD_DWORDX2        : MUBUF_Real_AllAddr_si <0x0d>;
+defm BUFFER_LOAD_DWORDX4        : MUBUF_Real_AllAddr_si <0x0e>;
+defm BUFFER_LOAD_DWORDX3        : MUBUF_Real_AllAddr_si <0x0f>;
+defm BUFFER_STORE_BYTE          : MUBUF_Real_AllAddr_si <0x18>;
+defm BUFFER_STORE_SHORT         : MUBUF_Real_AllAddr_si <0x1a>;
+defm BUFFER_STORE_DWORD         : MUBUF_Real_AllAddr_si <0x1c>;
+defm BUFFER_STORE_DWORDX2       : MUBUF_Real_AllAddr_si <0x1d>;
+defm BUFFER_STORE_DWORDX4       : MUBUF_Real_AllAddr_si <0x1e>;
+defm BUFFER_STORE_DWORDX3       : MUBUF_Real_AllAddr_si <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP         : MUBUF_Real_Atomic_si <0x30>;
+defm BUFFER_ATOMIC_CMPSWAP      : MUBUF_Real_Atomic_si <0x31>;
+defm BUFFER_ATOMIC_ADD          : MUBUF_Real_Atomic_si <0x32>;
+defm BUFFER_ATOMIC_SUB          : MUBUF_Real_Atomic_si <0x33>;
+//defm BUFFER_ATOMIC_RSUB         : MUBUF_Real_Atomic_si <0x34>;    // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN         : MUBUF_Real_Atomic_si <0x35>;
+defm BUFFER_ATOMIC_UMIN         : MUBUF_Real_Atomic_si <0x36>;
+defm BUFFER_ATOMIC_SMAX         : MUBUF_Real_Atomic_si <0x37>;
+defm BUFFER_ATOMIC_UMAX         : MUBUF_Real_Atomic_si <0x38>;
+defm BUFFER_ATOMIC_AND          : MUBUF_Real_Atomic_si <0x39>;
+defm BUFFER_ATOMIC_OR           : MUBUF_Real_Atomic_si <0x3a>;
+defm BUFFER_ATOMIC_XOR          : MUBUF_Real_Atomic_si <0x3b>;
+defm BUFFER_ATOMIC_INC          : MUBUF_Real_Atomic_si <0x3c>;
+defm BUFFER_ATOMIC_DEC          : MUBUF_Real_Atomic_si <0x3d>;
+
+//defm BUFFER_ATOMIC_FCMPSWAP     : MUBUF_Real_Atomic_si <0x3e>;    // isn't on VI
+//defm BUFFER_ATOMIC_FMIN         : MUBUF_Real_Atomic_si <0x3f>;    // isn't on VI
+//defm BUFFER_ATOMIC_FMAX         : MUBUF_Real_Atomic_si <0x40>;    // isn't on VI
+defm BUFFER_ATOMIC_SWAP_X2      : MUBUF_Real_Atomic_si <0x50>;
+defm BUFFER_ATOMIC_CMPSWAP_X2   : MUBUF_Real_Atomic_si <0x51>;
+defm BUFFER_ATOMIC_ADD_X2       : MUBUF_Real_Atomic_si <0x52>;
+defm BUFFER_ATOMIC_SUB_X2       : MUBUF_Real_Atomic_si <0x53>;
+//defm BUFFER_ATOMIC_RSUB_X2      : MUBUF_Real_Atomic_si <0x54>;    // isn't on CI & VI
+defm BUFFER_ATOMIC_SMIN_X2      : MUBUF_Real_Atomic_si <0x55>;
+defm BUFFER_ATOMIC_UMIN_X2      : MUBUF_Real_Atomic_si <0x56>;
+defm BUFFER_ATOMIC_SMAX_X2      : MUBUF_Real_Atomic_si <0x57>;
+defm BUFFER_ATOMIC_UMAX_X2      : MUBUF_Real_Atomic_si <0x58>;
+defm BUFFER_ATOMIC_AND_X2       : MUBUF_Real_Atomic_si <0x59>;
+defm BUFFER_ATOMIC_OR_X2        : MUBUF_Real_Atomic_si <0x5a>;
+defm BUFFER_ATOMIC_XOR_X2       : MUBUF_Real_Atomic_si <0x5b>;
+defm BUFFER_ATOMIC_INC_X2       : MUBUF_Real_Atomic_si <0x5c>;
+defm BUFFER_ATOMIC_DEC_X2       : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
+//defm BUFFER_ATOMIC_FCMPSWAP_X2  : MUBUF_Real_Atomic_si <0x5e">;   // isn't on VI
+//defm BUFFER_ATOMIC_FMIN_X2      : MUBUF_Real_Atomic_si <0x5f>;    // isn't on VI
+//defm BUFFER_ATOMIC_FMAX_X2      : MUBUF_Real_Atomic_si <0x60>;    // isn't on VI
+
+def BUFFER_WBINVL1_SC_si        : MUBUF_Real_si <0x70, BUFFER_WBINVL1_SC>;
+def BUFFER_WBINVL1_si           : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
+
+class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
+  MTBUF_Real<ps>,
+  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
+  let AssemblerPredicate=isSICI;
+  let DecoderNamespace="SICI";
+
+  bits<1> addr64;
+  let Inst{15}    = addr64;
+  let Inst{18-16} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_si  : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
+def TBUFFER_STORE_FORMAT_X_si    : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_si   : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_si  : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> :
+  MUBUF_Real_si<op, ps> {
+  let AssemblerPredicate=isCIOnly;
+  let DecoderNamespace="CI";
+}
+
+def BUFFER_WBINVL1_VOL_ci : MUBUF_Real_ci <0x70, BUFFER_WBINVL1_VOL>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> :
+  MUBUF_Real<op, ps>,
+  Enc64,
+  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+  let AssemblerPredicate=isVI;
+  let DecoderNamespace="VI";
+
+  let Inst{11-0}  = !if(ps.has_offset, offset, ?);
+  let Inst{12}    = ps.offen;
+  let Inst{13}    = ps.idxen;
+  let Inst{14}    = !if(ps.has_glc, glc, ps.glc_value);
+  let Inst{16}    = lds;
+  let Inst{17}    = !if(ps.has_slc, slc, ?);
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+  let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+  let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+  let Inst{55}    = !if(ps.has_tfe, tfe, ?);
+  let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
+}
+
+multiclass MUBUF_Real_AllAddr_vi<bits<7> op> {
+  def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>;
+  def _OFFEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>;
+  def _IDXEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>;
+  def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+
+multiclass MUBUF_Real_Atomic_vi<bits<7> op> :
+  MUBUF_Real_AllAddr_vi<op> {
+  def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>;
+  def _RTN_OFFEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>;
+  def _RTN_IDXEN_vi  : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>;
+  def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>;
+}
+
+defm BUFFER_LOAD_FORMAT_X       : MUBUF_Real_AllAddr_vi <0x00>;
+defm BUFFER_LOAD_FORMAT_XY      : MUBUF_Real_AllAddr_vi <0x01>;
+defm BUFFER_LOAD_FORMAT_XYZ     : MUBUF_Real_AllAddr_vi <0x02>;
+defm BUFFER_LOAD_FORMAT_XYZW    : MUBUF_Real_AllAddr_vi <0x03>;
+defm BUFFER_STORE_FORMAT_X      : MUBUF_Real_AllAddr_vi <0x04>;
+defm BUFFER_STORE_FORMAT_XY     : MUBUF_Real_AllAddr_vi <0x05>;
+defm BUFFER_STORE_FORMAT_XYZ    : MUBUF_Real_AllAddr_vi <0x06>;
+defm BUFFER_STORE_FORMAT_XYZW   : MUBUF_Real_AllAddr_vi <0x07>;
+defm BUFFER_LOAD_UBYTE          : MUBUF_Real_AllAddr_vi <0x10>;
+defm BUFFER_LOAD_SBYTE          : MUBUF_Real_AllAddr_vi <0x11>;
+defm BUFFER_LOAD_USHORT         : MUBUF_Real_AllAddr_vi <0x12>;
+defm BUFFER_LOAD_SSHORT         : MUBUF_Real_AllAddr_vi <0x13>;
+defm BUFFER_LOAD_DWORD          : MUBUF_Real_AllAddr_vi <0x14>;
+defm BUFFER_LOAD_DWORDX2        : MUBUF_Real_AllAddr_vi <0x15>;
+defm BUFFER_LOAD_DWORDX3        : MUBUF_Real_AllAddr_vi <0x16>;
+defm BUFFER_LOAD_DWORDX4        : MUBUF_Real_AllAddr_vi <0x17>;
+defm BUFFER_STORE_BYTE          : MUBUF_Real_AllAddr_vi <0x18>;
+defm BUFFER_STORE_SHORT         : MUBUF_Real_AllAddr_vi <0x1a>;
+defm BUFFER_STORE_DWORD         : MUBUF_Real_AllAddr_vi <0x1c>;
+defm BUFFER_STORE_DWORDX2       : MUBUF_Real_AllAddr_vi <0x1d>;
+defm BUFFER_STORE_DWORDX3       : MUBUF_Real_AllAddr_vi <0x1e>;
+defm BUFFER_STORE_DWORDX4       : MUBUF_Real_AllAddr_vi <0x1f>;
+
+defm BUFFER_ATOMIC_SWAP         : MUBUF_Real_Atomic_vi <0x40>;
+defm BUFFER_ATOMIC_CMPSWAP      : MUBUF_Real_Atomic_vi <0x41>;
+defm BUFFER_ATOMIC_ADD          : MUBUF_Real_Atomic_vi <0x42>;
+defm BUFFER_ATOMIC_SUB          : MUBUF_Real_Atomic_vi <0x43>;
+defm BUFFER_ATOMIC_SMIN         : MUBUF_Real_Atomic_vi <0x44>;
+defm BUFFER_ATOMIC_UMIN         : MUBUF_Real_Atomic_vi <0x45>;
+defm BUFFER_ATOMIC_SMAX         : MUBUF_Real_Atomic_vi <0x46>;
+defm BUFFER_ATOMIC_UMAX         : MUBUF_Real_Atomic_vi <0x47>;
+defm BUFFER_ATOMIC_AND          : MUBUF_Real_Atomic_vi <0x48>;
+defm BUFFER_ATOMIC_OR           : MUBUF_Real_Atomic_vi <0x49>;
+defm BUFFER_ATOMIC_XOR          : MUBUF_Real_Atomic_vi <0x4a>;
+defm BUFFER_ATOMIC_INC          : MUBUF_Real_Atomic_vi <0x4b>;
+defm BUFFER_ATOMIC_DEC          : MUBUF_Real_Atomic_vi <0x4c>;
+
+defm BUFFER_ATOMIC_SWAP_X2      : MUBUF_Real_Atomic_vi <0x60>;
+defm BUFFER_ATOMIC_CMPSWAP_X2   : MUBUF_Real_Atomic_vi <0x61>;
+defm BUFFER_ATOMIC_ADD_X2       : MUBUF_Real_Atomic_vi <0x62>;
+defm BUFFER_ATOMIC_SUB_X2       : MUBUF_Real_Atomic_vi <0x63>;
+defm BUFFER_ATOMIC_SMIN_X2      : MUBUF_Real_Atomic_vi <0x64>;
+defm BUFFER_ATOMIC_UMIN_X2      : MUBUF_Real_Atomic_vi <0x65>;
+defm BUFFER_ATOMIC_SMAX_X2      : MUBUF_Real_Atomic_vi <0x66>;
+defm BUFFER_ATOMIC_UMAX_X2      : MUBUF_Real_Atomic_vi <0x67>;
+defm BUFFER_ATOMIC_AND_X2       : MUBUF_Real_Atomic_vi <0x68>;
+defm BUFFER_ATOMIC_OR_X2        : MUBUF_Real_Atomic_vi <0x69>;
+defm BUFFER_ATOMIC_XOR_X2       : MUBUF_Real_Atomic_vi <0x6a>;
+defm BUFFER_ATOMIC_INC_X2       : MUBUF_Real_Atomic_vi <0x6b>;
+defm BUFFER_ATOMIC_DEC_X2       : MUBUF_Real_Atomic_vi <0x6c>;
+
+def BUFFER_WBINVL1_vi           : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
+def BUFFER_WBINVL1_VOL_vi       : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
+
+class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
+  MTBUF_Real<ps>,
+  SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
+  let AssemblerPredicate=isVI;
+  let DecoderNamespace="VI";
+
+  let Inst{18-15} = op;
+}
+
+def TBUFFER_LOAD_FORMAT_XYZW_vi  : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
+def TBUFFER_STORE_FORMAT_X_vi    : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
+def TBUFFER_STORE_FORMAT_XY_vi   : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
+def TBUFFER_STORE_FORMAT_XYZ_vi  : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
+def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
new file mode 100644
index 000000000000..26a483a8abf6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -0,0 +1,15 @@
+//===-- CIInstructions.td - CI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER
+\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
new file mode 100644
index 000000000000..6b8e85a73c73
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -0,0 +1,222 @@
+//===-- CaymanInstructions.td - CM Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available only on Cayman
+// family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isCayman : Predicate<"Subtarget->hasCaymanISA()">;
+
+//===----------------------------------------------------------------------===//
+// Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isCayman] in {
+
+def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
+  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
+  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
+>;
+
+def : IMad24Pat<MULADD_INT24_cm>;
+
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def MULHI_INT_cm24 : MULHI_INT24_Common<0x5c>;
+def MULHI_UINT_cm24 : MULHI_UINT24_Common<0xb2>;
+
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+
+// RECIP_UINT emulation for Cayman
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
+                            (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+>;
+
+  def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+    let ADDR = 0;
+    let POP_COUNT = 0;
+    let COUNT = 0;
+  }
+
+
+def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
+
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+  CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+                        (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+                        "STORE_DWORD $rw_gpr, $index_gpr",
+                        [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+  let eop = 0; // This bit is not used on Cayman.
+}
+
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+
+def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> {
+  let eop = 0; // This bit is not used on Cayman.
+}
+
+class VTX_READ_cm <string name, dag outs>
+    : VTX_WORD0_cm, VTX_READ<name, outs, []> {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+  let SRC_SEL_Y = 0;
+  let STRUCTURED_READ = 0;
+  let LDS_REQ = 0;
+  let COALESCED_READ = 0;
+
+  let Inst{31-0} = Word0;
+}
+
+def VTX_READ_8_cm
+    : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+def VTX_READ_16_cm
+    : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+def VTX_READ_32_cm
+    : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $src_gpr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+def VTX_READ_64_cm
+    : VTX_READ_cm <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+                   (outs R600_Reg64:$dst_gpr)> {
+
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+def VTX_READ_128_cm
+    : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+                   (outs R600_Reg128:$dst_gpr)> {
+
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_cm MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_cm MEMxi:$src_gpr, 3)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_cm MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_cm MEMxi:$src_gpr, 2)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_cm MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_cm MEMxi:$src_gpr, 1)>;
+
+} // End isCayman
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
new file mode 100644
index 000000000000..a077001df6bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -0,0 +1,906 @@
+//===-- DSInstructions.td - DS Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
+  InstSI <outs, ins, "", pattern>,
+  SIMCInstr <opName, SIEncodingFamily.NONE> {
+
+  let SubtargetPredicate = isGCN;
+
+  let LGKM_CNT = 1;
+  let DS = 1;
+  let Size = 8;
+  let UseNamedOperandTable = 1;
+  let Uses = [M0, EXEC];
+
+  // Most instruction load and store data, so set this as the default.
+  let mayLoad = 1;
+  let mayStore = 1;
+
+  let hasSideEffects = 0;
+  let SchedRW = [WriteLDS];
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+
+  let AsmMatchConverter = "cvtDS";
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  // Well these bits a kind of hack because it would be more natural
+  // to test "outs" and "ins" dags for the presence of particular operands
+  bits<1> has_vdst = 1;
+  bits<1> has_addr = 1;
+  bits<1> has_data0 = 1;
+  bits<1> has_data1 = 1;
+
+  bits<1> has_offset  = 1; // has "offset" that should be split to offset0,1
+  bits<1> has_offset0 = 1;
+  bits<1> has_offset1 = 1;
+
+  bits<1> has_gds = 1;
+  bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value
+}
+
+class DS_Real <DS_Pseudo ds> :
+  InstSI <ds.OutOperandList, ds.InOperandList, ds.Mnemonic # " " # ds.AsmOperands, []>,
+  Enc64 {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ds.SubtargetPredicate;
+  let AsmMatchConverter  = ds.AsmMatchConverter;
+
+  // encoding fields
+  bits<8> vdst;
+  bits<1> gds;
+  bits<8> addr;
+  bits<8> data0;
+  bits<8> data1;
+  bits<8> offset0;
+  bits<8> offset1;
+
+  bits<16> offset;
+  let offset0 = !if(ds.has_offset, offset{7-0}, ?);
+  let offset1 = !if(ds.has_offset, offset{15-8}, ?);
+}
+
+
+// DS Pseudo instructions
+
+class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+  "$addr, $data0$offset$gds">,
+  AtomicNoRet<opName, 0> {
+
+  let has_data1 = 0;
+  let has_vdst = 0;
+}
+
+class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
+  "$addr $offset0$offset1$gds"> {
+
+  let has_data0 = 0;
+  let has_data1 = 0;
+  let has_vdst  = 0;
+  let has_offset = 0;
+  let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds),
+  "$addr, $data0, $data1"#"$offset"#"$gds">,
+  AtomicNoRet<opName, 0> {
+
+  let has_vdst = 0;
+}
+
+class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, rc:$data0, rc:$data1,
+       offset0:$offset0, offset1:$offset1, gds:$gds),
+  "$addr, $data0, $data1$offset0$offset1$gds"> {
+
+  let has_vdst = 0;
+  let has_offset = 0;
+  let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs rc:$vdst),
+  (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds),
+  "$vdst, $addr, $data0$offset$gds"> {
+
+  let hasPostISelHook = 1;
+  let has_data1 = 0;
+}
+
+class DS_1A2D_RET<string opName,
+                  RegisterClass rc = VGPR_32,
+                  RegisterClass src = rc>
+: DS_Pseudo<opName,
+  (outs rc:$vdst),
+  (ins VGPR_32:$addr, src:$data0, src:$data1, offset:$offset, gds:$gds),
+  "$vdst, $addr, $data0, $data1$offset$gds"> {
+
+  let hasPostISelHook = 1;
+}
+
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs rc:$vdst),
+  (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+  "$vdst, $addr$offset$gds"> {
+
+  let has_data0 = 0;
+  let has_data1 = 0;
+}
+
+class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+  (outs rc:$vdst),
+  (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
+  "$vdst, $addr$offset0$offset1$gds"> {
+
+  let has_offset = 0;
+  let has_data0 = 0;
+  let has_data1 = 0;
+  let AsmMatchConverter = "cvtDSOffset01";
+}
+
+class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
+  (outs VGPR_32:$vdst),
+  (ins VGPR_32:$addr, offset:$offset),
+  "$vdst, $addr$offset gds"> {
+
+  let has_data0 = 0;
+  let has_data1 = 0;
+  let has_gds = 0;
+  let gdsValue = 1;
+}
+
+class DS_0A_RET <string opName> : DS_Pseudo<opName,
+  (outs VGPR_32:$vdst),
+  (ins offset:$offset, gds:$gds),
+  "$vdst$offset$gds"> {
+
+  let mayLoad = 1;
+  let mayStore = 1;
+
+  let has_addr = 0;
+  let has_data0 = 0;
+  let has_data1 = 0;
+}
+
+class DS_1A <string opName> : DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+  "$addr$offset$gds"> {
+
+  let mayLoad = 1;
+  let mayStore = 1;
+
+  let has_vdst = 0;
+  let has_data0 = 0;
+  let has_data1 = 0;
+}
+
+class DS_1A_GDS <string opName> : DS_Pseudo<opName,
+  (outs),
+  (ins VGPR_32:$addr),
+  "$addr gds"> {
+
+  let has_vdst    = 0;
+  let has_data0   = 0;
+  let has_data1   = 0;
+  let has_offset  = 0;
+  let has_offset0 = 0;
+  let has_offset1 = 0;
+
+  let has_gds     = 0;
+  let gdsValue    = 1;
+}
+
+class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
+: DS_Pseudo<opName,
+  (outs VGPR_32:$vdst),
+  (ins VGPR_32:$addr, VGPR_32:$data0, offset:$offset),
+  "$vdst, $addr, $data0$offset",
+  [(set i32:$vdst,
+   (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let isConvergent = 1;
+
+  let has_data1 = 0;
+  let has_gds = 0;
+}
+
+def DS_ADD_U32        : DS_1A1D_NORET<"ds_add_u32">;
+def DS_SUB_U32        : DS_1A1D_NORET<"ds_sub_u32">;
+def DS_RSUB_U32       : DS_1A1D_NORET<"ds_rsub_u32">;
+def DS_INC_U32        : DS_1A1D_NORET<"ds_inc_u32">;
+def DS_DEC_U32        : DS_1A1D_NORET<"ds_dec_u32">;
+def DS_MIN_I32        : DS_1A1D_NORET<"ds_min_i32">;
+def DS_MAX_I32        : DS_1A1D_NORET<"ds_max_i32">;
+def DS_MIN_U32        : DS_1A1D_NORET<"ds_min_u32">;
+def DS_MAX_U32        : DS_1A1D_NORET<"ds_max_u32">;
+def DS_AND_B32        : DS_1A1D_NORET<"ds_and_b32">;
+def DS_OR_B32         : DS_1A1D_NORET<"ds_or_b32">;
+def DS_XOR_B32        : DS_1A1D_NORET<"ds_xor_b32">;
+def DS_ADD_F32        : DS_1A1D_NORET<"ds_add_f32">;
+def DS_MIN_F32        : DS_1A1D_NORET<"ds_min_f32">;
+def DS_MAX_F32        : DS_1A1D_NORET<"ds_max_f32">;
+
+let mayLoad = 0 in {
+def DS_WRITE_B8       : DS_1A1D_NORET<"ds_write_b8">;
+def DS_WRITE_B16      : DS_1A1D_NORET<"ds_write_b16">;
+def DS_WRITE_B32      : DS_1A1D_NORET<"ds_write_b32">;
+def DS_WRITE2_B32     : DS_1A2D_Off8_NORET<"ds_write2_b32">;
+def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">;
+}
+
+def DS_MSKOR_B32      : DS_1A2D_NORET<"ds_mskor_b32">;
+def DS_CMPST_B32      : DS_1A2D_NORET<"ds_cmpst_b32">;
+def DS_CMPST_F32      : DS_1A2D_NORET<"ds_cmpst_f32">;
+
+def DS_ADD_U64        : DS_1A1D_NORET<"ds_add_u64", VReg_64>;
+def DS_SUB_U64        : DS_1A1D_NORET<"ds_sub_u64", VReg_64>;
+def DS_RSUB_U64       : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>;
+def DS_INC_U64        : DS_1A1D_NORET<"ds_inc_u64", VReg_64>;
+def DS_DEC_U64        : DS_1A1D_NORET<"ds_dec_u64", VReg_64>;
+def DS_MIN_I64        : DS_1A1D_NORET<"ds_min_i64", VReg_64>;
+def DS_MAX_I64        : DS_1A1D_NORET<"ds_max_i64", VReg_64>;
+def DS_MIN_U64        : DS_1A1D_NORET<"ds_min_u64", VReg_64>;
+def DS_MAX_U64        : DS_1A1D_NORET<"ds_max_u64", VReg_64>;
+def DS_AND_B64        : DS_1A1D_NORET<"ds_and_b64", VReg_64>;
+def DS_OR_B64         : DS_1A1D_NORET<"ds_or_b64", VReg_64>;
+def DS_XOR_B64        : DS_1A1D_NORET<"ds_xor_b64", VReg_64>;
+def DS_MSKOR_B64      : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>;
+let mayLoad = 0 in {
+def DS_WRITE_B64      : DS_1A1D_NORET<"ds_write_b64", VReg_64>;
+def DS_WRITE2_B64     : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>;
+def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>;
+}
+def DS_CMPST_B64      : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>;
+def DS_CMPST_F64      : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>;
+def DS_MIN_F64        : DS_1A1D_NORET<"ds_min_f64", VReg_64>;
+def DS_MAX_F64        : DS_1A1D_NORET<"ds_max_f64", VReg_64>;
+
+def DS_ADD_RTN_U32    : DS_1A1D_RET<"ds_add_rtn_u32">,
+                        AtomicNoRet<"ds_add_u32", 1>;
+def DS_ADD_RTN_F32    : DS_1A1D_RET<"ds_add_rtn_f32">,
+                        AtomicNoRet<"ds_add_f32", 1>;
+def DS_SUB_RTN_U32    : DS_1A1D_RET<"ds_sub_rtn_u32">,
+                        AtomicNoRet<"ds_sub_u32", 1>;
+def DS_RSUB_RTN_U32   : DS_1A1D_RET<"ds_rsub_rtn_u32">,
+                        AtomicNoRet<"ds_rsub_u32", 1>;
+def DS_INC_RTN_U32    : DS_1A1D_RET<"ds_inc_rtn_u32">,
+                        AtomicNoRet<"ds_inc_u32", 1>;
+def DS_DEC_RTN_U32    : DS_1A1D_RET<"ds_dec_rtn_u32">,
+                        AtomicNoRet<"ds_dec_u32", 1>;
+def DS_MIN_RTN_I32    : DS_1A1D_RET<"ds_min_rtn_i32">,
+                        AtomicNoRet<"ds_min_i32", 1>;
+def DS_MAX_RTN_I32    : DS_1A1D_RET<"ds_max_rtn_i32">,
+                        AtomicNoRet<"ds_max_i32", 1>;
+def DS_MIN_RTN_U32    : DS_1A1D_RET<"ds_min_rtn_u32">,
+                        AtomicNoRet<"ds_min_u32", 1>;
+def DS_MAX_RTN_U32    : DS_1A1D_RET<"ds_max_rtn_u32">,
+                        AtomicNoRet<"ds_max_u32", 1>;
+def DS_AND_RTN_B32    : DS_1A1D_RET<"ds_and_rtn_b32">,
+                        AtomicNoRet<"ds_and_b32", 1>;
+def DS_OR_RTN_B32     : DS_1A1D_RET<"ds_or_rtn_b32">,
+                        AtomicNoRet<"ds_or_b32", 1>;
+def DS_XOR_RTN_B32    : DS_1A1D_RET<"ds_xor_rtn_b32">,
+                        AtomicNoRet<"ds_xor_b32", 1>;
+def DS_MSKOR_RTN_B32  : DS_1A2D_RET<"ds_mskor_rtn_b32">,
+                        AtomicNoRet<"ds_mskor_b32", 1>;
+def DS_CMPST_RTN_B32  : DS_1A2D_RET <"ds_cmpst_rtn_b32">,
+                        AtomicNoRet<"ds_cmpst_b32", 1>;
+def DS_CMPST_RTN_F32  : DS_1A2D_RET <"ds_cmpst_rtn_f32">,
+                        AtomicNoRet<"ds_cmpst_f32", 1>;
+def DS_MIN_RTN_F32    : DS_1A1D_RET <"ds_min_rtn_f32">,
+                        AtomicNoRet<"ds_min_f32", 1>;
+def DS_MAX_RTN_F32    : DS_1A1D_RET <"ds_max_rtn_f32">,
+                        AtomicNoRet<"ds_max_f32", 1>;
+
+def DS_WRXCHG_RTN_B32      : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
+                             AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B32     : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+                             AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+                             AtomicNoRet<"", 1>;
+
+def DS_ADD_RTN_U64    : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_add_u64", 1>;
+def DS_SUB_RTN_U64    : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_sub_u64", 1>;
+def DS_RSUB_RTN_U64   : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_rsub_u64", 1>;
+def DS_INC_RTN_U64    : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_inc_u64", 1>;
+def DS_DEC_RTN_U64    : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_dec_u64", 1>;
+def DS_MIN_RTN_I64    : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>,
+                        AtomicNoRet<"ds_min_i64", 1>;
+def DS_MAX_RTN_I64    : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>,
+                        AtomicNoRet<"ds_max_i64", 1>;
+def DS_MIN_RTN_U64    : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_min_u64", 1>;
+def DS_MAX_RTN_U64    : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>,
+                        AtomicNoRet<"ds_max_u64", 1>;
+def DS_AND_RTN_B64    : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>,
+                        AtomicNoRet<"ds_and_b64", 1>;
+def DS_OR_RTN_B64     : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>,
+                        AtomicNoRet<"ds_or_b64", 1>;
+def DS_XOR_RTN_B64    : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>,
+                        AtomicNoRet<"ds_xor_b64", 1>;
+def DS_MSKOR_RTN_B64  : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>,
+                        AtomicNoRet<"ds_mskor_b64", 1>;
+def DS_CMPST_RTN_B64  : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>,
+                        AtomicNoRet<"ds_cmpst_b64", 1>;
+def DS_CMPST_RTN_F64  : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>,
+                        AtomicNoRet<"ds_cmpst_f64", 1>;
+def DS_MIN_RTN_F64    : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>,
+                        AtomicNoRet<"ds_min_f64", 1>;
+def DS_MAX_RTN_F64    : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
+                        AtomicNoRet<"ds_max_f64", 1>;
+
+def DS_WRXCHG_RTN_B64      : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
+                             AtomicNoRet<"ds_wrxchg_b64", 1>;
+def DS_WRXCHG2_RTN_B64     : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+                             AtomicNoRet<"ds_wrxchg2_b64", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+                             AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
+
+def DS_GWS_INIT       : DS_1A_GDS<"ds_gws_init">;
+def DS_GWS_SEMA_V     : DS_1A_GDS<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR    : DS_1A_GDS<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P     : DS_1A_GDS<"ds_gws_sema_p">;
+def DS_GWS_BARRIER    : DS_1A_GDS<"ds_gws_barrier">;
+
+def DS_ADD_SRC2_U32   : DS_1A<"ds_add_src2_u32">;
+def DS_SUB_SRC2_U32   : DS_1A<"ds_sub_src2_u32">;
+def DS_RSUB_SRC2_U32  : DS_1A<"ds_rsub_src2_u32">;
+def DS_INC_SRC2_U32   : DS_1A<"ds_inc_src2_u32">;
+def DS_DEC_SRC2_U32   : DS_1A<"ds_dec_src2_u32">;
+def DS_MIN_SRC2_I32   : DS_1A<"ds_min_src2_i32">;
+def DS_MAX_SRC2_I32   : DS_1A<"ds_max_src2_i32">;
+def DS_MIN_SRC2_U32   : DS_1A<"ds_min_src2_u32">;
+def DS_MAX_SRC2_U32   : DS_1A<"ds_max_src2_u32">;
+def DS_AND_SRC2_B32   : DS_1A<"ds_and_src_b32">;
+def DS_OR_SRC2_B32    : DS_1A<"ds_or_src2_b32">;
+def DS_XOR_SRC2_B32   : DS_1A<"ds_xor_src2_b32">;
+def DS_MIN_SRC2_F32   : DS_1A<"ds_min_src2_f32">;
+def DS_MAX_SRC2_F32   : DS_1A<"ds_max_src2_f32">;
+
+def DS_ADD_SRC2_U64   : DS_1A<"ds_add_src2_u64">;
+def DS_SUB_SRC2_U64   : DS_1A<"ds_sub_src2_u64">;
+def DS_RSUB_SRC2_U64  : DS_1A<"ds_rsub_src2_u64">;
+def DS_INC_SRC2_U64   : DS_1A<"ds_inc_src2_u64">;
+def DS_DEC_SRC2_U64   : DS_1A<"ds_dec_src2_u64">;
+def DS_MIN_SRC2_I64   : DS_1A<"ds_min_src2_i64">;
+def DS_MAX_SRC2_I64   : DS_1A<"ds_max_src2_i64">;
+def DS_MIN_SRC2_U64   : DS_1A<"ds_min_src2_u64">;
+def DS_MAX_SRC2_U64   : DS_1A<"ds_max_src2_u64">;
+def DS_AND_SRC2_B64   : DS_1A<"ds_and_src2_b64">;
+def DS_OR_SRC2_B64    : DS_1A<"ds_or_src2_b64">;
+def DS_XOR_SRC2_B64   : DS_1A<"ds_xor_src2_b64">;
+def DS_MIN_SRC2_F64   : DS_1A<"ds_min_src2_f64">;
+def DS_MAX_SRC2_F64   : DS_1A<"ds_max_src2_f64">;
+
+def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">;
+def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">;
+
+let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+}
+
+let mayStore = 0 in {
+def DS_READ_I8       : DS_1A_RET<"ds_read_i8">;
+def DS_READ_U8       : DS_1A_RET<"ds_read_u8">;
+def DS_READ_I16      : DS_1A_RET<"ds_read_i16">;
+def DS_READ_U16      : DS_1A_RET<"ds_read_u16">;
+def DS_READ_B32      : DS_1A_RET<"ds_read_b32">;
+def DS_READ_B64      : DS_1A_RET<"ds_read_b64", VReg_64>;
+
+def DS_READ2_B32     : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>;
+def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>;
+
+def DS_READ2_B64     : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
+def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
+}
+
+let SubtargetPredicate = isSICI in {
+def DS_CONSUME       : DS_0A_RET<"ds_consume">;
+def DS_APPEND        : DS_0A_RET<"ds_append">;
+def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for CI and newer.
+//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+
+let SubtargetPredicate = isCIVI in {
+
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
+                      AtomicNoRet<"ds_wrap_f32", 1>;
+
+} // let SubtargetPredicate = isCIVI
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isVI in {
+
+let Uses = [EXEC] in {
+def DS_PERMUTE_B32  : DS_1A1D_PERMUTE <"ds_permute_b32",
+                                       int_amdgcn_ds_permute>;
+def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
+                                       int_amdgcn_ds_bpermute>;
+}
+
+} // let SubtargetPredicate = isVI
+
+//===----------------------------------------------------------------------===//
+// DS Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+def : Pat <
+  (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+  (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
+>;
+
+class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+  (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+  (inst $ptr, (as_i16imm $offset), (i1 0))
+>;
+
+def : DSReadPat <DS_READ_I8,  i32, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8,  i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I8,  i16, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8,  i16, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_U16, i16, si_load_local>;
+def : DSReadPat <DS_READ_B32, i32, si_load_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
+
+} // End AddedComplexity = 100
+
+def : Pat <
+  (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                    i8:$offset1))),
+  (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
+>;
+
+class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+  (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
+  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
+def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
+} // End AddedComplexity = 100
+
+def : Pat <
+  (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                               i8:$offset1)),
+  (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+                       (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+                       (i1 0))
+>;
+
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+>;
+
+class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
+  (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
+>;
+
+
+// 32-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
+
+// 64-bit atomics.
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
+
+} // let Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Real instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_si <bits<8> op, DS_Pseudo ds> :
+  DS_Real <ds>,
+  SIMCInstr <ds.Mnemonic, SIEncodingFamily.SI> {
+  let AssemblerPredicates=[isSICI];
+  let DecoderNamespace="SICI";
+
+  // encoding
+  let Inst{7-0}   = !if(ds.has_offset0, offset0, 0);
+  let Inst{15-8}  = !if(ds.has_offset1, offset1, 0);
+  let Inst{17}    = !if(ds.has_gds, gds, ds.gdsValue);
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x36; // ds prefix
+  let Inst{39-32} = !if(ds.has_addr, addr, 0);
+  let Inst{47-40} = !if(ds.has_data0, data0, 0);
+  let Inst{55-48} = !if(ds.has_data1, data1, 0);
+  let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_si         : DS_Real_si<0x0,  DS_ADD_U32>;
+def DS_SUB_U32_si         : DS_Real_si<0x1,  DS_SUB_U32>;
+def DS_RSUB_U32_si        : DS_Real_si<0x2,  DS_RSUB_U32>;
+def DS_INC_U32_si         : DS_Real_si<0x3,  DS_INC_U32>;
+def DS_DEC_U32_si         : DS_Real_si<0x4,  DS_DEC_U32>;
+def DS_MIN_I32_si         : DS_Real_si<0x5,  DS_MIN_I32>;
+def DS_MAX_I32_si         : DS_Real_si<0x6,  DS_MAX_I32>;
+def DS_MIN_U32_si         : DS_Real_si<0x7,  DS_MIN_U32>;
+def DS_MAX_U32_si         : DS_Real_si<0x8,  DS_MAX_U32>;
+def DS_AND_B32_si         : DS_Real_si<0x9,  DS_AND_B32>;
+def DS_OR_B32_si          : DS_Real_si<0xa,  DS_OR_B32>;
+def DS_XOR_B32_si         : DS_Real_si<0xb,  DS_XOR_B32>;
+def DS_MSKOR_B32_si       : DS_Real_si<0xc,  DS_MSKOR_B32>;
+def DS_WRITE_B32_si       : DS_Real_si<0xd,  DS_WRITE_B32>;
+def DS_WRITE2_B32_si      : DS_Real_si<0xe,  DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_si  : DS_Real_si<0xf,  DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_si       : DS_Real_si<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_si       : DS_Real_si<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_si         : DS_Real_si<0x12, DS_MIN_F32>;
+def DS_MAX_F32_si         : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_GWS_INIT_si        : DS_Real_si<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_si      : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_si     : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_si      : DS_Real_si<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_si     : DS_Real_si<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_si        : DS_Real_si<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_si       : DS_Real_si<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_si     : DS_Real_si<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_si     : DS_Real_si<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_si    : DS_Real_si<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_si     : DS_Real_si<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_si     : DS_Real_si<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_si     : DS_Real_si<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_si     : DS_Real_si<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_si     : DS_Real_si<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_si     : DS_Real_si<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_si     : DS_Real_si<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_si      : DS_Real_si<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_si     : DS_Real_si<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_si   : DS_Real_si<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_si  : DS_Real_si<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_si : DS_Real_si<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_si : DS_Real_si<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_si   : DS_Real_si<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_si   : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_si     : DS_Real_si<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_si     : DS_Real_si<0x33, DS_MAX_RTN_F32>;
+
+// FIXME: this instruction is actually CI/VI
+def DS_WRAP_RTN_F32_si    : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+
+def DS_SWIZZLE_B32_si     : DS_Real_si<0x35, DS_SWIZZLE_B32>;
+def DS_READ_B32_si        : DS_Real_si<0x36, DS_READ_B32>;
+def DS_READ2_B32_si       : DS_Real_si<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_si   : DS_Real_si<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_si         : DS_Real_si<0x39, DS_READ_I8>;
+def DS_READ_U8_si         : DS_Real_si<0x3a, DS_READ_U8>;
+def DS_READ_I16_si        : DS_Real_si<0x3b, DS_READ_I16>;
+def DS_READ_U16_si        : DS_Real_si<0x3c, DS_READ_U16>;
+def DS_CONSUME_si         : DS_Real_si<0x3d, DS_CONSUME>;
+def DS_APPEND_si          : DS_Real_si<0x3e, DS_APPEND>;
+def DS_ORDERED_COUNT_si   : DS_Real_si<0x3f, DS_ORDERED_COUNT>;
+def DS_ADD_U64_si         : DS_Real_si<0x40, DS_ADD_U64>;
+def DS_SUB_U64_si         : DS_Real_si<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_si        : DS_Real_si<0x42, DS_RSUB_U64>;
+def DS_INC_U64_si         : DS_Real_si<0x43, DS_INC_U64>;
+def DS_DEC_U64_si         : DS_Real_si<0x44, DS_DEC_U64>;
+def DS_MIN_I64_si         : DS_Real_si<0x45, DS_MIN_I64>;
+def DS_MAX_I64_si         : DS_Real_si<0x46, DS_MAX_I64>;
+def DS_MIN_U64_si         : DS_Real_si<0x47, DS_MIN_U64>;
+def DS_MAX_U64_si         : DS_Real_si<0x48, DS_MAX_U64>;
+def DS_AND_B64_si         : DS_Real_si<0x49, DS_AND_B64>;
+def DS_OR_B64_si          : DS_Real_si<0x4a, DS_OR_B64>;
+def DS_XOR_B64_si         : DS_Real_si<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_si       : DS_Real_si<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_si       : DS_Real_si<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_si      : DS_Real_si<0x4E, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_si  : DS_Real_si<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_si       : DS_Real_si<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_si       : DS_Real_si<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_si         : DS_Real_si<0x52, DS_MIN_F64>;
+def DS_MAX_F64_si         : DS_Real_si<0x53, DS_MAX_F64>;
+
+def DS_ADD_RTN_U64_si     : DS_Real_si<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_si     : DS_Real_si<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_si    : DS_Real_si<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_si     : DS_Real_si<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_si     : DS_Real_si<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_si     : DS_Real_si<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_si     : DS_Real_si<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_si     : DS_Real_si<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_si     : DS_Real_si<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_si     : DS_Real_si<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_si      : DS_Real_si<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_si     : DS_Real_si<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_si   : DS_Real_si<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_si  : DS_Real_si<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_si : DS_Real_si<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_si : DS_Real_si<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_si   : DS_Real_si<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_si   : DS_Real_si<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_si     : DS_Real_si<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_si     : DS_Real_si<0x73, DS_MAX_RTN_F64>;
+
+def DS_READ_B64_si        : DS_Real_si<0x76, DS_READ_B64>;
+def DS_READ2_B64_si       : DS_Real_si<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_si   : DS_Real_si<0x78, DS_READ2ST64_B64>;
+
+def DS_ADD_SRC2_U32_si    : DS_Real_si<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_si    : DS_Real_si<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_si   : DS_Real_si<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_si    : DS_Real_si<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_si    : DS_Real_si<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_si    : DS_Real_si<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_si    : DS_Real_si<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_si    : DS_Real_si<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_si    : DS_Real_si<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_si    : DS_Real_si<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_si     : DS_Real_si<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_si    : DS_Real_si<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_si  : DS_Real_si<0x8d, DS_WRITE_SRC2_B32>;
+
+def DS_MIN_SRC2_F32_si    : DS_Real_si<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_si    : DS_Real_si<0x93, DS_MAX_SRC2_F32>;
+
+def DS_ADD_SRC2_U64_si    : DS_Real_si<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_si    : DS_Real_si<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_si   : DS_Real_si<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_si    : DS_Real_si<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_si    : DS_Real_si<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_si    : DS_Real_si<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_si    : DS_Real_si<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_si    : DS_Real_si<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_si    : DS_Real_si<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_si    : DS_Real_si<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_si     : DS_Real_si<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_si    : DS_Real_si<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_si  : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
+
+def DS_MIN_SRC2_F64_si    : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_si    : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+
+//===----------------------------------------------------------------------===//
+// VIInstructions.td
+//===----------------------------------------------------------------------===//
+
+class DS_Real_vi <bits<8> op, DS_Pseudo ds> :
+  DS_Real <ds>,
+  SIMCInstr <ds.Mnemonic, SIEncodingFamily.VI> {
+  let AssemblerPredicates = [isVI];
+  let DecoderNamespace="VI";
+
+  // encoding
+  let Inst{7-0}   = !if(ds.has_offset0, offset0, 0);
+  let Inst{15-8}  = !if(ds.has_offset1, offset1, 0);
+  let Inst{16}    = !if(ds.has_gds, gds, ds.gdsValue);
+  let Inst{24-17} = op;
+  let Inst{31-26} = 0x36; // ds prefix
+  let Inst{39-32} = !if(ds.has_addr, addr, 0);
+  let Inst{47-40} = !if(ds.has_data0, data0, 0);
+  let Inst{55-48} = !if(ds.has_data1, data1, 0);
+  let Inst{63-56} = !if(ds.has_vdst, vdst, 0);
+}
+
+def DS_ADD_U32_vi         : DS_Real_vi<0x0,  DS_ADD_U32>;
+def DS_SUB_U32_vi         : DS_Real_vi<0x1,  DS_SUB_U32>;
+def DS_RSUB_U32_vi        : DS_Real_vi<0x2,  DS_RSUB_U32>;
+def DS_INC_U32_vi         : DS_Real_vi<0x3,  DS_INC_U32>;
+def DS_DEC_U32_vi         : DS_Real_vi<0x4,  DS_DEC_U32>;
+def DS_MIN_I32_vi         : DS_Real_vi<0x5,  DS_MIN_I32>;
+def DS_MAX_I32_vi         : DS_Real_vi<0x6,  DS_MAX_I32>;
+def DS_MIN_U32_vi         : DS_Real_vi<0x7,  DS_MIN_U32>;
+def DS_MAX_U32_vi         : DS_Real_vi<0x8,  DS_MAX_U32>;
+def DS_AND_B32_vi         : DS_Real_vi<0x9,  DS_AND_B32>;
+def DS_OR_B32_vi          : DS_Real_vi<0xa,  DS_OR_B32>;
+def DS_XOR_B32_vi         : DS_Real_vi<0xb,  DS_XOR_B32>;
+def DS_MSKOR_B32_vi       : DS_Real_vi<0xc,  DS_MSKOR_B32>;
+def DS_WRITE_B32_vi       : DS_Real_vi<0xd,  DS_WRITE_B32>;
+def DS_WRITE2_B32_vi      : DS_Real_vi<0xe,  DS_WRITE2_B32>;
+def DS_WRITE2ST64_B32_vi  : DS_Real_vi<0xf,  DS_WRITE2ST64_B32>;
+def DS_CMPST_B32_vi       : DS_Real_vi<0x10, DS_CMPST_B32>;
+def DS_CMPST_F32_vi       : DS_Real_vi<0x11, DS_CMPST_F32>;
+def DS_MIN_F32_vi         : DS_Real_vi<0x12, DS_MIN_F32>;
+def DS_MAX_F32_vi         : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_ADD_F32_vi         : DS_Real_vi<0x15, DS_ADD_F32>;
+def DS_GWS_INIT_vi        : DS_Real_vi<0x19, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi      : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi     : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi      : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi     : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_WRITE_B8_vi        : DS_Real_vi<0x1e, DS_WRITE_B8>;
+def DS_WRITE_B16_vi       : DS_Real_vi<0x1f, DS_WRITE_B16>;
+def DS_ADD_RTN_U32_vi     : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
+def DS_SUB_RTN_U32_vi     : DS_Real_vi<0x21, DS_SUB_RTN_U32>;
+def DS_RSUB_RTN_U32_vi    : DS_Real_vi<0x22, DS_RSUB_RTN_U32>;
+def DS_INC_RTN_U32_vi     : DS_Real_vi<0x23, DS_INC_RTN_U32>;
+def DS_DEC_RTN_U32_vi     : DS_Real_vi<0x24, DS_DEC_RTN_U32>;
+def DS_MIN_RTN_I32_vi     : DS_Real_vi<0x25, DS_MIN_RTN_I32>;
+def DS_MAX_RTN_I32_vi     : DS_Real_vi<0x26, DS_MAX_RTN_I32>;
+def DS_MIN_RTN_U32_vi     : DS_Real_vi<0x27, DS_MIN_RTN_U32>;
+def DS_MAX_RTN_U32_vi     : DS_Real_vi<0x28, DS_MAX_RTN_U32>;
+def DS_AND_RTN_B32_vi     : DS_Real_vi<0x29, DS_AND_RTN_B32>;
+def DS_OR_RTN_B32_vi      : DS_Real_vi<0x2a, DS_OR_RTN_B32>;
+def DS_XOR_RTN_B32_vi     : DS_Real_vi<0x2b, DS_XOR_RTN_B32>;
+def DS_MSKOR_RTN_B32_vi   : DS_Real_vi<0x2c, DS_MSKOR_RTN_B32>;
+def DS_WRXCHG_RTN_B32_vi  : DS_Real_vi<0x2d, DS_WRXCHG_RTN_B32>;
+def DS_WRXCHG2_RTN_B32_vi : DS_Real_vi<0x2e, DS_WRXCHG2_RTN_B32>;
+def DS_WRXCHG2ST64_RTN_B32_vi : DS_Real_vi<0x2f, DS_WRXCHG2ST64_RTN_B32>;
+def DS_CMPST_RTN_B32_vi   : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
+def DS_CMPST_RTN_F32_vi   : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
+def DS_MIN_RTN_F32_vi     : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
+def DS_MAX_RTN_F32_vi     : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
+def DS_WRAP_RTN_F32_vi    : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_ADD_RTN_F32_vi     : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
+def DS_READ_B32_vi        : DS_Real_vi<0x36, DS_READ_B32>;
+def DS_READ2_B32_vi       : DS_Real_vi<0x37, DS_READ2_B32>;
+def DS_READ2ST64_B32_vi   : DS_Real_vi<0x38, DS_READ2ST64_B32>;
+def DS_READ_I8_vi         : DS_Real_vi<0x39, DS_READ_I8>;
+def DS_READ_U8_vi         : DS_Real_vi<0x3a, DS_READ_U8>;
+def DS_READ_I16_vi        : DS_Real_vi<0x3b, DS_READ_I16>;
+def DS_READ_U16_vi        : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_SWIZZLE_B32_vi     : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
+def DS_PERMUTE_B32_vi     : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
+def DS_BPERMUTE_B32_vi    : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
+
+def DS_ADD_U64_vi         : DS_Real_vi<0x40, DS_ADD_U64>;
+def DS_SUB_U64_vi         : DS_Real_vi<0x41, DS_SUB_U64>;
+def DS_RSUB_U64_vi        : DS_Real_vi<0x42, DS_RSUB_U64>;
+def DS_INC_U64_vi         : DS_Real_vi<0x43, DS_INC_U64>;
+def DS_DEC_U64_vi         : DS_Real_vi<0x44, DS_DEC_U64>;
+def DS_MIN_I64_vi         : DS_Real_vi<0x45, DS_MIN_I64>;
+def DS_MAX_I64_vi         : DS_Real_vi<0x46, DS_MAX_I64>;
+def DS_MIN_U64_vi         : DS_Real_vi<0x47, DS_MIN_U64>;
+def DS_MAX_U64_vi         : DS_Real_vi<0x48, DS_MAX_U64>;
+def DS_AND_B64_vi         : DS_Real_vi<0x49, DS_AND_B64>;
+def DS_OR_B64_vi          : DS_Real_vi<0x4a, DS_OR_B64>;
+def DS_XOR_B64_vi         : DS_Real_vi<0x4b, DS_XOR_B64>;
+def DS_MSKOR_B64_vi       : DS_Real_vi<0x4c, DS_MSKOR_B64>;
+def DS_WRITE_B64_vi       : DS_Real_vi<0x4d, DS_WRITE_B64>;
+def DS_WRITE2_B64_vi      : DS_Real_vi<0x4E, DS_WRITE2_B64>;
+def DS_WRITE2ST64_B64_vi  : DS_Real_vi<0x4f, DS_WRITE2ST64_B64>;
+def DS_CMPST_B64_vi       : DS_Real_vi<0x50, DS_CMPST_B64>;
+def DS_CMPST_F64_vi       : DS_Real_vi<0x51, DS_CMPST_F64>;
+def DS_MIN_F64_vi         : DS_Real_vi<0x52, DS_MIN_F64>;
+def DS_MAX_F64_vi         : DS_Real_vi<0x53, DS_MAX_F64>;
+
+def DS_ADD_RTN_U64_vi     : DS_Real_vi<0x60, DS_ADD_RTN_U64>;
+def DS_SUB_RTN_U64_vi     : DS_Real_vi<0x61, DS_SUB_RTN_U64>;
+def DS_RSUB_RTN_U64_vi    : DS_Real_vi<0x62, DS_RSUB_RTN_U64>;
+def DS_INC_RTN_U64_vi     : DS_Real_vi<0x63, DS_INC_RTN_U64>;
+def DS_DEC_RTN_U64_vi     : DS_Real_vi<0x64, DS_DEC_RTN_U64>;
+def DS_MIN_RTN_I64_vi     : DS_Real_vi<0x65, DS_MIN_RTN_I64>;
+def DS_MAX_RTN_I64_vi     : DS_Real_vi<0x66, DS_MAX_RTN_I64>;
+def DS_MIN_RTN_U64_vi     : DS_Real_vi<0x67, DS_MIN_RTN_U64>;
+def DS_MAX_RTN_U64_vi     : DS_Real_vi<0x68, DS_MAX_RTN_U64>;
+def DS_AND_RTN_B64_vi     : DS_Real_vi<0x69, DS_AND_RTN_B64>;
+def DS_OR_RTN_B64_vi      : DS_Real_vi<0x6a, DS_OR_RTN_B64>;
+def DS_XOR_RTN_B64_vi     : DS_Real_vi<0x6b, DS_XOR_RTN_B64>;
+def DS_MSKOR_RTN_B64_vi   : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
+def DS_WRXCHG_RTN_B64_vi  : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
+def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
+def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CMPST_RTN_B64_vi   : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
+def DS_CMPST_RTN_F64_vi   : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
+def DS_MIN_RTN_F64_vi     : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
+def DS_MAX_RTN_F64_vi     : DS_Real_vi<0x73, DS_MAX_RTN_F64>;
+
+def DS_READ_B64_vi        : DS_Real_vi<0x76, DS_READ_B64>;
+def DS_READ2_B64_vi       : DS_Real_vi<0x77, DS_READ2_B64>;
+def DS_READ2ST64_B64_vi   : DS_Real_vi<0x78, DS_READ2ST64_B64>;
+
+def DS_ADD_SRC2_U32_vi    : DS_Real_vi<0x80, DS_ADD_SRC2_U32>;
+def DS_SUB_SRC2_U32_vi    : DS_Real_vi<0x81, DS_SUB_SRC2_U32>;
+def DS_RSUB_SRC2_U32_vi   : DS_Real_vi<0x82, DS_RSUB_SRC2_U32>;
+def DS_INC_SRC2_U32_vi    : DS_Real_vi<0x83, DS_INC_SRC2_U32>;
+def DS_DEC_SRC2_U32_vi    : DS_Real_vi<0x84, DS_DEC_SRC2_U32>;
+def DS_MIN_SRC2_I32_vi    : DS_Real_vi<0x85, DS_MIN_SRC2_I32>;
+def DS_MAX_SRC2_I32_vi    : DS_Real_vi<0x86, DS_MAX_SRC2_I32>;
+def DS_MIN_SRC2_U32_vi    : DS_Real_vi<0x87, DS_MIN_SRC2_U32>;
+def DS_MAX_SRC2_U32_vi    : DS_Real_vi<0x88, DS_MAX_SRC2_U32>;
+def DS_AND_SRC2_B32_vi    : DS_Real_vi<0x89, DS_AND_SRC2_B32>;
+def DS_OR_SRC2_B32_vi     : DS_Real_vi<0x8a, DS_OR_SRC2_B32>;
+def DS_XOR_SRC2_B32_vi    : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>;
+def DS_WRITE_SRC2_B32_vi  : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>;
+def DS_MIN_SRC2_F32_vi    : DS_Real_vi<0x92, DS_MIN_SRC2_F32>;
+def DS_MAX_SRC2_F32_vi    : DS_Real_vi<0x93, DS_MAX_SRC2_F32>;
+def DS_ADD_SRC2_U64_vi    : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>;
+def DS_SUB_SRC2_U64_vi    : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>;
+def DS_RSUB_SRC2_U64_vi   : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>;
+def DS_INC_SRC2_U64_vi    : DS_Real_vi<0xc3, DS_INC_SRC2_U64>;
+def DS_DEC_SRC2_U64_vi    : DS_Real_vi<0xc4, DS_DEC_SRC2_U64>;
+def DS_MIN_SRC2_I64_vi    : DS_Real_vi<0xc5, DS_MIN_SRC2_I64>;
+def DS_MAX_SRC2_I64_vi    : DS_Real_vi<0xc6, DS_MAX_SRC2_I64>;
+def DS_MIN_SRC2_U64_vi    : DS_Real_vi<0xc7, DS_MIN_SRC2_U64>;
+def DS_MAX_SRC2_U64_vi    : DS_Real_vi<0xc8, DS_MAX_SRC2_U64>;
+def DS_AND_SRC2_B64_vi    : DS_Real_vi<0xc9, DS_AND_SRC2_B64>;
+def DS_OR_SRC2_B64_vi     : DS_Real_vi<0xca, DS_OR_SRC2_B64>;
+def DS_XOR_SRC2_B64_vi    : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
+def DS_WRITE_SRC2_B64_vi  : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
+def DS_MIN_SRC2_F64_vi    : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
+def DS_MAX_SRC2_F64_vi    : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
new file mode 100644
index 000000000000..2247cad7bb51
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -0,0 +1,609 @@
+//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains definition for AMDGPU ISA disassembler
+//
+//===----------------------------------------------------------------------===//
+
+// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)?
+
+#include "AMDGPUDisassembler.h"
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-disassembler"
+
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+
+inline static MCDisassembler::DecodeStatus
+addOperand(MCInst &Inst, const MCOperand& Opnd) {
+  Inst.addOperand(Opnd);
+  return Opnd.isValid() ?
+    MCDisassembler::Success :
+    MCDisassembler::SoftFail;
+}
+
+static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+
+  APInt SignedOffset(18, Imm * 4, true);
+  int64_t Offset = (SignedOffset.sext(64) + 4 + Addr).getSExtValue();
+
+  if (DAsm->tryAddingSymbolicOperand(Inst, Offset, Addr, true, 2, 2))
+    return MCDisassembler::Success;
+  return addOperand(Inst, MCOperand::createImm(Imm));
+}
+
+#define DECODE_OPERAND2(RegClass, DecName) \
+static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
+                                                    unsigned Imm, \
+                                                    uint64_t /*Addr*/, \
+                                                    const void *Decoder) { \
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
+  return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+}
+
+#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+
+DECODE_OPERAND(VGPR_32)
+DECODE_OPERAND(VS_32)
+DECODE_OPERAND(VS_64)
+
+DECODE_OPERAND(VReg_64)
+DECODE_OPERAND(VReg_96)
+DECODE_OPERAND(VReg_128)
+
+DECODE_OPERAND(SReg_32)
+DECODE_OPERAND(SReg_32_XM0_XEXEC)
+DECODE_OPERAND(SReg_64)
+DECODE_OPERAND(SReg_64_XEXEC)
+DECODE_OPERAND(SReg_128)
+DECODE_OPERAND(SReg_256)
+DECODE_OPERAND(SReg_512)
+
+
+static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
+                                         unsigned Imm,
+                                         uint64_t Addr,
+                                         const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
+}
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#include "AMDGPUGenDisassemblerTables.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
+  assert(Bytes.size() >= sizeof(T));
+  const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data());
+  Bytes = Bytes.slice(sizeof(T));
+  return Res;
+}
+
+DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
+                                               MCInst &MI,
+                                               uint64_t Inst,
+                                               uint64_t Address) const {
+  assert(MI.getOpcode() == 0);
+  assert(MI.getNumOperands() == 0);
+  MCInst TmpInst;
+  const auto SavedBytes = Bytes;
+  if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+    MI = TmpInst;
+    return MCDisassembler::Success;
+  }
+  Bytes = SavedBytes;
+  return MCDisassembler::Fail;
+}
+
+DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                ArrayRef<uint8_t> Bytes_,
+                                                uint64_t Address,
+                                                raw_ostream &WS,
+                                                raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  // ToDo: AMDGPUDisassembler supports only VI ISA.
+  assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+
+  const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
+  Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+  DecodeStatus Res = MCDisassembler::Fail;
+  do {
+    // ToDo: better to switch encoding length using some bit predicate
+    // but it is unknown yet, so try all we can
+
+    // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
+    // encodings
+    if (Bytes.size() >= 8) {
+      const uint64_t QW = eatBytes<uint64_t>(Bytes);
+      Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
+      if (Res) break;
+
+      Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
+      if (Res) break;
+    }
+
+    // Reinitialize Bytes as DPP64 could have eaten too much
+    Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+    // Try decode 32-bit instruction
+    if (Bytes.size() < 4) break;
+    const uint32_t DW = eatBytes<uint32_t>(Bytes);
+    Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address);
+    if (Res) break;
+
+    Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
+    if (Res) break;
+
+    if (Bytes.size() < 4) break;
+    const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
+    Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address);
+    if (Res) break;
+
+    Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+  } while (false);
+
+  Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
+  return Res;
+}
+
+const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
+  return getContext().getRegisterInfo()->
+    getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
+}
+
+inline
+MCOperand AMDGPUDisassembler::errOperand(unsigned V,
+                                         const Twine& ErrMsg) const {
+  *CommentStream << "Error: " + ErrMsg;
+
+  // ToDo: add support for error operands to MCInst.h
+  // return MCOperand::createError(V);
+  return MCOperand();
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const {
+  return MCOperand::createReg(RegId);
+}
+
+inline
+MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID,
+                                               unsigned Val) const {
+  const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID];
+  if (Val >= RegCl.getNumRegs())
+    return errOperand(Val, Twine(getRegClassName(RegClassID)) +
+                           ": unknown register " + Twine(Val));
+  return createRegOperand(RegCl.getRegister(Val));
+}
+
+inline
+MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
+                                                unsigned Val) const {
+  // ToDo: SI/CI have 104 SGPRs, VI - 102
+  // Valery: here we accepting as much as we can, let assembler sort it out
+  int shift = 0;
+  switch (SRegClassID) {
+  case AMDGPU::SGPR_32RegClassID:
+  case AMDGPU::TTMP_32RegClassID:
+    break;
+  case AMDGPU::SGPR_64RegClassID:
+  case AMDGPU::TTMP_64RegClassID:
+    shift = 1;
+    break;
+  case AMDGPU::SGPR_128RegClassID:
+  case AMDGPU::TTMP_128RegClassID:
+  // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
+  // this bundle?
+  case AMDGPU::SReg_256RegClassID:
+  // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in
+  // this bundle?
+  case AMDGPU::SReg_512RegClassID:
+    shift = 2;
+    break;
+  // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in
+  // this bundle?
+  default:
+    llvm_unreachable("unhandled register class");
+  }
+
+  if (Val % (1 << shift)) {
+    *CommentStream << "Warning: " << getRegClassName(SRegClassID)
+                   << ": scalar reg isn't aligned " << Val;
+  }
+
+  return createRegOperand(SRegClassID, Val >> shift);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
+  return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
+  return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
+  return decodeSrcOp(OPW16, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
+  // Some instructions have operand restrictions beyond what the encoding
+  // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
+  // high bit.
+  Val &= 255;
+
+  return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
+  return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
+  // table-gen generated disassembler doesn't care about operand types
+  // leaving only registry class so SSrc_32 operand turns into SReg_32
+  // and therefore we accept immediates and literals here as well
+  return decodeSrcOp(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC(
+  unsigned Val) const {
+  // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI
+  return decodeOperand_SReg_32(Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
+  return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const {
+  return decodeSrcOp(OPW64, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
+  return decodeSrcOp(OPW128, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
+  return createSRegOperand(AMDGPU::SReg_256RegClassID, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
+  return createSRegOperand(AMDGPU::SReg_512RegClassID, Val);
+}
+
+
+MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+  // For now all literal constants are supposed to be unsigned integer
+  // ToDo: deal with signed/unsigned 64-bit integer constants
+  // ToDo: deal with float/double constants
+  if (Bytes.size() < 4)
+    return errOperand(0, "cannot read literal, inst bytes left " +
+                         Twine(Bytes.size()));
+  return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+}
+
+MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
+  using namespace AMDGPU::EncValues;
+  assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX);
+  return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ?
+    (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) :
+    (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm)));
+      // Cast prevents negative overflow.
+}
+
+static int64_t getInlineImmVal32(unsigned Imm) {
+  switch (Imm) {
+  case 240:
+    return FloatToBits(0.5f);
+  case 241:
+    return FloatToBits(-0.5f);
+  case 242:
+    return FloatToBits(1.0f);
+  case 243:
+    return FloatToBits(-1.0f);
+  case 244:
+    return FloatToBits(2.0f);
+  case 245:
+    return FloatToBits(-2.0f);
+  case 246:
+    return FloatToBits(4.0f);
+  case 247:
+    return FloatToBits(-4.0f);
+  case 248: // 1 / (2 * PI)
+    return 0x3e22f983;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+static int64_t getInlineImmVal64(unsigned Imm) {
+  switch (Imm) {
+  case 240:
+    return DoubleToBits(0.5);
+  case 241:
+    return DoubleToBits(-0.5);
+  case 242:
+    return DoubleToBits(1.0);
+  case 243:
+    return DoubleToBits(-1.0);
+  case 244:
+    return DoubleToBits(2.0);
+  case 245:
+    return DoubleToBits(-2.0);
+  case 246:
+    return DoubleToBits(4.0);
+  case 247:
+    return DoubleToBits(-4.0);
+  case 248: // 1 / (2 * PI)
+    return 0x3fc45f306dc9c882;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+static int64_t getInlineImmVal16(unsigned Imm) {
+  switch (Imm) {
+  case 240:
+    return 0x3800;
+  case 241:
+    return 0xB800;
+  case 242:
+    return 0x3C00;
+  case 243:
+    return 0xBC00;
+  case 244:
+    return 0x4000;
+  case 245:
+    return 0xC000;
+  case 246:
+    return 0x4400;
+  case 247:
+    return 0xC400;
+  case 248: // 1 / (2 * PI)
+    return 0x3118;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
+  assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
+      && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+
+  // ToDo: case 248: 1/(2*PI) - is allowed only on VI
+  switch (Width) {
+  case OPW32:
+    return MCOperand::createImm(getInlineImmVal32(Imm));
+  case OPW64:
+    return MCOperand::createImm(getInlineImmVal64(Imm));
+  case OPW16:
+    return MCOperand::createImm(getInlineImmVal16(Imm));
+  default:
+    llvm_unreachable("implement me");
+  }
+}
+
+unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+  switch (Width) {
+  default: // fall
+  case OPW32:
+  case OPW16:
+    return VGPR_32RegClassID;
+  case OPW64: return VReg_64RegClassID;
+  case OPW128: return VReg_128RegClassID;
+  }
+}
+
+unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+  switch (Width) {
+  default: // fall
+  case OPW32:
+  case OPW16:
+    return SGPR_32RegClassID;
+  case OPW64: return SGPR_64RegClassID;
+  case OPW128: return SGPR_128RegClassID;
+  }
+}
+
+unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
+  using namespace AMDGPU;
+  assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
+  switch (Width) {
+  default: // fall
+  case OPW32:
+  case OPW16:
+    return TTMP_32RegClassID;
+  case OPW64: return TTMP_64RegClassID;
+  case OPW128: return TTMP_128RegClassID;
+  }
+}
+
+MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const {
+  using namespace AMDGPU::EncValues;
+  assert(Val < 512); // enum9
+
+  if (VGPR_MIN <= Val && Val <= VGPR_MAX) {
+    return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN);
+  }
+  if (Val <= SGPR_MAX) {
+    assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning.
+    return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+  }
+  if (TTMP_MIN <= Val && Val <= TTMP_MAX) {
+    return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
+  }
+
+  assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
+
+  if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
+    return decodeIntImmed(Val);
+
+  if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
+    return decodeFPImmed(Width, Val);
+
+  if (Val == LITERAL_CONST)
+    return decodeLiteralConstant();
+
+  switch (Width) {
+  case OPW32:
+  case OPW16:
+    return decodeSpecialReg32(Val);
+  case OPW64:
+    return decodeSpecialReg64(Val);
+  default:
+    llvm_unreachable("unexpected immediate type");
+  }
+}
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
+  using namespace AMDGPU;
+  switch (Val) {
+  case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI));
+  case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI));
+    // ToDo: no support for xnack_mask_lo/_hi register
+  case 104:
+  case 105: break;
+  case 106: return createRegOperand(VCC_LO);
+  case 107: return createRegOperand(VCC_HI);
+  case 108: return createRegOperand(TBA_LO);
+  case 109: return createRegOperand(TBA_HI);
+  case 110: return createRegOperand(TMA_LO);
+  case 111: return createRegOperand(TMA_HI);
+  case 124: return createRegOperand(M0);
+  case 126: return createRegOperand(EXEC_LO);
+  case 127: return createRegOperand(EXEC_HI);
+    // ToDo: no support for vccz register
+  case 251: break;
+    // ToDo: no support for execz register
+  case 252: break;
+  case 253: return createRegOperand(SCC);
+  default: break;
+  }
+  return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
+  using namespace AMDGPU;
+  switch (Val) {
+  case 102: return createRegOperand(getMCReg(FLAT_SCR, STI));
+  case 106: return createRegOperand(VCC);
+  case 108: return createRegOperand(TBA);
+  case 110: return createRegOperand(TMA);
+  case 126: return createRegOperand(EXEC);
+  default: break;
+  }
+  return errOperand(Val, "unknown operand encoding " + Twine(Val));
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+// Try to find symbol name for specified label
+bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
+                                raw_ostream &/*cStream*/, int64_t Value,
+                                uint64_t /*Address*/, bool IsBranch,
+                                uint64_t /*Offset*/, uint64_t /*InstSize*/) {
+  typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy;
+  typedef std::vector<SymbolInfoTy> SectionSymbolsTy;
+
+  if (!IsBranch) {
+    return false;
+  }
+
+  auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo);
+  auto Result = std::find_if(Symbols->begin(), Symbols->end(),
+                             [Value](const SymbolInfoTy& Val) {
+                                return std::get<0>(Val) == static_cast<uint64_t>(Value)
+                                    && std::get<2>(Val) == ELF::STT_NOTYPE;
+                             });
+  if (Result != Symbols->end()) {
+    auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result));
+    const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
+    Inst.addOperand(MCOperand::createExpr(Add));
+    return true;
+  }
+  return false;
+}
+
+void AMDGPUSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                                       int64_t Value,
+                                                       uint64_t Address) {
+  llvm_unreachable("unimplemented");
+}
+
+//===----------------------------------------------------------------------===//
+// Initialization
+//===----------------------------------------------------------------------===//
+
+static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/,
+                              LLVMOpInfoCallback /*GetOpInfo*/,
+                              LLVMSymbolLookupCallback /*SymbolLookUp*/,
+                              void *DisInfo,
+                              MCContext *Ctx,
+                              std::unique_ptr<MCRelocationInfo> &&RelInfo) {
+  return new AMDGPUSymbolizer(*Ctx, std::move(RelInfo), DisInfo);
+}
+
+static MCDisassembler *createAMDGPUDisassembler(const Target &T,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new AMDGPUDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeAMDGPUDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
+                                         createAMDGPUDisassembler);
+  TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
+                                       createAMDGPUSymbolizer);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
new file mode 100644
index 000000000000..ee5883a984e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -0,0 +1,130 @@
+//===-- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file contains declaration for AMDGPU ISA disassembler
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
+#include <cstdint>
+#include <algorithm>
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+class Twine;
+
+//===----------------------------------------------------------------------===//
+// AMDGPUDisassembler
+//===----------------------------------------------------------------------===//
+
+class AMDGPUDisassembler : public MCDisassembler {
+private:
+  mutable ArrayRef<uint8_t> Bytes;
+
+public:
+  AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {}
+
+  ~AMDGPUDisassembler() override = default;
+
+  DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &WS, raw_ostream &CS) const override;
+
+  const char* getRegClassName(unsigned RegClassID) const;
+
+  MCOperand createRegOperand(unsigned int RegId) const;
+  MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
+  MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+
+  MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
+
+  DecodeStatus tryDecodeInst(const uint8_t* Table,
+                              MCInst &MI,
+                              uint64_t Inst,
+                              uint64_t Address) const;
+
+  MCOperand decodeOperand_VGPR_32(unsigned Val) const;
+  MCOperand decodeOperand_VS_32(unsigned Val) const;
+  MCOperand decodeOperand_VS_64(unsigned Val) const;
+  MCOperand decodeOperand_VSrc16(unsigned Val) const;
+
+  MCOperand decodeOperand_VReg_64(unsigned Val) const;
+  MCOperand decodeOperand_VReg_96(unsigned Val) const;
+  MCOperand decodeOperand_VReg_128(unsigned Val) const;
+
+  MCOperand decodeOperand_SReg_32(unsigned Val) const;
+  MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
+  MCOperand decodeOperand_SReg_64(unsigned Val) const;
+  MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
+  MCOperand decodeOperand_SReg_128(unsigned Val) const;
+  MCOperand decodeOperand_SReg_256(unsigned Val) const;
+  MCOperand decodeOperand_SReg_512(unsigned Val) const;
+
+  enum OpWidthTy {
+    OPW32,
+    OPW64,
+    OPW128,
+    OPW16,
+    OPW_LAST_,
+    OPW_FIRST_ = OPW32
+  };
+
+  unsigned getVgprClassId(const OpWidthTy Width) const;
+  unsigned getSgprClassId(const OpWidthTy Width) const;
+  unsigned getTtmpClassId(const OpWidthTy Width) const;
+
+  static MCOperand decodeIntImmed(unsigned Imm);
+  static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
+  MCOperand decodeLiteralConstant() const;
+
+  MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
+  MCOperand decodeSpecialReg32(unsigned Val) const;
+  MCOperand decodeSpecialReg64(unsigned Val) const;
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPUSymbolizer
+//===----------------------------------------------------------------------===//
+
+class AMDGPUSymbolizer : public MCSymbolizer {
+private:
+  void *DisInfo;
+
+public:
+  AMDGPUSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> &&RelInfo,
+                   void *disInfo)
+                   : MCSymbolizer(Ctx, std::move(RelInfo)), DisInfo(disInfo) {}
+
+  bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &cStream,
+                                int64_t Value, uint64_t Address,
+                                bool IsBranch, uint64_t Offset,
+                                uint64_t InstSize) override;
+
+  void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                       int64_t Value,
+                                       uint64_t Address) override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
new file mode 100644
index 000000000000..4112ad100584
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -0,0 +1,663 @@
+//===-- EvergreenInstructions.td - EG Instruction defs  ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to Evergreen and newer VLIW4/VLIW5 GPUs
+// - Available only on Evergreen family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isEG : Predicate<
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+  "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+  "!Subtarget->hasCaymanISA()"
+>;
+
+def isEGorCayman : Predicate<
+  "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||"
+  "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS"
+>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+                           string name, list<dag> pattern>
+    : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+                 "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+                  list<dag> pattern>
+    : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+                 "MEM_RAT "#name, pattern>;
+
+class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
+    : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
+                           i32imm:$rat_id, InstFlag:$eop),
+                  "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
+                               #!if(has_eop, ", $eop", ""),
+                  [(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
+                                             R600_Reg128:$index_gpr,
+                                             (i32 imm:$rat_id))]>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+  "MSKOR $rw_gpr.XW, $index_gpr",
+  [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+  let eop = 0;
+}
+
+} // End let Predicates = [isEGorCayman]
+
+//===----------------------------------------------------------------------===//
+// Evergreen Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEG] in {
+
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def MULHI_UINT24_eg : MULHI_UINT24_Common<0xb2>;
+
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def SIN_eg : SIN_Common<0x8D>;
+def COS_eg : COS_Common<0x8E>;
+
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
+  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr, $index_gpr, $eop",
+  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
+  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+//128-bit store
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
+  [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
+
+} // End usesCustomInserter = 1
+
+class VTX_READ_eg <string name, dag outs>
+    : VTX_WORD0_eg, VTX_READ<name, outs, []> {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+
+  let Inst{31-0} = Word0;
+}
+
+def VTX_READ_8_eg
+    : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+
+  let MEGA_FETCH_COUNT = 1;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+def VTX_READ_16_eg
+    : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+  let MEGA_FETCH_COUNT = 2;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+def VTX_READ_32_eg
+    : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr",
+                   (outs R600_TReg32_X:$dst_gpr)> {
+
+  let MEGA_FETCH_COUNT = 4;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $src_gpr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+def VTX_READ_64_eg
+    : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr",
+                   (outs R600_Reg64:$dst_gpr)> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+def VTX_READ_128_eg
+    : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr",
+                   (outs R600_Reg128:$dst_gpr)> {
+
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_eg MEMxi:$src_gpr, 3)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_eg MEMxi:$src_gpr, 3)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from constant memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_eg MEMxi:$src_gpr, 2)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_eg MEMxi:$src_gpr, 2)>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_8_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_16_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_32_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_64_eg MEMxi:$src_gpr, 1)>;
+def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
+          (VTX_READ_128_eg MEMxi:$src_gpr, 1)>;
+
+} // End Predicates = [isEG]
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+// Should be predicated on FeatureFP64
+// def FMA_64 : R600_3OP <
+//   0xA, "FMA_64",
+//   [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+// >;
+
+// BFE_UINT - bit_extract, an optimization for mask and shift
+// Src0 = Input
+// Src1 = Offset
+// Src2 = Width
+//
+// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
+//
+// Example Usage:
+// (Offset, Width)
+//
+// (0, 8)  = (Input << 24) >> 24 = (Input &  0xff)       >> 0
+// (8, 8)  = (Input << 16) >> 24 = (Input &  0xffff)     >> 8
+// (16, 8) = (Input <<  8) >> 24 = (Input &  0xffffff)   >> 16
+// (24, 8) = (Input <<  0) >> 24 = (Input &  0xffffffff) >> 24
+def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
+  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
+  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>;
+
+def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
+  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
+  VecALU
+>;
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
+def : Pat<(i32 (sext_inreg i32:$src, i8)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
+def : Pat<(i32 (sext_inreg i32:$src, i16)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
+
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
+
+def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
+  [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
+  VecALU
+>;
+
+def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
+  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU
+>;
+
+def : UMad24Pat<MULADD_UINT24_eg>;
+
+def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : ROTRPattern <BIT_ALIGN_INT_eg>;
+def MULADD_eg : MULADD_Common<0x14>;
+def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def FMA_eg : FMA_Common<0x7>;
+def ASHR_eg : ASHR_Common<0x15>;
+def LSHR_eg : LSHR_Common<0x16>;
+def LSHL_eg : LSHL_Common<0x17>;
+def CNDE_eg : CNDE_Common<0x19>;
+def CNDGT_eg : CNDGT_Common<0x1A>;
+def CNDGE_eg : CNDGE_Common<0x1B>;
+def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
+  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
+>;
+def DOT4_eg : DOT4_Common<0xBE>;
+defm CUBE_eg : CUBE_Common<0xC0>;
+
+def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
+
+def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
+def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
+
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+
+let hasSideEffects = 1 in {
+  def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
+}
+
+def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
+  let Pattern = [];
+  let Itinerary = AnyALU;
+}
+
+def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+
+def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
+  let Pattern = [];
+}
+
+def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
+
+def GROUP_BARRIER : InstR600 <
+    (outs), (ins), "  GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <0x54> {
+
+  let dst = 0;
+  let dst_rel = 0;
+  let src0 = 0;
+  let src0_rel = 0;
+  let src0_neg = 0;
+  let src0_abs = 0;
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let write = 0;
+  let omod = 0;
+  let clamp = 0;
+  let last = 1;
+  let bank_swizzle = 0;
+  let pred_sel = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS  <bits<6> op, dag outs, dag ins, string asm,
+                 list<dag> pattern = []> :
+
+    InstR600 <outs, ins, asm, pattern, XALU>,
+    R600_ALU_LDS_Word0,
+    R600LDS_Word1 {
+
+  bits<6>  offset = 0;
+  let lds_op = op;
+
+  let Word1{27} = offset{0};
+  let Word1{12} = offset{1};
+  let Word1{28} = offset{2};
+  let Word1{31} = offset{3};
+  let Word0{12} = offset{4};
+  let Word0{25} = offset{5};
+
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+  let HasNativeOperands = 1;
+  let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+  lds_op,
+  (outs R600_Reg32:$dst),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+  pattern
+  > {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src2 = 0;
+  let src2_rel = 0;
+
+  let usesCustomInserter = 1;
+  let LDS_1A = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
+    R600_LDS <
+  lds_op, outs,
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
+  pattern
+  > {
+
+  field string BaseOp;
+
+  let src2 = 0;
+  let src2_rel = 0;
+  let LDS_1A1D = 1;
+}
+
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op,  (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
+    R600_LDS <
+  lds_op, outs,
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+       LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+  "  "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+  pattern> {
+
+  field string BaseOp;
+
+  let LDS_1A1D = 0;
+  let LDS_1A2D = 1;
+}
+
+class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A2D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >;
+def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >;
+def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >;
+def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >;
+def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >;
+def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >;
+def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >;
+def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >;
+def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+  [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+  [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+  [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+  [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
+def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
+  [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+>;
+def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
+  [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+>;
+def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
+  [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
+  [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
+  [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
+  [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
+  [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+>;
+def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
+  [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+>;
+def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
+  [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))]
+>;
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+  [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+  [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+  [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+  [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+  [(set i32:$dst, (az_extloadi16_local i32:$src0))]
+>;
+
+// TRUNC is used for the FLT_TO_INT instructions to work around a
+// perceived problem where the rounding modes are applied differently
+// depending on the instruction and the slot they are in.
+// See:
+// https://bugs.freedesktop.org/show_bug.cgi?id=50232
+// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
+//
+// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
+// which do not need to be truncated since the fp values are 0.0f or 1.0f.
+// We should look into handling these cases separately.
+def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+
+def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
+
+// SHA-256 Patterns
+def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
+def EG_ExportSwz : ExportSwzInst {
+  let Word1{19-16} = 0; // BURST_COUNT
+  let Word1{20} = 0; // VALID_PIXEL_MODE
+  let Word1{21} = eop;
+  let Word1{29-22} = inst;
+  let Word1{30} = 0; // MARK
+  let Word1{31} = 1; // BARRIER
+}
+defm : ExportPattern<EG_ExportSwz, 83>;
+
+def EG_ExportBuf : ExportBufInst {
+  let Word1{19-16} = 0; // BURST_COUNT
+  let Word1{20} = 0; // VALID_PIXEL_MODE
+  let Word1{21} = eop;
+  let Word1{29-22} = inst;
+  let Word1{30} = 0; // MARK
+  let Word1{31} = 1; // BARRIER
+}
+defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+
+def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+  "TEX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+  "VTX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
+  "LOOP_START_DX10 @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
+  "LOOP_BREAK @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
+  "CONTINUE @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "JUMP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+                              "PUSH @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "ELSE @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
+  let ADDR = 0;
+  let COUNT = 0;
+  let POP_COUNT = 0;
+}
+def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "POP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+def CF_END_EG :  CF_CLAUSE_EG<0, (ins), "CF_END"> {
+  let COUNT = 0;
+  let POP_COUNT = 0;
+  let ADDR = 0;
+  let END_OF_PROGRAM = 1;
+}
+
+} // End Predicates = [isEGorCayman]
diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
new file mode 100644
index 000000000000..849fb8ad50f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -0,0 +1,530 @@
+//===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
+class FLAT_Pseudo<string opName, dag outs, dag ins,
+                  string asmOps, list<dag> pattern=[]> :
+  InstSI<outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+
+  let SubtargetPredicate = isCIVI;
+
+  let FLAT = 1;
+  // Internally, FLAT instruction are executed as both an LDS and a
+  // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
+  // and are not considered done until both have been decremented.
+  let VM_CNT = 1;
+  let LGKM_CNT = 1;
+
+  let Uses = [EXEC, FLAT_SCR]; // M0
+
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+  let SchedRW = [WriteVMEM];
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  bits<1> has_vdst = 1;
+  bits<1> has_data = 1;
+  bits<1> has_glc  = 1;
+  bits<1> glcValue = 0;
+}
+
+class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  Enc64 {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+
+  // encoding fields
+  bits<8> vaddr;
+  bits<8> vdata;
+  bits<8> vdst;
+  bits<1> slc;
+  bits<1> glc;
+  bits<1> tfe;
+
+  // 15-0 is reserved.
+  let Inst{16}    = !if(ps.has_glc, glc, ps.glcValue);
+  let Inst{17}    = slc;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x37; // Encoding.
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = !if(ps.has_data, vdata, ?);
+  // 54-48 is reserved.
+  let Inst{55}    = tfe;
+  let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
+}
+
+class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
+  opName,
+  (outs regClass:$vdst),
+  (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
+  " $vdst, $vaddr$glc$slc$tfe"> {
+  let has_data = 0;
+  let mayLoad = 1;
+}
+
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
+  opName,
+  (outs),
+  (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
+  " $vaddr, $vdata$glc$slc$tfe"> {
+  let mayLoad  = 0;
+  let mayStore = 1;
+  let has_vdst = 0;
+}
+
+multiclass FLAT_Atomic_Pseudo<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> {
+
+  def "" : FLAT_Pseudo <opName,
+    (outs),
+    (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+    " $vaddr, $vdata$slc$tfe",
+    []>,
+    AtomicNoRet <NAME, 0> {
+    let mayLoad = 1;
+    let mayStore = 1;
+    let has_glc  = 0;
+    let glcValue = 0;
+    let has_vdst = 0;
+    let PseudoInstr = NAME;
+  }
+
+  def _RTN : FLAT_Pseudo <opName,
+    (outs vdst_rc:$vdst),
+    (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
+    " $vdst, $vaddr, $vdata glc$slc$tfe",
+    [(set vt:$vdst,
+      (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+    AtomicNoRet <NAME, 1> {
+    let mayLoad  = 1;
+    let mayStore = 1;
+    let hasPostISelHook = 1;
+    let has_glc  = 0;
+    let glcValue = 1;
+    let PseudoInstr = NAME # "_RTN";
+  }
+}
+
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
+  (ops node:$ptr, node:$value),
+  (atomic_op node:$ptr, node:$value),
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_swap_flat     : flat_binary_atomic_op<atomic_swap>;
+def atomic_add_flat      : flat_binary_atomic_op<atomic_load_add>;
+def atomic_and_flat      : flat_binary_atomic_op<atomic_load_and>;
+def atomic_max_flat      : flat_binary_atomic_op<atomic_load_max>;
+def atomic_min_flat      : flat_binary_atomic_op<atomic_load_min>;
+def atomic_or_flat       : flat_binary_atomic_op<atomic_load_or>;
+def atomic_sub_flat      : flat_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_flat     : flat_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_flat     : flat_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_flat      : flat_binary_atomic_op<atomic_load_xor>;
+def atomic_inc_flat      : flat_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_flat      : flat_binary_atomic_op<SIatomic_dec>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+def FLAT_LOAD_UBYTE    : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>;
+def FLAT_LOAD_SBYTE    : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>;
+def FLAT_LOAD_USHORT   : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>;
+def FLAT_LOAD_SSHORT   : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>;
+def FLAT_LOAD_DWORD    : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>;
+def FLAT_LOAD_DWORDX2  : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4  : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3  : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>;
+
+def FLAT_STORE_BYTE    : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>;
+def FLAT_STORE_SHORT   : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>;
+def FLAT_STORE_DWORD   : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>;
+def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>;
+def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>;
+def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
+
+defm FLAT_ATOMIC_CMPSWAP    : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap",
+                                VGPR_32, i32, atomic_cmp_swap_flat,
+                                v2i32, VReg_64>;
+
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap_x2",
+                                VReg_64, i64, atomic_cmp_swap_flat,
+                                v2i64, VReg_128>;
+
+defm FLAT_ATOMIC_SWAP       : FLAT_Atomic_Pseudo <"flat_atomic_swap",
+                                VGPR_32, i32, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_SWAP_X2    : FLAT_Atomic_Pseudo <"flat_atomic_swap_x2",
+                                VReg_64, i64, atomic_swap_flat>;
+
+defm FLAT_ATOMIC_ADD        : FLAT_Atomic_Pseudo <"flat_atomic_add",
+                                VGPR_32, i32, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB        : FLAT_Atomic_Pseudo <"flat_atomic_sub",
+                                VGPR_32, i32, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN       : FLAT_Atomic_Pseudo <"flat_atomic_smin",
+                                VGPR_32, i32, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN       : FLAT_Atomic_Pseudo <"flat_atomic_umin",
+                                VGPR_32, i32, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX       : FLAT_Atomic_Pseudo <"flat_atomic_smax",
+                                VGPR_32, i32, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX       : FLAT_Atomic_Pseudo <"flat_atomic_umax",
+                                VGPR_32, i32, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND        : FLAT_Atomic_Pseudo <"flat_atomic_and",
+                                VGPR_32, i32, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR         : FLAT_Atomic_Pseudo <"flat_atomic_or",
+                                VGPR_32, i32, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR        : FLAT_Atomic_Pseudo <"flat_atomic_xor",
+                                VGPR_32, i32, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC        : FLAT_Atomic_Pseudo <"flat_atomic_inc",
+                                VGPR_32, i32, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC        : FLAT_Atomic_Pseudo <"flat_atomic_dec",
+                                VGPR_32, i32, atomic_dec_flat>;
+
+defm FLAT_ATOMIC_ADD_X2     : FLAT_Atomic_Pseudo <"flat_atomic_add_x2",
+                                VReg_64, i64, atomic_add_flat>;
+
+defm FLAT_ATOMIC_SUB_X2     : FLAT_Atomic_Pseudo <"flat_atomic_sub_x2",
+                                VReg_64, i64, atomic_sub_flat>;
+
+defm FLAT_ATOMIC_SMIN_X2    : FLAT_Atomic_Pseudo <"flat_atomic_smin_x2",
+                                VReg_64, i64, atomic_min_flat>;
+
+defm FLAT_ATOMIC_UMIN_X2    : FLAT_Atomic_Pseudo <"flat_atomic_umin_x2",
+                                VReg_64, i64, atomic_umin_flat>;
+
+defm FLAT_ATOMIC_SMAX_X2    : FLAT_Atomic_Pseudo <"flat_atomic_smax_x2",
+                                VReg_64, i64, atomic_max_flat>;
+
+defm FLAT_ATOMIC_UMAX_X2    : FLAT_Atomic_Pseudo <"flat_atomic_umax_x2",
+                                VReg_64, i64, atomic_umax_flat>;
+
+defm FLAT_ATOMIC_AND_X2     : FLAT_Atomic_Pseudo <"flat_atomic_and_x2",
+                                VReg_64, i64, atomic_and_flat>;
+
+defm FLAT_ATOMIC_OR_X2      : FLAT_Atomic_Pseudo <"flat_atomic_or_x2",
+                                VReg_64, i64, atomic_or_flat>;
+
+defm FLAT_ATOMIC_XOR_X2     : FLAT_Atomic_Pseudo <"flat_atomic_xor_x2",
+                                VReg_64, i64, atomic_xor_flat>;
+
+defm FLAT_ATOMIC_INC_X2     : FLAT_Atomic_Pseudo <"flat_atomic_inc_x2",
+                                VReg_64, i64, atomic_inc_flat>;
+
+defm FLAT_ATOMIC_DEC_X2     : FLAT_Atomic_Pseudo <"flat_atomic_dec_x2",
+                                VReg_64, i64, atomic_dec_flat>;
+
+let SubtargetPredicate = isCI in { // CI Only flat instructions : FIXME Only?
+
+defm FLAT_ATOMIC_FCMPSWAP    : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap",
+                                VGPR_32, f32, null_frag, v2f32, VReg_64>;
+
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fcmpswap_x2",
+                                VReg_64, f64, null_frag, v2f64, VReg_128>;
+
+defm FLAT_ATOMIC_FMIN        : FLAT_Atomic_Pseudo <"flat_atomic_fmin",
+                                VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMAX        : FLAT_Atomic_Pseudo <"flat_atomic_fmax",
+                                VGPR_32, f32>;
+
+defm FLAT_ATOMIC_FMIN_X2     : FLAT_Atomic_Pseudo <"flat_atomic_fmin_x2",
+                                VReg_64, f64>;
+
+defm FLAT_ATOMIC_FMAX_X2     : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
+                                VReg_64, f64>;
+
+} // End SubtargetPredicate = isCI
+
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
+                                               (ld node:$ptr), [{
+  auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+  return AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
+}]>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
+                                               (st node:$val, node:$ptr), [{
+  auto const AS = cast<MemSDNode>(N)->getAddressSpace();
+  return AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
+def atomic_flat_load   : flat_ld <atomic_load>;
+def flat_load          : flat_ld <load>;
+def flat_az_extloadi8  : flat_ld <az_extloadi8>;
+def flat_sextloadi8    : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16   : flat_ld <sextloadi16>;
+
+def atomic_flat_store  : flat_st <atomic_store>;
+def flat_store         : flat_st <store>;
+def flat_truncstorei8  : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
+// Patterns for global loads with no offset.
+class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr)),
+  (inst $addr, 0, 0, 0)
+>;
+
+class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr)),
+  (inst $addr, 1, 0, 0)
+>;
+
+class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+  (node vt:$data, i64:$addr),
+  (inst $addr, $data, 0, 0, 0)
+>;
+
+class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
+  // atomic store follows atomic binop convention so the address comes
+  // first.
+  (node i64:$addr, vt:$data),
+  (inst $addr, $data, 1, 0, 0)
+>;
+
+class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
+                     ValueType data_vt = vt> : Pat <
+  (vt (node i64:$addr, data_vt:$data)),
+  (inst $addr, $data, 0, 0)
+>;
+
+let Predicates = [isCIVI] in {
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>;
+def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
+
+def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>;
+def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+
+} // End Predicates = [isCIVI]
+
+let Predicates = [isVI] in {
+  def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>;
+  def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_ci <bits<7> op, FLAT_Pseudo ps> :
+  FLAT_Real <op, ps>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SI> {
+  let AssemblerPredicate = isCIOnly;
+  let DecoderNamespace="CI";
+}
+
+def FLAT_LOAD_UBYTE_ci         : FLAT_Real_ci <0x8,  FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_ci         : FLAT_Real_ci <0x9,  FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_ci        : FLAT_Real_ci <0xa,  FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_ci        : FLAT_Real_ci <0xb,  FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_ci         : FLAT_Real_ci <0xc,  FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_ci       : FLAT_Real_ci <0xd,  FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_ci       : FLAT_Real_ci <0xe,  FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_ci       : FLAT_Real_ci <0xf,  FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_ci         : FLAT_Real_ci <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_ci        : FLAT_Real_ci <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_ci        : FLAT_Real_ci <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_ci      : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_ci      : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_ci      : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
+
+multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> {
+  def _ci     : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+  def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+defm FLAT_ATOMIC_SWAP          : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP       : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD           : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB           : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN          : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN          : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX          : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX          : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND           : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR            : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR           : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC           : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC           : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>;
+defm FLAT_ATOMIC_SWAP_X2       : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2    : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2        : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2        : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2       : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2       : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2       : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2       : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2        : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2         : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2        : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2        : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2        : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>;
+
+// CI Only flat instructions
+defm FLAT_ATOMIC_FCMPSWAP      : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>;
+defm FLAT_ATOMIC_FMIN          : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>;
+defm FLAT_ATOMIC_FMAX          : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>;
+defm FLAT_ATOMIC_FCMPSWAP_X2   : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>;
+defm FLAT_ATOMIC_FMIN_X2       : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>;
+defm FLAT_ATOMIC_FMAX_X2       : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> :
+  FLAT_Real <op, ps>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+  let AssemblerPredicate = isVI;
+  let DecoderNamespace="VI";
+}
+
+def FLAT_LOAD_UBYTE_vi         : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
+def FLAT_LOAD_SBYTE_vi         : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
+def FLAT_LOAD_USHORT_vi        : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
+def FLAT_LOAD_SSHORT_vi        : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
+def FLAT_LOAD_DWORD_vi         : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
+def FLAT_LOAD_DWORDX2_vi       : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
+def FLAT_LOAD_DWORDX4_vi       : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
+def FLAT_LOAD_DWORDX3_vi       : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+def FLAT_STORE_BYTE_vi         : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
+def FLAT_STORE_SHORT_vi        : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
+def FLAT_STORE_DWORD_vi        : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
+def FLAT_STORE_DWORDX2_vi      : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
+def FLAT_STORE_DWORDX4_vi      : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
+def FLAT_STORE_DWORDX3_vi      : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> {
+  def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
+  def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
+}
+
+defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
+defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
+defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
+defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>;
+defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>;
+defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>;
+defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>;
+defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>;
+defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>;
+defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>;
+defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>;
+defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>;
+defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>;
+defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>;
+defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>;
+defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>;
+defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>;
+defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>;
+defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66, FLAT_ATOMIC_SMAX_X2>;
+defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>;
+defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>;
+defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>;
+defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
+defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
+defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
new file mode 100644
index 000000000000..dd3b46f13921
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,502 @@
+//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recoginizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  CurrCycleInstr(nullptr),
+  MF(MF),
+  ST(MF.getSubtarget<SISubtarget>()) {
+  MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+  EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+  CurrCycleInstr = MI;
+}
+
+static bool isDivFMas(unsigned Opcode) {
+  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
+}
+
+static bool isSGetReg(unsigned Opcode) {
+  return Opcode == AMDGPU::S_GETREG_B32;
+}
+
+static bool isSSetReg(unsigned Opcode) {
+  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
+}
+
+static bool isRWLane(unsigned Opcode) {
+  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
+}
+
+static bool isRFE(unsigned Opcode) {
+  return Opcode == AMDGPU::S_RFE_B64;
+}
+
+static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
+
+  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
+                                                     AMDGPU::OpName::simm16);
+  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+    return NoopHazard;
+
+  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+    return NoopHazard;
+
+  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+    return NoopHazard;
+
+  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
+    return NoopHazard;
+
+  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
+    return NoopHazard;
+
+  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
+    return NoopHazard;
+
+  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
+    return NoopHazard;
+
+  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
+    return NoopHazard;
+
+  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
+    return NoopHazard;
+
+  return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  if (SIInstrInfo::isSMRD(*MI))
+    return std::max(0, checkSMRDHazards(MI));
+
+  if (SIInstrInfo::isVALU(*MI)) {
+    int WaitStates = std::max(0, checkVALUHazards(MI));
+
+    if (SIInstrInfo::isVMEM(*MI))
+      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
+    if (SIInstrInfo::isDPP(*MI))
+      WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+    if (isDivFMas(MI->getOpcode()))
+      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+    if (isRWLane(MI->getOpcode()))
+      WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+
+    return WaitStates;
+  }
+
+  if (isSGetReg(MI->getOpcode()))
+    return std::max(0, checkGetRegHazards(MI));
+
+  if (isSSetReg(MI->getOpcode()))
+    return std::max(0, checkSetRegHazards(MI));
+
+  if (isRFE(MI->getOpcode()))
+    return std::max(0, checkRFEHazards(MI));
+
+  return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+  EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+
+  // When the scheduler detects a stall, it will call AdvanceCycle() without
+  // emitting any instructions.
+  if (!CurrCycleInstr)
+    return;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+  // Keep track of emitted instructions
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // Add a nullptr for each additional wait state after the first.  Make sure
+  // not to add more than getMaxLookAhead() items to the list, since we
+  // truncate the list to that size right after this loop.
+  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+       i < e; ++i) {
+    EmittedInstrs.push_front(nullptr);
+  }
+
+  // getMaxLookahead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in keeping track of more than that many
+  // wait states.
+  EmittedInstrs.resize(getMaxLookAhead());
+
+  CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSince(
+    function_ref<bool(MachineInstr *)> IsHazard) {
+
+  int WaitStates = -1;
+  for (MachineInstr *MI : EmittedInstrs) {
+    ++WaitStates;
+    if (!MI || !IsHazard(MI))
+      continue;
+    return WaitStates;
+  }
+  return std::numeric_limits<int>::max();
+}
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(
+    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
+}
+
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(
+    function_ref<bool(MachineInstr *)> IsHazard) {
+
+  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
+}
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops,
+                         std::set<unsigned> &Set) {
+  for (const MachineOperand &Op : Ops) {
+    if (Op.isReg())
+      Set.insert(Op.getReg());
+  }
+}
+
+int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
+  // SMEM soft clause are only present on VI+
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  // A soft-clause is any group of consecutive SMEM instructions.  The
+  // instructions in this group may return out of order and/or may be
+  // replayed (i.e. the same instruction issued more than once).
+  //
+  // In order to handle these situations correctly we need to make sure
+  // that when a clause has more than one instruction, no instruction in the
+  // clause writes to a register that is read another instruction in the clause
+  // (including itself). If we encounter this situaion, we need to break the
+  // clause by inserting a non SMEM instruction.
+
+  std::set<unsigned> ClauseDefs;
+  std::set<unsigned> ClauseUses;
+
+  for (MachineInstr *MI : EmittedInstrs) {
+
+    // When we hit a non-SMEM instruction then we have passed the start of the
+    // clause and we can stop.
+    if (!MI || !SIInstrInfo::isSMRD(*MI))
+      break;
+
+    addRegsToSet(MI->defs(), ClauseDefs);
+    addRegsToSet(MI->uses(), ClauseUses);
+  }
+
+  if (ClauseDefs.empty())
+    return 0;
+
+  // FIXME: When we support stores, we need to make sure not to put loads and
+  // stores in the same clause if they use the same address.  For now, just
+  // start a new clause whenever we see a store.
+  if (SMEM->mayStore())
+    return 1;
+
+  addRegsToSet(SMEM->defs(), ClauseDefs);
+  addRegsToSet(SMEM->uses(), ClauseUses);
+
+  std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size()));
+  std::vector<unsigned>::iterator End;
+
+  End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(),
+                              ClauseUses.begin(), ClauseUses.end(), Result.begin());
+
+  // If the set of defs and uses intersect then we cannot add this instruction
+  // to the clause, so we have a hazard.
+  if (End != Result.begin())
+    return 1;
+
+  return 0;
+}
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  int WaitStatesNeeded = 0;
+
+  WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
+
+  // This SMRD hazard only affects SI.
+  if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS)
+    return WaitStatesNeeded;
+
+  // A read of an SGPR by SMRD instruction requires 4 wait states when the
+  // SGPR was written by a VALU instruction.
+  int SmrdSgprWaitStates = 4;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : SMRD->uses()) {
+    if (!Use.isReg())
+      continue;
+    int WaitStatesNeededForUse =
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+  // SGPR was written by a VALU Instruction.
+  int VmemSgprWaitStates = 5;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : VMEM->uses()) {
+    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+
+    int WaitStatesNeededForUse =
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  // Check for DPP VGPR read after VALU VGPR write.
+  int DppVgprWaitStates = 2;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Use : DPP->uses()) {
+    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+    int WaitStatesNeededForUse =
+        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
+  // instruction.
+  const int DivFMasWaitStates = 4;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+
+  return DivFMasWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
+
+  const int GetRegWaitStates = 2;
+  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
+    return GetRegHWReg == getHWReg(TII, *MI);
+  };
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+
+  return GetRegWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned HWReg = getHWReg(TII, *SetRegInstr);
+
+  const int SetRegWaitStates =
+      ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
+    return HWReg == getHWReg(TII, *MI);
+  };
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  return SetRegWaitStates - WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+  if (!MI.mayStore())
+    return -1;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+  int VDataRCID = -1;
+  if (VDataIdx != -1)
+    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+    // There is no hazard if the instruction does not use vector regs
+    // (like wbinvl1)
+    if (VDataIdx == -1)
+      return -1;
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+        (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
+  }
+
+  // MIMG instructions create a hazard if they don't use a 256-bit T# and
+  // the store size is greater than 8 bytes and they have more than two bits
+  // of their dmask set.
+  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+  if (TII->isMIMG(MI)) {
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+    assert(SRsrcIdx != -1 &&
+           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+    (void)SRsrcIdx;
+  }
+
+  if (TII->isFLAT(MI)) {
+    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
+      return DataIdx;
+  }
+
+  return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have there store data over written by the next instruction.
+  if (!ST.has12DWordStoreHazard())
+    return 0;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+  const int VALUWaitStates = 1;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Def : VALU->defs()) {
+    if (!TRI->isVGPR(MRI, Def.getReg()))
+      continue;
+    unsigned Reg = Def.getReg();
+    auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+      int DataIdx = createsVALUHazard(*MI);
+      return DataIdx >= 0 &&
+             TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+    };
+    int WaitStatesNeededForDef =
+        VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI =
+      RWLane->getParent()->getParent()->getRegInfo();
+
+  const MachineOperand *LaneSelectOp =
+      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
+
+  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
+    return 0;
+
+  unsigned LaneSelectReg = LaneSelectOp->getReg();
+  auto IsHazardFn = [TII] (MachineInstr *MI) {
+    return TII->isVALU(*MI);
+  };
+
+  const int RWLaneWaitStates = 4;
+  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+  return RWLaneWaitStates - WaitStatesSince;
+}
+
+int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
+
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  const int RFEWaitStates = 1;
+
+  auto IsHazardFn = [TII] (MachineInstr *MI) {
+    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
+  };
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  return RFEWaitStates - WaitStatesNeeded;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
new file mode 100644
index 000000000000..0ab82ff4635b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,71 @@
+//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+class SISubtarget;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+  // This variable stores the instruction that has been emitted this cycle. It
+  // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
+  // called.
+  MachineInstr *CurrCycleInstr;
+  std::list<MachineInstr*> EmittedInstrs;
+  const MachineFunction &MF;
+  const SISubtarget &ST;
+
+  int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
+  int getWaitStatesSinceDef(unsigned Reg,
+                            function_ref<bool(MachineInstr *)> IsHazardDef =
+                                [](MachineInstr *) { return true; });
+  int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
+
+  int checkSMEMSoftClauseHazards(MachineInstr *SMEM);
+  int checkSMRDHazards(MachineInstr *SMRD);
+  int checkVMEMHazards(MachineInstr* VMEM);
+  int checkDPPHazards(MachineInstr *DPP);
+  int checkDivFMasHazards(MachineInstr *DivFMas);
+  int checkGetRegHazards(MachineInstr *GetRegInstr);
+  int checkSetRegHazards(MachineInstr *SetRegInstr);
+  int createsVALUHazard(const MachineInstr &MI);
+  int checkVALUHazards(MachineInstr *VALU);
+  int checkRWLaneHazards(MachineInstr *RWLane);
+  int checkRFEHazards(MachineInstr *RFE);
+public:
+  GCNHazardRecognizer(const MachineFunction &MF);
+  // We can only issue one instruction per cycle.
+  bool atIssueLimit() const override { return true; }
+  void EmitInstruction(SUnit *SU) override;
+  void EmitInstruction(MachineInstr *MI) override;
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitNoop() override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  unsigned PreEmitNoops(MachineInstr *) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
new file mode 100644
index 000000000000..2f88033c807f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,312 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This contains a MachineSchedStrategy implementation for maximizing wave
+/// occupancy on GCN hardware.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C) :
+    GenericScheduler(C) { }
+
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+                            const MachineFunction &MF) {
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
+                                      ST.getOccupancyWithNumVGPRs(VGPRs));
+  return std::min(MinRegOccupancy,
+                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+}
+
+void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop, const RegPressureTracker &RPTracker,
+                                     const SIRegisterInfo *SRI,
+                                     int SGPRPressure,
+                                     int VGPRPressure,
+                                     int SGPRExcessLimit,
+                                     int VGPRExcessLimit,
+                                     int SGPRCriticalLimit,
+                                     int VGPRCriticalLimit) {
+
+  Cand.SU = SU;
+  Cand.AtTop = AtTop;
+
+  // getDownwardPressure() and getUpwardPressure() make temporary changes to
+  // the the tracker, so we need to pass those function a non-const copy.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  std::vector<unsigned> Pressure;
+  std::vector<unsigned> MaxPressure;
+
+  if (AtTop)
+    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  else {
+    // FIXME: I think for bottom up scheduling, the register pressure is cached
+    // and can be retrieved by DAG->getPressureDif(SU).
+    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+  }
+
+  int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+  // If two instructions increase the pressure of different register sets
+  // by the same amount, the generic scheduler will prefer to schedule the
+  // instruction that increases the set with the least amount of registers,
+  // which in our case would be SGPRs.  This is rarely what we want, so
+  // when we report excess/critical register pressure, we do it either
+  // only for VGPRs or only for SGPRs.
+
+  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+  const int MaxVGPRPressureInc = 16;
+  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+
+  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+  // to increase the likelihood we don't go over the limits.  We should improve
+  // the analysis to look through dependencies to find the path with the least
+  // register pressure.
+  // FIXME: This is also necessary, because some passes that run after
+  // scheduling and before regalloc increase register pressure.
+  const int ErrorMargin = 3;
+  VGPRExcessLimit -= ErrorMargin;
+  SGPRExcessLimit -= ErrorMargin;
+
+  // We only need to update the RPDelata for instructions that increase
+  // register pressure.  Instructions that decrease or keep reg pressure
+  // the same will be marked as RegExcess in tryCandidate() when they
+  // are compared with instructions that increase the register pressure.
+  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+  }
+
+  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+    Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit);
+  }
+
+  // Register pressure is considered 'CRITICAL' if it is approaching a value
+  // that would reduce the wave occupancy for the execution unit.  When
+  // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
+  // has the same cost, so we don't need to prefer one over the other.
+
+  VGPRCriticalLimit -= ErrorMargin;
+  SGPRCriticalLimit -= ErrorMargin;
+
+  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+    if (SGPRDelta > VGPRDelta) {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+    } else {
+      Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+    }
+  }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeFromQueue()
+void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+                                         const CandPolicy &ZonePolicy,
+                                         const RegPressureTracker &RPTracker,
+                                         SchedCandidate &Cand) {
+  const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+  unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+  unsigned SGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+  unsigned VGPRExcessLimit =
+      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+  unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
+  unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
+  unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
+
+  ReadyQueue &Q = Zone.Available;
+  for (SUnit *SU : Q) {
+
+    SchedCandidate TryCand(ZonePolicy);
+    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
+                  SGPRPressure, VGPRPressure,
+                  SGPRExcessLimit, VGPRExcessLimit,
+                  SGPRCriticalLimit, VGPRCriticalLimit);
+    // Pass SchedBoundary only when comparing nodes from the same boundary.
+    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(Zone.DAG, SchedModel);
+      Cand.setBest(TryCand);
+    }
+  }
+}
+
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+  switch (Reason) {
+  default:
+    return Reason;
+  case GenericSchedulerBase::RegCritical:
+  case GenericSchedulerBase::RegExcess:
+    return -Reason;
+ }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  // Set the bottom-up policy based on the state of the current bottom zone and
+  // the instructions outside the zone, including the top zone.
+  CandPolicy BotPolicy;
+  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+  // Set the top-down policy based on the state of the current top zone and
+  // the instructions outside the zone, including the bottom zone.
+  CandPolicy TopPolicy;
+  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+  // See if BotCand is still valid (because we previously scheduled from Top).
+  DEBUG(dbgs() << "Picking from Bot:\n");
+  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+      BotCand.Policy != BotPolicy) {
+    BotCand.reset(CandPolicy());
+    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+  } else {
+    DEBUG(traceCandidate(BotCand));
+  }
+
+  // Check if the top Q has a better candidate.
+  DEBUG(dbgs() << "Picking from Top:\n");
+  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+      TopCand.Policy != TopPolicy) {
+    TopCand.reset(CandPolicy());
+    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+  } else {
+    DEBUG(traceCandidate(TopCand));
+  }
+
+  // Pick best from BotCand and TopCand.
+  DEBUG(
+    dbgs() << "Top Cand: ";
+    traceCandidate(BotCand);
+    dbgs() << "Bot Cand: ";
+    traceCandidate(TopCand);
+  );
+  SchedCandidate Cand;
+  if (TopCand.Reason == BotCand.Reason) {
+    Cand = BotCand;
+    GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
+    TopCand.Reason = NoCand;
+    GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+    if (TopCand.Reason != NoCand) {
+      Cand.setBest(TopCand);
+    } else {
+      TopCand.Reason = TopReason;
+    }
+  } else {
+    if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = TopCand;
+    } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+      Cand = BotCand;
+    } else {
+      int TopRank = getBidirectionalReasonRank(TopCand.Reason);
+      int BotRank = getBidirectionalReasonRank(BotCand.Reason);
+      if (TopRank > BotRank) {
+        Cand = TopCand;
+      } else {
+        Cand = BotCand;
+      }
+    }
+  }
+  DEBUG(
+    dbgs() << "Picking: ";
+    traceCandidate(Cand);
+  );
+
+  IsTopNode = Cand.AtTop;
+  return Cand.SU;
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNode()
+SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+  SUnit *SU;
+  do {
+    if (RegionPolicy.OnlyTopDown) {
+      SU = Top.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        TopCand.reset(NoPolicy);
+        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+        assert(TopCand.Reason != NoCand && "failed to find a candidate");
+        SU = TopCand.SU;
+      }
+      IsTopNode = true;
+    } else if (RegionPolicy.OnlyBottomUp) {
+      SU = Bot.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        BotCand.reset(NoPolicy);
+        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+        assert(BotCand.Reason != NoCand && "failed to find a candidate");
+        SU = BotCand.SU;
+      }
+      IsTopNode = false;
+    } else {
+      SU = pickNodeBidirectional(IsTopNode);
+    }
+  } while (SU->isScheduled);
+
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+  return SU;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
new file mode 100644
index 000000000000..4cfc0cea81fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -0,0 +1,54 @@
+//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+
+/// This is a minimal scheduler strategy.  The main difference between this
+/// and the GenericScheduler is that GCNSchedStrategy uses different
+/// heuristics to determine excess/critical pressure sets.  Its goal is to
+/// maximize kernel occupancy (i.e. maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+
+  SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+  void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+                         const RegPressureTracker &RPTracker,
+                         SchedCandidate &Cand);
+
+  void initCandidate(SchedCandidate &Cand, SUnit *SU,
+                     bool AtTop, const RegPressureTracker &RPTracker,
+                     const SIRegisterInfo *SRI,
+                     int SGPRPressure, int VGPRPressure,
+                     int SGPRExcessLimit, int VGPRExcessLimit,
+                     int SGPRCriticalLimit, int VGPRCriticalLimit);
+
+  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone, const SIRegisterInfo *SRI,
+                    unsigned SGPRPressure, unsigned VGPRPressure);
+
+public:
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+
+  SUnit *pickNode(bool &IsTopNode) override;
+};
+
+} // End namespace llvm
+
+#endif // GCNSCHEDSTRATEGY_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
new file mode 100644
index 000000000000..7172a0aa7167
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -0,0 +1,1108 @@
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstPrinter.h"
+#include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUAsmUtils.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  OS.flush();
+  printInstruction(MI, STI, OS);
+  printAnnotation(OS, Annot);
+}
+
+void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  // It's possible to end up with a 32-bit literal used with a 16-bit operand
+  // with ignored high bits. Print as 32-bit anyway in that case.
+  int64_t Imm = MI->getOperand(OpNo).getImm();
+  if (isInt<16>(Imm) || isUInt<16>(Imm))
+    O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
+  else
+    printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xf);
+}
+
+void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
+void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O, StringRef BitName) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << ' ' << BitName;
+  }
+}
+
+void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "offen");
+}
+
+void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "idxen");
+}
+
+void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "addr64");
+}
+
+void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " offset:";
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm != 0) {
+    O << " offset:";
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " offset0:";
+    printU8ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " offset1:";
+    printU8ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  printU32ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "gds");
+}
+
+void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "glc");
+}
+
+void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "slc");
+}
+
+void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "tfe");
+}
+
+void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " dmask:";
+    printU16ImmOperand(MI, OpNo, STI, O);
+  }
+}
+
+void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "unorm");
+}
+
+void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
+                                const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "da");
+}
+
+void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "r128");
+}
+
+void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printNamedBit(MI, OpNo, O, "lwe");
+}
+
+void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " compr";
+}
+
+void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " vm";
+}
+
+void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
+                                        const MCRegisterInfo &MRI) {
+  switch (RegNo) {
+  case AMDGPU::VCC:
+    O << "vcc";
+    return;
+  case AMDGPU::SCC:
+    O << "scc";
+    return;
+  case AMDGPU::EXEC:
+    O << "exec";
+    return;
+  case AMDGPU::M0:
+    O << "m0";
+    return;
+  case AMDGPU::FLAT_SCR:
+    O << "flat_scratch";
+    return;
+  case AMDGPU::VCC_LO:
+    O << "vcc_lo";
+    return;
+  case AMDGPU::VCC_HI:
+    O << "vcc_hi";
+    return;
+  case AMDGPU::TBA_LO:
+    O << "tba_lo";
+    return;
+  case AMDGPU::TBA_HI:
+    O << "tba_hi";
+    return;
+  case AMDGPU::TMA_LO:
+    O << "tma_lo";
+    return;
+  case AMDGPU::TMA_HI:
+    O << "tma_hi";
+    return;
+  case AMDGPU::EXEC_LO:
+    O << "exec_lo";
+    return;
+  case AMDGPU::EXEC_HI:
+    O << "exec_hi";
+    return;
+  case AMDGPU::FLAT_SCR_LO:
+    O << "flat_scratch_lo";
+    return;
+  case AMDGPU::FLAT_SCR_HI:
+    O << "flat_scratch_hi";
+    return;
+  default:
+    break;
+  }
+
+  // The low 8 bits of the encoding value is the register index, for both VGPRs
+  // and SGPRs.
+  unsigned RegIdx = MRI.getEncodingValue(RegNo) & ((1 << 8) - 1);
+
+  unsigned NumRegs;
+  if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 1;
+  } else  if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(RegNo)) {
+    O << 's';
+    NumRegs = 1;
+  } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo)) {
+    O <<'v';
+    NumRegs = 2;
+  } else  if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo)) {
+    O << 's';
+    NumRegs = 2;
+  } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 4;
+  } else  if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo)) {
+    O << 's';
+    NumRegs = 4;
+  } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 3;
+  } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) {
+    O << 's';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) {
+    O << 'v';
+    NumRegs = 16;
+  } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) {
+    O << 's';
+    NumRegs = 16;
+  } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(RegNo)) {
+    O << "ttmp";
+    NumRegs = 2;
+    // Trap temps start at offset 112. TODO: Get this from tablegen.
+    RegIdx -= 112;
+  } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(RegNo)) {
+    O << "ttmp";
+    NumRegs = 4;
+    // Trap temps start at offset 112. TODO: Get this from tablegen.
+    RegIdx -= 112;
+  } else {
+    O << getRegisterName(RegNo);
+    return;
+  }
+
+  if (NumRegs == 1) {
+    O << RegIdx;
+    return;
+  }
+
+  O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+}
+
+void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI, raw_ostream &O) {
+  if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3)
+    O << "_e64 ";
+  else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP)
+    O << "_dpp ";
+  else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA)
+    O << "_sdwa ";
+  else
+    O << "_e32 ";
+
+  printOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int16_t SImm = static_cast<int16_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == 0x3C00)
+    O<< "1.0";
+  else if (Imm == 0xBC00)
+    O<< "-1.0";
+  else if (Imm == 0x3800)
+    O<< "0.5";
+  else if (Imm == 0xB800)
+    O<< "-0.5";
+  else if (Imm == 0x4000)
+    O<< "2.0";
+  else if (Imm == 0xC000)
+    O<< "-2.0";
+  else if (Imm == 0x4400)
+    O<< "4.0";
+  else if (Imm == 0xC400)
+    O<< "-4.0";
+  else if (Imm == 0x3118) {
+    assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
+    O << "0.15915494";
+  } else
+    O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int32_t SImm = static_cast<int32_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == FloatToBits(0.0f))
+    O << "0.0";
+  else if (Imm == FloatToBits(1.0f))
+    O << "1.0";
+  else if (Imm == FloatToBits(-1.0f))
+    O << "-1.0";
+  else if (Imm == FloatToBits(0.5f))
+    O << "0.5";
+  else if (Imm == FloatToBits(-0.5f))
+    O << "-0.5";
+  else if (Imm == FloatToBits(2.0f))
+    O << "2.0";
+  else if (Imm == FloatToBits(-2.0f))
+    O << "-2.0";
+  else if (Imm == FloatToBits(4.0f))
+    O << "4.0";
+  else if (Imm == FloatToBits(-4.0f))
+    O << "-4.0";
+  else if (Imm == 0x3e22f983 &&
+           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    O << "0.15915494";
+  else
+    O << formatHex(static_cast<uint64_t>(Imm));
+}
+
+void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int64_t SImm = static_cast<int64_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+
+  if (Imm == DoubleToBits(0.0))
+    O << "0.0";
+  else if (Imm == DoubleToBits(1.0))
+    O << "1.0";
+  else if (Imm == DoubleToBits(-1.0))
+    O << "-1.0";
+  else if (Imm == DoubleToBits(0.5))
+    O << "0.5";
+  else if (Imm == DoubleToBits(-0.5))
+    O << "-0.5";
+  else if (Imm == DoubleToBits(2.0))
+    O << "2.0";
+  else if (Imm == DoubleToBits(-2.0))
+    O << "-2.0";
+  else if (Imm == DoubleToBits(4.0))
+    O << "4.0";
+  else if (Imm == DoubleToBits(-4.0))
+    O << "-4.0";
+  else if (Imm == 0x3fc45f306dc9c882 &&
+           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+  O << "0.15915494";
+  else {
+    assert(isUInt<32>(Imm) || Imm == 0x3fc45f306dc9c882);
+
+    // In rare situations, we will have a 32-bit literal in a 64-bit
+    // operand. This is technically allowed for the encoding of s_mov_b64.
+    O << formatHex(static_cast<uint64_t>(Imm));
+  }
+}
+
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (OpNo >= MI->getNumOperands()) {
+    O << "/*Missing OP" << OpNo << "*/";
+    return;
+  }
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    switch (Op.getReg()) {
+    // This is the default predicate state, so we don't need to print it.
+    case AMDGPU::PRED_SEL_OFF:
+      break;
+
+    default:
+      printRegOperand(Op.getReg(), O, MRI);
+      break;
+    }
+  } else if (Op.isImm()) {
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    switch (Desc.OpInfo[OpNo].OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case MCOI::OPERAND_IMMEDIATE:
+      printImmediate32(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      printImmediate64(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      printImmediate16(Op.getImm(), STI, O);
+      break;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_PCREL:
+      O << formatDec(Op.getImm());
+      break;
+    case MCOI::OPERAND_REGISTER:
+      // FIXME: This should be removed and handled somewhere else. Seems to come
+      // from a disassembler bug.
+      O << "/*invalid immediate*/";
+      break;
+    default:
+      // We hit this for the immediate instruction bits that don't yet have a
+      // custom printer.
+      llvm_unreachable("unexpected immediate operand type");
+    }
+  } else if (Op.isFPImm()) {
+    // We special case 0.0 because otherwise it will be printed as an integer.
+    if (Op.getFPImm() == 0.0)
+      O << "0.0";
+    else {
+      const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+      int RCID = Desc.OpInfo[OpNo].RegClass;
+      unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
+      if (RCBits == 32)
+        printImmediate32(FloatToBits(Op.getFPImm()), STI, O);
+      else if (RCBits == 64)
+        printImmediate64(DoubleToBits(Op.getFPImm()), STI, O);
+      else
+        llvm_unreachable("Invalid register class size");
+    }
+  } else if (Op.isExpr()) {
+    const MCExpr *Exp = Op.getExpr();
+    Exp->print(O, &MAI);
+  } else {
+    O << "/*INV_OP*/";
+  }
+}
+
+void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
+                                                   unsigned OpNo,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+  if (InputModifiers & SISrcMods::NEG)
+    O << '-';
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
+  printOperand(MI, OpNo + 1, STI, O);
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
+}
+
+void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
+                                                    unsigned OpNo,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+  if (InputModifiers & SISrcMods::SEXT)
+    O << "sext(";
+  printOperand(MI, OpNo + 1, STI, O);
+  if (InputModifiers & SISrcMods::SEXT)
+    O << ')';
+}
+
+void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (Imm <= 0x0ff) {
+    O << " quad_perm:[";
+    O << formatDec(Imm & 0x3)         << ',';
+    O << formatDec((Imm & 0xc)  >> 2) << ',';
+    O << formatDec((Imm & 0x30) >> 4) << ',';
+    O << formatDec((Imm & 0xc0) >> 6) << ']';
+  } else if ((Imm >= 0x101) && (Imm <= 0x10f)) {
+    O << " row_shl:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if ((Imm >= 0x111) && (Imm <= 0x11f)) {
+    O << " row_shr:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if ((Imm >= 0x121) && (Imm <= 0x12f)) {
+    O << " row_ror:";
+    printU4ImmDecOperand(MI, OpNo, O);
+  } else if (Imm == 0x130) {
+    O << " wave_shl:1";
+  } else if (Imm == 0x134) {
+    O << " wave_rol:1";
+  } else if (Imm == 0x138) {
+    O << " wave_shr:1";
+  } else if (Imm == 0x13c) {
+    O << " wave_ror:1";
+  } else if (Imm == 0x140) {
+    O << " row_mirror";
+  } else if (Imm == 0x141) {
+    O << " row_half_mirror";
+  } else if (Imm == 0x142) {
+    O << " row_bcast:15";
+  } else if (Imm == 0x143) {
+    O << " row_bcast:31";
+  } else {
+    llvm_unreachable("Invalid dpp_ctrl value");
+  }
+}
+
+void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << " row_mask:";
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  O << " bank_mask:";
+  printU4ImmOperand(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  if (Imm) {
+    O << " bound_ctrl:0"; // XXX - this syntax is used in sp3
+  }
+}
+
+void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  switch (Imm) {
+  case SdwaSel::BYTE_0: O << "BYTE_0"; break;
+  case SdwaSel::BYTE_1: O << "BYTE_1"; break;
+  case SdwaSel::BYTE_2: O << "BYTE_2"; break;
+  case SdwaSel::BYTE_3: O << "BYTE_3"; break;
+  case SdwaSel::WORD_0: O << "WORD_0"; break;
+  case SdwaSel::WORD_1: O << "WORD_1"; break;
+  case SdwaSel::DWORD: O << "DWORD"; break;
+  default: llvm_unreachable("Invalid SDWA data select operand");
+  }
+}
+
+void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  O << "dst_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  O << "src0_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  O << "src1_sel:";
+  printSDWASel(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  using namespace llvm::AMDGPU::SDWA;
+
+  O << "dst_unused:";
+  unsigned Imm = MI->getOperand(OpNo).getImm();
+  switch (Imm) {
+  case DstUnused::UNUSED_PAD: O << "UNUSED_PAD"; break;
+  case DstUnused::UNUSED_SEXT: O << "UNUSED_SEXT"; break;
+  case DstUnused::UNUSED_PRESERVE: O << "UNUSED_PRESERVE"; break;
+  default: llvm_unreachable("Invalid SDWA dest_unused operand");
+  }
+}
+
+template <unsigned N>
+void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+  unsigned En = MI->getOperand(EnIdx).getImm();
+
+  // FIXME: What do we do with compr? The meaning of en changes depending on if
+  // compr is set.
+
+  if (En & (1 << N))
+    printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
+  else
+    O << "off";
+}
+
+void AMDGPUInstPrinter::printExpSrc0(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<0>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc1(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<1>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc2(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<2>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpSrc3(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printExpSrcN<3>(MI, OpNo, STI, O);
+}
+
+void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  // This is really a 6 bit field.
+  uint32_t Tgt = MI->getOperand(OpNo).getImm() & ((1 << 6) - 1);
+
+  if (Tgt <= 7)
+    O << " mrt" << Tgt;
+  else if (Tgt == 8)
+    O << " mrtz";
+  else if (Tgt == 9)
+    O << " null";
+  else if (Tgt >= 12 && Tgt <= 15)
+    O << " pos" << Tgt - 12;
+  else if (Tgt >= 32 && Tgt <= 63)
+    O << " param" << Tgt - 32;
+  else {
+    // Reserved values 10, 11
+    O << " invalid_target_" << Tgt;
+  }
+}
+
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  switch (Imm) {
+  case 0:
+    O << "p10";
+    break;
+  case 1:
+    O << "p20";
+    break;
+  case 2:
+    O << "p0";
+    break;
+  default:
+    O << "invalid_param_" << Imm;
+  }
+}
+
+void AMDGPUInstPrinter::printInterpAttr(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Attr = MI->getOperand(OpNum).getImm();
+  O << "attr" << Attr;
+}
+
+void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Chan = MI->getOperand(OpNum).getImm();
+  O << '.' << "xyzw"[Chan & 0x3];
+}
+
+void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  if (Val == 0) {
+    O << " 0";
+    return;
+  }
+
+  if (Val & VGPRIndexMode::DST_ENABLE)
+    O << " dst";
+
+  if (Val & VGPRIndexMode::SRC0_ENABLE)
+    O << " src0";
+
+  if (Val & VGPRIndexMode::SRC1_ENABLE)
+    O << " src1";
+
+  if (Val & VGPRIndexMode::SRC2_ENABLE)
+    O << " src2";
+}
+
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  printOperand(MI, OpNo, STI, O);
+  O  << ", ";
+  printOperand(MI, OpNo + 1, STI, O);
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, StringRef Asm,
+                                   StringRef Default) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1) {
+    O << Asm;
+  } else {
+    O << Default;
+  }
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O, char Asm) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1)
+    O << Asm;
+}
+
+void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printIfSet(MI, OpNo, O, '|');
+}
+
+void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " clamp";
+}
+
+void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  int Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == SIOutMods::MUL2)
+    O << " mul:2";
+  else if (Imm == SIOutMods::MUL4)
+    O << " mul:4";
+  else if (Imm == SIOutMods::DIV2)
+    O << " div:2";
+}
+
+void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() || Op.isExpr());
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm();
+    O << Imm << '(' << BitsToFloat(Imm) << ')';
+  }
+  if (Op.isExpr()) {
+    Op.getExpr()->print(O << '@', &MAI);
+  }
+}
+
+void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "*", " ");
+}
+
+void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printIfSet(MI, OpNo, O, '-');
+}
+
+void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  switch (MI->getOperand(OpNo).getImm()) {
+  default: break;
+  case 1:
+    O << " * 2.0";
+    break;
+  case 2:
+    O << " * 4.0";
+    break;
+  case 3:
+    O << " / 2.0";
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+                                 const MCSubtargetInfo &STI, raw_ostream &O) {
+  printIfSet(MI, OpNo, O, '+');
+}
+
+void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.getImm() == 0) {
+    O << " (MASKED)";
+  }
+}
+
+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  const char * chans = "XYZW";
+  int sel = MI->getOperand(OpNo).getImm();
+
+  int chan = sel & 3;
+  sel >>= 2;
+
+  if (sel >= 512) {
+    sel -= 512;
+    int cb = sel >> 12;
+    sel &= 4095;
+    O << cb << '[' << sel << ']';
+  } else if (sel >= 448) {
+    sel -= 448;
+    O << sel;
+  } else if (sel >= 0){
+    O << sel;
+  }
+
+  if (sel >= 0)
+    O << '.' << chans[chan];
+}
+
+void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int BankSwizzle = MI->getOperand(OpNo).getImm();
+  switch (BankSwizzle) {
+  case 1:
+    O << "BS:VEC_021/SCL_122";
+    break;
+  case 2:
+    O << "BS:VEC_120/SCL_212";
+    break;
+  case 3:
+    O << "BS:VEC_102/SCL_221";
+    break;
+  case 4:
+    O << "BS:VEC_201";
+    break;
+  case 5:
+    O << "BS:VEC_210";
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  unsigned Sel = MI->getOperand(OpNo).getImm();
+  switch (Sel) {
+  case 0:
+    O << 'X';
+    break;
+  case 1:
+    O << 'Y';
+    break;
+  case 2:
+    O << 'Z';
+    break;
+  case 3:
+    O << 'W';
+    break;
+  case 4:
+    O << '0';
+    break;
+  case 5:
+    O << '1';
+    break;
+  case 7:
+    O << '_';
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+                                const MCSubtargetInfo &STI, raw_ostream &O) {
+  unsigned CT = MI->getOperand(OpNo).getImm();
+  switch (CT) {
+  case 0:
+    O << 'U';
+    break;
+  case 1:
+    O << 'N';
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+                                    const MCSubtargetInfo &STI, raw_ostream &O) {
+  int KCacheMode = MI->getOperand(OpNo).getImm();
+  if (KCacheMode > 0) {
+    int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+    O << "CB" << KCacheBank << ':';
+    int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+    int LineSize = (KCacheMode == 1) ? 16 : 32;
+    O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
+  }
+}
+
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  using namespace llvm::AMDGPU::SendMsg;
+
+  const unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  const unsigned Id = SImm16 & ID_MASK_;
+  do {
+    if (Id == ID_INTERRUPT) {
+      if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0.
+        break;
+      O << "sendmsg(" << IdSymbolic[Id] << ')';
+      return;
+    }
+    if (Id == ID_GS || Id == ID_GS_DONE) {
+      if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0.
+        break;
+      const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_;
+      const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_;
+      if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only.
+        break;
+      if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits.
+        break;
+      O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs];
+      if (OpGs != OP_GS_NOP) {  O << ", " << StreamId; }
+      O << ')';
+      return;
+    }
+    if (Id == ID_SYSMSG) {
+      if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0.
+        break;
+      const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_;
+      if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown.
+        break;
+      O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')';
+      return;
+    }
+  } while (false);
+  O << SImm16; // Unknown simm16 code.
+}
+
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt, Expcnt, Lgkmcnt;
+  decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+
+  bool NeedSpace = false;
+
+  if (Vmcnt != getVmcntBitMask(IV)) {
+    O << "vmcnt(" << Vmcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Expcnt != getExpcntBitMask(IV)) {
+    if (NeedSpace)
+      O << ' ';
+    O << "expcnt(" << Expcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Lgkmcnt != getLgkmcntBitMask(IV)) {
+    if (NeedSpace)
+      O << ' ';
+    O << "lgkmcnt(" << Lgkmcnt << ')';
+  }
+}
+
+void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  using namespace llvm::AMDGPU::Hwreg;
+
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_;
+  const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_;
+  const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1;
+
+  O << "hwreg(";
+  if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) {
+    O << IdSymbolic[Id];
+  } else {
+    O << Id;
+  }
+  if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) {
+    O << ", " << Offset << ", " << Width;
+  }
+  O << ')';
+}
+
+#include "AMDGPUGenAsmWriter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
new file mode 100644
index 000000000000..a6d348ff0f12
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -0,0 +1,194 @@
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+  AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  //Autogenerated by tblgen
+  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  static void printRegOperand(unsigned RegNo, raw_ostream &O,
+                              const MCRegisterInfo &MRI);
+
+private:
+  void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                     StringRef BitName);
+  void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+               raw_ostream &O);
+  void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printLWE(const MCInst *MI, unsigned OpNo,
+                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpCompr(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpVM(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
+  void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printBankMask(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBoundCtrl(const MCInst *MI, unsigned OpNo,
+                      const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSDWADstSel(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpSlot(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpAttr(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+
+
+  template <unsigned N>
+  void printExpSrcN(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc0(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc1(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc2(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpSrc3(const MCInst *MI, unsigned OpNo,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printExpTgt(const MCInst *MI, unsigned OpNo,
+                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                         StringRef Asm, StringRef Default = "");
+  static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                         char Asm);
+  void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                raw_ostream &O);
+  void printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printUpdatePred(const MCInst *MI, unsigned OpNo,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+  void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBankSwizzle(const MCInst *MI, unsigned OpNo,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                 raw_ostream &O);
+  void printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+               raw_ostream &O);
+  void printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &O);
+  void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+  void printWaitFlag(const MCInst *MI, unsigned OpNo,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                  raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
new file mode 100644
index 000000000000..ffb92aae599e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -0,0 +1,200 @@
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAsmBackend : public MCAsmBackend {
+public:
+  AMDGPUAsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
+
+  void processFixupValue(const MCAssembler &Asm,
+                         const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    llvm_unreachable("Not implemented");
+  }
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+};
+
+} //End anonymous namespace
+
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  case AMDGPU::fixup_si_sopp_br:
+    return 2;
+  case FK_SecRel_1:
+  case FK_Data_1:
+    return 1;
+  case FK_SecRel_2:
+  case FK_Data_2:
+    return 2;
+  case FK_SecRel_4:
+  case FK_Data_4:
+  case FK_PCRel_4:
+    return 4;
+  case FK_SecRel_8:
+  case FK_Data_8:
+    return 8;
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
+static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx) {
+  int64_t SignedValue = static_cast<int64_t>(Value);
+
+  switch (Fixup.getKind()) {
+  case AMDGPU::fixup_si_sopp_br: {
+    int64_t BrImm = (SignedValue - 4) / 4;
+
+    if (Ctx && !isInt<16>(BrImm))
+      Ctx->reportError(Fixup.getLoc(), "branch size exceeds simm16");
+
+    return BrImm;
+  }
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case FK_PCRel_4:
+  case FK_SecRel_4:
+    return Value;
+  default:
+    llvm_unreachable("unhandled fixup kind");
+  }
+}
+
+void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                         const MCAsmLayout &Layout,
+                                         const MCFixup &Fixup, const MCFragment *DF,
+                                         const MCValue &Target, uint64_t &Value,
+                                         bool &IsResolved) {
+  MCValue Res;
+
+  // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
+  // used for long branches, this function will be called with
+  // IsResolved = false and Value set to some pre-computed value.  In
+  // the example above, the value would be:
+  // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
+  // This is not what we want.  We just want the expression computation
+  // only.  The reason the MC layer subtracts the current offset from the
+  // expression is because the fixup is of kind FK_PCRel_4.
+  // For these scenarios, evaluateAsValue gives us the computation that we
+  // want.
+  if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) &&
+      Res.isAbsolute()) {
+    Value = Res.getConstant();
+    IsResolved = true;
+
+  }
+  if (IsResolved)
+    Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value,
+                                  bool IsPCRel) const {
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  uint32_t Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+}
+
+const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
+                                                       MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
+    // name                   offset bits  flags
+    { "fixup_si_sopp_br",     0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  OW->WriteZeros(Count);
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// ELFAMDGPUAsmBackend class
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
+  bool Is64Bit;
+  bool HasRelocationAddend;
+
+public:
+  ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) :
+      AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
+      HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { }
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           const Triple &TT, StringRef CPU,
+                                           const MCTargetOptions &Options) {
+  // Use 64-bit ELF for amdgcn
+  return new ELFAMDGPUAsmBackend(T, TT);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
new file mode 100644
index 000000000000..1847d7a67328
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -0,0 +1,87 @@
+//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+
+} // End anonymous namespace
+
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
+                                             bool HasRelocationAddend)
+  : MCELFObjectTargetWriter(Is64Bit,
+                            ELF::ELFOSABI_AMDGPU_HSA,
+                            ELF::EM_AMDGPU,
+                            HasRelocationAddend) { }
+
+unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
+                                             const MCValue &Target,
+                                             const MCFixup &Fixup,
+                                             bool IsPCRel) const {
+  if (const auto *SymA = Target.getSymA()) {
+    // SCRATCH_RSRC_DWORD[01] is a special global variable that represents
+    // the scratch buffer.
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0")
+      return ELF::R_AMDGPU_ABS32_LO;
+
+    if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1")
+      return ELF::R_AMDGPU_ABS32_HI;
+  }
+
+  switch (Target.getAccessVariant()) {
+  default:
+    break;
+  case MCSymbolRefExpr::VK_GOTPCREL:
+    return ELF::R_AMDGPU_GOTPCREL;
+  case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO:
+    return ELF::R_AMDGPU_GOTPCREL32_LO;
+  case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI:
+    return ELF::R_AMDGPU_GOTPCREL32_HI;
+  case MCSymbolRefExpr::VK_AMDGPU_REL32_LO:
+    return ELF::R_AMDGPU_REL32_LO;
+  case MCSymbolRefExpr::VK_AMDGPU_REL32_HI:
+    return ELF::R_AMDGPU_REL32_HI;
+  }
+
+  switch (Fixup.getKind()) {
+  default: break;
+  case FK_PCRel_4:
+    return ELF::R_AMDGPU_REL32;
+  case FK_Data_4:
+  case FK_SecRel_4:
+    return ELF::R_AMDGPU_ABS32;
+  case FK_Data_8:
+    return ELF::R_AMDGPU_ABS64;
+  }
+
+  llvm_unreachable("unhandled relocation type");
+}
+
+
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
+                                                  bool HasRelocationAddend,
+                                                  raw_pwrite_stream &OS) {
+  MCELFObjectTargetWriter *MOTW =
+      new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend);
+  return createELFObjectWriter(MOTW, OS, true);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
new file mode 100644
index 000000000000..43338a5bebd2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -0,0 +1,21 @@
+//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+                                           MCAsmBackend &MAB,
+                                           raw_pwrite_stream &OS,
+                                           MCCodeEmitter *Emitter,
+                                           bool RelaxAll) {
+  return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 000000000000..5319b65d65f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,39 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+  AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                  MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
new file mode 100644
index 000000000000..20c1adfbc6b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -0,0 +1,28 @@
+//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AMDGPU {
+enum Fixups {
+  /// 16-bit PC relative fixup for SOPP branch instructions.
+  fixup_si_sopp_br = FirstTargetFixupKind,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
new file mode 100644
index 000000000000..1655591abf39
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -0,0 +1,45 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
+  HasSingleParameterDotFile = false;
+  //===------------------------------------------------------------------===//
+  MinInstAlignment = 4;
+  MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16;
+  SeparatorString = "\n";
+  CommentString = ";";
+  PrivateLabelPrefix = "";
+  InlineAsmStart = ";#ASMSTART";
+  InlineAsmEnd = ";#ASMEND";
+
+  //===--- Data Emission Directives -------------------------------------===//
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  //===--- Global Variable Emission Directives --------------------------===//
+  HasAggressiveSymbolFolding = true;
+  COMMDirectiveAlignmentIsInBytes = false;
+  HasNoDeadStrip = true;
+  WeakRefDirective = ".weakref\t";
+  //===--- Dwarf Emission Directives -----------------------------------===//
+  SupportsDebugInformation = true;
+}
+
+bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
+  return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
+         SectionName == ".hsadata_global_program" ||
+         SectionName == ".hsarodata_readonly_agent" ||
+         MCAsmInfo::shouldOmitSectionDirective(SectionName);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
new file mode 100644
index 000000000000..8cb33a3179cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -0,0 +1,33 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+namespace llvm {
+
+class Triple;
+
+// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
+// you will need to make sure your new class sets PrivateGlobalPrefix to
+// a prefix that won't appear in a function name.  The default value
+// for PrivateGlobalPrefix is 'L', so it will consider any function starting
+// with 'L' as a local symbol.
+class AMDGPUMCAsmInfo : public MCAsmInfoELF {
+public:
+  explicit AMDGPUMCAsmInfo(const Triple &TT);
+  bool shouldOmitSectionDirective(StringRef SectionName) const override;
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
new file mode 100644
index 000000000000..521b3b39bba2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -0,0 +1,21 @@
+//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCCodeEmitter.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
new file mode 100644
index 000000000000..3d3858ab47ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -0,0 +1,63 @@
+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCOperand;
+class MCSubtargetInfo;
+class FeatureBitset;
+
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+  virtual void anchor();
+
+protected:
+  const MCInstrInfo &MCII;
+
+  AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+
+public:
+
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
+  virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
+
+protected:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
new file mode 100644
index 000000000000..136e6ec4ceb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -0,0 +1,112 @@
+//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUELFStreamer.h"
+#include "AMDGPUMCAsmInfo.h"
+#include "AMDGPUTargetStreamer.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "SIDefines.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AMDGPUGenRegisterInfo.inc"
+
+static MCInstrInfo *createAMDGPUMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAMDGPUMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAMDGPUMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *
+createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI) {
+  return new AMDGPUInstPrinter(MAI, MII, MRI);
+}
+
+static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S,
+                                                      formatted_raw_ostream &OS,
+                                                      MCInstPrinter *InstPrint,
+                                                      bool isVerboseAsm) {
+  return new AMDGPUTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
+                                                   MCStreamer &S,
+                                                   const MCSubtargetInfo &STI) {
+  return new AMDGPUTargetELFStreamer(S);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  if (T.getOS() == Triple::AMDHSA)
+    return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+
+  return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+extern "C" void LLVMInitializeAMDGPUTargetMC() {
+  for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
+    RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
+
+    TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo);
+    TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
+    TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
+    TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
+    TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
+  }
+
+  // R600 specific registration
+  TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
+                                        createR600MCCodeEmitter);
+
+  // GCN specific registration
+  TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
+                                        createSIMCCodeEmitter);
+
+  TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(),
+                                            createAMDGPUAsmTargetStreamer);
+  TargetRegistry::RegisterObjectTargetStreamer(
+      getTheGCNTarget(), createAMDGPUObjectTargetStreamer);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
new file mode 100644
index 000000000000..548bad56e174
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -0,0 +1,64 @@
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheAMDGPUTarget();
+Target &getTheGCNTarget();
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                       const MCRegisterInfo &MRI,
+                                       MCContext &Ctx);
+
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                     const MCRegisterInfo &MRI,
+                                     MCContext &Ctx);
+
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     const Triple &TT, StringRef CPU,
+                                     const MCTargetOptions &Options);
+
+MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
+                                            bool HasRelocationAddend,
+                                            raw_pwrite_stream &OS);
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
new file mode 100644
index 000000000000..95387ad1627c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
@@ -0,0 +1,408 @@
+//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Generates AMDGPU runtime metadata for YAML mapping.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPURuntimeMetadata.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+#include "AMDGPURuntimeMD.h"
+
+using namespace llvm;
+using namespace ::AMDGPU::RuntimeMD;
+
+static cl::opt<bool>
+DumpRuntimeMD("amdgpu-dump-rtmd",
+              cl::desc("Dump AMDGPU runtime metadata"));
+
+static cl::opt<bool>
+CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
+                     cl::desc("Check AMDGPU runtime metadata YAML parser"));
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<KernelArg::Metadata> {
+  static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
+    YamlIO.mapRequired(KeyName::ArgSize, A.Size);
+    YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
+    YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
+    YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
+    YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
+    YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
+    YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
+    YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
+    YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
+    YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
+    YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
+    YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
+    YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
+  }
+  static const bool flow = true;
+};
+
+template <> struct MappingTraits<Kernel::Metadata> {
+  static void mapping(IO &YamlIO, Kernel::Metadata &K) {
+    YamlIO.mapRequired(KeyName::KernelName, K.Name);
+    YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
+    YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
+    YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
+    YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
+    YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
+    YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
+        INVALID_KERNEL_INDEX);
+    YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
+        uint8_t(0));
+    YamlIO.mapRequired(KeyName::Args, K.Args);
+  }
+  static const bool flow = true;
+};
+
+template <> struct MappingTraits<Program::Metadata> {
+  static void mapping(IO &YamlIO, Program::Metadata &Prog) {
+    YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
+    YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
+    YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
+  }
+  static const bool flow = true;
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+// Get a vector of three integer values from MDNode \p Node;
+static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
+  assert(Node->getNumOperands() == 3);
+  std::vector<uint32_t> V;
+  for (const MDOperand &Op : Node->operands()) {
+    const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
+    V.push_back(CI->getZExtValue());
+  }
+  return V;
+}
+
+static std::string getOCLTypeName(Type *Ty, bool Signed) {
+  switch (Ty->getTypeID()) {
+  case Type::HalfTyID:
+    return "half";
+  case Type::FloatTyID:
+    return "float";
+  case Type::DoubleTyID:
+    return "double";
+  case Type::IntegerTyID: {
+    if (!Signed)
+      return (Twine('u') + getOCLTypeName(Ty, true)).str();
+    unsigned BW = Ty->getIntegerBitWidth();
+    switch (BW) {
+    case 8:
+      return "char";
+    case 16:
+      return "short";
+    case 32:
+      return "int";
+    case 64:
+      return "long";
+    default:
+      return (Twine('i') + Twine(BW)).str();
+    }
+  }
+  case Type::VectorTyID: {
+    VectorType *VecTy = cast<VectorType>(Ty);
+    Type *EleTy = VecTy->getElementType();
+    unsigned Size = VecTy->getVectorNumElements();
+    return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
+  }
+  default:
+    return "unknown";
+  }
+}
+
+static KernelArg::ValueType getRuntimeMDValueType(
+  Type *Ty, StringRef TypeName) {
+  switch (Ty->getTypeID()) {
+  case Type::HalfTyID:
+    return KernelArg::F16;
+  case Type::FloatTyID:
+    return KernelArg::F32;
+  case Type::DoubleTyID:
+    return KernelArg::F64;
+  case Type::IntegerTyID: {
+    bool Signed = !TypeName.startswith("u");
+    switch (Ty->getIntegerBitWidth()) {
+    case 8:
+      return Signed ? KernelArg::I8 : KernelArg::U8;
+    case 16:
+      return Signed ? KernelArg::I16 : KernelArg::U16;
+    case 32:
+      return Signed ? KernelArg::I32 : KernelArg::U32;
+    case 64:
+      return Signed ? KernelArg::I64 : KernelArg::U64;
+    default:
+      // Runtime does not recognize other integer types. Report as struct type.
+      return KernelArg::Struct;
+    }
+  }
+  case Type::VectorTyID:
+    return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
+  case Type::PointerTyID:
+    return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
+  default:
+    return KernelArg::Struct;
+  }
+}
+
+static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
+    AMDGPUAS::AddressSpaces A) {
+  switch (A) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    return KernelArg::Global;
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    return KernelArg::Constant;
+  case AMDGPUAS::LOCAL_ADDRESS:
+    return KernelArg::Local;
+  case AMDGPUAS::FLAT_ADDRESS:
+    return KernelArg::Generic;
+  case AMDGPUAS::REGION_ADDRESS:
+    return KernelArg::Region;
+  default:
+    return KernelArg::Private;
+  }
+}
+
+static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
+    Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
+    StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
+    StringRef AccQual = "") {
+
+  KernelArg::Metadata Arg;
+
+  // Set ArgSize and ArgAlign.
+  Arg.Size = DL.getTypeAllocSize(T);
+  Arg.Align = DL.getABITypeAlignment(T);
+  if (auto PT = dyn_cast<PointerType>(T)) {
+    auto ET = PT->getElementType();
+    if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
+      Arg.PointeeAlign = DL.getABITypeAlignment(ET);
+  }
+
+  // Set ArgTypeName.
+  Arg.TypeName = TypeName;
+
+  // Set ArgName.
+  Arg.Name = ArgName;
+
+  // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
+  SmallVector<StringRef, 1> SplitQ;
+  TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
+
+  for (StringRef KeyName : SplitQ) {
+    auto *P = StringSwitch<uint8_t *>(KeyName)
+      .Case("volatile", &Arg.IsVolatile)
+      .Case("restrict", &Arg.IsRestrict)
+      .Case("const",    &Arg.IsConst)
+      .Case("pipe",     &Arg.IsPipe)
+      .Default(nullptr);
+    if (P)
+      *P = 1;
+  }
+
+  // Set ArgKind.
+  Arg.Kind = Kind;
+
+  // Set ArgValueType.
+  Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
+
+  // Set ArgAccQual.
+  if (!AccQual.empty()) {
+    Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
+      .Case("read_only",  KernelArg::ReadOnly)
+      .Case("write_only", KernelArg::WriteOnly)
+      .Case("read_write", KernelArg::ReadWrite)
+      .Default(KernelArg::AccNone);
+  }
+
+  // Set ArgAddrQual.
+  if (auto *PT = dyn_cast<PointerType>(T)) {
+    Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
+        PT->getAddressSpace()));
+  }
+
+  return Arg;
+}
+
+static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
+  Kernel::Metadata Kernel;
+  Kernel.Name = F.getName();
+  auto &M = *F.getParent();
+
+  // Set Language and LanguageVersion.
+  if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
+    if (MD->getNumOperands() != 0) {
+      auto Node = MD->getOperand(0);
+      if (Node->getNumOperands() > 1) {
+        Kernel.Language = "OpenCL C";
+        uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
+                         ->getZExtValue();
+        uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
+                         ->getZExtValue();
+        Kernel.LanguageVersion.push_back(Major);
+        Kernel.LanguageVersion.push_back(Minor);
+      }
+    }
+  }
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (auto &Arg : F.args()) {
+    unsigned I = Arg.getArgNo();
+    Type *T = Arg.getType();
+    auto TypeName = dyn_cast<MDString>(F.getMetadata(
+        "kernel_arg_type")->getOperand(I))->getString();
+    auto BaseTypeName = cast<MDString>(F.getMetadata(
+        "kernel_arg_base_type")->getOperand(I))->getString();
+    StringRef ArgName;
+    if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
+      ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
+    auto TypeQual = cast<MDString>(F.getMetadata(
+        "kernel_arg_type_qual")->getOperand(I))->getString();
+    auto AccQual = cast<MDString>(F.getMetadata(
+        "kernel_arg_access_qual")->getOperand(I))->getString();
+    KernelArg::Kind Kind;
+    if (TypeQual.find("pipe") != StringRef::npos)
+      Kind = KernelArg::Pipe;
+    else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
+      .Case("sampler_t", KernelArg::Sampler)
+      .Case("queue_t",   KernelArg::Queue)
+      .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
+             "image2d_t" , "image2d_array_t",  KernelArg::Image)
+      .Cases("image2d_depth_t", "image2d_array_depth_t",
+             "image2d_msaa_t", "image2d_array_msaa_t",
+             "image2d_msaa_depth_t",  KernelArg::Image)
+      .Cases("image2d_array_msaa_depth_t", "image3d_t",
+             KernelArg::Image)
+      .Default(isa<PointerType>(T) ?
+                   (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
+                   KernelArg::DynamicSharedPointer :
+                   KernelArg::GlobalBuffer) :
+                   KernelArg::ByValue);
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
+        BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
+  }
+
+  // Emit hidden kernel arguments for OpenCL kernels.
+  if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
+    auto Int64T = Type::getInt64Ty(F.getContext());
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetX));
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetY));
+    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
+        KernelArg::HiddenGlobalOffsetZ));
+    if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
+      auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
+          KernelArg::Global);
+      Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
+          KernelArg::HiddenPrintfBuffer));
+    }
+  }
+
+  // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
+  if (auto RWGS = F.getMetadata("reqd_work_group_size"))
+    Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
+
+  if (auto WGSH = F.getMetadata("work_group_size_hint"))
+    Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
+
+  if (auto VTH = F.getMetadata("vec_type_hint"))
+    Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
+      VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
+      VTH->getOperand(1))->getZExtValue());
+
+  return Kernel;
+}
+
+Program::Metadata::Metadata(const std::string &YAML) {
+  yaml::Input Input(YAML);
+  Input >> *this;
+}
+
+std::string Program::Metadata::toYAML(void) {
+  std::string Text;
+  raw_string_ostream Stream(Text);
+  yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */);
+  Output << *this;
+  return Stream.str();
+}
+
+Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
+  return Program::Metadata(S);
+}
+
+// Check if the YAML string can be parsed.
+static void checkRuntimeMDYAMLString(const std::string &YAML) {
+  auto P = Program::Metadata::fromYAML(YAML);
+  auto S = P.toYAML();
+  llvm::errs() << "AMDGPU runtime metadata parser test "
+               << (YAML == S ? "passes" : "fails") << ".\n";
+  if (YAML != S) {
+    llvm::errs() << "First output: " << YAML << '\n'
+                 << "Second output: " << S << '\n';
+  }
+}
+
+std::string llvm::getRuntimeMDYAMLString(Module &M) {
+  Program::Metadata Prog;
+  Prog.MDVersionSeq.push_back(MDVersion);
+  Prog.MDVersionSeq.push_back(MDRevision);
+
+  // Set PrintfInfo.
+  if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
+    for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
+      auto Node = MD->getOperand(I);
+      if (Node->getNumOperands() > 0)
+        Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
+            ->getString());
+    }
+  }
+
+  // Set Kernels.
+  for (auto &F: M.functions()) {
+    if (!F.getMetadata("kernel_arg_type"))
+      continue;
+    Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
+  }
+
+  auto YAML = Prog.toYAML();
+
+  if (DumpRuntimeMD)
+    llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
+
+  if (CheckRuntimeMDParser)
+    checkRuntimeMDYAMLString(YAML);
+
+  return YAML;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
new file mode 100644
index 000000000000..a92fdd4bebc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
@@ -0,0 +1,26 @@
+//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares functions for generating runtime metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+// Get runtime metadata as YAML string.
+std::string getRuntimeMDYAMLString(Module &M);
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
new file mode 100644
index 000000000000..3392183d33c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -0,0 +1,242 @@
+//===-- AMDGPUTargetStreamer.cpp - Mips Target Streamer Methods -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AMDGPU specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetStreamer.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+#include "AMDGPURuntimeMD.h"
+
+namespace llvm {
+#include "AMDGPUPTNote.h"
+}
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S) {}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetAsmStreamer
+//===----------------------------------------------------------------------===//
+
+AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS)
+    : AMDGPUTargetStreamer(S), OS(OS) { }
+
+void
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+                                                           uint32_t Minor) {
+  OS << "\t.hsa_code_object_version " <<
+        Twine(Major) << "," << Twine(Minor) << '\n';
+}
+
+void
+AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
+                                                       uint32_t Minor,
+                                                       uint32_t Stepping,
+                                                       StringRef VendorName,
+                                                       StringRef ArchName) {
+  OS << "\t.hsa_code_object_isa " <<
+        Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) <<
+        ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
+
+}
+
+void
+AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+  OS << "\t.amd_kernel_code_t\n";
+  dumpAmdKernelCode(&Header, OS, "\t\t");
+  OS << "\t.end_amd_kernel_code_t\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+                                                   unsigned Type) {
+  switch (Type) {
+    default: llvm_unreachable("Invalid AMDGPU symbol type");
+    case ELF::STT_AMDGPU_HSA_KERNEL:
+      OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ;
+      break;
+  }
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+    StringRef GlobalName) {
+  OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+    StringRef GlobalName) {
+  OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
+  OS << "\t.amdgpu_runtime_metadata\n";
+  OS << getRuntimeMDYAMLString(M);
+  OS << "\n\t.end_amdgpu_runtime_metadata\n";
+}
+
+void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+  OS << "\t.amdgpu_runtime_metadata";
+  OS << Metadata;
+  OS << "\t.end_amdgpu_runtime_metadata\n";
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetELFStreamer
+//===----------------------------------------------------------------------===//
+
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S)
+    : AMDGPUTargetStreamer(S), Streamer(S) {}
+
+MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void
+AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ,
+                                        PT_NOTE::NoteType Type,
+                              std::function<void(MCELFStreamer &)> EmitDesc) {
+  auto &S = getStreamer();
+  auto &Context = S.getContext();
+
+  auto NameSZ = sizeof(PT_NOTE::NoteName);
+
+  S.PushSection();
+  S.SwitchSection(Context.getELFSection(
+    PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+  S.EmitIntValue(NameSZ, 4);                                  // namesz
+  S.EmitValue(DescSZ, 4);                                     // descz
+  S.EmitIntValue(Type, 4); // type
+  S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ));          // name
+  S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
+  EmitDesc(S);                                                // desc
+  S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
+  S.PopSection();
+}
+
+void
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+                                                           uint32_t Minor) {
+
+  EmitAMDGPUNote(
+    MCConstantExpr::create(8, getContext()),
+    PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+    [&](MCELFStreamer &OS){
+      OS.EmitIntValue(Major, 4);
+      OS.EmitIntValue(Minor, 4);
+    }
+  );
+}
+
+void
+AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
+                                                       uint32_t Minor,
+                                                       uint32_t Stepping,
+                                                       StringRef VendorName,
+                                                       StringRef ArchName) {
+  uint16_t VendorNameSize = VendorName.size() + 1;
+  uint16_t ArchNameSize = ArchName.size() + 1;
+  
+  unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
+    sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
+    VendorNameSize + ArchNameSize;
+
+  EmitAMDGPUNote(
+    MCConstantExpr::create(DescSZ, getContext()),
+    PT_NOTE::NT_AMDGPU_HSA_ISA,
+    [&](MCELFStreamer &OS) {
+      OS.EmitIntValue(VendorNameSize, 2);
+      OS.EmitIntValue(ArchNameSize, 2);
+      OS.EmitIntValue(Major, 4);
+      OS.EmitIntValue(Minor, 4);
+      OS.EmitIntValue(Stepping, 4);
+      OS.EmitBytes(VendorName);
+      OS.EmitIntValue(0, 1); // NULL terminate VendorName
+      OS.EmitBytes(ArchName);
+      OS.EmitIntValue(0, 1); // NULL terminte ArchName
+    }
+  );
+}
+
+void
+AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+
+  MCStreamer &OS = getStreamer();
+  OS.PushSection();
+  OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
+  OS.PopSection();
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+                                                   unsigned Type) {
+  MCSymbolELF *Symbol = cast<MCSymbolELF>(
+      getStreamer().getContext().getOrCreateSymbol(SymbolName));
+  Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+    StringRef GlobalName) {
+
+  MCSymbolELF *Symbol = cast<MCSymbolELF>(
+      getStreamer().getContext().getOrCreateSymbol(GlobalName));
+  Symbol->setType(ELF::STT_OBJECT);
+  Symbol->setBinding(ELF::STB_LOCAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+    StringRef GlobalName) {
+
+  MCSymbolELF *Symbol = cast<MCSymbolELF>(
+      getStreamer().getContext().getOrCreateSymbol(GlobalName));
+  Symbol->setType(ELF::STT_OBJECT);
+  Symbol->setBinding(ELF::STB_GLOBAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+  // Create two labels to mark the beginning and end of the desc field
+  // and a MCExpr to calculate the size of the desc field.
+  auto &Context = getContext();
+  auto *DescBegin = Context.createTempSymbol();
+  auto *DescEnd = Context.createTempSymbol();
+  auto *DescSZ = MCBinaryExpr::createSub(
+    MCSymbolRefExpr::create(DescEnd, Context),
+    MCSymbolRefExpr::create(DescBegin, Context), Context);
+
+  EmitAMDGPUNote(
+    DescSZ,
+    PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+    [&](MCELFStreamer &OS) {
+      OS.EmitLabel(DescBegin);
+      OS.EmitBytes(Metadata);
+      OS.EmitLabel(DescEnd);
+    }
+  );
+}
+
+void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
+  EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
new file mode 100644
index 000000000000..e2f20586903d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -0,0 +1,111 @@
+//===-- AMDGPUTargetStreamer.h - AMDGPU Target Streamer --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+
+#include "AMDKernelCodeT.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+#include "AMDGPUPTNote.h"
+
+class DataLayout;
+class Function;
+class MCELFStreamer;
+class MCSymbol;
+class MDNode;
+class Module;
+class Type;
+
+class AMDGPUTargetStreamer : public MCTargetStreamer {
+protected:
+  MCContext &getContext() const { return Streamer.getContext(); }
+
+public:
+  AMDGPUTargetStreamer(MCStreamer &S);
+  virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+                                                 uint32_t Minor) = 0;
+
+  virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
+                                             uint32_t Stepping,
+                                             StringRef VendorName,
+                                             StringRef ArchName) = 0;
+
+  virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
+
+  virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+
+  virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
+
+  virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
+
+  virtual void EmitRuntimeMetadata(Module &M) = 0;
+
+  virtual void EmitRuntimeMetadata(StringRef Metadata) = 0;
+};
+
+class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
+  formatted_raw_ostream &OS;
+public:
+  AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+                                         uint32_t Minor) override;
+
+  void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
+                                     uint32_t Stepping, StringRef VendorName,
+                                     StringRef ArchName) override;
+
+  void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+
+  void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+  void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+  void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
+
+  void EmitRuntimeMetadata(Module &M) override;
+
+  void EmitRuntimeMetadata(StringRef Metadata) override;
+};
+
+class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
+  MCStreamer &Streamer;
+
+  void EmitAMDGPUNote(const MCExpr* DescSize,
+                      AMDGPU::PT_NOTE::NoteType Type,
+                      std::function<void(MCELFStreamer &)> EmitDesc);
+
+public:
+  AMDGPUTargetELFStreamer(MCStreamer &S);
+
+  MCELFStreamer &getStreamer();
+
+  void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
+                                         uint32_t Minor) override;
+
+  void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
+                                     uint32_t Stepping, StringRef VendorName,
+                                     StringRef ArchName) override;
+
+  void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+
+  void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+  void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+  void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
+
+  void EmitRuntimeMetadata(Module &M) override;
+
+  void EmitRuntimeMetadata(StringRef Metadata) override;
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
new file mode 100644
index 000000000000..6015ec190fd4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -0,0 +1,189 @@
+//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+namespace {
+
+class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+  const MCRegisterInfo &MRI;
+
+public:
+  R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
+    : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+  R600MCCodeEmitter(const R600MCCodeEmitter &) = delete;
+  R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
+
+  /// \brief Encode the instruction and write it to the OS.
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// \returns the encoding for an MCOperand.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+
+private:
+  void Emit(uint32_t value, raw_ostream &OS) const;
+  void Emit(uint64_t value, raw_ostream &OS) const;
+
+  unsigned getHWReg(unsigned regNo) const;
+};
+
+} // end anonymous namespace
+
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y,
+  ELEMENT_Z,
+  ELEMENT_W
+};
+
+enum FCInstr {
+  FC_IF_PREDICATE = 0,
+  FC_ELSE,
+  FC_ENDIF,
+  FC_BGNLOOP,
+  FC_ENDLOOP,
+  FC_BREAK_PREDICATE,
+  FC_CONTINUE
+};
+
+MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                             const MCRegisterInfo &MRI,
+                                             MCContext &Ctx) {
+  return new R600MCCodeEmitter(MCII, MRI);
+}
+
+void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (MI.getOpcode() == AMDGPU::RETURN ||
+    MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
+    MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
+    MI.getOpcode() == AMDGPU::BUNDLE ||
+    MI.getOpcode() == AMDGPU::KILL) {
+    return;
+  } else if (IS_VTX(Desc)) {
+    uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
+    uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+    if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) {
+      InstWord2 |= 1 << 19; // Mega-Fetch bit
+    }
+
+    Emit(InstWord01, OS);
+    Emit(InstWord2, OS);
+    Emit((uint32_t) 0, OS);
+  } else if (IS_TEX(Desc)) {
+      int64_t Sampler = MI.getOperand(14).getImm();
+
+      int64_t SrcSelect[4] = {
+        MI.getOperand(2).getImm(),
+        MI.getOperand(3).getImm(),
+        MI.getOperand(4).getImm(),
+        MI.getOperand(5).getImm()
+      };
+      int64_t Offsets[3] = {
+        MI.getOperand(6).getImm() & 0x1F,
+        MI.getOperand(7).getImm() & 0x1F,
+        MI.getOperand(8).getImm() & 0x1F
+      };
+
+      uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
+      uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
+          SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
+          SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+          Offsets[2] << 10;
+
+      Emit(Word01, OS);
+      Emit(Word2, OS);
+      Emit((uint32_t) 0, OS);
+  } else {
+    uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
+    if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) &&
+       ((Desc.TSFlags & R600_InstFlag::OP1) ||
+         Desc.TSFlags & R600_InstFlag::OP2)) {
+      uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
+      Inst &= ~(0x3FFULL << 39);
+      Inst |= ISAOpCode << 1;
+    }
+    Emit(Inst, OS);
+  }
+}
+
+void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
+  support::endian::Writer<support::little>(OS).write(Value);
+}
+
+void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
+  support::endian::Writer<support::little>(OS).write(Value);
+}
+
+unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
+  return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
+}
+
+uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                              const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
+      return MRI.getEncodingValue(MO.getReg());
+    return getHWReg(MO.getReg());
+  }
+
+  if (MO.isExpr()) {
+    // We put rodata at the end of code section, then map the entire
+    // code secetion as vtx buf. Thus the section relative address is the
+    // correct one.
+    // Each R600 literal instruction has two operands
+    // We can't easily get the order of the current one, so compare against
+    // the first one and adjust offset.
+    const unsigned offset = (&MO == &MI.getOperand(0)) ? 0 : 4;
+    Fixups.push_back(MCFixup::create(offset, MO.getExpr(), FK_SecRel_4, MI.getLoc()));
+    return 0;
+  }
+
+  assert(MO.isImm());
+  return MO.getImm();
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
new file mode 100644
index 000000000000..0c5bb0648a16
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -0,0 +1,335 @@
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The SI code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace llvm;
+
+namespace {
+
+class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Encode an fp or int literal
+  uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
+                          const MCSubtargetInfo &STI) const;
+
+public:
+  SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+                  MCContext &ctx)
+      : AMDGPUMCCodeEmitter(mcii), MRI(mri) {}
+  SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
+  SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
+
+  /// \brief Encode the instruction and write it to the OS.
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// \returns the encoding for an MCOperand.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+
+  /// \brief Use a fixup to encode the simm16 field for SOPP branch
+  ///        instructions.
+  unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           MCContext &Ctx) {
+  return new SIMCCodeEmitter(MCII, MRI, Ctx);
+}
+
+// Returns the encoding value to use if the given integer is an integer inline
+// immediate value, or 0 if it is not.
+template <typename IntTy>
+static uint32_t getIntInlineImmEncoding(IntTy Imm) {
+  if (Imm >= 0 && Imm <= 64)
+    return 128 + Imm;
+
+  if (Imm >= -16 && Imm <= -1)
+    return 192 + std::abs(Imm);
+
+  return 0;
+}
+
+static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
+  uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+  if (IntImm != 0)
+    return IntImm;
+
+  if (Val == 0x3800) // 0.5
+    return 240;
+
+  if (Val == 0xB800) // -0.5
+    return 241;
+
+  if (Val == 0x3C00) // 1.0
+    return 242;
+
+  if (Val == 0xBC00) // -1.0
+    return 243;
+
+  if (Val == 0x4000) // 2.0
+    return 244;
+
+  if (Val == 0xC000) // -2.0
+    return 245;
+
+  if (Val == 0x4400) // 4.0
+    return 246;
+
+  if (Val == 0xC400) // -4.0
+    return 247;
+
+  if (Val == 0x3118 && // 1.0 / (2.0 * pi)
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    return 248;
+
+  return 255;
+}
+
+static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
+  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
+  if (IntImm != 0)
+    return IntImm;
+
+  if (Val == FloatToBits(0.5f))
+    return 240;
+
+  if (Val == FloatToBits(-0.5f))
+    return 241;
+
+  if (Val == FloatToBits(1.0f))
+    return 242;
+
+  if (Val == FloatToBits(-1.0f))
+    return 243;
+
+  if (Val == FloatToBits(2.0f))
+    return 244;
+
+  if (Val == FloatToBits(-2.0f))
+    return 245;
+
+  if (Val == FloatToBits(4.0f))
+    return 246;
+
+  if (Val == FloatToBits(-4.0f))
+    return 247;
+
+  if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi)
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    return 248;
+
+  return 255;
+}
+
+static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
+  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
+  if (IntImm != 0)
+    return IntImm;
+
+  if (Val == DoubleToBits(0.5))
+    return 240;
+
+  if (Val == DoubleToBits(-0.5))
+    return 241;
+
+  if (Val == DoubleToBits(1.0))
+    return 242;
+
+  if (Val == DoubleToBits(-1.0))
+    return 243;
+
+  if (Val == DoubleToBits(2.0))
+    return 244;
+
+  if (Val == DoubleToBits(-2.0))
+    return 245;
+
+  if (Val == DoubleToBits(4.0))
+    return 246;
+
+  if (Val == DoubleToBits(-4.0))
+    return 247;
+
+  if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi)
+      STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    return 248;
+
+  return 255;
+}
+
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
+                                         const MCOperandInfo &OpInfo,
+                                         const MCSubtargetInfo &STI) const {
+  int64_t Imm;
+  if (MO.isExpr()) {
+    const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+    if (!C)
+      return 255;
+
+    Imm = C->getValue();
+  } else {
+
+    assert(!MO.isFPImm());
+
+    if (!MO.isImm())
+      return ~0;
+
+    Imm = MO.getImm();
+  }
+
+  switch (AMDGPU::getOperandSize(OpInfo)) {
+  case 4:
+    return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+  case 8:
+    return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+  case 2:
+    return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+  default:
+    llvm_unreachable("invalid operand size");
+  }
+}
+
+void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  unsigned bytes = Desc.getSize();
+
+  for (unsigned i = 0; i < bytes; i++) {
+    OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
+  }
+
+  if (bytes > 4)
+    return;
+
+  // Check for additional literals in SRC0/1/2 (Op 1/2/3)
+  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+
+    // Check if this operand should be encoded as [SV]Src
+    if (!AMDGPU::isSISrcOperand(Desc, i))
+      continue;
+
+    // Is this operand a literal immediate?
+    const MCOperand &Op = MI.getOperand(i);
+    if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255)
+      continue;
+
+    // Yes! Encode it
+    int64_t Imm = 0;
+
+    if (Op.isImm())
+      Imm = Op.getImm();
+    else if (Op.isExpr()) {
+      if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+        Imm = C->getValue();
+
+    } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+      llvm_unreachable("Must be immediate or expr");
+
+    for (unsigned j = 0; j < 4; j++) {
+      OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff));
+    }
+
+    // Only one literal value allowed
+    break;
+  }
+}
+
+unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
+    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+    return 0;
+  }
+
+  return getMachineOpValue(MI, MO, Fixups, STI);
+}
+
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                            const MCOperand &MO,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+
+  if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
+    const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+    MCFixupKind Kind;
+    if (Expr && Expr->getSymbol().isExternal())
+      Kind = FK_Data_4;
+    else
+      Kind = FK_PCRel_4;
+    Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
+  }
+
+  // Figure out the operand number, needed for isSrcOperand check
+  unsigned OpNo = 0;
+  for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
+    if (&MO == &MI.getOperand(OpNo))
+      break;
+  }
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
+    uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
+    if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
+      return Enc;
+
+  } else if (MO.isImm())
+    return MO.getImm();
+
+  llvm_unreachable("Encoding of this operand type is not supported yet.");
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
new file mode 100644
index 000000000000..46803e555711
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -0,0 +1,763 @@
+//===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MIMG_Mask <string op, int channels> {
+  string Op = op;
+  int Channels = channels;
+}
+
+class mimg <bits<7> si, bits<7> vi = si> {
+  field bits<7> SI = si;
+  field bits<7> VI = vi;
+}
+
+class MIMG_Helper <dag outs, dag ins, string asm,
+                   string dns=""> : MIMG<outs, ins, asm,[]> {
+  let mayLoad = 1;
+  let mayStore = 0;
+  let hasPostISelHook = 1;
+  let DecoderNamespace = dns;
+  let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
+  let AsmMatchConverter = "cvtMIMG";
+  let usesCustomInserter = 1;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+                             RegisterClass dst_rc,
+                             RegisterClass addr_rc,
+                             string dns=""> : MIMG_Helper <
+  (outs dst_rc:$vdata),
+  (ins addr_rc:$vaddr, SReg_256:$srsrc,
+       dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+       r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+  asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+  dns>, MIMGe<op> {
+  let ssamp = 0;
+}
+
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+                                      RegisterClass dst_rc,
+                                      int channels> {
+  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
+                                   !if(!eq(channels, 1), "AMDGPU", "")>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>;
+  defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Store_Helper <bits<7> op, string asm,
+                         RegisterClass data_rc,
+                         RegisterClass addr_rc> : MIMG_Helper <
+  (outs),
+  (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+       dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+       r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+  asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+     >, MIMGe<op> {
+  let ssamp = 0;
+  let mayLoad = 1; // TableGen requires this for matching with the intrinsics
+  let mayStore = 1;
+  let hasSideEffects = 1;
+  let hasPostISelHook = 0;
+  let DisableWQM = 1;
+}
+
+multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
+                                  RegisterClass data_rc,
+                                  int channels> {
+  def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_Store <bits<7> op, string asm> {
+  defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>;
+  defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Atomic_Helper <string asm, RegisterClass data_rc,
+                          RegisterClass addr_rc> : MIMG_Helper <
+    (outs data_rc:$vdst),
+    (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
+         dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+         r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+    asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"
+  > {
+  let mayStore = 1;
+  let hasSideEffects = 1;
+  let hasPostISelHook = 0;
+  let DisableWQM = 1;
+  let Constraints = "$vdst = $vdata";
+  let AsmMatchConverter = "cvtMIMGAtomic";
+}
+
+class MIMG_Atomic_Real_si<mimg op, string name, string asm,
+  RegisterClass data_rc, RegisterClass addr_rc> :
+  MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+  SIMCInstr<name, SIEncodingFamily.SI>,
+  MIMGe<op.SI> {
+  let isCodeGenOnly = 0;
+  let AssemblerPredicates = [isSICI];
+  let DecoderNamespace = "SICI";
+  let DisableDecoder = DisableSIDecoder;
+}
+
+class MIMG_Atomic_Real_vi<mimg op, string name, string asm,
+  RegisterClass data_rc, RegisterClass addr_rc> :
+  MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+  SIMCInstr<name, SIEncodingFamily.VI>,
+  MIMGe<op.VI> {
+  let isCodeGenOnly = 0;
+  let AssemblerPredicates = [isVI];
+  let DecoderNamespace = "VI";
+  let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm,
+                                 RegisterClass data_rc, RegisterClass addr_rc> {
+  let isPseudo = 1, isCodeGenOnly = 1 in {
+    def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>,
+             SIMCInstr<name, SIEncodingFamily.NONE>;
+  }
+
+  let ssamp = 0 in {
+    def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>;
+
+    def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>;
+  }
+}
+
+multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> {
+  defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>;
+  defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>;
+  defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+                           RegisterClass dst_rc,
+                           RegisterClass src_rc,
+                           bit wqm,
+                           string dns=""> : MIMG_Helper <
+  (outs dst_rc:$vdata),
+  (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+       dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+       r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+  asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+  dns>, MIMGe<op> {
+  let WQM = wqm;
+}
+
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels, bit wqm> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm,
+                                 !if(!eq(channels, 1), "AMDGPU", "")>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> {
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>;
+
+class MIMG_Gather_Helper <bits<7> op, string asm,
+                          RegisterClass dst_rc,
+                          RegisterClass src_rc, bit wqm> : MIMG <
+  (outs dst_rc:$vdata),
+  (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp,
+       dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc,
+       r128:$r128, tfe:$tfe, lwe:$lwe, da:$da),
+  asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da",
+  []>, MIMGe<op> {
+  let mayLoad = 1;
+  let mayStore = 0;
+
+  // DMASK was repurposed for GATHER4. 4 components are always
+  // returned and DMASK works like a swizzle - it selects
+  // the component to fetch. The only useful DMASK values are
+  // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
+  // (red,red,red,red) etc.) The ISA document doesn't mention
+  // this.
+  // Therefore, disable all code which updates DMASK by setting this:
+  let Gather4 = 1;
+  let hasPostISelHook = 0;
+  let WQM = wqm;
+
+  let isAsmParserOnly = 1; // TBD: fix it later
+}
+
+multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels, bit wqm> {
+  def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> {
+  defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>;
+  defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>;
+  defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>;
+  defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>;
+}
+
+multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>;
+
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
+let SubtargetPredicate = isGCN in {
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
+defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">;
+defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
+defm IMAGE_SAMPLE           : MIMG_Sampler_WQM <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL        : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE_D         : MIMG_Sampler <0x00000022, "image_sample_d">;
+defm IMAGE_SAMPLE_D_CL      : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
+defm IMAGE_SAMPLE_L         : MIMG_Sampler <0x00000024, "image_sample_l">;
+defm IMAGE_SAMPLE_B         : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL      : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_LZ        : MIMG_Sampler <0x00000027, "image_sample_lz">;
+defm IMAGE_SAMPLE_C         : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL      : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C_D       : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
+defm IMAGE_SAMPLE_C_D_CL    : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
+defm IMAGE_SAMPLE_C_L       : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
+defm IMAGE_SAMPLE_C_B       : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL    : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_LZ      : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
+defm IMAGE_SAMPLE_O         : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O      : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_D_O       : MIMG_Sampler <0x00000032, "image_sample_d_o">;
+defm IMAGE_SAMPLE_D_CL_O    : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
+defm IMAGE_SAMPLE_L_O       : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O       : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O    : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O      : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O       : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O    : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O     : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O  : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O     : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O     : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O  : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O    : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4          : MIMG_Gather_WQM <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL       : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B        : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL     : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C        : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL     : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B      : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O        : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O     : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O      : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O      : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O    : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD          : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD        : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL     : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD      : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL   : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O      : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O   : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O    : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+}
+
+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/
+
+// Image + sampler
+class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
+        i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode $addr, $rsrc, $sampler,
+          (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
+>;
+
+multiclass SampleRawPatterns<SDPatternOperator name, string opcode> {
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
+}
+
+// Image + sampler for amdgcn
+// TODO:
+// 1. Handle half data type like v4f16, and add D16 bit support;
+// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128).
+// 3. Add A16 support when we pass address of half type.
+multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt>  {
+  def : Pat<
+    (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc,
+        i1:$slc, i1:$lwe, i1:$da)),
+    (opcode $addr, $rsrc, $sampler,
+          (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+          0, 0, (as_i1imm $lwe), (as_i1imm $da))
+    >;
+}
+
+multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+  defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>;
+  defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>;
+  defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>;
+  defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>;
+  defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>;
+}
+
+// TODO: support v3f32.
+multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> {
+  defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+  defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+  defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+// Image only
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm,
+        imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe),
+  (opcode $addr, $rsrc,
+          (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc),
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da))
+>;
+
+multiclass ImagePatterns<SDPatternOperator name, string opcode> {
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>;
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+  def : Pat <
+    (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe,
+                i1:$da)),
+    (opcode $addr, $rsrc,
+          (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+          0, 0, (as_i1imm $lwe), (as_i1imm $da))
+  >;
+}
+
+multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+  defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+  defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+  defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32.
+multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
+  defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+  defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+  defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> {
+  def : Pat <
+    (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc,
+          i1:$lwe, i1:$da),
+    (opcode $data, $addr, $rsrc,
+          (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc),
+          0, 0, (as_i1imm $lwe), (as_i1imm $da))
+  >;
+}
+
+multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> {
+  defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>;
+  defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>;
+  defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>;
+}
+
+// TODO: support v3f32.
+multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
+  defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>;
+  defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>;
+  defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+}
+
+class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc),
+  (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da))
+>;
+
+multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> {
+  def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>;
+  def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>;
+  def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>;
+}
+
+class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
+  (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc,
+                                   imm:$r128, imm:$da, imm:$slc),
+  (EXTRACT_SUBREG
+    (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1),
+            $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)),
+    sub0)
+>;
+
+// ======= SI Image Intrinsics ================
+
+// Image load
+defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
+defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
+
+// Basic sample
+defm : SampleRawPatterns<int_SI_image_sample,           "IMAGE_SAMPLE">;
+defm : SampleRawPatterns<int_SI_image_sample_cl,        "IMAGE_SAMPLE_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_d,         "IMAGE_SAMPLE_D">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_l,         "IMAGE_SAMPLE_L">;
+defm : SampleRawPatterns<int_SI_image_sample_b,         "IMAGE_SAMPLE_B">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_cd,        "IMAGE_SAMPLE_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : SampleRawPatterns<int_SI_image_sample_c,         "IMAGE_SAMPLE_C">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl,      "IMAGE_SAMPLE_C_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d,       "IMAGE_SAMPLE_C_D">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl,    "IMAGE_SAMPLE_C_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l,       "IMAGE_SAMPLE_C_L">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b,       "IMAGE_SAMPLE_C_B">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl,    "IMAGE_SAMPLE_C_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz,      "IMAGE_SAMPLE_C_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd,      "IMAGE_SAMPLE_C_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl,   "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : SampleRawPatterns<int_SI_image_sample_o,         "IMAGE_SAMPLE_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cl_o,      "IMAGE_SAMPLE_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_o,       "IMAGE_SAMPLE_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl_o,    "IMAGE_SAMPLE_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_l_o,       "IMAGE_SAMPLE_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_o,       "IMAGE_SAMPLE_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl_o,    "IMAGE_SAMPLE_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_lz_o,      "IMAGE_SAMPLE_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_o,      "IMAGE_SAMPLE_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o,   "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : SampleRawPatterns<int_SI_image_sample_c_o,       "IMAGE_SAMPLE_C_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl_o,    "IMAGE_SAMPLE_C_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_o,     "IMAGE_SAMPLE_C_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o,  "IMAGE_SAMPLE_C_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l_o,     "IMAGE_SAMPLE_C_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_o,     "IMAGE_SAMPLE_C_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o,  "IMAGE_SAMPLE_C_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz_o,    "IMAGE_SAMPLE_C_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_o,    "IMAGE_SAMPLE_C_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+// Only the variants which make sense are defined.
+def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V2,        v2i32>;
+def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V4,        v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl,        IMAGE_GATHER4_CL_V4_V4,     v4i32>;
+def : SampleRawPattern<int_SI_gather4_l,         IMAGE_GATHER4_L_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_b,         IMAGE_GATHER4_B_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V2,     v2i32>;
+def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V4,     v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c,         IMAGE_GATHER4_C_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl,    IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz,      IMAGE_GATHER4_C_LZ_V4_V4,   v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_o,         IMAGE_GATHER4_O_V4_V4,      v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V4,   v4i32>;
+def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V8,   v8i32>;
+def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_b_cl_o,    IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_lz_o,      IMAGE_GATHER4_LZ_O_V4_V4,   v4i32>;
+
+def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V4,    v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V8,    v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_cl_o,    IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_l_o,     IMAGE_GATHER4_C_L_O_V4_V8,  v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_o,     IMAGE_GATHER4_C_B_O_V4_V8,  v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_b_cl_o,  IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
+def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
+
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
+def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
+
+// ======= amdgcn Image Intrinsics ==============
+
+// Image load
+defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">;
+defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">;
+defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">;
+
+// Image store
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
+
+// Basic sample
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample,           "IMAGE_SAMPLE">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl,        "IMAGE_SAMPLE_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d,         "IMAGE_SAMPLE_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l,         "IMAGE_SAMPLE_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b,         "IMAGE_SAMPLE_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd,        "IMAGE_SAMPLE_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c,         "IMAGE_SAMPLE_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl,      "IMAGE_SAMPLE_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d,       "IMAGE_SAMPLE_C_D">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl,    "IMAGE_SAMPLE_C_D_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l,       "IMAGE_SAMPLE_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b,       "IMAGE_SAMPLE_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl,    "IMAGE_SAMPLE_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz,      "IMAGE_SAMPLE_C_LZ">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd,      "IMAGE_SAMPLE_C_CD">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl,   "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o,         "IMAGE_SAMPLE_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o,      "IMAGE_SAMPLE_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o,       "IMAGE_SAMPLE_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o,    "IMAGE_SAMPLE_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o,       "IMAGE_SAMPLE_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o,       "IMAGE_SAMPLE_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o,    "IMAGE_SAMPLE_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o,      "IMAGE_SAMPLE_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o,      "IMAGE_SAMPLE_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o,   "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o,       "IMAGE_SAMPLE_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o,    "IMAGE_SAMPLE_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o,     "IMAGE_SAMPLE_C_D_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o,  "IMAGE_SAMPLE_C_D_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o,     "IMAGE_SAMPLE_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o,     "IMAGE_SAMPLE_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o,  "IMAGE_SAMPLE_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o,    "IMAGE_SAMPLE_C_LZ_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o,    "IMAGE_SAMPLE_C_CD_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4,           "IMAGE_GATHER4">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl,        "IMAGE_GATHER4_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l,         "IMAGE_GATHER4_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b,         "IMAGE_GATHER4_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl,      "IMAGE_GATHER4_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz,        "IMAGE_GATHER4_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c,         "IMAGE_GATHER4_C">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl,      "IMAGE_GATHER4_C_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l,       "IMAGE_GATHER4_C_L">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b,       "IMAGE_GATHER4_C_B">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl,    "IMAGE_GATHER4_C_B_CL">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz,      "IMAGE_GATHER4_C_LZ">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o,         "IMAGE_GATHER4_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o,      "IMAGE_GATHER4_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o,       "IMAGE_GATHER4_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o,       "IMAGE_GATHER4_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o,    "IMAGE_GATHER4_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o,      "IMAGE_GATHER4_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o,       "IMAGE_GATHER4_C_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o,    "IMAGE_GATHER4_C_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o,     "IMAGE_GATHER4_C_L_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o,     "IMAGE_GATHER4_C_B_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o,  "IMAGE_GATHER4_C_B_CL_O">;
+defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o,    "IMAGE_GATHER4_C_LZ_O">;
+
+defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">;
+
+// Image atomics
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>;
+def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">;
+defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">;
+
+/* SIsample for simple 1D texture lookup */
+def : Pat <
+  (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+  (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm),
+    (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT),
+    (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY),
+    (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+>;
+
+class SampleShadowPattern<SDNode name, MIMG opcode,
+                          ValueType vt> : Pat <
+    (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW),
+    (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0)
+>;
+
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
+                               ValueType vt> : Pat <
+    (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
+    (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1)
+>;
+
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+  def : SamplePattern <SIsample, sample, addr_type>;
+  def : SampleRectPattern <SIsample, sample, addr_type>;
+  def : SampleArrayPattern <SIsample, sample, addr_type>;
+  def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+  def : SamplePattern <SIsamplel, sample_l, addr_type>;
+  def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+  def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <SIsampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <SIsampled, sample_d, addr_type>;
+  def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+  def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
+}
+
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+                      IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+                      IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+                      IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+                      v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+                      IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+                      IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+                      IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+                      v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+                      IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+                      IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+                      IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+                      v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+                      IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+                      IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+                      IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+                      v16i32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
new file mode 100644
index 000000000000..3c07cc76b9a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td
@@ -0,0 +1,189 @@
+//===-- Processors.td - R600 Processor definitions ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+
+//===----------------------------------------------------------------------===//
+// R600
+//===----------------------------------------------------------------------===//
+def : Proc<"r600",       R600_VLIW5_Itin,
+    [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630",       R600_VLIW5_Itin,
+    [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rs880",      R600_VLIW5_Itin,
+    [FeatureR600, FeatureWavefrontSize16]>;
+
+def : Proc<"rv670",      R600_VLIW5_Itin,
+    [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// R700
+//===----------------------------------------------------------------------===//
+
+def : Proc<"rv710",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv730",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
+
+def : Proc<"rv770",      R600_VLIW5_Itin,
+    [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Evergreen
+//===----------------------------------------------------------------------===//
+
+def : Proc<"cedar",      R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+     FeatureCFALUBug]>;
+
+def : Proc<"redwood",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+     FeatureCFALUBug]>;
+
+def : Proc<"sumo",       R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
+
+def : Proc<"juniper",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"cypress",    R600_VLIW5_Itin,
+    [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+     FeatureWavefrontSize64]>;
+
+//===----------------------------------------------------------------------===//
+// Northern Islands
+//===----------------------------------------------------------------------===//
+
+def : Proc<"barts",      R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"turks",      R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
+
+def : Proc<"caicos",     R600_VLIW5_Itin,
+    [FeatureNorthernIslands, FeatureCFALUBug]>;
+
+def : Proc<"cayman",     R600_VLIW4_Itin,
+    [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+
+//===----------------------------------------------------------------------===//
+// Southern Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"SI", SIFullSpeedModel,
+  [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+>;
+
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+  [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+>;
+
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"verde",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"oland",    SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+def : ProcessorModel<"hainan",   SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+
+//===----------------------------------------------------------------------===//
+// Sea Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"bonaire",    SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"kabini",     SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_2]
+>;
+
+def : ProcessorModel<"kaveri",     SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"hawaii",     SIFullSpeedModel,
+  [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"mullins",    SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_2]>;
+
+def : ProcessorModel<"gfx700",     SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_0]
+>;
+
+def : ProcessorModel<"gfx701",     SIFullSpeedModel,
+  [FeatureISAVersion7_0_1]
+>;
+
+def : ProcessorModel<"gfx702",     SIQuarterSpeedModel,
+  [FeatureISAVersion7_0_2]
+>;
+
+//===----------------------------------------------------------------------===//
+// Volcanic Islands
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"tonga",   SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"iceland", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"fiji",    SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"stoney",  SIQuarterSpeedModel,
+  [FeatureISAVersion8_1_0]
+>;
+
+def : ProcessorModel<"polaris10", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"polaris11", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx800", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_0]
+>;
+
+def : ProcessorModel<"gfx801", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_1]
+>;
+
+def : ProcessorModel<"gfx802", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_2]
+>;
+
+def : ProcessorModel<"gfx803", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_3]
+>;
+
+def : ProcessorModel<"gfx804", SIQuarterSpeedModel,
+  [FeatureISAVersion8_0_4]
+>;
+
+def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
+  [FeatureISAVersion8_1_0]
+>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
new file mode 100644
index 000000000000..d0aba38f786d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -0,0 +1,213 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// This pass is merging consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600mergeclause"
+
+namespace {
+
+static bool isCFAlu(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr &MI) const;
+  bool isCFAluEnabled(const MachineInstr &MI) const;
+
+  /// IfCvt pass can generate "disabled" ALU clause marker that need to be
+  /// removed and their content affected to the previous alu clause.
+  /// This function parse instructions after CFAlu until it find a disabled
+  /// CFAlu and merge the content, or an enabled CFAlu.
+  void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
+  /// it is the case.
+  bool mergeIfPossible(MachineInstr &RootCFAlu,
+                       const MachineInstr &LatrCFAlu) const;
+
+public:
+  R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const {
+  assert(isCFAlu(MI));
+  return MI
+      .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT))
+      .getImm();
+}
+
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const {
+  assert(isCFAlu(MI));
+  return MI
+      .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled))
+      .getImm();
+}
+
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(
+    MachineInstr &CFAlu) const {
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end();
+  I++;
+  do {
+    while (I != E && !isCFAlu(*I))
+      I++;
+    if (I == E)
+      return;
+    MachineInstr &MI = *I++;
+    if (isCFAluEnabled(MI))
+      break;
+    CFAlu.getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+    MI.eraseFromParent();
+  } while (I != E);
+}
+
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu,
+                                          const MachineInstr &LatrCFAlu) const {
+  assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  unsigned RootInstCount = getCFAluSize(RootCFAlu),
+      LaterInstCount = getCFAluSize(LatrCFAlu);
+  unsigned CumuledInsts = RootInstCount + LaterInstCount;
+  if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+    DEBUG(dbgs() << "Excess inst counts\n");
+    return false;
+  }
+  if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+    return false;
+  // Is KCache Bank 0 compatible ?
+  int Mode0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+  int KBank0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+  int KBank0LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+  if (LatrCFAlu.getOperand(Mode0Idx).getImm() &&
+      RootCFAlu.getOperand(Mode0Idx).getImm() &&
+      (LatrCFAlu.getOperand(KBank0Idx).getImm() !=
+           RootCFAlu.getOperand(KBank0Idx).getImm() ||
+       LatrCFAlu.getOperand(KBank0LineIdx).getImm() !=
+           RootCFAlu.getOperand(KBank0LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  // Is KCache Bank 1 compatible ?
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu.getOperand(Mode1Idx).getImm() &&
+      RootCFAlu.getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu.getOperand(KBank1Idx).getImm() !=
+           RootCFAlu.getOperand(KBank1Idx).getImm() ||
+       LatrCFAlu.getOperand(KBank1LineIdx).getImm() !=
+           RootCFAlu.getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  if (LatrCFAlu.getOperand(Mode0Idx).getImm()) {
+    RootCFAlu.getOperand(Mode0Idx).setImm(
+        LatrCFAlu.getOperand(Mode0Idx).getImm());
+    RootCFAlu.getOperand(KBank0Idx).setImm(
+        LatrCFAlu.getOperand(KBank0Idx).getImm());
+    RootCFAlu.getOperand(KBank0LineIdx)
+        .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu.getOperand(Mode1Idx).getImm()) {
+    RootCFAlu.getOperand(Mode1Idx).setImm(
+        LatrCFAlu.getOperand(Mode1Idx).getImm());
+    RootCFAlu.getOperand(KBank1Idx).setImm(
+        LatrCFAlu.getOperand(KBank1Idx).getImm());
+    RootCFAlu.getOperand(KBank1LineIdx)
+        .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  TII = ST.getInstrInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(),  E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr &MI = *I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI.getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) {
+        MI.eraseFromParent();
+      } else {
+        assert(MI.getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+StringRef R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
new file mode 100644
index 000000000000..45b36d3d3ebb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,699 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass compute turns all control flow pseudo instructions into native one
+/// computing their address on the fly ; it also sets STACK_SIZE info.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600cf"
+
+namespace {
+
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const R600Subtarget *ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST->hasCFAluBug())
+    return false;
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST->getWavefrontSize() == 64) {
+      // We are being conservative here.  We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
+      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+      // resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST->getWavefrontSize() == 32);
+      // We are being conservative here.  We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch(Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+  assert(!ST->hasCaymanISA());
+  if (ST->getGeneration() <= R600Subtarget::R700) {
+    // +1 For the push operation.
+    // +2 Extra space required.
+    return 3;
+  } else {
+    // Some documentation says that this is not necessary on Evergreen,
+    // but experimentation has show that we need to allocate 1 extra
+    // sub-entry for the first non-WQM push.
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  }
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  }
+}
+
+void CFStack::updateMaxStackSize() {
+  unsigned CurrentStackSize =
+      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
+  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  CFStack::StackItem Item = CFStack::ENTRY;
+  switch(Opcode) {
+  case AMDGPU::CF_PUSH_EG:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    if (!isWQM) {
+      if (!ST->hasCaymanISA() &&
+          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
+                                             // See comment in
+                                             // CFStack::getSubEntrySize()
+      else if (CurrentEntries > 0 &&
+               ST->getGeneration() > R600Subtarget::EVERGREEN &&
+               !ST->hasCaymanISA() &&
+               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+      else
+        Item = CFStack::SUB_ENTRY;
+    } else
+      Item = CFStack::ENTRY;
+    break;
+  }
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    CurrentEntries++;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+  LoopStack.push_back(CFStack::ENTRY);
+  CurrentEntries++;
+  updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+  CFStack::StackItem Top = BranchStack.back();
+  if (Top == CFStack::ENTRY)
+    CurrentEntries--;
+  else
+    CurrentSubEntries-= getSubEntrySize(Top);
+  BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+  CurrentEntries--;
+  LoopStack.pop_back();
+}
+
+class R600ControlFlowFinalizer : public MachineFunctionPass {
+
+private:
+  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
+  enum ControlFlowInstruction {
+    CF_TC,
+    CF_VC,
+    CF_CALL_FS,
+    CF_WHILE_LOOP,
+    CF_END_LOOP,
+    CF_LOOP_BREAK,
+    CF_LOOP_CONTINUE,
+    CF_JUMP,
+    CF_ELSE,
+    CF_POP,
+    CF_END
+  };
+
+  static char ID;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  unsigned MaxFetchInst;
+  const R600Subtarget *ST;
+
+  bool IsTrivialInst(MachineInstr &MI) const {
+    switch (MI.getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
+    unsigned Opcode = 0;
+    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
+    switch (CFI) {
+    case CF_TC:
+      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+      break;
+    case CF_VC:
+      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+      break;
+    case CF_CALL_FS:
+      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+      break;
+    case CF_WHILE_LOOP:
+      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+      break;
+    case CF_END_LOOP:
+      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+      break;
+    case CF_LOOP_BREAK:
+      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+      break;
+    case CF_LOOP_CONTINUE:
+      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+      break;
+    case CF_JUMP:
+      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+      break;
+    case CF_ELSE:
+      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+      break;
+    case CF_POP:
+      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+      break;
+    case CF_END:
+      if (ST->hasCaymanISA()) {
+        Opcode = AMDGPU::CF_END_CM;
+        break;
+      }
+      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+      break;
+    }
+    assert (Opcode && "No opcode selected");
+    return TII->get(Opcode);
+  }
+
+  bool isCompatibleWithClause(const MachineInstr &MI,
+                              std::set<unsigned> &DstRegs) const {
+    unsigned DstMI, SrcMI;
+    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+                                          E = MI.operands_end();
+         I != E; ++I) {
+      const MachineOperand &MO = *I;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          DstMI = Reg;
+        else
+          DstMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+      if (MO.isUse()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          SrcMI = Reg;
+        else
+          SrcMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+    }
+    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
+      DstRegs.insert(DstMI);
+      return true;
+    } else
+      return false;
+  }
+
+  ClauseFile
+  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
+    unsigned AluInstCount = 0;
+    bool IsTex = TII->usesTextureCache(*ClauseHead);
+    std::set<unsigned> DstRegs;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(*I))
+        continue;
+      if (AluInstCount >= MaxFetchInst)
+        break;
+      if ((IsTex && !TII->usesTextureCache(*I)) ||
+          (!IsTex && !TII->usesVertexCache(*I)))
+        break;
+      if (!isCompatibleWithClause(*I, DstRegs))
+        break;
+      AluInstCount ++;
+      ClauseContent.push_back(&*I);
+    }
+    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+        getHWInstrDesc(IsTex?CF_TC:CF_VC))
+        .addImm(0) // ADDR
+        .addImm(AluInstCount - 1); // COUNT
+    return ClauseFile(MIb, std::move(ClauseContent));
+  }
+
+  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
+    static const unsigned LiteralRegs[] = {
+      AMDGPU::ALU_LITERAL_X,
+      AMDGPU::ALU_LITERAL_Y,
+      AMDGPU::ALU_LITERAL_Z,
+      AMDGPU::ALU_LITERAL_W
+    };
+    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
+        TII->getSrcs(MI);
+    for (const auto &Src:Srcs) {
+      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
+        continue;
+      int64_t Imm = Src.second;
+      std::vector<MachineOperand *>::iterator It =
+          find_if(Lits, [&](MachineOperand *val) {
+            return val->isImm() && (val->getImm() == Imm);
+          });
+
+      // Get corresponding Operand
+      MachineOperand &Operand = MI.getOperand(
+          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+
+      if (It != Lits.end()) {
+        // Reuse existing literal reg
+        unsigned Index = It - Lits.begin();
+        Src.first->setReg(LiteralRegs[Index]);
+      } else {
+        // Allocate new literal reg
+        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
+        Src.first->setReg(LiteralRegs[Lits.size()]);
+        Lits.push_back(&Operand);
+      }
+    }
+  }
+
+  MachineBasicBlock::iterator insertLiterals(
+      MachineBasicBlock::iterator InsertPos,
+      const std::vector<unsigned> &Literals) const {
+    MachineBasicBlock *MBB = InsertPos->getParent();
+    for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+      unsigned LiteralPair0 = Literals[i];
+      unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
+      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
+          TII->get(AMDGPU::LITERALS))
+          .addImm(LiteralPair0)
+          .addImm(LiteralPair1);
+    }
+    return InsertPos;
+  }
+
+  ClauseFile
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineInstr &ClauseHead = *I;
+    std::vector<MachineInstr *> ClauseContent;
+    I++;
+    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
+      if (IsTrivialInst(*I)) {
+        ++I;
+        continue;
+      }
+      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
+        break;
+      std::vector<MachineOperand *>Literals;
+      if (I->isBundle()) {
+        MachineInstr &DeleteMI = *I;
+        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+        while (++BI != E && BI->isBundledWithPred()) {
+          BI->unbundleFromPred();
+          for (MachineOperand &MO : BI->operands()) {
+            if (MO.isReg() && MO.isInternalRead())
+              MO.setIsInternalRead(false);
+          }
+          getLiteral(*BI, Literals);
+          ClauseContent.push_back(&*BI);
+        }
+        I = BI;
+        DeleteMI.eraseFromParent();
+      } else {
+        getLiteral(*I, Literals);
+        ClauseContent.push_back(&*I);
+        I++;
+      }
+      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
+        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
+            TII->get(AMDGPU::LITERALS));
+        if (Literals[i]->isImm()) {
+            MILit.addImm(Literals[i]->getImm());
+        } else {
+            MILit.addGlobalAddress(Literals[i]->getGlobal(),
+                                   Literals[i]->getOffset());
+        }
+        if (i + 1 < e) {
+          if (Literals[i + 1]->isImm()) {
+            MILit.addImm(Literals[i + 1]->getImm());
+          } else {
+            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
+                                   Literals[i + 1]->getOffset());
+          }
+        } else
+          MILit.addImm(0);
+        ClauseContent.push_back(MILit);
+      }
+    }
+    assert(ClauseContent.size() < 128 && "ALU clause is too big");
+    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
+    return ClauseFile(&ClauseHead, std::move(ClauseContent));
+  }
+
+  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
+                       const DebugLoc &DL, ClauseFile &Clause,
+                       unsigned &CfCount) {
+    CounterPropagateAddr(*Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += 2 * Clause.second.size();
+  }
+
+  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
+                     ClauseFile &Clause, unsigned &CfCount) {
+    Clause.first->getOperand(0).setImm(0);
+    CounterPropagateAddr(*Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += Clause.second.size();
+  }
+
+  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
+    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
+  }
+  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
+                            unsigned Addr) const {
+    for (MachineInstr *MI : MIs) {
+      CounterPropagateAddr(*MI, Addr);
+    }
+  }
+
+public:
+  R600ControlFlowFinalizer(TargetMachine &tm)
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    ST = &MF.getSubtarget<R600Subtarget>();
+    MaxFetchInst = ST->getTexVTXClauseSize();
+    TII = ST->getInstrInfo();
+    TRI = ST->getRegisterInfo();
+
+    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
+    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
+        ++MB) {
+      MachineBasicBlock &MBB = *MB;
+      unsigned CfCount = 0;
+      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+      std::vector<MachineInstr * > IfThenElseStack;
+      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
+        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+            getHWInstrDesc(CF_CALL_FS));
+        CfCount++;
+      }
+      std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+          I != E;) {
+        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
+          DEBUG(dbgs() << CfCount << ":"; I->dump(););
+          FetchClauses.push_back(MakeFetchClause(MBB, I));
+          CfCount++;
+          LastAlu.back() = nullptr;
+          continue;
+        }
+
+        MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = nullptr;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = &*MI;
+        I++;
+        bool RequiresWorkAround =
+            CFStack.requiresWorkAroundForInst(MI->getOpcode());
+        switch (MI->getOpcode()) {
+        case AMDGPU::CF_ALU_PUSH_BEFORE:
+          if (RequiresWorkAround) {
+            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            MI->setDesc(TII->get(AMDGPU::CF_ALU));
+            CfCount++;
+            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+          } else
+            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
+        case AMDGPU::CF_ALU:
+          I = MI;
+          AluClauses.push_back(MakeALUClause(MBB, I));
+          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+          CfCount++;
+          break;
+        case AMDGPU::WHILELOOP: {
+          CFStack.pushLoop();
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_WHILE_LOOP))
+              .addImm(1);
+          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+              std::set<MachineInstr *>());
+          Pair.second.insert(MIb);
+          LoopStack.push_back(std::move(Pair));
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDLOOP: {
+          CFStack.popLoop();
+          std::pair<unsigned, std::set<MachineInstr *> > Pair =
+              std::move(LoopStack.back());
+          LoopStack.pop_back();
+          CounterPropagateAddr(Pair.second, CfCount);
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
+              .addImm(Pair.first + 1);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(nullptr);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_JUMP))
+              .addImm(0)
+              .addImm(0);
+          IfThenElseStack.push_back(MIb);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ELSE: {
+          MachineInstr * JumpInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(*JumpInst, CfCount);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_ELSE))
+              .addImm(0)
+              .addImm(0);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          IfThenElseStack.push_back(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDIF: {
+          CFStack.popBranch();
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
+            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+                getHWInstrDesc(CF_POP))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            (void)MIb;
+            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+            CfCount++;
+          }
+
+          MachineInstr *IfOrElseInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(*IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::BREAK: {
+          CfCount ++;
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_BREAK))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::CONTINUE: {
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_CONTINUE))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::RETURN: {
+          DebugLoc DL = MBB.findDebugLoc(MI);
+          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
+          CfCount++;
+          if (CfCount % 2) {
+            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
+            CfCount++;
+          }
+          MI->eraseFromParent();
+          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
+          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
+            EmitALUClause(I, DL, AluClauses[i], CfCount);
+          break;
+        }
+        default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
+          break;
+        }
+      }
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
+      MFI->CFStackSize = CFStack.MaxStackSize;
+    }
+
+    return false;
+  }
+
+  StringRef getPassName() const override {
+    return "R600 Control Flow Finalizer Pass";
+  }
+};
+
+char R600ControlFlowFinalizer::ID = 0;
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+  return new R600ControlFlowFinalizer(TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Defines.h b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
new file mode 100644
index 000000000000..534461adc59f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Defines.h
@@ -0,0 +1,171 @@
+//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+// Operand Flags
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG   (1 << 1)
+#define MO_FLAG_ABS   (1 << 2)
+#define MO_FLAG_MASK  (1 << 3)
+#define MO_FLAG_PUSH  (1 << 4)
+#define MO_FLAG_NOT_LAST  (1 << 5)
+#define MO_FLAG_LAST  (1 << 6)
+#define NUM_MO_FLAGS 7
+
+/// \brief Helper for getting the operand index for the instruction flags
+/// operand.
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
+
+namespace R600_InstFlag {
+  enum TIF {
+    TRANS_ONLY = (1 << 0),
+    TEX = (1 << 1),
+    REDUCTION = (1 << 2),
+    FC = (1 << 3),
+    TRIG = (1 << 4),
+    OP3 = (1 << 5),
+    VECTOR = (1 << 6),
+    //FlagOperand bits 7, 8
+    NATIVE_OPERANDS = (1 << 9),
+    OP1 = (1 << 10),
+    OP2 = (1 << 11),
+    VTX_INST  = (1 << 12),
+    TEX_INST = (1 << 13),
+    ALU_INST = (1 << 14),
+    LDS_1A = (1 << 15),
+    LDS_1A1D = (1 << 16),
+    IS_EXPORT = (1 << 17),
+    LDS_1A2D = (1 << 18)
+  };
+}
+
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
+
+/// \brief Defines for extracting register information from register encoding
+#define HW_REG_MASK 0x1ff
+#define HW_CHAN_SHIFT 9
+
+#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
+#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+
+#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
+#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
+
+namespace OpName {
+
+  enum VecOps {
+    UPDATE_EXEC_MASK_X,
+    UPDATE_PREDICATE_X,
+    WRITE_X,
+    OMOD_X,
+    DST_REL_X,
+    CLAMP_X,
+    SRC0_X,
+    SRC0_NEG_X,
+    SRC0_REL_X,
+    SRC0_ABS_X,
+    SRC0_SEL_X,
+    SRC1_X,
+    SRC1_NEG_X,
+    SRC1_REL_X,
+    SRC1_ABS_X,
+    SRC1_SEL_X,
+    PRED_SEL_X,
+    UPDATE_EXEC_MASK_Y,
+    UPDATE_PREDICATE_Y,
+    WRITE_Y,
+    OMOD_Y,
+    DST_REL_Y,
+    CLAMP_Y,
+    SRC0_Y,
+    SRC0_NEG_Y,
+    SRC0_REL_Y,
+    SRC0_ABS_Y,
+    SRC0_SEL_Y,
+    SRC1_Y,
+    SRC1_NEG_Y,
+    SRC1_REL_Y,
+    SRC1_ABS_Y,
+    SRC1_SEL_Y,
+    PRED_SEL_Y,
+    UPDATE_EXEC_MASK_Z,
+    UPDATE_PREDICATE_Z,
+    WRITE_Z,
+    OMOD_Z,
+    DST_REL_Z,
+    CLAMP_Z,
+    SRC0_Z,
+    SRC0_NEG_Z,
+    SRC0_REL_Z,
+    SRC0_ABS_Z,
+    SRC0_SEL_Z,
+    SRC1_Z,
+    SRC1_NEG_Z,
+    SRC1_REL_Z,
+    SRC1_ABS_Z,
+    SRC1_SEL_Z,
+    PRED_SEL_Z,
+    UPDATE_EXEC_MASK_W,
+    UPDATE_PREDICATE_W,
+    WRITE_W,
+    OMOD_W,
+    DST_REL_W,
+    CLAMP_W,
+    SRC0_W,
+    SRC0_NEG_W,
+    SRC0_REL_W,
+    SRC0_ABS_W,
+    SRC0_SEL_W,
+    SRC1_W,
+    SRC1_NEG_W,
+    SRC1_REL_W,
+    SRC1_ABS_W,
+    SRC1_SEL_W,
+    PRED_SEL_W,
+    IMM_0,
+    IMM_1,
+    VEC_COUNT
+ };
+
+}
+
+//===----------------------------------------------------------------------===//
+// Config register definitions
+//===----------------------------------------------------------------------===//
+
+#define R_02880C_DB_SHADER_CONTROL                    0x02880C
+#define   S_02880C_KILL_ENABLE(x)                      (((x) & 0x1) << 6)
+
+// These fields are the same for all shader types and families.
+#define   S_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
+#define   S_STACK_SIZE(x)                       (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS                 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS                 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS                 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS                 0x0288d4
+
+#define R_0288E8_SQ_LDS_ALLOC                        0x0288E8
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
new file mode 100644
index 000000000000..9a5db6ccc672
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -0,0 +1,339 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold
+/// 128 Alu instructions ; these instructions can access up to 4 prefetched
+/// 4 lines of 16 registers from constant buffers. Such ALU clauses are
+/// initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeR600EmitClauseMarkersPass(PassRegistry&);
+}
+
+namespace {
+
+class R600EmitClauseMarkers : public MachineFunctionPass {
+
+private:
+  const R600InstrInfo *TII;
+  int Address;
+
+  unsigned OccupiedDwords(MachineInstr &MI) const {
+    switch (MI.getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
+      return 4;
+    case AMDGPU::KILL:
+      return 0;
+    default:
+      break;
+    }
+
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+    if (TII->isLDSRetInstr(MI.getOpcode()))
+      return 2;
+
+    if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) ||
+        TII->isReductionOp(MI.getOpcode()))
+      return 4;
+
+    unsigned NumLiteral = 0;
+    for (MachineInstr::mop_iterator It = MI.operands_begin(),
+                                    E = MI.operands_end();
+         It != E; ++It) {
+      MachineOperand &MO = *It;
+      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+        ++NumLiteral;
+    }
+    return 1 + NumLiteral;
+  }
+
+  bool isALU(const MachineInstr &MI) const {
+    if (TII->isALUInstr(MI.getOpcode()))
+      return true;
+    if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()))
+      return true;
+    switch (MI.getOpcode()) {
+    case AMDGPU::PRED_X:
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::COPY:
+    case AMDGPU::DOT_4:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool IsTrivialInst(MachineInstr &MI) const {
+    switch (MI.getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+    case AMDGPU::IMPLICIT_DEF:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
+    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
+    // (See also R600ISelLowering.cpp)
+    // ConstIndex value is in [0, 4095];
+    return std::pair<unsigned, unsigned>(
+        ((Sel >> 2) - 512) >> 12, // KC_BANK
+        // Line Number of ConstIndex
+        // A line contains 16 constant registers however KCX bank can lock
+        // two line at the same time ; thus we want to get an even line number.
+        // Line number can be retrieved with (>>4), using (>>5) <<1 generates
+        // an even number.
+        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
+  }
+
+  bool
+  SubstituteKCacheBank(MachineInstr &MI,
+                       std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
+                       bool UpdateInstr = true) const {
+    std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+
+    if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
+      return true;
+
+    const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts =
+        TII->getSrcs(MI);
+    assert(
+        (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) &&
+        "Can't assign Const");
+    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
+      unsigned Sel = Consts[i].second;
+      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
+      unsigned KCacheIndex = Index * 4 + Chan;
+      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
+      if (CachedConsts.empty()) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[0] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts.size() == 1) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[1] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      return false;
+    }
+
+    if (!UpdateInstr)
+      return true;
+
+    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
+      switch(UsedKCache[j].first) {
+      case 0:
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
+        break;
+      case 1:
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
+        break;
+      default:
+        llvm_unreachable("Wrong Cache Line");
+      }
+      j++;
+    }
+    return true;
+  }
+
+  bool canClauseLocalKillFitInClause(
+                        unsigned AluInstCount,
+                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+                        MachineBasicBlock::iterator Def,
+                        MachineBasicBlock::iterator BBEnd) {
+    const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    for (MachineInstr::const_mop_iterator
+           MOI = Def->operands_begin(),
+           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef() ||
+          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+        continue;
+
+      // Def defines a clause local register, so check that its use will fit
+      // in the clause.
+      unsigned LastUseCount = 0;
+      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+        AluInstCount += OccupiedDwords(*UseI);
+        // Make sure we won't need to end the clause due to KCache limitations.
+        if (!SubstituteKCacheBank(*UseI, KCacheBanks, false))
+          return false;
+
+        // We have reached the maximum instruction limit before finding the
+        // use that kills this register, so we cannot use this def in the
+        // current clause.
+        if (AluInstCount >= TII->getMaxAlusPerClause())
+          return false;
+
+        // Register kill flags have been cleared by the time we get to this
+        // pass, but it is safe to assume that all uses of this register
+        // occur in the same basic block as its definition, because
+        // it is illegal for the scheduler to schedule them in
+        // different blocks.
+        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+          LastUseCount = AluInstCount;
+
+        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+          break;
+      }
+      if (LastUseCount)
+        return LastUseCount <= TII->getMaxAlusPerClause();
+      llvm_unreachable("Clause local register live at end of clause.");
+    }
+    return true;
+  }
+
+  MachineBasicBlock::iterator
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+    bool PushBeforeModifier = false;
+    unsigned AluInstCount = 0;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(*I))
+        continue;
+      if (!isALU(*I))
+        break;
+      if (AluInstCount > TII->getMaxAlusPerClause())
+        break;
+      if (I->getOpcode() == AMDGPU::PRED_X) {
+        // We put PRED_X in its own clause to ensure that ifcvt won't create
+        // clauses with more than 128 insts.
+        // IfCvt is indeed checking that "then" and "else" branches of an if
+        // statement have less than ~60 insts thus converted clauses can't be
+        // bigger than ~121 insts (predicate setter needs to be in the same
+        // clause as predicated alus).
+        if (AluInstCount > 0)
+          break;
+        if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH)
+          PushBeforeModifier = true;
+        AluInstCount ++;
+        continue;
+      }
+      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
+      //
+      // * KILL or INTERP instructions
+      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
+      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
+      //
+      // XXX: These checks have not been implemented yet.
+      if (TII->mustBeLastInClause(I->getOpcode())) {
+        I++;
+        break;
+      }
+
+      // If this instruction defines a clause local register, make sure
+      // its use can fit in this clause.
+      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
+        break;
+
+      if (!SubstituteKCacheBank(*I, KCacheBanks))
+        break;
+      AluInstCount += OccupiedDwords(*I);
+    }
+    unsigned Opcode = PushBeforeModifier ?
+        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
+    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
+    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
+    // pass may assume that identical ALU clause starter at the beginning of a
+    // true and false branch can be factorized which is not the case.
+        .addImm(Address++) // ADDR
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
+        .addImm(KCacheBanks.empty()?0:2) // KM0
+        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
+        .addImm(AluInstCount) // COUNT
+        .addImm(1); // Enabled
+    return I;
+  }
+
+public:
+  static char ID;
+  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+
+    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+    TII = ST.getInstrInfo();
+
+    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                    BB != BB_E; ++BB) {
+      MachineBasicBlock &MBB = *BB;
+      MachineBasicBlock::iterator I = MBB.begin();
+      if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
+        continue; // BB was already parsed
+      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
+        if (isALU(*I))
+          I = MakeALUClause(MBB, I);
+        else
+          ++I;
+      }
+    }
+    return false;
+  }
+
+  StringRef getPassName() const override {
+    return "R600 Emit Clause Markers Pass";
+  }
+};
+
+char R600EmitClauseMarkers::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markters", false, false)
+INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
+                      "R600 Emit Clause Markters", false, false)
+
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+  return new R600EmitClauseMarkers();
+}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
new file mode 100644
index 000000000000..3e46e6387614
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -0,0 +1,270 @@
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
+/// group to work correctly.  This pass expands these individual instructions
+/// into several instructions that will completely fill the instruction group.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI,
+      unsigned Op);
+
+public:
+  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII(nullptr) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "R600 Expand special instructions pass";
+  }
+};
+
+} // End anonymous namespace
+
+char R600ExpandSpecialInstrsPass::ID = 0;
+
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
+  return new R600ExpandSpecialInstrsPass(TM);
+}
+
+void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
+    const MachineInstr *OldMI, unsigned Op) {
+  int OpIdx = TII->getOperandIdx(*OldMI, Op);
+  if (OpIdx > -1) {
+    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
+    TII->setImmOperand(*NewMI, Op, Val);
+  }
+}
+
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  TII = ST.getInstrInfo();
+
+  const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin();
+    while (I != MBB.end()) {
+      MachineInstr &MI = *I;
+      I = std::next(I);
+
+      // Expand LDS_*_RET instructions
+      if (TII->isLDSRetInstr(MI.getOpcode())) {
+        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+        assert(DstIdx != -1);
+        MachineOperand &DstOp = MI.getOperand(DstIdx);
+        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+                                               DstOp.getReg(), AMDGPU::OQAP);
+        DstOp.setReg(AMDGPU::OQAP);
+        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        // Copy the pred_sel bit
+        Mov->getOperand(MovPredSelIdx).setReg(
+            MI.getOperand(LDSPredSelIdx).getReg());
+      }
+
+      switch (MI.getOpcode()) {
+      default: break;
+      // Expand PRED_X to one of the PRED_SET instructions.
+      case AMDGPU::PRED_X: {
+        uint64_t Flags = MI.getOperand(3).getImm();
+        // The native opcode used by PRED_X is stored as an immediate in the
+        // third operand.
+        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
+                                            MI.getOperand(2).getImm(), // opcode
+                                            MI.getOperand(0).getReg(), // dst
+                                            MI.getOperand(1).getReg(), // src0
+                                            AMDGPU::ZERO);             // src1
+        TII->addFlag(*PredSet, 0, MO_FLAG_MASK);
+        if (Flags & MO_FLAG_PUSH) {
+          TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1);
+        } else {
+          TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1);
+        }
+        MI.eraseFromParent();
+        continue;
+        }
+      case AMDGPU::DOT_4: {
+
+        const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned SubDstReg =
+              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+          MachineInstr *BMI =
+              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Mask) {
+            TII->addFlag(*BMI, 0, MO_FLAG_MASK);
+          }
+          if (Chan != 3)
+            TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST);
+          unsigned Opcode = BMI->getOpcode();
+          // While not strictly necessary from hw point of view, we force
+          // all src operands of a dot4 inst to belong to the same slot.
+          unsigned Src0 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+              .getReg();
+          unsigned Src1 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+              .getReg();
+          (void) Src0;
+          (void) Src1;
+          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+              (TRI.getEncodingValue(Src1) & 0xff) < 127)
+            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+        }
+        MI.eraseFromParent();
+        continue;
+      }
+      }
+
+      bool IsReduction = TII->isReductionOp(MI.getOpcode());
+      bool IsVector = TII->isVector(MI);
+      bool IsCube = TII->isCubeOp(MI.getOpcode());
+      if (!IsReduction && !IsVector && !IsCube) {
+        continue;
+      }
+
+      // Expand the instruction
+      //
+      // Reduction instructions:
+      // T0_X = DP4 T1_XYZW, T2_XYZW
+      // becomes:
+      // TO_X = DP4 T1_X, T2_X
+      // TO_Y (write masked) = DP4 T1_Y, T2_Y
+      // TO_Z (write masked) = DP4 T1_Z, T2_Z
+      // TO_W (write masked) = DP4 T1_W, T2_W
+      //
+      // Vector instructions:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // becomes:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
+      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
+      // T0_W (write masked) = MULLO_INT T1_X, T2_X
+      //
+      // Cube instructions:
+      // T0_XYZW = CUBE T1_XYZW
+      // becomes:
+      // TO_X = CUBE T1_Z, T1_Y
+      // T0_Y = CUBE T1_Z, T1_X
+      // T0_Z = CUBE T1_X, T1_Z
+      // T0_W = CUBE T1_Y, T1_Z
+      for (unsigned Chan = 0; Chan < 4; Chan++) {
+        unsigned DstReg = MI.getOperand(
+                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
+        unsigned Src0 = MI.getOperand(
+                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
+        unsigned Src1 = 0;
+
+        // Determine the correct source registers
+        if (!IsCube) {
+          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
+          if (Src1Idx != -1) {
+            Src1 = MI.getOperand(Src1Idx).getReg();
+          }
+        }
+        if (IsReduction) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex);
+          Src1 = TRI.getSubReg(Src1, SubRegIndex);
+        } else if (IsCube) {
+          static const int CubeSrcSwz[] = {2, 2, 0, 1};
+          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
+          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
+        }
+
+        // Determine the correct destination registers;
+        bool Mask = false;
+        bool NotLast = true;
+        if (IsCube) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
+        } else {
+          // Mask the write if the original instruction does not write to
+          // the current Channel.
+          Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+        }
+
+        // Set the IsLast bit
+        NotLast = (Chan != 3 );
+
+        // Add the new instruction
+        unsigned Opcode = MI.getOpcode();
+        switch (Opcode) {
+        case AMDGPU::CUBE_r600_pseudo:
+          Opcode = AMDGPU::CUBE_r600_real;
+          break;
+        case AMDGPU::CUBE_eg_pseudo:
+          Opcode = AMDGPU::CUBE_eg_real;
+          break;
+        default:
+          break;
+        }
+
+        MachineInstr *NewMI =
+          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
+
+        if (Chan != 0)
+          NewMI->bundleWithPred();
+        if (Mask) {
+          TII->addFlag(*NewMI, 0, MO_FLAG_MASK);
+        }
+        if (NotLast) {
+          TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST);
+        }
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
+        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
new file mode 100644
index 000000000000..5813786abe01
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -0,0 +1,14 @@
+//===----------------------- R600FrameLowering.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "R600FrameLowering.h"
+
+using namespace llvm;
+
+R600FrameLowering::~R600FrameLowering() = default;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
new file mode 100644
index 000000000000..874435f35ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -0,0 +1,32 @@
+//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class R600FrameLowering : public AMDGPUFrameLowering {
+public:
+  R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
+                    unsigned TransAl = 1) :
+    AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+  ~R600FrameLowering() override;
+
+  void emitPrologue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override {}
+  void emitEpilogue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
new file mode 100644
index 000000000000..89c9266746ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -0,0 +1,2202 @@
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for R600
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600FrameLowering.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
+                                       const R600Subtarget &STI)
+    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
+  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  // Legalize loads and stores to the private address space.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+  // spaces, so it is custom lowered to handle those where it isn't.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
+
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
+  }
+
+  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
+
+  setOperationAction(ISD::STORE, MVT::i8, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+
+  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
+  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
+
+  // Set condition code actions
+  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
+
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+  setOperationAction(ISD::FSUB, MVT::f32, Expand);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+
+  // ADD, SUB overflow.
+  // TODO: turn these into Legal?
+  if (Subtarget->hasCARRY())
+    setOperationAction(ISD::UADDO, MVT::i32, Custom);
+
+  if (Subtarget->hasBORROW())
+    setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
+  // Expand sign extension of vectors
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
+  //  to be Legal/Custom in order to avoid library calls.
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
+  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
+  for (MVT VT : ScalarIntVTs) {
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::SUBE, VT, Expand);
+  }
+
+  setSchedulingPreference(Sched::Source);
+
+  setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::LOAD);
+}
+
+const R600Subtarget *R600TargetLowering::getSubtarget() const {
+  return static_cast<const R600Subtarget *>(Subtarget);
+}
+
+static inline bool isEOP(MachineBasicBlock::iterator I) {
+  if (std::next(I) == I->getParent()->end())
+    return false;
+  return std::next(I)->getOpcode() == AMDGPU::RETURN;
+}
+
+MachineBasicBlock *
+R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock::iterator I = MI;
+  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  switch (MI.getOpcode()) {
+  default:
+    // Replace LDS_*_RET instruction that don't have any uses with the
+    // equivalent LDS_*_NORET instruction.
+    if (TII->isLDSRetInstr(MI.getOpcode())) {
+      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+      assert(DstIdx != -1);
+      MachineInstrBuilder NewMI;
+      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
+      //        LDS_1A2D support and remove this special case.
+      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
+          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
+        return BB;
+
+      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
+      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
+        NewMI.addOperand(MI.getOperand(i));
+      }
+    } else {
+      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+    }
+    break;
+  case AMDGPU::CLAMP_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(
+        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+        MI.getOperand(1).getReg());
+    TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
+    break;
+  }
+
+  case AMDGPU::FABS_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(
+        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+        MI.getOperand(1).getReg());
+    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
+    break;
+  }
+
+  case AMDGPU::FNEG_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(
+        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
+        MI.getOperand(1).getReg());
+    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
+    break;
+  }
+
+  case AMDGPU::MASK_WRITE: {
+    unsigned maskedRegister = MI.getOperand(0).getReg();
+    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
+    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
+    break;
+  }
+
+  case AMDGPU::MOV_IMM_F32:
+    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
+                                                            .getFPImm()
+                                                            ->getValueAPF()
+                                                            .bitcastToAPInt()
+                                                            .getZExtValue());
+    break;
+
+  case AMDGPU::MOV_IMM_I32:
+    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
+                     MI.getOperand(1).getImm());
+    break;
+
+  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
+    //TODO: Perhaps combine this instruction with the next if possible
+    auto MIB = TII->buildDefaultInstruction(
+        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
+    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
+    //TODO: Ugh this is rather ugly
+    MIB->getOperand(Idx) = MI.getOperand(1);
+    break;
+  }
+
+  case AMDGPU::CONST_COPY: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(
+        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
+    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
+                       MI.getOperand(1).getImm());
+    break;
+  }
+
+  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+        .addOperand(MI.getOperand(0))
+        .addOperand(MI.getOperand(1))
+        .addImm(isEOP(I)); // Set End of program bit
+    break;
+
+  case AMDGPU::RAT_STORE_TYPED_eg:
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+        .addOperand(MI.getOperand(0))
+        .addOperand(MI.getOperand(1))
+        .addOperand(MI.getOperand(2))
+        .addImm(isEOP(I)); // Set End of program bit
+    break;
+
+  case AMDGPU::BRANCH:
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+        .addOperand(MI.getOperand(0));
+    break;
+
+  case AMDGPU::BRANCH_COND_f32: {
+    MachineInstr *NewMI =
+        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+                AMDGPU::PREDICATE_BIT)
+            .addOperand(MI.getOperand(1))
+            .addImm(AMDGPU::PRED_SETNE)
+            .addImm(0); // Flags
+    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+        .addOperand(MI.getOperand(0))
+        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::BRANCH_COND_i32: {
+    MachineInstr *NewMI =
+        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+                AMDGPU::PREDICATE_BIT)
+            .addOperand(MI.getOperand(1))
+            .addImm(AMDGPU::PRED_SETNE_INT)
+            .addImm(0); // Flags
+    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+        .addOperand(MI.getOperand(0))
+        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::EG_ExportSwz:
+  case AMDGPU::R600_ExportSwz: {
+    // Instruction is left unmodified if its not the last one of its type
+    bool isLastInstructionOfItsType = true;
+    unsigned InstExportType = MI.getOperand(1).getImm();
+    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
+         EndBlock = BB->end(); NextExportInst != EndBlock;
+         NextExportInst = std::next(NextExportInst)) {
+      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
+          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
+            .getImm();
+        if (CurrentInstExportType == InstExportType) {
+          isLastInstructionOfItsType = false;
+          break;
+        }
+      }
+    }
+    bool EOP = isEOP(I);
+    if (!EOP && !isLastInstructionOfItsType)
+      return BB;
+    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
+        .addOperand(MI.getOperand(0))
+        .addOperand(MI.getOperand(1))
+        .addOperand(MI.getOperand(2))
+        .addOperand(MI.getOperand(3))
+        .addOperand(MI.getOperand(4))
+        .addOperand(MI.getOperand(5))
+        .addOperand(MI.getOperand(6))
+        .addImm(CfInst)
+        .addImm(EOP);
+    break;
+  }
+  case AMDGPU::RETURN: {
+    return BB;
+  }
+  }
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
+  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
+  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
+  case ISD::FCOS:
+  case ISD::FSIN: return LowerTrig(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
+  }
+
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
+  case ISD::INTRINSIC_VOID: {
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    switch (IntrinsicID) {
+    case AMDGPUIntrinsic::r600_store_swizzle: {
+      SDLoc DL(Op);
+      const SDValue Args[8] = {
+        Chain,
+        Op.getOperand(2), // Export Value
+        Op.getOperand(3), // ArrayBase
+        Op.getOperand(4), // Type
+        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
+        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
+        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
+        DAG.getConstant(3, DL, MVT::i32) // SWZ_W
+      };
+      return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
+    }
+
+    // default for switch(IntrinsicID)
+    default: break;
+    }
+    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    SDLoc DL(Op);
+    switch(IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case AMDGPUIntrinsic::r600_tex:
+    case AMDGPUIntrinsic::r600_texc: {
+      unsigned TextureOp;
+      switch (IntrinsicID) {
+      case AMDGPUIntrinsic::r600_tex:
+        TextureOp = 0;
+        break;
+      case AMDGPUIntrinsic::r600_texc:
+        TextureOp = 1;
+        break;
+      default:
+        llvm_unreachable("unhandled texture operation");
+      }
+
+      SDValue TexArgs[19] = {
+        DAG.getConstant(TextureOp, DL, MVT::i32),
+        Op.getOperand(1),
+        DAG.getConstant(0, DL, MVT::i32),
+        DAG.getConstant(1, DL, MVT::i32),
+        DAG.getConstant(2, DL, MVT::i32),
+        DAG.getConstant(3, DL, MVT::i32),
+        Op.getOperand(2),
+        Op.getOperand(3),
+        Op.getOperand(4),
+        DAG.getConstant(0, DL, MVT::i32),
+        DAG.getConstant(1, DL, MVT::i32),
+        DAG.getConstant(2, DL, MVT::i32),
+        DAG.getConstant(3, DL, MVT::i32),
+        Op.getOperand(5),
+        Op.getOperand(6),
+        Op.getOperand(7),
+        Op.getOperand(8),
+        Op.getOperand(9),
+        Op.getOperand(10)
+      };
+      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
+    }
+    case AMDGPUIntrinsic::r600_dot4: {
+      SDValue Args[8] = {
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(0, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(0, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(1, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(1, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(2, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(2, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(3, DL, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(3, DL, MVT::i32))
+      };
+      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
+    }
+
+    case Intrinsic::r600_implicitarg_ptr: {
+      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+      return DAG.getConstant(ByteOffset, DL, PtrVT);
+    }
+    case Intrinsic::r600_read_ngroups_x:
+      return LowerImplicitParameter(DAG, VT, DL, 0);
+    case Intrinsic::r600_read_ngroups_y:
+      return LowerImplicitParameter(DAG, VT, DL, 1);
+    case Intrinsic::r600_read_ngroups_z:
+      return LowerImplicitParameter(DAG, VT, DL, 2);
+    case Intrinsic::r600_read_global_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 3);
+    case Intrinsic::r600_read_global_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 4);
+    case Intrinsic::r600_read_global_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 5);
+    case Intrinsic::r600_read_local_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 6);
+    case Intrinsic::r600_read_local_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 7);
+    case Intrinsic::r600_read_local_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 8);
+
+    case Intrinsic::r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_X, VT);
+    case Intrinsic::r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Y, VT);
+    case Intrinsic::r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Z, VT);
+    case Intrinsic::r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_X, VT);
+    case Intrinsic::r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Y, VT);
+    case Intrinsic::r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Z, VT);
+
+    case Intrinsic::r600_recipsqrt_ieee:
+      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+
+    case Intrinsic::r600_recipsqrt_clamped:
+      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+    }
+
+    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
+    break;
+  }
+  } // end switch(Op.getOpcode())
+  return SDValue();
+}
+
+void R600TargetLowering::ReplaceNodeResults(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
+    return;
+  case ISD::FP_TO_UINT:
+    if (N->getValueType(0) == MVT::i1) {
+      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
+      return;
+    }
+    // Since we don't care about out of bounds values we can use FP_TO_SINT for
+    // uints too. The DAGLegalizer code for uint considers some extra cases
+    // which are not necessary here.
+    LLVM_FALLTHROUGH;
+  case ISD::FP_TO_SINT: {
+    if (N->getValueType(0) == MVT::i1) {
+      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
+      return;
+    }
+
+    SDValue Result;
+    if (expandFP_TO_SINT(N, Result, DAG))
+      Results.push_back(Result);
+    return;
+  }
+  case ISD::SDIVREM: {
+    SDValue Op = SDValue(N, 1);
+    SDValue RES = LowerSDIVREM(Op, DAG);
+    Results.push_back(RES);
+    Results.push_back(RES.getValue(1));
+    break;
+  }
+  case ISD::UDIVREM: {
+    SDValue Op = SDValue(N, 0);
+    LowerUDIVREM64(Op, DAG, Results);
+    break;
+  }
+  }
+}
+
+SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
+                                                   SDValue Vector) const {
+  SDLoc DL(Vector);
+  EVT VecVT = Vector.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  SmallVector<SDValue, 8> Args;
+
+  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
+    Args.push_back(DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
+  }
+
+  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
+}
+
+SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vector = Op.getOperand(0);
+  SDValue Index = Op.getOperand(1);
+
+  if (isa<ConstantSDNode>(Index) ||
+      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+    return Op;
+
+  Vector = vectorToVerticalVector(DAG, Vector);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+                     Vector, Index);
+}
+
+SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vector = Op.getOperand(0);
+  SDValue Value = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+
+  if (isa<ConstantSDNode>(Index) ||
+      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
+    return Op;
+
+  Vector = vectorToVerticalVector(DAG, Vector);
+  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
+                               Vector, Value, Index);
+  return vectorToVerticalVector(DAG, Insert);
+}
+
+SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                               SDValue Op,
+                                               SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  const DataLayout &DL = DAG.getDataLayout();
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+
+  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
+  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
+}
+
+SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  // On hw >= R700, COS/SIN input must be between -1. and 1.
+  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // TODO: Should this propagate fast-math-flags?
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
+      DAG.getNode(ISD::FADD, DL, VT,
+        DAG.getNode(ISD::FMUL, DL, VT, Arg,
+          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
+        DAG.getConstantFP(0.5, DL, MVT::f32)));
+  unsigned TrigNode;
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    TrigNode = AMDGPUISD::COS_HW;
+    break;
+  case ISD::FSIN:
+    TrigNode = AMDGPUISD::SIN_HW;
+    break;
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
+      DAG.getNode(ISD::FADD, DL, VT, FractPart,
+        DAG.getConstantFP(-0.5, DL, MVT::f32)));
+  if (Gen >= R600Subtarget::R700)
+    return TrigVal;
+  // On R600 hw, COS/SIN input must be between -Pi and Pi.
+  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
+      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
+}
+
+SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Shift = Op.getOperand(2);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue One  = DAG.getConstant(1, DL, VT);
+
+  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
+  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
+  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for 0 special case.
+  // Without it the CompShift might be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps, the alternative is to
+  // add a conditional to filter the special case.
+
+  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
+  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
+
+  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
+  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
+  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
+
+  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
+  SDValue LoBig = Zero;
+
+  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Shift = Op.getOperand(2);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue One  = DAG.getConstant(1, DL, VT);
+
+  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
+
+  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
+  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
+  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
+  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
+
+  // The dance around Width1 is necessary for 0 special case.
+  // Without it the CompShift might be 32, producing incorrect results in
+  // Overflow. So we do the shift in two steps, the alternative is to
+  // add a conditional to filter the special case.
+
+  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
+  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
+
+  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
+  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
+  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
+
+  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
+  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
+
+  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
+  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
+}
+
+SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
+                                          unsigned mainop, unsigned ovf) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+
+  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
+  // Extend sign.
+  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
+                    DAG.getValueType(MVT::i1));
+
+  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
+}
+
+SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(
+      ISD::SETCC,
+      DL,
+      MVT::i1,
+      Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
+      DAG.getCondCode(ISD::SETEQ));
+}
+
+SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(
+      ISD::SETCC,
+      DL,
+      MVT::i1,
+      Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
+      DAG.getCondCode(ISD::SETEQ));
+}
+
+SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+                                                   const SDLoc &DL,
+                                                   unsigned DwordOffset) const {
+  unsigned ByteOffset = DwordOffset * 4;
+  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                      AMDGPUAS::CONSTANT_BUFFER_0);
+
+  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
+  assert(isInt<16>(ByteOffset));
+
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
+                     MachinePointerInfo(ConstantPointerNull::get(PtrType)));
+}
+
+bool R600TargetLowering::isZero(SDValue Op) const {
+  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+    return Cst->isNullValue();
+  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
+    return CstFP->isZero();
+  } else {
+    return false;
+  }
+}
+
+bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->isExactlyValue(1.0);
+  }
+  return isAllOnesConstant(Op);
+}
+
+bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->getValueAPF().isZero();
+  }
+  return isNullConstant(Op);
+}
+
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  SDValue Temp;
+
+  if (VT == MVT::f32) {
+    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+    if (MinMax)
+      return MinMax;
+  }
+
+  // LHS and RHS are guaranteed to be the same value type
+  EVT CompareVT = LHS.getValueType();
+
+  // Check if we can lower this to a native operation.
+
+  // Try to lower to a SET* instruction:
+  //
+  // SET* can match the following patterns:
+  //
+  // select_cc f32, f32, -1,  0, cc_supported
+  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+  // select_cc i32, i32, -1,  0, cc_supported
+  //
+
+  // Move hardware True/False values to the correct operand.
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  ISD::CondCode InverseCC =
+     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+      std::swap(False, True);
+      CC = DAG.getCondCode(InverseCC);
+    } else {
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+        std::swap(False, True);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(SwapInvCC);
+      }
+    }
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
+  // Try to lower to a CND* instruction:
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_supported
+  // select_cc f32, 0.0, i32, i32, cc_supported
+  // select_cc i32, 0,   f32, f32, cc_supported
+  // select_cc i32, 0,   i32, i32, cc_supported
+  //
+
+  // Try to move the zero value to the RHS
+  if (isZero(LHS)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    // Try swapping the operands
+    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(CCSwapped);
+    } else {
+      // Try inverting the conditon and then swapping the operands
+      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+        std::swap(True, False);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(CCSwapped);
+      }
+    }
+  }
+  if (isZero(RHS)) {
+    SDValue Cond = LHS;
+    SDValue Zero = RHS;
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    if (CompareVT != VT) {
+      // Bitcast True / False to the correct types.  This will end up being
+      // a nop, but it allows us to define only a single pattern in the
+      // .TD files for each CND* instruction rather than having to have
+      // one pattern for integer True/False and one for fp True/False
+      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+    }
+
+    switch (CCOpcode) {
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+      Temp = True;
+      True = False;
+      False = Temp;
+      break;
+    default:
+      break;
+    }
+    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+        Cond, Zero,
+        True, False,
+        DAG.getCondCode(CCOpcode));
+    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+  }
+
+  // If we make it this for it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (CompareVT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
+    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
+  } else if (CompareVT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, DL, CompareVT);
+    HWFalse = DAG.getConstant(0, DL, CompareVT);
+  }
+  else {
+    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
+  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
+
+  return DAG.getNode(ISD::SELECT_CC, DL, VT,
+      Cond, HWFalse,
+      True, False,
+      DAG.getCondCode(ISD::SETNE));
+}
+
+/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
+/// convert these pointers to a register index.  Each register holds
+/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// for indirect addressing.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  SDLoc DL(Ptr);
+  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, DL, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
+SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Store);
+
+  unsigned Mask = 0;
+  if (Store->getMemoryVT() == MVT::i8) {
+    Mask = 0xff;
+  } else if (Store->getMemoryVT() == MVT::i16) {
+    Mask = 0xffff;
+  }
+
+  SDValue Chain = Store->getChain();
+  SDValue BasePtr = Store->getBasePtr();
+  EVT MemVT = Store->getMemoryVT();
+
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+                            DAG.getConstant(2, DL, MVT::i32));
+  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                            Chain, Ptr,
+                            DAG.getTargetConstant(0, DL, MVT::i32));
+
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+                                DAG.getConstant(0x3, DL, MVT::i32));
+
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, DL, MVT::i32));
+
+  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                  Store->getValue());
+
+  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                     MaskedValue, ShiftAmt);
+
+  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                DAG.getConstant(Mask, DL, MVT::i32),
+                                ShiftAmt);
+  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                        DAG.getConstant(0xffffffff, DL, MVT::i32));
+  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                     Chain, Value, Ptr,
+                     DAG.getTargetConstant(0, DL, MVT::i32));
+}
+
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  unsigned AS = StoreNode->getAddressSpace();
+  SDValue Value = StoreNode->getValue();
+  EVT ValueVT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+  unsigned Align = StoreNode->getAlignment();
+
+  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+      ValueVT.isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+
+  // Private AS needs special fixes
+  if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return expandUnalignedStore(StoreNode, DAG);
+  }
+
+  SDLoc DL(Op);
+  SDValue Chain = StoreNode->getChain();
+  SDValue Ptr = StoreNode->getBasePtr();
+
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+    // It is beneficial to create MSKOR here instead of combiner to avoid
+    // artificial dependencies introduced by RMW
+    if (StoreNode->isTruncatingStore()) {
+      EVT VT = Value.getValueType();
+      assert(VT.bitsLE(MVT::i32));
+      SDValue MaskConstant;
+      if (MemVT == MVT::i8) {
+        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
+      } else {
+        assert(MemVT == MVT::i16);
+        assert(StoreNode->getAlignment() >= 2);
+        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
+      }
+      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+                                      DAG.getConstant(2, DL, MVT::i32));
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+                                      DAG.getConstant(0x00000003, DL, VT));
+      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                   DAG.getConstant(3, DL, VT));
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+      // vector instead.
+      SDValue Src[4] = {
+        ShiftedValue,
+        DAG.getConstant(0, DL, MVT::i32),
+        DAG.getConstant(0, DL, MVT::i32),
+        Mask
+      };
+      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
+      SDValue Args[3] = { Chain, Input, DWordAddr };
+      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+                                     Op->getVTList(), Args, MemVT,
+                                     StoreNode->getMemOperand());
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+               ValueVT.bitsGE(MVT::i32)) {
+      // Convert pointer from byte address to dword address.
+      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));
+
+      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+        llvm_unreachable("Truncated and indexed stores not supported yet");
+      } else {
+        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+      }
+      return Chain;
+    }
+  }
+
+  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  if (MemVT.bitsLT(MVT::i32))
+    return lowerPrivateTruncStore(StoreNode, DAG);
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (ValueVT.isVector()) {
+    unsigned NumElemVT = ValueVT.getVectorNumElements();
+    EVT ElemVT = ValueVT.getVectorElementType();
+    SmallVector<SDValue, 4> Stores(NumElemVT);
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, DL, MVT::i32));
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+                                 Value, DAG.getConstant(i, DL, MVT::i32));
+
+      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                              Chain, Elem, Ptr,
+                              DAG.getTargetConstant(Channel, DL, MVT::i32));
+    }
+     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+   } else {
+    if (ValueVT == MVT::i8) {
+      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+    }
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+  }
+
+  return Chain;
+}
+
+// return (512 + (kc_bank << 12)
+static int
+ConstantAddressBlock(unsigned AddressSpace) {
+  switch (AddressSpace) {
+  case AMDGPUAS::CONSTANT_BUFFER_0:
+    return 512;
+  case AMDGPUAS::CONSTANT_BUFFER_1:
+    return 512 + 4096;
+  case AMDGPUAS::CONSTANT_BUFFER_2:
+    return 512 + 4096 * 2;
+  case AMDGPUAS::CONSTANT_BUFFER_3:
+    return 512 + 4096 * 3;
+  case AMDGPUAS::CONSTANT_BUFFER_4:
+    return 512 + 4096 * 4;
+  case AMDGPUAS::CONSTANT_BUFFER_5:
+    return 512 + 4096 * 5;
+  case AMDGPUAS::CONSTANT_BUFFER_6:
+    return 512 + 4096 * 6;
+  case AMDGPUAS::CONSTANT_BUFFER_7:
+    return 512 + 4096 * 7;
+  case AMDGPUAS::CONSTANT_BUFFER_8:
+    return 512 + 4096 * 8;
+  case AMDGPUAS::CONSTANT_BUFFER_9:
+    return 512 + 4096 * 9;
+  case AMDGPUAS::CONSTANT_BUFFER_10:
+    return 512 + 4096 * 10;
+  case AMDGPUAS::CONSTANT_BUFFER_11:
+    return 512 + 4096 * 11;
+  case AMDGPUAS::CONSTANT_BUFFER_12:
+    return 512 + 4096 * 12;
+  case AMDGPUAS::CONSTANT_BUFFER_13:
+    return 512 + 4096 * 13;
+  case AMDGPUAS::CONSTANT_BUFFER_14:
+    return 512 + 4096 * 14;
+  case AMDGPUAS::CONSTANT_BUFFER_15:
+    return 512 + 4096 * 15;
+  default:
+    return -1;
+  }
+}
+
+SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+  EVT MemVT = Load->getMemoryVT();
+
+  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
+  // register (2-)byte extract.
+
+  // Get Register holding the target.
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+                            DAG.getConstant(2, DL, MVT::i32));
+  // Load the Register.
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(),
+                            Ptr,
+                            DAG.getTargetConstant(0, DL, MVT::i32),
+                            Op.getOperand(2));
+
+  // Get offset within the register.
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+                                Load->getBasePtr(),
+                                DAG.getConstant(0x3, DL, MVT::i32));
+
+  // Bit offset of target byte (byteIdx * 8).
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, DL, MVT::i32));
+
+  // Shift to the right.
+  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+
+  // Eliminate the upper bits by setting them to ...
+  EVT MemEltVT = MemVT.getScalarType();
+
+  // ... ones.
+  if (ExtType == ISD::SEXTLOAD) {
+    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+      Load->getChain()
+    };
+
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  // ... or zeros.
+  SDValue Ops[] = {
+    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+    Load->getChain()
+  };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  unsigned AS = LoadNode->getAddressSpace();
+  EVT MemVT = LoadNode->getMemoryVT();
+  ISD::LoadExtType ExtType = LoadNode->getExtensionType();
+
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
+    return lowerPrivateExtLoad(Op, DAG);
+  }
+
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Chain = LoadNode->getChain();
+  SDValue Ptr = LoadNode->getBasePtr();
+
+  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+    SDValue MergedValues[2] = {
+      scalarizeVectorLoad(LoadNode, DAG),
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1 &&
+      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
+    SDValue Result;
+    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+        isa<ConstantSDNode>(Ptr)) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want Const position encoded with the following formula :
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+        // then div by 4 at the ISel step
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      EVT NewVT = MVT::v4i32;
+      unsigned NumElements = 4;
+      if (VT.isVector()) {
+        NewVT = VT;
+        NumElements = VT.getVectorNumElements();
+      }
+      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+    } else {
+      // non-constant ptr can't be folded, keeps it as a v4f32 load
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+                      DAG.getConstant(4, DL, MVT::i32)),
+                      DAG.getConstant(LoadNode->getAddressSpace() -
+                                      AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+                           DAG.getConstant(0, DL, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+      Result,
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  SDValue LoweredLoad;
+
+  // For most operations returning SDValue() will result in the node being
+  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+  // need to manually expand loads that may be legal in some address spaces and
+  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+  // compute shaders, since the data is sign extended when it is uploaded to the
+  // buffer. However SEXT loads from other address spaces are not supported, so
+  // we need to expand them here.
+  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
+    EVT MemVT = LoadNode->getMemoryVT();
+    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
+    SDValue NewLoad = DAG.getExtLoad(
+        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
+        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
+    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
+                              DAG.getValueType(MemVT));
+
+    SDValue MergedValues[2] = { Res, Chain };
+    return DAG.getMergeValues(MergedValues, DL);
+  }
+
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Loads[4];
+
+    assert(NumElemVT <= 4);
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, DL, MVT::i32));
+      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+                             Chain, Ptr,
+                             DAG.getTargetConstant(Channel, DL, MVT::i32),
+                             Op.getOperand(2));
+    }
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
+    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
+  } else {
+    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+                              Chain, Ptr,
+                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
+                              Op.getOperand(2));
+  }
+
+  SDValue Ops[2] = {
+    LoweredLoad,
+    Chain
+  };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+
+  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
+                     Chain, Jump, Cond);
+}
+
+SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
+
+  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned IgnoredFrameReg;
+  unsigned Offset =
+    TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
+                         Op.getValueType());
+}
+
+/// XXX Only kernel functions are supported, so we can assume for now that
+/// every function is a kernel function, but in the future we should use
+/// separate calling conventions for kernel and non-kernel functions.
+SDValue R600TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  MachineFunction &MF = DAG.getMachineFunction();
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+  SmallVector<ISD::InputArg, 8> LocalIns;
+
+  if (AMDGPU::isShader(CallConv)) {
+    AnalyzeFormalArguments(CCInfo, Ins);
+  } else {
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  }
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    const ISD::InputArg &In = Ins[i];
+    EVT VT = In.VT;
+    EVT MemVT = VA.getLocVT();
+    if (!VT.isVector() && MemVT.isVector()) {
+      // Get load source type if scalarized.
+      MemVT = MemVT.getVectorElementType();
+    }
+
+    if (AMDGPU::isShader(CallConv)) {
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Register);
+      continue;
+    }
+
+    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                          AMDGPUAS::CONSTANT_BUFFER_0);
+
+    // i64 isn't a legal type, so the register type used ends up as i32, which
+    // isn't expected here. It attempts to create this sextload, but it ends up
+    // being invalid. Somehow this seems to work with i64 arguments, but breaks
+    // for <1 x i64>.
+
+    // The first 36 bytes of the input buffer contains information about
+    // thread group and global sizes.
+    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
+    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
+      // FIXME: This should really check the extload type, but the handling of
+      // extload vector parameters seems to be broken.
+
+      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+      Ext = ISD::SEXTLOAD;
+    }
+
+    // Compute the offset from the value.
+    // XXX - I think PartOffset should give you this, but it seems to give the
+    // size of the register which isn't useful.
+
+    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
+    unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
+
+    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
+    SDValue Arg = DAG.getLoad(
+        ISD::UNINDEXED, Ext, VT, DL, Chain,
+        DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+        MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+                                        MachineMemOperand::MODereferenceable |
+                                        MachineMemOperand::MOInvariant);
+
+    // 4 is the preferred alignment for the CONSTANT memory space.
+    InVals.push_back(Arg);
+    MFI->setABIArgOffset(Offset + MemVT.getStoreSize());
+  }
+  return Chain;
+}
+
+EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                           EVT VT) const {
+   if (!VT.isVector())
+     return MVT::i32;
+   return VT.changeVectorElementTypeToInteger();
+}
+
+bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                        unsigned AddrSpace,
+                                                        unsigned Align,
+                                                        bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
+  if (!VT.isSimple() || VT == MVT::Other)
+    return false;
+
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
+  // TODO: This is a rough estimate.
+  if (IsFast)
+    *IsFast = true;
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+}
+
+static SDValue CompactSwizzlableVector(
+  SelectionDAG &DAG, SDValue VectorEntry,
+  DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+    VectorEntry.getOperand(0),
+    VectorEntry.getOperand(1),
+    VectorEntry.getOperand(2),
+    VectorEntry.getOperand(3)
+  };
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].isUndef())
+      // We mask write here to teach later passes that the ith element of this
+      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+      // break false dependencies and additionnaly make assembly easier to read.
+      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
+    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
+      if (C->isZero()) {
+        RemapSwizzle[i] = 4; // SEL_0
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      } else if (C->isExactlyValue(1.0)) {
+        RemapSwizzle[i] = 5; // SEL_1
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      }
+    }
+
+    if (NewBldVec[i].isUndef())
+      continue;
+    for (unsigned j = 0; j < i; j++) {
+      if (NewBldVec[i] == NewBldVec[j]) {
+        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
+        RemapSwizzle[i] = j;
+        break;
+      }
+    }
+  }
+
+  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+                            NewBldVec);
+}
+
+static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
+                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+      VectorEntry.getOperand(0),
+      VectorEntry.getOperand(1),
+      VectorEntry.getOperand(2),
+      VectorEntry.getOperand(3)
+  };
+  bool isUnmovable[4] = { false, false, false, false };
+  for (unsigned i = 0; i < 4; i++) {
+    RemapSwizzle[i] = i;
+    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+          ->getZExtValue();
+      if (i == Idx)
+        isUnmovable[Idx] = true;
+    }
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+          ->getZExtValue();
+      if (isUnmovable[Idx])
+        continue;
+      // Swap i and Idx
+      std::swap(NewBldVec[Idx], NewBldVec[i]);
+      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+      break;
+    }
+  }
+
+  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
+                            NewBldVec);
+}
+
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
+                                            SelectionDAG &DAG,
+                                            const SDLoc &DL) const {
+  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
+  // Old -> New swizzle values
+  DenseMap<unsigned, unsigned> SwizzleRemap;
+
+  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
+  }
+
+  SwizzleRemap.clear();
+  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
+  }
+
+  return BuildVector;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch (N->getOpcode()) {
+  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
+  case ISD::FP_ROUND: {
+      SDValue Arg = N->getOperand(0);
+      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
+        return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
+                           Arg.getOperand(0));
+      }
+      break;
+    }
+
+  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+  // (i32 select_cc f32, f32, -1, 0 cc)
+  //
+  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+  // this to one of the SET*_DX10 instructions.
+  case ISD::FP_TO_SINT: {
+    SDValue FNeg = N->getOperand(0);
+    if (FNeg.getOpcode() != ISD::FNEG) {
+      return SDValue();
+    }
+    SDValue SelectCC = FNeg.getOperand(0);
+    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+        !isHWTrueValue(SelectCC.getOperand(2)) ||
+        !isHWFalseValue(SelectCC.getOperand(3))) {
+      return SDValue();
+    }
+
+    return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
+                           SelectCC.getOperand(0), // LHS
+                           SelectCC.getOperand(1), // RHS
+                           DAG.getConstant(-1, DL, MVT::i32), // True
+                           DAG.getConstant(0, DL, MVT::i32),  // False
+                           SelectCC.getOperand(4)); // CC
+
+    break;
+  }
+
+  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+  // => build_vector elt0, ... , NewEltIdx, ... , eltN
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue InVec = N->getOperand(0);
+    SDValue InVal = N->getOperand(1);
+    SDValue EltNo = N->getOperand(2);
+
+    // If the inserted element is an UNDEF, just use the input vector.
+    if (InVal.isUndef())
+      return InVec;
+
+    EVT VT = InVec.getValueType();
+
+    // If we can't generate a legal BUILD_VECTOR, exit
+    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
+      return SDValue();
+
+    // Check that we know which element is being inserted
+    if (!isa<ConstantSDNode>(EltNo))
+      return SDValue();
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
+    // vector elements.
+    SmallVector<SDValue, 8> Ops;
+    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+      Ops.append(InVec.getNode()->op_begin(),
+                 InVec.getNode()->op_end());
+    } else if (InVec.isUndef()) {
+      unsigned NElts = VT.getVectorNumElements();
+      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+    } else {
+      return SDValue();
+    }
+
+    // Insert the element
+    if (Elt < Ops.size()) {
+      // All the operands of BUILD_VECTOR must have the same type;
+      // we enforce that here.
+      EVT OpVT = Ops[0].getValueType();
+      if (InVal.getValueType() != OpVT)
+        InVal = OpVT.bitsGT(InVal.getValueType()) ?
+          DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
+          DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
+      Ops[Elt] = InVal;
+    }
+
+    // Return the new vector
+    return DAG.getBuildVector(VT, DL, Ops);
+  }
+
+  // Extract_vec (Build_vector) generated by custom lowering
+  // also needs to be customly combined
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Arg = N->getOperand(0);
+    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return Arg->getOperand(Element);
+      }
+    }
+    if (Arg.getOpcode() == ISD::BITCAST &&
+        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+        (Arg.getOperand(0).getValueType().getVectorNumElements() ==
+         Arg.getValueType().getVectorNumElements())) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
+                           Arg->getOperand(0).getOperand(Element));
+      }
+    }
+    break;
+  }
+
+  case ISD::SELECT_CC: {
+    // Try common optimizations
+    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
+      return Ret;
+
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+    //      selectcc x, y, a, b, inv(cc)
+    //
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+    //      selectcc x, y, a, b, cc
+    SDValue LHS = N->getOperand(0);
+    if (LHS.getOpcode() != ISD::SELECT_CC) {
+      return SDValue();
+    }
+
+    SDValue RHS = N->getOperand(1);
+    SDValue True = N->getOperand(2);
+    SDValue False = N->getOperand(3);
+    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+    if (LHS.getOperand(2).getNode() != True.getNode() ||
+        LHS.getOperand(3).getNode() != False.getNode() ||
+        RHS.getNode() != False.getNode()) {
+      return SDValue();
+    }
+
+    switch (NCC) {
+    default: return SDValue();
+    case ISD::SETNE: return LHS;
+    case ISD::SETEQ: {
+      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+      LHSCC = ISD::getSetCCInverse(LHSCC,
+                                  LHS.getOperand(0).getValueType().isInteger());
+      if (DCI.isBeforeLegalizeOps() ||
+          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+        return DAG.getSelectCC(DL,
+                               LHS.getOperand(0),
+                               LHS.getOperand(1),
+                               LHS.getOperand(2),
+                               LHS.getOperand(3),
+                               LHSCC);
+      break;
+    }
+    }
+    return SDValue();
+  }
+
+  case AMDGPUISD::R600_EXPORT: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+
+    SDValue NewArgs[8] = {
+      N->getOperand(0), // Chain
+      SDValue(),
+      N->getOperand(2), // ArrayBase
+      N->getOperand(3), // Type
+      N->getOperand(4), // SWZ_X
+      N->getOperand(5), // SWZ_Y
+      N->getOperand(6), // SWZ_Z
+      N->getOperand(7) // SWZ_W
+    };
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
+    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
+  }
+  case AMDGPUISD::TEXTURE_FETCH: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+
+    SDValue NewArgs[19] = {
+      N->getOperand(0),
+      N->getOperand(1),
+      N->getOperand(2),
+      N->getOperand(3),
+      N->getOperand(4),
+      N->getOperand(5),
+      N->getOperand(6),
+      N->getOperand(7),
+      N->getOperand(8),
+      N->getOperand(9),
+      N->getOperand(10),
+      N->getOperand(11),
+      N->getOperand(12),
+      N->getOperand(13),
+      N->getOperand(14),
+      N->getOperand(15),
+      N->getOperand(16),
+      N->getOperand(17),
+      N->getOperand(18),
+    };
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
+    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
+  }
+  default: break;
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
+                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
+                                     SDValue &Sel, SDValue &Imm,
+                                     SelectionDAG &DAG) const {
+  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+  if (!Src.isMachineOpcode())
+    return false;
+
+  switch (Src.getMachineOpcode()) {
+  case AMDGPU::FNEG_R600:
+    if (!Neg.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
+    return true;
+  case AMDGPU::FABS_R600:
+    if (!Abs.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
+    return true;
+  case AMDGPU::CONST_COPY: {
+    unsigned Opcode = ParentNode->getMachineOpcode();
+    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+    if (!Sel.getNode())
+      return false;
+
+    SDValue CstOffset = Src.getOperand(0);
+    if (ParentNode->getValueType(0).isVector())
+      return false;
+
+    // Gather constants values
+    int SrcIndices[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+    };
+    std::vector<unsigned> Consts;
+    for (int OtherSrcIdx : SrcIndices) {
+      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+        continue;
+      if (HasDst) {
+        OtherSrcIdx--;
+        OtherSelIdx--;
+      }
+      if (RegisterSDNode *Reg =
+          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+        if (Reg->getReg() == AMDGPU::ALU_CONST) {
+          ConstantSDNode *Cst
+            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
+          Consts.push_back(Cst->getZExtValue());
+        }
+      }
+    }
+
+    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
+    Consts.push_back(Cst->getZExtValue());
+    if (!TII->fitsConstReadLimitations(Consts)) {
+      return false;
+    }
+
+    Sel = CstOffset;
+    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+    return true;
+  }
+  case AMDGPU::MOV_IMM_GLOBAL_ADDR:
+    // Check if the Imm slot is used. Taken from below.
+    if (cast<ConstantSDNode>(Imm)->getZExtValue())
+      return false;
+    Imm = Src.getOperand(0);
+    Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
+    return true;
+  case AMDGPU::MOV_IMM_I32:
+  case AMDGPU::MOV_IMM_F32: {
+    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+    uint64_t ImmValue = 0;
+
+    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+      float FloatValue = FPC->getValueAPF().convertToFloat();
+      if (FloatValue == 0.0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (FloatValue == 0.5) {
+        ImmReg = AMDGPU::HALF;
+      } else if (FloatValue == 1.0) {
+        ImmReg = AMDGPU::ONE;
+      } else {
+        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+      }
+    } else {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+      uint64_t Value = C->getZExtValue();
+      if (Value == 0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (Value == 1) {
+        ImmReg = AMDGPU::ONE_INT;
+      } else {
+        ImmValue = Value;
+      }
+    }
+
+    // Check that we aren't already using an immediate.
+    // XXX: It's possible for an instruction to have more than one
+    // immediate operand, but this is not supported yet.
+    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+      if (!Imm.getNode())
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
+      assert(C);
+      if (C->getZExtValue())
+        return false;
+      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
+    }
+    Src = DAG.getRegister(ImmReg, MVT::i32);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+/// \brief Fold the instructions after selecting them
+SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+                                            SelectionDAG &DAG) const {
+  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
+  if (!Node->isMachineOpcode())
+    return Node;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  SDValue FakeOp;
+
+  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
+
+  if (Opcode == AMDGPU::DOT_4) {
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+        };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+    };
+    for (unsigned i = 0; i < 8; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue &Abs = Ops[AbsIdx[i] - 1];
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      if (HasDst)
+        SelIdx--;
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+      SDValue &Src = Ops[i];
+      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::CLAMP_R600) {
+    SDValue Src = Node->getOperand(0);
+    if (!Src.isMachineOpcode() ||
+        !TII->hasInstrModifiers(Src.getMachineOpcode()))
+      return Node;
+    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+        AMDGPU::OpName::clamp);
+    if (ClampIdx < 0)
+      return Node;
+    SDLoc DL(Node);
+    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
+    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
+    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
+                              Node->getVTList(), Ops);
+  } else {
+    if (!TII->hasInstrModifiers(Opcode))
+      return Node;
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+    };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+      -1
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue FakeAbs;
+      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+      if (HasDst) {
+        SelIdx--;
+        ImmIdx--;
+      }
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      SDValue &Imm = Ops[ImmIdx];
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  }
+
+  return Node;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
new file mode 100644
index 000000000000..9700ce14c6f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -0,0 +1,104 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+class R600Subtarget;
+
+class R600TargetLowering final : public AMDGPUTargetLowering {
+public:
+  R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI);
+
+  const R600Subtarget *getSubtarget() const;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *BB) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                         EVT VT) const override;
+
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *IsFast) const override;
+
+private:
+  unsigned Gen;
+  /// Each OpenCL kernel has nine implicit parameters that are stored in the
+  /// first nine dwords of a Vertex Buffer.  These implicit parameters are
+  /// lowered to load instructions which retrieve the values from the Vertex
+  /// Buffer.
+  SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL,
+                                 unsigned DwordOffset) const;
+
+  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+      MachineRegisterInfo & MRI, unsigned dword_offset) const;
+  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
+                          const SDLoc &DL) const;
+  SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
+
+  SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const override;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
+                        unsigned mainop, unsigned ovf) const;
+
+  SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
+                                          SelectionDAG &DAG) const;
+  void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
+                       unsigned &Channel, unsigned &PtrIncr) const;
+  bool isZero(SDValue Op) const;
+  bool isHWTrueValue(SDValue Op) const;
+  bool isHWFalseValue(SDValue Op) const;
+
+ bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+                  SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+                  SelectionDAG &DAG) const;
+
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+};
+
+} // End namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
new file mode 100644
index 000000000000..68fcc545916a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrFormats.td
@@ -0,0 +1,495 @@
+//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+  bit isVector = 0;
+  bits<2> FlagOperandIdx = 0;
+  bit Op1 = 0;
+  bit Op2 = 0;
+  bit LDS_1A = 0;
+  bit LDS_1A1D = 0;
+  bit HasNativeOperands = 0;
+  bit VTXInst = 0;
+  bit TEXInst = 0;
+  bit ALUInst = 0;
+  bit IsExport = 0;
+  bit LDS_1A2D = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  // No AsmMatcher support.
+  let isCodeGenOnly = 1;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+
+  // Vector instructions are instructions that must fill all slots in an
+  // instruction group
+  let TSFlags{6} = isVector;
+  let TSFlags{8-7} = FlagOperandIdx;
+  let TSFlags{9} = HasNativeOperands;
+  let TSFlags{10} = Op1;
+  let TSFlags{11} = Op2;
+  let TSFlags{12} = VTXInst;
+  let TSFlags{13} = TEXInst;
+  let TSFlags{14} = ALUInst;
+  let TSFlags{15} = LDS_1A;
+  let TSFlags{16} = LDS_1A1D;
+  let TSFlags{17} = IsExport;
+  let TSFlags{18} = LDS_1A2D;
+}
+
+//===----------------------------------------------------------------------===//
+// ALU instructions
+//===----------------------------------------------------------------------===//
+
+class R600_ALU_LDS_Word0 {
+  field bits<32> Word0;
+
+  bits<11> src0;
+  bits<1>  src0_rel;
+  bits<11> src1;
+  bits<1>  src1_rel;
+  bits<3>  index_mode = 0;
+  bits<2>  pred_sel;
+  bits<1>  last;
+
+  bits<9>  src0_sel  = src0{8-0};
+  bits<2>  src0_chan = src0{10-9};
+  bits<9>  src1_sel  = src1{8-0};
+  bits<2>  src1_chan = src1{10-9};
+
+  let Word0{8-0}   = src0_sel;
+  let Word0{9}     = src0_rel;
+  let Word0{11-10} = src0_chan;
+  let Word0{21-13} = src1_sel;
+  let Word0{22}    = src1_rel;
+  let Word0{24-23} = src1_chan;
+  let Word0{28-26} = index_mode;
+  let Word0{30-29} = pred_sel;
+  let Word0{31}    = last;
+}
+
+class R600ALU_Word0 : R600_ALU_LDS_Word0 {
+
+  bits<1>  src0_neg;
+  bits<1>  src1_neg;
+
+  let Word0{12}    = src0_neg;
+  let Word0{25}    = src1_neg;
+}
+
+class R600ALU_Word1 {
+  field bits<32> Word1;
+
+  bits<11> dst;
+  bits<3>  bank_swizzle;
+  bits<1>  dst_rel;
+  bits<1>  clamp;
+
+  bits<7>  dst_sel  = dst{6-0};
+  bits<2>  dst_chan = dst{10-9};
+
+  let Word1{20-18} = bank_swizzle;
+  let Word1{27-21} = dst_sel;
+  let Word1{28}    = dst_rel;
+  let Word1{30-29} = dst_chan;
+  let Word1{31}    = clamp;
+}
+
+class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
+
+  bits<1>  src0_abs;
+  bits<1>  src1_abs;
+  bits<1>  update_exec_mask;
+  bits<1>  update_pred;
+  bits<1>  write;
+  bits<2>  omod;
+
+  let Word1{0}     = src0_abs;
+  let Word1{1}     = src1_abs;
+  let Word1{2}     = update_exec_mask;
+  let Word1{3}     = update_pred;
+  let Word1{4}     = write;
+  let Word1{6-5}   = omod;
+  let Word1{17-7}  = alu_inst;
+}
+
+class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
+
+  bits<11> src2;
+  bits<1>  src2_rel;
+  bits<1>  src2_neg;
+
+  bits<9>  src2_sel = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{12}    = src2_neg;
+  let Word1{17-13} = alu_inst;
+}
+
+class R600LDS_Word1 {
+  field bits<32> Word1;
+
+  bits<11> src2;
+  bits<9>  src2_sel  = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+  bits<1>  src2_rel;
+  // offset specifies the stride offset to the second set of data to be read
+  // from.  This is a dword offset.
+  bits<5>  alu_inst = 17; // OP3_INST_LDS_IDX_OP
+  bits<3>  bank_swizzle;
+  bits<6>  lds_op;
+  bits<2>  dst_chan = 0;
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{17-13} = alu_inst;
+  let Word1{20-18} = bank_swizzle;
+  let Word1{26-21} = lds_op;
+  let Word1{30-29} = dst_chan;
+}
+
+
+/*
+XXX: R600 subtarget uses a slightly different encoding than the other
+subtargets.  We currently handle this in R600MCCodeEmitter, but we may
+want to use these instruction classes in the future.
+
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
+
+  bits<1>  fog_merge;
+  bits<10> alu_inst;
+
+  let Inst{37}    = fog_merge;
+  let Inst{39-38} = omod;
+  let Inst{49-40} = alu_inst;
+}
+
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
+
+  bits<11> alu_inst;
+
+  let Inst{38-37} = omod;
+  let Inst{49-39} = alu_inst;
+}
+*/
+
+//===----------------------------------------------------------------------===//
+// Vertex Fetch instructions
+//===----------------------------------------------------------------------===//
+
+class VTX_WORD0 {
+  field bits<32> Word0;
+  bits<7> src_gpr;
+  bits<5> VC_INST;
+  bits<2> FETCH_TYPE;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> buffer_id;
+  bits<1> SRC_REL;
+  bits<2> SRC_SEL_X;
+
+  let Word0{4-0}   = VC_INST;
+  let Word0{6-5}   = FETCH_TYPE;
+  let Word0{7}     = FETCH_WHOLE_QUAD;
+  let Word0{15-8}  = buffer_id;
+  let Word0{22-16} = src_gpr;
+  let Word0{23}    = SRC_REL;
+  let Word0{25-24} = SRC_SEL_X;
+}
+
+class VTX_WORD0_eg : VTX_WORD0 {
+
+  bits<6> MEGA_FETCH_COUNT;
+
+  let Word0{31-26} = MEGA_FETCH_COUNT;
+}
+
+class VTX_WORD0_cm : VTX_WORD0 {
+
+  bits<2> SRC_SEL_Y;
+  bits<2> STRUCTURED_READ;
+  bits<1> LDS_REQ;
+  bits<1> COALESCED_READ;
+
+  let Word0{27-26} = SRC_SEL_Y;
+  let Word0{29-28} = STRUCTURED_READ;
+  let Word0{30}    = LDS_REQ;
+  let Word0{31}    = COALESCED_READ;
+}
+
+class VTX_WORD1_GPR {
+  field bits<32> Word1;
+  bits<7> dst_gpr;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<1> USE_CONST_FIELDS;
+  bits<6> DATA_FORMAT;
+  bits<2> NUM_FORMAT_ALL;
+  bits<1> FORMAT_COMP_ALL;
+  bits<1> SRF_MODE_ALL;
+
+  let Word1{6-0} = dst_gpr;
+  let Word1{7}    = DST_REL;
+  let Word1{8}    = 0; // Reserved
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{21}    = USE_CONST_FIELDS;
+  let Word1{27-22} = DATA_FORMAT;
+  let Word1{29-28} = NUM_FORMAT_ALL;
+  let Word1{30}    = FORMAT_COMP_ALL;
+  let Word1{31}    = SRF_MODE_ALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Texture fetch instructions
+//===----------------------------------------------------------------------===//
+
+class TEX_WORD0 {
+  field bits<32> Word0;
+
+  bits<5> TEX_INST;
+  bits<2> INST_MOD;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> RESOURCE_ID;
+  bits<7> SRC_GPR;
+  bits<1> SRC_REL;
+  bits<1> ALT_CONST;
+  bits<2> RESOURCE_INDEX_MODE;
+  bits<2> SAMPLER_INDEX_MODE;
+
+  let Word0{4-0} = TEX_INST;
+  let Word0{6-5} = INST_MOD;
+  let Word0{7} = FETCH_WHOLE_QUAD;
+  let Word0{15-8} = RESOURCE_ID;
+  let Word0{22-16} = SRC_GPR;
+  let Word0{23} = SRC_REL;
+  let Word0{24} = ALT_CONST;
+  let Word0{26-25} = RESOURCE_INDEX_MODE;
+  let Word0{28-27} = SAMPLER_INDEX_MODE;
+}
+
+class TEX_WORD1 {
+  field bits<32> Word1;
+
+  bits<7> DST_GPR;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<7> LOD_BIAS;
+  bits<1> COORD_TYPE_X;
+  bits<1> COORD_TYPE_Y;
+  bits<1> COORD_TYPE_Z;
+  bits<1> COORD_TYPE_W;
+
+  let Word1{6-0} = DST_GPR;
+  let Word1{7} = DST_REL;
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{27-21} = LOD_BIAS;
+  let Word1{28} = COORD_TYPE_X;
+  let Word1{29} = COORD_TYPE_Y;
+  let Word1{30} = COORD_TYPE_Z;
+  let Word1{31} = COORD_TYPE_W;
+}
+
+class TEX_WORD2 {
+  field bits<32> Word2;
+
+  bits<5> OFFSET_X;
+  bits<5> OFFSET_Y;
+  bits<5> OFFSET_Z;
+  bits<5> SAMPLER_ID;
+  bits<3> SRC_SEL_X;
+  bits<3> SRC_SEL_Y;
+  bits<3> SRC_SEL_Z;
+  bits<3> SRC_SEL_W;
+
+  let Word2{4-0} = OFFSET_X;
+  let Word2{9-5} = OFFSET_Y;
+  let Word2{14-10} = OFFSET_Z;
+  let Word2{19-15} = SAMPLER_ID;
+  let Word2{22-20} = SRC_SEL_X;
+  let Word2{25-23} = SRC_SEL_Y;
+  let Word2{28-26} = SRC_SEL_Z;
+  let Word2{31-29} = SRC_SEL_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+class CF_WORD1_R600 {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<3> COUNT;
+  bits<6> CALL_COUNT;
+  bits<1> COUNT_3;
+  bits<1> END_OF_PROGRAM;
+  bits<1> VALID_PIXEL_MODE;
+  bits<7> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{12-10} = COUNT;
+  let Word1{18-13} = CALL_COUNT;
+  let Word1{19} = COUNT_3;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{22} = VALID_PIXEL_MODE;
+  let Word1{29-23} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_WORD0_EG {
+  field bits<32> Word0;
+
+  bits<24> ADDR;
+  bits<3> JUMPTABLE_SEL;
+
+  let Word0{23-0} = ADDR;
+  let Word0{26-24} = JUMPTABLE_SEL;
+}
+
+class CF_WORD1_EG {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<6> COUNT;
+  bits<1> VALID_PIXEL_MODE;
+  bits<1> END_OF_PROGRAM;
+  bits<8> CF_INST;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{15-10} = COUNT;
+  let Word1{20} = VALID_PIXEL_MODE;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{29-22} = CF_INST;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALU_WORD0 {
+  field bits<32> Word0;
+
+  bits<22> ADDR;
+  bits<4> KCACHE_BANK0;
+  bits<4> KCACHE_BANK1;
+  bits<2> KCACHE_MODE0;
+
+  let Word0{21-0} = ADDR;
+  let Word0{25-22} = KCACHE_BANK0;
+  let Word0{29-26} = KCACHE_BANK1;
+  let Word0{31-30} = KCACHE_MODE0;
+}
+
+class CF_ALU_WORD1 {
+  field bits<32> Word1;
+
+  bits<2> KCACHE_MODE1;
+  bits<8> KCACHE_ADDR0;
+  bits<8> KCACHE_ADDR1;
+  bits<7> COUNT;
+  bits<1> ALT_CONST;
+  bits<4> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{1-0} = KCACHE_MODE1;
+  let Word1{9-2} = KCACHE_ADDR0;
+  let Word1{17-10} = KCACHE_ADDR1;
+  let Word1{24-18} = COUNT;
+  let Word1{25} = ALT_CONST;
+  let Word1{29-26} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALLOC_EXPORT_WORD0_RAT {
+  field bits<32> Word0;
+
+  bits<4> rat_id;
+  bits<6> rat_inst;
+  bits<2> rim;
+  bits<2> type;
+  bits<7> rw_gpr;
+  bits<1> rw_rel;
+  bits<7> index_gpr;
+  bits<2> elem_size;
+
+  let Word0{3-0}   = rat_id;
+  let Word0{9-4}   = rat_inst;
+  let Word0{10}    = 0; // Reserved
+  let Word0{12-11} = rim;
+  let Word0{14-13} = type;
+  let Word0{21-15} = rw_gpr;
+  let Word0{22}    = rw_rel;
+  let Word0{29-23} = index_gpr;
+  let Word0{31-30} = elem_size;
+}
+
+class CF_ALLOC_EXPORT_WORD1_BUF {
+  field bits<32> Word1;
+
+  bits<12> array_size;
+  bits<4>  comp_mask;
+  bits<4>  burst_count;
+  bits<1>  vpm;
+  bits<1>  eop;
+  bits<8>  cf_inst;
+  bits<1>  mark;
+  bits<1>  barrier;
+
+  let Word1{11-0} = array_size;
+  let Word1{15-12} = comp_mask;
+  let Word1{19-16} = burst_count;
+  let Word1{20}    = vpm;
+  let Word1{21}    = eop;
+  let Word1{29-22} = cf_inst;
+  let Word1{30}    = mark;
+  let Word1{31}    = barrier;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
new file mode 100644
index 000000000000..e88bd076718e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -0,0 +1,1483 @@
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenDFAPacketizer.inc"
+
+R600InstrInfo::R600InstrInfo(const R600Subtarget &ST)
+  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+
+bool R600InstrInfo::isVector(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
+}
+
+void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                const DebugLoc &DL, unsigned DestReg,
+                                unsigned SrcReg, bool KillSrc) const {
+  unsigned VectorComponents = 0;
+  if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
+      AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
+      (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
+       AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
+    VectorComponents = 4;
+  } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
+            AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
+            (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
+             AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
+      unsigned SubRegIndex = RI.getSubRegFromChannel(I);
+      buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                              RI.getSubReg(DestReg, SubRegIndex),
+                              RI.getSubReg(SrcReg, SubRegIndex))
+                              .addReg(DestReg,
+                                      RegState::Define | RegState::Implicit);
+    }
+  } else {
+    MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                                                  DestReg, SrcReg);
+    NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
+                                    .setIsKill(KillSrc);
+  }
+}
+
+/// \returns true if \p MBBI can be moved into a new basic.
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI) const {
+  for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
+                                        E = MBBI->operands_end(); I != E; ++I) {
+    if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
+        I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+      return false;
+  }
+  return true;
+}
+
+bool R600InstrInfo::isMov(unsigned Opcode) const {
+  switch(Opcode) {
+  default:
+    return false;
+  case AMDGPU::MOV:
+  case AMDGPU::MOV_IMM_F32:
+  case AMDGPU::MOV_IMM_I32:
+    return true;
+  }
+}
+
+bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
+  return false;
+}
+
+bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
+  switch(Opcode) {
+    default: return false;
+    case AMDGPU::CUBE_r600_pseudo:
+    case AMDGPU::CUBE_r600_real:
+    case AMDGPU::CUBE_eg_pseudo:
+    case AMDGPU::CUBE_eg_real:
+      return true;
+  }
+}
+
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return (TargetFlags & R600_InstFlag::ALU_INST);
+}
+
+bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::OP1) |
+          (TargetFlags & R600_InstFlag::OP2) |
+          (TargetFlags & R600_InstFlag::OP3));
+}
+
+bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::LDS_1A) |
+          (TargetFlags & R600_InstFlag::LDS_1A1D) |
+          (TargetFlags & R600_InstFlag::LDS_1A2D));
+}
+
+bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+}
+
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const {
+  if (isALUInstr(MI.getOpcode()))
+    return true;
+  if (isVector(MI) || isCubeOp(MI.getOpcode()))
+    return true;
+  switch (MI.getOpcode()) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::COPY:
+  case AMDGPU::DOT_4:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
+  if (ST.hasCaymanISA())
+    return false;
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
+}
+
+bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const {
+  return isTransOnly(MI.getOpcode());
+}
+
+bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+}
+
+bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const {
+  return isVectorOnly(MI.getOpcode());
+}
+
+bool R600InstrInfo::isExport(unsigned Opcode) const {
+  return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT);
+}
+
+bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
+  return ST.hasVertexCache() && IS_VTX(get(Opcode));
+}
+
+bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+         usesVertexCache(MI.getOpcode());
+}
+
+bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
+  return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
+}
+
+bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
+          usesVertexCache(MI.getOpcode())) ||
+         usesTextureCache(MI.getOpcode());
+}
+
+bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::KILLGT:
+  case AMDGPU::GROUP_BARRIER:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const {
+  return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const {
+  return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
+  if (!isALUInstr(MI.getOpcode())) {
+    return false;
+  }
+  for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+                                        E = MI.operands_end();
+       I != E; ++I) {
+    if (!I->isReg() || !I->isUse() ||
+        TargetRegisterInfo::isVirtualRegister(I->getReg()))
+      continue;
+
+    if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+      return true;
+  }
+  return false;
+}
+
+int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
+  static const unsigned SrcSelTable[][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+    {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+    {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+    {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+    {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+    {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+    {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+    {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+    {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+  };
+
+  for (const auto &Row : SrcSelTable) {
+    if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) {
+      return getOperandIdx(Opcode, Row[1]);
+    }
+  }
+  return -1;
+}
+
+SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+R600InstrInfo::getSrcs(MachineInstr &MI) const {
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
+
+  if (MI.getOpcode() == AMDGPU::DOT_4) {
+    static const unsigned OpTable[8][2] = {
+      {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+      {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+      {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+      {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+      {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+      {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+      {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+      {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+    };
+
+    for (unsigned j = 0; j < 8; j++) {
+      MachineOperand &MO =
+          MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
+      unsigned Reg = MO.getReg();
+      if (Reg == AMDGPU::ALU_CONST) {
+        MachineOperand &Sel =
+            MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+        Result.push_back(std::make_pair(&MO, Sel.getImm()));
+        continue;
+      }
+
+    }
+    return Result;
+  }
+
+  static const unsigned OpTable[3][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+  };
+
+  for (unsigned j = 0; j < 3; j++) {
+    int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]);
+    if (SrcIdx < 0)
+      break;
+    MachineOperand &MO = MI.getOperand(SrcIdx);
+    unsigned Reg = MO.getReg();
+    if (Reg == AMDGPU::ALU_CONST) {
+      MachineOperand &Sel =
+          MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
+      Result.push_back(std::make_pair(&MO, Sel.getImm()));
+      continue;
+    }
+    if (Reg == AMDGPU::ALU_LITERAL_X) {
+      MachineOperand &Operand =
+          MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));
+      if (Operand.isImm()) {
+        Result.push_back(std::make_pair(&MO, Operand.getImm()));
+        continue;
+      }
+      assert(Operand.isGlobal());
+    }
+    Result.push_back(std::make_pair(&MO, 0));
+  }
+  return Result;
+}
+
+std::vector<std::pair<int, unsigned>>
+R600InstrInfo::ExtractSrcs(MachineInstr &MI,
+                           const DenseMap<unsigned, unsigned> &PV,
+                           unsigned &ConstCount) const {
+  ConstCount = 0;
+  const std::pair<int, unsigned> DummyPair(-1, 0);
+  std::vector<std::pair<int, unsigned> > Result;
+  unsigned i = 0;
+  for (const auto &Src : getSrcs(MI)) {
+    ++i;
+    unsigned Reg = Src.first->getReg();
+    int Index = RI.getEncodingValue(Reg) & 0xff;
+    if (Reg == AMDGPU::OQAP) {
+      Result.push_back(std::make_pair(Index, 0U));
+    }
+    if (PV.find(Reg) != PV.end()) {
+      // 255 is used to tells its a PS/PV reg
+      Result.push_back(std::make_pair(255, 0U));
+      continue;
+    }
+    if (Index > 127) {
+      ConstCount++;
+      Result.push_back(DummyPair);
+      continue;
+    }
+    unsigned Chan = RI.getHWRegChan(Reg);
+    Result.push_back(std::make_pair(Index, Chan));
+  }
+  for (; i < 3; ++i)
+    Result.push_back(DummyPair);
+  return Result;
+}
+
+static std::vector<std::pair<int, unsigned> >
+Swizzle(std::vector<std::pair<int, unsigned> > Src,
+        R600InstrInfo::BankSwizzle Swz) {
+  if (Src[0] == Src[1])
+    Src[1].first = -1;
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210:
+    break;
+  case R600InstrInfo::ALU_VEC_021_SCL_122:
+    std::swap(Src[1], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_102_SCL_221:
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_120_SCL_212:
+    std::swap(Src[0], Src[1]);
+    std::swap(Src[0], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_201:
+    std::swap(Src[0], Src[2]);
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_210:
+    std::swap(Src[0], Src[2]);
+    break;
+  }
+  return Src;
+}
+
+static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210: {
+    unsigned Cycles[3] = { 2, 1, 0};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_021_SCL_122: {
+    unsigned Cycles[3] = { 1, 2, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_120_SCL_212: {
+    unsigned Cycles[3] = { 2, 1, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_102_SCL_221: {
+    unsigned Cycles[3] = { 2, 2, 1};
+    return Cycles[Op];
+  }
+  default:
+    llvm_unreachable("Wrong Swizzle for Trans Slot");
+  }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned  R600InstrInfo::isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  int Vector[4][3];
+  memset(Vector, -1, sizeof(Vector));
+  for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
+    const std::vector<std::pair<int, unsigned> > &Srcs =
+        Swizzle(IGSrcs[i], Swz[i]);
+    for (unsigned j = 0; j < 3; j++) {
+      const std::pair<int, unsigned> &Src = Srcs[j];
+      if (Src.first < 0 || Src.first == 255)
+        continue;
+      if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
+            // The value from output queue A (denoted by register OQAP) can
+            // only be fetched during the first cycle.
+            return false;
+        }
+        // OQAP does not count towards the normal read port restrictions
+        continue;
+      }
+      if (Vector[Src.second][j] < 0)
+        Vector[Src.second][j] = Src.first;
+      if (Vector[Src.second][j] != Src.first)
+        return i;
+    }
+  }
+  // Now check Trans Alu
+  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransSrcs[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (Src.first == 255)
+      continue;
+    if (Vector[Src.second][Cycle] < 0)
+      Vector[Src.second][Cycle] = Src.first;
+    if (Vector[Src.second][Cycle] != Src.first)
+      return IGSrcs.size() - 1;
+  }
+  return IGSrcs.size();
+}
+
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    unsigned Idx) {
+  assert(Idx < SwzCandidate.size());
+  int ResetIdx = Idx;
+  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+    ResetIdx --;
+  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+  }
+  if (ResetIdx == -1)
+    return false;
+  int NextSwizzle = SwzCandidate[ResetIdx] + 1;
+  SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle;
+  return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  unsigned ValidUpTo = 0;
+  do {
+    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+    if (ValidUpTo == IGSrcs.size())
+      return true;
+  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
+  return false;
+}
+
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                  const std::vector<std::pair<int, unsigned> > &TransOps,
+                  unsigned ConstCount) {
+  // TransALU can't read 3 constants
+  if (ConstCount > 2)
+    return false;
+  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransOps[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (ConstCount > 0 && Cycle == 0)
+      return false;
+    if (ConstCount > 1 && Cycle == 1)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
+                                       const DenseMap<unsigned, unsigned> &PV,
+                                       std::vector<BankSwizzle> &ValidSwizzle,
+                                       bool isLastAluTrans)
+    const {
+  //Todo : support shared src0 - src1 operand
+
+  std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+  ValidSwizzle.clear();
+  unsigned ConstCount;
+  BankSwizzle TransBS = ALU_VEC_012_SCL_210;
+  for (unsigned i = 0, e = IG.size(); i < e; ++i) {
+    IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
+    unsigned Op = getOperandIdx(IG[i]->getOpcode(),
+        AMDGPU::OpName::bank_swizzle);
+    ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
+        IG[i]->getOperand(Op).getImm());
+  }
+  std::vector<std::pair<int, unsigned> > TransOps;
+  if (!isLastAluTrans)
+    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+  TransOps = std::move(IGSrcs.back());
+  IGSrcs.pop_back();
+  ValidSwizzle.pop_back();
+
+  static const R600InstrInfo::BankSwizzle TransSwz[] = {
+    ALU_VEC_012_SCL_210,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221
+  };
+  for (unsigned i = 0; i < 4; i++) {
+    TransBS = TransSwz[i];
+    if (!isConstCompatible(TransBS, TransOps, ConstCount))
+      continue;
+    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+        TransBS);
+    if (Result) {
+      ValidSwizzle.push_back(TransBS);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+    const {
+  assert (Consts.size() <= 12 && "Too many operands in instructions group");
+  unsigned Pair1 = 0, Pair2 = 0;
+  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+    unsigned ReadConstHalf = Consts[i] & 2;
+    unsigned ReadConstIndex = Consts[i] & (~3);
+    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+    if (!Pair1) {
+      Pair1 = ReadHalfConst;
+      continue;
+    }
+    if (Pair1 == ReadHalfConst)
+      continue;
+    if (!Pair2) {
+      Pair2 = ReadHalfConst;
+      continue;
+    }
+    if (Pair2 != ReadHalfConst)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
+  std::vector<unsigned> Consts;
+  SmallSet<int64_t, 4> Literals;
+  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
+    MachineInstr &MI = *MIs[i];
+    if (!isALUInstr(MI.getOpcode()))
+      continue;
+
+    for (const auto &Src : getSrcs(MI)) {
+      if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+        Literals.insert(Src.second);
+      if (Literals.size() > 4)
+        return false;
+      if (Src.first->getReg() == AMDGPU::ALU_CONST)
+        Consts.push_back(Src.second);
+      if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
+          AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+        unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
+        unsigned Chan = RI.getHWRegChan(Src.first->getReg());
+        Consts.push_back((Index << 2) | Chan);
+      }
+    }
+  }
+  return fitsConstReadLimitations(Consts);
+}
+
+DFAPacketizer *
+R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const R600Subtarget &>(STI).createDFAPacketizer(II);
+}
+
+static bool
+isPredicateSetter(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::PRED_X:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static MachineInstr *
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) {
+  while (I != MBB.begin()) {
+    --I;
+    MachineInstr &MI = *I;
+    if (isPredicateSetter(MI.getOpcode()))
+      return &MI;
+  }
+
+  return nullptr;
+}
+
+static
+bool isJump(unsigned Opcode) {
+  return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+}
+
+static bool isBranch(unsigned Opcode) {
+  return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
+      Opcode == AMDGPU::BRANCH_COND_f32;
+}
+
+bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+  // Most of the following comes from the ARM implementation of AnalyzeBranch
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false;
+
+  // AMDGPU::BRANCH* instructions are only available after isel and are not
+  // handled
+  if (isBranch(I->getOpcode()))
+    return true;
+  if (!isJump(I->getOpcode())) {
+    return false;
+  }
+
+  // Remove successive JUMP
+  while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
+      MachineBasicBlock::iterator PriorI = std::prev(I);
+      if (AllowModify)
+        I->removeFromParent();
+      I = PriorI;
+  }
+  MachineInstr &LastInst = *I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst.getOpcode();
+  if (I == MBB.begin() || !isJump((--I)->getOpcode())) {
+    if (LastOpc == AMDGPU::JUMP) {
+      TBB = LastInst.getOperand(0).getMBB();
+      return false;
+    } else if (LastOpc == AMDGPU::JUMP_COND) {
+      auto predSet = I;
+      while (!isPredicateSetter(predSet->getOpcode())) {
+        predSet = --I;
+      }
+      TBB = LastInst.getOperand(0).getMBB();
+      Cond.push_back(predSet->getOperand(1));
+      Cond.push_back(predSet->getOperand(2));
+      Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr &SecondLastInst = *I;
+  unsigned SecondLastOpc = SecondLastInst.getOpcode();
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+    auto predSet = --I;
+    while (!isPredicateSetter(predSet->getOpcode())) {
+      predSet = --I;
+    }
+    TBB = SecondLastInst.getOperand(0).getMBB();
+    FBB = LastInst.getOperand(0).getMBB();
+    Cond.push_back(predSet->getOperand(1));
+    Cond.push_back(predSet->getOperand(2));
+    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+static
+MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
+      It != E; ++It) {
+    if (It->getOpcode() == AMDGPU::CF_ALU ||
+        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+      return It.getReverse();
+  }
+  return MBB.end();
+}
+
+unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *TBB,
+                                     MachineBasicBlock *FBB,
+                                     ArrayRef<MachineOperand> Cond,
+                                     const DebugLoc &DL,
+                                     int *BytesAdded) const {
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  if (!FBB) {
+    if (Cond.empty()) {
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+      return 1;
+    } else {
+      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+      assert(PredSet && "No previous predicate !");
+      addFlag(*PredSet, 0, MO_FLAG_PUSH);
+      PredSet->getOperand(2).setImm(Cond[1].getImm());
+
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+             .addMBB(TBB)
+             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+      if (CfAlu == MBB.end())
+        return 1;
+      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+      return 1;
+    }
+  } else {
+    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+    assert(PredSet && "No previous predicate !");
+    addFlag(*PredSet, 0, MO_FLAG_PUSH);
+    PredSet->getOperand(2).setImm(Cond[1].getImm());
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+            .addMBB(TBB)
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      return 2;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
+    return 2;
+  }
+}
+
+unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                     int *BytesRemoved) const {
+    assert(!BytesRemoved && "code size not handled");
+
+  // Note : we leave PRED* instructions there.
+  // They may be needed when predicating instructions.
+
+  MachineBasicBlock::iterator I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 0;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  default:
+    return 0;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(*predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 1;
+  }
+  --I;
+  switch (I->getOpcode()) {
+    // FIXME: only one case??
+  default:
+    return 1;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(*predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  return 2;
+}
+
+bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
+  int idx = MI.findFirstPredOperandIdx();
+  if (idx < 0)
+    return false;
+
+  unsigned Reg = MI.getOperand(idx).getReg();
+  switch (Reg) {
+  default: return false;
+  case AMDGPU::PRED_SEL_ONE:
+  case AMDGPU::PRED_SEL_ZERO:
+  case AMDGPU::PREDICATE_BIT:
+    return true;
+  }
+}
+
+bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
+  // XXX: KILL* instructions can be predicated, but they must be the last
+  // instruction in a clause, so this means any instructions after them cannot
+  // be predicated.  Until we have proper support for instruction clauses in the
+  // backend, we will mark KILL* instructions as unpredicable.
+
+  if (MI.getOpcode() == AMDGPU::KILLGT) {
+    return false;
+  } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
+    // If the clause start in the middle of MBB then the MBB has more
+    // than a single clause, unable to predicate several clauses.
+    if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
+      return false;
+    // TODO: We don't support KC merging atm
+    return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
+  } else if (isVector(MI)) {
+    return false;
+  } else {
+    return AMDGPUInstrInfo::isPredicable(MI);
+  }
+}
+
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                   unsigned NumCyles,
+                                   unsigned ExtraPredCycles,
+                                   BranchProbability Probability) const{
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumTCycles,
+                                   unsigned ExtraTCycles,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumFCycles,
+                                   unsigned ExtraFCycles,
+                                   BranchProbability Probability) const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                         unsigned NumCyles,
+                                         BranchProbability Probability)
+                                         const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                         MachineBasicBlock &FMBB) const {
+  return false;
+}
+
+
+bool
+R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  MachineOperand &MO = Cond[1];
+  switch (MO.getImm()) {
+  case AMDGPU::PRED_SETE_INT:
+    MO.setImm(AMDGPU::PRED_SETNE_INT);
+    break;
+  case AMDGPU::PRED_SETNE_INT:
+    MO.setImm(AMDGPU::PRED_SETE_INT);
+    break;
+  case AMDGPU::PRED_SETE:
+    MO.setImm(AMDGPU::PRED_SETNE);
+    break;
+  case AMDGPU::PRED_SETNE:
+    MO.setImm(AMDGPU::PRED_SETE);
+    break;
+  default:
+    return true;
+  }
+
+  MachineOperand &MO2 = Cond[2];
+  switch (MO2.getReg()) {
+  case AMDGPU::PRED_SEL_ZERO:
+    MO2.setReg(AMDGPU::PRED_SEL_ONE);
+    break;
+  case AMDGPU::PRED_SEL_ONE:
+    MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+    break;
+  default:
+    return true;
+  }
+  return false;
+}
+
+bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
+                                     std::vector<MachineOperand> &Pred) const {
+  return isPredicateSetter(MI.getOpcode());
+}
+
+
+bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
+                                         ArrayRef<MachineOperand> Pred) const {
+  int PIdx = MI.findFirstPredOperandIdx();
+
+  if (MI.getOpcode() == AMDGPU::CF_ALU) {
+    MI.getOperand(8).setImm(0);
+    return true;
+  }
+
+  if (MI.getOpcode() == AMDGPU::DOT_4) {
+    MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X))
+        .setReg(Pred[2].getReg());
+    MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y))
+        .setReg(Pred[2].getReg());
+    MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z))
+        .setReg(Pred[2].getReg());
+    MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W))
+        .setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI.getOperand(PIdx);
+    PMO.setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const {
+  return 2;
+}
+
+unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr &,
+                                            unsigned *PredCost) const {
+  if (PredCost)
+    *PredCost = 2;
+  return 2;
+}
+
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                   unsigned Channel) const {
+  assert(Channel == 0);
+  return RegIndex;
+}
+
+bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: {
+    MachineBasicBlock *MBB = MI.getParent();
+    int OffsetOpIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr);
+    // addr is a custom operand with multiple MI operands, and only the
+    // first MI operand is given a name.
+    int RegOpIdx = OffsetOpIdx + 1;
+    int ChanOpIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan);
+    if (isRegisterLoad(MI)) {
+      int DstOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+      unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+      unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+      unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+      unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+      if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+        buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
+                      getIndirectAddrRegClass()->getRegister(Address));
+      } else {
+        buildIndirectRead(MBB, MI, MI.getOperand(DstOpIdx).getReg(), Address,
+                          OffsetReg);
+      }
+    } else if (isRegisterStore(MI)) {
+      int ValOpIdx =
+          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val);
+      unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
+      unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
+      unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+      unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+      if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+        buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+                      MI.getOperand(ValOpIdx).getReg());
+      } else {
+        buildIndirectWrite(MBB, MI, MI.getOperand(ValOpIdx).getReg(),
+                           calculateIndirectAddress(RegIndex, Channel),
+                           OffsetReg);
+      }
+    } else {
+      return false;
+    }
+
+    MBB->erase(MI);
+    return true;
+  }
+  case AMDGPU::R600_EXTRACT_ELT_V2:
+  case AMDGPU::R600_EXTRACT_ELT_V4:
+    buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(),
+                      RI.getHWRegIndex(MI.getOperand(1).getReg()), //  Address
+                      MI.getOperand(2).getReg(),
+                      RI.getHWRegChan(MI.getOperand(1).getReg()));
+    break;
+  case AMDGPU::R600_INSERT_ELT_V2:
+  case AMDGPU::R600_INSERT_ELT_V4:
+    buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value
+                       RI.getHWRegIndex(MI.getOperand(1).getReg()),   // Address
+                       MI.getOperand(3).getReg(),                     // Offset
+                       RI.getHWRegChan(MI.getOperand(1).getReg()));   // Channel
+    break;
+  }
+  MI.eraseFromParent();
+  return true;
+}
+
+void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                             const MachineFunction &MF) const {
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  const R600FrameLowering *TFL = ST.getFrameLowering();
+
+  unsigned StackWidth = TFL->getStackWidth(MF);
+  int End = getIndirectIndexEnd(MF);
+
+  if (End == -1)
+    return;
+
+  for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
+    unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
+    Reserved.set(SuperReg);
+    for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+      Reserved.set(Reg);
+    }
+  }
+}
+
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::R600_TReg32_XRegClass;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  unsigned AddrReg;
+  switch (AddrChan) {
+    default: llvm_unreachable("Invalid Channel");
+    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+  }
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                               AMDGPU::AR_X, OffsetReg);
+  setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
+
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      AddrReg, ValueReg)
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1);
+  return Mov;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg,
+                                       unsigned AddrChan) const {
+  unsigned AddrReg;
+  switch (AddrChan) {
+    default: llvm_unreachable("Invalid Channel");
+    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
+    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
+    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
+    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
+  }
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                                       AMDGPU::AR_X,
+                                                       OffsetReg);
+  setImmOperand(*MOVA, AMDGPU::OpName::write, 0);
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      ValueReg,
+                                      AddrReg)
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1);
+
+  return Mov;
+}
+
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  int Offset = -1;
+
+  if (MFI.getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    unsigned Reg = LI->first;
+    if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+        !IndirectRC->contains(Reg))
+      continue;
+
+    unsigned RegIndex;
+    unsigned RegEnd;
+    for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+                                                          ++RegIndex) {
+      if (IndirectRC->getRegister(RegIndex) == Reg)
+        break;
+    }
+    Offset = std::max(Offset, (int)RegIndex);
+  }
+
+  return Offset + 1;
+}
+
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  if (MFI.hasVarSizedObjects()) {
+    return -1;
+  }
+
+  if (MFI.getNumObjects() == 0) {
+    return -1;
+  }
+
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  const R600FrameLowering *TFL = ST.getFrameLowering();
+
+  unsigned IgnoredFrameReg;
+  Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
+unsigned R600InstrInfo::getMaxAlusPerClause() const {
+  return 115;
+}
+
+MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
+                                                  MachineBasicBlock::iterator I,
+                                                  unsigned Opcode,
+                                                  unsigned DstReg,
+                                                  unsigned Src0Reg,
+                                                  unsigned Src1Reg) const {
+  MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
+    DstReg);           // $dst
+
+  if (Src1Reg) {
+    MIB.addImm(0)     // $update_exec_mask
+       .addImm(0);    // $update_predicate
+  }
+  MIB.addImm(1)        // $write
+     .addImm(0)        // $omod
+     .addImm(0)        // $dst_rel
+     .addImm(0)        // $dst_clamp
+     .addReg(Src0Reg)  // $src0
+     .addImm(0)        // $src0_neg
+     .addImm(0)        // $src0_rel
+     .addImm(0)        // $src0_abs
+     .addImm(-1);       // $src0_sel
+
+  if (Src1Reg) {
+    MIB.addReg(Src1Reg) // $src1
+       .addImm(0)       // $src1_neg
+       .addImm(0)       // $src1_rel
+       .addImm(0)       // $src1_abs
+       .addImm(-1);      // $src1_sel
+  }
+
+  //XXX: The r600g finalizer expects this to be 1, once we've moved the
+  //scheduling to the backend, we can change the default to 0.
+  MIB.addImm(1)        // $last
+      .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+      .addImm(0)         // $literal
+      .addImm(0);        // $bank_swizzle
+
+  return MIB;
+}
+
+#define OPERAND_CASE(Label) \
+  case Label: { \
+    static const unsigned Ops[] = \
+    { \
+      Label##_X, \
+      Label##_Y, \
+      Label##_Z, \
+      Label##_W \
+    }; \
+    return Ops[Slot]; \
+  }
+
+static unsigned getSlotedOps(unsigned  Op, unsigned Slot) {
+  switch (Op) {
+  OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
+  OPERAND_CASE(AMDGPU::OpName::update_pred)
+  OPERAND_CASE(AMDGPU::OpName::write)
+  OPERAND_CASE(AMDGPU::OpName::omod)
+  OPERAND_CASE(AMDGPU::OpName::dst_rel)
+  OPERAND_CASE(AMDGPU::OpName::clamp)
+  OPERAND_CASE(AMDGPU::OpName::src0)
+  OPERAND_CASE(AMDGPU::OpName::src0_neg)
+  OPERAND_CASE(AMDGPU::OpName::src0_rel)
+  OPERAND_CASE(AMDGPU::OpName::src0_abs)
+  OPERAND_CASE(AMDGPU::OpName::src0_sel)
+  OPERAND_CASE(AMDGPU::OpName::src1)
+  OPERAND_CASE(AMDGPU::OpName::src1_neg)
+  OPERAND_CASE(AMDGPU::OpName::src1_rel)
+  OPERAND_CASE(AMDGPU::OpName::src1_abs)
+  OPERAND_CASE(AMDGPU::OpName::src1_sel)
+  OPERAND_CASE(AMDGPU::OpName::pred_sel)
+  default:
+    llvm_unreachable("Wrong Operand");
+  }
+}
+
+#undef OPERAND_CASE
+
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+    MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+    const {
+  assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+  unsigned Opcode;
+  if (ST.getGeneration() <= R600Subtarget::R700)
+    Opcode = AMDGPU::DOT4_r600;
+  else
+    Opcode = AMDGPU::DOT4_eg;
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &Src0 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+  MachineOperand &Src1 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+  MachineInstr *MIB = buildDefaultInstruction(
+      MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+  static const unsigned  Operands[14] = {
+    AMDGPU::OpName::update_exec_mask,
+    AMDGPU::OpName::update_pred,
+    AMDGPU::OpName::write,
+    AMDGPU::OpName::omod,
+    AMDGPU::OpName::dst_rel,
+    AMDGPU::OpName::clamp,
+    AMDGPU::OpName::src0_neg,
+    AMDGPU::OpName::src0_rel,
+    AMDGPU::OpName::src0_abs,
+    AMDGPU::OpName::src0_sel,
+    AMDGPU::OpName::src1_neg,
+    AMDGPU::OpName::src1_rel,
+    AMDGPU::OpName::src1_abs,
+    AMDGPU::OpName::src1_sel,
+  };
+
+  MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+      getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
+  MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+      .setReg(MO.getReg());
+
+  for (unsigned i = 0; i < 14; i++) {
+    MachineOperand &MO = MI->getOperand(
+        getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
+    assert (MO.isImm());
+    setImmOperand(*MIB, Operands[i], MO.getImm());
+  }
+  MIB->getOperand(20).setImm(0);
+  return MIB;
+}
+
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         uint64_t Imm) const {
+  MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
+                                                  AMDGPU::ALU_LITERAL_X);
+  setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm);
+  return MovImm;
+}
+
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DstReg, unsigned SrcReg) const {
+  return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+}
+
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
+
+int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
+  return AMDGPU::getNamedOperandIdx(Opcode, Op);
+}
+
+void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op,
+                                  int64_t Imm) const {
+  int Idx = getOperandIdx(MI, Op);
+  assert(Idx != -1 && "Operand not supported for this instruction.");
+  assert(MI.getOperand(Idx).isImm());
+  MI.getOperand(Idx).setImm(Imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction flag getters/setters
+//===----------------------------------------------------------------------===//
+
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx,
+                                         unsigned Flag) const {
+  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  int FlagIndex = 0;
+  if (Flag != 0) {
+    // If we pass something other than the default value of Flag to this
+    // function, it means we are want to set a flag on an instruction
+    // that uses native encoding.
+    assert(HAS_NATIVE_OPERANDS(TargetFlags));
+    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
+    switch (Flag) {
+    case MO_FLAG_CLAMP:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp);
+      break;
+    case MO_FLAG_MASK:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write);
+      break;
+    case MO_FLAG_NOT_LAST:
+    case MO_FLAG_LAST:
+      FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last);
+      break;
+    case MO_FLAG_NEG:
+      switch (SrcIdx) {
+      case 0:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg);
+        break;
+      case 1:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg);
+        break;
+      case 2:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg);
+        break;
+      }
+      break;
+
+    case MO_FLAG_ABS:
+      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
+                       "instructions.");
+      (void)IsOP3;
+      switch (SrcIdx) {
+      case 0:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs);
+        break;
+      case 1:
+        FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs);
+        break;
+      }
+      break;
+
+    default:
+      FlagIndex = -1;
+      break;
+    }
+    assert(FlagIndex != -1 && "Flag not supported for this instruction");
+  } else {
+      FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
+      assert(FlagIndex != 0 &&
+         "Instruction flags not supported for this instruction");
+  }
+
+  MachineOperand &FlagOp = MI.getOperand(FlagIndex);
+  assert(FlagOp.isImm());
+  return FlagOp;
+}
+
+void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand,
+                            unsigned Flag) const {
+  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  if (Flag == 0) {
+    return;
+  }
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    if (Flag == MO_FLAG_NOT_LAST) {
+      clearFlag(MI, Operand, MO_FLAG_LAST);
+    } else if (Flag == MO_FLAG_MASK) {
+      clearFlag(MI, Operand, Flag);
+    } else {
+      FlagOp.setImm(1);
+    }
+  } else {
+      MachineOperand &FlagOp = getFlagOp(MI, Operand);
+      FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
+  }
+}
+
+void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand,
+                              unsigned Flag) const {
+  unsigned TargetFlags = get(MI.getOpcode()).TSFlags;
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    FlagOp.setImm(0);
+  } else {
+    MachineOperand &FlagOp = getFlagOp(MI);
+    unsigned InstFlags = FlagOp.getImm();
+    InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
+    FlagOp.setImm(InstFlags);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
new file mode 100644
index 000000000000..a280052dbd4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -0,0 +1,331 @@
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600InstrInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "R600RegisterInfo.h"
+
+namespace llvm {
+
+namespace R600InstrFlags {
+enum : uint64_t {
+ REGISTER_STORE = UINT64_C(1) << 62,
+ REGISTER_LOAD = UINT64_C(1) << 63
+};
+}
+
+class AMDGPUTargetMachine;
+class DFAPacketizer;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+class R600Subtarget;
+
+class R600InstrInfo final : public AMDGPUInstrInfo {
+private:
+  const R600RegisterInfo RI;
+  const R600Subtarget &ST;
+
+  std::vector<std::pair<int, unsigned>>
+  ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV,
+              unsigned &ConstCount) const;
+
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg,
+                                        unsigned AddrChan) const;
+
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned ValueReg, unsigned Address,
+                                         unsigned OffsetReg,
+                                         unsigned AddrChan) const;
+public:
+  enum BankSwizzle {
+    ALU_VEC_012_SCL_210 = 0,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221,
+    ALU_VEC_201,
+    ALU_VEC_210
+  };
+
+  explicit R600InstrInfo(const R600Subtarget &);
+
+  const R600RegisterInfo &getRegisterInfo() const {
+    return RI;
+  }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const override;
+
+  bool isReductionOp(unsigned opcode) const;
+  bool isCubeOp(unsigned opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+  bool hasInstrModifiers(unsigned Opcode) const;
+  bool isLDSInstr(unsigned Opcode) const;
+  bool isLDSRetInstr(unsigned Opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction or an
+  /// instruction that will be lowered in ExpandSpecialInstrs Pass.
+  bool canBeConsideredALU(const MachineInstr &MI) const;
+
+  bool isTransOnly(unsigned Opcode) const;
+  bool isTransOnly(const MachineInstr &MI) const;
+  bool isVectorOnly(unsigned Opcode) const;
+  bool isVectorOnly(const MachineInstr &MI) const;
+  bool isExport(unsigned Opcode) const;
+
+  bool usesVertexCache(unsigned Opcode) const;
+  bool usesVertexCache(const MachineInstr &MI) const;
+  bool usesTextureCache(unsigned Opcode) const;
+  bool usesTextureCache(const MachineInstr &MI) const;
+
+  bool mustBeLastInClause(unsigned Opcode) const;
+  bool usesAddressRegister(MachineInstr &MI) const;
+  bool definesAddressRegister(MachineInstr &MI) const;
+  bool readsLDSSrcReg(const MachineInstr &MI) const;
+
+  /// \returns The operand Index for the Sel operand given an index to one
+  /// of the instruction's src operands.
+  int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
+
+  /// \returns a pair for each src of an ALU instructions.
+  /// The first member of a pair is the register id.
+  /// If register is ALU_CONST, second member is SEL.
+  /// If register is ALU_LITERAL, second member is IMM.
+  /// Otherwise, second member value is undefined.
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+  getSrcs(MachineInstr &MI) const;
+
+  unsigned  isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  bool FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
+  /// returns true and the first (in lexical order) BankSwizzle affectation
+  /// starting from the one already provided in the Instruction Group MIs that
+  /// fits Read Port limitations in BS if available. Otherwise returns false
+  /// and undefined content in BS.
+  /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+  /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+  /// apply to the last instruction.
+  /// PV holds GPR to PV registers in the Instruction Group MIs.
+  bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
+                               const DenseMap<unsigned, unsigned> &PV,
+                               std::vector<BankSwizzle> &BS,
+                               bool isLastAluTrans) const;
+
+  /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+  /// from KCache bank on R700+. This function check if MI set in input meet
+  /// this limitations
+  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+  /// Same but using const index set instead of MI set.
+  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+
+  /// \brief Vector instructions are instructions that must fill all
+  /// instruction slots within an instruction group.
+  bool isVector(const MachineInstr &MI) const;
+
+  bool isMov(unsigned Opcode) const;
+
+  DFAPacketizer *
+  CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
+
+  bool reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemvoed = nullptr) const override;
+
+  bool isPredicated(const MachineInstr &MI) const override;
+
+  bool isPredicable(MachineInstr &MI) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                                 BranchProbability Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                           unsigned ExtraPredCycles,
+                           BranchProbability Probability) const override ;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumTCycles, unsigned ExtraTCycles,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumFCycles, unsigned ExtraFCycles,
+                           BranchProbability Probability) const override;
+
+  bool DefinesPredicate(MachineInstr &MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override;
+
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Pred) const override;
+
+  unsigned int getPredicationCost(const MachineInstr &) const override;
+
+  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
+                               const MachineInstr &MI,
+                               unsigned *PredCost = nullptr) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  /// \brief Reserve the registers that may be accesed using indirect addressing.
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
+
+  /// Calculate the "Indirect Address" for the given \p RegIndex and
+  /// \p Channel
+  ///
+  /// We model indirect addressing using a virtual address space that can be
+  /// accesed with loads and stores.  The "Indirect Address" is the memory
+  /// address in this virtual address space that maps to the given \p RegIndex
+  /// and \p Channel.
+  unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const;
+
+
+  /// \returns The register class to be used for loading and storing values
+  /// from an "Indirect Address" .
+  const TargetRegisterClass *getIndirectAddrRegClass() const;
+
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+  /// \brief Build instruction(s) for an indirect register write.
+  ///
+  /// \returns The instruction that performs the indirect register write
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned ValueReg, unsigned Address,
+                                         unsigned OffsetReg) const;
+
+  /// \brief Build instruction(s) for an indirect register read.
+  ///
+  /// \returns The instruction that performs the indirect register read
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg) const;
+
+  unsigned getMaxAlusPerClause() const;
+
+  /// buildDefaultInstruction - This function returns a MachineInstr with all
+  /// the instruction modifiers initialized to their default values.  You can
+  /// use this function to avoid manually specifying each instruction modifier
+  /// operand when building a new instruction.
+  ///
+  /// \returns a MachineInstr with all the instruction modifiers initialized
+  /// to their default values.
+  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              unsigned Opcode,
+                                              unsigned DstReg,
+                                              unsigned Src0Reg,
+                                              unsigned Src1Reg = 0) const;
+
+  MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+                                             MachineInstr *MI,
+                                             unsigned Slot,
+                                             unsigned DstReg) const;
+
+  MachineInstr *buildMovImm(MachineBasicBlock &BB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DstReg,
+                            uint64_t Imm) const;
+
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const;
+
+  /// \brief Get the index of Op in the MachineInstr.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
+
+  /// \brief Get the index of \p Op for the given Opcode.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(unsigned Opcode, unsigned Op) const;
+
+  /// \brief Helper function for setting instruction flag values.
+  void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const;
+
+  ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+  void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  ///\brief Determine if the specified \p Flag is set on this \p Operand.
+  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
+  /// \param Flag The flag being set.
+  ///
+  /// \returns the operand containing the flags for this instruction.
+  MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0,
+                            unsigned Flag = 0) const;
+
+  /// \brief Clear the specified flag on the instruction.
+  void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  // Helper functions that check the opcode for status information
+  bool isRegisterStore(const MachineInstr &MI) const {
+    return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_STORE;
+  }
+
+  bool isRegisterLoad(const MachineInstr &MI) const {
+    return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD;
+  }
+};
+
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
new file mode 100644
index 000000000000..3a72e0791fd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -0,0 +1,1722 @@
+//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are available on R600 family
+// GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+include "R600InstrFormats.td"
+
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> :
+    InstR600 <outs, ins, asm, pattern, NullALU> {
+
+  let Namespace = "AMDGPU";
+}
+
+def MEMxi : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
+  let PrintMethod = "printMemOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+// Operands for non-registers
+
+class InstFlag<string PM = "printOperand", int Default = 0>
+    : OperandWithDefaultOps <i32, (ops (i32 Default))> {
+  let PrintMethod = PM;
+}
+
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+  let PrintMethod = "printSel";
+}
+def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
+  let PrintMethod = "printBankSwizzle";
+}
+
+def LITERAL : InstFlag<"printLiteral">;
+
+def WRITE : InstFlag <"printWrite", 1>;
+def OMOD : InstFlag <"printOMOD">;
+def REL : InstFlag <"printRel">;
+def CLAMP : InstFlag <"printClamp">;
+def NEG : InstFlag <"printNeg">;
+def ABS : InstFlag <"printAbs">;
+def UEM : InstFlag <"printUpdateExecMask">;
+def UP : InstFlag <"printUpdatePred">;
+
+// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
+// Once we start using the packetizer in this backend we should have this
+// default to 0.
+def LAST : InstFlag<"printLast", 1>;
+def RSel : Operand<i32> {
+  let PrintMethod = "printRSel";
+}
+def CT: Operand<i32> {
+  let PrintMethod = "printCT";
+}
+
+def FRAMEri : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
+def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+
+
+def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
+                                     (ops PRED_SEL_OFF)>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+// Class for instructions with only one source register.
+// If you add new ins to this instruction, make sure they are listed before
+// $literal, because the backend currently assumes that the last operand is
+// a literal.  Also be sure to update the enum R600Op1OperandIndex::ROI in
+// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
+// and R600InstrInfo::getOperandIdx().
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <(outs R600_Reg32:$dst),
+              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+                   BANK_SWIZZLE:$bank_swizzle),
+              !strconcat("  ", opName,
+                   "$clamp $last $dst$write$dst_rel$omod, "
+                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$pred_sel $bank_swizzle"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+  let HasNativeOperands = 1;
+  let Op1 = 1;
+  let ALUInst = 1;
+  let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                    InstrItinClass itin = AnyALU> :
+    R600_1OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
+>;
+
+// If you add or change the operands for R600_2OP instructions, you must
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
+               OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+               BANK_SWIZZLE:$bank_swizzle),
+          !strconcat("  ", opName,
+                "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+                "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+                "$pred_sel $bank_swizzle"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let HasNativeOperands = 1;
+  let Op2 = 1;
+  let ALUInst = 1;
+  let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_2OP_Helper <bits<11> inst, string opName,
+                       SDPatternOperator node = null_frag,
+                       InstrItinClass itin = AnyALU> :
+    R600_2OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
+                                           R600_Reg32:$src1))], itin
+>;
+
+// If you add our change the operands for R600_3OP instructions, you must
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and
+// R600InstrInfo::getOperandIdx().
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          (ins REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
+               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+               BANK_SWIZZLE:$bank_swizzle),
+          !strconcat("  ", opName, "$clamp $last $dst$dst_rel, "
+                             "$src0_neg$src0$src0_rel, "
+                             "$src1_neg$src1$src1_rel, "
+                             "$src2_neg$src2$src2_rel, "
+                             "$pred_sel"
+                             "$bank_swizzle"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP3<inst>{
+
+  let HasNativeOperands = 1;
+  let DisableEncoding = "$literal";
+  let Op3 = 1;
+  let UseNamedOperandTable = 1;
+  let ALUInst = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = VecALU> :
+  InstR600 <(outs R600_Reg32:$dst),
+          ins,
+          asm,
+          pattern,
+          itin>;
+
+
+
+} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || TType == 13;
+  }]
+>;
+
+def TEX_RECT : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 5;
+  }]
+>;
+
+def TEX_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 9 || TType == 10 || TType == 16;
+  }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 11 || TType == 12 || TType == 17;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+                 dag outs, dag ins, string asm, list<dag> pattern> :
+    InstR600ISA <outs, ins, asm, pattern>,
+    CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF  {
+
+  let rat_id = ratid;
+  let rat_inst = ratinst;
+  let rim         = 0;
+  // XXX: Have a separate instruction for non-indexed writes.
+  let type        = 1;
+  let rw_rel      = 0;
+  let elem_size   = 0;
+
+  let array_size  = 0;
+  let comp_mask   = mask;
+  let burst_count = 0;
+  let vpm         = 0;
+  let cf_inst = cfinst;
+  let mark        = 0;
+  let barrier     = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+
+}
+
+class VTX_READ <string name, dag outs, list<dag> pattern>
+    : InstR600ISA <outs, (ins MEMxi:$src_gpr, i8imm:$buffer_id), !strconcat("  ", name, ", #$buffer_id"), pattern>,
+      VTX_WORD1_GPR {
+
+  // Static fields
+  let DST_REL = 0;
+  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
+  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
+  // however, based on my testing if USE_CONST_FIELDS is set, then all
+  // these fields need to be set to 0.
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 1;
+  let FORMAT_COMP_ALL = 0;
+  let SRF_MODE_ALL = 0;
+
+  let Inst{63-32} = Word1;
+  // LLVM can only encode 64-bit instructions, so these fields are manually
+  // encoded in R600CodeEmitter
+  //
+  // bits<16> OFFSET;
+  // bits<2>  ENDIAN_SWAP = 0;
+  // bits<1>  CONST_BUF_NO_STRIDE = 0;
+  // bits<1>  MEGA_FETCH = 0;
+  // bits<1>  ALT_CONST = 0;
+  // bits<2>  BUFFER_INDEX_MODE = 0;
+
+  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+  // is done in R600CodeEmitter
+  //
+  // Inst{79-64} = OFFSET;
+  // Inst{81-80} = ENDIAN_SWAP;
+  // Inst{82}    = CONST_BUF_NO_STRIDE;
+  // Inst{83}    = MEGA_FETCH;
+  // Inst{84}    = ALT_CONST;
+  // Inst{86-85} = BUFFER_INDEX_MODE;
+  // Inst{95-86} = 0; Reserved
+
+  // VTX_WORD3 (Padding)
+  //
+  // Inst{127-96} = 0;
+
+  let VTXInst = 1;
+}
+
+class LoadParamFrag <PatFrag load_type> : PatFrag <
+  (ops node:$ptr), (load_type node:$ptr),
+  [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
+            (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
+>;
+
+def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
+def vtx_id3_az_extloadi16 : LoadParamFrag<az_extloadi16>;
+def vtx_id3_load : LoadParamFrag<load>;
+
+class LoadVtxId1 <PatFrag load> : PatFrag <
+  (ops node:$ptr), (load node:$ptr), [{
+  const MemSDNode *LD = cast<MemSDNode>(N);
+  return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+         (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+           !isa<GlobalValue>(GetUnderlyingObject(
+           LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
+}]>;
+
+def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>;
+def vtx_id1_az_extloadi16 : LoadVtxId1 <az_extloadi16>;
+def vtx_id1_load : LoadVtxId1 <load>;
+
+class LoadVtxId2 <PatFrag load> : PatFrag <
+  (ops node:$ptr), (load node:$ptr), [{
+  const MemSDNode *LD = cast<MemSDNode>(N);
+  return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+         isa<GlobalValue>(GetUnderlyingObject(
+         LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
+}]>;
+
+def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>;
+def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>;
+def vtx_id2_load : LoadVtxId2 <load>;
+
+def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">;
+
+def isR600toCayman
+    : Predicate<
+          "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">;
+
+//===----------------------------------------------------------------------===//
+// R600 SDNodes
+//===----------------------------------------------------------------------===//
+
+def INTERP_PAIR_XY :  AMDGPUShaderInst <
+  (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def INTERP_PAIR_ZW :  AMDGPUShaderInst <
+  (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
+  "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
+  SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+  [SDNPVariadic]
+>;
+
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+  SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+      SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+      SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+  []
+>;
+
+def COS_HW : SDNode<"AMDGPUISD::COS_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def SIN_HW : SDNode<"AMDGPUISD::SIN_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
+
+def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
+
+multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
+def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+          (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
+          (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
+          (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
+          (i32 imm:$DST_SEL_W),
+          (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID),
+          (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z),
+          (i32 imm:$COORD_TYPE_W)),
+          (inst R600_Reg128:$SRC_GPR,
+          imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw,
+          imm:$offsetx, imm:$offsety, imm:$offsetz,
+          imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z,
+          imm:$DST_SEL_W,
+          imm:$RESOURCE_ID, imm:$SAMPLER_ID,
+          imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z,
+          imm:$COORD_TYPE_W)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Interpolation Instructions
+//===----------------------------------------------------------------------===//
+
+def INTERP_VEC_LOAD :  AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins i32imm:$src0),
+  "INTERP_LOAD $src0 : $dst">;
+
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
+
+//===----------------------------------------------------------------------===//
+// Export Instructions
+//===----------------------------------------------------------------------===//
+
+class ExportWord0 {
+  field bits<32> Word0;
+
+  bits<13> arraybase;
+  bits<2> type;
+  bits<7> gpr;
+  bits<2> elem_size;
+
+  let Word0{12-0} = arraybase;
+  let Word0{14-13} = type;
+  let Word0{21-15} = gpr;
+  let Word0{22} = 0; // RW_REL
+  let Word0{29-23} = 0; // INDEX_GPR
+  let Word0{31-30} = elem_size;
+}
+
+class ExportSwzWord1 {
+  field bits<32> Word1;
+
+  bits<3> sw_x;
+  bits<3> sw_y;
+  bits<3> sw_z;
+  bits<3> sw_w;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{2-0} = sw_x;
+  let Word1{5-3} = sw_y;
+  let Word1{8-6} = sw_z;
+  let Word1{11-9} = sw_w;
+}
+
+class ExportBufWord1 {
+  field bits<32> Word1;
+
+  bits<12> arraySize;
+  bits<4> compMask;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{11-0} = arraySize;
+  let Word1{15-12} = compMask;
+}
+
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
+  def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+    (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
+        (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
+        imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
+  >;
+
+}
+
+multiclass SteamOutputExportPattern<Instruction ExportInst,
+    bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
+// Stream0
+  def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf0inst, 0)>;
+// Stream1
+  def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
+      (ExportInst $src, 0, imm:$arraybase,
+      4095, imm:$mask, buf1inst, 0)>;
+// Stream2
+  def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
+      (ExportInst $src, 0, imm:$arraybase,
+      4095, imm:$mask, buf2inst, 0)>;
+// Stream3
+  def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
+      (ExportInst $src, 0, imm:$arraybase,
+      4095, imm:$mask, buf3inst, 0)>;
+}
+
+// Export Instructions should not be duplicated by TailDuplication pass
+// (which assumes that duplicable instruction are affected by exec mask)
+let usesCustomInserter = 1, isNotDuplicable = 1 in {
+
+class ExportSwzInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst,
+    i32imm:$eop),
+    !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"),
+    []>, ExportWord0, ExportSwzWord1 {
+  let elem_size = 3;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+}
+
+} // End usesCustomInserter = 1
+
+class ExportBufInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
+    !strconcat("EXPORT", " $gpr"),
+    []>, ExportWord0, ExportBufWord1 {
+  let elem_size = 0;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+
+def KCACHE : InstFlag<"printKCache">;
+
+class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
+KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
+i32imm:$COUNT, i32imm:$Enabled),
+!strconcat(OpName, " $COUNT, @$ADDR, "
+"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
+[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let ALT_CONST = 0;
+  let WHOLE_QUAD_MODE = 0;
+  let BARRIER = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class CF_WORD0_R600 {
+  field bits<32> Word0;
+
+  bits<32> ADDR;
+
+  let Word0 = ADDR;
+}
+
+class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
+  field bits<64> Inst;
+  bits<4> CNT;
+
+  let CF_INST = inst;
+  let BARRIER = 1;
+  let CF_CONST = 0;
+  let VALID_PIXEL_MODE = 0;
+  let COND = 0;
+  let COUNT = CNT{2-0};
+  let CALL_COUNT = 0;
+  let COUNT_3 = CNT{3};
+  let END_OF_PROGRAM = 0;
+  let WHOLE_QUAD_MODE = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let BARRIER = 1;
+  let JUMPTABLE_SEL = 0;
+  let CF_CONST = 0;
+  let VALID_PIXEL_MODE = 0;
+  let COND = 0;
+  let END_OF_PROGRAM = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+def CF_ALU : ALU_CLAUSE<8, "ALU">;
+def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
+
+def FETCH_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+  let isCodeGenOnly = 1;
+}
+
+def ALU_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+  let isCodeGenOnly = 1;
+}
+
+def LITERALS : AMDGPUInst <(outs),
+(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+  let isCodeGenOnly = 1;
+
+  field bits<64> Inst;
+  bits<32> literal1;
+  bits<32> literal2;
+
+  let Inst{31-0} = literal1;
+  let Inst{63-32} = literal2;
+}
+
+def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
+  field bits<64> Inst;
+}
+
+let Predicates = [isR600toCayman] in {
+
+//===----------------------------------------------------------------------===//
+// Common Instructions R600, R700, Evergreen, Cayman
+//===----------------------------------------------------------------------===//
+
+def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">;
+def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
+// TODO: Do these actually match the regular fmin/fmax behavior?
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>;
+// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx
+// DX10 min/max returns the other operand if one is NaN,
+// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic
+def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>;
+def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>;
+
+// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+// so some of the instruction names don't match the asm string.
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
+>;
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
+>;
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
+>;
+
+def SETE_DX10 : R600_2OP <
+  0xC, "SETE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
+>;
+
+def SETGT_DX10 : R600_2OP <
+  0xD, "SETGT_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
+>;
+
+def SETGE_DX10 : R600_2OP <
+  0xE, "SETGE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
+>;
+
+// FIXME: This should probably be COND_ONE
+def SETNE_DX10 : R600_2OP <
+  0xF, "SETNE_DX10",
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
+>;
+
+// FIXME: Need combine for AMDGPUfract
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
+
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+  (outs R600_Reg32:$dst),
+  (ins immType:$imm),
+  "",
+  []
+>;
+
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
+
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def : Pat <
+  (imm:$val),
+  (MOV_IMM_I32 imm:$val)
+>;
+
+def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>;
+def : Pat <
+  (AMDGPUconstdata_ptr tglobaladdr:$addr),
+  (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr)
+>;
+
+
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def : Pat <
+  (fpimm:$val),
+  (MOV_IMM_F32  fpimm:$val)
+>;
+
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
+
+let hasSideEffects = 1 in {
+
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
+
+} // end hasSideEffects
+
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))]
+>;
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))]
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))]
+>;
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))]
+>;
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))]
+>;
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))]
+>;
+
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>;
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))]
+>;
+
+def CNDGE_INT : R600_3OP <
+  0x1E, "CNDGE_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
+>;
+
+def CNDGT_INT : R600_3OP <
+  0x1D, "CNDGT_INT",
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Texture instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+class R600_TEX <bits<11> inst, string opName> :
+  InstR600 <(outs R600_Reg128:$DST_GPR),
+          (ins R600_Reg128:$SRC_GPR,
+          RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
+          i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
+          RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
+          i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+          CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
+          CT:$COORD_TYPE_W),
+          !strconcat("  ", opName,
+          " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
+          "$SRC_GPR.$srcx$srcy$srcz$srcw "
+          "RID:$RESOURCE_ID SID:$SAMPLER_ID "
+          "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
+          [],
+          NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+  let TEX_INST = inst{4-0};
+  let SRC_REL = 0;
+  let DST_REL = 0;
+  let LOD_BIAS = 0;
+
+  let INST_MOD = 0;
+  let FETCH_WHOLE_QUAD = 0;
+  let ALT_CONST = 0;
+  let SAMPLER_INDEX_MODE = 0;
+  let RESOURCE_INDEX_MODE = 0;
+
+  let TEXInst = 1;
+}
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+
+
+def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
+def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
+def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
+def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
+def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
+def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
+def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+  let INST_MOD = 1;
+}
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
+def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
+def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
+def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
+def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
+def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
+def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
+
+defm : TexPattern<0, TEX_SAMPLE>;
+defm : TexPattern<1, TEX_SAMPLE_C>;
+defm : TexPattern<2, TEX_SAMPLE_L>;
+defm : TexPattern<3, TEX_SAMPLE_C_L>;
+defm : TexPattern<4, TEX_SAMPLE_LB>;
+defm : TexPattern<5, TEX_SAMPLE_C_LB>;
+defm : TexPattern<6, TEX_LD, v4i32>;
+defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
+defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
+defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
+
+//===----------------------------------------------------------------------===//
+// Helper classes for common instructions
+//===----------------------------------------------------------------------===//
+
+class MUL_LIT_Common <bits<5> inst> : R600_3OP <
+  inst, "MUL_LIT",
+  []
+>;
+
+class MULADD_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD",
+  []
+>;
+
+class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD_IEEE",
+  [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
+>;
+
+class FMA_Common <bits<5> inst> : R600_3OP <
+  inst, "FMA",
+  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
+>;
+
+class CNDE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDE",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
+>;
+
+class CNDGT_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGT",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+  let Itinerary = VecALU;
+}
+
+class CNDGE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGE",
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+  let Itinerary = VecALU;
+}
+
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+   UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+   OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+   R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+   R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+   R600_Pred:$pred_sel_X,
+// Slot Y
+   UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+   OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+   R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+   R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+   R600_Pred:$pred_sel_Y,
+// Slot Z
+   UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+   OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+   R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+   R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+   R600_Pred:$pred_sel_Z,
+// Slot W
+   UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+   OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+   R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+   R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+   R600_Pred:$pred_sel_W,
+   LITERAL:$literal0, LITERAL:$literal1),
+  "",
+  pattern,
+  AnyALU> {
+
+  let UseNamedOperandTable = 1;
+
+}
+}
+
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+  R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+  R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+  R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+  R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+multiclass CUBE_Common <bits<11> inst> {
+
+  def _pseudo : InstR600 <
+    (outs R600_Reg128:$dst),
+    (ins R600_Reg128:$src0),
+    "CUBE $dst $src0",
+    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+    VecALU
+  > {
+    let isPseudo = 1;
+    let UseNamedOperandTable = 1;
+  }
+
+  def _real : R600_2OP <inst, "CUBE", []>;
+}
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "EXP_IEEE", fexp2
+> {
+  let Itinerary = TransALU;
+}
+
+class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_INT", fp_to_sint
+> {
+  let Itinerary = TransALU;
+}
+
+class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "INT_TO_FLT", sint_to_fp
+> {
+  let Itinerary = TransALU;
+}
+
+class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_UINT", fp_to_uint
+> {
+  let Itinerary = TransALU;
+}
+
+class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "UINT_TO_FLT", uint_to_fp
+> {
+  let Itinerary = TransALU;
+}
+
+class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "LOG_CLAMPED", []
+>;
+
+class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "LOG_IEEE", flog2
+> {
+  let Itinerary = TransALU;
+}
+
+class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
+class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
+class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
+class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI_INT", mulhs> {
+  let Itinerary = TransALU;
+}
+
+class MULHI_INT24_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI_INT24", AMDGPUmulhi_i24> {
+  let Itinerary = VecALU;
+}
+
+class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI", mulhu> {
+  let Itinerary = TransALU;
+}
+
+class MULHI_UINT24_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI_UINT24", AMDGPUmulhi_u24> {
+  let Itinerary = VecALU;
+}
+
+class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULLO_INT", mul> {
+  let Itinerary = TransALU;
+}
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_CLAMPED", []
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))]
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIP_UINT", AMDGPUurecip
+> {
+  let Itinerary = TransALU;
+}
+
+// Clamped to maximum.
+class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp
+> {
+  let Itinerary = TransALU;
+}
+
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIPSQRT_IEEE", AMDGPUrsq> {
+  let Itinerary = TransALU;
+}
+
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
+class SIN_Common <bits<11> inst> : R600_1OP <
+  inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
+  let Trig = 1;
+  let Itinerary = TransALU;
+}
+
+class COS_Common <bits<11> inst> : R600_1OP <
+  inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
+  let Trig = 1;
+  let Itinerary = TransALU;
+}
+
+def CLAMP_R600 :  CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
+//===----------------------------------------------------------------------===//
+// Helper patterns for complex intrinsics
+//===----------------------------------------------------------------------===//
+
+// FIXME: Should be predicated on unsafe fp math.
+multiclass DIV_Common <InstR600 recip_ieee> {
+def : Pat<
+  (fdiv f32:$src0, f32:$src1),
+  (MUL_IEEE $src0, (recip_ieee $src1))
+>;
+
+def : RcpPat<recip_ieee, f32>;
+}
+
+//===----------------------------------------------------------------------===//
+// R600 / R700 Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR600] in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  def DOT4_r600 : DOT4_Common<0x50>;
+  defm CUBE_r600 : CUBE_Common<0x52>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
+  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
+
+  def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+
+  def R600_ExportSwz : ExportSwzInst {
+    let Word1{20-17} = 0; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 0; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : ExportPattern<R600_ExportSwz, 39>;
+
+  def R600_ExportBuf : ExportBufInst {
+    let Word1{20-17} = 0; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 0; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+
+  def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT),
+  "TEX $CNT @$ADDR"> {
+    let POP_COUNT = 0;
+  }
+  def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT),
+  "VTX $CNT @$ADDR"> {
+    let POP_COUNT = 0;
+  }
+  def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
+  "LOOP_START_DX10 @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
+  "LOOP_BREAK @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
+  "CONTINUE @$ADDR"> {
+    let POP_COUNT = 0;
+    let CNT = 0;
+  }
+  def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "JUMP @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+  "PUSH_ELSE @$ADDR"> {
+    let CNT = 0;
+    let POP_COUNT = 0; // FIXME?
+  }
+  def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "ELSE @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
+    let ADDR = 0;
+    let CNT = 0;
+    let POP_COUNT = 0;
+  }
+  def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+  "POP @$ADDR POP:$POP_COUNT"> {
+    let CNT = 0;
+  }
+  def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
+    let CNT = 0;
+    let POP_COUNT = 0;
+    let ADDR = 0;
+    let END_OF_PROGRAM = 1;
+  }
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Regist loads and stores - for indirect addressing
+//===----------------------------------------------------------------------===//
+
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+def PRED_X : InstR600 <
+  (outs R600_Predicate_Bit:$dst),
+  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
+  "", [], NullALU> {
+  let FlagOperandIdx = 3;
+}
+
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <
+          (outs),
+          (ins brtarget:$target, R600_Predicate_Bit:$p),
+          "JUMP $target ($p)",
+          [], AnyALU
+  >;
+
+def JUMP : InstR600 <
+          (outs),
+          (ins brtarget:$target),
+          "JUMP $target",
+          [], AnyALU
+  >
+{
+  let isPredicable = 1;
+  let isBarrier = 1;
+}
+
+}  // End isTerminator = 1, isBranch = 1
+
+let usesCustomInserter = 1 in {
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
+
+def MASK_WRITE : AMDGPUShaderInst <
+    (outs),
+    (ins R600_Reg32:$src),
+    "MASK_WRITE $src",
+    []
+>;
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
+
+
+def TXD: InstR600 <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [],
+  NullALU > {
+  let TEXInst = 1;
+}
+
+def TXD_SHADOW: InstR600 <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [], NullALU> {
+  let TEXInst = 1;
+}
+} // End isPseudo = 1
+} // End usesCustomInserter = 1
+
+
+//===----------------------------------------------------------------------===//
+// Constant Buffer Addressing Support
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+def CONST_COPY : Instruction {
+  let OutOperandList = (outs R600_Reg32:$dst);
+  let InOperandList = (ins i32imm:$src);
+  let Pattern =
+      [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+  let AsmString = "CONST_COPY";
+  let hasSideEffects = 0;
+  let isAsCheapAsAMove = 1;
+  let Itinerary = NullALU;
+}
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+
+def TEX_VTX_CONSTBUF :
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "VTX_READ_eg $dst, $ptr",
+      [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$buffer_id)))]>,
+  VTX_WORD1_GPR, VTX_WORD0_eg {
+
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let SRC_REL = 0;
+  let SRC_SEL_X = 0;
+  let DST_REL = 0;
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 2;
+  let FORMAT_COMP_ALL = 1;
+  let SRF_MODE_ALL = 1;
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 2;
+  let DST_SEL_W        = 3;
+  let DATA_FORMAT      = 35;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+  let VTXInst = 1;
+}
+
+def TEX_VTX_TEXBUF:
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$buffer_id), "TEX_VTX_EXPLICIT_READ $dst, $ptr">,
+VTX_WORD1_GPR, VTX_WORD0_eg {
+
+let VC_INST = 0;
+let FETCH_TYPE = 2;
+let FETCH_WHOLE_QUAD = 0;
+let SRC_REL = 0;
+let SRC_SEL_X = 0;
+let DST_REL = 0;
+let USE_CONST_FIELDS = 1;
+let NUM_FORMAT_ALL = 0;
+let FORMAT_COMP_ALL = 0;
+let SRF_MODE_ALL = 1;
+let MEGA_FETCH_COUNT = 16;
+let DST_SEL_X        = 0;
+let DST_SEL_Y        = 1;
+let DST_SEL_Z        = 2;
+let DST_SEL_W        = 3;
+let DATA_FORMAT      = 0;
+
+let Inst{31-0} = Word0;
+let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+  let VTXInst = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+     let Namespace = "AMDGPU";
+     dag OutOperandList = outs;
+     dag InOperandList = ins;
+     let Pattern = pattern;
+     let AsmString = !strconcat(asmstr, "\n");
+     let isPseudo = 1;
+     let Itinerary = NullALU;
+     bit hasIEEEFlag = 0;
+     bit hasZeroOpFlag = 0;
+     let mayLoad = 0;
+     let mayStore = 0;
+     let hasSideEffects = 0;
+     let isCodeGenOnly = 1;
+}
+
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
+    def _i32 : ILFormat<(outs),
+  (ins brtarget:$target, rci:$src0),
+        "; i32 Pseudo branch instruction",
+  [(Op bb:$target, (i32 rci:$src0))]>;
+    def _f32 : ILFormat<(outs),
+  (ins brtarget:$target, rcf:$src0),
+        "; f32 Pseudo branch instruction",
+  [(Op bb:$target, (f32 rcf:$src0))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
+      !strconcat(name, " $src"), []>;
+  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
+      !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+}
+
+//===---------------------------------------------------------------------===//
+// Custom Inserter for Branches and returns, this eventually will be a
+// separate pass
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
+}
+
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+    usesCustomInserter = 1 in {
+  def RETURN : ILFormat<(outs), (ins variable_ops),
+    "RETURN", [(AMDGPUendpgm)]
+  >;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+def IF_PREDICATE_SET  : ILFormat<(outs), (ins R600_Reg32:$src),
+  "IF_PREDICATE_SET $src", []>;
+
+let isTerminator=1 in {
+  def BREAK       : ILFormat< (outs), (ins),
+      "BREAK", []>;
+  def CONTINUE    : ILFormat< (outs), (ins),
+      "CONTINUE", []>;
+  def DEFAULT     : ILFormat< (outs), (ins),
+      "DEFAULT", []>;
+  def ELSE        : ILFormat< (outs), (ins),
+      "ELSE", []>;
+  def ENDSWITCH   : ILFormat< (outs), (ins),
+      "ENDSWITCH", []>;
+  def ENDMAIN     : ILFormat< (outs), (ins),
+      "ENDMAIN", []>;
+  def END         : ILFormat< (outs), (ins),
+      "END", []>;
+  def ENDFUNC     : ILFormat< (outs), (ins),
+      "ENDFUNC", []>;
+  def ENDIF       : ILFormat< (outs), (ins),
+      "ENDIF", []>;
+  def WHILELOOP   : ILFormat< (outs), (ins),
+      "WHILE", []>;
+  def ENDLOOP     : ILFormat< (outs), (ins),
+      "ENDLOOP", []>;
+  def FUNC        : ILFormat< (outs), (ins),
+      "FUNC", []>;
+  def RETDYN      : ILFormat< (outs), (ins),
+      "RET_DYN", []>;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+  defm IFC         : BranchInstr2<"IFC">;
+  defm BREAKC      : BranchInstr2<"BREAKC">;
+  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
+}
+
+//===----------------------------------------------------------------------===//
+// Indirect addressing pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+class ExtractVertical <RegisterClass vec_rc> : InstR600 <
+  (outs R600_Reg32:$dst),
+  (ins vec_rc:$vec, R600_Reg32:$index), "",
+  [],
+  AnyALU
+>;
+
+let Constraints = "$dst = $vec" in {
+
+class InsertVertical <RegisterClass vec_rc> : InstR600 <
+  (outs vec_rc:$dst),
+  (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
+  [],
+  AnyALU
+>;
+
+} // End Constraints = "$dst = $vec"
+
+} // End isPseudo = 1
+
+def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
+def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
+
+def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
+def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
+
+class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
+                          ValueType scalar_ty> : Pat <
+  (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
+  (inst $vec, $index)
+>;
+
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
+def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
+
+class InsertVerticalPat <Instruction inst, ValueType vec_ty,
+                         ValueType scalar_ty> : Pat <
+  (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
+  (inst $vec, $value, $index)
+>;
+
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
+def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
+
+//===----------------------------------------------------------------------===//
+// ISel Patterns
+//===----------------------------------------------------------------------===//
+
+// CND*_INT Patterns for f32 True / False values
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+  (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
+  (cnd $src0, $src1, $src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT,  SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
+//CNDGE_INT extra pattern
+def : Pat <
+  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
+  (CNDGE_INT $src0, $src1, $src2)
+>;
+
+// KIL Patterns
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+def KIL : Pat <
+  (int_AMDGPU_kill f32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), $src0))
+>;
+
+def : Extract_Element <f32, v4f32, 0, sub0>;
+def : Extract_Element <f32, v4f32, 1, sub1>;
+def : Extract_Element <f32, v4f32, 2, sub2>;
+def : Extract_Element <f32, v4f32, 3, sub3>;
+
+def : Insert_Element <f32, v4f32, 0, sub0>;
+def : Insert_Element <f32, v4f32, 1, sub1>;
+def : Insert_Element <f32, v4f32, 2, sub2>;
+def : Insert_Element <f32, v4f32, 3, sub3>;
+
+def : Extract_Element <i32, v4i32, 0, sub0>;
+def : Extract_Element <i32, v4i32, 1, sub1>;
+def : Extract_Element <i32, v4i32, 2, sub2>;
+def : Extract_Element <i32, v4i32, 3, sub3>;
+
+def : Insert_Element <i32, v4i32, 0, sub0>;
+def : Insert_Element <i32, v4i32, 1, sub1>;
+def : Insert_Element <i32, v4i32, 2, sub2>;
+def : Insert_Element <i32, v4i32, 3, sub3>;
+
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
+// bitconvert patterns
+
+def : BitConvert <i32, f32, R600_Reg32>;
+def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
+
+// DWORDADDR pattern
+def : DwordAddrPat  <i32, R600_Reg32>;
+
+} // End isR600toCayman Predicate
+
+def getLDSNoRetOp : InstrMapping {
+  let FilterClass = "R600_LDS_1A1D";
+  let RowFields = ["BaseOp"];
+  let ColFields = ["DisableEncoding"];
+  let KeyCol = ["$dst"];
+  let ValueCols = [[""""]];
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
new file mode 100644
index 000000000000..a5310e9fd6d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
@@ -0,0 +1,67 @@
+//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
+  llvm_v4f32_ty, // Coord
+  llvm_i32_ty,   // offset_x
+  llvm_i32_ty,   // offset_y,
+  llvm_i32_ty,   // offset_z,
+  llvm_i32_ty,   // resource_id
+  llvm_i32_ty,   // samplerid
+  llvm_i32_ty,   // coord_type_x
+  llvm_i32_ty,   // coord_type_y
+  llvm_i32_ty,   // coord_type_z
+  llvm_i32_ty],  // coord_type_w
+  [IntrNoMem]
+>;
+
+class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
+    llvm_v4i32_ty, // Coord
+    llvm_i32_ty,   // offset_x
+    llvm_i32_ty,   // offset_y,
+    llvm_i32_ty,   // offset_z,
+    llvm_i32_ty,   // resource_id
+    llvm_i32_ty,   // samplerid
+    llvm_i32_ty,   // coord_type_x
+    llvm_i32_ty,   // coord_type_y
+    llvm_i32_ty,   // coord_type_z
+    llvm_i32_ty],  // coord_type_w
+    [IntrNoMem]
+>;
+
+let TargetPrefix = "r600", isTarget = 1 in {
+
+def int_r600_store_swizzle :
+  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_store_stream_output : Intrinsic<
+  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
+>;
+
+def int_r600_tex : TextureIntrinsicFloatInput;
+def int_r600_texc : TextureIntrinsicFloatInput;
+def int_r600_txl : TextureIntrinsicFloatInput;
+def int_r600_txlc : TextureIntrinsicFloatInput;
+def int_r600_txb : TextureIntrinsicFloatInput;
+def int_r600_txbc : TextureIntrinsicFloatInput;
+def int_r600_txf : TextureIntrinsicInt32Input;
+def int_r600_txq : TextureIntrinsicInt32Input;
+def int_r600_ddx : TextureIntrinsicFloatInput;
+def int_r600_ddy : TextureIntrinsicFloatInput;
+
+def int_r600_dot4 : Intrinsic<[llvm_float_ty],
+  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+>;
+
+} // End TargetPrefix = "r600", isTarget = 1
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..3ca319c6c6c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF) { }
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
new file mode 100644
index 000000000000..29ac0920f997
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineFunctionInfo.h
@@ -0,0 +1,28 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+
+namespace llvm {
+
+class R600MachineFunctionInfo final : public AMDGPUMachineFunction {
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+  unsigned CFStackSize;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
new file mode 100644
index 000000000000..db18e5bd1afa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -0,0 +1,467 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineScheduler.h"
+#include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
+  DAG = static_cast<ScheduleDAGMILive*>(dag);
+  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
+  TII = static_cast<const R600InstrInfo*>(DAG->TII);
+  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  VLIW5 = !ST.hasCaymanISA();
+  MRI = &DAG->MRI;
+  CurInstKind = IDOther;
+  CurEmitted = 0;
+  OccupedSlotsMask = 31;
+  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
+  InstKindLimit[IDOther] = 32;
+  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+  AluInstCount = 0;
+  FetchInstCount = 0;
+}
+
+void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
+                                  std::vector<SUnit *> &QDst)
+{
+  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
+  QSrc.clear();
+}
+
+static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  assert (GPRCount && "GPRCount cannot be 0");
+  return 248 / GPRCount;
+}
+
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+  SUnit *SU = nullptr;
+  NextInstKind = IDOther;
+
+  IsTopNode = false;
+
+  // check if we might want to switch current clause type
+  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
+      (Available[CurInstKind].empty());
+  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
+      (!Available[IDFetch].empty() || !Available[IDOther].empty());
+
+  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+    // We use the heuristic provided by AMD Accelerated Parallel Processing
+    // OpenCL Programming Guide :
+    // The approx. number of WF that allows TEX inst to hide ALU inst is :
+    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+    float ALUFetchRationEstimate =
+        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+        (FetchInstCount + Available[IDFetch].size());
+    if (ALUFetchRationEstimate == 0) {
+      AllowSwitchFromAlu = true;
+    } else {
+      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+      DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+      // We assume the local GPR requirements to be "dominated" by the requirement
+      // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+      // after TEX are indeed likely to consume or generate values from/for the
+      // TEX clause.
+      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+      // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+      // (TODO : use RegisterPressure)
+      // If we are going too use too many GPR, we flush Fetch instruction to lower
+      // register pressure on 128 bits regs.
+      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+        AllowSwitchFromAlu = true;
+    }
+  }
+
+  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
+    // try to pick ALU
+    SU = pickAlu();
+    if (!SU && !PhysicalRegCopy.empty()) {
+      SU = PhysicalRegCopy.front();
+      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
+    }
+    if (SU) {
+      if (CurEmitted >= InstKindLimit[IDAlu])
+        CurEmitted = 0;
+      NextInstKind = IDAlu;
+    }
+  }
+
+  if (!SU) {
+    // try to pick FETCH
+    SU = pickOther(IDFetch);
+    if (SU)
+      NextInstKind = IDFetch;
+  }
+
+  // try to pick other
+  if (!SU) {
+    SU = pickOther(IDOther);
+    if (SU)
+      NextInstKind = IDOther;
+  }
+
+  DEBUG(
+      if (SU) {
+        dbgs() << " ** Pick node **\n";
+        SU->dump(DAG);
+      } else {
+        dbgs() << "NO NODE \n";
+        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+          const SUnit &S = DAG->SUnits[i];
+          if (!S.isScheduled)
+            S.dump(DAG);
+        }
+      }
+  );
+
+  return SU;
+}
+
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  if (NextInstKind != CurInstKind) {
+    DEBUG(dbgs() << "Instruction Type Switch\n");
+    if (NextInstKind != IDAlu)
+      OccupedSlotsMask |= 31;
+    CurEmitted = 0;
+    CurInstKind = NextInstKind;
+  }
+
+  if (CurInstKind == IDAlu) {
+    AluInstCount ++;
+    switch (getAluKind(SU)) {
+    case AluT_XYZW:
+      CurEmitted += 4;
+      break;
+    case AluDiscarded:
+      break;
+    default: {
+      ++CurEmitted;
+      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
+          E = SU->getInstr()->operands_end(); It != E; ++It) {
+        MachineOperand &MO = *It;
+        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+          ++CurEmitted;
+      }
+    }
+    }
+  } else {
+    ++CurEmitted;
+  }
+
+
+  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+  if (CurInstKind != IDFetch) {
+    MoveUnits(Pending[IDFetch], Available[IDFetch]);
+  } else
+    FetchInstCount++;
+}
+
+static bool
+isPhysicalRegCopy(MachineInstr *MI) {
+  if (MI->getOpcode() != AMDGPU::COPY)
+    return false;
+
+  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+}
+
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
+}
+
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+  DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+  if (isPhysicalRegCopy(SU->getInstr())) {
+    PhysicalRegCopy.push_back(SU);
+    return;
+  }
+
+  int IK = getInstKind(SU);
+
+  // There is no export clause, we can schedule one as soon as its ready
+  if (IK == IDOther)
+    Available[IDOther].push_back(SU);
+  else
+    Pending[IK].push_back(SU);
+
+}
+
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+                                          const TargetRegisterClass *RC) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    return RC->contains(Reg);
+  } else {
+    return MRI->getRegClass(Reg) == RC;
+  }
+}
+
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+
+  if (TII->isTransOnly(*MI))
+    return AluTrans;
+
+  switch (MI->getOpcode()) {
+  case AMDGPU::PRED_X:
+    return AluPredX;
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT_4:
+    return AluT_XYZW;
+  case AMDGPU::COPY:
+    if (MI->getOperand(1).isUndef()) {
+      // MI will become a KILL, don't considers it in scheduling
+      return AluDiscarded;
+    }
+  default:
+    break;
+  }
+
+  // Does the instruction take a whole IG ?
+  // XXX: Is it possible to add a helper function in R600InstrInfo that can
+  // be used here and in R600PacketizerList::isSoloInstruction() ?
+  if(TII->isVector(*MI) ||
+     TII->isCubeOp(MI->getOpcode()) ||
+     TII->isReductionOp(MI->getOpcode()) ||
+     MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
+    return AluT_XYZW;
+  }
+
+  if (TII->isLDSInstr(MI->getOpcode())) {
+    return AluT_X;
+  }
+
+  // Is the result already assigned to a channel ?
+  unsigned DestSubReg = MI->getOperand(0).getSubReg();
+  switch (DestSubReg) {
+  case AMDGPU::sub0:
+    return AluT_X;
+  case AMDGPU::sub1:
+    return AluT_Y;
+  case AMDGPU::sub2:
+    return AluT_Z;
+  case AMDGPU::sub3:
+    return AluT_W;
+  default:
+    break;
+  }
+
+  // Is the result already member of a X/Y/Z/W class ?
+  unsigned DestReg = MI->getOperand(0).getReg();
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+    return AluT_X;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+    return AluT_Y;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+    return AluT_Z;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+    return AluT_W;
+  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+    return AluT_XYZW;
+
+  // LDS src registers cannot be used in the Trans slot.
+  if (TII->readsLDSSrcReg(*MI))
+    return AluT_XYZW;
+
+  return AluAny;
+}
+
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+  int Opcode = SU->getInstr()->getOpcode();
+
+  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
+    return IDFetch;
+
+  if (TII->isALUInstr(Opcode)) {
+    return IDAlu;
+  }
+
+  switch (Opcode) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::COPY:
+  case AMDGPU::CONST_COPY:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT_4:
+    return IDAlu;
+  default:
+    return IDOther;
+  }
+}
+
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
+  if (Q.empty())
+    return nullptr;
+  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
+      It != E; ++It) {
+    SUnit *SU = *It;
+    InstructionsGroupCandidate.push_back(SU->getInstr());
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
+        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
+      InstructionsGroupCandidate.pop_back();
+      Q.erase((It + 1).base());
+      return SU;
+    } else {
+      InstructionsGroupCandidate.pop_back();
+    }
+  }
+  return nullptr;
+}
+
+void R600SchedStrategy::LoadAlu() {
+  std::vector<SUnit *> &QSrc = Pending[IDAlu];
+  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
+    AluKind AK = getAluKind(QSrc[i]);
+    AvailableAlus[AK].push_back(QSrc[i]);
+  }
+  QSrc.clear();
+}
+
+void R600SchedStrategy::PrepareNextSlot() {
+  DEBUG(dbgs() << "New Slot\n");
+  assert (OccupedSlotsMask && "Slot wasn't filled");
+  OccupedSlotsMask = 0;
+//  if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
+//    OccupedSlotsMask |= 16;
+  InstructionsGroupCandidate.clear();
+  LoadAlu();
+}
+
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+  if (DstIndex == -1) {
+    return;
+  }
+  unsigned DestReg = MI->getOperand(DstIndex).getReg();
+  // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constraint its regclass
+  for (MachineInstr::mop_iterator It = MI->operands_begin(),
+      E = MI->operands_end(); It != E; ++It) {
+    MachineOperand &MO = *It;
+    if (MO.isReg() && !MO.isDef() &&
+        MO.getReg() == DestReg)
+      return;
+  }
+  // Constrains the regclass of DestReg to assign it to Slot
+  switch (Slot) {
+  case 0:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+    break;
+  case 1:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+    break;
+  case 2:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+    break;
+  case 3:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+    break;
+  }
+}
+
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
+  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
+  if (SlotedSU)
+    return SlotedSU;
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
+  if (UnslotedSU)
+    AssignSlot(UnslotedSU->getInstr(), Slot);
+  return UnslotedSU;
+}
+
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+      AvailableAlus[AluPredX].size();
+}
+
+SUnit* R600SchedStrategy::pickAlu() {
+  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
+    if (!OccupedSlotsMask) {
+      // Bottom up scheduling : predX must comes first
+      if (!AvailableAlus[AluPredX].empty()) {
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluPredX], false);
+      }
+      // Flush physical reg copies (RA will discard them)
+      if (!AvailableAlus[AluDiscarded].empty()) {
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluDiscarded], false);
+      }
+      // If there is a T_XYZW alu available, use it
+      if (!AvailableAlus[AluT_XYZW].empty()) {
+        OccupedSlotsMask |= 15;
+        return PopInst(AvailableAlus[AluT_XYZW], false);
+      }
+    }
+    bool TransSlotOccuped = OccupedSlotsMask & 16;
+    if (!TransSlotOccuped && VLIW5) {
+      if (!AvailableAlus[AluTrans].empty()) {
+        OccupedSlotsMask |= 16;
+        return PopInst(AvailableAlus[AluTrans], false);
+      }
+      SUnit *SU = AttemptFillSlot(3, true);
+      if (SU) {
+        OccupedSlotsMask |= 16;
+        return SU;
+      }
+    }
+    for (int Chan = 3; Chan > -1; --Chan) {
+      bool isOccupied = OccupedSlotsMask & (1 << Chan);
+      if (!isOccupied) {
+        SUnit *SU = AttemptFillSlot(Chan, false);
+        if (SU) {
+          OccupedSlotsMask |= (1 << Chan);
+          InstructionsGroupCandidate.push_back(SU->getInstr());
+          return SU;
+        }
+      }
+    }
+    PrepareNextSlot();
+  }
+  return nullptr;
+}
+
+SUnit* R600SchedStrategy::pickOther(int QID) {
+  SUnit *SU = nullptr;
+  std::vector<SUnit *> &AQ = Available[QID];
+
+  if (AQ.empty()) {
+    MoveUnits(Pending[QID], AQ);
+  }
+  if (!AQ.empty()) {
+    SU = AQ.back();
+    AQ.resize(AQ.size() - 1);
+  }
+  return SU;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
new file mode 100644
index 000000000000..9a6770570477
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.h
@@ -0,0 +1,100 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+class R600InstrInfo;
+struct R600RegisterInfo;
+
+class R600SchedStrategy final : public MachineSchedStrategy {
+  const ScheduleDAGMILive *DAG = nullptr;
+  const R600InstrInfo *TII = nullptr;
+  const R600RegisterInfo *TRI = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+  enum InstKind {
+    IDAlu,
+    IDFetch,
+    IDOther,
+    IDLast
+  };
+
+  enum AluKind {
+    AluAny,
+    AluT_X,
+    AluT_Y,
+    AluT_Z,
+    AluT_W,
+    AluT_XYZW,
+    AluPredX,
+    AluTrans,
+    AluDiscarded, // LLVM Instructions that are going to be eliminated
+    AluLast
+  };
+
+  std::vector<SUnit *> Available[IDLast], Pending[IDLast];
+  std::vector<SUnit *> AvailableAlus[AluLast];
+  std::vector<SUnit *> PhysicalRegCopy;
+
+  InstKind CurInstKind;
+  int CurEmitted;
+  InstKind NextInstKind;
+
+  unsigned AluInstCount;
+  unsigned FetchInstCount;
+
+  int InstKindLimit[IDLast];
+
+  int OccupedSlotsMask;
+
+public:
+  R600SchedStrategy() = default;
+  ~R600SchedStrategy() override = default;
+
+  void initialize(ScheduleDAGMI *dag) override;
+  SUnit *pickNode(bool &IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+  void releaseTopNode(SUnit *SU) override;
+  void releaseBottomNode(SUnit *SU) override;
+
+private:
+  std::vector<MachineInstr *> InstructionsGroupCandidate;
+  bool VLIW5;
+
+  int getInstKind(SUnit *SU);
+  bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+  AluKind getAluKind(SUnit *SU) const;
+  void LoadAlu();
+  unsigned AvailablesAluCount() const;
+  SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
+  void PrepareNextSlot();
+  SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
+
+  void AssignSlot(MachineInstr *MI, unsigned Slot);
+  SUnit* pickAlu();
+  SUnit* pickOther(int QID);
+  void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 000000000000..d90008a550ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,401 @@
+//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges inputs of swizzeable instructions into vector sharing
+/// common data and/or have enough undef subreg using swizzle abilities.
+///
+/// For instance let's consider the following pseudo code :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allow regalloc to reduce register pressure for vector registers and
+/// to reduce MOV count.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vec-merger"
+
+static bool
+isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+  for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg),
+      E = MRI.def_instr_end(); It != E; ++It) {
+    return (*It).isImplicitDef();
+  }
+  if (MRI.isReserved(Reg)) {
+    return false;
+  }
+  llvm_unreachable("Reg without a def");
+  return false;
+}
+
+namespace {
+
+class RegSeqInfo {
+public:
+  MachineInstr *Instr;
+  DenseMap<unsigned, unsigned> RegToChan;
+  std::vector<unsigned> UndefReg;
+
+  RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+    assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+    for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
+      MachineOperand &MO = Instr->getOperand(i);
+      unsigned Chan = Instr->getOperand(i + 1).getImm();
+      if (isImplicitlyDef(MRI, MO.getReg()))
+        UndefReg.push_back(Chan);
+      else
+        RegToChan[MO.getReg()] = Chan;
+    }
+  }
+
+  RegSeqInfo() = default;
+
+  bool operator==(const RegSeqInfo &RSI) const {
+    return RSI.Instr == Instr;
+  }
+};
+
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const R600InstrInfo *TII;
+
+  bool canSwizzle(const MachineInstr &MI) const;
+  bool areAllUsesSwizzeable(unsigned Reg) const;
+  void SwizzleInput(MachineInstr &,
+      const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
+  bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge,
+      std::vector<std::pair<unsigned, unsigned>> &Remap) const;
+  bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned>> &RemapChan);
+  bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned>> &RemapChan);
+  MachineInstr *RebuildVector(RegSeqInfo *MI, const RegSeqInfo *BaseVec,
+      const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
+  void RemoveMI(MachineInstr *);
+  void trackRSI(const RegSeqInfo &RSI);
+
+  typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap;
+  DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+  InstructionSetMap PreviousRegSeqByReg;
+  InstructionSetMap PreviousRegSeqByUndefCount;
+
+public:
+  static char ID;
+
+  R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+  TII(nullptr) { }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "R600 Vector Registers Merge Pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+} // end anonymous namespace.
+
+char R600VectorRegMerger::ID = 0;
+
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
+    const {
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    return true;
+  switch (MI.getOpcode()) {
+  case AMDGPU::R600_ExportSwz:
+  case AMDGPU::EG_ExportSwz:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+    RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap)
+    const {
+  unsigned CurrentUndexIdx = 0;
+  for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+      E = ToMerge->RegToChan.end(); It != E; ++It) {
+    DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+        Untouched->RegToChan.find((*It).first);
+    if (PosInUntouched != Untouched->RegToChan.end()) {
+      Remap.push_back(std::pair<unsigned, unsigned>
+          ((*It).second, (*PosInUntouched).second));
+      continue;
+    }
+    if (CurrentUndexIdx >= Untouched->UndefReg.size())
+      return false;
+    Remap.push_back(std::pair<unsigned, unsigned>
+        ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+  }
+
+  return true;
+}
+
+static
+unsigned getReassignedChan(
+    const std::vector<std::pair<unsigned, unsigned>> &RemapChan,
+    unsigned Chan) {
+  for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
+    if (RemapChan[j].first == Chan)
+      return RemapChan[j].second;
+  }
+  llvm_unreachable("Chan wasn't reassigned");
+}
+
+MachineInstr *R600VectorRegMerger::RebuildVector(
+    RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+    const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
+  unsigned Reg = RSI->Instr->getOperand(0).getReg();
+  MachineBasicBlock::iterator Pos = RSI->Instr;
+  MachineBasicBlock &MBB = *Pos->getParent();
+  DebugLoc DL = Pos->getDebugLoc();
+
+  unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+  DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+  std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+  for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+      E = RSI->RegToChan.end(); It != E; ++It) {
+    unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned SubReg = (*It).first;
+    unsigned Swizzle = (*It).second;
+    unsigned Chan = getReassignedChan(RemapChan, Swizzle);
+
+    MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+        DstReg)
+        .addReg(SrcVec)
+        .addReg(SubReg)
+        .addImm(Chan);
+    UpdatedRegToChan[SubReg] = Chan;
+    std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
+    if (ChanPos != UpdatedUndef.end())
+      UpdatedUndef.erase(ChanPos);
+    assert(!is_contained(UpdatedUndef, Chan) &&
+           "UpdatedUndef shouldn't contain Chan more than once!");
+    DEBUG(dbgs() << "    ->"; Tmp->dump(););
+    (void)Tmp;
+    SrcVec = DstReg;
+  }
+  MachineInstr *NewMI =
+      BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec);
+  DEBUG(dbgs() << "    ->"; NewMI->dump(););
+
+  DEBUG(dbgs() << "  Updating Swizzle:\n");
+  for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+      E = MRI->use_instr_end(); It != E; ++It) {
+    DEBUG(dbgs() << "    ";(*It).dump(); dbgs() << "    ->");
+    SwizzleInput(*It, RemapChan);
+    DEBUG((*It).dump());
+  }
+  RSI->Instr->eraseFromParent();
+
+  // Update RSI
+  RSI->Instr = NewMI;
+  RSI->RegToChan = UpdatedRegToChan;
+  RSI->UndefReg = UpdatedUndef;
+
+  return NewMI;
+}
+
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+  for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+      E = PreviousRegSeqByReg.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(llvm::find(MIs, MI), MIs.end());
+  }
+  for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+      E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(llvm::find(MIs, MI), MIs.end());
+  }
+}
+
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+    const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
+  unsigned Offset;
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    Offset = 2;
+  else
+    Offset = 3;
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
+    for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
+      if (RemapChan[j].first == Swizzle) {
+        MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+        break;
+      }
+    }
+  }
+}
+
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+  for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
+      E = MRI->use_instr_end(); It != E; ++It) {
+    if (!canSwizzle(*It))
+      return false;
+  }
+  return true;
+}
+
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
+  for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+      MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+    if (!MOp->isReg())
+      continue;
+    if (PreviousRegSeqByReg[MOp->getReg()].empty())
+      continue;
+    for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) {
+      CompatibleRSI = PreviousRegSeq[MI];
+      if (RSI == CompatibleRSI)
+        continue;
+      if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
+  unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
+    return false;
+  std::vector<MachineInstr *> &MIs =
+      PreviousRegSeqByUndefCount[NeededUndefs];
+  CompatibleRSI = PreviousRegSeq[MIs.back()];
+  tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+  return true;
+}
+
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+  for (DenseMap<unsigned, unsigned>::const_iterator
+  It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
+    PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
+  }
+  PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
+  PreviousRegSeq[RSI.Instr] = RSI;
+}
+
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+  TII = ST.getInstrInfo();
+  MRI = &Fn.getRegInfo();
+
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock *MB = &*MBB;
+    PreviousRegSeq.clear();
+    PreviousRegSeqByReg.clear();
+    PreviousRegSeqByUndefCount.clear();
+
+    for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+         MII != MIIE; ++MII) {
+      MachineInstr &MI = *MII;
+      if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) {
+        if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+          unsigned Reg = MI.getOperand(1).getReg();
+          for (MachineRegisterInfo::def_instr_iterator
+               It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
+               It != E; ++It) {
+            RemoveMI(&(*It));
+          }
+        }
+        continue;
+      }
+
+      RegSeqInfo RSI(*MRI, &MI);
+
+      // All uses of MI are swizzeable ?
+      unsigned Reg = MI.getOperand(0).getReg();
+      if (!areAllUsesSwizzeable(Reg))
+        continue;
+
+      DEBUG({
+        dbgs() << "Trying to optimize ";
+        MI.dump();
+      });
+
+      RegSeqInfo CandidateRSI;
+      std::vector<std::pair<unsigned, unsigned>> RemapChan;
+      DEBUG(dbgs() << "Using common slots...\n";);
+      if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+        // Remove CandidateRSI mapping
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      DEBUG(dbgs() << "Using free slots...\n";);
+      RemapChan.clear();
+      if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      //Failed to merge
+      trackRSI(RSI);
+    }
+  }
+  return false;
+}
+
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+  return new R600VectorRegMerger(tm);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
new file mode 100644
index 000000000000..5b6dd1ed128d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -0,0 +1,409 @@
+//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass implements instructions packetization for R600. It unsets isLast
+/// bit of instructions inside a bundle and substitutes src register with
+/// PreviousVector when applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+namespace {
+
+class R600Packetizer : public MachineFunctionPass {
+
+public:
+  static char ID;
+  R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "R600 Packetizer"; }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+char R600Packetizer::ID = 0;
+
+class R600PacketizerList : public VLIWPacketizerList {
+private:
+  const R600InstrInfo *TII;
+  const R600RegisterInfo &TRI;
+  bool VLIW5;
+  bool ConsideredInstUsesAlreadyWrittenVectorElement;
+
+  unsigned getSlot(const MachineInstr &MI) const {
+    return TRI.getHWRegChan(MI.getOperand(0).getReg());
+  }
+
+  /// \returns register to PV chan mapping for bundle/single instructions that
+  /// immediately precedes I.
+  DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
+      const {
+    DenseMap<unsigned, unsigned> Result;
+    I--;
+    if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
+      return Result;
+    MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+    if (I->isBundle())
+      BI++;
+    int LastDstChan = -1;
+    do {
+      bool isTrans = false;
+      int BISlot = getSlot(*BI);
+      if (LastDstChan >= BISlot)
+        isTrans = true;
+      LastDstChan = BISlot;
+      if (TII->isPredicated(*BI))
+        continue;
+      int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+      if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
+        continue;
+      int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+      if (DstIdx == -1) {
+        continue;
+      }
+      unsigned Dst = BI->getOperand(DstIdx).getReg();
+      if (isTrans || TII->isTransOnly(*BI)) {
+        Result[Dst] = AMDGPU::PS;
+        continue;
+      }
+      if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
+          BI->getOpcode() == AMDGPU::DOT4_eg) {
+        Result[Dst] = AMDGPU::PV_X;
+        continue;
+      }
+      if (Dst == AMDGPU::OQAP) {
+        continue;
+      }
+      unsigned PVReg = 0;
+      switch (TRI.getHWRegChan(Dst)) {
+      case 0:
+        PVReg = AMDGPU::PV_X;
+        break;
+      case 1:
+        PVReg = AMDGPU::PV_Y;
+        break;
+      case 2:
+        PVReg = AMDGPU::PV_Z;
+        break;
+      case 3:
+        PVReg = AMDGPU::PV_W;
+        break;
+      default:
+        llvm_unreachable("Invalid Chan");
+      }
+      Result[Dst] = PVReg;
+    } while ((++BI)->isBundledWithPred());
+    return Result;
+  }
+
+  void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs)
+      const {
+    unsigned Ops[] = {
+      AMDGPU::OpName::src0,
+      AMDGPU::OpName::src1,
+      AMDGPU::OpName::src2
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
+      if (OperandIdx < 0)
+        continue;
+      unsigned Src = MI.getOperand(OperandIdx).getReg();
+      const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
+      if (It != PVs.end())
+        MI.getOperand(OperandIdx).setReg(It->second);
+    }
+  }
+public:
+  // Ctor.
+  R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST,
+                     MachineLoopInfo &MLI)
+      : VLIWPacketizerList(MF, MLI, nullptr),
+        TII(ST.getInstrInfo()),
+        TRI(TII->getRegisterInfo()) {
+    VLIW5 = !ST.hasCaymanISA();
+  }
+
+  // initPacketizerState - initialize some internal flags.
+  void initPacketizerState() override {
+    ConsideredInstUsesAlreadyWrittenVectorElement = false;
+  }
+
+  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+  bool ignorePseudoInstruction(const MachineInstr &MI,
+                               const MachineBasicBlock *MBB) override {
+    return false;
+  }
+
+  // isSoloInstruction - return true if instruction MI can not be packetized
+  // with any other instruction, which means that MI itself is a packet.
+  bool isSoloInstruction(const MachineInstr &MI) override {
+    if (TII->isVector(MI))
+      return true;
+    if (!TII->isALUInstr(MI.getOpcode()))
+      return true;
+    if (MI.getOpcode() == AMDGPU::GROUP_BARRIER)
+      return true;
+    // XXX: This can be removed once the packetizer properly handles all the
+    // LDS instruction group restrictions.
+    return TII->isLDSInstr(MI.getOpcode());
+  }
+
+  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+  // together.
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
+    MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
+    if (getSlot(*MII) == getSlot(*MIJ))
+      ConsideredInstUsesAlreadyWrittenVectorElement = true;
+    // Does MII and MIJ share the same pred_sel ?
+    int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
+        OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
+    unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
+        PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
+    if (PredI != PredJ)
+      return false;
+    if (SUJ->isSucc(SUI)) {
+      for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) {
+        const SDep &Dep = SUJ->Succs[i];
+        if (Dep.getSUnit() != SUI)
+          continue;
+        if (Dep.getKind() == SDep::Anti)
+          continue;
+        if (Dep.getKind() == SDep::Output)
+          if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
+            continue;
+        return false;
+      }
+    }
+
+    bool ARDef =
+        TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ);
+    bool ARUse =
+        TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ);
+
+    return !ARDef || !ARUse;
+  }
+
+  // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+  // and SUJ.
+  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
+    return false;
+  }
+
+  void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
+    unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
+    MI->getOperand(LastOp).setImm(Bit);
+  }
+
+  bool isBundlableWithCurrentPMI(MachineInstr &MI,
+                                 const DenseMap<unsigned, unsigned> &PV,
+                                 std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                 bool &isTransSlot) {
+    isTransSlot = TII->isTransOnly(MI);
+    assert (!isTransSlot || VLIW5);
+
+    // Is the dst reg sequence legal ?
+    if (!isTransSlot && !CurrentPacketMIs.empty()) {
+      if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) {
+        if (ConsideredInstUsesAlreadyWrittenVectorElement &&
+            !TII->isVectorOnly(MI) && VLIW5) {
+          isTransSlot = true;
+          DEBUG({
+            dbgs() << "Considering as Trans Inst :";
+            MI.dump();
+          });
+        }
+        else
+          return false;
+      }
+    }
+
+    // Are the Constants limitations met ?
+    CurrentPacketMIs.push_back(&MI);
+    if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+      DEBUG({
+        dbgs() << "Couldn't pack :\n";
+        MI.dump();
+        dbgs() << "with the following packets :\n";
+        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+          CurrentPacketMIs[i]->dump();
+          dbgs() << "\n";
+        }
+        dbgs() << "because of Consts read limitations\n";
+      });
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // Is there a BankSwizzle set that meet Read Port limitations ?
+    if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+            PV, BS, isTransSlot)) {
+      DEBUG({
+        dbgs() << "Couldn't pack :\n";
+        MI.dump();
+        dbgs() << "with the following packets :\n";
+        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+          CurrentPacketMIs[i]->dump();
+          dbgs() << "\n";
+        }
+        dbgs() << "because of Read port limitations\n";
+      });
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // We cannot read LDS source registers from the Trans slot.
+    if (isTransSlot && TII->readsLDSSrcReg(MI))
+      return false;
+
+    CurrentPacketMIs.pop_back();
+    return true;
+  }
+
+  MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override {
+    MachineBasicBlock::iterator FirstInBundle =
+        CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front();
+    const DenseMap<unsigned, unsigned> &PV =
+        getPreviousVector(FirstInBundle);
+    std::vector<R600InstrInfo::BankSwizzle> BS;
+    bool isTransSlot;
+
+    if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
+      for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
+        MachineInstr *MI = CurrentPacketMIs[i];
+        unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+            AMDGPU::OpName::bank_swizzle);
+        MI->getOperand(Op).setImm(BS[i]);
+      }
+      unsigned Op =
+          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle);
+      MI.getOperand(Op).setImm(BS.back());
+      if (!CurrentPacketMIs.empty())
+        setIsLastBit(CurrentPacketMIs.back(), 0);
+      substitutePV(MI, PV);
+      MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+      if (isTransSlot) {
+        endPacket(std::next(It)->getParent(), std::next(It));
+      }
+      return It;
+    }
+    endPacket(MI.getParent(), MI);
+    if (TII->isTransOnly(MI))
+      return MI;
+    return VLIWPacketizerList::addToPacket(MI);
+  }
+};
+
+bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
+  const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>();
+  const R600InstrInfo *TII = ST.getInstrInfo();
+
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+
+  // Instantiate the packetizer.
+  R600PacketizerList Packetizer(Fn, ST, MLI);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty())
+    return false;
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions
+  // These instructions confuse the dependence analysis. Consider:
+  // D0 = ...   (Insn 0)
+  // R0 = KILL R0, D0 (Insn 1)
+  // R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
+          (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+        RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in the
+      // instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for(;I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = std::prev(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == std::prev(RegionEnd)) {
+        RegionEnd = std::prev(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+
+}
+
+} // end anonymous namespace
+
+llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
+  return new R600Packetizer(tm);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
new file mode 100644
index 000000000000..dfdc602b80cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -0,0 +1,98 @@
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() {
+  RCW.RegWeight = 0;
+  RCW.WeightLimit = 0;
+}
+
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
+  const R600InstrInfo *TII = ST.getInstrInfo();
+
+  Reserved.set(AMDGPU::ZERO);
+  Reserved.set(AMDGPU::HALF);
+  Reserved.set(AMDGPU::ONE);
+  Reserved.set(AMDGPU::ONE_INT);
+  Reserved.set(AMDGPU::NEG_HALF);
+  Reserved.set(AMDGPU::NEG_ONE);
+  Reserved.set(AMDGPU::PV_X);
+  Reserved.set(AMDGPU::ALU_LITERAL_X);
+  Reserved.set(AMDGPU::ALU_CONST);
+  Reserved.set(AMDGPU::PREDICATE_BIT);
+  Reserved.set(AMDGPU::PRED_SEL_OFF);
+  Reserved.set(AMDGPU::PRED_SEL_ZERO);
+  Reserved.set(AMDGPU::PRED_SEL_ONE);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+
+  for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
+                        E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
+    Reserved.set(*I);
+  }
+
+  TII->reserveIndirectRegisters(Reserved, MF);
+
+  return Reserved;
+}
+
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
+  return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
+}
+
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return GET_REG_INDEX(getEncodingValue(Reg));
+}
+
+const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
+                                                                   MVT VT) const {
+  switch(VT.SimpleTy) {
+  default:
+  case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+  }
+}
+
+const RegClassWeight &R600RegisterInfo::getRegClassWeight(
+  const TargetRegisterClass *RC) const {
+  return RCW;
+}
+
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  switch (Reg) {
+  case AMDGPU::OQAP:
+  case AMDGPU::OQBP:
+  case AMDGPU::AR_X:
+    return false;
+  default:
+    return true;
+  }
+}
+
+void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                           int SPAdj,
+                                           unsigned FIOperandNum,
+                                           RegScavenger *RS) const {
+  llvm_unreachable("Subroutines not supported yet");
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
new file mode 100644
index 000000000000..9dfb3106c6cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -0,0 +1,54 @@
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600RegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+
+struct R600RegisterInfo final : public AMDGPURegisterInfo {
+  RegClassWeight RCW;
+
+  R600RegisterInfo();
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  /// \brief get the HW encoding for a register's channel.
+  unsigned getHWRegChan(unsigned reg) const;
+
+  unsigned getHWRegIndex(unsigned Reg) const;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
+
+  const RegClassWeight &
+    getRegClassWeight(const TargetRegisterClass *RC) const override;
+
+  // \returns true if \p Reg can be defined in one ALU clause and used in
+  // another.
+  bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
new file mode 100644
index 000000000000..cc667d985a82
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -0,0 +1,252 @@
+
+class R600Reg <string name, bits<16> encoding> : Register<name> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+class R600RegWithChan <string name, bits<9> sel, string chan> :
+    Register <name> {
+
+  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
+                                !if(!eq(chan, "Y"), 1,
+                                !if(!eq(chan, "Z"), 2,
+                                !if(!eq(chan, "W"), 3, 0))));
+  let HWEncoding{8-0}  = sel;
+  let HWEncoding{10-9} = chan_encoding;
+  let Namespace = "AMDGPU";
+}
+
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1, sub2, sub3];
+  let HWEncoding{8-0} = encoding{8-0};
+  let HWEncoding{10-9} = chan_encoding;
+}
+
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  field bits<2> chan_encoding = 0;
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+  let HWEncoding{8-0} = encoding{8-0};
+  let HWEncoding{10-9} = chan_encoding;
+}
+
+class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
+  "V"#lo#hi#"_"#chan,
+  [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
+  lo
+>;
+
+foreach Index = 0-127 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
+
+    // Indirect addressing offset registers
+    def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
+                                              Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y"),
+                                    !cast<Register>("T"#Index#"_Z"),
+                                    !cast<Register>("T"#Index#"_W")],
+                                   Index>;
+
+  def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y")],
+                                   Index>;
+}
+
+foreach Chan = [ "X", "Y", "Z", "W"] in {
+
+  let chan_encoding = !if(!eq(Chan, "X"), 0,
+                      !if(!eq(Chan, "Y"), 1,
+                      !if(!eq(Chan, "Z"), 2,
+                      !if(!eq(Chan, "W"), 3, 0)))) in {
+    def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
+                                   [!cast<Register>("T0_"#Chan),
+                                    !cast<Register>("T1_"#Chan),
+                                    !cast<Register>("T2_"#Chan),
+                                    !cast<Register>("T3_"#Chan)],
+                                    0>;
+    def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
+    def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
+  }
+}
+
+
+// KCACHE_BANK0
+foreach Index = 159-128 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW",
+                                 [!cast<Register>("KC0_"#Index#"_X"),
+                                  !cast<Register>("KC0_"#Index#"_Y"),
+                                  !cast<Register>("KC0_"#Index#"_Z"),
+                                  !cast<Register>("KC0_"#Index#"_W")],
+                                 Index>;
+}
+
+// KCACHE_BANK1
+foreach Index = 191-160 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW",
+                                 [!cast<Register>("KC1_"#Index#"_X"),
+                                  !cast<Register>("KC1_"#Index#"_Y"),
+                                  !cast<Register>("KC1_"#Index#"_Z"),
+                                  !cast<Register>("KC1_"#Index#"_W")],
+                                 Index>;
+}
+
+
+// Array Base Register holding input in FS
+foreach Index = 448-480 in {
+  def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>;
+}
+
+
+// Special Registers
+
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQAP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
+def ZERO : R600Reg<"0.0", 248>;
+def ONE : R600Reg<"1.0", 249>;
+def NEG_ONE : R600Reg<"-1.0", 249>;
+def ONE_INT : R600Reg<"1", 250>;
+def HALF : R600Reg<"0.5", 252>;
+def NEG_HALF : R600Reg<"-0.5", 252>;
+def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
+def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
+def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
+def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
+def PV_X : R600RegWithChan<"PV.X", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
+def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
+def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
+def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
+def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
+def AR_X : R600Reg<"AR.x", 0>;
+
+def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
+                          (add (sequence "ArrayBase%u", 448, 480))>;
+// special registers for ALU src operands
+// const buffer reference, SRCx_SEL contains index
+def ALU_CONST : R600Reg<"CBuf", 0>;
+// interpolation param reference, SRCx_SEL contains index
+def ALU_PARAM : R600Reg<"Param", 0>;
+
+let isAllocatable = 0 in {
+
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
+
+// We only use Addr_[YZW] for vertical vectors.
+// FIXME if we add more vertical vector registers we will need to ad more
+// registers to these classes.
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
+def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
+def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
+
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
+
+def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_X", 128, 159))>;
+
+def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Y", 128, 159))>;
+
+def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Z", 128, 159))>;
+
+def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_W", 128, 159))>;
+
+def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC0_X, R600_KC0_Y,
+                                               R600_KC0_Z, R600_KC0_W)>;
+
+def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_X", 160, 191))>;
+
+def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Y", 160, 191))>;
+
+def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Z", 160, 191))>;
+
+def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_W", 160, 191))>;
+
+def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC1_X, R600_KC1_Y,
+                                               R600_KC1_Z, R600_KC1_W)>;
+
+} // End isAllocatable = 0
+
+def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_X", 0, 127), AR_X)>;
+
+def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Y", 0, 127))>;
+
+def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Z", 0, 127))>;
+
+def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_W", 0, 127))>;
+
+def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_TReg32_X, R600_TReg32_Y,
+                                               R600_TReg32_Z, R600_TReg32_W)>;
+
+def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
+    R600_TReg32,
+    R600_ArrayBase,
+    R600_Addr,
+    R600_KC0, R600_KC1,
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
+    ALU_CONST, ALU_PARAM, OQAP
+    )>;
+
+def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
+    PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
+
+def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
+    PREDICATE_BIT)>;
+
+def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+                                (add (sequence "T%u_XYZW", 0, 127))> {
+  let CopyCost = -1;
+}
+
+def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+  (add V0123_W, V0123_Z, V0123_Y, V0123_X)
+>;
+
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
+
+def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                      (add V01_X, V01_Y, V01_Z, V01_W,
+                                           V23_X, V23_Y, V23_Z, V23_W)>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td
new file mode 100644
index 000000000000..70fb46c1a7d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Schedule.td
@@ -0,0 +1,49 @@
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
+// slot has been removed.
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+def XALU : InstrItinClass;
+
+def R600_VLIW5_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
+
+def R600_VLIW4_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td
new file mode 100644
index 000000000000..613a0d729bb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/R700Instructions.td
@@ -0,0 +1,21 @@
+//===-- R700Instructions.td - R700 Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen definitions for instructions which are:
+// - Available to R700 and newer VLIW4/VLIW5 GPUs
+// - Available only on R700 family GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">;
+
+let Predicates = [isR700] in {
+  def SIN_r700 : SIN_Common<0x6E>;
+  def COS_r700 : COS_Common<0x6F>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
new file mode 100644
index 000000000000..d70f52e0f295
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -0,0 +1,425 @@
+//===-- SIAnnotateControlFlow.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Annotates the control flow with hardware specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-annotate-control-flow"
+
+namespace {
+
+// Complex types used in this pass
+typedef std::pair<BasicBlock *, Value *> StackEntry;
+typedef SmallVector<StackEntry, 16> StackVector;
+
+// Intrinsic names the control flow is annotated with
+static const char *const IfIntrinsic = "llvm.amdgcn.if";
+static const char *const ElseIntrinsic = "llvm.amdgcn.else";
+static const char *const BreakIntrinsic = "llvm.amdgcn.break";
+static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
+static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
+static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
+
+class SIAnnotateControlFlow : public FunctionPass {
+  DivergenceAnalysis *DA;
+
+  Type *Boolean;
+  Type *Void;
+  Type *Int64;
+  Type *ReturnStruct;
+
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+  Constant *Int64Zero;
+
+  Constant *If;
+  Constant *Else;
+  Constant *Break;
+  Constant *IfBreak;
+  Constant *ElseBreak;
+  Constant *Loop;
+  Constant *EndCf;
+
+  DominatorTree *DT;
+  StackVector Stack;
+
+  LoopInfo *LI;
+
+  bool isUniform(BranchInst *T);
+
+  bool isTopOfStack(BasicBlock *BB);
+
+  Value *popSaved();
+
+  void push(BasicBlock *BB, Value *Saved);
+
+  bool isElse(PHINode *Phi);
+
+  void eraseIfUnused(PHINode *Phi);
+
+  void openIf(BranchInst *Term);
+
+  void insertElse(BranchInst *Term);
+
+  Value *handleLoopCondition(Value *Cond, PHINode *Broken,
+                             llvm::Loop *L, BranchInst *Term);
+
+  void handleLoop(BranchInst *Term);
+
+  void closeControlFlow(BasicBlock *BB);
+
+public:
+  static char ID;
+
+  SIAnnotateControlFlow():
+    FunctionPass(ID) { }
+
+  bool doInitialization(Module &M) override;
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "SI annotate control flow"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<DivergenceAnalysis>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
+                      "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
+                    "Annotate SI Control Flow", false, false)
+
+char SIAnnotateControlFlow::ID = 0;
+
+/// \brief Initialize all the types and constants used in the pass
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
+  LLVMContext &Context = M.getContext();
+
+  Void = Type::getVoidTy(Context);
+  Boolean = Type::getInt1Ty(Context);
+  Int64 = Type::getInt64Ty(Context);
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+  Int64Zero = ConstantInt::get(Int64, 0);
+
+  If = M.getOrInsertFunction(
+    IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
+
+  Else = M.getOrInsertFunction(
+    ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
+
+  Break = M.getOrInsertFunction(
+    BreakIntrinsic, Int64, Int64, (Type *)nullptr);
+  cast<Function>(Break)->setDoesNotAccessMemory();
+
+  IfBreak = M.getOrInsertFunction(
+    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
+  cast<Function>(IfBreak)->setDoesNotAccessMemory();;
+
+  ElseBreak = M.getOrInsertFunction(
+    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
+  cast<Function>(ElseBreak)->setDoesNotAccessMemory();
+
+  Loop = M.getOrInsertFunction(
+    LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
+
+  EndCf = M.getOrInsertFunction(
+    EndCfIntrinsic, Void, Int64, (Type *)nullptr);
+
+  return false;
+}
+
+/// \brief Is the branch condition uniform or did the StructurizeCFG pass
+/// consider it as such?
+bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
+  return DA->isUniform(T->getCondition()) ||
+         T->getMetadata("structurizecfg.uniform") != nullptr;
+}
+
+/// \brief Is BB the last block saved on the stack ?
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
+  return !Stack.empty() && Stack.back().first == BB;
+}
+
+/// \brief Pop the last saved value from the control flow stack
+Value *SIAnnotateControlFlow::popSaved() {
+  return Stack.pop_back_val().second;
+}
+
+/// \brief Push a BB and saved value to the control flow stack
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
+  Stack.push_back(std::make_pair(BB, Saved));
+}
+
+/// \brief Can the condition represented by this PHI node treated like
+/// an "Else" block?
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
+  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
+  for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+    if (Phi->getIncomingBlock(i) == IDom) {
+
+      if (Phi->getIncomingValue(i) != BoolTrue)
+        return false;
+
+    } else {
+      if (Phi->getIncomingValue(i) != BoolFalse)
+        return false;
+
+    }
+  }
+  return true;
+}
+
+// \brief Erase "Phi" if it is not used any more
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+  if (!Phi->hasNUsesOrMore(1))
+    Phi->eraseFromParent();
+}
+
+/// \brief Open a new "If" block
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
+  Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Close the last "If" block and open a new "Else" block
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
+  Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Recursively handle the condition leading to a loop
+Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
+                                             llvm::Loop *L, BranchInst *Term) {
+
+  // Only search through PHI nodes which are inside the loop.  If we try this
+  // with PHI nodes that are outside of the loop, we end up inserting new PHI
+  // nodes outside of the loop which depend on values defined inside the loop.
+  // This will break the module with
+  // 'Instruction does not dominate all users!' errors.
+  PHINode *Phi = nullptr;
+  if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
+
+    BasicBlock *Parent = Phi->getParent();
+    PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+    Value *Ret = NewPhi;
+
+    // Handle all non-constant incoming values first
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+      Value *Incoming = Phi->getIncomingValue(i);
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (isa<ConstantInt>(Incoming)) {
+        NewPhi->addIncoming(Broken, From);
+        continue;
+      }
+
+      Phi->setIncomingValue(i, BoolFalse);
+      Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
+      NewPhi->addIncoming(PhiArg, From);
+    }
+
+    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
+
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (Incoming != BoolTrue)
+        continue;
+
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (From == IDom) {
+        // We're in the following situation:
+        //   IDom/From
+        //      |   \
+        //      |   If-block
+        //      |   /
+        //     Parent
+        // where we want to break out of the loop if the If-block is not taken.
+        // Due to the depth-first traversal, there should be an end.cf
+        // intrinsic in Parent, and we insert an else.break before it.
+        //
+        // Note that the end.cf need not be the first non-phi instruction
+        // of parent, particularly when we're dealing with a multi-level
+        // break, but it should occur within a group of intrinsic calls
+        // at the beginning of the block.
+        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+        while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+          OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
+        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
+          Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
+          Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
+          continue;
+        }
+      }
+      TerminatorInst *Insert = From->getTerminator();
+      Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
+      NewPhi->setIncomingValue(i, PhiArg);
+    }
+    eraseIfUnused(Phi);
+    return Ret;
+
+  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+    BasicBlock *Parent = Inst->getParent();
+    Instruction *Insert;
+    if (L->contains(Inst)) {
+      Insert = Parent->getTerminator();
+    } else {
+      Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
+    }
+    Value *Args[] = { Cond, Broken };
+    return CallInst::Create(IfBreak, Args, "", Insert);
+
+  // Insert IfBreak before TERM for constant COND.
+  } else if (isa<ConstantInt>(Cond)) {
+    Value *Args[] = { Cond, Broken };
+    return CallInst::Create(IfBreak, Args, "", Term);
+
+  } else {
+    llvm_unreachable("Unhandled loop condition!");
+  }
+  return nullptr;
+}
+
+/// \brief Handle a back edge (loop)
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  if (isUniform(Term)) {
+    return;
+  }
+
+  BasicBlock *BB = Term->getParent();
+  llvm::Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+  BasicBlock *Target = Term->getSuccessor(1);
+  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+
+  Value *Cond = Term->getCondition();
+  Term->setCondition(BoolTrue);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
+
+  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
+       PI != PE; ++PI) {
+
+    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+  }
+
+  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+  push(Term->getSuccessor(0), Arg);
+}/// \brief Close the last opened control flow
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+  llvm::Loop *L = LI->getLoopFor(BB);
+
+  assert(Stack.back().first == BB);
+
+  if (L && L->getHeader() == BB) {
+    // We can't insert an EndCF call into a loop header, because it will
+    // get executed on every iteration of the loop, when it should be
+    // executed only once before the loop.
+    SmallVector <BasicBlock*, 8> Latches;
+    L->getLoopLatches(Latches);
+
+    std::vector<BasicBlock*> Preds;
+    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+      if (!is_contained(Latches, *PI))
+        Preds.push_back(*PI);
+    }
+    BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
+  }
+
+  Value *Exec = popSaved();
+  if (!isa<UndefValue>(Exec))
+    CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
+}
+
+/// \brief Annotate the control flow with intrinsics so the backend can
+/// recognize if/then/else and loops.
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  DA = &getAnalysis<DivergenceAnalysis>();
+
+  for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
+       E = df_end(&F.getEntryBlock()); I != E; ++I) {
+
+    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+
+    if (!Term || Term->isUnconditional()) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+
+      continue;
+    }
+
+    if (I.nodeVisited(Term->getSuccessor(1))) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+
+      handleLoop(Term);
+      continue;
+    }
+
+    if (isTopOfStack(*I)) {
+      PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
+      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+        insertElse(Term);
+        eraseIfUnused(Phi);
+        continue;
+      }
+      closeControlFlow(*I);
+    }
+    openIf(Term);
+  }
+
+  assert(Stack.empty());
+  return true;
+}
+
+/// \brief Create the annotation pass
+FunctionPass *llvm::createSIAnnotateControlFlowPass() {
+  return new SIAnnotateControlFlow();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
new file mode 100644
index 000000000000..62ebef8e91af
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -0,0 +1,96 @@
+//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Inserts one nop instruction for each high level source statement for
+/// debugger usage.
+///
+/// Tools, such as a debugger, need to pause execution based on user input (i.e.
+/// breakpoint). In order to do this, one nop instruction is inserted before the
+/// first isa instruction of each high level source statement. Further, the
+/// debugger may replace nop instructions with trap instructions based on user
+/// input.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "si-debugger-insert-nops"
+#define PASS_NAME "SI Debugger Insert Nops"
+
+namespace {
+
+class SIDebuggerInsertNops : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIDebuggerInsertNops() : MachineFunctionPass(ID) { }
+  StringRef getPassName() const override { return PASS_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false)
+
+char SIDebuggerInsertNops::ID = 0;
+char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID;
+
+FunctionPass *llvm::createSIDebuggerInsertNopsPass() {
+  return new SIDebuggerInsertNops();
+}
+
+bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) {
+  // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not
+  // specified.
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  if (!ST.debuggerInsertNops())
+    return false;
+
+  // Skip machine functions without debug info.
+  if (!MF.getMMI().hasDebugInfo())
+    return false;
+
+  // Target instruction info.
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Set containing line numbers that have nop inserted.
+  DenseSet<unsigned> NopInserted;
+
+  for (auto &MBB : MF) {
+    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+      // Skip DBG_VALUE instructions and instructions without location.
+      if (MI->isDebugValue() || !MI->getDebugLoc())
+        continue;
+
+      // Insert nop instruction if line number does not have nop inserted.
+      auto DL = MI->getDebugLoc();
+      if (NopInserted.find(DL.getLine()) == NopInserted.end()) {
+        BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+        NopInserted.insert(DL.getLine());
+      }
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
new file mode 100644
index 000000000000..ff4e32147184
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -0,0 +1,393 @@
+//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrDesc.h"
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H
+
+namespace llvm {
+
+namespace SIInstrFlags {
+// This needs to be kept in sync with the field bits in InstSI.
+enum : uint64_t {
+  // Low bits - basic encoding information.
+  SALU = 1 << 0,
+  VALU = 1 << 1,
+
+  // SALU instruction formats.
+  SOP1 = 1 << 2,
+  SOP2 = 1 << 3,
+  SOPC = 1 << 4,
+  SOPK = 1 << 5,
+  SOPP = 1 << 6,
+
+  // VALU instruction formats.
+  VOP1 = 1 << 7,
+  VOP2 = 1 << 8,
+  VOPC = 1 << 9,
+
+ // TODO: Should this be spilt into VOP3 a and b?
+  VOP3 = 1 << 10,
+
+  VINTRP = 1 << 13,
+  SDWA = 1 << 14,
+  DPP = 1 << 15,
+
+  // Memory instruction formats.
+  MUBUF = 1 << 16,
+  MTBUF = 1 << 17,
+  SMRD = 1 << 18,
+  MIMG = 1 << 19,
+  EXP = 1 << 20,
+  FLAT = 1 << 21,
+  DS = 1 << 22,
+
+  // Pseudo instruction formats.
+  VGPRSpill = 1 << 23,
+  SGPRSpill = 1 << 24,
+
+  // High bits - other information.
+  VM_CNT = UINT64_C(1) << 32,
+  EXP_CNT = UINT64_C(1) << 33,
+  LGKM_CNT = UINT64_C(1) << 34,
+
+  WQM = UINT64_C(1) << 35,
+  DisableWQM = UINT64_C(1) << 36,
+  Gather4 = UINT64_C(1) << 37,
+  SOPK_ZEXT = UINT64_C(1) << 38,
+  SCALAR_STORE = UINT64_C(1) << 39,
+  FIXED_SIZE = UINT64_C(1) << 40,
+  VOPAsmPrefer32Bit = UINT64_C(1) << 41
+
+};
+
+// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
+// The result is true if any of these tests are true.
+enum ClassFlags {
+  S_NAN = 1 << 0,        // Signaling NaN
+  Q_NAN = 1 << 1,        // Quiet NaN
+  N_INFINITY = 1 << 2,   // Negative infinity
+  N_NORMAL = 1 << 3,     // Negative normal
+  N_SUBNORMAL = 1 << 4,  // Negative subnormal
+  N_ZERO = 1 << 5,       // Negative zero
+  P_ZERO = 1 << 6,       // Positive zero
+  P_SUBNORMAL = 1 << 7,  // Positive subnormal
+  P_NORMAL = 1 << 8,     // Positive normal
+  P_INFINITY = 1 << 9    // Positive infinity
+};
+}
+
+namespace AMDGPU {
+  enum OperandType {
+    /// Operands with register or 32-bit immediate
+    OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
+    OPERAND_REG_IMM_INT64,
+    OPERAND_REG_IMM_INT16,
+    OPERAND_REG_IMM_FP32,
+    OPERAND_REG_IMM_FP64,
+    OPERAND_REG_IMM_FP16,
+
+    /// Operands with register or inline constant
+    OPERAND_REG_INLINE_C_INT16,
+    OPERAND_REG_INLINE_C_INT32,
+    OPERAND_REG_INLINE_C_INT64,
+    OPERAND_REG_INLINE_C_FP16,
+    OPERAND_REG_INLINE_C_FP32,
+    OPERAND_REG_INLINE_C_FP64,
+
+    OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
+    OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+
+    OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
+    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+
+    OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
+    OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
+
+    // Operand for source modifiers for VOP instructions
+    OPERAND_INPUT_MODS,
+
+    /// Operand with 32-bit immediate that uses the constant bus.
+    OPERAND_KIMM32,
+    OPERAND_KIMM16
+  };
+}
+
+// Input operand modifiers bit-masks
+// NEG and SEXT share same bit-mask because they can't be set simultaneously.
+namespace SISrcMods {
+  enum {
+   NEG = 1 << 0,  // Floating-point negate modifier
+   ABS = 1 << 1,  // Floating-point absolute modifier
+   SEXT = 1 << 0  // Integer sign-extend modifier
+  };
+}
+
+namespace SIOutMods {
+  enum {
+    NONE = 0,
+    MUL2 = 1,
+    MUL4 = 2,
+    DIV2 = 3
+  };
+}
+
+namespace VGPRIndexMode {
+  enum {
+    SRC0_ENABLE = 1 << 0,
+    SRC1_ENABLE = 1 << 1,
+    SRC2_ENABLE = 1 << 2,
+    DST_ENABLE = 1 << 3
+  };
+}
+
+namespace AMDGPUAsmVariants {
+  enum {
+    DEFAULT = 0,
+    VOP3 = 1,
+    SDWA = 2,
+    DPP = 3
+  };
+}
+
+namespace AMDGPU {
+namespace EncValues { // Encoding values of enum9/8/7 operands
+
+enum {
+  SGPR_MIN = 0,
+  SGPR_MAX = 101,
+  TTMP_MIN = 112,
+  TTMP_MAX = 123,
+  INLINE_INTEGER_C_MIN = 128,
+  INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64
+  INLINE_INTEGER_C_MAX = 208,
+  INLINE_FLOATING_C_MIN = 240,
+  INLINE_FLOATING_C_MAX = 248,
+  LITERAL_CONST = 255,
+  VGPR_MIN = 256,
+  VGPR_MAX = 511
+};
+
+} // namespace EncValues
+} // namespace AMDGPU
+
+namespace AMDGPU {
+namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns.
+
+enum Id { // Message ID, width(4) [3:0].
+  ID_UNKNOWN_ = -1,
+  ID_INTERRUPT = 1,
+  ID_GS,
+  ID_GS_DONE,
+  ID_SYSMSG = 15,
+  ID_GAPS_LAST_, // Indicate that sequence has gaps.
+  ID_GAPS_FIRST_ = ID_INTERRUPT,
+  ID_SHIFT_ = 0,
+  ID_WIDTH_ = 4,
+  ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Op { // Both GS and SYS operation IDs.
+  OP_UNKNOWN_ = -1,
+  OP_SHIFT_ = 4,
+  // width(2) [5:4]
+  OP_GS_NOP = 0,
+  OP_GS_CUT,
+  OP_GS_EMIT,
+  OP_GS_EMIT_CUT,
+  OP_GS_LAST_,
+  OP_GS_FIRST_ = OP_GS_NOP,
+  OP_GS_WIDTH_ = 2,
+  OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_),
+  // width(3) [6:4]
+  OP_SYS_ECC_ERR_INTERRUPT = 1,
+  OP_SYS_REG_RD,
+  OP_SYS_HOST_TRAP_ACK,
+  OP_SYS_TTRACE_PC,
+  OP_SYS_LAST_,
+  OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
+  OP_SYS_WIDTH_ = 3,
+  OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_)
+};
+
+enum StreamId { // Stream ID, (2) [9:8].
+  STREAM_ID_DEFAULT_ = 0,
+  STREAM_ID_LAST_ = 4,
+  STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_,
+  STREAM_ID_SHIFT_ = 8,
+  STREAM_ID_WIDTH_=  2,
+  STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_)
+};
+
+} // namespace SendMsg
+
+namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns.
+
+enum Id { // HwRegCode, (6) [5:0]
+  ID_UNKNOWN_ = -1,
+  ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined.
+  ID_MODE = 1,
+  ID_STATUS = 2,
+  ID_TRAPSTS = 3,
+  ID_HW_ID = 4,
+  ID_GPR_ALLOC = 5,
+  ID_LDS_ALLOC = 6,
+  ID_IB_STS = 7,
+  ID_SYMBOLIC_LAST_ = 8,
+  ID_SHIFT_ = 0,
+  ID_WIDTH_ = 6,
+  ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
+};
+
+enum Offset { // Offset, (5) [10:6]
+  OFFSET_DEFAULT_ = 0,
+  OFFSET_SHIFT_ = 6,
+  OFFSET_WIDTH_ = 5,
+  OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+};
+
+enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
+  WIDTH_M1_DEFAULT_ = 31,
+  WIDTH_M1_SHIFT_ = 11,
+  WIDTH_M1_WIDTH_ = 5,
+  WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+};
+
+} // namespace Hwreg
+
+namespace SDWA {
+
+enum SdwaSel {
+  BYTE_0 = 0,
+  BYTE_1 = 1,
+  BYTE_2 = 2,
+  BYTE_3 = 3,
+  WORD_0 = 4,
+  WORD_1 = 5,
+  DWORD = 6,
+};
+
+enum DstUnused {
+  UNUSED_PAD = 0,
+  UNUSED_SEXT = 1,
+  UNUSED_PRESERVE = 2,
+};
+
+} // namespace SDWA
+} // namespace AMDGPU
+
+#define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
+#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
+#define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
+#define R_00B128_SPI_SHADER_PGM_RSRC1_VS                                0x00B128
+#define R_00B228_SPI_SHADER_PGM_RSRC1_GS                                0x00B228
+#define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
+#define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
+
+#define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
+#define   S_00B84C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B84C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B84C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B84C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B84C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
+#define   G_00B84C_TGID_X_EN(x)                                       (((x) >> 7) & 0x1)
+#define   C_00B84C_TGID_X_EN                                          0xFFFFFF7F
+#define   S_00B84C_TGID_Y_EN(x)                                       (((x) & 0x1) << 8)
+#define   G_00B84C_TGID_Y_EN(x)                                       (((x) >> 8) & 0x1)
+#define   C_00B84C_TGID_Y_EN                                          0xFFFFFEFF
+#define   S_00B84C_TGID_Z_EN(x)                                       (((x) & 0x1) << 9)
+#define   G_00B84C_TGID_Z_EN(x)                                       (((x) >> 9) & 0x1)
+#define   C_00B84C_TGID_Z_EN                                          0xFFFFFDFF
+#define   S_00B84C_TG_SIZE_EN(x)                                      (((x) & 0x1) << 10)
+#define   G_00B84C_TG_SIZE_EN(x)                                      (((x) >> 10) & 0x1)
+#define   C_00B84C_TG_SIZE_EN                                         0xFFFFFBFF
+#define   S_00B84C_TIDIG_COMP_CNT(x)                                  (((x) & 0x03) << 11)
+#define   G_00B84C_TIDIG_COMP_CNT(x)                                  (((x) >> 11) & 0x03)
+#define   C_00B84C_TIDIG_COMP_CNT                                     0xFFFFE7FF
+/* CIK */
+#define   S_00B84C_EXCP_EN_MSB(x)                                     (((x) & 0x03) << 13)
+#define   G_00B84C_EXCP_EN_MSB(x)                                     (((x) >> 13) & 0x03)
+#define   C_00B84C_EXCP_EN_MSB                                        0xFFFF9FFF
+/*     */
+#define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
+#define   G_00B84C_LDS_SIZE(x)                                        (((x) >> 15) & 0x1FF)
+#define   C_00B84C_LDS_SIZE                                           0xFF007FFF
+#define   S_00B84C_EXCP_EN(x)                                         (((x) & 0x7F) << 24)
+#define   G_00B84C_EXCP_EN(x)                                         (((x) >> 24) & 0x7F)
+#define   C_00B84C_EXCP_EN
+
+#define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
+#define R_0286D0_SPI_PS_INPUT_ADDR                                      0x0286D0
+
+#define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
+#define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B848_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B848_VGPRS                                              0xFFFFFFC0
+#define   S_00B848_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B848_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B848_SGPRS                                              0xFFFFFC3F
+#define   S_00B848_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B848_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B848_PRIORITY                                           0xFFFFF3FF
+#define   S_00B848_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B848_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B848_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B848_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B848_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B848_PRIV                                               0xFFEFFFFF
+#define   S_00B848_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B848_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B848_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B848_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B848_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B848_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B848_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B848_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B848_IEEE_MODE                                          0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
+#define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
+#define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+
+#define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8
+#define   S_0286E8_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+
+#define R_SPILLED_SGPRS         0x4
+#define R_SPILLED_VGPRS         0x8
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
new file mode 100644
index 000000000000..d4d3959658e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
@@ -0,0 +1,88 @@
+//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Spilling of EXEC masks used for control flow messes up control flow
+/// lowering, so mark all live intervals associated with CF instructions as
+/// non-spillable.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-cf-live-intervals"
+
+namespace {
+
+class SIFixControlFlowLiveIntervals : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) {
+    initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Fix CF Live Intervals"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+                      "SI Fix CF Live Intervals", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+                    "SI Fix CF Live Intervals", false, false)
+
+char SIFixControlFlowLiveIntervals::ID = 0;
+
+char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID;
+
+FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() {
+  return new SIFixControlFlowLiveIntervals();
+}
+
+bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      switch (MI.getOpcode()) {
+        case AMDGPU::SI_IF:
+        case AMDGPU::SI_ELSE:
+        case AMDGPU::SI_BREAK:
+        case AMDGPU::SI_IF_BREAK:
+        case AMDGPU::SI_ELSE_BREAK:
+        case AMDGPU::SI_END_CF: {
+          unsigned Reg = MI.getOperand(0).getReg();
+          LIS->getInterval(Reg).markNotSpillable();
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
new file mode 100644
index 000000000000..6a422e70fe1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -0,0 +1,462 @@
+//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Copies from VGPR to SGPR registers are illegal and the register coalescer
+/// will sometimes generate these illegal copies in situations like this:
+///
+///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+///    ...
+///    BRANCH %cond BB1, BB2
+///  BB1:
+///    %vreg2 <vgpr> = VECTOR_INST
+///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+///  BB2:
+///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
+///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
+///
+/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
+/// code will look like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now that the result of the PHI instruction is an SGPR, the register
+/// allocator is now forced to constrain the register class of %vreg3 to
+/// <sgpr> so we end up with final code like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
+///
+/// In order to avoid this problem, this pass searches for PHI instructions
+/// which define a <vsrc> register and constrains its definition class to
+/// <vgpr> if the user of the PHI's definition register is a vector instruction.
+/// If the PHI's definition class is constrained to <vgpr> then the coalescer
+/// will be unable to perform the COPY removal from the above example  which
+/// ultimately led to the creation of an illegal COPY.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-sgpr-copies"
+
+namespace {
+
+class SIFixSGPRCopies : public MachineFunctionPass {
+
+  MachineDominatorTree *MDT;
+
+public:
+  static char ID;
+
+  SIFixSGPRCopies() : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Fix SGPR copies"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
+                     "SI Fix SGPR copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
+                     "SI Fix SGPR copies", false, false)
+
+
+char SIFixSGPRCopies::ID = 0;
+
+char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass() {
+  return new SIFixSGPRCopies();
+}
+
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    if (!MI.getOperand(i).isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+      continue;
+
+    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+      return true;
+  }
+  return false;
+}
+
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+                  const SIRegisterInfo &TRI,
+                  const MachineRegisterInfo &MRI) {
+  unsigned DstReg = Copy.getOperand(0).getReg();
+  unsigned SrcReg = Copy.getOperand(1).getReg();
+
+  const TargetRegisterClass *SrcRC =
+    TargetRegisterInfo::isVirtualRegister(SrcReg) ?
+    MRI.getRegClass(SrcReg) :
+    TRI.getPhysRegClass(SrcReg);
+
+  // We don't really care about the subregister here.
+  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
+
+  const TargetRegisterClass *DstRC =
+    TargetRegisterInfo::isVirtualRegister(DstReg) ?
+    MRI.getRegClass(DstReg) :
+    TRI.getPhysRegClass(DstReg);
+
+  return std::make_pair(SrcRC, DstRC);
+}
+
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+}
+
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+}
+
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+// SGPRx = ...
+// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+// VGPRz = COPY SGPRy
+//
+// ==>
+//
+// VGPRx = COPY SGPRx
+// VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+                                        const SIRegisterInfo *TRI,
+                                        const SIInstrInfo *TII,
+                                        MachineRegisterInfo &MRI) {
+  assert(MI.isRegSequence());
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+    return false;
+
+  if (!MRI.hasOneUse(DstReg))
+    return false;
+
+  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+  if (!CopyUse.isCopy())
+    return false;
+
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
+
+  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+    return false;
+
+  // TODO: Could have multiple extracts?
+  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+  if (SubReg != AMDGPU::NoSubRegister)
+    return false;
+
+  MRI.setRegClass(DstReg, DstRC);
+
+  // SGPRx = ...
+  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+  // VGPRz = COPY SGPRy
+
+  // =>
+  // VGPRx = COPY SGPRx
+  // VGPRz = REG_SEQUENCE VGPRx, sub0
+
+  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+
+  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+    unsigned SrcReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    assert(TRI->isSGPRClass(SrcRC) &&
+           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+      .addOperand(MI.getOperand(I));
+
+    MI.getOperand(I).setReg(TmpReg);
+  }
+
+  CopyUse.eraseFromParent();
+  return true;
+}
+
+static bool phiHasVGPROperands(const MachineInstr &PHI,
+                               const MachineRegisterInfo &MRI,
+                               const SIRegisterInfo *TRI,
+                               const SIInstrInfo *TII) {
+
+  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
+    unsigned Reg = PHI.getOperand(i).getReg();
+    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
+      return true;
+  }
+  return false;
+}
+static bool phiHasBreakDef(const MachineInstr &PHI,
+                           const MachineRegisterInfo &MRI,
+                           SmallSet<unsigned, 8> &Visited) {
+
+  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
+    unsigned Reg = PHI.getOperand(i).getReg();
+    if (Visited.count(Reg))
+      continue;
+
+    Visited.insert(Reg);
+
+    MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
+    assert(DefInstr);
+    switch (DefInstr->getOpcode()) {
+    default:
+      break;
+    case AMDGPU::SI_BREAK:
+    case AMDGPU::SI_IF_BREAK:
+    case AMDGPU::SI_ELSE_BREAK:
+      return true;
+    case AMDGPU::PHI:
+      if (phiHasBreakDef(*DefInstr, MRI, Visited))
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI) {
+  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
+       E = MBB.end(); I != E; ++I) {
+    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
+      return true;
+  }
+  return false;
+}
+
+static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
+                                    const MachineInstr *MoveImm,
+                                    const SIInstrInfo *TII,
+                                    unsigned &SMovOp,
+                                    int64_t &Imm) {
+
+  if (!MoveImm->isMoveImmediate())
+    return false;
+
+  const MachineOperand *ImmOp =
+      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
+  if (!ImmOp->isImm())
+    return false;
+
+  // FIXME: Handle copies with sub-regs.
+  if (Copy->getOperand(0).getSubReg())
+    return false;
+
+  switch (MoveImm->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::V_MOV_B32_e32:
+    SMovOp = AMDGPU::S_MOV_B32;
+    break;
+  case AMDGPU::V_MOV_B64_PSEUDO:
+    SMovOp = AMDGPU::S_MOV_B64;
+    break;
+  }
+  Imm = ImmOp->getImm();
+  return true;
+}
+
+bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+
+  SmallVector<MachineInstr *, 16> Worklist;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      default:
+        continue;
+      case AMDGPU::COPY: {
+        // If the destination register is a physical register there isn't really
+        // much we can do to fix this.
+        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
+          continue;
+
+        const TargetRegisterClass *SrcRC, *DstRC;
+        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
+          MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+          unsigned SMovOp;
+          int64_t Imm;
+          // If we are just copying an immediate, we can replace the copy with
+          // s_mov_b32.
+          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
+            MI.getOperand(1).ChangeToImmediate(Imm);
+            MI.addImplicitDefUseOperands(MF);
+            MI.setDesc(TII->get(SMovOp));
+            break;
+          }
+          TII->moveToVALU(MI);
+        }
+
+        break;
+      }
+      case AMDGPU::PHI: {
+        unsigned Reg = MI.getOperand(0).getReg();
+        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+          break;
+
+        // We don't need to fix the PHI if the common dominator of the
+        // two incoming blocks terminates with a uniform branch.
+        if (MI.getNumExplicitOperands() == 5) {
+          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+          MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
+          if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+            DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
+            break;
+          }
+        }
+
+        // If a PHI node defines an SGPR and any of its operands are VGPRs,
+        // then we need to move it to the VALU.
+        //
+        // Also, if a PHI node defines an SGPR and has all SGPR operands
+        // we must move it to the VALU, because the SGPR operands will
+        // all end up being assigned the same register, which means
+        // there is a potential for a conflict if different threads take
+        // different control flow paths.
+        //
+        // For Example:
+        //
+        // sgpr0 = def;
+        // ...
+        // sgpr1 = def;
+        // ...
+        // sgpr2 = PHI sgpr0, sgpr1
+        // use sgpr2;
+        //
+        // Will Become:
+        //
+        // sgpr2 = def;
+        // ...
+        // sgpr2 = def;
+        // ...
+        // use sgpr2
+        //
+        // The one exception to this rule is when one of the operands
+        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
+        // instruction.  In this case, there we know the program will
+        // never enter the second block (the loop) without entering
+        // the first block (where the condition is computed), so there
+        // is no chance for values to be over-written.
+
+        SmallSet<unsigned, 8> Visited;
+        if (phiHasVGPROperands(MI, MRI, TRI, TII) ||
+            !phiHasBreakDef(MI, MRI, Visited)) {
+          DEBUG(dbgs() << "Fixing PHI: " << MI);
+          TII->moveToVALU(MI);
+        }
+        break;
+      }
+      case AMDGPU::REG_SEQUENCE: {
+        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+            !hasVGPROperands(MI, TRI)) {
+          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+          continue;
+        }
+
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+
+        TII->moveToVALU(MI);
+        break;
+      }
+      case AMDGPU::INSERT_SUBREG: {
+        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+        if (TRI->isSGPRClass(DstRC) &&
+            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+          TII->moveToVALU(MI);
+        }
+        break;
+      }
+      }
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
new file mode 100644
index 000000000000..831ac5948a68
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -0,0 +1,622 @@
+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-fold-operands"
+using namespace llvm;
+
+namespace {
+
+class SIFoldOperands : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFoldOperands() : MachineFunctionPass(ID) {
+    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Fold Operands"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+struct FoldCandidate {
+  MachineInstr *UseMI;
+  union {
+    MachineOperand *OpToFold;
+    uint64_t ImmToFold;
+    int FrameIndexToFold;
+  };
+  unsigned char UseOpNo;
+  MachineOperand::MachineOperandType Kind;
+
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
+    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+    if (FoldOp->isImm()) {
+      ImmToFold = FoldOp->getImm();
+    } else if (FoldOp->isFI()) {
+      FrameIndexToFold = FoldOp->getIndex();
+    } else {
+      assert(FoldOp->isReg());
+      OpToFold = FoldOp;
+    }
+  }
+
+  bool isFI() const {
+    return Kind == MachineOperand::MO_FrameIndex;
+  }
+
+  bool isImm() const {
+    return Kind == MachineOperand::MO_Immediate;
+  }
+
+  bool isReg() const {
+    return Kind == MachineOperand::MO_Register;
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
+                "SI Fold Operands", false, false)
+
+char SIFoldOperands::ID = 0;
+
+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+
+FunctionPass *llvm::createSIFoldOperandsPass() {
+  return new SIFoldOperands();
+}
+
+static bool isSafeToFold(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    // If there are additional implicit register operands, this may be used for
+    // register indexing so the source register operand isn't simply copied.
+    unsigned NumOps = MI.getDesc().getNumOperands() +
+      MI.getDesc().getNumImplicitUses();
+
+    return MI.getNumOperands() == NumOps;
+  }
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool updateOperand(FoldCandidate &Fold,
+                          const TargetRegisterInfo &TRI) {
+  MachineInstr *MI = Fold.UseMI;
+  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+  assert(Old.isReg());
+
+  if (Fold.isImm()) {
+    Old.ChangeToImmediate(Fold.ImmToFold);
+    return true;
+  }
+
+  if (Fold.isFI()) {
+    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    return true;
+  }
+
+  MachineOperand *New = Fold.OpToFold;
+  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
+      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
+    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+    return true;
+  }
+
+  // FIXME: Handle physical registers.
+
+  return false;
+}
+
+static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
+                              const MachineInstr *MI) {
+  for (auto Candidate : FoldList) {
+    if (Candidate.UseMI == MI)
+      return true;
+  }
+  return false;
+}
+
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+                             MachineInstr *MI, unsigned OpNo,
+                             MachineOperand *OpToFold,
+                             const SIInstrInfo *TII) {
+  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+
+    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
+    unsigned Opc = MI->getOpcode();
+    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
+        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
+      bool IsF32  = Opc == AMDGPU::V_MAC_F32_e64;
+
+      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
+      // to fold the operand.
+      MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
+      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+      if (FoldAsMAD) {
+        MI->untieRegOperand(OpNo);
+        return true;
+      }
+      MI->setDesc(TII->get(Opc));
+    }
+
+    // Special case for s_setreg_b32
+    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      return true;
+    }
+
+    // If we are already folding into another operand of MI, then
+    // we can't commute the instruction, otherwise we risk making the
+    // other fold illegal.
+    if (isUseMIInFoldList(FoldList, MI))
+      return false;
+
+    // Operand is not legal, so try to commute the instruction to
+    // see if this makes it possible to fold.
+    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
+
+    if (CanCommute) {
+      if (CommuteIdx0 == OpNo)
+        OpNo = CommuteIdx1;
+      else if (CommuteIdx1 == OpNo)
+        OpNo = CommuteIdx0;
+    }
+
+    // One of operands might be an Imm operand, and OpNo may refer to it after
+    // the call of commuteInstruction() below. Such situations are avoided
+    // here explicitly as OpNo must be a register operand to be a candidate
+    // for memory folding.
+    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
+                       !MI->getOperand(CommuteIdx1).isReg()))
+      return false;
+
+    if (!CanCommute ||
+        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
+      return false;
+
+    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+      return false;
+  }
+
+  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  return true;
+}
+
+// If the use operand doesn't care about the value, this may be an operand only
+// used for register indexing, in which case it is unsafe to fold.
+static bool isUseSafeToFold(const MachineInstr &MI,
+                            const MachineOperand &UseMO) {
+  return !UseMO.isUndef();
+  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
+}
+
+static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
+                        unsigned UseOpIdx,
+                        std::vector<FoldCandidate> &FoldList,
+                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
+                        MachineRegisterInfo &MRI) {
+  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+
+  if (!isUseSafeToFold(*UseMI, UseOp))
+    return;
+
+  // FIXME: Fold operands with subregs.
+  if (UseOp.isReg() && OpToFold.isReg()) {
+    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
+      return;
+
+    // Don't fold subregister extracts into tied operands, only if it is a full
+    // copy since a subregister use tied to a full register def doesn't really
+    // make sense. e.g. don't fold:
+    //
+    // %vreg1 = COPY %vreg0:sub1
+    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
+    //
+    //  into
+    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
+    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
+      return;
+  }
+
+  // Special case for REG_SEQUENCE: We can't fold literals into
+  // REG_SEQUENCE instructions, so we have to fold them into the
+  // uses of REG_SEQUENCE.
+  if (UseMI->isRegSequence()) {
+    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+
+    for (MachineRegisterInfo::use_iterator
+           RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
+         RSUse != RSE; ++RSUse) {
+
+      MachineInstr *RSUseMI = RSUse->getParent();
+      if (RSUse->getSubReg() != RegSeqDstSubReg)
+        continue;
+
+      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+                  CopiesToReplace, TII, TRI, MRI);
+    }
+
+    return;
+  }
+
+
+  bool FoldingImm = OpToFold.isImm();
+
+  // In order to fold immediates into copies, we need to change the
+  // copy to a MOV.
+  if (FoldingImm && UseMI->isCopy()) {
+    unsigned DestReg = UseMI->getOperand(0).getReg();
+    const TargetRegisterClass *DestRC
+      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+      MRI.getRegClass(DestReg) :
+      TRI.getPhysRegClass(DestReg);
+
+    unsigned MovOp = TII->getMovOpcode(DestRC);
+    if (MovOp == AMDGPU::COPY)
+      return;
+
+    UseMI->setDesc(TII->get(MovOp));
+    CopiesToReplace.push_back(UseMI);
+  } else {
+    const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+    // Don't fold into target independent nodes.  Target independent opcodes
+    // don't have defined register classes.
+    if (UseDesc.isVariadic() ||
+        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
+      return;
+  }
+
+  if (!FoldingImm) {
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+
+    // FIXME: We could try to change the instruction from 64-bit to 32-bit
+    // to enable more folding opportunites.  The shrink operands pass
+    // already does this.
+    return;
+  }
+
+
+  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
+  const TargetRegisterClass *FoldRC =
+    TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+
+  APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
+            OpToFold.getImm());
+
+  // Split 64-bit constants into 32-bits for folding.
+  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
+    unsigned UseReg = UseOp.getReg();
+    const TargetRegisterClass *UseRC
+      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+      MRI.getRegClass(UseReg) :
+      TRI.getPhysRegClass(UseReg);
+
+    assert(Imm.getBitWidth() == 64);
+
+    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
+      return;
+
+    if (UseOp.getSubReg() == AMDGPU::sub0) {
+      Imm = Imm.getLoBits(32);
+    } else {
+      assert(UseOp.getSubReg() == AMDGPU::sub1);
+      Imm = Imm.getHiBits(32);
+    }
+  }
+
+  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+}
+
+static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
+                                  int32_t LHS, int32_t RHS) {
+  switch (Opcode) {
+  case AMDGPU::V_AND_B32_e64:
+  case AMDGPU::S_AND_B32:
+    Result = LHS & RHS;
+    return true;
+  case AMDGPU::V_OR_B32_e64:
+  case AMDGPU::S_OR_B32:
+    Result = LHS | RHS;
+    return true;
+  case AMDGPU::V_XOR_B32_e64:
+  case AMDGPU::S_XOR_B32:
+    Result = LHS ^ RHS;
+    return true;
+  default:
+    return false;
+  }
+}
+
+static unsigned getMovOpc(bool IsScalar) {
+  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+}
+
+/// Remove any leftover implicit operands from mutating the instruction. e.g.
+/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
+/// anymore.
+static void stripExtraCopyOperands(MachineInstr &MI) {
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned NumOps = Desc.getNumOperands() +
+                    Desc.getNumImplicitUses() +
+                    Desc.getNumImplicitDefs();
+
+  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
+    MI.RemoveOperand(I);
+}
+
+static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
+  MI.setDesc(NewDesc);
+  stripExtraCopyOperands(MI);
+}
+
+// Try to simplify operations with a constant that may appear after instruction
+// selection.
+static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
+                              const SIInstrInfo *TII,
+                              MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
+      Opc == AMDGPU::S_NOT_B32) {
+    MachineOperand &Src0 = MI->getOperand(1);
+    if (Src0.isImm()) {
+      Src0.setImm(~Src0.getImm());
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
+      return true;
+    }
+
+    return false;
+  }
+
+  if (!MI->isCommutable())
+    return false;
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+
+  MachineOperand *Src0 = &MI->getOperand(Src0Idx);
+  MachineOperand *Src1 = &MI->getOperand(Src1Idx);
+  if (!Src0->isImm() && !Src1->isImm())
+    return false;
+
+  // and k0, k1 -> v_mov_b32 (k0 & k1)
+  // or k0, k1 -> v_mov_b32 (k0 | k1)
+  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
+  if (Src0->isImm() && Src1->isImm()) {
+    int32_t NewImm;
+    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
+      return false;
+
+    const SIRegisterInfo &TRI = TII->getRegisterInfo();
+    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
+
+    Src0->setImm(NewImm);
+    MI->RemoveOperand(Src1Idx);
+    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
+    return true;
+  }
+
+  if (Src0->isImm() && !Src1->isImm()) {
+    std::swap(Src0, Src1);
+    std::swap(Src0Idx, Src1Idx);
+  }
+
+  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
+  if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
+    if (Src1Val == 0) {
+      // y = or x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+    } else if (Src1Val == -1) {
+      // y = or x, -1 => y = v_mov_b32 -1
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
+      MI->getOpcode() == AMDGPU::S_AND_B32) {
+    if (Src1Val == 0) {
+      // y = and x, 0 => y = v_mov_b32 0
+      MI->RemoveOperand(Src0Idx);
+      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
+    } else if (Src1Val == -1) {
+      // y = and x, -1 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+      stripExtraCopyOperands(*MI);
+    } else
+      return false;
+
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
+      MI->getOpcode() == AMDGPU::S_XOR_B32) {
+    if (Src1Val == 0) {
+      // y = xor x, 0 => y = copy x
+      MI->RemoveOperand(Src1Idx);
+      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
+    }
+  }
+
+  return false;
+}
+
+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (!isSafeToFold(MI))
+        continue;
+
+      MachineOperand &OpToFold = MI.getOperand(1);
+      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+
+      // FIXME: We could also be folding things like FrameIndexes and
+      // TargetIndexes.
+      if (!FoldingImm && !OpToFold.isReg())
+        continue;
+
+      if (OpToFold.isReg() &&
+          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
+        continue;
+
+      // Prevent folding operands backwards in the function. For example,
+      // the COPY opcode must not be replaced by 1 in this example:
+      //
+      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
+      //    ...
+      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
+      MachineOperand &Dst = MI.getOperand(0);
+      if (Dst.isReg() &&
+          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+        continue;
+
+      // We need mutate the operands of new mov instructions to add implicit
+      // uses of EXEC, but adding them invalidates the use_iterator, so defer
+      // this.
+      SmallVector<MachineInstr *, 4> CopiesToReplace;
+
+      std::vector<FoldCandidate> FoldList;
+      if (FoldingImm) {
+        unsigned NumLiteralUses = 0;
+        MachineOperand *NonInlineUse = nullptr;
+        int NonInlineUseOpNo = -1;
+
+        // Try to fold any inline immediate uses, and then only fold other
+        // constants if they have one use.
+        //
+        // The legality of the inline immediate must be checked based on the use
+        // operand, not the defining instruction, because 32-bit instructions
+        // with 32-bit inline immediate sources may be used to materialize
+        // constants used in 16-bit operands.
+        //
+        // e.g. it is unsafe to fold:
+        //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
+        //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
+
+        // Folding immediates with more than one use will increase program size.
+        // FIXME: This will also reduce register usage, which may be better
+        // in some cases. A better heuristic is needed.
+        for (MachineRegisterInfo::use_iterator
+               Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
+             Use != E; ++Use) {
+          MachineInstr *UseMI = Use->getParent();
+          unsigned OpNo = Use.getOperandNo();
+
+          if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
+            foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                        CopiesToReplace, TII, TRI, MRI);
+          } else {
+            if (++NumLiteralUses == 1) {
+              NonInlineUse = &*Use;
+              NonInlineUseOpNo = OpNo;
+            }
+          }
+        }
+
+        if (NumLiteralUses == 1) {
+          MachineInstr *UseMI = NonInlineUse->getParent();
+          foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
+                      CopiesToReplace, TII, TRI, MRI);
+        }
+      } else {
+        // Folding register.
+        for (MachineRegisterInfo::use_iterator
+               Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
+             Use != E; ++Use) {
+          MachineInstr *UseMI = Use->getParent();
+
+          foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
+                      CopiesToReplace, TII, TRI, MRI);
+        }
+      }
+
+      // Make sure we add EXEC uses to any new v_mov instructions created.
+      for (MachineInstr *Copy : CopiesToReplace)
+        Copy->addImplicitDefUseOperands(MF);
+
+      for (FoldCandidate &Fold : FoldList) {
+        if (updateOperand(Fold, TRI)) {
+          // Clear kill flags.
+          if (Fold.isReg()) {
+            assert(Fold.OpToFold && Fold.OpToFold->isReg());
+            // FIXME: Probably shouldn't bother trying to fold if not an
+            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+            // copies.
+            MRI.clearKillFlags(Fold.OpToFold->getReg());
+          }
+          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
+                static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+
+          // Folding the immediate may reveal operations that can be constant
+          // folded or replaced with a copy. This can happen for example after
+          // frame indices are lowered to constants or from splitting 64-bit
+          // constants.
+          tryConstantFoldOp(MRI, TII, Fold.UseMI);
+        }
+      }
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
new file mode 100644
index 000000000000..d0a69eafc58e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -0,0 +1,400 @@
+//===----------------------- SIFrameLowering.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+
+using namespace llvm;
+
+
+static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
+                                         const SIRegisterInfo *TRI) {
+  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
+                      TRI->getMaxNumSGPRs(MF) / 4);
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
+                                       const SIRegisterInfo *TRI) {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+                      TRI->getMaxNumSGPRs(MF));
+}
+
+void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
+                                          const SIRegisterInfo* TRI,
+                                          MachineFunction &MF,
+                                          MachineBasicBlock &MBB) const {
+  // We don't need this if we only have spills since there is no user facing
+  // scratch.
+
+  // TODO: If we know we don't have flat instructions earlier, we can omit
+  // this from the input registers.
+  //
+  // TODO: We only need to know if we access scratch space through a flat
+  // pointer. Because we only detect if flat instructions are used at all,
+  // this will be used more often than necessary on VI.
+
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
+  DebugLoc DL;
+  MachineBasicBlock::iterator I = MBB.begin();
+
+  unsigned FlatScratchInitReg
+    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.addLiveIn(FlatScratchInitReg);
+  MBB.addLiveIn(FlatScratchInitReg);
+
+  // Copy the size in bytes.
+  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+    .addReg(FlatScrInitHi, RegState::Kill);
+
+  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+
+  // Add wave offset in bytes to private base offset.
+  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+    .addReg(FlatScrInitLo)
+    .addReg(ScratchWaveOffsetReg);
+
+  // Convert offset to 256-byte units.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+    .addReg(FlatScrInitLo, RegState::Kill)
+    .addImm(8);
+}
+
+unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
+  const SISubtarget &ST,
+  const SIInstrInfo *TII,
+  const SIRegisterInfo *TRI,
+  SIMachineFunctionInfo *MFI,
+  MachineFunction &MF) const {
+
+  // We need to insert initialization of the scratch resource descriptor.
+  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRsrcReg == AMDGPU::NoRegister)
+    return AMDGPU::NoRegister;
+
+  if (ST.hasSGPRInitBug() ||
+      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
+    return ScratchRsrcReg;
+
+  // We reserved the last registers for this. Shift it down to the end of those
+  // which were actually used.
+  //
+  // FIXME: It might be safer to use a pseudoregister before replacement.
+
+  // FIXME: We should be able to eliminate unused input registers. We only
+  // cannot do this for the resources required for scratch access. For now we
+  // skip over user SGPRs and may leave unused holes.
+
+  // We find the resource first because it has an alignment requirement.
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+
+  // Skip the last 2 elements because the last one is reserved for VCC, and
+  // this is the 2nd to last element already.
+  for (MCPhysReg Reg : AllSGPR128s) {
+    // Pick the first unallocated one. Make sure we don't clobber the other
+    // reserved input we needed.
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+      //assert(MRI.isAllocatable(Reg));
+      MRI.replaceRegWith(ScratchRsrcReg, Reg);
+      MFI->setScratchRSrcReg(Reg);
+      return Reg;
+    }
+  }
+
+  return ScratchRsrcReg;
+}
+
+unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+  const SISubtarget &ST,
+  const SIInstrInfo *TII,
+  const SIRegisterInfo *TRI,
+  SIMachineFunctionInfo *MFI,
+  MachineFunction &MF) const {
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  if (ST.hasSGPRInitBug() ||
+      ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
+    return ScratchWaveOffsetReg;
+
+  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  if (NumPreloaded > AllSGPRs.size())
+    return ScratchWaveOffsetReg;
+
+  AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
+  // We need to drop register from the end of the list that we cannot use
+  // for the scratch wave offset.
+  // + 2 s102 and s103 do not exist on VI.
+  // + 2 for vcc
+  // + 2 for xnack_mask
+  // + 2 for flat_scratch
+  // + 4 for registers reserved for scratch resource register
+  // + 1 for register reserved for scratch wave offset.  (By exluding this
+  //     register from the list to consider, it means that when this
+  //     register is being used for the scratch wave offset and there
+  //     are no other free SGPRs, then the value will stay in this register.
+  // ----
+  //  13
+  if (AllSGPRs.size() < 13)
+    return ScratchWaveOffsetReg;
+
+  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
+    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+    // scratch descriptor, since we haven’t added its uses yet.
+    if (!MRI.isPhysRegUsed(Reg)) {
+      if (!MRI.isAllocatable(Reg) ||
+          TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
+        continue;
+
+      MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+      MFI->setScratchWaveOffsetReg(Reg);
+      return Reg;
+    }
+  }
+
+  return ScratchWaveOffsetReg;
+}
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
+  // specified.
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  if (ST.debuggerEmitPrologue())
+    emitDebuggerPrologue(MF, MBB);
+
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // If we only have SGPR spills, we won't actually be using scratch memory
+  // since these spill to VGPRs.
+  //
+  // FIXME: We should be cleaning up these unused SGPR spill frame indices
+  // somewhere.
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned ScratchRsrcReg
+    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+  unsigned ScratchWaveOffsetReg
+    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+
+  if (ScratchRsrcReg == AMDGPU::NoRegister) {
+    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+    return;
+  }
+
+  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+  // We need to do the replacement of the private segment buffer and wave offset
+  // register even if there are no stack objects. There could be stores to undef
+  // or a constant without an associated object.
+
+  // FIXME: We still have implicit uses on SGPR spill instructions in case they
+  // need to spill to vector memory. It's likely that will not happen, but at
+  // this point it appears we need the setup. This part of the prolog should be
+  // emitted after frame indices are eliminated.
+
+  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
+    emitFlatScratchInit(TII, TRI, MF, MBB);
+
+  // We need to insert initialization of the scratch resource descriptor.
+  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+
+  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+  if (ST.isAmdCodeObjectV2()) {
+    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+  }
+
+  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
+
+  // We added live-ins during argument lowering, but since they were not used
+  // they were deleted. We're adding the uses now, so add them back.
+  if (OffsetRegUsed) {
+    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+           "scratch wave offset input is required");
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  }
+
+  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+    assert(ST.isAmdCodeObjectV2());
+    MRI.addLiveIn(PreloadedPrivateBufferReg);
+    MBB.addLiveIn(PreloadedPrivateBufferReg);
+  }
+
+  // Make the register selected live throughout the function.
+  for (MachineBasicBlock &OtherBB : MF) {
+    if (&OtherBB == &MBB)
+      continue;
+
+    if (OffsetRegUsed)
+      OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+    if (ResourceRegUsed)
+      OtherBB.addLiveIn(ScratchRsrcReg);
+  }
+
+  DebugLoc DL;
+  MachineBasicBlock::iterator I = MBB.begin();
+
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+
+  bool CopyBuffer = ResourceRegUsed &&
+    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+  // This needs to be careful of the copying order to avoid overwriting one of
+  // the input registers before it's been copied to it's final
+  // destination. Usually the offset should be copied first.
+  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+                                              ScratchWaveOffsetReg);
+  if (CopyBuffer && CopyBufferFirst) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  }
+
+  if (OffsetRegUsed &&
+      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+  }
+
+  if (CopyBuffer && !CopyBufferFirst) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  }
+
+  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+    assert(!ST.isAmdCodeObjectV2());
+    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+
+    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+    // Use relocations to get the pointer, and setup the other bits manually.
+    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+      .addImm(Rsrc23 & 0xffffffff)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+      .addImm(Rsrc23 >> 32)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  }
+}
+
+void SIFrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+
+}
+
+void SIFrameLowering::processFunctionBeforeFrameFinalized(
+  MachineFunction &MF,
+  RegScavenger *RS) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!MFI.hasStackObjects())
+    return;
+
+  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
+
+  assert((RS || !MayNeedScavengingEmergencySlot) &&
+         "RegScavenger required if spilling");
+
+  if (MayNeedScavengingEmergencySlot) {
+    int ScavengeFI = MFI.CreateStackObject(
+      AMDGPU::SGPR_32RegClass.getSize(),
+      AMDGPU::SGPR_32RegClass.getAlignment(), false);
+    RS->addScavengingFrameIndex(ScavengeFI);
+  }
+}
+
+void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
+                                           MachineBasicBlock &MBB) const {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL;
+
+  // For each dimension:
+  for (unsigned i = 0; i < 3; ++i) {
+    // Get work group ID SGPR, and make it live-in again.
+    unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
+    MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
+    MBB.addLiveIn(WorkGroupIDSGPR);
+
+    // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
+    // order to spill it to scratch.
+    unsigned WorkGroupIDVGPR =
+      MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
+      .addReg(WorkGroupIDSGPR);
+
+    // Spill work group ID.
+    int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
+    TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
+      WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+
+    // Get work item ID VGPR, and make it live-in again.
+    unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
+    MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
+    MBB.addLiveIn(WorkItemIDVGPR);
+
+    // Spill work item ID.
+    int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
+    TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
+      WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
new file mode 100644
index 000000000000..7657b4e03864
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -0,0 +1,64 @@
+//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class SIInstrInfo;
+class SIMachineFunctionInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
+class SIFrameLowering final : public AMDGPUFrameLowering {
+public:
+  SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+                  unsigned TransAl = 1) :
+    AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+  ~SIFrameLowering() override = default;
+
+  void emitPrologue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF,
+                    MachineBasicBlock &MBB) const override;
+
+  void processFunctionBeforeFrameFinalized(
+    MachineFunction &MF,
+    RegScavenger *RS = nullptr) const override;
+
+private:
+  void emitFlatScratchInit(const SIInstrInfo *TII,
+                           const SIRegisterInfo* TRI,
+                           MachineFunction &MF,
+                           MachineBasicBlock &MBB) const;
+
+  unsigned getReservedPrivateSegmentBufferReg(
+    const SISubtarget &ST,
+    const SIInstrInfo *TII,
+    const SIRegisterInfo *TRI,
+    SIMachineFunctionInfo *MFI,
+    MachineFunction &MF) const;
+
+  unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+    const SISubtarget &ST,
+    const SIInstrInfo *TII,
+    const SIRegisterInfo *TRI,
+    SIMachineFunctionInfo *MFI,
+    MachineFunction &MF) const;
+
+  /// \brief Emits debugger prologue.
+  void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
new file mode 100644
index 000000000000..fa53831cbe16
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -0,0 +1,4519 @@
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for SI
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableVGPRIndexMode(
+  "amdgpu-vgpr-index-mode",
+  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
+  cl::init(false));
+
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+      return AMDGPU::SGPR0 + Reg;
+    }
+  }
+  llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+                                   const SISubtarget &STI)
+    : AMDGPUTargetLowering(TM, STI) {
+  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
+  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
+  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
+  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
+  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+
+  if (Subtarget->has16BitInsts()) {
+    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+  }
+
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
+  setOperationAction(ISD::SELECT, MVT::i1, Promote);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+
+  setOperationAction(ISD::SETCC, MVT::i1, Promote);
+  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+
+  // We only support LOAD/STORE and vector manipulation ops for vectors
+  // with > 4 elements.
+  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+      switch (Op) {
+      case ISD::LOAD:
+      case ISD::STORE:
+      case ISD::BUILD_VECTOR:
+      case ISD::BITCAST:
+      case ISD::EXTRACT_VECTOR_ELT:
+      case ISD::INSERT_VECTOR_ELT:
+      case ISD::INSERT_SUBVECTOR:
+      case ISD::EXTRACT_SUBVECTOR:
+      case ISD::SCALAR_TO_VECTOR:
+        break;
+      case ISD::CONCAT_VECTORS:
+        setOperationAction(Op, VT, Custom);
+        break;
+      default:
+        setOperationAction(Op, VT, Expand);
+        break;
+      }
+    }
+  }
+
+  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
+  // is expanded to avoid having two separate loops in case the index is a VGPR.
+
+  // Most operations are naturally 32-bit vector operations. We only support
+  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
+  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+  }
+
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
+  // and output demarshalling
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+  // We can't return success/failure, only the old value,
+  // let LLVM add the comparison
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
+  if (getSubtarget()->hasFlatAddressSpace()) {
+    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+  }
+
+  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+  // On SI this is s_memtime and s_memrealtime on VI.
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+  setOperationAction(ISD::TRAP, MVT::Other, Custom);
+
+  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+  if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::f64, Legal);
+  }
+
+  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::f64, Custom);
+
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::Constant, MVT::i16, Legal);
+
+    setOperationAction(ISD::SMIN, MVT::i16, Legal);
+    setOperationAction(ISD::SMAX, MVT::i16, Legal);
+
+    setOperationAction(ISD::UMIN, MVT::i16, Legal);
+    setOperationAction(ISD::UMAX, MVT::i16, Legal);
+
+    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
+    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
+
+    setOperationAction(ISD::ROTR, MVT::i16, Promote);
+    setOperationAction(ISD::ROTL, MVT::i16, Promote);
+
+    setOperationAction(ISD::SDIV, MVT::i16, Promote);
+    setOperationAction(ISD::UDIV, MVT::i16, Promote);
+    setOperationAction(ISD::SREM, MVT::i16, Promote);
+    setOperationAction(ISD::UREM, MVT::i16, Promote);
+
+    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
+    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
+
+    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
+    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+
+    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+
+    setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+
+    setOperationAction(ISD::LOAD, MVT::i16, Custom);
+
+    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+
+    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
+    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
+    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
+    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+
+    // F16 - Constant Actions.
+    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+
+    // F16 - Load/Store Actions.
+    setOperationAction(ISD::LOAD, MVT::f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
+    setOperationAction(ISD::STORE, MVT::f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
+
+    // F16 - VOP1 Actions.
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+    setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FSIN, MVT::f16, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+
+    // F16 - VOP2 Actions.
+    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+    setOperationAction(ISD::FDIV, MVT::f16, Custom);
+
+    // F16 - VOP3 Actions.
+    setOperationAction(ISD::FMA, MVT::f16, Legal);
+    if (!Subtarget->hasFP16Denormals())
+      setOperationAction(ISD::FMAD, MVT::f16, Legal);
+  }
+
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FMINNUM);
+  setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::SMIN);
+  setTargetDAGCombine(ISD::SMAX);
+  setTargetDAGCombine(ISD::UMIN);
+  setTargetDAGCombine(ISD::UMAX);
+  setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+  setTargetDAGCombine(ISD::FCANONICALIZE);
+
+  // All memory operations. Some folding on the pointer operand is done to help
+  // matching the constant offsets in the addressing modes.
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD);
+  setTargetDAGCombine(ISD::ATOMIC_STORE);
+  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
+  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+  setTargetDAGCombine(ISD::ATOMIC_SWAP);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+
+  setSchedulingPreference(Sched::RegPressure);
+}
+
+const SISubtarget *SITargetLowering::getSubtarget() const {
+  return static_cast<const SISubtarget *>(Subtarget);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering queries
+//===----------------------------------------------------------------------===//
+
+bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                          const CallInst &CI,
+                                          unsigned IntrID) const {
+  switch (IntrID) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+                                          EVT) const {
+  // SI has some legal vector types, but no legal vector operations. Say no
+  // shuffles are legal in order to prefer scalarizing some vector operations.
+  return false;
+}
+
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+  // Flat instructions do not have offsets, and only have the register
+  // address.
+  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+  // additionally can do r + r + i with addr64. 32-bit has more addressing
+  // mode options. Depending on the resource constant, it can also do
+  // (i64 r0) + (i32 r1) * (i14 i).
+  //
+  // Private arrays end up using a scratch buffer most of the time, so also
+  // assume those use MUBUF instructions. Scratch loads / stores are currently
+  // implemented as mubuf instructions with offen bit set, so slightly
+  // different than the normal addr64.
+  if (!isUInt<12>(AM.BaseOffs))
+    return false;
+
+  // FIXME: Since we can split immediate into soffset and immediate offset,
+  // would it make sense to allow any immediate?
+
+  switch (AM.Scale) {
+  case 0: // r + i or just i, depending on HasBaseReg.
+    return true;
+  case 1:
+    return true; // We have r + r or r + i.
+  case 2:
+    if (AM.HasBaseReg) {
+      // Reject 2 * r + r.
+      return false;
+    }
+
+    // Allow 2 * r as r + r
+    // Or  2 * r + i is allowed as r + r + i.
+    return true;
+  default: // Don't allow n * r
+    return false;
+  }
+}
+
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+      // Assume the we will use FLAT for all global memory accesses
+      // on VI.
+      // FIXME: This assumption is currently wrong.  On VI we still use
+      // MUBUF instructions for the r + i addressing mode.  As currently
+      // implemented, the MUBUF instructions only work on buffer < 4GB.
+      // It may be possible to support > 4GB buffers with MUBUF instructions,
+      // by setting the stride value in the resource descriptor which would
+      // increase the size limit to (stride * 4GB).  However, this is risky,
+      // because it has never been validated.
+      return isLegalFlatAddressingMode(AM);
+    }
+
+    return isLegalMUBUFAddressingMode(AM);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // If the offset isn't a multiple of 4, it probably isn't going to be
+    // correctly aligned.
+    // FIXME: Can we get the real alignment here?
+    if (AM.BaseOffs % 4 != 0)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // There are no SMRD extloads, so if we have to do a small type access we
+    // will use a MUBUF load.
+    // FIXME?: We also need to do this if unaligned, but we don't know the
+    // alignment here.
+    if (DL.getTypeStoreSize(Ty) < 4)
+      return isLegalMUBUFAddressingMode(AM);
+
+    if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+      // SMRD instructions have an 8-bit, dword offset on SI.
+      if (!isUInt<8>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
+      // On CI+, this can also be a 32-bit literal constant offset. If it fits
+      // in 8-bits, it can use a smaller encoding.
+      if (!isUInt<32>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
+      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+      if (!isUInt<20>(AM.BaseOffs))
+        return false;
+    } else
+      llvm_unreachable("unhandled generation");
+
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
+      return true;
+
+    if (AM.Scale == 1 && AM.HasBaseReg)
+      return true;
+
+    return false;
+  }
+
+  case AMDGPUAS::PRIVATE_ADDRESS:
+    return isLegalMUBUFAddressingMode(AM);
+
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
+    // field.
+    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
+    // an 8-bit dword offset but we don't know the alignment here.
+    if (!isUInt<16>(AM.BaseOffs))
+      return false;
+
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
+      return true;
+
+    if (AM.Scale == 1 && AM.HasBaseReg)
+      return true;
+
+    return false;
+  }
+  case AMDGPUAS::FLAT_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+    // For an unknown address space, this usually means that this is for some
+    // reason being used for pure arithmetic, and not based on some addressing
+    // computation. We don't have instructions that compute pointers with any
+    // addressing modes, so treat them as having no offset like flat
+    // instructions.
+    return isLegalFlatAddressingMode(AM);
+
+  default:
+    llvm_unreachable("unhandled address space");
+  }
+}
+
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                      unsigned AddrSpace,
+                                                      unsigned Align,
+                                                      bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
+  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+  // which isn't a simple VT.
+  // Until MVT is extended to handle this, simply check for the size and
+  // rely on the condition below: allow accesses if the size is a multiple of 4.
+  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+                           VT.getStoreSize() > 16)) {
+    return false;
+  }
+
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+    // with adjacent offsets.
+    bool AlignedBy4 = (Align % 4 == 0);
+    if (IsFast)
+      *IsFast = AlignedBy4;
+
+    return AlignedBy4;
+  }
+
+  // FIXME: We have to be conservative here and assume that flat operations
+  // will access scratch.  If we had access to the IR function, then we
+  // could determine if any private memory was used in the function.
+  if (!Subtarget->hasUnalignedScratchAccess() &&
+      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    return false;
+  }
+
+  if (Subtarget->hasUnalignedBufferAccess()) {
+    // If we have an uniform constant load, it still requires using a slow
+    // buffer instruction if unaligned.
+    if (IsFast) {
+      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+        (Align % 4 == 0) : true;
+    }
+
+    return true;
+  }
+
+  // Smaller than dword value must be aligned.
+  if (VT.bitsLT(MVT::i32))
+    return false;
+
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  if (IsFast)
+    *IsFast = true;
+
+  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+}
+
+EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                          unsigned SrcAlign, bool IsMemset,
+                                          bool ZeroMemset,
+                                          bool MemcpyStrSrc,
+                                          MachineFunction &MF) const {
+  // FIXME: Should account for address space here.
+
+  // The default fallback uses the private pointer size as a guess for a type to
+  // use. Make sure we switch these to 64-bit accesses.
+
+  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
+    return MVT::v4i32;
+
+  if (Size >= 8 && DstAlign >= 4)
+    return MVT::v2i32;
+
+  // Use the default.
+  return MVT::Other;
+}
+
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+         AS == AMDGPUAS::FLAT_ADDRESS ||
+         AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
+  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+}
+
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
+bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  // Flat -> private/local is a simple truncate.
+  // Flat -> global is no-op
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+    return true;
+
+  return isNoopAddrSpaceCast(SrcAS, DestAS);
+}
+
+bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+  // UndefValue means this is a load of a kernel input.  These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.uniform");
+}
+
+TargetLoweringBase::LegalizeTypeAction
+SITargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                         Type *Ty) const {
+  // FIXME: Could be smarter if called for vector constants.
+  return true;
+}
+
+bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
+    switch (Op) {
+    case ISD::LOAD:
+    case ISD::STORE:
+
+    // These operations are done with 32-bit instructions anyway.
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::SELECT:
+      // TODO: Extensions?
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  // SimplifySetCC uses this function to determine whether or not it should
+  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
+  if (VT == MVT::i1 && Op == ISD::SETCC)
+    return false;
+
+  return TargetLowering::isTypeDesirableForOp(Op, VT);
+}
+
+SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
+                                            const SDLoc &SL, SDValue Chain,
+                                            unsigned Offset) const {
+  const DataLayout &DL = DAG.getDataLayout();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
+                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                     DAG.getConstant(Offset, SL, PtrVT));
+}
+
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         const SDLoc &SL, SDValue Chain,
+                                         unsigned Offset, bool Signed) const {
+  const DataLayout &DL = DAG.getDataLayout();
+  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+  unsigned Align = DL.getABITypeAlignment(Ty);
+
+  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
+  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+                             MachineMemOperand::MONonTemporal |
+                             MachineMemOperand::MODereferenceable |
+                             MachineMemOperand::MOInvariant);
+
+  SDValue Val;
+  if (MemVT.isFloatingPoint())
+    Val = getFPExtOrFPTrunc(DAG, Load, SL, VT);
+  else if (Signed)
+    Val = DAG.getSExtOrTrunc(Load, SL, VT);
+  else
+    Val = DAG.getZExtOrTrunc(Load, SL, VT);
+
+  SDValue Ops[] = {
+    Val,
+    Load.getValue(1)
+  };
+
+  return DAG.getMergeValues(Ops, SL);
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+    const Function *Fn = MF.getFunction();
+    DiagnosticInfoUnsupported NoGraphicsHSA(
+        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+    DAG.getContext()->diagnose(NoGraphicsHSA);
+    return DAG.getEntryNode();
+  }
+
+  // Create stack objects that are used for emitting debugger prologue if
+  // "amdgpu-debugger-emit-prologue" attribute was specified.
+  if (ST.debuggerEmitPrologue())
+    createDebuggerPrologueStackObjects(MF);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  BitVector Skipped(Ins.size());
+
+  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+
+    // First check if it's a PS input addr
+    if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
+        !Arg.Flags.isByVal() && PSInputNum <= 15) {
+
+      if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+        // We can safely skip PS inputs
+        Skipped.set(i);
+        ++PSInputNum;
+        continue;
+      }
+
+      Info->markPSInputAllocated(PSInputNum);
+      if (Arg.Used)
+        Info->PSInputEna |= 1 << PSInputNum;
+
+      ++PSInputNum;
+    }
+
+    if (AMDGPU::isShader(CallConv)) {
+      // Second split vertices into their elements
+      if (Arg.VT.isVector()) {
+        ISD::InputArg NewArg = Arg;
+        NewArg.Flags.setSplit();
+        NewArg.VT = Arg.VT.getVectorElementType();
+
+        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+        // three or five element vertex only needs three or five registers,
+        // NOT four or eight.
+        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+        unsigned NumElements = ParamType->getVectorNumElements();
+
+        for (unsigned j = 0; j != NumElements; ++j) {
+          Splits.push_back(NewArg);
+          NewArg.PartOffset += NewArg.VT.getStoreSize();
+        }
+      } else {
+        Splits.push_back(Arg);
+      }
+    }
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  //
+  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
+  // PSInputAddr, the user wants to enable some bits after the compilation
+  // based on run-time states. Since we can't know what the final PSInputEna
+  // will look like, so we shouldn't do anything here and the user should take
+  // responsibility for the correct programming.
+  //
+  // Otherwise, the following restrictions apply:
+  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+  //   enabled too.
+  if (CallConv == CallingConv::AMDGPU_PS &&
+      ((Info->getPSInputAddr() & 0x7F) == 0 ||
+       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
+    CCInfo.AllocateReg(AMDGPU::VGPR0);
+    CCInfo.AllocateReg(AMDGPU::VGPR1);
+    Info->markPSInputAllocated(0);
+    Info->PSInputEna |= 1;
+  }
+
+  if (!AMDGPU::isShader(CallConv)) {
+    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+  } else {
+    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
+  }
+
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info->hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info->hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info->hasQueuePtr()) {
+    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
+  if (Info->hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+    MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
+  if (Info->hasDispatchID()) {
+    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchIDReg);
+  }
+
+  if (Info->hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
+  if (!AMDGPU::isShader(CallConv))
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  else
+    AnalyzeFormalArguments(CCInfo, Splits);
+
+  SmallVector<SDValue, 16> Chains;
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+    const ISD::InputArg &Arg = Ins[i];
+    if (Skipped[i]) {
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    MVT VT = VA.getLocVT();
+
+    if (VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = VA.getLocVT();
+      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+                              VA.getLocMemOffset();
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes.
+      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,
+                                   Offset, Ins[i].Flags.isSExt());
+      Chains.push_back(Arg.getValue(1));
+
+      auto *ParamTy =
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        // On SI local pointers are just offsets into LDS, so they are always
+        // less than 16-bits.  On CI and newer they could potentially be
+        // real pointers, so we can't guarantee their size.
+        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+                          DAG.getValueType(MVT::i16));
+      }
+
+      InVals.push_back(Arg);
+      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+      continue;
+    }
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+
+    if (VT == MVT::i64) {
+      // For now assume it is a pointer
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+                                     &AMDGPU::SGPR_64RegClass);
+      Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
+      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Copy);
+      continue;
+    }
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    if (Arg.VT.isVector()) {
+
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+
+        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+        Regs.push_back(Copy);
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      Regs.append(NumElements, DAG.getUNDEF(VT));
+
+      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+
+  // Start adding system SGPRs.
+  if (Info->hasWorkGroupIDX()) {
+    unsigned Reg = Info->addWorkGroupIDX();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkGroupIDY()) {
+    unsigned Reg = Info->addWorkGroupIDY();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkGroupIDZ()) {
+    unsigned Reg = Info->addWorkGroupIDZ();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkGroupInfo()) {
+    unsigned Reg = Info->addWorkGroupInfo();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasPrivateSegmentWaveByteOffset()) {
+    // Scratch wave offset passed in system SGPR.
+    unsigned PrivateSegmentWaveByteOffsetReg;
+
+    if (AMDGPU::isShader(CallConv)) {
+      PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+    } else
+      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+
+    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+  }
+
+  // Now that we've figured out where the scratch register inputs are, see if
+  // should reserve the arguments and use them directly.
+  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+  // Record that we know we have non-spill stack objects so we don't need to
+  // check all stack objects later.
+  if (HasStackObjects)
+    Info->setHasNonSpillStackObjects(true);
+
+  // Everything live out of a block is spilled with fast regalloc, so it's
+  // almost certain that spilling will be required.
+  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+    HasStackObjects = true;
+
+  if (ST.isAmdCodeObjectV2()) {
+    if (HasStackObjects) {
+      // If we have stack objects, we unquestionably need the private buffer
+      // resource. For the Code Object V2 ABI, this will be the first 4 user
+      // SGPR inputs. We can reserve those and use them directly.
+
+      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+    } else {
+      unsigned ReservedBufferReg
+        = TRI->reservedPrivateSegmentBufferReg(MF);
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+      // We tentatively reserve the last registers (skipping the last two
+      // which may contain VCC). After register allocation, we'll replace
+      // these with the ones immediately after those which were really
+      // allocated. In the prologue copies will be inserted from the argument
+      // to these reserved registers.
+      Info->setScratchRSrcReg(ReservedBufferReg);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  } else {
+    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+    // Without HSA, relocations are used for the scratch pointer and the
+    // buffer resource setup is always inserted in the prologue. Scratch wave
+    // offset is still in an input SGPR.
+    Info->setScratchRSrcReg(ReservedBufferReg);
+
+    if (HasStackObjects) {
+      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+    } else {
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  }
+
+  if (Info->hasWorkItemIDX()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkItemIDY()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkItemIDZ()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Chains.empty())
+    return Chain;
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
+
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::OutputArg> &Outs,
+                              const SmallVectorImpl<SDValue> &OutVals,
+                              const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  if (!AMDGPU::isShader(CallConv))
+    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
+                                             OutVals, DL, DAG);
+
+  Info->setIfReturnsVoid(Outs.size() == 0);
+
+  SmallVector<ISD::OutputArg, 48> Splits;
+  SmallVector<SDValue, 48> SplitVals;
+
+  // Split vectors into their elements.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    const ISD::OutputArg &Out = Outs[i];
+
+    if (Out.VT.isVector()) {
+      MVT VT = Out.VT.getVectorElementType();
+      ISD::OutputArg NewOut = Out;
+      NewOut.Flags.setSplit();
+      NewOut.VT = VT;
+
+      // We want the original number of vector elements here, e.g.
+      // three or five, not four or eight.
+      unsigned NumElements = Out.ArgVT.getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
+                                   DAG.getConstant(j, DL, MVT::i32));
+        SplitVals.push_back(Elem);
+        Splits.push_back(NewOut);
+        NewOut.PartOffset += NewOut.VT.getStoreSize();
+      }
+    } else {
+      SplitVals.push_back(OutVals[i]);
+      Splits.push_back(Out);
+    }
+  }
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 48> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze outgoing return values.
+  AnalyzeReturn(CCInfo, Splits);
+
+  SDValue Flag;
+  SmallVector<SDValue, 48> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = SplitVals[realRVLocIdx];
+
+    // Copied from other backends.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
+}
+
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                             SelectionDAG &DAG) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+    .Case("m0", AMDGPU::M0)
+    .Case("exec", AMDGPU::EXEC)
+    .Case("exec_lo", AMDGPU::EXEC_LO)
+    .Case("exec_hi", AMDGPU::EXEC_HI)
+    .Case("flat_scratch", AMDGPU::FLAT_SCR)
+    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+    .Default(AMDGPU::NoRegister);
+
+  if (Reg == AMDGPU::NoRegister) {
+    report_fatal_error(Twine("invalid register name \""
+                             + StringRef(RegName)  + "\"."));
+
+  }
+
+  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+    report_fatal_error(Twine("invalid register \""
+                             + StringRef(RegName)  + "\" for subtarget."));
+  }
+
+  switch (Reg) {
+  case AMDGPU::M0:
+  case AMDGPU::EXEC_LO:
+  case AMDGPU::EXEC_HI:
+  case AMDGPU::FLAT_SCR_LO:
+  case AMDGPU::FLAT_SCR_HI:
+    if (VT.getSizeInBits() == 32)
+      return Reg;
+    break;
+  case AMDGPU::EXEC:
+  case AMDGPU::FLAT_SCR:
+    if (VT.getSizeInBits() == 64)
+      return Reg;
+    break;
+  default:
+    llvm_unreachable("missing register type checking");
+  }
+
+  report_fatal_error(Twine("invalid type for register \""
+                           + StringRef(RegName) + "\"."));
+}
+
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+                                                    MachineBasicBlock *BB) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator SplitPoint(&MI);
+  ++SplitPoint;
+
+  if (SplitPoint == BB->end()) {
+    // Don't bother with a new block.
+    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineBasicBlock *SplitBB
+    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+  MF->insert(++MachineFunction::iterator(BB), SplitBB);
+  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
+  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
+  BB->addSuccessor(SplitBB);
+
+  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  return SplitBB;
+}
+
+// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
+// wavefront. If the value is uniform and just happens to be in a VGPR, this
+// will only do one iteration. In the worst case, this will loop 64 times.
+//
+// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
+static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
+  const SIInstrInfo *TII,
+  MachineRegisterInfo &MRI,
+  MachineBasicBlock &OrigBB,
+  MachineBasicBlock &LoopBB,
+  const DebugLoc &DL,
+  const MachineOperand &IdxReg,
+  unsigned InitReg,
+  unsigned ResultReg,
+  unsigned PhiReg,
+  unsigned InitSaveExecReg,
+  int Offset,
+  bool UseGPRIdxMode) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+    .addReg(InitReg)
+    .addMBB(&OrigBB)
+    .addReg(ResultReg)
+    .addMBB(&LoopBB);
+
+  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
+    .addReg(InitSaveExecReg)
+    .addMBB(&OrigBB)
+    .addReg(NewExec)
+    .addMBB(&LoopBB);
+
+  // Read the next variant <- also loop target.
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
+    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+  // Compare the just read M0 value to all possible Idx values.
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
+    .addReg(CurrentIdxReg)
+    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+
+  if (UseGPRIdxMode) {
+    unsigned IdxReg;
+    if (Offset == 0) {
+      IdxReg = CurrentIdxReg;
+    } else {
+      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
+        .addReg(CurrentIdxReg, RegState::Kill)
+        .addImm(Offset);
+    }
+
+    MachineInstr *SetIdx =
+      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
+      .addReg(IdxReg, RegState::Kill);
+    SetIdx->getOperand(2).setIsUndef();
+  } else {
+    // Move index from VCC into M0
+    if (Offset == 0) {
+      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(CurrentIdxReg, RegState::Kill);
+    } else {
+      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+        .addReg(CurrentIdxReg, RegState::Kill)
+        .addImm(Offset);
+    }
+  }
+
+  // Update EXEC, save the original EXEC value to VCC.
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+    .addReg(CondReg, RegState::Kill);
+
+  MRI.setSimpleHint(NewExec, CondReg);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  MachineInstr *InsertPt =
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(NewExec);
+
+  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+  // s_cbranch_scc0?
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&LoopBB);
+
+  return InsertPt->getIterator();
+}
+
+// This has slightly sub-optimal regalloc when the source vector is killed by
+// the read. The register allocator does not understand that the kill is
+// per-workitem, so is kept alive for the whole loop so we end up not re-using a
+// subregister from it, using 1 more VGPR than necessary. This was saved when
+// this was expanded after register allocation.
+static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
+                                                  MachineBasicBlock &MBB,
+                                                  MachineInstr &MI,
+                                                  unsigned InitResultReg,
+                                                  unsigned PhiReg,
+                                                  int Offset,
+                                                  bool UseGPRIdxMode) {
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
+
+  // Save the EXEC mask
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
+    .addReg(AMDGPU::EXEC);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+
+  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
+                                      InitResultReg, DstReg, PhiReg, TmpExec,
+                                      Offset, UseGPRIdxMode);
+
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(SaveExec);
+
+  return InsPt;
+}
+
+// Returns subreg index, offset
+static std::pair<unsigned, int>
+computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
+                            const TargetRegisterClass *SuperRC,
+                            unsigned VecReg,
+                            int Offset) {
+  int NumElts = SuperRC->getSize() / 4;
+
+  // Skip out of bounds offsets, or else we would end up using an undefined
+  // register.
+  if (Offset >= NumElts || Offset < 0)
+    return std::make_pair(AMDGPU::sub0, Offset);
+
+  return std::make_pair(AMDGPU::sub0 + Offset, 0);
+}
+
+// Return true if the index is an SGPR and was set.
+static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
+                                 MachineRegisterInfo &MRI,
+                                 MachineInstr &MI,
+                                 int Offset,
+                                 bool UseGPRIdxMode,
+                                 bool IsIndirectSrc) {
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+
+  assert(Idx->getReg() != AMDGPU::NoRegister);
+
+  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
+    return false;
+
+  if (UseGPRIdxMode) {
+    unsigned IdxMode = IsIndirectSrc ?
+      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+    if (Offset == 0) {
+      MachineInstr *SetOn =
+        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+        .addOperand(*Idx)
+        .addImm(IdxMode);
+
+      SetOn->getOperand(3).setIsUndef();
+    } else {
+      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
+        .addOperand(*Idx)
+        .addImm(Offset);
+      MachineInstr *SetOn =
+        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+        .addReg(Tmp, RegState::Kill)
+        .addImm(IdxMode);
+
+      SetOn->getOperand(3).setIsUndef();
+    }
+
+    return true;
+  }
+
+  if (Offset == 0) {
+    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addOperand(*Idx);
+  } else {
+    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addOperand(*Idx)
+      .addImm(Offset);
+  }
+
+  return true;
+}
+
+// Control flow needs to be inserted if indexing with a VGPR.
+static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
+                                          MachineBasicBlock &MBB,
+                                          const SISubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+
+  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
+
+  unsigned SubReg;
+  std::tie(SubReg, Offset)
+    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
+
+  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
+    MachineBasicBlock::iterator I(&MI);
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    if (UseGPRIdxMode) {
+      // TODO: Look at the uses to avoid the copy. This may require rescheduling
+      // to avoid interfering with other uses, so probably requires a new
+      // optimization pass.
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+        .addReg(SrcReg, RegState::Undef, SubReg)
+        .addReg(SrcReg, RegState::Implicit)
+        .addReg(AMDGPU::M0, RegState::Implicit);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+        .addReg(SrcReg, RegState::Undef, SubReg)
+        .addReg(SrcReg, RegState::Implicit);
+    }
+
+    MI.eraseFromParent();
+
+    return &MBB;
+  }
+
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+
+  if (UseGPRIdxMode) {
+    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+      .addImm(0) // Reset inside loop.
+      .addImm(VGPRIndexMode::SRC0_ENABLE);
+    SetOn->getOperand(3).setIsUndef();
+
+    // Disable again after the loop.
+    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+  }
+
+  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+  MachineBasicBlock *LoopBB = InsPt->getParent();
+
+  if (UseGPRIdxMode) {
+    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+      .addReg(SrcReg, RegState::Undef, SubReg)
+      .addReg(SrcReg, RegState::Implicit)
+      .addReg(AMDGPU::M0, RegState::Implicit);
+  } else {
+    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+      .addReg(SrcReg, RegState::Undef, SubReg)
+      .addReg(SrcReg, RegState::Implicit);
+  }
+
+  MI.eraseFromParent();
+
+  return LoopBB;
+}
+
+static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
+  switch (VecRC->getSize()) {
+  case 4:
+    return AMDGPU::V_MOVRELD_B32_V1;
+  case 8:
+    return AMDGPU::V_MOVRELD_B32_V2;
+  case 16:
+    return AMDGPU::V_MOVRELD_B32_V4;
+  case 32:
+    return AMDGPU::V_MOVRELD_B32_V8;
+  case 64:
+    return AMDGPU::V_MOVRELD_B32_V16;
+  default:
+    llvm_unreachable("unsupported size for MOVRELD pseudos");
+  }
+}
+
+static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
+                                          MachineBasicBlock &MBB,
+                                          const SISubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
+
+  // This can be an immediate, but will be folded later.
+  assert(Val->getReg());
+
+  unsigned SubReg;
+  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
+                                                         SrcVec->getReg(),
+                                                         Offset);
+  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    MachineBasicBlock::iterator I(&MI);
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    assert(Offset == 0);
+
+    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
+      .addOperand(*SrcVec)
+      .addOperand(*Val)
+      .addImm(SubReg);
+
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
+    MachineBasicBlock::iterator I(&MI);
+    const DebugLoc &DL = MI.getDebugLoc();
+
+    if (UseGPRIdxMode) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+        .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+        .addOperand(*Val)
+        .addReg(Dst, RegState::ImplicitDefine)
+        .addReg(SrcVec->getReg(), RegState::Implicit)
+        .addReg(AMDGPU::M0, RegState::Implicit);
+
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+    } else {
+      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+      BuildMI(MBB, I, DL, MovRelDesc)
+          .addReg(Dst, RegState::Define)
+          .addReg(SrcVec->getReg())
+          .addOperand(*Val)
+          .addImm(SubReg - AMDGPU::sub0);
+    }
+
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  if (Val->isReg())
+    MRI.clearKillFlags(Val->getReg());
+
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  if (UseGPRIdxMode) {
+    MachineBasicBlock::iterator I(&MI);
+
+    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+      .addImm(0) // Reset inside loop.
+      .addImm(VGPRIndexMode::DST_ENABLE);
+    SetOn->getOperand(3).setIsUndef();
+
+    // Disable again after the loop.
+    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+  }
+
+  unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+
+  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
+                              Offset, UseGPRIdxMode);
+  MachineBasicBlock *LoopBB = InsPt->getParent();
+
+  if (UseGPRIdxMode) {
+    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
+      .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+      .addOperand(*Val) // src0
+      .addReg(Dst, RegState::ImplicitDefine)
+      .addReg(PhiReg, RegState::Implicit)
+      .addReg(AMDGPU::M0, RegState::Implicit);
+  } else {
+    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+
+    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
+        .addReg(Dst, RegState::Define)
+        .addReg(PhiReg)
+        .addOperand(*Val)
+        .addImm(SubReg - AMDGPU::sub0);
+  }
+
+  MI.eraseFromParent();
+
+  return LoopBB;
+}
+
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
+  MachineInstr &MI, MachineBasicBlock *BB) const {
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  MachineFunction *MF = BB->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  if (TII->isMIMG(MI)) {
+      if (!MI.memoperands_empty())
+        return BB;
+    // Add a memoperand for mimg instructions so that they aren't assumed to
+    // be ordered memory instuctions.
+
+    MachinePointerInfo PtrInfo(MFI->getImagePSV());
+    MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
+    if (MI.mayStore())
+      Flags |= MachineMemOperand::MOStore;
+
+    if (MI.mayLoad())
+      Flags |= MachineMemOperand::MOLoad;
+
+    auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
+    MI.addMemOperand(*MF, MMO);
+    return BB;
+  }
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_INIT_M0: {
+    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .addOperand(MI.getOperand(0));
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::GET_GROUPSTATICSIZE: {
+    DebugLoc DL = MI.getDebugLoc();
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
+      .addOperand(MI.getOperand(0))
+      .addImm(MFI->getLDSSize());
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::SI_INDIRECT_SRC_V1:
+  case AMDGPU::SI_INDIRECT_SRC_V2:
+  case AMDGPU::SI_INDIRECT_SRC_V4:
+  case AMDGPU::SI_INDIRECT_SRC_V8:
+  case AMDGPU::SI_INDIRECT_SRC_V16:
+    return emitIndirectSrc(MI, *BB, *getSubtarget());
+  case AMDGPU::SI_INDIRECT_DST_V1:
+  case AMDGPU::SI_INDIRECT_DST_V2:
+  case AMDGPU::SI_INDIRECT_DST_V4:
+  case AMDGPU::SI_INDIRECT_DST_V8:
+  case AMDGPU::SI_INDIRECT_DST_V16:
+    return emitIndirectDst(MI, *BB, *getSubtarget());
+  case AMDGPU::SI_KILL:
+    return splitKillBlock(MI, BB);
+  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+    unsigned Dst = MI.getOperand(0).getReg();
+    unsigned Src0 = MI.getOperand(1).getReg();
+    unsigned Src1 = MI.getOperand(2).getReg();
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned SrcCond = MI.getOperand(3).getReg();
+
+    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+      .addReg(Src0, 0, AMDGPU::sub0)
+      .addReg(Src1, 0, AMDGPU::sub0)
+      .addReg(SrcCond);
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+      .addReg(Src0, 0, AMDGPU::sub1)
+      .addReg(Src1, 0, AMDGPU::sub1)
+      .addReg(SrcCond);
+
+    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
+      .addReg(DstLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(DstHi)
+      .addImm(AMDGPU::sub1);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::SI_BR_UNDEF: {
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+      .addOperand(MI.getOperand(0));
+    Br->getOperand(1).setIsUndef(true); // read undef SCC
+    MI.eraseFromParent();
+    return BB;
+  }
+  default:
+    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  }
+}
+
+bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  // This currently forces unfolding various combinations of fsub into fma with
+  // free fneg'd operands. As long as we have fast FMA (controlled by
+  // isFMAFasterThanFMulAndFAdd), we should perform these.
+
+  // When fma is quarter rate, for f64 where add / sub are at best half rate,
+  // most of these combines appear to be cycle neutral but save on instruction
+  // count / code size.
+  return true;
+}
+
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                                         EVT VT) const {
+  if (!VT.isVector()) {
+    return MVT::i1;
+  }
+  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+}
+
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
+  // TODO: Should i16 be used always if legal? For now it would force VALU
+  // shifts.
+  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
+}
+
+// Answering this is somewhat tricky and depends on the specific device which
+// have different rates for fma or all f64 operations.
+//
+// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
+// regardless of which device (although the number of cycles differs between
+// devices), so it is always profitable for f64.
+//
+// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
+// only on full rate devices. Normally, we should prefer selecting v_mad_f32
+// which we can always do even without fused FP ops since it returns the same
+// result as the separate operations and since it is always full
+// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
+// however does not support denormals, so we do report fma as faster if we have
+// a fast fma device and require denormals.
+//
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    // This is as fast on some subtargets. However, we always have full rate f32
+    // mad available which returns the same result as the separate operations
+    // which we should prefer over fma. We can't use this if we want to support
+    // denormals, so only report this in these cases.
+    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+  case MVT::f64:
+    return true;
+  case MVT::f16:
+    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+  default:
+    break;
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
+  }
+
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::GlobalAddress: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    return LowerGlobalAddress(MFI, Op, DAG);
+  }
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+  case ISD::TRAP: return lowerTRAP(Op, DAG);
+  case ISD::FP_ROUND:
+    return lowerFP_ROUND(Op, DAG);
+  }
+  return SDValue();
+}
+
+/// \brief Helper function for LowerBRCOND
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+
+  SDNode *Parent = Value.getNode();
+  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
+       I != E; ++I) {
+
+    if (I.getUse().get() != Value)
+      continue;
+
+    if (I->getOpcode() == Opcode)
+      return *I;
+  }
+  return nullptr;
+}
+
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+    case AMDGPUIntrinsic::amdgcn_if:
+    case AMDGPUIntrinsic::amdgcn_else:
+    case AMDGPUIntrinsic::amdgcn_end_cf:
+    case AMDGPUIntrinsic::amdgcn_loop:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
+    case AMDGPUIntrinsic::amdgcn_break:
+    case AMDGPUIntrinsic::amdgcn_if_break:
+    case AMDGPUIntrinsic::amdgcn_else_break:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  return false;
+}
+
+void SITargetLowering::createDebuggerPrologueStackObjects(
+    MachineFunction &MF) const {
+  // Create stack objects that are used for emitting debugger prologue.
+  //
+  // Debugger prologue writes work group IDs and work item IDs to scratch memory
+  // at fixed location in the following format:
+  //   offset 0:  work group ID x
+  //   offset 4:  work group ID y
+  //   offset 8:  work group ID z
+  //   offset 16: work item ID x
+  //   offset 20: work item ID y
+  //   offset 24: work item ID z
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  int ObjectIdx = 0;
+
+  // For each dimension:
+  for (unsigned i = 0; i < 3; ++i) {
+    // Create fixed stack object for work group ID.
+    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
+    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+    // Create fixed stack object for work item ID.
+    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
+    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+  }
+}
+
+bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
+  const Triple &TT = getTargetMachine().getTargetTriple();
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+         AMDGPU::shouldEmitConstantsToTextSection(TT);
+}
+
+bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+              GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+         !shouldEmitFixup(GV) &&
+         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
+  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
+}
+
+/// This transforms the control flow intrinsics to get the branch destination as
+/// last parameter, also switches branch target with BR if the need arise
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  SDLoc DL(BRCOND);
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = nullptr;
+  SDNode *SetCC = nullptr;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition everything is fine
+    SetCC = Intr;
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition
+    BR = findUser(BRCOND, ISD::BR);
+    Target = BR->getOperand(1);
+  }
+
+  // FIXME: This changes the types of the intrinsics instead of introducing new
+  // nodes with the correct types.
+  // e.g. llvm.amdgcn.loop
+
+  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
+  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
+
+  if (!isCFIntrinsic(Intr)) {
+    // This is a uniform branch so we don't need to legalize.
+    return BRCOND;
+  }
+
+  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
+                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+
+  assert(!SetCC ||
+        (SetCC->getConstantOperandVal(1) == 1 &&
+         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+                                                             ISD::SETNE));
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  if (HaveChain)
+    Ops.push_back(BRCOND.getOperand(0));
+
+  Ops.append(Intr->op_begin() + (HaveChain ?  1 : 0), Intr->op_end());
+  Ops.push_back(Target);
+
+  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+    DAG.getVTList(Res), Ops).getNode();
+
+  if (!HaveChain) {
+    SDValue Ops[] =  {
+      SDValue(Result, 0),
+      BRCOND.getOperand(0)
+    };
+
+    Result = DAG.getMergeValues(Ops, DL).getNode();
+  }
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
+    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
+    BR = NewBR.getNode();
+  }
+
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    Chain = DAG.getCopyToReg(
+      Chain, DL,
+      CopyToReg->getOperand(1),
+      SDValue(Result, i - 1),
+      SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+    SDValue(Intr, Intr->getNumValues() - 1),
+    Intr->getOperand(0));
+
+  return Chain;
+}
+
+SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
+                                            SDValue Op,
+                                            const SDLoc &DL,
+                                            EVT VT) const {
+  return Op.getValueType().bitsLE(VT) ?
+      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
+      DAG.getNode(ISD::FTRUNC, DL, VT, Op);
+}
+
+SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f16 &&
+         "Do not know how to custom lower FP_ROUND for non-f16 type");
+
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  if (SrcVT != MVT::f64)
+    return Op;
+
+  SDLoc DL(Op);
+
+  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
+  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);;
+}
+
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL;
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+  assert(UserSGPR != AMDGPU::NoRegister);
+
+  SDValue QueuePtr = CreateLiveInRegister(
+    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+  // Offset into amd_queue_t for group_segment_aperture_base_hi /
+  // private_segment_aperture_base_hi.
+  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+                            DAG.getConstant(StructOffset, SL, MVT::i64));
+
+  // TODO: Use custom target PseudoSourceValue.
+  // TODO: We should use the value from the IR intrinsic call, but it might not
+  // be available and how do we get it?
+  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+                                              AMDGPUAS::CONSTANT_ADDRESS));
+
+  MachinePointerInfo PtrInfo(V, StructOffset);
+  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+                     MinAlign(64, StructOffset),
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+  SDValue Src = ASC->getOperand(0);
+
+  // FIXME: Really support non-0 null pointers.
+  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+  // flat -> local/private
+  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                         NonNull, Ptr, SegmentNullPtr);
+    }
+  }
+
+  // local/private -> flat
+  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue NonNull
+        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+      SDValue CvtPtr
+        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+                         FlatNullPtr);
+    }
+  }
+
+  // global <-> flat are no-ops and never emitted.
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+    *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+  return DAG.getUNDEF(ASC->getValueType(0));
+}
+
+bool
+SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // We can fold offsets for anything that doesn't require a GOT relocation.
+  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+              GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+         !shouldEmitGOTReloc(GA->getGlobal());
+}
+
+static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+                                      SDLoc DL, unsigned Offset, EVT PtrVT,
+                                      unsigned GAFlags = SIInstrInfo::MO_NONE) {
+  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
+  // lowered to the following code sequence:
+  //
+  // For constant address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol
+  //   s_addc_u32 s1, s1, 0
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   a fixup or relocation is emitted to replace $symbol with a literal
+  //   constant, which is a pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // For global address space:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+  //
+  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
+  //   fixups or relocations are emitted to replace $symbol@*@lo and
+  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
+  //   operand to the global variable.
+  //
+  // What we want here is an offset from the value returned by s_getpc
+  // (which is the address of the s_add_u32 instruction) to the global
+  // variable, but since the encoding of $symbol starts 4 bytes after the start
+  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+  // small. This requires us to add 4 to the global variable offset in order to
+  // compute the correct address.
+  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+                                             GAFlags);
+  SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+                                             GAFlags == SIInstrInfo::MO_NONE ?
+                                             GAFlags : GAFlags + 1);
+  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
+}
+
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+      GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  EVT PtrVT = Op.getValueType();
+
+  if (shouldEmitFixup(GV))
+    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+  else if (shouldEmitPCReloc(GV))
+    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
+                                   SIInstrInfo::MO_REL32);
+
+  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+                                            SIInstrInfo::MO_GOTPCREL32);
+
+  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  const DataLayout &DataLayout = DAG.getDataLayout();
+  unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+  // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+                                    SelectionDAG &DAG) const {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+                                   "trap handler not supported",
+                                   Op.getDebugLoc(),
+                                   DS_Warning);
+  DAG.getContext()->diagnose(NoTrap);
+
+  // Emit s_endpgm.
+
+  // FIXME: This should really be selected to s_trap, but that requires
+  // setting up the trap handler for it o do anything.
+  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+                     Op.getOperand(0));
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+                                   const SDLoc &DL, SDValue V) const {
+  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+  // the destination register.
+  //
+  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
+  // so we will end up with redundant moves to m0.
+  //
+  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+
+  // A Null SDValue creates a glue result.
+  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
+                                  V, Chain);
+  return SDValue(M0, 0);
+}
+
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+                                                 SDValue Op,
+                                                 MVT VT,
+                                                 unsigned Offset) const {
+  SDLoc SL(Op);
+  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+                                 DAG.getEntryNode(), Offset, false);
+  // The local size values will have the hi 16-bits as zero.
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+                     DAG.getValueType(VT));
+}
+
+static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+                                      "non-hsa intrinsic with hsa target",
+                                      DL.getDebugLoc());
+  DAG.getContext()->diagnose(BadIntrin);
+  return DAG.getUNDEF(VT);
+}
+
+static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+  DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+                                      "intrinsic not supported on subtarget",
+                                      DL.getDebugLoc());
+  DAG.getContext()->diagnose(BadIntrin);
+  return DAG.getUNDEF(VT);
+}
+
+SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // TODO: Should this propagate fast-math-flags?
+
+  switch (IntrinsicID) {
+  case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_queue_ptr: {
+    if (!Subtarget->isAmdCodeObjectV2()) {
+      DiagnosticInfoUnsupported BadIntrin(
+          *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+          DL.getDebugLoc());
+      DAG.getContext()->diagnose(BadIntrin);
+      return DAG.getUNDEF(VT);
+    }
+
+    auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+      SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+                                TRI->getPreloadedValue(MF, Reg), VT);
+  }
+  case Intrinsic::amdgcn_implicitarg_ptr: {
+    unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+    return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+  }
+  case Intrinsic::amdgcn_kernarg_segment_ptr: {
+    unsigned Reg
+      = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+  }
+  case Intrinsic::amdgcn_dispatch_id: {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+  }
+  case Intrinsic::amdgcn_rcp:
+    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+  case Intrinsic::amdgcn_rsq:
+  case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
+    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+  case Intrinsic::amdgcn_rsq_legacy: {
+    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      return emitRemovedIntrinsicError(DAG, DL, VT);
+
+    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+  }
+  case Intrinsic::amdgcn_rcp_legacy: {
+    if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      return emitRemovedIntrinsicError(DAG, DL, VT);
+    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
+  }
+  case Intrinsic::amdgcn_rsq_clamp: {
+    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+
+    Type *Type = VT.getTypeForEVT(*DAG.getContext());
+    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+                              DAG.getConstantFP(Max, DL, VT));
+    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+                       DAG.getConstantFP(Min, DL, VT));
+  }
+  case Intrinsic::r600_read_ngroups_x:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_X, false);
+  case Intrinsic::r600_read_ngroups_y:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_Y, false);
+  case Intrinsic::r600_read_ngroups_z:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_Z, false);
+  case Intrinsic::r600_read_global_size_x:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+  case Intrinsic::r600_read_global_size_y:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+  case Intrinsic::r600_read_global_size_z:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+  case Intrinsic::r600_read_local_size_x:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
+  case Intrinsic::r600_read_local_size_y:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
+  case Intrinsic::r600_read_local_size_z:
+    if (Subtarget->isAmdHsaOS())
+      return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+    return lowerImplicitZextParam(DAG, Op, MVT::i16,
+                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
+  case Intrinsic::amdgcn_workgroup_id_x:
+  case Intrinsic::r600_read_tgid_x:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::r600_read_tidig_x:
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+  case AMDGPUIntrinsic::SI_load_const: {
+    SDValue Ops[] = {
+      Op.getOperand(1),
+      Op.getOperand(2)
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+    return lowerFDIV_FAST(Op, DAG);
+  }
+  case AMDGPUIntrinsic::SI_vs_load_input:
+    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                       Op.getOperand(1),
+                       Op.getOperand(2),
+                       Op.getOperand(3));
+
+  case AMDGPUIntrinsic::SI_fs_constant: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
+    SDValue Glue = M0.getValue(1);
+    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
+                       DAG.getConstant(2, DL, MVT::i32), // P0
+                       Op.getOperand(1), Op.getOperand(2), Glue);
+  }
+  case AMDGPUIntrinsic::SI_packf16:
+    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+      return DAG.getUNDEF(MVT::i32);
+    return Op;
+  case AMDGPUIntrinsic::SI_fs_interp: {
+    SDValue IJ = Op.getOperand(4);
+    SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
+                            DAG.getConstant(0, DL, MVT::i32));
+    SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
+                            DAG.getConstant(1, DL, MVT::i32));
+    I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
+    J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
+    SDValue Glue = M0.getValue(1);
+    SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
+                             DAG.getVTList(MVT::f32, MVT::Glue),
+                             I, Op.getOperand(1), Op.getOperand(2), Glue);
+    Glue = SDValue(P1.getNode(), 1);
+    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
+                             Op.getOperand(1), Op.getOperand(2), Glue);
+  }
+  case Intrinsic::amdgcn_interp_mov: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+    SDValue Glue = M0.getValue(1);
+    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3), Glue);
+  }
+  case Intrinsic::amdgcn_interp_p1: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+    SDValue Glue = M0.getValue(1);
+    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3), Glue);
+  }
+  case Intrinsic::amdgcn_interp_p2: {
+    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+    SDValue Glue = SDValue(M0.getNode(), 1);
+    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+                       Glue);
+  }
+  case Intrinsic::amdgcn_sin:
+    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+
+  case Intrinsic::amdgcn_cos:
+    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+
+  case Intrinsic::amdgcn_log_clamp: {
+    if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+      return SDValue();
+
+    DiagnosticInfoUnsupported BadIntrin(
+      *MF.getFunction(), "intrinsic not supported on subtarget",
+      DL.getDebugLoc());
+      DAG.getContext()->diagnose(BadIntrin);
+      return DAG.getUNDEF(VT);
+  }
+  case Intrinsic::amdgcn_ldexp:
+    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::amdgcn_fract:
+    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+  case Intrinsic::amdgcn_class:
+    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::amdgcn_div_fmas:
+    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       Op.getOperand(4));
+
+  case Intrinsic::amdgcn_div_fixup:
+    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::amdgcn_trig_preop:
+    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::amdgcn_div_scale: {
+    // 3rd parameter required to be a constant.
+    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+    if (!Param)
+      return DAG.getUNDEF(VT);
+
+    // Translate to the operands expected by the machine instruction. The
+    // first parameter must be the same as the first instruction.
+    SDValue Numerator = Op.getOperand(1);
+    SDValue Denominator = Op.getOperand(2);
+
+    // Note this order is opposite of the machine instruction's operations,
+    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+    // intrinsic has the numerator as the first operand to match a normal
+    // division operation.
+
+    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+                       Denominator, Numerator);
+  }
+  case Intrinsic::amdgcn_icmp: {
+    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+    int CondCode = CD->getSExtValue();
+
+    if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
+        CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+      return DAG.getUNDEF(VT);
+
+    ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
+    ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
+    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
+  }
+  case Intrinsic::amdgcn_fcmp: {
+    const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+    int CondCode = CD->getSExtValue();
+
+    if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
+        CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+      return DAG.getUNDEF(VT);
+
+    FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
+    ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
+    return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
+                       Op.getOperand(2), DAG.getCondCode(CCOpcode));
+  }
+  case Intrinsic::amdgcn_fmul_legacy:
+    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::amdgcn_sffbh:
+  case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
+    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+  default:
+    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  }
+}
+
+SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  SDLoc DL(Op);
+  switch (IntrID) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
+      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+    SDValue Ops[] = {
+      M->getOperand(0), // Chain
+      M->getOperand(2), // Ptr
+      M->getOperand(3)  // Value
+    };
+
+    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_buffer_load:
+  case Intrinsic::amdgcn_buffer_load_format: {
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Op.getOperand(4), // offset
+      Op.getOperand(5), // glc
+      Op.getOperand(6)  // slc
+    };
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(MFI->getBufferPSV()),
+      MachineMemOperand::MOLoad,
+      VT.getStoreSize(), VT.getStoreSize());
+
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+  switch (IntrinsicID) {
+  case AMDGPUIntrinsic::SI_sendmsg: {
+    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+    SDValue Glue = Chain.getValue(1);
+    return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
+                       Op.getOperand(2), Glue);
+  }
+  case AMDGPUIntrinsic::SI_tbuffer_store: {
+    SDValue Ops[] = {
+      Chain,
+      Op.getOperand(2),
+      Op.getOperand(3),
+      Op.getOperand(4),
+      Op.getOperand(5),
+      Op.getOperand(6),
+      Op.getOperand(7),
+      Op.getOperand(8),
+      Op.getOperand(9),
+      Op.getOperand(10),
+      Op.getOperand(11),
+      Op.getOperand(12),
+      Op.getOperand(13),
+      Op.getOperand(14)
+    };
+
+    EVT VT = Op.getOperand(3).getValueType();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOStore,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+  case AMDGPUIntrinsic::AMDGPU_kill: {
+    SDValue Src = Op.getOperand(2);
+    if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
+      if (!K->isNegative())
+        return Chain;
+
+      SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
+      return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
+    }
+
+    SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
+    return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
+  }
+  case AMDGPUIntrinsic::SI_export: {
+    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
+    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
+    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
+    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
+    const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+
+    const SDValue Ops[] = {
+      Chain,
+      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
+      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
+      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
+      DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
+      Op.getOperand(7), // src0
+      Op.getOperand(8), // src1
+      Op.getOperand(9), // src2
+      Op.getOperand(10) // src3
+    };
+
+    unsigned Opc = Done->isNullValue() ?
+      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  ISD::LoadExtType ExtType = Load->getExtensionType();
+  EVT MemVT = Load->getMemoryVT();
+
+  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+    // FIXME: Copied from PPC
+    // First, load into 32 bits, then truncate to 1 bit.
+
+    SDValue Chain = Load->getChain();
+    SDValue BasePtr = Load->getBasePtr();
+    MachineMemOperand *MMO = Load->getMemOperand();
+
+    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
+
+    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                   BasePtr, RealMemVT, MMO);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+      NewLD.getValue(1)
+    };
+
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  if (!MemVT.isVector())
+    return SDValue();
+
+  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+         "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+  unsigned AS = Load->getAddressSpace();
+  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                          AS, Load->getAlignment())) {
+    SDValue Ops[2];
+    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // If there is a possibilty that flat instruction access scratch memory
+  // then we need to use the same legalization rules we use for private.
+  if (AS == AMDGPUAS::FLAT_ADDRESS)
+    AS = MFI->hasFlatScratchInit() ?
+         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+
+  unsigned NumElements = MemVT.getVectorNumElements();
+  switch (AS) {
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    if (isMemOpUniform(Load))
+      return SDValue();
+    // Non-uniform loads will be selected to MUBUF instructions, so they
+    // have the same legalization requirements as global and private
+    // loads.
+    //
+    LLVM_FALLTHROUGH;
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
+                  isMemOpHasNoClobberedMemOperand(Load))
+      return SDValue();
+    // Non-uniform loads will be selected to MUBUF instructions, so they
+    // have the same legalization requirements as global and private
+    // loads.
+    //
+  }
+    LLVM_FALLTHROUGH;
+  case AMDGPUAS::FLAT_ADDRESS:
+    if (NumElements > 4)
+      return SplitVectorLoad(Op, DAG);
+    // v4 loads are supported for private and global memory.
+    return SDValue();
+  case AMDGPUAS::PRIVATE_ADDRESS: {
+    // Depending on the setting of the private_element_size field in the
+    // resource descriptor, we can only make private accesses up to a certain
+    // size.
+    switch (Subtarget->getMaxPrivateElementSize()) {
+    case 4:
+      return scalarizeVectorLoad(Load, DAG);
+    case 8:
+      if (NumElements > 2)
+        return SplitVectorLoad(Op, DAG);
+      return SDValue();
+    case 16:
+      // Same as global/flat
+      if (NumElements > 4)
+        return SplitVectorLoad(Op, DAG);
+      return SDValue();
+    default:
+      llvm_unreachable("unsupported private_element_size");
+    }
+  }
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorLoad(Op, DAG);
+
+    if (NumElements == 2)
+      return SDValue();
+
+    // If properly aligned, if we split we might be able to use ds_read_b64.
+    return SplitVectorLoad(Op, DAG);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Cond = Op.getOperand(0);
+
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  SDValue One = DAG.getConstant(1, DL, MVT::i32);
+
+  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
+  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
+
+  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
+
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
+
+  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
+
+  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
+  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+}
+
+// Catch division cases where we can use shortcuts with rcp and rsq
+// instructions.
+SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+        VT == MVT::f16) {
+      if (CLHS->isExactlyValue(1.0)) {
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation has a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+        // 1.0 / sqrt(x) -> rsq(x)
+
+        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+        // error seems really high at 2^29 ULP.
+        if (RHS.getOpcode() == ISD::FSQRT)
+          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+        // 1.0 / x -> rcp(x)
+        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+      }
+
+      // Same as for 1.0, but expand the sign out of the constant.
+      if (CLHS->isExactlyValue(-1.0)) {
+        // -1.0 / x -> rcp (fneg x)
+        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+      }
+    }
+  }
+
+  const SDNodeFlags *Flags = Op->getFlags();
+
+  if (Unsafe || Flags->hasAllowReciprocal()) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    SDNodeFlags Flags;
+    Flags.setUnsafeAlgebra(true);
+    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMUL:
+    Opcode = AMDGPUISD::FMUL_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+                     GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+                           EVT VT, SDValue A, SDValue B, SDValue C,
+                           SDValue GlueChain) {
+  if (GlueChain->getNumValues() <= 1) {
+    return DAG.getNode(Opcode, SL, VT, A, B, C);
+  }
+
+  assert(GlueChain->getNumValues() == 3);
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+  switch (Opcode) {
+  default: llvm_unreachable("no chain equivalent for opcode");
+  case ISD::FMA:
+    Opcode = AMDGPUISD::FMA_W_CHAIN;
+    break;
+  }
+
+  return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+                     GlueChain.getValue(2));
+}
+
+SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDLoc SL(Op);
+  SDValue Src0 = Op.getOperand(0);
+  SDValue Src1 = Op.getOperand(1);
+
+  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+
+  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
+  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+
+  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+}
+
+// Faster 2.5 ULP division that does not support denormals.
+SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+  EVT SetCCVT =
+    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  // TODO: Should this propagate fast-math-flags?
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  // rcp does not support denormals.
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
+
+  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                          RHS, RHS, LHS);
+  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+                                        LHS, RHS, LHS);
+
+  // Denominator is scaled to not be denormal, so using rcp is ok.
+  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+                                  DenominatorScaled);
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+                                     DenominatorScaled);
+
+  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
+  if (!Subtarget->hasFP32Denormals()) {
+    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+                                                      SL, MVT::i32);
+    SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+                                       DAG.getEntryNode(),
+                                       EnableDenormValue, BitField);
+    SDValue Ops[3] = {
+      NegDivScale0,
+      EnableDenorm.getValue(0),
+      EnableDenorm.getValue(1)
+    };
+
+    NegDivScale0 = DAG.getMergeValues(Ops, SL);
+  }
+
+  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+                             ApproxRcp, One, NegDivScale0);
+
+  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+                             ApproxRcp, Fma0);
+
+  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+                           Fma1, Fma1);
+
+  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+                             NumeratorScaled, Mul);
+
+  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+
+  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+                             NumeratorScaled, Fma3);
+
+  if (!Subtarget->hasFP32Denormals()) {
+    const SDValue DisableDenormValue =
+        DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+    SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+                                        Fma4.getValue(1),
+                                        DisableDenormValue,
+                                        BitField,
+                                        Fma4.getValue(2));
+
+    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                                      DisableDenorm, DAG.getRoot());
+    DAG.setRoot(OutputChain);
+  }
+
+  SDValue Scale = NumeratorScaled.getValue(1);
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+                             Fma4, Fma1, Fma3, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+}
+
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return lowerFastUnsafeFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
+
+  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
+
+  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
+
+  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
+
+  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
+
+  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
+
+  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
+
+  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
+
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
+
+  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
+                             NegDivScale0, Mul, DivScale1);
+
+  SDValue Scale;
+
+  if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+    // Workaround a hardware bug on SI where the condition output from div_scale
+    // is not usable.
+
+    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
+
+    // Figure out if the scale to use for div_fmas.
+    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
+    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
+    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
+
+    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
+    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
+
+    SDValue Scale0Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
+    SDValue Scale1Hi
+      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
+
+    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
+    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
+    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
+  } else {
+    Scale = DivScale1.getValue(1);
+  }
+
+  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
+                             Fma4, Fma3, Mul, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
+}
+
+SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFDIV32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFDIV64(Op, DAG);
+
+  if (VT == MVT::f16)
+    return LowerFDIV16(Op, DAG);
+
+  llvm_unreachable("Unexpected type for fdiv");
+}
+
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  if (VT == MVT::i1) {
+    return DAG.getTruncStore(Store->getChain(), DL,
+       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+  }
+
+  assert(VT.isVector() &&
+         Store->getValue().getValueType().getScalarType() == MVT::i32);
+
+  unsigned AS = Store->getAddressSpace();
+  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                          AS, Store->getAlignment())) {
+    return expandUnalignedStore(Store, DAG);
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // If there is a possibilty that flat instruction access scratch memory
+  // then we need to use the same legalization rules we use for private.
+  if (AS == AMDGPUAS::FLAT_ADDRESS)
+    AS = MFI->hasFlatScratchInit() ?
+         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+
+  unsigned NumElements = VT.getVectorNumElements();
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+    if (NumElements > 4)
+      return SplitVectorStore(Op, DAG);
+    return SDValue();
+  case AMDGPUAS::PRIVATE_ADDRESS: {
+    switch (Subtarget->getMaxPrivateElementSize()) {
+    case 4:
+      return scalarizeVectorStore(Store, DAG);
+    case 8:
+      if (NumElements > 2)
+        return SplitVectorStore(Op, DAG);
+      return SDValue();
+    case 16:
+      if (NumElements > 4)
+        return SplitVectorStore(Op, DAG);
+      return SDValue();
+    default:
+      llvm_unreachable("unsupported private_element_size");
+    }
+  }
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorStore(Op, DAG);
+
+    if (NumElements == 2)
+      return Op;
+
+    // If properly aligned, if we split we might be able to use ds_write_b64.
+    return SplitVectorStore(Op, DAG);
+  }
+  default:
+    llvm_unreachable("unhandled address space");
+  }
+}
+
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  // TODO: Should this propagate fast-math-flags?
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
+                                  DAG.getNode(ISD::FMUL, DL, VT, Arg,
+                                              DAG.getConstantFP(0.5/M_PI, DL,
+                                                                VT)));
+
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+}
+
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+  assert(AtomicNode->isCompareAndSwap());
+  unsigned AS = AtomicNode->getAddressSpace();
+
+  // No custom lowering required for local address space
+  if (!isFlatGlobalAddrSpace(AS))
+    return Op;
+
+  // Non-local address space requires custom lowering for atomic compare
+  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
+  SDLoc DL(Op);
+  SDValue ChainIn = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  SDValue Old = Op.getOperand(2);
+  SDValue New = Op.getOperand(3);
+  EVT VT = Op.getValueType();
+  MVT SimpleVT = VT.getSimpleVT();
+  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
+  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
+  SDValue Ops[] = { ChainIn, Addr, NewOld };
+
+  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
+                                 Ops, VT, AtomicNode->getMemOperand());
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT != MVT::f32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
+    }
+  }
+
+  return SDValue();
+}
+
+/// \brief Return true if the given offset Size in bytes can be folded into
+/// the immediate offsets of a memory instruction for the given address space.
+static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
+                          const SISubtarget &STI) {
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    // MUBUF instructions a 12-bit offset in bytes.
+    return isUInt<12>(OffsetSize);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // SMRD instructions have an 8-bit offset in dwords on SI and
+    // a 20-bit offset in bytes on VI.
+    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      return isUInt<20>(OffsetSize);
+    else
+      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // The single offset versions have a 16-bit offset in bytes.
+    return isUInt<16>(OffsetSize);
+  }
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  // Indirect register addressing does not use any offsets.
+  default:
+    return 0;
+  }
+}
+
+// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+
+// This is a variant of
+// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
+//
+// The normal DAG combiner will do this, but only if the add has one use since
+// that would increase the number of instructions.
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of new constant offset. This eliminates one of the uses,
+// and may allow the remaining use to also be simplified.
+//
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+                                               unsigned AddrSpace,
+                                               DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+  if (!CN1)
+    return SDValue();
+
+  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!CAdd)
+    return SDValue();
+
+  // If the resulting offset is too large, we can't fold it into the addressing
+  // mode offset.
+  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
+
+  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
+
+SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue Ptr = N->getBasePtr();
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  // TODO: We could also do this for multiplies.
+  unsigned AS = N->getAddressSpace();
+  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+    if (NewPtr) {
+      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+
+      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+    }
+  }
+
+  return SDValue();
+}
+
+static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
+  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
+         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
+         (Opc == ISD::XOR && Val == 0);
+}
+
+// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
+// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
+// integer combine opportunities since most 64-bit operations are decomposed
+// this way.  TODO: We won't want this for SALU especially if it is an inline
+// immediate.
+SDValue SITargetLowering::splitBinaryBitConstantOp(
+  DAGCombinerInfo &DCI,
+  const SDLoc &SL,
+  unsigned Opc, SDValue LHS,
+  const ConstantSDNode *CRHS) const {
+  uint64_t Val = CRHS->getZExtValue();
+  uint32_t ValLo = Lo_32(Val);
+  uint32_t ValHi = Hi_32(Val);
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+    if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
+         bitOpWithConstantIsReducible(Opc, ValHi)) ||
+        (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
+    // If we need to materialize a 64-bit immediate, it will be split up later
+    // anyway. Avoid creating the harder to understand 64-bit immediate
+    // materialization.
+    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performAndCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+
+  if (VT == MVT::i64) {
+    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (CRHS) {
+      if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+        return Split;
+    }
+  }
+
+  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
+  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
+  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
+    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+
+    SDValue X = LHS.getOperand(0);
+    SDValue Y = RHS.getOperand(0);
+    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
+      return SDValue();
+
+    if (LCC == ISD::SETO) {
+      if (X != LHS.getOperand(1))
+        return SDValue();
+
+      if (RCC == ISD::SETUNE) {
+        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+        if (!C1 || !C1->isInfinity() || C1->isNegative())
+          return SDValue();
+
+        const uint32_t Mask = SIInstrFlags::N_NORMAL |
+                              SIInstrFlags::N_SUBNORMAL |
+                              SIInstrFlags::N_ZERO |
+                              SIInstrFlags::P_ZERO |
+                              SIInstrFlags::P_SUBNORMAL |
+                              SIInstrFlags::P_NORMAL;
+
+        static_assert(((~(SIInstrFlags::S_NAN |
+                          SIInstrFlags::Q_NAN |
+                          SIInstrFlags::N_INFINITY |
+                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
+                      "mask not equal");
+
+        SDLoc DL(N);
+        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                           X, DAG.getConstant(Mask, DL, MVT::i32));
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i1) {
+    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+      SDValue Src = LHS.getOperand(0);
+      if (Src != RHS.getOperand(0))
+        return SDValue();
+
+      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+      if (!CLHS || !CRHS)
+        return SDValue();
+
+      // Only 10 bits are used.
+      static const uint32_t MaxMask = 0x3ff;
+
+      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+      SDLoc DL(N);
+      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
+                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
+    }
+
+    return SDValue();
+  }
+
+  if (VT != MVT::i64)
+    return SDValue();
+
+  // TODO: This could be a generic combine with a predicate for extracting the
+  // high half of an integer being free.
+
+  // (or i64:x, (zero_extend i32:y)) ->
+  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+      RHS.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(LHS, RHS);
+
+  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+    SDValue ExtSrc = RHS.getOperand(0);
+    EVT SrcVT = ExtSrc.getValueType();
+    if (SrcVT == MVT::i32) {
+      SDLoc SL(N);
+      SDValue LowLHS, HiBits;
+      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+      DCI.AddToWorklist(LowOr.getNode());
+      DCI.AddToWorklist(HiBits.getNode());
+
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                LowOr, HiBits);
+      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+    }
+  }
+
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (CRHS) {
+    if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+      return Split;
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performXorCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (CRHS) {
+    if (SDValue Split
+          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
+      return Split;
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Mask = N->getOperand(1);
+
+  // fp_class x, 0 -> false
+  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+    if (CMask->isNullValue())
+      return DAG.getConstant(0, SDLoc(N), MVT::i1);
+  }
+
+  if (N->getOperand(0).isUndef())
+    return DAG.getUNDEF(MVT::i1);
+
+  return SDValue();
+}
+
+// Constant fold canonicalize.
+SDValue SITargetLowering::performFCanonicalizeCombine(
+  SDNode *N,
+  DAGCombinerInfo &DCI) const {
+  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CFP)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  const APFloat &C = CFP->getValueAPF();
+
+  // Flush denormals to 0 if not enabled.
+  if (C.isDenormal()) {
+    EVT VT = N->getValueType(0);
+    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+    if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+    if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+      return DAG.getConstantFP(0.0, SDLoc(N), VT);
+  }
+
+  if (C.isNaN()) {
+    EVT VT = N->getValueType(0);
+    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+    if (C.isSignaling()) {
+      // Quiet a signaling NaN.
+      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+    }
+
+    // Make sure it is the canonical NaN bitpattern.
+    //
+    // TODO: Can we use -1 as the canonical NaN value since it's an inline
+    // immediate?
+    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+  }
+
+  return SDValue(CFP, 0);
+}
+
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return AMDGPUISD::FMAX3;
+  case ISD::SMAX:
+    return AMDGPUISD::SMAX3;
+  case ISD::UMAX:
+    return AMDGPUISD::UMAX3;
+  case ISD::FMINNUM:
+    return AMDGPUISD::FMIN3;
+  case ISD::SMIN:
+    return AMDGPUISD::SMIN3;
+  case ISD::UMIN:
+    return AMDGPUISD::UMIN3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                        SDValue Op0, SDValue Op1, bool Signed) {
+  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+  if (!K1)
+    return SDValue();
+
+  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+  if (!K0)
+    return SDValue();
+
+  if (Signed) {
+    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+      return SDValue();
+  } else {
+    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+      return SDValue();
+  }
+
+  EVT VT = K0->getValueType(0);
+
+  MVT NVT = MVT::i32;
+  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+  SDValue Tmp1, Tmp2, Tmp3;
+  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+
+  if (VT == MVT::i16) {
+    Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
+                       Tmp1, Tmp2, Tmp3);
+
+    return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
+  } else
+    return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+  if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+    return true;
+
+  return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                       SDValue Op0, SDValue Op1) {
+  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+  if (!K1)
+    return SDValue();
+
+  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+  if (!K0)
+    return SDValue();
+
+  // Ordered >= (although NaN inputs should have folded away by now).
+  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+  if (Cmp == APFloat::cmpGreaterThan)
+    return SDValue();
+
+  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+  // give the other result, which is different from med3 with a NaN input.
+  SDValue Var = Op0.getOperand(0);
+  if (!isKnownNeverSNan(DAG, Var))
+    return SDValue();
+
+  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+                     Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use since this will just increases
+  // register pressure for no benefit.
+
+  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+    // max(max(a, b), c) -> max3(a, b, c)
+    // min(min(a, b), c) -> min3(a, b, c)
+    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+      SDLoc DL(N);
+      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                         DL,
+                         N->getValueType(0),
+                         Op0.getOperand(0),
+                         Op0.getOperand(1),
+                         Op1);
+    }
+
+    // Try commuted.
+    // max(a, max(b, c)) -> max3(a, b, c)
+    // min(a, min(b, c)) -> min3(a, b, c)
+    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+      SDLoc DL(N);
+      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                         DL,
+                         N->getValueType(0),
+                         Op0,
+                         Op1.getOperand(0),
+                         Op1.getOperand(1));
+    }
+  }
+
+  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
+    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+      return Med3;
+  }
+
+  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+      return Med3;
+  }
+
+  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+       (Opc == AMDGPUISD::FMIN_LEGACY &&
+        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
+      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
+      return Res;
+  }
+
+  return SDValue();
+}
+
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+                                          const SDNode *N0,
+                                          const SDNode *N1) const {
+  EVT VT = N0->getValueType(0);
+
+  // Only do this if we are not trying to support denormals. v_mad_f32 does not
+  // support denormals ever.
+  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+      (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+    return ISD::FMAD;
+
+  const TargetOptions &Options = DAG.getTarget().Options;
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
+       Options.UnsafeFPMath ||
+       (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
+        cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
+      isFMAFasterThanFMulAndFAdd(VT)) {
+    return ISD::FMA;
+  }
+
+  return 0;
+}
+
+SDValue SITargetLowering::performFAddCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  assert(!VT.isVector());
+
+  SDLoc SL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // These should really be instruction patterns, but writing patterns with
+  // source modiifiers is a pain.
+
+  // fadd (fadd (a, a), b) -> mad 2.0, a, b
+  if (LHS.getOpcode() == ISD::FADD) {
+    SDValue A = LHS.getOperand(0);
+    if (A == LHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
+      }
+    }
+  }
+
+  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
+  if (RHS.getOpcode() == ISD::FADD) {
+    SDValue A = RHS.getOperand(0);
+    if (A == RHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performFSubCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+  assert(!VT.isVector());
+
+  // Try to get the fneg to fold into the source modifier. This undoes generic
+  // DAG combines and folds them into the mad.
+  //
+  // Only do this if we are not trying to support denormals. v_mad_f32 does
+  // not support denormals ever.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (LHS.getOpcode() == ISD::FADD) {
+    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+    SDValue A = LHS.getOperand(0);
+    if (A == LHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
+      if (FusedOp != 0){
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+
+        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
+      }
+    }
+  }
+
+  if (RHS.getOpcode() == ISD::FADD) {
+    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+
+    SDValue A = RHS.getOperand(0);
+    if (A == RHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
+      if (FusedOp != 0){
+        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performSetCCCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
+                                           VT != MVT::f16))
+    return SDValue();
+
+  // Match isinf pattern
+  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
+    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+    if (!CRHS)
+      return SDValue();
+
+    const APFloat &APF = CRHS->getValueAPF();
+    if (APF.isInfinity() && !APF.isNegative()) {
+      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
+      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
+                         DAG.getConstant(Mask, SL, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+  SDValue Src = N->getOperand(0);
+  SDValue Srl = N->getOperand(0);
+  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
+    Srl = Srl.getOperand(0);
+
+  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+  if (Srl.getOpcode() == ISD::SRL) {
+    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+
+    if (const ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
+                               EVT(MVT::i32));
+
+      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
+      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
+        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
+                           MVT::f32, Srl);
+      }
+    }
+  }
+
+  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                        !DCI.isBeforeLegalizeOps());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+      TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+    DCI.CommitTargetLoweringOpt(TLO);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+  case ISD::FADD:
+    return performFAddCombine(N, DCI);
+  case ISD::FSUB:
+    return performFSubCombine(N, DCI);
+  case ISD::SETCC:
+    return performSetCCCombine(N, DCI);
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN:
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY: {
+    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        N->getValueType(0) != MVT::f64 &&
+        getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return performMinMaxCombine(N, DCI);
+    break;
+  }
+  case ISD::LOAD:
+  case ISD::STORE:
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case AMDGPUISD::ATOMIC_INC:
+  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
+    if (DCI.isBeforeLegalize())
+      break;
+    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
+  }
+  case ISD::AND:
+    return performAndCombine(N, DCI);
+  case ISD::OR:
+    return performOrCombine(N, DCI);
+  case ISD::XOR:
+    return performXorCombine(N, DCI);
+  case AMDGPUISD::FP_CLASS:
+    return performClassCombine(N, DCI);
+  case ISD::FCANONICALIZE:
+    return performFCanonicalizeCombine(N, DCI);
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::RSQ_LEGACY:
+  case AMDGPUISD::RSQ_CLAMP:
+  case AMDGPUISD::LDEXP: {
+    SDValue Src = N->getOperand(0);
+    if (Src.isUndef())
+      return Src;
+    break;
+  }
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performUCharToFloatCombine(N, DCI);
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3:
+    return performCvtF32UByteNCombine(N, DCI);
+  }
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+/// \brief Helper function for adjustWritemask
+static unsigned SubIdx2Lane(unsigned Idx) {
+  switch (Idx) {
+  default: return 0;
+  case AMDGPU::sub0: return 0;
+  case AMDGPU::sub1: return 1;
+  case AMDGPU::sub2: return 2;
+  case AMDGPU::sub3: return 3;
+  }
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                       SelectionDAG &DAG) const {
+  SDNode *Users[4] = { };
+  unsigned Lane = 0;
+  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
+  unsigned NewDmask = 0;
+
+  // Try to figure out the used register components
+  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+       I != E; ++I) {
+
+    // Abort if we can't understand the usage
+    if (!I->isMachineOpcode() ||
+        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return;
+
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
+    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
+    // Abort if we have more than one user per component
+    if (Users[Lane])
+      return;
+
+    Users[Lane] = *I;
+    NewDmask |= 1 << Comp;
+  }
+
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
+    return;
+
+  // Adjust the writemask in the node
+  std::vector<SDValue> Ops;
+  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
+  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
+  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
+
+  // If we only got one lane, replace it with a copy
+  // (if NewDmask has only one bit set...)
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
+    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
+                                       MVT::i32);
+    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                      SDLoc(), Users[Lane]->getValueType(0),
+                                      SDValue(Node, 0), RC);
+    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
+    return;
+  }
+
+  // Update the users of the node with the new indices
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+    SDNode *User = Users[i];
+    if (!User)
+      continue;
+
+    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+    switch (Idx) {
+    default: break;
+    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    }
+  }
+}
+
+static bool isFrameIndexOp(SDValue Op) {
+  if (Op.getOpcode() == ISD::AssertZext)
+    Op = Op.getOperand(0);
+
+  return isa<FrameIndexSDNode>(Op);
+}
+
+/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// with frame index operands.
+/// LLVM assumes that inputs are to these instructions are registers.
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
+    if (!isFrameIndexOp(Node->getOperand(i))) {
+      Ops.push_back(Node->getOperand(i));
+      continue;
+    }
+
+    SDLoc DL(Node);
+    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
+                                     Node->getOperand(i).getValueType(),
+                                     Node->getOperand(i)), 0));
+  }
+
+  DAG.UpdateNodeOperands(Node, Ops);
+}
+
+/// \brief Fold the instructions after selecting them.
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  unsigned Opcode = Node->getMachineOpcode();
+
+  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+      !TII->isGather4(Opcode))
+    adjustWritemask(Node, DAG);
+
+  if (Opcode == AMDGPU::INSERT_SUBREG ||
+      Opcode == AMDGPU::REG_SEQUENCE) {
+    legalizeTargetIndependentNode(Node, DAG);
+    return Node;
+  }
+  return Node;
+}
+
+/// \brief Assign the register class depending on the number of
+/// bits set in the writemask
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                     SDNode *Node) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  if (TII->isVOP3(MI.getOpcode())) {
+    // Make sure constant bus requirements are respected.
+    TII->legalizeOperandsVOP3(MRI, MI);
+    return;
+  }
+
+  if (TII->isMIMG(MI)) {
+    unsigned VReg = MI.getOperand(0).getReg();
+    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+    // TODO: Need mapping tables to handle other cases (register classes).
+    if (RC != &AMDGPU::VReg_128RegClass)
+      return;
+
+    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
+    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
+    unsigned BitsSet = 0;
+    for (unsigned i = 0; i < 4; ++i)
+      BitsSet += Writemask & (1 << i) ? 1 : 0;
+    switch (BitsSet) {
+    default: return;
+    case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
+    case 2:  RC = &AMDGPU::VReg_64RegClass; break;
+    case 3:  RC = &AMDGPU::VReg_96RegClass; break;
+    }
+
+    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
+    MI.setDesc(TII->get(NewOpcode));
+    MRI.setRegClass(VReg, RC);
+    return;
+  }
+
+  // Replace unused atomics with the no return version.
+  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
+  if (NoRetAtomicOp != -1) {
+    if (!Node->hasAnyUseOfValue(0)) {
+      MI.setDesc(TII->get(NoRetAtomicOp));
+      MI.RemoveOperand(0);
+      return;
+    }
+
+    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+    // instruction, because the return type of these instructions is a vec2 of
+    // the memory type, so it can be tied to the input operand.
+    // This means these instructions always have a use, so we need to add a
+    // special case to check if the atomic has only one extract_subreg use,
+    // which itself has no uses.
+    if ((Node->hasNUsesOfValue(1, 0) &&
+         Node->use_begin()->isMachineOpcode() &&
+         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+         !Node->use_begin()->hasAnyUseOfValue(0))) {
+      unsigned Def = MI.getOperand(0).getReg();
+
+      // Change this into a noret atomic.
+      MI.setDesc(TII->get(NoRetAtomicOp));
+      MI.RemoveOperand(0);
+
+      // If we only remove the def operand from the atomic instruction, the
+      // extract_subreg will be left with a use of a vreg without a def.
+      // So we need to insert an implicit_def to avoid machine verifier
+      // errors.
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+              TII->get(AMDGPU::IMPLICIT_DEF), Def);
+    }
+    return;
+  }
+}
+
+static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
+                              uint64_t Val) {
+  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
+  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
+}
+
+MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
+                                                const SDLoc &DL,
+                                                SDValue Ptr) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  // Build the half of the subregister with the constants before building the
+  // full 128-bit register. If we are building multiple resource descriptors,
+  // this will allow CSEing of the 2-component register.
+  const SDValue Ops0[] = {
+    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, 0),
+    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+  };
+
+  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::v2i32, Ops0), 0);
+
+  // Combine the constants and the pointer.
+  const SDValue Ops1[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    Ptr,
+    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+    SubRegHi,
+    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+  };
+
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to
+///        the resource pointer.
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
+                                           SDValue Ptr, uint32_t RsrcDword1,
+                                           uint64_t RsrcDword2And3) const {
+  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+  if (RsrcDword1) {
+    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
+                                     DAG.getConstant(RsrcDword1, DL, MVT::i32)),
+                    0);
+  }
+
+  SDValue DataLo = buildSMovImm32(DAG, DL,
+                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
+  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
+
+  const SDValue Ops[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    PtrLo,
+    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+    PtrHi,
+    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
+    DataLo,
+    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
+    DataHi,
+    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
+  };
+
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+}
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                               const TargetRegisterClass *RC,
+                                               unsigned Reg, EVT VT) const {
+  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
+                            cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                         SI Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass *>
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                               StringRef Constraint,
+                                               MVT VT) const {
+  if (!isTypeLegal(VT))
+    return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 's':
+    case 'r':
+      switch (VT.getSizeInBits()) {
+      default:
+        return std::make_pair(0U, nullptr);
+      case 32:
+      case 16:
+        return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+      case 64:
+        return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+      case 128:
+        return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+      case 256:
+        return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+      }
+
+    case 'v':
+      switch (VT.getSizeInBits()) {
+      default:
+        return std::make_pair(0U, nullptr);
+      case 32:
+      case 16:
+        return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+      case 64:
+        return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+      case 96:
+        return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+      case 128:
+        return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+      case 256:
+        return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+      case 512:
+        return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+      }
+    }
+  }
+
+  if (Constraint.size() > 1) {
+    const TargetRegisterClass *RC = nullptr;
+    if (Constraint[1] == 'v') {
+      RC = &AMDGPU::VGPR_32RegClass;
+    } else if (Constraint[1] == 's') {
+      RC = &AMDGPU::SGPR_32RegClass;
+    }
+
+    if (RC) {
+      uint32_t Idx;
+      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
+      if (!Failed && Idx < RC->getNumRegs())
+        return std::make_pair(RC->getRegister(Idx), RC);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+SITargetLowering::ConstraintType
+SITargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 's':
+    case 'v':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
new file mode 100644
index 000000000000..9583f6db6faa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -0,0 +1,200 @@
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+class SITargetLowering final : public AMDGPUTargetLowering {
+  SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
+                            unsigned Offset) const;
+  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
+                         SDValue Chain, unsigned Offset, bool Signed) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const override;
+  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+                                 MVT VT, unsigned Offset) const;
+
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Converts \p Op, which must be of floating point type, to the
+  /// floating point type \p VT, by either extending or truncating it.
+  SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
+                            SDValue Op,
+                            const SDLoc &DL,
+                            EVT VT) const;
+
+  /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
+  SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+  SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
+
+  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+
+  SDValue performUCharToFloatCombine(SDNode *N,
+                                     DAGCombinerInfo &DCI) const;
+  SDValue performSHLPtrCombine(SDNode *N,
+                               unsigned AS,
+                               DAGCombinerInfo &DCI) const;
+
+  SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
+
+  SDValue splitBinaryBitConstantOp(DAGCombinerInfo &DCI, const SDLoc &SL,
+                                   unsigned Opc, SDValue LHS,
+                                   const ConstantSDNode *CRHS) const;
+
+  SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  unsigned getFusedOpcode(const SelectionDAG &DAG,
+                          const SDNode *N0, const SDNode *N1) const;
+  SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+  bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
+
+  bool isCFIntrinsic(const SDNode *Intr) const;
+
+  void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
+
+  /// \returns True if fixup needs to be emitted for given global value \p GV,
+  /// false otherwise.
+  bool shouldEmitFixup(const GlobalValue *GV) const;
+
+  /// \returns True if GOT relocation needs to be emitted for given global value
+  /// \p GV, false otherwise.
+  bool shouldEmitGOTReloc(const GlobalValue *GV) const;
+
+  /// \returns True if PC-relative relocation needs to be emitted for given
+  /// global value \p GV, false otherwise.
+  bool shouldEmitPCReloc(const GlobalValue *GV) const;
+
+public:
+  SITargetLowering(const TargetMachine &tm, const SISubtarget &STI);
+
+  const SISubtarget *getSubtarget() const;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+                          unsigned IntrinsicID) const override;
+
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+                          EVT /*VT*/) const override;
+
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *IsFast) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                          unsigned SrcAlign, bool IsMemset,
+                          bool ZeroMemset,
+                          bool MemcpyStrSrc,
+                          MachineFunction &MF) const override;
+
+  bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+  bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                        Type *Ty) const override;
+
+  bool isTypeDesirableForOp(unsigned Op, EVT VT) const override;
+
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+
+  unsigned getRegisterByName(const char* RegName, EVT VT,
+                             SelectionDAG &DAG) const override;
+
+  MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *BB) const override;
+  bool enableAggressiveFMAFusion(EVT VT) const override;
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+  void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                     SDNode *Node) const override;
+
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                               unsigned Reg, EVT VT) const override;
+  void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+
+  MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
+                                SDValue Ptr) const;
+  MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr,
+                           uint32_t RsrcDword1, uint64_t RsrcDword2And3) const;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+  ConstraintType getConstraintType(StringRef Constraint) const override;
+  SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
+                   SDValue V) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
new file mode 100644
index 000000000000..91e4bf755c53
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -0,0 +1,329 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass inserts branches on the 0 exec mask over divergent branches
+/// branches when it's expected that jumping over the untaken control flow will
+/// be cheaper than having every workitem no-op through it.
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+namespace {
+
+static cl::opt<unsigned> SkipThresholdFlag(
+  "amdgpu-skip-threshold",
+  cl::desc("Number of instructions before jumping over divergent control flow"),
+  cl::init(12), cl::Hidden);
+
+class SIInsertSkips : public MachineFunctionPass {
+private:
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+  unsigned SkipThreshold;
+
+  bool shouldSkip(const MachineBasicBlock &From,
+                  const MachineBasicBlock &To) const;
+
+  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+  void kill(MachineInstr &MI);
+
+  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+public:
+  static char ID;
+
+  SIInsertSkips() :
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI insert s_cbranch_execz instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+                "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+  switch (Opc) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::BUNDLE:
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::GC_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+                               const MachineBasicBlock &To) const {
+  if (From.succ_empty())
+    return false;
+
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         NumInstr < SkipThreshold && I != E; ++I) {
+      if (opcodeEmitsNoInsts(I->getOpcode()))
+        continue;
+
+      // FIXME: Since this is required for correctness, this should be inserted
+      // during SILowerControlFlow.
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (I->isInlineAsm()) {
+        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+        const char *AsmStr = I->getOperand(0).getSymbolName();
+
+        // inlineasm length estimate is number of bytes assuming the longest
+        // instruction.
+        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+      } else {
+        ++NumInstr;
+      }
+
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction *MF = MBB.getParent();
+
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+      !shouldSkip(MBB, MBB.getParent()->back()))
+    return false;
+
+  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  // If the exec mask is non-zero, skip the next two instructions
+  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&NextBB);
+
+  MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+  // Exec mask is zero: Export to NULL target...
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
+    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addImm(1)  // vm
+    .addImm(0)  // compr
+    .addImm(0); // en
+
+  // ... and terminate wavefront.
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+  return true;
+}
+
+void SIInsertSkips::kill(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(CallConv == CallingConv::AMDGPU_PS ||
+         CallConv == CallingConv::AMDGPU_GS);
+#endif
+  // Clear this thread from the exec mask if the operand is negative.
+  if (Op.isImm()) {
+    // Constant operand: Set exec mask to 0 or do nothing
+    if (Op.getImm() & 0x80000000) {
+      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+        .addImm(0);
+    }
+  } else {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
+      .addImm(0)
+      .addOperand(Op);
+  }
+}
+
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+  MachineFunction *MF = MBB.getParent();
+
+  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, SkipBB);
+  MBB.addSuccessor(SkipBB);
+
+  return SkipBB;
+}
+
+// Returns true if a branch over the block was inserted.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+                                   MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
+
+  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+    return false;
+
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+    .addMBB(DestBB);
+
+  return true;
+}
+
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  SkipThreshold = SkipThresholdFlag;
+
+  bool HaveKill = false;
+  bool MadeChange = false;
+
+  // Track depth of exec mask, divergent branches.
+  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+  MachineFunction::iterator NextBB;
+
+  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
+    MachineBasicBlock &MBB = *BI;
+
+    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+      // Reached convergence point for last divergent branch.
+      ExecBranchStack.pop_back();
+    }
+
+    if (HaveKill && ExecBranchStack.empty()) {
+      HaveKill = false;
+
+      // TODO: Insert skip if exec is 0?
+    }
+
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_MASK_BRANCH: {
+        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+        MadeChange |= skipMaskBranch(MI, MBB);
+        break;
+      }
+      case AMDGPU::S_BRANCH: {
+        // Optimize out branches to the next block.
+        // FIXME: Shouldn't this be handled by BranchFolding?
+        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+          MI.eraseFromParent();
+        break;
+      }
+      case AMDGPU::SI_KILL_TERMINATOR: {
+        MadeChange = true;
+        kill(MI);
+
+        if (ExecBranchStack.empty()) {
+          if (skipIfDead(MI, *NextBB)) {
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+        } else {
+          HaveKill = true;
+        }
+
+        MI.eraseFromParent();
+        break;
+      }
+      case AMDGPU::SI_RETURN: {
+        // FIXME: Should move somewhere else
+        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+        // because external bytecode will be appended at the end.
+        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+          // SI_RETURN is not the last instruction. Add an empty block at
+          // the end and jump there.
+          if (!EmptyMBBAtEnd) {
+            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+            MF.insert(MF.end(), EmptyMBBAtEnd);
+          }
+
+          MBB.addSuccessor(EmptyMBBAtEnd);
+          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+            .addMBB(EmptyMBBAtEnd);
+          I->eraseFromParent();
+        }
+      }
+      default:
+        break;
+      }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
new file mode 100644
index 000000000000..202a1e9ed8ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -0,0 +1,679 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waits"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef enum {
+  OTHER,
+  SMEM,
+  VMEM
+} InstType;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  const SISubtarget *ST;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  IsaVersion IV;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Hardware limits
+  Counters HardwareLimits;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values that we must wait on before the next counter
+  /// increase.
+  Counters DelayedWaitOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Type of the last opcode.
+  InstType LastOpcodeType;
+
+  bool LastInstWritesM0;
+
+  /// Whether or not we have flat operations outstanding.
+  bool IsFlatOutstanding;
+
+  /// \brief Whether the machine function returns void
+  bool ReturnsVoid;
+
+  /// Whether the VCCZ bit is possibly corrupt
+  bool VCCZCorrupt;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(const TargetRegisterClass *RC,
+                             const MachineOperand &Reg) const;
+
+  /// \brief Handle instructions async components
+  void pushInstruction(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I,
+                       const Counters& Increment);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Handle existing wait instructions (from intrinsics)
+  void handleExistingWait(MachineBasicBlock::iterator I);
+
+  /// \brief Do we need def2def checks?
+  bool unorderedDefines(MachineInstr &MI);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+  /// Return true if there are LGKM instrucitons that haven't been waited on
+  /// yet.
+  bool hasOutstandingLGKM() const;
+
+public:
+  static char ID;
+
+  SIInsertWaits() :
+    MachineFunctionPass(ID),
+    ST(nullptr),
+    TII(nullptr),
+    TRI(nullptr),
+    ExpInstrTypesSeen(0),
+    VCCZCorrupt(false) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI insert wait instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
+                      "SI Insert Waits", false, false)
+INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
+                    "SI Insert Waits", false, false)
+
+char SIInsertWaits::ID = 0;
+
+char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
+
+FunctionPass *llvm::createSIInsertWaitsPass() {
+  return new SIInsertWaits();
+}
+
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+static bool readsVCCZ(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+         !MI.getOperand(1).isUndef();
+}
+
+bool SIInsertWaits::hasOutstandingLGKM() const {
+  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
+}
+
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  Counters Result = { { 0, 0, 0 } };
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
+
+  // LGKM may uses larger values
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    if (TII->isSMRD(MI)) {
+
+      if (MI.getNumOperands() != 0) {
+        assert(MI.getOperand(0).isReg() &&
+               "First LGKM operand must be a register!");
+
+        // XXX - What if this is a write into a super register?
+        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
+        unsigned Size = RC->getSize();
+        Result.Named.LGKM = Size > 4 ? 2 : 1;
+      } else {
+        // s_dcache_inv etc. do not have a a destination register. Assume we
+        // want a wait on these.
+        // XXX - What is the right value?
+        Result.Named.LGKM = 1;
+      }
+    } else {
+      // DS
+      Result.Named.LGKM = 1;
+    }
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+  // Constants are always irrelevant
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+    return false;
+
+  // Defines are always relevant
+  if (Op.isDef())
+    return true;
+
+  // For exports all registers are relevant.
+  // TODO: Skip undef/disabled registers.
+  MachineInstr &MI = *Op.getParent();
+  if (TII->isEXP(MI))
+    return true;
+
+  // For stores the stored value is also relevant
+  if (!MI.getDesc().mayStore())
+    return false;
+
+  // Check if this operand is the value being stored.
+  // Special case for DS/FLAT instructions, since the address
+  // operand comes before the value operand and it may have
+  // multiple data operands.
+
+  if (TII->isDS(MI)) {
+    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+    if (Data0 && Op.isIdenticalTo(*Data0))
+      return true;
+
+    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+    return Data1 && Op.isIdenticalTo(*Data1);
+  }
+
+  if (TII->isFLAT(MI)) {
+    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    if (Data && Op.isIdenticalTo(*Data))
+      return true;
+  }
+
+  // NOTE: This assumes that the value operand is before the
+  // address operand, and that there is only one value operand.
+  for (MachineInstr::mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+
+    if (I->isReg() && I->isUse())
+      return Op.isIdenticalTo(*I);
+  }
+
+  return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
+                                          const MachineOperand &Reg) const {
+  unsigned Size = RC->getSize();
+  assert(Size >= 4);
+
+  RegInterval Result;
+  Result.first = TRI->getEncodingValue(Reg.getReg());
+  Result.second = Result.first + Size / 4;
+
+  return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const Counters &Increment) {
+
+  // Get the hardware counter increments and sum them up
+  Counters Limit = ZeroCounts;
+  unsigned Sum = 0;
+
+  if (TII->mayAccessFlatAddressSpace(*I))
+    IsFlatOutstanding = true;
+
+  for (unsigned i = 0; i < 3; ++i) {
+    LastIssued.Array[i] += Increment.Array[i];
+    if (Increment.Array[i])
+      Limit.Array[i] = LastIssued.Array[i];
+    Sum += Increment.Array[i];
+  }
+
+  // If we don't increase anything then that's it
+  if (Sum == 0) {
+    LastOpcodeType = OTHER;
+    return;
+  }
+
+  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+    // or SMEM clause, respectively.
+    //
+    // The temporary workaround is to break the clauses with S_NOP.
+    //
+    // The proper solution would be to allocate registers such that all source
+    // and destination registers don't overlap, e.g. this is illegal:
+    //   r0 = load r2
+    //   r2 = load r0
+    if (LastOpcodeType == VMEM && Increment.Named.VM) {
+      // Insert a NOP to break the clause.
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+      LastInstWritesM0 = false;
+    }
+
+    if (TII->isSMRD(*I))
+      LastOpcodeType = SMEM;
+    else if (Increment.Named.VM)
+      LastOpcodeType = VMEM;
+  }
+
+  // Remember which export instructions we have seen
+  if (Increment.Named.EXP) {
+    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
+  }
+
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = I->getOperand(i);
+    if (!isOpRelevant(Op))
+      continue;
+
+    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
+    RegInterval Interval = getRegInterval(RC, Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      // Remember which registers we define
+      if (Op.isDef())
+        DefinedRegs[j] = Limit;
+
+      // and which one we are using
+      if (Op.isUse())
+        UsedRegs[j] = Limit;
+    }
+  }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  // A function not returning void needs to wait, because other bytecode will
+  // be appended after it and we don't know what it will be.
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered except when there are flat instructions, which
+  // can return out of order.
+  Ordered[0] = !IsFlatOutstanding;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes
+  Ordered[1] = ExpInstrTypesSeen == 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction
+  Counters Counts = HardwareLimits;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+
+    if (Ordered[i]) {
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // Adjust the value to the real hardware possibilities.
+      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember on what we have waited on.
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+    .addImm(encodeWaitcnt(IV,
+                          Counts.Named.VM,
+                          Counts.Named.EXP,
+                          Counts.Named.LGKM));
+
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
+  IsFlatOutstanding = false;
+  return true;
+}
+
+/// \brief helper function for handleOperands
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned i = 0; i < 3; ++i)
+    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+/// \brief check whether any of the counters is non-zero
+static bool countersNonZero(const Counters &Counter) {
+  for (unsigned i = 0; i < 3; ++i)
+    if (Counter.Array[i])
+      return true;
+  return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+  assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+  unsigned Imm = I->getOperand(0).getImm();
+  Counters Counts, WaitOn;
+
+  Counts.Named.VM = decodeVmcnt(IV, Imm);
+  Counts.Named.EXP = decodeExpcnt(IV, Imm);
+  Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
+
+  for (unsigned i = 0; i < 3; ++i) {
+    if (Counts.Array[i] <= LastIssued.Array[i])
+      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+    else
+      WaitOn.Array[i] = 0;
+  }
+
+  increaseCounters(DelayedWaitOn, WaitOn);
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Result = ZeroCounts;
+
+  // For each register affected by this instruction increase the result
+  // sequence.
+  //
+  // TODO: We could probably just look at explicit operands if we removed VCC /
+  // EXEC from SMRD dest reg classes.
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+      continue;
+
+    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
+    RegInterval Interval = getRegInterval(RC, Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      if (Op.isDef()) {
+        increaseCounters(Result, UsedRegs[j]);
+        increaseCounters(Result, DefinedRegs[j]);
+      }
+
+      if (Op.isUse())
+        increaseCounters(Result, DefinedRegs[j]);
+    }
+  }
+
+  return Result;
+}
+
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+    return;
+
+  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+  if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+    LastInstWritesM0 = false;
+    return;
+  }
+
+  // Set whether this instruction sets M0
+  LastInstWritesM0 = false;
+
+  unsigned NumOperands = I->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; i++) {
+    const MachineOperand &Op = I->getOperand(i);
+
+    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
+      LastInstWritesM0 = true;
+  }
+}
+
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+  bool Changes = false;
+
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  IV = getIsaVersion(ST->getFeatureBits());
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  HardwareLimits.Named.VM = getVmcntBitMask(IV);
+  HardwareLimits.Named.EXP = getExpcntBitMask(IV);
+  HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
+
+  WaitedOn = ZeroCounts;
+  DelayedWaitOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
+  IsFlatOutstanding = false;
+  ReturnsVoid = MFI->returnsVoid();
+
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  SmallVector<MachineInstr *, 4> RemoveMI;
+  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+  bool HaveScalarStores = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      if (!HaveScalarStores && TII->isScalarStore(*I))
+        HaveScalarStores = true;
+
+      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
+        // vccz bit, so when we detect that an instruction may read from a
+        // corrupt vccz bit, we need to:
+        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
+        //    complete.
+        // 2. Restore the correct value of vccz by writing the current value
+        //    of vcc back to vcc.
+
+        if (TII->isSMRD(I->getOpcode())) {
+          VCCZCorrupt = true;
+        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
+          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
+          // Whenever we store a value in vcc, the correct value of vccz is
+          // restored.
+          VCCZCorrupt = false;
+        }
+
+        // Check if we need to apply the bug work-around
+        if (VCCZCorrupt && readsVCCZ(*I)) {
+          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
+
+          // Wait on everything, not just LGKM.  vccz reads usually come from
+          // terminators, and we always wait on everything at the end of the
+          // block, so if we only wait on LGKM here, we might end up with
+          // another s_waitcnt inserted right after this if there are non-LGKM
+          // instructions still outstanding.
+          insertWait(MBB, I, LastIssued);
+
+          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
+          // bit is updated, so we can restore the bit by reading the value of
+          // vcc and then writing it back to the register.
+          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+                  AMDGPU::VCC)
+            .addReg(AMDGPU::VCC);
+        }
+      }
+
+      // Record pre-existing, explicitly requested waits
+      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+        handleExistingWait(*I);
+        RemoveMI.push_back(&*I);
+        continue;
+      }
+
+      Counters Required;
+
+      // Wait for everything before a barrier.
+      //
+      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+      // but we also want to wait for any other outstanding transfers before
+      // signalling other hardware blocks
+      if ((I->getOpcode() == AMDGPU::S_BARRIER &&
+               ST->needWaitcntBeforeBarrier()) ||
+           I->getOpcode() == AMDGPU::S_SENDMSG)
+        Required = LastIssued;
+      else
+        Required = handleOperands(*I);
+
+      Counters Increment = getHwCounts(*I);
+
+      if (countersNonZero(Required) || countersNonZero(Increment))
+        increaseCounters(Required, DelayedWaitOn);
+
+      Changes |= insertWait(MBB, I, Required);
+
+      pushInstruction(MBB, I, Increment);
+      handleSendMsg(MBB, I);
+
+      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+          I->getOpcode() == AMDGPU::SI_RETURN)
+        EndPgmBlocks.push_back(&MBB);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  if (HaveScalarStores) {
+    // If scalar writes are used, the cache must be flushed or else the next
+    // wave to reuse the same scratch memory can be clobbered.
+    //
+    // Insert s_dcache_wb at wave termination points if there were any scalar
+    // stores, and only if the cache hasn't already been flushed. This could be
+    // improved by looking across blocks for flushes in postdominating blocks
+    // from the stores but an explicitly requested flush is probably very rare.
+    for (MachineBasicBlock *MBB : EndPgmBlocks) {
+      bool SeenDCacheWB = false;
+
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I) {
+
+        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+          SeenDCacheWB = true;
+        else if (TII->isScalarStore(*I))
+          SeenDCacheWB = false;
+
+        // FIXME: It would be better to insert this before a waitcnt if any.
+        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+             I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+          Changes = true;
+          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+        }
+      }
+    }
+  }
+
+  for (MachineInstr *I : RemoveMI)
+    I->eraseFromParent();
+
+  return Changes;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
new file mode 100644
index 000000000000..5523ec142ba7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -0,0 +1,285 @@
+//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSI <dag outs, dag ins, string asm = "",
+              list<dag> pattern = []> :
+  AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+
+  // Low bits - basic encoding information.
+  field bit SALU = 0;
+  field bit VALU = 0;
+
+  // SALU instruction formats.
+  field bit SOP1 = 0;
+  field bit SOP2 = 0;
+  field bit SOPC = 0;
+  field bit SOPK = 0;
+  field bit SOPP = 0;
+
+  // VALU instruction formats.
+  field bit VOP1 = 0;
+  field bit VOP2 = 0;
+  field bit VOPC = 0;
+  field bit VOP3 = 0;
+  field bit VINTRP = 0;
+  field bit SDWA = 0;
+  field bit DPP = 0;
+
+  // Memory instruction formats.
+  field bit MUBUF = 0;
+  field bit MTBUF = 0;
+  field bit SMRD = 0;
+  field bit MIMG = 0;
+  field bit EXP = 0;
+  field bit FLAT = 0;
+  field bit DS = 0;
+
+   // Pseudo instruction formats.
+  field bit VGPRSpill = 0;
+  field bit SGPRSpill = 0;
+
+  // High bits - other information.
+  field bit VM_CNT = 0;
+  field bit EXP_CNT = 0;
+  field bit LGKM_CNT = 0;
+
+  // Whether WQM _must_ be enabled for this instruction.
+  field bit WQM = 0;
+
+  // Whether WQM _must_ be disabled for this instruction.
+  field bit DisableWQM = 0;
+
+  field bit Gather4 = 0;
+
+  // Most sopk treat the immediate as a signed 16-bit, however some
+  // use it as unsigned.
+  field bit SOPKZext = 0;
+
+  // This is an s_store_dword* instruction that requires a cache flush
+  // on wave termination. It is necessary to distinguish from mayStore
+  // SMEM instructions like the cache flush ones.
+  field bit ScalarStore = 0;
+
+  // Whether the operands can be ignored when computing the
+  // instruction size.
+  field bit FixedSize = 0;
+
+  // This bit tells the assembler to use the 32-bit encoding in case it
+  // is unable to infer the encoding from the operands.
+  field bit VOPAsmPrefer32Bit = 0;
+
+  // These need to be kept in sync with the enum in SIInstrFlags.
+  let TSFlags{0} = SALU;
+  let TSFlags{1} = VALU;
+
+  let TSFlags{2} = SOP1;
+  let TSFlags{3} = SOP2;
+  let TSFlags{4} = SOPC;
+  let TSFlags{5} = SOPK;
+  let TSFlags{6} = SOPP;
+
+  let TSFlags{7} = VOP1;
+  let TSFlags{8} = VOP2;
+  let TSFlags{9} = VOPC;
+  let TSFlags{10} = VOP3;
+
+  let TSFlags{13} = VINTRP;
+  let TSFlags{14} = SDWA;
+  let TSFlags{15} = DPP;
+
+  let TSFlags{16} = MUBUF;
+  let TSFlags{17} = MTBUF;
+  let TSFlags{18} = SMRD;
+  let TSFlags{19} = MIMG;
+  let TSFlags{20} = EXP;
+  let TSFlags{21} = FLAT;
+  let TSFlags{22} = DS;
+
+  let TSFlags{23} = VGPRSpill;
+  let TSFlags{24} = SGPRSpill;
+
+  let TSFlags{32} = VM_CNT;
+  let TSFlags{33} = EXP_CNT;
+  let TSFlags{34} = LGKM_CNT;
+
+  let TSFlags{35} = WQM;
+  let TSFlags{36} = DisableWQM;
+  let TSFlags{37} = Gather4;
+
+  let TSFlags{38} = SOPKZext;
+  let TSFlags{39} = ScalarStore;
+  let TSFlags{40} = FixedSize;
+  let TSFlags{41} = VOPAsmPrefer32Bit;
+
+  let SchedRW = [Write32Bit];
+
+  field bits<1> DisableSIDecoder = 0;
+  field bits<1> DisableVIDecoder = 0;
+  field bits<1> DisableDecoder = 0;
+
+  let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1);
+  let AsmVariantName = AMDGPUAsmVariants.Default;
+}
+
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+  : InstSI<outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+  : PseudoInstSI<outs, ins, pattern> {
+  let SALU = 1;
+}
+
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
+  : PseudoInstSI<outs, ins, pattern> {
+  let VALU = 1;
+  let Uses = [EXEC];
+}
+
+class CFPseudoInstSI<dag outs, dag ins, list<dag> pattern = [],
+  bit UseExec = 0, bit DefExec = 0> :
+  SPseudoInstSI<outs, ins, pattern> {
+
+  let Uses = !if(UseExec, [EXEC], []);
+  let Defs = !if(DefExec, [EXEC, SCC], [SCC]);
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class Enc32 {
+  field bits<32> Inst;
+  int Size = 4;
+}
+
+class Enc64 {
+  field bits<64> Inst;
+  int Size = 8;
+}
+
+class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
+
+class VINTRPe <bits<2> op> : Enc32 {
+  bits<8> vdst;
+  bits<8> vsrc;
+  bits<2> attrchan;
+  bits<6> attr;
+
+  let Inst{7-0} = vsrc;
+  let Inst{9-8} = attrchan;
+  let Inst{15-10} = attr;
+  let Inst{17-16} = op;
+  let Inst{25-18} = vdst;
+  let Inst{31-26} = 0x32; // encoding
+}
+
+class MIMGe <bits<7> op> : Enc64 {
+  bits<8> vdata;
+  bits<4> dmask;
+  bits<1> unorm;
+  bits<1> glc;
+  bits<1> da;
+  bits<1> r128;
+  bits<1> tfe;
+  bits<1> lwe;
+  bits<1> slc;
+  bits<8> vaddr;
+  bits<7> srsrc;
+  bits<7> ssamp;
+
+  let Inst{11-8} = dmask;
+  let Inst{12} = unorm;
+  let Inst{13} = glc;
+  let Inst{14} = da;
+  let Inst{15} = r128;
+  let Inst{16} = tfe;
+  let Inst{17} = lwe;
+  let Inst{24-18} = op;
+  let Inst{25} = slc;
+  let Inst{31-26} = 0x3c;
+  let Inst{39-32} = vaddr;
+  let Inst{47-40} = vdata;
+  let Inst{52-48} = srsrc{6-2};
+  let Inst{57-53} = ssamp{6-2};
+}
+
+class EXPe : Enc64 {
+  bits<4> en;
+  bits<6> tgt;
+  bits<1> compr;
+  bits<1> done;
+  bits<1> vm;
+  bits<8> vsrc0;
+  bits<8> vsrc1;
+  bits<8> vsrc2;
+  bits<8> vsrc3;
+
+  let Inst{3-0} = en;
+  let Inst{9-4} = tgt;
+  let Inst{10} = compr;
+  let Inst{11} = done;
+  let Inst{12} = vm;
+  let Inst{31-26} = 0x3e;
+  let Inst{39-32} = vsrc0;
+  let Inst{47-40} = vsrc1;
+  let Inst{55-48} = vsrc2;
+  let Inst{63-56} = vsrc3;
+}
+
+let Uses = [EXEC] in {
+
+class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+  let VINTRP = 1;
+  // VINTRP instructions read parameter values from LDS, but these parameter
+  // values are stored outside of the LDS memory that is allocated to the
+  // shader for general purpose use.
+  //
+  // While it may be possible for ds_read/ds_write instructions to access
+  // the parameter values in LDS, this would essentially be an out-of-bounds
+  // memory access which we consider to be undefined behavior.
+  //
+  // So even though these instructions read memory, this memory is outside the
+  // addressable memory space for the shader, and we consider these instructions
+  // to be readnone.
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> :
+  InstSI<outs, ins, asm, pattern> {
+  let EXP = 1;
+  let EXP_CNT = 1;
+  let mayLoad = 0; // Set to 1 if done bit is set.
+  let mayStore = 1;
+  let UseNamedOperandTable = 1;
+  let Uses = [EXEC];
+  let SchedRW = [WriteExport];
+}
+
+} // End Uses = [EXEC]
+
+class MIMG <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+  let MIMG = 1;
+  let Uses = [EXEC];
+
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0; // XXX ????
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
new file mode 100644
index 000000000000..26a8d22062a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -0,0 +1,3642 @@
+//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// Must be at least 4 to be able to branch over minimum unconditional branch
+// code. This is only for making it possible to write reasonably small tests for
+// long branches.
+static cl::opt<unsigned>
+BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
+                 cl::desc("Restrict range of branch instructions (DEBUG)"));
+
+SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
+  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
+static unsigned getNumOperandsNoGlue(SDNode *Node) {
+  unsigned N = Node->getNumOperands();
+  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+    --N;
+  return N;
+}
+
+static SDValue findChainOperand(SDNode *Load) {
+  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
+  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
+  return LastOp;
+}
+
+/// \brief Returns true if both nodes have the same value for the given
+///        operand \p Op, or if both nodes do not have this operand.
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
+  unsigned Opc0 = N0->getMachineOpcode();
+  unsigned Opc1 = N1->getMachineOpcode();
+
+  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
+  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
+
+  if (Op0Idx == -1 && Op1Idx == -1)
+    return true;
+
+
+  if ((Op0Idx == -1 && Op1Idx != -1) ||
+      (Op1Idx == -1 && Op0Idx != -1))
+    return false;
+
+  // getNamedOperandIdx returns the index for the MachineInstr's operands,
+  // which includes the result as the first operand. We are indexing into the
+  // MachineSDNode's operands, so we need to skip the result operand to get
+  // the real index.
+  --Op0Idx;
+  --Op1Idx;
+
+  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
+}
+
+bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+                                                    AliasAnalysis *AA) const {
+  // TODO: The generic check fails for VALU instructions that should be
+  // rematerializable due to implicit reads of exec. We really want all of the
+  // generic logic for this except for this.
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
+                                          int64_t &Offset0,
+                                          int64_t &Offset1) const {
+  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
+    return false;
+
+  unsigned Opc0 = Load0->getMachineOpcode();
+  unsigned Opc1 = Load1->getMachineOpcode();
+
+  // Make sure both are actually loads.
+  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
+    return false;
+
+  if (isDS(Opc0) && isDS(Opc1)) {
+
+    // FIXME: Handle this case:
+    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
+      return false;
+
+    // Check base reg.
+    if (Load0->getOperand(1) != Load1->getOperand(1))
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    // Skip read2 / write2 variants for simplicity.
+    // TODO: We should report true if the used offsets are adjacent (excluded
+    // st64 versions).
+    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
+        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+    return true;
+  }
+
+  if (isSMRD(Opc0) && isSMRD(Opc1)) {
+    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+    // Check base reg.
+    if (Load0->getOperand(0) != Load1->getOperand(0))
+      return false;
+
+    const ConstantSDNode *Load0Offset =
+        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
+    const ConstantSDNode *Load1Offset =
+        dyn_cast<ConstantSDNode>(Load1->getOperand(1));
+
+    if (!Load0Offset || !Load1Offset)
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    Offset0 = Load0Offset->getZExtValue();
+    Offset1 = Load1Offset->getZExtValue();
+    return true;
+  }
+
+  // MUBUF and MTBUF can access the same addresses.
+  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+
+    // MUBUF and MTBUF have vaddr at different indices.
+    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
+        findChainOperand(Load0) != findChainOperand(Load1) ||
+        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
+        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
+      return false;
+
+    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+
+    if (OffIdx0 == -1 || OffIdx1 == -1)
+      return false;
+
+    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
+    // inlcude the output in the operand list, but SDNodes don't, we need to
+    // subtract the index by one.
+    --OffIdx0;
+    --OffIdx1;
+
+    SDValue Off0 = Load0->getOperand(OffIdx0);
+    SDValue Off1 = Load1->getOperand(OffIdx1);
+
+    // The offset might be a FrameIndexSDNode.
+    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+static bool isStride64(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::DS_READ2ST64_B32:
+  case AMDGPU::DS_READ2ST64_B64:
+  case AMDGPU::DS_WRITE2ST64_B32:
+  case AMDGPU::DS_WRITE2ST64_B64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                                        int64_t &Offset,
+                                        const TargetRegisterInfo *TRI) const {
+  unsigned Opc = LdSt.getOpcode();
+
+  if (isDS(LdSt)) {
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
+    if (OffsetImm) {
+      // Normal, single offset LDS instruction.
+      const MachineOperand *AddrReg =
+          getNamedOperand(LdSt, AMDGPU::OpName::addr);
+
+      BaseReg = AddrReg->getReg();
+      Offset = OffsetImm->getImm();
+      return true;
+    }
+
+    // The 2 offset instructions use offset0 and offset1 instead. We can treat
+    // these as a load with a single offset if the 2 offsets are consecutive. We
+    // will use this for some partially aligned loads.
+    const MachineOperand *Offset0Imm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+    const MachineOperand *Offset1Imm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset1);
+
+    uint8_t Offset0 = Offset0Imm->getImm();
+    uint8_t Offset1 = Offset1Imm->getImm();
+
+    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
+      // Each of these offsets is in element sized units, so we need to convert
+      // to bytes of the individual reads.
+
+      unsigned EltSize;
+      if (LdSt.mayLoad())
+        EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
+      else {
+        assert(LdSt.mayStore());
+        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+        EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
+      }
+
+      if (isStride64(Opc))
+        EltSize *= 64;
+
+      const MachineOperand *AddrReg =
+          getNamedOperand(LdSt, AMDGPU::OpName::addr);
+      BaseReg = AddrReg->getReg();
+      Offset = EltSize * Offset0;
+      return true;
+    }
+
+    return false;
+  }
+
+  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
+    const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
+    if (SOffset && SOffset->isReg())
+      return false;
+
+    const MachineOperand *AddrReg =
+        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    if (!AddrReg)
+      return false;
+
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
+    BaseReg = AddrReg->getReg();
+    Offset = OffsetImm->getImm();
+
+    if (SOffset) // soffset can be an inline immediate.
+      Offset += SOffset->getImm();
+
+    return true;
+  }
+
+  if (isSMRD(LdSt)) {
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
+    if (!OffsetImm)
+      return false;
+
+    const MachineOperand *SBaseReg =
+        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+    BaseReg = SBaseReg->getReg();
+    Offset = OffsetImm->getImm();
+    return true;
+  }
+
+  if (isFLAT(LdSt)) {
+    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+    BaseReg = AddrReg->getReg();
+    Offset = 0;
+    return true;
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                      MachineInstr &SecondLdSt,
+                                      unsigned NumLoads) const {
+  const MachineOperand *FirstDst = nullptr;
+  const MachineOperand *SecondDst = nullptr;
+
+  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+  }
+
+  if (!FirstDst || !SecondDst)
+    return false;
+
+  // Try to limit clustering based on the total number of bytes loaded
+  // rather than the number of instructions.  This is done to help reduce
+  // register pressure.  The method used is somewhat inexact, though,
+  // because it assumes that all loads in the cluster will load the
+  // same number of bytes as FirstLdSt.
+
+  // The unit of this value is bytes.
+  // FIXME: This needs finer tuning.
+  unsigned LoadClusterThreshold = 16;
+
+  const MachineRegisterInfo &MRI =
+      FirstLdSt.getParent()->getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
+
+  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
+}
+
+void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, unsigned DestReg,
+                              unsigned SrcReg, bool KillSrc) const {
+  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
+
+  if (RC == &AMDGPU::VGPR_32RegClass) {
+    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
+      RC == &AMDGPU::SReg_32RegClass) {
+    if (SrcReg == AMDGPU::SCC) {
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
+          .addImm(-1)
+          .addImm(0);
+      return;
+    }
+
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (RC == &AMDGPU::SReg_64RegClass) {
+    if (DestReg == AMDGPU::VCC) {
+      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // FIXME: Hack until VReg_1 removed.
+        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
+        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
+          .addImm(0)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      }
+
+      return;
+    }
+
+    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (DestReg == AMDGPU::SCC) {
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addImm(0);
+    return;
+  }
+
+  unsigned EltSize = 4;
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (RI.isSGPRClass(RC)) {
+    if (RC->getSize() > 4) {
+      Opcode =  AMDGPU::S_MOV_B64;
+      EltSize = 8;
+    } else {
+      Opcode = AMDGPU::S_MOV_B32;
+      EltSize = 4;
+    }
+  }
+
+  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
+  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
+
+  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+    unsigned SubIdx;
+    if (Forward)
+      SubIdx = SubIndices[Idx];
+    else
+      SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+
+    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+      get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
+
+    if (Idx == SubIndices.size() - 1)
+      Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+
+    if (Idx == 0)
+      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+
+    Builder.addReg(SrcReg, RegState::Implicit);
+  }
+}
+
+int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
+  int NewOpc;
+
+  // Try to map original to commuted opcode
+  NewOpc = AMDGPU::getCommuteRev(Opcode);
+  if (NewOpc != -1)
+    // Check if the commuted (REV) opcode exists on the target.
+    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
+
+  // Try to map commuted to original opcode
+  NewOpc = AMDGPU::getCommuteOrig(Opcode);
+  if (NewOpc != -1)
+    // Check if the original (non-REV) opcode exists on the target.
+    return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
+
+  return Opcode;
+}
+
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+
+  if (DstRC->getSize() == 4) {
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+    return AMDGPU::S_MOV_B64;
+  } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+    return  AMDGPU::V_MOV_B64_PSEUDO;
+  }
+  return AMDGPU::COPY;
+}
+
+static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_S32_SAVE;
+  case 8:
+    return AMDGPU::SI_SPILL_S64_SAVE;
+  case 16:
+    return AMDGPU::SI_SPILL_S128_SAVE;
+  case 32:
+    return AMDGPU::SI_SPILL_S256_SAVE;
+  case 64:
+    return AMDGPU::SI_SPILL_S512_SAVE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_V32_SAVE;
+  case 8:
+    return AMDGPU::SI_SPILL_V64_SAVE;
+  case 12:
+    return AMDGPU::SI_SPILL_V96_SAVE;
+  case 16:
+    return AMDGPU::SI_SPILL_V128_SAVE;
+  case 32:
+    return AMDGPU::SI_SPILL_V256_SAVE;
+  case 64:
+    return AMDGPU::SI_SPILL_V512_SAVE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned SrcReg, bool isKill,
+                                      int FrameIndex,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
+  MachinePointerInfo PtrInfo
+    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+  MachineMemOperand *MMO
+    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+                               Size, Align);
+
+  if (RI.isSGPRClass(RC)) {
+    MFI->setHasSpilledSGPRs();
+
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use pseudo instruction for spilling SGPRs.
+    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+
+    // The SGPR spill/restore instructions only work on number sgprs, so we need
+    // to make sure we are using the correct register class.
+    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+      MachineRegisterInfo &MRI = MF->getRegInfo();
+      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+    }
+
+    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+      .addReg(SrcReg, getKillRegState(isKill)) // data
+      .addFrameIndex(FrameIndex)               // addr
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+    // Add the scratch resource registers as implicit uses because we may end up
+    // needing them, and need to ensure that the reserved registers are
+    // correctly handled.
+
+    if (ST.hasScalarStores()) {
+      // m0 is used for offset to scalar stores if used to spill.
+      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+    }
+
+    return;
+  }
+
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
+                  " spill register");
+    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
+      .addReg(SrcReg);
+
+    return;
+  }
+
+  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+  MFI->setHasSpilledVGPRs();
+  BuildMI(MBB, MI, DL, get(Opcode))
+    .addReg(SrcReg, getKillRegState(isKill)) // data
+    .addFrameIndex(FrameIndex)               // addr
+    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
+    .addImm(0)                               // offset
+    .addMemOperand(MMO);
+}
+
+static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_S32_RESTORE;
+  case 8:
+    return AMDGPU::SI_SPILL_S64_RESTORE;
+  case 16:
+    return AMDGPU::SI_SPILL_S128_RESTORE;
+  case 32:
+    return AMDGPU::SI_SPILL_S256_RESTORE;
+  case 64:
+    return AMDGPU::SI_SPILL_S512_RESTORE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_V32_RESTORE;
+  case 8:
+    return AMDGPU::SI_SPILL_V64_RESTORE;
+  case 12:
+    return AMDGPU::SI_SPILL_V96_RESTORE;
+  case 16:
+    return AMDGPU::SI_SPILL_V128_RESTORE;
+  case 32:
+    return AMDGPU::SI_SPILL_V256_RESTORE;
+  case 64:
+    return AMDGPU::SI_SPILL_V512_RESTORE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned DestReg, int FrameIndex,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
+  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+
+  MachinePointerInfo PtrInfo
+    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+    PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+  if (RI.isSGPRClass(RC)) {
+    // FIXME: Maybe this should not include a memoperand because it will be
+    // lowered to non-memory instructions.
+    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
+    if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+      MachineRegisterInfo &MRI = MF->getRegInfo();
+      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+    }
+
+    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+      .addFrameIndex(FrameIndex) // addr
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+
+    if (ST.hasScalarStores()) {
+      // m0 is used for offset to scalar stores if used to spill.
+      Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+    }
+
+    return;
+  }
+
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
+                  " restore register");
+    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
+
+    return;
+  }
+
+  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+    .addFrameIndex(FrameIndex)              // vaddr
+    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+    .addImm(0)                              // offset
+    .addMemOperand(MMO);
+}
+
+/// \param @Offset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(
+    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
+    unsigned FrameOffset, unsigned Size) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
+  unsigned WavefrontSize = ST.getWavefrontSize();
+
+  unsigned TIDReg = MFI->getTIDReg();
+  if (!MFI->hasCalculatedTID()) {
+    MachineBasicBlock &Entry = MBB.getParent()->front();
+    MachineBasicBlock::iterator Insert = Entry.front();
+    DebugLoc DL = Insert->getDebugLoc();
+
+    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
+                                   *MF);
+    if (TIDReg == AMDGPU::NoRegister)
+      return TIDReg;
+
+    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
+        WorkGroupSize > WavefrontSize) {
+
+      unsigned TIDIGXReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+      unsigned TIDIGYReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+      unsigned TIDIGZReg
+        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+      unsigned InputPtrReg =
+          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
+        if (!Entry.isLiveIn(Reg))
+          Entry.addLiveIn(Reg);
+      }
+
+      RS->enterBasicBlock(Entry);
+      // FIXME: Can we scavenge an SReg_64 and access the subregs?
+      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Y);
+
+      // NGROUPS.X * NGROUPS.Y
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
+              .addReg(STmp1)
+              .addReg(STmp0);
+      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
+              .addReg(STmp1)
+              .addReg(TIDIGXReg);
+      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
+              .addReg(STmp0)
+              .addReg(TIDIGYReg)
+              .addReg(TIDReg);
+      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
+              .addReg(TIDReg)
+              .addReg(TIDIGZReg);
+    } else {
+      // Get the wave id
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+              TIDReg)
+              .addImm(-1)
+              .addImm(0);
+
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
+              TIDReg)
+              .addImm(-1)
+              .addReg(TIDReg);
+    }
+
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+            TIDReg)
+            .addImm(2)
+            .addReg(TIDReg);
+    MFI->setTIDReg(TIDReg);
+  }
+
+  // Add FrameIndex to LDS offset
+  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
+  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+          .addImm(LDSOffset)
+          .addReg(TIDReg);
+
+  return TmpReg;
+}
+
+void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   int Count) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  while (Count > 0) {
+    int Arg;
+    if (Count >= 8)
+      Arg = 7;
+    else
+      Arg = Count - 1;
+    Count -= 8;
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
+            .addImm(Arg);
+  }
+}
+
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI) const {
+  insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: return 1; // FIXME: Do wait states equal cycles?
+
+  case AMDGPU::S_NOP:
+    return MI.getOperand(0).getImm() + 1;
+  }
+}
+
+bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  switch (MI.getOpcode()) {
+  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+  case AMDGPU::S_MOV_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_MOV_B64));
+    break;
+  }
+  case AMDGPU::S_XOR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_XOR_B64));
+    break;
+  }
+  case AMDGPU::S_ANDN2_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
+    break;
+  }
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    unsigned Dst = MI.getOperand(0).getReg();
+    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+
+    const MachineOperand &SrcOp = MI.getOperand(1);
+    // FIXME: Will this work for 64-bit floating point immediates?
+    assert(!SrcOp.isFPImm());
+    if (SrcOp.isImm()) {
+      APInt Imm(64, SrcOp.getImm());
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+        .addImm(Imm.getLoBits(32).getZExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+        .addImm(Imm.getHiBits(32).getZExtValue())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    } else {
+      assert(SrcOp.isReg());
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
+        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
+        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    }
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_MOVRELD_B32_V1:
+  case AMDGPU::V_MOVRELD_B32_V2:
+  case AMDGPU::V_MOVRELD_B32_V4:
+  case AMDGPU::V_MOVRELD_B32_V8:
+  case AMDGPU::V_MOVRELD_B32_V16: {
+    const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+    unsigned VecReg = MI.getOperand(0).getReg();
+    bool IsUndef = MI.getOperand(1).isUndef();
+    unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+    assert(VecReg == MI.getOperand(1).getReg());
+
+    MachineInstr *MovRel =
+        BuildMI(MBB, MI, DL, MovRelDesc)
+            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+            .addOperand(MI.getOperand(2))
+            .addReg(VecReg, RegState::ImplicitDefine)
+            .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+
+    const int ImpDefIdx =
+        MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+    const int ImpUseIdx = ImpDefIdx + 1;
+    MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
+
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned Reg = MI.getOperand(0).getReg();
+    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+
+    // Create a bundle so these instructions won't be re-ordered by the
+    // post-RA scheduler.
+    MIBundleBuilder Bundler(MBB, MI);
+    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+
+    // Add 32-bit offset from this instruction to the start of the
+    // constant data.
+    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
+                       .addReg(RegLo)
+                       .addOperand(MI.getOperand(1)));
+
+    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+                                  .addReg(RegHi);
+    if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
+      MIB.addImm(0);
+    else
+      MIB.addOperand(MI.getOperand(2));
+
+    Bundler.append(MIB);
+    llvm::finalizeBundle(MBB, Bundler.begin());
+
+    MI.eraseFromParent();
+    break;
+  }
+  }
+  return true;
+}
+
+bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
+                                      MachineOperand &Src0,
+                                      unsigned Src0OpName,
+                                      MachineOperand &Src1,
+                                      unsigned Src1OpName) const {
+  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
+  if (!Src0Mods)
+    return false;
+
+  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
+  assert(Src1Mods &&
+         "All commutable instructions have both src0 and src1 modifiers");
+
+  int Src0ModsVal = Src0Mods->getImm();
+  int Src1ModsVal = Src1Mods->getImm();
+
+  Src1Mods->setImm(Src0ModsVal);
+  Src0Mods->setImm(Src1ModsVal);
+  return true;
+}
+
+static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
+                                             MachineOperand &RegOp,
+                                             MachineOperand &NonRegOp) {
+  unsigned Reg = RegOp.getReg();
+  unsigned SubReg = RegOp.getSubReg();
+  bool IsKill = RegOp.isKill();
+  bool IsDead = RegOp.isDead();
+  bool IsUndef = RegOp.isUndef();
+  bool IsDebug = RegOp.isDebug();
+
+  if (NonRegOp.isImm())
+    RegOp.ChangeToImmediate(NonRegOp.getImm());
+  else if (NonRegOp.isFI())
+    RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
+  else
+    return nullptr;
+
+  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
+  NonRegOp.setSubReg(SubReg);
+
+  return &MI;
+}
+
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                                  unsigned Src0Idx,
+                                                  unsigned Src1Idx) const {
+  assert(!NewMI && "this should never be used");
+
+  unsigned Opc = MI.getOpcode();
+  int CommutedOpcode = commuteOpcode(Opc);
+  if (CommutedOpcode == -1)
+    return nullptr;
+
+  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
+           static_cast<int>(Src0Idx) &&
+         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
+           static_cast<int>(Src1Idx) &&
+         "inconsistency with findCommutedOpIndices");
+
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+  MachineInstr *CommutedMI = nullptr;
+  if (Src0.isReg() && Src1.isReg()) {
+    if (isOperandLegal(MI, Src1Idx, &Src0)) {
+      // Be sure to copy the source modifiers to the right place.
+      CommutedMI
+        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
+    }
+
+  } else if (Src0.isReg() && !Src1.isReg()) {
+    // src0 should always be able to support any operand type, so no need to
+    // check operand legality.
+    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
+  } else if (!Src0.isReg() && Src1.isReg()) {
+    if (isOperandLegal(MI, Src1Idx, &Src0))
+      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
+  } else {
+    // FIXME: Found two non registers to commute. This does happen.
+    return nullptr;
+  }
+
+
+  if (CommutedMI) {
+    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
+                        Src1, AMDGPU::OpName::src1_modifiers);
+
+    CommutedMI->setDesc(get(CommutedOpcode));
+  }
+
+  return CommutedMI;
+}
+
+// This needs to be implemented because the source modifiers may be inserted
+// between the true commutable operands, and the base
+// TargetInstrInfo::commuteInstruction uses it.
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
+                                        unsigned &SrcOpIdx1) const {
+  if (!MI.isCommutable())
+    return false;
+
+  unsigned Opc = MI.getOpcode();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  if (Src0Idx == -1)
+    return false;
+
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  if (Src1Idx == -1)
+    return false;
+
+  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
+}
+
+bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+                                        int64_t BrOffset) const {
+  // BranchRelaxation should never have to check s_setpc_b64 because its dest
+  // block is unanalyzable.
+  assert(BranchOp != AMDGPU::S_SETPC_B64);
+
+  // Convert to dwords.
+  BrOffset /= 4;
+
+  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
+  // from the next instruction.
+  BrOffset -= 1;
+
+  return isIntN(BranchOffsetBits, BrOffset);
+}
+
+MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
+  const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
+    // This would be a difficult analysis to perform, but can always be legal so
+    // there's no need to analyze it.
+    return nullptr;
+  }
+
+  return MI.getOperand(0).getMBB();
+}
+
+unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &DestBB,
+                                           const DebugLoc &DL,
+                                           int64_t BrOffset,
+                                           RegScavenger *RS) const {
+  assert(RS && "RegScavenger required for long branching");
+  assert(MBB.empty() &&
+         "new block should be inserted for expanding unconditional branch");
+  assert(MBB.pred_size() == 1);
+
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // FIXME: Virtual register workaround for RegScavenger not working with empty
+  // blocks.
+  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  auto I = MBB.end();
+
+  // We need to compute the offset relative to the instruction immediately after
+  // s_getpc_b64. Insert pc arithmetic code before last terminator.
+  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
+
+  // TODO: Handle > 32-bit block address.
+  if (BrOffset >= 0) {
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  } else {
+    // Backwards branch.
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
+      .addReg(PCReg, 0, AMDGPU::sub0)
+      .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
+    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
+      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
+      .addReg(PCReg, 0, AMDGPU::sub1)
+      .addImm(0);
+  }
+
+  // Insert the indirect branch after the other terminator.
+  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
+    .addReg(PCReg);
+
+  // FIXME: If spilling is necessary, this will fail because this scavenger has
+  // no emergency stack slots. It is non-trivial to spill in this situation,
+  // because the restore code needs to be specially placed after the
+  // jump. BranchRelaxation then needs to be made aware of the newly inserted
+  // block.
+  //
+  // If a spill is needed for the pc register pair, we need to insert a spill
+  // restore block right before the destination block, and insert a short branch
+  // into the old destination block's fallthrough predecessor.
+  // e.g.:
+  //
+  // s_cbranch_scc0 skip_long_branch:
+  //
+  // long_branch_bb:
+  //   spill s[8:9]
+  //   s_getpc_b64 s[8:9]
+  //   s_add_u32 s8, s8, restore_bb
+  //   s_addc_u32 s9, s9, 0
+  //   s_setpc_b64 s[8:9]
+  //
+  // skip_long_branch:
+  //   foo;
+  //
+  // .....
+  //
+  // dest_bb_fallthrough_predecessor:
+  // bar;
+  // s_branch dest_bb
+  //
+  // restore_bb:
+  //  restore s[8:9]
+  //  fallthrough dest_bb
+  ///
+  // dest_bb:
+  //   buzz;
+
+  RS->enterBasicBlockEnd(MBB);
+  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
+                                       MachineBasicBlock::iterator(GetPC), 0);
+  MRI.replaceRegWith(PCReg, Scav);
+  MRI.clearVirtRegs();
+  RS->setRegUsed(Scav);
+
+  return 4 + 8 + 4 + 4;
+}
+
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+  switch (Cond) {
+  case SIInstrInfo::SCC_TRUE:
+    return AMDGPU::S_CBRANCH_SCC1;
+  case SIInstrInfo::SCC_FALSE:
+    return AMDGPU::S_CBRANCH_SCC0;
+  case SIInstrInfo::VCCNZ:
+    return AMDGPU::S_CBRANCH_VCCNZ;
+  case SIInstrInfo::VCCZ:
+    return AMDGPU::S_CBRANCH_VCCZ;
+  case SIInstrInfo::EXECNZ:
+    return AMDGPU::S_CBRANCH_EXECNZ;
+  case SIInstrInfo::EXECZ:
+    return AMDGPU::S_CBRANCH_EXECZ;
+  default:
+    llvm_unreachable("invalid branch predicate");
+  }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_CBRANCH_SCC0:
+    return SCC_FALSE;
+  case AMDGPU::S_CBRANCH_SCC1:
+    return SCC_TRUE;
+  case AMDGPU::S_CBRANCH_VCCNZ:
+    return VCCNZ;
+  case AMDGPU::S_CBRANCH_VCCZ:
+    return VCCZ;
+  case AMDGPU::S_CBRANCH_EXECNZ:
+    return EXECNZ;
+  case AMDGPU::S_CBRANCH_EXECZ:
+    return EXECZ;
+  default:
+    return INVALID_BR;
+  }
+}
+
+bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    // Unconditional Branch
+    TBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+  if (Pred == INVALID_BR)
+    return true;
+
+  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Pred));
+  Cond.push_back(I->getOperand(1)); // Save the branch register.
+
+  ++I;
+
+  if (I == MBB.end()) {
+    // Conditional branch followed by fall-through.
+    TBB = CondBB;
+    return false;
+  }
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    TBB = CondBB;
+    FBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true;
+}
+
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
+    return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
+
+  ++I;
+
+  // TODO: Should be able to treat as fallthrough?
+  if (I == MBB.end())
+    return true;
+
+  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
+    return true;
+
+  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
+
+  // Specifically handle the case where the conditional branch is to the same
+  // destination as the mask branch. e.g.
+  //
+  // si_mask_branch BB8
+  // s_cbranch_execz BB8
+  // s_cbranch BB9
+  //
+  // This is required to understand divergent loops which may need the branches
+  // to be relaxed.
+  if (TBB != MaskBrDest || Cond.empty())
+    return true;
+
+  auto Pred = Cond[0].getImm();
+  return (Pred != EXECZ && Pred != EXECNZ);
+}
+
+unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                   int *BytesRemoved) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  unsigned Count = 0;
+  unsigned RemovedSize = 0;
+  while (I != MBB.end()) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+      I = Next;
+      continue;
+    }
+
+    RemovedSize += getInstSizeInBytes(*I);
+    I->eraseFromParent();
+    ++Count;
+    I = Next;
+  }
+
+  if (BytesRemoved)
+    *BytesRemoved = RemovedSize;
+
+  return Count;
+}
+
+unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   const DebugLoc &DL,
+                                   int *BytesAdded) const {
+
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    if (BytesAdded)
+      *BytesAdded = 4;
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  if (!FBB) {
+    Cond[1].isUndef();
+    MachineInstr *CondBr =
+      BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+
+    // Copy the flags onto the implicit condition register operand.
+    MachineOperand &CondReg = CondBr->getOperand(1);
+    CondReg.setIsUndef(Cond[1].isUndef());
+    CondReg.setIsKill(Cond[1].isKill());
+
+    if (BytesAdded)
+      *BytesAdded = 4;
+    return 1;
+  }
+
+  assert(TBB && FBB);
+
+  MachineInstr *CondBr =
+    BuildMI(&MBB, DL, get(Opcode))
+    .addMBB(TBB);
+  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+    .addMBB(FBB);
+
+  MachineOperand &CondReg = CondBr->getOperand(1);
+  CondReg.setIsUndef(Cond[1].isUndef());
+  CondReg.setIsKill(Cond[1].isKill());
+
+  if (BytesAdded)
+      *BytesAdded = 8;
+
+  return 2;
+}
+
+bool SIInstrInfo::reverseBranchCondition(
+  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2);
+  Cond[0].setImm(-Cond[0].getImm());
+  return false;
+}
+
+static void removeModOperands(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src0_modifiers);
+  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src1_modifiers);
+  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
+                                              AMDGPU::OpName::src2_modifiers);
+
+  MI.RemoveOperand(Src2ModIdx);
+  MI.RemoveOperand(Src1ModIdx);
+  MI.RemoveOperand(Src0ModIdx);
+}
+
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                unsigned Reg, MachineRegisterInfo *MRI) const {
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  unsigned Opc = UseMI.getOpcode();
+  if (Opc == AMDGPU::COPY) {
+    bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+    switch (DefMI.getOpcode()) {
+    default:
+      return false;
+    case AMDGPU::S_MOV_B64:
+      // TODO: We could fold 64-bit immediates, but this get compilicated
+      // when there are sub-registers.
+      return false;
+
+    case AMDGPU::V_MOV_B32_e32:
+    case AMDGPU::S_MOV_B32:
+      break;
+    }
+    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+    const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
+    assert(ImmOp);
+    // FIXME: We could handle FrameIndex values here.
+    if (!ImmOp->isImm()) {
+      return false;
+    }
+    UseMI.setDesc(get(NewOpc));
+    UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
+    return true;
+  }
+
+  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+      Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+
+    // Don't fold if we are using source modifiers. The new VOP2 instructions
+    // don't have them.
+    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+      return false;
+    }
+
+    const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+    // If this is a free constant, there's no reason to do this.
+    // TODO: We could fold this here instead of letting SIFoldOperands do it
+    // later.
+    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+
+    // Any src operand can be used for the legality check.
+    if (isInlineConstant(UseMI, *Src0, ImmOp))
+      return false;
+
+    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
+
+    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
+    // We should only expect these to be on src0 due to canonicalizations.
+    if (Src0->isReg() && Src0->getReg() == Reg) {
+      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+        return false;
+
+      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
+        return false;
+
+      // We need to swap operands 0 and 1 since madmk constant is at operand 1.
+
+      const int64_t Imm = DefMI.getOperand(1).getImm();
+
+      // FIXME: This would be a lot easier if we could return a new instruction
+      // instead of having to modify in place.
+
+      // Remove these first since they are at the end.
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
+
+      unsigned Src1Reg = Src1->getReg();
+      unsigned Src1SubReg = Src1->getSubReg();
+      Src0->setReg(Src1Reg);
+      Src0->setSubReg(Src1SubReg);
+      Src0->setIsKill(Src1->isKill());
+
+      if (Opc == AMDGPU::V_MAC_F32_e64 ||
+          Opc == AMDGPU::V_MAC_F16_e64)
+        UseMI.untieRegOperand(
+            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+
+      Src1->ChangeToImmediate(Imm);
+
+      removeModOperands(UseMI);
+      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+
+      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+      if (DeleteDef)
+        DefMI.eraseFromParent();
+
+      return true;
+    }
+
+    // Added part is the constant: Use v_madak_{f16, f32}.
+    if (Src2->isReg() && Src2->getReg() == Reg) {
+      // Not allowed to use constant bus for another operand.
+      // We can however allow an inline immediate as src0.
+      if (!Src0->isImm() &&
+          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
+        return false;
+
+      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+        return false;
+
+      const int64_t Imm = DefMI.getOperand(1).getImm();
+
+      // FIXME: This would be a lot easier if we could return a new instruction
+      // instead of having to modify in place.
+
+      // Remove these first since they are at the end.
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
+
+      if (Opc == AMDGPU::V_MAC_F32_e64 ||
+          Opc == AMDGPU::V_MAC_F16_e64)
+        UseMI.untieRegOperand(
+            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+
+      // ChangingToImmediate adds Src2 back to the instruction.
+      Src2->ChangeToImmediate(Imm);
+
+      // These come before src2.
+      removeModOperands(UseMI);
+      UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+
+      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+      if (DeleteDef)
+        DefMI.eraseFromParent();
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
+                                int WidthB, int OffsetB) {
+  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+  return LowOffset + LowWidth <= HighOffset;
+}
+
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
+                                               MachineInstr &MIb) const {
+  unsigned BaseReg0, BaseReg1;
+  int64_t Offset0, Offset1;
+
+  if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
+      getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
+
+    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+      // FIXME: Handle ds_read2 / ds_write2.
+      return false;
+    }
+    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
+    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
+    if (BaseReg0 == BaseReg1 &&
+        offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
+                                                  MachineInstr &MIb,
+                                                  AliasAnalysis *AA) const {
+  assert((MIa.mayLoad() || MIa.mayStore()) &&
+         "MIa must load from or modify a memory location");
+  assert((MIb.mayLoad() || MIb.mayStore()) &&
+         "MIb must load from or modify a memory location");
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
+    return false;
+
+  // XXX - Can we relax this between address spaces?
+  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
+    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
+    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
+    if (MMOa->getValue() && MMOb->getValue()) {
+      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
+      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
+      if (!AA->alias(LocA, LocB))
+        return true;
+    }
+  }
+
+  // TODO: Should we check the address space from the MachineMemOperand? That
+  // would allow us to distinguish objects we know don't alias based on the
+  // underlying address space, even if it was lowered to a different one,
+  // e.g. private accesses lowered to use MUBUF instructions on a scratch
+  // buffer.
+  if (isDS(MIa)) {
+    if (isDS(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(MIb);
+  }
+
+  if (isMUBUF(MIa) || isMTBUF(MIa)) {
+    if (isMUBUF(MIb) || isMTBUF(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(MIb) && !isSMRD(MIb);
+  }
+
+  if (isSMRD(MIa)) {
+    if (isSMRD(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
+  }
+
+  if (isFLAT(MIa)) {
+    if (isFLAT(MIb))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return false;
+  }
+
+  return false;
+}
+
+MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
+                                                 MachineInstr &MI,
+                                                 LiveVariables *LV) const {
+  bool IsF16 = false;
+
+  switch (MI.getOpcode()) {
+  default:
+    return nullptr;
+  case AMDGPU::V_MAC_F16_e64:
+    IsF16 = true;
+  case AMDGPU::V_MAC_F32_e64:
+    break;
+  case AMDGPU::V_MAC_F16_e32:
+    IsF16 = true;
+  case AMDGPU::V_MAC_F32_e32: {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                             AMDGPU::OpName::src0);
+    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
+    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
+      return nullptr;
+    break;
+  }
+  }
+
+  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  return BuildMI(*MBB, MI, MI.getDebugLoc(),
+                 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
+      .addOperand(*Dst)
+      .addImm(0) // Src0 mods
+      .addOperand(*Src0)
+      .addImm(0) // Src1 mods
+      .addOperand(*Src1)
+      .addImm(0) // Src mods
+      .addOperand(*Src2)
+      .addImm(0)  // clamp
+      .addImm(0); // omod
+}
+
+// It's not generally safe to move VALU instructions across these since it will
+// start using the register as a base index rather than directly.
+// XXX - Why isn't hasSideEffects sufficient for these?
+static bool changesVGPRIndexingMode(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_SET_GPR_IDX_ON:
+  case AMDGPU::S_SET_GPR_IDX_MODE:
+  case AMDGPU::S_SET_GPR_IDX_OFF:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                       const MachineBasicBlock *MBB,
+                                       const MachineFunction &MF) const {
+  // XXX - Do we want the SP check in the base implementation?
+
+  // Target-independent instructions do not have an implicit-use of EXEC, even
+  // when they operate on VGPRs. Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
+         MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+         MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+         MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+         changesVGPRIndexingMode(MI);
+}
+
+bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
+  switch (Imm.getBitWidth()) {
+  case 32:
+    return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
+  case 64:
+    return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
+  case 16:
+    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
+                                   uint8_t OperandType) const {
+  if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+    return false;
+
+  // MachineOperand provides no way to tell the true operand size, since it only
+  // records a 64-bit value. We need to know the size to determine if a 32-bit
+  // floating point immediate bit pattern is legal for an integer immediate. It
+  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
+
+  int64_t Imm = MO.getImm();
+  switch (operandBitWidth(OperandType)) {
+  case 32: {
+    int32_t Trunc = static_cast<int32_t>(Imm);
+    return Trunc == Imm &&
+           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
+  }
+  case 64: {
+    return AMDGPU::isInlinableLiteral64(MO.getImm(),
+                                        ST.hasInv2PiInlineImm());
+  }
+  case 16: {
+    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      int16_t Trunc = static_cast<int16_t>(Imm);
+      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+    }
+
+    return false;
+  }
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
+                                        const MCOperandInfo &OpInfo) const {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    return false;
+  case MachineOperand::MO_Immediate:
+    return !isInlineConstant(MO, OpInfo);
+  case MachineOperand::MO_FrameIndex:
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+  default:
+    llvm_unreachable("unexpected operand type");
+  }
+}
+
+static bool compareMachineOp(const MachineOperand &Op0,
+                             const MachineOperand &Op1) {
+  if (Op0.getType() != Op1.getType())
+    return false;
+
+  switch (Op0.getType()) {
+  case MachineOperand::MO_Register:
+    return Op0.getReg() == Op1.getReg();
+  case MachineOperand::MO_Immediate:
+    return Op0.getImm() == Op1.getImm();
+  default:
+    llvm_unreachable("Didn't expect to be comparing these operand types");
+  }
+}
+
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                                    const MachineOperand &MO) const {
+  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
+
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+
+  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
+    return true;
+
+  if (OpInfo.RegClass < 0)
+    return false;
+
+  if (MO.isImm() && isInlineConstant(MO, OpInfo))
+    return RI.opCanUseInlineConstant(OpInfo.OperandType);
+
+  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+}
+
+bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+  int Op32 = AMDGPU::getVOPe32(Opcode);
+  if (Op32 == -1)
+    return false;
+
+  return pseudoToMCOpcode(Op32) != -1;
+}
+
+bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
+  // The src0_modifier operand is present on all instructions
+  // that have modifiers.
+
+  return AMDGPU::getNamedOperandIdx(Opcode,
+                                    AMDGPU::OpName::src0_modifiers) != -1;
+}
+
+bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
+                                  unsigned OpName) const {
+  const MachineOperand *Mods = getNamedOperand(MI, OpName);
+  return Mods && Mods->getImm();
+}
+
+bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
+                                  const MachineOperand &MO,
+                                  const MCOperandInfo &OpInfo) const {
+  // Literal constants use the constant bus.
+  //if (isLiteralConstantLike(MO, OpInfo))
+  // return true;
+  if (MO.isImm())
+    return !isInlineConstant(MO, OpInfo);
+
+  if (!MO.isReg())
+    return true; // Misc other operands like FrameIndex
+
+  if (!MO.isUse())
+    return false;
+
+  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
+
+  // FLAT_SCR is just an SGPR pair.
+  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
+    return true;
+
+  // EXEC register uses the constant bus.
+  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+    return true;
+
+  // SGPRs use the constant bus
+  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
+          (!MO.isImplicit() &&
+           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
+}
+
+static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.implicit_operands()) {
+    // We only care about reads.
+    if (MO.isDef())
+      continue;
+
+    switch (MO.getReg()) {
+    case AMDGPU::VCC:
+    case AMDGPU::M0:
+    case AMDGPU::FLAT_SCR:
+      return MO.getReg();
+
+    default:
+      break;
+    }
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+static bool shouldReadExec(const MachineInstr &MI) {
+  if (SIInstrInfo::isVALU(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_READLANE_B32_si:
+    case AMDGPU::V_READLANE_B32_vi:
+    case AMDGPU::V_WRITELANE_B32:
+    case AMDGPU::V_WRITELANE_B32_si:
+    case AMDGPU::V_WRITELANE_B32_vi:
+      return false;
+    }
+
+    return true;
+  }
+
+  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+      SIInstrInfo::isSALU(MI) ||
+      SIInstrInfo::isSMRD(MI))
+    return false;
+
+  return true;
+}
+
+static bool isSubRegOf(const SIRegisterInfo &TRI,
+                       const MachineOperand &SuperVec,
+                       const MachineOperand &SubReg) {
+  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
+    return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
+
+  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
+         SubReg.getReg() == SuperVec.getReg();
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
+                                    StringRef &ErrInfo) const {
+  uint16_t Opcode = MI.getOpcode();
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+  // Make sure the number of operands is correct.
+  const MCInstrDesc &Desc = get(Opcode);
+  if (!Desc.isVariadic() &&
+      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
+    ErrInfo = "Instruction has wrong number of operands.";
+    return false;
+  }
+
+  if (MI.isInlineAsm()) {
+    // Verify register classes for inlineasm constraints.
+    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+         I != E; ++I) {
+      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+      if (!RC)
+        continue;
+
+      const MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg())
+        continue;
+
+      unsigned Reg = Op.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+        ErrInfo = "inlineasm operand has incorrect register class.";
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  // Make sure the register classes are correct.
+  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+    if (MI.getOperand(i).isFPImm()) {
+      ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
+                "all fp values to integers.";
+      return false;
+    }
+
+    int RegClass = Desc.OpInfo[i].RegClass;
+
+    switch (Desc.OpInfo[i].OperandType) {
+    case MCOI::OPERAND_REGISTER:
+      if (MI.getOperand(i).isImm()) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
+      }
+      break;
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
+        ErrInfo = "Illegal immediate value for operand.";
+        return false;
+      }
+      break;
+    }
+    case MCOI::OPERAND_IMMEDIATE:
+    case AMDGPU::OPERAND_KIMM32:
+      // Check if this operand is an immediate.
+      // FrameIndex operands will be replaced by immediates, so they are
+      // allowed.
+      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
+        ErrInfo = "Expected immediate, but got non-immediate";
+        return false;
+      }
+      LLVM_FALLTHROUGH;
+    default:
+      continue;
+    }
+
+    if (!MI.getOperand(i).isReg())
+      continue;
+
+    if (RegClass != -1) {
+      unsigned Reg = MI.getOperand(i).getReg();
+      if (Reg == AMDGPU::NoRegister ||
+          TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+
+      const TargetRegisterClass *RC = RI.getRegClass(RegClass);
+      if (!RC->contains(Reg)) {
+        ErrInfo = "Operand has incorrect register class.";
+        return false;
+      }
+    }
+  }
+
+  // Verify VOP*
+  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+    // Only look at the true operands. Only a real operand can use the constant
+    // bus, and we don't want to check pseudo-operands like the source modifier
+    // flags.
+    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+    unsigned ConstantBusCount = 0;
+
+    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+      ++ConstantBusCount;
+
+    unsigned SGPRUsed = findImplicitSGPRRead(MI);
+    if (SGPRUsed != AMDGPU::NoRegister)
+      ++ConstantBusCount;
+
+    for (int OpIdx : OpIndices) {
+      if (OpIdx == -1)
+        break;
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+        if (MO.isReg()) {
+          if (MO.getReg() != SGPRUsed)
+            ++ConstantBusCount;
+          SGPRUsed = MO.getReg();
+        } else {
+          ++ConstantBusCount;
+        }
+      }
+    }
+    if (ConstantBusCount > 1) {
+      ErrInfo = "VOP* instruction uses the constant bus more than once";
+      return false;
+    }
+  }
+
+  // Verify misc. restrictions on specific instructions.
+  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
+      Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
+    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
+    if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
+      if (!compareMachineOp(Src0, Src1) &&
+          !compareMachineOp(Src0, Src2)) {
+        ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
+        return false;
+      }
+    }
+  }
+
+  if (isSOPK(MI)) {
+    int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+    if (sopkIsZext(MI)) {
+      if (!isUInt<16>(Imm)) {
+        ErrInfo = "invalid immediate for SOPK instruction";
+        return false;
+      }
+    } else {
+      if (!isInt<16>(Imm)) {
+        ErrInfo = "invalid immediate for SOPK instruction";
+        return false;
+      }
+    }
+  }
+
+  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
+      Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
+      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+      Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
+    const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
+                       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
+
+    const unsigned StaticNumOps = Desc.getNumOperands() +
+      Desc.getNumImplicitUses();
+    const unsigned NumImplicitOps = IsDst ? 2 : 1;
+
+    // Allow additional implicit operands. This allows a fixup done by the post
+    // RA scheduler where the main implicit operand is killed and implicit-defs
+    // are added for sub-registers that remain live after this instruction.
+    if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
+      ErrInfo = "missing implicit register operands";
+      return false;
+    }
+
+    const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+    if (IsDst) {
+      if (!Dst->isUse()) {
+        ErrInfo = "v_movreld_b32 vdst should be a use operand";
+        return false;
+      }
+
+      unsigned UseOpIdx;
+      if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
+          UseOpIdx != StaticNumOps + 1) {
+        ErrInfo = "movrel implicit operands should be tied";
+        return false;
+      }
+    }
+
+    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const MachineOperand &ImpUse
+      = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
+    if (!ImpUse.isReg() || !ImpUse.isUse() ||
+        !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
+      ErrInfo = "src0 should be subreg of implicit vector use";
+      return false;
+    }
+  }
+
+  // Make sure we aren't losing exec uses in the td files. This mostly requires
+  // being careful when using let Uses to try to add other use registers.
+  if (shouldReadExec(MI)) {
+    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
+      ErrInfo = "VALU instruction does not implicitly read exec mask";
+      return false;
+    }
+  }
+
+  if (isSMRD(MI)) {
+    if (MI.mayStore()) {
+      // The register offset form of scalar stores may only use m0 as the
+      // soffset register.
+      const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
+      if (Soff && Soff->getReg() != AMDGPU::M0) {
+        ErrInfo = "scalar stores must use m0 as offset register";
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return AMDGPU::INSTRUCTION_LIST_END;
+  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+  case AMDGPU::COPY: return AMDGPU::COPY;
+  case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+  case AMDGPU::S_MOV_B32:
+    return MI.getOperand(1).isReg() ?
+           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
+  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
+  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
+  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
+  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
+  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
+  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
+  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
+  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
+  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
+  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
+  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
+  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
+  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
+  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
+  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
+  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
+  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
+  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
+  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
+  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
+  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
+  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
+  }
+}
+
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+                                                      unsigned OpNo) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &Desc = get(MI.getOpcode());
+  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
+      Desc.OpInfo[OpNo].RegClass == -1) {
+    unsigned Reg = MI.getOperand(OpNo).getReg();
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return MRI.getRegClass(Reg);
+    return RI.getPhysRegClass(Reg);
+  }
+
+  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  return RI.getRegClass(RCID);
+}
+
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::REG_SEQUENCE:
+  case AMDGPU::PHI:
+  case AMDGPU::INSERT_SUBREG:
+    return RI.hasVGPRs(getOpRegClass(MI, 0));
+  default:
+    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
+  }
+}
+
+void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
+  MachineBasicBlock::iterator I = MI;
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineOperand &MO = MI.getOperand(OpIdx);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
+  const TargetRegisterClass *RC = RI.getRegClass(RCID);
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (MO.isReg())
+    Opcode = AMDGPU::COPY;
+  else if (RI.isSGPRClass(RC))
+    Opcode = AMDGPU::S_MOV_B32;
+
+  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
+    VRC = &AMDGPU::VReg_64RegClass;
+  else
+    VRC = &AMDGPU::VGPR_32RegClass;
+
+  unsigned Reg = MRI.createVirtualRegister(VRC);
+  DebugLoc DL = MBB->findDebugLoc(I);
+  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
+  MO.ChangeToRegister(Reg, false);
+}
+
+unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC)
+                                         const {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned SubReg = MRI.createVirtualRegister(SubRC);
+
+  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
+    BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+      .addReg(SuperReg.getReg(), 0, SubIdx);
+    return SubReg;
+  }
+
+  // Just in case the super register is itself a sub-register, copy it to a new
+  // value so we don't need to worry about merging its subreg index with the
+  // SubIdx passed to this function. The register coalescer should be able to
+  // eliminate this extra copy.
+  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
+    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
+
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+    .addReg(NewSuperReg, 0, SubIdx);
+
+  return SubReg;
+}
+
+MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
+  MachineBasicBlock::iterator MII,
+  MachineRegisterInfo &MRI,
+  MachineOperand &Op,
+  const TargetRegisterClass *SuperRC,
+  unsigned SubIdx,
+  const TargetRegisterClass *SubRC) const {
+  if (Op.isImm()) {
+    if (SubIdx == AMDGPU::sub0)
+      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
+    if (SubIdx == AMDGPU::sub1)
+      return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
+
+    llvm_unreachable("Unhandled register index for immediate");
+  }
+
+  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
+                                       SubIdx, SubRC);
+  return MachineOperand::CreateReg(SubReg, false);
+}
+
+// Change the order of operands from (0, 1, 2) to (0, 2, 1)
+void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
+  assert(Inst.getNumExplicitOperands() == 3);
+  MachineOperand Op1 = Inst.getOperand(1);
+  Inst.RemoveOperand(1);
+  Inst.addOperand(Op1);
+}
+
+bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
+                                    const MCOperandInfo &OpInfo,
+                                    const MachineOperand &MO) const {
+  if (!MO.isReg())
+    return false;
+
+  unsigned Reg = MO.getReg();
+  const TargetRegisterClass *RC =
+    TargetRegisterInfo::isVirtualRegister(Reg) ?
+    MRI.getRegClass(Reg) :
+    RI.getPhysRegClass(Reg);
+
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+  RC = TRI->getSubRegClass(RC, MO.getSubReg());
+
+  // In order to be legal, the common sub-class must be equal to the
+  // class of the current operand.  For example:
+  //
+  // v_mov_b32 s0 ; Operand defined as vsrc_b32
+  //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
+  //
+  // s_sendmsg 0, s0 ; Operand defined as m0reg
+  //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
+  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+}
+
+bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+                                     const MCOperandInfo &OpInfo,
+                                     const MachineOperand &MO) const {
+  if (MO.isReg())
+    return isLegalRegOperand(MRI, OpInfo, MO);
+
+  // Handle non-register types that are treated like immediates.
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  return true;
+}
+
+bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
+                                 const MachineOperand *MO) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &InstDesc = MI.getDesc();
+  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
+  const TargetRegisterClass *DefinedRC =
+      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
+  if (!MO)
+    MO = &MI.getOperand(OpIdx);
+
+  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
+
+    RegSubRegPair SGPRUsed;
+    if (MO->isReg())
+      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
+
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      if (i == OpIdx)
+        continue;
+      const MachineOperand &Op = MI.getOperand(i);
+      if (Op.isReg()) {
+        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
+          return false;
+        }
+      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
+        return false;
+      }
+    }
+  }
+
+  if (MO->isReg()) {
+    assert(DefinedRC);
+    return isLegalRegOperand(MRI, OpInfo, *MO);
+  }
+
+  // Handle non-register types that are treated like immediates.
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+
+  if (!DefinedRC) {
+    // This operand expects an immediate.
+    return true;
+  }
+
+  return isImmOperandLegal(MI, OpIdx, *MO);
+}
+
+void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+  const MCInstrDesc &InstrDesc = get(Opc);
+
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
+  // we need to only have one constant bus use.
+  //
+  // Note we do not need to worry about literal constants here. They are
+  // disabled for the operand type for instructions because they will always
+  // violate the one constant bus use rule.
+  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
+  if (HasImplicitSGPR) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
+    if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
+      legalizeOpWithMove(MI, Src0Idx);
+  }
+
+  // VOP2 src0 instructions support all operand types, so we don't need to check
+  // their legality. If src1 is already legal, we don't need to do anything.
+  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
+    return;
+
+  // We do not use commuteInstruction here because it is too aggressive and will
+  // commute if it is possible. We only want to commute here if it improves
+  // legality. This can be called a fairly large number of times so don't waste
+  // compile time pointlessly swapping and checking legality again.
+  if (HasImplicitSGPR || !MI.isCommutable()) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+
+  // If src0 can be used as src1, commuting will make the operands legal.
+  // Otherwise we have to give up and insert a move.
+  //
+  // TODO: Other immediate-like operand kinds could be commuted if there was a
+  // MachineOperand::ChangeTo* for them.
+  if ((!Src1.isImm() && !Src1.isReg()) ||
+      !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  int CommutedOpc = commuteOpcode(MI);
+  if (CommutedOpc == -1) {
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
+  }
+
+  MI.setDesc(get(CommutedOpc));
+
+  unsigned Src0Reg = Src0.getReg();
+  unsigned Src0SubReg = Src0.getSubReg();
+  bool Src0Kill = Src0.isKill();
+
+  if (Src1.isImm())
+    Src0.ChangeToImmediate(Src1.getImm());
+  else if (Src1.isReg()) {
+    Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
+    Src0.setSubReg(Src1.getSubReg());
+  } else
+    llvm_unreachable("Should only have register or immediate operands");
+
+  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
+  Src1.setSubReg(Src0SubReg);
+}
+
+// Legalize VOP3 operands. Because all operand types are supported for any
+// operand, and since literal constants are not allowed and should never be
+// seen, we only need to worry about inserting copies if we use multiple SGPR
+// operands.
+void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+
+  int VOP3Idx[3] = {
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
+    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
+  };
+
+  // Find the one SGPR operand we are allowed to use.
+  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
+  for (unsigned i = 0; i < 3; ++i) {
+    int Idx = VOP3Idx[i];
+    if (Idx == -1)
+      break;
+    MachineOperand &MO = MI.getOperand(Idx);
+
+    // We should never see a VOP3 instruction with an illegal immediate operand.
+    if (!MO.isReg())
+      continue;
+
+    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+      continue; // VGPRs are legal
+
+    if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+      SGPRReg = MO.getReg();
+      // We can use one SGPR in each VOP3 instruction.
+      continue;
+    }
+
+    // If we make it this far, then the operand is not legal and we must
+    // legalize it.
+    legalizeOpWithMove(MI, Idx);
+  }
+}
+
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+                                         MachineRegisterInfo &MRI) const {
+  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+  unsigned DstReg = MRI.createVirtualRegister(SRC);
+  unsigned SubRegs = VRC->getSize() / 4;
+
+  SmallVector<unsigned, 8> SRegs;
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+    SRegs.push_back(SGPR);
+  }
+
+  MachineInstrBuilder MIB =
+      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+              get(AMDGPU::REG_SEQUENCE), DstReg);
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    MIB.addReg(SRegs[i]);
+    MIB.addImm(RI.getSubRegFromChannel(i));
+  }
+  return DstReg;
+}
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+
+  // If the pointer is store in VGPRs, then we need to move them to
+  // SGPRs using v_readfirstlane.  This is safe because we only select
+  // loads with uniform pointers to SMRD instruction so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+      unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+      SBase->setReg(SGPR);
+  }
+}
+
+void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+                                         MachineBasicBlock::iterator I,
+                                         const TargetRegisterClass *DstRC,
+                                         MachineOperand &Op,
+                                         MachineRegisterInfo &MRI,
+                                         const DebugLoc &DL) const {
+
+  unsigned OpReg = Op.getReg();
+  unsigned OpSubReg = Op.getSubReg();
+
+  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
+      RI.getRegClassForReg(MRI, OpReg), OpSubReg);
+
+  // Check if operand is already the correct register class.
+  if (DstRC == OpRC)
+    return;
+
+  unsigned DstReg = MRI.createVirtualRegister(DstRC);
+  MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg)
+                               .addOperand(Op);
+
+  Op.setReg(DstReg);
+  Op.setSubReg(0);
+
+  MachineInstr *Def = MRI.getVRegDef(OpReg);
+  if (!Def)
+    return;
+
+  // Try to eliminate the copy if it is copying an immediate value.
+  if (Def->isMoveImmediate())
+    FoldImmediate(*Copy, *Def, OpReg, &MRI);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Legalize VOP2
+  if (isVOP2(MI) || isVOPC(MI)) {
+    legalizeOperandsVOP2(MRI, MI);
+    return;
+  }
+
+  // Legalize VOP3
+  if (isVOP3(MI)) {
+    legalizeOperandsVOP3(MRI, MI);
+    return;
+  }
+
+  // Legalize SMRD
+  if (isSMRD(MI)) {
+    legalizeOperandsSMRD(MRI, MI);
+    return;
+  }
+
+  // Legalize REG_SEQUENCE and PHI
+  // The register class of the operands much be the same type as the register
+  // class of the output.
+  if (MI.getOpcode() == AMDGPU::PHI) {
+    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      if (!MI.getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+        continue;
+      const TargetRegisterClass *OpRC =
+          MRI.getRegClass(MI.getOperand(i).getReg());
+      if (RI.hasVGPRs(OpRC)) {
+        VRC = OpRC;
+      } else {
+        SRC = OpRC;
+      }
+    }
+
+    // If any of the operands are VGPR registers, then they all most be
+    // otherwise we will create illegal VGPR->SGPR copies when legalizing
+    // them.
+    if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
+      if (!VRC) {
+        assert(SRC);
+        VRC = RI.getEquivalentVGPRClass(SRC);
+      }
+      RC = VRC;
+    } else {
+      RC = SRC;
+    }
+
+    // Update all the operands so they have the same type.
+    for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+      MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+        continue;
+
+      // MI is a PHI instruction.
+      MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
+      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+      // Avoid creating no-op copies with the same src and dst reg class.  These
+      // confuse some of the machine passes.
+      legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
+    }
+  }
+
+  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+  // VGPR dest type and SGPR sources, insert copies so all operands are
+  // VGPRs. This seems to help operand folding / the register coalescer.
+  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
+    MachineBasicBlock *MBB = MI.getParent();
+    const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
+    if (RI.hasVGPRs(DstRC)) {
+      // Update all the operands so they are VGPR register classes. These may
+      // not be the same register class because REG_SEQUENCE supports mixing
+      // subregister index types e.g. sub0_sub1 + sub2 + sub3
+      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+        MachineOperand &Op = MI.getOperand(I);
+        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+          continue;
+
+        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+        if (VRC == OpRC)
+          continue;
+
+        legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
+        Op.setIsKill();
+      }
+    }
+
+    return;
+  }
+
+  // Legalize INSERT_SUBREG
+  // src0 must have the same register class as dst
+  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
+    unsigned Dst = MI.getOperand(0).getReg();
+    unsigned Src0 = MI.getOperand(1).getReg();
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+    if (DstRC != Src0RC) {
+      MachineBasicBlock *MBB = MI.getParent();
+      MachineOperand &Op = MI.getOperand(1);
+      legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
+    }
+    return;
+  }
+
+  // Legalize MIMG and MUBUF/MTBUF for shaders.
+  //
+  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
+  // scratch memory access. In both cases, the legalization never involves
+  // conversion to the addr64 form.
+  if (isMIMG(MI) ||
+      (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
+       (isMUBUF(MI) || isMTBUF(MI)))) {
+    MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+      SRsrc->setReg(SGPR);
+    }
+
+    MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
+    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+      SSamp->setReg(SGPR);
+    }
+    return;
+  }
+
+  // Legalize MUBUF* instructions by converting to addr64 form.
+  // FIXME: If we start using the non-addr64 instructions for compute, we
+  // may need to legalize them as above. This especially applies to the
+  // buffer_load_format_* variants and variants with idxen (or bothen).
+  int SRsrcIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+  if (SRsrcIdx != -1) {
+    // We have an MUBUF instruction
+    MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
+    unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
+    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
+                                             RI.getRegClass(SRsrcRC))) {
+      // The operands are legal.
+      // FIXME: We may need to legalize operands besided srsrc.
+      return;
+    }
+
+    MachineBasicBlock &MBB = *MI.getParent();
+
+    // Extract the ptr from the resource descriptor.
+    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
+
+    // Create an empty resource descriptor
+    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
+
+    // Zero64 = 0
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
+        .addImm(0);
+
+    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
+        .addImm(RsrcDataFormat & 0xFFFFFFFF);
+
+    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
+        .addImm(RsrcDataFormat >> 32);
+
+    // NewSRsrc = {Zero64, SRsrcFormat}
+    BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+        .addReg(Zero64)
+        .addImm(AMDGPU::sub0_sub1)
+        .addReg(SRsrcFormatLo)
+        .addImm(AMDGPU::sub2)
+        .addReg(SRsrcFormatHi)
+        .addImm(AMDGPU::sub3);
+
+    MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    if (VAddr) {
+      // This is already an ADDR64 instruction so we need to add the pointer
+      // extracted from the resource descriptor to the current value of VAddr.
+      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+      DebugLoc DL = MI.getDebugLoc();
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+          .addReg(NewVAddrLo)
+          .addImm(AMDGPU::sub0)
+          .addReg(NewVAddrHi)
+          .addImm(AMDGPU::sub1);
+    } else {
+      // This instructions is the _OFFSET variant, so we need to convert it to
+      // ADDR64.
+      assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
+             < SISubtarget::VOLCANIC_ISLANDS &&
+             "FIXME: Need to emit flat atomics here");
+
+      MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
+      MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+      MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
+
+      // Atomics rith return have have an additional tied operand and are
+      // missing some of the special bits.
+      MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
+      MachineInstr *Addr64;
+
+      if (!VDataIn) {
+        // Regular buffer load / store.
+        MachineInstrBuilder MIB =
+            BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+                .addOperand(*VData)
+                .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                // This will be replaced later
+                // with the new value of vaddr.
+                .addOperand(*SRsrc)
+                .addOperand(*SOffset)
+                .addOperand(*Offset);
+
+        // Atomics do not have this operand.
+        if (const MachineOperand *GLC =
+                getNamedOperand(MI, AMDGPU::OpName::glc)) {
+          MIB.addImm(GLC->getImm());
+        }
+
+        MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
+
+        if (const MachineOperand *TFE =
+                getNamedOperand(MI, AMDGPU::OpName::tfe)) {
+          MIB.addImm(TFE->getImm());
+        }
+
+        MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+        Addr64 = MIB;
+      } else {
+        // Atomics with return.
+        Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
+                     .addOperand(*VData)
+                     .addOperand(*VDataIn)
+                     .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                     // This will be replaced later
+                     // with the new value of vaddr.
+                     .addOperand(*SRsrc)
+                     .addOperand(*SOffset)
+                     .addOperand(*Offset)
+                     .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
+                     .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      }
+
+      MI.removeFromParent();
+
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewVAddr)
+          .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+          .addImm(AMDGPU::sub1);
+
+      VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
+      SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
+    }
+
+    // Update the instruction to use NewVaddr
+    VAddr->setReg(NewVAddr);
+    // Update the instruction to use NewSRsrc
+    SRsrc->setReg(NewSRsrc);
+  }
+}
+
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+  SmallVector<MachineInstr *, 128> Worklist;
+  Worklist.push_back(&TopInst);
+
+  while (!Worklist.empty()) {
+    MachineInstr &Inst = *Worklist.pop_back_val();
+    MachineBasicBlock *MBB = Inst.getParent();
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+    unsigned Opcode = Inst.getOpcode();
+    unsigned NewOpcode = getVALUOp(Inst);
+
+    // Handle some special cases
+    switch (Opcode) {
+    default:
+      break;
+    case AMDGPU::S_AND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_OR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOT_B64:
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BCNT1_I32_B64:
+      splitScalar64BitBCNT(Worklist, Inst);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_BFE_I64: {
+      splitScalar64BitBFE(Worklist, Inst);
+      Inst.eraseFromParent();
+      continue;
+    }
+
+    case AMDGPU::S_LSHL_B32:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I32:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B32:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHL_B64:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHLREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_ASHR_I64:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_ASHRREV_I64;
+        swapOperands(Inst);
+      }
+      break;
+    case AMDGPU::S_LSHR_B64:
+      if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+        NewOpcode = AMDGPU::V_LSHRREV_B64;
+        swapOperands(Inst);
+      }
+      break;
+
+    case AMDGPU::S_ABS_I32:
+      lowerScalarAbs(Worklist, Inst);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_CBRANCH_SCC0:
+    case AMDGPU::S_CBRANCH_SCC1:
+      // Clear unused bits of vcc
+      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
+              AMDGPU::VCC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(AMDGPU::VCC);
+      break;
+
+    case AMDGPU::S_BFE_U64:
+    case AMDGPU::S_BFM_B64:
+      llvm_unreachable("Moving this op to VALU not implemented");
+    }
+
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+      // We cannot move this instruction to the VALU, so we should try to
+      // legalize its operands instead.
+      legalizeOperands(Inst);
+      continue;
+    }
+
+    // Use the new VALU Opcode.
+    const MCInstrDesc &NewDesc = get(NewOpcode);
+    Inst.setDesc(NewDesc);
+
+    // Remove any references to SCC. Vector instructions can't read from it, and
+    // We're just about to add the implicit use / defs of VCC, and we don't want
+    // both.
+    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
+      MachineOperand &Op = Inst.getOperand(i);
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
+        Inst.RemoveOperand(i);
+        addSCCDefUsersToVALUWorklist(Inst, Worklist);
+      }
+    }
+
+    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+      // We are converting these to a BFE, so we need to add the missing
+      // operands for the size and offset.
+      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+      Inst.addOperand(MachineOperand::CreateImm(0));
+      Inst.addOperand(MachineOperand::CreateImm(Size));
+
+    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+      // The VALU version adds the second operand to the result, so insert an
+      // extra 0 operand.
+      Inst.addOperand(MachineOperand::CreateImm(0));
+    }
+
+    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
+
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+      Inst.RemoveOperand(2);                     // Remove old immediate.
+      Inst.addOperand(MachineOperand::CreateImm(Offset));
+      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
+    }
+
+    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
+    unsigned NewDstReg = AMDGPU::NoRegister;
+    if (HasDst) {
+      // Update the destination register class.
+      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+      if (!NewDstRC)
+        continue;
+
+      unsigned DstReg = Inst.getOperand(0).getReg();
+      if (Inst.isCopy() &&
+          TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+        // Instead of creating a copy where src and dst are the same register
+        // class, we just replace all uses of dst with src.  These kinds of
+        // copies interfere with the heuristics MachineSink uses to decide
+        // whether or not to split a critical edge.  Since the pass assumes
+        // that copies will end up as machine instructions and not be
+        // eliminated.
+        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+        MRI.clearKillFlags(Inst.getOperand(1).getReg());
+        Inst.getOperand(0).setReg(DstReg);
+        continue;
+      }
+
+      NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+    }
+
+    // Legalize the operands
+    legalizeOperands(Inst);
+
+    if (HasDst)
+     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+  }
+}
+
+void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+                                 MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src = Inst.getOperand(1);
+  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
+    .addImm(0)
+    .addReg(Src.getReg());
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+    .addReg(Src.getReg())
+    .addReg(TmpReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalar64BitUnaryOp(
+    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+    unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *Src0RC = Src0.isReg() ?
+    MRI.getRegClass(Src0.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+    .addOperand(SrcReg0Sub0);
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+    .addOperand(SrcReg0Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // We don't need to legalizeOperands here because for a single operand, src0
+  // will support any kind of input.
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalar64BitBinaryOp(
+    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+    unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineBasicBlock::iterator MII = Inst;
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+  const TargetRegisterClass *Src0RC = Src0.isReg() ?
+    MRI.getRegClass(Src0.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+  const TargetRegisterClass *Src1RC = Src1.isReg() ?
+    MRI.getRegClass(Src1.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+
+  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub0, Src0SubRC);
+  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub0, Src1SubRC);
+
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
+
+  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+                              .addOperand(SrcReg0Sub0)
+                              .addOperand(SrcReg1Sub0);
+
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
+
+  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+                              .addOperand(SrcReg0Sub1)
+                              .addOperand(SrcReg1Sub1);
+
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+    .addReg(DestSub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(DestSub1)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep it
+  // valid.
+  legalizeOperands(LoHalf);
+  legalizeOperands(HiHalf);
+
+  // Move all users of this moved vlaue.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalar64BitBCNT(
+    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src = Inst.getOperand(1);
+
+  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
+  const TargetRegisterClass *SrcRC = Src.isReg() ?
+    MRI.getRegClass(Src.getReg()) :
+    &AMDGPU::SGPR_32RegClass;
+
+  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
+
+  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub0, SrcSubRC);
+  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
+                                                      AMDGPU::sub1, SrcSubRC);
+
+  BuildMI(MBB, MII, DL, InstDesc, MidReg)
+    .addOperand(SrcRegSub0)
+    .addImm(0);
+
+  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+    .addOperand(SrcRegSub1)
+    .addReg(MidReg);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+
+  // We don't need to legalize operands here. src0 for etiher instruction can be
+  // an SGPR, and the second input is unused or determined here.
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                                      MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  uint32_t Imm = Inst.getOperand(2).getImm();
+  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+  (void) Offset;
+
+  // Only sext_inreg cases handled.
+  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
+         Offset == 0 && "Not implemented");
+
+  if (BitWidth < 32) {
+    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
+        .addImm(0)
+        .addImm(BitWidth);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
+      .addImm(31)
+      .addReg(MidRegLo);
+
+    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+      .addReg(MidRegLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(MidRegHi)
+      .addImm(AMDGPU::sub1);
+
+    MRI.replaceRegWith(Dest.getReg(), ResultReg);
+    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+    return;
+  }
+
+  MachineOperand &Src = Inst.getOperand(1);
+  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
+    .addImm(31)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(TmpReg)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
+void SIInstrInfo::addUsersToMoveToVALUWorklist(
+  unsigned DstReg,
+  MachineRegisterInfo &MRI,
+  SmallVectorImpl<MachineInstr *> &Worklist) const {
+  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
+         E = MRI.use_end(); I != E;) {
+    MachineInstr &UseMI = *I->getParent();
+    if (!canReadVGPR(UseMI, I.getOperandNo())) {
+      Worklist.push_back(&UseMI);
+
+      do {
+        ++I;
+      } while (I != E && I->getParent() == &UseMI);
+    } else {
+      ++I;
+    }
+  }
+}
+
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(
+    MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+  // This assumes that all the users of SCC are in the same block
+  // as the SCC def.
+  for (MachineInstr &MI :
+       llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
+                        SCCDefInst.getParent()->end())) {
+    // Exit if we find another SCC def.
+    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
+      return;
+
+    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
+      Worklist.push_back(&MI);
+  }
+}
+
+const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
+  const MachineInstr &Inst) const {
+  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
+
+  switch (Inst.getOpcode()) {
+  // For target instructions, getOpRegClass just returns the virtual register
+  // class associated with the operand, so we need to find an equivalent VGPR
+  // register class in order to move the instruction to the VALU.
+  case AMDGPU::COPY:
+  case AMDGPU::PHI:
+  case AMDGPU::REG_SEQUENCE:
+  case AMDGPU::INSERT_SUBREG:
+    if (RI.hasVGPRs(NewDstRC))
+      return nullptr;
+
+    NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+    if (!NewDstRC)
+      return nullptr;
+    return NewDstRC;
+  default:
+    return NewDstRC;
+  }
+}
+
+// Find the one SGPR operand we are allowed to use.
+unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
+                                   int OpIndices[3]) const {
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Find the one SGPR operand we are allowed to use.
+  //
+  // First we need to consider the instruction's operand requirements before
+  // legalizing. Some operands are required to be SGPRs, such as implicit uses
+  // of VCC, but we are still bound by the constant bus requirement to only use
+  // one.
+  //
+  // If the operand's class is an SGPR, we can never move it.
+
+  unsigned SGPRReg = findImplicitSGPRRead(MI);
+  if (SGPRReg != AMDGPU::NoRegister)
+    return SGPRReg;
+
+  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  for (unsigned i = 0; i < 3; ++i) {
+    int Idx = OpIndices[i];
+    if (Idx == -1)
+      break;
+
+    const MachineOperand &MO = MI.getOperand(Idx);
+    if (!MO.isReg())
+      continue;
+
+    // Is this operand statically required to be an SGPR based on the operand
+    // constraints?
+    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
+    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
+    if (IsRequiredSGPR)
+      return MO.getReg();
+
+    // If this could be a VGPR or an SGPR, Check the dynamic register class.
+    unsigned Reg = MO.getReg();
+    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
+    if (RI.isSGPRClass(RegRC))
+      UsedSGPRs[i] = Reg;
+  }
+
+  // We don't have a required SGPR operand, so we have a bit more freedom in
+  // selecting operands to move.
+
+  // Try to select the most used SGPR. If an SGPR is equal to one of the
+  // others, we choose that.
+  //
+  // e.g.
+  // V_FMA_F32 v0, s0, s0, s0 -> No moves
+  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
+
+  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
+  // prefer those.
+
+  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
+    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
+      SGPRReg = UsedSGPRs[0];
+  }
+
+  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
+    if (UsedSGPRs[1] == UsedSGPRs[2])
+      SGPRReg = UsedSGPRs[1];
+  }
+
+  return SGPRReg;
+}
+
+MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
+                                             unsigned OperandName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
+  if (Idx == -1)
+    return nullptr;
+
+  return &MI.getOperand(Idx);
+}
+
+uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
+  if (ST.isAmdHsaOS()) {
+    RsrcDataFormat |= (1ULL << 56);
+
+    if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+      // Set MTYPE = 2
+      RsrcDataFormat |= (2ULL << 59);
+  }
+
+  return RsrcDataFormat;
+}
+
+uint64_t SIInstrInfo::getScratchRsrcWords23() const {
+  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
+                    AMDGPU::RSRC_TID_ENABLE |
+                    0xffffffff; // Size;
+
+  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+
+  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
+            // IndexStride = 64
+            (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+
+  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
+  // Clear them unless we want a huge stride.
+  if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
+
+  return Rsrc23;
+}
+
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+
+  return isSMRD(Opc);
+}
+
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+
+  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+}
+
+unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
+                                    int &FrameIndex) const {
+  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  if (!Addr || !Addr->isFI())
+    return AMDGPU::NoRegister;
+
+  assert(!MI.memoperands_empty() &&
+         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+
+  FrameIndex = Addr->getIndex();
+  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
+}
+
+unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
+                                        int &FrameIndex) const {
+  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
+  assert(Addr && Addr->isFI());
+  FrameIndex = Addr->getIndex();
+  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
+}
+
+unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+
+  if (!MI.mayLoad())
+    return AMDGPU::NoRegister;
+
+  if (isMUBUF(MI) || isVGPRSpill(MI))
+    return isStackAccess(MI, FrameIndex);
+
+  if (isSGPRSpill(MI))
+    return isSGPRStackAccess(MI, FrameIndex);
+
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                         int &FrameIndex) const {
+  if (!MI.mayStore())
+    return AMDGPU::NoRegister;
+
+  if (isMUBUF(MI) || isVGPRSpill(MI))
+    return isStackAccess(MI, FrameIndex);
+
+  if (isSGPRSpill(MI))
+    return isSGPRStackAccess(MI, FrameIndex);
+
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
+  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
+  unsigned DescSize = Desc.getSize();
+
+  // If we have a definitive size, we can use it. Otherwise we need to inspect
+  // the operands to know the size.
+  //
+  // FIXME: Instructions that have a base 32-bit encoding report their size as
+  // 4, even though they are really 8 bytes if they have a literal operand.
+  if (DescSize != 0 && DescSize != 4)
+    return DescSize;
+
+  if (Opc == AMDGPU::WAVE_BARRIER)
+    return 0;
+
+  // 4-byte instructions may have a 32-bit literal encoded after them. Check
+  // operands that coud ever be literals.
+  if (isVALU(MI) || isSALU(MI)) {
+    if (isFixedSize(MI)) {
+      assert(DescSize == 4);
+      return DescSize;
+    }
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    if (Src0Idx == -1)
+      return 4; // No operands.
+
+    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
+      return 8;
+
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+    if (Src1Idx == -1)
+      return 4;
+
+    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
+      return 8;
+
+    return 4;
+  }
+
+  if (DescSize == 4)
+    return 4;
+
+  switch (Opc) {
+  case AMDGPU::SI_MASK_BRANCH:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::BUNDLE:
+  case TargetOpcode::EH_LABEL:
+    return 0;
+  case TargetOpcode::INLINEASM: {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const char *AsmStr = MI.getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  default:
+    llvm_unreachable("unable to find instruction size");
+  }
+}
+
+bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
+  if (!isFLAT(MI))
+    return false;
+
+  if (MI.memoperands_empty())
+    return true;
+
+  for (const MachineMemOperand *MMO : MI.memoperands()) {
+    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+      return true;
+  }
+  return false;
+}
+
+ArrayRef<std::pair<int, const char *>>
+SIInstrInfo::getSerializableTargetIndices() const {
+  static const std::pair<int, const char *> TargetIndices[] = {
+      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+  return makeArrayRef(TargetIndices);
+}
+
+/// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                            const ScheduleDAG *DAG) const {
+  return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+  return new GCNHazardRecognizer(MF);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
new file mode 100644
index 000000000000..e68f6f92ba96
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -0,0 +1,794 @@
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIDefines.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+class SIInstrInfo final : public AMDGPUInstrInfo {
+private:
+  const SIRegisterInfo RI;
+  const SISubtarget &ST;
+
+  // The the inverse predicate should have the negative value.
+  enum BranchPredicate {
+    INVALID_BR = 0,
+    SCC_TRUE = 1,
+    SCC_FALSE = -1,
+    VCCNZ = 2,
+    VCCZ = -2,
+    EXECNZ = -3,
+    EXECZ = 3
+  };
+
+  static unsigned getBranchOpcode(BranchPredicate Cond);
+  static BranchPredicate getBranchPredicate(unsigned Opcode);
+
+  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
+                              MachineRegisterInfo &MRI,
+                              MachineOperand &SuperReg,
+                              const TargetRegisterClass *SuperRC,
+                              unsigned SubIdx,
+                              const TargetRegisterClass *SubRC) const;
+  MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC) const;
+
+  void swapOperands(MachineInstr &Inst) const;
+
+  void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+                      MachineInstr &Inst) const;
+
+  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                               MachineInstr &Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+                                MachineInstr &Inst, unsigned Opcode) const;
+
+  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+                            MachineInstr &Inst) const;
+  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                           MachineInstr &Inst) const;
+
+  void addUsersToMoveToVALUWorklist(
+    unsigned Reg, MachineRegisterInfo &MRI,
+    SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+  void
+  addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
+                               SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+  const TargetRegisterClass *
+  getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
+
+  bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const;
+
+  unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
+
+protected:
+  bool swapSourceModifiers(MachineInstr &MI,
+                           MachineOperand &Src0, unsigned Src0OpName,
+                           MachineOperand &Src1, unsigned Src1OpName) const;
+
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned OpIdx0,
+                                       unsigned OpIdx1) const override;
+
+public:
+
+  enum TargetOperandFlags {
+    MO_NONE = 0,
+    // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
+    MO_GOTPCREL = 1,
+    // MO_GOTPCREL32_LO -> symbol@gotpcrel32@lo -> R_AMDGPU_GOTPCREL32_LO.
+    MO_GOTPCREL32 = 2,
+    MO_GOTPCREL32_LO = 2,
+    // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI.
+    MO_GOTPCREL32_HI = 3,
+    // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO.
+    MO_REL32 = 4,
+    MO_REL32_LO = 4,
+    // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI.
+    MO_REL32_HI = 5
+  };
+
+  explicit SIInstrInfo(const SISubtarget &);
+
+  const SIRegisterInfo &getRegisterInfo() const {
+    return RI;
+  }
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+                                         AliasAnalysis *AA) const override;
+
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                               int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const final;
+
+  bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+                           unsigned NumLoads) const final;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI,
+                                    RegScavenger *RS, unsigned TmpReg,
+                                    unsigned Offset, unsigned Size) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  // \brief Returns an opcode that can be used to move a value to a \p DstRC
+  // register.  If there is no hardware instruction that can store to \p
+  // DstRC, then AMDGPU::COPY is returned.
+  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+
+  LLVM_READONLY
+  int commuteOpcode(unsigned Opc) const;
+
+  LLVM_READONLY
+  inline int commuteOpcode(const MachineInstr &MI) const {
+    return commuteOpcode(MI.getOpcode());
+  }
+
+  bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
+  bool isBranchOffsetInRange(unsigned BranchOpc,
+                             int64_t BrOffset) const override;
+
+  MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+  unsigned insertIndirectBranch(MachineBasicBlock &MBB,
+                                MachineBasicBlock &NewDestBB,
+                                const DebugLoc &DL,
+                                int64_t BrOffset,
+                                RegScavenger *RS = nullptr) const override;
+
+  bool analyzeBranchImpl(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         bool AllowModify) const;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  bool reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool
+  areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                  AliasAnalysis *AA = nullptr) const override;
+
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+                     MachineRegisterInfo *MRI) const final;
+
+  unsigned getMachineCSELookAheadLimit() const override { return 500; }
+
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB,
+                                      MachineInstr &MI,
+                                      LiveVariables *LV) const override;
+
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
+  static bool isSALU(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SALU;
+  }
+
+  bool isSALU(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SALU;
+  }
+
+  static bool isVALU(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VALU;
+  }
+
+  bool isVALU(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VALU;
+  }
+
+  static bool isVMEM(const MachineInstr &MI) {
+    return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+  }
+
+  bool isVMEM(uint16_t Opcode) const {
+    return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+  }
+
+  static bool isSOP1(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
+  }
+
+  bool isSOP1(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOP1;
+  }
+
+  static bool isSOP2(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
+  }
+
+  bool isSOP2(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOP2;
+  }
+
+  static bool isSOPC(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
+  }
+
+  bool isSOPC(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPC;
+  }
+
+  static bool isSOPK(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
+  }
+
+  bool isSOPK(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPK;
+  }
+
+  static bool isSOPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
+  }
+
+  bool isSOPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPP;
+  }
+
+  static bool isVOP1(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
+  }
+
+  bool isVOP1(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+  }
+
+  static bool isVOP2(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
+  }
+
+  bool isVOP2(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+  }
+
+  static bool isVOP3(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+  }
+
+  bool isVOP3(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+  }
+
+  static bool isVOPC(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
+  }
+
+  bool isVOPC(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+  }
+
+  static bool isMUBUF(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
+  }
+
+  bool isMUBUF(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+  }
+
+  static bool isMTBUF(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
+  }
+
+  bool isMTBUF(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+  }
+
+  static bool isSMRD(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
+  }
+
+  bool isSMRD(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+  }
+
+  static bool isDS(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DS;
+  }
+
+  bool isDS(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DS;
+  }
+
+  static bool isMIMG(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
+  }
+
+  bool isMIMG(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+  }
+
+  static bool isGather4(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
+  }
+
+  bool isGather4(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::Gather4;
+  }
+
+  static bool isFLAT(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
+  }
+
+  bool isFLAT(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+  }
+
+  static bool isEXP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::EXP;
+  }
+
+  bool isEXP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::EXP;
+  }
+
+  static bool isWQM(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::WQM;
+  }
+
+  bool isWQM(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::WQM;
+  }
+
+  static bool isDisableWQM(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+  }
+
+  bool isDisableWQM(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+  }
+
+  static bool isVGPRSpill(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
+  }
+
+  bool isVGPRSpill(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
+  }
+
+  static bool isSGPRSpill(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SGPRSpill;
+  }
+
+  bool isSGPRSpill(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
+  }
+
+  static bool isDPP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DPP;
+  }
+
+  bool isDPP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DPP;
+  }
+
+  static bool isScalarUnit(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
+  }
+
+  static bool usesVM_CNT(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT;
+  }
+
+  static bool sopkIsZext(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
+  }
+
+  bool sopkIsZext(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
+  }
+
+  /// \returns true if this is an s_store_dword* instruction. This is more
+  /// specific than than isSMEM && mayStore.
+  static bool isScalarStore(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::SCALAR_STORE;
+  }
+
+  bool isScalarStore(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::SCALAR_STORE;
+  }
+
+  static bool isFixedSize(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::FIXED_SIZE;
+  }
+
+  bool isFixedSize(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
+  }
+
+  bool isVGPRCopy(const MachineInstr &MI) const {
+    assert(MI.isCopy());
+    unsigned Dest = MI.getOperand(0).getReg();
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    return !RI.isSGPRReg(MRI, Dest);
+  }
+
+  static int operandBitWidth(uint8_t OperandType) {
+    switch (OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+      return 32;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      return 64;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      return 16;
+    default:
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
+  bool isInlineConstant(const APInt &Imm) const;
+
+  bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
+
+  bool isInlineConstant(const MachineOperand &MO,
+                        const MCOperandInfo &OpInfo) const {
+    return isInlineConstant(MO, OpInfo.OperandType);
+  }
+
+  /// \p returns true if \p UseMO is substituted with \p DefMO in \p MI it would
+  /// be an inline immediate.
+  bool isInlineConstant(const MachineInstr &MI,
+                        const MachineOperand &UseMO,
+                        const MachineOperand &DefMO) const {
+    assert(UseMO.getParent() == &MI);
+    int OpIdx = MI.getOperandNo(&UseMO);
+    if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands) {
+      return false;
+    }
+
+    return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
+  }
+
+  /// \p returns true if the operand \p OpIdx in \p MI is a valid inline
+  /// immediate.
+  bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+  }
+
+  bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
+                        const MachineOperand &MO) const {
+    if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands)
+      return false;
+
+    if (MI.isCopy()) {
+      unsigned Size = getOpSize(MI, OpIdx);
+      assert(Size == 8 || Size == 4);
+
+      uint8_t OpType = (Size == 8) ?
+        AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32;
+      return isInlineConstant(MO, OpType);
+    }
+
+    return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+  }
+
+  bool isInlineConstant(const MachineOperand &MO) const {
+    const MachineInstr *Parent = MO.getParent();
+    return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
+  }
+
+  bool isLiteralConstant(const MachineOperand &MO,
+                         const MCOperandInfo &OpInfo) const {
+    return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
+  }
+
+  bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    return MO.isImm() && !isInlineConstant(MI, OpIdx);
+  }
+
+  // Returns true if this operand could potentially require a 32-bit literal
+  // operand, but not necessarily. A FrameIndex for example could resolve to an
+  // inline immediate value that will not require an additional 4-bytes; this
+  // assumes that it will.
+  bool isLiteralConstantLike(const MachineOperand &MO,
+                             const MCOperandInfo &OpInfo) const;
+
+  bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                         const MachineOperand &MO) const;
+
+  /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+  /// This function will return false if you pass it a 32-bit instruction.
+  bool hasVALU32BitEncoding(unsigned Opcode) const;
+
+  /// \brief Returns true if this operand uses the constant bus.
+  bool usesConstantBus(const MachineRegisterInfo &MRI,
+                       const MachineOperand &MO,
+                       const MCOperandInfo &OpInfo) const;
+
+  /// \brief Return true if this instruction has any modifiers.
+  ///  e.g. src[012]_mod, omod, clamp.
+  bool hasModifiers(unsigned Opcode) const;
+
+  bool hasModifiersSet(const MachineInstr &MI,
+                       unsigned OpName) const;
+
+  bool verifyInstruction(const MachineInstr &MI,
+                         StringRef &ErrInfo) const override;
+
+  static unsigned getVALUOp(const MachineInstr &MI);
+
+  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+  /// \brief Return the correct register class for \p OpNo.  For target-specific
+  /// instructions, this will return the register class that has been defined
+  /// in tablegen.  For generic instructions, like REG_SEQUENCE it will return
+  /// the register class of its machine operand.
+  /// to infer the correct register class base on the other operands.
+  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;
+
+  /// \brief Return the size in bytes of the operand OpNo on the given
+  // instruction opcode.
+  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
+    const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
+
+    if (OpInfo.RegClass == -1) {
+      // If this is an immediate operand, this must be a 32-bit literal.
+      assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
+      return 4;
+    }
+
+    return RI.getRegClass(OpInfo.RegClass)->getSize();
+  }
+
+  /// \brief This form should usually be preferred since it handles operands
+  /// with unknown register classes.
+  unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
+    return getOpRegClass(MI, OpNo)->getSize();
+  }
+
+  /// \returns true if it is legal for the operand at index \p OpNo
+  /// to read a VGPR.
+  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+  /// a MOV.  For example:
+  /// ADD_I32_e32 VGPR0, 15
+  /// to
+  /// MOV VGPR1, 15
+  /// ADD_I32_e32 VGPR0, VGPR1
+  ///
+  /// If the operand being legalized is a register, then a COPY will be used
+  /// instead of MOV.
+  void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const;
+
+  /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+  /// for \p MI.
+  bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
+                      const MachineOperand *MO = nullptr) const;
+
+  /// \brief Check if \p MO would be a valid operand for the given operand
+  /// definition \p OpInfo. Note this does not attempt to validate constant bus
+  /// restrictions (e.g. literal constant usage).
+  bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+                          const MCOperandInfo &OpInfo,
+                          const MachineOperand &MO) const;
+
+  /// \brief Check if \p MO (a register operand) is a legal register for the
+  /// given operand description.
+  bool isLegalRegOperand(const MachineRegisterInfo &MRI,
+                         const MCOperandInfo &OpInfo,
+                         const MachineOperand &MO) const;
+
+  /// \brief Legalize operands in \p MI by either commuting it or inserting a
+  /// copy of src1.
+  void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+  /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+  void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+  /// Copy a value from a VGPR (\p SrcReg) to SGPR.  This function can only
+  /// be used when it is know that the value in SrcReg is same across all
+  /// threads in the wave.
+  /// \returns The SGPR register that \p SrcReg was copied to.
+  unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+                              MachineRegisterInfo &MRI) const;
+
+  void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
+
+  void legalizeGenericOperand(MachineBasicBlock &InsertMBB,
+                              MachineBasicBlock::iterator I,
+                              const TargetRegisterClass *DstRC,
+                              MachineOperand &Op, MachineRegisterInfo &MRI,
+                              const DebugLoc &DL) const;
+
+  /// \brief Legalize all operands in this instruction.  This function may
+  /// create new instruction and insert them before \p MI.
+  void legalizeOperands(MachineInstr &MI) const;
+
+  /// \brief Replace this instruction's opcode with the equivalent VALU
+  /// opcode.  This function will also move the users of \p MI to the
+  /// VALU if necessary.
+  void moveToVALU(MachineInstr &MI) const;
+
+  void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
+                        int Count) const;
+
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+  /// \brief Return the number of wait states that result from executing this
+  /// instruction.
+  unsigned getNumWaitStates(const MachineInstr &MI) const;
+
+  /// \brief Returns the operand named \p Op.  If \p MI does not have an
+  /// operand named \c Op, this function returns nullptr.
+  LLVM_READONLY
+  MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+
+  LLVM_READONLY
+  const MachineOperand *getNamedOperand(const MachineInstr &MI,
+                                        unsigned OpName) const {
+    return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
+  }
+
+  /// Get required immediate operand
+  int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+    int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+    return MI.getOperand(Idx).getImm();
+  }
+
+  uint64_t getDefaultRsrcDataFormat() const;
+  uint64_t getScratchRsrcWords23() const;
+
+  bool isLowLatencyInstruction(const MachineInstr &MI) const;
+  bool isHighLatencyInstruction(const MachineInstr &MI) const;
+
+  /// \brief Return the descriptor of the target-specific machine instruction
+  /// that corresponds to the specified pseudo or native opcode.
+  const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
+    return get(pseudoToMCOpcode(Opcode));
+  }
+
+  unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+  unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+
+  ArrayRef<std::pair<int, const char *>>
+  getSerializableTargetIndices() const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                 const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+};
+
+namespace AMDGPU {
+  LLVM_READONLY
+  int getVOPe64(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getVOPe32(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getCommuteRev(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getCommuteOrig(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getAddr64Inst(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getAtomicRetOp(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getAtomicNoRetOp(uint16_t Opcode);
+
+  LLVM_READONLY
+  int getSOPKOp(uint16_t Opcode);
+
+  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+  const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
+  const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
+  const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23);
+
+  // For MachineOperands.
+  enum TargetFlags {
+    TF_LONG_BRANCH_FORWARD = 1 << 0,
+    TF_LONG_BRANCH_BACKWARD = 1 << 1
+  };
+} // End namespace AMDGPU
+
+namespace SI {
+namespace KernelInputOffsets {
+
+/// Offsets in bytes from the start of the input buffer
+enum Offsets {
+  NGROUPS_X = 0,
+  NGROUPS_Y = 4,
+  NGROUPS_Z = 8,
+  GLOBAL_SIZE_X = 12,
+  GLOBAL_SIZE_Y = 16,
+  GLOBAL_SIZE_Z = 20,
+  LOCAL_SIZE_X = 24,
+  LOCAL_SIZE_Y = 28,
+  LOCAL_SIZE_Z = 32
+};
+
+} // End namespace KernelInputOffsets
+} // End namespace SI
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
new file mode 100644
index 000000000000..34096e158039
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -0,0 +1,1254 @@
+//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+def isCI : Predicate<"Subtarget->getGeneration() "
+                      ">= SISubtarget::SEA_ISLANDS">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+                         "SISubtarget::SEA_ISLANDS">,
+  AssemblerPredicate <"FeatureSeaIslands">;
+
+def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+
+// Execpt for the NONE field, this must be kept in sync with the
+// SIEncodingFamily enum in AMDGPUInstrInfo.cpp
+def SIEncodingFamily {
+  int NONE = -1;
+  int SI = 0;
+  int VI = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// SI DAG Nodes
+//===----------------------------------------------------------------------===//
+
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
+                      [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+  SDTypeProfile<0, 13,
+    [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
+     SDTCisVT<1, iAny>,   // vdata(VGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SDTBufferLoad : SDTypeProfile<1, 5,
+    [                    // vdata
+     SDTCisVT<1, v4i32>, // rsrc
+     SDTCisVT<2, i32>,   // vindex
+     SDTCisVT<3, i32>,   // offset
+     SDTCisVT<4, i1>,    // glc
+     SDTCisVT<5, i1>]>;  // slc
+
+def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
+                            [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
+                       SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
+def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
+  SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
+>;
+
+//===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+
+//===----------------------------------------------------------------------===//
+// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
+// to be glued to the memory instructions.
+//===----------------------------------------------------------------------===//
+
+def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+         cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def si_load_local_align8 : Aligned8Bytes <
+  (ops node:$ptr), (si_load_local node:$ptr)
+>;
+
+def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+  return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def si_az_extload_local : AZExtLoadBase <si_ld_local>;
+
+multiclass SIExtLoadLocal <PatFrag ld_node> {
+
+  def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+                     [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}]
+  >;
+
+  def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+                     [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}]
+  >;
+}
+
+defm si_sextload_local : SIExtLoadLocal <si_sextload_local>;
+defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
+
+def SIst_local : SDNode <"ISD::STORE", SDTStore,
+  [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_st_local : PatFrag <
+  (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def si_store_local : PatFrag <
+  (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+         !cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_store_local_align8 : Aligned8Bytes <
+  (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
+>;
+
+def si_truncstore_local : PatFrag <
+  (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_truncstore_local_i8 : PatFrag <
+  (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def si_truncstore_local_i16 : PatFrag <
+  (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def si_setcc_uniform : PatFrag <
+  (ops node:$lhs, node:$rhs, node:$cond),
+  (setcc node:$lhs, node:$rhs, node:$cond), [{
+  for (SDNode *Use : N->uses()) {
+    if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
+      return false;
+
+    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+    if (Reg != AMDGPU::SCC)
+      return false;
+  }
+  return true;
+}]>;
+
+def si_uniform_br : PatFrag <
+  (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{
+  return isUniformBr(N);
+}]>;
+
+def si_uniform_br_scc : PatFrag <
+  (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{
+  return isCBranchSCC(N);
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+
+  def _glue : SDNode <
+    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+    [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+  >;
+
+  def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+}
+
+defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
+defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
+defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
+defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
+defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
+defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
+defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
+defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
+defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+
+def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+  [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
+
+def as_i1imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8);
+}]>;
+
+def as_i16imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
+}]>;
+
+def as_i32imm: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def as_i64imm: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
+// Copied from the AArch64 backend:
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+  N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
+
+def SIMM16bit : PatLeaf <(imm),
+  [{return isInt<16>(N->getSExtValue());}]
+>;
+
+def IMM20bit : PatLeaf <(imm),
+  [{return isUInt<20>(N->getZExtValue());}]
+>;
+
+class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
+  return isInlineImmediate(N);
+}]>;
+
+class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
+  return isInlineImmediate(N);
+}]>;
+
+class VGPRImm <dag frag> : PatLeaf<frag, [{
+  if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  unsigned Limit = 0;
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+         Limit < 10 && U != E; ++U, ++Limit) {
+    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
+  }
+
+  return Limit < 10;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Custom Operands
+//===----------------------------------------------------------------------===//
+
+def SoppBrTarget : AsmOperandClass {
+  let Name = "SoppBrTarget";
+  let ParserMethod = "parseSOppBrTarget";
+}
+
+def sopp_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getSOPPBrEncoding";
+  let DecoderMethod = "decodeSoppBrTarget";
+  let OperandType = "OPERAND_PCREL";
+  let ParserMatchClass = SoppBrTarget;
+}
+
+def si_ga : Operand<iPTR>;
+
+def InterpSlotMatchClass : AsmOperandClass {
+  let Name = "InterpSlot";
+  let PredicateMethod = "isInterpSlot";
+  let ParserMethod = "parseInterpSlot";
+  let RenderMethod = "addImmOperands";
+}
+
+def InterpSlot : Operand<i32> {
+  let PrintMethod = "printInterpSlot";
+  let ParserMatchClass = InterpSlotMatchClass;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrMatchClass : AsmOperandClass {
+  let Name = "Attr";
+  let PredicateMethod = "isInterpAttr";
+  let ParserMethod = "parseInterpAttr";
+  let RenderMethod = "addImmOperands";
+}
+
+// It appears to be necessary to create a separate operand for this to
+// be able to parse attr<num> with no space.
+def Attr : Operand<i32> {
+  let PrintMethod = "printInterpAttr";
+  let ParserMatchClass = AttrMatchClass;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AttrChanMatchClass : AsmOperandClass {
+  let Name = "AttrChan";
+  let PredicateMethod = "isAttrChan";
+  let RenderMethod = "addImmOperands";
+}
+
+def AttrChan : Operand<i32> {
+  let PrintMethod = "printInterpAttrChan";
+  let ParserMatchClass = AttrChanMatchClass;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def SendMsgMatchClass : AsmOperandClass {
+  let Name = "SendMsg";
+  let PredicateMethod = "isSendMsg";
+  let ParserMethod = "parseSendMsgOp";
+  let RenderMethod = "addImmOperands";
+}
+
+def ExpTgtMatchClass : AsmOperandClass {
+  let Name = "ExpTgt";
+  let PredicateMethod = "isExpTgt";
+  let ParserMethod = "parseExpTgt";
+  let RenderMethod = "printExpTgt";
+}
+
+def SendMsgImm : Operand<i32> {
+  let PrintMethod = "printSendMsg";
+  let ParserMatchClass = SendMsgMatchClass;
+}
+
+def SWaitMatchClass : AsmOperandClass {
+  let Name = "SWaitCnt";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseSWaitCntOps";
+}
+
+def VReg32OrOffClass : AsmOperandClass {
+  let Name = "VReg32OrOff";
+  let ParserMethod = "parseVReg32OrOff";
+}
+
+def WAIT_FLAG : Operand <i32> {
+  let ParserMatchClass = SWaitMatchClass;
+  let PrintMethod = "printWaitFlag";
+}
+
+include "SIInstrFormats.td"
+include "VIInstrFormats.td"
+
+// ===----------------------------------------------------------------------===//
+// ExpSrc* Special cases for exp src operands which are printed as
+// "off" depending on en operand.
+// ===----------------------------------------------------------------------===//
+
+def ExpSrc0 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc0";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc1 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc1";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc2 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc2";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+def ExpSrc3 : RegisterOperand<VGPR_32> {
+  let PrintMethod = "printExpSrc3";
+  let ParserMatchClass = VReg32OrOffClass;
+}
+
+class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
+  let Name = "Imm"#CName;
+  let PredicateMethod = "is"#CName;
+  let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName);
+  let RenderMethod = "addImmOperands";
+  let IsOptional = Optional;
+  let DefaultMethod = !if(Optional, "default"#CName, ?);
+}
+
+class NamedOperandBit<string Name, AsmOperandClass MatchClass> : Operand<i1> {
+  let PrintMethod = "print"#Name;
+  let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
+  let PrintMethod = "print"#Name;
+  let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+  let PrintMethod = "print"#Name;
+  let ParserMatchClass = MatchClass;
+}
+
+class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
+  let PrintMethod = "print"#Name;
+  let ParserMatchClass = MatchClass;
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+
+def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
+def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
+def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
+
+def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
+def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
+def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
+
+def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>;
+
+def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
+def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>;
+
+def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
+def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
+def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
+def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
+def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>;
+def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
+def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
+def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
+
+def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
+
+def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
+def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
+def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>;
+
+def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
+def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
+def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
+def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
+
+def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+
+def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
+class KImmMatchClass<int size> : AsmOperandClass {
+  let Name = "KImmFP"#size;
+  let PredicateMethod = "isKImmFP"#size;
+  let ParserMethod = "parseImm";
+  let RenderMethod = "addKImmFP"#size#"Operands";
+}
+
+class kimmOperand<ValueType vt> : Operand<vt> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_KIMM"#vt.Size;
+  let PrintMethod = "printU"#vt.Size#"ImmOperand";
+  let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
+}
+
+// 32-bit VALU immediate operand that uses the constant bus.
+def KImmFP32MatchClass : KImmMatchClass<32>;
+def f32kimm : kimmOperand<i32>;
+
+// 32-bit VALU immediate operand with a 16-bit value that uses the
+// constant bus.
+def KImmFP16MatchClass : KImmMatchClass<16>;
+def f16kimm : kimmOperand<i16>;
+
+
+def VOPDstS64 : VOPDstOperand <SReg_64>;
+
+class FPInputModsMatchClass <int opSize> : AsmOperandClass {
+  let Name = "RegOrImmWithFP"#opSize#"InputMods";
+  let ParserMethod = "parseRegOrImmWithFPInputMods";
+  let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
+}
+def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
+def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
+def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
+
+class InputMods <AsmOperandClass matchClass> : Operand <i32> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_INPUT_MODS";
+  let ParserMatchClass = matchClass;
+}
+
+class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
+  let PrintMethod = "printOperandAndFPInputMods";
+}
+
+def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
+def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
+def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
+
+class IntInputModsMatchClass <int opSize> : AsmOperandClass {
+  let Name = "RegOrImmWithInt"#opSize#"InputMods";
+  let ParserMethod = "parseRegOrImmWithIntInputMods";
+  let PredicateMethod = "isRegOrImmWithInt"#opSize#"InputMods";
+}
+def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
+def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
+
+class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
+  let PrintMethod = "printOperandAndIntInputMods";
+}
+def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
+def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+
+//===----------------------------------------------------------------------===//
+// Complex patterns
+//===----------------------------------------------------------------------===//
+
+def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
+def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
+
+def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
+
+def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
+def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
+def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
+def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+
+//===----------------------------------------------------------------------===//
+// SI assembler operands
+//===----------------------------------------------------------------------===//
+
+def SIOperand {
+  int ZERO = 0x80;
+  int VCC = 0x6A;
+  int FLAT_SCR = 0x68;
+}
+
+def SRCMODS {
+  int NONE = 0;
+  int NEG = 1;
+}
+
+def DSTCLAMP {
+  int NONE = 0;
+}
+
+def DSTOMOD {
+  int NONE = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction multiclass helpers.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
+// encoding is the standard encoding, but instruction that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
+//
+//===----------------------------------------------------------------------===//
+
+class SIMCInstr <string pseudo, int subtarget> {
+  string PseudoInstr = pseudo;
+  int Subtarget = subtarget;
+}
+
+//===----------------------------------------------------------------------===//
+// EXP classes
+//===----------------------------------------------------------------------===//
+
+class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
+  (outs),
+  (ins exp_tgt:$tgt,
+       ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
+       exp_vm:$vm, exp_compr:$compr, i8imm:$en),
+  "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
+  [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
+         f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+  let AsmMatchConverter = "cvtExp";
+}
+
+// Split EXP instruction into EXP and EXP_DONE so we can set
+// mayLoad for done=1.
+multiclass EXP_m<bit done, SDPatternOperator node> {
+  let mayLoad = done in {
+    let isPseudo = 1, isCodeGenOnly = 1 in {
+      def "" : EXP_Helper<done, node>,
+               SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
+    }
+
+    let done = done in {
+      def _si : EXP_Helper<done>,
+                SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
+                EXPe {
+        let DecoderNamespace = "SICI";
+        let DisableDecoder = DisableSIDecoder;
+      }
+
+      def _vi : EXP_Helper<done>,
+                SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
+                EXPe_vi {
+        let DecoderNamespace = "VI";
+        let DisableDecoder = DisableVIDecoder;
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU classes
+//===----------------------------------------------------------------------===//
+
+class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
+  int ret =
+    !if (!eq(Src0.Value, untyped.Value),      0,
+      !if (!eq(Src1.Value, untyped.Value),    1,   // VOP1
+         !if (!eq(Src2.Value, untyped.Value), 2,   // VOP2
+                                              3))); // VOP3
+}
+
+// Returns the register class to use for the destination of VOP[123C]
+// instructions for the given VT.
+class getVALUDstForVT<ValueType VT> {
+  RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
+                          !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
+                            !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
+                              !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+                              VOPDstOperand<SReg_64>)))); // else VT == i1
+}
+
+// Returns the register class to use for source 0 of VOP[12C]
+// instructions for the given VT.
+class getVOPSrc0ForVT<ValueType VT> {
+  bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+             !if(!eq(VT.Value, f32.Value), 1,
+             !if(!eq(VT.Value, f64.Value), 1,
+             0)));
+  RegisterOperand ret = !if(isFP,
+                            !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
+                            !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
+}
+
+// Returns the vreg register class to use for source operand given VT
+class getVregSrcForVT<ValueType VT> {
+  RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
+                        !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
+}
+
+
+// Returns the register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3SrcForVT<ValueType VT> {
+  bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+             !if(!eq(VT.Value, f32.Value), 1,
+             !if(!eq(VT.Value, f64.Value), 1,
+             0)));
+  RegisterOperand ret =
+  !if(!eq(VT.Size, 128),
+      VSrc_128,
+    !if(!eq(VT.Size, 64),
+        !if(isFP,
+            VCSrc_f64,
+            VCSrc_b64),
+        !if(!eq(VT.Value, i1.Value),
+            SCSrc_b64,
+            !if(isFP,
+                !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
+                !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
+            )
+         )
+	   )
+     );
+}
+
+// Returns 1 if the source arguments have modifiers, 0 if they do not.
+// XXX - do f16 instructions?
+class isFloatType<ValueType SrcVT> {
+  bit ret =
+    !if(!eq(SrcVT.Value, f16.Value), 1,
+    !if(!eq(SrcVT.Value, f32.Value), 1,
+    !if(!eq(SrcVT.Value, f64.Value), 1,
+    0)));
+}
+
+class isIntType<ValueType SrcVT> {
+  bit ret =
+    !if(!eq(SrcVT.Value, i16.Value), 1,
+    !if(!eq(SrcVT.Value, i32.Value), 1,
+    !if(!eq(SrcVT.Value, i64.Value), 1,
+    0)));
+}
+
+
+// Return type of input modifiers operand for specified input operand
+class getSrcMod <ValueType VT> {
+  bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+               !if(!eq(VT.Value, f32.Value), 1,
+               !if(!eq(VT.Value, f64.Value), 1,
+               0)));
+  Operand ret =  !if(!eq(VT.Size, 64),
+                     !if(isFP, FP64InputMods, Int64InputMods),
+                       !if(isFP,
+                         !if(!eq(VT.Value, f16.Value),
+                            FP16InputMods,
+                            FP32InputMods
+                          ),
+                         Int32InputMods)
+                     );
+}
+
+// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
+class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+  dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1
+            !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
+                                    (ins)));
+}
+
+// Returns the input arguments for VOP3 instructions for the given SrcVT.
+class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
+                RegisterOperand Src2RC, int NumSrcArgs,
+                bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
+                Operand Src2Mod> {
+
+  dag ret =
+    !if (!eq(NumSrcArgs, 0),
+      // VOP1 without input operands (V_NOP, V_CLREXCP)
+      (ins),
+      /* else */
+    !if (!eq(NumSrcArgs, 1),
+      !if (!eq(HasModifiers, 1),
+        // VOP1 with modifiers
+        (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+             clampmod:$clamp, omod:$omod)
+      /* else */,
+        // VOP1 without modifiers
+        (ins Src0RC:$src0)
+      /* endif */ ),
+    !if (!eq(NumSrcArgs, 2),
+      !if (!eq(HasModifiers, 1),
+        // VOP 2 with modifiers
+        (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+             Src1Mod:$src1_modifiers, Src1RC:$src1,
+             clampmod:$clamp, omod:$omod)
+      /* else */,
+        // VOP2 without modifiers
+        (ins Src0RC:$src0, Src1RC:$src1)
+      /* endif */ )
+    /* NumSrcArgs == 3 */,
+      !if (!eq(HasModifiers, 1),
+        // VOP3 with modifiers
+        (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+             Src1Mod:$src1_modifiers, Src1RC:$src1,
+             Src2Mod:$src2_modifiers, Src2RC:$src2,
+             clampmod:$clamp, omod:$omod)
+      /* else */,
+        // VOP3 without modifiers
+        (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
+      /* endif */ ))));
+}
+
+class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+                 bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
+
+  dag ret = !if (!eq(NumSrcArgs, 0),
+                // VOP1 without input operands (V_NOP)
+                (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl),
+            !if (!eq(NumSrcArgs, 1),
+              !if (!eq(HasModifiers, 1),
+                // VOP1_DPP with modifiers
+                (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+              /* else */,
+                // VOP1_DPP without modifiers
+                (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+              /* endif */)
+              /* NumSrcArgs == 2 */,
+              !if (!eq(HasModifiers, 1),
+                // VOP2_DPP with modifiers
+                (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                     Src1Mod:$src1_modifiers, Src1RC:$src1,
+                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+              /* else */,
+                // VOP2_DPP without modifiers
+                (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+                row_mask:$row_mask, bank_mask:$bank_mask,
+                bound_ctrl:$bound_ctrl)
+             /* endif */)));
+}
+
+class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
+                  bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod,
+                  ValueType DstVT> {
+
+  dag ret = !if(!eq(NumSrcArgs, 0),
+               // VOP1 without input operands (V_NOP)
+               (ins),
+            !if(!eq(NumSrcArgs, 1),
+               (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                    clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                    src0_sel:$src0_sel),
+            !if(!eq(NumSrcArgs, 2),
+               !if(!eq(DstVT.Size, 1),
+                  // VOPC_SDWA with modifiers
+                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                       Src1Mod:$src1_modifiers, Src1RC:$src1,
+                       clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
+                  // VOP2_SDWA or VOPC_SDWA with modifiers
+                  (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                       Src1Mod:$src1_modifiers, Src1RC:$src1,
+                       clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                       src0_sel:$src0_sel, src1_sel:$src1_sel)),
+            (ins)/* endif */)));
+}
+
+// Outs for DPP and SDWA
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+  dag ret = !if(HasDst,
+                !if(!eq(DstVT.Size, 1),
+                    (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions
+                    (outs DstRCDPP:$vdst)),
+                (outs)); // V_NOP
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP[12C]
+// instruction.  This does not add the _e32 suffix, so it can be reused
+// by getAsm64.
+class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
+  string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
+  string src0 = ", $src0";
+  string src1 = ", $src1";
+  string src2 = ", $src2";
+  string ret = !if(HasDst, dst, "") #
+               !if(!eq(NumSrcArgs, 1), src0, "") #
+               !if(!eq(NumSrcArgs, 2), src0#src1, "") #
+               !if(!eq(NumSrcArgs, 3), src0#src1#src2, "");
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3
+// instruction.
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+  string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+  string src1 = !if(!eq(NumSrcArgs, 1), "",
+                   !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+                                           " $src1_modifiers,"));
+  string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+  string ret =
+  !if(!eq(HasModifiers, 0),
+      getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
+      dst#", "#src0#src1#src2#"$clamp"#"$omod");
+}
+
+class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+  string dst = !if(HasDst,
+                   !if(!eq(DstVT.Size, 1),
+                       "$sdst",
+                       "$vdst"),
+                    ""); // use $sdst for VOPC
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
+  string src1 = !if(!eq(NumSrcArgs, 1), "",
+                   !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+                                           " $src1_modifiers,"));
+  string args = !if(!eq(HasModifiers, 0),
+                     getAsm32<0, NumSrcArgs, DstVT>.ret,
+                     ", "#src0#src1);
+  string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+}
+
+class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
+                  ValueType DstVT = i32> {
+  string dst = !if(HasDst,
+                   !if(!eq(DstVT.Size, 1),
+                       " vcc", // use vcc token as dst for VOPC instructioins
+                       "$vdst"),
+                    "");
+  string src0 = "$src0_modifiers";
+  string src1 = "$src1_modifiers";
+  string args = !if(!eq(NumSrcArgs, 0),
+                    "",
+                    !if(!eq(NumSrcArgs, 1),
+                        ", "#src0#"$clamp",
+                        ", "#src0#", "#src1#"$clamp"
+                     )
+                );
+  string sdwa = !if(!eq(NumSrcArgs, 0),
+                    "",
+                    !if(!eq(NumSrcArgs, 1),
+                        " $dst_sel $dst_unused $src0_sel",
+                        !if(!eq(DstVT.Size, 1),
+                            " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC
+                            " $dst_sel $dst_unused $src0_sel $src1_sel"
+                        )
+                    )
+                );
+  string ret = dst#args#sdwa;
+}
+
+// Function that checks if instruction supports DPP and SDWA
+class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
+                 ValueType Src1VT = i32> {
+  bit ret = !if(!eq(NumSrcArgs, 3),
+                0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3
+                !if(!eq(DstVT.Size, 64),
+                    0, // 64-bit dst - No DPP or SDWA for 64-bit operands
+                    !if(!eq(Src0VT.Size, 64),
+                        0, // 64-bit src0
+                        !if(!eq(Src0VT.Size, 64),
+                            0, // 64-bit src2
+                            1
+                        )
+                    )
+                )
+            );
+}
+
+class BitOr<bit a, bit b> {
+  bit ret = !if(a, 1, !if(b, 1, 0));
+}
+
+class BitAnd<bit a, bit b> {
+  bit ret = !if(a, !if(b, 1, 0), 0);
+}
+
+class VOPProfile <list<ValueType> _ArgVT> {
+
+  field list<ValueType> ArgVT = _ArgVT;
+
+  field ValueType DstVT = ArgVT[0];
+  field ValueType Src0VT = ArgVT[1];
+  field ValueType Src1VT = ArgVT[2];
+  field ValueType Src2VT = ArgVT[3];
+  field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
+  field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
+  field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
+  field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
+  field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+  field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+  field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
+  field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
+  field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
+  field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
+  field Operand Src0Mod = getSrcMod<Src0VT>.ret;
+  field Operand Src1Mod = getSrcMod<Src1VT>.ret;
+  field Operand Src2Mod = getSrcMod<Src2VT>.ret;
+
+  field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
+  field bit HasDst32 = HasDst;
+  field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
+  field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
+  field bit HasSrc0 = !if(!eq(Src0VT.Value, untyped.Value), 0, 1);
+  field bit HasSrc1 = !if(!eq(Src1VT.Value, untyped.Value), 0, 1);
+  field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
+
+  // TODO: Modifiers logic is somewhat adhoc here, to be refined later
+  field bit HasModifiers = isFloatType<Src0VT>.ret;
+
+  field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
+  field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
+  field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+
+  field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
+  field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
+  field bit HasSrc2IntMods = isIntType<Src2VT>.ret;
+
+  field bit HasSrc0Mods = HasModifiers;
+  field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
+  field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
+
+  field bit HasOMod = HasModifiers;
+  field bit HasClamp = HasModifiers;
+  field bit HasSDWAClamp = HasSrc0;
+
+  field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+
+  field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
+
+  // VOP3b instructions are a special case with a second explicit
+  // output. This is manually overridden for them.
+  field dag Outs32 = Outs;
+  field dag Outs64 = Outs;
+  field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+  field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
+
+  field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
+  field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+                             HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+  field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
+                               HasModifiers, Src0Mod, Src1Mod>.ret;
+  field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
+                                 HasModifiers, Src0Mod, Src1Mod, DstVT>.ret;
+
+  field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
+  field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+}
+
+class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
+  let HasExt = 0;
+}
+
+def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
+def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
+
+def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
+def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
+def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+
+def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+
+def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
+
+def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
+def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
+def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
+def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>;
+def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>;
+def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
+def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
+def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+
+def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
+def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
+def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
+def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
+def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
+def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+
+def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
+def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
+
+def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>;
+def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
+def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
+def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
+def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>;
+def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>;
+def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>;
+
+class Commutable_REV <string revOp, bit isOrig> {
+  string RevOp = revOp;
+  bit IsOrig = isOrig;
+}
+
+class AtomicNoRet <string noRetOp, bit isRet> {
+  string NoRetOp = noRetOp;
+  bit IsRet = isRet;
+}
+
+//===----------------------------------------------------------------------===//
+// Interpolation opcodes
+//===----------------------------------------------------------------------===//
+
+class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+  VINTRPCommon <outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins,
+                      string asm> :
+  VINTRPCommon <outs, ins, asm, []>,
+  VINTRPe <op>,
+  SIMCInstr<opName, SIEncodingFamily.SI> {
+  let AssemblerPredicate = SIAssemblerPredicate;
+  let DecoderNamespace = "SICI";
+  let DisableDecoder = DisableSIDecoder;
+}
+
+class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins,
+                      string asm> :
+  VINTRPCommon <outs, ins, asm, []>,
+  VINTRPe_vi <op>,
+  SIMCInstr<opName, SIEncodingFamily.VI> {
+  let AssemblerPredicate = VIAssemblerPredicate;
+  let DecoderNamespace = "VI";
+  let DisableDecoder = DisableVIDecoder;
+}
+
+multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm,
+                     list<dag> pattern = []> {
+  def "" : VINTRP_Pseudo <NAME, outs, ins, pattern>;
+
+  def _si : VINTRP_Real_si <op, NAME, outs, ins, asm>;
+
+  def _vi : VINTRP_Real_vi <op, NAME, outs, ins, asm>;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector instruction mappings
+//===----------------------------------------------------------------------===//
+
+// Maps an opcode in e32 form to its e64 equivalent
+def getVOPe64 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size", "VOP3"];
+  let KeyCol = ["4", "0"];
+  let ValueCols = [["8", "1"]];
+}
+
+// Maps an opcode in e64 form to its e32 equivalent
+def getVOPe32 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size", "VOP3"];
+  let KeyCol = ["8", "1"];
+  let ValueCols = [["4", "0"]];
+}
+
+def getMaskedMIMGOp : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["4"];
+  let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
+// Maps an commuted opcode to its original version
+def getCommuteOrig : InstrMapping {
+  let FilterClass = "Commutable_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+  let FilterClass = "Commutable_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+def getMCOpcodeGen : InstrMapping {
+  let FilterClass = "SIMCInstr";
+  let RowFields = ["PseudoInstr"];
+  let ColFields = ["Subtarget"];
+  let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
+  let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
+                   [!cast<string>(SIEncodingFamily.VI)]];
+}
+
+// Get equivalent SOPK instruction.
+def getSOPKOp : InstrMapping {
+  let FilterClass = "SOPKInstTable";
+  let RowFields = ["BaseCmpOp"];
+  let ColFields = ["IsSOPK"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+def getAddr64Inst : InstrMapping {
+  let FilterClass = "MUBUFAddr64Table";
+  let RowFields = ["OpName"];
+  let ColFields = ["IsAddr64"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its version with a return value.
+def getAtomicRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet";
+  let RowFields = ["NoRetOp"];
+  let ColFields = ["IsRet"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+// Maps an atomic opcode to its returnless version.
+def getAtomicNoRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet";
+  let RowFields = ["NoRetOp"];
+  let ColFields = ["IsRet"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+include "SIInstructions.td"
+include "CIInstructions.td"
+
+include "DSInstructions.td"
+include "MIMGInstructions.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
new file mode 100644
index 000000000000..bc35c2edc8d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -0,0 +1,1089 @@
+//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out.  Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+def isGCN : Predicate<"Subtarget->getGeneration() "
+                      ">= SISubtarget::SOUTHERN_ISLANDS">,
+            AssemblerPredicate<"FeatureGCN">;
+def isSI : Predicate<"Subtarget->getGeneration() "
+                      "== SISubtarget::SOUTHERN_ISLANDS">,
+           AssemblerPredicate<"FeatureSouthernIslands">;
+
+def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
+def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
+def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
+                      AssemblerPredicate<"FeatureVGPRIndexMode">;
+def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
+                AssemblerPredicate<"FeatureMovrel">;
+
+include "VOPInstructions.td"
+include "SOPInstructions.td"
+include "SMInstructions.td"
+include "FLATInstructions.td"
+include "BUFInstructions.td"
+
+let SubtargetPredicate = isGCN in {
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+defm EXP : EXP_m<0, AMDGPUexport>;
+defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
+let Uses = [M0, EXEC] in {
+
+// FIXME: Specify SchedRW for VINTRP insturctions.
+
+multiclass V_INTERP_P1_F32_m : VINTRP_m <
+  0x00000000,
+  (outs VGPR_32:$vdst),
+  (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+  "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
+  [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
+                                               (i32 imm:$attr)))]
+>;
+
+let OtherPredicates = [has32BankLDS] in {
+
+defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
+
+} // End OtherPredicates = [has32BankLDS]
+
+let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
+
+defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
+
+} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
+
+let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
+
+defm V_INTERP_P2_F32 : VINTRP_m <
+  0x00000001,
+  (outs VGPR_32:$vdst),
+  (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+  "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
+  [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
+                                                          (i32 imm:$attr)))]>;
+
+} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
+
+defm V_INTERP_MOV_F32 : VINTRP_m <
+  0x00000002,
+  (outs VGPR_32:$vdst),
+  (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
+  "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
+  [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
+                                     (i32 imm:$attr)))]>;
+
+} // End Uses = [M0, EXEC]
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+
+// For use in patterns
+def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
+  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let usesCustomInserter = 1;
+}
+
+// 64-bit vector move instruction.  This is mainly used by the SIFoldOperands
+// pass to enable folding of inline immediates.
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
+                                      (ins VSrc_b64:$src0)>;
+} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+
+let usesCustomInserter = 1, SALU = 1 in {
+def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
+  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
+} // End let usesCustomInserter = 1, SALU = 1
+
+def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
+def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
+def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
+   (ins SSrc_b64:$src0, SSrc_b64:$src1)> {
+  let SALU = 1;
+  let isAsCheapAsAMove = 1;
+  let isTerminator = 1;
+}
+
+def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
+  [(int_amdgcn_wave_barrier)]> {
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let isBarrier = 1;
+  let isConvergent = 1;
+}
+
+// SI pseudo instructions. These are used by the CFG structurizer pass
+// and should be lowered to ISA instructions prior to codegen.
+
+// Dummy terminator instruction to use after control flow instructions
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : PseudoInstSI <
+  (outs), (ins brtarget:$target)> {
+  let isBranch = 0;
+  let isTerminator = 1;
+  let isBarrier = 0;
+  let Uses = [EXEC];
+  let SchedRW = [];
+  let hasNoSchedulingInfo = 1;
+}
+
+let isTerminator = 1 in {
+
+def SI_IF: CFPseudoInstSI <
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
+  [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+  let Constraints = "";
+  let Size = 12;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
+}
+
+def SI_ELSE : CFPseudoInstSI <
+  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  let Constraints = "$src = $dst";
+  let Size = 12;
+  let mayStore = 1;
+  let mayLoad = 1;
+  let hasSideEffects = 1;
+}
+
+def SI_LOOP : CFPseudoInstSI <
+  (outs), (ins SReg_64:$saved, brtarget:$target),
+  [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+  let Size = 8;
+  let isBranch = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+} // End isBranch = 1, isTerminator = 1
+
+def SI_END_CF : CFPseudoInstSI <
+  (outs), (ins SReg_64:$saved),
+  [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
+  let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
+}
+
+def SI_BREAK : CFPseudoInstSI <
+  (outs SReg_64:$dst), (ins SReg_64:$src),
+  [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
+  let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+}
+
+def SI_IF_BREAK : CFPseudoInstSI <
+  (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
+  [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
+  let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+}
+
+def SI_ELSE_BREAK : CFPseudoInstSI <
+  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
+  [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
+  let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+}
+
+let Uses = [EXEC], Defs = [EXEC,VCC] in {
+def SI_KILL : PseudoInstSI <
+  (outs), (ins VSrc_b32:$src),
+  [(AMDGPUkill i32:$src)]> {
+  let isConvergent = 1;
+  let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : SPseudoInstSI <
+  (outs), (ins VSrc_b32:$src)> {
+  let isTerminator = 1;
+}
+
+} // End Uses = [EXEC], Defs = [EXEC,VCC]
+
+// Branch on undef scc. Used to avoid intermediate copy from
+// IMPLICIT_DEF to SCC.
+def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
+  let isTerminator = 1;
+  let usesCustomInserter = 1;
+}
+
+def SI_PS_LIVE : PseudoInstSI <
+  (outs SReg_64:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_ps_live))]> {
+  let SALU = 1;
+}
+
+// Used as an isel pseudo to directly emit initialization with an
+// s_mov_b32 rather than a copy of another initialized
+// register. MachineCSE skips copies, and we don't want to have to
+// fold operands before it runs.
+def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
+  let Defs = [M0];
+  let usesCustomInserter = 1;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+}
+
+def SI_RETURN : SPseudoInstSI <
+  (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+  let hasSideEffects = 1;
+  let hasNoSchedulingInfo = 1;
+  let DisableWQM = 1;
+}
+
+let Defs = [M0, EXEC],
+  UseNamedOperandTable = 1 in {
+
+class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
+  (outs VGPR_32:$vdst),
+  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
+  let usesCustomInserter = 1;
+}
+
+class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
+  (outs rc:$vdst),
+  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+  let Constraints = "$src = $vdst";
+  let usesCustomInserter = 1;
+}
+
+// TODO: We can support indirect SGPR access.
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
+def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
+def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
+def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // End Uses = [EXEC], Defs = [M0, EXEC]
+
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
+  let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
+    def _SAVE : PseudoInstSI <
+      (outs),
+      (ins sgpr_class:$data, i32imm:$addr)> {
+      let mayStore = 1;
+      let mayLoad = 0;
+    }
+
+    def _RESTORE : PseudoInstSI <
+      (outs sgpr_class:$data),
+      (ins i32imm:$addr)> {
+      let mayStore = 0;
+      let mayLoad = 1;
+    }
+  } // End UseNamedOperandTable = 1
+}
+
+// You cannot use M0 as the output of v_readlane_b32 instructions or
+// use it in the sdata operand of SMEM instructions. We still need to
+// be able to spill the physical register m0, so allow it for
+// SI_SPILL_32_* instructions.
+defm SI_SPILL_S32  : SI_SPILL_SGPR <SReg_32>;
+defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
+  let UseNamedOperandTable = 1, VGPRSpill = 1,
+       SchedRW = [WriteVMEM] in {
+    def _SAVE : VPseudoInstSI <
+      (outs),
+      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+           SReg_32:$soffset, i32imm:$offset)> {
+      let mayStore = 1;
+      let mayLoad = 0;
+      // (2 * 4) + (8 * num_subregs) bytes maximum
+      let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+    }
+
+    def _RESTORE : VPseudoInstSI <
+      (outs vgpr_class:$vdata),
+      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+           i32imm:$offset)> {
+      let mayStore = 0;
+      let mayLoad = 1;
+
+      // (2 * 4) + (8 * num_subregs) bytes maximum
+      let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+    }
+  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_V32  : SI_SPILL_VGPR <VGPR_32>;
+defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
+def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
+  (outs SReg_64:$dst),
+  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
+  [(set SReg_64:$dst,
+   (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
+  let Defs = [SCC];
+}
+
+} // End SubtargetPredicate = isGCN
+
+let Predicates = [isGCN] in {
+
+def : Pat<
+  (int_amdgcn_else i64:$src, bb:$target),
+  (SI_ELSE $src, $target, 0)
+>;
+
+def : Pat <
+  (int_AMDGPU_kilp),
+  (SI_KILL (i32 0xbf800000))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [UnsafeFPMath] in {
+
+//def : RcpPat<V_RCP_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F64_e32, f64>;
+//defm : RsqPat<V_RSQ_F32_e32, f32>;
+
+def : RsqPat<V_RSQ_F32_e32, f32>;
+def : RsqPat<V_RSQ_F64_e32, f64>;
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+  (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+             (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
+  (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+  (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+             (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
+  (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [UnsafeFPMath]
+
+def : Pat <
+  (f32 (fpextend f16:$src)),
+  (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat <
+  (f64 (fpextend f16:$src)),
+  (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
+def : Pat <
+  (f16 (fpround f32:$src)),
+  (V_CVT_F16_F32_e32 $src)
+>;
+
+def : Pat <
+  (f16 (fpround f64:$src)),
+  (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
+>;
+
+def : Pat <
+  (i32 (fp_to_sint f16:$src)),
+  (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
+def : Pat <
+  (i32 (fp_to_uint f16:$src)),
+  (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
+def : Pat <
+  (f16 (sint_to_fp i32:$src)),
+  (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
+>;
+
+def : Pat <
+  (f16 (uint_to_fp i32:$src)),
+  (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass FMADPat <ValueType vt, Instruction inst> {
+  def : Pat <
+    (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+              (VOP3NoMods  vt:$src1, i32:$src1_modifiers),
+              (VOP3NoMods  vt:$src2, i32:$src2_modifiers))),
+    (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+          $src2_modifiers, $src2, $clamp, $omod)
+  >;
+}
+
+defm : FMADPat <f16, V_MAC_F16_e64>;
+defm : FMADPat <f32, V_MAC_F32_e64>;
+
+multiclass SelectPat <ValueType vt, Instruction inst> {
+  def : Pat <
+    (vt (select i1:$src0, vt:$src1, vt:$src2)),
+    (inst $src2, $src1, $src0)
+  >;
+}
+
+defm : SelectPat <i16, V_CNDMASK_B32_e64>;
+defm : SelectPat <i32, V_CNDMASK_B32_e64>;
+defm : SelectPat <f16, V_CNDMASK_B32_e64>;
+defm : SelectPat <f32, V_CNDMASK_B32_e64>;
+
+def : Pat <
+  (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
+  (V_BCNT_U32_B32_e64 $popcnt, $val)
+>;
+
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting  **********/
+/********** ============================================ **********/
+
+foreach Index = 0-2 in {
+  def Extract_Element_v2i32_#Index : Extract_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2i32_#Index : Insert_Element <
+    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v2f32_#Index : Extract_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2f32_#Index : Insert_Element <
+    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-3 in {
+  def Extract_Element_v4i32_#Index : Extract_Element <
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4i32_#Index : Insert_Element <
+    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v4f32_#Index : Extract_Element <
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4f32_#Index : Insert_Element <
+    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-7 in {
+  def Extract_Element_v8i32_#Index : Extract_Element <
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8i32_#Index : Insert_Element <
+    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v8f32_#Index : Extract_Element <
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8f32_#Index : Insert_Element <
+    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-15 in {
+  def Extract_Element_v16i32_#Index : Extract_Element <
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16i32_#Index : Insert_Element <
+    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v16f32_#Index : Extract_Element <
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16f32_#Index : Insert_Element <
+    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+// FIXME: Why do only some of these type combinations for SReg and
+// VReg?
+// 16-bit bitcast
+def : BitConvert <i16, f16, VGPR_32>;
+def : BitConvert <f16, i16, VGPR_32>;
+def : BitConvert <i16, f16, SReg_32>;
+def : BitConvert <f16, i16, SReg_32>;
+
+// 32-bit bitcast
+def : BitConvert <i32, f32, VGPR_32>;
+def : BitConvert <f32, i32, VGPR_32>;
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <f32, i32, SReg_32>;
+
+// 64-bit bitcast
+def : BitConvert <i64, f64, VReg_64>;
+def : BitConvert <f64, i64, VReg_64>;
+def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2f32, v2i32, VReg_64>;
+def : BitConvert <i64, v2i32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
+def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, i64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
+def : BitConvert <f64, v2i32, VReg_64>;
+def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4f32, v4i32, VReg_128>;
+
+// 128-bit bitcast
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+def : BitConvert <v2i64, v2f64, VReg_128>;
+def : BitConvert <v2f64, v2i64, VReg_128>;
+
+// 256-bit bitcast
+def : BitConvert <v8i32, v8f32, SReg_256>;
+def : BitConvert <v8f32, v8i32, SReg_256>;
+def : BitConvert <v8i32, v8f32, VReg_256>;
+def : BitConvert <v8f32, v8i32, VReg_256>;
+
+// 512-bit bitcast
+def : BitConvert <v16i32, v16f32, VReg_512>;
+def : BitConvert <v16f32, v16i32, VReg_512>;
+
+/********** =================== **********/
+/********** Src & Dst modifiers **********/
+/********** =================== **********/
+
+def : Pat <
+  (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
+               (f32 FP_ZERO), (f32 FP_ONE)),
+  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
+>;
+
+/********** ================================ **********/
+/********** Floating point absolute/negative **********/
+/********** ================================ **********/
+
+// Prevent expanding both fneg and fabs.
+
+def : Pat <
+  (fneg (fabs f32:$src)),
+  (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
+>;
+
+// FIXME: Should use S_OR_B32
+def : Pat <
+  (fneg (fabs f64:$src)),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                  (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
+    sub1)
+>;
+
+def : Pat <
+  (fabs f32:$src),
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
+>;
+
+def : Pat <
+  (fneg f32:$src),
+  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
+>;
+
+def : Pat <
+  (fabs f64:$src),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                   (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit.
+     sub1)
+>;
+
+def : Pat <
+  (fneg f64:$src),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                   (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
+    sub1)
+>;
+
+def : Pat <
+  (fneg f16:$src),
+  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
+>;
+
+def : Pat <
+  (fabs f16:$src),
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
+>;
+
+def : Pat <
+  (fneg (fabs f16:$src)),
+  (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
+
+/********** ================== **********/
+/********** Immediate Patterns **********/
+/********** ================== **********/
+
+def : Pat <
+  (VGPRImm<(i32 imm)>:$imm),
+  (V_MOV_B32_e32 imm:$imm)
+>;
+
+def : Pat <
+  (VGPRImm<(f32 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+  (i32 imm:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+// FIXME: Workaround for ordering issue with peephole optimizer where
+// a register class copy interferes with immediate folding.  Should
+// use s_mov_b32, which can be shrunk to s_movk_i32
+def : Pat <
+  (VGPRImm<(f16 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+  (f32 fpimm:$imm),
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+  (f16 fpimm:$imm),
+  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : Pat <
+  (i64 InlineImm<i64>:$imm),
+  (S_MOV_B64 InlineImm<i64>:$imm)
+>;
+
+// XXX - Should this use a s_cmp to set SCC?
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : Pat <
+  (i1 imm:$imm),
+  (S_MOV_B64 (i64 (as_i64imm $imm)))
+>;
+
+def : Pat <
+  (f64 InlineFPImm<f64>:$imm),
+  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
+
+def : Pat <
+  (int_AMDGPU_cube v4f32:$src),
+  (REG_SEQUENCE VReg_128,
+    (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
+                  0 /* clamp */, 0 /* omod */), sub0,
+    (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+                  0 /* clamp */, 0 /* omod */), sub1,
+    (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+                  0 /* clamp */, 0 /* omod */), sub2,
+    (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
+                  0 /* clamp */, 0 /* omod */), sub3)
+>;
+
+def : Pat <
+  (i32 (sext i1:$src0)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
+>;
+
+class Ext32Pat <SDNode ext> : Pat <
+  (i32 (ext i1:$src0)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
+>;
+
+def : Ext32Pat <zext>;
+def : Ext32Pat <anyext>;
+
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (V_CVT_U32_F32_e32
+    (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+                   (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Patterns
+//===----------------------------------------------------------------------===//
+
+def : IMad24Pat<V_MAD_I32_I24>;
+def : UMad24Pat<V_MAD_U32_U24>;
+
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
+/********** ====================== **********/
+/**********   Indirect addressing  **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
+  // Extract with offset
+  def : Pat<
+    (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
+    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
+  >;
+
+  // Insert with offset
+  def : Pat<
+    (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
+    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
+  >;
+}
+
+defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
+defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
+defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+
+defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
+defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
+defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+
+//===----------------------------------------------------------------------===//
+// SAD Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (add (sub_oneuse (umax i32:$src0, i32:$src1),
+                   (umin i32:$src0, i32:$src1)),
+       i32:$src2),
+  (V_SAD_U32 $src0, $src1, $src2)
+>;
+
+def : Pat <
+  (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
+                      (sub i32:$src0, i32:$src1),
+                      (sub i32:$src1, i32:$src0)),
+       i32:$src2),
+  (V_SAD_U32 $src0, $src1, $src2)
+>;
+
+//===----------------------------------------------------------------------===//
+// Conversion Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
+
+// Handle sext_inreg in i64
+def : Pat <
+  (i64 (sext_inreg i64:$src, i1)),
+  (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
+>;
+
+def : Pat <
+  (i16 (sext_inreg i16:$src, i8)),
+  (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i8)),
+  (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i16)),
+  (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i32)),
+  (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
+>;
+
+def : Pat <
+  (i64 (zext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+  (i64 (anyext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+class ZExt_i64_i1_Pat <SDNode ext> : Pat <
+  (i64 (ext i1:$src)),
+    (REG_SEQUENCE VReg_64,
+      (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+      (S_MOV_B32 (i32 0)), sub1)
+>;
+
+
+def : ZExt_i64_i1_Pat<zext>;
+def : ZExt_i64_i1_Pat<anyext>;
+
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple outputs.
+def : Pat <
+  (i64 (sext i32:$src)),
+    (REG_SEQUENCE SReg_64, $src, sub0,
+    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
+    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
+>;
+
+class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+  (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
+  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+>;
+
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
+
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions
+// resulting in the copies from SCC to these instructions will be
+// moved to the VALU.
+def : Pat <
+  (i1 (and i1:$src0, i1:$src1)),
+  (S_AND_B64 $src0, $src1)
+>;
+
+def : Pat <
+  (i1 (or i1:$src0, i1:$src1)),
+  (S_OR_B64 $src0, $src1)
+>;
+
+def : Pat <
+  (i1 (xor i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+def : Pat <
+  (f32 (sint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
+>;
+
+def : Pat <
+  (f32 (uint_to_fp i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
+>;
+
+def : Pat <
+  (f64 (sint_to_fp i1:$src)),
+  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
+>;
+
+def : Pat <
+  (f64 (uint_to_fp i1:$src)),
+  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
+>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i32 (trunc i64:$a)),
+  (EXTRACT_SUBREG $a, sub0)
+>;
+
+def : Pat <
+  (i1 (trunc i32:$a)),
+  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : Pat <
+  (i1 (trunc i64:$a)),
+  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+                    (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
+def : Pat <
+  (i32 (bswap i32:$a)),
+  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32 $a, $a, (i32 24)),
+             (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+>;
+
+multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
+  def : Pat <
+    (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
+    (BFM $a, $b)
+  >;
+
+  def : Pat <
+    (vt (add (vt (shl 1, vt:$a)), -1)),
+    (BFM $a, (MOV (i32 0)))
+  >;
+}
+
+defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
+// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+
+def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+
+def : Pat<
+  (fcanonicalize f16:$src),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+>;
+
+def : Pat<
+  (fcanonicalize f32:$src),
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
+>;
+
+def : Pat<
+  (fcanonicalize f64:$src),
+  (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+>;
+
+//===----------------------------------------------------------------------===//
+// Fract Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isSI] in {
+
+// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
+// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
+// way to implement it is using V_FRACT_F64.
+// The workaround for the V_FRACT bug is:
+//    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+
+// Convert floor(x) to (x - fract(x))
+def : Pat <
+  (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
+  (V_ADD_F64
+      $mods,
+      $x,
+      SRCMODS.NEG,
+      (V_CNDMASK_B64_PSEUDO
+         (V_MIN_F64
+             SRCMODS.NONE,
+             (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+             SRCMODS.NONE,
+             (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
+             DSTCLAMP.NONE, DSTOMOD.NONE),
+         $x,
+         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
+      DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [isSI]
+
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+
+def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
+def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+
+//============================================================================//
+// Assembler aliases
+//============================================================================//
+
+def : MnemonicAlias<"v_add_u32", "v_add_i32">;
+def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
+def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
+
+} // End isGCN predicate
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
new file mode 100644
index 000000000000..5da375468713
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
@@ -0,0 +1,209 @@
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Backend internal SI Intrinsic Definitions. User code should not
+// directly use these.
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+
+  def int_SI_export : Intrinsic <[],
+    [llvm_i32_ty,   // en
+    llvm_i32_ty,    // vm   (FIXME: should be i1)
+    llvm_i32_ty,    // done (FIXME: should be i1)
+    llvm_i32_ty,    // tgt
+    llvm_i32_ty,    // compr (FIXME: should be i1)
+    llvm_float_ty,  // src0
+    llvm_float_ty,  // src1
+    llvm_float_ty,  // src2
+    llvm_float_ty], // src3
+    []
+  >;
+
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+
+  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_store : Intrinsic <
+    [],
+    [llvm_anyint_ty, // rsrc(SGPR)
+     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,    // vaddr(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // inst_offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i32_ty,    // offen(imm)
+     llvm_i32_ty,    // idxen(imm)
+     llvm_i32_ty,    // glc(imm)
+     llvm_i32_ty,    // slc(imm)
+     llvm_i32_ty],   // tfe(imm)
+    []>;
+
+  // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed
+  def int_SI_buffer_load_dword : Intrinsic <
+    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+    [llvm_anyint_ty,  // rsrc(SGPR)
+     llvm_anyint_ty,  // vaddr(VGPR)
+     llvm_i32_ty,     // soffset(SGPR)
+     llvm_i32_ty,     // inst_offset(imm)
+     llvm_i32_ty,     // offen(imm)
+     llvm_i32_ty,     // idxen(imm)
+     llvm_i32_ty,     // glc(imm)
+     llvm_i32_ty,     // slc(imm)
+     llvm_i32_ty],    // tfe(imm)
+    [IntrReadMem, IntrArgMemOnly]>;
+
+  def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Fully-flexible SAMPLE instruction.
+  class SampleRaw : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyint_ty,    // vaddr(VGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_v4i32_ty,     // sampler(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
+  // Image instruction without a sampler.
+  class Image : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyint_ty,    // vaddr(VGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
+  // Basic sample
+  def int_SI_image_sample : SampleRaw;
+  def int_SI_image_sample_cl : SampleRaw;
+  def int_SI_image_sample_d : SampleRaw;
+  def int_SI_image_sample_d_cl : SampleRaw;
+  def int_SI_image_sample_l : SampleRaw;
+  def int_SI_image_sample_b : SampleRaw;
+  def int_SI_image_sample_b_cl : SampleRaw;
+  def int_SI_image_sample_lz : SampleRaw;
+  def int_SI_image_sample_cd : SampleRaw;
+  def int_SI_image_sample_cd_cl : SampleRaw;
+
+  // Sample with comparison
+  def int_SI_image_sample_c : SampleRaw;
+  def int_SI_image_sample_c_cl : SampleRaw;
+  def int_SI_image_sample_c_d : SampleRaw;
+  def int_SI_image_sample_c_d_cl : SampleRaw;
+  def int_SI_image_sample_c_l : SampleRaw;
+  def int_SI_image_sample_c_b : SampleRaw;
+  def int_SI_image_sample_c_b_cl : SampleRaw;
+  def int_SI_image_sample_c_lz : SampleRaw;
+  def int_SI_image_sample_c_cd : SampleRaw;
+  def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+  // Sample with offsets
+  def int_SI_image_sample_o : SampleRaw;
+  def int_SI_image_sample_cl_o : SampleRaw;
+  def int_SI_image_sample_d_o : SampleRaw;
+  def int_SI_image_sample_d_cl_o : SampleRaw;
+  def int_SI_image_sample_l_o : SampleRaw;
+  def int_SI_image_sample_b_o : SampleRaw;
+  def int_SI_image_sample_b_cl_o : SampleRaw;
+  def int_SI_image_sample_lz_o : SampleRaw;
+  def int_SI_image_sample_cd_o : SampleRaw;
+  def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+  // Sample with comparison and offsets
+  def int_SI_image_sample_c_o : SampleRaw;
+  def int_SI_image_sample_c_cl_o : SampleRaw;
+  def int_SI_image_sample_c_d_o : SampleRaw;
+  def int_SI_image_sample_c_d_cl_o : SampleRaw;
+  def int_SI_image_sample_c_l_o : SampleRaw;
+  def int_SI_image_sample_c_b_o : SampleRaw;
+  def int_SI_image_sample_c_b_cl_o : SampleRaw;
+  def int_SI_image_sample_c_lz_o : SampleRaw;
+  def int_SI_image_sample_c_cd_o : SampleRaw;
+  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
+
+  // Basic gather4
+  def int_SI_gather4 : SampleRaw;
+  def int_SI_gather4_cl : SampleRaw;
+  def int_SI_gather4_l : SampleRaw;
+  def int_SI_gather4_b : SampleRaw;
+  def int_SI_gather4_b_cl : SampleRaw;
+  def int_SI_gather4_lz : SampleRaw;
+
+  // Gather4 with comparison
+  def int_SI_gather4_c : SampleRaw;
+  def int_SI_gather4_c_cl : SampleRaw;
+  def int_SI_gather4_c_l : SampleRaw;
+  def int_SI_gather4_c_b : SampleRaw;
+  def int_SI_gather4_c_b_cl : SampleRaw;
+  def int_SI_gather4_c_lz : SampleRaw;
+
+  // Gather4 with offsets
+  def int_SI_gather4_o : SampleRaw;
+  def int_SI_gather4_cl_o : SampleRaw;
+  def int_SI_gather4_l_o : SampleRaw;
+  def int_SI_gather4_b_o : SampleRaw;
+  def int_SI_gather4_b_cl_o : SampleRaw;
+  def int_SI_gather4_lz_o : SampleRaw;
+
+  // Gather4 with comparison and offsets
+  def int_SI_gather4_c_o : SampleRaw;
+  def int_SI_gather4_c_cl_o : SampleRaw;
+  def int_SI_gather4_c_l_o : SampleRaw;
+  def int_SI_gather4_c_b_o : SampleRaw;
+  def int_SI_gather4_c_b_cl_o : SampleRaw;
+  def int_SI_gather4_c_lz_o : SampleRaw;
+
+  def int_SI_getlod : SampleRaw;
+
+  // Image instrinsics.
+  def int_SI_image_load : Image;
+  def int_SI_image_load_mip : Image;
+  def int_SI_getresinfo : Image;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+} // End TargetPrefix = "SI", isTarget = 1
+
+let TargetPrefix = "amdgcn", isTarget = 1 in {
+  // Emit 2.5 ulp, no denormal division. Should only be inserted by
+  // pass based on !fpmath metadata.
+  def int_amdgcn_fdiv_fast : Intrinsic<
+    [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
+  >;
+
+  /* Control flow Intrinsics */
+
+  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..99fe96c0be22
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,531 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with close by immediate offsets.
+// This will fuse operations such as
+//  ds_read_b32 v0, v2 offset:16
+//  ds_read_b32 v1, v2 offset:32
+// ==>
+//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly run
+//   before scheduling. It currently missing stores of constants because loading
+//   the constant into the data register is placed between the stores, although
+//   this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads have offsets that are too large to fit in the 8-bit
+//   offsets, but are close enough to fit in the 8 bits, we can add to the base
+//   pointer and use the new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-load-store-opt"
+
+namespace {
+
+class SILoadStoreOptimizer : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  AliasAnalysis *AA;
+
+  static bool offsetsCanBeCombined(unsigned Offset0,
+                                   unsigned Offset1,
+                                   unsigned EltSize);
+
+  MachineBasicBlock::iterator findMatchingDSInst(
+    MachineBasicBlock::iterator I,
+    unsigned EltSize,
+    SmallVectorImpl<MachineInstr*> &InstsToMove);
+
+  MachineBasicBlock::iterator mergeRead2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
+
+  MachineBasicBlock::iterator mergeWrite2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
+
+public:
+  static char ID;
+
+  SILoadStoreOptimizer()
+      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
+        AA(nullptr) {}
+
+  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
+    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool optimizeBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AAResultsWrapperPass>();
+
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+                      "SI Load / Store Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
+                    "SI Load / Store Optimizer", false, false)
+
+char SILoadStoreOptimizer::ID = 0;
+
+char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+
+FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
+  return new SILoadStoreOptimizer(TM);
+}
+
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+                           ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+  ++I;
+  for (MachineInstr *MI : InstsToMove) {
+    MI->removeFromParent();
+    MBB->insert(I, MI);
+  }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+                          SmallVectorImpl<const MachineOperand *> &Defs) {
+  for (const MachineOperand &Def : MI.defs()) {
+    Defs.push_back(&Def);
+  }
+}
+
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
+// Add MI and its defs to the lists if MI reads one of the defs that are
+// already in the list. Returns true in that case.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+                      SmallVectorImpl<const MachineOperand *> &Defs,
+                      SmallVectorImpl<MachineInstr*> &Insts) {
+  for (const MachineOperand *Def : Defs) {
+    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
+    // If ReadDef is true, then there is a use of Def between I
+    // and the instruction that I will potentially be merged with. We
+    // will need to move this instruction after the merged instructions.
+    if (ReadDef) {
+      Insts.push_back(&MI);
+      addDefsToList(MI, Defs);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                        ArrayRef<MachineInstr*> InstsToMove,
+                        const SIInstrInfo *TII,
+                        AliasAnalysis *AA) {
+
+  assert(MemOp.mayLoadOrStore());
+
+  for (MachineInstr *InstToMove : InstsToMove) {
+    if (!InstToMove->mayLoadOrStore())
+      continue;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+        return false;
+  }
+  return true;
+}
+
+bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
+                                                unsigned Offset1,
+                                                unsigned Size) {
+  // XXX - Would the same offset be OK? Is there any reason this would happen or
+  // be useful?
+  if (Offset0 == Offset1)
+    return false;
+
+  // This won't be valid if the offset isn't aligned.
+  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
+    return false;
+
+  unsigned EltOffset0 = Offset0 / Size;
+  unsigned EltOffset1 = Offset1 / Size;
+
+  // Check if the new offsets fit in the reduced 8-bit range.
+  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
+    return true;
+
+  // If the offset in elements doesn't fit in 8-bits, we might be able to use
+  // the stride 64 versions.
+  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
+    return false;
+
+  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
+                                  unsigned EltSize,
+                                  SmallVectorImpl<MachineInstr*> &InstsToMove) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  ++MBBI;
+
+  SmallVector<const MachineOperand *, 8> DefsToMove;
+  addDefsToList(*I, DefsToMove);
+
+  for ( ; MBBI != E; ++MBBI) {
+
+    if (MBBI->getOpcode() != I->getOpcode()) {
+
+      // This is not a matching DS instruction, but we can keep looking as
+      // long as one of these conditions are met:
+      // 1. It is safe to move I down past MBBI.
+      // 2. It is safe to move MBBI down past the instruction that I will
+      //    be merged into.
+
+      if (MBBI->hasUnmodeledSideEffects())
+        // We can't re-order this instruction with respect to other memory
+        // opeations, so we fail both conditions mentioned above.
+        return E;
+
+      if (MBBI->mayLoadOrStore() &&
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+        // We fail condition #1, but we may still be able to satisfy condition
+        // #2.  Add this instruction to the move list and then we will check
+        // if condition #2 holds once we have selected the matching instruction.
+        InstsToMove.push_back(&*MBBI);
+        addDefsToList(*MBBI, DefsToMove);
+        continue;
+      }
+
+      // When we match I with another DS instruction we will be moving I down
+      // to the location of the matched instruction any uses of I will need to
+      // be moved down as well.
+      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+      continue;
+    }
+
+    // Don't merge volatiles.
+    if (MBBI->hasOrderedMemoryRef())
+      return E;
+
+    // Handle a case like
+    //   DS_WRITE_B32 addr, v, idx0
+    //   w = DS_READ_B32 addr, idx0
+    //   DS_WRITE_B32 addr, f(w), idx1
+    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+    // merging of the two writes.
+    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+      continue;
+
+    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+    // Check same base pointer. Be careful of subregisters, which can occur with
+    // vectors of pointers.
+    if (AddrReg0.getReg() == AddrReg1.getReg() &&
+        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+                                                 AMDGPU::OpName::offset);
+      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+      // Check both offsets fit in the reduced range.
+      // We also need to go through the list of instructions that we plan to
+      // move and make sure they are all safe to move down past the merged
+      // instruction.
+      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+        return MBBI;
+    }
+
+    // We've found a load/store that we couldn't merge for some reason.
+    // We could potentially keep looking, but we'd need to make sure that
+    // it was safe to move I and also all the instruction in InstsToMove
+    // down past this instruction.
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+      !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+     )
+      break;
+  }
+  return E;
+}
+
+MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
+  MachineBasicBlock::iterator I,
+  MachineBasicBlock::iterator Paired,
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+
+  // Be careful, since the addresses could be subregisters themselves in weird
+  // cases, like vectors of pointers.
+  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+
+  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
+  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
+
+  unsigned Offset0
+    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+  unsigned Offset1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+  unsigned NewOffset0 = Offset0 / EltSize;
+  unsigned NewOffset1 = Offset1 / EltSize;
+  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+
+  // Prefer the st64 form if we can use it, even if we can fit the offset in the
+  // non st64 version. I'm not sure if there's any real reason to do this.
+  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+  if (UseST64) {
+    NewOffset0 /= 64;
+    NewOffset1 /= 64;
+    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+  }
+
+  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+
+  if (NewOffset0 > NewOffset1) {
+    // Canonicalize the merged instruction so the smaller offset comes first.
+    std::swap(NewOffset0, NewOffset1);
+    std::swap(SubRegIdx0, SubRegIdx1);
+  }
+
+  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+         (NewOffset0 != NewOffset1) &&
+         "Computed offset doesn't fit");
+
+  const MCInstrDesc &Read2Desc = TII->get(Opc);
+
+  const TargetRegisterClass *SuperRC
+    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+
+  DebugLoc DL = I->getDebugLoc();
+  MachineInstrBuilder Read2
+    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
+    .addOperand(*AddrReg) // addr
+    .addImm(NewOffset0) // offset0
+    .addImm(NewOffset1) // offset1
+    .addImm(0) // gds
+    .addMemOperand(*I->memoperands_begin())
+    .addMemOperand(*Paired->memoperands_begin());
+  (void)Read2;
+
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+
+  // Copy to the old destination registers.
+  BuildMI(*MBB, Paired, DL, CopyDesc)
+    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
+    .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
+    .addOperand(*Dest1)
+    .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  moveInstsAfter(Copy1, InstsToMove);
+
+  MachineBasicBlock::iterator Next = std::next(I);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+  return Next;
+}
+
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+  MachineBasicBlock::iterator I,
+  MachineBasicBlock::iterator Paired,
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+
+  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
+  // sure we preserve the subregister index and any register flags set on them.
+  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+  const MachineOperand *Data1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+
+
+  unsigned Offset0
+    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+  unsigned Offset1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+  unsigned NewOffset0 = Offset0 / EltSize;
+  unsigned NewOffset1 = Offset1 / EltSize;
+  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+
+  // Prefer the st64 form if we can use it, even if we can fit the offset in the
+  // non st64 version. I'm not sure if there's any real reason to do this.
+  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+  if (UseST64) {
+    NewOffset0 /= 64;
+    NewOffset1 /= 64;
+    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+  }
+
+  if (NewOffset0 > NewOffset1) {
+    // Canonicalize the merged instruction so the smaller offset comes first.
+    std::swap(NewOffset0, NewOffset1);
+    std::swap(Data0, Data1);
+  }
+
+  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+         (NewOffset0 != NewOffset1) &&
+         "Computed offset doesn't fit");
+
+  const MCInstrDesc &Write2Desc = TII->get(Opc);
+  DebugLoc DL = I->getDebugLoc();
+
+  MachineInstrBuilder Write2
+    = BuildMI(*MBB, Paired, DL, Write2Desc)
+    .addOperand(*Addr) // addr
+    .addOperand(*Data0) // data0
+    .addOperand(*Data1) // data1
+    .addImm(NewOffset0) // offset0
+    .addImm(NewOffset1) // offset1
+    .addImm(0) // gds
+    .addMemOperand(*I->memoperands_begin())
+    .addMemOperand(*Paired->memoperands_begin());
+
+  moveInstsAfter(Write2, InstsToMove);
+
+  MachineBasicBlock::iterator Next = std::next(I);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+  return Next;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+    MachineInstr &MI = *I;
+
+    // Don't combine if volatile.
+    if (MI.hasOrderedMemoryRef()) {
+      ++I;
+      continue;
+    }
+
+    SmallVector<MachineInstr*, 8> InstsToMove;
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
+      if (Match != E) {
+        Modified = true;
+        I = mergeRead2Pair(I, Match, Size, InstsToMove);
+      } else {
+        ++I;
+      }
+
+      continue;
+    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
+      if (Match != E) {
+        Modified = true;
+        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
+      } else {
+        ++I;
+      }
+
+      continue;
+    }
+
+    ++I;
+  }
+
+  return Modified;
+}
+
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  if (!STM.loadStoreOptEnabled())
+    return false;
+
+  TII = STM.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+  bool Modified = false;
+
+  for (MachineBasicBlock &MBB : MF)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
new file mode 100644
index 000000000000..7ed18f27e591
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -0,0 +1,468 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
+/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writting to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU).  Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
+/// Vector ALU) and then the ScalarALU will AND the VCC register with the
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
+/// S_BRANCH_EXECZ label1              // Use our branch optimization
+///                                    // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-control-flow"
+
+namespace {
+
+class SILowerControlFlow : public MachineFunctionPass {
+private:
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+  LiveIntervals *LIS;
+  MachineRegisterInfo *MRI;
+
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);
+
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+
+public:
+  static char ID;
+
+  SILowerControlFlow() :
+    MachineFunctionPass(ID),
+    TRI(nullptr),
+    TII(nullptr),
+    LIS(nullptr),
+    MRI(nullptr) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Lower control flow pseudo instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Should preserve the same set that TwoAddressInstructions does.
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+               "SI lower control flow", false, false)
+
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+  MachineOperand &ImpDefSCC = MI.getOperand(3);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+
+  ImpDefSCC.setIsDead(IsDead);
+}
+
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
+
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
+
+  MachineOperand &SaveExec = MI.getOperand(0);
+  MachineOperand &Cond = MI.getOperand(1);
+  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+         Cond.getSubReg() == AMDGPU::NoSubRegister);
+
+  unsigned SaveExecReg = SaveExec.getReg();
+
+  MachineOperand &ImpDefSCC = MI.getOperand(4);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+
+  // Add an implicit def of exec to discourage scheduling VALU after this which
+  // will interfere with trying to form s_and_saveexec_b64 later.
+  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstr *CopyExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+  unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  MachineInstr *And =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+    .addReg(CopyReg)
+    //.addReg(AMDGPU::EXEC)
+    .addReg(Cond.getReg());
+  setImpSCCDefDead(*And, true);
+
+  MachineInstr *Xor =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+    .addReg(Tmp)
+    .addReg(CopyReg);
+  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+  // Use a copy that is a terminator to get correct spill code placement it with
+  // fast regalloc.
+  MachineInstr *SetExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+    .addReg(Tmp, RegState::Kill);
+
+  // Insert a pseudo terminator to help keep the verifier happy. This will also
+  // be used later when inserting skips.
+  MachineInstr *NewBr =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2));
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->InsertMachineInstrInMaps(*CopyExec);
+
+  // Replace with and so we don't need to fix the live interval for condition
+  // register.
+  LIS->ReplaceMachineInstrInMaps(MI, *And);
+
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*SetExec);
+  LIS->InsertMachineInstrInMaps(*NewBr);
+
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+  MI.eraseFromParent();
+
+  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+  // hard to add another def here but I'm not sure how to correctly update the
+  // valno.
+  LIS->removeInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(Tmp);
+  LIS->createAndComputeVirtRegInterval(CopyReg);
+}
+
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
+
+  bool ExecModified = MI.getOperand(3).getImm() != 0;
+  MachineBasicBlock::iterator Start = MBB.begin();
+
+  // We are running before TwoAddressInstructions, and si_else's operands are
+  // tied. In order to correctly tie the registers, split this into a copy of
+  // the src like it does.
+  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
+
+  // This must be inserted before phis and any spill code inserted before the
+  // else.
+  unsigned SaveReg = ExecModified ?
+    MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+  MachineInstr *OrSaveExec =
+    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+    .addReg(CopyReg);
+
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+
+  MachineBasicBlock::iterator ElsePt(MI);
+
+  if (ExecModified) {
+    MachineInstr *And =
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+      .addReg(AMDGPU::EXEC)
+      .addReg(SaveReg);
+
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*And);
+  }
+
+  MachineInstr *Xor =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(DstReg);
+
+  MachineInstr *Branch =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addMBB(DestBB);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*OrSaveExec);
+
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*Branch);
+
+  // src reg is tied to dst reg.
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (ExecModified)
+    LIS->createAndComputeVirtRegInterval(SaveReg);
+
+  // Let this be recomputed.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
+}
+
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  unsigned Dst = MI.getOperand(0).getReg();
+
+  MachineInstr *Or =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(1));
+
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+}
+
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+}
+
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineInstr *AndN2 =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
+
+  MachineInstr *Branch =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addOperand(MI.getOperand(1));
+
+  if (LIS) {
+    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+    LIS->InsertMachineInstrInMaps(*Branch);
+  }
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineInstr *NewMI =
+    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
+
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+
+  MI.eraseFromParent();
+
+  if (LIS)
+    LIS->handleMove(*NewMI);
+}
+
+// Returns replace operands for a logical operation, either single result
+// for exec or two operands if source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+       SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+  // A copy with implcitly defined exec inserted earlier is an exclusion, it
+  // does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  MRI = &MF.getRegInfo();
+
+  MachineFunction::iterator NextBB;
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
+    MachineBasicBlock &MBB = *BI;
+
+    MachineBasicBlock::iterator I, Next, Last;
+
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_IF:
+        emitIf(MI);
+        break;
+
+      case AMDGPU::SI_ELSE:
+        emitElse(MI);
+        break;
+
+      case AMDGPU::SI_BREAK:
+        emitBreak(MI);
+        break;
+
+      case AMDGPU::SI_IF_BREAK:
+        emitIfBreak(MI);
+        break;
+
+      case AMDGPU::SI_ELSE_BREAK:
+        emitElseBreak(MI);
+        break;
+
+      case AMDGPU::SI_LOOP:
+        emitLoop(MI);
+        break;
+
+      case AMDGPU::SI_END_CF:
+        emitEndCf(MI);
+        break;
+
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Cleanup bit manipulations on exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
+      default:
+        Last = I;
+        continue;
+      }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
new file mode 100644
index 000000000000..be2e14fd4623
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -0,0 +1,161 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurize pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type.  Since there are no
+/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
+///
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "si-i1-copies"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SILowerI1Copies() : MachineFunctionPass(ID) {
+    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Lower i1 Copies"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
+                "SI Lower i1 Copies", false, false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() {
+  return new SILowerI1Copies();
+}
+
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+        if (RC == &AMDGPU::VReg_1RegClass)
+          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
+        continue;
+      }
+
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      const MachineOperand &Dst = MI.getOperand(0);
+      const MachineOperand &Src = MI.getOperand(1);
+
+      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
+          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+        continue;
+
+      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
+      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+
+      DebugLoc DL = MI.getDebugLoc();
+      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
+      if (DstRC == &AMDGPU::VReg_1RegClass &&
+          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+        I1Defs.push_back(Dst.getReg());
+
+        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
+          if (DefInst->getOperand(1).isImm()) {
+            I1Defs.push_back(Dst.getReg());
+
+            int64_t Val = DefInst->getOperand(1).getImm();
+            assert(Val == 0 || Val == -1);
+
+            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
+              .addOperand(Dst)
+              .addImm(Val);
+            MI.eraseFromParent();
+            continue;
+          }
+        }
+
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
+          .addOperand(Dst)
+          .addImm(0)
+          .addImm(-1)
+          .addOperand(Src);
+        MI.eraseFromParent();
+      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+                 SrcRC == &AMDGPU::VReg_1RegClass) {
+        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
+            DefInst->getOperand(1).getImm() == 0 &&
+            DefInst->getOperand(2).getImm() != 0 &&
+            DefInst->getOperand(3).isReg() &&
+            TargetRegisterInfo::isVirtualRegister(
+              DefInst->getOperand(3).getReg()) &&
+            TRI->getCommonSubClass(
+              MRI.getRegClass(DefInst->getOperand(3).getReg()),
+              &AMDGPU::SGPR_64RegClass)) {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
+            .addOperand(Dst)
+            .addReg(AMDGPU::EXEC)
+            .addOperand(DefInst->getOperand(3));
+        } else {
+          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
+            .addOperand(Dst)
+            .addOperand(Src)
+            .addImm(0);
+        }
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e911817c451d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -0,0 +1,226 @@
+//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+#define MAX_LANES 64
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+  "amdgpu-spill-sgpr-to-vgpr",
+  cl::desc("Enable spilling VGPRs to SGPRs"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF),
+    TIDReg(AMDGPU::NoRegister),
+    ScratchRSrcReg(AMDGPU::NoRegister),
+    ScratchWaveOffsetReg(AMDGPU::NoRegister),
+    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+    DispatchPtrUserSGPR(AMDGPU::NoRegister),
+    QueuePtrUserSGPR(AMDGPU::NoRegister),
+    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+    DispatchIDUserSGPR(AMDGPU::NoRegister),
+    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+    PSInputAddr(0),
+    ReturnsVoid(true),
+    FlatWorkGroupSizes(0, 0),
+    WavesPerEU(0, 0),
+    DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
+    DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
+    LDSWaveSpillSize(0),
+    PSInputEna(0),
+    NumUserSGPRs(0),
+    NumSystemSGPRs(0),
+    HasSpilledSGPRs(false),
+    HasSpilledVGPRs(false),
+    HasNonSpillStackObjects(false),
+    NumSpilledSGPRs(0),
+    NumSpilledVGPRs(0),
+    PrivateSegmentBuffer(false),
+    DispatchPtr(false),
+    QueuePtr(false),
+    KernargSegmentPtr(false),
+    DispatchID(false),
+    FlatScratchInit(false),
+    GridWorkgroupCountX(false),
+    GridWorkgroupCountY(false),
+    GridWorkgroupCountZ(false),
+    WorkGroupIDX(false),
+    WorkGroupIDY(false),
+    WorkGroupIDZ(false),
+    WorkGroupInfo(false),
+    PrivateSegmentWaveByteOffset(false),
+    WorkItemIDX(false),
+    WorkItemIDY(false),
+    WorkItemIDZ(false) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const Function *F = MF.getFunction();
+
+  PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+  if (!AMDGPU::isShader(F->getCallingConv())) {
+    KernargSegmentPtr = true;
+    WorkGroupIDX = true;
+    WorkItemIDX = true;
+  }
+
+  if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+    WorkGroupIDY = true;
+
+  if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
+    WorkGroupIDZ = true;
+
+  if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
+    WorkItemIDY = true;
+
+  if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
+    WorkItemIDZ = true;
+
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
+
+  bool MaySpill = ST.isVGPRSpillingEnabled(*F);
+  bool HasStackObjects = FrameInfo.hasStackObjects();
+
+  if (HasStackObjects || MaySpill)
+    PrivateSegmentWaveByteOffset = true;
+
+  if (ST.isAmdCodeObjectV2()) {
+    if (HasStackObjects || MaySpill)
+      PrivateSegmentBuffer = true;
+
+    if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+      DispatchPtr = true;
+
+    if (F->hasFnAttribute("amdgpu-queue-ptr"))
+      QueuePtr = true;
+
+    if (F->hasFnAttribute("amdgpu-dispatch-id"))
+      DispatchID = true;
+  }
+
+  // We don't need to worry about accessing spills with flat instructions.
+  // TODO: On VI where we must use flat for global, we should be able to omit
+  // this if it is never used for generic access.
+  if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
+      ST.isAmdHsaOS())
+    FlatScratchInit = true;
+
+  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+  WavesPerEU = ST.getWavesPerEU(*F);
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+  const SIRegisterInfo &TRI) {
+  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  NumUserSGPRs += 4;
+  return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return QueuePtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return KernargSegmentPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
+  DispatchIDUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchIDUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return FlatScratchInitUserSGPR;
+}
+
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
+                                                       MachineFunction *MF,
+                                                       unsigned FrameIndex,
+                                                       unsigned SubIdx) {
+  if (!EnableSpillSGPRToVGPR)
+    return SpilledReg();
+
+  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
+  Offset += SubIdx * 4;
+
+  unsigned LaneVGPRIdx = Offset / (64 * 4);
+  unsigned Lane = (Offset / 4) % 64;
+
+  struct SpilledReg Spill;
+  Spill.Lane = Lane;
+
+  if (!LaneVGPRs.count(LaneVGPRIdx)) {
+    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
+                                                *MF);
+
+    if (LaneVGPR == AMDGPU::NoRegister)
+      // We have no VGPRs left for spilling SGPRs.
+      return Spill;
+
+    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+
+    // Add this register as live-in to all blocks to avoid machine verifer
+    // complaining about use of an undefined physical register.
+    for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
+         BI != BE; ++BI) {
+      BI->addLiveIn(LaneVGPR);
+    }
+  }
+
+  Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
+  return Spill;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
new file mode 100644
index 000000000000..3b4e233cd787
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -0,0 +1,500 @@
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
+
+#include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
+#include <array>
+#include <map>
+
+namespace llvm {
+
+class MachineRegisterInfo;
+
+class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
+public:
+  explicit AMDGPUImagePseudoSourceValue() :
+    PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+  bool isConstant(const MachineFrameInfo *) const override {
+    // This should probably be true for most images, but we will start by being
+    // conservative.
+    return false;
+  }
+
+  bool isAliased(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+
+  bool mayAlias(const MachineFrameInfo*) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+};
+
+class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue {
+public:
+  explicit AMDGPUBufferPseudoSourceValue() :
+    PseudoSourceValue(PseudoSourceValue::TargetCustom) { }
+
+  bool isConstant(const MachineFrameInfo *) const override {
+    // This should probably be true for most images, but we will start by being
+    // conservative.
+    return false;
+  }
+
+  bool isAliased(const MachineFrameInfo *) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+
+  bool mayAlias(const MachineFrameInfo*) const override {
+    // FIXME: If we ever change image intrinsics to accept fat pointers, then
+    // this could be true for some cases.
+    return false;
+  }
+};
+
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
+/// tells the hardware which interpolation parameters to load.
+class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
+  // FIXME: This should be removed and getPreloadedValue moved here.
+  friend class SIRegisterInfo;
+
+  unsigned TIDReg;
+
+  // Registers that may be reserved for spilling purposes. These may be the same
+  // as the input registers.
+  unsigned ScratchRSrcReg;
+  unsigned ScratchWaveOffsetReg;
+
+  // Input registers setup for the HSA ABI.
+  // User SGPRs in allocation order.
+  unsigned PrivateSegmentBufferUserSGPR;
+  unsigned DispatchPtrUserSGPR;
+  unsigned QueuePtrUserSGPR;
+  unsigned KernargSegmentPtrUserSGPR;
+  unsigned DispatchIDUserSGPR;
+  unsigned FlatScratchInitUserSGPR;
+  unsigned PrivateSegmentSizeUserSGPR;
+  unsigned GridWorkGroupCountXUserSGPR;
+  unsigned GridWorkGroupCountYUserSGPR;
+  unsigned GridWorkGroupCountZUserSGPR;
+
+  // System SGPRs in allocation order.
+  unsigned WorkGroupIDXSystemSGPR;
+  unsigned WorkGroupIDYSystemSGPR;
+  unsigned WorkGroupIDZSystemSGPR;
+  unsigned WorkGroupInfoSystemSGPR;
+  unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
+
+  // Graphics info.
+  unsigned PSInputAddr;
+  bool ReturnsVoid;
+
+  // A pair of default/requested minimum/maximum flat work group sizes.
+  // Minimum - first, maximum - second.
+  std::pair<unsigned, unsigned> FlatWorkGroupSizes;
+
+  // A pair of default/requested minimum/maximum number of waves per execution
+  // unit. Minimum - first, maximum - second.
+  std::pair<unsigned, unsigned> WavesPerEU;
+
+  // Stack object indices for work group IDs.
+  std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
+  // Stack object indices for work item IDs.
+  std::array<int, 3> DebuggerWorkItemIDStackObjectIndices;
+
+  AMDGPUBufferPseudoSourceValue BufferPSV;
+  AMDGPUImagePseudoSourceValue ImagePSV;
+
+public:
+  // FIXME: Make private
+  unsigned LDSWaveSpillSize;
+  unsigned PSInputEna;
+  std::map<unsigned, unsigned> LaneVGPRs;
+  unsigned ScratchOffsetReg;
+  unsigned NumUserSGPRs;
+  unsigned NumSystemSGPRs;
+
+private:
+  bool HasSpilledSGPRs;
+  bool HasSpilledVGPRs;
+  bool HasNonSpillStackObjects;
+
+  unsigned NumSpilledSGPRs;
+  unsigned NumSpilledVGPRs;
+
+  // Feature bits required for inputs passed in user SGPRs.
+  bool PrivateSegmentBuffer : 1;
+  bool DispatchPtr : 1;
+  bool QueuePtr : 1;
+  bool KernargSegmentPtr : 1;
+  bool DispatchID : 1;
+  bool FlatScratchInit : 1;
+  bool GridWorkgroupCountX : 1;
+  bool GridWorkgroupCountY : 1;
+  bool GridWorkgroupCountZ : 1;
+
+  // Feature bits required for inputs passed in system SGPRs.
+  bool WorkGroupIDX : 1; // Always initialized.
+  bool WorkGroupIDY : 1;
+  bool WorkGroupIDZ : 1;
+  bool WorkGroupInfo : 1;
+  bool PrivateSegmentWaveByteOffset : 1;
+
+  bool WorkItemIDX : 1; // Always initialized.
+  bool WorkItemIDY : 1;
+  bool WorkItemIDZ : 1;
+
+  MCPhysReg getNextUserSGPR() const {
+    assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+    return AMDGPU::SGPR0 + NumUserSGPRs;
+  }
+
+  MCPhysReg getNextSystemSGPR() const {
+    return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+  }
+
+public:
+  struct SpilledReg {
+    unsigned VGPR;
+    int Lane;
+    SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
+    SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
+    bool hasLane() { return Lane != -1;}
+    bool hasReg() { return VGPR != AMDGPU::NoRegister;}
+  };
+
+  // SIMachineFunctionInfo definition
+
+  SIMachineFunctionInfo(const MachineFunction &MF);
+  SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
+                           unsigned SubIdx);
+  bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
+  unsigned getTIDReg() const { return TIDReg; };
+  void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+
+  // Add user SGPRs.
+  unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+  unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+  unsigned addQueuePtr(const SIRegisterInfo &TRI);
+  unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+  unsigned addDispatchID(const SIRegisterInfo &TRI);
+  unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+
+  // Add system SGPRs.
+  unsigned addWorkGroupIDX() {
+    WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDXSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDY() {
+    WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDYSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDZ() {
+    WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDZSystemSGPR;
+  }
+
+  unsigned addWorkGroupInfo() {
+    WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupInfoSystemSGPR;
+  }
+
+  unsigned addPrivateSegmentWaveByteOffset() {
+    PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+    PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+  }
+
+  bool hasPrivateSegmentBuffer() const {
+    return PrivateSegmentBuffer;
+  }
+
+  bool hasDispatchPtr() const {
+    return DispatchPtr;
+  }
+
+  bool hasQueuePtr() const {
+    return QueuePtr;
+  }
+
+  bool hasKernargSegmentPtr() const {
+    return KernargSegmentPtr;
+  }
+
+  bool hasDispatchID() const {
+    return DispatchID;
+  }
+
+  bool hasFlatScratchInit() const {
+    return FlatScratchInit;
+  }
+
+  bool hasGridWorkgroupCountX() const {
+    return GridWorkgroupCountX;
+  }
+
+  bool hasGridWorkgroupCountY() const {
+    return GridWorkgroupCountY;
+  }
+
+  bool hasGridWorkgroupCountZ() const {
+    return GridWorkgroupCountZ;
+  }
+
+  bool hasWorkGroupIDX() const {
+    return WorkGroupIDX;
+  }
+
+  bool hasWorkGroupIDY() const {
+    return WorkGroupIDY;
+  }
+
+  bool hasWorkGroupIDZ() const {
+    return WorkGroupIDZ;
+  }
+
+  bool hasWorkGroupInfo() const {
+    return WorkGroupInfo;
+  }
+
+  bool hasPrivateSegmentWaveByteOffset() const {
+    return PrivateSegmentWaveByteOffset;
+  }
+
+  bool hasWorkItemIDX() const {
+    return WorkItemIDX;
+  }
+
+  bool hasWorkItemIDY() const {
+    return WorkItemIDY;
+  }
+
+  bool hasWorkItemIDZ() const {
+    return WorkItemIDZ;
+  }
+
+  unsigned getNumUserSGPRs() const {
+    return NumUserSGPRs;
+  }
+
+  unsigned getNumPreloadedSGPRs() const {
+    return NumUserSGPRs + NumSystemSGPRs;
+  }
+
+  unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  /// \brief Returns the physical register reserved for use as the resource
+  /// descriptor for scratch accesses.
+  unsigned getScratchRSrcReg() const {
+    return ScratchRSrcReg;
+  }
+
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const {
+    return ScratchWaveOffsetReg;
+  }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
+
+  unsigned getQueuePtrUserSGPR() const {
+    return QueuePtrUserSGPR;
+  }
+
+  bool hasSpilledSGPRs() const {
+    return HasSpilledSGPRs;
+  }
+
+  void setHasSpilledSGPRs(bool Spill = true) {
+    HasSpilledSGPRs = Spill;
+  }
+
+  bool hasSpilledVGPRs() const {
+    return HasSpilledVGPRs;
+  }
+
+  void setHasSpilledVGPRs(bool Spill = true) {
+    HasSpilledVGPRs = Spill;
+  }
+
+  bool hasNonSpillStackObjects() const {
+    return HasNonSpillStackObjects;
+  }
+
+  void setHasNonSpillStackObjects(bool StackObject = true) {
+    HasNonSpillStackObjects = StackObject;
+  }
+
+  unsigned getNumSpilledSGPRs() const {
+    return NumSpilledSGPRs;
+  }
+
+  unsigned getNumSpilledVGPRs() const {
+    return NumSpilledVGPRs;
+  }
+
+  void addToSpilledSGPRs(unsigned num) {
+    NumSpilledSGPRs += num;
+  }
+
+  void addToSpilledVGPRs(unsigned num) {
+    NumSpilledVGPRs += num;
+  }
+
+  unsigned getPSInputAddr() const {
+    return PSInputAddr;
+  }
+
+  bool isPSInputAllocated(unsigned Index) const {
+    return PSInputAddr & (1 << Index);
+  }
+
+  void markPSInputAllocated(unsigned Index) {
+    PSInputAddr |= 1 << Index;
+  }
+
+  bool returnsVoid() const {
+    return ReturnsVoid;
+  }
+
+  void setIfReturnsVoid(bool Value) {
+    ReturnsVoid = Value;
+  }
+
+  /// \returns A pair of default/requested minimum/maximum flat work group sizes
+  /// for this function.
+  std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const {
+    return FlatWorkGroupSizes;
+  }
+
+  /// \returns Default/requested minimum flat work group size for this function.
+  unsigned getMinFlatWorkGroupSize() const {
+    return FlatWorkGroupSizes.first;
+  }
+
+  /// \returns Default/requested maximum flat work group size for this function.
+  unsigned getMaxFlatWorkGroupSize() const {
+    return FlatWorkGroupSizes.second;
+  }
+
+  /// \returns A pair of default/requested minimum/maximum number of waves per
+  /// execution unit.
+  std::pair<unsigned, unsigned> getWavesPerEU() const {
+    return WavesPerEU;
+  }
+
+  /// \returns Default/requested minimum number of waves per execution unit.
+  unsigned getMinWavesPerEU() const {
+    return WavesPerEU.first;
+  }
+
+  /// \returns Default/requested maximum number of waves per execution unit.
+  unsigned getMaxWavesPerEU() const {
+    return WavesPerEU.second;
+  }
+
+  /// \returns Stack object index for \p Dim's work group ID.
+  int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkGroupIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx.
+  void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns Stack object index for \p Dim's work item ID.
+  int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const {
+    assert(Dim < 3);
+    return DebuggerWorkItemIDStackObjectIndices[Dim];
+  }
+
+  /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx.
+  void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) {
+    assert(Dim < 3);
+    DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx;
+  }
+
+  /// \returns SGPR used for \p Dim's work group ID.
+  unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkGroupIDX());
+      return WorkGroupIDXSystemSGPR;
+    case 1:
+      assert(hasWorkGroupIDY());
+      return WorkGroupIDYSystemSGPR;
+    case 2:
+      assert(hasWorkGroupIDZ());
+      return WorkGroupIDZSystemSGPR;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  /// \returns VGPR used for \p Dim' work item ID.
+  unsigned getWorkItemIDVGPR(unsigned Dim) const {
+    switch (Dim) {
+    case 0:
+      assert(hasWorkItemIDX());
+      return AMDGPU::VGPR0;
+    case 1:
+      assert(hasWorkItemIDY());
+      return AMDGPU::VGPR1;
+    case 2:
+      assert(hasWorkItemIDZ());
+      return AMDGPU::VGPR2;
+    }
+    llvm_unreachable("unexpected dimension");
+  }
+
+  const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
+    return &BufferPSV;
+  }
+
+  const AMDGPUImagePseudoSourceValue *getImagePSV() const {
+    return &ImagePSV;
+  }
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644
index 000000000000..da86bbf9dd2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,1898 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineScheduler.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// for GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+// for all the SGPR load instructions before your current instruction to
+// have finished.
+// . When accessing the result of an VGPR load instruction, you have to wait
+// for all the VGPR load instructions previous to the VGPR load instruction
+// you are interested in to finish.
+// . The less the register pressure, the best load latencies are hidden
+//
+// Moreover some specifities (like the fact a lot of instructions in the shader
+// have few dependencies) makes the generic scheduler have some unpredictable
+// behaviours. For example when register pressure becomes high, it can either
+// manage to prevent register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Also some other bad behaviours are generated, like loading at the beginning
+// of the shader a constant in VGPR you won't need until the end of the shader.
+//
+// The scheduling problem for SI can distinguish three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+//   performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts if different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks helps control keeping register
+// usage low.
+//
+// First the instructions are put into blocks.
+//   We want the blocks help control register usage and hide high latencies
+//   later. To help control register usage, we typically want all local
+//   computations, when for example you create a result that can be comsummed
+//   right away, to be contained in a block. Block inputs and outputs would
+//   typically be important results that are needed in several locations of
+//   the shader. Since we do want blocks to help hide high latencies, we want
+//   the instructions inside the block to have a minimal set of dependencies
+//   on high latencies. It will make it easy to pick blocks to hide specific
+//   high latencies.
+//   The block creation algorithm is divided into several steps, and several
+//   variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+//   At that step we do take into account only register usage and hiding
+//   low latency instructions
+//
+// Third the block order is chosen, there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency to get the cache (or global mem) data go to the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, thus why even if wavefront count is high, we have
+// to try have as many instructions hiding high latencies as possible.
+// The OpenCL doc says for example latency of 400 cycles for a global mem access,
+// which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64 bytes
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated to the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shader with 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
+// cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling
+// As we expect the driver to be able to preload the constants soon.
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+  switch (Reason) {
+  case NoCand:         return "NOCAND";
+  case RegUsage:       return "REGUSAGE";
+  case Latency:        return "LATENCY";
+  case Successor:      return "SUCCESSOR";
+  case Depth:          return "DEPTH";
+  case NodeOrder:      return "ORDER";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+                    SISchedulerCandidate &TryCand,
+                    SISchedulerCandidate &Cand,
+                    SIScheduleCandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       SISchedulerCandidate &TryCand,
+                       SISchedulerCandidate &Cand,
+                       SIScheduleCandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  Cand.setRepeat(Reason);
+  return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+  NodeNum2Index[SU->NodeNum] = SUnits.size();
+  SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+                                          SISchedCandidate &TryCand) {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+
+  if (Cand.SGPRUsage > 60 &&
+      tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Schedule low latency instructions as top as possible.
+  // Order of priority is:
+  // . Low latency instructions which do not depend on other low latency
+  //   instructions we haven't waited for
+  // . Other instructions which do not depend on low latency instructions
+  //   we haven't waited for
+  // . Low latencies
+  // . All other instructions
+  // Goal is to get: low latency instructions - independent instructions
+  //     - (eventually some more low latency instructions)
+  //     - instructions that depend on the first low latency instructions.
+  // If in the block there is a lot of constant loads, the SGPR usage
+  // could go quite high, thus above the arbitrary limit of 60 will encourage
+  // use the already loaded constants (in order to release some SGPRs) before
+  // loading more.
+  if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+              Cand.HasLowLatencyNonWaitedParent,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+                 TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (TryCand.IsLowLatency &&
+      tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+              TryCand, Cand, SIScheduleCandReason::Depth))
+    return;
+
+  if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+    return;
+
+  // Fall through to original instruction order.
+  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+  SISchedCandidate TopCand;
+
+  for (SUnit* SU : TopReadySUs) {
+    SISchedCandidate TryCand;
+    std::vector<unsigned> pressure;
+    std::vector<unsigned> MaxPressure;
+    // Predict register usage after this instruction.
+    TryCand.SU = SU;
+    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+    TryCand.HasLowLatencyNonWaitedParent =
+      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+    tryCandidateTopDown(TopCand, TryCand);
+    if (TryCand.Reason != NoCand)
+      TopCand.setBest(TryCand);
+  }
+
+  return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+  TopReadySUs.clear();
+  if (Scheduled)
+    undoSchedule();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = TopReadySUs[0];
+    ScheduledSUnits.push_back(SU);
+    nodeScheduled(SU);
+  }
+
+  Scheduled = true;
+}
+
+// Returns if the register was set between first and last.
+static bool isDefBetween(unsigned Reg,
+                           SlotIndex First, SlotIndex Last,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::def_instr_iterator
+       UI = MRI->def_instr_begin(Reg),
+       UE = MRI->def_instr_end(); UI != UE; ++UI) {
+    const MachineInstr* MI = &*UI;
+    if (MI->isDebugValue())
+      continue;
+    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+    if (InstSlot >= First && InstSlot <= Last)
+      return true;
+  }
+  return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                                      MachineBasicBlock::iterator EndBlock) {
+  IntervalPressure Pressure, BotPressure;
+  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineRegisterInfo *MRI = DAG->getMRI();
+  DAG->initRPTracker(TopRPTracker);
+  DAG->initRPTracker(BotRPTracker);
+  DAG->initRPTracker(RPTracker);
+
+  // Goes though all SU. RPTracker captures what had to be alive for the SUs
+  // to execute, and what is still alive at the end.
+  for (SUnit* SU : ScheduledSUnits) {
+    RPTracker.setPos(SU->getInstr());
+    RPTracker.advance();
+  }
+
+  // Close the RPTracker to finalize live ins/outs.
+  RPTracker.closeRegion();
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Do not Track Physical Registers, because it messes up.
+  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+      LiveInRegs.insert(RegMaskPair.RegUnit);
+  }
+  LiveOutRegs.clear();
+  // There is several possibilities to distinguish:
+  // 1) Reg is not input to any instruction in the block, but is output of one
+  // 2) 1) + read in the block and not needed after it
+  // 3) 1) + read in the block but needed in another block
+  // 4) Reg is input of an instruction but another block will read it too
+  // 5) Reg is input of an instruction and then rewritten in the block.
+  //    result is not read in the block (implies used in another block)
+  // 6) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block and not needed in another block
+  // 7) Reg is input of an instruction and then rewritten in the block.
+  //    result is read in the block but also needed in another block
+  // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
+  // We want LiveOutRegs to contain only Regs whose content will be read after
+  // in another block, and whose content was written in the current block,
+  // that is we want it to get 1, 3, 5, 7
+  // Since we made the MIs of a block to be packed all together before
+  // scheduling, then the LiveIntervals were correct, and the RPTracker was
+  // able to correctly handle 5 vs 6, 2 vs 3.
+  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
+  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
+  // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
+  // The use of findDefBetween removes the case 4.
+  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+    unsigned Reg = RegMaskPair.RegUnit;
+    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
+                     LIS)) {
+      LiveOutRegs.insert(Reg);
+    }
+  }
+
+  // Pressure = sum_alive_registers register size
+  // Internally llvm will represent some registers as big 128 bits registers
+  // for example, but they actually correspond to 4 actual 32 bits registers.
+  // Thus Pressure is not equal to num_alive_registers * constant.
+  LiveInPressure = TopPressure.MaxSetPressure;
+  LiveOutPressure = BotPressure.MaxSetPressure;
+
+  // Prepares TopRPTracker for top down scheduling.
+  TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+                               MachineBasicBlock::iterator EndBlock) {
+  if (!Scheduled)
+    fastSchedule();
+
+  // PreScheduling phase to set LiveIn and LiveOut.
+  initRegPressure(BeginBlock, EndBlock);
+  undoSchedule();
+
+  // Schedule for real now.
+
+  TopReadySUs.clear();
+
+  for (SUnit* SU : SUnits) {
+    if (!SU->NumPredsLeft)
+      TopReadySUs.push_back(SU);
+  }
+
+  while (!TopReadySUs.empty()) {
+    SUnit *SU = pickNode();
+    ScheduledSUnits.push_back(SU);
+    TopRPTracker.setPos(SU->getInstr());
+    TopRPTracker.advance();
+    nodeScheduled(SU);
+  }
+
+  // TODO: compute InternalAdditionnalPressure.
+  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+  // Check everything is right.
+#ifndef NDEBUG
+  assert(SUnits.size() == ScheduledSUnits.size() &&
+            TopReadySUs.empty());
+  for (SUnit* SU : SUnits) {
+    assert(SU->isScheduled &&
+              SU->NumPredsLeft == 0);
+  }
+#endif
+
+  Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+  for (SUnit* SU : SUnits) {
+    SU->isScheduled = false;
+    for (SDep& Succ : SU->Succs) {
+      if (BC->isSUInBlock(Succ.getSUnit(), ID))
+        undoReleaseSucc(SU, &Succ);
+    }
+  }
+  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+  ScheduledSUnits.clear();
+  Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    ++SuccSU->WeakPredsLeft;
+    return;
+  }
+  ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(DAG);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(nullptr);
+  }
+#endif
+
+  --SuccSU->NumPredsLeft;
+}
+
+/// Release Successors of the SU that are in the block or not.
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+  for (SDep& Succ : SU->Succs) {
+    SUnit *SuccSU = Succ.getSUnit();
+
+    if (SuccSU->NodeNum >= DAG->SUnits.size())
+        continue;
+
+    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+      continue;
+
+    releaseSucc(SU, &Succ);
+    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+      TopReadySUs.push_back(SuccSU);
+  }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+  // Is in TopReadySUs
+  assert (!SU->NumPredsLeft);
+  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
+  if (I == TopReadySUs.end()) {
+    dbgs() << "Data Structure Bug in SI Scheduler\n";
+    llvm_unreachable(nullptr);
+  }
+  TopReadySUs.erase(I);
+
+  releaseSuccessors(SU, true);
+  // Scheduling this node will trigger a wait,
+  // thus propagate to other instructions that they do not need to wait either.
+  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+  if (DAG->IsLowLatencySU[SU->NodeNum]) {
+     for (SDep& Succ : SU->Succs) {
+      std::map<unsigned, unsigned>::iterator I =
+        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+      if (I != NodeNum2Index.end())
+        HasLowLatencyNonWaitedParent[I->second] = 1;
+    }
+  }
+  SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+  // We remove links from outside blocks to enable scheduling inside the block.
+  for (SUnit* SU : SUnits) {
+    releaseSuccessors(SU, false);
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      HighLatencyBlock = true;
+  }
+  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// we maintain ascending order of IDs
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+  unsigned PredID = Pred->getID();
+
+  // Check if not already predecessor.
+  for (SIScheduleBlock* P : Preds) {
+    if (PredID == P->getID())
+      return;
+  }
+  Preds.push_back(Pred);
+
+  assert(none_of(Succs,
+                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+         "Loop in the Block Graph!");
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+  unsigned SuccID = Succ->getID();
+
+  // Check if not already predecessor.
+  for (SIScheduleBlock* S : Succs) {
+    if (SuccID == S->getID())
+      return;
+  }
+  if (Succ->isHighLatencyBlock())
+    ++NumHighLatencySuccessors;
+  Succs.push_back(Succ);
+  assert(none_of(Preds,
+                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
+         "Loop in the Block Graph!");
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+  dbgs() << "Block (" << ID << ")\n";
+  if (!full)
+    return;
+
+  dbgs() << "\nContains High Latency Instruction: "
+         << HighLatencyBlock << '\n';
+  dbgs() << "\nDepends On:\n";
+  for (SIScheduleBlock* P : Preds) {
+    P->printDebug(false);
+  }
+
+  dbgs() << "\nSuccessors:\n";
+  for (SIScheduleBlock* S : Succs) {
+    S->printDebug(false);
+  }
+
+  if (Scheduled) {
+    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+    dbgs() << "LiveIns:\n";
+    for (unsigned Reg : LiveInRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+    dbgs() << "\nLiveOuts:\n";
+    for (unsigned Reg : LiveOutRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+  }
+
+  dbgs() << "\nInstructions:\n";
+  if (!Scheduled) {
+    for (SUnit* SU : SUnits) {
+      SU->dump(DAG);
+    }
+  } else {
+    for (SUnit* SU : SUnits) {
+      SU->dump(DAG);
+    }
+  }
+
+  dbgs() << "///////////////////////\n";
+}
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
+
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+    Blocks.find(BlockVariant);
+  if (B == Blocks.end()) {
+    SIScheduleBlocks Res;
+    createBlocksForVariant(BlockVariant);
+    topologicalSort();
+    scheduleInsideBlocks();
+    fillStats();
+    Res.Blocks = CurrentBlocks;
+    Res.TopDownIndex2Block = TopDownIndex2Block;
+    Res.TopDownBlock2Index = TopDownBlock2Index;
+    Blocks[BlockVariant] = Res;
+    return Res;
+  } else {
+    return B->second;
+  }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+  if (SU->NodeNum >= DAG->SUnits.size())
+    return false;
+  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      CurrentColoring[SU->NodeNum] = NextReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned NumHighLatencies = 0;
+  unsigned GroupSize;
+  unsigned Color = NextReservedID;
+  unsigned Count = 0;
+  std::set<unsigned> FormingGroup;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum])
+      ++NumHighLatencies;
+  }
+
+  if (NumHighLatencies == 0)
+    return;
+
+  if (NumHighLatencies <= 6)
+    GroupSize = 2;
+  else if (NumHighLatencies <= 12)
+    GroupSize = 3;
+  else
+    GroupSize = 4;
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+      unsigned CompatibleGroup = true;
+      unsigned ProposedColor = Color;
+      for (unsigned j : FormingGroup) {
+        // TODO: Currently CompatibleGroup will always be false,
+        // because the graph enforces the load order. This
+        // can be fixed, but as keeping the load order is often
+        // good for performance that causes a performance hit (both
+        // the default scheduler and this scheduler).
+        // When this scheduler determines a good load order,
+        // this can be fixed.
+        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+          CompatibleGroup = false;
+      }
+      if (!CompatibleGroup || ++Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        if (!CompatibleGroup) {
+          ProposedColor = Color;
+          FormingGroup.insert(SU->NodeNum);
+        }
+        Count = 0;
+      } else {
+        FormingGroup.insert(SU->NodeNum);
+      }
+      CurrentColoring[SU->NodeNum] = ProposedColor;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+  CurrentTopDownReservedDependencyColoring.clear();
+  CurrentBottomUpReservedDependencyColoring.clear();
+
+  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+  // Traverse TopDown, and give different colors to SUs depending
+  // on which combination of High Latencies they depend on.
+
+  for (unsigned SUNum : DAG->TopDownIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+   for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+    }
+    // Color 0 by default.
+    if (SUColors.empty())
+      continue;
+    // Same color than parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+
+  ColorCombinations.clear();
+
+  // Same as before, but BottomUp.
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+
+    // Already given.
+    if (CurrentColoring[SU->NodeNum]) {
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        CurrentColoring[SU->NodeNum];
+      continue;
+    }
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+    }
+    // Keep color 0.
+    if (SUColors.empty())
+      continue;
+    // Same color than parents.
+    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+        *SUColors.begin();
+    else {
+      std::map<std::set<unsigned>, unsigned>::iterator Pos =
+        ColorCombinations.find(SUColors);
+      if (Pos != ColorCombinations.end()) {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+      } else {
+        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+          NextNonReservedID;
+        ColorCombinations[SUColors] = NextNonReservedID++;
+      }
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+  // Every combination of colors given by the top down
+  // and bottom up Reserved node dependency
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    std::pair<unsigned, unsigned> SUColors;
+
+    // High latency instructions: already given.
+    if (CurrentColoring[SU->NodeNum])
+      continue;
+
+    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+      ColorCombinations.find(SUColors);
+    if (Pos != ColorCombinations.end()) {
+      CurrentColoring[SU->NodeNum] = Pos->second;
+    } else {
+      CurrentColoring[SU->NodeNum] = NextNonReservedID;
+      ColorCombinations[SUColors] = NextNonReservedID++;
+    }
+  }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::vector<int> PendingColoring = CurrentColoring;
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+    std::set<unsigned> SUColorsPending;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+        SUColors.insert(CurrentColoring[Succ->NodeNum]);
+      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+      PendingColoring[SU->NodeNum] = *SUColors.begin();
+    else // TODO: Attribute new colors depending on color
+         // combination of children.
+      PendingColoring[SU->NodeNum] = NextNonReservedID++;
+  }
+  CurrentColoring = PendingColoring;
+}
+
+
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+  unsigned PreviousColor;
+  std::set<unsigned> SeenColors;
+
+  if (DAGSize <= 1)
+    return;
+
+  PreviousColor = CurrentColoring[0];
+
+  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned CurrentColor = CurrentColoring[i];
+    unsigned PreviousColorSave = PreviousColor;
+    assert(i == SU->NodeNum);
+
+    if (CurrentColor != PreviousColor)
+      SeenColors.insert(PreviousColor);
+    PreviousColor = CurrentColor;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (SeenColors.find(CurrentColor) == SeenColors.end())
+      continue;
+
+    if (PreviousColorSave != CurrentColor)
+      CurrentColoring[i] = NextNonReservedID++;
+    else
+      CurrentColoring[i] = CurrentColoring[i-1];
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    // No predecessor: Vgpr constant loading.
+    // Low latency instructions usually have a predecessor (the address)
+    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+      SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+       SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+  unsigned DAGSize = DAG->SUnits.size();
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+       SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+  }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned, unsigned> ColorCount;
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    unsigned color = CurrentColoring[SU->NodeNum];
+    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
+      if (Pos != ColorCount.end()) {
+        ++ColorCount[color];
+      } else {
+        ColorCount[color] = 1;
+      }
+  }
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    unsigned color = CurrentColoring[SU->NodeNum];
+    std::set<unsigned> SUColors;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    if (ColorCount[color] > 1)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+       SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      SUColors.insert(CurrentColoring[Succ->NodeNum]);
+    }
+    if (SUColors.size() == 1 && *SUColors.begin() != color) {
+      --ColorCount[color];
+      CurrentColoring[SU->NodeNum] = *SUColors.begin();
+      ++ColorCount[*SUColors.begin()];
+    }
+  }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+  // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+  unsigned DAGSize = DAG->SUnits.size();
+  int GroupID = NextNonReservedID++;
+
+  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+    SUnit *SU = &DAG->SUnits[SUNum];
+    bool hasSuccessor = false;
+
+    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+      continue;
+
+    for (SDep& SuccDep : SU->Succs) {
+       SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      hasSuccessor = true;
+    }
+    if (!hasSuccessor)
+      CurrentColoring[SU->NodeNum] = GroupID;
+  }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+  unsigned DAGSize = DAG->SUnits.size();
+  std::map<unsigned,unsigned> RealID;
+
+  CurrentBlocks.clear();
+  CurrentColoring.clear();
+  CurrentColoring.resize(DAGSize, 0);
+  Node2CurrentBlock.clear();
+
+  // Restore links previous scheduling variant has overridden.
+  DAG->restoreSULinksLeft();
+
+  NextReservedID = 1;
+  NextNonReservedID = DAGSize + 1;
+
+  DEBUG(dbgs() << "Coloring the graph\n");
+
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+    colorHighLatenciesGroups();
+  else
+    colorHighLatenciesAlone();
+  colorComputeReservedDependencies();
+  colorAccordingToReservedDependencies();
+  colorEndsAccordingToDependencies();
+  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+    colorForceConsecutiveOrderInGroup();
+  regroupNoUserInstructions();
+  colorMergeConstantLoadsNextGroup();
+  colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Put SUs of same color into same block
+  Node2CurrentBlock.resize(DAGSize, -1);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    unsigned Color = CurrentColoring[SU->NodeNum];
+    if (RealID.find(Color) == RealID.end()) {
+      int ID = CurrentBlocks.size();
+      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
+      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+      RealID[Color] = ID;
+    }
+    CurrentBlocks[RealID[Color]]->addUnit(SU);
+    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+  }
+
+  // Build dependencies between blocks.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &DAG->SUnits[i];
+    int SUID = Node2CurrentBlock[i];
+     for (SDep& SuccDep : SU->Succs) {
+       SUnit *Succ = SuccDep.getSUnit();
+      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+    }
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+        continue;
+      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+    }
+  }
+
+  // Free root and leafs of all blocks to enable scheduling inside them.
+  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->finalizeUnits();
+  }
+  DEBUG(
+    dbgs() << "Blocks created:\n\n";
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+// Two functions taken from Codegen/MachineScheduler.cpp
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  for (; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+  unsigned DAGSize = CurrentBlocks.size();
+  std::vector<int> WorkList;
+
+  DEBUG(dbgs() << "Topological Sort\n");
+
+  WorkList.reserve(DAGSize);
+  TopDownIndex2Block.resize(DAGSize);
+  TopDownBlock2Index.resize(DAGSize);
+  BottomUpIndex2Block.resize(DAGSize);
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    unsigned Degree = Block->getSuccs().size();
+    TopDownBlock2Index[i] = Degree;
+    if (Degree == 0) {
+      WorkList.push_back(i);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    int i = WorkList.back();
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    WorkList.pop_back();
+    TopDownBlock2Index[i] = --Id;
+    TopDownIndex2Block[Id] = i;
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      if (!--TopDownBlock2Index[Pred->getID()])
+        WorkList.push_back(Pred->getID());
+    }
+  }
+
+#ifndef NDEBUG
+  // Check correctness of the ordering.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    for (SIScheduleBlock* Pred : Block->getPreds()) {
+      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+      "Wrong Top Down topological sorting");
+    }
+  }
+#endif
+
+  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+                                         TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+  // We do schedule a valid scheduling such that a Block corresponds
+  // to a range of instructions.
+  DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    Block->fastSchedule();
+  }
+
+  // Note: the following code, and the part restoring previous position
+  // is by far the most expensive operation of the Scheduler.
+
+  // Do not update CurrentTop.
+  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+  std::vector<MachineBasicBlock::iterator> PosOld;
+  std::vector<MachineBasicBlock::iterator> PosNew;
+  PosOld.reserve(DAG->SUnits.size());
+  PosNew.reserve(DAG->SUnits.size());
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs) {
+      MachineInstr *MI = SU->getInstr();
+      MachineBasicBlock::iterator Pos = MI;
+      PosOld.push_back(Pos);
+      if (&*CurrentTopFastSched == MI) {
+        PosNew.push_back(Pos);
+        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+                                          DAG->getCurrentBottom());
+      } else {
+        // Update the instruction stream.
+        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+        // Update LiveIntervals.
+        // Note: Moving all instructions and calling handleMove every time
+        // is the most cpu intensive operation of the scheduler.
+        // It would gain a lot if there was a way to recompute the
+        // LiveIntervals for the entire scheduling region.
+        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
+        PosNew.push_back(CurrentTopFastSched);
+      }
+    }
+  }
+
+  // Now we have Block of SUs == Block of MI.
+  // We do the final schedule for the instructions inside the block.
+  // The property that all the SUs of the Block are grouped together as MI
+  // is used for correct reg usage tracking.
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SIScheduleBlock *Block = CurrentBlocks[i];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+  }
+
+  DEBUG(dbgs() << "Restoring MI Pos\n");
+  // Restore old ordering (which prevents a LIS->handleMove bug).
+  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+    MachineBasicBlock::iterator POld = PosOld[i-1];
+    MachineBasicBlock::iterator PNew = PosNew[i-1];
+    if (PNew != POld) {
+      // Update the instruction stream.
+      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+      // Update LiveIntervals.
+      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
+    }
+  }
+
+  DEBUG(
+    for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+      SIScheduleBlock *Block = CurrentBlocks[i];
+      Block->printDebug(true);
+    }
+  );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+  unsigned DAGSize = CurrentBlocks.size();
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = TopDownIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getPreds().empty())
+      Block->Depth = 0;
+    else {
+      unsigned Depth = 0;
+      for (SIScheduleBlock *Pred : Block->getPreds()) {
+        if (Depth < Pred->Depth + 1)
+          Depth = Pred->Depth + 1;
+      }
+      Block->Depth = Depth;
+    }
+  }
+
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    int BlockIndice = BottomUpIndex2Block[i];
+    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+    if (Block->getSuccs().empty())
+      Block->Height = 0;
+    else {
+      unsigned Height = 0;
+      for (SIScheduleBlock *Succ : Block->getSuccs()) {
+        if (Height < Succ->Height + 1)
+          Height = Succ->Height + 1;
+      }
+      Block->Height = Height;
+    }
+  }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                                                   SISchedulerBlockSchedulerVariant Variant,
+                                                   SIScheduleBlocks  BlocksStruct) :
+  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+  // Fill the usage of every output
+  // Warning: while by construction we always have a link between two blocks
+  // when one needs a result from the other, the number of users of an output
+  // is not the sum of child blocks having as input the same virtual register.
+  // Here is an example. A produces x and y. B eats x and produces x'.
+  // C eats x' and y. The register coalescer may have attributed the same
+  // virtual register to x and x'.
+  // To count accurately, we do a topological sort. In case the register is
+  // found for several parents, we increment the usage of the one with the
+  // highest topological index.
+  LiveOutRegsNumUsages.resize(Blocks.size());
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      int topoInd = -1;
+      for (SIScheduleBlock* Pred: Block->getPreds()) {
+        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+        if (RegPos != PredOutRegs.end()) {
+          Found = true;
+          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
+            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
+          }
+        }
+      }
+
+      if (!Found)
+        continue;
+
+      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
+      std::map<unsigned, unsigned>::iterator RegPos =
+        LiveOutRegsNumUsages[PredID].find(Reg);
+      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
+        ++LiveOutRegsNumUsages[PredID][Reg];
+      } else {
+        LiveOutRegsNumUsages[PredID][Reg] = 1;
+      }
+    }
+  }
+
+  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
+  BlockNumPredsLeft.resize(Blocks.size());
+  BlockNumSuccsLeft.resize(Blocks.size());
+
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    BlockNumPredsLeft[i] = Block->getPreds().size();
+    BlockNumSuccsLeft[i] = Block->getSuccs().size();
+  }
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    assert(Block->getID() == i);
+  }
+#endif
+
+  std::set<unsigned> InRegs = DAG->getInRegs();
+  addLiveRegs(InRegs);
+
+  // Fill LiveRegsConsumers for regs that were already
+  // defined before scheduling.
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    for (unsigned Reg : Block->getInRegs()) {
+      bool Found = false;
+      for (SIScheduleBlock* Pred: Block->getPreds()) {
+        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+        if (RegPos != PredOutRegs.end()) {
+          Found = true;
+          break;
+        }
+      }
+
+      if (!Found) {
+        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
+          LiveRegsConsumers[Reg] = 1;
+        else
+          ++LiveRegsConsumers[Reg];
+      }
+    }
+  }
+
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    SIScheduleBlock *Block = Blocks[i];
+    if (BlockNumPredsLeft[i] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+  }
+
+  while (SIScheduleBlock *Block = pickBlock()) {
+    BlocksScheduled.push_back(Block);
+    blockScheduled(Block);
+  }
+
+  DEBUG(
+    dbgs() << "Block Order:";
+    for (SIScheduleBlock* Block : BlocksScheduled) {
+      dbgs() << ' ' << Block->getID();
+    }
+  );
+}
+
+bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                                                   SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Try to hide high latencies.
+  if (tryLess(TryCand.LastPosHighLatParentScheduled,
+              Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+    return true;
+  // Schedule high latencies early so you can hide them better.
+  if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+                 TryCand, Cand, Latency))
+    return true;
+  if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+                                          TryCand, Cand, Depth))
+    return true;
+  if (tryGreater(TryCand.NumHighLatencySuccessors,
+                 Cand.NumHighLatencySuccessors,
+                 TryCand, Cand, Successor))
+    return true;
+  return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                                                    SIBlockSchedCandidate &TryCand) {
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+              TryCand, Cand, RegUsage))
+    return true;
+  if (tryGreater(TryCand.NumSuccessors > 0,
+                 Cand.NumSuccessors > 0,
+                 TryCand, Cand, Successor))
+    return true;
+  if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+    return true;
+  if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+              TryCand, Cand, RegUsage))
+    return true;
+  return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+  SIBlockSchedCandidate Cand;
+  std::vector<SIScheduleBlock*>::iterator Best;
+  SIScheduleBlock *Block;
+  if (ReadyBlocks.empty())
+    return nullptr;
+
+  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+                        VregCurrentUsage, SregCurrentUsage);
+  if (VregCurrentUsage > maxVregUsage)
+    maxVregUsage = VregCurrentUsage;
+  if (VregCurrentUsage > maxSregUsage)
+    maxSregUsage = VregCurrentUsage;
+  DEBUG(
+    dbgs() << "Picking New Blocks\n";
+    dbgs() << "Available: ";
+    for (SIScheduleBlock* Block : ReadyBlocks)
+      dbgs() << Block->getID() << ' ';
+    dbgs() << "\nCurrent Live:\n";
+    for (unsigned Reg : LiveRegs)
+      dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+    dbgs() << '\n';
+    dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+    dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+  );
+
+  Cand.Block = nullptr;
+  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+       E = ReadyBlocks.end(); I != E; ++I) {
+    SIBlockSchedCandidate TryCand;
+    TryCand.Block = *I;
+    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+    TryCand.VGPRUsageDiff =
+      checkRegUsageImpact(TryCand.Block->getInRegs(),
+                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+    TryCand.NumHighLatencySuccessors =
+      TryCand.Block->getNumHighLatencySuccessors();
+    TryCand.LastPosHighLatParentScheduled =
+      (unsigned int) std::max<int> (0,
+         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+           LastPosWaitedHighLatency);
+    TryCand.Height = TryCand.Block->Height;
+    // Try not to increase VGPR usage too much, else we may spill.
+    if (VregCurrentUsage > 120 ||
+        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+      if (!tryCandidateRegUsage(Cand, TryCand) &&
+          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+        tryCandidateLatency(Cand, TryCand);
+    } else {
+      if (!tryCandidateLatency(Cand, TryCand))
+        tryCandidateRegUsage(Cand, TryCand);
+    }
+    if (TryCand.Reason != NoCand) {
+      Cand.setBest(TryCand);
+      Best = I;
+      DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+                   << getReasonStr(Cand.Reason) << '\n');
+    }
+  }
+
+  DEBUG(
+    dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+    dbgs() << "Is a block with high latency instruction: "
+      << (Cand.IsHighLatency ? "yes\n" : "no\n");
+    dbgs() << "Position of last high latency dependency: "
+           << Cand.LastPosHighLatParentScheduled << '\n';
+    dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+    dbgs() << '\n';
+  );
+
+  Block = Cand.Block;
+  ReadyBlocks.erase(Best);
+  return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR Usage.
+
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+  for (unsigned Reg : Regs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    // If not already in the live set, then add it.
+    (void) LiveRegs.insert(Reg);
+  }
+}
+
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+                                       std::set<unsigned> &Regs) {
+  for (unsigned Reg : Regs) {
+    // For now only track virtual registers.
+    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
+    assert (Pos != LiveRegs.end() && // Reg must be live.
+               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
+               LiveRegsConsumers[Reg] >= 1);
+    --LiveRegsConsumers[Reg];
+    if (LiveRegsConsumers[Reg] == 0)
+      LiveRegs.erase(Pos);
+  }
+}
+
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+  for (SIScheduleBlock* Block : Parent->getSuccs()) {
+    --BlockNumPredsLeft[Block->getID()];
+    if (BlockNumPredsLeft[Block->getID()] == 0) {
+      ReadyBlocks.push_back(Block);
+    }
+    // TODO: Improve check. When the dependency between the high latency
+    // instructions and the instructions of the other blocks are WAR or WAW
+    // there will be no wait triggered. We would like these cases to not
+    // update LastPosHighLatencyParentScheduled.
+    if (Parent->isHighLatencyBlock())
+      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+  }
+}
+
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+  decreaseLiveRegs(Block, Block->getInRegs());
+  addLiveRegs(Block->getOutRegs());
+  releaseBlockSuccs(Block);
+  for (std::map<unsigned, unsigned>::iterator RegI =
+       LiveOutRegsNumUsages[Block->getID()].begin(),
+       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
+    std::pair<unsigned, unsigned> RegP = *RegI;
+    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
+      LiveRegsConsumers[RegP.first] = RegP.second;
+    else {
+      assert(LiveRegsConsumers[RegP.first] == 0);
+      LiveRegsConsumers[RegP.first] += RegP.second;
+    }
+  }
+  if (LastPosHighLatencyParentScheduled[Block->getID()] >
+        (unsigned)LastPosWaitedHighLatency)
+    LastPosWaitedHighLatency =
+      LastPosHighLatencyParentScheduled[Block->getID()];
+  ++NumBlockScheduled;
+}
+
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                     std::set<unsigned> &OutRegs) {
+  std::vector<int> DiffSetPressure;
+  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
+
+  for (unsigned Reg : InRegs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (LiveRegsConsumers[Reg] > 1)
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] -= PSetI.getWeight();
+    }
+  }
+
+  for (unsigned Reg : OutRegs) {
+    // For now only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      DiffSetPressure[*PSetI] += PSetI.getWeight();
+    }
+  }
+
+  return DiffSetPressure;
+}
+
+// SIScheduler //
+
+struct SIScheduleBlockResult
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
+  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
+  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+  std::vector<SIScheduleBlock*> ScheduledBlocks;
+  struct SIScheduleBlockResult Res;
+
+  ScheduledBlocks = Scheduler.getBlocks();
+
+  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
+    SIScheduleBlock *Block = ScheduledBlocks[b];
+    std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+    for (SUnit* SU : SUs)
+      Res.SUs.push_back(SU->NodeNum);
+  }
+
+  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
+  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
+  return Res;
+}
+
+// SIScheduleDAGMI //
+
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
+  SITII = static_cast<const SIInstrInfo*>(TII);
+  SITRI = static_cast<const SIRegisterInfo*>(TRI);
+
+  VGPRSetID = SITRI->getVGPRPressureSet();
+  SGPRSetID = SITRI->getSGPRPressureSet();
+}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() = default;
+
+// Code adapted from scheduleDAG.cpp
+// Does a topological sort over the SUs.
+// Both TopDown and BottomUp
+void SIScheduleDAGMI::topologicalSort() {
+  Topo.InitDAGTopologicalSorting();
+
+  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
+  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
+}
+
+// Move low latencies further from their user without
+// increasing SGPR usage (in general)
+// This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR Usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads if it make sense, etc
+void SIScheduleDAGMI::moveLowLatencies() {
+   unsigned DAGSize = SUnits.size();
+   int LastLowLatencyUser = -1;
+   int LastLowLatencyPos = -1;
+
+   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[ScheduledSUnits[i]];
+    bool IsLowLatencyUser = false;
+    unsigned MinPos = 0;
+
+    for (SDep& PredDep : SU->Preds) {
+      SUnit *Pred = PredDep.getSUnit();
+      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
+        IsLowLatencyUser = true;
+      }
+      if (Pred->NodeNum >= DAGSize)
+        continue;
+      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+      if (PredPos >= MinPos)
+        MinPos = PredPos + 1;
+    }
+
+    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+      unsigned BestPos = LastLowLatencyUser + 1;
+      if ((int)BestPos <= LastLowLatencyPos)
+        BestPos = LastLowLatencyPos + 1;
+      if (BestPos < MinPos)
+        BestPos = MinPos;
+      if (BestPos < i) {
+        for (unsigned u = i; u > BestPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[BestPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+      }
+      LastLowLatencyPos = BestPos;
+      if (IsLowLatencyUser)
+        LastLowLatencyUser = BestPos;
+    } else if (IsLowLatencyUser) {
+      LastLowLatencyUser = i;
+    // Moves COPY instructions on which depends
+    // the low latency instructions too.
+    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+      bool CopyForLowLat = false;
+      for (SDep& SuccDep : SU->Succs) {
+        SUnit *Succ = SuccDep.getSUnit();
+        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
+          CopyForLowLat = true;
+        }
+      }
+      if (!CopyForLowLat)
+        continue;
+      if (MinPos < i) {
+        for (unsigned u = i; u > MinPos; --u) {
+          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+          ScheduledSUnits[u] = ScheduledSUnits[u-1];
+        }
+        ScheduledSUnits[MinPos] = SU->NodeNum;
+        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+      }
+    }
+  }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    SUnits[i].isScheduled = false;
+    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+  }
+}
+
+// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
+template<typename _Iterator> void
+SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
+                                  unsigned &VgprUsage, unsigned &SgprUsage) {
+  VgprUsage = 0;
+  SgprUsage = 0;
+  for (_Iterator RegI = First; RegI != End; ++RegI) {
+    unsigned Reg = *RegI;
+    // For now only track virtual registers
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    PSetIterator PSetI = MRI.getPressureSets(Reg);
+    for (; PSetI.isValid(); ++PSetI) {
+      if (*PSetI == VGPRSetID)
+        VgprUsage += PSetI.getWeight();
+      else if (*PSetI == SGPRSetID)
+        SgprUsage += PSetI.getWeight();
+    }
+  }
+}
+
+void SIScheduleDAGMI::schedule()
+{
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  SIScheduleBlockResult Best, Temp;
+  DEBUG(dbgs() << "Preparing Scheduling\n");
+
+  buildDAGWithRegPressure();
+  DEBUG(
+    for(SUnit& SU : SUnits)
+       SU.dumpAll(this)
+  );
+
+  topologicalSort();
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+  // functions, but to make them happy we must initialize
+  // the default Scheduler implementation (even if we do not
+  // run it)
+  SchedImpl->initialize(this);
+  initQueues(TopRoots, BotRoots);
+
+  // Fill some stats to help scheduling.
+
+  SUnitsLinksBackup = SUnits;
+  IsLowLatencySU.clear();
+  LowLatencyOffset.clear();
+  IsHighLatencySU.clear();
+
+  IsLowLatencySU.resize(SUnits.size(), 0);
+  LowLatencyOffset.resize(SUnits.size(), 0);
+  IsHighLatencySU.resize(SUnits.size(), 0);
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    unsigned BaseLatReg;
+    int64_t OffLatReg;
+    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+      IsLowLatencySU[i] = 1;
+      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
+                                       TRI))
+        LowLatencyOffset[i] = OffLatReg;
+    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
+      IsHighLatencySU[i] = 1;
+  }
+
+  SIScheduler Scheduler(this);
+  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
+
+  // if VGPR usage is extremely high, try other good performing variants
+  // which could lead to lower VGPR usage
+  if (Best.MaxVGPRUsage > 180) {
+    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+      { LatenciesAlone, BlockRegUsageLatency },
+//      { LatenciesAlone, BlockRegUsage },
+      { LatenciesGrouped, BlockLatencyRegUsage },
+//      { LatenciesGrouped, BlockRegUsageLatency },
+//      { LatenciesGrouped, BlockRegUsage },
+      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
+    };
+    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+      Temp = Scheduler.scheduleVariant(v.first, v.second);
+      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+        Best = Temp;
+    }
+  }
+  // if VGPR usage is still extremely high, we may spill. Try other variants
+  // which are less performing, but that could lead to lower VGPR usage.
+  if (Best.MaxVGPRUsage > 200) {
+    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+//      { LatenciesAlone, BlockRegUsageLatency },
+      { LatenciesAlone, BlockRegUsage },
+//      { LatenciesGrouped, BlockLatencyRegUsage },
+      { LatenciesGrouped, BlockRegUsageLatency },
+      { LatenciesGrouped, BlockRegUsage },
+//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+      { LatenciesAlonePlusConsecutive, BlockRegUsage }
+    };
+    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+      Temp = Scheduler.scheduleVariant(v.first, v.second);
+      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+        Best = Temp;
+    }
+  }
+
+  ScheduledSUnits = Best.SUs;
+  ScheduledSUnitsInv.resize(SUnits.size());
+
+  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+  }
+
+  moveLowLatencies();
+
+  // Tell the outside world about the result of the scheduling.
+
+  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+  TopRPTracker.setPos(CurrentTop);
+
+  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+       E = ScheduledSUnits.end(); I != E; ++I) {
+    SUnit *SU = &SUnits[*I];
+
+    scheduleMI(SU, true);
+
+    DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+                 << *SU->getInstr());
+  }
+
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+
+  DEBUG({
+      unsigned BBNum = begin()->getParent()->getNumber();
+      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
new file mode 100644
index 000000000000..77c07350d325
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -0,0 +1,493 @@
+//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+
+namespace llvm {
+
+enum SIScheduleCandReason {
+  NoCand,
+  RegUsage,
+  Latency,
+  Successor,
+  Depth,
+  NodeOrder
+};
+
+struct SISchedulerCandidate {
+  // The reason for this candidate.
+  SIScheduleCandReason Reason;
+
+  // Set of reasons that apply to multiple candidates.
+  uint32_t RepeatReasonSet;
+
+  SISchedulerCandidate()
+    :  Reason(NoCand), RepeatReasonSet(0) {}
+
+  bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
+  void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
+class SIScheduleBlock {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator *BC;
+
+  std::vector<SUnit*> SUnits;
+  std::map<unsigned, unsigned> NodeNum2Index;
+  std::vector<SUnit*> TopReadySUs;
+  std::vector<SUnit*> ScheduledSUnits;
+
+  /// The top of the unscheduled zone.
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  // Pressure: number of said class of registers needed to
+  // store the live virtual and real registers.
+  // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
+  // Pressure of additional registers required inside the block.
+  std::vector<unsigned> InternalAdditionnalPressure;
+  // Pressure of input and output registers
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  // Registers required by the block, and outputs.
+  // We do track only virtual registers.
+  // Note that some registers are not 32 bits,
+  // and thus the pressure is not equal
+  // to the number of live registers.
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+  unsigned NumHighLatencySuccessors;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
+
+  ~SIScheduleBlock() = default;
+
+  unsigned getID() const { return ID; }
+
+  /// Functions for Block construction.
+  void addUnit(SUnit *SU);
+
+  // When all SUs have been added.
+  void finalizeUnits();
+
+  // Add block pred, which has instruction predecessor of SU.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+  const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+  unsigned Height;  // Maximum topdown path length to block without outputs
+  unsigned Depth;   // Maximum bottomup path length to block without inputs
+
+  unsigned getNumHighLatencySuccessors() const {
+    return NumHighLatencySuccessors;
+  }
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximative.
+  // Ideally should take into accounts some instructions (rcp, etc)
+  // are 4 times slower.
+  int getCost() { return SUnits.size(); }
+
+  // The block Predecessors and Successors must be all registered
+  // before fastSchedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize reg pressure and
+  // low latencies, and will fill liveins and liveouts.
+  // Needs all MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling,
+  // it is just used to allow correct track of live registers.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+  // Needs the block to be scheduled inside
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+    SUnit *SU = nullptr;
+
+    unsigned SGPRUsage;
+    unsigned VGPRUsage;
+    bool IsLowLatency;
+    unsigned LowLatencyOffset;
+    bool HasLowLatencyNonWaitedParent;
+
+    SISchedCandidate() = default;
+
+    bool isValid() const { return SU; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SISchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      SU = Best.SU;
+      Reason = Best.Reason;
+      SGPRUsage = Best.SGPRUsage;
+      VGPRUsage = Best.VGPRUsage;
+      IsLowLatency = Best.IsLowLatency;
+      LowLatencyOffset = Best.LowLatencyOffset;
+      HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+    }
+  };
+
+  void undoSchedule();
+
+  void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
+  void releaseSucc(SUnit *SU, SDep *SuccEdge);
+  // InOrOutBlock: restrict to links pointing inside the block (true),
+  // or restrict to links pointing outside the block (false).
+  void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+
+  void nodeScheduled(SUnit *SU);
+  void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  SUnit* pickNode();
+  void traceCandidate(const SISchedCandidate &Cand);
+  void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                       MachineBasicBlock::iterator EndBlock);
+};
+
+struct SIScheduleBlocks {
+  std::vector<SIScheduleBlock*> Blocks;
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+};
+
+enum SISchedulerBlockCreatorVariant {
+    LatenciesAlone,
+    LatenciesGrouped,
+    LatenciesAlonePlusConsecutive
+};
+
+class SIScheduleBlockCreator {
+  SIScheduleDAGMI *DAG;
+  // unique_ptr handles freeing memory for us.
+  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+  std::map<SISchedulerBlockCreatorVariant,
+           SIScheduleBlocks> Blocks;
+  std::vector<SIScheduleBlock*> CurrentBlocks;
+  std::vector<int> Node2CurrentBlock;
+
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+  std::vector<int> BottomUpIndex2Block;
+
+  // 0 -> Color not given.
+  // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
+  // Above -> Other groups.
+  int NextReservedID;
+  int NextNonReservedID;
+  std::vector<int> CurrentColoring;
+  std::vector<int> CurrentTopDownReservedDependencyColoring;
+  std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+  ~SIScheduleBlockCreator();
+
+  SIScheduleBlocks
+  getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+  bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+  // Give a Reserved color to every high latency.
+  void colorHighLatenciesAlone();
+
+  // Create groups of high latencies with a Reserved color.
+  void colorHighLatenciesGroups();
+
+  // Compute coloring for topdown and bottom traversals with
+  // different colors depending on dependencies on Reserved colors.
+  void colorComputeReservedDependencies();
+
+  // Give color to all non-colored SUs according to Reserved groups dependencies.
+  void colorAccordingToReservedDependencies();
+
+  // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
+  // The new colors are computed according to the dependencies on the other blocks
+  // formed with colorAccordingToReservedDependencies.
+  void colorEndsAccordingToDependencies();
+
+  // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
+  void colorForceConsecutiveOrderInGroup();
+
+  // Merge Constant loads that have all their users into another group to the group.
+  // (TODO: else if all their users depend on the same group, put them there)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge SUs that have all their users into another group to the group
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge SUs that have all their users into another group to the group,
+  // but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge SUs that have all their users into another group to the group,
+  // but only if the group is no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divides Blocks with important size.
+  // Idea of implementation: attribute new colors depending on topdown and
+  // bottom up links to other blocks.
+  void cutHugeBlocks();
+
+  // Put in one group all instructions with no users in this scheduling region
+  // (we'd want these groups be at the end).
+  void regroupNoUserInstructions();
+
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+  void topologicalSort();
+
+  void scheduleInsideBlocks();
+
+  void fillStats();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatencyRegUsage,
+  BlockRegUsageLatency,
+  BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Num of schedulable unscheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+  int LastPosWaitedHighLatency;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently is only approximation.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() = default;
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
+
+  unsigned getVGPRUsage() { return maxVregUsage; }
+  unsigned getSGPRUsage() { return maxSregUsage; }
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best Block candidate.
+    SIScheduleBlock *Block = nullptr;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumSuccessors;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+    unsigned Height;
+
+    SIBlockSchedCandidate() = default;
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SIBlockSchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      Block = Best.Block;
+      Reason = Best.Reason;
+      IsHighLatency = Best.IsHighLatency;
+      VGPRUsageDiff = Best.VGPRUsageDiff;
+      NumSuccessors = Best.NumSuccessors;
+      NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+      LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+      Height = Best.Height;
+    }
+  };
+
+  bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                           SIBlockSchedCandidate &TryCand);
+  bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                            SIBlockSchedCandidate &TryCand);
+  SIScheduleBlock *pickBlock();
+
+  void addLiveRegs(std::set<unsigned> &Regs);
+  void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+  void releaseBlockSuccs(SIScheduleBlock *Parent);
+  void blockScheduled(SIScheduleBlock *Block);
+
+  // Check register pressure change
+  // by scheduling a block with these LiveIn and LiveOut.
+  std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                       std::set<unsigned> &OutRegs);
+
+  void schedule();
+};
+
+struct SIScheduleBlockResult {
+  std::vector<unsigned> SUs;
+  unsigned MaxSGPRUsage;
+  unsigned MaxVGPRUsage;
+};
+
+class SIScheduler {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator BlockCreator;
+
+public:
+  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
+
+  ~SIScheduler() = default;
+
+  struct SIScheduleBlockResult
+  scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                  SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+class SIScheduleDAGMI final : public ScheduleDAGMILive {
+  const SIInstrInfo *SITII;
+  const SIRegisterInfo *SITRI;
+
+  std::vector<SUnit> SUnitsLinksBackup;
+
+  // For moveLowLatencies. After all Scheduling variants are tested.
+  std::vector<unsigned> ScheduledSUnits;
+  std::vector<unsigned> ScheduledSUnitsInv;
+
+  unsigned VGPRSetID;
+  unsigned SGPRSetID;
+
+public:
+  SIScheduleDAGMI(MachineSchedContext *C);
+
+  ~SIScheduleDAGMI() override;
+
+  // Entry point for the schedule.
+  void schedule() override;
+
+  // To init Block's RPTracker.
+  void initRPTracker(RegPressureTracker &RPTracker) {
+    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
+  }
+
+  MachineBasicBlock *getBB() { return BB; }
+  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
+  LiveIntervals *getLIS() { return LIS; }
+  MachineRegisterInfo *getMRI() { return &MRI; }
+  const TargetRegisterInfo *getTRI() { return TRI; }
+  SUnit& getEntrySU() { return EntrySU; }
+  SUnit& getExitSU() { return ExitSU; }
+
+  void restoreSULinksLeft();
+
+  template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
+                                                     _Iterator End,
+                                                     unsigned &VgprUsage,
+                                                     unsigned &SgprUsage);
+
+  std::set<unsigned> getInRegs() {
+    std::set<unsigned> InRegs;
+    for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+      InRegs.insert(RegMaskPair.RegUnit);
+    }
+    return InRegs;
+  }
+
+  unsigned getVGPRSetID() const { return VGPRSetID; }
+  unsigned getSGPRSetID() const { return SGPRSetID; }
+
+private:
+  void topologicalSort();
+  // After scheduling is done, improve low latency placements.
+  void moveLowLatencies();
+
+public:
+  // Some stats for scheduling inside blocks.
+  std::vector<unsigned> IsLowLatencySU;
+  std::vector<unsigned> LowLatencyOffset;
+  std::vector<unsigned> IsHighLatencySU;
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2SU;
+  std::vector<int> BottomUpIndex2SU;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
new file mode 100644
index 000000000000..4d2f917278e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -0,0 +1,304 @@
+//===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-optimize-exec-masking"
+
+namespace {
+
+class SIOptimizeExecMasking : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
+    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI optimize exec mask operations";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
+                      "SI optimize exec mask operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
+                    "SI optimize exec mask operations", false, false)
+
+char SIOptimizeExecMasking::ID = 0;
+
+char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
+
+/// If \p MI is a copy from exec, return the register copied to.
+static unsigned isCopyFromExec(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_term: {
+    const MachineOperand &Src = MI.getOperand(1);
+    if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(0).getReg();
+  }
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+/// If \p MI is a copy to exec, return the register copied from.
+static unsigned isCopyToExec(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::S_MOV_B64: {
+    const MachineOperand &Dst = MI.getOperand(0);
+    if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC)
+      return MI.getOperand(1).getReg();
+    break;
+  }
+  case AMDGPU::S_MOV_B64_term:
+    llvm_unreachable("should have been replaced");
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+static unsigned getSaveExecOp(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_AND_B64:
+    return AMDGPU::S_AND_SAVEEXEC_B64;
+  case AMDGPU::S_OR_B64:
+    return AMDGPU::S_OR_SAVEEXEC_B64;
+  case AMDGPU::S_XOR_B64:
+    return AMDGPU::S_XOR_SAVEEXEC_B64;
+  case AMDGPU::S_ANDN2_B64:
+    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
+  case AMDGPU::S_ORN2_B64:
+    return AMDGPU::S_ORN2_SAVEEXEC_B64;
+  case AMDGPU::S_NAND_B64:
+    return AMDGPU::S_NAND_SAVEEXEC_B64;
+  case AMDGPU::S_NOR_B64:
+    return AMDGPU::S_NOR_SAVEEXEC_B64;
+  case AMDGPU::S_XNOR_B64:
+    return AMDGPU::S_XNOR_SAVEEXEC_B64;
+  default:
+    return AMDGPU::INSTRUCTION_LIST_END;
+  }
+}
+
+// These are only terminators to get correct spill code placement during
+// register allocation, so turn them back into normal instructions. Only one of
+// these is expected per block.
+static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::S_MOV_B64_term: {
+    MI.setDesc(TII.get(AMDGPU::COPY));
+    return true;
+  }
+  case AMDGPU::S_XOR_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
+    return true;
+  }
+  case AMDGPU::S_ANDN2_B64_term: {
+    // This is only a terminator to get the correct spill code placement during
+    // register allocation.
+    MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+static MachineBasicBlock::reverse_iterator fixTerminators(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB) {
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+  for (; I != E; ++I) {
+    if (!I->isTerminator())
+      return I;
+
+    if (removeTerminatorBit(TII, *I))
+      return I;
+  }
+
+  return E;
+}
+
+static MachineBasicBlock::reverse_iterator findExecCopy(
+  const SIInstrInfo &TII,
+  MachineBasicBlock &MBB,
+  MachineBasicBlock::reverse_iterator I,
+  unsigned CopyToExec) {
+  const unsigned InstLimit = 25;
+
+  auto E = MBB.rend();
+  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
+    unsigned CopyFromExec = isCopyFromExec(*I);
+    if (CopyFromExec != AMDGPU::NoRegister)
+      return I;
+  }
+
+  return E;
+}
+
+// XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
+// repor tthe register as unavailable because a super-register with a lane mask
+// as unavailable.
+static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
+  for (MachineBasicBlock *Succ : MBB.successors()) {
+    if (Succ->isLiveIn(Reg))
+      return true;
+  }
+
+  return false;
+}
+
+bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Optimize sequences emitted for control flow lowering. They are originally
+  // emitted as the separate operations because spill code may need to be
+  // inserted for the saved copy of exec.
+  //
+  //     x = copy exec
+  //     z = s_<op>_b64 x, y
+  //     exec = copy z
+  // =>
+  //     x = s_<op>_saveexec_b64 y
+  //
+
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
+    MachineBasicBlock::reverse_iterator E = MBB.rend();
+    if (I == E)
+      continue;
+
+    unsigned CopyToExec = isCopyToExec(*I);
+    if (CopyToExec == AMDGPU::NoRegister)
+      continue;
+
+    // Scan backwards to find the def.
+    auto CopyToExecInst = &*I;
+    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
+    if (CopyFromExecInst == E)
+      continue;
+
+    if (isLiveOut(MBB, CopyToExec)) {
+      // The copied register is live out and has a second use in another block.
+      DEBUG(dbgs() << "Exec copy source register is live out\n");
+      continue;
+    }
+
+    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+    MachineInstr *SaveExecInst = nullptr;
+    SmallVector<MachineInstr *, 4> OtherUseInsts;
+
+    for (MachineBasicBlock::iterator J
+           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
+         J != JE; ++J) {
+      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
+        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
+        // Make sure this is inserted after any VALU ops that may have been
+        // scheduled in between.
+        SaveExecInst = nullptr;
+        break;
+      }
+
+      if (J->modifiesRegister(CopyToExec, TRI)) {
+        if (SaveExecInst) {
+          DEBUG(dbgs() << "Multiple instructions modify "
+                << PrintReg(CopyToExec, TRI) << '\n');
+          SaveExecInst = nullptr;
+          break;
+        }
+
+        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
+        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
+          break;
+
+        if (J->readsRegister(CopyFromExec, TRI)) {
+          SaveExecInst = &*J;
+          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
+          continue;
+        } else {
+          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
+          break;
+        }
+      }
+
+      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
+        assert(SaveExecInst != &*J);
+        OtherUseInsts.push_back(&*J);
+      }
+    }
+
+    if (!SaveExecInst)
+      continue;
+
+    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
+
+    MachineOperand &Src0 = SaveExecInst->getOperand(1);
+    MachineOperand &Src1 = SaveExecInst->getOperand(2);
+
+    MachineOperand *OtherOp = nullptr;
+
+    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
+      OtherOp = &Src1;
+    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
+      if (!SaveExecInst->isCommutable())
+        break;
+
+      OtherOp = &Src0;
+    } else
+      llvm_unreachable("unexpected");
+
+    CopyFromExecInst->eraseFromParent();
+
+    auto InsPt = SaveExecInst->getIterator();
+    const DebugLoc &DL = SaveExecInst->getDebugLoc();
+
+    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
+            CopyFromExec)
+      .addReg(OtherOp->getReg());
+    SaveExecInst->eraseFromParent();
+
+    CopyToExecInst->eraseFromParent();
+
+    for (MachineInstr *OtherInst : OtherUseInsts) {
+      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
+                                    AMDGPU::NoSubRegister, *TRI);
+    }
+  }
+
+  return true;
+
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
new file mode 100644
index 000000000000..8c4b24a4504d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -0,0 +1,1476 @@
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIRegisterInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+  "amdgpu-spill-sgpr-to-smem",
+  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+  cl::init(false));
+
+
+static bool hasPressureSet(const int *PSets, unsigned PSetID) {
+  for (unsigned i = 0; PSets[i] != -1; ++i) {
+    if (PSets[i] == (int)PSetID)
+      return true;
+  }
+  return false;
+}
+
+void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
+                                         BitVector &PressureSets) const {
+  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
+    const int *PSets = getRegUnitPressureSets(*U);
+    if (hasPressureSet(PSets, PSetID)) {
+      PressureSets.set(PSetID);
+      break;
+    }
+  }
+}
+
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
+                                   SGPRPressureSets(getNumRegPressureSets()),
+                                   VGPRPressureSets(getNumRegPressureSets()) {
+  unsigned NumRegPressureSets = getNumRegPressureSets();
+
+  SGPRSetID = NumRegPressureSets;
+  VGPRSetID = NumRegPressureSets;
+
+  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
+    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
+    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
+  }
+
+  // Determine the number of reg units for each pressure set.
+  std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
+  for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
+    const int *PSets = getRegUnitPressureSets(i);
+    for (unsigned j = 0; PSets[j] != -1; ++j) {
+      ++PressureSetRegUnits[PSets[j]];
+    }
+  }
+
+  unsigned VGPRMax = 0, SGPRMax = 0;
+  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
+    if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
+      VGPRSetID = i;
+      VGPRMax = PressureSetRegUnits[i];
+      continue;
+    }
+    if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
+      SGPRSetID = i;
+      SGPRMax = PressureSetRegUnits[i];
+    }
+  }
+
+  assert(SGPRSetID < NumRegPressureSets &&
+         VGPRSetID < NumRegPressureSets);
+}
+
+void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
+  MCRegAliasIterator R(Reg, this, true);
+
+  for (; R.isValid(); ++R)
+    Reserved.set(*R);
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+  const MachineFunction &MF) const {
+  unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+  const MachineFunction &MF) const {
+  unsigned RegCount = getMaxNumSGPRs(MF);
+  unsigned Reg;
+
+  // Try to place it in a hole after PrivateSegmentbufferReg.
+  if (RegCount & 3) {
+    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+    // alignment constraints, so we have a hole where can put the wave offset.
+    Reg = RegCount - 1;
+  } else {
+    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+    // wave offset before it.
+    Reg = RegCount - 5;
+  }
+  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
+}
+
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+
+  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
+  // this seems likely to result in bugs, so I'm marking them as reserved.
+  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
+  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
+
+  // Reserve Trap Handler registers - support is not implemented in Codegen.
+  reserveRegisterTuples(Reserved, AMDGPU::TBA);
+  reserveRegisterTuples(Reserved, AMDGPU::TMA);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
+  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
+
+  unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
+  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
+  unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
+  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+  }
+
+  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+  if (ScratchRSrcReg != AMDGPU::NoRegister) {
+    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+    // to spill.
+    // TODO: May need to reserve a VGPR if doing LDS spilling.
+    reserveRegisterTuples(Reserved, ScratchRSrcReg);
+    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
+  }
+
+  return Reserved;
+}
+
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+  return Fn.getFrameInfo().hasStackObjects();
+}
+
+bool
+SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return MF.getFrameInfo().hasStackObjects();
+}
+
+bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
+  const MachineFunction &MF) const {
+  // m0 is needed for the scalar store offset. m0 is unallocatable, so we can't
+  // create a virtual register for it during frame index elimination, so the
+  // scavenger is directly needed.
+  return MF.getFrameInfo().hasStackObjects() &&
+         MF.getSubtarget<SISubtarget>().hasScalarStores() &&
+         MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+}
+
+bool SIRegisterInfo::requiresVirtualBaseRegisters(
+  const MachineFunction &) const {
+  // There are no special dedicated stack or frame pointers.
+  return true;
+}
+
+bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  // This helps catch bugs as verifier errors.
+  return true;
+}
+
+int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
+  assert(SIInstrInfo::isMUBUF(*MI));
+
+  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                          AMDGPU::OpName::offset);
+  return MI->getOperand(OffIdx).getImm();
+}
+
+int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
+                                                 int Idx) const {
+  if (!SIInstrInfo::isMUBUF(*MI))
+    return 0;
+
+  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::vaddr) &&
+         "Should never see frame index on non-address operand");
+
+  return getMUBUFInstrOffset(MI);
+}
+
+bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  if (!MI->mayLoadOrStore())
+    return false;
+
+  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+
+  return !isUInt<12>(FullOffset);
+}
+
+void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                  unsigned BaseReg,
+                                                  int FrameIdx,
+                                                  int64_t Offset) const {
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+  if (Offset == 0) {
+    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+      .addFrameIndex(FrameIdx);
+    return;
+  }
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+    .addImm(Offset);
+  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+    .addFrameIndex(FrameIdx);
+
+  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
+    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+    .addReg(OffsetReg, RegState::Kill)
+    .addReg(FIReg);
+}
+
+void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                       int64_t Offset) const {
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction *MF = MBB->getParent();
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = Subtarget.getInstrInfo();
+
+#ifndef NDEBUG
+  // FIXME: Is it possible to be storing a frame index to itself?
+  bool SeenFI = false;
+  for (const MachineOperand &MO: MI.operands()) {
+    if (MO.isFI()) {
+      if (SeenFI)
+        llvm_unreachable("should not see multiple frame indices");
+
+      SeenFI = true;
+    }
+  }
+#endif
+
+  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
+
+  assert(TII->isMUBUF(MI));
+
+  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+  int64_t NewOffset = OffsetOp->getImm() + Offset;
+  assert(isUInt<12>(NewOffset) && "offset should be legal");
+
+  FIOp->ChangeToRegister(BaseReg, false);
+  OffsetOp->setImm(NewOffset);
+}
+
+bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                        unsigned BaseReg,
+                                        int64_t Offset) const {
+  if (!SIInstrInfo::isMUBUF(*MI))
+    return false;
+
+  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+
+  return isUInt<12>(NewOffset);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
+  const MachineFunction &MF, unsigned Kind) const {
+  // This is inaccurate. It depends on the instruction and address space. The
+  // only place where we should hit this is for dealing with frame indexes /
+  // private accesses, so this is correct in that case.
+  return &AMDGPU::VGPR_32RegClass;
+}
+
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+
+  switch (Op) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_RESTORE:
+    return 16;
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
+    return 8;
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
+    return 4;
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_RESTORE:
+    return 3;
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
+    return 2;
+  case AMDGPU::SI_SPILL_S32_SAVE:
+  case AMDGPU::SI_SPILL_S32_RESTORE:
+  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+    return 1;
+  default: llvm_unreachable("Invalid spill opcode");
+  }
+}
+
+static int getOffsetMUBUFStore(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
+    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
+  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
+    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
+  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
+    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
+  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
+    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
+  default:
+    return -1;
+  }
+}
+
+static int getOffsetMUBUFLoad(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
+    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
+  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
+    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
+  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
+    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
+  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
+    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
+  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
+    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
+  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
+    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
+  default:
+    return -1;
+  }
+}
+
+// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
+// need to handle the case where an SGPR may need to be spilled while spilling.
+static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+                                      MachineFrameInfo &MFI,
+                                      MachineBasicBlock::iterator MI,
+                                      int Index,
+                                      int64_t Offset) {
+  MachineBasicBlock *MBB = MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+  bool IsStore = MI->mayStore();
+
+  unsigned Opc = MI->getOpcode();
+  int LoadStoreOp = IsStore ?
+    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
+  if (LoadStoreOp == -1)
+    return false;
+
+  unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();
+
+  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
+    .addReg(Reg, getDefRegState(!IsStore))
+    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+    .addImm(Offset)
+    .addImm(0) // glc
+    .addImm(0) // slc
+    .addImm(0) // tfe
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  return true;
+}
+
+void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
+                                         unsigned LoadStoreOp,
+                                         int Index,
+                                         unsigned ValueReg,
+                                         bool IsKill,
+                                         unsigned ScratchRsrcReg,
+                                         unsigned ScratchOffsetReg,
+                                         int64_t InstOffset,
+                                         MachineMemOperand *MMO,
+                                         RegScavenger *RS) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MI->getParent()->getParent();
+  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+
+  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+  const DebugLoc &DL = MI->getDebugLoc();
+  bool IsStore = Desc.mayStore();
+
+  bool RanOutOfSGPRs = false;
+  bool Scavenged = false;
+  unsigned SOffset = ScratchOffsetReg;
+
+  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+  unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+  unsigned Size = NumSubRegs * 4;
+  int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+  const int64_t OriginalImmOffset = Offset;
+
+  unsigned Align = MFI.getObjectAlignment(Index);
+  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+
+  if (!isUInt<12>(Offset + Size)) {
+    SOffset = AMDGPU::NoRegister;
+
+    // We don't have access to the register scavenger if this function is called
+    // during  PEI::scavengeFrameVirtualRegs().
+    if (RS)
+      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+
+    if (SOffset == AMDGPU::NoRegister) {
+      // There are no free SGPRs, and since we are in the process of spilling
+      // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
+      // on SI/CI and on VI it is true until we implement spilling using scalar
+      // stores), we have no way to free up an SGPR.  Our solution here is to
+      // add the offset directly to the ScratchOffset register, and then
+      // subtract the offset after the spill to return ScratchOffset to it's
+      // original value.
+      RanOutOfSGPRs = true;
+      SOffset = ScratchOffsetReg;
+    } else {
+      Scavenged = true;
+    }
+
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+      .addReg(ScratchOffsetReg)
+      .addImm(Offset);
+
+    Offset = 0;
+  }
+
+  const unsigned EltSize = 4;
+
+  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
+    unsigned SubReg = NumSubRegs == 1 ?
+      ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+
+    unsigned SOffsetRegState = 0;
+    unsigned SrcDstRegState = getDefRegState(!IsStore);
+    if (i + 1 == e) {
+      SOffsetRegState |= getKillRegState(Scavenged);
+      // The last implicit use carries the "Kill" flag.
+      SrcDstRegState |= getKillRegState(IsKill);
+    }
+
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+    MachineMemOperand *NewMMO
+      = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+                                 EltSize, MinAlign(Align, EltSize * i));
+
+    auto MIB = BuildMI(*MBB, MI, DL, Desc)
+      .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
+      .addReg(ScratchRsrcReg)
+      .addReg(SOffset, SOffsetRegState)
+      .addImm(Offset)
+      .addImm(0) // glc
+      .addImm(0) // slc
+      .addImm(0) // tfe
+      .addMemOperand(NewMMO);
+
+    if (NumSubRegs > 1)
+      MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+  }
+
+  if (RanOutOfSGPRs) {
+    // Subtract the offset we added to the ScratchOffset register.
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
+      .addReg(ScratchOffsetReg)
+      .addImm(OriginalImmOffset);
+  }
+}
+
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0) {
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+  }
+
+  if (SuperRegSize % 8 == 0) {
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+  }
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
+}
+
+void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+                               int Index,
+                               RegScavenger *RS) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  unsigned SuperReg = MI->getOperand(0).getReg();
+  bool IsKill = MI->getOperand(0).isKill();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+
+  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+  unsigned OffsetReg = AMDGPU::M0;
+  unsigned M0CopyReg = AMDGPU::NoRegister;
+
+  if (SpillToSMEM) {
+    if (RS->isRegUsed(AMDGPU::M0)) {
+      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+        .addReg(AMDGPU::M0);
+    }
+  }
+
+  unsigned ScalarStoreOp;
+  unsigned EltSize = 4;
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+  // SubReg carries the "Kill" flag when SubReg == SuperReg.
+  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+    unsigned SubReg = NumSubRegs == 1 ?
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+    if (SpillToSMEM) {
+      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+
+      unsigned Align = FrameInfo.getObjectAlignment(Index);
+      MachinePointerInfo PtrInfo
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+      MachineMemOperand *MMO
+        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+                                   EltSize, MinAlign(Align, EltSize * i));
+
+      // SMEM instructions only support a single offset, so increment the wave
+      // offset.
+
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+      if (Offset != 0) {
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+          .addReg(MFI->getScratchWaveOffsetReg())
+          .addImm(Offset);
+      } else {
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+          .addReg(MFI->getScratchWaveOffsetReg());
+      }
+
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
+        .addReg(SubReg, getKillRegState(IsKill)) // sdata
+        .addReg(MFI->getScratchRSrcReg())        // sbase
+        .addReg(OffsetReg, RegState::Kill)       // soff
+        .addImm(0)                               // glc
+        .addMemOperand(MMO);
+
+      continue;
+    }
+
+    struct SIMachineFunctionInfo::SpilledReg Spill =
+      MFI->getSpilledReg(MF, Index, i);
+    if (Spill.hasReg()) {
+      BuildMI(*MBB, MI, DL,
+              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+              Spill.VGPR)
+        .addReg(SubReg, getKillRegState(IsKill))
+        .addImm(Spill.Lane);
+
+      // FIXME: Since this spills to another register instead of an actual
+      // frame index, we should delete the frame index when all references to
+      // it are fixed.
+    } else {
+      // Spill SGPR to a frame index.
+      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+
+      MachineInstrBuilder Mov
+        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+        .addReg(SubReg, SubKillState);
+
+
+      // There could be undef components of a spilled super register.
+      // TODO: Can we detect this and skip the spill?
+      if (NumSubRegs > 1) {
+        // The last implicit use of the SuperReg carries the "Kill" flag.
+        unsigned SuperKillState = 0;
+        if (i + 1 == e)
+          SuperKillState |= getKillRegState(IsKill);
+        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+      }
+
+      unsigned Align = FrameInfo.getObjectAlignment(Index);
+      MachinePointerInfo PtrInfo
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+      MachineMemOperand *MMO
+        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+                                   EltSize, MinAlign(Align, EltSize * i));
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+        .addReg(TmpReg, RegState::Kill)         // src
+        .addFrameIndex(Index)                   // vaddr
+        .addReg(MFI->getScratchRSrcReg())       // srrsrc
+        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+        .addImm(i * 4)                          // offset
+        .addMemOperand(MMO);
+    }
+  }
+
+  if (M0CopyReg != AMDGPU::NoRegister) {
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+      .addReg(M0CopyReg, RegState::Kill);
+  }
+
+  MI->eraseFromParent();
+  MFI->addToSpilledSGPRs(NumSubRegs);
+}
+
+void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+                                 int Index,
+                                 RegScavenger *RS) const {
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock *MBB = MI->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  unsigned SuperReg = MI->getOperand(0).getReg();
+  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+
+  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+
+  unsigned OffsetReg = AMDGPU::M0;
+  unsigned M0CopyReg = AMDGPU::NoRegister;
+
+  if (SpillToSMEM) {
+    if (RS->isRegUsed(AMDGPU::M0)) {
+      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
+        .addReg(AMDGPU::M0);
+    }
+  }
+
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+
+  // SubReg carries the "Kill" flag when SubReg == SuperReg.
+  int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+    unsigned SubReg = NumSubRegs == 1 ?
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+    if (SpillToSMEM) {
+      // FIXME: Size may be > 4 but extra bytes wasted.
+      unsigned Align = FrameInfo.getObjectAlignment(Index);
+      MachinePointerInfo PtrInfo
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+      MachineMemOperand *MMO
+        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+                                   EltSize, MinAlign(Align, EltSize * i));
+
+      // Add i * 4 offset
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
+      if (Offset != 0) {
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
+          .addReg(MFI->getScratchWaveOffsetReg())
+          .addImm(Offset);
+      } else {
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
+          .addReg(MFI->getScratchWaveOffsetReg());
+      }
+
+      auto MIB =
+        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
+        .addReg(MFI->getScratchRSrcReg()) // sbase
+        .addReg(OffsetReg, RegState::Kill)                // soff
+        .addImm(0)                        // glc
+        .addMemOperand(MMO);
+
+      if (NumSubRegs > 1)
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
+
+      continue;
+    }
+
+    SIMachineFunctionInfo::SpilledReg Spill
+      = MFI->getSpilledReg(MF, Index, i);
+
+    if (Spill.hasReg()) {
+      auto MIB =
+        BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+                SubReg)
+        .addReg(Spill.VGPR)
+        .addImm(Spill.Lane);
+
+      if (NumSubRegs > 1)
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
+    } else {
+      // Restore SGPR from a stack slot.
+      // FIXME: We should use S_LOAD_DWORD here for VI.
+      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned Align = FrameInfo.getObjectAlignment(Index);
+
+      MachinePointerInfo PtrInfo
+        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
+
+      MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
+        MachineMemOperand::MOLoad, EltSize,
+        MinAlign(Align, EltSize * i));
+
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+        .addFrameIndex(Index)                   // vaddr
+        .addReg(MFI->getScratchRSrcReg())       // srsrc
+        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
+        .addImm(i * 4)                          // offset
+        .addMemOperand(MMO);
+
+      auto MIB =
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+        .addReg(TmpReg, RegState::Kill);
+
+      if (NumSubRegs > 1)
+        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+    }
+  }
+
+  if (M0CopyReg != AMDGPU::NoRegister) {
+    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
+      .addReg(M0CopyReg, RegState::Kill);
+  }
+
+  MI->eraseFromParent();
+}
+
+void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock *MBB = MI->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  int Index = MI->getOperand(FIOperandNum).getIndex();
+
+  switch (MI->getOpcode()) {
+    // SGPR register spill
+    case AMDGPU::SI_SPILL_S512_SAVE:
+    case AMDGPU::SI_SPILL_S256_SAVE:
+    case AMDGPU::SI_SPILL_S128_SAVE:
+    case AMDGPU::SI_SPILL_S64_SAVE:
+    case AMDGPU::SI_SPILL_S32_SAVE: {
+      spillSGPR(MI, Index, RS);
+      break;
+    }
+
+    // SGPR register restore
+    case AMDGPU::SI_SPILL_S512_RESTORE:
+    case AMDGPU::SI_SPILL_S256_RESTORE:
+    case AMDGPU::SI_SPILL_S128_RESTORE:
+    case AMDGPU::SI_SPILL_S64_RESTORE:
+    case AMDGPU::SI_SPILL_S32_RESTORE: {
+      restoreSGPR(MI, Index, RS);
+      break;
+    }
+
+    // VGPR register spill
+    case AMDGPU::SI_SPILL_V512_SAVE:
+    case AMDGPU::SI_SPILL_V256_SAVE:
+    case AMDGPU::SI_SPILL_V128_SAVE:
+    case AMDGPU::SI_SPILL_V96_SAVE:
+    case AMDGPU::SI_SPILL_V64_SAVE:
+    case AMDGPU::SI_SPILL_V32_SAVE: {
+      const MachineOperand *VData = TII->getNamedOperand(*MI,
+                                                         AMDGPU::OpName::vdata);
+      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
+            Index,
+            VData->getReg(), VData->isKill(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+            *MI->memoperands_begin(),
+            RS);
+      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+      MI->eraseFromParent();
+      break;
+    }
+    case AMDGPU::SI_SPILL_V32_RESTORE:
+    case AMDGPU::SI_SPILL_V64_RESTORE:
+    case AMDGPU::SI_SPILL_V96_RESTORE:
+    case AMDGPU::SI_SPILL_V128_RESTORE:
+    case AMDGPU::SI_SPILL_V256_RESTORE:
+    case AMDGPU::SI_SPILL_V512_RESTORE: {
+      const MachineOperand *VData = TII->getNamedOperand(*MI,
+                                                         AMDGPU::OpName::vdata);
+
+      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+            Index,
+            VData->getReg(), VData->isKill(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
+            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+            *MI->memoperands_begin(),
+            RS);
+      MI->eraseFromParent();
+      break;
+    }
+
+    default: {
+      if (TII->isMUBUF(*MI)) {
+        // Disable offen so we don't need a 0 vgpr base.
+        assert(static_cast<int>(FIOperandNum) ==
+               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                          AMDGPU::OpName::vaddr));
+
+        int64_t Offset = FrameInfo.getObjectOffset(Index);
+        int64_t OldImm
+          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
+        int64_t NewOffset = OldImm + Offset;
+
+        if (isUInt<12>(NewOffset) &&
+            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+          MI->eraseFromParent();
+          break;
+        }
+      }
+
+      int64_t Offset = FrameInfo.getObjectOffset(Index);
+      FIOp.ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*MBB, MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+                .addImm(Offset);
+        FIOp.ChangeToRegister(TmpReg, false, false, true);
+      }
+    }
+  }
+}
+
+// FIXME: This is very slow. It might be worth creating a map from physreg to
+// register class.
+const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  static const TargetRegisterClass *const BaseClasses[] = {
+    &AMDGPU::VGPR_32RegClass,
+    &AMDGPU::SReg_32RegClass,
+    &AMDGPU::VReg_64RegClass,
+    &AMDGPU::SReg_64RegClass,
+    &AMDGPU::VReg_96RegClass,
+    &AMDGPU::VReg_128RegClass,
+    &AMDGPU::SReg_128RegClass,
+    &AMDGPU::VReg_256RegClass,
+    &AMDGPU::SReg_256RegClass,
+    &AMDGPU::VReg_512RegClass,
+    &AMDGPU::SReg_512RegClass,
+    &AMDGPU::SCC_CLASSRegClass,
+  };
+
+  for (const TargetRegisterClass *BaseClass : BaseClasses) {
+    if (BaseClass->contains(Reg)) {
+      return BaseClass;
+    }
+  }
+  return nullptr;
+}
+
+// TODO: It might be helpful to have some target specific flags in
+// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+  switch (RC->getSize()) {
+  case 0: return false;
+  case 1: return false;
+  case 4:
+    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+  case 8:
+    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+  case 12:
+    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+  case 16:
+    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+  case 32:
+    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+  case 64:
+    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+                                         const TargetRegisterClass *SRC) const {
+  switch (SRC->getSize()) {
+  case 4:
+    return &AMDGPU::VGPR_32RegClass;
+  case 8:
+    return &AMDGPU::VReg_64RegClass;
+  case 12:
+    return &AMDGPU::VReg_96RegClass;
+  case 16:
+    return &AMDGPU::VReg_128RegClass;
+  case 32:
+    return &AMDGPU::VReg_256RegClass;
+  case 64:
+    return &AMDGPU::VReg_512RegClass;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+                                         const TargetRegisterClass *VRC) const {
+  switch (VRC->getSize()) {
+  case 4:
+    return &AMDGPU::SGPR_32RegClass;
+  case 8:
+    return &AMDGPU::SReg_64RegClass;
+  case 16:
+    return &AMDGPU::SReg_128RegClass;
+  case 32:
+    return &AMDGPU::SReg_256RegClass;
+  case 64:
+    return &AMDGPU::SReg_512RegClass;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+}
+
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+                         const TargetRegisterClass *RC, unsigned SubIdx) const {
+  if (SubIdx == AMDGPU::NoSubRegister)
+    return RC;
+
+  // We can assume that each lane corresponds to one 32-bit register.
+  LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger();
+  unsigned Count = countPopulation(Mask);
+  if (isSGPRClass(RC)) {
+    switch (Count) {
+    case 1:
+      return &AMDGPU::SGPR_32RegClass;
+    case 2:
+      return &AMDGPU::SReg_64RegClass;
+    case 4:
+      return &AMDGPU::SReg_128RegClass;
+    case 8:
+      return &AMDGPU::SReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
+  } else {
+    switch (Count) {
+    case 1:
+      return &AMDGPU::VGPR_32RegClass;
+    case 2:
+      return &AMDGPU::VReg_64RegClass;
+    case 3:
+      return &AMDGPU::VReg_96RegClass;
+    case 4:
+      return &AMDGPU::VReg_128RegClass;
+    case 8:
+      return &AMDGPU::VReg_256RegClass;
+    case 16: /* fall-through */
+    default:
+      llvm_unreachable("Invalid sub-register class size");
+    }
+  }
+}
+
+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want to
+  // stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  //  => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
+unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
+                                           enum PreloadedValue Value) const {
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  (void)ST;
+  switch (Value) {
+  case SIRegisterInfo::WORKGROUP_ID_X:
+    assert(MFI->hasWorkGroupIDX());
+    return MFI->WorkGroupIDXSystemSGPR;
+  case SIRegisterInfo::WORKGROUP_ID_Y:
+    assert(MFI->hasWorkGroupIDY());
+    return MFI->WorkGroupIDYSystemSGPR;
+  case SIRegisterInfo::WORKGROUP_ID_Z:
+    assert(MFI->hasWorkGroupIDZ());
+    return MFI->WorkGroupIDZSystemSGPR;
+  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
+  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
+    assert(ST.isAmdCodeObjectV2() &&
+           "Non-CodeObjectV2 ABI currently uses relocations");
+    assert(MFI->hasPrivateSegmentBuffer());
+    return MFI->PrivateSegmentBufferUserSGPR;
+  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+    assert(MFI->hasKernargSegmentPtr());
+    return MFI->KernargSegmentPtrUserSGPR;
+  case SIRegisterInfo::DISPATCH_ID:
+    assert(MFI->hasDispatchID());
+    return MFI->DispatchIDUserSGPR;
+  case SIRegisterInfo::FLAT_SCRATCH_INIT:
+    assert(MFI->hasFlatScratchInit());
+    return MFI->FlatScratchInitUserSGPR;
+  case SIRegisterInfo::DISPATCH_PTR:
+    assert(MFI->hasDispatchPtr());
+    return MFI->DispatchPtrUserSGPR;
+  case SIRegisterInfo::QUEUE_PTR:
+    assert(MFI->hasQueuePtr());
+    return MFI->QueuePtrUserSGPR;
+  case SIRegisterInfo::WORKITEM_ID_X:
+    assert(MFI->hasWorkItemIDX());
+    return AMDGPU::VGPR0;
+  case SIRegisterInfo::WORKITEM_ID_Y:
+    assert(MFI->hasWorkItemIDY());
+    return AMDGPU::VGPR1;
+  case SIRegisterInfo::WORKITEM_ID_Z:
+    assert(MFI->hasWorkItemIDZ());
+    return AMDGPU::VGPR2;
+  }
+  llvm_unreachable("unexpected preloaded value type");
+}
+
+/// \brief Returns a register that is not used at any point in the function.
+///        If all registers are used, then this function will return
+//         AMDGPU::NoRegister.
+unsigned
+SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+                                   const TargetRegisterClass *RC,
+                                   const MachineFunction &MF) const {
+
+  for (unsigned Reg : *RC)
+    if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
+      return Reg;
+  return AMDGPU::NoRegister;
+}
+
+unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 800;
+  return 512;
+}
+
+unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 102;
+  return 104;
+}
+
+unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
+                                             const SIMachineFunctionInfo &MFI) const {
+  if (MFI.hasFlatScratchInit()) {
+    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
+
+    if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+      return 4; // FLAT_SCRATCH, VCC (in that order)
+  }
+
+  if (ST.isXNACKEnabled())
+    return 4; // XNACK, VCC (in that order)
+
+  return 2; // VCC.
+}
+
+unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
+                                        unsigned WavesPerEU) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    switch (WavesPerEU) {
+      case 0:  return 0;
+      case 10: return 0;
+      case 9:  return 0;
+      case 8:  return 81;
+      default: return 97;
+    }
+  } else {
+    switch (WavesPerEU) {
+      case 0:  return 0;
+      case 10: return 0;
+      case 9:  return 49;
+      case 8:  return 57;
+      case 7:  return 65;
+      case 6:  return 73;
+      case 5:  return 81;
+      default: return 97;
+    }
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
+                                        unsigned WavesPerEU,
+                                        bool Addressable) const {
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    switch (WavesPerEU) {
+      case 0:  return 80;
+      case 10: return 80;
+      case 9:  return 80;
+      case 8:  return 96;
+      default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
+    }
+  } else {
+    switch (WavesPerEU) {
+      case 0:  return 48;
+      case 10: return 48;
+      case 9:  return 56;
+      case 8:  return 64;
+      case 7:  return 72;
+      case 6:  return 80;
+      case 5:  return 96;
+      default: return getNumAddressableSGPRs(ST);
+    }
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
+  const Function &F = *MF.getFunction();
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // Compute maximum number of SGPRs function can use using default/requested
+  // minimum number of waves per execution unit.
+  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+  unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
+  unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
+
+  // Check if maximum number of SGPRs was explicitly requested using
+  // "amdgpu-num-sgpr" attribute.
+  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+    unsigned Requested = AMDGPU::getIntegerAttribute(
+      F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+    // Make sure requested value does not violate subtarget's specifications.
+    if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
+      Requested = 0;
+
+    // If more SGPRs are required to support the input user/system SGPRs,
+    // increase to accommodate them.
+    //
+    // FIXME: This really ends up using the requested number of SGPRs + number
+    // of reserved special registers in total. Theoretically you could re-use
+    // the last input registers for these special registers, but this would
+    // require a lot of complexity to deal with the weird aliasing.
+    unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+    if (Requested && Requested < NumInputSGPRs)
+      Requested = NumInputSGPRs;
+
+    // Make sure requested value is compatible with values implied by
+    // default/requested minimum/maximum number of waves per execution unit.
+    if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
+      Requested = 0;
+    if (WavesPerEU.second &&
+        Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
+      Requested = 0;
+
+    if (Requested)
+      MaxNumSGPRs = Requested;
+  }
+
+  if (ST.hasSGPRInitBug())
+    MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
+                  MaxNumAddressableSGPRs);
+}
+
+unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
+  const SISubtarget &ST) const {
+  if (ST.debuggerReserveRegs())
+    return 4;
+  return 0;
+}
+
+unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
+  switch (WavesPerEU) {
+    case 0:  return 0;
+    case 10: return 0;
+    case 9:  return 25;
+    case 8:  return 29;
+    case 7:  return 33;
+    case 6:  return 37;
+    case 5:  return 41;
+    case 4:  return 49;
+    case 3:  return 65;
+    case 2:  return 85;
+    default: return 129;
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
+  switch (WavesPerEU) {
+    case 0:  return 24;
+    case 10: return 24;
+    case 9:  return 28;
+    case 8:  return 32;
+    case 7:  return 36;
+    case 6:  return 40;
+    case 5:  return 48;
+    case 4:  return 64;
+    case 3:  return 84;
+    case 2:  return 128;
+    default: return getTotalNumVGPRs();
+  }
+}
+
+unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
+  const Function &F = *MF.getFunction();
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // Compute maximum number of VGPRs function can use using default/requested
+  // minimum number of waves per execution unit.
+  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+
+  // Check if maximum number of VGPRs was explicitly requested using
+  // "amdgpu-num-vgpr" attribute.
+  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
+    unsigned Requested = AMDGPU::getIntegerAttribute(
+      F, "amdgpu-num-vgpr", MaxNumVGPRs);
+
+    // Make sure requested value does not violate subtarget's specifications.
+    if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
+      Requested = 0;
+
+    // Make sure requested value is compatible with values implied by
+    // default/requested minimum/maximum number of waves per execution unit.
+    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+      Requested = 0;
+    if (WavesPerEU.second &&
+        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+      Requested = 0;
+
+    if (Requested)
+      MaxNumVGPRs = Requested;
+  }
+
+  return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
+}
+
+ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
+                                                   unsigned EltSize) const {
+  if (EltSize == 4) {
+    static const int16_t Sub0_15[] = {
+      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+      AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+      AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+    };
+
+    static const int16_t Sub0_7[] = {
+      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+      AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    };
+
+    static const int16_t Sub0_3[] = {
+      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    };
+
+    static const int16_t Sub0_2[] = {
+      AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
+    };
+
+    static const int16_t Sub0_1[] = {
+      AMDGPU::sub0, AMDGPU::sub1,
+    };
+
+    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+    case 32:
+      return {};
+    case 64:
+      return makeArrayRef(Sub0_1);
+    case 96:
+      return makeArrayRef(Sub0_2);
+    case 128:
+      return makeArrayRef(Sub0_3);
+    case 256:
+      return makeArrayRef(Sub0_7);
+    case 512:
+      return makeArrayRef(Sub0_15);
+    default:
+      llvm_unreachable("unhandled register size");
+    }
+  }
+
+  if (EltSize == 8) {
+    static const int16_t Sub0_15_64[] = {
+      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+      AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+      AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
+    };
+
+    static const int16_t Sub0_7_64[] = {
+      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+      AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
+    };
+
+
+    static const int16_t Sub0_3_64[] = {
+      AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
+    };
+
+    switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+    case 64:
+      return {};
+    case 128:
+      return makeArrayRef(Sub0_3_64);
+    case 256:
+      return makeArrayRef(Sub0_7_64);
+    case 512:
+      return makeArrayRef(Sub0_15_64);
+    default:
+      llvm_unreachable("unhandled register size");
+    }
+  }
+
+  assert(EltSize == 16 && "unhandled register spill split size");
+
+  static const int16_t Sub0_15_128[] = {
+    AMDGPU::sub0_sub1_sub2_sub3,
+    AMDGPU::sub4_sub5_sub6_sub7,
+    AMDGPU::sub8_sub9_sub10_sub11,
+    AMDGPU::sub12_sub13_sub14_sub15
+  };
+
+  static const int16_t Sub0_7_128[] = {
+    AMDGPU::sub0_sub1_sub2_sub3,
+    AMDGPU::sub4_sub5_sub6_sub7
+  };
+
+  switch (AMDGPU::getRegBitWidth(*RC->MC)) {
+  case 128:
+    return {};
+  case 256:
+    return makeArrayRef(Sub0_7_128);
+  case 512:
+    return makeArrayRef(Sub0_15_128);
+  default:
+    llvm_unreachable("unhandled register size");
+  }
+}
+
+const TargetRegisterClass*
+SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
+                                  unsigned Reg) const {
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return  MRI.getRegClass(Reg);
+
+  return getPhysRegClass(Reg);
+}
+
+bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
+                            unsigned Reg) const {
+  return hasVGPRs(getRegClassForReg(MRI, Reg));
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
new file mode 100644
index 000000000000..0bcae7d9840c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -0,0 +1,282 @@
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIRegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+
+#include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+class SISubtarget;
+class MachineRegisterInfo;
+class SIMachineFunctionInfo;
+
+class SIRegisterInfo final : public AMDGPURegisterInfo {
+private:
+  unsigned SGPRSetID;
+  unsigned VGPRSetID;
+  BitVector SGPRPressureSets;
+  BitVector VGPRPressureSets;
+
+  void reserveRegisterTuples(BitVector &, unsigned Reg) const;
+  void classifyPressureSet(unsigned PSetID, unsigned Reg,
+                           BitVector &PressureSets) const;
+
+public:
+  SIRegisterInfo();
+
+  /// Return the end register initially reserved for the scratch buffer in case
+  /// spilling is needed.
+  unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+  /// Return the end register initially reserved for the scratch wave offset in
+  /// case spilling is needed.
+  unsigned reservedPrivateSegmentWaveByteOffsetReg(
+    const MachineFunction &MF) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+  bool requiresFrameIndexReplacementScavenging(
+    const MachineFunction &MF) const override;
+  bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+
+  int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+                                   int Idx) const override;
+
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const override;
+
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+
+  bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                          int64_t Offset) const override;
+
+  const TargetRegisterClass *getPointerRegClass(
+    const MachineFunction &MF, unsigned Kind = 0) const override;
+
+  void spillSGPR(MachineBasicBlock::iterator MI,
+                 int FI, RegScavenger *RS) const;
+
+  void restoreSGPR(MachineBasicBlock::iterator MI,
+                   int FI, RegScavenger *RS) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+
+  unsigned getHWRegIndex(unsigned Reg) const {
+    return getEncodingValue(Reg) & 0xff;
+  }
+
+  /// \brief Return the 'base' register class for this register.
+  /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
+  const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+  /// \returns true if this class contains only SGPR registers
+  bool isSGPRClass(const TargetRegisterClass *RC) const {
+    return !hasVGPRs(RC);
+  }
+
+  /// \returns true if this class ID contains only SGPR registers
+  bool isSGPRClassID(unsigned RCID) const {
+    return isSGPRClass(getRegClass(RCID));
+  }
+
+  bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    const TargetRegisterClass *RC;
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      RC = MRI.getRegClass(Reg);
+    else
+      RC = getPhysRegClass(Reg);
+    return isSGPRClass(RC);
+  }
+
+  /// \returns true if this class contains VGPR registers.
+  bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+  /// \returns A VGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentVGPRClass(
+                                          const TargetRegisterClass *SRC) const;
+
+  /// \returns A SGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentSGPRClass(
+                                           const TargetRegisterClass *VRC) const;
+
+  /// \returns The register class that is used for a sub-register of \p RC for
+  /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
+  /// be returned.
+  const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+                                            unsigned SubIdx) const;
+
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
+  /// \returns True if operands defined with this operand type can accept
+  /// a literal constant (i.e. any 32-bit immediate).
+  bool opCanUseLiteralConstant(unsigned OpType) const {
+    // TODO: 64-bit operands have extending behavior from 32-bit literal.
+    return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
+           OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
+  }
+
+  /// \returns True if operands defined with this operand type can accept
+  /// an inline constant. i.e. An integer value in the range (-16, 64) or
+  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
+  bool opCanUseInlineConstant(unsigned OpType) const {
+    return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+           OpType <= AMDGPU::OPERAND_SRC_LAST;
+  }
+
+  enum PreloadedValue {
+    // SGPRS:
+    PRIVATE_SEGMENT_BUFFER = 0,
+    DISPATCH_PTR        =  1,
+    QUEUE_PTR           =  2,
+    KERNARG_SEGMENT_PTR =  3,
+    DISPATCH_ID         =  4,
+    FLAT_SCRATCH_INIT   =  5,
+    WORKGROUP_ID_X      = 10,
+    WORKGROUP_ID_Y      = 11,
+    WORKGROUP_ID_Z      = 12,
+    PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+    // VGPRS:
+    FIRST_VGPR_VALUE    = 15,
+    WORKITEM_ID_X       = FIRST_VGPR_VALUE,
+    WORKITEM_ID_Y       = 16,
+    WORKITEM_ID_Z       = 17
+  };
+
+  /// \brief Returns the physical register that \p Value is stored in.
+  unsigned getPreloadedValue(const MachineFunction &MF,
+                             enum PreloadedValue Value) const;
+
+  unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
+                              const TargetRegisterClass *RC,
+                              const MachineFunction &MF) const;
+
+  unsigned getSGPRPressureSet() const { return SGPRSetID; };
+  unsigned getVGPRPressureSet() const { return VGPRSetID; };
+
+  const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
+                                               unsigned Reg) const;
+  bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
+  bool isSGPRPressureSet(unsigned SetID) const {
+    return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID);
+  }
+  bool isVGPRPressureSet(unsigned SetID) const {
+    return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
+  }
+
+  /// \returns SGPR allocation granularity supported by the subtarget.
+  unsigned getSGPRAllocGranule() const {
+    return 8;
+  }
+
+  /// \returns Total number of SGPRs supported by the subtarget.
+  unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
+
+  /// \returns Number of addressable SGPRs supported by the subtarget.
+  unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
+
+  /// \returns Number of reserved SGPRs supported by the subtarget.
+  unsigned getNumReservedSGPRs(const SISubtarget &ST,
+                               const SIMachineFunctionInfo &MFI) const;
+
+  /// \returns Minimum number of SGPRs that meets given number of waves per
+  /// execution unit requirement for given subtarget.
+  unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
+
+  /// \returns Maximum number of SGPRs that meets given number of waves per
+  /// execution unit requirement for given subtarget.
+  unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
+                          bool Addressable) const;
+
+  /// \returns Maximum number of SGPRs that meets number of waves per execution
+  /// unit requirement for function \p MF, or number of SGPRs explicitly
+  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+  /// \returns VGPR allocation granularity supported by the subtarget.
+  unsigned getVGPRAllocGranule() const {
+    return 4;
+  }
+
+  /// \returns Total number of VGPRs supported by the subtarget.
+  unsigned getTotalNumVGPRs() const {
+    return 256;
+  }
+
+  /// \returns Number of reserved VGPRs for debugger use supported by the
+  /// subtarget.
+  unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+
+  /// \returns Minimum number of SGPRs that meets given number of waves per
+  /// execution unit requirement.
+  unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
+
+  /// \returns Maximum number of VGPRs that meets given number of waves per
+  /// execution unit requirement.
+  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
+
+  /// \returns Maximum number of VGPRs that meets number of waves per execution
+  /// unit requirement for function \p MF, or number of VGPRs explicitly
+  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+
+  ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+                                     unsigned EltSize) const;
+
+private:
+  void buildSpillLoadStore(MachineBasicBlock::iterator MI,
+                           unsigned LoadStoreOp,
+                           int Index,
+                           unsigned ValueReg,
+                           bool ValueIsKill,
+                           unsigned ScratchRsrcReg,
+                           unsigned ScratchOffsetReg,
+                           int64_t InstrOffset,
+                           MachineMemOperand *MMO,
+                           RegScavenger *RS) const;
+};
+
+} // End namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
new file mode 100644
index 000000000000..31e714b9f6b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -0,0 +1,465 @@
+//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the SI registers
+//===----------------------------------------------------------------------===//
+class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+  DwarfRegNum<[!cast<int>(HWEncoding)]> {
+  let Namespace = "AMDGPU";
+
+  // This is the not yet the complete register encoding. An additional
+  // bit is set for VGPRs.
+  let HWEncoding = regIdx;
+}
+
+// Special Registers
+def VCC_LO : SIReg<"vcc_lo", 106>;
+def VCC_HI : SIReg<"vcc_hi", 107>;
+
+// VCC for 64-bit instructions
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+          DwarfRegAlias<VCC_LO> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 106;
+}
+
+def EXEC_LO : SIReg<"exec_lo", 126>;
+def EXEC_HI : SIReg<"exec_hi", 127>;
+
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+           DwarfRegAlias<EXEC_LO> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 126;
+}
+
+def SCC : SIReg<"scc", 253>;
+def M0 : SIReg <"m0", 124>;
+
+// Trap handler registers
+def TBA_LO : SIReg<"tba_lo", 108>;
+def TBA_HI : SIReg<"tba_hi", 109>;
+
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+          DwarfRegAlias<TBA_LO> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 108;
+}
+
+def TMA_LO : SIReg<"tma_lo", 110>;
+def TMA_HI : SIReg<"tma_hi", 111>;
+
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+          DwarfRegAlias<TMA_LO> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 110;
+}
+
+def TTMP0 : SIReg <"ttmp0", 112>;
+def TTMP1 : SIReg <"ttmp1", 113>;
+def TTMP2 : SIReg <"ttmp2", 114>;
+def TTMP3 : SIReg <"ttmp3", 115>;
+def TTMP4 : SIReg <"ttmp4", 116>;
+def TTMP5 : SIReg <"ttmp5", 117>;
+def TTMP6 : SIReg <"ttmp6", 118>;
+def TTMP7 : SIReg <"ttmp7", 119>;
+def TTMP8 : SIReg <"ttmp8", 120>;
+def TTMP9 : SIReg <"ttmp9", 121>;
+def TTMP10 : SIReg <"ttmp10", 122>;
+def TTMP11 : SIReg <"ttmp11", 123>;
+
+multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
+  def _ci : SIReg<n, ci_e>;
+  def _vi : SIReg<n, vi_e>;
+  def "" : SIReg<"", 0>;
+}
+
+class FlatReg <Register lo, Register hi, bits<16> encoding> :
+    RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+    DwarfRegAlias<lo> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+}
+
+defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes.
+defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes.
+
+def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>;
+def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
+def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
+
+// SGPR registers
+foreach Index = 0-103 in {
+  def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+}
+
+// VGPR registers
+foreach Index = 0-255 in {
+  def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+    let HWEncoding{8} = 1;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Groupings using register classes and tuples
+//===----------------------------------------------------------------------===//
+
+def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
+  let CopyCost = -1;
+  let isAllocatable = 0;
+}
+
+def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
+  let CopyCost = 1;
+  let isAllocatable = 0;
+}
+
+// TODO: Do we need to set DwarfRegAlias on register tuples?
+
+// SGPR 32-bit registers
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+                            (add (sequence "SGPR%u", 0, 103))> {
+  // Give all SGPR classes higher priority than VGPR classes, because
+  // we want to spill SGPRs to VGPRs.
+  let AllocationPriority = 7;
+}
+
+// SGPR 64-bit registers
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
+                             [(add (decimate SGPR_32, 2)),
+                              (add (decimate (shl SGPR_32, 1), 2))]>;
+
+// SGPR 128-bit registers
+def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (decimate SGPR_32, 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4))]>;
+
+// SGPR 256-bit registers
+def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (decimate SGPR_32, 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4))]>;
+
+// SGPR 512-bit registers
+def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (decimate SGPR_32, 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4)),
+                               (add (decimate (shl SGPR_32, 8), 4)),
+                               (add (decimate (shl SGPR_32, 9), 4)),
+                               (add (decimate (shl SGPR_32, 10), 4)),
+                               (add (decimate (shl SGPR_32, 11), 4)),
+                               (add (decimate (shl SGPR_32, 12), 4)),
+                               (add (decimate (shl SGPR_32, 13), 4)),
+                               (add (decimate (shl SGPR_32, 14), 4)),
+                               (add (decimate (shl SGPR_32, 15), 4))]>;
+
+// Trap handler TMP 32-bit registers
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+                            (add (sequence "TTMP%u", 0, 11))> {
+  let isAllocatable = 0;
+}
+
+// Trap handler TMP 64-bit registers
+def TTMP_64Regs : RegisterTuples<[sub0, sub1],
+                             [(add (decimate TTMP_32, 2)),
+                              (add (decimate (shl TTMP_32, 1), 2))]>;
+
+// Trap handler TMP 128-bit registers
+def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (decimate TTMP_32, 4)),
+                               (add (decimate (shl TTMP_32, 1), 4)),
+                               (add (decimate (shl TTMP_32, 2), 4)),
+                               (add (decimate (shl TTMP_32, 3), 4))]>;
+
+// VGPR 32-bit registers
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+                            (add (sequence "VGPR%u", 0, 255))> {
+  let AllocationPriority = 1;
+  let Size = 32;
+}
+
+// VGPR 64-bit registers
+def VGPR_64 : RegisterTuples<[sub0, sub1],
+                             [(add (trunc VGPR_32, 255)),
+                              (add (shl VGPR_32, 1))]>;
+
+// VGPR 96-bit registers
+def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+                             [(add (trunc VGPR_32, 254)),
+                              (add (shl VGPR_32, 1)),
+                              (add (shl VGPR_32, 2))]>;
+
+// VGPR 128-bit registers
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (trunc VGPR_32, 253)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3))]>;
+
+// VGPR 256-bit registers
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (trunc VGPR_32, 249)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7))]>;
+
+// VGPR 512-bit registers
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (trunc VGPR_32, 241)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7)),
+                               (add (shl VGPR_32, 8)),
+                               (add (shl VGPR_32, 9)),
+                               (add (shl VGPR_32, 10)),
+                               (add (shl VGPR_32, 11)),
+                               (add (shl VGPR_32, 12)),
+                               (add (shl VGPR_32, 13)),
+                               (add (shl VGPR_32, 14)),
+                               (add (shl VGPR_32, 15))]>;
+
+//===----------------------------------------------------------------------===//
+//  Register classes used as source and destination
+//===----------------------------------------------------------------------===//
+
+// Subset of SReg_32 without M0 for SMRD instructions and alike.
+// See comments in SIInstructions.td for more info.
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+  (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
+   TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+  let AllocationPriority = 7;
+}
+
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+  (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
+  let AllocationPriority = 7;
+}
+
+// Register class for all scalar registers (SGPRs + Special Registers)
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+  (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
+  let AllocationPriority = 7;
+}
+
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+  let CopyCost = 1;
+  let AllocationPriority = 8;
+}
+
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+  let isAllocatable = 0;
+}
+
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+  (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> {
+  let CopyCost = 1;
+  let AllocationPriority = 8;
+}
+
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+  (add SReg_64_XEXEC, EXEC)> {
+  let CopyCost = 1;
+  let AllocationPriority = 8;
+}
+
+// Requires 2 s_mov_b64 to copy
+let CopyCost = 2 in {
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> {
+  let AllocationPriority = 10;
+}
+
+def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> {
+  let isAllocatable = 0;
+}
+
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+  let AllocationPriority = 10;
+}
+
+} // End CopyCost = 2
+
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
+  // Requires 4 s_mov_b64 to copy
+  let CopyCost = 4;
+  let AllocationPriority = 11;
+}
+
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+  // Requires 8 s_mov_b64 to copy
+  let CopyCost = 8;
+  let AllocationPriority = 12;
+}
+
+// Register class for all vector registers (VGPRs + Interploation Registers)
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+  let Size = 64;
+
+  // Requires 2 v_mov_b32 to copy
+  let CopyCost = 2;
+  let AllocationPriority = 2;
+}
+
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
+  let Size = 96;
+
+  // Requires 3 v_mov_b32 to copy
+  let CopyCost = 3;
+  let AllocationPriority = 3;
+}
+
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+  let Size = 128;
+
+  // Requires 4 v_mov_b32 to copy
+  let CopyCost = 4;
+  let AllocationPriority = 4;
+}
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
+  let Size = 256;
+  let CopyCost = 8;
+  let AllocationPriority = 5;
+}
+
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+  let Size = 512;
+  let CopyCost = 16;
+  let AllocationPriority = 6;
+}
+
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+  let Size = 32;
+}
+
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+                          (add VGPR_32, SReg_32)> {
+  let isAllocatable = 0;
+}
+
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+  let isAllocatable = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register operands
+//===----------------------------------------------------------------------===//
+
+class RegImmMatcher<string name> : AsmOperandClass {
+  let Name = name;
+  let RenderMethod = "addRegOrImmOperands";
+}
+
+multiclass SIRegOperand <string rc, string MatchName, string opType> {
+  let OperandNamespace = "AMDGPU" in {
+    def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_INT16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
+      let DecoderMethod = "decodeOperand_VSrc16";
+    }
+
+    def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_FP16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
+      let DecoderMethod = "decodeOperand_VSrc16";
+    }
+
+    def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_INT32";
+      let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
+    }
+
+    def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_FP32";
+      let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
+    }
+
+    def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+      let OperandType = opType#"_INT64";
+      let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
+    }
+
+    def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
+      let OperandType = opType#"_FP64";
+      let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
+    }
+  }
+}
+
+// FIXME: 64-bit sources can sometimes use 32-bit constants.
+multiclass RegImmOperand <string rc, string MatchName>
+  : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
+
+multiclass RegInlineOperand <string rc, string MatchName>
+  : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+
+//===----------------------------------------------------------------------===//
+//  SSrc_* Operands with an SGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+defm SSrc : RegImmOperand<"SReg", "SSrc">;
+
+//===----------------------------------------------------------------------===//
+//  SCSrc_* Operands with an SGPR or a inline constant
+//===----------------------------------------------------------------------===//
+
+defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
+
+//===----------------------------------------------------------------------===//
+//  VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
+defm VSrc : RegImmOperand<"VS", "VSrc">;
+
+def VSrc_128 : RegisterOperand<VReg_128>;
+
+//===----------------------------------------------------------------------===//
+//  VSrc_* Operands with an VGPR
+//===----------------------------------------------------------------------===//
+
+// This is for operands with the enum(9), VSrc encoding restriction,
+// but only allows VGPRs.
+def VRegSrc_32 : RegisterOperand<VGPR_32> {
+  //let ParserMatchClass = RegImmMatcher<"VRegSrc32">;
+  let DecoderMethod = "DecodeVS_32RegisterClass";
+}
+
+//===----------------------------------------------------------------------===//
+//  VCSrc_* Operands with an SGPR, VGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
new file mode 100644
index 000000000000..be27966fd5f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -0,0 +1,138 @@
+//===-- SISchedule.td - SI Scheduling definitons -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineModel definitions for Southern Islands (SI)
+//
+//===----------------------------------------------------------------------===//
+
+def : PredicateProlog<[{
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+def WriteBranch : SchedWrite;
+def WriteExport : SchedWrite;
+def WriteLDS    : SchedWrite;
+def WriteSALU   : SchedWrite;
+def WriteSMEM   : SchedWrite;
+def WriteVMEM   : SchedWrite;
+def WriteBarrier : SchedWrite;
+
+// Vector ALU instructions
+def Write32Bit         : SchedWrite;
+def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
+
+def WriteFloatFMA   : SchedWrite;
+
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// half rate f64 instruction (same as v_add_f64)
+def WriteDoubleAdd  : SchedWrite;
+
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)
+
+class SISchedMachineModel : SchedMachineModel {
+  let CompleteModel = 1;
+  // MicroOpBufferSize = 1 means that instructions will always be added
+  // the ready queue when they become available.  This exposes them
+  // to the register pressure analysis.
+  let MicroOpBufferSize = 1;
+  let IssueWidth = 1;
+  let PostRAScheduler = 1;
+}
+
+def SIFullSpeedModel : SISchedMachineModel;
+def SIQuarterSpeedModel : SISchedMachineModel;
+
+// XXX: Are the resource counts correct?
+def HWBranch : ProcResource<1> {
+  let BufferSize = 1;
+}
+def HWExport : ProcResource<1> {
+  let BufferSize = 7; // Taken from S_WAITCNT
+}
+def HWLGKM   : ProcResource<1> {
+  let BufferSize = 31;  // Taken from S_WAITCNT
+}
+def HWSALU   : ProcResource<1> {
+  let BufferSize = 1;
+}
+def HWVMEM   : ProcResource<1> {
+  let BufferSize = 15;  // Taken from S_WAITCNT
+}
+def HWVALU   : ProcResource<1> {
+  let BufferSize = 1;
+}
+
+class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
+                 int latency> : WriteRes<write, resources> {
+  let Latency = latency;
+}
+
+class HWVALUWriteRes<SchedWrite write, int latency> :
+  HWWriteRes<write, [HWVALU], latency>;
+
+
+// The latency numbers are taken from AMD Accelerated Parallel Processing
+// guide. They may not be accurate.
+
+// The latency values are 1 / (operations / cycle) / 4.
+multiclass SICommonWriteRes {
+
+  def : HWWriteRes<WriteBranch,  [HWBranch], 8>;
+  def : HWWriteRes<WriteExport,  [HWExport], 4>;
+  def : HWWriteRes<WriteLDS,     [HWLGKM],   5>; // Can be between 2 and 64
+  def : HWWriteRes<WriteSALU,    [HWSALU],   1>;
+  def : HWWriteRes<WriteSMEM,    [HWLGKM],   5>;
+  def : HWWriteRes<WriteVMEM,    [HWVMEM],   80>;
+  def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
+
+  def : HWVALUWriteRes<Write32Bit,         1>;
+  def : HWVALUWriteRes<Write64Bit,         2>;
+  def : HWVALUWriteRes<WriteQuarterRate32, 4>;
+}
+
+def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;
+def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>;
+def WriteCopy : SchedWriteVariant<[
+    SchedVar<PredIsVGPR32Copy, [Write32Bit]>,
+    SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
+    SchedVar<NoSchedPred, [WriteSALU]>]>;
+
+let SchedModel = SIFullSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA,   1>;
+def : HWVALUWriteRes<WriteDouble,     4>;
+def : HWVALUWriteRes<WriteDoubleAdd,  2>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = SIFullSpeedModel
+
+let SchedModel = SIQuarterSpeedModel in {
+
+defm : SICommonWriteRes;
+
+def : HWVALUWriteRes<WriteFloatFMA, 16>;
+def : HWVALUWriteRes<WriteDouble,   16>;
+def : HWVALUWriteRes<WriteDoubleAdd, 8>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+}  // End SchedModel = SIQuarterSpeedModel
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
new file mode 100644
index 000000000000..b27d7c691032
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -0,0 +1,512 @@
+//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass tries to use the 32-bit encoding for instructions when possible.
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+          "Number of 64-bit instruction reduced to 32-bit.");
+STATISTIC(NumLiteralConstantsFolded,
+          "Number of literal constants folded into 32-bit instructions.");
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIShrinkInstructions() : MachineFunctionPass(ID) {
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Shrink Instructions"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
+                "SI Shrink Instructions", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+  return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+                   const MachineRegisterInfo &MRI) {
+  if (!MO->isReg())
+    return false;
+
+  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+                      const SIRegisterInfo &TRI,
+                      const MachineRegisterInfo &MRI) {
+
+  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  // Can't shrink instruction with three operands.
+  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
+  // a special case for it.  It can only be shrunk if the third operand
+  // is vcc.  We should handle this the same way we handle vopc, by addding
+  // a register allocation hint pre-regalloc and then do the shrining
+  // post-regalloc.
+  if (Src2) {
+    switch (MI.getOpcode()) {
+      default: return false;
+
+      case AMDGPU::V_MAC_F32_e64:
+      case AMDGPU::V_MAC_F16_e64:
+        if (!isVGPR(Src2, TRI, MRI) ||
+            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
+          return false;
+        break;
+
+      case AMDGPU::V_CNDMASK_B32_e64:
+        break;
+    }
+  }
+
+  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mod =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+    return false;
+
+  // We don't need to check src0, all input types are legal, so just make sure
+  // src0 isn't using any modifiers.
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+    return false;
+
+  // Check output modifiers
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+    return false;
+
+  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
+}
+
+/// \brief This function checks \p MI for operands defined by a move immediate
+/// instruction and then folds the literal constant into the instruction if it
+/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
+/// and will only fold literal constants if we are still in SSA.
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
+
+  if (!MRI.isSSA())
+    return;
+
+  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
+
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+
+  // Only one literal constant is allowed per instruction, so if src0 is a
+  // literal constant then we can't do any folding.
+  if (TII->isLiteralConstant(MI, Src0Idx))
+    return;
+
+  // Try to fold Src0
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
+  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
+    unsigned Reg = Src0.getReg();
+    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+    if (Def && Def->isMoveImmediate()) {
+      MachineOperand &MovSrc = Def->getOperand(1);
+      bool ConstantFolded = false;
+
+      if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+                             isUInt<32>(MovSrc.getImm()))) {
+        Src0.ChangeToImmediate(MovSrc.getImm());
+        ConstantFolded = true;
+      }
+      if (ConstantFolded) {
+        if (MRI.use_empty(Reg))
+          Def->eraseFromParent();
+        ++NumLiteralConstantsFolded;
+        return;
+      }
+    }
+  }
+
+  // We have failed to fold src0, so commute the instruction and try again.
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
+    foldImmediates(MI, TII, MRI, false);
+
+}
+
+// Copy MachineOperand with all flags except setting it as implicit.
+static void copyFlagsToImplicitVCC(MachineInstr &MI,
+                                   const MachineOperand &Orig) {
+
+  for (MachineOperand &Use : MI.implicit_operands()) {
+    if (Use.getReg() == AMDGPU::VCC) {
+      Use.setIsUndef(Orig.isUndef());
+      Use.setIsKill(Orig.isKill());
+      return;
+    }
+  }
+}
+
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isInt<16>(Src.getImm()) &&
+    !TII->isInlineConstant(*Src.getParent(),
+                           Src.getParent()->getOperandNo(&Src));
+}
+
+static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isUInt<16>(Src.getImm()) &&
+    !TII->isInlineConstant(*Src.getParent(),
+                           Src.getParent()->getOperandNo(&Src));
+}
+
+static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
+                                 const MachineOperand &Src,
+                                 bool &IsUnsigned) {
+  if (isInt<16>(Src.getImm())) {
+    IsUnsigned = false;
+    return !TII->isInlineConstant(Src);
+  }
+
+  if (isUInt<16>(Src.getImm())) {
+    IsUnsigned = true;
+    return !TII->isInlineConstant(Src);
+  }
+
+  return false;
+}
+
+/// \returns true if the constant in \p Src should be replaced with a bitreverse
+/// of an inline immediate.
+static bool isReverseInlineImm(const SIInstrInfo *TII,
+                               const MachineOperand &Src,
+                               int32_t &ReverseImm) {
+  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
+    return false;
+
+  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
+  return ReverseImm >= -16 && ReverseImm <= 64;
+}
+
+/// Copy implicit register operands from specified instruction to this
+/// instruction that are not part of the instruction definition.
+static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
+                                 const MachineInstr &MI) {
+  for (unsigned i = MI.getDesc().getNumOperands() +
+         MI.getDesc().getNumImplicitUses() +
+         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
+      NewMI.addOperand(MF, MO);
+  }
+}
+
+static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
+  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
+  // get constants on the RHS.
+  if (!MI.getOperand(0).isReg())
+    TII->commuteInstruction(MI, false, 0, 1);
+
+  const MachineOperand &Src1 = MI.getOperand(1);
+  if (!Src1.isImm())
+    return;
+
+  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
+  if (SOPKOpc == -1)
+    return;
+
+  // eq/ne is special because the imm16 can be treated as signed or unsigned,
+  // and initially selectd to the unsigned versions.
+  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
+    bool HasUImm;
+    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
+      if (!HasUImm) {
+        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
+          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
+      }
+
+      MI.setDesc(TII->get(SOPKOpc));
+    }
+
+    return;
+  }
+
+  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
+
+  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
+      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
+    MI.setDesc(NewDesc);
+  }
+}
+
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
+        // If this has a literal constant source that is the same as the
+        // reversed bits of an inline immediate, replace with a bitreverse of
+        // that constant. This saves 4 bytes in the common case of materializing
+        // sign bits.
+
+        // Test if we are after regalloc. We only want to do this after any
+        // optimizations happen because this will confuse them.
+        // XXX - not exactly a check for post-regalloc run.
+        MachineOperand &Src = MI.getOperand(1);
+        if (Src.isImm() &&
+            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+          int32_t ReverseImm;
+          if (isReverseInlineImm(TII, Src, ReverseImm)) {
+            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
+            Src.setImm(ReverseImm);
+            continue;
+          }
+        }
+      }
+
+      // Combine adjacent s_nops to use the immediate operand encoding how long
+      // to wait.
+      //
+      // s_nop N
+      // s_nop M
+      //  =>
+      // s_nop (N + M)
+      if (MI.getOpcode() == AMDGPU::S_NOP &&
+          Next != MBB.end() &&
+          (*Next).getOpcode() == AMDGPU::S_NOP) {
+
+        MachineInstr &NextMI = *Next;
+        // The instruction encodes the amount to wait with an offset of 1,
+        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
+        // after adding.
+        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
+        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
+
+        // Make sure we don't overflow the bounds.
+        if (Nop0 + Nop1 <= 8) {
+          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
+          MI.eraseFromParent();
+        }
+
+        continue;
+      }
+
+      // FIXME: We also need to consider movs of constant operands since
+      // immediate operands are not folded if they have more than one use, and
+      // the operand folding pass is unaware if the immediate will be free since
+      // it won't know if the src == dest constraint will end up being
+      // satisfied.
+      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+          MI.getOpcode() == AMDGPU::S_MUL_I32) {
+        const MachineOperand *Dest = &MI.getOperand(0);
+        MachineOperand *Src0 = &MI.getOperand(1);
+        MachineOperand *Src1 = &MI.getOperand(2);
+
+        if (!Src0->isReg() && Src1->isReg()) {
+          if (TII->commuteInstruction(MI, false, 1, 2))
+            std::swap(Src0, Src1);
+        }
+
+        // FIXME: This could work better if hints worked with subregisters. If
+        // we have a vector add of a constant, we usually don't get the correct
+        // allocation due to the subregister usage.
+        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+            Src0->isReg()) {
+          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+          continue;
+        }
+
+        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
+          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
+            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+            MI.setDesc(TII->get(Opc));
+            MI.tieOperands(0, 1);
+          }
+        }
+      }
+
+      // Try to use s_cmpk_*
+      if (MI.isCompare() && TII->isSOPC(MI)) {
+        shrinkScalarCompare(TII, MI);
+        continue;
+      }
+
+      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
+      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+        const MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src = MI.getOperand(1);
+
+        if (Src.isImm() &&
+            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+          int32_t ReverseImm;
+          if (isKImmOperand(TII, Src))
+            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
+            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
+            Src.setImm(ReverseImm);
+          }
+        }
+
+        continue;
+      }
+
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+        continue;
+
+      if (!canShrink(MI, TII, TRI, MRI)) {
+        // Try commuting the instruction and see if that enables us to shrink
+        // it.
+        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
+            !canShrink(MI, TII, TRI, MRI))
+          continue;
+      }
+
+      // getVOPe32 could be -1 here if we started with an instruction that had
+      // a 32-bit encoding and then commuted it to an instruction that did not.
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+        continue;
+
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+      if (TII->isVOPC(Op32)) {
+        unsigned DstReg = MI.getOperand(0).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+          // VOPC instructions can only write to the VCC register. We can't
+          // force them to use VCC here, because this is only one register and
+          // cannot deal with sequences which would require multiple copies of
+          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+          //
+          // So, instead of forcing the instruction to write to VCC, we provide
+          // a hint to the register allocator to use VCC and then we we will run
+          // this pass again after RA and shrink it if it outputs to VCC.
+          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
+          continue;
+        }
+        if (DstReg != AMDGPU::VCC)
+          continue;
+      }
+
+      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
+        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
+        // instructions.
+        const MachineOperand *Src2 =
+            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (!Src2->isReg())
+          continue;
+        unsigned SReg = Src2->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
+          continue;
+        }
+        if (SReg != AMDGPU::VCC)
+          continue;
+      }
+
+      // We can shrink this instruction
+      DEBUG(dbgs() << "Shrinking " << MI);
+
+      MachineInstrBuilder Inst32 =
+          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
+
+      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
+      // For VOPC instructions, this is replaced by an implicit def of vcc.
+      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+      if (Op32DstIdx != -1) {
+        // dst
+        Inst32.addOperand(MI.getOperand(0));
+      } else {
+        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+               "Unexpected case");
+      }
+
+
+      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+
+      const MachineOperand *Src1 =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (Src1)
+        Inst32.addOperand(*Src1);
+
+      const MachineOperand *Src2 =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2) {
+        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+        if (Op32Src2Idx != -1) {
+          Inst32.addOperand(*Src2);
+        } else {
+          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+          // replaced with an implicit read of vcc. This was already added
+          // during the initial BuildMI, so find it to preserve the flags.
+          copyFlagsToImplicitVCC(*Inst32, *Src2);
+        }
+      }
+
+      ++NumInstructionsShrunk;
+
+      // Copy extra operands not present in the instruction definition.
+      copyExtraImplicitOps(*Inst32, MF, MI);
+
+      MI.eraseFromParent();
+      foldImmediates(*Inst32, TII, MRI);
+
+      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+
+
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
new file mode 100644
index 000000000000..aad68537f779
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -0,0 +1,156 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass removes performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+///   - v16i8 is used for constant memory resource descriptors.  This type is
+///      legal for some compute APIs, and we don't want to declare it as legal
+///      in the backend, because we want the legalizer to expand all v16i8
+///      operations.
+/// v1* => *
+///   - Having v1* types complicates the legalizer and we can easily replace
+///   - them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+                       public InstVisitor<SITypeRewriter> {
+
+  static char ID;
+  Module *Mod;
+  Type *v16i8;
+  Type *v4i32;
+
+public:
+  SITypeRewriter() : FunctionPass(ID) { }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override { return "SI Type Rewriter"; }
+  void visitLoadInst(LoadInst &I);
+  void visitCallInst(CallInst &I);
+  void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+  Mod = &M;
+  v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+  v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
+  return false;
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+  if (!AMDGPU::isShader(F.getCallingConv()))
+    return false;
+
+  visit(F);
+  visit(F);
+
+  return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  Type *PtrTy = Ptr->getType();
+  Type *ElemTy = PtrTy->getPointerElementType();
+  IRBuilder<> Builder(&I);
+  if (ElemTy == v16i8)  {
+    Value *BitCast = Builder.CreateBitCast(Ptr,
+        PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
+    LoadInst *Load = Builder.CreateLoad(BitCast);
+    SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+    I.getAllMetadataOtherThanDebugLoc(MD);
+    for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+      Load->setMetadata(MD[i].first, MD[i].second);
+    }
+    Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+    I.replaceAllUsesWith(BitCastLoad);
+    I.eraseFromParent();
+  }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+  IRBuilder<> Builder(&I);
+
+  SmallVector <Value*, 8> Args;
+  SmallVector <Type*, 8> Types;
+  bool NeedToReplace = false;
+  Function *F = I.getCalledFunction();
+  if (!F)
+    return;
+
+  std::string Name = F->getName();
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    Value *Arg = I.getArgOperand(i);
+    if (Arg->getType() == v16i8) {
+      Args.push_back(Builder.CreateBitCast(Arg, v4i32));
+      Types.push_back(v4i32);
+      NeedToReplace = true;
+      Name = Name + ".v4i32";
+    } else if (Arg->getType()->isVectorTy() &&
+               Arg->getType()->getVectorNumElements() == 1 &&
+               Arg->getType()->getVectorElementType() ==
+                                              Type::getInt32Ty(I.getContext())){
+      Type *ElementTy = Arg->getType()->getVectorElementType();
+      std::string TypeName = "i32";
+      InsertElementInst *Def = cast<InsertElementInst>(Arg);
+      Args.push_back(Def->getOperand(1));
+      Types.push_back(ElementTy);
+      std::string VecTypeName = "v1" + TypeName;
+      Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+      NeedToReplace = true;
+    } else {
+      Args.push_back(Arg);
+      Types.push_back(Arg->getType());
+    }
+  }
+
+  if (!NeedToReplace) {
+    return;
+  }
+  Function *NewF = Mod->getFunction(Name);
+  if (!NewF) {
+    NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+    NewF->setAttributes(F->getAttributes());
+  }
+  I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+  I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+  IRBuilder<> Builder(&I);
+  if (I.getDestTy() != v4i32) {
+    return;
+  }
+
+  if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+    if (Op->getSrcTy() == v4i32) {
+      I.replaceAllUsesWith(Op->getOperand(0));
+      I.eraseFromParent();
+    }
+  }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+  return new SITypeRewriter();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 000000000000..a613a220e29d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,730 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run on the
+/// scheduled machine IR but before register coalescing, so that machine SSA is
+/// available for analysis. It ensures that WQM is enabled when necessary, but
+/// disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+///   S_MOV_B64 LiveMask, EXEC
+///   S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
+///   ...
+///   S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// Basic blocks are always exited in WQM as long as some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+///  (1) at the top level (outside of control flow statements, and as long as
+///      kill hasn't been used), one SGPR can be saved by recovering WQM from
+///      the LiveMask (this is implemented for the entry block).
+///
+///  (2) when entire regions (e.g. if-else blocks or entire loops) only
+///      consist of exact and don't-care instructions, the switch only has to
+///      be done at the entry and exit points rather than potentially in each
+///      block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum {
+  StateWQM = 0x1,
+  StateExact = 0x2,
+};
+
+struct PrintState {
+public:
+  int State;
+
+  explicit PrintState(int State) : State(State) {}
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+  if (PS.State & StateWQM)
+    OS << "WQM";
+  if (PS.State & StateExact) {
+    if (PS.State & StateWQM)
+      OS << '|';
+    OS << "Exact";
+  }
+
+  return OS;
+}
+
+struct InstrInfo {
+  char Needs = 0;
+  char OutNeeds = 0;
+};
+
+struct BlockInfo {
+  char Needs = 0;
+  char InNeeds = 0;
+  char OutNeeds = 0;
+};
+
+struct WorkItem {
+  MachineBasicBlock *MBB = nullptr;
+  MachineInstr *MI = nullptr;
+
+  WorkItem() = default;
+  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
+  WorkItem(MachineInstr *MI) : MI(MI) {}
+};
+
+class SIWholeQuadMode : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
+
+  DenseMap<const MachineInstr *, InstrInfo> Instructions;
+  DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+  SmallVector<MachineInstr *, 1> LiveMaskQueries;
+
+  void printInfo();
+
+  void markInstruction(MachineInstr &MI, char Flag,
+                       std::vector<WorkItem> &Worklist);
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
+  char analyzeFunction(MachineFunction &MF);
+
+  bool requiresCorrectState(const MachineInstr &MI) const;
+
+  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator Before);
+  MachineBasicBlock::iterator
+  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+                   MachineBasicBlock::iterator Last, bool PreferLast,
+                   bool SaveSCC);
+  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+               unsigned SaveWQM, unsigned LiveMaskReg);
+  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+             unsigned SavedWQM);
+  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
+public:
+  static char ID;
+
+  SIWholeQuadMode() :
+    MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                    false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+  return new SIWholeQuadMode;
+}
+
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+           << "  InNeeds = " << PrintState(BII.second.InNeeds)
+           << ", Needs = " << PrintState(BII.second.Needs)
+           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+    }
+  }
+}
+
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  InstrInfo &II = Instructions[&MI];
+
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  // Ignore if the instruction is already marked. The typical case is that we
+  // mark an instruction WQM multiple times, but for atomics it can happen that
+  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
+  // letting the atomic run in StateExact is correct as per the relevant specs.
+  if (II.Needs)
+    return;
+
+  II.Needs = Flag;
+  Worklist.push_back(&MI);
+}
+
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    unsigned Reg = Use.getReg();
+
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
+        continue;
+
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
+    }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
+  }
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements.
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
+                                       std::vector<WorkItem> &Worklist) {
+  char GlobalFlags = 0;
+  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+
+  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+    MachineBasicBlock &MBB = *BI;
+
+    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+      MachineInstr &MI = *II;
+      unsigned Opcode = MI.getOpcode();
+      char Flags = 0;
+
+      if (TII->isDS(Opcode)) {
+        Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad, they just require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
+        continue;
+      } else if (TII->isDisableWQM(MI)) {
+        Flags = StateExact;
+      } else {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
+          LiveMaskQueries.push_back(&MI);
+        } else if (WQMOutputs) {
+          // The function is in machine SSA form, which means that physical
+          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // only used, outputs are only defined.
+          for (const MachineOperand &MO : MI.defs()) {
+            if (!MO.isReg())
+              continue;
+
+            unsigned Reg = MO.getReg();
+
+            if (!TRI->isVirtualRegister(Reg) &&
+                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+              Flags = StateWQM;
+              break;
+            }
+          }
+        }
+
+        if (!Flags)
+          continue;
+      }
+
+      markInstruction(MI, Flags, Worklist);
+      GlobalFlags |= Flags;
+    }
+  }
+
+  return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+                                           std::vector<WorkItem>& Worklist) {
+  MachineBasicBlock *MBB = MI.getParent();
+  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
+  BlockInfo &BI = Blocks[MBB];
+
+  // Control flow-type instructions and stores to temporary memory that are
+  // followed by WQM computations must themselves be in WQM.
+  if ((II.OutNeeds & StateWQM) && !II.Needs &&
+      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
+    Instructions[&MI].Needs = StateWQM;
+    II.Needs = StateWQM;
+  }
+
+  // Propagate to block level
+  BI.Needs |= II.Needs;
+  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
+    BI.InNeeds |= II.Needs;
+    Worklist.push_back(MBB);
+  }
+
+  // Propagate backwards within block
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    char InNeeds = II.Needs | II.OutNeeds;
+    if (!PrevMI->isPHI()) {
+      InstrInfo &PrevII = Instructions[PrevMI];
+      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
+        PrevII.OutNeeds |= InNeeds;
+        Worklist.push_back(PrevMI);
+      }
+    }
+  }
+
+  // Propagate WQM flag to instruction inputs
+  assert(II.Needs != (StateWQM | StateExact));
+
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
+}
+
+void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
+                                     std::vector<WorkItem>& Worklist) {
+  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
+
+  // Propagate through instructions
+  if (!MBB.empty()) {
+    MachineInstr *LastMI = &*MBB.rbegin();
+    InstrInfo &LastII = Instructions[LastMI];
+    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.OutNeeds;
+      Worklist.push_back(LastMI);
+    }
+  }
+
+  // Predecessor blocks must provide for our WQM/Exact needs.
+  for (MachineBasicBlock *Pred : MBB.predecessors()) {
+    BlockInfo &PredBI = Blocks[Pred];
+    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
+      continue;
+
+    PredBI.OutNeeds |= BI.InNeeds;
+    PredBI.InNeeds |= BI.InNeeds;
+    Worklist.push_back(Pred);
+  }
+
+  // All successors must be prepared to accept the same set of WQM/Exact data.
+  for (MachineBasicBlock *Succ : MBB.successors()) {
+    BlockInfo &SuccBI = Blocks[Succ];
+    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
+      continue;
+
+    SuccBI.InNeeds |= BI.OutNeeds;
+    Worklist.push_back(Succ);
+  }
+}
+
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
+  std::vector<WorkItem> Worklist;
+  char GlobalFlags = scanInstructions(MF, Worklist);
+
+  while (!Worklist.empty()) {
+    WorkItem WI = Worklist.back();
+    Worklist.pop_back();
+
+    if (WI.MI)
+      propagateInstruction(*WI.MI, Worklist);
+    else
+      propagateBlock(*WI.MBB, Worklist);
+  }
+
+  return GlobalFlags;
+}
+
+/// Whether \p MI really requires the exec state computed during analysis.
+///
+/// Scalar instructions must occasionally be marked WQM for correct propagation
+/// (e.g. thread masks leading up to branches), but when it comes to actual
+/// execution, they don't care about EXEC.
+bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
+  if (MI.isTerminator())
+    return true;
+
+  // Skip instructions that are not affected by EXEC
+  if (TII->isScalarUnit(MI))
+    return false;
+
+  // Generic instructions such as COPY will either disappear by register
+  // coalescing or be lowered to SALU or VALU instructions.
+  if (MI.isTransient()) {
+    if (MI.getNumExplicitOperands() >= 1) {
+      const MachineOperand &Op = MI.getOperand(0);
+      if (Op.isReg()) {
+        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+          // SGPR instructions are not affected by EXEC
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+MachineBasicBlock::iterator
+SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator Before) {
+  unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr *Save =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
+          .addReg(AMDGPU::SCC);
+  MachineInstr *Restore =
+      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
+          .addReg(SaveReg);
+
+  LIS->InsertMachineInstrInMaps(*Save);
+  LIS->InsertMachineInstrInMaps(*Restore);
+  LIS->createAndComputeVirtRegInterval(SaveReg);
+
+  return Restore;
+}
+
+// Return an iterator in the (inclusive) range [First, Last] at which
+// instructions can be safely inserted, keeping in mind that some of the
+// instructions we want to add necessarily clobber SCC.
+MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
+    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
+  if (!SaveSCC)
+    return PreferLast ? Last : First;
+
+  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+  auto MBBE = MBB.end();
+  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
+                                     : LIS->getMBBEndIdx(&MBB);
+  SlotIndex LastIdx =
+      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
+  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
+  const LiveRange::Segment *S;
+
+  for (;;) {
+    S = LR.getSegmentContaining(Idx);
+    if (!S)
+      break;
+
+    if (PreferLast) {
+      SlotIndex Next = S->start.getBaseIndex();
+      if (Next < FirstIdx)
+        break;
+      Idx = Next;
+    } else {
+      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
+      if (Next > LastIdx)
+        break;
+      Idx = Next;
+    }
+  }
+
+  MachineBasicBlock::iterator MBBI;
+
+  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
+    MBBI = MI;
+  else {
+    assert(Idx == LIS->getMBBEndIdx(&MBB));
+    MBBI = MBB.end();
+  }
+
+  if (S)
+    MBBI = saveSCC(MBB, MBBI);
+
+  return MBBI;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SaveWQM, unsigned LiveMaskReg) {
+  MachineInstr *MI;
+
+  if (SaveWQM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                 SaveWQM)
+             .addReg(LiveMaskReg);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
+             .addReg(LiveMaskReg);
+  }
+
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SavedWQM) {
+  MachineInstr *MI;
+
+  if (SavedWQM) {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+             .addReg(SavedWQM);
+  } else {
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC);
+  }
+
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+                                   bool isEntry) {
+  auto BII = Blocks.find(&MBB);
+  if (BII == Blocks.end())
+    return;
+
+  const BlockInfo &BI = BII->second;
+
+  if (!(BI.InNeeds & StateWQM))
+    return;
+
+  // This is a non-entry block that is WQM throughout, so no need to do
+  // anything.
+  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+    return;
+
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
+  unsigned SavedWQMReg = 0;
+  bool WQMFromExec = isEntry;
+  char State = isEntry ? StateExact : StateWQM;
+
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  if (isEntry)
+    ++II; // Skip the instruction that saves LiveMask
+
+  MachineBasicBlock::iterator First = IE;
+  for (;;) {
+    MachineBasicBlock::iterator Next = II;
+    char Needs = 0;
+    char OutNeeds = 0;
+
+    if (First == IE)
+      First = II;
+
+    if (II != IE) {
+      MachineInstr &MI = *II;
+
+      if (requiresCorrectState(MI)) {
+        auto III = Instructions.find(&MI);
+        if (III != Instructions.end()) {
+          Needs = III->second.Needs;
+          OutNeeds = III->second.OutNeeds;
+        }
+      }
+
+      if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+        Needs = StateExact;
+
+      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+        MI.getOperand(3).setImm(1);
+
+      ++Next;
+    } else {
+      // End of basic block
+      if (BI.OutNeeds & StateWQM)
+        Needs = StateWQM;
+      else if (BI.OutNeeds == StateExact)
+        Needs = StateExact;
+    }
+
+    if (Needs) {
+      if (Needs != State) {
+        MachineBasicBlock::iterator Before =
+            prepareInsertion(MBB, First, II, Needs == StateWQM,
+                             Needs == StateExact || WQMFromExec);
+
+        if (Needs == StateExact) {
+          if (!WQMFromExec && (OutNeeds & StateWQM))
+            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+        } else {
+          assert(WQMFromExec == (SavedWQMReg == 0));
+
+          toWQM(MBB, Before, SavedWQMReg);
+
+          if (SavedWQMReg) {
+            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+            SavedWQMReg = 0;
+          }
+        }
+
+        State = Needs;
+      }
+
+      First = IE;
+    }
+
+    if (II == IE)
+      break;
+    II = Next;
+  }
+}
+
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  for (MachineInstr *MI : LiveMaskQueries) {
+    const DebugLoc &DL = MI->getDebugLoc();
+    unsigned Dest = MI->getOperand(0).getReg();
+    MachineInstr *Copy =
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+            .addReg(LiveMaskReg);
+
+    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
+    MI->eraseFromParent();
+  }
+}
+
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
+    return false;
+
+  Instructions.clear();
+  Blocks.clear();
+  LiveMaskQueries.clear();
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
+
+  char GlobalFlags = analyzeFunction(MF);
+  if (!(GlobalFlags & StateWQM)) {
+    lowerLiveMaskQueries(AMDGPU::EXEC);
+    return !LiveMaskQueries.empty();
+  }
+
+  // Store a copy of the original live mask when required
+  unsigned LiveMaskReg = 0;
+  {
+    MachineBasicBlock &Entry = MF.front();
+    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
+
+    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::COPY), LiveMaskReg)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
+    }
+
+    if (GlobalFlags == StateWQM) {
+      // For a shader that needs only WQM, we can just set it once.
+      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+              AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC);
+
+      lowerLiveMaskQueries(LiveMaskReg);
+      // EntryMI may become invalid here
+      return true;
+    }
+  }
+
+  DEBUG(printInfo());
+
+  lowerLiveMaskQueries(LiveMaskReg);
+
+  // Handle the general case
+  for (auto BII : Blocks)
+    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+
+  // Physical registers like SCC aren't tracked by default anyway, so just
+  // removing the ranges we computed is the simplest option for maintaining
+  // the analysis results.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
new file mode 100644
index 000000000000..02656483cd74
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -0,0 +1,535 @@
+//===---- SMInstructions.td - Scalar Memory Instruction Defintions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
+                                  NamedMatchClass<"SMRDOffset8">> {
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def smrd_offset_20 : NamedOperandU32<"SMRDOffset20",
+                                  NamedMatchClass<"SMRDOffset20">> {
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory classes
+//===----------------------------------------------------------------------===//
+
+class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> :
+  InstSI <outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+
+  let LGKM_CNT = 1;
+  let SMRD = 1;
+  let mayStore = 0;
+  let mayLoad = 1;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteSMEM];
+  let SubtargetPredicate = isGCN;
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  bits<1> has_sbase = 1;
+  bits<1> has_sdst = 1;
+  bit has_glc = 0;
+  bits<1> has_offset = 1;
+  bits<1> offset_is_imm = 0;
+}
+
+class SM_Real <SM_Pseudo ps>
+  : InstSI<ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+
+  // encoding
+  bits<7>  sbase;
+  bits<7>  sdst;
+  bits<32> offset;
+  bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0);
+}
+
+class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
+  : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
+  RegisterClass BaseClass;
+  let mayLoad = 1;
+  let mayStore = 0;
+  let has_glc = 1;
+}
+
+class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern = []>
+  : SM_Pseudo<opName, (outs), ins, asmOps, pattern> {
+  RegisterClass BaseClass;
+  RegisterClass SrcClass;
+  let mayLoad = 0;
+  let mayStore = 1;
+  let has_glc = 1;
+  let ScalarStore = 1;
+}
+
+multiclass SM_Pseudo_Loads<string opName,
+                           RegisterClass baseClass,
+                           RegisterClass dstClass> {
+  def _IMM  : SM_Load_Pseudo <opName,
+                              (outs dstClass:$sdst),
+                              (ins baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+                              " $sdst, $sbase, $offset$glc", []> {
+    let offset_is_imm = 1;
+    let BaseClass = baseClass;
+    let PseudoInstr = opName # "_IMM";
+    let has_glc = 1;
+  }
+
+  def _SGPR  : SM_Load_Pseudo <opName,
+                              (outs dstClass:$sdst),
+                              (ins baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+                              " $sdst, $sbase, $offset$glc", []> {
+    let BaseClass = baseClass;
+    let PseudoInstr = opName # "_SGPR";
+    let has_glc = 1;
+  }
+}
+
+multiclass SM_Pseudo_Stores<string opName,
+                           RegisterClass baseClass,
+                           RegisterClass srcClass> {
+  def _IMM  : SM_Store_Pseudo <opName,
+    (ins srcClass:$sdata, baseClass:$sbase, i32imm:$offset, i1imm:$glc),
+    " $sdata, $sbase, $offset$glc", []> {
+    let offset_is_imm = 1;
+    let BaseClass = baseClass;
+    let SrcClass = srcClass;
+    let PseudoInstr = opName # "_IMM";
+  }
+
+  def _SGPR  : SM_Store_Pseudo <opName,
+    (ins srcClass:$sdata, baseClass:$sbase, SReg_32:$soff, i1imm:$glc),
+    " $sdata, $sbase, $offset$glc", []> {
+    let BaseClass = baseClass;
+    let SrcClass = srcClass;
+    let PseudoInstr = opName # "_SGPR";
+  }
+}
+
+class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
+  opName, (outs SReg_64_XEXEC:$sdst), (ins),
+  " $sdst", [(set i64:$sdst, (node))]> {
+  let hasSideEffects = 1;
+  // FIXME: mayStore = ? is a workaround for tablegen bug for different
+  // inferred mayStore flags for the instruction pattern vs. standalone
+  // Pat. Each considers the other contradictory.
+  let mayStore = ?;
+  let mayLoad = ?;
+  let has_sbase = 0;
+  let has_offset = 0;
+}
+
+class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo<
+  opName, (outs), (ins), "", [(node)]> {
+  let hasSideEffects = 1;
+  let mayStore = 1;
+  let has_sdst = 0;
+  let has_sbase = 0;
+  let has_offset = 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Instructions
+//===----------------------------------------------------------------------===//
+
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+
+// XXX - SMEM instructions do not allow exec for data operand, but
+// does sdst for SMRD on SI/CI?
+defm S_LOAD_DWORD    : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_DWORDX2  : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_LOAD_DWORDX4  : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8  : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
+  "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
+// SI/CI, bit disallowed for SMEM on VI.
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <
+  "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <
+  "s_buffer_load_dwordx4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
+  "s_buffer_load_dwordx8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
+  "s_buffer_load_dwordx16", SReg_128, SReg_512
+>;
+
+defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
+defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
+  "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
+  "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC
+>;
+
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
+  "s_buffer_store_dwordx4", SReg_128, SReg_128
+>;
+
+
+def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
+def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
+
+let SubtargetPredicate = isCIVI in {
+def S_DCACHE_INV_VOL : SM_Inval_Pseudo <"s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
+} // let SubtargetPredicate = isCIVI
+
+let SubtargetPredicate = isVI in {
+def S_DCACHE_WB     : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
+def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+def S_MEMREALTIME   : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
+} // SubtargetPredicate = isVI
+
+
+
+//===----------------------------------------------------------------------===//
+// Scalar Memory Patterns
+//===----------------------------------------------------------------------===//
+
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+  auto Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= 4  &&
+    ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
+}]>;
+
+def SMRDImm         : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32       : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr        : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm   : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDBufferSgpr  : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+
+let Predicates = [isGCN] in {
+
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
+
+  // 1. IMM offset
+  def : Pat <
+    (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
+  >;
+
+  // 2. SGPR offset
+  def : Pat <
+    (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+  >;
+}
+
+let Predicates = [isSICI] in {
+def : Pat <
+  (i64 (readcyclecounter)),
+  (S_MEMTIME)
+>;
+}
+
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
+
+defm : SMRD_Pattern <"S_LOAD_DWORD",    i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2",  v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4",  v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8",  v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+
+// 1. Offset as an immediate
+def SM_LOAD_PATTERN : Pat <  // name this pattern to reuse AddedComplexity on CI
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+>;
+
+// 2. Offset loaded in an 32bit SGPR
+def : Pat <
+  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+>;
+
+} // End let AddedComplexity = 100
+
+} // let Predicates = [isGCN]
+
+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
+>;
+
+def : Pat <
+  (i64 (readcyclecounter)),
+  (S_MEMREALTIME)
+>;
+
+} // let Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Targets
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
+  : SM_Real<ps>
+  , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
+  , Enc32 {
+
+  let AssemblerPredicates = [isSICI];
+  let DecoderNamespace = "SICI";
+
+  let Inst{7-0}   = !if(ps.has_offset, offset{7-0}, ?);
+  let Inst{8}     = imm;
+  let Inst{14-9}  = !if(ps.has_sbase, sbase{6-1}, ?);
+  let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+}
+
+// FIXME: Assembler should reject trying to use glc on SMRD
+// instructions on SI.
+multiclass SM_Real_Loads_si<bits<5> op, string ps,
+                            SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+                            SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+
+  def _IMM_si : SMRD_Real_si <op, immPs> {
+    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, GLC:$glc);
+  }
+
+  // FIXME: The operand name $offset is inconsistent with $soff used
+  // in the pseudo
+  def _SGPR_si : SMRD_Real_si <op, sgprPs> {
+    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+  }
+
+}
+
+defm S_LOAD_DWORD           : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2         : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4         : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8         : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16        : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD    : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2  : SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4  : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8  : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+def S_MEMTIME_si    : SMRD_Real_si <0x1e, S_MEMTIME>;
+def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
+  : SM_Real<ps>
+  , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI>
+  , Enc64 {
+  bit glc;
+
+  let AssemblerPredicates = [isVI];
+  let DecoderNamespace = "VI";
+
+  let Inst{5-0}   = !if(ps.has_sbase, sbase{6-1}, ?);
+  let Inst{12-6}  = !if(ps.has_sdst, sdst{6-0}, ?);
+
+  let Inst{16} = !if(ps.has_glc, glc, ?);
+  let Inst{17} = imm;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x30; //encoding
+  let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?);
+}
+
+multiclass SM_Real_Loads_vi<bits<8> op, string ps,
+                            SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
+                            SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
+  def _IMM_vi : SMEM_Real_vi <op, immPs> {
+    let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+  }
+  def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
+    let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+  }
+}
+
+class SMEM_Real_Store_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
+  // encoding
+  bits<7> sdata;
+
+  let sdst = ?;
+  let Inst{12-6}  = !if(ps.has_sdst, sdata{6-0}, ?);
+}
+
+multiclass SM_Real_Stores_vi<bits<8> op, string ps,
+                            SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
+                            SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
+  // FIXME: The operand name $offset is inconsistent with $soff used
+  // in the pseudo
+  def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
+    let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc);
+  }
+
+  def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
+    let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc);
+  }
+}
+
+defm S_LOAD_DWORD           : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
+defm S_LOAD_DWORDX2         : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_DWORDX4         : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_DWORDX8         : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_DWORDX16        : SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD    : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_DWORDX2  : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_DWORDX4  : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_DWORDX8  : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+
+defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">;
+defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">;
+defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">;
+
+defm S_BUFFER_STORE_DWORD    : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">;
+defm S_BUFFER_STORE_DWORDX2  : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">;
+defm S_BUFFER_STORE_DWORDX4  : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">;
+
+// These instructions use same encoding
+def S_DCACHE_INV_vi         : SMEM_Real_vi <0x20, S_DCACHE_INV>;
+def S_DCACHE_WB_vi          : SMEM_Real_vi <0x21, S_DCACHE_WB>;
+def S_DCACHE_INV_VOL_vi     : SMEM_Real_vi <0x22, S_DCACHE_INV_VOL>;
+def S_DCACHE_WB_VOL_vi      : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
+def S_MEMTIME_vi            : SMEM_Real_vi <0x24, S_MEMTIME>;
+def S_MEMREALTIME_vi        : SMEM_Real_vi <0x25, S_MEMREALTIME>;
+
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset",
+                                          NamedMatchClass<"SMRDLiteralOffset">> {
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
+  SM_Real<ps>,
+  Enc64 {
+
+  let AssemblerPredicates = [isCIOnly];
+  let DecoderNamespace = "CI";
+  let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc);
+
+  let LGKM_CNT = ps.LGKM_CNT;
+  let SMRD = ps.SMRD;
+  let mayLoad = ps.mayLoad;
+  let mayStore = ps.mayStore;
+  let hasSideEffects = ps.hasSideEffects;
+  let SchedRW = ps.SchedRW;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+
+  let Inst{7-0}   = 0xff;
+  let Inst{8}     = 0;
+  let Inst{14-9}  = sbase{6-1};
+  let Inst{21-15} = sdst{6-0};
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+  let Inst{63-32} = offset{31-0};
+}
+
+def S_LOAD_DWORD_IMM_ci           : SMRD_Real_Load_IMM_ci <0x00, S_LOAD_DWORD_IMM>;
+def S_LOAD_DWORDX2_IMM_ci         : SMRD_Real_Load_IMM_ci <0x01, S_LOAD_DWORDX2_IMM>;
+def S_LOAD_DWORDX4_IMM_ci         : SMRD_Real_Load_IMM_ci <0x02, S_LOAD_DWORDX4_IMM>;
+def S_LOAD_DWORDX8_IMM_ci         : SMRD_Real_Load_IMM_ci <0x03, S_LOAD_DWORDX8_IMM>;
+def S_LOAD_DWORDX16_IMM_ci        : SMRD_Real_Load_IMM_ci <0x04, S_LOAD_DWORDX16_IMM>;
+def S_BUFFER_LOAD_DWORD_IMM_ci    : SMRD_Real_Load_IMM_ci <0x08, S_BUFFER_LOAD_DWORD_IMM>;
+def S_BUFFER_LOAD_DWORDX2_IMM_ci  : SMRD_Real_Load_IMM_ci <0x09, S_BUFFER_LOAD_DWORDX2_IMM>;
+def S_BUFFER_LOAD_DWORDX4_IMM_ci  : SMRD_Real_Load_IMM_ci <0x0a, S_BUFFER_LOAD_DWORDX4_IMM>;
+def S_BUFFER_LOAD_DWORDX8_IMM_ci  : SMRD_Real_Load_IMM_ci <0x0b, S_BUFFER_LOAD_DWORDX8_IMM>;
+def S_BUFFER_LOAD_DWORDX16_IMM_ci : SMRD_Real_Load_IMM_ci <0x0c, S_BUFFER_LOAD_DWORDX16_IMM>;
+
+class SMRD_Real_ci <bits<5> op, SM_Pseudo ps>
+  : SM_Real<ps>
+  , SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI>
+  , Enc32 {
+
+  let AssemblerPredicates = [isCIOnly];
+  let DecoderNamespace = "CI";
+
+  let Inst{7-0}   = !if(ps.has_offset, offset{7-0}, ?);
+  let Inst{8}     = imm;
+  let Inst{14-9}  = !if(ps.has_sbase, sbase{6-1}, ?);
+  let Inst{21-15} = !if(ps.has_sdst, sdst{6-0}, ?);
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+}
+
+def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
+
+let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+
+class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat <
+  (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+  (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
+  let Predicates = [isCIOnly];
+}
+
+def : SMRD_Pattern_ci <"S_LOAD_DWORD",    i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX2",  v2i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX4",  v4i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX8",  v8i32>;
+def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
+
+def : Pat <
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+  let Predicates = [isCI]; // should this be isCIOnly?
+}
+
+} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
new file mode 100644
index 000000000000..0aeb1297d3a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -0,0 +1,1229 @@
+//===-- SOPInstructions.td - SOP Instruction Defintions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def GPRIdxModeMatchClass : AsmOperandClass {
+  let Name = "GPRIdxMode";
+  let PredicateMethod = "isGPRIdxMode";
+  let RenderMethod = "addImmOperands";
+}
+
+def GPRIdxMode : Operand<i32> {
+  let PrintMethod = "printVGPRIndexMode";
+  let ParserMatchClass = GPRIdxModeMatchClass;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP1_Pseudo <string opName, dag outs, dag ins,
+                   string asmOps, list<dag> pattern=[]> :
+  InstSI <outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let SubtargetPredicate = isGCN;
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+  let SOP1 = 1;
+  let SchedRW = [WriteSALU];
+  let Size = 4;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  bits<1> has_src0 = 1;
+  bits<1> has_sdst = 1;
+}
+
+class SOP1_Real<bits<8> op, SOP1_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList,
+          ps.Mnemonic # " " # ps.AsmOperands, []>,
+  Enc32 {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+  let Size = 4;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+
+  // encoding
+  bits<7> sdst;
+  bits<8> src0;
+
+  let Inst{7-0} = !if(ps.has_src0, src0, ?);
+  let Inst{15-8} = op;
+  let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+  let Inst{31-23} = 0x17d; //encoding;
+}
+
+class SOP1_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0),
+  "$sdst, $src0", pattern
+>;
+
+// 32-bit input, no output.
+class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+  opName, (outs), (ins SSrc_b32:$src0),
+  "$src0", pattern> {
+  let has_sdst = 0;
+}
+
+class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
+  "$sdst, $src0", pattern
+>;
+
+// 64-bit input, 32-bit output.
+class SOP1_32_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs SReg_32:$sdst), (ins SSrc_b64:$src0),
+  "$sdst, $src0", pattern
+>;
+
+// 32-bit input, 64-bit output.
+class SOP1_64_32 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0),
+  "$sdst, $src0", pattern
+>;
+
+// no input, 64-bit output.
+class SOP1_64_0 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins), "$sdst", pattern> {
+  let has_src0 = 0;
+}
+
+// 64-bit input, no output
+class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+  let has_sdst = 0;
+}
+
+
+let isMoveImm = 1 in {
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+    def S_MOV_B32 : SOP1_32 <"s_mov_b32">;
+    def S_MOV_B64 : SOP1_64 <"s_mov_b64">;
+  } // End isRematerializeable = 1
+
+  let Uses = [SCC] in {
+    def S_CMOV_B32 : SOP1_32 <"s_cmov_b32">;
+    def S_CMOV_B64 : SOP1_64 <"s_cmov_b64">;
+  } // End Uses = [SCC]
+} // End isMoveImm = 1
+
+let Defs = [SCC] in {
+  def S_NOT_B32 : SOP1_32 <"s_not_b32",
+    [(set i32:$sdst, (not i32:$src0))]
+  >;
+
+  def S_NOT_B64 : SOP1_64 <"s_not_b64",
+    [(set i64:$sdst, (not i64:$src0))]
+  >;
+  def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
+  def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+} // End Defs = [SCC]
+
+
+def S_BREV_B32 : SOP1_32 <"s_brev_b32",
+  [(set i32:$sdst, (bitreverse i32:$src0))]
+>;
+def S_BREV_B64 : SOP1_64 <"s_brev_b64">;
+
+let Defs = [SCC] in {
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
+  [(set i32:$sdst, (ctpop i32:$src0))]
+>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+} // End Defs = [SCC]
+
+def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
+def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
+def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
+  [(set i32:$sdst, (cttz_zero_undef i32:$src0))]
+>;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+
+def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
+  [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
+>;
+
+def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">;
+def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
+  [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
+>;
+def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
+def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
+  [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
+>;
+def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
+  [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
+>;
+
+def S_BITSET0_B32 : SOP1_32    <"s_bitset0_b32">;
+def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
+def S_BITSET1_B32 : SOP1_32    <"s_bitset1_b32">;
+def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
+def S_GETPC_B64 : SOP1_64_0  <"s_getpc_b64">;
+
+let isTerminator = 1, isBarrier = 1,
+    isBranch = 1, isIndirectBranch = 1 in {
+def S_SETPC_B64 : SOP1_1  <"s_setpc_b64">;
+}
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
+def S_RFE_B64 : SOP1_1  <"s_rfe_b64">;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
+
+def S_AND_SAVEEXEC_B64 : SOP1_64 <"s_and_saveexec_b64">;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <"s_or_saveexec_b64">;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <"s_xor_saveexec_b64">;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <"s_andn2_saveexec_b64">;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <"s_orn2_saveexec_b64">;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <"s_nand_saveexec_b64">;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <"s_nor_saveexec_b64">;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
+
+} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
+
+def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">;
+def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
+
+let Uses = [M0] in {
+def S_MOVRELS_B32 : SOP1_32 <"s_movrels_b32">;
+def S_MOVRELS_B64 : SOP1_64 <"s_movrels_b64">;
+def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
+def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
+} // End Uses = [M0]
+
+def S_CBRANCH_JOIN : SOP1_1  <"s_cbranch_join">;
+def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
+let Defs = [SCC] in {
+def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+} // End Defs = [SCC]
+def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
+  let Uses = [M0];
+  let Defs = [M0];
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+class SOP2_Pseudo<string opName, dag outs, dag ins,
+                  string asmOps, list<dag> pattern=[]> :
+  InstSI<outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let SubtargetPredicate = isGCN;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+  let SOP2 = 1;
+  let SchedRW = [WriteSALU];
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  bits<1> has_sdst = 1;
+
+  // Pseudo instructions have no encodings, but adding this field here allows
+  // us to do:
+  // let sdst = xxx in {
+  // for multiclasses that include both real and pseudo instructions.
+  // field bits<7> sdst = 0;
+  // let Size = 4; // Do we need size here?
+}
+
+class SOP2_Real<bits<7> op, SOP2_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList,
+          ps.Mnemonic # " " # ps.AsmOperands, []>,
+  Enc32 {
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+
+  // encoding
+  bits<7> sdst;
+  bits<8> src0;
+  bits<8> src1;
+
+  let Inst{7-0}   = src0;
+  let Inst{15-8}  = src1;
+  let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+  let Inst{29-23} = op;
+  let Inst{31-30} = 0x2; // encoding
+}
+
+
+class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+  opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+  "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+  "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b32:$src1),
+  "$sdst, $src0, $src1", pattern
+>;
+
+class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+  opName, (outs SReg_64:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
+  "$sdst, $src0, $src1", pattern
+>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <"s_add_u32">;
+def S_ADD_I32 : SOP2_32 <"s_add_i32",
+  [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
+def S_SUB_I32 : SOP2_32 <"s_sub_i32",
+  [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <"s_addc_u32",
+  [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
+  [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>;
+} // End Uses = [SCC]
+
+
+let isCommutable = 1 in {
+def S_MIN_I32 : SOP2_32 <"s_min_i32",
+  [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <"s_min_u32",
+  [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <"s_max_i32",
+  [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <"s_max_u32",
+  [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
+>;
+} // End isCommutable = 1
+} // End Defs = [SCC]
+
+
+let Uses = [SCC] in {
+  def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+  def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
+} // End Uses = [SCC]
+
+let Defs = [SCC] in {
+let isCommutable = 1 in {
+def S_AND_B32 : SOP2_32 <"s_and_b32",
+  [(set i32:$sdst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <"s_and_b64",
+  [(set i64:$sdst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <"s_or_b32",
+  [(set i32:$sdst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <"s_or_b64",
+  [(set i64:$sdst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <"s_xor_b32",
+  [(set i32:$sdst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <"s_xor_b64",
+  [(set i64:$sdst, (xor i64:$src0, i64:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
+def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
+def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
+def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
+def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">;
+def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">;
+} // End Defs = [SCC]
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+let Defs = [SCC] in {
+def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
+  [(set i32:$sdst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
+  [(set i64:$sdst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
+  [(set i32:$sdst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
+  [(set i64:$sdst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
+  [(set i32:$sdst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
+  [(set i64:$sdst, (sra i64:$src0, i32:$src1))]
+>;
+} // End Defs = [SCC]
+
+def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
+  [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
+def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
+def S_MUL_I32 : SOP2_32 <"s_mul_i32",
+  [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
+  let isCommutable = 1;
+}
+
+} // End AddedComplexity = 1
+
+let Defs = [SCC] in {
+def S_BFE_U32 : SOP2_32 <"s_bfe_u32">;
+def S_BFE_I32 : SOP2_32 <"s_bfe_i32">;
+def S_BFE_U64 : SOP2_64_32 <"s_bfe_u64">;
+def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">;
+} // End Defs = [SCC]
+
+def S_CBRANCH_G_FORK : SOP2_Pseudo <
+  "s_cbranch_g_fork", (outs),
+  (ins SReg_64:$src0, SReg_64:$src1),
+  "$src0, $src1"
+> {
+  let has_sdst = 0;
+}
+
+let Defs = [SCC] in {
+def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
+} // End Defs = [SCC]
+
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPK_Pseudo <string opName, dag outs, dag ins,
+                   string asmOps, list<dag> pattern=[]> :
+  InstSI <outs, ins, "", pattern>,
+  SIMCInstr<opName, SIEncodingFamily.NONE> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let SubtargetPredicate = isGCN;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+  let SOPK = 1;
+  let SchedRW = [WriteSALU];
+  let UseNamedOperandTable = 1;
+  string Mnemonic = opName;
+  string AsmOperands = asmOps;
+
+  bits<1> has_sdst = 1;
+}
+
+class SOPK_Real<bits<5> op, SOPK_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList,
+          ps.Mnemonic # " " # ps.AsmOperands, []> {
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let DisableEncoding    = ps.DisableEncoding;
+  let Constraints        = ps.Constraints;
+
+  // encoding
+  bits<7>  sdst;
+  bits<16> simm16;
+  bits<32> imm;
+}
+
+class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> :
+  SOPK_Real <op, ps>,
+  Enc32 {
+  let Inst{15-0}  = simm16;
+  let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+}
+
+class SOPK_Real64<bits<5> op, SOPK_Pseudo ps> :
+  SOPK_Real<op, ps>,
+  Enc64 {
+  let Inst{15-0}  = simm16;
+  let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+  let Inst{63-32} = imm;
+}
+
+class SOPKInstTable <bit is_sopk, string cmpOp = ""> {
+  bit IsSOPK = is_sopk;
+  string BaseCmpOp = cmpOp;
+}
+
+class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+  opName,
+  (outs SReg_32:$sdst),
+  (ins u16imm:$simm16),
+  "$sdst, $simm16",
+  pattern>;
+
+class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
+  opName,
+  (outs),
+  (ins SReg_32:$sdst, u16imm:$simm16),
+  "$sdst, $simm16", []>,
+  SOPKInstTable<1, base_op>{
+  let Defs = [SCC];
+}
+
+class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
+  opName,
+  (outs SReg_32:$sdst),
+  (ins SReg_32:$src0, u16imm:$simm16),
+  "$sdst, $simm16",
+  pattern
+>;
+
+let isReMaterializable = 1, isMoveImm = 1 in {
+def S_MOVK_I32 : SOPK_32 <"s_movk_i32">;
+} // End isReMaterializable = 1
+let Uses = [SCC] in {
+def S_CMOVK_I32 : SOPK_32 <"s_cmovk_i32">;
+}
+
+let isCompare = 1 in {
+
+// This instruction is disabled for now until we can figure out how to teach
+// the instruction selector to correctly use the  S_CMP* vs V_CMP*
+// instructions.
+//
+// When this instruction is enabled the code generator sometimes produces this
+// invalid sequence:
+//
+// SCC = S_CMPK_EQ_I32 SGPR0, imm
+// VCC = COPY SCC
+// VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
+//
+// def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32",
+//   [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
+// >;
+
+def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">;
+def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">;
+def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">;
+def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">;
+def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">;
+def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">;
+
+let SOPKZext = 1 in {
+def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">;
+def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">;
+def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">;
+def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">;
+def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">;
+def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">;
+} // End SOPKZext = 1
+} // End isCompare = 1
+
+let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
+    Constraints = "$sdst = $src0" in {
+  def S_ADDK_I32 : SOPK_32TIE <"s_addk_i32">;
+  def S_MULK_I32 : SOPK_32TIE <"s_mulk_i32">;
+}
+
+def S_CBRANCH_I_FORK : SOPK_Pseudo <
+  "s_cbranch_i_fork",
+  (outs), (ins SReg_64:$sdst, u16imm:$simm16),
+  "$sdst, $simm16"
+>;
+
+let mayLoad = 1 in {
+def S_GETREG_B32 : SOPK_Pseudo <
+  "s_getreg_b32",
+  (outs SReg_32:$sdst), (ins hwreg:$simm16),
+  "$sdst, $simm16"
+>;
+}
+
+let hasSideEffects = 1 in {
+
+def S_SETREG_B32 : SOPK_Pseudo <
+  "s_setreg_b32",
+  (outs), (ins SReg_32:$sdst, hwreg:$simm16),
+  "$simm16, $sdst",
+  [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
+>;
+
+// FIXME: Not on SI?
+//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
+
+def S_SETREG_IMM32_B32 : SOPK_Pseudo <
+  "s_setreg_imm32_b32",
+  (outs), (ins i32imm:$imm, hwreg:$simm16),
+  "$simm16, $imm"> {
+  let Size = 8; // Unlike every other SOPK instruction.
+  let has_sdst = 0;
+}
+
+} // End hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPCe <bits<7> op> : Enc32 {
+  bits<8> src0;
+  bits<8> src1;
+
+  let Inst{7-0} = src0;
+  let Inst{15-8} = src1;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17e;
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm,
+            list<dag> pattern = []> :
+  InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+  let SOPC = 1;
+  let isCodeGenOnly = 0;
+  let Defs = [SCC];
+  let SchedRW = [WriteSALU];
+  let UseNamedOperandTable = 1;
+  let SubtargetPredicate = isGCN;
+}
+
+class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
+                 string opName, list<dag> pattern = []> : SOPC <
+  op, (outs), (ins rc0:$src0, rc1:$src1),
+  opName#" $src0, $src1", pattern > {
+  let Defs = [SCC];
+}
+class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
+                    string opName, PatLeaf cond> : SOPC_Base <
+  op, rc, rc, opName,
+  [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
+}
+
+class SOPC_CMP_32<bits<7> op, string opName,
+                  PatLeaf cond = COND_NULL, string revOp = opName>
+  : SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
+    Commutable_REV<revOp, !eq(revOp, opName)>,
+    SOPKInstTable<0, opName> {
+  let isCompare = 1;
+  let isCommutable = 1;
+}
+
+class SOPC_CMP_64<bits<7> op, string opName,
+                  PatLeaf cond = COND_NULL, string revOp = opName>
+  : SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
+    Commutable_REV<revOp, !eq(revOp, opName)> {
+  let isCompare = 1;
+  let isCommutable = 1;
+}
+
+class SOPC_32<bits<7> op, string opName, list<dag> pattern = []>
+  : SOPC_Base<op, SSrc_b32, SSrc_b32, opName, pattern>;
+
+class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []>
+  : SOPC_Base<op, SSrc_b64, SSrc_b32, opName, pattern>;
+
+def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00, "s_cmp_eq_i32">;
+def S_CMP_LG_I32 : SOPC_CMP_32 <0x01, "s_cmp_lg_i32">;
+def S_CMP_GT_I32 : SOPC_CMP_32 <0x02, "s_cmp_gt_i32", COND_SGT>;
+def S_CMP_GE_I32 : SOPC_CMP_32 <0x03, "s_cmp_ge_i32", COND_SGE>;
+def S_CMP_LT_I32 : SOPC_CMP_32 <0x04, "s_cmp_lt_i32", COND_SLT, "s_cmp_gt_i32">;
+def S_CMP_LE_I32 : SOPC_CMP_32 <0x05, "s_cmp_le_i32", COND_SLE, "s_cmp_ge_i32">;
+def S_CMP_EQ_U32 : SOPC_CMP_32 <0x06, "s_cmp_eq_u32", COND_EQ>;
+def S_CMP_LG_U32 : SOPC_CMP_32 <0x07, "s_cmp_lg_u32", COND_NE>;
+def S_CMP_GT_U32 : SOPC_CMP_32 <0x08, "s_cmp_gt_u32", COND_UGT>;
+def S_CMP_GE_U32 : SOPC_CMP_32 <0x09, "s_cmp_ge_u32", COND_UGE>;
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0a, "s_cmp_lt_u32", COND_ULT, "s_cmp_gt_u32">;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0b, "s_cmp_le_u32", COND_ULE, "s_cmp_ge_u32">;
+
+def S_BITCMP0_B32 : SOPC_32 <0x0c, "s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <0x0d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x10, "s_setvskip">;
+
+let SubtargetPredicate = isVI in {
+def S_CMP_EQ_U64 : SOPC_CMP_64 <0x12, "s_cmp_eq_u64", COND_EQ>;
+def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_ON : SOPC <0x11,
+  (outs),
+  (ins SSrc_b32:$src0, GPRIdxMode:$src1),
+  "s_set_gpr_idx_on $src0,$src1"> {
+  let Defs = [M0]; // No scc def
+  let Uses = [M0]; // Other bits of m0 unmodified.
+  let hasSideEffects = 1; // Sets mode.gpr_idx_en
+  let FixedSize = 1;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+class SOPPe <bits<7> op> : Enc32 {
+  bits <16> simm16;
+
+  let Inst{15-0} = simm16;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17f; // encoding
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
+  InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SALU = 1;
+  let SOPP = 1;
+  let Size = 4;
+  let SchedRW = [WriteSALU];
+
+  let UseNamedOperandTable = 1;
+  let SubtargetPredicate = isGCN;
+}
+
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
+  [(AMDGPUendpgm)]> {
+  let simm16 = 0;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
+
+let isBranch = 1, SchedRW = [WriteBranch] in {
+def S_BRANCH : SOPP <
+  0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
+  [(br bb:$simm16)]> {
+  let isBarrier = 1;
+}
+
+let Uses = [SCC] in {
+def S_CBRANCH_SCC0 : SOPP <
+  0x00000004, (ins sopp_brtarget:$simm16),
+  "s_cbranch_scc0 $simm16"
+>;
+def S_CBRANCH_SCC1 : SOPP <
+  0x00000005, (ins sopp_brtarget:$simm16),
+  "s_cbranch_scc1 $simm16",
+  [(si_uniform_br_scc SCC, bb:$simm16)]
+>;
+} // End Uses = [SCC]
+
+let Uses = [VCC] in {
+def S_CBRANCH_VCCZ : SOPP <
+  0x00000006, (ins sopp_brtarget:$simm16),
+  "s_cbranch_vccz $simm16"
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+  0x00000007, (ins sopp_brtarget:$simm16),
+  "s_cbranch_vccnz $simm16"
+>;
+} // End Uses = [VCC]
+
+let Uses = [EXEC] in {
+def S_CBRANCH_EXECZ : SOPP <
+  0x00000008, (ins sopp_brtarget:$simm16),
+  "s_cbranch_execz $simm16"
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+  0x00000009, (ins sopp_brtarget:$simm16),
+  "s_cbranch_execnz $simm16"
+>;
+} // End Uses = [EXEC]
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in {
+def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
+  [(int_amdgcn_s_barrier)]> {
+  let SchedRW = [WriteBarrier];
+  let simm16 = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let isConvergent = 1;
+}
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+
+// On SI the documentation says sleep for approximately 64 * low 2
+// bits, consistent with the reported maximum of 448. On VI the
+// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
+// maximum really 15 on VI?
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+  "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
+
+let Uses = [EXEC, M0] in {
+// FIXME: Should this be mayLoad+mayStore?
+def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
+  [(AMDGPUsendmsg (i32 imm:$simm16))]
+>;
+} // End Uses = [EXEC, M0]
+
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+  let simm16 = 0;
+}
+def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
+  [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
+  [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+  let simm16 = 0;
+}
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
+  let simm16 = 0;
+}
+}
+} // End hasSideEffects
+
+let SubtargetPredicate = HasVGPRIndexMode in {
+def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
+  "s_set_gpr_idx_mode$simm16"> {
+  let Defs = [M0];
+}
+}
+
+let Predicates = [isGCN] in {
+
+//===----------------------------------------------------------------------===//
+// S_GETREG_B32 Intrinsic Pattern.
+//===----------------------------------------------------------------------===//
+def : Pat <
+  (int_amdgcn_s_getreg imm:$simm16),
+  (S_GETREG_B32 (as_i16imm $simm16))
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i64 (ctpop i64:$src)),
+    (i64 (REG_SEQUENCE SReg_64,
+     (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
+     (S_MOV_B32 (i32 0)), sub1))
+>;
+
+def : Pat <
+  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+  (S_ABS_I32 $x)
+>;
+
+def : Pat <
+  (i16 imm:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+// Same as a 32-bit inreg
+def : Pat<
+  (i32 (sext i16:$src)),
+  (S_SEXT_I32_I16 $src)
+>;
+
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+  (i32 (addc i32:$src0, i32:$src1)),
+  (S_ADD_U32 $src0, $src1)
+>;
+
+// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple
+// outputs.
+def : Pat<
+  (i64 (zext i16:$src)),
+    (REG_SEQUENCE SReg_64,
+      (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
+      (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i16:$src)),
+    (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
+    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
+>;
+
+def : Pat<
+  (i32 (zext i16:$src)),
+  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+
+
+//===----------------------------------------------------------------------===//
+// SOPP Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (int_amdgcn_s_waitcnt i32:$simm16),
+  (S_WAITCNT (as_i16imm $simm16))
+>;
+
+} // End isGCN predicate
+
+
+//===----------------------------------------------------------------------===//
+// Real target instructions, move this to the appropriate subtarget TD file
+//===----------------------------------------------------------------------===//
+
+class Select_si<string opName> :
+  SIMCInstr<opName, SIEncodingFamily.SI> {
+  list<Predicate> AssemblerPredicates = [isSICI];
+  string DecoderNamespace = "SICI";
+}
+
+class SOP1_Real_si<bits<8> op, SOP1_Pseudo ps> :
+  SOP1_Real<op, ps>,
+  Select_si<ps.Mnemonic>;
+
+class SOP2_Real_si<bits<7> op, SOP2_Pseudo ps> :
+  SOP2_Real<op, ps>,
+  Select_si<ps.Mnemonic>;
+
+class SOPK_Real_si<bits<5> op, SOPK_Pseudo ps> :
+  SOPK_Real32<op, ps>,
+  Select_si<ps.Mnemonic>;
+
+def S_MOV_B32_si           : SOP1_Real_si <0x03, S_MOV_B32>;
+def S_MOV_B64_si           : SOP1_Real_si <0x04, S_MOV_B64>;
+def S_CMOV_B32_si          : SOP1_Real_si <0x05, S_CMOV_B32>;
+def S_CMOV_B64_si          : SOP1_Real_si <0x06, S_CMOV_B64>;
+def S_NOT_B32_si           : SOP1_Real_si <0x07, S_NOT_B32>;
+def S_NOT_B64_si           : SOP1_Real_si <0x08, S_NOT_B64>;
+def S_WQM_B32_si           : SOP1_Real_si <0x09, S_WQM_B32>;
+def S_WQM_B64_si           : SOP1_Real_si <0x0a, S_WQM_B64>;
+def S_BREV_B32_si          : SOP1_Real_si <0x0b, S_BREV_B32>;
+def S_BREV_B64_si          : SOP1_Real_si <0x0c, S_BREV_B64>;
+def S_BCNT0_I32_B32_si     : SOP1_Real_si <0x0d, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_si     : SOP1_Real_si <0x0e, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_si     : SOP1_Real_si <0x0f, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_si     : SOP1_Real_si <0x10, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_si       : SOP1_Real_si <0x11, S_FF0_I32_B32>;
+def S_FF0_I32_B64_si       : SOP1_Real_si <0x12, S_FF0_I32_B64>;
+def S_FF1_I32_B32_si       : SOP1_Real_si <0x13, S_FF1_I32_B32>;
+def S_FF1_I32_B64_si       : SOP1_Real_si <0x14, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_si     : SOP1_Real_si <0x15, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_si     : SOP1_Real_si <0x16, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_si         : SOP1_Real_si <0x17, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_si     : SOP1_Real_si <0x18, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_si       : SOP1_Real_si <0x19, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_si      : SOP1_Real_si <0x1a, S_SEXT_I32_I16>;
+def S_BITSET0_B32_si       : SOP1_Real_si <0x1b, S_BITSET0_B32>;
+def S_BITSET0_B64_si       : SOP1_Real_si <0x1c, S_BITSET0_B64>;
+def S_BITSET1_B32_si       : SOP1_Real_si <0x1d, S_BITSET1_B32>;
+def S_BITSET1_B64_si       : SOP1_Real_si <0x1e, S_BITSET1_B64>;
+def S_GETPC_B64_si         : SOP1_Real_si <0x1f, S_GETPC_B64>;
+def S_SETPC_B64_si         : SOP1_Real_si <0x20, S_SETPC_B64>;
+def S_SWAPPC_B64_si        : SOP1_Real_si <0x21, S_SWAPPC_B64>;
+def S_RFE_B64_si           : SOP1_Real_si <0x22, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_si  : SOP1_Real_si <0x24, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_si   : SOP1_Real_si <0x25, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_si  : SOP1_Real_si <0x26, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_si: SOP1_Real_si <0x27, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_si : SOP1_Real_si <0x28, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_si : SOP1_Real_si <0x29, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_si  : SOP1_Real_si <0x2a, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_si : SOP1_Real_si <0x2b, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_si      : SOP1_Real_si <0x2c, S_QUADMASK_B32>;
+def S_QUADMASK_B64_si      : SOP1_Real_si <0x2d, S_QUADMASK_B64>;
+def S_MOVRELS_B32_si       : SOP1_Real_si <0x2e, S_MOVRELS_B32>;
+def S_MOVRELS_B64_si       : SOP1_Real_si <0x2f, S_MOVRELS_B64>;
+def S_MOVRELD_B32_si       : SOP1_Real_si <0x30, S_MOVRELD_B32>;
+def S_MOVRELD_B64_si       : SOP1_Real_si <0x31, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_si      : SOP1_Real_si <0x32, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_si     : SOP1_Real_si <0x33, S_MOV_REGRD_B32>;
+def S_ABS_I32_si           : SOP1_Real_si <0x34, S_ABS_I32>;
+def S_MOV_FED_B32_si       : SOP1_Real_si <0x35, S_MOV_FED_B32>;
+
+def S_ADD_U32_si           : SOP2_Real_si <0x00, S_ADD_U32>;
+def S_ADD_I32_si           : SOP2_Real_si <0x02, S_ADD_I32>;
+def S_SUB_U32_si           : SOP2_Real_si <0x01, S_SUB_U32>;
+def S_SUB_I32_si           : SOP2_Real_si <0x03, S_SUB_I32>;
+def S_ADDC_U32_si          : SOP2_Real_si <0x04, S_ADDC_U32>;
+def S_SUBB_U32_si          : SOP2_Real_si <0x05, S_SUBB_U32>;
+def S_MIN_I32_si           : SOP2_Real_si <0x06, S_MIN_I32>;
+def S_MIN_U32_si           : SOP2_Real_si <0x07, S_MIN_U32>;
+def S_MAX_I32_si           : SOP2_Real_si <0x08, S_MAX_I32>;
+def S_MAX_U32_si           : SOP2_Real_si <0x09, S_MAX_U32>;
+def S_CSELECT_B32_si       : SOP2_Real_si <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_si       : SOP2_Real_si <0x0b, S_CSELECT_B64>;
+def S_AND_B32_si           : SOP2_Real_si <0x0e, S_AND_B32>;
+def S_AND_B64_si           : SOP2_Real_si <0x0f, S_AND_B64>;
+def S_OR_B32_si            : SOP2_Real_si <0x10, S_OR_B32>;
+def S_OR_B64_si            : SOP2_Real_si <0x11, S_OR_B64>;
+def S_XOR_B32_si           : SOP2_Real_si <0x12, S_XOR_B32>;
+def S_XOR_B64_si           : SOP2_Real_si <0x13, S_XOR_B64>;
+def S_ANDN2_B32_si         : SOP2_Real_si <0x14, S_ANDN2_B32>;
+def S_ANDN2_B64_si         : SOP2_Real_si <0x15, S_ANDN2_B64>;
+def S_ORN2_B32_si          : SOP2_Real_si <0x16, S_ORN2_B32>;
+def S_ORN2_B64_si          : SOP2_Real_si <0x17, S_ORN2_B64>;
+def S_NAND_B32_si          : SOP2_Real_si <0x18, S_NAND_B32>;
+def S_NAND_B64_si          : SOP2_Real_si <0x19, S_NAND_B64>;
+def S_NOR_B32_si           : SOP2_Real_si <0x1a, S_NOR_B32>;
+def S_NOR_B64_si           : SOP2_Real_si <0x1b, S_NOR_B64>;
+def S_XNOR_B32_si          : SOP2_Real_si <0x1c, S_XNOR_B32>;
+def S_XNOR_B64_si          : SOP2_Real_si <0x1d, S_XNOR_B64>;
+def S_LSHL_B32_si          : SOP2_Real_si <0x1e, S_LSHL_B32>;
+def S_LSHL_B64_si          : SOP2_Real_si <0x1f, S_LSHL_B64>;
+def S_LSHR_B32_si          : SOP2_Real_si <0x20, S_LSHR_B32>;
+def S_LSHR_B64_si          : SOP2_Real_si <0x21, S_LSHR_B64>;
+def S_ASHR_I32_si          : SOP2_Real_si <0x22, S_ASHR_I32>;
+def S_ASHR_I64_si          : SOP2_Real_si <0x23, S_ASHR_I64>;
+def S_BFM_B32_si           : SOP2_Real_si <0x24, S_BFM_B32>;
+def S_BFM_B64_si           : SOP2_Real_si <0x25, S_BFM_B64>;
+def S_MUL_I32_si           : SOP2_Real_si <0x26, S_MUL_I32>;
+def S_BFE_U32_si           : SOP2_Real_si <0x27, S_BFE_U32>;
+def S_BFE_I32_si           : SOP2_Real_si <0x28, S_BFE_I32>;
+def S_BFE_U64_si           : SOP2_Real_si <0x29, S_BFE_U64>;
+def S_BFE_I64_si           : SOP2_Real_si <0x2a, S_BFE_I64>;
+def S_CBRANCH_G_FORK_si    : SOP2_Real_si <0x2b, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_si       : SOP2_Real_si <0x2c, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_si          : SOPK_Real_si <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_si         : SOPK_Real_si <0x02, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_si       : SOPK_Real_si <0x03, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_si       : SOPK_Real_si <0x04, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_si       : SOPK_Real_si <0x05, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_si       : SOPK_Real_si <0x06, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_si       : SOPK_Real_si <0x07, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_si       : SOPK_Real_si <0x08, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_si       : SOPK_Real_si <0x09, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_si       : SOPK_Real_si <0x0a, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_si       : SOPK_Real_si <0x0b, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_si       : SOPK_Real_si <0x0c, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_si       : SOPK_Real_si <0x0d, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_si       : SOPK_Real_si <0x0e, S_CMPK_LE_U32>;
+def S_ADDK_I32_si          : SOPK_Real_si <0x0f, S_ADDK_I32>;
+def S_MULK_I32_si          : SOPK_Real_si <0x10, S_MULK_I32>;
+def S_CBRANCH_I_FORK_si    : SOPK_Real_si <0x11, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_si        : SOPK_Real_si <0x12, S_GETREG_B32>;
+def S_SETREG_B32_si        : SOPK_Real_si <0x13, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_si  : SOPK_Real_si <0x14, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_si  : SOPK_Real64<0x15, S_SETREG_IMM32_B32>,
+                             Select_si<S_SETREG_IMM32_B32.Mnemonic>;
+
+
+class Select_vi<string opName> :
+  SIMCInstr<opName, SIEncodingFamily.VI> {
+  list<Predicate> AssemblerPredicates = [isVI];
+  string DecoderNamespace = "VI";
+}
+
+class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
+  SOP1_Real<op, ps>,
+  Select_vi<ps.Mnemonic>;
+
+
+class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
+  SOP2_Real<op, ps>,
+  Select_vi<ps.Mnemonic>;
+
+class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
+  SOPK_Real32<op, ps>,
+  Select_vi<ps.Mnemonic>;
+
+def S_MOV_B32_vi           : SOP1_Real_vi <0x00, S_MOV_B32>;
+def S_MOV_B64_vi           : SOP1_Real_vi <0x01, S_MOV_B64>;
+def S_CMOV_B32_vi          : SOP1_Real_vi <0x02, S_CMOV_B32>;
+def S_CMOV_B64_vi          : SOP1_Real_vi <0x03, S_CMOV_B64>;
+def S_NOT_B32_vi           : SOP1_Real_vi <0x04, S_NOT_B32>;
+def S_NOT_B64_vi           : SOP1_Real_vi <0x05, S_NOT_B64>;
+def S_WQM_B32_vi           : SOP1_Real_vi <0x06, S_WQM_B32>;
+def S_WQM_B64_vi           : SOP1_Real_vi <0x07, S_WQM_B64>;
+def S_BREV_B32_vi          : SOP1_Real_vi <0x08, S_BREV_B32>;
+def S_BREV_B64_vi          : SOP1_Real_vi <0x09, S_BREV_B64>;
+def S_BCNT0_I32_B32_vi     : SOP1_Real_vi <0x0a, S_BCNT0_I32_B32>;
+def S_BCNT0_I32_B64_vi     : SOP1_Real_vi <0x0b, S_BCNT0_I32_B64>;
+def S_BCNT1_I32_B32_vi     : SOP1_Real_vi <0x0c, S_BCNT1_I32_B32>;
+def S_BCNT1_I32_B64_vi     : SOP1_Real_vi <0x0d, S_BCNT1_I32_B64>;
+def S_FF0_I32_B32_vi       : SOP1_Real_vi <0x0e, S_FF0_I32_B32>;
+def S_FF0_I32_B64_vi       : SOP1_Real_vi <0x0f, S_FF0_I32_B64>;
+def S_FF1_I32_B32_vi       : SOP1_Real_vi <0x10, S_FF1_I32_B32>;
+def S_FF1_I32_B64_vi       : SOP1_Real_vi <0x11, S_FF1_I32_B64>;
+def S_FLBIT_I32_B32_vi     : SOP1_Real_vi <0x12, S_FLBIT_I32_B32>;
+def S_FLBIT_I32_B64_vi     : SOP1_Real_vi <0x13, S_FLBIT_I32_B64>;
+def S_FLBIT_I32_vi         : SOP1_Real_vi <0x14, S_FLBIT_I32>;
+def S_FLBIT_I32_I64_vi     : SOP1_Real_vi <0x15, S_FLBIT_I32_I64>;
+def S_SEXT_I32_I8_vi       : SOP1_Real_vi <0x16, S_SEXT_I32_I8>;
+def S_SEXT_I32_I16_vi      : SOP1_Real_vi <0x17, S_SEXT_I32_I16>;
+def S_BITSET0_B32_vi       : SOP1_Real_vi <0x18, S_BITSET0_B32>;
+def S_BITSET0_B64_vi       : SOP1_Real_vi <0x19, S_BITSET0_B64>;
+def S_BITSET1_B32_vi       : SOP1_Real_vi <0x1a, S_BITSET1_B32>;
+def S_BITSET1_B64_vi       : SOP1_Real_vi <0x1b, S_BITSET1_B64>;
+def S_GETPC_B64_vi         : SOP1_Real_vi <0x1c, S_GETPC_B64>;
+def S_SETPC_B64_vi         : SOP1_Real_vi <0x1d, S_SETPC_B64>;
+def S_SWAPPC_B64_vi        : SOP1_Real_vi <0x1e, S_SWAPPC_B64>;
+def S_RFE_B64_vi           : SOP1_Real_vi <0x1f, S_RFE_B64>;
+def S_AND_SAVEEXEC_B64_vi  : SOP1_Real_vi <0x20, S_AND_SAVEEXEC_B64>;
+def S_OR_SAVEEXEC_B64_vi   : SOP1_Real_vi <0x21, S_OR_SAVEEXEC_B64>;
+def S_XOR_SAVEEXEC_B64_vi  : SOP1_Real_vi <0x22, S_XOR_SAVEEXEC_B64>;
+def S_ANDN2_SAVEEXEC_B64_vi: SOP1_Real_vi <0x23, S_ANDN2_SAVEEXEC_B64>;
+def S_ORN2_SAVEEXEC_B64_vi : SOP1_Real_vi <0x24, S_ORN2_SAVEEXEC_B64>;
+def S_NAND_SAVEEXEC_B64_vi : SOP1_Real_vi <0x25, S_NAND_SAVEEXEC_B64>;
+def S_NOR_SAVEEXEC_B64_vi  : SOP1_Real_vi <0x26, S_NOR_SAVEEXEC_B64>;
+def S_XNOR_SAVEEXEC_B64_vi : SOP1_Real_vi <0x27, S_XNOR_SAVEEXEC_B64>;
+def S_QUADMASK_B32_vi      : SOP1_Real_vi <0x28, S_QUADMASK_B32>;
+def S_QUADMASK_B64_vi      : SOP1_Real_vi <0x29, S_QUADMASK_B64>;
+def S_MOVRELS_B32_vi       : SOP1_Real_vi <0x2a, S_MOVRELS_B32>;
+def S_MOVRELS_B64_vi       : SOP1_Real_vi <0x2b, S_MOVRELS_B64>;
+def S_MOVRELD_B32_vi       : SOP1_Real_vi <0x2c, S_MOVRELD_B32>;
+def S_MOVRELD_B64_vi       : SOP1_Real_vi <0x2d, S_MOVRELD_B64>;
+def S_CBRANCH_JOIN_vi      : SOP1_Real_vi <0x2e, S_CBRANCH_JOIN>;
+def S_MOV_REGRD_B32_vi     : SOP1_Real_vi <0x2f, S_MOV_REGRD_B32>;
+def S_ABS_I32_vi           : SOP1_Real_vi <0x30, S_ABS_I32>;
+def S_MOV_FED_B32_vi       : SOP1_Real_vi <0x31, S_MOV_FED_B32>;
+def S_SET_GPR_IDX_IDX_vi   : SOP1_Real_vi <0x32, S_SET_GPR_IDX_IDX>;
+
+def S_ADD_U32_vi           : SOP2_Real_vi <0x00, S_ADD_U32>;
+def S_ADD_I32_vi           : SOP2_Real_vi <0x02, S_ADD_I32>;
+def S_SUB_U32_vi           : SOP2_Real_vi <0x01, S_SUB_U32>;
+def S_SUB_I32_vi           : SOP2_Real_vi <0x03, S_SUB_I32>;
+def S_ADDC_U32_vi          : SOP2_Real_vi <0x04, S_ADDC_U32>;
+def S_SUBB_U32_vi          : SOP2_Real_vi <0x05, S_SUBB_U32>;
+def S_MIN_I32_vi           : SOP2_Real_vi <0x06, S_MIN_I32>;
+def S_MIN_U32_vi           : SOP2_Real_vi <0x07, S_MIN_U32>;
+def S_MAX_I32_vi           : SOP2_Real_vi <0x08, S_MAX_I32>;
+def S_MAX_U32_vi           : SOP2_Real_vi <0x09, S_MAX_U32>;
+def S_CSELECT_B32_vi       : SOP2_Real_vi <0x0a, S_CSELECT_B32>;
+def S_CSELECT_B64_vi       : SOP2_Real_vi <0x0b, S_CSELECT_B64>;
+def S_AND_B32_vi           : SOP2_Real_vi <0x0c, S_AND_B32>;
+def S_AND_B64_vi           : SOP2_Real_vi <0x0d, S_AND_B64>;
+def S_OR_B32_vi            : SOP2_Real_vi <0x0e, S_OR_B32>;
+def S_OR_B64_vi            : SOP2_Real_vi <0x0f, S_OR_B64>;
+def S_XOR_B32_vi           : SOP2_Real_vi <0x10, S_XOR_B32>;
+def S_XOR_B64_vi           : SOP2_Real_vi <0x11, S_XOR_B64>;
+def S_ANDN2_B32_vi         : SOP2_Real_vi <0x12, S_ANDN2_B32>;
+def S_ANDN2_B64_vi         : SOP2_Real_vi <0x13, S_ANDN2_B64>;
+def S_ORN2_B32_vi          : SOP2_Real_vi <0x14, S_ORN2_B32>;
+def S_ORN2_B64_vi          : SOP2_Real_vi <0x15, S_ORN2_B64>;
+def S_NAND_B32_vi          : SOP2_Real_vi <0x16, S_NAND_B32>;
+def S_NAND_B64_vi          : SOP2_Real_vi <0x17, S_NAND_B64>;
+def S_NOR_B32_vi           : SOP2_Real_vi <0x18, S_NOR_B32>;
+def S_NOR_B64_vi           : SOP2_Real_vi <0x19, S_NOR_B64>;
+def S_XNOR_B32_vi          : SOP2_Real_vi <0x1a, S_XNOR_B32>;
+def S_XNOR_B64_vi          : SOP2_Real_vi <0x1b, S_XNOR_B64>;
+def S_LSHL_B32_vi          : SOP2_Real_vi <0x1c, S_LSHL_B32>;
+def S_LSHL_B64_vi          : SOP2_Real_vi <0x1d, S_LSHL_B64>;
+def S_LSHR_B32_vi          : SOP2_Real_vi <0x1e, S_LSHR_B32>;
+def S_LSHR_B64_vi          : SOP2_Real_vi <0x1f, S_LSHR_B64>;
+def S_ASHR_I32_vi          : SOP2_Real_vi <0x20, S_ASHR_I32>;
+def S_ASHR_I64_vi          : SOP2_Real_vi <0x21, S_ASHR_I64>;
+def S_BFM_B32_vi           : SOP2_Real_vi <0x22, S_BFM_B32>;
+def S_BFM_B64_vi           : SOP2_Real_vi <0x23, S_BFM_B64>;
+def S_MUL_I32_vi           : SOP2_Real_vi <0x24, S_MUL_I32>;
+def S_BFE_U32_vi           : SOP2_Real_vi <0x25, S_BFE_U32>;
+def S_BFE_I32_vi           : SOP2_Real_vi <0x26, S_BFE_I32>;
+def S_BFE_U64_vi           : SOP2_Real_vi <0x27, S_BFE_U64>;
+def S_BFE_I64_vi           : SOP2_Real_vi <0x28, S_BFE_I64>;
+def S_CBRANCH_G_FORK_vi    : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
+def S_ABSDIFF_I32_vi       : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+
+def S_MOVK_I32_vi          : SOPK_Real_vi <0x00, S_MOVK_I32>;
+def S_CMOVK_I32_vi         : SOPK_Real_vi <0x01, S_CMOVK_I32>;
+def S_CMPK_EQ_I32_vi       : SOPK_Real_vi <0x02, S_CMPK_EQ_I32>;
+def S_CMPK_LG_I32_vi       : SOPK_Real_vi <0x03, S_CMPK_LG_I32>;
+def S_CMPK_GT_I32_vi       : SOPK_Real_vi <0x04, S_CMPK_GT_I32>;
+def S_CMPK_GE_I32_vi       : SOPK_Real_vi <0x05, S_CMPK_GE_I32>;
+def S_CMPK_LT_I32_vi       : SOPK_Real_vi <0x06, S_CMPK_LT_I32>;
+def S_CMPK_LE_I32_vi       : SOPK_Real_vi <0x07, S_CMPK_LE_I32>;
+def S_CMPK_EQ_U32_vi       : SOPK_Real_vi <0x08, S_CMPK_EQ_U32>;
+def S_CMPK_LG_U32_vi       : SOPK_Real_vi <0x09, S_CMPK_LG_U32>;
+def S_CMPK_GT_U32_vi       : SOPK_Real_vi <0x0A, S_CMPK_GT_U32>;
+def S_CMPK_GE_U32_vi       : SOPK_Real_vi <0x0B, S_CMPK_GE_U32>;
+def S_CMPK_LT_U32_vi       : SOPK_Real_vi <0x0C, S_CMPK_LT_U32>;
+def S_CMPK_LE_U32_vi       : SOPK_Real_vi <0x0D, S_CMPK_LE_U32>;
+def S_ADDK_I32_vi          : SOPK_Real_vi <0x0E, S_ADDK_I32>;
+def S_MULK_I32_vi          : SOPK_Real_vi <0x0F, S_MULK_I32>;
+def S_CBRANCH_I_FORK_vi    : SOPK_Real_vi <0x10, S_CBRANCH_I_FORK>;
+def S_GETREG_B32_vi        : SOPK_Real_vi <0x11, S_GETREG_B32>;
+def S_SETREG_B32_vi        : SOPK_Real_vi <0x12, S_SETREG_B32>;
+//def S_GETREG_REGRD_B32_vi  : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments
+def S_SETREG_IMM32_B32_vi  : SOPK_Real64<0x14, S_SETREG_IMM32_B32>,
+                             Select_vi<S_SETREG_IMM32_B32.Mnemonic>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
new file mode 100644
index 000000000000..9908fc003ce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -0,0 +1,37 @@
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// \brief The target which suports all AMD GPUs.  This will eventually
+///         be deprecated and there will be a R600 target and a GCN target.
+Target &llvm::getTheAMDGPUTarget() {
+  static Target TheAMDGPUTarget;
+  return TheAMDGPUTarget;
+}
+/// \brief The target for GCN GPUs
+Target &llvm::getTheGCNTarget() {
+  static Target TheGCNTarget;
+  return TheGCNTarget;
+}
+
+/// \brief Extern function to initialize the targets for the AMDGPU backend
+extern "C" void LLVMInitializeAMDGPUTargetInfo() {
+  RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
+                                           "AMD GPUs HD2XXX-HD6XXX");
+  RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
+                                            "AMD GCN GPUs");
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
new file mode 100644
index 000000000000..b6868de6a74e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -0,0 +1,69 @@
+//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUAsmUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg {
+
+// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+  nullptr,
+  "MSG_INTERRUPT",
+  "MSG_GS",
+  "MSG_GS_DONE",
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  nullptr,
+  "MSG_SYSMSG"
+};
+
+// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
+const char* const OpSysSymbolic[] = {
+  nullptr,
+  "SYSMSG_OP_ECC_ERR_INTERRUPT",
+  "SYSMSG_OP_REG_RD",
+  "SYSMSG_OP_HOST_TRAP_ACK",
+  "SYSMSG_OP_TTRACE_PC"
+};
+
+const char* const OpGsSymbolic[] = {
+  "GS_OP_NOP",
+  "GS_OP_CUT",
+  "GS_OP_EMIT",
+  "GS_OP_EMIT_CUT"
+};
+
+} // namespace SendMsg
+
+namespace Hwreg {
+
+// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h.
+const char* const IdSymbolic[] = {
+  nullptr,
+  "HW_REG_MODE",
+  "HW_REG_STATUS",
+  "HW_REG_TRAPSTS",
+  "HW_REG_HW_ID",
+  "HW_REG_GPR_ALLOC",
+  "HW_REG_LDS_ALLOC",
+  "HW_REG_IB_STS"
+};
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
new file mode 100644
index 000000000000..b2dc2c0e364c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -0,0 +1,31 @@
+//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H
+
+namespace llvm {
+namespace AMDGPU {
+namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
+
+extern const char* const IdSymbolic[];
+extern const char* const OpSysSymbolic[];
+extern const char* const OpGsSymbolic[];
+
+} // namespace SendMsg
+
+namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Hwreg
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
new file mode 100644
index 000000000000..85cbadf0a570
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -0,0 +1,461 @@
+//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "SIDefines.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
+#define GET_INSTRINFO_NAMED_OPS
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_NAMED_OPS
+#undef GET_INSTRINFO_ENUM
+
+namespace {
+
+/// \returns Bit mask for given bit \p Shift and bit \p Width.
+unsigned getBitMask(unsigned Shift, unsigned Width) {
+  return ((1 << Width) - 1) << Shift;
+}
+
+/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
+///
+/// \returns Packed \p Dst.
+unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
+  Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width);
+  Dst |= (Src << Shift) & getBitMask(Shift, Width);
+  return Dst;
+}
+
+/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
+///
+/// \returns Unpacked bits.
+unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
+  return (Src & getBitMask(Shift, Width)) >> Shift;
+}
+
+/// \returns Vmcnt bit shift.
+unsigned getVmcntBitShift() { return 0; }
+
+/// \returns Vmcnt bit width.
+unsigned getVmcntBitWidth() { return 4; }
+
+/// \returns Expcnt bit shift.
+unsigned getExpcntBitShift() { return 4; }
+
+/// \returns Expcnt bit width.
+unsigned getExpcntBitWidth() { return 3; }
+
+/// \returns Lgkmcnt bit shift.
+unsigned getLgkmcntBitShift() { return 8; }
+
+/// \returns Lgkmcnt bit width.
+unsigned getLgkmcntBitWidth() { return 4; }
+
+} // anonymous namespace
+
+namespace llvm {
+namespace AMDGPU {
+
+IsaVersion getIsaVersion(const FeatureBitset &Features) {
+
+  if (Features.test(FeatureISAVersion7_0_0))
+    return {7, 0, 0};
+
+  if (Features.test(FeatureISAVersion7_0_1))
+    return {7, 0, 1};
+
+  if (Features.test(FeatureISAVersion7_0_2))
+    return {7, 0, 2};
+
+  if (Features.test(FeatureISAVersion8_0_0))
+    return {8, 0, 0};
+
+  if (Features.test(FeatureISAVersion8_0_1))
+    return {8, 0, 1};
+
+  if (Features.test(FeatureISAVersion8_0_2))
+    return {8, 0, 2};
+
+  if (Features.test(FeatureISAVersion8_0_3))
+    return {8, 0, 3};
+
+  if (Features.test(FeatureISAVersion8_0_4))
+    return {8, 0, 4};
+
+  if (Features.test(FeatureISAVersion8_1_0))
+    return {8, 1, 0};
+
+  return {0, 0, 0};
+}
+
+void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+                               const FeatureBitset &Features) {
+
+  IsaVersion ISA = getIsaVersion(Features);
+
+  memset(&Header, 0, sizeof(Header));
+
+  Header.amd_kernel_code_version_major = 1;
+  Header.amd_kernel_code_version_minor = 0;
+  Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
+  Header.amd_machine_version_major = ISA.Major;
+  Header.amd_machine_version_minor = ISA.Minor;
+  Header.amd_machine_version_stepping = ISA.Stepping;
+  Header.kernel_code_entry_byte_offset = sizeof(Header);
+  // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
+  Header.wavefront_size = 6;
+  // These alignment values are specified in powers of two, so alignment =
+  // 2^n.  The minimum alignment is 2^4 = 16.
+  Header.kernarg_segment_alignment = 4;
+  Header.group_segment_alignment = 4;
+  Header.private_segment_alignment = 4;
+}
+
+MCSection *getHSATextSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_EXECINSTR |
+                           ELF::SHF_AMDGPU_HSA_AGENT |
+                           ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+  return  Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+                            ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                            ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool shouldEmitConstantsToTextSection(const Triple &TT) {
+  return TT.getOS() != Triple::AMDHSA;
+}
+
+int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
+  Attribute A = F.getFnAttribute(Name);
+  int Result = Default;
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, Result)) {
+      LLVMContext &Ctx = F.getContext();
+      Ctx.emitError("can't parse integer attribute " + Name);
+    }
+  }
+
+  return Result;
+}
+
+std::pair<int, int> getIntegerPairAttribute(const Function &F,
+                                            StringRef Name,
+                                            std::pair<int, int> Default,
+                                            bool OnlyFirstRequired) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  std::pair<int, int> Ints = Default;
+  std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
+  if (Strs.first.trim().getAsInteger(0, Ints.first)) {
+    Ctx.emitError("can't parse first integer attribute " + Name);
+    return Default;
+  }
+  if (Strs.second.trim().getAsInteger(0, Ints.second)) {
+    if (!OnlyFirstRequired || Strs.second.trim().size()) {
+      Ctx.emitError("can't parse second integer attribute " + Name);
+      return Default;
+    }
+  }
+
+  return Ints;
+}
+
+unsigned getWaitcntBitMask(IsaVersion Version) {
+  unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
+  unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+  unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+  return Vmcnt | Expcnt | Lgkmcnt;
+}
+
+unsigned getVmcntBitMask(IsaVersion Version) {
+  return (1 << getVmcntBitWidth()) - 1;
+}
+
+unsigned getExpcntBitMask(IsaVersion Version) {
+  return (1 << getExpcntBitWidth()) - 1;
+}
+
+unsigned getLgkmcntBitMask(IsaVersion Version) {
+  return (1 << getLgkmcntBitWidth()) - 1;
+}
+
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) {
+  return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) {
+  return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) {
+  return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+                   unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
+  Vmcnt = decodeVmcnt(Version, Waitcnt);
+  Expcnt = decodeExpcnt(Version, Waitcnt);
+  Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
+}
+
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) {
+  return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+}
+
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) {
+  return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
+}
+
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) {
+  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
+}
+
+unsigned encodeWaitcnt(IsaVersion Version,
+                       unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
+  unsigned Waitcnt = getWaitcntBitMask(Version);;
+  Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
+  Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt);
+  Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt);
+  return Waitcnt;
+}
+
+unsigned getInitialPSInputAddr(const Function &F) {
+  return getIntegerAttribute(F, "InitialPSInputAddr", 0);
+}
+
+bool isShader(CallingConv::ID cc) {
+  switch(cc) {
+    case CallingConv::AMDGPU_VS:
+    case CallingConv::AMDGPU_GS:
+    case CallingConv::AMDGPU_PS:
+    case CallingConv::AMDGPU_CS:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool isCompute(CallingConv::ID cc) {
+  return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
+}
+
+bool isSI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+}
+
+bool isCI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+}
+
+bool isVI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+
+  switch(Reg) {
+  default: break;
+  case AMDGPU::FLAT_SCR:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+
+  case AMDGPU::FLAT_SCR_LO:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+
+  case AMDGPU::FLAT_SCR_HI:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+  }
+  return Reg;
+}
+
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+         OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+  switch (OpType) {
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
+  return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
+         OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
+}
+
+// Avoid using MCRegisterClass::getSize, since that function will go away
+// (move from MC* level to Target* level). Return size in bits.
+unsigned getRegBitWidth(unsigned RCID) {
+  switch (RCID) {
+  case AMDGPU::SGPR_32RegClassID:
+  case AMDGPU::VGPR_32RegClassID:
+  case AMDGPU::VS_32RegClassID:
+  case AMDGPU::SReg_32RegClassID:
+  case AMDGPU::SReg_32_XM0RegClassID:
+    return 32;
+  case AMDGPU::SGPR_64RegClassID:
+  case AMDGPU::VS_64RegClassID:
+  case AMDGPU::SReg_64RegClassID:
+  case AMDGPU::VReg_64RegClassID:
+    return 64;
+  case AMDGPU::VReg_96RegClassID:
+    return 96;
+  case AMDGPU::SGPR_128RegClassID:
+  case AMDGPU::SReg_128RegClassID:
+  case AMDGPU::VReg_128RegClassID:
+    return 128;
+  case AMDGPU::SReg_256RegClassID:
+  case AMDGPU::VReg_256RegClassID:
+    return 256;
+  case AMDGPU::SReg_512RegClassID:
+  case AMDGPU::VReg_512RegClassID:
+    return 512;
+  default:
+    llvm_unreachable("Unexpected register class");
+  }
+}
+
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+  return getRegBitWidth(RC.getID());
+}
+
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+                           unsigned OpNo) {
+  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
+}
+
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
+  if (Literal >= -16 && Literal <= 64)
+    return true;
+
+  uint64_t Val = static_cast<uint64_t>(Literal);
+  return (Val == DoubleToBits(0.0)) ||
+         (Val == DoubleToBits(1.0)) ||
+         (Val == DoubleToBits(-1.0)) ||
+         (Val == DoubleToBits(0.5)) ||
+         (Val == DoubleToBits(-0.5)) ||
+         (Val == DoubleToBits(2.0)) ||
+         (Val == DoubleToBits(-2.0)) ||
+         (Val == DoubleToBits(4.0)) ||
+         (Val == DoubleToBits(-4.0)) ||
+         (Val == 0x3fc45f306dc9c882 && HasInv2Pi);
+}
+
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
+  if (Literal >= -16 && Literal <= 64)
+    return true;
+
+  // The actual type of the operand does not seem to matter as long
+  // as the bits match one of the inline immediate values.  For example:
+  //
+  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+  // so it is a legal inline immediate.
+  //
+  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+  // floating-point, so it is a legal inline immediate.
+
+  uint32_t Val = static_cast<uint32_t>(Literal);
+  return (Val == FloatToBits(0.0f)) ||
+         (Val == FloatToBits(1.0f)) ||
+         (Val == FloatToBits(-1.0f)) ||
+         (Val == FloatToBits(0.5f)) ||
+         (Val == FloatToBits(-0.5f)) ||
+         (Val == FloatToBits(2.0f)) ||
+         (Val == FloatToBits(-2.0f)) ||
+         (Val == FloatToBits(4.0f)) ||
+         (Val == FloatToBits(-4.0f)) ||
+         (Val == 0x3e22f983 && HasInv2Pi);
+}
+
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+  assert(HasInv2Pi);
+
+  if (Literal >= -16 && Literal <= 64)
+    return true;
+
+  uint16_t Val = static_cast<uint16_t>(Literal);
+  return Val == 0x3C00 || // 1.0
+         Val == 0xBC00 || // -1.0
+         Val == 0x3800 || // 0.5
+         Val == 0xB800 || // -0.5
+         Val == 0x4000 || // 2.0
+         Val == 0xC000 || // -2.0
+         Val == 0x4400 || // 4.0
+         Val == 0xC400 || // -4.0
+         Val == 0x3118;   // 1/2pi
+}
+
+} // End namespace AMDGPU
+} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
new file mode 100644
index 000000000000..ea5fc366d205
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -0,0 +1,216 @@
+//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+
+#include "AMDKernelCodeT.h"
+#include "llvm/IR/CallingConv.h"
+
+#include "SIDefines.h"
+
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+
+namespace llvm {
+
+class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCInstrDesc;
+class MCRegisterClass;
+class MCRegisterInfo;
+class MCSection;
+class MCSubtargetInfo;
+
+namespace AMDGPU {
+
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
+struct IsaVersion {
+  unsigned Major;
+  unsigned Minor;
+  unsigned Stepping;
+};
+
+IsaVersion getIsaVersion(const FeatureBitset &Features);
+void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+                               const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+/// \returns True if constants should be emitted to .text section for given
+/// target triple \p TT, false otherwise.
+bool shouldEmitConstantsToTextSection(const Triple &TT);
+
+/// \returns Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
+/// \returns A pair of integer values requested using \p F's \p Name attribute
+/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
+/// is false).
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if one of the requested values cannot be
+/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
+/// not present.
+std::pair<int, int> getIntegerPairAttribute(const Function &F,
+                                            StringRef Name,
+                                            std::pair<int, int> Default,
+                                            bool OnlyFirstRequired = false);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(IsaVersion Version);
+
+/// \returns Vmcnt bit mask for given isa \p Version.
+unsigned getVmcntBitMask(IsaVersion Version);
+
+/// \returns Expcnt bit mask for given isa \p Version.
+unsigned getExpcntBitMask(IsaVersion Version);
+
+/// \returns Lgkmcnt bit mask for given isa \p Version.
+unsigned getLgkmcntBitMask(IsaVersion Version);
+
+/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
+unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+
+/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
+/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
+/// \p Lgkmcnt respectively.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+///     \p Vmcnt = \p Waitcnt[3:0]
+///     \p Expcnt = \p Waitcnt[6:4]
+///     \p Lgkmcnt = \p Waitcnt[11:8]
+void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+                   unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
+
+/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
+unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+
+/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
+unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+
+/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
+unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+
+/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
+/// \p Version.
+///
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
+///     Waitcnt[3:0]  = \p Vmcnt
+///     Waitcnt[6:4]  = \p Expcnt
+///     Waitcnt[11:8] = \p Lgkmcnt
+///
+/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
+/// isa \p Version.
+unsigned encodeWaitcnt(IsaVersion Version,
+                       unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
+
+unsigned getInitialPSInputAddr(const Function &F);
+
+bool isShader(CallingConv::ID cc);
+bool isCompute(CallingConv::ID cc);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
+
+/// \brief Can this operand also contain immediate values?
+bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Is this floating-point operand?
+bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Does this opearnd support only inlinable literals?
+bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(const MCRegisterClass &RC);
+
+/// \brief Get size of register operand
+unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
+                           unsigned OpNo);
+
+LLVM_READNONE
+inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
+  switch (OpInfo.OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    return 4;
+
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    return 8;
+
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    return 2;
+
+  default:
+    llvm_unreachable("unhandled operand type");
+  }
+}
+
+LLVM_READNONE
+inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
+  return getOperandSize(Desc.OpInfo[OpNo]);
+}
+
+/// \brief Is this literal inlinable
+LLVM_READNONE
+bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
new file mode 100644
index 000000000000..c55eaab077d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -0,0 +1,152 @@
+//===--------------------- AMDKernelCodeTInfo.h ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - specifies tables for amd_kernel_code_t structure parsing/printing
+//
+//===----------------------------------------------------------------------===//
+
+#define QNAME(name) amd_kernel_code_t::name
+#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
+
+#define FIELD2(sname, aname, name) \
+  RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+
+#define FIELD(name) FIELD2(name, name, name)
+
+
+#define PRINTCODEPROP(name) \
+  printBitField<FLD_T(code_properties),\
+                AMD_CODE_PROPERTY_##name##_SHIFT,\
+                AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define PARSECODEPROP(name) \
+  parseBitField<FLD_T(code_properties),\
+                AMD_CODE_PROPERTY_##name##_SHIFT,\
+                AMD_CODE_PROPERTY_##name##_WIDTH>
+
+#define CODEPROP(name, shift) \
+  RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+
+// have to define these lambdas because of Set/GetMacro
+#define PRINTCOMP(GetMacro, Shift) \
+[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
+   printName(OS, Name) << \
+     (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
+}
+#define PARSECOMP(SetMacro, Shift) \
+[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
+   int64_t Value = 0; \
+   if (!expectAbsExpression(MCParser, Value, Err)) \
+     return false; \
+   C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
+   return true; \
+}
+
+#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \
+  RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+
+#define COMPPGM1(name, aname, AccMacro) \
+  COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+
+#define COMPPGM2(name, aname, AccMacro) \
+  COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+
+///////////////////////////////////////////////////////////////////////////////
+// Begin of the table
+// Define RECORD(name, print, parse) in your code to get field definitions
+// and include this file
+
+FIELD2(amd_code_version_major,        kernel_code_version_major,  amd_kernel_code_version_major),
+FIELD2(amd_code_version_minor,        kernel_code_version_minor,  amd_kernel_code_version_minor),
+FIELD2(amd_machine_kind,              machine_kind,               amd_machine_kind),
+FIELD2(amd_machine_version_major,     machine_version_major,      amd_machine_version_major),
+FIELD2(amd_machine_version_minor,     machine_version_minor,      amd_machine_version_minor),
+FIELD2(amd_machine_version_stepping,  machine_version_stepping,   amd_machine_version_stepping),
+
+FIELD(kernel_code_entry_byte_offset),
+FIELD(kernel_code_prefetch_byte_size),
+FIELD(max_scratch_backing_memory_byte_size),
+
+COMPPGM1(granulated_workitem_vgpr_count,  compute_pgm_rsrc1_vgprs,          VGPRS),
+COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs,          SGPRS),
+COMPPGM1(priority,                        compute_pgm_rsrc1_priority,       PRIORITY),
+COMPPGM1(float_mode,                      compute_pgm_rsrc1_float_mode,     FLOAT_MODE), // TODO: split float_mode
+COMPPGM1(priv,                            compute_pgm_rsrc1_priv,           PRIV),
+COMPPGM1(enable_dx10_clamp,               compute_pgm_rsrc1_dx10_clamp,     DX10_CLAMP),
+COMPPGM1(debug_mode,                      compute_pgm_rsrc1_debug_mode,     DEBUG_MODE),
+COMPPGM1(enable_ieee_mode,                compute_pgm_rsrc1_ieee_mode,      IEEE_MODE),
+// TODO: bulky
+// TODO: cdbg_user
+COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
+COMPPGM2(user_sgpr_count,                 compute_pgm_rsrc2_user_sgpr,      USER_SGPR),
+// TODO: enable_trap_handler
+COMPPGM2(enable_sgpr_workgroup_id_x,      compute_pgm_rsrc2_tgid_x_en,      TGID_X_EN),
+COMPPGM2(enable_sgpr_workgroup_id_y,      compute_pgm_rsrc2_tgid_y_en,      TGID_Y_EN),
+COMPPGM2(enable_sgpr_workgroup_id_z,      compute_pgm_rsrc2_tgid_z_en,      TGID_Z_EN),
+COMPPGM2(enable_sgpr_workgroup_info,      compute_pgm_rsrc2_tg_size_en,     TG_SIZE_EN),
+COMPPGM2(enable_vgpr_workitem_id,         compute_pgm_rsrc2_tidig_comp_cnt, TIDIG_COMP_CNT),
+COMPPGM2(enable_exception_msb,            compute_pgm_rsrc2_excp_en_msb,    EXCP_EN_MSB), // TODO: split enable_exception_msb
+COMPPGM2(granulated_lds_size,             compute_pgm_rsrc2_lds_size,       LDS_SIZE),
+COMPPGM2(enable_exception,                compute_pgm_rsrc2_excp_en,        EXCP_EN), // TODO: split enable_exception
+
+CODEPROP(enable_sgpr_private_segment_buffer,  ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER),
+CODEPROP(enable_sgpr_dispatch_ptr,            ENABLE_SGPR_DISPATCH_PTR),
+CODEPROP(enable_sgpr_queue_ptr,               ENABLE_SGPR_QUEUE_PTR),
+CODEPROP(enable_sgpr_kernarg_segment_ptr,     ENABLE_SGPR_KERNARG_SEGMENT_PTR),
+CODEPROP(enable_sgpr_dispatch_id,             ENABLE_SGPR_DISPATCH_ID),
+CODEPROP(enable_sgpr_flat_scratch_init,       ENABLE_SGPR_FLAT_SCRATCH_INIT),
+CODEPROP(enable_sgpr_private_segment_size,    ENABLE_SGPR_PRIVATE_SEGMENT_SIZE),
+CODEPROP(enable_sgpr_grid_workgroup_count_x,  ENABLE_SGPR_GRID_WORKGROUP_COUNT_X),
+CODEPROP(enable_sgpr_grid_workgroup_count_y,  ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y),
+CODEPROP(enable_sgpr_grid_workgroup_count_z,  ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z),
+CODEPROP(enable_ordered_append_gds,           ENABLE_ORDERED_APPEND_GDS),
+CODEPROP(private_element_size,                PRIVATE_ELEMENT_SIZE),
+CODEPROP(is_ptr64,                            IS_PTR64),
+CODEPROP(is_dynamic_callstack,                IS_DYNAMIC_CALLSTACK),
+CODEPROP(is_debug_enabled,                    IS_DEBUG_SUPPORTED),
+CODEPROP(is_xnack_enabled,                    IS_XNACK_SUPPORTED),
+
+FIELD(workitem_private_segment_byte_size),
+FIELD(workgroup_group_segment_byte_size),
+FIELD(gds_segment_byte_size),
+FIELD(kernarg_segment_byte_size),
+FIELD(workgroup_fbarrier_count),
+FIELD(wavefront_sgpr_count),
+FIELD(workitem_vgpr_count),
+FIELD(reserved_vgpr_first),
+FIELD(reserved_vgpr_count),
+FIELD(reserved_sgpr_first),
+FIELD(reserved_sgpr_count),
+FIELD(debug_wavefront_private_segment_offset_sgpr),
+FIELD(debug_private_segment_buffer_sgpr),
+FIELD(kernarg_segment_alignment),
+FIELD(group_segment_alignment),
+FIELD(private_segment_alignment),
+FIELD(wavefront_size),
+FIELD(call_convention),
+FIELD(runtime_loader_kernel_symbol)
+// TODO: control_directive
+
+// end of the table
+///////////////////////////////////////////////////////////////////////////////
+
+#undef QNAME
+#undef FLD_T
+#undef FIELD2
+#undef FIELD
+#undef PRINTCODEPROP
+#undef PARSECODEPROP
+#undef CODEPROP
+#undef PRINTCOMP
+#undef PAPSECOMP
+#undef COMPPGM
+#undef COMPPGM1
+#undef COMPPGM2
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
new file mode 100644
index 000000000000..0333b0a14d29
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -0,0 +1,181 @@
+//===--------------------AMDKernelCodeTUtils.cpp --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file - utility functions to parse/print amd_kernel_code_t structure
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDKernelCodeTUtils.h"
+#include "SIDefines.h"
+#include <llvm/MC/MCParser/MCAsmLexer.h>
+#include <llvm/MC/MCParser/MCAsmParser.h>
+#include <llvm/Support/raw_ostream.h>
+
+using namespace llvm;
+
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
+  static StringRef const Table[] = {
+    "", // not found placeholder
+#define RECORD(name, altName, print, parse) #name
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+  };
+  return makeArrayRef(Table);
+}
+
+static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() {
+  static StringRef const Table[] = {
+    "", // not found placeholder
+#define RECORD(name, altName, print, parse) #altName
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+  };
+  return makeArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names,
+                                     const ArrayRef<StringRef> &altNames) {
+  StringMap<int> map;
+  assert(names.size() == altNames.size());
+  for (unsigned i = 0; i < names.size(); ++i) {
+    map.insert(std::make_pair(names[i], i));
+    map.insert(std::make_pair(altNames[i], i));
+  }
+  return map;
+}
+
+static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
+  static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames(),
+                                         get_amd_kernel_code_t_FldAltNames());
+  return map.lookup(name) - 1; // returns -1 if not found
+}
+
+static StringRef get_amd_kernel_code_t_FieldName(int index) {
+  return get_amd_kernel_code_t_FldNames()[index + 1];
+}
+
+
+// Field printing
+
+static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
+  return OS << Name << " = ";
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static void printField(StringRef Name, const amd_kernel_code_t &C,
+                       raw_ostream &OS) {
+  printName(OS, Name) << (int)(C.*ptr);
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const amd_kernel_code_t &c,
+                          raw_ostream &OS) {
+  const auto Mask = (static_cast<T>(1) << width) - 1;
+  printName(OS, Name) << (int)((c.*ptr >> shift) & Mask);
+}
+
+typedef void(*PrintFx)(StringRef,
+                       const amd_kernel_code_t &,
+                       raw_ostream &);
+
+static ArrayRef<PrintFx> getPrinterTable() {
+  static const PrintFx Table[] = {
+#define RECORD(name, altName, print, parse) print
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+  };
+  return makeArrayRef(Table);
+}
+
+void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
+                                   int FldIndex,
+                                   raw_ostream &OS) {
+  auto Printer = getPrinterTable()[FldIndex];
+  if (Printer)
+    Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
+}
+
+void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
+                             raw_ostream &OS,
+                             const char *tab) {
+  const int Size = getPrinterTable().size();
+  for (int i = 0; i < Size; ++i) {
+    OS << tab;
+    printAmdKernelCodeField(*C, i, OS);
+    OS << '\n';
+  }
+}
+
+
+// Field parsing
+
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
+
+  if (MCParser.getLexer().isNot(AsmToken::Equal)) {
+    Err << "expected '='";
+    return false;
+  }
+  MCParser.getLexer().Lex();
+
+  if (MCParser.parseAbsoluteExpression(Value)) {
+    Err << "integer absolute expression expected";
+    return false;
+  }
+  return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr>
+static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+                       raw_ostream &Err) {
+  int64_t Value = 0;
+  if (!expectAbsExpression(MCParser, Value, Err))
+    return false;
+  C.*ptr = (T)Value;
+  return true;
+}
+
+template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
+static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+                          raw_ostream &Err) {
+  int64_t Value = 0;
+  if (!expectAbsExpression(MCParser, Value, Err))
+    return false;
+  const uint64_t Mask = ((UINT64_C(1)  << width) - 1) << shift;
+  C.*ptr &= (T)~Mask;
+  C.*ptr |= (T)((Value << shift) & Mask);
+  return true;
+}
+
+typedef bool(*ParseFx)(amd_kernel_code_t &,
+                       MCAsmParser &MCParser,
+                       raw_ostream &Err);
+
+static ArrayRef<ParseFx> getParserTable() {
+  static const ParseFx Table[] = {
+#define RECORD(name, altName, print, parse) parse
+#include "AMDKernelCodeTInfo.h"
+#undef RECORD
+  };
+  return makeArrayRef(Table);
+}
+
+bool llvm::parseAmdKernelCodeField(StringRef ID,
+                                   MCAsmParser &MCParser,
+                                   amd_kernel_code_t &C,
+                                   raw_ostream &Err) {
+  const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
+  if (Idx < 0) {
+    Err << "unexpected amd_kernel_code_t field name " << ID;
+    return false;
+  }
+  auto Parser = getParserTable()[Idx];
+  return Parser ? Parser(C, MCParser, Err) : false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
new file mode 100644
index 000000000000..d9edca7a82ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -0,0 +1,39 @@
+//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t  *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file AMDKernelCodeTUtils.h
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDKERNELCODETUTILS_H
+#define AMDKERNELCODETUTILS_H
+
+#include "AMDKernelCodeT.h"
+
+namespace llvm {
+
+class MCAsmLexer;
+class MCAsmParser;
+class raw_ostream;
+class StringRef;
+
+void printAmdKernelCodeField(const amd_kernel_code_t &C,
+  int FldIndex,
+  raw_ostream &OS);
+
+void dumpAmdKernelCode(const amd_kernel_code_t *C,
+  raw_ostream &OS,
+  const char *tab);
+
+bool parseAmdKernelCodeField(StringRef ID,
+  MCAsmParser &Parser,
+  amd_kernel_code_t &C,
+  raw_ostream &Err);
+
+}
+
+#endif // AMDKERNELCODETUTILS_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
new file mode 100644
index 000000000000..1fd1c1e21527
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstrFormats.td
@@ -0,0 +1,20 @@
+//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// VI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class EXPe_vi : EXPe {
+  let Inst{31-26} = 0x31; //encoding
+}
+
+class VINTRPe_vi <bits<2> op> : VINTRPe <op> {
+  let Inst{31-26} = 0x35; // encoding
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
new file mode 100644
index 000000000000..b45c8fc9c7d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -0,0 +1,14 @@
+//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Instruction definitions for VI and newer.
+//===----------------------------------------------------------------------===//
+
+FIXME: Deleting this file broke buildbots that don't do full rebuilds.  This
+file is no longer used by the backend, so it can be deleted once all
+the buildbots update there dependencies.
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
new file mode 100644
index 000000000000..bff706cdc1dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -0,0 +1,621 @@
+//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP1 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
+  bits<8> vdst;
+  bits<9> src0;
+
+  let Inst{8-0}   = !if(P.HasSrc0, src0{8-0}, 0);
+  let Inst{16-9}  = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; //encoding
+}
+
+class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
+  bits<8> vdst;
+  
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; // encoding
+}
+
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.Outs32, P.Ins32, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
+  MnemonicAlias<opName#"_e32", opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.Asm32;
+
+  let Size = 4;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SubtargetPredicate = isGCN;
+
+  let VOP1 = 1;
+  let VALU = 1;
+  let Uses = [EXEC];
+
+  let AsmVariantName = AMDGPUAsmVariants.Default;
+
+  VOPProfile Pfl = P;
+}
+
+class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let AsmVariantName     = ps.AsmVariantName;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+}
+
+class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP1";
+}
+
+class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
+  list<dag> ret = !if(P.HasModifiers,
+    [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+                                i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+    [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
+}
+
+multiclass VOP1Inst <string opName, VOPProfile P,
+                     SDPatternOperator node = null_frag> {
+  def _e32 : VOP1_Pseudo <opName, P>;
+  def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+  def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
+}
+
+let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
+} // End isMoveImm = 1
+
+// FIXME: Specify SchedRW for READFIRSTLANE_B32
+// TODO: Make profile for this, there is VOP3 encoding also
+def V_READFIRSTLANE_B32 :
+  InstSI <(outs SReg_32:$vdst),
+    (ins VGPR_32:$src0),
+    "v_readfirstlane_b32 $vdst, $src0",
+    [(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
+  Enc32 {
+
+  let isCodeGenOnly = 0;
+  let UseNamedOperandTable = 1;
+
+  let Size = 4;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SubtargetPredicate = isGCN;
+
+  let VOP1 = 1;
+  let VALU = 1;
+  let Uses = [EXEC];
+  let isConvergent = 1;
+
+  bits<8> vdst;
+  bits<9> src0;
+
+  let Inst{8-0}   = src0;
+  let Inst{16-9}  = 0x2;
+  let Inst{24-17} = vdst;
+  let Inst{31-25} = 0x3f; //encoding
+}
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
+defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
+defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst  <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
+defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
+defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
+defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
+defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
+defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
+defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
+} // End SchedRW = [WriteDouble];
+
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
+
+let SchedRW = [WriteDouble] in {
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
+} // End SchedRW = [WriteDouble]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
+defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
+} // End SchedRW = [WriteQuarterRate32]
+
+defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End SchedRW = [WriteDoubleAdd]
+
+defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
+defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
+
+let VOPAsmPrefer32Bit = 1 in {
+defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
+}
+
+// Restrict src0 to be VGPR
+def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
+  let Src0RC32 = VRegSrc_32;
+  let Src0RC64 = VRegSrc_32;
+
+  let HasExt = 0;
+}
+
+// Special case because there are no true output operands.  Hack vdst
+// to be a src operand. The custom inserter must add a tied implicit
+// def and use of the super register since there seems to be no way to
+// add an implicit def of a virtual register in tablegen.
+def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
+  let Src0RC32 = VOPDstOperand<VGPR_32>;
+  let Src0RC64 = VOPDstOperand<VGPR_32>;
+
+  let Outs = (outs);
+  let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
+  let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
+  let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                    bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+  let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0,
+                     clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                     src0_sel:$src0_sel);
+
+  let Asm32 = getAsm32<1, 1>.ret;
+  let Asm64 = getAsm64<1, 1, 0>.ret;
+  let AsmDPP = getAsmDPP<1, 1, 0>.ret;
+  let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+
+  let HasExt = 0;
+  let HasDst = 0;
+  let EmitDst = 1; // force vdst emission
+}
+
+let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
+// v_movreld_b32 is a special case because the destination output
+ // register is really a source. It isn't actually read (but may be
+ // written), and is only to provide the base register to start
+ // indexing from. Tablegen seems to not let you define an implicit
+ // virtual register output for the super register being written into,
+ // so this must have an implicit def of the register added to it.
+defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
+defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
+defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
+} // End Uses = [M0, EXEC]
+
+// These instruction only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
+defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
+defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
+defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let SchedRW = [WriteDouble] in {
+defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
+} // End SchedRW = [WriteDouble]
+
+} // End SubtargetPredicate = isSICI
+
+
+let SubtargetPredicate = isCIVI in {
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
+defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
+defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
+defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
+defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
+} // End SchedRW = [WriteQuarterRate32]
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
+defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
+defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
+defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
+defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
+defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
+defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
+defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
+defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
+defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
+defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
+defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+
+}
+
+let Predicates = [isVI] in {
+
+def : Pat<
+    (f32 (f16_to_fp i16:$src)),
+    (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat<
+    (i16 (fp_to_f16 f32:$src)),
+    (V_CVT_F16_F32_e32 $src)
+>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_si <bits<9> op> {
+  let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+    def _e32_si :
+      VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+      VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+    def _e64_si :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+}
+
+defm V_NOP               : VOP1_Real_si <0x0>;
+defm V_MOV_B32           : VOP1_Real_si <0x1>;
+defm V_CVT_I32_F64       : VOP1_Real_si <0x3>;
+defm V_CVT_F64_I32       : VOP1_Real_si <0x4>;
+defm V_CVT_F32_I32       : VOP1_Real_si <0x5>;
+defm V_CVT_F32_U32       : VOP1_Real_si <0x6>;
+defm V_CVT_U32_F32       : VOP1_Real_si <0x7>;
+defm V_CVT_I32_F32       : VOP1_Real_si <0x8>;
+defm V_MOV_FED_B32       : VOP1_Real_si <0x9>;
+defm V_CVT_F16_F32       : VOP1_Real_si <0xa>;
+defm V_CVT_F32_F16       : VOP1_Real_si <0xb>;
+defm V_CVT_RPI_I32_F32   : VOP1_Real_si <0xc>;
+defm V_CVT_FLR_I32_F32   : VOP1_Real_si <0xd>;
+defm V_CVT_OFF_F32_I4    : VOP1_Real_si <0xe>;
+defm V_CVT_F32_F64       : VOP1_Real_si <0xf>;
+defm V_CVT_F64_F32       : VOP1_Real_si <0x10>;
+defm V_CVT_F32_UBYTE0    : VOP1_Real_si <0x11>;
+defm V_CVT_F32_UBYTE1    : VOP1_Real_si <0x12>;
+defm V_CVT_F32_UBYTE2    : VOP1_Real_si <0x13>;
+defm V_CVT_F32_UBYTE3    : VOP1_Real_si <0x14>;
+defm V_CVT_U32_F64       : VOP1_Real_si <0x15>;
+defm V_CVT_F64_U32       : VOP1_Real_si <0x16>;
+defm V_FRACT_F32         : VOP1_Real_si <0x20>;
+defm V_TRUNC_F32         : VOP1_Real_si <0x21>;
+defm V_CEIL_F32          : VOP1_Real_si <0x22>;
+defm V_RNDNE_F32         : VOP1_Real_si <0x23>;
+defm V_FLOOR_F32         : VOP1_Real_si <0x24>;
+defm V_EXP_F32           : VOP1_Real_si <0x25>;
+defm V_LOG_CLAMP_F32     : VOP1_Real_si <0x26>;
+defm V_LOG_F32           : VOP1_Real_si <0x27>;
+defm V_RCP_CLAMP_F32     : VOP1_Real_si <0x28>;
+defm V_RCP_LEGACY_F32    : VOP1_Real_si <0x29>;
+defm V_RCP_F32           : VOP1_Real_si <0x2a>;
+defm V_RCP_IFLAG_F32     : VOP1_Real_si <0x2b>;
+defm V_RSQ_CLAMP_F32     : VOP1_Real_si <0x2c>;
+defm V_RSQ_LEGACY_F32    : VOP1_Real_si <0x2d>;
+defm V_RSQ_F32           : VOP1_Real_si <0x2e>;
+defm V_RCP_F64           : VOP1_Real_si <0x2f>;
+defm V_RCP_CLAMP_F64     : VOP1_Real_si <0x30>;
+defm V_RSQ_F64           : VOP1_Real_si <0x31>;
+defm V_RSQ_CLAMP_F64     : VOP1_Real_si <0x32>;
+defm V_SQRT_F32          : VOP1_Real_si <0x33>;
+defm V_SQRT_F64          : VOP1_Real_si <0x34>;
+defm V_SIN_F32           : VOP1_Real_si <0x35>;
+defm V_COS_F32           : VOP1_Real_si <0x36>;
+defm V_NOT_B32           : VOP1_Real_si <0x37>;
+defm V_BFREV_B32         : VOP1_Real_si <0x38>;
+defm V_FFBH_U32          : VOP1_Real_si <0x39>;
+defm V_FFBL_B32          : VOP1_Real_si <0x3a>;
+defm V_FFBH_I32          : VOP1_Real_si <0x3b>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
+defm V_FREXP_MANT_F64    : VOP1_Real_si <0x3d>;
+defm V_FRACT_F64         : VOP1_Real_si <0x3e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
+defm V_FREXP_MANT_F32    : VOP1_Real_si <0x40>;
+defm V_CLREXCP           : VOP1_Real_si <0x41>;
+defm V_MOVRELD_B32       : VOP1_Real_si <0x42>;
+defm V_MOVRELS_B32       : VOP1_Real_si <0x43>;
+defm V_MOVRELSD_B32      : VOP1_Real_si <0x44>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP1_Real_ci <bits<9> op> {
+  let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
+    def _e32_ci :
+      VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+      VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+    def _e64_ci :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+}
+
+defm V_TRUNC_F64         : VOP1_Real_ci <0x17>;
+defm V_CEIL_F64          : VOP1_Real_ci <0x18>;
+defm V_FLOOR_F64         : VOP1_Real_ci <0x1A>;
+defm V_RNDNE_F64         : VOP1_Real_ci <0x19>;
+defm V_LOG_LEGACY_F32    : VOP1_Real_ci <0x45>;
+defm V_EXP_LEGACY_F32    : VOP1_Real_ci <0x46>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPP <ps.OpName, P> {
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  bits<8> vdst;
+  let Inst{8-0}   = 0xfa; // dpp
+  let Inst{16-9}  = op;
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{31-25} = 0x3f; //encoding
+}
+
+multiclass VOP1_Real_vi <bits<10> op> {
+  let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+    def _e32_vi :
+      VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+      VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
+    def _e64_vi :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+      VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+  }
+
+  def _sdwa_vi :
+    VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+  // For now left dpp only for asm/dasm
+  // TODO: add corresponding pseudo
+  def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_NOP               : VOP1_Real_vi <0x0>;
+defm V_MOV_B32           : VOP1_Real_vi <0x1>;
+defm V_CVT_I32_F64       : VOP1_Real_vi <0x3>;
+defm V_CVT_F64_I32       : VOP1_Real_vi <0x4>;
+defm V_CVT_F32_I32       : VOP1_Real_vi <0x5>;
+defm V_CVT_F32_U32       : VOP1_Real_vi <0x6>;
+defm V_CVT_U32_F32       : VOP1_Real_vi <0x7>;
+defm V_CVT_I32_F32       : VOP1_Real_vi <0x8>;
+defm V_CVT_F16_F32       : VOP1_Real_vi <0xa>;
+defm V_CVT_F32_F16       : VOP1_Real_vi <0xb>;
+defm V_CVT_RPI_I32_F32   : VOP1_Real_vi <0xc>;
+defm V_CVT_FLR_I32_F32   : VOP1_Real_vi <0xd>;
+defm V_CVT_OFF_F32_I4    : VOP1_Real_vi <0xe>;
+defm V_CVT_F32_F64       : VOP1_Real_vi <0xf>;
+defm V_CVT_F64_F32       : VOP1_Real_vi <0x10>;
+defm V_CVT_F32_UBYTE0    : VOP1_Real_vi <0x11>;
+defm V_CVT_F32_UBYTE1    : VOP1_Real_vi <0x12>;
+defm V_CVT_F32_UBYTE2    : VOP1_Real_vi <0x13>;
+defm V_CVT_F32_UBYTE3    : VOP1_Real_vi <0x14>;
+defm V_CVT_U32_F64       : VOP1_Real_vi <0x15>;
+defm V_CVT_F64_U32       : VOP1_Real_vi <0x16>;
+defm V_FRACT_F32         : VOP1_Real_vi <0x1b>;
+defm V_TRUNC_F32         : VOP1_Real_vi <0x1c>;
+defm V_CEIL_F32          : VOP1_Real_vi <0x1d>;
+defm V_RNDNE_F32         : VOP1_Real_vi <0x1e>;
+defm V_FLOOR_F32         : VOP1_Real_vi <0x1f>;
+defm V_EXP_F32           : VOP1_Real_vi <0x20>;
+defm V_LOG_F32           : VOP1_Real_vi <0x21>;
+defm V_RCP_F32           : VOP1_Real_vi <0x22>;
+defm V_RCP_IFLAG_F32     : VOP1_Real_vi <0x23>;
+defm V_RSQ_F32           : VOP1_Real_vi <0x24>;
+defm V_RCP_F64           : VOP1_Real_vi <0x25>;
+defm V_RSQ_F64           : VOP1_Real_vi <0x26>;
+defm V_SQRT_F32          : VOP1_Real_vi <0x27>;
+defm V_SQRT_F64          : VOP1_Real_vi <0x28>;
+defm V_SIN_F32           : VOP1_Real_vi <0x29>;
+defm V_COS_F32           : VOP1_Real_vi <0x2a>;
+defm V_NOT_B32           : VOP1_Real_vi <0x2b>;
+defm V_BFREV_B32         : VOP1_Real_vi <0x2c>;
+defm V_FFBH_U32          : VOP1_Real_vi <0x2d>;
+defm V_FFBL_B32          : VOP1_Real_vi <0x2e>;
+defm V_FFBH_I32          : VOP1_Real_vi <0x2f>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
+defm V_FREXP_MANT_F64    : VOP1_Real_vi <0x31>;
+defm V_FRACT_F64         : VOP1_Real_vi <0x32>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
+defm V_FREXP_MANT_F32    : VOP1_Real_vi <0x34>;
+defm V_CLREXCP           : VOP1_Real_vi <0x35>;
+defm V_MOVRELD_B32       : VOP1_Real_vi <0x36>;
+defm V_MOVRELS_B32       : VOP1_Real_vi <0x37>;
+defm V_MOVRELSD_B32      : VOP1_Real_vi <0x38>;
+defm V_TRUNC_F64         : VOP1_Real_vi <0x17>;
+defm V_CEIL_F64          : VOP1_Real_vi <0x18>;
+defm V_FLOOR_F64         : VOP1_Real_vi <0x1A>;
+defm V_RNDNE_F64         : VOP1_Real_vi <0x19>;
+defm V_LOG_LEGACY_F32    : VOP1_Real_vi <0x4c>;
+defm V_EXP_LEGACY_F32    : VOP1_Real_vi <0x4b>;
+defm V_CVT_F16_U16       : VOP1_Real_vi <0x39>;
+defm V_CVT_F16_I16       : VOP1_Real_vi <0x3a>;
+defm V_CVT_U16_F16       : VOP1_Real_vi <0x3b>;
+defm V_CVT_I16_F16       : VOP1_Real_vi <0x3c>;
+defm V_RCP_F16           : VOP1_Real_vi <0x3d>;
+defm V_SQRT_F16          : VOP1_Real_vi <0x3e>;
+defm V_RSQ_F16           : VOP1_Real_vi <0x3f>;
+defm V_LOG_F16           : VOP1_Real_vi <0x40>;
+defm V_EXP_F16           : VOP1_Real_vi <0x41>;
+defm V_FREXP_MANT_F16    : VOP1_Real_vi <0x42>;
+defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
+defm V_FLOOR_F16         : VOP1_Real_vi <0x44>;
+defm V_CEIL_F16          : VOP1_Real_vi <0x45>;
+defm V_TRUNC_F16         : VOP1_Real_vi <0x46>;
+defm V_RNDNE_F16         : VOP1_Real_vi <0x47>;
+defm V_FRACT_F16         : VOP1_Real_vi <0x48>;
+defm V_SIN_F16           : VOP1_Real_vi <0x49>;
+defm V_COS_F16           : VOP1_Real_vi <0x4a>;
+
+
+// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
+// indexing mode. vdst can't be treated as a def for codegen purposes,
+// and an implicit use and def of the super register should be added.
+def V_MOV_B32_indirect : VPseudoInstSI<(outs),
+  (ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
+  PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
+                                        getVOPSrc0ForVT<i32>.ret:$src0)> {
+  let VOP1 = 1;
+  let SubtargetPredicate = isVI;
+}
+
+// This is a pseudo variant of the v_movreld_b32 instruction in which the
+// vector operand appears only twice, once as def and once as use. Using this
+// pseudo avoids problems with the Two Address instructions pass.
+class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
+  (outs rc:$vdst),
+  (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
+  let VOP1 = 1;
+
+  let Constraints = "$vsrc = $vdst";
+  let Uses = [M0, EXEC];
+
+  let SubtargetPredicate = HasMovrel;
+}
+
+def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
+def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
+def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
+def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
+def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
+
+let Predicates = [isVI] in {
+
+def : Pat <
+  (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+                      imm:$bound_ctrl)),
+  (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
+                       (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+>;
+
+
+def : Pat<
+  (i32 (anyext i16:$src)),
+  (COPY $src)
+>;
+
+def : Pat<
+   (i64 (anyext i16:$src)),
+   (REG_SEQUENCE VReg_64,
+     (i32 (COPY $src)), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : Pat<
+  (i16 (trunc i32:$src)),
+  (COPY $src)
+>;
+
+def : Pat<
+  (i1 (trunc i16:$src)),
+  (COPY $src)
+>;
+
+
+def : Pat <
+  (i16 (trunc i64:$src)),
+  (EXTRACT_SUBREG $src, sub0)
+>;
+
+} // End Predicates = [isVI]
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
new file mode 100644
index 000000000000..20fb7f7bcab7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -0,0 +1,757 @@
+//===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP2 Classes
+//===----------------------------------------------------------------------===//
+
+class VOP2e <bits<6> op, VOPProfile P> : Enc32 {
+  bits<8> vdst;
+  bits<9> src0;
+  bits<8> src1;
+
+  let Inst{8-0}   = !if(P.HasSrc0, src0, 0);
+  let Inst{16-9}  = !if(P.HasSrc1, src1, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; //encoding
+}
+
+class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
+  bits<8>  vdst;
+  bits<9>  src0;
+  bits<8>  src1;
+  bits<32> imm;
+
+  let Inst{8-0}   = !if(P.HasSrc0, src0, 0);
+  let Inst{16-9}  = !if(P.HasSrc1, src1, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; // encoding
+  let Inst{63-32} = imm;
+}
+
+class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
+  bits<8> vdst;
+  bits<8> src1;
+  
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; // encoding
+}
+
+class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
+  InstSI <P.Outs32, P.Ins32, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+  MnemonicAlias<opName#suffix, opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.Asm32;
+
+  let Size = 4;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SubtargetPredicate = isGCN;
+
+  let VOP2 = 1;
+  let VALU = 1;
+  let Uses = [EXEC];
+
+  let AsmVariantName = AMDGPUAsmVariants.Default;
+
+  VOPProfile Pfl = P;
+}
+
+class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let AsmVariantName     = ps.AsmVariantName;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+}
+
+class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOP2";
+}
+
+class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
+  list<dag> ret = !if(P.HasModifiers,
+    [(set P.DstVT:$vdst,
+      (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+            (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+    [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
+}
+
+multiclass VOP2Inst <string opName,
+                     VOPProfile P,
+                     SDPatternOperator node = null_frag,
+                     string revOp = opName> {
+
+  def _e32 : VOP2_Pseudo <opName, P>,
+             Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+             Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+
+  def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+              Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+}
+
+// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
+multiclass VOP2bInst <string opName,
+                      VOPProfile P,
+                      SDPatternOperator node = null_frag,
+                      string revOp = opName,
+                      bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
+
+  let SchedRW = [Write32Bit, WriteSALU] in {
+    let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
+      def _e32 : VOP2_Pseudo <opName, P>,
+                 Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+      
+      def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
+              Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+    }
+    def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+               Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+  }
+}
+
+multiclass VOP2eInst <string opName,
+                      VOPProfile P,
+                      SDPatternOperator node = null_frag,
+                      string revOp = opName,
+                      bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
+
+  let SchedRW = [Write32Bit] in {
+    let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
+      def _e32 : VOP2_Pseudo <opName, P>,
+                 Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
+    }
+    def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
+               Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
+  }
+}
+
+class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+  field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+  field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
+  field string Asm32 = "$vdst, $src0, $src1, $imm";
+  field bit HasExt = 0;
+}
+
+def VOP_MADAK_F16 : VOP_MADAK <f16>;
+def VOP_MADAK_F32 : VOP_MADAK <f32>;
+
+class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+  field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+  field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
+  field string Asm32 = "$vdst, $src0, $imm, $src1";
+  field bit HasExt = 0;
+}
+
+def VOP_MADMK_F16 : VOP_MADMK <f16>;
+def VOP_MADMK_F32 : VOP_MADMK <f32>;
+
+class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
+  let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
+                       HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+  let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0,
+                    FP32InputMods:$src1_modifiers, Src1DPP:$src1,
+                    VGPR_32:$src2, // stub argument
+                    dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                    bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+  let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0,
+                     FP32InputMods:$src1_modifiers, Src1SDWA:$src1,
+                     VGPR_32:$src2, // stub argument
+                     clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let Asm32 = getAsm32<1, 2, vt>.ret;
+  let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
+  let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 0;
+  let HasExt = 1;
+}
+
+def VOP_MAC_F16 : VOP_MAC <f16> {
+  // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+  // 'not a string initializer' error.
+  let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+}
+
+def VOP_MAC_F32 : VOP_MAC <f32> {
+  // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
+  // 'not a string initializer' error.
+  let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+}
+
+// Write out to vcc or arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+  let Asm32 = "$vdst, vcc, $src0, $src1";
+  let Asm64 = "$vdst, $sdst, $src0, $src1";
+  let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+  let Outs32 = (outs DstRC:$vdst);
+  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+}
+
+// Write out to vcc or arbitrary SGPR and read in from vcc or
+// arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+  // We use VCSrc_b32 to exclude literal constants, even though the
+  // encoding normally allows them since the implicit VCC use means
+  // using one would always violate the constant bus
+  // restriction. SGPRs are still allowed because it should
+  // technically be possible to use VCC again as src0.
+  let Src0RC32 = VCSrc_b32;
+  let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
+  let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+  let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+  let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
+  let Outs32 = (outs DstRC:$vdst);
+  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+
+  // Suppress src2 implied by type since the 32-bit encoding uses an
+  // implicit VCC use.
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+
+  let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
+                     Src1Mod:$src1_modifiers, Src1SDWA:$src1,
+                     clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+                     src0_sel:$src0_sel, src1_sel:$src1_sel);
+
+  let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
+                    Src1Mod:$src1_modifiers, Src1DPP:$src1,
+                    dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                    bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+  let HasExt = 1;
+}
+
+// Read in from vcc or arbitrary SGPR
+def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+  let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
+  let Asm32 = "$vdst, $src0, $src1, vcc";
+  let Asm64 = "$vdst, $src0, $src1, $src2";
+  let Outs32 = (outs DstRC:$vdst);
+  let Outs64 = (outs DstRC:$vdst);
+
+  // Suppress src2 implied by type since the 32-bit encoding uses an
+  // implicit VCC use.
+  let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
+}
+
+def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
+  let Outs32 = (outs SReg_32:$vdst);
+  let Outs64 = Outs32;
+  let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
+  let Ins64 = Ins32;
+  let Asm32 = " $vdst, $src0, $src1";
+  let Asm64 = Asm32;
+}
+
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+  let Outs32 = (outs VGPR_32:$vdst);
+  let Outs64 = Outs32;
+  let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+  let Ins64 = Ins32;
+  let Asm32 = " $vdst, $src0, $src1";
+  let Asm64 = Asm32;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGCN in {
+
+defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
+defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
+defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
+defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
+defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
+defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
+defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
+defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
+defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
+defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
+defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
+defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+    isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
+}
+
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+
+// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI,
+// but the VI instructions behave the same as the SI versions.
+defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>;
+defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">;
+defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>;
+defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">;
+} // End isCommutable = 1
+
+// These are special and do not read the exec mask.
+let isConvergent = 1, Uses = []<Register> in {
+def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
+  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
+
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+} // End isConvergent = 1
+
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+
+} // End SubtargetPredicate = isGCN
+
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+
+defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
+defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
+
+let isCommutable = 1 in {
+defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+} // End isCommutable = 1
+
+} // End let SubtargetPredicate = SICI
+
+let SubtargetPredicate = isVI in {
+
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+
+let isCommutable = 1 in {
+defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
+defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
+defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
+defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+
+let Constraints = "$vdst = $src2", DisableEncoding="$src2",
+    isConvertibleToThreeAddress = 1 in {
+defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
+}
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+// Note: 16-bit instructions produce a 0 result in the high 16-bits.
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+  (op i16:$src0, i16:$src1),
+  (inst $src0, $src1)
+>;
+
+def : Pat<
+  (i32 (zext (op i16:$src0, i16:$src1))),
+  (inst $src0, $src1)
+>;
+
+def : Pat<
+  (i64 (zext (op i16:$src0, i16:$src1))),
+   (REG_SEQUENCE VReg_64,
+     (inst $src0, $src1), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+}
+
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+  (op i16:$src0, i16:$src1),
+  (inst $src1, $src0)
+>;
+
+def : Pat<
+  (i32 (zext (op i16:$src0, i16:$src1))),
+  (inst $src1, $src0)
+>;
+
+
+def : Pat<
+  (i64 (zext (op i16:$src0, i16:$src1))),
+   (REG_SEQUENCE VReg_64,
+     (inst $src1, $src0), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+class ZExt_i16_i1_Pat <SDNode ext> : Pat <
+  (i16 (ext i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
+>;
+
+let Predicates = [isVI] in {
+
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
+
+def : Pat <
+  (and i16:$src0, i16:$src1),
+  (V_AND_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+  (or i16:$src0, i16:$src1),
+  (V_OR_B32_e64 $src0, $src1)
+>;
+
+def : Pat <
+  (xor i16:$src0, i16:$src1),
+  (V_XOR_B32_e64 $src0, $src1)
+>;
+
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
+
+def : ZExt_i16_i1_Pat<zext>;
+def : ZExt_i16_i1_Pat<anyext>;
+
+def : Pat <
+  (i16 (sext i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
+>;
+
+} // End Predicates = [isVI]
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+multiclass VOP2_Real_si <bits<6> op> {
+  def _si :
+    VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_si <bits<6> op> {
+  def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
+            VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_si <bits<6> op> {
+  def _e32_si :
+    VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
+  def _e64_si :
+    VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+    VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
+  def _e64_si :
+    VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+    VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_CNDMASK_B32        : VOP2_Real_e32e64_si <0x0>;
+defm V_ADD_F32            : VOP2_Real_e32e64_si <0x3>;
+defm V_SUB_F32            : VOP2_Real_e32e64_si <0x4>;
+defm V_SUBREV_F32         : VOP2_Real_e32e64_si <0x5>;
+defm V_MUL_LEGACY_F32     : VOP2_Real_e32e64_si <0x7>;
+defm V_MUL_F32            : VOP2_Real_e32e64_si <0x8>;
+defm V_MUL_I32_I24        : VOP2_Real_e32e64_si <0x9>;
+defm V_MUL_HI_I32_I24     : VOP2_Real_e32e64_si <0xa>;
+defm V_MUL_U32_U24        : VOP2_Real_e32e64_si <0xb>;
+defm V_MUL_HI_U32_U24     : VOP2_Real_e32e64_si <0xc>;
+defm V_MIN_F32            : VOP2_Real_e32e64_si <0xf>;
+defm V_MAX_F32            : VOP2_Real_e32e64_si <0x10>;
+defm V_MIN_I32            : VOP2_Real_e32e64_si <0x11>;
+defm V_MAX_I32            : VOP2_Real_e32e64_si <0x12>;
+defm V_MIN_U32            : VOP2_Real_e32e64_si <0x13>;
+defm V_MAX_U32            : VOP2_Real_e32e64_si <0x14>;
+defm V_LSHRREV_B32        : VOP2_Real_e32e64_si <0x16>;
+defm V_ASHRREV_I32        : VOP2_Real_e32e64_si <0x18>;
+defm V_LSHLREV_B32        : VOP2_Real_e32e64_si <0x1a>;
+defm V_AND_B32            : VOP2_Real_e32e64_si <0x1b>;
+defm V_OR_B32             : VOP2_Real_e32e64_si <0x1c>;
+defm V_XOR_B32            : VOP2_Real_e32e64_si <0x1d>;
+defm V_MAC_F32            : VOP2_Real_e32e64_si <0x1f>;
+defm V_MADMK_F32          : VOP2_Real_MADK_si <0x20>;
+defm V_MADAK_F32          : VOP2_Real_MADK_si <0x21>;
+defm V_ADD_I32            : VOP2be_Real_e32e64_si <0x25>;
+defm V_SUB_I32            : VOP2be_Real_e32e64_si <0x26>;
+defm V_SUBREV_I32         : VOP2be_Real_e32e64_si <0x27>;
+defm V_ADDC_U32           : VOP2be_Real_e32e64_si <0x28>;
+defm V_SUBB_U32           : VOP2be_Real_e32e64_si <0x29>;
+defm V_SUBBREV_U32        : VOP2be_Real_e32e64_si <0x2a>;
+
+defm V_READLANE_B32       : VOP2_Real_si <0x01>;
+defm V_WRITELANE_B32      : VOP2_Real_si <0x02>;
+
+defm V_MAC_LEGACY_F32     : VOP2_Real_e32e64_si <0x6>;
+defm V_MIN_LEGACY_F32     : VOP2_Real_e32e64_si <0xd>;
+defm V_MAX_LEGACY_F32     : VOP2_Real_e32e64_si <0xe>;
+defm V_LSHR_B32           : VOP2_Real_e32e64_si <0x15>;
+defm V_ASHR_I32           : VOP2_Real_e32e64_si <0x17>;
+defm V_LSHL_B32           : VOP2_Real_e32e64_si <0x19>;
+
+defm V_BFM_B32            : VOP2_Real_e32e64_si <0x1e>;
+defm V_BCNT_U32_B32       : VOP2_Real_e32e64_si <0x22>;
+defm V_MBCNT_LO_U32_B32   : VOP2_Real_e32e64_si <0x23>;
+defm V_MBCNT_HI_U32_B32   : VOP2_Real_e32e64_si <0x24>;
+defm V_LDEXP_F32          : VOP2_Real_e32e64_si <0x2b>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
+defm V_CVT_PKRTZ_F16_F32  : VOP2_Real_e32e64_si <0x2f>;
+defm V_CVT_PK_U16_U32     : VOP2_Real_e32e64_si <0x30>;
+defm V_CVT_PK_I16_I32     : VOP2_Real_e32e64_si <0x31>;
+
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> :
+  VOP_DPP <ps.OpName, P> {
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+  let Constraints = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  bits<8> vdst;
+  bits<8> src1;
+  let Inst{8-0}   = 0xfa; //dpp
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+  let Inst{30-25} = op;
+  let Inst{31}    = 0x0; //encoding
+}
+
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+multiclass VOP32_Real_vi <bits<10> op> {
+  def _vi :
+    VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+    VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_MADK_vi <bits<6> op> {
+  def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2_Real_e32_vi <bits<6> op> {
+  def _e32_vi :
+    VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2_Real_e64_vi <bits<10> op> {
+  def _e64_vi :
+    VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+    VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
+  def _e64_vi :
+    VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+    VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
+  VOP2_Real_e32_vi<op>,
+  VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+ 
+multiclass VOP2_SDWA_Real <bits<6> op> {
+  def _sdwa_vi :
+    VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+}
+
+multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
+  Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  // For now left dpp only for asm/dasm
+  // TODO: add corresponding pseudo
+  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+multiclass VOP2_Real_e32e64_vi <bits<6> op> :
+  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+  // For now left dpp only for asm/dasm
+  // TODO: add corresponding pseudo
+  def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+}
+
+defm V_CNDMASK_B32        : Base_VOP2_Real_e32e64_vi <0x0>;
+defm V_ADD_F32            : VOP2_Real_e32e64_vi <0x1>;
+defm V_SUB_F32            : VOP2_Real_e32e64_vi <0x2>;
+defm V_SUBREV_F32         : VOP2_Real_e32e64_vi <0x3>;
+defm V_MUL_LEGACY_F32     : VOP2_Real_e32e64_vi <0x4>;
+defm V_MUL_F32            : VOP2_Real_e32e64_vi <0x5>;
+defm V_MUL_I32_I24        : VOP2_Real_e32e64_vi <0x6>;
+defm V_MUL_HI_I32_I24     : VOP2_Real_e32e64_vi <0x7>;
+defm V_MUL_U32_U24        : VOP2_Real_e32e64_vi <0x8>;
+defm V_MUL_HI_U32_U24     : VOP2_Real_e32e64_vi <0x9>;
+defm V_MIN_F32            : VOP2_Real_e32e64_vi <0xa>;
+defm V_MAX_F32            : VOP2_Real_e32e64_vi <0xb>;
+defm V_MIN_I32            : VOP2_Real_e32e64_vi <0xc>;
+defm V_MAX_I32            : VOP2_Real_e32e64_vi <0xd>;
+defm V_MIN_U32            : VOP2_Real_e32e64_vi <0xe>;
+defm V_MAX_U32            : VOP2_Real_e32e64_vi <0xf>;
+defm V_LSHRREV_B32        : VOP2_Real_e32e64_vi <0x10>;
+defm V_ASHRREV_I32        : VOP2_Real_e32e64_vi <0x11>;
+defm V_LSHLREV_B32        : VOP2_Real_e32e64_vi <0x12>;
+defm V_AND_B32            : VOP2_Real_e32e64_vi <0x13>;
+defm V_OR_B32             : VOP2_Real_e32e64_vi <0x14>;
+defm V_XOR_B32            : VOP2_Real_e32e64_vi <0x15>;
+defm V_MAC_F32            : VOP2_Real_e32e64_vi <0x16>;
+defm V_MADMK_F32          : VOP2_Real_MADK_vi <0x17>;
+defm V_MADAK_F32          : VOP2_Real_MADK_vi <0x18>;
+defm V_ADD_I32            : VOP2be_Real_e32e64_vi <0x19>;
+defm V_SUB_I32            : VOP2be_Real_e32e64_vi <0x1a>;
+defm V_SUBREV_I32         : VOP2be_Real_e32e64_vi <0x1b>;
+defm V_ADDC_U32           : VOP2be_Real_e32e64_vi <0x1c>;
+defm V_SUBB_U32           : VOP2be_Real_e32e64_vi <0x1d>;
+defm V_SUBBREV_U32        : VOP2be_Real_e32e64_vi <0x1e>;
+
+defm V_READLANE_B32       : VOP32_Real_vi <0x289>;
+defm V_WRITELANE_B32      : VOP32_Real_vi <0x28a>;
+
+defm V_BFM_B32            : VOP2_Real_e64_vi <0x293>;
+defm V_BCNT_U32_B32       : VOP2_Real_e64_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32   : VOP2_Real_e64_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32   : VOP2_Real_e64_vi <0x28d>;
+defm V_LDEXP_F32          : VOP2_Real_e64_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32  : VOP2_Real_e64_vi <0x296>;
+defm V_CVT_PK_U16_U32     : VOP2_Real_e64_vi <0x297>;
+defm V_CVT_PK_I16_I32     : VOP2_Real_e64_vi <0x298>;
+
+defm V_ADD_F16            : VOP2_Real_e32e64_vi <0x1f>;
+defm V_SUB_F16            : VOP2_Real_e32e64_vi <0x20>;
+defm V_SUBREV_F16         : VOP2_Real_e32e64_vi <0x21>;
+defm V_MUL_F16            : VOP2_Real_e32e64_vi <0x22>;
+defm V_MAC_F16            : VOP2_Real_e32e64_vi <0x23>;
+defm V_MADMK_F16          : VOP2_Real_MADK_vi <0x24>;
+defm V_MADAK_F16          : VOP2_Real_MADK_vi <0x25>;
+defm V_ADD_U16            : VOP2_Real_e32e64_vi <0x26>;
+defm V_SUB_U16            : VOP2_Real_e32e64_vi <0x27>;
+defm V_SUBREV_U16         : VOP2_Real_e32e64_vi <0x28>;
+defm V_MUL_LO_U16         : VOP2_Real_e32e64_vi <0x29>;
+defm V_LSHLREV_B16        : VOP2_Real_e32e64_vi <0x2a>;
+defm V_LSHRREV_B16        : VOP2_Real_e32e64_vi <0x2b>;
+defm V_ASHRREV_I16        : VOP2_Real_e32e64_vi <0x2c>;
+defm V_MAX_F16            : VOP2_Real_e32e64_vi <0x2d>;
+defm V_MIN_F16            : VOP2_Real_e32e64_vi <0x2e>;
+defm V_MAX_U16            : VOP2_Real_e32e64_vi <0x2f>;
+defm V_MAX_I16            : VOP2_Real_e32e64_vi <0x30>;
+defm V_MIN_U16            : VOP2_Real_e32e64_vi <0x31>;
+defm V_MIN_I16            : VOP2_Real_e32e64_vi <0x32>;
+defm V_LDEXP_F16          : VOP2_Real_e32e64_vi <0x33>;
+
+let SubtargetPredicate = isVI in {
+
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
+class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+  name#" $dst, $src0, $src1",
+  (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0)
+>, PredicateControl {
+  let UseInstAsmMatchConverter = 0;
+  let AsmVariantName = AMDGPUAsmVariants.VOP3;
+}
+
+def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
+def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
+
+} // End SubtargetPredicate = isVI
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
new file mode 100644
index 000000000000..5efa64d25ce1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -0,0 +1,447 @@
+//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3 Classes
+//===----------------------------------------------------------------------===//
+
+class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst,
+    (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+          (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+          (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+  list<dag> ret2 = [(set P.DstVT:$vdst,
+    (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+          (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+  list<dag> ret1 = [(set P.DstVT:$vdst,
+    (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))];
+
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
+class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
+  list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
+  list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))];
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
+class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
+  VOP3_Pseudo<OpName, P,
+    !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret),
+    VOP3Only>;
+
+// Special case for v_div_fmas_{f32|f64}, since it seems to be the
+// only VOP instruction that implicitly reads VCC.
+let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
+def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
+  let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
+  let Outs64 = (outs DstRC.RegClass:$vdst);
+}
+}
+
+class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret =
+    [(set P.DstVT:$vdst,
+      (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+            (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+            (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)),
+            (i1 VCC)))];
+}
+
+class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
+  // FIXME: Hack to stop printing _e64
+  let Outs64 = (outs DstRC.RegClass:$vdst);
+  let Asm64 = " " # P.Asm64;
+}
+
+class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+}
+
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
+  // FIXME: Hack to stop printing _e64
+  let DstRC = RegisterOperand<VGPR_32>;
+}
+
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
+  // FIXME: Hack to stop printing _e64
+  let DstRC = RegisterOperand<VReg_64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+
+def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>;
+def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>;
+def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
+def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
+def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
+def V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
+def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
+} // End SchedRW = [WriteQuarterRate32]
+
+let Uses = [VCC, EXEC] in {
+// v_div_fmas_f32:
+//   result = src0 * src1 + src2
+//   if (vcc)
+//     result *= 2^32
+//
+def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
+  getVOP3VCC<VOP_F32_F32_F32_F32_VCC, AMDGPUdiv_fmas>.ret> {
+  let SchedRW = [WriteFloatFMA];
+}
+// v_div_fmas_f64:
+//   result = src0 * src1 + src2
+//   if (vcc)
+//     result *= 2^64
+//
+def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
+  getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
+  let SchedRW = [WriteDouble];
+}
+} // End Uses = [VCC, EXEC]
+
+} // End isCommutable = 1
+
+def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
+def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
+def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
+def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
+def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
+def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
+def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
+def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
+def V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmax3>;
+def V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmax3>;
+def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumax3>;
+def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
+def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
+def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>;
+def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>;
+def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>;
+def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
+def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
+
+let SchedRW = [WriteDoubleAdd] in {
+def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
+def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
+} // End SchedRW = [WriteDoubleAdd]
+
+def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
+  let SchedRW = [WriteFloatFMA, WriteSALU];
+  let hasExtraSrcRegAllocReq = 1;
+}
+
+// Double precision division pre-scale.
+def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
+  let SchedRW = [WriteDouble, WriteSALU];
+  let hasExtraSrcRegAllocReq = 1;
+}
+
+def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+
+def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
+  let SchedRW = [WriteDouble];
+}
+
+// These instructions only exist on SI and CI
+let SubtargetPredicate = isSICI in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>;
+def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = isVI in {
+def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>;
+def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
+} // End SubtargetPredicate = isVI
+
+
+let SubtargetPredicate = isCIVI in {
+
+def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
+def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+
+let isCommutable = 1 in {
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+
+// XXX - Does this set VCC?
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+} // End isCommutable = 1
+
+} // End SubtargetPredicate = isCIVI
+
+
+let SubtargetPredicate = isVI in {
+
+let isCommutable = 1 in {
+
+def V_DIV_FIXUP_F16   : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+def V_FMA_F16         : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
+def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
+def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
+def V_INTERP_P2_F16   : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>;
+def V_MAD_F16         : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+
+def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+
+}  // End isCommutable = 1
+
+} // End SubtargetPredicate = isVI
+
+let Predicates = [isVI] in {
+
+multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+                            Instruction inst, SDPatternOperator op3> {
+def : Pat<
+  (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+  (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+  (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+  (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+  (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+   (REG_SEQUENCE VReg_64,
+     (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+
+} // End Predicates = [isVI]
+
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+
+multiclass VOP3_Real_si<bits<9> op> {
+  def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+            VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3be_Real_si<bits<9> op> {
+  def _si : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+            VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"
+
+defm V_MAD_LEGACY_F32   : VOP3_Real_si <0x140>;
+defm V_MAD_F32          : VOP3_Real_si <0x141>;
+defm V_MAD_I32_I24      : VOP3_Real_si <0x142>;
+defm V_MAD_U32_U24      : VOP3_Real_si <0x143>;
+defm V_CUBEID_F32       : VOP3_Real_si <0x144>;
+defm V_CUBESC_F32       : VOP3_Real_si <0x145>;
+defm V_CUBETC_F32       : VOP3_Real_si <0x146>;
+defm V_CUBEMA_F32       : VOP3_Real_si <0x147>;
+defm V_BFE_U32          : VOP3_Real_si <0x148>;
+defm V_BFE_I32          : VOP3_Real_si <0x149>;
+defm V_BFI_B32          : VOP3_Real_si <0x14a>;
+defm V_FMA_F32          : VOP3_Real_si <0x14b>;
+defm V_FMA_F64          : VOP3_Real_si <0x14c>;
+defm V_LERP_U8          : VOP3_Real_si <0x14d>;
+defm V_ALIGNBIT_B32     : VOP3_Real_si <0x14e>;
+defm V_ALIGNBYTE_B32    : VOP3_Real_si <0x14f>;
+defm V_MULLIT_F32       : VOP3_Real_si <0x150>;
+defm V_MIN3_F32         : VOP3_Real_si <0x151>;
+defm V_MIN3_I32         : VOP3_Real_si <0x152>;
+defm V_MIN3_U32         : VOP3_Real_si <0x153>;
+defm V_MAX3_F32         : VOP3_Real_si <0x154>;
+defm V_MAX3_I32         : VOP3_Real_si <0x155>;
+defm V_MAX3_U32         : VOP3_Real_si <0x156>;
+defm V_MED3_F32         : VOP3_Real_si <0x157>;
+defm V_MED3_I32         : VOP3_Real_si <0x158>;
+defm V_MED3_U32         : VOP3_Real_si <0x159>;
+defm V_SAD_U8           : VOP3_Real_si <0x15a>;
+defm V_SAD_HI_U8        : VOP3_Real_si <0x15b>;
+defm V_SAD_U16          : VOP3_Real_si <0x15c>;
+defm V_SAD_U32          : VOP3_Real_si <0x15d>;
+defm V_CVT_PK_U8_F32    : VOP3_Real_si <0x15e>;
+defm V_DIV_FIXUP_F32    : VOP3_Real_si <0x15f>;
+defm V_DIV_FIXUP_F64    : VOP3_Real_si <0x160>;
+defm V_LSHL_B64         : VOP3_Real_si <0x161>;
+defm V_LSHR_B64         : VOP3_Real_si <0x162>;
+defm V_ASHR_I64         : VOP3_Real_si <0x163>;
+defm V_ADD_F64          : VOP3_Real_si <0x164>;
+defm V_MUL_F64          : VOP3_Real_si <0x165>;
+defm V_MIN_F64          : VOP3_Real_si <0x166>;
+defm V_MAX_F64          : VOP3_Real_si <0x167>;
+defm V_LDEXP_F64        : VOP3_Real_si <0x168>;
+defm V_MUL_LO_U32       : VOP3_Real_si <0x169>;
+defm V_MUL_HI_U32       : VOP3_Real_si <0x16a>;
+defm V_MUL_LO_I32       : VOP3_Real_si <0x16b>;
+defm V_MUL_HI_I32       : VOP3_Real_si <0x16c>;
+defm V_DIV_SCALE_F32    : VOP3be_Real_si <0x16d>;
+defm V_DIV_SCALE_F64    : VOP3be_Real_si <0x16e>;
+defm V_DIV_FMAS_F32     : VOP3_Real_si <0x16f>;
+defm V_DIV_FMAS_F64     : VOP3_Real_si <0x170>;
+defm V_MSAD_U8          : VOP3_Real_si <0x171>;
+defm V_MQSAD_PK_U16_U8  : VOP3_Real_si <0x173>;
+defm V_TRIG_PREOP_F64   : VOP3_Real_si <0x174>;
+
+//===----------------------------------------------------------------------===//
+// CI
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3_Real_ci<bits<9> op> {
+  def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+            VOP3e_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+    let AssemblerPredicates = [isCIOnly];
+    let DecoderNamespace = "CI";
+  }
+}
+
+defm V_MQSAD_U16_U8     : VOP3_Real_ci <0x172>;
+defm V_QSAD_PK_U16_U8   : VOP3_Real_ci <0x172>;
+defm V_MQSAD_U32_U8     : VOP3_Real_ci <0x174>;
+defm V_MAD_U64_U32      : VOP3_Real_ci <0x176>;
+defm V_MAD_I64_I32      : VOP3_Real_ci <0x177>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+
+multiclass VOP3_Real_vi<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3be_Real_vi<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
+
+defm V_MQSAD_U16_U8     : VOP3_Real_vi <0x172>;
+defm V_MAD_U64_U32      : VOP3_Real_vi <0x176>;
+defm V_MAD_I64_I32      : VOP3_Real_vi <0x177>;
+
+defm V_MAD_LEGACY_F32   : VOP3_Real_vi <0x1c0>;
+defm V_MAD_F32          : VOP3_Real_vi <0x1c1>;
+defm V_MAD_I32_I24      : VOP3_Real_vi <0x1c2>;
+defm V_MAD_U32_U24      : VOP3_Real_vi <0x1c3>;
+defm V_CUBEID_F32       : VOP3_Real_vi <0x1c4>;
+defm V_CUBESC_F32       : VOP3_Real_vi <0x1c5>;
+defm V_CUBETC_F32       : VOP3_Real_vi <0x1c6>;
+defm V_CUBEMA_F32       : VOP3_Real_vi <0x1c7>;
+defm V_BFE_U32          : VOP3_Real_vi <0x1c8>;
+defm V_BFE_I32          : VOP3_Real_vi <0x1c9>;
+defm V_BFI_B32          : VOP3_Real_vi <0x1ca>;
+defm V_FMA_F32          : VOP3_Real_vi <0x1cb>;
+defm V_FMA_F64          : VOP3_Real_vi <0x1cc>;
+defm V_LERP_U8          : VOP3_Real_vi <0x1cd>;
+defm V_ALIGNBIT_B32     : VOP3_Real_vi <0x1ce>;
+defm V_ALIGNBYTE_B32    : VOP3_Real_vi <0x1cf>;
+defm V_MIN3_F32         : VOP3_Real_vi <0x1d0>;
+defm V_MIN3_I32         : VOP3_Real_vi <0x1d1>;
+defm V_MIN3_U32         : VOP3_Real_vi <0x1d2>;
+defm V_MAX3_F32         : VOP3_Real_vi <0x1d3>;
+defm V_MAX3_I32         : VOP3_Real_vi <0x1d4>;
+defm V_MAX3_U32         : VOP3_Real_vi <0x1d5>;
+defm V_MED3_F32         : VOP3_Real_vi <0x1d6>;
+defm V_MED3_I32         : VOP3_Real_vi <0x1d7>;
+defm V_MED3_U32         : VOP3_Real_vi <0x1d8>;
+defm V_SAD_U8           : VOP3_Real_vi <0x1d9>;
+defm V_SAD_HI_U8        : VOP3_Real_vi <0x1da>;
+defm V_SAD_U16          : VOP3_Real_vi <0x1db>;
+defm V_SAD_U32          : VOP3_Real_vi <0x1dc>;
+defm V_CVT_PK_U8_F32    : VOP3_Real_vi <0x1dd>;
+defm V_DIV_FIXUP_F32    : VOP3_Real_vi <0x1de>;
+defm V_DIV_FIXUP_F64    : VOP3_Real_vi <0x1df>;
+defm V_DIV_SCALE_F32    : VOP3be_Real_vi <0x1e0>;
+defm V_DIV_SCALE_F64    : VOP3be_Real_vi <0x1e1>;
+defm V_DIV_FMAS_F32     : VOP3_Real_vi <0x1e2>;
+defm V_DIV_FMAS_F64     : VOP3_Real_vi <0x1e3>;
+defm V_MSAD_U8          : VOP3_Real_vi <0x1e4>;
+defm V_QSAD_PK_U16_U8   : VOP3_Real_vi <0x1e5>;
+defm V_MQSAD_PK_U16_U8  : VOP3_Real_vi <0x1e6>;
+defm V_MQSAD_U32_U8     : VOP3_Real_vi <0x1e7>;
+
+defm V_MAD_F16          : VOP3_Real_vi <0x1ea>;
+defm V_MAD_U16          : VOP3_Real_vi <0x1eb>;
+defm V_MAD_I16          : VOP3_Real_vi <0x1ec>;
+
+defm V_FMA_F16          : VOP3_Real_vi <0x1ee>;
+defm V_DIV_FIXUP_F16    : VOP3_Real_vi <0x1ef>;
+
+defm V_INTERP_P1LL_F16  : VOP3_Real_vi <0x274>;
+defm V_INTERP_P1LV_F16  : VOP3_Real_vi <0x275>;
+defm V_INTERP_P2_F16    : VOP3_Real_vi <0x276>;
+defm V_ADD_F64          : VOP3_Real_vi <0x280>;
+defm V_MUL_F64          : VOP3_Real_vi <0x281>;
+defm V_MIN_F64          : VOP3_Real_vi <0x282>;
+defm V_MAX_F64          : VOP3_Real_vi <0x283>;
+defm V_LDEXP_F64        : VOP3_Real_vi <0x284>;
+defm V_MUL_LO_U32       : VOP3_Real_vi <0x285>;
+
+// removed from VI as identical to V_MUL_LO_U32
+let isAsmParserOnly = 1 in {
+defm V_MUL_LO_I32       : VOP3_Real_vi <0x285>;
+}
+
+defm V_MUL_HI_U32       : VOP3_Real_vi <0x286>;
+defm V_MUL_HI_I32       : VOP3_Real_vi <0x287>;
+
+defm V_LSHLREV_B64      : VOP3_Real_vi <0x28f>;
+defm V_LSHRREV_B64      : VOP3_Real_vi <0x290>;
+defm V_ASHRREV_I64      : VOP3_Real_vi <0x291>;
+defm V_TRIG_PREOP_F64   : VOP3_Real_vi <0x292>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
new file mode 100644
index 000000000000..c431d9db801e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -0,0 +1,1144 @@
+//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Encodings
+//===----------------------------------------------------------------------===//
+
+class VOPCe <bits<8> op> : Enc32 {
+  bits<9> src0;
+  bits<8> src1;
+
+  let Inst{8-0} = src0;
+  let Inst{16-9} = src1;
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e;
+}
+
+class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
+  bits<8> src1;
+
+  let Inst{8-0}   = 0xf9; // sdwa
+  let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e; // encoding
+
+  // VOPC disallows dst_sel and dst_unused as they have no effect on destination
+  let Inst{42-40} = SDWA.DWORD;
+  let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+}
+
+//===----------------------------------------------------------------------===//
+// VOPC classes
+//===----------------------------------------------------------------------===//
+
+// VOPC instructions are a special case because for the 32-bit
+// encoding, we want to display the implicit vcc write as if it were
+// an explicit $dst.
+class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt0> :
+  VOPProfile <[i1, vt0, vt1, untyped]> {
+  let Asm32 = "vcc, $src0, $src1";
+  // The destination for 32-bit encoding is implicit.
+  let HasDst32 = 0;
+  let Outs64 = (outs VOPDstS64:$sdst);
+  list<SchedReadWrite> Schedule = sched;
+}
+
+class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[]> :
+  InstSI<(outs), P.Ins32, "", pattern>,
+  VOP <opName>,
+  SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.Asm32;
+
+  let Size = 4;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+
+  let VALU = 1;
+  let VOPC = 1;
+  let Uses = [EXEC];
+  let Defs = [VCC];
+
+  let SubtargetPredicate = isGCN;
+
+  VOPProfile Pfl = P;
+}
+
+class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.PseudoInstr # " " # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+}
+
+class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
+  VOP_SDWA_Pseudo <OpName, P, pattern> {
+  let AsmMatchConverter = "cvtSdwaVOPC";
+}
+
+// This class is used only with VOPC instructions. Use $sdst for out operand
+class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
+  InstAlias <ps.OpName#" "#p.Asm32, (inst)>, PredicateControl {
+
+  field bit isCompare;
+  field bit isCommutable;
+
+  let ResultInst =
+    !if (p.HasDst32,
+      !if (!eq(p.NumSrcArgs, 0),
+        // 1 dst, 0 src
+        (inst p.DstRC:$sdst),
+      !if (!eq(p.NumSrcArgs, 1),
+        // 1 dst, 1 src
+        (inst p.DstRC:$sdst, p.Src0RC32:$src0),
+      !if (!eq(p.NumSrcArgs, 2),
+        // 1 dst, 2 src
+        (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+      // else - unreachable
+        (inst)))),
+    // else
+      !if (!eq(p.NumSrcArgs, 2),
+        // 0 dst, 2 src
+        (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
+      !if (!eq(p.NumSrcArgs, 1),
+        // 0 dst, 1 src
+        (inst p.Src0RC32:$src1),
+      // else
+        // 0 dst, 0 src
+        (inst))));
+
+  let AsmVariantName = AMDGPUAsmVariants.Default;
+  let SubtargetPredicate = AssemblerPredicate;
+}
+
+multiclass VOPC_Pseudos <string opName,
+                         VOPC_Profile P,
+                         PatLeaf cond = COND_NULL,
+                         string revOp = opName,
+                         bit DefExec = 0> {
+
+  def _e32 : VOPC_Pseudo <opName, P>,
+             Commutable_REV<revOp#"_e32", !eq(revOp, opName)> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = P.Schedule;
+    let isConvergent = DefExec;
+    let isCompare = 1;
+    let isCommutable = 1;
+  }
+
+  def _e64 : VOP3_Pseudo<opName, P,
+    !if(P.HasModifiers,
+      [(set i1:$sdst,
+          (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                      i1:$clamp, i32:$omod)),
+                 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+                 cond))],
+      [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>,
+    Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
+    let Defs = !if(DefExec, [EXEC], []);
+    let SchedRW = P.Schedule;
+    let isCompare = 1;
+    let isCommutable = 1;
+  }
+
+  def _sdwa : VOPC_SDWA_Pseudo <opName, P>,
+              Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = P.Schedule;
+    let isConvergent = DefExec;
+    let isCompare = 1;
+    let isCommutable = 1;
+  }
+}
+
+def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
+def VOPC_I1_I16_I16 : VOPC_Profile<[Write32Bit], i16>;
+def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
+def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
+
+multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
+
+multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
+
+multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
+
+multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
+
+multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
+
+multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
+
+multiclass VOPCX_F16 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F32 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_F64 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_F64_F64, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I16 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I16_I16, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I32 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I32_I32, COND_NULL, revOp, 1>;
+
+multiclass VOPCX_I64 <string opName, string revOp = opName> :
+  VOPC_Pseudos <opName, VOPC_I1_I64_I64, COND_NULL, revOp, 1>;
+
+
+//===----------------------------------------------------------------------===//
+// Compare instructions
+//===----------------------------------------------------------------------===//
+
+defm V_CMP_F_F32 : VOPC_F32 <"v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <"v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
+defm V_CMP_EQ_F32 : VOPC_F32 <"v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <"v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
+defm V_CMP_GT_F32 : VOPC_F32 <"v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <"v_cmp_lg_f32", COND_ONE>;
+defm V_CMP_GE_F32 : VOPC_F32 <"v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <"v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <"v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <"v_cmp_nge_f32",  COND_ULT, "v_cmp_nle_f32">;
+defm V_CMP_NLG_F32 : VOPC_F32 <"v_cmp_nlg_f32", COND_UEQ>;
+defm V_CMP_NGT_F32 : VOPC_F32 <"v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
+defm V_CMP_NLE_F32 : VOPC_F32 <"v_cmp_nle_f32", COND_UGT>;
+defm V_CMP_NEQ_F32 : VOPC_F32 <"v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <"v_cmp_nlt_f32", COND_UGE>;
+defm V_CMP_TRU_F32 : VOPC_F32 <"v_cmp_tru_f32">;
+
+defm V_CMPX_F_F32 : VOPCX_F32 <"v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <"v_cmpx_lt_f32", "v_cmpx_gt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <"v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <"v_cmpx_le_f32", "v_cmpx_ge_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <"v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <"v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <"v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <"v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <"v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <"v_cmpx_nge_f32", "v_cmpx_nle_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <"v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <"v_cmpx_ngt_f32", "v_cmpx_nlt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <"v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <"v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <"v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <"v_cmpx_tru_f32">;
+
+defm V_CMP_F_F64 : VOPC_F64 <"v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <"v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
+defm V_CMP_EQ_F64 : VOPC_F64 <"v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <"v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
+defm V_CMP_GT_F64 : VOPC_F64 <"v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <"v_cmp_lg_f64", COND_ONE>;
+defm V_CMP_GE_F64 : VOPC_F64 <"v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <"v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <"v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <"v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">;
+defm V_CMP_NLG_F64 : VOPC_F64 <"v_cmp_nlg_f64", COND_UEQ>;
+defm V_CMP_NGT_F64 : VOPC_F64 <"v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">;
+defm V_CMP_NLE_F64 : VOPC_F64 <"v_cmp_nle_f64", COND_UGT>;
+defm V_CMP_NEQ_F64 : VOPC_F64 <"v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <"v_cmp_nlt_f64", COND_UGE>;
+defm V_CMP_TRU_F64 : VOPC_F64 <"v_cmp_tru_f64">;
+
+defm V_CMPX_F_F64 : VOPCX_F64 <"v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <"v_cmpx_lt_f64", "v_cmpx_gt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <"v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <"v_cmpx_le_f64", "v_cmpx_ge_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <"v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <"v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <"v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <"v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <"v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <"v_cmpx_nge_f64", "v_cmpx_nle_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <"v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <"v_cmpx_ngt_f64", "v_cmpx_nlt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <"v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <"v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <"v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <"v_cmpx_tru_f64">;
+
+let SubtargetPredicate = isSICI in {
+
+defm V_CMPS_F_F32 : VOPC_F32 <"v_cmps_f_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <"v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">;
+defm V_CMPS_EQ_F32 : VOPC_F32 <"v_cmps_eq_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <"v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">;
+defm V_CMPS_GT_F32 : VOPC_F32 <"v_cmps_gt_f32">;
+defm V_CMPS_LG_F32 : VOPC_F32 <"v_cmps_lg_f32">;
+defm V_CMPS_GE_F32 : VOPC_F32 <"v_cmps_ge_f32">;
+defm V_CMPS_O_F32 : VOPC_F32 <"v_cmps_o_f32">;
+defm V_CMPS_U_F32 : VOPC_F32 <"v_cmps_u_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <"v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">;
+defm V_CMPS_NLG_F32 : VOPC_F32 <"v_cmps_nlg_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <"v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">;
+defm V_CMPS_NLE_F32 : VOPC_F32 <"v_cmps_nle_f32">;
+defm V_CMPS_NEQ_F32 : VOPC_F32 <"v_cmps_neq_f32">;
+defm V_CMPS_NLT_F32 : VOPC_F32 <"v_cmps_nlt_f32">;
+defm V_CMPS_TRU_F32 : VOPC_F32 <"v_cmps_tru_f32">;
+
+defm V_CMPSX_F_F32 : VOPCX_F32 <"v_cmpsx_f_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <"v_cmpsx_lt_f32", "v_cmpsx_gt_f32">;
+defm V_CMPSX_EQ_F32 : VOPCX_F32 <"v_cmpsx_eq_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <"v_cmpsx_le_f32", "v_cmpsx_ge_f32">;
+defm V_CMPSX_GT_F32 : VOPCX_F32 <"v_cmpsx_gt_f32">;
+defm V_CMPSX_LG_F32 : VOPCX_F32 <"v_cmpsx_lg_f32">;
+defm V_CMPSX_GE_F32 : VOPCX_F32 <"v_cmpsx_ge_f32">;
+defm V_CMPSX_O_F32 : VOPCX_F32 <"v_cmpsx_o_f32">;
+defm V_CMPSX_U_F32 : VOPCX_F32 <"v_cmpsx_u_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <"v_cmpsx_nge_f32", "v_cmpsx_nle_f32">;
+defm V_CMPSX_NLG_F32 : VOPCX_F32 <"v_cmpsx_nlg_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <"v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">;
+defm V_CMPSX_NLE_F32 : VOPCX_F32 <"v_cmpsx_nle_f32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_F32 <"v_cmpsx_neq_f32">;
+defm V_CMPSX_NLT_F32 : VOPCX_F32 <"v_cmpsx_nlt_f32">;
+defm V_CMPSX_TRU_F32 : VOPCX_F32 <"v_cmpsx_tru_f32">;
+
+defm V_CMPS_F_F64 : VOPC_F64 <"v_cmps_f_f64">;
+defm V_CMPS_LT_F64 : VOPC_F64 <"v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">;
+defm V_CMPS_EQ_F64 : VOPC_F64 <"v_cmps_eq_f64">;
+defm V_CMPS_LE_F64 : VOPC_F64 <"v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">;
+defm V_CMPS_GT_F64 : VOPC_F64 <"v_cmps_gt_f64">;
+defm V_CMPS_LG_F64 : VOPC_F64 <"v_cmps_lg_f64">;
+defm V_CMPS_GE_F64 : VOPC_F64 <"v_cmps_ge_f64">;
+defm V_CMPS_O_F64 : VOPC_F64 <"v_cmps_o_f64">;
+defm V_CMPS_U_F64 : VOPC_F64 <"v_cmps_u_f64">;
+defm V_CMPS_NGE_F64 : VOPC_F64 <"v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">;
+defm V_CMPS_NLG_F64 : VOPC_F64 <"v_cmps_nlg_f64">;
+defm V_CMPS_NGT_F64 : VOPC_F64 <"v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">;
+defm V_CMPS_NLE_F64 : VOPC_F64 <"v_cmps_nle_f64">;
+defm V_CMPS_NEQ_F64 : VOPC_F64 <"v_cmps_neq_f64">;
+defm V_CMPS_NLT_F64 : VOPC_F64 <"v_cmps_nlt_f64">;
+defm V_CMPS_TRU_F64 : VOPC_F64 <"v_cmps_tru_f64">;
+
+defm V_CMPSX_F_F64 : VOPCX_F64 <"v_cmpsx_f_f64">;
+defm V_CMPSX_LT_F64 : VOPCX_F64 <"v_cmpsx_lt_f64", "v_cmpsx_gt_f64">;
+defm V_CMPSX_EQ_F64 : VOPCX_F64 <"v_cmpsx_eq_f64">;
+defm V_CMPSX_LE_F64 : VOPCX_F64 <"v_cmpsx_le_f64", "v_cmpsx_ge_f64">;
+defm V_CMPSX_GT_F64 : VOPCX_F64 <"v_cmpsx_gt_f64">;
+defm V_CMPSX_LG_F64 : VOPCX_F64 <"v_cmpsx_lg_f64">;
+defm V_CMPSX_GE_F64 : VOPCX_F64 <"v_cmpsx_ge_f64">;
+defm V_CMPSX_O_F64 : VOPCX_F64 <"v_cmpsx_o_f64">;
+defm V_CMPSX_U_F64 : VOPCX_F64 <"v_cmpsx_u_f64">;
+defm V_CMPSX_NGE_F64 : VOPCX_F64 <"v_cmpsx_nge_f64", "v_cmpsx_nle_f64">;
+defm V_CMPSX_NLG_F64 : VOPCX_F64 <"v_cmpsx_nlg_f64">;
+defm V_CMPSX_NGT_F64 : VOPCX_F64 <"v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">;
+defm V_CMPSX_NLE_F64 : VOPCX_F64 <"v_cmpsx_nle_f64">;
+defm V_CMPSX_NEQ_F64 : VOPCX_F64 <"v_cmpsx_neq_f64">;
+defm V_CMPSX_NLT_F64 : VOPCX_F64 <"v_cmpsx_nlt_f64">;
+defm V_CMPSX_TRU_F64 : VOPCX_F64 <"v_cmpsx_tru_f64">;
+
+} // End SubtargetPredicate = isSICI
+
+let SubtargetPredicate = Has16BitInsts in {
+
+defm V_CMP_F_F16    : VOPC_F16 <"v_cmp_f_f16">;
+defm V_CMP_LT_F16   : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">;
+defm V_CMP_EQ_F16   : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>;
+defm V_CMP_LE_F16   : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">;
+defm V_CMP_GT_F16   : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>;
+defm V_CMP_LG_F16   : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>;
+defm V_CMP_GE_F16   : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>;
+defm V_CMP_O_F16    : VOPC_F16 <"v_cmp_o_f16", COND_O>;
+defm V_CMP_U_F16    : VOPC_F16 <"v_cmp_u_f16", COND_UO>;
+defm V_CMP_NGE_F16  : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">;
+defm V_CMP_NLG_F16  : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>;
+defm V_CMP_NGT_F16  : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">;
+defm V_CMP_NLE_F16  : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>;
+defm V_CMP_NEQ_F16  : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>;
+defm V_CMP_NLT_F16  : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>;
+defm V_CMP_TRU_F16  : VOPC_F16 <"v_cmp_tru_f16">;
+
+defm V_CMPX_F_F16   : VOPCX_F16 <"v_cmpx_f_f16">;
+defm V_CMPX_LT_F16  : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">;
+defm V_CMPX_EQ_F16  : VOPCX_F16 <"v_cmpx_eq_f16">;
+defm V_CMPX_LE_F16  : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">;
+defm V_CMPX_GT_F16  : VOPCX_F16 <"v_cmpx_gt_f16">;
+defm V_CMPX_LG_F16  : VOPCX_F16 <"v_cmpx_lg_f16">;
+defm V_CMPX_GE_F16  : VOPCX_F16 <"v_cmpx_ge_f16">;
+defm V_CMPX_O_F16   : VOPCX_F16 <"v_cmpx_o_f16">;
+defm V_CMPX_U_F16   : VOPCX_F16 <"v_cmpx_u_f16">;
+defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16", "v_cmpx_nle_f16">;
+defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">;
+defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16", "v_cmpx_nlt_f16">;
+defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">;
+defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">;
+defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">;
+defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">;
+
+defm V_CMP_F_I16 : VOPC_I16 <"v_cmp_f_i16">;
+defm V_CMP_LT_I16 : VOPC_I16 <"v_cmp_lt_i16", COND_SLT, "v_cmp_gt_i16">;
+defm V_CMP_EQ_I16 : VOPC_I16 <"v_cmp_eq_i16">;
+defm V_CMP_LE_I16 : VOPC_I16 <"v_cmp_le_i16", COND_SLE, "v_cmp_ge_i16">;
+defm V_CMP_GT_I16 : VOPC_I16 <"v_cmp_gt_i16", COND_SGT>;
+defm V_CMP_NE_I16 : VOPC_I16 <"v_cmp_ne_i16">;
+defm V_CMP_GE_I16 : VOPC_I16 <"v_cmp_ge_i16", COND_SGE>;
+defm V_CMP_T_I16 : VOPC_I16 <"v_cmp_t_i16">;
+
+defm V_CMP_F_U16 : VOPC_I16 <"v_cmp_f_u16">;
+defm V_CMP_LT_U16 : VOPC_I16 <"v_cmp_lt_u16", COND_ULT, "v_cmp_gt_u16">;
+defm V_CMP_EQ_U16 : VOPC_I16 <"v_cmp_eq_u16", COND_EQ>;
+defm V_CMP_LE_U16 : VOPC_I16 <"v_cmp_le_u16", COND_ULE, "v_cmp_ge_u16">;
+defm V_CMP_GT_U16 : VOPC_I16 <"v_cmp_gt_u16", COND_UGT>;
+defm V_CMP_NE_U16 : VOPC_I16 <"v_cmp_ne_u16", COND_NE>;
+defm V_CMP_GE_U16 : VOPC_I16 <"v_cmp_ge_u16", COND_UGE>;
+defm V_CMP_T_U16 : VOPC_I16 <"v_cmp_t_u16">;
+
+defm V_CMPX_F_I16 : VOPCX_I16 <"v_cmpx_f_i16">;
+defm V_CMPX_LT_I16 : VOPCX_I16 <"v_cmpx_lt_i16", "v_cmpx_gt_i16">;
+defm V_CMPX_EQ_I16 : VOPCX_I16 <"v_cmpx_eq_i16">;
+defm V_CMPX_LE_I16 : VOPCX_I16 <"v_cmpx_le_i16", "v_cmpx_ge_i16">;
+defm V_CMPX_GT_I16 : VOPCX_I16 <"v_cmpx_gt_i16">;
+defm V_CMPX_NE_I16 : VOPCX_I16 <"v_cmpx_ne_i16">;
+defm V_CMPX_GE_I16 : VOPCX_I16 <"v_cmpx_ge_i16">;
+defm V_CMPX_T_I16 : VOPCX_I16 <"v_cmpx_t_i16">;
+defm V_CMPX_F_U16 : VOPCX_I16 <"v_cmpx_f_u16">;
+
+defm V_CMPX_LT_U16 : VOPCX_I16 <"v_cmpx_lt_u16", "v_cmpx_gt_u16">;
+defm V_CMPX_EQ_U16 : VOPCX_I16 <"v_cmpx_eq_u16">;
+defm V_CMPX_LE_U16 : VOPCX_I16 <"v_cmpx_le_u16", "v_cmpx_ge_u16">;
+defm V_CMPX_GT_U16 : VOPCX_I16 <"v_cmpx_gt_u16">;
+defm V_CMPX_NE_U16 : VOPCX_I16 <"v_cmpx_ne_u16">;
+defm V_CMPX_GE_U16 : VOPCX_I16 <"v_cmpx_ge_u16">;
+defm V_CMPX_T_U16 : VOPCX_I16 <"v_cmpx_t_u16">;
+
+} // End SubtargetPredicate = Has16BitInsts
+
+defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
+defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">;
+defm V_CMP_LE_I32 : VOPC_I32 <"v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">;
+defm V_CMP_GT_I32 : VOPC_I32 <"v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <"v_cmp_ne_i32">;
+defm V_CMP_GE_I32 : VOPC_I32 <"v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <"v_cmp_t_i32">;
+
+defm V_CMPX_F_I32 : VOPCX_I32 <"v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <"v_cmpx_lt_i32", "v_cmpx_gt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <"v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <"v_cmpx_le_i32", "v_cmpx_ge_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <"v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <"v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <"v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <"v_cmpx_t_i32">;
+
+defm V_CMP_F_I64 : VOPC_I64 <"v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <"v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">;
+defm V_CMP_EQ_I64 : VOPC_I64 <"v_cmp_eq_i64">;
+defm V_CMP_LE_I64 : VOPC_I64 <"v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">;
+defm V_CMP_GT_I64 : VOPC_I64 <"v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <"v_cmp_ne_i64">;
+defm V_CMP_GE_I64 : VOPC_I64 <"v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <"v_cmp_t_i64">;
+
+defm V_CMPX_F_I64 : VOPCX_I64 <"v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <"v_cmpx_lt_i64", "v_cmpx_gt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <"v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <"v_cmpx_le_i64", "v_cmpx_ge_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <"v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <"v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <"v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <"v_cmpx_t_i64">;
+
+defm V_CMP_F_U32 : VOPC_I32 <"v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <"v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
+defm V_CMP_EQ_U32 : VOPC_I32 <"v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <"v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
+defm V_CMP_GT_U32 : VOPC_I32 <"v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <"v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <"v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <"v_cmp_t_u32">;
+
+defm V_CMPX_F_U32 : VOPCX_I32 <"v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <"v_cmpx_lt_u32", "v_cmpx_gt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <"v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <"v_cmpx_le_u32", "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <"v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <"v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <"v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <"v_cmpx_t_u32">;
+
+defm V_CMP_F_U64 : VOPC_I64 <"v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <"v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
+defm V_CMP_EQ_U64 : VOPC_I64 <"v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <"v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
+defm V_CMP_GT_U64 : VOPC_I64 <"v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <"v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <"v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <"v_cmp_t_u64">;
+
+defm V_CMPX_F_U64 : VOPCX_I64 <"v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <"v_cmpx_lt_u64", "v_cmpx_gt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <"v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <"v_cmpx_le_u64", "v_cmpx_ge_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <"v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <"v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <"v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <"v_cmpx_t_u64">;
+
+//===----------------------------------------------------------------------===//
+// Class instructions
+//===----------------------------------------------------------------------===//
+
+class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
+  VOPC_Profile<sched, vt, i32> {
+  let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
+  let Asm64 = "$sdst, $src0_modifiers, $src1";
+  let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0,
+                     Int32InputMods:$src1_modifiers, Src1RC64:$src1,
+                     clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+  let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
+  let HasSrc1Mods = 0;
+  let HasClamp = 0;
+  let HasOMod = 0;
+}
+
+class getVOPCClassPat64 <VOPProfile P> {
+  list<dag> ret =
+    [(set i1:$sdst,
+      (AMDGPUfp_class
+        (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)),
+        P.Src1VT:$src1))];
+}
+
+// Special case for class instructions which only have modifiers on
+// the 1st source operand.
+multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec> {
+  def _e32 : VOPC_Pseudo <opName, p> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = p.Schedule;
+    let isConvergent = DefExec;
+  }
+
+  def _e64 : VOP3_Pseudo<opName, p, getVOPCClassPat64<p>.ret> {
+    let Defs = !if(DefExec, [EXEC], []);
+    let SchedRW = p.Schedule;
+  }
+
+  def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
+    let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+    let SchedRW = p.Schedule;
+    let isConvergent = DefExec;
+  }
+}
+
+def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
+def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
+def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
+
+multiclass VOPC_CLASS_F16 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
+
+multiclass VOPCX_CLASS_F16 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F32 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
+
+multiclass VOPCX_CLASS_F32 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+
+multiclass VOPC_CLASS_F64 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 0>;
+
+multiclass VOPCX_CLASS_F64 <string opName> :
+  VOPC_Class_Pseudos <opName, VOPC_I1_F64_I32, 1>;
+
+defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
+defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
+defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
+defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+defm V_CMP_CLASS_F16  : VOPC_CLASS_F16 <"v_cmp_class_f16">;
+defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+
+//===----------------------------------------------------------------------===//
+// V_ICMPIntrinsic Pattern.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isGCN] in {
+
+class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+  (AMDGPUsetcc vt:$src0, vt:$src1, cond),
+  (inst $src0, $src1)
+>;
+
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U32_e64, i32>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U32_e64, i32>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U32_e64, i32>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I32_e64, i32>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I32_e64, i32>;
+
+def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U64_e64, i64>;
+def : ICMP_Pattern <COND_NE, V_CMP_NE_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGT, V_CMP_GT_U64_e64, i64>;
+def : ICMP_Pattern <COND_UGE, V_CMP_GE_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULT, V_CMP_LT_U64_e64, i64>;
+def : ICMP_Pattern <COND_ULE, V_CMP_LE_U64_e64, i64>;
+def : ICMP_Pattern <COND_SGT, V_CMP_GT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
+def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;
+
+class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+  (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+                   (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+  (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+        DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+
+def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>;
+def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>;
+
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F32_e64, f32>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F32_e64, f32>;
+
+def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F64_e64, f64>;
+def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F64_e64, f64>;
+def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
+def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
+
+} // End Predicates = [isGCN]
+
+//===----------------------------------------------------------------------===//
+// Target
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI
+//===----------------------------------------------------------------------===//
+
+multiclass VOPC_Real_si <bits<9> op> {
+  let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
+    def _e32_si :
+      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
+      VOPCe<op{7-0}>;
+
+    def _e64_si :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
+      VOP3a_si <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+      // Encoding used for VOPC instructions encoded as VOP3
+      // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
+  }
+  def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+                       !cast<Instruction>(NAME#"_e32_si")> {
+    let AssemblerPredicate = isSICI;
+  }
+}
+
+defm V_CMP_F_F32     : VOPC_Real_si <0x0>;
+defm V_CMP_LT_F32    : VOPC_Real_si <0x1>;
+defm V_CMP_EQ_F32    : VOPC_Real_si <0x2>;
+defm V_CMP_LE_F32    : VOPC_Real_si <0x3>;
+defm V_CMP_GT_F32    : VOPC_Real_si <0x4>;
+defm V_CMP_LG_F32    : VOPC_Real_si <0x5>;
+defm V_CMP_GE_F32    : VOPC_Real_si <0x6>;
+defm V_CMP_O_F32     : VOPC_Real_si <0x7>;
+defm V_CMP_U_F32     : VOPC_Real_si <0x8>;
+defm V_CMP_NGE_F32   : VOPC_Real_si <0x9>;
+defm V_CMP_NLG_F32   : VOPC_Real_si <0xa>;
+defm V_CMP_NGT_F32   : VOPC_Real_si <0xb>;
+defm V_CMP_NLE_F32   : VOPC_Real_si <0xc>;
+defm V_CMP_NEQ_F32   : VOPC_Real_si <0xd>;
+defm V_CMP_NLT_F32   : VOPC_Real_si <0xe>;
+defm V_CMP_TRU_F32   : VOPC_Real_si <0xf>;
+
+defm V_CMPX_F_F32    : VOPC_Real_si <0x10>;
+defm V_CMPX_LT_F32   : VOPC_Real_si <0x11>;
+defm V_CMPX_EQ_F32   : VOPC_Real_si <0x12>;
+defm V_CMPX_LE_F32   : VOPC_Real_si <0x13>;
+defm V_CMPX_GT_F32   : VOPC_Real_si <0x14>;
+defm V_CMPX_LG_F32   : VOPC_Real_si <0x15>;
+defm V_CMPX_GE_F32   : VOPC_Real_si <0x16>;
+defm V_CMPX_O_F32    : VOPC_Real_si <0x17>;
+defm V_CMPX_U_F32    : VOPC_Real_si <0x18>;
+defm V_CMPX_NGE_F32  : VOPC_Real_si <0x19>;
+defm V_CMPX_NLG_F32  : VOPC_Real_si <0x1a>;
+defm V_CMPX_NGT_F32  : VOPC_Real_si <0x1b>;
+defm V_CMPX_NLE_F32  : VOPC_Real_si <0x1c>;
+defm V_CMPX_NEQ_F32  : VOPC_Real_si <0x1d>;
+defm V_CMPX_NLT_F32  : VOPC_Real_si <0x1e>;
+defm V_CMPX_TRU_F32  : VOPC_Real_si <0x1f>;
+
+defm V_CMP_F_F64     : VOPC_Real_si <0x20>;
+defm V_CMP_LT_F64    : VOPC_Real_si <0x21>;
+defm V_CMP_EQ_F64    : VOPC_Real_si <0x22>;
+defm V_CMP_LE_F64    : VOPC_Real_si <0x23>;
+defm V_CMP_GT_F64    : VOPC_Real_si <0x24>;
+defm V_CMP_LG_F64    : VOPC_Real_si <0x25>;
+defm V_CMP_GE_F64    : VOPC_Real_si <0x26>;
+defm V_CMP_O_F64     : VOPC_Real_si <0x27>;
+defm V_CMP_U_F64     : VOPC_Real_si <0x28>;
+defm V_CMP_NGE_F64   : VOPC_Real_si <0x29>;
+defm V_CMP_NLG_F64   : VOPC_Real_si <0x2a>;
+defm V_CMP_NGT_F64   : VOPC_Real_si <0x2b>;
+defm V_CMP_NLE_F64   : VOPC_Real_si <0x2c>;
+defm V_CMP_NEQ_F64   : VOPC_Real_si <0x2d>;
+defm V_CMP_NLT_F64   : VOPC_Real_si <0x2e>;
+defm V_CMP_TRU_F64   : VOPC_Real_si <0x2f>;
+
+defm V_CMPX_F_F64    : VOPC_Real_si <0x30>;
+defm V_CMPX_LT_F64   : VOPC_Real_si <0x31>;
+defm V_CMPX_EQ_F64   : VOPC_Real_si <0x32>;
+defm V_CMPX_LE_F64   : VOPC_Real_si <0x33>;
+defm V_CMPX_GT_F64   : VOPC_Real_si <0x34>;
+defm V_CMPX_LG_F64   : VOPC_Real_si <0x35>;
+defm V_CMPX_GE_F64   : VOPC_Real_si <0x36>;
+defm V_CMPX_O_F64    : VOPC_Real_si <0x37>;
+defm V_CMPX_U_F64    : VOPC_Real_si <0x38>;
+defm V_CMPX_NGE_F64  : VOPC_Real_si <0x39>;
+defm V_CMPX_NLG_F64  : VOPC_Real_si <0x3a>;
+defm V_CMPX_NGT_F64  : VOPC_Real_si <0x3b>;
+defm V_CMPX_NLE_F64  : VOPC_Real_si <0x3c>;
+defm V_CMPX_NEQ_F64  : VOPC_Real_si <0x3d>;
+defm V_CMPX_NLT_F64  : VOPC_Real_si <0x3e>;
+defm V_CMPX_TRU_F64  : VOPC_Real_si <0x3f>;
+
+defm V_CMPS_F_F32    : VOPC_Real_si <0x40>;
+defm V_CMPS_LT_F32   : VOPC_Real_si <0x41>;
+defm V_CMPS_EQ_F32   : VOPC_Real_si <0x42>;
+defm V_CMPS_LE_F32   : VOPC_Real_si <0x43>;
+defm V_CMPS_GT_F32   : VOPC_Real_si <0x44>;
+defm V_CMPS_LG_F32   : VOPC_Real_si <0x45>;
+defm V_CMPS_GE_F32   : VOPC_Real_si <0x46>;
+defm V_CMPS_O_F32    : VOPC_Real_si <0x47>;
+defm V_CMPS_U_F32    : VOPC_Real_si <0x48>;
+defm V_CMPS_NGE_F32  : VOPC_Real_si <0x49>;
+defm V_CMPS_NLG_F32  : VOPC_Real_si <0x4a>;
+defm V_CMPS_NGT_F32  : VOPC_Real_si <0x4b>;
+defm V_CMPS_NLE_F32  : VOPC_Real_si <0x4c>;
+defm V_CMPS_NEQ_F32  : VOPC_Real_si <0x4d>;
+defm V_CMPS_NLT_F32  : VOPC_Real_si <0x4e>;
+defm V_CMPS_TRU_F32  : VOPC_Real_si <0x4f>;
+
+defm V_CMPSX_F_F32   : VOPC_Real_si <0x50>;
+defm V_CMPSX_LT_F32  : VOPC_Real_si <0x51>;
+defm V_CMPSX_EQ_F32  : VOPC_Real_si <0x52>;
+defm V_CMPSX_LE_F32  : VOPC_Real_si <0x53>;
+defm V_CMPSX_GT_F32  : VOPC_Real_si <0x54>;
+defm V_CMPSX_LG_F32  : VOPC_Real_si <0x55>;
+defm V_CMPSX_GE_F32  : VOPC_Real_si <0x56>;
+defm V_CMPSX_O_F32   : VOPC_Real_si <0x57>;
+defm V_CMPSX_U_F32   : VOPC_Real_si <0x58>;
+defm V_CMPSX_NGE_F32 : VOPC_Real_si <0x59>;
+defm V_CMPSX_NLG_F32 : VOPC_Real_si <0x5a>;
+defm V_CMPSX_NGT_F32 : VOPC_Real_si <0x5b>;
+defm V_CMPSX_NLE_F32 : VOPC_Real_si <0x5c>;
+defm V_CMPSX_NEQ_F32 : VOPC_Real_si <0x5d>;
+defm V_CMPSX_NLT_F32 : VOPC_Real_si <0x5e>;
+defm V_CMPSX_TRU_F32 : VOPC_Real_si <0x5f>;
+
+defm V_CMPS_F_F64    : VOPC_Real_si <0x60>;
+defm V_CMPS_LT_F64   : VOPC_Real_si <0x61>;
+defm V_CMPS_EQ_F64   : VOPC_Real_si <0x62>;
+defm V_CMPS_LE_F64   : VOPC_Real_si <0x63>;
+defm V_CMPS_GT_F64   : VOPC_Real_si <0x64>;
+defm V_CMPS_LG_F64   : VOPC_Real_si <0x65>;
+defm V_CMPS_GE_F64   : VOPC_Real_si <0x66>;
+defm V_CMPS_O_F64    : VOPC_Real_si <0x67>;
+defm V_CMPS_U_F64    : VOPC_Real_si <0x68>;
+defm V_CMPS_NGE_F64  : VOPC_Real_si <0x69>;
+defm V_CMPS_NLG_F64  : VOPC_Real_si <0x6a>;
+defm V_CMPS_NGT_F64  : VOPC_Real_si <0x6b>;
+defm V_CMPS_NLE_F64  : VOPC_Real_si <0x6c>;
+defm V_CMPS_NEQ_F64  : VOPC_Real_si <0x6d>;
+defm V_CMPS_NLT_F64  : VOPC_Real_si <0x6e>;
+defm V_CMPS_TRU_F64  : VOPC_Real_si <0x6f>;
+
+defm V_CMPSX_F_F64   : VOPC_Real_si <0x70>;
+defm V_CMPSX_LT_F64  : VOPC_Real_si <0x71>;
+defm V_CMPSX_EQ_F64  : VOPC_Real_si <0x72>;
+defm V_CMPSX_LE_F64  : VOPC_Real_si <0x73>;
+defm V_CMPSX_GT_F64  : VOPC_Real_si <0x74>;
+defm V_CMPSX_LG_F64  : VOPC_Real_si <0x75>;
+defm V_CMPSX_GE_F64  : VOPC_Real_si <0x76>;
+defm V_CMPSX_O_F64   : VOPC_Real_si <0x77>;
+defm V_CMPSX_U_F64   : VOPC_Real_si <0x78>;
+defm V_CMPSX_NGE_F64 : VOPC_Real_si <0x79>;
+defm V_CMPSX_NLG_F64 : VOPC_Real_si <0x7a>;
+defm V_CMPSX_NGT_F64 : VOPC_Real_si <0x7b>;
+defm V_CMPSX_NLE_F64 : VOPC_Real_si <0x7c>;
+defm V_CMPSX_NEQ_F64 : VOPC_Real_si <0x7d>;
+defm V_CMPSX_NLT_F64 : VOPC_Real_si <0x7e>;
+defm V_CMPSX_TRU_F64 : VOPC_Real_si <0x7f>;
+
+defm V_CMP_F_I32     : VOPC_Real_si <0x80>;
+defm V_CMP_LT_I32    : VOPC_Real_si <0x81>;
+defm V_CMP_EQ_I32    : VOPC_Real_si <0x82>;
+defm V_CMP_LE_I32    : VOPC_Real_si <0x83>;
+defm V_CMP_GT_I32    : VOPC_Real_si <0x84>;
+defm V_CMP_NE_I32    : VOPC_Real_si <0x85>;
+defm V_CMP_GE_I32    : VOPC_Real_si <0x86>;
+defm V_CMP_T_I32     : VOPC_Real_si <0x87>;
+
+defm V_CMPX_F_I32    : VOPC_Real_si <0x90>;
+defm V_CMPX_LT_I32   : VOPC_Real_si <0x91>;
+defm V_CMPX_EQ_I32   : VOPC_Real_si <0x92>;
+defm V_CMPX_LE_I32   : VOPC_Real_si <0x93>;
+defm V_CMPX_GT_I32   : VOPC_Real_si <0x94>;
+defm V_CMPX_NE_I32   : VOPC_Real_si <0x95>;
+defm V_CMPX_GE_I32   : VOPC_Real_si <0x96>;
+defm V_CMPX_T_I32    : VOPC_Real_si <0x97>;
+
+defm V_CMP_F_I64     : VOPC_Real_si <0xa0>;
+defm V_CMP_LT_I64    : VOPC_Real_si <0xa1>;
+defm V_CMP_EQ_I64    : VOPC_Real_si <0xa2>;
+defm V_CMP_LE_I64    : VOPC_Real_si <0xa3>;
+defm V_CMP_GT_I64    : VOPC_Real_si <0xa4>;
+defm V_CMP_NE_I64    : VOPC_Real_si <0xa5>;
+defm V_CMP_GE_I64    : VOPC_Real_si <0xa6>;
+defm V_CMP_T_I64     : VOPC_Real_si <0xa7>;
+
+defm V_CMPX_F_I64    : VOPC_Real_si <0xb0>;
+defm V_CMPX_LT_I64   : VOPC_Real_si <0xb1>;
+defm V_CMPX_EQ_I64   : VOPC_Real_si <0xb2>;
+defm V_CMPX_LE_I64   : VOPC_Real_si <0xb3>;
+defm V_CMPX_GT_I64   : VOPC_Real_si <0xb4>;
+defm V_CMPX_NE_I64   : VOPC_Real_si <0xb5>;
+defm V_CMPX_GE_I64   : VOPC_Real_si <0xb6>;
+defm V_CMPX_T_I64    : VOPC_Real_si <0xb7>;
+
+defm V_CMP_F_U32     : VOPC_Real_si <0xc0>;
+defm V_CMP_LT_U32    : VOPC_Real_si <0xc1>;
+defm V_CMP_EQ_U32    : VOPC_Real_si <0xc2>;
+defm V_CMP_LE_U32    : VOPC_Real_si <0xc3>;
+defm V_CMP_GT_U32    : VOPC_Real_si <0xc4>;
+defm V_CMP_NE_U32    : VOPC_Real_si <0xc5>;
+defm V_CMP_GE_U32    : VOPC_Real_si <0xc6>;
+defm V_CMP_T_U32     : VOPC_Real_si <0xc7>;
+
+defm V_CMPX_F_U32    : VOPC_Real_si <0xd0>;
+defm V_CMPX_LT_U32   : VOPC_Real_si <0xd1>;
+defm V_CMPX_EQ_U32   : VOPC_Real_si <0xd2>;
+defm V_CMPX_LE_U32   : VOPC_Real_si <0xd3>;
+defm V_CMPX_GT_U32   : VOPC_Real_si <0xd4>;
+defm V_CMPX_NE_U32   : VOPC_Real_si <0xd5>;
+defm V_CMPX_GE_U32   : VOPC_Real_si <0xd6>;
+defm V_CMPX_T_U32    : VOPC_Real_si <0xd7>;
+
+defm V_CMP_F_U64     : VOPC_Real_si <0xe0>;
+defm V_CMP_LT_U64    : VOPC_Real_si <0xe1>;
+defm V_CMP_EQ_U64    : VOPC_Real_si <0xe2>;
+defm V_CMP_LE_U64    : VOPC_Real_si <0xe3>;
+defm V_CMP_GT_U64    : VOPC_Real_si <0xe4>;
+defm V_CMP_NE_U64    : VOPC_Real_si <0xe5>;
+defm V_CMP_GE_U64    : VOPC_Real_si <0xe6>;
+defm V_CMP_T_U64     : VOPC_Real_si <0xe7>;
+
+defm V_CMPX_F_U64    : VOPC_Real_si <0xf0>;
+defm V_CMPX_LT_U64   : VOPC_Real_si <0xf1>;
+defm V_CMPX_EQ_U64   : VOPC_Real_si <0xf2>;
+defm V_CMPX_LE_U64   : VOPC_Real_si <0xf3>;
+defm V_CMPX_GT_U64   : VOPC_Real_si <0xf4>;
+defm V_CMPX_NE_U64   : VOPC_Real_si <0xf5>;
+defm V_CMPX_GE_U64   : VOPC_Real_si <0xf6>;
+defm V_CMPX_T_U64    : VOPC_Real_si <0xf7>;
+
+defm V_CMP_CLASS_F32  : VOPC_Real_si <0x88>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_si <0x98>;
+defm V_CMP_CLASS_F64  : VOPC_Real_si <0xa8>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_si <0xb8>;
+
+//===----------------------------------------------------------------------===//
+// VI
+//===----------------------------------------------------------------------===//
+
+multiclass VOPC_Real_vi <bits<10> op> {
+  let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+    def _e32_vi :
+      VOPC_Real<!cast<VOPC_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
+      VOPCe<op{7-0}>;
+
+    def _e64_vi :
+      VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+      VOP3a_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+      // Encoding used for VOPC instructions encoded as VOP3
+      // Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst
+      bits<8> sdst;
+      let Inst{7-0} = sdst;
+    }
+  }
+
+  def _sdwa_vi :
+    VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
+  def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
+                       !cast<Instruction>(NAME#"_e32_vi")> {
+    let AssemblerPredicate = isVI;
+  }
+}
+
+defm V_CMP_CLASS_F32  : VOPC_Real_vi <0x10>;
+defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>;
+defm V_CMP_CLASS_F64  : VOPC_Real_vi <0x12>;
+defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>;
+defm V_CMP_CLASS_F16  : VOPC_Real_vi <0x14>;
+defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>;
+
+defm V_CMP_F_F16      : VOPC_Real_vi <0x20>;
+defm V_CMP_LT_F16     : VOPC_Real_vi <0x21>;
+defm V_CMP_EQ_F16     : VOPC_Real_vi <0x22>;
+defm V_CMP_LE_F16     : VOPC_Real_vi <0x23>;
+defm V_CMP_GT_F16     : VOPC_Real_vi <0x24>;
+defm V_CMP_LG_F16     : VOPC_Real_vi <0x25>;
+defm V_CMP_GE_F16     : VOPC_Real_vi <0x26>;
+defm V_CMP_O_F16      : VOPC_Real_vi <0x27>;
+defm V_CMP_U_F16      : VOPC_Real_vi <0x28>;
+defm V_CMP_NGE_F16    : VOPC_Real_vi <0x29>;
+defm V_CMP_NLG_F16    : VOPC_Real_vi <0x2a>;
+defm V_CMP_NGT_F16    : VOPC_Real_vi <0x2b>;
+defm V_CMP_NLE_F16    : VOPC_Real_vi <0x2c>;
+defm V_CMP_NEQ_F16    : VOPC_Real_vi <0x2d>;
+defm V_CMP_NLT_F16    : VOPC_Real_vi <0x2e>;
+defm V_CMP_TRU_F16    : VOPC_Real_vi <0x2f>;
+
+defm V_CMPX_F_F16     : VOPC_Real_vi <0x30>;
+defm V_CMPX_LT_F16    : VOPC_Real_vi <0x31>;
+defm V_CMPX_EQ_F16    : VOPC_Real_vi <0x32>;
+defm V_CMPX_LE_F16    : VOPC_Real_vi <0x33>;
+defm V_CMPX_GT_F16    : VOPC_Real_vi <0x34>;
+defm V_CMPX_LG_F16    : VOPC_Real_vi <0x35>;
+defm V_CMPX_GE_F16    : VOPC_Real_vi <0x36>;
+defm V_CMPX_O_F16     : VOPC_Real_vi <0x37>;
+defm V_CMPX_U_F16     : VOPC_Real_vi <0x38>;
+defm V_CMPX_NGE_F16   : VOPC_Real_vi <0x39>;
+defm V_CMPX_NLG_F16   : VOPC_Real_vi <0x3a>;
+defm V_CMPX_NGT_F16   : VOPC_Real_vi <0x3b>;
+defm V_CMPX_NLE_F16   : VOPC_Real_vi <0x3c>;
+defm V_CMPX_NEQ_F16   : VOPC_Real_vi <0x3d>;
+defm V_CMPX_NLT_F16   : VOPC_Real_vi <0x3e>;
+defm V_CMPX_TRU_F16   : VOPC_Real_vi <0x3f>;
+
+defm V_CMP_F_F32      : VOPC_Real_vi <0x40>;
+defm V_CMP_LT_F32     : VOPC_Real_vi <0x41>;
+defm V_CMP_EQ_F32     : VOPC_Real_vi <0x42>;
+defm V_CMP_LE_F32     : VOPC_Real_vi <0x43>;
+defm V_CMP_GT_F32     : VOPC_Real_vi <0x44>;
+defm V_CMP_LG_F32     : VOPC_Real_vi <0x45>;
+defm V_CMP_GE_F32     : VOPC_Real_vi <0x46>;
+defm V_CMP_O_F32      : VOPC_Real_vi <0x47>;
+defm V_CMP_U_F32      : VOPC_Real_vi <0x48>;
+defm V_CMP_NGE_F32    : VOPC_Real_vi <0x49>;
+defm V_CMP_NLG_F32    : VOPC_Real_vi <0x4a>;
+defm V_CMP_NGT_F32    : VOPC_Real_vi <0x4b>;
+defm V_CMP_NLE_F32    : VOPC_Real_vi <0x4c>;
+defm V_CMP_NEQ_F32    : VOPC_Real_vi <0x4d>;
+defm V_CMP_NLT_F32    : VOPC_Real_vi <0x4e>;
+defm V_CMP_TRU_F32    : VOPC_Real_vi <0x4f>;
+
+defm V_CMPX_F_F32     : VOPC_Real_vi <0x50>;
+defm V_CMPX_LT_F32    : VOPC_Real_vi <0x51>;
+defm V_CMPX_EQ_F32    : VOPC_Real_vi <0x52>;
+defm V_CMPX_LE_F32    : VOPC_Real_vi <0x53>;
+defm V_CMPX_GT_F32    : VOPC_Real_vi <0x54>;
+defm V_CMPX_LG_F32    : VOPC_Real_vi <0x55>;
+defm V_CMPX_GE_F32    : VOPC_Real_vi <0x56>;
+defm V_CMPX_O_F32     : VOPC_Real_vi <0x57>;
+defm V_CMPX_U_F32     : VOPC_Real_vi <0x58>;
+defm V_CMPX_NGE_F32   : VOPC_Real_vi <0x59>;
+defm V_CMPX_NLG_F32   : VOPC_Real_vi <0x5a>;
+defm V_CMPX_NGT_F32   : VOPC_Real_vi <0x5b>;
+defm V_CMPX_NLE_F32   : VOPC_Real_vi <0x5c>;
+defm V_CMPX_NEQ_F32   : VOPC_Real_vi <0x5d>;
+defm V_CMPX_NLT_F32   : VOPC_Real_vi <0x5e>;
+defm V_CMPX_TRU_F32   : VOPC_Real_vi <0x5f>;
+
+defm V_CMP_F_F64      : VOPC_Real_vi <0x60>;
+defm V_CMP_LT_F64     : VOPC_Real_vi <0x61>;
+defm V_CMP_EQ_F64     : VOPC_Real_vi <0x62>;
+defm V_CMP_LE_F64     : VOPC_Real_vi <0x63>;
+defm V_CMP_GT_F64     : VOPC_Real_vi <0x64>;
+defm V_CMP_LG_F64     : VOPC_Real_vi <0x65>;
+defm V_CMP_GE_F64     : VOPC_Real_vi <0x66>;
+defm V_CMP_O_F64      : VOPC_Real_vi <0x67>;
+defm V_CMP_U_F64      : VOPC_Real_vi <0x68>;
+defm V_CMP_NGE_F64    : VOPC_Real_vi <0x69>;
+defm V_CMP_NLG_F64    : VOPC_Real_vi <0x6a>;
+defm V_CMP_NGT_F64    : VOPC_Real_vi <0x6b>;
+defm V_CMP_NLE_F64    : VOPC_Real_vi <0x6c>;
+defm V_CMP_NEQ_F64    : VOPC_Real_vi <0x6d>;
+defm V_CMP_NLT_F64    : VOPC_Real_vi <0x6e>;
+defm V_CMP_TRU_F64    : VOPC_Real_vi <0x6f>;
+
+defm V_CMPX_F_F64     : VOPC_Real_vi <0x70>;
+defm V_CMPX_LT_F64    : VOPC_Real_vi <0x71>;
+defm V_CMPX_EQ_F64    : VOPC_Real_vi <0x72>;
+defm V_CMPX_LE_F64    : VOPC_Real_vi <0x73>;
+defm V_CMPX_GT_F64    : VOPC_Real_vi <0x74>;
+defm V_CMPX_LG_F64    : VOPC_Real_vi <0x75>;
+defm V_CMPX_GE_F64    : VOPC_Real_vi <0x76>;
+defm V_CMPX_O_F64     : VOPC_Real_vi <0x77>;
+defm V_CMPX_U_F64     : VOPC_Real_vi <0x78>;
+defm V_CMPX_NGE_F64   : VOPC_Real_vi <0x79>;
+defm V_CMPX_NLG_F64   : VOPC_Real_vi <0x7a>;
+defm V_CMPX_NGT_F64   : VOPC_Real_vi <0x7b>;
+defm V_CMPX_NLE_F64   : VOPC_Real_vi <0x7c>;
+defm V_CMPX_NEQ_F64   : VOPC_Real_vi <0x7d>;
+defm V_CMPX_NLT_F64   : VOPC_Real_vi <0x7e>;
+defm V_CMPX_TRU_F64   : VOPC_Real_vi <0x7f>;
+
+defm V_CMP_F_I16      : VOPC_Real_vi <0xa0>;
+defm V_CMP_LT_I16     : VOPC_Real_vi <0xa1>;
+defm V_CMP_EQ_I16     : VOPC_Real_vi <0xa2>;
+defm V_CMP_LE_I16     : VOPC_Real_vi <0xa3>;
+defm V_CMP_GT_I16     : VOPC_Real_vi <0xa4>;
+defm V_CMP_NE_I16     : VOPC_Real_vi <0xa5>;
+defm V_CMP_GE_I16     : VOPC_Real_vi <0xa6>;
+defm V_CMP_T_I16      : VOPC_Real_vi <0xa7>;
+
+defm V_CMP_F_U16      : VOPC_Real_vi <0xa8>;
+defm V_CMP_LT_U16     : VOPC_Real_vi <0xa9>;
+defm V_CMP_EQ_U16     : VOPC_Real_vi <0xaa>;
+defm V_CMP_LE_U16     : VOPC_Real_vi <0xab>;
+defm V_CMP_GT_U16     : VOPC_Real_vi <0xac>;
+defm V_CMP_NE_U16     : VOPC_Real_vi <0xad>;
+defm V_CMP_GE_U16     : VOPC_Real_vi <0xae>;
+defm V_CMP_T_U16      : VOPC_Real_vi <0xaf>;
+
+defm V_CMPX_F_I16 : VOPC_Real_vi <0xb0>;
+defm V_CMPX_LT_I16 : VOPC_Real_vi <0xb1>;
+defm V_CMPX_EQ_I16 : VOPC_Real_vi <0xb2>;
+defm V_CMPX_LE_I16 : VOPC_Real_vi <0xb3>;
+defm V_CMPX_GT_I16 : VOPC_Real_vi <0xb4>;
+defm V_CMPX_NE_I16 : VOPC_Real_vi <0xb5>;
+defm V_CMPX_GE_I16 : VOPC_Real_vi <0xb6>;
+defm V_CMPX_T_I16 : VOPC_Real_vi <0xb7>;
+
+defm V_CMPX_F_U16 : VOPC_Real_vi <0xb8>;
+defm V_CMPX_LT_U16 : VOPC_Real_vi <0xb9>;
+defm V_CMPX_EQ_U16 : VOPC_Real_vi <0xba>;
+defm V_CMPX_LE_U16 : VOPC_Real_vi <0xbb>;
+defm V_CMPX_GT_U16 : VOPC_Real_vi <0xbc>;
+defm V_CMPX_NE_U16 : VOPC_Real_vi <0xbd>;
+defm V_CMPX_GE_U16 : VOPC_Real_vi <0xbe>;
+defm V_CMPX_T_U16 : VOPC_Real_vi <0xbf>;
+
+defm V_CMP_F_I32      : VOPC_Real_vi <0xc0>;
+defm V_CMP_LT_I32     : VOPC_Real_vi <0xc1>;
+defm V_CMP_EQ_I32     : VOPC_Real_vi <0xc2>;
+defm V_CMP_LE_I32     : VOPC_Real_vi <0xc3>;
+defm V_CMP_GT_I32     : VOPC_Real_vi <0xc4>;
+defm V_CMP_NE_I32     : VOPC_Real_vi <0xc5>;
+defm V_CMP_GE_I32     : VOPC_Real_vi <0xc6>;
+defm V_CMP_T_I32      : VOPC_Real_vi <0xc7>;
+
+defm V_CMPX_F_I32     : VOPC_Real_vi <0xd0>;
+defm V_CMPX_LT_I32    : VOPC_Real_vi <0xd1>;
+defm V_CMPX_EQ_I32    : VOPC_Real_vi <0xd2>;
+defm V_CMPX_LE_I32    : VOPC_Real_vi <0xd3>;
+defm V_CMPX_GT_I32    : VOPC_Real_vi <0xd4>;
+defm V_CMPX_NE_I32    : VOPC_Real_vi <0xd5>;
+defm V_CMPX_GE_I32    : VOPC_Real_vi <0xd6>;
+defm V_CMPX_T_I32     : VOPC_Real_vi <0xd7>;
+
+defm V_CMP_F_I64      : VOPC_Real_vi <0xe0>;
+defm V_CMP_LT_I64     : VOPC_Real_vi <0xe1>;
+defm V_CMP_EQ_I64     : VOPC_Real_vi <0xe2>;
+defm V_CMP_LE_I64     : VOPC_Real_vi <0xe3>;
+defm V_CMP_GT_I64     : VOPC_Real_vi <0xe4>;
+defm V_CMP_NE_I64     : VOPC_Real_vi <0xe5>;
+defm V_CMP_GE_I64     : VOPC_Real_vi <0xe6>;
+defm V_CMP_T_I64      : VOPC_Real_vi <0xe7>;
+
+defm V_CMPX_F_I64     : VOPC_Real_vi <0xf0>;
+defm V_CMPX_LT_I64    : VOPC_Real_vi <0xf1>;
+defm V_CMPX_EQ_I64    : VOPC_Real_vi <0xf2>;
+defm V_CMPX_LE_I64    : VOPC_Real_vi <0xf3>;
+defm V_CMPX_GT_I64    : VOPC_Real_vi <0xf4>;
+defm V_CMPX_NE_I64    : VOPC_Real_vi <0xf5>;
+defm V_CMPX_GE_I64    : VOPC_Real_vi <0xf6>;
+defm V_CMPX_T_I64     : VOPC_Real_vi <0xf7>;
+
+defm V_CMP_F_U32      : VOPC_Real_vi <0xc8>;
+defm V_CMP_LT_U32     : VOPC_Real_vi <0xc9>;
+defm V_CMP_EQ_U32     : VOPC_Real_vi <0xca>;
+defm V_CMP_LE_U32     : VOPC_Real_vi <0xcb>;
+defm V_CMP_GT_U32     : VOPC_Real_vi <0xcc>;
+defm V_CMP_NE_U32     : VOPC_Real_vi <0xcd>;
+defm V_CMP_GE_U32     : VOPC_Real_vi <0xce>;
+defm V_CMP_T_U32      : VOPC_Real_vi <0xcf>;
+
+defm V_CMPX_F_U32     : VOPC_Real_vi <0xd8>;
+defm V_CMPX_LT_U32    : VOPC_Real_vi <0xd9>;
+defm V_CMPX_EQ_U32    : VOPC_Real_vi <0xda>;
+defm V_CMPX_LE_U32    : VOPC_Real_vi <0xdb>;
+defm V_CMPX_GT_U32    : VOPC_Real_vi <0xdc>;
+defm V_CMPX_NE_U32    : VOPC_Real_vi <0xdd>;
+defm V_CMPX_GE_U32    : VOPC_Real_vi <0xde>;
+defm V_CMPX_T_U32     : VOPC_Real_vi <0xdf>;
+
+defm V_CMP_F_U64      : VOPC_Real_vi <0xe8>;
+defm V_CMP_LT_U64     : VOPC_Real_vi <0xe9>;
+defm V_CMP_EQ_U64     : VOPC_Real_vi <0xea>;
+defm V_CMP_LE_U64     : VOPC_Real_vi <0xeb>;
+defm V_CMP_GT_U64     : VOPC_Real_vi <0xec>;
+defm V_CMP_NE_U64     : VOPC_Real_vi <0xed>;
+defm V_CMP_GE_U64     : VOPC_Real_vi <0xee>;
+defm V_CMP_T_U64      : VOPC_Real_vi <0xef>;
+
+defm V_CMPX_F_U64     : VOPC_Real_vi <0xf8>;
+defm V_CMPX_LT_U64    : VOPC_Real_vi <0xf9>;
+defm V_CMPX_EQ_U64    : VOPC_Real_vi <0xfa>;
+defm V_CMPX_LE_U64    : VOPC_Real_vi <0xfb>;
+defm V_CMPX_GT_U64    : VOPC_Real_vi <0xfc>;
+defm V_CMPX_NE_U64    : VOPC_Real_vi <0xfd>;
+defm V_CMPX_GE_U64    : VOPC_Real_vi <0xfe>;
+defm V_CMPX_T_U64     : VOPC_Real_vi <0xff>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
new file mode 100644
index 000000000000..5f72f97d9e28
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -0,0 +1,350 @@
+//===-- VOPInstructions.td - Vector Instruction Defintions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// dummies for outer let
+class LetDummies {
+  bit isCommutable;
+  bit isConvertibleToThreeAddress;
+  bit isMoveImm;
+  bit isReMaterializable;
+  bit isAsCheapAsAMove;
+  bit VOPAsmPrefer32Bit;
+  Predicate SubtargetPredicate;
+  string Constraints;
+  string DisableEncoding;
+  list<SchedReadWrite> SchedRW;
+  list<Register> Uses;
+  list<Register> Defs;
+}
+
+class VOP <string opName> {
+  string OpName = opName;
+}
+
+class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VALU = 1;
+  let Uses = [EXEC];
+}
+
+class VOP3Common <dag outs, dag ins, string asm = "",
+                  list<dag> pattern = [], bit HasMods = 0,
+                  bit VOP3Only = 0> :
+  VOPAnyCommon <outs, ins, asm, pattern> {
+
+  // Using complex patterns gives VOP3 patterns a very high complexity rating,
+  // but standalone patterns are almost always preferred, so we need to adjust the
+  // priority lower.  The goal is to use a high number to reduce complexity to
+  // zero (or less than zero).
+  let AddedComplexity = -1000;
+
+  let VOP3 = 1;
+
+  let AsmMatchConverter =
+    !if(!eq(VOP3Only,1),
+        "cvtVOP3",
+        !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
+
+  let AsmVariantName = AMDGPUAsmVariants.VOP3;
+
+  let isCodeGenOnly = 0;
+
+  int Size = 8;
+
+  // Because SGPRs may be allowed if there are multiple operands, we
+  // need a post-isel hook to insert copies in order to avoid
+  // violating constant bus requirements.
+  let hasPostISelHook = 1;
+}
+
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
+  InstSI <P.Outs64, P.Ins64, "", pattern>,
+  VOP <opName>,
+  SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
+  MnemonicAlias<opName#"_e64", opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.Asm64;
+
+  let Size = 8;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let SubtargetPredicate = isGCN;
+
+  // Because SGPRs may be allowed if there are multiple operands, we
+  // need a post-isel hook to insert copies in order to avoid
+  // violating constant bus requirements.
+  let hasPostISelHook = 1;
+
+  // Using complex patterns gives VOP3 patterns a very high complexity rating,
+  // but standalone patterns are almost always preferred, so we need to adjust the
+  // priority lower.  The goal is to use a high number to reduce complexity to
+  // zero (or less than zero).
+  let AddedComplexity = -1000;
+
+  let VOP3 = 1;
+  let VALU = 1;
+  let Uses = [EXEC];
+
+  let AsmVariantName = AMDGPUAsmVariants.VOP3;
+  let AsmMatchConverter =
+    !if(!eq(VOP3Only,1),
+        "cvtVOP3",
+        !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+
+  VOPProfile Pfl = P;
+}
+
+class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, EncodingFamily> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let AsmMatchConverter  = ps.AsmMatchConverter;
+  let AsmVariantName     = ps.AsmVariantName;
+  let Constraints        = ps.Constraints;
+  let DisableEncoding    = ps.DisableEncoding;
+  let TSFlags            = ps.TSFlags;
+}
+
+class VOP3a<VOPProfile P> : Enc64 {
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+  bits<2> omod;
+
+  let Inst{8}     = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+  let Inst{9}     = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
+  let Inst{10}    = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
+
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+  let Inst{60-59} = !if(P.HasOMod, omod, 0);
+  let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+  let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+  let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+class VOP3a_si <bits<9> op, VOPProfile P> : VOP3a<P> {
+  let Inst{25-17} = op;
+  let Inst{11}    = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
+  let Inst{25-16} = op;
+  let Inst{15}    = !if(P.HasClamp, clamp{0}, 0);
+}
+
+class VOP3e_si <bits<9> op, VOPProfile P> : VOP3a_si <op, P> {
+  bits<8> vdst;
+  let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
+  bits<8> vdst;
+  let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
+}
+
+class VOP3be <VOPProfile P> : Enc64 {
+  bits<8> vdst;
+  bits<2> src0_modifiers;
+  bits<9> src0;
+  bits<2> src1_modifiers;
+  bits<9> src1;
+  bits<2> src2_modifiers;
+  bits<9> src2;
+  bits<7> sdst;
+  bits<2> omod;
+
+  let Inst{7-0}   = vdst;
+  let Inst{14-8}  = sdst;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+  let Inst{60-59} = !if(P.HasOMod, omod, 0);
+  let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+  let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0);
+  let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
+}
+
+class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
+  let Inst{25-17} = op;
+}
+
+class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
+  bits<1> clamp;
+  let Inst{25-16} = op;
+  let Inst{15}    = !if(P.HasClamp, clamp{0}, 0);
+}
+
+def SDWA {
+  // sdwa_sel
+  int BYTE_0 = 0;
+  int BYTE_1 = 1;
+  int BYTE_2 = 2;
+  int BYTE_3 = 3;
+  int WORD_0 = 4;
+  int WORD_1 = 5;
+  int DWORD = 6;
+
+  // dst_unused
+  int UNUSED_PAD = 0;
+  int UNUSED_SEXT = 1;
+  int UNUSED_PRESERVE = 2;
+}
+
+class VOP_SDWAe<VOPProfile P> : Enc64 {
+  bits<8> src0;
+  bits<3> src0_sel;
+  bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+  bits<3> src1_sel;
+  bits<2> src1_modifiers;
+  bits<3> dst_sel;
+  bits<2> dst_unused;
+  bits<1> clamp;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{45}    = !if(P.HasSDWAClamp, clamp{0}, 0);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+  let Inst{51}    = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+  let Inst{59}    = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+}
+
+class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+  InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
+  MnemonicAlias <opName#"_sdwa", opName> {
+  
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  string AsmOperands = P.AsmSDWA;
+
+  let Size = 8;
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;  
+
+  let VALU = 1;
+  let SDWA = 1;
+  let Uses = [EXEC];
+  
+  let SubtargetPredicate = isVI;
+  let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
+                                     AMDGPUAsmVariants.Disable);
+  let DecoderNamespace = "SDWA";
+
+  VOPProfile Pfl = P;
+}
+
+class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
+  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
+  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+
+  let isPseudo = 0;
+  let isCodeGenOnly = 0;
+
+  let Defs = ps.Defs;
+  let Uses = ps.Uses;
+  let SchedRW = ps.SchedRW;
+  let hasSideEffects = ps.hasSideEffects;
+
+  let Constraints     = ps.Constraints;
+  let DisableEncoding = ps.DisableEncoding;
+
+  // Copy relevant pseudo op flags
+  let SubtargetPredicate   = ps.SubtargetPredicate;
+  let AssemblerPredicate   = ps.AssemblerPredicate;
+  let AsmMatchConverter    = ps.AsmMatchConverter;
+  let AsmVariantName       = ps.AsmVariantName;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let DecoderNamespace     = ps.DecoderNamespace;
+  let Constraints          = ps.Constraints;
+  let DisableEncoding      = ps.DisableEncoding;
+  let TSFlags              = ps.TSFlags;
+}
+
+class VOP_DPPe<VOPProfile P> : Enc64 {
+  bits<2> src0_modifiers;
+  bits<8> src0;
+  bits<2> src1_modifiers;
+  bits<9> dpp_ctrl;
+  bits<1> bound_ctrl;
+  bits<4> bank_mask;
+  bits<4> row_mask;
+
+  let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+  let Inst{48-40} = dpp_ctrl;
+  let Inst{51}    = bound_ctrl;
+  let Inst{52}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // src0_neg
+  let Inst{53}    = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // src0_abs
+  let Inst{54}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // src1_neg
+  let Inst{55}    = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // src1_abs
+  let Inst{59-56} = bank_mask;
+  let Inst{63-60} = row_mask;
+}
+
+class VOP_DPP <string OpName, VOPProfile P> :
+  InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>,
+  VOP_DPPe<P> {
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let VALU = 1;
+  let DPP = 1;
+  let Size = 8;
+
+  let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
+  let SubtargetPredicate = isVI;
+  let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
+                                     AMDGPUAsmVariants.Disable);
+  let DecoderNamespace = "DPP";
+}
+
+include "VOPCInstructions.td"
+include "VOP1Instructions.td"
+include "VOP2Instructions.td"
+include "VOP3Instructions.td"
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
new file mode 100644
index 000000000000..89859ba063d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -0,0 +1,710 @@
+//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Cortex-A15 processor employs a tracking scheme in its register renaming
+// in order to process each instruction's micro-ops speculatively and
+// out-of-order with appropriate forwarding. The ARM architecture allows VFP
+// instructions to read and write 32-bit S-registers.  Each S-register
+// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
+//
+// There are several instruction patterns which can be used to provide this
+// capability which can provide higher performance than other, potentially more
+// direct patterns, specifically around when one micro-op reads a D-register
+// operand that has recently been written as one or more S-register results.
+//
+// This file defines a pre-regalloc pass which looks for SPR producers which
+// are going to be used by a DPR (or QPR) consumers and creates the more
+// optimized access pattern.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "a15-sd-optimizer"
+
+namespace {
+  struct A15SDOptimizer : public MachineFunctionPass {
+    static char ID;
+    A15SDOptimizer() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override { return "ARM A15 S->D optimizer"; }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool runOnInstruction(MachineInstr *MI);
+
+    //
+    // Instruction builder helpers
+    //
+    unsigned createDupLane(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           const DebugLoc &DL, unsigned Reg, unsigned Lane,
+                           bool QPR = false);
+
+    unsigned createExtractSubreg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator InsertBefore,
+                                 const DebugLoc &DL, unsigned DReg,
+                                 unsigned Lane, const TargetRegisterClass *TRC);
+
+    unsigned createVExt(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertBefore,
+                        const DebugLoc &DL, unsigned Ssub0, unsigned Ssub1);
+
+    unsigned createRegSequence(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               const DebugLoc &DL, unsigned Reg1,
+                               unsigned Reg2);
+
+    unsigned createInsertSubreg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator InsertBefore,
+                                const DebugLoc &DL, unsigned DReg,
+                                unsigned Lane, unsigned ToInsert);
+
+    unsigned createImplicitDef(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               const DebugLoc &DL);
+
+    //
+    // Various property checkers
+    //
+    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
+    bool hasPartialWrite(MachineInstr *MI);
+    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
+    unsigned getDPRLaneFromSPR(unsigned SReg);
+
+    //
+    // Methods used for getting the definitions of partial registers
+    //
+
+    MachineInstr *elideCopies(MachineInstr *MI);
+    void elideCopiesAndPHIs(MachineInstr *MI,
+                            SmallVectorImpl<MachineInstr*> &Outs);
+
+    //
+    // Pattern optimization methods
+    //
+    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
+    unsigned optimizeSDPattern(MachineInstr *MI);
+    unsigned getPrefSPRLane(unsigned SReg);
+
+    //
+    // Sanitizing method - used to make sure if don't leave dead code around.
+    //
+    void eraseInstrWithNoUses(MachineInstr *MI);
+
+    //
+    // A map used to track the changes done by this pass.
+    //
+    std::map<MachineInstr*, unsigned> Replacements;
+    std::set<MachineInstr *> DeadInstr;
+  };
+  char A15SDOptimizer::ID = 0;
+} // end anonymous namespace
+
+// Returns true if this is a use of a SPR register.
+bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
+                                  const TargetRegisterClass *TRC) {
+  if (!MO.isReg())
+    return false;
+  unsigned Reg = MO.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
+  else
+    return TRC->contains(Reg);
+}
+
+unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
+                                           &ARM::DPRRegClass);
+  if (DReg != ARM::NoRegister) return ARM::ssub_1;
+  return ARM::ssub_0;
+}
+
+// Get the subreg type that is most likely to be coalesced
+// for an SPR register that will be used in VDUP32d pseudo.
+unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
+  if (!TRI->isVirtualRegister(SReg))
+    return getDPRLaneFromSPR(SReg);
+
+  MachineInstr *MI = MRI->getVRegDef(SReg);
+  if (!MI) return ARM::ssub_0;
+  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
+
+  assert(MO->isReg() && "Non-register operand found!");
+  if (!MO) return ARM::ssub_0;
+
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
+                                    &ARM::SPRRegClass)) {
+    SReg = MI->getOperand(1).getReg();
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
+    return ARM::ssub_0;
+  }
+  return getDPRLaneFromSPR(SReg);
+}
+
+// MI is known to be dead. Figure out what instructions
+// are also made dead by this and mark them for removal.
+void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
+  SmallVector<MachineInstr *, 8> Front;
+  DeadInstr.insert(MI);
+
+  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+  Front.push_back(MI);
+
+  while (Front.size() != 0) {
+    MI = Front.back();
+    Front.pop_back();
+
+    // MI is already known to be dead. We need to see
+    // if other instructions can also be removed.
+    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if ((!MO.isReg()) || (!MO.isUse()))
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TRI->isVirtualRegister(Reg))
+        continue;
+      MachineOperand *Op = MI->findRegisterDefOperand(Reg);
+
+      if (!Op)
+        continue;
+
+      MachineInstr *Def = Op->getParent();
+
+      // We don't need to do anything if we have already marked
+      // this instruction as being dead.
+      if (DeadInstr.find(Def) != DeadInstr.end())
+        continue;
+
+      // Check if all the uses of this instruction are marked as
+      // dead. If so, we can also mark this instruction as being
+      // dead.
+      bool IsDead = true;
+      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
+        MachineOperand &MODef = Def->getOperand(j);
+        if ((!MODef.isReg()) || (!MODef.isDef()))
+          continue;
+        unsigned DefReg = MODef.getReg();
+        if (!TRI->isVirtualRegister(DefReg)) {
+          IsDead = false;
+          break;
+        }
+        for (MachineRegisterInfo::use_instr_iterator
+             II = MRI->use_instr_begin(Reg), EE = MRI->use_instr_end();
+             II != EE; ++II) {
+          // We don't care about self references.
+          if (&*II == Def)
+            continue;
+          if (DeadInstr.find(&*II) == DeadInstr.end()) {
+            IsDead = false;
+            break;
+          }
+        }
+      }
+
+      if (!IsDead) continue;
+
+      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+      DeadInstr.insert(Def);
+    }
+  }
+}
+
+// Creates the more optimized patterns and generally does all the code
+// transformations in this pass.
+unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
+  if (MI->isCopy()) {
+    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
+  }
+
+  if (MI->isInsertSubreg()) {
+    unsigned DPRReg = MI->getOperand(1).getReg();
+    unsigned SPRReg = MI->getOperand(2).getReg();
+
+    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
+
+      if (DPRMI && SPRMI) {
+        // See if the first operand of this insert_subreg is IMPLICIT_DEF
+        MachineInstr *ECDef = elideCopies(DPRMI);
+        if (ECDef && ECDef->isImplicitDef()) {
+          // Another corner case - if we're inserting something that is purely
+          // a subreg copy of a DPR, just use that DPR.
+
+          MachineInstr *EC = elideCopies(SPRMI);
+          // Is it a subreg copy of ssub_0?
+          if (EC && EC->isCopy() &&
+              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
+            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+
+            // Find the thing we're subreg copying out of - is it of the same
+            // regclass as DPRMI? (i.e. a DPR or QPR).
+            unsigned FullReg = SPRMI->getOperand(1).getReg();
+            const TargetRegisterClass *TRC =
+              MRI->getRegClass(MI->getOperand(1).getReg());
+            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
+              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
+              eraseInstrWithNoUses(MI);
+              return FullReg;
+            }
+          }
+
+          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
+        }
+      }
+    }
+    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
+                                          &ARM::SPRRegClass)) {
+    // See if all bar one of the operands are IMPLICIT_DEF and insert the
+    // optimizer pattern accordingly.
+    unsigned NumImplicit = 0, NumTotal = 0;
+    unsigned NonImplicitReg = ~0U;
+
+    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
+      if (!MI->getOperand(I).isReg())
+        continue;
+      ++NumTotal;
+      unsigned OpReg = MI->getOperand(I).getReg();
+
+      if (!TRI->isVirtualRegister(OpReg))
+        break;
+
+      MachineInstr *Def = MRI->getVRegDef(OpReg);
+      if (!Def)
+        break;
+      if (Def->isImplicitDef())
+        ++NumImplicit;
+      else
+        NonImplicitReg = MI->getOperand(I).getReg();
+    }
+
+    if (NumImplicit == NumTotal - 1)
+      return optimizeAllLanesPattern(MI, NonImplicitReg);
+    else
+      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  llvm_unreachable("Unhandled update pattern!");
+}
+
+// Return true if this MachineInstr inserts a scalar (SPR) value into
+// a D or Q register.
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
+  // The only way we can do a partial register update is through a COPY,
+  // INSERT_SUBREG or REG_SEQUENCE.
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
+                                           &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  return false;
+}
+
+// Looks through full copies to get the instruction that defines the input
+// operand for MI.
+MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
+  if (!MI->isFullCopy())
+    return MI;
+  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+    return nullptr;
+  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
+  if (!Def)
+    return nullptr;
+  return elideCopies(Def);
+}
+
+// Look through full copies and PHIs to get the set of non-copy MachineInstrs
+// that can produce MI.
+void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
+                                        SmallVectorImpl<MachineInstr*> &Outs) {
+   // Looking through PHIs may create loops so we need to track what
+   // instructions we have visited before.
+   std::set<MachineInstr *> Reached;
+   SmallVector<MachineInstr *, 8> Front;
+   Front.push_back(MI);
+   while (Front.size() != 0) {
+     MI = Front.back();
+     Front.pop_back();
+
+     // If we have already explored this MachineInstr, ignore it.
+     if (Reached.find(MI) != Reached.end())
+       continue;
+     Reached.insert(MI);
+     if (MI->isPHI()) {
+       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+         unsigned Reg = MI->getOperand(I).getReg();
+         if (!TRI->isVirtualRegister(Reg)) {
+           continue;
+         }
+         MachineInstr *NewMI = MRI->getVRegDef(Reg);
+         if (!NewMI)
+           continue;
+         Front.push_back(NewMI);
+       }
+     } else if (MI->isFullCopy()) {
+       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+         continue;
+       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+       if (!NewMI)
+         continue;
+       Front.push_back(NewMI);
+     } else {
+       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+       Outs.push_back(MI);
+     }
+   }
+}
+
+// Return the DPR virtual registers that are read by this machine instruction
+// (if any).
+SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
+  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
+      MI->isKill())
+    return SmallVector<unsigned, 8>();
+
+  SmallVector<unsigned, 8> Defs;
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
+        !usesRegClass(MO, &ARM::QPRRegClass) &&
+        !usesRegClass(MO, &ARM::DPairRegClass)) // Treat DPair as QPR
+      continue;
+
+    Defs.push_back(MO.getReg());
+  }
+  return Defs;
+}
+
+// Creates a DPR register from an SPR one by using a VDUP.
+unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator InsertBefore,
+                                       const DebugLoc &DL, unsigned Reg,
+                                       unsigned Lane, bool QPR) {
+  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
+                                                  &ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
+                         Out)
+                   .addReg(Reg)
+                   .addImm(Lane));
+
+  return Out;
+}
+
+// Creates a SPR register from a DPR by copying the value in lane 0.
+unsigned A15SDOptimizer::createExtractSubreg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned DReg, unsigned Lane,
+    const TargetRegisterClass *TRC) {
+  unsigned Out = MRI->createVirtualRegister(TRC);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::COPY), Out)
+    .addReg(DReg, 0, Lane);
+
+  return Out;
+}
+
+// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
+unsigned A15SDOptimizer::createRegSequence(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned Reg1, unsigned Reg2) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::REG_SEQUENCE), Out)
+    .addReg(Reg1)
+    .addImm(ARM::dsub_0)
+    .addReg(Reg2)
+    .addImm(ARM::dsub_1);
+  return Out;
+}
+
+// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
+// and merges them into one DPR register.
+unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertBefore,
+                                    const DebugLoc &DL, unsigned Ssub0,
+                                    unsigned Ssub1) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(ARM::VEXTd32), Out)
+                   .addReg(Ssub0)
+                   .addReg(Ssub1)
+                   .addImm(1));
+  return Out;
+}
+
+unsigned A15SDOptimizer::createInsertSubreg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    const DebugLoc &DL, unsigned DReg, unsigned Lane, unsigned ToInsert) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::INSERT_SUBREG), Out)
+    .addReg(DReg)
+    .addReg(ToInsert)
+    .addImm(Lane);
+
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  const DebugLoc &DL) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
+  return Out;
+}
+
+// This function inserts instructions in order to optimize interactions between
+// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
+// lanes, and the using VEXT instructions to recompose the result.
+unsigned
+A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
+  MachineBasicBlock::iterator InsertPt(MI);
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock &MBB = *MI->getParent();
+  InsertPt++;
+  unsigned Out;
+
+  // DPair has the same length as QPR and also has two DPRs as subreg.
+  // Treat DPair as QPR.
+  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass) ||
+      MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPairRegClass)) {
+    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_0, &ARM::DPRRegClass);
+    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_1, &ARM::DPRRegClass);
+
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
+    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
+    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
+
+    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
+
+  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+  } else {
+    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
+           "Found unexpected regclass!");
+
+    unsigned PrefLane = getPrefSPRLane(Reg);
+    unsigned Lane;
+    switch (PrefLane) {
+      case ARM::ssub_0: Lane = 0; break;
+      case ARM::ssub_1: Lane = 1; break;
+      default: llvm_unreachable("Unknown preferred lane!");
+    }
+
+    // Treat DPair as QPR
+    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass) ||
+                   usesRegClass(MI->getOperand(0), &ARM::DPairRegClass);
+
+    Out = createImplicitDef(MBB, InsertPt, DL);
+    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
+    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
+    eraseInstrWithNoUses(MI);
+  }
+  return Out;
+}
+
+bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
+  // We look for instructions that write S registers that are then read as
+  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
+  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
+  // merge two SPR values to form a DPR register.  In order avoid false
+  // positives we make sure that there is an SPR producer so we look past
+  // COPY and PHI nodes to find it.
+  //
+  // The best code pattern for when an SPR producer is going to be used by a
+  // DPR or QPR consumer depends on whether the other lanes of the
+  // corresponding DPR/QPR are currently defined.
+  //
+  // We can handle these efficiently, depending on the type of
+  // pseudo-instruction that is producing the pattern
+  //
+  //   * COPY:          * VDUP all lanes and merge the results together
+  //                      using VEXTs.
+  //
+  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
+  //                      lane, and the other lane(s) of the DPR/QPR register
+  //                      that we are inserting in are undefined, use the
+  //                      original DPR/QPR value.
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+  //   * REG_SEQUENCE:  * If all except one of the input operands are
+  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
+  //                      defined input operand
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+
+  // First, get all the reads of D-registers done by this instruction.
+  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
+  bool Modified = false;
+
+  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
+     I != E; ++I) {
+    // Follow the def-use chain for this DPR through COPYs, and also through
+    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
+    // we can end up with multiple defs of this DPR.
+
+    SmallVector<MachineInstr *, 8> DefSrcs;
+    if (!TRI->isVirtualRegister(*I))
+      continue;
+    MachineInstr *Def = MRI->getVRegDef(*I);
+    if (!Def)
+      continue;
+
+    elideCopiesAndPHIs(Def, DefSrcs);
+
+    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
+      EE = DefSrcs.end(); II != EE; ++II) {
+      MachineInstr *MI = *II;
+
+      // If we've already analyzed and replaced this operand, don't do
+      // anything.
+      if (Replacements.find(MI) != Replacements.end())
+        continue;
+
+      // Now, work out if the instruction causes a SPR->DPR dependency.
+      if (!hasPartialWrite(MI))
+        continue;
+
+      // Collect all the uses of this MI's DPR def for updating later.
+      SmallVector<MachineOperand*, 8> Uses;
+      unsigned DPRDefReg = MI->getOperand(0).getReg();
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
+             E = MRI->use_end(); I != E; ++I)
+        Uses.push_back(&*I);
+
+      // We can optimize this.
+      unsigned NewReg = optimizeSDPattern(MI);
+
+      if (NewReg != 0) {
+        Modified = true;
+        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
+               E = Uses.end(); I != E; ++I) {
+          // Make sure to constrain the register class of the new register to
+          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+          // reference into a plain DPR, and that will end poorly. NewReg is
+          // always virtual here, so there will always be a matching subclass
+          // to find.
+          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
+          DEBUG(dbgs() << "Replacing operand "
+                       << **I << " with "
+                       << PrintReg(NewReg) << "\n");
+          (*I)->substVirtReg(NewReg, 0, *TRI);
+        }
+      }
+      Replacements[MI] = NewReg;
+    }
+  }
+  return Modified;
+}
+
+bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  const ARMSubtarget &STI = Fn.getSubtarget<ARMSubtarget>();
+  // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
+  // enabled when NEON is available.
+  if (!(STI.isCortexA15() && STI.hasNEON()))
+    return false;
+  TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  bool Modified = false;
+
+  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+
+  DeadInstr.clear();
+  Replacements.clear();
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+
+    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
+      MI != ME;) {
+      Modified |= runOnInstruction(&*MI++);
+    }
+
+  }
+
+  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
+                                            E = DeadInstr.end();
+                                            I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createA15SDOptimizerPass() {
+  return new A15SDOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
new file mode 100644
index 000000000000..be3048252bbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -0,0 +1,59 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARM_H
+#define LLVM_LIB_TARGET_ARM_ARM_H
+
+#include "llvm/Support/CodeGen.h"
+#include "ARMBasicBlockInfo.h"
+#include <functional>
+
+namespace llvm {
+
+class ARMAsmPrinter;
+class ARMBaseTargetMachine;
+class Function;
+class FunctionPass;
+class ImmutablePass;
+class MachineInstr;
+class MCInst;
+class PassRegistry;
+class TargetLowering;
+class TargetMachine;
+
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+FunctionPass *createA15SDOptimizerPass();
+FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMConstantIslandPass();
+FunctionPass *createMLxExpansionPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createARMOptimizeBarriersPass();
+FunctionPass *createThumb2SizeReductionPass(
+    std::function<bool(const Function &)> Ftor = nullptr);
+
+void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                  ARMAsmPrinter &AP);
+
+void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
+                      BasicBlockInfo &BBI);
+std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
+
+void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
new file mode 100644
index 000000000000..2a090faeee6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -0,0 +1,873 @@
+//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Helper classes.
+//
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+class Architecture<string fname, string aname, list<SubtargetFeature> features >
+  : SubtargetFeature<fname, "ARMArch", aname,
+                     !strconcat(aname, " architecture"), features>;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget state.
+//
+
+def ModeThumb  : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
+                                  "Thumb mode">;
+
+def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+                                     "Use software floating point features.">;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+                                   "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+                                   "Enable VFP3 instructions",
+                                   [FeatureVFP2]>;
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+                                   "Enable NEON instructions",
+                                   [FeatureVFP3]>;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+                                     "Enable Thumb2 instructions">;
+def FeatureNoARM  : SubtargetFeature<"noarm", "NoARM", "true",
+                                     "Does not support ARM mode execution",
+                                     [ModeThumb]>;
+def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
+                                     "Enable half-precision floating point">;
+def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+                                     "Enable VFP4 instructions",
+                                     [FeatureVFP3, FeatureFP16]>;
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+                                   "true", "Enable ARMv8 FP",
+                                   [FeatureVFP4]>;
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+                                       "Enable full half-precision floating point",
+                                       [FeatureFPARMv8]>;
+def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
+                                     "Restrict FP to 16 double registers">;
+def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
+                                     "Enable divide instructions">;
+def FeatureHWDivARM  : SubtargetFeature<"hwdiv-arm",
+                                        "HasHardwareDivideInARM", "true",
+                                      "Enable divide instructions in ARM mode">;
+def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
+                                 "Enable Thumb2 extract and pack instructions">;
+def FeatureDB     : SubtargetFeature<"db", "HasDataBarrier", "true",
+                                   "Has data barrier (dmb / dsb) instructions">;
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+                                      "Has v7 clrex instruction">;
+def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
+                                             "HasAcquireRelease", "true",
+                         "Has v8 acquire/release (lda/ldaex etc) instructions">;
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+                                         "FP compare + branch is slow">;
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+                          "Floating point unit supports single precision only">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+                           "Enable support for Performance Monitor extensions">;
+def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
+                          "Enable support for TrustZone security extensions">;
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+                          "Enable support for ARMv8-M Security Extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+                          "Enable support for Cryptography extensions",
+                          [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+                          "Enable support for CRC instructions">;
+// Not to be confused with FeatureHasRetAddrStack (return address stack)
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+                "Enable Reliability, Availability and Serviceability extensions">;
+def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
+                "Enable fast computation of positive address offsets">;
+
+
+// Cyclone has preferred instructions for zeroing VFP registers, which can
+// execute in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+// Whether or not it may be profitable to unpredicate certain instructions
+// during if conversion.
+def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
+                                              "IsProfitableToUnpredicate",
+                                              "true",
+                                              "Is profitable to unpredicate">;
+
+// Some targets (e.g. Swift) have microcoded VGETLNi32.
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+                                            "HasSlowVGETLNi32", "true",
+                                            "Has slow VGETLNi32 - prefer VMOV">;
+
+// Some targets (e.g. Swift) have microcoded VDUP32.
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
+                                         "Has slow VDUP32 - prefer VMOV">;
+
+// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
+// for scalar FP, as this allows more effective execution domain optimization.
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+                                           "true", "Prefer VMOVSR">;
+
+// Swift has ISHST barriers compatible with Atomic Release semantics but weaker
+// than ISH
+def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
+                                           "true", "Prefer ISHST barriers">;
+
+// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
+                                         "Has muxed AGU and NEON/FPU">;
+
+// On some targets, a VLDM/VSTM starting with an odd register number needs more
+// microops than single VLDRS.
+def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
+                     "true", "VLDM/VSTM starting with an odd register is slow">;
+
+// Some targets have a renaming dependency when loading into D subregisters.
+def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
+                                              "SlowLoadDSubregister", "true",
+                                              "Loading into D subregs is slow">;
+// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
+def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
+                                             "DontWidenVMOVS", "true",
+                                             "Don't widen VMOVS to VMOVD">;
+
+// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
+                                        "Expand VFP/NEON MLA/MLS instructions">;
+
+// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
+def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
+                                             "true", "Has VMLx hazards">;
+
+// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
+// VFP to NEON, as an execution domain optimization.
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
+                              "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations. This affects instruction selection and should
+// only be enabled if the handling of denormals is not important.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+                                        "true",
+                                        "Use NEON for single precision FP">;
+
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+                                             "true",
+                                             "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+                                              "NonpipelinedVFP", "true",
+                                          "VFP instructions are not pipelined">;
+
+// Some processors have FP multiply-accumulate instructions that don't
+// play nicely with other VFP / NEON instructions, and it's generally better
+// to just not use them.
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+                                         "Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+                                       "HasVMLxForwarding", "true",
+                                       "Has multiplier accumulator forwarding">;
+
+// Disable 32-bit to 16-bit narrowing for experimentation.
+def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
+                                             "Prefer 32-bit Thumb instrs">;
+
+/// Some instructions update CPSR partially, which can add false dependency for
+/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
+/// mapped to a separate physical register. Avoid partial CPSR update for these
+/// processors.
+def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
+                                               "AvoidCPSRPartialUpdate", "true",
+                                 "Avoid CPSR partial update for OOO execution">;
+
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+                                            "AvoidMOVsShifterOperand", "true",
+                                "Avoid movs instructions with shifter operand">;
+
+// Some processors perform return stack prediction. CodeGen should avoid issue
+// "normal" call instructions to callees which do not return.
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
+                                     "Has return address stack">;
+
+/// DSP extension.
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
+                              "Supports DSP instructions in ARM and/or Thumb2">;
+
+// Multiprocessing extension.
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+                                 "Supports Multiprocessing extension">;
+
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+                                 "HasVirtualization", "true",
+                                 "Supports Virtualization extension",
+                                 [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
+                                     "Is microcontroller profile ('M' series)">;
+
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+                                     "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+                                     "Is application profile ('A' series)">;
+
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+                                       "NaCl trap">;
+
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+                                          "StrictAlign", "true",
+                                          "Disallow all unaligned memory "
+                                          "access">;
+
+def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
+                                        "Generate calls via indirect call "
+                                        "instructions">;
+
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+                                        "Reserve R9, making it unavailable as "
+                                        "GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+                                     "Don't use movt/movw pairs for 32-bit "
+                                     "imms">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM ISAa.
+//
+
+def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
+                                   "Support ARM v4T instructions">;
+def HasV5TOps   : SubtargetFeature<"v5t", "HasV5TOps", "true",
+                                   "Support ARM v5T instructions",
+                                   [HasV4TOps]>;
+def HasV5TEOps  : SubtargetFeature<"v5te", "HasV5TEOps", "true",
+                             "Support ARM v5TE, v5TEj, and v5TExp instructions",
+                                   [HasV5TOps]>;
+def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
+                                   "Support ARM v6 instructions",
+                                   [HasV5TEOps]>;
+def HasV6MOps   : SubtargetFeature<"v6m", "HasV6MOps", "true",
+                                   "Support ARM v6M instructions",
+                                   [HasV6Ops]>;
+def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
+                                         "Support ARM v8M Baseline instructions",
+                                         [HasV6MOps]>;
+def HasV6KOps   : SubtargetFeature<"v6k", "HasV6KOps", "true",
+                                   "Support ARM v6k instructions",
+                                   [HasV6Ops]>;
+def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
+                                   "Support ARM v6t2 instructions",
+                                   [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
+def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
+                                   "Support ARM v7 instructions",
+                                   [HasV6T2Ops, FeaturePerfMon,
+                                    FeatureV7Clrex]>;
+def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
+                                   "Support ARM v8 instructions",
+                                   [HasV7Ops, FeatureAcquireRelease,
+                                    FeatureT2XtPk]>;
+def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
+                                   "Support ARM v8.1a instructions",
+                                   [HasV8Ops]>;
+def HasV8_2aOps   : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+                                   "Support ARM v8.2a instructions",
+                                   [HasV8_1aOps]>;
+def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+                                         "Support ARM v8M Mainline instructions",
+                                         [HasV7Ops]>;
+
+
+//===----------------------------------------------------------------------===//
+// ARM Processor subtarget features.
+//
+
+def ProcA5      : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
+                                   "Cortex-A5 ARM processors", []>;
+def ProcA7      : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
+                                   "Cortex-A7 ARM processors", []>;
+def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+                                   "Cortex-A8 ARM processors", []>;
+def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+                                   "Cortex-A9 ARM processors", []>;
+def ProcA12     : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
+                                   "Cortex-A12 ARM processors", []>;
+def ProcA15     : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+                                   "Cortex-A15 ARM processors", []>;
+def ProcA17     : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
+                                   "Cortex-A17 ARM processors", []>;
+def ProcA32     : SubtargetFeature<"a32", "ARMProcFamily", "CortexA32",
+                                   "Cortex-A32 ARM processors", []>;
+def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
+                                   "Cortex-A35 ARM processors", []>;
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors", []>;
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors", []>;
+def ProcA72     : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
+                                   "Cortex-A72 ARM processors", []>;
+def ProcA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
+                                   "Cortex-A73 ARM processors", []>;
+
+def ProcKrait   : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
+                                   "Qualcomm ARM processors", []>;
+def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+                                   "Swift ARM processors", []>;
+
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+                                    "Samsung Exynos-M1 processors", []>;
+
+def ProcR4      : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
+                                   "Cortex-R4 ARM processors", []>;
+def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
+                                   "Cortex-R5 ARM processors", []>;
+def ProcR7      : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7",
+                                   "Cortex-R7 ARM processors", []>;
+def ProcR52     : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
+                                   "Cortex-R52 ARM processors", []>;
+
+def ProcM3      : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
+                                   "Cortex-M3 ARM processors", []>;
+
+//===----------------------------------------------------------------------===//
+// ARM schedules.
+//
+
+include "ARMSchedule.td"
+
+
+//===----------------------------------------------------------------------===//
+// ARM architectures
+//
+
+def ARMv2     : Architecture<"armv2",     "ARMv2",    []>;
+
+def ARMv2a    : Architecture<"armv2a",    "ARMv2a",   []>;
+
+def ARMv3     : Architecture<"armv3",     "ARMv3",    []>;
+
+def ARMv3m    : Architecture<"armv3m",    "ARMv3m",   []>;
+
+def ARMv4     : Architecture<"armv4",     "ARMv4",    []>;
+
+def ARMv4t    : Architecture<"armv4t",    "ARMv4t",   [HasV4TOps]>;
+
+def ARMv5t    : Architecture<"armv5t",    "ARMv5t",   [HasV5TOps]>;
+
+def ARMv5te   : Architecture<"armv5te",   "ARMv5te",  [HasV5TEOps]>;
+
+def ARMv5tej  : Architecture<"armv5tej",  "ARMv5tej", [HasV5TEOps]>;
+
+def ARMv6     : Architecture<"armv6",     "ARMv6",    [HasV6Ops]>;
+
+def ARMv6t2   : Architecture<"armv6t2",   "ARMv6t2",  [HasV6T2Ops,
+                                                       FeatureDSP,
+                                                       FeatureT2XtPk]>;
+
+def ARMv6k    : Architecture<"armv6k",    "ARMv6k",   [HasV6KOps]>;
+
+def ARMv6kz   : Architecture<"armv6kz",   "ARMv6kz",  [HasV6KOps,
+                                                       FeatureTrustZone]>;
+
+def ARMv6m    : Architecture<"armv6-m",   "ARMv6m",   [HasV6MOps,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureMClass]>;
+
+def ARMv6sm   : Architecture<"armv6s-m",  "ARMv6sm",  [HasV6MOps,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureMClass]>;
+
+def ARMv7a    : Architecture<"armv7-a",   "ARMv7a",   [HasV7Ops,
+                                                       FeatureNEON,
+                                                       FeatureDB,
+                                                       FeatureDSP,
+                                                       FeatureAClass,
+                                                       FeatureT2XtPk]>;
+
+def ARMv7r    : Architecture<"armv7-r",   "ARMv7r",   [HasV7Ops,
+                                                       FeatureDB,
+                                                       FeatureDSP,
+                                                       FeatureHWDiv,
+                                                       FeatureRClass,
+                                                       FeatureT2XtPk]>;
+
+def ARMv7m    : Architecture<"armv7-m",   "ARMv7m",   [HasV7Ops,
+                                                       FeatureThumb2,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureHWDiv,
+                                                       FeatureMClass]>;
+
+def ARMv7em   : Architecture<"armv7e-m",  "ARMv7em",  [HasV7Ops,
+                                                       FeatureThumb2,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureHWDiv,
+                                                       FeatureMClass,
+                                                       FeatureDSP,
+                                                       FeatureT2XtPk]>;
+
+def ARMv8a    : Architecture<"armv8-a",   "ARMv8a",   [HasV8Ops,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCrypto,
+                                                       FeatureCRC]>;
+
+def ARMv81a   : Architecture<"armv8.1-a", "ARMv81a",  [HasV8_1aOps,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCrypto,
+                                                       FeatureCRC]>;
+
+def ARMv82a   : Architecture<"armv8.2-a", "ARMv82a",  [HasV8_2aOps,
+                                                       FeatureAClass,
+                                                       FeatureDB,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureCrypto,
+                                                       FeatureCRC,
+                                                       FeatureRAS]>;
+
+def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,
+                                                       FeatureRClass,
+                                                       FeatureDB,
+                                                       FeatureHWDiv,
+                                                       FeatureHWDivARM,
+                                                       FeatureT2XtPk,
+                                                       FeatureDSP,
+                                                       FeatureCRC,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureFPARMv8,
+                                                       FeatureNEON]>;
+
+def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
+                                                      [HasV8MBaselineOps,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureHWDiv,
+                                                       FeatureV7Clrex,
+                                                       Feature8MSecExt,
+                                                       FeatureAcquireRelease,
+                                                       FeatureMClass]>;
+
+def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
+                                                      [HasV8MMainlineOps,
+                                                       FeatureNoARM,
+                                                       FeatureDB,
+                                                       FeatureHWDiv,
+                                                       Feature8MSecExt,
+                                                       FeatureAcquireRelease,
+                                                       FeatureMClass]>;
+
+// Aliases
+def IWMMXT   : Architecture<"iwmmxt",      "ARMv5te",  [ARMv5te]>;
+def IWMMXT2  : Architecture<"iwmmxt2",     "ARMv5te",  [ARMv5te]>;
+def XScale   : Architecture<"xscale",      "ARMv5te",  [ARMv5te]>;
+def ARMv6j   : Architecture<"armv6j",      "ARMv7a",   [ARMv6]>;
+def ARMv7k   : Architecture<"armv7k",      "ARMv7a",   [ARMv7a]>;
+def ARMv7s   : Architecture<"armv7s",      "ARMv7a",   [ARMv7a]>;
+
+
+//===----------------------------------------------------------------------===//
+// ARM processors
+//
+
+// Dummy CPU, used to target architectures
+def : ProcNoItin<"generic",                             []>;
+
+def : ProcNoItin<"arm8",                                [ARMv4]>;
+def : ProcNoItin<"arm810",                              [ARMv4]>;
+def : ProcNoItin<"strongarm",                           [ARMv4]>;
+def : ProcNoItin<"strongarm110",                        [ARMv4]>;
+def : ProcNoItin<"strongarm1100",                       [ARMv4]>;
+def : ProcNoItin<"strongarm1110",                       [ARMv4]>;
+
+def : ProcNoItin<"arm7tdmi",                            [ARMv4t]>;
+def : ProcNoItin<"arm7tdmi-s",                          [ARMv4t]>;
+def : ProcNoItin<"arm710t",                             [ARMv4t]>;
+def : ProcNoItin<"arm720t",                             [ARMv4t]>;
+def : ProcNoItin<"arm9",                                [ARMv4t]>;
+def : ProcNoItin<"arm9tdmi",                            [ARMv4t]>;
+def : ProcNoItin<"arm920",                              [ARMv4t]>;
+def : ProcNoItin<"arm920t",                             [ARMv4t]>;
+def : ProcNoItin<"arm922t",                             [ARMv4t]>;
+def : ProcNoItin<"arm940t",                             [ARMv4t]>;
+def : ProcNoItin<"ep9312",                              [ARMv4t]>;
+
+def : ProcNoItin<"arm10tdmi",                           [ARMv5t]>;
+def : ProcNoItin<"arm1020t",                            [ARMv5t]>;
+
+def : ProcNoItin<"arm9e",                               [ARMv5te]>;
+def : ProcNoItin<"arm926ej-s",                          [ARMv5te]>;
+def : ProcNoItin<"arm946e-s",                           [ARMv5te]>;
+def : ProcNoItin<"arm966e-s",                           [ARMv5te]>;
+def : ProcNoItin<"arm968e-s",                           [ARMv5te]>;
+def : ProcNoItin<"arm10e",                              [ARMv5te]>;
+def : ProcNoItin<"arm1020e",                            [ARMv5te]>;
+def : ProcNoItin<"arm1022e",                            [ARMv5te]>;
+def : ProcNoItin<"xscale",                              [ARMv5te]>;
+def : ProcNoItin<"iwmmxt",                              [ARMv5te]>;
+
+def : Processor<"arm1136j-s",       ARMV6Itineraries,   [ARMv6]>;
+def : Processor<"arm1136jf-s",      ARMV6Itineraries,   [ARMv6,
+                                                         FeatureVFP2,
+                                                         FeatureHasSlowFPVMLx]>;
+
+def : Processor<"cortex-m0",        ARMV6Itineraries,   [ARMv6m]>;
+def : Processor<"cortex-m0plus",    ARMV6Itineraries,   [ARMv6m]>;
+def : Processor<"cortex-m1",        ARMV6Itineraries,   [ARMv6m]>;
+def : Processor<"sc000",            ARMV6Itineraries,   [ARMv6m]>;
+
+def : Processor<"arm1176jz-s",      ARMV6Itineraries,   [ARMv6kz]>;
+def : Processor<"arm1176jzf-s",     ARMV6Itineraries,   [ARMv6kz,
+                                                         FeatureVFP2,
+                                                         FeatureHasSlowFPVMLx]>;
+
+def : Processor<"mpcorenovfp",      ARMV6Itineraries,   [ARMv6k]>;
+def : Processor<"mpcore",           ARMV6Itineraries,   [ARMv6k,
+                                                         FeatureVFP2,
+                                                         FeatureHasSlowFPVMLx]>;
+
+def : Processor<"arm1156t2-s",      ARMV6Itineraries,   [ARMv6t2]>;
+def : Processor<"arm1156t2f-s",     ARMV6Itineraries,   [ARMv6t2,
+                                                         FeatureVFP2,
+                                                         FeatureHasSlowFPVMLx]>;
+
+// FIXME: A5 has currently the same Schedule model as A8
+def : ProcessorModel<"cortex-a5",   CortexA8Model,      [ARMv7a, ProcA5,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureTrustZone,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureMP,
+                                                         FeatureVFP4]>;
+
+def : ProcessorModel<"cortex-a7",   CortexA8Model,      [ARMv7a, ProcA7,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureTrustZone,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHasVMLxHazards,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureMP,
+                                                         FeatureVFP4,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureVirtualization]>;
+
+def : ProcessorModel<"cortex-a8",   CortexA8Model,      [ARMv7a, ProcA8,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureNonpipelinedVFP,
+                                                         FeatureTrustZone,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHasVMLxHazards,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureVMLxForwarding]>;
+
+def : ProcessorModel<"cortex-a9",   CortexA9Model,      [ARMv7a, ProcA9,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureTrustZone,
+                                                         FeatureHasVMLxHazards,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureFP16,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureExpandMLx,
+                                                         FeaturePreferVMOVSR,
+                                                         FeatureMuxedUnits,
+                                                         FeatureNEONForFPMovs,
+                                                         FeatureCheckVLDnAlign,
+                                                         FeatureMP]>;
+
+// FIXME: A12 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a12",  CortexA9Model,      [ARMv7a, ProcA12,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureTrustZone,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureVFP4,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureVirtualization,
+                                                         FeatureMP]>;
+
+// FIXME: A15 has currently the same Schedule model as A9.
+def : ProcessorModel<"cortex-a15",  CortexA9Model,      [ARMv7a, ProcA15,
+                                                         FeatureDontWidenVMOVS,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureMuxedUnits,
+                                                         FeatureTrustZone,
+                                                         FeatureVFP4,
+                                                         FeatureMP,
+                                                         FeatureCheckVLDnAlign,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureVirtualization]>;
+
+// FIXME: A17 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a17",  CortexA9Model,      [ARMv7a, ProcA17,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureTrustZone,
+                                                         FeatureMP,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureVFP4,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureVirtualization]>;
+
+// FIXME: krait has currently the same Schedule model as A9
+// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
+//        division features.
+def : ProcessorModel<"krait",       CortexA9Model,      [ARMv7a, ProcKrait,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureMuxedUnits,
+                                                         FeatureCheckVLDnAlign,
+                                                         FeatureVMLxForwarding,
+                                                         FeatureFP16,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureVFP4,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM]>;
+
+def : ProcessorModel<"swift",       SwiftModel,         [ARMv7a, ProcSwift,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureNEONForFP,
+                                                         FeatureVFP4,
+                                                         FeatureMP,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureAvoidMOVsShOp,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureHasVMLxHazards,
+                                                         FeatureProfUnpredicate,
+                                                         FeaturePrefISHSTBarrier,
+                                                         FeatureSlowOddRegister,
+                                                         FeatureSlowLoadDSubreg,
+                                                         FeatureSlowVGETLNi32,
+                                                         FeatureSlowVDUP32]>;
+
+// FIXME: R4 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r4",   CortexA8Model,      [ARMv7r, ProcR4,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureAvoidPartialCPSR]>;
+
+// FIXME: R4F has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r4f",  CortexA8Model,      [ARMv7r, ProcR4,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureVFP3,
+                                                         FeatureD16,
+                                                         FeatureAvoidPartialCPSR]>;
+
+// FIXME: R5 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r5",   CortexA8Model,      [ARMv7r, ProcR5,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureVFP3,
+                                                         FeatureD16,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHWDivARM,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureAvoidPartialCPSR]>;
+
+// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
+def : ProcessorModel<"cortex-r7",   CortexA8Model,      [ARMv7r, ProcR7,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureVFP3,
+                                                         FeatureD16,
+                                                         FeatureFP16,
+                                                         FeatureMP,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHWDivARM,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureAvoidPartialCPSR]>;
+
+def : ProcessorModel<"cortex-r8",   CortexA8Model,      [ARMv7r,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureVFP3,
+                                                         FeatureD16,
+                                                         FeatureFP16,
+                                                         FeatureMP,
+                                                         FeatureSlowFPBrcc,
+                                                         FeatureHWDivARM,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureAvoidPartialCPSR]>;
+
+def : ProcNoItin<"cortex-m3",                           [ARMv7m, ProcM3]>;
+def : ProcNoItin<"sc300",                               [ARMv7m, ProcM3]>;
+
+def : ProcNoItin<"cortex-m4",                           [ARMv7em,
+                                                         FeatureVFP4,
+                                                         FeatureVFPOnlySP,
+                                                         FeatureD16]>;
+
+def : ProcNoItin<"cortex-m7",                           [ARMv7em,
+                                                         FeatureFPARMv8,
+                                                         FeatureD16]>;
+
+def : ProcNoItin<"cortex-a32",                           [ARMv8a,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a35",                          [ARMv8a, ProcA35,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a53",                          [ARMv8a, ProcA53,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC,
+                                                         FeatureFPAO]>;
+
+def : ProcNoItin<"cortex-a57",                          [ARMv8a, ProcA57,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC,
+                                                         FeatureFPAO]>;
+
+def : ProcNoItin<"cortex-a72",                          [ARMv8a, ProcA72,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcNoItin<"cortex-a73",                          [ARMv8a, ProcA73,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+// Cyclone is very similar to swift
+def : ProcessorModel<"cyclone",     SwiftModel,         [ARMv8a, ProcSwift,
+                                                         FeatureHasRetAddrStack,
+                                                         FeatureNEONForFP,
+                                                         FeatureVFP4,
+                                                         FeatureMP,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureAvoidPartialCPSR,
+                                                         FeatureAvoidMOVsShOp,
+                                                         FeatureHasSlowFPVMLx,
+                                                         FeatureCrypto,
+                                                         FeatureZCZeroing]>;
+
+def : ProcNoItin<"exynos-m1",                           [ARMv8a, ProcExynosM1,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcNoItin<"exynos-m2",                           [ARMv8a, ProcExynosM1,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcNoItin<"exynos-m3",                           [ARMv8a, ProcExynosM1,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
+def : ProcessorModel<"cortex-r52", CortexR52Model,      [ARMv8r, ProcR52,
+                                                         FeatureFPAO]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARMAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int PassSubtarget = 1;
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+
+def ARMAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string Name = "ARM";
+  string BreakCharacters = ".";
+}
+
+def ARM : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = ARMInstrInfo;
+  let AssemblyWriters = [ARMAsmWriter];
+  let AssemblyParserVariants = [ARMAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
new file mode 100644
index 000000000000..f20768ab77a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -0,0 +1,2165 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMAsmPrinter.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr),
+      InConstantPool(false), OptimizationGoals(-1) {}
+
+void ARMAsmPrinter::EmitFunctionBodyEnd() {
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  if (AFI->isThumbFunction()) {
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+    OutStreamer->EmitThumbFunc(CurrentFnSym);
+  } else {
+    OutStreamer->EmitAssemblerFlag(MCAF_Code32);
+  }
+  OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
+  uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType());
+  assert(Size && "C++ constructor pointer had zero size!");
+
+  const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
+  assert(GV && "C++ constructor pointer was not a GlobalValue!");
+
+  const MCExpr *E = MCSymbolRefExpr::create(GetARMGVSymbol(GV,
+                                                           ARMII::MO_NO_FLAG),
+                                            (Subtarget->isTargetELF()
+                                             ? MCSymbolRefExpr::VK_ARM_TARGET1
+                                             : MCSymbolRefExpr::VK_None),
+                                            OutContext);
+
+  OutStreamer->EmitValue(E, Size);
+}
+
+void ARMAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (PromotedGlobals.count(GV))
+    // The global was promoted into a constant pool. It should not be emitted.
+    return;
+  AsmPrinter::EmitGlobalVariable(GV);
+}
+
+/// runOnMachineFunction - This uses the EmitInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  MCP = MF.getConstantPool();
+  Subtarget = &MF.getSubtarget<ARMSubtarget>();
+
+  SetupMachineFunction(MF);
+  const Function* F = MF.getFunction();
+  const TargetMachine& TM = MF.getTarget();
+
+  // Collect all globals that had their storage promoted to a constant pool.
+  // Functions are emitted before variables, so this accumulates promoted
+  // globals from all functions in PromotedGlobals.
+  for (auto *GV : AFI->getGlobalsPromotedToConstantPool())
+    PromotedGlobals.insert(GV);
+  
+  // Calculate this function's optimization goal.
+  unsigned OptimizationGoal;
+  if (F->hasFnAttribute(Attribute::OptimizeNone))
+    // For best debugging illusion, speed and small size sacrificed
+    OptimizationGoal = 6;
+  else if (F->optForMinSize())
+    // Aggressively for small size, speed and debug illusion sacrificed
+    OptimizationGoal = 4;
+  else if (F->optForSize())
+    // For small size, but speed and debugging illusion preserved
+    OptimizationGoal = 3;
+  else if (TM.getOptLevel() == CodeGenOpt::Aggressive)
+    // Aggressively for speed, small size and debug illusion sacrificed
+    OptimizationGoal = 2;
+  else if (TM.getOptLevel() > CodeGenOpt::None)
+    // For speed, but small size and good debug illusion preserved
+    OptimizationGoal = 1;
+  else // TM.getOptLevel() == CodeGenOpt::None
+    // For good debugging, but speed and small size preserved
+    OptimizationGoal = 5;
+
+  // Combine a new optimization goal with existing ones.
+  if (OptimizationGoals == -1) // uninitialized goals
+    OptimizationGoals = OptimizationGoal;
+  else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals
+    OptimizationGoals = 0;
+
+  if (Subtarget->isTargetCOFF()) {
+    bool Internal = F->hasInternalLinkage();
+    COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
+                                            : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+    int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+
+    OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer->EmitCOFFSymbolStorageClass(Scl);
+    OutStreamer->EmitCOFFSymbolType(Type);
+    OutStreamer->EndCOFFSymbolDef();
+  }
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // Emit the XRay table for this function.
+  EmitXRayTable();
+
+  // If we need V4T thumb mode Register Indirect Jump pads, emit them.
+  // These are created per function, rather than per TU, since it's
+  // relatively easy to exceed the thumb branch range within a TU.
+  if (! ThumbIndirectPads.empty()) {
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+    EmitAlignment(1);
+    for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) {
+      OutStreamer->EmitLabel(ThumbIndirectPads[i].second);
+      EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
+        .addReg(ThumbIndirectPads[i].first)
+        // Add predicate operands.
+        .addImm(ARMCC::AL)
+        .addReg(0));
+    }
+    ThumbIndirectPads.clear();
+  }
+
+  // We didn't modify anything.
+  return false;
+}
+
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned TF = MO.getTargetFlags();
+
+  switch (MO.getType()) {
+  default: llvm_unreachable("<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    if(ARM::GPRPairRegClass.contains(Reg)) {
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+      Reg = TRI->getSubReg(Reg, ARM::gsub_0);
+    }
+    O << ARMInstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#';
+    if (TF == ARMII::MO_LO16)
+      O << ":lower16:";
+    else if (TF == ARMII::MO_HI16)
+      O << ":upper16:";
+    O << Imm;
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    if (TF & ARMII::MO_LO16)
+      O << ":lower16:";
+    else if (TF & ARMII::MO_HI16)
+      O << ":upper16:";
+    GetARMGVSymbol(GV, TF)->print(O, MAI);
+
+    printOffset(MO.getOffset(), O);
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+    GetCPISymbol(MO.getIndex())->print(O, MAI);
+    break;
+  }
+}
+
+//===--------------------------------------------------------------------===//
+
+MCSymbol *ARMAsmPrinter::
+GetARMJTIPICJumpTableLabel(unsigned uid) const {
+  const DataLayout &DL = getDataLayout();
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI"
+                            << getFunctionNumber() << '_' << uid;
+  return OutContext.getOrCreateSymbol(Name);
+}
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode,
+                                    raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    case 'a': // Print as a memory address.
+      if (MI->getOperand(OpNum).isReg()) {
+        O << "["
+          << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
+          << "]";
+        return false;
+      }
+      LLVM_FALLTHROUGH;
+    case 'c': // Don't print "#" before an immediate operand.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << MI->getOperand(OpNum).getImm();
+      return false;
+    case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'y': // Print a VFP single precision register as indexed double.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        // Find the 'd' register that has this 's' register as a sub-register,
+        // and determine the lane number.
+        for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+          if (!ARM::DPRRegClass.contains(*SR))
+            continue;
+          bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg;
+          O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]");
+          return false;
+        }
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+      O << "{";
+      if (ARM::GPRPairRegClass.contains(RegBegin)) {
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
+        RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
+      }
+      O << ARMInstPrinter::getRegisterName(RegBegin);
+
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      unsigned RegOps = OpNum + 1;
+      while (MI->getOperand(RegOps).isReg()) {
+        O << ", "
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+        RegOps++;
+      }
+
+      O << "}";
+
+      return false;
+    }
+    case 'R': // The most significant register of a pair.
+    case 'Q': { // The least significant register of a pair.
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+
+      // This operand may not be the one that actually provides the register. If
+      // it's tied to a previous one then we should refer instead to that one
+      // for registers and their classes.
+      unsigned TiedIdx;
+      if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+        for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+          unsigned OpFlags = MI->getOperand(OpNum).getImm();
+          OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+        Flags = MI->getOperand(OpNum).getImm();
+
+        // Later code expects OpNum to be pointing at the register rather than
+        // the flags.
+        OpNum += 1;
+      }
+
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      unsigned RC;
+      InlineAsm::hasRegClassConstraint(Flags, RC);
+      if (RC == ARM::GPRPairRegClassID) {
+        if (NumVals != 1)
+          return true;
+        const MachineOperand &MO = MI->getOperand(OpNum);
+        if (!MO.isReg())
+          return true;
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+        unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+            ARM::gsub_0 : ARM::gsub_1);
+        O << ARMInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+
+    case 'e': // The low doubleword register of a NEON quad register.
+    case 'f': { // The high doubleword register of a NEON quad register.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (!ARM::QPRRegClass.contains(Reg))
+        return true;
+      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+      unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
+                                       ARM::dsub_0 : ARM::dsub_1);
+      O << ARMInstPrinter::getRegisterName(SubReg);
+      return false;
+    }
+
+    // This modifier is not yet supported.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+      return true;
+    case 'H': { // The highest-numbered register of a pair.
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (!MO.isReg())
+        return true;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+      unsigned Reg = MO.getReg();
+      if(!ARM::GPRPairRegClass.contains(Reg))
+        return false;
+      Reg = TRI->getSubReg(Reg, ARM::gsub_1);
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+      case 'A': // A memory operand for a VLD1/VST1 instruction.
+      default: return true;  // Unknown modifier.
+      case 'm': // The base register of a memory operand.
+        if (!MI->getOperand(OpNum).isReg())
+          return true;
+        O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg());
+        return false;
+    }
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+static bool isThumb(const MCSubtargetInfo& STI) {
+  return STI.getFeatureBits()[ARM::ModeThumb];
+}
+
+void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                                     const MCSubtargetInfo *EndInfo) const {
+  // If either end mode is unknown (EndInfo == NULL) or different than
+  // the start mode, then restore the start mode.
+  const bool WasThumb = isThumb(StartInfo);
+  if (!EndInfo || WasThumb != isThumb(*EndInfo)) {
+    OutStreamer->EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
+  }
+}
+
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+  // Use unified assembler syntax.
+  OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified);
+
+  // Emit ARM Build Attributes
+  if (TT.isOSBinFormatELF())
+    emitAttributes();
+
+  // Use the triple's architecture and subarchitecture to determine
+  // if we're thumb for the purposes of the top level code16 assembler
+  // flag.
+  bool isThumb = TT.getArch() == Triple::thumb ||
+                 TT.getArch() == Triple::thumbeb ||
+                 TT.getSubArch() == Triple::ARMSubArch_v7m ||
+                 TT.getSubArch() == Triple::ARMSubArch_v6m;
+  if (!M.getModuleInlineAsm().empty() && isThumb)
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+}
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
+
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+  if (TT.isOSBinFormatMachO()) {
+    // All darwin targets use mach-o.
+    const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output non-lazy-pointers for external and common global variables.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer->AddBlankLine();
+    }
+
+    Stubs = MMIMacho.GetThreadLocalGVStubList();
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
+      EmitAlignment(2);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer->AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (TT.isOSBinFormatCOFF()) {
+    const auto &TLOF =
+        static_cast<const TargetLoweringObjectFileCOFF &>(getObjFileLowering());
+
+    std::string Flags;
+    raw_string_ostream OS(Flags);
+
+    for (const auto &Function : M)
+      TLOF.emitLinkerFlagsForGlobal(OS, &Function);
+    for (const auto &Global : M.globals())
+      TLOF.emitLinkerFlagsForGlobal(OS, &Global);
+    for (const auto &Alias : M.aliases())
+      TLOF.emitLinkerFlagsForGlobal(OS, &Alias);
+
+    OS.flush();
+
+    // Output collected flags
+    if (!Flags.empty()) {
+      OutStreamer->SwitchSection(TLOF.getDrectveSection());
+      OutStreamer->EmitBytes(Flags);
+    }
+  }
+
+  // The last attribute to be emitted is ABI_optimization_goals
+  MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  if (OptimizationGoals > 0 &&
+      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+       Subtarget->isTargetMuslAEABI()))
+    ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals);
+  OptimizationGoals = -1;
+
+  ATS.finishAttributeSection();
+}
+
+static bool isV8M(const ARMSubtarget *Subtarget) {
+  // Note that v8M Baseline is a subset of v6T2!
+  return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) ||
+         Subtarget->hasV8MMainlineOps();
+}
+
+//===----------------------------------------------------------------------===//
+// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// FIXME:
+// The following seem like one-off assembler flags, but they actually need
+// to appear in the .ARM.attributes section in ELF.
+// Instead of subclassing the MCELFStreamer, we do the work here.
+
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+                                            const ARMSubtarget *Subtarget) {
+  if (CPU == "xscale")
+    return ARMBuildAttrs::v5TEJ;
+
+  if (Subtarget->hasV8Ops()) {
+    if (Subtarget->isRClass())
+      return ARMBuildAttrs::v8_R;
+    return ARMBuildAttrs::v8_A;
+  } else if (Subtarget->hasV8MMainlineOps())
+    return ARMBuildAttrs::v8_M_Main;
+  else if (Subtarget->hasV7Ops()) {
+    if (Subtarget->isMClass() && Subtarget->hasDSP())
+      return ARMBuildAttrs::v7E_M;
+    return ARMBuildAttrs::v7;
+  } else if (Subtarget->hasV6T2Ops())
+    return ARMBuildAttrs::v6T2;
+  else if (Subtarget->hasV8MBaselineOps())
+    return ARMBuildAttrs::v8_M_Base;
+  else if (Subtarget->hasV6MOps())
+    return ARMBuildAttrs::v6S_M;
+  else if (Subtarget->hasV6Ops())
+    return ARMBuildAttrs::v6;
+  else if (Subtarget->hasV5TEOps())
+    return ARMBuildAttrs::v5TE;
+  else if (Subtarget->hasV5TOps())
+    return ARMBuildAttrs::v5T;
+  else if (Subtarget->hasV4TOps())
+    return ARMBuildAttrs::v4T;
+  else
+    return ARMBuildAttrs::v4;
+}
+
+// Returns true if all functions have the same function attribute value.
+// It also returns true when the module has no functions.
+static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
+                                               StringRef Value) {
+  return !any_of(M, [&](const Function &F) {
+    return F.getFnAttribute(Attr).getValueAsString() != Value;
+  });
+}
+
+void ARMAsmPrinter::emitAttributes() {
+  MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  ATS.emitTextAttribute(ARMBuildAttrs::conformance, "2.09");
+
+  ATS.switchVendor("aeabi");
+
+  // Compute ARM ELF Attributes based on the default subtarget that
+  // we'd have constructed. The existing ARM behavior isn't LTO clean
+  // anyhow.
+  // FIXME: For ifunc related functions we could iterate over and look
+  // for a feature string that doesn't match the default one.
+  const Triple &TT = TM.getTargetTriple();
+  StringRef CPU = TM.getTargetCPU();
+  StringRef FS = TM.getTargetFeatureString();
+  std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = (Twine(ArchFS) + "," + FS).str();
+    else
+      ArchFS = FS;
+  }
+  const ARMBaseTargetMachine &ATM =
+      static_cast<const ARMBaseTargetMachine &>(TM);
+  const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
+
+  const std::string &CPUString = STI.getCPUString();
+
+  if (!StringRef(CPUString).startswith("generic")) {
+    // FIXME: remove krait check when GNU tools support krait cpu
+    if (STI.isKrait()) {
+      ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
+      // We consider krait as a "cortex-a9" + hwdiv CPU
+      // Enable hwdiv through ".arch_extension idiv"
+      if (STI.hasDivide() || STI.hasDivideInARMMode())
+        ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM);
+    } else
+      ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+  }
+
+  ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI));
+
+  // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
+  // profile is not applicable (e.g. pre v7, or cross-profile code)".
+  if (STI.hasV7Ops() || isV8M(&STI)) {
+    if (STI.isAClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::ApplicationProfile);
+    } else if (STI.isRClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::RealTimeProfile);
+    } else if (STI.isMClass()) {
+      ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                        ARMBuildAttrs::MicroControllerProfile);
+    }
+  }
+
+  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                    STI.hasARMOps() ? ARMBuildAttrs::Allowed
+                                    : ARMBuildAttrs::Not_Allowed);
+  if (isV8M(&STI)) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::AllowThumbDerived);
+  } else if (STI.isThumb1Only()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
+  } else if (STI.hasThumb2()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::AllowThumb32);
+  }
+
+  if (STI.hasNEON()) {
+    /* NEON is not exactly a VFP architecture, but GAS emit one of
+     * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+    if (STI.hasFPARMv8()) {
+      if (STI.hasCrypto())
+        ATS.emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8);
+      else
+        ATS.emitFPU(ARM::FK_NEON_FP_ARMV8);
+    } else if (STI.hasVFP4())
+      ATS.emitFPU(ARM::FK_NEON_VFPV4);
+    else
+      ATS.emitFPU(STI.hasFP16() ? ARM::FK_NEON_FP16 : ARM::FK_NEON);
+    // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+    if (STI.hasV8Ops())
+      ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                        STI.hasV8_1aOps() ? ARMBuildAttrs::AllowNeonARMv8_1a:
+                                            ARMBuildAttrs::AllowNeonARMv8);
+  } else {
+    if (STI.hasFPARMv8())
+      // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
+      // FPU, but there are two different names for it depending on the CPU.
+      ATS.emitFPU(STI.hasD16()
+                  ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16)
+                  : ARM::FK_FP_ARMV8);
+    else if (STI.hasVFP4())
+      ATS.emitFPU(STI.hasD16()
+                  ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16)
+                  : ARM::FK_VFPV4);
+    else if (STI.hasVFP3())
+      ATS.emitFPU(STI.hasD16()
+                  // +d16
+                  ? (STI.isFPOnlySP()
+                     ? (STI.hasFP16() ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD)
+                     : (STI.hasFP16() ? ARM::FK_VFPV3_D16_FP16 : ARM::FK_VFPV3_D16))
+                  // -d16
+                  : (STI.hasFP16() ? ARM::FK_VFPV3_FP16 : ARM::FK_VFPV3));
+    else if (STI.hasVFP2())
+      ATS.emitFPU(ARM::FK_VFPV2);
+  }
+
+  // RW data addressing.
+  if (isPositionIndependent()) {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+                      ARMBuildAttrs::AddressRWPCRel);
+  } else if (STI.isRWPI()) {
+    // RWPI specific attributes.
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+                      ARMBuildAttrs::AddressRWSBRel);
+  }
+
+  // RO data addressing.
+  if (isPositionIndependent() || STI.isROPI()) {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data,
+                      ARMBuildAttrs::AddressROPCRel);
+  }
+
+  // GOT use.
+  if (isPositionIndependent()) {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressGOT);
+  } else {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressDirect);
+  }
+
+  // Set FP Denormals.
+  if (checkFunctionsAttributeConsistency(*MMI->getModule(),
+                                         "denormal-fp-math",
+                                         "preserve-sign") ||
+      TM.Options.FPDenormalMode == FPDenormal::PreserveSign)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                      ARMBuildAttrs::PreserveFPSign);
+  else if (checkFunctionsAttributeConsistency(*MMI->getModule(),
+                                              "denormal-fp-math",
+                                              "positive-zero") ||
+           TM.Options.FPDenormalMode == FPDenormal::PositiveZero)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                      ARMBuildAttrs::PositiveZero);
+  else if (!TM.Options.UnsafeFPMath)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                      ARMBuildAttrs::IEEEDenormals);
+  else {
+    if (!STI.hasVFP2()) {
+      // When the target doesn't have an FPU (by design or
+      // intention), the assumptions made on the software support
+      // mirror that of the equivalent hardware support *if it
+      // existed*. For v7 and better we indicate that denormals are
+      // flushed preserving sign, and for V6 we indicate that
+      // denormals are flushed to positive zero.
+      if (STI.hasV7Ops())
+        ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                          ARMBuildAttrs::PreserveFPSign);
+    } else if (STI.hasVFP3()) {
+      // In VFPv4, VFPv4U, VFPv3, or VFPv3U, it is preserved. That is,
+      // the sign bit of the zero matches the sign bit of the input or
+      // result that is being flushed to zero.
+      ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                        ARMBuildAttrs::PreserveFPSign);
+    }
+    // For VFPv2 implementations it is implementation defined as
+    // to whether denormals are flushed to positive zero or to
+    // whatever the sign of zero is (ARM v7AR ARM 2.7.5). Historically
+    // LLVM has chosen to flush this to positive zero (most likely for
+    // GCC compatibility), so that's the chosen value here (the
+    // absence of its emission implies zero).
+  }
+
+  // Set FP exceptions and rounding
+  if (checkFunctionsAttributeConsistency(*MMI->getModule(),
+                                         "no-trapping-math", "true") ||
+      TM.Options.NoTrappingFPMath)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                      ARMBuildAttrs::Not_Allowed);
+  else if (!TM.Options.UnsafeFPMath) {
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed);
+
+    // If the user has permitted this code to choose the IEEE 754
+    // rounding at run-time, emit the rounding attribute.
+    if (TM.Options.HonorSignDependentRoundingFPMathOption)
+      ATS.emitAttribute(ARMBuildAttrs::ABI_FP_rounding, ARMBuildAttrs::Allowed);
+  }
+
+  // TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath is the
+  // equivalent of GCC's -ffinite-math-only flag.
+  if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::Allowed);
+  else
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::AllowIEE754);
+
+  if (STI.allowsUnalignedMem())
+    ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+                      ARMBuildAttrs::Allowed);
+  else
+    ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+                      ARMBuildAttrs::Not_Allowed);
+
+  // FIXME: add more flags to ARMBuildAttributes.h
+  // 8-bytes alignment stuff.
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
+
+  // ABI_HardFP_use attribute to indicate single precision FP.
+  if (STI.isFPOnlySP())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+                      ARMBuildAttrs::HardFPSinglePrecision);
+
+  // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+  if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
+  // FIXME: Should we signal R9 usage?
+
+  if (STI.hasFP16())
+    ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+  // FIXME: To support emitting this build attribute as GCC does, the
+  // -mfp16-format option and associated plumbing must be
+  // supported. For now the __fp16 type is exposed by default, so this
+  // attribute should be emitted with value 1.
+  ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format,
+                    ARMBuildAttrs::FP16FormatIEEE);
+
+  if (STI.hasMPExtension())
+    ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+  // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
+  // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
+  // It is not possible to produce DisallowDIV: if hwdiv is present in the base
+  // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
+  // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
+  // otherwise, the default value (AllowDIVIfExists) applies.
+  if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
+    ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+
+  if (STI.hasDSP() && isV8M(&STI))
+    ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
+
+  if (MMI) {
+    if (const Module *SourceModule = MMI->getModule()) {
+      // ABI_PCS_wchar_t to indicate wchar_t width
+      // FIXME: There is no way to emit value 0 (wchar_t prohibited).
+      if (auto WCharWidthValue = mdconst::extract_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("wchar_size"))) {
+        int WCharWidth = WCharWidthValue->getZExtValue();
+        assert((WCharWidth == 2 || WCharWidth == 4) &&
+               "wchar_t width must be 2 or 4 bytes");
+        ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_wchar_t, WCharWidth);
+      }
+
+      // ABI_enum_size to indicate enum width
+      // FIXME: There is no way to emit value 0 (enums prohibited) or value 3
+      //        (all enums contain a value needing 32 bits to encode).
+      if (auto EnumWidthValue = mdconst::extract_or_null<ConstantInt>(
+              SourceModule->getModuleFlag("min_enum_size"))) {
+        int EnumWidth = EnumWidthValue->getZExtValue();
+        assert((EnumWidth == 1 || EnumWidth == 4) &&
+               "Minimum enum width must be 1 or 4 bytes");
+        int EnumBuildAttr = EnumWidth == 1 ? 1 : 2;
+        ATS.emitAttribute(ARMBuildAttrs::ABI_enum_size, EnumBuildAttr);
+      }
+    }
+  }
+
+  // We currently do not support using R9 as the TLS pointer.
+  if (STI.isRWPI())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+                      ARMBuildAttrs::R9IsSB);
+  else if (STI.isR9Reserved())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+                      ARMBuildAttrs::R9Reserved);
+  else
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+                      ARMBuildAttrs::R9IsGPR);
+
+  if (STI.hasTrustZone() && STI.hasVirtualization())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowTZVirtualization);
+  else if (STI.hasTrustZone())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowTZ);
+  else if (STI.hasVirtualization())
+    ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                      ARMBuildAttrs::AllowVirtualization);
+}
+
+//===----------------------------------------------------------------------===//
+
+static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber,
+                             unsigned LabelId, MCContext &Ctx) {
+
+  MCSymbol *Label = Ctx.getOrCreateSymbol(Twine(Prefix)
+                       + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId));
+  return Label;
+}
+
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
+  switch (Modifier) {
+  case ARMCP::no_modifier:
+    return MCSymbolRefExpr::VK_None;
+  case ARMCP::TLSGD:
+    return MCSymbolRefExpr::VK_TLSGD;
+  case ARMCP::TPOFF:
+    return MCSymbolRefExpr::VK_TPOFF;
+  case ARMCP::GOTTPOFF:
+    return MCSymbolRefExpr::VK_GOTTPOFF;
+  case ARMCP::SBREL:
+    return MCSymbolRefExpr::VK_ARM_SBREL;
+  case ARMCP::GOT_PREL:
+    return MCSymbolRefExpr::VK_ARM_GOT_PREL;
+  case ARMCP::SECREL:
+    return MCSymbolRefExpr::VK_SECREL;
+  }
+  llvm_unreachable("Invalid ARMCPModifier!");
+}
+
+MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
+                                        unsigned char TargetFlags) {
+  if (Subtarget->isTargetMachO()) {
+    bool IsIndirect =
+        (TargetFlags & ARMII::MO_NONLAZY) && Subtarget->isGVIndirectSymbol(GV);
+
+    if (!IsIndirect)
+      return getSymbol(GV);
+
+    // FIXME: Remove this when Darwin transition to @GOT like syntax.
+    MCSymbol *MCSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    MachineModuleInfoMachO &MMIMachO =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+        GV->isThreadLocal() ? MMIMachO.getThreadLocalGVStubEntry(MCSym)
+                            : MMIMachO.getGVStubEntry(MCSym);
+
+    if (!StubSym.getPointer())
+      StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+                                                   !GV->hasInternalLinkage());
+    return MCSym;
+  } else if (Subtarget->isTargetCOFF()) {
+    assert(Subtarget->isTargetWindows() &&
+           "Windows is the only supported COFF target");
+
+    bool IsIndirect = (TargetFlags & ARMII::MO_DLLIMPORT);
+    if (!IsIndirect)
+      return getSymbol(GV);
+
+    SmallString<128> Name;
+    Name = "__imp_";
+    getNameWithPrefix(Name, GV);
+
+    return OutContext.getOrCreateSymbol(Name);
+  } else if (Subtarget->isTargetELF()) {
+    return getSymbol(GV);
+  }
+  llvm_unreachable("unexpected target");
+}
+
+void ARMAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  const DataLayout &DL = getDataLayout();
+  int Size = DL.getTypeAllocSize(MCPV->getType());
+
+  ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+
+  if (ACPV->isPromotedGlobal()) {
+    // This constant pool entry is actually a global whose storage has been
+    // promoted into the constant pool. This global may be referenced still
+    // by debug information, and due to the way AsmPrinter is set up, the debug
+    // info is immutable by the time we decide to promote globals to constant
+    // pools. Because of this, we need to ensure we emit a symbol for the global
+    // with private linkage (the default) so debug info can refer to it.
+    //
+    // However, if this global is promoted into several functions we must ensure
+    // we don't try and emit duplicate symbols!
+    auto *ACPC = cast<ARMConstantPoolConstant>(ACPV);
+    auto *GV = ACPC->getPromotedGlobal();
+    if (!EmittedPromotedGlobalLabels.count(GV)) {
+      MCSymbol *GVSym = getSymbol(GV);
+      OutStreamer->EmitLabel(GVSym);
+      EmittedPromotedGlobalLabels.insert(GV);
+    }
+    return EmitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
+  }
+
+  MCSymbol *MCSym;
+  if (ACPV->isLSDA()) {
+    MCSym = getCurExceptionSym();
+  } else if (ACPV->isBlockAddress()) {
+    const BlockAddress *BA =
+      cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+    MCSym = GetBlockAddressSymbol(BA);
+  } else if (ACPV->isGlobalValue()) {
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+
+    // On Darwin, const-pool entries may get the "FOO$non_lazy_ptr" mangling, so
+    // flag the global as MO_NONLAZY.
+    unsigned char TF = Subtarget->isTargetMachO() ? ARMII::MO_NONLAZY : 0;
+    MCSym = GetARMGVSymbol(GV, TF);
+  } else if (ACPV->isMachineBasicBlock()) {
+    const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+    MCSym = MBB->getSymbol();
+  } else {
+    assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+    auto Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+    MCSym = GetExternalSymbolSymbol(Sym);
+  }
+
+  // Create an MCSymbol for the reference.
+  const MCExpr *Expr =
+    MCSymbolRefExpr::create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+                            OutContext);
+
+  if (ACPV->getPCAdjustment()) {
+    MCSymbol *PCLabel =
+        getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+                    ACPV->getLabelId(), OutContext);
+    const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext);
+    PCRelExpr =
+      MCBinaryExpr::createAdd(PCRelExpr,
+                              MCConstantExpr::create(ACPV->getPCAdjustment(),
+                                                     OutContext),
+                              OutContext);
+    if (ACPV->mustAddCurrentAddress()) {
+      // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+      // label, so just emit a local label end reference that instead.
+      MCSymbol *DotSym = OutContext.createTempSymbol();
+      OutStreamer->EmitLabel(DotSym);
+      const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+      PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
+    }
+    Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
+  }
+  OutStreamer->EmitValue(Expr, Size);
+}
+
+void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
+  const MachineOperand &MO1 = MI->getOperand(1);
+  unsigned JTI = MO1.getIndex();
+
+  // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+  // ARM mode tables.
+  EmitAlignment(2);
+
+  // Emit a label for the jump table.
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+  OutStreamer->EmitLabel(JTISymbol);
+
+  // Mark the jump table as data-in-code.
+  OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    // Construct an MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr)
+    //
+    // For example, a table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .word (LBB0 - LJTI_0_0)
+    //    .word (LBB1 - LJTI_0_0)
+    const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+
+    if (isPositionIndependent() || Subtarget->isROPI())
+      Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol,
+                                                                   OutContext),
+                                     OutContext);
+    // If we're generating a table of Thumb addresses in static relocation
+    // model, we need to add one to keep interworking correctly.
+    else if (AFI->isThumbFunction())
+      Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext),
+                                     OutContext);
+    OutStreamer->EmitValue(Expr, 4);
+  }
+  // Mark the end of jump table data-in-code region.
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
+  const MachineOperand &MO1 = MI->getOperand(1);
+  unsigned JTI = MO1.getIndex();
+
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+  OutStreamer->EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(),
+                                                          OutContext);
+    // If this isn't a TBB or TBH, the entries are direct branch instructions.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2B)
+        .addExpr(MBBSymbolExpr)
+        .addImm(ARMCC::AL)
+        .addReg(0));
+  }
+}
+
+void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
+                                        unsigned OffsetWidth) {
+  assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width");
+  const MachineOperand &MO1 = MI->getOperand(1);
+  unsigned JTI = MO1.getIndex();
+
+  if (Subtarget->isThumb1Only())
+    EmitAlignment(2);
+  
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+  OutStreamer->EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  // Mark the jump table as data-in-code.
+  OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
+                                               : MCDR_DataRegionJT16);
+
+  for (auto MBB : JTBBs) {
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(),
+                                                          OutContext);
+    // Otherwise it's an offset from the dispatch instruction. Construct an
+    // MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TBBInstAddr + 4) / 2
+    //
+    // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .byte (LBB0 - (LCPI0_0 + 4)) / 2
+    //    .byte (LBB1 - (LCPI0_0 + 4)) / 2
+    // where LCPI0_0 is a label defined just before the TBB instruction using
+    // this table.
+    MCSymbol *TBInstPC = GetCPISymbol(MI->getOperand(0).getImm());
+    const MCExpr *Expr = MCBinaryExpr::createAdd(
+        MCSymbolRefExpr::create(TBInstPC, OutContext),
+        MCConstantExpr::create(4, OutContext), OutContext);
+    Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext);
+    Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
+                                   OutContext);
+    OutStreamer->EmitValue(Expr, OffsetWidth);
+  }
+  // Mark the end of jump table data-in-code region. 32-bit offsets use
+  // actual branch instructions here, so we don't mark those as a data-region
+  // at all.
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+
+  // Make sure the next instruction is 2-byte aligned.
+  EmitAlignment(1);
+}
+
+void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
+  assert(MI->getFlag(MachineInstr::FrameSetup) &&
+      "Only instruction which are involved into frame setup code are allowed");
+
+  MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
+
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned Opc = MI->getOpcode();
+  unsigned SrcReg, DstReg;
+
+  if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
+    // Two special cases:
+    // 1) tPUSH does not have src/dst regs.
+    // 2) for Thumb1 code we sometimes materialize the constant via constpool
+    // load. Yes, this is pretty fragile, but for now I don't see better
+    // way... :(
+    SrcReg = DstReg = ARM::SP;
+  } else {
+    SrcReg = MI->getOperand(1).getReg();
+    DstReg = MI->getOperand(0).getReg();
+  }
+
+  // Try to figure out the unwinding opcode out of src / dst regs.
+  if (MI->mayStore()) {
+    // Register saves.
+    assert(DstReg == ARM::SP &&
+           "Only stack pointer as a destination reg is supported");
+
+    SmallVector<unsigned, 4> RegList;
+    // Skip src & dst reg, and pred ops.
+    unsigned StartOp = 2 + 2;
+    // Use all the operands.
+    unsigned NumOffset = 0;
+
+    switch (Opc) {
+    default:
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    case ARM::tPUSH:
+      // Special case here: no src & dst reg, but two extra imp ops.
+      StartOp = 2; NumOffset = 2;
+    case ARM::STMDB_UPD:
+    case ARM::t2STMDB_UPD:
+    case ARM::VSTMDDB_UPD:
+      assert(SrcReg == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
+           i != NumOps; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        // Actually, there should never be any impdef stuff here. Skip it
+        // temporary to workaround PR11902.
+        if (MO.isImplicit())
+          continue;
+        RegList.push_back(MO.getReg());
+      }
+      break;
+    case ARM::STR_PRE_IMM:
+    case ARM::STR_PRE_REG:
+    case ARM::t2STR_PRE:
+      assert(MI->getOperand(2).getReg() == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      RegList.push_back(SrcReg);
+      break;
+    }
+    if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+      ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+  } else {
+    // Changes of stack / frame pointer.
+    if (SrcReg == ARM::SP) {
+      int64_t Offset = 0;
+      switch (Opc) {
+      default:
+        MI->dump();
+        llvm_unreachable("Unsupported opcode for unwinding information");
+      case ARM::MOVr:
+      case ARM::tMOVr:
+        Offset = 0;
+        break;
+      case ARM::ADDri:
+      case ARM::t2ADDri:
+        Offset = -MI->getOperand(2).getImm();
+        break;
+      case ARM::SUBri:
+      case ARM::t2SUBri:
+        Offset = MI->getOperand(2).getImm();
+        break;
+      case ARM::tSUBspi:
+        Offset = MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tADDspi:
+      case ARM::tADDrSPi:
+        Offset = -MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tLDRpci: {
+        // Grab the constpool index and check, whether it corresponds to
+        // original or cloned constpool entry.
+        unsigned CPI = MI->getOperand(1).getIndex();
+        const MachineConstantPool *MCP = MF.getConstantPool();
+        if (CPI >= MCP->getConstants().size())
+          CPI = AFI.getOriginalCPIdx(CPI);
+        assert(CPI != -1U && "Invalid constpool index");
+
+        // Derive the actual offset.
+        const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+        assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+        // FIXME: Check for user, it should be "add" instruction!
+        Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+        break;
+      }
+      }
+
+      if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
+        if (DstReg == FramePtr && FramePtr != ARM::SP)
+          // Set-up of the frame pointer. Positive values correspond to "add"
+          // instruction.
+          ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
+        else if (DstReg == ARM::SP) {
+          // Change of SP by an offset. Positive values correspond to "sub"
+          // instruction.
+          ATS.emitPad(Offset);
+        } else {
+          // Move of SP to a register.  Positive values correspond to an "add"
+          // instruction.
+          ATS.emitMovSP(DstReg, -Offset);
+        }
+      }
+    } else if (DstReg == ARM::SP) {
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+    else {
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+  }
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARMGenMCPseudoLowering.inc"
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  const DataLayout &DL = getDataLayout();
+  MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+
+  // Emit unwinding stuff for frame-related instructions
+  if (Subtarget->isTargetEHABICompatible() &&
+       MI->getFlag(MachineInstr::FrameSetup))
+    EmitUnwindingInstruction(MI);
+
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(*OutStreamer, MI))
+    return;
+
+  assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+         "Pseudo flag setting opcode should be expanded early");
+
+  // Check for manual lowerings.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
+  case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
+  case ARM::LEApcrel:
+  case ARM::tLEApcrel:
+  case ARM::t2LEApcrel: {
+    // FIXME: Need to also handle globals and externals
+    MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
+    EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() ==
+                                               ARM::t2LEApcrel ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::create(CPISymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(MI->getOperand(3).getReg()));
+    return;
+  }
+  case ARM::LEApcrelJT:
+  case ARM::tLEApcrelJT:
+  case ARM::t2LEApcrelJT: {
+    MCSymbol *JTIPICSymbol =
+      GetARMJTIPICJumpTableLabel(MI->getOperand(1).getIndex());
+    EmitToStreamer(*OutStreamer, MCInstBuilder(MI->getOpcode() ==
+                                               ARM::t2LEApcrelJT ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::create(JTIPICSymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(MI->getOperand(3).getReg()));
+    return;
+  }
+  // Darwin call instructions are just normal call instructions with different
+  // clobber semantics (they clobber R9).
+  case ARM::BX_CALL: {
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+  case ARM::tBX_CALL: {
+    if (Subtarget->hasV5TOps())
+      llvm_unreachable("Expected BLX to be selected for v5t+");
+
+    // On ARM v4t, when doing a call from thumb mode, we need to ensure
+    // that the saved lr has its LSB set correctly (the arch doesn't
+    // have blx).
+    // So here we generate a bl to a small jump pad that does bx rN.
+    // The jump pads are emitted after the function body.
+
+    unsigned TReg = MI->getOperand(0).getReg();
+    MCSymbol *TRegSym = nullptr;
+    for (unsigned i = 0, e = ThumbIndirectPads.size(); i < e; i++) {
+      if (ThumbIndirectPads[i].first == TReg) {
+        TRegSym = ThumbIndirectPads[i].second;
+        break;
+      }
+    }
+
+    if (!TRegSym) {
+      TRegSym = OutContext.createTempSymbol();
+      ThumbIndirectPads.push_back(std::make_pair(TReg, TRegSym));
+    }
+
+    // Create a link-saving branch to the Reg Indirect Jump Pad.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBL)
+        // Predicate comes first here.
+        .addImm(ARMCC::AL).addReg(0)
+        .addExpr(MCSymbolRefExpr::create(TRegSym, OutContext)));
+    return;
+  }
+  case ARM::BMOVPCRX_CALL: {
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::BMOVPCB_CALL: {
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    const MachineOperand &Op = MI->getOperand(0);
+    const GlobalValue *GV = Op.getGlobal();
+    const unsigned TF = Op.getTargetFlags();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc)
+      .addExpr(GVSymExpr)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16);
+    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+
+    unsigned TF = MI->getOperand(1).getTargetFlags();
+    const GlobalValue *GV = MI->getOperand(1).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+
+    MCSymbol *LabelSym =
+        getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+                    MI->getOperand(2).getImm(), OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
+    unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+    const MCExpr *PCRelExpr =
+      ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr,
+                                      MCBinaryExpr::createAdd(LabelSymExpr,
+                                      MCConstantExpr::create(PCAdj, OutContext),
+                                      OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::createReg(0));
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel
+                      ? ARM::MOVTi16 : ARM::t2MOVTi16);
+    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg()));
+
+    unsigned TF = MI->getOperand(2).getTargetFlags();
+    const GlobalValue *GV = MI->getOperand(2).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
+
+    MCSymbol *LabelSym =
+        getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(),
+                    MI->getOperand(3).getImm(), OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
+    unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+    const MCExpr *PCRelExpr =
+        ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr,
+                                   MCBinaryExpr::createAdd(LabelSymExpr,
+                                      MCConstantExpr::create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::createReg(0));
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::tPICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(2).getImm(), OutContext));
+
+    // Form and emit the add.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(2).getImm(), OutContext));
+
+    // Form and emit the add.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg())
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICSTR:
+  case ARM::PICSTRB:
+  case ARM::PICSTRH:
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICLDRH:
+  case ARM::PICLDRSB:
+  case ARM::PICLDRSH: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     OP r0, [pc, r0]
+    // The LCP0 label is referenced by a constant pool entry in order to get
+    // a PC-relative address at the ldr instruction.
+
+    // Emit the label.
+    OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(2).getImm(), OutContext));
+
+    // Form and emit the load
+    unsigned Opcode;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    case ARM::PICSTR:   Opcode = ARM::STRrs; break;
+    case ARM::PICSTRB:  Opcode = ARM::STRBrs; break;
+    case ARM::PICSTRH:  Opcode = ARM::STRH; break;
+    case ARM::PICLDR:   Opcode = ARM::LDRrs; break;
+    case ARM::PICLDRB:  Opcode = ARM::LDRBrs; break;
+    case ARM::PICLDRH:  Opcode = ARM::LDRH; break;
+    case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
+    case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
+    }
+    EmitToStreamer(*OutStreamer, MCInstBuilder(Opcode)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(0)
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg()));
+
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: {
+    /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+    /// in the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    /// The required alignment is specified on the basic block holding this MI.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer->EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(DL, MCPE.Val.ConstVal);
+    return;
+  }
+  case ARM::JUMPTABLE_ADDRS:
+    EmitJumpTableAddrs(MI);
+    return;
+  case ARM::JUMPTABLE_INSTS:
+    EmitJumpTableInsts(MI);
+    return;
+  case ARM::JUMPTABLE_TBB:
+  case ARM::JUMPTABLE_TBH:
+    EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
+    return;
+  case ARM::t2BR_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::t2TBB_JT:
+  case ARM::t2TBH_JT: {
+    unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH;
+    // Lower and emit the PC label, then the instruction itself.
+    OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+                                     .addReg(MI->getOperand(0).getReg())
+                                     .addReg(MI->getOperand(1).getReg())
+                                     // Add predicate operands.
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    return;
+  }
+  case ARM::tTBB_JT:
+  case ARM::tTBH_JT: {
+
+    bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT;
+    unsigned Base = MI->getOperand(0).getReg();
+    unsigned Idx = MI->getOperand(1).getReg();
+    assert(MI->getOperand(1).isKill() && "We need the index register as scratch!");
+
+    // Multiply up idx if necessary.
+    if (!Is8Bit)
+      EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+                                       .addReg(Idx)
+                                       .addReg(ARM::CPSR)
+                                       .addReg(Idx)
+                                       .addImm(1)
+                                       // Add predicate operands.
+                                       .addImm(ARMCC::AL)
+                                       .addReg(0));
+
+    if (Base == ARM::PC) {
+      // TBB [base, idx] =
+      //    ADDS idx, idx, base
+      //    LDRB idx, [idx, #4] ; or LDRH if TBH
+      //    LSLS idx, #1
+      //    ADDS pc, pc, idx
+
+      // When using PC as the base, it's important that there is no padding
+      // between the last ADDS and the start of the jump table. The jump table
+      // is 4-byte aligned, so we ensure we're 4 byte aligned here too.
+      //
+      // FIXME: Ideally we could vary the LDRB index based on the padding
+      // between the sequence and jump table, however that relies on MCExprs
+      // for load indexes which are currently not supported.
+      OutStreamer->EmitCodeAlignment(4);
+      EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+                                       .addReg(Idx)
+                                       .addReg(Idx)
+                                       .addReg(Base)
+                                       // Add predicate operands.
+                                       .addImm(ARMCC::AL)
+                                       .addReg(0));
+
+      unsigned Opc = Is8Bit ? ARM::tLDRBi : ARM::tLDRHi;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+                                       .addReg(Idx)
+                                       .addReg(Idx)
+                                       .addImm(Is8Bit ? 4 : 2)
+                                       // Add predicate operands.
+                                       .addImm(ARMCC::AL)
+                                       .addReg(0));
+    } else {
+      // TBB [base, idx] =
+      //    LDRB idx, [base, idx] ; or LDRH if TBH
+      //    LSLS idx, #1
+      //    ADDS pc, pc, idx
+
+      unsigned Opc = Is8Bit ? ARM::tLDRBr : ARM::tLDRHr;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+                                       .addReg(Idx)
+                                       .addReg(Base)
+                                       .addReg(Idx)
+                                       // Add predicate operands.
+                                       .addImm(ARMCC::AL)
+                                       .addReg(0));
+    }
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
+                                     .addReg(Idx)
+                                     .addReg(ARM::CPSR)
+                                     .addReg(Idx)
+                                     .addImm(1)
+                                     // Add predicate operands.
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+
+    OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
+                                     .addReg(ARM::PC)
+                                     .addReg(ARM::PC)
+                                     .addReg(Idx)
+                                     // Add predicate operands.
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    return;
+  }
+  case ARM::tBR_JTr:
+  case ARM::BR_JTr: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // mov pc, target
+    MCInst TmpInst;
+    unsigned Opc = MI->getOpcode() == ARM::BR_JTr ?
+      ARM::MOVr : ARM::tMOVr;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    if (Opc == ARM::MOVr)
+      TmpInst.addOperand(MCOperand::createReg(0));
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::BR_JTm: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // ldr pc, target
+    MCInst TmpInst;
+    if (MI->getOperand(1).getReg() == 0) {
+      // literal offset
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(2).getImm()));
+    } else {
+      TmpInst.setOpcode(ARM::LDRrs);
+      TmpInst.addOperand(MCOperand::createReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(1).getReg()));
+      TmpInst.addOperand(MCOperand::createImm(0));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case ARM::BR_JTadd: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // add pc, target, idx
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::SPACE:
+    OutStreamer->EmitZeros(MI->getOperand(1).getImm());
+    return;
+  case ARM::TRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetMachO()) {
+      uint32_t Val = 0xe7ffdefeUL;
+      OutStreamer->AddComment("trap");
+      ATS.emitInst(Val);
+      return;
+    }
+    break;
+  }
+  case ARM::TRAPNaCl: {
+    uint32_t Val = 0xe7fedef0UL;
+    OutStreamer->AddComment("trap");
+    ATS.emitInst(Val);
+    return;
+  }
+  case ARM::tTRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetMachO()) {
+      uint16_t Val = 0xdefe;
+      OutStreamer->AddComment("trap");
+      ATS.emitInst(Val, 'n');
+      return;
+    }
+    break;
+  }
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // mov $val, pc
+    // adds $val, #7
+    // str $val, [$src, #4]
+    // movs r0, #0
+    // b LSJLJEH
+    // movs r0, #1
+    // LSJLJEH:
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+    MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true);
+    OutStreamer->AddComment("eh_setjmp begin");
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDi3)
+      .addReg(ValReg)
+      // 's' bit operand
+      .addReg(ARM::CPSR)
+      .addReg(ValReg)
+      .addImm(7)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tSTRi)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      // The offset immediate is #4. The operand value is scaled by 4 for the
+      // tSTR instruction.
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    const MCExpr *SymbolExpr = MCSymbolRefExpr::create(Label, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tB)
+      .addExpr(SymbolExpr)
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer->AddComment("eh_setjmp end");
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer->EmitLabel(Label);
+    return;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::Int_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // add $val, pc, #8
+    // str $val, [$src, #+4]
+    // mov r0, #0
+    // add pc, pc, #0
+    // mov r0, #1
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+
+    OutStreamer->AddComment("eh_setjmp begin");
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::STRi12)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDri)
+      .addReg(ARM::PC)
+      .addReg(ARM::PC)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    OutStreamer->AddComment("eh_setjmp end");
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+    return;
+  }
+  case ARM::Int_eh_sjlj_longjmp: {
+    // ldr sp, [$src, #8]
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::SP)
+      .addReg(SrcReg)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::BX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::tInt_eh_sjlj_longjmp: {
+    // ldr $scratch, [$src, #8]
+    // mov sp, $scratch
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      // The offset immediate is #8. The operand value is scaled by 4 for the
+      // tLDR instruction.
+      .addImm(2)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::SP)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLDRi)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::tInt_WIN_eh_sjlj_longjmp: {
+    // ldr.w r11, [$src, #0]
+    // ldr.w  sp, [$src, #8]
+    // ldr.w  pc, [$src, #4]
+
+    unsigned SrcReg = MI->getOperand(0).getReg();
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::R11)
+                                     .addReg(SrcReg)
+                                     .addImm(0)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::SP)
+                                     .addReg(SrcReg)
+                                     .addImm(8)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2LDRi12)
+                                     .addReg(ARM::PC)
+                                     .addReg(SrcReg)
+                                     .addImm(4)
+                                     // Predicate
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
+    return;
+  }
+  case ARM::PATCHABLE_FUNCTION_ENTER:
+    LowerPATCHABLE_FUNCTION_ENTER(*MI);
+    return;
+  case ARM::PATCHABLE_FUNCTION_EXIT:
+    LowerPATCHABLE_FUNCTION_EXIT(*MI);
+    return;
+  case ARM::PATCHABLE_TAIL_CALL:
+    LowerPATCHABLE_TAIL_CALL(*MI);
+    return;
+  }
+
+  MCInst TmpInst;
+  LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
+
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmPrinter() {
+  RegisterAsmPrinter<ARMAsmPrinter> X(getTheARMLETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> Y(getTheARMBETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> A(getTheThumbLETarget());
+  RegisterAsmPrinter<ARMAsmPrinter> B(getTheThumbBETarget());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
new file mode 100644
index 000000000000..ce0b04d56d9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -0,0 +1,161 @@
+//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
+#define LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMFunctionInfo;
+class MCOperand;
+class MachineConstantPool;
+class MachineOperand;
+class MCSymbol;
+
+namespace ARM {
+  enum DW_ISA {
+    DW_ISA_ARM_thumb = 1,
+    DW_ISA_ARM_arm = 2
+  };
+}
+
+class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const ARMSubtarget *Subtarget;
+
+  /// AFI - Keep a pointer to ARMFunctionInfo for the current
+  /// MachineFunction.
+  ARMFunctionInfo *AFI;
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+
+  /// ThumbIndirectPads - These maintain a per-function list of jump pad
+  /// labels used for ARMv4t thumb code to make register indirect calls.
+  SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads;
+
+  /// OptimizationGoals - Maintain a combined optimization goal for all
+  /// functions in a module: one of Tag_ABI_optimization_goals values,
+  /// -1 if uninitialized, 0 if conflicting goals
+  int OptimizationGoals;
+
+  /// List of globals that have had their storage promoted to a constant
+  /// pool. This lives between calls to runOnMachineFunction and collects
+  /// data from every MachineFunction. It is used during doFinalization
+  /// when all non-function globals are emitted.
+  SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
+  /// Set of globals in PromotedGlobals that we've emitted labels for.
+  /// We need to emit labels even for promoted globals so that DWARF
+  /// debug info can link properly.
+  SmallPtrSet<const GlobalVariable*,2> EmittedPromotedGlobalLabels;
+
+public:
+  explicit ARMAsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer);
+
+  StringRef getPassName() const override {
+    return "ARM Assembly Printer";
+  }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                        const MCSubtargetInfo *EndInfo) const override;
+
+  void EmitJumpTableAddrs(const MachineInstr *MI);
+  void EmitJumpTableInsts(const MachineInstr *MI);
+  void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
+  void EmitInstruction(const MachineInstr *MI) override;
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  void EmitConstantPool() override {
+    // we emit constant pools customly!
+  }
+  void EmitFunctionBodyEnd() override;
+  void EmitFunctionEntryLabel() override;
+  void EmitStartOfAsmFile(Module &M) override;
+  void EmitEndOfAsmFile(Module &M) override;
+  void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
+  
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+  //===------------------------------------------------------------------===//
+  // XRay implementation
+  //===------------------------------------------------------------------===//
+public:
+  // XRay-specific lowering for ARM.
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+  void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+  // Helper function that emits the XRay sleds we've collected for a particular
+  // function.
+  void EmitXRayTable();
+
+private:
+  void EmitSled(const MachineInstr &MI, SledKind Kind);
+
+  // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+  void emitAttributes();
+
+  // Generic helper used to emit e.g. ARMv5 mul pseudos
+  void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
+
+  void EmitUnwindingInstruction(const MachineInstr *MI);
+
+  // emitPseudoExpansionLowering - tblgen'erated.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+public:
+  unsigned getISAEncoding() override {
+    // ARM/Darwin adds ISA to the DWARF info for each function.
+    const Triple &TT = TM.getTargetTriple();
+    if (!TT.isOSBinFormatMachO())
+      return 0;
+    bool isThumb = TT.getArch() == Triple::thumb ||
+                   TT.getArch() == Triple::thumbeb ||
+                   TT.getSubArch() == Triple::ARMSubArch_v7m ||
+                   TT.getSubArch() == Triple::ARMSubArch_v6m;
+    return isThumb ? ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
+  }
+
+private:
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const;
+
+  MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags);
+
+public:
+  /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+  /// the .s file.
+  void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 000000000000..70a3246e34f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,4707 @@
+//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
+#include "ARMHazardRecognizer.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "ARMGenInstrInfo.inc"
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+               cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+/// ARM_MLxEntry - Record information about MLA / MLS instructions.
+struct ARM_MLxEntry {
+  uint16_t MLxOpc;     // MLA / MLS opcode
+  uint16_t MulOpc;     // Expanded multiplication opcode
+  uint16_t AddSubOpc;  // Expanded add / sub opcode
+  bool NegAcc;         // True if the acc is negated before the add / sub.
+  bool HasLane;        // True if instruction has an extra "lane" operand.
+};
+
+static const ARM_MLxEntry ARM_MLxTable[] = {
+  // MLxOpc,          MulOpc,           AddSubOpc,       NegAcc, HasLane
+  // fp scalar ops
+  { ARM::VMLAS,       ARM::VMULS,       ARM::VADDS,      false,  false },
+  { ARM::VMLSS,       ARM::VMULS,       ARM::VSUBS,      false,  false },
+  { ARM::VMLAD,       ARM::VMULD,       ARM::VADDD,      false,  false },
+  { ARM::VMLSD,       ARM::VMULD,       ARM::VSUBD,      false,  false },
+  { ARM::VNMLAS,      ARM::VNMULS,      ARM::VSUBS,      true,   false },
+  { ARM::VNMLSS,      ARM::VMULS,       ARM::VSUBS,      true,   false },
+  { ARM::VNMLAD,      ARM::VNMULD,      ARM::VSUBD,      true,   false },
+  { ARM::VNMLSD,      ARM::VMULD,       ARM::VSUBD,      true,   false },
+
+  // fp SIMD ops
+  { ARM::VMLAfd,      ARM::VMULfd,      ARM::VADDfd,     false,  false },
+  { ARM::VMLSfd,      ARM::VMULfd,      ARM::VSUBfd,     false,  false },
+  { ARM::VMLAfq,      ARM::VMULfq,      ARM::VADDfq,     false,  false },
+  { ARM::VMLSfq,      ARM::VMULfq,      ARM::VSUBfq,     false,  false },
+  { ARM::VMLAslfd,    ARM::VMULslfd,    ARM::VADDfd,     false,  true  },
+  { ARM::VMLSslfd,    ARM::VMULslfd,    ARM::VSUBfd,     false,  true  },
+  { ARM::VMLAslfq,    ARM::VMULslfq,    ARM::VADDfq,     false,  true  },
+  { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
+};
+
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
+  : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+    Subtarget(STI) {
+  for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
+    if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
+      llvm_unreachable("Duplicated entries?");
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
+  }
+}
+
+// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl
+// currently defaults to no prepass hazard recognizer.
+ScheduleHazardRecognizer *
+ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                                               const ScheduleDAG *DAG) const {
+  if (usePreRAHazardRecognizer()) {
+    const InstrItineraryData *II =
+        static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
+  }
+  return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
+}
+
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  if (Subtarget.isThumb2() || Subtarget.hasVFP2())
+    return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
+    MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+  // FIXME: Thumb2 support.
+
+  if (!EnableARM3Addr)
+    return nullptr;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  bool isPre = false;
+  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+  default: return nullptr;
+  case ARMII::IndexModePre:
+    isPre = true;
+    break;
+  case ARMII::IndexModePost:
+    break;
+  }
+
+  // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+  // operation.
+  unsigned MemOpc = getUnindexedOpcode(MI.getOpcode());
+  if (MemOpc == 0)
+    return nullptr;
+
+  MachineInstr *UpdateMI = nullptr;
+  MachineInstr *MemMI = nullptr;
+  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned NumOps = MCID.getNumOperands();
+  bool isLoad = !MI.mayStore();
+  const MachineOperand &WB = isLoad ? MI.getOperand(1) : MI.getOperand(0);
+  const MachineOperand &Base = MI.getOperand(2);
+  const MachineOperand &Offset = MI.getOperand(NumOps - 3);
+  unsigned WBReg = WB.getReg();
+  unsigned BaseReg = Base.getReg();
+  unsigned OffReg = Offset.getReg();
+  unsigned OffImm = MI.getOperand(NumOps - 2).getImm();
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI.getOperand(NumOps - 1).getImm();
+  switch (AddrMode) {
+  default: llvm_unreachable("Unknown indexed op!");
+  case ARMII::AddrMode2: {
+    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+    if (OffReg == 0) {
+      if (ARM_AM::getSOImmVal(Amt) == -1)
+        // Can't encode it in a so_imm operand. This transformation will
+        // add more than 1 instruction. Abandon!
+        return nullptr;
+      UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+                     .addReg(BaseReg)
+                     .addImm(Amt)
+                     .addImm(Pred)
+                     .addReg(0)
+                     .addReg(0);
+    } else if (Amt != 0) {
+      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+      UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+                         get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
+                     .addReg(BaseReg)
+                     .addReg(OffReg)
+                     .addReg(0)
+                     .addImm(SOOpc)
+                     .addImm(Pred)
+                     .addReg(0)
+                     .addReg(0);
+    } else
+      UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+                     .addReg(BaseReg)
+                     .addReg(OffReg)
+                     .addImm(Pred)
+                     .addReg(0)
+                     .addReg(0);
+    break;
+  }
+  case ARMII::AddrMode3 : {
+    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+    if (OffReg == 0)
+      // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+      UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+                     .addReg(BaseReg)
+                     .addImm(Amt)
+                     .addImm(Pred)
+                     .addReg(0)
+                     .addReg(0);
+    else
+      UpdateMI = BuildMI(MF, MI.getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+                     .addReg(BaseReg)
+                     .addReg(OffReg)
+                     .addImm(Pred)
+                     .addReg(0)
+                     .addReg(0);
+    break;
+  }
+  }
+
+  std::vector<MachineInstr*> NewMIs;
+  if (isPre) {
+    if (isLoad)
+      MemMI =
+          BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+              .addReg(WBReg)
+              .addImm(0)
+              .addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+                  .addReg(MI.getOperand(1).getReg())
+                  .addReg(WBReg)
+                  .addReg(0)
+                  .addImm(0)
+                  .addImm(Pred);
+    NewMIs.push_back(MemMI);
+    NewMIs.push_back(UpdateMI);
+  } else {
+    if (isLoad)
+      MemMI =
+          BuildMI(MF, MI.getDebugLoc(), get(MemOpc), MI.getOperand(0).getReg())
+              .addReg(BaseReg)
+              .addImm(0)
+              .addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI.getDebugLoc(), get(MemOpc))
+                  .addReg(MI.getOperand(1).getReg())
+                  .addReg(BaseReg)
+                  .addReg(0)
+                  .addImm(0)
+                  .addImm(Pred);
+    if (WB.isDead())
+      UpdateMI->getOperand(0).setIsDead();
+    NewMIs.push_back(UpdateMI);
+    NewMIs.push_back(MemMI);
+  }
+
+  // Transfer LiveVariables states, kill / dead info.
+  if (LV) {
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        unsigned Reg = MO.getReg();
+
+        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+        if (MO.isDef()) {
+          MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+          if (MO.isDead())
+            LV->addVirtualRegisterDead(Reg, *NewMI);
+        }
+        if (MO.isUse() && MO.isKill()) {
+          for (unsigned j = 0; j < 2; ++j) {
+            // Look at the two new MI's in reverse order.
+            MachineInstr *NewMI = NewMIs[j];
+            if (!NewMI->readsRegister(Reg))
+              continue;
+            LV->addVirtualRegisterKilled(Reg, *NewMI);
+            if (VI.removeKill(MI))
+              VI.Kills.push_back(NewMI);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false; // Empty blocks are easy.
+  --I;
+
+  // Walk backwards from the end of the basic block until the branch is
+  // analyzed or we give up.
+  while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
+
+    // Flag to be raised on unanalyzeable instructions. This is useful in cases
+    // where we want to clean up on the end of the basic block before we bail
+    // out.
+    bool CantAnalyze = false;
+
+    // Skip over DEBUG values and predicated nonterminators.
+    while (I->isDebugValue() || !I->isTerminator()) {
+      if (I == MBB.begin())
+        return false;
+      --I;
+    }
+
+    if (isIndirectBranchOpcode(I->getOpcode()) ||
+        isJumpTableBranchOpcode(I->getOpcode())) {
+      // Indirect branches and jump tables can't be analyzed, but we still want
+      // to clean up any instructions at the tail of the basic block.
+      CantAnalyze = true;
+    } else if (isUncondBranchOpcode(I->getOpcode())) {
+      TBB = I->getOperand(0).getMBB();
+    } else if (isCondBranchOpcode(I->getOpcode())) {
+      // Bail out if we encounter multiple conditional branches.
+      if (!Cond.empty())
+        return true;
+
+      assert(!FBB && "FBB should have been null.");
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(I->getOperand(1));
+      Cond.push_back(I->getOperand(2));
+    } else if (I->isReturn()) {
+      // Returns can't be analyzed, but we should run cleanup.
+      CantAnalyze = !isPredicated(*I);
+    } else {
+      // We encountered other unrecognized terminator. Bail out immediately.
+      return true;
+    }
+
+    // Cleanup code - to be run for unpredicated unconditional branches and
+    //                returns.
+    if (!isPredicated(*I) &&
+          (isUncondBranchOpcode(I->getOpcode()) ||
+           isIndirectBranchOpcode(I->getOpcode()) ||
+           isJumpTableBranchOpcode(I->getOpcode()) ||
+           I->isReturn())) {
+      // Forget any previous condition branch information - it no longer applies.
+      Cond.clear();
+      FBB = nullptr;
+
+      // If we can modify the function, delete everything below this
+      // unconditional branch.
+      if (AllowModify) {
+        MachineBasicBlock::iterator DI = std::next(I);
+        while (DI != MBB.end()) {
+          MachineInstr &InstToDelete = *DI;
+          ++DI;
+          InstToDelete.eraseFromParent();
+        }
+      }
+    }
+
+    if (CantAnalyze)
+      return true;
+
+    if (I == MBB.begin())
+      return false;
+
+    --I;
+  }
+
+  // We made it past the terminators without bailing out - we must have
+  // analyzed this branch successfully.
+  return false;
+}
+
+
+unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return 0;
+
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+  ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+  int BOpc   = !AFI->isThumbFunction()
+    ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+  int BccOpc = !AFI->isThumbFunction()
+    ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+  bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+
+  // For conditional branches, we use addOperand to preserve CPSR flags.
+
+  if (!FBB) {
+    if (Cond.empty()) { // Unconditional branch?
+      if (isThumb)
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+      else
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else
+      BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+        .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+    .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+  if (isThumb)
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+  return 2;
+}
+
+bool ARMBaseInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+  return false;
+}
+
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
+  if (MI.isBundle()) {
+    MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+    MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      int PIdx = I->findFirstPredOperandIdx();
+      if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
+        return true;
+    }
+    return false;
+  }
+
+  int PIdx = MI.findFirstPredOperandIdx();
+  return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
+}
+
+bool ARMBaseInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+  unsigned Opc = MI.getOpcode();
+  if (isUncondBranchOpcode(Opc)) {
+    MI.setDesc(get(getMatchingCondBranchOpcode(Opc)));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI.getOperand(PIdx);
+    PMO.setImm(Pred[0].getImm());
+    MI.getOperand(PIdx+1).setReg(Pred[1].getReg());
+    return true;
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                         ArrayRef<MachineOperand> Pred2) const {
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+  if (CC1 == CC2)
+    return true;
+
+  switch (CC1) {
+  default:
+    return false;
+  case ARMCC::AL:
+    return true;
+  case ARMCC::HS:
+    return CC2 == ARMCC::HI;
+  case ARMCC::LS:
+    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+  case ARMCC::GE:
+    return CC2 == ARMCC::GT;
+  case ARMCC::LE:
+    return CC2 == ARMCC::LT;
+  }
+}
+
+bool ARMBaseInstrInfo::DefinesPredicate(
+    MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
+  bool Found = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
+        (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
+      Pred.push_back(MO);
+      Found = true;
+    }
+  }
+
+  return Found;
+}
+
+static bool isCPSRDefined(const MachineInstr *MI) {
+  for (const auto &MO : MI->operands())
+    if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead())
+      return true;
+  return false;
+}
+
+static bool isEligibleForITBlock(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return true;
+  case ARM::tADC:   // ADC (register) T1
+  case ARM::tADDi3: // ADD (immediate) T1
+  case ARM::tADDi8: // ADD (immediate) T2
+  case ARM::tADDrr: // ADD (register) T1
+  case ARM::tAND:   // AND (register) T1
+  case ARM::tASRri: // ASR (immediate) T1
+  case ARM::tASRrr: // ASR (register) T1
+  case ARM::tBIC:   // BIC (register) T1
+  case ARM::tEOR:   // EOR (register) T1
+  case ARM::tLSLri: // LSL (immediate) T1
+  case ARM::tLSLrr: // LSL (register) T1
+  case ARM::tLSRri: // LSR (immediate) T1
+  case ARM::tLSRrr: // LSR (register) T1
+  case ARM::tMUL:   // MUL T1
+  case ARM::tMVN:   // MVN (register) T1
+  case ARM::tORR:   // ORR (register) T1
+  case ARM::tROR:   // ROR (register) T1
+  case ARM::tRSB:   // RSB (immediate) T1
+  case ARM::tSBC:   // SBC (register) T1
+  case ARM::tSUBi3: // SUB (immediate) T1
+  case ARM::tSUBi8: // SUB (immediate) T2
+  case ARM::tSUBrr: // SUB (register) T1
+    return !isCPSRDefined(MI);
+  }
+}
+
+/// isPredicable - Return true if the specified instruction can be predicated.
+/// By default, this returns true for every instruction with a
+/// PredicateOperand.
+bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+  if (!MI.isPredicable())
+    return false;
+
+  if (MI.isBundle())
+    return false;
+
+  if (!isEligibleForITBlock(&MI))
+    return false;
+
+  ARMFunctionInfo *AFI =
+      MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
+
+  if (AFI->isThumb2Function()) {
+    if (getSubtarget().restrictIT())
+      return isV8EligibleForIT(&MI);
+  } else { // non-Thumb
+    if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+      return false;
+  }
+
+  return true;
+}
+
+namespace llvm {
+template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    if (!MO.isDead())
+      return false;
+  }
+  // all definitions of CPSR are dead
+  return true;
+}
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  const MCInstrDesc &MCID = MI.getDesc();
+  if (MCID.getSize())
+    return MCID.getSize();
+
+  // If this machine instr is an inline asm, measure it.
+  if (MI.getOpcode() == ARM::INLINEASM)
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    // pseudo-instruction sizes are zero.
+    return 0;
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel:
+    return 4;
+  case ARM::MOVi32imm:
+  case ARM::t2MOVi32imm:
+    return 8;
+  case ARM::CONSTPOOL_ENTRY:
+  case ARM::JUMPTABLE_INSTS:
+  case ARM::JUMPTABLE_ADDRS:
+  case ARM::JUMPTABLE_TBB:
+  case ARM::JUMPTABLE_TBH:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI.getOperand(2).getImm();
+  case ARM::Int_eh_sjlj_longjmp:
+    return 16;
+  case ARM::tInt_eh_sjlj_longjmp:
+    return 10;
+  case ARM::tInt_WIN_eh_sjlj_longjmp:
+    return 12;
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+    return 20;
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    return 12;
+  case ARM::SPACE:
+    return MI.getOperand(1).getImm();
+  }
+}
+
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += getInstSizeInBytes(*I);
+  }
+  return Size;
+}
+
+void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg, bool KillSrc,
+                                    const ARMSubtarget &Subtarget) const {
+  unsigned Opc = Subtarget.isThumb()
+                     ? (Subtarget.isMClass() ? ARM::t2MRS_M : ARM::t2MRS_AR)
+                     : ARM::MRS;
+
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, I, I->getDebugLoc(), get(Opc), DestReg);
+
+  // There is only 1 A/R class MRS instruction, and it always refers to
+  // APSR. However, there are lots of other possibilities on M-class cores.
+  if (Subtarget.isMClass())
+    MIB.addImm(0x800);
+
+  AddDefaultPred(MIB);
+
+  MIB.addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
+}
+
+void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned SrcReg, bool KillSrc,
+                                  const ARMSubtarget &Subtarget) const {
+  unsigned Opc = Subtarget.isThumb()
+                     ? (Subtarget.isMClass() ? ARM::t2MSR_M : ARM::t2MSR_AR)
+                     : ARM::MSR;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+
+  if (Subtarget.isMClass())
+    MIB.addImm(0x800);
+  else
+    MIB.addImm(8);
+
+  MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+  AddDefaultPred(MIB);
+
+  MIB.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
+}
+
+void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  bool GPRDest = ARM::GPRRegClass.contains(DestReg);
+  bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
+
+  if (GPRDest && GPRSrc) {
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+                                    .addReg(SrcReg, getKillRegState(KillSrc))));
+    return;
+  }
+
+  bool SPRDest = ARM::SPRRegClass.contains(DestReg);
+  bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
+
+  unsigned Opc = 0;
+  if (SPRDest && SPRSrc)
+    Opc = ARM::VMOVS;
+  else if (GPRDest && SPRSrc)
+    Opc = ARM::VMOVRS;
+  else if (SPRDest && GPRSrc)
+    Opc = ARM::VMOVSR;
+  else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && !Subtarget.isFPOnlySP())
+    Opc = ARM::VMOVD;
+  else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq;
+
+  if (Opc) {
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    if (Opc == ARM::VORRq)
+      MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  // Handle register classes that require multiple instructions.
+  unsigned BeginIdx = 0;
+  unsigned SubRegs = 0;
+  int Spacing = 1;
+
+  // Use VORRq when possible.
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 2;
+  } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 4;
+  // Fall back to VMOVD.
+  } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+  } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+  } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+  } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+    BeginIdx = ARM::gsub_0;
+    SubRegs = 2;
+  } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+    Spacing = 2;
+  } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+    Spacing = 2;
+  } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+    Spacing = 2;
+  } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.isFPOnlySP()) {
+    Opc = ARM::VMOVS;
+    BeginIdx = ARM::ssub_0;
+    SubRegs = 2;
+  } else if (SrcReg == ARM::CPSR) {
+    copyFromCPSR(MBB, I, DestReg, KillSrc, Subtarget);
+    return;
+  } else if (DestReg == ARM::CPSR) {
+    copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget);
+    return;
+  }
+
+  assert(Opc && "Impossible reg-to-reg copy");
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstrBuilder Mov;
+
+  // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
+    Spacing = -Spacing;
+  }
+#ifndef NDEBUG
+  SmallSet<unsigned, 4> DstRegs;
+#endif
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+    assert(!DstRegs.count(Src) && "destructive vector copy");
+    DstRegs.insert(Dst);
+#endif
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
+    // VORR takes two source operands.
+    if (Opc == ARM::VORRq)
+      Mov.addReg(Src);
+    Mov = AddDefaultPred(Mov);
+    // MOVr can set CC.
+    if (Opc == ARM::MOVr)
+      Mov = AddDefaultCC(Mov);
+  }
+  // Add implicit super-register defs and kills to the last instruction.
+  Mov->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    Mov->addRegisterKilled(SrcReg, TRI);
+}
+
+const MachineInstrBuilder &
+ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                          unsigned SubIdx, unsigned State,
+                          const TargetRegisterInfo *TRI) const {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
+
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), Align);
+
+  switch (RC->getSize()) {
+    case 4:
+      if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 8:
+      if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+        if (Subtarget.hasV5TEOps()) {
+          MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+          MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+          AddDefaultPred(MIB);
+        } else {
+          // Fallback to STM instruction, which has existed since the dawn of
+          // time.
+          MachineInstrBuilder MIB =
+            AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
+                             .addFrameIndex(FI).addMemOperand(MMO));
+          AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+          AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 16:
+      if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addFrameIndex(FI)
+                     .addMemOperand(MMO));
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 24:
+      if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 32:
+      if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          // FIXME: It's possible to only store part of the QQ register if the
+          // spilled def has a sub-register index.
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+                AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 64:
+      if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                         .addFrameIndex(FI))
+                         .addMemOperand(MMO);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+              AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    default:
+      llvm_unreachable("Unknown reg class!");
+  }
+}
+
+unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case ARM::STRrs:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+        MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+        MI.getOperand(3).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  case ARM::STRi12:
+  case ARM::t2STRi12:
+  case ARM::tSTRspi:
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  case ARM::VST1q64:
+  case ARM::VST1d64TPseudo:
+  case ARM::VST1d64QPseudo:
+    if (MI.getOperand(0).isFI() && MI.getOperand(2).getSubReg() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return MI.getOperand(2).getReg();
+    }
+    break;
+  case ARM::VSTMQIA:
+    if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                                    int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI.mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+}
+
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), Align);
+
+  switch (RC->getSize()) {
+  case 4:
+    if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+
+    } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 8:
+    if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB;
+
+      if (Subtarget.hasV5TEOps()) {
+        MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
+        AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+        AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+        MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
+
+        AddDefaultPred(MIB);
+      } else {
+        // Fallback to LDM instruction, which has existed since the dawn of
+        // time.
+        MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA))
+                                 .addFrameIndex(FI).addMemOperand(MMO));
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+      }
+
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 16:
+    if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+                       .addFrameIndex(FI)
+                       .addMemOperand(MMO));
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 24:
+    if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                         .addFrameIndex(FI)
+                         .addMemOperand(MMO));
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+   case 32:
+    if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 64:
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                     .addFrameIndex(FI))
+                     .addMemOperand(MMO);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  default:
+    llvm_unreachable("Unknown regclass!");
+  }
+}
+
+unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case ARM::LDRrs:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isReg() &&
+        MI.getOperand(3).isImm() && MI.getOperand(2).getReg() == 0 &&
+        MI.getOperand(3).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tLDRspi:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLD1q64:
+  case ARM::VLD1d64TPseudo:
+  case ARM::VLD1d64QPseudo:
+    if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDMQIA:
+    if (MI.getOperand(1).isFI() && MI.getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                     int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI.mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMID_UPD
+/// depending on whether the result is used.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
+  bool isThumb1 = Subtarget.isThumb1Only();
+  bool isThumb2 = Subtarget.isThumb2();
+  const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+  DebugLoc dl = MI->getDebugLoc();
+  MachineBasicBlock *BB = MI->getParent();
+
+  MachineInstrBuilder LDM, STM;
+  if (isThumb1 || !MI->getOperand(1).isDead()) {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+                                                 : isThumb1 ? ARM::tLDMIA_UPD
+                                                            : ARM::LDMIA_UPD))
+             .addOperand(MI->getOperand(1));
+  } else {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+  }
+
+  if (isThumb1 || !MI->getOperand(0).isDead()) {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+                                                 : isThumb1 ? ARM::tSTMIA_UPD
+                                                            : ARM::STMIA_UPD))
+             .addOperand(MI->getOperand(0));
+  } else {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+  }
+
+  AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
+  AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+
+  // Sort the scratch registers into ascending order.
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+  llvm::SmallVector<unsigned, 6> ScratchRegs;
+  for(unsigned I = 5; I < MI->getNumOperands(); ++I)
+    ScratchRegs.push_back(MI->getOperand(I).getReg());
+  std::sort(ScratchRegs.begin(), ScratchRegs.end(),
+            [&TRI](const unsigned &Reg1,
+                   const unsigned &Reg2) -> bool {
+              return TRI.getEncodingValue(Reg1) <
+                     TRI.getEncodingValue(Reg2);
+            });
+
+  for (const auto &Reg : ScratchRegs) {
+    LDM.addReg(Reg, RegState::Define);
+    STM.addReg(Reg, RegState::Kill);
+  }
+
+  BB->erase(MI);
+}
+
+
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+    assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
+           "LOAD_STACK_GUARD currently supported only for MachO.");
+    expandLoadStackGuard(MI);
+    MI.getParent()->erase(MI);
+    return true;
+  }
+
+  if (MI.getOpcode() == ARM::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
+  // This hook gets to expand COPY instructions before they become
+  // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
+  // widened to VMOVD.  We prefer the VMOVD when possible because it may be
+  // changed into a VORR that can go down the NEON pipeline.
+  if (!MI.isCopy() || Subtarget.dontWidenVMOVS() || Subtarget.isFPOnlySP())
+    return false;
+
+  // Look for a copy between even S-registers.  That is where we keep floats
+  // when using NEON v2f32 instructions for f32 arithmetic.
+  unsigned DstRegS = MI.getOperand(0).getReg();
+  unsigned SrcRegS = MI.getOperand(1).getReg();
+  if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+    return false;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  if (!DstRegD || !SrcRegD)
+    return false;
+
+  // We want to widen this into a DstRegD = VMOVD SrcRegD copy.  This is only
+  // legal if the COPY already defines the full DstRegD, and it isn't a
+  // sub-register insertion.
+  if (!MI.definesRegister(DstRegD, TRI) || MI.readsRegister(DstRegD, TRI))
+    return false;
+
+  // A dead copy shouldn't show up here, but reject it just in case.
+  if (MI.getOperand(0).isDead())
+    return false;
+
+  // All clear, widen the COPY.
+  DEBUG(dbgs() << "widening:    " << MI);
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+
+  // Get rid of the old <imp-def> of DstRegD.  Leave it if it defines a Q-reg
+  // or some other super-register.
+  int ImpDefIdx = MI.findRegisterDefOperandIdx(DstRegD);
+  if (ImpDefIdx != -1)
+    MI.RemoveOperand(ImpDefIdx);
+
+  // Change the opcode and operands.
+  MI.setDesc(get(ARM::VMOVD));
+  MI.getOperand(0).setReg(DstRegD);
+  MI.getOperand(1).setReg(SrcRegD);
+  AddDefaultPred(MIB);
+
+  // We are now reading SrcRegD instead of SrcRegS.  This may upset the
+  // register scavenger and machine verifier, so we need to indicate that we
+  // are reading an undefined value from SrcRegD, but a proper value from
+  // SrcRegS.
+  MI.getOperand(1).setIsUndef();
+  MIB.addReg(SrcRegS, RegState::Implicit);
+
+  // SrcRegD may actually contain an unrelated value in the ssub_1
+  // sub-register.  Don't kill it.  Only kill the ssub_0 sub-register.
+  if (MI.getOperand(1).isKill()) {
+    MI.getOperand(1).setIsKill(false);
+    MI.addRegisterKilled(SrcRegS, TRI, true);
+  }
+
+  DEBUG(dbgs() << "replaced by: " << MI);
+  return true;
+}
+
+/// Create a copy of a const pool value. Update CPI to the new index and return
+/// the label UID.
+static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
+  MachineConstantPool *MCP = MF.getConstantPool();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+  assert(MCPE.isMachineConstantPoolEntry() &&
+         "Expecting a machine constantpool entry!");
+  ARMConstantPoolValue *ACPV =
+    static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  ARMConstantPoolValue *NewCPV = nullptr;
+
+  // FIXME: The below assumes PIC relocation model and that the function
+  // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
+  // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR
+  // instructions, so that's probably OK, but is PIC always correct when
+  // we get here?
+  if (ACPV->isGlobalValue())
+    NewCPV = ARMConstantPoolConstant::Create(
+        cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue,
+        4, ACPV->getModifier(), ACPV->mustAddCurrentAddress());
+  else if (ACPV->isExtSymbol())
+    NewCPV = ARMConstantPoolSymbol::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4);
+  else if (ACPV->isBlockAddress())
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId,
+             ARMCP::CPBlockAddress, 4);
+  else if (ACPV->isLSDA())
+    NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId,
+                                             ARMCP::CPLSDA, 4);
+  else if (ACPV->isMachineBasicBlock())
+    NewCPV = ARMConstantPoolMBB::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
+  else
+    llvm_unreachable("Unexpected ARM constantpool value type!!");
+  CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+  return PCLabelId;
+}
+
+void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned DestReg, unsigned SubIdx,
+                                     const MachineInstr &Orig,
+                                     const TargetRegisterInfo &TRI) const {
+  unsigned Opcode = Orig.getOpcode();
+  switch (Opcode) {
+  default: {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+    MI->substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
+    MBB.insert(I, MI);
+    break;
+  }
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned CPI = Orig.getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, I, Orig.getDebugLoc(), get(Opcode), DestReg)
+            .addConstantPoolIndex(CPI)
+            .addImm(PCLabelId);
+    MIB->setMemRefs(Orig.memoperands_begin(), Orig.memoperands_end());
+    break;
+  }
+  }
+}
+
+MachineInstr *ARMBaseInstrInfo::duplicate(MachineInstr &Orig,
+                                          MachineFunction &MF) const {
+  MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF);
+  switch (Orig.getOpcode()) {
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    unsigned CPI = Orig.getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    Orig.getOperand(1).setIndex(CPI);
+    Orig.getOperand(2).setImm(PCLabelId);
+    break;
+  }
+  }
+  return MI;
+}
+
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr &MI0,
+                                        const MachineInstr &MI1,
+                                        const MachineRegisterInfo *MRI) const {
+  unsigned Opcode = MI0.getOpcode();
+  if (Opcode == ARM::t2LDRpci ||
+      Opcode == ARM::t2LDRpci_pic ||
+      Opcode == ARM::tLDRpci ||
+      Opcode == ARM::tLDRpci_pic ||
+      Opcode == ARM::LDRLIT_ga_pcrel ||
+      Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+      Opcode == ARM::tLDRLIT_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel_ldr ||
+      Opcode == ARM::t2MOV_ga_pcrel) {
+    if (MI1.getOpcode() != Opcode)
+      return false;
+    if (MI0.getNumOperands() != MI1.getNumOperands())
+      return false;
+
+    const MachineOperand &MO0 = MI0.getOperand(1);
+    const MachineOperand &MO1 = MI1.getOperand(1);
+    if (MO0.getOffset() != MO1.getOffset())
+      return false;
+
+    if (Opcode == ARM::LDRLIT_ga_pcrel ||
+        Opcode == ARM::LDRLIT_ga_pcrel_ldr ||
+        Opcode == ARM::tLDRLIT_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel_ldr ||
+        Opcode == ARM::t2MOV_ga_pcrel)
+      // Ignore the PC labels.
+      return MO0.getGlobal() == MO1.getGlobal();
+
+    const MachineFunction *MF = MI0.getParent()->getParent();
+    const MachineConstantPool *MCP = MF->getConstantPool();
+    int CPI0 = MO0.getIndex();
+    int CPI1 = MO1.getIndex();
+    const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0];
+    const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1];
+    bool isARMCP0 = MCPE0.isMachineConstantPoolEntry();
+    bool isARMCP1 = MCPE1.isMachineConstantPoolEntry();
+    if (isARMCP0 && isARMCP1) {
+      ARMConstantPoolValue *ACPV0 =
+        static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal);
+      ARMConstantPoolValue *ACPV1 =
+        static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
+      return ACPV0->hasSameValue(ACPV1);
+    } else if (!isARMCP0 && !isARMCP1) {
+      return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal;
+    }
+    return false;
+  } else if (Opcode == ARM::PICLDR) {
+    if (MI1.getOpcode() != Opcode)
+      return false;
+    if (MI0.getNumOperands() != MI1.getNumOperands())
+      return false;
+
+    unsigned Addr0 = MI0.getOperand(1).getReg();
+    unsigned Addr1 = MI1.getOperand(1).getReg();
+    if (Addr0 != Addr1) {
+      if (!MRI ||
+          !TargetRegisterInfo::isVirtualRegister(Addr0) ||
+          !TargetRegisterInfo::isVirtualRegister(Addr1))
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Addr0);
+      MachineInstr *Def1 = MRI->getVRegDef(Addr1);
+      // Check if the loaded value, e.g. a constantpool of a global address, are
+      // the same.
+      if (!produceSameValue(*Def0, *Def1, MRI))
+        return false;
+    }
+
+    for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
+      // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+      const MachineOperand &MO0 = MI0.getOperand(i);
+      const MachineOperand &MO1 = MI1.getOperand(i);
+      if (!MO0.isIdenticalTo(MO1))
+        return false;
+    }
+    return true;
+  }
+
+  return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+/// determine if two loads are loading from the same base address. It should
+/// only return true if the base pointers are the same and the only differences
+/// between the two addresses is the offset. It also returns the offsets by
+/// reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                               int64_t &Offset1,
+                                               int64_t &Offset2) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  switch (Load1->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  switch (Load2->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  // Check if base addresses and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+
+  // Index should be Reg0.
+  if (Load1->getOperand(3) != Load2->getOperand(3))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+/// be scheduled togther. On some targets if two loads are loading from
+/// addresses in the same cache line, it's better if they are scheduled
+/// together. This function takes two integers that represent the load offsets
+/// from the common base address. It returns true if it decides it's desirable
+/// to schedule the two loads together. "NumLoads" is the number of loads that
+/// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                               int64_t Offset1, int64_t Offset2,
+                                               unsigned NumLoads) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  assert(Offset2 > Offset1);
+
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  // Check if the machine opcodes are different. If they are different
+  // then we consider them to not be of the same base address,
+  // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and the other LDRBi12.
+  // In this case, they are considered to be the same because they are different
+  // encoding forms of the same basic instruction.
+  if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+      !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+        (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi8)))
+    return false;  // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI.isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI.isTerminator() || MI.isPosition())
+    return true;
+
+  // Treat the start of the IT block as a scheduling boundary, but schedule
+  // t2IT along with all instructions following it.
+  // FIXME: This is a big hammer. But the alternative is to add all potential
+  // true and anti dependencies to IT block instructions as implicit operands
+  // to the t2IT instruction. The added compile time and complexity does not
+  // seem worth it.
+  MachineBasicBlock::const_iterator I = MI;
+  // Make sure to skip any dbg_value instructions
+  while (++I != MBB->end() && I->isDebugValue())
+    ;
+  if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  // Calls don't actually change the stack pointer, even if they have imp-defs.
+  // No ARM calling conventions change the stack pointer. (X86 calling
+  // conventions sometimes do).
+  if (!MI.isCall() && MI.definesRegister(ARM::SP))
+    return true;
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    BranchProbability Probability) const {
+  if (!NumCycles)
+    return false;
+
+  // If we are optimizing for size, see if the branch in the predecessor can be
+  // lowered to cbn?z by the constant island lowering pass, and return false if
+  // so. This results in a shorter instruction sequence.
+  if (MBB.getParent()->getFunction()->optForSize()) {
+    MachineBasicBlock *Pred = *MBB.pred_begin();
+    if (!Pred->empty()) {
+      MachineInstr *LastMI = &*Pred->rbegin();
+      if (LastMI->getOpcode() == ARM::t2Bcc) {
+        MachineBasicBlock::iterator CmpMI = LastMI;
+        if (CmpMI != Pred->begin()) {
+          --CmpMI;
+          if (CmpMI->getOpcode() == ARM::tCMPi8 ||
+              CmpMI->getOpcode() == ARM::t2CMPri) {
+            unsigned Reg = CmpMI->getOperand(0).getReg();
+            unsigned PredReg = 0;
+            ARMCC::CondCodes P = getInstrPredicate(*CmpMI, PredReg);
+            if (P == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 &&
+                isARMLowRegister(Reg))
+              return false;
+          }
+        }
+      }
+    }
+  }
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  // Here we scale up each component of UnpredCost to avoid precision issue when
+  // scaling NumCycles by Probability.
+  const unsigned ScalingUpFactor = 1024;
+  unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
+  UnpredCost += ScalingUpFactor; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+  return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
+                    BranchProbability Probability) const {
+  if (!TCycles || !FCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  // Here we scale up each component of UnpredCost to avoid precision issue when
+  // scaling TCycles/FCycles by Probability.
+  const unsigned ScalingUpFactor = 1024;
+  unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+  unsigned FUnpredCost =
+      Probability.getCompl().scale(FCycles * ScalingUpFactor);
+  unsigned UnpredCost = TUnpredCost + FUnpredCost;
+  UnpredCost += 1 * ScalingUpFactor; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+
+  return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+}
+
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                            MachineBasicBlock &FMBB) const {
+  // Reduce false anti-dependencies to let the target's out-of-order execution
+  // engine do its thing.
+  return Subtarget.isProfitableToUnpredicate();
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
+                                         unsigned &PredReg) {
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  PredReg = MI.getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+}
+
+
+unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
+  if (Opc == ARM::B)
+    return ARM::Bcc;
+  if (Opc == ARM::tB)
+    return ARM::tBcc;
+  if (Opc == ARM::t2B)
+    return ARM::t2Bcc;
+
+  llvm_unreachable("Unknown unconditional branch opcode!");
+}
+
+MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                       bool NewMI,
+                                                       unsigned OpIdx1,
+                                                       unsigned OpIdx2) const {
+  switch (MI.getOpcode()) {
+  case ARM::MOVCCr:
+  case ARM::t2MOVCCr: {
+    // MOVCC can be commuted by inverting the condition.
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
+    // MOVCC AL can't be inverted. Shouldn't happen.
+    if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+      return nullptr;
+    MachineInstr *CommutedMI =
+        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    if (!CommutedMI)
+      return nullptr;
+    // After swapping the MOVCC operands, also invert the condition.
+    CommutedMI->getOperand(CommutedMI->findFirstPredOperandIdx())
+        .setImm(ARMCC::getOppositeCondition(CC));
+    return CommutedMI;
+  }
+  }
+  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+/// Identify instructions that can be folded into a MOVCC instruction, and
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      const TargetInstrInfo *TII) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  if (!MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (!MI)
+    return nullptr;
+  // MI is folded into the MOVCC by predicating it.
+  if (!MI->isPredicable())
+    return nullptr;
+  // Check if MI has any non-dead defs or physreg uses. This also detects
+  // predicated instructions which will be reading CPSR.
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    // Reject frame index operands, PEI can't handle the predicated pseudos.
+    if (MO.isFI() || MO.isCPI() || MO.isJTI())
+      return nullptr;
+    if (!MO.isReg())
+      continue;
+    // MI can't have any tied operands, that would conflict with predication.
+    if (MO.isTied())
+      return nullptr;
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return nullptr;
+    if (MO.isDef() && !MO.isDead())
+      return nullptr;
+  }
+  bool DontMoveAcrossStores = true;
+  if (!MI->isSafeToMove(/* AliasAnalysis = */ nullptr, DontMoveAcrossStores))
+    return nullptr;
+  return MI;
+}
+
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr &MI,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     unsigned &TrueOp, unsigned &FalseOp,
+                                     bool &Optimizable) const {
+  assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  // MOVCC operands:
+  // 0: Def.
+  // 1: True use.
+  // 2: False use.
+  // 3: Condition code.
+  // 4: CPSR use.
+  TrueOp = 1;
+  FalseOp = 2;
+  Cond.push_back(MI.getOperand(3));
+  Cond.push_back(MI.getOperand(4));
+  // We can always fold a def.
+  Optimizable = true;
+  return false;
+}
+
+MachineInstr *
+ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
+                                 SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                                 bool PreferFalse) const {
+  assert((MI.getOpcode() == ARM::MOVCCr || MI.getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = canFoldIntoMOVCC(MI.getOperand(2).getReg(), MRI, this);
+  bool Invert = !DefMI;
+  if (!DefMI)
+    DefMI = canFoldIntoMOVCC(MI.getOperand(1).getReg(), MRI, this);
+  if (!DefMI)
+    return nullptr;
+
+  // Find new register class to use.
+  MachineOperand FalseReg = MI.getOperand(Invert ? 2 : 1);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return nullptr;
+
+  // Create a new predicated version of DefMI.
+  // Rfalse is the first use.
+  MachineInstrBuilder NewMI =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
+
+  // Copy all the DefMI operands, excluding its (null) predicate.
+  const MCInstrDesc &DefDesc = DefMI->getDesc();
+  for (unsigned i = 1, e = DefDesc.getNumOperands();
+       i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+    NewMI.addOperand(DefMI->getOperand(i));
+
+  unsigned CondCode = MI.getOperand(3).getImm();
+  if (Invert)
+    NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
+  else
+    NewMI.addImm(CondCode);
+  NewMI.addOperand(MI.getOperand(4));
+
+  // DefMI is not the -S version that sets CPSR, so add an optional %noreg.
+  if (NewMI->hasOptionalDef())
+    AddDefaultCC(NewMI);
+
+  // The output register value when the predicate is false is an implicit
+  // register operand tied to the first def.
+  // The tie makes the register allocator ensure the FalseReg is allocated the
+  // same register as operand 0.
+  FalseReg.setImplicit();
+  NewMI.addOperand(FalseReg);
+  NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+  // Update SeenMIs set: register newly created MI and erase removed DefMI.
+  SeenMIs.insert(NewMI);
+  SeenMIs.erase(DefMI);
+
+  // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+  // DefMI would be invalid when tranferred inside the loop.  Checking for a
+  // loop is expensive, but at least remove kill flags if they are in different
+  // BBs.
+  if (DefMI->getParent() != MI.getParent())
+    NewMI->clearKillInfo();
+
+  // The caller will erase MI, but not DefMI.
+  DefMI->eraseFromParent();
+  return NewMI;
+}
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+  uint16_t PseudoOpc;
+  uint16_t MachineOpc;
+};
+
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADDSri, ARM::ADDri},
+  {ARM::ADDSrr, ARM::ADDrr},
+  {ARM::ADDSrsi, ARM::ADDrsi},
+  {ARM::ADDSrsr, ARM::ADDrsr},
+
+  {ARM::SUBSri, ARM::SUBri},
+  {ARM::SUBSrr, ARM::SUBrr},
+  {ARM::SUBSrsi, ARM::SUBrsi},
+  {ARM::SUBSrsr, ARM::SUBrsr},
+
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrsi, ARM::RSBrsi},
+  {ARM::RSBSrsr, ARM::RSBrsr},
+
+  {ARM::t2ADDSri, ARM::t2ADDri},
+  {ARM::t2ADDSrr, ARM::t2ADDrr},
+  {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+  {ARM::t2SUBSri, ARM::t2SUBri},
+  {ARM::t2SUBSrr, ARM::t2SUBrr},
+  {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+  for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i)
+    if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc)
+      return AddSubFlagsOpcodeMap[i].MachineOpc;
+  return 0;
+}
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator &MBBI,
+                                   const DebugLoc &dl, unsigned DestReg,
+                                   unsigned BaseReg, int NumBytes,
+                                   ARMCC::CondCodes Pred, unsigned PredReg,
+                                   const ARMBaseInstrInfo &TII,
+                                   unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    return;
+  }
+
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+
+    assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+    // Build the new ADD / SUB.
+    unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    BaseReg = DestReg;
+  }
+}
+
+bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+                                      MachineFunction &MF, MachineInstr *MI,
+                                      unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, it's only really a great benefit to code-size.
+  if (!MF.getFunction()->optForMinSize())
+    return false;
+
+  // If only one register is pushed/popped, LLVM can use an LDR/STR
+  // instead. We can't modify those so make sure we're dealing with an
+  // instruction we understand.
+  bool IsPop = isPopOpcode(MI->getOpcode());
+  bool IsPush = isPushOpcode(MI->getOpcode());
+  if (!IsPush && !IsPop)
+    return false;
+
+  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+                      MI->getOpcode() == ARM::VLDMDIA_UPD;
+  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+                     MI->getOpcode() == ARM::tPOP ||
+                     MI->getOpcode() == ARM::tPOP_RET;
+
+  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+                          MI->getOperand(1).getReg() == ARM::SP)) &&
+         "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only fold an adjustment
+  // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
+  // if this is violated.
+  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+    return false;
+
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4. Thumb1 starts after the predicate.
+  int RegListIdx = IsT1PushPop ? 2 : 4;
+
+  // Calculate the space we'll need in terms of registers.
+  unsigned RegsNeeded;
+  const TargetRegisterClass *RegClass;
+  if (IsVFPPushPop) {
+    RegsNeeded = NumBytes / 8;
+    RegClass = &ARM::DPRRegClass;
+  } else {
+    RegsNeeded = NumBytes / 4;
+    RegClass = &ARM::GPRRegClass;
+  }
+
+  // We're going to have to strip all list operands off before
+  // re-adding them since the order matters, so save the existing ones
+  // for later.
+  SmallVector<MachineOperand, 4> RegList;
+
+  // We're also going to need the first register transferred by this
+  // instruction, which won't necessarily be the first register in the list.
+  unsigned FirstRegEnc = -1;
+
+  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) {
+    MachineOperand &MO = MI->getOperand(i);
+    RegList.push_back(MO);
+
+    if (MO.isReg() && TRI->getEncodingValue(MO.getReg()) < FirstRegEnc)
+      FirstRegEnc = TRI->getEncodingValue(MO.getReg());
+  }
+
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+  // Now try to find enough space in the reglist to allocate NumBytes.
+  for (int CurRegEnc = FirstRegEnc - 1; CurRegEnc >= 0 && RegsNeeded;
+       --CurRegEnc) {
+    unsigned CurReg = RegClass->getRegister(CurRegEnc);
+    if (!IsPop) {
+      // Pushing any register is completely harmless, mark the
+      // register involved as undef since we don't care about it in
+      // the slightest.
+      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+                                                  false, false, true));
+      --RegsNeeded;
+      continue;
+    }
+
+    // However, we can only pop an extra register if it's not live. For
+    // registers live within the function we might clobber a return value
+    // register; the other way a register can be live here is if it's
+    // callee-saved.
+    if (isCalleeSavedRegister(CurReg, CSRegs) ||
+        MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) !=
+        MachineBasicBlock::LQR_Dead) {
+      // VFP pops don't allow holes in the register list, so any skip is fatal
+      // for our transformation. GPR pops do, so we should just keep looking.
+      if (IsVFPPushPop)
+        return false;
+      else
+        continue;
+    }
+
+    // Mark the unimportant registers as <def,dead> in the POP.
+    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+                                                true));
+    --RegsNeeded;
+  }
+
+  if (RegsNeeded > 0)
+    return false;
+
+  // Finally we know we can profitably perform the optimisation so go
+  // ahead: strip all existing registers off and add them back again
+  // in the right order.
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    MI->RemoveOperand(i);
+
+  // Add the complete list back in.
+  MachineInstrBuilder MIB(MF, &*MI);
+  for (int i = RegList.size() - 1; i >= 0; --i)
+    MIB.addOperand(RegList[i]);
+
+  return true;
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrMode2.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrMode2;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::MOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(FrameRegIdx+1);
+      Offset = 0;
+      return true;
+    } else if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getSOImmVal(Offset) != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, pull as much of the immedidate into this ADDri/SUBri
+    // as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    // Get the properly encoded SOImmVal field.
+    assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrMode_i12: {
+      ImmIdx = FrameRegIdx + 1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode2: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode4:
+    case ARMII::AddrMode6:
+      // Can't fold any offset even if it's zero.
+      return false;
+    case ARMII::AddrMode5: {
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Attempt to fold address comp. if opcode has offset bits
+    if (NumBits > 0) {
+      // Common case: small offset, fits into instruction.
+      MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+      int ImmedOffset = Offset / Scale;
+      unsigned Mask = (1 << NumBits) - 1;
+      if ((unsigned)Offset <= Mask * Scale) {
+        // Replace the FrameIndex with sp
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        // FIXME: When addrmode2 goes away, this will simplify (like the
+        // T2 version), as the LDR.i12 versions don't need the encoding
+        // tricks for the offset value.
+        if (isSub) {
+          if (AddrMode == ARMII::AddrMode_i12)
+            ImmedOffset = -ImmedOffset;
+          else
+            ImmedOffset |= 1 << NumBits;
+        }
+        ImmOp.ChangeToImmediate(ImmedOffset);
+        Offset = 0;
+        return true;
+      }
+
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode_i12)
+          ImmedOffset = -ImmedOffset;
+        else
+          ImmedOffset |= 1 << NumBits;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &CmpMask,
+                                      int &CmpValue) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case ARM::CMPri:
+  case ARM::t2CMPri:
+  case ARM::tCMPi8:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI.getOperand(1).getImm();
+    return true;
+  case ARM::CMPrr:
+  case ARM::t2CMPrr:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = MI.getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case ARM::TSTri:
+  case ARM::t2TSTri:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = MI.getOperand(1).getImm();
+    CmpValue = 0;
+    return true;
+  }
+
+  return false;
+}
+
+/// isSuitableForMask - Identify a suitable 'and' instruction that
+/// operates on the given source register and applies the same mask
+/// as a 'tst' instruction. Provide a limited look-through for copies.
+/// When successful, MI will hold the found instruction.
+static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+                              int CmpMask, bool CommonUse) {
+  switch (MI->getOpcode()) {
+    case ARM::ANDri:
+    case ARM::t2ANDri:
+      if (CmpMask != MI->getOperand(2).getImm())
+        return false;
+      if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg())
+        return true;
+      break;
+  }
+
+  return false;
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: return ARMCC::AL;
+  case ARMCC::EQ: return ARMCC::EQ;
+  case ARMCC::NE: return ARMCC::NE;
+  case ARMCC::HS: return ARMCC::LS;
+  case ARMCC::LO: return ARMCC::HI;
+  case ARMCC::HI: return ARMCC::LO;
+  case ARMCC::LS: return ARMCC::HS;
+  case ARMCC::GE: return ARMCC::LE;
+  case ARMCC::LT: return ARMCC::GT;
+  case ARMCC::GT: return ARMCC::LT;
+  case ARMCC::LE: return ARMCC::GE;
+  }
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// CMPri can be made redundant by SUBri if the operands are the same.
+/// This function can be extended later on.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if ((CmpI->getOpcode() == ARM::CMPrr ||
+       CmpI->getOpcode() == ARM::t2CMPrr) &&
+      (OI->getOpcode() == ARM::SUBrr ||
+       OI->getOpcode() == ARM::t2SUBrr) &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if ((CmpI->getOpcode() == ARM::CMPri ||
+       CmpI->getOpcode() == ARM::t2CMPri) &&
+      (OI->getOpcode() == ARM::SUBri ||
+       OI->getOpcode() == ARM::t2SUBri) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register;
+/// Remove a redundant Compare instruction if an earlier instruction can set the
+/// flags in the same way as Compare.
+/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
+/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
+/// condition code of instructions which use the flags.
+bool ARMBaseInstrInfo::optimizeCompareInstr(
+    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // Masked compares sometimes use the same register as the corresponding 'and'.
+  if (CmpMask != ~0) {
+    if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(*MI)) {
+      MI = nullptr;
+      for (MachineRegisterInfo::use_instr_iterator
+           UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
+           UI != UE; ++UI) {
+        if (UI->getParent() != CmpInstr.getParent())
+          continue;
+        MachineInstr *PotentialAND = &*UI;
+        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+            isPredicated(*PotentialAND))
+          continue;
+        MI = PotentialAND;
+        break;
+      }
+      if (!MI) return false;
+    }
+  }
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr.getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B) return false;
+
+  // There are two possible candidates which can be changed to set CPSR:
+  // One is MI, the other is a SUB instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
+  MachineInstr *Sub = nullptr;
+  if (SrcReg2 != 0)
+    // MI is not a candidate for CMPrr.
+    MI = nullptr;
+  else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
+    // Conservatively refuse to convert an instruction which isn't in the same
+    // BB as the comparison.
+    // For CMPri w/ CmpValue != 0, a Sub may still be a candidate.
+    // Thus we cannot return here.
+    if (CmpInstr.getOpcode() == ARM::CMPri ||
+        CmpInstr.getOpcode() == ARM::t2CMPri)
+      MI = nullptr;
+    else
+      return false;
+  }
+
+  // Check that CPSR isn't set between the comparison instruction and the one we
+  // want to change. At the same time, search for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  --I;
+  for (; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
+        Instr.readsRegister(ARM::CPSR, TRI))
+      // This instruction modifies or uses CPSR after the one we want to
+      // change. We can't do this transformation.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+      Sub = &*I;
+      break;
+    }
+
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI) MI = Sub;
+
+  // We can't use a predicated instruction - it doesn't always write the flags.
+  if (isPredicated(*MI))
+    return false;
+
+  bool IsThumb1 = false;
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::tLSLri:
+  case ARM::tLSRri:
+  case ARM::tLSLrr:
+  case ARM::tLSRrr:
+  case ARM::tSUBrr:
+  case ARM::tADDrr:
+  case ARM::tADDi3:
+  case ARM::tADDi8:
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+    IsThumb1 = true;
+    LLVM_FALLTHROUGH;
+  case ARM::RSBrr:
+  case ARM::RSBri:
+  case ARM::RSCrr:
+  case ARM::RSCri:
+  case ARM::ADDrr:
+  case ARM::ADDri:
+  case ARM::ADCrr:
+  case ARM::ADCri:
+  case ARM::SUBrr:
+  case ARM::SUBri:
+  case ARM::SBCrr:
+  case ARM::SBCri:
+  case ARM::t2RSBri:
+  case ARM::t2ADDrr:
+  case ARM::t2ADDri:
+  case ARM::t2ADCrr:
+  case ARM::t2ADCri:
+  case ARM::t2SUBrr:
+  case ARM::t2SUBri:
+  case ARM::t2SBCrr:
+  case ARM::t2SBCri:
+  case ARM::ANDrr:
+  case ARM::ANDri:
+  case ARM::t2ANDrr:
+  case ARM::t2ANDri:
+  case ARM::ORRrr:
+  case ARM::ORRri:
+  case ARM::t2ORRrr:
+  case ARM::t2ORRri:
+  case ARM::EORrr:
+  case ARM::EORri:
+  case ARM::t2EORrr:
+  case ARM::t2EORri:
+  case ARM::t2LSRri:
+  case ARM::t2LSRrr:
+  case ARM::t2LSLri:
+  case ARM::t2LSLrr: {
+    // Scan forward for the use of CPSR
+    // When checking against MI: if it's a conditional code that requires
+    // checking of the V bit or C bit, then this is not safe to do.
+    // It is safe to remove CmpInstr if CPSR is redefined or killed.
+    // If we are done with the basic block, we need to check whether CPSR is
+    // live-out.
+    SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+        OperandsToUpdate;
+    bool isSafe = false;
+    I = CmpInstr;
+    E = CmpInstr.getParent()->end();
+    while (!isSafe && ++I != E) {
+      const MachineInstr &Instr = *I;
+      for (unsigned IO = 0, EO = Instr.getNumOperands();
+           !isSafe && IO != EO; ++IO) {
+        const MachineOperand &MO = Instr.getOperand(IO);
+        if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+          isSafe = true;
+          break;
+        }
+        if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+          continue;
+        if (MO.isDef()) {
+          isSafe = true;
+          break;
+        }
+        // Condition code is after the operand before CPSR except for VSELs.
+        ARMCC::CondCodes CC;
+        bool IsInstrVSel = true;
+        switch (Instr.getOpcode()) {
+        default:
+          IsInstrVSel = false;
+          CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+          break;
+        case ARM::VSELEQD:
+        case ARM::VSELEQS:
+          CC = ARMCC::EQ;
+          break;
+        case ARM::VSELGTD:
+        case ARM::VSELGTS:
+          CC = ARMCC::GT;
+          break;
+        case ARM::VSELGED:
+        case ARM::VSELGES:
+          CC = ARMCC::GE;
+          break;
+        case ARM::VSELVSS:
+        case ARM::VSELVSD:
+          CC = ARMCC::VS;
+          break;
+        }
+
+        if (Sub) {
+          ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+          if (NewCC == ARMCC::AL)
+            return false;
+          // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+          // on CMP needs to be updated to be based on SUB.
+          // Push the condition code operands to OperandsToUpdate.
+          // If it is safe to remove CmpInstr, the condition code of these
+          // operands will be modified.
+          if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+              Sub->getOperand(2).getReg() == SrcReg) {
+            // VSel doesn't support condition code update.
+            if (IsInstrVSel)
+              return false;
+            OperandsToUpdate.push_back(
+                std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+          }
+        } else {
+          // No Sub, so this is x = <op> y, z; cmp x, 0.
+          switch (CC) {
+          case ARMCC::EQ: // Z
+          case ARMCC::NE: // Z
+          case ARMCC::MI: // N
+          case ARMCC::PL: // N
+          case ARMCC::AL: // none
+            // CPSR can be used multiple times, we should continue.
+            break;
+          case ARMCC::HS: // C
+          case ARMCC::LO: // C
+          case ARMCC::VS: // V
+          case ARMCC::VC: // V
+          case ARMCC::HI: // C Z
+          case ARMCC::LS: // C Z
+          case ARMCC::GE: // N V
+          case ARMCC::LT: // N V
+          case ARMCC::GT: // Z N V
+          case ARMCC::LE: // Z N V
+            // The instruction uses the V bit or C bit which is not safe.
+            return false;
+          }
+        }
+      }
+    }
+
+    // If CPSR is not killed nor re-defined, we should check whether it is
+    // live-out. If it is live-out, do not optimize.
+    if (!isSafe) {
+      MachineBasicBlock *MBB = CmpInstr.getParent();
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+               SE = MBB->succ_end(); SI != SE; ++SI)
+        if ((*SI)->isLiveIn(ARM::CPSR))
+          return false;
+    }
+
+    // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
+    // set CPSR so this is represented as an explicit output)
+    if (!IsThumb1) {
+      MI->getOperand(5).setReg(ARM::CPSR);
+      MI->getOperand(5).setIsDef(true);
+    }
+    assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+    CmpInstr.eraseFromParent();
+
+    // Modify the condition code of operands in OperandsToUpdate.
+    // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+    // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+    for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+      OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+    return true;
+  }
+  }
+  
+  return false;
+}
+
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                     unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  // Fold large immediates into add, sub, or, xor.
+  unsigned DefOpc = DefMI.getOpcode();
+  if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
+    return false;
+  if (!DefMI.getOperand(1).isImm())
+    // Could be t2MOVi32imm <ga:xx>
+    return false;
+
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  const MCInstrDesc &DefMCID = DefMI.getDesc();
+  if (DefMCID.hasOptionalDef()) {
+    unsigned NumOps = DefMCID.getNumOperands();
+    const MachineOperand &MO = DefMI.getOperand(NumOps - 1);
+    if (MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If DefMI defines CPSR and it is not dead, it's obviously not safe
+      // to delete DefMI.
+      return false;
+  }
+
+  const MCInstrDesc &UseMCID = UseMI.getDesc();
+  if (UseMCID.hasOptionalDef()) {
+    unsigned NumOps = UseMCID.getNumOperands();
+    if (UseMI.getOperand(NumOps - 1).getReg() == ARM::CPSR)
+      // If the instruction sets the flag, do not attempt this optimization
+      // since it may change the semantics of the code.
+      return false;
+  }
+
+  unsigned UseOpc = UseMI.getOpcode();
+  unsigned NewUseOpc = 0;
+  uint32_t ImmVal = (uint32_t)DefMI.getOperand(1).getImm();
+  uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
+  bool Commute = false;
+  switch (UseOpc) {
+  default: return false;
+  case ARM::SUBrr:
+  case ARM::ADDrr:
+  case ARM::ORRrr:
+  case ARM::EORrr:
+  case ARM::t2SUBrr:
+  case ARM::t2ADDrr:
+  case ARM::t2ORRrr:
+  case ARM::t2EORrr: {
+    Commute = UseMI.getOperand(2).getReg() != Reg;
+    switch (UseOpc) {
+    default: break;
+    case ARM::ADDrr:
+    case ARM::SUBrr: {
+      if (UseOpc == ARM::SUBrr && Commute)
+        return false;
+
+      // ADD/SUB are special because they're essentially the same operation, so
+      // we can handle a larger range of immediates.
+      if (ARM_AM::isSOImmTwoPartVal(ImmVal))
+        NewUseOpc = UseOpc == ARM::ADDrr ? ARM::ADDri : ARM::SUBri;
+      else if (ARM_AM::isSOImmTwoPartVal(-ImmVal)) {
+        ImmVal = -ImmVal;
+        NewUseOpc = UseOpc == ARM::ADDrr ? ARM::SUBri : ARM::ADDri;
+      } else
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+      break;
+    }
+    case ARM::ORRrr:
+    case ARM::EORrr: {
+      if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
+      case ARM::EORrr: NewUseOpc = ARM::EORri; break;
+      }
+      break;
+    }
+    case ARM::t2ADDrr:
+    case ARM::t2SUBrr: {
+      if (UseOpc == ARM::t2SUBrr && Commute)
+        return false;
+
+      // ADD/SUB are special because they're essentially the same operation, so
+      // we can handle a larger range of immediates.
+      if (ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+        NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2ADDri : ARM::t2SUBri;
+      else if (ARM_AM::isT2SOImmTwoPartVal(-ImmVal)) {
+        ImmVal = -ImmVal;
+        NewUseOpc = UseOpc == ARM::t2ADDrr ? ARM::t2SUBri : ARM::t2ADDri;
+      } else
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+      break;
+    }
+    case ARM::t2ORRrr:
+    case ARM::t2EORrr: {
+      if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
+      case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
+      }
+      break;
+    }
+    }
+  }
+  }
+
+  unsigned OpIdx = Commute ? 2 : 1;
+  unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
+  bool isKill = UseMI.getOperand(OpIdx).isKill();
+  unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+  AddDefaultCC(
+      AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+                             get(NewUseOpc), NewReg)
+                         .addReg(Reg1, getKillRegState(isKill))
+                         .addImm(SOImmValV1)));
+  UseMI.setDesc(get(NewUseOpc));
+  UseMI.getOperand(1).setReg(NewReg);
+  UseMI.getOperand(1).setIsKill();
+  UseMI.getOperand(2).ChangeToImmediate(SOImmValV2);
+  DefMI.eraseFromParent();
+  return true;
+}
+
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+                                        const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = MI.getDesc();
+    int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+    assert(UOps >= 0 && "bad # UOps");
+    return UOps;
+  }
+
+  case ARM::LDRrs:
+  case ARM::LDRBrs:
+  case ARM::STRrs:
+  case ARM::STRBrs: {
+    unsigned ShOpVal = MI.getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRH:
+  case ARM::STRH: {
+    if (!MI.getOperand(2).getReg())
+      return 1;
+
+    unsigned ShOpVal = MI.getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+    return (ARM_AM::getAM3Op(MI.getOperand(3).getImm()) == ARM_AM::sub) ? 3 : 2;
+
+  case ARM::LDRSB_POST:
+  case ARM::LDRSH_POST: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rm = MI.getOperand(3).getReg();
+    return (Rt == Rm) ? 4 : 3;
+  }
+
+  case ARM::LDR_PRE_REG:
+  case ARM::LDRB_PRE_REG: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rm = MI.getOperand(3).getReg();
+    if (Rt == Rm)
+      return 3;
+    unsigned ShOpVal = MI.getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::STR_PRE_REG:
+  case ARM::STRB_PRE_REG: {
+    unsigned ShOpVal = MI.getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::LDRH_PRE:
+  case ARM::STRH_PRE: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rm = MI.getOperand(3).getReg();
+    if (!Rm)
+      return 2;
+    if (Rt == Rm)
+      return 3;
+    return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 3 : 2;
+  }
+
+  case ARM::LDR_POST_REG:
+  case ARM::LDRB_POST_REG:
+  case ARM::LDRH_POST: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rm = MI.getOperand(3).getReg();
+    return (Rt == Rm) ? 3 : 2;
+  }
+
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDRB_POST_IMM:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRH_POST:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+  case ARM::STR_PRE_IMM:
+    return 2;
+
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSH_PRE: {
+    unsigned Rm = MI.getOperand(3).getReg();
+    if (Rm == 0)
+      return 3;
+    unsigned Rt = MI.getOperand(0).getReg();
+    if (Rt == Rm)
+      return 4;
+    unsigned ShOpVal = MI.getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 3;
+    return 4;
+  }
+
+  case ARM::LDRD: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rn = MI.getOperand(2).getReg();
+    unsigned Rm = MI.getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+                                                                          : 3;
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::STRD: {
+    unsigned Rm = MI.getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI.getOperand(4).getImm()) == ARM_AM::sub) ? 4
+                                                                          : 3;
+    return 2;
+  }
+
+  case ARM::LDRD_POST:
+  case ARM::t2LDRD_POST:
+    return 3;
+
+  case ARM::STRD_POST:
+  case ARM::t2STRD_POST:
+    return 4;
+
+  case ARM::LDRD_PRE: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rn = MI.getOperand(3).getReg();
+    unsigned Rm = MI.getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+                                                                          : 4;
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::t2LDRD_PRE: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rn = MI.getOperand(3).getReg();
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::STRD_PRE: {
+    unsigned Rm = MI.getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI.getOperand(5).getImm()) == ARM_AM::sub) ? 5
+                                                                          : 4;
+    return 3;
+  }
+
+  case ARM::t2STRD_PRE:
+    return 3;
+
+  case ARM::t2LDR_POST:
+  case ARM::t2LDRB_POST:
+  case ARM::t2LDRB_PRE:
+  case ARM::t2LDRSBi12:
+  case ARM::t2LDRSBi8:
+  case ARM::t2LDRSBpci:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRH_POST:
+  case ARM::t2LDRH_PRE:
+  case ARM::t2LDRSBT:
+  case ARM::t2LDRSB_POST:
+  case ARM::t2LDRSB_PRE:
+  case ARM::t2LDRSH_POST:
+  case ARM::t2LDRSH_PRE:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSHpci:
+  case ARM::t2LDRSHs:
+    return 2;
+
+  case ARM::t2LDRDi8: {
+    unsigned Rt = MI.getOperand(0).getReg();
+    unsigned Rn = MI.getOperand(2).getReg();
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::t2STRB_POST:
+  case ARM::t2STRB_PRE:
+  case ARM::t2STRBs:
+  case ARM::t2STRDi8:
+  case ARM::t2STRH_POST:
+  case ARM::t2STRH_PRE:
+  case ARM::t2STRHs:
+  case ARM::t2STR_POST:
+  case ARM::t2STR_PRE:
+  case ARM::t2STRs:
+    return 2;
+  }
+}
+
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
+  unsigned Size = 0;
+  for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
+                                  E = MI.memoperands_end();
+       I != E; ++I) {
+    Size += (*I)->getSize();
+  }
+  return Size / 4;
+}
+
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+                                                    unsigned NumRegs) {
+  unsigned UOps = 1 + NumRegs; // 1 for address computation.
+  switch (Opc) {
+  default:
+    break;
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    ++UOps; // One for base register writeback.
+    break;
+  case ARM::LDMIA_RET:
+  case ARM::tPOP_RET:
+  case ARM::t2LDMIA_RET:
+    UOps += 2; // One for base reg wb, one for write to pc.
+    break;
+  }
+  return UOps;
+}
+
+unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                          const MachineInstr &MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned Class = Desc.getSchedClass();
+  int ItinUOps = ItinData->getNumMicroOps(Class);
+  if (ItinUOps >= 0) {
+    if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+      return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
+    return ItinUOps;
+  }
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected multi-uops instruction!");
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+
+  // The number of uOps for load / store multiple are determined by the number
+  // registers.
+  //
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  //
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle.  For VFP / NEON
+  // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD: {
+    unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands();
+    return (NumRegs / 2) + (NumRegs % 2) + 1;
+  }
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    unsigned NumRegs = MI.getNumOperands() - Desc.getNumOperands() + 1;
+    switch (Subtarget.getLdStMultipleTiming()) {
+    case ARMSubtarget::SingleIssuePlusExtras:
+      return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+    case ARMSubtarget::SingleIssue:
+      // Assume the worst.
+      return NumRegs;
+    case ARMSubtarget::DoubleIssue: {
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      unsigned UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++UOps;
+      return UOps;
+    }
+    case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+      unsigned UOps = (NumRegs / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
+          (*MI.memoperands_begin())->getAlignment() < 8)
+        ++UOps;
+      return UOps;
+      }
+    }
+  }
+  }
+  llvm_unreachable("Didn't find the number of microops");
+}
+
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &DefMCID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    DefCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++DefCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = RegNo;
+    bool isSLoad = false;
+
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLDMSIA:
+    case ARM::VLDMSIA_UPD:
+    case ARM::VLDMSDB_UPD:
+      isSLoad = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+      ++DefCycle;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &DefMCID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    DefCycle = RegNo / 2;
+    if (DefCycle < 1)
+      DefCycle = 1;
+    // Result latency is issue cycle + 2: E2.
+    DefCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || DefAlign < 8)
+      ++DefCycle;
+    // Result latency is AGU cycles + 2.
+    DefCycle += 2;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &UseMCID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    // (regno / 2) + (regno % 2) + 1
+    UseCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++UseCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = RegNo;
+    bool isSStore = false;
+
+    switch (UseMCID.getOpcode()) {
+    default: break;
+    case ARM::VSTMSIA:
+    case ARM::VSTMSIA_UPD:
+    case ARM::VSTMSDB_UPD:
+      isSStore = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = RegNo + 2;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &UseMCID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    UseCycle = RegNo / 2;
+    if (UseCycle < 2)
+      UseCycle = 2;
+    // Read in E3.
+    UseCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = 1;
+  }
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MCInstrDesc &DefMCID,
+                                    unsigned DefIdx, unsigned DefAlign,
+                                    const MCInstrDesc &UseMCID,
+                                    unsigned UseIdx, unsigned UseAlign) const {
+  unsigned DefClass = DefMCID.getSchedClass();
+  unsigned UseClass = UseMCID.getSchedClass();
+
+  if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands())
+    return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+  // This may be a def / use of a variable_ops instruction, the operand
+  // latency might be determinable dynamically. Let the target try to
+  // figure it out.
+  int DefCycle = -1;
+  bool LdmBypass = false;
+  switch (DefMCID.getOpcode()) {
+  default:
+    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    break;
+
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+    DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+    LdmBypass = 1;
+    DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+  }
+
+  if (DefCycle == -1)
+    // We can't seem to determine the result latency of the def, assume it's 2.
+    DefCycle = 2;
+
+  int UseCycle = -1;
+  switch (UseMCID.getOpcode()) {
+  default:
+    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+    break;
+
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+    UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+  }
+
+  if (UseCycle == -1)
+    // Assume it's read in the first stage.
+    UseCycle = 1;
+
+  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > 0) {
+    if (LdmBypass) {
+      // It's a variable_ops instruction so we can't use DefIdx here. Just use
+      // first def operand.
+      if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1,
+                                          UseClass, UseIdx))
+        --UseCycle;
+    } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+                                               UseClass, UseIdx)) {
+      --UseCycle;
+    }
+  }
+
+  return UseCycle;
+}
+
+static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &DefIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_iterator I = MI; ++I;
+  MachineBasicBlock::const_instr_iterator II = std::prev(I.getInstrIterator());
+  assert(II->isInsideBundle() && "Empty bundle?");
+
+  int Idx = -1;
+  while (II->isInsideBundle()) {
+    Idx = II->findRegisterDefOperandIdx(Reg, false, true, TRI);
+    if (Idx != -1)
+      break;
+    --II;
+    ++Dist;
+  }
+
+  assert(Idx != -1 && "Cannot find bundled definition!");
+  DefIdx = Idx;
+  return &*II;
+}
+
+static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr &MI, unsigned Reg,
+                                           unsigned &UseIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_instr_iterator II = ++MI.getIterator();
+  assert(II->isInsideBundle() && "Empty bundle?");
+  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+
+  // FIXME: This doesn't properly handle multiple uses.
+  int Idx = -1;
+  while (II != E && II->isInsideBundle()) {
+    Idx = II->findRegisterUseOperandIdx(Reg, false, TRI);
+    if (Idx != -1)
+      break;
+    if (II->getOpcode() != ARM::t2IT)
+      ++Dist;
+    ++II;
+  }
+
+  if (Idx == -1) {
+    Dist = 0;
+    return nullptr;
+  }
+
+  UseIdx = Idx;
+  return &*II;
+}
+
+/// Return the number of cycles to add to (or subtract from) the static
+/// itinerary based on the def opcode and alignment. The caller will ensure that
+/// adjusted latency is at least one cycle.
+static int adjustDefLatency(const ARMSubtarget &Subtarget,
+                            const MachineInstr &DefMI,
+                            const MCInstrDesc &DefMCID, unsigned DefAlign) {
+  int Adjust = 0;
+  if (Subtarget.isCortexA8() || Subtarget.isLikeA9() || Subtarget.isCortexA7()) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI.getOperand(3).getImm();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI.getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Adjust;
+      break;
+    }
+    }
+  } else if (Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI.getOperand(3).getImm();
+      bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (!isSub &&
+          (ShImm == 0 ||
+           ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+            ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+        Adjust -= 2;
+      else if (!isSub &&
+               ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI.getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+        Adjust -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD1d64T:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD1d64Q:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8:
+    case ARM::VLD4DUPd16:
+    case ARM::VLD4DUPd32:
+    case ARM::VLD4DUPd8_UPD:
+    case ARM::VLD4DUPd16_UPD:
+    case ARM::VLD4DUPd32_UPD:
+    case ARM::VLD1LNd8:
+    case ARM::VLD1LNd16:
+    case ARM::VLD1LNd32:
+    case ARM::VLD1LNd8_UPD:
+    case ARM::VLD1LNd16_UPD:
+    case ARM::VLD1LNd32_UPD:
+    case ARM::VLD2LNd8:
+    case ARM::VLD2LNd16:
+    case ARM::VLD2LNd32:
+    case ARM::VLD2LNq16:
+    case ARM::VLD2LNq32:
+    case ARM::VLD2LNd8_UPD:
+    case ARM::VLD2LNd16_UPD:
+    case ARM::VLD2LNd32_UPD:
+    case ARM::VLD2LNq16_UPD:
+    case ARM::VLD2LNq32_UPD:
+    case ARM::VLD4LNd8:
+    case ARM::VLD4LNd16:
+    case ARM::VLD4LNd32:
+    case ARM::VLD4LNq16:
+    case ARM::VLD4LNq32:
+    case ARM::VLD4LNd8_UPD:
+    case ARM::VLD4LNd16_UPD:
+    case ARM::VLD4LNd32_UPD:
+    case ARM::VLD4LNq16_UPD:
+    case ARM::VLD4LNq32_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Adjust;
+      break;
+    }
+  }
+  return Adjust;
+}
+
+int ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                        const MachineInstr &DefMI,
+                                        unsigned DefIdx,
+                                        const MachineInstr &UseMI,
+                                        unsigned UseIdx) const {
+  // No operand latency. The caller may fall back to getInstrLatency.
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+
+  const MachineInstr *ResolvedDefMI = &DefMI;
+  unsigned DefAdj = 0;
+  if (DefMI.isBundle())
+    ResolvedDefMI =
+        getBundledDefMI(&getRegisterInfo(), &DefMI, Reg, DefIdx, DefAdj);
+  if (ResolvedDefMI->isCopyLike() || ResolvedDefMI->isInsertSubreg() ||
+      ResolvedDefMI->isRegSequence() || ResolvedDefMI->isImplicitDef()) {
+    return 1;
+  }
+
+  const MachineInstr *ResolvedUseMI = &UseMI;
+  unsigned UseAdj = 0;
+  if (UseMI.isBundle()) {
+    ResolvedUseMI =
+        getBundledUseMI(&getRegisterInfo(), UseMI, Reg, UseIdx, UseAdj);
+    if (!ResolvedUseMI)
+      return -1;
+  }
+
+  return getOperandLatencyImpl(
+      ItinData, *ResolvedDefMI, DefIdx, ResolvedDefMI->getDesc(), DefAdj, DefMO,
+      Reg, *ResolvedUseMI, UseIdx, ResolvedUseMI->getDesc(), UseAdj);
+}
+
+int ARMBaseInstrInfo::getOperandLatencyImpl(
+    const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+    unsigned DefIdx, const MCInstrDesc &DefMCID, unsigned DefAdj,
+    const MachineOperand &DefMO, unsigned Reg, const MachineInstr &UseMI,
+    unsigned UseIdx, const MCInstrDesc &UseMCID, unsigned UseAdj) const {
+  if (Reg == ARM::CPSR) {
+    if (DefMI.getOpcode() == ARM::FMSTAT) {
+      // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+      return Subtarget.isLikeA9() ? 1 : 20;
+    }
+
+    // CPSR set and branch can be paired in the same cycle.
+    if (UseMI.isBranch())
+      return 0;
+
+    // Otherwise it takes the instruction latency (generally one).
+    unsigned Latency = getInstrLatency(ItinData, DefMI);
+
+    // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to
+    // its uses. Instructions which are otherwise scheduled between them may
+    // incur a code size penalty (not able to use the CPSR setting 16-bit
+    // instructions).
+    if (Latency > 0 && Subtarget.isThumb2()) {
+      const MachineFunction *MF = DefMI.getParent()->getParent();
+      // FIXME: Use Function::optForSize().
+      if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
+        --Latency;
+    }
+    return Latency;
+  }
+
+  if (DefMO.isImplicit() || UseMI.getOperand(UseIdx).isImplicit())
+    return -1;
+
+  unsigned DefAlign = DefMI.hasOneMemOperand()
+                          ? (*DefMI.memoperands_begin())->getAlignment()
+                          : 0;
+  unsigned UseAlign = UseMI.hasOneMemOperand()
+                          ? (*UseMI.memoperands_begin())->getAlignment()
+                          : 0;
+
+  // Get the itinerary's latency if possible, and handle variable_ops.
+  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID,
+                                  UseIdx, UseAlign);
+  // Unable to find operand latency. The caller may resort to getInstrLatency.
+  if (Latency < 0)
+    return Latency;
+
+  // Adjust for IT block position.
+  int Adj = DefAdj + UseAdj;
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  // Return the itinerary latency, which may be zero but not less than zero.
+  return Latency;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode());
+
+  if (isZeroCost(DefMCID.Opcode))
+    return 0;
+
+  if (!ItinData || ItinData->isEmpty())
+    return DefMCID.mayLoad() ? 3 : 1;
+
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
+    int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+    int Threshold = 1 + Adj;
+    return Latency <= Threshold ? 1 : Latency - Adj;
+  }
+
+  const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
+  const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
+                                  UseMCID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isLikeA9() ||
+       Subtarget.isCortexA7())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+           ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        Latency -= 2;
+      else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl 0-3 only.
+      Latency -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
+  return Latency;
+}
+
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
+  if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+      MI.isImplicitDef())
+    return 0;
+
+  if (MI.isBundle())
+    return 0;
+
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    return 1;
+  }
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr &MI,
+                                           unsigned *PredCost) const {
+  if (MI.isCopyLike() || MI.isInsertSubreg() || MI.isRegSequence() ||
+      MI.isImplicitDef())
+    return 1;
+
+  // An instruction scheduler typically runs on unbundled instructions, however
+  // other passes may query the latency of a bundled instruction.
+  if (MI.isBundle()) {
+    unsigned Latency = 0;
+    MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+    MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      if (I->getOpcode() != ARM::t2IT)
+        Latency += getInstrLatency(ItinData, *I, PredCost);
+    }
+    return Latency;
+  }
+
+  const MCInstrDesc &MCID = MI.getDesc();
+  if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    *PredCost = 1;
+  }
+  // Be sure to call getStageLatency for an empty itinerary in case it has a
+  // valid MinLatency property.
+  if (!ItinData)
+    return MI.mayLoad() ? 3 : 1;
+
+  unsigned Class = MCID.getSchedClass();
+
+  // For instructions with variable uops, use uops as latency.
+  if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0)
+    return getNumMicroOps(ItinData, MI);
+
+  // For the common case, fall back on the itinerary's latency.
+  unsigned Latency = ItinData->getStageLatency(Class);
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  unsigned DefAlign =
+      MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0;
+  int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  return Latency;
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+  }
+}
+
+bool ARMBaseInstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                                             const MachineRegisterInfo *MRI,
+                                             const MachineInstr &DefMI,
+                                             unsigned DefIdx,
+                                             const MachineInstr &UseMI,
+                                             unsigned UseIdx) const {
+  unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
+  unsigned UDomain = UseMI.getDesc().TSFlags & ARMII::DomainMask;
+  if (Subtarget.nonpipelinedVFP() &&
+      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
+  unsigned Latency =
+      SchedModel.computeOperandLatency(&DefMI, DefIdx, &UseMI, UseIdx);
+  if (Latency <= 3)
+    return false;
+  return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
+         UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
+}
+
+bool ARMBaseInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+                                        const MachineInstr &DefMI,
+                                        unsigned DefIdx) const {
+  const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
+  if (!ItinData || ItinData->isEmpty())
+    return false;
+
+  unsigned DDomain = DefMI.getDesc().TSFlags & ARMII::DomainMask;
+  if (DDomain == ARMII::DomainGeneral) {
+    unsigned DefClass = DefMI.getDesc().getSchedClass();
+    int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    return (DefCycle != -1 && DefCycle <= 2);
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
+                                         StringRef &ErrInfo) const {
+  if (convertAddSubFlagsOpcode(MI.getOpcode())) {
+    ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+    return false;
+  }
+  return true;
+}
+
+// LoadStackGuard has so far only been implemented for MachO. Different code
+// sequence is needed for other targets.
+void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+                                                unsigned LoadImmOpc,
+                                                unsigned LoadOpc) const {
+  assert(!Subtarget.isROPI() && !Subtarget.isRWPI() &&
+         "ROPI/RWPI not currently supported with stack guard");
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Reg = MI->getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+  MachineInstrBuilder MIB;
+
+  BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
+      .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+
+  if (Subtarget.isGVIndirectSymbol(GV)) {
+    MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+    MIB.addReg(Reg, RegState::Kill).addImm(0);
+    auto Flags = MachineMemOperand::MOLoad |
+                 MachineMemOperand::MODereferenceable |
+                 MachineMemOperand::MOInvariant;
+    MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+        MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+    MIB.addMemOperand(MMO);
+    AddDefaultPred(MIB);
+  }
+
+  MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+  MIB.addReg(Reg, RegState::Kill).addImm(0);
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  AddDefaultPred(MIB);
+}
+
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
+  if (I == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
+  MulOpc = Entry.MulOpc;
+  AddSubOpc = Entry.AddSubOpc;
+  NegAcc = Entry.NegAcc;
+  HasLane = Entry.HasLane;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both.  The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain {
+  ExeGeneric = 0,
+  ExeVFP = 1,
+  ExeNEON = 2
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  // If we don't have access to NEON instructions then we won't be able
+  // to swizzle anything to the NEON domain. Check to make sure.
+  if (Subtarget.hasNEON()) {
+    // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON
+    // if they are not predicated.
+    if (MI.getOpcode() == ARM::VMOVD && !isPredicated(MI))
+      return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
+
+    // CortexA9 is particularly picky about mixing the two and wants these
+    // converted.
+    if (Subtarget.useNEONForFPMovs() && !isPredicated(MI) &&
+        (MI.getOpcode() == ARM::VMOVRS || MI.getOpcode() == ARM::VMOVSR ||
+         MI.getOpcode() == ARM::VMOVS))
+      return std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON));
+  }
+  // No other instructions can be swizzled, so just determine their domain.
+  unsigned Domain = MI.getDesc().TSFlags & ARMII::DomainMask;
+
+  if (Domain & ARMII::DomainNEON)
+    return std::make_pair(ExeNEON, 0);
+
+  // Certain instructions can go either way on Cortex-A8.
+  // Treat them as NEON instructions.
+  if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8())
+    return std::make_pair(ExeNEON, 0);
+
+  if (Domain & ARMII::DomainVFP)
+    return std::make_pair(ExeVFP, 0);
+
+  return std::make_pair(ExeGeneric, 0);
+}
+
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+                                            unsigned SReg, unsigned &Lane) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+  Lane = 0;
+
+  if (DReg != ARM::NoRegister)
+   return DReg;
+
+  Lane = 1;
+  DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+  assert(DReg && "S-register with no D super-register?");
+  return DReg;
+}
+
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to a register number that must be marked as implicit-use or
+/// zero if no register needs to be defined as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added. Else,
+/// (including the case where the DPR itself is defined), it should not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+                                       MachineInstr &MI, unsigned DReg,
+                                       unsigned Lane, unsigned &ImplicitSReg) {
+  // If the DPR is defined or used already, the other SPR lane will be chained
+  // correctly, so there is nothing to be done.
+  if (MI.definesRegister(DReg, TRI) || MI.readsRegister(DReg, TRI)) {
+    ImplicitSReg = 0;
+    return true;
+  }
+
+  // Otherwise we need to go searching to see if the SPR is set explicitly.
+  ImplicitSReg = TRI->getSubReg(DReg,
+                                (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+  MachineBasicBlock::LivenessQueryResult LQR =
+      MI.getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+  if (LQR == MachineBasicBlock::LQR_Live)
+    return true;
+  else if (LQR == MachineBasicBlock::LQR_Unknown)
+    return false;
+
+  // If the register is known not to be live, there is no need to add an
+  // implicit-use.
+  ImplicitSReg = 0;
+  return true;
+}
+
+void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
+                                          unsigned Domain) const {
+  unsigned DstReg, SrcReg, DReg;
+  unsigned Lane;
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("cannot handle opcode!");
+    break;
+  case ARM::VMOVD:
+    if (Domain != ExeNEON)
+      break;
+
+    // Zap the predicate operands.
+    assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+
+    // Make sure we've got NEON instructions.
+    assert(Subtarget.hasNEON() && "VORRd requires NEON");
+
+    // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+
+    for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+      MI.RemoveOperand(i - 1);
+
+    // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+    MI.setDesc(get(ARM::VORRd));
+    AddDefaultPred(
+        MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+    break;
+  case ARM::VMOVRS:
+    if (Domain != ExeNEON)
+      break;
+    assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+
+    // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+
+    for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+      MI.RemoveOperand(i - 1);
+
+    DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+
+    // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+    // Note that DSrc has been widened and the other lane may be undef, which
+    // contaminates the entire register.
+    MI.setDesc(get(ARM::VGETLNi32));
+    AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                       .addReg(DReg, RegState::Undef)
+                       .addImm(Lane));
+
+    // The old source should be an implicit use, otherwise we might think it
+    // was dead before here.
+    MIB.addReg(SrcReg, RegState::Implicit);
+    break;
+  case ARM::VMOVSR: {
+    if (Domain != ExeNEON)
+      break;
+    assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+
+    // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+
+    DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+
+    unsigned ImplicitSReg;
+    if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+      break;
+
+    for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+      MI.RemoveOperand(i - 1);
+
+    // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+    // Again DDst may be undefined at the beginning of this instruction.
+    MI.setDesc(get(ARM::VSETLNi32));
+    MIB.addReg(DReg, RegState::Define)
+        .addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
+        .addReg(SrcReg)
+        .addImm(Lane);
+    AddDefaultPred(MIB);
+
+    // The narrower destination must be marked as set to keep previous chains
+    // in place.
+    MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+    if (ImplicitSReg != 0)
+      MIB.addReg(ImplicitSReg, RegState::Implicit);
+    break;
+    }
+    case ARM::VMOVS: {
+      if (Domain != ExeNEON)
+        break;
+
+      // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+
+      unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+      DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+      DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+      unsigned ImplicitSReg;
+      if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+        break;
+
+      for (unsigned i = MI.getDesc().getNumOperands(); i; --i)
+        MI.RemoveOperand(i - 1);
+
+      if (DSrc == DDst) {
+        // Destination can be:
+        //     %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+        MI.setDesc(get(ARM::VDUPLN32d));
+        MIB.addReg(DDst, RegState::Define)
+            .addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
+            .addImm(SrcLane);
+        AddDefaultPred(MIB);
+
+        // Neither the source or the destination are naturally represented any
+        // more, so add them in manually.
+        MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+        MIB.addReg(SrcReg, RegState::Implicit);
+        if (ImplicitSReg != 0)
+          MIB.addReg(ImplicitSReg, RegState::Implicit);
+        break;
+      }
+
+      // In general there's no single instruction that can perform an S <-> S
+      // move in NEON space, but a pair of VEXT instructions *can* do the
+      // job. It turns out that the VEXTs needed will only use DSrc once, with
+      // the position based purely on the combination of lane-0 and lane-1
+      // involved. For example
+      //     vmov s0, s2 -> vext.32 d0, d0, d1, #1  vext.32 d0, d0, d0, #1
+      //     vmov s1, s3 -> vext.32 d0, d1, d0, #1  vext.32 d0, d0, d0, #1
+      //     vmov s0, s3 -> vext.32 d0, d0, d0, #1  vext.32 d0, d1, d0, #1
+      //     vmov s1, s2 -> vext.32 d0, d0, d0, #1  vext.32 d0, d0, d1, #1
+      //
+      // Pattern of the MachineInstrs is:
+      //     %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+      MachineInstrBuilder NewMIB;
+      NewMIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::VEXTd32),
+                       DDst);
+
+      // On the first instruction, both DSrc and DDst may be <undef> if present.
+      // Specifically when the original instruction didn't have them as an
+      // <imp-use>.
+      unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+      bool CurUndef = !MI.readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = !MI.readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      NewMIB.addImm(1);
+      AddDefaultPred(NewMIB);
+
+      if (SrcLane == DstLane)
+        NewMIB.addReg(SrcReg, RegState::Implicit);
+
+      MI.setDesc(get(ARM::VEXTd32));
+      MIB.addReg(DDst, RegState::Define);
+
+      // On the second instruction, DDst has definitely been defined above, so
+      // it is not <undef>. DSrc, if present, can be <undef> as above.
+      CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      MIB.addImm(1);
+      AddDefaultPred(MIB);
+
+      if (SrcLane != DstLane)
+        MIB.addReg(SrcReg, RegState::Implicit);
+
+      // As before, the original destination is no longer represented, add it
+      // implicitly.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      if (ImplicitSReg != 0)
+        MIB.addReg(ImplicitSReg, RegState::Implicit);
+      break;
+    }
+  }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity.  That means any
+// instruction writing an S-reg implicitly reads the containing D-reg.  The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+unsigned ARMBaseInstrInfo::getPartialRegUpdateClearance(
+    const MachineInstr &MI, unsigned OpNum,
+    const TargetRegisterInfo *TRI) const {
+  auto PartialUpdateClearance = Subtarget.getPartialUpdateClearance();
+  if (!PartialUpdateClearance)
+    return 0;
+
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI.getOperand(OpNum);
+  if (MO.readsReg())
+    return 0;
+  unsigned Reg = MO.getReg();
+  int UseOp = -1;
+
+  switch (MI.getOpcode()) {
+  // Normal instructions writing only an S-register.
+  case ARM::VLDRS:
+  case ARM::FCONSTS:
+  case ARM::VMOVSR:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv1i64:
+    UseOp = MI.findRegisterUseOperandIdx(Reg, false, TRI);
+    break;
+
+    // Explicitly reads the dependency.
+  case ARM::VLD1LNd32:
+    UseOp = 3;
+    break;
+  default:
+    return 0;
+  }
+
+  // If this instruction actually reads a value from Reg, there is no unwanted
+  // dependency.
+  if (UseOp != -1 && MI.getOperand(UseOp).readsReg())
+    return 0;
+
+  // We must be able to clobber the whole D-reg.
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Virtual register must be a foo:ssub_0<def,undef> operand.
+    if (!MO.getSubReg() || MI.readsVirtualRegister(Reg))
+      return 0;
+  } else if (ARM::SPRRegClass.contains(Reg)) {
+    // Physical register: MI must define the full D-reg.
+    unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+                                             &ARM::DPRRegClass);
+    if (!DReg || !MI.definesRegister(DReg, TRI))
+      return 0;
+  }
+
+  // MI has an unwanted D-register dependency.
+  // Avoid defs in the previous N instructrions.
+  return PartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::breakPartialRegDependency(
+    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+  assert(OpNum < MI.getDesc().getNumDefs() && "OpNum is not a def");
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI.getOperand(OpNum);
+  unsigned Reg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "Can't break virtual register dependencies.");
+  unsigned DReg = Reg;
+
+  // If MI defines an S-reg, find the corresponding D super-register.
+  if (ARM::SPRRegClass.contains(Reg)) {
+    DReg = ARM::D0 + (Reg - ARM::S0) / 2;
+    assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+  }
+
+  assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+  assert(MI.definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+  // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+  // the full D-register by loading the same value to both lanes.  The
+  // instruction is micro-coded with 2 uops, so don't do this until we can
+  // properly schedule micro-coded instructions.  The dispatcher stalls cause
+  // too big regressions.
+
+  // Insert the dependency-breaking FCONSTD before MI.
+  // 96 is the encoding of 0.5, but the actual value doesn't matter here.
+  AddDefaultPred(
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+          .addImm(96));
+  MI.addRegisterKilled(DReg, TRI, true);
+}
+
+bool ARMBaseInstrInfo::hasNOP() const {
+  return Subtarget.getFeatureBits()[ARM::HasV6KOps];
+}
+
+bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+  if (MI->getNumOperands() < 4)
+    return true;
+  unsigned ShOpVal = MI->getOperand(3).getImm();
+  unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal);
+  // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1.
+  if ((ShImm == 1 && ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsr) ||
+      ((ShImm == 1 || ShImm == 2) &&
+       ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsl))
+    return true;
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::getRegSequenceLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isRegSequenceLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VMOVDRR:
+    // dX = VMOVDRR rY, rZ
+    // is the same as:
+    // dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1
+    // Populate the InputRegs accordingly.
+    // rY
+    const MachineOperand *MOReg = &MI.getOperand(1);
+    InputRegs.push_back(
+        RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_0));
+    // rZ
+    MOReg = &MI.getOperand(2);
+    InputRegs.push_back(
+        RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_1));
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}
+
+bool ARMBaseInstrInfo::getExtractSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    RegSubRegPairAndIdx &InputReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isExtractSubregLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VMOVRRD:
+    // rX, rY = VMOVRRD dZ
+    // is the same as:
+    // rX = EXTRACT_SUBREG dZ, ssub_0
+    // rY = EXTRACT_SUBREG dZ, ssub_1
+    const MachineOperand &MOReg = MI.getOperand(2);
+    InputReg.Reg = MOReg.getReg();
+    InputReg.SubReg = MOReg.getSubReg();
+    InputReg.SubIdx = DefIdx == 0 ? ARM::ssub_0 : ARM::ssub_1;
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}
+
+bool ARMBaseInstrInfo::getInsertSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx, RegSubRegPair &BaseReg,
+    RegSubRegPairAndIdx &InsertedReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isInsertSubregLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VSETLNi32:
+    // dX = VSETLNi32 dY, rZ, imm
+    const MachineOperand &MOBaseReg = MI.getOperand(1);
+    const MachineOperand &MOInsertedReg = MI.getOperand(2);
+    const MachineOperand &MOIndex = MI.getOperand(3);
+    BaseReg.Reg = MOBaseReg.getReg();
+    BaseReg.SubReg = MOBaseReg.getSubReg();
+
+    InsertedReg.Reg = MOInsertedReg.getReg();
+    InsertedReg.SubReg = MOInsertedReg.getSubReg();
+    InsertedReg.SubIdx = MOIndex.getImm() == 0 ? ARM::ssub_0 : ARM::ssub_1;
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 000000000000..b01d5c8ec85f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,522 @@
+//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "ARMGenInstrInfo.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseRegisterInfo;
+
+class ARMBaseInstrInfo : public ARMGenInstrInfo {
+  const ARMSubtarget &Subtarget;
+
+protected:
+  // Can be only subclassed.
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
+  void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+                                unsigned LoadImmOpc, unsigned LoadOpc) const;
+
+  /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
+  /// the list is modeled as <Reg:SubReg, SubIdx>.
+  /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
+  /// two elements:
+  /// - vreg1:sub1, sub0
+  /// - vreg2<:0>, sub1
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isRegSequenceLike().
+  bool getRegSequenceLikeInputs(
+      const MachineInstr &MI, unsigned DefIdx,
+      SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override;
+
+  /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
+  /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
+  /// - vreg1:sub1, sub0
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isExtractSubregLike().
+  bool getExtractSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+                                  RegSubRegPairAndIdx &InputReg) const override;
+
+  /// Build the equivalent inputs of a INSERT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] BaseReg and \p [out] InsertedReg contain
+  /// the equivalent inputs of INSERT_SUBREG.
+  /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
+  /// - BaseReg: vreg0:sub0
+  /// - InsertedReg: vreg1:sub1, sub3
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isInsertSubregLike().
+  bool
+  getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+                            RegSubRegPair &BaseReg,
+                            RegSubRegPairAndIdx &InsertedReg) const override;
+
+  /// Commutes the operands in the given instruction.
+  /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+  ///
+  /// Do not call this method for a non-commutable instruction or for
+  /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned OpIdx1,
+                                       unsigned OpIdx2) const override;
+
+public:
+  // Return whether the target has an explicit NOP encoding.
+  bool hasNOP() const;
+
+  virtual void getNoopForElfTarget(MCInst &NopInst) const {
+    getNoopForMachoTarget(NopInst);
+  }
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineInstr &MI,
+                                      LiveVariables *LV) const override;
+
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
+  const ARMSubtarget &getSubtarget() const { return Subtarget; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                               const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  // Branch analysis.
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  // Predication support.
+  bool isPredicated(const MachineInstr &MI) const override;
+
+  ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
+    int PIdx = MI.findFirstPredOperandIdx();
+    return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
+                      : ARMCC::AL;
+  }
+
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Pred) const override;
+
+  bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                         ArrayRef<MachineOperand> Pred2) const override;
+
+  bool DefinesPredicate(MachineInstr &MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  bool isPredicable(MachineInstr &MI) const override;
+
+  /// GetInstSize - Returns the size of the specified MachineInstr.
+  ///
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                     int &FrameIndex) const override;
+  unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                    int &FrameIndex) const override;
+
+  void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  unsigned SrcReg, bool KillSrc,
+                  const ARMSubtarget &Subtarget) const;
+  void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned DestReg, bool KillSrc,
+                    const ARMSubtarget &Subtarget) const;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr &Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
+  MachineInstr *duplicate(MachineInstr &Orig,
+                          MachineFunction &MF) const override;
+
+  const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
+                                     unsigned SubIdx, unsigned State,
+                                     const TargetRegisterInfo *TRI) const;
+
+  bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1,
+                        const MachineRegisterInfo *MRI) const override;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+  /// determine if two loads are loading from the same base address. It should
+  /// only return true if the base pointers are the same and the only
+  /// differences between the two addresses is the offset. It also returns the
+  /// offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                           unsigned NumCycles, unsigned ExtraPredCycles,
+                           BranchProbability Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
+                           unsigned ExtraT, MachineBasicBlock &FMBB,
+                           unsigned NumF, unsigned ExtraF,
+                           BranchProbability Probability) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                 BranchProbability Probability) const override {
+    return NumCycles == 1;
+  }
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+
+  /// optimizeCompareInstr - Convert the instruction to set the zero flag so
+  /// that we can remove a "comparison with zero"; Remove a redundant CMP
+  /// instruction if the flags can be updated in the same way by an earlier
+  /// instruction such as SUB.
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+  bool analyzeSelect(const MachineInstr &MI,
+                     SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+                     unsigned &FalseOp, bool &Optimizable) const override;
+
+  MachineInstr *optimizeSelect(MachineInstr &MI,
+                               SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                               bool) const override;
+
+  /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+  /// instruction, try to fold the immediate into the use instruction.
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+                     MachineRegisterInfo *MRI) const override;
+
+  unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                          const MachineInstr &MI) const override;
+
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr &DefMI, unsigned DefIdx,
+                        const MachineInstr &UseMI,
+                        unsigned UseIdx) const override;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const override;
+
+  /// VFP/NEON execution domains.
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr &MI) const override;
+  void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+
+  unsigned
+  getPartialRegUpdateClearance(const MachineInstr &, unsigned,
+                               const TargetRegisterInfo *) const override;
+  void breakPartialRegDependency(MachineInstr &, unsigned,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  /// Get the number of addresses by LDM or VLDM or zero for unknown.
+  unsigned getNumLDMAddresses(const MachineInstr &MI) const;
+
+private:
+  unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+  int getVLDMDefCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &DefMCID,
+                      unsigned DefClass,
+                      unsigned DefIdx, unsigned DefAlign) const;
+  int getLDMDefCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &DefMCID,
+                     unsigned DefClass,
+                     unsigned DefIdx, unsigned DefAlign) const;
+  int getVSTMUseCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &UseMCID,
+                      unsigned UseClass,
+                      unsigned UseIdx, unsigned UseAlign) const;
+  int getSTMUseCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &UseMCID,
+                     unsigned UseClass,
+                     unsigned UseIdx, unsigned UseAlign) const;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MCInstrDesc &DefMCID,
+                        unsigned DefIdx, unsigned DefAlign,
+                        const MCInstrDesc &UseMCID,
+                        unsigned UseIdx, unsigned UseAlign) const;
+
+  int getOperandLatencyImpl(const InstrItineraryData *ItinData,
+                            const MachineInstr &DefMI, unsigned DefIdx,
+                            const MCInstrDesc &DefMCID, unsigned DefAdj,
+                            const MachineOperand &DefMO, unsigned Reg,
+                            const MachineInstr &UseMI, unsigned UseIdx,
+                            const MCInstrDesc &UseMCID, unsigned UseAdj) const;
+
+  unsigned getPredicationCost(const MachineInstr &MI) const override;
+
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr &MI,
+                           unsigned *PredCost = nullptr) const override;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const override;
+
+  bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr &DefMI, unsigned DefIdx,
+                             const MachineInstr &UseMI,
+                             unsigned UseIdx) const override;
+  bool hasLowDefLatency(const TargetSchedModel &SchedModel,
+                        const MachineInstr &DefMI,
+                        unsigned DefIdx) const override;
+
+  /// verifyInstruction - Perform target specific instruction verification.
+  bool verifyInstruction(const MachineInstr &MI,
+                         StringRef &ErrInfo) const override;
+
+  virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI) const = 0;
+
+  void expandMEMCPY(MachineBasicBlock::iterator) const;
+
+private:
+  /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+  /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+  /// MLx table.
+  DenseMap<unsigned, unsigned> MLxEntryMap;
+
+  /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+  /// stalls when scheduled together with fp MLA / MLS opcodes.
+  SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+  /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
+  /// instruction.
+  bool isFpMLxInstruction(unsigned Opcode) const {
+    return MLxEntryMap.count(Opcode);
+  }
+
+  /// isFpMLxInstruction - This version also returns the multiply opcode and the
+  /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
+  /// the MLX instructions with an extra lane operand.
+  bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                          unsigned &AddSubOpc, bool &NegAcc,
+                          bool &HasLane) const;
+
+  /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
+  /// will cause stalls when scheduled after (within 4-cycle window) a fp
+  /// MLA / MLS instruction.
+  bool canCauseFpMLxStall(unsigned Opcode) const {
+    return MLxHazardOpcodes.count(Opcode);
+  }
+
+  /// Returns true if the instruction has a shift by immediate that can be
+  /// executed in one cycle less.
+  bool isSwiftFastImmShift(const MachineInstr *MI) const;
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+  return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+                                          bool isDead = false) {
+  return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+  return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+  return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+  return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+    Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+static inline
+bool isIndirectBranchOpcode(int Opc) {
+  return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
+}
+
+static inline bool isPopOpcode(int Opc) {
+  return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+         Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+         Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+  return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+         Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+
+unsigned getMatchingCondBranchOpcode(unsigned Opc);
+
+/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
+/// opcode of the SSA instruction representing the conditional MI.
+unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
+                                  MachineInstr *&MI,
+                                  const MachineRegisterInfo &MRI);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             const DebugLoc &dl, unsigned DestReg,
+                             unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI,
+                            const DebugLoc &dl, unsigned DestReg,
+                            unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI,
+                               const DebugLoc &dl, unsigned DestReg,
+                               unsigned BaseReg, int NumBytes,
+                               const TargetInstrInfo &TII,
+                               const ARMBaseRegisterInfo &MRI,
+                               unsigned MIFlags = 0);
+
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
+                                MachineFunction &MF, MachineInstr *MI,
+                                unsigned NumBytes);
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 000000000000..d995c631dd1c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,843 @@
+//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseRegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define DEBUG_TYPE "arm-register-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "ARMGenRegisterInfo.inc"
+
+using namespace llvm;
+
+ARMBaseRegisterInfo::ARMBaseRegisterInfo()
+    : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
+
+static unsigned getFramePointerReg(const ARMSubtarget &STI) {
+  return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
+}
+
+const MCPhysReg*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const ARMSubtarget &STI = MF->getSubtarget<ARMSubtarget>();
+  bool UseSplitPush = STI.splitFramePushPop(*MF);
+  const MCPhysReg *RegList =
+      STI.isTargetDarwin()
+          ? CSR_iOS_SaveList
+          : (UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList);
+
+  const Function *F = MF->getFunction();
+  if (F->getCallingConv() == CallingConv::GHC) {
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_NoRegs_SaveList;
+  } else if (F->hasFnAttribute("interrupt")) {
+    if (STI.isMClass()) {
+      // M-class CPUs have hardware which saves the registers needed to allow a
+      // function conforming to the AAPCS to function as a handler.
+      return UseSplitPush ? CSR_AAPCS_SplitPush_SaveList : CSR_AAPCS_SaveList;
+    } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so less
+      // need to be saved to restore user-mode state.
+      return CSR_FIQ_SaveList;
+    } else {
+      // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+      // exception handling.
+      return CSR_GenericInt_SaveList;
+    }
+  }
+
+  if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+      F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_iOS_SwiftError_SaveList;
+
+  if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS)
+    return MF->getInfo<ARMFunctionInfo>()->isSplitCSR()
+               ? CSR_iOS_CXX_TLS_PE_SaveList
+               : CSR_iOS_CXX_TLS_SaveList;
+  return RegList;
+}
+
+const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+      MF->getInfo<ARMFunctionInfo>()->isSplitCSR())
+    return CSR_iOS_CXX_TLS_ViaCopy_SaveList;
+  return nullptr;
+}
+
+const uint32_t *
+ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_NoRegs_RegMask;
+
+  if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return CSR_iOS_SwiftError_RegMask;
+
+  if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS)
+    return CSR_iOS_CXX_TLS_RegMask;
+  return STI.isTargetDarwin() ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *
+ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const {
+  assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() &&
+         "only know about special TLS call on Darwin");
+  return CSR_iOS_TLSCall_RegMask;
+}
+
+const uint32_t *
+ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (!STI.useSoftFloat() && STI.hasVFP2() && !STI.isThumb1Only())
+    return CSR_NoRegs_RegMask;
+  else
+    return CSR_FPRegs_RegMask;
+}
+
+
+const uint32_t *
+ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
+                                                CallingConv::ID CC) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i32 argument (which must also be the register used to return a
+  // single i32 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both or otherwise does not want to enable this optimization, the function
+  // should return NULL
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return nullptr;
+  return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask
+                              : CSR_AAPCS_ThisReturn_RegMask;
+}
+
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  markSuperRegs(Reserved, ARM::SP);
+  markSuperRegs(Reserved, ARM::PC);
+  markSuperRegs(Reserved, ARM::FPSCR);
+  markSuperRegs(Reserved, ARM::APSR_NZCV);
+  if (TFI->hasFP(MF))
+    markSuperRegs(Reserved, getFramePointerReg(STI));
+  if (hasBasePointer(MF))
+    markSuperRegs(Reserved, BasePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    markSuperRegs(Reserved, ARM::R9);
+  // Reserve D16-D31 if the subtarget doesn't support them.
+  if (!STI.hasVFP3() || STI.hasD16()) {
+    static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!");
+    for (unsigned R = 0; R < 16; ++R)
+      markSuperRegs(Reserved, ARM::D16 + R);
+  }
+  const TargetRegisterClass *RC  = &ARM::GPRPairRegClass;
+  for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
+    for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
+      if (Reserved.test(*SI)) markSuperRegs(Reserved, *I);
+
+  assert(checkAllSuperRegsMarked(Reserved));
+  return Reserved;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                               const MachineFunction &) const {
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case ARM::GPRRegClassID:
+    case ARM::SPRRegClassID:
+    case ARM::DPRRegClassID:
+    case ARM::QPRRegClassID:
+    case ARM::QQPRRegClassID:
+    case ARM::QQQQPRRegClassID:
+    case ARM::GPRPairRegClassID:
+      return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &ARM::GPRRegClass;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &ARM::CCRRegClass)
+    return &ARM::rGPRRegClass;  // Can't copy CCR registers.
+  return RC;
+}
+
+unsigned
+ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case ARM::tGPRRegClassID:
+    return TFI->hasFP(MF) ? 4 : 5;
+  case ARM::GPRRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 10 - FP - (STI.isR9Reserved() ? 1 : 0);
+  }
+  case ARM::SPRRegClassID:  // Currently not used as 'rep' register class.
+  case ARM::DPRRegClassID:
+    return 32 - 10;
+  }
+}
+
+// Get the other register in a GPRPair.
+static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+  for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers)
+    if (ARM::GPRPairRegClass.contains(*Supers))
+      return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0);
+  return 0;
+}
+
+// Resolve the RegPairEven / RegPairOdd register allocator hints.
+void
+ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM,
+                                           const LiveRegMatrix *Matrix) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  unsigned Odd;
+  switch (Hint.first) {
+  case ARMRI::RegPairEven:
+    Odd = 0;
+    break;
+  case ARMRI::RegPairOdd:
+    Odd = 1;
+    break;
+  default:
+    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+    return;
+  }
+
+  // This register should preferably be even (Odd == 0) or odd (Odd == 1).
+  // Check if the other part of the pair has already been assigned, and provide
+  // the paired register as the first hint.
+  unsigned Paired = Hint.second;
+  if (Paired == 0)
+    return;
+
+  unsigned PairedPhys = 0;
+  if (TargetRegisterInfo::isPhysicalRegister(Paired)) {
+    PairedPhys = Paired;
+  } else if (VRM && VRM->hasPhys(Paired)) {
+    PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this);
+  }
+
+  // First prefer the paired physreg.
+  if (PairedPhys && is_contained(Order, PairedPhys))
+    Hints.push_back(PairedPhys);
+
+  // Then prefer even or odd registers.
+  for (unsigned I = 0, E = Order.size(); I != E; ++I) {
+    unsigned Reg = Order[I];
+    if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
+      continue;
+    // Don't provide hints that are paired to a reserved register.
+    unsigned Paired = getPairedGPR(Reg, !Odd, this);
+    if (!Paired || MRI.isReserved(Paired))
+      continue;
+    Hints.push_back(Reg);
+  }
+}
+
+void
+ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
+                                        MachineFunction &MF) const {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+  if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+       Hint.first == (unsigned)ARMRI::RegPairEven) &&
+      TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+    // If 'Reg' is one of the even / odd register pair and it's now changed
+    // (e.g. coalesced) into a different register. The other register of the
+    // pair allocation hint must be updated to reflect the relationship
+    // change.
+    unsigned OtherReg = Hint.second;
+    Hint = MRI->getRegAllocationHint(OtherReg);
+    // Make sure the pair has not already divorced.
+    if (Hint.second == Reg) {
+      MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+      if (TargetRegisterInfo::isVirtualRegister(NewReg))
+        MRI->setRegAllocationHint(NewReg,
+            Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
+            : ARMRI::RegPairOdd, OtherReg);
+    }
+  }
+}
+
+bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  // When outgoing call frames are so large that we adjust the stack pointer
+  // around the call, we can no longer use the stack pointer to reach the
+  // emergency spill slot.
+  if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
+    return true;
+
+  // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
+  // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  // It's going to be better to use the SP or Base Pointer instead. When there
+  // are variable sized objects, we can't reference off of the SP, so we
+  // reserve a Base Pointer.
+  if (AFI->isThumbFunction() && MFI.hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, the scavenger will still enable access to work, it just
+    // won't be optimal.
+    if (AFI->isThumb2Function() && MFI.getLocalFrameSize() < 128)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  // We can't realign the stack if:
+  // 1. Dynamic stack realignment is explicitly disabled,
+  // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
+  // 3. There are VLAs in the function and the base pointer is disabled.
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+  if (AFI->isThumb1OnlyFunction())
+    return false;
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>())))
+    return false;
+  // We may also need a base pointer if there are dynamic allocas or stack
+  // pointer adjustments around calls.
+  if (TFI->hasReservedCallFrame(MF))
+    return true;
+  // A base pointer is required and allowed.  Check that it isn't too late to
+  // reserve it.
+  return MRI->canReserveReg(BasePtr);
+}
+
+bool ARMBaseRegisterInfo::
+cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI.adjustsStack())
+    return true;
+  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken()
+    || needsStackRealignment(MF);
+}
+
+unsigned
+ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+  if (TFI->hasFP(MF))
+    return getFramePointerReg(STI);
+  return ARM::SP;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ARMBaseRegisterInfo::emitLoadConstPool(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+    ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C =
+        ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx)
+    .addImm(0).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresVirtualBaseRegisters(const MachineFunction &MF) const {
+  return true;
+}
+
+int64_t ARMBaseRegisterInfo::
+getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  int64_t InstrOffs = 0;
+  int Scale = 1;
+  unsigned ImmIdx = 0;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+  case ARMII::AddrMode_i12:
+    InstrOffs = MI->getOperand(Idx+1).getImm();
+    Scale = 1;
+    break;
+  case ARMII::AddrMode5: {
+    // VFP address mode.
+    const MachineOperand &OffOp = MI->getOperand(Idx+1);
+    InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+    if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    Scale = 4;
+    break;
+  }
+  case ARMII::AddrMode2: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrMode3: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrModeT1_s: {
+    ImmIdx = Idx+1;
+    InstrOffs = MI->getOperand(ImmIdx).getImm();
+    Scale = 4;
+    break;
+  }
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  return InstrOffs * Scale;
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool ARMBaseRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) {
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12:
+  case ARM::STRi12: case ARM::STRH: case ARM::STRBi12:
+  case ARM::t2LDRi12: case ARM::t2LDRi8:
+  case ARM::t2STRi12: case ARM::t2STRi8:
+  case ARM::VLDRS: case ARM::VLDRD:
+  case ARM::VSTRS: case ARM::VSTRD:
+  case ARM::tSTRspi: case ARM::tLDRspi:
+    break;
+  default:
+    return false;
+  }
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed. R4-R6
+  // will be earlier than the FP, so we ignore those.
+  // R7, LR
+  int64_t FPOffset = Offset - 8;
+  // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15
+  if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction())
+    FPOffset -= 80;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI.getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there's a frame pointer and the addressing mode allows it, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) && 
+      !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+    if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset))
+      return false;
+  }
+  // If we can reference via the stack pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (!MFI.hasVarSizedObjects() && isFrameOffsetLegal(MI, ARM::SP, Offset))
+    return false;
+
+  // The offset likely isn't legal, we want to allocate a virtual base register.
+  return true;
+}
+
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void ARMBaseRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
+    (AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
+
+  if (!AFI->isThumb1OnlyFunction())
+    AddDefaultCC(AddDefaultPred(MIB));
+}
+
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This resolveFrameIndex does not support Thumb1!");
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+  }
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                                             int64_t Offset) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  unsigned i = 0;
+
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // AddrMode4 and AddrMode6 cannot handle any offset.
+  if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+    return Offset == 0;
+
+  unsigned NumBits = 0;
+  unsigned Scale = 1;
+  bool isSigned = true;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+    // i8 supports only negative, and i12 supports only positive, so
+    // based on Offset sign, consider the appropriate instruction
+    Scale = 1;
+    if (Offset < 0) {
+      NumBits = 8;
+      Offset = -Offset;
+    } else {
+      NumBits = 12;
+    }
+    break;
+  case ARMII::AddrMode5:
+    // VFP address mode.
+    NumBits = 8;
+    Scale = 4;
+    break;
+  case ARMII::AddrMode_i12:
+  case ARMII::AddrMode2:
+    NumBits = 12;
+    break;
+  case ARMII::AddrMode3:
+    NumBits = 8;
+    break;
+  case ARMII::AddrModeT1_s:
+    NumBits = (BaseReg == ARM::SP ? 8 : 5);
+    Scale = 4;
+    isSigned = false;
+    break;
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  Offset += getFrameIndexInstrOffset(MI, i);
+  // Make sure the offset is encodable for instructions that scale the
+  // immediate.
+  if ((Offset & (Scale-1)) != 0)
+    return false;
+
+  if (isSigned && Offset < 0)
+    Offset = -Offset;
+
+  unsigned Mask = (1 << NumBits) - 1;
+  if ((unsigned)Offset <= Mask * Scale)
+    return true;
+
+  return false;
+}
+
+void
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const ARMFrameLowering *TFI = getFrameLowering(MF);
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+
+  int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(TFI->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo().hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code");
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+         "This code isn't needed if offset already handled!");
+
+  unsigned ScratchReg = 0;
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4/6.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    // Update the original instruction to use the scratch register.
+    MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
+  }
+}
+
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If the source register class is more expensive than the destination, the
+  // coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow coalescing
+  // unfortunately we don't know yet if we will be constrained.
+  // The goal of this heuristic is to restrict how many expensive registers
+  // we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+    << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+    << NewRCWeight.RegWeight << "\n");
+
+  // This number is the largest round number that which meets the criteria:
+  //  (1) addresses PR18825
+  //  (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //  (3) Doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {
+    It->second += NewRCWeight.RegWeight;
+    return true;
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 000000000000..330e1535e863
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,201 @@
+//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "ARMGenRegisterInfo.inc"
+
+namespace llvm {
+/// Register allocation hints.
+namespace ARMRI {
+  enum {
+    RegPairOdd  = 1,
+    RegPairEven = 2
+  };
+}
+
+/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
+/// or a stack/pc register that we should push/pop.
+static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R0:  case R1:  case R2:  case R3:
+    case R4:  case R5:  case R6:  case R7:
+    case LR:  case SP:  case PC:
+      return true;
+    case R8:  case R9:  case R10: case R11: case R12:
+      // For iOS we want r7 and lr to be next to each other.
+      return !isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R8: case R9: case R10: case R11: case R12:
+      // iOS has this second area.
+      return isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case D15: case D14: case D13: case D12:
+    case D11: case D10: case D9:  case D8:
+    case D7:  case D6:  case D5:  case D4:
+    case D3:  case D2:  case D1:  case D0:
+    case D31: case D30: case D29: case D28:
+    case D27: case D26: case D25: case D24:
+    case D23: case D22: case D21: case D20:
+    case D19: case D18: case D17: case D16:
+      return true;
+    default:
+      return false;
+  }
+}
+
+static inline bool isCalleeSavedRegister(unsigned Reg,
+                                         const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+  /// BasePtr - ARM physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+  // Can be only subclassed.
+  explicit ARMBaseRegisterInfo();
+
+  // Return the opcode that implements 'Op', or 0 if no opcode
+  unsigned getOpcode(int Op) const;
+
+public:
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const MCPhysReg *
+  getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+  const uint32_t *getNoPreservedMask() const override;
+  const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const;
+  const uint32_t *getSjLjDispatchPreservedMask(const MachineFunction &MF) const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i32 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i32 argument and an i32 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF,
+                                             CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  void getRegAllocationHints(unsigned VirtReg,
+                             ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF,
+                             const VirtRegMap *VRM,
+                             const LiveRegMatrix *Matrix) const override;
+
+  void updateRegAllocHint(unsigned Reg, unsigned NewReg,
+                          MachineFunction &MF) const override;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const override;
+  int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
+                                   int Idx) const override;
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                          int64_t Offset) const override;
+
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getBaseRegister() const { return BasePtr; }
+
+  bool isLowRegister(unsigned Reg) const;
+
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  virtual void
+  emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+                    int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+                    unsigned PredReg = 0,
+                    unsigned MIFlags = MachineInstr::NoFlags) const;
+
+  /// Code Generation virtual methods...
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
new file mode 100644
index 000000000000..780544f865df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -0,0 +1,110 @@
+//===-- ARMBasicBlockInfo.h - Basic Block Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions and data structure for computing block size.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+using namespace llvm;
+
+namespace llvm {
+
+/// UnknownPadding - Return the worst case padding that could result from
+/// unknown offset bits.  This does not include alignment padding caused by
+/// known offset bits.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+  if (KnownBits < LogAlign)
+    return (1u << LogAlign) - (1u << KnownBits);
+  return 0;
+}
+
+/// BasicBlockInfo - Information about the offset and size of a single
+/// basic block.
+struct BasicBlockInfo {
+  /// Offset - Distance from the beginning of the function to the beginning
+  /// of this basic block.
+  ///
+  /// Offsets are computed assuming worst case padding before an aligned
+  /// block. This means that subtracting basic block offsets always gives a
+  /// conservative estimate of the real distance which may be smaller.
+  ///
+  /// Because worst case padding is used, the computed offset of an aligned
+  /// block may not actually be aligned.
+  unsigned Offset;
+
+  /// Size - Size of the basic block in bytes.  If the block contains
+  /// inline assembly, this is a worst case estimate.
+  ///
+  /// The size does not include any alignment padding whether from the
+  /// beginning of the block, or from an aligned jump table at the end.
+  unsigned Size;
+
+  /// KnownBits - The number of low bits in Offset that are known to be
+  /// exact.  The remaining bits of Offset are an upper bound.
+  uint8_t KnownBits;
+
+  /// Unalign - When non-zero, the block contains instructions (inline asm)
+  /// of unknown size.  The real size may be smaller than Size bytes by a
+  /// multiple of 1 << Unalign.
+  uint8_t Unalign;
+
+  /// PostAlign - When non-zero, the block terminator contains a .align
+  /// directive, so the end of the block is aligned to 1 << PostAlign
+  /// bytes.
+  uint8_t PostAlign;
+
+  BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
+    PostAlign(0) {}
+
+  /// Compute the number of known offset bits internally to this block.
+  /// This number should be used to predict worst case padding when
+  /// splitting the block.
+  unsigned internalKnownBits() const {
+    unsigned Bits = Unalign ? Unalign : KnownBits;
+    // If the block size isn't a multiple of the known bits, assume the
+    // worst case padding.
+    if (Size & ((1u << Bits) - 1))
+      Bits = countTrailingZeros(Size);
+    return Bits;
+  }
+
+  /// Compute the offset immediately following this block.  If LogAlign is
+  /// specified, return the offset the successor block will get if it has
+  /// this alignment.
+  unsigned postOffset(unsigned LogAlign = 0) const {
+    unsigned PO = Offset + Size;
+    unsigned LA = std::max(unsigned(PostAlign), LogAlign);
+    if (!LA)
+      return PO;
+    // Add alignment padding from the terminator.
+    return PO + UnknownPadding(LA, internalKnownBits());
+  }
+
+  /// Compute the number of known low bits of postOffset.  If this block
+  /// contains inline asm, the number of known bits drops to the
+  /// instruction alignment.  An aligned terminator may increase the number
+  /// of know bits.
+  /// If LogAlign is given, also consider the alignment of the next block.
+  unsigned postKnownBits(unsigned LogAlign = 0) const {
+    return std::max(std::max(unsigned(PostAlign), LogAlign),
+                    internalKnownBits());
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
new file mode 100644
index 000000000000..52c95b6244ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -0,0 +1,203 @@
+//===-- llvm/lib/Target/ARM/ARMCallLowering.cpp - Call lowering -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARMCallLowering.h"
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMISelLowering.h"
+
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
+    : CallLowering(&TLI) {}
+
+static bool isSupportedType(const DataLayout DL, const ARMTargetLowering &TLI,
+                            Type *T) {
+  EVT VT = TLI.getValueType(DL, T);
+  if (!VT.isSimple() || !VT.isInteger() || VT.isVector())
+    return false;
+
+  unsigned VTSize = VT.getSimpleVT().getSizeInBits();
+  return VTSize == 8 || VTSize == 16 || VTSize == 32;
+}
+
+namespace {
+struct FuncReturnHandler : public CallLowering::ValueHandler {
+  FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                    MachineInstrBuilder &MIB)
+      : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    llvm_unreachable("Don't know how to get a stack address yet");
+  }
+
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
+    assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
+
+    assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
+    assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+
+    assert(VA.getLocInfo() != CCValAssign::SExt &&
+           VA.getLocInfo() != CCValAssign::ZExt &&
+           "ABI extensions not supported yet");
+
+    MIRBuilder.buildCopy(PhysReg, ValVReg);
+    MIB.addUse(PhysReg, RegState::Implicit);
+  }
+
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    llvm_unreachable("Don't know how to assign a value to an address yet");
+  }
+
+  MachineInstrBuilder &MIB;
+};
+} // End anonymous namespace.
+
+/// Lower the return value for the already existing \p Ret. This assumes that
+/// \p MIRBuilder's insertion point is correct.
+bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
+                                     const Value *Val, unsigned VReg,
+                                     MachineInstrBuilder &Ret) const {
+  if (!Val)
+    // Nothing to do here.
+    return true;
+
+  auto &MF = MIRBuilder.getMF();
+  const auto &F = *MF.getFunction();
+
+  auto DL = MF.getDataLayout();
+  auto &TLI = *getTLI<ARMTargetLowering>();
+  if (!isSupportedType(DL, TLI, Val->getType()))
+    return false;
+
+  CCAssignFn *AssignFn =
+      TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
+
+  ArgInfo RetInfo(VReg, Val->getType());
+  setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F);
+
+  FuncReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
+  return handleAssignments(MIRBuilder, AssignFn, RetInfo, RetHandler);
+}
+
+bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                  const Value *Val, unsigned VReg) const {
+  assert(!Val == !VReg && "Return value without a vreg");
+
+  auto Ret = AddDefaultPred(MIRBuilder.buildInstrNoInsert(ARM::BX_RET));
+
+  if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret))
+    return false;
+
+  MIRBuilder.insertInstr(Ret);
+  return true;
+}
+
+namespace {
+struct FormalArgHandler : public CallLowering::ValueHandler {
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
+      : ValueHandler(MIRBuilder, MRI) {}
+
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    assert(Size == 4 && "Unsupported size");
+
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+
+    int FI = MFI.CreateFixedObject(Size, Offset, true);
+    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+    unsigned AddrReg =
+        MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
+    MIRBuilder.buildFrameIndex(AddrReg, FI);
+
+    return AddrReg;
+  }
+
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    assert(Size == 4 && "Unsupported size");
+
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOLoad, Size, /* Alignment */ 0);
+    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+  }
+
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
+    assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
+
+    assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
+    assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+
+    MIRBuilder.getMBB().addLiveIn(PhysReg);
+    MIRBuilder.buildCopy(ValVReg, PhysReg);
+  }
+};
+} // End anonymous namespace
+
+bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                           const Function &F,
+                                           ArrayRef<unsigned> VRegs) const {
+  // Quick exit if there aren't any args
+  if (F.arg_empty())
+    return true;
+
+  if (F.isVarArg())
+    return false;
+
+  auto DL = MIRBuilder.getMF().getDataLayout();
+  auto &TLI = *getTLI<ARMTargetLowering>();
+
+  auto &Args = F.getArgumentList();
+  unsigned ArgIdx = 0;
+  for (auto &Arg : Args) {
+    ArgIdx++;
+    if (!isSupportedType(DL, TLI, Arg.getType()))
+      return false;
+
+    // FIXME: This check as well as ArgIdx are going away as soon as we support
+    // loading values < 32 bits.
+    if (ArgIdx > 4 && Arg.getType()->getIntegerBitWidth() != 32)
+      return false;
+  }
+
+  CCAssignFn *AssignFn =
+      TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
+
+  SmallVector<ArgInfo, 8> ArgInfos;
+  unsigned Idx = 0;
+  for (auto &Arg : Args) {
+    ArgInfo AInfo(VRegs[Idx], Arg.getType());
+    setArgFlags(AInfo, Idx + 1, DL, F);
+    ArgInfos.push_back(AInfo);
+    Idx++;
+  }
+
+  FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo());
+  return handleAssignments(MIRBuilder, AssignFn, ArgInfos, ArgHandler);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
new file mode 100644
index 000000000000..6a1b886b501f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -0,0 +1,42 @@
+//===-- llvm/lib/Target/ARM/ARMCallLowering.h - Call lowering -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCALLLOWERING
+#define LLVM_LIB_TARGET_ARM_ARMCALLLOWERING
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class ARMTargetLowering;
+class MachineInstrBuilder;
+
+class ARMCallLowering : public CallLowering {
+public:
+  ARMCallLowering(const ARMTargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+                   unsigned VReg) const override;
+
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+
+private:
+  bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
+                      unsigned VReg, MachineInstrBuilder &Ret) const;
+};
+} // End of namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
new file mode 100644
index 000000000000..71b819362404
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -0,0 +1,288 @@
+//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
+#define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+// APCS f64 is in register pairs, possibly split to stack
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // For the 2nd half of a v2f64, do not fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get the second register.
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+  static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+  static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList);
+  if (Reg == 0) {
+
+    // If we had R3 unallocated only, now we still must to waste it.
+    Reg = State.AllocateReg(GPRArgRegs);
+    assert((!Reg || Reg == ARM::R3) && "Wrong GPRs usage for f64");
+
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  unsigned T = State.AllocateReg(LoRegList[i]);
+  (void)T;
+  assert(T == LoRegList[i] && "Could not allocate register");
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList);
+  if (Reg == 0)
+    return false; // we didn't handle it
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  return true;  // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+static const MCPhysReg RRegList[] = { ARM::R0,  ARM::R1,  ARM::R2,  ARM::R3 };
+
+static const MCPhysReg SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+                                      ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+                                      ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
+                                      ARM::S12, ARM::S13, ARM::S14,  ARM::S15 };
+static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                      ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+                                          MVT &LocVT,
+                                          CCValAssign::LocInfo &LocInfo,
+                                          ISD::ArgFlagsTy &ArgFlags,
+                                          CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // AAPCS HFAs must have 1-4 elements, all of the same type
+  if (PendingMembers.size() > 0)
+    assert(PendingMembers[0].getLocVT() == LocVT);
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // aggregate. Store the type's required alignmnent as extra info for later: in
+  // the [N x i64] case all trace has been removed by the time we actually get
+  // to do allocation.
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+                                                   ArgFlags.getOrigAlign()));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  auto &DL = State.getMachineFunction().getDataLayout();
+  unsigned StackAlign = DL.getStackAlignment();
+  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
+
+  ArrayRef<MCPhysReg> RegList;
+  switch (LocVT.SimpleTy) {
+  case MVT::i32: {
+    RegList = RRegList;
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
+
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on stack or in regs, no-one will be using them in future.
+    unsigned RegAlign = alignTo(Align, 4) / 4;
+    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+      State.AllocateReg(RegList[RegIdx++]);
+
+    break;
+  }
+  case MVT::f32:
+    RegList = SRegList;
+    break;
+  case MVT::f64:
+    RegList = DRegList;
+    break;
+  case MVT::v2f64:
+    RegList = QRegList;
+    break;
+  default:
+    llvm_unreachable("Unexpected member type for block aggregate");
+    break;
+  }
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+         It != PendingMembers.end(); ++It) {
+      It->convertToReg(RegResult);
+      State.addLoc(*It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
+
+  // Register allocation failed, we'll be needing the stack
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+    // If nothing else has used the stack until this point, a non-HFA aggregate
+    // can be split between regs and stack.
+    unsigned RegIdx = State.getFirstUnallocated(RegList);
+    for (auto &It : PendingMembers) {
+      if (RegIdx >= RegList.size())
+        It.convertToMem(State.AllocateStack(Size, Size));
+      else
+        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
+
+      State.addLoc(It);
+    }
+    PendingMembers.clear();
+    return true;
+  } else if (LocVT != MVT::i32)
+    RegList = SRegList;
+
+  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, Align));
+    State.addLoc(It);
+
+    // After the first item has been allocated, the rest are packed as tightly
+    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+    // we'll be allocating a bunch of i32 slots).
+    Align = Size;
+  }
+
+  // All pending members have now been allocated
+  PendingMembers.clear();
+
+  // This will be allocated by the last member of the aggregate
+  return true;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 000000000000..9c278a52a7ff
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,310 @@
+//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A>:
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+    
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+def FastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+
+  // CPRCs may be allocated to co-processor registers or the stack - they
+  // may never be allocated to core registers. 
+  CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToStackWithShadow<8, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[v2f64], CCAssignToStackWithShadow<16, 4, [Q0, Q1, Q2, Q3]>>,
+
+  CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetFastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_APCS>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_APCS_GHC : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
+  CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim
+  CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention, common parts
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_Common : CallingConv<[
+
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // i64/f64 is passed in even pairs of GPRs
+  // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+  // (and the same is true for f64 if VFP is not enabled)
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+  CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
+                       CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
+  CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
+  CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
+  CCIfType<[v2f64], CCIfAlign<"16",
+           CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
+  CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>>
+]>;
+
+def RetCC_ARM_AAPCS_Common : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // The 'nest' parameter, if any, is passed in R12.
+  CCIfNest<CCAssignToReg<[R12]>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS-VFP (EABI) Calling Convention
+// Also used for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_VFP : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is passed in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  // HFAs are passed in a contiguous block of registers, or on the stack
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
+
+  // A SwiftError is returned in R6.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+def CSR_FPRegs : CalleeSavedRegs<(add (sequence "D%u", 0, 31))>;
+
+def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
+                                     (sequence "D%u", 15, 8))>;
+
+// The order of callee-saved registers needs to match the order we actually push
+// them in FrameLowering, because this order is what's used by
+// PrologEpilogInserter to allocate frame index slots. So when R7 is the frame
+// pointer, we use this AAPCS alternative.
+def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+                                               R11, R10, R9, R8,
+                                               (sequence "D%u", 15, 8))>;
+
+// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
+// and the pointer return value are both passed in R0 in these cases, this can
+// be partially modelled by treating R0 as a callee-saved register
+// Only the resulting RegMask is used; the SaveList is ignored
+def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
+                                            R5, R4, (sequence "D%u", 15, 8),
+                                            R0)>;
+
+// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register.
+// Also save R7-R4 first to match the stack frame fixed spill areas.
+def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+
+// R6 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+
+def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
+                                         (sub CSR_AAPCS_ThisReturn, R9))>;
+
+def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP,
+                                           (sequence "R%u", 12, 1),
+                                           (sequence "D%u", 31, 0))>;
+
+// C++ TLS access function saves all registers except SP. Try to match
+// the order of CSRs in CSR_iOS.
+def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1),
+                                           (sequence "D%u", 31, 0))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR, R12, R11, R7, R5, R4)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS,
+                                                   CSR_iOS_CXX_TLS_PE)>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
new file mode 100644
index 000000000000..64f187d17e64
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
@@ -0,0 +1,72 @@
+//===--- ARMComputeBlockSize.cpp - Compute machine block sizes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBasicBlockInfo.h"
+using namespace llvm;
+
+namespace llvm {
+
+// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions
+// below may shrink MI.
+static bool
+mayOptimizeThumb2Instruction(const MachineInstr *MI) {
+  switch(MI->getOpcode()) {
+    // optimizeThumb2Instructions.
+    case ARM::t2LEApcrel:
+    case ARM::t2LDRpci:
+    // optimizeThumb2Branches.
+    case ARM::t2B:
+    case ARM::t2Bcc:
+    case ARM::tBcc:
+    // optimizeThumb2JumpTables.
+    case ARM::t2BR_JT:
+      return true;
+  }
+  return false;
+}
+
+void computeBlockSize(MachineFunction *MF, MachineBasicBlock *MBB,
+                      BasicBlockInfo &BBI) {
+  const ARMBaseInstrInfo *TII =
+    static_cast<const ARMBaseInstrInfo *>(MF->getSubtarget().getInstrInfo());
+  bool isThumb = MF->getInfo<ARMFunctionInfo>()->isThumbFunction();
+  BBI.Size = 0;
+  BBI.Unalign = 0;
+  BBI.PostAlign = 0;
+
+  for (MachineInstr &I : *MBB) {
+    BBI.Size += TII->getInstSizeInBytes(I);
+    // For inline asm, getInstSizeInBytes returns a conservative estimate.
+    // The actual size may be smaller, but still a multiple of the instr size.
+    if (I.isInlineAsm())
+      BBI.Unalign = isThumb ? 1 : 2;
+    // Also consider instructions that may be shrunk later.
+    else if (isThumb && mayOptimizeThumb2Instruction(&I))
+      BBI.Unalign = 1;
+  }
+
+  // tBR_JTr contains a .align 2 directive.
+  if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
+    BBI.PostAlign = 2;
+    MBB->getParent()->ensureAlignment(2);
+  }
+}
+
+std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF) {
+  std::vector<BasicBlockInfo> BBInfo;
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  for (MachineBasicBlock &MBB : *MF)
+    computeBlockSize(MF, &MBB, BBInfo[MBB.getNumber()]);
+
+  return BBInfo;
+}
+
+} // end namespace
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 000000000000..be1a37e3e362
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,2258 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBasicBlockInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-cp-islands"
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+STATISTIC(NumTBs,        "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ,        "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved,    "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+static cl::opt<bool>
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+          cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+static cl::opt<unsigned>
+CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
+          cl::desc("The max number of iteration for converge"));
+
+static cl::opt<bool> SynthesizeThumb1TBB(
+    "arm-synthesize-thumb-1-tbb", cl::Hidden, cl::init(true),
+    cl::desc("Use compressed jump tables in Thumb-1 by synthesizing an "
+             "equivalent to the TBB/TBH instructions"));
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class ARMConstantIslands : public MachineFunctionPass {
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+      unsigned MaxDisp;
+      bool NegOk;
+      bool IsSoImm;
+      bool KnownAlignment;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg, bool soimm)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
+          KnownAlignment(false) {
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      /// Correct for unknown alignment.
+      /// Conservatively subtract 2 bytes to handle weird alignment effects.
+      unsigned getMaxDisp() const {
+        return (KnownAlignment ? MaxDisp : MaxDisp - 2) - 2;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that existed
+    /// upon entry to this pass), it keeps a vector of entries.  Original
+    /// elements are cloned as we go along; the clones are put in the vector of
+    /// the original element, but have distinct CPIs.
+    ///
+    /// The first half of CPEntries contains generic constants, the second half
+    /// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up
+    /// which vector it will be in here.
+    std::vector<std::vector<CPEntry> > CPEntries;
+
+    /// Maps a JT index to the offset in CPEntries containing copies of that
+    /// table. The equivalent map for a CONSTPOOL_ENTRY is the identity.
+    DenseMap<int, int> JumpTableEntryIndices;
+
+    /// Maps a JT index to the LEA that actually uses the index to calculate its
+    /// base address.
+    DenseMap<int, int> JumpTableUserIndices;
+
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      unsigned UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+    ///
+    SmallVector<MachineInstr*, 4> PushPopMIs;
+
+    /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+    SmallVector<MachineInstr*, 4> T2JumpTables;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    MachineFunction *MF;
+    MachineConstantPool *MCP;
+    const ARMBaseInstrInfo *TII;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+    bool isThumb;
+    bool isThumb1;
+    bool isThumb2;
+    bool isPositionIndependentOrROPI;
+  public:
+    static char ID;
+    ARMConstantIslands() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "ARM constant island placement and branch shortening pass";
+    }
+
+  private:
+    void doInitialConstPlacement(std::vector<MachineInstr *> &CPEMIs);
+    void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs);
+    bool BBHasFallthrough(MachineBasicBlock *MBB);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void scanFunctionJumpTables();
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    unsigned getCombinedIndex(const MachineInstr *CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter, bool CloserWater);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex, bool CloserWater);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+    bool undoLRSpillRestore();
+    bool optimizeThumb2Instructions();
+    bool optimizeThumb2Branches();
+    bool reorderThumb2JumpTables();
+    bool preserveBaseRegister(MachineInstr *JumpMI, MachineInstr *LEAMI,
+                              unsigned &DeadSize, bool &CanDeleteLEA,
+                              bool &BaseRegKill);
+    bool optimizeThumb2JumpTables();
+    MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
+                                                  MachineBasicBlock *JTBB);
+
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+    void verify();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK, bool IsSoImm = false);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U) {
+      return isOffsetInRange(UserOffset, TrialOffset,
+                             U.getMaxDisp(), U.NegOk, U.IsSoImm);
+    }
+  };
+  char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify() {
+#ifndef NDEBUG
+  assert(std::is_sorted(MF->begin(), MF->end(),
+                        [this](const MachineBasicBlock &LHS,
+                               const MachineBasicBlock &RHS) {
+                          return BBInfo[LHS.getNumber()].postOffset() <
+                                 BBInfo[RHS.getNumber()].postOffset();
+                        }));
+  DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned UserOffset = getUserOffset(U);
+    // Verify offset using the real max displacement without the safety
+    // adjustment.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk,
+                         /* DoDump = */ true)) {
+      DEBUG(dbgs() << "OK\n");
+      continue;
+    }
+    DEBUG(dbgs() << "Out of range.\n");
+    dumpBBs();
+    DEBUG(MF->dump());
+    llvm_unreachable("Constant pool entry out of range!");
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << " kb=" << unsigned(BBI.KnownBits)
+             << " ua=" << unsigned(BBI.Unalign)
+             << " pa=" << unsigned(BBI.PostAlign)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  MCP = mf.getConstantPool();
+
+  DEBUG(dbgs() << "***** ARMConstantIslands: "
+               << MCP->getConstants().size() << " CP entries, aligned to "
+               << MCP->getConstantPoolAlignment() << " bytes *****\n");
+
+  STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
+  TII = STI->getInstrInfo();
+  isPositionIndependentOrROPI =
+      STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
+  AFI = MF->getInfo<ARMFunctionInfo>();
+
+  isThumb = AFI->isThumbFunction();
+  isThumb1 = AFI->isThumb1OnlyFunction();
+  isThumb2 = AFI->isThumb2Function();
+
+  HasFarJump = false;
+  bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
+
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Try to reorder and otherwise adjust the block layout to make good use
+  // of the TB[BH] instructions.
+  bool MadeChange = false;
+  if (GenerateTBB && AdjustJumpTableBlocks) {
+    scanFunctionJumpTables();
+    MadeChange |= reorderThumb2JumpTables();
+    // Data is out of date, so clear it. It'll be re-computed later.
+    T2JumpTables.clear();
+    // Blocks may have shifted around. Keep the numbering up to date.
+    MF->RenumberBlocks();
+  }
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialConstPlacement(CPEMIs);
+
+  if (MF->getJumpTableInfo())
+    doInitialJumpTablePlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  AFI->initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  // Functions with jump tables need an alignment of 4 because they use the ADR
+  // instruction, which aligns the PC to 4 bytes before adding an offset.
+  if (!T2JumpTables.empty())
+    MF->ensureAlignment(2);
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      // For most inputs, it converges in no more than 5 iterations.
+      // If it doesn't end in 10, the input may have huge BB or many CPEs.
+      // In this case, we will try different heuristics.
+      CPChange |= handleConstantPoolUser(i, NoCPIters >= CPMaxIteration / 2);
+    if (CPChange && ++NoCPIters > CPMaxIteration)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  // Shrink 32-bit Thumb2 load and store instructions.
+  if (isThumb2 && !STI->prefers32BitThumb())
+    MadeChange |= optimizeThumb2Instructions();
+
+  // Shrink 32-bit branch instructions.
+  if (isThumb && STI->hasV8MBaselineOps())
+    MadeChange |= optimizeThumb2Branches();
+
+  // Optimize jump tables using TBB / TBH.
+  if (GenerateTBB && !STI->genExecuteOnly())
+    MadeChange |= optimizeThumb2JumpTables();
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
+  // undo the spill / restore of LR if possible.
+  if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
+    MadeChange |= undoLRSpillRestore();
+
+  // Save the mapping between original and cloned constpool entries.
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
+      const CPEntry & CPE = CPEntries[i][j];
+      if (CPE.CPEMI && CPE.CPEMI->getOperand(1).isCPI())
+        AFI->recordCPEClone(i, CPE.CPI);
+    }
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  JumpTableEntryIndices.clear();
+  JumpTableUserIndices.clear();
+  ImmBranches.clear();
+  PushPopMIs.clear();
+  T2JumpTables.clear();
+
+  return MadeChange;
+}
+
+/// \brief Perform the initial placement of the regular constant pool entries.
+/// To start with, we put them all at the end of the function.
+void
+ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  BB->setAlignment(MaxAlign);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = MF->getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH
+/// instructions can be made more efficient if the jump table immediately
+/// follows the instruction, it's best to place them immediately next to their
+/// jumps to begin with. In almost all cases they'll never be moved from that
+/// position.
+void ARMConstantIslands::doInitialJumpTablePlacement(
+    std::vector<MachineInstr *> &CPEMIs) {
+  unsigned i = CPEntries.size();
+  auto MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+
+  MachineBasicBlock *LastCorrectlyNumberedBB = nullptr;
+  for (MachineBasicBlock &MBB : *MF) {
+    auto MI = MBB.getLastNonDebugInstr();
+    if (MI == MBB.end())
+      continue;
+
+    unsigned JTOpcode;
+    switch (MI->getOpcode()) {
+    default:
+      continue;
+    case ARM::BR_JTadd:
+    case ARM::BR_JTr:
+    case ARM::tBR_JTr:
+    case ARM::BR_JTm:
+      JTOpcode = ARM::JUMPTABLE_ADDRS;
+      break;
+    case ARM::t2BR_JT:
+      JTOpcode = ARM::JUMPTABLE_INSTS;
+      break;
+    case ARM::tTBB_JT:
+    case ARM::t2TBB_JT:
+      JTOpcode = ARM::JUMPTABLE_TBB;
+      break;
+    case ARM::tTBH_JT:
+    case ARM::t2TBH_JT:
+      JTOpcode = ARM::JUMPTABLE_TBH;
+      break;
+    }
+
+    unsigned NumOps = MI->getDesc().getNumOperands();
+    MachineOperand JTOp =
+      MI->getOperand(NumOps - (MI->isPredicable() ? 2 : 1));
+    unsigned JTI = JTOp.getIndex();
+    unsigned Size = JT[JTI].MBBs.size() * sizeof(uint32_t);
+    MachineBasicBlock *JumpTableBB = MF->CreateMachineBasicBlock();
+    MF->insert(std::next(MachineFunction::iterator(MBB)), JumpTableBB);
+    MachineInstr *CPEMI = BuildMI(*JumpTableBB, JumpTableBB->begin(),
+                                  DebugLoc(), TII->get(JTOpcode))
+                              .addImm(i++)
+                              .addJumpTableIndex(JTI)
+                              .addImm(Size);
+    CPEMIs.push_back(CPEMI);
+    CPEntries.emplace_back(1, CPEntry(CPEMI, JTI));
+    JumpTableEntryIndices.insert(std::make_pair(JTI, CPEntries.size() - 1));
+    if (!LastCorrectlyNumberedBB)
+      LastCorrectlyNumberedBB = &MBB;
+  }
+
+  // If we did anything then we need to renumber the subsequent blocks.
+  if (LastCorrectlyNumberedBB)
+    MF->RenumberBlocks(LastCorrectlyNumberedBB);
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB->getIterator();
+  // Can't fall off end of function.
+  if (std::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = &*std::next(MBBI);
+  if (!MBB->isSuccessor(NextBB))
+    return false;
+
+  // Try to analyze the end of the block. A potential fallthrough may already
+  // have an unconditional branch for whatever reason.
+  MachineBasicBlock *TBB, *FBB;
+  SmallVector<MachineOperand, 4> Cond;
+  bool TooDifficult = TII->analyzeBranch(*MBB, TBB, FBB, Cond);
+  return TooDifficult || FBB == nullptr;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return nullptr;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  switch (CPEMI->getOpcode()) {
+  case ARM::CONSTPOOL_ENTRY:
+    break;
+  case ARM::JUMPTABLE_TBB:
+    return isThumb1 ? 2 : 0;
+  case ARM::JUMPTABLE_TBH:
+    return isThumb1 ? 2 : 1;
+  case ARM::JUMPTABLE_INSTS:
+    return 1;
+  case ARM::JUMPTABLE_ADDRS:
+    return 2;
+  default:
+    llvm_unreachable("unknown constpool entry kind");
+  }
+
+  unsigned CPI = getCombinedIndex(CPEMI);
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// scanFunctionJumpTables - Do a scan of the function, building up
+/// information about the sizes of each block and the locations of all
+/// the jump tables.
+void ARMConstantIslands::scanFunctionJumpTables() {
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &I : MBB)
+      if (I.isBranch() &&
+          (I.getOpcode() == ARM::t2BR_JT || I.getOpcode() == ARM::tBR_JTr))
+        T2JumpTables.push_back(&I);
+  }
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+
+  BBInfo = computeAllBlockSizes(MF);
+
+  // The known bits of the entry block offset are determined by the function
+  // alignment.
+  BBInfo.front().KnownBits = MF->getAlignment();
+
+  // Compute block offsets and known bits.
+  adjustBBOffsetsAfter(&MF->front());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineBasicBlock &MBB : *MF) {
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+
+    for (MachineInstr &I : MBB) {
+      if (I.isDebugValue())
+        continue;
+
+      unsigned Opc = I.getOpcode();
+      if (I.isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other JT branches
+        case ARM::t2BR_JT:
+        case ARM::tBR_JTr:
+          T2JumpTables.push_back(&I);
+          continue;   // Does not get an entry in ImmBranches
+        case ARM::Bcc:
+          isCond = true;
+          UOpc = ARM::B;
+          LLVM_FALLTHROUGH;
+        case ARM::B:
+          Bits = 24;
+          Scale = 4;
+          break;
+        case ARM::tBcc:
+          isCond = true;
+          UOpc = ARM::tB;
+          Bits = 8;
+          Scale = 2;
+          break;
+        case ARM::tB:
+          Bits = 11;
+          Scale = 2;
+          break;
+        case ARM::t2Bcc:
+          isCond = true;
+          UOpc = ARM::t2B;
+          Bits = 20;
+          Scale = 2;
+          break;
+        case ARM::t2B:
+          Bits = 24;
+          Scale = 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(&I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+        PushPopMIs.push_back(&I);
+
+      if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS ||
+          Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB ||
+          Opc == ARM::JUMPTABLE_TBH)
+        continue;
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I.getNumOperands(); op != e; ++op)
+        if (I.getOperand(op).isCPI() || I.getOperand(op).isJTI()) {
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          bool IsSoImm = false;
+
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+
+          // Taking the address of a CP entry.
+          case ARM::LEApcrel:
+          case ARM::LEApcrelJT:
+            // This takes a SoImm, which is 8 bit immediate rotated. We'll
+            // pretend the maximum offset is 255 * 4. Since each instruction
+            // 4 byte wide, this is always correct. We'll check for other
+            // displacements that fits in a SoImm as well.
+            Bits = 8;
+            Scale = 4;
+            NegOk = true;
+            IsSoImm = true;
+            break;
+          case ARM::t2LEApcrel:
+          case ARM::t2LEApcrelJT:
+            Bits = 12;
+            NegOk = true;
+            break;
+          case ARM::tLEApcrel:
+          case ARM::tLEApcrelJT:
+            Bits = 8;
+            Scale = 4;
+            break;
+
+          case ARM::LDRBi12:
+          case ARM::LDRi12:
+          case ARM::LDRcp:
+          case ARM::t2LDRpci:
+          case ARM::t2LDRHpci:
+            Bits = 12;  // +-offset_12
+            NegOk = true;
+            break;
+
+          case ARM::tLDRpci:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+
+          case ARM::VLDRD:
+          case ARM::VLDRS:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            NegOk = true;
+            break;
+
+          case ARM::tLDRHi:
+            Bits = 5;
+            Scale = 2; // +(offset_5*2)
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I.getOperand(op).getIndex();
+          if (I.getOperand(op).isJTI()) {
+            JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
+            CPI = JumpTableEntryIndices[CPI];
+          }
+
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          CPUsers.push_back(CPUser(&I, CPEMI, MaxOffs, NegOk, IsSoImm));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+  }
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->getInstSizeInBytes(*I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = ++OrigBB->getIterator();
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+  if (!isThumb)
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  else
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+            .addImm(ARMCC::AL).addReg(0);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(MF, OrigBB, BBInfo[OrigBB->getNumber()]);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(MF, NewBB, BBInfo[NewBB->getNumber()]);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+/// getUserOffset - Compute the offset of U.MI as seen by the hardware
+/// displacement computation.  Update U.KnownAlignment to match its current
+/// basic block location.
+unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
+  unsigned UserOffset = getOffsetOf(U.MI);
+  const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()];
+  unsigned KnownBits = BBI.internalKnownBits();
+
+  // The value read from PC is offset from the actual instruction address.
+  UserOffset += (isThumb ? 4 : 8);
+
+  // Because of inline assembly, we may not know the alignment (mod 4) of U.MI.
+  // Make sure U.getMaxDisp() returns a constrained range.
+  U.KnownAlignment = (KnownBits >= 2);
+
+  // On Thumb, offsets==2 mod 4 are rounded down by the hardware for
+  // purposes of the displacement computation; compensate for that here.
+  // For unknown alignments, getMaxDisp() constrains the range instead.
+  if (isThumb && U.KnownAlignment)
+    UserOffset &= ~3u;
+
+  return UserOffset;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+/// UserOffset is computed by getUserOffset above to include PC adjustments. If
+/// the mod 4 alignment of UserOffset is not known, the uncertainty must be
+/// subtracted from MaxDisp instead. CPUser::getMaxDisp() does that.
+bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK, bool IsSoImm) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water->getIterator();
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI, unsigned MaxDisp,
+                                      bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB
+      || PredMI->getOpcode() == ARM::t2B)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif // NDEBUG
+
+void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
+    unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
+    unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+
+    // This is where block i begins.  Stop if the offset is already correct,
+    // and we have updated 2 blocks.  This is the maximum number of blocks
+    // changed before calling this function.
+    if (i > BBNum + 2 &&
+        BBInfo[i].Offset == Offset &&
+        BBInfo[i].KnownBits == KnownBits)
+      break;
+
+    BBInfo[i].Offset = Offset;
+    BBInfo[i].KnownBits = KnownBits;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = nullptr;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+unsigned ARMConstantIslands::getCombinedIndex(const MachineInstr *CPEMI) {
+  if (CPEMI->getOperand(1).isCPI())
+    return CPEMI->getOperand(1).getIndex();
+
+  return JumpTableEntryIndices[CPEMI->getOperand(1).getIndex()];
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = getCombinedIndex(CPEMI);
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case ARM::tB:
+    return ((1<<10)-1)*2;
+  case ARM::t2B:
+    return ((1<<23)-1)*2;
+  default:
+    break;
+  }
+
+  return ((1<<23)-1)*4;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  For Thumb, prefer water that will not
+/// introduce padding to water that will.  To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                            water_iterator &WaterIter,
+                                            bool CloserWater) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  // The nearest water without splitting the UserBB is right after it.
+  // If the distance is still large (we have a big BB), then we need to split it
+  // if we don't converge after certain iterations. This helps the following
+  // situation to converge:
+  //   BB0:
+  //      Big BB
+  //   BB1:
+  //      Constant Pool
+  // When a CP access is out of range, BB0 may be used as water. However,
+  // inserting islands between BB0 and BB1 makes other accesses out of range.
+  MachineBasicBlock *UserBB = U.MI->getParent();
+  unsigned MinNoSplitDisp =
+      BBInfo[UserBB->getNumber()].postOffset(getCPELogAlign(U.CPEMI));
+  if (CloserWater && MinNoSplitDisp > U.getMaxDisp() / 2)
+    return false;
+  for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    // When CloserWater is true, we try to find the lowest address after (or
+    // equal to) user MI's BB no matter of padding growth.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB) || WaterBB == U.MI->getParent()) &&
+        Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      if (CloserWater && WaterBB == U.MI->getParent())
+        return true;
+      // Keep looking unless it is perfect and we're not looking for the lowest
+      // possible address.
+      if (!CloserWater && BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  (The addition
+  // below is for the unconditional branch we will be adding: 4 bytes on ARM +
+  // Thumb2, 2 on Thumb1.
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = isThumb1 ? 2 : 4;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = &*++UserMBB->getIterator();
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
+      if (!isThumb)
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      else
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+          .addImm(ARMCC::AL).addReg(0);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      computeBlockSize(MF, UserMBB, BBInfo[UserMBB->getNumber()]);
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  This is a
+  // little tricky on Thumb1 since instructions are 2 bytes and constant pool
+  // entries are 4 bytes: if instruction I references island CPE, and
+  // instruction I+1 references CPE', it will not work well to put CPE as far
+  // forward as possible, since then CPE' cannot immediately follow it (that
+  // location is 2 bytes farther away from I+1 than CPE was from I) and we'd
+  // need to create a new island.  So, we make a first guess, then walk through
+  // the instructions between the one currently being looked at and the
+  // possible insertion point, and make sure any other instructions that
+  // reference CPEs will be able to use the same island area; if not, we back
+  // up the insertion point.
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned KnownBits = UserBBI.internalKnownBits();
+  unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // (allows for long branch on Thumb1).  Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign
+               << " kb=" << KnownBits
+               << " up=" << UPad << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    // Ensure BaseInsertOffset is larger than the offset of the instruction
+    // following UserMI so that the loop which searches for the split point
+    // iterates at least once.
+    BaseInsertOffset =
+        std::max(UserBBI.postOffset() - UPad - 8,
+                 UserOffset + TII->getInstSizeInBytes(*UserMI) + 1);
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  MachineInstr *LastIT = nullptr;
+  for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == &*MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+
+    // Remember the last IT instruction.
+    if (MI->getOpcode() == ARM::t2IT)
+      LastIT = &*MI;
+  }
+
+  --MI;
+
+  // Avoid splitting an IT block.
+  if (LastIT) {
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
+    if (CC != ARMCC::AL)
+      MI = LastIT;
+  }
+
+  // We really must not split an IT block.
+  DEBUG(unsigned PredReg;
+        assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL));
+
+  NewMBB = splitBlockBeforeInstr(&*MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
+                                                bool CloserWater) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = getCombinedIndex(CPEMI);
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = AFI->createPICLabelUId();
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP, CloserWater)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = &*++WaterBB->getIterator();
+  } else {
+    // No water found.
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
+    IP = find(WaterList, WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB->getIterator(), NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc())
+                .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(&*CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool ARMConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = nullptr;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  unsigned PCAdj      = isThumb ? 4 : 8;
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  if (!isThumb1)
+    llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setDesc(TII->get(ARM::tBfar));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        MI->getOperand(1).setImm(CC);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->getInstSizeInBytes(MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = &*++MBB->getIterator();
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  if (isThumb)
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+            .addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills
+/// LR / restores LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
+bool ARMConstantIslands::undoLRSpillRestore() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+    MachineInstr *MI = PushPopMIs[i];
+    // First two operands are predicates.
+    if (MI->getOpcode() == ARM::tPOP_RET &&
+        MI->getOperand(2).getReg() == ARM::PC &&
+        MI->getNumExplicitOperands() == 3) {
+      // Create the new insn and copy the predicate from the old.
+      BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
+        .addOperand(MI->getOperand(0))
+        .addOperand(MI->getOperand(1));
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool ARMConstantIslands::optimizeThumb2Instructions() {
+  bool MadeChange = false;
+
+  // Shrink ADR and LDR from constantpool.
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned Opcode = U.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2LEApcrel:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLEApcrel;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    case ARM::t2LDRpci:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLDRpci;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    }
+
+    if (!NewOpc)
+      continue;
+
+    unsigned UserOffset = getUserOffset(U);
+    unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+
+    // Be conservative with inline asm.
+    if (!U.KnownAlignment)
+      MaxOffs -= 2;
+
+    // FIXME: Check if offset is multiple of scale if scale is not 4.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+      DEBUG(dbgs() << "Shrink: " << *U.MI);
+      U.MI->setDesc(TII->get(NewOpc));
+      MachineBasicBlock *MBB = U.MI->getParent();
+      BBInfo[MBB->getNumber()].Size -= 2;
+      adjustBBOffsetsAfter(MBB);
+      ++NumT2CPShrunk;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+bool ARMConstantIslands::optimizeThumb2Branches() {
+  bool MadeChange = false;
+
+  // The order in which branches appear in ImmBranches is approximately their
+  // order within the function body. By visiting later branches first, we reduce
+  // the distance between earlier forward branches and their targets, making it
+  // more likely that the cbn?z optimization, which can only apply to forward
+  // branches, will succeed.
+  for (unsigned i = ImmBranches.size(); i != 0; --i) {
+    ImmBranch &Br = ImmBranches[i-1];
+    unsigned Opcode = Br.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2B:
+      NewOpc = ARM::tB;
+      Bits = 11;
+      Scale = 2;
+      break;
+    case ARM::t2Bcc: {
+      NewOpc = ARM::tBcc;
+      Bits = 8;
+      Scale = 2;
+      break;
+    }
+    }
+    if (NewOpc) {
+      unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+      MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+      if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+        DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+        Br.MI->setDesc(TII->get(NewOpc));
+        MachineBasicBlock *MBB = Br.MI->getParent();
+        BBInfo[MBB->getNumber()].Size -= 2;
+        adjustBBOffsetsAfter(MBB);
+        ++NumT2BrShrunk;
+        MadeChange = true;
+      }
+    }
+
+    Opcode = Br.MI->getOpcode();
+    if (Opcode != ARM::tBcc)
+      continue;
+
+    // If the conditional branch doesn't kill CPSR, then CPSR can be liveout
+    // so this transformation is not safe.
+    if (!Br.MI->killsRegister(ARM::CPSR))
+      continue;
+
+    NewOpc = 0;
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
+    if (Pred == ARMCC::EQ)
+      NewOpc = ARM::tCBZ;
+    else if (Pred == ARMCC::NE)
+      NewOpc = ARM::tCBNZ;
+    if (!NewOpc)
+      continue;
+    MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+    // Check if the distance is within 126. Subtract starting offset by 2
+    // because the cmp will be eliminated.
+    unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+    unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+    if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
+      MachineBasicBlock::iterator CmpMI = Br.MI;
+      if (CmpMI != Br.MI->getParent()->begin()) {
+        --CmpMI;
+        if (CmpMI->getOpcode() == ARM::tCMPi8) {
+          unsigned Reg = CmpMI->getOperand(0).getReg();
+          Pred = getInstrPredicate(*CmpMI, PredReg);
+          if (Pred == ARMCC::AL &&
+              CmpMI->getOperand(1).getImm() == 0 &&
+              isARMLowRegister(Reg)) {
+            MachineBasicBlock *MBB = Br.MI->getParent();
+            DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+            MachineInstr *NewBR =
+              BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+              .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
+            CmpMI->eraseFromParent();
+            Br.MI->eraseFromParent();
+            Br.MI = NewBR;
+            BBInfo[MBB->getNumber()].Size -= 2;
+            adjustBBOffsetsAfter(MBB);
+            ++NumCBZ;
+            MadeChange = true;
+          }
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg,
+                              unsigned BaseReg) {
+  if (I.getOpcode() != ARM::t2ADDrs)
+    return false;
+
+  if (I.getOperand(0).getReg() != EntryReg)
+    return false;
+
+  if (I.getOperand(1).getReg() != BaseReg)
+    return false;
+
+  // FIXME: what about CC and IdxReg?
+  return true;
+}
+
+/// \brief While trying to form a TBB/TBH instruction, we may (if the table
+/// doesn't immediately follow the BR_JT) need access to the start of the
+/// jump-table. We know one instruction that produces such a register; this
+/// function works out whether that definition can be preserved to the BR_JT,
+/// possibly by removing an intervening addition (which is usually needed to
+/// calculate the actual entry to jump to).
+bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
+                                              MachineInstr *LEAMI,
+                                              unsigned &DeadSize,
+                                              bool &CanDeleteLEA,
+                                              bool &BaseRegKill) {
+  if (JumpMI->getParent() != LEAMI->getParent())
+    return false;
+
+  // Now we hope that we have at least these instructions in the basic block:
+  //     BaseReg = t2LEA ...
+  //     [...]
+  //     EntryReg = t2ADDrs BaseReg, ...
+  //     [...]
+  //     t2BR_JT EntryReg
+  //
+  // We have to be very conservative about what we recognise here though. The
+  // main perturbing factors to watch out for are:
+  //    + Spills at any point in the chain: not direct problems but we would
+  //      expect a blocking Def of the spilled register so in practice what we
+  //      can do is limited.
+  //    + EntryReg == BaseReg: this is the one situation we should allow a Def
+  //      of BaseReg, but only if the t2ADDrs can be removed.
+  //    + Some instruction other than t2ADDrs computing the entry. Not seen in
+  //      the wild, but we should be careful.
+  unsigned EntryReg = JumpMI->getOperand(0).getReg();
+  unsigned BaseReg = LEAMI->getOperand(0).getReg();
+
+  CanDeleteLEA = true;
+  BaseRegKill = false;
+  MachineInstr *RemovableAdd = nullptr;
+  MachineBasicBlock::iterator I(LEAMI);
+  for (++I; &*I != JumpMI; ++I) {
+    if (isSimpleIndexCalc(*I, EntryReg, BaseReg)) {
+      RemovableAdd = &*I;
+      break;
+    }
+
+    for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+      const MachineOperand &MO = I->getOperand(K);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (MO.isDef() && MO.getReg() == BaseReg)
+        return false;
+      if (MO.isUse() && MO.getReg() == BaseReg) {
+        BaseRegKill = BaseRegKill || MO.isKill();
+        CanDeleteLEA = false;
+      }
+    }
+  }
+
+  if (!RemovableAdd)
+    return true;
+
+  // Check the add really is removable, and that nothing else in the block
+  // clobbers BaseReg.
+  for (++I; &*I != JumpMI; ++I) {
+    for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+      const MachineOperand &MO = I->getOperand(K);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (MO.isDef() && MO.getReg() == BaseReg)
+        return false;
+      if (MO.isUse() && MO.getReg() == EntryReg)
+        RemovableAdd = nullptr;
+    }
+  }
+
+  if (RemovableAdd) {
+    RemovableAdd->eraseFromParent();
+    DeadSize += isThumb2 ? 4 : 2;
+  } else if (BaseReg == EntryReg) {
+    // The add wasn't removable, but clobbered the base for the TBB. So we can't
+    // preserve it.
+    return false;
+  }
+
+  // We reached the end of the block without seeing another definition of
+  // BaseReg (except, possibly the t2ADDrs, which was removed). BaseReg can be
+  // used in the TBB/TBH if necessary.
+  return true;
+}
+
+/// \brief Returns whether CPEMI is the first instruction in the block
+/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
+/// we can switch the first register to PC and usually remove the address
+/// calculation that preceded it.
+static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
+  MachineFunction::iterator MBB = JTMI->getParent()->getIterator();
+  MachineFunction *MF = MBB->getParent();
+  ++MBB;
+
+  return MBB != MF->end() && MBB->begin() != MBB->end() &&
+         &*MBB->begin() == CPEMI;
+}
+
+/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::optimizeThumb2JumpTables() {
+  bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid some of the
+  // constantpool tables?
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 2 : 1);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    bool ByteOk = true;
+    bool HalfWordOk = true;
+    unsigned JTOffset = getOffsetOf(MI) + 4;
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
+      // Negative offset is not ok. FIXME: We should change BB layout to make
+      // sure all the branches are forward.
+      if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+        ByteOk = false;
+      unsigned TBHLimit = ((1<<16)-1)*2;
+      if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+        HalfWordOk = false;
+      if (!ByteOk && !HalfWordOk)
+        break;
+    }
+
+    if (!ByteOk && !HalfWordOk)
+      continue;
+
+    CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
+    MachineBasicBlock *MBB = MI->getParent();
+    if (!MI->getOperand(0).isKill()) // FIXME: needed now?
+      continue;
+
+    unsigned DeadSize = 0;
+    bool CanDeleteLEA = false;
+    bool BaseRegKill = false;
+    
+    unsigned IdxReg = ~0U;
+    bool IdxRegKill = true;
+    if (isThumb2) {
+      IdxReg = MI->getOperand(1).getReg();
+      IdxRegKill = MI->getOperand(1).isKill();
+
+      bool PreservedBaseReg =
+        preserveBaseRegister(MI, User.MI, DeadSize, CanDeleteLEA, BaseRegKill);
+      if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
+        continue;
+    } else {
+      // We're in thumb-1 mode, so we must have something like:
+      //   %idx = tLSLri %idx, 2
+      //   %base = tLEApcrelJT
+      //   %t = tLDRr %idx, %base
+      unsigned BaseReg = User.MI->getOperand(0).getReg();
+
+      if (User.MI->getIterator() == User.MI->getParent()->begin())
+        continue;
+      MachineInstr *Shift = User.MI->getPrevNode();
+      if (Shift->getOpcode() != ARM::tLSLri ||
+          Shift->getOperand(3).getImm() != 2 ||
+          !Shift->getOperand(2).isKill())
+        continue;
+      IdxReg = Shift->getOperand(2).getReg();
+      unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+
+      MachineInstr *Load = User.MI->getNextNode();
+      if (Load->getOpcode() != ARM::tLDRr)
+        continue;
+      if (Load->getOperand(1).getReg() != ShiftedIdxReg ||
+          Load->getOperand(2).getReg() != BaseReg ||
+          !Load->getOperand(1).isKill())
+        continue;
+
+      // If we're in PIC mode, there should be another ADD following.
+      if (isPositionIndependentOrROPI) {
+        MachineInstr *Add = Load->getNextNode();
+        if (Add->getOpcode() != ARM::tADDrr ||
+            Add->getOperand(2).getReg() != Load->getOperand(0).getReg() ||
+            Add->getOperand(3).getReg() != BaseReg ||
+            !Add->getOperand(2).isKill())
+          continue;
+        if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
+          continue;
+
+        Add->eraseFromParent();
+        DeadSize += 2;
+      } else {
+        if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
+          continue;
+      }
+      
+      
+      // Now safe to delete the load and lsl. The LEA will be removed later.
+      CanDeleteLEA = true;
+      Shift->eraseFromParent();
+      Load->eraseFromParent();
+      DeadSize += 4;
+    }
+    
+    DEBUG(dbgs() << "Shrink JT: " << *MI);
+    MachineInstr *CPEMI = User.CPEMI;
+    unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+    if (!isThumb2)
+      Opc = ByteOk ? ARM::tTBB_JT : ARM::tTBH_JT;
+
+    MachineBasicBlock::iterator MI_JT = MI;
+    MachineInstr *NewJTMI =
+        BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
+            .addReg(User.MI->getOperand(0).getReg(),
+                    getKillRegState(BaseRegKill))
+            .addReg(IdxReg, getKillRegState(IdxRegKill))
+            .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+            .addImm(CPEMI->getOperand(0).getImm());
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+
+    unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
+    CPEMI->setDesc(TII->get(JTOpc));
+
+    if (jumpTableFollowsTB(MI, User.CPEMI)) {
+      NewJTMI->getOperand(0).setReg(ARM::PC);
+      NewJTMI->getOperand(0).setIsKill(false);
+
+      if (CanDeleteLEA)  {
+        User.MI->eraseFromParent();
+        DeadSize += isThumb2 ? 4 : 2;
+
+        // The LEA was eliminated, the TBB instruction becomes the only new user
+        // of the jump table.
+        User.MI = NewJTMI;
+        User.MaxDisp = 4;
+        User.NegOk = false;
+        User.IsSoImm = false;
+        User.KnownAlignment = false;
+      } else {
+        // The LEA couldn't be eliminated, so we must add another CPUser to
+        // record the TBB or TBH use.
+        int CPEntryIdx = JumpTableEntryIndices[JTI];
+        auto &CPEs = CPEntries[CPEntryIdx];
+        auto Entry =
+            find_if(CPEs, [&](CPEntry &E) { return E.CPEMI == User.CPEMI; });
+        ++Entry->RefCount;
+        CPUsers.emplace_back(CPUser(NewJTMI, User.CPEMI, 4, false, false));
+      }
+    }
+
+    unsigned NewSize = TII->getInstSizeInBytes(*NewJTMI);
+    unsigned OrigSize = TII->getInstSizeInBytes(*MI);
+    MI->eraseFromParent();
+
+    int Delta = OrigSize - NewSize + DeadSize;
+    BBInfo[MBB->getNumber()].Size -= Delta;
+    adjustBBOffsetsAfter(MBB);
+
+    ++NumTBs;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// reorderThumb2JumpTables - Adjust the function's block layout to ensure that
+/// jump tables always branch forwards, since that's what tbb and tbh need.
+bool ARMConstantIslands::reorderThumb2JumpTables() {
+  bool MadeChange = false;
+
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 2 : 1);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer if target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them such that that's true.
+    int JTNumber = MI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      int DTNumber = MBB->getNumber();
+
+      if (DTNumber < JTNumber) {
+        // The destination precedes the switch. Try to move the block forward
+        // so we have a positive offset.
+        MachineBasicBlock *NewBB =
+          adjustJTTargetBlockForward(MBB, MI->getParent());
+        if (NewBB)
+          MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB);
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+MachineBasicBlock *ARMConstantIslands::
+adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target. This is a very simple
+  // heuristic. FIXME: We can definitely improve it.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineOperand, 4> CondPrior;
+  MachineFunction::iterator BBi = BB->getIterator();
+  MachineFunction::iterator OldPrior = std::prev(BBi);
+
+  // If the block terminator isn't analyzable, don't try to move the block
+  bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond);
+
+  // If the block ends in an unconditional branch, move it. The prior block
+  // has to have an analyzable terminator for us to move this one. Be paranoid
+  // and make sure we're not trying to move the entry block of the function.
+  if (!B && Cond.empty() && BB != &MF->front() &&
+      !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+    BB->moveAfter(JTBB);
+    OldPrior->updateTerminator();
+    BB->updateTerminator();
+    // Update numbering to account for the block being moved.
+    MF->RenumberBlocks();
+    ++NumJTMoved;
+    return nullptr;
+  }
+
+  // Create a new MBB for the code after the jump BB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(JTBB->getBasicBlock());
+  MachineFunction::iterator MBBI = ++JTBB->getIterator();
+  MF->insert(MBBI, NewBB);
+
+  // Add an unconditional branch from NewBB to BB.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond directly to anything in the source.
+  if (isThumb2)
+    BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B))
+        .addMBB(BB)
+        .addImm(ARMCC::AL)
+        .addReg(0);
+  else
+    BuildMI(NewBB, DebugLoc(), TII->get(ARM::tB))
+        .addMBB(BB)
+        .addImm(ARMCC::AL)
+        .addReg(0);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Update the CFG.
+  NewBB->addSuccessor(BB);
+  JTBB->replaceSuccessor(BB, NewBB);
+
+  ++NumJTInserted;
+  return NewBB;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 000000000000..2d1602873ce0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,281 @@
+//===-- ARMConstantPoolValue.cpp - ARM constantpool value -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind),
+    PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+    LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::~ARMConstantPoolValue() {}
+
+StringRef ARMConstantPoolValue::getModifierText() const {
+  switch (Modifier) {
+    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+    // strings if that's legal.
+  case ARMCP::no_modifier:
+    return "none";
+  case ARMCP::TLSGD:
+    return "tlsgd";
+  case ARMCP::GOT_PREL:
+    return "GOT_PREL";
+  case ARMCP::GOTTPOFF:
+    return "gottpoff";
+  case ARMCP::TPOFF:
+    return "tpoff";
+  case ARMCP::SBREL:
+    return "SBREL";
+  case ARMCP::SECREL:
+    return "secrel32";
+  }
+  llvm_unreachable("Unknown modifier!");
+}
+
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                    unsigned Alignment) {
+  llvm_unreachable("Shouldn't be calling this directly!");
+}
+
+void
+ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddInteger(LabelId);
+  ID.AddInteger(PCAdjust);
+}
+
+bool
+ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
+  if (ACPV->Kind == Kind &&
+      ACPV->PCAdjust == PCAdjust &&
+      ACPV->Modifier == Modifier &&
+      ACPV->LabelId == LabelId &&
+      ACPV->AddCurrentAddress == AddCurrentAddress) {
+    // Two PC relative constpool entries containing the same GV address or
+    // external symbols. FIXME: What about blockaddress?
+    if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
+      return true;
+  }
+  return false;
+}
+
+LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
+  errs() << "  " << *this;
+}
+
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+  if (Modifier) O << "(" << getModifierText() << ")";
+  if (PCAdjust != 0) {
+    O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+    if (AddCurrentAddress) O << "-.";
+    O << ")";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty,
+                                                 const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+                         AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const GlobalVariable *GV,
+                                                 const Constant *C)
+    : ARMConstantPoolValue((Type *)C->getType(), 0, ARMCP::CPPromotedGlobal, 0,
+                           ARMCP::no_modifier, false),
+      CVal(C), GVar(GV) {}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+  return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalVariable *GVar,
+                                const Constant *Initializer) {
+  return new ARMConstantPoolConstant(GVar, Initializer);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+                                ARMCP::ARMCPModifier Modifier) {
+  return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()),
+                                     GV, 0, ARMCP::CPValue, 0,
+                                     Modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                                ARMCP::ARMCPModifier Modifier,
+                                bool AddCurrentAddress) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier,
+                                     AddCurrentAddress);
+}
+
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+  return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+  return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                       unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
+}
+
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV);
+  return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(CVal);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+  O << CVal->getName();
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, StringRef s,
+                                             unsigned id, unsigned char PCAdj,
+                                             ARMCP::ARMCPModifier Modifier,
+                                             bool AddCurrentAddress)
+    : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+                           AddCurrentAddress),
+      S(s) {}
+
+ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
+                                                     StringRef s, unsigned ID,
+                                                     unsigned char PCAdj) {
+  return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                     unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
+}
+
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+  return ACPS && ACPS->S == S && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddString(S);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+  O << S;
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C,
+                                       const MachineBasicBlock *mbb,
+                                       unsigned id, unsigned char PCAdj,
+                                       ARMCP::ARMCPModifier Modifier,
+                                       bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+                         Modifier, AddCurrentAddress),
+    MBB(mbb) {}
+
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+                                               const MachineBasicBlock *mbb,
+                                               unsigned ID,
+                                               unsigned char PCAdj) {
+  return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                  unsigned Alignment) {
+  return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
+}
+
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV);
+  return ACPMBB && ACPMBB->MBB == MBB &&
+    ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(MBB);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+  O << "BB#" << MBB->getNumber();
+  ARMConstantPoolValue::print(O);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 000000000000..5f61832aa740
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,272 @@
+//===-- ARMConstantPoolValue.h - ARM constantpool value ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+
+namespace llvm {
+
+class BlockAddress;
+class Constant;
+class GlobalValue;
+class GlobalVariable;
+class LLVMContext;
+class MachineBasicBlock;
+
+namespace ARMCP {
+  enum ARMCPKind {
+    CPValue,
+    CPExtSymbol,
+    CPBlockAddress,
+    CPLSDA,
+    CPMachineBasicBlock,
+    CPPromotedGlobal
+  };
+
+  enum ARMCPModifier {
+    no_modifier, /// None
+    TLSGD,       /// Thread Local Storage (General Dynamic Mode)
+    GOT_PREL,    /// Global Offset Table, PC Relative
+    GOTTPOFF,    /// Global Offset Table, Thread Pointer Offset
+    TPOFF,       /// Thread Pointer Offset
+    SECREL,      /// Section Relative (Windows TLS)
+    SBREL,       /// Static Base Relative (RWPI)
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // Kind of constant.
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
+                           // 8 for ARM, 4 for Thumb.
+  ARMCP::ARMCPModifier Modifier;   // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;
+
+protected:
+  ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  template <typename Derived>
+  int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+                                    unsigned Alignment) {
+    unsigned AlignMask = Alignment - 1;
+    const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+    for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+      if (Constants[i].isMachineConstantPoolEntry() &&
+          (Constants[i].getAlignment() & AlignMask) == 0) {
+        ARMConstantPoolValue *CPV =
+            (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+        if (Derived *APC = dyn_cast<Derived>(CPV))
+          if (cast<Derived>(this)->equals(APC))
+            return i;
+      }
+    }
+
+    return -1;
+  }
+
+public:
+  ~ARMConstantPoolValue() override;
+
+  ARMCP::ARMCPModifier getModifier() const { return Modifier; }
+  StringRef getModifierText() const;
+  bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
+
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
+  unsigned getLabelId() const { return LabelId; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
+  bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+  bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
+  bool isPromotedGlobal() const{ return Kind == ARMCP::CPPromotedGlobal; }
+  
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  bool equals(const ARMConstantPoolValue *A) const {
+    return this->LabelId == A->LabelId &&
+      this->PCAdjust == A->PCAdjust &&
+      this->Modifier == A->Modifier;
+  }
+
+  void print(raw_ostream &O) const override;
+  void print(raw_ostream *O) const { if (O) print(*O); }
+  void dump() const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+  V.print(O);
+  return O;
+}
+
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+  const Constant *CVal;         // Constant being loaded.
+  const GlobalVariable *GVar = nullptr;
+
+  ARMConstantPoolConstant(const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(Type *Ty, const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(const GlobalVariable *GV, const Constant *Init);
+
+public:
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+  static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+                                         ARMCP::ARMCPModifier Modifier);
+  static ARMConstantPoolConstant *Create(const GlobalVariable *GV,
+                                         const Constant *Initializer);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj,
+                                         ARMCP::ARMCPModifier Modifier,
+                                         bool AddCurrentAddress);
+
+  const GlobalValue *getGV() const;
+  const BlockAddress *getBlockAddress() const;
+  const GlobalVariable *getPromotedGlobal() const {
+    return dyn_cast_or_null<GlobalVariable>(GVar);
+  }
+  const Constant *getPromotedGlobalInit() const {
+    return CVal;
+  }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  void print(raw_ostream &O) const override;
+  static bool classof(const ARMConstantPoolValue *APV) {
+    return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA() ||
+           APV->isPromotedGlobal();
+  }
+
+  bool equals(const ARMConstantPoolConstant *A) const {
+    return CVal == A->CVal && ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+  const std::string S;          // ExtSymbol being loaded.
+
+  ARMConstantPoolSymbol(LLVMContext &C, StringRef s, unsigned id,
+                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                        bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolSymbol *Create(LLVMContext &C, StringRef s, unsigned ID,
+                                       unsigned char PCAdj);
+
+  StringRef getSymbol() const { return S; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+
+  bool equals(const ARMConstantPoolSymbol *A) const {
+    return S == A->S && ARMConstantPoolValue::equals(A);
+  }
+};
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+                     unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                     bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolMBB *Create(LLVMContext &C,
+                                    const MachineBasicBlock *mbb,
+                                    unsigned ID, unsigned char PCAdj);
+
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV) override;
+
+  void print(raw_ostream &O) const override;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+
+  bool equals(const ARMConstantPoolMBB *A) const {
+    return MBB == A->MBB && ARMConstantPoolValue::equals(A);
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..95fcc8dcb453
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -0,0 +1,1686 @@
+//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-pseudo"
+
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+                cl::desc("Verify machine code after expanding ARM pseudos"));
+
+namespace {
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(ID) {}
+
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    void TransferImpOps(MachineInstr &OldMI,
+                        MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
+    bool ExpandMI(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MBBI,
+                  MachineBasicBlock::iterator &NextMBBI);
+    bool ExpandMBB(MachineBasicBlock &MBB);
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                    unsigned Opc, bool IsExt);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
+    bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
+                        unsigned StrexOp, unsigned UxtOp,
+                        MachineBasicBlock::iterator &NextMBBI);
+
+    bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           MachineBasicBlock::iterator &NextMBBI);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+/// TransferImpOps - Transfer implicit operands on the pseudo instruction to
+/// the instructions created from the expansion.
+void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
+                                     MachineInstrBuilder &UseMI,
+                                     MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+namespace {
+  // Constants for register spacing in NEON load/store instructions.
+  // For quad-register load-lane and store-lane pseudo instructors, the
+  // spacing is initially assumed to be EvenDblSpc, and that is changed to
+  // OddDblSpc depending on the lane number operand.
+  enum NEONRegSpacing {
+    SingleSpc,
+    EvenDblSpc,
+    OddDblSpc
+  };
+
+  // Entries for NEON load/store information table.  The table is sorted by
+  // PseudoOpc for fast binary-search lookups.
+  struct NEONLdStTableEntry {
+    uint16_t PseudoOpc;
+    uint16_t RealOpc;
+    bool IsLoad;
+    bool isUpdating;
+    bool hasWritebackOperand;
+    uint8_t RegSpacing; // One of type NEONRegSpacing
+    uint8_t NumRegs; // D registers loaded or stored
+    uint8_t RegElts; // elements per D register; used for lane ops
+    // FIXME: Temporary flag to denote whether the real instruction takes
+    // a single register (like the encoding) or all of the registers in
+    // the list (like the asm syntax and the isel DAG). When all definitions
+    // are converted to take only the single encoded register, this will
+    // go away.
+    bool copyAllListRegs;
+
+    // Comparison methods for binary search of the table.
+    bool operator<(const NEONLdStTableEntry &TE) const {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+    friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
+      return TE.PseudoOpc < PseudoOpc;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
+                                                const NEONLdStTableEntry &TE) {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+  };
+}
+
+static const NEONLdStTableEntry NEONLdStTable[] = {
+{ ARM::VLD1LNq16Pseudo,     ARM::VLD1LNd16,     true, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq32Pseudo,     ARM::VLD1LNd32,     true, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq8Pseudo,      ARM::VLD1LNd8,      true, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1LNq8Pseudo_UPD,  ARM::VLD1LNd8_UPD, true, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true,  false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_fixed,  ARM::VLD1d64Qwb_fixed,   true,  true, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true,  false, false, SingleSpc,  3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_fixed,  ARM::VLD1d64Twb_fixed,   true,  true, false, SingleSpc,  3, 1 ,false},
+
+{ ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true,  SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, false, SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true,  SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd8Pseudo,      ARM::VLD2LNd8,      true, false, false, SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNd8Pseudo_UPD,  ARM::VLD2LNd8_UPD, true, true, true,  SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNq16Pseudo,     ARM::VLD2LNq16,     true, false, false, EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true,  EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq32Pseudo,     ARM::VLD2LNq32,     true, false, false, EvenDblSpc, 2, 2 ,true},
+{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true,  EvenDblSpc, 2, 2 ,true},
+
+{ ARM::VLD2q16Pseudo,       ARM::VLD2q16,      true,  false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_fixed,   ARM::VLD2q16wb_fixed, true, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_register,   ARM::VLD2q16wb_register, true, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q32Pseudo,       ARM::VLD2q32,      true,  false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_fixed,   ARM::VLD2q32wb_fixed, true, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_register,   ARM::VLD2q32wb_register, true, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q8Pseudo,        ARM::VLD2q8,       true,  false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_fixed,    ARM::VLD2q8wb_fixed, true, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_register,    ARM::VLD2q8wb_register, true, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VLD3DUPd16Pseudo,     ARM::VLD3DUPd16,     true, false, false, SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true,  SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd32Pseudo,     ARM::VLD3DUPd32,     true, false, false, SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true,  SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, false, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD, true, true, true,  SingleSpc, 3, 8,true},
+
+{ ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd32Pseudo,     ARM::VLD3LNd32,     true, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd8Pseudo,      ARM::VLD3LNd8,      true, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNd8Pseudo_UPD,  ARM::VLD3LNd8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNq16Pseudo,     ARM::VLD3LNq16,     true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq32Pseudo,     ARM::VLD3LNq32,     true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+
+{ ARM::VLD3d16Pseudo,       ARM::VLD3d16,      true,  false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d16Pseudo_UPD,   ARM::VLD3d16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d32Pseudo,       ARM::VLD3d32,      true,  false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d32Pseudo_UPD,   ARM::VLD3d32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d8Pseudo,        ARM::VLD3d8,       true,  false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3d8Pseudo_UPD,    ARM::VLD3d8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VLD3q16Pseudo_UPD,    ARM::VLD3q16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3q16oddPseudo,     ARM::VLD3q16,     true,  false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q32Pseudo_UPD,    ARM::VLD3q32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3q32oddPseudo,     ARM::VLD3q32,     true,  false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q8Pseudo_UPD,     ARM::VLD3q8_UPD, true, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3q8oddPseudo,      ARM::VLD3q8,      true,  false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VLD3q8oddPseudo_UPD,  ARM::VLD3q8_UPD, true, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VLD4DUPd16Pseudo,     ARM::VLD4DUPd16,     true, false, false, SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true,  SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd32Pseudo,     ARM::VLD4DUPd32,     true, false, false, SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true,  SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, false, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD, true, true, true,  SingleSpc, 4, 8,true},
+
+{ ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd32Pseudo,     ARM::VLD4LNd32,     true, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd8Pseudo,      ARM::VLD4LNd8,      true, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNd8Pseudo_UPD,  ARM::VLD4LNd8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNq16Pseudo,     ARM::VLD4LNq16,     true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq32Pseudo,     ARM::VLD4LNq32,     true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+
+{ ARM::VLD4d16Pseudo,       ARM::VLD4d16,      true,  false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d16Pseudo_UPD,   ARM::VLD4d16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d32Pseudo,       ARM::VLD4d32,      true,  false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d32Pseudo_UPD,   ARM::VLD4d32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d8Pseudo,        ARM::VLD4d8,       true,  false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4d8Pseudo_UPD,    ARM::VLD4d8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VLD4q16Pseudo_UPD,    ARM::VLD4q16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4q16oddPseudo,     ARM::VLD4q16,     true,  false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q32Pseudo_UPD,    ARM::VLD4q32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4q32oddPseudo,     ARM::VLD4q32,     true,  false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q8Pseudo_UPD,     ARM::VLD4q8_UPD, true, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4q8oddPseudo,      ARM::VLD4q8,      true,  false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VLD4q8oddPseudo_UPD,  ARM::VLD4q8_UPD, true, true, true,  OddDblSpc,  4, 8 ,true},
+
+{ ARM::VST1LNq16Pseudo,     ARM::VST1LNd16,    false, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq32Pseudo,     ARM::VST1LNd32,    false, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq8Pseudo,      ARM::VST1LNd8,     false, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1LNq8Pseudo_UPD,  ARM::VST1LNd8_UPD, false, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VST1d64QPseudo,      ARM::VST1d64Q,     false, false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_fixed,  ARM::VST1d64Qwb_fixed, false, true, false,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64TPseudo,      ARM::VST1d64T,     false, false, false, SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_fixed,  ARM::VST1d64Twb_fixed, false, true, false,  SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_register,  ARM::VST1d64Twb_register, false, true, true,  SingleSpc,  3, 1 ,false},
+
+{ ARM::VST2LNd16Pseudo,     ARM::VST2LNd16,     false, false, false, SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true,  SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd32Pseudo,     ARM::VST2LNd32,     false, false, false, SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true,  SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd8Pseudo,      ARM::VST2LNd8,      false, false, false, SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNd8Pseudo_UPD,  ARM::VST2LNd8_UPD, false, true, true,  SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNq16Pseudo,     ARM::VST2LNq16,     false, false, false, EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true,  EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq32Pseudo,     ARM::VST2LNq32,     false, false, false, EvenDblSpc, 2, 2,true},
+{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true,  EvenDblSpc, 2, 2,true},
+
+{ ARM::VST2q16Pseudo,       ARM::VST2q16,      false, false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_fixed,   ARM::VST2q16wb_fixed, false, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_register,   ARM::VST2q16wb_register, false, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q32Pseudo,       ARM::VST2q32,      false, false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_fixed,   ARM::VST2q32wb_fixed, false, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_register,   ARM::VST2q32wb_register, false, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q8Pseudo,        ARM::VST2q8,       false, false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_fixed,    ARM::VST2q8wb_fixed, false, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_register,    ARM::VST2q8wb_register, false, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VST3LNd16Pseudo,     ARM::VST3LNd16,     false, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true,  SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd32Pseudo,     ARM::VST3LNd32,     false, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true,  SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd8Pseudo,      ARM::VST3LNd8,      false, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNd8Pseudo_UPD,  ARM::VST3LNd8_UPD, false, true, true,  SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNq16Pseudo,     ARM::VST3LNq16,     false, false, false, EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true,  EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq32Pseudo,     ARM::VST3LNq32,     false, false, false, EvenDblSpc, 3, 2,true},
+{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true,  EvenDblSpc, 3, 2,true},
+
+{ ARM::VST3d16Pseudo,       ARM::VST3d16,      false, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VST3d16Pseudo_UPD,   ARM::VST3d16_UPD, false, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VST3d32Pseudo,       ARM::VST3d32,      false, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VST3d32Pseudo_UPD,   ARM::VST3d32_UPD, false, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VST3d8Pseudo,        ARM::VST3d8,       false, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VST3d8Pseudo_UPD,    ARM::VST3d8_UPD, false, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VST3q16Pseudo_UPD,    ARM::VST3q16_UPD, false, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VST3q16oddPseudo,     ARM::VST3q16,     false, false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q32Pseudo_UPD,    ARM::VST3q32_UPD, false, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VST3q32oddPseudo,     ARM::VST3q32,     false, false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q8Pseudo_UPD,     ARM::VST3q8_UPD, false, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VST3q8oddPseudo,      ARM::VST3q8,      false, false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VST3q8oddPseudo_UPD,  ARM::VST3q8_UPD, false, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VST4LNd16Pseudo,     ARM::VST4LNd16,     false, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true,  SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd32Pseudo,     ARM::VST4LNd32,     false, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true,  SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd8Pseudo,      ARM::VST4LNd8,      false, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNd8Pseudo_UPD,  ARM::VST4LNd8_UPD, false, true, true,  SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNq16Pseudo,     ARM::VST4LNq16,     false, false, false, EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true,  EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq32Pseudo,     ARM::VST4LNq32,     false, false, false, EvenDblSpc, 4, 2,true},
+{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true,  EvenDblSpc, 4, 2,true},
+
+{ ARM::VST4d16Pseudo,       ARM::VST4d16,      false, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VST4d16Pseudo_UPD,   ARM::VST4d16_UPD, false, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VST4d32Pseudo,       ARM::VST4d32,      false, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VST4d32Pseudo_UPD,   ARM::VST4d32_UPD, false, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VST4d8Pseudo,        ARM::VST4d8,       false, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VST4d8Pseudo_UPD,    ARM::VST4d8_UPD, false, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VST4q16Pseudo_UPD,    ARM::VST4q16_UPD, false, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VST4q16oddPseudo,     ARM::VST4q16,     false, false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q32Pseudo_UPD,    ARM::VST4q32_UPD, false, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VST4q32oddPseudo,     ARM::VST4q32,     false, false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q8Pseudo_UPD,     ARM::VST4q8_UPD, false, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VST4q8oddPseudo,      ARM::VST4q8,      false, false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VST4q8oddPseudo_UPD,  ARM::VST4q8_UPD, false, true, true,  OddDblSpc,  4, 8 ,true}
+};
+
+/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
+/// load or store pseudo instruction.
+static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
+#ifndef NDEBUG
+  // Make sure the table is sorted.
+  static bool TableChecked = false;
+  if (!TableChecked) {
+    assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
+           "NEONLdStTable is not sorted!");
+    TableChecked = true;
+  }
+#endif
+
+  auto I = std::lower_bound(std::begin(NEONLdStTable),
+                            std::end(NEONLdStTable), Opcode);
+  if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode)
+    return I;
+  return nullptr;
+}
+
+/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
+/// corresponding to the specified register spacing.  Not all of the results
+/// are necessarily valid, e.g., a Q register only has 2 D subregisters.
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
+                        const TargetRegisterInfo *TRI, unsigned &D0,
+                        unsigned &D1, unsigned &D2, unsigned &D3) {
+  if (RegSpc == SingleSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+  } else if (RegSpc == EvenDblSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_4);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_6);
+  } else {
+    assert(RegSpc == OddDblSpc && "unknown register spacing");
+    D0 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_3);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_7);
+  }
+}
+
+/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VLD instructions with D register operands.
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+
+  bool DstIsDead = MI.getOperand(OpIdx).isDead();
+  unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // For an instruction writing double-spaced subregs, the pseudo instruction
+  // has an extra operand that is a use of the super-register.  Record the
+  // operand index and skip over it.
+  unsigned SrcOpIdx = 0;
+  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
+    SrcOpIdx = OpIdx++;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source operand used for double-spaced subregs over
+  // to the new instruction as an implicit operand.
+  if (SrcOpIdx != 0) {
+    MachineOperand MO = MI.getOperand(SrcOpIdx);
+    MO.setImplicit(true);
+    MIB.addOperand(MO);
+  }
+  // Add an implicit def for the super-register.
+  MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VST instructions with D register operands.
+void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, getUndefRegState(SrcIsUndef));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
+  else if (!SrcIsUndef)
+    MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg.
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+  unsigned RegElts = TableEntry->RegElts;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  // The lane operand is always the 3rd from last operand, before the 2
+  // predicate operands.
+  unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
+
+  // Adjust the lane and spacing as needed for Q registers.
+  assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
+  if (RegSpc == EvenDblSpc && Lane >= RegElts) {
+    RegSpc = OddDblSpc;
+    Lane -= RegElts;
+  }
+  assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
+
+  unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
+  unsigned DstReg = 0;
+  bool DstIsDead = false;
+  if (TableEntry->IsLoad) {
+    DstIsDead = MI.getOperand(OpIdx).isDead();
+    DstReg = MI.getOperand(OpIdx++).getReg();
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Grab the super-register source.
+  MachineOperand MO = MI.getOperand(OpIdx++);
+  if (!TableEntry->IsLoad)
+    GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
+
+  // Add the subregs as sources of the new instruction.
+  unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+                       getKillRegState(MO.isKill()));
+  MIB.addReg(D0, SrcFlags);
+  if (NumRegs > 1)
+    MIB.addReg(D1, SrcFlags);
+  if (NumRegs > 2)
+    MIB.addReg(D2, SrcFlags);
+  if (NumRegs > 3)
+    MIB.addReg(D3, SrcFlags);
+
+  // Add the lane number operand.
+  MIB.addImm(Lane);
+  OpIdx += 1;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source to be an implicit source.
+  MO.setImplicit(true);
+  MIB.addOperand(MO);
+  if (TableEntry->IsLoad)
+    // Add an implicit def for the super-register.
+    MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned OpIdx = 0;
+
+  // Transfer the destination register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  if (IsExt)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0);
+
+  // Copy the other source register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Add an implicit kill and use for the super-reg.
+  MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
+  TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+static bool IsAnAddressOperand(const MachineOperand &MO) {
+  // This check is overly conservative.  Unless we are certain that the machine
+  // operand is not a symbol reference, we return that it is a symbol reference.
+  // This is important as the load pair may not be split up Windows.
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+  case MachineOperand::MO_Immediate:
+  case MachineOperand::MO_CImmediate:
+  case MachineOperand::MO_FPImmediate:
+    return false;
+  case MachineOperand::MO_MachineBasicBlock:
+    return true;
+  case MachineOperand::MO_FrameIndex:
+    return false;
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  case MachineOperand::MO_RegisterMask:
+  case MachineOperand::MO_RegisterLiveOut:
+    return false;
+  case MachineOperand::MO_Metadata:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+  case MachineOperand::MO_CFIIndex:
+    return false;
+  case MachineOperand::MO_IntrinsicID:
+  case MachineOperand::MO_Predicate:
+    llvm_unreachable("should not exist post-isel");
+  }
+  llvm_unreachable("unhandled machine operand type");
+}
+
+void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
+  const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
+  bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
+  MachineInstrBuilder LO16, HI16;
+
+  if (!STI->hasV6T2Ops() &&
+      (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+    // FIXME Windows CE supports older ARM CPUs
+    assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
+
+    // Expand into a movi + orr.
+    LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+    HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstReg);
+
+    assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
+    unsigned ImmVal = (unsigned)MO.getImm();
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    LO16 = LO16.addImm(SOImmValV1);
+    HI16 = HI16.addImm(SOImmValV2);
+    LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    LO16.addImm(Pred).addReg(PredReg).addReg(0);
+    HI16.addImm(Pred).addReg(PredReg).addReg(0);
+    TransferImpOps(MI, LO16, HI16);
+    MI.eraseFromParent();
+    return;
+  }
+
+  unsigned LO16Opc = 0;
+  unsigned HI16Opc = 0;
+  if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
+    LO16Opc = ARM::t2MOVi16;
+    HI16Opc = ARM::t2MOVTi16;
+  } else {
+    LO16Opc = ARM::MOVi16;
+    HI16Opc = ARM::MOVTi16;
+  }
+
+  LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
+  HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MO.getImm();
+    unsigned Lo16 = Imm & 0xffff;
+    unsigned Hi16 = (Imm >> 16) & 0xffff;
+    LO16 = LO16.addImm(Lo16);
+    HI16 = HI16.addImm(Hi16);
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *ES = MO.getSymbolName();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16);
+    HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16);
+    break;
+  }
+  default: {
+    const GlobalValue *GV = MO.getGlobal();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+    HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+    break;
+  }
+  }
+
+  LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  LO16.addImm(Pred).addReg(PredReg);
+  HI16.addImm(Pred).addReg(PredReg);
+
+  if (RequiresBundling)
+    finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator());
+
+  TransferImpOps(MI, LO16, HI16);
+  MI.eraseFromParent();
+}
+
+static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
+  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+    MBB->addLiveIn(*I);
+}
+
+/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
+/// possible. This only gets used at -O0 so we don't care about efficiency of the
+/// generated code.
+bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned LdrexOp, unsigned StrexOp,
+                                     unsigned UxtOp,
+                                     MachineBasicBlock::iterator &NextMBBI) {
+  bool IsThumb = STI->isThumb();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  unsigned StatusReg = MI.getOperand(1).getReg();
+  MachineOperand &Addr = MI.getOperand(2);
+  MachineOperand &Desired = MI.getOperand(3);
+  MachineOperand &New = MI.getOperand(4);
+
+  LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+    LiveRegs.stepBackward(*I);
+
+  MachineFunction *MF = MBB.getParent();
+  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoadCmpBB);
+  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+  MF->insert(++StoreBB->getIterator(), DoneBB);
+
+  if (UxtOp) {
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
+            .addReg(Desired.getReg(), RegState::Kill);
+    if (!IsThumb)
+      MIB.addImm(0);
+    AddDefaultPred(MIB);
+  }
+
+  // .Lloadcmp:
+  //     ldrex rDest, [rAddr]
+  //     cmp rDest, rDesired
+  //     bne .Ldone
+  LoadCmpBB->addLiveIn(Addr.getReg());
+  LoadCmpBB->addLiveIn(Dest.getReg());
+  LoadCmpBB->addLiveIn(Desired.getReg());
+  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
+  MIB.addReg(Addr.getReg());
+  if (LdrexOp == ARM::t2LDREX)
+    MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
+  AddDefaultPred(MIB);
+
+  unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+  AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+                     .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+                     .addOperand(Desired));
+  unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+  BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+      .addMBB(DoneBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR, RegState::Kill);
+  LoadCmpBB->addSuccessor(DoneBB);
+  LoadCmpBB->addSuccessor(StoreBB);
+
+  // .Lstore:
+  //     strex rStatus, rNew, [rAddr]
+  //     cmp rStatus, #0
+  //     bne .Lloadcmp
+  StoreBB->addLiveIn(Addr.getReg());
+  StoreBB->addLiveIn(New.getReg());
+  addPostLoopLiveIns(StoreBB, LiveRegs);
+
+
+  MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
+  MIB.addOperand(New);
+  MIB.addOperand(Addr);
+  if (StrexOp == ARM::t2STREX)
+    MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
+  AddDefaultPred(MIB);
+
+  unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+  AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+                     .addReg(StatusReg, RegState::Kill)
+                     .addImm(0));
+  BuildMI(StoreBB, DL, TII->get(Bcc))
+      .addMBB(LoadCmpBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR, RegState::Kill);
+  StoreBB->addSuccessor(LoadCmpBB);
+  StoreBB->addSuccessor(DoneBB);
+
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+  addPostLoopLiveIns(DoneBB, LiveRegs);
+
+  MBB.addSuccessor(LoadCmpBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
+/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
+/// single GPRPair register), Thumb's take two separate registers so we need to
+/// extract the subregs from the pair.
+static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
+                                unsigned Flags, bool IsThumb,
+                                const TargetRegisterInfo *TRI) {
+  if (IsThumb) {
+    unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
+    unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
+    MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
+    MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+  } else
+    MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+}
+
+/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
+bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        MachineBasicBlock::iterator &NextMBBI) {
+  bool IsThumb = STI->isThumb();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  unsigned StatusReg = MI.getOperand(1).getReg();
+  MachineOperand &Addr = MI.getOperand(2);
+  MachineOperand &Desired = MI.getOperand(3);
+  MachineOperand &New = MI.getOperand(4);
+
+  unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
+  unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
+  unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
+  unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
+
+  LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+    LiveRegs.stepBackward(*I);
+
+  MachineFunction *MF = MBB.getParent();
+  auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+  auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MF->insert(++MBB.getIterator(), LoadCmpBB);
+  MF->insert(++LoadCmpBB->getIterator(), StoreBB);
+  MF->insert(++StoreBB->getIterator(), DoneBB);
+
+  // .Lloadcmp:
+  //     ldrexd rDestLo, rDestHi, [rAddr]
+  //     cmp rDestLo, rDesiredLo
+  //     sbcs rStatus<dead>, rDestHi, rDesiredHi
+  //     bne .Ldone
+  LoadCmpBB->addLiveIn(Addr.getReg());
+  LoadCmpBB->addLiveIn(Dest.getReg());
+  LoadCmpBB->addLiveIn(Desired.getReg());
+  addPostLoopLiveIns(LoadCmpBB, LiveRegs);
+
+  unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
+  addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
+  MIB.addReg(Addr.getReg());
+  AddDefaultPred(MIB);
+
+  unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
+  AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+                     .addReg(DestLo, getKillRegState(Dest.isDead()))
+                     .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+
+  BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+      .addReg(DestHi, getKillRegState(Dest.isDead()))
+      .addReg(DesiredHi, getKillRegState(Desired.isDead()))
+      .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill);
+
+  unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
+  BuildMI(LoadCmpBB, DL, TII->get(Bcc))
+      .addMBB(DoneBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR, RegState::Kill);
+  LoadCmpBB->addSuccessor(DoneBB);
+  LoadCmpBB->addSuccessor(StoreBB);
+
+  // .Lstore:
+  //     strexd rStatus, rNewLo, rNewHi, [rAddr]
+  //     cmp rStatus, #0
+  //     bne .Lloadcmp
+  StoreBB->addLiveIn(Addr.getReg());
+  StoreBB->addLiveIn(New.getReg());
+  addPostLoopLiveIns(StoreBB, LiveRegs);
+
+  unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
+  MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+  addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+  MIB.addOperand(Addr);
+  AddDefaultPred(MIB);
+
+  unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+  AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
+                     .addReg(StatusReg, RegState::Kill)
+                     .addImm(0));
+  BuildMI(StoreBB, DL, TII->get(Bcc))
+      .addMBB(LoadCmpBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR, RegState::Kill);
+  StoreBB->addSuccessor(LoadCmpBB);
+  StoreBB->addSuccessor(DoneBB);
+
+  DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
+  DoneBB->transferSuccessors(&MBB);
+  addPostLoopLiveIns(DoneBB, LiveRegs);
+
+  MBB.addSuccessor(LoadCmpBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
+
+bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               MachineBasicBlock::iterator &NextMBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+    default:
+      return false;
+
+    case ARM::TCRETURNdi:
+    case ARM::TCRETURNri: {
+      MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+      assert(MBBI->isReturn() &&
+             "Can only insert epilog into returning blocks");
+      unsigned RetOpcode = MBBI->getOpcode();
+      DebugLoc dl = MBBI->getDebugLoc();
+      const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+          MBB.getParent()->getSubtarget().getInstrInfo());
+
+      // Tail call return: adjust the stack pointer and jump to callee.
+      MBBI = MBB.getLastNonDebugInstr();
+      MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+      // Jump to label or value in register.
+      if (RetOpcode == ARM::TCRETURNdi) {
+        unsigned TCOpcode =
+            STI->isThumb()
+                ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND)
+                : ARM::TAILJMPd;
+        MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+        if (JumpTarget.isGlobal())
+          MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                               JumpTarget.getTargetFlags());
+        else {
+          assert(JumpTarget.isSymbol());
+          MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                                JumpTarget.getTargetFlags());
+        }
+
+        // Add the default predicate in Thumb mode.
+        if (STI->isThumb())
+          MIB.addImm(ARMCC::AL).addReg(0);
+      } else if (RetOpcode == ARM::TCRETURNri) {
+        BuildMI(MBB, MBBI, dl,
+                TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
+            .addReg(JumpTarget.getReg(), RegState::Kill);
+      }
+
+      auto NewMI = std::prev(MBBI);
+      for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+        NewMI->addOperand(MBBI->getOperand(i));
+
+      // Delete the pseudo instruction TCRETURN.
+      MBB.erase(MBBI);
+      MBBI = NewMI;
+      return true;
+    }
+    case ARM::VMOVScc:
+    case ARM::VMOVDcc: {
+      unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4));
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCr:
+    case ARM::MOVCCr: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCsi: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+              (MI.getOperand(1).getReg()))
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCsr: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
+              (MI.getOperand(1).getReg()))
+        .addOperand(MI.getOperand(2))
+        .addOperand(MI.getOperand(3))
+        .addImm(MI.getOperand(4).getImm())
+        .addImm(MI.getOperand(5).getImm()) // 'pred'
+        .addOperand(MI.getOperand(6))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCi16:
+    case ARM::MOVCCi16: {
+      unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4));
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCi:
+    case ARM::MOVCCi: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MVNCCi:
+    case ARM::MVNCCi: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addOperand(MI.getOperand(4))
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCClsl:
+    case ARM::t2MOVCClsr:
+    case ARM::t2MOVCCasr:
+    case ARM::t2MOVCCror: {
+      unsigned NewOpc;
+      switch (Opcode) {
+      case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+      case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+      case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+      case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+      default: llvm_unreachable("unexpeced conditional move");
+      }
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::Int_eh_sjlj_dispatchsetup: {
+      MachineFunction &MF = *MI.getParent()->getParent();
+      const ARMBaseInstrInfo *AII =
+        static_cast<const ARMBaseInstrInfo*>(TII);
+      const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+      // For functions using a base pointer, we rematerialize it (via the frame
+      // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+      // for us. Otherwise, expand to nothing.
+      if (RI.hasBasePointer(MF)) {
+        int32_t NumBytes = AFI->getFramePtrSpillOffset();
+        unsigned FramePtr = RI.getFrameRegister(MF);
+        assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
+               "base pointer without frame pointer?");
+
+        if (AFI->isThumb2Function()) {
+          emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                 FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+        } else if (AFI->isThumbFunction()) {
+          emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                    FramePtr, -NumBytes, *TII, RI);
+        } else {
+          emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                  FramePtr, -NumBytes, ARMCC::AL, 0,
+                                  *TII);
+        }
+        // If there's dynamic realignment, adjust for it.
+        if (RI.needsStackRealignment(MF)) {
+          MachineFrameInfo &MFI = MF.getFrameInfo();
+          unsigned MaxAlign = MFI.getMaxAlignment();
+          assert (!AFI->isThumb1OnlyFunction());
+          // Emit bic r6, r6, MaxAlign
+          assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+                                    "immediates larger than 256 with all lower "
+                                    "bits set.");
+          unsigned bicOpc = AFI->isThumbFunction() ?
+            ARM::t2BICri : ARM::BICri;
+          AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                              TII->get(bicOpc), ARM::R6)
+                                      .addReg(ARM::R6, RegState::Kill)
+                                      .addImm(MaxAlign-1)));
+        }
+
+      }
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVsrl_flag:
+    case ARM::MOVsra_flag: {
+      // These are just fancy MOVs instructions.
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+                             MI.getOperand(0).getReg())
+                     .addOperand(MI.getOperand(1))
+                     .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
+                                                  ARM_AM::lsr : ARM_AM::asr),
+                                                 1)))
+        .addReg(ARM::CPSR, RegState::Define);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::RRX: {
+      // This encodes as "MOVs Rd, Rm, rrx
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
+                               MI.getOperand(0).getReg())
+                       .addOperand(MI.getOperand(1))
+                       .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+        .addReg(0);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tTPsoft:
+    case ARM::TPsoft: {
+      MachineInstrBuilder MIB;
+      if (Opcode == ARM::tTPsoft)
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::tBL))
+              .addImm((unsigned)ARMCC::AL).addReg(0)
+              .addExternalSymbol("__aeabi_read_tp", 0);
+      else
+        MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                      TII->get( ARM::BL))
+              .addExternalSymbol("__aeabi_read_tp", 0);
+
+      MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tLDRpci_pic:
+    case ARM::t2LDRpci_pic: {
+      unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+        ? ARM::tLDRpci : ARM::t2LDRpci;
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MachineInstrBuilder MIB1 =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                               TII->get(NewLdOpc), DstReg)
+                       .addOperand(MI.getOperand(1)));
+      MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(ARM::tPICADD))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg)
+        .addOperand(MI.getOperand(2));
+      TransferImpOps(MI, MIB1, MIB2);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::LDRLIT_ga_abs:
+    case ARM::LDRLIT_ga_pcrel:
+    case ARM::LDRLIT_ga_pcrel_ldr:
+    case ARM::tLDRLIT_ga_abs:
+    case ARM::tLDRLIT_ga_pcrel: {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      bool IsARM =
+          Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
+      bool IsPIC =
+          Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
+      unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
+      unsigned PICAddOpc =
+          IsARM
+              ? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+              : ARM::tPICADD;
+
+      // We need a new const-pool entry to load from.
+      MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
+      unsigned ARMPCLabelIndex = 0;
+      MachineConstantPoolValue *CPV;
+
+      if (IsPIC) {
+        unsigned PCAdj = IsARM ? 8 : 4;
+        ARMPCLabelIndex = AFI->createPICLabelUId();
+        CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
+                                              ARMCP::CPValue, PCAdj);
+      } else
+        CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
+
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
+            .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+      if (IsARM)
+        MIB.addImm(0);
+      AddDefaultPred(MIB);
+
+      if (IsPIC) {
+        MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
+            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+            .addReg(DstReg)
+            .addImm(ARMPCLabelIndex);
+
+        if (IsARM)
+          AddDefaultPred(MIB);
+      }
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOV_ga_pcrel:
+    case ARM::MOV_ga_pcrel_ldr:
+    case ARM::t2MOV_ga_pcrel: {
+      // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
+      unsigned LabelId = AFI->createPICLabelUId();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      unsigned TF = MO1.getTargetFlags();
+      bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
+      unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+      unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
+      unsigned LO16TF = TF | ARMII::MO_LO16;
+      unsigned HI16TF = TF | ARMII::MO_HI16;
+      unsigned PICAddOpc = isARM
+        ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+        : ARM::tPICADD;
+      MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(LO16Opc), DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
+        .addImm(LabelId);
+
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg)
+        .addReg(DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
+        .addImm(LabelId);
+
+      MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(PICAddOpc))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg).addImm(LabelId);
+      if (isARM) {
+        AddDefaultPred(MIB3);
+        if (Opcode == ARM::MOV_ga_pcrel_ldr)
+          MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      }
+      TransferImpOps(MI, MIB1, MIB3);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVi32imm:
+    case ARM::MOVCCi32imm:
+    case ARM::t2MOVi32imm:
+    case ARM::t2MOVCCi32imm:
+      ExpandMOV32BitImm(MBB, MBBI);
+      return true;
+
+    case ARM::SUBS_PC_LR: {
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+              .addReg(ARM::LR)
+              .addOperand(MI.getOperand(0))
+              .addOperand(MI.getOperand(1))
+              .addOperand(MI.getOperand(2))
+              .addReg(ARM::CPSR, RegState::Undef);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::VLDMQIA: {
+      unsigned NewOpc = ARM::VLDMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register destination.
+      bool DstIsDead = MI.getOperand(OpIdx).isDead();
+      unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the source register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the destination operands (D subregs).
+      unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+      MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+
+      // Add an implicit def for the super-register.
+      MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VSTMQIA: {
+      unsigned NewOpc = ARM::VSTMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register source.
+      bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+      unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the destination register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the source operands (D subregs).
+      unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+      MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
+         .addReg(D1, SrcIsKill ? RegState::Kill : 0);
+
+      if (SrcIsKill)      // Add an implicit kill for the Q register.
+        MIB->addRegisterKilled(SrcReg, TRI, true);
+
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD1d64TPseudoWB_fixed:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD1d64QPseudoWB_fixed:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD3DUPd8Pseudo:
+    case ARM::VLD3DUPd16Pseudo:
+    case ARM::VLD3DUPd32Pseudo:
+    case ARM::VLD3DUPd8Pseudo_UPD:
+    case ARM::VLD3DUPd16Pseudo_UPD:
+    case ARM::VLD3DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+      ExpandVLD(MBBI);
+      return true;
+
+    case ARM::VST2q8Pseudo:
+    case ARM::VST2q16Pseudo:
+    case ARM::VST2q32Pseudo:
+    case ARM::VST2q8PseudoWB_fixed:
+    case ARM::VST2q16PseudoWB_fixed:
+    case ARM::VST2q32PseudoWB_fixed:
+    case ARM::VST2q8PseudoWB_register:
+    case ARM::VST2q16PseudoWB_register:
+    case ARM::VST2q32PseudoWB_register:
+    case ARM::VST3d8Pseudo:
+    case ARM::VST3d16Pseudo:
+    case ARM::VST3d32Pseudo:
+    case ARM::VST1d64TPseudo:
+    case ARM::VST3d8Pseudo_UPD:
+    case ARM::VST3d16Pseudo_UPD:
+    case ARM::VST3d32Pseudo_UPD:
+    case ARM::VST1d64TPseudoWB_fixed:
+    case ARM::VST1d64TPseudoWB_register:
+    case ARM::VST3q8Pseudo_UPD:
+    case ARM::VST3q16Pseudo_UPD:
+    case ARM::VST3q32Pseudo_UPD:
+    case ARM::VST3q8oddPseudo:
+    case ARM::VST3q16oddPseudo:
+    case ARM::VST3q32oddPseudo:
+    case ARM::VST3q8oddPseudo_UPD:
+    case ARM::VST3q16oddPseudo_UPD:
+    case ARM::VST3q32oddPseudo_UPD:
+    case ARM::VST4d8Pseudo:
+    case ARM::VST4d16Pseudo:
+    case ARM::VST4d32Pseudo:
+    case ARM::VST1d64QPseudo:
+    case ARM::VST4d8Pseudo_UPD:
+    case ARM::VST4d16Pseudo_UPD:
+    case ARM::VST4d32Pseudo_UPD:
+    case ARM::VST1d64QPseudoWB_fixed:
+    case ARM::VST1d64QPseudoWB_register:
+    case ARM::VST4q8Pseudo_UPD:
+    case ARM::VST4q16Pseudo_UPD:
+    case ARM::VST4q32Pseudo_UPD:
+    case ARM::VST4q8oddPseudo:
+    case ARM::VST4q16oddPseudo:
+    case ARM::VST4q32oddPseudo:
+    case ARM::VST4q8oddPseudo_UPD:
+    case ARM::VST4q16oddPseudo_UPD:
+    case ARM::VST4q32oddPseudo_UPD:
+      ExpandVST(MBBI);
+      return true;
+
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD3LNd8Pseudo:
+    case ARM::VLD3LNd16Pseudo:
+    case ARM::VLD3LNd32Pseudo:
+    case ARM::VLD3LNq16Pseudo:
+    case ARM::VLD3LNq32Pseudo:
+    case ARM::VLD3LNd8Pseudo_UPD:
+    case ARM::VLD3LNd16Pseudo_UPD:
+    case ARM::VLD3LNd32Pseudo_UPD:
+    case ARM::VLD3LNq16Pseudo_UPD:
+    case ARM::VLD3LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+    case ARM::VST1LNq8Pseudo:
+    case ARM::VST1LNq16Pseudo:
+    case ARM::VST1LNq32Pseudo:
+    case ARM::VST1LNq8Pseudo_UPD:
+    case ARM::VST1LNq16Pseudo_UPD:
+    case ARM::VST1LNq32Pseudo_UPD:
+    case ARM::VST2LNd8Pseudo:
+    case ARM::VST2LNd16Pseudo:
+    case ARM::VST2LNd32Pseudo:
+    case ARM::VST2LNq16Pseudo:
+    case ARM::VST2LNq32Pseudo:
+    case ARM::VST2LNd8Pseudo_UPD:
+    case ARM::VST2LNd16Pseudo_UPD:
+    case ARM::VST2LNd32Pseudo_UPD:
+    case ARM::VST2LNq16Pseudo_UPD:
+    case ARM::VST2LNq32Pseudo_UPD:
+    case ARM::VST3LNd8Pseudo:
+    case ARM::VST3LNd16Pseudo:
+    case ARM::VST3LNd32Pseudo:
+    case ARM::VST3LNq16Pseudo:
+    case ARM::VST3LNq32Pseudo:
+    case ARM::VST3LNd8Pseudo_UPD:
+    case ARM::VST3LNd16Pseudo_UPD:
+    case ARM::VST3LNd32Pseudo_UPD:
+    case ARM::VST3LNq16Pseudo_UPD:
+    case ARM::VST3LNq32Pseudo_UPD:
+    case ARM::VST4LNd8Pseudo:
+    case ARM::VST4LNd16Pseudo:
+    case ARM::VST4LNd32Pseudo:
+    case ARM::VST4LNq16Pseudo:
+    case ARM::VST4LNq32Pseudo:
+    case ARM::VST4LNd8Pseudo_UPD:
+    case ARM::VST4LNd16Pseudo_UPD:
+    case ARM::VST4LNd32Pseudo_UPD:
+    case ARM::VST4LNq16Pseudo_UPD:
+    case ARM::VST4LNq32Pseudo_UPD:
+      ExpandLaneOp(MBBI);
+      return true;
+
+    case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
+    case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
+    case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
+    case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+
+    case ARM::CMP_SWAP_8:
+      if (STI->isThumb())
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
+                              ARM::tUXTB, NextMBBI);
+      else
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
+                              ARM::UXTB, NextMBBI);
+    case ARM::CMP_SWAP_16:
+      if (STI->isThumb())
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
+                              ARM::tUXTH, NextMBBI);
+      else
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
+                              ARM::UXTH, NextMBBI);
+    case ARM::CMP_SWAP_32:
+      if (STI->isThumb())
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
+                              NextMBBI);
+      else
+        return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+
+    case ARM::CMP_SWAP_64:
+      return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
+  }
+}
+
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= ExpandMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  AFI = MF.getInfo<ARMFunctionInfo>();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI)
+    Modified |= ExpandMBB(*MFI);
+  if (VerifyARMPseudo)
+    MF.verify(this, "After expanding ARM pseudo instructions.");
+  return Modified;
+}
+
+/// createARMExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.
+FunctionPass *llvm::createARMExpandPseudoPass() {
+  return new ARMExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
new file mode 100644
index 000000000000..df4dcb375750
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -0,0 +1,3045 @@
+//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// ARMGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+  // All possible address modes, plus some.
+  typedef struct Address {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int Offset;
+
+    // Innocuous defaults for our address.
+    Address()
+     : BaseType(RegBase), Offset(0) {
+       Base.Reg = 0;
+     }
+  } Address;
+
+class ARMFastISel final : public FastISel {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+  Module &M;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  ARMFunctionInfo *AFI;
+
+  // Convenience variables to avoid some queries.
+  bool isThumb2;
+  LLVMContext *Context;
+
+  public:
+    explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo)
+        : FastISel(funcInfo, libInfo),
+          Subtarget(
+              &static_cast<const ARMSubtarget &>(funcInfo.MF->getSubtarget())),
+          M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+          TM(funcInfo.MF->getTarget()), TII(*Subtarget->getInstrInfo()),
+          TLI(*Subtarget->getTargetLowering()) {
+      AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
+      isThumb2 = AFI->isThumbFunction();
+      Context = &funcInfo.Fn->getContext();
+    }
+
+    // Code from FastISel.cpp.
+  private:
+    unsigned fastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
+    unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned fastEmitInst_i(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            uint64_t Imm);
+
+    // Backend specific FastISel code.
+  private:
+    bool fastSelectInstruction(const Instruction *I) override;
+    unsigned fastMaterializeConstant(const Constant *C) override;
+    unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+    bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI) override;
+    bool fastLowerArguments() override;
+  private:
+  #include "ARMGenFastISel.inc"
+
+    // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectIToFP(const Instruction *I, bool isSigned);
+    bool SelectFPToI(const Instruction *I, bool isSigned);
+    bool SelectDiv(const Instruction *I, bool isSigned);
+    bool SelectRem(const Instruction *I, bool isSigned);
+    bool SelectCall(const Instruction *I, const char *IntrMemName);
+    bool SelectIntrinsicCall(const IntrinsicInst &I);
+    bool SelectSelect(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+    bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
+
+    // Utility routines.
+  private:
+    bool isPositionIndependent() const;
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt);
+    bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     unsigned Alignment = 0, bool isZExt = true,
+                     bool allocReg = true);
+    bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                      unsigned Alignment = 0);
+    bool ARMComputeAddress(const Value *Obj, Address &Addr);
+    void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3);
+    bool ARMIsMemCpySmall(uint64_t Len);
+    bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                               unsigned Alignment);
+    unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+    unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned ARMMaterializeInt(const Constant *C, MVT VT);
+    unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
+    unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
+    unsigned ARMSelectCallOp(bool UseReg);
+    unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+
+    const TargetLowering *getTargetLowering() { return &TLI; }
+
+    // Call handling routines.
+  private:
+    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
+                                  bool Return,
+                                  bool isVarArg);
+    bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool isVarArg);
+    unsigned getLibcallReg(const Twine &Name);
+    bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool isVarArg);
+    bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
+
+    // OptionalDef handling routines.
+  private:
+    bool isARMNEONPred(const MachineInstr *MI);
+    bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
+    const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
+    void AddLoadStoreOperands(MVT VT, Address &Addr,
+                              const MachineInstrBuilder &MIB,
+                              MachineMemOperand::Flags Flags, bool useAM3);
+};
+
+} // end anonymous namespace
+
+#include "ARMGenCallingConv.inc"
+
+// DefinesOptionalPredicate - This is different from DefinesPredicate in that
+// we don't care about implicit defs here, just places we'll need to add a
+// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
+bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
+  if (!MI->hasOptionalDef())
+    return false;
+
+  // Look to see if our OptionalDef is defining CPSR or CCR.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    if (MO.getReg() == ARM::CPSR)
+      *CPSR = true;
+  }
+  return true;
+}
+
+bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  // If we're a thumb2 or not NEON function we'll be handled via isPredicable.
+  if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
+       AFI->isThumb2Function())
+    return MI->isPredicable();
+
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+    if (MCID.OpInfo[i].isPredicate())
+      return true;
+
+  return false;
+}
+
+// If the machine is predicable go ahead and add the predicate operands, if
+// it needs default CC operands add those.
+// TODO: If we want to support thumb1 then we'll need to deal with optional
+// CPSR defs that need to be added before the remaining operands. See s_cc_out
+// for descriptions why.
+const MachineInstrBuilder &
+ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
+  MachineInstr *MI = &*MIB;
+
+  // Do we use a predicate? or...
+  // Are we NEON in ARM mode and have a predicate operand? If so, I know
+  // we're not predicable but add it anyways.
+  if (isARMNEONPred(MI))
+    AddDefaultPred(MIB);
+
+  // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
+  // defines CPSR. All other OptionalDefines in ARM are the CCR register.
+  bool CPSR = false;
+  if (DefinesOptionalPredicate(MI, &CPSR)) {
+    if (CPSR)
+      AddDefaultT1CC(MIB);
+    else
+      AddDefaultCC(MIB);
+  }
+  return MIB;
+}
+
+unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+                            ResultReg).addReg(Op0, Op0IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                   TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addReg(Op1, Op1IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+            .addReg(Op0, Op0IsKill * RegState::Kill)
+            .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II,
+                            ResultReg).addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
+unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::f64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VMOVSR), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::i64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VMOVRS), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+// For double width floating point we need to materialize two constants
+// (the high and the low) into integer registers then use a move to get
+// the combined constant into an FP reg.
+unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = VT == MVT::f64;
+
+  // This checks to see if we can use VFP3 instructions to materialize
+  // a constant, otherwise we have to go through the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = ARM_AM::getFP64Imm(Val);
+      Opc = ARM::FCONSTD;
+    } else {
+      Imm = ARM_AM::getFP32Imm(Val);
+      Opc = ARM::FCONSTS;
+    }
+    unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), DestReg).addImm(Imm));
+    return DestReg;
+  }
+
+  // Require VFP2 for loading fp constants.
+  if (!Subtarget->hasVFP2()) return false;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(CFP->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
+
+  // The extra reg is for addrmode5.
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+          .addConstantPoolIndex(Idx)
+          .addReg(0));
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
+
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+
+  // If we can do this in a single instruction without a constant pool entry
+  // do so now.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getZExtValue())) {
+    unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
+      &ARM::GPRRegClass;
+    unsigned ImmReg = createResultReg(RC);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ImmReg)
+                    .addImm(CI->getZExtValue()));
+    return ImmReg;
+  }
+
+  // Use MVN to emit negative constants.
+  if (VT == MVT::i32 && Subtarget->hasV6T2Ops() && CI->isNegative()) {
+    unsigned Imm = (unsigned)~(CI->getSExtValue());
+    bool UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+    if (UseImm) {
+      unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
+      const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
+                                                 &ARM::GPRRegClass;
+      unsigned ImmReg = createResultReg(RC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(Opc), ImmReg)
+                      .addImm(Imm));
+      return ImmReg;
+    }
+  }
+
+  unsigned ResultReg = 0;
+  if (Subtarget->useMovt(*FuncInfo.MF))
+    ResultReg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+  if (ResultReg)
+    return ResultReg;
+
+  // Load from constant pool.  For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = DL.getTypeAllocSize(C->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+  ResultReg = createResultReg(TLI.getRegClassFor(VT));
+  if (isThumb2)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::t2LDRpci), ResultReg)
+                      .addConstantPoolIndex(Idx));
+  else {
+    // The extra immediate is for addrmode2.
+    ResultReg = constrainOperandRegClass(TII.get(ARM::LDRcp), ResultReg, 0);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::LDRcp), ResultReg)
+                      .addConstantPoolIndex(Idx)
+                      .addImm(0));
+  }
+  return ResultReg;
+}
+
+bool ARMFastISel::isPositionIndependent() const {
+  return TLI.isPositionIndependent();
+}
+
+unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32 || GV->isThreadLocal()) return 0;
+
+  // ROPI/RWPI not currently supported.
+  if (Subtarget->isROPI() || Subtarget->isRWPI())
+    return 0;
+
+  bool IsIndirect = Subtarget->isGVIndirectSymbol(GV);
+  const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+                                           : &ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  if (!Subtarget->isTargetMachO() && IsThreadLocal) return 0;
+
+  bool IsPositionIndependent = isPositionIndependent();
+  // Use movw+movt when possible, it avoids constant pool entries.
+  // Non-darwin targets only support static movt relocations in FastISel.
+  if (Subtarget->useMovt(*FuncInfo.MF) &&
+      (Subtarget->isTargetMachO() || !IsPositionIndependent)) {
+    unsigned Opc;
+    unsigned char TF = 0;
+    if (Subtarget->isTargetMachO())
+      TF = ARMII::MO_NONLAZY;
+
+    if (IsPositionIndependent)
+      Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
+    else
+      Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
+  } else {
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = DL.getPrefTypeAlignment(GV->getType());
+    if (Align == 0) {
+      // TODO: Figure out if this is correct.
+      Align = DL.getTypeAllocSize(GV->getType());
+    }
+
+    if (Subtarget->isTargetELF() && IsPositionIndependent)
+      return ARMLowerPICELF(GV, Align, VT);
+
+    // Grab index.
+    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
+    unsigned Id = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+                                                                ARMCP::CPValue,
+                                                                PCAdj);
+    unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+    // Load value.
+    MachineInstrBuilder MIB;
+    if (isThumb2) {
+      unsigned Opc = IsPositionIndependent ? ARM::t2LDRpci_pic : ARM::t2LDRpci;
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                    DestReg).addConstantPoolIndex(Idx);
+      if (IsPositionIndependent)
+        MIB.addImm(Id);
+      AddOptionalDefs(MIB);
+    } else {
+      // The extra immediate is for addrmode2.
+      DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRcp), DestReg)
+                .addConstantPoolIndex(Idx)
+                .addImm(0);
+      AddOptionalDefs(MIB);
+
+      if (IsPositionIndependent) {
+        unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+        unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+        MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                          DbgLoc, TII.get(Opc), NewDestReg)
+                                  .addReg(DestReg)
+                                  .addImm(Id);
+        AddOptionalDefs(MIB);
+        return NewDestReg;
+      }
+    }
+  }
+
+  if (IsIndirect) {
+    MachineInstrBuilder MIB;
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb2)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::t2LDRi12), NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                    TII.get(ARM::LDRi12), NewDestReg)
+                .addReg(DestReg)
+                .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
+unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  // This will get lowered later into the correct offsets and registers
+  // via rewriteXFrameIndex.
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
+
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(SI->second)
+                            .addImm(0));
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple()) return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+// Computes the address to get to an object.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
+  // Some boilerplate from the X86 FastISel.
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+    default:
+    break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return ARMComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+          TLI.getPointerTy(DL))
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      int TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+           i != e; ++i, ++GTI) {
+        const Value *Op = *i;
+        if (StructType *STy = GTI.getStructTypeOrNull()) {
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (ARMComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
+  bool needsLowering = false;
+  switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unhandled load/store type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      if (!useAM3) {
+        // Integer loads/stores handle 12-bit offsets.
+        needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
+        // Handle negative offsets.
+        if (needsLowering && isThumb2)
+          needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 &&
+                            Addr.Offset > -256);
+      } else {
+        // ARM halfword load/stores and signed byte loads use +/-imm8 offsets.
+        needsLowering = (Addr.Offset > 255 || Addr.Offset < -255);
+      }
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      // Floating point operands handle 8-bit offsets.
+      needsLowering = ((Addr.Offset & 0xff) != Addr.Offset);
+      break;
+  }
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+                                             : &ARM::GPRRegClass;
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(Addr.Base.FI)
+                            .addImm(0));
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  // Since the offset is too large for the load/store instruction
+  // get the reg+offset into a register.
+  if (needsLowering) {
+    Addr.Base.Reg = fastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+                                 /*Op0IsKill*/false, Addr.Offset, MVT::i32);
+    Addr.Offset = 0;
+  }
+}
+
+void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
+                                       const MachineInstrBuilder &MIB,
+                                       MachineMemOperand::Flags Flags,
+                                       bool useAM3) {
+  // addrmode5 output depends on the selection dag addressing dividing the
+  // offset by 4 that it then later multiplies. Do this here as well.
+  if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64)
+    Addr.Offset /= 4;
+
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    int FI = Addr.Base.FI;
+    int Offset = Addr.Offset;
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+    MIB.addMemOperand(MMO);
+  } else {
+    // Now add the rest of the operands.
+    MIB.addReg(Addr.Base.Reg);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      int Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+  }
+  AddOptionalDefs(MIB);
+}
+
+bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              unsigned Alignment, bool isZExt, bool allocReg) {
+  unsigned Opc;
+  bool useAM3 = false;
+  bool needVMOV = false;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
+      } else {
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = ARM::t2LDRi8;
+        else
+          Opc = ARM::t2LDRi12;
+      } else {
+        Opc = ARM::LDRi12;
+      }
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        needVMOV = true;
+        VT = MVT::i32;
+        Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+        RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned loads need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+        return false;
+
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  if (allocReg)
+    ResultReg = createResultReg(RC);
+  assert (ResultReg > 255 && "Expected an allocated virtual register.");
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to an regular
+  // load.  Now we must move from the GRP to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
+  return true;
+}
+
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  const Value *SV = I->getOperand(0);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg;
+  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                               unsigned Alignment) {
+  unsigned StrOpc;
+  bool useAM3 = false;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1: {
+      unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass
+                                              : &ARM::GPRRegClass);
+      unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(Opc), Res)
+                      .addReg(SrcReg).addImm(1));
+      SrcReg = Res;
+      LLVM_FALLTHROUGH;
+    }
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRBi8;
+        else
+          StrOpc = ARM::t2STRBi12;
+      } else {
+        StrOpc = ARM::STRBi12;
+      }
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRHi8;
+        else
+          StrOpc = ARM::t2STRHi12;
+      } else {
+        StrOpc = ARM::STRH;
+        useAM3 = true;
+      }
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRi8;
+        else
+          StrOpc = ARM::t2STRi12;
+      } else {
+        StrOpc = ARM::STRi12;
+      }
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned stores need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(ARM::VMOVRS), MoveReg)
+                        .addReg(SrcReg));
+        SrcReg = MoveReg;
+        VT = MVT::i32;
+        StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12;
+      } else {
+        StrOpc = ARM::VSTRS;
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned stores need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+          return false;
+
+      StrOpc = ARM::VSTRD;
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(StrOpc))
+                            .addReg(SrcReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
+  return true;
+}
+
+bool ARMFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  const Value *PtrV = I->getOperand(1);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0) return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // Needs two compares...
+    case CmpInst::FCMP_ONE:
+    case CmpInst::FCMP_UEQ:
+    default:
+      // AL is our "false" for now. The other two need more compares.
+      return ARMCC::AL;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::FCMP_OEQ:
+      return ARMCC::EQ;
+    case CmpInst::ICMP_SGT:
+    case CmpInst::FCMP_OGT:
+      return ARMCC::GT;
+    case CmpInst::ICMP_SGE:
+    case CmpInst::FCMP_OGE:
+      return ARMCC::GE;
+    case CmpInst::ICMP_UGT:
+    case CmpInst::FCMP_UGT:
+      return ARMCC::HI;
+    case CmpInst::FCMP_OLT:
+      return ARMCC::MI;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::FCMP_OLE:
+      return ARMCC::LS;
+    case CmpInst::FCMP_ORD:
+      return ARMCC::VC;
+    case CmpInst::FCMP_UNO:
+      return ARMCC::VS;
+    case CmpInst::FCMP_UGE:
+      return ARMCC::PL;
+    case CmpInst::ICMP_SLT:
+    case CmpInst::FCMP_ULT:
+      return ARMCC::LT;
+    case CmpInst::ICMP_SLE:
+    case CmpInst::FCMP_ULE:
+      return ARMCC::LE;
+    case CmpInst::FCMP_UNE:
+    case CmpInst::ICMP_NE:
+      return ARMCC::NE;
+    case CmpInst::ICMP_UGE:
+      return ARMCC::HS;
+    case CmpInst::ICMP_ULT:
+      return ARMCC::LO;
+  }
+}
+
+bool ARMFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Simple branch support.
+
+  // If we can, avoid recomputing the compare - redoing it could lead to wonky
+  // behavior.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+
+      // Get the compare predicate.
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      ARMCC::CondCodes ARMPred = getComparePred(Predicate);
+
+      // We may not handle every CC for now.
+      if (ARMPred == ARMCC::AL) return false;
+
+      // Emit the compare.
+      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+      unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+      unsigned OpReg = getRegForValue(TI->getOperand(0));
+      OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(TstOpc))
+                      .addReg(OpReg).addImm(1));
+
+      unsigned CCMode = ARMCC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CCMode = ARMCC::EQ;
+      }
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    fastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We musn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+          .addReg(CmpReg)
+          .addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
+
+  unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
+                  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
+}
+
+bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0) return false;
+
+  unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc)).addReg(AddrReg));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (const BasicBlock *SuccBB : IB->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]);
+
+  return true;
+}
+
+bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                             bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(DL, Ty, true);
+  if (!SrcEVT.isSimple()) return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
+        SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
+      // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
+      // then a cmn, because there is no way to represent 2147483648 as a
+      // signed 32-bit int.
+      if (Imm < 0 && Imm != (int)0x80000000) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+        (ARM_AM::getSOImmVal(Imm) != -1);
+    }
+  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+      if (ConstFP->isZero() && !ConstFP->isNegative())
+        UseImm = true;
+  }
+
+  unsigned CmpOpc;
+  bool isICmp = true;
+  bool needsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    // TODO: Verify compares.
+    case MVT::f32:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      break;
+    case MVT::f64:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      needsExt = true;
+    // Intentional fall-through.
+    case MVT::i32:
+      if (isThumb2) {
+        if (!UseImm)
+          CmpOpc = ARM::t2CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri;
+      } else {
+        if (!UseImm)
+          CmpOpc = ARM::CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri;
+      }
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(Src1Value);
+  if (SrcReg1 == 0) return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(Src2Value);
+    if (SrcReg2 == 0) return false;
+  }
+
+  // We have i1, i8, or i16, we need to either zero extend or sign extend.
+  if (needsExt) {
+    SrcReg1 = ARMEmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+    if (SrcReg1 == 0) return false;
+    if (!UseImm) {
+      SrcReg2 = ARMEmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+      if (SrcReg2 == 0) return false;
+    }
+  }
+
+  const MCInstrDesc &II = TII.get(CmpOpc);
+  SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
+  if (!UseImm) {
+    SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+                    .addReg(SrcReg1).addReg(SrcReg2));
+  } else {
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+      .addReg(SrcReg1);
+
+    // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
+    if (isICmp)
+      MIB.addImm(Imm);
+    AddOptionalDefs(MIB);
+  }
+
+  // For floating point we need to move the result to a comparison register
+  // that we can then use for branches.
+  if (Ty->isFloatTy() || Ty->isDoubleTy())
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                            TII.get(ARM::FMSTAT)));
+  return true;
+}
+
+bool ARMFastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // Get the compare predicate.
+  ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+  // We may not handle every CC for now.
+  if (ARMPred == ARMCC::AL) return false;
+
+  // Emit the compare.
+  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  // Now set a register based on the comparison. Explicitly set the predicates
+  // here.
+  unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+  const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
+                                           : &ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+  Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
+  unsigned ZeroReg = fastMaterializeConstant(Zero);
+  // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg)
+          .addReg(ZeroReg).addImm(1)
+          .addImm(ARMPred).addReg(ARM::CPSR);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPExt(const Instruction *I) {
+  // Make sure we have VFP and that we're extending float to double.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() ||
+      !V->getType()->isFloatTy()) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::DPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VCVTDS), Result)
+                  .addReg(Op));
+  updateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
+  // Make sure we have VFP and that we're truncating double to float.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!(I->getType()->isFloatTy() &&
+        V->getType()->isDoubleTy())) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::SPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(ARM::VCVTSD), Result)
+                  .addReg(Op));
+  updateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, DstVT))
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0) return false;
+
+  // Handle sign-extension.
+  if (SrcVT == MVT::i16 || SrcVT == MVT::i8) {
+    SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32,
+                                       /*isZExt*/!isSigned);
+    if (SrcReg == 0) return false;
+  }
+
+  // The conversion routine works on fp-reg to fp-reg and the operand above
+  // was an integer, move it to the fp registers if possible.
+  unsigned FP = ARMMoveToFPReg(MVT::f32, SrcReg);
+  if (FP == 0) return false;
+
+  unsigned Opc;
+  if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
+  else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+  else return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg).addReg(FP));
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *RetTy = I->getType();
+  if (!isTypeLegal(RetTy, DstVT))
+    return false;
+
+  unsigned Op = getRegForValue(I->getOperand(0));
+  if (Op == 0) return false;
+
+  unsigned Opc;
+  Type *OpTy = I->getOperand(0)->getType();
+  if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
+  else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+  else return false;
+
+  // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg).addReg(Op));
+
+  // This result needs to be in an integer register, but the conversion only
+  // takes place in fp-regs.
+  unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg);
+  if (IntReg == 0) return false;
+
+  updateValueMap(I, IntReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSelect(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // Things need to be register sized for register moves.
+  if (VT != MVT::i32) return false;
+
+  unsigned CondReg = getRegForValue(I->getOperand(0));
+  if (CondReg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+
+  // Check to see if we can use an immediate in the conditional move.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
+    assert (VT == MVT::i32 && "Expecting an i32.");
+    Imm = (int)ConstInt->getValue().getZExtValue();
+    if (Imm < 0) {
+      isNegativeImm = true;
+      Imm = ~Imm;
+    }
+    UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+  }
+
+  unsigned Op2Reg = 0;
+  if (!UseImm) {
+    Op2Reg = getRegForValue(I->getOperand(2));
+    if (Op2Reg == 0) return false;
+  }
+
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CondReg = constrainOperandRegClass(TII.get(TstOpc), CondReg, 0);
+  AddOptionalDefs(
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TstOpc))
+          .addReg(CondReg)
+          .addImm(1));
+
+  unsigned MovCCOpc;
+  const TargetRegisterClass *RC;
+  if (!UseImm) {
+    RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+    MovCCOpc = isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr;
+  } else {
+    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
+    if (!isNegativeImm)
+      MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+    else
+      MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
+  }
+  unsigned ResultReg = createResultReg(RC);
+  if (!UseImm) {
+    Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+            ResultReg)
+        .addReg(Op2Reg)
+        .addReg(Op1Reg)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  } else {
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc),
+            ResultReg)
+        .addReg(Op1Reg)
+        .addImm(Imm)
+        .addImm(ARMCC::EQ)
+        .addReg(ARM::CPSR);
+  }
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // If we have integer div support we should have selected this automagically.
+  // In case we have a real miss go ahead and return false and we'll pick
+  // it up later.
+  if (Subtarget->hasDivide()) return false;
+
+  // Otherwise emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SDIV_I8 : RTLIB::UDIV_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SDIV_I16 : RTLIB::UDIV_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SDIV_I32 : RTLIB::UDIV_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SDIV_I64 : RTLIB::UDIV_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SDIV_I128 : RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // Many ABIs do not provide a libcall for standalone remainder, so we need to
+  // use divrem (see the RTABI 4.3.1). Since FastISel can't handle non-double
+  // multi-reg returns, we'll have to bail out.
+  if (!TLI.hasStandaloneRem(VT)) {
+    return false;
+  }
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SREM_I16 : RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SREM_I32 : RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SREM_I64 : RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SREM_I128 : RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = isThumb2 ? ARM::t2ADDrr : ARM::ADDrr;
+      break;
+    case ISD::OR:
+      Opc = isThumb2 ? ARM::t2ORRrr : ARM::ORRrr;
+      break;
+    case ISD::SUB:
+      Opc = isThumb2 ? ARM::t2SUBrr : ARM::SUBrr;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // TODO: Often the 2nd operand is an immediate, which can be encoded directly
+  // in the instruction, rather then materializing the value in a register.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+  SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(SrcReg1).addReg(SrcReg2));
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT FPVT = TLI.getValueType(DL, I->getType(), true);
+  if (!FPVT.isSimple()) return false;
+  MVT VT = FPVT.getSimpleVT();
+
+  // FIXME: Support vector types where possible.
+  if (VT.isVector())
+    return false;
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  Type *Ty = I->getType();
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned Opc;
+  bool is64bit = VT == MVT::f64 || VT == MVT::i64;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::FADD:
+      Opc = is64bit ? ARM::VADDD : ARM::VADDS;
+      break;
+    case ISD::FSUB:
+      Opc = is64bit ? ARM::VSUBD : ARM::VSUBS;
+      break;
+    case ISD::FMUL:
+      Opc = is64bit ? ARM::VMULD : ARM::VMULS;
+      break;
+  }
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Op1).addReg(Op2));
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Call Handling Code
+
+// This is largely taken directly from CCAssignFnForNode
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool Return,
+                                           bool isVarArg) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    LLVM_FALLTHROUGH;
+  case CallingConv::C:
+  case CallingConv::CXX_FAST_TLS:
+    // Use target triple & subtarget features to do actual dispatch.
+    if (Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() &&
+          TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
+        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+      else
+        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    } else {
+      return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+    }
+  case CallingConv::ARM_AAPCS_VFP:
+  case CallingConv::Swift:
+    if (!isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+    // Fall through to soft float variant, variadic functions don't
+    // use hard floating point ABI.
+    LLVM_FALLTHROUGH;
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::GHC:
+    if (Return)
+      llvm_unreachable("Can't return in GHC call convention");
+    else
+      return CC_ARM_APCS_GHC;
+  }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool isVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+                             CCAssignFnForCall(CC, false, isVarArg));
+
+  // Check that we can handle all of the arguments. If we can't, then bail out
+  // now before we add code to the MBB.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // We don't handle NEON/vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      continue;
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64 ||
+          // TODO: Only handle register args for now.
+          !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
+        return false;
+    } else {
+      switch (ArgVT.SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+        break;
+      case MVT::f32:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      case MVT::f64:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      }
+    }
+  }
+
+  // At the point, we are able to handle the call's arguments in fast isel.
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(AdjStackDown))
+                  .addImm(NumBytes));
+
+  // Process the args.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    const Value *ArgVal = Args[VA.getValNo()];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
+           "We don't handle NEON/vector parameters yet.");
+
+    // Handle arg promotion, etc.
+    switch (VA.getLocInfo()) {
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
+        assert (Arg != 0 && "Failed to emit a sext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::AExt:
+        // Intentional fall-through.  Handle AExt and ZExt.
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
+        assert (Arg != 0 && "Failed to emit a zext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+                                 /*TODO: Kill=*/false);
+        assert(BC != 0 && "Failed to emit a bitcast!");
+        Arg = BC;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      default: llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      assert(VA.getLocVT() == MVT::f64 &&
+             "Custom lowering for v2f64 args not available");
+
+      CCValAssign &NextVA = ArgLocs[++i];
+
+      assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+             "We only handle register args!");
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(ARM::VMOVRRD), VA.getLocReg())
+                      .addReg(NextVA.getLocReg(), RegState::Define)
+                      .addReg(Arg));
+      RegArgs.push_back(VA.getLocReg());
+      RegArgs.push_back(NextVA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+      // Need to store on the stack.
+
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
+      Address Addr;
+      Addr.BaseType = Address::RegBase;
+      Addr.Base.Reg = ARM::SP;
+      Addr.Offset = VA.getLocMemOffset();
+
+      bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet;
+      assert(EmitRet && "Could not emit a store for argument!");
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool isVarArg) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                          TII.get(AdjStackUp))
+                  .addImm(NumBytes).addImm(0));
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+
+    // Copy all of the result registers out of their specified physreg.
+    if (RVLocs.size() == 2 && RetVT == MVT::f64) {
+      // For this move we copy into two registers and then move into the
+      // double fp reg we want.
+      MVT DestVT = RVLocs[0].getValVT();
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
+      unsigned ResultReg = createResultReg(DstRC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(ARM::VMOVDRR), ResultReg)
+                      .addReg(RVLocs[0].getLocReg())
+                      .addReg(RVLocs[1].getLocReg()));
+
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[1].getLocReg());
+
+      // Finally update the result.
+      updateValueMap(I, ResultReg);
+    } else {
+      assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
+      MVT CopyVT = RVLocs[0].getValVT();
+
+      // Special handling for extended integers.
+      if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+        CopyVT = MVT::i32;
+
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+
+      unsigned ResultReg = createResultReg(DstRC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY),
+              ResultReg).addReg(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+
+      // Finally update the result.
+      updateValueMap(I, ResultReg);
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (TLI.supportSwiftError() &&
+      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return false;
+
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+    CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
+                                                 F.isVarArg()));
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT RVEVT = TLI.getValueType(DL, RV->getType());
+    if (!RVEVT.isSimple()) return false;
+    MVT RVVT = RVEVT.getSimpleVT();
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      assert(DestVT == MVT::i32 && "ARM should always ext to i32");
+
+      // Perform extension if flagged as either zext or sext.  Otherwise, do
+      // nothing.
+      if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+        SrcReg = ARMEmitIntExt(RVVT, SrcReg, DestVT, Outs[0].Flags.isZExt());
+        if (SrcReg == 0) return false;
+      }
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET;
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(RetOpc));
+  AddOptionalDefs(MIB);
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
+  if (UseReg)
+    return isThumb2 ? ARM::tBLXr : ARM::BLX;
+  else
+    return isThumb2 ? ARM::tBL : ARM::BL;
+}
+
+unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  // Manually compute the global's type to avoid building it when unnecessary.
+  Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
+  EVT LCREVT = TLI.getValueType(DL, GVTy);
+  if (!LCREVT.isSimple()) return 0;
+
+  GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
+                                       GlobalValue::ExternalLinkage, nullptr,
+                                       Name);
+  assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
+  return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
+}
+
+// A quick function that will emit a call for a named libcall in F with the
+// vector of passed arguments for the Instruction in I. We can assume that we
+// can emit a call for any libcall we can produce. This is an abridged version
+// of the full call infrastructure since we won't need to worry about things
+// like computed function pointers or strange arguments at call sites.
+// TODO: Try to unify this and the normal call bits for ARM, then try to unify
+// with X86.
+bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
+  CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(I->getNumOperands());
+  ArgRegs.reserve(I->getNumOperands());
+  ArgVTs.reserve(I->getNumOperands());
+  ArgFlags.reserve(I->getNumOperands());
+  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+    Value *Op = I->getOperand(i);
+    unsigned Arg = getRegForValue(Op);
+    if (Arg == 0) return false;
+
+    Type *ArgTy = Op->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT)) return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(Op);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, false))
+    return false;
+
+  unsigned CalleeReg = 0;
+  if (Subtarget->genLongCalls()) {
+    CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(Subtarget->genLongCalls());
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DbgLoc, TII.get(CallOpc));
+  // BL / BLX don't take a predicate, but tBL / tBLX do.
+  if (isThumb2)
+    AddDefaultPred(MIB);
+  if (Subtarget->genLongCalls())
+    MIB.addReg(CalleeReg);
+  else
+    MIB.addExternalSymbol(TLI.getLibcallName(Call));
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::SelectCall(const Instruction *I,
+                             const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee)) return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall()) return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // TODO: Avoid some calling conventions?
+
+  FunctionType *FTy = CS.getFunctionType();
+  bool isVarArg = FTy->isVarArg();
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8  && RetVT != MVT::i1)
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
+      RetVT != MVT::i16 && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  unsigned arg_size = CS.arg_size();
+  Args.reserve(arg_size);
+  ArgRegs.reserve(arg_size);
+  ArgVTs.reserve(arg_size);
+  ArgFlags.reserve(arg_size);
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e-i <= 2)
+      break;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
+        CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = (*i)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
+        ArgVT != MVT::i1)
+      return false;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, isVarArg))
+    return false;
+
+  bool UseReg = false;
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV || Subtarget->genLongCalls()) UseReg = true;
+
+  unsigned CalleeReg = 0;
+  if (UseReg) {
+    if (IntrMemName)
+      CalleeReg = getLibcallReg(IntrMemName);
+    else
+      CalleeReg = getRegForValue(Callee);
+
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(UseReg);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DbgLoc, TII.get(CallOpc));
+
+  // ARM calls don't take a predicate, but tBL / tBLX do.
+  if(isThumb2)
+    AddDefaultPred(MIB);
+  if (UseReg)
+    MIB.addReg(CalleeReg);
+  else if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else
+    MIB.addExternalSymbol(IntrMemName, 0);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
+    return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) {
+  return Len <= 16;
+}
+
+bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
+                                        uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!ARMIsMemCpySmall(Len))
+    return false;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 4) {
+      if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        assert (Len == 1 && "Expected a length of 1!");
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = ARMEmitLoad(VT, ResultReg, Src);
+    assert (RV == true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, ResultReg, Dest);
+    assert (RV == true && "Should be able to handle this store.");
+    (void)RV;
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    Dest.Offset += Size;
+    Src.Offset += Size;
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+    MFI.setFrameAddressIsTaken(true);
+
+    unsigned LdrOpc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
+                                             : &ARM::GPRRegClass;
+
+    const ARMBaseRegisterInfo *RegInfo =
+        static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = FramePtr;
+
+    // Recursively load frame address
+    // ldr r0 [fp]
+    // ldr r0 [r0]
+    // ldr r0 [r0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = createResultReg(RC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                              TII.get(LdrOpc), DestReg)
+                      .addReg(SrcReg).addImm(0));
+      SrcReg = DestReg;
+    }
+    updateValueMap(&I, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      if (ARMIsMemCpySmall(Len)) {
+        Address Dest, Src;
+        if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+            !ARMComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        unsigned Alignment = MTI.getAlignment();
+        if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(
+      Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP));
+    return true;
+  }
+  }
+}
+
+bool ARMFastISel::SelectTrunc(const Instruction *I) {
+  // The high bits for a type smaller than the register size are assumed to be
+  // undefined.
+  Value *Op = I->getOperand(0);
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(DL, Op->getType(), true);
+  DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg) return false;
+
+  // Because the high bits are undefined, a truncate doesn't generate
+  // any code.
+  updateValueMap(I, SrcReg);
+  return true;
+}
+
+unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                    bool isZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return 0;
+  if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1)
+    return 0;
+
+  // Table of which combinations can be emitted as a single instruction,
+  // and which will require two.
+  static const uint8_t isSingleInstrTbl[3][2][2][2] = {
+    //            ARM                     Thumb
+    //           !hasV6Ops  hasV6Ops     !hasV6Ops  hasV6Ops
+    //    ext:     s  z      s  z          s  z      s  z
+    /*  1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } },
+    /*  8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } },
+    /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }
+  };
+
+  // Target registers for:
+  //  - For ARM can never be PC.
+  //  - For 16-bit Thumb are restricted to lower 8 registers.
+  //  - For 32-bit Thumb are restricted to non-SP and non-PC.
+  static const TargetRegisterClass *RCTbl[2][2] = {
+    // Instructions: Two                     Single
+    /* ARM      */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass },
+    /* Thumb    */ { &ARM::tGPRRegClass,    &ARM::rGPRRegClass    }
+  };
+
+  // Table governing the instruction(s) to be emitted.
+  static const struct InstructionTable {
+    uint32_t Opc   : 16;
+    uint32_t hasS  :  1; // Some instructions have an S bit, always set it to 0.
+    uint32_t Shift :  7; // For shift operand addressing mode, used by MOVsi.
+    uint32_t Imm   :  8; // All instructions have either a shift or a mask.
+  } IT[2][2][3][2] = {
+    { // Two instructions (first is left shift, second is in this table).
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  31 },
+        /*  1 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  31 } },
+        /*  8 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  24 },
+        /*  8 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  24 } },
+        /* 16 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  16 },
+        /* 16 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  16 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  31 },
+        /*  1 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  31 } },
+        /*  8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  24 },
+        /*  8 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  24 } },
+        /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  16 },
+        /* 16 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  16 } }
+      }
+    },
+    { // Single instruction.
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::SXTB   , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::SXTH   , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::UXTH   , 0, ARM_AM::no_shift,   0 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::t2UXTH , 0, ARM_AM::no_shift,   0 } }
+      }
+    }
+  };
+
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  unsigned DestBits = DestVT.getSizeInBits();
+  (void) DestBits;
+  assert((SrcBits < DestBits) && "can only extend to larger types");
+  assert((DestBits == 32 || DestBits == 16 || DestBits == 8) &&
+         "other sizes unimplemented");
+  assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) &&
+         "other sizes unimplemented");
+
+  bool hasV6Ops = Subtarget->hasV6Ops();
+  unsigned Bitness = SrcBits / 8;  // {1,8,16}=>{0,1,2}
+  assert((Bitness < 3) && "sanity-check table bounds");
+
+  bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
+  const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
+  const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt];
+  unsigned Opc = ITP->Opc;
+  assert(ARM::KILL != Opc && "Invalid table entry");
+  unsigned hasS = ITP->hasS;
+  ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift;
+  assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) &&
+         "only MOVsi has shift operand addressing mode");
+  unsigned Imm = ITP->Imm;
+
+  // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
+  bool setsCPSR = &ARM::tGPRRegClass == RC;
+  unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi;
+  unsigned ResultReg;
+  // MOVsi encodes shift and immediate in shift operand addressing mode.
+  // The following condition has the same value when emitting two
+  // instruction sequences: both are shifts.
+  bool ImmIsSO = (Shift != ARM_AM::no_shift);
+
+  // Either one or two instructions are emitted.
+  // They're always of the form:
+  //   dst = in OP imm
+  // CPSR is set only by 16-bit Thumb instructions.
+  // Predicate, if any, is AL.
+  // S bit, if available, is always 0.
+  // When two are emitted the first's result will feed as the second's input,
+  // that value is then dead.
+  unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
+  for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
+    ResultReg = createResultReg(RC);
+    bool isLsl = (0 == Instr) && !isSingleInstr;
+    unsigned Opcode = isLsl ? LSLOpc : Opc;
+    ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift;
+    unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
+    bool isKill = 1 == Instr;
+    MachineInstrBuilder MIB = BuildMI(
+        *FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opcode), ResultReg);
+    if (setsCPSR)
+      MIB.addReg(ARM::CPSR, RegState::Define);
+    SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
+    AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+    if (hasS)
+      AddDefaultCC(MIB);
+    // Second instruction consumes the first's result.
+    SrcReg = ResultReg;
+  }
+
+  return ResultReg;
+}
+
+bool ARMFastISel::SelectIntExt(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple()) return false;
+  if (!DestEVT.isSimple()) return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+  if (ResultReg == 0) return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectShift(const Instruction *I,
+                              ARM_AM::ShiftOpc ShiftTy) {
+  // We handle thumb2 mode by target independent selector
+  // or SelectionDAG ISel.
+  if (isThumb2)
+    return false;
+
+  // Only handle i32 now.
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+  if (DestVT != MVT::i32)
+    return false;
+
+  unsigned Opc = ARM::MOVsr;
+  unsigned ShiftImm;
+  Value *Src2Value = I->getOperand(1);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Src2Value)) {
+    ShiftImm = CI->getZExtValue();
+
+    // Fall back to selection DAG isel if the shift amount
+    // is zero or greater than the width of the value type.
+    if (ShiftImm == 0 || ShiftImm >=32)
+      return false;
+
+    Opc = ARM::MOVsi;
+  }
+
+  Value *Src1Value = I->getOperand(0);
+  unsigned Reg1 = getRegForValue(Src1Value);
+  if (Reg1 == 0) return false;
+
+  unsigned Reg2 = 0;
+  if (Opc == ARM::MOVsr) {
+    Reg2 = getRegForValue(Src2Value);
+    if (Reg2 == 0) return false;
+  }
+
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  if(ResultReg == 0) return false;
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(Opc), ResultReg)
+                            .addReg(Reg1);
+
+  if (Opc == ARM::MOVsi)
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+  else if (Opc == ARM::MOVsr) {
+    MIB.addReg(Reg2);
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+  }
+
+  AddOptionalDefs(MIB);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// TODO: SoftFP support.
+bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+      return SelectCmp(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*isSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*isSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*isSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*isSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::FAdd:
+      return SelectBinaryFPOp(I, ISD::FADD);
+    case Instruction::FSub:
+      return SelectBinaryFPOp(I, ISD::FSUB);
+    case Instruction::FMul:
+      return SelectBinaryFPOp(I, ISD::FMUL);
+    case Instruction::SDiv:
+      return SelectDiv(I, /*isSigned*/ true);
+    case Instruction::UDiv:
+      return SelectDiv(I, /*isSigned*/ false);
+    case Instruction::SRem:
+      return SelectRem(I, /*isSigned*/ true);
+    case Instruction::URem:
+      return SelectRem(I, /*isSigned*/ false);
+    case Instruction::Call:
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+        return SelectIntrinsicCall(*II);
+      return SelectCall(I);
+    case Instruction::Select:
+      return SelectSelect(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    case Instruction::Shl:
+      return SelectShift(I, ARM_AM::lsl);
+    case Instruction::LShr:
+      return SelectShift(I, ARM_AM::lsr);
+    case Instruction::AShr:
+      return SelectShift(I, ARM_AM::asr);
+    default: break;
+  }
+  return false;
+}
+
+namespace {
+// This table describes sign- and zero-extend instructions which can be
+// folded into a preceding load. All of these extends have an immediate
+// (sometimes a mask and sometimes a shift) that's applied after
+// extension.
+const struct FoldableLoadExtendsStruct {
+  uint16_t Opc[2];  // ARM, Thumb.
+  uint8_t ExpectedImm;
+  uint8_t isZExt     : 1;
+  uint8_t ExpectedVT : 7;
+} FoldableLoadExtends[] = {
+  { { ARM::SXTH,  ARM::t2SXTH  },   0, 0, MVT::i16 },
+  { { ARM::UXTH,  ARM::t2UXTH  },   0, 1, MVT::i16 },
+  { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8  },
+  { { ARM::SXTB,  ARM::t2SXTB  },   0, 0, MVT::i8  },
+  { { ARM::UXTB,  ARM::t2UXTB  },   0, 1, MVT::i8  }
+};
+}
+
+/// \brief The specified machine instr operand is a vreg, and that
+/// vreg is being provided by the specified load instruction.  If possible,
+/// try to fold the load as an operand to the instruction, returning true if
+/// successful.
+bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  // ldrb r1, [r0]       ldrb r1, [r0]
+  // uxtb r2, r1     =>
+  // mov  r3, r2         mov  r3, r1
+  if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm())
+    return false;
+  const uint64_t Imm = MI->getOperand(2).getImm();
+
+  bool Found = false;
+  bool isZExt;
+  for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
+       i != e; ++i) {
+    if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
+        (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
+        MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+      Found = true;
+      isZExt = FoldableLoadExtends[i].isZExt;
+    }
+  }
+  if (!Found) return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
+    return false;
+  MI->eraseFromParent();
+  return true;
+}
+
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
+                                     unsigned Align, MVT VT) {
+  bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+  LLVMContext *Context = &MF->getFunction()->getContext();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+      GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
+      UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+      /*AddCurrentAddress=*/UseGOT_PREL);
+
+  unsigned ConstAlign =
+      MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+  unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
+
+  unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
+  unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg)
+          .addConstantPoolIndex(Idx);
+  if (Opc == ARM::LDRcp)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+
+  // Fix the address by adding pc.
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR
+                                                          : ARM::PICADD;
+  DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0);
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+            .addReg(TempReg)
+            .addImm(ARMPCLabelIndex);
+  if (!Subtarget->isThumb())
+    AddDefaultPred(MIB);
+
+  if (UseGOT_PREL && Subtarget->isThumb()) {
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(ARM::t2LDRi12), NewDestReg)
+              .addReg(DestReg)
+              .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+  return DestReg;
+}
+
+bool ARMFastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  switch (CC) {
+  default:
+    return false;
+  case CallingConv::Fast:
+  case CallingConv::C:
+  case CallingConv::ARM_AAPCS_VFP:
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+  case CallingConv::Swift:
+    break;
+  }
+
+  // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments
+  // which are passed in r0 - r3.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (Idx > 4)
+      return false;
+
+    if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = I->getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(DL, ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      break;
+    default:
+      return false;
+    }
+  }
+
+
+  static const MCPhysReg GPRArgRegs[] = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3
+  };
+
+  const TargetRegisterClass *RC = &ARM::rGPRRegClass;
+  Idx = 0;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    unsigned SrcReg = GPRArgRegs[Idx];
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(DstReg, getKillRegState(true));
+    updateValueMap(&*I, ResultReg);
+  }
+
+  return true;
+}
+
+namespace llvm {
+  FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    if (funcInfo.MF->getSubtarget<ARMSubtarget>().useFastISel())
+      return new ARMFastISel(funcInfo, libInfo);
+
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 000000000000..0c910ab6130f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,97 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMFEATURES_H
+#define LLVM_LIB_TARGET_ARM_ARMFEATURES_H
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+namespace llvm {
+
+template<typename InstrType> // could be MachineInstr or MCInst
+bool IsCPSRDead(InstrType *Instr);
+
+template<typename InstrType> // could be MachineInstr or MCInst
+inline bool isV8EligibleForIT(InstrType *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case ARM::tADC:
+  case ARM::tADDi3:
+  case ARM::tADDi8:
+  case ARM::tADDrr:
+  case ARM::tAND:
+  case ARM::tASRri:
+  case ARM::tASRrr:
+  case ARM::tBIC:
+  case ARM::tEOR:
+  case ARM::tLSLri:
+  case ARM::tLSLrr:
+  case ARM::tLSRri:
+  case ARM::tLSRrr:
+  case ARM::tMOVi8:
+  case ARM::tMUL:
+  case ARM::tMVN:
+  case ARM::tORR:
+  case ARM::tROR:
+  case ARM::tRSB:
+  case ARM::tSBC:
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+  case ARM::tSUBrr:
+    // Outside of an IT block, these set CPSR.
+    return IsCPSRDead(Instr);
+  case ARM::tADDrSPi:
+  case ARM::tCMNz:
+  case ARM::tCMPi8:
+  case ARM::tCMPr:
+  case ARM::tLDRBi:
+  case ARM::tLDRBr:
+  case ARM::tLDRHi:
+  case ARM::tLDRHr:
+  case ARM::tLDRSB:
+  case ARM::tLDRSH:
+  case ARM::tLDRi:
+  case ARM::tLDRr:
+  case ARM::tLDRspi:
+  case ARM::tSTRBi:
+  case ARM::tSTRBr:
+  case ARM::tSTRHi:
+  case ARM::tSTRHr:
+  case ARM::tSTRi:
+  case ARM::tSTRr:
+  case ARM::tSTRspi:
+  case ARM::tTST:
+    return true;
+// there are some "conditionally deprecated" opcodes
+  case ARM::tADDspr:
+  case ARM::tBLXr:
+    return Instr->getOperand(2).getReg() != ARM::PC;
+  // ADD PC, SP and BLX PC were always unpredictable,
+  // now on top of it they're deprecated
+  case ARM::tADDrSP:
+  case ARM::tBX:
+    return Instr->getOperand(0).getReg() != ARM::PC;
+  case ARM::tADDhirr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(2).getReg() != ARM::PC;
+  case ARM::tCMPhir:
+  case ARM::tMOVr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(1).getReg() != ARM::PC;
+  }
+}
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
new file mode 100644
index 000000000000..c72db8aca108
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -0,0 +1,2321 @@
+//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define DEBUG_TYPE "arm-frame-lowering"
+
+using namespace llvm;
+
+static cl::opt<bool>
+SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
+                     cl::desc("Align ARM NEON spills in prolog and epilog"));
+
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs);
+
+ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+      STI(sti) {}
+
+bool ARMFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
+  // iOS always has a FP for backtracking, force other targets to keep their FP
+  // when doing FastISel. The emitted code is currently superior, and in cases
+  // like test-suite's lencod FastISel isn't quite correct when FP is eliminated.
+  return TargetFrameLowering::noFramePointerElim(MF) ||
+         MF.getSubtarget<ARMSubtarget>().useFastISel();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // ABI-required frame pointer.
+  if (MF.getTarget().Options.DisableFramePointerElim(MF))
+    return true;
+
+  // Frame pointer required for use within this function.
+  return (RegInfo->needsStackRealignment(MF) ||
+          MFI.hasVarSizedObjects() ||
+          MFI.isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned CFSize = MFI.getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  return !MFI.hasVarSizedObjects();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Unlike most targets, having a FP
+/// is not sufficient here since we still may reference some objects via SP
+/// even when FP is available in Thumb2 mode.
+bool
+ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
+}
+
+static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
+                        const MCPhysReg *CSRegs) {
+  // Integer spill area is handled with "pop".
+  if (isPopOpcode(MI.getOpcode())) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 5, e = MI.getNumOperands(); i != e; ++i)
+      if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  if ((MI.getOpcode() == ARM::LDR_POST_IMM ||
+       MI.getOpcode() == ARM::LDR_POST_REG ||
+       MI.getOpcode() == ARM::t2LDR_POST) &&
+      isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
+      MI.getOperand(1).getReg() == ARM::SP)
+    return true;
+
+  return false;
+}
+
+static void emitRegPlusImmediate(
+    bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
+    unsigned SrcReg, int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
+    ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+  if (isARM)
+    emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                            Pred, PredReg, TII, MIFlags);
+  else
+    emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
+                           Pred, PredReg, TII, MIFlags);
+}
+
+static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI, const DebugLoc &dl,
+                         const ARMBaseInstrInfo &TII, int NumBytes,
+                         unsigned MIFlags = MachineInstr::NoFlags,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0) {
+  emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
+                       MIFlags, Pred, PredReg);
+}
+
+static int sizeOfSPAdjustment(const MachineInstr &MI) {
+  int RegSize;
+  switch (MI.getOpcode()) {
+  case ARM::VSTMDDB_UPD:
+    RegSize = 8;
+    break;
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    RegSize = 4;
+    break;
+  case ARM::t2STR_PRE:
+  case ARM::STR_PRE_IMM:
+    return 4;
+  default:
+    llvm_unreachable("Unknown push or pop like instruction");
+  }
+
+  int count = 0;
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4.
+  for (int i = MI.getNumOperands() - 1; i >= 4; --i)
+    count += RegSize;
+  return count;
+}
+
+static bool WindowsRequiresStackProbe(const MachineFunction &MF,
+                                      size_t StackSizeInBytes) {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackProbeSize = (MFI.getStackProtectorIndex() > 0) ? 4080 : 4096;
+  if (F->hasFnAttribute("stack-probe-size"))
+    F->getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  return StackSizeInBytes >= StackProbeSize;
+}
+
+namespace {
+struct StackAdjustingInsts {
+  struct InstInfo {
+    MachineBasicBlock::iterator I;
+    unsigned SPAdjust;
+    bool BeforeFPSet;
+  };
+
+  SmallVector<InstInfo, 4> Insts;
+
+  void addInst(MachineBasicBlock::iterator I, unsigned SPAdjust,
+               bool BeforeFPSet = false) {
+    InstInfo Info = {I, SPAdjust, BeforeFPSet};
+    Insts.push_back(Info);
+  }
+
+  void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
+    auto Info = find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
+    assert(Info != Insts.end() && "invalid sp adjusting instruction");
+    Info->SPAdjust += ExtraBytes;
+  }
+
+  void emitDefCFAOffsets(MachineBasicBlock &MBB, const DebugLoc &dl,
+                         const ARMBaseInstrInfo &TII, bool HasFP) {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned CFAOffset = 0;
+    for (auto &Info : Insts) {
+      if (HasFP && !Info.BeforeFPSet)
+        return;
+
+      CFAOffset -= Info.SPAdjust;
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, std::next(Info.I), dl,
+              TII.get(TargetOpcode::CFI_INSTRUCTION))
+              .addCFIIndex(CFIIndex)
+              .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+};
+}
+
+/// Emit an instruction sequence that will align the address in
+/// register Reg by zero-ing out the lower bits.  For versions of the
+/// architecture that support Neon, this must be done in a single
+/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a
+/// single instruction. That function only gets called when optimizing
+/// spilling of D registers on a core with the Neon instruction set
+/// present.
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
+                                     const TargetInstrInfo &TII,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     const DebugLoc &DL, const unsigned Reg,
+                                     const unsigned Alignment,
+                                     const bool MustBeSingleInstruction) {
+  const ARMSubtarget &AST =
+      static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
+  const unsigned AlignMask = Alignment - 1;
+  const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+  assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
+  if (!AFI->isThumbFunction()) {
+    // if the BFC instruction is available, use that to zero the lower
+    // bits:
+    //   bfc Reg, #0, log2(Alignment)
+    // otherwise use BIC, if the mask to zero the required number of bits
+    // can be encoded in the bic immediate field
+    //   bic Reg, Reg, Alignment-1
+    // otherwise, emit
+    //   lsr Reg, Reg, log2(Alignment)
+    //   lsl Reg, Reg, log2(Alignment)
+    if (CanUseBFC) {
+      AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+                         .addReg(Reg, RegState::Kill)
+                         .addImm(~AlignMask));
+    } else if (AlignMask <= 255) {
+      AddDefaultCC(
+          AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+                             .addReg(Reg, RegState::Kill)
+                             .addImm(AlignMask)));
+    } else {
+      assert(!MustBeSingleInstruction &&
+             "Shouldn't call emitAligningInstructions demanding a single "
+             "instruction to be emitted for large stack alignment for a target "
+             "without BFC.");
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+    }
+  } else {
+    // Since this is only reached for Thumb-2 targets, the BFC instruction
+    // should always be available.
+    assert(CanUseBFC);
+    AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+                       .addReg(Reg, RegState::Kill)
+                       .addImm(~AlignMask));
+  }
+}
+
+void ARMFrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MCContext &Context = MMI.getContext();
+  const TargetMachine &TM = MF.getTarget();
+  const MCRegisterInfo *MRI = Context.getRegisterInfo();
+  const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned Align = STI.getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned NumBytes = MFI.getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+  
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+  int D8SpillFI = 0;
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  StackAdjustingInsts DefCFAOffsetCandidates;
+  bool HasFP = hasFP(MF);
+
+  // Allocate the vararg register save area.
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true);
+  }
+
+  if (!AFI->hasStackFrame() &&
+      (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI),
+                                     NumBytes - ArgRegsSaveSize, true);
+    }
+    DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);
+    return;
+  }
+
+  // Determine spill area sizes.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.splitFramePushPop(MF)) {
+        GPRCS2Size += 4;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      // This is a DPR. Exclude the aligned DPRCS2 spills.
+      if (Reg == ARM::D8)
+        D8SpillFI = FI;
+      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
+        DPRCSSize += 8;
+    }
+  }
+
+  // Move past area 1.
+  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+  if (GPRCS1Size > 0) {
+    GPRCS1Push = LastPush = MBBI++;
+    DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
+  unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
+  unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U;
+  unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign;
+  unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
+  int FramePtrOffsetInPush = 0;
+  if (HasFP) {
+    FramePtrOffsetInPush =
+        MFI.getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize;
+    AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  }
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  // Move past area 2.
+  if (GPRCS2Size > 0) {
+    GPRCS2Push = LastPush = MBBI++;
+    DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
+  }
+
+  // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our
+  // .cfi_offset operations will reflect that.
+  if (DPRGapSize) {
+    assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
+    if (LastPush != MBB.end() &&
+        tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, DPRGapSize))
+      DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
+    else {
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
+                   MachineInstr::FrameSetup);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize);
+    }
+  }
+
+  // Move past area 3.
+  if (DPRCSSize > 0) {
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
+      DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
+      LastPush = MBBI++;
+    }
+  }
+
+  // Move past the aligned DPRCS2 area.
+  if (AFI->getNumAlignedDPRCS2Regs() > 0) {
+    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
+    // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
+    // leaves the stack pointer pointing to the DPRCS2 area.
+    //
+    // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
+    NumBytes += MFI.getObjectOffset(D8SpillFI);
+  } else
+    NumBytes = DPRCSOffset;
+
+  if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
+    uint32_t NumWords = NumBytes >> 2;
+
+    if (NumWords < 65536)
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+                     .addImm(NumWords)
+                     .setMIFlags(MachineInstr::FrameSetup));
+    else
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
+        .addImm(NumWords)
+        .setMIFlags(MachineInstr::FrameSetup);
+
+    switch (TM.getCodeModel()) {
+    case CodeModel::Small:
+    case CodeModel::Medium:
+    case CodeModel::Default:
+    case CodeModel::Kernel:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addExternalSymbol("__chkstk")
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    case CodeModel::Large:
+    case CodeModel::JITDefault:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12)
+        .addExternalSymbol("__chkstk")
+        .setMIFlags(MachineInstr::FrameSetup);
+
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addReg(ARM::R12, RegState::Kill)
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
+                                        ARM::SP)
+                                .addReg(ARM::SP, RegState::Kill)
+                                .addReg(ARM::R4, RegState::Kill)
+                                .setMIFlags(MachineInstr::FrameSetup)));
+    NumBytes = 0;
+  }
+
+  if (NumBytes) {
+    // Adjust SP after all the callee-save spills.
+    if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
+        tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
+      DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
+    else {
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                   MachineInstr::FrameSetup);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
+    }
+
+    if (HasFP && isARM)
+      // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
+      // Note it's not safe to do this in Thumb2 mode because it would have
+      // taken two instructions:
+      // mov sp, r7
+      // sub sp, #24
+      // If an interrupt is taken between the two instructions, then sp is in
+      // an inconsistent state (pointing to the middle of callee-saved area).
+      // The interrupt handler can end up clobbering the registers.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it
+  // is in area one and the adjustment needs to take place just after
+  // that push.
+  if (HasFP) {
+    MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
+    unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
+    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
+                         dl, TII, FramePtr, ARM::SP,
+                         PushSize + FramePtrOffsetInPush,
+                         MachineInstr::FrameSetup);
+    if (FramePtrOffsetInPush + PushSize != 0) {
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true),
+          -(ArgRegsSaveSize - FramePtrOffsetInPush)));
+      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    } else {
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  // Now that the prologue's actual instructions are finalised, we can insert
+  // the necessary DWARF cf instructions to describe the situation. Start by
+  // recording where each register ended up:
+  if (GPRCS1Size > 0) {
+    MachineBasicBlock::iterator Pos = std::next(GPRCS1Push);
+    int CFIIndex;
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      switch (Reg) {
+      case ARM::R8:
+      case ARM::R9:
+      case ARM::R10:
+      case ARM::R11:
+      case ARM::R12:
+        if (STI.splitFramePushPop(MF))
+          break;
+        LLVM_FALLTHROUGH;
+      case ARM::R0:
+      case ARM::R1:
+      case ARM::R2:
+      case ARM::R3:
+      case ARM::R4:
+      case ARM::R5:
+      case ARM::R6:
+      case ARM::R7:
+      case ARM::LR:
+        CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
+        break;
+      }
+    }
+  }
+
+  if (GPRCS2Size > 0) {
+    MachineBasicBlock::iterator Pos = std::next(GPRCS2Push);
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      switch (Reg) {
+      case ARM::R8:
+      case ARM::R9:
+      case ARM::R10:
+      case ARM::R11:
+      case ARM::R12:
+        if (STI.splitFramePushPop(MF)) {
+          unsigned DwarfReg =  MRI->getDwarfRegNum(Reg, true);
+          unsigned Offset = MFI.getObjectOffset(FI);
+          unsigned CFIIndex = MF.addFrameInst(
+              MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+          BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+              .addCFIIndex(CFIIndex)
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+        break;
+      }
+    }
+  }
+
+  if (DPRCSSize > 0) {
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    MachineBasicBlock::iterator Pos = std::next(LastPush);
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
+      if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
+          (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
+        unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+        unsigned Offset = MFI.getObjectOffset(FI);
+        unsigned CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
+    }
+  }
+
+  // Now we can emit descriptions of where the canonical frame address was
+  // throughout the process. If we have a frame pointer, it takes over the job
+  // half-way through, so only the first few .cfi_def_cfa_offset instructions
+  // actually get emitted.
+  DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP);
+
+  if (STI.isTargetELF() && hasFP(MF))
+    MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
+                            AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedGapSize(DPRGapSize);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need dynamic stack realignment, do it here. Be paranoid and make
+  // sure if we also have VLAs, we have a base pointer for frame access.
+  // If aligned NEON registers were spilled, the stack has already been
+  // realigned.
+  if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
+    unsigned MaxAlign = MFI.getMaxAlignment();
+    assert(!AFI->isThumb1OnlyFunction());
+    if (!AFI->isThumbFunction()) {
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+                               false);
+    } else {
+      // We cannot use sp as source/dest register here, thus we're using r4 to
+      // perform the calculations. We're emitting the following sequence:
+      // mov r4, sp
+      // -- use emitAligningInstructions to produce best sequence to zero
+      // -- out lower bits in r4
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+                         .addReg(ARM::SP, RegState::Kill));
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+                               false);
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+                         .addReg(ARM::R4, RegState::Kill));
+    }
+
+    AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  // FIXME: Clarify FrameSetup flags here.
+  if (RegInfo->hasBasePointer(MF)) {
+    if (isARM)
+      BuildMI(MBB, MBBI, dl,
+              TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+        .addReg(ARM::SP)
+        .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+    else
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                             RegInfo->getBaseRegister())
+        .addReg(ARM::SP));
+  }
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI.hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  int NumBytes = (int)MFI.getStackSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  // First put ourselves on the first (from top) terminator instructions.
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+    if (MBBI != MBB.begin()) {
+      do {
+        --MBBI;
+      } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
+      if (!isCSRestore(*MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (ArgRegsSaveSize +
+                 AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedGapSize() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Reset SP based on frame pointer only if the stack frame extends beyond
+    // frame pointer stack slot or target is ELF and the function has FP.
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      if (NumBytes) {
+        if (isARM)
+          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                  ARMCC::AL, 0, TII);
+        else {
+          // It's not possible to restore SP from FP in a single instruction.
+          // For iOS, this looks like:
+          // mov sp, r7
+          // sub sp, #24
+          // This is bad, if an interrupt is taken after the mov, sp is in an
+          // inconsistent state.
+          // Use the first callee-saved register as a scratch register.
+          assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+                 "No scratch register to restore SP from FP!");
+          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                 ARMCC::AL, 0, TII);
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(ARM::R4));
+        }
+      } else {
+        // Thumb2 or ARM.
+        if (isARM)
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+            .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+        else
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(FramePtr));
+      }
+    } else if (NumBytes &&
+               !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+    // Increment past our save areas.
+    if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
+      MBBI++;
+      // Since vpop register list cannot have gaps, there may be multiple vpop
+      // instructions in the epilogue.
+      while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+        MBBI++;
+    }
+    if (AFI->getDPRCalleeSavedGapSize()) {
+      assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
+             "unexpected DPR alignment gap");
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize());
+    }
+
+    if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
+    if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+  }
+
+  if (ArgRegsSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int
+ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                         unsigned &FrameReg) const {
+  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
+}
+
+int
+ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+                                             int FI, unsigned &FrameReg,
+                                             int SPAdj) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
+  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
+  bool isFixed = MFI.isFixedObjectIndex(FI);
+
+  FrameReg = ARM::SP;
+  Offset += SPAdj;
+
+  // SP can move around if there are allocas.  We may also lose track of SP
+  // when emergency spilling inside a non-reserved call frame setup.
+  bool hasMovingSP = !hasReservedCallFrame(MF);
+
+  // When dynamically realigning the stack, use the frame pointer for
+  // parameters, and the stack/base pointer for locals.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+    if (isFixed) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      Offset = FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) &&
+             "VLAs and dynamic stack alignment, but missing base pointer!");
+      FrameReg = RegInfo->getBaseRegister();
+    }
+    return Offset;
+  }
+
+  // If there is a frame pointer, use it when we can.
+  if (hasFP(MF) && AFI->hasStackFrame()) {
+    // Use frame pointer to reference fixed objects. Use it for locals if
+    // there are VLAs (and thus the SP isn't reliable as a base).
+    if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
+      if (AFI->isThumb2Function()) {
+        // Try to use the frame pointer if we can, else use the base pointer
+        // since it's available. This is handy for the emergency spill slot, in
+        // particular.
+        if (FPOffset >= -255 && FPOffset < 0) {
+          FrameReg = RegInfo->getFrameRegister(MF);
+          return FPOffset;
+        }
+      }
+    } else if (AFI->isThumb2Function()) {
+      // Use  add <rd>, sp, #<imm8>
+      //      ldr <rd>, [sp, #<imm8>]
+      // if at all possible to save space.
+      if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
+        return Offset;
+      // In Thumb2 mode, the negative offset is very limited. Try to avoid
+      // out of range references. ldr <rt>,[<rn>, #-<imm8>]
+      if (FPOffset >= -255 && FPOffset < 0) {
+        FrameReg = RegInfo->getFrameRegister(MF);
+        return FPOffset;
+      }
+    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
+      // Otherwise, use SP or FP, whichever is closer to the stack slot.
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    }
+  }
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  return Offset;
+}
+
+void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    unsigned StmOpc, unsigned StrOpc,
+                                    bool NoGap,
+                                    bool(*Func)(unsigned, bool),
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+  DebugLoc DL;
+
+  typedef std::pair<unsigned, bool> RegAndKill;
+  SmallVector<RegAndKill, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;
+
+      // D-registers in the aligned area DPRCS2 are NOT spilled here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
+      if (!isLiveIn)
+        MBB.addLiveIn(Reg);
+      // If NoGap is true, push consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+      LastReg = Reg;
+      // Do not set a kill flag on values that are also marked as live-in. This
+      // happens with the @llvm-returnaddress intrinsic and with arguments
+      // passed in callee saved registers.
+      // Omitting the kill flags is conservatively correct even if the live-in
+      // is not used after all.
+      Regs.push_back(std::make_pair(Reg, /*isKill=*/!isLiveIn));
+    }
+
+    if (Regs.empty())
+      continue;
+
+    std::sort(Regs.begin(), Regs.end(), [&](const RegAndKill &LHS,
+                                            const RegAndKill &RHS) {
+      return TRI.getEncodingValue(LHS.first) < TRI.getEncodingValue(RHS.first);
+    });
+
+    if (Regs.size() > 1 || StrOpc== 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+                       .addReg(ARM::SP).setMIFlags(MIFlags));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
+    } else if (Regs.size() == 1) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
+                                        ARM::SP)
+        .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+        .addReg(ARM::SP).setMIFlags(MIFlags)
+        .addImm(-4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+
+    // Put any subsequent vpush instructions before this one: they will refer to
+    // higher register numbers so need to be pushed first in order to preserve
+    // monotonicity.
+    if (MI != MBB.begin())
+      --MI;
+  }
+}
+
+void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   unsigned LdmOpc, unsigned LdrOpc,
+                                   bool isVarArg, bool NoGap,
+                                   bool(*Func)(unsigned, bool),
+                                   unsigned NumAlignedDPRCS2Regs) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL;
+  bool isTailCall = false;
+  bool isInterrupt = false;
+  bool isTrap = false;
+  if (MBB.end() != MI) {
+    DL = MI->getDebugLoc();
+    unsigned RetOpcode = MI->getOpcode();
+    isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri);
+    isInterrupt =
+        RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
+    isTrap =
+        RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
+        RetOpcode == ARM::tTRAP;
+  }
+
+  SmallVector<unsigned, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    bool DeleteRet = false;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;
+
+      // The aligned reloads from area DPRCS2 are not inserted here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+          !isTrap && STI.hasV5TOps()) {
+        if (MBB.succ_empty()) {
+          Reg = ARM::PC;
+          DeleteRet = true;
+          LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+        } else
+          LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+        // Fold the return instruction into the LDM.
+      }
+
+      // If NoGap is true, pop consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+
+      LastReg = Reg;
+      Regs.push_back(Reg);
+    }
+
+    if (Regs.empty())
+      continue;
+
+    std::sort(Regs.begin(), Regs.end(), [&](unsigned LHS, unsigned RHS) {
+      return TRI.getEncodingValue(LHS) < TRI.getEncodingValue(RHS);
+    });
+
+    if (Regs.size() > 1 || LdrOpc == 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+                       .addReg(ARM::SP));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i], getDefRegState(true));
+      if (DeleteRet && MI != MBB.end()) {
+        MIB.copyImplicitOps(*MI);
+        MI->eraseFromParent();
+      }
+      MI = MIB;
+    } else if (Regs.size() == 1) {
+      // If we adjusted the reg to PC from LR above, switch it back here. We
+      // only do that for LDM.
+      if (Regs[0] == ARM::PC)
+        Regs[0] = ARM::LR;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
+          .addReg(ARM::SP, RegState::Define)
+          .addReg(ARM::SP);
+      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+      // that refactoring is complete (eventually).
+      if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
+        MIB.addReg(0);
+        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
+      } else
+        MIB.addImm(4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+
+    // Put any subsequent vpop instructions after this one: they will refer to
+    // higher register numbers so need to be popped afterwards.
+    if (MI != MBB.end())
+      ++MI;
+  }
+}
+
+/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  Also insert stack realignment code and leave the stack
+/// pointer pointing to the d8 spill slot.
+static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Mark the D-register spill slots as properly aligned.  Since MFI computes
+  // stack slot layout backwards, this can actually mean that the d-reg stack
+  // slot offsets can be wrong. The offset for d8 will always be correct.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned DNum = CSI[i].getReg() - ARM::D8;
+    if (DNum > NumAlignedDPRCS2Regs - 1)
+      continue;
+    int FI = CSI[i].getFrameIdx();
+    // The even-numbered registers will be 16-byte aligned, the odd-numbered
+    // registers will be 8-byte aligned.
+    MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+
+    // The stack slot for D8 needs to be maximally aligned because this is
+    // actually the point where we align the stack pointer.  MachineFrameInfo
+    // computes all offsets relative to the incoming stack pointer which is a
+    // bit weird when realigning the stack.  Any extra padding for this
+    // over-alignment is not realized because the code inserted below adjusts
+    // the stack pointer by numregs * 8 before aligning the stack pointer.
+    if (DNum == 0)
+      MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+  }
+
+  // Move the stack pointer to the d8 spill slot, and align it at the same
+  // time. Leave the stack slot address in the scratch register r4.
+  //
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  //
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+  AFI->setShouldRestoreSPFromFP(true);
+
+  // sub r4, sp, #numregs * 8
+  // The immediate is <= 64, so it doesn't need any special encoding.
+  unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                                  .addReg(ARM::SP)
+                                  .addImm(8 * NumAlignedDPRCS2Regs)));
+
+  unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
+  // We must set parameter MustBeSingleInstruction to true, since
+  // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
+  // stack alignment.  Luckily, this can always be done since all ARM
+  // architecture versions that support Neon also support the BFC
+  // instruction.
+  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
+
+  // mov sp, r4
+  // The stack pointer must be adjusted before spilling anything, otherwise
+  // the stack slots could be clobbered by an interrupt handler.
+  // Leave r4 live, it is used below.
+  Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
+                            .addReg(ARM::R4);
+  MIB = AddDefaultPred(MIB);
+  if (!isThumb)
+    AddDefaultCC(MIB);
+
+  // Now spill NumAlignedDPRCS2Regs registers starting from d8.
+  // r4 holds the stack slot address.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
+  // The writeback is only needed when emitting two vst1.64 instructions.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
+                           ARM::R4)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+                   .addReg(ARM::R4).addImm(16).addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vst1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+                   .addReg(ARM::R4).addImm(16).addReg(SupReg));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vstr.64 for the odd last register.
+  if (NumAlignedDPRCS2Regs) {
+    MBB.addLiveIn(NextReg);
+    // vstr.64 uses addrmode5 which has an offset scale of 4.
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+                   .addReg(NextReg)
+                   .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2));
+  }
+
+  // The last spill instruction inserted should kill the scratch register r4.
+  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
+/// iterator to the following instruction.
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs) {
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  ++MI; ++MI; ++MI;
+  assert(MI->mayStore() && "Expecting spill instruction");
+
+  // These switches all fall through.
+  switch(NumAlignedDPRCS2Regs) {
+  case 7:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  default:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  case 1:
+  case 2:
+  case 4:
+    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
+    ++MI;
+  }
+  return MI;
+}
+
+/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  These instructions are assumed to execute while the
+/// stack is still aligned, unlike the code inserted by emitPopInst.
+static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned NumAlignedDPRCS2Regs,
+                                      const std::vector<CalleeSavedInfo> &CSI,
+                                      const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+  // Find the frame index assigned to d8.
+  int D8SpillFI = 0;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    if (CSI[i].getReg() == ARM::D8) {
+      D8SpillFI = CSI[i].getFrameIdx();
+      break;
+    }
+
+  // Materialize the address of the d8 spill slot into the scratch register r4.
+  // This can be fairly complicated if the stack frame is large, so just use
+  // the normal frame index elimination mechanism to do it.  This code runs as
+  // the initial part of the epilog where the stack and base pointers haven't
+  // been changed yet.
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+
+  unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addFrameIndex(D8SpillFI).addImm(0)));
+
+  // Now restore NumAlignedDPRCS2Regs registers starting from d8.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vld1.64 with 4 d-regs and writeback.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+                   .addReg(ARM::R4, RegState::Define)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+                   .addReg(ARM::R4).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vld1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+                   .addReg(ARM::R4).addImm(16));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vldr.64 for the remaining odd register.
+  if (NumAlignedDPRCS2Regs)
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+                   .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+
+  // Last store kills r4.
+  std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  unsigned PushOneOpc = AFI->isThumbFunction() ?
+    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+  unsigned FltOpc = ARM::VSTMDDB_UPD;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register,
+               NumAlignedDPRCS2Regs, MachineInstr::FrameSetup);
+
+  // The code above does not insert spill code for the aligned DPRCS2 registers.
+  // The stack realignment code will be inserted between the push instructions
+  // and these spills.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  return true;
+}
+
+bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+
+  // The emitPopInst calls below do not insert reloads for the aligned DPRCS2
+  // registers. Do that here instead.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
+  unsigned FltOpc = ARM::VLDMDIA_UPD;
+  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
+              NumAlignedDPRCS2Regs);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea2Register, 0);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea1Register, 0);
+
+  return true;
+}
+
+// FIXME: Make generic?
+static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
+                                       const ARMBaseInstrInfo &TII) {
+  unsigned FnSize = 0;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB)
+      FnSize += TII.getInstSizeInBytes(MI);
+  }
+  return FnSize;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+// FIXME: Move to TII?
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+                                         const TargetFrameLowering *TFI) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned Limit = (1 << 12) - 1;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        // When using ADDri to get the address of a stack object, 255 is the
+        // largest offset guaranteed to fit in the immediate offset.
+        if (MI.getOpcode() == ARM::ADDri) {
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        }
+
+        // Otherwise check the addressing mode.
+        switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
+        case ARMII::AddrMode3:
+        case ARMII::AddrModeT2_i8:
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode5:
+        case ARMII::AddrModeT2_i8s4:
+          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+          break;
+        case ARMII::AddrModeT2_i12:
+          // i12 supports only positive offset so these will be converted to
+          // i8 opcodes. See llvm::rewriteT2FrameIndex.
+          if (TFI->hasFP(MF) && AFI->hasStackFrame())
+            Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode4:
+        case ARMII::AddrMode6:
+          // Addressing modes 4 & 6 (load/store) instructions can't encode an
+          // immediate offset for stack references.
+          return 0;
+        default:
+          break;
+        }
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+
+// In functions that realign the stack, it can be an advantage to spill the
+// callee-saved vector registers after realigning the stack. The vst1 and vld1
+// instructions take alignment hints that can improve performance.
+//
+static void
+checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
+  if (!SpillAlignedNEONRegs)
+    return;
+
+  // Naked functions don't spill callee-saved registers.
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    return;
+
+  // We are planning to use NEON instructions vst1 / vld1.
+  if (!static_cast<const ARMSubtarget &>(MF.getSubtarget()).hasNEON())
+    return;
+
+  // Don't bother if the default stack alignment is sufficiently high.
+  if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8)
+    return;
+
+  // Aligned spills require stack realignment.
+  if (!static_cast<const ARMBaseRegisterInfo *>(
+           MF.getSubtarget().getRegisterInfo())->canRealignStack(MF))
+    return;
+
+  // We always spill contiguous d-registers starting from d8. Count how many
+  // needs spilling.  The register allocator will almost always use the
+  // callee-saved registers in order, but it can happen that there are holes in
+  // the range.  Registers above the hole will be spilled to the standard DPRCS
+  // area.
+  unsigned NumSpills = 0;
+  for (; NumSpills < 8; ++NumSpills)
+    if (!SavedRegs.test(ARM::D8 + NumSpills))
+      break;
+
+  // Don't do this for just one d-register. It's not worth it.
+  if (NumSpills < 2)
+    return;
+
+  // Spill the first NumSpills D-registers after realigning the stack.
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
+
+  // A scratch register is required for the vst1 / vld1 instructions.
+  SavedRegs.set(ARM::R4);
+}
+
+void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage the eliminateFrameIndex machinery. This also ensures it
+  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+  // to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  unsigned NumFPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  (void)TRI;  // Silence unused warning in non-assert builds.
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register. Also spill R4 if Thumb2 function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
+  // FIXME: It will be better just to find spare register here.
+  if (AFI->isThumb2Function() &&
+      (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+    SavedRegs.set(ARM::R4);
+
+  if (AFI->isThumb1OnlyFunction()) {
+    // Spill LR if Thumb1 function uses variable length argument lists.
+    if (AFI->getArgRegsSaveSize() > 0)
+      SavedRegs.set(ARM::LR);
+
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+    // for sure what the stack size will be, but for this, an estimate is good
+    // enough. If there anything changes it, it'll be a spill, which implies
+    // we've used all the registers and so R4 is already used, so not marking
+    // it here will be OK.
+    // FIXME: It will be better just to find spare register here.
+    unsigned StackSize = MFI.estimateStackSize(MF);
+    if (MFI.hasVarSizedObjects() || StackSize > 508)
+      SavedRegs.set(ARM::R4);
+  }
+
+  // See if we can spill vector registers to aligned stack.
+  checkNumAlignedDPRCS2Regs(MF, SavedRegs);
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    SavedRegs.set(RegInfo->getBaseRegister());
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is modified.
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (SavedRegs.test(Reg)) {
+      Spilled = true;
+      CanEliminateFrame = false;
+    }
+
+    if (!ARM::GPRRegClass.contains(Reg)) {
+      if (Spilled) {
+        if (ARM::SPRRegClass.contains(Reg))
+          NumFPRSpills++;
+        else if (ARM::DPRRegClass.contains(Reg))
+          NumFPRSpills += 2;
+        else if (ARM::QPRRegClass.contains(Reg))
+          NumFPRSpills += 4;
+      }
+      continue;
+    }
+
+    if (Spilled) {
+      NumGPRSpills++;
+
+      if (!STI.splitFramePushPop(MF)) {
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+        CS1Spilled = true;
+        continue;
+      }
+
+      // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+      switch (Reg) {
+      case ARM::LR:
+        LRSpilled = true;
+        LLVM_FALLTHROUGH;
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+        CS1Spilled = true;
+        break;
+      default:
+        break;
+      }
+    } else {
+      if (!STI.splitFramePushPop(MF)) {
+        UnspilledCS1GPRs.push_back(Reg);
+        continue;
+      }
+
+      switch (Reg) {
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        UnspilledCS1GPRs.push_back(Reg);
+        break;
+      default:
+        UnspilledCS2GPRs.push_back(Reg);
+        break;
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    // Force LR to be spilled if the Thumb function size is > 2048. This enables
+    // use of BL to implement far jump. If it turns out that it's not needed
+    // then the branch fix up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  // If any of the stack slot references may be out of range of an immediate
+  // offset, make sure a register (or a spill slot) is available for the
+  // register scavenger. Note that if we're indexing off the frame pointer, the
+  // effective stack size is 4 bytes larger since the FP points to the stack
+  // slot of the previous FP. Also, if we have variable sized objects in the
+  // function, stack slot references will often be negative, and some of
+  // our instructions are positive-offset only, so conservatively consider
+  // that case to want a spill slot (or register) as well. Similarly, if
+  // the function adjusts the stack pointer during execution and the
+  // adjustments aren't already part of our stack size estimate, our offset
+  // calculations may be off, so be conservative.
+  // FIXME: We could add logic to be more precise about negative offsets
+  //        and which instructions will need a scratch register for them. Is it
+  //        worth the effort and added fragility?
+  unsigned EstimatedStackSize =
+      MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
+  if (hasFP(MF)) {
+    if (AFI->hasStackFrame())
+      EstimatedStackSize += 4;
+  } else {
+    // If FP is not used, SP will be used to access arguments, so count the
+    // size of arguments into the estimation.
+    EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+  }
+  EstimatedStackSize += 16; // For possible paddings.
+
+  bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) ||
+                  MFI.hasVarSizedObjects() ||
+                  (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+  bool ExtraCSSpill = false;
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    if (hasFP(MF)) {
+      SavedRegs.set(FramePtr);
+      // If the frame pointer is required by the ABI, also spill LR so that we
+      // emit a complete frame record.
+      if (MF.getTarget().Options.DisableFramePointerElim(MF) && !LRSpilled) {
+        SavedRegs.set(ARM::LR);
+        LRSpilled = true;
+        NumGPRSpills++;
+        auto LRPos = find(UnspilledCS1GPRs, ARM::LR);
+        if (LRPos != UnspilledCS1GPRs.end())
+          UnspilledCS1GPRs.erase(LRPos);
+      }
+      auto FPPos = find(UnspilledCS1GPRs, FramePtr);
+      if (FPPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(FPPos);
+      NumGPRSpills++;
+      if (FramePtr == ARM::R7)
+        CS1Spilled = true;
+    }
+
+    if (AFI->isThumb1OnlyFunction()) {
+      // For Thumb1-only targets, we need some low registers when we save and
+      // restore the high registers (which aren't allocatable, but could be
+      // used by inline assembly) because the push/pop instructions can not
+      // access high registers. If necessary, we might need to push more low
+      // registers to ensure that there is at least one free that can be used
+      // for the saving & restoring, and preferably we should ensure that as
+      // many as are needed are available so that fewer push/pop instructions
+      // are required.
+
+      // Low registers which are not currently pushed, but could be (r4-r7).
+      SmallVector<unsigned, 4> AvailableRegs;
+
+      // Unused argument registers (r0-r3) can be clobbered in the prologue for
+      // free.
+      int EntryRegDeficit = 0;
+      for (unsigned Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) {
+        if (!MF.getRegInfo().isLiveIn(Reg)) {
+          --EntryRegDeficit;
+          DEBUG(dbgs() << PrintReg(Reg, TRI)
+                       << " is unused argument register, EntryRegDeficit = "
+                       << EntryRegDeficit << "\n");
+        }
+      }
+
+      // Unused return registers can be clobbered in the epilogue for free.
+      int ExitRegDeficit = AFI->getReturnRegsCount() - 4;
+      DEBUG(dbgs() << AFI->getReturnRegsCount()
+                   << " return regs used, ExitRegDeficit = " << ExitRegDeficit
+                   << "\n");
+
+      int RegDeficit = std::max(EntryRegDeficit, ExitRegDeficit);
+      DEBUG(dbgs() << "RegDeficit = " << RegDeficit << "\n");
+
+      // r4-r6 can be used in the prologue if they are pushed by the first push
+      // instruction.
+      for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6}) {
+        if (SavedRegs.test(Reg)) {
+          --RegDeficit;
+          DEBUG(dbgs() << PrintReg(Reg, TRI)
+                       << " is saved low register, RegDeficit = " << RegDeficit
+                       << "\n");
+        } else {
+          AvailableRegs.push_back(Reg);
+          DEBUG(dbgs()
+                << PrintReg(Reg, TRI)
+                << " is non-saved low register, adding to AvailableRegs\n");
+        }
+      }
+
+      // r7 can be used if it is not being used as the frame pointer.
+      if (!hasFP(MF)) {
+        if (SavedRegs.test(ARM::R7)) {
+          --RegDeficit;
+          DEBUG(dbgs() << "%R7 is saved low register, RegDeficit = "
+                       << RegDeficit << "\n");
+        } else {
+          AvailableRegs.push_back(ARM::R7);
+          DEBUG(dbgs()
+                << "%R7 is non-saved low register, adding to AvailableRegs\n");
+        }
+      }
+
+      // Each of r8-r11 needs to be copied to a low register, then pushed.
+      for (unsigned Reg : {ARM::R8, ARM::R9, ARM::R10, ARM::R11}) {
+        if (SavedRegs.test(Reg)) {
+          ++RegDeficit;
+          DEBUG(dbgs() << PrintReg(Reg, TRI)
+                       << " is saved high register, RegDeficit = " << RegDeficit
+                       << "\n");
+        }
+      }
+
+      // LR can only be used by PUSH, not POP, and can't be used at all if the
+      // llvm.returnaddress intrinsic is used. This is only worth doing if we
+      // are more limited at function entry than exit.
+      if ((EntryRegDeficit > ExitRegDeficit) &&
+          !(MF.getRegInfo().isLiveIn(ARM::LR) &&
+            MF.getFrameInfo().isReturnAddressTaken())) {
+        if (SavedRegs.test(ARM::LR)) {
+          --RegDeficit;
+          DEBUG(dbgs() << "%LR is saved register, RegDeficit = " << RegDeficit
+                       << "\n");
+        } else {
+          AvailableRegs.push_back(ARM::LR);
+          DEBUG(dbgs() << "%LR is not saved, adding to AvailableRegs\n");
+        }
+      }
+
+      // If there are more high registers that need pushing than low registers
+      // available, push some more low registers so that we can use fewer push
+      // instructions. This might not reduce RegDeficit all the way to zero,
+      // because we can only guarantee that r4-r6 are available, but r8-r11 may
+      // need saving.
+      DEBUG(dbgs() << "Final RegDeficit = " << RegDeficit << "\n");
+      for (; RegDeficit > 0 && !AvailableRegs.empty(); --RegDeficit) {
+        unsigned Reg = AvailableRegs.pop_back_val();
+        DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+                     << " to make up reg deficit\n");
+        SavedRegs.set(Reg);
+        NumGPRSpills++;
+        CS1Spilled = true;
+        ExtraCSSpill = true;
+        UnspilledCS1GPRs.erase(find(UnspilledCS1GPRs, Reg));
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+      }
+      DEBUG(dbgs() << "After adding spills, RegDeficit = " << RegDeficit << "\n");
+    }
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      SavedRegs.set(ARM::LR);
+      NumGPRSpills++;
+      SmallVectorImpl<unsigned>::iterator LRPos;
+      LRPos = find(UnspilledCS1GPRs, (unsigned)ARM::LR);
+      if (LRPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(LRPos);
+
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs, spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
+    unsigned TargetAlign = getStackAlignment();
+    if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high register if the function is thumb.  In the case of
+          // Windows on ARM, accept R11 (frame pointer)
+          if (!AFI->isThumbFunction() ||
+              (STI.isTargetWindows() && Reg == ARM::R11) ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            SavedRegs.set(Reg);
+            DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+                         << " to make up alignment\n");
+            if (!MRI.isReserved(Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        SavedRegs.set(Reg);
+        DEBUG(dbgs() << "Spilling " << PrintReg(Reg, TRI)
+                     << " to make up alignment\n");
+        if (!MRI.isReserved(Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (BigStack && !ExtraCSSpill) {
+      // If any non-reserved CS register isn't spilled, just spill one or two
+      // extra. That should take care of it!
+      unsigned NumExtras = TargetAlign / 4;
+      SmallVector<unsigned, 2> Extras;
+      while (NumExtras && !UnspilledCS1GPRs.empty()) {
+        unsigned Reg = UnspilledCS1GPRs.back();
+        UnspilledCS1GPRs.pop_back();
+        if (!MRI.isReserved(Reg) &&
+            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+             Reg == ARM::LR)) {
+          Extras.push_back(Reg);
+          NumExtras--;
+        }
+      }
+      // For non-Thumb1 functions, also check for hi-reg CS registers
+      if (!AFI->isThumb1OnlyFunction()) {
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!MRI.isReserved(Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+      }
+      if (Extras.size() && NumExtras == 0) {
+        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+          SavedRegs.set(Extras[i]);
+        }
+      } else if (!AFI->isThumb1OnlyFunction()) {
+        // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
+        // closest to SP or frame pointer.
+        assert(RS && "Register scavenging not provided");
+        const TargetRegisterClass *RC = &ARM::GPRRegClass;
+        RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+                                                          RC->getAlignment(),
+                                                          false));
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    SavedRegs.set(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr &Old = *I;
+    DebugLoc dl = Old.getDebugLoc();
+    unsigned Amount = Old.getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = alignSPAdjust(Amount);
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old.getOpcode();
+      int PIdx = Old.findFirstPredOperandIdx();
+      ARMCC::CondCodes Pred =
+          (PIdx == -1) ? ARMCC::AL
+                       : (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old.getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old.getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      }
+    }
+  }
+  return MBB.erase(I);
+}
+
+/// Get the minimum constant for ARM that is greater than or equal to the
+/// argument. In ARM, constants can have any value that can be produced by
+/// rotating an 8-bit value to the right by an even number of bits within a
+/// 32-bit word.
+static uint32_t alignToARMConstant(uint32_t Value) {
+  unsigned Shifted = 0;
+
+  if (Value == 0)
+      return 0;
+
+  while (!(Value & 0xC0000000)) {
+      Value = Value << 2;
+      Shifted += 2;
+  }
+
+  bool Carry = (Value & 0x00FFFFFF);
+  Value = ((Value & 0xFF000000) >> 24) + Carry;
+
+  if (Value & 0x0000100)
+      Value = Value & 0x000001FC;
+
+  if (Shifted > 24)
+      Value = Value >> (Shifted - 24);
+  else
+      Value = Value << (24 - Shifted);
+
+  return Value;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual
+// stack limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+// Adjust the function prologue to enable split stacks. This currently only
+// supports android and linux.
+//
+// The ABI of the segmented stack prologue is a little arbitrarily chosen, but
+// must be well defined in order to allow for consistent implementations of the
+// __morestack helper function. The ABI is also not a normal ABI in that it
+// doesn't follow the normal calling conventions because this allows the
+// prologue of each function to be optimized further.
+//
+// Currently, the ABI looks like (when calling __morestack)
+//
+//  * r4 holds the minimum stack size requested for this function call
+//  * r5 holds the stack size of the arguments to the function
+//  * the beginning of the function is 3 instructions after the call to
+//    __morestack
+//
+// Implementations of __morestack should use r4 to allocate a new stack, r5 to
+// place the arguments on to the new stack, and the 3-instruction knowledge to
+// jump directly to the body of the function when working on the new stack.
+//
+// An old (and possibly no longer compatible) implementation of __morestack for
+// ARM can be found at [1].
+//
+// [1] - https://github.com/mozilla/rust/blob/86efd9/src/rt/arch/arm/morestack.S
+void ARMFrameLowering::adjustForSegmentedStacks(
+    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+  unsigned Opcode;
+  unsigned CFIIndex;
+  const ARMSubtarget *ST = &MF.getSubtarget<ARMSubtarget>();
+  bool Thumb = ST->isThumb();
+
+  // Sadly, this currently doesn't support varargs, platforms other than
+  // android/linux. Note that thumb1/thumb2 are support for android/linux.
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!ST->isTargetAndroid() && !ST->isTargetLinux())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MCContext &Context = MMI.getContext();
+  const MCRegisterInfo *MRI = Context.getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL;
+
+  uint64_t StackSize = MFI.getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
+  // Use R4 and R5 as scratch registers.
+  // We save R4 and R5 before use and restore them before leaving the function.
+  unsigned ScratchReg0 = ARM::R4;
+  unsigned ScratchReg1 = ARM::R5;
+  uint64_t AlignedStackSize;
+
+  MachineBasicBlock *PrevStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *PostStackMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *AllocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock();
+
+  // Grab everything that reaches PrologueMBB to update there liveness as well.
+  SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion;
+  SmallVector<MachineBasicBlock *, 2> WalkList;
+  WalkList.push_back(&PrologueMBB);
+
+  do {
+    MachineBasicBlock *CurMBB = WalkList.pop_back_val();
+    for (MachineBasicBlock *PredBB : CurMBB->predecessors()) {
+      if (BeforePrologueRegion.insert(PredBB).second)
+        WalkList.push_back(PredBB);
+    }
+  } while (!WalkList.empty());
+
+  // The order in that list is important.
+  // The blocks will all be inserted before PrologueMBB using that order.
+  // Therefore the block that should appear first in the CFG should appear
+  // first in the list.
+  MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB,
+                                      PostStackMBB};
+
+  for (MachineBasicBlock *B : AddedBlocks)
+    BeforePrologueRegion.insert(B);
+
+  for (const auto &LI : PrologueMBB.liveins()) {
+    for (MachineBasicBlock *PredBB : BeforePrologueRegion)
+      PredBB->addLiveIn(LI);
+  }
+
+  // Remove the newly added blocks from the list, since we know
+  // we do not have to do the following updates for them.
+  for (MachineBasicBlock *B : AddedBlocks) {
+    BeforePrologueRegion.erase(B);
+    MF.insert(PrologueMBB.getIterator(), B);
+  }
+
+  for (MachineBasicBlock *MBB : BeforePrologueRegion) {
+    // Make sure the LiveIns are still sorted and unique.
+    MBB->sortUniqueLiveIns();
+    // Replace the edges to PrologueMBB by edges to the sequences
+    // we are about to add.
+    MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]);
+  }
+
+  // The required stack size that is aligned to ARM constant criterion.
+  AlignedStackSize = alignToARMConstant(StackSize);
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = AlignedStackSize < kSplitStackAvailable;
+
+  // We will use two of the callee save registers as scratch registers so we
+  // need to save those registers onto the stack.
+  // We will use SR0 to hold stack limit and SR1 to hold the stack size
+  // requested and arguments for __morestack().
+  // SR0: Scratch Register #0
+  // SR1: Scratch Register #1
+  // push {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
+        .addReg(ScratchReg0).addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+                   .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
+        .addReg(ScratchReg0).addReg(ScratchReg1);
+  }
+
+  // Emit the relevant DWARF information about the change in stack pointer as
+  // well as where to find both r4 and r5 (the callee-save registers)
+  CFIIndex =
+      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+      nullptr, MRI->getDwarfRegNum(ScratchReg1, true), -4));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+      nullptr, MRI->getDwarfRegNum(ScratchReg0, true), -8));
+  BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // mov SR1, sp
+  if (Thumb) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+                      .addReg(ARM::SP));
+  } else if (CompareStackPointer) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+                      .addReg(ARM::SP)).addReg(0);
+  }
+
+  // sub SR1, sp, #StackSize
+  if (!CompareStackPointer && Thumb) {
+    AddDefaultPred(
+        AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
+            .addReg(ScratchReg1).addImm(AlignedStackSize));
+  } else if (!CompareStackPointer) {
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+                      .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+  }
+
+  if (Thumb && ST->isThumb1Only()) {
+    unsigned PCLabelId = ARMFI->createPICLabelUId();
+    ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
+        MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0);
+    MachineConstantPool *MCP = MF.getConstantPool();
+    unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
+
+    // ldr SR0, [pc, offset(STACK_LIMIT)]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+                      .addConstantPoolIndex(CPI));
+
+    // ldr SR0, [SR0]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+                      .addReg(ScratchReg0).addImm(0));
+  } else {
+    // Get TLS base address from the coprocessor
+    // mrc p15, #0, SR0, c13, c0, #3
+    AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+                     .addImm(15)
+                     .addImm(0)
+                     .addImm(13)
+                     .addImm(0)
+                     .addImm(3));
+
+    // Use the last tls slot on android and a private field of the TCP on linux.
+    assert(ST->isTargetAndroid() || ST->isTargetLinux());
+    unsigned TlsOffset = ST->isTargetAndroid() ? 63 : 1;
+
+    // Get the stack limit from the right offset
+    // ldr SR0, [sr0, #4 * TlsOffset]
+    AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+                      .addReg(ScratchReg0).addImm(4 * TlsOffset));
+  }
+
+  // Compare stack limit with stack size requested.
+  // cmp SR0, SR1
+  Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
+  AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
+                    .addReg(ScratchReg0)
+                    .addReg(ScratchReg1));
+
+  // This jump is taken if StackLimit < SP - stack required.
+  Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
+  BuildMI(GetMBB, DL, TII.get(Opcode)).addMBB(PostStackMBB)
+       .addImm(ARMCC::LO)
+       .addReg(ARM::CPSR);
+
+
+  // Calling __morestack(StackSize, Size of stack arguments).
+  // __morestack knows that the stack size requested is in SR0(r4)
+  // and amount size of stack arguments is in SR1(r5).
+
+  // Pass first argument for the __morestack by Scratch Register #0.
+  //   The amount size of stack required
+  if (Thumb) {
+    AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
+                                        ScratchReg0)).addImm(AlignedStackSize));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+                      .addImm(AlignedStackSize)).addReg(0);
+  }
+  // Pass second argument for the __morestack by Scratch Register #1.
+  //   The amount size of stack consumed to save function arguments.
+  if (Thumb) {
+    AddDefaultPred(
+        AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
+            .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+                   .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
+                   .addReg(0);
+  }
+
+  // push {lr} - Save return address of this function.
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+        .addReg(ARM::LR);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+        .addReg(ARM::LR);
+  }
+
+  // Emit the DWARF info about the change in stack as well as where to find the
+  // previous link register
+  CFIIndex =
+      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+        nullptr, MRI->getDwarfRegNum(ARM::LR, true), -12));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Call __morestack().
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+        .addExternalSymbol("__morestack");
+  } else {
+    BuildMI(AllocMBB, DL, TII.get(ARM::BL))
+        .addExternalSymbol("__morestack");
+  }
+
+  // pop {lr} - Restore return address of this original function.
+  if (Thumb) {
+    if (ST->isThumb1Only()) {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+                     .addReg(ScratchReg0);
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+                     .addReg(ScratchReg0));
+    } else {
+      AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(ARM::SP, RegState::Define)
+                     .addReg(ARM::SP)
+                     .addImm(4));
+    }
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ARM::LR);
+  }
+
+  // Restore SR0 and SR1 in case of __morestack() was called.
+  // __morestack() will skip PostStackMBB block so we need to restore
+  // scratch registers from here.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // bx lr - Return from this function.
+  Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
+  AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+
+  // Restore SR0 and SR1 in case of __morestack() was not called.
+  // pop {SR0, SR1}
+  if (Thumb) {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  } else {
+    AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+                   .addReg(ARM::SP, RegState::Define)
+                   .addReg(ARM::SP))
+      .addReg(ScratchReg0)
+      .addReg(ScratchReg1);
+  }
+
+  // Update the CFA offset now that we've popped
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Tell debuggers that r4 and r5 are now the same as they were in the
+  // previous function, that they're the "Same Value".
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg0, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(
+      nullptr, MRI->getDwarfRegNum(ScratchReg1, true)));
+  BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Organizing MBB lists
+  PostStackMBB->addSuccessor(&PrologueMBB);
+
+  AllocMBB->addSuccessor(PostStackMBB);
+
+  GetMBB->addSuccessor(PostStackMBB);
+  GetMBB->addSuccessor(AllocMBB);
+
+  McrMBB->addSuccessor(GetMBB);
+
+  PrevStackMBB->addSuccessor(McrMBB);
+
+#ifdef EXPENSIVE_CHECKS
+  MF.verify();
+#endif
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 000000000000..21cd78da395c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,85 @@
+//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMFrameLowering : public TargetFrameLowering {
+protected:
+  const ARMSubtarget &STI;
+
+public:
+  explicit ARMFrameLowering(const ARMSubtarget &sti);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool noFramePointerElim(const MachineFunction &MF) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg, int SPAdj) const;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+
+  void adjustForSegmentedStacks(MachineFunction &MF,
+                                MachineBasicBlock &MBB) const override;
+
+  /// Returns true if the target will correctly handle shrink wrapping.
+  bool enableShrinkWrapping(const MachineFunction &MF) const override {
+    return true;
+  }
+
+ private:
+  void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
+                    unsigned StrOpc, bool NoGap,
+                    bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
+                    unsigned MIFlags = 0) const;
+  void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+                   unsigned LdrOpc, bool isVarArg, bool NoGap,
+                   bool(*Func)(unsigned, bool),
+                   unsigned NumAlignedDPRCS2Regs) const;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
new file mode 100644
index 000000000000..0d904ecb6296
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -0,0 +1,101 @@
+//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
+                         const TargetRegisterInfo &TRI) {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  return false;
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (!MI->isDebugValue()) {
+    // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
+    // a VMLA / VMLS will cause 4 cycle stall.
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
+      MachineInstr *DefMI = LastMI;
+      const MCInstrDesc &LastMCID = LastMI->getDesc();
+      const MachineFunction *MF = MI->getParent()->getParent();
+      const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+                                        MF->getSubtarget().getInstrInfo());
+
+      // Skip over one non-VFP / NEON instruction.
+      if (!LastMI->isBarrier() &&
+          !(TII.getSubtarget().hasMuxedUnits() && LastMI->mayLoadOrStore()) &&
+          (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
+        MachineBasicBlock::iterator I = LastMI;
+        if (I != LastMI->getParent()->begin()) {
+          I = std::prev(I);
+          DefMI = &*I;
+        }
+      }
+
+      if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
+          (TII.canCauseFpMLxStall(MI->getOpcode()) ||
+           hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) {
+        // Try to schedule another instruction for the next 4 cycles.
+        if (FpMLxStalls == 0)
+          FpMLxStalls = 4;
+        return Hazard;
+      }
+    }
+  }
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+void ARMHazardRecognizer::Reset() {
+  LastMI = nullptr;
+  FpMLxStalls = 0;
+  ScoreboardHazardRecognizer::Reset();
+}
+
+void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+  if (!MI->isDebugValue()) {
+    LastMI = MI;
+    FpMLxStalls = 0;
+  }
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void ARMHazardRecognizer::AdvanceCycle() {
+  if (FpMLxStalls && --FpMLxStalls == 0)
+    // Stalled for 4 cycles but still can't schedule any other instructions.
+    LastMI = nullptr;
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void ARMHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("reverse ARM hazard checking unsupported");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
new file mode 100644
index 000000000000..ccf09db69937
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -0,0 +1,49 @@
+//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling ARM functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+
+namespace llvm {
+
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
+class MachineInstr;
+
+/// ARMHazardRecognizer handles special constraints that are not expressed in
+/// the scheduling itinerary. This is only used during postRA scheduling. The
+/// ARM preRA scheduler uses an unspecialized instance of the
+/// ScoreboardHazardRecognizer.
+class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+  MachineInstr *LastMI;
+  unsigned FpMLxStalls;
+
+public:
+  ARMHazardRecognizer(const InstrItineraryData *ItinData,
+                      const ScheduleDAG *DAG)
+    : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
+      LastMI(nullptr) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void Reset() override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 000000000000..c3e9591d5c70
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,4682 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+static cl::opt<bool>
+DisableShifterOp("disable-shifter-op", cl::Hidden,
+  cl::desc("Disable isel of shifter-op"),
+  cl::init(false));
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+enum AddrMode2Type {
+  AM2_BASE, // Simple AM2 (+-imm12)
+  AM2_SHOP  // Shifter-op AM2
+};
+
+class ARMDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Reset the subtarget each time through.
+    Subtarget = &MF.getSubtarget<ARMSubtarget>();
+    SelectionDAGISel::runOnMachineFunction(MF);
+    return true;
+  }
+
+  StringRef getPassName() const override { return "ARM Instruction Selection"; }
+
+  void PreprocessISelDAG() override;
+
+  /// getI32Imm - Return a target constant of type i32 with the specified
+  /// value.
+  inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+    return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+  }
+
+  void Select(SDNode *N) override;
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
+  bool isShifterOpProfitable(const SDValue &Shift,
+                             ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
+  bool SelectRegShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, SDValue &C,
+                               bool CheckProfitability = true);
+  bool SelectImmShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, bool CheckProfitability = true);
+  bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B, SDValue &C) {
+    // Don't apply the profitability check
+    return SelectRegShifterOperand(N, A, B, C, false);
+  }
+  bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B) {
+    // Don't apply the profitability check
+    return SelectImmShifterOperand(N, A, B, false);
+  }
+
+  bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
+
+  AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base,
+                                      SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE;
+  }
+
+  bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP;
+  }
+
+  bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset,
+                       SDValue &Opc) {
+    SelectAddrMode2Worker(N, Base, Offset, Opc);
+//    return SelectAddrMode2ShOp(N, Base, Offset, Opc);
+    // This always matches one way or another.
+    return true;
+  }
+
+  bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+    const ConstantSDNode *CN = cast<ConstantSDNode>(N);
+    Pred = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(N), MVT::i32);
+    Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+    return true;
+  }
+
+  bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
+  bool SelectAddrMode3(SDValue N, SDValue &Base,
+                       SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode5(SDValue N, SDValue &Base,
+                       SDValue &Offset);
+  bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+  bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
+
+  bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
+
+  // Thumb Addressing Modes:
+  bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
+                                SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  // Thumb 2 Addressing Modes:
+  bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+                            SDValue &OffImm);
+  bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                 SDValue &OffImm);
+  bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
+                             SDValue &OffReg, SDValue &ShImm);
+  bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  inline bool is_so_imm(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }
+
+  inline bool is_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(~Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(~Imm) != -1;
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+  void transferMemOperands(SDNode *Src, SDNode *Dst);
+
+  /// Indexed (pre/post inc/dec) load matching code for ARM.
+  bool tryARMIndexedLoad(SDNode *N);
+  bool tryT1IndexedLoad(SDNode *N);
+  bool tryT2IndexedLoad(SDNode *N);
+
+  /// SelectVLD - Select NEON load intrinsics.  NumVecs should be
+  /// 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// loads of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                 const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+                 const uint16_t *QOpcodes1);
+
+  /// SelectVST - Select NEON store intrinsics.  NumVecs should
+  /// be 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// stores of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                 const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+                 const uint16_t *QOpcodes1);
+
+  /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// load/store of D registers and Q registers.
+  void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                       unsigned NumVecs, const uint16_t *DOpcodes,
+                       const uint16_t *QOpcodes);
+
+  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
+  /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
+  /// for loading D registers.  (Q registers are not supported.)
+  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes = nullptr);
+
+  /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
+  /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
+  /// generated to force the table registers to be consecutive.
+  void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+
+  /// Try to select SBFX/UBFX instructions for ARM.
+  bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+
+  // Select special operations if node forms integer ABS pattern
+  bool tryABSOp(SDNode *N);
+
+  bool tryReadRegister(SDNode *N);
+  bool tryWriteRegister(SDNode *N);
+
+  bool tryInlineAsm(SDNode *N);
+
+  void SelectConcatVector(SDNode *N);
+  void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
+
+  bool trySMLAWSMULW(SDNode *N);
+
+  void SelectCMP_SWAP(SDNode *N);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  // Form pairs of consecutive R, S, D, or Q registers.
+  SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1);
+
+  // Form sequences of 4 consecutive S, D, or Q registers.
+  SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+
+  // Get the alignment operand for a NEON VLD or VST instruction.
+  SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
+                        bool is64BitVector);
+
+  /// Returns the number of instructions required to materialize the given
+  /// constant in a register, or 3 if a literal pool load is needed.
+  unsigned ConstantMaterializationCost(unsigned Val) const;
+
+  /// Checks if N is a multiplication by a constant where we can extract out a
+  /// power of two from the constant so that it can be used in a shift, but only
+  /// if it simplifies the materialization of the constant. Returns true if it
+  /// is, and assigns to PowerOfTwo the power of two that should be extracted
+  /// out and to NewMulConst the new constant to be multiplied by.
+  bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift,
+                              unsigned &PowerOfTwo, SDValue &NewMulConst) const;
+
+  /// Replace N with M in CurDAG, in a way that also ensures that M gets
+  /// selected when N would have been selected.
+  void replaceDAGValue(const SDValue &N, SDValue M);
+};
+}
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc &&
+         isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
+static bool isScaledConstantInRange(SDValue Node, int Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale > 0 && "Invalid scale!");
+
+  // Check that this is a constant.
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
+  if (!C)
+    return false;
+
+  ScaledConstant = (int) C->getZExtValue();
+  if ((ScaledConstant % Scale) != 0)
+    return false;
+
+  ScaledConstant /= Scale;
+  return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
+}
+
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->hasV6T2Ops())
+    return;
+
+  bool isThumb2 = Subtarget->isThumb();
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+    if (N->getOpcode() != ISD::ADD)
+      continue;
+
+    // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
+    // leading zeros, followed by consecutive set bits, followed by 1 or 2
+    // trailing zeros, e.g. 1020.
+    // Transform the expression to
+    // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+    // of trailing zeros of c2. The left shift would be folded as an shifter
+    // operand of 'add' and the 'and' and 'srl' would become a bits extraction
+    // node (UBFX).
+
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    unsigned And_imm = 0;
+    if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+      if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+        std::swap(N0, N1);
+    }
+    if (!And_imm)
+      continue;
+
+    // Check if the AND mask is an immediate of the form: 000.....1111111100
+    unsigned TZ = countTrailingZeros(And_imm);
+    if (TZ != 1 && TZ != 2)
+      // Be conservative here. Shifter operands aren't always free. e.g. On
+      // Swift, left shifter operand of 1 / 2 for free but others are not.
+      // e.g.
+      //  ubfx   r3, r1, #16, #8
+      //  ldr.w  r3, [r0, r3, lsl #2]
+      // vs.
+      //  mov.w  r9, #1020
+      //  and.w  r2, r9, r1, lsr #14
+      //  ldr    r2, [r0, r2]
+      continue;
+    And_imm >>= TZ;
+    if (And_imm & (And_imm + 1))
+      continue;
+
+    // Look for (and (srl X, c1), c2).
+    SDValue Srl = N1.getOperand(0);
+    unsigned Srl_imm = 0;
+    if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+        (Srl_imm <= 2))
+      continue;
+
+    // Make sure first operand is not a shifter operand which would prevent
+    // folding of the left shift.
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (isThumb2) {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1))
+        continue;
+    } else {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+          SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+        continue;
+    }
+
+    // Now make the transformation.
+    Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
+                          Srl.getOperand(0),
+                          CurDAG->getConstant(Srl_imm + TZ, SDLoc(Srl),
+                                              MVT::i32));
+    N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
+                         Srl,
+                         CurDAG->getConstant(And_imm, SDLoc(Srl), MVT::i32));
+    N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
+                         N1, CurDAG->getConstant(TZ, SDLoc(Srl), MVT::i32));
+    CurDAG->UpdateNodeOperands(N, N0, N1);
+  }
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoidded.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!Subtarget->hasVMLxHazards())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        CurDAG->getSubtarget().getInstrInfo());
+
+    const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
+    if (MCID.mayStore())
+      return true;
+    unsigned Opcode = MCID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
+    return true;
+  if (Shift.hasOneUse())
+    return true;
+  // R << 2 is free.
+  return ShOpcVal == ARM_AM::lsl &&
+         (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
+}
+
+unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
+  if (Subtarget->isThumb()) {
+    if (Val <= 255) return 1;                               // MOV
+    if (Subtarget->hasV6T2Ops() &&
+        (Val <= 0xffff || ARM_AM::getT2SOImmValSplatVal(Val) != -1))
+      return 1; // MOVW
+    if (Val <= 510) return 2;                               // MOV + ADDi8
+    if (~Val <= 255) return 2;                              // MOV + MVN
+    if (ARM_AM::isThumbImmShiftedVal(Val)) return 2;        // MOV + LSL
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1) return 1;           // MOV
+    if (ARM_AM::getSOImmVal(~Val) != -1) return 1;          // MVN
+    if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (ARM_AM::isSOImmTwoPartVal(Val)) return 2;           // two instrs
+  }
+  if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT
+  return 3; // Literal pool load
+}
+
+bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
+                                             unsigned MaxShift,
+                                             unsigned &PowerOfTwo,
+                                             SDValue &NewMulConst) const {
+  assert(N.getOpcode() == ISD::MUL);
+  assert(MaxShift > 0);
+
+  // If the multiply is used in more than one place then changing the constant
+  // will make other uses incorrect, so don't.
+  if (!N.hasOneUse()) return false;
+  // Check if the multiply is by a constant
+  ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!MulConst) return false;
+  // If the constant is used in more than one place then modifying it will mean
+  // we need to materialize two constants instead of one, which is a bad idea.
+  if (!MulConst->hasOneUse()) return false;
+  unsigned MulConstVal = MulConst->getZExtValue();
+  if (MulConstVal == 0) return false;
+
+  // Find the largest power of 2 that MulConstVal is a multiple of
+  PowerOfTwo = MaxShift;
+  while ((MulConstVal % (1 << PowerOfTwo)) != 0) {
+    --PowerOfTwo;
+    if (PowerOfTwo == 0) return false;
+  }
+
+  // Only optimise if the new cost is better
+  unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo);
+  NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32);
+  unsigned OldCost = ConstantMaterializationCost(MulConstVal);
+  unsigned NewCost = ConstantMaterializationCost(NewMulConstVal);
+  return NewCost < OldCost;
+}
+
+void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
+  CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
+  CurDAG->ReplaceAllUsesWith(N, M);
+}
+
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  // If N is a multiply-by-constant and it's profitable to extract a shift and
+  // use it in a shifted operand do so.
+  if (N.getOpcode() == ISD::MUL) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
+      HandleSDNode Handle(N);
+      replaceDAGValue(N.getOperand(1), NewMulConst);
+      BaseReg = Handle.getValue();
+      Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
+                                                          PowerOfTwo),
+                                      SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS) return false;
+  ShImmVal = RHS->getZExtValue() & 31;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &ShReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (RHS) return false;
+
+  ShReg = N.getOperand(1);
+  if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+    return false;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getSExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC > -0x1000 && RHSC < 0x1000) { // 12 bits
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
+
+
+bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          SDLoc(N), MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ISD::ADD.
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave simple R +/- imm12 operands for LDRi12
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) // 12 bits.
+      return false;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  // If Offset is a multiply-by-constant and it's profitable to extract a shift
+  // and use it in a shifted operand do so.
+  if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
+      replaceDAGValue(Offset.getOperand(1), NewMulConst);
+      ShAmt = PowerOfTwo;
+      ShOpcVal = ARM_AM::lsl;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+
+//-----
+
+AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
+                                                     SDValue &Base,
+                                                     SDValue &Offset,
+                                                     SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          SDLoc(N), MVT::i32);
+          return AM2_SHOP;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ADD.
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+               N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    SDLoc(N), MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Match simple R +/- imm12 operands.
+  if (N.getOpcode() != ISD::SUB) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) { // 12 bits.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
+
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
+      }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                        ARM_AM::no_shift),
+                                      SDLoc(N), MVT::i32);
+      return AM2_BASE;
+    }
+  }
+
+  if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
+    // Compute R +/- (R << N) and reuse it.
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    SDLoc(N), MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  SDLoc(N), MVT::i32);
+  return AM2_SHOP;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val))
+    return false;
+
+  Offset = N;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+  unsigned ShAmt = 0;
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(N, ShOpcVal, ShAmt))
+        Offset = N.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  SDLoc(N), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    if (AddSub == ARM_AM::sub) Val *= -1;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(Val, SDLoc(Op), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    SDLoc(Op), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+  Base = N;
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C  is canonicalize to X + -C, no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                              -256 + 1, 256, RHSC)) { // 8 bits.
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC), SDLoc(N),
+                                    MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), SDLoc(N),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), SDLoc(Op),
+                                    MVT::i32);
+    return true;
+  }
+
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), SDLoc(Op),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+               N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+               N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
+                              -256 + 1, 256, RHSC)) {
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    }
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                       SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     SDLoc(N), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
+                                      SDValue &Align) {
+  Addr = N;
+
+  unsigned Alignment = 0;
+
+  MemSDNode *MemN = cast<MemSDNode>(Parent);
+
+  if (isa<LSBaseSDNode>(MemN) ||
+      ((MemN->getOpcode() == ARMISD::VST1_UPD ||
+        MemN->getOpcode() == ARMISD::VLD1_UPD) &&
+       MemN->getConstantOperandVal(MemN->getNumOperands() - 1) == 1)) {
+    // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
+    // The maximum alignment is equal to the memory size being referenced.
+    unsigned MMOAlign = MemN->getAlignment();
+    unsigned MemSize = MemN->getMemoryVT().getSizeInBits() / 8;
+    if (MMOAlign >= MemSize && MemSize > 1)
+      Alignment = MemSize;
+  } else {
+    // All other uses of addrmode6 are for intrinsics.  For now just record
+    // the raw alignment value; it will be refined later based on the legal
+    // alignment operands for the intrinsic.
+    Alignment = MemN->getAlignment();
+  }
+
+  Align = CurDAG->getTargetConstant(Alignment, SDLoc(N), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset) {
+  LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+  ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+  if (AM != ISD::POST_INC)
+    return false;
+  Offset = N;
+  if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+    if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits())
+      Offset = CurDAG->getRegister(0, MVT::i32);
+  }
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
+                                       SDValue &Offset, SDValue &Label) {
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+                                      SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Thumb Addressing Modes
+//===----------------------------------------------------------------------===//
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
+                                            SDValue &Base, SDValue &Offset){
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
+    ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+    if (!NC || !NC->isNullValue())
+      return false;
+
+    Base = Offset = N;
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
+                                          SDValue &Base, SDValue &OffImm) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::ADD) {
+      return false; // We want to select register offset instead
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+        N.getOperand(0).getOpcode() != ISD::TargetConstantPool &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+    } else {
+      Base = N;
+    }
+
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  // If the RHS is + imm5 * scale, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
+    Base = N.getOperand(0);
+    OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  // Offset is too large, so use register offset instead.
+  return false;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 4, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 2, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 1, Base, OffImm);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    // Only multiples of 4 are allowed for the offset, so the frame object
+    // alignment must be at least 4.
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    if (MFI.getObjectAlignment(FI) < 4)
+      MFI.setObjectAlignment(FI, 4);
+    Base = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+      (LHSR && LHSR->getReg() == ARM::SP)) {
+    // If the RHS is + imm8 * scale, fold into addr mode.
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        // For LHS+RHS to result in an offset that's a multiple of 4 the object
+        // indexed by the LHS must be 4-byte aligned.
+        MachineFrameInfo &MFI = MF->getFrameInfo();
+        if (MFI.getObjectAlignment(FI) < 4)
+          MFI.setObjectAlignment(FI, 4);
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                        Thumb 2 Addressing Modes
+//===----------------------------------------------------------------------===//
+
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(
+          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
+        N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::TargetConstantPool)
+        return false;  // We want to select t2LDRpci instead.
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    if (SelectT2AddrModeImm8(N, Base, OffImm))
+      // Let t2LDRi8 handle (R - imm8).
+      return false;
+
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  // Match simple R - imm8 operands.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getSExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(
+            FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                                 SDValue &OffImm){
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  int RHSC;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
+    OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+      ? CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32)
+      : CurDAG->getTargetConstant(-RHSC, SDLoc(N), MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
+                                            SDValue &Base,
+                                            SDValue &OffReg, SDValue &ShImm) {
+  // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+      return false;
+    else if (RHSC < 0 && RHSC >= -255) // 8 bits
+      return false;
+  }
+
+  // Look for (R + R) or (R + (R << [1,2,3])).
+  unsigned ShAmt = 0;
+  Base   = N.getOperand(0);
+  OffReg = N.getOperand(1);
+
+  // Swap if it is ((R << c) + R).
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
+  if (ShOpcVal != ARM_AM::lsl) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
+    if (ShOpcVal == ARM_AM::lsl)
+      std::swap(Base, OffReg);
+  }
+
+  if (ShOpcVal == ARM_AM::lsl) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt))
+        OffReg = OffReg.getOperand(0);
+      else {
+        ShAmt = 0;
+      }
+    }
+  }
+
+  // If OffReg is a multiply-by-constant and it's profitable to extract a shift
+  // and use it in a shifted operand do so.
+  if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) {
+    unsigned PowerOfTwo = 0;
+    SDValue NewMulConst;
+    if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
+      replaceDAGValue(OffReg.getOperand(1), NewMulConst);
+      ShAmt = PowerOfTwo;
+    }
+  }
+
+  ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32);
+
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
+                                                SDValue &OffImm) {
+  // This *must* succeed since it's used for the irreplaceable ldrex and strex
+  // instructions.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+
+  if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N))
+    return true;
+
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS)
+    return true;
+
+  uint32_t RHSC = (int)RHS->getZExtValue();
+  if (RHSC > 1020 || RHSC % 4 != 0)
+    return true;
+
+  Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+  }
+
+  OffImm = CurDAG->getTargetConstant(RHSC/4, SDLoc(N), MVT::i32);
+  return true;
+}
+
+//===--------------------------------------------------------------------===//
+
+/// getAL - Returns a ARMCC::AL immediate node.
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
+  return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
+}
+
+void ARMDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+}
+
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return false;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Offset, AMOpc;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (LoadedVT == MVT::i32 && isPre &&
+      SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_PRE_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 && !isPre &&
+      SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_POST_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 &&
+      SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+    Match = true;
+
+  } else if (LoadedVT == MVT::i16 &&
+             SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+    Match = true;
+    Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+      ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+      : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+  } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+    if (LD->getExtensionType() == ISD::SEXTLOAD) {
+      if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+      }
+    } else {
+      if (isPre &&
+          SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_PRE_IMM;
+      } else if (!isPre &&
+                  SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_POST_IMM;
+      } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
+      }
+    }
+  }
+
+  if (Match) {
+    if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+                                           MVT::Other, Ops);
+      transferMemOperands(N, New);
+      ReplaceNode(N, New);
+      return true;
+    } else {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+                                           MVT::Other, Ops);
+      transferMemOperands(N, New);
+      ReplaceNode(N, New);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+      LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+    return false;
+
+  auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+  if (!COffs || COffs->getZExtValue() != 4)
+    return false;
+
+  // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+  // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+  // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+  // ISel.
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+                   CurDAG->getRegister(0, MVT::i32), Chain };
+  SDNode *New = CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32,
+                                       MVT::i32, MVT::Other, Ops);
+  transferMemOperands(N, New);
+  ReplaceNode(N, New);
+  return true;
+}
+
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return false;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+  SDValue Offset;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) {
+    switch (LoadedVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+      Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
+      break;
+    case MVT::i16:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST;
+      break;
+    case MVT::i8:
+    case MVT::i1:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
+      break;
+    default:
+      return false;
+    }
+    Match = true;
+  }
+
+  if (Match) {
+    SDValue Chain = LD->getChain();
+    SDValue Base = LD->getBasePtr();
+    SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
+                     CurDAG->getRegister(0, MVT::i32), Chain };
+    SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+                                         MVT::Other, Ops);
+    transferMemOperands(N, New);
+    ReplaceNode(N, New);
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a D register from a pair of S registers.
+SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, dl, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form a quad register from a pair of D registers.
+SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl,
+                                               MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers from a pair of Q registers.
+SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
+                                               MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive S registers.
+SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, dl, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, dl, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, dl, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive D registers.
+SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
+                                               MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, dl, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, dl, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// \brief Form 4 consecutive Q registers.
+SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  SDLoc dl(V0.getNode());
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, dl,
+                                               MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, dl, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, dl, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, dl, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
+}
+
+/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
+/// of a NEON VLD or VST instruction.  The supported values depend on the
+/// number of registers being loaded.
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
+                                       unsigned NumVecs, bool is64BitVector) {
+  unsigned NumRegs = NumVecs;
+  if (!is64BitVector && NumVecs < 3)
+    NumRegs *= 2;
+
+  unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+  if (Alignment >= 32 && NumRegs == 4)
+    Alignment = 32;
+  else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4))
+    Alignment = 16;
+  else if (Alignment >= 8)
+    Alignment = 8;
+  else
+    Alignment = 0;
+
+  return CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+}
+
+static bool isVLDfixed(unsigned Opc)
+{
+  switch (Opc) {
+  default: return false;
+  case ARM::VLD1d8wb_fixed : return true;
+  case ARM::VLD1d16wb_fixed : return true;
+  case ARM::VLD1d64Qwb_fixed : return true;
+  case ARM::VLD1d32wb_fixed : return true;
+  case ARM::VLD1d64wb_fixed : return true;
+  case ARM::VLD1d64TPseudoWB_fixed : return true;
+  case ARM::VLD1d64QPseudoWB_fixed : return true;
+  case ARM::VLD1q8wb_fixed : return true;
+  case ARM::VLD1q16wb_fixed : return true;
+  case ARM::VLD1q32wb_fixed : return true;
+  case ARM::VLD1q64wb_fixed : return true;
+  case ARM::VLD1DUPd8wb_fixed : return true;
+  case ARM::VLD1DUPd16wb_fixed : return true;
+  case ARM::VLD1DUPd32wb_fixed : return true;
+  case ARM::VLD1DUPq8wb_fixed : return true;
+  case ARM::VLD1DUPq16wb_fixed : return true;
+  case ARM::VLD1DUPq32wb_fixed : return true;
+  case ARM::VLD2d8wb_fixed : return true;
+  case ARM::VLD2d16wb_fixed : return true;
+  case ARM::VLD2d32wb_fixed : return true;
+  case ARM::VLD2q8PseudoWB_fixed : return true;
+  case ARM::VLD2q16PseudoWB_fixed : return true;
+  case ARM::VLD2q32PseudoWB_fixed : return true;
+  case ARM::VLD2DUPd8wb_fixed : return true;
+  case ARM::VLD2DUPd16wb_fixed : return true;
+  case ARM::VLD2DUPd32wb_fixed : return true;
+  }
+}
+
+static bool isVSTfixed(unsigned Opc)
+{
+  switch (Opc) {
+  default: return false;
+  case ARM::VST1d8wb_fixed : return true;
+  case ARM::VST1d16wb_fixed : return true;
+  case ARM::VST1d32wb_fixed : return true;
+  case ARM::VST1d64wb_fixed : return true;
+  case ARM::VST1q8wb_fixed : return true;
+  case ARM::VST1q16wb_fixed : return true;
+  case ARM::VST1q32wb_fixed : return true;
+  case ARM::VST1q64wb_fixed : return true;
+  case ARM::VST1d64TPseudoWB_fixed : return true;
+  case ARM::VST1d64QPseudoWB_fixed : return true;
+  case ARM::VST2d8wb_fixed : return true;
+  case ARM::VST2d16wb_fixed : return true;
+  case ARM::VST2d32wb_fixed : return true;
+  case ARM::VST2q8PseudoWB_fixed : return true;
+  case ARM::VST2q16PseudoWB_fixed : return true;
+  case ARM::VST2q32PseudoWB_fixed : return true;
+  }
+}
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+  assert((isVLDfixed(Opc) || isVSTfixed(Opc))
+    && "Incorrect fixed stride updating instruction.");
+  switch (Opc) {
+  default: break;
+  case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register;
+  case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register;
+  case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register;
+  case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register;
+  case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register;
+  case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register;
+  case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register;
+  case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
+  case ARM::VLD1d64Twb_fixed: return ARM::VLD1d64Twb_register;
+  case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
+  case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
+  case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
+  case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
+  case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
+  case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register;
+  case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
+  case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
+  case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
+
+  case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
+  case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
+  case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register;
+  case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register;
+  case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register;
+  case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register;
+  case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register;
+  case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register;
+  case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register;
+  case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register;
+
+  case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register;
+  case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register;
+  case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register;
+  case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register;
+  case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register;
+  case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register;
+
+  case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register;
+  case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register;
+  case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register;
+  case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register;
+  case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register;
+  case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register;
+
+  case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register;
+  case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register;
+  case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register;
+  }
+  return Opc; // If not one we handle, return it unchanged.
+}
+
+void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                const uint16_t *DOpcodes,
+                                const uint16_t *QOpcodes0,
+                                const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2f64:
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
+    break;
+  }
+
+  EVT ResTy;
+  if (NumVecs == 1)
+    ResTy = VT;
+  else {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+  }
+  std::vector<EVT> ResTys;
+  ResTys.push_back(ResTy);
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDNode *VLd;
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VLD1/VLD2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if ((NumVecs > 2 && !isVLDfixed(Opc)) ||
+          !isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  } else {
+    // Otherwise, quad registers are loaded with two separate instructions,
+    // where one loads the even registers and the other loads the odd registers.
+    EVT AddrTy = MemAddr.getValueType();
+
+    // Load the even subregs.  This is always an updating load, so that it
+    // provides the address to the second load for the odd subregs.
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                          ResTy, AddrTy, MVT::Other, OpsA);
+    Chain = SDValue(VLdA, 2);
+
+    // Load the odd subregs.
+    Ops.push_back(SDValue(VLdA, 1));
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      assert(isa<ConstantSDNode>(Inc.getNode()) &&
+             "only constant post-increment update allowed for VLD3/4");
+      (void)Inc;
+      Ops.push_back(Reg0);
+    }
+    Ops.push_back(SDValue(VLdA, 0));
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, Ops);
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+  if (NumVecs == 1) {
+    ReplaceNode(N, VLd);
+    return;
+  }
+
+  // Extract out the subregisters.
+  SDValue SuperReg = SDValue(VLd, 0);
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+                    ARM::qsub_3 == ARM::qsub_0 + 3,
+                "Unexpected subreg numbering");
+  unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                const uint16_t *DOpcodes,
+                                const uint16_t *QOpcodes0,
+                                const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, dl, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vst type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2f64:
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VST1");
+    break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VST1/VST2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    SDValue SrcReg;
+    if (NumVecs == 1) {
+      SrcReg = N->getOperand(Vec0Idx);
+    } else if (is64BitVector) {
+      // Form a REG_SEQUENCE to force register allocation.
+      SDValue V0 = N->getOperand(Vec0Idx + 0);
+      SDValue V1 = N->getOperand(Vec0Idx + 1);
+      if (NumVecs == 2)
+        SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+      else {
+        SDValue V2 = N->getOperand(Vec0Idx + 2);
+        // If it's a vst3, form a quad D-register and leave the last part as
+        // an undef.
+        SDValue V3 = (NumVecs == 3)
+          ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
+          : N->getOperand(Vec0Idx + 3);
+        SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+      }
+    } else {
+      // Form a QQ register.
+      SDValue Q0 = N->getOperand(Vec0Idx);
+      SDValue Q1 = N->getOperand(Vec0Idx + 1);
+      SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0);
+    }
+
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if  (!isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(Inc);
+      else if (NumVecs > 2 && !isVSTfixed(Opc))
+        Ops.push_back(Reg0);
+    }
+    Ops.push_back(SrcReg);
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+    // Transfer memoperands.
+    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+    ReplaceNode(N, VSt);
+    return;
+  }
+
+  // Otherwise, quad registers are stored with two separate instructions,
+  // where one stores the even registers and the other stores the odd registers.
+
+  // Form the QQQQ REG_SEQUENCE.
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  SDValue V2 = N->getOperand(Vec0Idx + 2);
+  SDValue V3 = (NumVecs == 3)
+    ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+    : N->getOperand(Vec0Idx + 3);
+  SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+
+  // Store the even D registers.  This is always an updating store, so that it
+  // provides the address to the second store for the odd subregs.
+  const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
+  SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                        MemAddr.getValueType(),
+                                        MVT::Other, OpsA);
+  cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
+  Chain = SDValue(VStA, 1);
+
+  // Store the odd D registers.
+  Ops.push_back(SDValue(VStA, 0));
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    assert(isa<ConstantSDNode>(Inc.getNode()) &&
+           "only constant post-increment update allowed for VST3/4");
+    (void)Inc;
+    Ops.push_back(Reg0);
+  }
+  Ops.push_back(RegSeq);
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+  SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                        Ops);
+  cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
+  ReplaceNode(N, VStB);
+}
+
+void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                                      unsigned NumVecs,
+                                      const uint16_t *DOpcodes,
+                                      const uint16_t *QOpcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  unsigned Lane =
+    cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld/vst lane type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+    // Quad-register operations:
+  case MVT::v8i16: OpcodeIndex = 0; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 1; break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (IsLoad) {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+                                      MVT::i64, ResTyElts));
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+  }
+
+  SDValue SuperReg;
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  if (NumVecs == 2) {
+    if (is64BitVector)
+      SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+    else
+      SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0);
+  } else {
+    SDValue V2 = N->getOperand(Vec0Idx + 2);
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(Vec0Idx + 3);
+    if (is64BitVector)
+      SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+    else
+      SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+  }
+  Ops.push_back(SuperReg);
+  Ops.push_back(getI32Imm(Lane, dl));
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                                  QOpcodes[OpcodeIndex]);
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+  if (!IsLoad) {
+    ReplaceNode(N, VLdLn);
+    return;
+  }
+
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+                    ARM::qsub_3 == ARM::qsub_0 + 3,
+                "Unexpected subreg numbering");
+  unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+  SDLoc dl(N);
+
+  SDValue MemAddr, Align;
+  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+    return;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
+
+  unsigned Opc;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld-dup type");
+  case MVT::v8i8:  Opc = DOpcodes[0]; break;
+  case MVT::v16i8: Opc = QOpcodes[0]; break;
+  case MVT::v4i16: Opc = DOpcodes[1]; break;
+  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v2f32:
+  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v4f32:
+  case MVT::v4i32: Opc = QOpcodes[2]; break;
+  }
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    // fixed-stride update instructions don't have an explicit writeback
+    // operand. It's implicit in the opcode itself.
+    SDValue Inc = N->getOperand(2);
+    if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    if (!isa<ConstantSDNode>(Inc.getNode()))
+      Ops.push_back(Inc);
+    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+    else if (NumVecs > 2)
+      Ops.push_back(Reg0);
+  }
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  std::vector<EVT> ResTys;
+  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+  // Extract the subregisters.
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0));
+  } else {
+    SDValue SuperReg = SDValue(VLdDup, 0);
+    static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
+    unsigned SubIdx = ARM::dsub_0;
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      ReplaceUses(SDValue(N, Vec),
+                  CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  }
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+                                 unsigned Opc) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  unsigned FirstTblReg = IsExt ? 2 : 1;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SDValue RegSeq;
+  SDValue V0 = N->getOperand(FirstTblReg + 0);
+  SDValue V1 = N->getOperand(FirstTblReg + 1);
+  if (NumVecs == 2)
+    RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+  else {
+    SDValue V2 = N->getOperand(FirstTblReg + 2);
+    // If it's a vtbl3, form a quad D-register and leave the last part as
+    // an undef.
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(FirstTblReg + 3);
+    RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+  }
+
+  SmallVector<SDValue, 6> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
+  Ops.push_back(getAL(CurDAG, dl)); // predicate
+  Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+}
+
+bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
+  if (!Subtarget->hasV6T2Ops())
+    return false;
+
+  unsigned Opc = isSigned
+    ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
+    : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
+  SDLoc dl(N);
+
+  // For unsigned extracts, check for a shift right and mask
+  unsigned And_imm = 0;
+  if (N->getOpcode() == ISD::AND) {
+    if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) {
+
+      // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+      if (And_imm & (And_imm + 1))
+        return false;
+
+      unsigned Srl_imm = 0;
+      if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
+                                Srl_imm)) {
+        assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+
+        // Note: The width operand is encoded as width-1.
+        unsigned Width = countTrailingOnes(And_imm) - 1;
+        unsigned LSB = Srl_imm;
+
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+        if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
+          // It's cheaper to use a right shift to extract the top bits.
+          if (Subtarget->isThumb()) {
+            Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
+            SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                              CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+                              getAL(CurDAG, dl), Reg0, Reg0 };
+            CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+            return true;
+          }
+
+          // ARM models shift instructions as MOVsi with shifter operand.
+          ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL);
+          SDValue ShOpc =
+            CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB), dl,
+                                      MVT::i32);
+          SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
+                            getAL(CurDAG, dl), Reg0, Reg0 };
+          CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+          return true;
+        }
+
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+                          CurDAG->getTargetConstant(Width, dl, MVT::i32),
+                          getAL(CurDAG, dl), Reg0 };
+        CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Otherwise, we're looking for a shift of a shift
+  unsigned Shl_imm = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+    unsigned Srl_imm = 0;
+    if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = 32 - Srl_imm - 1;
+      int LSB = Srl_imm - Shl_imm;
+      if (LSB < 0)
+        return false;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+                        CurDAG->getTargetConstant(Width, dl, MVT::i32),
+                        getAL(CurDAG, dl), Reg0 };
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return true;
+    }
+  }
+
+  // Or we are looking for a shift of an and, with a mask operand
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) &&
+      isShiftedMask_32(And_imm)) {
+    unsigned Srl_imm = 0;
+    unsigned LSB = countTrailingZeros(And_imm);
+    // Shift must be the same as the ands lsb
+    if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      unsigned MSB = 31 - countLeadingZeros(And_imm);
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = MSB - LSB;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
+                        CurDAG->getTargetConstant(Width, dl, MVT::i32),
+                        getAL(CurDAG, dl), Reg0 };
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return true;
+    }
+  }
+
+  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+    unsigned LSB = 0;
+    if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
+        !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
+      return false;
+
+    if (LSB + Width > 32)
+      return false;
+
+    SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                      CurDAG->getTargetConstant(LSB, dl, MVT::i32),
+                      CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
+                      getAL(CurDAG, dl), Reg0 };
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    return true;
+  }
+
+  return false;
+}
+
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X,  0,  X, -X
+/// select_cc setgt    X, -1,  X, -X
+/// select_cc setl[te] X,  0, -X,  X
+/// select_cc setlt    X,  1, -X,  X
+/// which represent Integer ABS into:
+/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to
+/// ARM::ABS or ARM::t2ABS machine node.
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
+  SDValue XORSrc0 = N->getOperand(0);
+  SDValue XORSrc1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
+    return false;
+
+  SDValue ADDSrc0 = XORSrc0.getOperand(0);
+  SDValue ADDSrc1 = XORSrc0.getOperand(1);
+  SDValue SRASrc0 = XORSrc1.getOperand(0);
+  SDValue SRASrc1 = XORSrc1.getOperand(1);
+  ConstantSDNode *SRAConstant =  dyn_cast<ConstantSDNode>(SRASrc1);
+  EVT XType = SRASrc0.getValueType();
+  unsigned Size = XType.getSizeInBits() - 1;
+
+  if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 &&
+      XType.isInteger() && SRAConstant != nullptr &&
+      Size == SRAConstant->getZExtValue()) {
+    unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
+    CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+    return true;
+  }
+
+  return false;
+}
+
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+                                 bool Accumulate) {
+  // For SM*WB, we need to some form of sext.
+  // For SM*WT, we need to search for (sra X, 16)
+  // Src1 then gets set to X.
+  if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+       SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+       SignExt.getOpcode() == ISD::AssertSext) &&
+       SignExt.getValueType() == MVT::i32) {
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+    Src1 = SignExt.getOperand(0);
+    return true;
+  }
+
+  if (SignExt.getOpcode() != ISD::SRA)
+    return false;
+
+  ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+  if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+    return false;
+
+  SDValue Op0 = SignExt.getOperand(0);
+
+  // The sign extend operand for SM*WB could be generated by a shl and ashr.
+  if (Op0.getOpcode() == ISD::SHL) {
+    SDValue SHL = Op0;
+    ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+    if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+      return false;
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+    Src1 = Op0.getOperand(0);
+    return true;
+  }
+  *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+  Src1 = SignExt.getOperand(0);
+  return true;
+}
+
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+                                SDValue &Src1, bool Accumulate) {
+  // First we look for:
+  // (add (or (srl ?, 16), (shl ?, 16)))
+  if (OR.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue SRL = OR.getOperand(0);
+  SDValue SHL = OR.getOperand(1);
+
+  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+    SRL = OR.getOperand(1);
+    SHL = OR.getOperand(0);
+    if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+      return false;
+  }
+
+  ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+  ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+      SHLSrc1->getZExtValue() != 16)
+    return false;
+
+  // The first operands to the shifts need to be the two results from the
+  // same smul_lohi node.
+  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+    return false;
+
+  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+    return false;
+
+  // Now we have:
+  // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
+  // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
+  // For SMLAWB the 16-bit value will signed extended somehow.
+  // For SMLAWT only the SRA is required.
+
+  // Check both sides of SMUL_LOHI
+  if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+    Src0 = SMULLOHI->getOperand(1);
+  } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+                                  Accumulate)) {
+    Src0 = SMULLOHI->getOperand(0);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
+  if (!Subtarget->hasV6Ops() ||
+      (Subtarget->isThumb() && !Subtarget->hasThumb2()))
+    return false;
+
+  SDLoc dl(N);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue A, B;
+  unsigned Opc = 0;
+
+  if (N->getOpcode() == ISD::ADD) {
+    if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+      return false;
+
+    SDValue Acc;
+    if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+      Acc = Src1;
+    } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+      Acc = Src0;
+    } else {
+      return false;
+    }
+    if (Opc == 0)
+      return false;
+
+    SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+    return true;
+  } else if (N->getOpcode() == ISD::OR &&
+             SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
+    if (Opc == 0)
+      return false;
+
+    SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32)};
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    return true;
+  }
+  return false;
+}
+
+/// We've got special pseudo-instructions for these
+void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+  unsigned Opcode;
+  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+  if (MemTy == MVT::i8)
+    Opcode = ARM::CMP_SWAP_8;
+  else if (MemTy == MVT::i16)
+    Opcode = ARM::CMP_SWAP_16;
+  else if (MemTy == MVT::i32)
+    Opcode = ARM::CMP_SWAP_32;
+  else
+    llvm_unreachable("Unknown AtomicCmpSwap type");
+
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = CurDAG->getMachineNode(
+      Opcode, SDLoc(N),
+      CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops);
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  EVT VT = N->getValueType(0);
+  if (!VT.is128BitVector() || N->getNumOperands() != 2)
+    llvm_unreachable("unexpected CONCAT_VECTORS");
+  ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
+}
+
+static Optional<std::pair<unsigned, unsigned>>
+getContiguousRangeOfSetBits(const APInt &A) {
+  unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
+  unsigned LastOne = A.countTrailingZeros();
+  if (A.countPopulation() != (FirstOne - LastOne + 1))
+    return Optional<std::pair<unsigned,unsigned>>();
+  return std::make_pair(FirstOne, LastOne);
+}
+
+void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
+  assert(N->getOpcode() == ARMISD::CMPZ);
+  SwitchEQNEToPLMI = false;
+
+  if (!Subtarget->isThumb())
+    // FIXME: Work out whether it is profitable to do this in A32 mode - LSL and
+    // LSR don't exist as standalone instructions - they need the barrel shifter.
+    return;
+
+  // select (cmpz (and X, C), #0) -> (LSLS X) or (LSRS X) or (LSRS (LSLS X))
+  SDValue And = N->getOperand(0);
+  if (!And->hasOneUse())
+    return;
+
+  SDValue Zero = N->getOperand(1);
+  if (!isa<ConstantSDNode>(Zero) || !cast<ConstantSDNode>(Zero)->isNullValue() ||
+      And->getOpcode() != ISD::AND)
+    return;
+  SDValue X = And.getOperand(0);
+  auto C = dyn_cast<ConstantSDNode>(And.getOperand(1));
+
+  if (!C || !X->hasOneUse())
+    return;
+  auto Range = getContiguousRangeOfSetBits(C->getAPIntValue());
+  if (!Range)
+    return;
+
+  // There are several ways to lower this:
+  SDNode *NewN;
+  SDLoc dl(N);
+
+  auto EmitShift = [&](unsigned Opc, SDValue Src, unsigned Imm) -> SDNode* {
+    if (Subtarget->isThumb2()) {
+      Opc = (Opc == ARM::tLSLri) ? ARM::t2LSLri : ARM::t2LSRri;
+      SDValue Ops[] = { Src, CurDAG->getTargetConstant(Imm, dl, MVT::i32),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+    } else {
+      SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), Src,
+                       CurDAG->getTargetConstant(Imm, dl, MVT::i32),
+                       getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+    }
+  };
+  
+  if (Range->second == 0) {
+    //  1. Mask includes the LSB -> Simply shift the top N bits off
+    NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+    ReplaceNode(And.getNode(), NewN);
+  } else if (Range->first == 31) {
+    //  2. Mask includes the MSB -> Simply shift the bottom N bits off
+    NewN = EmitShift(ARM::tLSRri, X, Range->second);
+    ReplaceNode(And.getNode(), NewN);
+  } else if (Range->first == Range->second) {
+    //  3. Only one bit is set. We can shift this into the sign bit and use a
+    //     PL/MI comparison.
+    NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+    ReplaceNode(And.getNode(), NewN);
+
+    SwitchEQNEToPLMI = true;
+  } else if (!Subtarget->hasV6T2Ops()) {
+    //  4. Do a double shift to clear bottom and top bits, but only in
+    //     thumb-1 mode as in thumb-2 we can use UBFX.
+    NewN = EmitShift(ARM::tLSLri, X, 31 - Range->first);
+    NewN = EmitShift(ARM::tLSRri, SDValue(NewN, 0),
+                     Range->second + (31 - Range->first));
+    ReplaceNode(And.getNode(), NewN);
+  }
+
+}
+
+void ARMDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ADD:
+  case ISD::OR:
+    if (trySMLAWSMULW(N))
+      return;
+    break;
+  case ISD::WRITE_REGISTER:
+    if (tryWriteRegister(N))
+      return;
+    break;
+  case ISD::READ_REGISTER:
+    if (tryReadRegister(N))
+      return;
+    break;
+  case ISD::INLINEASM:
+    if (tryInlineAsm(N))
+      return;
+    break;
+  case ISD::XOR:
+    // Select special operations if XOR node forms integer ABS pattern
+    if (tryABSOp(N))
+      return;
+    // Other cases are autogenerated.
+    break;
+  case ISD::Constant: {
+    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+    // If we can't materialize the constant we need to use a literal pool
+    if (ConstantMaterializationCost(Val) > 2) {
+      SDValue CPIdx = CurDAG->getTargetConstantPool(
+          ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+          TLI->getPointerTy(CurDAG->getDataLayout()));
+
+      SDNode *ResNode;
+      if (Subtarget->isThumb()) {
+        SDValue Pred = getAL(CurDAG, dl);
+        SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+        SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+        ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
+                                         Ops);
+      } else {
+        SDValue Ops[] = {
+          CPIdx,
+          CurDAG->getTargetConstant(0, dl, MVT::i32),
+          getAL(CurDAG, dl),
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getEntryNode()
+        };
+        ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+                                         Ops);
+      }
+      ReplaceNode(N, ResNode);
+      return;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::FrameIndex: {
+    // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(
+        FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+    if (Subtarget->isThumb1Only()) {
+      // Set the alignment of the frame object to 4, to avoid having to generate
+      // more than one ADD
+      MachineFrameInfo &MFI = MF->getFrameInfo();
+      if (MFI.getObjectAlignment(FI) < 4)
+        MFI.setObjectAlignment(FI, 4);
+      CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+                           CurDAG->getTargetConstant(0, dl, MVT::i32));
+      return;
+    } else {
+      unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+                      ARM::t2ADDri : ARM::ADDri);
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return;
+    }
+  }
+  case ISD::SRL:
+    if (tryV6T2BitfieldExtractOp(N, false))
+      return;
+    break;
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::SRA:
+    if (tryV6T2BitfieldExtractOp(N, true))
+      return;
+    break;
+  case ISD::MUL:
+    if (Subtarget->isThumb1Only())
+      break;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned RHSV = C->getZExtValue();
+      if (!RHSV) break;
+      if (isPowerOf2_32(RHSV-1)) {  // 2^n+1?
+        unsigned ShImm = Log2_32(RHSV-1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+          CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+          return;
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+                            Reg0 };
+          CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+          return;
+        }
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
+        unsigned ShImm = Log2_32(RHSV+1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, dl, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
+          CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+          return;
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
+                            Reg0 };
+          CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+          return;
+        }
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check for unsigned bitfield extract
+    if (tryV6T2BitfieldExtractOp(N, false))
+      return;
+
+    // If an immediate is used in an AND node, it is possible that the immediate
+    // can be more optimally materialized when negated. If this is the case we
+    // can negate the immediate and use a BIC instead.
+    auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+      uint32_t Imm = (uint32_t) N1C->getZExtValue();
+
+      // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+      // immediate can be negated and fit in the immediate operand of
+      // a t2BIC, don't do any manual transform here as this can be
+      // handled by the generic ISel machinery.
+      bool PreferImmediateEncoding =
+        Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
+      if (!PreferImmediateEncoding &&
+          ConstantMaterializationCost(Imm) >
+              ConstantMaterializationCost(~Imm)) {
+        // The current immediate costs more to materialize than a negated
+        // immediate, so negate the immediate and use a BIC.
+        SDValue NewImm =
+          CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32);
+        // If the new constant didn't exist before, reposition it in the topological
+        // ordering so it is just before N. Otherwise, don't touch its location.
+        if (NewImm->getNodeId() == -1)
+          CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+        if (!Subtarget->hasThumb2()) {
+          SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+                           N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                           CurDAG->getRegister(0, MVT::i32)};
+          ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+          return;
+        } else {
+          SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                           CurDAG->getRegister(0, MVT::i32),
+                           CurDAG->getRegister(0, MVT::i32)};
+          ReplaceNode(N,
+                      CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+          return;
+        }
+      }
+    }
+
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff) >> 16)
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+    N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  dl, MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+        ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+        return;
+      }
+    }
+
+    break;
+  }
+  case ARMISD::VMOVRRD:
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                          N->getOperand(0), getAL(CurDAG, dl),
+                                          CurDAG->getRegister(0, MVT::i32)));
+    return;
+  case ISD::UMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
+      return;
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
+    }
+  }
+  case ISD::SMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
+      return;
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
+    }
+  }
+  case ARMISD::UMAAL: {
+    unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3),
+                      getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops));
+    return;
+  }
+  case ARMISD::UMLAL:{
+    // UMAAL is similar to UMLAL but it adds two 32-bit values to the
+    // 64-bit multiplication result.
+    if (Subtarget->hasV6Ops() && Subtarget->hasDSP() &&
+        N->getOperand(2).getOpcode() == ARMISD::ADDC &&
+        N->getOperand(3).getOpcode() == ARMISD::ADDE) {
+
+      SDValue Addc = N->getOperand(2);
+      SDValue Adde = N->getOperand(3);
+
+      if (Adde.getOperand(2).getNode() == Addc.getNode()) {
+
+        ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
+        ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
+
+        if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
+        {
+          // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
+          // RdLo = one operand to be added, lower 32-bits of res
+          // RdHi = other operand to be added, upper 32-bits of res
+          // Rn = first multiply operand
+          // Rm = second multiply operand
+          SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                            Addc.getOperand(0), Addc.getOperand(1),
+                            getAL(CurDAG, dl),
+                            CurDAG->getRegister(0, MVT::i32) };
+          unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+          CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
+          return;
+        }
+      }
+    }
+
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG, dl),
+                        CurDAG->getRegister(0, MVT::i32)};
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops));
+      return;
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG, dl),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
+    }
+  }
+  case ARMISD::SMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG, dl),
+                        CurDAG->getRegister(0, MVT::i32)};
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops));
+      return;
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG, dl),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
+    }
+  }
+  case ARMISD::SUBE: {
+    if (!Subtarget->hasV6Ops())
+      break;
+    // Look for a pattern to match SMMLS
+    // (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b))))
+    if (N->getOperand(1).getOpcode() != ISD::SMUL_LOHI ||
+        N->getOperand(2).getOpcode() != ARMISD::SUBC ||
+        !SDValue(N, 1).use_empty())
+      break;
+
+    if (Subtarget->isThumb())
+      assert(Subtarget->hasThumb2() &&
+             "This pattern should not be generated for Thumb");
+
+    SDValue SmulLoHi = N->getOperand(1);
+    SDValue Subc = N->getOperand(2);
+    auto *Zero = dyn_cast<ConstantSDNode>(Subc.getOperand(0));
+
+    if (!Zero || Zero->getZExtValue() != 0 ||
+        Subc.getOperand(1) != SmulLoHi.getValue(0) ||
+        N->getOperand(1) != SmulLoHi.getValue(1) ||
+        N->getOperand(2) != Subc.getValue(1))
+      break;
+
+    unsigned Opc = Subtarget->isThumb2() ? ARM::t2SMMLS : ARM::SMMLS;
+    SDValue Ops[] = { SmulLoHi.getOperand(0), SmulLoHi.getOperand(1),
+                      N->getOperand(0), getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops));
+    return;
+  }
+  case ISD::LOAD: {
+    if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
+      if (tryT2IndexedLoad(N))
+        return;
+    } else if (Subtarget->isThumb()) {
+      if (tryT1IndexedLoad(N))
+        return;
+    } else if (tryARMIndexedLoad(N))
+      return;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ARMISD::BRCOND: {
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    unsigned Opc = Subtarget->isThumb() ?
+      ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
+    SDValue Chain = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    SDValue InFlag = N->getOperand(4);
+    assert(N1.getOpcode() == ISD::BasicBlock);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue();
+    
+    if (InFlag.getOpcode() == ARMISD::CMPZ) {
+      bool SwitchEQNEToPLMI;
+      SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
+      InFlag = N->getOperand(4);
+
+      if (SwitchEQNEToPLMI) {
+        switch ((ARMCC::CondCodes)CC) {
+        default: llvm_unreachable("CMPZ must be either NE or EQ!");
+        case ARMCC::NE:
+          CC = (unsigned)ARMCC::MI;
+          break;
+        case ARMCC::EQ:
+          CC = (unsigned)ARMCC::PL;
+          break;
+        }
+      }
+    }
+
+    SDValue Tmp2 = CurDAG->getTargetConstant(CC, dl, MVT::i32);
+    SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+    SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+                                             MVT::Glue, Ops);
+    Chain = SDValue(ResNode, 0);
+    if (N->getNumValues() == 2) {
+      InFlag = SDValue(ResNode, 1);
+      ReplaceUses(SDValue(N, 1), InFlag);
+    }
+    ReplaceUses(SDValue(N, 0),
+                SDValue(Chain.getNode(), Chain.getResNo()));
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+
+  case ARMISD::CMPZ: {
+    // select (CMPZ X, #-C) -> (CMPZ (ADDS X, #C), #0)
+    //   This allows us to avoid materializing the expensive negative constant.
+    //   The CMPZ #0 is useless and will be peepholed away but we need to keep it
+    //   for its glue output.
+    SDValue X = N->getOperand(0);
+    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1).getNode());
+    if (C && C->getSExtValue() < 0 && Subtarget->isThumb()) {
+      int64_t Addend = -C->getSExtValue();
+
+      SDNode *Add = nullptr;
+      // In T2 mode, ADDS can be better than CMN if the immediate fits in a
+      // 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3.
+      // Outside that range we can just use a CMN which is 32-bit but has a
+      // 12-bit immediate range.
+      if (Subtarget->isThumb2() && Addend < 1<<8) {
+        SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+                          getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                          CurDAG->getRegister(0, MVT::i32) };
+        Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
+      } else if (!Subtarget->isThumb2() && Addend < 1<<8) {
+        // FIXME: Add T1 tADDi8 code.
+        SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+                         CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+        Add = CurDAG->getMachineNode(ARM::tADDi8, dl, MVT::i32, Ops);
+      } else if (!Subtarget->isThumb2() && Addend < 1<<3) {
+        SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+                         CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+        Add = CurDAG->getMachineNode(ARM::tADDi3, dl, MVT::i32, Ops);
+      }
+      if (Add) {
+        SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)};
+        CurDAG->MorphNodeTo(N, ARMISD::CMPZ, CurDAG->getVTList(MVT::Glue), Ops2);
+      }
+    }
+    // Other cases are autogenerated.
+    break;
+  }
+
+  case ARMISD::CMOV: {
+    SDValue InFlag = N->getOperand(4);
+
+    if (InFlag.getOpcode() == ARMISD::CMPZ) {
+      bool SwitchEQNEToPLMI;
+      SelectCMPZ(InFlag.getNode(), SwitchEQNEToPLMI);
+
+      if (SwitchEQNEToPLMI) {
+        SDValue ARMcc = N->getOperand(2);
+        ARMCC::CondCodes CC =
+          (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+        switch (CC) {
+        default: llvm_unreachable("CMPZ must be either NE or EQ!");
+        case ARMCC::NE:
+          CC = ARMCC::MI;
+          break;
+        case ARMCC::EQ:
+          CC = ARMCC::PL;
+          break;
+        }
+        SDValue NewARMcc = CurDAG->getConstant((unsigned)CC, dl, MVT::i32);
+        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), NewARMcc,
+                         N->getOperand(3), N->getOperand(4)};
+        CurDAG->MorphNodeTo(N, ARMISD::CMOV, N->getVTList(), Ops);
+      }
+
+    }
+    // Other cases are autogenerated.
+    break;
+  }
+    
+  case ARMISD::VZIP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return;
+    case MVT::v8i8:  Opc = ARM::VZIPd8; break;
+    case MVT::v4i16: Opc = ARM::VZIPd16; break;
+    case MVT::v2f32:
+    // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VZIPq8; break;
+    case MVT::v8i16: Opc = ARM::VZIPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VZIPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG, dl);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
+  }
+  case ARMISD::VUZP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return;
+    case MVT::v8i8:  Opc = ARM::VUZPd8; break;
+    case MVT::v4i16: Opc = ARM::VUZPd16; break;
+    case MVT::v2f32:
+    // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VUZPq8; break;
+    case MVT::v8i16: Opc = ARM::VUZPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VUZPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG, dl);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
+  }
+  case ARMISD::VTRN: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return;
+    case MVT::v8i8:  Opc = ARM::VTRNd8; break;
+    case MVT::v4i16: Opc = ARM::VTRNd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VTRNq8; break;
+    case MVT::v8i16: Opc = ARM::VTRNq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VTRNq32; break;
+    }
+    SDValue Pred = getAL(CurDAG, dl);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
+  }
+  case ARMISD::BUILD_VECTOR: {
+    EVT VecVT = N->getValueType(0);
+    EVT EltVT = VecVT.getVectorElementType();
+    unsigned NumElts = VecVT.getVectorNumElements();
+    if (EltVT == MVT::f64) {
+      assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
+      ReplaceNode(
+          N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+      return;
+    }
+    assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
+    if (NumElts == 2) {
+      ReplaceNode(
+          N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+      return;
+    }
+    assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
+    ReplaceNode(N,
+                createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+                                    N->getOperand(2), N->getOperand(3)));
+    return;
+  }
+
+  case ARMISD::VLD1DUP: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16,
+                                         ARM::VLD1DUPd32 };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
+                                         ARM::VLD1DUPq32 };
+    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VLD2DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                        ARM::VLD2DUPd32 };
+    SelectVLDDup(N, false, 2, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD3DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                        ARM::VLD3DUPd16Pseudo,
+                                        ARM::VLD3DUPd32Pseudo };
+    SelectVLDDup(N, false, 3, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD4DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                        ARM::VLD4DUPd16Pseudo,
+                                        ARM::VLD4DUPd32Pseudo };
+    SelectVLDDup(N, false, 4, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD1DUP_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed,
+                                         ARM::VLD1DUPd16wb_fixed,
+                                         ARM::VLD1DUPd32wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
+                                         ARM::VLD1DUPq16wb_fixed,
+                                         ARM::VLD1DUPq32wb_fixed };
+    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VLD2DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
+                                        ARM::VLD2DUPd16wb_fixed,
+                                        ARM::VLD2DUPd32wb_fixed };
+    SelectVLDDup(N, true, 2, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD3DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+                                        ARM::VLD3DUPd16Pseudo_UPD,
+                                        ARM::VLD3DUPd32Pseudo_UPD };
+    SelectVLDDup(N, true, 3, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD4DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+                                        ARM::VLD4DUPd16Pseudo_UPD,
+                                        ARM::VLD4DUPd32Pseudo_UPD };
+    SelectVLDDup(N, true, 4, Opcodes);
+    return;
+  }
+
+  case ARMISD::VLD1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed,
+                                         ARM::VLD1d16wb_fixed,
+                                         ARM::VLD1d32wb_fixed,
+                                         ARM::VLD1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed,
+                                         ARM::VLD1q16wb_fixed,
+                                         ARM::VLD1q32wb_fixed,
+                                         ARM::VLD1q64wb_fixed };
+    SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    return;
+  }
+
+  case ARMISD::VLD2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
+                                         ARM::VLD2d16wb_fixed,
+                                         ARM::VLD2d32wb_fixed,
+                                         ARM::VLD1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
+                                         ARM::VLD2q16PseudoWB_fixed,
+                                         ARM::VLD2q32PseudoWB_fixed };
+    SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    return;
+  }
+
+  case ARMISD::VLD3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
+                                         ARM::VLD3d16Pseudo_UPD,
+                                         ARM::VLD3d32Pseudo_UPD,
+                                         ARM::VLD1d64TPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                          ARM::VLD3q16Pseudo_UPD,
+                                          ARM::VLD3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+                                          ARM::VLD3q16oddPseudo_UPD,
+                                          ARM::VLD3q32oddPseudo_UPD };
+    SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case ARMISD::VLD4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
+                                         ARM::VLD4d16Pseudo_UPD,
+                                         ARM::VLD4d32Pseudo_UPD,
+                                         ARM::VLD1d64QPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                          ARM::VLD4q16Pseudo_UPD,
+                                          ARM::VLD4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+                                          ARM::VLD4q16oddPseudo_UPD,
+                                          ARM::VLD4q32oddPseudo_UPD };
+    SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case ARMISD::VLD2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
+                                         ARM::VLD2LNd16Pseudo_UPD,
+                                         ARM::VLD2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+                                         ARM::VLD2LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VLD3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD,
+                                         ARM::VLD3LNd16Pseudo_UPD,
+                                         ARM::VLD3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+                                         ARM::VLD3LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VLD4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD,
+                                         ARM::VLD4LNd16Pseudo_UPD,
+                                         ARM::VLD4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+                                         ARM::VLD4LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VST1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed,
+                                         ARM::VST1d16wb_fixed,
+                                         ARM::VST1d32wb_fixed,
+                                         ARM::VST1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed,
+                                         ARM::VST1q16wb_fixed,
+                                         ARM::VST1q32wb_fixed,
+                                         ARM::VST1q64wb_fixed };
+    SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    return;
+  }
+
+  case ARMISD::VST2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
+                                         ARM::VST2d16wb_fixed,
+                                         ARM::VST2d32wb_fixed,
+                                         ARM::VST1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
+                                         ARM::VST2q16PseudoWB_fixed,
+                                         ARM::VST2q32PseudoWB_fixed };
+    SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    return;
+  }
+
+  case ARMISD::VST3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD,
+                                         ARM::VST3d16Pseudo_UPD,
+                                         ARM::VST3d32Pseudo_UPD,
+                                         ARM::VST1d64TPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                          ARM::VST3q16Pseudo_UPD,
+                                          ARM::VST3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+                                          ARM::VST3q16oddPseudo_UPD,
+                                          ARM::VST3q32oddPseudo_UPD };
+    SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case ARMISD::VST4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
+                                         ARM::VST4d16Pseudo_UPD,
+                                         ARM::VST4d32Pseudo_UPD,
+                                         ARM::VST1d64QPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                          ARM::VST4q16Pseudo_UPD,
+                                          ARM::VST4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+                                          ARM::VST4q16oddPseudo_UPD,
+                                          ARM::VST4q32oddPseudo_UPD };
+    SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
+  }
+
+  case ARMISD::VST2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+                                         ARM::VST2LNd16Pseudo_UPD,
+                                         ARM::VST2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+                                         ARM::VST2LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VST3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+                                         ARM::VST3LNd16Pseudo_UPD,
+                                         ARM::VST3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+                                         ARM::VST3LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ARMISD::VST4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+                                         ARM::VST4LNd16Pseudo_UPD,
+                                         ARM::VST4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+                                         ARM::VST4LNq32Pseudo_UPD };
+    SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+    return;
+  }
+
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_mrrc:
+    case Intrinsic::arm_mrrc2: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      unsigned Opc;
+
+      if (Subtarget->isThumb())
+        Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2);
+      else
+        Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
+
+      SmallVector<SDValue, 5> Ops;
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */
+
+      // The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded
+      // instruction will always be '1111' but it is possible in assembly language to specify
+      // AL as a predicate to mrrc2 but it doesn't make any difference to the encoded instruction.
+      if (Opc != ARM::MRRC2) {
+        Ops.push_back(getAL(CurDAG, dl));
+        Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      }
+
+      Ops.push_back(Chain);
+
+      // Writes to two registers.
+      const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other};
+
+      ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops));
+      return;
+    }
+    case Intrinsic::arm_ldaexd:
+    case Intrinsic::arm_ldrexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue MemAddr = N->getOperand(2);
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps();
+
+      bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
+      unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
+                                : (IsAcquire ? ARM::LDAEXD : ARM::LDREXD);
+
+      // arm_ldrexd returns a i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      if (isThumb) {
+        ResTys.push_back(MVT::i32);
+        ResTys.push_back(MVT::i32);
+      } else
+        ResTys.push_back(MVT::Untyped);
+      ResTys.push_back(MVT::Other);
+
+      // Place arguments in the right order.
+      SDValue Ops[] = {MemAddr, getAL(CurDAG, dl),
+                       CurDAG->getRegister(0, MVT::i32), Chain};
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Remap uses.
+      SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 0);
+        else {
+          SDValue SubRegIdx =
+            CurDAG->getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 1);
+        else {
+          SDValue SubRegIdx =
+            CurDAG->getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+      ReplaceUses(SDValue(N, 2), OutChain);
+      CurDAG->RemoveDeadNode(N);
+      return;
+    }
+    case Intrinsic::arm_stlexd:
+    case Intrinsic::arm_strexd: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Store exclusive double return a i32 value which is the return status
+      // of the issued store.
+      const EVT ResTys[] = {MVT::i32, MVT::Other};
+
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      if (isThumb) {
+        Ops.push_back(Val0);
+        Ops.push_back(Val1);
+      } else
+        // arm_strexd uses GPRPair.
+        Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0));
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG, dl));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      bool IsRelease = IntNo == Intrinsic::arm_stlexd;
+      unsigned NewOpc = isThumb ? (IsRelease ? ARM::t2STLEXD : ARM::t2STREXD)
+                                : (IsRelease ? ARM::STLEXD : ARM::STREXD);
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      ReplaceNode(N, St);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld1: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
+                                           ARM::VLD1d32, ARM::VLD1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+                                           ARM::VLD1q32, ARM::VLD1q64};
+      SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld2: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+                                           ARM::VLD2d32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
+                                           ARM::VLD2q32Pseudo };
+      SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld3: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo,
+                                           ARM::VLD3d16Pseudo,
+                                           ARM::VLD3d32Pseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                            ARM::VLD3q16Pseudo_UPD,
+                                            ARM::VLD3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+                                            ARM::VLD3q16oddPseudo,
+                                            ARM::VLD3q32oddPseudo };
+      SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld4: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo,
+                                           ARM::VLD4d16Pseudo,
+                                           ARM::VLD4d32Pseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                            ARM::VLD4q16Pseudo_UPD,
+                                            ARM::VLD4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+                                            ARM::VLD4q16oddPseudo,
+                                            ARM::VLD4q32oddPseudo };
+      SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
+                                           ARM::VLD2LNd16Pseudo,
+                                           ARM::VLD2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
+                                           ARM::VLD2LNq32Pseudo };
+      SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo,
+                                           ARM::VLD3LNd16Pseudo,
+                                           ARM::VLD3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
+                                           ARM::VLD3LNq32Pseudo };
+      SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo,
+                                           ARM::VLD4LNd16Pseudo,
+                                           ARM::VLD4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
+                                           ARM::VLD4LNq32Pseudo };
+      SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst1: {
+      static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
+                                           ARM::VST1d32, ARM::VST1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+                                           ARM::VST1q32, ARM::VST1q64 };
+      SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst2: {
+      static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                                           ARM::VST2d32, ARM::VST1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                           ARM::VST2q32Pseudo };
+      SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst3: {
+      static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+                                           ARM::VST3d16Pseudo,
+                                           ARM::VST3d32Pseudo,
+                                           ARM::VST1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                            ARM::VST3q16Pseudo_UPD,
+                                            ARM::VST3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                                            ARM::VST3q16oddPseudo,
+                                            ARM::VST3q32oddPseudo };
+      SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst4: {
+      static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+                                           ARM::VST4d16Pseudo,
+                                           ARM::VST4d32Pseudo,
+                                           ARM::VST1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                            ARM::VST4q16Pseudo_UPD,
+                                            ARM::VST4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                                            ARM::VST4q16oddPseudo,
+                                            ARM::VST4q32oddPseudo };
+      SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+                                           ARM::VST2LNd16Pseudo,
+                                           ARM::VST2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+                                           ARM::VST2LNq32Pseudo };
+      SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+                                           ARM::VST3LNd16Pseudo,
+                                           ARM::VST3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+                                           ARM::VST3LNq32Pseudo };
+      SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+                                           ARM::VST4LNd16Pseudo,
+                                           ARM::VST4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+                                           ARM::VST4LNq32Pseudo };
+      SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+      return;
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_neon_vtbl2:
+      SelectVTBL(N, false, 2, ARM::VTBL2);
+      return;
+    case Intrinsic::arm_neon_vtbl3:
+      SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+      return;
+    case Intrinsic::arm_neon_vtbl4:
+      SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+      return;
+
+    case Intrinsic::arm_neon_vtbx2:
+      SelectVTBL(N, true, 2, ARM::VTBX2);
+      return;
+    case Intrinsic::arm_neon_vtbx3:
+      SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+      return;
+    case Intrinsic::arm_neon_vtbx4:
+      SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+      return;
+    }
+    break;
+  }
+
+  case ARMISD::VTBL1: {
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+    SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+                     getAL(CurDAG, dl),                 // Predicate
+                     CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
+    return;
+  }
+  case ARMISD::VTBL2: {
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+
+    // Form a REG_SEQUENCE to force register allocation.
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+
+    SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
+                     CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
+    return;
+  }
+
+  case ISD::CONCAT_VECTORS:
+    SelectConcatVector(N);
+    return;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    SelectCMP_SWAP(N);
+    return;
+  }
+
+  SelectCode(N);
+}
+
+// Inspect a register string of the form
+// cp<coprocessor>:<opc1>:c<CRn>:c<CRm>:<opc2> (32bit) or
+// cp<coprocessor>:<opc1>:c<CRm> (64bit) inspect the fields of the string
+// and obtain the integer operands from them, adding these operands to the
+// provided vector.
+static void getIntOperandsFromRegisterString(StringRef RegString,
+                                             SelectionDAG *CurDAG,
+                                             const SDLoc &DL,
+                                             std::vector<SDValue> &Ops) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ':');
+
+  if (Fields.size() > 1) {
+    bool AllIntFields = true;
+
+    for (StringRef Field : Fields) {
+      // Need to trim out leading 'cp' characters and get the integer field.
+      unsigned IntField;
+      AllIntFields &= !Field.trim("CPcp").getAsInteger(10, IntField);
+      Ops.push_back(CurDAG->getTargetConstant(IntField, DL, MVT::i32));
+    }
+
+    assert(AllIntFields &&
+            "Unexpected non-integer value in special register string.");
+  }
+}
+
+// Maps a Banked Register string to its mask value. The mask value returned is
+// for use in the MRSbanked / MSRbanked instruction nodes as the Banked Register
+// mask operand, which expresses which register is to be used, e.g. r8, and in
+// which mode it is to be used, e.g. usr. Returns -1 to signify that the string
+// was invalid.
+static inline int getBankedRegisterMask(StringRef RegString) {
+  return StringSwitch<int>(RegString.lower())
+          .Case("r8_usr", 0x00)
+          .Case("r9_usr", 0x01)
+          .Case("r10_usr", 0x02)
+          .Case("r11_usr", 0x03)
+          .Case("r12_usr", 0x04)
+          .Case("sp_usr", 0x05)
+          .Case("lr_usr", 0x06)
+          .Case("r8_fiq", 0x08)
+          .Case("r9_fiq", 0x09)
+          .Case("r10_fiq", 0x0a)
+          .Case("r11_fiq", 0x0b)
+          .Case("r12_fiq", 0x0c)
+          .Case("sp_fiq", 0x0d)
+          .Case("lr_fiq", 0x0e)
+          .Case("lr_irq", 0x10)
+          .Case("sp_irq", 0x11)
+          .Case("lr_svc", 0x12)
+          .Case("sp_svc", 0x13)
+          .Case("lr_abt", 0x14)
+          .Case("sp_abt", 0x15)
+          .Case("lr_und", 0x16)
+          .Case("sp_und", 0x17)
+          .Case("lr_mon", 0x1c)
+          .Case("sp_mon", 0x1d)
+          .Case("elr_hyp", 0x1e)
+          .Case("sp_hyp", 0x1f)
+          .Case("spsr_fiq", 0x2e)
+          .Case("spsr_irq", 0x30)
+          .Case("spsr_svc", 0x32)
+          .Case("spsr_abt", 0x34)
+          .Case("spsr_und", 0x36)
+          .Case("spsr_mon", 0x3c)
+          .Case("spsr_hyp", 0x3e)
+          .Default(-1);
+}
+
+// Maps a MClass special register string to its value for use in the
+// t2MRS_M / t2MSR_M instruction nodes as the SYSm value operand.
+// Returns -1 to signify that the string was invalid.
+static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
+  return StringSwitch<int>(RegString.lower())
+          .Case("apsr", 0x0)
+          .Case("iapsr", 0x1)
+          .Case("eapsr", 0x2)
+          .Case("xpsr", 0x3)
+          .Case("ipsr", 0x5)
+          .Case("epsr", 0x6)
+          .Case("iepsr", 0x7)
+          .Case("msp", 0x8)
+          .Case("psp", 0x9)
+          .Case("primask", 0x10)
+          .Case("basepri", 0x11)
+          .Case("basepri_max", 0x12)
+          .Case("faultmask", 0x13)
+          .Case("control", 0x14)
+          .Case("msplim", 0x0a)
+          .Case("psplim", 0x0b)
+          .Case("sp", 0x18)
+          .Default(-1);
+}
+
+// The flags here are common to those allowed for apsr in the A class cores and
+// those allowed for the special registers in the M class cores. Returns a
+// value representing which flags were present, -1 if invalid.
+static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
+  if (Flags.empty())
+    return 0x2 | (int)hasDSP;
+
+  return StringSwitch<int>(Flags)
+          .Case("g", 0x1)
+          .Case("nzcvq", 0x2)
+          .Case("nzcvqg", 0x3)
+          .Default(-1);
+}
+
+static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
+                                 const ARMSubtarget *Subtarget) {
+  // Ensure that the register (without flags) was a valid M Class special
+  // register.
+  int SYSmvalue = getMClassRegisterSYSmValueMask(Reg);
+  if (SYSmvalue == -1)
+    return -1;
+
+  // basepri, basepri_max and faultmask are only valid for V7m.
+  if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13)
+    return -1;
+
+  if (Subtarget->has8MSecExt() && Flags.lower() == "ns") {
+    Flags = "";
+    SYSmvalue |= 0x80;
+  }
+
+  if (!Subtarget->has8MSecExt() &&
+      (SYSmvalue == 0xa || SYSmvalue == 0xb || SYSmvalue > 0x14))
+    return -1;
+
+  if (!Subtarget->hasV8MMainlineOps() &&
+      (SYSmvalue == 0x8a || SYSmvalue == 0x8b || SYSmvalue == 0x91 ||
+       SYSmvalue == 0x93))
+    return -1;
+
+  // If it was a read then we won't be expecting flags and so at this point
+  // we can return the mask.
+  if (IsRead) {
+    if (Flags.empty())
+      return SYSmvalue;
+    else
+      return -1;
+  }
+
+  // We know we are now handling a write so need to get the mask for the flags.
+  int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP());
+
+  // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
+  // shouldn't have flags present.
+  if ((SYSmvalue < 0x4 && Mask == -1) || (SYSmvalue > 0x4 && !Flags.empty()))
+    return -1;
+
+  // The _g and _nzcvqg versions are only valid if the DSP extension is
+  // available.
+  if (!Subtarget->hasDSP() && (Mask & 0x1))
+    return -1;
+
+  // The register was valid so need to put the mask in the correct place
+  // (the flags need to be in bits 11-10) and combine with the SYSmvalue to
+  // construct the operand for the instruction node.
+  if (SYSmvalue < 0x4)
+    return SYSmvalue | Mask << 10;
+
+  return SYSmvalue;
+}
+
+static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
+  // The mask operand contains the special register (R Bit) in bit 4, whether
+  // the register is spsr (R bit is 1) or one of cpsr/apsr (R bit is 0), and
+  // bits 3-0 contains the fields to be accessed in the special register, set by
+  // the flags provided with the register.
+  int Mask = 0;
+  if (Reg == "apsr") {
+    // The flags permitted for apsr are the same flags that are allowed in
+    // M class registers. We get the flag value and then shift the flags into
+    // the correct place to combine with the mask.
+    Mask = getMClassFlagsMask(Flags, true);
+    if (Mask == -1)
+      return -1;
+    return Mask << 2;
+  }
+
+  if (Reg != "cpsr" && Reg != "spsr") {
+    return -1;
+  }
+
+  // This is the same as if the flags were "fc"
+  if (Flags.empty() || Flags == "all")
+    return Mask | 0x9;
+
+  // Inspect the supplied flags string and set the bits in the mask for
+  // the relevant and valid flags allowed for cpsr and spsr.
+  for (char Flag : Flags) {
+    int FlagVal;
+    switch (Flag) {
+      case 'c':
+        FlagVal = 0x1;
+        break;
+      case 'x':
+        FlagVal = 0x2;
+        break;
+      case 's':
+        FlagVal = 0x4;
+        break;
+      case 'f':
+        FlagVal = 0x8;
+        break;
+      default:
+        FlagVal = 0;
+    }
+
+    // This avoids allowing strings where the same flag bit appears twice.
+    if (!FlagVal || (Mask & FlagVal))
+      return -1;
+    Mask |= FlagVal;
+  }
+
+  // If the register is spsr then we need to set the R bit.
+  if (Reg == "spsr")
+    Mask |= 0x10;
+
+  return Mask;
+}
+
+// Lower the read_register intrinsic to ARM specific DAG nodes
+// using the supplied metadata string to select the instruction node to use
+// and the registers/masks to construct as operands for the node.
+bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  bool IsThumb2 = Subtarget->isThumb2();
+  SDLoc DL(N);
+
+  std::vector<SDValue> Ops;
+  getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+  if (!Ops.empty()) {
+    // If the special register string was constructed of fields (as defined
+    // in the ACLE) then need to lower to MRC node (32 bit) or
+    // MRRC node(64 bit), we can make the distinction based on the number of
+    // operands we have.
+    unsigned Opcode;
+    SmallVector<EVT, 3> ResTypes;
+    if (Ops.size() == 5){
+      Opcode = IsThumb2 ? ARM::t2MRC : ARM::MRC;
+      ResTypes.append({ MVT::i32, MVT::Other });
+    } else {
+      assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+      Opcode = IsThumb2 ? ARM::t2MRRC : ARM::MRRC;
+      ResTypes.append({ MVT::i32, MVT::i32, MVT::Other });
+    }
+
+    Ops.push_back(getAL(CurDAG, DL));
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+    Ops.push_back(N->getOperand(0));
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops));
+    return true;
+  }
+
+  std::string SpecialReg = RegString->getString().lower();
+
+  int BankedReg = getBankedRegisterMask(SpecialReg);
+  if (BankedReg != -1) {
+    Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+                                  DL, MVT::i32, MVT::Other, Ops));
+    return true;
+  }
+
+  // The VFP registers are read by creating SelectionDAG nodes with opcodes
+  // corresponding to the register that is being read from. So we switch on the
+  // string to find which opcode we need to use.
+  unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                    .Case("fpscr", ARM::VMRS)
+                    .Case("fpexc", ARM::VMRS_FPEXC)
+                    .Case("fpsid", ARM::VMRS_FPSID)
+                    .Case("mvfr0", ARM::VMRS_MVFR0)
+                    .Case("mvfr1", ARM::VMRS_MVFR1)
+                    .Case("mvfr2", ARM::VMRS_MVFR2)
+                    .Case("fpinst", ARM::VMRS_FPINST)
+                    .Case("fpinst2", ARM::VMRS_FPINST2)
+                    .Default(0);
+
+  // If an opcode was found then we can lower the read to a VFP instruction.
+  if (Opcode) {
+    if (!Subtarget->hasVFP2())
+      return false;
+    if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
+      return false;
+
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(N,
+                CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops));
+    return true;
+  }
+
+  // If the target is M Class then need to validate that the register string
+  // is an acceptable value, so check that a mask can be constructed from the
+  // string.
+  if (Subtarget->isMClass()) {
+    StringRef Flags = "", Reg = SpecialReg;
+    if (Reg.endswith("_ns")) {
+      Flags = "ns";
+      Reg = Reg.drop_back(3);
+    }
+
+    int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget);
+    if (SYSmValue == -1)
+      return false;
+
+    SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+                      getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+                      N->getOperand(0) };
+    ReplaceNode(
+        N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops));
+    return true;
+  }
+
+  // Here we know the target is not M Class so we need to check if it is one
+  // of the remaining possible values which are apsr, cpsr or spsr.
+  if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS,
+                                          DL, MVT::i32, MVT::Other, Ops));
+    return true;
+  }
+
+  if (SpecialReg == "spsr") {
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL,
+                                  MVT::i32, MVT::Other, Ops));
+    return true;
+  }
+
+  return false;
+}
+
+// Lower the write_register intrinsic to ARM specific DAG nodes
+// using the supplied metadata string to select the instruction node to use
+// and the registers/masks to use in the nodes
+bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  bool IsThumb2 = Subtarget->isThumb2();
+  SDLoc DL(N);
+
+  std::vector<SDValue> Ops;
+  getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+  if (!Ops.empty()) {
+    // If the special register string was constructed of fields (as defined
+    // in the ACLE) then need to lower to MCR node (32 bit) or
+    // MCRR node(64 bit), we can make the distinction based on the number of
+    // operands we have.
+    unsigned Opcode;
+    if (Ops.size() == 5) {
+      Opcode = IsThumb2 ? ARM::t2MCR : ARM::MCR;
+      Ops.insert(Ops.begin()+2, N->getOperand(2));
+    } else {
+      assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+      Opcode = IsThumb2 ? ARM::t2MCRR : ARM::MCRR;
+      SDValue WriteValue[] = { N->getOperand(2), N->getOperand(3) };
+      Ops.insert(Ops.begin()+2, WriteValue, WriteValue+2);
+    }
+
+    Ops.push_back(getAL(CurDAG, DL));
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+    Ops.push_back(N->getOperand(0));
+
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+    return true;
+  }
+
+  std::string SpecialReg = RegString->getString().lower();
+  int BankedReg = getBankedRegisterMask(SpecialReg);
+  if (BankedReg != -1) {
+    Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+                                  DL, MVT::Other, Ops));
+    return true;
+  }
+
+  // The VFP registers are written to by creating SelectionDAG nodes with
+  // opcodes corresponding to the register that is being written. So we switch
+  // on the string to find which opcode we need to use.
+  unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                    .Case("fpscr", ARM::VMSR)
+                    .Case("fpexc", ARM::VMSR_FPEXC)
+                    .Case("fpsid", ARM::VMSR_FPSID)
+                    .Case("fpinst", ARM::VMSR_FPINST)
+                    .Case("fpinst2", ARM::VMSR_FPINST2)
+                    .Default(0);
+
+  if (Opcode) {
+    if (!Subtarget->hasVFP2())
+      return false;
+    Ops = { N->getOperand(2), getAL(CurDAG, DL),
+            CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+    return true;
+  }
+
+  std::pair<StringRef, StringRef> Fields;
+  Fields = StringRef(SpecialReg).rsplit('_');
+  std::string Reg = Fields.first.str();
+  StringRef Flags = Fields.second;
+
+  // If the target was M Class then need to validate the special register value
+  // and retrieve the mask for use in the instruction node.
+  if (Subtarget->isMClass()) {
+    // basepri_max gets split so need to correct Reg and Flags.
+    if (SpecialReg == "basepri_max") {
+      Reg = SpecialReg;
+      Flags = "";
+    }
+    int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
+    if (SYSmValue == -1)
+      return false;
+
+    SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+                      N->getOperand(2), getAL(CurDAG, DL),
+                      CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops));
+    return true;
+  }
+
+  // We then check to see if a valid mask can be constructed for one of the
+  // register string values permitted for the A and R class cores. These values
+  // are apsr, spsr and cpsr; these are also valid on older cores.
+  int Mask = getARClassRegisterMask(Reg, Flags);
+  if (Mask != -1) {
+    Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+                                          DL, MVT::Other, Ops));
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint.
+  // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require
+  // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
+  // respectively. Since there is no constraint to explicitly specify a
+  // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+  // the 64-bit data may be referred by H, Q, R modifiers, so we still pack
+  // them into a GPRPair.
+
+  SDLoc dl(N);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+                                   : SDValue(nullptr,0);
+
+  SmallVector<bool, 8> OpChanged;
+  // Glue node will be appended late.
+  for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    }
+    else
+      continue;
+
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
+    // Memory operands to inline asm in the SelectionDAG are modeled with two
+    // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+    // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+    // it doesn't get misinterpreted), and continue. We do this here because
+    // it's important to update the OpChanged array correctly before moving on.
+    if (Kind == InlineAsm::Kind_Mem) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+        && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+      continue;
+
+    unsigned RC;
+    bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+    if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID))
+        || NumRegs != 2)
+      continue;
+
+    assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+    SDValue V0 = N->getOperand(i+1);
+    SDValue V1 = N->getOperand(i+2);
+    unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+    unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+    SDValue PairedReg;
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    if (Kind == InlineAsm::Kind_RegDef ||
+        Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+      // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+      // the original GPRs.
+
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      SDValue Chain = SDValue(N,0);
+
+      SDNode *GU = N->getGluedUser();
+      SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped,
+                                               Chain.getValue(1));
+
+      // Extract values from a GPRPair reg and copy to the original GPR reg.
+      SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+                                        RegCopy.getValue(1));
+      SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+      // Update the original glue user.
+      std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+      Ops.push_back(T1.getValue(1));
+      CurDAG->UpdateNodeOperands(GU, Ops);
+    }
+    else {
+      // For Kind  == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+      // GPRPair and then pass the GPRPair to the inline asm.
+      SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+      // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+      SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+                                          Chain.getValue(1));
+      SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+                                          T0.getValue(1));
+      SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0);
+
+      // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+      // i32 VRs of inline asm with it.
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+      AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+      Glue = Chain.getValue(1);
+    }
+
+    Changed = true;
+
+    if(PairedReg.getNode()) {
+      OpChanged[OpChanged.size() -1 ] = true;
+      Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+      if (IsTiedToChangedOp)
+        Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+      else
+        Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+      // Replace the current flag.
+      AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+          Flag, dl, MVT::i32);
+      // Add the new register node and skip the original two GPRs.
+      AsmNodeOperands.push_back(PairedReg);
+      // Skip the next two GPRs.
+      i += 2;
+    }
+  }
+
+  if (Glue.getNode())
+    AsmNodeOperands.push_back(Glue);
+  if (!Changed)
+    return false;
+
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+      CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+  New->setNodeId(-1);
+  ReplaceNode(N, New.getNode());
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+    // FIXME: It seems strange that 'i' is needed here since it's supposed to
+    //        be an immediate and not a memory constraint.
+    LLVM_FALLTHROUGH;
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_o:
+  case InlineAsm::Constraint_Q:
+  case InlineAsm::Constraint_Um:
+  case InlineAsm::Constraint_Un:
+  case InlineAsm::Constraint_Uq:
+  case InlineAsm::Constraint_Us:
+  case InlineAsm::Constraint_Ut:
+  case InlineAsm::Constraint_Uv:
+  case InlineAsm::Constraint_Uy:
+    // Require the address to be in a register.  That is safe for all ARM
+    // variants and it is hard to do anything much smarter without knowing
+    // how the operand is used.
+    OutOps.push_back(Op);
+    return false;
+  }
+  return true;
+}
+
+/// createARMISelDag - This pass converts a legalized DAG into a
+/// ARM-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new ARMDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 000000000000..afba1587a743
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,13504 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMISelLowering.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
+STATISTIC(NumConstpoolPromoted,
+  "Number of constants with their storage promoted into constant pools");
+
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
+
+static cl::opt<bool> EnableConstpoolPromotion(
+    "arm-promote-constant", cl::Hidden,
+    cl::desc("Enable / disable promotion of unnamed_addr constants into "
+             "constant pools"),
+    cl::init(true));
+static cl::opt<unsigned> ConstpoolPromotionMaxSize(
+    "arm-promote-constant-max-size", cl::Hidden,
+    cl::desc("Maximum size of constant to promote into a constant pool"),
+    cl::init(64));
+static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
+    "arm-promote-constant-max-total", cl::Hidden,
+    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
+    cl::init(128));
+
+namespace {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+               ParmContext PC)
+        : CCState(CC, isVarArg, MF, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call"
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+void ARMTargetLowering::InitLibcallCallingConvs() {
+  // The builtins on ARM always use AAPCS, irrespective of wheter C is AAPCS or
+  // AAPCS_VFP.
+  for (const auto LC : {
+           RTLIB::SHL_I16,
+           RTLIB::SHL_I32,
+           RTLIB::SHL_I64,
+           RTLIB::SHL_I128,
+           RTLIB::SRL_I16,
+           RTLIB::SRL_I32,
+           RTLIB::SRL_I64,
+           RTLIB::SRL_I128,
+           RTLIB::SRA_I16,
+           RTLIB::SRA_I32,
+           RTLIB::SRA_I64,
+           RTLIB::SRA_I128,
+           RTLIB::MUL_I8,
+           RTLIB::MUL_I16,
+           RTLIB::MUL_I32,
+           RTLIB::MUL_I64,
+           RTLIB::MUL_I128,
+           RTLIB::MULO_I32,
+           RTLIB::MULO_I64,
+           RTLIB::MULO_I128,
+           RTLIB::SDIV_I8,
+           RTLIB::SDIV_I16,
+           RTLIB::SDIV_I32,
+           RTLIB::SDIV_I64,
+           RTLIB::SDIV_I128,
+           RTLIB::UDIV_I8,
+           RTLIB::UDIV_I16,
+           RTLIB::UDIV_I32,
+           RTLIB::UDIV_I64,
+           RTLIB::UDIV_I128,
+           RTLIB::SREM_I8,
+           RTLIB::SREM_I16,
+           RTLIB::SREM_I32,
+           RTLIB::SREM_I64,
+           RTLIB::SREM_I128,
+           RTLIB::UREM_I8,
+           RTLIB::UREM_I16,
+           RTLIB::UREM_I32,
+           RTLIB::UREM_I64,
+           RTLIB::UREM_I128,
+           RTLIB::SDIVREM_I8,
+           RTLIB::SDIVREM_I16,
+           RTLIB::SDIVREM_I32,
+           RTLIB::SDIVREM_I64,
+           RTLIB::SDIVREM_I128,
+           RTLIB::UDIVREM_I8,
+           RTLIB::UDIVREM_I16,
+           RTLIB::UDIVREM_I32,
+           RTLIB::UDIVREM_I64,
+           RTLIB::UDIVREM_I128,
+           RTLIB::NEG_I32,
+           RTLIB::NEG_I64,
+           RTLIB::ADD_F32,
+           RTLIB::ADD_F64,
+           RTLIB::ADD_F80,
+           RTLIB::ADD_F128,
+           RTLIB::SUB_F32,
+           RTLIB::SUB_F64,
+           RTLIB::SUB_F80,
+           RTLIB::SUB_F128,
+           RTLIB::MUL_F32,
+           RTLIB::MUL_F64,
+           RTLIB::MUL_F80,
+           RTLIB::MUL_F128,
+           RTLIB::DIV_F32,
+           RTLIB::DIV_F64,
+           RTLIB::DIV_F80,
+           RTLIB::DIV_F128,
+           RTLIB::POWI_F32,
+           RTLIB::POWI_F64,
+           RTLIB::POWI_F80,
+           RTLIB::POWI_F128,
+           RTLIB::FPEXT_F64_F128,
+           RTLIB::FPEXT_F32_F128,
+           RTLIB::FPEXT_F32_F64,
+           RTLIB::FPEXT_F16_F32,
+           RTLIB::FPROUND_F32_F16,
+           RTLIB::FPROUND_F64_F16,
+           RTLIB::FPROUND_F80_F16,
+           RTLIB::FPROUND_F128_F16,
+           RTLIB::FPROUND_F64_F32,
+           RTLIB::FPROUND_F80_F32,
+           RTLIB::FPROUND_F128_F32,
+           RTLIB::FPROUND_F80_F64,
+           RTLIB::FPROUND_F128_F64,
+           RTLIB::FPTOSINT_F32_I32,
+           RTLIB::FPTOSINT_F32_I64,
+           RTLIB::FPTOSINT_F32_I128,
+           RTLIB::FPTOSINT_F64_I32,
+           RTLIB::FPTOSINT_F64_I64,
+           RTLIB::FPTOSINT_F64_I128,
+           RTLIB::FPTOSINT_F80_I32,
+           RTLIB::FPTOSINT_F80_I64,
+           RTLIB::FPTOSINT_F80_I128,
+           RTLIB::FPTOSINT_F128_I32,
+           RTLIB::FPTOSINT_F128_I64,
+           RTLIB::FPTOSINT_F128_I128,
+           RTLIB::FPTOUINT_F32_I32,
+           RTLIB::FPTOUINT_F32_I64,
+           RTLIB::FPTOUINT_F32_I128,
+           RTLIB::FPTOUINT_F64_I32,
+           RTLIB::FPTOUINT_F64_I64,
+           RTLIB::FPTOUINT_F64_I128,
+           RTLIB::FPTOUINT_F80_I32,
+           RTLIB::FPTOUINT_F80_I64,
+           RTLIB::FPTOUINT_F80_I128,
+           RTLIB::FPTOUINT_F128_I32,
+           RTLIB::FPTOUINT_F128_I64,
+           RTLIB::FPTOUINT_F128_I128,
+           RTLIB::SINTTOFP_I32_F32,
+           RTLIB::SINTTOFP_I32_F64,
+           RTLIB::SINTTOFP_I32_F80,
+           RTLIB::SINTTOFP_I32_F128,
+           RTLIB::SINTTOFP_I64_F32,
+           RTLIB::SINTTOFP_I64_F64,
+           RTLIB::SINTTOFP_I64_F80,
+           RTLIB::SINTTOFP_I64_F128,
+           RTLIB::SINTTOFP_I128_F32,
+           RTLIB::SINTTOFP_I128_F64,
+           RTLIB::SINTTOFP_I128_F80,
+           RTLIB::SINTTOFP_I128_F128,
+           RTLIB::UINTTOFP_I32_F32,
+           RTLIB::UINTTOFP_I32_F64,
+           RTLIB::UINTTOFP_I32_F80,
+           RTLIB::UINTTOFP_I32_F128,
+           RTLIB::UINTTOFP_I64_F32,
+           RTLIB::UINTTOFP_I64_F64,
+           RTLIB::UINTTOFP_I64_F80,
+           RTLIB::UINTTOFP_I64_F128,
+           RTLIB::UINTTOFP_I128_F32,
+           RTLIB::UINTTOFP_I128_F64,
+           RTLIB::UINTTOFP_I128_F80,
+           RTLIB::UINTTOFP_I128_F128,
+           RTLIB::OEQ_F32,
+           RTLIB::OEQ_F64,
+           RTLIB::OEQ_F128,
+           RTLIB::UNE_F32,
+           RTLIB::UNE_F64,
+           RTLIB::UNE_F128,
+           RTLIB::OGE_F32,
+           RTLIB::OGE_F64,
+           RTLIB::OGE_F128,
+           RTLIB::OLT_F32,
+           RTLIB::OLT_F64,
+           RTLIB::OLT_F128,
+           RTLIB::OLE_F32,
+           RTLIB::OLE_F64,
+           RTLIB::OLE_F128,
+           RTLIB::OGT_F32,
+           RTLIB::OGT_F64,
+           RTLIB::OGT_F128,
+           RTLIB::UO_F32,
+           RTLIB::UO_F64,
+           RTLIB::UO_F128,
+           RTLIB::O_F32,
+           RTLIB::O_F64,
+           RTLIB::O_F128,
+       })
+  setLibcallCallingConv(LC, CallingConv::ARM_AAPCS);
+}
+
+// The APCS parameter registers.
+static const MCPhysReg GPRArgRegs[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3
+};
+
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+  }
+
+  MVT ElemTy = VT.getVectorElementType();
+  if (ElemTy != MVT::f64)
+    setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  if (ElemTy == MVT::i32) {
+    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+  } else {
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+  }
+  setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+  setOperationAction(ISD::SELECT,            VT, Expand);
+  setOperationAction(ISD::SELECT_CC,         VT, Expand);
+  setOperationAction(ISD::VSELECT,           VT, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+  if (VT.isInteger()) {
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+  }
+
+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR,  VT, Promote);
+    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+
+  // Neon does not support vector divide/remainder operations.
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::FDIV, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
+
+  if (!VT.isFloatingPoint() &&
+      VT != MVT::v2i64 && VT != MVT::v1i64)
+    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+      setOperationAction(Opcode, VT, Legal);
+}
+
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPRRegClass);
+  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+}
+
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPairRegClass);
+  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+}
+
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
+                                     const ARMSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  RegInfo = Subtarget->getRegisterInfo();
+  Itins = Subtarget->getInstrItineraryData();
+
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  InitLibcallCallingConvs();
+
+  if (Subtarget->isTargetMachO()) {
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
+      static const struct {
+        const RTLIB::Libcall Op;
+        const char * const Name;
+        const ISD::CondCode Cond;
+      } LibraryCalls[] = {
+        // Single-precision floating-point arithmetic.
+        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
+
+        // Double-precision floating-point arithmetic.
+        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
+
+        // Single-precision comparisons.
+        { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
+        { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
+        { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
+        { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
+        { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
+        { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
+        { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
+        { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
+
+        // Double-precision comparisons.
+        { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
+        { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
+        { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
+        { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
+        { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
+        { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
+        { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
+        { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
+
+        // Floating-point to integer conversions.
+        // i64 conversions are done via library routines even when generating VFP
+        // instructions, so use the same ones.
+        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
+        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
+        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
+        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
+
+        // Conversions between floating types.
+        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
+        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
+
+        // Integer to floating-point conversions.
+        // i64 conversions are done via library routines even when generating VFP
+        // instructions, so use the same ones.
+        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+        // e.g., __floatunsidf vs. __floatunssidfvfp.
+        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
+        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
+        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
+        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
+      };
+
+      for (const auto &LC : LibraryCalls) {
+        setLibcallName(LC.Op, LC.Name);
+        if (LC.Cond != ISD::SETCC_INVALID)
+          setCmpLibcallCC(LC.Op, LC.Cond);
+      }
+    }
+
+    // Set the correct calling convention for ARMv7k WatchOS. It's just
+    // AAPCS_VFP for functions as simple as libcalls.
+    if (Subtarget->isTargetWatchABI()) {
+      for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
+        setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
+    }
+  }
+
+  // These libcalls are not available in 32-bit.
+  setLibcallName(RTLIB::SHL_I128, nullptr);
+  setLibcallName(RTLIB::SRL_I128, nullptr);
+  setLibcallName(RTLIB::SRA_I128, nullptr);
+
+  // RTLIB
+  if (Subtarget->isAAPCS_ABI() &&
+      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+      const ISD::CondCode Cond;
+    } LibraryCalls[] = {
+      // Double-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 2
+      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Double-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 3
+      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Single-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 4
+      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Single-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 5
+      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Floating-point to integer conversions.
+      // RTABI chapter 4.1.2, Table 6
+      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Conversions between floating types.
+      // RTABI chapter 4.1.2, Table 7
+      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer to floating-point conversions.
+      // RTABI chapter 4.1.2, Table 8
+      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Long long helper functions
+      // RTABI chapter 4.2, Table 9
+      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer division functions
+      // RTABI chapter 4.3.1
+      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+      if (LC.Cond != ISD::SETCC_INVALID)
+        setCmpLibcallCC(LC.Op, LC.Cond);
+    }
+
+    // EABI dependent RTLIB
+    if (TM.Options.EABIVersion == EABI::EABI4 ||
+        TM.Options.EABIVersion == EABI::EABI5) {
+      static const struct {
+        const RTLIB::Libcall Op;
+        const char *const Name;
+        const CallingConv::ID CC;
+        const ISD::CondCode Cond;
+      } MemOpsLibraryCalls[] = {
+        // Memory operations
+        // RTABI chapter 4.3.4
+        { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+        { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      };
+
+      for (const auto &LC : MemOpsLibraryCalls) {
+        setLibcallName(LC.Op, LC.Name);
+        setLibcallCallingConv(LC.Op, LC.CC);
+        if (LC.Cond != ISD::SETCC_INVALID)
+          setCmpLibcallCC(LC.Op, LC.Cond);
+      }
+    }
+  }
+
+  if (Subtarget->isTargetWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+    } LibraryCalls[] = {
+      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+    }
+  }
+
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (Subtarget->isTargetWatchOS() ||
+      (Subtarget->isTargetIOS() &&
+       !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
+
+  // The half <-> float conversion functions are always soft-float on
+  // non-watchos platforms, but are needed for some targets which use a
+  // hard-float calling convention by default.
+  if (!Subtarget->isTargetWatchABI()) {
+    if (Subtarget->isAAPCS_ABI()) {
+      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+    } else {
+      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+    }
+  }
+
+  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
+  // a __gnu_ prefix (which is the default).
+  if (Subtarget->isTargetAEABI()) {
+    setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h");
+    setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h");
+    setLibcallName(RTLIB::FPEXT_F16_F32,   "__aeabi_h2f");
+  }
+
+  if (Subtarget->isThumb1Only())
+    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
+  else
+    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
+    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+  }
+
+  for (MVT VT : MVT::vector_valuetypes()) {
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
+
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    setOperationAction(ISD::BSWAP, VT, Expand);
+  }
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
+  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP support any arithmetic operations on it.
+    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+    // supported for v4f32.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    // FIXME: Code duplication: FDIV and FREM are expanded always, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+    // FIXME: Create unittest.
+    // In another words, find a way when "copysign" appears in DAG with vector
+    // operands.
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+    // FIXME: Code duplication: SETCC has custom operation action, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FNEG and for FABS.
+    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+
+    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
+
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
+    // Neon does not support some operations on v1i64 and v2i64 types.
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+    // Custom handling for some quad-vector types to detect VMULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    // Custom handling for some vector types to avoid expensive expansions
+    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
+    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
+    // a destination type that is wider than the source, and nor does
+    // it have a FP_TO_[SU]INT instruction with a narrower destination than
+    // source.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+
+    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
+
+    // NEON does not have single instruction CTPOP for vectors with element
+    // types wider than 8-bits.  However, custom lowering can leverage the
+    // v8i8/v16i8 vcnt instruction.
+    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
+    setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
+
+    setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
+    setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
+
+    // NEON does not have single instruction CTTZ for vectors.
+    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
+
+    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
+
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::FP_TO_SINT);
+    setTargetDAGCombine(ISD::FP_TO_UINT);
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::LOAD);
+
+    // It is legal to extload from v4i8 to v4i16 or v4i32.
+    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
+                   MVT::v2i32}) {
+      for (MVT VT : MVT::integer_vector_valuetypes()) {
+        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
+        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
+      }
+    }
+  }
+
+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+  if (Subtarget->isFPOnlySP()) {
+    // When targeting a floating-point unit with only single-precision
+    // operations, f64 is legal for the few double-precision instructions which
+    // are present However, no double-precision operations other than moves,
+    // loads and stores are provided by the hardware.
+    setOperationAction(ISD::FADD,       MVT::f64, Expand);
+    setOperationAction(ISD::FSUB,       MVT::f64, Expand);
+    setOperationAction(ISD::FMUL,       MVT::f64, Expand);
+    setOperationAction(ISD::FMA,        MVT::f64, Expand);
+    setOperationAction(ISD::FDIV,       MVT::f64, Expand);
+    setOperationAction(ISD::FREM,       MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
+    setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
+    setOperationAction(ISD::FNEG,       MVT::f64, Expand);
+    setOperationAction(ISD::FABS,       MVT::f64, Expand);
+    setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
+    setOperationAction(ISD::FSIN,       MVT::f64, Expand);
+    setOperationAction(ISD::FCOS,       MVT::f64, Expand);
+    setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
+    setOperationAction(ISD::FPOW,       MVT::f64, Expand);
+    setOperationAction(ISD::FLOG,       MVT::f64, Expand);
+    setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
+    setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
+    setOperationAction(ISD::FEXP,       MVT::f64, Expand);
+    setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
+    setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
+    setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
+    setOperationAction(ISD::FRINT,      MVT::f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
+    setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
+    setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
+  }
+
+  computeRegisterProperties(Subtarget->getRegisterInfo());
+
+  // ARM does not have floating-point extending loads.
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+  }
+
+  // ... or truncating stores
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+  // ARM does not have i1 sign extending load.
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+  // ARM supports all 4 flavors of integer indexed load / store.
+  if (!Subtarget->isThumb1Only()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im,  MVT::i1,  Legal);
+      setIndexedLoadAction(im,  MVT::i8,  Legal);
+      setIndexedLoadAction(im,  MVT::i16, Legal);
+      setIndexedLoadAction(im,  MVT::i32, Legal);
+      setIndexedStoreAction(im, MVT::i1,  Legal);
+      setIndexedStoreAction(im, MVT::i8,  Legal);
+      setIndexedStoreAction(im, MVT::i16, Legal);
+      setIndexedStoreAction(im, MVT::i32, Legal);
+    }
+  } else {
+    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
+    setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
+    setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
+  }
+
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
+  // i64 operation support.
+  setOperationAction(ISD::MUL,     MVT::i64, Expand);
+  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+  if (Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  }
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
+      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
+    setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL,       MVT::i64, Custom);
+  setOperationAction(ISD::SRA,       MVT::i64, Custom);
+
+  if (!Subtarget->isThumb1Only()) {
+    // FIXME: We should do this for Thumb1 as well.
+    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
+    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
+  }
+
+  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
+    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+  // ARM does not have ROTL.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+  }
+  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+  // @llvm.readcyclecounter requires the Performance Monitors extension.
+  // Default to the 0 expansion on unsupported platforms.
+  // FIXME: Technically there are older ARM CPUs that have
+  // implementation-specific ways of obtaining this information.
+  if (Subtarget->hasPerfMon())
+    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
+  // Only ARMv6 has BSWAP.
+  if (!Subtarget->hasV6Ops())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
+                                        : Subtarget->hasDivideInARMMode();
+  if (!hasDivide) {
+    // These are expanded into libcalls if the cpu doesn't have HW divider.
+    setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
+    setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
+  }
+
+  if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+    setOperationAction(ISD::SDIV, MVT::i32, Custom);
+    setOperationAction(ISD::UDIV, MVT::i32, Custom);
+
+    setOperationAction(ISD::SDIV, MVT::i64, Custom);
+    setOperationAction(ISD::UDIV, MVT::i64, Custom);
+  }
+
+  setOperationAction(ISD::SREM,  MVT::i32, Expand);
+  setOperationAction(ISD::UREM,  MVT::i32, Expand);
+  // Register based DivRem for AEABI (RTABI 4.2)
+  if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+      Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
+      Subtarget->isTargetWindows()) {
+    setOperationAction(ISD::SREM, MVT::i64, Custom);
+    setOperationAction(ISD::UREM, MVT::i64, Custom);
+    HasStandaloneRem = false;
+
+    for (const auto &LC :
+         {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32})
+      setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv"
+                                                      : "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows()
+                                           ? "__rt_sdiv64"
+                                           : "__aeabi_ldivmod");
+    for (const auto &LC :
+         {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32})
+      setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv"
+                                                      : "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows()
+                                           ? "__rt_udiv64"
+                                           : "__aeabi_uldivmod");
+
+    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+  } else {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  }
+
+  if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
+    for (auto &VT : {MVT::f32, MVT::f64})
+      setOperationAction(ISD::FPOWI, VT, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
+  // the default expansion.
+  InsertFencesForAtomic = false;
+  if (Subtarget->hasAnyDataBarrier() &&
+      (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
+    // ATOMIC_FENCE needs custom lowering; the others should have been expanded
+    // to ldrex/strex loops already.
+    setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
+    if (!Subtarget->isThumb() || !Subtarget->isMClass())
+      setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
+
+    // On v8, we have particularly efficient implementations of atomic fences
+    // if they can be combined with nearby atomic loads and stores.
+    if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
+      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
+      InsertFencesForAtomic = true;
+    }
+  } else {
+    // If there's anything we can use as a barrier, go through custom lowering
+    // for ATOMIC_FENCE.
+    // If target has DMB in thumb, Fences can be inserted.
+    if (Subtarget->hasDataBarrier())
+      InsertFencesForAtomic = true;
+
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
+                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
+    // Set them all for expansion, which will force libcalls.
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+    // Unordered/Monotonic case.
+    if (!InsertFencesForAtomic) {
+      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+    }
+  }
+
+  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
+
+  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
+  if (!Subtarget->hasV6Ops()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+  }
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
+    // iff target supports vfp2.
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+  if (Subtarget->useSjLjEH())
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
+  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // Thumb-1 cannot currently select ARMISD::SUBE.
+  if (!Subtarget->isThumb1Only())
+    setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+
+  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
+  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
+
+  // We don't support sin/cos/fmod/copysign/pow
+  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
+  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
+  setOperationAction(ISD::FREM,      MVT::f64, Expand);
+  setOperationAction(ISD::FREM,      MVT::f32, Expand);
+  if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  }
+  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
+  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
+
+  if (!Subtarget->hasVFP4()) {
+    setOperationAction(ISD::FMA, MVT::f64, Expand);
+    setOperationAction(ISD::FMA, MVT::f32, Expand);
+  }
+
+  // Various VFP goodness
+  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
+    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
+    if (!Subtarget->hasFP16()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+    }
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->isTargetWatchABI()) {
+      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
+      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
+    }
+    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
+      // For iOS, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  // FP-ARMv8 implements a lot of rounding-like FP operations.
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+    if (!Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+    }
+  }
+
+  if (Subtarget->hasNEON()) {
+    // vmin and vmax aren't available in a scalar form, so we use
+    // a NEON instruction with an undef lane instead.
+    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+
+  if (Subtarget->hasV6Ops())
+    setTargetDAGCombine(ISD::SRL);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+
+  if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
+      !Subtarget->hasVFP2())
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  //// temporary - rewrite interface to use type
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = 2;
+
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+}
+
+bool ARMTargetLowering::useSoftFloat() const {
+  return Subtarget->useSoftFloat();
+}
+
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of it's super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurrence of cross class copies and subregister insertions
+// and extractions.
+std::pair<const TargetRegisterClass *, uint8_t>
+ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  // Use DPR as representative register class for all floating point
+  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
+  // the cost is 1 for both f32 and f64.
+  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
+  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
+    RRC = &ARM::DPRRegClass;
+    // When NEON is used for SP, only half of the register file is available
+    // because operations that define both SP and DP results will be constrained
+    // to the VFP2 class (D0-D15). We currently model this constraint prior to
+    // coalescing by double-counting the SP regs. See the FIXME above.
+    if (Subtarget->useNEONForSinglePrecisionFP())
+      Cost = 2;
+    break;
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 2;
+    break;
+  case MVT::v4i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 4;
+    break;
+  case MVT::v8i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 8;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((ARMISD::NodeType)Opcode) {
+  case ARMISD::FIRST_NUMBER:  break;
+  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
+  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
+  case ARMISD::CALL:          return "ARMISD::CALL";
+  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
+  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
+  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
+  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
+  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
+  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
+  case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMN:           return "ARMISD::CMN";
+  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
+  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
+  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
+
+  case ARMISD::CMOV:          return "ARMISD::CMOV";
+
+  case ARMISD::SSAT:          return "ARMISD::SSAT";
+
+  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX:           return "ARMISD::RRX";
+
+  case ARMISD::ADDC:          return "ARMISD::ADDC";
+  case ARMISD::ADDE:          return "ARMISD::ADDE";
+  case ARMISD::SUBC:          return "ARMISD::SUBC";
+  case ARMISD::SUBE:          return "ARMISD::SUBE";
+
+  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
+
+  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+  case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
+  case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
+
+  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
+
+  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
+
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
+
+  case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
+  case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
+
+  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
+  case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
+  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
+  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
+  case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
+  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
+  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
+  case ARMISD::VTST:          return "ARMISD::VTST";
+
+  case ARMISD::VSHL:          return "ARMISD::VSHL";
+  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
+  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
+  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VSLI:          return "ARMISD::VSLI";
+  case ARMISD::VSRI:          return "ARMISD::VSRI";
+  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
+  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
+  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
+  case ARMISD::VDUP:          return "ARMISD::VDUP";
+  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
+  case ARMISD::VEXT:          return "ARMISD::VEXT";
+  case ARMISD::VREV64:        return "ARMISD::VREV64";
+  case ARMISD::VREV32:        return "ARMISD::VREV32";
+  case ARMISD::VREV16:        return "ARMISD::VREV16";
+  case ARMISD::VZIP:          return "ARMISD::VZIP";
+  case ARMISD::VUZP:          return "ARMISD::VUZP";
+  case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
+  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
+  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
+  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::UMAAL:         return "ARMISD::UMAAL";
+  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
+  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
+  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
+  case ARMISD::BFI:           return "ARMISD::BFI";
+  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
+  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
+  case ARMISD::VBSL:          return "ARMISD::VBSL";
+  case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
+  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
+  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
+  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+  }
+  return nullptr;
+}
+
+EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                          EVT VT) const {
+  if (!VT.isVector())
+    return getPointerTy(DL);
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getRegClassFor - Return the register class that should be used for the
+/// specified value type.
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+  // load / store 4 to 8 consecutive D registers.
+  if (Subtarget->hasNEON()) {
+    if (VT == MVT::v4i64)
+      return &ARM::QQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::QQQQPRRegClass;
+  }
+  return TargetLowering::getRegClassFor(VT);
+}
+
+// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects passed to memory intrinsics.
+bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+                                               unsigned &PrefAlign) const {
+  if (!isa<MemIntrinsic>(CI))
+    return false;
+  MinSize = 8;
+  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
+  // cycle faster than 4-byte aligned LDM.
+  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
+  return true;
+}
+
+// Create a fast isel object.
+FastISel *
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return ARM::createFastISel(funcInfo, libInfo);
+}
+
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
+  unsigned NumVals = N->getNumValues();
+  if (!NumVals)
+    return Sched::RegPressure;
+
+  for (unsigned i = 0; i != NumVals; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue || VT == MVT::Other)
+      continue;
+    if (VT.isFloatingPoint() || VT.isVector())
+      return Sched::ILP;
+  }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;
+
+  // Load are scheduled for latency even if there instruction itinerary
+  // is not available.
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+
+  if (MCID.getNumDefs() == 0)
+    return Sched::RegPressure;
+  if (!Itins->isEmpty() &&
+      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
+    return Sched::ILP;
+
+  return Sched::RegPressure;
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:  return ARMCC::NE;
+  case ISD::SETEQ:  return ARMCC::EQ;
+  case ISD::SETGT:  return ARMCC::GT;
+  case ISD::SETGE:  return ARMCC::GE;
+  case ISD::SETLT:  return ARMCC::LT;
+  case ISD::SETLE:  return ARMCC::LE;
+  case ISD::SETUGT: return ARMCC::HI;
+  case ISD::SETUGE: return ARMCC::HS;
+  case ISD::SETULT: return ARMCC::LO;
+  case ISD::SETULE: return ARMCC::LS;
+  }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                        ARMCC::CondCodes &CondCode2) {
+  CondCode2 = ARMCC::AL;
+  switch (CC) {
+  default: llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = ARMCC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = ARMCC::GE; break;
+  case ISD::SETOLT: CondCode = ARMCC::MI; break;
+  case ISD::SETOLE: CondCode = ARMCC::LS; break;
+  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETO:   CondCode = ARMCC::VC; break;
+  case ISD::SETUO:  CondCode = ARMCC::VS; break;
+  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUGT: CondCode = ARMCC::HI; break;
+  case ISD::SETUGE: CondCode = ARMCC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = ARMCC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = ARMCC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+/// getEffectiveCallingConv - Get the effective calling convention, taking into
+/// account presence of floating point hardware and calling convention
+/// limitations, such as support for variadic functions.
+CallingConv::ID
+ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
+                                           bool isVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+  case CallingConv::GHC:
+    return CC;
+  case CallingConv::PreserveMost:
+    return CallingConv::PreserveMost;
+  case CallingConv::ARM_AAPCS_VFP:
+  case CallingConv::Swift:
+    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
+  case CallingConv::C:
+    if (!Subtarget->isAAPCS_ABI())
+      return CallingConv::ARM_APCS;
+    else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() &&
+             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
+             !isVarArg)
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
+  case CallingConv::Fast:
+  case CallingConv::CXX_FAST_TLS:
+    if (!Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+        return CallingConv::Fast;
+      return CallingConv::ARM_APCS;
+    } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg)
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
+  }
+}
+
+CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                 bool isVarArg) const {
+  return CCAssignFnForNode(CC, false, isVarArg);
+}
+
+CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                   bool isVarArg) const {
+  return CCAssignFnForNode(CC, true, isVarArg);
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+                                                 bool Return,
+                                                 bool isVarArg) const {
+  switch (getEffectiveCallingConv(CC, isVarArg)) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+  case CallingConv::Fast:
+    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+  case CallingConv::GHC:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
+  case CallingConv::PreserveMost:
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue ARMTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    SDValue ThisVal) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      // Handle f64 or half of a v2f64.
+      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Lo.getValue(1);
+      InFlag = Lo.getValue(2);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Hi.getValue(1);
+      InFlag = Hi.getValue(2);
+      if (!Subtarget->isLittle())
+        std::swap (Lo, Hi);
+      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(0, dl, MVT::i32));
+
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Lo.getValue(1);
+        InFlag = Lo.getValue(2);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Hi.getValue(1);
+        InFlag = Hi.getValue(2);
+        if (!Subtarget->isLittle())
+          std::swap (Lo, Hi);
+        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(1, dl, MVT::i32));
+      }
+    } else {
+      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                               InFlag);
+      Chain = Val.getValue(1);
+      InFlag = Val.getValue(2);
+    }
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack.
+SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+                                            SDValue Arg, const SDLoc &dl,
+                                            SelectionDAG &DAG,
+                                            const CCValAssign &VA,
+                                            ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                       StackPtr, PtrOff);
+  return DAG.getStore(
+      Chain, dl, Arg, PtrOff,
+      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+}
+
+void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
+                                         SDValue Chain, SDValue &Arg,
+                                         RegsToPassVector &RegsToPass,
+                                         CCValAssign &VA, CCValAssign &NextVA,
+                                         SDValue &StackPtr,
+                                         SmallVectorImpl<SDValue> &MemOpChains,
+                                         ISD::ArgFlagsTy Flags) const {
+
+  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
+  unsigned id = Subtarget->isLittle() ? 0 : 1;
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
+
+  if (NextVA.isRegLoc())
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
+  else {
+    assert(NextVA.isMemLoc());
+    if (!StackPtr.getNode())
+      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
+                                    getPointerTy(DAG.getDataLayout()));
+
+    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
+                                           dl, DAG, NextVA,
+                                           Flags));
+  }
+}
+
+/// LowerCall - Lowering a call into a callseq_start <-
+/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool doesNotRet                       = CLI.DoesNotReturn;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool isThisReturn   = false;
+  bool isSibCall      = false;
+  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+
+  // Disable tail calls if they're not supported.
+  if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
+    isTailCall = false;
+
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
+                                                   Outs, OutVals, Ins, DAG);
+    if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+    // We don't support GuaranteedTailCallOpt for ARM, only automatically
+    // detected sibcalls.
+    if (isTailCall) {
+      ++NumTailCalls;
+      isSibCall = true;
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // For tail calls, memory operands are available in our caller's stack.
+  if (isSibCall)
+    NumBytes = 0;
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!isSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+
+  SDValue StackPtr =
+      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
+
+  RegsToPassVector RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization, arguments are handled later.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(0, dl, MVT::i32));
+        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(1, dl, MVT::i32));
+
+        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
+                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        if (VA.isRegLoc()) {
+          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
+                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+        } else {
+          assert(VA.isMemLoc());
+
+          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+                                                 dl, DAG, VA, Flags));
+        }
+      } else {
+        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                         StackPtr, MemOpChains, Flags);
+      }
+    } else if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+        assert(VA.getLocVT() == MVT::i32 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
+               "unexpected use of 'returned'");
+        isThisReturn = true;
+      }
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (isByVal) {
+      assert(VA.isMemLoc());
+      unsigned offset = 0;
+
+      // True if this byval aggregate will be split between registers
+      // and memory.
+      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
+      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
+
+      if (CurByValIdx < ByValArgsCount) {
+
+        unsigned RegBegin, RegEnd;
+        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
+
+        EVT PtrVT =
+            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+        unsigned int i, j;
+        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
+          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
+          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     DAG.InferPtrAlignment(AddArg));
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(j, Load));
+        }
+
+        // If parameter size outsides register area, "offset" value
+        // helps us to calculate stack slot for remained part properly.
+        offset = RegEnd - RegBegin;
+
+        CCInfo.nextInRegsParam();
+      }
+
+      if (Flags.getByValSize() > 4*offset) {
+        auto PtrVT = getPointerTy(DAG.getDataLayout());
+        unsigned LocMemOffset = VA.getLocMemOffset();
+        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
+        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
+        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
+                                           MVT::i32);
+        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
+                                            MVT::i32);
+
+        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
+        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+                                          Ops));
+      }
+    } else if (!isSibCall) {
+      assert(VA.isMemLoc());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag = SDValue();
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool isDirect = false;
+
+  const TargetMachine &TM = getTargetMachine();
+  const Module *Mod = MF.getFunction()->getParent();
+  const GlobalValue *GV = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    GV = G->getGlobal();
+  bool isStub =
+      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+
+  bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
+  bool isLocalARMFunc = false;
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  auto PtrVt = getPointerTy(DAG.getDataLayout());
+
+  if (Subtarget->genLongCalls()) {
+    assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
+           "long-calls codegen is not position independent!");
+    // Handle a global address or an external symbol. If it's not one of
+    // those, the target's already in a register, so we don't need to do
+    // anything extra.
+    if (isa<GlobalAddressSDNode>(Callee)) {
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(
+          PtrVt, dl, DAG.getEntryNode(), CPAddr,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 0);
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(
+          PtrVt, dl, DAG.getEntryNode(), CPAddr,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    }
+  } else if (isa<GlobalAddressSDNode>(Callee)) {
+    // If we're optimizing for minimum size and the function is called three or
+    // more times in this block, we can improve codesize by calling indirectly
+    // as BLXr has a 16-bit encoding.
+    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+    auto *BB = CLI.CS->getParent();
+    bool PreferIndirect =
+        Subtarget->isThumb() && MF.getFunction()->optForMinSize() &&
+        count_if(GV->users(), [&BB](const User *U) {
+          return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
+        }) > 2;
+
+    if (!PreferIndirect) {
+      isDirect = true;
+      bool isDef = GV->isStrongDefinitionForLinker();
+
+      // ARM call to a local ARM function is predicable.
+      isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
+      // tBX takes a register source operand.
+      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
+        Callee = DAG.getNode(
+            ARMISD::WrapperPIC, dl, PtrVt,
+            DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
+        Callee = DAG.getLoad(
+            PtrVt, dl, DAG.getEntryNode(), Callee,
+            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+            /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
+                                     MachineMemOperand::MOInvariant);
+      } else if (Subtarget->isTargetCOFF()) {
+        assert(Subtarget->isTargetWindows() &&
+               "Windows is the only supported COFF target");
+        unsigned TargetFlags = GV->hasDLLImportStorageClass()
+                                   ? ARMII::MO_DLLIMPORT
+                                   : ARMII::MO_NO_FLAG;
+        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
+                                            TargetFlags);
+        if (GV->hasDLLImportStorageClass())
+          Callee =
+              DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
+                          DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
+                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+      } else {
+        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+      }
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    isDirect = true;
+    // tBX takes a register source operand.
+    const char *Sym = S->getSymbol();
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(
+          PtrVt, dl, DAG.getEntryNode(), CPAddr,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
+    } else {
+      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
+    }
+  }
+
+  // FIXME: handle tail calls differently.
+  unsigned CallOpc;
+  if (Subtarget->isThumb()) {
+    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = ARMISD::CALL;
+  } else {
+    if (!isDirect && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
+             // Emit regular call when code size is the priority
+             !MF.getFunction()->optForMinSize())
+      // "mov lr, pc; b _foo" to avoid confusing the RSP
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  if (!isTailCall) {
+    const uint32_t *Mask;
+    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
+    if (isThisReturn) {
+      // For 'this' returns, use the R0-preserving mask if applicable
+      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
+      if (!Mask) {
+        // Set isThisReturn to false if the calling convention is not one that
+        // allows 'returned' to be modeled in this way, so LowerCallResult does
+        // not try to pass 'this' straight through
+        isThisReturn = false;
+        Mask = ARI->getCallPreservedMask(MF, CallConv);
+      }
+    } else
+      Mask = ARI->getCallPreservedMask(MF, CallConv);
+
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (isTailCall) {
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
+  }
+
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, isThisReturn,
+                         isThisReturn ? OutVals[0] : SDValue());
+}
+
+/// HandleByVal - Every parameter *after* a byval parameter is passed
+/// on the stack.  Remember the next parameter register to allocate,
+/// and then confiscate the rest of the parameter registers to insure
+/// this.
+void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                    unsigned Align) const {
+  assert((State->getCallOrPrologue() == Prologue ||
+          State->getCallOrPrologue() == Call) &&
+         "unhandled ParmContext");
+
+  // Byval (as with any stack) slots are always at least 4 byte aligned.
+  Align = std::max(Align, 4U);
+
+  unsigned Reg = State->AllocateReg(GPRArgRegs);
+  if (!Reg)
+    return;
+
+  unsigned AlignInRegs = Align / 4;
+  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
+  for (unsigned i = 0; i < Waste; ++i)
+    Reg = State->AllocateReg(GPRArgRegs);
+
+  if (!Reg)
+    return;
+
+  unsigned Excess = 4 * (ARM::R4 - Reg);
+
+  // Special case when NSAA != SP and parameter size greater than size of
+  // all remained GPR regs. In that case we can't split parameter, we must
+  // send it to stack. We also must set NCRN to R4, so waste all
+  // remained registers.
+  const unsigned NSAAOffset = State->getNextStackOffset();
+  if (NSAAOffset != 0 && Size > Excess) {
+    while (State->AllocateReg(GPRArgRegs))
+      ;
+    return;
+  }
+
+  // First register for byval parameter is the first register that wasn't
+  // allocated before this method call, so it would be "reg".
+  // If parameter is small enough to be saved in range [reg, r4), then
+  // the end (first after last) register would be reg + param-size-in-regs,
+  // else parameter would be splitted between registers and stack,
+  // end register would be r4 in this case.
+  unsigned ByValRegBegin = Reg;
+  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
+  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+  // Note, first register is allocated in the beginning of function already,
+  // allocate remained amount of registers we need.
+  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
+    State->AllocateReg(GPRArgRegs);
+  // A byval parameter that is split between registers and memory needs its
+  // size truncated here.
+  // In the case where the entire structure fits in registers, we set the
+  // size in memory to zero.
+  Size = std::max<int>(Size - Excess, 0);
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
+                         const TargetInstrInfo *TII) {
+  unsigned Bytes = Arg.getValueSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(*Def, FI))
+        return false;
+    } else {
+      return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI.isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+
+  assert(Subtarget->supportsTailCall());
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Exception-handling functions need a special set of instructions to indicate
+  // a return to the hardware. Tail-calling another function would probably
+  // break this.
+  if (CallerF->hasFnAttribute("interrupt"))
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on ARM when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    const Triple &TT = getTargetMachine().getTargetTriple();
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+      return false;
+  }
+
+  // Check that the call results are passed in the same way.
+  LLVMContext &C = *DAG.getContext();
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+                                  CCAssignFnForReturn(CalleeCC, isVarArg),
+                                  CCAssignFnForReturn(CallerCC, isVarArg)))
+    return false;
+  // The callee has to preserve all registers the caller needs to preserve.
+  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  if (CalleeCC != CallerCC) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // If Caller's vararg or byval argument has been split between registers and
+  // stack, do not perform tail call, since part of the argument is in caller's
+  // local frame.
+  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
+  if (AFI_Caller->getArgRegsSaveSize())
+    return false;
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
+    if (CCInfo.getNextStackOffset()) {
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+           i != e;
+           ++i, ++realArgIdx) {
+        CCValAssign &VA = ArgLocs[i];
+        EVT RegVT = VA.getLocVT();
+        SDValue Arg = OutVals[realArgIdx];
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (VA.needsCustom()) {
+          // f64 and vector types are split into multiple registers or
+          // register/stack-slot combinations.  The types will not match
+          // the registers; give up on memory f64 refs until we figure
+          // out what to do about this.
+          if (!VA.isRegLoc())
+            return false;
+          if (!ArgLocs[++i].isRegLoc())
+            return false;
+          if (RegVT == MVT::v2f64) {
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+          }
+        } else if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+      return false;
+  }
+
+  return true;
+}
+
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
+}
+
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+
+  StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
+
+  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+  // version of the "preferred return address". These offsets affect the return
+  // instruction if this is a return from PL1 without hypervisor extensions.
+  //    IRQ/FIQ: +4     "subs pc, lr, #4"
+  //    SWI:     0      "subs pc, lr, #0"
+  //    ABORT:   +4     "subs pc, lr, #4"
+  //    UNDEF:   +4/+2  "subs pc, lr, #0"
+  // UNDEF varies depending on where the exception came from ARM or Thumb
+  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+
+  int64_t LROffset;
+  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
+      IntKind == "ABORT")
+    LROffset = 4;
+  else if (IntKind == "SWI" || IntKind == "UNDEF")
+    LROffset = 0;
+  else
+    report_fatal_error("Unsupported interrupt attribute. If present, value "
+                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+
+  RetOps.insert(RetOps.begin() + 1,
+                DAG.getConstant(LROffset, DL, MVT::i32, false));
+
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
+
+  // Analyze outgoing return values.
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  bool isLittleEndian = Subtarget->isLittle();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  AFI->setReturnRegsCount(RVLocs.size());
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        // Extract the first half and return it in two registers.
+        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                   DAG.getConstant(0, dl, MVT::i32));
+        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
+                                 Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
+                                 Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+
+        // Extract the 2nd half and fall through to handle it as an f64 value.
+        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                          DAG.getConstant(1, dl, MVT::i32));
+      }
+      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
+      // available.
+      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 0 : 1),
+                               Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 1 : 0),
+                               Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+    // Guarantee that all emitted copies are
+    // stuck together, avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (ARM::GPRRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+      else if (ARM::DPRRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // CPUs which aren't M-class use a special sequence to return from
+  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+  // though we use "subs pc, lr, #N").
+  //
+  // M-class CPUs actually use a normal return sequence with a special
+  // (hardware-provided) value in LR, so the normal code path works.
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+      !Subtarget->isMClass()) {
+    if (Subtarget->isThumb1Only())
+      report_fatal_error("interrupt attribute is not supported in Thumb1");
+    return LowerInterruptReturn(RetOps, dl, DAG);
+  }
+
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
+    SDNode *VMov = Copy;
+    // f64 returned in a pair of GPRs.
+    SmallPtrSet<SDNode*, 2> Copies;
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      if (UI->getOpcode() != ISD::CopyToReg)
+        return false;
+      Copies.insert(*UI);
+    }
+    if (Copies.size() > 2)
+      return false;
+
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      SDValue UseChain = UI->getOperand(0);
+      if (Copies.count(UseChain.getNode()))
+        // Second CopyToReg
+        Copy = *UI;
+      else {
+        // We are at the top of this chain.
+        // If the copy has a glue operand, we conservatively assume it
+        // isn't safe to perform a tail call.
+        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
+          return false;
+        // First CopyToReg
+        TCChain = UseChain;
+      }
+    }
+  } else if (Copy->getOpcode() == ISD::BITCAST) {
+    // f32 returned in a single GPR.
+    if (!Copy->hasOneUse())
+      return false;
+    Copy = *Copy->use_begin();
+    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
+      return false;
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else {
+    return false;
+  }
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != ARMISD::RET_FLAG &&
+        UI->getOpcode() != ARMISD::INTRET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!Subtarget->supportsTailCall())
+    return false;
+
+  auto Attr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+    return false;
+
+  return true;
+}
+
+// Trying to write a 64 bit value so need to split into two 32 bit values first,
+// and pass the lower and high parts through.
+static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue WriteValue = Op->getOperand(2);
+
+  // This function is only supposed to be called for i64 type argument.
+  assert(WriteValue.getValueType() == MVT::i64
+          && "LowerWRITE_REGISTER called for non-i64 type argument.");
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(1, DL, MVT::i32));
+  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
+  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  // FIXME there is no actual debug info here
+  SDLoc dl(Op);
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
+}
+
+unsigned ARMTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = 0;
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue CPAddr;
+  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
+  if (!IsPositionIndependent) {
+    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+  } else {
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMPCLabelIndex = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+                                      ARMCP::CPBlockAddress, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
+  SDValue Result = DAG.getLoad(
+      PtrVT, DL, DAG.getEntryNode(), CPAddr,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  if (!IsPositionIndependent)
+    return Result;
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
+}
+
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address for Darwin, and return an
+/// SDValue containing the final node.
+
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+///     + "extern __thread" declaration.
+///     + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i32] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first word, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "r0".
+///
+/// Since this descriptor may be in a different unit, in general access must
+/// proceed along the usual ARM rules. A common sequence to produce is:
+///
+///     movw rT1, :lower16:_var$non_lazy_ptr
+///     movt rT1, :upper16:_var$non_lazy_ptr
+///     ldr r0, [rT1]
+///     ldr rT2, [r0]
+///     blx rT2
+///     [...address now in r0...]
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+  SDLoc DL(Op);
+
+  // First step is to get the address of the actua global symbol. This is where
+  // the TLS descriptor lives.
+  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
+
+  // The first entry in the descriptor is a function pointer that we must call
+  // to obtain the address of the variable.
+  SDValue Chain = DAG.getEntryNode();
+  SDValue FuncTLVGet = DAG.getLoad(
+      MVT::i32, DL, Chain, DescAddr,
+      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+      /* Alignment = */ 4,
+      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant);
+  Chain = FuncTLVGet.getValue(1);
+
+  MachineFunction &F = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = F.getFrameInfo();
+  MFI.setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+  // silly).
+  auto TRI =
+      getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
+  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
+
+  // Finally, we can make the call. This is just a degenerate version of a
+  // normal AArch64 call node: r0 takes the address of the descriptor, and
+  // returns the address of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
+  Chain =
+      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
+
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+
+  // Load the current TEB (thread environment block)
+  SDValue Ops[] = {Chain,
+                   DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+                   DAG.getConstant(15, DL, MVT::i32),
+                   DAG.getConstant(0, DL, MVT::i32),
+                   DAG.getConstant(13, DL, MVT::i32),
+                   DAG.getConstant(0, DL, MVT::i32),
+                   DAG.getConstant(2, DL, MVT::i32)};
+  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);
+
+  SDValue TEB = CurrentTEB.getValue(0);
+  Chain = CurrentTEB.getValue(1);
+
+  // Load the ThreadLocalStoragePointer from the TEB
+  // A pointer to the TLS array is located at offset 0x2c from the TEB.
+  SDValue TLSArray =
+      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
+  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
+
+  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
+  // offset into the TLSArray.
+
+  // Load the TLS index from the C runtime
+  SDValue TLSIndex =
+      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
+  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
+  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
+
+  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
+                              DAG.getConstant(2, DL, MVT::i32));
+  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
+                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
+                            MachinePointerInfo());
+
+  // Get the offset of the start of the .tls section (section base)
+  const auto *GA = cast<GlobalAddressSDNode>(Op);
+  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
+  SDValue Offset = DAG.getLoad(
+      PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+                                    DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+  Argument = DAG.getLoad(
+      PtrVT, dl, DAG.getEntryNode(), Argument,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  SDValue Chain = Argument.getValue(1);
+
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+  // call __tls_get_addr.
+  ArgListTy Args;
+  ArgListEntry Entry;
+  Entry.Node = Argument;
+  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
+  Args.push_back(Entry);
+
+  // FIXME: is there useful debug info available here?
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+               DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                        SelectionDAG &DAG,
+                                        TLSModel::Model model) const {
+  const GlobalValue *GV = GA->getGlobal();
+  SDLoc dl(GA);
+  SDValue Offset;
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  // Get the Thread Pointer
+  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+  if (model == TLSModel::InitialExec) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    // Initial exec model.
+    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+                                      true);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(
+        PtrVT, dl, Chain, Offset,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    Chain = Offset.getValue(1);
+
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+    Offset = DAG.getLoad(
+        PtrVT, dl, Chain, Offset,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  } else {
+    // local exec model
+    assert(model == TLSModel::LocalExec);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(
+        PtrVT, dl, Chain, Offset,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerGlobalTLSAddressDarwin(Op, DAG);
+
+  if (Subtarget->isTargetWindows())
+    return LowerGlobalTLSAddressWindows(Op, DAG);
+
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  switch (model) {
+    case TLSModel::GeneralDynamic:
+    case TLSModel::LocalDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModels(GA, DAG, model);
+  }
+  llvm_unreachable("bogus TLS model");
+}
+
+/// Return true if all users of V are within function F, looking through
+/// ConstantExprs.
+static bool allUsersAreInFunction(const Value *V, const Function *F) {
+  SmallVector<const User*,4> Worklist;
+  for (auto *U : V->users())
+    Worklist.push_back(U);
+  while (!Worklist.empty()) {
+    auto *U = Worklist.pop_back_val();
+    if (isa<ConstantExpr>(U)) {
+      for (auto *UU : U->users())
+        Worklist.push_back(UU);
+      continue;
+    }
+
+    auto *I = dyn_cast<Instruction>(U);
+    if (!I || I->getParent()->getParent() != F)
+      return false;
+  }
+  return true;
+}
+
+/// Return true if all users of V are within some (any) function, looking through
+/// ConstantExprs. In other words, are there any global constant users?
+static bool allUsersAreInFunctions(const Value *V) {
+  SmallVector<const User*,4> Worklist;
+  for (auto *U : V->users())
+    Worklist.push_back(U);
+  while (!Worklist.empty()) {
+    auto *U = Worklist.pop_back_val();
+    if (isa<ConstantExpr>(U)) {
+      for (auto *UU : U->users())
+        Worklist.push_back(UU);
+      continue;
+    }
+
+    if (!isa<Instruction>(U))
+      return false;
+  }
+  return true;
+}
+
+// Return true if T is an integer, float or an array/vector of either.
+static bool isSimpleType(Type *T) {
+  if (T->isIntegerTy() || T->isFloatingPointTy())
+    return true;
+  Type *SubT = nullptr;
+  if (T->isArrayTy())
+    SubT = T->getArrayElementType();
+  else if (T->isVectorTy())
+    SubT = T->getVectorElementType();
+  else
+    return false;
+  return SubT->isIntegerTy() || SubT->isFloatingPointTy();
+}
+
+static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
+                                     EVT PtrVT, SDLoc dl) {
+  // If we're creating a pool entry for a constant global with unnamed address,
+  // and the global is small enough, we can emit it inline into the constant pool
+  // to save ourselves an indirection.
+  //
+  // This is a win if the constant is only used in one function (so it doesn't
+  // need to be duplicated) or duplicating the constant wouldn't increase code
+  // size (implying the constant is no larger than 4 bytes).
+  const Function *F = DAG.getMachineFunction().getFunction();
+  
+  // We rely on this decision to inline being idemopotent and unrelated to the
+  // use-site. We know that if we inline a variable at one use site, we'll
+  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
+  // doesn't know about this optimization, so bail out if it's enabled else
+  // we could decide to inline here (and thus never emit the GV) but require
+  // the GV from fast-isel generated code.
+  if (!EnableConstpoolPromotion ||
+      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
+      return SDValue();
+
+  auto *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar || !GVar->hasInitializer() ||
+      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
+      !GVar->hasLocalLinkage())
+    return SDValue();
+
+  // Ensure that we don't try and inline any type that contains pointers. If
+  // we inline a value that contains relocations, we move the relocations from
+  // .data to .text which is not ideal.
+  auto *Init = GVar->getInitializer();
+  if (!isSimpleType(Init->getType()))
+    return SDValue();
+
+  // The constant islands pass can only really deal with alignment requests
+  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
+  // any type wanting greater alignment requirements than 4 bytes. We also
+  // can only promote constants that are multiples of 4 bytes in size or
+  // are paddable to a multiple of 4. Currently we only try and pad constants
+  // that are strings for simplicity.
+  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
+  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
+  unsigned Align = GVar->getAlignment();
+  unsigned RequiredPadding = 4 - (Size % 4);
+  bool PaddingPossible =
+    RequiredPadding == 4 || (CDAInit && CDAInit->isString());
+  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
+    return SDValue();
+
+  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // We can't bloat the constant pool too much, else the ConstantIslands pass
+  // may fail to converge. If we haven't promoted this global yet (it may have
+  // multiple uses), and promoting it would increase the constant pool size (Sz
+  // > 4), ensure we have space to do so up to MaxTotal.
+  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
+    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
+        ConstpoolPromotionMaxTotal)
+      return SDValue();
+
+  // This is only valid if all users are in a single function OR it has users
+  // in multiple functions but it no larger than a pointer. We also check if
+  // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
+  // address taken.
+  if (!allUsersAreInFunction(GVar, F) &&
+      !(Size <= 4 && allUsersAreInFunctions(GVar)))
+    return SDValue();
+
+  // We're going to inline this global. Pad it out if needed.
+  if (RequiredPadding != 4) {
+    StringRef S = CDAInit->getAsString();
+
+    SmallVector<uint8_t,16> V(S.size());
+    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
+    while (RequiredPadding--)
+      V.push_back(0);
+    Init = ConstantDataArray::get(*DAG.getContext(), V);
+  }
+
+  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
+  SDValue CPAddr =
+    DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
+  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
+    AFI->markGlobalAsPromotedToConstantPool(GVar);
+    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
+                                      PaddedSize - 4);
+  }
+  ++NumConstpoolPromoted;
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const TargetMachine &TM = getTargetMachine();
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->getBaseObject();
+  bool IsRO =
+      (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
+      isa<Function>(GV);
+
+  // promoteToConstantPool only if not generating XO text section
+  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
+    if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
+      return V;
+
+  if (isPositionIndependent()) {
+    bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    SDLoc dl(Op);
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+        GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj,
+        UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
+        /*AddCurrentAddress=*/UseGOT_PREL);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result = DAG.getLoad(
+        PtrVT, dl, DAG.getEntryNode(), CPAddr,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    SDValue Chain = Result.getValue(1);
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    if (UseGOT_PREL)
+      Result =
+          DAG.getLoad(PtrVT, dl, Chain, Result,
+                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+    return Result;
+  } else if (Subtarget->isROPI() && IsRO) {
+    // PC-relative.
+    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
+    return Result;
+  } else if (Subtarget->isRWPI() && !IsRO) {
+    // SB-relative.
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue G = DAG.getLoad(
+        PtrVT, dl, DAG.getEntryNode(), CPAddr,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
+    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
+    return Result;
+  }
+
+  // If we have T2 ops, we can materialize the address directly via movt/movw
+  // pair. This is always cheaper.
+  if (Subtarget->useMovt(DAG.getMachineFunction())) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+  } else {
+    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    return DAG.getLoad(
+        PtrVT, dl, DAG.getEntryNode(), CPAddr,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+  }
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+         "ROPI/RWPI not currently supported for Darwin");
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  if (Subtarget->useMovt(DAG.getMachineFunction()))
+    ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into multiple nodes
+  unsigned Wrapper =
+      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
+
+  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
+  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
+
+  if (Subtarget->isGVIndirectSymbol(GV))
+    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+  return Result;
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
+  assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
+         "Windows on ARM expects to use movw/movt");
+  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+         "ROPI/RWPI not currently supported for Windows");
+
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const ARMII::TOF TargetFlags =
+    (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result;
+  SDLoc DL(Op);
+
+  ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into two nodes.
+  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0,
+                                                  TargetFlags));
+  if (GV->hasDLLImportStorageClass())
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+  return Result;
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Val = DAG.getConstant(0, dl, MVT::i32);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+                     Op.getOperand(1), Val);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
+}
+
+SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
+                     Op.getOperand(0));
+}
+
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                          const ARMSubtarget *Subtarget) const {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::arm_rbit: {
+    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
+           "RBIT intrinsic must have i32 type!");
+    return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
+  }
+  case Intrinsic::thread_pointer: {
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+  }
+  case Intrinsic::eh_sjlj_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue CPAddr;
+    bool IsPositionIndependent = isPositionIndependent();
+    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
+                                      ARMCP::CPLSDA, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result = DAG.getLoad(
+        PtrVT, dl, DAG.getEntryNode(), CPAddr,
+        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+    if (IsPositionIndependent) {
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
+      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    }
+    return Result;
+  }
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
+      ? ARMISD::VMULLs : ARMISD::VMULLu;
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+  case Intrinsic::arm_neon_vminnm:
+  case Intrinsic::arm_neon_vmaxnm: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
+      ? ISD::FMINNUM : ISD::FMAXNUM;
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+  case Intrinsic::arm_neon_vminu:
+  case Intrinsic::arm_neon_vmaxu: {
+    if (Op.getValueType().isFloatingPoint())
+      return SDValue();
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
+      ? ISD::UMIN : ISD::UMAX;
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+  }
+  case Intrinsic::arm_neon_vmins:
+  case Intrinsic::arm_neon_vmaxs: {
+    // v{min,max}s is overloaded between signed integers and floats.
+    if (!Op.getValueType().isFloatingPoint()) {
+      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
+        ? ISD::SMIN : ISD::SMAX;
+      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    }
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
+      ? ISD::FMINNAN : ISD::FMAXNAN;
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+  }
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  // FIXME: handle "fence singlethread" more efficiently.
+  SDLoc dl(Op);
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, dl, MVT::i32));
+  }
+
+  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
+  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
+  if (Subtarget->isMClass()) {
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = ARM_MB::SY;
+  } else if (Subtarget->preferISHSTBarriers() &&
+             Ord == AtomicOrdering::Release) {
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
+                     DAG.getConstant(Domain, dl, MVT::i32));
+}
+
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  SDLoc dl(Op);
+  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (Subtarget->isThumb()) {
+    // Invert the bits.
+    isRead = ~isRead & 1;
+    isData = ~isData & 1;
+  }
+
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
+                     DAG.getConstant(isData, dl, MVT::i32));
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc dl(Op);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
+                                                CCValAssign &NextVA,
+                                                SDValue &Root,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &dl) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const TargetRegisterClass *RC;
+  if (AFI->isThumb1OnlyFunction())
+    RC = &ARM::tGPRRegClass;
+  else
+    RC = &ARM::GPRRegClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+  SDValue ArgValue2;
+  if (NextVA.isMemLoc()) {
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    ArgValue2 = DAG.getLoad(
+        MVT::i32, dl, Root, FIN,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  } else {
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+  }
+  if (!Subtarget->isLittle())
+    std::swap (ArgValue, ArgValue2);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+// The remaining GPRs hold either the beginning of variable-argument
+// data, or the beginning of an aggregate passed by value (usually
+// byval).  Either way, we allocate stack slots adjacent to the data
+// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with
+// these values; otherwise, this reassembles a (byval) structure that
+// was split between registers and memory.
+// Return: The frame index registers were stored into.
+int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                                      const SDLoc &dl, SDValue &Chain,
+                                      const Value *OrigArg,
+                                      unsigned InRegsParamRecordIdx,
+                                      int ArgOffset, unsigned ArgSize) const {
+  // Currently, two use-cases possible:
+  // Case #1. Non-var-args function, and we meet first byval parameter.
+  //          Setup first unallocated register as first byval register;
+  //          eat all remained registers
+  //          (these two actions are performed by HandleByVal method).
+  //          Then, here, we initialize stack frame with
+  //          "store-reg" instructions.
+  // Case #2. Var-args function, that doesn't contain byval parameters.
+  //          The same: eat all remained unallocated registers,
+  //          initialize stack frame.
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned RBegin, REnd;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+  } else {
+    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
+    REnd = ARM::R4;
+  }
+
+  if (REnd != RBegin)
+    ArgOffset = -4 * (ARM::R4 - RBegin);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
+  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
+
+  SmallVector<SDValue, 4> MemOps;
+  const TargetRegisterClass *RC =
+      AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+
+  for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
+    unsigned VReg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+    SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(OrigArg, 4 * i));
+    MemOps.push_back(Store);
+    FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  return FrameIndex;
+}
+
+// Setup stack frame, the va_list pointer will start from.
+void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                             const SDLoc &dl, SDValue &Chain,
+                                             unsigned ArgOffset,
+                                             unsigned TotalArgRegsSaveSize,
+                                             bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Try to store any remaining integer argument regs
+  // to their spots on the stack so that they may be loaded by dereferencing
+  // the result of va_next.
+  // If there is no regs to be stored, just point address after last
+  // argument passed via stack.
+  int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
+                                  CCInfo.getInRegsParamsCount(),
+                                  CCInfo.getNextStackOffset(), 4);
+  AFI->setVarArgsFrameIndex(FrameIndex);
+}
+
+SDValue ARMTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Prologue);
+  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
+
+  SmallVector<SDValue, 16> ArgValues;
+  SDValue ArgValue;
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+
+  // Initially ArgRegsSaveSize is zero.
+  // Then we increase this value each time we meet byval parameter.
+  // We also increase this value in case of varargs function.
+  AFI->setArgRegsSaveSize(0);
+
+  // Calculate the amount of stack space that we need to allocate to store
+  // byval and variadic arguments that are passed in registers.
+  // We need to know this before we allocate the first byval or variadic
+  // argument, as they will be allocated a stack slot below the CFA (Canonical
+  // Frame Address, the stack pointer at entry to the function).
+  unsigned ArgRegBegin = ARM::R4;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
+      break;
+
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Index = VA.getValNo();
+    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
+    unsigned RBegin, REnd;
+    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
+    ArgRegBegin = std::min(ArgRegBegin, RBegin);
+
+    CCInfo.nextInRegsParam();
+  }
+  CCInfo.rewindByValRegsInfo();
+
+  int lastInsIndex = -1;
+  if (isVarArg && MFI.hasVAStart()) {
+    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
+    if (RegIdx != array_lengthof(GPRArgRegs))
+      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
+  }
+
+  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
+  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (Ins[VA.getValNo()].isOrigArg()) {
+      std::advance(CurOrigArg,
+                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
+    }
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+
+      if (VA.needsCustom()) {
+        // f64 and vector types are split up into multiple registers or
+        // combinations of registers and stack slots.
+        if (VA.getLocVT() == MVT::v2f64) {
+          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Chain, DAG, dl);
+          VA = ArgLocs[++i]; // skip ahead to next loc
+          SDValue ArgValue2;
+          if (VA.isMemLoc()) {
+            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
+            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+                                    MachinePointerInfo::getFixedStack(
+                                        DAG.getMachineFunction(), FI));
+          } else {
+            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                             Chain, DAG, dl);
+          }
+          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue1,
+                                 DAG.getIntPtrConstant(0, dl));
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue2,
+                                 DAG.getIntPtrConstant(1, dl));
+        } else
+          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+
+      } else {
+        const TargetRegisterClass *RC;
+
+        if (RegVT == MVT::f32)
+          RC = &ARM::SPRRegClass;
+        else if (RegVT == MVT::f64)
+          RC = &ARM::DPRRegClass;
+        else if (RegVT == MVT::v2f64)
+          RC = &ARM::QPRRegClass;
+        else if (RegVT == MVT::i32)
+          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
+        else
+          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+        // Transform the arguments in physical registers into virtual ones.
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      }
+
+      // If this is an 8 or 16-bit value, it is really passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::SExt:
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::ZExt:
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+      int index = VA.getValNo();
+
+      // Some Ins[] entries become multiple ArgLoc[] entries.
+      // Process them only once.
+      if (index != lastInsIndex)
+        {
+          ISD::ArgFlagsTy Flags = Ins[index].Flags;
+          // FIXME: For now, all byval parameter objects are marked mutable.
+          // This can be changed with more analysis.
+          // In case of tail call optimization mark all arguments mutable.
+          // Since they could be overwritten by lowering of arguments in case of
+          // a tail call.
+          if (Flags.isByVal()) {
+            assert(Ins[index].isOrigArg() &&
+                   "Byval arguments cannot be implicit");
+            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
+
+            int FrameIndex = StoreByValRegs(
+                CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
+                VA.getLocMemOffset(), Flags.getByValSize());
+            InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
+            CCInfo.nextInRegsParam();
+          } else {
+            unsigned FIOffset = VA.getLocMemOffset();
+            int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
+                                           FIOffset, true);
+
+            // Create load nodes to retrieve arguments from the stack.
+            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                         MachinePointerInfo::getFixedStack(
+                                             DAG.getMachineFunction(), FI)));
+          }
+          lastInsIndex = index;
+        }
+    }
+  }
+
+  // varargs
+  if (isVarArg && MFI.hasVAStart())
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
+                         CCInfo.getNextStackOffset(),
+                         TotalArgRegsSaveSize);
+
+  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
+
+  return Chain;
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isPosZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->getValueAPF().isPosZero();
+    }
+  } else if (Op->getOpcode() == ISD::BITCAST &&
+             Op->getValueType(0) == MVT::f64) {
+    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
+    // created by LowerConstantFP().
+    SDValue BitcastOp = Op->getOperand(0);
+    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
+        isNullConstant(BitcastOp->getOperand(0)))
+      return true;
+  }
+  return false;
+}
+
+/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                     SDValue &ARMcc, SelectionDAG &DAG,
+                                     const SDLoc &dl) const {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    unsigned C = RHSC->getZExtValue();
+    if (!isLegalICmpImmediate(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C - 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C + 1, dl, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+    // Uses only Z Flag
+    CompareType = ARMISD::CMPZ;
+    break;
+  }
+  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
+}
+
+/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
+SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
+                                     SelectionDAG &DAG, const SDLoc &dl) const {
+  assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
+  SDValue Cmp;
+  if (!isFloatingPointZero(RHS))
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+  else
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
+}
+
+/// duplicateCmp - Glue values can have only one use, so this function
+/// duplicates a comparison node.
+SDValue
+ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
+  unsigned Opc = Cmp.getOpcode();
+  SDLoc DL(Cmp);
+  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
+    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+
+  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
+  Cmp = Cmp.getOperand(0);
+  Opc = Cmp.getOpcode();
+  if (Opc == ARMISD::CMPFP)
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+  else {
+    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+  }
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
+}
+
+std::pair<SDValue, SDValue>
+ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
+                                 SDValue &ARMcc) const {
+  assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
+
+  SDValue Value, OverflowCmp;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  // FIXME: We are currently always generating CMPs because we don't support
+  // generating CMN through the backend. This is not as good as the natural
+  // CMP case because it causes a register dependency and cannot be folded
+  // later.
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
+    break;
+  case ISD::UADDO:
+    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
+    break;
+  case ISD::SSUBO:
+    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
+    break;
+  case ISD::USUBO:
+    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
+    break;
+  } // switch (...)
+
+  return std::make_pair(Value, OverflowCmp);
+}
+
+
+SDValue
+ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  SDValue Value, OverflowCmp;
+  SDValue ARMcc;
+  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDLoc dl(Op);
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
+  EVT VT = Op.getValueType();
+
+  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
+                                 ARMcc, CCR, OverflowCmp);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
+}
+
+
+SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue SelectTrue = Op.getOperand(1);
+  SDValue SelectFalse = Op.getOperand(2);
+  SDLoc dl(Op);
+  unsigned Opc = Cond.getOpcode();
+
+  if (Cond.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO)) {
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    SDValue Value, OverflowCmp;
+    SDValue ARMcc;
+    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    EVT VT = Op.getValueType();
+
+    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
+                   OverflowCmp, DAG);
+  }
+
+  // Convert:
+  //
+  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
+  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
+  //
+  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
+    const ConstantSDNode *CMOVTrue =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
+    const ConstantSDNode *CMOVFalse =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+
+    if (CMOVTrue && CMOVFalse) {
+      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
+      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
+
+      SDValue True;
+      SDValue False;
+      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
+        True = SelectTrue;
+        False = SelectFalse;
+      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
+        True = SelectFalse;
+        False = SelectTrue;
+      }
+
+      if (True.getNode() && False.getNode()) {
+        EVT VT = Op.getValueType();
+        SDValue ARMcc = Cond.getOperand(2);
+        SDValue CCR = Cond.getOperand(3);
+        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+        assert(True.getValueType() == VT);
+        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
+      }
+    }
+  }
+
+  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
+  // undefined bits before doing a full-word comparison with zero.
+  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
+                     DAG.getConstant(1, dl, Cond.getValueType()));
+
+  return DAG.getSelectCC(dl, Cond,
+                         DAG.getConstant(0, dl, Cond.getValueType()),
+                         SelectTrue, SelectFalse, ISD::SETNE);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  // Start by selecting the GE condition code for opcodes that return true for
+  // 'equality'
+  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+      CC == ISD::SETULE)
+    CondCode = ARMCC::GE;
+
+  // and GT for opcodes that return false for 'equality'.
+  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+           CC == ISD::SETULT)
+    CondCode = ARMCC::GT;
+
+  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+  // to swap the compare operands.
+  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+      CC == ISD::SETULT)
+    swpCmpOps = true;
+
+  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+  // If we have an unordered opcode, we need to swap the operands to the VSEL
+  // instruction (effectively negating the condition).
+  //
+  // This also has the effect of swapping which one of 'less' or 'greater'
+  // returns true, so we also swap the compare operands. It also switches
+  // whether we return true for 'equality', so we compensate by picking the
+  // opposite condition code to our original choice.
+  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+      CC == ISD::SETUGT) {
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+  }
+
+  // 'ordered' is 'anything but unordered', so use the VS condition code and
+  // swap the VSEL operands.
+  if (CC == ISD::SETO) {
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+  }
+
+  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+  // code and swap the VSEL operands.
+  if (CC == ISD::SETUNE) {
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+  }
+}
+
+SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
+                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
+                                   SDValue Cmp, SelectionDAG &DAG) const {
+  if (Subtarget->isFPOnlySP() && VT == MVT::f64) {
+    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
+    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
+                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
+
+    SDValue TrueLow = TrueVal.getValue(0);
+    SDValue TrueHigh = TrueVal.getValue(1);
+    SDValue FalseLow = FalseVal.getValue(0);
+    SDValue FalseHigh = FalseVal.getValue(1);
+
+    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
+                              ARMcc, CCR, Cmp);
+    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
+                               ARMcc, CCR, duplicateCmp(Cmp, DAG));
+
+    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
+  } else {
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
+  }
+}
+
+static bool isGTorGE(ISD::CondCode CC) {
+  return CC == ISD::SETGT || CC == ISD::SETGE;
+}
+
+static bool isLTorLE(ISD::CondCode CC) {
+  return CC == ISD::SETLT || CC == ISD::SETLE;
+}
+
+// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
+// All of these conditions (and their <= and >= counterparts) will do:
+//          x < k ? k : x
+//          x > k ? x : k
+//          k < x ? x : k
+//          k > x ? k : x
+static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  return (isGTorGE(CC) &&
+          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
+         (isLTorLE(CC) &&
+          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
+}
+
+// Similar to isLowerSaturate(), but checks for upper-saturating conditions.
+static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
+                            const SDValue TrueVal, const SDValue FalseVal,
+                            const ISD::CondCode CC, const SDValue K) {
+  return (isGTorGE(CC) &&
+          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
+         (isLTorLE(CC) &&
+          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
+}
+
+// Check if two chained conditionals could be converted into SSAT.
+//
+// SSAT can replace a set of two conditional selectors that bound a number to an
+// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
+//
+//     x < -k ? -k : (x > k ? k : x)
+//     x < -k ? -k : (x < k ? x : k)
+//     x > -k ? (x > k ? k : x) : -k
+//     x < k ? (x < -k ? -k : x) : k
+//     etc.
+//
+// It returns true if the conversion can be done, false otherwise.
+// Additionally, the variable is returned in parameter V and the constant in K.
+static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
+                                    uint64_t &K) {
+
+  SDValue LHS1 = Op.getOperand(0);
+  SDValue RHS1 = Op.getOperand(1);
+  SDValue TrueVal1 = Op.getOperand(2);
+  SDValue FalseVal1 = Op.getOperand(3);
+  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
+  if (Op2.getOpcode() != ISD::SELECT_CC)
+    return false;
+
+  SDValue LHS2 = Op2.getOperand(0);
+  SDValue RHS2 = Op2.getOperand(1);
+  SDValue TrueVal2 = Op2.getOperand(2);
+  SDValue FalseVal2 = Op2.getOperand(3);
+  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
+
+  // Find out which are the constants and which are the variables
+  // in each conditional
+  SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
+                                                        ? &RHS1
+                                                        : NULL;
+  SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
+                                                        ? &RHS2
+                                                        : NULL;
+  SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
+  SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
+  SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
+  SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
+
+  // We must detect cases where the original operations worked with 16- or
+  // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
+  // must work with sign-extended values but the select operations return
+  // the original non-extended value.
+  SDValue V2TmpReg = V2Tmp;
+  if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
+    V2TmpReg = V2Tmp->getOperand(0);
+
+  // Check that the registers and the constants have the correct values
+  // in both conditionals
+  if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
+      V2TmpReg != V2)
+    return false;
+
+  // Figure out which conditional is saturating the lower/upper bound.
+  const SDValue *LowerCheckOp =
+      isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+          ? &Op
+          : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+                                                                       : NULL;
+  const SDValue *UpperCheckOp =
+      isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
+          ? &Op
+          : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
+                                                                       : NULL;
+
+  if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
+    return false;
+
+  // Check that the constant in the lower-bound check is
+  // the opposite of the constant in the upper-bound check
+  // in 1's complement.
+  int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
+  int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
+  int64_t PosVal = std::max(Val1, Val2);
+
+  if (((Val1 > Val2 && UpperCheckOp == &Op) ||
+       (Val1 < Val2 && UpperCheckOp == &Op2)) &&
+      Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) {
+
+    V = V2;
+    K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
+    return true;
+  }
+
+  return false;
+}
+
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+
+  // Try to convert two saturating conditional selects into a single SSAT
+  SDValue SatValue;
+  uint64_t SatConstant;
+  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
+      isSaturatingConditional(Op, SatValue, SatConstant))
+    return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
+                       DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+
+  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+                                                    dl);
+
+    // If softenSetCCOperands only returned one value, we should compare it to
+    // zero.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  if (LHS.getValueType() == MVT::i32) {
+    // Try to generate VSEL on ARMv8.
+    // The VSEL instruction can't use all the usual ARM condition
+    // codes: it only has two bits to select the condition code, so it's
+    // constrained to use only GE, GT, VS and EQ.
+    //
+    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+    // swap the operands of the previous compare instruction (effectively
+    // inverting the compare condition, swapping 'less' and 'greater') and
+    // sometimes need to swap the operands to the VSEL (which inverts the
+    // condition in the sense of firing whenever the previous condition didn't)
+    if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
+      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+        CC = ISD::getSetCCInverse(CC, true);
+        std::swap(TrueVal, FalseVal);
+      }
+    }
+
+    SDValue ARMcc;
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  // Try to generate VMAXNM/VMINNM on ARMv8.
+  if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                  TrueVal.getValueType() == MVT::f64)) {
+    bool swpCmpOps = false;
+    bool swpVselOps = false;
+    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+      if (swpCmpOps)
+        std::swap(LHS, RHS);
+      if (swpVselOps)
+        std::swap(TrueVal, FalseVal);
+    }
+  }
+
+  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
+  if (CondCode2 != ARMCC::AL) {
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
+    // FIXME: Needs another CMP because flag can have but one use.
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
+  }
+  return Result;
+}
+
+/// canChangeToInt - Given the fp compare operand, return true if it is suitable
+/// to morph to an integer compare sequence.
+static bool canChangeToInt(SDValue Op, bool &SeenZero,
+                           const ARMSubtarget *Subtarget) {
+  SDNode *N = Op.getNode();
+  if (!N->hasOneUse())
+    // Otherwise it requires moving the value from fp to integer registers.
+    return false;
+  if (!N->getNumValues())
+    return false;
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
+    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
+    // vmrs are very slow, e.g. cortex-a8.
+    return false;
+
+  if (isFloatingPointZero(Op)) {
+    SeenZero = true;
+    return true;
+  }
+  return ISD::isNormalLoad(N);
+}
+
+static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
+  if (isFloatingPointZero(Op))
+    return DAG.getConstant(0, SDLoc(Op), MVT::i32);
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
+    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
+                       Ld->getPointerInfo(), Ld->getAlignment(),
+                       Ld->getMemOperand()->getFlags());
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
+                           SDValue &RetVal1, SDValue &RetVal2) {
+  SDLoc dl(Op);
+
+  if (isFloatingPointZero(Op)) {
+    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
+    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
+    return;
+  }
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
+    SDValue Ptr = Ld->getBasePtr();
+    RetVal1 =
+        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());
+
+    EVT PtrType = Ptr.getValueType();
+    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
+    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
+                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
+    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
+                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
+                          Ld->getMemOperand()->getFlags());
+    return;
+  }
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// f32 and even f64 comparisons to integer ones.
+SDValue
+ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  bool LHSSeenZero = false;
+  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
+  bool RHSSeenZero = false;
+  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
+  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
+    // If unsafe fp math optimization is enabled and there are no other uses of
+    // the CMP operands, and the condition code is EQ or NE, we can optimize it
+    // to an integer comparison.
+    if (CC == ISD::SETOEQ)
+      CC = ISD::SETEQ;
+    else if (CC == ISD::SETUNE)
+      CC = ISD::SETNE;
+
+    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
+    SDValue ARMcc;
+    if (LHS.getValueType() == MVT::f32) {
+      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(LHS, DAG), Mask);
+      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(RHS, DAG), Mask);
+      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                         Chain, Dest, ARMcc, CCR, Cmp);
+    }
+
+    SDValue LHS1, LHS2;
+    SDValue RHS1, RHS2;
+    expandf64Toi32(LHS, DAG, LHS1, LHS2);
+    expandf64Toi32(RHS, DAG, RHS1, RHS2);
+    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
+    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
+    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
+    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
+  }
+
+  return SDValue();
+}
+
+SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+                                                    dl);
+
+    // If softenSetCCOperands only returned one value, we should compare it to
+    // zero.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, dl, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMcc;
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                       Chain, Dest, ARMcc, CCR, Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  if (getTargetMachine().Options.UnsafeFPMath &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+       CC == ISD::SETNE || CC == ISD::SETUNE)) {
+    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
+      return Result;
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
+  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+  if (CondCode2 != ARMCC::AL) {
+    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
+    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
+    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
+  }
+  return Res;
+}
+
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  EVT PTy = getPointerTy(DAG.getDataLayout());
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
+  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
+    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
+    // which does another jump to the destination. This also makes it easier
+    // to translate it to TBB / TBH later (Thumb2 only).
+    // FIXME: This might not work if the function is extremely large.
+    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+                       Addr, Op.getOperand(2), JTI);
+  }
+  if (isPositionIndependent() || Subtarget->isROPI()) {
+    Addr =
+        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
+                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
+    Chain = Addr.getValue(1);
+    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
+  } else {
+    Addr =
+        DAG.getLoad(PTy, dl, Chain, Addr,
+                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
+    Chain = Addr.getValue(1);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
+  }
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+
+  if (Op.getValueType().getVectorElementType() == MVT::i32) {
+    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4i16)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+}
+
+SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+  if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
+    RTLIB::Libcall LC;
+    if (Op.getOpcode() == ISD::FP_TO_SINT)
+      LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    else
+      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
+                       /*isSigned*/ false, SDLoc(Op)).first;
+  }
+
+  return Op;
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+
+  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
+    if (VT.getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4f32)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  unsigned CastOpc;
+  unsigned Opc;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    CastOpc = ISD::SIGN_EXTEND;
+    Opc = ISD::SINT_TO_FP;
+    break;
+  case ISD::UINT_TO_FP:
+    CastOpc = ISD::ZERO_EXTEND;
+    Opc = ISD::UINT_TO_FP;
+    break;
+  }
+
+  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+  if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
+    RTLIB::Libcall LC;
+    if (Op.getOpcode() == ISD::SINT_TO_FP)
+      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    else
+      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
+                       /*isSigned*/ false, SDLoc(Op)).first;
+  }
+
+  return Op;
+}
+
+SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  // Implement fcopysign with a fabs and a conditional fneg.
+  SDValue Tmp0 = Op.getOperand(0);
+  SDValue Tmp1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  EVT SrcVT = Tmp1.getValueType();
+  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+    Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+  if (UseNEON) {
+    // Use VBSL to copy the sign bit.
+    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
+    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+    if (VT == MVT::f64)
+      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+                         DAG.getConstant(32, dl, MVT::i32));
+    else /*if (VT == MVT::f32)*/
+      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+    if (SrcVT == MVT::f32) {
+      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+      if (VT == MVT::f64)
+        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+                           DAG.getConstant(32, dl, MVT::i32));
+    } else if (VT == MVT::f32)
+      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
+                         DAG.getConstant(32, dl, MVT::i32));
+    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+                                            dl, MVT::i32);
+    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+    if (VT == MVT::f32) {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+                        DAG.getConstant(0, dl, MVT::i32));
+    } else {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+    }
+
+    return Res;
+  }
+
+  // Bitcast operand 1 to i32.
+  if (SrcVT == MVT::f64)
+    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       Tmp1).getValue(1);
+  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+  // Or in the signbit with integer operations.
+  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
+  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+  if (VT == MVT::f32) {
+    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+  }
+
+  // f64: Or the high part with signbit and then combine two parts.
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     Tmp0);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+}
+
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const ARMBaseRegisterInfo &ARI =
+    *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = ARI.getFrameRegister(MF);
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", ARM::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error(Twine("Invalid register name \""
+                              + StringRef(RegName)  + "\"."));
+}
+
+// Result is 64 bit value so split into two 32 bit values and return as a
+// pair of values.
+static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // This function is only supposed to be called for i64 type destination.
+  assert(N->getValueType(0) == MVT::i64
+          && "ExpandREAD_REGISTER called for non-i64 type result.");
+
+  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
+                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                             N->getOperand(0),
+                             N->getOperand(1));
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
+                    Read.getValue(1)));
+  Results.push_back(Read.getOperand(0));
+}
+
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of bitcast, \p Op, operates on the same bank,
+/// it might be possible to combine them, such that everything stays on the
+/// vector register bank.
+/// \p return The node that would replace \p BT, if the combine
+/// is possible.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
+                                                SelectionDAG &DAG) {
+  SDValue Op = BC->getOperand(0);
+  EVT DstVT = BC->getValueType(0);
+
+  // The only vector instruction that can produce a scalar (remember,
+  // since the bitcast was about to be turned into VMOVDRR, the source
+  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
+  // Moreover, we can do this combine only if there is one use.
+  // Finally, if the destination type is not a vector, there is not
+  // much point on forcing everything on the vector bank.
+  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !Op.hasOneUse())
+    return SDValue();
+
+  // If the index is not constant, we will introduce an additional
+  // multiply that will stick.
+  // Give up in that case.
+  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Index)
+    return SDValue();
+  unsigned DstNumElt = DstVT.getVectorNumElements();
+
+  // Compute the new index.
+  const APInt &APIntIndex = Index->getAPIntValue();
+  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
+  NewIndex *= APIntIndex;
+  // Check if the new constant index fits into i32.
+  if (NewIndex.getBitWidth() > 32)
+    return SDValue();
+
+  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
+  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
+  SDLoc dl(Op);
+  SDValue ExtractSrc = Op.getOperand(0);
+  EVT VecVT = EVT::getVectorVT(
+      *DAG.getContext(), DstVT.getScalarType(),
+      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
+  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
+                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
+}
+
+/// ExpandBITCAST - If the target supports VFP, this function is called to
+/// expand a bit convert where either the source or destination type is i64 to
+/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
+/// operand type is illegal (e.g., v2f32 for a target that doesn't support
+/// vectors), since the legalizer won't know what to do with that.
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+  SDValue Op = N->getOperand(0);
+
+  // This function is only supposed to be called for i64 types, either as the
+  // source or destination of the bit convert.
+  EVT SrcVT = Op.getValueType();
+  EVT DstVT = N->getValueType(0);
+  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
+         "ExpandBITCAST called for non-i64 type");
+
+  // Turn i64->f64 into VMOVDRR.
+  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
+    // if we can combine the bitcast with its source.
+    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
+      return Val;
+
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(0, dl, MVT::i32));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(1, dl, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, DstVT,
+                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
+  }
+
+  // Turn f64->i64 into VMOVRRD.
+  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+    SDValue Cvt;
+    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
+        SrcVT.getVectorNumElements() > 1)
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32),
+                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
+    else
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32), Op);
+    // Merge the pieces into a single i64 value.
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+  }
+
+  return SDValue();
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+/// Zero vectors are used to represent vector negation and in those cases
+/// will be implemented with the NEON VNEG instruction.  However, VNEG does
+/// not support i64 elements, so sometimes the zero vectors will need to be
+/// explicitly constructed.  Regardless, use a canonical VMOV to create the
+/// zero vector.
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+  assert(VT.isVector() && "Expected a vector type");
+  // The canonical modified immediate encoding of a zero vector is....0!
+  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
+  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, dl, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                            ISD::SETGE, ARMcc, DAG, dl);
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
+                           ARMcc, CCR, CmpLo);
+
+
+  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue HiBigShift = Opc == ISD::SRA
+                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                                         DAG.getConstant(VTBits - 1, dl, VT))
+                           : DAG.getConstant(0, dl, VT);
+  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                            ISD::SETGE, ARMcc, DAG, dl);
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
+                           ARMcc, CCR, CmpHi);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, dl, MVT::i32));
+  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                            ISD::SETGE, ARMcc, DAG, dl);
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
+                           ARMcc, CCR, CmpHi);
+
+  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
+                          ISD::SETGE, ARMcc, DAG, dl);
+  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
+                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // The rounding mode is in bits 23:22 of the FPSCR.
+  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
+  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
+  // so that the shift + and get folded into a bitfield extract.
+  SDLoc dl(Op);
+  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+                              DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
+                                              MVT::i32));
+  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
+                                  DAG.getConstant(1U << 22, dl, MVT::i32));
+  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
+                              DAG.getConstant(22, dl, MVT::i32));
+  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+                     DAG.getConstant(3, dl, MVT::i32));
+}
+
+static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
+                         const ARMSubtarget *ST) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  if (VT.isVector()) {
+    assert(ST->hasNEON());
+
+    // Compute the least significant set bit: LSB = X & -X
+    SDValue X = N->getOperand(0);
+    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
+    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
+
+    EVT ElemTy = VT.getVectorElementType();
+
+    if (ElemTy == MVT::i8) {
+      // Compute with: cttz(x) = ctpop(lsb - 1)
+      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                DAG.getTargetConstant(1, dl, ElemTy));
+      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
+    }
+
+    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
+        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
+      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
+      unsigned NumBits = ElemTy.getSizeInBits();
+      SDValue WidthMinus1 =
+          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
+      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
+      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
+    }
+
+    // Compute with: cttz(x) = ctpop(lsb - 1)
+
+    // Since we can only compute the number of bits in a byte with vcnt.8, we
+    // have to gather the result with pairwise addition (vpaddl) for i16, i32,
+    // and i64.
+
+    // Compute LSB - 1.
+    SDValue Bits;
+    if (ElemTy == MVT::i64) {
+      // Load constant 0xffff'ffff'ffff'ffff to register.
+      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
+      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
+    } else {
+      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
+                                DAG.getTargetConstant(1, dl, ElemTy));
+      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
+    }
+
+    // Count #bits with vcnt.8.
+    EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+    SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
+    SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
+
+    // Gather the #bits with vpaddl (pairwise add.)
+    EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+    SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
+        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+        Cnt8);
+    if (ElemTy == MVT::i16)
+      return Cnt16;
+
+    EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+    SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
+        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+        Cnt16);
+    if (ElemTy == MVT::i32)
+      return Cnt32;
+
+    assert(ElemTy == MVT::i64);
+    SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
+        Cnt32);
+    return Cnt64;
+  }
+
+  if (!ST->hasV6T2Ops())
+    return SDValue();
+
+  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
+  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
+}
+
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated.  The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand.  We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+                       DAG.getIntPtrConstant(0, DL));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+                                    BitCounts, DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+  }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand.  The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input    = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+///            [k0 k1 k2 k3 ]
+///       N1 =+[k1 k0 k3 k2 ]
+///            [k0 k2 k1 k3 ]
+///       N2 =+[k1 k3 k0 k2 ]
+///            [k0    k2    k1    k3    ]
+/// Extended =+[k1    k3    k0    k2    ]
+///            [k0    k2    ]
+/// Extracted=+[k1    k3    ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+                       DAG.getIntPtrConstant(0, DL));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+                                    DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+  }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+          VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  if (VT.getVectorElementType() == MVT::i32)
+    return lowerCTPOP32BitElements(N, DAG);
+  else
+    return lowerCTPOP16BitElements(N, DAG);
+}
+
+static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  // Lower vector shifts on NEON to use VSHL.
+  assert(ST->hasNEON() && "unexpected vector shift");
+
+  // Left shifts translate directly to the vshiftu intrinsic.
+  if (N->getOpcode() == ISD::SHL)
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl,
+                                       MVT::i32),
+                       N->getOperand(0), N->getOperand(1));
+
+  assert((N->getOpcode() == ISD::SRA ||
+          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+  // NEON uses the same intrinsics for both left and right shifts.  For
+  // right shifts, the shift amounts are negative, so negate the vector of
+  // shift amounts.
+  EVT ShiftVT = N->getOperand(1).getValueType();
+  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+                                     getZeroVector(ShiftVT, DAG, dl),
+                                     N->getOperand(1));
+  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+                             Intrinsic::arm_neon_vshifts :
+                             Intrinsic::arm_neon_vshiftu);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(vshiftInt, dl, MVT::i32),
+                     N->getOperand(0), NegatedCount);
+}
+
+static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
+                                const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  // We can get here for a node like i32 = ISD::SHL i32, i64
+  if (VT != MVT::i64)
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+         "Unknown shift to lower!");
+
+  // We only lower SRA, SRL of 1 here, all others use generic lowering.
+  if (!isOneConstant(N->getOperand(1)))
+    return SDValue();
+
+  // If we are in thumb mode, we don't have RRX.
+  if (ST->isThumb1Only()) return SDValue();
+
+  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(0, dl, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(1, dl, MVT::i32));
+
+  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+  // captures the result into a carry flag.
+  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
+
+  // The low part is an ARMISD::RRX operand, which shifts the carry in.
+  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
+
+  // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue TmpOp0, TmpOp1;
+  bool Invert = false;
+  bool Swap = false;
+  unsigned Opc = 0;
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
+  EVT VT = Op.getValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDLoc dl(Op);
+
+  if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
+      (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
+    // Special-case integer 64-bit equality comparisons. They aren't legal,
+    // but they can be lowered with a few vector instructions.
+    unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
+    EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
+    SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
+    SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
+    SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
+                              DAG.getCondCode(ISD::SETEQ));
+    SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
+    SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
+    Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
+    if (SetCCOpcode == ISD::SETNE)
+      Merged = DAG.getNOT(dl, Merged, CmpVT);
+    Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
+    return Merged;
+  }
+
+  if (CmpVT.getVectorElementType() == MVT::i64)
+    // 64-bit comparisons are not legal in general.
+    return SDValue();
+
+  if (Op1.getValueType().isFloatingPoint()) {
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal FP comparison");
+    case ISD::SETUNE:
+    case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETOLT:
+    case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETOGT:
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETOLE:
+    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETOGE:
+    case ISD::SETGE: Opc = ARMISD::VCGE; break;
+    case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+    case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+    case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETONE:
+      // Expand this to (OLT | OGT).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
+      break;
+    case ISD::SETUO:
+      Invert = true;
+      LLVM_FALLTHROUGH;
+    case ISD::SETO:
+      // Expand this to (OLT | OGE).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
+      break;
+    }
+  } else {
+    // Integer comparisons.
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal integer comparison");
+    case ISD::SETNE:  Invert = true;
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETLT:  Swap = true;
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETLE:  Swap = true;
+    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+    case ISD::SETULT: Swap = true;
+    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+    case ISD::SETULE: Swap = true;
+    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+    }
+
+    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
+    if (Opc == ARMISD::VCEQ) {
+
+      SDValue AndOp;
+      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+        AndOp = Op0;
+      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+        AndOp = Op1;
+
+      // Ignore bitconvert.
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
+        AndOp = AndOp.getOperand(0);
+
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+        Opc = ARMISD::VTST;
+        Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
+        Invert = !Invert;
+      }
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // If one of the operands is a constant vector zero, attempt to fold the
+  // comparison to a specialized compare-against-zero form.
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+    }
+  } else {
+     Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
+  }
+
+  Result = DAG.getSExtOrTrunc(Result, dl, VT);
+
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Carry = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(3);
+  SDLoc DL(Op);
+
+  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+
+  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+  SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
+
+  SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
+  SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
+  SDValue ARMcc = DAG.getConstant(
+      IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
+                                   Cmp.getValue(1), SDValue());
+  return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
+                     CCR, Chain.getValue(1));
+}
+
+/// isNEONModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON instruction with a "modified immediate"
+/// operand (e.g., VMOV).  If so, return the encoded value.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 const SDLoc &dl, EVT &VT, bool is128Bits,
+                                 NEONModImmType type) {
+  unsigned OpCmode, Imm;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8.  However, NEON modified
+  // immediate instructions others than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK.  Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      OpCmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
+
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      OpCmode = 0xc;
+      Imm = SplatBits >> 8;
+      break;
+    }
+
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      OpCmode = 0xd;
+      Imm = SplatBits >> 16;
+      break;
+    }
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32.  A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat.  But, then the
+    // caller would also need to check and handle the change in size.
+    return SDValue();
+
+  case 64: {
+    if (type != VMOVModImm)
+      return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    uint64_t Val = 0;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Val |= BitMask;
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return SDValue();
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+
+    if (DAG.getDataLayout().isBigEndian())
+      // swap higher and lower 32 bit word
+      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+
+    // Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isNEONModifiedImm");
+  }
+
+  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
+}
+
+SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) const {
+  bool IsDouble = Op.getValueType() == MVT::f64;
+  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+  const APFloat &FPVal = CFP->getValueAPF();
+
+  // Prevent floating-point constants from using literal loads
+  // when execute-only is enabled.
+  if (ST->genExecuteOnly()) {
+    APInt INTVal = FPVal.bitcastToAPInt();
+    SDLoc DL(CFP);
+    if (IsDouble) {
+      SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+      SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+      if (!ST->isLittle())
+        std::swap(Lo, Hi);
+      return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+    } else {
+      return DAG.getConstant(INTVal, DL, MVT::i32);
+    }
+  }
+
+  if (!ST->hasVFP3())
+    return SDValue();
+
+  // Use the default (constant pool) lowering for double constants when we have
+  // an SP-only FPU
+  if (IsDouble && Subtarget->isFPOnlySP())
+    return SDValue();
+
+  // Try splatting with a VMOV.f32...
+  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
+  if (ImmVal != -1) {
+    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
+      // We have code in place to select a valid ConstantFP already, no need to
+      // do any mangling.
+      return Op;
+    }
+
+    // It's a float and we are trying to use NEON operations where
+    // possible. Lower it to a splat followed by an extract.
+    SDLoc DL(Op);
+    SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
+                                      NewVal);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
+                       DAG.getConstant(0, DL, MVT::i32));
+  }
+
+  // The rest of our options are NEON only, make sure that's allowed before
+  // proceeding..
+  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+    return SDValue();
+
+  EVT VMovVT;
+  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+  // It wouldn't really be worth bothering for doubles except for one very
+  // important value, which does happen to match: 0.0. So make sure we don't do
+  // anything stupid.
+  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+    return SDValue();
+
+  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
+                                     VMovVT, false, VMOVModImm);
+  if (NewVal != SDValue()) {
+    SDLoc DL(Op);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
+                                      NewVal);
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, DL, MVT::i32));
+  }
+
+  // Finally, try a VMVN.i32
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
+                             false, VMVNModImm);
+  if (NewVal != SDValue()) {
+    SDLoc DL(Op);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, DL, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+// check if an VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+
+static bool isVEXTMask(ArrayRef<int> M, EVT VT,
+                       bool &ReverseVEXT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ReverseVEXT = false;
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, it may still be
+    // a VEXT but the source vectors must be swapped.
+    ExpectedElt += 1;
+    if (ExpectedElt == NumElts * 2) {
+      ExpectedElt = 0;
+      ReverseVEXT = true;
+    }
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  // Adjust the index value if the source operands will be swapped.
+  if (ReverseVEXT)
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
+  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
+  // range, then 0 is placed into the resulting vector. So pretty much any mask
+  // of 8 elements can work here.
+  return VT == MVT::v8i8 && M.size() == 8;
+}
+
+// Checks whether the shuffle mask represents a vector transpose (VTRN) by
+// checking that pairs of elements in the shuffle mask represent the same index
+// in each vector, incrementing the expected index by 2 at each step.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
+//  v2={e,f,g,h}
+// WhichResult gives the offset for each element in the mask based on which
+// of the two results it belongs to.
+//
+// The transpose can be represented either as:
+// result1 = shufflevector v1, v2, result1_shuffle_mask
+// result2 = shufflevector v1, v2, result2_shuffle_mask
+// where v1/v2 and the shuffle masks have the same number of elements
+// (here WhichResult (see below) indicates which result is being checked)
+//
+// or as:
+// results = shufflevector v1, v2, shuffle_mask
+// where both results are returned in one vector and the shuffle mask has twice
+// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
+// want to check the low half and high half of the shuffle mask as if it were
+// the other case
+static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  // If the mask is twice as long as the input vector then we need to check the
+  // upper and lower parts of the mask with a matching value for WhichResult
+  // FIXME: A mask with only even values will be rejected in case the first
+  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
+  // M[0] is used to determine WhichResult
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    if (M.size() == NumElts * 2)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
+        return false;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  return true;
+}
+
+/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    if (M.size() == NumElts * 2)
+      WhichResult = i / NumElts;
+    else
+      WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
+        return false;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+//  v2={e,f,g,h}
+// Requires similar checks to that of isVTRNMask with
+// respect the how results are returned.
+static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; ++j) {
+      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+        return false;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  unsigned Half = NumElts / 2;
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += Half) {
+      unsigned Idx = WhichResult;
+      for (unsigned k = 0; k < Half; ++k) {
+        int MIdx = M[i + j + k];
+        if (MIdx >= 0 && (unsigned) MIdx != Idx)
+          return false;
+        Idx += 2;
+      }
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shufflemask represent the same index in each
+// vector incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+//  v2={e,f,g,h}
+// Requires similar checks to that of isVTRNMask with respect the how results
+// are returned.
+static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getScalarSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+        return false;
+      Idx += 1;
+    }
+  }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
+/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
+static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
+                                           unsigned &WhichResult,
+                                           bool &isV_UNDEF) {
+  isV_UNDEF = false;
+  if (isVTRNMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VTRN;
+  if (isVUZPMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VUZP;
+  if (isVZIPMask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VZIP;
+
+  isV_UNDEF = true;
+  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VTRN;
+  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VUZP;
+  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return ARMISD::VZIP;
+
+  return 0;
+}
+
+/// \return true if this is a reverse operation on an vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+      return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
+// If N is an integer constant that can be moved into a register in one
+// instruction, return an SDValue of such a constant (will become a MOV
+// instruction).  Otherwise return null.
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST, const SDLoc &dl) {
+  uint64_t Val;
+  if (!isa<ConstantSDNode>(N))
+    return SDValue();
+  Val = cast<ConstantSDNode>(N)->getZExtValue();
+
+  if (ST->isThumb1Only()) {
+    if (Val <= 255 || ~Val <= 255)
+      return DAG.getConstant(Val, dl, MVT::i32);
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
+      return DAG.getConstant(Val, dl, MVT::i32);
+  }
+  return SDValue();
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                             const ARMSubtarget *ST) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatUndef.isAllOnesValue())
+      return DAG.getUNDEF(VT);
+
+    if (SplatBitSize <= 64) {
+      // Check if an immediate VMOV works.
+      EVT VmovVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VmovVT, VT.is128BitVector(),
+                                      VMOVModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Try an immediate VMVN.
+      uint64_t NegatedImm = (~SplatBits).getZExtValue();
+      Val = isNEONModifiedImm(NegatedImm,
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VmovVT, VT.is128BitVector(),
+                                      VMVNModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
+      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
+        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
+        if (ImmVal != -1) {
+          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
+          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
+        }
+      }
+    }
+  }
+
+  // Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.isUndef())
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  unsigned EltSize = VT.getScalarSizeInBits();
+
+  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (hasDominantValue && EltSize <= 32) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are VDUPing a value that comes directly from a vector, that will
+      // cause an unnecessary move to and from a GPR, where instead we could
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      ConstantSDNode *constIndex;
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
+        // We need to create a new undef vector to use for the VDUPLANE if the
+        // size of the vector from which we get the value is different than the
+        // size of the vector that we need to create. We will insert the element
+        // such that the register coalescer will remove unnecessary copies.
+        if (VT != Value->getOperand(0).getValueType()) {
+          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+                             VT.getVectorNumElements();
+          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+                        Value, DAG.getConstant(index, dl, MVT::i32)),
+                           DAG.getConstant(index, dl, MVT::i32));
+        } else
+          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
+        }
+      }
+      return N;
+    }
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+                                  Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
+      Val = LowerBUILD_VECTOR(Val, DAG, ST);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+    if (usesOnlyOneValue) {
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (isConstant && Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    }
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
+  if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+    // If we haven't found an efficient lowering, try splitting a 128-bit vector
+    // into two 64-bit vectors; we might discover a better way to lower it.
+    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
+    EVT ExtVT = VT.getVectorElementType();
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
+    SDValue Lower =
+        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
+    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
+      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
+    SDValue Upper = DAG.getBuildVector(
+        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
+    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
+      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
+    if (Lower && Upper)
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
+  }
+
+  // Vectors with 32- or 64-bit elements can be built by directly assigning
+  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
+  // will be legalized.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i)
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.isUndef())
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
+  return SDValue();
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  struct ShuffleSourceInfo {
+    SDValue Vec;
+    unsigned MinElt;
+    unsigned MaxElt;
+
+    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+    // be compatible with the shuffle we intend to construct. As a result
+    // ShuffleVec will be some sliding window into the original Vec.
+    SDValue ShuffleVec;
+
+    // Code should guarantee that element i in Vec starts at element "WindowBase
+    // + i * WindowScale in ShuffleVec".
+    int WindowBase;
+    int WindowScale;
+
+    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+    ShuffleSourceInfo(SDValue Vec)
+        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+          WindowScale(1) {}
+  };
+
+  // First gather all vectors used as an immediate source for this BUILD_VECTOR
+  // node.
+  SmallVector<ShuffleSourceInfo, 2> Sources;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.isUndef())
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
+      // Furthermore, shuffles require a constant mask, whereas extractelts
+      // accept variable indices.
+      return SDValue();
+    }
+
+    // Add this element source to the list if it's not already there.
+    SDValue SourceVec = V.getOperand(0);
+    auto Source = find(Sources, SourceVec);
+    if (Source == Sources.end())
+      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
+
+    // Update the minimum and maximum lane number seen.
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    Source->MinElt = std::min(Source->MinElt, EltNo);
+    Source->MaxElt = std::max(Source->MaxElt, EltNo);
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // are involved.
+  if (Sources.size() > 2)
+    return SDValue();
+
+  // Find out the smallest element size among result and two sources, and use
+  // it as element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (auto &Source : Sources) {
+    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy))
+      SmallestEltTy = SrcEltTy;
+  }
+  unsigned ResMultiplier =
+      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
+  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
+
+  // If the source vector is too wide or too narrow, we may nevertheless be able
+  // to construct a compatible shuffle either by concatenating it with UNDEF or
+  // extracting a suitable range of elements.
+  for (auto &Src : Sources) {
+    EVT SrcVT = Src.ShuffleVec.getValueType();
+
+    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
+      continue;
+
+    // This stage of the search produces a source with the same element type as
+    // the original, but with a total width matching the BUILD_VECTOR output.
+    EVT EltVT = SrcVT.getVectorElementType();
+    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
+        return SDValue();
+      // We can pad out the smaller vector for free, so if it's part of a
+      // shuffle...
+      Src.ShuffleVec =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
+      continue;
+    }
+
+    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
+      return SDValue();
+
+    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (Src.MinElt >= NumSrcElts) {
+      // The extraction can just take the second half
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
+      Src.WindowBase = -NumSrcElts;
+    } else if (Src.MaxElt < NumSrcElts) {
+      // The extraction can just take the first half
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, dl, MVT::i32));
+    } else {
+      // An actual VEXT is needed
+      SDValue VEXTSrc1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, dl, MVT::i32));
+      SDValue VEXTSrc2 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
+
+      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
+                                   VEXTSrc2,
+                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
+      Src.WindowBase = -Src.MinElt;
+    }
+  }
+
+  // Another possible incompatibility occurs from the vector element types. We
+  // can fix this by bitcasting the source vectors to the same type we intend
+  // for the shuffle.
+  for (auto &Src : Sources) {
+    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+    if (SrcEltTy == SmallestEltTy)
+      continue;
+    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+    Src.WindowBase *= Src.WindowScale;
+  }
+
+  // Final sanity check before we try to actually produce a shuffle.
+  DEBUG(
+    for (auto Src : Sources)
+      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+  );
+
+  // The stars all align, our next step is to produce the mask for the shuffle.
+  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.isUndef())
+      continue;
+
+    auto Src = find(Sources, Entry.getOperand(0));
+    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+    // segment.
+    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+                               VT.getScalarSizeInBits());
+    int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+    // This source is expected to fill ResMultiplier lanes of the final shuffle,
+    // starting at the appropriate offset.
+    int *LaneMask = &Mask[i * ResMultiplier];
+
+    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+    ExtractBase += NumElts * (Src - Sources.begin());
+    for (int j = 0; j < LanesDefined; ++j)
+      LaneMask[j] = ExtractBase + j;
+  }
+
+  // Final check before we try to produce nonsense...
+  if (!isShuffleMaskLegal(Mask, ShuffleVT))
+    return SDValue();
+
+  // We can't handle more than two sources. This should have already
+  // been checked before this point.
+  assert(Sources.size() <= 2 && "Too many sources!");
+
+  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+  for (unsigned i = 0; i < Sources.size(); ++i)
+    ShuffleOps[i] = Sources[i].ShuffleVec;
+
+  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                         ShuffleOps[1], Mask);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool ReverseVEXT, isV_UNDEF;
+  unsigned Imm, WhichResult;
+
+  unsigned EltSize = VT.getScalarSizeInBits();
+  return (EltSize >= 32 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isVREVMask(M, VT, 64) ||
+          isVREVMask(M, VT, 32) ||
+          isVREVMask(M, VT, 16) ||
+          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+          isVTBLMask(M, VT) ||
+          isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
+          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      const SDLoc &dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3:
+    return DAG.getNode(ARMISD::VEXT, dl, VT,
+                       OpLHS, OpRHS,
+                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
+  case OP_VUZPL:
+  case OP_VUZPR:
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+  case OP_VZIPL:
+  case OP_VZIPR:
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+  case OP_VTRNL:
+  case OP_VTRNR:
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+  }
+}
+
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       ArrayRef<int> ShuffleMask,
+                                       SelectionDAG &DAG) {
+  // Check to see if we can use the VTBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  SmallVector<SDValue, 8> VTBLMask;
+  for (ArrayRef<int>::iterator
+         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
+    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
+
+  if (V2.getNode()->isUndef())
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
+                       DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
+
+  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+                     DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
+}
+
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue OpLHS = Op.getOperand(0);
+  EVT VT = OpLHS.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
+  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+  // extract the first 8 bytes into the top double word and the last 8 bytes
+  // into the bottom double word. The v8i16 case is similar.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, DL, MVT::i32));
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same time so that they get CSEd properly.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getScalarSizeInBits();
+  if (EltSize <= 32) {
+    if (SVN->isSplat()) {
+      int Lane = SVN->getSplatIndex();
+      // If this is undef splat, generate it via "just" vdup, if possible.
+      if (Lane == -1) Lane = 0;
+
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+      // reaches it).
+      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+          !isa<ConstantSDNode>(V1.getOperand(0))) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+          if (!V1.getOperand(i).isUndef()) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                         DAG.getConstant(Lane, dl, MVT::i32));
+    }
+
+    bool ReverseVEXT;
+    unsigned Imm;
+    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+      if (ReverseVEXT)
+        std::swap(V1, V2);
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                         DAG.getConstant(Imm, dl, MVT::i32));
+    }
+
+    if (isVREVMask(ShuffleMask, VT, 64))
+      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 32))
+      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 16))
+      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+    if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, dl, MVT::i32));
+    }
+
+    // Check for Neon shuffles that modify both input vectors in place.
+    // If both results are used, i.e., if there are two shuffles with the same
+    // source operands and with masks corresponding to both results of one of
+    // these operations, DAG memoization will ensure that a single node is
+    // used for both shuffles.
+    unsigned WhichResult;
+    bool isV_UNDEF;
+    if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+            ShuffleMask, VT, WhichResult, isV_UNDEF)) {
+      if (isV_UNDEF)
+        V2 = V1;
+      return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
+          .getValue(WhichResult);
+    }
+
+    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
+    // shuffles that produce a result larger than their operands with:
+    //   shuffle(concat(v1, undef), concat(v2, undef))
+    // ->
+    //   shuffle(concat(v1, v2), undef)
+    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
+    //
+    // This is useful in the general case, but there are special cases where
+    // native shuffles produce larger results: the two-result ops.
+    //
+    // Look through the concat when lowering them:
+    //   shuffle(concat(v1, v2), undef)
+    // ->
+    //   concat(VZIP(v1, v2):0, :1)
+    //
+    if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
+      SDValue SubV1 = V1->getOperand(0);
+      SDValue SubV2 = V1->getOperand(1);
+      EVT SubVT = SubV1.getValueType();
+
+      // We expect these to have been canonicalized to -1.
+      assert(all_of(ShuffleMask, [&](int i) {
+        return i < (int)VT.getVectorNumElements();
+      }) && "Unexpected shuffle index into UNDEF operand!");
+
+      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
+        if (isV_UNDEF)
+          SubV2 = SubV1;
+        assert((WhichResult == 0) &&
+               "In-place shuffle of concat can only have one result!");
+        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
+                                  SubV1, SubV2);
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
+                           Res.getValue(1));
+      }
+    }
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      if (ShuffleMask[i] < 0)
+        Ops.push_back(DAG.getUNDEF(EltVT));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
+                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
+                                                  dl, MVT::i32)));
+    }
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
+  if (VT == MVT::v8i8)
+    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
+      return NewOp;
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // INSERT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(2);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  return Op;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
+    SDLoc dl(Op);
+    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  }
+
+  return Op;
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+         "unexpected CONCAT_VECTORS");
+  SDLoc dl(Op);
+  SDValue Val = DAG.getUNDEF(MVT::v2f64);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  if (!Op0.isUndef())
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
+                      DAG.getIntPtrConstant(0, dl));
+  if (!Op1.isUndef())
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
+                      DAG.getIntPtrConstant(1, dl));
+  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
+}
+
+/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
+/// element has been zero/sign-extended, depending on the isSigned parameter,
+/// from an integer type half its size.
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    if (BVN->getValueType(0) != MVT::v4i32 ||
+        BVN->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+    unsigned HiElt = 1 - LoElt;
+    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
+    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
+    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
+    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
+    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
+      return false;
+    if (isSigned) {
+      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
+          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
+        return true;
+    } else {
+      if (Hi0->isNullValue() && Hi1->isNullValue())
+        return true;
+    }
+    return false;
+  }
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getScalarSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/// isSignExtended - Check if a node is a vector value that is sign-extended
+/// or a constant BUILD_VECTOR with sign-extended elements.
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+/// isZeroExtended - Check if a node is a vector value that is zero-extended
+/// or a constant BUILD_VECTOR with zero-extended elements.
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                            const EVT &OrigTy,
+                                            const EVT &ExtTy,
+                                            unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+  // The load already has the right type.
+  if (ExtendedTy == LD->getMemoryVT())
+    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
+                       LD->getBasePtr(), LD->getPointerInfo(),
+                       LD->getAlignment(), LD->getMemOperand()->getFlags());
+
+  // We need to create a zextload/sextload. We cannot just create a load
+  // followed by a zext/zext node because LowerMUL is also run during normal
+  // operation legalization where we can't create illegal types.
+  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
+                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+                        LD->getMemoryVT(), LD->getAlignment(),
+                        LD->getMemOperand()->getFlags());
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add a an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
+  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
+  // have been legalized as a BITCAST from v4i32.
+  if (N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
+           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+    return DAG.getBuildVector(
+        MVT::v2i32, SDLoc(N),
+        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
+  }
+  // Construct a new BUILD_VECTOR with elements truncated to half the size.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getScalarSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  SDLoc dl(N);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
+  }
+  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = ARMISD::VMULLs;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = ARMISD::VMULLu;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLs;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a VMULL instruction.
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = SkipExtensionForVMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
+static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
+                              SelectionDAG &DAG) {
+  // TODO: Should this propagate fast-math-flags?
+
+  // Convert to float
+  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+  // Get reciprocal estimate.
+  // float4 recip = vrecpeq_f32(yf);
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   Y);
+  // Because char has a smaller range than uchar, we can actually get away
+  // without any newton steps.  This requires that we use a weird bias
+  // of 0xb000, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
+  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+  // Convert back to short.
+  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+  return X;
+}
+
+static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
+                               SelectionDAG &DAG) {
+  // TODO: Should this propagate fast-math-flags?
+
+  SDValue N2;
+  // Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and one refinement step.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Because short has a smaller range than ushort, we can actually get away
+  // with only a single newton step.  This requires that we use a weird bias
+  // of 89, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4, dl));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4, dl));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0, dl));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0, dl));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  // TODO: Should this propagate fast-math-flags?
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4, dl));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4, dl));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0, dl));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0, dl));
+
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
+                                     MVT::i32),
+                     N0);
+    return N0;
+  }
+
+  // v4i16 sdiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
+                   BN1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Simply multiplying by the reciprocal estimate can leave us a few ulps
+  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+  // and that it will never cause us to return an answer too large).
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(2, dl, MVT::v4i32);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_u32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getValueType(0);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = ARMISD::ADDC; break;
+  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
+  case ISD::SUBC: Opc = ARMISD::SUBC; break;
+  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin());
+
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // return values are passed via sret.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Pair of floats / doubles used to pass the result.
+  Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+  auto &DL = DAG.getDataLayout();
+
+  ArgListTy Args;
+  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
+  SDValue SRet;
+  if (ShouldUseSRet) {
+    // Create stack object for sret.
+    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
+    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
+    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
+    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
+
+    ArgListEntry Entry;
+    Entry.Node = SRet;
+    Entry.Ty = RetTy->getPointerTo();
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Entry.isSRet = true;
+    Args.push_back(Entry);
+    RetTy = Type::getVoidTy(*DAG.getContext());
+  }
+
+  ArgListEntry Entry;
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName =
+      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+  RTLIB::Libcall LC =
+      (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32;
+  CallingConv::ID CC = getLibcallCallingConv(LC);
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setCallee(CC, RetTy, Callee, std::move(Args))
+      .setDiscardResult(ShouldUseSRet);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  if (!ShouldUseSRet)
+    return CallResult.first;
+
+  SDValue LoadSin =
+      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
+
+  // Address of cos field.
+  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
+  SDValue LoadCos =
+      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
+
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                     LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
+SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
+                                                  bool Signed,
+                                                  SDValue &Chain) const {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  const auto &DL = DAG.getDataLayout();
+  const auto &TLI = DAG.getTargetLoweringInfo();
+
+  const char *Name = nullptr;
+  if (Signed)
+    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
+  else
+    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
+
+  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+
+  ARMTargetLowering::ArgListTy Args;
+
+  for (auto AI : {1, 0}) {
+    ArgListEntry Arg;
+    Arg.Node = Op.getOperand(AI);
+    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Args.push_back(Arg);
+  }
+
+  CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+    .setChain(Chain)
+    .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
+               ES, std::move(Args));
+
+  return LowerCallTo(CLI).first;
+}
+
+SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
+                                            bool Signed) const {
+  assert(Op.getValueType() == MVT::i32 &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
+                               DAG.getEntryNode(), Op.getOperand(1));
+
+  return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+}
+
+static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(1);
+  if (N->getValueType(0) == MVT::i32)
+    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
+                           DAG.getConstant(1, DL, MVT::i32));
+  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
+                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
+}
+
+void ARMTargetLowering::ExpandDIV_Windows(
+    SDValue Op, SelectionDAG &DAG, bool Signed,
+    SmallVectorImpl<SDValue> &Results) const {
+  const auto &DL = DAG.getDataLayout();
+  const auto &TLI = DAG.getTargetLoweringInfo();
+
+  assert(Op.getValueType() == MVT::i64 &&
+         "unexpected type for custom lowering DIV");
+  SDLoc dl(Op);
+
+  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
+
+  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
+
+  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
+  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
+                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
+
+  Results.push_back(Lower);
+  Results.push_back(Upper);
+}
+
+static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+    // Acquire/Release load/store is not legal for targets without a dmb or
+    // equivalent available.
+    return SDValue();
+
+  // Monotonic load/store is legal for all targets.
+  return Op;
+}
+
+static void ReplaceREADCYCLECOUNTER(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) {
+  SDLoc DL(N);
+  // Under Power Management extensions, the cycle-count is:
+  //    mrc p15, #0, <Rt>, c9, c13, #0
+  SDValue Ops[] = { N->getOperand(0), // Chain
+                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
+                    DAG.getConstant(15, DL, MVT::i32),
+                    DAG.getConstant(0, DL, MVT::i32),
+                    DAG.getConstant(9, DL, MVT::i32),
+                    DAG.getConstant(13, DL, MVT::i32),
+                    DAG.getConstant(0, DL, MVT::i32)
+  };
+
+  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                                 DAG.getVTList(MVT::i32, MVT::Other), Ops);
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
+                                DAG.getConstant(0, DL, MVT::i32)));
+  Results.push_back(Cycles32.getValue(1));
+}
+
+static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
+  SDLoc dl(V.getNode());
+  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
+  SDValue VHi = DAG.getAnyExtOrTrunc(
+      DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
+      dl, MVT::i32);
+  SDValue RegClass =
+      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
+  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
+  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
+  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
+}
+
+static void ReplaceCMP_SWAP_64Results(SDNode *N,
+                                       SmallVectorImpl<SDValue> & Results,
+                                       SelectionDAG &DAG) {
+  assert(N->getValueType(0) == MVT::i64 &&
+         "AtomicCmpSwap on types less than 64 should be legal");
+  SDValue Ops[] = {N->getOperand(1),
+                   createGPRPairNode(DAG, N->getOperand(2)),
+                   createGPRPairNode(DAG, N->getOperand(3)),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = DAG.getMachineNode(
+      ARM::CMP_SWAP_64, SDLoc(N),
+      DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
+                                               SDValue(CmpSwap, 0)));
+  Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
+                                               SDValue(CmpSwap, 0)));
+  Results.push_back(SDValue(CmpSwap, 2));
+}
+
+static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
+
+  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
+         "Custom lowering is MSVCRT specific!");
+
+  SDLoc dl(Op);
+  SDValue Val = Op.getOperand(0);
+  MVT Ty = Val->getSimpleValueType(0);
+  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
+  SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
+                                         TLI.getPointerTy(DAG.getDataLayout()));
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Node = Val;
+  Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
+  Entry.isZExt = true;
+  Args.push_back(Entry);
+
+  Entry.Node = Exponent;
+  Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
+  Entry.isZExt = true;
+  Args.push_back(Entry);
+
+  Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
+
+  // In the in-chain to the call is the entry node  If we are emitting a
+  // tailcall, the chain will be mutated if the node has a non-entry input
+  // chain.
+  SDValue InChain = DAG.getEntryNode();
+  SDValue TCChain = InChain;
+
+  const auto *F = DAG.getMachineFunction().getFunction();
+  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
+              F->getReturnType() == LCRTy;
+  if (IsTC)
+    InChain = TCChain;
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(InChain)
+      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
+      .setTailCall(IsTC);
+  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
+
+  // Return the chain (the DAG root) if it is a tail call
+  return !CI.second.getNode() ? DAG.getRoot() : CI.first;
+}
+
+SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
+  case ISD::ConstantPool:
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+    return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:
+    switch (Subtarget->getTargetTriple().getObjectFormat()) {
+    default: llvm_unreachable("unknown object format");
+    case Triple::COFF:
+      return LowerGlobalAddressWindows(Op, DAG);
+    case Triple::ELF:
+      return LowerGlobalAddressELF(Op, DAG);
+    case Triple::MachO:
+      return LowerGlobalAddressDarwin(Op, DAG);
+    }
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SELECT:        return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
+  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
+  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::VASTART:       return LowerVASTART(Op, DAG);
+  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
+  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
+                                                               Subtarget);
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
+  case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
+  case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
+  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
+  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
+  case ISD::SETCCE:        return LowerSETCCE(Op, DAG);
+  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
+  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::MUL:           return LowerMUL(Op, DAG);
+  case ISD::SDIV:
+    if (Subtarget->isTargetWindows())
+      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
+    return LowerSDIV(Op, DAG);
+  case ISD::UDIV:
+    if (Subtarget->isTargetWindows())
+      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
+    return LowerUDIV(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+    return LowerXALUO(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+      return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+  case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
+  case ARMISD::WIN__DBZCHK: return SDValue();
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) const {
+  SDValue Res;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::READ_REGISTER:
+    ExpandREAD_REGISTER(N, Results, DAG);
+    break;
+  case ISD::BITCAST:
+    Res = ExpandBITCAST(N, DAG);
+    break;
+  case ISD::SRL:
+  case ISD::SRA:
+    Res = Expand64BitShift(N, DAG, Subtarget);
+    break;
+  case ISD::SREM:
+  case ISD::UREM:
+    Res = LowerREM(N, DAG);
+    break;
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    Res = LowerDivRem(SDValue(N, 0), DAG);
+    assert(Res.getNumOperands() == 2 && "DivRem needs two values");
+    Results.push_back(Res.getValue(0));
+    Results.push_back(Res.getValue(1));
+    return;
+  case ISD::READCYCLECOUNTER:
+    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
+    return;
+  case ISD::UDIV:
+  case ISD::SDIV:
+    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
+    return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
+                             Results);
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceCMP_SWAP_64Results(N, Results, DAG);
+    return;
+  }
+  if (Res.getNode())
+    Results.push_back(Res);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+/// registers the function context.
+void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+                                               MachineBasicBlock *MBB,
+                                               MachineBasicBlock *DispatchBB,
+                                               int FI) const {
+  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
+         "ROPI/RWPI not currently supported with SjLj");
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineConstantPool *MCP = MF->getConstantPool();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  const Function *F = MF->getFunction();
+
+  bool isThumb = Subtarget->isThumb();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
+                                           : &ARM::GPRRegClass;
+
+  // Grab constant pool and fixed stack memory operands.
+  MachineMemOperand *CPMMO =
+      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+                               MachineMemOperand::MOLoad, 4, 4);
+
+  MachineMemOperand *FIMMOSt =
+      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
+                               MachineMemOperand::MOStore, 4, 4);
+
+  // Load the address of the dispatch MBB into the jump buffer.
+  if (isThumb2) {
+    // Incoming value: jbuf
+    //   ldr.n  r5, LCPI1_1
+    //   orr    r5, r5, #1
+    //   add    r5, pc
+    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+                     .addReg(NewVReg1, RegState::Kill)
+                     .addImm(0x01)));
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+      .addReg(NewVReg2, RegState::Kill)
+      .addImm(PCLabelId);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+                   .addReg(NewVReg3, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  } else if (isThumb) {
+    // Incoming value: jbuf
+    //   ldr.n  r1, LCPI1_4
+    //   add    r1, pc
+    //   mov    r2, #1
+    //   orrs   r1, r2
+    //   add    r2, $jbuf, #+4 ; &jbuf[1]
+    //   str    r1, [r2]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+      .addReg(NewVReg1, RegState::Kill)
+      .addImm(PCLabelId);
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addImm(1));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3, RegState::Kill));
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
+            .addFrameIndex(FI)
+            .addImm(36); // &jbuf[1] :: pc
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addReg(NewVReg5, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(FIMMOSt));
+  } else {
+    // Incoming value: jbuf
+    //   ldr  r1, LCPI1_1
+    //   add  r1, pc, r1
+    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addImm(0)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+                   .addReg(NewVReg1, RegState::Kill)
+                   .addImm(PCLabelId));
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  }
+}
+
+void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+                                              MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  int FI = MFI.getFunctionContextIndex();
+
+  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
+                                                        : &ARM::GPRnopcRegClass;
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
+       ++BB) {
+    if (!BB->isEHPad()) continue;
+
+    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+    // pad.
+    for (MachineBasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+      if (!II->isEHLabel()) continue;
+
+      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
+      if (!MF->hasCallSiteLandingPad(Sym)) continue;
+
+      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
+      for (SmallVectorImpl<unsigned>::iterator
+             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
+           CSI != CSE; ++CSI) {
+        CallSiteNumToLPad[*CSI].push_back(&*BB);
+        MaxCSNum = std::max(MaxCSNum, *CSI);
+      }
+      break;
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock*> LPadList;
+  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+  for (unsigned I = 1; I <= MaxCSNum; ++I) {
+    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+    for (SmallVectorImpl<MachineBasicBlock*>::iterator
+           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+      LPadList.push_back(*II);
+      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the jump table and associated information.
+  MachineJumpTableInfo *JTI =
+    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsEHPad();
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  unsigned trap_opcode;
+  if (Subtarget->isThumb())
+    trap_opcode = ARM::tTRAP;
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+  BuildMI(TrapBB, dl, TII->get(trap_opcode));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert and MBBs.
+  MF->insert(MF->end(), DispatchBB);
+  MF->insert(MF->end(), DispContBB);
+  MF->insert(MF->end(), TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI),
+      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers.  This results in all
+  // registers being marked as clobbered. This can't work if the dispatch block
+  // is in a Thumb1 function and is linked with ARM code which uses the FP
+  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
+  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
+
+  bool IsPositionIndependent = isPositionIndependent();
+  unsigned NumLPads = LPadList.size();
+  if (Subtarget->isThumb2()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(LPadList.size()));
+    } else {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
+                   .addJumpTableIndex(MJTI));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(
+        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+        .addReg(NewVReg3, RegState::Kill)
+        .addReg(NewVReg1)
+        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+      .addReg(NewVReg4, RegState::Kill)
+      .addReg(NewVReg1)
+      .addJumpTableIndex(MJTI);
+  } else if (Subtarget->isThumb()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(1)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg1)
+                   .addImm(2));
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+                   .addJumpTableIndex(MJTI));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3));
+
+    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(JTMMOLd));
+
+    unsigned NewVReg6 = NewVReg5;
+    if (IsPositionIndependent) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
+
+    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+      .addReg(NewVReg6, RegState::Kill)
+      .addJumpTableIndex(MJTI);
+  } else {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx)
+                     .addImm(0));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1, RegState::Kill));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+                     .addReg(NewVReg1)
+                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+                   .addJumpTableIndex(MJTI));
+
+    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
+        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(
+      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+      .addReg(NewVReg3, RegState::Kill)
+      .addReg(NewVReg4)
+      .addImm(0)
+      .addMemOperand(JTMMOLd));
+
+    if (IsPositionIndependent) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI);
+    }
+  }
+
+  // Add the jump table entries as successors to the MBB.
+  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
+  for (std::vector<MachineBasicBlock*>::iterator
+         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
+    MachineBasicBlock *CurMBB = *I;
+    if (SeenMBBs.insert(CurMBB).second)
+      DispContBB->addSuccessor(CurMBB);
+  }
+
+  // N.B. the order the invoke BBs are processed in doesn't matter here.
+  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
+  SmallVector<MachineBasicBlock*, 64> MBBLPads;
+  for (MachineBasicBlock *BB : InvokeBBs) {
+
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
+                                                  BB->succ_end());
+    while (!Successors.empty()) {
+      MachineBasicBlock *SMBB = Successors.pop_back_val();
+      if (SMBB->isEHPad()) {
+        BB->removeSuccessor(SMBB);
+        MBBLPads.push_back(SMBB);
+      }
+    }
+
+    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
+    BB->normalizeSuccProbs();
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (MachineBasicBlock::reverse_iterator
+           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+      if (!II->isCall()) continue;
+
+      DenseMap<unsigned, bool> DefRegs;
+      for (MachineInstr::mop_iterator
+             OI = II->operands_begin(), OE = II->operands_end();
+           OI != OE; ++OI) {
+        if (!OI->isReg()) continue;
+        DefRegs[OI->getReg()] = true;
+      }
+
+      MachineInstrBuilder MIB(*MF, &*II);
+
+      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+        unsigned Reg = SavedRegs[i];
+        if (Subtarget->isThumb2() &&
+            !ARM::tGPRRegClass.contains(Reg) &&
+            !ARM::hGPRRegClass.contains(Reg))
+          continue;
+        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
+          continue;
+        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
+          continue;
+        if (!DefRegs[Reg])
+          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      break;
+    }
+  }
+
+  // Mark all former landing pads as non-landing pads. The dispatch is the only
+  // landing pad now.
+  for (SmallVectorImpl<MachineBasicBlock*>::iterator
+         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
+    (*I)->setIsEHPad(false);
+
+  // The instruction is gone now.
+  MI.eraseFromParent();
+}
+
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I != Succ)
+      return *I;
+  llvm_unreachable("Expecting a BB with two successors!");
+}
+
+/// Return the load opcode for a given load size. If load size >= 8,
+/// neon opcode will be returned.
+static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+  if (LdSize >= 8)
+    return LdSize == 16 ? ARM::VLD1q32wb_fixed
+                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
+  if (IsThumb1)
+    return LdSize == 4 ? ARM::tLDRi
+                       : LdSize == 2 ? ARM::tLDRHi
+                                     : LdSize == 1 ? ARM::tLDRBi : 0;
+  if (IsThumb2)
+    return LdSize == 4 ? ARM::t2LDR_POST
+                       : LdSize == 2 ? ARM::t2LDRH_POST
+                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
+  return LdSize == 4 ? ARM::LDR_POST_IMM
+                     : LdSize == 2 ? ARM::LDRH_POST
+                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
+}
+
+/// Return the store opcode for a given store size. If store size >= 8,
+/// neon opcode will be returned.
+static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+  if (StSize >= 8)
+    return StSize == 16 ? ARM::VST1q32wb_fixed
+                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
+  if (IsThumb1)
+    return StSize == 4 ? ARM::tSTRi
+                       : StSize == 2 ? ARM::tSTRHi
+                                     : StSize == 1 ? ARM::tSTRBi : 0;
+  if (IsThumb2)
+    return StSize == 4 ? ARM::t2STR_POST
+                       : StSize == 2 ? ARM::t2STRH_POST
+                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
+  return StSize == 4 ? ARM::STR_POST_IMM
+                     : StSize == 2 ? ARM::STRH_POST
+                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
+}
+
+/// Emit a post-increment load operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+                       const TargetInstrInfo *TII, const DebugLoc &dl,
+                       unsigned LdSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+  assert(LdOpc != 0 && "Should have a load opcode");
+  if (LdSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(0));
+  } else if (IsThumb1) {
+    // load + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(LdSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(LdSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addReg(0).addImm(LdSize));
+  }
+}
+
+/// Emit a post-increment store operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
+                       const TargetInstrInfo *TII, const DebugLoc &dl,
+                       unsigned StSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+  assert(StOpc != 0 && "Should have a store opcode");
+  if (StSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(AddrIn).addImm(0).addReg(Data));
+  } else if (IsThumb1) {
+    // store + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(StSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addImm(StSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addReg(0)
+                       .addImm(StSize));
+  }
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const {
+  // This pseudo instruction has 3 operands: dst, src, size
+  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+  // Otherwise, we will generate unrolled scalar copies.
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  unsigned dest = MI.getOperand(0).getReg();
+  unsigned src = MI.getOperand(1).getReg();
+  unsigned SizeVal = MI.getOperand(2).getImm();
+  unsigned Align = MI.getOperand(3).getImm();
+  DebugLoc dl = MI.getDebugLoc();
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UnitSize = 0;
+  const TargetRegisterClass *TRC = nullptr;
+  const TargetRegisterClass *VecTRC = nullptr;
+
+  bool IsThumb1 = Subtarget->isThumb1Only();
+  bool IsThumb2 = Subtarget->isThumb2();
+  bool IsThumb = Subtarget->isThumb();
+
+  if (Align & 1) {
+    UnitSize = 1;
+  } else if (Align & 2) {
+    UnitSize = 2;
+  } else {
+    // Check whether we can use NEON instructions.
+    if (!MF->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+        Subtarget->hasNEON()) {
+      if ((Align % 16 == 0) && SizeVal >= 16)
+        UnitSize = 16;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
+        UnitSize = 8;
+    }
+    // Can't use NEON instructions.
+    if (UnitSize == 0)
+      UnitSize = 4;
+  }
+
+  // Select the correct opcode and register class for unit size load/store
+  bool IsNeon = UnitSize >= 8;
+  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+  if (IsNeon)
+    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
+                            : UnitSize == 8 ? &ARM::DPRRegClass
+                                            : nullptr;
+
+  unsigned BytesLeft = SizeVal % UnitSize;
+  unsigned LoopSize = SizeVal - BytesLeft;
+
+  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+    // Use LDR and STR to copy.
+    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+    // [destOut] = STR_POST(scratch, destIn, UnitSize)
+    unsigned srcIn = src;
+    unsigned destIn = dest;
+    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+
+    // Handle the leftover bytes with LDRB and STRB.
+    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+    // [destOut] = STRB_POST(scratch, destIn, 1)
+    for (unsigned i = 0; i < BytesLeft; i++) {
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+    MI.eraseFromParent(); // The instruction is gone now.
+    return BB;
+  }
+
+  // Expand the pseudo op to a loop.
+  // thisMBB:
+  //   ...
+  //   movw varEnd, # --> with thumb2
+  //   movt varEnd, #
+  //   ldrcp varEnd, idx --> without thumb2
+  //   fallthrough --> loopMBB
+  // loopMBB:
+  //   PHI varPhi, varEnd, varLoop
+  //   PHI srcPhi, src, srcLoop
+  //   PHI destPhi, dst, destLoop
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  //   subs varLoop, varPhi, #UnitSize
+  //   bne loopMBB
+  //   fallthrough --> exitMBB
+  // exitMBB:
+  //   epilogue to handle left-over bytes
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Load an immediate to varEnd.
+  unsigned varEnd = MRI.createVirtualRegister(TRC);
+  if (Subtarget->useMovt(*MF)) {
+    unsigned Vtmp = varEnd;
+    if ((LoopSize & 0xFFFF0000) != 0)
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
+                           Vtmp).addImm(LoopSize & 0xFFFF));
+
+    if ((LoopSize & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
+                             varEnd)
+                         .addReg(Vtmp)
+                         .addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = MF->getDataLayout().getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    if (IsThumb)
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx));
+    else
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
+  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+             IsThumb1, IsThumb2);
+  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+             IsThumb1, IsThumb2);
+
+  // Decrement loop variable by UnitSize.
+  if (IsThumb1) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(varPhi).addImm(UnitSize);
+    AddDefaultPred(MIB);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl,
+                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+  BuildMI(*BB, BB->end(), dl,
+          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
+  BB = exitMBB;
+  auto StartOfExit = exitMBB->begin();
+
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  unsigned srcIn = srcLoop;
+  unsigned destIn = destLoop;
+  for (unsigned i = 0; i < BytesLeft; i++) {
+    unsigned srcOut = MRI.createVirtualRegister(TRC);
+    unsigned destOut = MRI.createVirtualRegister(TRC);
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+               IsThumb1, IsThumb2);
+    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+               IsThumb1, IsThumb2);
+    srcIn = srcOut;
+    destIn = destOut;
+  }
+
+  MI.eraseFromParent(); // The instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetMachine &TM = getTargetMachine();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(Subtarget->isTargetWindows() &&
+         "__chkstk is only supported on Windows");
+  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
+
+  // __chkstk takes the number of words to allocate on the stack in R4, and
+  // returns the stack adjustment in number of bytes in R4.  This will not
+  // clober any other registers (other than the obvious lr).
+  //
+  // Although, technically, IP should be considered a register which may be
+  // clobbered, the call itself will not touch it.  Windows on ARM is a pure
+  // thumb-2 environment, so there is no interworking required.  As a result, we
+  // do not expect a veneer to be emitted by the linker, clobbering IP.
+  //
+  // Each module receives its own copy of __chkstk, so no import thunk is
+  // required, again, ensuring that IP is not clobbered.
+  //
+  // Finally, although some linkers may theoretically provide a trampoline for
+  // out of range calls (which is quite common due to a 32M range limitation of
+  // branches for Thumb), we can generate the long-call version via
+  // -mcmodel=large, alleviating the need for the trampoline which may clobber
+  // IP.
+
+  switch (TM.getCodeModel()) {
+  case CodeModel::Small:
+  case CodeModel::Medium:
+  case CodeModel::Default:
+  case CodeModel::Kernel:
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
+      .addImm((unsigned)ARMCC::AL).addReg(0)
+      .addExternalSymbol("__chkstk")
+      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+    break;
+  case CodeModel::Large:
+  case CodeModel::JITDefault: {
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+
+    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
+      .addExternalSymbol("__chkstk");
+    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
+      .addImm((unsigned)ARMCC::AL).addReg(0)
+      .addReg(Reg, RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+      .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+      .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+    break;
+  }
+  }
+
+  AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
+                                      ARM::SP)
+                         .addReg(ARM::SP, RegState::Kill)
+                         .addReg(ARM::R4, RegState::Kill)
+                         .setMIFlags(MachineInstr::FrameSetup)));
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
+  MF->insert(++MBB->getIterator(), ContBB);
+  ContBB->splice(ContBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(ContBB);
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
+  MF->push_back(TrapBB);
+  MBB->addSuccessor(TrapBB);
+
+  AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
+                     .addReg(MI.getOperand(0).getReg())
+                     .addImm(0));
+  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::EQ)
+      .addReg(ARM::CPSR);
+
+  MI.eraseFromParent();
+  return ContBB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+  switch (MI.getOpcode()) {
+  default: {
+    MI.dump();
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+
+  // Thumb1 post-indexed loads are really just single-register LDMs.
+  case ARM::tLDR_postidx: {
+    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
+      .addOperand(MI.getOperand(1)) // Rn_wb
+      .addOperand(MI.getOperand(2)) // Rn
+      .addOperand(MI.getOperand(3)) // PredImm
+      .addOperand(MI.getOperand(4)) // PredReg
+      .addOperand(MI.getOperand(0)); // Rt
+    MI.eraseFromParent();
+    return BB;
+  }
+
+  // The Thumb2 pre-indexed stores have the same MI operands, they just
+  // define them differently in the .td files from the isel patterns, so
+  // they need pseudos.
+  case ARM::t2STR_preidx:
+    MI.setDesc(TII->get(ARM::t2STR_PRE));
+    return BB;
+  case ARM::t2STRB_preidx:
+    MI.setDesc(TII->get(ARM::t2STRB_PRE));
+    return BB;
+  case ARM::t2STRH_preidx:
+    MI.setDesc(TII->get(ARM::t2STRH_PRE));
+    return BB;
+
+  case ARM::STRi_preidx:
+  case ARM::STRBi_preidx: {
+    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
+                                                         : ARM::STRB_PRE_IMM;
+    // Decode the offset.
+    unsigned Offset = MI.getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+    Offset = ARM_AM::getAM2Offset(Offset);
+    if (isSub)
+      Offset = -Offset;
+
+    MachineMemOperand *MMO = *MI.memoperands_begin();
+    BuildMI(*BB, MI, dl, TII->get(NewOpc))
+        .addOperand(MI.getOperand(0)) // Rn_wb
+        .addOperand(MI.getOperand(1)) // Rt
+        .addOperand(MI.getOperand(2)) // Rn
+        .addImm(Offset)               // offset (skip GPR==zero_reg)
+        .addOperand(MI.getOperand(5)) // pred
+        .addOperand(MI.getOperand(6))
+        .addMemOperand(MMO);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case ARM::STRr_preidx:
+  case ARM::STRBr_preidx:
+  case ARM::STRH_preidx: {
+    unsigned NewOpc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+    }
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+    for (unsigned i = 0; i < MI.getNumOperands(); ++i)
+      MIB.addOperand(MI.getOperand(i));
+    MI.eraseFromParent();
+    return BB;
+  }
+
+  case ARM::tMOVCCr_pseudo: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator It = ++BB->getIterator();
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB  = BB;
+    MachineFunction *F = BB->getParent();
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    BuildMI(BB, dl, TII->get(ARM::tBcc))
+        .addMBB(sinkMBB)
+        .addImm(MI.getOperand(3).getImm())
+        .addReg(MI.getOperand(4).getReg());
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
+        .addReg(MI.getOperand(1).getReg())
+        .addMBB(copy0MBB)
+        .addReg(MI.getOperand(2).getReg())
+        .addMBB(thisMBB);
+
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::BCCi64:
+  case ARM::BCCZi64: {
+    // If there is an unconditional branch to the other successor, remove it.
+    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
+
+    // Compare both parts that make up the double comparison separately for
+    // equality.
+    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
+
+    unsigned LHS1 = MI.getOperand(1).getReg();
+    unsigned LHS2 = MI.getOperand(2).getReg();
+    if (RHSisZero) {
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                     .addReg(LHS1).addImm(0));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+        .addReg(LHS2).addImm(0)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    } else {
+      unsigned RHS1 = MI.getOperand(3).getReg();
+      unsigned RHS2 = MI.getOperand(4).getReg();
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                     .addReg(LHS1).addReg(RHS1));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+        .addReg(LHS2).addReg(RHS2)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    }
+
+    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
+    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
+    if (MI.getOperand(0).getImm() == ARMCC::NE)
+      std::swap(destMBB, exitMBB);
+
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    if (isThumb2)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+    else
+      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
+
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    return BB;
+
+  case ARM::Int_eh_sjlj_setup_dispatch:
+    EmitSjLjDispatchBlock(MI, BB);
+    return BB;
+
+  case ARM::ABS:
+  case ARM::t2ABS: {
+    // To insert an ABS instruction, we have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // source vreg to test against 0, the destination vreg to set,
+    // the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    // It transforms
+    //     V1 = ABS V0
+    // into
+    //     V2 = MOVS V0
+    //     BCC                      (branch to SinkBB if V0 >= 0)
+    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
+    //     SinkBB: V1 = PHI(V2, V3)
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator BBI = ++BB->getIterator();
+    MachineFunction *Fn = BB->getParent();
+    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
+    Fn->insert(BBI, RSBBB);
+    Fn->insert(BBI, SinkBB);
+
+    unsigned int ABSSrcReg = MI.getOperand(1).getReg();
+    unsigned int ABSDstReg = MI.getOperand(0).getReg();
+    bool ABSSrcKIll = MI.getOperand(1).isKill();
+    bool isThumb2 = Subtarget->isThumb2();
+    MachineRegisterInfo &MRI = Fn->getRegInfo();
+    // In Thumb mode S must not be specified if source register is the SP or
+    // PC and if destination register is the SP, so restrict register class
+    unsigned NewRsbDstReg =
+      MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    SinkBB->splice(SinkBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(RSBBB);
+    BB->addSuccessor(SinkBB);
+
+    // fall through to SinkMBB
+    RSBBB->addSuccessor(SinkBB);
+
+    // insert a cmp at the end of BB
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                   .addReg(ABSSrcReg).addImm(0));
+
+    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+    BuildMI(BB, dl,
+      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+    // insert rsbri in RSBBB
+    // Note: BCC and rsbri will be converted into predicated rsbmi
+    // by if-conversion pass
+    BuildMI(*RSBBB, RSBBB->begin(), dl,
+      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+      .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
+      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+    // insert PHI in SinkBB,
+    // reuse ABSDstReg to not change uses of ABS instruction
+    BuildMI(*SinkBB, SinkBB->begin(), dl,
+      TII->get(ARM::PHI), ABSDstReg)
+      .addReg(NewRsbDstReg).addMBB(RSBBB)
+      .addReg(ABSSrcReg).addMBB(BB);
+
+    // remove ABS instruction
+    MI.eraseFromParent();
+
+    // return last added BB
+    return SinkBB;
+  }
+  case ARM::COPY_STRUCT_BYVAL_I32:
+    ++NumLoopByVals;
+    return EmitStructByval(MI, BB);
+  case ARM::WIN__CHKSTK:
+    return EmitLowered__chkstk(MI, BB);
+  case ARM::WIN__DBZCHK:
+    return EmitLowered__dbzchk(MI, BB);
+  }
+}
+
+/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// when it is expanded into LDM/STM. This is done as a post-isel lowering
+/// instead of as a custom inserter because we need the use list from the SDNode.
+static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
+                                    MachineInstr &MI, const SDNode *Node) {
+  bool isThumb1 = Subtarget->isThumb1Only();
+
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB(*MF, MI);
+
+  // If the new dst/src is unused mark it as dead.
+  if (!Node->hasAnyUseOfValue(0)) {
+    MI.getOperand(0).setIsDead(true);
+  }
+  if (!Node->hasAnyUseOfValue(1)) {
+    MI.getOperand(1).setIsDead(true);
+  }
+
+  // The MEMCPY both defines and kills the scratch registers.
+  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
+    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
+                                                         : &ARM::GPRRegClass);
+    MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
+  }
+}
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                                      SDNode *Node) const {
+  if (MI.getOpcode() == ARM::MEMCPY) {
+    attachMEMCPYScratchRegs(Subtarget, MI, Node);
+    return;
+  }
+
+  const MCInstrDesc *MCID = &MI.getDesc();
+  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
+  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+  // operand is still set to noreg. If needed, set the optional operand's
+  // register to CPSR, and remove the redundant implicit def.
+  //
+  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+  // Rename pseudo opcodes.
+  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
+  if (NewOpc) {
+    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
+    MCID = &TII->get(NewOpc);
+
+    assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
+           "converted opcode should be the same except for cc_out");
+
+    MI.setDesc(*MCID);
+
+    // Add the optional cc_out operand
+    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+  }
+  unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+  // Any ARM instruction that sets the 's' bit should specify an optional
+  // "cc_out" operand in the last operand position.
+  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+  // since we already have an optional CPSR def.
+  bool definesCPSR = false;
+  bool deadCPSR = false;
+  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
+       ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+      definesCPSR = true;
+      if (MO.isDead())
+        deadCPSR = true;
+      MI.RemoveOperand(i);
+      break;
+    }
+  }
+  if (!definesCPSR) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+  if (deadCPSR) {
+    assert(!MI.getOperand(ccOutIdx).getReg() &&
+           "expect uninitialized optional cc_out operand");
+    return;
+  }
+
+  // If this instruction was defined with an optional CPSR def and its dag node
+  // had a live implicit CPSR def, then activate the optional CPSR def.
+  MachineOperand &MO = MI.getOperand(ccOutIdx);
+  MO.setReg(ARM::CPSR);
+  MO.setIsDef(true);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Helper function that checks if N is a null or all ones constant.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
+}
+
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+//   (select cc 0, y)   [AllOnes=0]
+//   (select cc y, 0)   [AllOnes=0]
+//   (zext cc)          [AllOnes=0]
+//   (sext cc)          [AllOnes=0/1]
+//   (select cc -1, y)  [AllOnes=1]
+//   (select cc y, -1)  [AllOnes=1]
+//
+// Invert is set when N is the null/all ones constant when CC is false.
+// OtherOp is set to the alternative value of N.
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
+                                       SDValue &CC, bool &Invert,
+                                       SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default: return false;
+  case ISD::SELECT: {
+    CC = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    if (isZeroOrAllOnes(N1, AllOnes)) {
+      Invert = false;
+      OtherOp = N2;
+      return true;
+    }
+    if (isZeroOrAllOnes(N2, AllOnes)) {
+      Invert = true;
+      OtherOp = N1;
+      return true;
+    }
+    return false;
+  }
+  case ISD::ZERO_EXTEND:
+    // (zext cc) can never be the all ones value.
+    if (AllOnes)
+      return false;
+    LLVM_FALLTHROUGH;
+  case ISD::SIGN_EXTEND: {
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    Invert = !AllOnes;
+    if (AllOnes)
+      // When looking for an AllOnes constant, N is an sext, and the 'other'
+      // value is 0.
+      OtherOp = DAG.getConstant(0, dl, VT);
+    else if (N->getOpcode() == ISD::ZERO_EXTEND)
+      // When looking for a 0 constant, N can be zext or sext.
+      OtherOp = DAG.getConstant(1, dl, VT);
+    else
+      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
+                                VT);
+    return true;
+  }
+  }
+}
+
+// Combine a constant select operand into its use:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            bool AllOnes = false) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now know to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                 OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+  if (SwapSelectOps)
+    std::swap(TrueVal, FalseVal);
+
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
+                     CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
+      return Result;
+  if (N1.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
+      return Result;
+  return SDValue();
+}
+
+// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
+// (only after legalization).
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Only perform optimization if after legalize, and if NEON is available. We
+  // also expected both operands to be BUILD_VECTORs.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
+      || N0.getOpcode() != ISD::BUILD_VECTOR
+      || N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Check that the vector operands are of the right form.
+  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
+  // operands, where N is the size of the formed vector.
+  // Each EXTRACT_VECTOR should have the same input vector and odd or even
+  // index such that we have a pair wise add pattern.
+
+  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
+  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = N0->getOperand(0)->getOperand(0);
+  SDNode *V = Vec.getNode();
+  unsigned nextIndex = 0;
+
+  // For each operands to the ADD which are BUILD_VECTORs,
+  // check to see if each of their operands are an EXTRACT_VECTOR with
+  // the same vector and appropriate index.
+  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
+    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
+        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+      SDValue ExtVec0 = N0->getOperand(i);
+      SDValue ExtVec1 = N1->getOperand(i);
+
+      // First operand is the vector, verify its the same.
+      if (V != ExtVec0->getOperand(0).getNode() ||
+          V != ExtVec1->getOperand(0).getNode())
+        return SDValue();
+
+      // Second is the constant, verify its correct.
+      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
+      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
+
+      // For the constant, we want to see all the even or all the odd.
+      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
+          || C1->getZExtValue() != nextIndex+1)
+        return SDValue();
+
+      // Increment index.
+      nextIndex+=2;
+    } else
+      return SDValue();
+  }
+
+  // Create VPADDL node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  SDLoc dl(N);
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
+                                TLI.getPointerTy(DAG.getDataLayout())));
+
+  // Input is the vector.
+  Ops.push_back(Vec);
+
+  // Get widened type and narrowed type.
+  MVT widenType;
+  unsigned numElem = VT.getVectorNumElements();
+
+  EVT inputLaneType = Vec.getValueType().getVectorElementType();
+  switch (inputLaneType.getSimpleVT().SimpleTy) {
+    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
+    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
+    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
+    default:
+      llvm_unreachable("Invalid vector element type for padd optimization.");
+  }
+
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
+  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
+  return DAG.getNode(ExtOp, dl, VT, tmp);
+}
+
+static SDValue findMUL_LOHI(SDValue V) {
+  if (V->getOpcode() == ISD::UMUL_LOHI ||
+      V->getOpcode() == ISD::SMUL_LOHI)
+    return V;
+  return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+
+  // Look for multiply add opportunities.
+  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
+  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
+  // a glue link from the first add to the second add.
+  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+  // a S/UMLAL instruction.
+  //                  UMUL_LOHI
+  //                 / :lo    \ :hi
+  //                /          \          [no multiline comment]
+  //    loAdd ->  ADDE         |
+  //                 \ :glue  /
+  //                  \      /
+  //                    ADDC   <- hiAdd
+  //
+  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  SDValue AddcOp0 = AddcNode->getOperand(0);
+  SDValue AddcOp1 = AddcNode->getOperand(1);
+
+  // Check if the two operands are from the same mul_lohi node.
+  if (AddcOp0.getNode() == AddcOp1.getNode())
+    return SDValue();
+
+  assert(AddcNode->getNumValues() == 2 &&
+         AddcNode->getValueType(0) == MVT::i32 &&
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
+
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (!AddeNode)
+    return SDValue();
+
+  // Make sure it is really an ADDE.
+  if (AddeNode->getOpcode() != ISD::ADDE)
+    return SDValue();
+
+  assert(AddeNode->getNumOperands() == 3 &&
+         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+         "ADDE node has the wrong inputs");
+
+  // Check for the triangle shape.
+  SDValue AddeOp0 = AddeNode->getOperand(0);
+  SDValue AddeOp1 = AddeNode->getOperand(1);
+
+  // Make sure that the ADDE operands are not coming from the same node.
+  if (AddeOp0.getNode() == AddeOp1.getNode())
+    return SDValue();
+
+  // Find the MUL_LOHI node walking up ADDE's operands.
+  bool IsLeftOperandMUL = false;
+  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  if (MULOp == SDValue())
+   MULOp = findMUL_LOHI(AddeOp1);
+  else
+    IsLeftOperandMUL = true;
+  if (MULOp == SDValue())
+    return SDValue();
+
+  // Figure out the right opcode.
+  unsigned Opc = MULOp->getOpcode();
+  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+  // Figure out the high and low input values to the MLAL node.
+  SDValue* HiAdd = nullptr;
+  SDValue* LoMul = nullptr;
+  SDValue* LowAdd = nullptr;
+
+  // Ensure that ADDE is from high result of ISD::SMUL_LOHI.
+  if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+    return SDValue();
+
+  if (IsLeftOperandMUL)
+    HiAdd = &AddeOp1;
+  else
+    HiAdd = &AddeOp0;
+
+
+  // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
+  // whose low result is fed to the ADDC we are checking.
+
+  if (AddcOp0 == MULOp.getValue(0)) {
+    LoMul = &AddcOp0;
+    LowAdd = &AddcOp1;
+  }
+  if (AddcOp1 == MULOp.getValue(0)) {
+    LoMul = &AddcOp1;
+    LowAdd = &AddcOp0;
+  }
+
+  if (!LoMul)
+    return SDValue();
+
+  // Create the merged node.
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(LoMul->getOperand(0));
+  Ops.push_back(LoMul->getOperand(1));
+  Ops.push_back(*LowAdd);
+  Ops.push_back(*HiAdd);
+
+  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
+                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+  // Replace the ADDs' nodes uses by the MLA node's values.
+  SDValue HiMLALResult(MLALNode.getNode(), 1);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  SDValue LoMLALResult(MLALNode.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  SDValue resNode(AddcNode, 0);
+  return resNode;
+}
+
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const ARMSubtarget *Subtarget) {
+  // UMAAL is similar to UMLAL except that it adds two unsigned values.
+  // While trying to combine for the other MLAL nodes, first search for the
+  // chance to use UMAAL. Check if Addc uses another addc node which can first
+  // be combined into a UMLAL. The other pattern is AddcNode being combined
+  // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
+
+  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
+      (Subtarget->isThumb() && !Subtarget->hasThumb2()))
+    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+  SDNode *PrevAddc = nullptr;
+  if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
+    PrevAddc = AddcNode->getOperand(0).getNode();
+  else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
+    PrevAddc = AddcNode->getOperand(1).getNode();
+
+  // If there's no addc chains, just return a search for any MLAL.
+  if (PrevAddc == nullptr)
+    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+  // Try to convert the addc operand to an MLAL and if that fails try to
+  // combine AddcNode.
+  SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
+  if (MLAL != SDValue(PrevAddc, 0))
+    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+
+  // Find the converted UMAAL or quit if it doesn't exist.
+  SDNode *UmlalNode = nullptr;
+  SDValue AddHi;
+  if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
+    UmlalNode = AddcNode->getOperand(0).getNode();
+    AddHi = AddcNode->getOperand(1);
+  } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
+    UmlalNode = AddcNode->getOperand(1).getNode();
+    AddHi = AddcNode->getOperand(0);
+  } else {
+    return SDValue();
+  }
+
+  // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
+  // the ADDC as well as Zero.
+  auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
+
+  if (!Zero || Zero->getZExtValue() != 0)
+    return SDValue();
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (!AddeNode)
+    return SDValue();
+
+  if ((AddeNode->getOperand(0).getNode() == Zero &&
+       AddeNode->getOperand(1).getNode() == UmlalNode) ||
+      (AddeNode->getOperand(0).getNode() == UmlalNode &&
+       AddeNode->getOperand(1).getNode() == Zero)) {
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
+                      UmlalNode->getOperand(2), AddHi };
+    SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
+                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);
+
+    // Replace the ADDs' nodes uses by the UMAAL node's values.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
+    DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
+
+    // Return original node to notify the driver to stop replacing.
+    return SDValue(AddcNode, 0);
+  }
+  return SDValue();
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
+static SDValue PerformADDCCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget){
+
+  // Attempt to create vpaddl for this add.
+  if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
+    return Result;
+
+  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+  if (N0.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
+      return Result;
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
+}
+
+/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+///
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (N1.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
+      return Result;
+
+  return SDValue();
+}
+
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+//  However, for (A + B) * (A + B),
+//    vadd d2, d0, d1
+//    vmul d3, d0, d2
+//    vmla d3, d1, d2
+//  is slower than
+//    vadd d2, d0, d1
+//    vmul d3, d2, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  if (N0 == N1)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+
+  if (Subtarget->isThumb1Only())
+    return SDValue();
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+
+  int64_t MulAmt = C->getSExtValue();
+  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
+
+  ShiftAmt = ShiftAmt & (32 - 1);
+  SDValue V = N->getOperand(0);
+  SDLoc DL(N);
+
+  SDValue Res;
+  MulAmt >>= ShiftAmt;
+
+  if (MulAmt >= 0) {
+    if (isPowerOf2_32(MulAmt - 1)) {
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmt + 1)) {
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
+                                                    MVT::i32)),
+                        V);
+    } else
+      return SDValue();
+  } else {
+    uint64_t MulAmtAbs = -MulAmt;
+    if (isPowerOf2_32(MulAmtAbs + 1)) {
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
+      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
+                                                    MVT::i32)));
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getConstant(0, DL, MVT::i32), Res);
+
+    } else
+      return SDValue();
+  }
+
+  if (ShiftAmt != 0)
+    Res = DAG.getNode(ISD::SHL, DL, VT,
+                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, Res, false);
+  return SDValue();
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Attempt to use immediate-form VBIC
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VbicVT;
+      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VbicVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
+    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
+      return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *Subtarget) {
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
+      return Result;
+  }
+
+  // The code below optimizes (or (and X, Y), Z).
+  // The AND operand needs to have a single user to make these optimizations
+  // profitable.
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    APInt SplatBits0, SplatBits1;
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                          HasAnyUndefs) && !HasAnyUndefs) {
+            // Ensure that the bit width of the constants are the same and that
+            // the splat arguments are logical inverses as per the pattern we
+            // are trying to simplify.
+            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+                SplatBits0 == ~SplatBits1) {
+                // Canonicalize the vector type to make instruction selection
+                // simpler.
+                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                             N0->getOperand(1),
+                                             N0->getOperand(0),
+                                             N1->getOperand(0));
+                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+            }
+        }
+    }
+  }
+
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+
+  // BFI is only available on V6T2+
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
+    return SDValue();
+
+  SDLoc DL(N);
+  // 1) or (and A, mask), val => ARMbfi A, val, mask
+  //      iff (val & mask) == val
+  //
+  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+  //          && mask == ~mask2
+  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+  //          && ~mask == mask2
+  //  (i.e., copy a bitfield value into another bitfield of the same width)
+
+  if (VT != MVT::i32)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+
+  // The value and the mask need to be constants so we can verify this is
+  // actually a bitfield set. If the mask is 0xffff, we can do better
+  // via a movt instruction, so don't use BFI in that case.
+  SDValue MaskOp = N0.getOperand(1);
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!MaskC)
+    return SDValue();
+  unsigned Mask = MaskC->getZExtValue();
+  if (Mask == 0xffff)
+    return SDValue();
+  SDValue Res;
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N1C) {
+    unsigned Val = N1C->getZExtValue();
+    if ((Val & ~Mask) != Val)
+      return SDValue();
+
+    if (ARM::isBitFieldInvertedMask(Mask)) {
+      Val >>= countTrailingZeros(~Mask);
+
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
+                        DAG.getConstant(Val, DL, MVT::i32),
+                        DAG.getConstant(Mask, DL, MVT::i32));
+
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  } else if (N1.getOpcode() == ISD::AND) {
+    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned Mask2 = N11C->getZExtValue();
+
+    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
+    // as is to match.
+    if (ARM::isBitFieldInvertedMask(Mask) &&
+        (Mask == ~Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask == 0xffff || Mask == 0xffff0000))
+        return SDValue();
+      // 2a
+      unsigned amt = countTrailingZeros(Mask2);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+                        DAG.getConstant(amt, DL, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
+                        DAG.getConstant(Mask, DL, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+               (~Mask == Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return SDValue();
+      // 2b
+      unsigned lsb = countTrailingZeros(Mask);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
+                        DAG.getConstant(lsb, DL, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+                        DAG.getConstant(Mask2, DL, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  }
+
+  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
+      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+      ARM::isBitFieldInvertedMask(~Mask)) {
+    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+    // where lsb(mask) == #shamt and masked bits of B are known zero.
+    SDValue ShAmt = N00.getOperand(1);
+    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    unsigned LSB = countTrailingZeros(Mask);
+    if (ShAmtC != LSB)
+      return SDValue();
+
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
+                      DAG.getConstant(~Mask, DL, MVT::i32));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformXORCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
+    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
+      return Result;
+  }
+
+  return SDValue();
+}
+
+// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
+// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
+// their position in "to" (Rd).
+static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
+  assert(N->getOpcode() == ARMISD::BFI);
+
+  SDValue From = N->getOperand(1);
+  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
+  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
+
+  // If the Base came from a SHR #C, we can deduce that it is really testing bit
+  // #C in the base of the SHR.
+  if (From->getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(From->getOperand(1))) {
+    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
+    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
+    FromMask <<= Shift.getLimitedValue(31);
+    From = From->getOperand(0);
+  }
+
+  return From;
+}
+
+// If A and B contain one contiguous set of bits, does A | B == A . B?
+//
+// Neither A nor B must be zero.
+static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
+  unsigned LastActiveBitInA =  A.countTrailingZeros();
+  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
+  return LastActiveBitInA - 1 == FirstActiveBitInB;
+}
+
+static SDValue FindBFIToCombineWith(SDNode *N) {
+  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
+  // if one exists.
+  APInt ToMask, FromMask;
+  SDValue From = ParseBFI(N, ToMask, FromMask);
+  SDValue To = N->getOperand(0);
+
+  // Now check for a compatible BFI to merge with. We can pass through BFIs that
+  // aren't compatible, but not if they set the same bit in their destination as
+  // we do (or that of any BFI we're going to combine with).
+  SDValue V = To;
+  APInt CombinedToMask = ToMask;
+  while (V.getOpcode() == ARMISD::BFI) {
+    APInt NewToMask, NewFromMask;
+    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
+    if (NewFrom != From) {
+      // This BFI has a different base. Keep going.
+      CombinedToMask |= NewToMask;
+      V = V.getOperand(0);
+      continue;
+    }
+
+    // Do the written bits conflict with any we've seen so far?
+    if ((NewToMask & CombinedToMask).getBoolValue())
+      // Conflicting bits - bail out because going further is unsafe.
+      return SDValue();
+
+    // Are the new bits contiguous when combined with the old bits?
+    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
+        BitsProperlyConcatenate(FromMask, NewFromMask))
+      return V;
+    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
+        BitsProperlyConcatenate(NewFromMask, FromMask))
+      return V;
+
+    // We've seen a write to some bits, so track it.
+    CombinedToMask |= NewToMask;
+    // Keep going...
+    V = V.getOperand(0);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() == ISD::AND) {
+    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+    // the bits being cleared by the AND are not demanded by the BFI.
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned LSB = countTrailingZeros(~InvMask);
+    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
+    assert(Width <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
+    unsigned Mask = (1u << Width) - 1;
+    unsigned Mask2 = N11C->getZExtValue();
+    if ((Mask & (~Mask2)) == 0)
+      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
+                             N->getOperand(0), N1.getOperand(0),
+                             N->getOperand(2));
+  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
+    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
+    // Keep track of any consecutive bits set that all come from the same base
+    // value. We can combine these together into a single BFI.
+    SDValue CombineBFI = FindBFIToCombineWith(N);
+    if (CombineBFI == SDValue())
+      return SDValue();
+
+    // We've found a BFI.
+    APInt ToMask1, FromMask1;
+    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
+
+    APInt ToMask2, FromMask2;
+    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
+    assert(From1 == From2);
+    (void)From2;
+
+    // First, unlink CombineBFI.
+    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
+    // Then create a new BFI, combining the two together.
+    APInt NewFromMask = FromMask1 | FromMask2;
+    APInt NewToMask = ToMask1 | ToMask2;
+
+    EVT VT = N->getValueType(0);
+    SDLoc dl(N);
+
+    if (NewFromMask[0] == 0)
+      From1 = DCI.DAG.getNode(
+        ISD::SRL, dl, VT, From1,
+        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
+    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
+                           DCI.DAG.getConstant(~NewToMask, dl, VT));
+  }
+  return SDValue();
+}
+
+/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVRRD.
+static SDValue PerformVMOVRRDCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+  // vmovrrd(vmovdrr x, y) -> x,y
+  SDValue InDouble = N->getOperand(0);
+  if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
+    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
+    LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc DL(LD);
+    SDValue BasePtr = LD->getBasePtr();
+    SDValue NewLD1 =
+        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
+                    LD->getAlignment(), LD->getMemOperand()->getFlags());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, DL, MVT::i32));
+    SDValue NewLD2 = DAG.getLoad(
+        MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(),
+        std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags());
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+    if (DCI.DAG.getDataLayout().isBigEndian())
+      std::swap (NewLD1, NewLD2);
+    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+    return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
+static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
+  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::BITCAST)
+    Op1 = Op1.getOperand(0);
+  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
+      Op0.getNode() == Op1.getNode() &&
+      Op0.getResNo() == 0 && Op1.getResNo() == 1)
+    return DAG.getNode(ISD::BITCAST, SDLoc(N),
+                       N->getValueType(0), Op0.getOperand(0));
+  return SDValue();
+}
+
+/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
+/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
+/// i64 vector to have f64 elements, since the value can then be loaded
+/// directly into a VFP register.
+static bool hasNormalLoadOperand(SDNode *N) {
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
+      return true;
+  }
+  return false;
+}
+
+/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
+/// ISD::BUILD_VECTOR.
+static SDValue PerformBUILD_VECTORCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget) {
+  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
+  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
+  // into a pair of GPRs, which is fine when the value is used as a scalar,
+  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
+  SelectionDAG &DAG = DCI.DAG;
+  if (N->getNumOperands() == 2)
+    if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
+      return RV;
+
+  // Load i64 elements as f64 values so that type legalization does not split
+  // them up into i32 values.
+  EVT VT = N->getValueType(0);
+  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
+    return SDValue();
+  SDLoc dl(N);
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
+    Ops.push_back(V);
+    // Make the DAGCombiner fold the bitcast.
+    DCI.AddToWorklist(V.getNode());
+  }
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
+  SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.isUndef())
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
+/// PerformInsertEltCombine - Target-specific dag combine xforms for
+/// ISD::INSERT_VECTOR_ELT.
+static SDValue PerformInsertEltCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  // Bitcast an i64 load inserted into a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  EVT VT = N->getValueType(0);
+  SDNode *Elt = N->getOperand(1).getNode();
+  if (VT.getVectorElementType() != MVT::i64 ||
+      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 VT.getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(V.getNode());
+  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
+                               Vec, V, N->getOperand(2));
+  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
+}
+
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // The LLVM shufflevector instruction does not require the shuffle mask
+  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
+  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
+  // operands do not match the mask length, they are extended by concatenating
+  // them with undef vectors.  That is probably the right thing for other
+  // targets, but for NEON it is better to concatenate two double-register
+  // size vector operands into a single quad-register size vector.  Do that
+  // transformation here:
+  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
+  //   shuffle(concat(v1, v2), undef)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op0.getNumOperands() != 2 ||
+      Op1.getNumOperands() != 2)
+    return SDValue();
+  SDValue Concat0Op1 = Op0.getOperand(1);
+  SDValue Concat1Op1 = Op1.getOperand(1);
+  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
+    return SDValue();
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  if (!TLI.isTypeLegal(VT) ||
+      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
+      !TLI.isTypeLegal(Concat1Op1.getValueType()))
+    return SDValue();
+
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
+                                  Op0.getOperand(0), Op1.getOperand(0));
+  // Translate the shuffle mask.
+  SmallVector<int, 16> NewMask;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfElts = NumElts/2;
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  for (unsigned n = 0; n < NumElts; ++n) {
+    int MaskElt = SVN->getMaskElt(n);
+    int NewElt = -1;
+    if (MaskElt < (int)HalfElts)
+      NewElt = MaskElt;
+    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
+      NewElt = HalfElts + MaskElt - NumElts;
+    NewMask.push_back(NewElt);
+  }
+  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
+                              DAG.getUNDEF(VT), NewMask);
+}
+
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
+/// NEON load/store intrinsics, and generic vector load/stores, to merge
+/// base address updates.
+/// For generic load/stores, the memory type is assumed to be a vector.
+/// The caller is assumed to have checked legality.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  const bool isStore = N->getOpcode() == ISD::STORE;
+  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc dl(N);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoadOp = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: llvm_unreachable("unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoadOp = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoadOp = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoadOp = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoadOp = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode for Neon base update");
+      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; isLaneOp = false; break;
+      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoadOp) {
+      VecTy = N->getValueType(0);
+    } else if (isIntrinsic) {
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    } else {
+      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
+      VecTy = N->getOperand(1).getValueType();
+    }
+
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // OK, we found an ADD we can fold into the base update.
+    // Now, create a _UPD node, taking care of not breaking alignment.
+
+    EVT AlignedVecTy = VecTy;
+    unsigned Alignment = MemN->getAlignment();
+
+    // If this is a less-than-standard-aligned load/store, change the type to
+    // match the standard alignment.
+    // The alignment is overlooked when selecting _UPD variants; and it's
+    // easier to introduce bitcasts here than fix that.
+    // There are 3 ways to get to this base-update combine:
+    // - intrinsics: they are assumed to be properly aligned (to the standard
+    //   alignment of the memory type), so we don't need to do anything.
+    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
+    //   intrinsics, so, likewise, there's nothing to do.
+    // - generic load/store instructions: the alignment is specified as an
+    //   explicit operand, rather than implicitly as the standard alignment
+    //   of the memory type (like the intrisics).  We need to change the
+    //   memory type to match the explicit alignment.  That way, we don't
+    //   generate non-standard-aligned ARMISD::VLDx nodes.
+    if (isa<LSBaseSDNode>(N)) {
+      if (Alignment == 0)
+        Alignment = 1;
+      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
+        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
+        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
+        assert(!isLaneOp && "Unexpected generic load/store lane.");
+        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
+        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
+      }
+      // Don't set an explicit alignment on regular load/stores that we want
+      // to transform to VLD/VST 1_UPD nodes.
+      // This matches the behavior of regular load/stores, which only get an
+      // explicit alignment if the MMO alignment is larger than the standard
+      // alignment of the memory type.
+      // Intrinsics, however, always get an explicit alignment, set to the
+      // alignment of the MMO.
+      Alignment = 1;
+    }
+
+    // Create the new updating load/store node.
+    // First, create an SDVTList for the new updating node's results.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = AlignedVecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
+
+    // Then, gather the new node's operands.
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+
+    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
+      // Try to match the intrinsic's signature
+      Ops.push_back(StN->getValue());
+    } else {
+      // Loads (and of course intrinsics) match the intrinsics' signature,
+      // so just add all but the alignment operand.
+      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
+        Ops.push_back(N->getOperand(i));
+    }
+
+    // For all node types, the alignment operand is always the last one.
+    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
+
+    // If this is a non-standard-aligned STORE, the penultimate operand is the
+    // stored value.  Bitcast it to the aligned type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
+      SDValue &StVal = Ops[Ops.size()-2];
+      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
+    }
+
+    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
+                                           MemN->getMemOperand());
+
+    // Update the uses.
+    SmallVector<SDValue, 5> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i)
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+    // If this is an non-standard-aligned LOAD, the first result is the loaded
+    // value.  Bitcast it to the expected result type.
+    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
+      SDValue &LdVal = NewResults[0];
+      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
+    }
+
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+static SDValue PerformVLDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  return CombineBaseUpdate(N, DCI);
+}
+
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
+/// return true.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // vldN-dup instructions only support 64-bit vectors for N > 1.
+  if (!VT.is64BitVector())
+    return false;
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = ARMISD::VLD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = ARMISD::VLD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = ARMISD::VLD4DUP;
+  } else {
+    return false;
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != ARMISD::VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return false;
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
+                                           Ops, VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return true;
+}
+
+/// PerformVDUPLANECombine - Target-specific dag combine xforms for
+/// ARMISD::VDUPLANE.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op = N->getOperand(0);
+
+  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
+  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.  Ignore bit_converts for now; element sizes are checked below.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
+    return SDValue();
+
+  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
+  unsigned EltSize = Op.getScalarValueSizeInBits();
+  // The canonical VMOV for a zero vector uses a 32-bit element size.
+  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned EltBits;
+  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+    EltSize = 8;
+  EVT VT = N->getValueType(0);
+  if (EltSize > VT.getScalarSizeInBits())
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+}
+
+/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
+static SDValue PerformVDUPCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  // Match VDUP(LOAD) -> VLD1DUP.
+  // We match this pattern here rather than waiting for isel because the
+  // transform is only legal for unindexed loads.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
+  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
+      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
+    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
+                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
+    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
+    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
+                                             Ops, LD->getMemoryVT(),
+                                             LD->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
+    return VLDDup;
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformLOADCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+
+  // If this is a legal vector load, try to combine it into a VLD1_UPD.
+  if (ISD::isNormalLoad(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
+  // pack all of the elements in one place.  Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getScalarSizeInBits();
+    unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+
+    unsigned SizeRatio  = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems*SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDLoc DL(St);
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i)
+      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
+                          ? (i + 1) * SizeRatio - 1
+                          : i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+                                        TLI.getPointerTy(DAG.getDataLayout()));
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I, DL));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo(), St->getAlignment(),
+                                St->getMemOperand()->getFlags());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getDataLayout().isBigEndian();
+    SDLoc DL(St);
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(
+        St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
+        BasePtr, St->getPointerInfo(), St->getAlignment(),
+        St->getMemOperand()->getFlags());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, DL, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
+                        OffsetPtr, St->getPointerInfo(),
+                        std::min(4U, St->getAlignment() / 2),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (StVal.getValueType() == MVT::i64 &&
+      StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+    // Bitcast an i64 store extracted from a vector to f64.
+    // Otherwise, the i64 value will be legalized to a pair of i32 values.
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc dl(StVal);
+    SDValue IntVec = StVal.getOperand(0);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                   IntVec.getValueType().getVectorNumElements());
+    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                 Vec, StVal.getOperand(1));
+    dl = SDLoc(N);
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+    // Make the DAGCombiner fold the bitcasts.
+    DCI.AddToWorklist(Vec.getNode());
+    DCI.AddToWorklist(ExtElt.getNode());
+    DCI.AddToWorklist(V.getNode());
+    return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags(), St->getAAInfo());
+  }
+
+  // If this is a legal vector store, try to combine it into a VST1_UPD.
+  if (ISD::isNormalStore(N) && VT.isVector() &&
+      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return CombineBaseUpdate(N, DCI);
+
+  return SDValue();
+}
+
+/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
+/// can replace combinations of VMUL and VCVT (floating-point to integer)
+/// when the VMUL has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vmul.f32        d16, d17, d16
+///  vcvt.s32.f32    d16, d16
+/// becomes:
+///  vcvt.s32.f32    d16, d16, #3
+static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
+      Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy. We also can't handle more then 4 lanes, since these intructions
+    // only support v2i32/v4i32 types.
+    return SDValue();
+  }
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+  if (C == -1 || C == 0 || C > 32)
+    return SDValue();
+
+  SDLoc dl(N);
+  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
+    Intrinsic::arm_neon_vcvtfp2fxu;
+  SDValue FixConv = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
+      DAG.getConstant(C, dl, MVT::i32));
+
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VDIV
+/// when the VDIV has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vcvt.f32.s32    d16, d16
+///  vdiv.f32        d16, d17, d16
+/// becomes:
+///  vcvt.f32.s32    d16, d16, #3
+static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  unsigned OpOpcode = Op.getNode()->getOpcode();
+  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
+      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
+    return SDValue();
+
+  SDValue ConstVec = N->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy. We also can't handle more then 4 lanes, since these intructions
+    // only support v2i32/v4i32 types.
+    return SDValue();
+  }
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
+  if (C == -1 || C == 0 || C > 32)
+    return SDValue();
+
+  SDLoc dl(N);
+  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+  SDValue ConvInput = Op.getOperand(0);
+  if (IntBits < FloatBits)
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
+    Intrinsic::arm_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
+                     Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
+                     ConvInput, DAG.getConstant(C, dl, MVT::i32));
+}
+
+/// Getvshiftimm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  int64_t ElementBits = VT.getScalarSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (!isIntrinsic)
+    return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+  if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) {
+    Cnt = -Cnt;
+    return true;
+  }
+  return false;
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default:
+    // Don't do anything for most intrinsics.
+    break;
+
+  // Vector shifts: check for immediate versions and lower them.
+  // Note: This is done during DAG combining instead of DAG legalizing because
+  // the build_vectors for 64-bit vector element shift counts are generally
+  // not legal, and it is hard to see their values after they get legalized to
+  // loads from a constant pool.
+  case Intrinsic::arm_neon_vshifts:
+  case Intrinsic::arm_neon_vshiftu:
+  case Intrinsic::arm_neon_vrshifts:
+  case Intrinsic::arm_neon_vrshiftu:
+  case Intrinsic::arm_neon_vrshiftn:
+  case Intrinsic::arm_neon_vqshifts:
+  case Intrinsic::arm_neon_vqshiftu:
+  case Intrinsic::arm_neon_vqshiftsu:
+  case Intrinsic::arm_neon_vqshiftns:
+  case Intrinsic::arm_neon_vqshiftnu:
+  case Intrinsic::arm_neon_vqshiftnsu:
+  case Intrinsic::arm_neon_vqrshiftns:
+  case Intrinsic::arm_neon_vqrshiftnu:
+  case Intrinsic::arm_neon_vqrshiftnsu: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+        VShiftOpc = ARMISD::VSHL;
+        break;
+      }
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+                     ARMISD::VSHRs : ARMISD::VSHRu);
+        break;
+      }
+      return SDValue();
+
+    case Intrinsic::arm_neon_vrshifts:
+    case Intrinsic::arm_neon_vrshiftu:
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshifts:
+    case Intrinsic::arm_neon_vqshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshiftsu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for vqshlu intrinsic");
+
+    case Intrinsic::arm_neon_vrshiftn:
+    case Intrinsic::arm_neon_vqshiftns:
+    case Intrinsic::arm_neon_vqshiftnu:
+    case Intrinsic::arm_neon_vqshiftnsu:
+    case Intrinsic::arm_neon_vqrshiftns:
+    case Intrinsic::arm_neon_vqrshiftnu:
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      // Narrowing shifts require an immediate right shift.
+      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for narrowing vector shift "
+                       "intrinsic");
+
+    default:
+      llvm_unreachable("unhandled vector shift");
+    }
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      // Opcode already set above.
+      break;
+    case Intrinsic::arm_neon_vrshifts:
+      VShiftOpc = ARMISD::VRSHRs; break;
+    case Intrinsic::arm_neon_vrshiftu:
+      VShiftOpc = ARMISD::VRSHRu; break;
+    case Intrinsic::arm_neon_vrshiftn:
+      VShiftOpc = ARMISD::VRSHRN; break;
+    case Intrinsic::arm_neon_vqshifts:
+      VShiftOpc = ARMISD::VQSHLs; break;
+    case Intrinsic::arm_neon_vqshiftu:
+      VShiftOpc = ARMISD::VQSHLu; break;
+    case Intrinsic::arm_neon_vqshiftsu:
+      VShiftOpc = ARMISD::VQSHLsu; break;
+    case Intrinsic::arm_neon_vqshiftns:
+      VShiftOpc = ARMISD::VQSHRNs; break;
+    case Intrinsic::arm_neon_vqshiftnu:
+      VShiftOpc = ARMISD::VQSHRNu; break;
+    case Intrinsic::arm_neon_vqshiftnsu:
+      VShiftOpc = ARMISD::VQSHRNsu; break;
+    case Intrinsic::arm_neon_vqrshiftns:
+      VShiftOpc = ARMISD::VQRSHRNs; break;
+    case Intrinsic::arm_neon_vqrshiftnu:
+      VShiftOpc = ARMISD::VQRSHRNu; break;
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      VShiftOpc = ARMISD::VQRSHRNsu; break;
+    }
+
+    SDLoc dl(N);
+    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vshiftins: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
+      VShiftOpc = ARMISD::VSLI;
+    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
+      VShiftOpc = ARMISD::VSRI;
+    else {
+      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
+    }
+
+    SDLoc dl(N);
+    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2),
+                       DAG.getConstant(Cnt, dl, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vqrshifts:
+  case Intrinsic::arm_neon_vqrshiftu:
+    // No immediate versions of these to check for.
+    break;
+  }
+
+  return SDValue();
+}
+
+/// PerformShiftCombine - Checks for immediate versions of vector shifts and
+/// lowers them.  As with the vector shift intrinsics, this is done during DAG
+/// combining instead of DAG legalizing because the build_vectors for 64-bit
+/// vector element shift counts are generally not legal, and it is hard to see
+/// their values after they get legalized to loads from a constant pool.
+static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
+    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
+    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
+    SDValue N1 = N->getOperand(1);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      SDValue N0 = N->getOperand(0);
+      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
+          DAG.MaskedValueIsZero(N0.getOperand(0),
+                                APInt::getHighBitsSet(32, 16)))
+        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
+    }
+  }
+
+  // Nothing to be done for scalar shifts.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
+      SDLoc dl(N);
+      return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, dl, MVT::i32));
+    }
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
+                            ARMISD::VSHRs : ARMISD::VSHRu);
+      SDLoc dl(N);
+      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, dl, MVT::i32));
+    }
+  }
+  return SDValue();
+}
+
+/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  SDValue N0 = N->getOperand(0);
+
+  // Check for sign- and zero-extensions of vector extract operations of 8-
+  // and 16-bit vector elements.  NEON supports these directly.  They are
+  // handled during DAG combining because type legalization will promote them
+  // to 32-bit types and it is messy to recognize the operations after that.
+  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Vec = N0.getOperand(0);
+    SDValue Lane = N0.getOperand(1);
+    EVT VT = N->getValueType(0);
+    EVT EltVT = N0.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    if (VT == MVT::i32 &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+        TLI.isTypeLegal(Vec.getValueType()) &&
+        isa<ConstantSDNode>(Lane)) {
+
+      unsigned Opc = 0;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ISD::SIGN_EXTEND:
+        Opc = ARMISD::VGETLANEs;
+        break;
+      case ISD::ZERO_EXTEND:
+      case ISD::ANY_EXTEND:
+        Opc = ARMISD::VGETLANEu;
+        break;
+      }
+      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
+    }
+  }
+
+  return SDValue();
+}
+
+static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
+                             APInt &KnownOne) {
+  if (Op.getOpcode() == ARMISD::BFI) {
+    // Conservatively, we can recurse down the first operand
+    // and just mask out all affected bits.
+    computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+
+    // The operand to BFI is already a mask suitable for removing the bits it
+    // sets.
+    ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
+    const APInt &Mask = CI->getAPIntValue();
+    KnownZero &= Mask;
+    KnownOne &= Mask;
+    return;
+  }
+  if (Op.getOpcode() == ARMISD::CMOV) {
+    APInt KZ2(KnownZero.getBitWidth(), 0);
+    APInt KO2(KnownOne.getBitWidth(), 0);
+    computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
+    computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
+
+    KnownZero &= KZ2;
+    KnownOne &= KO2;
+    return;
+  }
+  return DAG.computeKnownBits(Op, KnownZero, KnownOne);
+}
+
+SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
+  // If we have a CMOV, OR and AND combination such as:
+  //   if (x & CN)
+  //     y |= CM;
+  //
+  // And:
+  //   * CN is a single bit;
+  //   * All bits covered by CM are known zero in y
+  //
+  // Then we can convert this into a sequence of BFI instructions. This will
+  // always be a win if CM is a single bit, will always be no worse than the
+  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
+  // three bits (due to the extra IT instruction).
+
+  SDValue Op0 = CMOV->getOperand(0);
+  SDValue Op1 = CMOV->getOperand(1);
+  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
+  auto CC = CCNode->getAPIntValue().getLimitedValue();
+  SDValue CmpZ = CMOV->getOperand(4);
+
+  // The compare must be against zero.
+  if (!isNullConstant(CmpZ->getOperand(1)))
+    return SDValue();
+
+  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
+  SDValue And = CmpZ->getOperand(0);
+  if (And->getOpcode() != ISD::AND)
+    return SDValue();
+  ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1));
+  if (!AndC || !AndC->getAPIntValue().isPowerOf2())
+    return SDValue();
+  SDValue X = And->getOperand(0);
+
+  if (CC == ARMCC::EQ) {
+    // We're performing an "equal to zero" compare. Swap the operands so we
+    // canonicalize on a "not equal to zero" compare.
+    std::swap(Op0, Op1);
+  } else {
+    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
+  }
+
+  if (Op1->getOpcode() != ISD::OR)
+    return SDValue();
+
+  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
+  if (!OrC)
+    return SDValue();
+  SDValue Y = Op1->getOperand(0);
+
+  if (Op0 != Y)
+    return SDValue();
+
+  // Now, is it profitable to continue?
+  APInt OrCI = OrC->getAPIntValue();
+  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
+  if (OrCI.countPopulation() > Heuristic)
+    return SDValue();
+
+  // Lastly, can we determine that the bits defined by OrCI
+  // are zero in Y?
+  APInt KnownZero, KnownOne;
+  computeKnownBits(DAG, Y, KnownZero, KnownOne);
+  if ((OrCI & KnownZero) != OrCI)
+    return SDValue();
+
+  // OK, we can do the combine.
+  SDValue V = Y;
+  SDLoc dl(X);
+  EVT VT = X.getValueType();
+  unsigned BitInX = AndC->getAPIntValue().logBase2();
+
+  if (BitInX != 0) {
+    // We must shift X first.
+    X = DAG.getNode(ISD::SRL, dl, VT, X,
+                    DAG.getConstant(BitInX, dl, VT));
+  }
+
+  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
+       BitInY < NumActiveBits; ++BitInY) {
+    if (OrCI[BitInY] == 0)
+      continue;
+    APInt Mask(VT.getSizeInBits(), 0);
+    Mask.setBit(BitInY);
+    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
+                    // Confusingly, the operand is an *inverted* mask.
+                    DAG.getConstant(~Mask, dl, VT));
+  }
+
+  return V;
+}
+
+/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
+SDValue
+ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
+  SDValue Cmp = N->getOperand(4);
+  if (Cmp.getOpcode() != ARMISD::CMPZ)
+    // Only looking at NE cases.
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+  SDValue Chain = N->getOperand(0);
+  SDValue BB = N->getOperand(1);
+  SDValue ARMcc = N->getOperand(2);
+  ARMCC::CondCodes CC =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
+  // -> (brcond Chain BB CC CPSR Cmp)
+  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
+      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
+      LHS->getOperand(0)->hasOneUse()) {
+    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
+    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
+    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
+        (LHS01C && LHS01C->getZExtValue() == 1) &&
+        (LHS1C && LHS1C->getZExtValue() == 1) &&
+        (RHSC && RHSC->getZExtValue() == 0)) {
+      return DAG.getNode(
+          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
+          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
+SDValue
+ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
+  SDValue Cmp = N->getOperand(4);
+  if (Cmp.getOpcode() != ARMISD::CMPZ)
+    // Only looking at EQ and NE cases.
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+  SDValue FalseVal = N->getOperand(0);
+  SDValue TrueVal = N->getOperand(1);
+  SDValue ARMcc = N->getOperand(2);
+  ARMCC::CondCodes CC =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+  // BFI is only available on V6T2+.
+  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
+    SDValue R = PerformCMOVToBFICombine(N, DAG);
+    if (R)
+      return R;
+  }
+
+  // Simplify
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, y
+  //   moveq   r0, x
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  //
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, x
+  //   movne   r0, y
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  /// FIXME: Turn this into a target neutral optimization?
+  SDValue Res;
+  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
+                      N->getOperand(3), Cmp);
+  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
+    SDValue ARMcc;
+    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
+                      N->getOperand(3), NewCmp);
+  }
+
+  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
+  // -> (cmov F T CC CPSR Cmp)
+  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
+    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
+    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
+        (LHS1C && LHS1C->getZExtValue() == 1) &&
+        (RHSC && RHSC->getZExtValue() == 0)) {
+      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                         LHS->getOperand(2), LHS->getOperand(3),
+                         LHS->getOperand(4));
+    }
+  }
+
+  if (Res.getNode()) {
+    APInt KnownZero, KnownOne;
+    DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
+    // Capture demanded bits information that would be otherwise lost.
+    if (KnownZero == 0xfffffffe)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i1));
+    else if (KnownZero == 0xffffff00)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i8));
+    else if (KnownZero == 0xffff0000)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i16));
+  }
+
+  return Res;
+}
+
+SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
+  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
+  case ISD::SUB:        return PerformSUBCombine(N, DCI);
+  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
+  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
+  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
+  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
+  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
+  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
+  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+  case ISD::STORE:      return PerformSTORECombine(N, DCI);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
+  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
+  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
+  case ISD::FDIV:
+    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
+  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
+  case ARMISD::VLD1DUP:
+  case ARMISD::VLD2DUP:
+  case ARMISD::VLD3DUP:
+  case ARMISD::VLD4DUP:
+    return PerformVLDCombine(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return PerformVLDCombine(N, DCI);
+    default: break;
+    }
+    break;
+  }
+  return SDValue();
+}
+
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                          EVT VT) const {
+  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
+  // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32: {
+    // Unaligned access can use (for example) LRDB, LRDH, LDR
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = Subtarget->hasV7Ops();
+      return true;
+    }
+    return false;
+  }
+  case MVT::f64:
+  case MVT::v2f64: {
+    // For any little-endian targets with neon, we can support unaligned ld/st
+    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+    // A big-endian target may also explicitly support unaligned accesses
+    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
+  }
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+
+  // See if we can use NEON instructions for this...
+  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+    bool Fast;
+    if (Size >= 16 &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
+      return MVT::v2f64;
+    } else if (Size >= 8 &&
+               (memOpAlign(SrcAlign, DstAlign, 8) ||
+                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+                 Fast))) {
+      return MVT::f64;
+    }
+  }
+
+  // Lowering to i32/i16 if the size permits.
+  if (Size >= 4)
+    return MVT::i32;
+  else if (Size >= 2)
+    return MVT::i16;
+
+  // Let the target-independent logic figure it out.
+  return MVT::Other;
+}
+
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+  EVT VT = ExtVal.getValueType();
+
+  if (!isTypeLegal(VT))
+    return false;
+
+  // Don't create a loadext if we can fold the extension into a wide/long
+  // instruction.
+  // If there's more than one user instruction, the loadext is desirable no
+  // matter what.  There can be two uses by the same instruction.
+  if (ExtVal->use_empty() ||
+      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
+    return true;
+
+  SDNode *U = *ExtVal->use_begin();
+  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
+       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL))
+    return false;
+
+  return true;
+}
+
+bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
+  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
+    if (Subtarget->hasFPAO())
+      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
+    return 0;
+  }
+  return -1;
+}
+
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+  if (V < 0)
+    return false;
+
+  unsigned Scale = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    // Scale == 1;
+    break;
+  case MVT::i16:
+    // Scale == 2;
+    Scale = 2;
+    break;
+  case MVT::i32:
+    // Scale == 4;
+    Scale = 4;
+    break;
+  }
+
+  if ((V & (Scale - 1)) != 0)
+    return false;
+  V /= Scale;
+  return V == (V & ((1LL << 5) - 1));
+}
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+                                      const ARMSubtarget *Subtarget) {
+  bool isNeg = false;
+  if (V < 0) {
+    isNeg = true;
+    V = - V;
+  }
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // + imm12 or - imm8
+    if (isNeg)
+      return V == (V & ((1LL << 8) - 1));
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    // Same as ARM mode. FIXME: NEON?
+    if (!Subtarget->hasVFP2())
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
+                                    const ARMSubtarget *Subtarget) {
+  if (V == 0)
+    return true;
+
+  if (!VT.isSimple())
+    return false;
+
+  if (Subtarget->isThumb1Only())
+    return isLegalT1AddressImmediate(V, VT);
+  else if (Subtarget->isThumb2())
+    return isLegalT2AddressImmediate(V, VT, Subtarget);
+
+  // ARM mode.
+  if (V < 0)
+    V = - V;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i32:
+    // +- imm12
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::i16:
+    // +- imm8
+    return V == (V & ((1LL << 8) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    if (!Subtarget->hasVFP2()) // FIXME: NEON?
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+                                                      EVT VT) const {
+  int Scale = AM.Scale;
+  if (Scale < 0)
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    if (Scale == 1)
+      return true;
+    // r + r << imm
+    Scale = Scale & ~1;
+    return Scale == 2 || Scale == 4 || Scale == 8;
+  case MVT::i64:
+    // r + r
+    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+      return true;
+    return false;
+  case MVT::isVoid:
+    // Note, we allow "void" uses (basically, uses that aren't loads or
+    // stores), because arm allows folding a scale into many arithmetic
+    // operations.  This should be made more precise and revisited later.
+
+    // Allow r << imm, but the imm has to be a multiple of two.
+    if (Scale & 1) return false;
+    return isPowerOf2_32(Scale);
+  }
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  EVT VT = getValueType(DL, Ty, true);
+  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
+    return false;
+
+  // Can never fold addr of global into load/store.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0:  // no scale reg, must be "r+i" or "r", or "i".
+    break;
+  case 1:
+    if (Subtarget->isThumb1Only())
+      return false;
+    LLVM_FALLTHROUGH;
+  default:
+    // ARM doesn't support any R+R*scale+imm addr modes.
+    if (AM.BaseOffs)
+      return false;
+
+    if (!VT.isSimple())
+      return false;
+
+    if (Subtarget->isThumb2())
+      return isLegalT2ScaledAddressingMode(AM, VT);
+
+    int Scale = AM.Scale;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i32:
+      if (Scale < 0) Scale = -Scale;
+      if (Scale == 1)
+        return true;
+      // r + r << imm
+      return isPowerOf2_32(Scale & ~1);
+    case MVT::i16:
+    case MVT::i64:
+      // r + r
+      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+        return true;
+      return false;
+
+    case MVT::isVoid:
+      // Note, we allow "void" uses (basically, uses that aren't loads or
+      // stores), because arm allows folding a scale into many arithmetic
+      // operations.  This should be made more precise and revisited later.
+
+      // Allow r << imm, but the imm has to be a multiple of two.
+      if (Scale & 1) return false;
+      return isPowerOf2_32(Scale);
+    }
+  }
+  return true;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // Thumb2 and ARM modes can use cmn for negative immediates.
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+  // Thumb1 doesn't have cmn, and only 8-bit immediates.
+  return Imm >= 0 && Imm <= 255;
+}
+
+/// isLegalAddImmediate - Return true if the specified immediate is a legal add
+/// *or sub* immediate, that is the target has add or sub instructions which can
+/// add a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Same encoding for add/sub, just flip the sign.
+  int64_t AbsImm = std::abs(Imm);
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(AbsImm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
+  // Thumb1 only has 8-bit unsigned immediate.
+  return AbsImm >= 0 && AbsImm <= 255;
+}
+
+static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                      bool isSEXTLoad, SDValue &Base,
+                                      SDValue &Offset, bool &isInc,
+                                      SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
+    // AddressingMode 3
+    Base = Ptr->getOperand(0);
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -256) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+        return true;
+      }
+    }
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Offset = Ptr->getOperand(1);
+    return true;
+  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
+    // AddressingMode 2
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -0x1000) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+        Base = Ptr->getOperand(0);
+        return true;
+      }
+    }
+
+    if (Ptr->getOpcode() == ISD::ADD) {
+      isInc = true;
+      ARM_AM::ShiftOpc ShOpcVal=
+        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
+      if (ShOpcVal != ARM_AM::no_shift) {
+        Base = Ptr->getOperand(1);
+        Offset = Ptr->getOperand(0);
+      } else {
+        Base = Ptr->getOperand(0);
+        Offset = Ptr->getOperand(1);
+      }
+      return true;
+    }
+
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    return true;
+  }
+
+  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
+  return false;
+}
+
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
+                                     bool isSEXTLoad, SDValue &Base,
+                                     SDValue &Offset, bool &isInc,
+                                     SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Ptr->getOperand(0);
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
+      assert(Ptr->getOpcode() == ISD::ADD);
+      isInc = false;
+      Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
+      return true;
+    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
+      isInc = Ptr->getOpcode() == ISD::ADD;
+      Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                             SDValue &Offset,
+                                             ISD::MemIndexedMode &AM,
+                                             SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                       Offset, isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                        Offset, isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false, isNonExt;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+    isNonExt = !ST->isTruncatingStore();
+  } else
+    return false;
+
+  if (Subtarget->isThumb1Only()) {
+    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+    // must be non-extending/truncating, i32, with an offset of 4.
+    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
+    if (Op->getOpcode() != ISD::ADD || !isNonExt)
+      return false;
+    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!RHS || RHS->getZExtValue() != 4)
+      return false;
+    
+    Offset = Op->getOperand(1);
+    Base = Op->getOperand(0);
+    AM = ISD::POST_INC;
+    return true;
+  }
+  
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                       isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  if (Ptr != Base) {
+    // Swap base ptr and offset to catch more post-index load / store when
+    // it's legal. In Thumb2 mode, offset must be an immediate.
+    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
+        !Subtarget->isThumb2())
+      std::swap(Base, Offset);
+
+    // Post-indexed load / store update the base pointer.
+    if (Ptr != Base)
+      return false;
+  }
+
+  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  unsigned BitWidth = KnownOne.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case ARMISD::ADDC:
+  case ARMISD::ADDE:
+  case ARMISD::SUBC:
+  case ARMISD::SUBE:
+    // These nodes' second result is a boolean
+    if (Op.getResNo() == 0)
+      break;
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case ARMISD::CMOV: {
+    // Bits are known zero/one if known on the LHS and RHS.
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    if (KnownZero == 0 && KnownOne == 0) return;
+
+    APInt KnownZeroRHS, KnownOneRHS;
+    DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
+    KnownZero &= KnownZeroRHS;
+    KnownOne  &= KnownOneRHS;
+    return;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::arm_ldaex:
+    case Intrinsic::arm_ldrex: {
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  // Looking for "rev" which is V6+.
+  if (!Subtarget->hasV6Ops())
+    return false;
+
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::string AsmStr = IA->getAsmString();
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t,");
+
+    // rev $0, $1
+    if (AsmPieces.size() == 3 &&
+        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
+        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      if (Ty && Ty->getBitWidth() == 32)
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  }
+
+  return false;
+}
+
+const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+  // At this point, we have to lower this constraint to something else, so we
+  // lower it to an "r" or "w". However, by doing this we will force the result
+  // to be in register, while the X constraint is much more permissive.
+  //
+  // Although we are correct (we are free to emit anything, without
+  // constraints), we might break use cases that would expect us to be more
+  // efficient and emit something else.
+  if (!Subtarget->hasVFP2())
+    return "r";
+  if (ConstraintVT.isFloatingPoint())
+    return "w";
+  if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
+     (ConstraintVT.getSizeInBits() == 64 ||
+      ConstraintVT.getSizeInBits() == 128))
+    return "w";
+
+  return "r";
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'l': return C_RegisterClass;
+    case 'w': return C_RegisterClass;
+    case 'h': return C_RegisterClass;
+    case 'x': return C_RegisterClass;
+    case 't': return C_RegisterClass;
+    case 'j': return C_Other; // Constant for movw.
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'Q': return C_Memory;
+    }
+  } else if (Constraint.size() == 2) {
+    switch (Constraint[0]) {
+    default: break;
+    // All 'U+' constraints are addresses.
+    case 'U': return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'l':
+    if (type->isIntegerTy()) {
+      if (Subtarget->isThumb())
+        weight = CW_SpecificReg;
+      else
+        weight = CW_Register;
+    }
+    break;
+  case 'w':
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
+typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
+RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    switch (Constraint[0]) {
+    case 'l': // Low regs or general regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::tGPRRegClass);
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'h': // High regs or no regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::hGPRRegClass);
+      break;
+    case 'r':
+      if (Subtarget->isThumb1Only())
+        return RCPair(0U, &ARM::tGPRRegClass);
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'w':
+      if (VT == MVT::Other)
+        break;
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPRRegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPRRegClass);
+      break;
+    case 'x':
+      if (VT == MVT::Other)
+        break;
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPR_8RegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPR_8RegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPR_8RegClass);
+      break;
+    case 't':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'j':
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'O':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    int64_t CVal64 = C->getSExtValue();
+    int CVal = (int) CVal64;
+    // None of these constraints allow values larger than 32 bits.  Check
+    // that the value fits in an int.
+    if (CVal != CVal64)
+      return;
+
+    switch (ConstraintLetter) {
+      case 'j':
+        // Constant suitable for movw, must be between 0 and
+        // 65535.
+        if (Subtarget->hasV6T2Ops())
+          if (CVal >= 0 && CVal <= 65535)
+            break;
+        return;
+      case 'I':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between 0 and 255, for ADD
+          // immediates.
+          if (CVal >= 0 && CVal <= 255)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getT2SOImmVal(CVal) != -1)
+            break;
+        } else {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getSOImmVal(CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'J':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -255 and -1, for negated ADD
+          // immediates. This can be used in GCC with an "n" modifier that
+          // prints the negated value, for use with SUB instructions. It is
+          // not useful otherwise but is implemented for compatibility.
+          if (CVal >= -255 && CVal <= -1)
+            break;
+        } else {
+          // This must be a constant between -4095 and 4095. It is not clear
+          // what this constraint is intended for. Implemented for
+          // compatibility with GCC.
+          if (CVal >= -4095 && CVal <= 4095)
+            break;
+        }
+        return;
+
+      case 'K':
+        if (Subtarget->isThumb1Only()) {
+          // A 32-bit value where only one byte has a nonzero value. Exclude
+          // zero to match GCC. This constraint is used by GCC internally for
+          // constants that can be loaded with a move/shift combination.
+          // It is not useful otherwise but is implemented for compatibility.
+          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+            break;
+        } else {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getSOImmVal(~CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'L':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -7 and 7,
+          // for 3-operand ADD/SUB immediate instructions.
+          if (CVal >= -7 && CVal < 7)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+            break;
+        } else {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getSOImmVal(-CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'M':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a multiple of 4 between 0 and 1020, for
+          // ADD sp + immediate.
+          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+            break;
+        } else {
+          // A power of two or a constant between 0 and 32.  This is used in
+          // GCC for the shift amount on shifted register operands, but it is
+          // useful in general for any shift amounts.
+          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
+            break;
+        }
+        return;
+
+      case 'N':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between 0 and 31, for shift amounts.
+          if (CVal >= 0 && CVal <= 31)
+            break;
+        }
+        return;
+
+      case 'O':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a multiple of 4 between -508 and 508, for
+          // ADD/SUB sp = sp + immediate.
+          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+            break;
+        }
+        return;
+    }
+    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+static RTLIB::Libcall getDivRemLibcall(
+    const SDNode *N, MVT::SimpleValueType SVT) {
+  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
+          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
+         "Unhandled Opcode in getDivRemLibcall");
+  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
+                  N->getOpcode() == ISD::SREM;
+  RTLIB::Libcall LC;
+  switch (SVT) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  }
+  return LC;
+}
+
+static TargetLowering::ArgListTy getDivRemArgList(
+    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
+  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
+          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
+         "Unhandled Opcode in getDivRemArgList");
+  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
+                  N->getOpcode() == ISD::SREM;
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = N->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
+    Entry.Node = N->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  if (Subtarget->isTargetWindows() && Args.size() >= 2)
+    std::swap(Args[0], Args[1]);
+  return Args;
+}
+
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
+          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
+          Subtarget->isTargetWindows()) &&
+         "Register-based DivRem lowering only");
+  unsigned Opcode = Op->getOpcode();
+  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+         "Invalid opcode for Div/Rem lowering");
+  bool isSigned = (Opcode == ISD::SDIVREM);
+  EVT VT = Op->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+  SDLoc dl(Op);
+
+  // If the target has hardware divide, use divide + multiply + subtract:
+  //     div = a / b
+  //     rem = a - b * div
+  //     return {div, rem}
+  // This should be lowered into UDIV/SDIV + MLS later on.
+  if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
+      Op->getSimpleValueType(0) == MVT::i32) {
+    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+    const SDValue Dividend = Op->getOperand(0);
+    const SDValue Divisor = Op->getOperand(1);
+    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
+    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
+    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
+
+    SDValue Values[2] = {Div, Rem};
+    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
+  }
+
+  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
+                                       VT.getSimpleVT().SimpleTy);
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
+                                                    DAG.getContext(),
+                                                    Subtarget);
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+
+  if (Subtarget->isTargetWindows())
+    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return CallInfo.first;
+}
+
+// Lowers REM using divmod helpers
+// see RTABI section 4.2/4.3
+SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
+  // Build return types (div and rem)
+  std::vector<Type*> RetTyParams;
+  Type *RetTyElement;
+
+  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
+  case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
+  case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
+  case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
+  }
+
+  RetTyParams.push_back(RetTyElement);
+  RetTyParams.push_back(RetTyElement);
+  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
+  Type *RetTy = StructType::get(*DAG.getContext(), ret);
+
+  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
+                                                             SimpleTy);
+  SDValue InChain = DAG.getEntryNode();
+  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
+                                                    Subtarget);
+  bool isSigned = N->getOpcode() == ISD::SREM;
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  if (Subtarget->isTargetWindows())
+    InChain = WinDBZCheckDenominator(DAG, N, InChain);
+
+  // Lower call
+  CallLoweringInfo CLI(DAG);
+  CLI.setChain(InChain)
+     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
+     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  // Return second (rem) result operand (first contains div)
+  SDNode *ResNode = CallResult.first.getNode();
+  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
+  return ResNode->getOperand(1);
+}
+
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "unsupported target platform");
+  SDLoc DL(Op);
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+
+  SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
+                              DAG.getConstant(2, DL, MVT::i32));
+
+  SDValue Flag;
+  Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
+  Flag = Chain.getValue(1);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
+
+  SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+  Chain = NewSP.getValue(1);
+
+  SDValue Ops[2] = { NewSP, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_EXTEND");
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
+         Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_ROUND");
+
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+bool
+ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The ARM target isn't yet aware of offsets.
+  return false;
+}
+
+bool ARM::isBitFieldInvertedMask(unsigned v) {
+  if (v == 0xffffffff)
+    return false;
+
+  // there can be 1's on either or both "outsides", all the "inside"
+  // bits must be 0's
+  return isShiftedMask_32(~v);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (!Subtarget->hasVFP3())
+    return false;
+  if (VT == MVT::f32)
+    return ARM_AM::getFP32Imm(Imm) != -1;
+  if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
+    return ARM_AM::getFP64Imm(Imm) != -1;
+  return false;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldaex:
+  case Intrinsic::arm_ldrex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_stlex:
+  case Intrinsic::arm_strex: {
+    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_stlexd:
+  case Intrinsic::arm_strexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldaexd:
+  case Intrinsic::arm_ldrexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned Bits = Ty->getPrimitiveSizeInBits();
+  if (Bits == 0 || Bits > 32)
+    return false;
+  return true;
+}
+
+bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
+                                        ARM_MB::MemBOpt Domain) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+
+  // First, if the target has no DMB, see what fallback we can use.
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
+      Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
+      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
+                        Builder.getInt32(0), Builder.getInt32(7),
+                        Builder.getInt32(10), Builder.getInt32(5)};
+      return Builder.CreateCall(MCR, args);
+    } else {
+      // Instead of using barriers, atomic accesses on these subtargets use
+      // libcalls.
+      llvm_unreachable("makeDMB on a target so old that it has no barriers");
+    }
+  } else {
+    Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
+    Constant *CDomain = Builder.getInt32(Domain);
+    return Builder.CreateCall(DMB, CDomain);
+  }
+}
+
+// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                         AtomicOrdering Ord, bool IsStore,
+                                         bool IsLoad) const {
+  switch (Ord) {
+  case AtomicOrdering::NotAtomic:
+  case AtomicOrdering::Unordered:
+    llvm_unreachable("Invalid fence: unordered/non-atomic");
+  case AtomicOrdering::Monotonic:
+  case AtomicOrdering::Acquire:
+    return nullptr; // Nothing to do
+  case AtomicOrdering::SequentiallyConsistent:
+    if (!IsStore)
+      return nullptr; // Nothing to do
+    /*FALLTHROUGH*/
+  case AtomicOrdering::Release:
+  case AtomicOrdering::AcquireRelease:
+    if (Subtarget->preferISHSTBarriers())
+      return makeDMB(Builder, ARM_MB::ISHST);
+    // FIXME: add a comment with a link to documentation justifying this.
+    else
+      return makeDMB(Builder, ARM_MB::ISH);
+  }
+  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
+}
+
+Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                          AtomicOrdering Ord, bool IsStore,
+                                          bool IsLoad) const {
+  switch (Ord) {
+  case AtomicOrdering::NotAtomic:
+  case AtomicOrdering::Unordered:
+    llvm_unreachable("Invalid fence: unordered/not-atomic");
+  case AtomicOrdering::Monotonic:
+  case AtomicOrdering::Release:
+    return nullptr; // Nothing to do
+  case AtomicOrdering::Acquire:
+  case AtomicOrdering::AcquireRelease:
+  case AtomicOrdering::SequentiallyConsistent:
+    return makeDMB(Builder, ARM_MB::ISH);
+  }
+  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
+}
+
+// Loads and stores less than 64-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
+// anything for those.
+bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+  return (Size == 64) && !Subtarget->isMClass();
+}
+
+// Loads and stores less than 64-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
+// anything for those.
+// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual,
+// sections A8.8.72-74 LDRD)
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
+                                                  : AtomicExpansionKind::None;
+}
+
+// For the real atomic operations, we have ldrex/strex up to 32 bits,
+// and up to 64 bits on the non-M profiles
+TargetLowering::AtomicExpansionKind
+ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
+  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
+             ? AtomicExpansionKind::LLSC
+             : AtomicExpansionKind::None;
+}
+
+bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
+    AtomicCmpXchgInst *AI) const {
+  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
+  // implement cmpxchg without spilling. If the address being exchanged is also
+  // on the stack and close enough to the spill slot, this can lead to a
+  // situation where the monitor always gets cleared and the atomic operation
+  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
+  bool hasAtomicCmpXchg =
+      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
+  return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg;
+}
+
+bool ARMTargetLowering::shouldInsertFencesForAtomic(
+    const Instruction *I) const {
+  return InsertFencesForAtomic;
+}
+
+// This has so far only been implemented for MachO.
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->isTargetMachO();
+}
+
+bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  // If we do not have NEON, vector types are not natively supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
+}
+
+bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+  return Subtarget->hasV6T2Ops();
+}
+
+bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+  return Subtarget->hasV6T2Ops();
+}
+
+Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                         AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire = isAcquireOrStronger(Ord);
+
+  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i32, i32} and we have to recombine them into a
+  // single i64 here.
+  if (ValTy->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
+    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
+  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldrex, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+    IRBuilder<> &Builder) const {
+  if (!Subtarget->hasV7Ops())
+    return;
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
+}
+
+Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                               Value *Addr,
+                                               AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease = isReleaseOrStronger(Ord);
+
+  // Since the intrinsics must have legal type, the i64 intrinsics take two
+  // parameters: "i32, i32". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
+    Function *Strex = Intrinsic::getDeclaration(M, Int);
+    Type *Int32Ty = Type::getInt32Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
+  }
+
+  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
+  Type *Tys[] = { Addr->getType() };
+  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateCall(
+      Strex, {Builder.CreateZExtOrBitCast(
+                  Val, Strex->getFunctionType()->getParamType(0)),
+              Addr});
+}
+
+/// \brief Lower an interleaved load into a vldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+///      Into:
+///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
+///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
+///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
+bool ARMTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  VectorType *VecTy = Shuffles[0]->getType();
+  Type *EltTy = VecTy->getVectorElementType();
+
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
+
+  // Skip if we do not have NEON and skip illegal vector types and vector types
+  // with i64/f64 elements (vldN doesn't support i64/f64 elements).
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+    return false;
+
+  // A pointer vector can not be the return type of the ldN intrinsics. Need to
+  // load integer vectors first and then convert to pointer vectors.
+  if (EltTy->isPointerTy())
+    VecTy =
+        VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+                                            Intrinsic::arm_neon_vld3,
+                                            Intrinsic::arm_neon_vld4};
+
+  IRBuilder<> Builder(LI);
+  SmallVector<Value *, 2> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
+  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+  Type *Tys[] = { VecTy, Int8Ptr };
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+  // Replace uses of each shufflevector with the corresponding vector loaded
+  // by ldN.
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    ShuffleVectorInst *SV = Shuffles[i];
+    unsigned Index = Indices[i];
+
+    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+    // Convert the integer vector to pointer vector if the element is pointer.
+    if (EltTy->isPointerTy())
+      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+    SV->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                   unsigned NumElts) {
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < NumElts; i++)
+    Mask.push_back(Builder.getInt32(Start + i));
+
+  return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a vstN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
+///
+///      Into:
+///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// vst3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+///      Into:
+///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
+///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
+///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
+///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
+bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                              ShuffleVectorInst *SVI,
+                                              unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
+  VectorType *VecTy = SVI->getType();
+  assert(VecTy->getVectorNumElements() % Factor == 0 &&
+         "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
+  Type *EltTy = VecTy->getVectorElementType();
+  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
+
+  // Skip if we do not have NEON and skip illegal vector types and vector types
+  // with i64/f64 elements (vstN doesn't support i64/f64 elements).
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+      EltIs64Bits)
+    return false;
+
+  Value *Op0 = SVI->getOperand(0);
+  Value *Op1 = SVI->getOperand(1);
+  IRBuilder<> Builder(SI);
+
+  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+  // vectors to integer vectors.
+  if (EltTy->isPointerTy()) {
+    Type *IntTy = DL.getIntPtrType(EltTy);
+
+    // Convert to the corresponding integer vector.
+    Type *IntVecTy =
+        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+    SubVecTy = VectorType::get(IntTy, LaneLen);
+  }
+
+  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+                                             Intrinsic::arm_neon_vst3,
+                                             Intrinsic::arm_neon_vst4};
+  SmallVector<Value *, 6> Ops;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+
+  Type *Tys[] = { Int8Ptr, SubVecTy };
+  Function *VstNFunc = Intrinsic::getDeclaration(
+      SI->getModule(), StoreInts[Factor - 2], Tys);
+
+  // Split the shufflevector operands into sub vectors for the new vstN call.
+  auto Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0) {
+      Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+    } else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < LaneLen; j++) {
+        if (Mask[j*Factor + i] >= 0) {
+          StartMask = Mask[j*Factor + i] - j;
+          break;
+        }
+      }
+      // Note: If all elements in a chunk are undefs, StartMask=0!
+      // Note: Filling undef gaps with random elements is ok, since
+      // those elements were being written anyway (with undefs).
+      // In the case of all undefs we're defaulting to using elems from 0
+      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
+      Ops.push_back(Builder.CreateShuffleVector(
+        Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+    }
+  }
+
+  Ops.push_back(Builder.getInt32(SI->getAlignment()));
+  Builder.CreateCall(VstNFunc, Ops);
+  return true;
+}
+
+enum HABaseType {
+  HA_UNKNOWN = 0,
+  HA_FLOAT,
+  HA_DOUBLE,
+  HA_VECT64,
+  HA_VECT128
+};
+
+static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
+                                   uint64_t &Members) {
+  if (auto *ST = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
+      uint64_t SubMembers = 0;
+      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
+        return false;
+      Members += SubMembers;
+    }
+  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+    uint64_t SubMembers = 0;
+    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
+      return false;
+    Members += SubMembers * AT->getNumElements();
+  } else if (Ty->isFloatTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
+      return false;
+    Members = 1;
+    Base = HA_FLOAT;
+  } else if (Ty->isDoubleTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
+      return false;
+    Members = 1;
+    Base = HA_DOUBLE;
+  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
+    Members = 1;
+    switch (Base) {
+    case HA_FLOAT:
+    case HA_DOUBLE:
+      return false;
+    case HA_VECT64:
+      return VT->getBitWidth() == 64;
+    case HA_VECT128:
+      return VT->getBitWidth() == 128;
+    case HA_UNKNOWN:
+      switch (VT->getBitWidth()) {
+      case 64:
+        Base = HA_VECT64;
+        return true;
+      case 128:
+        Base = HA_VECT128;
+        return true;
+      default:
+        return false;
+      }
+    }
+  }
+
+  return (Members > 0 && Members <= 4);
+}
+
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
+/// passing according to AAPCS rules.
+bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  if (getEffectiveCallingConv(CallConv, isVarArg) !=
+      CallingConv::ARM_AAPCS_VFP)
+    return false;
+
+  HABaseType Base = HA_UNKNOWN;
+  uint64_t Members = 0;
+  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
+  return IsHA || IsIntArray;
+}
+
+unsigned ARMTargetLowering::getExceptionPointerRegister(
+    const Constant *PersonalityFn) const {
+  // Platforms which do not use SjLj EH may return values in these registers
+  // via the personality function.
+  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
+}
+
+unsigned ARMTargetLowering::getExceptionSelectorRegister(
+    const Constant *PersonalityFn) const {
+  // Platforms which do not use SjLj EH may return values in these registers
+  // via the personality function.
+  return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
+}
+
+void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  // Update IsSplitCSR in ARMFunctionInfo.
+  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}
+
+void ARMTargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (ARM::GPRRegClass.contains(*I))
+      RC = &ARM::GPRRegClass;
+    else if (ARM::DPRRegClass.contains(*I))
+      RC = &ARM::DPRRegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 000000000000..5255d82d647a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,720 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include <vector>
+
+namespace llvm {
+  class ARMConstantPoolValue;
+  class ARMSubtarget;
+
+  namespace ARMISD {
+    // ARM Specific DAG Nodes
+    enum NodeType : unsigned {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
+                    // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
+                    // PIC mode.
+      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
+
+      // Add pseudo op to model memcpy for struct byval.
+      COPY_STRUCT_BYVAL,
+
+      CALL,         // Function call.
+      CALL_PRED,    // Function call that's predicable.
+      CALL_NOLINK,  // Function call with branch not branch-and-link.
+      BRCOND,       // Conditional branch.
+      BR_JT,        // Jumptable branch.
+      BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
+      RET_FLAG,     // Return with a flag operand.
+      INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.
+
+      PIC_ADD,      // Add with a PC operand and a PIC label.
+
+      CMP,          // ARM compare instructions.
+      CMN,          // ARM CMN instructions.
+      CMPZ,         // ARM compare that sets only Z flag.
+      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
+      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
+      FMSTAT,       // ARM fmstat instruction.
+
+      CMOV,         // ARM conditional move instructions.
+
+      SSAT,         // Signed saturation
+
+      BCC_i64,
+
+      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+
+      ADDC,         // Add with carry
+      ADDE,         // Add using carry
+      SUBC,         // Sub with carry
+      SUBE,         // Sub using carry
+
+      VMOVRRD,      // double to two gprs.
+      VMOVDRR,      // Two gprs to double.
+
+      EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
+      EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
+
+      TC_RETURN,    // Tail call return pseudo.
+
+      THREAD_POINTER,
+
+      DYN_ALLOC,    // Dynamic allocation on the stack.
+
+      MEMBARRIER_MCR, // Memory barrier (MCR)
+
+      PRELOAD,      // Preload
+
+      WIN__CHKSTK,  // Windows' __chkstk call to do stack probing.
+      WIN__DBZCHK,  // Windows' divide by zero check
+
+      VCEQ,         // Vector compare equal.
+      VCEQZ,        // Vector compare equal to zero.
+      VCGE,         // Vector compare greater than or equal.
+      VCGEZ,        // Vector compare greater than or equal to zero.
+      VCLEZ,        // Vector compare less than or equal to zero.
+      VCGEU,        // Vector compare unsigned greater than or equal.
+      VCGT,         // Vector compare greater than.
+      VCGTZ,        // Vector compare greater than zero.
+      VCLTZ,        // Vector compare less than zero.
+      VCGTU,        // Vector compare unsigned greater than.
+      VTST,         // Vector test bits.
+
+      // Vector shift by immediate:
+      VSHL,         // ...left
+      VSHRs,        // ...right (signed)
+      VSHRu,        // ...right (unsigned)
+
+      // Vector rounding shift by immediate:
+      VRSHRs,       // ...right (signed)
+      VRSHRu,       // ...right (unsigned)
+      VRSHRN,       // ...right narrow
+
+      // Vector saturating shift by immediate:
+      VQSHLs,       // ...left (signed)
+      VQSHLu,       // ...left (unsigned)
+      VQSHLsu,      // ...left (signed to unsigned)
+      VQSHRNs,      // ...right narrow (signed)
+      VQSHRNu,      // ...right narrow (unsigned)
+      VQSHRNsu,     // ...right narrow (signed to unsigned)
+
+      // Vector saturating rounding shift by immediate:
+      VQRSHRNs,     // ...right narrow (signed)
+      VQRSHRNu,     // ...right narrow (unsigned)
+      VQRSHRNsu,    // ...right narrow (signed to unsigned)
+
+      // Vector shift and insert:
+      VSLI,         // ...left
+      VSRI,         // ...right
+
+      // Vector get lane (VMOV scalar to ARM core register)
+      // (These are used for 8- and 16-bit element types only.)
+      VGETLANEu,    // zero-extend vector extract element
+      VGETLANEs,    // sign-extend vector extract element
+
+      // Vector move immediate and move negated immediate:
+      VMOVIMM,
+      VMVNIMM,
+
+      // Vector move f32 immediate:
+      VMOVFPIMM,
+
+      // Vector duplicate:
+      VDUP,
+      VDUPLANE,
+
+      // Vector shuffles:
+      VEXT,         // extract
+      VREV64,       // reverse elements within 64-bit doublewords
+      VREV32,       // reverse elements within 32-bit words
+      VREV16,       // reverse elements within 16-bit halfwords
+      VZIP,         // zip (interleave)
+      VUZP,         // unzip (deinterleave)
+      VTRN,         // transpose
+      VTBL1,        // 1-register shuffle with mask
+      VTBL2,        // 2-register shuffle with mask
+
+      // Vector multiply long:
+      VMULLs,       // ...signed
+      VMULLu,       // ...unsigned
+
+      UMLAL,        // 64bit Unsigned Accumulate Multiply
+      SMLAL,        // 64bit Signed Accumulate Multiply
+      UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
+
+      // Operands of the standard BUILD_VECTOR node are not legalized, which
+      // is fine if BUILD_VECTORs are always lowered to shuffles or other
+      // operations, but for ARM some BUILD_VECTORs are legal as-is and their
+      // operands need to be legalized.  Define an ARM-specific version of
+      // BUILD_VECTOR for this purpose.
+      BUILD_VECTOR,
+
+      // Bit-field insert
+      BFI,
+
+      // Vector OR with immediate
+      VORRIMM,
+      // Vector AND with NOT of immediate
+      VBICIMM,
+
+      // Vector bitwise select
+      VBSL,
+
+      // Pseudo-instruction representing a memory copy using ldm/stm
+      // instructions.
+      MEMCPY,
+
+      // Vector load N-element structure to all lanes:
+      VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD2DUP,
+      VLD3DUP,
+      VLD4DUP,
+
+      // NEON loads with post-increment base updates:
+      VLD1_UPD,
+      VLD2_UPD,
+      VLD3_UPD,
+      VLD4_UPD,
+      VLD2LN_UPD,
+      VLD3LN_UPD,
+      VLD4LN_UPD,
+      VLD1DUP_UPD,
+      VLD2DUP_UPD,
+      VLD3DUP_UPD,
+      VLD4DUP_UPD,
+
+      // NEON stores with post-increment base updates:
+      VST1_UPD,
+      VST2_UPD,
+      VST3_UPD,
+      VST4_UPD,
+      VST2LN_UPD,
+      VST3LN_UPD,
+      VST4LN_UPD
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace ARM {
+    bool isBitFieldInvertedMask(unsigned v);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+  class ARMTargetLowering : public TargetLowering {
+  public:
+    explicit ARMTargetLowering(const TargetMachine &TM,
+                               const ARMSubtarget &STI);
+
+    unsigned getJumpTableEncoding() const override;
+    bool useSoftFloat() const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    bool isSelectSupported(SelectSupportKind Kind) const override {
+      // ARM does not support scalar condition selects on vectors.
+      return (Kind != ScalarCondVectorVal);
+    }
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    void AdjustInstrPostInstrSelection(MachineInstr &MI,
+                                       SDNode *Node) const override;
+
+    SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+    SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
+    SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
+
+    /// allowsMisalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;
+
+    EVT getOptimalMemOpType(uint64_t Size,
+                            unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset,
+                            bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    using TargetLowering::isZExtFree;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    /// getScalingFactorCost - Return the cost of the scaling used in
+    /// addressing mode represented by AM.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, the return value must be negative.
+    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+    bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                   ISD::MemIndexedMode &AM,
+                                   SelectionDAG &DAG) const override;
+
+    /// getPostIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if this node can be
+    /// combined with a load / store to form a post-indexed load / store.
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                    SDValue &Offset, ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth) const override;
+
+
+    bool ExpandInlineAsm(CallInst *CI) const override;
+
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "Q")
+        return InlineAsm::Constraint_Q;
+      else if (ConstraintCode == "o")
+        return InlineAsm::Constraint_o;
+      else if (ConstraintCode.size() == 2) {
+        if (ConstraintCode[0] == 'U') {
+          switch(ConstraintCode[1]) {
+          default:
+            break;
+          case 'm':
+            return InlineAsm::Constraint_Um;
+          case 'n':
+            return InlineAsm::Constraint_Un;
+          case 'q':
+            return InlineAsm::Constraint_Uq;
+          case 's':
+            return InlineAsm::Constraint_Us;
+          case 't':
+            return InlineAsm::Constraint_Ut;
+          case 'v':
+            return InlineAsm::Constraint_Uv;
+          case 'y':
+            return InlineAsm::Constraint_Uy;
+          }
+        }
+      }
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    const ARMSubtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// getRegClassFor - Return the register class that should be used for the
+    /// specified value type.
+    const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+
+    /// Returns true if a cast between SrcAS and DestAS is a noop.
+    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+      // Addrspacecasts are always noops.
+      return true;
+    }
+
+    bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
+                                unsigned &PrefAlign) const override;
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+    bool
+    isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                            const CallInst &I,
+                            unsigned Intrinsic) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+        Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+    Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
+    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                          AtomicOrdering Ord) const override;
+    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                Value *Addr, AtomicOrdering Ord) const override;
+
+    void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
+
+    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                          bool IsStore, bool IsLoad) const override;
+    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                           bool IsStore, bool IsLoad) const override;
+
+    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+    bool lowerInterleavedLoad(LoadInst *LI,
+                              ArrayRef<ShuffleVectorInst *> Shuffles,
+                              ArrayRef<unsigned> Indices,
+                              unsigned Factor) const override;
+    bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                               unsigned Factor) const override;
+
+    bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+    TargetLoweringBase::AtomicExpansionKind
+    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+    TargetLoweringBase::AtomicExpansionKind
+    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+    bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+    bool useLoadStackGuardNode() const override;
+
+    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                   unsigned &Cost) const override;
+
+    bool isCheapToSpeculateCttz() const override;
+    bool isCheapToSpeculateCtlz() const override;
+
+    bool supportSwiftError() const override {
+      return true;
+    }
+
+    bool hasStandaloneRem(EVT VT) const override {
+      return HasStandaloneRem;
+    }
+
+    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
+    CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
+
+  protected:
+    std::pair<const TargetRegisterClass *, uint8_t>
+    findRepresentativeClass(const TargetRegisterInfo *TRI,
+                            MVT VT) const override;
+
+  private:
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    const TargetRegisterInfo *RegInfo;
+
+    const InstrItineraryData *Itins;
+
+    /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+    ///
+    unsigned ARMPCLabelIndex;
+
+    // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
+    // check.
+    bool InsertFencesForAtomic;
+
+    bool HasStandaloneRem = true;
+
+    void InitLibcallCallingConvs();
+
+    void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+    void addDRTypeForNEON(MVT VT);
+    void addQRTypeForNEON(MVT VT);
+    std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
+
+    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+    void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
+                          SDValue &Arg, RegsToPassVector &RegsToPass,
+                          CCValAssign &VA, CCValAssign &NextVA,
+                          SDValue &StackPtr,
+                          SmallVectorImpl<SDValue> &MemOpChains,
+                          ISD::ArgFlagsTy Flags) const;
+    SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                 SDValue &Root, SelectionDAG &DAG,
+                                 const SDLoc &dl) const;
+
+    CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
+                                            bool isVarArg) const;
+    CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
+                                  bool isVarArg) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+    SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG) const;
+    SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                 SelectionDAG &DAG,
+                                 TLSModel::Model model) const;
+    SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                            const ARMSubtarget *ST) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                              const ARMSubtarget *ST) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+    void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
+                           SmallVectorImpl<SDValue> &Results) const;
+    SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
+                                   SDValue &Chain) const;
+    SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    ///
+    /// ARM supports both fused and unfused multiply-add operations; we already
+    /// lower a pair of fmul and fadd to the latter so it's not clear that there
+    /// would be a gain or that the gain would be worthwhile enough to risk
+    /// correctness bugs.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; }
+
+    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+                            SDValue ThisVal) const;
+
+    bool supportSplitCSR(MachineFunction *MF) const override {
+      return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+          MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+    }
+    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+    void insertCopiesSplitCSR(
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+
+    int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
+                       SDValue &Chain, const Value *OrigArg,
+                       unsigned InRegsParamRecordIdx, int ArgOffset,
+                       unsigned ArgSize) const;
+
+    void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                              const SDLoc &dl, SDValue &Chain,
+                              unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
+                              bool ForceMutable = false) const;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    /// HandleByVal - Target-specific cleanup for ByVal support.
+    void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+
+    bool CanLowerReturn(CallingConv::ID CallConv,
+                        MachineFunction &MF, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+    bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+    SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
+                    SDValue ARMcc, SDValue CCR, SDValue Cmp,
+                    SelectionDAG &DAG) const;
+    SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                      SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
+    SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+                      const SDLoc &dl) const;
+    SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
+
+    SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
+
+    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+                                MachineBasicBlock *DispatchBB, int FI) const;
+
+    void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;
+
+    bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitStructByval(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
+                                           MachineBasicBlock *MBB) const;
+    MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
+                                           MachineBasicBlock *MBB) const;
+  };
+
+  enum NEONModImmType {
+    VMOVModImm,
+    VMVNModImm,
+    OtherModImm
+  };
+
+  namespace ARM {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif  // ARMISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 000000000000..488439fc24e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,2531 @@
+//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo        : Format<0>;
+def MulFrm        : Format<1>;
+def BrFrm         : Format<2>;
+def BrMiscFrm     : Format<3>;
+
+def DPFrm         : Format<4>;
+def DPSoRegRegFrm    : Format<5>;
+
+def LdFrm         : Format<6>;
+def StFrm         : Format<7>;
+def LdMiscFrm     : Format<8>;
+def StMiscFrm     : Format<9>;
+def LdStMulFrm    : Format<10>;
+
+def LdStExFrm     : Format<11>;
+
+def ArithMiscFrm  : Format<12>;
+def SatFrm        : Format<13>;
+def ExtFrm        : Format<14>;
+
+def VFPUnaryFrm   : Format<15>;
+def VFPBinaryFrm  : Format<16>;
+def VFPConv1Frm   : Format<17>;
+def VFPConv2Frm   : Format<18>;
+def VFPConv3Frm   : Format<19>;
+def VFPConv4Frm   : Format<20>;
+def VFPConv5Frm   : Format<21>;
+def VFPLdStFrm    : Format<22>;
+def VFPLdStMulFrm : Format<23>;
+def VFPMiscFrm    : Format<24>;
+
+def ThumbFrm      : Format<25>;
+def MiscFrm       : Format<26>;
+
+def NGetLnFrm     : Format<27>;
+def NSetLnFrm     : Format<28>;
+def NDupFrm       : Format<29>;
+def NLdStFrm      : Format<30>;
+def N1RegModImmFrm: Format<31>;
+def N2RegFrm      : Format<32>;
+def NVCVTFrm      : Format<33>;
+def NVDupLnFrm    : Format<34>;
+def N2RegVShLFrm  : Format<35>;
+def N2RegVShRFrm  : Format<36>;
+def N3RegFrm      : Format<37>;
+def N3RegVShFrm   : Format<38>;
+def NVExtFrm      : Format<39>;
+def NVMulSLFrm    : Format<40>;
+def NVTBLFrm      : Format<41>;
+def DPSoRegImmFrm  : Format<42>;
+
+// Misc flags.
+
+// The instruction has an Rn register operand.
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have a Rn operand.
+class UnaryDP    { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags.  These need to match ARMBaseInstrInfo.h.
+//
+
+// FIXME: Once the JIT is MC-ized, these can go away.
+// Addressing mode.
+class AddrMode<bits<5> val> {
+  bits<5> Value = val;
+}
+def AddrModeNone    : AddrMode<0>;
+def AddrMode1       : AddrMode<1>;
+def AddrMode2       : AddrMode<2>;
+def AddrMode3       : AddrMode<3>;
+def AddrMode4       : AddrMode<4>;
+def AddrMode5       : AddrMode<5>;
+def AddrMode6       : AddrMode<6>;
+def AddrModeT1_1    : AddrMode<7>;
+def AddrModeT1_2    : AddrMode<8>;
+def AddrModeT1_4    : AddrMode<9>;
+def AddrModeT1_s    : AddrMode<10>;
+def AddrModeT2_i12  : AddrMode<11>;
+def AddrModeT2_i8   : AddrMode<12>;
+def AddrModeT2_so   : AddrMode<13>;
+def AddrModeT2_pc   : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12    : AddrMode<16>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre  : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+def IndexModeUpd  : IndexMode<3>;
+
+// Instruction execution domain.
+class Domain<bits<3> val> {
+  bits<3> Value = val;
+}
+def GenericDomain : Domain<0>;
+def VFPDomain     : Domain<1>; // Instructions in VFP domain only
+def NeonDomain    : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
+
+//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM imod and iflag operands, used only by the CPS instruction.
+def imod_op : Operand<i32> {
+  let PrintMethod = "printCPSIMod";
+}
+
+def ProcIFlagsOperand : AsmOperandClass {
+  let Name = "ProcIFlags";
+  let ParserMethod = "parseProcIFlagsOperand";
+}
+def iflags_op : Operand<i32> {
+  let PrintMethod = "printCPSIFlag";
+  let ParserMatchClass = ProcIFlagsOperand;
+}
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register).
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
+                                     (ops (i32 14), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+  let ParserMatchClass = CondCodeOperand;
+  let DecoderMethod = "DecodePredicateOperand";
+}
+
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+               ComplexPattern<i32, 2, "SelectCMOVPred"> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// ARM special operands for disassembly only.
+//
+def SetEndAsmOperand : ImmAsmOperand {
+  let Name = "SetEndImm";
+  let ParserMethod = "parseSetEndImm";
+}
+def setend_op : Operand<i32> {
+  let PrintMethod = "printSetendOperand";
+  let ParserMatchClass = SetEndAsmOperand;
+}
+
+def MSRMaskOperand : AsmOperandClass {
+  let Name = "MSRMask";
+  let ParserMethod = "parseMSRMaskOperand";
+}
+def msr_mask : Operand<i32> {
+  let PrintMethod = "printMSRMaskOperand";
+  let DecoderMethod = "DecodeMSRMask";
+  let ParserMatchClass = MSRMaskOperand;
+}
+
+def BankedRegOperand : AsmOperandClass {
+  let Name = "BankedReg";
+  let ParserMethod = "parseBankedRegOperand";
+}
+def banked_reg : Operand<i32> {
+  let PrintMethod = "printBankedRegOperand";
+  let DecoderMethod = "DecodeBankedReg";
+  let ParserMatchClass = BankedRegOperand;
+}
+
+// Shift Right Immediate - A shift right immediate is encoded differently from
+// other shift immediates. The imm6 field is encoded like so:
+//
+//    Offset    Encoding
+//     8        imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0>
+//     16       imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
+//     32       imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
+//     64       64 - <imm> is encoded in imm6<5:0>
+def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8  : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
+  let EncoderMethod = "getShiftRight8Imm";
+  let DecoderMethod = "DecodeShiftRight8Imm";
+  let ParserMatchClass = shr_imm8_asm_operand;
+}
+def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
+  let EncoderMethod = "getShiftRight16Imm";
+  let DecoderMethod = "DecodeShiftRight16Imm";
+  let ParserMatchClass = shr_imm16_asm_operand;
+}
+def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
+  let EncoderMethod = "getShiftRight32Imm";
+  let DecoderMethod = "DecodeShiftRight32Imm";
+  let ParserMatchClass = shr_imm32_asm_operand;
+}
+def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
+  let EncoderMethod = "getShiftRight64Imm";
+  let DecoderMethod = "DecodeShiftRight64Imm";
+  let ParserMatchClass = shr_imm64_asm_operand;
+}
+
+
+// ARM Assembler operand for ldr Rd, =expression which generates an offset
+// to a constant pool entry or a MOV depending on the value of expression
+def const_pool_asm_operand : AsmOperandClass { let Name = "ConstPoolAsmImm"; }
+def const_pool_asm_imm : Operand<i32> {
+  let ParserMatchClass = const_pool_asm_operand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+// Note: When EmitPriority == 1, the alias will be used for printing
+class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class  tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2,HasDPVFP]>;
+class VFP3InstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>, Requires<[HasNEON]>;
+
+
+class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasVFP2]>;
+class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasNEON]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction templates.
+//
+
+
+class InstTemplate<AddrMode am, int sz, IndexMode im,
+                   Format f, Domain d, string cstr, InstrItinClass itin>
+  : Instruction {
+  let Namespace = "ARM";
+
+  AddrMode AM = am;
+  int Size = sz;
+  IndexMode IM = im;
+  bits<2> IndexModeBits = IM.Value;
+  Format F = f;
+  bits<6> Form = F.Value;
+  Domain D = d;
+  bit isUnaryDataProc = 0;
+  bit canXformTo16Bit = 0;
+  // The instruction is a 16-bit flag setting Thumb instruction. Used
+  // by the parser to determine whether to require the 'S' suffix on the
+  // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+  bit thumbArithFlagSetting = 0;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
+  let TSFlags{4-0}   = AM.Value;
+  let TSFlags{6-5}   = IndexModeBits;
+  let TSFlags{12-7} = Form;
+  let TSFlags{13}    = isUnaryDataProc;
+  let TSFlags{14}    = canXformTo16Bit;
+  let TSFlags{17-15} = D.Value;
+  let TSFlags{18}    = thumbArithFlagSetting;
+
+  let Constraints = cstr;
+  let Itinerary = itin;
+}
+
+class Encoding {
+  field bits<32> Inst;
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+}
+
+class InstARM<AddrMode am, int sz, IndexMode im,
+              Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+  let DecoderNamespace = "ARM";
+}
+
+// This Encoding-less class is used by Thumb1 to specify the encoding bits later
+// on by adding flavors to specific instructions.
+class InstThumb<AddrMode am, int sz, IndexMode im,
+                Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin> {
+  let DecoderNamespace = "Thumb";
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "", NoItinerary> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let Pattern = [];
+  let isCodeGenOnly = 0; // So we get asm matcher for it.
+  let AsmString = asm;
+  let isPseudo = 1;
+}
+
+class ARMAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsARM]>;
+class tAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb]>;
+class t2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[IsThumb2]>;
+class VFP2AsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[HasVFP2]>;
+class NEONAsmPseudo<string asm, dag iops, dag oops = (outs)>
+  : AsmPseudoInst<asm, iops, oops>, Requires<[HasNEON]>;
+
+// Pseudo instructions for the code generator.
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
+                 GenericDomain, "", itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let Pattern = pattern;
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// PseudoInst that's ARM-mode only.
+class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// PseudoInst that's Thumb-mode only.
+class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// PseudoInst that's in ARMv8-M baseline (Somewhere between Thumb and Thumb2)
+class t2basePseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb,HasV8MBaseline];
+}
+
+// PseudoInst that's Thumb2-mode only.
+class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+class ARMPseudoExpand<dag oops, dag iops, int sz,
+                      InstrItinClass itin, list<dag> pattern,
+                      dag Result>
+  : ARMPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class tPseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : tPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class t2PseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : t2PseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+// Almost all ARM instructions are predicable.
+class I<dag oops, dag iops, AddrMode am, int sz,
+        IndexMode im, Format f, InstrItinClass itin,
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// A few are not predicable
+class InoP<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr,
+           list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, asm);
+  let Pattern = pattern;
+  let isPredicable = 0;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR. Note it's modeled as an input
+// operand since by default it's a zero register. It will become an implicit def
+// once it's "flipped".
+class sI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string opc, string asm, string cstr,
+         list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p; // Predicate operand
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{31-28} = p;
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases
+class XI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AXIM<dag oops, dag iops, AddrMode am, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, am, 4, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern>;
+
+// Ctrl flow instructions
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+       asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+
+// BR_JT instructions
+class JTI<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
+       asm, "", pattern>;
+
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 1;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-0}   = 0b10011111;
+}
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 0;
+  let Inst{19-16} = addr;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-4}   = 0b1001;
+  let Inst{3-0}   = Rt;
+}
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+// Exclusive load/store instructions
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasAcquireRelease, HasV7Clrex]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasAcquireRelease, HasV7Clrex]> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
+  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  let Inst{27-23} = 0b00010;
+  let Inst{22} = b;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4} = 0b00001001;
+  let Inst{3-0} = Rt2;
+
+  let Unpredictable{11-8} = 0b1111;
+  let DecoderMethod = "DecodeSwap";
+}
+// Acquire/Release load/store instructions
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasAcquireRelease]>;
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasAcquireRelease]> {
+  let Inst{15-12}   = 0b1111;
+}
+
+// addrmode1 instructions
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+
+// loads
+
+// LDR/LDRB/STR/STRB/...
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am,
+             Format f, InstrItinClass itin, string opc, string asm,
+             list<dag> pattern>
+  : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm,
+      "", pattern> {
+  let Inst{27-25} = op;
+  let Inst{24} = 1;  // 24 == P
+  // 23 == U
+  let Inst{22} = isByte;
+  let Inst{21} = 0;  // 21 == W
+  let Inst{20} = isLd;
+}
+// Indexed load/stores
+class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-26} = 0b01;
+  let Inst{24}    = isPre; // P bit
+  let Inst{22}    = isByte; // B bit
+  let Inst{21}    = isPre; // W bit
+  let Inst{20}    = isLd; // L bit
+  let Inst{15-12} = Rt;
+}
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+}
+
+class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = offset{11-0};
+}
+
+
+// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
+// but for now use this class for STRT and STRBT.
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
+
+// addrmode3 instructions
+class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = isPre;        // P bit
+  let Inst{21}    = isPre;        // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+// but for now use this class for LDRSBT, LDRHT, LDSHT.
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 0;            // P bit
+  let Inst{21}    = 1;
+  let Inst{20}    = isLoad;       // L bit
+  let Inst{19-16} = addr;         // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// stores
+class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = 0;            // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+// addrmode4 instructions
+class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> {
+  bits<4>  p;
+  bits<16> regs;
+  bits<4>  Rn;
+  let Inst{31-28} = p;
+  let Inst{27-25} = 0b100;
+  let Inst{22}    = 0; // S bit
+  let Inst{19-16} = Rn;
+  let Inst{15-0}  = regs;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{20}    = 0; // S bit
+  let Inst{27-21} = opcod;
+}
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+       opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+             InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{7-4}   = opc7_4;
+  let Inst{20}    = 1;
+  let Inst{27-21} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+// MSW multiple w/ Ra operand
+class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{4}     = 0;
+  let Inst{7}     = 1;
+  let Inst{20}    = 0;
+  let Inst{27-21} = opcod;
+  let Inst{6-5}   = bit6_5;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+}
+
+// AMulxyI with Ra operand
+class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+// SMLAL*
+class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+// Extend instructions.
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin,
+      opc, asm, "", pattern> {
+  // All AExtI instructions have Rd and Rm register operands.
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{7-4}   = 0b0111;
+  let Inst{9-8}   = 0b00;
+  let Inst{27-20} = opcod;
+
+  let Unpredictable{9-8} = 0b11;
+}
+
+// Misc Arithmetic instructions.
+class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+// Division instructions.
+class ADivA1I<bits<3> opcod, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{11-8}  = Rm;
+  let Inst{7-4}   = 0b0001;
+  let Inst{3-0}   = Rn;
+}
+
+// PKH instructions
+def PKHLSLAsmOperand : ImmAsmOperand {
+  let Name = "PKHLSLImm";
+  let ParserMethod = "parsePKHLSLImm";
+}
+def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{
+  let PrintMethod = "printPKHLSLShiftImm";
+  let ParserMatchClass = PKHLSLAsmOperand;
+}
+def PKHASRAsmOperand : AsmOperandClass {
+  let Name = "PKHASRImm";
+  let ParserMethod = "parsePKHASRImm";
+}
+def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{
+  let PrintMethod = "printPKHASRShiftImm";
+  let ParserMatchClass = PKHASRAsmOperand;
+}
+
+class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<5> sh;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = sh;
+  let Inst{6}     = tb;
+  let Inst{5-4}   = 0b01;
+  let Inst{3-0}   = Rm;
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM];
+}
+class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5T];
+}
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+}
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV6];
+}
+class VFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasVFP2];
+}
+class VFPNoNEONPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasVFP2, DontUseNEONForFP];
+}
+class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasDSP];
+}
+class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
+}
+class Thumb2ExtractPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasT2ExtractPack];
+}
+//===----------------------------------------------------------------------===//
+// Thumb Instruction Format Definitions.
+//
+
+class ThumbI<dag oops, dag iops, AddrMode am, int sz,
+             InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// TI - Thumb instruction.
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+
+// Two-address instructions
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm,
+          list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst",
+           pattern>;
+
+// tBL, tBX 32-bit instructions
+class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3,
+           dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+    : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>,
+      Encoding {
+  let Inst{31-27} = opcod1;
+  let Inst{15-14} = opcod2;
+  let Inst{12}    = opcod3;
+}
+
+// BR_JT instructions
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Thumb1 only
+class Thumb1I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1I<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+
+// Two-address instructions
+class T1It<dag oops, dag iops, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin,
+            asm, cstr, pattern>;
+
+// Thumb1 instruction that can either be predicated or set CPSR.
+class Thumb1sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = !con(oops, (outs s_cc_out:$s));
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  let thumbArithFlagSetting = 1;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "ThumbSBit";
+}
+
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+// Thumb1 instruction that can be predicated.
+class Thumb1pI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+class T1pIs<dag oops, dag iops,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>;
+
+class Encoding16 : Encoding {
+  let Inst{31-16} = 0x0000;
+}
+
+// A6.2 16-bit Thumb instruction encoding
+class T1Encoding<bits<6> opcode> : Encoding16 {
+  let Inst{15-10} = opcode;
+}
+
+// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding.
+class T1General<bits<5> opcode> : Encoding16 {
+  let Inst{15-14} = 0b00;
+  let Inst{13-9} = opcode;
+}
+
+// A6.2.2 Data-processing encoding.
+class T1DataProcessing<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010000;
+  let Inst{9-6} = opcode;
+}
+
+// A6.2.3 Special data instructions and branch and exchange encoding.
+class T1Special<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010001;
+  let Inst{9-6}   = opcode;
+}
+
+// A6.2.4 Load/store single data item encoding.
+class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
+  let Inst{15-12} = opA;
+  let Inst{11-9}  = opB;
+}
+class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
+
+class T1BranchCond<bits<4> opcode> : Encoding16 {
+  let Inst{15-12} = opcode;
+}
+
+// Helper classes to encode Thumb1 loads and stores. For immediates, the
+// following bits are used for "opA" (see A6.2.4):
+//
+//   0b0110 => Immediate, 4 bytes
+//   0b1000 => Immediate, 2 bytes
+//   0b0111 => Immediate, 1 byte
+class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am,
+                     InstrItinClass itin, string opc, string asm,
+                     list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<0b0101, opcode> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{8-6} = addr{5-3};    // Rm
+  let Inst{5-3} = addr{2-0};    // Rn
+  let Inst{2-0} = Rt;
+}
+class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<opA, {opB,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-6} = addr{7-3};   // imm5
+  let Inst{5-3}  = addr{2-0};   // Rn
+  let Inst{2-0}  = Rt;
+}
+
+// A6.2.5 Miscellaneous 16-bit instructions encoding.
+class T1Misc<bits<7> opcode> : Encoding16 {
+  let Inst{15-12} = 0b1011;
+  let Inst{11-5} = opcode;
+}
+
+// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
+class Thumb2I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
+// input operand since by default it's a zero register. It will become an
+// implicit def once it's "flipped".
+//
+// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
+// more consistent.
+class Thumb2sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Special cases
+class Thumb2XI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "Thumb";
+}
+
+class T2I<dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
+class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<13> addr;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = addr{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = addr{7-0};
+}
+class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm, string cstr,
+                  list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  bits<9> imm;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = imm{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Move to/from coprocessor instructions
+class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm,
+            list<dag> pattern>
+  : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>, Requires<[IsThumb2]> {
+  let Inst{31-28} = opc;
+}
+
+// Two-address instructions
+class T2XIt<dag oops, dag iops, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
+
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = addr{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<4> Rn;
+  bits<9> offset;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = offset{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = offset{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasV6T2];
+}
+
+// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
+class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// Almost all VFP instructions are predicable.
+class VFPI<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases
+class VFPXI<dag oops, dag iops, AddrMode am, int sz,
+            IndexMode im, Format f, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern> {
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+}
+
+// ARM VFP addrmode5 loads and stores
+class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Dd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Dd{4};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Dd{3-0};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Sd{0};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Sd{4-1};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  list<Predicate> Predicates = [HasFullFP16];
+
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Sd{0};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Sd{4-1};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1001;     // Half precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+// VFP Load / store multiple pseudo instructions.
+class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
+                     list<dag> pattern>
+  : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain,
+            cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Load / store multiple
+
+// Unknown precision
+class AXXI4<dag oops, dag iops, IndexMode im,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStFrm, NoItinerary, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = 0;
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-1}   = regs{7-1};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-8}  = 0b1011;
+  let Inst{0}     = 1;
+}
+
+// Double precision
+class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{12};
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-1}   = regs{7-1};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{0}     = 0;
+}
+
+// Single Precision
+class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{8};
+  let Inst{15-12} = regs{12-9};
+  let Inst{7-0}   = regs{7-0};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+}
+
+// Double precision, unary
+class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Double precision, unary, not-predicated
+class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Double precision, binary
+class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+           dag iops, InstrItinClass itin, string opc, string asm,
+           list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// FP, binary, not predicated
+class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin,
+          asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1; // double precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Single precision, unary, predicated
+class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision, unary, non-predicated
+class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+             bit opcod5, dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision unary, if no NEON. Same as ASuI except not available if
+// NEON is enabled.
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+            string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm,
+         pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// Single precision, binary
+class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+           InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+// Single precision, binary, not predicated
+class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPBinaryFrm, itin, asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0; // Single precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
+// Single precision binary, if no NEON. Same as ASbI except not available if
+// NEON is enabled.
+class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+            dag iops, InstrItinClass itin, string opc, string asm,
+            list<dag> pattern>
+  : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+// Half precision, unary, predicated
+class AHuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasFullFP16];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = 0b1001;   // Half precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Half precision, unary, non-predicated
+class AHuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+             bit opcod5, dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPUnaryFrm, itin, asm, "", pattern> {
+  list<Predicate> Predicates = [HasFullFP16];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = 0b1001;   // Half precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Half precision, binary
+class AHbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+           InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasFullFP16];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1001;   // Half precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+// Half precision, binary, not predicated
+class AHbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPBinaryFrm, itin, asm, "", pattern> {
+  list<Predicate> Predicates = [HasFullFP16];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1001;   // Half precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
+// VFP conversion instructions
+class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+               dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+               list<dag> pattern>
+  : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = opcod4;
+  let Inst{6}     = 1;
+  let Inst{4}     = 0;
+}
+
+// VFP conversion between floating-point and fixed-point
+class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
+                dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+                list<dag> pattern>
+  : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> {
+  bits<5> fbits;
+  // size (fixed-point number): sx == 0 ? 16 : 32
+  let Inst{7} = op5; // sx
+  let Inst{5} = fbits{0};
+  let Inst{3-0} = fbits{4-1};
+}
+
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+               InstrItinClass itin,
+               string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{4}     = 1;
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
+
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it does not have a "data type" specifier.
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+             InstrItinClass itin, string opc, string asm, string cstr,
+             list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it is not predicated
+class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+
+  let Inst{31-28} = 0b1111;
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm,
+          cstr, pattern> {
+  let Inst{31-24} = 0b11110100;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{7-4}   = op7_4;
+
+  let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+  let DecoderNamespace = "NEONLoadStore";
+
+  bits<5> Vd;
+  bits<6> Rn;
+  bits<4> Rm;
+
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Rn{3-0};
+  let Inst{3-0}   = Rm{3-0};
+}
+
+class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc,
+          dt, asm, cstr, pattern> {
+  bits<3> lane;
+}
+
+class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
+  : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr,
+                  list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
+          pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm,
+           cstr, pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+// NEON "one register and a modified immediate" format.
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+               bit op5, bit op4,
+               dag oops, dag iops, InstrItinClass itin,
+               string opc, string dt, string asm, string cstr,
+               list<dag> pattern>
+  : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{23}    = op23;
+  let Inst{21-19} = op21_19;
+  let Inst{11-8}  = op11_8;
+  let Inst{7}     = op7;
+  let Inst{6}     = op6;
+  let Inst{5}     = op5;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<13> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{24}    = SIMM{7};
+  let Inst{18-16} = SIMM{6-4};
+  let Inst{3-0}   = SIMM{3-0};
+  let DecoderMethod = "DecodeNEONModImmInstruction";
+}
+
+// NEON 2 vector register format.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+          bits<5> op11_7, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// Same as N2V but not predicated.
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+            dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
+            string Dt, list<dag> pattern>
+   : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
+             OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0}   = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = 0b00111;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11} = 0;
+  let Inst{10-8} = op10_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{4} = 0;
+
+  let DecoderNamespace = "NEON";
+}
+
+// Same as N2V except it doesn't have a datatype suffix.
+class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+           bits<5> op11_7, bit op6, bit op4,
+           dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON 2 vector register with immediate.
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}   = op24;
+  let Inst{23}   = op23;
+  let Inst{11-8} = op11_8;
+  let Inst{7}    = op7;
+  let Inst{6}    = op6;
+  let Inst{4}    = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+  bits<6> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{21-16} = SIMM{5-0};
+}
+
+// NEON 3 vector register format.
+
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+          dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
+                string OpcodeStr, string Dt, list<dag> pattern>
+  : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
+            Dt, "$Vd, $Vn, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22} = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7} = Vn{4};
+  let Inst{5} = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = op27_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bit lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = lane;
+}
+
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bits<2> lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{2-0}   = Vm{2-0};
+  let Inst{5}     = lane{1};
+  let Inst{3}     = lane{0};
+}
+
+// Same as N3V except it doesn't have a data type suffix.
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+           bit op4,
+           dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON VMOVs between scalar and core registers.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+               dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string dt, string asm, list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain,
+            "", itin> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{6-5}   = opcod3;
+  let Inst{4}     = 1;
+  // A8.6.303, A8.6.328, A8.6.329
+  let Inst{3-0}   = 0b0000;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+
+  let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+  let DecoderNamespace = "NEONDup";
+
+  bits<5> V;
+  bits<4> R;
+  bits<4> p;
+  bits<4> lane;
+
+  let Inst{31-28} = p{3-0};
+  let Inst{7}     = V{4};
+  let Inst{19-16} = V{3-0};
+  let Inst{15-12} = R{3-0};
+}
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin,
+             opc, dt, asm, pattern>;
+
+// Vector Duplicate Lane (from scalar to all elements)
+class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
+                InstrItinClass itin, string opc, string dt, string asm,
+                list<dag> pattern>
+  : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> {
+  let Inst{24-23} = 0b11;
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = op19_16;
+  let Inst{11-7}  = 0b11000;
+  let Inst{6}     = op6;
+  let Inst{4}     = 0;
+
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{22}     = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+}
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON,UseNEONForFP];
+}
+
+// VFP/NEON Instruction aliases for type suffices.
+// Note: When EmitPriority == 1, the alias will be used for printing
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result, bit EmitPriority = 0> :
+  InstAlias<!strconcat(opc, dt, "\t", asm), Result, EmitPriority>, Requires<[HasVFP2]>;
+
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
+}
+
+// Note: When EmitPriority == 1, the alias will be used for printing
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result, bit EmitPriority = 0> {
+  let Predicates = [HasNEON] in {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result, EmitPriority>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result, EmitPriority>;
+}
+}
+
+// The same alias classes using AsmPseudo instead, for the more complex
+// stuff in NEON that InstAlias can't quite handle.
+// Note that we can't use anonymous defm references here like we can
+// above, as we care about the ultimate instruction enum names generated, unlike
+// for instalias defs.
+class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> :
+  AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>;
+
+// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
+def : TokenAlias<".s8", ".i8">;
+def : TokenAlias<".u8", ".i8">;
+def : TokenAlias<".s16", ".i16">;
+def : TokenAlias<".u16", ".i16">;
+def : TokenAlias<".s32", ".i32">;
+def : TokenAlias<".u32", ".i32">;
+def : TokenAlias<".s64", ".i64">;
+def : TokenAlias<".u64", ".i64">;
+
+def : TokenAlias<".i8", ".8">;
+def : TokenAlias<".i16", ".16">;
+def : TokenAlias<".i32", ".32">;
+def : TokenAlias<".i64", ".64">;
+
+def : TokenAlias<".p8", ".8">;
+def : TokenAlias<".p16", ".16">;
+
+def : TokenAlias<".f32", ".32">;
+def : TokenAlias<".f64", ".64">;
+def : TokenAlias<".f", ".f32">;
+def : TokenAlias<".d", ".f64">;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 000000000000..27b64322dfa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,136 @@
+//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+using namespace llvm;
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI), RI() {}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  if (hasNOP()) {
+    NopInst.setOpcode(ARM::HINT);
+    NopInst.addOperand(MCOperand::createImm(0));
+    NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::createReg(0));
+  } else {
+    NopInst.setOpcode(ARM::MOVr);
+    NopInst.addOperand(MCOperand::createReg(ARM::R0));
+    NopInst.addOperand(MCOperand::createReg(ARM::R0));
+    NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::createReg(0));
+    NopInst.addOperand(MCOperand::createReg(0));
+  }
+}
+
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  switch (Opc) {
+  default:
+    break;
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
+    return ARM::LDRi12;
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+    return ARM::LDRH;
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
+    return ARM::LDRBi12;
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+    return ARM::LDRSH;
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST:
+    return ARM::LDRSB;
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+    return ARM::STRi12;
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+    return ARM::STRH;
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+    return ARM::STRBi12;
+  }
+
+  return 0;
+}
+
+void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
+  const TargetMachine &TM = MF.getTarget();
+
+  if (!Subtarget.useMovt(MF)) {
+    if (TM.isPositionIndependent())
+      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
+    else
+      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
+    return;
+  }
+
+  if (!TM.isPositionIndependent()) {
+    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
+    return;
+  }
+
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+
+  if (!Subtarget.isGVIndirectSymbol(GV)) {
+    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
+    return;
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Reg = MI->getOperand(0).getReg();
+  MachineInstrBuilder MIB;
+
+  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
+            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+  auto Flags = MachineMemOperand::MOLoad |
+               MachineMemOperand::MODereferenceable |
+               MachineMemOperand::MOInvariant;
+  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+  MIB.addMemOperand(MMO);
+  MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
+  MIB.addReg(Reg, RegState::Kill).addImm(0);
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 000000000000..4b1b7097b18d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,47 @@
+//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMInstrInfo : public ARMBaseInstrInfo {
+  ARMRegisterInfo RI;
+public:
+  explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 000000000000..c47393990e97
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,5857 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+def SDT_ARMStructByVal : SDTypeProfile<0, 4,
+                                       [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMCMov    : SDTypeProfile<1, 3,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                    SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond  : SDTypeProfile<0, 2,
+                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT    : SDTypeProfile<0, 2,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBr2JT   : SDTypeProfile<0, 3,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>]>;
+
+def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
+                                  [SDTCisVT<0, i32>,
+                                   SDTCisVT<1, i32>, SDTCisVT<2, i32>,
+                                   SDTCisVT<3, i32>, SDTCisVT<4, i32>,
+                                   SDTCisVT<5, OtherVT>]>;
+
+def SDT_ARMAnd     : SDTypeProfile<1, 2,
+                                   [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                    SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                                 SDTCisInt<2>]>;
+def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>;
+
+def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
+                                           SDTCisInt<1>]>;
+
+def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_ARMMEMCPY  : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                          SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+                                          SDTCisVT<4, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+// Node definitions.
+def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
+def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntUnaryOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
+                              [SDNPHasChain, SDNPSideEffect,
+                               SDNPOptInGlue, SDNPOutGlue]>;
+def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
+                                SDT_ARMStructByVal,
+                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                 SDNPMayStore, SDNPMayLoad]>;
+
+def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_pred    : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+
+def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+                              [SDNPInGlue]>;
+
+def ARMssatnoshift   : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
+
+def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+                              [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+                              [SDNPHasChain]>;
+def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+                              [SDNPHasChain]>;
+
+def ARMBcci64        : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
+                              [SDNPHasChain]>;
+
+def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmn           : SDNode<"ARMISD::CMN", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
+                              [SDNPOutGlue, SDNPCommutative]>;
+
+def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
+
+def ARMaddc          : SDNode<"ARMISD::ADDC",  SDTBinaryArithWithFlags,
+                              [SDNPCommutative]>;
+def ARMsubc          : SDNode<"ARMISD::SUBC",  SDTBinaryArithWithFlags>;
+def ARMadde          : SDNode<"ARMISD::ADDE",  SDTBinaryArithWithFlagsInOut>;
+def ARMsube          : SDNode<"ARMISD::SUBE",  SDTBinaryArithWithFlagsInOut>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
+                               SDT_ARMEH_SJLJ_Setjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
+                               SDT_ARMEH_SJLJ_Longjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH",
+                                      SDT_ARMEH_SJLJ_SetupDispatch,
+                                      [SDNPHasChain, SDNPSideEffect]>;
+
+def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
+                               [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
+
+def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                         SDNPMayStore, SDNPMayLoad]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
+                                 AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T           : Predicate<"Subtarget->hasV5TOps()">,
+                                 AssemblerPredicate<"HasV5TOps", "armv5t">;
+def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
+                                 AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
+                                 AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M           : Predicate<"Subtarget->hasV6MOps()">,
+                                 AssemblerPredicate<"HasV6MOps",
+                                                    "armv6m or armv6t2">;
+def HasV8MBaseline   : Predicate<"Subtarget->hasV8MBaselineOps()">,
+                                 AssemblerPredicate<"HasV8MBaselineOps",
+                                                    "armv8m.base">;
+def HasV8MMainline   : Predicate<"Subtarget->hasV8MMainlineOps()">,
+                                 AssemblerPredicate<"HasV8MMainlineOps",
+                                                    "armv8m.main">;
+def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
+                                 AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV6K           : Predicate<"Subtarget->hasV6KOps()">,
+                                 AssemblerPredicate<"HasV6KOps", "armv6k">;
+def NoV6K            : Predicate<"!Subtarget->hasV6KOps()">;
+def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
+                                 AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+def HasV8_1a         : Predicate<"Subtarget->hasV8_1aOps()">,
+                                 AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+def HasV8_2a         : Predicate<"Subtarget->hasV8_2aOps()">,
+                                 AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
+def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
+                                 AssemblerPredicate<"FeatureVFP2", "VFP2">;
+def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
+                                 AssemblerPredicate<"FeatureVFP3", "VFP3">;
+def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
+                                 AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasDPVFP         : Predicate<"!Subtarget->isFPOnlySP()">,
+                                 AssemblerPredicate<"!FeatureVFPOnlySP",
+                                                    "double precision VFP">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                                 AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
+def HasRAS           : Predicate<"Subtarget->hasRAS()">,
+                                 AssemblerPredicate<"FeatureRAS", "ras">;
+def HasFP16          : Predicate<"Subtarget->hasFP16()">,
+                                 AssemblerPredicate<"FeatureFP16","half-float conversions">;
+def HasFullFP16      : Predicate<"Subtarget->hasFullFP16()">,
+                                 AssemblerPredicate<"FeatureFullFP16","full half-float">;
+def HasDivide        : Predicate<"Subtarget->hasDivide()">,
+                                 AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
+def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
+                                 AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
+                                 AssemblerPredicate<"FeatureT2XtPk",
+                                                     "pack/extract">;
+def HasDSP           : Predicate<"Subtarget->hasDSP()">,
+                                 AssemblerPredicate<"FeatureDSP", "dsp">;
+def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
+                                 AssemblerPredicate<"FeatureDB",
+                                                    "data-barriers">;
+def HasV7Clrex  : Predicate<"Subtarget->hasV7Clrex()">,
+                            AssemblerPredicate<"FeatureV7Clrex",
+                                               "v7 clrex">;
+def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
+                                  AssemblerPredicate<"FeatureAcquireRelease",
+                                                     "acquire/release">;
+def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
+                                 AssemblerPredicate<"FeatureMP",
+                                                    "mp-extensions">;
+def HasVirtualization: Predicate<"false">,
+                                 AssemblerPredicate<"FeatureVirtualization",
+                                                   "virtualization-extensions">;
+def HasTrustZone     : Predicate<"Subtarget->hasTrustZone()">,
+                                 AssemblerPredicate<"FeatureTrustZone",
+                                                    "TrustZone">;
+def Has8MSecExt      : Predicate<"Subtarget->has8MSecExt()">,
+                                 AssemblerPredicate<"Feature8MSecExt",
+                                                    "ARMv8-M Security Extensions">;
+def HasZCZ           : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def UseNEONForFP     : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb          : Predicate<"Subtarget->isThumb()">,
+                                 AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
+                                 AssemblerPredicate<"ModeThumb,FeatureThumb2",
+                                                    "thumb2">;
+def IsMClass         : Predicate<"Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"!FeatureMClass",
+                                                    "!armv*m">;
+def IsARM            : Predicate<"!Subtarget->isThumb()">,
+                                 AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsMachO          : Predicate<"Subtarget->isTargetMachO()">;
+def IsNotMachO       : Predicate<"!Subtarget->isTargetMachO()">;
+def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">;
+def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;
+def IsNotWindows     : Predicate<"!Subtarget->isTargetWindows()">;
+def UseNaClTrap      : Predicate<"Subtarget->useNaClTrap()">,
+                                 AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap  : Predicate<"!Subtarget->useNaClTrap()">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+def UseMovt          : Predicate<"Subtarget->useMovt(*MF)">;
+def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;
+def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed.
+// Do not use them for Darwin platforms.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast && "
+                                 " Subtarget->hasVFP4()) && "
+                                 "!Subtarget->isTargetDarwin()">;
+def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast &&"
+                                 " Subtarget->hasVFP4()) || "
+                                 "Subtarget->isTargetDarwin()">;
+
+def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
+
+def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
+
+def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
+                          "!Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
+                              "Subtarget->useNEONForSinglePrecisionFP()">;
+
+def IsLE             : Predicate<"MF->getDataLayout().isLittleEndian()">;
+def IsBE             : Predicate<"MF->getDataLayout().isBigEndian()">;
+
+def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+//  ARM specific transformation functions and pattern fragments.
+//
+
+// imm_neg_XFORM - Return the negation of an i32 immediate value.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// imm_not_XFORM - Return the complement of a i32 immediate value.
+def imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
+}]>;
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  if (CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17)
+    return true;
+
+  if (N->getOpcode() != ISD::SRA)
+    return false;
+  if (N->getOperand(0).getOpcode() != ISD::SHL)
+    return false;
+
+  auto *ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!ShiftVal || ShiftVal->getZExtValue() != 16)
+    return false;
+
+  ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
+  if (!ShiftVal || ShiftVal->getZExtValue() != 16)
+    return false;
+
+  return true;
+}]>;
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def hi16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if all low 16-bits are 0.
+  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+}], hi16>;
+
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'xor' node with a single use.
+def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+  return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Immediate operands with a shared generic asm render method.
+class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+
+// Operands that are part of a memory addressing mode.
+class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
+
+// Branch target.
+// FIXME: rename brtarget to t2_brtarget
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeT2BROperand";
+}
+
+// Branches targeting ARM-mode must be divisible by 4 if they're a raw
+// immediate.
+def ARMBranchTarget : AsmOperandClass {
+  let Name = "ARMBranchTarget";
+}
+
+// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
+// immediate.
+def ThumbBranchTarget : AsmOperandClass {
+  let Name = "ThumbBranchTarget";
+}
+
+def arm_br_target : Operand<OtherVT> {
+  let ParserMatchClass = ARMBranchTarget;
+  let EncoderMethod = "getARMBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Call target for ARM. Handles conditional/unconditional
+// FIXME: rename bl_target to t2_bltarget?
+def arm_bl_target : Operand<i32> {
+  let ParserMatchClass = ARMBranchTarget;
+  let EncoderMethod = "getARMBLTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Target for BLX *from* ARM mode.
+def arm_blx_target : Operand<i32> {
+  let ParserMatchClass = ThumbBranchTarget;
+  let EncoderMethod = "getARMBLXTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
+}
+
+def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
+
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
+def dpr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = DPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeDPRRegListOperand";
+}
+
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
+def spr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = SPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeSPRRegListOperand";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+  let PrintMethod = "printPCLabel";
+}
+
+// ADR instruction labels.
+def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
+def adrlabel : Operand<i32> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrLabelAsmOperand;
+  let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+def neon_vcvt_imm32 : Operand<i32> {
+  let EncoderMethod = "getNEONVcvtImm32OpValue";
+  let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+  switch (N->getZExtValue()){
+  default: llvm_unreachable(nullptr);
+  case 0:  return CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+  case 8:  return CurDAG->getTargetConstant(1, SDLoc(N), MVT::i32);
+  case 16: return CurDAG->getTargetConstant(2, SDLoc(N), MVT::i32);
+  case 24: return CurDAG->getTargetConstant(3, SDLoc(N), MVT::i32);
+  }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+  let Name = "RotImm";
+  let ParserMethod = "parseRotImm";
+}
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    int32_t v = N->getZExtValue();
+    return v == 8 || v == 16 || v == 24; }],
+    rot_imm_XFORM> {
+  let PrintMethod = "printRotImmOperand";
+  let ParserMatchClass = RotImmAsmOperand;
+}
+
+// shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+  let Name = "ShifterImm";
+  let ParserMethod = "parseShifterImm";
+}
+def shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+}
+
+// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>,  // reg reg imm
+                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
+}
+
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_imm : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// mod_imm: match a 32-bit immediate operand, which can be encoded into
+// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
+// - "Modified Immediate Constants"). Within the MC layer we keep this
+// immediate in its encoded form.
+def ModImmAsmOperand: AsmOperandClass {
+  let Name = "ModImm";
+  let ParserMethod = "parseModImm";
+}
+def mod_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
+  let EncoderMethod = "getModImmOpValue";
+  let PrintMethod = "printModImmOperand";
+  let ParserMatchClass = ModImmAsmOperand;
+}
+
+// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
+// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
+// The actual parsing, encoding, decoding are handled by the destination
+// instructions, which use mod_imm.
+
+def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
+def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+  }], imm_not_XFORM> {
+  let ParserMatchClass = ModImmNotAsmOperand;
+}
+
+def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
+def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return Value && ARM_AM::getSOImmVal(Value) != -1;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = ModImmNegAsmOperand;
+}
+
+/// arm_i32imm - True for +V6T2, or when isSOImmTwoParVal()
+def arm_i32imm : PatLeaf<(imm), [{
+  if (Subtarget->useMovt(*MF))
+    return true;
+  return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// imm0_1 predicate - Immediate in the range [0,1].
+def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
+
+/// imm0_3 predicate - Immediate in the range [0,3].
+def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
+
+/// imm0_7 predicate - Immediate in the range [0,7].
+def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 8;
+}]> {
+  let ParserMatchClass = Imm0_7AsmOperand;
+}
+
+/// imm8 predicate - Immediate is exactly 8.
+def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
+  let ParserMatchClass = Imm8AsmOperand;
+}
+
+/// imm16 predicate - Immediate is exactly 16.
+def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
+  let ParserMatchClass = Imm16AsmOperand;
+}
+
+/// imm32 predicate - Immediate is exactly 32.
+def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
+  let ParserMatchClass = Imm32AsmOperand;
+}
+
+def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
+
+/// imm1_7 predicate - Immediate in the range [1,7].
+def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
+  let ParserMatchClass = Imm1_7AsmOperand;
+}
+
+/// imm1_15 predicate - Immediate in the range [1,15].
+def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
+  let ParserMatchClass = Imm1_15AsmOperand;
+}
+
+/// imm1_31 predicate - Immediate in the range [1,31].
+def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
+  let ParserMatchClass = Imm1_31AsmOperand;
+}
+
+/// imm0_15 predicate - Immediate in the range [0,15].
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
+def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 16;
+}]> {
+  let ParserMatchClass = Imm0_15AsmOperand;
+}
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
+def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_32AsmOperand;
+}
+
+/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
+def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 64;
+}]> {
+  let ParserMatchClass = Imm0_63AsmOperand;
+}
+
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+  let Name = "Imm0_239";
+  let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+  let ParserMatchClass = Imm0_239AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+  let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+/// imm0_65535 - An immediate is in the range [0.65535].
+def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535AsmOperand;
+}
+
+// imm0_65535_neg - An immediate whose negative value is in the range [0.65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
+//
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
+  let EncoderMethod = "getHiLo16ImmOpValue";
+  let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+  let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+  let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+  let Name = "Bitfield";
+  let ParserMethod = "parseBitfield";
+}
+
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+  let DecoderMethod = "DecodeBitfieldMaskOperand";
+  let ParserMatchClass = BitfieldAsmOperand;
+}
+
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+   uint64_t Imm = N->getZExtValue();
+   return Imm > 0 && Imm <= 32;
+ }],
+    imm1_32_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+    imm1_16_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_16AsmOperand;
+}
+
+// Define ARM specific addressing modes.
+// addrmode_imm12 := reg +/- imm12
+//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
+class AddrMode_Imm12 : MemOperand,
+                     ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+  // 12-bit immediate operand. Note that instructions using this encode
+  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
+  // immediate values are as normal.
+
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeAddrModeImm12Operand";
+  let ParserMatchClass = MemImm12OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def addrmode_imm12 : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+}
+
+def addrmode_imm12_pre : AddrMode_Imm12 {
+  let PrintMethod = "printAddrModeImm12Operand<true>";
+}
+
+// ldst_so_reg := reg +/- reg shop imm
+//
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
+def ldst_so_reg : MemOperand,
+                  ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+  let EncoderMethod = "getLdStSORegOpValue";
+  // FIXME: Simplify the printer
+  let PrintMethod = "printAddrMode2Operand";
+  let DecoderMethod = "DecodeSORegMemOperand";
+  let ParserMatchClass = MemRegOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : MemOperand {
+  let PrintMethod = "printPostIdxImm8Operand";
+  let ParserMatchClass = PostIdxImm8AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : MemOperand {
+  let PrintMethod = "printPostIdxImm8s4Operand";
+  let ParserMatchClass = PostIdxImm8s4AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+  let Name = "PostIdxReg";
+  let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : MemOperand {
+  let EncoderMethod = "getPostIdxRegOpValue";
+  let DecoderMethod = "DecodePostIdxReg";
+  let PrintMethod = "printPostIdxRegOperand";
+  let ParserMatchClass = PostIdxRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode2 := reg +/- imm12
+//           := reg +/- reg shop imm
+//
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
+def addrmode2 : MemOperand,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let EncoderMethod = "getAddrMode2OpValue";
+  let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = AddrMode2AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+  let Name = "PostIdxRegShifted";
+  let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  // When using this for assembly, it's always as a post-index offset.
+  let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigal at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  let ParserMatchClass = AM2OffsetImmAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
+class AddrMode3 : MemOperand,
+                  ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let EncoderMethod = "getAddrMode3OpValue";
+  let ParserMatchClass = AddrMode3AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def addrmode3 : AddrMode3
+{
+  let PrintMethod = "printAddrMode3Operand<false>";
+}
+
+def addrmode3_pre : AddrMode3
+{
+  let PrintMethod = "printAddrMode3Operand<true>";
+}
+
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+  let Name = "AM3Offset";
+  let ParserMethod = "parseAM3Offset";
+}
+def am3offset : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset",
+                               [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode3OffsetOpValue";
+  let PrintMethod = "printAddrMode3OffsetOperand";
+  let ParserMatchClass = AM3OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// ldstm_mode := {ia, ib, da, db}
+//
+def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
+  let EncoderMethod = "getLdStmModeOpValue";
+  let PrintMethod = "printLdStmModeOperand";
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
+class AddrMode5 : MemOperand,
+                  ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+  let EncoderMethod = "getAddrMode5OpValue";
+  let DecoderMethod = "DecodeAddrMode5Operand";
+  let ParserMatchClass = AddrMode5AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+def addrmode5 : AddrMode5 {
+   let PrintMethod = "printAddrMode5Operand<false>";
+}
+
+def addrmode5_pre : AddrMode5 {
+   let PrintMethod = "printAddrMode5Operand<true>";
+}
+
+// addrmode5fp16 := reg +/- imm8*2
+//
+def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
+class AddrMode5FP16 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
+  let EncoderMethod = "getAddrMode5FP16OpValue";
+  let DecoderMethod = "DecodeAddrMode5FP16Operand";
+  let ParserMatchClass = AddrMode5FP16AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+def addrmode5fp16 : AddrMode5FP16 {
+   let PrintMethod = "printAddrMode5FP16Operand<false>";
+}
+
+// addrmode6 := reg with optional alignment
+//
+def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
+def addrmode6 : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+def am6offset : MemOperand,
+                ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+                               [], [SDNPWantRoot]> {
+  let PrintMethod = "printAddrMode6OffsetOperand";
+  let MIOperandInfo = (ops GPR);
+  let EncoderMethod = "getAddrMode6OffsetOpValue";
+  let DecoderMethod = "DecodeGPRRegisterClass";
+}
+
+// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
+// (single element from one lane) for size 32.
+def addrmode6oneL32 : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
+// Base class for addrmode6 with specific alignment restrictions.
+class AddrMode6Align : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD/VST instructions and checking the alignment is not specified.
+def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
+  let Name = "AlignedMemoryNone";
+  let DiagnosticType = "AlignedMemoryRequiresNone";
+}
+def addrmode6alignNone : AddrMode6Align {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align16AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory16";
+  let DiagnosticType = "AlignedMemoryRequires16";
+}
+def addrmode6align16 : AddrMode6Align {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6Align16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align32AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory32";
+  let DiagnosticType = "AlignedMemoryRequires32";
+}
+def addrmode6align32 : AddrMode6Align {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6Align32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64";
+  let DiagnosticType = "AlignedMemoryRequires64";
+}
+def addrmode6align64 : AddrMode6Align {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6Align64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128";
+  let DiagnosticType = "AlignedMemoryRequires64or128";
+}
+def addrmode6align64or128 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment
+// encoding for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128or256";
+  let DiagnosticType = "AlignedMemoryRequires64or128or256";
+}
+def addrmode6align64or128or256 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128, 256 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
+}
+
+// Special version of addrmode6 to handle alignment encoding for VLD-dup
+// instructions, specifically VLD4-dup.
+def addrmode6dup : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+  // FIXME: This is close, but not quite right. The alignment specifier is
+  // different.
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+// Base class for addrmode6dup with specific alignment restrictions.
+class AddrMode6DupAlign : MemOperand,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD-dup instruction and checking the alignment is not specified.
+def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemoryNone";
+  let DiagnosticType = "DupAlignedMemoryRequiresNone";
+}
+def addrmode6dupalignNone : AddrMode6DupAlign {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory16";
+  let DiagnosticType = "DupAlignedMemoryRequires16";
+}
+def addrmode6dupalign16 : AddrMode6DupAlign {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory32";
+  let DiagnosticType = "DupAlignedMemoryRequires32";
+}
+def addrmode6dupalign32 : AddrMode6DupAlign {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for VLD
+// instructions and checking the alignment value.
+def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64";
+  let DiagnosticType = "DupAlignedMemoryRequires64";
+}
+def addrmode6dupalign64 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD instructions and checking the alignment value.
+def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64or128";
+  let DiagnosticType = "DupAlignedMemoryRequires64or128";
+}
+def addrmode6dupalign64or128 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand;
+}
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : MemOperand,
+                 ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+  let PrintMethod = "printAddrModePCOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addr_offset_none := reg
+//
+def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
+def addr_offset_none : MemOperand,
+                       ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
+  let PrintMethod = "printAddrMode7Operand";
+  let DecoderMethod = "DecodeAddrMode7Operand";
+  let ParserMatchClass = MemNoOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base);
+}
+
+def nohash_imm : Operand<i32> {
+  let PrintMethod = "printNoHashImmediate";
+}
+
+def CoprocNumAsmOperand : AsmOperandClass {
+  let Name = "CoprocNum";
+  let ParserMethod = "parseCoprocNumOperand";
+}
+def p_imm : Operand<i32> {
+  let PrintMethod = "printPImmediate";
+  let ParserMatchClass = CoprocNumAsmOperand;
+  let DecoderMethod = "DecodeCoprocessor";
+}
+
+def CoprocRegAsmOperand : AsmOperandClass {
+  let Name = "CoprocReg";
+  let ParserMethod = "parseCoprocRegOperand";
+}
+def c_imm : Operand<i32> {
+  let PrintMethod = "printCImmediate";
+  let ParserMatchClass = CoprocRegAsmOperand;
+}
+def CoprocOptionAsmOperand : AsmOperandClass {
+  let Name = "CoprocOption";
+  let ParserMethod = "parseCoprocOptionOperand";
+}
+def coproc_option_imm : Operand<i32> {
+  let PrintMethod = "printCoprocOptionImm";
+  let ParserMatchClass = CoprocOptionAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a
+/// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDPatternOperator opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, mod_imm:$imm))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDNode opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode mod_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, SDNode opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>,
+                         Sched<[WriteALU, ReadALU]>;
+
+  def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>,
+                         Sched<[WriteALU, ReadALU, ReadALU]> {
+    let isCommutable = Commutable;
+  }
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_imm:$shift))]>,
+                          Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_reg:$shift))]>,
+                          Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, SDNode opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]>;
+
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+let isCompare = 1, Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDPatternOperator opnode, bit Commutable = 0,
+                     string rrDecoderMethod = ""> {
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
+               opc, "\t$Rn, $imm",
+               [(opnode GPR:$Rn, mod_imm:$imm)]>,
+           Sched<[WriteCMP, ReadALU]> {
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = imm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+               opc, "\t$Rn, $Rm",
+               [(opnode GPR:$Rn, GPR:$Rm)]>,
+           Sched<[WriteCMP, ReadALU, ReadALU]> {
+    bits<4> Rn;
+    bits<4> Rm;
+    let isCommutable = Commutable;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+    let DecoderMethod = rrDecoderMethod;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsi : AI1<opcod, (outs),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPR:$Rn, so_reg_imm:$shift)]>,
+            Sched<[WriteCMPsi, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsr : AI1<opcod, (outs),
+               (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
+            Sched<[WriteCMPsr, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+
+}
+}
+
+/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{11-10} = rot;
+ }
+
+/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+                                     (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+        Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{9-4}   = 0b000111;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
+                             bit Commutable = 0> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+                (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
+                (ins GPRnopc:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPRnopc:$Rd, CPSR,
+                    (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+/// AI1_rsc_irs - Define instructions and patterns for rsc
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, SDNode opnode> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                  [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
+                  AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                 [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                   [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins ldst_so_reg:$shift),
+                   AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                   [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+
+multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPR:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
+                  AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                 [(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+                   AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                   [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+/// the function.  The first operand is the ID# for this instruction, the second
+/// is the index into the MachineConstantPool that this is, the third is the
+/// size in bytes of this constant pool entry.
+let hasSideEffects = 0, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                    i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of direct 32-bit addresses of the destination basic
+/// blocks (either absolute, or relative to the start of the jump-table in PIC
+/// mode). Used mostly in ARM and Thumb-1 modes.
+def JUMPTABLE_ADDRS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables
+/// that cannot be optimised to use TBB or TBH.
+def JUMPTABLE_INSTS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 8-bit unsigned integers representing offsets from
+/// a TBB instruction.
+def JUMPTABLE_TBB :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 16-bit unsigned integers representing offsets from
+/// a TBH instruction.
+def JUMPTABLE_TBH :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+           [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+           [(ARMcallseq_start timm:$amt)]>;
+}
+
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
+              "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
+           Requires<[IsARM, HasV6]> {
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
+  let DecoderMethod = "DecodeHINTInstruction";
+}
+
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
+
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+             "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{27-20} = 0b01101000;
+  let Inst{7-4} = 0b1011;
+  let Inst{11-8} = 0b1111;
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// The 16-bit operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "bkpt", "\t$val", []>, Requires<[IsARM]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010010;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>;
+
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010000;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
+class CPS<dag iops, string asm_ops>
+  : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+        []>, Requires<[IsARM]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-28} = 0b1111;
+  let Inst{27-20} = 0b00010000;
+  let Inst{19-18} = imod;
+  let Inst{17}    = M; // Enabled if mode is set;
+  let Inst{16-9}  = 0b00000000;
+  let Inst{8-6}   = iflags;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = mode;
+}
+
+let DecoderMethod = "DecodeCPSInstruction" in {
+let M = 1 in
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
+                  "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in
+  def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
+
+// Preload signals the memory system of possible future data/instruction access.
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
+
+  def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm,
+                IIC_Preload, !strconcat(opc, "\t$addr"),
+                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
+                Sched<[WritePreLd]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 0; // 0 for immediate form
+    let Inst{24} = data;
+    let Inst{23} = addr{12};        // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+
+  def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
+               !strconcat(opc, "\t$shift"),
+               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
+               Sched<[WritePreLd]> {
+    bits<17> shift;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 1; // 1 for register form
+    let Inst{24} = data;
+    let Inst{23} = shift{12};    // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = shift{11-0};
+    let Inst{4} = 0;
+  }
+}
+
+defm PLD  : APreLoad<1, 1, "pld">,  Requires<[IsARM]>;
+defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
+defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
+
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+                 "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
+  bits<1> end;
+  let Inst{31-10} = 0b1111000100000001000000;
+  let Inst{9} = end;
+  let Inst{8-0} = 0;
+}
+
+def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
+             [(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> {
+  bits<4> opt;
+  let Inst{27-4} = 0b001100100000111100001111;
+  let Inst{3-0} = opt;
+}
+
+// A8.8.247  UDF - Undefined (Encoding A1)
+def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
+                "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-28} = 0b1110; // AL
+  let Inst{27-25} = 0b011;
+  let Inst{24-20} = 0b11111;
+  let Inst{19-8} = imm16{15-4};
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = imm16{3-0};
+}
+
+/*
+ * A5.4 Permanently UNDEFINED instructions.
+ *
+ * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
+ * Other UDF encodings generate SIGILL.
+ *
+ * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
+ * Encoding A1:
+ *  1110 0111 1111 iiii iiii iiii 1111 iiii
+ * Encoding T1:
+ *  1101 1110 iiii iiii
+ * It uses the following encoding:
+ *  1110 0111 1111 1110 1101 1110 1111 0000
+ *  - In ARM: UDF #60896;
+ *  - In Thumb: UDF #254 followed by a branch-to-self.
+ */
+let isBarrier = 1, isTerminator = 1 in
+def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,UseNaClTrap]> {
+  let Inst = 0xe7fedef0;
+}
+let isBarrier = 1, isTerminator = 1 in
+def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,DontUseNaClTrap]> {
+  let Inst = 0xe7ffdefe;
+}
+
+// Address computation and loads and stores in PIC mode.
+let isNotDuplicable = 1 in {
+def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+                            4, IIC_iALUr,
+                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
+                            Sched<[WriteALU, ReadALU]>;
+
+let AddedComplexity = 10 in {
+def PICLDR  : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_r,
+                            [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
+}
+let AddedComplexity = 10 in {
+def PICSTR  : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
+                                                   addrmodepc:$addr)]>;
+
+def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+let hasSideEffects = 0, isReMaterializable = 1 in
+// The 'adr' mnemonic encodes differently if the label is before or after
+// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
+// know until then which form of the instruction will be used.
+def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
+                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>,
+                 Sched<[WriteALU, ReadALU]> {
+  bits<4> Rd;
+  bits<14> label;
+  let Inst{27-25} = 0b001;
+  let Inst{24} = 0;
+  let Inst{23-22} = label{13-12};
+  let Inst{21} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = label{11-0};
+}
+
+let hasSideEffects = 1 in {
+def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
+                    4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+
+def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
+                      (ins i32imm:$label, pred:$p),
+                      4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  // ARMV4T and above
+  def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "bx", "\tlr", [(ARMretflag)]>,
+               Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    let Inst{27-0}  = 0b0001001011111111111100011110;
+  }
+
+  // ARMV4 only
+  def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "mov", "\tpc, lr", [(ARMretflag)]>,
+               Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
+    let Inst{27-0} = 0b0001101000001111000000001110;
+  }
+
+  // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+  // the user-space one).
+  def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+                                 4, IIC_Br,
+                                 [(ARMintretflag imm:$offset)]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  // ARMV4T and above
+  def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+                  [(brind GPR:$dst)]>,
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    bits<4> dst;
+    let Inst{31-4} = 0b1110000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+
+  def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
+                  "bx", "\t$dst", [/* pattern left blank */]>,
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
+    bits<4> dst;
+    let Inst{27-4} = 0b000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+}
+
+// SP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead.
+let isCall = 1,
+  // FIXME:  Do we really need a non-predicated version? If so, it should
+  // at least be a pseudo instruction expanding to the predicated version
+  // at MC lowering time.
+  Defs = [LR], Uses = [SP] in {
+  def BL  : ABXI<0b1011, (outs), (ins arm_bl_target:$func),
+                IIC_Br, "bl\t$func",
+                [(ARMcall tglobaladdr:$func)]>,
+            Requires<[IsARM]>, Sched<[WriteBrL]> {
+    let Inst{31-28} = 0b1110;
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func),
+                   IIC_Br, "bl", "\t$func",
+                   [(ARMcall_pred tglobaladdr:$func)]>,
+                Requires<[IsARM]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  // ARMv5T and above
+  def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
+                IIC_Br, "blx\t$func",
+                [(ARMcall GPR:$func)]>,
+            Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+    bits<4> func;
+    let Inst{31-4} = 0b1110000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
+                    IIC_Br, "blx", "\t$func",
+                    [(ARMcall_pred GPR:$func)]>,
+                 Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+    bits<4> func;
+    let Inst{27-4} = 0b000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  // ARMv4T
+  // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
+  def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
+
+  // ARMv4
+  def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+  // mov lr, pc; b if callee is marked noreturn to avoid confusing the
+  // return stack predictor.
+  def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
+                               8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
+                      Requires<[IsARM]>, Sched<[WriteBr]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target),
+               IIC_Br, "b", "\t$target",
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
+               Sched<[WriteBr]>  {
+    bits<24> target;
+    let Inst{23-0} = target;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  let isBarrier = 1 in {
+    // B is "predicable" since it's just a Bcc with an 'always' condition.
+    let isPredicable = 1 in
+    // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
+    // should be sufficient.
+    // FIXME: Is B really a Barrier? That doesn't seem right.
+    def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br,
+                [(br bb:$target)], (Bcc arm_br_target:$target,
+                (ops 14, zero_reg))>,
+                Sched<[WriteBr]>;
+
+    let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+    def BR_JTr : ARMPseudoInst<(outs),
+                      (ins GPR:$target, i32imm:$jt),
+                      0, IIC_Br,
+                      [(ARMbrjt GPR:$target, tjumptable:$jt)]>,
+                      Sched<[WriteBr]>;
+    // FIXME: This shouldn't use the generic "addrmode2," but rather be split
+    // into i12 and rs suffixed versions.
+    def BR_JTm : ARMPseudoInst<(outs),
+                     (ins addrmode2:$target, i32imm:$jt),
+                     0, IIC_Br,
+                     [(ARMbrjt (i32 (load addrmode2:$target)),
+                               tjumptable:$jt)]>, Sched<[WriteBrTbl]>;
+    def BR_JTadd : ARMPseudoInst<(outs),
+                   (ins GPR:$target, GPR:$idx, i32imm:$jt),
+                   0, IIC_Br,
+                   [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt)]>,
+                   Sched<[WriteBrTbl]>;
+    } // isNotDuplicable = 1, isIndirectBranch = 1
+  } // isBarrier = 1
+
+}
+
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary,
+               "blx\t$target", []>,
+           Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
+  let Inst{31-25} = 0b1111101;
+  bits<25> target;
+  let Inst{23-0} = target{24-1};
+  let Inst{24} = target{0};
+  let isCall = 1;
+}
+
+// Branch and Exchange Jazelle
+def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
+              [/* pattern left blank */]>, Sched<[WriteBr]> {
+  bits<4> func;
+  let Inst{23-20} = 0b0010;
+  let Inst{19-8} = 0xfff;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = func;
+  let isBranch = 1;
+}
+
+// Tail calls.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
+
+  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
+
+  def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
+                                 4, IIC_Br, [],
+                                 (Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
+                                 Requires<[IsARM]>, Sched<[WriteBr]>;
+
+  def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
+                                 4, IIC_Br, [],
+                                 (BX GPR:$dst)>, Sched<[WriteBr]>,
+                                 Requires<[IsARM]>;
+}
+
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+              []>, Requires<[IsARM, HasTrustZone]> {
+  bits<4> opt;
+  let Inst{23-4} = 0b01100000000000000111;
+  let Inst{3-0} = opt;
+}
+def : MnemonicAlias<"smi", "smc">;
+
+// Supervisor Call (Software Interrupt)
+let isCall = 1, Uses = [SP] in {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+          Sched<[WriteBr]> {
+  bits<24> svc;
+  let Inst{23-0} = svc;
+}
+}
+
+// Store Return State
+class SRSI<bit wb, string asm>
+  : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<5> mode;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 1;
+  let Inst{21} = wb;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1101;  // SP
+  let Inst{15-5} = 0b00000101000;
+  let Inst{4-0} = mode;
+}
+
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+
+def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
+def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
+def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
+def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
+def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
+
+// Return From Exception
+class RFEI<bit wb, string asm>
+  : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<4> Rn;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;
+  let Inst{21} = wb;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xa00;
+}
+
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+  let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+  let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+  let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+  let Inst{24-23} = 0b11;
+}
+
+// Hypervisor Call is a system instruction
+let isCall = 1 in {
+def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
+                "hvc", "\t$imm", []>,
+          Requires<[IsARM, HasVirtualization]> {
+  bits<16> imm;
+
+  // Even though HVC isn't predicable, it's encoding includes a condition field.
+  // The instruction is undefined if the condition field is 0xf otherwise it is
+  // unpredictable if it isn't condition AL (0xe).
+  let Inst{31-28} = 0b1110;
+  let Unpredictable{31-28} = 0b1111;
+  let Inst{27-24} = 0b0001;
+  let Inst{23-20} = 0b0100;
+  let Inst{19-8} = imm{15-4};
+  let Inst{7-4} = 0b0111;
+  let Inst{3-0} = imm{3-0};
+}
+}
+
+// Return from exception in Hypervisor mode.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
+    Requires<[IsARM, HasVirtualization]> {
+    let Inst{23-0} = 0b011000000000000001101110;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / Store Instructions.
+//
+
+// Load
+
+
+defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+                        zextloadi8>;
+defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+                        truncstorei8>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
+    isReMaterializable = 1, isCodeGenOnly = 1 in
+def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                 AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
+                 []> {
+  bits<4> Rt;
+  bits<17> addr;
+  let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rt;
+  let Inst{11-0}  = addr{11-0};   // imm12
+}
+
+// Loads with zero extension
+def LDRH  : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                  IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
+                  [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+  // Load doubleword
+  def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
+                   LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
+             Requires<[IsARM, HasV5TE]>;
+}
+
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
+// Indexed loads
+multiclass AI2_ldridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let DecoderMethod = "DecodeLDRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let Inst{4} = 0;
+    let DecoderMethod = "DecodeLDRPreReg";
+  }
+
+  def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                       IndexModePost, LdFrm, iir,
+                       opc, "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+     let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                      IndexModePost, LdFrm, iii,
+                      opc, "\t$Rt, $addr, $offset",
+                      "$addr.base = $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
+// IIC_iLoad_siu depending on whether it the offset register is shifted.
+defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
+}
+
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+  def _PRE  : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addrmode3_pre:$addr), IndexModePre,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<14> addr;
+    let Inst{23}    = addr{8};      // U bit
+    let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr{12-9};   // Rn
+    let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+    let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addr_offset_none:$addr, am3offset:$offset),
+                        IndexModePost, LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                        []> {
+    bits<10> offset;
+    bits<4> addr;
+    let Inst{23}    = offset{8};      // U bit
+    let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr;
+    let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+    let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addrmode3_pre:$addr), IndexModePre,
+                          LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addr_offset_none:$addr, am3offset:$offset),
+                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, hasSideEffects = 0
+
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_ru,
+                    "ldrt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM
+  : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+               (ins addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, LdFrm, IIC_iLoad_ru,
+               "ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                     IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                     "ldrbt", "\t$Rt, $addr, $offset",
+                     "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_IMM
+  : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+               (ins addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+               "ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+multiclass AI3ldrT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_imm8:$offset),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+  }
+  def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Unpredictable{11-8} = 0b1111;
+    let Inst{3-0} = Rm{3-0};
+    let DecoderMethod = "DecodeLDR";
+  }
+}
+
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
+}
+
+def LDRT_POST
+  : ARMAsmPseudo<"ldrt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+                 (outs GPR:$Rt)>;
+
+def LDRBT_POST
+  : ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
+                 (outs GPR:$Rt)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def LDRConstPool
+  : ARMAsmPseudo<"ldr${q} $Rt, $immediate",
+                 (ins const_pool_asm_imm:$immediate, pred:$q),
+                 (outs GPR:$Rt)>;
+
+// Store
+
+// Stores with truncate
+def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
+               IIC_iStore_bh_r, "strh", "\t$Rt, $addr",
+               [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
+
+// Store doubleword
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+  def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+                    StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
+             Requires<[IsARM, HasV5TE]> {
+    let Inst{21} = 0;
+  }
+}
+
+// Indexed stores
+multiclass AI2_stridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre,
+                            StFrm, iii,
+                            opc, "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{11-0}  = addr{11-0};   // imm12
+    let DecoderMethod = "DecodeSTRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, ldst_so_reg:$addr),
+                      IndexModePre, StFrm, iir,
+                      opc, "\t$Rt, $addr!",
+                      "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23}    = addr{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};
+    let Inst{4}     = 0;           // Inst{4} = 0
+    let DecoderMethod = "DecodeSTRPreReg";
+  }
+  def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                IndexModePost, StFrm, iir,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+     let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                IndexModePost, StFrm, iii,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+}
+
+let mayStore = 1, hasSideEffects = 0 in {
+// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
+// IIC_iStore_siu depending on whether it the offset register is shifted.
+defm STR  : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+}
+
+
+
+def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                           (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
+                           StMiscFrm, IIC_iStore_bh_ru,
+                           "strh", "\t$Rt, $addr!",
+                           "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                       (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+                       IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                       "strh", "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb,@earlyclobber $Rn_wb",
+                   [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+                                                      addr_offset_none:$addr,
+                                                      am3offset:$offset))]> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr),
+                          IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+                               am3offset:$offset),
+                          IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// STRT, STRBT, and STRHT
+
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST_IMM
+  : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, StFrm, IIC_iStore_bh_ru,
+               "strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST
+  : ARMAsmPseudo<"strbt${q} $Rt, $addr",
+                 (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+let mayStore = 1, hasSideEffects = 0 in {
+def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRT_POST_IMM
+  : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+               (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+               IndexModePost, StFrm, IIC_iStore_ru,
+               "strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+}
+
+def STRT_POST
+  : ARMAsmPseudo<"strt${q} $Rt, $addr",
+                 (ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
+
+multiclass AI3strT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+  }
+  def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                      (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Inst{3-0} = Rm{3-0};
+  }
+}
+
+
+defm STRHT : AI3strT<0b1011, "strht">;
+
+def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                   NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // IA is the default, so no need for an explicit suffix on the
+  // mnemonic here. Without it is the canonical spelling.
+  def IA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def IB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+}
+
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
+                         IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
+                         IIC_iStore_mu>,
+           ComplexDeprecationPredicate<"ARMStore">;
+
+} // hasSideEffects
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                 reglist:$regs, variable_ops),
+                     4, IIC_iLoad_mBr, [],
+                     (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+      RegConstraint<"$Rn = $wb">;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m,
+                               IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
+                               IIC_iStore_mu>;
+
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let hasSideEffects = 0 in
+def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
+                "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+// A version for the smaller set of tail call registers.
+let hasSideEffects = 0 in
+def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
+                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+                DPSoRegRegFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src",
+                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP,
+                Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-8} = src{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = src{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+                DPSoRegImmFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
+                UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-5} = src{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi,
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm:$imm)]>, UnaryDP,
+                Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-0} = imm;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
+                 DPFrm, IIC_iMOVi,
+                 "movw", "\t$Rd, $imm",
+                 [(set GPR:$Rd, imm0_65535:$imm)]>,
+                 Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def : InstAlias<"mov${p} $Rd, $imm",
+                (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>,
+        Requires<[IsARM, HasV6T2]>;
+
+def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
+
+let Constraints = "$src = $Rd" in {
+def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
+                  (ins GPR:$src, imm0_65535_expr:$imm),
+                  DPFrm, IIC_iMOVi,
+                  "movt", "\t$Rd, $imm",
+                  [(set GPRnopc:$Rd,
+                        (or (and GPR:$src, 0xffff),
+                            lo16AllZero:$imm))]>, UnaryDP,
+                  Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
+
+} // Constraints
+
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
+let Uses = [CPSR] in
+def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                    [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
+                    Requires<[IsARM]>, Sched<[WriteALU]>;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
+def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def SXTB  : AI_ext_rrot<0b01101010,
+                         "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def SXTH  : AI_ext_rrot<0b01101011,
+                         "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+def SXTAB : AI_exta_rrot<0b01101010,
+               "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def SXTAH : AI_exta_rrot<0b01101011,
+               "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)),
+               (SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot),
+                                          i16)),
+               (SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+
+def SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
+
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def UXTB   : AI_ext_rrot<0b01101110,
+                          "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def UXTH   : AI_ext_rrot<0b01101111,
+                          "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def UXTB16 : AI_ext_rrot<0b01101100,
+                          "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+//               (UXTB16r_rot GPR:$Src, 3)>;
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16 GPR:$Src, 1)>;
+
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+
+def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)),
+               (UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
+               (UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+}
+
+// This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+
+
+def SBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111101;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+def UBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111111;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+let isAdd = 1 in
+defm ADD  : AsI1_bin_irs<0b0100, "add",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>;
+defm SUB  : AsI1_bin_irs<0b0010, "sub",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>;
+
+// ADD and SUB with 's' bit set.
+//
+// Currently, ADDS/SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real ADD/SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+let isAdd = 1 in
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
+
+let isAdd = 1 in
+defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
+
+defm RSB  : AsI1_rbin_irs<0b0011, "rsb",
+                          IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                          sub>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
+
+defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+def : ARMPat<(add     GPR:$src, mod_imm_neg:$imm),
+             (SUBri   GPR:$src, mod_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm),
+             (SUBSri  GPR:$src, mod_imm_neg:$imm)>;
+
+def : ARMPat<(add     GPR:$src, imm0_65535_neg:$imm),
+             (SUBrr   GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
+             (SUBSrr  GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR),
+             (SBCri   GPR:$src, mod_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
+             (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
+
+// ARM Arithmetic Instruction
+// GPR:$dst = GPR:$a op GPR:$b
+class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
+          list<dag> pattern = [],
+          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+          string asm = "\t$Rd, $Rn, $Rm">
+  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
+    Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rn;
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = op27_20;
+  let Inst{11-4} = op11_4;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// Saturating add/subtract
+
+let DecoderMethod = "DecodeQADDInstruction" in
+def QADD    : AAI<0b00010000, 0b00000101, "qadd",
+                  [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+
+def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
+                  [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+
+def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
+def QADD8   : AAI<0b01100010, 0b11111001, "qadd8">;
+def QASX    : AAI<0b01100010, 0b11110011, "qasx">;
+def QSAX    : AAI<0b01100010, 0b11110101, "qsax">;
+def QSUB16  : AAI<0b01100010, 0b11110111, "qsub16">;
+def QSUB8   : AAI<0b01100010, 0b11111111, "qsub8">;
+def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
+def UQADD8  : AAI<0b01100110, 0b11111001, "uqadd8">;
+def UQASX   : AAI<0b01100110, 0b11110011, "uqasx">;
+def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
+def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
+def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
+
+// Signed/Unsigned add/subtract
+
+def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
+def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
+def SADD8  : AAI<0b01100001, 0b11111001, "sadd8">;
+def SSAX   : AAI<0b01100001, 0b11110101, "ssax">;
+def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
+def SSUB8  : AAI<0b01100001, 0b11111111, "ssub8">;
+def UASX   : AAI<0b01100101, 0b11110011, "uasx">;
+def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
+def UADD8  : AAI<0b01100101, 0b11111001, "uadd8">;
+def USAX   : AAI<0b01100101, 0b11110101, "usax">;
+def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
+def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
+
+// Signed/Unsigned halving add/subtract
+
+def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
+def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
+def SHADD8  : AAI<0b01100011, 0b11111001, "shadd8">;
+def SHSAX   : AAI<0b01100011, 0b11110101, "shsax">;
+def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
+def SHSUB8  : AAI<0b01100011, 0b11111111, "shsub8">;
+def UHASX   : AAI<0b01100111, 0b11110011, "uhasx">;
+def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
+def UHADD8  : AAI<0b01100111, 0b11111001, "uhadd8">;
+def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
+def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
+def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+
+def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                MulFrm /* for convenience */, NoItinerary, "usad8",
+                "\t$Rd, $Rn, $Rm", []>,
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-20} = 0b01111000;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                MulFrm /* for convenience */, NoItinerary, "usada8",
+                "\t$Rd, $Rn, $Rm, $Ra", []>,
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+  let Inst{27-20} = 0b01111000;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+
+// Signed/Unsigned saturate
+
+def SSAT : AI<(outs GPRnopc:$Rd),
+              (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+              Requires<[IsARM,HasV6]>{
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110101;
+  let Inst{5-4} = 0b01;
+  let Inst{20-16} = sat_imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{3-0} = Rn;
+}
+
+def SSAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+                Requires<[IsARM,HasV6]>{
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101010;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT : AI<(outs GPRnopc:$Rd),
+              (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+              Requires<[IsARM,HasV6]> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110111;
+  let Inst{5-4} = 0b01;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{20-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+                Requires<[IsARM,HasV6]>{
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101110;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
+               (SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
+               (USAT imm0_31:$pos, GPRnopc:$a, 0)>;
+def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+             (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm AND   : AsI1_bin_irs<0b0000, "and",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
+defm ORR   : AsI1_bin_irs<0b1100, "orr",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>;
+defm EOR   : AsI1_bin_irs<0b0001, "eor",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>;
+defm BIC   : AsI1_bin_irs<0b1110, "bic",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
+def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "bfc", "\t$Rd, $imm", "$src = $Rd",
+               [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-0}   = 0b0011111;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // msb
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding A1)
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+          AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+          "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+          [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+                           bf_inv_mask_imm:$imm))]>,
+          Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // width
+  let Inst{3-0}   = Rn;
+}
+
+def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
+                  "mvn", "\t$Rd, $Rm",
+                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{15-12} = Rd;
+  let Inst{3-0} = Rm;
+}
+def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+                  DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+}
+def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+                  DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
+                  IIC_iMVNi, "mvn", "\t$Rd, $imm",
+                  [(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
+}
+
+def : ARMPat<(and   GPR:$src, mod_imm_not:$imm),
+             (BICri GPR:$src, mod_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// FIXME: The v5 pseudos are only necessary for the additional Constraint
+//        property. Remove them when it's possible to add those properties
+//        on an individual MachineInstr, not just an instruction description.
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
+                    (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                    IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+                  [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
+                  Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b0000;
+  let Unpredictable{15-12} = 0b1111;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
+                                                    pred:$p, cc_out:$s),
+                           4, IIC_iMUL32,
+               [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
+               (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+               Requires<[IsARM, NoV6, UseMulOps]>;
+}
+
+def MLA  : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
+                     (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
+                     IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+        [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
+                     Requires<[IsARM, HasV6, UseMulOps]> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
+                           (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+                            pred:$p, cc_out:$s), 4, IIC_iMAC32,
+         [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
+  (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+
+def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                   IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
+                   Requires<[IsARM, HasV6T2, UseMulOps]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<4> Ra;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// Extra precision multiplies with low / high results
+let hasSideEffects = 0 in {
+let isCommutable = 1 in {
+def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+
+def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+}
+
+// Multiply + accumulate
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                               IIC_iMAC64,
+                    "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+let Constraints =
+    "@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in {
+def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+
+} // hasSideEffects
+
+// Most significant word multiply
+def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+multiclass AI_smul<string opc> {
+  def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              []>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              []>,
+            Requires<[IsARM, HasV5TE]>;
+}
+
+
+multiclass AI_smla<string opc> {
+  let DecoderMethod = "DecodeSMLAInstruction" in {
+  def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd, (add GPR:$Ra,
+                               (mul (sext_inreg GPRnopc:$Rn, i16),
+                                       (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
+                                          (sra GPRnopc:$Rm, (i32 16)))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
+                                          (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set GPRnopc:$Rd,
+                   (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
+                                         (sra GPRnopc:$Rm, (i32 16)))))]>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              []>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              []>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+  }
+}
+
+defm SMUL : AI_smul<"smul">;
+defm SMLA : AI_smla<"smla">;
+
+// Halfword multiply accumulate long: SMLAL<x><y>.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+// Helper class for AI_smld.
+class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
+                    InstrItinClass itin, string opc, string asm>
+  : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22}    = long;
+  let Inst{21-20} = 0b00;
+  let Inst{11-8}  = Rm;
+  let Inst{7}     = 0;
+  let Inst{6}     = sub;
+  let Inst{5}     = swap;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = Rn;
+}
+class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = Rd;
+}
+class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Ra;
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+}
+class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+multiclass AI_smld<bit sub, string opc> {
+
+  def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+
+  def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+
+}
+
+defm SMLA : AI_smld<0, "smla">;
+defm SMLS : AI_smld<1, "smls">;
+
+multiclass AI_sdml<bit sub, string opc> {
+
+  def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+  def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+}
+
+defm SMUA : AI_sdml<0, "smua">;
+defm SMUS : AI_sdml<1, "smus">;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions (ARMv7-A with virtualization extension)
+//
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "sdiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "udiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+def CLZ  : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "clz", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
+           Sched<[WriteALU]>;
+
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (bitreverse GPR:$Rm))]>,
+           Requires<[IsARM, HasV6T2]>,
+           Sched<[WriteALU]>;
+
+def REV  : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rev", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+let AddedComplexity = 5 in
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
+              (REV16 (LDRH addrmode3:$addr))>;
+def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
+               (STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
+
+let AddedComplexity = 5 in
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+               (REVSH GPR:$Rm)>;
+
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
+               IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+                                      (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
+               IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+                                      (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+                                           0xFFFF)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (srl GPRnopc:$src2, imm16:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (sra GPRnopc:$src2, imm16_31:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
+
+//===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+               !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+               [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+               Requires<[IsARM, HasV8, HasCRC]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-28} = 0b1110;
+  let Inst{27-23} = 0b00010;
+  let Inst{22-21} = sz;
+  let Inst{20}    = 0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = 0b00;
+  let Inst{9}     = C;
+  let Inst{8}     = 0;
+  let Inst{7-4}   = 0b0100;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1101;
+}
+
+def CRC32B  : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H  : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W  : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1a Privilege Access Never extension
+//
+// SETPAN #imm1
+
+def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan",
+                "\t$imm", []>, Requires<[IsARM, HasV8, HasV8_1a]> {
+  bits<1> imm;
+
+  let Inst{31-28} = 0b1111;
+  let Inst{27-20} = 0b00010001;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-10} = 0b000000;
+  let Inst{9} = imm;
+  let Inst{8} = 0b0;
+  let Inst{7-4} = 0b0000;
+  let Inst{3-0} = 0b0000;
+
+  let Unpredictable{19-16} = 0b1111;
+  let Unpredictable{15-10} = 0b111111;
+  let Unpredictable{8} = 0b1;
+  let Unpredictable{3-0} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm CMP  : AI1_cmp_irs<0b1010, "cmp",
+                        IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>;
+
+// ARMcmpZ can re-use the above instruction definitions.
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
+             (CMPri   GPR:$src, mod_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
+             (CMPrr   GPR:$src, GPR:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+             (CMPrsi   GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+             (CMPrsr   GPR:$src, so_reg_reg:$rhs)>;
+
+// CMN register-integer
+let isCompare = 1, Defs = [CPSR] in {
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, IIC_iCMPi,
+                "cmn", "\t$Rn, $imm",
+                [(ARMcmn GPR:$Rn, mod_imm:$imm)]>,
+                Sched<[WriteCMP, ReadALU]> {
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-0} = imm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+// CMN register-register/shift
+def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
+                 "cmn", "\t$Rn, $Rm",
+                 [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                   GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let isCommutable = 1;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsi : AI1<0b1011, (outs),
+                  (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPR:$Rn, so_reg_imm:$shift)]>,
+                    Sched<[WriteCMPsi, ReadALU]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsr : AI1<0b1011, (outs),
+                  (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPRnopc:$Rn, so_reg_reg:$shift)]>,
+                    Sched<[WriteCMPsr, ReadALU]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+}
+
+def : ARMPat<(ARMcmp  GPR:$src, mod_imm_neg:$imm),
+             (CMNri   GPR:$src, mod_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, mod_imm_neg:$imm),
+             (CMNri   GPR:$src, mod_imm_neg:$imm)>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST  : AI1_cmp_irs<0b1000, "tst",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1,
+                      "DecodeTSTInstruction">;
+defm TEQ  : AI1_cmp_irs<0b1001, "teq",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
+
+// Pseudo i64 compares for some floating point compares.
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
+    Defs = [CPSR] in {
+def BCCi64 : PseudoInst<(outs),
+    (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
+     IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
+    Sched<[WriteBr]>;
+
+def BCCZi64 : PseudoInst<(outs),
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
+    Sched<[WriteBr]>;
+} // usesCustomInserter
+
+
+// Conditional moves
+let hasSideEffects = 0 in {
+
+let isCommutable = 1, isSelect = 1 in
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, GPR:$Rm, cmovpred:$p),
+                           4, IIC_iCMOVr,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+                                                   cmovpred:$p))]>,
+             RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+                            (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+                            4, IIC_iCMOVsr,
+                            [(set GPR:$Rd,
+                                  (ARMcmov GPR:$false, so_reg_imm:$shift,
+                                           cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+                            (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
+                           4, IIC_iCMOVsr,
+  [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+
+let isMoveImm = 1 in
+def MOVCCi16
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                    4, IIC_iMOVi,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+      Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
+                           4, IIC_iCMOVi,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm:$imm,
+                                                   cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+// Two instruction predicate mov immediate.
+let isMoveImm = 1 in
+def MOVCCi32imm
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, i32imm:$src, cmovpred:$p),
+                    8, IIC_iCMOVix2,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+
+let isMoveImm = 1 in
+def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, mod_imm:$imm, cmovpred:$p),
+                           4, IIC_iCMOVi,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm_not:$imm,
+                                                   cmovpred:$p))]>,
+                RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+} // hasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let ParserMethod = "parseMemBarrierOptOperand";
+}
+def memb_opt : Operand<i32> {
+  let PrintMethod = "printMemBOption";
+  let ParserMatchClass = MemBarrierOptOperand;
+  let DecoderMethod = "DecodeMemBarrierOption";
+}
+
+def InstSyncBarrierOptOperand : AsmOperandClass {
+  let Name = "InstSyncBarrierOpt";
+  let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+  let PrintMethod = "printInstSyncBOption";
+  let ParserMatchClass = InstSyncBarrierOptOperand;
+  let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
+// Memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff05;
+  let Inst{3-0} = opt;
+}
+
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff04;
+  let Inst{3-0} = opt;
+}
+
+// ISB has only full system option
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff06;
+  let Inst{3-0} = opt;
+}
+}
+
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+  def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+}
+
+let usesCustomInserter = 1 in {
+    def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+      (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+      NoItinerary,
+      [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
+
+let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
+    // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs...
+    // Copies N registers worth of memory from address %src to address %dst
+    // and returns the incremented addresses.  N scratch register will
+    // be attached for the copy to use.
+    def MEMCPY : PseudoInst<
+      (outs GPR:$newdst, GPR:$newsrc),
+      (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops),
+      NoItinerary,
+      [(set GPR:$newdst, GPR:$newsrc,
+            (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
+}
+
+def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def strex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def strex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def strex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_stlex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+let mayLoad = 1 in {
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexh", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
+def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrex", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
+let hasExtraDefRegAllocReq = 1 in
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>;
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexh", "\t$Rt, $addr",
+                    [(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>;
+def LDAEX  : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaex", "\t$Rt, $addr",
+                    [(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>;
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_1 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_2 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_4 GPR:$Rt,
+                                            addr_offset_none:$addr))]>;
+let hasExtraSrcRegAllocReq = 1 in
+def STREXD : AIstrex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_1 GPR:$Rt, addr_offset_none:$addr))]>;
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_2 GPR:$Rt, addr_offset_none:$addr))]>;
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd,
+                          (stlex_4 GPR:$Rt, addr_offset_none:$addr))]>;
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
+}
+
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
+                [(int_arm_clrex)]>,
+            Requires<[IsARM, HasV6K]>  {
+  let Inst{31-0} = 0b11110101011111111111000000011111;
+}
+
+def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
+def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STLEXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STLEXH GPR:$Rt, addr_offset_none:$addr)>;
+
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return isAcquireOrStronger(Ordering);
+}]>;
+
+def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return isReleaseOrStronger(Ordering);
+}]>;
+
+def atomic_store_release_8  : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+
+let AddedComplexity = 8 in {
+  def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr),  (LDAB addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA  addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (STLB GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in {
+def SWP : AIswp<0, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
+                Requires<[PreV8]>;
+def SWPB: AIswp<1, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
+                Requires<[PreV8]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor Instructions.
+//
+
+def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+            c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+            NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+            [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                          imm:$CRm, imm:$opc2)]>,
+            Requires<[PreV8]> {
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+               c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+               NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+               [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                              imm:$CRm, imm:$opc2)]>,
+               Requires<[PreV8]> {
+  let Inst{31-28} = 0b1111;
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+class ACI<dag oops, dag iops, string opc, string asm,
+            list<dag> pattern, IndexMode im = IndexModeNone>
+  : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+      opc, asm, "", pattern> {
+  let Inst{27-25} = 0b110;
+}
+class ACInoP<dag oops, dag iops, string opc, string asm,
+          list<dag> pattern, IndexMode im = IndexModeNone>
+  : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+         opc, asm, "", pattern> {
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b110;
+}
+multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
+  def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr", pattern> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                 asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                              postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACI<(outs),
+                    (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                         coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option", []> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
+  def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                       asm, "\t$cop, $CRd, $addr", pattern> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                    asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                                 postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACInoP<(outs),
+                       (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                            coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option", []> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm LDC   : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDCL  : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+
+defm STC   : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STCL  : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm STC2  : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register.
+//
+
+class MovRCopro<string opc, bit direction, dag oops, dag iops,
+                list<dag> pattern>
+  : ABI<0b1110, oops, iops, NoItinerary, opc,
+        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
+                    (outs),
+                    (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, imm0_7:$opc2),
+                    [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]>,
+                    ComplexDeprecationPredicate<"MCR">;
+def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
+                    (outs GPRwithAPSR:$Rt),
+                    (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         imm0_7:$opc2), []>;
+def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+             (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRCopro2<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
+  : ABXI<0b1110, oops, iops, NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
+  let Inst{31-24} = 0b11111110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
+                      (outs),
+                      (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                           c_imm:$CRm, imm0_7:$opc2),
+                      [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                     imm:$CRm, imm:$opc2)]>,
+                      Requires<[PreV8]>;
+def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
+                      (outs GPRwithAPSR:$Rt),
+                      (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           imm0_7:$opc2), []>,
+                      Requires<[PreV8]>;
+def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                              imm:$CRm, imm:$opc2),
+                (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
+                 pattern = []>
+  : ABI<0b1100, oops, iops, NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
+        pattern> {
+
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
+                      GPRnopc:$Rt2, c_imm:$CRm),
+                      [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                     GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
+                      (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+                      (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
+
+class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern = []>
+  : ABXI<0b1100, oops, iops, NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+    Requires<[PreV8]> {
+  let Inst{31-28} = 0b1111;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+
+  let DecoderMethod = "DecoderForMRRC2AndMCRR2";
+}
+
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
+                        GPRnopc:$Rt2, c_imm:$CRm),
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                        GPRnopc:$Rt2, imm:$CRm)]>;
+
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
+                       (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+                       (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register
+//
+
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+              "mrs", "\t$Rd, apsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b00001111;
+  let Unpredictable{19-17} = 0b111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>,
+         Requires<[IsARM]>;
+
+// The MRSsys instruction is the MRS instruction from the ARM ARM,
+// section B9.3.9, with the R bit set to 1.
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+                 "mrs", "\t$Rd, spsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b01001111;
+  let Unpredictable{19-16} = 0b1111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+// However, the MRS (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5.
+def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
+                    NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = Rd;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-0} = 0b00000000;
+}
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions of MSR (immediate) or
+// MSR (register), the encodings are the same and the assembly parser has no way
+// to distinguish between them. The mask operand contains the special register
+// (R Bit) in bit 4 and bits 3-0 contains the mask with the fields to be
+// accessed in the special register.
+let Defs = [CPSR] in
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
+              "msr", "\t$mask, $Rn", []> {
+  bits<5> mask;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rn;
+}
+
+let Defs = [CPSR] in
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  mod_imm:$imm), NoItinerary,
+               "msr", "\t$mask, $imm", []> {
+  bits<5> mask;
+  bits<12> imm;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-0} = imm;
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5.
+def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
+                    NoItinerary, "msr", "\t$banked, $Rn", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-4} = 0b0000;
+  let Inst{3-0} = Rn;
+}
+
+// Dynamic stack allocation yields a _chkstk for Windows targets.  These calls
+// are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
+                      [SDNPHasChain, SDNPSideEffect]>;
+let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
+  def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
+
+def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
+                         [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+let usesCustomInserter = 1, Defs = [CPSR] in
+  def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
+                               [(win__dbzchk tGPR:$divisor)]>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+// TPsoft is valid for ARM mode only, in case of Thumb mode a tTPsoft pattern
+// is defined in "ARMInstrThumb.td".
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
+  def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
+               [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                           Requires<[IsARM, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                                   NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                                Requires<[IsARM, NoVFP]>;
+}
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
+    Defs = [ R7, LR, SP ] in {
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+                             NoItinerary,
+                         [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                                Requires<[IsARM]>;
+}
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary,
+            [(ARMeh_sjlj_setup_dispatch)]>;
+
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
+def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ARMv4 indirect branch using (MOVr PC, dst)
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+  def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
+                    4, IIC_Br, [(brind GPR:$dst)],
+                    (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
+                  Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
+
+// Large immediate handling.
+
+// 32-bit immediate using two piece mod_imms or movw + movt.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                           [(set GPR:$dst, (arm_i32imm:$src))]>,
+                           Requires<[IsARM]>;
+
+def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
+                               [(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>,
+                    Requires<[IsARM, DontUseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2addpc,
+                        [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                 IIC_iLoadiALU,
+                                 [(set GPR:$dst,
+                                       (ARMWrapperPIC tglobaladdr:$addr))]>,
+                      Requires<[IsARM, DontUseMovt]>;
+
+let AddedComplexity = 10 in
+def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              NoItinerary,
+                              [(set GPR:$dst,
+                                    (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                          Requires<[IsARM, DontUseMovt]>;
+
+let AddedComplexity = 10 in
+def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2ld,
+                    [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                    Requires<[IsARM, UseMovt]>;
+} // isReMaterializable
+
+// The many different faces of TLS access.
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst),
+             (MOVi32imm tglobaltlsaddr :$dst)>,
+      Requires<[IsARM, UseMovt]>;
+
+def : Pat<(ARMWrapper tglobaltlsaddr:$src),
+          (LDRLIT_ga_abs tglobaltlsaddr:$src)>,
+      Requires<[IsARM, DontUseMovt]>;
+
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
+          (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>;
+
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
+          (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
+      Requires<[IsARM, DontUseMovt]>;
+let AddedComplexity = 10 in
+def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)),
+          (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>,
+      Requires<[IsARM, UseMovt]>;
+
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper  tconstpool  :$dst), (LEApcrel tconstpool  :$dst)>;
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
+            Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>,
+            Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst),
+             (LEApcrelJT tjumptable:$dst)>;
+
+// TODO: add,sub,and, 3-instr forms?
+
+// Tail calls. These patterns also apply to Thumb mode.
+def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+def : ARMPat<(ARMcall_nolink texternalsym:$func),
+             (BMOVPCB_CALL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(zextloadi1 ldst_so_reg:$addr),    (LDRBrs ldst_so_reg:$addr)>;
+
+// extload -> zextload
+def : ARMPat<(extloadi1 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi1 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+def : ARMPat<(extloadi8 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi8 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+
+def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
+
+def : ARMPat<(extloadi8  addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
+
+// smul* and smla*
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
+                (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, sext_16_node:$b)),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+
+// Pre-v7 uses MCR for synchronization barriers.
+def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
+         Requires<[IsARM, HasV6]>;
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
+               (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
+               (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8),  (SXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
+               (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
+               (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+             (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+             (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+             (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+             (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+             (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+             (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+             (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+             (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+             (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+             (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Thumb2 Support
+//
+
+include "ARMInstrThumb2.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Memory barriers
+def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
+
+// System instructions
+def : MnemonicAlias<"swi", "svc">;
+
+// Load / Store Multiple
+def : MnemonicAlias<"ldmfd", "ldm">;
+def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
+def : MnemonicAlias<"stmfd", "stmdb">;
+def : MnemonicAlias<"stmia", "stm">;
+def : MnemonicAlias<"stmea", "stm">;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>,
+        Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>,
+        Requires<[IsARM, HasV6]>;
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+                (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+                (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+                (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+                (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+                (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+                (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsib">;
+def : MnemonicAlias<"srsea", "srsia">;
+def : MnemonicAlias<"srsfd", "srsdb">;
+def : MnemonicAlias<"srsed", "srsda">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+                   (MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+                   (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+// Same for AND <--> BIC
+def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                   (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+                   (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                   (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+                   (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+
+// Likewise, "add Rd, mod_imm_neg" -> sub
+def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via mod_imm_neg
+def : ARMInstAlias<"cmp${p} $Rd, $imm",
+                   (CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
+def : ARMInstAlias<"cmn${p} $Rd, $imm",
+                   (CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
+
+// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
+// LSR, ROR, and RRX instructions.
+// FIXME: We need C++ parser hooks to map the alias to the MOV
+//        encoding. It seems we should be able to do that sort of thing
+//        in tblgen, but it could get ugly.
+let TwoOperandAliasConstraint = "$Rm = $Rd" in {
+def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+}
+def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
+                        (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
+let TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+}
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
+                   (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
+         Requires<[IsARM, NoV6]>;
+
+// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
+// the instruction definitions need difference constraints pre-v6.
+// Use these aliases for the assembly parsing on pre-v6.
+def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
+            (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
+            (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
+             pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
+         Requires<[IsARM, NoV6]>;
+
+// 'it' blocks in ARM mode just validate the predicates. The IT itself
+// is discarded.
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+         ComplexDeprecationPredicate<"IT">;
+
+let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
+def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
+                       NoItinerary,
+                       [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
+
+//===----------------------------------
+// Atomic cmpxchg for -O0
+//===----------------------------------
+
+// The fast register allocator used during -O0 inserts spills to cover any VRegs
+// live across basic block boundaries. When this happens between an LDXR and an
+// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to
+// fail.
+
+// Unfortunately, this means we have to have an alternative (expanded
+// post-regalloc) path for -O0 compilations. Fortunately this path can be
+// significantly more naive than the standard expansion: we conservatively
+// assume seq_cst, strong cmpxchg and omit clrex on failure.
+
+let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+    mayLoad = 1, mayStore = 1 in {
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+                            (ins GPR:$addr, GPR:$desired, GPR:$new),
+                            NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+                             (ins GPR:$addr, GPR:$desired, GPR:$new),
+                             NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+                             (ins GPR:$addr, GPR:$desired, GPR:$new),
+                             NoItinerary, []>, Sched<[]>;
+
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+                             (ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
+                             NoItinerary, []>, Sched<[]>;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 000000000000..b5fa8e999e2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,8191 @@
+//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def nModImm : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+}
+
+def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
+def nImmSplatI8 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI8AsmOperand;
+}
+def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
+def nImmSplatI16 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI16AsmOperand;
+}
+def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
+def nImmSplatI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI32AsmOperand;
+}
+def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
+def nImmSplatNotI16 : Operand<i32> {
+  let ParserMatchClass = nImmSplatNotI16AsmOperand;
+}
+def nImmSplatNotI32AsmOperand : AsmOperandClass { let Name = "NEONi32splatNot"; }
+def nImmSplatNotI32 : Operand<i32> {
+  let ParserMatchClass = nImmSplatNotI32AsmOperand;
+}
+def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
+def nImmVMOVI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperand;
+}
+
+def nImmVMOVI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16vmovByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMOVI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32vmovByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMVNI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16invByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+def nImmVMVNI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32invByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+
+def nImmVMOVI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+}
+def nImmVMOVI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+}
+def nImmVMVNI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+}
+def nImmVMVNI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+}
+
+def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
+def nImmVMOVI32Neg : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32NegAsmOperand;
+}
+def nImmVMOVF32 : Operand<i32> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
+def nImmSplatI64 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI64AsmOperand;
+}
+
+def VectorIndex8Operand  : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndex8Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndex16Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndex32Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+
+// Register list of one D register.
+def VecListOneDAsmOperand : AsmOperandClass {
+  let Name = "VecListOneD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> {
+  let ParserMatchClass = VecListOneDAsmOperand;
+}
+// Register list of two sequential D registers.
+def VecListDPairAsmOperand : AsmOperandClass {
+  let Name = "VecListDPair";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> {
+  let ParserMatchClass = VecListDPairAsmOperand;
+}
+// Register list of three sequential D registers.
+def VecListThreeDAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> {
+  let ParserMatchClass = VecListThreeDAsmOperand;
+}
+// Register list of four sequential D registers.
+def VecListFourDAsmOperand : AsmOperandClass {
+  let Name = "VecListFourD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> {
+  let ParserMatchClass = VecListFourDAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpaced";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> {
+  let ParserMatchClass = VecListDPairSpacedAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListThreeQAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQ : RegisterOperand<DPR, "printVectorListThreeSpaced"> {
+  let ParserMatchClass = VecListThreeQAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListFourQAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQ : RegisterOperand<DPR, "printVectorListFourSpaced"> {
+  let ParserMatchClass = VecListFourQAsmOperand;
+}
+
+// Register list of one D register, with "all lanes" subscripting.
+def VecListOneDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> {
+  let ParserMatchClass = VecListOneDAllLanesAsmOperand;
+}
+// Register list of two D registers, with "all lanes" subscripting.
+def VecListDPairAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairAllLanes : RegisterOperand<DPair,
+                                           "printVectorListTwoAllLanes"> {
+  let ParserMatchClass = VecListDPairAllLanesAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpacedAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+                                         "printVectorListTwoSpacedAllLanes"> {
+  let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
+}
+// Register list of three D registers, with "all lanes" subscripting.
+def VecListThreeDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeDAllLanes : RegisterOperand<DPR,
+                                            "printVectorListThreeAllLanes"> {
+  let ParserMatchClass = VecListThreeDAllLanesAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three sequential Q regs).
+def VecListThreeQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListThreeSpacedAllLanes"> {
+  let ParserMatchClass = VecListThreeQAllLanesAsmOperand;
+}
+// Register list of four D registers, with "all lanes" subscripting.
+def VecListFourDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourDAllLanes : RegisterOperand<DPR, "printVectorListFourAllLanes"> {
+  let ParserMatchClass = VecListFourDAllLanesAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four sequential Q regs).
+def VecListFourQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListFourSpacedAllLanes"> {
+  let ParserMatchClass = VecListFourQAllLanesAsmOperand;
+}
+
+
+// Register list of one D register, with byte lane subscripting.
+def VecListOneDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListOneDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListOneDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of two D registers with byte lane subscripting.
+def VecListTwoDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListTwoDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of two Q registers with half-word lane subscripting.
+def VecListTwoQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+
+// Register list of three D registers with byte lane subscripting.
+def VecListThreeDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListThreeDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of three Q registers with half-word lane subscripting.
+def VecListThreeQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of four D registers with byte lane subscripting.
+def VecListFourDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListFourDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of four Q registers with half-word lane subscripting.
+def VecListFourQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTARMVCMP    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def SDTARMVCMPZ   : SDTypeProfile<1, 1, []>;
+
+def NEONvceq      : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvceqz     : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
+def NEONvcge      : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgez     : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
+def NEONvclez     : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
+def NEONvcgeu     : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt      : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtz     : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
+def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
+def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates.  The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types.  The "SHINS" version is for shift and insert operations.
+def SDTARMVSH     : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHX    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHINS  : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+def NEONvshl      : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs     : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru     : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+def NEONvshrn     : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+
+def NEONvrshrs    : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru    : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn    : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+def NEONvqshls    : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu    : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu   : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns   : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu   : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu  : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+def NEONvqrshrns  : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu  : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+def NEONvsli      : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
+def NEONvsri      : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def NEONvmovImm   : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def NEONvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def NEONvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def NEONvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
+
+def NEONvbsl      : SDNode<"ARMISD::VBSL",
+                           SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                                SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<0, 2>,
+                                                SDTCisSameAs<0, 3>]>>;
+
+def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane  : SDNode<"ARMISD::VDUPLANE",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                                SDTCisVT<2, i32>]>>;
+
+def SDTARMVEXT    : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext      : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF   : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64    : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32    : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16    : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2  : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>,
+                                         SDTCisSameAs<0, 3>]>;
+def NEONzip       : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp       : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
+
+def SDTARMVMULL   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisSameAs<1, 2>]>;
+def NEONvmulls    : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def NEONvmullu    : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
+def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 32 && EltVal == 0);
+}]>;
+
+def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 8 && EltVal == 0xff);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+// Use VLDM to load a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
+def VLDMQIA
+  : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn),
+                    IIC_fpLoad_m, "",
+                   [(set DPair:$dst, (v2f64 (word_alignedload GPR:$Rn)))]>;
+
+// Use VSTM to store a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
+def VSTMQIA
+  : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn),
+                    IIC_fpStore_m, "",
+                   [(word_alignedstore (v2f64 DPair:$src), GPR:$Rn)]>;
+
+// Classes for VLD* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+class VLDQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+
+class VLDQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,
+                "$src = $dst">;
+class VLDQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb, $src = $dst">;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+
+//   VLD1     : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x2,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+
+def  VLD1d8   : VLD1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VLD1d16  : VLD1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VLD1d32  : VLD1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VLD1d64  : VLD1D<{1,1,0,?}, "64", addrmode6align64>;
+
+def  VLD1q8   : VLD1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VLD1q16  : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VLD1q32  : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VLD1q64  : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+defm VLD1d8wb  : VLD1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>;
+defm VLD1q8wb  : VLD1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VLD1d8T      : VLD1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VLD1d16T     : VLD1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VLD1d32T     : VLD1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VLD1d64T     : VLD1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VLD1d8Twb  : VLD1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
+
+// ...with 4 registers
+class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+defm VLD1d8Qwb   : VLD1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VLD1d16Qwb  : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VLD1d32Qwb  : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VLD1d64Qwb  : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
+
+//   VLD2     : Vector Load (multiple 2-element structures)
+class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+           InstrItinClass itin, Operand AddrMode>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd),
+          (ins AddrMode:$Rn), itin,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST2Instruction";
+}
+
+def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d16  : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d32  : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+
+def  VLD2q8   : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q16  : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q32  : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+
+def  VLD2q8Pseudo  : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+
+// ...with address register writeback:
+multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                  RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), itin,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), itin,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+
+defm VLD2d8wb  : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+
+defm VLD2q8wb  : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+
+def VLD2q8PseudoWB_fixed     : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_register  : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+
+// ...with double-spaced registers
+def  VLD2b8    : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b16   : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b32   : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+defm VLD2b8wb  : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+
+//   VLD3     : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD3,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
+def  VLD3d16  : VLD3D<0b0100, {0,1,0,?}, "16">;
+def  VLD3d32  : VLD3D<0b0100, {1,0,0,?}, "32">;
+
+def  VLD3d8Pseudo  : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+
+// ...with address register writeback:
+class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
+def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
+def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+
+// ...with double-spaced registers:
+def VLD3q8      : VLD3D<0b0101, {0,0,0,?}, "8">;
+def VLD3q16     : VLD3D<0b0101, {0,1,0,?}, "16">;
+def VLD3q32     : VLD3D<0b0101, {1,0,0,?}, "32">;
+def VLD3q8_UPD  : VLD3DWB<0b0101, {0,0,0,?}, "8">;
+def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
+def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VLD3q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD3q8oddPseudo   : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q16oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q32oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+
+def VLD3q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+//   VLD4     : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD4,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
+def  VLD4d16  : VLD4D<0b0000, {0,1,?,?}, "16">;
+def  VLD4d32  : VLD4D<0b0000, {1,0,?,?}, "32">;
+
+def  VLD4d8Pseudo  : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+
+// ...with address register writeback:
+class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
+def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
+def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+
+// ...with double-spaced registers:
+def VLD4q8      : VLD4D<0b0001, {0,0,?,?}, "8">;
+def VLD4q16     : VLD4D<0b0001, {0,1,?,?}, "16">;
+def VLD4q32     : VLD4D<0b0001, {1,0,?,?}, "32">;
+def VLD4q8_UPD  : VLD4DWB<0b0001, {0,0,?,?}, "8">;
+def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
+def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VLD4q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD4q8oddPseudo   : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q16oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q32oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+
+def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+// Classes for VLD*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst),
+                (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst),
+                (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst),
+                (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+
+//   VLD1LN   : Vector Load (single element to one lane)
+class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6oneL32:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+  let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
+                                               (i32 (LoadOp addrmode6:$addr)),
+                                               imm:$lane))];
+}
+
+def VLD1LNd8  : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{5-4} = Rn{5-4};
+}
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> {
+  let Inst{7} = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VLD1LNq8Pseudo  : VLD1QLNPseudo<v16i8, extloadi8>;
+def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
+def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+
+def : Pat<(vector_insert (v2f32 DPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4f32 QPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$src = $Vd, $Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVLD1LN";
+}
+
+def VLD1LNd8_UPD  : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{4};
+  let Inst{4} = Rn{4};
+}
+
+def VLD1LNq8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+
+//   VLD2LN   : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+          IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo  : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+
+// ...with double-spaced registers:
+def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+
+// ...with address register writeback:
+class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt,
+          "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+
+def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+
+//   VLD3LN   : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
+          nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+
+// ...with double-spaced registers:
+def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+
+// ...with address register writeback:
+class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VLD3lnu, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
+          []> {
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+
+def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+
+//   VLD4LN   : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+          nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN";
+}
+
+def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+
+// ...with double-spaced registers:
+def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+
+// ...with address register writeback:
+class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VLD4lnu, "vld4", Dt,
+"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm",
+"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
+          []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN"  ;
+}
+
+def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+
+def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+//   VLD1DUP  : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+              Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd),
+          (ins AddrMode:$Rn),
+          IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListOneDAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8,
+                         addrmode6dupalignNone>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16,
+                         addrmode6dupalign16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load,
+                         addrmode6dupalign32>;
+
+def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPd32 addrmode6:$addr)>;
+
+class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+               Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD1dup,
+          "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListDPairAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+
+def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8,
+                          addrmode6dupalignNone>;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16,
+                          addrmode6dupalign16>;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
+                          addrmode6dupalign32>;
+
+def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPq32 addrmode6:$addr)>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+// ...with address register writeback:
+multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+}
+multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+  }
+}
+
+defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>;
+
+defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>;
+
+//   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd),
+          (ins AddrMode:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes,
+                         addrmode6dupalign16>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes,
+                         addrmode6dupalign32>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes,
+                         addrmode6dupalign64>;
+
+// HACK this one, VLD2DUPd8x2 must be changed at the same time with VLD2b8 or
+// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]".
+// ...with double-spaced registers
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes,
+                           addrmode6dupalign16>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign32>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign64>;
+
+// ...with address register writeback:
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
+                     Operand AddrMode> {
+  def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
+                     (outs VdTy:$Vd, GPR:$wb),
+                     (ins AddrMode:$Rn), IIC_VLD2dupu,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+  }
+  def _register : NLdSt<1, 0b10, 0b1101, op7_4,
+                        (outs VdTy:$Vd, GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+  }
+}
+
+defm VLD2DUPd8wb    : VLD2DUPWB<{0,0,0,0}, "8",  VecListDPairAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16wb   : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32wb   : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes,
+                                addrmode6dupalign64>;
+
+defm VLD2DUPd8x2wb  : VLD2DUPWB<{0,0,1,0}, "8",  VecListDPairSpacedAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign64>;
+
+//   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+class VLD3DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6dup:$Rn), IIC_VLD3dup,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+def VLD3DUPd8Pseudo  : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPq8  : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>;
+
+def VLD3DUPq8_UPD  : VLD3DUPWB<{0,0,1,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
+
+def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
+//   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
+class VLD4DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6dup:$Rn), IIC_VLD4dup,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">;
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo  : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPq8  : VLD4DUP<{0,0,1,?}, "8">;
+def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
+def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">;
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">;
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPq8_UPD  : VLD4DUPWB<{0,0,1,0}, "8">;
+def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
+def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+
+// Classes for VST* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">;
+class VSTQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">;
+class VSTQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+class VSTQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">;
+class VSTQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+//   VST1     : Vector Store (multiple single elements)
+class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd),
+          IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd),
+          IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+
+def  VST1d8   : VST1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VST1d16  : VST1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VST1d32  : VST1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VST1d64  : VST1D<{1,1,0,?}, "64", addrmode6align64>;
+
+def  VST1q8   : VST1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VST1q16  : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VST1q32  : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VST1q64  : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with address register writeback:
+multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
+                        IIC_VLD1u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
+                        IIC_VLD1x2u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+defm VST1d8wb  : VST1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VST1q8wb  : VST1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
+
+// ...with 3 registers
+class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b00, 0b0110, op7_4, (outs),
+          (ins AddrMode:$Rn, VecListThreeD:$Vd),
+          IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
+                        IIC_VLD1x3u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VST1d8T     : VST1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VST1d16T    : VST1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VST1d32T    : VST1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VST1d64T    : VST1D3<{1,1,0,?}, "64", addrmode6align64>;
+
+defm VST1d8Twb  : VST1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
+
+def VST1d64TPseudo            : VSTQQPseudo<IIC_VST1x3>;
+def VST1d64TPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x3u>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+
+// ...with 4 registers
+class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0, 0b00, 0b0010, op7_4, (outs),
+          (ins AddrMode:$Rn, VecListFourD:$Vd),
+          IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
+          []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST1Instruction";
+}
+multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                    (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+  def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1x4u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST1Instruction";
+  }
+}
+
+def VST1d8Q     : VST1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VST1d16Q    : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VST1d32Q    : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VST1d64Q    : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+defm VST1d8Qwb  : VST1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
+
+def VST1d64QPseudo            : VSTQQPseudo<IIC_VST1x4>;
+def VST1d64QPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x4u>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+
+//   VST2     : Vector Store (multiple 2-element structures)
+class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+            InstrItinClass itin, Operand AddrMode>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, VdTy:$Vd),
+          itin, "vst2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST2Instruction";
+}
+
+def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d16  : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d32  : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+
+def  VST2q8   : VST2<0b0011, {0,0,?,?}, "8",  VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q16  : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q32  : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+
+def  VST2q8Pseudo  : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+
+// ...with address register writeback:
+multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                   RegisterOperand VdTy, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
+  def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                     (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+  def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDST2Instruction";
+  }
+}
+
+defm VST2d8wb    : VST2DWB<0b1000, {0,0,?,?}, "8",  VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d16wb   : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d32wb   : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair,
+                           addrmode6align64or128>;
+
+defm VST2q8wb    : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
+defm VST2q16wb   : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST2q32wb   : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+
+def VST2q8PseudoWB_fixed     : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_register  : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+
+// ...with double-spaced registers
+def VST2b8      : VST2<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b16     : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b32     : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+defm VST2b8wb   : VST2DWB<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b16wb  : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b32wb  : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
+                          addrmode6align64or128>;
+
+//   VST3     : Vector Store (multiple 3-element structures)
+class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
+def  VST3d16  : VST3D<0b0100, {0,1,0,?}, "16">;
+def  VST3d32  : VST3D<0b0100, {1,0,0,?}, "32">;
+
+def  VST3d8Pseudo  : VSTQQPseudo<IIC_VST3>;
+def  VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
+def  VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+
+// ...with address register writeback:
+class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDST3Instruction";
+}
+
+def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
+def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
+def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VST3d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+
+// ...with double-spaced registers:
+def VST3q8      : VST3D<0b0101, {0,0,0,?}, "8">;
+def VST3q16     : VST3D<0b0101, {0,1,0,?}, "16">;
+def VST3q32     : VST3D<0b0101, {1,0,0,?}, "32">;
+def VST3q8_UPD  : VST3DWB<0b0101, {0,0,0,?}, "8">;
+def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
+def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VST3q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST3q8oddPseudo   : VSTQQQQPseudo<IIC_VST3>;
+def VST3q16oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+def VST3q32oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+
+def VST3q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+//   VST4     : Vector Store (multiple 4-element structures)
+class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
+def  VST4d16  : VST4D<0b0000, {0,1,?,?}, "16">;
+def  VST4d32  : VST4D<0b0000, {1,0,?,?}, "32">;
+
+def  VST4d8Pseudo  : VSTQQPseudo<IIC_VST4>;
+def  VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
+def  VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+
+// ...with address register writeback:
+class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
+           "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDST4Instruction";
+}
+
+def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
+def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
+def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VST4d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+
+// ...with double-spaced registers:
+def VST4q8      : VST4D<0b0001, {0,0,?,?}, "8">;
+def VST4q16     : VST4D<0b0001, {0,1,?,?}, "16">;
+def VST4q32     : VST4D<0b0001, {1,0,?,?}, "32">;
+def VST4q8_UPD  : VST4DWB<0b0001, {0,0,?,?}, "8">;
+def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
+def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VST4q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST4q8oddPseudo   : VSTQQQQPseudo<IIC_VST4>;
+def VST4q16oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+def VST4q32oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+
+def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// Classes for VST*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+
+//   VST1LN   : Vector Store (single element from one lane)
+class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNPseudo<IIC_VST1ln> {
+  let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                          addrmode6:$addr)];
+}
+
+def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
+                       addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo  : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+
+def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+// ...with address register writeback:
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins AdrMode:$Rn, am6offset:$Rm,
+           DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb",
+          [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+                                  AdrMode:$Rn, am6offset:$Rm))]> {
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNWBPseudo<IIC_VST1lnu> {
+  let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                                        addrmode6:$addr, am6offset:$offset))];
+}
+
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+                             extractelt, addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
+
+//   VST2LN   : Vector Store (single 2-element structure from one lane)
+class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+          IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo  : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+
+// ...with double-spaced registers:
+def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+
+// ...with address register writeback:
+class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt,
+          "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+
+def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+
+//   VST3LN   : Vector Store (single 3-element structure from one lane)
+class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
+           nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo  : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+
+// ...with double-spaced registers:
+def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+
+// ...with address register writeback:
+class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VST3lnu, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+
+def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+
+//   VST4LN   : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
+           nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo  : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+
+// ...with double-spaced registers:
+def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+
+// ...with address register writeback:
+class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VST4lnu, "vst4", Dt,
+  "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+
+def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+
+} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
+
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
+          (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+          (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+          (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg  : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
+                                   MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane  : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations: double- and quad-register.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>;
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as above, but not predicated.
+class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+          itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
+// Narrow 2-register operations.
+class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyD, ValueType TyQ, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
+
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm),
+        (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+        OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+                  InstrItinClass itin, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm),
+        (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+
+// Basic 3-register operations: double- and quad-register.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+// Same as N3VD but no data type.
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 0, op4,
+         (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
+               string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 1, op4,
+         (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+// Basic 3-register intrinsics, both double- and quad-register.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                           imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm),
+          f, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+                                       (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+// Multiply-Add/Sub operations: double- and quad-register.
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>;
+
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                                     imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_8:$Vm),
+                                                     imm:$lane)))))))]>;
+
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
+                SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                  SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                        imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType ResTy, ValueType OpTy,
+                    SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                        imm:$lane)))))))]>;
+
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1),
+                                      (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1),
+                                      (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Long Multiply-Add/Sub operations.
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (MulOp (TyD DPR:$Vn),
+                                            (TyD DPR:$Vm)))))]>;
+class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+                                                 imm:$lane))))))]>;
+class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_8:$Vm),
+                                                 imm:$lane))))))]>;
+
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                   SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                        (TyD DPR:$Vm)))))))]>;
+
+// Neon Long 3-argument intrinsic.  The destination register is
+// a quad-register and is also used as the first source operand register.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Narrowing 3-register intrinsics.
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
+              SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register operations.
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+
+// Long 3-register operations with explicitly extended operands.
+class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+              bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics with explicit extend (VABDL).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                 InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                 bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                (TyD DPR:$Vm))))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics.
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+// Same as above, but not predicated.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Wide 3-register operations.
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+           SDNode OpNode, SDNode ExtOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>;
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>;
+
+// Shift by immediate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
+}
+
+// Long shift by immediate.
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy,
+             SDPatternOperator OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm,
+           IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm), ImmTy:$SIMM)))]>;
+
+// Narrow shift by immediate.
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy,
+             SDPatternOperator OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
+                                          (i32 ImmTy:$SIMM))))]>;
+
+// Shift right by immediate and accumulate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (add DPR:$src1,
+                                (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>;
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (add QPR:$src1,
+                                (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
+}
+
+// Shift by immediate and insert,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>;
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
+}
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Abbreviations used in multiclass suffixes:
+//   Q = quarter int (8 bit) elements
+//   H = half int (16 bit) elements
+//   S = single int (32 bit) elements
+//   D = double int (64 bit) elements
+
+// Neon 2-register vector operations and intrinsics.
+
+// Neon 2-register comparisons.
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op4, string opc, string Dt,
+                       string asm, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+  def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+  def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+  def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+  def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, "f16", asm, "",
+                  [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>,
+              Requires<[HasNEON,HasFullFP16]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+
+  // 128-bit vector types.
+  def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+  def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+  def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+  def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+  def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, "f16", asm, "",
+                  [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>,
+              Requires<[HasNEON,HasFullFP16]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+}
+
+
+// Neon 2-register vector intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                      bits<5> op11_7, bit op4,
+                      InstrItinClass itinD, InstrItinClass itinQ,
+                      string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
+  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
+  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
+  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
+}
+
+
+// Neon Narrowing 2-register vector operations,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                    bits<5> op11_7, bit op6, bit op4,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    SDNode OpNode> {
+  def v8i8  : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "16"),
+                   v8i8, v8i16, OpNode>;
+  def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "32"),
+                   v4i16, v4i32, OpNode>;
+  def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "64"),
+                   v2i32, v2i64, OpNode>;
+}
+
+// Neon Narrowing 2-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op6, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp> {
+  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp>;
+  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp>;
+  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                    string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                   InstrItinClass itinD16, InstrItinClass itinD32,
+                   InstrItinClass itinQ16, InstrItinClass itinQ32,
+                   string OpcodeStr, string Dt,
+                   SDNode OpNode, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i8, v8i8, OpNode, Commutable>;
+  def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i16, v4i16, OpNode, Commutable>;
+  def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i32, v2i32, OpNode, Commutable>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v16i8, v16i8, OpNode, Commutable>;
+  def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v8i16, v8i16, OpNode, Commutable>;
+  def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32",
+                     v4i32, v2i32, ShOp>;
+}
+
+// ....then also with element size 64 bits:
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itinD, InstrItinClass itinQ,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0>
+  : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+            OpcodeStr, Dt, OpNode, Commutable> {
+  def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v1i64, v1i64, OpNode, Commutable>;
+  def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v2i64, v2i64, OpNode, Commutable>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp, Commutable>;
+  def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp, Commutable>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp, Commutable>;
+  def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp, Commutable>;
+}
+multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp>;
+}
+
+multiclass N3VIntSL_HS<bits<4> op11_8,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                          OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp, Commutable> {
+  def v8i8  : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp, Commutable>;
+  def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp>
+  : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp> {
+  def v8i8  : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp>;
+  def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp>;
+}
+
+
+// ....then also with element size of 64 bits:
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp, Commutable> {
+  def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp, Commutable>;
+  def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp>
+  : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp> {
+  def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp>;
+  def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp>;
+}
+
+// Neon Narrowing 3-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0> {
+  def v8i8  : N3VNInt<op24, op23, 0b00, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp, Commutable>;
+  def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp, Commutable>;
+  def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector operations.
+
+multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itin16, InstrItinClass itin32,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0> {
+  def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, Commutable>;
+  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, Commutable>;
+  def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, Commutable>;
+}
+
+multiclass N3VLSL_HS<bit op24, bits<4> op11_8,
+                     InstrItinClass itin, string OpcodeStr, string Dt,
+                     SDNode OpNode> {
+  def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
+                       !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
+                     !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itin16, InstrItinClass itin32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0> {
+  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, IntOp, Commutable>;
+  def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+                        InstrItinClass itin, string OpcodeStr, string Dt,
+                        SDPatternOperator IntOp> {
+  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
+               IntOp, Commutable> {
+  def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, IntOp, Commutable>;
+}
+
+// ....with explicit extend (VABDL).
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "8"),
+                         v8i16, v8i8, IntOp, ExtOp, Commutable>;
+  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "16"),
+                         v4i32, v4i16, IntOp, ExtOp, Commutable>;
+  def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "32"),
+                         v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector intrinsics,
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD16, InstrItinClass itinD32,
+                        InstrItinClass itinQ16, InstrItinClass itinQ32,
+                        string OpcodeStr, string Dt, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>;
+  def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>;
+  def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>;
+  def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>;
+  def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
+}
+
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+                         InstrItinClass itinD16, InstrItinClass itinD32,
+                         InstrItinClass itinQ16, InstrItinClass itinQ32,
+                         string OpcodeStr, string Dt, SDPatternOperator ShOp> {
+  def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+                            OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>;
+  def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+                          OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>;
+  def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+                            OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16,
+                            mul, ShOp>;
+  def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+                          OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32,
+                          mul, ShOp>;
+}
+
+// Neon Intrinsic-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD, InstrItinClass itinQ,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp,
+                        SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+  def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+  def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+  def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+  def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
+// Neon 3-argument intrinsics,
+//   element sizes of 16 and 32 bits:
+multiclass N3VInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD16,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD32,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ16,
+                       OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ32,
+                       OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp>
+           :N3VInt3_HS <op24, op23, op11_8, op4, itinD16, itinD32,
+                        itinQ16, itinQ32, OpcodeStr, Dt, IntOp>{
+  // 64-bit vector types.
+  def v8i8  : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD16,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  // 128-bit vector types.
+  def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ16,
+                       OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+}
+
+// Neon Long Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         InstrItinClass itin16, InstrItinClass itin32,
+                         string OpcodeStr, string Dt, SDNode MulOp,
+                         SDNode OpNode> {
+  def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
+  def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
+                        !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
+                          string Dt, SDNode MulOp, SDNode OpNode> {
+  def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
+                            !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
+                          !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+                           OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itin16, InstrItinClass itin32,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp>
+  : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+}
+
+// ....with explicit extend (VABAL).
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                            InstrItinClass itin, string OpcodeStr, string Dt,
+                            SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
+  def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+                           IntOp, ExtOp, OpNode>;
+  def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+                           IntOp, ExtOp, OpNode>;
+  def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+                           IntOp, ExtOp, OpNode>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                        bits<5> op11_7, bit op4,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                         bits<5> op11_7, bit op4,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       string baseOpc, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Accumulate vector operations,
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Insert vector operations,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+//   element sizes of 8, 16, 32 bits:
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, string Dt,
+                      SDPatternOperator OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+              OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations,
+//   element sizes of 16, 32, 64 bits:
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
+                      SDPatternOperator OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    OpcodeStr, !strconcat(Dt, "16"),
+                    v8i8, v8i16, shr_imm8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"),
+                     v4i16, v4i32, shr_imm16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"),
+                     v2i32, v2i64, shr_imm32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+
+//   VADD     : Vector Add (integer and floating-point)
+defm VADD     : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i",
+                         add, 1>;
+def  VADDfd   : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
+                     v2f32, v2f32, fadd, 1>;
+def  VADDfq   : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
+                     v4f32, v4f32, fadd, 1>;
+def  VADDhd   : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16",
+                     v4f16, v4f16, fadd, 1>,
+                Requires<[HasNEON,HasFullFP16]>;
+def  VADDhq   : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16",
+                     v8f16, v8f16, fadd, 1>,
+                Requires<[HasNEON,HasFullFP16]>;
+//   VADDL    : Vector Add Long (Q = D + D)
+defm VADDLs   : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "s", add, sext, 1>;
+defm VADDLu   : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "u", add, zext, 1>;
+//   VADDW    : Vector Add Wide (Q = Q + D)
+defm VADDWs   : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
+defm VADDWu   : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
+//   VHADD    : Vector Halving Add
+defm VHADDs   : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "s", int_arm_neon_vhadds, 1>;
+defm VHADDu   : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "u", int_arm_neon_vhaddu, 1>;
+//   VRHADD   : Vector Rounding Halving Add
+defm VRHADDs  : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu  : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "u", int_arm_neon_vrhaddu, 1>;
+//   VQADD    : Vector Saturating Add
+defm VQADDs   : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "s", int_arm_neon_vqadds, 1>;
+defm VQADDu   : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "u", int_arm_neon_vqaddu, 1>;
+//   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
+//   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
+                            int_arm_neon_vraddhn, 1>;
+
+def : Pat<(v8i8  (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Multiply Operations.
+
+//   VMUL     : Vector Multiply (integer, polynomial and floating-point)
+defm VMUL     : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D,
+                        IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>;
+def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
+                        "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
+def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
+                        "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
+                     v2f32, v2f32, fmul, 1>;
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
+                     v4f32, v4f32, fmul, 1>;
+def  VMULhd   : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16",
+                     v4f16, v4f16, fmul, 1>,
+                Requires<[HasNEON,HasFullFP16]>;
+def  VMULhq   : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16",
+                     v8f16, v8f16, fmul, 1>,
+                Requires<[HasNEON,HasFullFP16]>;
+defm VMULsl   : N3VSL_HS<0b1000, "vmul", mul>;
+def  VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
+def  VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32,
+                       v2f32, fmul>;
+def  VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>,
+                Requires<[HasNEON,HasFullFP16]>;
+def  VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16,
+                       v4f16, fmul>,
+                Requires<[HasNEON,HasFullFP16]>;
+
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+                      (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+          (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+                      (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+          (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+                       (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+          (v4f32 (VMULslfq (v4f32 QPR:$src1),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src2,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfd DPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfq QPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+
+
+//   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
+defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
+                          IIC_VMULi16Q, IIC_VMULi32Q,
+                          "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+                            IIC_VMULi16Q, IIC_VMULi32Q,
+                            "vqdmulh", "s",  int_arm_neon_vqdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+                                       (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                            imm:$lane)))),
+          (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+                                 (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i16_reg imm:$lane))),
+                                 (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+                                       (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                            imm:$lane)))),
+          (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+                                 (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i32_reg imm:$lane))),
+                                 (SubReg_i32_lane imm:$lane)))>;
+
+//   VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+defm VQRDMULH   : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+                            IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+                            "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+                              IIC_VMULi16Q, IIC_VMULi32Q,
+                              "vqrdmulh", "s",  int_arm_neon_vqrdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+                                        (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                             imm:$lane)))),
+          (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+                                  (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i16_reg imm:$lane))),
+                                  (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+                                        (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                             imm:$lane)))),
+          (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+                                  (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i32_reg imm:$lane))),
+                                  (SubReg_i32_lane imm:$lane)))>;
+
+//   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "NEONData" in {
+  defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "s", NEONvmulls, 1>;
+  defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "u", NEONvmullu, 1>;
+  def  VMULLp8   :  N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                            v8i16, v8i8, int_arm_neon_vmullp, 1>;
+  def  VMULLp64  : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+                          "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+                    Requires<[HasV8, HasCrypto]>;
+}
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+
+//   VQDMULL  : Vector Saturating Doubling Multiply Long (Q = D * D)
+defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vqdmull", "s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
+                             "vqdmull", "s", int_arm_neon_vqdmull>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+//   VMLA     : Vector Multiply Accumulate (integer and floating-point)
+defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLAhd   : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
+                          v4f16, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLAhq   : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
+                          v8f16, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
+                            v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
+                            v4f32, v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16",
+                            v4f16, fmul, fadd>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def  VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16",
+                            v8f16, v4f16, fmul, fadd>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                       (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+                           (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
+defm VMLALs   : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "s", NEONvmulls, add>;
+defm VMLALu   : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "u", NEONvmullu, add>;
+
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+
+let Predicates = [HasNEON, HasV8_1a] in {
+  // v8.1a Neon Rounding Double Multiply-Op vector operations,
+  // VQRDMLAH : Vector Saturating Rounding Doubling Multiply Accumulate Long
+  //            (Q += D * D)
+  defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
+                             null_frag>;
+  def : Pat<(v4i16 (int_arm_neon_vqadds
+                     (v4i16 DPR:$src1),
+                     (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
+                                                   (v4i16 DPR:$Vm))))),
+            (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+  def : Pat<(v2i32 (int_arm_neon_vqadds
+                     (v2i32 DPR:$src1),
+                     (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
+                                                   (v2i32 DPR:$Vm))))),
+            (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+  def : Pat<(v8i16 (int_arm_neon_vqadds
+                     (v8i16 QPR:$src1),
+                     (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
+                                                   (v8i16 QPR:$Vm))))),
+            (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+  def : Pat<(v4i32 (int_arm_neon_vqadds
+                     (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
+                                                   (v4i32 QPR:$Vm))))),
+            (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+
+  defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
+                                  IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
+                                  null_frag>;
+  def : Pat<(v4i16 (int_arm_neon_vqadds
+                     (v4i16 DPR:$src1),
+                     (v4i16 (int_arm_neon_vqrdmulh
+                              (v4i16 DPR:$Vn),
+                              (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                   imm:$lane)))))),
+            (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
+                                    imm:$lane))>;
+  def : Pat<(v2i32 (int_arm_neon_vqadds
+                     (v2i32 DPR:$src1),
+                     (v2i32 (int_arm_neon_vqrdmulh
+                              (v2i32 DPR:$Vn),
+                              (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                   imm:$lane)))))),
+            (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
+                                    imm:$lane))>;
+  def : Pat<(v8i16 (int_arm_neon_vqadds
+                     (v8i16 QPR:$src1),
+                     (v8i16 (int_arm_neon_vqrdmulh
+                              (v8i16 QPR:$src2),
+                              (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+                                                   imm:$lane)))))),
+            (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$src2),
+                                    (v4i16 (EXTRACT_SUBREG
+                                             QPR:$src3,
+                                             (DSubReg_i16_reg imm:$lane))),
+                                    (SubReg_i16_lane imm:$lane)))>;
+  def : Pat<(v4i32 (int_arm_neon_vqadds
+                     (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqrdmulh 
+                              (v4i32 QPR:$src2),
+                              (v4i32 (NEONvduplane (v4i32 QPR:$src3), 
+                                                   imm:$lane)))))),
+            (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$src2),
+                                    (v2i32 (EXTRACT_SUBREG
+                                             QPR:$src3,
+                                             (DSubReg_i32_reg imm:$lane))),
+                                    (SubReg_i32_lane imm:$lane)))>;
+
+  //   VQRDMLSH : Vector Saturating Rounding Doubling Multiply Subtract Long
+  //              (Q -= D * D)
+  defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
+                             null_frag>;
+  def : Pat<(v4i16 (int_arm_neon_vqsubs
+                     (v4i16 DPR:$src1),
+                     (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
+                                                   (v4i16 DPR:$Vm))))),
+            (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+  def : Pat<(v2i32 (int_arm_neon_vqsubs
+                     (v2i32 DPR:$src1),
+                     (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
+                                                   (v2i32 DPR:$Vm))))),
+            (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
+  def : Pat<(v8i16 (int_arm_neon_vqsubs
+                     (v8i16 QPR:$src1),
+                     (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
+                                                   (v8i16 QPR:$Vm))))),
+            (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+  def : Pat<(v4i32 (int_arm_neon_vqsubs
+                     (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
+                                                   (v4i32 QPR:$Vm))))),
+            (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
+
+  defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
+                                  IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
+                                  null_frag>;
+  def : Pat<(v4i16 (int_arm_neon_vqsubs
+                     (v4i16 DPR:$src1),
+                     (v4i16 (int_arm_neon_vqrdmulh
+                              (v4i16 DPR:$Vn),
+                              (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                   imm:$lane)))))),
+            (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
+  def : Pat<(v2i32 (int_arm_neon_vqsubs
+                     (v2i32 DPR:$src1),
+                     (v2i32 (int_arm_neon_vqrdmulh
+                              (v2i32 DPR:$Vn),
+                              (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                   imm:$lane)))))),
+            (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, 
+                                    imm:$lane))>;
+  def : Pat<(v8i16 (int_arm_neon_vqsubs
+                     (v8i16 QPR:$src1),
+                     (v8i16 (int_arm_neon_vqrdmulh
+                              (v8i16 QPR:$src2),
+                              (v8i16 (NEONvduplane (v8i16 QPR:$src3), 
+                                                   imm:$lane)))))),
+            (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$src2),
+                                    (v4i16 (EXTRACT_SUBREG 
+                                             QPR:$src3,
+                                             (DSubReg_i16_reg imm:$lane))),
+                                    (SubReg_i16_lane imm:$lane)))>;
+  def : Pat<(v4i32 (int_arm_neon_vqsubs
+                     (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqrdmulh
+                              (v4i32 QPR:$src2),
+                              (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+                                                    imm:$lane)))))),
+            (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$src2),
+                                    (v2i32 (EXTRACT_SUBREG 
+                                             QPR:$src3,
+                                             (DSubReg_i32_reg imm:$lane))),
+                                    (SubReg_i32_lane imm:$lane)))>;
+}
+//   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlal", "s", null_frag>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+
+//   VMLS     : Vector Multiply Subtract (integer and floating-point)
+defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLShd   : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
+                          v4f16, fmul, fsub>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLShq   : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
+                          v8f16, fmul, fsub>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
+                            v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
+                            v4f32, v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16",
+                            v4f16, fmul, fsub>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+def  VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16",
+                            v8f16, v4f16, fmul, fsub>,
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                     (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs   : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLu   : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "u", NEONvmullu, sub>;
+
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+
+//   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
+
+// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
+def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+
+def  VFMAfq   : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def  VFMAhd   : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16",
+                          v4f16, fmul, fadd>,
+                Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+def  VFMAhq   : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16",
+                          v8f16, fmul, fadd>,
+                Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+//   Fused Vector Multiply Subtract (floating-point)
+def  VFMSfd   : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
+def  VFMShd   : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", "f16",
+                          v4f16, fmul, fsub>,
+                Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+def  VFMShq   : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16",
+                          v8f16, fmul, fsub>,
+                Requires<[HasNEON,HasFullFP16,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
+          (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
+          (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
+          (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
+          (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+      Requires<[HasVFP4]>;
+
+// Vector Subtract Operations.
+
+//   VSUB     : Vector Subtract (integer and floating-point)
+defm VSUB     : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+                         "vsub", "i", sub, 0>;
+def  VSUBfd   : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+                     v2f32, v2f32, fsub, 0>;
+def  VSUBfq   : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+                     v4f32, v4f32, fsub, 0>;
+def  VSUBhd   : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16",
+                     v4f16, v4f16, fsub, 0>,
+                Requires<[HasNEON,HasFullFP16]>;
+def  VSUBhq   : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16",
+                     v8f16, v8f16, fsub, 0>,
+                Requires<[HasNEON,HasFullFP16]>;
+//   VSUBL    : Vector Subtract Long (Q = D - D)
+defm VSUBLs   : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "s", sub, sext, 0>;
+defm VSUBLu   : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "u", sub, zext, 0>;
+//   VSUBW    : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs   : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu   : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+//   VHSUB    : Vector Halving Subtract
+defm VHSUBs   : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu   : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "u", int_arm_neon_vhsubu, 0>;
+//   VQSUB    : Vector Saturing Subtract
+defm VQSUBs   : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "u", int_arm_neon_vqsubu, 0>;
+//   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
+//   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+                            int_arm_neon_vrsubhn, 0>;
+
+def : Pat<(v8i8  (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
+// Vector Comparisons.
+
+//   VCEQ     : Vector Compare Equal
+defm VCEQ     : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
+def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+                     NEONvceq, 1>;
+def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+                     NEONvceq, 1>;
+def  VCEQhd   : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16,
+                     NEONvceq, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCEQhq   : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16,
+                     NEONvceq, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in
+defm VCEQz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
+                            "$Vd, $Vm, #0", NEONvceqz>;
+
+//   VCGE     : Vector Compare Greater Than or Equal
+defm VCGEs    : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
+defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
+def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+                     NEONvcge, 0>;
+def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+                     NEONvcge, 0>;
+def  VCGEhd   : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16,
+                     NEONvcge, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCGEhq   : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16,
+                     NEONvcge, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
+                            "$Vd, $Vm, #0", NEONvcgez>;
+defm VCLEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
+                            "$Vd, $Vm, #0", NEONvclez>;
+}
+
+//   VCGT     : Vector Compare Greater Than
+defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
+defm VCGTu    : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
+def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+                     NEONvcgt, 0>;
+def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+                     NEONvcgt, 0>;
+def  VCGThd   : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16,
+                     NEONvcgt, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCGThq   : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16,
+                     NEONvcgt, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
+                            "$Vd, $Vm, #0", NEONvcgtz>;
+defm VCLTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
+                            "$Vd, $Vm, #0", NEONvcltz>;
+}
+
+//   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def  VACGEfd   : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+                        "f32", v2i32, v2f32, int_arm_neon_vacge, 0>;
+def  VACGEfq   : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+                        "f32", v4i32, v4f32, int_arm_neon_vacge, 0>;
+def  VACGEhd   : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+                        "f16", v4i16, v4f16, int_arm_neon_vacge, 0>,
+                 Requires<[HasNEON, HasFullFP16]>;
+def  VACGEhq   : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+                        "f16", v8i16, v8f16, int_arm_neon_vacge, 0>,
+                 Requires<[HasNEON, HasFullFP16]>;
+//   VACGT    : Vector Absolute Compare Greater Than (aka VCAGT)
+def  VACGTfd   : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+                        "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>;
+def  VACGTfq   : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                        "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>;
+def  VACGThd   : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+                        "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>,
+                 Requires<[HasNEON, HasFullFP16]>;
+def  VACGThq   : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                        "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>,
+                 Requires<[HasNEON, HasFullFP16]>;
+//   VTST     : Vector Test Bits
+defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                        IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm",
+                   (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm",
+                   (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>;
+}
+
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm",
+                   (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm",
+                   (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+                   (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm",
+                   (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+                   (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>;
+def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
+                   (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>;
+}
+
+// Vector Bitwise Operations.
+
+def vnotd : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+def vnotq : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+
+
+//   VAND     : Vector Bitwise AND
+def  VANDd    : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand",
+                      v2i32, v2i32, and, 1>;
+def  VANDq    : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand",
+                      v4i32, v4i32, and, 1>;
+
+//   VEOR     : Vector Bitwise Exclusive OR
+def  VEORd    : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor",
+                      v2i32, v2i32, xor, 1>;
+def  VEORq    : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor",
+                      v4i32, v4i32, xor, 1>;
+
+//   VORR     : Vector Bitwise OR
+def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
+                      v2i32, v2i32, or, 1>;
+def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
+                      v4i32, v4i32, or, 1>;
+
+def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+
+//   VBIC     : Vector Bitwise Bit Clear (AND NOT)
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (and DPR:$Vn,
+                                                 (vnotd DPR:$Vm))))]>;
+def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
+                                                 (vnotq QPR:$Vm))))]>;
+}
+
+def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+//   VORN     : Vector Bitwise OR NOT
+def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (or DPR:$Vn,
+                                                (vnotd DPR:$Vm))))]>;
+def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
+                                                (vnotq QPR:$Vm))))]>;
+
+//   VMVN     : Vector Bitwise NOT (Immediate)
+
+let isReMaterializable = 1 in {
+
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+}
+
+//   VMVN     : Vector Bitwise NOT
+def  VMVNd    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+                     (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>;
+def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+                     (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
+def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+
+//   VBSL     : Vector Bitwise Select
+def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VCNTiD,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set DPR:$Vd,
+                           (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+                                   (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+                                    (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+                                    (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+                                    (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VCNTiQ,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set QPR:$Vd,
+                           (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+                                   (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+                                    (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+//   VBIF     : Vector Bitwise Insert if False
+//              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBIFd    : N3VX<1, 0, 0b11, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBIFq    : N3VX<1, 0, 0b11, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+//   VBIT     : Vector Bitwise Insert if True
+//              like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBITd    : N3VX<1, 0, 0b10, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+// VBIT/VBIF are not yet implemented.  The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+//   VABD     : Vector Absolute Difference
+defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "s", int_arm_neon_vabds, 1>;
+defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "u", int_arm_neon_vabdu, 1>;
+def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
+                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
+def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
+def  VABDhd   : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND,
+                        "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VABDhq   : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+                        "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VABDL    : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs   : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu   : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+
+def abd_shr :
+    PatFrag<(ops node:$in1, node:$in2, node:$shift),
+            (NEONvshrs (sub (zext node:$in1),
+                            (zext node:$in2)), (i32 $shift))>;
+
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))),
+               (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)),
+                                                   (zext (v8i8 DPR:$opB))),
+                                              (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))),
+          (VABDLuv8i16 DPR:$opA, DPR:$opB)>;
+
+def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)),
+               (v4i32 (add (sub (zext (v4i16 DPR:$opA)),
+                                (zext (v4i16 DPR:$opB))),
+                           (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))),
+          (VABDLuv4i32 DPR:$opA, DPR:$opB)>;
+
+def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
+               (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
+                                                   (zext (v2i32 DPR:$opB))),
+                                         (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))),
+          (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
+
+//   VABA     : Vector Absolute Difference and Accumulate
+defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu    : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "u", int_arm_neon_vabdu, add>;
+
+//   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs   : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu   : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "u", int_arm_neon_vabdu, zext, add>;
+
+// Vector Maximum and Minimum.
+
+//   VMAX     : Vector Maximum
+defm VMAXs    : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "s", smax, 1>;
+defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "u", umax, 1>;
+def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmax", "f32",
+                        v2f32, v2f32, fmaxnan, 1>;
+def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmax", "f32",
+                        v4f32, v4f32, fmaxnan, 1>;
+def  VMAXhd   : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmax", "f16",
+                        v4f16, v4f16, fmaxnan, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VMAXhq   : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmax", "f16",
+                        v8f16, v8f16, fmaxnan, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMAXNMNDf  : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v2f32, v2f32, fmaxnum, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNQf  : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v4f32, v4f32, fmaxnum, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNDh  : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f16",
+                            v4f16, v4f16, fmaxnum, 1>,
+                            Requires<[HasV8, HasNEON, HasFullFP16]>;
+  def VMAXNMNQh  : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f16",
+                            v8f16, v8f16, fmaxnum, 1>,
+                            Requires<[HasV8, HasNEON, HasFullFP16]>;
+}
+
+//   VMIN     : Vector Minimum
+defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "s", smin, 1>;
+defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "u", umin, 1>;
+def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmin", "f32",
+                        v2f32, v2f32, fminnan, 1>;
+def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmin", "f32",
+                        v4f32, v4f32, fminnan, 1>;
+def  VMINhd   : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmin", "f16",
+                        v4f16, v4f16, fminnan, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VMINhq   : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmin", "f16",
+                        v8f16, v8f16, fminnan, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMINNMNDf  : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v2f32, v2f32, fminnum, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNQf  : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v4f32, v4f32, fminnum, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNDh  : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f16",
+                            v4f16, v4f16, fminnum, 1>,
+                            Requires<[HasV8, HasNEON, HasFullFP16]>;
+  def VMINNMNQh  : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f16",
+                            v8f16, v8f16, fminnum, 1>,
+                            Requires<[HasV8, HasNEON, HasFullFP16]>;
+}
+
+// Vector Pairwise Operations.
+
+//   VPADD    : Vector Pairwise Add
+def  VPADDi8  : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i8",
+                        v8i8, v8i8, int_arm_neon_vpadd, 0>;
+def  VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i16",
+                        v4i16, v4i16, int_arm_neon_vpadd, 0>;
+def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i32",
+                        v2i32, v2i32, int_arm_neon_vpadd, 0>;
+def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
+                        IIC_VPBIND, "vpadd", "f32",
+                        v2f32, v2f32, int_arm_neon_vpadd, 0>;
+def  VPADDh   : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm,
+                        IIC_VPBIND, "vpadd", "f16",
+                        v4f16, v4f16, int_arm_neon_vpadd, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VPADDL   : Vector Pairwise Add Long
+defm VPADDLs  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
+                             int_arm_neon_vpaddls>;
+defm VPADDLu  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u",
+                             int_arm_neon_vpaddlu>;
+
+//   VPADAL   : Vector Pairwise Add and Accumulate Long
+defm VPADALs  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s",
+                              int_arm_neon_vpadals>;
+defm VPADALu  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u",
+                              int_arm_neon_vpadalu>;
+
+//   VPMAX    : Vector Pairwise Maximum
+def  VPMAXs8  : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>;
+def  VPMAXu8  : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+def  VPMAXh   : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+                        "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VPMIN    : Vector Pairwise Minimum
+def  VPMINs8  : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>;
+def  VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>;
+def  VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>;
+def  VPMINu8  : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>;
+def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
+def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
+def  VPMINh   : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+                        "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+
+//   VRECPE   : Vector Reciprocal Estimate
+def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAD, "vrecpe", "u32",
+                        v2i32, v2i32, int_arm_neon_vrecpe>;
+def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAQ, "vrecpe", "u32",
+                        v4i32, v4i32, int_arm_neon_vrecpe>;
+def  VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAD, "vrecpe", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecpe>;
+def  VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAQ, "vrecpe", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecpe>;
+def  VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+                        IIC_VUNAD, "vrecpe", "f16",
+                        v4f16, v4f16, int_arm_neon_vrecpe>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0,
+                        IIC_VUNAQ, "vrecpe", "f16",
+                        v8f16, v8f16, int_arm_neon_vrecpe>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VRECPS   : Vector Reciprocal Step
+def  VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrecps", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecps, 1>;
+def  VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrecps", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecps, 1>;
+def  VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrecps", "f16",
+                        v4f16, v4f16, int_arm_neon_vrecps, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrecps", "f16",
+                        v8f16, v8f16, int_arm_neon_vrecps, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VRSQRTE  : Vector Reciprocal Square Root Estimate
+def  VRSQRTEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAD, "vrsqrte", "u32",
+                         v2i32, v2i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAQ, "vrsqrte", "u32",
+                         v4i32, v4i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAD, "vrsqrte", "f32",
+                         v2f32, v2f32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAQ, "vrsqrte", "f32",
+                         v4f32, v4f32, int_arm_neon_vrsqrte>;
+def  VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+                         IIC_VUNAD, "vrsqrte", "f16",
+                         v4f16, v4f16, int_arm_neon_vrsqrte>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0,
+                         IIC_VUNAQ, "vrsqrte", "f16",
+                         v8f16, v8f16, int_arm_neon_vrsqrte>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+//   VRSQRTS  : Vector Reciprocal Square Root Step
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrsqrts", "f32",
+                        v2f32, v2f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrsqrts", "f32",
+                        v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrsqrts", "f16",
+                        v4f16, v4f16, int_arm_neon_vrsqrts, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrsqrts", "f16",
+                        v8f16, v8f16, int_arm_neon_vrsqrts, 1>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+// Vector Shifts.
+
+//   VSHL     : Vector Shift
+defm VSHLs    : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "s", int_arm_neon_vshifts>;
+defm VSHLu    : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "u", int_arm_neon_vshiftu>;
+
+//   VSHL     : Vector Shift Left (Immediate)
+defm VSHLi    : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+
+//   VSHR     : Vector Shift Right (Immediate)
+defm VSHRs    : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
+                            NEONvshrs>;
+defm VSHRu    : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
+                            NEONvshru>;
+
+//   VSHLL    : Vector Shift Left Long
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s",
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (sext node:$LHS), node:$RHS)>>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u",
+  PatFrag<(ops node:$LHS, node:$RHS), (NEONvshl (zext node:$LHS), node:$RHS)>>;
+
+//   VSHLL    : Vector Shift Left Long (with maximum shift count)
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
+                ValueType OpTy, Operand ImmTy>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
+           ResTy, OpTy, ImmTy, null_frag> {
+  let Inst{21-16} = op21_16;
+  let DecoderMethod = "DecodeVSHLMaxInstruction";
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
+                          v8i16, v8i8, imm8>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
+                          v4i32, v4i16, imm16>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
+                          v2i64, v2i32, imm32>;
+
+def : Pat<(v8i16 (NEONvshl (zext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (zext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (zext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
+def : Pat<(v8i16 (NEONvshl (sext (v8i8 DPR:$Rn)), (i32 8))),
+          (VSHLLi8 DPR:$Rn, 8)>;
+def : Pat<(v4i32 (NEONvshl (sext (v4i16 DPR:$Rn)), (i32 16))),
+          (VSHLLi16 DPR:$Rn, 16)>;
+def : Pat<(v2i64 (NEONvshl (sext (v2i32 DPR:$Rn)), (i32 32))),
+          (VSHLLi32 DPR:$Rn, 32)>;
+
+//   VSHRN    : Vector Shift Right and Narrow
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
+                           PatFrag<(ops node:$Rn, node:$amt),
+                                   (trunc (NEONvshrs node:$Rn, node:$amt))>>;
+
+def : Pat<(v8i8 (trunc (NEONvshru (v8i16 QPR:$Vn), shr_imm8:$amt))),
+          (VSHRNv8i8 QPR:$Vn, shr_imm8:$amt)>;
+def : Pat<(v4i16 (trunc (NEONvshru (v4i32 QPR:$Vn), shr_imm16:$amt))),
+          (VSHRNv4i16 QPR:$Vn, shr_imm16:$amt)>;
+def : Pat<(v2i32 (trunc (NEONvshru (v2i64 QPR:$Vn), shr_imm32:$amt))),
+          (VSHRNv2i32 QPR:$Vn, shr_imm32:$amt)>;
+
+//   VRSHL    : Vector Rounding Shift
+defm VRSHLs   : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "s", int_arm_neon_vrshifts>;
+defm VRSHLu   : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "u", int_arm_neon_vrshiftu>;
+//   VRSHR    : Vector Rounding Shift Right
+defm VRSHRs   : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
+                            NEONvrshrs>;
+defm VRSHRu   : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
+                            NEONvrshru>;
+
+//   VRSHRN   : Vector Rounding Shift Right and Narrow
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
+                           NEONvrshrn>;
+
+//   VQSHL    : Vector Saturating Shift
+defm VQSHLs   : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "s", int_arm_neon_vqshifts>;
+defm VQSHLu   : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "u", int_arm_neon_vqshiftu>;
+//   VQSHL    : Vector Saturating Shift Left (Immediate)
+defm VQSHLsi  : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
+defm VQSHLui  : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+
+//   VQSHLU   : Vector Saturating Shift Left (Immediate, Unsigned)
+defm VQSHLsu  : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+
+//   VQSHRN   : Vector Saturating Shift Right and Narrow
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
+                           NEONvqshrnu>;
+
+//   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
+                           NEONvqshrnsu>;
+
+//   VQRSHL   : Vector Saturating Rounding Shift
+defm VQRSHLs  : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "s", int_arm_neon_vqrshifts>;
+defm VQRSHLu  : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "u", int_arm_neon_vqrshiftu>;
+
+//   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
+                           NEONvqrshrnu>;
+
+//   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
+                           NEONvqrshrnsu>;
+
+//   VSRA     : Vector Shift Right and Accumulate
+defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
+defm VSRAu    : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+//   VRSRA    : Vector Rounding Shift Right and Accumulate
+defm VRSRAs   : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
+defm VRSRAu   : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+
+//   VSLI     : Vector Shift Left and Insert
+defm VSLI     : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
+
+//   VSRI     : Vector Shift Right and Insert
+defm VSRI     : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
+
+// Vector Absolute and Saturating Absolute.
+
+//   VABS     : Vector Absolute Value
+defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+                           IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
+                           int_arm_neon_vabs>;
+def  VABSfd   : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                     v2f32, v2f32, fabs>;
+def  VABSfq   : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                      v4f32, v4f32, fabs>;
+def  VABShd   : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+                     "vabs", "f16",
+                     v4f16, v4f16, fabs>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VABShq   : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
+                     "vabs", "f16",
+                      v8f16, v8f16, fabs>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
+               (v2i32 (bitconvert (v8i8 (add DPR:$src,
+                                             (NEONvshrs DPR:$src, (i32 7))))))),
+          (VABSv8i8 DPR:$src)>;
+def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))),
+               (v2i32 (bitconvert (v4i16 (add DPR:$src,
+                                            (NEONvshrs DPR:$src, (i32 15))))))),
+          (VABSv4i16 DPR:$src)>;
+def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))),
+               (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))),
+          (VABSv2i32 DPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))),
+               (v4i32 (bitconvert (v16i8 (add QPR:$src,
+                                             (NEONvshrs QPR:$src, (i32 7))))))),
+          (VABSv16i8 QPR:$src)>;
+def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))),
+               (v4i32 (bitconvert (v8i16 (add QPR:$src,
+                                            (NEONvshrs QPR:$src, (i32 15))))))),
+          (VABSv8i16 QPR:$src)>;
+def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
+               (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
+          (VABSv4i32 QPR:$src)>;
+
+//   VQABS    : Vector Saturating Absolute Value
+defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
+                           int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+def vnegd  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+def vnegq  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+
+class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+        IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm),
+        IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>;
+
+//   VNEG     : Vector Negate (integer)
+def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
+def  VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>;
+def  VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>;
+def  VNEGs8q  : VNEGQ<0b00, "vneg", "s8", v16i8>;
+def  VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>;
+def  VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
+
+//   VNEG     : Vector Negate (floating-point)
+def  VNEGfd   : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                    (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>;
+def  VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+                    (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
+def  VNEGhd   : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0,
+                    (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+                    "vneg", "f16", "$Vd, $Vm", "",
+                    [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VNEGhq   : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0,
+                    (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+                    "vneg", "f16", "$Vd, $Vm", "",
+                    [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+def : Pat<(v8i8  (vnegd  DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vnegd  DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vnegd  DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
+
+//   VQNEG    : Vector Saturating Negate
+defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
+                           int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+//   VCLS     : Vector Count Leading Sign Bits
+defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
+                           int_arm_neon_vcls>;
+//   VCLZ     : Vector Count Leading Zeros
+defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
+                           ctlz>;
+//   VCNT     : Vector Count One Bits
+def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiD, "vcnt", "8",
+                        v8i8, v8i8, ctpop>;
+def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiQ, "vcnt", "8",
+                        v16i8, v16i8, ctpop>;
+
+// Vector Swap
+def  VSWPd    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
+                     (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
+                     (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+
+// Vector Move Operations.
+
+//   VMOV     : Vector Move (Register)
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+//   VMOV     : Vector Move (Immediate)
+
+// Although VMOVs are not strictly speaking cheap, they are as expensive
+// as their copies counterpart (VORR), so we should prefer rematerialization
+// over splitting when it applies.
+let isReMaterializable = 1, isAsCheapAsAMove=1 in {
+def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+} // isReMaterializable, isAsCheapAsAMove
+
+// Add support for bytes replication feature, so it could be GAS compatible.
+// E.g. instructions below:
+// "vmov.i32 d0, 0xffffffff"
+// "vmov.i32 d0, 0xabababab"
+// "vmov.i16 d0, 0xabab"
+// are incorrect, but we could deal with such cases.
+// For last two instructions, for example, it should emit:
+// "vmov.i8 d0, 0xab"
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+
+// Also add same support for VMVN instructions. So instruction:
+// "vmvn.i32 d0, 0xabababab"
+// actually means:
+// "vmov.i8 d0, 0x54"
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute so they should be used wherever possible for
+// setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// since they are sometimes rather expensive (in general).
+
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+  def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+                               (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+  def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+                               (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+}
+
+//   VMOV     : Vector Get Lane (move scalar to ARM core register)
+
+def VGETLNs8  : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNu8  : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
+                          [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+                                           imm:$lane))]>,
+                Requires<[HasVFP2, HasFastVGETLNi32]> {
+  let Inst{21} = lane{0};
+}
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i32_reg imm:$lane))),
+                     (SubReg_i32_lane imm:$lane))>,
+      Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+//   VMOV     : Vector Set Lane (move ARM core register to scalar)
+
+let Constraints = "$src1 = $V" in {
+def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex8:$lane),
+                          IIC_VMOVISL, "vmov", "8", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex16:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
+                          [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+                                           GPR:$R, imm:$lane))]>,
+                Requires<[HasVFP2]> {
+  let Inst{21} = lane{0};
+  // This instruction is equivalent as
+  // $V = INSERT_SUBREG $src1, $R, translateImmToSubIdx($imm)
+  let isInsertSubreg = 1;
+}
+}
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+          (v16i8 (INSERT_SUBREG QPR:$src1,
+                  (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+                                   (DSubReg_i8_reg imm:$lane))),
+                            GPR:$src2, (SubReg_i8_lane imm:$lane))),
+                  (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+          (v8i16 (INSERT_SUBREG QPR:$src1,
+                  (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i16_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i16_lane imm:$lane))),
+                  (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+          (v4i32 (INSERT_SUBREG QPR:$src1,
+                  (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i32_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i32_lane imm:$lane))),
+                  (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+          (VSETLNi8  (v8i8  (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+          (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+          (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                         (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                         (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                         (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+
+//   VDUP     : Vector Duplicate (from ARM core register to all elements)
+
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+
+def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
+def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
+def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>,
+                Requires<[HasNEON, HasFastVDUP32]>;
+def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
+def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
+def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
+
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+      Requires<[HasNEON,HasFastVDUP32]>;
+def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+
+//   VDUP     : Vector Duplicate Lane (from scalar to all elements)
+
+class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType Ty, Operand IdxTy>
+  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+
+class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Operand IdxTy>
+  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+                                      VectorIndex32:$lane)))]>;
+
+// Inst{19-16} is partially specified depending on the element size.
+
+def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+
+def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32q DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+          (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i8_reg imm:$lane))),
+                           (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+          (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i16_reg imm:$lane))),
+                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+          (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i32_reg imm:$lane))),
+                            (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+          (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+          (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                             SPR:$src, ssub_0), (i32 0)))>;
+def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+          (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                             SPR:$src, ssub_0), (i32 0)))>;
+
+//   VMOVN    : Vector Narrowing Move
+defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
+                         "vmovn", "i", trunc>;
+//   VQMOVN   : Vector Saturating Narrowing Move
+defm VQMOVNs  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
+                            "vqmovn", "s", int_arm_neon_vqmovns>;
+defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
+                            "vqmovn", "u", int_arm_neon_vqmovnu>;
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
+                            "vqmovun", "s", int_arm_neon_vqmovnsu>;
+//   VMOVL    : Vector Lengthening Move
+defm VMOVLs   : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
+defm VMOVLu   : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
+def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>;
+def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>;
+def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>;
+
+// Vector Conversions.
+
+//   VCVT     : Vector Convert Between Floating-Point and Integers
+def  VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v2i32, v2f32, fp_to_sint>;
+def  VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v2i32, v2f32, fp_to_uint>;
+def  VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v2f32, v2i32, sint_to_fp>;
+def  VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v2f32, v2i32, uint_to_fp>;
+
+def  VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v4i32, v4f32, fp_to_sint>;
+def  VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v4i32, v4f32, fp_to_uint>;
+def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v4f32, v4i32, sint_to_fp>;
+def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v4f32, v4i32, uint_to_fp>;
+
+def  VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+                     v4i16, v4f16, fp_to_sint>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+                     v4i16, v4f16, fp_to_uint>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+                     v4f16, v4i16, sint_to_fp>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+                     v4f16, v4i16, uint_to_fp>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+def  VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16",
+                     v8i16, v8f16, fp_to_sint>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16",
+                     v8i16, v8f16, fp_to_uint>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16",
+                     v8f16, v8i16, sint_to_fp>,
+                Requires<[HasNEON, HasFullFP16]>;
+def  VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16",
+                     v8f16, v8i16, uint_to_fp>,
+                Requires<[HasNEON, HasFullFP16]>;
+
+// VCVT{A, N, P, M}
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+                    SDPatternOperator IntU> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def SQf : N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s16.f16", v4i16, v4f16, IntS>,
+              Requires<[HasV8, HasNEON, HasFullFP16]>;
+    def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s16.f16", v8i16, v8f16, IntS>,
+              Requires<[HasV8, HasNEON, HasFullFP16]>;
+    def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u16.f16", v4i16, v4f16, IntU>,
+              Requires<[HasV8, HasNEON, HasFullFP16]>;
+    def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u16.f16", v8i16, v8f16, IntU>,
+              Requires<[HasV8, HasNEON, HasFullFP16]>;
+  }
+}
+
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
+//   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
+let DecoderMethod = "DecodeVCVTD" in {
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+                        v4i16, v4f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+                        v4i16, v4f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+                        v4f16, v4i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+                        v4f16, v4i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
+}
+
+let DecoderMethod = "DecodeVCVTQ" in {
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+let Predicates = [HasNEON, HasFullFP16] in {
+def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16",
+                        v8i16, v8f16, int_arm_neon_vcvtfp2fxs>;
+def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16",
+                        v8i16, v8f16, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16",
+                        v8f16, v8i16, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16",
+                        v8f16, v8i16, int_arm_neon_vcvtfxu2fp>;
+} // Predicates = [HasNEON, HasFullFP16]
+}
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
+                    (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0",
+                    (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0",
+                    (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0",
+                    (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0",
+                    (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0",
+                    (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
+                    (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
+                    (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0",
+                    (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0",
+                    (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0",
+                    (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0",
+                    (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0",
+                    (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0",
+                    (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0",
+                    (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0",
+                    (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
+//   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
+def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f16.f32",
+                        v4i16, v4f32, int_arm_neon_vcvtfp2hf>,
+                Requires<[HasNEON, HasFP16]>;
+def  VCVTh2f  : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f32.f16",
+                        v4f32, v4i16, int_arm_neon_vcvthf2fp>,
+                Requires<[HasNEON, HasFP16]>;
+
+// Vector Reverse.
+
+//   VREV64   : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64", "8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
+def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+
+def VREV64q8  : VREV64Q<0b00, "vrev64", "8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
+def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+
+//   VREV32   : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32", "8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
+
+def VREV32q8  : VREV32Q<0b00, "vrev32", "8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+
+//   VREV16   : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+
+def VREV16d8  : VREV16D<0b00, "vrev16", "8", v8i8>;
+def VREV16q8  : VREV16Q<0b00, "vrev16", "8", v16i8>;
+
+// Other Vector Shuffles.
+
+//  Aligned extractions: really just dropping registers
+
+class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
+      : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
+             (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+
+def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
+
+def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>;
+
+def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>;
+
+def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
+
+def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+
+
+//   VEXT     : Vector Extract
+
+
+// All of these have a two-operand InstAlias.
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm,
+        IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
+                                     (Ty DPR:$Vm), imm:$index)))]> {
+  bits<3> index;
+  let Inst{11} = 0b0;
+  let Inst{10-8} = index{2-0};
+}
+
+class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
+        (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm,
+        IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
+                                     (Ty QPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
+}
+
+def VEXTd8  : VEXTd<"vext", "8",  v8i8, imm0_7> {
+  let Inst{10-8} = index{2-0};
+}
+def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
+  let Inst{10-9} = index{1-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
+  let Inst{10}     = index{0};
+  let Inst{9-8}    = 0b00;
+}
+def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
+                           (v2f32 DPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
+
+def VEXTq8  : VEXTq<"vext", "8",  v16i8, imm0_15> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
+  let Inst{11} = index{0};
+  let Inst{10-8}    = 0b000;
+}
+def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn),
+                           (v4f32 QPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
+
+//   VTRN     : Vector Transpose
+
+def  VTRNd8   : N2VDShuffle<0b00, 0b00001, "vtrn", "8">;
+def  VTRNd16  : N2VDShuffle<0b01, 0b00001, "vtrn", "16">;
+def  VTRNd32  : N2VDShuffle<0b10, 0b00001, "vtrn", "32">;
+
+def  VTRNq8   : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">;
+def  VTRNq16  : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">;
+def  VTRNq32  : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">;
+
+//   VUZP     : Vector Unzip (Deinterleave)
+
+def  VUZPd8   : N2VDShuffle<0b00, 0b00010, "vuzp", "8">;
+def  VUZPd16  : N2VDShuffle<0b01, 0b00010, "vuzp", "16">;
+// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VUZPq8   : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">;
+def  VUZPq16  : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">;
+def  VUZPq32  : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">;
+
+//   VZIP     : Vector Zip (Interleave)
+
+def  VZIPd8   : N2VDShuffle<0b00, 0b00011, "vzip", "8">;
+def  VZIPd16  : N2VDShuffle<0b01, 0b00011, "vzip", "16">;
+// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VZIPq8   : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">;
+def  VZIPq16  : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">;
+def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
+
+// Vector Table Lookup and Table Extension.
+
+//   VTBL     : Vector Table Lookup
+let DecoderMethod = "DecodeTBLInstruction" in {
+def  VTBL1
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
+        (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBL2
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
+        (ins VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL3
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd),
+        (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL4
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd),
+        (ins VecListFourD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTB4,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBL3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>;
+def  VTBL4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>;
+
+//   VTBX     : Vector Table Extension
+def  VTBX1
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBX2
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>;
+def  VTBX3
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTBX3,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+def  VTBX4
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBX3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX3, "$orig = $dst", []>;
+def  VTBX4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
+
+// VRINT      : Vector Rounding
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f16",
+                      v4f16, v4f16, Int>,
+             Requires<[HasV8, HasNEON, HasFullFP16]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f16",
+                      v8f16, v8f16, Int>,
+             Requires<[HasV8, HasNEON, HasFullFP16]> {
+      let Inst{9-7} = op9_7;
+    }
+  }
+
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>;
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>;
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>;
+  def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>;
+  }
+}
+
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
+// Cryptography instructions
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "v8Crypto", hasSideEffects = 0 in {
+  class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+    : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+                !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+      Requires<[HasV8, HasCrypto]>;
+}
+
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, null_frag>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, null_frag>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, null_frag>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, null_frag>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
+def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
+          (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG
+              (SHA1H (SUBREG_TO_REG (i64 0),
+                                    (f32 (COPY_TO_REGCLASS i32:$Rn, SPR)),
+                                    ssub_0)),
+              ssub_0)), GPR)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1C v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1M v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+def : Pat<(v4i32 (int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk)),
+          (SHA1P v4i32:$hash_abcd,
+                 (SUBREG_TO_REG (i64 0),
+                                (f32 (COPY_TO_REGCLASS i32:$hash_e, SPR)),
+                                ssub_0),
+                 v4i32:$wk)>;
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+class N2VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$acc, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class NVCVTIFPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode GPR:$a)),
+              (f32 (EXTRACT_SUBREG
+                     (v2f32 (Inst
+                       (INSERT_SUBREG
+                         (v2f32 (IMPLICIT_DEF)),
+                         (i32 (COPY_TO_REGCLASS GPR:$a, SPR)), ssub_0))),
+                     ssub_0))>;
+class NVCVTFIPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(i32 (OpNode SPR:$a)),
+              (i32 (EXTRACT_SUBREG
+                     (v2f32 (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
+                                                 SPR:$a, ssub_0))),
+                     ssub_0))>;
+
+def : N3VSPat<fadd, VADDfd>;
+def : N3VSPat<fsub, VSUBfd>;
+def : N3VSPat<fmul, VMULfd>;
+def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N2VSPat<fabs, VABSfd>;
+def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
+def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
+def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
+def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
+def : NVCVTIFPat<uint_to_fp, VCVTu2fd>;
+
+// NEON doesn't have any f64 conversions, so provide patterns to make
+// sure the VFP conversions match when extracting from a vector.
+def : VFPPat<(f64 (sint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))),
+             (VSITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (sint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
+             (VSITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (uint_to_fp (extractelt (v2i32 DPR:$src), imm:$lane))),
+             (VUITOD (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+def : VFPPat<(f64 (uint_to_fp (extractelt (v4i32 QPR:$src), imm:$lane))),
+             (VUITOD (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
+
+// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers.
+def : Pat<(f32 (bitconvert GPR:$a)),
+          (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+        Requires<[HasNEON, DontUseVMOVSR]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+}
+def : Pat<(v1i64 (bitconvert (f64   DPR:$src))), (v1i64 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+}
+def : Pat<(f64   (bitconvert (v1i64 DPR:$src))), (f64   DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+}
+
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+}
+
+let Predicates = [IsBE] in {
+  // 64 bit conversions
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+
+  // 128 bit conversions
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+}
+
+// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+          (VREV64q8 (VLD1q8 addrmode6:$addr))>, Requires<[IsBE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q8 addrmode6:$addr, (VREV64q8 QPR:$value))>, Requires<[IsBE]>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+          (VREV64q16 (VLD1q16 addrmode6:$addr))>, Requires<[IsBE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q16 addrmode6:$addr, (VREV64q16 QPR:$value))>, Requires<[IsBE]>;
+
+// Fold extracting an element out of a v2i32 into a vfp register.
+def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
+          (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
+// Vector lengthening move with load, matching extending loads.
+
+// extload, zextload and sextload for a standard lengthening load. Example:
+// Lengthen_Single<"8", "i16", "8"> =
+//     Pat<(v8i16 (extloadvi8 addrmode6:$addr))
+//         (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
+//                              (f64 (IMPLICIT_DEF)), (i32 0)))>;
+multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
+  let AddedComplexity = 10 in {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                    (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
+                  (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+  }
+}
+
+// extload, zextload and sextload for a lengthening load which only uses
+// half the lanes available. Example:
+// Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> =
+//     Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
+//         (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//                                      (f64 (IMPLICIT_DEF)), (i32 0))),
+//                         dsub_0)>;
+multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
+                               string InsnLanes, string InsnTy> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_HalfSingle definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_HalfSingle_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                               string InsnLanes, string InsnTy, string RevLanes> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+         (!cast<Instruction>("VREV32d" # RevLanes)
+           (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+         dsub_0)>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length.
+//
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+//     Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr))
+//         (EXTRACT_SUBREG (VMOVLuv4i32
+//           (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//                                                   (f64 (IMPLICIT_DEF)),
+//                                                   (i32 0))),
+//                           dsub_0)),
+//           dsub_0)>;
+multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_Double definition above, however with an additional parameter
+// "RevLanes" to select the correct VREV32dXX instruction. This is to convert
+// data loaded by VLD1LN into proper vector format in big endian mode.
+multiclass Lengthen_Double_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty, string RevLanes> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV32d" # RevLanes)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0))>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length, but which ends up only
+// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
+//
+// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
+//     (EXTRACT_SUBREG (VMOVLuv4i32
+//       (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
+//                                               (f64 (IMPLICIT_DEF)), (i32 0))),
+//                       dsub_0)),
+//       dsub_0)>;
+multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+}
+
+// The following class definition is basically a copy of the
+// Lengthen_HalfDouble definition above, however with an additional VREV16d8
+// instruction to convert data loaded by VLD1LN into proper vector format
+// in big endian mode.
+multiclass Lengthen_HalfDouble_Big_Endian<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+            (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0)))),
+             dsub_0)),
+          dsub_0)>;
+}
+
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
+
+let Predicates = [IsLE] in {
+  defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
+  defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
+
+  // Double lengthening - v4i8 -> v4i16 -> v4i32
+  defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i8 -> v2i16 -> v2i32
+  defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i16 -> v2i32 -> v2i64
+  defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+}
+
+let Predicates = [IsBE] in {
+  defm : Lengthen_HalfSingle_Big_Endian<"4", "i16", "i8", "8", "i16", "8">; // v4i8 -> v4i16
+  defm : Lengthen_HalfSingle_Big_Endian<"2", "i32", "i16", "4", "i32", "16">; // v2i16 -> v2i32
+
+  // Double lengthening - v4i8 -> v4i16 -> v4i32
+  defm : Lengthen_Double_Big_Endian<"4", "i32", "i8", "8", "i16", "4", "i32", "8">;
+  // v2i8 -> v2i16 -> v2i32
+  defm : Lengthen_HalfDouble_Big_Endian<"2", "i32", "i8", "8", "i16", "4", "i32">;
+  // v2i16 -> v2i32 -> v2i64
+  defm : Lengthen_Double_Big_Endian<"2", "i64", "i16", "4", "i32", "2", "i64", "16">;
+}
+
+// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+        (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+           (VLD1LNd16 addrmode6:$addr,
+                      (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+}
+// The following patterns are basically a copy of the patterns above, 
+// however with an additional VREV16d instruction to convert data
+// loaded by VLD1LN into proper vector format in big endian mode.
+let Predicates = [IsBE] in {
+  def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+        (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+  def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+        (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+           (!cast<Instruction>("VREV16d8")
+             (VLD1LNd16 addrmode6:$addr,
+                        (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>;
+def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>;
+
+// VAND/VBIC/VEOR/VORR accept but do not require a type suffix.
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+// ... two-operand aliases
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+// ... immediates
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+                    (VBICiv4i16 DPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+                    (VBICiv2i32 DPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+                    (VBICiv8i16 QPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+                    (VBICiv4i32 QPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+
+
+// VLD1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+
+def VLD1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+
+def VST1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
+def VST1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                  pred:$p)>;
+def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+
+def VLD2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VLD2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VLD2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VLD2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+
+def VST2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
+def VST2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
+def VST2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
+def VST2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+
+def VLD3DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
+def VLD3DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VLD3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+
+def VLD3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VLD3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VLD3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VLD3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+
+def VST3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
+def VST3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+
+def VST3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
+def VST3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD4 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+
+def VLD4DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
+def VLD4DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VLD4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+
+def VLD4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VLD4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VLD4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+
+// VLD4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+
+def VLD4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
+def VLD4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+
+def VST4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
+def VST4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
+def VST4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
+def VST4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
+
+
+// VST4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+
+def VST4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
+def VST4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VMOV/VMVN takes an optional datatype suffix
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm",
+                    (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm",
+                    (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm",
+                    (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+let Predicates = [HasNEON, HasFullFP16] in
+def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm",
+                    (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VSWP allows, but does not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VBIF, VBIT, and VBSL allow, but do not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+
+// "vmov Rd, #-imm" can be handled via "vmvn".
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+
+// 'gas' compatibility aliases for quad-word instructions. Strictly speaking,
+// these should restrict to just the Q register variants, but the register
+// classes are enough to match correctly regardless, so we keep it simple
+// and just use MnemonicAlias.
+def : NEONMnemonicAlias<"vbicq", "vbic">;
+def : NEONMnemonicAlias<"vandq", "vand">;
+def : NEONMnemonicAlias<"veorq", "veor">;
+def : NEONMnemonicAlias<"vorrq", "vorr">;
+
+def : NEONMnemonicAlias<"vmovq", "vmov">;
+def : NEONMnemonicAlias<"vmvnq", "vmvn">;
+// Explicit versions for floating point so that the FPImm variants get
+// handled early. The parser gets confused otherwise.
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">;
+def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">;
+
+def : NEONMnemonicAlias<"vaddq", "vadd">;
+def : NEONMnemonicAlias<"vsubq", "vsub">;
+
+def : NEONMnemonicAlias<"vminq", "vmin">;
+def : NEONMnemonicAlias<"vmaxq", "vmax">;
+
+def : NEONMnemonicAlias<"vmulq", "vmul">;
+
+def : NEONMnemonicAlias<"vabsq", "vabs">;
+
+def : NEONMnemonicAlias<"vshlq", "vshl">;
+def : NEONMnemonicAlias<"vshrq", "vshr">;
+
+def : NEONMnemonicAlias<"vcvtq", "vcvt">;
+
+def : NEONMnemonicAlias<"vcleq", "vcle">;
+def : NEONMnemonicAlias<"vceqq", "vceq">;
+
+def : NEONMnemonicAlias<"vzipq", "vzip">;
+def : NEONMnemonicAlias<"vswpq", "vswp">;
+
+def : NEONMnemonicAlias<"vrecpeq.f32", "vrecpe.f32">;
+def : NEONMnemonicAlias<"vrecpeq.u32", "vrecpe.u32">;
+
+
+// Alias for loading floating point immediates that aren't representable
+// using the vmov.f32 encoding but the bitpattern is representable using
+// the .i32 encoding.
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 000000000000..a681f64b05e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,1610 @@
+//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+  unsigned Imm = N->getZExtValue();
+  return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+  uint64_t Imm = N->getZExtValue();
+  return Imm > 0 && Imm <= 32;
+}], imm_sr_XFORM> {
+  let PrintMethod = "printThumbSRImm";
+  let ParserMatchClass = ThumbSRImmAsmOperand;
+}
+
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)-N->getZExtValue() < 8;
+}], imm_neg_XFORM>;
+
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+  return ~((uint32_t)N->getZExtValue()) < 256;
+}]>;
+
+def imm8_255 : ImmLeaf<i32, [{
+  return Imm >= 8 && Imm < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+  unsigned Val = -N->getZExtValue();
+  return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
+
+// Break imm's up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
+// to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+  return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
+}]>;
+
+def imm256_510 : ImmLeaf<i32, [{
+  return Imm >= 256 && Imm < 511;
+}]>;
+
+def thumb_imm256_510_addend : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32);
+}]>;
+
+// Scaled 4 immediate.
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_1020s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_508s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// Alias use only, so no printer is necessary.
+def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
+def t_imm0_508s4_neg : Operand<i32> {
+  let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Define Thumb specific addressing modes.
+
+// unsigned 8-bit, 2-scaled memory offset
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+  let Name = "UnsignedOffset_b8s2";
+  let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// thumb style PC relative operand. signed, 8 bits magnitude,
+// two bits shift. can be represented as either [pc, #imm], #imm,
+// or relocatable expression...
+def ThumbMemPC : AsmOperandClass {
+  let Name = "ThumbMemPC";
+}
+
+let OperandType = "OPERAND_PCREL" in {
+def t_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getThumbBRTargetOpValue";
+  let DecoderMethod = "DecodeThumbBROperand";
+}
+
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<2>";
+  let ParserMatchClass = UnsignedOffset_b8s2;
+}
+
+
+def thumb_br_target : Operand<OtherVT> {
+  let ParserMatchClass = ThumbBranchTarget;
+  let EncoderMethod = "getThumbBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+def thumb_bl_target : Operand<i32> {
+  let ParserMatchClass = ThumbBranchTarget;
+  let EncoderMethod = "getThumbBLTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLTargetOperand";
+}
+
+// Target for BLX *from* thumb mode.
+def thumb_blx_target : Operand<i32> {
+  let ParserMatchClass = ARMBranchTarget;
+  let EncoderMethod = "getThumbBLXTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLXOffset";
+}
+
+def thumb_bcc_target : Operand<OtherVT> {
+  let ParserMatchClass = ThumbBranchTarget;
+  let EncoderMethod = "getThumbBCCTargetOpValue";
+  let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def thumb_cb_target : Operand<OtherVT> {
+  let ParserMatchClass = ThumbBranchTarget;
+  let EncoderMethod = "getThumbCBTargetOpValue";
+  let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : MemOperand {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let DecoderMethod = "DecodeThumbAddrModePC";
+  let PrintMethod = "printThumbLdrLabelOperand";
+  let ParserMatchClass = ThumbMemPC;
+}
+}
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
+def t_addrmode_rr : MemOperand,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_rrs := reg + reg
+//
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
+def t_addrmode_rrs1 : MemOperand,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs2 : MemOperand,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs4 : MemOperand,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_is4 := reg + imm5 * 4
+//
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
+def t_addrmode_is4 : MemOperand,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S4Operand";
+  let ParserMatchClass = t_addrmode_is4_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is2 := reg + imm5 * 2
+//
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
+def t_addrmode_is2 : MemOperand,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S2Operand";
+  let ParserMatchClass = t_addrmode_is2_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is1 := reg + imm5
+//
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
+def t_addrmode_is1 : MemOperand,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S1Operand";
+  let ParserMatchClass = t_addrmode_is1_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
+def t_addrmode_sp : MemOperand,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let EncoderMethod = "getAddrModeThumbSPOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeSP";
+  let PrintMethod = "printThumbAddrModeSPOperand";
+  let ParserMatchClass = t_addrmode_sp_asm_operand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def tADJCALLSTACKUP :
+  PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+             [(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+
+def tADJCALLSTACKDOWN :
+  PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+             [(ARMcallseq_start imm:$amt)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+}
+
+class T1SystemEncoding<bits<8> opc>
+  : T1Encoding<0b101111> {
+  let Inst{9-8} = 0b11;
+  let Inst{7-0} = opc;
+}
+
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
+                 [(int_arm_hint imm0_15:$imm)]>,
+            T1SystemEncoding<0x00>,
+            Requires<[IsThumb, HasV6M]> {
+  bits<4> imm;
+  let Inst{7-4} = imm;
+}
+
+// Note: When EmitPriority == 1, the alias will be used for printing
+class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
+  let Predicates = [IsThumb, HasV6M];
+}
+
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
+  let Predicates = [IsThumb2, HasV8];
+}
+
+// The imm operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+                []>,
+           T1Encoding<0b101111> {
+  let Inst{9-8} = 0b10;
+  // A8.6.22
+  bits<8> val;
+  let Inst{7-0} = val;
+}
+// default immediate for breakpoint mnemonic
+def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;
+
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+                []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+  let Inst{9-6} = 0b1010;
+  bits<6> val;
+  let Inst{5-0} = val;
+}
+
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+                  []>, T1Encoding<0b101101>, Requires<[IsNotMClass]>, Deprecated<HasV8Ops> {
+  bits<1> end;
+  // A8.6.156
+  let Inst{9-5} = 0b10010;
+  let Inst{4}   = 1;
+  let Inst{3}   = end;
+  let Inst{2-0} = 0b000;
+}
+
+// Change Processor State is a system instruction -- for disassembly only.
+def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
+                NoItinerary, "cps$imod $iflags", []>,
+           T1Misc<0b0110011> {
+  // A8.6.38 & B6.1.1
+  bit imod;
+  bits<3> iflags;
+
+  let Inst{4}   = imod;
+  let Inst{3}   = 0;
+  let Inst{2-0} = iflags;
+  let DecoderMethod = "DecodeThumbCPS";
+}
+
+// For both thumb1 and thumb2.
+let isNotDuplicable = 1, isCodeGenOnly = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.6
+  bits<3> dst;
+  let Inst{6-3} = 0b1111; // Rm = pc
+  let Inst{2-0} = dst;
+}
+
+// ADD <Rd>, sp, #<imm8>
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+                    IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
+               T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
+  // A6.2 & A8.6.8
+  bits<3> dst;
+  bits<8> imm;
+  let Inst{10-8} = dst;
+  let Inst{7-0}  = imm;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+// Thumb1 frame lowering is rather fragile, we hope to be able to use
+// tADDrSPi, but we may need to insert a sequence that clobbers CPSR.
+def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset),
+                           NoItinerary, []>,
+                Requires<[IsThumb, IsThumb1Only]> {
+  let Defs = [CPSR];
+}
+
+// ADD sp, sp, #<imm7>
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                     IIC_iALUi, "add", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
+  // A6.2.5 & A8.6.8
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+// SUB sp, sp, #<imm7>
+// FIXME: The encoding and the ASM string don't match up.
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                    IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
+  // A6.2.5 & A8.6.214
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+def : tInstAlias<"add${p} sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
+
+// ADD <Rm>, sp
+def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
+                   "add", "\t$Rdn, $sp, $Rn", []>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.9 Encoding T1
+  bits<4> Rdn;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = 0b1101;
+  let Inst{2-0} = Rdn{2-0};
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+// ADD sp, <Rm>
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+                  "add", "\t$Rdn, $Rm", []>,
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.9 Encoding T2
+  bits<4> Rm;
+  let Inst{7} = 1;
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = 0b101;
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+            T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+    let Unpredictable{2-0} = 0b111;
+  }
+  def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
+              Requires<[IsThumb, Has8MSecExt]>,
+              T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b100;
+    let Unpredictable{1-0} = 0b11;
+  }
+}
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
+                   [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
+
+  // Alternative return instruction used by vararg functions.
+  def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
+                   2, IIC_Br, [],
+                   (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+// All calls clobber the non-callee saved registers. SP is marked as a use to
+// prevent stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let isCall = 1,
+  Defs = [LR], Uses = [SP] in {
+  // Also used for Thumb2
+  def tBL  : TIx2<0b11110, 0b11, 1,
+                  (outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
+                  "bl${p}\t$func",
+                  [(ARMcall tglobaladdr:$func)]>,
+             Requires<[IsThumb]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-0} = func{10-0};
+  }
+
+  // ARMv5T and above, also used for Thumb2
+  def tBLXi : TIx2<0b11110, 0b11, 0,
+                 (outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
+                   "blx${p}\t$func", []>,
+              Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-1} = func{10-1};
+    let Inst{0} = 0; // func{0} is assumed zero
+  }
+
+  // Also used for Thumb2
+  def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
+                  "blx${p}\t$func",
+                  [(ARMcall GPR:$func)]>,
+              Requires<[IsThumb, HasV5T]>,
+              T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
+
+  // ARMv8-M Security Extensions
+  def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
+                   "blxns${p}\t$func", []>,
+                Requires<[IsThumb, Has8MSecExt]>,
+                T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b100;
+    let Unpredictable{1-0} = 0b11;
+  }
+
+  // ARMv4T
+  def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
+                  4, IIC_Br,
+                  [(ARMcall_nolink tGPR:$func)]>,
+            Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  let isPredicable = 1 in
+  def tB   : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+                 "b", "\t$target", [(br bb:$target)]>,
+             T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
+    bits<11> target;
+    let Inst{10-0} = target;
+    let AsmMatchConverter = "cvtThumbBranches";
+ }
+
+  // Far jump
+  // Just a pseudo for a tBL instruction. Needed to let regalloc know about
+  // the clobber of LR.
+  let Defs = [LR] in
+  def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
+                          4, IIC_Br, [],
+                          (tBL pred:$p, thumb_bl_target:$target)>,
+                          Sched<[WriteBrTbl]>;
+
+  def tBR_JTr : tPseudoInst<(outs),
+                      (ins tGPR:$target, i32imm:$jt),
+                      0, IIC_Br,
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
+                      Sched<[WriteBrTbl]> {
+    let Size = 2;
+    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+  def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
+                 "b${p}\t$target",
+                 [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
+             T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
+  bits<4> p;
+  bits<8> target;
+  let Inst{11-8} = p;
+  let Inst{7-0} = target;
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+
+// Tail calls
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS versions.
+  let Uses = [SP] in {
+    def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
+                     4, IIC_Br, [],
+                     (tBX GPR:$dst, (ops 14, zero_reg))>,
+                     Requires<[IsThumb]>, Sched<[WriteBr]>;
+  }
+  // tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
+  // on MachO), so it's in ARMInstrThumb2.td.
+  // Non-MachO version:
+  let Uses = [SP] in {
+    def tTAILJMPdND : tPseudoExpand<(outs),
+                   (ins t_brtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (tB t_brtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
+  }
+}
+
+
+// A8.6.218 Supervisor Call (Software Interrupt)
+// A8.6.16 B: Encoding T1
+// If Inst{11-8} == 0b1111 then SEE SVC
+let isCall = 1, Uses = [SP] in
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
+                "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
+  bits<8> imm;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-0}   = imm;
+}
+
+// The assembler uses 0xDEFE for a trap instruction.
+let isBarrier = 1, isTerminator = 1 in
+def tTRAP : TI<(outs), (ins), IIC_Br,
+               "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
+  let Inst = 0xdefe;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load Store Instructions.
+//
+
+// PC-relative loads need to be matched first as constant pool accesses need to
+// always be PC-relative. We do this using AddedComplexity, as the pattern is
+// simpler than the patterns of the other load instructions.
+let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                  "ldr", "\t$Rt, $addr",
+                  [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+              T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
+// SP-relative loads should be matched before standard immediate-offset loads as
+// it means we avoid having to move SP to another register.
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+                    "ldr", "\t$Rt, $addr",
+                    [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+              T1LdStSP<{1,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+// Loads: reg/reg and reg/imm5
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  // Immediate-offset loads should be matched before register-offset loads as
+  // when the offset is a constant it's simpler to first check if it fits in the
+  // immediate offset field then fall back to register-offset if it doesn't.
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 1 /* Load */,
+                      (outs tGPR:$Rt), (ins AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+  // Register-offset loads are matched last.
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+                      (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR  : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iLoad_r, IIC_iLoad_i, "ldr",
+                                load>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+                                zextloadi8>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+                                zextloadi16>;
+
+let AddedComplexity = 10 in
+def tLDRSB :                    // A8.6.80
+  T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_1, IIC_iLoad_bh_r,
+                 "ldrsb", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+let AddedComplexity = 10 in
+def tLDRSH :                    // A8.6.84
+  T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_2, IIC_iLoad_bh_r,
+                 "ldrsh", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+                    "str", "\t$Rt, $addr",
+                    [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iStore_r, IIC_iStore_i, "str",
+                                store>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+                                truncstorei8>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
+                               t_addrmode_is2, AddrModeT1_2,
+                               IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                               truncstorei16>;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// These require base address to be written back or one of the loaded regs.
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+        IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA_UPD :
+    InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "$Rn = $wb", IIC_iLoad_mu>,
+    PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+  let Size = 2;
+  let OutOperandList = (outs GPR:$wb);
+  let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+  let Pattern = [];
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+                         (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+                         AddrModeNone, 2, IIC_iStore_mu,
+                         "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+                     T1Encoding<{1,1,0,0,0,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+} // hasSideEffects
+
+def : InstAlias<"ldm${p} $Rn!, $regs",
+                (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
+        Requires<[IsThumb, IsThumb1Only]>;
+
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+               IIC_iPop,
+               "pop${p}\t$regs", []>,
+           T1Misc<{1,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{15};
+  let Inst{7-0} = regs{7-0};
+}
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                IIC_iStore_m,
+                "push${p}\t$regs", []>,
+            T1Misc<{0,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{14};
+  let Inst{7-0} = regs{7-0};
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+// Helper classes for encoding T1pI patterns:
+class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rn;
+}
+class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
+                     string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1Misc<opA> {
+  bits<3> Rm;
+  bits<3> Rd;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sI patterns:
+class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  bits<3> Rd;
+  let Inst{8-6} = Rm;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                       string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sIt patterns:
+class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rdn;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rdn;
+}
+class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                        string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rdn;
+  bits<8> imm8;
+  let Inst{10-8} = Rdn;
+  let Inst{7-0}  = imm8;
+}
+
+let isAdd = 1 in {
+  // Add with carry register
+  let isCommutable = 1, Uses = [CPSR] in
+  def tADC :                      // A8.6.2
+    T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
+                  "adc", "\t$Rdn, $Rm",
+                  [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+  // Add immediate
+  def tADDi3 :                    // A8.6.4 T1
+    T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                     IIC_iALUi,
+                     "add", "\t$Rd, $Rm, $imm3",
+                     [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
+                     Sched<[WriteALU]> {
+    bits<3> imm3;
+    let Inst{8-6} = imm3;
+  }
+
+  def tADDi8 :                    // A8.6.4 T2
+    T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+                      (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                      "add", "\t$Rdn, $imm8",
+                      [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
+                      Sched<[WriteALU]>;
+
+  // Add register
+  let isCommutable = 1 in
+  def tADDrr :                    // A8.6.6 T1
+    T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                  IIC_iALUr,
+                  "add", "\t$Rd, $Rn, $Rm",
+                  [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+  let hasSideEffects = 0 in
+  def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
+                       "add", "\t$Rdn, $Rm", []>,
+                 T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
+    // A8.6.6 T2
+    bits<4> Rdn;
+    bits<4> Rm;
+    let Inst{7}   = Rdn{3};
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = Rdn{2-0};
+  }
+}
+
+// AND register
+let isCommutable = 1 in
+def tAND :                      // A8.6.12
+  T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "and", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// ASR immediate
+def tASRri :                    // A8.6.14
+  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "asr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// ASR register
+def tASRrr :                    // A8.6.15
+  T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "asr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// BIC register
+def tBIC :                      // A8.6.20
+  T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "bic", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
+                Sched<[WriteALU]>;
+
+// CMN register
+let isCompare = 1, Defs = [CPSR] in {
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//def tCMN :                     // A8.6.33
+//  T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
+//               IIC_iCMPr,
+//               "cmn", "\t$lhs, $rhs",
+//               [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMNz :                     // A8.6.33
+  T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmn", "\t$Rn, $Rm",
+               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
+
+} // isCompare = 1, Defs = [CPSR]
+
+// CMP immediate
+let isCompare = 1, Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
+                  "cmp", "\t$Rn, $imm8",
+                  [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
+             T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
+  // A8.6.35
+  bits<3> Rn;
+  bits<8> imm8;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = imm8;
+}
+
+// CMP register
+def tCMPr :                     // A8.6.36 T1
+  T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmp", "\t$Rn, $Rm",
+               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
+                   "cmp", "\t$Rn, $Rm", []>,
+              T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
+  // A8.6.36 T2
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{7}   = Rn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rn{2-0};
+}
+} // isCompare = 1, Defs = [CPSR]
+
+
+// XOR register
+let isCommutable = 1 in
+def tEOR :                      // A8.6.45
+  T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "eor", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSL immediate
+def tLSLri :                    // A8.6.88
+  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
+                   IIC_iMOVsi,
+                   "lsl", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSL register
+def tLSLrr :                    // A8.6.89
+  T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsl", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// LSR immediate
+def tLSRri :                    // A8.6.90
+  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "lsr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSR register
+def tLSRrr :                    // A8.6.91
+  T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Move register
+let isMoveImm = 1 in
+def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
+                  "mov", "\t$Rd, $imm8",
+                  [(set tGPR:$Rd, imm0_255:$imm8)]>,
+             T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.96
+  bits<3> Rd;
+  bits<8> imm8;
+  let Inst{10-8} = Rd;
+  let Inst{7-0}  = imm8;
+}
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
+
+// A7-73: MOV(2) - mov setting flag.
+
+let hasSideEffects = 0 in {
+def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
+                      2, IIC_iMOVr,
+                      "mov", "\t$Rd, $Rm", "", []>,
+                  T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{7}   = Rd{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rd{2-0};
+}
+let Defs = [CPSR] in
+def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                      "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
+  // A8.6.97
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{15-6} = 0b0000000000;
+  let Inst{5-3}  = Rm;
+  let Inst{2-0}  = Rd;
+}
+} // hasSideEffects
+
+// Multiply register
+let isCommutable = 1 in
+def tMUL :                      // A8.6.105 T1
+  Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+           IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+           [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+      T1DataProcessing<0b1101> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+  let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+                                               pred:$p)>;
+
+// Move inverse register
+def tMVN :                      // A8.6.107
+  T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
+               "mvn", "\t$Rd, $Rn",
+               [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Bitwise or register
+let isCommutable = 1 in
+def tORR :                      // A8.6.114
+  T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "orr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+
+// Swaps
+def tREV :                      // A8.6.134
+  T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREV16 :                    // A8.6.135
+  T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev16", "\t$Rd, $Rm",
+             [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
+                Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+def tREVSH :                    // A8.6.136
+  T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "revsh", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Rotate right register
+def tROR :                      // A8.6.139
+  T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "ror", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Negate register
+def tRSB :                      // A8.6.141
+  T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
+               IIC_iALUi,
+               "rsb", "\t$Rd, $Rn, #0",
+               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
+
+// Subtract with carry register
+let Uses = [CPSR] in
+def tSBC :                      // A8.6.151
+  T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sbc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Subtract immediate
+def tSUBi3 :                    // A8.6.210 T1
+  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                   IIC_iALUi,
+                   "sub", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
+                   Sched<[WriteALU]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
+
+def tSUBi8 :                    // A8.6.210 T2
+  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                    "sub", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
+                    Sched<[WriteALU]>;
+
+// Subtract register
+def tSUBrr :                    // A8.6.212
+  T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sub", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
+
+// Sign-extend byte
+def tSXTB :                     // A8.6.222
+  T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Sign-extend short
+def tSXTH :                     // A8.6.224
+  T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Test
+let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
+def tTST :                      // A8.6.230
+  T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
+               "tst", "\t$Rn, $Rm",
+               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
+               Sched<[WriteALU]>;
+
+// A8.8.247  UDF - Undefined (Encoding T1)
+def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
+              [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
+  bits<8> imm8;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8} = 0b1110;
+  let Inst{7-0} = imm8;
+}
+
+def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
+                    [(int_arm_undefined 249)]>, Encoding16,
+    Requires<[IsThumb, IsWindows]> {
+  let Inst = 0xdef9;
+  let isTerminator = 1;
+}
+
+// Zero-extend byte
+def tUXTB :                     // A8.6.262
+  T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
+
+// Zero-extend short
+def tUXTH :                     // A8.6.264
+  T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
+
+// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
+// Expanded after instruction selection into a branch sequence.
+let usesCustomInserter = 1 in  // Expanded after instruction selection.
+  def tMOVCCr_pseudo :
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+             NoItinerary,
+             [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+
+def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
+               IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
+               T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
+  bits<3> Rd;
+  bits<8> addr;
+  let Inst{10-8} = Rd;
+  let Inst{7-0} = addr;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+let hasSideEffects = 0, isReMaterializable = 1 in
+def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+let hasSideEffects = 1 in
+def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
+                              (ins i32imm:$label, pred:$p),
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
+
+// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
+// and make use of the same compressed jump table format as Thumb-2.
+let Size = 2 in {
+def tTBB_JT : tPseudoInst<(outs),
+        (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+
+def tTBH_JT : tPseudoInst<(outs),
+        (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
+def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
+                          [(set R0, ARMthread_pointer)]>,
+                          Sched<[WriteBr]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//
+
+// eh_sjlj_setjmp() is an instruction sequence to store the return address and
+// save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everything out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved resgisters, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
+    hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                                  AddrModeNone, 0, NoItinerary, "","",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R7, LR, SP ] in
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+                              AddrModeNone, 0, IndexModeNone,
+                              Pseudo, NoItinerary, "", "",
+                              [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                             Requires<[IsThumb,IsNotWindows]>;
+
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R11, LR, SP ] in
+def tInt_WIN_eh_sjlj_longjmp
+  : XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
+       Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+    Requires<[IsThumb,IsWindows]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+            (tCMPi8  tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+            (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
+
+// Add with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
+            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
+            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
+            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
+            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
+            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
+            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Bswap 16 with load/store
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
+            (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
+            (tREV16 (tLDRHr t_addrmode_rr:$addr))>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_is2:$addr),
+            (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_rr:$addr),
+            (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;
+
+// ConstantPool
+def : T1Pat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
+
+// GlobalAddress
+def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
+                                  IIC_iLoadiALU,
+                                  [(set tGPR:$dst,
+                                        (ARMWrapperPIC tglobaladdr:$addr))]>,
+                       Requires<[IsThumb, DontUseMovt]>;
+
+def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
+                                IIC_iLoad_i,
+                                [(set tGPR:$dst,
+                                      (ARMWrapper tglobaladdr:$src))]>,
+                     Requires<[IsThumb, DontUseMovt]>;
+
+// TLS globals
+def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
+          (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
+      Requires<[IsThumb, DontUseMovt]>;
+def : Pat<(ARMWrapper tglobaltlsaddr:$addr),
+          (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>,
+      Requires<[IsThumb, DontUseMovt]>;
+
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst),
+            (tLEApcrelJT tjumptable:$dst)>;
+
+// Direct calls
+def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
+      Requires<[IsThumb]>;
+
+// zextload i1 -> zextload i8
+def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
+            (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_rr:$addr),
+            (tLDRBr t_addrmode_rr:$addr)>;
+
+// extload from the stack -> word load from the stack, as it avoids having to
+// materialize the base in a separate register. This only works when a word
+// load puts the byte/halfword value in the same place in the register that the
+// byte/halfword load would, i.e. when little-endian.
+def : T1Pat<(extloadi1  t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi8  t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
+      Requires<[IsThumb, IsThumb1Only, IsLE]>;
+
+// extload -> zextload
+def : T1Pat<(extloadi1  t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi1  t_addrmode_rr:$addr),  (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_rr:$addr),  (tLDRBr t_addrmode_rr:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rr:$addr),  (tLDRHr t_addrmode_rr:$addr)>;
+
+// post-inc loads and stores
+
+// post-inc LDR -> LDM r0!, {r1}. The way operands are layed out in LDMs is
+// different to how ISel expects them for a post-inc load, so use a pseudo
+// and expand it just after ISel.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
+ def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
+                               (ins rGPR:$Rn, pred:$p),
+                               4, IIC_iStore_ru,
+                               []>;
+
+// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
+// multiple registers) is the same in ISel as MachineInstr, so there's no need
+// for a pseudo.
+def : T1Pat<(post_store rGPR:$Rt, rGPR:$Rn, 4),
+            (tSTMIA_UPD rGPR:$Rn, rGPR:$Rt)>;
+
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+            (tSXTB (tLDRBr t_addrmode_rr:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+            (tSXTH (tLDRHr t_addrmode_rr:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
+            (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
+            (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>;
+
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+             (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rr:$src),
+             (tLDRBr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+             (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rr:$src),
+             (tLDRHr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+             (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rr:$src),
+             (tLDRr t_addrmode_rr:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+             (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val),
+             (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+             (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val),
+             (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+             (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val),
+             (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : T1Pat<(i32 thumb_immshifted:$src),
+            (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                    (thumb_immshifted_shamt imm:$src))>;
+
+def : T1Pat<(i32 imm0_255_comp:$src),
+            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+def : T1Pat<(i32 imm256_510:$src),
+            (tADDi8 (tMOVi8 255),
+                    (thumb_imm256_510_addend imm:$src))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                             NoItinerary,
+               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb, IsThumb1Only]>;
+
+// Pseudo-instruction for merged POP and return.
+// FIXME: remove when we have a way to marking a MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                           2, IIC_iPop_Br, [],
+                           (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
+
+// Indirect branch using "mov pc, $Rm"
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
+                  2, IIC_Br, [(brind GPR:$Rm)],
+                  (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
+}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+                 (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
+
+// Implied destination operand forms for shifts.
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
+             (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
+def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
+             (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+def : tInstAlias<"asr${s}${p} $Rdm, $imm",
+             (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def tLDRConstPool
+  : tAsmPseudo<"ldr${p} $Rt, $immediate",
+               (ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 000000000000..603d66403e65
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,4673 @@
+//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+  let Name = "ITCondCode";
+  let ParserMethod = "parseITCondCode";
+}
+def it_pred : Operand<i32> {
+  let PrintMethod = "printMandatoryPredicateOperand";
+  let ParserMatchClass = it_pred_asmoperand;
+}
+
+// IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
+def it_mask : Operand<i32> {
+  let PrintMethod = "printThumbITMask";
+  let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 not allowed
+def t2_shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+  let DecoderMethod = "DecodeT2ShifterImmOperand";
+}
+
+// Shifted operands. No register controlled shifts for Thumb2.
+// Note: We do not support rrx shifted operands yet.
+def t2_so_reg : Operand<i32>,    // reg imm
+                ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+                               [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getT2SORegOpValue";
+  let PrintMethod = "printT2SOOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops rGPR, i32imm);
+}
+
+// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
+def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
+def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-((int)N->getZExtValue()), SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+// so_imm_notSext_XFORM - Return a so_imm value packed into the format
+// described for so_imm_notSext def below, with sign extension from 16
+// bits.
+def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
+  APInt apIntN = N->getAPIntValue();
+  unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+  return CurDAG->getTargetConstant(~N16bitSignExt, SDLoc(N), MVT::i32);
+}]>;
+
+// t2_so_imm - Match a 32-bit immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
+// immediate splatted into multiple bytes of the word.
+def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }]> {
+  let ParserMatchClass = t2_so_imm_asmoperand;
+  let EncoderMethod = "getT2SOImmOpValue";
+  let DecoderMethod = "DecodeT2SOImm";
+}
+
+// t2_so_imm_not - Match an immediate that is a complement
+// of a t2_so_imm.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use t2_so_imm.
+def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; }
+def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm
+// if the upper 16 bits are zero.
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+    APInt apIntN = N->getAPIntValue();
+    if (!apIntN.isIntN(16)) return false;
+    unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+    return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+  }], t2_so_imm_notSext16_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
+def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+  int64_t Value = -(int)N->getZExtValue();
+  return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+}], t2_so_imm_neg_XFORM> {
+  let ParserMatchClass = t2_so_imm_neg_asmoperand;
+}
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
+def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 4096;
+}]> {
+  let ParserMatchClass = imm0_4095_asmoperand;
+}
+
+def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; }
+def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 4096;
+}], imm_neg_XFORM> {
+  let ParserMatchClass = imm0_4095_neg_asmoperand;
+}
+
+def imm1_255_neg : PatLeaf<(i32 imm), [{
+  uint32_t Val = -N->getZExtValue();
+  return (Val > 0 && Val < 255);
+}], imm_neg_XFORM>;
+
+def imm0_255_not : PatLeaf<(i32 imm), [{
+  return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
+def lo5AllOne : PatLeaf<(i32 imm), [{
+  // Returns true if all low 5-bits are 1.
+  return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12  := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
+def t2addrmode_imm12 : MemOperand,
+                       ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+  let PrintMethod = "printAddrModeImm12Operand<false>";
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm12";
+  let ParserMatchClass = t2addrmode_imm12_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2ldrlabel  := imm12
+def t2ldrlabel : Operand<i32> {
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
+def t2ldr_pcrel_imm12 : Operand<i32> {
+  let ParserMatchClass = t2ldr_pcrel_imm12_asmoperand;
+  // used for assembler pseudo instruction and maps to t2ldrlabel, so
+  // doesn't need encoder or print methods of its own.
+}
+
+// ADR instruction labels.
+def t2adrlabel : Operand<i32> {
+  let EncoderMethod = "getT2AdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<0>";
+}
+
+// t2addrmode_posimm8  := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : MemOperand {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8  := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : MemOperand,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm8  := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+class T2AddrMode_Imm8 : MemOperand,
+                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8 : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+def t2addrmode_imm8_pre : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<true>";
+}
+
+def t2am_imm8_offset : MemOperand,
+                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
+                                      [], [SDNPWantRoot]> {
+  let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+  let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
+  let DecoderMethod = "DecodeT2Imm8";
+}
+
+// t2addrmode_imm8s4  := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
+class T2AddrMode_Imm8s4 : MemOperand {
+  let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8s4";
+  let ParserMatchClass = MemImm8s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
+def t2am_imm8s4_offset : MemOperand {
+  let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+  let EncoderMethod = "getT2Imm8s4OpValue";
+  let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4  := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+  let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : MemOperand,
+                         ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> {
+  let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+  let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+  let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_so_reg  := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";}
+def t2addrmode_so_reg : MemOperand,
+                        ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
+  let PrintMethod = "printT2AddrModeSoRegOperand";
+  let EncoderMethod = "getT2AddrModeSORegOpValue";
+  let DecoderMethod = "DecodeT2AddrModeSOReg";
+  let ParserMatchClass = t2addrmode_so_reg_asmoperand;
+  let MIOperandInfo = (ops GPRnopc:$base, rGPR:$offsreg, i32imm:$offsimm);
+}
+
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; }
+def addrmode_tbb : MemOperand {
+  let PrintMethod = "printAddrModeTBB";
+  let ParserMatchClass = addrmode_tbb_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; }
+def addrmode_tbh : MemOperand {
+  let PrintMethod = "printAddrModeTBH";
+  let ParserMatchClass = addrmode_tbh_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+
+class T2OneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{19-16}  = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2TwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sTwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+
+class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2FourReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
+                string opc, list<dag> pattern>
+  : T2I<(outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+         opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, string opc>
+  : T2I<(outs rGPR:$RdLo, rGPR:$RdHi),
+        (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+        opc, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+        RegConstraint<"$RLo = $RdLo, $RHi = $RdHi"> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+
+/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// binary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDPatternOperator opnode, bit Commutable = 0,
+                     string wide = ""> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
+                 opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+                 Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+                 Sched<[WriteALUsi, ReadALU]>  {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+  // Assembly aliases for optional destination operand when it's the same
+  // as the source operand.
+  def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_imm:$imm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_reg:$shift, pred:$p,
+                                                    cc_out:$s)>;
+}
+
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+//  the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDPatternOperator opnode, bit Commutable = 0> :
+    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
+  // Assembler aliases w/ the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p,
+                                    cc_out:$s)>;
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+                                    pred:$p, cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+}
+
+/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, SDNode opnode> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, ".w\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>,
+                 Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, "\t$Rd, $Rn, $Rm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>,
+                 Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+
+/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                         InstrItinClass iis, SDNode opnode,
+                         bit Commutable = 0> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_imm:$imm))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // register
+   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                rGPR:$Rm))]>,
+            Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+   }
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, iis,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_reg:$ShiftedRm))]>,
+            Sched<[WriteALUsi, ReadALUsr]>;
+}
+}
+
+/// T2I_rbin_s_is -  Same as T2I_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_rbin_s_is<SDNode opnode> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, IIC_iALUi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALU, ReadALU]>;
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, IIC_iALUsi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+}
+}
+
+/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
+/// patterns for a binary operation that produces a value.
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, SDNode opnode,
+                          bit Commutable = 0> {
+   // shifted imm
+   // The register-immediate version is re-materializable. This is useful
+   // in particular for taking the address of a local.
+   let isReMaterializable = 1 in {
+   def ri : T2sTwoRegImm<
+               (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+               opc, ".w\t$Rd, $Rn, $imm",
+               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+               Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{15} = 0;
+   }
+   }
+   // 12-bit imm
+   def ri12 : T2I<
+                  (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+                  Sched<[WriteALU, ReadALU]> {
+     bits<4> Rd;
+     bits<4> Rn;
+     bits<12> imm;
+     let Inst{31-27} = 0b11110;
+     let Inst{26} = imm{11};
+     let Inst{25-24} = 0b10;
+     let Inst{23-21} = op23_21;
+     let Inst{20} = 0; // The S bit.
+     let Inst{19-16} = Rn;
+     let Inst{15} = 0;
+     let Inst{14-12} = imm{10-8};
+     let Inst{11-8} = Rd;
+     let Inst{7-0} = imm{7-0};
+   }
+   // register
+   def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+                 IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+              Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+   }
+}
+
+/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
+/// for a binary operation that produces a value and use the carry
+/// bit. It's not predicable.
+let Defs = [CPSR], Uses = [CPSR] in {
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
+                             bit Commutable = 0> {
+   // shifted imm
+   def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                 IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+         [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
+                 Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+}
+
+/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
+//  rotate operation that produces a value.
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, SDNode opnode> {
+   // 5-bit imm
+   def ri : T2sTwoRegShiftImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
+                 opc, ".w\t$Rd, $Rm, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+                 Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-21} = 0b010010;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{5-4} = opcod;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11111;
+     let Inst{26-23} = 0b0100;
+     let Inst{22-21} = opcod;
+     let Inst{15-12} = 0b1111;
+     let Inst{7-4} = 0b0000;
+   }
+
+  // Optional destination register
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+}
+
+/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to T2I_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                     SDPatternOperator opnode> {
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def ri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+                opc, ".w\t$Rn, $imm",
+                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def rr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+                opc, ".w\t$Rn, $Rm",
+                [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rn, $ShiftedRm",
+                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                Sched<[WriteCMPsi]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+  // Assembler aliases w/o the ".w" suffix.
+  // No alias here for 'rr' version as not all instantiations of this
+  // multiclass want one (CMP in particular, does not).
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+}
+
+/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
+multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = signed;
+    let Inst{23} = 1;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm12";
+  }
+  def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+    bits<4> Rt;
+    bits<13> addr;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{9}     = addr{8};    // U
+    let Inst{8} = 0; // The W bit.
+    let Inst{7-0}   = addr{7-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm8";
+  }
+  def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+
+  // pci variant is very similar to i12, but supports negative offsets
+  // from the PC.
+  def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+    let isReMaterializable = 1;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = 0b1111; // Rn
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt{3-0};
+
+    bits<13> addr;
+    let Inst{23} = addr{12}; // add = (U == '1')
+    let Inst{11-0}  = addr{11-0};
+
+    let DecoderMethod = "DecodeT2LoadLabel";
+  }
+}
+
+/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
+multiclass T2I_st<bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0001;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm
+  }
+  def i8  : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{8} = 0; // The W bit.
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{9}     = addr{8};    // U
+    let Inst{7-0}   = addr{7-0};  // imm
+  }
+  def s   : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16}   = addr{9-6}; // Rn
+    let Inst{3-0} = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+  }
+}
+
+/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, ".w\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+             Requires<[IsThumb2]> {
+   let Inst{31-27} = 0b11111;
+   let Inst{26-23} = 0b0100;
+   let Inst{22-20} = opcod;
+   let Inst{19-16} = 0b1111; // Rn
+   let Inst{15-12} = 0b1111;
+   let Inst{7} = 1;
+
+   bits<2> rot;
+   let Inst{5-4} = rot{1-0}; // rotate
+}
+
+// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+            [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+          Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, "\t$Rd, $Rm$rot", []>,
+          Requires<[IsThumb2, HasT2ExtractPack]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+             [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
+           Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+               Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<12> label;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = label{11};
+  let Inst{14-12} = label{10-8};
+  let Inst{7-0}   = label{7-0};
+}
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
+              (ins t2adrlabel:$addr, pred:$p),
+              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>,
+              Sched<[WriteALU, ReadALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-24} = 0b10;
+  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
+  let Inst{22} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<13> addr;
+  let Inst{11-8} = Rd;
+  let Inst{23}    = addr{12};
+  let Inst{21}    = addr{12};
+  let Inst{26}    = addr{11};
+  let Inst{14-12} = addr{10-8};
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2Adr";
+}
+
+let hasSideEffects = 0, isReMaterializable = 1 in
+def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
+                                4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
+let hasSideEffects = 1 in
+def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$label, pred:$p),
+                                4, IIC_iALUi,
+                                []>, Sched<[WriteALU, ReadALU]>;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Load
+let canFoldAsLoad = 1, isReMaterializable = 1  in
+defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR, load>;
+
+// Loads with zero extension
+defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPRnopc, zextloadi16>;
+defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPRnopc, zextloadi8>;
+
+// Loads with sign extension
+defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPRnopc, sextloadi16>;
+defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      GPRnopc, sextloadi8>;
+
+let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
+                        (ins t2addrmode_imm8s4:$addr),
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
+
+// zextload i1 -> zextload i8
+def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+// extload -> zextload
+// FIXME: Reduce the number of patterns by legalizing extload to zextload
+// earlier?
+def : T2Pat<(extloadi1  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi8  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
+            (t2LDRHpci  tconstpool:$addr)>;
+
+// FIXME: The destination register of the loads and stores can't be PC, but
+//        can be SP. We need another regclass (similar to rGPR) to represent
+//        that. Not a pressing issue since these are selected manually,
+//        not via pattern.
+
+// Indexed loads
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []>;
+
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []>;
+
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+} // mayLoad = 1, hasSideEffects = 0
+
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
+// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
+class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = signed;
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 1; // load
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt;
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW.
+  let Inst{7-0} = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LoadT";
+}
+
+def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
+def t2LDRBT  : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>;
+def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
+def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
+def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
+
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+               string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+            opc, asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-24} = 0b000;
+  let Inst{23-20} = bits23_20;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
+// Store
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
+defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, truncstorei8>;
+defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, truncstorei16>;
+
+// Store doubleword
+let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
+                       (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+
+// Indexed stores
+
+let mayStore = 1, hasSideEffects = 0 in {
+def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                            "str", "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                        "strh", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
+def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
+                        "strb", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+} // mayStore = 1, hasSideEffects = 0
+
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+                          "str", "\t$Rt, $Rn$offset",
+                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+             [(set GPRnopc:$Rn_wb,
+                  (post_store GPRnopc:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strh", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+       [(set GPRnopc:$Rn_wb,
+             (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strb", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+        [(set GPRnopc:$Rn_wb,
+              (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+}
+
+// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
+// only.
+// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = 0; // not signed
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 0; // store
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0}   = addr{7-0};
+}
+
+def t2STRT   : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT  : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
+
+// ldrd / strd pre / post variants
+
+let mayLoad = 1 in
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
+
+let mayLoad = 1 in
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+                 IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+let mayStore = 1 in
+def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $wb", []> {
+  let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
+
+let mayStore = 1 in
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+                      t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+                string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+            asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2STL  : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlh", "\t$Rt, $addr", []>;
+
+// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
+// data/instruction access.
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2),  (prefetch 2) -> (preload 1).
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
+
+  def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
+                "\t$addr",
+              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>,
+              Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 1;
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+
+    bits<17> addr;
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};  // imm12
+
+    let DecoderMethod = "DecodeT2LoadImm12";
+  }
+
+  def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
+                "\t$addr",
+            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>,
+            Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // U = 0
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-8} = 0b1100;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{7-0}   = addr{7-0};  // imm8
+
+    let DecoderMethod = "DecodeT2LoadImm8";
+  }
+
+  def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
+               "\t$addr",
+             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>,
+             Sched<[WritePreLd]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // add = TRUE for T1
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-6} = 0b000000;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm2
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+}
+
+defm t2PLD    : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW   : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI    : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+// pci variant is very similar to i12, but supports negative offsets
+// from the PC. Only PLD and PLI have pci variants (not PLDW)
+class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+               IIC_Preload, opc, "\t$addr",
+               [(ARMPreload (ARMWrapper tconstpool:$addr),
+                (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
+  let Inst{31-25} = 0b1111100;
+  let Inst{24} = inst;
+  let Inst{22-20} = 0b001;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1111;
+
+  bits<13> addr;
+  let Inst{23}   = addr{12};   // add = (U == '1')
+  let Inst{11-0} = addr{11-0}; // imm12
+
+  let DecoderMethod = "DecodeT2LoadLabel";
+}
+
+def t2PLDpci : T2Iplpci<0, "pld">,  Requires<[IsThumb2]>;
+def t2PLIpci : T2Iplpci<1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+}
+
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+}
+
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+
+} // hasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let hasSideEffects = 0 in
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                   "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                pred:$p, zero_reg)>;
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                 pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                               pred:$p, CPSR)>;
+
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    AddedComplexity = 1 in
+def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
+                   "mov", ".w\t$Rd, $imm",
+                   [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                 pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                               pred:$p, zero_reg)>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
+                   "movw", "\t$Rd, $imm",
+                   [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]>,
+                   Requires<[IsThumb, HasV8MBaseline]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def : InstAlias<"mov${p} $Rd, $imm",
+                (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
+                Requires<[IsThumb, HasV8MBaseline]>;
+
+def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def t2MOVTi16 : T2I<(outs rGPR:$Rd),
+                    (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
+                    "movt", "\t$Rd, $imm",
+                    [(set rGPR:$Rd,
+                          (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
+                          Sched<[WriteALU]>,
+                          Requires<[IsThumb, HasV8MBaseline]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0110;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                     Sched<[WriteALU]>, Requires<[IsThumb, HasV8MBaseline]>;
+} // Constraints
+
+def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
+                              UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def t2SXTH  : T2I_ext_rrot<0b000, "sxth",
+                              UnOpFrag<(sext_inreg node:$Src, i16)>>;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
+
+// A simple right-shift can also be used in most cases (the exception is the
+// SXTH operations with a rotate of 24: there the non-contiguous bits are
+// relevant).
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+                                        (srl rGPR:$Rm, rot_imm:$rot), i8)),
+                       (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+                                        (srl rGPR:$Rm, imm8_or_16:$rot), i16)),
+                       (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+                                        (rotr rGPR:$Rm, (i32 24)), i16)),
+                       (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+                                        (or (srl rGPR:$Rm, (i32 24)),
+                                              (shl rGPR:$Rm, (i32 8))), i16)),
+                       (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
+                               UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def t2UXTH   : T2I_ext_rrot<0b001, "uxth",
+                               UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+                                   UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
+//            (t2UXTB16 rGPR:$Src, 3)>,
+//          Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
+            (t2UXTB16 rGPR:$Src, 1)>,
+        Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+
+def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
+                                           0xFF)),
+                       (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
+                                            0xFFFF)),
+                       (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+let isAdd = 1 in
+defm t2ADD  : T2I_bin_ii12rs<0b000, "add", add, 1>;
+defm t2SUB  : T2I_bin_ii12rs<0b101, "sub", sub>;
+
+// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMaddc, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi, ARMsubc>;
+
+let hasPostISelHook = 1 in {
+defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
+defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
+}
+
+// RSB
+defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb", sub>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm t2RSBS : T2I_rbin_s_is <ARMsubc>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// The AddedComplexity preferences the first variant over the others since
+// it can be shrunk to a 16-bit wide encoding, while the others cannot.
+let AddedComplexity = 1 in
+def : T2Pat<(add        GPR:$src, imm1_255_neg:$imm),
+            (t2SUBri    GPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBri    GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_4095_neg:$imm),
+            (t2SUBri12  GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_65535_neg:$imm),
+            (t2SUBrr    GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(ARMaddc    rGPR:$src, imm1_255_neg:$imm),
+            (t2SUBSri   rGPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBSri   rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, imm0_65535_neg:$imm),
+            (t2SUBSrr   rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+let AddedComplexity = 1 in
+def : T2Pat<(ARMadde    rGPR:$src, imm0_255_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, t2_so_imm_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, t2_so_imm_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, imm0_65535_neg:$imm, CPSR),
+            (t2SBCrr    rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
+
+// Select Bytes -- for disassembly only
+
+def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasDSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b010;
+  let Inst{23} = 0b1;
+  let Inst{22-20} = 0b010;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 0b1;
+  let Inst{6-4} = 0b000;
+}
+
+// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
+// And Miscellaneous operations -- for disassembly only
+class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
+              list<dag> pat = [/* For disassembly only; pattern left blank */],
+              dag iops = (ins rGPR:$Rn, rGPR:$Rm),
+              string asm = "\t$Rd, $Rn, $Rm">
+  : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
+    Requires<[IsThumb2, HasDSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0101;
+  let Inst{22-20} = op22_20;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = op7_4;
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+// Saturating add/subtract -- for disassembly only
+
+def t2QADD    : T2I_pam<0b000, 0b1000, "qadd",
+                        [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QADD16  : T2I_pam<0b001, 0b0001, "qadd16">;
+def t2QADD8   : T2I_pam<0b000, 0b0001, "qadd8">;
+def t2QASX    : T2I_pam<0b010, 0b0001, "qasx">;
+def t2QDADD   : T2I_pam<0b000, 0b1001, "qdadd", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QDSUB   : T2I_pam<0b000, 0b1011, "qdsub", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSAX    : T2I_pam<0b110, 0b0001, "qsax">;
+def t2QSUB    : T2I_pam<0b000, 0b1010, "qsub",
+                        [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSUB16  : T2I_pam<0b101, 0b0001, "qsub16">;
+def t2QSUB8   : T2I_pam<0b100, 0b0001, "qsub8">;
+def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
+def t2UQADD8  : T2I_pam<0b000, 0b0101, "uqadd8">;
+def t2UQASX   : T2I_pam<0b010, 0b0101, "uqasx">;
+def t2UQSAX   : T2I_pam<0b110, 0b0101, "uqsax">;
+def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">;
+def t2UQSUB8  : T2I_pam<0b100, 0b0101, "uqsub8">;
+
+// Signed/Unsigned add/subtract -- for disassembly only
+
+def t2SASX    : T2I_pam<0b010, 0b0000, "sasx">;
+def t2SADD16  : T2I_pam<0b001, 0b0000, "sadd16">;
+def t2SADD8   : T2I_pam<0b000, 0b0000, "sadd8">;
+def t2SSAX    : T2I_pam<0b110, 0b0000, "ssax">;
+def t2SSUB16  : T2I_pam<0b101, 0b0000, "ssub16">;
+def t2SSUB8   : T2I_pam<0b100, 0b0000, "ssub8">;
+def t2UASX    : T2I_pam<0b010, 0b0100, "uasx">;
+def t2UADD16  : T2I_pam<0b001, 0b0100, "uadd16">;
+def t2UADD8   : T2I_pam<0b000, 0b0100, "uadd8">;
+def t2USAX    : T2I_pam<0b110, 0b0100, "usax">;
+def t2USUB16  : T2I_pam<0b101, 0b0100, "usub16">;
+def t2USUB8   : T2I_pam<0b100, 0b0100, "usub8">;
+
+// Signed/Unsigned halving add/subtract -- for disassembly only
+
+def t2SHASX   : T2I_pam<0b010, 0b0010, "shasx">;
+def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">;
+def t2SHADD8  : T2I_pam<0b000, 0b0010, "shadd8">;
+def t2SHSAX   : T2I_pam<0b110, 0b0010, "shsax">;
+def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">;
+def t2SHSUB8  : T2I_pam<0b100, 0b0010, "shsub8">;
+def t2UHASX   : T2I_pam<0b010, 0b0110, "uhasx">;
+def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">;
+def t2UHADD8  : T2I_pam<0b000, 0b0110, "uhadd8">;
+def t2UHSAX   : T2I_pam<0b110, 0b0110, "uhsax">;
+def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
+def t2UHSUB8  : T2I_pam<0b100, 0b0110, "uhsub8">;
+
+// Helper class for disassembly only
+// A6.3.16 & A6.3.17
+// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
+class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2FourReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+def t2USAD8   : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                                           (ins rGPR:$Rn, rGPR:$Rm),
+                        NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasDSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                       (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
+                        "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasDSP]>;
+
+// Signed/Unsigned saturate.
+class T2SatI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> sat_imm;
+  bits<7> sh;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{4-0}   = sat_imm;
+  let Inst{21}    = sh{5};
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+def t2SSAT: T2SatI<
+              (outs rGPR:$Rd),
+              (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+              Requires<[IsThumb2]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{5}  = 0;
+}
+
+def t2SSAT16: T2SatI<
+                (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+                Requires<[IsThumb2, HasDSP]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def t2USAT: T2SatI<
+               (outs rGPR:$Rd),
+               (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
+                Requires<[IsThumb2]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+}
+
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
+                     NoItinerary,
+                     "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+                     Requires<[IsThumb2, HasDSP]> {
+  let Inst{31-22} = 0b1111001110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
+def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
+             (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Shift and rotate Instructions.
+//
+
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSR  : T2I_sh_ir<0b01, "lsr", imm_sr,  srl>;
+defm t2ASR  : T2I_sh_ir<0b10, "asr", imm_sr,  sra>;
+defm t2ROR  : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
+
+// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
+def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+            (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+
+let Uses = [CPSR] in {
+def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                   "rrx", "\t$Rd, $Rm",
+                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0011;
+}
+}
+
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def t2MOVsrl_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "lsrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b01; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "asrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b10; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm t2AND  : T2I_bin_w_irs<0b0000, "and",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi, and, 1>;
+defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi, or, 1>;
+defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi, xor, 1>;
+
+defm t2BIC  : T2I_bin_w_irs<0b0001, "bic",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+class T2BitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<5> msb;
+  bits<5> lsb;
+
+  let Inst{11-8}  = Rd;
+  let Inst{4-0}   = msb{4-0};
+  let Inst{14-12} = lsb{4-2};
+  let Inst{7-6}   = lsb{1-0};
+}
+
+class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2BitFI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+
+  let Inst{19-16} = Rn;
+}
+
+let Constraints = "$src = $Rd" in
+def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$Rd, $imm",
+                [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0; // should be 0.
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+  let Inst{5} = 0; // should be 0.
+
+  bits<10> imm;
+  let msb{4-0} = imm{9-5};
+  let lsb{4-0} = imm{4-0};
+}
+
+def t2SBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10100;
+  let Inst{15} = 0;
+}
+
+def t2UBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b11100;
+  let Inst{15} = 0;
+}
+
+// A8.8.247  UDF - Undefined (Encoding T2)
+def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16",
+                 [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-29} = 0b111;
+  let Inst{28-27} = 0b10;
+  let Inst{26-20} = 0b1111111;
+  let Inst{19-16} = imm16{15-12};
+  let Inst{15} = 0b1;
+  let Inst{14-12} = 0b010;
+  let Inst{11-0} = imm16{11-0};
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding T1)
+let Constraints = "$src = $Rd" in {
+  def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
+                  (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
+                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
+                                   bf_inv_mask_imm:$imm))]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{26} = 0; // should be 0.
+    let Inst{25} = 1;
+    let Inst{24-20} = 0b10110;
+    let Inst{15} = 0;
+    let Inst{5} = 0; // should be 0.
+
+    bits<10> imm;
+    let msb{4-0} = imm{9-5};
+    let lsb{4-0} = imm{4-0};
+  }
+}
+
+defm t2ORN  : T2I_bin_irs<0b0011, "orn",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                          BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
+
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode,
+                      bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
+   // shifted imm
+   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+                opc, "\t$Rd, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
+     let isAsCheapAsAMove = Cheap;
+     let isReMaterializable = ReMat;
+     let isMoveImm = MoveImm;
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{15} = 0;
+   }
+   // register
+   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+                opc, ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rd, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+                Sched<[WriteALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+   }
+}
+
+// Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn",
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+                          not, 1, 1, 1>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(and     rGPR:$src, t2_so_imm_not:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+  return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+  }]>;
+
+// so_imm_notSext is needed instead of so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
+// FIXME: Disable this pattern on Darwin to workaround an assembler bug.
+def : T2Pat<(or      rGPR:$src, t2_so_imm_not:$imm),
+            (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+let isCommutable = 1 in
+def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                "mul", "\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+class T2FourRegMLA<bits<4> op7_4, string opc, list<dag> pattern>
+  : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+               opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+               Requires<[IsThumb2, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = op7_4;
+}
+
+def t2MLA : T2FourRegMLA<0b0000, "mla",
+                         [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm),
+                                               rGPR:$Ra))]>;
+def t2MLS: T2FourRegMLA<0b0001, "mls",
+                        [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn,
+                                                            rGPR:$Rm)))]>;
+
+// Extra precision multiplies with low / high results
+let hasSideEffects = 0 in {
+let isCommutable = 1 in {
+def t2SMULL : T2MulLong<0b000, 0b0000, "smull", []>;
+def t2UMULL : T2MulLong<0b010, 0b0000, "umull", []>;
+} // isCommutable
+
+// Multiply + accumulate
+def t2SMLAL : T2MlaLong<0b100, 0b0000, "smlal">;
+def t2UMLAL : T2MlaLong<0b110, 0b0000, "umlal">;
+def t2UMAAL : T2MlaLong<0b110, 0b0110, "umaal">, Requires<[IsThumb2, HasDSP]>;
+} // hasSideEffects
+
+// Rounding variants of the below included for disassembly only
+
+// Most significant word multiply
+class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+               opc, "\t$Rd, $Rn, $Rm", pattern>,
+               Requires<[IsThumb2, HasDSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = op7_4;
+}
+def t2SMMUL : T2SMMUL<0b0000, "smmul", [(set rGPR:$Rd, (mulhs rGPR:$Rn,
+                                                              rGPR:$Rm))]>;
+def t2SMMULR : T2SMMUL<0b0001, "smmulr", []>;
+
+class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
+                     list<dag> pattern>
+  : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+              opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+              Requires<[IsThumb2, HasDSP, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4} = op7_4;
+}
+
+def t2SMMLA :   T2FourRegSMMLA<0b101, 0b0000, "smmla",
+                [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>;
+def t2SMMLAR:   T2FourRegSMMLA<0b101, 0b0001, "smmlar", []>;
+def t2SMMLS:    T2FourRegSMMLA<0b110, 0b0000, "smmls", []>;
+def t2SMMLSR:   T2FourRegSMMLA<0b110, 0b0001, "smmlsr", []>;
+
+class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
+                     list<dag> pattern>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, opc,
+               "\t$Rd, $Rn, $Rm", pattern>,
+    Requires<[IsThumb2, HasDSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = op22_20;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = op5_4;
+}
+
+def t2SMULBB : T2ThreeRegSMUL<0b001, 0b00, "smulbb",
+             [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
+                                   (sext_inreg rGPR:$Rm, i16)))]>;
+def t2SMULBT : T2ThreeRegSMUL<0b001, 0b01, "smulbt",
+             [(set rGPR:$Rd, (mul (sext_inreg rGPR:$Rn, i16),
+                                   (sra rGPR:$Rm, (i32 16))))]>;
+def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
+             [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
+                                   (sext_inreg rGPR:$Rm, i16)))]>;
+def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
+             [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
+                                   (sra rGPR:$Rm, (i32 16))))]>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+
+def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
+                   (t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16))),
+                   (t2SMULBT rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm),
+                   (t2SMULTB rGPR:$Rn, rGPR:$Rm)>;
+
+class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
+                    list<dag> pattern>
+  : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMUL16,
+               opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
+    Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = op22_20;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = op5_4;
+}
+
+def t2SMLABB : T2FourRegSMLA<0b001, 0b00, "smlabb",
+             [(set rGPR:$Rd, (add rGPR:$Ra,
+                               (mul (sext_inreg rGPR:$Rn, i16),
+                                     (sext_inreg rGPR:$Rm, i16))))]>;
+def t2SMLABT : T2FourRegSMLA<0b001, 0b01, "smlabt",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sext_inreg rGPR:$Rn, i16),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>;
+def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
+                                                (sext_inreg rGPR:$Rm, i16))))]>;
+def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+
+def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
+                      (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+def : Thumb2DSPMulPat<(add rGPR:$Ra,
+                        (mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16)))),
+                      (t2SMLABT rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+def : Thumb2DSPMulPat<(add rGPR:$Ra,
+                        (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
+                      (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
+
+class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern>
+  : T2FourReg_mac<1, op22_20, op7_4,
+                  (outs rGPR:$Ra, rGPR:$Rd),
+                  (ins rGPR:$Rn, rGPR:$Rm),
+                  IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
+                  Requires<[IsThumb2, HasDSP]>;
+
+// Halfword multiple accumulate long: SMLAL<x><y>
+def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>;
+def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>;
+def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>;
+def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>;
+
+class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc>
+  : T2ThreeReg_mac<0, op22_20, op7_4,
+                   (outs rGPR:$Rd),
+                   (ins rGPR:$Rn, rGPR:$Rm),
+                   IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm", []>,
+                   Requires<[IsThumb2, HasDSP]> {
+  let Inst{15-12} = 0b1111;
+}
+
+// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+def t2SMUAD: T2DualHalfMul<0b010, 0b0000, "smuad">;
+def t2SMUADX: T2DualHalfMul<0b010, 0b0001, "smuadx">;
+def t2SMUSD: T2DualHalfMul<0b100, 0b0000, "smusd">;
+def t2SMUSDX: T2DualHalfMul<0b100, 0b0001, "smusdx">;
+
+class T2DualHalfMulAdd<bits<3> op22_20, bits<4> op7_4, string opc>
+  : T2FourReg_mac<0, op22_20, op7_4,
+                  (outs rGPR:$Rd),
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra),
+                  IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm, $Ra", []>,
+                  Requires<[IsThumb2, HasDSP]>;
+
+def t2SMLAD   : T2DualHalfMulAdd<0b010, 0b0000, "smlad">;
+def t2SMLADX  : T2DualHalfMulAdd<0b010, 0b0001, "smladx">;
+def t2SMLSD   : T2DualHalfMulAdd<0b100, 0b0000, "smlsd">;
+def t2SMLSDX  : T2DualHalfMulAdd<0b100, 0b0001, "smlsdx">;
+
+class T2DualHalfMulAddLong<bits<3> op22_20, bits<4> op7_4, string opc>
+  : T2FourReg_mac<1, op22_20, op7_4,
+                  (outs rGPR:$Ra, rGPR:$Rd),
+                  (ins rGPR:$Rn, rGPR:$Rm),
+                  IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
+                  Requires<[IsThumb2, HasDSP]>;
+
+def t2SMLALD  : T2DualHalfMulAddLong<0b100, 0b1100, "smlald">;
+def t2SMLALDX : T2DualHalfMulAddLong<0b100, 0b1101, "smlaldx">;
+def t2SMLSLD  : T2DualHalfMulAddLong<0b101, 0b1100, "smlsld">;
+def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions.
+//  Signed and unsigned division on v7-M
+//
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "sdiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011100;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "udiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011101;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
+      InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-22} = 0b01010;
+  let Inst{21-20} = op1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6} = 0b10;
+  let Inst{5-4} = op2;
+  let Rn{3-0} = Rm;
+}
+
+def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>,
+                    Sched<[WriteALU]>;
+
+def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                      "rbit", "\t$Rd, $Rm",
+                      [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>,
+                      Sched<[WriteALU]>;
+
+def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>,
+                 Sched<[WriteALU]>;
+
+def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "rev16", ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>,
+                Sched<[WriteALU]>;
+
+def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "revsh", ".w\t$Rd, $Rm",
+                 [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>,
+                 Sched<[WriteALU]>;
+
+def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
+                (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
+            (t2REVSH rGPR:$Rm)>;
+
+def t2PKHBT : T2ThreeReg<
+            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
+                  IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
+                                      (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 0; // BT form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def t2PKHTB : T2ThreeReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
+                  IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
+                                       (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
+                                            0xFFFF)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 1; // TB form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
+                (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+//===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+               !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+               [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+               Requires<[IsThumb2, HasV8, HasCRC]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b010110;
+  let Inst{20}    = C;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6}   = 0b10;
+  let Inst{5-4}   = sz;
+}
+
+def t2CRC32B  : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H  : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W  : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
+                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
+
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_imm:$imm),
+            (t2CMPri  GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, rGPR:$rhs),
+            (t2CMPrr  GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_reg:$rhs),
+            (t2CMPrs  GPRnopc:$lhs, t2_so_reg:$rhs)>;
+
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def t2CMNri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
+                "cmn", ".w\t$Rn, $imm",
+                [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>,
+                Sched<[WriteCMP, ReadALU]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def t2CMNzrr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
+                "cmn", ".w\t$Rn, $Rm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def t2CMNzrs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
+                "cmn", ".w\t$Rn, $ShiftedRm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                  Sched<[WriteCMPsi, ReadALU, ReadALU]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+// Assembler aliases w/o the ".w" suffix.
+// No alias here for 'rr' version as not all instantiations of this multiclass
+// want one (CMP in particular, does not).
+def : t2InstAlias<"cmn${p} $Rn, $imm",
+   (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rn, $shift",
+   (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+
+def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
+
+defm t2TST  : T2I_cmp_irs<0b0000, "tst",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+
+// Conditional moves
+let hasSideEffects = 0 in {
+
+let isCommutable = 1, isSelect = 1 in
+def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
+                            (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
+                            4, IIC_iCMOVr,
+                            [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+                                                     cmovpred:$p))]>,
+               RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MOVCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isCodeGenOnly = 1 in {
+let isMoveImm = 1 in
+def t2MOVCCi16
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins  rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+let isMoveImm = 1 in
+def t2MVNCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd,
+                         (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+                                  cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVsi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+                                            (opnode rGPR:$Rm, (i32 ty:$imm)),
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl,  imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl,  imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra,  imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
+
+let isMoveImm = 1 in
+def t2MOVCCi32imm
+    : t2PseudoInst<(outs rGPR:$dst),
+                   (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+                   8, IIC_iCMOVix2,
+                   [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+                                             cmovpred:$p))]>,
+      RegConstraint<"$false = $dst">;
+} // isCodeGenOnly = 1
+
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f5;
+  let Inst{3-0} = opt;
+}
+
+def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f4;
+  let Inst{3-0} = opt;
+}
+
+def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
+                "isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
+                Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f6;
+  let Inst{3-0} = opt;
+}
+}
+
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{11-8} = rt2;
+  let Inst{7-4} = opcod;
+  let Inst{3-0} = 0b1111;
+
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-8} = rt2;
+  let Inst{7-4} = opcod;
+
+  bits<4> Rd;
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{3-0}  = Rd;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+let mayLoad = 1 in {
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]>;
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]>;
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldrex", "\t$Rt, $addr", "",
+                     [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]> {
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000101;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}>,
+               Requires<[IsThumb2, IsNotMClass]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def t2LDAEX  : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldaex", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = 0b11101111;
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}>, Requires<[IsThumb,
+                         HasAcquireRelease, HasV7Clrex, IsNotMClass]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+
+  let Inst{7} = 1;
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]>;
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]>;
+
+def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             t2addrmode_imm0_1020s4:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "strex", "\t$Rd, $Rt, $addr", "",
+                  [(set rGPR:$Rd,
+                        (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>,
+               Requires<[IsThumb, HasV8MBaseline]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000100;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}>,
+               Requires<[IsThumb2, IsNotMClass]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>,
+                         Requires<[IsThumb, HasAcquireRelease,
+                                   HasV7Clrex]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd,
+                               (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>,
+                         Requires<[IsThumb, HasAcquireRelease,
+                                   HasV7Clrex]>;
+
+def t2STLEX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             addr_offset_none:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "stlex", "\t$Rd, $Rt, $addr", "",
+                  [(set rGPR:$Rd,
+                        (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>,
+                  Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4}  = 0b11111110;
+  let Inst{3-0}   = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease,
+                         HasV7Clrex, IsNotMClass]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+}
+
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
+            Requires<[IsThumb, HasV7Clrex]>  {
+  let Inst{31-16} = 0xf3bf;
+  let Inst{15-14} = 0b10;
+  let Inst{13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = 0b1111;
+}
+
+def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+            (t2LDREXB addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDREXH addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STREXB GPR:$Rt, addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasV8MBaseline]>;
+def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STREXH GPR:$Rt, addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasV8MBaseline]>;
+
+def : T2Pat<(and (ldaex_1 addr_offset_none:$addr), 0xff),
+            (t2LDAEXB addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(and (ldaex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDAEXH addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STLEXB GPR:$Rt, addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STLEXH GPR:$Rt, addr_offset_none:$addr)>,
+            Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   $val is a scratch register for our use.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                                  Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                   reglist:$regs, variable_ops),
+                              4, IIC_iLoad_mBr, [],
+            (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+                         RegConstraint<"$Rn = $wb">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+let isPredicable = 1 in
+def t2B   : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
+                 "b", ".w\t$target",
+                 [(br bb:$target)]>, Sched<[WriteBr]>,
+                 Requires<[IsThumb, HasV8MBaseline]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 1;
+
+  bits<24> target;
+  let Inst{26} = target{23};
+  let Inst{13} = target{22};
+  let Inst{11} = target{21};
+  let Inst{25-16} = target{20-11};
+  let Inst{10-0} = target{10-0};
+  let DecoderMethod = "DecodeT2BInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+
+// available in both v8-M.Baseline and Thumb2 targets
+def t2BR_JT : t2basePseudoInst<(outs),
+          (ins GPR:$target, GPR:$index, i32imm:$jt),
+           0, IIC_Br,
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt)]>,
+          Sched<[WriteBr]>;
+
+// FIXME: Add a case that can be predicated.
+def t2TBB_JT : t2PseudoInst<(outs),
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
+
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+                    "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 0; // B form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+                   "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 1; // H form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects ", "two operands. :(
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 0;
+
+  bits<4> p;
+  let Inst{25-22} = p;
+
+  bits<21> target;
+  let Inst{26} = target{20};
+  let Inst{11} = target{19};
+  let Inst{13} = target{18};
+  let Inst{21-16} = target{17-12};
+  let Inst{10-0} = target{11-1};
+
+  let DecoderMethod = "DecodeThumb2BCCInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
+}
+
+// Tail calls. The MachO version of thumb tail calls uses a t2 branch, so
+// it goes here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS version.
+  let Uses = [SP] in
+  def tTAILJMPd: tPseudoExpand<(outs),
+                   (ins thumb_br_target:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (t2B thumb_br_target:$dst, pred:$p)>,
+                 Requires<[IsThumb2, IsMachO]>, Sched<[WriteBr]>;
+}
+
+// IT block
+let Defs = [ITSTATE] in
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+                    AddrModeNone, 2,  IIC_iALUx,
+                    "it$mask\t$cc", "", []>,
+           ComplexDeprecationPredicate<"IT"> {
+  // 16-bit instruction.
+  let Inst{31-16} = 0x0000;
+  let Inst{15-8} = 0b10111111;
+
+  bits<4> cc;
+  bits<4> mask;
+  let Inst{7-4} = cc;
+  let Inst{3-0} = mask;
+
+  let DecoderMethod = "DecodeIT";
+}
+
+// Branch and Exchange Jazelle -- for disassembly only
+// Rm = Inst{19-16}
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+def t2BXJ : T2I<(outs), (ins GPRnopc:$func), NoItinerary, "bxj", "\t$func", []>,
+    Sched<[WriteBr]>, Requires<[IsThumb2, IsNotMClass]> {
+  bits<4> func;
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0;
+  let Inst{25-20} = 0b111100;
+  let Inst{19-16} = func;
+  let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
+                  "cbz\t$Rn, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}>,
+              Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+
+  def tCBNZ : T1I<(outs), (ins tGPR:$Rn, thumb_cb_target:$target), IIC_Br,
+                  "cbnz\t$Rn, $target", []>,
+              T1Misc<{1,0,?,1,?,?,?}>,
+              Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteBr]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+}
+
+
+// Change Processor State is a system instruction.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
+            !strconcat("cps", asm_op), []>,
+          Requires<[IsThumb2, IsNotMClass]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-11} = 0b111100111010111110000;
+  let Inst{10-9}  = imod;
+  let Inst{8}     = M;
+  let Inst{7-5}   = iflags;
+  let Inst{4-0}   = mode;
+  let DecoderMethod = "DecodeT2CPSInstruction";
+}
+
+let M = 1 in
+  def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+                      "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
+                      "$imod.w\t$iflags">;
+let imod = 0, iflags = 0, M = 1 in
+  def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
+
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+                   (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
+// A6.3.4 Branches and miscellaneous control
+// Table A6-14 Change Processor State, and hint instructions
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
+                  [(int_arm_hint imm0_239:$imm)]> {
+  bits<8> imm;
+  let Inst{31-3} = 0b11110011101011111000000000000;
+  let Inst{7-0} = imm;
+}
+
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p), 0>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p), 1>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p), 1>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p), 1>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p), 1>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p), 1>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p), 1> {
+  let Predicates = [IsThumb2, HasV8];
+}
+def : t2InstAlias<"esb$p.w", (t2HINT 16, pred:$p), 1> {
+  let Predicates = [IsThumb2, HasRAS];
+}
+def : t2InstAlias<"esb$p", (t2HINT 16, pred:$p), 0> {
+  let Predicates = [IsThumb2, HasRAS];
+}
+
+def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
+                [(int_arm_dbg imm0_15:$opt)]> {
+  bits<4> opt;
+  let Inst{31-20} = 0b111100111010;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-8} = 0b10000000;
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = opt;
+}
+
+// Secure Monitor Call is a system instruction.
+// Option = Inst{19-16}
+let isCall = 1, Uses = [SP] in
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+                []>, Requires<[IsThumb2, HasTrustZone]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111111;
+  let Inst{15-12} = 0b1000;
+
+  bits<4> opt;
+  let Inst{19-16} = opt;
+}
+
+class T2DCPS<bits<2> opt, string opc>
+  : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111000;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-2} = 0b0000000000;
+  let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsThumb2,IsNotMClass]> {
+  bits<5> mode;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24-23} = Op;
+  let Inst{22} = 0;
+  let Inst{21} = W;
+  let Inst{20-16} = 0b01101;
+  let Inst{15-5} = 0b11000000000;
+  let Inst{4-0} = mode{4-0};
+}
+
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB  : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsia","\tsp!, $mode", []>;
+def t2SRSIA  : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsia","\tsp, $mode", []>;
+
+
+def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>;
+
+def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>;
+
+// Return From Exception is a system instruction.
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsThumb2,IsNotMClass]> {
+  let Inst{31-20} = op31_20{11-0};
+
+  bits<4> Rn;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xc000;
+}
+
+def t2RFEDBW : T2RFE<0b111010000011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEDB  : T2RFE<0b111010000001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIAW : T2RFE<0b111010011011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIA  : T2RFE<0b111010011001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+
+// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
+                        "subs", "\tpc, lr, $imm",
+                        [(ARMintretflag imm0_255:$imm)]>,
+                   Requires<[IsThumb2,IsNotMClass]> {
+  let Inst{31-8} = 0b111100111101111010001111;
+
+  bits<8> imm;
+  let Inst{7-0} = imm;
+}
+
+// Hypervisor Call is a system instruction.
+let isCall = 1 in {
+def t2HVC : T2XI <(outs), (ins imm0_65535:$imm16), IIC_Br, "hvc.w\t$imm16", []>,
+      Requires<[IsThumb2, HasVirtualization]>, Sched<[WriteBr]> {
+    bits<16> imm16;
+    let Inst{31-20} = 0b111101111110;
+    let Inst{19-16} = imm16{15-12};
+    let Inst{15-12} = 0b1000;
+    let Inst{11-0} = imm16{11-0};
+}
+}
+
+// Alias for HVC without the ".w" optional width specifier
+def : t2InstAlias<"hvc\t$imm16", (t2HVC imm0_65535:$imm16)>;
+
+// ERET - Return from exception in Hypervisor mode.
+// B9.3.3, B9.3.20: ERET is an alias for "SUBS PC, LR, #0" in an implementation that
+// includes virtualization extensions.
+def t2ERET : InstAlias<"eret${p}", (t2SUBS_PC_LR 0, pred:$p), 1>,
+             Requires<[IsThumb2, HasVirtualization]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                            [(set rGPR:$dst, (i32 imm:$src))]>,
+                            Requires<[IsThumb, UseMovt]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2addpc,
+                          [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                          Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+
+}
+
+def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst),
+            (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>,
+      Requires<[IsThumb2, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst),
+            (t2MOVi32imm tglobaltlsaddr:$dst)>,
+      Requires<[IsThumb2, UseMovt]>;
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
+def : T2Pat<(ARMWrapper texternalsym :$dst), (t2MOVi32imm texternalsym :$dst)>,
+    Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+    Requires<[IsThumb, HasV8MBaseline, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst), (t2LEApcrelJT tjumptable:$dst)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   IIC_iLoadiALU,
+              [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb2]>;
+
+// Pseudo isntruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+                       NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, NoItinerary, opc, asm, pattern> {
+  let Inst{31-28} = op31_28;
+  let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm, list<dag> pattern> {
+  def _OFFSET : T2CI<op31_28,
+                     (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                     asm, "\t$cop, $CRd, $addr", pattern> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
+                  asm, "\t$cop, $CRd, $addr!", []> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                               postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", []> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : T2CI<op31_28, (outs),
+                     (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option", []> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8,IsThumb2]>;
+
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register -- for disassembly only
+//
+// Move to ARM core register from Special Register
+
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+                  []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b00000000;
+}
+
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+                   []>, Requires<[IsThumb2,IsNotMClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111111111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b00000000;
+}
+
+def t2MRSbanked : T2I<(outs rGPR:$Rd), (ins banked_reg:$banked),
+                      NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{31-21} = 0b11110011111;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = Rd;
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
+                  "mrs", "\t$Rd, $SYSm", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<4> Rd;
+  bits<8> SYSm;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = SYSm;
+
+  let Unpredictable{20-16} = 0b11111;
+  let Unpredictable{13} = 0b1;
+}
+
+
+// Move from ARM core register to Special Register
+//
+// A/R class MSR.
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+let Defs = [CPSR] in
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+                   NoItinerary, "msr", "\t$mask, $Rn", []>,
+               Requires<[IsThumb2,IsNotMClass]> {
+  bits<5> mask;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = mask{4}; // R Bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8}  = mask{3-0};
+  let Inst{7-0}   = 0;
+}
+
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5.
+def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
+                      NoItinerary, "msr", "\t$banked, $Rn", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = banked{3-0};
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+let Defs = [CPSR] in
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+                  NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<12> SYSm;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = 0b0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-10} = SYSm{11-10};
+  let Inst{9-8}   = 0b00;
+  let Inst{7-0}   = SYSm{7-0};
+
+  let Unpredictable{20} = 0b1;
+  let Unpredictable{13} = 0b1;
+  let Unpredictable{9-8} = 0b11;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register
+//
+
+class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern>
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
+          pattern> {
+  let Inst{27-24} = 0b1110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+class t2MovRRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+                   list<dag> pattern = []>
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+/* from ARM core register to coprocessor */
+def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
+           (outs),
+           (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                c_imm:$CRm, imm0_7:$opc2),
+           [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                         imm:$CRm, imm:$opc2)]>,
+           ComplexDeprecationPredicate<"MCR">;
+def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
+def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
+             (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, imm0_7:$opc2),
+             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                            imm:$CRm, imm:$opc2)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
+
+/* from coprocessor to ARM core register */
+def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
+def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
+
+def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
+
+def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+
+/* from ARM core register to coprocessor */
+def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
+                         (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+                         c_imm:$CRm),
+                        [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                       imm:$CRm)]>;
+def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
+                          (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+                           c_imm:$CRm),
+                          [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                          GPR:$Rt2, imm:$CRm)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
+/* from coprocessor to ARM core register */
+def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1, (outs GPR:$Rt, GPR:$Rt2),
+                          (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)>;
+
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
+                           (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.
+//
+
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                 c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                 "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                               imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
+}
+
+def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                   c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                   "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1 Privilege Access Never extension
+//
+// SETPAN #imm1
+
+def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>,
+               T1Misc<0b0110000>, Requires<[IsThumb2, HasV8, HasV8_1a]> {
+  bits<1> imm;
+
+  let Inst{4} = 0b1;
+  let Inst{3} = imm;
+  let Inst{2-0} = 0b000;
+
+  let Unpredictable{4} = 0b1;
+  let Unpredictable{2-0} = 0b111;
+}
+
+//===----------------------------------------------------------------------===//
+// ARMv8-M Security Extensions instructions
+//
+
+let hasSideEffects = 1 in
+def t2SG : T2I<(outs), (ins), NoItinerary, "sg", "", []>,
+           Requires<[Has8MSecExt]> {
+  let Inst = 0xe97fe97f;
+}
+
+class T2TT<bits<2> at, string asm, list<dag> pattern>
+  : T2I<(outs rGPR:$Rt), (ins GPRnopc:$Rn), NoItinerary, asm, "\t$Rt, $Rn",
+        pattern> {
+  bits<4> Rn;
+  bits<4> Rt;
+
+  let Inst{31-20} = 0b111010000100;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1111;
+  let Inst{11-8} = Rt;
+  let Inst{7-6} = at;
+  let Inst{5-0} = 0b000000;
+
+  let Unpredictable{5-0} = 0b111111;
+}
+
+def t2TT   : T2TT<0b00, "tt",   []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTT  : T2TT<0b01, "ttt",  []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTA  : T2TT<0b10, "tta",  []>, Requires<[IsThumb,Has8MSecExt]>;
+def t2TTAT : T2TT<0b11, "ttat", []>, Requires<[IsThumb,Has8MSecExt]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+            (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+            (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8),  (t2SXTB rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+            (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+            (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8   t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_imm12:$addr),
+            (t2LDRi12   t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_negimm8:$addr),
+            (t2LDRi8    t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_so_reg:$addr),
+            (t2LDRs     t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRBi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRBi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRBs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRHi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRHi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRHs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRi12   GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRi8    GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
+
+let AddedComplexity = 8 in {
+  def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+                  (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+                  (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
+         cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+              (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $Rm",
+            (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// add w/ negative immediates is just a sub.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+              (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+                  (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+                  (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+                  (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+
+// Alias for MVN with(out) the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p}.w $Rd, $imm",
+           (t2MVNi rGPR:$Rd, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+           (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+           (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
+// input operands swapped when the shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// STMIA/STMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stm${p} $Rn!, $regs",
+                  (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"ldm${p} $Rn, $regs",
+                  (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldm${p} $Rn!, $regs",
+                  (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// STMDB/STMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"stmdb${p}.w $Rn, $regs",
+                  (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stmdb${p}.w $Rn!, $regs",
+                  (t2STMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMDB/LDMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"ldmdb${p}.w $Rn, $regs",
+                  (t2LDMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldmdb${p}.w $Rn!, $regs",
+                  (t2LDMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+           (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+           (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+           (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+           (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+                    cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                  (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                  (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+              (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+              (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+              (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm",
+              (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+              (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+              (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+              (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm",
+              (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
+              Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+                  (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
+                Requires<[HasT2ExtractPack, IsThumb2]>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+                  (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+                  (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
+                Requires<[HasT2ExtractPack, IsThumb2]>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+                  (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+
+// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : t2InstAlias<"mov${p} $Rd, $imm",
+                  (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstAlias<"mvn${p} $Rd, $imm",
+                  (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+// Same for AND <--> BIC
+def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                  (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                  (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+// Likewise, "add Rd, t2_so_imm_neg" -> sub
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via t2_so_imm_neg
+def : t2InstAlias<"cmp${p} $Rd, $imm",
+                  (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rd, $imm",
+                  (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+
+
+// Wide 'mul' encoding can be specified with only two operands.
+def : t2InstAlias<"mul${p} $Rn, $Rm",
+                  (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
+                  (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
+// these, unfortunately.
+def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+
+def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
+// ADR w/o the .w suffix
+def : t2InstAlias<"adr${p} $Rd, $addr",
+                  (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
+
+// LDR(literal) w/ alternate [pc, #imm] syntax.
+def t2LDRpcrel   : t2AsmPseudo<"ldr${p} $Rt, $addr",
+                         (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRBpcrel  : t2AsmPseudo<"ldrb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRHpcrel  : t2AsmPseudo<"ldrh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSBpcrel  : t2AsmPseudo<"ldrsb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+    // Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $addr",
+                  (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
+def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
+                  (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
+                  (t2LDRHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p}.w $Rt, $addr",
+                  (t2LDRSBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
+                  (t2LDRSHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"add${p} $Rd, pc, $imm",
+                  (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+
+// Pseudo instruction ldr Rt, =immediate
+def t2LDRConstPool
+  : t2AsmPseudo<"ldr${p} $Rt, $immediate",
+                (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+// Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $immediate",
+                  (t2LDRConstPool GPRnopc:$Rt,
+                  const_pool_asm_imm:$immediate, pred:$p)>;
+
+// PLD/PLDW/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+                  (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+                 (t2PLIpci  t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+      Requires<[IsThumb2,HasV7]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 000000000000..e99048645685
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,2308 @@
+//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                                       SDTCisSameAs<1, 2>]>;
+
+def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "parseFPImm";
+}
+
+def vfp_f16imm : Operand<f16>,
+                 PatLeaf<(f16 fpimm), [{
+      return ARM_AM::getFP16Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP16Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f32imm : Operand<f32>,
+                 PatLeaf<(f32 fpimm), [{
+      return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f64imm : Operand<f64>,
+                 PatLeaf<(f64 fpimm), [{
+      return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+// The VCVT to/from fixed-point instructions encode the 'fbits' operand
+// (the number of fixed bits) differently than it appears in the assembly
+// source. It's encoded as "Size - fbits" where Size is the size of the
+// fixed-point representation (32 or 16) and fbits is the value appearing
+// in the assembly source, an integer in [0,16] or (0,32], depending on size.
+def fbits32_asm_operand : AsmOperandClass { let Name = "FBits32"; }
+def fbits32 : Operand<i32> {
+  let PrintMethod = "printFBits32";
+  let ParserMatchClass = fbits32_asm_operand;
+}
+
+def fbits16_asm_operand : AsmOperandClass { let Name = "FBits16"; }
+def fbits16 : Operand<i32> {
+  let PrintMethod = "printFBits16";
+  let ParserMatchClass = fbits16_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
+                 IIC_fpLoad64, "vldr", "\t$Dd, $addr",
+                 [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>;
+
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
+                 IIC_fpLoad32, "vldr", "\t$Sd, $addr",
+                 [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+                 IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
+                 []>,
+            Requires<[HasFullFP16]>;
+
+} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+
+def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
+                 IIC_fpStore64, "vstr", "\t$Dd, $addr",
+                 [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>;
+
+def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
+                 IIC_fpStore32, "vstr", "\t$Sd, $addr",
+                 [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+                 IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
+                 []>,
+            Requires<[HasFullFP16]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass vfp_ldst_mult<string asm, bit L_bit,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // Double Precision
+  def DIA :
+    AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DIA_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DDB_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+
+  // Single Precision
+  def SIA :
+    AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SIA_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SDB_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+}
+
+let hasSideEffects = 0 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpStore_m, IIC_fpStore_mu>;
+
+} // hasSideEffects
+
+def : MnemonicAlias<"vldm", "vldmia">;
+def : MnemonicAlias<"vstm", "vstmia">;
+
+
+//===----------------------------------------------------------------------===//
+//  Lazy load / store multiple Instructions
+//
+let mayLoad = 1 in
+def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+                  IIC_fpLoad_m, "vlldm${p}\t$Rn", "", []>,
+            Requires<[HasV8MMainline, Has8MSecExt]> {
+    let Inst{24-23} = 0b00;
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;
+    let Inst{20}    = 1;
+    let Inst{15-12} = 0;
+    let Inst{7-0}   = 0;
+    let mayLoad     = 1;
+}
+
+let mayStore = 1 in
+def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
+                  IIC_fpStore_m, "vlstm${p}\t$Rn", "", []>,
+            Requires<[HasV8MMainline, Has8MSecExt]> {
+    let Inst{24-23} = 0b00;
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;
+    let Inst{20}    = 0;
+    let Inst{15-12} = 0;
+    let Inst{7-0}   = 0;
+    let mayStore    = 1;
+}
+
+
+// FLDM/FSTM - Load / Store multiple single / double precision registers for
+// pre-ARMv6 cores.
+// These instructions are deprecated!
+def : VFP2MnemonicAlias<"fldmias", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbs", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmeas", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfds", "vldmia">;
+def : VFP2MnemonicAlias<"fldmiad", "vldmia">;
+def : VFP2MnemonicAlias<"fldmdbd", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmead", "vldmdb">;
+def : VFP2MnemonicAlias<"fldmfdd", "vldmia">;
+
+def : VFP2MnemonicAlias<"fstmias", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbs", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmeas", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfds", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmiad", "vstmia">;
+def : VFP2MnemonicAlias<"fstmdbd", "vstmdb">;
+def : VFP2MnemonicAlias<"fstmead", "vstmia">;
+def : VFP2MnemonicAlias<"fstmfdd", "vstmdb">;
+
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r), 0>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r), 0>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r), 0>,
+                Requires<[HasVFP2]>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
+
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-armv6 cores.
+// These instruction are deprecated so we don't want them to get selected.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+  // Unknown precision
+  def XIA :
+    AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def XIA_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;         // Increment After
+    let Inst{21}    = 1;            // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def XDB_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;         // Decrement Before
+    let Inst{21}    = 1;            // Writeback
+    let Inst{20}    = L_bit;
+  }
+}
+
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
+
+def : VFP2MnemonicAlias<"fldmeax", "fldmdbx">;
+def : VFP2MnemonicAlias<"fldmfdx", "fldmiax">;
+
+def : VFP2MnemonicAlias<"fstmeax", "fstmiax">;
+def : VFP2MnemonicAlias<"fstmfdx", "fstmdbx">;
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VADDD  : ADbI<0b11100, 0b11, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDH  : AHbI<0b11100, 0b11, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
+                  []>;
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBH  : AHbI<0b11100, 0b11, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
+                  []>;
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVS  : ASbI<0b11101, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVH  : AHbI<0b11101, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
+                  []>;
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VMULD  : ADbI<0b11100, 0b10, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULH  : AHbI<0b11100, 0b10, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
+                  []>;
+
+def VNMULD : ADbI<0b11100, 0b10, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+
+def VNMULS : ASbI<0b11100, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNMULH : AHbI<0b11100, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
+                  []>;
+
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+      Uses = [CPSR], AddedComplexity = 4 in {
+    def H : AHbInp<0b11100, opc, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
+                   []>,
+                   Requires<[HasFullFP16]>;
+
+    def S : ASbInp<0b11100, opc, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11100, opc, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
+
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+    def H : AHbInp<0b11101, 0b00, opc,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
+                   []>,
+                   Requires<[HasFullFP16]>;
+
+    def S : ASbInp<0b11101, 0b00, opc,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11101, 0b00, opc,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>;
+
+// Match reassociated forms only if not sign dependent rounding.
+def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
+          (VNMULD DPR:$a, DPR:$b)>,
+          Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+          (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+// These are encoded as unary instructions.
+let Defs = [FPSCR_NZCV] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
+                  []>;
+
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
+                  [/* For disassembly only; pattern left blank */]>;
+
+def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
+                  [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VCMPH  : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
+                  []>;
+} // Defs = [FPSCR_NZCV]
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>;
+
+def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fabs SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VABSH  : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
+                   []>;
+
+let Defs = [FPSCR_NZCV] in {
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
+                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
+                   [(arm_cmpfp0 SPR:$Sd)]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
+                   []> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VCMPZH  : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
+                   []> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+} // Defs = [FPSCR_NZCV]
+
+def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+                   [(set DPR:$Dd, (fpextend SPR:$Sm))]> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Special case encoding: bits 11-8 is 0b1011.
+def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+                    IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+                    [(set SPR:$Sd, (fpround DPR:$Dm))]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = 0b11101;
+  let Inst{21-16} = 0b110111;
+  let Inst{11-8}  = 0b1011;
+  let Inst{7-6}   = 0b11;
+  let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Between half, single and double-precision.  For disassembly only.
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Requires<[HasFP16]>;
+
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Requires<[HasFP16]>;
+
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Requires<[HasFP16]>;
+
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>,
+                 Requires<[HasFP16]>;
+
+def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}     = Dm{3-0};
+  let Inst{5}       = Dm{4};
+  let Inst{15-12}   = Sd{4-1};
+  let Inst{22}      = Sd{0};
+}
+
+def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+}
+
+def : Pat<(fp_to_f16 SPR:$a),
+          (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
+def : Pat<(f16_to_fp GPR:$a),
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def : Pat<(f64 (f16_to_fp GPR:$a)),
+          (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+multiclass vcvt_inst<string opc, bits<2> rm,
+                     SDPatternOperator node = null_frag> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
+                    []>,
+                    Requires<[HasFullFP16]> {
+      let Inst{17-16} = rm;
+    }
+
+    def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
+                    []>,
+                    Requires<[HasFullFP16]> {
+      let Inst{17-16} = rm;
+    }
+
+    def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
+                    []>,
+                    Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
+                    []>,
+                    Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
+                    []>,
+                    Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0} = Dm{3-0};
+      let Inst{5}   = Dm{4};
+      let Inst{8} = 1;
+    }
+
+    def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
+                    []>,
+                    Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0}  = Dm{3-0};
+      let Inst{5}    = Dm{4};
+      let Inst{8} = 1;
+    }
+  }
+
+  let Predicates = [HasFPARMv8] in {
+    def : Pat<(i32 (fp_to_sint (node SPR:$a))),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>(NAME#"SS") SPR:$a),
+                GPR)>;
+    def : Pat<(i32 (fp_to_uint (node SPR:$a))),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>(NAME#"US") SPR:$a),
+                GPR)>;
+  }
+  let Predicates = [HasFPARMv8, HasDPVFP] in {
+    def : Pat<(i32 (fp_to_sint (node (f64 DPR:$a)))),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>(NAME#"SD") DPR:$a),
+                GPR)>;
+    def : Pat<(i32 (fp_to_uint (node (f64 DPR:$a)))),
+              (COPY_TO_REGCLASS
+                (!cast<Instruction>(NAME#"UD") DPR:$a),
+                GPR)>;
+  }
+}
+
+defm VCVTA : vcvt_inst<"a", 0b00, fround>;
+defm VCVTN : vcvt_inst<"n", 0b01>;
+defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
+defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
+
+def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>;
+
+def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fneg SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNEGH  : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
+                  []>;
+
+multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
+  def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
+               (outs SPR:$Sd), (ins SPR:$Sm),
+               NoItinerary, !strconcat("vrint", opc), ".f16\t$Sd, $Sm",
+               []>,
+               Requires<[HasFullFP16]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+
+  def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
+               (outs SPR:$Sd), (ins SPR:$Sm),
+               NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
+               [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+               Requires<[HasFPARMv8]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+  def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
+                (outs DPR:$Dd), (ins DPR:$Dm),
+                NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
+                [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+                Requires<[HasFPARMv8, HasDPVFP]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, "$p.f16.f16\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"H") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+        Requires<[HasFullFP16]>;
+  def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p), 0>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p), 0>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
+
+multiclass vrint_inst_anpm<string opc, bits<2> rm,
+                           SDPatternOperator node = null_frag> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def H : AHuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   NoItinerary, !strconcat("vrint", opc, ".f16\t$Sd, $Sm"),
+                   []>,
+                   Requires<[HasFullFP16]> {
+      let Inst{17-16} = rm;
+    }
+    def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
+                   [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+                   Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+    def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dm),
+                   NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
+                   [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]> {
+      let Inst{17-16} = rm;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm), 0>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTA : vrint_inst_anpm<"a", 0b00, fround>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
+
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
+                  [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+
+def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
+                  []>;
+
+let hasSideEffects = 0 in {
+def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
+
+def VMOVS  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+
+let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+def VMOVH  : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA16, "vmovx.f16\t$Sd, $Sm", []>,
+             Requires<[HasFullFP16]>;
+
+def VINSH  : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
+             Requires<[HasFullFP16]>;
+} // PostEncoderMethod
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies.  Int <-> FP Conversions.
+//
+
+def VMOVRS : AVConv2I<0b11100001, 0b1010,
+                      (outs GPR:$Rt), (ins SPR:$Sn),
+                      IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
+                      [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+  // Instruction operands.
+  bits<4> Rt;
+  bits<5> Sn;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+// Bitcast i32 -> f32.  NEON prefers to use VMOVDRR.
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+                      [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+             Requires<[HasVFP2, UseVMOVSR]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let hasSideEffects = 0 in {
+def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
+                        (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+                        IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
+                 [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Rt = EXTRACT_SUBREG $Dm, ssub_0
+  // $Rt2 = EXTRACT_SUBREG $Dm, ssub_1
+  let isExtractSubreg = 1;
+}
+
+def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
+                      (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+                 IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
+                 [/* For disassembly only; pattern left blank */]> {
+  bits<5> src1;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = src1{4-1};
+  let Inst{5}     = src1{0};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+  let DecoderMethod = "DecodeVMOVRRS";
+}
+} // hasSideEffects
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+                      (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+                      IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+                      [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6}   = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Dm = REG_SEQUENCE $Rt, ssub_0, $Rt2, ssub_1
+  let isRegSequence = 1;
+}
+
+// Hoist an fabs or a fneg of a value coming from integer registers
+// and do the fabs/fneg on the integer value. This is never a lose
+// and could enable the conversion to float to be removed completely.
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+          (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+      Requires<[IsARM, HasV6T2]>;
+def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+          (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>,
+      Requires<[IsThumb2, HasV6T2]>;
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+          (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>,
+      Requires<[IsARM]>;
+def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)),
+          (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>,
+      Requires<[IsThumb2]>;
+
+let hasSideEffects = 0 in
+def VMOVSRR : AVConv5I<0b11000100, 0b1010,
+                     (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
+                IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
+                [/* For disassembly only; pattern left blank */]> {
+  // Instruction operands.
+  bits<5> dst1;
+  bits<4> src1;
+  bits<4> src2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = dst1{4-1};
+  let Inst{5}     = dst1{0};
+  let Inst{15-12} = src1;
+  let Inst{19-16} = src2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  let DecoderMethod = "DecodeVMOVSRR";
+}
+
+// Move H->R, clearing top 16 bits
+def VMOVRH : AVConv2I<0b11100001, 0b1001,
+                      (outs GPR:$Rt), (ins SPR:$Sn),
+                      IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
+                      []>,
+             Requires<[HasFullFP16]> {
+  // Instruction operands.
+  bits<4> Rt;
+  bits<5> Sn;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+}
+
+// Move R->H, clearing top 16 bits
+def VMOVHR : AVConv4I<0b11100000, 0b1001,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
+                      []>,
+             Requires<[HasFullFP16]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+}
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX:  SPR system reg -> GPR
+// FMSRR: GPR -> SPR
+// FMXR:  GPR -> VFP system reg
+
+
+// Int -> FP:
+
+class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,InstrItinClass itin,
+                         string opc, string asm, list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Predicates = [HasFullFP16];
+}
+
+def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
+                               []> {
+  let Inst{7} = 1; // s32
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+  def : VFPPat<(f64 (sint_to_fp GPR:$a)),
+               (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+  def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+               (VSITOD (VLDRS addrmode5:$a))>;
+}
+
+def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd),(ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
+                                []> {
+  let Inst{7} = 1; // s32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)),
+                   (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+                   (VSITOS (VLDRS addrmode5:$a))>;
+
+def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+                               (outs SPR:$Sd), (ins SPR:$Sm),
+                               IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
+                               []> {
+  let Inst{7} = 1; // s32
+}
+
+def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
+                               []> {
+  let Inst{7} = 0; // u32
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+  def : VFPPat<(f64 (uint_to_fp GPR:$a)),
+               (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+  def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+               (VUITOD (VLDRS addrmode5:$a))>;
+}
+
+def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd), (ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
+                                []> {
+  let Inst{7} = 0; // u32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)),
+                   (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
+                   (VUITOS (VLDRS addrmode5:$a))>;
+
+def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
+                                (outs SPR:$Sd), (ins SPR:$Sm),
+                                IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
+                                []> {
+  let Inst{7} = 0; // u32
+}
+
+// FP -> Int:
+
+class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,
+                         InstrItinClass itin, string opc, string asm,
+                         list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,
+                         InstrItinClass itin, string opc, string asm,
+                         list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Predicates = [HasFullFP16];
+}
+
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
+                                []> {
+  let Inst{7} = 1; // Z bit
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+  def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))),
+               (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>;
+
+  def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr),
+               (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>;
+}
+
+def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)),
+                   (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>;
+
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
+                                   addrmode5:$ptr),
+                   (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
+
+def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                               (outs SPR:$Sd), (ins DPR:$Dm),
+                               IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
+                               []> {
+  let Inst{7} = 1; // Z bit
+}
+
+let Predicates=[HasVFP2, HasDPVFP] in {
+  def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))),
+               (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>;
+
+  def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr),
+               (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>;
+}
+
+def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)),
+                   (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>;
+
+def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
+                                   addrmode5:$ptr),
+                  (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
+
+def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 1; // Z bit
+}
+
+// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
+let Uses = [FPSCR] in {
+// FIXME: Verify encoding after integrated assembler is working.
+def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
+                                 []> {
+  let Inst{7} = 0; // Z bit
+}
+}
+
+// Convert between floating-point and fixed-point
+// Data type for fixed-point naming convention:
+//   S16 (U=0, sx=0) -> SH
+//   U16 (U=1, sx=0) -> UH
+//   S32 (U=0, sx=1) -> SL
+//   U32 (U=1, sx=1) -> UL
+
+let Constraints = "$a = $dst" in {
+
+// FP to Fixed-Point:
+
+// Single Precision register
+class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+  Sched<[WriteCvtFP]> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{0};
+  let Inst{15-12} = dst{4-1};
+}
+
+// Double Precision register
+class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+    Sched<[WriteCvtFP]> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{4};
+  let Inst{15-12} = dst{3-0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+
+def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+
+def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+
+def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+
+// Fixed-Point to FP:
+
+def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
+             Requires<[HasFullFP16]>;
+
+def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+
+def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+
+def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+
+def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+
+} // End of 'let Constraints = "$a = $dst" in'
+
+//===----------------------------------------------------------------------===//
+// FP Multiply-Accumulate Operations.
+//
+
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VMLAH : AHbI<0b11100, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VMLSH : AHbI<0b11100, 0b00, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
+             []>,
+                         RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+//===----------------------------------------------------------------------===//
+// Fused FP Multiply-Accumulate Operations.
+//
+def VFMAD : ADbI<0b11101, 0b10, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def VFMAH : AHbI<0b11101, 0b10, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma x, y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+          (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+          (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFMSD : ADbI<0b11101, 0b10, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def VFMSH : AHbI<0b11101, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
+                  []>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fneg (fma x, y, z)) -> (vfnma z, x, y)
+def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+
+def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                  Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
+             []>,
+                         RegConstraint<"$Sdin = $Sd">,
+                  Requires<[HasFullFP16,UseFusedMAC]>;
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+
+// (fma x, y, (fneg z)) -> (vfnms z, x, y))
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4,HasDPVFP]>;
+def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+let hasSideEffects = 0 in {
+def VMOVDcc  : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+                    IIC_fpUNA64,
+                    [(set (f64 DPR:$Dd),
+                          (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+                    IIC_fpUNA32,
+                    [(set (f32 SPR:$Sd),
+                          (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
+} // hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Move from VFP System Register to ARM core register.
+//
+
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+                 list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> Rt;
+
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
+}
+
+// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
+// to APSR.
+let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+                        "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
+let hasSideEffects = 1, Uses = [FPSCR] in
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+                      "vmrs", "\t$Rt, fpscr",
+                      [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+  def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpexc", []>;
+  def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpsid", []>;
+  def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr0", []>;
+  def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr1", []>;
+  def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
+  def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpinst", []>;
+  def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+                                "vmrs", "\t$Rt, fpinst2", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+               list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> src;
+
+  // Encode instruction operand.
+  let Inst{15-12} = src;
+
+  let Inst{27-20} = 0b11101110;
+  let Inst{19-16} = opc19_16;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+
+let Defs = [FPSCR] in {
+  // Application level GPR -> FPSCR
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+  // System level GPR -> FPEXC
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpexc, $src", []>;
+  // System level GPR -> FPSID
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpsid, $src", []>;
+
+  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+                              "vmsr", "\tfpinst, $src", []>;
+  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+                                "vmsr", "\tfpinst2, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
+                    VFPMiscFrm, IIC_fpUNA64,
+                    "vmov", ".f64\t$Dd, $imm",
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+              Requires<[HasVFP3,HasDPVFP]> {
+  bits<5> Dd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Dd{4};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+                     VFPMiscFrm, IIC_fpUNA32,
+                     "vmov", ".f32\t$Sd, $imm",
+                     [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  bits<5> Sd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+
+def FCONSTH : VFPAI<(outs SPR:$Sd), (ins vfp_f16imm:$imm),
+                     VFPMiscFrm, IIC_fpUNA16,
+                     "vmov", ".f16\t$Sd, $imm",
+                     []>, Requires<[HasFullFP16]> {
+  bits<5> Sd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{11-8}  = 0b1001;     // Half precision
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+// A few mnemonic aliases for pre-unifixed syntax. We don't guarantee to
+// support them all, but supporting at least some of the basics is
+// good to be friendly.
+def : VFP2MnemonicAlias<"flds", "vldr">;
+def : VFP2MnemonicAlias<"fldd", "vldr">;
+def : VFP2MnemonicAlias<"fmrs", "vmov">;
+def : VFP2MnemonicAlias<"fmsr", "vmov">;
+def : VFP2MnemonicAlias<"fsqrts", "vsqrt">;
+def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">;
+def : VFP2MnemonicAlias<"fadds", "vadd.f32">;
+def : VFP2MnemonicAlias<"faddd", "vadd.f64">;
+def : VFP2MnemonicAlias<"fmrdd", "vmov">;
+def : VFP2MnemonicAlias<"fmrds", "vmov">;
+def : VFP2MnemonicAlias<"fmrrd", "vmov">;
+def : VFP2MnemonicAlias<"fmdrr", "vmov">;
+def : VFP2MnemonicAlias<"fmuls", "vmul.f32">;
+def : VFP2MnemonicAlias<"fmuld", "vmul.f64">;
+def : VFP2MnemonicAlias<"fnegs", "vneg.f32">;
+def : VFP2MnemonicAlias<"fnegd", "vneg.f64">;
+def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">;
+def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">;
+def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">;
+def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">;
+def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">;
+def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">;
+def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">;
+def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">;
+def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">;
+def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">;
+def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">;
+def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">;
+def : VFP2MnemonicAlias<"fsts", "vstr">;
+def : VFP2MnemonicAlias<"fstd", "vstr">;
+def : VFP2MnemonicAlias<"fmacd", "vmla.f64">;
+def : VFP2MnemonicAlias<"fmacs", "vmla.f32">;
+def : VFP2MnemonicAlias<"fcpys", "vmov.f32">;
+def : VFP2MnemonicAlias<"fcpyd", "vmov.f64">;
+def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">;
+def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">;
+def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">;
+def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">;
+def : VFP2MnemonicAlias<"fmrx", "vmrs">;
+def : VFP2MnemonicAlias<"fmxr", "vmsr">;
+
+// Be friendly and accept the old form of zero-compare
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
+
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
+                    (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                      (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
+                    (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                      (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+
+// No need for the size suffix on VSQRT. It's implied by the register classes.
+def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// VLDR/VSTR accept an optional type suffix.
+def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
+                    (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr",
+                    (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
+                    (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
+                    (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+
+// VMOV can accept optional 32-bit or less data type suffix suffix.
+def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.8 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+
+def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn",
+                    (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
+                    (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>;
+
+// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way
+// VMOVD does.
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
+                    (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
+
+// FCONSTD/FCONSTS alias for vmov.f64/vmov.f32
+// These aliases provide added functionality over vmov.f instructions by
+// allowing users to write assembly containing encoded floating point constants
+// (e.g. #0x70 vs #1.0).  Without these alises there is no way for the
+// assembler to accept encoded fp constants (but the equivalent fp-literal is
+// accepted directly by vmovf).
+def : VFP3InstAlias<"fconstd${p} $Dd, $val",
+                    (FCONSTD DPR:$Dd, vfp_f64imm:$val, pred:$p)>;
+def : VFP3InstAlias<"fconsts${p} $Sd, $val",
+                    (FCONSTS SPR:$Sd, vfp_f32imm:$val, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
new file mode 100644
index 000000000000..2bdbe4fca3de
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -0,0 +1,109 @@
+//===- ARMInstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstructionSelector.h"
+#include "ARMRegisterBankInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "arm-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+ARMInstructionSelector::ARMInstructionSelector(const ARMSubtarget &STI,
+                                               const ARMRegisterBankInfo &RBI)
+    : InstructionSelector(), TII(*STI.getInstrInfo()),
+      TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                       const RegisterBankInfo &RBI) {
+  unsigned DstReg = I.getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+    return true;
+
+  const RegisterBank *RegBank = RBI.getRegBank(DstReg, MRI, TRI);
+  (void)RegBank;
+  assert(RegBank && "Can't get reg bank for virtual register");
+
+  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+  (void)DstSize;
+  unsigned SrcReg = I.getOperand(1).getReg();
+  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+  (void)SrcSize;
+  assert((DstSize == SrcSize ||
+          // Copies are a means to setup initial types, the number of
+          // bits may not exactly match.
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           DstSize <= SrcSize)) &&
+         "Copy with different width?!");
+
+  assert(RegBank->getID() == ARM::GPRRegBankID && "Unsupported reg bank");
+  const TargetRegisterClass *RC = &ARM::GPRRegClass;
+
+  // No need to constrain SrcReg. It will get constrained when
+  // we hit another of its uses or its defs.
+  // Copies do not have constraints.
+  if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+                 << " operand\n");
+    return false;
+  }
+  return true;
+}
+
+bool ARMInstructionSelector::select(MachineInstr &I) const {
+  assert(I.getParent() && "Instruction should be in a basic block!");
+  assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+  auto &MBB = *I.getParent();
+  auto &MF = *MBB.getParent();
+  auto &MRI = MF.getRegInfo();
+
+  if (!isPreISelGenericOpcode(I.getOpcode())) {
+    if (I.isCopy())
+      return selectCopy(I, TII, MRI, TRI, RBI);
+
+    return true;
+  }
+
+  MachineInstrBuilder MIB{MF, I};
+
+  using namespace TargetOpcode;
+  switch (I.getOpcode()) {
+  case G_ADD:
+    I.setDesc(TII.get(ARM::ADDrr));
+    AddDefaultCC(AddDefaultPred(MIB));
+    break;
+  case G_FRAME_INDEX:
+    // Add 0 to the given frame index and hope it will eventually be folded into
+    // the user(s).
+    I.setDesc(TII.get(ARM::ADDri));
+    AddDefaultCC(AddDefaultPred(MIB.addImm(0)));
+    break;
+  case G_LOAD:
+    I.setDesc(TII.get(ARM::LDRi12));
+    AddDefaultPred(MIB.addImm(0));
+    break;
+  default:
+    return false;
+  }
+
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
new file mode 100644
index 000000000000..5072cdd60ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
@@ -0,0 +1,39 @@
+//===- ARMInstructionSelector ------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for ARM.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMBaseTargetMachine;
+class ARMRegisterBankInfo;
+class ARMSubtarget;
+
+class ARMInstructionSelector : public InstructionSelector {
+public:
+  ARMInstructionSelector(const ARMSubtarget &STI,
+                         const ARMRegisterBankInfo &RBI);
+
+  virtual bool select(MachineInstr &I) const override;
+
+private:
+  const ARMBaseInstrInfo &TII;
+  const ARMBaseRegisterInfo &TRI;
+  const ARMRegisterBankInfo &RBI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
new file mode 100644
index 000000000000..255ea4bc7198
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -0,0 +1,44 @@
+//===- ARMLegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the Machinelegalizer class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMLegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+ARMLegalizerInfo::ARMLegalizerInfo() {
+  using namespace TargetOpcode;
+
+  const LLT p0 = LLT::pointer(0, 32);
+
+  const LLT s8 = LLT::scalar(8);
+  const LLT s16 = LLT::scalar(16);
+  const LLT s32 = LLT::scalar(32);
+
+  setAction({G_FRAME_INDEX, p0}, Legal);
+
+  setAction({G_LOAD, s32}, Legal);
+  setAction({G_LOAD, 1, p0}, Legal);
+
+  for (auto Ty : {s8, s16, s32})
+    setAction({G_ADD, Ty}, Legal);
+
+  computeTables();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
new file mode 100644
index 000000000000..ca3eea81271b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -0,0 +1,29 @@
+//===- ARMLegalizerInfo ------------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the Machinelegalizer class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the information for the target register banks.
+class ARMLegalizerInfo : public LegalizerInfo {
+public:
+  ARMLegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..48ab491b5be9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,2389 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a pass that performs load / store related peephole
+/// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-ldst-opt"
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
+
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
+#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
+
+namespace {
+  /// Post- register allocation pass the combine load / store instructions to
+  /// form ldm / stm instructions.
+  struct ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const MachineFunction *MF;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    const TargetLowering *TL;
+    ARMFunctionInfo *AFI;
+    LivePhysRegs LiveRegs;
+    RegisterClassInfo RegClassInfo;
+    MachineBasicBlock::const_iterator LiveRegPos;
+    bool LiveRegsValid;
+    bool RegClassInfoValid;
+    bool isThumb1, isThumb2;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override { return ARM_LOAD_STORE_OPT_NAME; }
+
+  private:
+    /// A set of load/store MachineInstrs with same base register sorted by
+    /// offset.
+    struct MemOpQueueEntry {
+      MachineInstr *MI;
+      int Offset;        ///< Load/Store offset.
+      unsigned Position; ///< Position as counted from end of basic block.
+      MemOpQueueEntry(MachineInstr &MI, int Offset, unsigned Position)
+          : MI(&MI), Offset(Offset), Position(Position) {}
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+
+    /// A set of MachineInstrs that fulfill (nearly all) conditions to get
+    /// merged into a LDM/STM.
+    struct MergeCandidate {
+      /// List of instructions ordered by load/store offset.
+      SmallVector<MachineInstr*, 4> Instrs;
+      /// Index in Instrs of the instruction being latest in the schedule.
+      unsigned LatestMIIdx;
+      /// Index in Instrs of the instruction being earliest in the schedule.
+      unsigned EarliestMIIdx;
+      /// Index into the basic block where the merged instruction will be
+      /// inserted. (See MemOpQueueEntry.Position)
+      unsigned InsertPos;
+      /// Whether the instructions can be merged into a ldm/stm instruction.
+      bool CanMergeToLSMulti;
+      /// Whether the instructions can be merged into a ldrd/strd instruction.
+      bool CanMergeToLSDouble;
+    };
+    SpecificBumpPtrAllocator<MergeCandidate> Allocator;
+    SmallVector<const MergeCandidate*,4> Candidates;
+    SmallVector<MachineInstr*,4> MergeBaseCandidates;
+
+    void moveLiveRegsBefore(const MachineBasicBlock &MBB,
+                            MachineBasicBlock::const_iterator Before);
+    unsigned findFreeReg(const TargetRegisterClass &RegClass);
+    void UpdateBaseRegUses(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                           unsigned Base, unsigned WordOffset,
+                           ARMCC::CondCodes Pred, unsigned PredReg);
+    MachineInstr *CreateLoadStoreMulti(
+        MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+        int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+        ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+        ArrayRef<std::pair<unsigned, bool>> Regs);
+    MachineInstr *CreateLoadStoreDouble(
+        MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+        int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+        ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+        ArrayRef<std::pair<unsigned, bool>> Regs) const;
+    void FormCandidates(const MemOpQueue &MemOps);
+    MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
+    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI);
+    bool MergeBaseUpdateLoadStore(MachineInstr *MI);
+    bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
+    bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+    bool CombineMovBx(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+INITIALIZE_PASS(ARMLoadStoreOpt, "arm-ldst-opt", ARM_LOAD_STORE_OPT_NAME, false,
+                false)
+
+static bool definesCPSR(const MachineInstr &MI) {
+  for (const auto &MO : MI.operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If the instruction has live CPSR def, then it's not safe to fold it
+      // into load / store.
+      return true;
+  }
+
+  return false;
+}
+
+static int getMemoryOpOffset(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
+  unsigned NumOperands = MI.getDesc().getNumOperands();
+  unsigned OffField = MI.getOperand(NumOperands - 3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
+      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
+    return OffField;
+
+  // Thumb1 immediate offsets are scaled by 4
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
+      Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
+    return OffField * 4;
+
+  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
+    : ARM_AM::getAM5Offset(OffField) * 4;
+  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
+    : ARM_AM::getAM5Op(OffField);
+
+  if (Op == ARM_AM::sub)
+    return -Offset;
+
+  return Offset;
+}
+
+static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
+  return MI.getOperand(1);
+}
+
+static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
+  return MI.getOperand(0);
+}
+
+static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA;
+    case ARM_AM::da: return ARM::LDMDA;
+    case ARM_AM::db: return ARM::LDMDB;
+    case ARM_AM::ib: return ARM::LDMIB;
+    }
+  case ARM::STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA;
+    case ARM_AM::da: return ARM::STMDA;
+    case ARM_AM::db: return ARM::STMDB;
+    case ARM_AM::ib: return ARM::STMIB;
+    }
+  case ARM::tLDRi:
+  case ARM::tLDRspi:
+    // tLDMIA is writeback-only - unless the base register is in the input
+    // reglist.
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tLDMIA;
+    }
+  case ARM::tSTRi:
+  case ARM::tSTRspi:
+    // There is no non-writeback tSTMIA either.
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tSTMIA_UPD;
+    }
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA;
+    case ARM_AM::db: return ARM::t2LDMDB;
+    }
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA;
+    case ARM_AM::db: return ARM::t2STMDB;
+    }
+  case ARM::VLDRS:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA;
+    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
+    }
+  case ARM::VSTRS:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA;
+    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
+    }
+  case ARM::VLDRD:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA;
+    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
+    }
+  case ARM::VSTRD:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA;
+    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
+    }
+  }
+}
+
+static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMIA_UPD:
+  case ARM::STMIA:
+  case ARM::STMIA_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMIA_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+    return ARM_AM::ia;
+
+  case ARM::LDMDA:
+  case ARM::LDMDA_UPD:
+  case ARM::STMDA:
+  case ARM::STMDA_UPD:
+    return ARM_AM::da;
+
+  case ARM::LDMDB:
+  case ARM::LDMDB_UPD:
+  case ARM::STMDB:
+  case ARM::STMDB_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMDB:
+  case ARM::t2STMDB_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    return ARM_AM::db;
+
+  case ARM::LDMIB:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIB:
+  case ARM::STMIB_UPD:
+    return ARM_AM::ib;
+  }
+}
+
+static bool isT1i32Load(unsigned Opc) {
+  return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
+}
+
+static bool isT2i32Load(unsigned Opc) {
+  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+static bool isi32Load(unsigned Opc) {
+  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
+}
+
+static bool isT1i32Store(unsigned Opc) {
+  return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
+}
+
+static bool isT2i32Store(unsigned Opc) {
+  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+static bool isi32Store(unsigned Opc) {
+  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
+}
+
+static bool isLoadSingle(unsigned Opc) {
+  return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+}
+
+static unsigned getImmScale(unsigned Opc) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+    return 1;
+  case ARM::tLDRHi:
+  case ARM::tSTRHi:
+    return 2;
+  case ARM::tLDRBi:
+  case ARM::tSTRBi:
+    return 4;
+  }
+}
+
+static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
+/// Update future uses of the base register with the offset introduced
+/// due to writeback. This function only works on Thumb1.
+void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const DebugLoc &DL, unsigned Base,
+                                        unsigned WordOffset,
+                                        ARMCC::CondCodes Pred,
+                                        unsigned PredReg) {
+  assert(isThumb1 && "Can only update base register uses for Thumb1!");
+  // Start updating any instructions with immediate offsets. Insert a SUB before
+  // the first non-updateable instruction (if any).
+  for (; MBBI != MBB.end(); ++MBBI) {
+    bool InsertSub = false;
+    unsigned Opc = MBBI->getOpcode();
+
+    if (MBBI->readsRegister(Base)) {
+      int Offset;
+      bool IsLoad =
+        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
+      bool IsStore =
+        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;
+
+      if (IsLoad || IsStore) {
+        // Loads and stores with immediate offsets can be updated, but only if
+        // the new offset isn't negative.
+        // The MachineOperand containing the offset immediate is the last one
+        // before predicates.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
+        Offset = MO.getImm() - WordOffset * getImmScale(Opc);
+
+        // If storing the base register, it needs to be reset first.
+        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();
+
+        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
+          MO.setImm(Offset);
+        else
+          InsertSub = true;
+
+      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
+                 !definesCPSR(*MBBI)) {
+        // SUBS/ADDS using this register, with a dead def of the CPSR.
+        // Merge it with the update; if the merged offset is too large,
+        // insert a new sub instead.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        Offset = (Opc == ARM::tSUBi8) ?
+          MO.getImm() + WordOffset * 4 :
+          MO.getImm() - WordOffset * 4 ;
+        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
+          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
+          // Offset == 0.
+          MO.setImm(Offset);
+          // The base register has now been reset, so exit early.
+          return;
+        } else {
+          InsertSub = true;
+        }
+
+      } else {
+        // Can't update the instruction.
+        InsertSub = true;
+      }
+
+    } else if (definesCPSR(*MBBI) || MBBI->isCall() || MBBI->isBranch()) {
+      // Since SUBS sets the condition flags, we can't place the base reset
+      // after an instruction that has a live CPSR def.
+      // The base register might also contain an argument for a function call.
+      InsertSub = true;
+    }
+
+    if (InsertSub) {
+      // An instruction above couldn't be updated, so insert a sub.
+      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
+        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+      return;
+    }
+
+    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
+      // Register got killed. Stop updating.
+      return;
+  }
+
+  // End of block was reached.
+  if (MBB.succ_size() > 0) {
+    // FIXME: Because of a bug, live registers are sometimes missing from
+    // the successor blocks' live-in sets. This means we can't trust that
+    // information and *always* have to reset at the end of a block.
+    // See PR21029.
+    if (MBBI != MBB.end()) --MBBI;
+    AddDefaultT1CC(
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
+      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+  }
+}
+
+/// Return the first register of class \p RegClass that is not in \p Regs.
+unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
+  if (!RegClassInfoValid) {
+    RegClassInfo.runOnMachineFunction(*MF);
+    RegClassInfoValid = true;
+  }
+
+  for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
+    if (!LiveRegs.contains(Reg))
+      return Reg;
+  return 0;
+}
+
+/// Compute live registers just before instruction \p Before (in normal schedule
+/// direction). Computes backwards so multiple queries in the same block must
+/// come in reverse order.
+void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
+    MachineBasicBlock::const_iterator Before) {
+  // Initialize if we never queried in this block.
+  if (!LiveRegsValid) {
+    LiveRegs.init(*TRI);
+    LiveRegs.addLiveOuts(MBB);
+    LiveRegPos = MBB.end();
+    LiveRegsValid = true;
+  }
+  // Move backward just before the "Before" position.
+  while (LiveRegPos != Before) {
+    --LiveRegPos;
+    LiveRegs.stepBackward(*LiveRegPos);
+  }
+}
+
+static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
+                        unsigned Reg) {
+  for (const std::pair<unsigned, bool> &R : Regs)
+    if (R.first == Reg)
+      return true;
+  return false;
+}
+
+/// Create and insert a LDM or STM with Base as base register and registers in
+/// Regs as the register operands that would be loaded / stored.  It returns
+/// true if the transformation is done.
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+    ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+    ArrayRef<std::pair<unsigned, bool>> Regs) {
+  unsigned NumRegs = Regs.size();
+  assert(NumRegs > 1);
+
+  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
+  // Compute liveness information for that register to make the decision.
+  bool SafeToClobberCPSR = !isThumb1 ||
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
+     MachineBasicBlock::LQR_Dead);
+
+  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+  // Exception: If the base register is in the input reglist, Thumb1 LDM is
+  // non-writeback.
+  // It's also not possible to merge an STR of the base register in Thumb1.
+  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
+    if (Opcode == ARM::tLDRi) {
+      Writeback = false;
+    } else if (Opcode == ARM::tSTRi) {
+      return nullptr;
+    }
+  }
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
+
+  if (Offset == 4 && haveIBAndDA) {
+    Mode = ARM_AM::ib;
+  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
+    Mode = ARM_AM::da;
+  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
+    // VLDM/VSTM do not support DB mode without also updating the base reg.
+    Mode = ARM_AM::db;
+  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+    // Check if this is a supported opcode before inserting instructions to
+    // calculate a new base register.
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;
+
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return nullptr;
+
+    // On Thumb1, it's not worth materializing a new base register without
+    // clobbering the CPSR (i.e. not using ADDS/SUBS).
+    if (!SafeToClobberCPSR)
+      return nullptr;
+
+    unsigned NewBase;
+    if (isi32Load(Opcode)) {
+      // If it is a load, then just use one of the destination registers
+      // as the new base. Will no longer be writeback in Thumb1.
+      NewBase = Regs[NumRegs-1].first;
+      Writeback = false;
+    } else {
+      // Find a free register that we can use as scratch register.
+      moveLiveRegsBefore(MBB, InsertBefore);
+      // The merged instruction does not exist yet but will use several Regs if
+      // it is a Store.
+      if (!isLoadSingle(Opcode))
+        for (const std::pair<unsigned, bool> &R : Regs)
+          LiveRegs.addReg(R.first);
+
+      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);
+      if (NewBase == 0)
+        return nullptr;
+    }
+
+    int BaseOpc =
+      isThumb2 ? ARM::t2ADDri :
+      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
+      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
+      isThumb1 ? ARM::tADDi8  : ARM::ADDri;
+
+    if (Offset < 0) {
+      Offset = - Offset;
+      BaseOpc =
+        isThumb2 ? ARM::t2SUBri :
+        (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
+        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
+    }
+
+    if (!TL->isLegalAddImmediate(Offset))
+      // FIXME: Try add with register operand?
+      return nullptr; // Probably not worth it then.
+
+    // We can only append a kill flag to the add/sub input if the value is not
+    // used in the register list of the stm as well.
+    bool KillOldBase = BaseKill &&
+      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));
+
+    if (isThumb1) {
+      // Thumb1: depending on immediate size, use either
+      //   ADDS NewBase, Base, #imm3
+      // or
+      //   MOV  NewBase, Base
+      //   ADDS NewBase, #imm8.
+      if (Base != NewBase &&
+          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
+        // Need to insert a MOV to the new base first.
+        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
+            !STI->hasV6Ops()) {
+          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
+          if (Pred != ARMCC::AL)
+            return nullptr;
+          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
+            .addReg(Base, getKillRegState(KillOldBase));
+        } else
+          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
+            .addReg(Base, getKillRegState(KillOldBase))
+            .addImm(Pred).addReg(PredReg);
+
+        // The following ADDS/SUBS becomes an update.
+        Base = NewBase;
+        KillOldBase = true;
+      }
+      if (BaseOpc == ARM::tADDrSPi) {
+        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
+        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
+          .addImm(Pred).addReg(PredReg);
+      } else
+        AddDefaultT1CC(
+          BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
+          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
+          .addImm(Pred).addReg(PredReg);
+    } else {
+      BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg).addReg(0);
+    }
+    Base = NewBase;
+    BaseKill = true; // New base is always killed straight away.
+  }
+
+  bool isDef = isLoadSingle(Opcode);
+
+  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
+  // base register writeback.
+  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
+  if (!Opcode)
+    return nullptr;
+
+  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
+  // - There is no writeback (LDM of base register),
+  // - the base register is killed by the merged instruction,
+  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
+  //   to reset the base register.
+  // Otherwise, don't merge.
+  // It's safe to return here since the code to materialize a new base register
+  // above is also conditional on SafeToClobberCPSR.
+  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
+    return nullptr;
+
+  MachineInstrBuilder MIB;
+
+  if (Writeback) {
+    assert(isThumb1 && "expected Writeback only inThumb1");
+    if (Opcode == ARM::tLDMIA) {
+      assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs");
+      // Update tLDMIA with writeback if necessary.
+      Opcode = ARM::tLDMIA_UPD;
+    }
+
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+
+    // Thumb1: we might need to set base writeback when building the MI.
+    MIB.addReg(Base, getDefRegState(true))
+       .addReg(Base, getKillRegState(BaseKill));
+
+    // The base isn't dead after a merged instruction with writeback.
+    // Insert a sub instruction after the newly formed instruction to reset.
+    if (!BaseKill)
+      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);
+
+  } else {
+    // No writeback, simply build the MachineInstr.
+    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
+    MIB.addReg(Base, getKillRegState(BaseKill));
+  }
+
+  MIB.addImm(Pred).addReg(PredReg);
+
+  for (const std::pair<unsigned, bool> &R : Regs)
+    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));
+
+  return MIB.getInstr();
+}
+
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    int Offset, unsigned Base, bool BaseKill, unsigned Opcode,
+    ARMCC::CondCodes Pred, unsigned PredReg, const DebugLoc &DL,
+    ArrayRef<std::pair<unsigned, bool>> Regs) const {
+  bool IsLoad = isi32Load(Opcode);
+  assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
+  unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+  assert(Regs.size() == 2);
+  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+                                    TII->get(LoadStoreOpcode));
+  if (IsLoad) {
+    MIB.addReg(Regs[0].first, RegState::Define)
+       .addReg(Regs[1].first, RegState::Define);
+  } else {
+    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
+  }
+  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  return MIB.getInstr();
+}
+
+/// Call MergeOps and update MemOps and merges accordingly on success.
+MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
+  const MachineInstr *First = Cand.Instrs.front();
+  unsigned Opcode = First->getOpcode();
+  bool IsLoad = isLoadSingle(Opcode);
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 4> ImpDefs;
+  DenseSet<unsigned> KilledRegs;
+  DenseSet<unsigned> UsedRegs;
+  // Determine list of registers and list of implicit super-register defs.
+  for (const MachineInstr *MI : Cand.Instrs) {
+    const MachineOperand &MO = getLoadStoreRegOp(*MI);
+    unsigned Reg = MO.getReg();
+    bool IsKill = MO.isKill();
+    if (IsKill)
+      KilledRegs.insert(Reg);
+    Regs.push_back(std::make_pair(Reg, IsKill));
+    UsedRegs.insert(Reg);
+
+    if (IsLoad) {
+      // Collect any implicit defs of super-registers, after merging we can't
+      // be sure anymore that we properly preserved these live ranges and must
+      // removed these implicit operands.
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef() || MO.isDead())
+          continue;
+        assert(MO.isImplicit());
+        unsigned DefReg = MO.getReg();
+
+        if (is_contained(ImpDefs, DefReg))
+          continue;
+        // We can ignore cases where the super-reg is read and written.
+        if (MI->readsRegister(DefReg))
+          continue;
+        ImpDefs.push_back(DefReg);
+      }
+    }
+  }
+
+  // Attempt the merge.
+  typedef MachineBasicBlock::iterator iterator;
+  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
+  iterator InsertBefore = std::next(iterator(LatestMI));
+  MachineBasicBlock &MBB = *LatestMI->getParent();
+  unsigned Offset = getMemoryOpOffset(*First);
+  unsigned Base = getLoadStoreBaseOp(*First).getReg();
+  bool BaseKill = LatestMI->killsRegister(Base);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
+  DebugLoc DL = First->getDebugLoc();
+  MachineInstr *Merged = nullptr;
+  if (Cand.CanMergeToLSDouble)
+    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+                                   Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged && Cand.CanMergeToLSMulti)
+    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
+                                  Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged)
+    return nullptr;
+
+  // Determine earliest instruction that will get removed. We then keep an
+  // iterator just above it so the following erases don't invalidated it.
+  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
+  bool EarliestAtBegin = false;
+  if (EarliestI == MBB.begin()) {
+    EarliestAtBegin = true;
+  } else {
+    EarliestI = std::prev(EarliestI);
+  }
+
+  // Remove instructions which have been merged.
+  for (MachineInstr *MI : Cand.Instrs)
+    MBB.erase(MI);
+
+  // Determine range between the earliest removed instruction and the new one.
+  if (EarliestAtBegin)
+    EarliestI = MBB.begin();
+  else
+    EarliestI = std::next(EarliestI);
+  auto FixupRange = make_range(EarliestI, iterator(Merged));
+
+  if (isLoadSingle(Opcode)) {
+    // If the previous loads defined a super-reg, then we have to mark earlier
+    // operands undef; Replicate the super-reg def on the merged instruction.
+    for (MachineInstr &MI : FixupRange) {
+      for (unsigned &ImpDefReg : ImpDefs) {
+        for (MachineOperand &MO : MI.implicit_operands()) {
+          if (!MO.isReg() || MO.getReg() != ImpDefReg)
+            continue;
+          if (MO.readsReg())
+            MO.setIsUndef();
+          else if (MO.isDef())
+            ImpDefReg = 0;
+        }
+      }
+    }
+
+    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
+    for (unsigned ImpDef : ImpDefs)
+      MIB.addReg(ImpDef, RegState::ImplicitDefine);
+  } else {
+    // Remove kill flags: We are possibly storing the values later now.
+    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
+    for (MachineInstr &MI : FixupRange) {
+      for (MachineOperand &MO : MI.uses()) {
+        if (!MO.isReg() || !MO.isKill())
+          continue;
+        if (UsedRegs.count(MO.getReg()))
+          MO.setIsKill(false);
+      }
+    }
+    assert(ImpDefs.empty());
+  }
+
+  return Merged;
+}
+
+static bool isValidLSDoubleOffset(int Offset) {
+  unsigned Value = abs(Offset);
+  // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally
+  // multiplied by 4.
+  return (Value % 4) == 0 && Value < 1024;
+}
+
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+                                 const MachineInstr &MI) {
+  // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
+  // difference.
+  unsigned Opcode = MI.getOpcode();
+  if (!isi32Load(Opcode) && !isi32Store(Opcode))
+    return true;
+
+  // Stack pointer alignment is out of the programmers control so we can trust
+  // SP-relative loads/stores.
+  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+    return true;
+  return false;
+}
+
+/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
+void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
+  const MachineInstr *FirstMI = MemOps[0].MI;
+  unsigned Opcode = FirstMI->getOpcode();
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  unsigned Size = getLSMultipleTransferSize(FirstMI);
+
+  unsigned SIndex = 0;
+  unsigned EIndex = MemOps.size();
+  do {
+    // Look at the first instruction.
+    const MachineInstr *MI = MemOps[SIndex].MI;
+    int Offset = MemOps[SIndex].Offset;
+    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
+    unsigned PReg = PMO.getReg();
+    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+    unsigned Latest = SIndex;
+    unsigned Earliest = SIndex;
+    unsigned Count = 1;
+    bool CanMergeToLSDouble =
+      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    if (STI->isCortexM3() && isi32Load(Opcode) &&
+        PReg == getLoadStoreBaseOp(*MI).getReg())
+      CanMergeToLSDouble = false;
+
+    bool CanMergeToLSMulti = true;
+    // On swift vldm/vstm starting with an odd register number as that needs
+    // more uops than single vldrs.
+    if (STI->hasSlowOddRegister() && !isNotVFP && (PRegNum % 2) == 1)
+      CanMergeToLSMulti = false;
+
+    // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
+    // deprecated; LDM to PC is fine but cannot happen here.
+    if (PReg == ARM::SP || PReg == ARM::PC)
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
+    // Should we be conservative?
+    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
+    // Merge following instructions where possible.
+    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
+      int NewOffset = MemOps[I].Offset;
+      if (NewOffset != Offset + (int)Size)
+        break;
+      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
+      unsigned Reg = MO.getReg();
+      if (Reg == ARM::SP || Reg == ARM::PC)
+        break;
+
+      // See if the current load/store may be part of a multi load/store.
+      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+      bool PartOfLSMulti = CanMergeToLSMulti;
+      if (PartOfLSMulti) {
+        // Register numbers must be in ascending order.
+        if (RegNum <= PRegNum)
+          PartOfLSMulti = false;
+        // For VFP / NEON load/store multiples, the registers must be
+        // consecutive and within the limit on the number of registers per
+        // instruction.
+        else if (!isNotVFP && RegNum != PRegNum+1)
+          PartOfLSMulti = false;
+      }
+      // See if the current load/store may be part of a double load/store.
+      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;
+
+      if (!PartOfLSMulti && !PartOfLSDouble)
+        break;
+      CanMergeToLSMulti &= PartOfLSMulti;
+      CanMergeToLSDouble &= PartOfLSDouble;
+      // Track MemOp with latest and earliest position (Positions are
+      // counted in reverse).
+      unsigned Position = MemOps[I].Position;
+      if (Position < MemOps[Latest].Position)
+        Latest = I;
+      else if (Position > MemOps[Earliest].Position)
+        Earliest = I;
+      // Prepare for next MemOp.
+      Offset += Size;
+      PRegNum = RegNum;
+    }
+
+    // Form a candidate from the Ops collected so far.
+    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
+    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
+      Candidate->Instrs.push_back(MemOps[C].MI);
+    Candidate->LatestMIIdx = Latest - SIndex;
+    Candidate->EarliestMIIdx = Earliest - SIndex;
+    Candidate->InsertPos = MemOps[Latest].Position;
+    if (Count == 1)
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
+    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
+    Candidates.push_back(Candidate);
+    // Continue after the chain.
+    SIndex += Count;
+  } while (SIndex < EIndex);
+}
+
+static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
+                                            ARM_AM::AMSubMode Mode) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA_UPD;
+    case ARM_AM::ib: return ARM::LDMIB_UPD;
+    case ARM_AM::da: return ARM::LDMDA_UPD;
+    case ARM_AM::db: return ARM::LDMDB_UPD;
+    }
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA_UPD;
+    case ARM_AM::ib: return ARM::STMIB_UPD;
+    case ARM_AM::da: return ARM::STMDA_UPD;
+    case ARM_AM::db: return ARM::STMDB_UPD;
+    }
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
+    case ARM_AM::db: return ARM::t2LDMDB_UPD;
+    }
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA_UPD;
+    case ARM_AM::db: return ARM::t2STMDB_UPD;
+    }
+  case ARM::VLDMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
+    case ARM_AM::db: return ARM::VLDMSDB_UPD;
+    }
+  case ARM::VLDMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
+    case ARM_AM::db: return ARM::VLDMDDB_UPD;
+    }
+  case ARM::VSTMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
+    case ARM_AM::db: return ARM::VSTMSDB_UPD;
+    }
+  case ARM::VSTMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
+    case ARM_AM::db: return ARM::VSTMDDB_UPD;
+    }
+  }
+}
+
+/// Check if the given instruction increments or decrements a register and
+/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
+/// generated by the instruction are possibly read as well.
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
+                                  ARMCC::CondCodes Pred, unsigned PredReg) {
+  bool CheckCPSRDef;
+  int Scale;
+  switch (MI.getOpcode()) {
+  case ARM::tADDi8:  Scale =  4; CheckCPSRDef = true; break;
+  case ARM::tSUBi8:  Scale = -4; CheckCPSRDef = true; break;
+  case ARM::t2SUBri:
+  case ARM::SUBri:   Scale = -1; CheckCPSRDef = true; break;
+  case ARM::t2ADDri:
+  case ARM::ADDri:   Scale =  1; CheckCPSRDef = true; break;
+  case ARM::tADDspi: Scale =  4; CheckCPSRDef = false; break;
+  case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
+  default: return 0;
+  }
+
+  unsigned MIPredReg;
+  if (MI.getOperand(0).getReg() != Reg ||
+      MI.getOperand(1).getReg() != Reg ||
+      getInstrPredicate(MI, MIPredReg) != Pred ||
+      MIPredReg != PredReg)
+    return 0;
+
+  if (CheckCPSRDef && definesCPSR(MI))
+    return 0;
+  return MI.getOperand(2).getImm() * Scale;
+}
+
+/// Searches for an increment or decrement of \p Reg before \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (MBBI == BeginMBBI)
+    return EndMBBI;
+
+  // Skip debug values.
+  MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
+  while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
+    --PrevMBBI;
+
+  Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : PrevMBBI;
+}
+
+/// Searches for a increment or decrement of \p Reg after \p MBBI.
+static MachineBasicBlock::iterator
+findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
+                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+  Offset = 0;
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
+  // Skip debug values.
+  while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+    ++NextMBBI;
+  if (NextMBBI == EndMBBI)
+    return EndMBBI;
+
+  Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
+  return Offset == 0 ? EndMBBI : NextMBBI;
+}
+
+/// Fold proceeding/trailing inc/dec of base register into the
+/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
+  // Thumb1 is already using updating loads/stores.
+  if (isThumb1) return false;
+
+  const MachineOperand &BaseOP = MI->getOperand(0);
+  unsigned Base = BaseOP.getReg();
+  bool BaseKill = BaseOP.isKill();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+  unsigned Opcode = MI->getOpcode();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Can't use an updating ld/st if the base register is also a dest
+  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+    if (MI->getOperand(i).getReg() == Base)
+      return false;
+
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
+  if (Mode == ARM_AM::ia && Offset == -Bytes) {
+    Mode = ARM_AM::db;
+  } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
+    Mode = ARM_AM::da;
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
+        ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) {
+
+      // We couldn't find an inc/dec to merge. But if the base is dead, we
+      // can still change to a writeback form as that will save us 2 bytes
+      // of code size. It can create WAW hazards though, so only do it if
+      // we're minimizing code size.
+      if (!MBB.getParent()->getFunction()->optForMinSize() || !BaseKill)
+        return false;
+      
+      bool HighRegsUsed = false;
+      for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+        if (MI->getOperand(i).getReg() >= ARM::R8) {
+          HighRegsUsed = true;
+          break;
+        }
+
+      if (!HighRegsUsed)
+        MergeInstr = MBB.end();
+      else
+        return false;
+    }
+  }
+  if (MergeInstr != MBB.end())
+    MBB.erase(MergeInstr);
+
+  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+    .addReg(Base, getDefRegState(true)) // WB base register
+    .addReg(Base, getKillRegState(BaseKill))
+    .addImm(Pred).addReg(PredReg);
+
+  // Transfer the rest of operands.
+  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  MBB.erase(MBBI);
+  return true;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
+                                             ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_PRE_IMM;
+  case ARM::STRi12:
+    return ARM::STR_PRE_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
+                                              ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_POST_IMM;
+  case ARM::STRi12:
+    return ARM::STR_POST_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+/// Fold proceeding/trailing inc/dec of base register into the
+/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
+  // Thumb1 doesn't have updating LDR/STR.
+  // FIXME: Use LDM/STM with single register instead.
+  if (isThumb1) return false;
+
+  unsigned Base = getLoadStoreBaseOp(*MI).getReg();
+  bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
+  unsigned Opcode = MI->getOpcode();
+  DebugLoc DL = MI->getDebugLoc();
+  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
+  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+  if (isi32Load(Opcode) || isi32Store(Opcode))
+    if (MI->getOperand(2).getImm() != 0)
+      return false;
+  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
+
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+  int Bytes = getLSMultipleTransferSize(MI);
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineBasicBlock::iterator MBBI(MI);
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr
+    = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
+  unsigned NewOpc;
+  if (!isAM5 && Offset == Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+  } else if (Offset == -Bytes) {
+    NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (Offset == Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
+    } else if (!isAM5 && Offset == -Bytes) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
+    } else
+      return false;
+  }
+  MBB.erase(MergeInstr);
+
+  ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
+
+  bool isLd = isLoadSingle(Opcode);
+  if (isAM5) {
+    // VLDM[SD]_UPD, VSTM[SD]_UPD
+    // (There are no base-updating versions of VLDR/VSTR instructions, but the
+    // updating load/store-multiple instructions can be used with only one
+    // register.)
+    MachineOperand &MO = MI->getOperand(0);
+    BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
+      .addReg(Base, getDefRegState(true)) // WB base register
+      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+      .addImm(Pred).addReg(PredReg)
+      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
+                            getKillRegState(MO.isKill())));
+  } else if (isLd) {
+    if (isAM2) {
+      // LDR_PRE, LDR_POST
+      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+      } else {
+        int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+        BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+      }
+    } else {
+      // t2LDR_PRE, t2LDR_POST
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    // FIXME: post-indexed stores use am2offset_imm, which still encodes
+    // the vestigal zero-reg offset register. When that's fixed, this clause
+    // can be removed entirely.
+    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+      int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+      // STR_PRE, STR_POST
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+    } else {
+      // t2STR_PRE, t2STR_POST
+      BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  }
+  MBB.erase(MBBI);
+
+  return true;
+}
+
+bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
+         "Must have t2STRDi8 or t2LDRDi8");
+  if (MI.getOperand(3).getImm() != 0)
+    return false;
+
+  // Behaviour for writeback is undefined if base register is the same as one
+  // of the others.
+  const MachineOperand &BaseOp = MI.getOperand(2);
+  unsigned Base = BaseOp.getReg();
+  const MachineOperand &Reg0Op = MI.getOperand(0);
+  const MachineOperand &Reg1Op = MI.getOperand(1);
+  if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
+    return false;
+
+  unsigned PredReg;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  MachineBasicBlock::iterator MBBI(MI);
+  MachineBasicBlock &MBB = *MI.getParent();
+  int Offset;
+  MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
+                                                            PredReg, Offset);
+  unsigned NewOpc;
+  if (Offset == 8 || Offset == -8) {
+    NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
+  } else {
+    MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
+    if (Offset == 8 || Offset == -8) {
+      NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
+    } else
+      return false;
+  }
+  MBB.erase(MergeInstr);
+
+  DebugLoc DL = MI.getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+  if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
+    MIB.addOperand(Reg0Op).addOperand(Reg1Op)
+       .addReg(BaseOp.getReg(), RegState::Define);
+  } else {
+    assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
+    MIB.addReg(BaseOp.getReg(), RegState::Define)
+       .addOperand(Reg0Op).addOperand(Reg1Op);
+  }
+  MIB.addReg(BaseOp.getReg(), RegState::Kill)
+     .addImm(Offset).addImm(Pred).addReg(PredReg);
+  assert(TII->get(Opcode).getNumOperands() == 6 &&
+         TII->get(NewOpc).getNumOperands() == 7 &&
+         "Unexpected number of operands in Opcode specification.");
+
+  // Transfer implicit operands.
+  for (const MachineOperand &MO : MI.implicit_operands())
+    MIB.addOperand(MO);
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MBB.erase(MBBI);
+  return true;
+}
+
+/// Returns true if instruction is a memory operation that this pass is capable
+/// of operating on.
+static bool isMemoryOp(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+  case ARM::tLDRspi:
+  case ARM::tSTRspi:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    break;
+  default:
+    return false;
+  }
+  if (!MI.getOperand(1).isReg())
+    return false;
+
+  // When no memory operands are present, conservatively assume unaligned,
+  // volatile, unfoldable.
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand &MMO = **MI.memoperands_begin();
+
+  // Don't touch volatile memory accesses - we may be changing their order.
+  if (MMO.isVolatile())
+    return false;
+
+  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
+  // not.
+  if (MMO.getAlignment() < 4)
+    return false;
+
+  // str <undef> could probably be eliminated entirely, but for now we just want
+  // to avoid making a mess of it.
+  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
+  if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef())
+    return false;
+
+  // Likewise don't mess with references to undefined addresses.
+  if (MI.getOperand(1).isUndef())
+    return false;
+
+  return true;
+}
+
+static void InsertLDR_STR(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI, int Offset,
+                          bool isDef, const DebugLoc &DL, unsigned NewOpc,
+                          unsigned Reg, bool RegDeadKill, bool RegUndef,
+                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
+                          bool OffKill, bool OffUndef, ARMCC::CondCodes Pred,
+                          unsigned PredReg, const TargetInstrInfo *TII,
+                          bool isT2) {
+  if (isDef) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  }
+}
+
+bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = &*MBBI;
+  unsigned Opcode = MI->getOpcode();
+  if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
+    return false;
+
+  const MachineOperand &BaseOp = MI->getOperand(2);
+  unsigned BaseReg = BaseOp.getReg();
+  unsigned EvenReg = MI->getOperand(0).getReg();
+  unsigned OddReg  = MI->getOperand(1).getReg();
+  unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
+  unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
+
+  // ARM errata 602117: LDRD with base in list may result in incorrect base
+  // register when interrupted or faulted.
+  bool Errata602117 = EvenReg == BaseReg &&
+    (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
+  // ARM LDRD/STRD needs consecutive registers.
+  bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
+    (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);
+
+  if (!Errata602117 && !NonConsecutiveRegs)
+    return false;
+
+  bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+  bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
+  bool EvenDeadKill = isLd ?
+    MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+  bool EvenUndef = MI->getOperand(0).isUndef();
+  bool OddDeadKill  = isLd ?
+    MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+  bool OddUndef = MI->getOperand(1).isUndef();
+  bool BaseKill = BaseOp.isKill();
+  bool BaseUndef = BaseOp.isUndef();
+  bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+  bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
+  int OffImm = getMemoryOpOffset(*MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+
+  if (OddRegNum > EvenRegNum && OffImm == 0) {
+    // Ascending register numbers and no offset. It's safe to change it to a
+    // ldm or stm.
+    unsigned NewOpc = (isLd)
+      ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
+      : (isT2 ? ARM::t2STMIA : ARM::STMIA);
+    if (isLd) {
+      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+        .addReg(BaseReg, getKillRegState(BaseKill))
+        .addImm(Pred).addReg(PredReg)
+        .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
+        .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+      ++NumLDRD2LDM;
+    } else {
+      BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+        .addReg(BaseReg, getKillRegState(BaseKill))
+        .addImm(Pred).addReg(PredReg)
+        .addReg(EvenReg,
+                getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+        .addReg(OddReg,
+                getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
+      ++NumSTRD2STM;
+    }
+  } else {
+    // Split into two instructions.
+    unsigned NewOpc = (isLd)
+      ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+      : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+    // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
+    // so adjust and use t2LDRi12 here for that.
+    unsigned NewOpc2 = (isLd)
+      ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+      : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+    DebugLoc dl = MBBI->getDebugLoc();
+    // If this is a load and base register is killed, it may have been
+    // re-defed by the load, make sure the first load does not clobber it.
+    if (isLd &&
+        (BaseKill || OffKill) &&
+        (TRI->regsOverlap(EvenReg, BaseReg))) {
+      assert(!TRI->regsOverlap(OddReg, BaseReg));
+      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                    OddReg, OddDeadKill, false,
+                    BaseReg, false, BaseUndef, false, OffUndef,
+                    Pred, PredReg, TII, isT2);
+      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                    EvenReg, EvenDeadKill, false,
+                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                    Pred, PredReg, TII, isT2);
+    } else {
+      if (OddReg == EvenReg && EvenDeadKill) {
+        // If the two source operands are the same, the kill marker is
+        // probably on the first one. e.g.
+        // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
+        EvenDeadKill = false;
+        OddDeadKill = true;
+      }
+      // Never kill the base register in the first instruction.
+      if (EvenReg == BaseReg)
+        EvenDeadKill = false;
+      InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                    EvenReg, EvenDeadKill, EvenUndef,
+                    BaseReg, false, BaseUndef, false, OffUndef,
+                    Pred, PredReg, TII, isT2);
+      InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                    OddReg, OddDeadKill, OddUndef,
+                    BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                    Pred, PredReg, TII, isT2);
+    }
+    if (isLd)
+      ++NumLDRD2LDR;
+    else
+      ++NumSTRD2STR;
+  }
+
+  MBBI = MBB.erase(MBBI);
+  return true;
+}
+
+/// An optimization pass to turn multiple LDR / STR ops of the same base and
+/// incrementing offset into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  MemOpQueue MemOps;
+  unsigned CurrBase = 0;
+  unsigned CurrOpc = ~0u;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned Position = 0;
+  assert(Candidates.size() == 0);
+  assert(MergeBaseCandidates.size() == 0);
+  LiveRegsValid = false;
+
+  for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
+       I = MBBI) {
+    // The instruction in front of the iterator is the one we look at.
+    MBBI = std::prev(I);
+    if (FixInvalidRegPairOp(MBB, MBBI))
+      continue;
+    ++Position;
+
+    if (isMemoryOp(*MBBI)) {
+      unsigned Opcode = MBBI->getOpcode();
+      const MachineOperand &MO = MBBI->getOperand(0);
+      unsigned Reg = MO.getReg();
+      unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
+      int Offset = getMemoryOpOffset(*MBBI);
+      if (CurrBase == 0) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc  = Opcode;
+        CurrPred = Pred;
+        MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
+        continue;
+      }
+      // Note: No need to match PredReg in the next if.
+      if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+        // Watch out for:
+        //   r4 := ldr [r0, #8]
+        //   r4 := ldr [r0, #4]
+        // or
+        //   r0 := ldr [r0]
+        // If a load overrides the base register or a register loaded by
+        // another load in our chain, we cannot take this instruction.
+        bool Overlap = false;
+        if (isLoadSingle(Opcode)) {
+          Overlap = (Base == Reg);
+          if (!Overlap) {
+            for (const MemOpQueueEntry &E : MemOps) {
+              if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
+                Overlap = true;
+                break;
+              }
+            }
+          }
+        }
+
+        if (!Overlap) {
+          // Check offset and sort memory operation into the current chain.
+          if (Offset > MemOps.back().Offset) {
+            MemOps.push_back(MemOpQueueEntry(*MBBI, Offset, Position));
+            continue;
+          } else {
+            MemOpQueue::iterator MI, ME;
+            for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
+              if (Offset < MI->Offset) {
+                // Found a place to insert.
+                break;
+              }
+              if (Offset == MI->Offset) {
+                // Collision, abort.
+                MI = ME;
+                break;
+              }
+            }
+            if (MI != MemOps.end()) {
+              MemOps.insert(MI, MemOpQueueEntry(*MBBI, Offset, Position));
+              continue;
+            }
+          }
+        }
+      }
+
+      // Don't advance the iterator; The op will start a new chain next.
+      MBBI = I;
+      --Position;
+      // Fallthrough to look into existing chain.
+    } else if (MBBI->isDebugValue()) {
+      continue;
+    } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
+               MBBI->getOpcode() == ARM::t2STRDi8) {
+      // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
+      // remember them because we may still be able to merge add/sub into them.
+      MergeBaseCandidates.push_back(&*MBBI);
+    }
+
+
+    // If we are here then the chain is broken; Extract candidates for a merge.
+    if (MemOps.size() > 0) {
+      FormCandidates(MemOps);
+      // Reset for the next chain.
+      CurrBase = 0;
+      CurrOpc = ~0u;
+      CurrPred = ARMCC::AL;
+      MemOps.clear();
+    }
+  }
+  if (MemOps.size() > 0)
+    FormCandidates(MemOps);
+
+  // Sort candidates so they get processed from end to begin of the basic
+  // block later; This is necessary for liveness calculation.
+  auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
+    return M0->InsertPos < M1->InsertPos;
+  };
+  std::sort(Candidates.begin(), Candidates.end(), LessThan);
+
+  // Go through list of candidates and merge.
+  bool Changed = false;
+  for (const MergeCandidate *Candidate : Candidates) {
+    if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
+      MachineInstr *Merged = MergeOpsUpdate(*Candidate);
+      // Merge preceding/trailing base inc/dec into the merged op.
+      if (Merged) {
+        Changed = true;
+        unsigned Opcode = Merged->getOpcode();
+        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+          MergeBaseUpdateLSDouble(*Merged);
+        else
+          MergeBaseUpdateLSMultiple(Merged);
+      } else {
+        for (MachineInstr *MI : Candidate->Instrs) {
+          if (MergeBaseUpdateLoadStore(MI))
+            Changed = true;
+        }
+      }
+    } else {
+      assert(Candidate->Instrs.size() == 1);
+      if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
+        Changed = true;
+    }
+  }
+  Candidates.clear();
+  // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
+  for (MachineInstr *MI : MergeBaseCandidates)
+    MergeBaseUpdateLSDouble(*MI);
+  MergeBaseCandidates.clear();
+
+  return Changed;
+}
+
+/// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
+/// into the preceding stack restore so it directly restore the value of LR
+/// into pc.
+///   ldmfd sp!, {..., lr}
+///   bx lr
+/// or
+///   ldmfd sp!, {..., lr}
+///   mov pc, lr
+/// =>
+///   ldmfd sp!, {..., pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  // Thumb1 LDM doesn't allow high registers.
+  if (isThumb1) return false;
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  if (MBBI != MBB.begin() && MBBI != MBB.end() &&
+      (MBBI->getOpcode() == ARM::BX_RET ||
+       MBBI->getOpcode() == ARM::tBX_RET ||
+       MBBI->getOpcode() == ARM::MOVPCLR)) {
+    MachineBasicBlock::iterator PrevI = std::prev(MBBI);
+    // Ignore any DBG_VALUE instructions.
+    while (PrevI->isDebugValue() && PrevI != MBB.begin())
+      --PrevI;
+    MachineInstr &PrevMI = *PrevI;
+    unsigned Opcode = PrevMI.getOpcode();
+    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      MachineOperand &MO = PrevMI.getOperand(PrevMI.getNumOperands() - 1);
+      if (MO.getReg() != ARM::LR)
+        return false;
+      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
+      PrevMI.setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI);
+      MBB.erase(MBBI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  if (MBBI == MBB.begin() || MBBI == MBB.end() ||
+      MBBI->getOpcode() != ARM::tBX_RET)
+    return false;
+
+  MachineBasicBlock::iterator Prev = MBBI;
+  --Prev;
+  if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR))
+    return false;
+
+  for (auto Use : Prev->uses())
+    if (Use.isKill()) {
+      AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
+                         .addReg(Use.getReg(), RegState::Kill))
+          .copyImplicitOps(*MBBI);
+      MBB.erase(MBBI);
+      MBB.erase(Prev);
+      return true;
+    }
+
+  llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?");
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  MF = &Fn;
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TL = STI->getTargetLowering();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+
+  RegClassInfoValid = false;
+  isThumb2 = AFI->isThumb2Function();
+  isThumb1 = AFI->isThumbFunction() && !isThumb2;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    if (STI->hasV5TOps())
+      Modified |= MergeReturnIntoLDM(MBB);
+    if (isThumb1)
+      Modified |= CombineMovBx(MBB);
+  }
+
+  Allocator.DestroyAll();
+  return Modified;
+}
+
+#define ARM_PREALLOC_LOAD_STORE_OPT_NAME                                       \
+  "ARM pre- register allocation load / store optimization pass"
+
+namespace {
+  /// Pre- register allocation pass that move load / stores from consecutive
+  /// locations close to make it more likely they will be combined later.
+  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
+    static char ID;
+    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const DataLayout *TD;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    MachineRegisterInfo *MRI;
+    MachineFunction *MF;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override {
+      return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
+    }
+
+  private:
+    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
+                          unsigned &NewOpc, unsigned &EvenReg,
+                          unsigned &OddReg, unsigned &BaseReg,
+                          int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
+    bool RescheduleOps(MachineBasicBlock *MBB,
+                       SmallVectorImpl<MachineInstr *> &Ops,
+                       unsigned Base, bool isLd,
+                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
+    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  };
+  char ARMPreAllocLoadStoreOpt::ID = 0;
+}
+
+INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+
+bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (AssumeMisalignedLoadStores || skipFunction(*Fn.getFunction()))
+    return false;
+
+  TD = &Fn.getDataLayout();
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  MF  = &Fn;
+
+  bool Modified = false;
+  for (MachineBasicBlock &MFI : Fn)
+    Modified |= RescheduleLoadStoreInstrs(&MFI);
+
+  return Modified;
+}
+
+static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
+                                      MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator E,
+                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
+                                      SmallSet<unsigned, 4> &MemRegs,
+                                      const TargetRegisterInfo *TRI) {
+  // Are there stores / loads / calls between them?
+  // FIXME: This is overly conservative. We should make use of alias information
+  // some day.
+  SmallSet<unsigned, 4> AddedRegPressure;
+  while (++I != E) {
+    if (I->isDebugValue() || MemOps.count(&*I))
+      continue;
+    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
+      return false;
+    if (isLd && I->mayStore())
+      return false;
+    if (!isLd) {
+      if (I->mayLoad())
+        return false;
+      // It's not safe to move the first 'str' down.
+      // str r1, [r0]
+      // strh r5, [r0]
+      // str r4, [r0, #+4]
+      if (I->mayStore())
+        return false;
+    }
+    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
+      MachineOperand &MO = I->getOperand(j);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+        return false;
+      if (Reg != Base && !MemRegs.count(Reg))
+        AddedRegPressure.insert(Reg);
+    }
+  }
+
+  // Estimate register pressure increase due to the transformation.
+  if (MemRegs.size() <= 4)
+    // Ok if we are moving small number of instructions.
+    return true;
+  return AddedRegPressure.size() <= MemRegs.size() * 2;
+}
+
+bool
+ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
+                                          DebugLoc &dl, unsigned &NewOpc,
+                                          unsigned &FirstReg,
+                                          unsigned &SecondReg,
+                                          unsigned &BaseReg, int &Offset,
+                                          unsigned &PredReg,
+                                          ARMCC::CondCodes &Pred,
+                                          bool &isT2) {
+  // Make sure we're allowed to generate LDRD/STRD.
+  if (!STI->hasV5TEOps())
+    return false;
+
+  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
+  unsigned Scale = 1;
+  unsigned Opcode = Op0->getOpcode();
+  if (Opcode == ARM::LDRi12) {
+    NewOpc = ARM::LDRD;
+  } else if (Opcode == ARM::STRi12) {
+    NewOpc = ARM::STRD;
+  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+    NewOpc = ARM::t2LDRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+    NewOpc = ARM::t2STRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else {
+    return false;
+  }
+
+  // Make sure the base address satisfies i64 ld / st alignment requirement.
+  // At the moment, we ignore the memoryoperand's value.
+  // If we want to use AliasAnalysis, we should check it accordingly.
+  if (!Op0->hasOneMemOperand() ||
+      (*Op0->memoperands_begin())->isVolatile())
+    return false;
+
+  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  const Function *Func = MF->getFunction();
+  unsigned ReqAlign = STI->hasV6Ops()
+    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 need 8-byte align
+  if (Align < ReqAlign)
+    return false;
+
+  // Then make sure the immediate offset fits.
+  int OffImm = getMemoryOpOffset(*Op0);
+  if (isT2) {
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
+      return false;
+    Offset = OffImm;
+  } else {
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (OffImm < 0) {
+      AddSub = ARM_AM::sub;
+      OffImm = - OffImm;
+    }
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm & (Scale-1)))
+      return false;
+    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
+  }
+  FirstReg = Op0->getOperand(0).getReg();
+  SecondReg = Op1->getOperand(0).getReg();
+  if (FirstReg == SecondReg)
+    return false;
+  BaseReg = Op0->getOperand(1).getReg();
+  Pred = getInstrPredicate(*Op0, PredReg);
+  dl = Op0->getDebugLoc();
+  return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
+                                 SmallVectorImpl<MachineInstr *> &Ops,
+                                 unsigned Base, bool isLd,
+                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
+  bool RetVal = false;
+
+  // Sort by offset (in reverse order).
+  std::sort(Ops.begin(), Ops.end(),
+            [](const MachineInstr *LHS, const MachineInstr *RHS) {
+              int LOffset = getMemoryOpOffset(*LHS);
+              int ROffset = getMemoryOpOffset(*RHS);
+              assert(LHS == RHS || LOffset != ROffset);
+              return LOffset > ROffset;
+            });
+
+  // The loads / stores of the same base are in order. Scan them from first to
+  // last and check for the following:
+  // 1. Any def of base.
+  // 2. Any gaps.
+  while (Ops.size() > 1) {
+    unsigned FirstLoc = ~0U;
+    unsigned LastLoc = 0;
+    MachineInstr *FirstOp = nullptr;
+    MachineInstr *LastOp = nullptr;
+    int LastOffset = 0;
+    unsigned LastOpcode = 0;
+    unsigned LastBytes = 0;
+    unsigned NumMove = 0;
+    for (int i = Ops.size() - 1; i >= 0; --i) {
+      MachineInstr *Op = Ops[i];
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
+
+      unsigned LSMOpcode
+        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
+      if (LastOpcode && LSMOpcode != LastOpcode)
+        break;
+
+      int Offset = getMemoryOpOffset(*Op);
+      unsigned Bytes = getLSMultipleTransferSize(Op);
+      if (LastBytes) {
+        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
+          break;
+      }
+      LastOffset = Offset;
+      LastBytes = Bytes;
+      LastOpcode = LSMOpcode;
+      if (++NumMove == 8) // FIXME: Tune this limit.
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (int i = NumMove-1; i >= 0; --i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative, if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end() &&
+               (MemOps.count(&*InsertPos) || InsertPos->isDebugValue()))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned FirstReg = 0, SecondReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             FirstReg, SecondReg, BaseReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          const MCInstrDesc &MCID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+          MRI->constrainRegClass(FirstReg, TRC);
+          MRI->constrainRegClass(SecondReg, TRC);
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg, RegState::Define)
+              .addReg(SecondReg, RegState::Define)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming LDRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(FirstReg)
+              .addReg(SecondReg)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming STRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          if (!isT2) {
+            // Add register allocation hints to form register pairs.
+            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
+            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
+          }
+        } else {
+          for (unsigned i = 0; i != NumMove; ++i) {
+            MachineInstr *Op = Ops.back();
+            Ops.pop_back();
+            MBB->splice(InsertPos, MBB, Op);
+          }
+        }
+
+        NumLdStMoved += NumMove;
+        RetVal = true;
+      }
+    }
+  }
+
+  return RetVal;
+}
+
+bool
+ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+  bool RetVal = false;
+
+  DenseMap<MachineInstr*, unsigned> MI2LocMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
+  SmallVector<unsigned, 4> LdBases;
+  SmallVector<unsigned, 4> StBases;
+
+  unsigned Loc = 0;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  while (MBBI != E) {
+    for (; MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+      if (MI.isCall() || MI.isTerminator()) {
+        // Stop at barriers.
+        ++MBBI;
+        break;
+      }
+
+      if (!MI.isDebugValue())
+        MI2LocMap[&MI] = ++Loc;
+
+      if (!isMemoryOp(MI))
+        continue;
+      unsigned PredReg = 0;
+      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+        continue;
+
+      int Opc = MI.getOpcode();
+      bool isLd = isLoadSingle(Opc);
+      unsigned Base = MI.getOperand(1).getReg();
+      int Offset = getMemoryOpOffset(MI);
+
+      bool StopHere = false;
+      if (isLd) {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2LdsMap.find(Base);
+        if (BI != Base2LdsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(*BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(&MI);
+        } else {
+          Base2LdsMap[Base].push_back(&MI);
+          LdBases.push_back(Base);
+        }
+      } else {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2StsMap.find(Base);
+        if (BI != Base2StsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(*BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(&MI);
+        } else {
+          Base2StsMap[Base].push_back(&MI);
+          StBases.push_back(Base);
+        }
+      }
+
+      if (StopHere) {
+        // Found a duplicate (a base+offset combination that's seen earlier).
+        // Backtrack.
+        --Loc;
+        break;
+      }
+    }
+
+    // Re-schedule loads.
+    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
+      unsigned Base = LdBases[i];
+      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
+      if (Lds.size() > 1)
+        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+    }
+
+    // Re-schedule stores.
+    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
+      unsigned Base = StBases[i];
+      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
+      if (Sts.size() > 1)
+        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+    }
+
+    if (MBBI != E) {
+      Base2LdsMap.clear();
+      Base2StsMap.clear();
+      LdBases.clear();
+      StBases.clear();
+    }
+  }
+
+  return RetVal;
+}
+
+
+/// Returns an instance of the load / store optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
+  if (PreAlloc)
+    return new ARMPreAllocLoadStoreOpt();
+  return new ARMLoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
new file mode 100644
index 000000000000..293a527b09e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -0,0 +1,263 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAsmPrinter.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+
+MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+  switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
+  default:
+    llvm_unreachable("Unknown target flag on symbol operand");
+  case ARMII::MO_NO_FLAG:
+    break;
+  case ARMII::MO_LO16:
+    Expr =
+        MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+    Expr = ARMMCExpr::createLower16(Expr, OutContext);
+    break;
+  case ARMII::MO_HI16:
+    Expr =
+        MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+    Expr = ARMMCExpr::createUpper16(Expr, OutContext);
+    break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+  return MCOperand::createExpr(Expr);
+
+}
+
+bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                 MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all non-CPSR implicit register operands.
+    if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::createReg(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    MCOp = GetSymbolRef(MO,
+                        GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(MO,
+                        GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    if (Subtarget->genExecuteOnly())
+      llvm_unreachable("execute-only should not generate constant pools");
+    MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    APFloat Val = MO.getFPImm()->getValueAPF();
+    bool ignored;
+    Val.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored);
+    MCOp = MCOperand::createFPImm(Val.convertToDouble());
+    break;
+  }
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers.
+    return false;
+  }
+  return true;
+}
+
+void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        ARMAsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  // In the MC layer, we keep modified immediates in their encoded form
+  bool EncodeImms = false;
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::MOVi:
+  case ARM::MVNi:
+  case ARM::CMPri:
+  case ARM::CMNri:
+  case ARM::TSTri:
+  case ARM::TEQri:
+  case ARM::MSRi:
+  case ARM::ADCri:
+  case ARM::ADDri:
+  case ARM::ADDSri:
+  case ARM::SBCri:
+  case ARM::SUBri:
+  case ARM::SUBSri:
+  case ARM::ANDri:
+  case ARM::ORRri:
+  case ARM::EORri:
+  case ARM::BICri:
+  case ARM::RSBri:
+  case ARM::RSBSri:
+  case ARM::RSCri:
+    EncodeImms = true;
+    break;
+  }
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (AP.lowerOperand(MO, MCOp)) {
+      if (MCOp.isImm() && EncodeImms) {
+        int32_t Enc = ARM_AM::getSOImmVal(MCOp.getImm());
+        if (Enc != -1)
+          MCOp.setImm(Enc);
+      }
+      OutMI.addOperand(MCOp);
+    }
+  }
+}
+
+void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
+{
+  if (MI.getParent()->getParent()->getInfo<ARMFunctionInfo>()
+    ->isThumbFunction())
+  {
+    MI.emitError("An attempt to perform XRay instrumentation for a"
+      " Thumb function (not supported). Detected when emitting a sled.");
+    return;
+  }
+  static const int8_t NoopsInSledCount = 6;
+  // We want to emit the following pattern:
+  //
+  // .Lxray_sled_N:
+  //   ALIGN
+  //   B #20
+  //   ; 6 NOP instructions (24 bytes)
+  // .tmpN
+  //
+  // We need the 24 bytes (6 instructions) because at runtime, we'd be patching
+  // over the full 28 bytes (7 instructions) with the following pattern:
+  //
+  //   PUSH{ r0, lr }
+  //   MOVW r0, #<lower 16 bits of function ID>
+  //   MOVT r0, #<higher 16 bits of function ID>
+  //   MOVW ip, #<lower 16 bits of address of __xray_FunctionEntry/Exit>
+  //   MOVT ip, #<higher 16 bits of address of __xray_FunctionEntry/Exit>
+  //   BLX ip
+  //   POP{ r0, lr }
+  //
+  OutStreamer->EmitCodeAlignment(4);
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitLabel(CurSled);
+  auto Target = OutContext.createTempSymbol();
+
+  // Emit "B #20" instruction, which jumps over the next 24 bytes (because
+  // register pc is 8 bytes ahead of the jump instruction by the moment CPU
+  // is executing it).
+  // By analogy to ARMAsmPrinter::emitPseudoExpansionLowering() |case ARM::B|.
+  // It is not clear why |addReg(0)| is needed (the last operand).
+  EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc).addImm(20)
+    .addImm(ARMCC::AL).addReg(0));
+
+  MCInst Noop;
+  Subtarget->getInstrInfo()->getNoopForElfTarget(Noop);
+  for (int8_t I = 0; I < NoopsInSledCount; I++)
+  {
+    OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+  }
+
+  OutStreamer->EmitLabel(Target);
+  recordSled(CurSled, MI, Kind);
+}
+
+void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void ARMAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
+{
+  EmitSled(MI, SledKind::TAIL_CALL);
+}
+
+void ARMAsmPrinter::EmitXRayTable()
+{
+  if (Sleds.empty())
+    return;
+
+  MCSection *Section = nullptr;
+  if (Subtarget->isTargetELF()) {
+    Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC | ELF::SHF_GROUP |
+                                           ELF::SHF_MERGE,
+                                       0, CurrentFnSym->getName());
+  } else if (Subtarget->isTargetMachO()) {
+    Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+                                         SectionKind::getReadOnlyWithRel());
+  } else {
+    llvm_unreachable("Unsupported target");
+  }
+
+  auto PrevSection = OutStreamer->getCurrentSectionOnly();
+  OutStreamer->SwitchSection(Section);
+  for (const auto &Sled : Sleds) {
+    OutStreamer->EmitSymbolValue(Sled.Sled, 4);
+    OutStreamer->EmitSymbolValue(CurrentFnSym, 4);
+    auto Kind = static_cast<uint8_t>(Sled.Kind);
+    OutStreamer->EmitBytes(
+      StringRef(reinterpret_cast<const char *>(&Kind), 1));
+    OutStreamer->EmitBytes(
+      StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+    OutStreamer->EmitZeros(6);
+  }
+  OutStreamer->SwitchSection(PrevSection);
+
+  Sleds.clear();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..50d8f0941460
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -0,0 +1,24 @@
+//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void ARMFunctionInfo::anchor() {}
+
+ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
+    : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
+      hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
+      StByValParamsPadding(0), ArgRegsSaveSize(0), ReturnRegsCount(0),
+      HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false),
+      FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+      GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0),
+      VarArgsFrameIndex(0), HasITBlocks(false), ArgumentStackSize(0),
+      IsSplitCSR(false), PromotedGlobalsIncrease(0) {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 000000000000..8c485e89bf54
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,255 @@
+//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialized Align, so must precede it.
+  bool isThumb;
+
+  /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+  /// to determine if function is compiled under Thumb mode, for that use
+  /// 'isThumb'.
+  bool hasThumb2;
+
+  /// StByValParamsPadding - For parameter that is split between
+  /// GPRs and memory; while recovering GPRs part, when
+  /// StackAlignment > 4, and GPRs-part-size mod StackAlignment != 0,
+  /// we need to insert gap before parameter start address. It allows to
+  /// "attach" GPR-part to the part that was passed via stack.
+  unsigned StByValParamsPadding;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned ArgRegsSaveSize;
+
+  /// ReturnRegsCount - Number of registers used up in the return.
+  unsigned ReturnRegsCount;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// determineCalleeSaves().
+  bool HasStackFrame;
+
+  /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
+  /// emitPrologue.
+  bool RestoreSPFromFP;
+
+  /// LRSpilledForFarJump - True if the LR register has been for spilled to
+  /// enable far jump.
+  bool LRSpilledForFarJump;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spills areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  ///
+  /// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
+  /// Some may be spilled after the stack has been realigned.
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSAlignGapSize;
+  unsigned DPRCSSize;
+
+  /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
+  /// the aligned portion of the stack frame.  This is always a contiguous
+  /// sequence of D-registers starting from d8.
+  ///
+  /// We do not keep track of the frame indices used for these registers - they
+  /// behave like any other frame index in the aligned stack frame.  These
+  /// registers also aren't included in DPRCSSize above.
+  unsigned NumAlignedDPRCS2Regs;
+
+  unsigned PICLabelUId;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// HasITBlocks - True if IT blocks have been inserted.
+  bool HasITBlocks;
+
+  /// CPEClones - Track constant pool entries clones created by Constant Island
+  /// pass.
+  DenseMap<unsigned, unsigned> CPEClones;
+
+  /// ArgumentStackSize - amount of bytes on stack consumed by the arguments
+  /// being passed on the stack
+  unsigned ArgumentStackSize;
+
+  /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+  /// coalesced weights.
+  DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies.
+  bool IsSplitCSR;
+
+  /// Globals that have had their storage promoted into the constant pool.
+  SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
+
+  /// The amount the literal pool has been increasedby due to promoted globals.
+  int PromotedGlobalsIncrease;
+  
+public:
+  ARMFunctionInfo() :
+    isThumb(false),
+    hasThumb2(false),
+    ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
+    RestoreSPFromFP(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
+    NumAlignedDPRCS2Regs(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false),
+    PromotedGlobalsIncrease(0) {}
+
+  explicit ARMFunctionInfo(MachineFunction &MF);
+
+  bool isThumbFunction() const { return isThumb; }
+  bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
+  bool isThumb2Function() const { return isThumb && hasThumb2; }
+
+  unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
+  void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
+
+  unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; }
+  void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
+
+  unsigned getReturnRegsCount() const { return ReturnRegsCount; }
+  void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; }
+  void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+  unsigned getNumAlignedDPRCS2Regs() const { return NumAlignedDPRCS2Regs; }
+  void setNumAlignedDPRCS2Regs(unsigned n) { NumAlignedDPRCS2Regs = n; }
+
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedGapSize() const   { return DPRCSAlignGapSize; }
+  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedGapSize(unsigned s)   { DPRCSAlignGapSize = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+  unsigned getNumPICLabels() const {
+    return PICLabelUId;
+  }
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasITBlocks() const { return HasITBlocks; }
+  void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+    if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+      llvm_unreachable("Duplicate entries!");
+  }
+
+  unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+    DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+    if (I != CPEClones.end())
+      return I->second;
+    else
+      return -1U;
+  }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+                                                  MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);
+    if (It == CoalescedWeights.end()) {
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;
+  }
+
+  /// Indicate to the backend that \c GV has had its storage changed to inside
+  /// a constant pool. This means it no longer needs to be emitted as a
+  /// global variable.
+  void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV) {
+    PromotedGlobals.insert(GV);
+  }
+  SmallPtrSet<const GlobalVariable*, 2>& getGlobalsPromotedToConstantPool() {
+    return PromotedGlobals;
+  }
+  int getPromotedConstpoolIncrease() const {
+    return PromotedGlobalsIncrease;
+  }
+  void setPromotedConstpoolIncrease(int Sz) {
+    PromotedGlobalsIncrease = Sz;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
new file mode 100644
index 000000000000..581d5fe159fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -0,0 +1,105 @@
+//===-- ARMOptimizeBarriersPass - two DMBs without a memory access in between,
+//removed one -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===------------------------------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "double barriers"
+
+STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
+
+namespace {
+class ARMOptimizeBarriersPass : public MachineFunctionPass {
+public:
+  static char ID;
+  ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override { return "optimise barriers pass"; }
+};
+char ARMOptimizeBarriersPass::ID = 0;
+}
+
+// Returns whether the instruction can safely move past a DMB instruction
+// The current implementation allows this iif MI does not have any possible
+// memory access
+static bool CanMovePastDMB(const MachineInstr *MI) {
+  return !(MI->mayLoad() ||
+          MI->mayStore() ||
+          MI->hasUnmodeledSideEffects() ||
+          MI->isCall() ||
+          MI->isReturn());
+}
+
+bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  // Vector to store the DMBs we will remove after the first iteration
+  std::vector<MachineInstr *> ToRemove;
+  // DMBType is the Imm value of the first operand. It determines whether it's a
+  // DMB ish, dmb sy, dmb osh, etc
+  int64_t DMBType = -1;
+
+  // Find a dmb. If we can move it until the next dmb, tag the second one for
+  // removal
+  for (auto &MBB : MF) {
+    // Will be true when we have seen a DMB, and not seen any instruction since
+    // that cannot move past a DMB
+    bool IsRemovableNextDMB = false;
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == ARM::DMB) {
+        if (IsRemovableNextDMB) {
+          // If the Imm of this DMB is the same as that of the last DMB, we can
+          // tag this second DMB for removal
+          if (MI.getOperand(0).getImm() == DMBType) {
+            ToRemove.push_back(&MI);
+          } else {
+            // If it has a different DMBType, we cannot remove it, but will scan
+            // for the next DMB, recording this DMB's type as last seen DMB type
+            DMBType = MI.getOperand(0).getImm();
+          }
+        } else {
+          // After we see a DMB, a next one is removable
+          IsRemovableNextDMB = true;
+          DMBType = MI.getOperand(0).getImm();
+        }
+      } else if (!CanMovePastDMB(&MI)) {
+        // If we find an instruction unable to pass past a DMB, a next DMB is
+        // not removable
+        IsRemovableNextDMB = false;
+      }
+    }
+  }
+  // Remove the tagged DMB
+  for (auto MI : ToRemove) {
+    MI->eraseFromParent();
+    ++NumDMBsRemoved;
+  }
+
+  return NumDMBsRemoved > 0;
+}
+
+/// createARMOptimizeBarriersPass - Returns an instance of the remove double
+/// barriers
+/// pass.
+FunctionPass *llvm::createARMOptimizeBarriersPass() {
+  return new ARMOptimizeBarriersPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 000000000000..3ff0bee7e5bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
+  0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
new file mode 100644
index 000000000000..9bd036a1eace
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -0,0 +1,127 @@
+//===- ARMRegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterBankInfo.h"
+#include "ARMInstrInfo.h" // For the register classes
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+// FIXME: TableGen this.
+// If it grows too much and TableGen still isn't ready to do the job, extract it
+// into an ARMGenRegisterBankInfo.def (similar to AArch64).
+namespace llvm {
+namespace ARM {
+RegisterBank GPRRegBank;
+RegisterBank *RegBanks[] = {&GPRRegBank};
+
+RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
+
+RegisterBankInfo::ValueMapping ValueMappings[] = {
+    {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}};
+} // end namespace arm
+} // end namespace llvm
+
+ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
+    : RegisterBankInfo(ARM::RegBanks, ARM::NumRegisterBanks) {
+  static bool AlreadyInit = false;
+  // We have only one set of register banks, whatever the subtarget
+  // is. Therefore, the initialization of the RegBanks table should be
+  // done only once. Indeed the table of all register banks
+  // (ARM::RegBanks) is unique in the compiler. At some point, it
+  // will get tablegen'ed and the whole constructor becomes empty.
+  if (AlreadyInit)
+    return;
+  AlreadyInit = true;
+
+  // Initialize the GPR bank.
+  createRegisterBank(ARM::GPRRegBankID, "GPRB");
+
+  addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI);
+  addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI);
+  const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
+  (void)RBGPR;
+  assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+}
+
+const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+  using namespace ARM;
+
+  switch (RC.getID()) {
+  case GPRRegClassID:
+  case tGPR_and_tcGPRRegClassID:
+    return getRegBank(ARM::GPRRegBankID);
+  default:
+    llvm_unreachable("Unsupported register kind");
+  }
+
+  llvm_unreachable("Switch should handle all register classes");
+}
+
+RegisterBankInfo::InstructionMapping
+ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  auto Opc = MI.getOpcode();
+
+  // Try the default logic for non-generic instructions that are either copies
+  // or already have some operands assigned to banks.
+  if (!isPreISelGenericOpcode(Opc)) {
+    InstructionMapping Mapping = getInstrMappingImpl(MI);
+    if (Mapping.isValid())
+      return Mapping;
+  }
+
+  using namespace TargetOpcode;
+
+  unsigned NumOperands = MI.getNumOperands();
+  const ValueMapping *OperandsMapping = &ARM::ValueMappings[0];
+
+  switch (Opc) {
+  case G_ADD:
+  case G_LOAD:
+    // FIXME: We're abusing the fact that everything lives in a GPR for now; in
+    // the real world we would use different mappings.
+    OperandsMapping = &ARM::ValueMappings[0];
+    break;
+  case G_FRAME_INDEX:
+    OperandsMapping = getOperandsMapping({&ARM::ValueMappings[0], nullptr});
+    break;
+  default:
+    return InstructionMapping{};
+  }
+
+  return InstructionMapping{DefaultMappingID, /*Cost=*/1, OperandsMapping,
+                            NumOperands};
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
new file mode 100644
index 000000000000..773920ee57a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -0,0 +1,41 @@
+//===- ARMRegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for ARM.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+
+namespace ARM {
+enum {
+  GPRRegBankID = 0, // General purpose registers
+  NumRegisterBanks,
+};
+} // end namespace ARM
+
+/// This class provides the information for the target register banks.
+class ARMRegisterBankInfo final : public RegisterBankInfo {
+public:
+  ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 000000000000..e6e8cdf965e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,19 @@
+//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+using namespace llvm;
+
+void ARMRegisterInfo::anchor() { }
+
+ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 000000000000..e2e650e4af93
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,31 @@
+//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+
+namespace llvm {
+
+class ARMSubtarget;
+
+struct ARMRegisterInfo : public ARMBaseRegisterInfo {
+  virtual void anchor();
+public:
+  ARMRegisterInfo();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 000000000000..02cbfb1fa9f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,430 @@
+//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+  let SubRegs = subregs;
+  // All bits of ARM registers with sub-registers are covered by sub-registers.
+  let CoveredBySubRegs = 1;
+}
+
+class ARMFReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+}
+
+// Subregister indices.
+let Namespace = "ARM" in {
+def qqsub_0 : SubRegIndex<256>;
+def qqsub_1 : SubRegIndex<256, 256>;
+
+// Note: Code depends on these having consecutive numbers.
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
+def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>;
+def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>;
+def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>;
+
+def ssub_0  : SubRegIndex<32>;
+def ssub_1  : SubRegIndex<32, 32>;
+def ssub_2  : ComposedSubRegIndex<dsub_1, ssub_0>;
+def ssub_3  : ComposedSubRegIndex<dsub_1, ssub_1>;
+
+def gsub_0  : SubRegIndex<32>;
+def gsub_1  : SubRegIndex<32, 32>;
+// Let TableGen synthesize the remaining 12 ssub_* indices.
+// We don't need to name them.
+}
+
+// Integer registers
+def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
+def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
+def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
+def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
+def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
+def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
+def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
+def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
+// These require 32-bit instructions.
+let CostPerUse = 1 in {
+def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
+def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
+def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
+def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+}
+
+// Float registers
+def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
+def S2  : ARMFReg< 2, "s2">;  def S3  : ARMFReg< 3, "s3">;
+def S4  : ARMFReg< 4, "s4">;  def S5  : ARMFReg< 5, "s5">;
+def S6  : ARMFReg< 6, "s6">;  def S7  : ARMFReg< 7, "s7">;
+def S8  : ARMFReg< 8, "s8">;  def S9  : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+let SubRegIndices = [ssub_0, ssub_1] in {
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>, DwarfRegNum<[256]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>, DwarfRegNum<[257]>;
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>, DwarfRegNum<[258]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>, DwarfRegNum<[259]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>, DwarfRegNum<[260]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>, DwarfRegNum<[261]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>, DwarfRegNum<[262]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>, DwarfRegNum<[263]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>, DwarfRegNum<[264]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>, DwarfRegNum<[265]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
+}
+
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>;
+def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
+def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
+def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
+def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
+def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>;
+def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
+def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
+def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
+def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>;
+def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>;
+def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>;
+def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>;
+def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>;
+def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q0  : ARMReg< 0,  "q0", [D0,   D1]>;
+def Q1  : ARMReg< 1,  "q1", [D2,   D3]>;
+def Q2  : ARMReg< 2,  "q2", [D4,   D5]>;
+def Q3  : ARMReg< 3,  "q3", [D6,   D7]>;
+def Q4  : ARMReg< 4,  "q4", [D8,   D9]>;
+def Q5  : ARMReg< 5,  "q5", [D10, D11]>;
+def Q6  : ARMReg< 6,  "q6", [D12, D13]>;
+def Q7  : ARMReg< 7,  "q7", [D14, D15]>;
+}
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q8  : ARMReg< 8,  "q8", [D16, D17]>;
+def Q9  : ARMReg< 9,  "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+}
+
+// Current Program Status Register.
+// We model fpscr with two registers: FPSCR models the control bits and will be
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// models the APSR when it's accessed by some special instructions. In such cases
+// it has the same encoding as PC.
+def CPSR       : ARMReg<0,  "cpsr">;
+def APSR       : ARMReg<1,  "apsr">;
+def APSR_NZCV  : ARMReg<15, "apsr_nzcv">;
+def SPSR       : ARMReg<2,  "spsr">;
+def FPSCR      : ARMReg<3,  "fpscr">;
+def FPSCR_NZCV : ARMReg<3,  "fpscr_nzcv"> {
+  let Aliases = [FPSCR];
+}
+def ITSTATE    : ARMReg<4, "itstate">;
+
+// Special Registers - only available in privileged mode.
+def FPSID   : ARMReg<0,  "fpsid">;
+def MVFR2   : ARMReg<5,  "mvfr2">;
+def MVFR1   : ARMReg<6,  "mvfr1">;
+def MVFR0   : ARMReg<7,  "mvfr0">;
+def FPEXC   : ARMReg<8,  "fpexc">;
+def FPINST  : ARMReg<9,  "fpinst">;
+def FPINST2 : ARMReg<10, "fpinst2">;
+
+// Register classes.
+//
+// pc  == Program Counter
+// lr  == Link Register
+// sp  == Stack Pointer
+// r12 == ip (scratch)
+// r7  == Frame Pointer (thumb-style backtraces)
+// r9  == May be reserved as Thread Register
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
+                                               SP, LR, PC)> {
+  // Allocate LR as the first CSR since it is always saved anyway.
+  // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't
+  // know how to spill them. If we make our prologue/epilogue code smarter at
+  // some point, we can go back to using the above allocation orders for the
+  // Thumb1 instructions that know how to use hi regs.
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRs without the PC.  Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination.  Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is useful
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP an an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
+// restricted GPR register class. Many Thumb2 instructions allow the full
+// register range for operands, but have undefined behaviours when PC
+// or SP (R13 or R15) are used. The ARM ISA refers to these operands
+// via the BadReg() pseudo-code description.
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (MOV, e.g.)
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+
+// The high registers in thumb mode, R8-R15.
+def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
+// this class and the preceding one(!)  This is what we want.
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
+  let AltOrders = [(and tcGPR, tGPR)];
+  let AltOrderSelect = [{
+      return MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+
+// Scalar single precision floating point register class..
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate SPR, 2), SPR),
+                   (add (decimate SPR, 4),
+                        (decimate SPR, 2),
+                        (decimate (rotl SPR, 1), 4),
+                        (decimate (rotl SPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It's more performant if it
+// is double-word alignment though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+                        (sequence "D%u", 0, 31)> {
+  // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
+  // Darwin platforms.
+  let AltOrders = [(rotl DPR, 16),
+                   (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+                             (trunc DPR, 16)>;
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+                          (trunc DPR, 8)>;
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+                        (sequence "Q%u", 0, 15)> {
+  // Allocate non-VFP2 aliases Q8-Q15 first.
+  let AltOrders = [(rotl QPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                             128, (trunc QPR, 8)>;
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                           128, (trunc QPR, 4)>;
+
+// Pseudo-registers representing odd-even pairs of D registers. The even-odd
+// pairs are already represented by the Q registers.
+// These are needed by NEON instructions requiring two consecutive D registers.
+// There is no D31_D0 register as that is always an UNPREDICTABLE encoding.
+def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2)]>;
+
+// Register class representing a pair of consecutive D registers.
+// Use the Q registers for the even-odd pairs.
+def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (interleave QPR, TuplesOE2D)> {
+  // Allocate starting at non-VFP2 registers D16-D31 first.
+  // Prefer even-odd pairs as they are easier to copy.
+  let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+                              [(add R0, R2, R4, R6, R8, R10, R12),
+                               (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+  let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Pseudo-registers representing 3 consecutive D registers.
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+                              [(shl DPR, 0),
+                               (shl DPR, 1),
+                               (shl DPR, 2)]>;
+
+// 3 consecutive D registers.
+def DTriple : RegisterClass<"ARM", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+// Pseudo 256-bit registers to represent pairs of Q registers. These should
+// never be present in the emitted code.
+// These are used for NEON load / store instructions, e.g., vld4, vst3.
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
+
+// Pseudo 256-bit vector register class to model pairs of Q registers
+// (4 consecutive D registers).
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Tuples of 4 D regs that isn't also a pair of Q regs.
+def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2),
+                                 (decimate (shl DPR, 3), 2),
+                                 (decimate (shl DPR, 4), 2)]>;
+
+// 4 consecutive D registers.
+def DQuad : RegisterClass<"ARM", [v4i64], 256,
+                          (interleave Tuples2Q, TuplesOE4D)>;
+
+// Pseudo 512-bit registers to represent four consecutive Q registers.
+def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
+                               [(shl QQPR, 0), (shl QQPR, 2)]>;
+
+// Pseudo 512-bit vector register class to model 4 consecutive Q registers
+// (8 consecutive D registers).
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQQQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+
+// Pseudo-registers representing 2-spaced consecutive D registers.
+def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2)]>;
+
+// Spaced pairs of D registers.
+def DPairSpc : RegisterClass<"ARM", [v2i64], 64, (add Tuples2DSpc)>;
+
+def Tuples3DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4)]>;
+
+// Spaced triples of D registers.
+def DTripleSpc : RegisterClass<"ARM", [untyped], 64, (add Tuples3DSpc)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4),
+                                  (shl DPR, 6)]>;
+
+// Spaced quads of D registers.
+def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 000000000000..b7d2d34614df
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,367 @@
+//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction scheduling annotations for out-of-order CPUs.
+// These annotations are independent of the itinerary class defined below.
+// Here we define the subtarget independent read/write per-operand resources.
+// The subtarget schedule definitions will then map these to the subtarget's
+// resource usages.
+// For example:
+// The instruction cycle timings table might contain an entry for an operation
+// like the following:
+// Rd <- ADD Rn, Rm, <shift> Rs
+//  Uops | Latency from register | Uops - resource requirements - latency
+//  2    | Rn: 1 Rm: 4 Rs: 4     | uop T0, Rm, Rs - P01 - 3
+//       |                       | uopc Rd, Rn, T0 -  P01 - 1
+// This is telling us that the result will be available in destination register
+// Rd after a minimum of three cycles after the result in Rm and Rs is available
+// and one cycle after the result in Rn is available. The micro-ops can execute
+// on resource P01.
+// To model this, we need to express that we need to dispatch two micro-ops,
+// that the resource P01 is needed and that the latency to Rn is different than
+// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by
+// two.
+// We will do this by assigning (abstract) resources to register defs/uses.
+// ARMSchedule.td:
+//   def WriteALUsr : SchedWrite;
+//   def ReadAdvanceALUsr : ScheRead;
+//
+// ARMInstrInfo.td:
+//   def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault,
+//                           ReadDefault]> { ...}
+// ReadAdvance read resources allow us to define "pipeline by-passes" or
+// shorter latencies to certain registers as needed in the example above.
+// The "ReadDefault" can be omitted.
+// Next, the subtarget td file assigns resources to the abstract resources
+// defined here.
+// ARMScheduleSubtarget.td:
+//  // Resources.
+//  def P01 : ProcResource<3>; // ALU unit (3 of it).
+//  ...
+//  // Resource usages.
+//  def : WriteRes<WriteALUsr, [P01, P01]> {
+//    Latency = 4; // Latency of 4.
+//    NumMicroOps = 2; // Dispatch 2 micro-ops.
+//    // The two instances of resource P01 are occupied for one cycle. It is one
+//    // cycle because these resources happen to be pipelined.
+//    ResourceCycles = [1, 1];
+//  }
+//  def : ReadAdvance<ReadAdvanceALUsr, 3>;
+
+// Basic ALU operation.
+def WriteALU : SchedWrite;
+def ReadALU : SchedRead;
+
+// Basic ALU with shifts.
+def WriteALUsi : SchedWrite; // Shift by immediate.
+def WriteALUsr : SchedWrite; // Shift by register.
+def WriteALUSsr : SchedWrite; // Shift by register (flag setting).
+def ReadALUsr : SchedRead; // Some operands are read later.
+
+// Compares.
+def WriteCMP : SchedWrite;
+def WriteCMPsi : SchedWrite;
+def WriteCMPsr : SchedWrite;
+
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixpoint conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
+// Define TII for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+  const ARMBaseInstrInfo *TII =
+    static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for ARM
+//
+def IIC_iALUx      : InstrItinClass;
+def IIC_iALUi      : InstrItinClass;
+def IIC_iALUr      : InstrItinClass;
+def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
+def IIC_iALUsr     : InstrItinClass;
+def IIC_iBITi      : InstrItinClass;
+def IIC_iBITr      : InstrItinClass;
+def IIC_iBITsi     : InstrItinClass;
+def IIC_iBITsr     : InstrItinClass;
+def IIC_iUNAr      : InstrItinClass;
+def IIC_iUNAsi     : InstrItinClass;
+def IIC_iEXTr      : InstrItinClass;
+def IIC_iEXTAr     : InstrItinClass;
+def IIC_iEXTAsr    : InstrItinClass;
+def IIC_iCMPi      : InstrItinClass;
+def IIC_iCMPr      : InstrItinClass;
+def IIC_iCMPsi     : InstrItinClass;
+def IIC_iCMPsr     : InstrItinClass;
+def IIC_iTSTi      : InstrItinClass;
+def IIC_iTSTr      : InstrItinClass;
+def IIC_iTSTsi     : InstrItinClass;
+def IIC_iTSTsr     : InstrItinClass;
+def IIC_iMOVi      : InstrItinClass;
+def IIC_iMOVr      : InstrItinClass;
+def IIC_iMOVsi     : InstrItinClass;
+def IIC_iMOVsr     : InstrItinClass;
+def IIC_iMOVix2    : InstrItinClass;
+def IIC_iMOVix2addpc : InstrItinClass;
+def IIC_iMOVix2ld  : InstrItinClass;
+def IIC_iMVNi      : InstrItinClass;
+def IIC_iMVNr      : InstrItinClass;
+def IIC_iMVNsi     : InstrItinClass;
+def IIC_iMVNsr     : InstrItinClass;
+def IIC_iCMOVi     : InstrItinClass;
+def IIC_iCMOVr     : InstrItinClass;
+def IIC_iCMOVsi    : InstrItinClass;
+def IIC_iCMOVsr    : InstrItinClass;
+def IIC_iCMOVix2   : InstrItinClass;
+def IIC_iMUL16     : InstrItinClass;
+def IIC_iMAC16     : InstrItinClass;
+def IIC_iMUL32     : InstrItinClass;
+def IIC_iMAC32     : InstrItinClass;
+def IIC_iMUL64     : InstrItinClass;
+def IIC_iMAC64     : InstrItinClass;
+def IIC_iDIV     : InstrItinClass;
+def IIC_iLoad_i    : InstrItinClass;
+def IIC_iLoad_r    : InstrItinClass;
+def IIC_iLoad_si   : InstrItinClass;
+def IIC_iLoad_iu   : InstrItinClass;
+def IIC_iLoad_ru   : InstrItinClass;
+def IIC_iLoad_siu  : InstrItinClass;
+def IIC_iLoad_bh_i   : InstrItinClass;
+def IIC_iLoad_bh_r   : InstrItinClass;
+def IIC_iLoad_bh_si  : InstrItinClass;
+def IIC_iLoad_bh_iu  : InstrItinClass;
+def IIC_iLoad_bh_ru  : InstrItinClass;
+def IIC_iLoad_bh_siu : InstrItinClass;
+def IIC_iLoad_d_i  : InstrItinClass;
+def IIC_iLoad_d_r  : InstrItinClass;
+def IIC_iLoad_d_ru : InstrItinClass;
+def IIC_iLoad_m    : InstrItinClass;
+def IIC_iLoad_mu   : InstrItinClass;
+def IIC_iLoad_mBr  : InstrItinClass;
+def IIC_iPop       : InstrItinClass;
+def IIC_iPop_Br    : InstrItinClass;
+def IIC_iLoadiALU  : InstrItinClass;
+def IIC_iStore_i   : InstrItinClass;
+def IIC_iStore_r   : InstrItinClass;
+def IIC_iStore_si  : InstrItinClass;
+def IIC_iStore_iu  : InstrItinClass;
+def IIC_iStore_ru  : InstrItinClass;
+def IIC_iStore_siu : InstrItinClass;
+def IIC_iStore_bh_i   : InstrItinClass;
+def IIC_iStore_bh_r   : InstrItinClass;
+def IIC_iStore_bh_si  : InstrItinClass;
+def IIC_iStore_bh_iu  : InstrItinClass;
+def IIC_iStore_bh_ru  : InstrItinClass;
+def IIC_iStore_bh_siu : InstrItinClass;
+def IIC_iStore_d_i   : InstrItinClass;
+def IIC_iStore_d_r   : InstrItinClass;
+def IIC_iStore_d_ru  : InstrItinClass;
+def IIC_iStore_m   : InstrItinClass;
+def IIC_iStore_mu  : InstrItinClass;
+def IIC_Preload    : InstrItinClass;
+def IIC_Br         : InstrItinClass;
+def IIC_fpSTAT     : InstrItinClass;
+def IIC_fpUNA16    : InstrItinClass;
+def IIC_fpUNA32    : InstrItinClass;
+def IIC_fpUNA64    : InstrItinClass;
+def IIC_fpCMP16    : InstrItinClass;
+def IIC_fpCMP32    : InstrItinClass;
+def IIC_fpCMP64    : InstrItinClass;
+def IIC_fpCVTSD    : InstrItinClass;
+def IIC_fpCVTDS    : InstrItinClass;
+def IIC_fpCVTSH    : InstrItinClass;
+def IIC_fpCVTHS    : InstrItinClass;
+def IIC_fpCVTIH    : InstrItinClass;
+def IIC_fpCVTIS    : InstrItinClass;
+def IIC_fpCVTID    : InstrItinClass;
+def IIC_fpCVTHI    : InstrItinClass;
+def IIC_fpCVTSI    : InstrItinClass;
+def IIC_fpCVTDI    : InstrItinClass;
+def IIC_fpMOVIS    : InstrItinClass;
+def IIC_fpMOVID    : InstrItinClass;
+def IIC_fpMOVSI    : InstrItinClass;
+def IIC_fpMOVDI    : InstrItinClass;
+def IIC_fpALU16    : InstrItinClass;
+def IIC_fpALU32    : InstrItinClass;
+def IIC_fpALU64    : InstrItinClass;
+def IIC_fpMUL16    : InstrItinClass;
+def IIC_fpMUL32    : InstrItinClass;
+def IIC_fpMUL64    : InstrItinClass;
+def IIC_fpMAC16    : InstrItinClass;
+def IIC_fpMAC32    : InstrItinClass;
+def IIC_fpMAC64    : InstrItinClass;
+def IIC_fpFMAC16   : InstrItinClass;
+def IIC_fpFMAC32   : InstrItinClass;
+def IIC_fpFMAC64   : InstrItinClass;
+def IIC_fpDIV16    : InstrItinClass;
+def IIC_fpDIV32    : InstrItinClass;
+def IIC_fpDIV64    : InstrItinClass;
+def IIC_fpSQRT16   : InstrItinClass;
+def IIC_fpSQRT32   : InstrItinClass;
+def IIC_fpSQRT64   : InstrItinClass;
+def IIC_fpLoad16   : InstrItinClass;
+def IIC_fpLoad32   : InstrItinClass;
+def IIC_fpLoad64   : InstrItinClass;
+def IIC_fpLoad_m   : InstrItinClass;
+def IIC_fpLoad_mu  : InstrItinClass;
+def IIC_fpStore16  : InstrItinClass;
+def IIC_fpStore32  : InstrItinClass;
+def IIC_fpStore64  : InstrItinClass;
+def IIC_fpStore_m  : InstrItinClass;
+def IIC_fpStore_mu : InstrItinClass;
+def IIC_VLD1       : InstrItinClass;
+def IIC_VLD1x2     : InstrItinClass;
+def IIC_VLD1x3     : InstrItinClass;
+def IIC_VLD1x4     : InstrItinClass;
+def IIC_VLD1u      : InstrItinClass;
+def IIC_VLD1x2u    : InstrItinClass;
+def IIC_VLD1x3u    : InstrItinClass;
+def IIC_VLD1x4u    : InstrItinClass;
+def IIC_VLD1ln     : InstrItinClass;
+def IIC_VLD1lnu    : InstrItinClass;
+def IIC_VLD1dup    : InstrItinClass;
+def IIC_VLD1dupu   : InstrItinClass;
+def IIC_VLD2       : InstrItinClass;
+def IIC_VLD2x2     : InstrItinClass;
+def IIC_VLD2u      : InstrItinClass;
+def IIC_VLD2x2u    : InstrItinClass;
+def IIC_VLD2ln     : InstrItinClass;
+def IIC_VLD2lnu    : InstrItinClass;
+def IIC_VLD2dup    : InstrItinClass;
+def IIC_VLD2dupu   : InstrItinClass;
+def IIC_VLD3       : InstrItinClass;
+def IIC_VLD3ln     : InstrItinClass;
+def IIC_VLD3u      : InstrItinClass;
+def IIC_VLD3lnu    : InstrItinClass;
+def IIC_VLD3dup    : InstrItinClass;
+def IIC_VLD3dupu   : InstrItinClass;
+def IIC_VLD4       : InstrItinClass;
+def IIC_VLD4ln     : InstrItinClass;
+def IIC_VLD4u      : InstrItinClass;
+def IIC_VLD4lnu    : InstrItinClass;
+def IIC_VLD4dup    : InstrItinClass;
+def IIC_VLD4dupu   : InstrItinClass;
+def IIC_VST1       : InstrItinClass;
+def IIC_VST1x2     : InstrItinClass;
+def IIC_VST1x3     : InstrItinClass;
+def IIC_VST1x4     : InstrItinClass;
+def IIC_VST1u      : InstrItinClass;
+def IIC_VST1x2u    : InstrItinClass;
+def IIC_VST1x3u    : InstrItinClass;
+def IIC_VST1x4u    : InstrItinClass;
+def IIC_VST1ln     : InstrItinClass;
+def IIC_VST1lnu    : InstrItinClass;
+def IIC_VST2       : InstrItinClass;
+def IIC_VST2x2     : InstrItinClass;
+def IIC_VST2u      : InstrItinClass;
+def IIC_VST2x2u    : InstrItinClass;
+def IIC_VST2ln     : InstrItinClass;
+def IIC_VST2lnu    : InstrItinClass;
+def IIC_VST3       : InstrItinClass;
+def IIC_VST3u      : InstrItinClass;
+def IIC_VST3ln     : InstrItinClass;
+def IIC_VST3lnu    : InstrItinClass;
+def IIC_VST4       : InstrItinClass;
+def IIC_VST4u      : InstrItinClass;
+def IIC_VST4ln     : InstrItinClass;
+def IIC_VST4lnu    : InstrItinClass;
+def IIC_VUNAD      : InstrItinClass;
+def IIC_VUNAQ      : InstrItinClass;
+def IIC_VBIND      : InstrItinClass;
+def IIC_VBINQ      : InstrItinClass;
+def IIC_VPBIND     : InstrItinClass;
+def IIC_VFMULD     : InstrItinClass;
+def IIC_VFMULQ     : InstrItinClass;
+def IIC_VMOV       : InstrItinClass;
+def IIC_VMOVImm    : InstrItinClass;
+def IIC_VMOVD      : InstrItinClass;
+def IIC_VMOVQ      : InstrItinClass;
+def IIC_VMOVIS     : InstrItinClass;
+def IIC_VMOVID     : InstrItinClass;
+def IIC_VMOVISL    : InstrItinClass;
+def IIC_VMOVSI     : InstrItinClass;
+def IIC_VMOVDI     : InstrItinClass;
+def IIC_VMOVN      : InstrItinClass;
+def IIC_VPERMD     : InstrItinClass;
+def IIC_VPERMQ     : InstrItinClass;
+def IIC_VPERMQ3    : InstrItinClass;
+def IIC_VMACD      : InstrItinClass;
+def IIC_VMACQ      : InstrItinClass;
+def IIC_VFMACD     : InstrItinClass;
+def IIC_VFMACQ     : InstrItinClass;
+def IIC_VRECSD     : InstrItinClass;
+def IIC_VRECSQ     : InstrItinClass;
+def IIC_VCNTiD     : InstrItinClass;
+def IIC_VCNTiQ     : InstrItinClass;
+def IIC_VUNAiD     : InstrItinClass;
+def IIC_VUNAiQ     : InstrItinClass;
+def IIC_VQUNAiD    : InstrItinClass;
+def IIC_VQUNAiQ    : InstrItinClass;
+def IIC_VBINiD     : InstrItinClass;
+def IIC_VBINiQ     : InstrItinClass;
+def IIC_VSUBiD     : InstrItinClass;
+def IIC_VSUBiQ     : InstrItinClass;
+def IIC_VBINi4D    : InstrItinClass;
+def IIC_VBINi4Q    : InstrItinClass;
+def IIC_VSUBi4D    : InstrItinClass;
+def IIC_VSUBi4Q    : InstrItinClass;
+def IIC_VABAD      : InstrItinClass;
+def IIC_VABAQ      : InstrItinClass;
+def IIC_VSHLiD     : InstrItinClass;
+def IIC_VSHLiQ     : InstrItinClass;
+def IIC_VSHLi4D    : InstrItinClass;
+def IIC_VSHLi4Q    : InstrItinClass;
+def IIC_VPALiD     : InstrItinClass;
+def IIC_VPALiQ     : InstrItinClass;
+def IIC_VMULi16D   : InstrItinClass;
+def IIC_VMULi32D   : InstrItinClass;
+def IIC_VMULi16Q   : InstrItinClass;
+def IIC_VMULi32Q   : InstrItinClass;
+def IIC_VMACi16D   : InstrItinClass;
+def IIC_VMACi32D   : InstrItinClass;
+def IIC_VMACi16Q   : InstrItinClass;
+def IIC_VMACi32Q   : InstrItinClass;
+def IIC_VEXTD      : InstrItinClass;
+def IIC_VEXTQ      : InstrItinClass;
+def IIC_VTB1       : InstrItinClass;
+def IIC_VTB2       : InstrItinClass;
+def IIC_VTB3       : InstrItinClass;
+def IIC_VTB4       : InstrItinClass;
+def IIC_VTBX1      : InstrItinClass;
+def IIC_VTBX2      : InstrItinClass;
+def IIC_VTBX3      : InstrItinClass;
+def IIC_VTBX4      : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "ARMScheduleV6.td"
+include "ARMScheduleA8.td"
+include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
+include "ARMScheduleR52.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
new file mode 100644
index 000000000000..ba380cba100f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
@@ -0,0 +1,1075 @@
+//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A8 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+// Functional Units.
+def A8_Pipe0   : FuncUnit; // pipeline 0
+def A8_Pipe1   : FuncUnit; // pipeline 1
+def A8_LSPipe  : FuncUnit; // Load / store pipeline
+def A8_NPipe   : FuncUnit; // NEON ALU/MUL pipe
+def A8_NLSPipe : FuncUnit; // NEON LS pipe
+//
+// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<
+  [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe],
+  [], [
+  // Two fully-pipelined integer ALU pipelines
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
+  InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                             InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LSPipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                              InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+
+  // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  // FIXME: lsl by 2 takes 1 cycle.
+  InstrItinData<IIC_iLoad_si  , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                              [1, 2, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Push, def is the 3th operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                              InstrStage<1, [A8_NLSPipe]>], [20]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<7, [A8_NPipe], 0>,
+                               InstrStage<7, [A8_NLSPipe]>], [7, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NPipe], 0>,
+                               InstrStage<5, [A8_NLSPipe]>], [5, 1]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<9, [A8_NPipe], 0>,
+                               InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<11, [A8_NPipe], 0>,
+                               InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<20, [A8_NPipe], 0>,
+                               InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 20, 1]>,
+
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // Issue through integer pipeline, and execute in NEON unit.
+  //
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1]>,
+  //
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1]>,
+  //
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2]>,
+  //
+  // Double-register FP Binary
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [2, 1]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                            InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// This following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+  let CompleteModel = 0;
+
+  let Itineraries = CortexA8Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 000000000000..519e595bd184
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,2529 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
+// Reference Manual".
+//
+// Functional units
+def A9_Issue0  : FuncUnit; // Issue 0
+def A9_Issue1  : FuncUnit; // Issue 1
+def A9_Branch  : FuncUnit; // Branch
+def A9_ALU0    : FuncUnit; // ALU / MUL pipeline 0
+def A9_ALU1    : FuncUnit; // ALU pipeline 1
+def A9_AGU     : FuncUnit; // Address generation unit for ld / st
+def A9_NPipe   : FuncUnit; // NEON pipeline
+def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
+def A9_LSUnit  : FuncUnit; // L/S Unit
+def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
+def A9_DRegsN  : FuncUnit; // FP register set, NEON side
+
+// Bypasses
+def A9_LdBypass : Bypass;
+
+def CortexA9Itineraries : ProcessorItineraries<
+  [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
+   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
+  [A9_LdBypass], [
+  // Two fully-pipelined integer ALU pipelines
+
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_AGU], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                              [2, 1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [3, 1, 1]>,
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                            [3, 1, 1, 1],
+                            [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                                [1, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>],
+                              [4, 5, 1, 1]>,
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1], [A9_LdBypass]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit], 0>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [5, 1, 1], [A9_LdBypass]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [5, 4, 1, 1], [A9_LdBypass]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [2, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 2, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<1, [A9_LSUnit]>,
+                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [2, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
+                                InstrStage<1, [A9_Issue1], 0>,
+                                InstrStage<1, [A9_Branch]>]>,
+
+  // VFP and NEON shares the same register file. This means that every VFP
+  // instruction should wait for full completion of the consecutive NEON
+  // instruction and vice-versa. We model this behavior with two artificial FUs:
+  // DRegsVFP and DRegsVFP.
+  //
+  // Every VFP instruction:
+  //  - Acquires DRegsVFP resource for 1 cycle
+  //  - Reserves DRegsN resource for the whole duration (including time to
+  //    register file writeback!).
+  // Every NEON instruction does the same but with FUs swapped.
+  //
+  // Since the reserved FU cannot be acquired, this models precisely
+  // "cross-domain" stalls.
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit.
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                              InstrStage<1, [A9_NPipe]>],
+                             [1]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<10, [A9_NPipe]>],
+                              [15, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<20, [A9_NPipe]>],
+                              [25, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<13, [A9_NPipe]>],
+                              [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<28, [A9_NPipe]>],
+                              [32, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Load
+  // FIXME: Result latency is 1 if address is 64-bit aligned.
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_NPipe], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 2, 1]>,
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 2]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 3, 2, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1,1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 1]>,
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 10 cycles
+                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 11 cycles
+                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [10, 2, 2]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 4, 1, 1]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+  list <WriteSequence> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 56; // Based on available renamed registers.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 8; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA9Itineraries;
+
+  // FIXME: Many vector operations were never given an itinerary. We
+  // haven't mapped these to the new model either.
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+//
+// The AGU unit has BufferSize=1 so that the latency between operations
+// that use it are considered to stall other operations.
+//
+// The FP unit has BufferSize=0 so that it is a hard dispatch
+// hazard. No instruction may be dispatched while the unit is reserved.
+
+let SchedModel = CortexA9Model in {
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1> { let BufferSize = 1; }
+def A9UnitLS  : ProcResource<1>;
+def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
+def A9UnitB   : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer shifted-by register
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
+def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication
+def A9WriteM   : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+                                              let NumMicroOps = 0; }
+def A9WriteM16   : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+                                                let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having FP instructions consume the AGU resource.
+def A9WriteF      : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteFMov   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteFMulS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteFMulD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteFMAS   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+def A9WriteFMAD   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteFDivS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
+def A9WriteFDivD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
+
+// NEON has an odd mix of latencies. Simply name the write types by latency.
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 7;
+  let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32-bits using the same micro-op.
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+                                     let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
+
+// Load or Store Float, aligned.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in {
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
+} // foreach NumCycles
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Define a predicate to select the LDM based on number of memory addresses.
+def A9LMAdr#NumAddr#Pred :
+  SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>;
+
+} // foreach NumAddr
+
+// Fall-back for unknown LDMs.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using a predicate.
+def A9WriteLMAdr : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+  // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
+  SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+                                              let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+                                      let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence<
+  [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+def A9WriteLMOpsList : A9WriteLMOpsListType<
+                 [A9WriteL1, A9WriteL1Hi,
+                  A9WriteL2, A9WriteL2Hi,
+                  A9WriteL3, A9WriteL3Hi,
+                  A9WriteL4, A9WriteL4Hi,
+                  A9WriteL5, A9WriteL5Hi,
+                  A9WriteL6, A9WriteL6Hi,
+                  A9WriteL7, A9WriteL7Hi,
+                  A9WriteL8, A9WriteL8Hi]>;
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands.  These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources.
+  SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+                             A9WriteL2, A9WriteL2Hi,
+                             A9WriteL3Hi, A9WriteL3Hi,
+                             A9WriteL4Hi, A9WriteL4Hi,
+                             A9WriteL5Hi, A9WriteL5Hi,
+                             A9WriteL6Hi, A9WriteL6Hi,
+                             A9WriteL7Hi, A9WriteL7Hi,
+                             A9WriteL8Hi, A9WriteL8Hi]>]> {
+  let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
+// so can be used in WriteSequences for in single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+  let Latency = 1;
+  let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+  [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+  [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA are dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+  // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+  SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+  [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+                 [A9WriteLMfp1, A9WriteLMfp2,       // 0-1
+                  A9WriteLMfp3, A9WriteLMfp4,       // 2-3
+                  A9WriteLMfp5, A9WriteLMfp6,       // 4-5
+                  A9WriteLMfp7, A9WriteLMfp8,       // 6-7
+                  A9WriteLMfp1Hi,                   // 8-8
+                  A9WriteLMfp2Hi, A9WriteLMfp2Hi,   // 9-10
+                  A9WriteLMfp3Hi, A9WriteLMfp3Hi,   // 11-12
+                  A9WriteLMfp4Hi, A9WriteLMfp4Hi,   // 13-14
+                  A9WriteLMfp5Hi, A9WriteLMfp5Hi,   // 15-16
+                  A9WriteLMfp6Hi, A9WriteLMfp6Hi,   // 17-18
+                  A9WriteLMfp7Hi, A9WriteLMfp7Hi,   // 19-20
+                  A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources. We are optimizing for the case
+  // where the operands are DPRs, and this determines the first eight
+  // types. The remaining eight types are filled to cover the case
+  // where the operands are SPRs.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+                             A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+                             A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+  let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+  "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+  "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+  SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+  SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non-LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>;
+def A9WriteI2ld  : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadALU : SchedReadAdvance<1,
+  [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+   A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+   A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+   A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+   A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues, decreases producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+                         IIC_iMVNi,IIC_iMVNsi,
+                         IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2],   [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
+def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteHi ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+                                     IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+                            IIC_fpUNA32, IIC_fpUNA64,
+                            IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+                         IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+                         IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes AGU regardless address writeback. But it's
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+                                     IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+                                       IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+                                            IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+                                     IIC_iStore_iu, IIC_iStore_ru,
+                                     IIC_iStore_d_i, IIC_iStore_d_r,
+                                     IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+                                      IIC_iStore_bh_i, IIC_iStore_bh_r,
+                                      IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteML will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical, For stores only the address writeback
+// has a def operand so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+                                                      IIC_iStore_m,
+                                                      IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+                                        IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+                                         IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+                                         IIC_VLD1x4, IIC_VLD1x4u,
+                                         IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+                                            IIC_VLD2, IIC_VLD2u,
+                                            IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+                                            IIC_VLD2x2, IIC_VLD2x2u,
+                                            IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+                                            IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+                                            IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+                                         IIC_VST1x2, IIC_VST1x2u,
+                                         IIC_VST1ln, IIC_VST1lnu,
+                                         IIC_VST2, IIC_VST2u,
+                                         IIC_VST2x2, IIC_VST2x2u,
+                                         IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+                                         IIC_VST1x4, IIC_VST1x4u,
+                                         IIC_VST3, IIC_VST3u,
+                                         IIC_VST3ln, IIC_VST3lnu,
+                                         IIC_VST4, IIC_VST4u,
+                                         IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2, A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+            [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+            [IIC_VTBX4]>;
+
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+
+// Map SchedRWs that are identical for cortexa9 to existing resources.
+def : SchedAlias<WriteALU, A9WriteALU>;
+def : SchedAlias<WriteALUsr, A9WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
+def : SchedAlias<ReadALU, A9ReadALU>;
+def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : InstRW< [WriteALU],
+      (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+                 "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
+
+def : SchedAlias<WriteCMP, A9WriteALU>;
+def : SchedAlias<WriteCMPsi, A9WriteALU>;
+def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+                                       "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+                                      "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+      (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+      "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+      "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+      "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+      "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+      (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+      "LDRH", "LDRSH", "LDRSB")>;
+def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
new file mode 100644
index 000000000000..1b40742a093b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -0,0 +1,983 @@
+//==- ARMScheduleR52.td - Cortex-R52 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-R52 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The Cortex-R52 is an in-order pipelined superscalar microprocessor with
+// a 8 stage pipeline. It can issue maximum two instructions in each cycle.
+// There are two ALUs, one LDST, one MUL  and a non-pipelined integer DIV.
+// A number of forwarding paths enable results of computations to be input
+// to subsequent operations before they are written to registers.
+// This scheduler is a MachineScheduler. See TargetSchedule.td for details.
+
+def CortexR52Model : SchedMachineModel {
+  let MicroOpBufferSize = 0;  // R52 is in-order processor
+  let IssueWidth = 2;         // 2 micro-ops dispatched per cycle
+  let LoadLatency = 1;        // Optimistic, assuming no misses
+  let MispredictPenalty = 8;  // A branch direction mispredict, including PFU
+  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
+  let CompleteModel = 0;      // Covers instructions applicable to cortex-r52.
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-R52 is an in-order processor.
+
+def R52UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def R52UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def R52UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division
+def R52UnitLd     : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def R52UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
+def R52UnitFPALU  : ProcResource<2> { let BufferSize = 0; } // FP ALU
+def R52UnitFPMUL  : ProcResource<2> { let BufferSize = 0; } // FP MUL
+def R52UnitFPDIV  : ProcResource<1> { let BufferSize = 0; } // FP DIV
+
+// Cortex-R52 specific SchedReads
+def R52Read_ISS   : SchedRead;
+def R52Read_EX1   : SchedRead;
+def R52Read_EX2   : SchedRead;
+def R52Read_WRI   : SchedRead;
+def R52Read_F0    : SchedRead; // F0 maps to ISS stage of integer pipe
+def R52Read_F1    : SchedRead;
+def R52Read_F2    : SchedRead;
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which map ProcResources and set latency.
+
+let SchedModel = CortexR52Model in {
+
+// ALU - Write occurs in Late EX2 (independent of whether shift was required)
+def : WriteRes<WriteALU, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUsi, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUsr, [R52UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteALUSsr, [R52UnitALU]> { let Latency = 3; }
+
+// Compares
+def : WriteRes<WriteCMP, [R52UnitALU]> { let Latency = 0; }
+def : WriteRes<WriteCMPsi, [R52UnitALU]> { let Latency = 0; }
+def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }
+
+// Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
+def : WriteRes<WriteDiv, [R52UnitDiv]> {
+  let Latency = 8; let ResourceCycles = [8]; // not pipelined
+}
+
+// Loads
+def : WriteRes<WriteLd, [R52UnitLd]> { let Latency = 4; }
+def : WriteRes<WritePreLd, [R52UnitLd]> { let Latency = 4; }
+
+// Branches  - LR written in Late EX2
+def : WriteRes<WriteBr, [R52UnitB]> { let Latency = 0; }
+def : WriteRes<WriteBrL, [R52UnitB]> { let Latency = 0; }
+def : WriteRes<WriteBrTbl, [R52UnitALU]> { let Latency = 0; }
+
+// Misc
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+def : WriteRes<WriteCvtFP, [R52UnitALU]> { let Latency = 3; }
+
+def : ReadAdvance<ReadALU, 1>;   // Operand needed in EX1 stage
+def : ReadAdvance<ReadALUsr, 0>; // Shift operands needed in ISS
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedReadWrites.
+
+// Forwarding information - based on when an operand is read
+def : ReadAdvance<R52Read_ISS, 0>;
+def : ReadAdvance<R52Read_EX1, 1>;
+def : ReadAdvance<R52Read_EX2, 2>;
+def : ReadAdvance<R52Read_F0, 0>;
+def : ReadAdvance<R52Read_F1, 1>;
+def : ReadAdvance<R52Read_F2, 2>;
+
+
+// Cortex-R52 specific SchedWrites for use with InstRW
+def R52WriteMAC        : SchedWriteRes<[R52UnitMAC]> { let Latency = 4; }
+def R52WriteDIV        : SchedWriteRes<[R52UnitDiv]> {
+  let Latency = 8; let ResourceCycles = [8]; // not pipelined
+}
+def R52WriteLd         : SchedWriteRes<[R52UnitLd]> { let Latency = 4; }
+def R52WriteST         : SchedWriteRes<[R52UnitLd]> { let Latency = 4; }
+def R52WriteAdr        : SchedWriteRes<[]> { let Latency = 0; }
+def R52WriteCC         : SchedWriteRes<[]> { let Latency = 0; }
+def R52WriteALU_EX1    : SchedWriteRes<[R52UnitALU]> { let Latency = 2; }
+def R52WriteALU_EX2    : SchedWriteRes<[R52UnitALU]> { let Latency = 3; }
+def R52WriteALU_WRI    : SchedWriteRes<[R52UnitALU]> { let Latency = 4; }
+
+def R52WriteNoRSRC_EX2 : SchedWriteRes<[]> { let Latency = 3; }
+def R52WriteNoRSRC_WRI : SchedWriteRes<[]> { let Latency = 4; }
+
+def R52WriteFPALU_F3   : SchedWriteRes<[R52UnitFPALU]> { let Latency = 4; }
+def R52Write2FPALU_F3  : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+  let Latency = 4;
+}
+def R52WriteFPALU_F4   : SchedWriteRes<[R52UnitFPALU]> { let Latency = 5; }
+def R52Write2FPALU_F4  : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+  let Latency = 5;
+}
+def R52WriteFPALU_F5   : SchedWriteRes<[R52UnitFPALU]> { let Latency = 6; }
+def R52Write2FPALU_F5  : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
+  let Latency = 6;
+}
+def R52WriteFPMUL_F5   : SchedWriteRes<[R52UnitFPMUL]> { let Latency = 6; }
+def R52Write2FPMUL_F5  : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL]> {
+  let Latency = 6;
+}
+def R52WriteFPMAC_F5   : SchedWriteRes<[R52UnitFPMUL, R52UnitFPALU]> {
+  let Latency = 11;     // as it is internally two insns (MUL then ADD)
+}
+def R52Write2FPMAC_F5  : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL,
+                                         R52UnitFPALU, R52UnitFPALU]> {
+  let Latency = 11;
+}
+
+def R52WriteFPLd_F4    : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
+def R52WriteFPST_F4    : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
+
+def R52WriteFPDIV_SP   : SchedWriteRes<[R52UnitFPDIV]> {
+  let Latency = 7;          // FP div takes fixed #cycles
+  let ResourceCycles = [7]; // is not pipelined
+ }
+def R52WriteFPDIV_DP   : SchedWriteRes<[R52UnitFPDIV]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific - map operands to SchedReadWrites
+
+def : InstRW<[WriteALU], (instrs COPY)>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
+      (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+      "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH", "t2UXTB16")>;
+
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS],
+      (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi",
+      "t2MOVi", "t2MOV_ga_dyn")>;
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1],
+      (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel")>;
+def : InstRW<[R52WriteLd,R52Read_ISS],
+      (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "SEL", "t2SEL")>;
+
+def : InstRW< [R52WriteALU_EX2, R52Read_ISS, R52Read_ISS],
+      (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+      "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+// Saturating arithmetic
+def : InstRW< [R52WriteALU_WRI, R52Read_EX1, R52Read_EX1],
+      (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+      "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+      "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+      "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+      "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+      "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX","t2ABS")>;
+
+// Parallel arithmetic
+def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
+      (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+      "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+      "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+      "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+
+// Flag setting.
+def : InstRW< [R52WriteALU_EX2, R52Read_EX1, R52Read_EX1],
+      (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+      "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+      "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+      "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+      "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+      "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+// Sum of Absolute Difference
+def : InstRW< [R52WriteALU_WRI, R52Read_ISS, R52Read_ISS, R52Read_ISS],
+      (instregex "USAD8", "t2USAD8", "tUSAD8","USADA8", "t2USADA8", "tUSADA8") >;
+
+// Integer Multiply
+def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS],
+      (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+      "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+      "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+      "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+// Multiply Accumulate
+// Even for 64-bit accumulation (or Long), the single MAC is used (not ALUs).
+// The store pipeline is used partly for 64-bit operations.
+def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
+      (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+      "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS", "t2SMMLSR",
+      "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX",
+      "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+      "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+      "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT",
+      "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX",
+      "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$",
+      "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+      "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+      "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB",
+      "t2SMLALBT", "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX",
+      "t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
+
+def : InstRW <[R52WriteDIV, R52Read_ISS, R52Read_ISS],
+      (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+// Loads (except POST) with SHL > 2, or ror, require 2 extra cycles.
+// However, that's non-trivial to specify, so we keep it uniform
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS],
+      (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+      "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "t2LDREX",
+      "tLDR[BH](r|i|spi|pci|pciASM)", "tLDR(r|i|spi|pci|pciASM)",
+      "LDRH$",  "PICLDR$", "PICLDR(H|B)$", "LDRcp$",
+      "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+      "t2LDRpci_pic", "tLDRS(B|H)", "t2LDRDi8", "LDRD$", "LDA", "t2LDA")>;
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_ISS],
+      (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+      "LDRBT_POST$", "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+      "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T",
+      "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+      "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T",
+      "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "MOVS?sr", "t2MOVS?sr")>;
+def : InstRW<[R52WriteALU_WRI, R52Read_EX2], (instregex "MOVT", "t2MOVT")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1], (instregex "AD(C|D)S?ri","ANDS?ri",
+      "BICS?ri", "CLZ", "EORri", "MVNS?r", "ORRri", "RSBS?ri", "RSCri", "SBCri",
+      "t2AD(C|D)S?ri", "t2ANDS?ri", "t2BICS?ri","t2CLZ", "t2EORri", "t2MVN",
+      "t2ORRri", "t2RSBS?ri", "t2SBCri")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_EX1], (instregex "AD(C|D)S?rr",
+      "ANDS?rr", "BICS?rr", "CRC*", "EORrr", "ORRrr", "RSBrr", "RSCrr", "SBCrr",
+      "t2AD(C|D)S?rr", "t2ANDS?rr", "t2BICS?rr", "t2CRC", "t2EORrr", "t2SBCrr")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS], (instregex "AD(C|D)S?rsi",
+      "ANDS?rsi", "BICS?rsi", "EORrsi", "ORRrsi", "RSBrsi", "RSCrsi", "SBCrsi",
+      "t2AD(|D)S?rsi", "t2ANDS?rsi", "t2BICS?rsi", "t2EORrsi", "t2ORRrsi", "t2RSBrsi", "t2SBCrsi")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS, R52Read_ISS],
+      (instregex "AD(C|D)S?rsr", "ANDS?rsr", "BICS?rsr", "EORrsr", "MVNS?sr",
+      "ORRrsrr", "RSBrsr", "RSCrsr", "SBCrsr")>;
+
+def : InstRW<[R52WriteALU_EX1],
+    (instregex "ADR", "MOVSi", "MOVSsi", "MOVST?i16*", "MVNS?s?i", "t2MOVS?si")>;
+
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS], (instregex "ASRi", "RORS?i")>;
+def : InstRW<[R52WriteALU_EX1, R52Read_ISS, R52Read_ISS],
+      (instregex "ASRr", "RORS?r", "LSR", "LSL")>;
+
+def : InstRW<[R52WriteCC, R52Read_EX1], (instregex "CMPri", "CMNri")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_EX1], (instregex "CMPrr", "CMNzrr")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_ISS], (instregex "CMPrsi", "CMNzrsi")>;
+def : InstRW<[R52WriteCC, R52Read_EX1, R52Read_ISS, R52Read_ISS], (instregex "CMPrsr", "CMNzrsr")>;
+
+def : InstRW<[R52WriteALU_EX2, R52Read_ISS],
+      (instregex "t2LDC", "RBIT", "REV", "REV16", "REVSH", "RRX")>;
+
+def : InstRW<[R52WriteCC, R52Read_ISS], (instregex "TST")>;
+
+def : InstRW<[R52WriteLd], (instregex "MRS", "MRSbanked")>;
+def : InstRW<[R52WriteLd, R52Read_EX1], (instregex "MSR", "MSRbanked")>;
+
+//def : InstRW<[R52WriteLd, R52Read_ISS], (instregex "^LDRB?(_PRE_IMM|_POST_IMM)", "LDRrs")>;
+//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_PRE_REG", "LDRB?rr")>;
+//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_POST_REG")>;
+
+//def : InstRW<[R52WriteST, R52Read_ISS], (instregex "STRi12", "PICSTR")>;
+//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_PRE_REG", "STRB?_PRE_REG")>;
+//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_POST_REG", "STRB?_POST_REG")>;
+
+
+// Integer Load, Multiple.
+foreach Lat = 3-25 in {
+  def R52WriteILDM#Lat#Cy : SchedWriteRes<[R52UnitLd]> {
+    let Latency = Lat;
+  }
+  def R52WriteILDM#Lat#CyNo : SchedWriteRes<[]> {
+    let Latency = Lat;
+    let NumMicroOps = 0;
+  }
+}
+foreach NAddr = 1-16 in {
+  def R52ILDMAddr#NAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NAddr>;
+}
+def R52WriteILDMAddrNoWB : SchedWriteRes<[R52UnitLd]> { let Latency = 0; }
+def R52WriteILDMAddrWB : SchedWriteRes<[R52UnitLd]>;
+def R52WriteILDM : SchedWriteVariant<[
+    SchedVar<R52ILDMAddr2Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy]>,
+
+    SchedVar<R52ILDMAddr3Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy]>,
+    SchedVar<R52ILDMAddr4Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy]>,
+
+    SchedVar<R52ILDMAddr5Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy]>,
+    SchedVar<R52ILDMAddr6Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy]>,
+
+    SchedVar<R52ILDMAddr7Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy]>,
+    SchedVar<R52ILDMAddr8Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy]>,
+
+    SchedVar<R52ILDMAddr9Pred, [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy]>,
+    SchedVar<R52ILDMAddr10Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy]>,
+
+    SchedVar<R52ILDMAddr11Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy]>,
+    SchedVar<R52ILDMAddr12Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy, R52WriteILDM15Cy]>,
+
+    SchedVar<R52ILDMAddr13Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy, R52WriteILDM15Cy,
+                                 R52WriteILDM16Cy]>,
+    SchedVar<R52ILDMAddr14Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy, R52WriteILDM15Cy,
+                                 R52WriteILDM16Cy, R52WriteILDM17Cy]>,
+
+    SchedVar<R52ILDMAddr15Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy, R52WriteILDM15Cy,
+                                 R52WriteILDM16Cy, R52WriteILDM17Cy,
+                                 R52WriteILDM18Cy]>,
+    SchedVar<R52ILDMAddr15Pred,[R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                 R52WriteILDM6Cy, R52WriteILDM7Cy,
+                                 R52WriteILDM8Cy, R52WriteILDM9Cy,
+                                 R52WriteILDM10Cy, R52WriteILDM11Cy,
+                                 R52WriteILDM12Cy, R52WriteILDM13Cy,
+                                 R52WriteILDM14Cy, R52WriteILDM15Cy,
+                                 R52WriteILDM16Cy, R52WriteILDM17Cy,
+                                 R52WriteILDM18Cy, R52WriteILDM19Cy]>,
+
+// Unknown number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [R52WriteILDM4Cy, R52WriteILDM5Cy,
+                                R52WriteILDM6CyNo, R52WriteILDM7CyNo,
+                                R52WriteILDM8CyNo, R52WriteILDM9CyNo,
+                                R52WriteILDM10CyNo, R52WriteILDM11CyNo,
+                                R52WriteILDM12CyNo, R52WriteILDM13CyNo,
+                                R52WriteILDM14CyNo, R52WriteILDM15CyNo,
+                                R52WriteILDM16CyNo, R52WriteILDM17CyNo,
+                                R52WriteILDM18Cy, R52WriteILDM19Cy]>
+]> { let Variadic=1; }
+
+// Integer Store, Multiple
+def R52WriteIStIncAddr : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+foreach NumAddr = 1-16 in {
+  def R52WriteISTM#NumAddr : WriteSequence<[R52WriteIStIncAddr], NumAddr>;
+}
+def R52WriteISTM : SchedWriteVariant<[
+    SchedVar<R52ILDMAddr2Pred, [R52WriteISTM2]>,
+    SchedVar<R52ILDMAddr3Pred, [R52WriteISTM3]>,
+    SchedVar<R52ILDMAddr4Pred, [R52WriteISTM4]>,
+    SchedVar<R52ILDMAddr5Pred, [R52WriteISTM5]>,
+    SchedVar<R52ILDMAddr6Pred, [R52WriteISTM6]>,
+    SchedVar<R52ILDMAddr7Pred, [R52WriteISTM7]>,
+    SchedVar<R52ILDMAddr8Pred, [R52WriteISTM8]>,
+    SchedVar<R52ILDMAddr9Pred, [R52WriteISTM9]>,
+    SchedVar<R52ILDMAddr10Pred,[R52WriteISTM10]>,
+    SchedVar<R52ILDMAddr11Pred,[R52WriteISTM11]>,
+    SchedVar<R52ILDMAddr12Pred,[R52WriteISTM12]>,
+    SchedVar<R52ILDMAddr13Pred,[R52WriteISTM13]>,
+    SchedVar<R52ILDMAddr14Pred,[R52WriteISTM14]>,
+    SchedVar<R52ILDMAddr15Pred,[R52WriteISTM15]>,
+    SchedVar<R52ILDMAddr16Pred,[R52WriteISTM16]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [R52WriteISTM2]>
+]>;
+
+def : InstRW<[R52WriteILDM, R52Read_ISS],
+      (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+      "(t|sys)LDM(IA|DA|DB|IB)$")>;
+def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
+      (instregex "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+def : InstRW<[R52WriteILDM, R52WriteAdr, R52Read_ISS],
+        (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+
+// Integer Store, Single Element
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
+      (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX", "SRS", "t2SRS",
+      "t2SRSDB", "t2STREX", "t2STREXB", "t2STREXD", "t2STREXH", "t2STR(i12|i8|s)$",
+      "RFE", "t2RFE", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+      (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+      "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+      "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+      "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+// Integer Store, Dual
+def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_EX2],
+    (instregex "STRD$", "t2STRDi8", "STL", "t2STRD$", "t2STL")>;
+def : InstRW<[R52WriteLd, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+    (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+def : InstRW<[R52WriteISTM, R52Read_ISS, R52Read_EX2],
+    (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+def : InstRW<[R52WriteISTM, R52WriteAdr, R52Read_ISS, R52Read_EX2],
+    (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+    "PUSH", "tPUSH")>;
+
+// LDRLIT pseudo instructions, they expand to LDR + PICADD
+def : InstRW<[R52WriteLd],
+      (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+// LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
+def : InstRW<[R52WriteLd], (instregex "LDRLIT_ga_pcrel_ldr")>;
+
+
+
+//===----------------------------------------------------------------------===//
+// VFP, Floating Point Support
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "VABD(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "VABD(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VABS(D|S|H)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VABS(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VABS(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VACGT)(fq|hq)")>;
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
+
+def : InstRW<[R52WriteFPDIV_SP, R52Read_F0, R52Read_F0], (instregex "VDIV(S|H)")>;
+def : InstRW<[R52WriteFPDIV_DP, R52Read_F0, R52Read_F0], (instregex "VDIVD")>;
+
+def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1],
+                                          (instregex "(VFMA|VFMS|VFNMA|VFNMS)(D|H|S)")>;
+
+def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
+def : InstRW<[R52WriteFPST_F4, R52Read_ISS, R52Read_F1], (instregex "VSTR")>;
+
+
+//===----------------------------------------------------------------------===//
+// Neon Support
+
+// vector multiple load stores
+foreach NumAddr = 1-16 in {
+  def R52LMAddrPred#NumAddr :
+    SchedPredicate<"MI->getNumOperands() == "#NumAddr>;
+}
+foreach Lat = 1-32 in {
+  def R52WriteLM#Lat#Cy : SchedWriteRes<[]> {
+    let Latency = Lat;
+  }
+}
+foreach Num = 1-32 in { // reserve LdSt resource, no dual-issue
+  def R52ReserveLd#Num#Cy : SchedWriteRes<[R52UnitLd]> {
+    let Latency = 0;
+    let NumMicroOps = Num;
+    let ResourceCycles = [Num];
+  }
+}
+def R52WriteVLDM : SchedWriteVariant<[
+  // 1 D reg
+  SchedVar<R52LMAddrPred1,  [R52WriteLM5Cy,
+                              R52ReserveLd5Cy]>,
+  SchedVar<R52LMAddrPred2,  [R52WriteLM5Cy,
+                              R52ReserveLd5Cy]>,
+
+  // 2 D reg
+  SchedVar<R52LMAddrPred3,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52ReserveLd6Cy]>,
+  SchedVar<R52LMAddrPred4,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52ReserveLd6Cy]>,
+
+  // 3 D reg
+  SchedVar<R52LMAddrPred5,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy,
+                              R52ReserveLd4Cy]>,
+  SchedVar<R52LMAddrPred6,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy,
+                              R52ReserveLd7Cy]>,
+
+  // 4 D reg
+  SchedVar<R52LMAddrPred7,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52ReserveLd8Cy]>,
+  SchedVar<R52LMAddrPred8,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52ReserveLd8Cy]>,
+
+  // 5 D reg
+  SchedVar<R52LMAddrPred9,  [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy,
+                              R52ReserveLd9Cy]>,
+  SchedVar<R52LMAddrPred10, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy,
+                              R52ReserveLd9Cy]>,
+
+  // 6 D reg
+  SchedVar<R52LMAddrPred11, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52ReserveLd10Cy]>,
+  SchedVar<R52LMAddrPred12, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52ReserveLd10Cy]>,
+
+  // 7 D reg
+  SchedVar<R52LMAddrPred13, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52WriteLM11Cy,
+                              R52ReserveLd11Cy]>,
+  SchedVar<R52LMAddrPred14, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52WriteLM11Cy,
+                              R52ReserveLd11Cy]>,
+
+  // 8 D reg
+  SchedVar<R52LMAddrPred14, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52WriteLM11Cy, R52WriteLM12Cy,
+                              R52ReserveLd12Cy]>,
+  SchedVar<R52LMAddrPred15, [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52WriteLM11Cy, R52WriteLM12Cy,
+                              R52ReserveLd12Cy]>,
+  // unknown number of reg.
+  SchedVar<NoSchedPred,      [R52WriteLM5Cy, R52WriteLM6Cy,
+                              R52WriteLM7Cy, R52WriteLM8Cy,
+                              R52WriteLM9Cy, R52WriteLM10Cy,
+                              R52WriteLM11Cy, R52WriteLM12Cy,
+                              R52ReserveLd5Cy]>
+]> { let Variadic=1;}
+
+// variable stores. Cannot dual-issue
+def R52WriteSTM5  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1];
+}
+def R52WriteSTM6  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2];
+}
+def R52WriteSTM7  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 7;
+  let NumMicroOps = 6;
+  let ResourceCycles = [3];
+}
+def R52WriteSTM8  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 8;
+  let NumMicroOps = 8;
+  let ResourceCycles = [4];
+}
+def R52WriteSTM9  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 9;
+  let NumMicroOps = 10;
+  let ResourceCycles = [5];
+}
+def R52WriteSTM10 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 10;
+  let NumMicroOps = 12;
+  let ResourceCycles = [6];
+}
+def R52WriteSTM11 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 11;
+  let NumMicroOps = 14;
+  let ResourceCycles = [7];
+}
+def R52WriteSTM12 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 12;
+  let NumMicroOps = 16;
+  let ResourceCycles = [8];
+}
+def R52WriteSTM13 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 13;
+  let NumMicroOps = 18;
+  let ResourceCycles = [9];
+}
+def R52WriteSTM14 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 14;
+  let NumMicroOps = 20;
+  let ResourceCycles = [10];
+}
+def R52WriteSTM15 : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 15;
+  let NumMicroOps = 22;
+  let ResourceCycles = [11];
+}
+
+def R52WriteSTM : SchedWriteVariant<[
+  SchedVar<R52LMAddrPred1, [R52WriteSTM5]>,
+  SchedVar<R52LMAddrPred2, [R52WriteSTM5]>,
+  SchedVar<R52LMAddrPred3, [R52WriteSTM6]>,
+  SchedVar<R52LMAddrPred4, [R52WriteSTM6]>,
+  SchedVar<R52LMAddrPred5, [R52WriteSTM7]>,
+  SchedVar<R52LMAddrPred6, [R52WriteSTM7]>,
+  SchedVar<R52LMAddrPred7, [R52WriteSTM8]>,
+  SchedVar<R52LMAddrPred8, [R52WriteSTM8]>,
+  SchedVar<R52LMAddrPred9,  [R52WriteSTM9]>,
+  SchedVar<R52LMAddrPred10, [R52WriteSTM9]>,
+  SchedVar<R52LMAddrPred11, [R52WriteSTM10]>,
+  SchedVar<R52LMAddrPred12, [R52WriteSTM10]>,
+  SchedVar<R52LMAddrPred13, [R52WriteSTM11]>,
+  SchedVar<R52LMAddrPred14, [R52WriteSTM11]>,
+  SchedVar<R52LMAddrPred15, [R52WriteSTM12]>,
+  SchedVar<R52LMAddrPred16, [R52WriteSTM12]>,
+  // unknown number of registers, just use resources for two
+  SchedVar<NoSchedPred,      [R52WriteSTM6]>
+]>;
+
+// Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
+// another instruction in slot-1, but only in the last issue.
+def R52WriteVLD1Mem  : SchedWriteRes<[R52UnitLd]> { let Latency = 5;}
+def R52WriteVLD2Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2];
+}
+def R52WriteVLD3Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+  let ResourceCycles = [3];
+}
+def R52WriteVLD4Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 8;
+  let NumMicroOps = 7;
+  let ResourceCycles = [4];
+}
+def R52WriteVST1Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def R52WriteVST2Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2];
+}
+def R52WriteVST3Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+  let ResourceCycles = [3];
+}
+def R52WriteVST4Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 8;
+  let NumMicroOps = 7;
+  let ResourceCycles = [4];
+}
+def R52WriteVST5Mem  : SchedWriteRes<[R52UnitLd]> {
+  let Latency = 9;
+  let NumMicroOps = 9;
+  let ResourceCycles = [5];
+}
+
+
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABA(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABA(u|s)(v16i8|v8i16|v4i32)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VABAL(u|s)(v8i16|v4i32|v2i64)")>;
+
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABD(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABD(u|s)(v16i8|v8i16|v4i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VABDL(u|s)(v16i8|v8i16|v4i32)")>;
+
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1], (instregex "VABS(v16i8|v8i16|v4i32)")>;
+
+def : InstRW<[R52WriteFPALU_F4, R52Read_F2, R52Read_F2],
+                               (instregex "(VADD|VSUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2],
+                                (instregex "(VADD|VSUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F2, R52Read_F2],
+                               (instregex "(VADDHN|VRADDHN|VSUBHN|VRSUBHN)(v8i8|v4i16|v2i32)")>;
+
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1],
+                                            (instregex "VADDL", "VADDW", "VSUBL", "VSUBW")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC|VEOR)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC|VEOR)q")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F2], (instregex "VBICi(v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>;
+
+def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
+      (instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
+      (instregex "VCVT", "VSITO", "VUITO", "VTO")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_ISS], (instregex "VDUP(8|16|32)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_ISS], (instregex "VDUP(8|16|32)q")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1], (instregex "VDUPLN(8|16|32)d")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1], (instregex "VDUPLN(8|16|32)q")>;
+
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VEXTd(8|16|32)", "VSEL")>;
+def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "VEXTq(8|16|32|64)")>;
+
+def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "(VFMA|VFMS)(f|h)d")>;
+def : InstRW<[R52Write2FPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "(VFMA|VFMS)(f|h)q")>;
+
+def : InstRW<[R52WriteFPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHSUB)(u|s)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHSUB)(u|s)(v16i8|v8i16|v4i32)")>;
+
+def : InstRW<[R52WriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VMAX", "VMIN", "VPMAX", "VPMIN")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VMOV", "VORR", "VORN", "VREV")>;
+def : InstRW<[R52WriteNoRSRC_WRI], (instregex "VMRS")>;
+def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VMUL", "VNMUL", "VMLA")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VNEG")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADDi")>;
+def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADAL", "VPADDL")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VQABS(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F1], (instregex "VQABS(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F2, R52Read_F2],
+                  (instregex "(VQADD|VQSUB)(u|s)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[R52Write2FPALU_F5, R52Read_F2, R52Read_F2],
+                  (instregex "(VQADD|VQSUB)(u|s)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[R52Write2FPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VQDMLAL", "VQDMLSL")>;
+def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VQDMUL","VQRDMUL")>;
+def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
+                 (instregex "VQMOVN", "VQNEG", "VQSHL", "VQSHRN")>;
+def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VRSHL", "VRSHR", "VRSHRN", "VTB")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+//---
+// VLDx. Vector Loads
+//---
+// 1-element structure load
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>;
+
+// 2-element structure load
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>;
+def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>;
+
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>;
+
+// 3-element structure load
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>;
+def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+
+// 4-element structure load
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+
+//---
+// VSTx. Vector Stores
+//---
+// 1-element structure store
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST1q(8|16|32|64)$")>;
+def : InstRW<[R52WriteVST3Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)T$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Q$")>;
+def : InstRW<[R52WriteVST3Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d64TPseudo$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST1d64QPseudo$")>;
+
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST1LNq(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1q(8|16|32|64)wb")>;
+def : InstRW<[R52WriteVST3Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Twb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d(8|16|32|64)Qwb")>;
+def : InstRW<[R52WriteVST3Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d64TPseudoWB")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1d64QPseudoWB")>;
+
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST1LNq(8|16|32)Pseudo_UPD")>;
+
+// 2-element structure store
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST2(d|b)(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST1Mem, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2(d|b)(8|16|32)wb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)wb")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2q(8|16|32)PseudoWB")>;
+
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNqWB_(fixed|register)_Asm_(16|32)")>;
+def : InstRW<[R52WriteVST1Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST2LNq(16|32)Pseudo_UPD")>;
+
+// 3-element structure store
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52Read_ISS, R52Read_F2], (instregex "VST3d(8|16|32)(oddP|P)seudo$")>;
+
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)_UPD$")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)WB_(fixed|register)_Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST4Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNd(8|16|32)Pseudo_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)_UPD$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNqWB_(fixed|register)_Asm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST3LNq(16|32)Pseudo_UPD$")>;
+
+// 4-element structure store
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)$")>;
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)Asm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST5Mem, R52Read_ISS, R52Read_F2], (instregex "VST4d(8|16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNdAsm_(8|16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)Pseudo$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNqAsm_(16|32)$")>;
+def : InstRW<[R52WriteVST2Mem, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)Pseudo$")>;
+
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST5Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNdWB_(fixed|register)_Asm_(8|16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNd(8|16|32)Pseudo_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)_UPD")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNqWB_(fixed|register)_Asm_(16|32)")>;
+def : InstRW<[R52WriteVST2Mem, R52WriteAdr, R52Read_ISS, R52Read_F2], (instregex "VST4LNq(16|32)Pseudo_UPD")>;
+
+} // R52 SchedModel
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 000000000000..ea2bf4b578f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1046 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor..
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS   : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+//        IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+//        to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+//        conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
+// Swift machine model for scheduling and other instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+  let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 45; // Based on NEON renamed registers.
+  let LoadLatency = 3;
+  let MispredictPenalty = 14; // A branch direction mispredict.
+  let CompleteModel = 0;      // FIXME: Remove if all instructions are covered.
+}
+
+// Swift predicates.
+def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>;
+
+// Swift resource mapping.
+let SchedModel = SwiftModel in {
+  // Processor resources.
+  def SwiftUnitP01 : ProcResource<2>; // ALU unit.
+  def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit.
+  def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit.
+  def SwiftUnitP2 : ProcResource<1>; // LS unit.
+  def SwiftUnitDiv : ProcResource<1>;
+
+  // Generic resource requirements.
+  def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>;
+  def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; }
+  def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; }
+  def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; }
+  def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 4;
+  }
+  def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 6;
+  }
+  def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>;
+  def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; }
+  def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; }
+  def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; }
+  def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; }
+  def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; }
+  def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>;
+  def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>;
+  def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
+  def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01,
+                                                      SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+    let ResourceCycles = [3];
+  }
+  // Plain load without writeback.
+  def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 3;
+  }
+  def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 4;
+  }
+  // A store does not write to a register.
+  def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 0;
+  }
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>;
+  }
+  def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle,
+                                                    SwiftWriteP01OneCycle,
+                                                    SwiftWriteP2ThreeCycle]>;
+  // 4.2.4 Arithmetic and Logical.
+  // ALU operation register shifted by immediate variant.
+  def SwiftWriteALUsi : SchedWriteVariant<[
+    // lsl #2, lsl #1, or lsr #1.
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>,
+    SchedVar<NoSchedPred,             [WriteALU]>
+  ]>;
+  def SwiftWriteALUsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftWriteALUSsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftReadAdvanceALUsr : SchedReadVariant<[
+    SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>,
+    SchedVar<NoSchedPred,      [NoReadAdvance]>
+  ]>;
+  // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR
+  // AND,BIC,EOR,ORN,ORR
+  // CLZ,RBIT,REV,REV16,REVSH,PKH
+  def : WriteRes<WriteALU, [SwiftUnitP01]>;
+  def : SchedAlias<WriteALUsi, SwiftWriteALUsi>;
+  def : SchedAlias<WriteALUsr, SwiftWriteALUsr>;
+  def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
+  def : ReadAdvance<ReadALU, 0>;
+  def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+
+
+  def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>,
+    SchedVar<NoSchedPred,             [SwiftWriteP01TwoCycle]>
+  ]>;
+
+  // 4.2.5 Integer comparison
+  def : WriteRes<WriteCMP, [SwiftUnitP01]>;
+  def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>;
+  def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>;
+
+  // 4.2.6 Shift, Move
+  // Shift
+  //  ASR,LSL,ROR,RRX
+  //  MOV(register-shiftedregister)  MVN(register-shiftedregister)
+  // Move
+  //  MOV,MVN
+  //  MOVT
+  // Sign/Zero extension
+  def : InstRW<[SwiftWriteP01OneCycle],
+               (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+                          "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH",
+                          "t2UXTB16")>;
+  // Pseudo instructions.
+  def : InstRW<[SwiftWriteP01OneCycle2x],
+        (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
+                   "t2MOVi32imm", "t2MOV_ga_dyn")>;
+  def : InstRW<[SwiftWriteP01OneCycle3x],
+        (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
+  def : InstRW<[SwiftWriteP01OneCycle2x_load],
+        (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+  def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+
+  def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+    SchedVar<NoSchedPred,     [ SwiftWriteP0OneCycle ]>
+  ]>;
+
+  // 4.2.7 Select
+  // SEL
+  def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>;
+
+  // 4.2.8 Bitfield
+  // BFI,BFC, SBFX,UBFX
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+        "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+  // 4.2.9 Saturating arithmetic
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+        "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+        "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+        "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+        "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+        "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
+
+  // 4.2.10 Parallel Arithmetic
+  // Not flag setting.
+  def : InstRW< [SwiftWriteALUsr],
+        (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+        "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+        "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+        "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+  // Flag setting.
+  def : InstRW< [SwiftWriteP01TwoCycle],
+       (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+       "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+       "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+       "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+       "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+       "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+  // 4.2.11 Sum of Absolute Difference
+  def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >;
+  def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>],
+        (instregex "USADA8")>;
+
+  // 4.2.12 Integer Multiply (32-bit result)
+  // Two sources.
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+        "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+        "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+        "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+  def SwiftWriteP0P01FiveCycleTwoUops :
+      SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]>  {
+    let Latency = 5;
+  }
+
+  def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+    SchedVar<NoSchedPred,      [ SwiftWriteP0FourCycle ]>
+  ]>;
+
+  def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+     SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+     SchedVar<NoSchedPred,      [ReadALU]>
+  ]>;
+
+  // Multiply accumulate, three sources
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+        "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+        "t2SMMLSR")>;
+
+  // 4.2.13 Integer Multiply (32-bit result, Q flag)
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+        "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+        "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle],
+        (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+  def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ResourceCycles = [2, 1];
+  }
+  def SwiftWrite1Cycle : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite5Cycle : SchedWriteRes<[]> {
+    let Latency = 5;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite6Cycle : SchedWriteRes<[]> {
+    let Latency = 6;
+    let NumMicroOps = 0;
+  }
+
+  // 4.2.14 Integer Multiply, Long
+  def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+        (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+  def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 7;
+    let NumMicroOps = 5;
+    let ResourceCycles = [2, 3];
+  }
+
+  // 4.2.15 Integer Multiply Accumulate, Long
+  // 4.2.16 Integer Multiply Accumulate, Dual
+  // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+  // We are being a bit inaccurate here.
+  def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+                 SchedReadAdvance<4>, SchedReadAdvance<3>],
+        (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+        "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+        "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+        "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+        "t2UMAAL")>;
+
+  def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 14;
+    let ResourceCycles = [1, 14];
+  }
+  // 4.2.18 Integer Divide
+  def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+  def : InstRW <[SwiftDiv],
+        (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+  // 4.2.19 Integer Load Single Element
+  // 4.2.20 Integer Load Signextended
+  def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
+                                                   SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 3;
+  }
+  def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
+                                                   SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+  }
+  def SwiftWrBackOne : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWriteLdFour : SchedWriteRes<[]> {
+    let Latency = 4;
+    let NumMicroOps = 0;
+  }
+   // Not accurate.
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+        "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)",
+        "tLDR(r|i|spi|pci|pciASM)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDRH$",  "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
+  def : InstRW<[SwiftWriteP2P01FourCyle],
+        (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+        "t2LDRpci_pic", "tLDRS(B|H)")>;
+  def : InstRW<[SwiftWriteP2P01ThreeCycle,  SwiftWrBackOne],
+        (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+        "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+        "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
+  def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
+        (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+        "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+
+  // 4.2.21 Integer Dual Load
+  // Not accurate.
+  def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour],
+        (instregex "t2LDRDi8", "LDRD$")>;
+  def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne],
+        (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+  // 4.2.22 Integer Load, Multiple
+  // NumReg = 1 .. 16
+  foreach Lat = 3-25 in {
+    def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = Lat;
+    }
+    def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+      let Latency = Lat;
+      let NumMicroOps = 0;
+    }
+  }
+  // Predicate.
+  foreach NumAddr = 1-16 in {
+    def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == "#NumAddr>;
+  }
+  def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+  def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+  def SwiftWriteLM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+                                SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+                                SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+  ]> { let Variadic=1; }
+
+  def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+        (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+        "(t|sys)LDM(IA|DA|DB|IB)$")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+        (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+        "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+        (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+  // 4.2.23 Integer Store, Single Element
+  def : InstRW<[SwiftWriteP2],
+        (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+        "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+        (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+        "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+        "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+        "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+  // 4.2.24 Integer Store, Dual
+  def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+        (instregex "STRD$", "t2STRDi8")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+                SwiftWriteP01OneCycle],
+        (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+  // 4.2.25 Integer Store, Multiple
+  def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 0;
+    let NumMicroOps = 2;
+  }
+  foreach NumAddr = 1-16 in {
+     def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+  }
+  def SwiftWriteSTM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteSTM2]>
+  ]>;
+  def : InstRW<[SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+        "PUSH", "tPUSH")>;
+
+  // LDRLIT pseudo instructions, they expand to LDR + PICADD
+  def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU],
+        (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>;
+  // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR
+  def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle],
+        (instregex "LDRLIT_ga_pcrel_ldr")>;
+
+  // 4.2.26 Branch
+  def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
+  def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
+  def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; }
+
+  // 4.2.27 Not issued
+  def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+  def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+
+  // 4.2.28 Advanced SIMD, Integer, 2 cycle
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
+                   "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
+                   "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
+                   "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+                   "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+
+  def : InstRW<[SwiftWriteP1TwoCycle],
+        (instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
+
+  // 4.2.29 Advanced SIMD, Integer, 4 cycle
+  // 4.2.30 Advanced SIMD, Integer with Accumulate
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
+        "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+        "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
+        "VQSUB")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VRECPE", "VRSQRTE")>;
+
+  // 4.2.31 Advanced SIMD, Add and Shift with Narrow
+  def : InstRW<[SwiftWriteP0P1FourCycle],
+        (instregex "VADDHN", "VSUBHN", "VSHRN")>;
+  def : InstRW<[SwiftWriteP0P1SixCycle],
+        (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN",
+                   "VQRSHRN", "VQRSHRUN")>;
+
+  // 4.2.32 Advanced SIMD, Vector Table Lookup
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>;
+  }
+  def : InstRW<[SwiftWrite1xP1TwoCycle],
+        (instregex "VTB(L|X)1")>;
+  def : InstRW<[SwiftWrite2xP1TwoCycle],
+        (instregex "VTB(L|X)2")>;
+  def : InstRW<[SwiftWrite3xP1TwoCycle],
+        (instregex "VTB(L|X)3")>;
+  def : InstRW<[SwiftWrite4xP1TwoCycle],
+        (instregex "VTB(L|X)4")>;
+
+  // 4.2.33 Advanced SIMD, Transpose
+  def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle,
+                SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>],
+        (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+  // 4.2.34 Advanced SIMD and VFP, Floating Point
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX",
+                   "VPMIN")>;
+  def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>;
+
+  // 4.2.35 Advanced SIMD and VFP, Multiply
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
+                   "VMULL", "VQDMULL")>;
+  def : InstRW<[SwiftWriteP1SixCycle],
+        (instregex "VMULD", "VNMULD")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
+        "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>;
+  def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>;
+
+  // 4.2.36 Advanced SIMD and VFP, Convert
+  def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
+  // Fixpoint conversions.
+  def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
+
+  // 4.2.37 Advanced SIMD and VFP, Move
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
+                   "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+                   "FCONST(D|S)")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VQMOVN")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VDUP(8|16|32)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>],
+        (instregex "VMOVSR$", "VSETLN")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle],
+        (instregex "VMOVRR(D|S)$")>;
+  def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>,
+                WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle,
+                               SwiftWriteP1TwoCycle]>],
+                (instregex "VMOVSRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>],
+        (instregex "VGETLN(u|i)")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle,
+                               SwiftWriteP01OneCycle]>],
+        (instregex "VGETLNs")>;
+
+  // 4.2.38 Advanced SIMD and VFP, Move FPSCR
+  // Serializing instructions.
+  def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMRS")>;
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMSR")>;
+  // Not serializing.
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>;
+
+  // 4.2.39 Advanced SIMD and VFP, Load Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>;
+
+  // 4.2.40 Advanced SIMD and VFP, Store Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>;
+
+  // 4.2.41 Advanced SIMD and VFP, Load Multiple
+  // 4.2.42 Advanced SIMD and VFP, Store Multiple
+
+  // Resource requirement for permuting, just reserves the resources.
+  foreach Num = 1-28 in {
+    def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+
+  // Pre RA pseudos - load/store to a Q register as a D register pair.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>;
+
+  // Post RA not modelled accurately. We assume that register use of width 64
+  // bit maps to a D register, 128 maps to a Q register. Not all different kinds
+  // are accurately represented.
+  def SwiftWriteVLDM : SchedWriteVariant<[
+    // Load of one S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+    // Load of one D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+    // Load of 3 S register.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm3]>,
+    // Load of a Q register (not necessarily true). We should not be mapping to
+    // 4 S registers, either.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+                                SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+    // Load of 5 S registers.
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo,  SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm5]>,
+    // Load of 3 D registers. (Must also be able to handle s register list -
+    // though, not accurate)
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Load of 7 S registers.
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm7]>,
+    // Load of two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>,
+    // Load of 9 S registers.
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 5 D registers.
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of three Q registers.
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 7 D registers inaccurate.
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 4 Q registers.
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+        (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+  def SwiftWriteVSTM : SchedWriteVariant<[
+    // One S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+    // One D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+    // Three S registers.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+    // Assume one Q register.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+    // Assume three D registers.
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+    // Assume two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+    // Assume 5 D registers.
+    SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+    // Assume three Q registers.
+    SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+    // Assume 7 D registers.
+    SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+    // Assume four Q registers.
+    SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+    // Asumme two Q registers.
+    SchedVar<NoSchedPred, [SwiftWriteSTM3]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM],
+        (instregex "VSTM[SD](IA|DB)_UPD")>;
+
+  // 4.2.43 Advanced SIMD, Element or Structure Load and Store
+  def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [2];
+  }
+  def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [3];
+  }
+  foreach Num = 1-2 in {
+    def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+  // VLDx
+  // Multiple structures.
+  // Single element structure loads.
+  // We assume aligned.
+  // Single/two register.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+  // Three register.
+  def : InstRW<[SwiftWrite3xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>;
+  def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Twb")>;
+  /// Four Register.
+  def : InstRW<[SwiftWrite2xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>;
+  def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Qwb")>;
+  // Two element structure loads.
+  // Two/four register.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+  // Four element structure loads.
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM11Cy,  SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+  // Single all/lane loads.
+  // One element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
+                  "VLD1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+                   "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+                SwiftVLDMPerm3],
+        (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+                   "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+                SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+                SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element struture.
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+        (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+                   "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+                SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+  // VSTx
+  // Multiple structures.
+  // Single element structure store.
+  def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+  def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+        (instregex "VST1d(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+        (instregex "VST1q(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+  def : InstRW<[SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+  // Two element structure store.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(d|b)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(b|d)(8|16|32)wb")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+  // Three element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)_UPD",
+                   "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Four element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4],
+        (instregex "VST4(d|q)(8|16|32)_UPD",
+                   "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Single/all lane store.
+  // One element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)_UPD",
+                   "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)_UPD",
+                   "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element structure.
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)_UPD",
+                   "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+  // 4.2.44 VFP, Divide and Square Root
+  def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 17;
+    let ResourceCycles = [1, 15];
+  }
+  def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 32;
+    let ResourceCycles = [1, 30];
+  }
+  def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
+  def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+
+  // Not specified.
+  def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
+  // Preload.
+  def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
+    let ResourceCycles = [0];
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
new file mode 100644
index 000000000000..57d0bfb65049
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
@@ -0,0 +1,300 @@
+//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v6 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Model based on ARM1176
+//
+// Functional Units
+def V6_Pipe : FuncUnit; // pipeline
+
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual"
+//
+def ARMV6Itineraries : ProcessorItineraries<
+  [V6_Pipe], [], [
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [V6_Pipe]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iALUr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iBITr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr    , [InstrStage<1, [V6_Pipe]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr  , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iCMPr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iTSTr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMOVsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMOVsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iMOVix2  , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi   , [InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iCMOVr   , [InstrStage<1, [V6_Pipe]>], [3, 2]>,
+  InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [V6_Pipe]>], [3, 1]>,
+  InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [4]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMVNr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMVNsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMVNsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>,
+
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iLoad_si   , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu,   [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+
+  //
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [3, 1]>,
+
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop     , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r   , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si   , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru,   [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu,   [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m  , [InstrStage<3, [V6_Pipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [V6_Pipe]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // FP Load Multiple
+  InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // Double-precision FP Store
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
+]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..3b99762f7157
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -0,0 +1,261 @@
+//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-selectiondag-info"
+
+// Emit, if possible, a specialized version of the given Libcall. Typically this
+// means selecting the appropriately aligned version, but we also convert memset
+// of 0 into memclr.
+SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
+
+  // Only use a specialized AEABI function if the default version of this
+  // Libcall is an AEABI function.
+  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
+    return SDValue();
+
+  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
+  // able to translate memset to memclr and use the value to index the function
+  // name array.
+  enum {
+    AEABI_MEMCPY = 0,
+    AEABI_MEMMOVE,
+    AEABI_MEMSET,
+    AEABI_MEMCLR
+  } AEABILibcall;
+  switch (LC) {
+  case RTLIB::MEMCPY:
+    AEABILibcall = AEABI_MEMCPY;
+    break;
+  case RTLIB::MEMMOVE:
+    AEABILibcall = AEABI_MEMMOVE;
+    break;
+  case RTLIB::MEMSET: 
+    AEABILibcall = AEABI_MEMSET;
+    if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
+      if (ConstantSrc->getZExtValue() == 0)
+        AEABILibcall = AEABI_MEMCLR;
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Choose the most-aligned libcall variant that we can
+  enum {
+    ALIGN1 = 0,
+    ALIGN4,
+    ALIGN8
+  } AlignVariant;
+  if ((Align & 7) == 0)
+    AlignVariant = ALIGN8;
+  else if ((Align & 3) == 0)
+    AlignVariant = ALIGN4;
+  else
+    AlignVariant = ALIGN1;
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Args.push_back(Entry);
+  if (AEABILibcall == AEABI_MEMCLR) {
+    Entry.Node = Size;
+    Args.push_back(Entry);
+  } else if (AEABILibcall == AEABI_MEMSET) {
+    // Adjust parameters for memset, EABI uses format (ptr, size, value),
+    // GNU library uses (ptr, value, size)
+    // See RTABI section 4.3.4
+    Entry.Node = Size;
+    Args.push_back(Entry);
+
+    // Extend or truncate the argument to be an i32 value for the call.
+    if (Src.getValueType().bitsGT(MVT::i32))
+      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+    else if (Src.getValueType().bitsLT(MVT::i32))
+      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+    Entry.Node = Src; 
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.isSExt = false;
+    Args.push_back(Entry);
+  } else {
+    Entry.Node = Src;
+    Args.push_back(Entry);
+    
+    Entry.Node = Size;
+    Args.push_back(Entry);
+  }
+
+  char const *FunctionNames[4][3] = {
+    { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
+    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
+    { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
+    { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
+  };
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(Chain)
+      .setCallee(
+           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+                                 TLI->getPointerTy(DAG.getDataLayout())),
+           std::move(Args))
+      .setDiscardResult();
+  std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+  
+  return CallResult.second;
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+  // Do repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                  RTLIB::MEMCPY);
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                  RTLIB::MEMCPY);
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
+  SDValue TFOps[6];
+  SDValue Loads[6];
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
+  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
+  // pressure on the general purpose registers. However this seems harder to map
+  // onto the register allocator's view of the world.
+
+  // The number of MEMCPY pseudo-instructions to emit. We use up to
+  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
+  // later on. This is a lower bound on the number of MEMCPY operations we must
+  // emit.
+  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
+
+  // Code size optimisation: do not inline memcpy if expansion results in
+  // more instructions than the libary call.
+  if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction()->optForMinSize()) {
+    return SDValue();
+  }
+
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
+
+  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
+    // Evenly distribute registers among MEMCPY operations to reduce register
+    // pressure.
+    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
+    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
+
+    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
+                      DAG.getConstant(NumRegs, dl, MVT::i32));
+    Src = Dst.getValue(1);
+    Chain = Dst.getValue(2);
+
+    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
+    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
+
+    EmittedNumMemOps = NextEmittedNumMemOps;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
+                           SrcPtrInfo.getWithOffset(SrcOff));
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      makeArrayRef(TFOps, i));
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                        DAG.getConstant(DstOff, dl, MVT::i32)),
+                            DstPtrInfo.getWithOffset(DstOff));
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     makeArrayRef(TFOps, i));
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                RTLIB::MEMMOVE);
+}
+
+SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                RTLIB::MEMSET);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
new file mode 100644
index 000000000000..2ddb42c95397
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -0,0 +1,69 @@
+//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
+
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+namespace ARM_AM {
+  static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) {
+    switch (Opcode) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+}  // end namespace ARM_AM
+
+class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  SDValue
+  EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+                           SDValue Dst, SDValue Src, SDValue Size,
+                           unsigned Align, bool isVolatile,
+                           MachinePointerInfo DstPtrInfo,
+                           MachinePointerInfo SrcPtrInfo) const override;
+
+  // Adjust parameters for memset, see RTABI section 4.3.4
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align, bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
+  SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
+                                 SDValue Chain, SDValue Dst, SDValue Src,
+                                 SDValue Size, unsigned Align,
+                                 RTLIB::Libcall LC) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 000000000000..e2df0bddd0d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,382 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/TargetParser.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "ARMGenSubtargetInfo.inc"
+
+static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+               cl::init(true), cl::Hidden);
+
+enum ITMode {
+  DefaultIT,
+  RestrictedIT,
+  NoRestrictedIT
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
+   cl::ZeroOrMore,
+   cl::values(clEnumValN(DefaultIT, "arm-default-it",
+                         "Generate IT block based on arch"),
+              clEnumValN(RestrictedIT, "arm-restrict-it",
+                         "Disallow deprecated IT based on ARMv8"),
+              clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+                         "Allow IT blocks based on ARMv7")));
+
+/// ForceFastISel - Use the fast-isel, even for subtargets where it is not
+/// currently supported (for testing only).
+static cl::opt<bool>
+ForceFastISel("arm-force-fast-isel",
+               cl::init(false), cl::Hidden);
+
+/// initializeSubtargetDependencies - Initializes using a CPU and feature string
+/// so that we can use initializer lists for subtarget initialization.
+ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  initSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+/// EnableExecuteOnly - Enables the generation of execute-only code on supported
+/// targets
+static cl::opt<bool>
+EnableExecuteOnly("arm-execute-only");
+
+ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
+                                                        StringRef FS) {
+  ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS);
+  if (STI.isThumb1Only())
+    return (ARMFrameLowering *)new Thumb1FrameLowering(STI);
+
+  return new ARMFrameLowering(STI);
+}
+
+ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
+                           const std::string &FS,
+                           const ARMBaseTargetMachine &TM, bool IsLittle)
+    : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
+      GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle),
+      TargetTriple(TT), Options(TM.Options), TM(TM),
+      FrameLowering(initializeFrameLowering(CPU, FS)),
+      // At this point initializeSubtargetDependencies has been called so
+      // we can query directly.
+      InstrInfo(isThumb1Only()
+                    ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
+                    : !isThumb()
+                          ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
+                          : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
+      TLInfo(TM, *this), GISel() {}
+
+const CallLowering *ARMSubtarget::getCallLowering() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getCallLowering();
+}
+
+const InstructionSelector *ARMSubtarget::getInstructionSelector() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getInstructionSelector();
+}
+
+const LegalizerInfo *ARMSubtarget::getLegalizerInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getLegalizerInfo();
+}
+
+const RegisterBankInfo *ARMSubtarget::getRegBankInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getRegBankInfo();
+}
+
+bool ARMSubtarget::isXRaySupported() const {
+  // We don't currently suppport Thumb, but Windows requires Thumb.
+  return hasV6Ops() && hasARMOps() && !isTargetWindows();
+}
+
+void ARMSubtarget::initializeEnvironment() {
+  // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this
+  // directly from it, but we can try to make sure they're consistent when both
+  // available.
+  UseSjLjEH = isTargetDarwin() && !isTargetWatchABI();
+  assert((!TM.getMCAsmInfo() ||
+          (TM.getMCAsmInfo()->getExceptionHandlingType() ==
+           ExceptionHandling::SjLj) == UseSjLjEH) &&
+         "inconsistent sjlj choice between CodeGen and MC");
+}
+
+void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+  if (CPUString.empty()) {
+    CPUString = "generic";
+
+    if (isTargetDarwin()) {
+      StringRef ArchName = TargetTriple.getArchName();
+      unsigned ArchKind = llvm::ARM::parseArch(ArchName);
+      if (ArchKind == llvm::ARM::AK_ARMV7S)
+        // Default to the Swift CPU when targeting armv7s/thumbv7s.
+        CPUString = "swift";
+      else if (ArchKind == llvm::ARM::AK_ARMV7K)
+        // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
+        // ARMv7k does not use SjLj exception handling.
+        CPUString = "cortex-a7";
+    }
+  }
+
+  // Insert the architecture feature derived from the target triple into the
+  // feature string. This is important for setting features that are implied
+  // based on the architecture version.
+  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple, CPUString);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = (Twine(ArchFS) + "," + FS).str();
+    else
+      ArchFS = FS;
+  }
+  ParseSubtargetFeatures(CPUString, ArchFS);
+
+  // FIXME: This used enable V6T2 support implicitly for Thumb2 mode.
+  // Assert this for now to make the change obvious.
+  assert(hasV6T2Ops() || !hasThumb2());
+
+  // Execute only support requires movt support
+  if (genExecuteOnly())
+    assert(hasV8MBaselineOps() && !NoMovt && "Cannot generate execute-only code for this target");
+
+  // Keep a pointer to static instruction cost data for the specified CPU.
+  SchedModel = getSchedModelForCPU(CPUString);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // FIXME: this is invalid for WindowsCE
+  if (isTargetWindows())
+    NoARM = true;
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+  if (isTargetNaCl() || isAAPCS16_ABI())
+    stackAlignment = 16;
+
+  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
+  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+  // support in the assembler and linker to be used. This would need to be
+  // fixed to fully support tail calls in Thumb1.
+  //
+  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+  // LR.  This means if we need to reload LR, it takes an extra instructions,
+  // which outweighs the value of the tail call; but here we don't know yet
+  // whether LR is going to be used.  Probably the right approach is to
+  // generate the tail call here and turn it back into CALL/RET in
+  // emitEpilogue if LR is used.
+
+  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+  // but we need to make sure there are enough registers; the only valid
+  // registers are the 4 used for parameters.  We don't currently do this
+  // case.
+
+  SupportsTailCall = !isThumb() || hasV8MBaselineOps();
+
+  if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
+    SupportsTailCall = false;
+
+  switch (IT) {
+  case DefaultIT:
+    RestrictIT = hasV8Ops();
+    break;
+  case RestrictedIT:
+    RestrictIT = true;
+    break;
+  case NoRestrictedIT:
+    RestrictIT = false;
+    break;
+  }
+
+  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+  const FeatureBitset &Bits = getFeatureBits();
+  if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
+      (Options.UnsafeFPMath || isTargetDarwin()))
+    UseNEONForSinglePrecisionFP = true;
+
+  if (isRWPI())
+    ReserveR9 = true;
+
+  // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+  switch (ARMProcFamily) {
+  case Others:
+  case CortexA5:
+    break;
+  case CortexA7:
+    LdStMultipleTiming = DoubleIssue;
+    break;
+  case CortexA8:
+    LdStMultipleTiming = DoubleIssue;
+    break;
+  case CortexA9:
+    LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  case CortexA12:
+    break;
+  case CortexA15:
+    MaxInterleaveFactor = 2;
+    PreISelOperandLatencyAdjustment = 1;
+    PartialUpdateClearance = 12;
+    break;
+  case CortexA17:
+  case CortexA32:
+  case CortexA35:
+  case CortexA53:
+  case CortexA57:
+  case CortexA72:
+  case CortexA73:
+  case CortexR4:
+  case CortexR4F:
+  case CortexR5:
+  case CortexR7:
+  case CortexM3:
+  case ExynosM1:
+  case CortexR52:
+    break;
+  case Krait:
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  case Swift:
+    MaxInterleaveFactor = 2;
+    LdStMultipleTiming = SingleIssuePlusExtras;
+    PreISelOperandLatencyAdjustment = 1;
+    PartialUpdateClearance = 12;
+    break;
+  }
+}
+
+bool ARMSubtarget::isAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_APCS;
+}
+bool ARMSubtarget::isAAPCS_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS ||
+         TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+bool ARMSubtarget::isAAPCS16_ABI() const {
+  assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN);
+  return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+}
+
+bool ARMSubtarget::isROPI() const {
+  return TM.getRelocationModel() == Reloc::ROPI ||
+         TM.getRelocationModel() == Reloc::ROPI_RWPI;
+}
+bool ARMSubtarget::isRWPI() const {
+  return TM.getRelocationModel() == Reloc::RWPI ||
+         TM.getRelocationModel() == Reloc::ROPI_RWPI;
+}
+
+bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
+  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+    return true;
+
+  // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+  // the section that is being relocated. This means we have to use o load even
+  // for GVs that are known to be local to the dso.
+  if (isTargetMachO() && TM.isPositionIndependent() &&
+      (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
+    return true;
+
+  return false;
+}
+
+unsigned ARMSubtarget::getMispredictionPenalty() const {
+  return SchedModel.MispredictPenalty;
+}
+
+bool ARMSubtarget::hasSinCos() const {
+  return isTargetWatchOS() ||
+    (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
+}
+
+bool ARMSubtarget::enableMachineScheduler() const {
+  // Enable the MachineScheduler before register allocation for out-of-order
+  // architectures where we do not use the PostRA scheduler anymore (for now
+  // restricted to swift).
+  return getSchedModel().isOutOfOrder() && isSwift();
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool ARMSubtarget::enablePostRAScheduler() const {
+  // No need for PostRA scheduling on out of order CPUs (for now restricted to
+  // swift).
+  if (getSchedModel().isOutOfOrder() && isSwift())
+    return false;
+  return (!isThumb() || hasThumb2());
+}
+
+bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier(); }
+
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+  // For general targets, the prologue can grow when VFPs are allocated with
+  // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
+  // format which it's more important to get right.
+  return isTargetWatchABI() || (isSwift() && !MF.getFunction()->optForMinSize());
+}
+
+bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
+  // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
+  // immediates as it is inherently position independent, and may be out of
+  // range otherwise.
+  return !NoMovt && hasV8MBaselineOps() &&
+         (isTargetWindows() || !MF.getFunction()->optForMinSize() || genExecuteOnly());
+}
+
+bool ARMSubtarget::useFastISel() const {
+  // Enable fast-isel for any target, for testing only.
+  if (ForceFastISel)
+    return true;
+
+  // Limit fast-isel to the targets that are or have been tested.
+  if (!hasV6Ops())
+    return false;
+
+  // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
+  return TM.Options.EnableFastISel &&
+         ((isTargetMachO() && !isThumb1Only()) ||
+          (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 000000000000..8c8218d0f432
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,661 @@
+//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARMGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetOptions;
+class ARMBaseTargetMachine;
+
+class ARMSubtarget : public ARMGenSubtargetInfo {
+protected:
+  enum ARMProcFamilyEnum {
+    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
+    CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexR52, CortexM3,
+    CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
+    Krait, Swift, ExynosM1
+  };
+  enum ARMProcClassEnum {
+    None, AClass, RClass, MClass
+  };
+  enum ARMArchEnum {
+    ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
+    ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
+    ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline,
+    ARMv8r
+  };
+
+public:
+  /// What kind of timing do load multiple/store multiple instructions have.
+  enum ARMLdStMultipleTiming {
+    /// Can load/store 2 registers/cycle.
+    DoubleIssue,
+    /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+    /// is not 64-bit aligned.
+    DoubleIssueCheckUnalignedAccess,
+    /// Can load/store 1 register/cycle.
+    SingleIssue,
+    /// Can load/store 1 register/cycle, but needs an extra cycle for address
+    /// computation and potentially also for register writeback.
+    SingleIssuePlusExtras,
+  };
+
+protected:
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily = Others;
+
+  /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+  ARMProcClassEnum ARMProcClass = None;
+
+  /// ARMArch - ARM architecture
+  ARMArchEnum ARMArch = ARMv4t;
+
+  /// HasV4TOps, HasV5TOps, HasV5TEOps,
+  /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
+  /// Specify whether target support specific ARM ISA variants.
+  bool HasV4TOps = false;
+  bool HasV5TOps = false;
+  bool HasV5TEOps = false;
+  bool HasV6Ops = false;
+  bool HasV6MOps = false;
+  bool HasV6KOps = false;
+  bool HasV6T2Ops = false;
+  bool HasV7Ops = false;
+  bool HasV8Ops = false;
+  bool HasV8_1aOps = false;
+  bool HasV8_2aOps = false;
+  bool HasV8MBaselineOps = false;
+  bool HasV8MMainlineOps = false;
+
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
+  /// floating point ISAs are supported.
+  bool HasVFPv2 = false;
+  bool HasVFPv3 = false;
+  bool HasVFPv4 = false;
+  bool HasFPARMv8 = false;
+  bool HasNEON = false;
+
+  /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+  /// specified. Use the method useNEONForSinglePrecisionFP() to
+  /// determine if NEON should actually be used.
+  bool UseNEONForSinglePrecisionFP = false;
+
+  /// UseMulOps - True if non-microcoded fused integer multiply-add and
+  /// multiply-subtract instructions should be used.
+  bool UseMulOps = false;
+
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVMLx = false;
+
+  /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+  /// forwarding to allow mul + mla being issued back to back.
+  bool HasVMLxForwarding = false;
+
+  /// SlowFPBrcc - True if floating point compare + branch is slow.
+  bool SlowFPBrcc = false;
+
+  /// InThumbMode - True if compiling for Thumb, false for ARM.
+  bool InThumbMode = false;
+
+  /// UseSoftFloat - True if we're using software floating point features.
+  bool UseSoftFloat = false;
+
+  /// HasThumb2 - True if Thumb2 instructions are supported.
+  bool HasThumb2 = false;
+
+  /// NoARM - True if subtarget does not support ARM mode execution.
+  bool NoARM = false;
+
+  /// ReserveR9 - True if R9 is not available as a general purpose register.
+  bool ReserveR9 = false;
+
+  /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of
+  /// 32-bit imms (including global addresses).
+  bool NoMovt = false;
+
+  /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+  /// must be able to synthesize call stubs for interworking between ARM and
+  /// Thumb.
+  bool SupportsTailCall = false;
+
+  /// HasFP16 - True if subtarget supports half-precision FP conversions
+  bool HasFP16 = false;
+
+  /// HasFullFP16 - True if subtarget supports half-precision FP operations
+  bool HasFullFP16 = false;
+
+  /// HasD16 - True if subtarget is limited to 16 double precision
+  /// FP registers for VFPv3.
+  bool HasD16 = false;
+
+  /// HasHardwareDivide - True if subtarget supports [su]div
+  bool HasHardwareDivide = false;
+
+  /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+  bool HasHardwareDivideInARM = false;
+
+  /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
+  /// instructions.
+  bool HasT2ExtractPack = false;
+
+  /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
+  /// instructions.
+  bool HasDataBarrier = false;
+
+  /// HasV7Clrex - True if the subtarget supports CLREX instructions
+  bool HasV7Clrex = false;
+
+  /// HasAcquireRelease - True if the subtarget supports v8 atomics (LDA/LDAEX etc)
+  /// instructions
+  bool HasAcquireRelease = false;
+
+  /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
+  /// over 16-bit ones.
+  bool Pref32BitThumb = false;
+
+  /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+  /// that partially update CPSR and add false dependency on the previous
+  /// CPSR setting instruction.
+  bool AvoidCPSRPartialUpdate = false;
+
+  /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
+  /// movs with shifter operand (i.e. asr, lsl, lsr).
+  bool AvoidMOVsShifterOperand = false;
+
+  /// HasRetAddrStack - Some processors perform return stack prediction. CodeGen should
+  /// avoid issue "normal" call instructions to callees which do not return.
+  bool HasRetAddrStack = false;
+
+  /// HasMPExtension - True if the subtarget supports Multiprocessing
+  /// extension (ARMv7 only).
+  bool HasMPExtension = false;
+
+  /// HasVirtualization - True if the subtarget supports the Virtualization
+  /// extension.
+  bool HasVirtualization = false;
+
+  /// FPOnlySP - If true, the floating point unit only supports single
+  /// precision.
+  bool FPOnlySP = false;
+
+  /// If true, the processor supports the Performance Monitor Extensions. These
+  /// include a generic cycle-counter as well as more fine-grained (often
+  /// implementation-specific) events.
+  bool HasPerfMon = false;
+
+  /// HasTrustZone - if true, processor supports TrustZone security extensions
+  bool HasTrustZone = false;
+
+  /// Has8MSecExt - if true, processor supports ARMv8-M Security Extensions
+  bool Has8MSecExt = false;
+
+  /// HasCrypto - if true, processor supports Cryptography extensions
+  bool HasCrypto = false;
+
+  /// HasCRC - if true, processor supports CRC instructions
+  bool HasCRC = false;
+
+  /// HasRAS - if true, the processor supports RAS extensions
+  bool HasRAS = false;
+
+  /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+  /// particularly effective at zeroing a VFP register.
+  bool HasZeroCycleZeroing = false;
+
+  /// HasFPAO - if true, processor  does positive address offset computation faster
+  bool HasFPAO = false;
+
+  /// If true, if conversion may decide to leave some instructions unpredicated.
+  bool IsProfitableToUnpredicate = false;
+
+  /// If true, VMOV will be favored over VGETLNi32.
+  bool HasSlowVGETLNi32 = false;
+
+  /// If true, VMOV will be favored over VDUP.
+  bool HasSlowVDUP32 = false;
+
+  /// If true, VMOVSR will be favored over VMOVDRR.
+  bool PreferVMOVSR = false;
+
+  /// If true, ISHST barriers will be used for Release semantics.
+  bool PreferISHST = false;
+
+  /// If true, a VLDM/VSTM starting with an odd register number is considered to
+  /// take more microops than single VLDRS/VSTRS.
+  bool SlowOddRegister = false;
+
+  /// If true, loading into a D subregister will be penalized.
+  bool SlowLoadDSubregister = false;
+
+  /// If true, the AGU and NEON/FPU units are multiplexed.
+  bool HasMuxedUnits = false;
+
+  /// If true, VMOVS will never be widened to VMOVD
+  bool DontWidenVMOVS = false;
+
+  /// If true, run the MLx expansion pass.
+  bool ExpandMLx = false;
+
+  /// If true, VFP/NEON VMLA/VMLS have special RAW hazards.
+  bool HasVMLxHazards = false;
+
+  /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
+  bool UseNEONForFPMovs = false;
+
+  /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+  bool CheckVLDnAlign = false;
+
+  /// If true, VFP instructions are not pipelined.
+  bool NonpipelinedVFP = false;
+
+  /// StrictAlign - If true, the subtarget disallows unaligned memory
+  /// accesses for some types.  For details, see
+  /// ARMTargetLowering::allowsMisalignedMemoryAccesses().
+  bool StrictAlign = false;
+
+  /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+  ///  blocks to conform to ARMv8 rule.
+  bool RestrictIT = false;
+
+  /// HasDSP - If true, the subtarget supports the DSP (saturating arith
+  /// and such) instructions.
+  bool HasDSP = false;
+
+  /// NaCl TRAP instruction is generated instead of the regular TRAP.
+  bool UseNaClTrap = false;
+
+  /// Generate calls via indirect call instructions.
+  bool GenLongCalls = false;
+
+  /// Generate code that does not contain data access to code sections.
+  bool GenExecuteOnly = false;
+
+  /// Target machine allowed unsafe FP math (such as use of NEON fp)
+  bool UnsafeFPMath = false;
+
+  /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
+  bool UseSjLjEH = false;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment = 4;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+  unsigned MaxInterleaveFactor = 1;
+
+  /// Clearance before partial register updates (in number of instructions)
+  unsigned PartialUpdateClearance = 0;
+
+  /// What kind of timing do load multiple/store multiple have (double issue,
+  /// single issue etc).
+  ARMLdStMultipleTiming LdStMultipleTiming = SingleIssue;
+
+  /// The adjustment that we need to apply to get the operand latency from the
+  /// operand cycle returned by the itinerary data for pre-ISel operands.
+  int PreISelOperandLatencyAdjustment = 2;
+
+  /// IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// SchedModel - Processor specific instruction costs.
+  MCSchedModel SchedModel;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Options passed via command line that could influence the target
+  const TargetOptions &Options;
+
+  const ARMBaseTargetMachine &TM;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+               const ARMBaseTargetMachine &TM, bool IsLittle);
+
+  /// This object will take onwership of \p GISelAccessor.
+  void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const {
+    return 64;
+  }
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+  /// so that we can use initializer lists for subtarget initialization.
+  ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const ARMBaseInstrInfo *getInstrInfo() const override {
+    return InstrInfo.get();
+  }
+  const ARMTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const ARMFrameLowering *getFrameLowering() const override {
+    return FrameLowering.get();
+  }
+  const ARMBaseRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo->getRegisterInfo();
+  }
+
+  const CallLowering *getCallLowering() const override;
+  const InstructionSelector *getInstructionSelector() const override;
+  const LegalizerInfo *getLegalizerInfo() const override;
+  const RegisterBankInfo *getRegBankInfo() const override;
+
+private:
+  ARMSelectionDAGInfo TSInfo;
+  // Either Thumb1FrameLowering or ARMFrameLowering.
+  std::unique_ptr<ARMFrameLowering> FrameLowering;
+  // Either Thumb1InstrInfo or Thumb2InstrInfo.
+  std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
+  ARMTargetLowering   TLInfo;
+
+  /// Gather the accessor points to GlobalISel-related APIs.
+  /// This is used to avoid ifndefs spreading around while GISel is
+  /// an optional library.
+  std::unique_ptr<GISelAccessor> GISel;
+
+  void initializeEnvironment();
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
+  ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS);
+
+public:
+  void computeIssueWidth();
+
+  bool hasV4TOps()  const { return HasV4TOps;  }
+  bool hasV5TOps()  const { return HasV5TOps;  }
+  bool hasV5TEOps() const { return HasV5TEOps; }
+  bool hasV6Ops()   const { return HasV6Ops;   }
+  bool hasV6MOps()  const { return HasV6MOps;  }
+  bool hasV6KOps()  const { return HasV6KOps; }
+  bool hasV6T2Ops() const { return HasV6T2Ops; }
+  bool hasV7Ops()   const { return HasV7Ops;  }
+  bool hasV8Ops()   const { return HasV8Ops;  }
+  bool hasV8_1aOps() const { return HasV8_1aOps; }
+  bool hasV8_2aOps() const { return HasV8_2aOps; }
+  bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
+  bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
+
+  /// @{
+  /// These functions are obsolete, please consider adding subtarget features
+  /// or properties instead of calling them.
+  bool isCortexA5() const { return ARMProcFamily == CortexA5; }
+  bool isCortexA7() const { return ARMProcFamily == CortexA7; }
+  bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+  bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+  bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+  bool isSwift()    const { return ARMProcFamily == Swift; }
+  bool isCortexM3() const { return ARMProcFamily == CortexM3; }
+  bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); }
+  bool isCortexR5() const { return ARMProcFamily == CortexR5; }
+  bool isKrait() const { return ARMProcFamily == Krait; }
+  /// @}
+
+  bool hasARMOps() const { return !NoARM; }
+
+  bool hasVFP2() const { return HasVFPv2; }
+  bool hasVFP3() const { return HasVFPv3; }
+  bool hasVFP4() const { return HasVFPv4; }
+  bool hasFPARMv8() const { return HasFPARMv8; }
+  bool hasNEON() const { return HasNEON;  }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+  bool hasRAS() const { return HasRAS; }
+  bool hasVirtualization() const { return HasVirtualization; }
+  bool useNEONForSinglePrecisionFP() const {
+    return hasNEON() && UseNEONForSinglePrecisionFP;
+  }
+
+  bool hasDivide() const { return HasHardwareDivide; }
+  bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
+  bool hasT2ExtractPack() const { return HasT2ExtractPack; }
+  bool hasDataBarrier() const { return HasDataBarrier; }
+  bool hasV7Clrex() const { return HasV7Clrex; }
+  bool hasAcquireRelease() const { return HasAcquireRelease; }
+  bool hasAnyDataBarrier() const {
+    return HasDataBarrier || (hasV6Ops() && !isThumb());
+  }
+  bool useMulOps() const { return UseMulOps; }
+  bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool hasVMLxForwarding() const { return HasVMLxForwarding; }
+  bool isFPBrccSlow() const { return SlowFPBrcc; }
+  bool isFPOnlySP() const { return FPOnlySP; }
+  bool hasPerfMon() const { return HasPerfMon; }
+  bool hasTrustZone() const { return HasTrustZone; }
+  bool has8MSecExt() const { return Has8MSecExt; }
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+  bool hasFPAO() const { return HasFPAO; }
+  bool isProfitableToUnpredicate() const { return IsProfitableToUnpredicate; }
+  bool hasSlowVGETLNi32() const { return HasSlowVGETLNi32; }
+  bool hasSlowVDUP32() const { return HasSlowVDUP32; }
+  bool preferVMOVSR() const { return PreferVMOVSR; }
+  bool preferISHSTBarriers() const { return PreferISHST; }
+  bool expandMLx() const { return ExpandMLx; }
+  bool hasVMLxHazards() const { return HasVMLxHazards; }
+  bool hasSlowOddRegister() const { return SlowOddRegister; }
+  bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; }
+  bool hasMuxedUnits() const { return HasMuxedUnits; }
+  bool dontWidenVMOVS() const { return DontWidenVMOVS; }
+  bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+  bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+  bool nonpipelinedVFP() const { return NonpipelinedVFP; }
+  bool prefers32BitThumb() const { return Pref32BitThumb; }
+  bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+  bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
+  bool hasRetAddrStack() const { return HasRetAddrStack; }
+  bool hasMPExtension() const { return HasMPExtension; }
+  bool hasDSP() const { return HasDSP; }
+  bool useNaClTrap() const { return UseNaClTrap; }
+  bool useSjLjEH() const { return UseSjLjEH; }
+  bool genLongCalls() const { return GenLongCalls; }
+  bool genExecuteOnly() const { return GenExecuteOnly; }
+
+  bool hasFP16() const { return HasFP16; }
+  bool hasD16() const { return HasD16; }
+  bool hasFullFP16() const { return HasFullFP16; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
+  bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); }
+  bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); }
+  bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+  // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+  // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+  // even for GNUEABI, so we can make a distinction here and still conform to
+  // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+  // FIXME: The Darwin exception is temporary, while we move users to
+  // "*-*-*-macho" triples as quickly as possible.
+  bool isTargetAEABI() const {
+    return (TargetTriple.getEnvironment() == Triple::EABI ||
+            TargetTriple.getEnvironment() == Triple::EABIHF) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+  bool isTargetGNUAEABI() const {
+    return (TargetTriple.getEnvironment() == Triple::GNUEABI ||
+            TargetTriple.getEnvironment() == Triple::GNUEABIHF) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+  bool isTargetMuslAEABI() const {
+    return (TargetTriple.getEnvironment() == Triple::MuslEABI ||
+            TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+
+  // ARM Targets that support EHABI exception handling standard
+  // Darwin uses SjLj. Other targets might need more checks.
+  bool isTargetEHABICompatible() const {
+    return (TargetTriple.getEnvironment() == Triple::EABI ||
+            TargetTriple.getEnvironment() == Triple::GNUEABI ||
+            TargetTriple.getEnvironment() == Triple::MuslEABI ||
+            TargetTriple.getEnvironment() == Triple::EABIHF ||
+            TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+            TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+            isTargetAndroid()) &&
+           !isTargetDarwin() && !isTargetWindows();
+  }
+
+  bool isTargetHardFloat() const {
+    // FIXME: this is invalid for WindowsCE
+    return TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+           TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+           TargetTriple.getEnvironment() == Triple::EABIHF ||
+           isTargetWindows() || isAAPCS16_ABI();
+  }
+  bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+
+  virtual bool isXRaySupported() const override;
+
+  bool isAPCS_ABI() const;
+  bool isAAPCS_ABI() const;
+  bool isAAPCS16_ABI() const;
+
+  bool isROPI() const;
+  bool isRWPI() const;
+
+  bool useSoftFloat() const { return UseSoftFloat; }
+  bool isThumb() const { return InThumbMode; }
+  bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
+  bool isThumb2() const { return InThumbMode && HasThumb2; }
+  bool hasThumb2() const { return HasThumb2; }
+  bool isMClass() const { return ARMProcClass == MClass; }
+  bool isRClass() const { return ARMProcClass == RClass; }
+  bool isAClass() const { return ARMProcClass == AClass; }
+
+  bool isR9Reserved() const {
+    return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
+  }
+
+  bool useR7AsFramePointer() const {
+    return isTargetDarwin() || (!isTargetWindows() && isThumb());
+  }
+  /// Returns true if the frame setup is split into two separate pushes (first
+  /// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
+  /// to lr. This is always required on Thumb1-only targets, as the push and
+  /// pop instructions can't access the high registers.
+  bool splitFramePushPop(const MachineFunction &MF) const {
+    return (useR7AsFramePointer() &&
+            MF.getTarget().Options.DisableFramePointerElim(MF)) ||
+           isThumb1Only();
+  }
+
+  bool useStride4VFPs(const MachineFunction &MF) const;
+
+  bool useMovt(const MachineFunction &MF) const;
+
+  bool supportsTailCall() const { return SupportsTailCall; }
+
+  bool allowsUnalignedMem() const { return !StrictAlign; }
+
+  bool restrictIT() const { return RestrictIT; }
+
+  const std::string & getCPUString() const { return CPUString; }
+
+  bool isLittle() const { return IsLittle; }
+
+  unsigned getMispredictionPenalty() const;
+
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// Returns true if machine scheduler should be enabled.
+  bool enableMachineScheduler() const override;
+
+  /// True for some subtargets at > -O0.
+  bool enablePostRAScheduler() const override;
+
+  // enableAtomicExpand- True if we need to expand our atomics.
+  bool enableAtomicExpand() const override;
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
+  /// selection.
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+  unsigned getPartialUpdateClearance() const { return PartialUpdateClearance; }
+
+  ARMLdStMultipleTiming getLdStMultipleTiming() const {
+    return LdStMultipleTiming;
+  }
+
+  int getPreISelOperandLatencyAdjustment() const {
+    return PreISelOperandLatencyAdjustment;
+  }
+
+  /// True if the GV will be accessed via an indirect symbol.
+  bool isGVIndirectSymbol(const GlobalValue *GV) const;
+
+  /// True if fast-isel is used.
+  bool useFastISel() const;
+};
+} // End llvm namespace
+
+#endif  // ARMSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 000000000000..70c9567d99f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,544 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARM.h"
+#include "ARMCallLowering.h"
+#include "ARMFrameLowering.h"
+#include "ARMInstructionSelector.h"
+#include "ARMLegalizerInfo.h"
+#include "ARMRegisterBankInfo.h"
+#include "ARMTargetObjectFile.h"
+#include "ARMTargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool>
+DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
+                   cl::desc("Inhibit optimization of S->D register accesses on A15"),
+                   cl::init(false));
+
+static cl::opt<bool>
+EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
+                 cl::desc("Run SimplifyCFG after expanding atomic operations"
+                          " to make use of cmpxchg flow-based information"),
+                 cl::init(true));
+
+static cl::opt<bool>
+EnableARMLoadStoreOpt("arm-load-store-opt", cl::Hidden,
+                      cl::desc("Enable ARM load/store optimization pass"),
+                      cl::init(true));
+
+// FIXME: Unify control over GlobalMerge.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalMerge("arm-global-merge", cl::Hidden,
+                  cl::desc("Enable the global merge pass"));
+
+extern "C" void LLVMInitializeARMTarget() {
+  // Register the target.
+  RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
+  RegisterTargetMachine<ARMBETargetMachine> Y(getTheARMBETarget());
+  RegisterTargetMachine<ThumbLETargetMachine> A(getTheThumbLETarget());
+  RegisterTargetMachine<ThumbBETargetMachine> B(getTheThumbBETarget());
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+  initializeGlobalISel(Registry);
+  initializeARMLoadStoreOptPass(Registry);
+  initializeARMPreAllocLoadStoreOptPass(Registry);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return make_unique<TargetLoweringObjectFileMachO>();
+  if (TT.isOSWindows())
+    return make_unique<TargetLoweringObjectFileCOFF>();
+  return make_unique<ARMElfTargetObjectFile>();
+}
+
+static ARMBaseTargetMachine::ARMABI
+computeTargetABI(const Triple &TT, StringRef CPU,
+                 const TargetOptions &Options) {
+  if (Options.MCOptions.getABIName() == "aapcs16")
+    return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+  else if (Options.MCOptions.getABIName().startswith("aapcs"))
+    return ARMBaseTargetMachine::ARM_ABI_AAPCS;
+  else if (Options.MCOptions.getABIName().startswith("apcs"))
+    return ARMBaseTargetMachine::ARM_ABI_APCS;
+
+  assert(Options.MCOptions.getABIName().empty() &&
+         "Unknown target-abi option!");
+
+  ARMBaseTargetMachine::ARMABI TargetABI =
+      ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
+
+  unsigned ArchKind = llvm::ARM::parseCPUArch(CPU);
+  StringRef ArchName = llvm::ARM::getArchName(ArchKind);
+  // FIXME: This is duplicated code from the front end and should be unified.
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getEnvironment() == llvm::Triple::EABI ||
+        (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
+        llvm::ARM::parseArchProfile(ArchName) == llvm::ARM::PK_M) {
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+    } else if (TT.isWatchABI()) {
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
+    } else {
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+    }
+  } else if (TT.isOSWindows()) {
+    // FIXME: this is invalid for WindowsCE
+    TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+  } else {
+    // Select the default based on the platform.
+    switch (TT.getEnvironment()) {
+    case llvm::Triple::Android:
+    case llvm::Triple::GNUEABI:
+    case llvm::Triple::GNUEABIHF:
+    case llvm::Triple::MuslEABI:
+    case llvm::Triple::MuslEABIHF:
+    case llvm::Triple::EABIHF:
+    case llvm::Triple::EABI:
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+      break;
+    case llvm::Triple::GNU:
+      TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+      break;
+    default:
+      if (TT.isOSNetBSD())
+        TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
+      else
+        TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
+      break;
+    }
+  }
+
+  return TargetABI;
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+                                     const TargetOptions &Options,
+                                     bool isLittle) {
+  auto ABI = computeTargetABI(TT, CPU, Options);
+  std::string Ret = "";
+
+  if (isLittle)
+    // Little endian.
+    Ret += "e";
+  else
+    // Big endian.
+    Ret += "E";
+
+  Ret += DataLayout::getManglingComponent(TT);
+
+  // Pointers are 32 bits and aligned to 32 bits.
+  Ret += "-p:32:32";
+
+  // ABIs other than APCS have 64 bit integers with natural alignment.
+  if (ABI != ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-i64:64";
+
+  // We have 64 bits floats. The APCS ABI requires them to be aligned to 32
+  // bits, others to 64 bits. We always try to align to 64 bits.
+  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-f64:32:64";
+
+  // We have 128 and 64 bit vectors. The APCS ABI aligns them to 32 bits, others
+  // to 64. We always ty to give them natural alignment.
+  if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS)
+    Ret += "-v64:32:64-v128:32:128";
+  else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+    Ret += "-v128:64:128";
+
+  // Try to align aggregates to 32 bits (the default is 64 bits, which has no
+  // particular hardware support on 32-bit ARM).
+  Ret += "-a:0:32";
+
+  // Integer registers are 32 bits.
+  Ret += "-n32";
+
+  // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit
+  // aligned everywhere else.
+  if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+    Ret += "-S128";
+  else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS)
+    Ret += "-S64";
+  else
+    Ret += "-S32";
+
+  return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    // Default relocation model on Darwin is PIC.
+    return TT.isOSBinFormatMachO() ? Reloc::PIC_ : Reloc::Static;
+
+  if (*RM == Reloc::ROPI || *RM == Reloc::RWPI || *RM == Reloc::ROPI_RWPI)
+    assert(TT.isOSBinFormatELF() &&
+           "ROPI/RWPI currently only supported for ELF");
+
+  // DynamicNoPIC is only used on darwin.
+  if (*RM == Reloc::DynamicNoPIC && !TT.isOSDarwin())
+    return Reloc::Static;
+
+  return *RM;
+}
+
+/// Create an ARM architecture model.
+///
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL, bool isLittle)
+    : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
+                        CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
+                        OL),
+      TargetABI(computeTargetABI(TT, CPU, Options)),
+      TLOF(createTLOF(getTargetTriple())),
+      Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
+
+  // Default to triple-appropriate float ABI
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType =
+        Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+
+  // Default to triple-appropriate EABI
+  if (Options.EABIVersion == EABI::Default ||
+      Options.EABIVersion == EABI::Unknown) {
+    // musl is compatible with glibc with regard to EABI version
+    if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
+      this->Options.EABIVersion = EABI::GNU;
+    else
+      this->Options.EABIVersion = EABI::EABI5;
+  }
+}
+
+ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct ARMGISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<CallLowering> CallLoweringInfo;
+  std::unique_ptr<InstructionSelector> InstSelector;
+  std::unique_ptr<LegalizerInfo> Legalizer;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
+  const CallLowering *getCallLowering() const override {
+    return CallLoweringInfo.get();
+  }
+  const InstructionSelector *getInstructionSelector() const override {
+    return InstSelector.get();
+  }
+  const LegalizerInfo *getLegalizerInfo() const override {
+    return Legalizer.get();
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    return RegBankInfo.get();
+  }
+};
+} // End anonymous namespace.
+#endif
+
+const ARMSubtarget *
+ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  bool SoftFloat =
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+  // If the soft float attribute is set on the function turn on the soft float
+  // subtarget feature.
+  if (SoftFloat)
+    FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
+    GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
+    GISel->Legalizer.reset(new ARMLegalizerInfo());
+
+    auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
+
+    // FIXME: At this point, we can't rely on Subtarget having RBI.
+    // It's awkward to mix passing RBI and the Subtarget; should we pass
+    // TII/TRI as well?
+    GISel->InstSelector.reset(new ARMInstructionSelector(*I, *RBI));
+
+    GISel->RegBankInfo.reset(RBI);
+#endif
+    I->setGISelAccessor(*GISel);
+  }
+  return I.get();
+}
+
+TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(ARMTTIImpl(this, F));
+  });
+}
+
+void ARMTargetMachine::anchor() {}
+
+ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL,
+                                   bool isLittle)
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+  initAsmInfo();
+  if (!Subtarget.hasARMOps())
+    report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+                       "support ARM mode execution!");
+}
+
+void ARMLETargetMachine::anchor() {}
+
+ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ARMBETargetMachine::anchor() {}
+
+ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void ThumbTargetMachine::anchor() {}
+
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL, bool isLittle)
+    : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
+  initAsmInfo();
+}
+
+void ThumbLETargetMachine::anchor() {}
+
+ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void ThumbBETargetMachine::anchor() {}
+
+ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+namespace {
+/// ARM Code Generator Pass Configuration Options.
+class ARMPassConfig : public TargetPassConfig {
+public:
+  ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  ARMBaseTargetMachine &getARMTargetMachine() const {
+    return getTM<ARMBaseTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+#endif
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new ARMPassConfig(this, PM);
+}
+
+void ARMPassConfig::addIRPasses() {
+  if (TM->Options.ThreadModel == ThreadModel::Single)
+    addPass(createLowerAtomicPass());
+  else
+    addPass(createAtomicExpandPass(TM));
+
+  // Cmpxchg instructions are often used with a subsequent comparison to
+  // determine whether it succeeded. We can exploit existing control-flow in
+  // ldrex/strex loops to simplify this, but it needs tidying up.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+    addPass(createCFGSimplificationPass(-1, [this](const Function &F) {
+      const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
+      return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
+    }));
+
+  TargetPassConfig::addIRPasses();
+
+  // Match interleaved memory accesses to ldN/stN intrinsics.
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass(TM));
+}
+
+bool ARMPassConfig::addPreISel() {
+  if ((TM->getOptLevel() != CodeGenOpt::None &&
+       EnableGlobalMerge == cl::BOU_UNSET) ||
+      EnableGlobalMerge == cl::BOU_TRUE) {
+    // FIXME: This is using the thumb1 only constant value for
+    // maximal global offset for merging globals. We may want
+    // to look into using the old value for non-thumb1 code of
+    // 4095 based on the TargetMachine, but this starts to become
+    // tricky when doing code gen per function.
+    bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
+                               (EnableGlobalMerge == cl::BOU_UNSET);
+    // Merging of extern globals is enabled by default on non-Mach-O as we
+    // expect it to be generally either beneficial or harmless. On Mach-O it
+    // is disabled as we emit the .subsections_via_symbols directive which
+    // means that merging extern globals is not safe.
+    bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO();
+    addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize,
+                                  MergeExternalByDefault));
+  }
+
+  return false;
+}
+
+bool ARMPassConfig::addInstSelector() {
+  addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+  return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool ARMPassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
+
+bool ARMPassConfig::addLegalizeMachineIR() {
+  addPass(new Legalizer());
+  return false;
+}
+
+bool ARMPassConfig::addRegBankSelect() {
+  addPass(new RegBankSelect());
+  return false;
+}
+
+bool ARMPassConfig::addGlobalInstructionSelect() {
+  addPass(new InstructionSelect());
+  return false;
+}
+#endif
+
+void ARMPassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(createMLxExpansionPass());
+
+    if (EnableARMLoadStoreOpt)
+      addPass(createARMLoadStoreOptimizationPass(/* pre-register alloc */ true));
+
+    if (!DisableA15SDOptimization)
+      addPass(createA15SDOptimizerPass());
+  }
+}
+
+void ARMPassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (EnableARMLoadStoreOpt)
+      addPass(createARMLoadStoreOptimizationPass());
+
+    addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+  }
+
+  // Expand some pseudo instructions into multiple instructions to allow
+  // proper scheduling.
+  addPass(createARMExpandPseudoPass());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    // in v8, IfConversion depends on Thumb instruction widths
+    addPass(createThumb2SizeReductionPass([this](const Function &F) {
+      return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
+    }));
+
+    addPass(createIfConverter([](const MachineFunction &MF) {
+      return !MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    }));
+  }
+  addPass(createThumb2ITBlockPass());
+}
+
+void ARMPassConfig::addPreEmitPass() {
+  addPass(createThumb2SizeReductionPass());
+
+  // Constant island pass work on unbundled instructions.
+  addPass(createUnpackMachineBundles([](const MachineFunction &MF) {
+    return MF.getSubtarget<ARMSubtarget>().isThumb2();
+  }));
+
+  // Don't optimize barriers at -O0.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createARMOptimizeBarriersPass());
+
+  addPass(createARMConstantIslandPass());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 000000000000..c6b70b953162
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,131 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
+
+#include "ARMInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMBaseTargetMachine : public LLVMTargetMachine {
+public:
+  enum ARMABI {
+    ARM_ABI_UNKNOWN,
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS, // ARM EABI
+    ARM_ABI_AAPCS16
+  } TargetABI;
+
+protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  ARMSubtarget        Subtarget;
+  bool isLittle;
+  mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
+
+public:
+  ARMBaseTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL, bool isLittle);
+  ~ARMBaseTargetMachine() override;
+
+  const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+  bool isLittleEndian() const { return isLittle; }
+
+  /// \brief Get the TargetIRAnalysis for this target.
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+
+/// ARM target machine.
+///
+class ARMTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+ public:
+   ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    Optional<Reloc::Model> RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// ARM little endian target machine.
+///
+class ARMLETargetMachine : public ARMTargetMachine {
+  void anchor() override;
+public:
+  ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// ARM big endian target machine.
+///
+class ARMBETargetMachine : public ARMTargetMachine {
+  void anchor() override;
+public:
+  ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// Thumb target machine.
+/// Due to the way architectures are handled, this represents both
+///   Thumb-1 and Thumb-2.
+///
+class ThumbTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+public:
+  ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL, bool isLittle);
+};
+
+/// Thumb little endian target machine.
+///
+class ThumbLETargetMachine : public ThumbTargetMachine {
+  void anchor() override;
+public:
+  ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+/// Thumb big endian target machine.
+///
+class ThumbBETargetMachine : public ThumbTargetMachine {
+  void anchor() override;
+public:
+  ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
new file mode 100644
index 000000000000..625c4280e1a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -0,0 +1,92 @@
+//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetObjectFile.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+//                               ELF Target
+//===----------------------------------------------------------------------===//
+
+void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
+                                        const TargetMachine &TM) {
+  const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM);
+  bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS;
+  genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
+
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(isAAPCS_ABI);
+
+  if (isAAPCS_ABI) {
+    LSDASection = nullptr;
+  }
+
+  AttributesSection =
+      getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
+
+  // Make code section unreadable when in execute-only mode
+  if (genExecuteOnly) {
+    unsigned  Type = ELF::SHT_PROGBITS;
+    unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE;
+    // Since we cannot modify flags for an existing section, we create a new
+    // section with the right flags, and use 0 as the unique ID for
+    // execute-only text
+    TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
+  }
+}
+
+const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM)
+    return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+        GV, Encoding, TM, MMI, Streamer);
+
+  assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
+
+  return MCSymbolRefExpr::create(TM.getSymbol(GV),
+                                 MCSymbolRefExpr::VK_ARM_TARGET2, getContext());
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
+                                 getContext());
+}
+
+MCSection *
+ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO,
+                                                 SectionKind SK, const TargetMachine &TM) const {
+  // Set execute-only access for the explicit section
+  if (genExecuteOnly && SK.isText())
+    SK = SectionKind::getExecuteOnly();
+
+  return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
+}
+
+MCSection *
+ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
+                                               SectionKind SK, const TargetMachine &TM) const {
+  // Place the global in the execute-only text section
+  if (genExecuteOnly && SK.isText())
+    SK = SectionKind::getExecuteOnly();
+
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 000000000000..24e755ddac27
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,50 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+  mutable bool genExecuteOnly = false;
+protected:
+  const MCSection *AttributesSection;
+public:
+  ARMElfTargetObjectFile()
+      : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+    PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
+  }
+
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                        unsigned Encoding,
+                                        const TargetMachine &TM,
+                                        MachineModuleInfo *MMI,
+                                        MCStreamer &Streamer) const override;
+
+  /// \brief Describe a TLS variable address within debug info.
+  const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+
+  MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
new file mode 100644
index 000000000000..10e6297ef1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -0,0 +1,538 @@
+//===-- ARMTargetTransformInfo.cpp - ARM specific TTI ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armtti"
+
+int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+ unsigned Bits = Ty->getPrimitiveSizeInBits();
+ if (Bits == 0 || Imm.getActiveBits() >= 64)
+   return 4;
+
+  int64_t SImmVal = Imm.getSExtValue();
+  uint64_t ZImmVal = Imm.getZExtValue();
+  if (!ST->isThumb()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+  if (ST->isThumb2()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  }
+  // Thumb1.
+  if (SImmVal >= 0 && SImmVal < 256)
+    return 1;
+  if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
+    return 2;
+  // Load from constantpool.
+  return 3;
+}
+
+
+// Constants smaller than 256 fit in the immediate field of
+// Thumb1 instructions so we return a zero cost and 1 otherwise.
+int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
+                                      const APInt &Imm, Type *Ty) {
+  if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
+    return 0;
+
+  return 1;
+}
+
+int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  // Division by a constant can be turned into multiplication, but only if we
+  // know it's constant. So it's not so much that the immediate is cheap (it's
+  // not), but that the alternative is worse.
+  // FIXME: this is probably unneeded with GlobalISel.
+  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+       Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+      Idx == 1)
+    return 0;
+
+  if (Opcode == Instruction::And)
+      // Conversion to BIC is free, and means we can use ~Imm instead.
+      return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+
+  if (Opcode == Instruction::Add)
+    // Conversion to SUB is free, and means we can use -Imm instead.
+    return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
+
+  if (Opcode == Instruction::ICmp && Imm.isNegative() &&
+      Ty->getIntegerBitWidth() == 32) {
+    int64_t NegImm = -Imm.getSExtValue();
+    if (ST->isThumb2() && NegImm < 1<<12)
+      // icmp X, #-C -> cmn X, #C
+      return 0;
+    if (ST->isThumb() && NegImm < 1<<8)
+      // icmp X, #-C -> adds X, #C
+      return 0;
+  }
+
+  return getIntImmCost(Imm, Ty);
+}
+
+
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Single to/from double precision conversions.
+  static const CostTblEntry NEONFltDblTbl[] = {
+    // Vector fptrunc/fpext conversions.
+    { ISD::FP_ROUND,   MVT::v2f64, 2 },
+    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
+    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
+  };
+
+  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
+                                          ISD == ISD::FP_EXTEND)) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+    if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  // Some arithmetic, load and store operations have specific instructions
+  // to cast up/down their types automatically at no extra cost.
+  // TODO: Get these tables to know at least what the related operations are.
+  static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
+    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
+
+    // The number of vmovl instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+    // Operations that we legalize using splitting.
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
+
+    // Vector float <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+
+    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
+
+    // Vector double <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
+  };
+
+  if (SrcTy.isVector() && ST->hasNEON()) {
+    if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  // Scalar float to integer conversions.
+  static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
+  };
+  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
+    if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  // Scalar integer to float conversions.
+  static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
+  };
+
+  if (SrcTy.isInteger() && ST->hasNEON()) {
+    if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
+                                                   ISD, DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  // Scalar integer conversion costs.
+  static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
+    // i16 -> i64 requires two dependent operations.
+    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
+
+    // Truncates on i64 are assumed to be free.
+    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
+  };
+
+  if (SrcTy.isInteger()) {
+    if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                   unsigned Index) {
+  // Penalize inserting into an D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
+  if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
+      ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
+    return 3;
+
+  if ((Opcode == Instruction::InsertElement ||
+       Opcode == Instruction::ExtractElement)) {
+    // Cross-class copies are expensive on many microarchitectures,
+    // so assume they are expensive by default.
+    if (ValTy->getVectorElementType()->isIntegerTy())
+      return 3;
+
+    // Even if it's not a cross class copy, this likely leads to mixing
+    // of NEON and VFP code and should be therefore penalized.
+    if (ValTy->isVectorTy() &&
+        ValTy->getScalarSizeInBits() <= 32)
+      return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
+  }
+
+  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+}
+
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // On NEON a a vector select gets lowered to vbsl.
+  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // Lowering of some vector selects is currently far from perfect.
+    static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
+    };
+
+    EVT SelCondTy = TLI->getValueType(DL, CondTy);
+    EVT SelValTy = TLI->getValueType(DL, ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+                                                     SelCondTy.getSimpleVT(),
+                                                     SelValTy.getSimpleVT()))
+        return Entry->Cost;
+    }
+
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+    return LT.first;
+  }
+
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+int ARMTTIImpl::getFPOpCost(Type *Ty) {
+  // Use similar logic that's in ARMISelLowering:
+  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
+  // to VFP.
+
+  if (ST->hasVFP2() && !ST->isThumb1Only()) {
+    if (Ty->isFloatTy()) {
+      return TargetTransformInfo::TCC_Basic;
+    }
+
+    if (Ty->isDoubleTy()) {
+      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
+        TargetTransformInfo::TCC_Basic;
+    }
+  }
+
+  return TargetTransformInfo::TCC_Expensive;
+}
+
+int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                               Type *SubTp) {
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  if (Kind == TTI::SK_Reverse) {
+    static const CostTblEntry NEONShuffleTbl[] = {
+        // Reverse shuffle cost one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+  if (Kind == TTI::SK_Alternate) {
+    static const CostTblEntry NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl,
+                                            ISD::VECTOR_SHUFFLE, LT.second))
+      return LT.first * Entry->Cost;
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+int ARMTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+
+  int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+  const unsigned FunctionCallDivCost = 20;
+  const unsigned ReciprocalDivCost = 10;
+  static const CostTblEntry CostTbl[] = {
+    // Division.
+    // These costs are somewhat random. Choose a cost of 20 to indicate that
+    // vectorizing devision (added function call) is going to be very expensive.
+    // Double registers types.
+    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
+    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
+    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
+    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
+    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
+    // Quad register types.
+    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+    // Multiplication.
+  };
+
+  if (ST->hasNEON())
+    if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
+      return LT.first * Entry->Cost;
+
+  int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                           Opd1PropInfo, Opd2PropInfo);
+
+  // This is somewhat of a hack. The problem that we are facing is that SROA
+  // creates a sequence of shift, and, or instructions to construct values.
+  // These sequences are recognized by the ISel and have zero-cost. Not so for
+  // the vectorized code. Because we have support for v2i64 but not i64 those
+  // sequences look particularly beneficial to vectorize.
+  // To work around this we increase the cost of v2i64 operations to make them
+  // seem less beneficial.
+  if (LT.second == MVT::v2i64 &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+    Cost += 4;
+
+  return Cost;
+}
+
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+
+  if (Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isDoubleTy()) {
+    // Unaligned loads/stores are extremely inefficient.
+    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+    return LT.first * 4;
+  }
+  return LT.first;
+}
+
+int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  // vldN/vstN doesn't support vector types of i64/f64 element.
+  bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+    // vldN/vstN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
new file mode 100644
index 000000000000..d83228afb0ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -0,0 +1,139 @@
+//===-- ARMTargetTransformInfo.h - ARM specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// ARM target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
+  typedef BasicTTIImplBase<ARMTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const ARMSubtarget *ST;
+  const ARMTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const ARMSubtarget *getST() const { return ST; }
+  const ARMTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  bool enableInterleavedAccessVectorization() { return true; }
+
+  /// Floating-point computation using ARMv8 AArch32 Advanced
+  /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
+  /// is IEEE-754 compliant, but it's not covered in this target.
+  bool isFPVectorizationPotentiallyUnsafe() {
+    return !ST->isTargetDarwin();
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                            Type *Ty);
+
+  using BaseT::getIntImmCost;
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 16;
+      return 0;
+    }
+
+    if (ST->isThumb1Only())
+      return 8;
+    return 13;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+
+    return 32;
+  }
+
+  unsigned getMaxInterleaveFactor(unsigned VF) {
+    return ST->getMaxInterleaveFactor();
+  }
+
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  int getAddressComputationCost(Type *Val, bool IsComplex);
+
+  int getFPOpCost(Type *Ty);
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                                 ArrayRef<unsigned> Indices, unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  bool shouldBuildLookupTablesForConstant(Constant *C) const {
+    // In the ROPI and RWPI relocation models we can't have pointers to global
+    // variables or functions in constant data, so don't convert switches to
+    // lookup tables if any of the values would need relocation.
+    if (ST->isROPI() || ST->isRWPI())
+      return !C->needsRelocation();
+
+    return true;
+  }
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 000000000000..c243a2d35979
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,10352 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFeatures.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserUtils.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
+
+static cl::opt<ImplicitItModeTy> ImplicitItMode(
+    "arm-implicit-it", cl::init(ImplicitItModeTy::ARMOnly),
+    cl::desc("Allow conditional instructions outdside of an IT block"),
+    cl::values(clEnumValN(ImplicitItModeTy::Always, "always",
+                          "Accept in both ISAs, emit implicit ITs in Thumb"),
+               clEnumValN(ImplicitItModeTy::Never, "never",
+                          "Warn in ARM, reject in Thumb"),
+               clEnumValN(ImplicitItModeTy::ARMOnly, "arm",
+                          "Accept in ARM, reject in Thumb"),
+               clEnumValN(ImplicitItModeTy::ThumbOnly, "thumb",
+                          "Warn in ARM, emit implicit ITs in Thumb")));
+
+class ARMOperand;
+
+enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
+
+class UnwindContext {
+  MCAsmParser &Parser;
+
+  typedef SmallVector<SMLoc, 4> Locs;
+
+  Locs FnStartLocs;
+  Locs CantUnwindLocs;
+  Locs PersonalityLocs;
+  Locs PersonalityIndexLocs;
+  Locs HandlerDataLocs;
+  int FPReg;
+
+public:
+  UnwindContext(MCAsmParser &P) : Parser(P), FPReg(ARM::SP) {}
+
+  bool hasFnStart() const { return !FnStartLocs.empty(); }
+  bool cantUnwind() const { return !CantUnwindLocs.empty(); }
+  bool hasHandlerData() const { return !HandlerDataLocs.empty(); }
+  bool hasPersonality() const {
+    return !(PersonalityLocs.empty() && PersonalityIndexLocs.empty());
+  }
+
+  void recordFnStart(SMLoc L) { FnStartLocs.push_back(L); }
+  void recordCantUnwind(SMLoc L) { CantUnwindLocs.push_back(L); }
+  void recordPersonality(SMLoc L) { PersonalityLocs.push_back(L); }
+  void recordHandlerData(SMLoc L) { HandlerDataLocs.push_back(L); }
+  void recordPersonalityIndex(SMLoc L) { PersonalityIndexLocs.push_back(L); }
+
+  void saveFPReg(int Reg) { FPReg = Reg; }
+  int getFPReg() const { return FPReg; }
+
+  void emitFnStartLocNotes() const {
+    for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end();
+         FI != FE; ++FI)
+      Parser.Note(*FI, ".fnstart was specified here");
+  }
+  void emitCantUnwindLocNotes() const {
+    for (Locs::const_iterator UI = CantUnwindLocs.begin(),
+                              UE = CantUnwindLocs.end(); UI != UE; ++UI)
+      Parser.Note(*UI, ".cantunwind was specified here");
+  }
+  void emitHandlerDataLocNotes() const {
+    for (Locs::const_iterator HI = HandlerDataLocs.begin(),
+                              HE = HandlerDataLocs.end(); HI != HE; ++HI)
+      Parser.Note(*HI, ".handlerdata was specified here");
+  }
+  void emitPersonalityLocNotes() const {
+    for (Locs::const_iterator PI = PersonalityLocs.begin(),
+                              PE = PersonalityLocs.end(),
+                              PII = PersonalityIndexLocs.begin(),
+                              PIE = PersonalityIndexLocs.end();
+         PI != PE || PII != PIE;) {
+      if (PI != PE && (PII == PIE || PI->getPointer() < PII->getPointer()))
+        Parser.Note(*PI++, ".personality was specified here");
+      else if (PII != PIE && (PI == PE || PII->getPointer() < PI->getPointer()))
+        Parser.Note(*PII++, ".personalityindex was specified here");
+      else
+        llvm_unreachable(".personality and .personalityindex cannot be "
+                         "at the same location");
+    }
+  }
+
+  void reset() {
+    FnStartLocs = Locs();
+    CantUnwindLocs = Locs();
+    PersonalityLocs = Locs();
+    HandlerDataLocs = Locs();
+    PersonalityIndexLocs = Locs();
+    FPReg = ARM::SP;
+  }
+};
+
+class ARMAsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  const MCRegisterInfo *MRI;
+  UnwindContext UC;
+
+  ARMTargetStreamer &getTargetStreamer() {
+    assert(getParser().getStreamer().getTargetStreamer() &&
+           "do not have a target streamer");
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<ARMTargetStreamer &>(TS);
+  }
+
+  // Map of register aliases registers via the .req directive.
+  StringMap<unsigned> RegisterReqs;
+
+  bool NextSymbolIsThumb;
+
+  bool useImplicitITThumb() const {
+    return ImplicitItMode == ImplicitItModeTy::Always ||
+           ImplicitItMode == ImplicitItModeTy::ThumbOnly;
+  }
+
+  bool useImplicitITARM() const {
+    return ImplicitItMode == ImplicitItModeTy::Always ||
+           ImplicitItMode == ImplicitItModeTy::ARMOnly;
+  }
+
+  struct {
+    ARMCC::CondCodes Cond;    // Condition for IT block.
+    unsigned Mask:4;          // Condition mask for instructions.
+                              // Starting at first 1 (from lsb).
+                              //   '1'  condition as indicated in IT.
+                              //   '0'  inverse of condition (else).
+                              // Count of instructions in IT block is
+                              // 4 - trailingzeroes(mask)
+                              // Note that this does not have the same encoding
+                              // as in the IT instruction, which also depends
+                              // on the low bit of the condition code.
+
+    unsigned CurPosition;     // Current position in parsing of IT
+                              // block. In range [0,4], with 0 being the IT
+                              // instruction itself. Initialized according to
+                              // count of instructions in block.  ~0U if no
+                              // active IT block.
+
+    bool IsExplicit;          // true  - The IT instruction was present in the
+                              //         input, we should not modify it.
+                              // false - The IT instruction was added
+                              //         implicitly, we can extend it if that
+                              //         would be legal.
+  } ITState;
+
+  llvm::SmallVector<MCInst, 4> PendingConditionalInsts;
+
+  void flushPendingInstructions(MCStreamer &Out) override {
+    if (!inImplicitITBlock()) {
+      assert(PendingConditionalInsts.size() == 0);
+      return;
+    }
+
+    // Emit the IT instruction
+    unsigned Mask = getITMaskEncoding();
+    MCInst ITInst;
+    ITInst.setOpcode(ARM::t2IT);
+    ITInst.addOperand(MCOperand::createImm(ITState.Cond));
+    ITInst.addOperand(MCOperand::createImm(Mask));
+    Out.EmitInstruction(ITInst, getSTI());
+
+    // Emit the conditonal instructions
+    assert(PendingConditionalInsts.size() <= 4);
+    for (const MCInst &Inst : PendingConditionalInsts) {
+      Out.EmitInstruction(Inst, getSTI());
+    }
+    PendingConditionalInsts.clear();
+
+    // Clear the IT state
+    ITState.Mask = 0;
+    ITState.CurPosition = ~0U;
+  }
+
+  bool inITBlock() { return ITState.CurPosition != ~0U; }
+  bool inExplicitITBlock() { return inITBlock() && ITState.IsExplicit; }
+  bool inImplicitITBlock() { return inITBlock() && !ITState.IsExplicit; }
+  bool lastInITBlock() {
+    return ITState.CurPosition == 4 - countTrailingZeros(ITState.Mask);
+  }
+  void forwardITPosition() {
+    if (!inITBlock()) return;
+    // Move to the next instruction in the IT block, if there is one. If not,
+    // mark the block as done, except for implicit IT blocks, which we leave
+    // open until we find an instruction that can't be added to it.
+    unsigned TZ = countTrailingZeros(ITState.Mask);
+    if (++ITState.CurPosition == 5 - TZ && ITState.IsExplicit)
+      ITState.CurPosition = ~0U; // Done with the IT block after this.
+  }
+
+  // Rewind the state of the current IT block, removing the last slot from it.
+  void rewindImplicitITPosition() {
+    assert(inImplicitITBlock());
+    assert(ITState.CurPosition > 1);
+    ITState.CurPosition--;
+    unsigned TZ = countTrailingZeros(ITState.Mask);
+    unsigned NewMask = 0;
+    NewMask |= ITState.Mask & (0xC << TZ);
+    NewMask |= 0x2 << TZ;
+    ITState.Mask = NewMask;
+  }
+
+  // Rewind the state of the current IT block, removing the last slot from it.
+  // If we were at the first slot, this closes the IT block.
+  void discardImplicitITBlock() {
+    assert(inImplicitITBlock());
+    assert(ITState.CurPosition == 1);
+    ITState.CurPosition = ~0U;
+    return;
+  }
+
+  // Get the encoding of the IT mask, as it will appear in an IT instruction.
+  unsigned getITMaskEncoding() {
+    assert(inITBlock());
+    unsigned Mask = ITState.Mask;
+    unsigned TZ = countTrailingZeros(Mask);
+    if ((ITState.Cond & 1) == 0) {
+      assert(Mask && TZ <= 3 && "illegal IT mask value!");
+      Mask ^= (0xE << TZ) & 0xF;
+    }
+    return Mask;
+  }
+
+  // Get the condition code corresponding to the current IT block slot.
+  ARMCC::CondCodes currentITCond() {
+    unsigned MaskBit;
+    if (ITState.CurPosition == 1)
+      MaskBit = 1;
+    else
+      MaskBit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+
+    return MaskBit ? ITState.Cond : ARMCC::getOppositeCondition(ITState.Cond);
+  }
+
+  // Invert the condition of the current IT block slot without changing any
+  // other slots in the same block.
+  void invertCurrentITCondition() {
+    if (ITState.CurPosition == 1) {
+      ITState.Cond = ARMCC::getOppositeCondition(ITState.Cond);
+    } else {
+      ITState.Mask ^= 1 << (5 - ITState.CurPosition);
+    }
+  }
+
+  // Returns true if the current IT block is full (all 4 slots used).
+  bool isITBlockFull() {
+    return inITBlock() && (ITState.Mask & 1);
+  }
+
+  // Extend the current implicit IT block to have one more slot with the given
+  // condition code.
+  void extendImplicitITBlock(ARMCC::CondCodes Cond) {
+    assert(inImplicitITBlock());
+    assert(!isITBlockFull());
+    assert(Cond == ITState.Cond ||
+           Cond == ARMCC::getOppositeCondition(ITState.Cond));
+    unsigned TZ = countTrailingZeros(ITState.Mask);
+    unsigned NewMask = 0;
+    // Keep any existing condition bits.
+    NewMask |= ITState.Mask & (0xE << TZ);
+    // Insert the new condition bit.
+    NewMask |= (Cond == ITState.Cond) << TZ;
+    // Move the trailing 1 down one bit.
+    NewMask |= 1 << (TZ - 1);
+    ITState.Mask = NewMask;
+  }
+
+  // Create a new implicit IT block with a dummy condition code.
+  void startImplicitITBlock() {
+    assert(!inITBlock());
+    ITState.Cond = ARMCC::AL;
+    ITState.Mask = 8;
+    ITState.CurPosition = 1;
+    ITState.IsExplicit = false;
+    return;
+  }
+
+  // Create a new explicit IT block with the given condition and mask. The mask
+  // should be in the parsed format, with a 1 implying 't', regardless of the
+  // low bit of the condition.
+  void startExplicitITBlock(ARMCC::CondCodes Cond, unsigned Mask) {
+    assert(!inITBlock());
+    ITState.Cond = Cond;
+    ITState.Mask = Mask;
+    ITState.CurPosition = 0;
+    ITState.IsExplicit = true;
+    return;
+  }
+
+  void Note(SMLoc L, const Twine &Msg, SMRange Range = None) {
+    return getParser().Note(L, Msg, Range);
+  }
+  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) {
+    return getParser().Warning(L, Msg, Range);
+  }
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = None) {
+    return getParser().Error(L, Msg, Range);
+  }
+
+  bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands,
+                           unsigned ListNo, bool IsARPop = false);
+  bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
+                           unsigned ListNo);
+
+  int tryParseRegister();
+  bool tryParseRegisterWithWriteBack(OperandVector &);
+  int tryParseShiftRegister(OperandVector &);
+  bool parseRegisterList(OperandVector &);
+  bool parseMemory(OperandVector &);
+  bool parseOperand(OperandVector &, StringRef Mnemonic);
+  bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+  bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+                              unsigned &ShiftAmount);
+  bool parseLiteralValues(unsigned Size, SMLoc L);
+  bool parseDirectiveThumb(SMLoc L);
+  bool parseDirectiveARM(SMLoc L);
+  bool parseDirectiveThumbFunc(SMLoc L);
+  bool parseDirectiveCode(SMLoc L);
+  bool parseDirectiveSyntax(SMLoc L);
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);
+  bool parseDirectiveArch(SMLoc L);
+  bool parseDirectiveEabiAttr(SMLoc L);
+  bool parseDirectiveCPU(SMLoc L);
+  bool parseDirectiveFPU(SMLoc L);
+  bool parseDirectiveFnStart(SMLoc L);
+  bool parseDirectiveFnEnd(SMLoc L);
+  bool parseDirectiveCantUnwind(SMLoc L);
+  bool parseDirectivePersonality(SMLoc L);
+  bool parseDirectiveHandlerData(SMLoc L);
+  bool parseDirectiveSetFP(SMLoc L);
+  bool parseDirectivePad(SMLoc L);
+  bool parseDirectiveRegSave(SMLoc L, bool IsVector);
+  bool parseDirectiveInst(SMLoc L, char Suffix = '\0');
+  bool parseDirectiveLtorg(SMLoc L);
+  bool parseDirectiveEven(SMLoc L);
+  bool parseDirectivePersonalityIndex(SMLoc L);
+  bool parseDirectiveUnwindRaw(SMLoc L);
+  bool parseDirectiveTLSDescSeq(SMLoc L);
+  bool parseDirectiveMovSP(SMLoc L);
+  bool parseDirectiveObjectArch(SMLoc L);
+  bool parseDirectiveArchExtension(SMLoc L);
+  bool parseDirectiveAlign(SMLoc L);
+  bool parseDirectiveThumbSet(SMLoc L);
+
+  StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+                          bool &CarrySetting, unsigned &ProcessorIMod,
+                          StringRef &ITMask);
+  void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                             bool &CanAcceptCarrySet,
+                             bool &CanAcceptPredicationCode);
+
+  void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
+                                     OperandVector &Operands);
+  bool isThumb() const {
+    // FIXME: Can tablegen auto-generate this?
+    return getSTI().getFeatureBits()[ARM::ModeThumb];
+  }
+  bool isThumbOne() const {
+    return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2];
+  }
+  bool isThumbTwo() const {
+    return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2];
+  }
+  bool hasThumb() const {
+    return getSTI().getFeatureBits()[ARM::HasV4TOps];
+  }
+  bool hasThumb2() const {
+    return getSTI().getFeatureBits()[ARM::FeatureThumb2];
+  }
+  bool hasV6Ops() const {
+    return getSTI().getFeatureBits()[ARM::HasV6Ops];
+  }
+  bool hasV6T2Ops() const {
+    return getSTI().getFeatureBits()[ARM::HasV6T2Ops];
+  }
+  bool hasV6MOps() const {
+    return getSTI().getFeatureBits()[ARM::HasV6MOps];
+  }
+  bool hasV7Ops() const {
+    return getSTI().getFeatureBits()[ARM::HasV7Ops];
+  }
+  bool hasV8Ops() const {
+    return getSTI().getFeatureBits()[ARM::HasV8Ops];
+  }
+  bool hasV8MBaseline() const {
+    return getSTI().getFeatureBits()[ARM::HasV8MBaselineOps];
+  }
+  bool hasV8MMainline() const {
+    return getSTI().getFeatureBits()[ARM::HasV8MMainlineOps];
+  }
+  bool has8MSecExt() const {
+    return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
+  }
+  bool hasARM() const {
+    return !getSTI().getFeatureBits()[ARM::FeatureNoARM];
+  }
+  bool hasDSP() const {
+    return getSTI().getFeatureBits()[ARM::FeatureDSP];
+  }
+  bool hasD16() const {
+    return getSTI().getFeatureBits()[ARM::FeatureD16];
+  }
+  bool hasV8_1aOps() const {
+    return getSTI().getFeatureBits()[ARM::HasV8_1aOps];
+  }
+  bool hasRAS() const {
+    return getSTI().getFeatureBits()[ARM::FeatureRAS];
+  }
+
+  void SwitchMode() {
+    MCSubtargetInfo &STI = copySTI();
+    uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
+    setAvailableFeatures(FB);
+  }
+  void FixModeAfterArchChange(bool WasThumb, SMLoc Loc);
+  bool isMClass() const {
+    return getSTI().getFeatureBits()[ARM::FeatureMClass];
+  }
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "ARMGenAsmMatcher.inc"
+
+  /// }
+
+  OperandMatchResultTy parseITCondCode(OperandVector &);
+  OperandMatchResultTy parseCoprocNumOperand(OperandVector &);
+  OperandMatchResultTy parseCoprocRegOperand(OperandVector &);
+  OperandMatchResultTy parseCoprocOptionOperand(OperandVector &);
+  OperandMatchResultTy parseMemBarrierOptOperand(OperandVector &);
+  OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
+  OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
+  OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
+  OperandMatchResultTy parseBankedRegOperand(OperandVector &);
+  OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low,
+                                   int High);
+  OperandMatchResultTy parsePKHLSLImm(OperandVector &O) {
+    return parsePKHImm(O, "lsl", 0, 31);
+  }
+  OperandMatchResultTy parsePKHASRImm(OperandVector &O) {
+    return parsePKHImm(O, "asr", 1, 32);
+  }
+  OperandMatchResultTy parseSetEndImm(OperandVector &);
+  OperandMatchResultTy parseShifterImm(OperandVector &);
+  OperandMatchResultTy parseRotImm(OperandVector &);
+  OperandMatchResultTy parseModImm(OperandVector &);
+  OperandMatchResultTy parseBitfield(OperandVector &);
+  OperandMatchResultTy parsePostIdxReg(OperandVector &);
+  OperandMatchResultTy parseAM3Offset(OperandVector &);
+  OperandMatchResultTy parseFPImm(OperandVector &);
+  OperandMatchResultTy parseVectorList(OperandVector &);
+  OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index,
+                                       SMLoc &EndLoc);
+
+  // Asm Match Converter Methods
+  void cvtThumbMultiply(MCInst &Inst, const OperandVector &);
+  void cvtThumbBranches(MCInst &Inst, const OperandVector &);
+
+  bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
+  bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
+  bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+  bool isITBlockTerminator(MCInst &Inst) const;
+
+public:
+  enum ARMMatchResultTy {
+    Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_RequiresNotITBlock,
+    Match_RequiresV6,
+    Match_RequiresThumb2,
+    Match_RequiresV8,
+    Match_RequiresFlagSetting,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARMGenAsmMatcher.inc"
+
+  };
+
+  ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) {
+    MCAsmParserExtension::Initialize(Parser);
+
+    // Cache the MCRegisterInfo.
+    MRI = getContext().getRegisterInfo();
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+    // Not in an ITBlock to start with.
+    ITState.CurPosition = ~0U;
+
+    NextSymbolIsThumb = false;
+  }
+
+  // Implementation of the MCTargetAsmParser interface:
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+  unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  unsigned MatchInstruction(OperandVector &Operands, MCInst &Inst,
+                            uint64_t &ErrorInfo, bool MatchingInlineAsm,
+                            bool &EmitInITBlock, MCStreamer &Out);
+  void onLabelParsed(MCSymbol *Symbol) override;
+};
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// operand.
+class ARMOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    k_CondCode,
+    k_CCOut,
+    k_ITCondMask,
+    k_CoprocNum,
+    k_CoprocReg,
+    k_CoprocOption,
+    k_Immediate,
+    k_MemBarrierOpt,
+    k_InstSyncBarrierOpt,
+    k_Memory,
+    k_PostIndexRegister,
+    k_MSRMask,
+    k_BankedReg,
+    k_ProcIFlags,
+    k_VectorIndex,
+    k_Register,
+    k_RegisterList,
+    k_DPRRegisterList,
+    k_SPRRegisterList,
+    k_VectorList,
+    k_VectorListAllLanes,
+    k_VectorListIndexed,
+    k_ShiftedRegister,
+    k_ShiftedImmediate,
+    k_ShifterImmediate,
+    k_RotateImmediate,
+    k_ModifiedImmediate,
+    k_ConstantPoolImmediate,
+    k_BitfieldDescriptor,
+    k_Token,
+  } Kind;
+
+  SMLoc StartLoc, EndLoc, AlignmentLoc;
+  SmallVector<unsigned, 8> Registers;
+
+  struct CCOp {
+    ARMCC::CondCodes Val;
+  };
+
+  struct CopOp {
+    unsigned Val;
+  };
+
+  struct CoprocOptionOp {
+    unsigned Val;
+  };
+
+  struct ITMaskOp {
+    unsigned Mask:4;
+  };
+
+  struct MBOptOp {
+    ARM_MB::MemBOpt Val;
+  };
+
+  struct ISBOptOp {
+    ARM_ISB::InstSyncBOpt Val;
+  };
+
+  struct IFlagsOp {
+    ARM_PROC::IFlags Val;
+  };
+
+  struct MMaskOp {
+    unsigned Val;
+  };
+
+  struct BankedRegOp {
+    unsigned Val;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned LaneIndex;
+    bool isDoubleSpaced;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  /// Combined record for all forms of ARM address expressions.
+  struct MemoryOp {
+    unsigned BaseRegNum;
+    // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+    // was specified.
+    const MCConstantExpr *OffsetImm;  // Offset immediate value
+    unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
+    ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+    unsigned ShiftImm;        // shift for OffsetReg.
+    unsigned Alignment;       // 0 = no alignment specified
+    // n = alignment in bytes (2, 4, 8, 16, or 32)
+    unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
+  };
+
+  struct PostIdxRegOp {
+    unsigned RegNum;
+    bool isAdd;
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned ShiftImm;
+  };
+
+  struct ShifterImmOp {
+    bool isASR;
+    unsigned Imm;
+  };
+
+  struct RegShiftedRegOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftReg;
+    unsigned ShiftImm;
+  };
+
+  struct RegShiftedImmOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftImm;
+  };
+
+  struct RotImmOp {
+    unsigned Imm;
+  };
+
+  struct ModImmOp {
+    unsigned Bits;
+    unsigned Rot;
+  };
+
+  struct BitfieldOp {
+    unsigned LSB;
+    unsigned Width;
+  };
+
+  union {
+    struct CCOp CC;
+    struct CopOp Cop;
+    struct CoprocOptionOp CoprocOption;
+    struct MBOptOp MBOpt;
+    struct ISBOptOp ISBOpt;
+    struct ITMaskOp ITMask;
+    struct IFlagsOp IFlags;
+    struct MMaskOp MMask;
+    struct BankedRegOp BankedReg;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct MemoryOp Memory;
+    struct PostIdxRegOp PostIdxReg;
+    struct ShifterImmOp ShifterImm;
+    struct RegShiftedRegOp RegShiftedReg;
+    struct RegShiftedImmOp RegShiftedImm;
+    struct RotImmOp RotImm;
+    struct ModImmOp ModImm;
+    struct BitfieldOp Bitfield;
+  };
+
+public:
+  ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
+  /// getAlignmentLoc - Get the location of the Alignment token of this operand.
+  SMLoc getAlignmentLoc() const {
+    assert(Kind == k_Memory && "Invalid access!");
+    return AlignmentLoc;
+  }
+
+  ARMCC::CondCodes getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CC.Val;
+  }
+
+  unsigned getCoproc() const {
+    assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
+    return Cop.Val;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+            Kind == k_SPRRegisterList) && "Invalid access!");
+    return Registers;
+  }
+
+  const MCExpr *getImm() const {
+    assert(isImm() && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getConstantPoolImm() const {
+    assert(isConstantPoolImm() && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  ARM_MB::MemBOpt getMemBarrierOpt() const {
+    assert(Kind == k_MemBarrierOpt && "Invalid access!");
+    return MBOpt.Val;
+  }
+
+  ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+    assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+    return ISBOpt.Val;
+  }
+
+  ARM_PROC::IFlags getProcIFlags() const {
+    assert(Kind == k_ProcIFlags && "Invalid access!");
+    return IFlags.Val;
+  }
+
+  unsigned getMSRMask() const {
+    assert(Kind == k_MSRMask && "Invalid access!");
+    return MMask.Val;
+  }
+
+  unsigned getBankedReg() const {
+    assert(Kind == k_BankedReg && "Invalid access!");
+    return BankedReg.Val;
+  }
+
+  bool isCoprocNum() const { return Kind == k_CoprocNum; }
+  bool isCoprocReg() const { return Kind == k_CoprocReg; }
+  bool isCoprocOption() const { return Kind == k_CoprocOption; }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isCCOut() const { return Kind == k_CCOut; }
+  bool isITMask() const { return Kind == k_ITCondMask; }
+  bool isITCondCode() const { return Kind == k_CondCode; }
+  bool isImm() const override {
+    return Kind == k_Immediate;
+  }
+
+  bool isARMBranchTarget() const {
+    if (!isImm()) return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+      return CE->getValue() % 4 == 0;
+    return true;
+  }
+
+
+  bool isThumbBranchTarget() const {
+    if (!isImm()) return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()))
+      return CE->getValue() % 2 == 0;
+    return true;
+  }
+
+  // checks whether this operand is an unsigned offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isUnsignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << width) - 1);
+      return ((Val % Align) == 0) && (Val >= 0) && (Val <= Max);
+    }
+    return false;
+  }
+  // checks whether this operand is an signed offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isSignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << (width-1)) - 1);
+      int64_t Min = -Align * (1LL << (width-1));
+      return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+    }
+    return false;
+  }
+
+  // checks whether this operand is a memory operand computed as an offset
+  // applied to PC. the offset may have 8 bits of magnitude and is represented
+  // with two bits of shift. textually it may be either [pc, #imm], #imm or 
+  // relocable expression...
+  bool isThumbMemPC() const {
+    int64_t Val = 0;
+    if (isImm()) {
+      if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+      if (!CE) return false;
+      Val = CE->getValue();
+    }
+    else if (isMem()) {
+      if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
+      if(Memory.BaseRegNum != ARM::PC) return false;
+      Val = Memory.OffsetImm->getValue();
+    }
+    else return false;
+    return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
+  }
+  bool isFPImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    return Val != -1;
+  }
+  bool isFBits16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 16;
+  }
+  bool isFBits32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 1 && Value <= 32;
+  }
+  bool isImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+  }
+  bool isImm0_1020s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+  }
+  bool isImm0_508s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+  }
+  bool isImm0_508s4Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    // explicitly exclude zero. we want that to use the normal 0_508 version.
+    return ((Value & 3) == 0) && Value > 0 && Value <= 508;
+  }
+  bool isImm0_239() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 240;
+  }
+  bool isImm0_255() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 256;
+  }
+  bool isImm0_4095() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4096;
+  }
+  bool isImm0_4095Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    return Value > 0 && Value < 4096;
+  }
+  bool isImm0_1() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 2;
+  }
+  bool isImm0_3() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4;
+  }
+  bool isImm0_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 8;
+  }
+  bool isImm0_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 16;
+  }
+  bool isImm0_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isImm0_63() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 64;
+  }
+  bool isImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 8;
+  }
+  bool isImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 16;
+  }
+  bool isImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 32;
+  }
+  bool isShrImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 8;
+  }
+  bool isShrImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 16;
+  }
+  bool isShrImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isShrImm64() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 64;
+  }
+  bool isImm1_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 8;
+  }
+  bool isImm1_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 16;
+  }
+  bool isImm1_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 32;
+  }
+  bool isImm1_16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 17;
+  }
+  bool isImm1_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isImm0_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 33;
+  }
+  bool isImm0_65535() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm256_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 256 && Value < 65536;
+  }
+  bool isImm0_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm24bit() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 0xffffff;
+  }
+  bool isImmThumbSR() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isPKHLSLImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isPKHASRImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isAdrLabel() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    // If it is a constant, it must fit into a modified immediate encoding.
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return (ARM_AM::getSOImmVal(Value) != -1 ||
+            ARM_AM::getSOImmVal(-Value) != -1);
+  }
+  bool isT2SOImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) != -1;
+  }
+  bool isT2SOImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(~Value) != -1;
+  }
+  bool isT2SOImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // Only use this when not representable as a plain so_imm.
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(-Value) != -1;
+  }
+  bool isSetEndImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 1 || Value == 0;
+  }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isRegList() const { return Kind == k_RegisterList; }
+  bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+  bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+  bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
+  bool isMem() const override { return Kind == k_Memory; }
+  bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+  bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+  bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+  bool isRotImm() const { return Kind == k_RotateImmediate; }
+  bool isModImm() const { return Kind == k_ModifiedImmediate; }
+  bool isModImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(~Value) != -1;
+  }
+  bool isModImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(Value) == -1 &&
+      ARM_AM::getSOImmVal(-Value) != -1;
+  }
+  bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
+  bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+  bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+  bool isPostIdxReg() const {
+    return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
+  }
+  bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
+    if (!isMem())
+      return false;
+    // No offset of any kind.
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+     (alignOK || Memory.Alignment == Alignment);
+  }
+  bool isMemPCRelImm12() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base register must be PC.
+    if (Memory.BaseRegNum != ARM::PC)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isAlignedMemory() const {
+    return isMemNoOffset(true);
+  }
+  bool isAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128or256() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    if (isMemNoOffset(false, 32)) // alignment in bytes for 256-bits is 32.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAddrMode2() const {
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAM2OffsetImm() const {
+    if (!isImm()) return false;
+    // Immediate offset in range [-4095, 4095].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val == INT32_MIN) || (Val > -4096 && Val < 4096);
+  }
+  bool isAddrMode3() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // No shifts are legal for AM3.
+    if (Memory.ShiftType != ARM_AM::no_shift) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // The #-0 offset is encoded as INT32_MIN, and we have to check 
+    // for this too.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAM3Offset() const {
+    if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+      return false;
+    if (Kind == k_PostIndexRegister)
+      return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+    // Immediate offset in range [-255, 255].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAddrMode5() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-1020, 1020] and a multiple of 4.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+      Val == INT32_MIN;
+  }
+  bool isAddrMode5FP16() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-510, 510] and a multiple of 2.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -510 && Val <= 510 && ((Val & 1) == 0)) || Val == INT32_MIN;
+  }
+  bool isMemTBB() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemTBH() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+        Memory.Alignment != 0 )
+      return false;
+    return true;
+  }
+  bool isMemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isT2MemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
+      return false;
+    // Only lsl #{0, 1, 2, 3} allowed.
+    if (Memory.ShiftType == ARM_AM::no_shift)
+      return true;
+    if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
+      return false;
+    return true;
+  }
+  bool isMemThumbRR() const {
+    // Thumb reg+reg addressing is simple. Just two registers, a base and
+    // an offset. No shifts, negations or any other complicating factors.
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return isARMLowRegister(Memory.BaseRegNum) &&
+      (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+  }
+  bool isMemThumbRIs4() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 124].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+  }
+  bool isMemThumbRIs2() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 62].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+  }
+  bool isMemThumbRIs1() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 31].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 31;
+  }
+  bool isMemThumbSPI() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+  }
+  bool isMemImm8s4Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [-1020, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
+  }
+  bool isMemImm0_1020s4Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+  }
+  bool isMemImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 256);
+  }
+  bool isMemPosImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val < 256;
+  }
+  bool isMemNegImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, -1].
+    if (!Memory.OffsetImm) return false;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 0);
+  }
+  bool isMemUImm12Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= 0 && Val < 4096);
+  }
+  bool isMemImm12Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isConstPoolAsmImm() const {
+    // Delay processing of Constant Pool Immediate, this will turn into
+    // a constant. Match no other operand
+    return (isConstantPoolImm());
+  }
+  bool isPostIdxImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val > -256 && Val < 256) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) ||
+      (Val == INT32_MIN);
+  }
+
+  bool isMSRMask() const { return Kind == k_MSRMask; }
+  bool isBankedReg() const { return Kind == k_BankedReg; }
+  bool isProcIFlags() const { return Kind == k_ProcIFlags; }
+
+  // NEON operands.
+  bool isSingleSpacedVectorList() const {
+    return Kind == k_VectorList && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorList() const {
+    return Kind == k_VectorList && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPair() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListDPairSpaced() const {
+    if (Kind != k_VectorList) return false;
+    if (isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPairAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListDPairSpacedAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 2;
+  }
+
+  bool isVecListThreeDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListThreeQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListFourQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListOneDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListOneDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListTwoDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListThreeDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListFourDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVectorIndex8() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 8;
+  }
+  bool isVectorIndex16() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 4;
+  }
+  bool isVectorIndex32() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 2;
+  }
+
+  bool isNEONi8splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i8 value splatted across 8 bytes. The immediate is just the 8 byte
+    // value.
+    return Value >= 0 && Value < 256;
+  }
+
+  bool isNEONi16splat() const {
+    if (isNEONByteReplicate(2))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(Value);
+  }
+
+  bool isNEONi16splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(~Value & 0xffff);
+  }
+
+  bool isNEONi32splat() const {
+    if (isNEONByteReplicate(4))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(Value);
+  }
+
+  bool isNEONi32splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(~Value);
+  }
+
+  bool isNEONByteReplicate(unsigned NumBytes) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    if (!Value)
+      return false; // Don't bother with zero.
+
+    unsigned char B = Value & 0xff;
+    for (unsigned i = 1; i < NumBytes; ++i) {
+      Value >>= 8;
+      if ((Value & 0xff) != B)
+        return false;
+    }
+    return true;
+  }
+  bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
+  bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
+  bool isNEONi32vmov() const {
+    if (isNEONByteReplicate(4))
+      return false; // Let it to be classified as byte-replicate case.
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    // FIXME: This is probably wrong and a copy and paste from previous example
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+  bool isNEONi32vmovNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = ~CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    // FIXME: This is probably wrong and a copy and paste from previous example
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+
+  bool isNEONi64splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+    // i64 value with each byte being either 0 or 0xff.
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8)
+      if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false;
+    return true;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addARMBranchTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addThumbBranchTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
+    unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+    Inst.addOperand(MCOperand::createReg(RegNum));
+  }
+
+  void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getCoproc()));
+  }
+
+  void addCoprocRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getCoproc()));
+  }
+
+  void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(CoprocOption.Val));
+  }
+
+  void addITMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(ITMask.Mask));
+  }
+
+  void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getCondCode())));
+  }
+
+  void addCCOutOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    assert(isRegShiftedReg() &&
+           "addRegShiftedRegOperands() on non-RegShiftedReg!");
+    Inst.addOperand(MCOperand::createReg(RegShiftedReg.SrcReg));
+    Inst.addOperand(MCOperand::createReg(RegShiftedReg.ShiftReg));
+    Inst.addOperand(MCOperand::createImm(
+      ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm)));
+  }
+
+  void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    assert(isRegShiftedImm() &&
+           "addRegShiftedImmOperands() on non-RegShiftedImm!");
+    Inst.addOperand(MCOperand::createReg(RegShiftedImm.SrcReg));
+    // Shift of #32 is encoded as 0 where permitted
+    unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 0 : RegShiftedImm.ShiftImm);
+    Inst.addOperand(MCOperand::createImm(
+      ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm)));
+  }
+
+  void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm((ShifterImm.isASR << 5) |
+                                         ShifterImm.Imm));
+  }
+
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ++I)
+      Inst.addOperand(MCOperand::createReg(*I));
+  }
+
+  void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addRotImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Encoded as val>>3. The printer handles display as 8, 16, 24.
+    Inst.addOperand(MCOperand::createImm(RotImm.Imm >> 3));
+  }
+
+  void addModImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    // Support for fixups (MCFixup)
+    if (isImm())
+      return addImmOperands(Inst, N);
+
+    Inst.addOperand(MCOperand::createImm(ModImm.Bits | (ModImm.Rot << 7)));
+  }
+
+  void addModImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(~CE->getValue());
+    Inst.addOperand(MCOperand::createImm(Enc));
+  }
+
+  void addModImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Enc = ARM_AM::getSOImmVal(-CE->getValue());
+    Inst.addOperand(MCOperand::createImm(Enc));
+  }
+
+  void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Munge the lsb/width into a bitfield mask.
+    unsigned lsb = Bitfield.LSB;
+    unsigned width = Bitfield.Width;
+    // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+    uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+                      (32 - (lsb + width)));
+    Inst.addOperand(MCOperand::createImm(Mask));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addFBits16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(16 - CE->getValue()));
+  }
+
+  void addFBits32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(32 - CE->getValue()));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // FIXME: We really want to scale the value here, but the LDRD/STRD
+    // instruction don't encode operands that way yet.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue()));
+  }
+
+  void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+  }
+
+  void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(-(CE->getValue() / 4)));
+  }
+
+  void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue() / 4));
+  }
+
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
+  }
+
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue() - 1));
+  }
+
+  void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate, except for 32, which encodes as
+    // zero.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Imm = CE->getValue();
+    Inst.addOperand(MCOperand::createImm((Imm == 32 ? 0 : Imm)));
+  }
+
+  void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // An ASR value of 32 encodes as 0, so that's how we want to add it to
+    // the instruction as well.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::createImm(Val == 32 ? 0 : Val));
+  }
+
+  void addT2SOImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(~CE->getValue()));
+  }
+
+  void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+  }
+
+  void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually an imm0_4095, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+  }
+
+  void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
+    if(const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::createImm(CE->getValue() >> 2));
+      return;
+    }
+
+    const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+    assert(SR && "Unknown value type!");
+    Inst.addOperand(MCOperand::createExpr(SR));
+  }
+
+  void addThumbMemPCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (isImm()) {
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+      if (CE) {
+        Inst.addOperand(MCOperand::createImm(CE->getValue()));
+        return;
+      }
+
+      const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+ 
+      assert(SR && "Unknown value type!");
+      Inst.addOperand(MCOperand::createExpr(SR));
+      return;
+    }
+
+    assert(isMem()  && "Unknown value type!");
+    assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
+    Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue()));
+  }
+
+  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getMemBarrierOpt())));
+  }
+
+  void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getInstSyncBarrierOpt())));
+  }
+
+  void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+  }
+
+  void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    int32_t Imm = Memory.OffsetImm->getValue();
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(isImm() && "Not an immediate!");
+
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. 
+    if (!isa<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      return;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Memory.Alignment));
+  }
+
+  void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                              Memory.ShiftImm, Memory.ShiftType);
+    }
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant AM2OffsetImm operand!");
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    Inst.addOperand(MCOperand::createReg(0));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      Inst.addOperand(MCOperand::createReg(0));
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM3Opc(AddSub, Val);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+    }
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (Kind == k_PostIndexRegister) {
+      int32_t Val =
+        ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+      Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum));
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    // Constant offset.
+    const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm());
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM3Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::createReg(0));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM5Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addAddrMode5FP16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    // The lower bit is always zero and as such is not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 2 : 0;
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM5FP16Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::createImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addConstPoolAsmImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // This is container for the immediate that we will create the constant
+    // pool from
+    addExpr(Inst, getConstantPoolImm());
+    return;
+  }
+
+  void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+  }
+
+  void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+  }
+
+  void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    unsigned Val =
+      ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                        Memory.ShiftImm, Memory.ShiftType);
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::createImm(Memory.ShiftImm));
+  }
+
+  void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
+  }
+
+  void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8s4 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    // Immediate is scaled by 4.
+    Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum));
+    Inst.addOperand(MCOperand::createImm(PostIdxReg.isAdd));
+  }
+
+  void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(PostIdxReg.RegNum));
+    // The sign, shift type, and shift amount are encoded in a single operand
+    // using the AM2 encoding helpers.
+    ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+    unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm,
+                                     PostIdxReg.ShiftTy);
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getMSRMask())));
+  }
+
+  void addBankedRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getBankedReg())));
+  }
+
+  void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(unsigned(getProcIFlags())));
+  }
+
+  void addVecListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VectorList.RegNum));
+  }
+
+  void addVecListIndexedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VectorList.RegNum));
+    Inst.addOperand(MCOperand::createImm(VectorList.LaneIndex));
+  }
+
+  void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createImm(getVectorIndex()));
+  }
+
+  void addNEONi8splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    // Mask in that this is an i8 splat.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::createImm(CE->getValue() | 0xe00));
+  }
+
+  void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi16splat(Value);
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff);
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi32splat(Value);
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi32splat(~Value);
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All vmvn instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = ((~Value) & 0xff);
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::createImm(B));
+  }
+  void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = Value & 0xff;
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::createImm(B));
+  }
+  void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = ~CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::createImm(Value));
+  }
+
+  void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    unsigned Imm = 0;
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+      Imm |= (Value & 1) << i;
+    }
+    Inst.addOperand(MCOperand::createImm(Imm | 0x1e00));
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<ARMOperand> CreateITMask(unsigned Mask, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_ITCondMask);
+    Op->ITMask.Mask = Mask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCondCode(ARMCC::CondCodes CC,
+                                                    SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CondCode);
+    Op->CC.Val = CC;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocNum(unsigned CopVal, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CoprocNum);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocReg(unsigned CopVal, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CoprocReg);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCoprocOption(unsigned Val, SMLoc S,
+                                                        SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_CoprocOption);
+    Op->Cop.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateCCOut(unsigned RegNum, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_CCOut);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateToken(StringRef Str, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateReg(unsigned RegNum, SMLoc S,
+                                               SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateShiftedRegister(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                        unsigned ShiftReg, unsigned ShiftImm, SMLoc S,
+                        SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShiftedRegister);
+    Op->RegShiftedReg.ShiftTy = ShTy;
+    Op->RegShiftedReg.SrcReg = SrcReg;
+    Op->RegShiftedReg.ShiftReg = ShiftReg;
+    Op->RegShiftedReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy, unsigned SrcReg,
+                         unsigned ShiftImm, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShiftedImmediate);
+    Op->RegShiftedImm.ShiftTy = ShTy;
+    Op->RegShiftedImm.SrcReg = SrcReg;
+    Op->RegShiftedImm.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateShifterImm(bool isASR, unsigned Imm,
+                                                      SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ShifterImmediate);
+    Op->ShifterImm.isASR = isASR;
+    Op->ShifterImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateRotImm(unsigned Imm, SMLoc S,
+                                                  SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_RotateImmediate);
+    Op->RotImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateModImm(unsigned Bits, unsigned Rot,
+                                                  SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ModifiedImmediate);
+    Op->ModImm.Bits = Bits;
+    Op->ModImm.Rot = Rot;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateConstantPoolImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_ConstantPoolImmediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateBitfield(unsigned LSB, unsigned Width, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_BitfieldDescriptor);
+    Op->Bitfield.LSB = LSB;
+    Op->Bitfield.Width = Width;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+                SMLoc StartLoc, SMLoc EndLoc) {
+    assert (Regs.size() > 0 && "RegList contains no registers?");
+    KindTy Kind = k_RegisterList;
+
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second))
+      Kind = k_DPRRegisterList;
+    else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
+             contains(Regs.front().second))
+      Kind = k_SPRRegisterList;
+
+    // Sort based on the register encoding values.
+    array_pod_sort(Regs.begin(), Regs.end());
+
+    auto Op = make_unique<ARMOperand>(Kind);
+    for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator
+           I = Regs.begin(), E = Regs.end(); I != E; ++I)
+      Op->Registers.push_back(I->second);
+    Op->StartLoc = StartLoc;
+    Op->EndLoc = EndLoc;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateVectorList(unsigned RegNum,
+                                                      unsigned Count,
+                                                      bool isDoubleSpaced,
+                                                      SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorList);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorListAllLanes(unsigned RegNum, unsigned Count, bool isDoubleSpaced,
+                           SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorListAllLanes);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorListIndexed(unsigned RegNum, unsigned Count, unsigned Index,
+                          bool isDoubleSpaced, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_VectorListIndexed);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.LaneIndex = Index;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
+    auto Op = make_unique<ARMOperand>(k_VectorIndex);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                               SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateMem(unsigned BaseRegNum, const MCConstantExpr *OffsetImm,
+            unsigned OffsetRegNum, ARM_AM::ShiftOpc ShiftType,
+            unsigned ShiftImm, unsigned Alignment, bool isNegative, SMLoc S,
+            SMLoc E, SMLoc AlignmentLoc = SMLoc()) {
+    auto Op = make_unique<ARMOperand>(k_Memory);
+    Op->Memory.BaseRegNum = BaseRegNum;
+    Op->Memory.OffsetImm = OffsetImm;
+    Op->Memory.OffsetRegNum = OffsetRegNum;
+    Op->Memory.ShiftType = ShiftType;
+    Op->Memory.ShiftImm = ShiftImm;
+    Op->Memory.Alignment = Alignment;
+    Op->Memory.isNegative = isNegative;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->AlignmentLoc = AlignmentLoc;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreatePostIdxReg(unsigned RegNum, bool isAdd, ARM_AM::ShiftOpc ShiftTy,
+                   unsigned ShiftImm, SMLoc S, SMLoc E) {
+    auto Op = make_unique<ARMOperand>(k_PostIndexRegister);
+    Op->PostIdxReg.RegNum = RegNum;
+    Op->PostIdxReg.isAdd = isAdd;
+    Op->PostIdxReg.ShiftTy = ShiftTy;
+    Op->PostIdxReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateMemBarrierOpt(ARM_MB::MemBOpt Opt,
+                                                         SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_MemBarrierOpt);
+    Op->MBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand>
+  CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_InstSyncBarrierOpt);
+    Op->ISBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateProcIFlags(ARM_PROC::IFlags IFlags,
+                                                      SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_ProcIFlags);
+    Op->IFlags.Val = IFlags;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateMSRMask(unsigned MMask, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_MSRMask);
+    Op->MMask.Val = MMask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_BankedReg);
+    Op->BankedReg.Val = Reg;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void ARMOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_CondCode:
+    OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
+    break;
+  case k_CCOut:
+    OS << "<ccout " << getReg() << ">";
+    break;
+  case k_ITCondMask: {
+    static const char *const MaskStr[] = {
+      "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
+      "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+    };
+    assert((ITMask.Mask & 0xf) == ITMask.Mask);
+    OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
+    break;
+  }
+  case k_CoprocNum:
+    OS << "<coprocessor number: " << getCoproc() << ">";
+    break;
+  case k_CoprocReg:
+    OS << "<coprocessor register: " << getCoproc() << ">";
+    break;
+  case k_CoprocOption:
+    OS << "<coprocessor option: " << CoprocOption.Val << ">";
+    break;
+  case k_MSRMask:
+    OS << "<mask: " << getMSRMask() << ">";
+    break;
+  case k_BankedReg:
+    OS << "<banked reg: " << getBankedReg() << ">";
+    break;
+  case k_Immediate:
+    OS << *getImm();
+    break;
+  case k_MemBarrierOpt:
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
+    break;
+  case k_InstSyncBarrierOpt:
+    OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
+    break;
+  case k_Memory:
+    OS << "<memory "
+       << " base:" << Memory.BaseRegNum;
+    OS << ">";
+    break;
+  case k_PostIndexRegister:
+    OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+       << PostIdxReg.RegNum;
+    if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+      OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+         << PostIdxReg.ShiftImm;
+    OS << ">";
+    break;
+  case k_ProcIFlags: {
+    OS << "<ARM_PROC::";
+    unsigned IFlags = getProcIFlags();
+    for (int i=2; i >= 0; --i)
+      if (IFlags & (1 << i))
+        OS << ARM_PROC::IFlagsToString(1 << i);
+    OS << ">";
+    break;
+  }
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_ShifterImmediate:
+    OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+       << " #" << ShifterImm.Imm << ">";
+    break;
+  case k_ShiftedRegister:
+    OS << "<so_reg_reg "
+       << RegShiftedReg.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
+       << " " << RegShiftedReg.ShiftReg << ">";
+    break;
+  case k_ShiftedImmediate:
+    OS << "<so_reg_imm "
+       << RegShiftedImm.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
+       << " #" << RegShiftedImm.ShiftImm << ">";
+    break;
+  case k_RotateImmediate:
+    OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+    break;
+  case k_ModifiedImmediate:
+    OS << "<mod_imm #" << ModImm.Bits << ", #"
+       <<  ModImm.Rot << ")>";
+    break;
+  case k_ConstantPoolImmediate:
+    OS << "<constant_pool_imm #" << *getConstantPoolImm();
+    break;
+  case k_BitfieldDescriptor:
+    OS << "<bitfield " << "lsb: " << Bitfield.LSB
+       << ", width: " << Bitfield.Width << ">";
+    break;
+  case k_RegisterList:
+  case k_DPRRegisterList:
+  case k_SPRRegisterList: {
+    OS << "<register_list ";
+
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ) {
+      OS << *I;
+      if (++I < E) OS << ", ";
+    }
+
+    OS << ">";
+    break;
+  }
+  case k_VectorList:
+    OS << "<vector_list " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListAllLanes:
+    OS << "<vector_list(all lanes) " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListIndexed:
+    OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
+       << VectorList.Count << " * " << VectorList.RegNum << ">";
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+bool ARMAsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken &Tok = getParser().getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = tryParseRegister();
+
+  return (RegNo == (unsigned)-1);
+}
+
+/// Try to parse a register name.  The token must be an Identifier when called,
+/// and if it is a register name the token is eaten and the register number is
+/// returned.  Otherwise return -1.
+///
+int ARMAsmParser::tryParseRegister() {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) return -1;
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = MatchRegisterName(lowerCase);
+  if (!RegNum) {
+    RegNum = StringSwitch<unsigned>(lowerCase)
+      .Case("r13", ARM::SP)
+      .Case("r14", ARM::LR)
+      .Case("r15", ARM::PC)
+      .Case("ip", ARM::R12)
+      // Additional register name aliases for 'gas' compatibility.
+      .Case("a1", ARM::R0)
+      .Case("a2", ARM::R1)
+      .Case("a3", ARM::R2)
+      .Case("a4", ARM::R3)
+      .Case("v1", ARM::R4)
+      .Case("v2", ARM::R5)
+      .Case("v3", ARM::R6)
+      .Case("v4", ARM::R7)
+      .Case("v5", ARM::R8)
+      .Case("v6", ARM::R9)
+      .Case("v7", ARM::R10)
+      .Case("v8", ARM::R11)
+      .Case("sb", ARM::R9)
+      .Case("sl", ARM::R10)
+      .Case("fp", ARM::R11)
+      .Default(0);
+  }
+  if (!RegNum) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    StringMap<unsigned>::const_iterator Entry = RegisterReqs.find(lowerCase);
+    // If no match, return failure.
+    if (Entry == RegisterReqs.end())
+      return -1;
+    Parser.Lex(); // Eat identifier token.
+    return Entry->getValue();
+  }
+
+  // Some FPUs only have 16 D registers, so D16-D31 are invalid
+  if (hasD16() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+    return -1;
+
+  Parser.Lex(); // Eat identifier token.
+
+  return RegNum;
+}
+
+// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return -1; 
+
+  std::string lowerCase = Tok.getString().lower();
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+      .Case("asl", ARM_AM::lsl)
+      .Case("lsl", ARM_AM::lsl)
+      .Case("lsr", ARM_AM::lsr)
+      .Case("asr", ARM_AM::asr)
+      .Case("ror", ARM_AM::ror)
+      .Case("rrx", ARM_AM::rrx)
+      .Default(ARM_AM::no_shift);
+
+  if (ShiftTy == ARM_AM::no_shift)
+    return 1;
+
+  Parser.Lex(); // Eat the operator.
+
+  // The source register for the shift has already been added to the
+  // operand list, so we need to pop it off and combine it into the shifted
+  // register operand instead.
+  std::unique_ptr<ARMOperand> PrevOp(
+      (ARMOperand *)Operands.pop_back_val().release());
+  if (!PrevOp->isReg())
+    return Error(PrevOp->getStartLoc(), "shift must be of a register");
+  int SrcReg = PrevOp->getReg();
+
+  SMLoc EndLoc;
+  int64_t Imm = 0;
+  int ShiftReg = 0;
+  if (ShiftTy == ARM_AM::rrx) {
+    // RRX Doesn't have an explicit shift amount. The encoder expects
+    // the shift register to be the same as the source register. Seems odd,
+    // but OK.
+    ShiftReg = SrcReg;
+  } else {
+    // Figure out if this is shifted by a constant or a register (for non-RRX).
+    if (Parser.getTok().is(AsmToken::Hash) ||
+        Parser.getTok().is(AsmToken::Dollar)) {
+      Parser.Lex(); // Eat hash.
+      SMLoc ImmLoc = Parser.getTok().getLoc();
+      const MCExpr *ShiftExpr = nullptr;
+      if (getParser().parseExpression(ShiftExpr, EndLoc)) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // The expression must be evaluatable as an immediate.
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr);
+      if (!CE) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // Range check the immediate.
+      // lsl, ror: 0 <= imm <= 31
+      // lsr, asr: 0 <= imm <= 32
+      Imm = CE->getValue();
+      if (Imm < 0 ||
+          ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) ||
+          ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) {
+        Error(ImmLoc, "immediate shift value out of range");
+        return -1;
+      }
+      // shift by zero is a nop. Always send it through as lsl.
+      // ('as' compatibility)
+      if (Imm == 0)
+        ShiftTy = ARM_AM::lsl;
+    } else if (Parser.getTok().is(AsmToken::Identifier)) {
+      SMLoc L = Parser.getTok().getLoc();
+      EndLoc = Parser.getTok().getEndLoc();
+      ShiftReg = tryParseRegister();
+      if (ShiftReg == -1) {
+        Error(L, "expected immediate or register in shift operand");
+        return -1;
+      }
+    } else {
+      Error(Parser.getTok().getLoc(),
+            "expected immediate or register in shift operand");
+      return -1;
+    }
+  }
+
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                                         S, EndLoc));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
+                                                          S, EndLoc));
+
+  return 0;
+}
+
+
+/// Try to parse a register name.  The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback". 'true' if it's not a register.
+///
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &RegTok = Parser.getTok();
+  int RegNo = tryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, RegTok.getLoc(),
+                                           RegTok.getEndLoc()));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return true;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE)
+      return TokError("immediate value expected for vector index");
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+
+    SMLoc E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse an coprocessor related
+/// instruction with a symbolic operand name.
+/// We accept "crN" syntax for GAS compatibility.
+/// <operand-name> ::= <prefix><number>
+/// If CoprocOp is 'c', then:
+///   <prefix> ::= c | cr
+/// If CoprocOp is 'p', then :
+///   <prefix> ::= p
+/// <number> ::= integer in range [0, 15]
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  // Use the same layout as the tablegen'erated register name matcher. Ugly,
+  // but efficient.
+  if (Name.size() < 2 || Name[0] != CoprocOp)
+    return -1;
+  Name = (Name[1] == 'r') ? Name.drop_front(2) : Name.drop_front();
+
+  switch (Name.size()) {
+  default: return -1;
+  case 1:
+    switch (Name[0]) {
+    default:  return -1;
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    }
+  case 2:
+    if (Name[0] != '1')
+      return -1;
+    switch (Name[1]) {
+    default:  return -1;
+    // CP10 and CP11 are VFP/NEON and so vector instructions should be used.
+    // However, old cores (v5/v6) did use them in that way.
+    case '0': return 10;
+    case '1': return 11;
+    case '2': return 12;
+    case '3': return 13;
+    case '4': return 14;
+    case '5': return 15;
+    }
+  }
+}
+
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+OperandMatchResultTy
+ARMAsmParser::parseITCondCode(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  unsigned CC = StringSwitch<unsigned>(Tok.getString().lower())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(~0U);
+  if (CC == ~0U)
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the token.
+
+  Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+OperandMatchResultTy
+ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (Num == -1)
+    return MatchOperand_NoMatch;
+  // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions
+  if ((hasV7Ops() || hasV8Ops()) && (Num == 10 || Num == 11))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+OperandMatchResultTy
+ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (Reg == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocOptionOperand - Try to parse an coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+OperandMatchResultTy
+ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'
+
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+
+  // Check for and consume the closing '}'
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return MatchOperand_ParseFail;
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the '}'
+
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
+
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+  // If this is a GPR, we need to do it manually, otherwise we can rely
+  // on the sort ordering of the enumeration since the other reg-classes
+  // are sane.
+  if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    return Reg + 1;
+  switch(Reg) {
+  default: llvm_unreachable("Invalid GPR number!");
+  case ARM::R0:  return ARM::R1;  case ARM::R1:  return ARM::R2;
+  case ARM::R2:  return ARM::R3;  case ARM::R3:  return ARM::R4;
+  case ARM::R4:  return ARM::R5;  case ARM::R5:  return ARM::R6;
+  case ARM::R6:  return ARM::R7;  case ARM::R7:  return ARM::R8;
+  case ARM::R8:  return ARM::R9;  case ARM::R9:  return ARM::R10;
+  case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+  case ARM::R12: return ARM::SP;  case ARM::SP:  return ARM::LR;
+  case ARM::LR:  return ARM::PC;  case ARM::PC:  return ARM::R0;
+  }
+}
+
+// Return the low-subreg of a given Q register.
+static unsigned getDRegFromQReg(unsigned QReg) {
+  switch (QReg) {
+  default: llvm_unreachable("expected a Q register!");
+  case ARM::Q0:  return ARM::D0;
+  case ARM::Q1:  return ARM::D2;
+  case ARM::Q2:  return ARM::D4;
+  case ARM::Q3:  return ARM::D6;
+  case ARM::Q4:  return ARM::D8;
+  case ARM::Q5:  return ARM::D10;
+  case ARM::Q6:  return ARM::D12;
+  case ARM::Q7:  return ARM::D14;
+  case ARM::Q8:  return ARM::D16;
+  case ARM::Q9:  return ARM::D18;
+  case ARM::Q10: return ARM::D20;
+  case ARM::Q11: return ARM::D22;
+  case ARM::Q12: return ARM::D24;
+  case ARM::Q13: return ARM::D26;
+  case ARM::Q14: return ARM::D28;
+  case ARM::Q15: return ARM::D30;
+  }
+}
+
+/// Parse a register list.
+bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return TokError("Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  // Check the first register in the list to see what register class
+  // this is a list of.
+  int Reg = tryParseRegister();
+  if (Reg == -1)
+    return Error(RegLoc, "register expected");
+
+  // The reglist instructions have at most 16 registers, so reserve
+  // space for that many.
+  int EReg = 0;
+  SmallVector<std::pair<unsigned, unsigned>, 16> Registers;
+
+  // Allow Q regs and just interpret them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    Reg = getDRegFromQReg(Reg);
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    ++Reg;
+  }
+  const MCRegisterClass *RC;
+  if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+  else
+    return Error(RegLoc, "invalid register in register list");
+
+  // Store the register.
+  EReg = MRI->getEncodingValue(Reg);
+  Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+
+  // This starts immediately after the first register token in the list,
+  // so we can see either a comma or a minus (range separator) as a legal
+  // next token.
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1)
+        return Error(AfterMinusLoc, "register expected");
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!RC->contains(EndReg))
+        return Error(AfterMinusLoc, "invalid register in register list");
+      // Ranges must go from low to high.
+      if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg))
+        return Error(AfterMinusLoc, "bad range in register list");
+
+      // Add all the registers in the range to the register list.
+      while (Reg != EndReg) {
+        Reg = getNextRegister(Reg);
+        EReg = MRI->getEncodingValue(Reg);
+        Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+      }
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    const AsmToken RegTok = Parser.getTok();
+    Reg = tryParseRegister();
+    if (Reg == -1)
+      return Error(RegLoc, "register expected");
+    // Allow Q regs and just interpret them as the two D sub-registers.
+    bool isQReg = false;
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      isQReg = true;
+    }
+    // The register must be in the same register class as the first.
+    if (!RC->contains(Reg))
+      return Error(RegLoc, "invalid register in register list");
+    // List must be monotonically increasing.
+    if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+      if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+        Warning(RegLoc, "register list not in ascending order");
+      else
+        return Error(RegLoc, "register list not in ascending order");
+    }
+    if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
+      Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+              ") in register list");
+      continue;
+    }
+    // VFP register lists must also be contiguous.
+    if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+        Reg != OldReg + 1)
+      return Error(RegLoc, "non-contiguous register range");
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    if (isQReg) {
+      EReg = MRI->getEncodingValue(++Reg);
+      Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(Parser.getTok().getLoc(), "'}' expected");
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  // Push the register list operand.
+  Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
+
+  // The ARM system instruction variants for LDM/STM have a '^' token here.
+  if (Parser.getTok().is(AsmToken::Caret)) {
+    Operands.push_back(ARMOperand::CreateToken("^",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat '^' token.
+  }
+
+  return false;
+}
+
+// Helper function to parse the lane index for vector lists.
+OperandMatchResultTy ARMAsmParser::
+parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
+  Index = 0; // Always return a defined index value.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    Parser.Lex(); // Eat the '['.
+    if (Parser.getTok().is(AsmToken::RBrac)) {
+      // "Dn[]" is the 'all lanes' syntax.
+      LaneKind = AllLanes;
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat the ']'.
+      return MatchOperand_Success;
+    }
+
+    // There's an optional '#' token here. Normally there wouldn't be, but
+    // inline assemble puts one in, and it's friendly to accept that.
+    if (Parser.getTok().is(AsmToken::Hash))
+      Parser.Lex(); // Eat '#' or '$'.
+
+    const MCExpr *LaneIndex;
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (getParser().parseExpression(LaneIndex)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LaneIndex);
+    if (!CE) {
+      Error(Loc, "lane index must be empty or an integer");
+      return MatchOperand_ParseFail;
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "']' expected");
+      return MatchOperand_ParseFail;
+    }
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat the ']'.
+    int64_t Val = CE->getValue();
+
+    // FIXME: Make this range check context sensitive for .8, .16, .32.
+    if (Val < 0 || Val > 7) {
+      Error(Parser.getTok().getLoc(), "lane index out of range");
+      return MatchOperand_ParseFail;
+    }
+    Index = Val;
+    LaneKind = IndexedLane;
+    return MatchOperand_Success;
+  }
+  LaneKind = NoLanes;
+  return MatchOperand_Success;
+}
+
+// parse a vector register list
+OperandMatchResultTy
+ARMAsmParser::parseVectorList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  VectorLaneTy LaneKind;
+  unsigned LaneIndex;
+  SMLoc S = Parser.getTok().getLoc();
+  // As an extension (to match gas), support a plain D register or Q register
+  // (without encosing curly braces) as a single or double entry list,
+  // respectively.
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    SMLoc E = Parser.getTok().getEndLoc();
+    int Reg = tryParseRegister();
+    if (Reg == -1)
+      return MatchOperand_NoMatch;
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+        break;
+      case AllLanes:
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+        break;
+      case AllLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    Error(S, "vector register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    Error(RegLoc, "register expected");
+    return MatchOperand_ParseFail;
+  }
+  unsigned Count = 1;
+  int Spacing = 0;
+  unsigned FirstReg = Reg;
+  // The list is of D registers, but we also allow Q regs and just interpret
+  // them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    FirstReg = Reg = getDRegFromQReg(Reg);
+    Spacing = 1; // double-spacing requires explicit D registers, otherwise
+                 // it's ambiguous with four-register single spaced.
+    ++Reg;
+    ++Count;
+  }
+
+  SMLoc E;
+  if (parseVectorLane(LaneKind, LaneIndex, E) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(Parser.getTok().getLoc(),
+              "sequential registers in double spaced list");
+        return MatchOperand_ParseFail;
+      }
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1) {
+        Error(AfterMinusLoc, "register expected");
+        return MatchOperand_ParseFail;
+      }
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+        Error(AfterMinusLoc, "invalid register in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Ranges must go from low to high.
+      if (Reg > EndReg) {
+        Error(AfterMinusLoc, "bad range in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(AfterMinusLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+
+      // Add all the registers in the range to the register list.
+      Count += EndReg - Reg;
+      Reg = EndReg;
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    Reg = tryParseRegister();
+    if (Reg == -1) {
+      Error(RegLoc, "register expected");
+      return MatchOperand_ParseFail;
+    }
+    // vector register lists must be contiguous.
+    // It's OK to use the enumeration values directly here rather, as the
+    // VFP register classes have the enum sorted properly.
+    //
+    // The list is of D registers, but we also allow Q regs and just interpret
+    // them as the two D sub-registers.
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(RegLoc,
+              "invalid register in double-spaced list (must be 'D' register')");
+        return MatchOperand_ParseFail;
+      }
+      Reg = getDRegFromQReg(Reg);
+      if (Reg != OldReg + 1) {
+        Error(RegLoc, "non-contiguous register range");
+        return MatchOperand_ParseFail;
+      }
+      ++Reg;
+      Count += 2;
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      SMLoc LaneLoc = Parser.getTok().getLoc();
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(LaneLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+      continue;
+    }
+    // Normal D register.
+    // Figure out the register spacing (single or double) of the list if
+    // we don't know it already.
+    if (!Spacing)
+      Spacing = 1 + (Reg == OldReg + 2);
+
+    // Just check that it's contiguous and keep going.
+    if (Reg != OldReg + Spacing) {
+      Error(RegLoc, "non-contiguous register range");
+      return MatchOperand_ParseFail;
+    }
+    ++Count;
+    // Parse the lane specifier if present.
+    VectorLaneTy NextLaneKind;
+    unsigned NextLaneIndex;
+    SMLoc EndLoc = Parser.getTok().getLoc();
+    if (parseVectorLane(NextLaneKind, NextLaneIndex, E) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+    if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+      Error(EndLoc, "mismatched lane index in register list");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  switch (LaneKind) {
+  case NoLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+
+    Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count,
+                                                    (Spacing == 2), S, E));
+    break;
+  case AllLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+    Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count,
+                                                            (Spacing == 2),
+                                                            S, E));
+    break;
+  case IndexedLane:
+    Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count,
+                                                           LaneIndex,
+                                                           (Spacing == 2),
+                                                           S, E));
+    break;
+  }
+  return MatchOperand_Success;
+}
+
+/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+OperandMatchResultTy
+ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
+      .Case("sy",    ARM_MB::SY)
+      .Case("st",    ARM_MB::ST)
+      .Case("ld",    ARM_MB::LD)
+      .Case("sh",    ARM_MB::ISH)
+      .Case("ish",   ARM_MB::ISH)
+      .Case("shst",  ARM_MB::ISHST)
+      .Case("ishst", ARM_MB::ISHST)
+      .Case("ishld", ARM_MB::ISHLD)
+      .Case("nsh",   ARM_MB::NSH)
+      .Case("un",    ARM_MB::NSH)
+      .Case("nshst", ARM_MB::NSHST)
+      .Case("nshld", ARM_MB::NSHLD)
+      .Case("unst",  ARM_MB::NSHST)
+      .Case("osh",   ARM_MB::OSH)
+      .Case("oshst", ARM_MB::OSHST)
+      .Case("oshld", ARM_MB::OSHLD)
+      .Default(~0U);
+
+    // ishld, oshld, nshld and ld are only available from ARMv8.
+    if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+                        Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+      Opt = ~0U;
+
+    if (Opt == ~0U)
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *MemBarrierID;
+    if (getParser().parseExpression(MemBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_MB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
+OperandMatchResultTy
+ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    if (OptStr.equals_lower("sy"))
+      Opt = ARM_ISB::SY;
+    else
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *ISBarrierID;
+    if (getParser().parseExpression(ISBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_ISB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+          (ARM_ISB::InstSyncBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+OperandMatchResultTy
+ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier)) 
+    return MatchOperand_NoMatch;
+  StringRef IFlagsStr = Tok.getString();
+
+  // An iflags string of "none" is interpreted to mean that none of the AIF
+  // bits are set.  Not a terribly useful instruction, but a valid encoding.
+  unsigned IFlags = 0;
+  if (IFlagsStr != "none") {
+        for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+        .Case("a", ARM_PROC::A)
+        .Case("i", ARM_PROC::I)
+        .Case("f", ARM_PROC::F)
+        .Default(~0U);
+
+      // If some specific iflag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (Flag == ~0U || (IFlags & Flag))
+        return MatchOperand_NoMatch;
+
+      IFlags |= Flag;
+    }
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S));
+  return MatchOperand_Success;
+}
+
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+OperandMatchResultTy
+ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef Mask = Tok.getString();
+
+  if (isMClass()) {
+    // See ARMv6-M 10.1.1
+    std::string Name = Mask.lower();
+    unsigned FlagsVal = StringSwitch<unsigned>(Name)
+      // Note: in the documentation:
+      //  ARM deprecates using MSR APSR without a _<bits> qualifier as an alias
+      //  for MSR APSR_nzcvq.
+      // but we do make it an alias here.  This is so to get the "mask encoding"
+      // bits correct on MSR APSR writes.
+      //
+      // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+      // should really only be allowed when writing a special register.  Note
+      // they get dropped in the MRS instruction reading a special register as
+      // the SYSm field is only 8 bits.
+      .Case("apsr", 0x800)
+      .Case("apsr_nzcvq", 0x800)
+      .Case("apsr_g", 0x400)
+      .Case("apsr_nzcvqg", 0xc00)
+      .Case("iapsr", 0x801)
+      .Case("iapsr_nzcvq", 0x801)
+      .Case("iapsr_g", 0x401)
+      .Case("iapsr_nzcvqg", 0xc01)
+      .Case("eapsr", 0x802)
+      .Case("eapsr_nzcvq", 0x802)
+      .Case("eapsr_g", 0x402)
+      .Case("eapsr_nzcvqg", 0xc02)
+      .Case("xpsr", 0x803)
+      .Case("xpsr_nzcvq", 0x803)
+      .Case("xpsr_g", 0x403)
+      .Case("xpsr_nzcvqg", 0xc03)
+      .Case("ipsr", 0x805)
+      .Case("epsr", 0x806)
+      .Case("iepsr", 0x807)
+      .Case("msp", 0x808)
+      .Case("psp", 0x809)
+      .Case("primask", 0x810)
+      .Case("basepri", 0x811)
+      .Case("basepri_max", 0x812)
+      .Case("faultmask", 0x813)
+      .Case("control", 0x814)
+      .Case("msplim", 0x80a)
+      .Case("psplim", 0x80b)
+      .Case("msp_ns", 0x888)
+      .Case("psp_ns", 0x889)
+      .Case("msplim_ns", 0x88a)
+      .Case("psplim_ns", 0x88b)
+      .Case("primask_ns", 0x890)
+      .Case("basepri_ns", 0x891)
+      .Case("basepri_max_ns", 0x892)
+      .Case("faultmask_ns", 0x893)
+      .Case("control_ns", 0x894)
+      .Case("sp_ns", 0x898)
+      .Default(~0U);
+
+    if (FlagsVal == ~0U)
+      return MatchOperand_NoMatch;
+
+    if (!hasDSP() && (FlagsVal & 0x400))
+      // The _g and _nzcvqg versions are only valid if the DSP extension is
+      // available.
+      return MatchOperand_NoMatch;
+
+    if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
+      // basepri, basepri_max and faultmask only valid for V7m.
+      return MatchOperand_NoMatch;
+
+    if (!has8MSecExt() && (FlagsVal == 0x80a || FlagsVal == 0x80b ||
+                             (FlagsVal > 0x814 && FlagsVal < 0xc00)))
+      return MatchOperand_NoMatch;
+
+    if (!hasV8MMainline() && (FlagsVal == 0x88a || FlagsVal == 0x88b ||
+                              (FlagsVal > 0x890 && FlagsVal <= 0x893)))
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+    Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+    return MatchOperand_Success;
+  }
+
+  // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
+  size_t Start = 0, Next = Mask.find('_');
+  StringRef Flags = "";
+  std::string SpecReg = Mask.slice(Start, Next).lower();
+  if (Next != StringRef::npos)
+    Flags = Mask.slice(Next+1, Mask.size());
+
+  // FlagsVal contains the complete mask:
+  // 3-0: Mask
+  // 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  unsigned FlagsVal = 0;
+
+  if (SpecReg == "apsr") {
+    FlagsVal = StringSwitch<unsigned>(Flags)
+    .Case("nzcvq",  0x8) // same as CPSR_f
+    .Case("g",      0x4) // same as CPSR_s
+    .Case("nzcvqg", 0xc) // same as CPSR_fs
+    .Default(~0U);
+
+    if (FlagsVal == ~0U) {
+      if (!Flags.empty())
+        return MatchOperand_NoMatch;
+      else
+        FlagsVal = 8; // No flag
+    }
+  } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    // cpsr_all is an alias for cpsr_fc, as is plain cpsr.
+    if (Flags == "all" || Flags == "")
+      Flags = "fc";
+    for (int i = 0, e = Flags.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
+      .Case("c", 1)
+      .Case("x", 2)
+      .Case("s", 4)
+      .Case("f", 8)
+      .Default(~0U);
+
+      // If some specific flag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (FlagsVal == ~0U || (FlagsVal & Flag))
+        return MatchOperand_NoMatch;
+      FlagsVal |= Flag;
+    }
+  } else // No match for special register.
+    return MatchOperand_NoMatch;
+
+  // Special register without flags is NOT equivalent to "fc" flags.
+  // NOTE: This is a divergence from gas' behavior.  Uncommenting the following
+  // two lines would enable gas compatibility at the expense of breaking
+  // round-tripping.
+  //
+  // if (!FlagsVal)
+  //  FlagsVal = 0x9;
+
+  // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  if (SpecReg == "spsr")
+    FlagsVal |= 16;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+  return MatchOperand_Success;
+}
+
+/// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for
+/// use in the MRS/MSR instructions added to support virtualization.
+OperandMatchResultTy
+ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef RegName = Tok.getString();
+
+  // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
+  // and bit 5 is R.
+  unsigned Encoding = StringSwitch<unsigned>(RegName.lower())
+                          .Case("r8_usr", 0x00)
+                          .Case("r9_usr", 0x01)
+                          .Case("r10_usr", 0x02)
+                          .Case("r11_usr", 0x03)
+                          .Case("r12_usr", 0x04)
+                          .Case("sp_usr", 0x05)
+                          .Case("lr_usr", 0x06)
+                          .Case("r8_fiq", 0x08)
+                          .Case("r9_fiq", 0x09)
+                          .Case("r10_fiq", 0x0a)
+                          .Case("r11_fiq", 0x0b)
+                          .Case("r12_fiq", 0x0c)
+                          .Case("sp_fiq", 0x0d)
+                          .Case("lr_fiq", 0x0e)
+                          .Case("lr_irq", 0x10)
+                          .Case("sp_irq", 0x11)
+                          .Case("lr_svc", 0x12)
+                          .Case("sp_svc", 0x13)
+                          .Case("lr_abt", 0x14)
+                          .Case("sp_abt", 0x15)
+                          .Case("lr_und", 0x16)
+                          .Case("sp_und", 0x17)
+                          .Case("lr_mon", 0x1c)
+                          .Case("sp_mon", 0x1d)
+                          .Case("elr_hyp", 0x1e)
+                          .Case("sp_hyp", 0x1f)
+                          .Case("spsr_fiq", 0x2e)
+                          .Case("spsr_irq", 0x30)
+                          .Case("spsr_svc", 0x32)
+                          .Case("spsr_abt", 0x34)
+                          .Case("spsr_und", 0x36)
+                          .Case("spsr_mon", 0x3c)
+                          .Case("spsr_hyp", 0x3e)
+                          .Default(~0U);
+
+  if (Encoding == ~0U)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateBankedReg(Encoding, S));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
+                          int High) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  std::string LowerOp = Op.lower();
+  std::string UpperOp = Op.upper();
+  if (ShiftName != LowerOp && ShiftName != UpperOp) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat shift type token.
+
+  // There must be a '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc Loc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(Loc, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+  if (Val < Low || Val > High) {
+    Error(Loc, "immediate value out of range");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateImm(CE, Loc, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = StringSwitch<int>(Tok.getString().lower())
+    .Case("be", 1)
+    .Case("le", 0)
+    .Default(-1);
+  Parser.Lex(); // Eat the token.
+
+  if (Val == -1) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::create(Val,
+                                                                  getContext()),
+                                           S, Tok.getEndLoc()));
+  return MatchOperand_Success;
+}
+
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+///     lsl #n  'n' in [0,31]
+///     asr #n  'n' in [1,32]
+///             n == 32 encoded as n == 0.
+OperandMatchResultTy
+ARMAsmParser::parseShifterImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  bool isASR;
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    isASR = false;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    isASR = true;
+  else {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed shift expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "shift amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  if (isASR) {
+    // Shift amount must be in [1,32]
+    if (Val < 1 || Val > 32) {
+      Error(ExLoc, "'asr' shift amount must be in range [1,32]");
+      return MatchOperand_ParseFail;
+    }
+    // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+    if (isThumb() && Val == 32) {
+      Error(ExLoc, "'asr #32' shift amount not allowed in Thumb mode");
+      return MatchOperand_ParseFail;
+    }
+    if (Val == 32) Val = 0;
+  } else {
+    // Shift amount must be in [1,32]
+    if (Val < 0 || Val > 31) {
+      Error(ExLoc, "'lsr' shift amount must be in range [0,31]");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+///     ror #n  'n' in {0, 8, 16, 24}
+OperandMatchResultTy
+ARMAsmParser::parseRotImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName != "ror" && ShiftName != "ROR")
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a rotate amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed rotate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "rotate amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  // Shift amount must be in {0, 8, 16, 24} (0 is undocumented extension)
+  // normally, zero is represented in asm by omitting the rotate operand
+  // entirely.
+  if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+    Error(ExLoc, "'ror' rotate amount must be 8, 16, or 24");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateRotImm(Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+ARMAsmParser::parseModImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  int64_t Imm1, Imm2;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // 1) A mod_imm operand can appear in the place of a register name:
+  //   add r0, #mod_imm
+  //   add r0, r0, #mod_imm
+  // to correctly handle the latter, we bail out as soon as we see an
+  // identifier.
+  //
+  // 2) Similarly, we do not want to parse into complex operands:
+  //   mov r0, #mod_imm
+  //   mov r0, :lower16:(_foo)
+  if (Parser.getTok().is(AsmToken::Identifier) ||
+      Parser.getTok().is(AsmToken::Colon))
+    return MatchOperand_NoMatch;
+
+  // Hash (dollar) is optional as per the ARMARM
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    // Avoid parsing into complex operands (#:)
+    if (Lexer.peekTok().is(AsmToken::Colon))
+      return MatchOperand_NoMatch;
+
+    // Eat the hash (dollar)
+    Parser.Lex();
+  }
+
+  SMLoc Sx1, Ex1;
+  Sx1 = Parser.getTok().getLoc();
+  const MCExpr *Imm1Exp;
+  if (getParser().parseExpression(Imm1Exp, Ex1)) {
+    Error(Sx1, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm1Exp);
+
+  if (CE) {
+    // Immediate must fit within 32-bits
+    Imm1 = CE->getValue();
+    int Enc = ARM_AM::getSOImmVal(Imm1);
+    if (Enc != -1 && Parser.getTok().is(AsmToken::EndOfStatement)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm((Enc & 0xFF),
+                                                  (Enc & 0xF00) >> 7,
+                                                  Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+
+    // We have parsed an immediate which is not for us, fallback to a plain
+    // immediate. This can happen for instruction aliases. For an example,
+    // ARMInstrInfo.td defines the alias [mov <-> mvn] which can transform
+    // a mov (mvn) with a mod_imm_neg/mod_imm_not operand into the opposite
+    // instruction with a mod_imm operand. The alias is defined such that the
+    // parser method is shared, that's why we have to do this here.
+    if (Parser.getTok().is(AsmToken::EndOfStatement)) {
+      Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+      return MatchOperand_Success;
+    }
+  } else {
+    // Operands like #(l1 - l2) can only be evaluated at a later stage (via an
+    // MCFixup). Fallback to a plain immediate.
+    Operands.push_back(ARMOperand::CreateImm(Imm1Exp, Sx1, Ex1));
+    return MatchOperand_Success;
+  }
+
+  // From this point onward, we expect the input to be a (#bits, #rot) pair
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Sx1, "expected modified immediate operand: #[0, 255], #even[0-30]");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Imm1 & ~0xFF) {
+    Error(Sx1, "immediate operand must a number in the range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+
+  // Eat the comma
+  Parser.Lex();
+
+  // Repeat for #rot
+  SMLoc Sx2, Ex2;
+  Sx2 = Parser.getTok().getLoc();
+
+  // Eat the optional hash (dollar)
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar))
+    Parser.Lex();
+
+  const MCExpr *Imm2Exp;
+  if (getParser().parseExpression(Imm2Exp, Ex2)) {
+    Error(Sx2, "malformed expression");
+    return MatchOperand_ParseFail;
+  }
+
+  CE = dyn_cast<MCConstantExpr>(Imm2Exp);
+
+  if (CE) {
+    Imm2 = CE->getValue();
+    if (!(Imm2 & ~0x1E)) {
+      // We have a match!
+      Operands.push_back(ARMOperand::CreateModImm(Imm1, Imm2, S, Ex2));
+      return MatchOperand_Success;
+    }
+    Error(Sx2, "immediate operand must an even number in the range [0, 30]");
+    return MatchOperand_ParseFail;
+  } else {
+    Error(Sx2, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+}
+
+OperandMatchResultTy
+ARMAsmParser::parseBitfield(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  // The bitfield descriptor is really two operands, the LSB and the width.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *LSBExpr;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().parseExpression(LSBExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+  if (!CE) {
+    Error(E, "'lsb' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t LSB = CE->getValue();
+  // The LSB must be in the range [0,31]
+  if (LSB < 0 || LSB > 31) {
+    Error(E, "'lsb' operand must be in the range [0,31]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  // Expect another immediate operand.
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "too few operands");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *WidthExpr;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(WidthExpr, EndLoc)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  CE = dyn_cast<MCConstantExpr>(WidthExpr);
+  if (!CE) {
+    Error(E, "'width' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Width = CE->getValue();
+  // The LSB must be in the range [1,32-lsb]
+  if (Width < 1 || Width > 32 - LSB) {
+    Error(E, "'width' operand must be in the range [1,32-lsb]");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+ARMAsmParser::parsePostIdxReg(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // postidx_reg := '+' register {, shift}
+  //              | '-' register {, shift}
+  //              | register {, shift}
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  MCAsmParser &Parser = getParser();
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+      return MatchOperand_ParseFail;
+
+    // FIXME: Only approximates end...may include intervening whitespace.
+    E = Parser.getTok().getLoc();
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                  ShiftImm, S, E));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // am3offset := '+' register
+  //              | '-' register
+  //              | register
+  //              | # imm
+  //              | # + imm
+  //              | # - imm
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  MCAsmParser &Parser = getParser();
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  // Do immediates first, as we always parse those if we have a '#'.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat '#' or '$'.
+    // Explicitly look for a '-', as we need to encode negative zero
+    // differently.
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    SMLoc E;
+    if (getParser().parseExpression(Offset, E))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    // Negative zero is encoded as the flag value INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      Val = INT32_MIN;
+
+    Operands.push_back(
+      ARMOperand::CreateImm(MCConstantExpr::create(Val, getContext()), S, E));
+
+    return MatchOperand_Success;
+  }
+
+
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  Tok = Parser.getTok();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Tok.getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                  0, S, Tok.getEndLoc()));
+
+  return MatchOperand_Success;
+}
+
+/// Convert parsed operands to MCInst.  Needed here because this instruction
+/// only has two register operands, but multiplication is commutative so
+/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
+void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
+  ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
+  // If we have a three-operand form, make sure to set Rn to be the operand
+  // that isn't the same as Rd.
+  unsigned RegOp = 4;
+  if (Operands.size() == 6 &&
+      ((ARMOperand &)*Operands[4]).getReg() ==
+          ((ARMOperand &)*Operands[3]).getReg())
+    RegOp = 5;
+  ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
+  Inst.addOperand(Inst.getOperand(0));
+  ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
+                                    const OperandVector &Operands) {
+  int CondOp = -1, ImmOp = -1;
+  switch(Inst.getOpcode()) {
+    case ARM::tB:
+    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
+
+    case ARM::t2B:
+    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+
+    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+  }
+  // first decide whether or not the branch should be conditional
+  // by looking at it's location relative to an IT block
+  if(inITBlock()) {
+    // inside an IT block we cannot have any conditional branches. any 
+    // such instructions needs to be converted to unconditional form
+    switch(Inst.getOpcode()) {
+      case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+      case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+    }
+  } else {
+    // outside IT blocks we can only have unconditional branches with AL
+    // condition code or conditional branches with non-AL condition code
+    unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
+    switch(Inst.getOpcode()) {
+      case ARM::tB:
+      case ARM::tBcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); 
+        break;
+      case ARM::t2B:
+      case ARM::t2Bcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc);
+        break;
+    }
+  }
+
+  // now decide on encoding size based on branch target range
+  switch(Inst.getOpcode()) {
+    // classify tB as either t2B or t1B based on range of immediate operand
+    case ARM::tB: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
+        Inst.setOpcode(ARM::t2B);
+      break;
+    }
+    // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+    case ARM::tBcc: {
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      if (!op.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
+        Inst.setOpcode(ARM::t2Bcc);
+      break;
+    }
+  }
+  ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
+  ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
+}
+
+/// Parse an ARM memory expression, return false if successful else return true
+/// or an error.  The first token must be a '[' when called.
+bool ARMAsmParser::parseMemory(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+  if (Parser.getTok().isNot(AsmToken::LBrac))
+    return TokError("Token is not a Left Bracket");
+  S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+
+  // The next token must either be a comma, a colon or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) &&
+      !Tok.is(AsmToken::RBrac))
+    return Error(Tok.getLoc(), "malformed memory operand");
+
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, 0, false,
+                                             S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand. It's rather odd, but syntactically valid.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) &&
+         "Lost colon or comma in memory operand?!");
+  if (Tok.is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+  }
+
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
+    SMLoc AlignmentLoc = Tok.getLoc();
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E,
+                   "alignment specifier must be 16, 32, 64, 128, or 256 bits");
+    case 16:  Align = 2; break;
+    case 32:  Align = 4; break;
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
+    }
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E, AlignmentLoc));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset. Be friendly and also accept a plain integer (without a leading
+  // hash) for gas compatibility.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar) ||
+      Parser.getTok().is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    E = Parser.getTok().getLoc();
+
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().parseExpression(Offset))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      CE = MCConstantExpr::create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
+  }
+
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
+
+  // Now we should have the closing ']'
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(Parser.getTok().getLoc(), "']' expected");
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat right bracket token.
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand.
+  if (Parser.getTok().is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the '!'.
+  }
+
+  return false;
+}
+
+/// parseMemRegOffsetShift - one of these two:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// return true if it parses a shift otherwise it returns false.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL" ||
+      ShiftName == "asl" || ShiftName == "ASL")
+    St = ARM_AM::lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = ARM_AM::lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = ARM_AM::asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = ARM_AM::ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = ARM_AM::rrx;
+  else
+    return Error(Loc, "illegal shift operator");
+  Parser.Lex(); // Eat shift type token.
+
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash) &&
+        HashTok.isNot(AsmToken::Dollar))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    // If <ShiftTy> #0, turn it into a no_shift.
+    if (Imm == 0)
+      St = ARM_AM::lsl;
+    // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+    if (Imm == 32)
+      Imm = 0;
+    Amount = Imm;
+  }
+
+  return false;
+}
+
+/// parseFPImm - A floating point immediate expression operand.
+OperandMatchResultTy
+ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // Anything that can accept a floating point constant as an operand
+  // needs to go through here, as the regular parseExpression is
+  // integer only.
+  //
+  // This routine still creates a generic Immediate operand, containing
+  // a bitcast of the 64-bit floating point value. The various operands
+  // that accept floats can check whether the value is valid for them
+  // via the standard is*() predicates.
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  // Disambiguate the VMOV forms that can accept an FP immediate.
+  // vmov.f32 <sreg>, #imm
+  // vmov.f64 <dreg>, #imm
+  // vmov.f32 <dreg>, #imm  @ vector f32x2
+  // vmov.f32 <qreg>, #imm  @ vector f32x4
+  //
+  // There are also the NEON VMOV instructions which expect an
+  // integer constant. Make sure we don't try to parse an FPImm
+  // for these:
+  // vmov.i{8|16|32|64} <dreg|qreg>, #imm
+  ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
+  bool isVmovf = TyOp.isToken() &&
+                 (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
+                  TyOp.getToken() == ".f16");
+  ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
+  bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
+                                         Mnemonic.getToken() == "fconsts");
+  if (!(isVmovf || isFconst))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '#' or '$'.
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
+  if (Tok.is(AsmToken::Real) && isVmovf) {
+    APFloat RealVal(APFloat::IEEEsingle(), Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 31;
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::create(IntVal, getContext()),
+          S, Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+  // Also handle plain integers. Instructions which allow floating point
+  // immediates also allow a raw encoded 8-bit value.
+  if (Tok.is(AsmToken::Integer) && isFconst) {
+    int64_t Val = Tok.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Val > 255 || Val < 0) {
+      Error(Loc, "encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    float RealVal = ARM_AM::getFPImmFloat(Val);
+    Val = APFloat(RealVal).bitcastToAPInt().getZExtValue();
+
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::create(Val, getContext()), S,
+        Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  Error(Loc, "invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// Parse a arm instruction operand.  For now this parses the operand regardless
+/// of the mnemonic.
+bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S, E;
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Identifier: {
+    // If we've seen a branch mnemonic, the next operand must be a label.  This
+    // is true even if the label is a register name.  So "br r1" means branch to
+    // label "r1".
+    bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl";
+    if (!ExpectLabel) {
+      if (!tryParseRegisterWithWriteBack(Operands))
+        return false;
+      int Res = tryParseShiftRegister(Operands);
+      if (Res == 0) // success
+        return false;
+      else if (Res == -1) // irrecoverable error
+        return true;
+      // If this is VMRS, check for the apsr_nzcv operand.
+      if (Mnemonic == "vmrs" &&
+          Parser.getTok().getString().equals_lower("apsr_nzcv")) {
+        S = Parser.getTok().getLoc();
+        Parser.Lex();
+        Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S));
+        return false;
+      }
+    }
+
+    // Fall though for the Identifier case that is not a register or a
+    // special name.
+  }
+  case AsmToken::LParen:  // parenthesized expressions like (_strcmp-4)
+  case AsmToken::Integer: // things like 1f and 2b as a branch targets
+  case AsmToken::String:  // quoted label names.
+  case AsmToken::Dot: {   // . as a branch target
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = Parser.getTok().getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::LBrac:
+    return parseMemory(Operands);
+  case AsmToken::LCurly:
+    return parseRegisterList(Operands);
+  case AsmToken::Dollar:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = Parser.getTok().getLoc();
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      bool isNegative = Parser.getTok().is(AsmToken::Minus);
+      const MCExpr *ImmVal;
+      if (getParser().parseExpression(ImmVal))
+        return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+      if (CE) {
+        int32_t Val = CE->getValue();
+        if (isNegative && Val == 0)
+          ImmVal = MCConstantExpr::create(INT32_MIN, getContext());
+      }
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
+
+      // There can be a trailing '!' on operands that we want as a separate
+      // '!' Token operand. Handle that here. For example, the compatibility
+      // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'.
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(),
+                                                   Parser.getTok().getLoc()));
+        Parser.Lex(); // Eat exclaim token
+      }
+      return false;
+    }
+    // w/ a ':' after the '#', it's just like a plain ':'.
+    LLVM_FALLTHROUGH;
+  }
+  case AsmToken::Colon: {
+    S = Parser.getTok().getLoc();
+    // ":lower16:" and ":upper16:" expression prefixes
+    // FIXME: Check it's an expression prefix,
+    // e.g. (FOO - :lower16:BAR) isn't legal.
+    ARMMCExpr::VariantKind RefKind;
+    if (parsePrefix(RefKind))
+      return true;
+
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    const MCExpr *ExprVal = ARMMCExpr::create(RefKind, SubExprVal,
+                                              getContext());
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
+    return false;
+  }
+  case AsmToken::Equal: {
+    S = Parser.getTok().getLoc();
+    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
+      return Error(S, "unexpected token in operand");
+    Parser.Lex(); // Eat '='
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    // execute-only: we assume that assembly programmers know what they are
+    // doing and allow literal pool creation here
+    Operands.push_back(ARMOperand::CreateConstantPoolImm(SubExprVal, S, E));
+    return false;
+  }
+  }
+}
+
+// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+//  :lower16: and :upper16:.
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  MCAsmParser &Parser = getParser();
+  RefKind = ARMMCExpr::VK_ARM_None;
+
+  // consume an optional '#' (GNU compatibility)
+  if (getLexer().is(AsmToken::Hash))
+    Parser.Lex();
+
+  // :lower16: and :upper16: modifiers
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  enum {
+    COFF = (1 << MCObjectFileInfo::IsCOFF),
+    ELF = (1 << MCObjectFileInfo::IsELF),
+    MACHO = (1 << MCObjectFileInfo::IsMachO)
+  };
+  static const struct PrefixEntry {
+    const char *Spelling;
+    ARMMCExpr::VariantKind VariantKind;
+    uint8_t SupportedFormats;
+  } PrefixEntries[] = {
+    { "lower16", ARMMCExpr::VK_ARM_LO16, COFF | ELF | MACHO },
+    { "upper16", ARMMCExpr::VK_ARM_HI16, COFF | ELF | MACHO },
+  };
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+
+  const auto &Prefix =
+      std::find_if(std::begin(PrefixEntries), std::end(PrefixEntries),
+                   [&IDVal](const PrefixEntry &PE) {
+                      return PE.Spelling == IDVal;
+                   });
+  if (Prefix == std::end(PrefixEntries)) {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+
+  uint8_t CurrentFormat;
+  switch (getContext().getObjectFileInfo()->getObjectFileType()) {
+  case MCObjectFileInfo::IsMachO:
+    CurrentFormat = MACHO;
+    break;
+  case MCObjectFileInfo::IsELF:
+    CurrentFormat = ELF;
+    break;
+  case MCObjectFileInfo::IsCOFF:
+    CurrentFormat = COFF;
+    break;
+  }
+
+  if (~Prefix->SupportedFormats & CurrentFormat) {
+    Error(Parser.getTok().getLoc(),
+          "cannot represent relocation in the current file format");
+    return true;
+  }
+
+  RefKind = Prefix->VariantKind;
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+
+  return false;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags.
+//
+// FIXME: Would be nice to autogen this.
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+                                      unsigned &PredicationCode,
+                                      bool &CarrySetting,
+                                      unsigned &ProcessorIMod,
+                                      StringRef &ITMask) {
+  PredicationCode = ARMCC::AL;
+  CarrySetting = false;
+  ProcessorIMod = 0;
+
+  // Ignore some mnemonics we know aren't predicated forms.
+  //
+  // FIXME: Would be nice to autogen this.
+  if ((Mnemonic == "movs" && isThumb()) ||
+      Mnemonic == "teq"   || Mnemonic == "vceq"   || Mnemonic == "svc"   ||
+      Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
+      Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
+      Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
+      Mnemonic == "vaclt" || Mnemonic == "vacle"  || Mnemonic == "hlt" ||
+      Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
+      Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
+      Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
+      Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+      Mnemonic == "vcvta" || Mnemonic == "vcvtn"  || Mnemonic == "vcvtp" ||
+      Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
+      Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
+      Mnemonic == "bxns"  || Mnemonic == "blxns")
+    return Mnemonic;
+
+  // First, split out any predication code. Ignore mnemonics we know aren't
+  // predicated but do have a carry-set and so weren't caught above.
+  if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+      Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+      Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+      Mnemonic != "sbcs" && Mnemonic != "rscs") {
+    unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
+      .Case("eq", ARMCC::EQ)
+      .Case("ne", ARMCC::NE)
+      .Case("hs", ARMCC::HS)
+      .Case("cs", ARMCC::HS)
+      .Case("lo", ARMCC::LO)
+      .Case("cc", ARMCC::LO)
+      .Case("mi", ARMCC::MI)
+      .Case("pl", ARMCC::PL)
+      .Case("vs", ARMCC::VS)
+      .Case("vc", ARMCC::VC)
+      .Case("hi", ARMCC::HI)
+      .Case("ls", ARMCC::LS)
+      .Case("ge", ARMCC::GE)
+      .Case("lt", ARMCC::LT)
+      .Case("gt", ARMCC::GT)
+      .Case("le", ARMCC::LE)
+      .Case("al", ARMCC::AL)
+      .Default(~0U);
+    if (CC != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+      PredicationCode = CC;
+    }
+  }
+
+  // Next, determine if we have a carry setting bit. We explicitly ignore all
+  // the instructions we know end in 's'.
+  if (Mnemonic.endswith("s") &&
+      !(Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+        Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+        Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+        Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" ||
+        Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
+        Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
+        Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" || Mnemonic == "fconsts" ||
+        Mnemonic == "bxns" || Mnemonic == "blxns" ||
+        (Mnemonic == "movs" && isThumb()))) {
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+    CarrySetting = true;
+  }
+
+  // The "cps" instruction can have a interrupt mode operand which is glued into
+  // the mnemonic. Check if this is the case, split it and parse the imod op
+  if (Mnemonic.startswith("cps")) {
+    // Split out any imod code.
+    unsigned IMod =
+      StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+      .Case("ie", ARM_PROC::IE)
+      .Case("id", ARM_PROC::ID)
+      .Default(~0U);
+    if (IMod != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+      ProcessorIMod = IMod;
+    }
+  }
+
+  // The "it" instruction has the condition mask on the end of the mnemonic.
+  if (Mnemonic.startswith("it")) {
+    ITMask = Mnemonic.slice(2, Mnemonic.size());
+    Mnemonic = Mnemonic.slice(0, 2);
+  }
+
+  return Mnemonic;
+}
+
+/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// inclusion of carry set or predication code operands.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                                         bool &CanAcceptCarrySet,
+                                         bool &CanAcceptPredicationCode) {
+  CanAcceptCarrySet =
+      Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "add" || Mnemonic == "adc" || Mnemonic == "mul" ||
+      Mnemonic == "bic" || Mnemonic == "asr" || Mnemonic == "orr" ||
+      Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" ||
+      Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" ||
+      Mnemonic == "neg" || Mnemonic == "vfm" || Mnemonic == "vfnm" ||
+      (!isThumb() &&
+       (Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" ||
+        Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull"));
+
+  if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "cps" || Mnemonic == "it" || Mnemonic == "cbz" ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic == "udf" ||
+      Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") ||
+      Mnemonic.startswith("vsel") || Mnemonic == "vmaxnm" ||
+      Mnemonic == "vminnm" || Mnemonic == "vcvta" || Mnemonic == "vcvtn" ||
+      Mnemonic == "vcvtp" || Mnemonic == "vcvtm" || Mnemonic == "vrinta" ||
+      Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" ||
+      Mnemonic.startswith("aes") || Mnemonic == "hvc" || Mnemonic == "setpan" ||
+      Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64")) ||
+      Mnemonic == "vmovx" || Mnemonic == "vins") {
+    // These mnemonics are never predicable
+    CanAcceptPredicationCode = false;
+  } else if (!isThumb()) {
+    // Some instructions are only predicable in Thumb mode
+    CanAcceptPredicationCode =
+        Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
+        Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
+        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
+        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
+        Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" &&
+        Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") &&
+        !Mnemonic.startswith("srs");
+  } else if (isThumbOne()) {
+    if (hasV6MOps())
+      CanAcceptPredicationCode = Mnemonic != "movs";
+    else
+      CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
+  } else
+    CanAcceptPredicationCode = true;
+}
+
+// \brief Some Thumb instructions have two operand forms that are not
+// available as three operand, convert to two operand form if possible.
+//
+// FIXME: We would really like to be able to tablegen'erate this.
+void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
+                                                 bool CarrySetting,
+                                                 OperandVector &Operands) {
+  if (Operands.size() != 6)
+    return;
+
+  const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+        auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+  if (!Op3.isReg() || !Op4.isReg())
+    return;
+
+  auto Op3Reg = Op3.getReg();
+  auto Op4Reg = Op4.getReg();
+
+  // For most Thumb2 cases we just generate the 3 operand form and reduce
+  // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
+  // won't accept SP or PC so we do the transformation here taking care
+  // with immediate range in the 'add sp, sp #imm' case.
+  auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+  if (isThumbTwo()) {
+    if (Mnemonic != "add")
+      return;
+    bool TryTransform = Op3Reg == ARM::PC || Op4Reg == ARM::PC ||
+                        (Op5.isReg() && Op5.getReg() == ARM::PC);
+    if (!TryTransform) {
+      TryTransform = (Op3Reg == ARM::SP || Op4Reg == ARM::SP ||
+                      (Op5.isReg() && Op5.getReg() == ARM::SP)) &&
+                     !(Op3Reg == ARM::SP && Op4Reg == ARM::SP &&
+                       Op5.isImm() && !Op5.isImm0_508s4());
+    }
+    if (!TryTransform)
+      return;
+  } else if (!isThumbOne())
+    return;
+
+  if (!(Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+        Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+        Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+        Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic"))
+    return;
+
+  // If first 2 operands of a 3 operand instruction are the same
+  // then transform to 2 operand version of the same instruction
+  // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+  bool Transform = Op3Reg == Op4Reg;
+
+  // For communtative operations, we might be able to transform if we swap
+  // Op4 and Op5.  The 'ADD Rdm, SP, Rdm' form is already handled specially
+  // as tADDrsp.
+  const ARMOperand *LastOp = &Op5;
+  bool Swap = false;
+  if (!Transform && Op5.isReg() && Op3Reg == Op5.getReg() &&
+      ((Mnemonic == "add" && Op4Reg != ARM::SP) ||
+       Mnemonic == "and" || Mnemonic == "eor" ||
+       Mnemonic == "adc" || Mnemonic == "orr")) {
+    Swap = true;
+    LastOp = &Op4;
+    Transform = true;
+  }
+
+  // If both registers are the same then remove one of them from
+  // the operand list, with certain exceptions.
+  if (Transform) {
+    // Don't transform 'adds Rd, Rd, Rm' or 'sub{s} Rd, Rd, Rm' because the
+    // 2 operand forms don't exist.
+    if (((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub") &&
+        LastOp->isReg())
+      Transform = false;
+
+    // Don't transform 'add/sub{s} Rd, Rd, #imm' if the immediate fits into
+    // 3-bits because the ARMARM says not to.
+    if ((Mnemonic == "add" || Mnemonic == "sub") && LastOp->isImm0_7())
+      Transform = false;
+  }
+
+  if (Transform) {
+    if (Swap)
+      std::swap(Op4, Op5);
+    Operands.erase(Operands.begin() + 3);
+  }
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+                                          OperandVector &Operands) {
+  // FIXME: This is all horribly hacky. We really need a better way to deal
+  // with optional operands like this in the matcher table.
+
+  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+  // another does not. Specifically, the MOVW instruction does not. So we
+  // special case it here and remove the defaulted (non-setting) cc_out
+  // operand if that's the instruction we're trying to match.
+  //
+  // We do this as post-processing of the explicit operands rather than just
+  // conditionally adding the cc_out in the first place because we need
+  // to check the type of the parsed immediate operand.
+  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+      !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
+      static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+    return true;
+
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when there are only two register operands.
+  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
+    return true;
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
+  // have to check the immediate range here since Thumb2 has a variant
+  // that can handle a different range and has a cc_out operand.
+  if (((isThumb() && Mnemonic == "add") ||
+       (isThumbTwo() && Mnemonic == "sub")) &&
+      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
+       static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
+    return true;
+  // For Thumb2, add/sub immediate does not have a cc_out operand for the
+  // imm0_4095 variant. That's the least-preferred variant when
+  // selecting via the generic "add" mnemonic, so to know that we
+  // should remove the cc_out operand, we have to explicitly check that
+  // it's not one of the other variants. Ugh.
+  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    // Nest conditions rather than one big 'if' statement for readability.
+    //
+    // If both registers are low, we're in an IT block, and the immediate is
+    // in range, we should use encoding T1 instead, which has a cc_out.
+    if (inITBlock() &&
+        isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
+        isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
+        static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
+      return false;
+    // Check against T3. If the second register is the PC, this is an
+    // alternate form of ADR, which uses encoding T4, so check for that too.
+    if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
+        static_cast<ARMOperand &>(*Operands[5]).isT2SOImm())
+      return false;
+
+    // Otherwise, we use encoding T4, which does not have a cc_out
+    // operand.
+    return true;
+  }
+
+  // The thumb2 multiply instruction doesn't have a CCOut register, so
+  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+  // use the 16-bit encoding or not.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[5]).isReg() &&
+      // If the registers aren't low regs, the destination reg isn't the
+      // same as one of the source regs, or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
+       !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+                            static_cast<ARMOperand &>(*Operands[5]).getReg() &&
+                        static_cast<ARMOperand &>(*Operands[3]).getReg() !=
+                            static_cast<ARMOperand &>(*Operands[4]).getReg())))
+    return true;
+
+  // Also check the 'mul' syntax variant that doesn't specify an explicit
+  // destination register.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      // If the registers aren't low regs  or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
+       !inITBlock()))
+    return true;
+
+
+
+  // Register-register 'add/sub' for thumb does not have a cc_out operand
+  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+  // right, this will result in better diagnostics (which operand is off)
+  // anyway.
+  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      (Operands.size() == 5 || Operands.size() == 6) &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
+      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
+      (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
+       (Operands.size() == 6 &&
+        static_cast<ARMOperand &>(*Operands[5]).isImm())))
+    return true;
+
+  return false;
+}
+
+bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
+                                              OperandVector &Operands) {
+  // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
+  unsigned RegIdx = 3;
+  if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
+      (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" ||
+       static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) {
+    if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+        (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" ||
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16"))
+      RegIdx = 4;
+
+    if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
+        (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
+             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) ||
+         ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
+             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg())))
+      return true;
+  }
+  return false;
+}
+
+static bool isDataTypeToken(StringRef Tok) {
+  return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
+    Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
+    Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" ||
+    Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" ||
+    Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" ||
+    Tok == ".f" || Tok == ".d";
+}
+
+// FIXME: This bit should probably be handled via an explicit match class
+// in the .td files that matches the suffix instead of having it be
+// a literal string token the way it is now.
+static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
+  return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
+}
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+                                 unsigned VariantID);
+
+static bool RequiresVFPRegListValidation(StringRef Inst,
+                                         bool &AcceptSinglePrecisionOnly,
+                                         bool &AcceptDoublePrecisionOnly) {
+  if (Inst.size() < 7)
+    return false;
+
+  if (Inst.startswith("fldm") || Inst.startswith("fstm")) {
+    StringRef AddressingMode = Inst.substr(4, 2);
+    if (AddressingMode == "ia" || AddressingMode == "db" ||
+        AddressingMode == "ea" || AddressingMode == "fd") {
+      AcceptSinglePrecisionOnly = Inst[6] == 's';
+      AcceptDoublePrecisionOnly = Inst[6] == 'd' || Inst[6] == 'x';
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Parse an arm instruction mnemonic followed by its operands.
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  // FIXME: Can this be done via tablegen in some fashion?
+  bool RequireVFPRegisterListCheck;
+  bool AcceptSinglePrecisionOnly;
+  bool AcceptDoublePrecisionOnly;
+  RequireVFPRegisterListCheck =
+    RequiresVFPRegListValidation(Name, AcceptSinglePrecisionOnly,
+                                 AcceptDoublePrecisionOnly);
+
+  // Apply mnemonic aliases before doing anything else, as the destination
+  // mnemonic may include suffices and we want to handle them normally.
+  // The generic tblgen'erated code does this later, at the start of
+  // MatchInstructionImpl(), but that's too late for aliases that include
+  // any sort of suffix.
+  uint64_t AvailableFeatures = getAvailableFeatures();
+  unsigned AssemblerDialect = getParser().getAssemblerDialect();
+  applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
+
+  // First check for the ARM-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Mnemonic = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (ITMask[i - 1] == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
+
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or predication
+  // code, our matching model involves us always generating CCOut and
+  // ConditionCode operands to match the mnemonic "as written" and then we let
+  // the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
+
+  // If we had a carry-set on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptCarrySet && CarrySetting) {
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' can not set flags, but 's' suffix specified");
+  }
+  // If we had a predication code on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' is not predicable, but condition code specified");
+  }
+
+  // Add the carry setting operand, if necessary.
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               Loc));
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
+    Operands.push_back(ARMOperand::CreateCondCode(
+                         ARMCC::CondCodes(PredicationCode), Loc));
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::create(ProcessorIMod, getContext()),
+                                 NameLoc, NameLoc));
+  } else if (Mnemonic == "cps" && isMClass()) {
+    return Error(NameLoc, "instruction 'cps' requires effect for M-class");
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    StringRef ExtraToken = Name.slice(Start, Next);
+
+    // Some NEON instructions have an optional datatype suffix that is
+    // completely ignored. Check for that.
+    if (isDataTypeToken(ExtraToken) &&
+        doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
+      continue;
+
+    // For for ARM mode generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
+                   "arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect.  In ARM mode the .w qualifier has no effect,
+    // so discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Mnemonic)) {
+      return true;
+    }
+
+    while (parseOptionalToken(AsmToken::Comma)) {
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Mnemonic)) {
+        return true;
+      }
+    }
+  }
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
+    return true;
+
+  if (RequireVFPRegisterListCheck) {
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands.back());
+    if (AcceptSinglePrecisionOnly && !Op.isSPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon single precision register expected");
+    if (AcceptDoublePrecisionOnly && !Op.isDPRRegList())
+      return Error(Op.getStartLoc(),
+                   "VFP/Neon double precision register expected");
+  }
+
+  tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the
+  // mnemonic, of course (CarrySetting == true). Reason number #317 the
+  // table driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands))
+    Operands.erase(Operands.begin() + 1);
+
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand &>(*Operands[2]).isImm())
+    Operands.erase(Operands.begin() + 1);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > 4 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+    unsigned Idx = isLoad ? 2 : 3;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        return Error(Op2.getStartLoc(),
+                     isLoad ? "destination operands must be sequential"
+                            : "source operands must be sequential");
+      }
+      unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
+          &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
+  }
+
+  // GNU Assembler extension (compatibility)
+  if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+    ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+    if (Op3.isMem()) {
+      assert(Op2.isReg() && "expected register argument");
+
+      unsigned SuperReg = MRI->getMatchingSuperReg(
+          Op2.getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+
+      assert(SuperReg && "expected register pair");
+
+      unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+
+      Operands.insert(
+          Operands.begin() + 3,
+          ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
+    }
+  }
+
+  // FIXME: As said above, this is all a pretty gross hack.  This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+    Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
+    Operands.erase(Operands.begin() + 1);
+  }
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// return 'true' if register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo,
+                                 unsigned Reg, unsigned HiReg,
+                                 bool &containsReg) {
+  containsReg = false;
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      containsReg = true;
+    // Anything other than a low register isn't legal here.
+    if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+      return true;
+  }
+  return false;
+}
+
+// Check if the specified regisgter is in the register list of the inst,
+// starting at the indicated operand number.
+static bool listContainsReg(const MCInst &Inst, unsigned OpNo, unsigned Reg) {
+  for (unsigned i = OpNo, e = Inst.getNumOperands(); i < e; ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      return true;
+  }
+  return false;
+}
+
+// Return true if instruction has the interesting property of being
+// allowed in IT blocks, but not being predicable.
+static bool instIsBreakpoint(const MCInst &Inst) {
+    return Inst.getOpcode() == ARM::tBKPT ||
+           Inst.getOpcode() == ARM::BKPT ||
+           Inst.getOpcode() == ARM::tHLT ||
+           Inst.getOpcode() == ARM::HLT;
+
+}
+
+bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo, bool IsARPop) {
+  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
+  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
+
+  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
+  bool ListContainsLR = listContainsReg(Inst, ListNo, ARM::LR);
+  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+
+  if (!IsARPop && ListContainsSP)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP may not be in the register list");
+  else if (ListContainsPC && ListContainsLR)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "PC and LR may not be in the register list simultaneously");
+  else if (inITBlock() && !lastInITBlock() && ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "instruction must be outside of IT block or the last "
+                 "instruction in an IT block");
+  return false;
+}
+
+bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
+                                       const OperandVector &Operands,
+                                       unsigned ListNo) {
+  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
+  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
+
+  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
+  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+
+  if (ListContainsSP && ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP and PC may not be in the register list");
+  else if (ListContainsSP)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "SP may not be in the register list");
+  else if (ListContainsPC)
+    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
+                 "PC may not be in the register list");
+  return false;
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::validateInstruction(MCInst &Inst,
+                                       const OperandVector &Operands) {
+  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+  SMLoc Loc = Operands[0]->getStartLoc();
+
+  // Check the IT block state first.
+  // NOTE: BKPT and HLT instructions have the interesting property of being
+  // allowed in IT blocks, but not being predicable. They just always execute.
+  if (inITBlock() && !instIsBreakpoint(Inst)) {
+    // The instruction must be predicable.
+    if (!MCID.isPredicable())
+      return Error(Loc, "instructions in IT block must be predicable");
+    unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+    if (Cond != currentITCond()) {
+      // Find the condition code Operand to get its SMLoc information.
+      SMLoc CondLoc;
+      for (unsigned I = 1; I < Operands.size(); ++I)
+        if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
+          CondLoc = Operands[I]->getStartLoc();
+      return Error(CondLoc, "incorrect condition in IT block; got '" +
+                   StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+                   "', but expected '" +
+                   ARMCondCodeToString(ARMCC::CondCodes(currentITCond())) + "'");
+    }
+  // Check for non-'al' condition codes outside of the IT block.
+  } else if (isThumbTwo() && MCID.isPredicable() &&
+             Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+             ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+             Inst.getOpcode() != ARM::t2Bcc) {
+    return Error(Loc, "predicated instructions must be in IT block");
+  } else if (!isThumb() && !useImplicitITARM() && MCID.isPredicable() &&
+             Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+                 ARMCC::AL) {
+    return Warning(Loc, "predicated instructions should be in IT block");
+  }
+
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case ARM::LDRD:
+  case ARM::LDRD_PRE:
+  case ARM::LDRD_POST: {
+    const unsigned RtReg = Inst.getOperand(0).getReg();
+
+    // Rt can't be R14.
+    if (RtReg == ARM::LR)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt can't be R14");
+
+    const unsigned Rt = MRI->getEncodingValue(RtReg);
+    // Rt must be even-numbered.
+    if ((Rt & 1) == 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt must be even-numbered");
+
+    // Rt2 must be Rt + 1.
+    const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands must be sequential");
+
+    if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+      const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+      // For addressing modes with writeback, the base register needs to be
+      // different from the destination registers.
+      if (Rn == Rt || Rn == Rt2)
+        return Error(Operands[3]->getStartLoc(),
+                     "base register needs to be different from destination "
+                     "registers");
+    }
+
+    return false;
+  }
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRD_PRE:
+  case ARM::t2LDRD_POST: {
+    // Rt2 must be different from Rt.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 == Rt)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands can't be identical");
+    return false;
+  }
+  case ARM::t2BXJ: {
+    const unsigned RmReg = Inst.getOperand(0).getReg();
+    // Rm = SP is no longer unpredictable in v8-A
+    if (RmReg == ARM::SP && !hasV8Ops())
+      return Error(Operands[2]->getStartLoc(),
+                   "r13 (SP) is an unpredictable operand to BXJ");
+    return false;
+  }
+  case ARM::STRD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::STRD_PRE:
+  case ARM::STRD_POST: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG: {
+    // Rt must be different from Rn.
+    const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+    if (Rt == Rn)
+      return Error(Operands[3]->getStartLoc(),
+                   "source register and base register can't be identical");
+    return false;
+  }
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST: {
+    // Rt must be different from Rn.
+    const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+    if (Rt == Rn)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register and base register can't be identical");
+    return false;
+  }
+  case ARM::SBFX:
+  case ARM::UBFX: {
+    // Width must be in range [1, 32-lsb].
+    unsigned LSB = Inst.getOperand(2).getImm();
+    unsigned Widthm1 = Inst.getOperand(3).getImm();
+    if (Widthm1 >= 32 - LSB)
+      return Error(Operands[5]->getStartLoc(),
+                   "bitfield width must be in range [1,32-lsb]");
+    return false;
+  }
+  // Notionally handles ARM::tLDMIA_UPD too.
+  case ARM::tLDMIA: {
+    // If we're parsing Thumb2, the .w variant is available and handles
+    // most cases that are normally illegal for a Thumb1 LDM instruction.
+    // We'll make the transformation in processInstruction() if necessary.
+    //
+    // Thumb LDM instructions are writeback iff the base register is not
+    // in the register list.
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool HasWritebackToken =
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    // If we should have writeback, then there should be a '!' token.
+    if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "writeback operator '!' expected");
+    // If we should not have writeback, there must not be a '!'. This is
+    // true even for the 32-bit wide encodings.
+    if (ListContainsBase && HasWritebackToken)
+      return Error(Operands[3]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    if (validatetLDMRegList(Inst, Operands, 3))
+      return true;
+    break;
+  }
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+    // ARM variants loading and updating the same register are only officially
+    // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
+    if (!hasV7Ops())
+      break;
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    if (validatetLDMRegList(Inst, Operands, 3))
+      return true;
+    break;
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    if (validatetSTMRegList(Inst, Operands, 3))
+      return true;
+    break;
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+
+    if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      if (validatetLDMRegList(Inst, Operands, 3))
+        return true;
+    } else {
+      if (validatetSTMRegList(Inst, Operands, 3))
+        return true;
+    }
+    break;
+  }
+  case ARM::sysLDMIA_UPD:
+  case ARM::sysLDMDA_UPD:
+  case ARM::sysLDMDB_UPD:
+  case ARM::sysLDMIB_UPD:
+    if (!listContainsReg(Inst, 3, ARM::PC))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback register only allowed on system LDM "
+                   "if PC in register-list");
+    break;
+  case ARM::sysSTMIA_UPD:
+  case ARM::sysSTMDA_UPD:
+  case ARM::sysSTMDB_UPD:
+  case ARM::sysSTMIB_UPD:
+    return Error(Operands[2]->getStartLoc(),
+                 "system STM cannot have writeback register");
+  case ARM::tMUL: {
+    // The second source operand must be the same register as the destination
+    // operand.
+    //
+    // In this case, we must directly check the parsed operands because the
+    // cvtThumbMultiply() function is written in such a way that it guarantees
+    // this first statement is always true for the new Inst.  Essentially, the
+    // destination is unconditionally copied into the second source operand
+    // without checking to see if it matches what we actually parsed.
+    if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
+                                 ((ARMOperand &)*Operands[5]).getReg()) &&
+        (((ARMOperand &)*Operands[3]).getReg() !=
+         ((ARMOperand &)*Operands[4]).getReg())) {
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register must match source register");
+    }
+    break;
+  }
+  // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
+  // so only issue a diagnostic for thumb1. The instructions will be
+  // switched to the t2 encodings in processInstruction() if necessary.
+  case ARM::tPOP: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or pc");
+    if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+      return true;
+    break;
+  }
+  case ARM::tPUSH: {
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or lr");
+    if (validatetSTMRegList(Inst, Operands, 2))
+      return true;
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    bool ListContainsBase, InvalidLowList;
+    InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+                                          0, ListContainsBase);
+    if (InvalidLowList && !isThumbTwo())
+      return Error(Operands[4]->getStartLoc(),
+                   "registers must be in range r0-r7");
+
+    // This would be converted to a 32-bit stm, but that's not valid if the
+    // writeback register is in the list.
+    if (InvalidLowList && ListContainsBase)
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    if (validatetSTMRegList(Inst, Operands, 4))
+      return true;
+    break;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need thumb2 (for the wide encoding), or we have an error.
+    if (!isThumbTwo() &&
+        Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      return Error(Operands[4]->getStartLoc(),
+                   "source register must be the same as destination");
+    }
+    break;
+  }
+  // Final range checking for Thumb unconditional branch instructions.
+  case ARM::tB:
+    if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2B: {
+    int op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[op]).isSignedOffset<24, 1>())
+      return Error(Operands[op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  // Final range checking for Thumb conditional branch instructions.
+  case ARM::tBcc:
+    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2Bcc: {
+    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
+      return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  case ARM::tCBZ:
+  case ARM::tCBNZ: {
+    if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<6, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  case ARM::MOVi16:
+  case ARM::t2MOVi16:
+  case ARM::t2MOVTi16:
+    {
+    // We want to avoid misleadingly allowing something like "mov r0, <symbol>"
+    // especially when we turn it into a movw and the expression <symbol> does
+    // not have a :lower16: or :upper16 as part of the expression.  We don't
+    // want the behavior of silently truncating, which can be unexpected and
+    // lead to bugs that are difficult to find since this is an easy mistake
+    // to make.
+    int i = (Operands[3]->isImm()) ? 3 : 4;
+    ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
+    if (CE) break;
+    const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
+    if (!E) break;
+    const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+    if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
+                       ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16))
+      return Error(
+          Op.getStartLoc(),
+          "immediate expression for mov requires :lower16: or :upper16");
+    break;
+  }
+  case ARM::HINT:
+  case ARM::t2HINT: {
+    if (hasRAS()) {
+      // ESB is not predicable (pred must be AL)
+      unsigned Imm8 = Inst.getOperand(0).getImm();
+      unsigned Pred = Inst.getOperand(1).getImm();
+      if (Imm8 == 0x10 && Pred != ARMCC::AL)
+        return Error(Operands[1]->getStartLoc(), "instruction 'esb' is not "
+                                                 "predicable, but condition "
+                                                 "code specified");
+    }
+    // Without the RAS extension, this behaves as any other unallocated hint.
+    break;
+  }
+  }
+
+  return false;
+}
+
+static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VST1LN
+  case ARM::VST1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_register_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_register_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdAsm_8:  Spacing = 1; return ARM::VST1LNd8;
+  case ARM::VST1LNdAsm_16: Spacing = 1; return ARM::VST1LNd16;
+  case ARM::VST1LNdAsm_32: Spacing = 1; return ARM::VST1LNd32;
+
+  // VST2LN
+  case ARM::VST2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_register_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_register_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_register_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_register_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdAsm_8:  Spacing = 1; return ARM::VST2LNd8;
+  case ARM::VST2LNdAsm_16: Spacing = 1; return ARM::VST2LNd16;
+  case ARM::VST2LNdAsm_32: Spacing = 1; return ARM::VST2LNd32;
+  case ARM::VST2LNqAsm_16: Spacing = 2; return ARM::VST2LNq16;
+  case ARM::VST2LNqAsm_32: Spacing = 2; return ARM::VST2LNq32;
+
+  // VST3LN
+  case ARM::VST3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_register_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_register_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_register_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_register_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdAsm_8:  Spacing = 1; return ARM::VST3LNd8;
+  case ARM::VST3LNdAsm_16: Spacing = 1; return ARM::VST3LNd16;
+  case ARM::VST3LNdAsm_32: Spacing = 1; return ARM::VST3LNd32;
+  case ARM::VST3LNqAsm_16: Spacing = 2; return ARM::VST3LNq16;
+  case ARM::VST3LNqAsm_32: Spacing = 2; return ARM::VST3LNq32;
+
+  // VST3
+  case ARM::VST3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_fixed_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_fixed_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_fixed_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_fixed_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dWB_register_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_register_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_register_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_register_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_register_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_register_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dAsm_8:  Spacing = 1; return ARM::VST3d8;
+  case ARM::VST3dAsm_16: Spacing = 1; return ARM::VST3d16;
+  case ARM::VST3dAsm_32: Spacing = 1; return ARM::VST3d32;
+  case ARM::VST3qAsm_8:  Spacing = 2; return ARM::VST3q8;
+  case ARM::VST3qAsm_16: Spacing = 2; return ARM::VST3q16;
+  case ARM::VST3qAsm_32: Spacing = 2; return ARM::VST3q32;
+
+  // VST4LN
+  case ARM::VST4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_register_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_register_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_register_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_register_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdAsm_8:  Spacing = 1; return ARM::VST4LNd8;
+  case ARM::VST4LNdAsm_16: Spacing = 1; return ARM::VST4LNd16;
+  case ARM::VST4LNdAsm_32: Spacing = 1; return ARM::VST4LNd32;
+  case ARM::VST4LNqAsm_16: Spacing = 2; return ARM::VST4LNq16;
+  case ARM::VST4LNqAsm_32: Spacing = 2; return ARM::VST4LNq32;
+
+  // VST4
+  case ARM::VST4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_fixed_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_fixed_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_fixed_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_fixed_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dWB_register_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_register_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_register_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_register_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_register_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_register_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dAsm_8:  Spacing = 1; return ARM::VST4d8;
+  case ARM::VST4dAsm_16: Spacing = 1; return ARM::VST4d16;
+  case ARM::VST4dAsm_32: Spacing = 1; return ARM::VST4d32;
+  case ARM::VST4qAsm_8:  Spacing = 2; return ARM::VST4q8;
+  case ARM::VST4qAsm_16: Spacing = 2; return ARM::VST4q16;
+  case ARM::VST4qAsm_32: Spacing = 2; return ARM::VST4q32;
+  }
+}
+
+static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VLD1LN
+  case ARM::VLD1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdAsm_8:  Spacing = 1; return ARM::VLD1LNd8;
+  case ARM::VLD1LNdAsm_16: Spacing = 1; return ARM::VLD1LNd16;
+  case ARM::VLD1LNdAsm_32: Spacing = 1; return ARM::VLD1LNd32;
+
+  // VLD2LN
+  case ARM::VLD2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdAsm_8:  Spacing = 1; return ARM::VLD2LNd8;
+  case ARM::VLD2LNdAsm_16: Spacing = 1; return ARM::VLD2LNd16;
+  case ARM::VLD2LNdAsm_32: Spacing = 1; return ARM::VLD2LNd32;
+  case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16;
+  case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32;
+
+  // VLD3DUP
+  case ARM::VLD3DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdAsm_8:  Spacing = 1; return ARM::VLD3DUPd8;
+  case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16;
+  case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32;
+  case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8;
+  case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16;
+  case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32;
+
+  // VLD3LN
+  case ARM::VLD3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdAsm_8:  Spacing = 1; return ARM::VLD3LNd8;
+  case ARM::VLD3LNdAsm_16: Spacing = 1; return ARM::VLD3LNd16;
+  case ARM::VLD3LNdAsm_32: Spacing = 1; return ARM::VLD3LNd32;
+  case ARM::VLD3LNqAsm_16: Spacing = 2; return ARM::VLD3LNq16;
+  case ARM::VLD3LNqAsm_32: Spacing = 2; return ARM::VLD3LNq32;
+
+  // VLD3
+  case ARM::VLD3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dWB_register_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_register_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_register_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_register_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_register_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_register_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dAsm_8:  Spacing = 1; return ARM::VLD3d8;
+  case ARM::VLD3dAsm_16: Spacing = 1; return ARM::VLD3d16;
+  case ARM::VLD3dAsm_32: Spacing = 1; return ARM::VLD3d32;
+  case ARM::VLD3qAsm_8:  Spacing = 2; return ARM::VLD3q8;
+  case ARM::VLD3qAsm_16: Spacing = 2; return ARM::VLD3q16;
+  case ARM::VLD3qAsm_32: Spacing = 2; return ARM::VLD3q32;
+
+  // VLD4LN
+  case ARM::VLD4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdAsm_8:  Spacing = 1; return ARM::VLD4LNd8;
+  case ARM::VLD4LNdAsm_16: Spacing = 1; return ARM::VLD4LNd16;
+  case ARM::VLD4LNdAsm_32: Spacing = 1; return ARM::VLD4LNd32;
+  case ARM::VLD4LNqAsm_16: Spacing = 2; return ARM::VLD4LNq16;
+  case ARM::VLD4LNqAsm_32: Spacing = 2; return ARM::VLD4LNq32;
+
+  // VLD4DUP
+  case ARM::VLD4DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdAsm_8:  Spacing = 1; return ARM::VLD4DUPd8;
+  case ARM::VLD4DUPdAsm_16: Spacing = 1; return ARM::VLD4DUPd16;
+  case ARM::VLD4DUPdAsm_32: Spacing = 1; return ARM::VLD4DUPd32;
+  case ARM::VLD4DUPqAsm_8: Spacing = 2; return ARM::VLD4DUPq8;
+  case ARM::VLD4DUPqAsm_16: Spacing = 2; return ARM::VLD4DUPq16;
+  case ARM::VLD4DUPqAsm_32: Spacing = 2; return ARM::VLD4DUPq32;
+
+  // VLD4
+  case ARM::VLD4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dWB_register_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_register_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_register_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_register_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_register_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_register_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dAsm_8:  Spacing = 1; return ARM::VLD4d8;
+  case ARM::VLD4dAsm_16: Spacing = 1; return ARM::VLD4d16;
+  case ARM::VLD4dAsm_32: Spacing = 1; return ARM::VLD4d32;
+  case ARM::VLD4qAsm_8:  Spacing = 2; return ARM::VLD4q8;
+  case ARM::VLD4qAsm_16: Spacing = 2; return ARM::VLD4q16;
+  case ARM::VLD4qAsm_32: Spacing = 2; return ARM::VLD4q32;
+  }
+}
+
+bool ARMAsmParser::processInstruction(MCInst &Inst,
+                                      const OperandVector &Operands,
+                                      MCStreamer &Out) {
+  switch (Inst.getOpcode()) {
+  // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
+  case ARM::LDRT_POST:
+  case ARM::LDRBT_POST: {
+    const unsigned Opcode =
+      (Inst.getOpcode() == ARM::LDRT_POST) ? ARM::LDRT_POST_IMM
+                                           : ARM::LDRBT_POST_IMM;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
+  case ARM::STRT_POST:
+  case ARM::STRBT_POST: {
+    const unsigned Opcode =
+      (Inst.getOpcode() == ARM::STRT_POST) ? ARM::STRT_POST_IMM
+                                           : ARM::STRBT_POST_IMM;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode);
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createReg(0));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  // Alias for alternate form of 'ADR Rd, #imm' instruction.
+  case ARM::ADDri: {
+    if (Inst.getOperand(1).getReg() != ARM::PC ||
+        Inst.getOperand(5).getReg() != 0 ||
+        !(Inst.getOperand(2).isExpr() || Inst.getOperand(2).isImm()))
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::ADR);
+    TmpInst.addOperand(Inst.getOperand(0));
+    if (Inst.getOperand(2).isImm()) {
+      // Immediate (mod_imm) will be in its encoded form, we must unencode it
+      // before passing it to the ADR instruction.
+      unsigned Enc = Inst.getOperand(2).getImm();
+      TmpInst.addOperand(MCOperand::createImm(
+        ARM_AM::rotr32(Enc & 0xFF, (Enc & 0xF00) >> 7)));
+    } else {
+      // Turn PC-relative expression into absolute expression.
+      // Reading PC provides the start of the current instruction + 8 and
+      // the transform to adr is biased by that.
+      MCSymbol *Dot = getContext().createTempSymbol();
+      Out.EmitLabel(Dot);
+      const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
+      const MCExpr *InstPC = MCSymbolRefExpr::create(Dot,
+                                                     MCSymbolRefExpr::VK_None,
+                                                     getContext());
+      const MCExpr *Const8 = MCConstantExpr::create(8, getContext());
+      const MCExpr *ReadPC = MCBinaryExpr::createAdd(InstPC, Const8,
+                                                     getContext());
+      const MCExpr *FixupAddr = MCBinaryExpr::createAdd(ReadPC, OpExpr,
+                                                        getContext());
+      TmpInst.addOperand(MCOperand::createExpr(FixupAddr));
+    }
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  // Aliases for alternate PC+imm syntax of LDR instructions.
+  case ARM::t2LDRpcrel:
+    // Select the narrow version if the immediate will fit.
+    if (Inst.getOperand(1).getImm() > 0 &&
+        Inst.getOperand(1).getImm() <= 0xff &&
+        !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+          static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
+      Inst.setOpcode(ARM::tLDRpci);
+    else
+      Inst.setOpcode(ARM::t2LDRpci);
+    return true;
+  case ARM::t2LDRBpcrel:
+    Inst.setOpcode(ARM::t2LDRBpci);
+    return true;
+  case ARM::t2LDRHpcrel:
+    Inst.setOpcode(ARM::t2LDRHpci);
+    return true;
+  case ARM::t2LDRSBpcrel:
+    Inst.setOpcode(ARM::t2LDRSBpci);
+    return true;
+  case ARM::t2LDRSHpcrel:
+    Inst.setOpcode(ARM::t2LDRSHpci);
+    return true;
+  case ARM::LDRConstPool:
+  case ARM::tLDRConstPool:
+  case ARM::t2LDRConstPool: {
+    // Pseudo instruction ldr rt, =immediate is converted to a
+    // MOV rt, immediate if immediate is known and representable
+    // otherwise we create a constant pool entry that we load from.
+    MCInst TmpInst;
+    if (Inst.getOpcode() == ARM::LDRConstPool)
+      TmpInst.setOpcode(ARM::LDRi12);
+    else if (Inst.getOpcode() == ARM::tLDRConstPool)
+      TmpInst.setOpcode(ARM::tLDRpci);
+    else if (Inst.getOpcode() == ARM::t2LDRConstPool)
+      TmpInst.setOpcode(ARM::t2LDRpci);
+    const ARMOperand &PoolOperand =
+      (static_cast<ARMOperand &>(*Operands[2]).isToken() &&
+       static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ?
+      static_cast<ARMOperand &>(*Operands[4]) :
+      static_cast<ARMOperand &>(*Operands[3]);
+    const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
+    // If SubExprVal is a constant we may be able to use a MOV
+    if (isa<MCConstantExpr>(SubExprVal) &&
+        Inst.getOperand(0).getReg() != ARM::PC &&
+        Inst.getOperand(0).getReg() != ARM::SP) {
+      int64_t Value =
+        (int64_t) (cast<MCConstantExpr>(SubExprVal))->getValue();
+      bool UseMov  = true;
+      bool MovHasS = true;
+      if (Inst.getOpcode() == ARM::LDRConstPool) {
+        // ARM Constant
+        if (ARM_AM::getSOImmVal(Value) != -1) {
+          Value = ARM_AM::getSOImmVal(Value);
+          TmpInst.setOpcode(ARM::MOVi);
+        }
+        else if (ARM_AM::getSOImmVal(~Value) != -1) {
+          Value = ARM_AM::getSOImmVal(~Value);
+          TmpInst.setOpcode(ARM::MVNi);
+        }
+        else if (hasV6T2Ops() &&
+                 Value >=0 && Value < 65536) {
+          TmpInst.setOpcode(ARM::MOVi16);
+          MovHasS = false;
+        }
+        else
+          UseMov = false;
+      }
+      else {
+        // Thumb/Thumb2 Constant
+        if (hasThumb2() &&
+            ARM_AM::getT2SOImmVal(Value) != -1)
+          TmpInst.setOpcode(ARM::t2MOVi);
+        else if (hasThumb2() &&
+                 ARM_AM::getT2SOImmVal(~Value) != -1) {
+          TmpInst.setOpcode(ARM::t2MVNi);
+          Value = ~Value;
+        }
+        else if (hasV8MBaseline() &&
+                 Value >=0 && Value < 65536) {
+          TmpInst.setOpcode(ARM::t2MOVi16);
+          MovHasS = false;
+        }
+        else
+          UseMov = false;
+      }
+      if (UseMov) {
+        TmpInst.addOperand(Inst.getOperand(0));           // Rt
+        TmpInst.addOperand(MCOperand::createImm(Value));  // Immediate
+        TmpInst.addOperand(Inst.getOperand(2));           // CondCode
+        TmpInst.addOperand(Inst.getOperand(3));           // CondCode
+        if (MovHasS)
+          TmpInst.addOperand(MCOperand::createReg(0));    // S
+        Inst = TmpInst;
+        return true;
+      }
+    }
+    // No opportunity to use MOV/MVN create constant pool
+    const MCExpr *CPLoc =
+      getTargetStreamer().addConstantPoolEntry(SubExprVal,
+                                               PoolOperand.getStartLoc());
+    TmpInst.addOperand(Inst.getOperand(0));           // Rt
+    TmpInst.addOperand(MCOperand::createExpr(CPLoc)); // offset to constpool
+    if (TmpInst.getOpcode() == ARM::LDRi12)
+      TmpInst.addOperand(MCOperand::createImm(0));    // unused offset
+    TmpInst.addOperand(Inst.getOperand(2));           // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));           // CondCode
+    Inst = TmpInst;
+    return true;
+  }
+  // Handle NEON VST complex aliases.
+  case ARM::VST1LNdWB_register_Asm_8:
+  case ARM::VST1LNdWB_register_Asm_16:
+  case ARM::VST1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_register_Asm_8:
+  case ARM::VST2LNdWB_register_Asm_16:
+  case ARM::VST2LNdWB_register_Asm_32:
+  case ARM::VST2LNqWB_register_Asm_16:
+  case ARM::VST2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_register_Asm_8:
+  case ARM::VST3LNdWB_register_Asm_16:
+  case ARM::VST3LNdWB_register_Asm_32:
+  case ARM::VST3LNqWB_register_Asm_16:
+  case ARM::VST3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_register_Asm_8:
+  case ARM::VST4LNdWB_register_Asm_16:
+  case ARM::VST4LNdWB_register_Asm_32:
+  case ARM::VST4LNqWB_register_Asm_16:
+  case ARM::VST4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdWB_fixed_Asm_8:
+  case ARM::VST1LNdWB_fixed_Asm_16:
+  case ARM::VST1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_fixed_Asm_8:
+  case ARM::VST2LNdWB_fixed_Asm_16:
+  case ARM::VST2LNdWB_fixed_Asm_32:
+  case ARM::VST2LNqWB_fixed_Asm_16:
+  case ARM::VST2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_fixed_Asm_8:
+  case ARM::VST3LNdWB_fixed_Asm_16:
+  case ARM::VST3LNdWB_fixed_Asm_32:
+  case ARM::VST3LNqWB_fixed_Asm_16:
+  case ARM::VST3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_fixed_Asm_8:
+  case ARM::VST4LNdWB_fixed_Asm_16:
+  case ARM::VST4LNdWB_fixed_Asm_32:
+  case ARM::VST4LNqWB_fixed_Asm_16:
+  case ARM::VST4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdAsm_8:
+  case ARM::VST1LNdAsm_16:
+  case ARM::VST1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdAsm_8:
+  case ARM::VST2LNdAsm_16:
+  case ARM::VST2LNdAsm_32:
+  case ARM::VST2LNqAsm_16:
+  case ARM::VST2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdAsm_8:
+  case ARM::VST3LNdAsm_16:
+  case ARM::VST3LNdAsm_32:
+  case ARM::VST3LNqAsm_16:
+  case ARM::VST3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdAsm_8:
+  case ARM::VST4LNdAsm_16:
+  case ARM::VST4LNdAsm_32:
+  case ARM::VST4LNqAsm_16:
+  case ARM::VST4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle NEON VLD complex aliases.
+  case ARM::VLD1LNdWB_register_Asm_8:
+  case ARM::VLD1LNdWB_register_Asm_16:
+  case ARM::VLD1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_register_Asm_8:
+  case ARM::VLD2LNdWB_register_Asm_16:
+  case ARM::VLD2LNdWB_register_Asm_32:
+  case ARM::VLD2LNqWB_register_Asm_16:
+  case ARM::VLD2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_register_Asm_8:
+  case ARM::VLD3LNdWB_register_Asm_16:
+  case ARM::VLD3LNdWB_register_Asm_32:
+  case ARM::VLD3LNqWB_register_Asm_16:
+  case ARM::VLD3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_register_Asm_8:
+  case ARM::VLD4LNdWB_register_Asm_16:
+  case ARM::VLD4LNdWB_register_Asm_32:
+  case ARM::VLD4LNqWB_register_Asm_16:
+  case ARM::VLD4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdWB_fixed_Asm_8:
+  case ARM::VLD1LNdWB_fixed_Asm_16:
+  case ARM::VLD1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_fixed_Asm_8:
+  case ARM::VLD2LNdWB_fixed_Asm_16:
+  case ARM::VLD2LNdWB_fixed_Asm_32:
+  case ARM::VLD2LNqWB_fixed_Asm_16:
+  case ARM::VLD2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_fixed_Asm_8:
+  case ARM::VLD3LNdWB_fixed_Asm_16:
+  case ARM::VLD3LNdWB_fixed_Asm_32:
+  case ARM::VLD3LNqWB_fixed_Asm_16:
+  case ARM::VLD3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_fixed_Asm_8:
+  case ARM::VLD4LNdWB_fixed_Asm_16:
+  case ARM::VLD4LNdWB_fixed_Asm_32:
+  case ARM::VLD4LNqWB_fixed_Asm_16:
+  case ARM::VLD4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdAsm_8:
+  case ARM::VLD1LNdAsm_16:
+  case ARM::VLD1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdAsm_8:
+  case ARM::VLD2LNdAsm_16:
+  case ARM::VLD2LNdAsm_32:
+  case ARM::VLD2LNqAsm_16:
+  case ARM::VLD2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdAsm_8:
+  case ARM::VLD3LNdAsm_16:
+  case ARM::VLD3LNdAsm_32:
+  case ARM::VLD3LNqAsm_16:
+  case ARM::VLD3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdAsm_8:
+  case ARM::VLD4LNdAsm_16:
+  case ARM::VLD4LNdAsm_32:
+  case ARM::VLD4LNqAsm_16:
+  case ARM::VLD4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD3DUPdAsm_8:
+  case ARM::VLD3DUPdAsm_16:
+  case ARM::VLD3DUPdAsm_32:
+  case ARM::VLD3DUPqAsm_8:
+  case ARM::VLD3DUPqAsm_16:
+  case ARM::VLD3DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_fixed_Asm_8:
+  case ARM::VLD3DUPdWB_fixed_Asm_16:
+  case ARM::VLD3DUPdWB_fixed_Asm_32:
+  case ARM::VLD3DUPqWB_fixed_Asm_8:
+  case ARM::VLD3DUPqWB_fixed_Asm_16:
+  case ARM::VLD3DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_register_Asm_8:
+  case ARM::VLD3DUPdWB_register_Asm_16:
+  case ARM::VLD3DUPdWB_register_Asm_32:
+  case ARM::VLD3DUPqWB_register_Asm_8:
+  case ARM::VLD3DUPqWB_register_Asm_16:
+  case ARM::VLD3DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3 multiple 3-element structure instructions.
+  case ARM::VLD3dAsm_8:
+  case ARM::VLD3dAsm_16:
+  case ARM::VLD3dAsm_32:
+  case ARM::VLD3qAsm_8:
+  case ARM::VLD3qAsm_16:
+  case ARM::VLD3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_fixed_Asm_8:
+  case ARM::VLD3dWB_fixed_Asm_16:
+  case ARM::VLD3dWB_fixed_Asm_32:
+  case ARM::VLD3qWB_fixed_Asm_8:
+  case ARM::VLD3qWB_fixed_Asm_16:
+  case ARM::VLD3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_register_Asm_8:
+  case ARM::VLD3dWB_register_Asm_16:
+  case ARM::VLD3dWB_register_Asm_32:
+  case ARM::VLD3qWB_register_Asm_8:
+  case ARM::VLD3qWB_register_Asm_16:
+  case ARM::VLD3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD4DUPdAsm_8:
+  case ARM::VLD4DUPdAsm_16:
+  case ARM::VLD4DUPdAsm_32:
+  case ARM::VLD4DUPqAsm_8:
+  case ARM::VLD4DUPqAsm_16:
+  case ARM::VLD4DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_fixed_Asm_8:
+  case ARM::VLD4DUPdWB_fixed_Asm_16:
+  case ARM::VLD4DUPdWB_fixed_Asm_32:
+  case ARM::VLD4DUPqWB_fixed_Asm_8:
+  case ARM::VLD4DUPqWB_fixed_Asm_16:
+  case ARM::VLD4DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_register_Asm_8:
+  case ARM::VLD4DUPdWB_register_Asm_16:
+  case ARM::VLD4DUPdWB_register_Asm_32:
+  case ARM::VLD4DUPqWB_register_Asm_8:
+  case ARM::VLD4DUPqWB_register_Asm_16:
+  case ARM::VLD4DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4 multiple 4-element structure instructions.
+  case ARM::VLD4dAsm_8:
+  case ARM::VLD4dAsm_16:
+  case ARM::VLD4dAsm_32:
+  case ARM::VLD4qAsm_8:
+  case ARM::VLD4qAsm_16:
+  case ARM::VLD4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_fixed_Asm_8:
+  case ARM::VLD4dWB_fixed_Asm_16:
+  case ARM::VLD4dWB_fixed_Asm_32:
+  case ARM::VLD4qWB_fixed_Asm_8:
+  case ARM::VLD4qWB_fixed_Asm_16:
+  case ARM::VLD4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_register_Asm_8:
+  case ARM::VLD4dWB_register_Asm_16:
+  case ARM::VLD4dWB_register_Asm_32:
+  case ARM::VLD4qWB_register_Asm_8:
+  case ARM::VLD4qWB_register_Asm_16:
+  case ARM::VLD4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST3 multiple 3-element structure instructions.
+  case ARM::VST3dAsm_8:
+  case ARM::VST3dAsm_16:
+  case ARM::VST3dAsm_32:
+  case ARM::VST3qAsm_8:
+  case ARM::VST3qAsm_16:
+  case ARM::VST3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_fixed_Asm_8:
+  case ARM::VST3dWB_fixed_Asm_16:
+  case ARM::VST3dWB_fixed_Asm_32:
+  case ARM::VST3qWB_fixed_Asm_8:
+  case ARM::VST3qWB_fixed_Asm_16:
+  case ARM::VST3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_register_Asm_8:
+  case ARM::VST3dWB_register_Asm_16:
+  case ARM::VST3dWB_register_Asm_32:
+  case ARM::VST3qWB_register_Asm_8:
+  case ARM::VST3qWB_register_Asm_16:
+  case ARM::VST3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST4 multiple 3-element structure instructions.
+  case ARM::VST4dAsm_8:
+  case ARM::VST4dAsm_16:
+  case ARM::VST4dAsm_32:
+  case ARM::VST4qAsm_8:
+  case ARM::VST4qAsm_16:
+  case ARM::VST4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_fixed_Asm_8:
+  case ARM::VST4dWB_fixed_Asm_16:
+  case ARM::VST4dWB_fixed_Asm_32:
+  case ARM::VST4qWB_fixed_Asm_8:
+  case ARM::VST4qWB_fixed_Asm_16:
+  case ARM::VST4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::createReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_register_Asm_8:
+  case ARM::VST4dWB_register_Asm_16:
+  case ARM::VST4dWB_register_Asm_32:
+  case ARM::VST4qWB_register_Asm_8:
+  case ARM::VST4qWB_register_Asm_16:
+  case ARM::VST4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::createReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle encoding choice for the shift-immediate instructions.
+  case ARM::t2LSLri:
+  case ARM::t2LSRri:
+  case ARM::t2ASRri: {
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+          static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ARM::t2LSLri: NewOpc = ARM::tLSLri; break;
+      case ARM::t2LSRri: NewOpc = ARM::tLSRri; break;
+      case ARM::t2ASRri: NewOpc = ARM::tASRri; break;
+      }
+      // The Thumb1 operands aren't in the same order. Awesome, eh?
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+
+  // Handle the Thumb2 mode MOV complex aliases.
+  case ARM::t2MOVsr:
+  case ARM::t2MOVSsr: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        isARMLowRegister(Inst.getOperand(2).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(3).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRrr : ARM::t2ASRrr; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRrr : ARM::t2LSRrr; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLrr : ARM::t2LSLrr; break;
+    case ARM_AM::ror: newOpc = isNarrow ? ARM::tROR   : ARM::t2RORrr; break;
+    }
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::createReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::createReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2MOVsi:
+  case ARM::t2MOVSsi: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+    case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+    case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+    }
+    unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+    if (Amount == 32) Amount = 0;
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::createReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (newOpc != ARM::t2RRX)
+      TmpInst.addOperand(MCOperand::createImm(Amount));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::createReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  // Handle the ARM mode MOV complex aliases.
+  case ARM::ASRr:
+  case ARM::LSRr:
+  case ARM::LSLr:
+  case ARM::RORr: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRr: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRr: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLr: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORr: ShiftTy = ARM_AM::ror; break;
+    }
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsr);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::ASRi:
+  case ARM::LSRi:
+  case ARM::LSLi:
+  case ARM::RORi: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRi: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRi: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLi: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORi: ShiftTy = ARM_AM::ror; break;
+    }
+    // A shift by zero is a plain MOVr, not a MOVsi.
+    unsigned Amt = Inst.getOperand(2).getImm();
+    unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi;
+    // A shift by 32 should be encoded as 0 when permitted
+    if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr))
+      Amt = 0;
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt);
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (Opc == ARM::MOVsi)
+      TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::RRXi: {
+    unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsi);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::createImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2LDMIA_UPD: {
+    // If this is a load of a single register, then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2LDR_POST);
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::createImm(4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2STMDB_UPD: {
+    // If this is a store of a single register, then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2STR_PRE);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::createImm(-4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::LDMIA_UPD:
+    // If this is a load of a single register via a 'pop', then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "pop" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::LDR_POST_IMM);
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(1)); // Rn
+      TmpInst.addOperand(MCOperand::createReg(0));  // am2offset
+      TmpInst.addOperand(MCOperand::createImm(4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  case ARM::STMDB_UPD:
+    // If this is a store of a single register via a 'push', then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() == "push" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::STR_PRE_IMM);
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+      TmpInst.addOperand(MCOperand::createImm(-4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  case ARM::t2ADDri12:
+    // If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
+    // mnemonic was used (not "addw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "add" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2ADDri);
+    Inst.addOperand(MCOperand::createReg(0)); // cc_out
+    break;
+  case ARM::t2SUBri12:
+    // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
+    // mnemonic was used (not "subw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand &>(*Operands[0]).getToken() != "sub" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2SUBri);
+    Inst.addOperand(MCOperand::createReg(0)); // cc_out
+    break;
+  case ARM::tADDi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tADDi3);
+      return true;
+    }
+    break;
+  case ARM::tSUBi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tSUBi3);
+      return true;
+    }
+    break;
+  case ARM::t2ADDri:
+  case ARM::t2SUBri: {
+    // If the destination and first source operand are the same, and
+    // the flags are compatible with the current IT status, use encoding T2
+    // instead of T3. For compatibility with the system 'as'. Make sure the
+    // wide encoding wasn't explicit.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+        !isARMLowRegister(Inst.getOperand(0).getReg()) ||
+        (unsigned)Inst.getOperand(2).getImm() > 255 ||
+        ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
+                      ARM::tADDi8 : ARM::tSUBi8);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(5));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2ADDrr: {
+    // If the destination and first source operand are the same, and
+    // there's no setting of the flags, use encoding T2 instead of T3.
+    // Note that this is only for ADD, not SUB. This mirrors the system
+    // 'as' behaviour.  Also take advantage of ADD being commutative.
+    // Make sure the wide encoding wasn't explicit.
+    bool Swap = false;
+    auto DestReg = Inst.getOperand(0).getReg();
+    bool Transform = DestReg == Inst.getOperand(1).getReg();
+    if (!Transform && DestReg == Inst.getOperand(2).getReg()) {
+      Transform = true;
+      Swap = true;
+    }
+    if (!Transform ||
+        Inst.getOperand(5).getReg() != 0 ||
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::tADDhirr);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(Swap ? 1 : 2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need to use the 32-bit encoding if it's available.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      Inst.setOpcode(ARM::t2ADDrr);
+      Inst.addOperand(MCOperand::createReg(0)); // cc_out
+      return true;
+    }
+    break;
+  }
+  case ARM::tB:
+    // A Thumb conditional branch outside of an IT block is a tBcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) {
+      Inst.setOpcode(ARM::tBcc);
+      return true;
+    }
+    break;
+  case ARM::t2B:
+    // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()){
+      Inst.setOpcode(ARM::t2Bcc);
+      return true;
+    }
+    break;
+  case ARM::t2Bcc:
+    // If the conditional is AL or we're in an IT block, we really want t2B.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) {
+      Inst.setOpcode(ARM::t2B);
+      return true;
+    }
+    break;
+  case ARM::tBcc:
+    // If the conditional is AL, we really want tB.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL) {
+      Inst.setOpcode(ARM::tB);
+      return true;
+    }
+    break;
+  case ARM::tLDMIA: {
+    // If the register list contains any high registers, or if the writeback
+    // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+    // instead if we're in Thumb2. Otherwise, this should have generated
+    // an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
+         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+        (!listContainsBase && !hasWritebackToken) ||
+        (listContainsBase && hasWritebackToken)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+      // If we're switching to the updating version, we need to insert
+      // the writeback tied operand.
+      if (hasWritebackToken)
+        Inst.insert(Inst.begin(),
+                    MCOperand::createReg(Inst.getOperand(0).getReg()));
+      return true;
+    }
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(ARM::t2STMIA_UPD);
+      return true;
+    }
+    break;
+  }
+  case ARM::tPOP: {
+    bool listContainsBase;
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2LDMIA_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+    return true;
+  }
+  case ARM::tPUSH: {
+    bool listContainsBase;
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2STMDB_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::createReg(ARM::SP));
+    return true;
+  }
+  case ARM::t2MOVi: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        (unsigned)Inst.getOperand(1).getImm() <= 255 &&
+        ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+          Inst.getOperand(4).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      // The operands aren't in the same order for tMOVi8...
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2MOVr: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == ARMCC::AL &&
+        Inst.getOperand(4).getReg() == ARM::CPSR &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      // The operands aren't the same for tMOV[S]r... (no cc_out)
+      MCInst TmpInst;
+      TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2SXTH:
+  case ARM::t2SXTB:
+  case ARM::t2UXTH:
+  case ARM::t2UXTB: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
+         static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("Illegal opcode!");
+      case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+      case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+      case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+      case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+      }
+      // The operands aren't the same for thumb1 (no rotate operand).
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::MOVsi: {
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
+    // rrx shifts and asr/lsr of #32 is encoded as 0
+    if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) 
+      return false;
+    if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
+      // Shifting by zero is accepted as a vanilla 'MOVr'
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ANDrsi:
+  case ARM::ORRrsi:
+  case ARM::EORrsi:
+  case ARM::BICrsi:
+  case ARM::SUBrsi:
+  case ARM::ADDrsi: {
+    unsigned newOpc;
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(3).getImm());
+    if (SOpc == ARM_AM::rrx) return false;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ANDrsi: newOpc = ARM::ANDrr; break;
+    case ARM::ORRrsi: newOpc = ARM::ORRrr; break;
+    case ARM::EORrsi: newOpc = ARM::EORrr; break;
+    case ARM::BICrsi: newOpc = ARM::BICrr; break;
+    case ARM::SUBrsi: newOpc = ARM::SUBrr; break;
+    case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
+    }
+    // If the shift is by zero, use the non-shifted instruction definition.
+    // The exception is for right shifts, where 0 == 32
+    if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+        !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(newOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(6));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ITasm:
+  case ARM::t2IT: {
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned Mask = MO.getImm();
+    ARMCC::CondCodes Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+
+    // Set up the IT block state according to the IT instruction we just
+    // matched.
+    assert(!inITBlock() && "nested IT blocks?!");
+    startExplicitITBlock(Cond, Mask);
+    MO.setImm(getITMaskEncoding());
+    break;
+  }
+  case ARM::t2LSLrr:
+  case ARM::t2LSRrr:
+  case ARM::t2ASRrr:
+  case ARM::t2SBCrr:
+  case ARM::t2RORrr:
+  case ARM::t2BICrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+        (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+         !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+             ".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break;
+        case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break;
+        case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break;
+        case ARM::t2SBCrr: NewOpc = ARM::tSBC; break;
+        case ARM::t2RORrr: NewOpc = ARM::tROR; break;
+        case ARM::t2BICrr: NewOpc = ARM::tBIC; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::t2ANDrr:
+  case ARM::t2EORrr:
+  case ARM::t2ADCrr:
+  case ARM::t2ORRrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    // These instructions are special in that they are commutable, so shorter encodings
+    // are available more often.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
+         Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
+        (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
+         !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
+             ".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2ADCrr: NewOpc = ARM::tADC; break;
+        case ARM::t2ANDrr: NewOpc = ARM::tAND; break;
+        case ARM::t2EORrr: NewOpc = ARM::tEOR; break;
+        case ARM::t2ORRrr: NewOpc = ARM::tORR; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+      } else {
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(1));
+      }
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  }
+  return false;
+}
+
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // 16-bit thumb arithmetic instructions either require or preclude the 'S'
+  // suffix depending on whether they're in an IT block or not.
+  unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &MCID = MII.get(Opc);
+  if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+    assert(MCID.hasOptionalDef() &&
+           "optionally flag setting instruction missing optional def operand");
+    assert(MCID.NumOperands == Inst.getNumOperands() &&
+           "operand count mismatch!");
+    // Find the optional-def operand (cc_out).
+    unsigned OpNo;
+    for (OpNo = 0;
+         !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands;
+         ++OpNo)
+      ;
+    // If we're parsing Thumb1, reject it completely.
+    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+      return Match_RequiresFlagSetting;
+    // If we're parsing Thumb2, which form is legal depends on whether we're
+    // in an IT block.
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+        !inITBlock())
+      return Match_RequiresITBlock;
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+        inITBlock())
+      return Match_RequiresNotITBlock;
+  } else if (isThumbOne()) {
+    // Some high-register supporting Thumb1 encodings only allow both registers
+    // to be from r0-r7 when in Thumb2.
+    if (Opc == ARM::tADDhirr && !hasV6MOps() &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        isARMLowRegister(Inst.getOperand(2).getReg()))
+      return Match_RequiresThumb2;
+    // Others only require ARMv6 or later.
+    else if (Opc == ARM::tMOVr && !hasV6Ops() &&
+             isARMLowRegister(Inst.getOperand(0).getReg()) &&
+             isARMLowRegister(Inst.getOperand(1).getReg()))
+      return Match_RequiresV6;
+  }
+
+  for (unsigned I = 0; I < MCID.NumOperands; ++I)
+    if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
+      // rGPRRegClass excludes PC, and also excluded SP before ARMv8
+      if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops())
+        return Match_RequiresV8;
+      else if (Inst.getOperand(I).getReg() == ARM::PC)
+        return Match_InvalidOperand;
+    }
+
+  return Match_Success;
+}
+
+namespace llvm {
+template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
+  return true; // In an assembly source, no need to second-guess
+}
+}
+
+// Returns true if Inst is unpredictable if it is in and IT block, but is not
+// the last instruction in the block.
+bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
+  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+
+  // All branch & call instructions terminate IT blocks.
+  if (MCID.isTerminator() || MCID.isCall() || MCID.isReturn() ||
+      MCID.isBranch() || MCID.isIndirectBranch())
+    return true;
+
+  // Any arithmetic instruction which writes to the PC also terminates the IT
+  // block.
+  for (unsigned OpIdx = 0; OpIdx < MCID.getNumDefs(); ++OpIdx) {
+    MCOperand &Op = Inst.getOperand(OpIdx);
+    if (Op.isReg() && Op.getReg() == ARM::PC)
+      return true;
+  }
+
+  if (MCID.hasImplicitDefOfPhysReg(ARM::PC, MRI))
+    return true;
+
+  // Instructions with variable operand lists, which write to the variable
+  // operands. We only care about Thumb instructions here, as ARM instructions
+  // obviously can't be in an IT block.
+  switch (Inst.getOpcode()) {
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+    if (listContainsReg(Inst, 3, ARM::PC))
+      return true;
+    break;
+  case ARM::tPOP:
+    if (listContainsReg(Inst, 2, ARM::PC))
+      return true;
+    break;
+  }
+
+  return false;
+}
+
+unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst,
+                                          uint64_t &ErrorInfo,
+                                          bool MatchingInlineAsm,
+                                          bool &EmitInITBlock,
+                                          MCStreamer &Out) {
+  // If we can't use an implicit IT block here, just match as normal.
+  if (inExplicitITBlock() || !isThumbTwo() || !useImplicitITThumb())
+    return MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  // Try to match the instruction in an extension of the current IT block (if
+  // there is one).
+  if (inImplicitITBlock()) {
+    extendImplicitITBlock(ITState.Cond);
+    if (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm) ==
+            Match_Success) {
+      // The match succeded, but we still have to check that the instruction is
+      // valid in this implicit IT block.
+      const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+      if (MCID.isPredicable()) {
+        ARMCC::CondCodes InstCond =
+            (ARMCC::CondCodes)Inst.getOperand(MCID.findFirstPredOperandIdx())
+                .getImm();
+        ARMCC::CondCodes ITCond = currentITCond();
+        if (InstCond == ITCond) {
+          EmitInITBlock = true;
+          return Match_Success;
+        } else if (InstCond == ARMCC::getOppositeCondition(ITCond)) {
+          invertCurrentITCondition();
+          EmitInITBlock = true;
+          return Match_Success;
+        }
+      }
+    }
+    rewindImplicitITPosition();
+  }
+
+  // Finish the current IT block, and try to match outside any IT block.
+  flushPendingInstructions(Out);
+  unsigned PlainMatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+  if (PlainMatchResult == Match_Success) {
+    const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+    if (MCID.isPredicable()) {
+      ARMCC::CondCodes InstCond =
+          (ARMCC::CondCodes)Inst.getOperand(MCID.findFirstPredOperandIdx())
+              .getImm();
+      // Some forms of the branch instruction have their own condition code
+      // fields, so can be conditionally executed without an IT block.
+      if (Inst.getOpcode() == ARM::tBcc || Inst.getOpcode() == ARM::t2Bcc) {
+        EmitInITBlock = false;
+        return Match_Success;
+      }
+      if (InstCond == ARMCC::AL) {
+        EmitInITBlock = false;
+        return Match_Success;
+      }
+    } else {
+      EmitInITBlock = false;
+      return Match_Success;
+    }
+  }
+
+  // Try to match in a new IT block. The matcher doesn't check the actual
+  // condition, so we create an IT block with a dummy condition, and fix it up
+  // once we know the actual condition.
+  startImplicitITBlock();
+  if (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm) ==
+      Match_Success) {
+    const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+    if (MCID.isPredicable()) {
+      ITState.Cond =
+          (ARMCC::CondCodes)Inst.getOperand(MCID.findFirstPredOperandIdx())
+              .getImm();
+      EmitInITBlock = true;
+      return Match_Success;
+    }
+  }
+  discardImplicitITBlock();
+
+  // If none of these succeed, return the error we got when trying to match
+  // outside any IT blocks.
+  EmitInITBlock = false;
+  return PlainMatchResult;
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+  bool PendConditionalInstruction = false;
+
+  MatchResult = MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
+                                 PendConditionalInstruction, Out);
+
+  switch (MatchResult) {
+  case Match_Success:
+    // Context sensitive operand constraints aren't handled by the matcher,
+    // so check them here.
+    if (validateInstruction(Inst, Operands)) {
+      // Still progress the IT block, otherwise one wrong condition causes
+      // nasty cascading errors.
+      forwardITPosition();
+      return true;
+    }
+
+    { // processInstruction() updates inITBlock state, we need to save it away
+      bool wasInITBlock = inITBlock();
+
+      // Some instructions need post-processing to, for example, tweak which
+      // encoding is selected. Loop on it while changes happen so the
+      // individual transformations can chain off each other. E.g.,
+      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+      while (processInstruction(Inst, Operands, Out))
+        ;
+
+      // Only after the instruction is fully processed, we can validate it
+      if (wasInITBlock && hasV8Ops() && isThumb() &&
+          !isV8EligibleForIT(&Inst)) {
+        Warning(IDLoc, "deprecated instruction in IT block");
+      }
+    }
+
+    // Only move forward at the very end so that everything in validate
+    // and process gets a consistent answer about whether we're in an IT
+    // block.
+    forwardITPosition();
+
+    // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and
+    // doesn't actually encode.
+    if (Inst.getOpcode() == ARM::ITasm)
+      return false;
+
+    Inst.setLoc(IDLoc);
+    if (PendConditionalInstruction) {
+      PendingConditionalInsts.push_back(Inst);
+      if (isITBlockFull() || isITBlockTerminator(Inst))
+        flushPendingInstructions(Out);
+    } else {
+      Out.EmitInstruction(Inst, getSTI());
+    }
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
+    std::string Msg = "instruction requires:";
+    uint64_t Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction",
+                 ((ARMOperand &)*Operands[0]).getLocRange());
+  case Match_RequiresNotITBlock:
+    return Error(IDLoc, "flag setting instruction only valid outside IT block");
+  case Match_RequiresITBlock:
+    return Error(IDLoc, "instruction only valid inside IT block");
+  case Match_RequiresV6:
+    return Error(IDLoc, "instruction variant requires ARMv6 or later");
+  case Match_RequiresThumb2:
+    return Error(IDLoc, "instruction variant requires Thumb2");
+  case Match_RequiresV8:
+    return Error(IDLoc, "instruction variant requires ARMv8 or later");
+  case Match_RequiresFlagSetting:
+    return Error(IDLoc, "no flag-preserving variant of this instruction available");
+  case Match_ImmRange0_15: {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+  }
+  case Match_ImmRange0_239: {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
+  }
+  case Match_AlignedMemoryRequiresNone:
+  case Match_DupAlignedMemoryRequiresNone:
+  case Match_AlignedMemoryRequires16:
+  case Match_DupAlignedMemoryRequires16:
+  case Match_AlignedMemoryRequires32:
+  case Match_DupAlignedMemoryRequires32:
+  case Match_AlignedMemoryRequires64:
+  case Match_DupAlignedMemoryRequires64:
+  case Match_AlignedMemoryRequires64or128:
+  case Match_DupAlignedMemoryRequires64or128:
+  case Match_AlignedMemoryRequires64or128or256:
+  {
+    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getAlignmentLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    switch (MatchResult) {
+      default:
+        llvm_unreachable("Missing Match_Aligned type");
+      case Match_AlignedMemoryRequiresNone:
+      case Match_DupAlignedMemoryRequiresNone:
+        return Error(ErrorLoc, "alignment must be omitted");
+      case Match_AlignedMemoryRequires16:
+      case Match_DupAlignedMemoryRequires16:
+        return Error(ErrorLoc, "alignment must be 16 or omitted");
+      case Match_AlignedMemoryRequires32:
+      case Match_DupAlignedMemoryRequires32:
+        return Error(ErrorLoc, "alignment must be 32 or omitted");
+      case Match_AlignedMemoryRequires64:
+      case Match_DupAlignedMemoryRequires64:
+        return Error(ErrorLoc, "alignment must be 64 or omitted");
+      case Match_AlignedMemoryRequires64or128:
+      case Match_DupAlignedMemoryRequires64or128:
+        return Error(ErrorLoc, "alignment must be 64, 128 or omitted");
+      case Match_AlignedMemoryRequires64or128or256:
+        return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted");
+    }
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// parseDirective parses the arm specific directives
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+  const MCObjectFileInfo::Environment Format =
+    getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+  bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    parseLiteralValues(4, DirectiveID.getLoc());
+  else if (IDVal == ".short" || IDVal == ".hword")
+    parseLiteralValues(2, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    parseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".arm")
+    parseDirectiveARM(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    parseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    parseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    parseDirectiveSyntax(DirectiveID.getLoc());
+  else if (IDVal == ".unreq")
+    parseDirectiveUnreq(DirectiveID.getLoc());
+  else if (IDVal == ".fnend")
+    parseDirectiveFnEnd(DirectiveID.getLoc());
+  else if (IDVal == ".cantunwind")
+    parseDirectiveCantUnwind(DirectiveID.getLoc());
+  else if (IDVal == ".personality")
+    parseDirectivePersonality(DirectiveID.getLoc());
+  else if (IDVal == ".handlerdata")
+    parseDirectiveHandlerData(DirectiveID.getLoc());
+  else if (IDVal == ".setfp")
+    parseDirectiveSetFP(DirectiveID.getLoc());
+  else if (IDVal == ".pad")
+    parseDirectivePad(DirectiveID.getLoc());
+  else if (IDVal == ".save")
+    parseDirectiveRegSave(DirectiveID.getLoc(), false);
+  else if (IDVal == ".vsave")
+    parseDirectiveRegSave(DirectiveID.getLoc(), true);
+  else if (IDVal == ".ltorg" || IDVal == ".pool")
+    parseDirectiveLtorg(DirectiveID.getLoc());
+  else if (IDVal == ".even")
+    parseDirectiveEven(DirectiveID.getLoc());
+  else if (IDVal == ".personalityindex")
+    parseDirectivePersonalityIndex(DirectiveID.getLoc());
+  else if (IDVal == ".unwind_raw")
+    parseDirectiveUnwindRaw(DirectiveID.getLoc());
+  else if (IDVal == ".movsp")
+    parseDirectiveMovSP(DirectiveID.getLoc());
+  else if (IDVal == ".arch_extension")
+    parseDirectiveArchExtension(DirectiveID.getLoc());
+  else if (IDVal == ".align")
+    return parseDirectiveAlign(DirectiveID.getLoc()); // Use Generic on failure.
+  else if (IDVal == ".thumb_set")
+    parseDirectiveThumbSet(DirectiveID.getLoc());
+  else if (!IsMachO && !IsCOFF) {
+    if (IDVal == ".arch")
+      parseDirectiveArch(DirectiveID.getLoc());
+    else if (IDVal == ".cpu")
+      parseDirectiveCPU(DirectiveID.getLoc());
+    else if (IDVal == ".eabi_attribute")
+      parseDirectiveEabiAttr(DirectiveID.getLoc());
+    else if (IDVal == ".fpu")
+      parseDirectiveFPU(DirectiveID.getLoc());
+    else if (IDVal == ".fnstart")
+      parseDirectiveFnStart(DirectiveID.getLoc());
+    else if (IDVal == ".inst")
+      parseDirectiveInst(DirectiveID.getLoc());
+    else if (IDVal == ".inst.n")
+      parseDirectiveInst(DirectiveID.getLoc(), 'n');
+    else if (IDVal == ".inst.w")
+      parseDirectiveInst(DirectiveID.getLoc(), 'w');
+    else if (IDVal == ".object_arch")
+      parseDirectiveObjectArch(DirectiveID.getLoc());
+    else if (IDVal == ".tlsdescseq")
+      parseDirectiveTLSDescSeq(DirectiveID.getLoc());
+    else
+      return true;
+  } else
+    return true;
+  return false;
+}
+
+/// parseLiteralValues
+///  ::= .hword expression [, expression]*
+///  ::= .short expression [, expression]*
+///  ::= .word expression [, expression]*
+bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
+  auto parseOne = [&]() -> bool {
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+    getParser().getStreamer().EmitValue(Value, Size, L);
+    return false;
+  };
+  return (parseMany(parseOne));
+}
+
+/// parseDirectiveThumb
+///  ::= .thumb
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive") ||
+      check(!hasThumb(), L, "target does not support Thumb mode"))
+    return true;
+
+  if (!isThumb())
+    SwitchMode();
+
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  return false;
+}
+
+/// parseDirectiveARM
+///  ::= .arm
+bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive") ||
+      check(!hasARM(), L, "target does not support ARM mode"))
+    return true;
+
+  if (isThumb())
+    SwitchMode();
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  return false;
+}
+
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+  // We need to flush the current implicit IT block on a label, because it is
+  // not legal to branch into an IT block.
+  flushPendingInstructions(getStreamer());
+  if (NextSymbolIsThumb) {
+    getParser().getStreamer().EmitThumbFunc(Symbol);
+    NextSymbolIsThumb = false;
+  }
+}
+
+/// parseDirectiveThumbFunc
+///  ::= .thumbfunc symbol_name
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const auto Format = getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+
+  // Darwin asm has (optionally) function name after .thumb_func direction
+  // ELF doesn't
+
+  if (IsMachO) {
+    if (Parser.getTok().is(AsmToken::Identifier) ||
+        Parser.getTok().is(AsmToken::String)) {
+      MCSymbol *Func = getParser().getContext().getOrCreateSymbol(
+          Parser.getTok().getIdentifier());
+      getParser().getStreamer().EmitThumbFunc(Func);
+      Parser.Lex();
+      if (parseToken(AsmToken::EndOfStatement,
+                     "unexpected token in '.thumb_func' directive"))
+        return true;
+      return false;
+    }
+  }
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.thumb_func' directive"))
+    return true;
+
+  NextSymbolIsThumb = true;
+  return false;
+}
+
+/// parseDirectiveSyntax
+///  ::= .syntax unified | divided
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(L, "unexpected token in .syntax directive");
+    return false;
+  }
+
+  StringRef Mode = Tok.getString();
+  Parser.Lex();
+  if (check(Mode == "divided" || Mode == "DIVIDED", L,
+            "'.syntax divided' arm assembly not supported") ||
+      check(Mode != "unified" && Mode != "UNIFIED", L,
+            "unrecognized syntax mode in .syntax directive") ||
+      parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// parseDirectiveCode
+///  ::= .code 16 | 32
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Integer))
+    return Error(L, "unexpected token in .code directive");
+  int64_t Val = Parser.getTok().getIntVal();
+  if (Val != 16 && Val != 32) {
+    Error(L, "invalid operand to .code directive");
+    return false;
+  }
+  Parser.Lex();
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+
+  if (Val == 16) {
+    if (!hasThumb())
+      return Error(L, "target does not support Thumb mode");
+
+    if (!isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  } else {
+    if (!hasARM())
+      return Error(L, "target does not support ARM mode");
+
+    if (isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  }
+
+  return false;
+}
+
+/// parseDirectiveReq
+///  ::= name .req registername
+bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat the '.req' token.
+  unsigned Reg;
+  SMLoc SRegLoc, ERegLoc;
+  if (check(ParseRegister(Reg, SRegLoc, ERegLoc), SRegLoc,
+            "register name expected") ||
+      parseToken(AsmToken::EndOfStatement,
+                 "unexpected input in .req directive."))
+    return true;
+
+  if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg)
+    return Error(SRegLoc,
+                 "redefinition of '" + Name + "' does not match original.");
+
+  return false;
+}
+
+/// parseDirectiveUneq
+///  ::= .unreq registername
+bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return Error(L, "unexpected input in .unreq directive.");
+  RegisterReqs.erase(Parser.getTok().getIdentifier().lower());
+  Parser.Lex(); // Eat the identifier.
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected input in '.unreq' directive"))
+    return true;
+  return false;
+}
+
+// After changing arch/CPU, try to put the ARM/Thumb mode back to what it was
+// before, if supported by the new target, or emit mapping symbols for the mode
+// switch.
+void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
+  if (WasThumb != isThumb()) {
+    if (WasThumb && hasThumb()) {
+      // Stay in Thumb mode
+      SwitchMode();
+    } else if (!WasThumb && hasARM()) {
+      // Stay in ARM mode
+      SwitchMode();
+    } else {
+      // Mode switch forced, because the new arch doesn't support the old mode.
+      getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+                                                            : MCAF_Code32);
+      // Warn about the implcit mode switch. GAS does not switch modes here,
+      // but instead stays in the old mode, reporting an error on any following
+      // instructions as the mode does not exist on the target.
+      Warning(Loc, Twine("new target does not support ") +
+                       (WasThumb ? "thumb" : "arm") + " mode, switching to " +
+                       (!WasThumb ? "thumb" : "arm") + " mode");
+    }
+  }
+}
+
+/// parseDirectiveArch
+///  ::= .arch token
+bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
+  StringRef Arch = getParser().parseStringToEndOfStatement().trim();
+  unsigned ID = ARM::parseArch(Arch);
+
+  if (ID == ARM::AK_INVALID)
+    return Error(L, "Unknown arch name");
+
+  bool WasThumb = isThumb();
+  Triple T;
+  MCSubtargetInfo &STI = copySTI();
+  STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str());
+  setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  FixModeAfterArchChange(WasThumb, L);
+
+  getTargetStreamer().emitArch(ID);
+  return false;
+}
+
+/// parseDirectiveEabiAttr
+///  ::= .eabi_attribute int, int [, "str"]
+///  ::= .eabi_attribute Tag_name, int [, "str"]
+bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  int64_t Tag;
+  SMLoc TagLoc;
+  TagLoc = Parser.getTok().getLoc();
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    StringRef Name = Parser.getTok().getIdentifier();
+    Tag = ARMBuildAttrs::AttrTypeFromString(Name);
+    if (Tag == -1) {
+      Error(TagLoc, "attribute name not recognised: " + Name);
+      return false;
+    }
+    Parser.Lex();
+  } else {
+    const MCExpr *AttrExpr;
+
+    TagLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(AttrExpr))
+      return true;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+    if (check(!CE, TagLoc, "expected numeric constant"))
+      return true;
+
+    Tag = CE->getValue();
+  }
+
+  if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+    return true;
+
+  StringRef StringValue = "";
+  bool IsStringValue = false;
+
+  int64_t IntegerValue = 0;
+  bool IsIntegerValue = false;
+
+  if (Tag == ARMBuildAttrs::CPU_raw_name || Tag == ARMBuildAttrs::CPU_name)
+    IsStringValue = true;
+  else if (Tag == ARMBuildAttrs::compatibility) {
+    IsStringValue = true;
+    IsIntegerValue = true;
+  } else if (Tag < 32 || Tag % 2 == 0)
+    IsIntegerValue = true;
+  else if (Tag % 2 == 1)
+    IsStringValue = true;
+  else
+    llvm_unreachable("invalid tag type");
+
+  if (IsIntegerValue) {
+    const MCExpr *ValueExpr;
+    SMLoc ValueExprLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(ValueExpr))
+      return true;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+    if (!CE)
+      return Error(ValueExprLoc, "expected numeric constant");
+    IntegerValue = CE->getValue();
+  }
+
+  if (Tag == ARMBuildAttrs::compatibility) {
+    if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+      return true;
+  }
+
+  if (IsStringValue) {
+    if (Parser.getTok().isNot(AsmToken::String))
+      return Error(Parser.getTok().getLoc(), "bad string constant");
+
+    StringValue = Parser.getTok().getStringContents();
+    Parser.Lex();
+  }
+
+  if (Parser.parseToken(AsmToken::EndOfStatement,
+                        "unexpected token in '.eabi_attribute' directive"))
+    return true;
+
+  if (IsIntegerValue && IsStringValue) {
+    assert(Tag == ARMBuildAttrs::compatibility);
+    getTargetStreamer().emitIntTextAttribute(Tag, IntegerValue, StringValue);
+  } else if (IsIntegerValue)
+    getTargetStreamer().emitAttribute(Tag, IntegerValue);
+  else if (IsStringValue)
+    getTargetStreamer().emitTextAttribute(Tag, StringValue);
+  return false;
+}
+
+/// parseDirectiveCPU
+///  ::= .cpu str
+bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+  StringRef CPU = getParser().parseStringToEndOfStatement().trim();
+  getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+
+  // FIXME: This is using table-gen data, but should be moved to
+  // ARMTargetParser once that is table-gen'd.
+  if (!getSTI().isCPUStringValid(CPU))
+    return Error(L, "Unknown CPU name");
+
+  bool WasThumb = isThumb();
+  MCSubtargetInfo &STI = copySTI();
+  STI.setDefaultFeatures(CPU, "");
+  setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  FixModeAfterArchChange(WasThumb, L);
+
+  return false;
+}
+/// parseDirectiveFPU
+///  ::= .fpu str
+bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+  SMLoc FPUNameLoc = getTok().getLoc();
+  StringRef FPU = getParser().parseStringToEndOfStatement().trim();
+
+  unsigned ID = ARM::parseFPU(FPU);
+  std::vector<StringRef> Features;
+  if (!ARM::getFPUFeatures(ID, Features))
+    return Error(FPUNameLoc, "Unknown FPU name");
+
+  MCSubtargetInfo &STI = copySTI();
+  for (auto Feature : Features)
+    STI.ApplyFeatureFlag(Feature);
+  setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+  getTargetStreamer().emitFPU(ID);
+  return false;
+}
+
+/// parseDirectiveFnStart
+///  ::= .fnstart
+bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.fnstart' directive"))
+    return true;
+
+  if (UC.hasFnStart()) {
+    Error(L, ".fnstart starts before the end of previous one");
+    UC.emitFnStartLocNotes();
+    return true;
+  }
+
+  // Reset the unwind directives parser state
+  UC.reset();
+
+  getTargetStreamer().emitFnStart();
+
+  UC.recordFnStart(L);
+  return false;
+}
+
+/// parseDirectiveFnEnd
+///  ::= .fnend
+bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.fnend' directive"))
+    return true;
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .fnend directive");
+
+  // Reset the unwind directives parser state
+  getTargetStreamer().emitFnEnd();
+
+  UC.reset();
+  return false;
+}
+
+/// parseDirectiveCantUnwind
+///  ::= .cantunwind
+bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.cantunwind' directive"))
+    return true;
+
+  UC.recordCantUnwind(L);
+  // Check the ordering of unwind directives
+  if (check(!UC.hasFnStart(), L, ".fnstart must precede .cantunwind directive"))
+    return true;
+
+  if (UC.hasHandlerData()) {
+    Error(L, ".cantunwind can't be used with .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return true;
+  }
+  if (UC.hasPersonality()) {
+    Error(L, ".cantunwind can't be used with .personality directive");
+    UC.emitPersonalityLocNotes();
+    return true;
+  }
+
+  getTargetStreamer().emitCantUnwind();
+  return false;
+}
+
+/// parseDirectivePersonality
+///  ::= .personality name
+bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  bool HasExistingPersonality = UC.hasPersonality();
+
+  // Parse the name of the personality routine
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return Error(L, "unexpected input in .personality directive.");
+  StringRef Name(Parser.getTok().getIdentifier());
+  Parser.Lex();
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.personality' directive"))
+    return true;
+
+  UC.recordPersonality(L);
+
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .personality directive");
+  if (UC.cantUnwind()) {
+    Error(L, ".personality can't be used with .cantunwind directive");
+    UC.emitCantUnwindLocNotes();
+    return true;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".personality must precede .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return true;
+  }
+  if (HasExistingPersonality) {
+    Error(L, "multiple personality directives");
+    UC.emitPersonalityLocNotes();
+    return true;
+  }
+
+  MCSymbol *PR = getParser().getContext().getOrCreateSymbol(Name);
+  getTargetStreamer().emitPersonality(PR);
+  return false;
+}
+
+/// parseDirectiveHandlerData
+///  ::= .handlerdata
+bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.handlerdata' directive"))
+    return true;
+
+  UC.recordHandlerData(L);
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .personality directive");
+  if (UC.cantUnwind()) {
+    Error(L, ".handlerdata can't be used with .cantunwind directive");
+    UC.emitCantUnwindLocNotes();
+    return true;
+  }
+
+  getTargetStreamer().emitHandlerData();
+  return false;
+}
+
+/// parseDirectiveSetFP
+///  ::= .setfp fpreg, spreg [, offset]
+bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  // Check the ordering of unwind directives
+  if (check(!UC.hasFnStart(), L, ".fnstart must precede .setfp directive") ||
+      check(UC.hasHandlerData(), L,
+            ".setfp must precede .handlerdata directive"))
+    return true;
+
+  // Parse fpreg
+  SMLoc FPRegLoc = Parser.getTok().getLoc();
+  int FPReg = tryParseRegister();
+
+  if (check(FPReg == -1, FPRegLoc, "frame pointer register expected") ||
+      Parser.parseToken(AsmToken::Comma, "comma expected"))
+    return true;
+
+  // Parse spreg
+  SMLoc SPRegLoc = Parser.getTok().getLoc();
+  int SPReg = tryParseRegister();
+  if (check(SPReg == -1, SPRegLoc, "stack pointer register expected") ||
+      check(SPReg != ARM::SP && SPReg != UC.getFPReg(), SPRegLoc,
+            "register should be either $sp or the latest fp register"))
+    return true;
+
+  // Update the frame pointer register
+  UC.saveFPReg(FPReg);
+
+  // Parse offset
+  int64_t Offset = 0;
+  if (Parser.parseOptionalToken(AsmToken::Comma)) {
+    if (Parser.getTok().isNot(AsmToken::Hash) &&
+        Parser.getTok().isNot(AsmToken::Dollar))
+      return Error(Parser.getTok().getLoc(), "'#' expected");
+    Parser.Lex(); // skip hash token.
+
+    const MCExpr *OffsetExpr;
+    SMLoc ExLoc = Parser.getTok().getLoc();
+    SMLoc EndLoc;
+    if (getParser().parseExpression(OffsetExpr, EndLoc))
+      return Error(ExLoc, "malformed setfp offset");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+    if (check(!CE, ExLoc, "setfp offset must be an immediate"))
+      return true;
+    Offset = CE->getValue();
+  }
+
+  if (Parser.parseToken(AsmToken::EndOfStatement))
+    return true;
+
+  getTargetStreamer().emitSetFP(static_cast<unsigned>(FPReg),
+                                static_cast<unsigned>(SPReg), Offset);
+  return false;
+}
+
+/// parseDirective
+///  ::= .pad offset
+bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .pad directive");
+  if (UC.hasHandlerData())
+    return Error(L, ".pad must precede .handlerdata directive");
+
+  // Parse the offset
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar))
+    return Error(Parser.getTok().getLoc(), "'#' expected");
+  Parser.Lex(); // skip hash token.
+
+  const MCExpr *OffsetExpr;
+  SMLoc ExLoc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(OffsetExpr, EndLoc))
+    return Error(ExLoc, "malformed pad offset");
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+  if (!CE)
+    return Error(ExLoc, "pad offset must be an immediate");
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.pad' directive"))
+    return true;
+
+  getTargetStreamer().emitPad(CE->getValue());
+  return false;
+}
+
+/// parseDirectiveRegSave
+///  ::= .save  { registers }
+///  ::= .vsave { registers }
+bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
+  // Check the ordering of unwind directives
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .save or .vsave directives");
+  if (UC.hasHandlerData())
+    return Error(L, ".save or .vsave must precede .handlerdata directive");
+
+  // RAII object to make sure parsed operands are deleted.
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+
+  // Parse the register list
+  if (parseRegisterList(Operands) ||
+      parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+  ARMOperand &Op = (ARMOperand &)*Operands[0];
+  if (!IsVector && !Op.isRegList())
+    return Error(L, ".save expects GPR registers");
+  if (IsVector && !Op.isDPRRegList())
+    return Error(L, ".vsave expects DPR registers");
+
+  getTargetStreamer().emitRegSave(Op.getRegList(), IsVector);
+  return false;
+}
+
+/// parseDirectiveInst
+///  ::= .inst opcode [, ...]
+///  ::= .inst.n opcode [, ...]
+///  ::= .inst.w opcode [, ...]
+bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
+  int Width = 4;
+
+  if (isThumb()) {
+    switch (Suffix) {
+    case 'n':
+      Width = 2;
+      break;
+    case 'w':
+      break;
+    default:
+      return Error(Loc, "cannot determine Thumb instruction size, "
+                        "use inst.n/inst.w instead");
+    }
+  } else {
+    if (Suffix)
+      return Error(Loc, "width suffixes are invalid in ARM mode");
+  }
+
+  auto parseOne = [&]() -> bool {
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+    if (!Value) {
+      return Error(Loc, "expected constant expression");
+    }
+
+    switch (Width) {
+    case 2:
+      if (Value->getValue() > 0xffff)
+        return Error(Loc, "inst.n operand is too big, use inst.w instead");
+      break;
+    case 4:
+      if (Value->getValue() > 0xffffffff)
+        return Error(Loc, StringRef(Suffix ? "inst.w" : "inst") +
+                              " operand is too big");
+      break;
+    default:
+      llvm_unreachable("only supported widths are 2 and 4");
+    }
+
+    getTargetStreamer().emitInst(Value->getValue(), Suffix);
+    return false;
+  };
+
+  if (parseOptionalToken(AsmToken::EndOfStatement))
+    return Error(Loc, "expected expression following directive");
+  if (parseMany(parseOne))
+    return true;
+  return false;
+}
+
+/// parseDirectiveLtorg
+///  ::= .ltorg | .pool
+bool ARMAsmParser::parseDirectiveLtorg(SMLoc L) {
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+  getTargetStreamer().emitCurrentConstantPool();
+  return false;
+}
+
+bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
+  const MCSection *Section = getStreamer().getCurrentSectionOnly();
+
+  if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
+    return true;
+
+  if (!Section) {
+    getStreamer().InitSections(false);
+    Section = getStreamer().getCurrentSectionOnly();
+  }
+
+  assert(Section && "must have section to emit alignment");
+  if (Section->UseCodeAlign())
+    getStreamer().EmitCodeAlignment(2);
+  else
+    getStreamer().EmitValueToAlignment(2);
+
+  return false;
+}
+
+/// parseDirectivePersonalityIndex
+///   ::= .personalityindex index
+bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  bool HasExistingPersonality = UC.hasPersonality();
+
+  const MCExpr *IndexExpression;
+  SMLoc IndexLoc = Parser.getTok().getLoc();
+  if (Parser.parseExpression(IndexExpression) ||
+      parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.personalityindex' directive")) {
+    return true;
+  }
+
+  UC.recordPersonalityIndex(L);
+
+  if (!UC.hasFnStart()) {
+    return Error(L, ".fnstart must precede .personalityindex directive");
+  }
+  if (UC.cantUnwind()) {
+    Error(L, ".personalityindex cannot be used with .cantunwind");
+    UC.emitCantUnwindLocNotes();
+    return true;
+  }
+  if (UC.hasHandlerData()) {
+    Error(L, ".personalityindex must precede .handlerdata directive");
+    UC.emitHandlerDataLocNotes();
+    return true;
+  }
+  if (HasExistingPersonality) {
+    Error(L, "multiple personality directives");
+    UC.emitPersonalityLocNotes();
+    return true;
+  }
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IndexExpression);
+  if (!CE)
+    return Error(IndexLoc, "index must be a constant number");
+  if (CE->getValue() < 0 || CE->getValue() >= ARM::EHABI::NUM_PERSONALITY_INDEX)
+    return Error(IndexLoc,
+                 "personality routine index should be in range [0-3]");
+
+  getTargetStreamer().emitPersonalityIndex(CE->getValue());
+  return false;
+}
+
+/// parseDirectiveUnwindRaw
+///   ::= .unwind_raw offset, opcode [, opcode...]
+bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  int64_t StackOffset;
+  const MCExpr *OffsetExpr;
+  SMLoc OffsetLoc = getLexer().getLoc();
+
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .unwind_raw directives");
+  if (getParser().parseExpression(OffsetExpr))
+    return Error(OffsetLoc, "expected expression");
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+  if (!CE)
+    return Error(OffsetLoc, "offset must be a constant");
+
+  StackOffset = CE->getValue();
+
+  if (Parser.parseToken(AsmToken::Comma, "expected comma"))
+    return true;
+
+  SmallVector<uint8_t, 16> Opcodes;
+
+  auto parseOne = [&]() -> bool {
+    const MCExpr *OE;
+    SMLoc OpcodeLoc = getLexer().getLoc();
+    if (check(getLexer().is(AsmToken::EndOfStatement) ||
+                  Parser.parseExpression(OE),
+              OpcodeLoc, "expected opcode expression"))
+      return true;
+    const MCConstantExpr *OC = dyn_cast<MCConstantExpr>(OE);
+    if (!OC)
+      return Error(OpcodeLoc, "opcode value must be a constant");
+    const int64_t Opcode = OC->getValue();
+    if (Opcode & ~0xff)
+      return Error(OpcodeLoc, "invalid opcode");
+    Opcodes.push_back(uint8_t(Opcode));
+    return false;
+  };
+
+  // Must have at least 1 element
+  SMLoc OpcodeLoc = getLexer().getLoc();
+  if (parseOptionalToken(AsmToken::EndOfStatement))
+    return Error(OpcodeLoc, "expected opcode expression");
+  if (parseMany(parseOne))
+    return true;
+
+  getTargetStreamer().emitUnwindRaw(StackOffset, Opcodes);
+  return false;
+}
+
+/// parseDirectiveTLSDescSeq
+///   ::= .tlsdescseq tls-variable
+bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  if (getLexer().isNot(AsmToken::Identifier))
+    return TokError("expected variable after '.tlsdescseq' directive");
+
+  const MCSymbolRefExpr *SRE =
+    MCSymbolRefExpr::create(Parser.getTok().getIdentifier(),
+                            MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext());
+  Lex();
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.tlsdescseq' directive"))
+    return true;
+
+  getTargetStreamer().AnnotateTLSDescriptorSequence(SRE);
+  return false;
+}
+
+/// parseDirectiveMovSP
+///  ::= .movsp reg [, #offset]
+bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (!UC.hasFnStart())
+    return Error(L, ".fnstart must precede .movsp directives");
+  if (UC.getFPReg() != ARM::SP)
+    return Error(L, "unexpected .movsp directive");
+
+  SMLoc SPRegLoc = Parser.getTok().getLoc();
+  int SPReg = tryParseRegister();
+  if (SPReg == -1)
+    return Error(SPRegLoc, "register expected");
+  if (SPReg == ARM::SP || SPReg == ARM::PC)
+    return Error(SPRegLoc, "sp and pc are not permitted in .movsp directive");
+
+  int64_t Offset = 0;
+  if (Parser.parseOptionalToken(AsmToken::Comma)) {
+    if (Parser.parseToken(AsmToken::Hash, "expected #constant"))
+      return true;
+
+    const MCExpr *OffsetExpr;
+    SMLoc OffsetLoc = Parser.getTok().getLoc();
+
+    if (Parser.parseExpression(OffsetExpr))
+      return Error(OffsetLoc, "malformed offset expression");
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+    if (!CE)
+      return Error(OffsetLoc, "offset must be an immediate constant");
+
+    Offset = CE->getValue();
+  }
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.movsp' directive"))
+    return true;
+
+  getTargetStreamer().emitMovSP(SPReg, Offset);
+  UC.saveFPReg(SPReg);
+
+  return false;
+}
+
+/// parseDirectiveObjectArch
+///   ::= .object_arch name
+bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::Identifier))
+    return Error(getLexer().getLoc(), "unexpected token");
+
+  StringRef Arch = Parser.getTok().getString();
+  SMLoc ArchLoc = Parser.getTok().getLoc();
+  Lex();
+
+  unsigned ID = ARM::parseArch(Arch);
+
+  if (ID == ARM::AK_INVALID)
+    return Error(ArchLoc, "unknown architecture '" + Arch + "'");
+  if (parseToken(AsmToken::EndOfStatement))
+    return true;
+
+  getTargetStreamer().emitObjectArch(ID);
+  return false;
+}
+
+/// parseDirectiveAlign
+///   ::= .align
+bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
+  // NOTE: if this is not the end of the statement, fall back to the target
+  // agnostic handling for this directive which will correctly handle this.
+  if (parseOptionalToken(AsmToken::EndOfStatement)) {
+    // '.align' is target specifically handled to mean 2**2 byte alignment.
+    const MCSection *Section = getStreamer().getCurrentSectionOnly();
+    assert(Section && "must have section to emit alignment");
+    if (Section->UseCodeAlign())
+      getStreamer().EmitCodeAlignment(4, 0);
+    else
+      getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+    return false;
+  }
+  return true;
+}
+
+/// parseDirectiveThumbSet
+///  ::= .thumb_set name, value
+bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  StringRef Name;
+  if (check(Parser.parseIdentifier(Name),
+            "expected identifier after '.thumb_set'") ||
+      parseToken(AsmToken::Comma, "expected comma after name '" + Name + "'"))
+    return true;
+
+  MCSymbol *Sym;
+  const MCExpr *Value;
+  if (MCParserUtils::parseAssignmentExpression(Name, /* allow_redef */ true,
+                                               Parser, Sym, Value))
+    return true;
+
+  getTargetStreamer().emitThumbSet(Sym, Value);
+  return false;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeARMAsmParser() {
+  RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget());
+  RegisterMCAsmParser<ARMAsmParser> Y(getTheARMBETarget());
+  RegisterMCAsmParser<ARMAsmParser> A(getTheThumbLETarget());
+  RegisterMCAsmParser<ARMAsmParser> B(getTheThumbBETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "ARMGenAsmMatcher.inc"
+
+// FIXME: This structure should be moved inside ARMTargetParser
+// when we start to table-generate them, and we can use the ARM
+// flags below, that were generated by table-gen.
+static const struct {
+  const unsigned Kind;
+  const uint64_t ArchCheck;
+  const FeatureBitset Features;
+} Extensions[] = {
+  { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },
+  { ARM::AEK_CRYPTO,  Feature_HasV8,
+    {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+  { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
+  { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
+    {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} },
+  { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
+  { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+  { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
+  // FIXME: Only available in A-class, isel not predicated
+  { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
+  { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} },
+  { ARM::AEK_RAS, Feature_HasV8, {ARM::FeatureRAS} },
+  // FIXME: Unsupported extensions.
+  { ARM::AEK_OS, Feature_None, {} },
+  { ARM::AEK_IWMMXT, Feature_None, {} },
+  { ARM::AEK_IWMMXT2, Feature_None, {} },
+  { ARM::AEK_MAVERICK, Feature_None, {} },
+  { ARM::AEK_XSCALE, Feature_None, {} },
+};
+
+/// parseDirectiveArchExtension
+///   ::= .arch_extension [no]feature
+bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  if (getLexer().isNot(AsmToken::Identifier))
+    return Error(getLexer().getLoc(), "expected architecture extension name");
+
+  StringRef Name = Parser.getTok().getString();
+  SMLoc ExtLoc = Parser.getTok().getLoc();
+  Lex();
+
+  if (parseToken(AsmToken::EndOfStatement,
+                 "unexpected token in '.arch_extension' directive"))
+    return true;
+
+  bool EnableFeature = true;
+  if (Name.startswith_lower("no")) {
+    EnableFeature = false;
+    Name = Name.substr(2);
+  }
+  unsigned FeatureKind = ARM::parseArchExt(Name);
+  if (FeatureKind == ARM::AEK_INVALID)
+    return Error(ExtLoc, "unknown architectural extension: " + Name);
+
+  for (const auto &Extension : Extensions) {
+    if (Extension.Kind != FeatureKind)
+      continue;
+
+    if (Extension.Features.none())
+      return Error(ExtLoc, "unsupported architectural extension: " + Name);
+
+    if ((getAvailableFeatures() & Extension.ArchCheck) != Extension.ArchCheck)
+      return Error(ExtLoc, "architectural extension '" + Name +
+                               "' is not "
+                               "allowed for the current base architecture");
+
+    MCSubtargetInfo &STI = copySTI();
+    FeatureBitset ToggleFeatures = EnableFeature
+      ? (~STI.getFeatureBits() & Extension.Features)
+      : ( STI.getFeatureBits() & Extension.Features);
+
+    uint64_t Features =
+        ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+    setAvailableFeatures(Features);
+    return false;
+  }
+
+  return Error(ExtLoc, "unknown architectural extension: " + Name);
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  ARMOperand &Op = static_cast<ARMOperand &>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  switch (Kind) {
+  default: break;
+  case MCK__35_0:
+    if (Op.isImm())
+      if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()))
+        if (CE->getValue() == 0)
+          return Match_Success;
+    break;
+  case MCK_ModImm:
+    if (Op.isImm()) {
+      const MCExpr *SOExpr = Op.getImm();
+      int64_t Value;
+      if (!SOExpr->evaluateAsAbsolute(Value))
+        return Match_Success;
+      assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
+             "expression value must be representable in 32 bits");
+    }
+    break;
+  case MCK_rGPR:
+    if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP)
+      return Match_Success;
+    break;
+  case MCK_GPRPair:
+    if (Op.isReg() &&
+        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
+      return Match_Success;
+    break;
+  }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
new file mode 100644
index 000000000000..ac3d8c780af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -0,0 +1,5313 @@
+//===-- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+  // Handles the condition code status of instructions in IT blocks
+  class ITStatus
+  {
+    public:
+      // Returns the condition code for instruction in IT block
+      unsigned getITCC() {
+        unsigned CC = ARMCC::AL;
+        if (instrInITBlock())
+          CC = ITStates.back();
+        return CC;
+      }
+
+      // Advances the IT block state to the next T or E
+      void advanceITState() {
+        ITStates.pop_back();
+      }
+
+      // Returns true if the current instruction is in an IT block
+      bool instrInITBlock() {
+        return !ITStates.empty();
+      }
+
+      // Returns true if current instruction is the last instruction in an IT block
+      bool instrLastInITBlock() {
+        return ITStates.size() == 1;
+      }
+
+      // Called when decoding an IT instruction. Sets the IT state for the following
+      // instructions that for the IT block. Firstcond and Mask correspond to the
+      // fields in the IT instruction encoding.
+      void setITState(char Firstcond, char Mask) {
+        // (3 - the number of trailing zeros) is the number of then / else.
+        unsigned CondBit0 = Firstcond & 1;
+        unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
+        unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
+        assert(NumTZ <= 3 && "Invalid IT mask!");
+        // push condition codes onto the stack the correct order for the pops
+        for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+          bool T = ((Mask >> Pos) & 1) == CondBit0;
+          if (T)
+            ITStates.push_back(CCBits);
+          else
+            ITStates.push_back(CCBits ^ 1);
+        }
+        ITStates.push_back(CCBits);
+      }
+
+    private:
+      std::vector<unsigned char> ITStates;
+  };
+}
+
+namespace {
+/// ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ARMDisassembler() override {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+/// Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
+  }
+
+  ~ThumbDisassembler() override {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+private:
+  mutable ITStatus ITBlock;
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                               unsigned RegNo, uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Adddress,
+                                                  const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder);
+#include "ARMGenDisassemblerTables.inc"
+
+static MCDisassembler *createARMDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new ARMDisassembler(STI, Ctx);
+}
+
+static MCDisassembler *createThumbDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new ThumbDisassembler(STI, Ctx);
+}
+
+// Post-decoding checks
+static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
+                                            uint64_t Address, raw_ostream &OS,
+                                            raw_ostream &CS,
+                                            uint32_t Insn,
+                                            DecodeStatus Result)
+{
+  switch (MI.getOpcode()) {
+    case ARM::HVC: {
+      // HVC is undefined if condition = 0xf otherwise upredictable
+      // if condition != 0xe
+      uint32_t Cond = (Insn >> 28) & 0xF;
+      if (Cond == 0xF)
+        return MCDisassembler::Fail;
+      if (Cond != 0xE)
+        return MCDisassembler::SoftFail;
+      return Result;
+    }
+    default: return Result;
+  }
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  assert(!STI.getFeatureBits()[ARM::ModeThumb] &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
+         "mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t Insn =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
+  }
+
+  // VFP and NEON instructions, similarly, are shared between ARM
+  // and Thumb modes.
+  Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return Result;
+  }
+
+  Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.  The immediate Value has had any PC
+/// adjustment made by the caller.  If the instruction is a branch instruction
+/// then isBranch is true, else false.  If the getOpInfo() function was set as
+/// part of the setupForSymbolicDisassembly() call then that function is called
+/// to get any symbolic information at the Address for this instruction.  If
+/// that returns non-zero then the symbolic information it returns is used to
+/// create an MCExpr and that is added as an operand to the MCInst.  If
+/// getOpInfo() returns zero and isBranch is true then a symbol look up for
+/// Value is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with Value is created.  This function returns true if it adds an
+/// operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+                                     bool isBranch, uint64_t InstSize,
+                                     MCInst &MI, const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  // FIXME: Does it make sense for value to be negative?
+  return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch,
+                                       /* Offset */ 0, InstSize);
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the Pc.
+/// These can often be values in a literal pool near the Address of the
+/// instruction.  The Address of the instruction and its immediate Value are
+/// used as a possible literal pool entry.  The SymbolLookUp call back will
+/// return the name of a symbol referenced by the literal pool's entry if
+/// the referenced address is that of a symbol.  Or it will return a pointer to
+/// a literal 'C' string if the referenced address of the literal pool's entry
+/// is an address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+// Thumb1 instructions don't have explicit S bits.  Rather, they
+// implicitly set CPSR.  Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand.  We need to fix
+// that as a post-pass.
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) {
+      if (i > 0 && OpInfo[i-1].isPredicate()) continue;
+      MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+      return;
+    }
+  }
+
+  MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context.  We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  const FeatureBitset &FeatureBits = getSubtargetInfo().getFeatureBits();
+
+  // A few instructions actually have predicates encoded in them.  Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+    case ARM::tBcc:
+    case ARM::t2Bcc:
+    case ARM::tCBZ:
+    case ARM::tCBNZ:
+    case ARM::tCPS:
+    case ARM::t2CPS3p:
+    case ARM::t2CPS2p:
+    case ARM::t2CPS1p:
+    case ARM::tMOVSr:
+    case ARM::tSETEND:
+      // Some instructions (mostly conditional branches) are not
+      // allowed in IT blocks.
+      if (ITBlock.instrInITBlock())
+        S = SoftFail;
+      else
+        return Success;
+      break;
+    case ARM::t2HINT:
+      if (MI.getOperand(0).getImm() == 0x10 && (FeatureBits[ARM::FeatureRAS]) != 0)
+        S = SoftFail;
+      break;
+    case ARM::tB:
+    case ARM::t2B:
+    case ARM::t2TBB:
+    case ARM::t2TBH:
+      // Some instructions (mostly unconditional branches) can
+      // only appears at the end of, or outside of, an IT.
+      if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+        S = SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  // If we're in an IT block, base the predicate on that.  Otherwise,
+  // assume a predicate of AL.
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (CC == 0xF) 
+    CC = ARMCC::AL;
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isPredicate()) {
+      I = MI.insert(I, MCOperand::createImm(CC));
+      ++I;
+      if (CC == ARMCC::AL)
+        MI.insert(I, MCOperand::createReg(0));
+      else
+        MI.insert(I, MCOperand::createReg(ARM::CPSR));
+      return S;
+    }
+  }
+
+  I = MI.insert(I, MCOperand::createImm(CC));
+  ++I;
+  if (CC == ARMCC::AL)
+    MI.insert(I, MCOperand::createReg(0));
+  else
+    MI.insert(I, MCOperand::createReg(ARM::CPSR));
+
+  return S;
+}
+
+// Thumb VFP instructions are a special case.  Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand.  We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  MCInst::iterator I = MI.begin();
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (OpInfo[i].isPredicate() ) {
+      I->setImm(CC);
+      ++I;
+      if (CC == ARMCC::AL)
+        I->setReg(0);
+      else
+        I->setReg(ARM::CPSR);
+      return;
+    }
+  }
+}
+
+DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               ArrayRef<uint8_t> Bytes,
+                                               uint64_t Address,
+                                               raw_ostream &OS,
+                                               raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  assert(STI.getFeatureBits()[ARM::ModeThumb] &&
+         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+  // We want to read exactly 2 bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0];
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 2;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
+  }
+
+  Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
+                             STI);
+  if (Result) {
+    Size = 2;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(Result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 2;
+
+    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
+    // the Thumb predicate.
+    if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+      Result = MCDisassembler::SoftFail;
+
+    Check(Result, AddThumbPredicate(MI));
+
+    // If we find an IT instruction, we need to parse its condition
+    // code and mask operands so that we can apply them correctly
+    // to the subsequent instructions.
+    if (MI.getOpcode() == ARM::t2IT) {
+
+      unsigned Firstcond = MI.getOperand(0).getImm();
+      unsigned Mask = MI.getOperand(1).getImm();
+      ITBlock.setITState(Firstcond, Mask);
+    }
+
+    return Result;
+  }
+
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint32_t Insn32 =
+      (Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16);
+  Result =
+      decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(Result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return Result;
+  }
+
+  Result =
+      decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
+  }
+
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+    Result =
+        decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      UpdateThumbVFPPredicate(MI);
+      return Result;
+    }
+  }
+
+  Result =
+      decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
+    Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+  }
+
+  if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
+    uint32_t NEONLdStInsn = Insn32;
+    NEONLdStInsn &= 0xF0FFFFFF;
+    NEONLdStInsn |= 0x04000000;
+    Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+  }
+
+  if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
+    uint32_t NEONDataInsn = Insn32;
+    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+    Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
+    }
+
+    uint32_t NEONCryptoInsn = Insn32;
+    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+    Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+
+    uint32_t NEONv8Insn = Insn32;
+    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+    Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+
+extern "C" void LLVMInitializeARMDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheThumbLETarget(),
+                                         createThumbDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheThumbBETarget(),
+                                         createThumbDisassembler);
+}
+
+static const uint16_t GPRDecoderTable[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+  ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+  ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+  ARM::R12, ARM::SP, ARM::LR, ARM::PC
+};
+
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Register = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo == 15) 
+    S = MCDisassembler::SoftFail;
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo == 15)
+  {
+    Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV));
+    return MCDisassembler::Success;
+  }
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
+}
+
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t GPRPairDecoderTable[] = {
+  ARM::R0_R1, ARM::R2_R3,   ARM::R4_R5,  ARM::R6_R7,
+  ARM::R8_R9, ARM::R10_R11, ARM::R12_SP
+};
+
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo > 13)
+    return MCDisassembler::Fail;
+
+  if ((RegNo & 1) || RegNo == 0xe)
+     S = MCDisassembler::SoftFail;
+
+  unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+  Inst.addOperand(MCOperand::createReg(RegisterPair));
+  return S;
+}
+
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+    case 0:
+      Register = ARM::R0;
+      break;
+    case 1:
+      Register = ARM::R1;
+      break;
+    case 2:
+      Register = ARM::R2;
+      break;
+    case 3:
+      Register = ARM::R3;
+      break;
+    case 9:
+      Register = ARM::R9;
+      break;
+    case 12:
+      Register = ARM::R12;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15)
+    S = MCDisassembler::SoftFail;
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
+}
+
+static const uint16_t SPRDecoderTable[] = {
+     ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+     ARM::S8,  ARM::S9, ARM::S10, ARM::S11,
+    ARM::S12, ARM::S13, ARM::S14, ARM::S15,
+    ARM::S16, ARM::S17, ARM::S18, ARM::S19,
+    ARM::S20, ARM::S21, ARM::S22, ARM::S23,
+    ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+    ARM::S28, ARM::S29, ARM::S30, ARM::S31
+};
+
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = SPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPRDecoderTable[] = {
+     ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+     ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+     ARM::D8,  ARM::D9, ARM::D10, ARM::D11,
+    ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+    ARM::D28, ARM::D29, ARM::D30, ARM::D31
+};
+
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  bool hasD16 = featureBits[ARM::FeatureD16];
+
+  if (RegNo > 31 || (hasD16 && RegNo > 15))
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t QPRDecoderTable[] = {
+     ARM::Q0,  ARM::Q1,  ARM::Q2,  ARM::Q3,
+     ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7,
+     ARM::Q8,  ARM::Q9, ARM::Q10, ARM::Q11,
+    ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+};
+
+
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31 || (RegNo & 1) != 0)
+    return MCDisassembler::Fail;
+  RegNo >>= 1;
+
+  unsigned Register = QPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairDecoderTable[] = {
+  ARM::Q0,  ARM::D1_D2,   ARM::Q1,  ARM::D3_D4,   ARM::Q2,  ARM::D5_D6,
+  ARM::Q3,  ARM::D7_D8,   ARM::Q4,  ARM::D9_D10,  ARM::Q5,  ARM::D11_D12,
+  ARM::Q6,  ARM::D13_D14, ARM::Q7,  ARM::D15_D16, ARM::Q8,  ARM::D17_D18,
+  ARM::Q9,  ARM::D19_D20, ARM::Q10, ARM::D21_D22, ARM::Q11, ARM::D23_D24,
+  ARM::Q12, ARM::D25_D26, ARM::Q13, ARM::D27_D28, ARM::Q14, ARM::D29_D30,
+  ARM::Q15
+};
+
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 30)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairSpacedDecoderTable[] = {
+  ARM::D0_D2,   ARM::D1_D3,   ARM::D2_D4,   ARM::D3_D5,
+  ARM::D4_D6,   ARM::D5_D7,   ARM::D6_D8,   ARM::D7_D9,
+  ARM::D8_D10,  ARM::D9_D11,  ARM::D10_D12, ARM::D11_D13,
+  ARM::D12_D14, ARM::D13_D15, ARM::D14_D16, ARM::D15_D17,
+  ARM::D16_D18, ARM::D17_D19, ARM::D18_D20, ARM::D19_D21,
+  ARM::D20_D22, ARM::D21_D23, ARM::D22_D24, ARM::D23_D25,
+  ARM::D24_D26, ARM::D25_D27, ARM::D26_D28, ARM::D27_D29,
+  ARM::D28_D30, ARM::D29_D31
+};
+
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo > 29)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairSpacedDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val == 0xF) return MCDisassembler::Fail;
+  // AL predicate is not allowed on Thumb1 branches.
+  if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Val));
+  if (Val == ARMCC::AL) {
+    Inst.addOperand(MCOperand::createReg(0));
+  } else
+    Inst.addOperand(MCOperand::createReg(ARM::CPSR));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val)
+    Inst.addOperand(MCOperand::createReg(ARM::CPSR));
+  else
+    Inst.addOperand(MCOperand::createReg(0));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+
+  // Register-immediate
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  if (Shift == ARM_AM::ror && imm == 0)
+    Shift = ARM_AM::rrx;
+
+  unsigned Op = Shift | (imm << 3);
+  Inst.addOperand(MCOperand::createImm(Op));
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned Rs = fieldFromInstruction(Val, 8, 4);
+
+  // Register-register
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(Shift));
+
+  return S;
+}
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  bool NeedDisjointWriteback = false;
+  unsigned WritebackReg = 0;
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    NeedDisjointWriteback = true;
+    WritebackReg = Inst.getOperand(0).getReg();
+    break;
+  }
+
+  // Empty register lists are not allowed.
+  if (Val == 0) return MCDisassembler::Fail;
+  for (unsigned i = 0; i < 16; ++i) {
+    if (Val & (1 << i)) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+        return MCDisassembler::Fail;
+      // Writeback not allowed if Rn is in the target list.
+      if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
+        Check(S, MCDisassembler::SoftFail);
+    }
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 0, 8);
+
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 1, 7);
+
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || regs > 16 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    regs = std::min(16u, regs);
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  // This operand encodes a mask of contiguous zeros between a specified MSB
+  // and LSB.  To decode it, we create the mask of all bits MSB-and-lower,
+  // the mask of all bits LSB-and-lower, and then xor them to create
+  // the mask of that's all ones on [msb, lsb].  Finally we not it to
+  // create the final mask.
+  unsigned msb = fieldFromInstruction(Val, 5, 5);
+  unsigned lsb = fieldFromInstruction(Val, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+  if (lsb > msb) {
+    Check(S, MCDisassembler::SoftFail);
+    // The check above will cause the warning for the "potentially undefined
+    // instruction encoding" but we can't build a bad MCOperand value here
+    // with a lsb > msb or else printing the MCInst will cause a crash.
+    lsb = msb;
+  }
+
+  uint32_t msb_mask = 0xFFFFFFFF;
+  if (msb != 31) msb_mask = (1U << (msb+1)) - 1;
+  uint32_t lsb_mask = (1U << lsb) - 1;
+
+  Inst.addOperand(MCOperand::createImm(~(msb_mask ^ lsb_mask)));
+  return S;
+}
+
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned CRd = fieldFromInstruction(Insn, 12, 4);
+  unsigned coproc = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDC_OPTION:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2LDCL_POST:
+    case ARM::t2LDCL_OPTION:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STC_POST:
+    case ARM::t2STC_OPTION:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STCL_PRE:
+    case ARM::t2STCL_POST:
+    case ARM::t2STCL_OPTION:
+      if (coproc == 0xA || coproc == 0xB)
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+  if (featureBits[ARM::HasV8Ops] && (coproc != 14))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(coproc));
+  Inst.addOperand(MCOperand::createImm(CRd));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDC2_OFFSET:
+    case ARM::t2LDC2L_OFFSET:
+    case ARM::t2LDC2_PRE:
+    case ARM::t2LDC2L_PRE:
+    case ARM::t2STC2_OFFSET:
+    case ARM::t2STC2L_OFFSET:
+    case ARM::t2STC2_PRE:
+    case ARM::t2STC2L_PRE:
+    case ARM::LDC2_OFFSET:
+    case ARM::LDC2L_OFFSET:
+    case ARM::LDC2_PRE:
+    case ARM::LDC2L_PRE:
+    case ARM::STC2_OFFSET:
+    case ARM::STC2L_OFFSET:
+    case ARM::STC2_PRE:
+    case ARM::STC2L_PRE:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STCL_PRE:
+    case ARM::LDC_OFFSET:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDCL_PRE:
+    case ARM::STC_OFFSET:
+    case ARM::STCL_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STCL_PRE:
+      imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+      Inst.addOperand(MCOperand::createImm(imm));
+      break;
+    case ARM::t2LDC2_POST:
+    case ARM::t2LDC2L_POST:
+    case ARM::t2STC2_POST:
+    case ARM::t2STC2L_POST:
+    case ARM::LDC2_POST:
+    case ARM::LDC2L_POST:
+    case ARM::STC2_POST:
+    case ARM::STC2L_POST:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDCL_POST:
+    case ARM::t2STC_POST:
+    case ARM::t2STCL_POST:
+    case ARM::LDC_POST:
+    case ARM::LDCL_POST:
+    case ARM::STC_POST:
+    case ARM::STCL_POST:
+      imm |= U << 8;
+      LLVM_FALLTHROUGH;
+    default:
+      // The 'option' variant doesn't encode 'U' in the immediate since
+      // the immediate is unsigned [0,255].
+      Inst.addOperand(MCOperand::createImm(imm));
+      break;
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+      if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reg = fieldFromInstruction(Insn, 25, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+
+  // On stores, the writeback operand precedes Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::STR_POST_IMM:
+    case ARM::STR_POST_REG:
+    case ARM::STRB_POST_IMM:
+    case ARM::STRB_POST_REG:
+    case ARM::STRT_POST_REG:
+    case ARM::STRT_POST_IMM:
+    case ARM::STRBT_POST_REG:
+    case ARM::STRBT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // On loads, the writeback operand comes after Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::LDR_POST_IMM:
+    case ARM::LDR_POST_REG:
+    case ARM::LDRB_POST_IMM:
+    case ARM::LDRB_POST_REG:
+    case ARM::LDRBT_POST_REG:
+    case ARM::LDRBT_POST_IMM:
+    case ARM::LDRT_POST_REG:
+    case ARM::LDRT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::AddrOpc Op = ARM_AM::add;
+  if (!fieldFromInstruction(Insn, 23, 1))
+    Op = ARM_AM::sub;
+
+  bool writeback = (P == 0) || (W == 1);
+  unsigned idx_mode = 0;
+  if (P && writeback)
+    idx_mode = ARMII::IndexModePre;
+  else if (!P && writeback)
+    idx_mode = ARMII::IndexModePost;
+
+  if (writeback && (Rn == 15 || Rn == Rt))
+    S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+  if (reg) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+    switch( fieldFromInstruction(Insn, 5, 2)) {
+      case 0:
+        Opc = ARM_AM::lsl;
+        break;
+      case 1:
+        Opc = ARM_AM::lsr;
+        break;
+      case 2:
+        Opc = ARM_AM::asr;
+        break;
+      case 3:
+        Opc = ARM_AM::ror;
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+    unsigned amt = fieldFromInstruction(Insn, 7, 5);
+    if (Opc == ARM_AM::ror && amt == 0)
+      Opc = ARM_AM::rrx;
+    unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+    Inst.addOperand(MCOperand::createImm(imm));
+  } else {
+    Inst.addOperand(MCOperand::createReg(0));
+    unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+    Inst.addOperand(MCOperand::createImm(tmp));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned Rm = fieldFromInstruction(Val,  0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+  unsigned U = fieldFromInstruction(Val, 12, 1);
+
+  ARM_AM::ShiftOpc ShOp = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      ShOp = ARM_AM::lsl;
+      break;
+    case 1:
+      ShOp = ARM_AM::lsr;
+      break;
+    case 2:
+      ShOp = ARM_AM::asr;
+      break;
+    case 3:
+      ShOp = ARM_AM::ror;
+      break;
+  }
+
+  if (ShOp == ARM_AM::ror && imm == 0)
+    ShOp = ARM_AM::rrx;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  unsigned shift;
+  if (U)
+    shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp);
+  else
+    shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp);
+  Inst.addOperand(MCOperand::createImm(shift));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned type = fieldFromInstruction(Insn, 22, 1);
+  unsigned imm = fieldFromInstruction(Insn, 8, 4);
+  unsigned U = ((~fieldFromInstruction(Insn, 23, 1)) & 1) << 8;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned Rt2 = Rt + 1;
+
+  bool writeback = (W == 1) | (P == 0);
+
+  // For {LD,ST}RD, Rt must be even, else undefined.
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (Rt & 0x1) S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+
+      if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (Rt2 == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && fieldFromInstruction(Insn, 8, 4))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (type && Rn == 15){
+        if (Rt2 == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && Rn == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (type && (Rt == 15 || (writeback && Rn == Rt)))
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt == 15 || Rm == 15))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) { // Writeback
+    if (P)
+      U |= ARMII::IndexModePre << 9;
+    else
+      U |= ARMII::IndexModePost << 9;
+
+    // On stores, the writeback operand precedes Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) {
+    // On loads, the writeback operand comes after Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+    case ARM::LDRHTr:
+    case ARM::LDRSBTr:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (type) {
+    Inst.addOperand(MCOperand::createReg(0));
+    Inst.addOperand(MCOperand::createImm(U | (imm << 4) | Rm));
+  } else {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::createImm(U));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned mode = fieldFromInstruction(Insn, 23, 2);
+
+  switch (mode) {
+    case 0:
+      mode = ARM_AM::da;
+      break;
+    case 1:
+      mode = ARM_AM::ia;
+      break;
+    case 2:
+      mode = ARM_AM::db;
+      break;
+    case 3:
+      mode = ARM_AM::ib;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(mode));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reglist = fieldFromInstruction(Insn, 0, 16);
+
+  if (pred == 0xF) {
+    // Ambiguous with RFE and SRS
+    switch (Inst.getOpcode()) {
+      case ARM::LDMDA:
+        Inst.setOpcode(ARM::RFEDA);
+        break;
+      case ARM::LDMDA_UPD:
+        Inst.setOpcode(ARM::RFEDA_UPD);
+        break;
+      case ARM::LDMDB:
+        Inst.setOpcode(ARM::RFEDB);
+        break;
+      case ARM::LDMDB_UPD:
+        Inst.setOpcode(ARM::RFEDB_UPD);
+        break;
+      case ARM::LDMIA:
+        Inst.setOpcode(ARM::RFEIA);
+        break;
+      case ARM::LDMIA_UPD:
+        Inst.setOpcode(ARM::RFEIA_UPD);
+        break;
+      case ARM::LDMIB:
+        Inst.setOpcode(ARM::RFEIB);
+        break;
+      case ARM::LDMIB_UPD:
+        Inst.setOpcode(ARM::RFEIB_UPD);
+        break;
+      case ARM::STMDA:
+        Inst.setOpcode(ARM::SRSDA);
+        break;
+      case ARM::STMDA_UPD:
+        Inst.setOpcode(ARM::SRSDA_UPD);
+        break;
+      case ARM::STMDB:
+        Inst.setOpcode(ARM::SRSDB);
+        break;
+      case ARM::STMDB_UPD:
+        Inst.setOpcode(ARM::SRSDB_UPD);
+        break;
+      case ARM::STMIA:
+        Inst.setOpcode(ARM::SRSIA);
+        break;
+      case ARM::STMIA_UPD:
+        Inst.setOpcode(ARM::SRSIA_UPD);
+        break;
+      case ARM::STMIB:
+        Inst.setOpcode(ARM::SRSIB);
+        break;
+      case ARM::STMIB_UPD:
+        Inst.setOpcode(ARM::SRSIB_UPD);
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+
+    // For stores (which become SRS's, the only operand is the mode.
+    if (fieldFromInstruction(Insn, 20, 1) == 0) {
+      // Check SRS encoding constraints
+      if (!(fieldFromInstruction(Insn, 22, 1) == 1 &&
+            fieldFromInstruction(Insn, 20, 1) == 0))
+        return MCDisassembler::Fail;
+
+      Inst.addOperand(
+          MCOperand::createImm(fieldFromInstruction(Insn, 0, 4)));
+      return S;
+    }
+
+    return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail; // Tied
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+// Check for UNPREDICTABLE predicated ESB instruction
+static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm8 = fieldFromInstruction(Insn, 0, 8);
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  Inst.addOperand(MCOperand::createImm(imm8));
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // ESB is unpredictable if pred != AL. Without the RAS extension, it is a NOP,
+  // so all predicates should be allowed.
+  if (imm8 == 0x10 && pred != 0xe && ((FeatureBits[ARM::FeatureRAS]) != 0))
+    S = MCDisassembler::SoftFail;
+
+  return S;
+}
+
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 18, 2);
+  unsigned M = fieldFromInstruction(Insn, 17, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 6, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // This decoder is called from multiple location that do not check
+  // the full encoding is valid before they do.
+  if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+      fieldFromInstruction(Insn, 16, 1) != 0 ||
+      fieldFromInstruction(Insn, 20, 8) != 0x10)
+    return MCDisassembler::Fail;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::CPS3p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    Inst.addOperand(MCOperand::createImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::CPS2p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::createImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::createImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 9, 2);
+  unsigned M = fieldFromInstruction(Insn, 8, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 5, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::t2CPS3p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    Inst.addOperand(MCOperand::createImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::t2CPS2p);
+    Inst.addOperand(MCOperand::createImm(imod));
+    Inst.addOperand(MCOperand::createImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::createImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> this is a HINT instruction
+    int imm = fieldFromInstruction(Insn, 0, 8);
+    // HINT are defined only for immediate in [0..4]
+    if(imm > 4) return MCDisassembler::Fail;
+    Inst.setOpcode(ARM::t2HINT);
+    Inst.addOperand(MCOperand::createImm(imm));
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 8) << 0);
+  imm |= (fieldFromInstruction(Insn, 12, 3) << 8);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+  imm |= (fieldFromInstruction(Insn, 26, 1) << 11);
+
+  if (Inst.getOpcode() == ARM::t2MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 12) << 0);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+
+  if (Inst.getOpcode() == ARM::MOVTi16)
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(imm));
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 8, 4);
+  unsigned Ra = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Pred == 0xF)
+    return DecodeSETPANInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Imm = fieldFromInstruction(Insn, 9, 1);
+
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits();
+
+  if (!FeatureBits[ARM::HasV8_1aOps] || 
+      !FeatureBits[ARM::HasV8Ops])
+    return MCDisassembler::Fail;
+
+  // Decoder can be called from DecodeTST, which does not check the full
+  // encoding is valid.
+  if (fieldFromInstruction(Insn, 20,12) != 0xf11 ||
+      fieldFromInstruction(Insn, 4,4) != 0)
+    return MCDisassembler::Fail;
+  if (fieldFromInstruction(Insn, 10,10) != 0 ||
+      fieldFromInstruction(Insn, 0,4) != 0)
+    S = MCDisassembler::SoftFail;
+
+  Inst.setOpcode(ARM::SETPAN);
+  Inst.addOperand(MCOperand::createImm(Imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned add = fieldFromInstruction(Val, 12, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!add) imm *= -1;
+  if (imm == 0 && !add) imm = INT32_MIN;
+  Inst.addOperand(MCOperand::createImm(imm));
+  if (Rn == 15)
+    tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder);
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  // U == 1 to add imm, 0 to subtract it.
+  unsigned U = fieldFromInstruction(Val, 8, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (U)
+    Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5Opc(ARM_AM::add, imm)));
+  else
+    Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm)));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode5FP16Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  // U == 1 to add imm, 0 to subtract it.
+  unsigned U = fieldFromInstruction(Val, 8, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (U)
+    Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::add, imm)));
+  else
+    Inst.addOperand(MCOperand::createImm(ARM_AM::getAM5FP16Opc(ARM_AM::sub, imm)));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                     uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  // Note the J1 and J2 values are from the encoded instruction.  So here
+  // change them to I1 and I2 values via as documented:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = fieldFromInstruction(Insn, 26, 1);
+  unsigned J1 = fieldFromInstruction(Insn, 13, 1);
+  unsigned J2 = fieldFromInstruction(Insn, 11, 1);
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
+  unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
+  unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
+  int imm32 = SignExtend32<25>(tmp << 1);
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(imm32));
+
+  return Status;
+}
+
+static DecodeStatus
+DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 24) << 2;
+
+  if (pred == 0xF) {
+    Inst.setOpcode(ARM::BLXi);
+    imm |= fieldFromInstruction(Insn, 24, 1) << 1;
+    if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                  true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm)));
+    return S;
+  }
+
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(SignExtend32<26>(imm)));
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned align = fieldFromInstruction(Val, 4, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!align)
+    Inst.addOperand(MCOperand::createImm(0));
+  else
+    Inst.addOperand(MCOperand::createImm(4 << align));
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // First output register
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8:
+  case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register:
+  case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register:
+  case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8:
+  case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register:
+  case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register:
+  case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2b16:
+  case ARM::VLD2b32:
+  case ARM::VLD2b8:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b16wb_register:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2b32wb_register:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+    default:
+      break;
+  }
+
+  // Third output register
+  switch(Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Writeback operand
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1d8wb_fixed:
+    case ARM::VLD1d16wb_fixed:
+    case ARM::VLD1d32wb_fixed:
+    case ARM::VLD1d64wb_fixed:
+    case ARM::VLD1d8wb_register:
+    case ARM::VLD1d16wb_register:
+    case ARM::VLD1d32wb_register:
+    case ARM::VLD1d64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1d8Twb_fixed:
+    case ARM::VLD1d8Twb_register:
+    case ARM::VLD1d16Twb_fixed:
+    case ARM::VLD1d16Twb_register:
+    case ARM::VLD1d32Twb_fixed:
+    case ARM::VLD1d32Twb_register:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD1d8Qwb_fixed:
+    case ARM::VLD1d8Qwb_register:
+    case ARM::VLD1d16Qwb_fixed:
+    case ARM::VLD1d16Qwb_register:
+    case ARM::VLD1d32Qwb_fixed:
+    case ARM::VLD1d32Qwb_register:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD2b8wb_fixed:
+    case ARM::VLD2b16wb_fixed:
+    case ARM::VLD2b32wb_fixed:
+    case ARM::VLD2b8wb_register:
+    case ARM::VLD2b16wb_register:
+    case ARM::VLD2b32wb_register:
+      Inst.addOperand(MCOperand::createImm(0));
+      break;
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+  default:
+    // The below have been updated to have explicit am6offset split
+    // between fixed and register offset. For those instructions not
+    // yet updated, we need to add an additional reg0 operand for the
+    // fixed variant.
+    //
+    // The fixed offset encodes as Rm == 0xd, so we check for that.
+    if (Rm == 0xd) {
+      Inst.addOperand(MCOperand::createReg(0));
+      break;
+    }
+    // Fall through to handle the register offset variant.
+    LLVM_FALLTHROUGH;
+  case ARM::VLD1d8wb_fixed:
+  case ARM::VLD1d16wb_fixed:
+  case ARM::VLD1d32wb_fixed:
+  case ARM::VLD1d64wb_fixed:
+  case ARM::VLD1d8Twb_fixed:
+  case ARM::VLD1d16Twb_fixed:
+  case ARM::VLD1d32Twb_fixed:
+  case ARM::VLD1d64Twb_fixed:
+  case ARM::VLD1d8Qwb_fixed:
+  case ARM::VLD1d16Qwb_fixed:
+  case ARM::VLD1d32Qwb_fixed:
+  case ARM::VLD1d64Qwb_fixed:
+  case ARM::VLD1d8wb_register:
+  case ARM::VLD1d16wb_register:
+  case ARM::VLD1d32wb_register:
+  case ARM::VLD1d64wb_register:
+  case ARM::VLD1q8wb_fixed:
+  case ARM::VLD1q16wb_fixed:
+  case ARM::VLD1q32wb_fixed:
+  case ARM::VLD1q64wb_fixed:
+  case ARM::VLD1q8wb_register:
+  case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_register:
+    // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+    // variant encodes Rm == 0xf. Anything else is a register offset post-
+    // increment and we need to add the register operand to the instruction.
+    if (Rm != 0xD && Rm != 0xF &&
+        !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2d8wb_fixed:
+  case ARM::VLD2d16wb_fixed:
+  case ARM::VLD2d32wb_fixed:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2q8wb_fixed:
+  case ARM::VLD2q16wb_fixed:
+  case ARM::VLD2q32wb_fixed:
+    break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 8 && align == 3) return MCDisassembler::Fail;
+  if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (align & 2) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // Writeback Operand
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1d8wb_register:
+    case ARM::VST1d16wb_register:
+    case ARM::VST1d32wb_register:
+    case ARM::VST1d64wb_register:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1q8wb_register:
+    case ARM::VST1q16wb_register:
+    case ARM::VST1q32wb_register:
+    case ARM::VST1q64wb_register:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Twb_register:
+    case ARM::VST1d16Twb_register:
+    case ARM::VST1d32Twb_register:
+    case ARM::VST1d64Twb_register:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST1d8Qwb_register:
+    case ARM::VST1d16Qwb_register:
+    case ARM::VST1d32Qwb_register:
+    case ARM::VST1d64Qwb_register:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2d8wb_register:
+    case ARM::VST2d16wb_register:
+    case ARM::VST2d32wb_register:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2q8wb_register:
+    case ARM::VST2q16wb_register:
+    case ARM::VST2q32wb_register:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+    case ARM::VST2b8wb_register:
+    case ARM::VST2b16wb_register:
+    case ARM::VST2b32wb_register:
+      if (Rm == 0xF)
+        return MCDisassembler::Fail;
+      Inst.addOperand(MCOperand::createImm(0));
+      break;
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+    default:
+      if (Rm == 0xD)
+        Inst.addOperand(MCOperand::createReg(0));
+      else if (Rm != 0xF) {
+        if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+          return MCDisassembler::Fail;
+      }
+      break;
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+      break;
+  }
+
+
+  // First input register
+  switch (Inst.getOpcode()) {
+  case ARM::VST1q16:
+  case ARM::VST1q32:
+  case ARM::VST1q64:
+  case ARM::VST1q8:
+  case ARM::VST1q16wb_fixed:
+  case ARM::VST1q16wb_register:
+  case ARM::VST1q32wb_fixed:
+  case ARM::VST1q32wb_register:
+  case ARM::VST1q64wb_fixed:
+  case ARM::VST1q64wb_register:
+  case ARM::VST1q8wb_fixed:
+  case ARM::VST1q8wb_register:
+  case ARM::VST2d16:
+  case ARM::VST2d32:
+  case ARM::VST2d8:
+  case ARM::VST2d16wb_fixed:
+  case ARM::VST2d16wb_register:
+  case ARM::VST2d32wb_fixed:
+  case ARM::VST2d32wb_register:
+  case ARM::VST2d8wb_fixed:
+  case ARM::VST2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VST2b16:
+  case ARM::VST2b32:
+  case ARM::VST2b8:
+  case ARM::VST2b16wb_fixed:
+  case ARM::VST2b16wb_register:
+  case ARM::VST2b32wb_fixed:
+  case ARM::VST2b32wb_register:
+  case ARM::VST2b8wb_fixed:
+  case ARM::VST2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Third input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+
+  if (size == 0 && align == 1)
+    return MCDisassembler::Fail;
+  align *= (1 << size);
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: case ARM::VLD1DUPq8:
+  case ARM::VLD1DUPq16wb_fixed: case ARM::VLD1DUPq16wb_register:
+  case ARM::VLD1DUPq32wb_fixed: case ARM::VLD1DUPq32wb_register:
+  case ARM::VLD1DUPq8wb_fixed: case ARM::VLD1DUPq8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+
+  // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+  // variant encodes Rm == 0xf. Anything else is a register offset post-
+  // increment and we need to add the register operand to the instruction.
+  if (Rm != 0xD && Rm != 0xF &&
+      !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = 1 << fieldFromInstruction(Insn, 6, 2);
+  align *= 2*size;
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: case ARM::VLD2DUPd8:
+  case ARM::VLD2DUPd16wb_fixed: case ARM::VLD2DUPd16wb_register:
+  case ARM::VLD2DUPd32wb_fixed: case ARM::VLD2DUPd32wb_register:
+  case ARM::VLD2DUPd8wb_fixed: case ARM::VLD2DUPd8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2DUPd16x2: case ARM::VLD2DUPd32x2: case ARM::VLD2DUPd8x2:
+  case ARM::VLD2DUPd16x2wb_fixed: case ARM::VLD2DUPd16x2wb_register:
+  case ARM::VLD2DUPd32x2wb_fixed: case ARM::VLD2DUPd32x2wb_register:
+  case ARM::VLD2DUPd8x2wb_fixed: case ARM::VLD2DUPd8x2wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+
+  if (Rm != 0xF)
+    Inst.addOperand(MCOperand::createImm(0));
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+
+  if (Rm != 0xD && Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(0));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::createReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+
+  if (size == 0x3) {
+    if (align == 0)
+      return MCDisassembler::Fail;
+    align = 16;
+  } else {
+    if (size == 2) {
+      align *= 8;
+    } else {
+      size = 1 << size;
+      align *= 4*size;
+    }
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::createReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned imm = fieldFromInstruction(Insn, 0, 4);
+  imm |= fieldFromInstruction(Insn, 16, 3) << 4;
+  imm |= fieldFromInstruction(Insn, 24, 1) << 7;
+  imm |= fieldFromInstruction(Insn, 8, 4) << 8;
+  imm |= fieldFromInstruction(Insn, 5, 1) << 12;
+  unsigned Q = fieldFromInstruction(Insn, 6, 1);
+
+  if (Q) {
+    if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  switch (Inst.getOpcode()) {
+    case ARM::VORRiv4i16:
+    case ARM::VORRiv2i32:
+    case ARM::VBICiv4i16:
+    case ARM::VBICiv2i32:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VORRiv8i16:
+    case ARM::VORRiv4i32:
+    case ARM::VBICiv8i16:
+    case ARM::VBICiv4i32:
+      if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 18, 2);
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(8 << size));
+
+  return S;
+}
+
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(8 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(32 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 7, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned op = fieldFromInstruction(Insn, 6, 1);
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (op) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail; // Writeback
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::VTBL2:
+  case ARM::VTBX2:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                                     uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned dst = fieldFromInstruction(Insn, 8, 3);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch(Inst.getOpcode()) {
+    default:
+      return MCDisassembler::Fail;
+    case ARM::tADR:
+      break; // tADR does not explicitly represent the PC as an operand.
+    case ARM::tADDrSPi:
+      Inst.addOperand(MCOperand::createReg(ARM::SP));
+      break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(imm));
+  return S;
+}
+
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(SignExtend32<12>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(SignExtend32<21>(Val)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(Val << 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned Rm = fieldFromInstruction(Val, 3, 3);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned imm = fieldFromInstruction(Val, 3, 5);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  unsigned imm = Val << 2;
+
+  Inst.addOperand(MCOperand::createImm(imm));
+  tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createReg(ARM::SP));
+  Inst.addOperand(MCOperand::createImm(Val));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 6, 4);
+  unsigned Rm = fieldFromInstruction(Val, 2, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 2);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRHs:
+  case ARM::t2STRBs:
+  case ARM::t2STRs:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  bool hasMP = featureBits[ARM::FeatureMP];
+  bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRBs:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHs:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHs:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRs:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2PLDs:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIs:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHs:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHs:
+      Inst.setOpcode(ARM::t2PLDWs);
+      break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2PLIs);
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2PLDs:
+      break;
+    case ARM::t2PLIs:
+      if (!hasV7Ops)
+        return MCDisassembler::Fail;
+      break;
+    case ARM::t2PLDWs:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
+      break;
+    default:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+  }
+
+  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 9, 1);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (U << 8);
+  imm |= (Rn << 9);
+  unsigned add = fieldFromInstruction(Insn, 9, 1);
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  bool hasMP = featureBits[ARM::FeatureMP];
+  bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi8:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBi8:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRHi8:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi8:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2PLDi8:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi8:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi8:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHi8:
+      if (!add)
+        Inst.setOpcode(ARM::t2PLDWi8);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2PLIi8);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi8:
+    break;
+  case ARM::t2PLIi8:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
+    break;
+  case ARM::t2PLDWi8:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
+      break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= (Rn << 13);
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  bool hasMP = featureBits[ARM::FeatureMP];
+  bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi12:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi12:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRBi12:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2PLDi12:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi12:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi12:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2PLDWi12);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2PLIi12);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi12:
+    break;
+  case ARM::t2PLIi12:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
+    break;
+  case ARM::t2PLDWi12:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
+      break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBT:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHT:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSBT:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSHT:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  int imm = fieldFromInstruction(Insn, 0, 12);
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  bool hasV7Ops = featureBits[ARM::HasV7Ops];
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBpci:
+      case ARM::t2LDRHpci:
+        Inst.setOpcode(ARM::t2PLDpci);
+        break;
+      case ARM::t2LDRSBpci:
+        Inst.setOpcode(ARM::t2PLIpci);
+        break;
+      case ARM::t2LDRSHpci:
+        return MCDisassembler::Fail;
+      default:
+        break;
+    }
+  }
+
+  switch(Inst.getOpcode()) {
+  case ARM::t2PLDpci:
+    break;
+  case ARM::t2PLIpci:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!U) {
+    // Special case for #-0.
+    if (imm == 0)
+      imm = INT32_MIN;
+    else
+      imm = -imm;
+  }
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  if (Val == 0)
+    Inst.addOperand(MCOperand::createImm(INT32_MIN));
+  else {
+    int imm = Val & 0xFF;
+
+    if (!(Val & 0x100)) imm *= -1;
+    Inst.addOperand(MCOperand::createImm(imm * 4));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 8, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                         uint64_t Address, const void *Decoder) {
+  int imm = Val & 0xFF;
+  if (Val == 0)
+    imm = INT32_MIN;
+  else if (!(Val & 0x100))
+    imm *= -1;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRT:
+  case ARM::t2STRBT:
+  case ARM::t2STRHT:
+  case ARM::t2STRi8:
+  case ARM::t2STRHi8:
+  case ARM::t2STRBi8:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
+  // Some instructions always use an additive offset.
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+    case ARM::t2LDRBT:
+    case ARM::t2LDRHT:
+    case ARM::t2LDRSBT:
+    case ARM::t2LDRSHT:
+    case ARM::t2STRT:
+    case ARM::t2STRBT:
+    case ARM::t2STRHT:
+      imm |= 0x100;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  addr |= fieldFromInstruction(Insn, 9, 1) << 8;
+  addr |= Rn << 9;
+  unsigned load = fieldFromInstruction(Insn, 20, 1);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDR_PRE:
+    case ARM::t2LDR_POST:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRB_PRE:
+    case ARM::t2LDRB_POST:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRH_PRE:
+    case ARM::t2LDRH_POST:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSB_PRE:
+    case ARM::t2LDRSB_POST:
+      if (Rt == 15)
+        Inst.setOpcode(ARM::t2PLIpci);
+      else
+        Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSH_PRE:
+    case ARM::t2LDRSH_POST:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned imm = fieldFromInstruction(Insn, 0, 7);
+
+  Inst.addOperand(MCOperand::createReg(ARM::SP));
+  Inst.addOperand(MCOperand::createReg(ARM::SP));
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Inst.getOpcode() == ARM::tADDrSP) {
+    unsigned Rdm = fieldFromInstruction(Insn, 0, 3);
+    Rdm |= fieldFromInstruction(Insn, 7, 1) << 3;
+
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::createReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else if (Inst.getOpcode() == ARM::tADDspr) {
+    unsigned Rm = fieldFromInstruction(Insn, 3, 4);
+
+    Inst.addOperand(MCOperand::createReg(ARM::SP));
+    Inst.addOperand(MCOperand::createReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                           uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 4, 1) | 0x2;
+  unsigned flags = fieldFromInstruction(Insn, 0, 3);
+
+  Inst.addOperand(MCOperand::createImm(imod));
+  Inst.addOperand(MCOperand::createImm(flags));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned add = fieldFromInstruction(Insn, 4, 1);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(add));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  // Val is passed in as S:J1:J2:imm10H:imm10L:'0'
+  // Note only one trailing zero not two.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with two trailing zeros as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address,
+                                (Address & ~2u) + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  if (Val == 0xA || Val == 0xB)
+    return MCDisassembler::Fail;
+
+  const FeatureBitset &featureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  if (featureBits[ARM::HasV8Ops] && !(Val == 14 || Val == 15))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
+                       uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == ARM::SP) S = MCDisassembler::SoftFail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus
+DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 22, 4);
+  if (pred == 0xE || pred == 0xF) {
+    unsigned opc = fieldFromInstruction(Insn, 4, 28);
+    switch (opc) {
+      default:
+        return MCDisassembler::Fail;
+      case 0xf3bf8f4:
+        Inst.setOpcode(ARM::t2DSB);
+        break;
+      case 0xf3bf8f5:
+        Inst.setOpcode(ARM::t2DMB);
+        break;
+      case 0xf3bf8f6:
+        Inst.setOpcode(ARM::t2ISB);
+        break;
+    }
+
+    unsigned imm = fieldFromInstruction(Insn, 0, 4);
+    return DecodeMemBarrierOption(Inst, imm, Address, Decoder);
+  }
+
+  unsigned brtarget = fieldFromInstruction(Insn, 0, 11) << 1;
+  brtarget |= fieldFromInstruction(Insn, 11, 1) << 19;
+  brtarget |= fieldFromInstruction(Insn, 13, 1) << 18;
+  brtarget |= fieldFromInstruction(Insn, 16, 6) << 12;
+  brtarget |= fieldFromInstruction(Insn, 26, 1) << 20;
+
+  if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+// Decode a shifted immediate operand.  These basically consist
+// of an 8-bit value, and a 4-bit directive that specifies either
+// a splat operation or a rotation.
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  unsigned ctrl = fieldFromInstruction(Val, 10, 2);
+  if (ctrl == 0) {
+    unsigned byte = fieldFromInstruction(Val, 8, 2);
+    unsigned imm = fieldFromInstruction(Val, 0, 8);
+    switch (byte) {
+      case 0:
+        Inst.addOperand(MCOperand::createImm(imm));
+        break;
+      case 1:
+        Inst.addOperand(MCOperand::createImm((imm << 16) | imm));
+        break;
+      case 2:
+        Inst.addOperand(MCOperand::createImm((imm << 24) | (imm << 8)));
+        break;
+      case 3:
+        Inst.addOperand(MCOperand::createImm((imm << 24) | (imm << 16) |
+                                             (imm << 8)  |  imm));
+        break;
+    }
+  } else {
+    unsigned unrot = fieldFromInstruction(Val, 0, 7) | 0x80;
+    unsigned rot = fieldFromInstruction(Val, 7, 5);
+    unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31));
+    Inst.addOperand(MCOperand::createImm(imm));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder){
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(SignExtend32<9>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder){
+  // Val is passed in as S:J1:J2:imm10:imm11
+  // Note no trailing zero after imm11.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  const FeatureBitset &FeatureBits =
+    ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
+
+  if (FeatureBits[ARM::FeatureMClass]) {
+    unsigned ValLow = Val & 0xff;
+
+    // Validate the SYSm value first.
+    switch (ValLow) {
+    case  0: // apsr
+    case  1: // iapsr
+    case  2: // eapsr
+    case  3: // xpsr
+    case  5: // ipsr
+    case  6: // epsr
+    case  7: // iepsr
+    case  8: // msp
+    case  9: // psp
+    case 16: // primask
+    case 20: // control
+      break;
+    case 17: // basepri
+    case 18: // basepri_max
+    case 19: // faultmask
+      if (!(FeatureBits[ARM::HasV7Ops]))
+        // Values basepri, basepri_max and faultmask are only valid for v7m.
+        return MCDisassembler::Fail;
+      break;
+    case 0x8a: // msplim_ns
+    case 0x8b: // psplim_ns
+    case 0x91: // basepri_ns
+    case 0x92: // basepri_max_ns
+    case 0x93: // faultmask_ns
+      if (!(FeatureBits[ARM::HasV8MMainlineOps]))
+        return MCDisassembler::Fail;
+      LLVM_FALLTHROUGH;
+    case 10:   // msplim
+    case 11:   // psplim
+    case 0x88: // msp_ns
+    case 0x89: // psp_ns
+    case 0x90: // primask_ns
+    case 0x94: // control_ns
+    case 0x98: // sp_ns
+      if (!(FeatureBits[ARM::Feature8MSecExt]))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    if (Inst.getOpcode() == ARM::t2MSR_M) {
+      unsigned Mask = fieldFromInstruction(Val, 10, 2);
+      if (!(FeatureBits[ARM::HasV7Ops])) {
+        // The ARMv6-M MSR bits {11-10} can be only 0b10, other values are
+        // unpredictable.
+        if (Mask != 2)
+          S = MCDisassembler::SoftFail;
+      }
+      else {
+        // The ARMv7-M architecture stores an additional 2-bit mask value in
+        // MSR bits {11-10}. The mask is used only with apsr, iapsr, eapsr and
+        // xpsr, it has to be 0b10 in other cases. Bit mask{1} indicates if
+        // the NZCVQ bits should be moved by the instruction. Bit mask{0}
+        // indicates the move for the GE{3:0} bits, the mask{0} bit can be set
+        // only if the processor includes the DSP extension.
+        if (Mask == 0 || (Mask != 2 && ValLow > 3) ||
+            (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1)))
+          S = MCDisassembler::SoftFail;
+      }
+    }
+  } else {
+    // A/R class
+    if (Val == 0)
+      return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::createImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+
+  unsigned R = fieldFromInstruction(Val, 5, 1);
+  unsigned SysM = fieldFromInstruction(Val, 0, 5);
+
+  // The table of encodings for these banked registers comes from B9.2.3 of the
+  // ARM ARM. There are patterns, but nothing regular enough to make this logic
+  // neater. So by fiat, these values are UNPREDICTABLE:
+  if (!R) {
+    if (SysM == 0x7 || SysM == 0xf || SysM == 0x18 || SysM == 0x19 ||
+        SysM == 0x1a || SysM == 0x1b)
+      return MCDisassembler::SoftFail;
+  } else {
+    if (SysM != 0xe && SysM != 0x10 && SysM != 0x12 && SysM != 0x14 &&
+        SysM != 0x16 && SysM != 0x1c && SysM != 0x1e)
+      return MCDisassembler::SoftFail;
+  }
+
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder){
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rn == 0xF || Rd == Rn || Rd == Rt || Rd == Rt+1)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+  if (Rm == 0xF) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0 :
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0: 
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::createReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned pred = fieldFromInstruction(Insn, 4, 4);
+  unsigned mask = fieldFromInstruction(Insn, 0, 4);
+
+  if (pred == 0xF) {
+    pred = 0xE;
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (mask == 0x0)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createImm(pred));
+  Inst.addOperand(MCOperand::createImm(mask));
+  return S;
+}
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(S, MCDisassembler::SoftFail);
+
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
+  unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
+  if (sign1 != sign2) return MCDisassembler::Fail;
+
+  unsigned Val = fieldFromInstruction(Insn, 0, 8);
+  Val |= fieldFromInstruction(Insn, 12, 3) << 8;
+  Val |= fieldFromInstruction(Insn, 26, 1) << 11;
+  Val |= sign1 << 12;
+  Inst.addOperand(MCOperand::createImm(SignExtend32<13>(Val)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  if (Val == 0x20) S = MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned Rt   = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2  = fieldFromInstruction(Insn, 0,  4);
+  unsigned Rn   = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Rt == Rn || Rn == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &featureBits =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if (!(imm & 0x38)) {
+    if (cmode == 0xF) {
+      if (op == 1) return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv2f32);
+    }
+    if (hasFullFP16) {
+      if (cmode == 0xE) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMOVv1i64);
+        } else {
+          Inst.setOpcode(ARM::VMOVv8i8);
+        }
+      }
+      if (cmode == 0xD) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv2i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv2i32);
+        }
+      }
+      if (cmode == 0xC) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv2i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv2i32);
+        }
+      }
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  const FeatureBitset &featureBits =
+      ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits();
+  bool hasFullFP16 = featureBits[ARM::FeatureFullFP16];
+
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // If the top 3 bits of imm are clear, this is a VMOV (immediate)
+  if (!(imm & 0x38)) {
+    if (cmode == 0xF) {
+      if (op == 1) return MCDisassembler::Fail;
+      Inst.setOpcode(ARM::VMOVv4f32);
+    }
+    if (hasFullFP16) {
+      if (cmode == 0xE) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMOVv2i64);
+        } else {
+          Inst.setOpcode(ARM::VMOVv16i8);
+        }
+      }
+      if (cmode == 0xD) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv4i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv4i32);
+        }
+      }
+      if (cmode == 0xC) {
+        if (op == 1) {
+          Inst.setOpcode(ARM::VMVNv4i32);
+        } else {
+          Inst.setOpcode(ARM::VMOVv4i32);
+        }
+      }
+    }
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 16, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  Rm |= (fieldFromInstruction(Val, 23, 1) << 4);
+  unsigned Cond = fieldFromInstruction(Val, 28, 4);
+
+  if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) 
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder) {
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned CRm = fieldFromInstruction(Val, 0, 4);
+  unsigned opc1 = fieldFromInstruction(Val, 4, 4);
+  unsigned cop = fieldFromInstruction(Val, 8, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Val, 16, 4);
+
+  if ((cop & ~0x1) == 0xa)
+    return MCDisassembler::Fail;
+
+  if (Rt == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  // We have to check if the instruction is MRRC2
+  // or MCRR2 when constructing the operands for
+  // Inst. Reason is because MRRC2 stores to two
+  // registers so it's tablegen desc has has two
+  // outputs whereas MCRR doesn't store to any
+  // registers so all of it's operands are listed
+  // as inputs, therefore the operand order for
+  // MRRC2 needs to be [Rt, Rt2, cop, opc1, CRm]
+  // and MCRR2 operand order is [cop, opc1, Rt, Rt2, CRm]
+
+  if (Inst.getOpcode() == ARM::MRRC2) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::createImm(cop));
+  Inst.addOperand(MCOperand::createImm(opc1));
+  if (Inst.getOpcode() == ARM::MCRR2) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::createImm(CRm));
+
+  return S;
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
new file mode 100644
index 000000000000..3667952d44c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,1669 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "ARMGenAsmWriter.inc"
+
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+  // lsr #32 and asr #32 exist, but should be encoded as a 0.
+  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
+  if (imm == 0)
+    return 32;
+  return imm;
+}
+
+/// Prints the shift value with an immediate value.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+                             unsigned ShImm, bool UseMarkup) {
+  if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+    return;
+  O << ", ";
+
+  assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+  O << getShiftOpcStr(ShOpc);
+
+  if (ShOpc != ARM_AM::rrx) {
+    O << " ";
+    if (UseMarkup)
+      O << "<imm:";
+    O << "#" << translateShiftImm(ShImm);
+    if (UseMarkup)
+      O << ">";
+  }
+}
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                               const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:") << getRegisterName(RegNo) << markup(">");
+}
+
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+
+  switch (Opcode) {
+
+  // Check for MOVs and print canonical forms, instead.
+  case ARM::MOVsr: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+    const MCOperand &MO3 = MI->getOperand(3);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
+    printSBitModifierOperand(MI, 6, STI, O);
+    printPredicateOperand(MI, 4, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    O << ", ";
+    printRegName(O, MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  case ARM::MOVsi: {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, STI, O);
+    printPredicateOperand(MI, 3, STI, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    O << ", " << markup("<imm:") << "#"
+      << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())) << markup(">");
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.123 PUSH
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print PUSH if there are at least two registers in the list.
+      O << '\t' << "push";
+      printPredicateOperand(MI, 2, STI, O);
+      if (Opcode == ARM::t2STMDB_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::STR_PRE_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(3).getImm() == -4) {
+      O << '\t' << "push";
+      printPredicateOperand(MI, 4, STI, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(1).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.122 POP
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print POP if there are at least two registers in the list.
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, STI, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, STI, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.355 VPUSH
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  // A8.6.354 VPOP
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, STI, O);
+      O << '\t';
+      printRegisterList(MI, 4, STI, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::tLDMIA: {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, STI, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback)
+      O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, STI, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  case ARM::LDREXD:
+  case ARM::STREXD:
+  case ARM::LDAEXD:
+  case ARM::STLEXD: {
+    const MCRegisterClass &MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::createReg(MRI.getMatchingSuperReg(
+          Reg, ARM::gsub_0, &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the rest operands into NewMI.
+      for (unsigned i = isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, STI, O);
+      return;
+    }
+    break;
+  }
+  }
+
+  if (!printAliasInstr(MI, STI, O))
+    printInstruction(MI, STI, O);
+
+  printAnnotation(O, Annot);
+}
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << '#' << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    const MCExpr *Expr = Op.getExpr();
+    switch (Expr->getKind()) {
+    case MCExpr::Binary:
+      O << '#';
+      Expr->print(O, &MAI);
+      break;
+    case MCExpr::Constant: {
+      // If a symbolic branch target was added as a constant expression then
+      // print that address in hex. And only print 32 unsigned bits for the
+      // address.
+      const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
+      int64_t TargetAddress;
+      if (!Constant->evaluateAsAbsolute(TargetAddress)) {
+        O << '#';
+        Expr->print(O, &MAI);
+      } else {
+        O << "0x";
+        O.write_hex(static_cast<uint32_t>(TargetAddress));
+      }
+      break;
+    }
+    default:
+      // FIXME: Should we always treat this as if it is a constant literal and
+      // prefix it with '#'?
+      Expr->print(O, &MAI);
+      break;
+    }
+  }
+}
+
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr()) {
+    MO1.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)MO1.getImm();
+  bool isSub = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << markup("<imm:") << "#-" << formatImm(-OffImm) << markup(">");
+  } else {
+    O << markup("<imm:") << "#" << formatImm(OffImm) << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+  const MCOperand &MO3 = MI->getOperand(OpNum + 2);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << ' ';
+  printRegName(O, MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #2
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0.
+      O << ", " << markup("<imm:") << "#"
+        << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
+        << ARM_AM::getAM2Offset(MO3.getImm()) << markup(">");
+    }
+    O << "]" << markup(">");
+    return;
+  }
+
+  O << ", ";
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()));
+  printRegName(O, MO2.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+                   ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup);
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+#ifndef NDEBUG
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+  unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+  assert(IdxMode != ARMII::IndexModePost && "Should be pre or offset index op");
+#endif
+
+  printAM2PreOrOffsetIndexOp(MI, Op, STI, O);
+}
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    O << markup("<imm:") << '#'
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) << ImmOffs
+      << markup(">");
+    return;
+  }
+
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()));
+  printRegName(O, MO1.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+                   ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #3
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O,
+                                                bool AlwaysPrintImm0) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+  const MCOperand &MO3 = MI->getOperand(Op + 2);
+
+  O << markup("<mem:") << '[';
+  printRegName(O, MO1.getReg());
+
+  if (MO2.getReg()) {
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()));
+    printRegName(O, MO2.getReg());
+    O << ']' << markup(">");
+    return;
+  }
+
+  // If the op is sub we have to print the immediate even if it is 0
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
+
+  if (AlwaysPrintImm0 || ImmOffs || (op == ARM_AM::sub)) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(op) << ImmOffs
+      << markup(">");
+  }
+  O << ']' << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  if (!MO1.isReg()) { //  For label symbolic references.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) !=
+             ARMII::IndexModePost &&
+         "unexpected idxmode");
+  printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0);
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (MO1.getReg()) {
+    O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()));
+    printRegName(O, MO1.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  O << markup("<imm:") << '#'
+    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                                             const MCSubtargetInfo &STI,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:") << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff)
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << (MO2.getImm() ? "" : "-");
+  printRegName(O, MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:") << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2)
+    << markup(">");
+}
+
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  ARM_AM::AMSubMode Mode =
+      ARM_AM::getAM4SubMode(MI->getOperand(OpNum).getImm());
+  O << ARM_AM::getAMSubModeStr(Mode);
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+  ARM_AM::AddrOpc Op = ARM_AM::getAM5Op(MO2.getImm());
+  if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+    O << ", " << markup("<imm:") << "#" << ARM_AM::getAddrOpcStr(Op)
+      << ImmOffs * 4 << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  unsigned ImmOffs = ARM_AM::getAM5FP16Offset(MO2.getImm());
+  unsigned Op = ARM_AM::getAM5FP16Op(MO2.getImm());
+  if (AlwaysPrintImm0 || ImmOffs || Op == ARM_AM::sub) {
+    O << ", "
+      << markup("<imm:")
+      << "#"
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM5FP16Op(MO2.getImm()))
+      << ImmOffs * 2
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ":" << (MO2.getImm() << 3);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.getReg() == 0)
+    O << "!";
+  else {
+    O << ", ";
+    printRegName(O, MO.getReg());
+  }
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = countTrailingZeros(v);
+  int32_t width = (32 - countLeadingZeros(v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << markup("<imm:") << '#' << lsb << markup(">") << ", " << markup("<imm:")
+    << '#' << width << markup(">");
+}
+
+void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_MB::MemBOptToString(val, STI.getFeatureBits()[ARM::HasV8Ops]);
+}
+
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_ISB::InstSyncBOptToString(val);
+}
+
+void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  unsigned ShiftOp = MI->getOperand(OpNum).getImm();
+  bool isASR = (ShiftOp & (1 << 5)) != 0;
+  unsigned Amt = ShiftOp & 0x1f;
+  if (isASR) {
+    O << ", asr " << markup("<imm:") << "#" << (Amt == 0 ? 32 : Amt)
+      << markup(">");
+  } else if (Amt) {
+    O << ", lsl " << markup("<imm:") << "#" << Amt << markup(">");
+  }
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+  O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  // A shift amount of 32 is encoded as 0.
+  if (Imm == 0)
+    Imm = 32;
+  assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  assert(std::is_sorted(MI->begin() + OpNum, MI->end(),
+                        [&](const MCOperand &LHS, const MCOperand &RHS) {
+                          return MRI.getEncodingValue(LHS.getReg()) <
+                                 MRI.getEncodingValue(RHS.getReg());
+                        }));
+
+  O << "{";
+  for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum)
+      O << ", ";
+    printRegName(O, MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0));
+  O << ", ";
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1));
+}
+
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  if (Op.getImm())
+    O << "be";
+  else
+    O << "le";
+}
+
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  O << ARM_PROC::IModToString(Op.getImm());
+}
+
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned IFlags = Op.getImm();
+  for (int i = 2; i >= 0; --i)
+    if (IFlags & (1 << i))
+      O << ARM_PROC::IFlagsToString(1 << i);
+
+  if (IFlags == 0)
+    O << "none";
+}
+
+void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned SpecRegRBit = Op.getImm() >> 4;
+  unsigned Mask = Op.getImm() & 0xf;
+  const FeatureBitset &FeatureBits = STI.getFeatureBits();
+
+  if (FeatureBits[ARM::FeatureMClass]) {
+    unsigned SYSm = Op.getImm();
+    unsigned Opcode = MI->getOpcode();
+
+    // For writes, handle extended mask bits if the DSP extension is present.
+    if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) {
+      switch (SYSm) {
+      case 0x400:
+        O << "apsr_g";
+        return;
+      case 0xc00:
+        O << "apsr_nzcvqg";
+        return;
+      case 0x401:
+        O << "iapsr_g";
+        return;
+      case 0xc01:
+        O << "iapsr_nzcvqg";
+        return;
+      case 0x402:
+        O << "eapsr_g";
+        return;
+      case 0xc02:
+        O << "eapsr_nzcvqg";
+        return;
+      case 0x403:
+        O << "xpsr_g";
+        return;
+      case 0xc03:
+        O << "xpsr_nzcvqg";
+        return;
+      }
+    }
+
+    // Handle the basic 8-bit mask.
+    SYSm &= 0xff;
+
+    if (Opcode == ARM::t2MSR_M && FeatureBits [ARM::HasV7Ops]) {
+      // ARMv7-M deprecates using MSR APSR without a _<bits> qualifier as an
+      // alias for MSR APSR_nzcvq.
+      switch (SYSm) {
+      case 0:
+        O << "apsr_nzcvq";
+        return;
+      case 1:
+        O << "iapsr_nzcvq";
+        return;
+      case 2:
+        O << "eapsr_nzcvq";
+        return;
+      case 3:
+        O << "xpsr_nzcvq";
+        return;
+      }
+    }
+
+    switch (SYSm) {
+    default:
+      llvm_unreachable("Unexpected mask value!");
+    case 0:
+      O << "apsr";
+      return;
+    case 1:
+      O << "iapsr";
+      return;
+    case 2:
+      O << "eapsr";
+      return;
+    case 3:
+      O << "xpsr";
+      return;
+    case 5:
+      O << "ipsr";
+      return;
+    case 6:
+      O << "epsr";
+      return;
+    case 7:
+      O << "iepsr";
+      return;
+    case 8:
+      O << "msp";
+      return;
+    case 9:
+      O << "psp";
+      return;
+    case 16:
+      O << "primask";
+      return;
+    case 17:
+      O << "basepri";
+      return;
+    case 18:
+      O << "basepri_max";
+      return;
+    case 19:
+      O << "faultmask";
+      return;
+    case 20:
+      O << "control";
+      return;
+    case 10:
+      O << "msplim";
+      return;
+    case 11:
+      O << "psplim";
+      return;
+    case 0x88:
+      O << "msp_ns";
+      return;
+    case 0x89:
+      O << "psp_ns";
+      return;
+    case 0x8a:
+      O << "msplim_ns";
+      return;
+    case 0x8b:
+      O << "psplim_ns";
+      return;
+    case 0x90:
+      O << "primask_ns";
+      return;
+    case 0x91:
+      O << "basepri_ns";
+      return;
+    case 0x92:
+      O << "basepri_max_ns";
+      return;
+    case 0x93:
+      O << "faultmask_ns";
+      return;
+    case 0x94:
+      O << "control_ns";
+      return;
+    case 0x98:
+      O << "sp_ns";
+      return;
+    }
+  }
+
+  // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively.
+  if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+    O << "APSR_";
+    switch (Mask) {
+    default:
+      llvm_unreachable("Unexpected mask value!");
+    case 4:
+      O << "g";
+      return;
+    case 8:
+      O << "nzcvq";
+      return;
+    case 12:
+      O << "nzcvqg";
+      return;
+    }
+  }
+
+  if (SpecRegRBit)
+    O << "SPSR";
+  else
+    O << "CPSR";
+
+  if (Mask) {
+    O << '_';
+    if (Mask & 8)
+      O << 'f';
+    if (Mask & 4)
+      O << 's';
+    if (Mask & 2)
+      O << 'x';
+    if (Mask & 1)
+      O << 'c';
+  }
+}
+
+void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint32_t Banked = MI->getOperand(OpNum).getImm();
+  uint32_t R = (Banked & 0x20) >> 5;
+  uint32_t SysM = Banked & 0x1f;
+
+  // Nothing much we can do about this, the encodings are specified in B9.2.3 of
+  // the ARM ARM v7C, and are all over the shop.
+  if (R) {
+    O << "SPSR_";
+
+    switch (SysM) {
+    case 0x0e:
+      O << "fiq";
+      return;
+    case 0x10:
+      O << "irq";
+      return;
+    case 0x12:
+      O << "svc";
+      return;
+    case 0x14:
+      O << "abt";
+      return;
+    case 0x16:
+      O << "und";
+      return;
+    case 0x1c:
+      O << "mon";
+      return;
+    case 0x1e:
+      O << "hyp";
+      return;
+    default:
+      llvm_unreachable("Invalid banked SPSR register");
+    }
+  }
+
+  assert(!R && "should have dealt with SPSR regs");
+  const char *RegNames[] = {
+      "r8_usr", "r9_usr", "r10_usr", "r11_usr", "r12_usr", "sp_usr",  "lr_usr",
+      "",       "r8_fiq", "r9_fiq",  "r10_fiq", "r11_fiq", "r12_fiq", "sp_fiq",
+      "lr_fiq", "",       "lr_irq",  "sp_irq",  "lr_svc",  "sp_svc",  "lr_abt",
+      "sp_abt", "lr_und", "sp_und",  "",        "",        "",        "",
+      "lr_mon", "sp_mon", "elr_hyp", "sp_hyp"};
+  const char *Name = RegNames[SysM];
+  assert(Name[0] && "invalid banked register operand");
+
+  O << Name;
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  // Handle the undefined 15 CC value here for printing so we don't abort().
+  if ((unsigned)CC == 15)
+    O << "<und>";
+  else if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << "p" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  O << "c" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
+}
+
+template <unsigned scale>
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (MO.isExpr()) {
+    MO.getExpr()->print(O, &MAI);
+    return;
+  }
+
+  int32_t OffImm = (int32_t)MO.getImm() << scale;
+
+  O << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  O << markup("<imm:") << "#" << formatImm(MI->getOperand(OpNum).getImm() * 4)
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:") << "#" << formatImm((Imm == 0 ? 32 : Imm))
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  // (3 - the number of trailing zeros) is the number of then / else.
+  unsigned Mask = MI->getOperand(OpNum).getImm();
+  unsigned Firstcond = MI->getOperand(OpNum - 1).getImm();
+  unsigned CondBit0 = Firstcond & 1;
+  unsigned NumTZ = countTrailingZeros(Mask);
+  assert(NumTZ <= 3 && "Invalid IT mask!");
+  for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+    bool T = ((Mask >> Pos) & 1) == CondBit0;
+    if (T)
+      O << 't';
+    else
+      O << 'e';
+  }
+}
+
+void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned RegNum = MO2.getReg()) {
+    O << ", ";
+    printRegName(O, RegNum);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
+                                                    unsigned Op,
+                                                    const MCSubtargetInfo &STI,
+                                                    raw_ostream &O,
+                                                    unsigned Scale) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned ImmOffs = MO2.getImm()) {
+    O << ", " << markup("<imm:") << "#" << formatImm(ImmOffs * Scale)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, 1);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, 2);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     const MCSubtargetInfo &STI,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4);
+}
+
+void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, STI, O, 4);
+}
+
+// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
+// register with shift forms.
+// REG 0   0           - e.g. R5
+// REG IMM, SH_OPC     - e.g. R5, LSL #3
+void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  unsigned Reg = MO1.getReg();
+  printRegName(O, Reg);
+
+  // Print the shift opc.
+  assert(MO2.isImm() && "Not a valid t2_so_reg value!");
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.isReg()) { // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", " << markup("<imm:") << "#-" << formatImm(-OffImm) << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", " << markup("<imm:") << "#" << formatImm(OffImm) << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Don't print +0.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", " << markup("<imm:") << "#" << OffImm << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+template <bool AlwaysPrintImm0>
+void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  const MCSubtargetInfo &STI,
+                                                  raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  if (!MO1.isReg()) { //  For label symbolic references.
+    printOperand(MI, OpNum, STI, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  // Don't print +0.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", " << markup("<imm:") << "#-" << -OffImm << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", " << markup("<imm:") << "#" << OffImm << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ", " << markup("<imm:") << "#" << formatImm(MO2.getImm() * 4)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+  O << ", " << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  O << ", " << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+  const MCOperand &MO3 = MI->getOperand(OpNum + 2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  assert(MO2.getReg() && "Invalid so_reg load / store address!");
+  O << ", ";
+  printRegName(O, MO2.getReg());
+
+  unsigned ShAmt = MO3.getImm();
+  if (ShAmt) {
+    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+    O << ", lsl " << markup("<imm:") << "#" << ShAmt << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << markup("<imm:") << '#' << ARM_AM::getFPImmFloat(MO.getImm())
+    << markup(">");
+}
+
+void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  unsigned EncodedImm = MI->getOperand(OpNum).getImm();
+  unsigned EltBits;
+  uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
+  O << markup("<imm:") << "#0x";
+  O.write_hex(Val);
+  O << markup(">");
+}
+
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:") << "#" << formatImm(Imm + 1) << markup(">");
+}
+
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm <= 3 && "illegal ror immediate!");
+  O << ", ror " << markup("<imm:") << "#" << 8 * Imm << markup(">");
+}
+
+void ARMInstPrinter::printModImmOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  MCOperand Op = MI->getOperand(OpNum);
+
+  // Support for fixups (MCFixup)
+  if (Op.isExpr())
+    return printOperand(MI, OpNum, STI, O);
+
+  unsigned Bits = Op.getImm() & 0xFF;
+  unsigned Rot = (Op.getImm() & 0xF00) >> 7;
+
+  bool PrintUnsigned = false;
+  switch (MI->getOpcode()) {
+  case ARM::MOVi:
+    // Movs to PC should be treated unsigned
+    PrintUnsigned = (MI->getOperand(OpNum - 1).getReg() == ARM::PC);
+    break;
+  case ARM::MSRi:
+    // Movs to special registers should be treated unsigned
+    PrintUnsigned = true;
+    break;
+  }
+
+  int32_t Rotated = ARM_AM::rotr32(Bits, Rot);
+  if (ARM_AM::getSOImmVal(Rotated) == Op.getImm()) {
+    // #rot has the least possible value
+    O << "#" << markup("<imm:");
+    if (PrintUnsigned)
+      O << static_cast<uint32_t>(Rotated);
+    else
+      O << Rotated;
+    O << markup(">");
+    return;
+  }
+
+  // Explicit #bits, #rot implied
+  O << "#" << markup("<imm:") << Bits << markup(">") << ", #" << markup("<imm:")
+    << Rot << markup(">");
+}
+
+void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  O << markup("<imm:") << "#" << 16 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O) {
+  O << markup("<imm:") << "#" << 32 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                                              const MCSubtargetInfo &STI,
+                                              raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  const MCSubtargetInfo &STI,
+                                                  raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 const MCSubtargetInfo &STI,
+                                                 raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourSpacedAllLanes(
+    const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI,
+    raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                const MCSubtargetInfo &STI,
+                                                raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "}";
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
new file mode 100644
index 000000000000..9d80eed84dc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -0,0 +1,238 @@
+//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class ARMInstPrinter : public MCInstPrinter {
+public:
+  ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                               raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                    raw_ostream &O);
+
+  void printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printAddrModeTBB(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrModeTBH(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O,
+                                  bool AlwaysPrintImm0);
+  void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                               const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrMode5FP16Operand(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O);
+  void printMemBOption(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+
+  template <unsigned scale>
+  void printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbITMask(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O, unsigned Scale);
+  void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O);
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  template <bool AlwaysPrintImm0>
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O);
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O);
+  void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O);
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printSetendOperand(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printCPSIMod(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                     const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                             const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printRegisterList(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printPImmediate(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printCImmediate(const MCInst *MI, unsigned OpNum,
+                       const MCSubtargetInfo &STI, raw_ostream &O);
+  void printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                         const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                              const MCSubtargetInfo &STI, raw_ostream &O);
+  void printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printModImmOperand(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFBits16(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printFBits32(const MCInst *MI, unsigned OpNum,
+                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorIndex(const MCInst *MI, unsigned OpNum,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListOne(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                          const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                                const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListThree(const MCInst *MI, unsigned OpNum,
+                            const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListFour(const MCInst *MI, unsigned OpNum,
+                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum,
+                                    const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum,
+                                   const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &O);
+  void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                          const MCSubtargetInfo &STI,
+                                          raw_ostream &O);
+  void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O);
+  void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum,
+                                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/LICENSE.TXT b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
new file mode 100755
index 000000000000..68afea12ed44
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
@@ -0,0 +1,47 @@
+ARM Limited
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, ARM Limited ("ARM") reserves all
+right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants to you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable copyright license to reproduce, prepare derivative
+   works of, publicly display, publicly perform, sublicense, and distribute the
+   Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable (except as stated in this section) patent license
+   to make, have made, use, offer to sell, sell, import, and otherwise transfer
+   the Work, where such license applies only to those patent claims licensable
+   by ARM that are necessarily infringed by ARM's Software alone or by
+   combination of the Software with the Work to which such Software was
+   submitted. If any entity institutes patent litigation against ARM or any
+   other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+   that ARM's Software, or the Work to which ARM has contributed constitutes
+   direct or contributory patent infringement, then any patent licenses granted
+   to that entity under this Agreement for the Software or Work shall terminate
+   as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
new file mode 100644
index 000000000000..3959eab966a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -0,0 +1,762 @@
+//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+
+  enum AddrOpc {
+    sub = 0,
+    add
+  };
+
+  static inline const char *getAddrOpcStr(AddrOpc Op) {
+    return Op == sub ? "-" : "";
+  }
+
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+
+  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return 2;
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::ror: return 3;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift amount (or reg0 if not present or imm).  The
+  // third operand encodes the shift opcode and the imm if a reg isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = countTrailingZeros(Imm);
+
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should ignore the low 6 bits, then
+    // retry the hunt.
+    if (Imm & 63U) {
+      unsigned TZ2 = countTrailingZeros(Imm & ~63U);
+      unsigned RotAmt2 = TZ2 & ~1;
+      if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
+        return (32-RotAmt2)&31;  // HW rotates right, not left.
+    }
+
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into an shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
+
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first hunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
+
+  /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return countTrailingZeros(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting a 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+    // 16-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~65535U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return countTrailingZeros(Imm);
+  }
+
+  /// isThumbImm16ShiftedVal - Return true if the specified value can be
+  /// obtained by left shifting a 16-bit immediate.
+  static inline bool isThumbImm16ShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~65535U << getThumbImm16ValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shiftd value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
+
+
+  /// getT2SOImmValSplat - Return the 12-bit encoded representation
+  /// if the specified value can be obtained by splatting the low 8 bits
+  /// into every other byte or every byte of a 32-bit value. i.e.,
+  ///     00000000 00000000 00000000 abcdefgh    control = 0
+  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
+  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
+  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
+  /// Return -1 if none of the above apply.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValSplatVal(unsigned V) {
+    unsigned u, Vs, Imm;
+    // control = 0
+    if ((V & 0xffffff00) == 0)
+      return V;
+
+    // If the value is zeroes in the first byte, just shift those off
+    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
+    // Any passing value only has 8 bits of payload, splatted across the word
+    Imm = Vs & 0xff;
+    // Likewise, any passing values have the payload splatted into the 3rd byte
+    u = Imm | (Imm << 16);
+
+    // control = 1 or 2
+    if (Vs == u)
+      return (((Vs == V) ? 1 : 2) << 8) | Imm;
+
+    // control = 3
+    if (Vs == (u | (u << 8)))
+      return (3 << 8) | Imm;
+
+    return -1;
+  }
+
+  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
+  /// specified value is a rotated 8-bit value. Return -1 if no rotation
+  /// encoding is possible.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValRotateVal(unsigned V) {
+    unsigned RotAmt = countLeadingZeros(V);
+    if (RotAmt >= 24)
+      return -1;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    if ((rotr32(0xff000000U, RotAmt) & V) == V)
+      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
+
+    return -1;
+  }
+
+  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+  /// encoding for it.  If not, return -1.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmVal(unsigned Arg) {
+    // If 'Arg' is an 8-bit splat, then get the encoded value.
+    int Splat = getT2SOImmValSplatVal(Arg);
+    if (Splat != -1)
+      return Splat;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    int Rot = getT2SOImmValRotateVal(Arg);
+    if (Rot != -1)
+      return Rot;
+
+    return -1;
+  }
+
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = countTrailingZeros(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
+  // fourth operand 16-17 encodes the index mode.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  //
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
+                                   unsigned IdxMode = 0) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)((AM2Opc >> 13) & 7);
+  }
+  static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
+    return (AM2Opc >> 16);
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
+  // index mode.
+
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
+                                   unsigned IdxMode = 0) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset | (IdxMode << 9);
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
+  static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
+    return (AM3Opc >> 9);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  // For VFP instructions, only the IA and DB modes are valid.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
+    return (int)SubMode;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5 FP16
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as 16-bit FP load/stores.
+  //
+  // addrmode5fp16 := reg +/- imm8*2
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation (add or subtract) in bit 8 and the immediate in bits 0-7.
+
+  /// getAM5FP16Opc - This function encodes the addrmode5fp16 opc field.
+  static inline unsigned getAM5FP16Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5FP16Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5FP16Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #6
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for NEON load / store instructions.
+  //
+  // addrmode6 := reg with optional alignment
+  //
+  // This is stored in two operands [regaddr, align].  The first is the
+  // address register.  The second operand is the value of the alignment
+  // specifier in bytes or zero if no explicit alignment.
+  // Valid alignments depend on the specific instruction.
+
+  //===--------------------------------------------------------------------===//
+  // NEON Modified Immediates
+  //===--------------------------------------------------------------------===//
+  //
+  // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+  // vector operand, where a small immediate encoded in the instruction
+  // specifies a full NEON vector value.  These modified immediates are
+  // represented here as encoded integers.  The low 8 bits hold the immediate
+  // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
+  // the "Cmode" field of the instruction.  The interfaces below treat the
+  // Op and Cmode values as a single 5-bit value.
+
+  static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+    return (OpCmode << 8) | Val;
+  }
+  static inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+    return (ModImm >> 8) & 0x1f;
+  }
+  static inline unsigned getNEONModImmVal(unsigned ModImm) {
+    return ModImm & 0xff;
+  }
+
+  /// decodeNEONModImm - Decode a NEON modified immediate value into the
+  /// element value and the element size in bits.  (If the element size is
+  /// smaller than the vector, it is splatted into all the elements.)
+  static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
+    unsigned OpCmode = getNEONModImmOpCmode(ModImm);
+    unsigned Imm8 = getNEONModImmVal(ModImm);
+    uint64_t Val = 0;
+
+    if (OpCmode == 0xe) {
+      // 8-bit vector elements
+      Val = Imm8;
+      EltBits = 8;
+    } else if ((OpCmode & 0xc) == 0x8) {
+      // 16-bit vector elements
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 16;
+    } else if ((OpCmode & 0x8) == 0) {
+      // 32-bit vector elements, zero with one byte set
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 32;
+    } else if ((OpCmode & 0xe) == 0xc) {
+      // 32-bit vector elements, one byte with low bits set
+      unsigned ByteNum = 1 + (OpCmode & 0x1);
+      Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
+      EltBits = 32;
+    } else if (OpCmode == 0x1e) {
+      // 64-bit vector elements
+      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+        if ((ModImm >> ByteNum) & 1)
+          Val |= (uint64_t)0xff << (8 * ByteNum);
+      }
+      EltBits = 64;
+    } else {
+      llvm_unreachable("Unsupported NEON immediate");
+    }
+    return Val;
+  }
+
+  // Generic validation for single-byte immediate (0X00, 00X0, etc).
+  static inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
+    assert(Size >= 1 && Size <= 4 && "Invalid size");
+    unsigned count = 0;
+    for (unsigned i = 0; i < Size; ++i) {
+      if (Value & 0xff) count++;
+      Value >>= 8;
+    }
+    return count == 1;
+  }
+
+  /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
+  static inline bool isNEONi16splat(unsigned Value) {
+    if (Value > 0xffff)
+      return false;
+    // i16 value with set bits only in one byte X0 or 0X.
+    return Value == 0 || isNEONBytesplat(Value, 2);
+  }
+
+  // Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR
+  static inline unsigned encodeNEONi16splat(unsigned Value) {
+    assert(isNEONi16splat(Value) && "Invalid NEON splat value");
+    if (Value >= 0x100)
+      Value = (Value >> 8) | 0xa00;
+    else
+      Value |= 0x800;
+    return Value;
+  }
+
+  /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
+  static inline bool isNEONi32splat(unsigned Value) {
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
+    return Value == 0 || isNEONBytesplat(Value, 4);
+  }
+
+  /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
+  static inline unsigned encodeNEONi32splat(unsigned Value) {
+    assert(isNEONi32splat(Value) && "Invalid NEON splat value");
+    if (Value >= 0x100 && Value <= 0xff00)
+      Value = (Value >> 8) | 0x200;
+    else if (Value > 0xffff && Value <= 0xff0000)
+      Value = (Value >> 16) | 0x400;
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    return Value;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Floating-point Immediates
+  //
+  static inline float getFPImmFloat(unsigned Imm) {
+    // We expect an 8-bit binary encoding of a floating-point number here.
+    union {
+      uint32_t I;
+      float F;
+    } FPUnion;
+
+    uint8_t Sign = (Imm >> 7) & 0x1;
+    uint8_t Exp = (Imm >> 4) & 0x7;
+    uint8_t Mantissa = Imm & 0xf;
+
+    //   8-bit FP    iEEEE Float Encoding
+    //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+    //
+    // where B = NOT(b);
+
+    FPUnion.I = 0;
+    FPUnion.I |= Sign << 31;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+    FPUnion.I |= (Exp & 0x3) << 23;
+    FPUnion.I |= Mantissa << 19;
+    return FPUnion.F;
+  }
+
+  /// getFP16Imm - Return an 8-bit floating-point version of the 16-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP16Imm(const APInt &Imm) {
+    uint32_t Sign = Imm.lshr(15).getZExtValue() & 1;
+    int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15;  // -14 to 15
+    int64_t Mantissa = Imm.getZExtValue() & 0x3ff;  // 10 bits
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0x3f)
+      return -1;
+    Mantissa >>= 6;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP16Imm(const APFloat &FPImm) {
+    return getFP16Imm(FPImm.bitcastToAPInt());
+  }
+
+  /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP32Imm(const APInt &Imm) {
+    uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+    int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+    int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0x7ffff)
+      return -1;
+    Mantissa >>= 19;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP32Imm(const APFloat &FPImm) {
+    return getFP32Imm(FPImm.bitcastToAPInt());
+  }
+
+  /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP64Imm(const APInt &Imm) {
+    uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+    int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+    uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0xffffffffffffULL)
+      return -1;
+    Mantissa >>= 48;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP64Imm(const APFloat &FPImm) {
+    return getFP64Imm(FPImm.bitcastToAPInt());
+  }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
new file mode 100644
index 000000000000..a58d5b34131b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -0,0 +1,1168 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMAsmBackend.h"
+#include "MCTargetDesc/ARMAsmBackendDarwin.h"
+#include "MCTargetDesc/ARMAsmBackendELF.h"
+#include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  ARMELFObjectWriter(uint8_t OSABI)
+      : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
+                                /*HasRelocationAddend*/ false) {}
+};
+} // end anonymous namespace
+
+const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // ARMFixupKinds.h.
+      //
+      // Name                      Offset (bits) Size (bits)     Flags
+      {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_ldst_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_10", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_9", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_thumb_adr_pcrel_10", 0, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_adr_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_blx", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cp", 0, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel},
+      // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16
+      // - 19.
+      {"fixup_arm_movt_hi16", 0, 20, 0},
+      {"fixup_arm_movw_lo16", 0, 20, 0},
+      {"fixup_t2_movt_hi16", 0, 20, 0},
+      {"fixup_t2_movw_lo16", 0, 20, 0},
+      {"fixup_arm_mod_imm", 0, 12, 0},
+  };
+  const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // ARMFixupKinds.h.
+      //
+      // Name                      Offset (bits) Size (bits)     Flags
+      {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_ldst_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_10", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_9", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_thumb_adr_pcrel_10", 8, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_adr_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_blx", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cp", 8, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_bcc", 8, 8, MCFixupKindInfo::FKF_IsPCRel},
+      // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16
+      // - 19.
+      {"fixup_arm_movt_hi16", 12, 20, 0},
+      {"fixup_arm_movw_lo16", 12, 20, 0},
+      {"fixup_t2_movt_hi16", 12, 20, 0},
+      {"fixup_t2_movw_lo16", 12, 20, 0},
+      {"fixup_arm_mod_imm", 20, 12, 0},
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+}
+
+void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default:
+    break;
+  case MCAF_Code16:
+    setIsThumb(true);
+    break;
+  case MCAF_Code32:
+    setIsThumb(false);
+    break;
+  }
+}
+
+unsigned ARMAsmBackend::getRelaxedOpcode(unsigned Op) const {
+  bool HasThumb2 = STI->getFeatureBits()[ARM::FeatureThumb2];
+  bool HasV8MBaselineOps = STI->getFeatureBits()[ARM::HasV8MBaselineOps];
+
+  switch (Op) {
+  default:
+    return Op;
+  case ARM::tBcc:
+    return HasThumb2 ? (unsigned)ARM::t2Bcc : Op;
+  case ARM::tLDRpci:
+    return HasThumb2 ? (unsigned)ARM::t2LDRpci : Op;
+  case ARM::tADR:
+    return HasThumb2 ? (unsigned)ARM::t2ADR : Op;
+  case ARM::tB:
+    return HasV8MBaselineOps ? (unsigned)ARM::t2B : Op;
+  case ARM::tCBZ:
+    return ARM::tHINT;
+  case ARM::tCBNZ:
+    return ARM::tHINT;
+  }
+}
+
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+  return false;
+}
+
+const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup,
+                                                    uint64_t Value) const {
+  switch ((unsigned)Fixup.getKind()) {
+  case ARM::fixup_arm_thumb_br: {
+    // Relaxing tB to t2B. tB has a signed 12-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    if (Offset > 2046 || Offset < -2048)
+      return "out of range pc-relative fixup value";
+    break;
+  }
+  case ARM::fixup_arm_thumb_bcc: {
+    // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    if (Offset > 254 || Offset < -256)
+      return "out of range pc-relative fixup value";
+    break;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_cp: {
+    // If the immediate is negative, greater than 1020, or not a multiple
+    // of four, the wide version of the instruction must be used.
+    int64_t Offset = int64_t(Value) - 4;
+    if (Offset & 3)
+      return "misaligned pc-relative fixup value";
+    else if (Offset > 1020 || Offset < 0)
+      return "out of range pc-relative fixup value";
+    break;
+  }
+  case ARM::fixup_arm_thumb_cb: {
+    // If we have a Thumb CBZ or CBNZ instruction and its target is the next
+    // instruction it is is actually out of range for the instruction.
+    // It will be changed to a NOP.
+    int64_t Offset = (Value & ~1);
+    if (Offset == 2)
+      return "will be converted to nop";
+    break;
+  }
+  default:
+    llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!");
+  }
+  return nullptr;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  return reasonForFixupRelaxation(Fixup, Value);
+}
+
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
+                                     const MCSubtargetInfo &STI,
+                                     MCInst &Res) const {
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  // Sanity check w/ diagnostic if we get here w/ a bogus instruction.
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  // If we are changing Thumb CBZ or CBNZ instruction to a NOP, aka tHINT, we
+  // have to change the operands too.
+  if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
+      RelaxedOp == ARM::tHINT) {
+    Res.setOpcode(RelaxedOp);
+    Res.addOperand(MCOperand::createImm(0));
+    Res.addOperand(MCOperand::createImm(14));
+    Res.addOperand(MCOperand::createReg(0));
+    return;
+  }
+
+  // The rest of instructions we're relaxing have the same operands.
+  // We just need to update to the proper opcode.
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+  const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+  const uint32_t ARMv4_NopEncoding = 0xe1a00000;   // using MOV r0,r0
+  const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
+  if (isThumb()) {
+    const uint16_t nopEncoding =
+        hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
+    uint64_t NumNops = Count / 2;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->write16(nopEncoding);
+    if (Count & 1)
+      OW->write8(0);
+    return true;
+  }
+  // ARM mode
+  const uint32_t nopEncoding =
+      hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->write32(nopEncoding);
+  // FIXME: should this function return false when unable to write exactly
+  // 'Count' bytes with NOP encodings?
+  switch (Count % 4) {
+  default:
+    break; // No leftover bytes to write
+  case 1:
+    OW->write8(0);
+    break;
+  case 2:
+    OW->write16(0);
+    break;
+  case 3:
+    OW->write16(0);
+    OW->write8(0xa0);
+    break;
+  }
+
+  return true;
+}
+
+static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) {
+  if (IsLittleEndian) {
+    // Note that the halfwords are stored high first and low second in thumb;
+    // so we need to swap the fixup value here to map properly.
+    uint32_t Swapped = (Value & 0xFFFF0000) >> 16;
+    Swapped |= (Value & 0x0000FFFF) << 16;
+    return Swapped;
+  } else
+    return Value;
+}
+
+static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
+                              bool IsLittleEndian) {
+  uint32_t Value;
+
+  if (IsLittleEndian) {
+    Value = (SecondHalf & 0xFFFF) << 16;
+    Value |= (FirstHalf & 0xFFFF);
+  } else {
+    Value = (SecondHalf & 0xFFFF);
+    Value |= (FirstHalf & 0xFFFF) << 16;
+  }
+
+  return Value;
+}
+
+unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                         bool IsPCRel, MCContext *Ctx,
+                                         bool IsLittleEndian,
+                                         bool IsResolved) const {
+  unsigned Kind = Fixup.getKind();
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case FK_SecRel_2:
+    return Value;
+  case FK_SecRel_4:
+    return Value;
+  case ARM::fixup_arm_movt_hi16:
+    if (!IsPCRel)
+      Value >>= 16;
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_arm_movw_lo16: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned Lo12 = Value & 0x0FFF;
+    // inst{19-16} = Hi4;
+    // inst{11-0} = Lo12;
+    Value = (Hi4 << 16) | (Lo12);
+    return Value;
+  }
+  case ARM::fixup_t2_movt_hi16:
+    if (!IsPCRel)
+      Value >>= 16;
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_t2_movw_lo16: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned i = (Value & 0x800) >> 11;
+    unsigned Mid3 = (Value & 0x700) >> 8;
+    unsigned Lo8 = Value & 0x0FF;
+    // inst{19-16} = Hi4;
+    // inst{26} = i;
+    // inst{14-12} = Mid3;
+    // inst{7-0} = Lo8;
+    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+    return swapHalfWords(Value, IsLittleEndian);
+  }
+  case ARM::fixup_arm_ldst_pcrel_12:
+    // ARM PC-relative values are offset by 8.
+    Value -= 4;
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_t2_ldst_pcrel_12: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value -= 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    if (Ctx && Value >= 4096) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12)
+      return swapHalfWords(Value, IsLittleEndian);
+
+    return Value;
+  }
+  case ARM::fixup_arm_adr_pcrel_12: {
+    // ARM PC-relative values are offset by 8.
+    Value -= 8;
+    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 2; // 0b0010
+    }
+    if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    // Encode the immediate and shift the opcode into place.
+    return ARM_AM::getSOImmVal(Value) | (opc << 21);
+  }
+
+  case ARM::fixup_t2_adr_pcrel_12: {
+    Value -= 4;
+    unsigned opc = 0;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 5;
+    }
+
+    uint32_t out = (opc << 21);
+    out |= (Value & 0x800) << 15;
+    out |= (Value & 0x700) << 4;
+    out |= (Value & 0x0FF);
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    // These values don't encode the low two bits since they're always zero.
+    // Offset by 8 just as above.
+    if (const MCSymbolRefExpr *SRE =
+            dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
+        return 0;
+    return 0xffffff & ((Value - 8) >> 2);
+  case ARM::fixup_t2_uncondbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint32_t out = 0;
+    bool I = Value & 0x800000;
+    bool J1 = Value & 0x400000;
+    bool J2 = Value & 0x200000;
+    J1 ^= I;
+    J2 ^= I;
+
+    out |= I << 26;                 // S bit
+    out |= !J1 << 13;               // J1 bit
+    out |= !J2 << 11;               // J2 bit
+    out |= (Value & 0x1FF800) << 5; // imm6 field
+    out |= (Value & 0x0007FF);      // imm11 field
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+  case ARM::fixup_t2_condbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint64_t out = 0;
+    out |= (Value & 0x80000) << 7; // S bit
+    out |= (Value & 0x40000) >> 7; // J2 bit
+    out |= (Value & 0x20000) >> 4; // J1 bit
+    out |= (Value & 0x1F800) << 5; // imm6 field
+    out |= (Value & 0x007FF);      // imm11 field
+
+    return swapHalfWords(out, IsLittleEndian);
+  }
+  case ARM::fixup_arm_thumb_bl: {
+    // The value doesn't encode the low bit (always zero) and is offset by
+    // four. The 32-bit immediate value is encoded as
+    //   imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+    // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+    // The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+    // J = either J1 or J2 bit
+    //
+    //   BL:  xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    uint32_t offset = (Value - 4) >> 1;
+    uint32_t signBit = (offset & 0x800000) >> 23;
+    uint32_t I1Bit = (offset & 0x400000) >> 22;
+    uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+    uint32_t I2Bit = (offset & 0x200000) >> 21;
+    uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+    uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+    uint32_t imm11Bits = (offset & 0x000007FF);
+
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+                           (uint16_t)imm11Bits);
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+  }
+  case ARM::fixup_arm_thumb_blx: {
+    // The value doesn't encode the low two bits (always zero) and is offset by
+    // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+    //   imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+    // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+    // The value is encoded into disjoint bit positions in the destination
+    // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+    // J = either J1 or J2 bit, 0 = zero.
+    //
+    //   BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+    //
+    // Note that the halfwords are stored high first, low second; so we need
+    // to transpose the fixup value here to map properly.
+    if (Ctx && Value % 4 != 0) {
+      Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+      return 0;
+    }
+
+    uint32_t offset = (Value - 4) >> 2;
+    if (const MCSymbolRefExpr *SRE =
+            dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_TLSCALL)
+        offset = 0;
+    uint32_t signBit = (offset & 0x400000) >> 22;
+    uint32_t I1Bit = (offset & 0x200000) >> 21;
+    uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+    uint32_t I2Bit = (offset & 0x100000) >> 20;
+    uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+    uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+    uint32_t imm10LBits = (offset & 0x3FF);
+
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+                           ((uint16_t)imm10LBits) << 1);
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_cp:
+    // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
+    // could have an error on our hands.
+    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+      const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+      if (FixupDiagnostic) {
+        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        return 0;
+      }
+    }
+    // Offset by 4, and don't encode the low two bits.
+    return ((Value - 4) >> 2) & 0xff;
+  case ARM::fixup_arm_thumb_cb: {
+    // CB instructions can only branch to offsets in [4, 126] in multiples of 2
+    // so ensure that the raw value LSB is zero and it lies in [2, 130].
+    // An offset of 2 will be relaxed to a NOP.
+    if (Ctx && ((int64_t)Value < 2 || Value > 0x82 || Value & 1)) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    // FIXME: diagnose if no Thumb2
+    uint32_t Binary = (Value - 4) >> 1;
+    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
+  }
+  case ARM::fixup_arm_thumb_br:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
+               !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
+      const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+      if (FixupDiagnostic) {
+        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        return 0;
+      }
+    }
+    return ((Value - 4) >> 1) & 0x7ff;
+  case ARM::fixup_arm_thumb_bcc:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+      const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
+      if (FixupDiagnostic) {
+        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        return 0;
+      }
+    }
+    return ((Value - 4) >> 1) & 0xff;
+  case ARM::fixup_arm_pcrel_10_unscaled: {
+    Value = Value - 8; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
+    if (Ctx && Value >= 256) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    Value = (Value & 0xf) | ((Value & 0xf0) << 4);
+    return Value | (isAdd << 23);
+  }
+  case ARM::fixup_arm_pcrel_10:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_t2_pcrel_10: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low two bits since they're always zero.
+    Value >>= 2;
+    if (Ctx && Value >= 256) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
+    // swapped.
+    if (Kind == ARM::fixup_t2_pcrel_10)
+      return swapHalfWords(Value, IsLittleEndian);
+
+    return Value;
+  }
+  case ARM::fixup_arm_pcrel_9:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_t2_pcrel_9: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low bit since it's always zero.
+    if (Ctx && (Value & 1)) {
+      Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+      return 0;
+    }
+    Value >>= 1;
+    if (Ctx && Value >= 256) {
+      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+      return 0;
+    }
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_9, but with 16-bit halfwords
+    // swapped.
+    if (Kind == ARM::fixup_t2_pcrel_9)
+      return swapHalfWords(Value, IsLittleEndian);
+
+    return Value;
+  }
+  case ARM::fixup_arm_mod_imm:
+    Value = ARM_AM::getSOImmVal(Value);
+    if (Ctx && Value >> 12) {
+      Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+      return 0;
+    }
+    return Value;
+  }
+}
+
+void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                      const MCAsmLayout &Layout,
+                                      const MCFixup &Fixup,
+                                      const MCFragment *DF,
+                                      const MCValue &Target, uint64_t &Value,
+                                      bool &IsResolved) {
+  const MCSymbolRefExpr *A = Target.getSymA();
+  const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
+  // MachO (the only user of "Value") tries to make .o files that look vaguely
+  // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit
+  // into the addend if possible. Other relocation types don't want this bit
+  // though (branches couldn't encode it if it *was* present, and no other
+  // relocations exist) and it can interfere with checking valid expressions.
+  if ((unsigned)Fixup.getKind() == FK_Data_4 ||
+      (unsigned)Fixup.getKind() == ARM::fixup_arm_movw_lo16 ||
+      (unsigned)Fixup.getKind() == ARM::fixup_arm_movt_hi16 ||
+      (unsigned)Fixup.getKind() == ARM::fixup_t2_movw_lo16 ||
+      (unsigned)Fixup.getKind() == ARM::fixup_t2_movt_hi16) {
+    if (Sym) {
+      if (Asm.isThumbFunc(Sym))
+        Value |= 1;
+    }
+  }
+  if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+    assert(Sym && "How did we resolve this?");
+
+    // If the symbol is external the linker will handle it.
+    // FIXME: Should we handle it as an optimization?
+
+    // If the symbol is out of range, produce a relocation and hope the
+    // linker can handle it. GNU AS produces an error in this case.
+    if (Sym->isExternal() || Value >= 0x400004)
+      IsResolved = false;
+  }
+  // We must always generate a relocation for BL/BLX instructions if we have
+  // a symbol to reference, as the linker relies on knowing the destination
+  // symbol's thumb-ness to get interworking right.
+  if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
+    IsResolved = false;
+
+  // Try to get the encoded value for the fixup as-if we're mapping it into
+  // the instruction. This allows adjustFixupValue() to issue a diagnostic
+  // if the value aren't invalid.
+  (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
+                         IsLittleEndian, IsResolved);
+}
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return 1;
+
+  case FK_Data_2:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+  case ARM::fixup_arm_mod_imm:
+    return 2;
+
+  case ARM::fixup_arm_pcrel_10_unscaled:
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_pcrel_9:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    return 3;
+
+  case FK_Data_4:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_pcrel_9:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+    return 4;
+
+  case FK_SecRel_2:
+    return 2;
+  case FK_SecRel_4:
+    return 4;
+  }
+}
+
+/// getFixupKindContainerSizeBytes - The number of bytes of the
+/// container involved in big endian.
+static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+    return 2;
+  case FK_Data_4:
+    return 4;
+
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    // Instruction size is 2 bytes.
+    return 2;
+
+  case ARM::fixup_arm_pcrel_10_unscaled:
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_arm_mod_imm:
+    // Instruction size is 4 bytes.
+    return 4;
+  }
+}
+
+void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  Value =
+      adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // Used to point to big endian bytes.
+  unsigned FullSizeBytes;
+  if (!IsLittleEndian) {
+    FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+    assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!");
+    assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
+  }
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value. The Value has been "split up" into the appropriate
+  // bitfields above.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittleEndian ? i : (FullSizeBytes - 1 - i);
+    Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+  }
+}
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+  UNWIND_ARM_MODE_MASK                         = 0x0F000000,
+  UNWIND_ARM_MODE_FRAME                        = 0x01000000,
+  UNWIND_ARM_MODE_FRAME_D                      = 0x02000000,
+  UNWIND_ARM_MODE_DWARF                        = 0x04000000,
+
+  UNWIND_ARM_FRAME_STACK_ADJUST_MASK           = 0x00C00000,
+
+  UNWIND_ARM_FRAME_FIRST_PUSH_R4               = 0x00000001,
+  UNWIND_ARM_FRAME_FIRST_PUSH_R5               = 0x00000002,
+  UNWIND_ARM_FRAME_FIRST_PUSH_R6               = 0x00000004,
+
+  UNWIND_ARM_FRAME_SECOND_PUSH_R8              = 0x00000008,
+  UNWIND_ARM_FRAME_SECOND_PUSH_R9              = 0x00000010,
+  UNWIND_ARM_FRAME_SECOND_PUSH_R10             = 0x00000020,
+  UNWIND_ARM_FRAME_SECOND_PUSH_R11             = 0x00000040,
+  UNWIND_ARM_FRAME_SECOND_PUSH_R12             = 0x00000080,
+
+  UNWIND_ARM_FRAME_D_REG_COUNT_MASK            = 0x00000F00,
+
+  UNWIND_ARM_DWARF_SECTION_OFFSET              = 0x00FFFFFF
+};
+
+} // end CU namespace
+
+/// Generate compact unwind encoding for the function based on the CFI
+/// instructions. If the CFI instructions describe a frame that cannot be
+/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which
+/// tells the runtime to fallback and unwind using dwarf.
+uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
+    ArrayRef<MCCFIInstruction> Instrs) const {
+  DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n");
+  // Only armv7k uses CFI based unwinding.
+  if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K)
+    return 0;
+  // No .cfi directives means no frame.
+  if (Instrs.empty())
+    return 0;
+  // Start off assuming CFA is at SP+0.
+  int CFARegister = ARM::SP;
+  int CFARegisterOffset = 0;
+  // Mark savable registers as initially unsaved
+  DenseMap<unsigned, int> RegOffsets;
+  int FloatRegCount = 0;
+  // Process each .cfi directive and build up compact unwind info.
+  for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+    int Reg;
+    const MCCFIInstruction &Inst = Instrs[i];
+    switch (Inst.getOperation()) {
+    case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
+      CFARegisterOffset = -Inst.getOffset();
+      CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+      break;
+    case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
+      CFARegisterOffset = -Inst.getOffset();
+      break;
+    case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
+      CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true);
+      break;
+    case MCCFIInstruction::OpOffset: // DW_CFA_offset
+      Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+      if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+        RegOffsets[Reg] = Inst.getOffset();
+      else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+        RegOffsets[Reg] = Inst.getOffset();
+        ++FloatRegCount;
+      } else {
+        DEBUG_WITH_TYPE("compact-unwind",
+                        llvm::dbgs() << ".cfi_offset on unknown register="
+                                     << Inst.getRegister() << "\n");
+        return CU::UNWIND_ARM_MODE_DWARF;
+      }
+      break;
+    case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc
+      // Ignore
+      break;
+    default:
+      // Directive not convertable to compact unwind, bail out.
+      DEBUG_WITH_TYPE("compact-unwind",
+                      llvm::dbgs()
+                          << "CFI directive not compatiable with comact "
+                             "unwind encoding, opcode=" << Inst.getOperation()
+                          << "\n");
+      return CU::UNWIND_ARM_MODE_DWARF;
+      break;
+    }
+  }
+
+  // If no frame set up, return no unwind info.
+  if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0))
+    return 0;
+
+  // Verify standard frame (lr/r7) was used.
+  if (CFARegister != ARM::R7) {
+    DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is "
+                                                   << CFARegister
+                                                   << " instead of r7\n");
+    return CU::UNWIND_ARM_MODE_DWARF;
+  }
+  int StackAdjust = CFARegisterOffset - 8;
+  if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) {
+    DEBUG_WITH_TYPE("compact-unwind",
+                    llvm::dbgs()
+                        << "LR not saved as standard frame, StackAdjust="
+                        << StackAdjust
+                        << ", CFARegisterOffset=" << CFARegisterOffset
+                        << ", lr save at offset=" << RegOffsets[14] << "\n");
+    return CU::UNWIND_ARM_MODE_DWARF;
+  }
+  if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) {
+    DEBUG_WITH_TYPE("compact-unwind",
+                    llvm::dbgs() << "r7 not saved as standard frame\n");
+    return CU::UNWIND_ARM_MODE_DWARF;
+  }
+  uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME;
+
+  // If var-args are used, there may be a stack adjust required.
+  switch (StackAdjust) {
+  case 0:
+    break;
+  case 4:
+    CompactUnwindEncoding |= 0x00400000;
+    break;
+  case 8:
+    CompactUnwindEncoding |= 0x00800000;
+    break;
+  case 12:
+    CompactUnwindEncoding |= 0x00C00000;
+    break;
+  default:
+    DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs()
+                                          << ".cfi_def_cfa stack adjust ("
+                                          << StackAdjust << ") out of range\n");
+    return CU::UNWIND_ARM_MODE_DWARF;
+  }
+
+  // If r6 is saved, it must be right below r7.
+  static struct {
+    unsigned Reg;
+    unsigned Encoding;
+  } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6},
+                   {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5},
+                   {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4},
+                   {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12},
+                   {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11},
+                   {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10},
+                   {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9},
+                   {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}};
+
+  int CurOffset = -8 - StackAdjust;
+  for (auto CSReg : GPRCSRegs) {
+    auto Offset = RegOffsets.find(CSReg.Reg);
+    if (Offset == RegOffsets.end())
+      continue;
+
+    int RegOffset = Offset->second;
+    if (RegOffset != CurOffset - 4) {
+      DEBUG_WITH_TYPE("compact-unwind",
+                      llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at "
+                                   << RegOffset << " but only supported at "
+                                   << CurOffset << "\n");
+      return CU::UNWIND_ARM_MODE_DWARF;
+    }
+    CompactUnwindEncoding |= CSReg.Encoding;
+    CurOffset -= 4;
+  }
+
+  // If no floats saved, we are done.
+  if (FloatRegCount == 0)
+    return CompactUnwindEncoding;
+
+  // Switch mode to include D register saving.
+  CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK;
+  CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D;
+
+  // FIXME: supporting more than 4 saved D-registers compactly would be trivial,
+  // but needs coordination with the linker and libunwind.
+  if (FloatRegCount > 4) {
+    DEBUG_WITH_TYPE("compact-unwind",
+                    llvm::dbgs() << "unsupported number of D registers saved ("
+                                 << FloatRegCount << ")\n");
+      return CU::UNWIND_ARM_MODE_DWARF;
+  }
+
+  // Floating point registers must either be saved sequentially, or we defer to
+  // DWARF. No gaps allowed here so check that each saved d-register is
+  // precisely where it should be.
+  static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 };
+  for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) {
+    auto Offset = RegOffsets.find(FPRCSRegs[Idx]);
+    if (Offset == RegOffsets.end()) {
+      DEBUG_WITH_TYPE("compact-unwind",
+                      llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+                                   << MRI.getName(FPRCSRegs[Idx])
+                                   << " not saved\n");
+      return CU::UNWIND_ARM_MODE_DWARF;
+    } else if (Offset->second != CurOffset - 8) {
+      DEBUG_WITH_TYPE("compact-unwind",
+                      llvm::dbgs() << FloatRegCount << " D-regs saved, but "
+                                   << MRI.getName(FPRCSRegs[Idx])
+                                   << " saved at " << Offset->second
+                                   << ", expected at " << CurOffset - 8
+                                   << "\n");
+      return CU::UNWIND_ARM_MODE_DWARF;
+    }
+    CurOffset -= 8;
+  }
+
+  return CompactUnwindEncoding | ((FloatRegCount - 1) << 8);
+}
+
+static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
+  unsigned AK = ARM::parseArch(Arch);
+  switch (AK) {
+  default:
+    return MachO::CPU_SUBTYPE_ARM_V7;
+  case ARM::AK_ARMV4T:
+    return MachO::CPU_SUBTYPE_ARM_V4T;
+  case ARM::AK_ARMV5T:
+  case ARM::AK_ARMV5TE:
+  case ARM::AK_ARMV5TEJ:
+    return MachO::CPU_SUBTYPE_ARM_V5;
+  case ARM::AK_ARMV6:
+  case ARM::AK_ARMV6K:
+    return MachO::CPU_SUBTYPE_ARM_V6;
+  case ARM::AK_ARMV7A:
+    return MachO::CPU_SUBTYPE_ARM_V7;
+  case ARM::AK_ARMV7S:
+    return MachO::CPU_SUBTYPE_ARM_V7S;
+  case ARM::AK_ARMV7K:
+    return MachO::CPU_SUBTYPE_ARM_V7K;
+  case ARM::AK_ARMV6M:
+    return MachO::CPU_SUBTYPE_ARM_V6M;
+  case ARM::AK_ARMV7M:
+    return MachO::CPU_SUBTYPE_ARM_V7M;
+  case ARM::AK_ARMV7EM:
+    return MachO::CPU_SUBTYPE_ARM_V7EM;
+  }
+}
+
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TheTriple, StringRef CPU,
+                                        const MCTargetOptions &Options,
+                                        bool isLittle) {
+  switch (TheTriple.getObjectFormat()) {
+  default:
+    llvm_unreachable("unsupported object format");
+  case Triple::MachO: {
+    MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
+    return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS);
+  }
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+    return new ARMAsmBackendWinCOFF(T, TheTriple);
+  case Triple::ELF:
+    assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
+    uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+    return new ARMAsmBackendELF(T, TheTriple, OSABI, isLittle);
+  }
+}
+
+MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
+}
+
+MCAsmBackend *llvm::createARMBEAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
+}
+
+MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            const Triple &TT, StringRef CPU,
+                                            const MCTargetOptions &Options) {
+  return createARMAsmBackend(T, MRI, TT, CPU, Options, true);
+}
+
+MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            const Triple &TT, StringRef CPU,
+                                            const MCTargetOptions &Options) {
+  return createARMAsmBackend(T, MRI, TT, CPU, Options, false);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
new file mode 100644
index 000000000000..84caaacc47d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -0,0 +1,80 @@
+//===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+
+class ARMAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo *STI;
+  bool isThumbMode;    // Currently emitting Thumb code.
+  bool IsLittleEndian; // Big or little endian.
+public:
+  ARMAsmBackend(const Target &T, const Triple &TT, bool IsLittle)
+      : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+        isThumbMode(TT.getArchName().startswith("thumb")),
+        IsLittleEndian(IsLittle) {}
+
+  ~ARMAsmBackend() override { delete STI; }
+
+  unsigned getNumFixupKinds() const override {
+    return ARM::NumTargetFixupKinds;
+  }
+
+  bool hasNOP() const { return STI->getFeatureBits()[ARM::HasV6T2Ops]; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  /// processFixupValue - Target hook to process the literal value of a fixup
+  /// if necessary.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
+                            MCContext *Ctx, bool IsLittleEndian,
+                            bool IsResolved) const;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  unsigned getRelaxedOpcode(unsigned Op) const;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  const char *reasonForFixupRelaxation(const MCFixup &Fixup,
+                                       uint64_t Value) const;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void handleAssemblerFlag(MCAssemblerFlag Flag) override;
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+  bool isLittle() const { return IsLittleEndian; }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
new file mode 100644
index 000000000000..09dc0173ade6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -0,0 +1,36 @@
+//===-- ARMAsmBackendDarwin.h   ARM Asm Backend Darwin ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+
+#include "ARMAsmBackend.h"
+#include "llvm/Support/MachO.h"
+
+namespace llvm {
+class ARMAsmBackendDarwin : public ARMAsmBackend {
+  const MCRegisterInfo &MRI;
+public:
+  const MachO::CPUSubTypeARM Subtype;
+  ARMAsmBackendDarwin(const Target &T, const Triple &TT,
+                      const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
+      : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) {
+  }
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
+                                     Subtype);
+  }
+
+  uint32_t generateCompactUnwindEncoding(
+      ArrayRef<MCCFIInstruction> Instrs) const override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
new file mode 100644
index 000000000000..748f915be17b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
@@ -0,0 +1,31 @@
+//===-- ARMAsmBackendELF.h  ARM Asm Backend ELF -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+#define LLVM_LIB_TARGET_ARM_ELFARMASMBACKEND_H
+
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+using namespace llvm;
+
+namespace {
+class ARMAsmBackendELF : public ARMAsmBackend {
+public:
+  uint8_t OSABI;
+  ARMAsmBackendELF(const Target &T, const Triple &TT, uint8_t OSABI,
+                   bool IsLittle)
+      : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createARMELFObjectWriter(OS, OSABI, isLittle());
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
new file mode 100644
index 000000000000..2a375be49a83
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
@@ -0,0 +1,27 @@
+//===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+
+#include "ARMAsmBackend.h"
+using namespace llvm;
+
+namespace {
+class ARMAsmBackendWinCOFF : public ARMAsmBackend {
+public:
+  ARMAsmBackendWinCOFF(const Target &T, const Triple &TheTriple)
+      : ARMAsmBackend(T, TheTriple, true) {}
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
new file mode 100644
index 000000000000..088b4205ed62
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -0,0 +1,466 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
+    EQ,            // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Carry set                  >, ==, or unordered
+    LO,            // Carry clear                Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Not unordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Greater than               Greater than
+    LE,            // Less than or equal         <, ==, or unordered
+    AL             // Always (unconditional)     Always (unconditional)
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+} // namespace ARMCC
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+  llvm_unreachable("Unknown condition code");
+}
+
+namespace ARM_PROC {
+  enum IMod {
+    IE = 2,
+    ID = 3
+  };
+
+  enum IFlags {
+    F = 1,
+    I = 2,
+    A = 4
+  };
+
+  inline static const char *IFlagsToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown iflags operand");
+    case F: return "f";
+    case I: return "i";
+    case A: return "a";
+    }
+  }
+
+  inline static const char *IModToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown imod operand");
+    case IE: return "ie";
+    case ID: return "id";
+    }
+  }
+}
+
+namespace ARM_MB {
+  // The Memory Barrier Option constants map directly to the 4-bit encoding of
+  // the option field for memory barrier operations.
+  enum MemBOpt {
+    RESERVED_0 = 0,
+    OSHLD = 1,
+    OSHST = 2,
+    OSH   = 3,
+    RESERVED_4 = 4,
+    NSHLD = 5,
+    NSHST = 6,
+    NSH   = 7,
+    RESERVED_8 = 8,
+    ISHLD = 9,
+    ISHST = 10,
+    ISH   = 11,
+    RESERVED_12 = 12,
+    LD = 13,
+    ST    = 14,
+    SY    = 15
+  };
+
+  inline static const char *MemBOptToString(unsigned val, bool HasV8) {
+    switch (val) {
+    default: llvm_unreachable("Unknown memory operation");
+    case SY:    return "sy";
+    case ST:    return "st";
+    case LD: return HasV8 ? "ld" : "#0xd";
+    case RESERVED_12: return "#0xc";
+    case ISH:   return "ish";
+    case ISHST: return "ishst";
+    case ISHLD: return HasV8 ?  "ishld" : "#0x9";
+    case RESERVED_8: return "#0x8";
+    case NSH:   return "nsh";
+    case NSHST: return "nshst";
+    case NSHLD: return HasV8 ? "nshld" : "#0x5";
+    case RESERVED_4: return "#0x4";
+    case OSH:   return "osh";
+    case OSHST: return "oshst";
+    case OSHLD: return HasV8 ? "oshld" : "#0x1";
+    case RESERVED_0: return "#0x0";
+    }
+  }
+} // namespace ARM_MB
+
+namespace ARM_ISB {
+  enum InstSyncBOpt {
+    RESERVED_0 = 0,
+    RESERVED_1 = 1,
+    RESERVED_2 = 2,
+    RESERVED_3 = 3,
+    RESERVED_4 = 4,
+    RESERVED_5 = 5,
+    RESERVED_6 = 6,
+    RESERVED_7 = 7,
+    RESERVED_8 = 8,
+    RESERVED_9 = 9,
+    RESERVED_10 = 10,
+    RESERVED_11 = 11,
+    RESERVED_12 = 12,
+    RESERVED_13 = 13,
+    RESERVED_14 = 14,
+    SY = 15
+  };
+
+  inline static const char *InstSyncBOptToString(unsigned val) {
+    switch (val) {
+    default:
+      llvm_unreachable("Unknown memory operation");
+      case RESERVED_0:  return "#0x0";
+      case RESERVED_1:  return "#0x1";
+      case RESERVED_2:  return "#0x2";
+      case RESERVED_3:  return "#0x3";
+      case RESERVED_4:  return "#0x4";
+      case RESERVED_5:  return "#0x5";
+      case RESERVED_6:  return "#0x6";
+      case RESERVED_7:  return "#0x7";
+      case RESERVED_8:  return "#0x8";
+      case RESERVED_9:  return "#0x9";
+      case RESERVED_10: return "#0xa";
+      case RESERVED_11: return "#0xb";
+      case RESERVED_12: return "#0xc";
+      case RESERVED_13: return "#0xd";
+      case RESERVED_14: return "#0xe";
+      case SY:          return "sy";
+    }
+  }
+} // namespace ARM_ISB
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+
+  /// ARM Index Modes
+  enum IndexMode {
+    IndexModeNone  = 0,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+    IndexModeUpd   = 3
+  };
+
+  /// ARM Addressing Modes
+  enum AddrMode {
+    AddrModeNone    = 0,
+    AddrMode1       = 1,
+    AddrMode2       = 2,
+    AddrMode3       = 3,
+    AddrMode4       = 4,
+    AddrMode5       = 5,
+    AddrMode6       = 6,
+    AddrModeT1_1    = 7,
+    AddrModeT1_2    = 8,
+    AddrModeT1_4    = 9,
+    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
+    AddrModeT2_i12  = 11,
+    AddrModeT2_i8   = 12,
+    AddrModeT2_so   = 13,
+    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
+    AddrModeT2_i8s4 = 15, // i8 * 4
+    AddrMode_i12    = 16
+  };
+
+  inline static const char *AddrModeToString(AddrMode addrmode) {
+    switch (addrmode) {
+    case AddrModeNone:    return "AddrModeNone";
+    case AddrMode1:       return "AddrMode1";
+    case AddrMode2:       return "AddrMode2";
+    case AddrMode3:       return "AddrMode3";
+    case AddrMode4:       return "AddrMode4";
+    case AddrMode5:       return "AddrMode5";
+    case AddrMode6:       return "AddrMode6";
+    case AddrModeT1_1:    return "AddrModeT1_1";
+    case AddrModeT1_2:    return "AddrModeT1_2";
+    case AddrModeT1_4:    return "AddrModeT1_4";
+    case AddrModeT1_s:    return "AddrModeT1_s";
+    case AddrModeT2_i12:  return "AddrModeT2_i12";
+    case AddrModeT2_i8:   return "AddrModeT2_i8";
+    case AddrModeT2_so:   return "AddrModeT2_so";
+    case AddrModeT2_pc:   return "AddrModeT2_pc";
+    case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
+    case AddrMode_i12:    return "AddrMode_i12";
+    }
+  }
+
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG = 0,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16 = 0x1,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16 = 0x2,
+
+    /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
+    /// just that part of the flag set.
+    MO_OPTION_MASK = 0x1f,
+
+    /// MO_DLLIMPORT - On a symbol operand, this represents that the reference
+    /// to the symbol is for an import stub.  This is used for DLL import
+    /// storage class indication on Windows.
+    MO_DLLIMPORT = 0x20,
+
+    /// MO_SECREL - On a symbol operand this indicates that the immediate is
+    /// the offset from beginning of section.
+    ///
+    /// This is the TLS offset for the COFF/Windows TLS mechanism.
+    MO_SECREL = 0x40,
+
+    /// MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it
+    /// represents a symbol which, if indirect, will get special Darwin mangling
+    /// as a non-lazy-ptr indirect symbol (i.e. "L_FOO$non_lazy_ptr"). Can be
+    /// combined with MO_LO16, MO_HI16 or MO_NO_FLAG (in a constant-pool, for
+    /// example).
+    MO_NONLAZY = 0x80,
+
+    // It's undefined behaviour if an enum overflows the range between its
+    // smallest and largest values, but since these are |ed together, it can
+    // happen. Put a sentinel in (values of this enum are stored as "unsigned
+    // char").
+    MO_UNUSED_MAXIMUM = 0xff
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.
+    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+    // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
+    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
+    // The index mode enums are declared in ARMBaseInfo.h
+    IndexModeShift = 5,
+    IndexModeMask  = 3 << IndexModeShift,
+
+    //===------------------------------------------------------------------===//
+    // Instruction encoding formats.
+    //
+    FormShift     = 7,
+    FormMask      = 0x3f << FormShift,
+
+    // Pseudo instructions
+    Pseudo        = 0  << FormShift,
+
+    // Multiply instructions
+    MulFrm        = 1  << FormShift,
+
+    // Branch instructions
+    BrFrm         = 2  << FormShift,
+    BrMiscFrm     = 3  << FormShift,
+
+    // Data Processing instructions
+    DPFrm         = 4  << FormShift,
+    DPSoRegFrm    = 5  << FormShift,
+
+    // Load and Store
+    LdFrm         = 6  << FormShift,
+    StFrm         = 7  << FormShift,
+    LdMiscFrm     = 8  << FormShift,
+    StMiscFrm     = 9  << FormShift,
+    LdStMulFrm    = 10 << FormShift,
+
+    LdStExFrm     = 11 << FormShift,
+
+    // Miscellaneous arithmetic instructions
+    ArithMiscFrm  = 12 << FormShift,
+    SatFrm        = 13 << FormShift,
+
+    // Extend instructions
+    ExtFrm        = 14 << FormShift,
+
+    // VFP formats
+    VFPUnaryFrm   = 15 << FormShift,
+    VFPBinaryFrm  = 16 << FormShift,
+    VFPConv1Frm   = 17 << FormShift,
+    VFPConv2Frm   = 18 << FormShift,
+    VFPConv3Frm   = 19 << FormShift,
+    VFPConv4Frm   = 20 << FormShift,
+    VFPConv5Frm   = 21 << FormShift,
+    VFPLdStFrm    = 22 << FormShift,
+    VFPLdStMulFrm = 23 << FormShift,
+    VFPMiscFrm    = 24 << FormShift,
+
+    // Thumb format
+    ThumbFrm      = 25 << FormShift,
+
+    // Miscelleaneous format
+    MiscFrm       = 26 << FormShift,
+
+    // NEON formats
+    NGetLnFrm     = 27 << FormShift,
+    NSetLnFrm     = 28 << FormShift,
+    NDupFrm       = 29 << FormShift,
+    NLdStFrm      = 30 << FormShift,
+    N1RegModImmFrm= 31 << FormShift,
+    N2RegFrm      = 32 << FormShift,
+    NVCVTFrm      = 33 << FormShift,
+    NVDupLnFrm    = 34 << FormShift,
+    N2RegVShLFrm  = 35 << FormShift,
+    N2RegVShRFrm  = 36 << FormShift,
+    N3RegFrm      = 37 << FormShift,
+    N3RegVShFrm   = 38 << FormShift,
+    NVExtFrm      = 39 << FormShift,
+    NVMulSLFrm    = 40 << FormShift,
+    NVTBLFrm      = 41 << FormShift,
+
+    //===------------------------------------------------------------------===//
+    // Misc flags.
+
+    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+    // it doesn't have a Rn operand.
+    UnaryDP       = 1 << 13,
+
+    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+    // a 16-bit Thumb instruction if certain conditions are met.
+    Xform16Bit    = 1 << 14,
+
+    // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+    // instruction. Used by the parser to determine whether to require the 'S'
+    // suffix on the mnemonic (when not in an IT block) or preclude it (when
+    // in an IT block).
+    ThumbArithFlagSetting = 1 << 18,
+
+    //===------------------------------------------------------------------===//
+    // Code domain.
+    DomainShift   = 15,
+    DomainMask    = 7 << DomainShift,
+    DomainGeneral = 0 << DomainShift,
+    DomainVFP     = 1 << DomainShift,
+    DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,
+
+    //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set field while generating
+    // machine instructions.
+    //
+    // FIXME: This list will need adjusting/fixing as the MC code emitter
+    // takes shape and the ARMCodeEmitter.cpp bits go away.
+    ShiftTypeShift = 4,
+
+    M_BitShift     = 5,
+    ShiftImmShift  = 5,
+    ShiftShift     = 7,
+    N_BitShift     = 7,
+    ImmHiShift     = 8,
+    SoRotImmShift  = 8,
+    RegRsShift     = 8,
+    ExtRotImmShift = 10,
+    RegRdLoShift   = 12,
+    RegRdShift     = 12,
+    RegRdHiShift   = 16,
+    RegRnShift     = 16,
+    S_BitShift     = 20,
+    W_BitShift     = 21,
+    AM3_I_BitShift = 22,
+    D_BitShift     = 22,
+    U_BitShift     = 23,
+    P_BitShift     = 24,
+    I_BitShift     = 25,
+    CondShift      = 28
+  };
+
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
new file mode 100644
index 000000000000..6f19754b899e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -0,0 +1,289 @@
+//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+    enum { DefaultEABIVersion = 0x05000000U };
+    unsigned GetRelocTypeInner(const MCValue &Target,
+                               const MCFixup &Fixup,
+                               bool IsPCRel) const;
+
+
+  public:
+    ARMELFObjectWriter(uint8_t OSABI);
+
+    ~ARMELFObjectWriter() override;
+
+    unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                          const MCFixup &Fixup, bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                                 unsigned Type) const override;
+  };
+}
+
+ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
+                            ELF::EM_ARM,
+                            /*HasRelocationAddend*/ false) {}
+
+ARMELFObjectWriter::~ARMELFObjectWriter() {}
+
+bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                 unsigned Type) const {
+  // FIXME: This is extremely conservative. This really needs to use a
+  // whitelist with a clear explanation for why each realocation needs to
+  // point to the symbol, not to the section.
+  switch (Type) {
+  default:
+    return true;
+
+  case ELF::R_ARM_PREL31:
+  case ELF::R_ARM_ABS32:
+    return false;
+  }
+}
+
+// Need to examine the Fixup when determining whether to 
+// emit the relocation as an explicit symbol or as a section relative
+// offset
+unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  return GetRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const  {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+
+  unsigned Type = 0;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      report_fatal_error("unsupported relocation on symbol");
+      return ELF::R_ARM_NONE;
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_REL32;
+        break;
+      case MCSymbolRefExpr::VK_TLSGD:
+        llvm_unreachable("unimplemented");
+      case MCSymbolRefExpr::VK_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_GOT_PREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
+      case MCSymbolRefExpr::VK_ARM_PREL31:
+        Type = ELF::R_ARM_PREL31;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_blx:
+    case ARM::fixup_arm_uncondbl:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_PLT:
+        Type = ELF::R_ARM_CALL;
+        break;
+      case MCSymbolRefExpr::VK_TLSCALL:
+        Type = ELF::R_ARM_TLS_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_CALL;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_condbl:
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_t2_condbranch:
+      Type = ELF::R_ARM_THM_JUMP19;
+      break;
+    case ARM::fixup_t2_uncondbranch:
+      Type = ELF::R_ARM_THM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+      Type = ELF::R_ARM_MOVT_PREL;
+      break;
+    case ARM::fixup_arm_movw_lo16:
+      Type = ELF::R_ARM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_t2_movt_hi16:
+      Type = ELF::R_ARM_THM_MOVT_PREL;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+      Type = ELF::R_ARM_THM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_arm_thumb_br:
+      Type = ELF::R_ARM_THM_JUMP11;
+      break;
+    case ARM::fixup_arm_thumb_bcc:
+      Type = ELF::R_ARM_THM_JUMP8;
+      break;
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_blx:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_TLSCALL:
+        Type = ELF::R_ARM_THM_TLS_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_THM_CALL;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      report_fatal_error("unsupported relocation on symbol");
+      return ELF::R_ARM_NONE;
+    case FK_Data_1:
+      switch (Modifier) {
+      default: llvm_unreachable("unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS8;
+        break;
+      }
+      break;
+    case FK_Data_2:
+      switch (Modifier) {
+      default: llvm_unreachable("unsupported modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS16;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_ARM_NONE:
+        Type = ELF::R_ARM_NONE;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_ARM_GOT_BREL;
+        break;
+      case MCSymbolRefExpr::VK_TLSGD:
+        Type = ELF::R_ARM_TLS_GD32;
+        break;
+      case MCSymbolRefExpr::VK_TPOFF:
+        Type = ELF::R_ARM_TLS_LE32;
+        break;
+      case MCSymbolRefExpr::VK_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS32;
+        break;
+      case MCSymbolRefExpr::VK_GOTOFF:
+        Type = ELF::R_ARM_GOTOFF32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_GOT_PREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET1:
+        Type = ELF::R_ARM_TARGET1;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET2:
+        Type = ELF::R_ARM_TARGET2;
+        break;
+      case MCSymbolRefExpr::VK_ARM_PREL31:
+        Type = ELF::R_ARM_PREL31;
+        break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF::R_ARM_SBREL32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSLDO:
+        Type = ELF::R_ARM_TLS_LDO32;
+        break;
+      case MCSymbolRefExpr::VK_TLSCALL:
+        Type = ELF::R_ARM_TLS_CALL;
+        break;
+      case MCSymbolRefExpr::VK_TLSDESC:
+        Type = ELF::R_ARM_TLS_GOTDESC;
+        break;
+      case MCSymbolRefExpr::VK_TLSLDM:
+        Type = ELF::R_ARM_TLS_LDM32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSDESCSEQ:
+        Type = ELF::R_ARM_TLS_DESCSEQ;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_ldst_pcrel_12:
+    case ARM::fixup_arm_pcrel_10:
+    case ARM::fixup_arm_adr_pcrel_12:
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_cb:
+    case ARM::fixup_arm_thumb_cp:
+    case ARM::fixup_arm_thumb_br:
+      llvm_unreachable("Unimplemented");
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_MOVT_ABS;
+        break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF:: R_ARM_MOVT_BREL;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_movw_lo16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_MOVW_ABS_NC;
+        break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF:: R_ARM_MOVW_BREL_NC;
+        break;
+      }
+      break;
+    case ARM::fixup_t2_movt_hi16:
+      Type = ELF::R_ARM_THM_MOVT_ABS;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+      Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+      break;
+    }
+  }
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createARMELFObjectWriter(raw_pwrite_stream &OS,
+                                               uint8_t OSABI,
+                                               bool IsLittleEndian) {
+  MCELFObjectTargetWriter *MOTW = new ARMELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
new file mode 100644
index 000000000000..f6bb35d2326b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -0,0 +1,1403 @@
+//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits ARM ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($a, $t and $d) to
+// delimit regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+#include "ARMUnwindOpAsm.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
+  assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+         "Invalid personality index");
+  return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
+}
+
+namespace {
+
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+  formatted_raw_ostream &OS;
+  MCInstPrinter &InstPrinter;
+  bool IsVerboseAsm;
+
+  void emitFnStart() override;
+  void emitFnEnd() override;
+  void emitCantUnwind() override;
+  void emitPersonality(const MCSymbol *Personality) override;
+  void emitPersonalityIndex(unsigned Index) override;
+  void emitHandlerData() override;
+  void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+  void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+  void emitPad(int64_t Offset) override;
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                   bool isVector) override;
+  void emitUnwindRaw(int64_t Offset,
+                     const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+  void switchVendor(StringRef Vendor) override;
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+                            StringRef StringValue) override;
+  void emitArch(unsigned Arch) override;
+  void emitArchExtension(unsigned ArchExt) override;
+  void emitObjectArch(unsigned Arch) override;
+  void emitFPU(unsigned FPU) override;
+  void emitInst(uint32_t Inst, char Suffix = '\0') override;
+  void finishAttributeSection() override;
+
+  void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+public:
+  ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                       MCInstPrinter &InstPrinter, bool VerboseAsm);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
+                                           formatted_raw_ostream &OS,
+                                           MCInstPrinter &InstPrinter,
+                                           bool VerboseAsm)
+    : ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
+      IsVerboseAsm(VerboseAsm) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+  OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
+  OS << "\t.personalityindex " << Index << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  OS << "\t.setfp\t";
+  InstPrinter.printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter.printRegName(OS, SpReg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  assert((Reg != ARM::SP && Reg != ARM::PC) &&
+         "the operand of .movsp cannot be either sp or pc");
+
+  OS << "\t.movsp\t";
+  InstPrinter.printRegName(OS, Reg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+  OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+  if (isVector)
+    OS << "\t.vsave\t{";
+  else
+    OS << "\t.save\t{";
+
+  InstPrinter.printRegName(OS, RegList[0]);
+
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter.printRegName(OS, RegList[i]);
+  }
+
+  OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
+  if (IsVerboseAsm) {
+    StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+    if (!Name.empty())
+      OS << "\t@ " << Name;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef String) {
+  switch (Attribute) {
+  case ARMBuildAttrs::CPU_name:
+    OS << "\t.cpu\t" << String.lower();
+    break;
+  default:
+    OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
+    if (IsVerboseAsm) {
+      StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+      if (!Name.empty())
+        OS << "\t@ " << Name;
+    }
+    break;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
+                                                unsigned IntValue,
+                                                StringRef StringValue) {
+  switch (Attribute) {
+  default: llvm_unreachable("unsupported multi-value attribute in asm mode");
+  case ARMBuildAttrs::compatibility:
+    OS << "\t.eabi_attribute\t" << Attribute << ", " << IntValue;
+    if (!StringValue.empty())
+      OS << ", \"" << StringValue << "\"";
+    if (IsVerboseAsm)
+      OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+    break;
+  }
+  OS << "\n";
+}
+void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
+  OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
+}
+void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
+  OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
+}
+void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
+  OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n';
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+  OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+void
+ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+  OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
+}
+
+void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+
+  OS << "\t.thumb_set\t";
+  Symbol->print(OS, MAI);
+  OS << ", ";
+  Value->print(OS, MAI);
+  OS << '\n';
+}
+
+void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
+  OS << "\t.inst";
+  if (Suffix)
+    OS << "." << Suffix;
+  OS << "\t0x" << Twine::utohexstr(Inst) << "\n";
+}
+
+void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+  OS << "\t.unwind_raw " << Offset;
+  for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(),
+                                                OCE = Opcodes.end();
+       OCI != OCE; ++OCI)
+    OS << ", 0x" << Twine::utohexstr(*OCI);
+  OS << '\n';
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+  // This structure holds all attributes, accounting for
+  // their string/numeric value, so we can later emit them
+  // in declaration order, keeping all in the same vector
+  struct AttributeItem {
+    enum {
+      HiddenAttribute = 0,
+      NumericAttribute,
+      TextAttribute,
+      NumericAndTextAttributes
+    } Type;
+    unsigned Tag;
+    unsigned IntValue;
+    std::string StringValue;
+
+    static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+      // The conformance tag must be emitted first when serialised
+      // into an object file. Specifically, the addenda to the ARM ABI
+      // states that (2.3.7.4):
+      //
+      // "To simplify recognition by consumers in the common case of
+      // claiming conformity for the whole file, this tag should be
+      // emitted first in a file-scope sub-subsection of the first
+      // public subsection of the attributes section."
+      //
+      // So it is special-cased in this comparison predicate when the
+      // attributes are sorted in finishAttributeSection().
+      return (RHS.Tag != ARMBuildAttrs::conformance) &&
+             ((LHS.Tag == ARMBuildAttrs::conformance) || (LHS.Tag < RHS.Tag));
+    }
+  };
+
+  StringRef CurrentVendor;
+  unsigned FPU;
+  unsigned Arch;
+  unsigned EmittedArch;
+  SmallVector<AttributeItem, 64> Contents;
+
+  MCSection *AttributeSection;
+
+  AttributeItem *getAttributeItem(unsigned Attribute) {
+    for (size_t i = 0; i < Contents.size(); ++i)
+      if (Contents[i].Tag == Attribute)
+        return &Contents[i];
+    return nullptr;
+  }
+
+  void setAttributeItem(unsigned Attribute, unsigned Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::NumericAttribute;
+      Item->IntValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAttribute,
+      Attribute,
+      Value,
+      StringRef("")
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItem(unsigned Attribute, StringRef Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::TextAttribute;
+      Item->StringValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::TextAttribute,
+      Attribute,
+      0,
+      Value
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItems(unsigned Attribute, unsigned IntValue,
+                         StringRef StringValue, bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->Type = AttributeItem::NumericAndTextAttributes;
+      Item->IntValue = IntValue;
+      Item->StringValue = StringValue;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAndTextAttributes,
+      Attribute,
+      IntValue,
+      StringValue
+    };
+    Contents.push_back(Item);
+  }
+
+  void emitArchDefaultAttributes();
+  void emitFPUDefaultAttributes();
+
+  ARMELFStreamer &getStreamer();
+
+  void emitFnStart() override;
+  void emitFnEnd() override;
+  void emitCantUnwind() override;
+  void emitPersonality(const MCSymbol *Personality) override;
+  void emitPersonalityIndex(unsigned Index) override;
+  void emitHandlerData() override;
+  void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0) override;
+  void emitMovSP(unsigned Reg, int64_t Offset = 0) override;
+  void emitPad(int64_t Offset) override;
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                   bool isVector) override;
+  void emitUnwindRaw(int64_t Offset,
+                     const SmallVectorImpl<uint8_t> &Opcodes) override;
+
+  void switchVendor(StringRef Vendor) override;
+  void emitAttribute(unsigned Attribute, unsigned Value) override;
+  void emitTextAttribute(unsigned Attribute, StringRef String) override;
+  void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+                            StringRef StringValue) override;
+  void emitArch(unsigned Arch) override;
+  void emitObjectArch(unsigned Arch) override;
+  void emitFPU(unsigned FPU) override;
+  void emitInst(uint32_t Inst, char Suffix = '\0') override;
+  void finishAttributeSection() override;
+  void emitLabel(MCSymbol *Symbol) override;
+
+  void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
+
+  size_t calculateContentSize() const;
+
+  // Reset state between object emissions
+  void reset() override;
+
+public:
+  ARMTargetELFStreamer(MCStreamer &S)
+    : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID),
+      Arch(ARM::AK_INVALID), EmittedArch(ARM::AK_INVALID),
+      AttributeSection(nullptr) {}
+};
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
+///
+/// In brief: $a, $t or $d should be emitted at the start of each contiguous
+/// region of ARM code, Thumb code or data in a section. In practice, this
+/// emission does not rely on explicit assembler directives but on inherent
+/// properties of the directives doing the emission (e.g. ".byte" is data, "add
+/// r0, r0, r0" an instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class ARMELFStreamer : public MCELFStreamer {
+public:
+  friend class ARMTargetELFStreamer;
+
+  ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
+                 MCCodeEmitter *Emitter, bool IsThumb)
+      : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
+        MappingSymbolCounter(0), LastEMS(EMS_None) {
+    EHReset();
+  }
+
+  ~ARMELFStreamer() {}
+
+  void FinishImpl() override;
+
+  // ARM exception handling directives
+  void emitFnStart();
+  void emitFnEnd();
+  void emitCantUnwind();
+  void emitPersonality(const MCSymbol *Per);
+  void emitPersonalityIndex(unsigned index);
+  void emitHandlerData();
+  void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+  void emitMovSP(unsigned Reg, int64_t Offset = 0);
+  void emitPad(int64_t Offset);
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
+  void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
+
+  void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection().first] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section, Subsection);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  void EmitInstruction(const MCInst& Inst,
+                       const MCSubtargetInfo &STI) override {
+    if (IsThumb)
+      EmitThumbMappingSymbol();
+    else
+      EmitARMMappingSymbol();
+
+    MCELFStreamer::EmitInstruction(Inst, STI);
+  }
+
+  void emitInst(uint32_t Inst, char Suffix) {
+    unsigned Size;
+    char Buffer[4];
+    const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+    switch (Suffix) {
+    case '\0':
+      Size = 4;
+
+      assert(!IsThumb);
+      EmitARMMappingSymbol();
+      for (unsigned II = 0, IE = Size; II != IE; II++) {
+        const unsigned I = LittleEndian ? (Size - II - 1) : II;
+        Buffer[Size - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+      }
+
+      break;
+    case 'n':
+    case 'w':
+      Size = (Suffix == 'n' ? 2 : 4);
+
+      assert(IsThumb);
+      EmitThumbMappingSymbol();
+      for (unsigned II = 0, IE = Size; II != IE; II = II + 2) {
+        const unsigned I0 = LittleEndian ? II + 0 : (Size - II - 1);
+        const unsigned I1 = LittleEndian ? II + 1 : (Size - II - 2);
+        Buffer[Size - II - 2] = uint8_t(Inst >> I0 * CHAR_BIT);
+        Buffer[Size - II - 1] = uint8_t(Inst >> I1 * CHAR_BIT);
+      }
+
+      break;
+    default:
+      llvm_unreachable("Invalid Suffix");
+    }
+
+    MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  void EmitBytes(StringRef Data) override {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+    if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
+        getContext().reportError(Loc, "relocated expression must be 32-bit");
+        return;
+      }
+
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+  }
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
+    MCELFStreamer::EmitAssemblerFlag(Flag);
+
+    switch (Flag) {
+    case MCAF_SyntaxUnified:
+      return; // no-op here.
+    case MCAF_Code16:
+      IsThumb = true;
+      return; // Change to Thumb mode
+    case MCAF_Code32:
+      IsThumb = false;
+      return; // Change to ARM mode
+    case MCAF_Code64:
+      return;
+    case MCAF_SubsectionsViaSymbols:
+      return;
+    }
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_ARM,
+    EMS_Thumb,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data) return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitThumbMappingSymbol() {
+    if (LastEMS == EMS_Thumb) return;
+    EmitMappingSymbol("$t");
+    LastEMS = EMS_Thumb;
+  }
+
+  void EmitARMMappingSymbol() {
+    if (LastEMS == EMS_ARM) return;
+    EmitMappingSymbol("$a");
+    LastEMS = EMS_ARM;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
+    EmitLabel(Symbol);
+
+    Symbol->setType(ELF::STT_NOTYPE);
+    Symbol->setBinding(ELF::STB_LOCAL);
+    Symbol->setExternal(false);
+  }
+
+  void EmitThumbFunc(MCSymbol *Func) override {
+    getAssembler().setIsThumbFunc(Func);
+    EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
+  }
+
+  // Helper functions for ARM exception handling directives
+  void EHReset();
+
+  // Reset state between object emissions
+  void reset() override;
+
+  void EmitPersonalityFixup(StringRef Name);
+  void FlushPendingOffset();
+  void FlushUnwindOpcodes(bool NoHandlerData);
+
+  void SwitchToEHSection(StringRef Prefix, unsigned Type, unsigned Flags,
+                         SectionKind Kind, const MCSymbol &Fn);
+  void SwitchToExTabSection(const MCSymbol &FnStart);
+  void SwitchToExIdxSection(const MCSymbol &FnStart);
+
+  void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
+
+  bool IsThumb;
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+
+  // ARM Exception Handling Frame Information
+  MCSymbol *ExTab;
+  MCSymbol *FnStart;
+  const MCSymbol *Personality;
+  unsigned PersonalityIndex;
+  unsigned FPReg; // Frame pointer register
+  int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp)
+  int64_t SPOffset; // Offset: (final $sp) - (initial $sp)
+  int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp)
+  bool UsedFP;
+  bool CantUnwind;
+  SmallVector<uint8_t, 64> Opcodes;
+  UnwindOpcodeAssembler UnwindOpAsm;
+};
+} // end anonymous namespace
+
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+  return static_cast<ARMELFStreamer &>(Streamer);
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+  getStreamer().emitPersonality(Personality);
+}
+void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
+  getStreamer().emitPersonalityIndex(Index);
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+  getStreamer().emitHandlerData();
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  getStreamer().emitSetFP(FpReg, SpReg, Offset);
+}
+void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  getStreamer().emitMovSP(Reg, Offset);
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+  getStreamer().emitPad(Offset);
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  getStreamer().emitRegSave(RegList, isVector);
+}
+void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+  getStreamer().emitUnwindRaw(Offset, Opcodes);
+}
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+  assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+  if (CurrentVendor == Vendor)
+    return;
+
+  if (!CurrentVendor.empty())
+    finishAttributeSection();
+
+  assert(Contents.empty() &&
+         ".ARM.attributes should be flushed before changing vendor");
+  CurrentVendor = Vendor;
+
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
+                                                unsigned IntValue,
+                                                StringRef StringValue) {
+  setAttributeItems(Attribute, IntValue, StringValue,
+                    /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitArch(unsigned Value) {
+  Arch = Value;
+}
+void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
+  EmittedArch = Value;
+}
+void ARMTargetELFStreamer::emitArchDefaultAttributes() {
+  using namespace ARMBuildAttrs;
+
+  setAttributeItem(CPU_name,
+                   ARM::getCPUAttr(Arch),
+                   false);
+
+  if (EmittedArch == ARM::AK_INVALID)
+    setAttributeItem(CPU_arch,
+                     ARM::getArchAttr(Arch),
+                     false);
+  else
+    setAttributeItem(CPU_arch,
+                     ARM::getArchAttr(EmittedArch),
+                     false);
+
+  switch (Arch) {
+  case ARM::AK_ARMV2:
+  case ARM::AK_ARMV2A:
+  case ARM::AK_ARMV3:
+  case ARM::AK_ARMV3M:
+  case ARM::AK_ARMV4:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    break;
+
+  case ARM::AK_ARMV4T:
+  case ARM::AK_ARMV5T:
+  case ARM::AK_ARMV5TE:
+  case ARM::AK_ARMV6:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    break;
+
+  case ARM::AK_ARMV6T2:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::AK_ARMV6K:
+  case ARM::AK_ARMV6KZ:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(Virtualization_use, AllowTZ, false);
+    break;
+
+  case ARM::AK_ARMV6M:
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    break;
+
+  case ARM::AK_ARMV7A:
+    setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::AK_ARMV7R:
+    setAttributeItem(CPU_arch_profile, RealTimeProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::AK_ARMV7M:
+    setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    break;
+
+  case ARM::AK_ARMV8A:
+  case ARM::AK_ARMV8_1A:
+  case ARM::AK_ARMV8_2A:
+    setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
+    setAttributeItem(MPextension_use, Allowed, false);
+    setAttributeItem(Virtualization_use, AllowTZVirtualization, false);
+    break;
+
+  case ARM::AK_ARMV8MBaseline:
+  case ARM::AK_ARMV8MMainline:
+    setAttributeItem(THUMB_ISA_use, AllowThumbDerived, false);
+    setAttributeItem(CPU_arch_profile, MicroControllerProfile, false);
+    break;
+
+  case ARM::AK_IWMMXT:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(WMMX_arch, AllowWMMXv1, false);
+    break;
+
+  case ARM::AK_IWMMXT2:
+    setAttributeItem(ARM_ISA_use, Allowed, false);
+    setAttributeItem(THUMB_ISA_use, Allowed, false);
+    setAttributeItem(WMMX_arch, AllowWMMXv2, false);
+    break;
+
+  default:
+    report_fatal_error("Unknown Arch: " + Twine(Arch));
+    break;
+  }
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+  FPU = Value;
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+  switch (FPU) {
+  case ARM::FK_VFP:
+  case ARM::FK_VFPV2:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV3:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV3_FP16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+                     ARMBuildAttrs::AllowHPFP,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV3_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV3_D16_FP16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+                     ARMBuildAttrs::AllowHPFP,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV3XD:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    break;
+  case ARM::FK_VFPV3XD_FP16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+                     ARMBuildAttrs::AllowHPFP,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_VFPV4:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  // ABI_HardFP_use is handled in ARMAsmPrinter, so _SP_D16 is treated the same
+  // as _D16 here.
+  case ARM::FK_FPV4_SP_D16:
+  case ARM::FK_VFPV4_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  // FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so
+  // uses the FP_ARMV8_D16 build attribute.
+  case ARM::FK_FPV5_SP_D16:
+  case ARM::FK_FPV5_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_NEON:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_NEON_FP16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::FP_HP_extension,
+                     ARMBuildAttrs::AllowHPFP,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_NEON_VFPV4:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FK_NEON_FP_ARMV8:
+  case ARM::FK_CRYPTO_NEON_FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    // 'Advanced_SIMD_arch' must be emitted not here, but within
+    // ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a()
+    break;
+
+  case ARM::FK_SOFTVFP:
+  case ARM::FK_NONE:
+    break;
+
+  default:
+    report_fatal_error("Unknown FPU: " + Twine(FPU));
+    break;
+  }
+}
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    switch (item.Type) {
+    case AttributeItem::HiddenAttribute:
+      break;
+    case AttributeItem::NumericAttribute:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Result += getULEB128Size(item.Tag);
+      Result += item.StringValue.size() + 1; // string + '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes:
+      Result += getULEB128Size(item.Tag);
+      Result += getULEB128Size(item.IntValue);
+      Result += item.StringValue.size() + 1; // string + '\0';
+      break;
+    }
+  }
+  return Result;
+}
+void ARMTargetELFStreamer::finishAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (FPU != ARM::FK_INVALID)
+    emitFPUDefaultAttributes();
+
+  if (Arch != ARM::AK_INVALID)
+    emitArchDefaultAttributes();
+
+  if (Contents.empty())
+    return;
+
+  std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+  ARMELFStreamer &Streamer = getStreamer();
+
+  // Switch to .ARM.attributes section
+  if (AttributeSection) {
+    Streamer.SwitchSection(AttributeSection);
+  } else {
+    AttributeSection = Streamer.getContext().getELFSection(
+        ".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
+    Streamer.SwitchSection(AttributeSection);
+
+    // Format version
+    Streamer.EmitIntValue(0x41, 1);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+  Streamer.EmitBytes(CurrentVendor);
+  Streamer.EmitIntValue(0, 1); // '\0'
+
+  Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+  Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String)
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    Streamer.EmitULEB128IntValue(item.Tag);
+    switch (item.Type) {
+    default: llvm_unreachable("Invalid attribute type");
+    case AttributeItem::NumericAttribute:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Streamer.EmitBytes(item.StringValue);
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    case AttributeItem::NumericAndTextAttributes:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      Streamer.EmitBytes(item.StringValue);
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+  FPU = ARM::FK_INVALID;
+}
+
+void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+  ARMELFStreamer &Streamer = getStreamer();
+  if (!Streamer.IsThumb)
+    return;
+
+  Streamer.getAssembler().registerSymbol(*Symbol);
+  unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+  if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
+    Streamer.EmitThumbFunc(Symbol);
+}
+
+void
+ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
+  getStreamer().EmitFixup(S, FK_Data_4);
+}
+
+void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
+    const MCSymbol &Sym = SRE->getSymbol();
+    if (!Sym.isDefined()) {
+      getStreamer().EmitAssignment(Symbol, Value);
+      return;
+    }
+  }
+
+  getStreamer().EmitThumbFunc(Symbol);
+  getStreamer().EmitAssignment(Symbol, Value);
+}
+
+void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
+  getStreamer().emitInst(Inst, Suffix);
+}
+
+void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; }
+
+void ARMELFStreamer::FinishImpl() {
+  MCTargetStreamer &TS = *getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  ATS.finishAttributeSection();
+
+  MCELFStreamer::FinishImpl();
+}
+
+void ARMELFStreamer::reset() {
+  MCTargetStreamer &TS = *getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  ATS.reset();
+  MappingSymbolCounter = 0;
+  MCELFStreamer::reset();
+  // MCELFStreamer clear's the assembler's e_flags. However, for
+  // arm we manually set the ABI version on streamer creation, so
+  // do the same here
+  getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+}
+
+inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
+                                              unsigned Type,
+                                              unsigned Flags,
+                                              SectionKind Kind,
+                                              const MCSymbol &Fn) {
+  const MCSectionELF &FnSection =
+    static_cast<const MCSectionELF &>(Fn.getSection());
+
+  // Create the name for new section
+  StringRef FnSecName(FnSection.getSectionName());
+  SmallString<128> EHSecName(Prefix);
+  if (FnSecName != ".text") {
+    EHSecName += FnSecName;
+  }
+
+  // Get .ARM.extab or .ARM.exidx section
+  const MCSymbolELF *Group = FnSection.getGroup();
+  if (Group)
+    Flags |= ELF::SHF_GROUP;
+  MCSectionELF *EHSection =
+      getContext().getELFSection(EHSecName, Type, Flags, 0, Group,
+                                 FnSection.getUniqueID(), nullptr, &FnSection);
+
+  assert(EHSection && "Failed to get the required EH section");
+
+  // Switch to .ARM.extab or .ARM.exidx section
+  SwitchSection(EHSection);
+  EmitCodeAlignment(4);
+}
+
+inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC,
+                    SectionKind::getData(), FnStart);
+}
+
+inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX,
+                    ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
+                    SectionKind::getData(), FnStart);
+}
+void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
+  MCDataFragment *Frag = getOrCreateDataFragment();
+  Frag->getFixups().push_back(MCFixup::create(Frag->getContents().size(), Expr,
+                                              Kind));
+}
+
+void ARMELFStreamer::EHReset() {
+  ExTab = nullptr;
+  FnStart = nullptr;
+  Personality = nullptr;
+  PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+  FPReg = ARM::SP;
+  FPOffset = 0;
+  SPOffset = 0;
+  PendingOffset = 0;
+  UsedFP = false;
+  CantUnwind = false;
+
+  Opcodes.clear();
+  UnwindOpAsm.Reset();
+}
+
+void ARMELFStreamer::emitFnStart() {
+  assert(FnStart == nullptr);
+  FnStart = getContext().createTempSymbol();
+  EmitLabel(FnStart);
+}
+
+void ARMELFStreamer::emitFnEnd() {
+  assert(FnStart && ".fnstart must precedes .fnend");
+
+  // Emit unwind opcodes if there is no .handlerdata directive
+  if (!ExTab && !CantUnwind)
+    FlushUnwindOpcodes(true);
+
+  // Emit the exception index table entry
+  SwitchToExIdxSection(*FnStart);
+
+  if (PersonalityIndex < ARM::EHABI::NUM_PERSONALITY_INDEX)
+    EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
+
+  const MCSymbolRefExpr *FnStartRef =
+    MCSymbolRefExpr::create(FnStart,
+                            MCSymbolRefExpr::VK_ARM_PREL31,
+                            getContext());
+
+  EmitValue(FnStartRef, 4);
+
+  if (CantUnwind) {
+    EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
+  } else if (ExTab) {
+    // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
+    const MCSymbolRefExpr *ExTabEntryRef =
+      MCSymbolRefExpr::create(ExTab,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+    EmitValue(ExTabEntryRef, 4);
+  } else {
+    // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
+    // the second word of exception index table entry.  The size of the unwind
+    // opcodes should always be 4 bytes.
+    assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 &&
+           "Compact model must use __aeabi_unwind_cpp_pr0 as personality");
+    assert(Opcodes.size() == 4u &&
+           "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4");
+    uint64_t Intval = Opcodes[0] |
+                      Opcodes[1] << 8 |
+                      Opcodes[2] << 16 |
+                      Opcodes[3] << 24;
+    EmitIntValue(Intval, Opcodes.size());
+  }
+
+  // Switch to the section containing FnStart
+  SwitchSection(&FnStart->getSection());
+
+  // Clean exception handling frame information
+  EHReset();
+}
+
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
+
+  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
+      PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
+
+  visitUsedExpr(*PersonalityRef);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(MCFixup::create(DF->getContents().size(),
+                                            PersonalityRef,
+                                            MCFixup::getKindForSize(4, false)));
+}
+
+void ARMELFStreamer::FlushPendingOffset() {
+  if (PendingOffset != 0) {
+    UnwindOpAsm.EmitSPOffset(-PendingOffset);
+    PendingOffset = 0;
+  }
+}
+
+void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
+  // Emit the unwind opcode to restore $sp.
+  if (UsedFP) {
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
+    UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
+    UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+  } else {
+    FlushPendingOffset();
+  }
+
+  // Finalize the unwind opcode sequence
+  UnwindOpAsm.Finalize(PersonalityIndex, Opcodes);
+
+  // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
+  // section.  Thus, we don't have to create an entry in the .ARM.extab
+  // section.
+  if (NoHandlerData && PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0)
+    return;
+
+  // Switch to .ARM.extab section.
+  SwitchToExTabSection(*FnStart);
+
+  // Create .ARM.extab label for offset in .ARM.exidx
+  assert(!ExTab);
+  ExTab = getContext().createTempSymbol();
+  EmitLabel(ExTab);
+
+  // Emit personality
+  if (Personality) {
+    const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::create(Personality,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+
+    EmitValue(PersonalityRef, 4);
+  }
+
+  // Emit unwind opcodes
+  assert((Opcodes.size() % 4) == 0 &&
+         "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be multiple of 4");
+  for (unsigned I = 0; I != Opcodes.size(); I += 4) {
+    uint64_t Intval = Opcodes[I] |
+                      Opcodes[I + 1] << 8 |
+                      Opcodes[I + 2] << 16 |
+                      Opcodes[I + 3] << 24;
+    EmitIntValue(Intval, 4);
+  }
+
+  // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
+  // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
+  // after the unwind opcodes.  The handler data consists of several 32-bit
+  // words, and should be terminated by zero.
+  //
+  // In case that the .handlerdata directive is not specified by the
+  // programmer, we should emit zero to terminate the handler data.
+  if (NoHandlerData && !Personality)
+    EmitIntValue(0, 4);
+}
+
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
+
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
+  Personality = Per;
+  UnwindOpAsm.setPersonality(Per);
+}
+
+void ARMELFStreamer::emitPersonalityIndex(unsigned Index) {
+  assert(Index < ARM::EHABI::NUM_PERSONALITY_INDEX && "invalid index");
+  PersonalityIndex = Index;
+}
+
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
+                               int64_t Offset) {
+  assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
+         "the operand of .setfp directive should be either $sp or $fp");
+
+  UsedFP = true;
+  FPReg = NewFPReg;
+
+  if (NewSPReg == ARM::SP)
+    FPOffset = SPOffset + Offset;
+  else
+    FPOffset += Offset;
+}
+
+void ARMELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
+  assert((Reg != ARM::SP && Reg != ARM::PC) &&
+         "the operand of .movsp cannot be either sp or pc");
+  assert(FPReg == ARM::SP && "current FP must be SP");
+
+  FlushPendingOffset();
+
+  FPReg = Reg;
+  FPOffset = SPOffset + Offset;
+
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+  UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+}
+
+void ARMELFStreamer::emitPad(int64_t Offset) {
+  // Track the change of the $sp offset
+  SPOffset -= Offset;
+
+  // To squash multiple .pad directives, we should delay the unwind opcode
+  // until the .save, .vsave, .handlerdata, or .fnend directives.
+  PendingOffset -= Offset;
+}
+
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                 bool IsVector) {
+  // Collect the registers in the register list
+  unsigned Count = 0;
+  uint32_t Mask = 0;
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+  for (size_t i = 0; i < RegList.size(); ++i) {
+    unsigned Reg = MRI->getEncodingValue(RegList[i]);
+    assert(Reg < (IsVector ? 32U : 16U) && "Register out of range");
+    unsigned Bit = (1u << Reg);
+    if ((Mask & Bit) == 0) {
+      Mask |= Bit;
+      ++Count;
+    }
+  }
+
+  // Track the change the $sp offset: For the .save directive, the
+  // corresponding push instruction will decrease the $sp by (4 * Count).
+  // For the .vsave directive, the corresponding vpush instruction will
+  // decrease $sp by (8 * Count).
+  SPOffset -= Count * (IsVector ? 8 : 4);
+
+  // Emit the opcode
+  FlushPendingOffset();
+  if (IsVector)
+    UnwindOpAsm.EmitVFPRegSave(Mask);
+  else
+    UnwindOpAsm.EmitRegSave(Mask);
+}
+
+void ARMELFStreamer::emitUnwindRaw(int64_t Offset,
+                                   const SmallVectorImpl<uint8_t> &Opcodes) {
+  FlushPendingOffset();
+  SPOffset = SPOffset - Offset;
+  UnwindOpAsm.EmitRaw(Opcodes);
+}
+
+namespace llvm {
+
+MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS,
+                                             MCInstPrinter *InstPrint,
+                                             bool isVerboseAsm) {
+  return new ARMTargetAsmStreamer(S, OS, *InstPrint, isVerboseAsm);
+}
+
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S) {
+  return new ARMTargetStreamer(S);
+}
+
+MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
+                                                const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new ARMTargetELFStreamer(S);
+  return new ARMTargetStreamer(S);
+}
+
+MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                    raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll,
+                                    bool IsThumb) {
+    ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+    S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
+    if (RelaxAll)
+      S->getAssembler().setRelaxAll(true);
+    return S;
+  }
+
+}
+
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
new file mode 100644
index 000000000000..3fe2302bdd37
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -0,0 +1,120 @@
+//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+  // addresses
+  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+  // the 16-bit halfwords reordered.
+  fixup_t2_ldst_pcrel_12,
+
+  // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
+  // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+  fixup_arm_pcrel_10_unscaled,
+  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where the lower 2 bits are not encoded
+  // (so it's encoded as an 8-bit immediate).
+  fixup_arm_pcrel_10,
+  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_10,
+  // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where bit 0 not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_arm_pcrel_9,
+  // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_9,
+  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+  // addresses where the lower 2 bits are not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_thumb_adr_pcrel_10,
+  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_arm_adr_pcrel_12,
+  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_t2_adr_pcrel_12,
+  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+  // instructions. 
+  fixup_arm_condbranch,
+  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
+  // branch instructions. (unconditional)
+  fixup_arm_uncondbranch,
+  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+  // uconditional branch instructions.
+  fixup_t2_condbranch,
+  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+  // branch unconditional branch instructions.
+  fixup_t2_uncondbranch,
+
+  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+  fixup_arm_thumb_br,
+
+  // The following fixups handle the ARM BL instructions. These can be
+  // conditionalised; however, the ARM ELF ABI requires a different relocation
+  // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. The difference is that
+  // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has
+  // no conditional version; R_ARM_JUMP24 would have to insert a veneer.
+  //
+  // MachO does not draw a distinction between the two cases, so it will treat
+  // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
+
+  // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+  fixup_arm_uncondbl,
+
+  // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
+  // conditionalisation.
+  fixup_arm_condbl,
+
+  // fixup_arm_blx - Fixup for ARM BLX instructions.
+  fixup_arm_blx,
+
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+  fixup_arm_thumb_bl,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+  fixup_arm_thumb_blx,
+
+  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+  fixup_arm_thumb_cb,
+
+  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+  fixup_arm_thumb_cp,
+
+  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+  fixup_arm_thumb_bcc,
+
+  // The next two are for the movt/movw pair
+  // the 16bit imm field are split into imm{15-12} and imm{11-0}
+  fixup_arm_movt_hi16, // :upper16:
+  fixup_arm_movw_lo16, // :lower16:
+  fixup_t2_movt_hi16, // :upper16:
+  fixup_t2_movw_lo16, // :lower16:
+
+  // fixup_arm_mod_imm - Fixup for mod_imm
+  fixup_arm_mod_imm,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
new file mode 100644
index 000000000000..1e062ad45af5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -0,0 +1,115 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void ARMMCAsmInfoDarwin::anchor() { }
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
+  if ((TheTriple.getArch() == Triple::armeb) ||
+      (TheTriple.getArch() == Triple::thumbeb))
+    IsLittleEndian = false;
+
+  Data64bitsDirective = nullptr;
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  UseDataRegionDirectives = true;
+
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
+                       ? ExceptionHandling::SjLj
+                       : ExceptionHandling::DwarfCFI;
+
+  UseIntegratedAssembler = true;
+}
+
+void ARMELFMCAsmInfo::anchor() { }
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
+  if ((TheTriple.getArch() == Triple::armeb) ||
+      (TheTriple.getArch() == Triple::thumbeb))
+    IsLittleEndian = false;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  Data64bitsDirective = nullptr;
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  switch (TheTriple.getOS()) {
+  case Triple::Bitrig:
+  case Triple::NetBSD:
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+    break;
+  default:
+    ExceptionsType = ExceptionHandling::ARM;
+    break;
+  }
+
+  // foo(plt) instead of foo@plt
+  UseParensForSymbolVariant = true;
+
+  UseIntegratedAssembler = true;
+}
+
+void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
+  UseIntegratedAssembler = Value;
+  if (!UseIntegratedAssembler) {
+    // gas doesn't handle VFP register names in cfi directives,
+    // so don't use register names with external assembler.
+    // See https://sourceware.org/bugzilla/show_bug.cgi?id=16694
+    DwarfRegNumForCFI = true;
+  }
+}
+
+void ARMCOFFMCAsmInfoMicrosoft::anchor() { }
+
+ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() {
+  AlignmentIsInBytes = false;
+
+  PrivateGlobalPrefix = "$M";
+  PrivateLabelPrefix = "$M";
+  CommentString = ";";
+}
+
+void ARMCOFFMCAsmInfoGNU::anchor() { }
+
+ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
+  AlignmentIsInBytes = false;
+  HasSingleParameterDotFile = true;
+
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
+
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::None;
+  UseParensForSymbolVariant = true;
+
+  UseIntegratedAssembler = false;
+  DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
new file mode 100644
index 000000000000..5e548162bec6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -0,0 +1,56 @@
+//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+  virtual void anchor();
+
+public:
+  explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
+};
+
+class ARMELFMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit ARMELFMCAsmInfo(const Triple &TT);
+
+  void setUseIntegratedAssembler(bool Value) override;
+};
+
+class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+  void anchor() override;
+
+public:
+  explicit ARMCOFFMCAsmInfoMicrosoft();
+};
+
+class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
+  void anchor() override;
+
+public:
+  explicit ARMCOFFMCAsmInfoGNU();
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
new file mode 100644
index 000000000000..559a4f8de75f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1702 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
+  void operator=(const ARMMCCodeEmitter &) = delete;
+  const MCInstrInfo &MCII;
+  const MCContext &CTX;
+  bool IsLittleEndian;
+
+public:
+  ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
+    : MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {
+  }
+
+  ~ARMMCCodeEmitter() override {}
+
+  bool isThumb(const MCSubtargetInfo &STI) const {
+    return STI.getFeatureBits()[ARM::ModeThumb];
+  }
+  bool isThumb2(const MCSubtargetInfo &STI) const {
+    return isThumb(STI) && STI.getFeatureBits()[ARM::FeatureThumb2];
+  }
+  bool isTargetMachO(const MCSubtargetInfo &STI) const {
+    const Triple &TT = STI.getTargetTriple();
+    return TT.isOSBinFormatMachO();
+  }
+
+  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+  /// the specified operand. This is used for operands with :lower16: and
+  /// :upper16: prefixes.
+  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+                              unsigned &Reg, unsigned &Imm,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+  /// BL branch target.
+  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+  /// BLX branch target.
+  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getThumbBranchTargetOpValue - Return encoding info for 24-bit
+  /// immediate Thumb2 direct branch target.
+  uint32_t getThumbBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const;
+
+  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+  uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+  /// ADR label target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+
+  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const;
+
+  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+  /// operand.
+  uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+
+  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+  /// operand as needed by load/store instructions.
+  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
+  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::da: return 0;
+    case ARM_AM::ia: return 1;
+    case ARM_AM::db: return 2;
+    case ARM_AM::ib: return 3;
+    }
+  }
+  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+  ///
+  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+    switch (ShOpc) {
+    case ARM_AM::no_shift:
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::asr: return 2;
+    case ARM_AM::ror:
+    case ARM_AM::rrx: return 3;
+    }
+    llvm_unreachable("Invalid ShiftOpc!");
+  }
+
+  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+  uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+  uint32_t getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // We expect MO to be an immediate or an expression,
+    // if it is an immediate - that's fine, just encode the value.
+    // Otherwise - create a Fixup.
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // In instruction code this value always encoded as lowest 12 bits,
+      // so we don't have to perform any specific adjustments.
+      // Due to requirements of relocatable records we have to use FK_Data_4.
+      // See ARMELFObjectWriter::ExplicitRelSym and
+      //     ARMELFObjectWriter::GetRelocTypeInner for more details.
+      MCFixupKind Kind = MCFixupKind(FK_Data_4);
+      Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+      return 0;
+    }
+
+    unsigned SoImm = MO.getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  unsigned getModImmOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &ST) const {
+    const MCOperand &MO = MI.getOperand(Op);
+
+    // Support for fixups (MCFixup)
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      // Fixups resolve to plain values that need to be encoded.
+      MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_mod_imm);
+      Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+      return 0;
+    }
+
+    // Immediate is already in its encoded format
+    return MO.getImm();
+  }
+
+  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
+    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
+    return Encoded;
+  }
+
+  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const;
+
+  /// getSORegOpValue - Return an encoded so_reg shifted register value.
+  unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+  unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const {
+    return 64 - MI.getOperand(Op).getImm();
+  }
+
+  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                      unsigned EncodedValue,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                          unsigned EncodedValue,
+                                          const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+                                   unsigned EncodedValue,
+                                   const MCSubtargetInfo &STI) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue,
+                                const MCSubtargetInfo &STI) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, Ctx, false);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
+/// if we are in Thumb2.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue,
+                                                 const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue |= 0xC000000; // Set bits 27-26
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue,
+                     const MCSubtargetInfo &STI) const {
+  if (isThumb2(STI)) {
+    EncodedValue &= 0x0FFFFFFF;
+    EncodedValue |= 0xE0000000;
+  }
+  return EncodedValue;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+    // Q registers are encoded as 2x their register number.
+    switch (Reg) {
+    default:
+      return RegNo;
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * RegNo;
+    }
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+}
+
+/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+
+  Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  int32_t SImm = MO1.getImm();
+  bool isAdd = true;
+
+  // Special value for #-0
+  if (SImm == INT32_MIN) {
+    SImm = 0;
+    isAdd = false;
+  }
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (SImm < 0) {
+    SImm = -SImm;
+    isAdd = false;
+  }
+
+  Imm = SImm;
+  return isAdd;
+}
+
+/// getBranchTargetOpValue - Helper function to get the branch target operand,
+/// which is either an immediate or requires a fixup.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are
+// determined by negating them and XOR'ing them with bit 23.
+static int32_t encodeThumbBLOffset(int32_t offset) {
+  offset >>= 1;
+  uint32_t S  = (offset & 0x800000) >> 23;
+  uint32_t J1 = (offset & 0x400000) >> 22;
+  uint32_t J2 = (offset & 0x200000) >> 21;
+  J1 = (~J1 & 0x1);
+  J2 = (~J2 & 0x1);
+  J1 ^= S;
+  J2 ^= S;
+
+  offset &= ~0x600000;
+  offset |= J1 << 22;
+  offset |= J2 << 21;
+
+  return offset;
+}
+
+/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+                                    Fixups, STI);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+/// BLX branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+                                    Fixups, STI);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+                                    Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+                                    Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups, STI);
+  return (MO.getImm() >> 1);
+}
+
+/// Return true if this branch has a non-always predication
+static bool HasConditionalBranch(const MCInst &MI) {
+  int NumOp = MI.getNumOperands();
+  if (NumOp >= 2) {
+    for (int i = 0; i < NumOp-1; ++i) {
+      const MCOperand &MCOp1 = MI.getOperand(i);
+      const MCOperand &MCOp2 = MI.getOperand(i + 1);
+      if (MCOp1.isImm() && MCOp2.isReg() &&
+          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
+        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (isThumb2(STI))
+    return
+      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups, STI);
+  return getARMBranchTargetOpValue(MI, OpIdx, Fixups, STI);
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx,
+                                      ARM::fixup_arm_condbranch, Fixups, STI);
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_uncondbranch, Fixups, STI);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx, 
+                                      ARM::fixup_arm_condbl, Fixups, STI);
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups, STI);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups, STI);
+
+  return MO.getImm() >> 1;
+}
+
+/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+/// immediate branch target.
+uint32_t ARMMCCodeEmitter::getThumbBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  unsigned Val = 0;
+  const MCOperand MO = MI.getOperand(OpIdx);
+    
+  if(MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups, STI);
+  else 
+    Val = MO.getImm() >> 1;
+
+  bool I  = (Val & 0x800000);
+  bool J1 = (Val & 0x400000);
+  bool J2 = (Val & 0x200000);
+  if (I ^ J1)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (I ^ J2)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate
+/// ADR label target.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+                                    Fixups, STI);
+  int64_t offset = MO.getImm();
+  uint32_t Val = 0x2000;
+
+  int SoImmVal;
+  if (offset == INT32_MIN) {
+    Val = 0x1000;
+    SoImmVal = 0;
+  } else if (offset < 0) {
+    Val = 0x1000;
+    offset *= -1;
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x2000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  } else {
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x1000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  }
+
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  Val |= SoImmVal;
+  return Val;
+}
+
+/// getT2AdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                    Fixups, STI);
+  int32_t Val = MO.getImm();
+  if (Val == INT32_MIN)
+    Val = 0x1000;
+  else if (Val < 0) {
+    Val *= -1;
+    Val |= 0x1000;
+  }
+  return Val;
+}
+
+/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                    Fixups, STI);
+  return MO.getImm();
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &,
+                              const MCSubtargetInfo &STI) const {
+  // [Rn, Rm]
+  //   {5-3} = Rm
+  //   {2-0} = Rn
+  const MCOperand &MO1 = MI.getOperand(OpIdx);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
+  return (Rm << 3) | Rn;
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {17-13} = reg
+  // {12}    = (U)nsigned (add == '1', sub == '0')
+  // {11-0}  = imm12
+  unsigned Reg, Imm12;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm12 = 0;
+
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+      isAdd = false ; // 'U' bit is set as part of the fixup.
+
+      MCFixupKind Kind;
+      if (isThumb2(STI))
+        Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+      else
+        Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+      Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+      ++MCNumCPRelocations;
+    } else {
+      Reg = ARM::PC;
+      int32_t Offset = MO.getImm();
+      if (Offset == INT32_MIN) {
+        Offset = 0;
+        isAdd = false;
+      } else if (Offset < 0) {
+        Offset *= -1;
+        isAdd = false;
+      }
+      Imm12 = Offset;
+    }
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups, STI);
+
+  uint32_t Binary = Imm12 & 0xfff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 12);
+  Binary |= (Reg << 13);
+  return Binary;
+}
+
+/// getT2Imm8s4OpValue - Return encoding info for
+/// '+/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  int32_t Imm8 = MI.getOperand(OpIdx).getImm();
+  bool isAdd = Imm8 >= 0;
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (Imm8 < 0)
+    Imm8 = -(uint32_t)Imm8;
+
+  // Scaled by 4.
+  Imm8 /= 4;
+
+  uint32_t Binary = Imm8 & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  return Binary;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+  uint32_t Binary = (Imm8 >> 2) & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
+/// 'reg + imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+  // {11-8} = reg
+  // {7-0}  = imm8
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm8 = MO1.getImm();
+  return (Reg << 8) | Imm8;
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  // {20-16} = imm{15-12}
+  // {11-0}  = imm{11-0}
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    // Hi / lo 16 bits already extracted during earlier passes.
+    return static_cast<unsigned>(MO.getImm());
+
+  // Handle :upper16: and :lower16: assembly prefixes.
+  const MCExpr *E = MO.getExpr();
+  MCFixupKind Kind;
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
+      const int64_t Value = MCE->getValue();
+      if (Value > UINT32_MAX)
+        report_fatal_error("constant value truncated (limited to 32-bit)");
+
+      switch (ARM16Expr->getKind()) {
+      case ARMMCExpr::VK_ARM_HI16:
+        return (int32_t(Value) & 0xffff0000) >> 16;
+      case ARMMCExpr::VK_ARM_LO16:
+        return (int32_t(Value) & 0x0000ffff);
+      default: llvm_unreachable("Unsupported ARMFixup");
+      }
+    }
+
+    switch (ARM16Expr->getKind()) {
+    default: llvm_unreachable("Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
+                                      : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
+                                      : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+
+    Fixups.push_back(MCFixup::create(0, E, Kind, MI.getLoc()));
+    return 0;
+  }
+  // If the expression doesn't have :upper16: or :lower16: on it,
+  // it's just a plain immediate expression, previously those evaluated to
+  // the lower 16 bits of the expression regardless of whether
+  // we have a movt or a movw, but that led to misleadingly results.
+  // This is disallowed in the AsmParser in validateInstruction()
+  // so this should never happen.
+  llvm_unreachable("expression without :upper16: or :lower16:");
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the shift
+  // amount. However, it would be an easy mistake to make so check here.
+  assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
+  // {16-13} = Rn
+  // {12}    = isAdd
+  // {11-0}  = shifter
+  //  {3-0}  = Rm
+  //  {4}    = 0
+  //  {6-5}  = type
+  //  {11-7} = imm
+  uint32_t Binary = Rm;
+  Binary |= Rn << 13;
+  Binary |= SBits << 5;
+  Binary |= ShImm << 7;
+  if (isAdd)
+    Binary |= 1 << 12;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
+  bool isReg = MO.getReg() != 0;
+  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
+  if (isReg) {
+    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
+    Binary <<= 7;                    // Shift amount is bits [11:7]
+    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
+    Binary |= CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // Rm is bits [3:0]
+  }
+  return Binary | (isAdd << 12) | (isReg << 13);
+}
+
+uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // {4}      isAdd
+  // {3-0}    Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  bool isAdd = MO1.getImm() != 0;
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()) | (isAdd << 4);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // {9}      1 == imm8, 0 == Rm
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  return Imm8 | (isAdd << 8) | (isImm << 9);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+
+  // If The first operand isn't a register, we have a label reference.
+  if (!MO.isReg()) {
+    unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10_unscaled);
+    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+    return (Rn << 9) | (1 << 13);
+  }
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm = MO2.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO1.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  // [SP, #imm]
+  //   {7-0} = imm8
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate is already shifted for the implicit zeroes, so no change
+  // here.
+  return MO1.getImm() & 0xff;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // [Rn, #imm]
+  //   {7-3} = imm5
+  //   {2-0} = Rn
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Imm5 = MO1.getImm();
+  return ((Imm5 & 0x1f) << 3) | Rn;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups, STI);
+  return (MO.getImm() >> 2);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- (imm8 << 2)' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (isThumb2(STI))
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+/// getAddrMode5FP16OpValue - Return encoding info for 'reg +/- (imm8 << 1)' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5FP16OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (isThumb2(STI))
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_9);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_9);
+    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups, STI);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is Rs, the amount to shift by, and the third specifies
+  // the type of the shift.
+  //
+  // {3-0} = Rm.
+  // {4}   = 1
+  // {6-5} = type
+  // {11-8} = Rs
+  // {7}    = 0
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    }
+  }
+
+  Binary |= SBits << 4;
+
+  // Encode the shift operation Rs.
+  // Encode Rs bit[11:8].
+  assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+  return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift);
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  // RRX - 110 and bit[11:8] clear.
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::ror: SBits = 0x6; break;
+  case ARM_AM::rrx:
+    Binary |= 0x60;
+    return Binary;
+  }
+
+  // Encode shift_imm bit[11:7].
+  Binary |= SBits << 4;
+  unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
+  assert(Offset < 32 && "Offset must be in range 0-31!");
+  return Binary | (Offset << 7);
+}
+
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+  const MCOperand &MO3 = MI.getOperand(OpNum+2);
+
+  // Encoded as [Rn, Rm, imm].
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  Value <<= 4;
+  Value |= CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
+  Value <<= 2;
+  Value |= MO3.getImm();
+
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups,
+                const MCSubtargetInfo &STI) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::rrx: LLVM_FALLTHROUGH;
+  case ARM_AM::ror: SBits = 0x6; break;
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+  // msb of the mask.
+  const MCOperand &MO = MI.getOperand(Op);
+  uint32_t v = ~MO.getImm();
+  uint32_t lsb = countTrailingZeros(v);
+  uint32_t msb = (32 - countLeadingZeros (v)) - 1;
+  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+  return lsb | (msb << 5);
+}
+
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // VLDM/VSTM:
+  //   {12-8} = Vd
+  //   {7-0}  = Number of registers
+  //
+  // LDM/STM:
+  //   {15-0}  = Bitfield of GPRs.
+  unsigned Reg = MI.getOperand(Op).getReg();
+  bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+  bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+
+  unsigned Binary = 0;
+
+  if (SPRRegs || DPRRegs) {
+    // VLDM/VSTM
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
+    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+    Binary |= (RegNo & 0x1f) << 8;
+    if (SPRRegs)
+      Binary |= NumRegs;
+    else
+      Binary |= NumRegs * 2;
+  } else {
+    const MCRegisterInfo &MRI = *CTX.getRegisterInfo();
+    assert(std::is_sorted(MI.begin() + Op, MI.end(),
+                          [&](const MCOperand &LHS, const MCOperand &RHS) {
+                            return MRI.getEncodingValue(LHS.getReg()) <
+                                   MRI.getEncodingValue(RHS.getReg());
+                          }));
+
+    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+      unsigned RegNo = MRI.getEncodingValue(MI.getOperand(I).getReg());
+      Binary |= 1 << RegNo;
+    }
+  }
+
+  return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
+/// with the alignment operand.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x02; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 8:
+  case 16:
+  case 32: // Default '0' value for invalid alignments of 8, 16, 32 bytes.
+  case 2: Align = 0x00; break;
+  case 4: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
+/// alignment operand for use in VLD-dup instructions.  This is the same as
+/// getAddrMode6AddressOpValue except for the alignment encoding, which is
+/// different for VLD4-dup.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.getReg() == 0) return 0x0D;
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups,
+                   const MCSubtargetInfo &STI) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+void ARMMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  // Pseudo instructions don't get encoded.
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+
+  int Size;
+  if (Desc.getSize() == 2 || Desc.getSize() == 4)
+    Size = Desc.getSize();
+  else
+    llvm_unreachable("Unexpected instruction size!");
+
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  // Thumb 32-bit wide instructions need to emit the high order halfword
+  // first.
+  if (isThumb(STI) && Size == 4) {
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else
+    EmitConstant(Binary, Size, OS);
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
new file mode 100644
index 000000000000..2063ca6bdf3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -0,0 +1,41 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "armmcexpr"
+
+const ARMMCExpr*
+ARMMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                       MCContext &Ctx) {
+  return new (Ctx) ARMMCExpr(Kind, Expr);
+}
+
+void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_ARM_HI16: OS << ":upper16:"; break;
+  case VK_ARM_LO16: OS << ":lower16:"; break;
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS, MAI);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
new file mode 100644
index 000000000000..75dde8008fca
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -0,0 +1,79 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class ARMMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_ARM_None,
+    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit ARMMCExpr(VariantKind Kind, const MCExpr *Expr)
+      : Kind(Kind), Expr(Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const ARMMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                      MCContext &Ctx);
+
+  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return create(VK_ARM_HI16, Expr, Ctx);
+  }
+
+  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return create(VK_ARM_LO16, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override {
+    return false;
+  }
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  // There are no TLS ARMMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
new file mode 100644
index 000000000000..9e4d202321e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -0,0 +1,331 @@
+//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInfo.h"
+#include "ARMMCAsmInfo.h"
+#include "ARMMCTargetDesc.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "ARMGenRegisterInfo.inc"
+
+static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
+      (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
+      (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) &&
+      // Checks for the deprecated CP15ISB encoding:
+      // mcr p15, #0, rX, c7, c5, #4
+      (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) {
+    if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) {
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) {
+        Info = "deprecated since v7, use 'isb'";
+        return true;
+      }
+
+      // Checks for the deprecated CP15DSB encoding:
+      // mcr p15, #0, rX, c7, c10, #4
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) {
+        Info = "deprecated since v7, use 'dsb'";
+        return true;
+      }
+    }
+    // Checks for the deprecated CP15DMB encoding:
+    // mcr p15, #0, rX, c7, c10, #5
+    if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 &&
+        (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) {
+      Info = "deprecated since v7, use 'dmb'";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getITDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                 std::string &Info) {
+  if (STI.getFeatureBits()[llvm::ARM::HasV8Ops] && MI.getOperand(1).isImm() &&
+      MI.getOperand(1).getImm() != 8) {
+    Info = "applying IT instruction to more than one subsequent instruction is "
+           "deprecated";
+    return true;
+  }
+
+  return false;
+}
+
+static bool getARMStoreDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                       std::string &Info) {
+  assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
+         "cannot predicate thumb instructions");
+
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+  for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) {
+    assert(MI.getOperand(OI).isReg() && "expected register");
+    if (MI.getOperand(OI).getReg() == ARM::SP ||
+        MI.getOperand(OI).getReg() == ARM::PC) {
+      Info = "use of SP or PC in the list is deprecated";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getARMLoadDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+                                      std::string &Info) {
+  assert(!STI.getFeatureBits()[llvm::ARM::ModeThumb] &&
+         "cannot predicate thumb instructions");
+
+  assert(MI.getNumOperands() >= 4 && "expected >= 4 arguments");
+  bool ListContainsPC = false, ListContainsLR = false;
+  for (unsigned OI = 4, OE = MI.getNumOperands(); OI < OE; ++OI) {
+    assert(MI.getOperand(OI).isReg() && "expected register");
+    switch (MI.getOperand(OI).getReg()) {
+    default:
+      break;
+    case ARM::LR:
+      ListContainsLR = true;
+      break;
+    case ARM::PC:
+      ListContainsPC = true;
+      break;
+    case ARM::SP:
+      Info = "use of SP in the list is deprecated";
+      return true;
+    }
+  }
+
+  if (ListContainsPC && ListContainsLR) {
+    Info = "use of LR and PC simultaneously in the list is deprecated";
+    return true;
+  }
+
+  return false;
+}
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARMGenSubtargetInfo.inc"
+
+std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) {
+  bool isThumb =
+      TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb;
+
+  std::string ARMArchFeature;
+
+  unsigned ArchID = ARM::parseArch(TT.getArchName());
+  if (ArchID != ARM::AK_INVALID &&  (CPU.empty() || CPU == "generic"))
+    ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str();
+
+  if (isThumb) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+thumb-mode";
+    else
+      ARMArchFeature += ",+thumb-mode";
+  }
+
+  if (TT.isOSNaCl()) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+nacl-trap";
+    else
+      ARMArchFeature += ",+nacl-trap";
+  }
+
+  return ARMArchFeature;
+}
+
+MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = (Twine(ArchFS) + "," + FS).str();
+    else
+      ArchFS = FS;
+  }
+
+  return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS);
+}
+
+static MCInstrInfo *createARMMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitARMMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC);
+  return X;
+}
+
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO())
+    MAI = new ARMMCAsmInfoDarwin(TheTriple);
+  else if (TheTriple.isWindowsMSVCEnvironment())
+    MAI = new ARMCOFFMCAsmInfoMicrosoft();
+  else if (TheTriple.isOSWindows())
+    MAI = new ARMCOFFMCAsmInfoGNU();
+  else
+    MAI = new ARMELFMCAsmInfo(TheTriple);
+
+  unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
+  MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
+
+  return MAI;
+}
+
+static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx,
+                                     MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll) {
+  return createARMELFStreamer(Ctx, MAB, OS, Emitter, false,
+                              (T.getArch() == Triple::thumb ||
+                               T.getArch() == Triple::thumbeb));
+}
+
+static MCStreamer *createARMMachOStreamer(MCContext &Ctx, MCAsmBackend &MAB,
+                                          raw_pwrite_stream &OS,
+                                          MCCodeEmitter *Emitter, bool RelaxAll,
+                                          bool DWARFMustBeAtTheEnd) {
+  return createMachOStreamer(Ctx, MAB, OS, Emitter, false, DWARFMustBeAtTheEnd);
+}
+
+static MCInstPrinter *createARMMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+static MCRelocationInfo *createARMMCRelocationInfo(const Triple &TT,
+                                                   MCContext &Ctx) {
+  if (TT.isOSBinFormatMachO())
+    return createARMMachORelocationInfo(Ctx);
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
+namespace {
+
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  bool isUnconditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return true;
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  bool isConditionalBranch(const MCInst &Inst) const override {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return false;
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const override {
+    // We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+      return false;
+
+    int64_t Imm = Inst.getOperand(0).getImm();
+    // FIXME: This is not right for thumb.
+    Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+    return true;
+  }
+};
+
+}
+
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMTargetMC() {
+  for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
+                    &getTheThumbLETarget(), &getTheThumbBETarget()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn X(*T, createARMMCAsmInfo);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createARMMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createARMMCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T,
+                                            ARM_MC::createARMMCSubtargetInfo);
+
+    // Register the MC instruction analyzer.
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
+
+    TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
+    TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer);
+    TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer);
+
+    // Register the obj target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(*T,
+                                                 createARMObjectTargetStreamer);
+
+    // Register the asm streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createARMTargetAsmStreamer);
+
+    // Register the null TargetStreamer.
+    TargetRegistry::RegisterNullTargetStreamer(*T, createARMNullTargetStreamer);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createARMMCInstPrinter);
+
+    // Register the MC relocation info.
+    TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
+  }
+
+  // Register the MC Code Emitter
+  for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
+  for (Target *T : {&getTheARMBETarget(), &getTheThumbBETarget()})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createARMBEMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(getTheARMLETarget(),
+                                       createARMLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheARMBETarget(),
+                                       createARMBEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheThumbLETarget(),
+                                       createThumbLEAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheThumbBETarget(),
+                                       createThumbBEAsmBackend);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
new file mode 100644
index 000000000000..ba834201e585
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -0,0 +1,131 @@
+//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class formatted_raw_ostream;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstPrinter;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCStreamer;
+class MCTargetOptions;
+class MCRelocationInfo;
+class MCTargetStreamer;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheARMLETarget();
+Target &getTheThumbLETarget();
+Target &getTheARMBETarget();
+Target &getTheThumbBETarget();
+
+namespace ARM_MC {
+std::string ParseARMTriple(const Triple &TT, StringRef CPU);
+
+/// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc.
+/// do not need to go through TargetRegistry.
+MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU,
+                                          StringRef FS);
+}
+
+MCTargetStreamer *createARMNullTargetStreamer(MCStreamer &S);
+MCTargetStreamer *createARMTargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS,
+                                             MCInstPrinter *InstPrint,
+                                             bool isVerboseAsm);
+MCTargetStreamer *createARMObjectTargetStreamer(MCStreamer &S,
+                                                const MCSubtargetInfo &STI);
+
+MCCodeEmitter *createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCCodeEmitter *createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const MCTargetOptions &Options,
+                                  bool IsLittleEndian);
+
+MCAsmBackend *createARMLEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU,
+                                    const MCTargetOptions &Options);
+
+MCAsmBackend *createARMBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU,
+                                    const MCTargetOptions &Options);
+
+MCAsmBackend *createThumbLEAsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options);
+
+MCAsmBackend *createThumbBEAsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options);
+
+// Construct a PE/COFF machine code streamer which will generate a PE/COFF
+// object file.
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll,
+                                     bool IncrementalLinkerCompatible);
+
+/// Construct an ELF Mach-O object writer.
+MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+                                         bool IsLittleEndian);
+
+/// Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// Construct an ARM PE/COFF object writer.
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                             bool Is64Bit);
+
+/// Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
+} // End llvm namespace
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARMGenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARMGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 000000000000..482bcf902518
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCExpr.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+class ARMMachORelocationInfo : public MCRelocationInfo {
+public:
+  ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                             unsigned VariantKind) override {
+    switch(VariantKind) {
+    case LLVMDisassembler_VariantKind_ARM_HI16:
+      return ARMMCExpr::createUpper16(SubExpr, Ctx);
+    case LLVMDisassembler_VariantKind_ARM_LO16:
+      return ARMMCExpr::createLower16(SubExpr, Ctx);
+    default:
+      return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
+                                                            VariantKind);
+    }
+  }
+};
+} // End unnamed namespace
+
+/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
+  return new ARMMachORelocationInfo(Ctx);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
new file mode 100644
index 000000000000..b77181f29b2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -0,0 +1,486 @@
+//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+using namespace llvm;
+
+namespace {
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+  void RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                    const MCAssembler &Asm,
+                                    const MCAsmLayout &Layout,
+                                    const MCFragment *Fragment,
+                                    const MCFixup &Fixup,
+                                    MCValue Target,
+                                    unsigned Type,
+                                    unsigned Log2Size,
+                                    uint64_t &FixedValue);
+  void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                        const MCAssembler &Asm,
+                                        const MCAsmLayout &Layout,
+                                        const MCFragment *Fragment,
+                                        const MCFixup &Fixup, MCValue Target,
+                                        uint64_t &FixedValue);
+
+  bool requiresExternRelocation(MachObjectWriter *Writer,
+                                const MCAssembler &Asm,
+                                const MCFragment &Fragment, unsigned RelocType,
+                                const MCSymbol &S, uint64_t FixedValue);
+
+public:
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override;
+};
+}
+
+static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
+                              unsigned &Log2Size) {
+  RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
+  Log2Size = ~0U;
+
+  switch (Kind) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    return true;
+
+    // These fixups are expected to always be resolvable at assembly time and
+    // have no relocations supported.
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_br:
+    return false;
+
+    // Handle 24-bit branch kinds.
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    RelocType = unsigned(MachO::ARM_RELOC_BR24);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  // For movw/movt r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  case ARM::fixup_arm_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 1;
+    return true;
+  case ARM::fixup_t2_movt_hi16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 3;
+    return true;
+
+  case ARM::fixup_arm_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 0;
+    return true;
+  case ARM::fixup_t2_movw_lo16:
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
+    Log2Size = 2;
+    return true;
+  }
+}
+
+void ARMMachObjectWriter::
+RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = MachO::ARM_RELOC_HALF;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+  if (!A->getFragment()) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+    return;
+  }
+
+  uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+  uint32_t Value2 = 0;
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+  FixedValue += SecAddr;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbol *SB = &B->getSymbol();
+
+    if (!SB->getFragment()) {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+      return;
+    }
+
+    // Select the appropriate difference relocation type.
+    Type = MachO::ARM_RELOC_HALF_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
+  //
+  // For these two r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  // the other half of the relocated expression is in the following pair
+  // relocation entry in the low 16 bits of r_address field.
+  unsigned ThumbBit = 0;
+  unsigned MovtBit = 0;
+  switch ((unsigned)Fixup.getKind()) {
+  default: break;
+  case ARM::fixup_arm_movt_hi16:
+    MovtBit = 1;
+    // The thumb bit shouldn't be set in the 'other-half' bit of the
+    // relocation, but it will be set in FixedValue if the base symbol
+    // is a thumb function. Clear it out here.
+    if (Asm.isThumbFunc(A))
+      FixedValue &= 0xfffffffe;
+    break;
+  case ARM::fixup_t2_movt_hi16:
+    if (Asm.isThumbFunc(A))
+      FixedValue &= 0xfffffffe;
+    MovtBit = 1;
+    LLVM_FALLTHROUGH;
+  case ARM::fixup_t2_movw_lo16:
+    ThumbBit = 1;
+    break;
+  }
+
+  if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+    uint32_t OtherHalf = MovtBit
+      ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((OtherHalf             <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (MovtBit               << 28) |
+                   (ThumbBit              << 29) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (MovtBit     << 28) |
+                 (ThumbBit    << 29) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+}
+
+void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Type,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+  if (!A->getFragment()) {
+    Asm.getContext().reportError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+    return;
+  }
+
+  uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
+    const MCSymbol *SB = &B->getSymbol();
+
+    if (!SB->getFragment()) {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+      return;
+    }
+
+    // Select the appropriate difference relocation type.
+    Type = MachO::ARM_RELOC_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::ARM_RELOC_SECTDIFF ||
+      Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                     <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (Log2Size              << 28) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+}
+
+bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
+                                                   const MCAssembler &Asm,
+                                                   const MCFragment &Fragment,
+                                                   unsigned RelocType,
+                                                   const MCSymbol &S,
+                                                   uint64_t FixedValue) {
+  // Most cases can be identified purely from the symbol.
+  if (Writer->doesSymbolRequireExternRelocation(S))
+    return true;
+  int64_t Value = (int64_t)FixedValue;  // The displacement is signed.
+  int64_t Range;
+  switch (RelocType) {
+  default:
+    return false;
+  case MachO::ARM_RELOC_BR24:
+    // PC pre-adjustment of 8 for these instructions.
+    Value -= 8;
+    // ARM BL/BLX has a 25-bit offset.
+    Range = 0x1ffffff;
+    break;
+  case MachO::ARM_THUMB_RELOC_BR22:
+    // PC pre-adjustment of 4 for these instructions.
+    Value -= 4;
+    // Thumb BL/BLX has a 24-bit offset.
+    Range = 0xffffff;
+  }
+  // BL/BLX also use external relocations when an internal relocation
+  // would result in the target being out of range. This gives the linker
+  // enough information to generate a branch island.
+  Value += Writer->getSectionAddress(&S.getSection());
+  Value -= Writer->getSectionAddress(Fragment.getParent());
+  // If the resultant value would be out of range for an internal relocation,
+  // use an external instead.
+  if (Value > Range || Value < -(Range + 1))
+    return true;
+  return false;
+}
+
+void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
+                                           MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCFragment *Fragment,
+                                           const MCFixup &Fixup, MCValue Target,
+                                           uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size;
+  unsigned RelocType = MachO::ARM_RELOC_VANILLA;
+  if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) {
+    // If we failed to get fixup kind info, it's because there's no legal
+    // relocation type for the fixup kind. This happens when it's a fixup that's
+    // expected to always be resolvable at assembly time and not have any
+    // relocations needed.
+    Asm.getContext().reportError(Fixup.getLoc(),
+                                 "unsupported relocation on symbol");
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry.  Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    if (RelocType == MachO::ARM_RELOC_HALF)
+      return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
+                                              Fixup, Target, FixedValue);
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, RelocType, Log2Size,
+                                        FixedValue);
+  }
+
+  // Get the symbol data, if any.
+  const MCSymbol *A = nullptr;
+  if (Target.getSymA())
+    A = &Target.getSymA()->getSymbol();
+
+  // FIXME: For other platforms, we need to use scattered relocations for
+  // internal relocations with offsets.  If this is an internal relocation with
+  // an offset, it also needs a scattered relocation entry.
+  //
+  // Is this right for ARM?
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
+    Offset += 1 << Log2Size;
+  if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+      RelocType != MachO::ARM_RELOC_HALF)
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, RelocType, Log2Size,
+                                        FixedValue);
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned Type = 0;
+  const MCSymbol *RelSymbol = nullptr;
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME!
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+  } else {
+    // Resolve constant variables.
+    if (A->isVariable()) {
+      int64_t Res;
+      if (A->getVariableValue()->evaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, *A,
+                                 FixedValue)) {
+      RelSymbol = A;
+
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!A->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(*A);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSection &Sec = A->getSection();
+      Index = Sec.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&Sec);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    // The type is determined by the fixup kind.
+    Type = RelocType;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+
+  // Even when it's not a scattered relocation, movw/movt always uses
+  // a PAIR relocation.
+  if (Type == MachO::ARM_RELOC_HALF) {
+    // The entire addend is needed to correctly apply a relocation. One half is
+    // extracted from the instruction itself, the other comes from this
+    // PAIR. I.e. it's correct that we insert the high bits of the addend in the
+    // MOVW case here.  relocation entries.
+    uint32_t Value = 0;
+    switch ((unsigned)Fixup.getKind()) {
+    default: break;
+    case ARM::fixup_arm_movw_lo16:
+    case ARM::fixup_t2_movw_lo16:
+      Value = (FixedValue >> 16) & 0xffff;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+    case ARM::fixup_t2_movt_hi16:
+      Value = FixedValue & 0xffff;
+      break;
+    }
+    MachO::any_relocation_info MREPair;
+    MREPair.r_word0 = Value;
+    MREPair.r_word1 = ((0xffffff              <<  0) |
+                       (Log2Size              << 25) |
+                       (MachO::ARM_RELOC_PAIR << 28));
+
+    Writer->addRelocation(nullptr, Fragment->getParent(), MREPair);
+  }
+
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createARMMachObjectWriter(raw_pwrite_stream &OS,
+                                                bool Is64Bit, uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
new file mode 100644
index 000000000000..c0d10c896354
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -0,0 +1,77 @@
+//===- ARMTargetStreamer.cpp - ARMTargetStreamer class --*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMTargetStreamer class.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/MapVector.h"
+#include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+//
+// ARMTargetStreamer Implemenation
+//
+ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
+
+ARMTargetStreamer::~ARMTargetStreamer() {}
+
+// The constant pool handling is shared by all ARMTargetStreamer
+// implementations.
+const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) {
+  return ConstantPools->addEntry(Streamer, Expr, 4, Loc);
+}
+
+void ARMTargetStreamer::emitCurrentConstantPool() {
+  ConstantPools->emitForCurrentSection(Streamer);
+}
+
+// finish() - write out any non-empty assembler constant pools.
+void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+// reset() - Reset any state
+void ARMTargetStreamer::reset() {}
+
+// The remaining callbacks should be handled separately by each
+// streamer.
+void ARMTargetStreamer::emitFnStart() {}
+void ARMTargetStreamer::emitFnEnd() {}
+void ARMTargetStreamer::emitCantUnwind() {}
+void ARMTargetStreamer::emitPersonality(const MCSymbol *Personality) {}
+void ARMTargetStreamer::emitPersonalityIndex(unsigned Index) {}
+void ARMTargetStreamer::emitHandlerData() {}
+void ARMTargetStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                  int64_t Offset) {}
+void ARMTargetStreamer::emitMovSP(unsigned Reg, int64_t Offset) {}
+void ARMTargetStreamer::emitPad(int64_t Offset) {}
+void ARMTargetStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                    bool isVector) {}
+void ARMTargetStreamer::emitUnwindRaw(int64_t StackOffset,
+                                      const SmallVectorImpl<uint8_t> &Opcodes) {
+}
+void ARMTargetStreamer::switchVendor(StringRef Vendor) {}
+void ARMTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void ARMTargetStreamer::emitTextAttribute(unsigned Attribute,
+                                          StringRef String) {}
+void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+                                             unsigned IntValue,
+                                             StringRef StringValue) {}
+void ARMTargetStreamer::emitArch(unsigned Arch) {}
+void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
+void ARMTargetStreamer::emitObjectArch(unsigned Arch) {}
+void ARMTargetStreamer::emitFPU(unsigned FPU) {}
+void ARMTargetStreamer::finishAttributeSection() {}
+void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
+void
+ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
+
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
new file mode 100644
index 000000000000..173cc93d44fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -0,0 +1,196 @@
+//===-- ARMUnwindOpAsm.cpp - ARM Unwind Opcodes Assembler -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the unwind opcode assmebler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMUnwindOpAsm.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+namespace {
+  /// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
+  /// with MSB to LSB per uint32_t ordering.  For example, the first byte will
+  /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+  /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+  class UnwindOpcodeStreamer {
+  private:
+    SmallVectorImpl<uint8_t> &Vec;
+    size_t Pos;
+
+  public:
+    UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
+    }
+
+    /// Emit the byte in MSB to LSB per uint32_t order.
+    inline void EmitByte(uint8_t elem) {
+      Vec[Pos] = elem;
+      Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
+    }
+
+    /// Emit the size prefix.
+    inline void EmitSize(size_t Size) {
+      size_t SizeInWords = (Size + 3) / 4;
+      assert(SizeInWords <= 0x100u &&
+             "Only 256 additional words are allowed for unwind opcodes");
+      EmitByte(static_cast<uint8_t>(SizeInWords - 1));
+    }
+
+    /// Emit the personality index prefix.
+    inline void EmitPersonalityIndex(unsigned PI) {
+      assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
+             "Invalid personality prefix");
+      EmitByte(ARM::EHABI::EHT_COMPACT | PI);
+    }
+
+    /// Fill the rest of bytes with FINISH opcode.
+    inline void FillFinishOpcode() {
+      while (Pos < Vec.size())
+        EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
+    }
+  };
+}
+
+void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
+  if (RegSave == 0u)
+    return;
+
+  // One byte opcode to save register r14 and r11-r4
+  if (RegSave & (1u << 4)) {
+    // The one byte opcode will always save r4, thus we can't use the one byte
+    // opcode when r4 is not in .save directive.
+
+    // Compute the consecutive registers from r4 to r11.
+    uint32_t Mask = RegSave & 0xff0u;
+    uint32_t Range = countTrailingOnes(Mask >> 5); // Exclude r4.
+    // Mask off non-consecutive registers. Keep r4.
+    Mask &= ~(0xffffffe0u << Range);
+
+    // Emit this opcode when the mask covers every registers.
+    uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
+    if (UnmaskedReg == 0u) {
+      // Pop r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+      RegSave &= 0x000fu;
+    } else if (UnmaskedReg == (1u << 14)) {
+      // Pop r[14] + r[4 : (4 + n)]
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+      RegSave &= 0x000fu;
+    }
+  }
+
+  // Two bytes opcode to save register r15-r4
+  if ((RegSave & 0xfff0u) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
+
+  // Opcode to save register r3-r0
+  if ((RegSave & 0x000fu) != 0)
+    EmitInt16(ARM::EHABI::UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
+}
+
+/// Emit unwind opcodes for .vsave directives
+void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
+  // We only have 4 bits to save the offset in the opcode so look at the lower
+  // and upper 16 bits separately.
+  for (uint32_t Regs : {VFPRegSave & 0xffff0000u, VFPRegSave & 0x0000ffffu}) {
+    while (Regs) {
+      // Now look for a run of set bits. Remember the MSB and LSB of the run.
+      auto RangeMSB = 32 - countLeadingZeros(Regs);
+      auto RangeLen = countLeadingOnes(Regs << (32 - RangeMSB));
+      auto RangeLSB = RangeMSB - RangeLen;
+
+      int Opcode = RangeLSB >= 16
+                       ? ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16
+                       : ARM::EHABI::UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD;
+
+      EmitInt16(Opcode | ((RangeLSB % 16) << 4) | (RangeLen - 1));
+
+      // Zero out bits we're done with.
+      Regs &= ~(-1u << RangeLSB);
+    }
+  }
+}
+
+/// Emit unwind opcodes to copy address from source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+  EmitInt8(ARM::EHABI::UNWIND_OPCODE_SET_VSP | Reg);
+}
+
+/// Emit unwind opcodes to add $sp with an offset.
+void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
+  if (Offset > 0x200) {
+    uint8_t Buff[16];
+    Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
+    size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
+    EmitBytes(Buff, ULEBSize + 1);
+  } else if (Offset > 0) {
+    if (Offset > 0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
+      Offset -= 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP |
+             static_cast<uint8_t>((Offset - 4) >> 2));
+  } else if (Offset < 0) {
+    while (Offset < -0x100) {
+      EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP | 0x3fu);
+      Offset += 0x100;
+    }
+    EmitInt8(ARM::EHABI::UNWIND_OPCODE_DEC_VSP |
+             static_cast<uint8_t>(((-Offset) - 4) >> 2));
+  }
+}
+
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
+                                     SmallVectorImpl<uint8_t> &Result) {
+
+  UnwindOpcodeStreamer OpStreamer(Result);
+
+  if (HasPersonality) {
+    // User-specifed personality routine: [ SIZE , OP1 , OP2 , ... ]
+    PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
+    size_t TotalSize = Ops.size() + 1;
+    size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+    Result.resize(RoundUpSize);
+    OpStreamer.EmitSize(RoundUpSize);
+  } else {
+    // If no personalityindex is specified, select ane
+    if (PersonalityIndex == ARM::EHABI::NUM_PERSONALITY_INDEX)
+      PersonalityIndex = (Ops.size() <= 3) ? ARM::EHABI::AEABI_UNWIND_CPP_PR0
+                                           : ARM::EHABI::AEABI_UNWIND_CPP_PR1;
+    if (PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0) {
+      // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
+      assert(Ops.size() <= 3 && "too many opcodes for __aeabi_unwind_cpp_pr0");
+      Result.resize(4);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+    } else {
+      // __aeabi_unwind_cpp_pr{1,2}: [ {0x81,0x82} , SIZE , OP1 , OP2 , ... ]
+      size_t TotalSize = Ops.size() + 2;
+      size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+      Result.resize(RoundUpSize);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+      OpStreamer.EmitSize(RoundUpSize);
+    }
+  }
+
+  // Copy the unwind opcodes
+  for (size_t i = OpBegins.size() - 1; i > 0; --i)
+    for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+      OpStreamer.EmitByte(Ops[j]);
+
+  // Emit the padding finish opcodes if the size is not multiple of 4.
+  OpStreamer.FillFinishOpcode();
+
+  // Reset the assembler state
+  Reset();
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
new file mode 100644
index 000000000000..e0c113ecfaa3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -0,0 +1,93 @@
+//===-- ARMUnwindOpAsm.h - ARM Unwind Opcodes Assembler ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the unwind opcode assmebler for ARM exception handling
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ARMEHABI.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCSymbol;
+
+class UnwindOpcodeAssembler {
+private:
+  llvm::SmallVector<uint8_t, 32> Ops;
+  llvm::SmallVector<unsigned, 8> OpBegins;
+  bool HasPersonality;
+
+public:
+  UnwindOpcodeAssembler()
+      : HasPersonality(0) {
+    OpBegins.push_back(0);
+  }
+
+  /// Reset the unwind opcode assembler.
+  void Reset() {
+    Ops.clear();
+    OpBegins.clear();
+    OpBegins.push_back(0);
+    HasPersonality = 0;
+  }
+
+  /// Set the personality
+  void setPersonality(const MCSymbol *Per) {
+    HasPersonality = 1;
+  }
+
+  /// Emit unwind opcodes for .save directives
+  void EmitRegSave(uint32_t RegSave);
+
+  /// Emit unwind opcodes for .vsave directives
+  void EmitVFPRegSave(uint32_t VFPRegSave);
+
+  /// Emit unwind opcodes to copy address from source register to $sp.
+  void EmitSetSP(uint16_t Reg);
+
+  /// Emit unwind opcodes to add $sp with an offset.
+  void EmitSPOffset(int64_t Offset);
+
+  /// Emit unwind raw opcodes
+  void EmitRaw(const SmallVectorImpl<uint8_t> &Opcodes) {
+    Ops.insert(Ops.end(), Opcodes.begin(), Opcodes.end());
+    OpBegins.push_back(OpBegins.back() + Opcodes.size());
+  }
+
+  /// Finalize the unwind opcode sequence for EmitBytes()
+  void Finalize(unsigned &PersonalityIndex,
+                SmallVectorImpl<uint8_t> &Result);
+
+private:
+  void EmitInt8(unsigned Opcode) {
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 1);
+  }
+
+  void EmitInt16(unsigned Opcode) {
+    Ops.push_back((Opcode >> 8) & 0xff);
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 2);
+  }
+
+  void EmitBytes(const uint8_t *Opcode, size_t Size) {
+    Ops.insert(Ops.end(), Opcode, Opcode + Size);
+    OpBegins.push_back(OpBegins.back() + Size);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..166c04b41a77
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -0,0 +1,91 @@
+//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+  ARMWinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
+    assert(!Is64Bit && "AArch64 support not yet implemented");
+  }
+  ~ARMWinCOFFObjectWriter() override {}
+
+  unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsCrossSection,
+                        const MCAsmBackend &MAB) const override;
+
+  bool recordRelocation(const MCFixup &) const override;
+};
+
+unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection,
+                                              const MCAsmBackend &MAB) const {
+  assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
+         "AArch64 support not yet implemented");
+
+  MCSymbolRefExpr::VariantKind Modifier =
+    Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  default: {
+    const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+    report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+  }
+  case FK_Data_4:
+    switch (Modifier) {
+    case MCSymbolRefExpr::VK_COFF_IMGREL32:
+      return COFF::IMAGE_REL_ARM_ADDR32NB;
+    case MCSymbolRefExpr::VK_SECREL:
+      return COFF::IMAGE_REL_ARM_SECREL;
+    default:
+      return COFF::IMAGE_REL_ARM_ADDR32;
+    }
+  case FK_SecRel_2:
+    return COFF::IMAGE_REL_ARM_SECTION;
+  case FK_SecRel_4:
+    return COFF::IMAGE_REL_ARM_SECREL;
+  case ARM::fixup_t2_condbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH20T;
+  case ARM::fixup_t2_uncondbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH24T;
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    return COFF::IMAGE_REL_ARM_BLX23T;
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+    return COFF::IMAGE_REL_ARM_MOV32T;
+  }
+}
+
+bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+  return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16;
+}
+}
+
+namespace llvm {
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                             bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
new file mode 100644
index 000000000000..83fa084e60c7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -0,0 +1,47 @@
+//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+                     raw_pwrite_stream &OS)
+      : MCWinCOFFStreamer(C, AB, CE, OS) {}
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
+  void EmitThumbFunc(MCSymbol *Symbol) override;
+};
+
+void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default: llvm_unreachable("not implemented");
+  case MCAF_SyntaxUnified:
+  case MCAF_Code16:
+    break;
+  }
+}
+
+void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+  getAssembler().setIsThumbFunc(Symbol);
+}
+}
+
+MCStreamer *llvm::createARMWinCOFFStreamer(
+    MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+    MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) {
+  auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS);
+  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+  return S;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 000000000000..744761bcddb8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,395 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiple and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mlx-expansion"
+
+static cl::opt<bool>
+ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
+
+namespace {
+  struct MLxExpansion : public MachineFunctionPass {
+    static char ID;
+    MLxExpansion() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override {
+      return "ARM MLA / MLS expansion pass";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool isLikeA9;
+    bool isSwift;
+    unsigned MIIdx;
+    MachineInstr* LastMIs[4];
+    SmallPtrSet<MachineInstr*, 4> IgnoreStall;
+
+    void clearStack();
+    void pushStack(MachineInstr *MI);
+    MachineInstr *getAccDefMI(MachineInstr *MI) const;
+    unsigned getDefReg(MachineInstr *MI) const;
+    bool hasLoopHazard(MachineInstr *MI) const;
+    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI);
+    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                unsigned MulOpc, unsigned AddSubOpc,
+                                bool NegAcc, bool HasLane);
+    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+  };
+  char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() {
+  std::fill(LastMIs, LastMIs + 4, nullptr);
+  MIIdx = 0;
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) {
+  LastMIs[MIIdx] = MI;
+  if (++MIIdx == 4)
+    MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
+  // Look past COPY and INSERT_SUBREG instructions to find the
+  // real definition MI. This is important for _sfp instructions.
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+    if (DefMI->getParent() != MBB)
+      break;
+    if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+    break;
+  }
+  return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return Reg;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+  if (UseMI->getParent() != MBB)
+    return Reg;
+
+  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI->hasOneNonDBGUse(Reg))
+      return Reg;
+    UseMI = &*MRI->use_instr_nodbg_begin(Reg);
+    if (UseMI->getParent() != MBB)
+      return Reg;
+  }
+
+  return Reg;
+}
+
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return false;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+outer_continue:
+    if (DefMI->getParent() != MBB)
+      break;
+
+    if (DefMI->isPHI()) {
+      for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+        if (DefMI->getOperand(i + 1).getMBB() == MBB) {
+          unsigned SrcReg = DefMI->getOperand(i).getReg();
+          if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+            DefMI = MRI->getVRegDef(SrcReg);
+            goto outer_continue;
+          }
+        }
+      }
+    } else if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+
+    break;
+  }
+
+  return DefMI == MI;
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
+}
+
+static bool isFpMulInstruction(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::VMULS:
+  case ARM::VMULfd:
+  case ARM::VMULfq:
+  case ARM::VMULD:
+  case ARM::VMULslfd:
+  case ARM::VMULslfq:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExapnd)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    IgnoreStall.insert(DefMI);
+    return true;
+  }
+
+  // On Swift, we mostly care about hazards from multiplication instructions
+  // writing the accumulator and the pipelining of loop iterations by out-of-
+  // order execution. 
+  if (isSwift)
+    return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
+  if (IgnoreStall.count(MI))
+    return false;
+
+  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions, if *most* of them can cause hazards,
+  // then the scheduler can't *fix* this, we'd better break up the VMLA.
+  unsigned Limit1 = isLikeA9 ? 1 : 4;
+  unsigned Limit2 = isLikeA9 ? 1 : 4;
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+      if (i <= Limit1)
+        return true;
+    }
+
+    // Look for VMLx RAW hazard.
+    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                     unsigned MulOpc, unsigned AddSubOpc,
+                                     bool NegAcc, bool HasLane) {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  bool DstDead = MI->getOperand(0).isDead();
+  unsigned AccReg = MI->getOperand(1).getReg();
+  unsigned Src1Reg = MI->getOperand(2).getReg();
+  unsigned Src2Reg = MI->getOperand(3).getReg();
+  bool Src1Kill = MI->getOperand(2).isKill();
+  bool Src2Kill = MI->getOperand(3).isKill();
+  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
+  unsigned NextOp = HasLane ? 5 : 4;
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
+  unsigned PredReg = MI->getOperand(++NextOp).getReg();
+
+  const MCInstrDesc &MCID1 = TII->get(MulOpc);
+  const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  unsigned TmpReg = MRI->createVirtualRegister(
+                      TII->getRegClass(MCID1, 0, TRI, MF));
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
+    .addReg(Src1Reg, getKillRegState(Src1Kill))
+    .addReg(Src2Reg, getKillRegState(Src2Kill));
+  if (HasLane)
+    MIB.addImm(LaneImm);
+  MIB.addImm(Pred).addReg(PredReg);
+
+  MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2)
+    .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
+
+  if (NegAcc) {
+    bool AccKill = MRI->hasOneNonDBGUse(AccReg);
+    MIB.addReg(TmpReg, getKillRegState(true))
+       .addReg(AccReg, getKillRegState(AccKill));
+  } else {
+    MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
+  }
+  MIB.addImm(Pred).addReg(PredReg);
+
+  DEBUG({
+      dbgs() << "Expanding: " << *MI;
+      dbgs() << "  to:\n";
+      MachineBasicBlock::iterator MII = MI;
+      MII = std::prev(MII);
+      MachineInstr &MI2 = *MII;
+      MII = std::prev(MII);
+      MachineInstr &MI1 = *MII;
+      dbgs() << "    " << MI1;
+      dbgs() << "    " << MI2;
+   });
+
+  MI->eraseFromParent();
+  ++NumExpand;
+}
+
+bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  clearStack();
+  IgnoreStall.clear();
+
+  unsigned Skip = 0;
+  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
+  while (MII != E) {
+    MachineInstr *MI = &*MII++;
+
+    if (MI->isPosition() || MI->isImplicitDef() || MI->isCopy())
+      continue;
+
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MI->isBarrier()) {
+      clearStack();
+      Skip = 0;
+      continue;
+    }
+
+    unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+    if (Domain == ARMII::DomainGeneral) {
+      if (++Skip == 2)
+        // Assume dual issues of non-VFP / NEON instructions.
+        pushStack(nullptr);
+    } else {
+      Skip = 0;
+
+      unsigned MulOpc, AddSubOpc;
+      bool NegAcc, HasLane;
+      if (!TII->isFpMLxInstruction(MCID.getOpcode(),
+                                   MulOpc, AddSubOpc, NegAcc, HasLane) ||
+          !FindMLxHazard(MI))
+        pushStack(MI);
+      else {
+        ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  TRI = Fn.getSubtarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  const ARMSubtarget *STI = &Fn.getSubtarget<ARMSubtarget>();
+  if (!STI->expandMLx())
+    return false;
+  isLikeA9 = STI->isLikeA9() || STI->isSwift();
+  isSwift = STI->isSwift();
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : Fn)
+    Modified |= ExpandFPMLxInstructions(MBB);
+
+  return Modified;
+}
+
+FunctionPass *llvm::createMLxExpansionPass() {
+  return new MLxExpansion();
+}
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 000000000000..caa69f8d71b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,41 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheARMLETarget() {
+  static Target TheARMLETarget;
+  return TheARMLETarget;
+}
+Target &llvm::getTheARMBETarget() {
+  static Target TheARMBETarget;
+  return TheARMBETarget;
+}
+Target &llvm::getTheThumbLETarget() {
+  static Target TheThumbLETarget;
+  return TheThumbLETarget;
+}
+Target &llvm::getTheThumbBETarget() {
+  static Target TheThumbBETarget;
+  return TheThumbBETarget;
+}
+
+extern "C" void LLVMInitializeARMTargetInfo() {
+  RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm",
+                                                 "ARM");
+  RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb",
+                                                   "ARM (big endian)");
+
+  RegisterTarget<Triple::thumb, /*HasJIT=*/true> A(getTheThumbLETarget(),
+                                                   "thumb", "Thumb");
+  RegisterTarget<Triple::thumbeb, /*HasJIT=*/true> B(
+      getTheThumbBETarget(), "thumbeb", "Thumb (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 000000000000..9953c61cd89c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,884 @@
+//===-- Thumb1FrameLowering.cpp - Thumb1 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+Thumb1FrameLowering::Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {}
+
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned CFSize = MFI.getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+    return false;
+
+  return !MFI.hasVarSizedObjects();
+}
+
+static void emitSPUpdate(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         const TargetInstrInfo &TII, const DebugLoc &dl,
+                         const ThumbRegisterInfo &MRI, int NumBytes,
+                         unsigned MIFlags = MachineInstr::NoFlags) {
+  emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, MIFlags);
+}
+
+
+MachineBasicBlock::iterator Thumb1FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr &Old = *I;
+    DebugLoc dl = Old.getDebugLoc();
+    unsigned Amount = Old.getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old.getOpcode();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+      } else {
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+      }
+    }
+  }
+  return MBB.erase(I);
+}
+
+void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned NumBytes = MFI.getStackSize();
+  assert(NumBytes >= ArgRegsSaveSize &&
+         "ArgRegsSaveSize is included in NumBytes");
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+  
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned BasePtr = RegInfo->getBaseRegister();
+  int CFAOffset = 0;
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI.setStackSize(NumBytes);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (ArgRegsSaveSize) {
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                 MachineInstr::FrameSetup);
+    CFAOffset -= ArgRegsSaveSize;
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0) {
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
+                   MachineInstr::FrameSetup);
+      CFAOffset -= NumBytes - ArgRegsSaveSize;
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (STI.splitFramePushPop(MF)) {
+        GPRCS2Size += 4;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      GPRCS1Size += 4;
+      break;
+    default:
+      DPRCSSize += 8;
+    }
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - ArgRegsSaveSize - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  bool HasFP = hasFP(MF);
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  NumBytes = DPRCSOffset;
+
+  int FramePtrOffsetInBlock = 0;
+  unsigned adjustedGPRCS1Size = GPRCS1Size;
+  if (GPRCS1Size > 0 && GPRCS2Size == 0 &&
+      tryFoldSPUpdateIntoPushPop(STI, MF, &*std::prev(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    adjustedGPRCS1Size += NumBytes;
+    NumBytes = 0;
+  }
+
+  if (adjustedGPRCS1Size) {
+    CFAOffset -= adjustedGPRCS1Size;
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I) {
+    unsigned Reg = I->getReg();
+    int FI = I->getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12:
+      if (STI.splitFramePushPop(MF))
+        break;
+      // fallthough
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+  }
+
+  // Adjust FP so it point to the stack slot that contains the previous FP.
+  if (HasFP) {
+    FramePtrOffsetInBlock +=
+        MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+      .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
+      .setMIFlags(MachineInstr::FrameSetup));
+    if(FramePtrOffsetInBlock) {
+      CFAOffset += FramePtrOffsetInBlock;
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    } else {
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // Skip past the spilling of r8-r11, which could consist of multiple tPUSH
+  // and tMOVr instructions. We don't need to add any call frame information
+  // in-between these instructions, because they do not modify the high
+  // registers.
+  while (true) {
+    MachineBasicBlock::iterator OldMBBI = MBBI;
+    // Skip a run of tMOVr instructions
+    while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tMOVr)
+      MBBI++;
+    if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+      MBBI++;
+    } else {
+      // We have reached an instruction which is not a push, so the previous
+      // run of tMOVr instructions (which may have been empty) was not part of
+      // the prologue. Reset MBBI back to the last PUSH of the prologue.
+      MBBI = OldMBBI;
+      break;
+    }
+  }
+
+  // Emit call frame information for the callee-saved high registers.
+  for (auto &I : CSI) {
+    unsigned Reg = I.getReg();
+    int FI = I.getFrameIdx();
+    switch (Reg) {
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+    case ARM::R12: {
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), MFI.getObjectOffset(FI)));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                 MachineInstr::FrameSetup);
+    if (!HasFP) {
+      CFAOffset -= NumBytes;
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (STI.isTargetELF() && HasFP)
+    MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
+                            AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // Thumb1 does not currently support dynamic stack realignment.  Report a
+  // fatal error rather then silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+      report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+                   .addReg(ARM::SP));
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI.hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
+  if (MI.getOpcode() == ARM::tLDRspi && MI.getOperand(1).isFI() &&
+      isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs))
+    return true;
+  else if (MI.getOpcode() == ARM::tPOP) {
+    return true;
+  } else if (MI.getOpcode() == ARM::tMOVr) {
+    unsigned Dst = MI.getOperand(0).getReg();
+    unsigned Src = MI.getOperand(1).getReg();
+    return ((ARM::tGPRRegClass.contains(Src) || Src == ARM::LR) &&
+            ARM::hGPRRegClass.contains(Dst));
+  }
+  return false;
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  int NumBytes = (int)MFI.getStackSize();
+  assert((unsigned)NumBytes >= ArgRegsSaveSize &&
+         "ArgRegsSaveSize is included in NumBytes");
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes - ArgRegsSaveSize != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(*MBBI, CSRegs));
+      if (!isCSRestore(*MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize() +
+                 ArgRegsSaveSize);
+
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot, the target is ELF and the function has FP, or
+      // the target uses var sized objects.
+      if (NumBytes) {
+        assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
+               "No scratch register to restore SP from FP!");
+        emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                  TII, *RegInfo);
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(ARM::R4));
+      } else
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(FramePtr));
+    } else {
+      if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
+          &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
+        MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
+        if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
+          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
+        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+    }
+  }
+
+  if (needPopSpecialFixUp(MF)) {
+    bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true);
+    (void)Done;
+    assert(Done && "Emission of the special fixup failed!?");
+  }
+}
+
+bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  if (!needPopSpecialFixUp(*MBB.getParent()))
+    return true;
+
+  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+  return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false);
+}
+
+bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const {
+  ARMFunctionInfo *AFI =
+      const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>();
+  if (AFI->getArgRegsSaveSize())
+    return true;
+
+  // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up.
+  for (const CalleeSavedInfo &CSI : MF.getFrameInfo().getCalleeSavedInfo())
+    if (CSI.getReg() == ARM::LR)
+      return true;
+
+  return false;
+}
+
+bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
+                                              bool DoIt) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  // If MBBI is a return instruction, or is a tPOP followed by a return
+  // instruction in the successor BB, we may be able to directly restore
+  // LR in the PC.
+  // This is only possible with v5T ops (v4T can't change the Thumb bit via
+  // a POP PC instruction), and only if we do not need to emit any SP update.
+  // Otherwise, we need a temporary register to pop the value
+  // and copy that value into LR.
+  auto MBBI = MBB.getFirstTerminator();
+  bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+  if (CanRestoreDirectly) {
+    if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB)
+      CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+                            MBBI->getOpcode() == ARM::tPOP_RET);
+    else {
+      auto MBBI_prev = MBBI;
+      MBBI_prev--;
+      assert(MBBI_prev->getOpcode() == ARM::tPOP);
+      assert(MBB.succ_size() == 1);
+      if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+        MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET.
+      else
+        CanRestoreDirectly = false;
+    }
+  }
+
+  if (CanRestoreDirectly) {
+    if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+      return true;
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+    // Copy implicit ops and popped registers, if any.
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
+        MIB.addOperand(MO);
+    MIB.addReg(ARM::PC, RegState::Define);
+    // Erase the old instruction (tBX_RET or tPOP).
+    MBB.erase(MBBI);
+    return true;
+  }
+
+  // Look for a temporary register to use.
+  // First, compute the liveness information.
+  LivePhysRegs UsedRegs(STI.getRegisterInfo());
+  UsedRegs.addLiveOuts(MBB);
+  // The semantic of pristines changed recently and now,
+  // the callee-saved registers that are touched in the function
+  // are not part of the pristines set anymore.
+  // Add those callee-saved now.
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    UsedRegs.addReg(CSRegs[i]);
+
+  DebugLoc dl = DebugLoc();
+  if (MBBI != MBB.end()) {
+    dl = MBBI->getDebugLoc();
+    auto InstUpToMBBI = MBB.end();
+    while (InstUpToMBBI != MBBI)
+      // The pre-decrement is on purpose here.
+      // We want to have the liveness right before MBBI.
+      UsedRegs.stepBackward(*--InstUpToMBBI);
+  }
+
+  // Look for a register that can be directly use in the POP.
+  unsigned PopReg = 0;
+  // And some temporary register, just in case.
+  unsigned TemporaryReg = 0;
+  BitVector PopFriendly =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+  assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
+  // Rebuild the GPRs from the high registers because they are removed
+  // form the GPR reg class for thumb1.
+  BitVector GPRsNoLRSP =
+      TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+  GPRsNoLRSP |= PopFriendly;
+  GPRsNoLRSP.reset(ARM::LR);
+  GPRsNoLRSP.reset(ARM::SP);
+  GPRsNoLRSP.reset(ARM::PC);
+  for (int Register = GPRsNoLRSP.find_first(); Register != -1;
+       Register = GPRsNoLRSP.find_next(Register)) {
+    if (!UsedRegs.contains(Register)) {
+      // Remember the first pop-friendly register and exit.
+      if (PopFriendly.test(Register)) {
+        PopReg = Register;
+        TemporaryReg = 0;
+        break;
+      }
+      // Otherwise, remember that the register will be available to
+      // save a pop-friendly register.
+      TemporaryReg = Register;
+    }
+  }
+
+  if (!DoIt && !PopReg && !TemporaryReg)
+    return false;
+
+  assert((PopReg || TemporaryReg) && "Cannot get LR");
+
+  if (TemporaryReg) {
+    assert(!PopReg && "Unnecessary MOV is about to be inserted");
+    PopReg = PopFriendly.find_first();
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                       .addReg(TemporaryReg, RegState::Define)
+                       .addReg(PopReg, RegState::Kill));
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
+    // We couldn't use the direct restoration above, so
+    // perform the opposite conversion: tPOP_RET to tPOP.
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+    bool Popped = false;
+    for (auto MO: MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+          MO.getReg() != ARM::PC) {
+        MIB.addOperand(MO);
+        if (!MO.isImplicit())
+          Popped = true;
+      }
+    // Is there anything left to pop?
+    if (!Popped)
+      MBB.erase(MIB.getInstr());
+    // Erase the old instruction.
+    MBB.erase(MBBI);
+    MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+  }
+
+  assert(PopReg && "Do not know how to get LR");
+  AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(PopReg, RegState::Define);
+
+  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+  AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                     .addReg(ARM::LR, RegState::Define)
+                     .addReg(PopReg, RegState::Kill));
+
+  if (TemporaryReg)
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+                       .addReg(PopReg, RegState::Define)
+                       .addReg(TemporaryReg, RegState::Kill));
+
+  return true;
+}
+
+// Return the first iteraror after CurrentReg which is present in EnabledRegs,
+// or OrderEnd if no further registers are in that set. This does not advance
+// the iterator fiorst, so returns CurrentReg if it is in EnabledRegs.
+template <unsigned SetSize>
+static const unsigned *
+findNextOrderedReg(const unsigned *CurrentReg,
+                   SmallSet<unsigned, SetSize> &EnabledRegs,
+                   const unsigned *OrderEnd) {
+  while (CurrentReg != OrderEnd && !EnabledRegs.count(*CurrentReg))
+    ++CurrentReg;
+  return CurrentReg;
+}
+
+bool Thumb1FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+
+  SmallSet<unsigned, 9> LoRegsToSave; // r0-r7, lr
+  SmallSet<unsigned, 4> HiRegsToSave; // r8-r11
+  SmallSet<unsigned, 9> CopyRegs; // Registers which can be used after pushing
+                           // LoRegs for saving HiRegs.
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+
+    if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
+      LoRegsToSave.insert(Reg);
+    } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) {
+      HiRegsToSave.insert(Reg);
+    } else {
+      llvm_unreachable("callee-saved register of unexpected class");
+    }
+
+    if ((ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) &&
+        !MF.getRegInfo().isLiveIn(Reg) &&
+        !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF)))
+      CopyRegs.insert(Reg);
+  }
+
+  // Unused argument registers can be used for the high register saving.
+  for (unsigned ArgReg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3})
+    if (!MF.getRegInfo().isLiveIn(ArgReg))
+      CopyRegs.insert(ArgReg);
+
+  // Push the low registers and lr
+  if (!LoRegsToSave.empty()) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
+    AddDefaultPred(MIB);
+    for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
+      if (LoRegsToSave.count(Reg)) {
+        bool isKill = !MF.getRegInfo().isLiveIn(Reg);
+        if (isKill)
+          MBB.addLiveIn(Reg);
+
+        MIB.addReg(Reg, getKillRegState(isKill));
+      }
+    }
+    MIB.setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // Push the high registers. There are no store instructions that can access
+  // these registers directly, so we have to move them to low registers, and
+  // push them. This might take multiple pushes, as it is possible for there to
+  // be fewer low registers available than high registers which need saving.
+
+  // These are in reverse order so that in the case where we need to use
+  // multiple PUSH instructions, the order of the registers on the stack still
+  // matches the unwind info. They need to be swicthed back to ascending order
+  // before adding to the PUSH instruction.
+  static const unsigned AllCopyRegs[] = {ARM::LR, ARM::R7, ARM::R6,
+                                         ARM::R5, ARM::R4, ARM::R3,
+                                         ARM::R2, ARM::R1, ARM::R0};
+  static const unsigned AllHighRegs[] = {ARM::R11, ARM::R10, ARM::R9, ARM::R8};
+
+  const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
+  const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
+
+  // Find the first register to save.
+  const unsigned *HiRegToSave = findNextOrderedReg(
+      std::begin(AllHighRegs), HiRegsToSave, AllHighRegsEnd);
+
+  while (HiRegToSave != AllHighRegsEnd) {
+    // Find the first low register to use.
+    const unsigned *CopyReg =
+        findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
+
+    // Create the PUSH, but don't insert it yet (the MOVs need to come first).
+    MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH));
+    AddDefaultPred(PushMIB);
+
+    SmallVector<unsigned, 4> RegsToPush;
+    while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
+      if (HiRegsToSave.count(*HiRegToSave)) {
+        bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave);
+        if (isKill)
+          MBB.addLiveIn(*HiRegToSave);
+
+        // Emit a MOV from the high reg to the low reg.
+        MachineInstrBuilder MIB =
+            BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
+        MIB.addReg(*CopyReg, RegState::Define);
+        MIB.addReg(*HiRegToSave, getKillRegState(isKill));
+        AddDefaultPred(MIB);
+
+        // Record the register that must be added to the PUSH.
+        RegsToPush.push_back(*CopyReg);
+
+        CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
+        HiRegToSave =
+            findNextOrderedReg(++HiRegToSave, HiRegsToSave, AllHighRegsEnd);
+      }
+    }
+
+    // Add the low registers to the PUSH, in ascending order.
+    for (unsigned Reg : reverse(RegsToPush))
+      PushMIB.addReg(Reg, RegState::Kill);
+
+    // Insert the PUSH instruction after the MOVs.
+    MBB.insert(MI, PushMIB);
+  }
+
+  return true;
+}
+
+bool Thumb1FrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+
+  bool isVarArg = AFI->getArgRegsSaveSize() > 0;
+  DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+
+  SmallSet<unsigned, 9> LoRegsToRestore;
+  SmallSet<unsigned, 4> HiRegsToRestore;
+  // Low registers (r0-r7) which can be used to restore the high registers.
+  SmallSet<unsigned, 9> CopyRegs;
+
+  for (CalleeSavedInfo I : CSI) {
+    unsigned Reg = I.getReg();
+
+    if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
+      LoRegsToRestore.insert(Reg);
+    } else if (ARM::hGPRRegClass.contains(Reg) && Reg != ARM::LR) {
+      HiRegsToRestore.insert(Reg);
+    } else {
+      llvm_unreachable("callee-saved register of unexpected class");
+    }
+
+    // If this is a low register not used as the frame pointer, we may want to
+    // use it for restoring the high registers.
+    if ((ARM::tGPRRegClass.contains(Reg)) &&
+        !(hasFP(MF) && Reg == RegInfo->getFrameRegister(MF)))
+      CopyRegs.insert(Reg);
+  }
+
+  // If this is a return block, we may be able to use some unused return value
+  // registers for restoring the high regs.
+  auto Terminator = MBB.getFirstTerminator();
+  if (Terminator != MBB.end() && Terminator->getOpcode() == ARM::tBX_RET) {
+    CopyRegs.insert(ARM::R0);
+    CopyRegs.insert(ARM::R1);
+    CopyRegs.insert(ARM::R2);
+    CopyRegs.insert(ARM::R3);
+    for (auto Op : Terminator->implicit_operands()) {
+      if (Op.isReg())
+        CopyRegs.erase(Op.getReg());
+    }
+  }
+
+  static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+                                         ARM::R4, ARM::R5, ARM::R6, ARM::R7};
+  static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11};
+
+  const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs);
+  const unsigned *AllHighRegsEnd = std::end(AllHighRegs);
+
+  // Find the first register to restore.
+  auto HiRegToRestore = findNextOrderedReg(std::begin(AllHighRegs),
+                                           HiRegsToRestore, AllHighRegsEnd);
+
+  while (HiRegToRestore != AllHighRegsEnd) {
+    assert(!CopyRegs.empty());
+    // Find the first low register to use.
+    auto CopyReg =
+        findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
+
+    // Create the POP instruction.
+    MachineInstrBuilder PopMIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPOP));
+    AddDefaultPred(PopMIB);
+
+    while (HiRegToRestore != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
+      // Add the low register to the POP.
+      PopMIB.addReg(*CopyReg, RegState::Define);
+
+      // Create the MOV from low to high register.
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
+      MIB.addReg(*HiRegToRestore, RegState::Define);
+      MIB.addReg(*CopyReg, RegState::Kill);
+      AddDefaultPred(MIB);
+
+      CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
+      HiRegToRestore =
+          findNextOrderedReg(++HiRegToRestore, HiRegsToRestore, AllHighRegsEnd);
+    }
+  }
+
+
+
+
+  MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
+  AddDefaultPred(MIB);
+
+  bool NeedsPop = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+
+    // High registers (excluding lr) have already been dealt with
+    if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
+      continue;
+
+    if (Reg == ARM::LR) {
+      if (MBB.succ_empty()) {
+        // Special epilogue for vararg functions. See emitEpilogue
+        if (isVarArg)
+          continue;
+        // ARMv4T requires BX, see emitEpilogue
+        if (!STI.hasV5TOps())
+          continue;
+        Reg = ARM::PC;
+        (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+        if (MI != MBB.end())
+          MIB.copyImplicitOps(*MI);
+        MI = MBB.erase(MI);
+      } else
+        // LR may only be popped into PC, as part of return sequence.
+        // If this isn't the return sequence, we'll need emitPopSpecialFixUp
+        // to restore LR the hard way.
+        continue;
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NeedsPop = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NeedsPop)
+    MBB.insert(MI, &*MIB);
+  else
+    MF.DeleteMachineInstr(MIB);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
new file mode 100644
index 000000000000..9de1ba1d7009
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -0,0 +1,93 @@
+//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
+
+#include "ARMFrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class Thumb1FrameLowering : public ARMFrameLowering {
+public:
+  explicit Thumb1FrameLowering(const ARMSubtarget &sti);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+
+  /// Check whether or not the given \p MBB can be used as a epilogue
+  /// for the target.
+  /// The epilogue will be inserted before the first terminator of that block.
+  /// This method is used by the shrink-wrapping pass to decide if
+  /// \p MBB will be correctly handled by the target.
+  bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+  /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps.
+  bool enableShrinkWrapping(const MachineFunction &MF) const override {
+    return false;
+  }
+
+private:
+  /// Check if the frame lowering of \p MF needs a special fixup
+  /// code sequence for the epilogue.
+  /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+  /// to LR, and we can't pop the value directly to the PC when
+  /// we need to update the SP after popping the value. So instead
+  /// we have to emit:
+  ///   POP {r3}
+  ///   ADD sp, #offset
+  ///   BX r3
+  /// If this would clobber a return value, then generate this sequence instead:
+  ///   MOV ip, r3
+  ///   POP {r3}
+  ///   ADD sp, #offset
+  ///   MOV lr, r3
+  ///   MOV r3, ip
+  ///   BX lr
+  bool needPopSpecialFixUp(const MachineFunction &MF) const;
+
+  /// Emit the special fixup code sequence for the epilogue.
+  /// \see needPopSpecialFixUp for more details.
+  /// \p DoIt, tells this method whether or not to actually insert
+  /// the code sequence in \p MBB. I.e., when \p DoIt is false,
+  /// \p MBB is left untouched.
+  /// \returns For \p DoIt == true: True when the emission succeeded
+  /// false otherwise. For \p DoIt == false: True when the emission
+  /// would have been possible, false otherwise.
+  bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 000000000000..4b4fbaab28d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,129 @@
+//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI), RI() {}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tMOVr);
+  NopInst.addOperand(MCOperand::createReg(ARM::R8));
+  NopInst.addOperand(MCOperand::createReg(ARM::R8));
+  NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::createReg(0));
+}
+
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  return 0;
+}
+
+void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  // Need to check the arch.
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &st = MF.getSubtarget<ARMSubtarget>();
+
+  assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
+         "Thumb1 can only copy GPR registers");
+
+  if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg)
+      || !ARM::tGPRRegClass.contains(DestReg))
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc)));
+  else {
+    // FIXME: The performance consequences of this are going to be atrocious.
+    // Some things to try that should be better:
+    //   * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
+    //   * 'movs $dst, $src' if cpsr isn't live
+    // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
+
+    // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPOP)))
+      .addReg(DestReg, getDefRegState(true));
+  }
+}
+
+void Thumb1InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           isARMLowRegister(SrcReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+       isARMLowRegister(SrcReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+void Thumb1InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+           isARMLowRegister(DestReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+       isARMLowRegister(DestReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+void Thumb1InstrInfo::expandLoadStackGuard(
+    MachineBasicBlock::iterator MI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetMachine &TM = MF.getTarget();
+  if (TM.isPositionIndependent())
+    expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi);
+  else
+    expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
new file mode 100644
index 000000000000..931914ad2799
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -0,0 +1,61 @@
+//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ThumbRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb1InstrInfo : public ARMBaseInstrInfo {
+  ThumbRegisterInfo RI;
+public:
+  explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 000000000000..d01fc8c40ddf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,304 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "thumb2-it"
+
+STATISTIC(NumITs,        "Number of IT blocks inserted");
+STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
+
+namespace {
+  class Thumb2ITBlockPass : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
+    bool restrictIT;
+    const Thumb2InstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    ARMFunctionInfo *AFI;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "Thumb IT blocks insertion pass";
+    }
+
+  private:
+    bool MoveCopyOutOfITBlock(MachineInstr *MI,
+                              ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                              SmallSet<unsigned, 4> &Defs,
+                              SmallSet<unsigned, 4> &Uses);
+    bool InsertITInstructions(MachineBasicBlock &MBB);
+  };
+  char Thumb2ITBlockPass::ID = 0;
+}
+
+/// TrackDefUses - Tracking what registers are being defined and used by
+/// instructions in the IT block. This also tracks "dependencies", i.e. uses
+/// in the IT block that are defined before the IT instruction.
+static void TrackDefUses(MachineInstr *MI,
+                         SmallSet<unsigned, 4> &Defs,
+                         SmallSet<unsigned, 4> &Uses,
+                         const TargetRegisterInfo *TRI) {
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallVector<unsigned, 4> LocalUses;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
+      continue;
+    if (MO.isUse())
+      LocalUses.push_back(Reg);
+    else
+      LocalDefs.push_back(Reg);
+  }
+
+  for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+    unsigned Reg = LocalUses[i];
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
+      Uses.insert(*Subreg);
+  }
+
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Reg = LocalDefs[i];
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
+      Defs.insert(*Subreg);
+    if (Reg == ARM::CPSR)
+      continue;
+  }
+}
+
+/// Clear kill flags for any uses in the given set.  This will likely
+/// conservatively remove more kill flags than are necessary, but removing them
+/// is safer than incorrect kill flags remaining on instructions.
+static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) {
+  for (MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg() || MO.isDef() || !MO.isKill())
+      continue;
+    if (!Uses.count(MO.getReg()))
+      continue;
+    MO.setIsKill(false);
+  }
+}
+
+static bool isCopy(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case ARM::MOVr:
+  case ARM::MOVr_TC:
+  case ARM::tMOVr:
+  case ARM::t2MOVr:
+    return true;
+  }
+}
+
+bool
+Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
+                                      ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                                        SmallSet<unsigned, 4> &Defs,
+                                        SmallSet<unsigned, 4> &Uses) {
+  if (!isCopy(MI))
+    return false;
+  // llvm models select's as two-address instructions. That means a copy
+  // is inserted before a t2MOVccr, etc. If the copy is scheduled in
+  // between selects we would end up creating multiple IT blocks.
+  assert(MI->getOperand(0).getSubReg() == 0 &&
+         MI->getOperand(1).getSubReg() == 0 &&
+         "Sub-register indices still around?");
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+
+  // First check if it's safe to move it.
+  if (Uses.count(DstReg) || Defs.count(SrcReg))
+    return false;
+
+  // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+  // if we have:
+  //
+  //   movs  r1, r1
+  //   rsb   r1, 0
+  //   movs  r2, r2
+  //   rsb   r2, 0
+  //
+  // we don't want this to be converted to:
+  //
+  //   movs  r1, r1
+  //   movs  r2, r2
+  //   itt   mi
+  //   rsb   r1, 0
+  //   rsb   r2, 0
+  //
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MI->hasOptionalDef() &&
+      MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR)
+    return false;
+
+  // Then peek at the next instruction to see if it's predicated on CC or OCC.
+  // If not, then there is nothing to be gained by moving the copy.
+  MachineBasicBlock::iterator I = MI; ++I;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  while (I != E && I->isDebugValue())
+    ++I;
+  if (I != E) {
+    unsigned NPredReg = 0;
+    ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
+    if (NCC == CC || NCC == OCC)
+      return true;
+  }
+  return false;
+}
+
+bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  SmallSet<unsigned, 4> Defs;
+  SmallSet<unsigned, 4> Uses;
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineInstr *MI = &*MBBI;
+    DebugLoc dl = MI->getDebugLoc();
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
+    if (CC == ARMCC::AL) {
+      ++MBBI;
+      continue;
+    }
+
+    Defs.clear();
+    Uses.clear();
+    TrackDefUses(MI, Defs, Uses, TRI);
+
+    // Insert an IT instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+      .addImm(CC);
+
+    // Add implicit use of ITSTATE to IT block instructions.
+    MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                             true/*isImp*/, false/*isKill*/));
+
+    MachineInstr *LastITMI = MI;
+    MachineBasicBlock::iterator InsertPos = MIB.getInstr();
+    ++MBBI;
+
+    // Form IT block.
+    ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+    unsigned Mask = 0, Pos = 3;
+
+    // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+    // is set: skip the loop
+    if (!restrictIT) {
+      // Branches, including tricky ones like LDM_RET, need to end an IT
+      // block so check the instruction we just put in the block.
+      for (; MBBI != E && Pos &&
+             (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+        if (MBBI->isDebugValue())
+          continue;
+
+        MachineInstr *NMI = &*MBBI;
+        MI = NMI;
+
+        unsigned NPredReg = 0;
+        ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
+        if (NCC == CC || NCC == OCC) {
+          Mask |= (NCC & 1) << Pos;
+          // Add implicit use of ITSTATE.
+          NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                                 true/*isImp*/, false/*isKill*/));
+          LastITMI = NMI;
+        } else {
+          if (NCC == ARMCC::AL &&
+              MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+            --MBBI;
+            MBB.remove(NMI);
+            MBB.insert(InsertPos, NMI);
+            ClearKillFlags(MI, Uses);
+            ++NumMovedInsts;
+            continue;
+          }
+          break;
+        }
+        TrackDefUses(NMI, Defs, Uses, TRI);
+        --Pos;
+      }
+    }
+
+    // Finalize IT mask.
+    Mask |= (1 << Pos);
+    // Tag along (firstcond[0] << 4) with the mask.
+    Mask |= (CC & 1) << 4;
+    MIB.addImm(Mask);
+
+    // Last instruction in IT block kills ITSTATE.
+    LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill();
+
+    // Finalize the bundle.
+    finalizeBundle(MBB, InsertPos.getInstrIterator(),
+                   ++LastITMI->getIterator());
+
+    Modified = true;
+    ++NumITs;
+  }
+
+  return Modified;
+}
+
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+  const ARMSubtarget &STI =
+      static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+  if (!STI.isThumb2())
+    return false;
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+  TRI = STI.getRegisterInfo();
+  restrictIT = STI.restrictIT();
+
+  if (!AFI->isThumbFunction())
+    return false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) {
+    MachineBasicBlock &MBB = *MFI;
+    ++MFI;
+    Modified |= InsertITInstructions(MBB);
+  }
+
+  if (Modified)
+    AFI->setHasITBlocks(true);
+
+  return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+  return new Thumb2ITBlockPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
new file mode 100644
index 000000000000..1c731d669eda
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -0,0 +1,643 @@
+//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2InstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
+           cl::desc("Use old-style Thumb2 if-conversion heuristics"),
+           cl::init(false));
+
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
+    : ARMBaseInstrInfo(STI), RI() {}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tHINT);
+  NopInst.addOperand(MCOperand::createImm(0));
+  NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::createReg(0));
+}
+
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  // FIXME
+  return 0;
+}
+
+void
+Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                                         MachineBasicBlock *NewDest) const {
+  MachineBasicBlock *MBB = Tail->getParent();
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  if (!AFI->hasITBlocks() || Tail->isBranch()) {
+    TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+    return;
+  }
+
+  // If the first instruction of Tail is predicated, we may have to update
+  // the IT instruction.
+  unsigned PredReg = 0;
+  ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
+  MachineBasicBlock::iterator MBBI = Tail;
+  if (CC != ARMCC::AL)
+    // Expecting at least the t2IT instruction before it.
+    --MBBI;
+
+  // Actually replace the tail.
+  TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+
+  // Fix up IT.
+  if (CC != ARMCC::AL) {
+    MachineBasicBlock::iterator E = MBB->begin();
+    unsigned Count = 4; // At most 4 instructions in an IT block.
+    while (Count && MBBI != E) {
+      if (MBBI->isDebugValue()) {
+        --MBBI;
+        continue;
+      }
+      if (MBBI->getOpcode() == ARM::t2IT) {
+        unsigned Mask = MBBI->getOperand(1).getImm();
+        if (Count == 4)
+          MBBI->eraseFromParent();
+        else {
+          unsigned MaskOn = 1 << Count;
+          unsigned MaskOff = ~(MaskOn - 1);
+          MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn);
+        }
+        return;
+      }
+      --MBBI;
+      --Count;
+    }
+
+    // Ctrl flow can reach here if branch folding is run before IT block
+    // formation pass.
+  }
+}
+
+bool
+Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI) const {
+  while (MBBI->isDebugValue()) {
+    ++MBBI;
+    if (MBBI == MBB.end())
+      return false;
+  }
+
+  unsigned PredReg = 0;
+  return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
+}
+
+void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  // Handle SPR, DPR, and QPR copies.
+  if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
+    return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
+
+  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)));
+}
+
+void Thumb2InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    // Thumb2 STRD expects its dest-registers to be in rGPR. Not a problem for
+    // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+    // otherwise).
+    if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      MachineRegisterInfo *MRI = &MF.getRegInfo();
+      MRI->constrainRegClass(SrcReg, &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+    }
+
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
+    AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+    AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+    MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI);
+}
+
+void Thumb2InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+    // Thumb2 LDRD expects its dest-registers to be in rGPR. Not a problem for
+    // gsub_0, but needs an extra constraint for gsub_1 (which could be sp
+    // otherwise).
+    if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+      MachineRegisterInfo *MRI = &MF.getRegInfo();
+      MRI->constrainRegClass(DestReg,
+                             &ARM::GPRPair_with_gsub_1_in_rGPRRegClass);
+    }
+
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
+    AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+    AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+    MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+    AddDefaultPred(MIB);
+
+    if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+      MIB.addReg(DestReg, RegState::ImplicitDefine);
+    return;
+  }
+
+  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
+}
+
+void Thumb2InstrInfo::expandLoadStackGuard(
+    MachineBasicBlock::iterator MI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  if (MF.getTarget().isPositionIndependent())
+    expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12);
+  else
+    expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12);
+}
+
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator &MBBI,
+                                  const DebugLoc &dl, unsigned DestReg,
+                                  unsigned BaseReg, int NumBytes,
+                                  ARMCC::CondCodes Pred, unsigned PredReg,
+                                  const ARMBaseInstrInfo &TII,
+                                  unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+    return;
+  }
+
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  // If profitable, use a movw or movt to materialize the offset.
+  // FIXME: Use the scavenger to grab a scratch register.
+  if (DestReg != ARM::SP && DestReg != BaseReg &&
+      NumBytes >= 4096 &&
+      ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+    bool Fits = false;
+    if (NumBytes < 65536) {
+      // Use a movw to materialize the 16-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+        .addImm(NumBytes)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    } else if ((NumBytes & 0xffff) == 0) {
+      // Use a movt to materialize the 32-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+        .addReg(DestReg)
+        .addImm(NumBytes >> 16)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    }
+
+    if (Fits) {
+      if (isSub) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+          .addReg(BaseReg)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      } else {
+        // Here we know that DestReg is not SP but we do not
+        // know anything about BaseReg. t2ADDrr is an invalid
+        // instruction is SP is used as the second argument, but
+        // is fine if SP is the first argument. To be sure we
+        // do not generate invalid encoding, put BaseReg first.
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+          .addReg(BaseReg)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      }
+      return;
+    }
+  }
+
+  while (NumBytes) {
+    unsigned ThisVal = NumBytes;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
+        .addReg(BaseReg).setMIFlags(MIFlags));
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    bool HasCCOut = true;
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    } else {
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        HasCCOut = false;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = countLeadingZeros(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    }
+
+    // Build the new ADD / SUB.
+    MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                     .addReg(BaseReg, RegState::Kill)
+                     .addImm(ThisVal)).setMIFlags(MIFlags);
+    if (HasCCOut)
+      AddDefaultCC(MIB);
+
+    BaseReg = DestReg;
+  }
+}
+
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi12:   return ARM::t2LDRi8;
+  case ARM::t2LDRHi12:  return ARM::t2LDRHi8;
+  case ARM::t2LDRBi12:  return ARM::t2LDRBi8;
+  case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+  case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+  case ARM::t2STRi12:   return ARM::t2STRi8;
+  case ARM::t2STRBi12:  return ARM::t2STRBi8;
+  case ARM::t2STRHi12:  return ARM::t2STRHi8;
+  case ARM::t2PLDi12:   return ARM::t2PLDi8;
+
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi8:   return ARM::t2LDRi12;
+  case ARM::t2LDRHi8:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBi8:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+  case ARM::t2STRi8:   return ARM::t2STRi12;
+  case ARM::t2STRBi8:  return ARM::t2STRBi12;
+  case ARM::t2STRHi8:  return ARM::t2STRHi12;
+  case ARM::t2PLDi8:   return ARM::t2PLDi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRs:   return ARM::t2LDRi12;
+  case ARM::t2LDRHs:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBs:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+  case ARM::t2STRs:   return ARM::t2STRi12;
+  case ARM::t2STRBs:  return ARM::t2STRBi12;
+  case ARM::t2STRHs:  return ARM::t2STRHi12;
+  case ARM::t2PLDs:   return ARM::t2PLDi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                               unsigned FrameReg, int &Offset,
+                               const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrModeT2_i12.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
+
+  if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset and remaining explicit predicate operands.
+      do MI.RemoveOperand(FrameRegIdx+1);
+      while (MI.getNumOperands() > FrameRegIdx+1);
+      MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+      AddDefaultPred(MIB);
+      return true;
+    }
+
+    bool HasCCOut = Opcode != ARM::t2ADDri12;
+
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::t2SUBri));
+    } else {
+      MI.setDesc(TII.get(ARM::t2ADDri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Add cc_out operand if the original instruction did not have one.
+      if (!HasCCOut)
+        MI.addOperand(MachineOperand::CreateReg(0, false));
+      Offset = 0;
+      return true;
+    }
+    // Another common case: imm12.
+    if (Offset < 4096 &&
+        (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) {
+      unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+      MI.setDesc(TII.get(NewOpc));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Remove the cc_out operand.
+      if (HasCCOut)
+        MI.RemoveOperand(MI.getNumOperands()-1);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, extract 8 adjacent bits from the immediate into this
+    // t2ADDri/t2SUBri.
+    unsigned RotAmt = countLeadingZeros<unsigned>(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+    // Add cc_out operand if the original instruction did not have one.
+    if (!HasCCOut)
+      MI.addOperand(MachineOperand::CreateReg(0, false));
+
+  } else {
+
+    // AddrMode4 and AddrMode6 cannot handle any offset.
+    if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+      return false;
+
+    // AddrModeT2_so cannot handle any offset. If there is no offset
+    // register then we change to an immediate version.
+    unsigned NewOpc = Opcode;
+    if (AddrMode == ARMII::AddrModeT2_so) {
+      unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+      if (OffsetReg != 0) {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        return Offset == 0;
+      }
+
+      MI.RemoveOperand(FrameRegIdx+1);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+      NewOpc = immediateOffsetOpcode(Opcode);
+      AddrMode = ARMII::AddrModeT2_i12;
+    }
+
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+      // i8 supports only negative, and i12 supports only positive, so
+      // based on Offset sign convert Opcode to the appropriate
+      // instruction
+      Offset += MI.getOperand(FrameRegIdx+1).getImm();
+      if (Offset < 0) {
+        NewOpc = negativeOffsetOpcode(Opcode);
+        NumBits = 8;
+        isSub = true;
+        Offset = -Offset;
+      } else {
+        NewOpc = positiveOffsetOpcode(Opcode);
+        NumBits = 12;
+      }
+    } else if (AddrMode == ARMII::AddrMode5) {
+      // VFP address mode.
+      const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+      int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+      if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      Offset += InstrOffs * 4;
+      assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+      if (Offset < 0) {
+        Offset = -Offset;
+        isSub = true;
+      }
+    } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
+      Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+      NumBits = 10; // 8 bits scaled by 4
+      // MCInst operand expects already scaled value.
+      Scale = 1;
+      assert((Offset & 3) == 0 && "Can't encode this offset!");
+    } else {
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    if (NewOpc != Opcode)
+      MI.setDesc(TII.get(NewOpc));
+
+    MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+    // Attempt to fold address computation
+    // Common case: small offset, fits into instruction.
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with fp/sp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode5)
+          // FIXME: Not consistent.
+          ImmedOffset |= 1 << NumBits;
+        else
+          ImmedOffset = -ImmedOffset;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, offset doesn't fit. Pull in what we can to simplify
+    ImmedOffset = ImmedOffset & Mask;
+    if (isSub) {
+      if (AddrMode == ARMII::AddrMode5)
+        // FIXME: Not consistent.
+        ImmedOffset |= 1 << NumBits;
+      else {
+        ImmedOffset = -ImmedOffset;
+        if (ImmedOffset == 0)
+          // Change the opcode back if the encoded offset is zero.
+          MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+      }
+    }
+    ImmOp.ChangeToImmediate(ImmedOffset);
+    Offset &= ~(Mask*Scale);
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
+                                           unsigned &PredReg) {
+  unsigned Opc = MI.getOpcode();
+  if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+    return ARMCC::AL;
+  return getInstrPredicate(MI, PredReg);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 000000000000..15d63300b6a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,74 @@
+//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+
+#include "ARMBaseInstrInfo.h"
+#include "ThumbRegisterInfo.h"
+
+namespace llvm {
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
+
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+  ThumbRegisterInfo RI;
+public:
+  explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const override;
+
+  void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                               MachineBasicBlock *NewDest) const override;
+
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
+};
+
+/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
+/// to llvm::getInstrPredicate except it returns AL for conditional branch
+/// instructions which are "predicated", but are not in IT blocks.
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 000000000000..8208e7e24770
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,1106 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "t2-reduce-size"
+
+STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts,    "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+                                cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+                                     cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+                                     cl::init(-1), cl::Hidden);
+
+namespace {
+  /// ReduceTable - A static table with information on mapping from wide
+  /// opcodes to narrow
+  struct ReduceEntry {
+    uint16_t WideOpc;      // Wide opcode
+    uint16_t NarrowOpc1;   // Narrow opcode to transform to
+    uint16_t NarrowOpc2;   // Narrow opcode when it's two-address
+    uint8_t  Imm1Limit;    // Limit of immediate field (bits)
+    uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
+    unsigned LowRegs1 : 1; // Only possible if low-registers are used
+    unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+    unsigned PredCC1  : 2; // 0 - If predicated, cc is on and vice versa.
+                           // 1 - No cc field.
+                           // 2 - Always set CPSR.
+    unsigned PredCC2  : 2;
+    unsigned PartFlag : 1; // 16-bit instruction does partial flag update
+    unsigned Special  : 1; // Needs to be dealt with specially
+    unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift)
+  };
+
+  static const ReduceEntry ReduceTable[] = {
+  // Wide,        Narrow1,      Narrow2,     imm1,imm2, lo1, lo2, P/C,PF,S,AM
+  { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  0,0, 0,1,0 },
+  { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,   1,   0,  0,1, 0,0,0 },
+  { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  2,2, 0,1,0 },
+  { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,   0,   1,  0,0, 1,0,0 },
+  //FIXME: Disable CMN, as CCodes are backwards from compare expectations
+  //{ ARM::t2CMNrr, ARM::tCMN,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMNzrr, ARM::tCMNz,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,   0,   0,  2,0, 0,1,0 },
+  { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  // FIXME: adr.n immediate offset must be multiple of 4.
+  //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,   0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,0,0 },
+  { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,1,0 },
+  // FIXME: Do we need the 16-bit 'S' variant?
+  { ARM::t2MOVr,ARM::tMOVr,     0,             0,   0,   0,   0,  1,0, 0,0,0 },
+  { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2REV,   ARM::tREV,    0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REV16, ARM::tREV16,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  2,2, 0,0,0 },
+  { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2SXTB,  ARM::tSXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2SXTH,  ARM::tSXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2UXTB,  ARM::tUXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2UXTH,  ARM::tUXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+
+  // FIXME: Clean this up after splitting each Thumb load / store opcode
+  // into multiple ones.
+  { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDR_POST,ARM::tLDMIA_UPD,0,         0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STR_POST,ARM::tSTMIA_UPD,0,         0,   0,   1,   0,  0,0, 0,1,0 },
+
+  { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,   1,   1,  1,1, 0,1,0 },
+  // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent.
+  // tSTMIA_UPD is a change in semantics which can only be used if the base
+  // register is killed. This difference is correctly handled elsewhere.
+  { ARM::t2STMIA, ARM::tSTMIA_UPD, 0,          0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,   1,   1,  1,1, 0,1,0 }
+  };
+
+  class Thumb2SizeReduce : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+
+    const Thumb2InstrInfo *TII;
+    const ARMSubtarget *STI;
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "Thumb2 instruction size reduction pass";
+    }
+
+  private:
+    /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+    DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+    bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
+
+    bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                         bool is2Addr, ARMCC::CondCodes Pred,
+                         bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+    bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                         const ReduceEntry &Entry);
+
+    bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+    /// instruction.
+    bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR,
+                       bool IsSelfLoop);
+
+    /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+    /// non-two-address instruction.
+    bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                        const ReduceEntry &Entry, bool LiveCPSR,
+                        bool IsSelfLoop);
+
+    /// ReduceMI - Attempt to reduce MI, return true on success.
+    bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                  bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceMBB - Reduce width of instructions in the specified basic block.
+    bool ReduceMBB(MachineBasicBlock &MBB);
+
+    bool OptimizeSize;
+    bool MinimizeSize;
+
+    // Last instruction to define CPSR in the current block.
+    MachineInstr *CPSRDef;
+    // Was CPSR last defined by a high latency instruction?
+    // When CPSRDef is null, this refers to CPSR defs in predecessors.
+    bool HighLatencyCPSR;
+
+    struct MBBInfo {
+      // The flags leaving this block have high latency.
+      bool HighLatencyCPSR;
+      // Has this block been visited yet?
+      bool Visited;
+
+      MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+    };
+
+    SmallVector<MBBInfo, 8> BlockInfo;
+
+    std::function<bool(const Function &)> PredicateFtor;
+  };
+  char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
+    : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+  OptimizeSize = MinimizeSize = false;
+  for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+    unsigned FromOpc = ReduceTable[i].WideOpc;
+    if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
+      llvm_unreachable("Duplicated entries?");
+  }
+}
+
+static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
+  for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
+    if (*Regs == ARM::CPSR)
+      return true;
+  return false;
+}
+
+// Check for a likely high-latency flag def.
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+  switch(Def->getOpcode()) {
+  case ARM::FMSTAT:
+  case ARM::tMUL:
+    return true;
+  }
+  return false;
+}
+
+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// the 's' 16-bit instruction partially update CPSR. Abort the
+/// transformation to avoid adding false dependency on last CPSR setting
+/// instruction which hurts the ability for out-of-order execution engine
+/// to do register renaming magic.
+/// This function checks if there is a read-of-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If there
+/// is, then there is no harm done since the instruction cannot be retired
+/// before the CPSR setting instruction anyway.
+/// Note, we are not doing full dependency analysis here for the sake of compile
+/// time. We're not looking for cases like:
+/// r0 = muls ...
+/// r1 = add.w r0, ...
+/// ...
+///    = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// are indirect RAW dependency between the muls and the mul.w
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
+  // Disable the check for -Oz (aka OptimizeForSizeHarder).
+  if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  if (!CPSRDef)
+    // If this BB loops back to itself, conservatively avoid narrowing the
+    // first instruction that does partial flag update.
+    return HighLatencyCPSR || FirstInSelfLoop;
+
+  SmallSet<unsigned, 2> Defs;
+  for (const MachineOperand &MO : CPSRDef->operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (const MachineOperand &MO : Use->operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // If the current CPSR has high latency, try to avoid the false dependency.
+  if (HighLatencyCPSR)
+    return true;
+
+  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+  // of them, so always shrink them when CPSR doesn't have high latency.
+  if (Use->getOpcode() == ARM::t2MOVi ||
+      Use->getOpcode() == ARM::t2MOVi16)
+    return false;
+
+  // No read-after-write dependency. The narrowing will add false dependency.
+  return true;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                                  bool is2Addr, ARMCC::CondCodes Pred,
+                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
+  if ((is2Addr  && Entry.PredCC2 == 0) ||
+      (!is2Addr && Entry.PredCC1 == 0)) {
+    if (Pred == ARMCC::AL) {
+      // Not predicated, must set CPSR.
+      if (!HasCC) {
+        // Original instruction was not setting CPSR, but CPSR is not
+        // currently live anyway. It's ok to set it. The CPSR def is
+        // dead though.
+        if (!LiveCPSR) {
+          HasCC = true;
+          CCDead = true;
+          return true;
+        }
+        return false;
+      }
+    } else {
+      // Predicated, must not set CPSR.
+      if (HasCC)
+        return false;
+    }
+  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
+             (!is2Addr && Entry.PredCC1 == 2)) {
+    /// Old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If old opcode does not implicitly define CPSR, then it's not ok since
+    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    HasCC = true;
+  } else {
+    // 16-bit instruction does not set CPSR.
+    if (HasCC)
+      return false;
+  }
+
+  return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA_UPD);
+  bool isLROk = (Opc == ARM::t2STMDB_UPD);
+  bool isSPOk = isPCOk || isLROk;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    if (isPCOk && Reg == ARM::PC)
+      continue;
+    if (isLROk && Reg == ARM::LR)
+      continue;
+    if (Reg == ARM::SP) {
+      if (isSPOk)
+        continue;
+      if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12))
+        // Special case for these ldr / str with sp as base register.
+        continue;
+    }
+    if (!isARMLowRegister(Reg))
+      return false;
+  }
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const ReduceEntry &Entry) {
+  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+    return false;
+
+  unsigned Scale = 1;
+  bool HasImmOffset = false;
+  bool HasShift = false;
+  bool HasOffReg = true;
+  bool isLdStMul = false;
+  unsigned Opc = Entry.NarrowOpc1;
+  unsigned OpNum = 3; // First 'rest' of operands.
+  uint8_t  ImmLimit = Entry.Imm1Limit;
+
+  switch (Entry.WideOpc) {
+  default:
+    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+  case ARM::t2LDRi12:
+  case ARM::t2STRi12:
+    if (MI->getOperand(1).getReg() == ARM::SP) {
+      Opc = Entry.NarrowOpc2;
+      ImmLimit = Entry.Imm2Limit;
+    }
+
+    Scale = 4;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRBi12:
+  case ARM::t2STRBi12:
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRHi12:
+  case ARM::t2STRHi12:
+    Scale = 2;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRs:
+  case ARM::t2LDRBs:
+  case ARM::t2LDRHs:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSHs:
+  case ARM::t2STRs:
+  case ARM::t2STRBs:
+  case ARM::t2STRHs:
+    HasShift = true;
+    OpNum = 4;
+    break;
+  case ARM::t2LDR_POST:
+  case ARM::t2STR_POST: {
+    if (!MBB.getParent()->getFunction()->optForMinSize())
+      return false;
+
+    if (!MI->hasOneMemOperand() ||
+        (*MI->memoperands_begin())->getAlignment() < 4)
+      return false;
+
+    // We're creating a completely different type of load/store - LDM from LDR.
+    // For this reason we can't reuse the logic at the end of this function; we
+    // have to implement the MI building here.
+    bool IsStore = Entry.WideOpc == ARM::t2STR_POST;
+    unsigned Rt = MI->getOperand(IsStore ? 1 : 0).getReg();
+    unsigned Rn = MI->getOperand(IsStore ? 0 : 1).getReg();
+    unsigned Offset = MI->getOperand(3).getImm();
+    unsigned PredImm = MI->getOperand(4).getImm();
+    unsigned PredReg = MI->getOperand(5).getReg();
+    assert(isARMLowRegister(Rt));
+    assert(isARMLowRegister(Rn));
+
+    if (Offset != 4)
+      return false;
+
+    // Add the 16-bit load / store instruction.
+    DebugLoc dl = MI->getDebugLoc();
+    auto MIB = BuildMI(MBB, MI, dl, TII->get(Entry.NarrowOpc1))
+                   .addReg(Rn, RegState::Define)
+                   .addReg(Rn)
+                   .addImm(PredImm)
+                   .addReg(PredReg)
+                   .addReg(Rt, IsStore ? 0 : RegState::Define);
+
+    // Transfer memoperands.
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+    // Transfer MI flags.
+    MIB.setMIFlags(MI->getFlags());
+
+    // Kill the old instruction.
+    MI->eraseFromBundle();
+    ++NumLdSts;
+    return true;
+  }
+  case ARM::t2LDMIA: {
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    assert(isARMLowRegister(BaseReg));
+
+    // For the non-writeback version (this one), the base register must be
+    // one of the registers being loaded.
+    bool isOK = false;
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg) {
+        isOK = true;
+        break;
+      }
+    }
+
+    if (!isOK)
+      return false;
+
+    OpNum = 0;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2STMIA: {
+    // If the base register is killed, we don't care what its value is after the
+    // instruction, so we can use an updating STMIA.
+    if (!MI->getOperand(0).isKill())
+      return false;
+
+    break;
+  }
+  case ARM::t2LDMIA_RET: {
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg != ARM::SP)
+      return false;
+    Opc = Entry.NarrowOpc2; // tPOP_RET
+    OpNum = 2;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    OpNum = 0;
+
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg == ARM::SP &&
+        (Entry.WideOpc == ARM::t2LDMIA_UPD ||
+         Entry.WideOpc == ARM::t2STMDB_UPD)) {
+      Opc = Entry.NarrowOpc2; // tPOP or tPUSH
+      OpNum = 2;
+    } else if (!isARMLowRegister(BaseReg) ||
+               (Entry.WideOpc != ARM::t2LDMIA_UPD &&
+                Entry.WideOpc != ARM::t2STMIA_UPD)) {
+      return false;
+    }
+
+    isLdStMul = true;
+    break;
+  }
+  }
+
+  unsigned OffsetReg = 0;
+  bool OffsetKill = false;
+  bool OffsetInternal = false;
+  if (HasShift) {
+    OffsetReg  = MI->getOperand(2).getReg();
+    OffsetKill = MI->getOperand(2).isKill();
+    OffsetInternal = MI->getOperand(2).isInternalRead();
+
+    if (MI->getOperand(3).getImm())
+      // Thumb1 addressing mode doesn't support shift.
+      return false;
+  }
+
+  unsigned OffsetImm = 0;
+  if (HasImmOffset) {
+    OffsetImm = MI->getOperand(2).getImm();
+    unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
+
+    if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
+      // Make sure the immediate field fits.
+      return false;
+  }
+
+  // Add the 16-bit load / store instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+
+  // tSTMIA_UPD takes a defining register operand. We've already checked that
+  // the register is killed, so mark it as dead here.
+  if (Entry.WideOpc == ARM::t2STMIA)
+    MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
+
+  if (!isLdStMul) {
+    MIB.addOperand(MI->getOperand(0));
+    MIB.addOperand(MI->getOperand(1));
+
+    if (HasImmOffset)
+      MIB.addImm(OffsetImm / Scale);
+
+    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+    if (HasOffReg)
+      MIB.addReg(OffsetReg, getKillRegState(OffsetKill) |
+                            getInternalReadRegState(OffsetInternal));
+  }
+
+  // Transfer the rest of operands.
+  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumLdSts;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::t2ADDri) {
+    // If the source register is SP, try to reduce to tADDrSPi, otherwise
+    // it's a normal reduce.
+    if (MI->getOperand(1).getReg() != ARM::SP) {
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+        return true;
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    }
+    // Try to reduce to tADDrSPi.
+    unsigned Imm = MI->getOperand(2).getImm();
+    // The immediate must be in range, the destination register must be a low
+    // reg, the predicate must be "always" and the condition flags must not
+    // be being set.
+    if (Imm & 3 || Imm > 1020)
+      return false;
+    if (!isARMLowRegister(MI->getOperand(0).getReg()))
+      return false;
+    if (MI->getOperand(3).getImm() != ARMCC::AL)
+      return false;
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MCID.hasOptionalDef() &&
+        MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
+      return false;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::tADDrSPi))
+      .addOperand(MI->getOperand(0))
+      .addOperand(MI->getOperand(1))
+      .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+    AddDefaultPred(MIB);
+
+    // Transfer MI flags.
+    MIB.setMIFlags(MI->getFlags());
+
+    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " <<*MIB);
+
+    MBB.erase_instr(MI);
+    ++NumNarrows;
+    return true;
+  }
+
+  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+    return false;
+
+  if (MI->mayLoadOrStore())
+    return ReduceLoadStore(MBB, MI, Entry);
+
+  switch (Opc) {
+  default: break;
+  case ARM::t2ADDSri:
+  case ARM::t2ADDSrr: {
+    unsigned PredReg = 0;
+    if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
+      switch (Opc) {
+      default: break;
+      case ARM::t2ADDSri: {
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+          return true;
+        LLVM_FALLTHROUGH;
+      }
+      case ARM::t2ADDSrr:
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+      }
+    }
+    break;
+  }
+  case ARM::t2RSBri:
+  case ARM::t2RSBSri:
+  case ARM::t2SXTB:
+  case ARM::t2SXTH:
+  case ARM::t2UXTB:
+  case ARM::t2UXTH:
+    if (MI->getOperand(2).getImm() == 0)
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2MOVi16:
+    // Can convert only 'pure' immediate operands, not immediates obtained as
+    // globals' addresses.
+    if (MI->getOperand(1).isImm())
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2CMPrr: {
+    // Try to reduce to the lo-reg only version first. Why there are two
+    // versions of the instruction is a mystery.
+    // It would be nice to just have two entries in the master table that
+    // are prioritized, but the table assumes a unique entry for each
+    // source insn opcode. So for now, we hack a local entry record to use.
+    static const ReduceEntry NarrowEntry =
+      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
+      return true;
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+  }
+  }
+  return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing for size.
+    return false;
+
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  // t2MUL is "special". The tied source operand is second, not first.
+  if (MI->getOpcode() == ARM::t2MUL) {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    // Early exit if the regs aren't all low regs.
+    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
+        || !isARMLowRegister(Reg2))
+      return false;
+    if (Reg0 != Reg2) {
+      // If the other operand also isn't the same as the destination, we
+      // can't reduce.
+      if (Reg1 != Reg0)
+        return false;
+      // Try to commute the operands to make it a 2-address instruction.
+      MachineInstr *CommutedMI = TII->commuteInstruction(*MI);
+      if (!CommutedMI)
+        return false;
+    }
+  } else if (Reg0 != Reg1) {
+    // Try to commute the operands to make it a 2-address instruction.
+    unsigned CommOpIdx1 = 1;
+    unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (!TII->findCommutedOpIndices(*MI, CommOpIdx1, CommOpIdx2) ||
+        MI->getOperand(CommOpIdx2).getReg() != Reg0)
+      return false;
+    MachineInstr *CommutedMI =
+        TII->commuteInstruction(*MI, false, CommOpIdx1, CommOpIdx2);
+    if (!CommutedMI)
+      return false;
+  }
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if (SkipPred && MCID.OpInfo[i].isPredicate())
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                                 const ReduceEntry &Entry,
+                                 bool LiveCPSR, bool IsSelfLoop) {
+  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+    return false;
+
+  if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing for size.
+    return false;
+
+  unsigned Limit = ~0U;
+  if (Entry.Imm1Limit)
+    Limit = (1 << Entry.Imm1Limit) - 1;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+    if (MCID.OpInfo[i].isPredicate())
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      unsigned Reg = MO.getReg();
+      if (!Reg || Reg == ARM::CPSR)
+        continue;
+      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+        return false;
+    } else if (MO.isImm() &&
+               !MCID.OpInfo[i].isPredicate()) {
+      if (((unsigned)MO.getImm()) > Limit)
+        return false;
+    }
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if ((MCID.getOpcode() == ARM::t2RSBSri ||
+         MCID.getOpcode() == ARM::t2RSBri ||
+         MCID.getOpcode() == ARM::t2SXTB ||
+         MCID.getOpcode() == ARM::t2SXTH ||
+         MCID.getOpcode() == ARM::t2UXTB ||
+         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
+      // Skip the zero immediate operand, it's now implicit.
+      continue;
+    bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+        continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+      // Skip implicit def of CPSR. Either it's modeled as an optional
+      // def now or it's already an implicit def on the new instruction.
+      continue;
+    MIB.addOperand(MO);
+  }
+  if (!MCID.isPredicable() && NewMCID.isPredicable())
+    AddDefaultPred(MIB);
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumNarrows;
+  return true;
+}
+
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
+  bool HasDef = false;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+
+    DefCPSR = true;
+    if (!MO.isDead())
+      HasDef = true;
+  }
+
+  return HasDef || LiveCPSR;
+}
+
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+    if (MO.isKill()) {
+      LiveCPSR = false;
+      break;
+    }
+  }
+
+  return LiveCPSR;
+}
+
+bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+  if (OPI == ReduceOpcodeMap.end())
+    return false;
+  const ReduceEntry &Entry = ReduceTable[OPI->second];
+
+  // Don't attempt normal reductions on "special" cases for now.
+  if (Entry.Special)
+    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+
+  // Try to transform to a 16-bit two-address instruction.
+  if (Entry.NarrowOpc2 &&
+      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  // Try to transform to a 16-bit non-two-address instruction.
+  if (Entry.NarrowOpc1 &&
+      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  return false;
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  // Yes, CPSR could be livein.
+  bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
+  MachineInstr *BundleMI = nullptr;
+
+  CPSRDef = nullptr;
+  HighLatencyCPSR = false;
+
+  // Check predecessors for the latest CPSRDef.
+  for (auto *Pred : MBB.predecessors()) {
+    const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
+    if (!PInfo.Visited) {
+      // Since blocks are visited in RPO, this must be a back-edge.
+      continue;
+    }
+    if (PInfo.HighLatencyCPSR) {
+      HighLatencyCPSR = true;
+      break;
+    }
+  }
+
+  // If this BB loops back to itself, conservatively avoid narrowing the
+  // first instruction that does partial flag update.
+  bool IsSelfLoop = MBB.isSuccessor(&MBB);
+  MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
+  MachineBasicBlock::instr_iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    NextMII = std::next(MII);
+
+    MachineInstr *MI = &*MII;
+    if (MI->isBundle()) {
+      BundleMI = MI;
+      continue;
+    }
+    if (MI->isDebugValue())
+      continue;
+
+    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+    // Does NextMII belong to the same bundle as MI?
+    bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
+
+    if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
+      Modified = true;
+      MachineBasicBlock::instr_iterator I = std::prev(NextMII);
+      MI = &*I;
+      // Removing and reinserting the first instruction in a bundle will break
+      // up the bundle. Fix the bundling if it was broken.
+      if (NextInSameBundle && !NextMII->isBundledWithPred())
+        NextMII->bundleWithPred();
+    }
+
+    if (BundleMI && !NextInSameBundle && MI->isInsideBundle()) {
+      // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
+      // marker is only on the BUNDLE instruction. Process the BUNDLE
+      // instruction as we finish with the bundled instruction to work around
+      // the inconsistency.
+      if (BundleMI->killsRegister(ARM::CPSR))
+        LiveCPSR = false;
+      MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
+      if (MO && !MO->isDead())
+        LiveCPSR = true;
+      MO = BundleMI->findRegisterUseOperand(ARM::CPSR);
+      if (MO && !MO->isKill())
+        LiveCPSR = true;
+    }
+
+    bool DefCPSR = false;
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
+    if (MI->isCall()) {
+      // Calls don't really set CPSR.
+      CPSRDef = nullptr;
+      HighLatencyCPSR = false;
+      IsSelfLoop = false;
+    } else if (DefCPSR) {
+      // This is the last CPSR defining instruction.
+      CPSRDef = MI;
+      HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
+      IsSelfLoop = false;
+    }
+  }
+
+  MBBInfo &Info = BlockInfo[MBB.getNumber()];
+  Info.HighLatencyCPSR = HighLatencyCPSR;
+  Info.Visited = true;
+  return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
+    return false;
+
+  STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
+  if (STI->isThumb1Only() || STI->prefers32BitThumb())
+    return false;
+
+  TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo());
+
+  // Optimizing / minimizing size? Minimizing size implies optimizing for size.
+  OptimizeSize = MF.getFunction()->optForSize();
+  MinimizeSize = MF.getFunction()->optForMinSize();
+
+  BlockInfo.clear();
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Visit blocks in reverse post-order so LastCPSRDef is known for all
+  // predecessors.
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+  bool Modified = false;
+  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+       I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    Modified |= ReduceMBB(**I);
+  return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass(
+    std::function<bool(const Function &)> Ftor) {
+  return new Thumb2SizeReduce(std::move(Ftor));
+}
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
new file mode 100644
index 000000000000..2efd63b84a2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -0,0 +1,625 @@
+//===-- ThumbRegisterInfo.cpp - Thumb-1 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ThumbRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+extern cl::opt<bool> ReuseFrameIndexVals;
+}
+
+using namespace llvm;
+
+ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {}
+
+const TargetRegisterClass *
+ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                              const MachineFunction &MF) const {
+  if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only())
+    return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF);
+
+  if (ARM::tGPRRegClass.hasSubClassEq(RC))
+    return &ARM::tGPRRegClass;
+  return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
+const TargetRegisterClass *
+ThumbRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  if (!MF.getSubtarget<ARMSubtarget>().isThumb1Only())
+    return ARMBaseRegisterInfo::getPointerRegClass(MF, Kind);
+  return &ARM::tGPRRegClass;
+}
+
+static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    const DebugLoc &dl, unsigned DestReg,
+                                    unsigned SubIdx, int Val,
+                                    ARMCC::CondCodes Pred, unsigned PredReg,
+                                    unsigned MIFlags) {
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+          Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    const DebugLoc &dl, unsigned DestReg,
+                                    unsigned SubIdx, int Val,
+                                    ARMCC::CondCodes Pred, unsigned PredReg,
+                                    unsigned MIFlags) {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0)
+    .setMIFlags(MIFlags);
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ThumbRegisterInfo::emitLoadConstPool(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
+    ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (STI.isThumb1Only()) {
+    assert((isARMLowRegister(DestReg) || isVirtualRegister(DestReg)) &&
+           "Thumb1 does not have ldr to high register");
+    return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+                                   PredReg, MIFlags);
+  }
+  return emitThumb2LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
+                                 PredReg, MIFlags);
+}
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
+/// in a register using mov / mvn sequences or load the immediate from a
+/// constpool entry.
+static void emitThumbRegPlusImmInReg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+    const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+    bool CanChangeCC, const TargetInstrInfo &TII,
+    const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+  bool isHigh = !isARMLowRegister(DestReg) ||
+                (BaseReg != 0 && !isARMLowRegister(BaseReg));
+  bool isSub = false;
+  // Subtract doesn't have high register version. Load the negative value
+  // if either base or dest register is a high register. Also, if do not
+  // issue sub as part of the sequence if condition register is to be
+  // preserved.
+  if (NumBytes < 0 && !isHigh && CanChangeCC) {
+    isSub = true;
+    NumBytes = -NumBytes;
+  }
+  unsigned LdReg = DestReg;
+  if (DestReg == ARM::SP)
+    assert(BaseReg == ARM::SP && "Unexpected!");
+  if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
+    LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+
+  if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
+    AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes)
+        .setMIFlags(MIFlags);
+  } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
+    AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes)
+        .setMIFlags(MIFlags);
+    AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+        .addReg(LdReg, RegState::Kill)
+        .setMIFlags(MIFlags);
+  } else if (ST.genExecuteOnly()) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), LdReg)
+      .addImm(NumBytes).setMIFlags(MIFlags);
+  } else
+    MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes, ARMCC::AL, 0,
+                          MIFlags);
+
+  // Emit add / sub.
+  int Opc = (isSub) ? ARM::tSUBrr
+                    : ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+  if (Opc != ARM::tADDhirr)
+    MIB = AddDefaultT1CC(MIB);
+  if (DestReg == ARM::SP || isSub)
+    MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+  else
+    MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+  AddDefaultPred(MIB);
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Tries a series of ADDs or
+/// SUBs first, and uses a constant pool value if the instruction sequence would
+/// be too long. This is allowed to modify the condition flags.
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     const DebugLoc &dl, unsigned DestReg,
+                                     unsigned BaseReg, int NumBytes,
+                                     const TargetInstrInfo &TII,
+                                     const ARMBaseRegisterInfo &MRI,
+                                     unsigned MIFlags) {
+  bool isSub = NumBytes < 0;
+  unsigned Bytes = (unsigned)NumBytes;
+  if (isSub) Bytes = -NumBytes;
+
+  int CopyOpc = 0;
+  unsigned CopyBits = 0;
+  unsigned CopyScale = 1;
+  bool CopyNeedsCC = false;
+  int ExtraOpc = 0;
+  unsigned ExtraBits = 0;
+  unsigned ExtraScale = 1;
+  bool ExtraNeedsCC = false;
+
+  // Strategy:
+  // We need to select two types of instruction, maximizing the available
+  // immediate range of each. The instructions we use will depend on whether
+  // DestReg and BaseReg are low, high or the stack pointer.
+  // * CopyOpc  - DestReg = BaseReg + imm
+  //              This will be emitted once if DestReg != BaseReg, and never if
+  //              DestReg == BaseReg.
+  // * ExtraOpc - DestReg = DestReg + imm
+  //              This will be emitted as many times as necessary to add the
+  //              full immediate.
+  // If the immediate ranges of these instructions are not large enough to cover
+  // NumBytes with a reasonable number of instructions, we fall back to using a
+  // value loaded from a constant pool.
+  if (DestReg == ARM::SP) {
+    if (BaseReg == ARM::SP) {
+      // sp -> sp
+      // Already in right reg, no copy needed
+    } else {
+      // low -> sp or high -> sp
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    ExtraBits = 7;
+    ExtraScale = 4;
+  } else if (isARMLowRegister(DestReg)) {
+    if (BaseReg == ARM::SP) {
+      // sp -> low
+      assert(!isSub && "Thumb1 does not have tSUBrSPi");
+      CopyOpc = ARM::tADDrSPi;
+      CopyBits = 8;
+      CopyScale = 4;
+    } else if (DestReg == BaseReg) {
+      // low -> same low
+      // Already in right reg, no copy needed
+    } else if (isARMLowRegister(BaseReg)) {
+      // low -> different low
+      CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3;
+      CopyBits = 3;
+      CopyNeedsCC = true;
+    } else {
+      // high -> low
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+    ExtraBits = 8;
+    ExtraNeedsCC = true;
+  } else /* DestReg is high */ {
+    if (DestReg == BaseReg) {
+      // high -> same high
+      // Already in right reg, no copy needed
+    } else {
+      // {low,high,sp} -> high
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = 0;
+  }
+
+  // We could handle an unaligned immediate with an unaligned copy instruction
+  // and an aligned extra instruction, but this case is not currently needed.
+  assert(((Bytes & 3) == 0 || ExtraScale == 1) &&
+         "Unaligned offset, but all instructions require alignment");
+
+  unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale;
+  // If we would emit the copy with an immediate of 0, just use tMOVr.
+  if (CopyOpc && Bytes < CopyScale) {
+    CopyOpc = ARM::tMOVr;
+    CopyScale = 1;
+    CopyNeedsCC = false;
+    CopyRange = 0;
+  }
+  unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction
+  unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0;
+  unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange);
+
+  // We could handle this case when the copy instruction does not require an
+  // aligned immediate, but we do not currently do this.
+  assert(RangeAfterCopy % ExtraScale == 0 &&
+         "Extra instruction requires immediate to be aligned");
+
+  unsigned RequiredExtraInstrs;
+  if (ExtraRange)
+    RequiredExtraInstrs = alignTo(RangeAfterCopy, ExtraRange) / ExtraRange;
+  else if (RangeAfterCopy > 0)
+    // We need an extra instruction but none is available
+    RequiredExtraInstrs = 1000000;
+  else
+    RequiredExtraInstrs = 0;
+  unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs;
+  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+
+  // Use a constant pool, if the sequence of ADDs/SUBs is too expensive.
+  if (RequiredInstrs > Threshold) {
+    emitThumbRegPlusImmInReg(MBB, MBBI, dl,
+                             DestReg, BaseReg, NumBytes, true,
+                             TII, MRI, MIFlags);
+    return;
+  }
+
+  // Emit zero or one copy instructions
+  if (CopyOpc) {
+    unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale;
+    Bytes -= CopyImm * CopyScale;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
+    if (CopyNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg, RegState::Kill);
+    if (CopyOpc != ARM::tMOVr) {
+      MIB.addImm(CopyImm);
+    }
+    AddDefaultPred(MIB.setMIFlags(MIFlags));
+
+    BaseReg = DestReg;
+  }
+
+  // Emit zero or more in-place add/sub instructions
+  while (Bytes) {
+    unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale;
+    Bytes -= ExtraImm * ExtraScale;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
+    if (ExtraNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg).addImm(ExtraImm);
+    MIB = AddDefaultPred(MIB);
+    MIB.setMIFlags(MIFlags);
+  }
+}
+
+static void removeOperands(MachineInstr &MI, unsigned i) {
+  unsigned Op = i;
+  for (unsigned e = MI.getNumOperands(); i != e; ++i)
+    MI.RemoveOperand(Op);
+}
+
+/// convertToNonSPOpcode - Change the opcode to the non-SP version, because
+/// we're replacing the frame index with a non-SP register.
+static unsigned convertToNonSPOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::tLDRspi:
+    return ARM::tLDRi;
+
+  case ARM::tSTRspi:
+    return ARM::tSTRi;
+  }
+
+  return Opcode;
+}
+
+bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
+                                          unsigned FrameRegIdx,
+                                          unsigned FrameReg, int &Offset,
+                                          const ARMBaseInstrInfo &TII) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  assert(MBB.getParent()->getSubtarget<ARMSubtarget>().isThumb1Only() &&
+         "This isn't needed for thumb2!");
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+
+  if (Opcode == ARM::tADDframe) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    unsigned DestReg = MI.getOperand(0).getReg();
+
+    emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                              *this);
+    MBB.erase(II);
+    return true;
+  } else {
+    if (AddrMode != ARMII::AddrModeT1_s)
+      llvm_unreachable("Unsupported addressing mode!");
+
+    unsigned ImmIdx = FrameRegIdx + 1;
+    int InstrOffs = MI.getOperand(ImmIdx).getImm();
+    unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+    unsigned Scale = 4;
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!");
+
+    // Common case: small offset, fits into instruction.
+    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with the frame register (e.g., sp).
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      ImmOp.ChangeToImmediate(ImmedOffset);
+
+      // If we're using a register where sp was stored, convert the instruction
+      // to the non-SP version.
+      unsigned NewOpc = convertToNonSPOpcode(Opcode);
+      if (NewOpc != Opcode && FrameReg != ARM::SP)
+        MI.setDesc(TII.get(NewOpc));
+
+      return true;
+    }
+
+    NumBits = 5;
+    Mask = (1 << NumBits) - 1;
+
+    // If this is a thumb spill / restore, we will be using a constpool load to
+    // materialize the offset.
+    if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+      ImmOp.ChangeToImmediate(0);
+    } else {
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask * Scale);
+    }
+  }
+
+  return Offset == 0;
+}
+
+void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                           int64_t Offset) const {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (!STI.isThumb1Only())
+    return ARMBaseRegisterInfo::resolveFrameIndex(MI, BaseReg, Offset);
+
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+/// saveScavengerRegister - Spill the register so it can be used by the
+/// register scavenger. Return true.
+bool ThumbRegisterInfo::saveScavengerRegister(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
+    unsigned Reg) const {
+
+  const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
+  if (!STI.isThumb1Only())
+    return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
+
+  // Thumb1 can't use the emergency spill slot on the stack because
+  // ldr/str immediate offsets must be positive, and if we're referencing
+  // off the frame pointer (if, for example, there are alloca() calls in
+  // the function, the offset will be negative. Use R12 instead since that's
+  // a call clobbered register that we know won't be used in Thumb1 mode.
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  DebugLoc DL;
+  AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+    .addReg(ARM::R12, RegState::Define)
+    .addReg(Reg, RegState::Kill));
+
+  // The UseMI is where we would like to restore the register. If there's
+  // interference with R12 before then, however, we'll need to restore it
+  // before that instead and adjust the UseMI.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    if (II->isDebugValue())
+      continue;
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+
+  return true;
+}
+
+void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+  if (!STI.isThumb1Only())
+    return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum,
+                                                    RS);
+
+  unsigned VReg = 0;
+  const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+               MF.getFrameInfo().getStackSize() + SPAdj;
+
+  if (MF.getFrameInfo().hasVarSizedObjects()) {
+    assert(SPAdj == 0 && STI.getFrameLowering()->hasFP(MF) && "Unexpected");
+    // There are alloca()'s in this function, must reference off the frame
+    // pointer or base pointer instead.
+    if (!hasBasePointer(MF)) {
+      FrameReg = getFrameRegister(MF);
+      Offset -= AFI->getFramePtrSpillOffset();
+    } else
+      FrameReg = BasePtr;
+  }
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(STI.getFrameLowering()->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo().hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  // Special handling of dbg_value instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  assert(AFI->isThumbFunction() &&
+         "This eliminateFrameIndex only supports Thumb1!");
+  if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert(Offset && "This code isn't needed if offset already handled!");
+
+  unsigned Opcode = MI.getOpcode();
+
+  // Remove predicate first.
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx != -1)
+    removeOperands(MI, PIdx);
+
+  if (MI.mayLoad()) {
+    // Use the destination register to materialize sp + offset.
+    unsigned TmpReg = MI.getOperand(0).getReg();
+    bool UseRR = false;
+    if (Opcode == ARM::tLDRspi) {
+      if (FrameReg == ARM::SP || STI.genExecuteOnly())
+        emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+                                 Offset, false, TII, *this);
+      else {
+        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
+        UseRR = true;
+      }
+    } else {
+      emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+                                *this);
+    }
+
+    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
+    MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true);
+    if (UseRR)
+      // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+      // register. The offset is already handled in the vreg value.
+      MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                     false);
+  } else if (MI.mayStore()) {
+      VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+      bool UseRR = false;
+
+      if (Opcode == ARM::tSTRspi) {
+        if (FrameReg == ARM::SP || STI.genExecuteOnly())
+          emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+                                   Offset, false, TII, *this);
+        else {
+          emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+                                  *this);
+      MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
+      MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true);
+      if (UseRR)
+        // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+        // register. The offset is already handled in the vreg value.
+        MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                       false);
+  } else {
+    llvm_unreachable("Unexpected opcode!");
+  }
+
+  // Add predicate back if it's needed.
+  if (MI.isPredicable())
+    AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h
new file mode 100644
index 000000000000..e6b06959e428
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -0,0 +1,66 @@
+//===- ThumbRegisterInfo.h - Thumb Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb implementation of the TargetRegisterInfo
+// class. With the exception of emitLoadConstPool Thumb2 tracks
+// ARMBaseRegisterInfo, Thumb1 overloads the functions below.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+
+struct ThumbRegisterInfo : public ARMBaseRegisterInfo {
+public:
+  ThumbRegisterInfo();
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  void
+  emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                    const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+                    int Val, ARMCC::CondCodes Pred = ARMCC::AL,
+                    unsigned PredReg = 0,
+                    unsigned MIFlags = MachineInstr::NoFlags) const override;
+
+  // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
+  // however much remains to be handled. Return 'true' if no further
+  // work is required.
+  bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII) const;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
+                             const TargetRegisterClass *RC,
+                             unsigned Reg) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h
new file mode 100644
index 000000000000..8e5cc5360ad4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVR.h
@@ -0,0 +1,58 @@
+//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AVR back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_H
+#define LLVM_AVR_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+class AVRTargetMachine;
+class FunctionPass;
+
+FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+FunctionPass *createAVRExpandPseudoPass();
+FunctionPass *createAVRFrameAnalyzerPass();
+FunctionPass *createAVRInstrumentFunctionsPass();
+FunctionPass *createAVRRelaxMemPass();
+FunctionPass *createAVRDynAllocaSRPass();
+FunctionPass *createAVRBranchSelectionPass();
+
+void initializeAVRExpandPseudoPass(PassRegistry&);
+void initializeAVRInstrumentFunctionsPass(PassRegistry&);
+void initializeAVRRelaxMemPass(PassRegistry&);
+
+/// Contains the AVR backend.
+namespace AVR {
+
+enum AddressSpace { DataMemory, ProgramMemory };
+
+template <typename T> bool isProgramMemoryAddress(T *V) {
+  return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory;
+}
+
+inline bool isProgramMemoryAccess(MemSDNode const *N) {
+  auto V = N->getMemOperand()->getValue();
+
+  return (V != nullptr) ? isProgramMemoryAddress(V) : false;
+}
+
+} // end of namespace AVR
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_H
diff --git a/contrib/llvm/lib/Target/AVR/AVR.td b/contrib/llvm/lib/Target/AVR/AVR.td
new file mode 100644
index 000000000000..d03b983aa70b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVR.td
@@ -0,0 +1,81 @@
+//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+// This is the top level entry point for the AVR target.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===---------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===---------------------------------------------------------------------===//
+// AVR Device Definitions
+//===---------------------------------------------------------------------===//
+
+include "AVRDevices.td"
+
+//===---------------------------------------------------------------------===//
+// Register File Description
+//===---------------------------------------------------------------------===//
+
+include "AVRRegisterInfo.td"
+
+//===---------------------------------------------------------------------===//
+// Instruction Descriptions
+//===---------------------------------------------------------------------===//
+
+include "AVRInstrInfo.td"
+
+def AVRInstrInfo : InstrInfo;
+
+//===---------------------------------------------------------------------===//
+// Calling Conventions
+//===---------------------------------------------------------------------===//
+
+include "AVRCallingConv.td"
+
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+def AVRAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+def AVRAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 1;
+  let ShouldEmitMatchRegisterAltName = 1;
+}
+
+def AVRAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "$";
+  string TokenizingCharacters = "+";
+}
+
+//===---------------------------------------------------------------------===//
+// Target Declaration
+//===---------------------------------------------------------------------===//
+
+def AVR : Target {
+  let InstructionSet         = AVRInstrInfo;
+  let AssemblyWriters        = [AVRAsmWriter];
+
+  let AssemblyParsers        = [AVRAsmParser];
+  let AssemblyParserVariants = [AVRAsmParserVariant];
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
new file mode 100644
index 000000000000..4afdd3a0ec08
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -0,0 +1,184 @@
+//===-- AVRAsmPrinter.cpp - AVR LLVM assembly writer ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format AVR assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRMCInstLower.h"
+#include "AVRSubtarget.h"
+#include "InstPrinter/AVRInstPrinter.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "avr-asm-printer"
+
+namespace llvm {
+
+/// An AVR assembly code printer.
+class AVRAsmPrinter : public AsmPrinter {
+public:
+  AVRAsmPrinter(TargetMachine &TM,
+                std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MRI(*TM.getMCRegisterInfo()) { }
+
+  StringRef getPassName() const override { return "AVR Assembly Printer"; }
+
+  void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = 0);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+private:
+  const MCRegisterInfo &MRI;
+};
+
+void AVRAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                 raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << AVRInstPrinter::getPrettyRegisterName(MO.getReg(), MRI);
+    break;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    O << getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+  default:
+    llvm_unreachable("Not implemented yet!");
+  }
+}
+
+bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode,
+                                    raw_ostream &O) {
+  // Default asm printer can only deal with some extra codes,
+  // so try it first.
+  bool Error = AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+
+  if (Error && ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    if (ExtraCode[0] >= 'A' && ExtraCode[0] <= 'Z') {
+      const MachineOperand &RegOp = MI->getOperand(OpNum);
+
+      assert(RegOp.isReg() && "Operand must be a register when you're"
+                              "using 'A'..'Z' operand extracodes.");
+      unsigned Reg = RegOp.getReg();
+
+      unsigned ByteNumber = ExtraCode[0] - 'A';
+
+      unsigned OpFlags = MI->getOperand(OpNum - 1).getImm();
+      unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(OpFlags);
+      (void)NumOpRegs;
+
+      const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
+      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+      unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize();
+      assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported.");
+
+      unsigned RegIdx = ByteNumber / BytesPerReg;
+      assert(RegIdx < NumOpRegs && "Multibyte index out of range.");
+
+      Reg = MI->getOperand(OpNum + RegIdx).getReg();
+
+      if (BytesPerReg == 2) {
+        Reg = TRI.getSubReg(Reg, ByteNumber % BytesPerReg ? AVR::sub_hi
+                                                          : AVR::sub_lo);
+      }
+
+      O << AVRInstPrinter::getPrettyRegisterName(Reg, MRI);
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+
+  return false;
+}
+
+bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    llvm_unreachable("This branch is not implemented yet");
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  (void)MO;
+  assert(MO.isReg() && "Unexpected inline asm memory operand");
+
+  // TODO: We can look up the alternative name for the register if it's given.
+  if (MI->getOperand(OpNum).getReg() == AVR::R31R30) {
+    O << "Z";
+  } else {
+    assert(MI->getOperand(OpNum).getReg() == AVR::R29R28 &&
+           "Wrong register class for memory operand.");
+    O << "Y";
+  }
+
+  // If NumOpRegs == 2, then we assume it is product of a FrameIndex expansion
+  // and the second operand is an Imm.
+  unsigned OpFlags = MI->getOperand(OpNum - 1).getImm();
+  unsigned NumOpRegs = InlineAsm::getNumOperandRegisters(OpFlags);
+
+  if (NumOpRegs == 2) {
+    O << '+' << MI->getOperand(OpNum + 1).getImm();
+  }
+
+  return false;
+}
+
+void AVRAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  AVRMCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst I;
+  MCInstLowering.lowerInstruction(*MI, I);
+  EmitToStreamer(*OutStreamer, I);
+}
+
+} // end of namespace llvm
+
+extern "C" void LLVMInitializeAVRAsmPrinter() {
+  llvm::RegisterAsmPrinter<llvm::AVRAsmPrinter> X(llvm::getTheAVRTarget());
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRCallingConv.td b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td
new file mode 100644
index 000000000000..68dbce02706f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -0,0 +1,58 @@
+//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for AVR architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AVR Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+def RetCC_AVR : CallingConv
+<[
+  // i8 is returned in R24.
+  CCIfType<[i8], CCAssignToReg<[R24]>>,
+
+  // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
+  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
+]>;
+
+// Special return value calling convention for runtime functions.
+def RetCC_AVR_BUILTIN : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R25]>>,
+  CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The calling conventions are implemented in custom C++ code
+
+// Calling convention for variadic functions.
+def ArgCC_AVR_Vararg : CallingConv
+<[
+  // i16 are always passed through the stack with an alignment of 1.
+  CCAssignToStack<2, 1>
+]>;
+
+// Special argument calling convention for
+// division runtime functions.
+def ArgCC_AVR_BUILTIN_DIV : CallingConv
+<[
+  CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
+  CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
+def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>;
diff --git a/contrib/llvm/lib/Target/AVR/AVRDevices.td b/contrib/llvm/lib/Target/AVR/AVRDevices.td
new file mode 100644
index 000000000000..9224af613d14
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRDevices.td
@@ -0,0 +1,491 @@
+//===---------------------------------------------------------------------===//
+// AVR Device Definitions
+//===---------------------------------------------------------------------===//
+
+// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details
+// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
+//        In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
+//        avr2 (with SRAM) adds the rest of the variants.
+// :TODO: s/AVRTiny/Tiny
+
+
+// A feature set aggregates features, grouping them. We don't want to create a
+// new member in AVRSubtarget (to store a value) for each set because we do not
+// care if the set is supported, only the subfeatures inside the set. We fix
+// this by simply setting the same dummy member for all feature sets, which is
+// then ignored.
+class FeatureSet<string name, string desc, list<SubtargetFeature> i>
+  : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>;
+
+// A family of microcontrollers, defining a set of supported features.
+class Family<string name, list<SubtargetFeature> i>
+  : FeatureSet<name, !strconcat("The device is a part of the ",
+               name, " family"), i>;
+
+// The device has SRAM, and supports the bare minimum of
+// SRAM-relevant instructions.
+//
+// These are:
+// LD - all 9 variants
+// ST - all 9 variants
+// LDD - two variants for Y and Z
+// STD - two variants for Y and Z
+// `LDS Rd, K`
+// `STS k, Rr`
+// `PUSH`/`POP`
+def FeatureSRAM           : SubtargetFeature<"sram", "m_hasSRAM", "true",
+                                  "The device has random access memory">;
+
+// The device supports the `JMP k` and `CALL k` instructions.
+def FeatureJMPCALL        : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true",
+                                  "The device supports the `JMP` and "
+                                  "`CALL` instructions">;
+
+
+// The device supports the indirect branches `IJMP` and `ICALL`.
+def FeatureIJMPCALL       : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+                                  "true",
+                                  "The device supports `IJMP`/`ICALL`"
+                                  "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL      : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+                                  "true", "The device supports the "
+                                  "`EIJMP`/`EICALL` instructions">;
+
+// The device supports `ADDI Rd, K`, `SUBI Rd, K`.
+def FeatureADDSUBIW       : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+                                  "true", "Enable 16-bit register-immediate "
+                                  "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack     : SubtargetFeature<"smallstack", "m_hasSmallStack",
+                                  "true", "The device has an 8-bit "
+                                  "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW           : SubtargetFeature<"movw", "m_hasMOVW", "true",
+                                  "The device supports the 16-bit MOVW "
+                                  "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM            : SubtargetFeature<"lpm", "m_hasLPM", "true",
+                                  "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+] instruction.
+def FeatureLPMX           : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+                                  "The device supports the `LPM Rd, Z[+]` "
+                                  "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM           : SubtargetFeature<"elpm", "m_hasELPM", "true",
+                                  "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX          : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+                                  "The device supports the `ELPM Rd, Z[+]` "
+                                  "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM            : SubtargetFeature<"spm", "m_hasSPM", "true",
+                                  "The device supports the `SPM` instruction">;
+
+// The device supports the `SPM Z+` instruction.
+def FeatureSPMX           : SubtargetFeature<"spmx", "m_hasSPMX", "true",
+                                  "The device supports the `SPM Z+` "
+                                  "instruction">;
+
+// The device supports the `DES k` instruction.
+def FeatureDES            : SubtargetFeature<"des", "m_hasDES", "true",
+                                  "The device supports the `DES k` encryption "
+                                  "instruction">;
+
+// The device supports the Read-Write-Modify instructions
+// XCH, LAS, LAC, and LAT.
+def FeatureRMW            : SubtargetFeature<"rmw", "m_supportsRMW", "true",
+                                  "The device supports the read-write-modify "
+                                  "instructions: XCH, LAS, LAC, LAT">;
+
+// The device supports the `[F]MUL[S][U]` family of instructions.
+def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication",
+                                  "true", "The device supports the "
+                                  "multiplication instructions">;
+
+// The device supports the `BREAK` instruction.
+def FeatureBREAK          : SubtargetFeature<"break", "m_hasBREAK", "true",
+                                  "The device supports the `BREAK` debugging "
+                                  "instruction">;
+
+// The device has instruction encodings specific to the Tiny core.
+def FeatureTinyEncoding   : SubtargetFeature<"tinyencoding",
+                                  "m_hasTinyEncoding", "true",
+                                  "The device has Tiny core specific "
+                                  "instruction encodings">;
+
+class ELFArch<string name>  : SubtargetFeature<"", "ELFArch",
+                                    !strconcat("ELF::",name), "">;
+
+// ELF e_flags architecture values
+def ELFArchAVR1    : ELFArch<"EF_AVR_ARCH_AVR1">;
+def ELFArchAVR2    : ELFArch<"EF_AVR_ARCH_AVR2">;
+def ELFArchAVR25   : ELFArch<"EF_AVR_ARCH_AVR25">;
+def ELFArchAVR3    : ELFArch<"EF_AVR_ARCH_AVR3">;
+def ELFArchAVR31   : ELFArch<"EF_AVR_ARCH_AVR31">;
+def ELFArchAVR35   : ELFArch<"EF_AVR_ARCH_AVR35">;
+def ELFArchAVR4    : ELFArch<"EF_AVR_ARCH_AVR4">;
+def ELFArchAVR5    : ELFArch<"EF_AVR_ARCH_AVR5">;
+def ELFArchAVR51   : ELFArch<"EF_AVR_ARCH_AVR51">;
+def ELFArchAVR6    : ELFArch<"EF_AVR_ARCH_AVR6">;
+def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
+def ELFArchXMEGA1  : ELFArch<"EF_AVR_ARCH_XMEGA1">;
+def ELFArchXMEGA2  : ELFArch<"EF_AVR_ARCH_XMEGA2">;
+def ELFArchXMEGA3  : ELFArch<"EF_AVR_ARCH_XMEGA3">;
+def ELFArchXMEGA4  : ELFArch<"EF_AVR_ARCH_XMEGA4">;
+def ELFArchXMEGA5  : ELFArch<"EF_AVR_ARCH_XMEGA5">;
+def ELFArchXMEGA6  : ELFArch<"EF_AVR_ARCH_XMEGA6">;
+def ELFArchXMEGA7  : ELFArch<"EF_AVR_ARCH_XMEGA7">;
+
+//===---------------------------------------------------------------------===//
+// AVR Families
+//===---------------------------------------------------------------------===//
+
+// The device has at least the bare minimum that **every** single AVR
+// device should have.
+def FamilyAVR0           : Family<"avr0", []>;
+
+def FamilyAVR1           : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+
+def FamilyAVR2           : Family<"avr2",
+                                 [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
+                                  FeatureSRAM]>;
+
+def FamilyAVR25          : Family<"avr25",
+                                 [FamilyAVR2, FeatureMOVW, FeatureLPMX,
+                                  FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR3           : Family<"avr3",
+                                 [FamilyAVR2, FeatureJMPCALL]>;
+
+def FamilyAVR31          : Family<"avr31",
+                                 [FamilyAVR3, FeatureELPM]>;
+
+def FamilyAVR35          : Family<"avr35",
+                                 [FamilyAVR3, FeatureMOVW, FeatureLPMX,
+                                  FeatureSPM, FeatureBREAK]>;
+
+def FamilyAVR4           : Family<"avr4",
+                                 [FamilyAVR2, FeatureMultiplication,
+                                  FeatureMOVW, FeatureLPMX, FeatureSPM,
+                                  FeatureBREAK]>;
+
+def FamilyAVR5           : Family<"avr5",
+                                 [FamilyAVR3, FeatureMultiplication,
+                                  FeatureMOVW, FeatureLPMX, FeatureSPM,
+                                  FeatureBREAK]>;
+
+def FamilyAVR51          : Family<"avr51",
+                                 [FamilyAVR5, FeatureELPM, FeatureELPMX]>;
+
+def FamilyAVR6           : Family<"avr6",
+                                 [FamilyAVR51]>;
+
+def FamilyAVRTiny        : Family<"avrtiny",
+                                 [FamilyAVR0, FeatureBREAK, FeatureSRAM,
+                                  FeatureTinyEncoding]>;
+
+def FamilyXMEGA          : Family<"xmega",
+                                 [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
+                                  FeatureDES]>;
+
+def FamilyXMEGAU         : Family<"xmegau",
+                                  [FamilyXMEGA, FeatureRMW]>;
+
+def FeatureSetSpecial    : FeatureSet<"special",
+                                      "Enable use of the entire instruction "
+                                      "set - used for debugging",
+                                      [FeatureSRAM, FeatureJMPCALL,
+                                       FeatureIJMPCALL, FeatureEIJMPCALL,
+                                       FeatureADDSUBIW, FeatureMOVW,
+                                       FeatureLPM, FeatureLPMX, FeatureELPM,
+                                       FeatureELPMX, FeatureSPM, FeatureSPMX,
+                                       FeatureDES, FeatureRMW,
+                                       FeatureMultiplication, FeatureBREAK]>;
+
+//===---------------------------------------------------------------------===//
+// AVR microcontrollers supported.
+//===---------------------------------------------------------------------===//
+
+class Device<string Name, Family Fam, ELFArch Arch,
+             list<SubtargetFeature> ExtraFeatures = []>
+  : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>;
+
+// Generic MCUs
+// Note that several versions of GCC has strange ELF architecture
+// settings for backwards compatibility - see `gas/config/tc-avr.c`
+// in AVR binutils. We do not replicate this.
+def : Device<"avr1",      FamilyAVR1,    ELFArchAVR1>;
+def : Device<"avr2",      FamilyAVR2,    ELFArchAVR2>;
+def : Device<"avr25",     FamilyAVR25,   ELFArchAVR25>;
+def : Device<"avr3",      FamilyAVR3,    ELFArchAVR3>;
+def : Device<"avr31",     FamilyAVR31,   ELFArchAVR31>;
+def : Device<"avr35",     FamilyAVR35,   ELFArchAVR35>;
+def : Device<"avr4",      FamilyAVR4,    ELFArchAVR4>;
+def : Device<"avr5",      FamilyAVR5,    ELFArchAVR5>;
+def : Device<"avr51",     FamilyAVR51,   ELFArchAVR51>;
+def : Device<"avr6",      FamilyAVR6,    ELFArchAVR6>;
+def : Device<"avrxmega1", FamilyXMEGA,   ELFArchXMEGA1>;
+def : Device<"avrxmega2", FamilyXMEGA,   ELFArchXMEGA2>;
+def : Device<"avrxmega3", FamilyXMEGA,   ELFArchXMEGA3>;
+def : Device<"avrxmega4", FamilyXMEGA,   ELFArchXMEGA4>;
+def : Device<"avrxmega5", FamilyXMEGA,   ELFArchXMEGA5>;
+def : Device<"avrxmega6", FamilyXMEGA,   ELFArchXMEGA6>;
+def : Device<"avrxmega7", FamilyXMEGA,   ELFArchXMEGA7>;
+def : Device<"avrtiny",   FamilyAVRTiny, ELFArchAVRTiny>;
+
+// Specific MCUs
+def : Device<"at90s1200",          FamilyAVR0, ELFArchAVR1>;
+def : Device<"attiny11",           FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny12",           FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny15",           FamilyAVR1, ELFArchAVR1>;
+def : Device<"attiny28",           FamilyAVR1, ELFArchAVR1>;
+def : Device<"at90s2313",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2323",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2333",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s2343",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny22",           FamilyAVR2, ELFArchAVR2>;
+def : Device<"attiny26",           FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>;
+def : Device<"at86rf401",          FamilyAVR2, ELFArchAVR25,
+             [FeatureMOVW, FeatureLPMX]>;
+def : Device<"at90s4414",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4433",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s4434",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8515",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90c8534",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"at90s8535",          FamilyAVR2, ELFArchAVR2>;
+def : Device<"ata5272",            FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny13a",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313",         FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny2313a",        FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny24a",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny4313",         FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny44a",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny84a",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny25",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny45",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny85",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny261a",         FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny461a",         FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny861a",         FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny87",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny43u",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny48",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny88",           FamilyAVR25, ELFArchAVR25>;
+def : Device<"attiny828",          FamilyAVR25, ELFArchAVR25>;
+def : Device<"at43usb355",         FamilyAVR3,  ELFArchAVR3>;
+def : Device<"at76c711",           FamilyAVR3,  ELFArchAVR3>;
+def : Device<"atmega103",          FamilyAVR31, ELFArchAVR31>;
+def : Device<"at43usb320",         FamilyAVR31, ELFArchAVR31>;
+def : Device<"attiny167",          FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb82",          FamilyAVR35, ELFArchAVR35>;
+def : Device<"at90usb162",         FamilyAVR35, ELFArchAVR35>;
+def : Device<"ata5505",            FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8u2",          FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega16u2",         FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega32u2",         FamilyAVR35, ELFArchAVR35>;
+def : Device<"attiny1634",         FamilyAVR35, ELFArchAVR35>;
+def : Device<"atmega8",            FamilyAVR4,  ELFArchAVR4>; // FIXME: family may be wrong
+def : Device<"ata6289",            FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega8a",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"ata6285",            FamilyAVR4,  ELFArchAVR4>;
+def : Device<"ata6286",            FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega48",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega48a",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega48pa",         FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega48p",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega88",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega88a",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega88p",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega88pa",         FamilyAVR4,  ELFArchAVR4>;
+def : Device<"atmega8515",         FamilyAVR2,  ELFArchAVR4,
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8535",         FamilyAVR2,  ELFArchAVR4,
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega8hva",         FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm1",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm2",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm2b",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm3",           FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm3b",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"at90pwm81",          FamilyAVR4,  ELFArchAVR4>;
+def : Device<"ata5790",            FamilyAVR5,  ELFArchAVR5>;
+def : Device<"ata5795",            FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16",           FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16a",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega161",          FamilyAVR3,  ELFArchAVR5,
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega162",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega163",          FamilyAVR3,  ELFArchAVR5,
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>;
+def : Device<"atmega164a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega164p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega164pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega165",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega165a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega165p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega165pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega168",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega168a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega168p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega168pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega169",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega169a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega169p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega169pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32",           FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32a",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega323",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega324a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega324p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega324pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega325",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega325a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega325p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega325pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3250",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3250a",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3250p",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3250pa",       FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega328",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega328p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega329",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega329a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega329p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega329pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3290",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3290a",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3290p",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega3290pa",       FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega406",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64",           FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64a",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega640",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega644",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega644a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega644p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega644pa",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega645",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega645a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega645p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega649",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega649a",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega649p",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6450",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6450a",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6450p",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6490",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6490a",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega6490p",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64rfr2",       FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega644rfr2",      FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16hva",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16hva2",       FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16hvb",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16hvbrevb",    FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32hvb",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32hvbrevb",    FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64hve",        FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90can32",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90can64",          FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90pwm161",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90pwm216",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90pwm316",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32c1",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64c1",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16m1",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32m1",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega64m1",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega16u4",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32u4",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega32u6",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90usb646",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90usb647",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at90scr100",         FamilyAVR5,  ELFArchAVR5>;
+def : Device<"at94k",              FamilyAVR3,  ELFArchAVR5,
+             [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>;
+def : Device<"m3000",              FamilyAVR5,  ELFArchAVR5>;
+def : Device<"atmega128",          FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128a",         FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1280",         FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1281",         FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284",         FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284p",        FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfa1",      FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega128rfr2",      FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega1284rfr2",     FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90can128",         FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1286",        FamilyAVR51, ELFArchAVR51>;
+def : Device<"at90usb1287",        FamilyAVR51, ELFArchAVR51>;
+def : Device<"atmega2560",         FamilyAVR6,  ELFArchAVR6>;
+def : Device<"atmega2561",         FamilyAVR6,  ELFArchAVR6>;
+def : Device<"atmega256rfr2",      FamilyAVR6,  ELFArchAVR6>;
+def : Device<"atmega2564rfr2",     FamilyAVR6,  ELFArchAVR6>;
+def : Device<"atxmega16a4",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16a4u",       FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16c4",        FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega16d4",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32a4u",       FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32c4",        FamilyXMEGAU, ELFArchXMEGA2>;
+def : Device<"atxmega32d4",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32e5",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega16e5",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega8e5",         FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega32x1",        FamilyXMEGA, ELFArchXMEGA2>;
+def : Device<"atxmega64a3",        FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a3u",       FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64a4u",       FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b1",        FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64b3",        FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64c3",        FamilyXMEGAU, ELFArchXMEGA4>;
+def : Device<"atxmega64d3",        FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64d4",        FamilyXMEGA, ELFArchXMEGA4>;
+def : Device<"atxmega64a1",        FamilyXMEGA, ELFArchXMEGA5>;
+def : Device<"atxmega64a1u",       FamilyXMEGAU, ELFArchXMEGA5>;
+def : Device<"atxmega128a3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a3u",      FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b1",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128b3",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128c3",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega128d3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128d4",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega192a3u",      FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192c3",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega192d3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3u",      FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256a3b",      FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega256a3bu",     FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256c3",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega256d3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega384c3",       FamilyXMEGAU, ELFArchXMEGA6>;
+def : Device<"atxmega384d3",       FamilyXMEGA, ELFArchXMEGA6>;
+def : Device<"atxmega128a1",       FamilyXMEGA, ELFArchXMEGA7>;
+def : Device<"atxmega128a1u",      FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"atxmega128a4u",      FamilyXMEGAU, ELFArchXMEGA7>;
+def : Device<"attiny4",            FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny5",            FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny9",            FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny10",           FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny20",           FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny40",           FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny102",          FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny104",          FamilyAVRTiny, ELFArchAVRTiny>;
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..1b2f2cec0bca
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -0,0 +1,1515 @@
+//===-- AVRExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define AVR_EXPAND_PSEUDO_NAME "AVR pseudo instruction expansion pass"
+
+namespace {
+
+/// Expands "placeholder" instructions marked as pseudo into
+/// actual AVR instructions.
+class AVRExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AVRExpandPseudo() : MachineFunctionPass(ID) {
+    initializeAVRExpandPseudoPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return AVR_EXPAND_PSEUDO_NAME; }
+
+private:
+  typedef MachineBasicBlock Block;
+  typedef Block::iterator BlockIt;
+
+  const AVRRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+
+  /// The register to be used for temporary storage.
+  const unsigned SCRATCH_REGISTER = AVR::R0;
+  /// The IO address of the status register.
+  const unsigned SREG_ADDR = 0x3f;
+
+  bool expandMBB(Block &MBB);
+  bool expandMI(Block &MBB, BlockIt MBBI);
+  template <unsigned OP> bool expand(Block &MBB, BlockIt MBBI);
+
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode) {
+    return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode));
+  }
+
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode,
+                              unsigned DstReg) {
+    return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode), DstReg);
+  }
+
+  MachineRegisterInfo &getRegInfo(Block &MBB) { return MBB.getParent()->getRegInfo(); }
+
+  bool expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI);
+  bool expandLogic(unsigned Op, Block &MBB, BlockIt MBBI);
+  bool expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI);
+  bool isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const;
+
+  template<typename Func>
+  bool expandAtomic(Block &MBB, BlockIt MBBI, Func f);
+
+  template<typename Func>
+  bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI, Func f);
+
+  bool expandAtomicBinaryOp(unsigned Opcode, Block &MBB, BlockIt MBBI);
+
+  bool expandAtomicArithmeticOp(unsigned MemOpcode,
+                                unsigned ArithOpcode,
+                                Block &MBB,
+                                BlockIt MBBI);
+};
+
+char AVRExpandPseudo::ID = 0;
+
+bool AVRExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  BlockIt MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    BlockIt NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool AVRExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  bool Modified = false;
+
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  TRI = STI.getRegisterInfo();
+  TII = STI.getInstrInfo();
+
+  // We need to track liveness in order to use register scavenging.
+  MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+  for (Block &MBB : MF) {
+    bool ContinueExpanding = true;
+    unsigned ExpandCount = 0;
+
+    // Continue expanding the block until all pseudos are expanded.
+    do {
+      assert(ExpandCount < 10 && "pseudo expand limit reached");
+
+      bool BlockModified = expandMBB(MBB);
+      Modified |= BlockModified;
+      ExpandCount++;
+
+      ContinueExpanding = BlockModified;
+    } while (ContinueExpanding);
+  }
+
+  return Modified;
+}
+
+bool AVRExpandPseudo::
+expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  bool ImpIsDead = MI.getOperand(3).isDead();
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill))
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill))
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(4).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AVRExpandPseudo::
+expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  bool ImpIsDead = MI.getOperand(3).isDead();
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, Op)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill))
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  // SREG is always implicitly dead
+  MIBLO->getOperand(3).setIsDead();
+
+  auto MIBHI = buildMI(MBB, MBBI, Op)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill))
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(3).setIsDead();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AVRExpandPseudo::
+  isLogicImmOpRedundant(unsigned Op, unsigned ImmVal) const {
+
+  // ANDI Rd, 0xff is redundant.
+  if (Op == AVR::ANDIRdK && ImmVal == 0xff)
+    return true;
+
+  // ORI Rd, 0x0 is redundant.
+  if (Op == AVR::ORIRdK && ImmVal == 0x0)
+    return true;
+
+  return false;
+}
+
+bool AVRExpandPseudo::
+expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(3).isDead();
+  unsigned Imm = MI.getOperand(2).getImm();
+  unsigned Lo8 = Imm & 0xff;
+  unsigned Hi8 = (Imm >> 8) & 0xff;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  if (!isLogicImmOpRedundant(Op, Lo8)) {
+    auto MIBLO = buildMI(MBB, MBBI, Op)
+      .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstLoReg, getKillRegState(SrcIsKill))
+      .addImm(Lo8);
+
+    // SREG is always implicitly dead
+    MIBLO->getOperand(3).setIsDead();
+  }
+
+  if (!isLogicImmOpRedundant(Op, Hi8)) {
+    auto MIBHI = buildMI(MBB, MBBI, Op)
+      .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstHiReg, getKillRegState(SrcIsKill))
+      .addImm(Hi8);
+
+    if (ImpIsDead)
+      MIBHI->getOperand(3).setIsDead();
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ADDWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandArith(AVR::ADDRdRr, AVR::ADCRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ADCWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandArith(AVR::ADCRdRr, AVR::ADCRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SUBWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandArith(AVR::SUBRdRr, AVR::SBCRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SUBIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(3).isDead();
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, AVR::SUBIRdK)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, AVR::SBCIRdK)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(SrcIsKill));
+
+  switch (MI.getOperand(2).getType()) {
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MI.getOperand(2).getGlobal();
+    int64_t Offs = MI.getOperand(2).getOffset();
+    unsigned TF = MI.getOperand(2).getTargetFlags();
+    MIBLO.addGlobalAddress(GV, Offs, TF | AVRII::MO_NEG | AVRII::MO_LO);
+    MIBHI.addGlobalAddress(GV, Offs, TF | AVRII::MO_NEG | AVRII::MO_HI);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MI.getOperand(2).getImm();
+    MIBLO.addImm(Imm & 0xff);
+    MIBHI.addImm((Imm >> 8) & 0xff);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  if (ImpIsDead)
+    MIBHI->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(4).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SBCWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandArith(AVR::SBCRdRr, AVR::SBCRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SBCIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(3).isDead();
+  unsigned Imm = MI.getOperand(2).getImm();
+  unsigned Lo8 = Imm & 0xff;
+  unsigned Hi8 = (Imm >> 8) & 0xff;
+  OpLo = AVR::SBCIRdK;
+  OpHi = AVR::SBCIRdK;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(SrcIsKill))
+    .addImm(Lo8);
+
+  // SREG is always implicitly killed
+  MIBLO->getOperand(4).setIsKill();
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(SrcIsKill))
+    .addImm(Hi8);
+
+  if (ImpIsDead)
+    MIBHI->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(4).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ANDWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandLogic(AVR::ANDRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ANDIWRdK>(Block &MBB, BlockIt MBBI) {
+  return expandLogicImm(AVR::ANDIRdK, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ORWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandLogic(AVR::ORRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ORIWRdK>(Block &MBB, BlockIt MBBI) {
+  return expandLogicImm(AVR::ORIRdK, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::EORWRdRr>(Block &MBB, BlockIt MBBI) {
+  return expandLogic(AVR::EORRdRr, MBB, MBBI);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::COMRd;
+  OpHi = AVR::COMRd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+  // SREG is always implicitly dead
+  MIBLO->getOperand(2).setIsDead();
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(2).setIsDead();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsKill = MI.getOperand(0).isKill();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::CPRdRr;
+  OpHi = AVR::CPCRdRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // Low part
+  buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, getKillRegState(DstIsKill))
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, getKillRegState(DstIsKill))
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(3).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::CPCWRdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsKill = MI.getOperand(0).isKill();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::CPCRdRr;
+  OpHi = AVR::CPCRdRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, getKillRegState(DstIsKill))
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  // SREG is always implicitly killed
+  MIBLO->getOperand(3).setIsKill();
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, getKillRegState(DstIsKill))
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(3).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  OpLo = AVR::LDIRdK;
+  OpHi = AVR::LDIRdK;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
+
+  switch (MI.getOperand(1).getType()) {
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MI.getOperand(1).getGlobal();
+    int64_t Offs = MI.getOperand(1).getOffset();
+    unsigned TF = MI.getOperand(1).getTargetFlags();
+
+    MIBLO.addGlobalAddress(GV, Offs, TF | AVRII::MO_LO);
+    MIBHI.addGlobalAddress(GV, Offs, TF | AVRII::MO_HI);
+    break;
+  }
+  case MachineOperand::MO_BlockAddress: {
+    const BlockAddress *BA = MI.getOperand(1).getBlockAddress();
+    unsigned TF = MI.getOperand(1).getTargetFlags();
+
+    MIBLO.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_LO));
+    MIBHI.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_HI));
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MI.getOperand(1).getImm();
+
+    MIBLO.addImm(Imm & 0xff);
+    MIBHI.addImm((Imm >> 8) & 0xff);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDSWRdK>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  OpLo = AVR::LDSRdK;
+  OpHi = AVR::LDSRdK;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead));
+
+  switch (MI.getOperand(1).getType()) {
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MI.getOperand(1).getGlobal();
+    int64_t Offs = MI.getOperand(1).getOffset();
+    unsigned TF = MI.getOperand(1).getTargetFlags();
+
+    MIBLO.addGlobalAddress(GV, Offs, TF);
+    MIBHI.addGlobalAddress(GV, Offs + 1, TF);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MI.getOperand(1).getImm();
+
+    MIBLO.addImm(Imm);
+    MIBHI.addImm(Imm + 1);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::LDRdPtr;
+  OpHi = AVR::LDDRdPtrQ;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg);
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg, getKillRegState(SrcIsKill))
+    .addImm(1);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtrPi>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsDead = MI.getOperand(1).isKill();
+  OpLo = AVR::LDRdPtrPi;
+  OpHi = AVR::LDRdPtrPi;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg, RegState::Define)
+    .addReg(SrcReg, RegState::Kill);
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
+    .addReg(SrcReg, RegState::Kill);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDWRdPtrPd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsDead = MI.getOperand(1).isKill();
+  OpLo = AVR::LDRdPtrPd;
+  OpHi = AVR::LDRdPtrPd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg, RegState::Define)
+    .addReg(SrcReg, RegState::Kill);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(SrcReg, RegState::Define | getDeadRegState(SrcIsDead))
+    .addReg(SrcReg, RegState::Kill);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned Imm = MI.getOperand(2).getImm();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::LDDRdPtrQ;
+  OpHi = AVR::LDDRdPtrQ;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  assert(Imm <= 63 && "Offset is out of range");
+
+  MachineInstr *MIBLO, *MIBHI;
+
+  // HACK: We shouldn't have instances of this instruction
+  // where src==dest because the instruction itself is
+  // marked earlyclobber. We do however get this instruction when
+  // loading from stack slots where the earlyclobber isn't useful.
+  //
+  // In this case, just use a temporary register.
+  if (DstReg == SrcReg) {
+    RegScavenger RS;
+
+    RS.enterBasicBlock(MBB);
+    RS.forward(MBBI);
+
+    BitVector Candidates =
+        TRI->getAllocatableSet
+        (*MBB.getParent(), &AVR::GPR8RegClass);
+
+    // Exclude all the registers being used by the instruction.
+    for (MachineOperand &MO : MI.operands()) {
+      if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+          !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        Candidates.reset(MO.getReg());
+    }
+
+    BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass);
+    Available &= Candidates;
+
+    signed TmpReg = Available.find_first();
+    assert(TmpReg != -1 && "ran out of registers");
+
+    MIBLO = buildMI(MBB, MBBI, OpLo)
+      .addReg(TmpReg, RegState::Define)
+      .addReg(SrcReg)
+      .addImm(Imm);
+
+    buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstLoReg).addReg(TmpReg);
+
+    MIBHI = buildMI(MBB, MBBI, OpHi)
+      .addReg(TmpReg, RegState::Define)
+      .addReg(SrcReg, getKillRegState(SrcIsKill))
+      .addImm(Imm + 1);
+
+    buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
+  } else {
+    MIBLO = buildMI(MBB, MBBI, OpLo)
+      .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(SrcReg)
+      .addImm(Imm);
+
+    MIBHI = buildMI(MBB, MBBI, OpHi)
+      .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(SrcReg, getKillRegState(SrcIsKill))
+      .addImm(Imm + 1);
+  }
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("wide LPM is unimplemented");
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("wide LPMPi is unimplemented");
+}
+
+template<typename Func>
+bool AVRExpandPseudo::expandAtomic(Block &MBB, BlockIt MBBI, Func f) {
+  // Remove the pseudo instruction.
+  MachineInstr &MI = *MBBI;
+
+  // Store the SREG.
+  buildMI(MBB, MBBI, AVR::INRdA)
+    .addReg(SCRATCH_REGISTER, RegState::Define)
+    .addImm(SREG_ADDR);
+
+  // Disable exceptions.
+  buildMI(MBB, MBBI, AVR::BCLRs).addImm(7); // CLI
+
+  f(MI);
+
+  // Restore the status reg.
+  buildMI(MBB, MBBI, AVR::OUTARr)
+    .addImm(SREG_ADDR)
+    .addReg(SCRATCH_REGISTER);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template<typename Func>
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
+                                           Block &MBB,
+                                           BlockIt MBBI,
+                                           Func f) {
+  return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) {
+      auto Op1 = MI.getOperand(0);
+      auto Op2 = MI.getOperand(1);
+
+      MachineInstr &NewInst = *buildMI(MBB, MBBI, Opcode)
+        .addOperand(Op1).addOperand(Op2)
+        .getInstr();
+      f(NewInst);
+  });
+}
+
+bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
+                                           Block &MBB,
+                                           BlockIt MBBI) {
+  return expandAtomicBinaryOp(Opcode, MBB, MBBI, [](MachineInstr &MI) {});
+}
+
+bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
+                                               unsigned ArithOpcode,
+                                               Block &MBB,
+                                               BlockIt MBBI) {
+  return expandAtomic(MBB, MBBI, [&](MachineInstr &MI) {
+      auto Op1 = MI.getOperand(0);
+      auto Op2 = MI.getOperand(1);
+
+      unsigned LoadOpcode = (Width == 8) ? AVR::LDRdPtr : AVR::LDWRdPtr;
+      unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
+
+      // Create the load
+      buildMI(MBB, MBBI, LoadOpcode).addOperand(Op1).addOperand(Op2);
+
+      // Create the arithmetic op
+      buildMI(MBB, MBBI, ArithOpcode)
+        .addOperand(Op1).addOperand(Op1)
+        .addOperand(Op2);
+
+      // Create the store
+      buildMI(MBB, MBBI, StoreOpcode).addOperand(Op2).addOperand(Op1);
+  });
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoad16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicBinaryOp(AVR::LDWRdPtr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicStore8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicBinaryOp(AVR::STPtrRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicStore16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicBinaryOp(AVR::STWPtrRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(8, AVR::ADDRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAdd16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(16, AVR::ADDWRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadSub8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(8, AVR::SUBRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadSub16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(16, AVR::SUBWRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(8, AVR::ANDRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadAnd16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(16, AVR::ANDWRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadOr8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(8, AVR::ORRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadOr16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(16, AVR::ORWRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadXor8>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(8, AVR::EORRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicLoadXor16>(Block &MBB, BlockIt MBBI) {
+  return expandAtomicArithmeticOp(16, AVR::EORWRdRr, MBB, MBBI);
+}
+
+template<>
+bool AVRExpandPseudo::expand<AVR::AtomicFence>(Block &MBB, BlockIt MBBI) {
+  // On AVR, there is only one core and so atomic fences do nothing.
+  MBBI->eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::STSWKRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::STSKRr;
+  OpHi = AVR::STSKRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  // Write the high byte first in case this address belongs to a special
+  // I/O address with a special temporary register.
+  auto MIBHI = buildMI(MBB, MBBI, OpHi);
+  auto MIBLO = buildMI(MBB, MBBI, OpLo);
+
+  switch (MI.getOperand(0).getType()) {
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MI.getOperand(0).getGlobal();
+    int64_t Offs = MI.getOperand(0).getOffset();
+    unsigned TF = MI.getOperand(0).getTargetFlags();
+
+    MIBLO.addGlobalAddress(GV, Offs, TF);
+    MIBHI.addGlobalAddress(GV, Offs + 1, TF);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    unsigned Imm = MI.getOperand(0).getImm();
+
+    MIBLO.addImm(Imm);
+    MIBHI.addImm(Imm + 1);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown operand type!");
+  }
+
+  MIBLO.addReg(SrcLoReg, getKillRegState(SrcIsKill));
+  MIBHI.addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsKill = MI.getOperand(0).isKill();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::STPtrRr;
+  OpHi = AVR::STDPtrQRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  //:TODO: need to reverse this order like inw and stsw?
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstReg)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstReg, getKillRegState(DstIsKill))
+    .addImm(1)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrPiRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  unsigned Imm = MI.getOperand(3).getImm();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  OpLo = AVR::STPtrPiRr;
+  OpHi = AVR::STPtrPiRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstReg, RegState::Define)
+    .addReg(DstReg, RegState::Kill)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+    .addImm(Imm);
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg, RegState::Kill)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+    .addImm(Imm);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::STWPtrPdRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  unsigned Imm = MI.getOperand(3).getImm();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  OpLo = AVR::STPtrPdRr;
+  OpHi = AVR::STPtrPdRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstReg, RegState::Define)
+    .addReg(DstReg, RegState::Kill)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+    .addImm(Imm);
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg, RegState::Kill)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+    .addImm(Imm);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  unsigned Imm = MI.getOperand(1).getImm();
+  bool DstIsKill = MI.getOperand(0).isKill();
+  bool SrcIsKill = MI.getOperand(2).isKill();
+  OpLo = AVR::STDPtrQRr;
+  OpHi = AVR::STDPtrQRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  assert(Imm <= 63 && "Offset is out of range");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstReg)
+    .addImm(Imm)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstReg, getKillRegState(DstIsKill))
+    .addImm(Imm + 1)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::INWRdA>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned Imm = MI.getOperand(1).getImm();
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  OpLo = AVR::INRdA;
+  OpHi = AVR::INRdA;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  assert(Imm <= 63 && "Address is out of range");
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addImm(Imm);
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addImm(Imm + 1);
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::OUTWARr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned Imm = MI.getOperand(0).getImm();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  OpLo = AVR::OUTARr;
+  OpHi = AVR::OUTARr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  assert(Imm <= 63 && "Address is out of range");
+
+  // 16 bit I/O writes need the high byte first
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addImm(Imm + 1)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill));
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addImm(Imm)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill));
+
+  MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::PUSHWRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+  unsigned SrcReg = MI.getOperand(0).getReg();
+  bool SrcIsKill = MI.getOperand(0).isKill();
+  unsigned Flags = MI.getFlags();
+  OpLo = AVR::PUSHRr;
+  OpHi = AVR::PUSHRr;
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  // Low part
+  buildMI(MBB, MBBI, OpLo)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+    .setMIFlags(Flags);
+
+  // High part
+  buildMI(MBB, MBBI, OpHi)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+    .setMIFlags(Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::POPWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned Flags = MI.getFlags();
+  OpLo = AVR::POPRd;
+  OpHi = AVR::POPRd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  buildMI(MBB, MBBI, OpHi, DstHiReg).setMIFlags(Flags); // High
+  buildMI(MBB, MBBI, OpLo, DstLoReg).setMIFlags(Flags); // Low
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::LSLRd;
+  OpHi = AVR::ROLRd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // Low part
+  buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+  auto MIBHI = buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+  if (ImpIsDead)
+    MIBHI->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBHI->getOperand(3).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::RORRd;
+  OpHi = AVR::LSRRd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // High part
+  buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+  if (ImpIsDead)
+    MIBLO->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBLO->getOperand(3).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::RORWRd>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("RORW unimplemented");
+  return false;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ROLWRd>(Block &MBB, BlockIt MBBI) {
+  llvm_unreachable("ROLW unimplemented");
+  return false;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool DstIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  OpLo = AVR::RORRd;
+  OpHi = AVR::ASRRd;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // High part
+  buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+  auto MIBLO = buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+  if (ImpIsDead)
+    MIBLO->getOperand(2).setIsDead();
+
+  // SREG is always implicitly killed
+  MIBLO->getOperand(3).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstLoReg, DstHiReg;
+  // sext R17:R16, R17
+  // mov     r16, r17
+  // lsl     r17
+  // sbc     r17, r17
+  // sext R17:R16, R13
+  // mov     r16, r13
+  // mov     r17, r13
+  // lsl     r17
+  // sbc     r17, r17
+  // sext R17:R16, R16
+  // mov     r17, r16
+  // lsl     r17
+  // sbc     r17, r17
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  if (SrcReg != DstLoReg) {
+    auto MOV = buildMI(MBB, MBBI, AVR::MOVRdRr)
+      .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(SrcReg);
+
+    if (SrcReg == DstHiReg) {
+      MOV->getOperand(1).setIsKill();
+    }
+  }
+
+  if (SrcReg != DstHiReg) {
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+      .addReg(DstHiReg, RegState::Define)
+      .addReg(SrcReg, getKillRegState(SrcIsKill));
+  }
+
+  buildMI(MBB, MBBI, AVR::LSLRd)
+    .addReg(DstHiReg, RegState::Define)
+    .addReg(DstHiReg, RegState::Kill);
+
+  auto SBC = buildMI(MBB, MBBI, AVR::SBCRdRr)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, RegState::Kill)
+    .addReg(DstHiReg, RegState::Kill);
+
+  if (ImpIsDead)
+    SBC->getOperand(3).setIsDead();
+
+  // SREG is always implicitly killed
+  SBC->getOperand(4).setIsKill();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned DstLoReg, DstHiReg;
+  // zext R25:R24, R20
+  // mov      R24, R20
+  // eor      R25, R25
+  // zext R25:R24, R24
+  // eor      R25, R25
+  // zext R25:R24, R25
+  // mov      R24, R25
+  // eor      R25, R25
+  unsigned DstReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  bool ImpIsDead = MI.getOperand(2).isDead();
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  if (SrcReg != DstLoReg) {
+    buildMI(MBB, MBBI, AVR::MOVRdRr)
+      .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(SrcReg, getKillRegState(SrcIsKill));
+  }
+
+  auto EOR = buildMI(MBB, MBBI, AVR::EORRdRr)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstHiReg, RegState::Kill)
+    .addReg(DstHiReg, RegState::Kill);
+
+  if (ImpIsDead)
+    EOR->getOperand(3).setIsDead();
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  unsigned Flags = MI.getFlags();
+  OpLo = AVR::INRdA;
+  OpHi = AVR::INRdA;
+  TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+  // Low part
+  buildMI(MBB, MBBI, OpLo)
+    .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addImm(0x3d)
+    .setMIFlags(Flags);
+
+  // High part
+  buildMI(MBB, MBBI, OpHi)
+    .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addImm(0x3e)
+    .setMIFlags(Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned SrcLoReg, SrcHiReg;
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool SrcIsKill = MI.getOperand(1).isKill();
+  unsigned Flags = MI.getFlags();
+  TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
+
+  buildMI(MBB, MBBI, AVR::INRdA)
+    .addReg(AVR::R0, RegState::Define)
+    .addImm(SREG_ADDR)
+    .setMIFlags(Flags);
+
+  buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
+
+  buildMI(MBB, MBBI, AVR::OUTARr)
+    .addImm(0x3e)
+    .addReg(SrcHiReg, getKillRegState(SrcIsKill))
+    .setMIFlags(Flags);
+
+  buildMI(MBB, MBBI, AVR::OUTARr)
+    .addImm(SREG_ADDR)
+    .addReg(AVR::R0, RegState::Kill)
+    .setMIFlags(Flags);
+
+  buildMI(MBB, MBBI, AVR::OUTARr)
+    .addImm(0x3d)
+    .addReg(SrcLoReg, getKillRegState(SrcIsKill))
+    .setMIFlags(Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  int Opcode = MBBI->getOpcode();
+
+#define EXPAND(Op)               \
+  case Op:                       \
+    return expand<Op>(MBB, MI)
+
+  switch (Opcode) {
+    EXPAND(AVR::ADDWRdRr);
+    EXPAND(AVR::ADCWRdRr);
+    EXPAND(AVR::SUBWRdRr);
+    EXPAND(AVR::SUBIWRdK);
+    EXPAND(AVR::SBCWRdRr);
+    EXPAND(AVR::SBCIWRdK);
+    EXPAND(AVR::ANDWRdRr);
+    EXPAND(AVR::ANDIWRdK);
+    EXPAND(AVR::ORWRdRr);
+    EXPAND(AVR::ORIWRdK);
+    EXPAND(AVR::EORWRdRr);
+    EXPAND(AVR::COMWRd);
+    EXPAND(AVR::CPWRdRr);
+    EXPAND(AVR::CPCWRdRr);
+    EXPAND(AVR::LDIWRdK);
+    EXPAND(AVR::LDSWRdK);
+    EXPAND(AVR::LDWRdPtr);
+    EXPAND(AVR::LDWRdPtrPi);
+    EXPAND(AVR::LDWRdPtrPd);
+  case AVR::LDDWRdYQ: //:FIXME: remove this once PR13375 gets fixed
+    EXPAND(AVR::LDDWRdPtrQ);
+    EXPAND(AVR::LPMWRdZ);
+    EXPAND(AVR::LPMWRdZPi);
+    EXPAND(AVR::AtomicLoad8);
+    EXPAND(AVR::AtomicLoad16);
+    EXPAND(AVR::AtomicStore8);
+    EXPAND(AVR::AtomicStore16);
+    EXPAND(AVR::AtomicLoadAdd8);
+    EXPAND(AVR::AtomicLoadAdd16);
+    EXPAND(AVR::AtomicLoadSub8);
+    EXPAND(AVR::AtomicLoadSub16);
+    EXPAND(AVR::AtomicLoadAnd8);
+    EXPAND(AVR::AtomicLoadAnd16);
+    EXPAND(AVR::AtomicLoadOr8);
+    EXPAND(AVR::AtomicLoadOr16);
+    EXPAND(AVR::AtomicLoadXor8);
+    EXPAND(AVR::AtomicLoadXor16);
+    EXPAND(AVR::AtomicFence);
+    EXPAND(AVR::STSWKRr);
+    EXPAND(AVR::STWPtrRr);
+    EXPAND(AVR::STWPtrPiRr);
+    EXPAND(AVR::STWPtrPdRr);
+    EXPAND(AVR::STDWPtrQRr);
+    EXPAND(AVR::INWRdA);
+    EXPAND(AVR::OUTWARr);
+    EXPAND(AVR::PUSHWRr);
+    EXPAND(AVR::POPWRd);
+    EXPAND(AVR::LSLWRd);
+    EXPAND(AVR::LSRWRd);
+    EXPAND(AVR::RORWRd);
+    EXPAND(AVR::ROLWRd);
+    EXPAND(AVR::ASRWRd);
+    EXPAND(AVR::SEXT);
+    EXPAND(AVR::ZEXT);
+    EXPAND(AVR::SPREAD);
+    EXPAND(AVR::SPWRITE);
+  }
+#undef EXPAND
+  return false;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo",
+                AVR_EXPAND_PSEUDO_NAME, false, false)
+namespace llvm {
+
+FunctionPass *createAVRExpandPseudoPass() { return new AVRExpandPseudo(); }
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
new file mode 100644
index 000000000000..b8cb2215ddb4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -0,0 +1,538 @@
+//===-- AVRFrameLowering.cpp - AVR Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRFrameLowering.h"
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+#include <vector>
+
+namespace llvm {
+
+AVRFrameLowering::AVRFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 1, -2) {}
+
+bool AVRFrameLowering::canSimplifyCallFramePseudos(
+    const MachineFunction &MF) const {
+  // Always simplify call frame pseudo instructions, even when
+  // hasReservedCallFrame is false.
+  return true;
+}
+
+bool AVRFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // Reserve call frame memory in function prologue under the following
+  // conditions:
+  // - Y pointer is reserved to be the frame pointer.
+  // - The function does not contain variable sized objects.
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  return hasFP(MF) && !MFI.hasVarSizedObjects();
+}
+
+void AVRFrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
+  DebugLoc DL = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  const AVRInstrInfo &TII = *STI.getInstrInfo();
+
+  // Interrupt handlers re-enable interrupts in function entry.
+  if (CallConv == CallingConv::AVR_INTR) {
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::BSETs))
+        .addImm(0x07)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Emit special prologue code to save R1, R0 and SREG in interrupt/signal
+  // handlers before saving any other registers.
+  if (CallConv == CallingConv::AVR_INTR ||
+      CallConv == CallingConv::AVR_SIGNAL) {
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr))
+        .addReg(AVR::R1R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::INRdA), AVR::R0)
+        .addImm(0x3f)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr))
+        .addReg(AVR::R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr))
+        .addReg(AVR::R0, RegState::Define)
+        .addReg(AVR::R0, RegState::Kill)
+        .addReg(AVR::R0, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Early exit if the frame pointer is not needed in this function.
+  if (!hasFP(MF)) {
+    return;
+  }
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+  unsigned FrameSize = MFI.getStackSize() - AFI->getCalleeSavedFrameSize();
+
+  // Skip the callee-saved push instructions.
+  while (
+      (MBBI != MBB.end()) && MBBI->getFlag(MachineInstr::FrameSetup) &&
+      (MBBI->getOpcode() == AVR::PUSHRr || MBBI->getOpcode() == AVR::PUSHWRr)) {
+    ++MBBI;
+  }
+
+  // Update Y with the new base value.
+  BuildMI(MBB, MBBI, DL, TII.get(AVR::SPREAD), AVR::R29R28)
+      .addReg(AVR::SP)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Mark the FramePtr as live-in in every block except the entry.
+  for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+       I != E; ++I) {
+    I->addLiveIn(AVR::R29R28);
+  }
+
+  if (!FrameSize) {
+    return;
+  }
+
+  // Reserve the necessary frame memory by doing FP -= <size>.
+  unsigned Opcode = (isUInt<6>(FrameSize)) ? AVR::SBIWRdK : AVR::SUBIWRdK;
+
+  MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28)
+                         .addReg(AVR::R29R28, RegState::Kill)
+                         .addImm(FrameSize)
+                         .setMIFlag(MachineInstr::FrameSetup);
+  // The SREG implicit def is dead.
+  MI->getOperand(3).setIsDead();
+
+  // Write back R29R28 to SP and temporarily disable interrupts.
+  BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+      .addReg(AVR::R29R28)
+      .setMIFlag(MachineInstr::FrameSetup);
+}
+
+void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
+  bool isHandler = (CallConv == CallingConv::AVR_INTR ||
+                    CallConv == CallingConv::AVR_SIGNAL);
+
+  // Early exit if the frame pointer is not needed in this function except for
+  // signal/interrupt handlers where special code generation is required.
+  if (!hasFP(MF) && !isHandler) {
+    return;
+  }
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->getDesc().isReturn() &&
+         "Can only insert epilog into returning blocks");
+
+  DebugLoc DL = MBBI->getDebugLoc();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+  unsigned FrameSize = MFI.getStackSize() - AFI->getCalleeSavedFrameSize();
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  const AVRInstrInfo &TII = *STI.getInstrInfo();
+
+  // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal
+  // handlers at the very end of the function, just before reti.
+  if (isHandler) {
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0);
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr))
+        .addImm(0x3f)
+        .addReg(AVR::R0, RegState::Kill);
+    BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0);
+  }
+
+  // Early exit if there is no need to restore the frame pointer.
+  if (!FrameSize) {
+    return;
+  }
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = std::prev(MBBI);
+    int Opc = PI->getOpcode();
+
+    if (Opc != AVR::POPRd && Opc != AVR::POPWRd && !PI->isTerminator()) {
+      break;
+    }
+
+    --MBBI;
+  }
+
+  unsigned Opcode;
+
+  // Select the optimal opcode depending on how big it is.
+  if (isUInt<6>(FrameSize)) {
+    Opcode = AVR::ADIWRdK;
+  } else {
+    Opcode = AVR::SUBIWRdK;
+    FrameSize = -FrameSize;
+  }
+
+  // Restore the frame pointer by doing FP += <size>.
+  MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28)
+                         .addReg(AVR::R29R28, RegState::Kill)
+                         .addImm(FrameSize);
+  // The SREG implicit def is dead.
+  MI->getOperand(3).setIsDead();
+
+  // Write back R29R28 to SP and temporarily disable interrupts.
+  BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+      .addReg(AVR::R29R28, RegState::Kill);
+}
+
+// Return true if the specified function should have a dedicated frame
+// pointer register. This is true if the function meets any of the following
+// conditions:
+//  - a register has been spilled
+//  - has allocas
+//  - input arguments are passed using the stack
+//
+// Notice that strictly this is not a frame pointer because it contains SP after
+// frame allocation instead of having the original SP in function entry.
+bool AVRFrameLowering::hasFP(const MachineFunction &MF) const {
+  const AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
+
+  return (FuncInfo->getHasSpills() || FuncInfo->getHasAllocas() ||
+          FuncInfo->getHasStackArgs());
+}
+
+bool AVRFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  if (CSI.empty()) {
+    return false;
+  }
+
+  unsigned CalleeFrameSize = 0;
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  MachineFunction &MF = *MBB.getParent();
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  AVRMachineFunctionInfo *AVRFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+    bool IsNotLiveIn = !MBB.isLiveIn(Reg);
+
+    assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+           "Invalid register size");
+
+    // Add the callee-saved register as live-in only if it is not already a
+    // live-in register, this usually happens with arguments that are passed
+    // through callee-saved registers.
+    if (IsNotLiveIn) {
+      MBB.addLiveIn(Reg);
+    }
+
+    // Do not kill the register when it is an input argument.
+    BuildMI(MBB, MI, DL, TII.get(AVR::PUSHRr))
+        .addReg(Reg, getKillRegState(IsNotLiveIn))
+        .setMIFlag(MachineInstr::FrameSetup);
+    ++CalleeFrameSize;
+  }
+
+  AVRFI->setCalleeSavedFrameSize(CalleeFrameSize);
+
+  return true;
+}
+
+bool AVRFrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  if (CSI.empty()) {
+    return false;
+  }
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  const MachineFunction &MF = *MBB.getParent();
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  for (const CalleeSavedInfo &CCSI : CSI) {
+    unsigned Reg = CCSI.getReg();
+
+    assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+           "Invalid register size");
+
+    BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg);
+  }
+
+  return true;
+}
+
+/// Replace pseudo store instructions that pass arguments through the stack with
+/// real instructions. If insertPushes is true then all instructions are
+/// replaced with push instructions, otherwise regular std instructions are
+/// inserted.
+static void fixStackStores(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           const TargetInstrInfo &TII, bool insertPushes) {
+  const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+  // Iterate through the BB until we hit a call instruction or we reach the end.
+  for (auto I = MI, E = MBB.end(); I != E && !I->isCall();) {
+    MachineBasicBlock::iterator NextMI = std::next(I);
+    MachineInstr &MI = *I;
+    unsigned Opcode = I->getOpcode();
+
+    // Only care of pseudo store instructions where SP is the base pointer.
+    if (Opcode != AVR::STDSPQRr && Opcode != AVR::STDWSPQRr) {
+      I = NextMI;
+      continue;
+    }
+
+    assert(MI.getOperand(0).getReg() == AVR::SP &&
+           "Invalid register, should be SP!");
+    if (insertPushes) {
+      // Replace this instruction with a push.
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      bool SrcIsKill = MI.getOperand(2).isKill();
+
+      // We can't use PUSHWRr here because when expanded the order of the new
+      // instructions are reversed from what we need. Perform the expansion now.
+      if (Opcode == AVR::STDWSPQRr) {
+        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
+            .addReg(TRI.getSubReg(SrcReg, AVR::sub_hi),
+                    getKillRegState(SrcIsKill));
+        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
+            .addReg(TRI.getSubReg(SrcReg, AVR::sub_lo),
+                    getKillRegState(SrcIsKill));
+      } else {
+        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
+            .addReg(SrcReg, getKillRegState(SrcIsKill));
+      }
+
+      MI.eraseFromParent();
+      I = NextMI;
+      continue;
+    }
+
+    // Replace this instruction with a regular store. Use Y as the base
+    // pointer since it is guaranteed to contain a copy of SP.
+    unsigned STOpc =
+        (Opcode == AVR::STDWSPQRr) ? AVR::STDWPtrQRr : AVR::STDPtrQRr;
+
+    MI.setDesc(TII.get(STOpc));
+    MI.getOperand(0).setReg(AVR::R29R28);
+
+    I = NextMI;
+  }
+}
+
+MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MI) const {
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  const TargetFrameLowering &TFI = *STI.getFrameLowering();
+  const AVRInstrInfo &TII = *STI.getInstrInfo();
+
+  // There is nothing to insert when the call frame memory is allocated during
+  // function entry. Delete the call frame pseudo and replace all pseudo stores
+  // with real store instructions.
+  if (TFI.hasReservedCallFrame(MF)) {
+    fixStackStores(MBB, MI, TII, false);
+    return MBB.erase(MI);
+  }
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned int Opcode = MI->getOpcode();
+  int Amount = MI->getOperand(0).getImm();
+
+  // Adjcallstackup does not need to allocate stack space for the call, instead
+  // we insert push instructions that will allocate the necessary stack.
+  // For adjcallstackdown we convert it into an 'adiw reg, <amt>' handling
+  // the read and write of SP in I/O space.
+  if (Amount != 0) {
+    assert(TFI.getStackAlignment() == 1 && "Unsupported stack alignment");
+
+    if (Opcode == TII.getCallFrameSetupOpcode()) {
+      fixStackStores(MBB, MI, TII, true);
+    } else {
+      assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+      // Select the best opcode to adjust SP based on the offset size.
+      unsigned addOpcode;
+      if (isUInt<6>(Amount)) {
+        addOpcode = AVR::ADIWRdK;
+      } else {
+        addOpcode = AVR::SUBIWRdK;
+        Amount = -Amount;
+      }
+
+      // Build the instruction sequence.
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP);
+
+      MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(addOpcode), AVR::R31R30)
+                              .addReg(AVR::R31R30, RegState::Kill)
+                              .addImm(Amount);
+      New->getOperand(3).setIsDead();
+
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+          .addReg(AVR::R31R30, RegState::Kill);
+    }
+  }
+
+  return MBB.erase(MI);
+}
+
+void AVRFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  // Spill register Y when it is used as the frame pointer.
+  if (hasFP(MF)) {
+    SavedRegs.set(AVR::R29R28);
+    SavedRegs.set(AVR::R29);
+    SavedRegs.set(AVR::R28);
+  }
+}
+/// The frame analyzer pass.
+///
+/// Scans the function for allocas and used arguments
+/// that are passed through the stack.
+struct AVRFrameAnalyzer : public MachineFunctionPass {
+  static char ID;
+  AVRFrameAnalyzer() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
+
+    // If there are no fixed frame indexes during this stage it means there
+    // are allocas present in the function.
+    if (MFI.getNumObjects() != MFI.getNumFixedObjects()) {
+      // Check for the type of allocas present in the function. We only care
+      // about fixed size allocas so do not give false positives if only
+      // variable sized allocas are present.
+      for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+        // Variable sized objects have size 0.
+        if (MFI.getObjectSize(i)) {
+          FuncInfo->setHasAllocas(true);
+          break;
+        }
+      }
+    }
+
+    // If there are fixed frame indexes present, scan the function to see if
+    // they are really being used.
+    if (MFI.getNumFixedObjects() == 0) {
+      return false;
+    }
+
+    // Ok fixed frame indexes present, now scan the function to see if they
+    // are really being used, otherwise we can ignore them.
+    for (const MachineBasicBlock &BB : MF) {
+      for (const MachineInstr &MI : BB) {
+        int Opcode = MI.getOpcode();
+
+        if ((Opcode != AVR::LDDRdPtrQ) && (Opcode != AVR::LDDWRdPtrQ) &&
+            (Opcode != AVR::STDPtrQRr) && (Opcode != AVR::STDWPtrQRr)) {
+          continue;
+        }
+
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isFI()) {
+            continue;
+          }
+
+          if (MFI.isFixedObjectIndex(MO.getIndex())) {
+            FuncInfo->setHasStackArgs(true);
+            return false;
+          }
+        }
+      }
+    }
+
+    return false;
+  }
+
+  StringRef getPassName() const { return "AVR Frame Analyzer"; }
+};
+
+char AVRFrameAnalyzer::ID = 0;
+
+/// Creates instance of the frame analyzer pass.
+FunctionPass *createAVRFrameAnalyzerPass() { return new AVRFrameAnalyzer(); }
+
+/// Create the Dynalloca Stack Pointer Save/Restore pass.
+/// Insert a copy of SP before allocating the dynamic stack memory and restore
+/// it in function exit to restore the original SP state. This avoids the need
+/// of reserving a register pair for a frame pointer.
+struct AVRDynAllocaSR : public MachineFunctionPass {
+  static char ID;
+  AVRDynAllocaSR() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    // Early exit when there are no variable sized objects in the function.
+    if (!MF.getFrameInfo().hasVarSizedObjects()) {
+      return false;
+    }
+
+    const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+    const TargetInstrInfo &TII = *STI.getInstrInfo();
+    MachineBasicBlock &EntryMBB = MF.front();
+    MachineBasicBlock::iterator MBBI = EntryMBB.begin();
+    DebugLoc DL = EntryMBB.findDebugLoc(MBBI);
+
+    unsigned SPCopy =
+        MF.getRegInfo().createVirtualRegister(&AVR::DREGSRegClass);
+
+    // Create a copy of SP in function entry before any dynallocas are
+    // inserted.
+    BuildMI(EntryMBB, MBBI, DL, TII.get(AVR::COPY), SPCopy).addReg(AVR::SP);
+
+    // Restore SP in all exit basic blocks.
+    for (MachineBasicBlock &MBB : MF) {
+      // If last instruction is a return instruction, add a restore copy.
+      if (!MBB.empty() && MBB.back().isReturn()) {
+        MBBI = MBB.getLastNonDebugInstr();
+        DL = MBBI->getDebugLoc();
+        BuildMI(MBB, MBBI, DL, TII.get(AVR::COPY), AVR::SP)
+            .addReg(SPCopy, RegState::Kill);
+      }
+    }
+
+    return true;
+  }
+
+  StringRef getPassName() const {
+    return "AVR dynalloca stack pointer save/restore";
+  }
+};
+
+char AVRDynAllocaSR::ID = 0;
+
+/// createAVRDynAllocaSRPass - returns an instance of the dynalloca stack
+/// pointer save/restore pass.
+FunctionPass *createAVRDynAllocaSRPass() { return new AVRDynAllocaSR(); }
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h
new file mode 100644
index 000000000000..850a43abebfa
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.h
@@ -0,0 +1,46 @@
+//===-- AVRFrameLowering.h - Define frame lowering for AVR ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_FRAME_LOWERING_H
+#define LLVM_AVR_FRAME_LOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// Utilities for creating function call frames.
+class AVRFrameLowering : public TargetFrameLowering {
+public:
+  explicit AVRFrameLowering();
+
+public:
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool
+  restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const std::vector<CalleeSavedInfo> &CSI,
+                              const TargetRegisterInfo *TRI) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_FRAME_LOWERING_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
new file mode 100644
index 000000000000..156a21dfecfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -0,0 +1,565 @@
+//===-- AVRISelDAGToDAG.cpp - A dag to dag inst selector for AVR ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AVR target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "avr-isel"
+
+namespace llvm {
+
+/// Lowers LLVM IR (in DAG form) to AVR MC instructions (in DAG form).
+class AVRDAGToDAGISel : public SelectionDAGISel {
+public:
+  AVRDAGToDAGISel(AVRTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel), Subtarget(nullptr) {}
+
+  StringRef getPassName() const override {
+    return "AVR DAG->DAG Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Disp);
+
+  bool selectIndexedLoad(SDNode *N);
+  unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT);
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+// Include the pieces autogenerated from the target description.
+#include "AVRGenDAGISel.inc"
+
+private:
+  void Select(SDNode *N) override;
+  bool trySelect(SDNode *N);
+
+  template <unsigned NodeType> bool select(SDNode *N);
+  bool selectMultiplication(SDNode *N);
+
+  const AVRSubtarget *Subtarget;
+};
+
+bool AVRDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<AVRSubtarget>();
+  return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+bool AVRDAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                 SDValue &Disp) {
+  SDLoc dl(Op);
+  auto DL = CurDAG->getDataLayout();
+  MVT PtrVT = getTargetLowering()->getPointerTy(DL);
+
+  // if the address is a frame index get the TargetFrameIndex.
+  if (const FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), PtrVT);
+    Disp = CurDAG->getTargetConstant(0, dl, MVT::i8);
+
+    return true;
+  }
+
+  // Match simple Reg + uimm6 operands.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    return false;
+  }
+
+  if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+
+    // Convert negative offsets into positives ones.
+    if (N.getOpcode() == ISD::SUB) {
+      RHSC = -RHSC;
+    }
+
+    // <#Frame index + const>
+    // Allow folding offsets bigger than 63 so the frame pointer can be used
+    // directly instead of copying it around by adjusting and restoring it for
+    // each access.
+    if (N.getOperand(0).getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N.getOperand(0))->getIndex();
+
+      Base = CurDAG->getTargetFrameIndex(FI, PtrVT);
+      Disp = CurDAG->getTargetConstant(RHSC, dl, MVT::i16);
+
+      return true;
+    }
+
+    // The value type of the memory instruction determines what is the maximum
+    // offset allowed.
+    MVT VT = cast<MemSDNode>(Op)->getMemoryVT().getSimpleVT();
+
+    // We only accept offsets that fit in 6 bits (unsigned).
+    if (isUInt<6>(RHSC) && (VT == MVT::i8 || VT == MVT::i16)) {
+      Base = N.getOperand(0);
+      Disp = CurDAG->getTargetConstant(RHSC, dl, MVT::i8);
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool AVRDAGToDAGISel::selectIndexedLoad(SDNode *N) {
+  const LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+  auto PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+
+  // We only care if this load uses a POSTINC or PREDEC mode.
+  if ((LD->getExtensionType() != ISD::NON_EXTLOAD) ||
+      (AM != ISD::POST_INC && AM != ISD::PRE_DEC)) {
+
+    return false;
+  }
+
+  unsigned Opcode = 0;
+  bool isPre = (AM == ISD::PRE_DEC);
+  int Offs = cast<ConstantSDNode>(LD->getOffset())->getSExtValue();
+
+  switch (VT.SimpleTy) {
+  case MVT::i8: {
+    if ((!isPre && Offs != 1) || (isPre && Offs != -1)) {
+      return false;
+    }
+
+    Opcode = (isPre) ? AVR::LDRdPtrPd : AVR::LDRdPtrPi;
+    break;
+  }
+  case MVT::i16: {
+    if ((!isPre && Offs != 2) || (isPre && Offs != -2)) {
+      return false;
+    }
+
+    Opcode = (isPre) ? AVR::LDWRdPtrPd : AVR::LDWRdPtrPi;
+    break;
+  }
+  default:
+    return false;
+  }
+
+  SDNode *ResNode = CurDAG->getMachineNode(Opcode, SDLoc(N), VT,
+                                           PtrVT, MVT::Other,
+                                           LD->getBasePtr(), LD->getChain());
+  ReplaceUses(N, ResNode);
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
+                                                   MVT VT) {
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+
+  // Progmem indexed loads only work in POSTINC mode.
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD || AM != ISD::POST_INC) {
+    return 0;
+  }
+
+  unsigned Opcode = 0;
+  int Offs = cast<ConstantSDNode>(LD->getOffset())->getSExtValue();
+
+  switch (VT.SimpleTy) {
+  case MVT::i8: {
+    if (Offs != 1) {
+      return 0;
+    }
+    Opcode = AVR::LPMRdZPi;
+    break;
+  }
+  case MVT::i16: {
+    if (Offs != 2) {
+      return 0;
+    }
+    Opcode = AVR::LPMWRdZPi;
+    break;
+  }
+  default:
+    return 0;
+  }
+
+  return Opcode;
+}
+
+bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                   unsigned ConstraintCode,
+                                                   std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == InlineAsm::Constraint_m ||
+         ConstraintCode == InlineAsm::Constraint_Q &&
+      "Unexpected asm memory constraint");
+
+  MachineRegisterInfo &RI = MF->getRegInfo();
+  const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
+  const TargetLowering &TL = *STI.getTargetLowering();
+  SDLoc dl(Op);
+  auto DL = CurDAG->getDataLayout();
+
+  const RegisterSDNode *RegNode = dyn_cast<RegisterSDNode>(Op);
+
+  // If address operand is of PTRDISPREGS class, all is OK, then.
+  if (RegNode &&
+      RI.getRegClass(RegNode->getReg()) == &AVR::PTRDISPREGSRegClass) {
+    OutOps.push_back(Op);
+    return false;
+  }
+
+  if (Op->getOpcode() == ISD::FrameIndex) {
+    SDValue Base, Disp;
+
+    if (SelectAddr(Op.getNode(), Op, Base, Disp)) {
+      OutOps.push_back(Base);
+      OutOps.push_back(Disp);
+
+      return false;
+    }
+
+    return true;
+  }
+
+  // If Op is add 'register, immediate' and
+  // register is either virtual register or register of PTRDISPREGSRegClass
+  if (Op->getOpcode() == ISD::ADD || Op->getOpcode() == ISD::SUB) {
+    SDValue CopyFromRegOp = Op->getOperand(0);
+    SDValue ImmOp = Op->getOperand(1);
+    ConstantSDNode *ImmNode = dyn_cast<ConstantSDNode>(ImmOp);
+
+    unsigned Reg;
+    bool CanHandleRegImmOpt = true;
+
+    CanHandleRegImmOpt &= ImmNode != 0;
+    CanHandleRegImmOpt &= ImmNode->getAPIntValue().getZExtValue() < 64;
+
+    if (CopyFromRegOp->getOpcode() == ISD::CopyFromReg) {
+      RegisterSDNode *RegNode =
+          cast<RegisterSDNode>(CopyFromRegOp->getOperand(1));
+      Reg = RegNode->getReg();
+      CanHandleRegImmOpt &= (TargetRegisterInfo::isVirtualRegister(Reg) ||
+                             AVR::PTRDISPREGSRegClass.contains(Reg));
+    } else {
+      CanHandleRegImmOpt = false;
+    }
+
+    // If we detect proper case - correct virtual register class
+    // if needed and go to another inlineasm operand.
+    if (CanHandleRegImmOpt) {
+      SDValue Base, Disp;
+
+      if (RI.getRegClass(Reg) != &AVR::PTRDISPREGSRegClass) {
+        SDLoc dl(CopyFromRegOp);
+
+        unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+
+        SDValue CopyToReg =
+            CurDAG->getCopyToReg(CopyFromRegOp, dl, VReg, CopyFromRegOp);
+
+        SDValue NewCopyFromRegOp =
+            CurDAG->getCopyFromReg(CopyToReg, dl, VReg, TL.getPointerTy(DL));
+
+        Base = NewCopyFromRegOp;
+      } else {
+        Base = CopyFromRegOp;
+      }
+
+      if (ImmNode->getValueType(0) != MVT::i8) {
+        Disp = CurDAG->getTargetConstant(ImmNode->getAPIntValue().getZExtValue(), dl, MVT::i8);
+      } else {
+        Disp = ImmOp;
+      }
+
+      OutOps.push_back(Base);
+      OutOps.push_back(Disp);
+
+      return false;
+    }
+  }
+
+  // More generic case.
+  // Create chain that puts Op into pointer register
+  // and return that register.
+  unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+
+  SDValue CopyToReg = CurDAG->getCopyToReg(Op, dl, VReg, Op);
+  SDValue CopyFromReg =
+      CurDAG->getCopyFromReg(CopyToReg, dl, VReg, TL.getPointerTy(DL));
+
+  OutOps.push_back(CopyFromReg);
+
+  return false;
+}
+
+template <> bool AVRDAGToDAGISel::select<ISD::FrameIndex>(SDNode *N) {
+  auto DL = CurDAG->getDataLayout();
+
+  // Convert the frameindex into a temp instruction that will hold the
+  // effective address of the final stack slot.
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  SDValue TFI =
+    CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy(DL));
+
+  CurDAG->SelectNodeTo(N, AVR::FRMIDX,
+                       getTargetLowering()->getPointerTy(DL), TFI,
+                       CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16));
+  return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<ISD::STORE>(SDNode *N) {
+  // Use the STD{W}SPQRr pseudo instruction when passing arguments through
+  // the stack on function calls for further expansion during the PEI phase.
+  const StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue BasePtr = ST->getBasePtr();
+
+  // Early exit when the base pointer is a frame index node or a constant.
+  if (isa<FrameIndexSDNode>(BasePtr) || isa<ConstantSDNode>(BasePtr) ||
+      BasePtr.isUndef()) {
+    return false;
+  }
+
+  const RegisterSDNode *RN = dyn_cast<RegisterSDNode>(BasePtr.getOperand(0));
+  // Only stores where SP is the base pointer are valid.
+  if (!RN || (RN->getReg() != AVR::SP)) {
+    return false;
+  }
+
+  int CST = (int)cast<ConstantSDNode>(BasePtr.getOperand(1))->getZExtValue();
+  SDValue Chain = ST->getChain();
+  EVT VT = ST->getValue().getValueType();
+  SDLoc DL(N);
+  SDValue Offset = CurDAG->getTargetConstant(CST, DL, MVT::i16);
+  SDValue Ops[] = {BasePtr.getOperand(0), Offset, ST->getValue(), Chain};
+  unsigned Opc = (VT == MVT::i16) ? AVR::STDWSPQRr : AVR::STDSPQRr;
+
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, MVT::Other, Ops);
+
+  // Transfer memory operands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = ST->getMemOperand();
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
+  const LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (!AVR::isProgramMemoryAccess(LD)) {
+    // Check if the opcode can be converted into an indexed load.
+    return selectIndexedLoad(N);
+  }
+
+  assert(Subtarget->hasLPM() && "cannot load from program memory on this mcu");
+
+  // This is a flash memory load, move the pointer into R31R30 and emit
+  // the lpm instruction.
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  SDNode *ResNode;
+  SDLoc DL(N);
+
+  Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, Ptr, SDValue());
+  Ptr = CurDAG->getCopyFromReg(Chain, DL, AVR::R31R30, MVT::i16,
+                               Chain.getValue(1));
+
+  SDValue RegZ = CurDAG->getRegister(AVR::R31R30, MVT::i16);
+
+  // Check if the opcode can be converted into an indexed load.
+  if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT)) {
+    // It is legal to fold the load into an indexed load.
+    ResNode = CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr,
+                                     RegZ);
+    ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+  } else {
+    // Selecting an indexed load is not legal, fallback to a normal load.
+    switch (VT.SimpleTy) {
+    case MVT::i8:
+      ResNode = CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other,
+                                       Ptr, RegZ);
+      break;
+    case MVT::i16:
+      ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16,
+                                       MVT::Other, Ptr, RegZ);
+      ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+      break;
+    default:
+      llvm_unreachable("Unsupported VT!");
+    }
+  }
+
+  // Transfer memory operands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = LD->getMemOperand();
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<AVRISD::CALL>(SDNode *N) {
+  SDValue InFlag;
+  SDValue Chain = N->getOperand(0);
+  SDValue Callee = N->getOperand(1);
+  unsigned LastOpNum = N->getNumOperands() - 1;
+
+  // Direct calls are autogenerated.
+  unsigned Op = Callee.getOpcode();
+  if (Op == ISD::TargetGlobalAddress || Op == ISD::TargetExternalSymbol) {
+    return false;
+  }
+
+  // Skip the incoming flag if present
+  if (N->getOperand(LastOpNum).getValueType() == MVT::Glue) {
+    --LastOpNum;
+  }
+
+  SDLoc DL(N);
+  Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, Callee, InFlag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(CurDAG->getRegister(AVR::R31R30, MVT::i16));
+
+  // Map all operands into the new node.
+  for (unsigned i = 2, e = LastOpNum + 1; i != e; ++i) {
+    Ops.push_back(N->getOperand(i));
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Chain.getValue(1));
+
+  SDNode *ResNode =
+    CurDAG->getMachineNode(AVR::ICALL, DL, MVT::Other, MVT::Glue, Ops);
+
+  ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+template <> bool AVRDAGToDAGISel::select<ISD::BRIND>(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue JmpAddr = N->getOperand(1);
+
+  SDLoc DL(N);
+  // Move the destination address of the indirect branch into R31R30.
+  Chain = CurDAG->getCopyToReg(Chain, DL, AVR::R31R30, JmpAddr);
+  SDNode *ResNode = CurDAG->getMachineNode(AVR::IJMP, DL, MVT::Other, Chain);
+
+  ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+  CurDAG->RemoveDeadNode(N);
+
+  return true;
+}
+
+bool AVRDAGToDAGISel::selectMultiplication(llvm::SDNode *N) {
+  SDLoc DL(N);
+  MVT Type = N->getSimpleValueType(0);
+
+  assert(Type == MVT::i8 && "unexpected value type");
+
+  bool isSigned = N->getOpcode() == ISD::SMUL_LOHI;
+  unsigned MachineOp = isSigned ? AVR::MULSRdRr : AVR::MULRdRr;
+
+  SDValue Lhs = N->getOperand(0);
+  SDValue Rhs = N->getOperand(1);
+  SDNode *Mul = CurDAG->getMachineNode(MachineOp, DL, MVT::Glue, Lhs, Rhs);
+  SDValue InChain = CurDAG->getEntryNode();
+  SDValue InGlue = SDValue(Mul, 0);
+
+  // Copy the low half of the result, if it is needed.
+  if (N->hasAnyUseOfValue(0)) {
+    SDValue CopyFromLo =
+        CurDAG->getCopyFromReg(InChain, DL, AVR::R0, Type, InGlue);
+
+    ReplaceUses(SDValue(N, 0), CopyFromLo);
+
+    InChain = CopyFromLo.getValue(1);
+    InGlue = CopyFromLo.getValue(2);
+  }
+
+  // Copy the high half of the result, if it is needed.
+  if (N->hasAnyUseOfValue(1)) {
+    SDValue CopyFromHi =
+        CurDAG->getCopyFromReg(InChain, DL, AVR::R1, Type, InGlue);
+
+    ReplaceUses(SDValue(N, 1), CopyFromHi);
+
+    InChain = CopyFromHi.getValue(1);
+    InGlue = CopyFromHi.getValue(2);
+  }
+
+  CurDAG->RemoveDeadNode(N);
+
+  // We need to clear R1. This is currently done (dirtily)
+  // using a custom inserter.
+
+  return true;
+}
+
+void AVRDAGToDAGISel::Select(SDNode *N) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; N->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (N->isMachineOpcode()) {
+    DEBUG(errs() << "== "; N->dump(CurDAG); errs() << "\n");
+    N->setNodeId(-1);
+    return;
+  }
+
+  // See if subclasses can handle this node.
+  if (trySelect(N))
+    return;
+
+  // Select the default instruction
+  SelectCode(N);
+}
+
+bool AVRDAGToDAGISel::trySelect(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDLoc DL(N);
+
+  switch (Opcode) {
+  // Nodes we fully handle.
+  case ISD::FrameIndex: return select<ISD::FrameIndex>(N);
+  case ISD::BRIND:      return select<ISD::BRIND>(N);
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:  return selectMultiplication(N);
+
+  // Nodes we handle partially. Other cases are autogenerated
+  case ISD::STORE:   return select<ISD::STORE>(N);
+  case ISD::LOAD:    return select<ISD::LOAD>(N);
+  case AVRISD::CALL: return select<AVRISD::CALL>(N);
+  default:           return false;
+  }
+}
+
+FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel) {
+  return new AVRDAGToDAGISel(TM, OptLevel);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
new file mode 100644
index 000000000000..53668f05b59b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -0,0 +1,1937 @@
+//===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AVR uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRISelLowering.h"
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include "AVR.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
+    : TargetLowering(tm) {
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
+  addRegisterClass(MVT::i16, &AVR::DREGSRegClass);
+
+  // Compute derived properties from the register classes.
+  computeRegisterProperties(tm.getSubtargetImpl()->getRegisterInfo());
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent);
+  setSchedulingPreference(Sched::RegPressure);
+  setStackPointerRegisterToSaveRestore(AVR::SP);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+      setLoadExtAction(N, VT, MVT::i1, Promote);
+      setLoadExtAction(N, VT, MVT::i8, Expand);
+    }
+  }
+
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  // sub (x, imm) gets canonicalized to add (x, -imm), so for illegal types
+  // revert into a sub since we don't have an add with immediate instruction.
+  setOperationAction(ISD::ADD, MVT::i32, Custom);
+  setOperationAction(ISD::ADD, MVT::i64, Custom);
+
+  // our shift instructions are only able to shift 1 bit at a time, so handle
+  // this in a custom way.
+  setOperationAction(ISD::SRA, MVT::i8, Custom);
+  setOperationAction(ISD::SHL, MVT::i8, Custom);
+  setOperationAction(ISD::SRL, MVT::i8, Custom);
+  setOperationAction(ISD::SRA, MVT::i16, Custom);
+  setOperationAction(ISD::SHL, MVT::i16, Custom);
+  setOperationAction(ISD::SRL, MVT::i16, Custom);
+  setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i8, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i16, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SETCC, MVT::i8, Custom);
+  setOperationAction(ISD::SETCC, MVT::i16, Custom);
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::i8, Expand);
+  setOperationAction(ISD::SELECT, MVT::i16, Expand);
+
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+
+  // Add support for postincrement and predecrement load/stores.
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_DEC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_DEC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_DEC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_DEC, MVT::i16, Legal);
+
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+
+  // Atomic operations which must be lowered to rtlib calls
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::ATOMIC_SWAP, VT, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
+  }
+
+  // Division/remainder
+  setOperationAction(ISD::UDIV, MVT::i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::i16, Expand);
+  setOperationAction(ISD::UREM, MVT::i8, Expand);
+  setOperationAction(ISD::UREM, MVT::i16, Expand);
+  setOperationAction(ISD::SDIV, MVT::i8, Expand);
+  setOperationAction(ISD::SDIV, MVT::i16, Expand);
+  setOperationAction(ISD::SREM, MVT::i8, Expand);
+  setOperationAction(ISD::SREM, MVT::i16, Expand);
+
+  // Make division and modulus custom
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::UDIVREM, VT, Custom);
+    setOperationAction(ISD::SDIVREM, VT, Custom);
+  }
+
+  // Do not use MUL. The AVR instructions are closer to SMUL_LOHI &co.
+  setOperationAction(ISD::MUL, MVT::i8, Expand);
+  setOperationAction(ISD::MUL, MVT::i16, Expand);
+
+  // Expand 16 bit multiplications.
+  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+  }
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+  }
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+    // TODO: The generated code is pretty poor. Investigate using the
+    // same "shift and subtract with carry" trick that we do for
+    // extending 8-bit to 16-bit. This may require infrastructure
+    // improvements in how we treat 16-bit "registers" to be feasible.
+  }
+
+  // Division rtlib functions (not supported)
+  setLibcallName(RTLIB::SDIV_I8, nullptr);
+  setLibcallName(RTLIB::SDIV_I16, nullptr);
+  setLibcallName(RTLIB::SDIV_I32, nullptr);
+  setLibcallName(RTLIB::SDIV_I64, nullptr);
+  setLibcallName(RTLIB::SDIV_I128, nullptr);
+  setLibcallName(RTLIB::UDIV_I8, nullptr);
+  setLibcallName(RTLIB::UDIV_I16, nullptr);
+  setLibcallName(RTLIB::UDIV_I32, nullptr);
+  setLibcallName(RTLIB::UDIV_I64, nullptr);
+  setLibcallName(RTLIB::UDIV_I128, nullptr);
+
+  // Modulus rtlib functions (not supported)
+  setLibcallName(RTLIB::SREM_I8, nullptr);
+  setLibcallName(RTLIB::SREM_I16, nullptr);
+  setLibcallName(RTLIB::SREM_I32, nullptr);
+  setLibcallName(RTLIB::SREM_I64, nullptr);
+  setLibcallName(RTLIB::SREM_I128, nullptr);
+  setLibcallName(RTLIB::UREM_I8, nullptr);
+  setLibcallName(RTLIB::UREM_I16, nullptr);
+  setLibcallName(RTLIB::UREM_I32, nullptr);
+  setLibcallName(RTLIB::UREM_I64, nullptr);
+  setLibcallName(RTLIB::UREM_I128, nullptr);
+
+  // Division and modulus rtlib functions
+  setLibcallName(RTLIB::SDIVREM_I8, "__divmodqi4");
+  setLibcallName(RTLIB::SDIVREM_I16, "__divmodhi4");
+  setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+  setLibcallName(RTLIB::SDIVREM_I64, "__divmoddi4");
+  setLibcallName(RTLIB::SDIVREM_I128, "__divmodti4");
+  setLibcallName(RTLIB::UDIVREM_I8, "__udivmodqi4");
+  setLibcallName(RTLIB::UDIVREM_I16, "__udivmodhi4");
+  setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
+  setLibcallName(RTLIB::UDIVREM_I128, "__udivmodti4");
+
+  // Several of the runtime library functions use a special calling conv
+  setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::AVR_BUILTIN);
+  setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::AVR_BUILTIN);
+  setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::AVR_BUILTIN);
+  setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::AVR_BUILTIN);
+
+  // Trigonometric rtlib functions
+  setLibcallName(RTLIB::SIN_F32, "sin");
+  setLibcallName(RTLIB::COS_F32, "cos");
+
+  setMinFunctionAlignment(1);
+  setMinimumJumpTableEntries(INT_MAX);
+}
+
+const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define NODE(name)       \
+  case AVRISD::name:     \
+    return #name
+
+  switch (Opcode) {
+  default:
+    return nullptr;
+    NODE(RET_FLAG);
+    NODE(RETI_FLAG);
+    NODE(CALL);
+    NODE(WRAPPER);
+    NODE(LSL);
+    NODE(LSR);
+    NODE(ROL);
+    NODE(ROR);
+    NODE(ASR);
+    NODE(LSLLOOP);
+    NODE(LSRLOOP);
+    NODE(ASRLOOP);
+    NODE(BRCOND);
+    NODE(CMP);
+    NODE(CMPC);
+    NODE(TST);
+    NODE(SELECT_CC);
+#undef NODE
+  }
+}
+
+EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                                          EVT VT) const {
+  assert(!VT.isVector() && "No AVR SetCC type for vectors!");
+  return MVT::i8;
+}
+
+SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
+  //:TODO: this function has to be completely rewritten to produce optimal
+  // code, for now it's producing very long but correct code.
+  unsigned Opc8;
+  const SDNode *N = Op.getNode();
+  EVT VT = Op.getValueType();
+  SDLoc dl(N);
+
+  // Expand non-constant shifts to loops.
+  if (!isa<ConstantSDNode>(N->getOperand(1))) {
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(AVRISD::LSLLOOP, dl, VT, N->getOperand(0),
+                         N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0),
+                         N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0),
+                         N->getOperand(1));
+    }
+  }
+
+  uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  SDValue Victim = N->getOperand(0);
+
+  switch (Op.getOpcode()) {
+  case ISD::SRA:
+    Opc8 = AVRISD::ASR;
+    break;
+  case ISD::ROTL:
+    Opc8 = AVRISD::ROL;
+    break;
+  case ISD::ROTR:
+    Opc8 = AVRISD::ROR;
+    break;
+  case ISD::SRL:
+    Opc8 = AVRISD::LSR;
+    break;
+  case ISD::SHL:
+    Opc8 = AVRISD::LSL;
+    break;
+  default:
+    llvm_unreachable("Invalid shift opcode");
+  }
+
+  while (ShiftAmount--) {
+    Victim = DAG.getNode(Opc8, dl, VT, Victim);
+  }
+
+  return Victim;
+}
+
+SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+  unsigned Opcode = Op->getOpcode();
+  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+         "Invalid opcode for Div/Rem lowering");
+  bool isSigned = (Opcode == ISD::SDIVREM);
+  EVT VT = Op->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+  RTLIB::Libcall LC;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:
+    LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+    break;
+  case MVT::i16:
+    LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+    break;
+  case MVT::i32:
+    LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+    break;
+  case MVT::i64:
+    LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+    break;
+  }
+
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (SDValue const &Value : Op->op_values()) {
+    Entry.Node = Value;
+    Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr);
+
+  SDLoc dl(Op);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(InChain)
+      .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+      .setInRegister()
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return CallInfo.first;
+}
+
+SDValue AVRTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  auto DL = DAG.getDataLayout();
+
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+  // Create the TargetGlobalAddress node, folding in the constant offset.
+  SDValue Result =
+      DAG.getTargetGlobalAddress(GV, SDLoc(Op), getPointerTy(DL), Offset);
+  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
+}
+
+SDValue AVRTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  auto DL = DAG.getDataLayout();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(DL));
+
+  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
+}
+
+/// IntCCToAVRCC - Convert a DAG integer condition code to an AVR CC.
+static AVRCC::CondCodes intCCToAVRCC(ISD::CondCode CC) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown condition code!");
+  case ISD::SETEQ:
+    return AVRCC::COND_EQ;
+  case ISD::SETNE:
+    return AVRCC::COND_NE;
+  case ISD::SETGE:
+    return AVRCC::COND_GE;
+  case ISD::SETLT:
+    return AVRCC::COND_LT;
+  case ISD::SETUGE:
+    return AVRCC::COND_SH;
+  case ISD::SETULT:
+    return AVRCC::COND_LO;
+  }
+}
+
+/// Returns appropriate AVR CMP/CMPC nodes and corresponding condition code for
+/// the given operands.
+SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                     SDValue &AVRcc, SelectionDAG &DAG,
+                                     SDLoc DL) const {
+  SDValue Cmp;
+  EVT VT = LHS.getValueType();
+  bool UseTest = false;
+
+  switch (CC) {
+  default:
+    break;
+  case ISD::SETLE: {
+    // Swap operands and reverse the branching condition.
+    std::swap(LHS, RHS);
+    CC = ISD::SETGE;
+    break;
+  }
+  case ISD::SETGT: {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+      switch (C->getSExtValue()) {
+      case -1: {
+        // When doing lhs > -1 use a tst instruction on the top part of lhs
+        // and use brpl instead of using a chain of cp/cpc.
+        UseTest = true;
+        AVRcc = DAG.getConstant(AVRCC::COND_PL, DL, MVT::i8);
+        break;
+      }
+      case 0: {
+        // Turn lhs > 0 into 0 < lhs since 0 can be materialized with
+        // __zero_reg__ in lhs.
+        RHS = LHS;
+        LHS = DAG.getConstant(0, DL, VT);
+        CC = ISD::SETLT;
+        break;
+      }
+      default: {
+        // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
+        // us to  fold the constant into the cmp instruction.
+        RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+        CC = ISD::SETGE;
+        break;
+      }
+      }
+      break;
+    }
+    // Swap operands and reverse the branching condition.
+    std::swap(LHS, RHS);
+    CC = ISD::SETLT;
+    break;
+  }
+  case ISD::SETLT: {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+      switch (C->getSExtValue()) {
+      case 1: {
+        // Turn lhs < 1 into 0 >= lhs since 0 can be materialized with
+        // __zero_reg__ in lhs.
+        RHS = LHS;
+        LHS = DAG.getConstant(0, DL, VT);
+        CC = ISD::SETGE;
+        break;
+      }
+      case 0: {
+        // When doing lhs < 0 use a tst instruction on the top part of lhs
+        // and use brmi instead of using a chain of cp/cpc.
+        UseTest = true;
+        AVRcc = DAG.getConstant(AVRCC::COND_MI, DL, MVT::i8);
+        break;
+      }
+      }
+    }
+    break;
+  }
+  case ISD::SETULE: {
+    // Swap operands and reverse the branching condition.
+    std::swap(LHS, RHS);
+    CC = ISD::SETUGE;
+    break;
+  }
+  case ISD::SETUGT: {
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold the constant into the cmp instruction.
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+      RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+      CC = ISD::SETUGE;
+      break;
+    }
+    // Swap operands and reverse the branching condition.
+    std::swap(LHS, RHS);
+    CC = ISD::SETULT;
+    break;
+  }
+  }
+
+  // Expand 32 and 64 bit comparisons with custom CMP and CMPC nodes instead of
+  // using the default and/or/xor expansion code which is much longer.
+  if (VT == MVT::i32) {
+    SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
+                                DAG.getIntPtrConstant(0, DL));
+    SDValue LHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS,
+                                DAG.getIntPtrConstant(1, DL));
+    SDValue RHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
+                                DAG.getIntPtrConstant(0, DL));
+    SDValue RHShi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS,
+                                DAG.getIntPtrConstant(1, DL));
+
+    if (UseTest) {
+      // When using tst we only care about the highest part.
+      SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHShi,
+                                DAG.getIntPtrConstant(1, DL));
+      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
+    } else {
+      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo);
+      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
+    }
+  } else if (VT == MVT::i64) {
+    SDValue LHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
+                                DAG.getIntPtrConstant(0, DL));
+    SDValue LHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS,
+                                DAG.getIntPtrConstant(1, DL));
+
+    SDValue LHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue LHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_0,
+                               DAG.getIntPtrConstant(1, DL));
+    SDValue LHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue LHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, LHS_1,
+                               DAG.getIntPtrConstant(1, DL));
+
+    SDValue RHS_0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
+                                DAG.getIntPtrConstant(0, DL));
+    SDValue RHS_1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS,
+                                DAG.getIntPtrConstant(1, DL));
+
+    SDValue RHS0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue RHS1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_0,
+                               DAG.getIntPtrConstant(1, DL));
+    SDValue RHS2 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue RHS3 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i16, RHS_1,
+                               DAG.getIntPtrConstant(1, DL));
+
+    if (UseTest) {
+      // When using tst we only care about the highest part.
+      SDValue Top = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS3,
+                                DAG.getIntPtrConstant(1, DL));
+      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
+    } else {
+      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS0, RHS0);
+      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS1, RHS1, Cmp);
+      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS2, RHS2, Cmp);
+      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS3, RHS3, Cmp);
+    }
+  } else if (VT == MVT::i8 || VT == MVT::i16) {
+    if (UseTest) {
+      // When using tst we only care about the highest part.
+      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue,
+                        (VT == MVT::i8)
+                            ? LHS
+                            : DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8,
+                                          LHS, DAG.getIntPtrConstant(1, DL)));
+    } else {
+      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
+    }
+  } else {
+    llvm_unreachable("Invalid comparison size");
+  }
+
+  // When using a test instruction AVRcc is already set.
+  if (!UseTest) {
+    AVRcc = DAG.getConstant(intCCToAVRCC(CC), DL, MVT::i8);
+  }
+
+  return Cmp;
+}
+
+SDValue AVRTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+
+  SDValue TargetCC;
+  SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, dl);
+
+  return DAG.getNode(AVRISD::BRCOND, dl, MVT::Other, Chain, Dest, TargetCC,
+                     Cmp);
+}
+
+SDValue AVRTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TrueV = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc dl(Op);
+
+  SDValue TargetCC;
+  SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, dl);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {TrueV, FalseV, TargetCC, Cmp};
+
+  return DAG.getNode(AVRISD::SELECT_CC, dl, VTs, Ops);
+}
+
+SDValue AVRTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+
+  SDValue TargetCC;
+  SDValue Cmp = getAVRCmp(LHS, RHS, CC, TargetCC, DAG, DL);
+
+  SDValue TrueV = DAG.getConstant(1, DL, Op.getValueType());
+  SDValue FalseV = DAG.getConstant(0, DL, Op.getValueType());
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {TrueV, FalseV, TargetCC, Cmp};
+
+  return DAG.getNode(AVRISD::SELECT_CC, DL, VTs, Ops);
+}
+
+SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  auto DL = DAG.getDataLayout();
+  SDLoc dl(Op);
+
+  // Vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL));
+
+  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV), 0);
+}
+
+SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:
+    return LowerShifts(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    return LowerDivRem(Op, DAG);
+  }
+
+  return SDValue();
+}
+
+/// Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void AVRTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(N);
+
+  switch (N->getOpcode()) {
+  case ISD::ADD: {
+    // Convert add (x, imm) into sub (x, -imm).
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      SDValue Sub = DAG.getNode(
+          ISD::SUB, DL, N->getValueType(0), N->getOperand(0),
+          DAG.getConstant(-C->getAPIntValue(), DL, C->getValueType(0)));
+      Results.push_back(Sub);
+    }
+    break;
+  }
+  default: {
+    SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+    for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+      Results.push_back(Res.getValue(I));
+
+    break;
+  }
+  }
+}
+
+/// Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  int64_t Offs = AM.BaseOffs;
+
+  // Allow absolute addresses.
+  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && Offs == 0) {
+    return true;
+  }
+
+  // Flash memory instructions only allow zero offsets.
+  if (isa<PointerType>(Ty) && AS == AVR::ProgramMemory) {
+    return false;
+  }
+
+  // Allow reg+<6bit> offset.
+  if (Offs < 0)
+    Offs = -Offs;
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) {
+    return true;
+  }
+
+  return false;
+}
+
+/// Returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool AVRTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  EVT VT;
+  const SDNode *Op;
+  SDLoc DL(N);
+
+  if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Op = LD->getBasePtr().getNode();
+    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+    if (AVR::isProgramMemoryAccess(LD)) {
+      return false;
+    }
+  } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Op = ST->getBasePtr().getNode();
+    if (AVR::isProgramMemoryAccess(ST)) {
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  if (VT != MVT::i8 && VT != MVT::i16) {
+    return false;
+  }
+
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
+    return false;
+  }
+
+  if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int RHSC = RHS->getSExtValue();
+    if (Op->getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if ((VT == MVT::i16 && RHSC != -2) || (VT == MVT::i8 && RHSC != -1)) {
+      return false;
+    }
+
+    Base = Op->getOperand(0);
+    Offset = DAG.getConstant(RHSC, DL, MVT::i8);
+    AM = ISD::PRE_DEC;
+
+    return true;
+  }
+
+  return false;
+}
+
+/// Returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool AVRTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) const {
+  EVT VT;
+  SDLoc DL(N);
+
+  if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+  } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    if (AVR::isProgramMemoryAccess(ST)) {
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  if (VT != MVT::i8 && VT != MVT::i16) {
+    return false;
+  }
+
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
+    return false;
+  }
+
+  if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int RHSC = RHS->getSExtValue();
+    if (Op->getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+    if ((VT == MVT::i16 && RHSC != 2) || (VT == MVT::i8 && RHSC != 1)) {
+      return false;
+    }
+
+    Base = Op->getOperand(0);
+    Offset = DAG.getConstant(RHSC, DL, MVT::i8);
+    AM = ISD::POST_INC;
+
+    return true;
+  }
+
+  return false;
+}
+
+bool AVRTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "AVRGenCallingConv.inc"
+
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+static void parseFunctionArgs(const Function *F, const DataLayout *TD,
+                              SmallVectorImpl<unsigned> &Out) {
+  for (Argument const &Arg : F->args()) {
+    unsigned Bytes = (TD->getTypeSizeInBits(Arg.getType()) + 7) / 8;
+    Out.push_back((Bytes + 1) / 2);
+  }
+}
+
+/// For external symbols there is no function prototype information so we
+/// have to rely directly on argument sizes.
+static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
+                                    SmallVectorImpl<unsigned> &Out) {
+  for (unsigned i = 0, e = In.size(); i != e;) {
+    unsigned Size = 0;
+    unsigned Offset = 0;
+    while ((i != e) && (In[i].PartOffset == Offset)) {
+      Offset += In[i].VT.getStoreSize();
+      ++i;
+      ++Size;
+    }
+    Out.push_back(Size);
+  }
+}
+
+static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
+  SDValue Callee = CLI.Callee;
+
+  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    return G->getSymbol();
+  }
+
+  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    return G->getGlobal()->getName();
+  }
+
+  llvm_unreachable("don't know how to get the name for this callee");
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of splitted arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
+                                     const Function *F, const DataLayout *TD,
+                                     const SmallVectorImpl<ISD::OutputArg> *Outs,
+                                     const SmallVectorImpl<ISD::InputArg> *Ins,
+                                     CallingConv::ID CallConv,
+                                     SmallVectorImpl<CCValAssign> &ArgLocs,
+                                     CCState &CCInfo, bool IsCall, bool IsVarArg) {
+  static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
+                                       AVR::R18, AVR::R16, AVR::R14,
+                                       AVR::R12, AVR::R10, AVR::R8};
+  static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
+                                        AVR::R19R18, AVR::R17R16, AVR::R15R14,
+                                        AVR::R13R12, AVR::R11R10, AVR::R9R8};
+  if (IsVarArg) {
+    // Variadic functions do not need all the analisys below.
+    if (IsCall) {
+      CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
+    } else {
+      CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
+    }
+    return;
+  }
+
+  // Fill in the Args array which will contain original argument sizes.
+  SmallVector<unsigned, 8> Args;
+  if (IsCall) {
+    parseExternFuncCallArgs(*Outs, Args);
+  } else {
+    assert(F != nullptr && "function should not be null");
+    parseFunctionArgs(F, TD, Args);
+  }
+
+  unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
+  // Variadic functions always use the stack.
+  bool UsesStack = false;
+  for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
+    unsigned Size = Args[i];
+    MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
+
+    // If we have plenty of regs to pass the whole argument do it.
+    if (!UsesStack && (Size <= RegsLeft)) {
+      const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
+
+      for (unsigned j = 0; j != Size; ++j) {
+        unsigned Reg = CCInfo.AllocateReg(
+            ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
+        CCInfo.addLoc(
+            CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
+        --RegsLeft;
+      }
+
+      // Reverse the order of the pieces to agree with the "big endian" format
+      // required in the calling convention ABI.
+      std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
+    } else {
+      // Pass the rest of arguments using the stack.
+      UsesStack = true;
+      for (unsigned j = 0; j != Size; ++j) {
+        unsigned Offset = CCInfo.AllocateStack(
+            TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
+            TD->getABITypeAlignment(
+                EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
+        CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
+                                          CCValAssign::Full));
+      }
+    }
+    pos += Size;
+  }
+}
+
+static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
+                                    const Function *F, const DataLayout *TD,
+                                    const SmallVectorImpl<ISD::OutputArg> *Outs,
+                                    const SmallVectorImpl<ISD::InputArg> *Ins,
+                                    CallingConv::ID CallConv,
+                                    SmallVectorImpl<CCValAssign> &ArgLocs,
+                                    CCState &CCInfo, bool IsCall, bool IsVarArg) {
+  StringRef FuncName = getFunctionName(CLI);
+
+  if (FuncName.startswith("__udivmod") || FuncName.startswith("__divmod")) {
+    CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
+  } else {
+    analyzeStandardArguments(&CLI, F, TD, Outs, Ins,
+                             CallConv, ArgLocs, CCInfo,
+                             IsCall, IsVarArg);
+  }
+}
+
+static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
+                             const Function *F, const DataLayout *TD,
+                             const SmallVectorImpl<ISD::OutputArg> *Outs,
+                             const SmallVectorImpl<ISD::InputArg> *Ins,
+                             CallingConv::ID CallConv,
+                             SmallVectorImpl<CCValAssign> &ArgLocs,
+                             CCState &CCInfo, bool IsCall, bool IsVarArg) {
+  switch (CallConv) {
+    case CallingConv::AVR_BUILTIN: {
+      analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins,
+                              CallConv, ArgLocs, CCInfo,
+                              IsCall, IsVarArg);
+      return;
+    }
+    default: {
+      analyzeStandardArguments(CLI, F, TD, Outs, Ins,
+                               CallConv, ArgLocs, CCInfo,
+                               IsCall, IsVarArg);
+      return;
+    }
+  }
+}
+
+SDValue AVRTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto DL = DAG.getDataLayout();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  analyzeArguments(nullptr, MF.getFunction(), &DL, 0, &Ins, CallConv, ArgLocs, CCInfo,
+                   false, isVarArg);
+
+  SDValue ArgValue;
+  for (CCValAssign &VA : ArgLocs) {
+
+    // Arguments stored on registers.
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC;
+      if (RegVT == MVT::i8) {
+        RC = &AVR::GPR8RegClass;
+      } else if (RegVT == MVT::i16) {
+        RC = &AVR::DREGSRegClass;
+      } else {
+        llvm_unreachable("Unknown argument type!");
+      }
+
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // :NOTE: Clang should not promote any i8 into i16 but for safety the
+      // following code will handle zexts or sexts generated by other
+      // front ends. Otherwise:
+      // If this is an 8 bit value, it is really passed promoted
+      // to 16 bits. Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::SExt:
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::ZExt:
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+    } else {
+      // Sanity check.
+      assert(VA.isMemLoc());
+
+      EVT LocVT = VA.getLocVT();
+
+      // Create the frame index object for this incoming parameter.
+      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+                                     VA.getLocMemOffset(), true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL));
+      InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN,
+                                   MachinePointerInfo::getFixedStack(MF, FI),
+                                   0));
+    }
+  }
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    unsigned StackSize = CCInfo.getNextStackOffset();
+    AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+    AFI->setVarArgsFrameIndex(MFI.CreateFixedObject(2, StackSize, true));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool isVarArg = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // AVR does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  const Function *F = nullptr;
+  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+
+    F = cast<Function>(GV);
+    Callee =
+        DAG.getTargetGlobalAddress(GV, DL, getPointerTy(DAG.getDataLayout()));
+  } else if (const ExternalSymbolSDNode *ES =
+                 dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(),
+                                         getPointerTy(DAG.getDataLayout()));
+  }
+
+  analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, 0, CallConv, ArgLocs, CCInfo,
+                   true, isVarArg);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                               DL);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+  // First, walk the register assignments, inserting copies.
+  unsigned AI, AE;
+  bool HasStackArgs = false;
+  for (AI = 0, AE = ArgLocs.size(); AI != AE; ++AI) {
+    CCValAssign &VA = ArgLocs[AI];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[AI];
+
+    // Promote the value if needed. With Clang this should not happen.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, RegVT, Arg);
+      break;
+    }
+
+    // Stop when we encounter a stack argument, we need to process them
+    // in reverse order in the loop below.
+    if (VA.isMemLoc()) {
+      HasStackArgs = true;
+      break;
+    }
+
+    // Arguments that can be passed on registers must be kept in the RegsToPass
+    // vector.
+    RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+  }
+
+  // Second, stack arguments have to walked in reverse order by inserting
+  // chained stores, this ensures their order is not changed by the scheduler
+  // and that the push instruction sequence generated is correct, otherwise they
+  // can be freely intermixed.
+  if (HasStackArgs) {
+    for (AE = AI, AI = ArgLocs.size(); AI != AE; --AI) {
+      unsigned Loc = AI - 1;
+      CCValAssign &VA = ArgLocs[Loc];
+      SDValue Arg = OutVals[Loc];
+
+      assert(VA.isMemLoc());
+
+      // SP points to one stack slot further so add one to adjust it.
+      SDValue PtrOff = DAG.getNode(
+          ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
+          DAG.getRegister(AVR::SP, getPointerTy(DAG.getDataLayout())),
+          DAG.getIntPtrConstant(VA.getLocMemOffset() + 1, DL));
+
+      Chain =
+          DAG.getStore(Chain, DL, Arg, PtrOff,
+                       MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
+                       0);
+    }
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emited instructions must be stuck together.
+  SDValue InFlag;
+  for (auto Reg : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (auto Reg : RegsToPass) {
+    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+  }
+
+  // Add a register mask operand representing the call-preserved registers.
+  const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode()) {
+    Ops.push_back(InFlag);
+  }
+
+  Chain = DAG.getNode(AVRISD::CALL, DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+                             DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+
+  if (!Ins.empty()) {
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, DL, DAG,
+                         InVals);
+}
+
+/// Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue AVRTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Handle runtime calling convs.
+  auto CCFunction = CCAssignFnForReturn(CallConv);
+  CCInfo.AnalyzeCallResult(Ins, CCFunction);
+
+  if (CallConv != CallingConv::AVR_BUILTIN && RVLocs.size() > 1) {
+    // Reverse splitted return values to get the "big endian" format required
+    // to agree with the calling convention ABI.
+    std::reverse(RVLocs.begin(), RVLocs.end());
+  }
+
+  // Copy all of the result registers out of their specified physreg.
+  for (CCValAssign const &RVLoc : RVLocs) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLoc.getLocReg(), RVLoc.getValVT(),
+                               InFlag)
+                .getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
+  switch (CC) {
+  case CallingConv::AVR_BUILTIN:
+    return RetCC_AVR_BUILTIN;
+  default:
+    return RetCC_AVR;
+  }
+}
+
+bool
+AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const
+{
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+
+  auto CCFunction = CCAssignFnForReturn(CallConv);
+  return CCInfo.CheckReturn(Outs, CCFunction);
+}
+
+SDValue
+AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &dl, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze return values.
+  auto CCFunction = CCAssignFnForReturn(CallConv);
+  CCInfo.AnalyzeReturn(Outs, CCFunction);
+
+  // If this is the first return lowered for this function, add the regs to
+  // the liveout set for the function.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned e = RVLocs.size();
+
+  // Reverse splitted return values to get the "big endian" format required
+  // to agree with the calling convention ABI.
+  if (e > 1) {
+    std::reverse(RVLocs.begin(), RVLocs.end());
+  }
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Don't emit the ret/reti instruction when the naked attribute is present in
+  // the function being compiled.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::Naked)) {
+    return Chain;
+  }
+
+  unsigned RetOpc =
+      (CallConv == CallingConv::AVR_INTR || CallConv == CallingConv::AVR_SIGNAL)
+          ? AVRISD::RETI_FLAG
+          : AVRISD::RET_FLAG;
+
+  RetOps[0] = Chain; // Update chain.
+
+  if (Flag.getNode()) {
+    RetOps.push_back(Flag);
+  }
+
+  return DAG.getNode(RetOpc, dl, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//  Custom Inserters
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
+                                                  MachineBasicBlock *BB) const {
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Invalid shift opcode!");
+  case AVR::Lsl8:
+    Opc = AVR::LSLRd;
+    RC = &AVR::GPR8RegClass;
+    break;
+  case AVR::Lsl16:
+    Opc = AVR::LSLWRd;
+    RC = &AVR::DREGSRegClass;
+    break;
+  case AVR::Asr8:
+    Opc = AVR::ASRRd;
+    RC = &AVR::GPR8RegClass;
+    break;
+  case AVR::Asr16:
+    Opc = AVR::ASRWRd;
+    RC = &AVR::DREGSRegClass;
+    break;
+  case AVR::Lsr8:
+    Opc = AVR::LSRRd;
+    RC = &AVR::GPR8RegClass;
+    break;
+  case AVR::Lsr16:
+    Opc = AVR::LSRWRd;
+    RC = &AVR::DREGSRegClass;
+    break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB->getParent()->begin();
+  ++I;
+
+  // Create loop block.
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+                BB->end());
+  RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned DstReg = MI.getOperand(0).getReg();
+
+  // BB:
+  // cp 0, N
+  // breq RemBB
+  BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0);
+  BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)
+      .addReg(SrcReg)
+      .addMBB(BB)
+      .addReg(ShiftReg2)
+      .addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
+      .addReg(ShiftAmtSrcReg)
+      .addMBB(BB)
+      .addReg(ShiftAmtReg2)
+      .addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
+      .addReg(ShiftAmtReg)
+      .addImm(1);
+  BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
+      .addReg(SrcReg)
+      .addMBB(BB)
+      .addReg(ShiftReg2)
+      .addMBB(LoopBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return RemBB;
+}
+
+static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
+  if (I->getOpcode() == AVR::COPY) {
+    unsigned SrcReg = I->getOperand(1).getReg();
+    return (SrcReg == AVR::R0 || SrcReg == AVR::R1);
+  }
+
+  return false;
+}
+
+// The mul instructions wreak havock on our zero_reg R1. We need to clear it
+// after the result has been evacuated. This is probably not the best way to do
+// it, but it works for now.
+MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
+                                                MachineBasicBlock *BB) const {
+  const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  MachineBasicBlock::iterator I(MI);
+  ++I; // in any case insert *after* the mul instruction
+  if (isCopyMulResult(I))
+    ++I;
+  if (isCopyMulResult(I))
+    ++I;
+  BuildMI(*BB, I, MI.getDebugLoc(), TII.get(AVR::EORRdRr), AVR::R1)
+      .addReg(AVR::R1)
+      .addReg(AVR::R1);
+  return BB;
+}
+
+MachineBasicBlock *
+AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *MBB) const {
+  int Opc = MI.getOpcode();
+
+  // Pseudo shift instructions with a non constant shift amount are expanded
+  // into a loop.
+  switch (Opc) {
+  case AVR::Lsl8:
+  case AVR::Lsl16:
+  case AVR::Lsr8:
+  case AVR::Lsr16:
+  case AVR::Asr8:
+  case AVR::Asr16:
+    return insertShift(MI, MBB);
+  case AVR::MULRdRr:
+  case AVR::MULSRdRr:
+    return insertMul(MI, MBB);
+  }
+
+  assert((Opc == AVR::Select16 || Opc == AVR::Select8) &&
+         "Unexpected instr type to insert");
+
+  const AVRInstrInfo &TII = (const AVRInstrInfo &)*MI.getParent()
+                                ->getParent()
+                                ->getSubtarget()
+                                .getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // To "insert" a SELECT instruction, we insert the diamond
+  // control-flow pattern. The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch
+  // on, the true/false values to select between, and a branch opcode
+  // to use.
+
+  MachineFunction *MF = MBB->getParent();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator I = MBB->getParent()->begin();
+  ++I;
+  MF->insert(I, trueMBB);
+  MF->insert(I, falseMBB);
+
+  // Transfer remaining instructions and all successors of the current
+  // block to the block which will contain the Phi node for the
+  // select.
+  trueMBB->splice(trueMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  trueMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  AVRCC::CondCodes CC = (AVRCC::CondCodes)MI.getOperand(3).getImm();
+  BuildMI(MBB, dl, TII.getBrCond(CC)).addMBB(trueMBB);
+  BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(falseMBB);
+  MBB->addSuccessor(falseMBB);
+  MBB->addSuccessor(trueMBB);
+
+  // Unconditionally flow back to the true block
+  BuildMI(falseMBB, dl, TII.get(AVR::RJMPk)).addMBB(trueMBB);
+  falseMBB->addSuccessor(trueMBB);
+
+  // Set up the Phi node to determine where we came from
+  BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI), MI.getOperand(0).getReg())
+    .addReg(MI.getOperand(1).getReg())
+    .addMBB(MBB)
+    .addReg(MI.getOperand(2).getReg())
+    .addMBB(falseMBB) ;
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return trueMBB;
+}
+
+//===----------------------------------------------------------------------===//
+//  Inline Asm Support
+//===----------------------------------------------------------------------===//
+
+AVRTargetLowering::ConstraintType
+AVRTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    // See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
+    switch (Constraint[0]) {
+    case 'a': // Simple upper registers
+    case 'b': // Base pointer registers pairs
+    case 'd': // Upper register
+    case 'l': // Lower registers
+    case 'e': // Pointer register pairs
+    case 'q': // Stack pointer register
+    case 'r': // Any register
+    case 'w': // Special upper register pairs
+      return C_RegisterClass;
+    case 't': // Temporary register
+    case 'x': case 'X': // Pointer register pair X
+    case 'y': case 'Y': // Pointer register pair Y
+    case 'z': case 'Z': // Pointer register pair Z
+      return C_Register;
+    case 'Q': // A memory address based on Y or Z pointer with displacement.
+      return C_Memory;
+    case 'G': // Floating point constant
+    case 'I': // 6-bit positive integer constant
+    case 'J': // 6-bit negative integer constant
+    case 'K': // Integer constant (Range: 2)
+    case 'L': // Integer constant (Range: 0)
+    case 'M': // 8-bit integer constant
+    case 'N': // Integer constant (Range: -1)
+    case 'O': // Integer constant (Range: 8, 16, 24)
+    case 'P': // Integer constant (Range: 1)
+    case 'R': // Integer constant (Range: -6 to 5)x
+      return C_Other;
+    default:
+      break;
+    }
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+unsigned
+AVRTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
+  // Not sure if this is actually the right thing to do, but we got to do
+  // *something* [agnat]
+  switch (ConstraintCode[0]) {
+  case 'Q':
+    return InlineAsm::Constraint_Q;
+  }
+  return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+}
+
+AVRTargetLowering::ConstraintWeight
+AVRTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  // (this behaviour has been copied from the ARM backend)
+  if (!CallOperandVal) {
+    return CW_Default;
+  }
+
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'd':
+  case 'r':
+  case 'l':
+    weight = CW_Register;
+    break;
+  case 'a':
+  case 'b':
+  case 'e':
+  case 'q':
+  case 't':
+  case 'w':
+  case 'x': case 'X':
+  case 'y': case 'Y':
+  case 'z': case 'Z':
+    weight = CW_SpecificReg;
+    break;
+  case 'G':
+    if (const ConstantFP *C = dyn_cast<ConstantFP>(CallOperandVal)) {
+      if (C->isZero()) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'I':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (isUInt<6>(C->getZExtValue())) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'J':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -63) && (C->getSExtValue() <= 0)) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'K':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() == 2) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'L':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() == 0) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'M':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (isUInt<8>(C->getZExtValue())) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'N':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getSExtValue() == -1) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'O':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 8) || (C->getZExtValue() == 16) ||
+          (C->getZExtValue() == 24)) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'P':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() == 1) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'R':
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -6) && (C->getSExtValue() <= 5)) {
+        weight = CW_Constant;
+      }
+    }
+    break;
+  case 'Q':
+    weight = CW_Memory;
+    break;
+  }
+
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                StringRef Constraint,
+                                                MVT VT) const {
+  auto STI = static_cast<const AVRTargetMachine &>(this->getTargetMachine())
+                 .getSubtargetImpl();
+
+  // We only support i8 and i16.
+  //
+  //:FIXME: remove this assert for now since it gets sometimes executed
+  // assert((VT == MVT::i16 || VT == MVT::i8) && "Wrong operand type.");
+
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'a': // Simple upper registers r16..r23.
+      return std::make_pair(0U, &AVR::LD8loRegClass);
+    case 'b': // Base pointer registers: y, z.
+      return std::make_pair(0U, &AVR::PTRDISPREGSRegClass);
+    case 'd': // Upper registers r16..r31.
+      return std::make_pair(0U, &AVR::LD8RegClass);
+    case 'l': // Lower registers r0..r15.
+      return std::make_pair(0U, &AVR::GPR8loRegClass);
+    case 'e': // Pointer register pairs: x, y, z.
+      return std::make_pair(0U, &AVR::PTRREGSRegClass);
+    case 'q': // Stack pointer register: SPH:SPL.
+      return std::make_pair(0U, &AVR::GPRSPRegClass);
+    case 'r': // Any register: r0..r31.
+      if (VT == MVT::i8)
+        return std::make_pair(0U, &AVR::GPR8RegClass);
+
+      assert(VT == MVT::i16 && "inline asm constraint too large");
+      return std::make_pair(0U, &AVR::DREGSRegClass);
+    case 't': // Temporary register: r0.
+      return std::make_pair(unsigned(AVR::R0), &AVR::GPR8RegClass);
+    case 'w': // Special upper register pairs: r24, r26, r28, r30.
+      return std::make_pair(0U, &AVR::IWREGSRegClass);
+    case 'x': // Pointer register pair X: r27:r26.
+    case 'X':
+      return std::make_pair(unsigned(AVR::R27R26), &AVR::PTRREGSRegClass);
+    case 'y': // Pointer register pair Y: r29:r28.
+    case 'Y':
+      return std::make_pair(unsigned(AVR::R29R28), &AVR::PTRREGSRegClass);
+    case 'z': // Pointer register pair Z: r31:r30.
+    case 'Z':
+      return std::make_pair(unsigned(AVR::R31R30), &AVR::PTRREGSRegClass);
+    default:
+      break;
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(STI->getRegisterInfo(),
+                                                      Constraint, VT);
+}
+
+void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue> &Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+  SDLoc DL(Op);
+  EVT Ty = Op.getValueType();
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) {
+    return;
+  }
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default:
+    break;
+  // Deal with integers first:
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P':
+  case 'R': {
+    const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C) {
+      return;
+    }
+
+    int64_t CVal64 = C->getSExtValue();
+    uint64_t CUVal64 = C->getZExtValue();
+    switch (ConstraintLetter) {
+    case 'I': // 0..63
+      if (!isUInt<6>(CUVal64))
+        return;
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'J': // -63..0
+      if (CVal64 < -63 || CVal64 > 0)
+        return;
+      Result = DAG.getTargetConstant(CVal64, DL, Ty);
+      break;
+    case 'K': // 2
+      if (CUVal64 != 2)
+        return;
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'L': // 0
+      if (CUVal64 != 0)
+        return;
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'M': // 0..255
+      if (!isUInt<8>(CUVal64))
+        return;
+      // i8 type may be printed as a negative number,
+      // e.g. 254 would be printed as -2,
+      // so we force it to i16 at least.
+      if (Ty.getSimpleVT() == MVT::i8) {
+        Ty = MVT::i16;
+      }
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'N': // -1
+      if (CVal64 != -1)
+        return;
+      Result = DAG.getTargetConstant(CVal64, DL, Ty);
+      break;
+    case 'O': // 8, 16, 24
+      if (CUVal64 != 8 && CUVal64 != 16 && CUVal64 != 24)
+        return;
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'P': // 1
+      if (CUVal64 != 1)
+        return;
+      Result = DAG.getTargetConstant(CUVal64, DL, Ty);
+      break;
+    case 'R': // -6..5
+      if (CVal64 < -6 || CVal64 > 5)
+        return;
+      Result = DAG.getTargetConstant(CVal64, DL, Ty);
+      break;
+    }
+
+    break;
+  }
+  case 'G':
+    const ConstantFPSDNode *FC = dyn_cast<ConstantFPSDNode>(Op);
+    if (!FC || !FC->isZero())
+      return;
+    // Soften float to i8 0
+    Result = DAG.getTargetConstant(0, DL, MVT::i8);
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.h b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
new file mode 100644
index 000000000000..17074e1b1eee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -0,0 +1,163 @@
+//===-- AVRISelLowering.h - AVR DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AVR uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ISEL_LOWERING_H
+#define LLVM_AVR_ISEL_LOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace AVRISD {
+
+/// AVR Specific DAG Nodes
+enum NodeType {
+  /// Start the numbering where the builtin ops leave off.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  /// Return from subroutine.
+  RET_FLAG,
+  /// Return from ISR.
+  RETI_FLAG,
+  /// Represents an abstract call instruction,
+  /// which includes a bunch of information.
+  CALL,
+  /// A wrapper node for TargetConstantPool,
+  /// TargetExternalSymbol, and TargetGlobalAddress.
+  WRAPPER,
+  LSL,     ///< Logical shift left.
+  LSR,     ///< Logical shift right.
+  ASR,     ///< Arithmetic shift right.
+  ROR,     ///< Bit rotate right.
+  ROL,     ///< Bit rotate left.
+  LSLLOOP, ///< A loop of single logical shift left instructions.
+  LSRLOOP, ///< A loop of single logical shift right instructions.
+  ASRLOOP, ///< A loop of single arithmetic shift right instructions.
+  /// AVR conditional branches. Operand 0 is the chain operand, operand 1
+  /// is the block to branch if condition is true, operand 2 is the
+  /// condition code, and operand 3 is the flag operand produced by a CMP
+  /// or TEST instruction.
+  BRCOND,
+  /// Compare instruction.
+  CMP,
+  /// Compare with carry instruction.
+  CMPC,
+  /// Test for zero or minus instruction.
+  TST,
+  /// Operand 0 and operand 1 are selection variable, operand 2
+  /// is condition code and operand 3 is flag operand.
+  SELECT_CC
+};
+
+} // end of namespace AVRISD
+
+class AVRTargetMachine;
+
+/// Performs target lowering for the AVR.
+class AVRTargetLowering : public TargetLowering {
+public:
+  explicit AVRTargetLowering(AVRTargetMachine &TM);
+
+public:
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
+    return MVT::i8;
+  }
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                 ISD::MemIndexedMode &AM,
+                                 SelectionDAG &DAG) const override;
+
+  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                  SDValue &Offset, ISD::MemIndexedMode &AM,
+                                  SelectionDAG &DAG) const override;
+
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *MBB) const override;
+
+  ConstraintType getConstraintType(StringRef Constraint) const override;
+
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                 const char *constraint) const override;
+
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+
+  unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override;
+
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+private:
+  SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
+                    SelectionDAG &DAG, SDLoc dl) const;
+  SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+  CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
+
+  bool CanLowerReturn(CallingConv::ID CallConv,
+                      MachineFunction &MF, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                      SelectionDAG &DAG) const override;
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool isVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          const SDLoc &dl, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals) const;
+
+private:
+  MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ISEL_LOWERING_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td b/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td
new file mode 100644
index 000000000000..ce5e606f9787
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -0,0 +1,579 @@
+//===-- AVRInstrInfo.td - AVR Instruction Formats ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AVR Instruction Format Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+// A generic AVR instruction.
+class AVRInst<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction
+{
+  let Namespace = "AVR";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+
+  field bits<32> SoftFail = 0;
+}
+
+/// A 16-bit AVR instruction.
+class AVRInst16<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst<outs, ins, asmstr, pattern>
+{
+  field bits<16> Inst;
+
+  let Size = 2;
+}
+
+/// a 32-bit AVR instruction.
+class AVRInst32<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst<outs, ins, asmstr, pattern>
+{
+  field bits<32> Inst;
+
+  let Size = 4;
+}
+
+// A class for pseudo instructions.
+// Psuedo instructions are not real AVR instructions. The DAG stores
+// psuedo instructions which are replaced by real AVR instructions by
+// AVRExpandPseudoInsts.cpp.
+//
+// For example, the ADDW (add wide, as in add 16 bit values) instruction
+// is defined as a pseudo instruction. In AVRExpandPseudoInsts.cpp,
+// the instruction is then replaced by two add instructions - one for each byte.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  let Pattern = pattern;
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Register / register instruction: <|opcode|ffrd|dddd|rrrr|>
+// opcode = 4 bits.
+// f = secondary opcode = 2 bits
+// d = destination = 5 bits
+// r = source = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FRdRr<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+            list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;
+  bits<5> rr;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = f;
+  let Inst{9} = rr{4};
+  let Inst{8-4} = rd;
+  let Inst{3-0} = rr{3-0};
+}
+
+class FTST<bits<4> opcode, bits<2> f, dag outs, dag ins, string asmstr,
+            list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = f;
+  let Inst{9} = rd{4};
+  let Inst{8-4} = rd;
+  let Inst{3-0} = rd{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction of the format `<mnemonic> Z, Rd`
+// <|1001|001r|rrrr|0ttt>
+//===----------------------------------------------------------------------===//
+class FZRd<bits<3> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;
+
+  let Inst{15-12} = 0b1001;
+
+  let Inst{11-9} = 0b001;
+  let Inst{8} = rd{4};
+
+  let Inst{7-4} = rd{3-0};
+
+  let Inst{3} = 0;
+  let Inst{2-0} = t;
+}
+
+//===----------------------------------------------------------------------===//
+// Register / immediate8 instruction: <|opcode|KKKK|dddd|KKKK|>
+// opcode = 4 bits.
+// K = constant data = 8 bits
+// d = destination = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+class FRdK<bits<4> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<4> rd;
+  bits<8> k;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8} = k{7-4};
+  let Inst{7-4} = rd{3-0};
+  let Inst{3-0} = k{3-0};
+
+  let isAsCheapAsAMove = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Register instruction: <|opcode|fffd|dddd|ffff|>
+// opcode = 4 bits.
+// f = secondary opcode = 7 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
+          list<dag> pattern> : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> d;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-9} = f{6-4};
+  let Inst{8-4} = d;
+  let Inst{3-0} = f{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// [STD/LDD] P+q, Rr special encoding: <|10q0|qqtr|rrrr|pqqq>
+// t = type (1 for STD, 0 for LDD)
+// q = displacement (6 bits)
+// r = register (5 bits)
+// p = pointer register (1 bit) [1 for Y, 0 for Z]
+//===----------------------------------------------------------------------===//
+class FSTDLDD<bit type, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<7> memri;
+  bits<5> reg; // the GP register
+
+  let Inst{15-14} = 0b10;
+  let Inst{13} = memri{5};
+  let Inst{12} = 0;
+
+  let Inst{11-10} = memri{4-3};
+  let Inst{9} = type;
+  let Inst{8} = reg{4};
+
+  let Inst{7-4} = reg{3-0};
+
+  let Inst{3} = memri{6};
+  let Inst{2-0} = memri{2-0};
+}
+
+//===---------------------------------------------------------------------===//
+// An ST/LD instruction.
+// <|100i|00tr|rrrr|ppaa|>
+// t = type (1 for store, 0 for load)
+// a = regular/postinc/predec (reg = 0b00, postinc = 0b01, predec = 0b10)
+// p = pointer register
+// r = src/dst register
+//
+// Note that the bit labelled 'i' above does not follow a simple pattern,
+// so there exists a post encoder method to set it manually.
+//===---------------------------------------------------------------------===//
+class FSTLD<bit type, bits<2> mode, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<2> ptrreg;
+  bits<5> reg;
+
+  let Inst{15-13} = 0b100;
+  // This bit varies depending on the arguments and the mode.
+  // We have a post encoder method to set this bit manually.
+  let Inst{12} = 0;
+
+  let Inst{11-10} = 0b00;
+  let Inst{9} = type;
+  let Inst{8} = reg{4};
+
+  let Inst{7-4} = reg{3-0};
+
+  let Inst{3-2} = ptrreg{1-0};
+  let Inst{1-0} = mode{1-0};
+
+  let PostEncoderMethod = "loadStorePostEncoder";
+}
+
+//===---------------------------------------------------------------------===//
+// Special format for the LPM/ELPM instructions
+// [E]LPM Rd, Z[+]
+// <|1001|000d|dddd|01ep>
+// d = destination register
+// e = is elpm
+// p = is postincrement
+//===---------------------------------------------------------------------===//
+class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+   bits<5> reg;
+
+   let Inst{15-12} = 0b1001;
+
+   let Inst{11-9} = 0b000;
+   let Inst{8} = reg{4};
+
+   let Inst{7-4} = reg{3-0};
+
+   let Inst{3-2} = 0b01;
+   let Inst{1} = e;
+   let Inst{0} = p;
+}
+
+//===----------------------------------------------------------------------===//
+// MOVWRdRr special encoding: <|0000|0001|dddd|rrrr|>
+// d = destination = 4 bits
+// r = source = 4 bits
+// (Only accepts even registers)
+//===----------------------------------------------------------------------===//
+class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> d;
+  bits<5> r;
+
+  let Inst{15-8} = 0b00000001;
+  let Inst{7-4} = d{4-1};
+  let Inst{3-0} = r{4-1};
+}
+
+//===----------------------------------------------------------------------===//
+// MULSrr special encoding: <|0000|0010|dddd|rrrr|>
+// d = multiplicand = 4 bits
+// r = multiplier = 4 bits
+// (Only accepts r16-r31)
+//===----------------------------------------------------------------------===//
+class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;              // accept 5 bits but only encode the lower 4
+  bits<5> rr;              // accept 5 bits but only encode the lower 4
+
+  let Inst{15-9} = 0b0000001;
+  let Inst{8} = f;
+  let Inst{7-4} = rd{3-0};
+  let Inst{3-0} = rr{3-0};
+}
+
+// Special encoding for the FMUL family of instructions.
+//
+// <0000|0011|fddd|frrr|>
+//
+// ff = 0b01 for FMUL
+//      0b10 for FMULS
+//      0b11 for FMULSU
+//
+// ddd = destination register
+// rrr = source register
+class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<3> rd;
+  bits<3> rr;
+
+  let Inst{15-8} = 0b00000011;
+  let Inst{7} = f{1};
+  let Inst{6-4} = rd;
+  let Inst{3} = f{0};
+  let Inst{2-0} = rr;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Arithmetic word instructions (ADIW / SBIW): <|1001|011f|kkdd|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant data = 6 bits
+// d = destination = 4 bits
+// (Only accepts r25:24 r27:26 r29:28 r31:30)
+//===----------------------------------------------------------------------===//
+class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> dst;              // accept 5 bits but only encode bits 1 and 2
+  bits<6> k;
+
+  let Inst{15-9} = 0b1001011;
+  let Inst{8} = f;
+  let Inst{7-6} = k{5-4};
+  let Inst{5-4} = dst{2-1};
+  let Inst{3-0} = k{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// In I/O instruction: <|1011|0AAd|dddd|AAAA|>
+// A = I/O location address = 6 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> d;
+  bits<6> A;
+
+  let Inst{15-11} = 0b10110;
+  let Inst{10-9} = A{5-4};
+  let Inst{8-4} = d;
+  let Inst{3-0} = A{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Out I/O instruction: <|1011|1AAr|rrrr|AAAA|>
+// A = I/O location address = 6 bits
+// d = destination = 5 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<6> A;
+  bits<5> r;
+
+  let Inst{15-11} = 0b10111;
+  let Inst{10-9} = A{5-4};
+  let Inst{8-4} = r;
+  let Inst{3-0} = A{3-0};
+}
+
+//===----------------------------------------------------------------------===//
+// I/O bit instruction.
+// <|1001|10tt|AAAA|Abbb>
+// t = type (1 for SBI, 0 for CBI)
+// A = I/O location address (5 bits)
+// b = bit number
+//===----------------------------------------------------------------------===//
+class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> A;
+  bits<3> b;
+
+  let Inst{15-12} = 0b1001;
+
+  let Inst{11-10} = 0b10;
+  let Inst{9-8} = t;
+
+  let Inst{7-4} = A{4-1};
+
+  let Inst{3} = A{0};
+  let Inst{2-0} = b{2-0};
+}
+
+//===----------------------------------------------------------------------===//
+// BST/BLD instruction.
+// <|1111|1ttd|dddd|0bbb>
+// t = type (1 for BST, 0 for BLD)
+// d = destination register
+// b = bit
+//===----------------------------------------------------------------------===//
+class FRdB<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;
+  bits<3> b;
+
+  let Inst{15-12} = 0b1111;
+
+  let Inst{11} = 0b1;
+  let Inst{10-9} = t;
+  let Inst{8} = rd{4};
+
+  let Inst{7-4} = rd{3-0};
+
+  let Inst{3} = 0;
+  let Inst{2-0} = b;
+}
+
+// Special encoding for the `DES K` instruction.
+//
+// <|1001|0100|KKKK|1011>
+//
+// KKKK = 4 bit immediate
+class FDES<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<4> k;
+
+  let Inst{15-12} = 0b1001;
+
+  let Inst{11-8} = 0b0100;
+
+  let Inst{7-4} = k;
+
+  let Inst{3-0} = 0b1011;
+}
+
+//===----------------------------------------------------------------------===//
+// Conditional Branching instructions: <|1111|0fkk|kkkk|ksss|>
+// f = secondary opcode = 1 bit
+// k = constant address = 7 bits
+// s = bit in status register = 3 bits
+//===----------------------------------------------------------------------===//
+class FBRsk<bit f, bits<3> s, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<7> k;
+
+  let Inst{15-11} = 0b11110;
+  let Inst{10} = f;
+  let Inst{9-3} = k;
+  let Inst{2-0} = s;
+}
+
+//===----------------------------------------------------------------------===//
+// Special, opcode only instructions: <|opcode|>
+//===----------------------------------------------------------------------===//
+
+class F16<bits<16> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  let Inst = opcode;
+}
+
+class F32<bits<32> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst32<outs, ins, asmstr, pattern>
+{
+  let Inst = opcode;
+}
+
+//===----------------------------------------------------------------------===//
+// Branching instructions with immediate12: <|110f|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 1 bit
+// k = constant address = 12 bits
+//===----------------------------------------------------------------------===//
+class FBRk<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<12> k;
+
+  let Inst{15-13} = 0b110;
+  let Inst{12} = f;
+  let Inst{11-0} = k;
+}
+
+//===----------------------------------------------------------------------===//
+// 32 bits branching instructions: <|1001|010k|kkkk|fffk|kkkk|kkkk|kkkk|kkkk|>
+// f = secondary opcode = 3 bits
+// k = constant address = 22 bits
+//===----------------------------------------------------------------------===//
+class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst32<outs, ins, asmstr, pattern>
+{
+  bits<22> k;
+
+  let Inst{31-25} = 0b1001010;
+  let Inst{24-20} = k{21-17};
+  let Inst{19-17} = f;
+  let Inst{16-0} = k{16-0};
+}
+
+//===----------------------------------------------------------------------===//
+// 32 bits direct mem instructions: <|1001|00fd|dddd|0000|kkkk|kkkk|kkkk|kkkk|> 
+// f = secondary opcode = 1 bit
+// d = destination = 5 bits
+// k = constant address = 16 bits
+// (Accepts all registers)
+//===----------------------------------------------------------------------===//
+class F32DM<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst32<outs, ins, asmstr, pattern>
+{
+  bits<5> rd;
+  bits<16> k;
+
+  let Inst{31-28} = 0b1001;
+
+  let Inst{27-26} = 0b00;
+  let Inst{25} = f;
+  let Inst{24} = rd{4};
+
+  let Inst{23-20} = rd{3-0};
+
+  let Inst{19-16} = 0b0000;
+
+  let Inst{15-0} = k;
+}
+
+// <|1001|0100|bfff|1000>
+class FS<bit b, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<3> s;
+
+  let Inst{15-12} = 0b1001;
+
+  let Inst{11-8} = 0b0100;
+
+  let Inst{7} = b;
+  let Inst{6-4} = s;
+
+  let Inst{3-0} = 0b1000;
+}
+
+// Set/clr bit in status flag instructions/
+// <BRBS|BRBC> s, k
+// ---------------------
+// <|1111|0fkk|kkkk|ksss>
+class FSK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : AVRInst16<outs, ins, asmstr, pattern>
+{
+  bits<7> k;
+  bits<3> s;
+
+  let Inst{15-12} = 0b1111;
+
+  let Inst{11} = 0;
+  let Inst{10} = f;
+  let Inst{9-8} = k{6-5};
+
+  let Inst{7-4} = k{4-1};
+
+  let Inst{3} = k{0};
+  let Inst{2-0} = s;
+}
+
+class ExtensionPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Pseudo<outs, ins, asmstr, pattern>
+{
+  let Defs = [SREG];
+}
+
+class StorePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Pseudo<outs, ins, asmstr, pattern>
+{
+  let Defs = [SP];
+}
+
+class SelectPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Pseudo<outs, ins, asmstr, pattern>
+{
+  let usesCustomInserter = 1;
+
+  let Uses = [SREG];
+}
+
+class ShiftPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Pseudo<outs, ins, asmstr, pattern>
+{
+  let usesCustomInserter = 1;
+
+  let Defs = [SREG];
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
new file mode 100644
index 000000000000..88f889260cce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -0,0 +1,498 @@
+//===-- AVRInstrInfo.cpp - AVR Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstrInfo.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRMachineFunctionInfo.h"
+#include "AVRRegisterInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AVRGenInstrInfo.inc"
+
+namespace llvm {
+
+AVRInstrInfo::AVRInstrInfo()
+    : AVRGenInstrInfo(AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI() {}
+
+void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
+  const AVRRegisterInfo &TRI = *STI.getRegisterInfo();
+  unsigned Opc;
+
+  // Not all AVR devices support the 16-bit `MOVW` instruction.
+  if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) {
+    if (STI.hasMOVW()) {
+      BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      unsigned DestLo, DestHi, SrcLo, SrcHi;
+
+      TRI.splitReg(DestReg, DestLo, DestHi);
+      TRI.splitReg(SrcReg,  SrcLo,  SrcHi);
+
+      // Copy each individual register with the `MOV` instruction.
+      BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestLo)
+        .addReg(SrcLo, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(AVR::MOVRdRr), DestHi)
+        .addReg(SrcHi, getKillRegState(KillSrc));
+    }
+  } else {
+    if (AVR::GPR8RegClass.contains(DestReg, SrcReg)) {
+      Opc = AVR::MOVRdRr;
+    } else if (SrcReg == AVR::SP && AVR::DREGSRegClass.contains(DestReg)) {
+      Opc = AVR::SPREAD;
+    } else if (DestReg == AVR::SP && AVR::DREGSRegClass.contains(SrcReg)) {
+      Opc = AVR::SPWRITE;
+    } else {
+      llvm_unreachable("Impossible reg-to-reg copy");
+    }
+
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  }
+}
+
+unsigned AVRInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case AVR::LDDRdPtrQ:
+  case AVR::LDDWRdYQ: { //:FIXME: remove this once PR13375 gets fixed
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+unsigned AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  case AVR::STDPtrQRr:
+  case AVR::STDWPtrQRr: {
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return MI.getOperand(2).getReg();
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, bool isKill,
+                                       int FrameIndex,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+  AFI->setHasSpills(true);
+
+  DebugLoc DL;
+  if (MI != MBB.end()) {
+    DL = MI->getDebugLoc();
+  }
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIndex),
+      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+      MFI.getObjectAlignment(FrameIndex));
+
+  unsigned Opcode = 0;
+  if (RC->hasType(MVT::i8)) {
+    Opcode = AVR::STDPtrQRr;
+  } else if (RC->hasType(MVT::i16)) {
+    Opcode = AVR::STDWPtrQRr;
+  } else {
+    llvm_unreachable("Cannot store this register into a stack slot!");
+  }
+
+  BuildMI(MBB, MI, DL, get(Opcode))
+      .addFrameIndex(FrameIndex)
+      .addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addMemOperand(MMO);
+}
+
+void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIndex,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MI != MBB.end()) {
+    DL = MI->getDebugLoc();
+  }
+
+  MachineFunction &MF = *MBB.getParent();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIndex),
+      MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+      MFI.getObjectAlignment(FrameIndex));
+
+  unsigned Opcode = 0;
+  if (RC->hasType(MVT::i8)) {
+    Opcode = AVR::LDDRdPtrQ;
+  } else if (RC->hasType(MVT::i16)) {
+    // Opcode = AVR::LDDWRdPtrQ;
+    //:FIXME: remove this once PR13375 gets fixed
+    Opcode = AVR::LDDWRdYQ;
+  } else {
+    llvm_unreachable("Cannot load this register from a stack slot!");
+  }
+
+  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+      .addFrameIndex(FrameIndex)
+      .addImm(0)
+      .addMemOperand(MMO);
+}
+
+const MCInstrDesc &AVRInstrInfo::getBrCond(AVRCC::CondCodes CC) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown condition code!");
+  case AVRCC::COND_EQ:
+    return get(AVR::BREQk);
+  case AVRCC::COND_NE:
+    return get(AVR::BRNEk);
+  case AVRCC::COND_GE:
+    return get(AVR::BRGEk);
+  case AVRCC::COND_LT:
+    return get(AVR::BRLTk);
+  case AVRCC::COND_SH:
+    return get(AVR::BRSHk);
+  case AVRCC::COND_LO:
+    return get(AVR::BRLOk);
+  case AVRCC::COND_MI:
+    return get(AVR::BRMIk);
+  case AVRCC::COND_PL:
+    return get(AVR::BRPLk);
+  }
+}
+
+AVRCC::CondCodes AVRInstrInfo::getCondFromBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:
+    return AVRCC::COND_INVALID;
+  case AVR::BREQk:
+    return AVRCC::COND_EQ;
+  case AVR::BRNEk:
+    return AVRCC::COND_NE;
+  case AVR::BRSHk:
+    return AVRCC::COND_SH;
+  case AVR::BRLOk:
+    return AVRCC::COND_LO;
+  case AVR::BRMIk:
+    return AVRCC::COND_MI;
+  case AVR::BRPLk:
+    return AVRCC::COND_PL;
+  case AVR::BRGEk:
+    return AVRCC::COND_GE;
+  case AVR::BRLTk:
+    return AVRCC::COND_LT;
+  }
+}
+
+AVRCC::CondCodes AVRInstrInfo::getOppositeCondition(AVRCC::CondCodes CC) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Invalid condition!");
+  case AVRCC::COND_EQ:
+    return AVRCC::COND_NE;
+  case AVRCC::COND_NE:
+    return AVRCC::COND_EQ;
+  case AVRCC::COND_SH:
+    return AVRCC::COND_LO;
+  case AVRCC::COND_LO:
+    return AVRCC::COND_SH;
+  case AVRCC::COND_GE:
+    return AVRCC::COND_LT;
+  case AVRCC::COND_LT:
+    return AVRCC::COND_GE;
+  case AVRCC::COND_MI:
+    return AVRCC::COND_PL;
+  case AVRCC::COND_PL:
+    return AVRCC::COND_MI;
+  }
+}
+
+bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue()) {
+      continue;
+    }
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(*I)) {
+      break;
+    }
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->getDesc().isBranch()) {
+      return true;
+    }
+
+    // Handle unconditional branches.
+    //:TODO: add here jmp
+    if (I->getOpcode() == AVR::RJMPk) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end()) {
+        std::next(I)->eraseFromParent();
+      }
+
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    AVRCC::CondCodes BranchCode = getCondFromBranchOpc(I->getOpcode());
+    if (BranchCode == AVRCC::COND_INVALID) {
+      return true; // Can't handle indirect branch.
+    }
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+        // If we can modify the code and it ends in something like:
+        //
+        //     jCC L1
+        //     jmp L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Then we can change this to:
+        //
+        //     jnCC L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Which is a bit more efficient.
+        // We conditionally jump to the fall-through block.
+        BranchCode = getOppositeCondition(BranchCode);
+        unsigned JNCC = getBrCond(BranchCode).getOpcode();
+        MachineBasicBlock::iterator OldInst = I;
+
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+            .addMBB(UnCondBrIter->getOperand(0).getMBB());
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(AVR::RJMPk))
+            .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        // Restart the analysis.
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB()) {
+      return true;
+    }
+
+    AVRCC::CondCodes OldBranchCode = (AVRCC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode) {
+      continue;
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "AVR branch conditions have one component!");
+
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm();
+  BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue()) {
+      continue;
+    }
+    //:TODO: add here the missing jmp instructions once they are implemented
+    // like jmp, {e}ijmp, and other cond branches, ...
+    if (I->getOpcode() != AVR::RJMPk &&
+        getCondFromBranchOpc(I->getOpcode()) == AVRCC::COND_INVALID) {
+      break;
+    }
+
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool AVRInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid AVR branch condition!");
+
+  AVRCC::CondCodes CC = static_cast<AVRCC::CondCodes>(Cond[0].getImm());
+  Cond[0].setImm(getOppositeCondition(CC));
+
+  return false;
+}
+
+unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+
+  switch (Opcode) {
+  // A regular instruction
+  default: {
+    const MCInstrDesc &Desc = get(Opcode);
+    return Desc.getSize();
+  }
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case TargetOpcode::INLINEASM: {
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
+    const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+    const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+    return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+                                  *TM.getMCAsmInfo());
+  }
+  }
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
new file mode 100644
index 000000000000..c5105dafe5eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -0,0 +1,112 @@
+//===-- AVRInstrInfo.h - AVR Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_INSTR_INFO_H
+#define LLVM_AVR_INSTR_INFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include "AVRRegisterInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AVRGenInstrInfo.inc"
+#undef GET_INSTRINFO_HEADER
+
+namespace llvm {
+
+namespace AVRCC {
+
+/// AVR specific condition codes.
+/// These correspond to `AVR_*_COND` in `AVRInstrInfo.td`.
+/// They must be kept in synch.
+enum CondCodes {
+  COND_EQ, //!< Equal
+  COND_NE, //!< Not equal
+  COND_GE, //!< Greater than or equal
+  COND_LT, //!< Less than
+  COND_SH, //!< Unsigned same or higher
+  COND_LO, //!< Unsigned lower
+  COND_MI, //!< Minus
+  COND_PL, //!< Plus
+  COND_INVALID
+};
+
+} // end of namespace AVRCC
+
+namespace AVRII {
+
+/// Specifies a target operand flag.
+enum TOF {
+  MO_NO_FLAG,
+
+  /// On a symbol operand, this represents the lo part.
+  MO_LO = (1 << 1),
+
+  /// On a symbol operand, this represents the hi part.
+  MO_HI = (1 << 2),
+
+  /// On a symbol operand, this represents it has to be negated.
+  MO_NEG = (1 << 3)
+};
+
+} // end of namespace AVRII
+
+/// Utilities related to the AVR instruction set.
+class AVRInstrInfo : public AVRGenInstrInfo {
+public:
+  explicit AVRInstrInfo();
+
+  const AVRRegisterInfo &getRegisterInfo() const { return RI; }
+  const MCInstrDesc &getBrCond(AVRCC::CondCodes CC) const;
+  AVRCC::CondCodes getCondFromBranchOpc(unsigned Opc) const;
+  AVRCC::CondCodes getOppositeCondition(AVRCC::CondCodes CC) const;
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  // Branch analysis.
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+private:
+  const AVRRegisterInfo RI;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_INSTR_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
new file mode 100644
index 000000000000..bc66379ab708
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -0,0 +1,2047 @@
+//===-- AVRInstrInfo.td - AVR Instruction defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AVR instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "AVRInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// AVR Type Profiles
+//===----------------------------------------------------------------------===//
+
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def SDT_AVRBrcond : SDTypeProfile<0, 2,
+                                  [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+def SDT_AVRCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_AVRTst : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_AVRSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                    SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+//===----------------------------------------------------------------------===//
+// AVR Specific Node Definitions
+//===----------------------------------------------------------------------===//
+
+def AVRretflag : SDNode<"AVRISD::RET_FLAG", SDTNone,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AVRretiflag : SDNode<"AVRISD::RETI_FLAG", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AVRCallSeqStart,
+                              [SDNPHasChain, SDNPOutGlue]>;
+def AVRcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AVRCallSeqEnd,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AVRcall : SDNode<"AVRISD::CALL", SDT_AVRCall,
+                     [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def AVRWrapper : SDNode<"AVRISD::WRAPPER", SDT_AVRWrapper>;
+
+def AVRbrcond : SDNode<"AVRISD::BRCOND", SDT_AVRBrcond,
+                       [SDNPHasChain, SDNPInGlue]>;
+def AVRcmp : SDNode<"AVRISD::CMP", SDT_AVRCmp, [SDNPOutGlue]>;
+def AVRcmpc : SDNode<"AVRISD::CMPC", SDT_AVRCmp, [SDNPInGlue, SDNPOutGlue]>;
+def AVRtst : SDNode<"AVRISD::TST", SDT_AVRTst, [SDNPOutGlue]>;
+def AVRselectcc: SDNode<"AVRISD::SELECT_CC", SDT_AVRSelectCC, [SDNPInGlue]>;
+
+// Shift nodes.
+def AVRlsl : SDNode<"AVRISD::LSL", SDTIntUnaryOp>;
+def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>;
+def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
+def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
+def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
+
+// Pseudo shift nodes for non-constant shift amounts.
+def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
+def AVRlsrLoop : SDNode<"AVRISD::LSRLOOP", SDTIntShiftOp>;
+def AVRasrLoop : SDNode<"AVRISD::ASRLOOP", SDTIntShiftOp>;
+
+//===----------------------------------------------------------------------===//
+// AVR Operands, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def imm8_neg_XFORM : SDNodeXForm<imm,
+[{
+  return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i8);
+}]>;
+
+def imm16_neg_XFORM : SDNodeXForm<imm,
+[{
+  return CurDAG->getTargetConstant(-N->getAPIntValue(), SDLoc(N), MVT::i16);
+}]>;
+
+def imm0_63_neg : PatLeaf<(imm),
+[{
+  int64_t val = -N->getSExtValue();
+  return val >= 0 && val < 64;
+}], imm16_neg_XFORM>;
+
+def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
+
+def ioaddr_XFORM : SDNodeXForm<imm,
+[{
+  return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N), MVT::i8);
+}]>;
+
+def iobitpos8_XFORM : SDNodeXForm<imm,
+[{
+  return CurDAG->getTargetConstant(Log2_32(uint8_t(N->getZExtValue())),
+                                   SDLoc(N), MVT::i8);
+}]>;
+
+def iobitposn8_XFORM : SDNodeXForm<imm,
+[{
+  return CurDAG->getTargetConstant(Log2_32(uint8_t(~N->getZExtValue())),
+                                   SDLoc(N), MVT::i8);
+}]>;
+
+def ioaddr8 : PatLeaf<(imm),
+[{
+  uint64_t val = N->getZExtValue();
+  return val >= 0x20 && val < 0x60;
+}], ioaddr_XFORM>;
+
+def lowioaddr8 : PatLeaf<(imm),
+[{
+  uint64_t val = N->getZExtValue();
+  return val >= 0x20 && val < 0x40;
+}], ioaddr_XFORM>;
+
+def ioaddr16 : PatLeaf<(imm),
+[{
+  uint64_t val = N->getZExtValue();
+  return val >= 0x20 && val < 0x5f;
+}], ioaddr_XFORM>;
+
+def iobitpos8 : PatLeaf<(imm),
+[{
+  return isPowerOf2_32(uint8_t(N->getZExtValue()));
+}], iobitpos8_XFORM>;
+
+def iobitposn8 : PatLeaf<(imm),
+[{
+  return isPowerOf2_32(uint8_t(~N->getZExtValue()));
+}], iobitposn8_XFORM>;
+
+def MemriAsmOperand : AsmOperandClass {
+  let Name = "Memri";
+  let ParserMethod = "parseMemriOperand";
+}
+
+/// Address operand for `reg+imm` used by STD and LDD.
+def memri : Operand<iPTR>
+{
+  let MIOperandInfo = (ops PTRDISPREGS, i16imm);
+
+  let PrintMethod = "printMemri";
+  let EncoderMethod = "encodeMemri";
+
+  let ParserMatchClass = MemriAsmOperand;
+}
+
+// Address operand for `SP+imm` used by STD{W}SPQRr
+def memspi : Operand<iPTR>
+{
+  let MIOperandInfo = (ops GPRSP, i16imm);
+}
+
+def imm_com8 : Operand<i8>
+{
+  let EncoderMethod = "encodeComplement";
+
+  let MIOperandInfo = (ops i8imm);
+}
+
+def relbrtarget_7 : Operand<OtherVT>
+{
+    let PrintMethod   = "printPCRelImm";
+    let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_7_pcrel>";
+}
+
+def brtarget_13 : Operand<OtherVT>
+{
+    let PrintMethod   = "printPCRelImm";
+    let EncoderMethod = "encodeRelCondBrTarget<AVR::fixup_13_pcrel>";
+}
+
+// The target of a 22 or 16-bit call/jmp instruction.
+def call_target : Operand<iPTR>
+{
+    let EncoderMethod = "encodeCallTarget";
+}
+
+// A 16-bit address (which can lead to an R_AVR_16 relocation).
+def imm16 : Operand<i16>
+{
+    let EncoderMethod = "encodeImm<AVR::fixup_16>";
+}
+
+/// A 6-bit immediate used in the ADIW/SBIW instructions.
+def imm_arith6 : Operand<i16>
+{
+    let EncoderMethod = "encodeImm<AVR::fixup_6_adiw>";
+}
+
+/// An 8-bit immediate inside an instruction with the same format
+/// as the `LDI` instruction (the `FRdK` format).
+def imm_ldi8 : Operand<i8>
+{
+    let EncoderMethod = "encodeImm<AVR::fixup_ldi>";
+}
+
+/// A 5-bit port number used in SBIC and friends (the `FIOBIT` format).
+def imm_port5 : Operand<i8>
+{
+    let EncoderMethod = "encodeImm<AVR::fixup_port5>";
+}
+
+/// A 6-bit port number used in the `IN` instruction and friends (the
+/// `FIORdA` format.
+def imm_port6 : Operand<i8>
+{
+    let EncoderMethod = "encodeImm<AVR::fixup_port6>";
+}
+
+// Addressing mode pattern reg+imm6
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], [SDNPWantRoot]>;
+
+// AsmOperand class for a pointer register.
+// Used with the LD/ST family of instructions.
+// See FSTLD in AVRInstrFormats.td
+def PtrRegAsmOperand : AsmOperandClass
+{
+   let Name = "Reg";
+}
+
+// A special operand type for the LD/ST instructions.
+// It converts the pointer register number into a two-bit field used in the
+// instruction.
+def LDSTPtrReg : Operand<i16>
+{
+    let MIOperandInfo = (ops PTRREGS);
+    let EncoderMethod = "encodeLDSTPtrReg";
+
+    let ParserMatchClass = PtrRegAsmOperand;
+}
+
+// A special operand type for the LDD/STD instructions.
+// It behaves identically to the LD/ST version, except restricts
+// the pointer registers to Y and Z.
+def LDDSTDPtrReg : Operand<i16>
+{
+    let MIOperandInfo = (ops PTRDISPREGS);
+    let EncoderMethod = "encodeLDSTPtrReg";
+
+    let ParserMatchClass = PtrRegAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// AVR predicates for subtarget features
+//===----------------------------------------------------------------------===//
+
+def HasSRAM       :    Predicate<"Subtarget->hasSRAM()">,
+                         AssemblerPredicate<"FeatureSRAM">;
+
+def HasJMPCALL    :    Predicate<"Subtarget->hasJMPCALL()">,
+                         AssemblerPredicate<"FeatureJMPCALL">;
+
+def HasIJMPCALL   :    Predicate<"Subtarget->hasIJMPCALL()">,
+                         AssemblerPredicate<"FeatureIJMPCALL">;
+
+def HasEIJMPCALL  :    Predicate<"Subtarget->hasEIJMPCALL()">,
+                         AssemblerPredicate<"FeatureEIJMPCALL">;
+
+def HasADDSUBIW   :    Predicate<"Subtarget->hasADDSUBIW()">,
+                         AssemblerPredicate<"FeatureADDSUBIW">;
+
+def HasSmallStack :    Predicate<"Subtarget->HasSmallStack()">,
+                         AssemblerPredicate<"FeatureSmallStack">;
+
+def HasMOVW       :    Predicate<"Subtarget->hasMOVW()">,
+                         AssemblerPredicate<"FeatureMOVW">;
+
+def HasLPM        :    Predicate<"Subtarget->hasLPM()">,
+                         AssemblerPredicate<"FeatureLPM">;
+
+def HasLPMX       :    Predicate<"Subtarget->hasLPMX()">,
+                         AssemblerPredicate<"FeatureLPMX">;
+
+def HasELPM       :    Predicate<"Subtarget->hasELPM()">,
+                         AssemblerPredicate<"FeatureELPM">;
+
+def HasELPMX      :    Predicate<"Subtarget->hasELPMX()">,
+                         AssemblerPredicate<"FeatureELPMX">;
+
+def HasSPM        :    Predicate<"Subtarget->hasSPM()">,
+                         AssemblerPredicate<"FeatureSPM">;
+
+def HasSPMX       :    Predicate<"Subtarget->hasSPMX()">,
+                         AssemblerPredicate<"FeatureSPMX">;
+
+def HasDES        :    Predicate<"Subtarget->hasDES()">,
+                         AssemblerPredicate<"FeatureDES">;
+
+def SupportsRMW   :    Predicate<"Subtarget->supportsRMW()">,
+                         AssemblerPredicate<"FeatureRMW">;
+
+def SupportsMultiplication : Predicate<"Subtarget->supportsMultiplication()">,
+                               AssemblerPredicate<"FeatureMultiplication">;
+
+def HasBREAK      :    Predicate<"Subtarget->hasBREAK()">,
+                         AssemblerPredicate<"FeatureBREAK">;
+
+def HasTinyEncoding : Predicate<"Subtarget->hasTinyEncoding()">,
+                        AssemblerPredicate<"FeatureTinyEncoding">;
+
+
+// AVR specific condition code. These correspond to AVR_*_COND in
+// AVRInstrInfo.td. They must be kept in synch.
+def AVR_COND_EQ : PatLeaf<(i8 0)>;
+def AVR_COND_NE : PatLeaf<(i8 1)>;
+def AVR_COND_GE : PatLeaf<(i8 2)>;
+def AVR_COND_LT : PatLeaf<(i8 3)>;
+def AVR_COND_SH : PatLeaf<(i8 4)>;
+def AVR_COND_LO : PatLeaf<(i8 5)>;
+def AVR_COND_MI : PatLeaf<(i8 6)>;
+def AVR_COND_PL : PatLeaf<(i8 7)>;
+
+
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// AVR Instruction list
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SREG.
+let Defs = [SP, SREG],
+Uses = [SP] in
+{
+  def ADJCALLSTACKDOWN : Pseudo<(outs),
+                                (ins i16imm:$amt),
+                                "#ADJCALLSTACKDOWN",
+                                [(AVRcallseq_start timm:$amt)]>;
+
+  // R31R30 is used to update SP, since it is a scratch reg and this instruction
+  // is placed after the function call then R31R30 should be always free.
+  //let Defs = [R31R30],
+  //Uses = [R31R30] in
+  //:TODO: if we enable this, the pseudo is killed because it looks dead
+  def ADJCALLSTACKUP : Pseudo<(outs),
+                              (ins i16imm:$amt1, i16imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(AVRcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+let isCommutable = 1,
+Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  // ADD Rd, Rr
+  // Adds two 8-bit registers.
+  def ADDRdRr : FRdRr<0b0000,
+                      0b11,
+                      (outs GPR8:$rd),
+                      (ins GPR8:$src, GPR8:$rr),
+                      "add\t$rd, $rr",
+                      [(set i8:$rd, (add i8:$src, i8:$rr)),
+                       (implicit SREG)]>;
+
+  // ADDW Rd+1:Rd, Rr+1:Rr
+  // Pseudo instruction to add four 8-bit registers as two 16-bit values.
+  //
+  // Expands to:
+  // add Rd,    Rr
+  // adc Rd+1, Rr+1
+  def ADDWRdRr : Pseudo<(outs DREGS:$rd),
+                        (ins DREGS:$src, DREGS:$rr),
+                        "addw\t$rd, $rr",
+                        [(set i16:$rd, (add i16:$src, i16:$rr)),
+                         (implicit SREG)]>;
+
+  // ADC Rd, Rr
+  // Adds two 8-bit registers with carry.
+  let Uses = [SREG] in
+  def ADCRdRr : FRdRr<0b0001,
+                      0b11,
+                      (outs GPR8:$rd),
+                      (ins GPR8:$src, GPR8:$rr),
+                      "adc\t$rd, $rr",
+                      [(set i8:$rd, (adde i8:$src, i8:$rr)),
+                       (implicit SREG)]>;
+
+  // ADCW Rd+1:Rd, Rr+1:Rr
+  // Pseudo instruction to add four 8-bit registers as two 16-bit values with
+  // carry.
+  //
+  // Expands to:
+  // adc Rd,   Rr
+  // adc Rd+1, Rr+1
+  let Uses = [SREG] in
+  def ADCWRdRr : Pseudo<(outs DREGS:$rd),
+                        (ins DREGS:$src, DREGS:$rr),
+                        "adcw\t$rd, $rr",
+                        [(set i16:$rd, (adde i16:$src, i16:$rr)),
+                         (implicit SREG)]>;
+
+  // AIDW Rd, k
+  // Adds an immediate 6-bit value K to Rd, placing the result in Rd.
+  def ADIWRdK : FWRdK<0b0,
+                      (outs IWREGS:$rd),
+                      (ins IWREGS:$src, imm_arith6:$k),
+                      "adiw\t$rd, $k",
+                      [(set i16:$rd, (add i16:$src, uimm6:$k)),
+                       (implicit SREG)]>,
+                Requires<[HasADDSUBIW]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  // SUB Rd, Rr
+  // Subtracts the 8-bit value of Rr from Rd and places the value in Rd.
+  def SUBRdRr : FRdRr<0b0001,
+                      0b10,
+                      (outs GPR8:$rd),
+                      (ins GPR8:$src, GPR8:$rr),
+                      "sub\t$rd, $rr",
+                      [(set i8:$rd, (sub i8:$src, i8:$rr)),
+                       (implicit SREG)]>;
+
+  // SUBW Rd+1:Rd, Rr+1:Rr
+  // Subtracts two 16-bit values and places the result into Rd.
+  //
+  // Expands to:
+  // sub Rd,   Rr
+  // sbc Rd+1, Rr+1
+  def SUBWRdRr : Pseudo<(outs DREGS:$rd),
+                        (ins DREGS:$src, DREGS:$rr),
+                        "subw\t$rd, $rr",
+                        [(set i16:$rd, (sub i16:$src, i16:$rr)),
+                         (implicit SREG)]>;
+
+  def SUBIRdK : FRdK<0b0101,
+                     (outs LD8:$rd),
+                     (ins LD8:$src, imm_ldi8:$k),
+                     "subi\t$rd, $k",
+                     [(set i8:$rd, (sub i8:$src, imm:$k)),
+                      (implicit SREG)]>;
+
+  // SUBIW Rd+1:Rd, K+1:K
+  //
+  // Expands to:
+  // subi Rd,   K
+  // sbci Rd+1, K+1
+  def SUBIWRdK : Pseudo<(outs DLDREGS:$rd),
+                        (ins DLDREGS:$src, i16imm:$rr),
+                        "subiw\t$rd, $rr",
+                        [(set i16:$rd, (sub i16:$src, imm:$rr)),
+                         (implicit SREG)]>;
+
+  def SBIWRdK : FWRdK<0b1,
+                      (outs IWREGS:$rd),
+                      (ins IWREGS:$src, imm_arith6:$k),
+                      "sbiw\t$rd, $k",
+                      [(set i16:$rd, (sub i16:$src, uimm6:$k)),
+                       (implicit SREG)]>,
+                Requires<[HasADDSUBIW]>;
+
+  // Subtract with carry operations which must read the carry flag in SREG.
+  let Uses = [SREG] in
+  {
+    def SBCRdRr : FRdRr<0b0000,
+                        0b10,
+                        (outs GPR8:$rd),
+                        (ins GPR8:$src, GPR8:$rr),
+                        "sbc\t$rd, $rr",
+                        [(set i8:$rd, (sube i8:$src, i8:$rr)),
+                         (implicit SREG)]>;
+
+    // SBCW Rd+1:Rd, Rr+1:Rr
+    //
+    // Expands to:
+    // sbc Rd,   Rr
+    // sbc Rd+1, Rr+1
+    def SBCWRdRr : Pseudo<(outs DREGS:$rd),
+                          (ins DREGS:$src, DREGS:$rr),
+                          "sbcw\t$rd, $rr",
+                          [(set i16:$rd, (sube i16:$src, i16:$rr)),
+                           (implicit SREG)]>;
+
+    def SBCIRdK : FRdK<0b0100,
+                       (outs LD8:$rd),
+                       (ins LD8:$src, imm_ldi8:$k),
+                       "sbci\t$rd, $k",
+                       [(set i8:$rd, (sube i8:$src, imm:$k)),
+                        (implicit SREG)]>;
+
+    // SBCIW Rd+1:Rd, K+1:K
+    // sbci Rd,   K
+    // sbci Rd+1, K+1
+    def SBCIWRdK : Pseudo<(outs DLDREGS:$rd),
+                          (ins DLDREGS:$src, i16imm:$rr),
+                          "sbciw\t$rd, $rr",
+                          [(set i16:$rd, (sube i16:$src, imm:$rr)),
+                           (implicit SREG)]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Increment and Decrement
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  def INCRd : FRd<0b1001,
+                  0b0100011,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "inc\t$rd",
+                  [(set i8:$rd, (add i8:$src, 1)), (implicit SREG)]>;
+
+  def DECRd : FRd<0b1001,
+                  0b0101010,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "dec\t$rd",
+                  [(set i8:$rd, (add i8:$src, -1)), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1,
+Defs = [R1, R0, SREG] in
+{
+  // MUL Rd, Rr
+  // Multiplies Rd by Rr and places the result into R1:R0.
+  let usesCustomInserter = 1 in {
+    def MULRdRr : FRdRr<0b1001, 0b11,
+                        (outs),
+                        (ins GPR8:$lhs, GPR8:$rhs),
+                        "mul\t$lhs, $rhs",
+                        [/*(set R1, R0, (smullohi i8:$lhs, i8:$rhs))*/]>,
+                    Requires<[SupportsMultiplication]>;
+
+    def MULSRdRr : FMUL2RdRr<0,
+                             (outs),
+                             (ins GPR8:$lhs, GPR8:$rhs),
+                             "muls\t$lhs, $rhs",
+                             []>,
+                   Requires<[SupportsMultiplication]>;
+  }
+
+  def MULSURdRr : FMUL2RdRr<1,
+                            (outs),
+                            (ins GPR8:$lhs, GPR8:$rhs),
+                            "mulsu\t$lhs, $rhs",
+                            []>,
+                  Requires<[SupportsMultiplication]>;
+
+  def FMUL : FFMULRdRr<0b01,
+                       (outs),
+                       (ins GPR8:$lhs, GPR8:$rhs),
+                       "fmul\t$lhs, $rhs",
+                       []>,
+             Requires<[SupportsMultiplication]>;
+
+  def FMULS : FFMULRdRr<0b10,
+                        (outs),
+                        (ins GPR8:$lhs, GPR8:$rhs),
+                        "fmuls\t$lhs, $rhs",
+                        []>,
+              Requires<[SupportsMultiplication]>;
+
+  def FMULSU : FFMULRdRr<0b11,
+                         (outs),
+                         (ins GPR8:$lhs, GPR8:$rhs),
+                         "fmulsu\t$lhs, $rhs",
+                         []>,
+               Requires<[SupportsMultiplication]>;
+}
+
+let Defs = [R15, R14, R13, R12, R11, R10, R9,
+            R8, R7, R6, R5, R4, R3, R2, R1, R0] in
+def DESK : FDES<(outs),
+                (ins i8imm:$k),
+                "des\t$k",
+                []>,
+           Requires<[HasDES]>;
+
+//===----------------------------------------------------------------------===//
+// Logic
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  // Register-Register logic instructions (which have the
+  // property of commutativity).
+  let isCommutable = 1 in
+  {
+    def ANDRdRr : FRdRr<0b0010,
+                        0b00,
+                        (outs GPR8:$rd),
+                        (ins GPR8:$src, GPR8:$rr),
+                        "and\t$rd, $rr",
+                        [(set i8:$rd, (and i8:$src, i8:$rr)),
+                         (implicit SREG)]>;
+
+    // ANDW Rd+1:Rd, Rr+1:Rr
+    //
+    // Expands to:
+    // and Rd,   Rr
+    // and Rd+1, Rr+1
+    def ANDWRdRr : Pseudo<(outs DREGS:$rd),
+                          (ins DREGS:$src, DREGS:$rr),
+                          "andw\t$rd, $rr",
+                          [(set i16:$rd, (and i16:$src, i16:$rr)),
+                           (implicit SREG)]>;
+
+    def ORRdRr : FRdRr<0b0010,
+                       0b10,
+                       (outs GPR8:$rd),
+                       (ins GPR8:$src, GPR8:$rr),
+                       "or\t$rd, $rr",
+                       [(set i8:$rd, (or i8:$src, i8:$rr)),
+                        (implicit SREG)]>;
+
+    // ORW Rd+1:Rd, Rr+1:Rr
+    //
+    // Expands to:
+    // or Rd,   Rr
+    // or Rd+1, Rr+1
+    def ORWRdRr : Pseudo<(outs DREGS:$rd),
+                         (ins DREGS:$src, DREGS:$rr),
+                         "orw\t$rd, $rr",
+                         [(set i16:$rd, (or i16:$src, i16:$rr)),
+                          (implicit SREG)]>;
+
+    def EORRdRr : FRdRr<0b0010,
+                        0b01,
+                        (outs GPR8:$rd),
+                        (ins GPR8:$src, GPR8:$rr),
+                        "eor\t$rd, $rr",
+                        [(set i8:$rd, (xor i8:$src, i8:$rr)),
+                         (implicit SREG)]>;
+
+    // EORW Rd+1:Rd, Rr+1:Rr
+    //
+    // Expands to:
+    // eor Rd,   Rr
+    // eor Rd+1, Rr+1
+    def EORWRdRr : Pseudo<(outs DREGS:$rd),
+                          (ins DREGS:$src, DREGS:$rr),
+                          "eorw\t$rd, $rr",
+                          [(set i16:$rd, (xor i16:$src, i16:$rr)),
+                           (implicit SREG)]>;
+  }
+
+  def ANDIRdK : FRdK<0b0111,
+                     (outs LD8:$rd),
+                     (ins LD8:$src, imm_ldi8:$k),
+                     "andi\t$rd, $k",
+                     [(set i8:$rd, (and i8:$src, imm:$k)),
+                      (implicit SREG)]>;
+
+  // ANDI Rd+1:Rd, K+1:K
+  //
+  // Expands to:
+  // andi Rd,   K
+  // andi Rd+1, K+1
+  def ANDIWRdK : Pseudo<(outs DLDREGS:$rd),
+                        (ins DLDREGS:$src, i16imm:$k),
+                        "andiw\t$rd, $k",
+                        [(set i16:$rd, (and i16:$src, imm:$k)),
+                         (implicit SREG)]>;
+
+  def ORIRdK : FRdK<0b0110,
+                    (outs LD8:$rd),
+                    (ins LD8:$src, imm_ldi8:$k),
+                    "ori\t$rd, $k",
+                    [(set i8:$rd, (or i8:$src, imm:$k)),
+                     (implicit SREG)]>;
+
+  // ORIW Rd+1:Rd, K+1,K
+  //
+  // Expands to:
+  // ori Rd,   K
+  // ori Rd+1, K+1
+  def ORIWRdK : Pseudo<(outs DLDREGS:$rd),
+                       (ins DLDREGS:$src, i16imm:$rr),
+                       "oriw\t$rd, $rr",
+                       [(set i16:$rd, (or i16:$src, imm:$rr)),
+                        (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// One's/Two's Compliment
+//===----------------------------------------------------------------------===//
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  def COMRd : FRd<0b1001,
+                  0b0100000,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "com\t$rd",
+                  [(set i8:$rd, (not i8:$src)), (implicit SREG)]>;
+
+  // COMW Rd+1:Rd
+  //
+  // Expands to:
+  // com Rd
+  // com Rd+1
+  def COMWRd : Pseudo<(outs DREGS:$rd),
+                      (ins DREGS:$src),
+                      "comw\t$rd",
+                      [(set i16:$rd, (not i16:$src)), (implicit SREG)]>;
+
+  //:TODO: optimize NEG for wider types
+  def NEGRd : FRd<0b1001,
+                  0b0100001,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "neg\t$rd",
+                  [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>;
+}
+
+// TST Rd
+// Test for zero of minus.
+// This operation is identical to a `Rd AND Rd`.
+//def : InstAlias<"tst\t$rd", (ANDRdRr GPR8:$rd, GPR8:$rd), 1>;
+
+let Defs = [SREG] in
+def TSTRd : FTST<0b0010,
+                  0b00,
+                  (outs),
+                  (ins GPR8:$rd),
+                  "tst\t$rd",
+                  [(AVRtst i8:$rd)]>;
+
+//===----------------------------------------------------------------------===//
+// Jump instructions
+//===----------------------------------------------------------------------===//
+let isBarrier = 1,
+isBranch = 1,
+isTerminator = 1 in
+{
+  def RJMPk : FBRk<0,
+                   (outs),
+                   (ins brtarget_13:$target),
+                   "rjmp\t$target",
+                   [(br bb:$target)]>;
+
+  let isIndirectBranch = 1,
+  Uses = [R31R30] in
+  def IJMP : F16<0b1001010000001001,
+                 (outs),
+                 (ins),
+                 "ijmp",
+                 []>,
+             Requires<[HasIJMPCALL]>;
+
+  let isIndirectBranch = 1,
+  Uses = [R31R30] in
+  def EIJMP : F16<0b1001010000011001,
+                  (outs),
+                  (ins),
+                  "eijmp",
+                  []>,
+              Requires<[HasEIJMPCALL]>;
+
+  def JMPk : F32BRk<0b110,
+                    (outs),
+                    (ins call_target:$k),
+                    "jmp\t$k",
+                    []>,
+             Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Call instructions
+//===----------------------------------------------------------------------===//
+let isCall = 1 in
+{
+  // SP is marked as a use to prevent stack-pointer assignments that appear
+  // immediately before calls from potentially appearing dead.
+  let Uses = [SP] in
+  def RCALLk : FBRk<1,
+                    (outs),
+                    (ins brtarget_13:$target),
+                    "rcall\t$target",
+                    []>;
+
+  // SP is marked as a use to prevent stack-pointer assignments that appear
+  // immediately before calls from potentially appearing dead.
+  let Uses = [SP, R31R30] in
+  def ICALL : F16<0b1001010100001001,
+                  (outs),
+                  (ins variable_ops),
+                  "icall",
+                  []>,
+              Requires<[HasIJMPCALL]>;
+
+  // SP is marked as a use to prevent stack-pointer assignments that appear
+  // immediately before calls from potentially appearing dead.
+  let Uses = [SP, R31R30] in
+  def EICALL : F16<0b1001010100011001,
+                   (outs),
+                   (ins variable_ops),
+                   "eicall",
+                   []>,
+               Requires<[HasEIJMPCALL]>;
+
+  // SP is marked as a use to prevent stack-pointer assignments that appear
+  // immediately before calls from potentially appearing dead.
+  //
+  //:TODO: the imm field can be either 16 or 22 bits in devices with more
+  // than 64k of ROM, fix it once we support the largest devices.
+  let Uses = [SP] in
+  def CALLk : F32BRk<0b111,
+                     (outs),
+                     (ins call_target:$k),
+                     "call\t$k",
+                     [(AVRcall imm:$k)]>,
+              Requires<[HasJMPCALL]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return instructions.
+//===----------------------------------------------------------------------===//
+let isTerminator = 1,
+isReturn = 1,
+isBarrier = 1 in 
+{
+  def RET : F16<0b1001010100001000,
+                (outs),
+                (ins),
+                "ret",
+                [(AVRretflag)]>;
+
+  def RETI : F16<0b1001010100011000,
+                 (outs),
+                 (ins),
+                 "reti",
+                 [(AVRretiflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Compare operations.
+//===----------------------------------------------------------------------===//
+let Defs = [SREG] in
+{
+  // CPSE Rd, Rr
+  // Compare Rd and Rr, skipping the next instruction if they are equal.
+  let isBarrier = 1,
+  isBranch = 1,
+  isTerminator = 1 in
+  def CPSE : FRdRr<0b0001,
+                   0b00,
+                   (outs),
+                   (ins GPR8:$rd, GPR8:$rr),
+                   "cpse\t$rd, $rr",
+                   []>;
+
+  def CPRdRr : FRdRr<0b0001,
+                     0b01,
+                     (outs),
+                     (ins GPR8:$rd, GPR8:$rr),
+                     "cp\t$rd, $rr",
+                     [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>;
+
+  // CPW Rd+1:Rd, Rr+1:Rr
+  //
+  // Expands to:
+  // cp  Rd,   Rr
+  // cpc Rd+1, Rr+1
+  def CPWRdRr : Pseudo<(outs),
+                       (ins DREGS:$src, DREGS:$src2),
+                       "cpw\t$src, $src2",
+                       [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>;
+
+  let Uses = [SREG] in
+  def CPCRdRr : FRdRr<0b0000,
+                      0b01,
+                      (outs),
+                      (ins GPR8:$rd, GPR8:$rr),
+                      "cpc\t$rd, $rr",
+                      [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>;
+
+  // CPCW Rd+1:Rd. Rr+1:Rr
+  //
+  // Expands to:
+  // cpc Rd,   Rr
+  // cpc Rd+1, Rr+1
+  let Uses = [SREG] in
+  def CPCWRdRr : Pseudo<(outs),
+                        (ins DREGS:$src, DREGS:$src2),
+                        "cpcw\t$src, $src2",
+                        [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>;
+
+  // CPI Rd, K
+  // Compares a register with an 8 bit immediate.
+  let Uses = [SREG] in
+  def CPIRdK : FRdK<0b0011,
+                    (outs),
+                    (ins GPR8:$rd, imm_ldi8:$k),
+                    "cpi\t$rd, $k",
+                    [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register conditional skipping/branching operations.
+//===----------------------------------------------------------------------===//
+let isBranch = 1,
+isTerminator = 1 in
+{
+  // Conditional skipping on GPR register bits, and
+  // conditional skipping on IO register bits.
+  let isBarrier = 1 in
+  {
+    def SBRCRrB : FRdB<0b10,
+                       (outs),
+                       (ins GPR8:$rr, i8imm:$b),
+                       "sbrc\t$rr, $b",
+                       []>;
+
+    def SBRSRrB : FRdB<0b11,
+                       (outs),
+                       (ins GPR8:$rr, i8imm:$b),
+                       "sbrs\t$rr, $b",
+                       []>;
+
+    def SBICAb : FIOBIT<0b01,
+                        (outs),
+                        (ins imm_port5:$a, i8imm:$b),
+                        "sbic\t$a, $b",
+                        []>;
+
+    def SBISAb : FIOBIT<0b11,
+                        (outs),
+                        (ins imm_port5:$a, i8imm:$b),
+                        "sbis\t$a, $b",
+                        []>;
+  }
+
+  // Relative branches on status flag bits.
+  let Uses = [SREG] in
+  {
+    // BRBS s, k
+    // Branch if `s` flag in status register is set.
+    def BRBSsk : FSK<0,
+                     (outs),
+                     (ins i8imm:$s, relbrtarget_7:$k),
+                     "brbs\t$s, $k",
+                     []>;
+
+    // BRBC s, k
+    // Branch if `s` flag in status register is clear.
+    def BRBCsk : FSK<1,
+                     (outs),
+                     (ins i8imm:$s, relbrtarget_7:$k),
+                     "brbc\t$s, $k",
+                     []>;
+  }
+}
+
+
+// BRCS k
+// Branch if carry flag is set
+def : InstAlias<"brcs\t$k", (BRBSsk 0, relbrtarget_7:$k)>;
+
+// BRCC k
+// Branch if carry flag is clear
+def : InstAlias<"brcc\t$k", (BRBCsk 0, relbrtarget_7:$k)>;
+
+// BRHS k
+// Branch if half carry flag is set
+def : InstAlias<"brhs\t$k", (BRBSsk 5, relbrtarget_7:$k)>;
+
+// BRHC k
+// Branch if half carry flag is clear
+def : InstAlias<"brhc\t$k", (BRBCsk 5, relbrtarget_7:$k)>;
+
+// BRTS k
+// Branch if the T flag is set
+def : InstAlias<"brts\t$k", (BRBSsk 6, relbrtarget_7:$k)>;
+
+// BRTC k
+// Branch if the T flag is clear
+def : InstAlias<"brtc\t$k", (BRBCsk 6, relbrtarget_7:$k)>;
+
+// BRVS k
+// Branch if the overflow flag is set
+def : InstAlias<"brvs\t$k", (BRBSsk 3, relbrtarget_7:$k)>;
+
+// BRVC k
+// Branch if the overflow flag is clear
+def : InstAlias<"brvc\t$k", (BRBCsk 3, relbrtarget_7:$k)>;
+
+// BRIE k
+// Branch if the global interrupt flag is enabled
+def : InstAlias<"brie\t$k", (BRBSsk 7, relbrtarget_7:$k)>;
+
+// BRID k
+// Branch if the global interrupt flag is disabled
+def : InstAlias<"brid\t$k", (BRBCsk 7, relbrtarget_7:$k)>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative conditional branches
+//===----------------------------------------------------------------------===//
+// Based on status register. We cannot simplify these into instruction aliases
+// because we also need to be able to specify a pattern to match for ISel.
+let isBranch = 1,
+isTerminator = 1,
+Uses = [SREG] in
+{
+  def BREQk : FBRsk<0,
+                    0b001,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "breq\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_EQ)]>;
+
+  def BRNEk : FBRsk<1,
+                    0b001,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brne\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_NE)]>;
+
+
+  def BRSHk : FBRsk<1,
+                    0b000,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brsh\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_SH)]>;
+
+  def BRLOk : FBRsk<0,
+                    0b000,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brlo\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_LO)]>;
+
+  def BRMIk : FBRsk<0,
+                    0b010,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brmi\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_MI)]>;
+
+  def BRPLk : FBRsk<1,
+                    0b010,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brpl\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_PL)]>;
+
+  def BRGEk : FBRsk<1,
+                    0b100,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brge\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_GE)]>;
+
+  def BRLTk : FBRsk<0,
+                    0b100,
+                    (outs),
+                    (ins relbrtarget_7:$target),
+                    "brlt\t$target",
+                    [(AVRbrcond bb:$target, AVR_COND_LT)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Data transfer instructions
+//===----------------------------------------------------------------------===//
+// 8 and 16-bit register move instructions.
+let hasSideEffects = 0 in
+{
+  def MOVRdRr : FRdRr<0b0010,
+                      0b11,
+                      (outs GPR8:$rd),
+                      (ins GPR8:$rr),
+                      "mov\t$rd, $rr",
+                      []>;
+
+  def MOVWRdRr : FMOVWRdRr<(outs DREGS:$dst),
+                           (ins DREGS:$src),
+                           "movw\t$dst, $src",
+                           []>,
+                 Requires<[HasMOVW]>;
+}
+
+// Load immediate values into registers.
+let isReMaterializable = 1 in
+{
+  def LDIRdK : FRdK<0b1110,
+                    (outs LD8:$rd),
+                    (ins imm_ldi8:$k),
+                    "ldi\t$rd, $k",
+                    [(set i8:$rd, imm:$k)]>;
+
+  // LDIW Rd+1:Rd, K+1:K
+  //
+  // Expands to:
+  // ldi Rd,   K
+  // ldi Rd+1, K+1
+  def LDIWRdK : Pseudo<(outs DLDREGS:$dst),
+                       (ins i16imm:$src),
+                       "ldiw\t$dst, $src",
+                       [(set i16:$dst, imm:$src)]>;
+}
+
+// Load from data space into register.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+  def LDSRdK : F32DM<0b0,
+                     (outs GPR8:$rd),
+                     (ins imm16:$k),
+                     "lds\t$rd, $k",
+                     [(set i8:$rd, (load imm:$k))]>,
+               Requires<[HasSRAM]>;
+
+  // LDSW Rd+1:Rd, K+1:K
+  //
+  // Expands to:
+  // lds Rd,  (K+1:K)
+  // lds Rd+1 (K+1:K) + 1
+  def LDSWRdK : Pseudo<(outs DREGS:$dst),
+                       (ins i16imm:$src),
+                       "ldsw\t$dst, $src",
+                       [(set i16:$dst, (load imm:$src))]>,
+                Requires<[HasSRAM]>;
+}
+
+// Indirect loads.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+  def LDRdPtr : FSTLD<0,
+                      0b00,
+                      (outs GPR8:$reg),
+                      (ins LDSTPtrReg:$ptrreg),
+                      "ld\t$reg, $ptrreg",
+                      [(set GPR8:$reg, (load i16:$ptrreg))]>,
+                Requires<[HasSRAM]>;
+
+  // LDW Rd+1:Rd, P
+  //
+  // Expands to:
+  // ld Rd,   P+
+  // ld Rd+1, P+
+  let Constraints = "@earlyclobber $reg" in
+  def LDWRdPtr : Pseudo<(outs DREGS:$reg),
+                        (ins PTRDISPREGS:$ptrreg),
+                        "ldw\t$reg, $ptrreg",
+                        [(set i16:$reg, (load i16:$ptrreg))]>,
+                 Requires<[HasSRAM]>;
+}
+
+// Indirect loads (with postincrement or predecrement).
+let mayLoad = 1,
+hasSideEffects = 0,
+Constraints = "$ptrreg = $base_wb,@earlyclobber $reg,@earlyclobber $base_wb" in
+{
+  def LDRdPtrPi : FSTLD<0,
+                        0b01,
+                        (outs GPR8:$reg, PTRREGS:$base_wb),
+                        (ins LDSTPtrReg:$ptrreg),
+                        "ld\t$reg, $ptrreg+",
+                        []>,
+                  Requires<[HasSRAM]>;
+
+  // LDW Rd+1:Rd, P+
+  // Expands to:
+  // ld Rd,   P+
+  // ld Rd+1, P+
+  def LDWRdPtrPi : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg),
+                          "ldw\t$reg, $ptrreg+",
+                          []>,
+                   Requires<[HasSRAM]>;
+
+  def LDRdPtrPd : FSTLD<0,
+                        0b10,
+                        (outs GPR8:$reg, PTRREGS:$base_wb),
+                        (ins LDSTPtrReg:$ptrreg),
+                        "ld\t$reg, -$ptrreg",
+                        []>,
+                  Requires<[HasSRAM]>;
+
+  // LDW Rd+1:Rd, -P
+  //
+  // Expands to:
+  // ld Rd+1, -P
+  // ld Rd,   -P
+  def LDWRdPtrPd : Pseudo<(outs DREGS:$reg, PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg),
+                          "ldw\t$reg, -$ptrreg",
+                          []>,
+                   Requires<[HasSRAM]>;
+}
+
+// Load indirect with displacement operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+  let Constraints = "@earlyclobber $reg" in
+  def LDDRdPtrQ : FSTDLDD<0,
+                          (outs GPR8:$reg),
+                          (ins memri:$memri),
+                          "ldd\t$reg, $memri",
+                          [(set i8:$reg, (load addr:$memri))]>,
+                  Requires<[HasSRAM]>;
+
+  // LDDW Rd+1:Rd, P+q
+  //
+  // Expands to:
+  // ldd Rd,   P+q
+  // ldd Rd+1, P+q+1
+  let Constraints = "@earlyclobber $dst" in
+  def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst),
+                          (ins memri:$memri),
+                          "lddw\t$dst, $memri",
+                          [(set i16:$dst, (load addr:$memri))]>,
+                   Requires<[HasSRAM]>;
+
+  let mayLoad = 1,
+  hasSideEffects = 0,
+  Constraints = "@earlyclobber $dst" in
+  def LDDWRdYQ : Pseudo<(outs DREGS:$dst),
+                        (ins memri:$memri),
+                        "lddw\t$dst, $memri",
+                        []>,
+                 Requires<[HasSRAM]>;
+}
+
+class AtomicLoad<PatFrag Op, RegisterClass DRC> :
+  Pseudo<(outs DRC:$rd), (ins PTRREGS:$rr), "atomic_op",
+         [(set DRC:$rd, (Op i16:$rr))]>;
+
+class AtomicStore<PatFrag Op, RegisterClass DRC> :
+  Pseudo<(outs), (ins PTRDISPREGS:$rd, DRC:$rr), "atomic_op",
+         [(Op i16:$rd, DRC:$rr)]>;
+
+class AtomicLoadOp<PatFrag Op, RegisterClass DRC> :
+  Pseudo<(outs DRC:$rd), (ins PTRREGS:$rr, DRC:$operand),
+         "atomic_op",
+         [(set DRC:$rd, (Op i16:$rr, DRC:$operand))]>;
+
+def AtomicLoad8   : AtomicLoad<atomic_load_8, GPR8>;
+def AtomicLoad16  : AtomicLoad<atomic_load_16, DREGS>;
+
+def AtomicStore8  : AtomicStore<atomic_store_8, GPR8>;
+def AtomicStore16 : AtomicStore<atomic_store_16, DREGS>;
+
+def AtomicLoadAdd8  : AtomicLoadOp<atomic_load_add_8, GPR8>;
+def AtomicLoadAdd16 : AtomicLoadOp<atomic_load_add_16, DREGS>;
+def AtomicLoadSub8  : AtomicLoadOp<atomic_load_sub_8, GPR8>;
+def AtomicLoadSub16 : AtomicLoadOp<atomic_load_sub_16, DREGS>;
+def AtomicLoadAnd8  : AtomicLoadOp<atomic_load_and_8, GPR8>;
+def AtomicLoadAnd16 : AtomicLoadOp<atomic_load_and_16, DREGS>;
+def AtomicLoadOr8   : AtomicLoadOp<atomic_load_or_8, GPR8>;
+def AtomicLoadOr16  : AtomicLoadOp<atomic_load_or_16, DREGS>;
+def AtomicLoadXor8  : AtomicLoadOp<atomic_load_xor_8, GPR8>;
+def AtomicLoadXor16 : AtomicLoadOp<atomic_load_xor_16, DREGS>;
+def AtomicFence     : Pseudo<(outs), (ins), "atomic_fence",
+                             [(atomic_fence imm, imm)]>;
+
+// Indirect store from register to data space.
+def STSKRr : F32DM<0b1,
+                   (outs),
+                   (ins imm16:$k, GPR8:$rd),
+                   "sts\t$k, $rd",
+                   [(store i8:$rd, imm:$k)]>,
+             Requires<[HasSRAM]>;
+
+// STSW K+1:K, Rr+1:Rr
+//
+// Expands to:
+// sts Rr+1, (K+1:K) + 1
+// sts Rr,   (K+1:K)
+def STSWKRr : Pseudo<(outs),
+                     (ins i16imm:$dst, DREGS:$src),
+                     "stsw\t$dst, $src",
+                     [(store i16:$src, imm:$dst)]>,
+              Requires<[HasSRAM]>;
+
+// Indirect stores.
+// ST P, Rr
+// Stores the value of Rr into the location addressed by pointer P.
+def STPtrRr : FSTLD<1,
+                    0b00,
+                    (outs),
+                    (ins LDSTPtrReg:$ptrreg, GPR8:$reg),
+                    "st\t$ptrreg, $reg",
+                    [(store GPR8:$reg, i16:$ptrreg)]>,
+              Requires<[HasSRAM]>;
+
+// STW P, Rr+1:Rr
+// Stores the value of Rr into the location addressed by pointer P.
+//
+// Expands to:
+// st P, Rr
+// std P+1, Rr+1
+def STWPtrRr : Pseudo<(outs),
+                      (ins PTRDISPREGS:$ptrreg, DREGS:$reg),
+                      "stw\t$ptrreg, $reg",
+                      [(store i16:$reg, i16:$ptrreg)]>,
+               Requires<[HasSRAM]>;
+
+// Indirect stores (with postincrement or predecrement).
+let Constraints = "$ptrreg = $base_wb,@earlyclobber $base_wb" in
+{
+
+  // ST P+, Rr
+  // Stores the value of Rr into the location addressed by pointer P.
+  // Post increments P.
+  def STPtrPiRr : FSTLD<1,
+                        0b01,
+                        (outs LDSTPtrReg:$base_wb),
+                        (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+                        "st\t$ptrreg+, $reg",
+                        [(set i16:$base_wb,
+                         (post_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+                  Requires<[HasSRAM]>;
+
+  // STW P+, Rr+1:Rr
+  // Stores the value of Rr into the location addressed by pointer P.
+  // Post increments P.
+  //
+  // Expands to:
+  // st P+, Rr
+  // st P+, Rr+1
+  def STWPtrPiRr : Pseudo<(outs PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg, DREGS:$trh, i8imm:$offs),
+                          "stw\t$ptrreg+, $trh",
+                          [(set PTRREGS:$base_wb,
+                           (post_store DREGS:$trh, PTRREGS:$ptrreg, imm:$offs))]>,
+                   Requires<[HasSRAM]>;
+
+  // ST -P, Rr
+  // Stores the value of Rr into the location addressed by pointer P.
+  // Pre decrements P.
+  def STPtrPdRr : FSTLD<1,
+                        0b10,
+                        (outs LDSTPtrReg:$base_wb),
+                        (ins LDSTPtrReg:$ptrreg, GPR8:$reg, i8imm:$offs),
+                        "st\t-$ptrreg, $reg",
+                        [(set i16:$base_wb,
+                         (pre_store GPR8:$reg, i16:$ptrreg, imm:$offs))]>,
+                  Requires<[HasSRAM]>;
+
+  // STW -P, Rr+1:Rr
+  // Stores the value of Rr into the location addressed by pointer P.
+  // Pre decrements P.
+  //
+  // Expands to:
+  // st -P, Rr+1
+  // st -P, Rr
+  def STWPtrPdRr : Pseudo<(outs PTRREGS:$base_wb),
+                          (ins PTRREGS:$ptrreg, DREGS:$reg, i8imm:$offs),
+                          "stw\t-$ptrreg, $reg",
+                          [(set PTRREGS:$base_wb,
+                           (pre_store i16:$reg, i16:$ptrreg, imm:$offs))]>,
+                   Requires<[HasSRAM]>;
+}
+
+// Store indirect with displacement operations.
+// STD P+q, Rr
+// Stores the value of Rr into the location addressed by pointer P with a
+// displacement of q. Does not modify P.
+def STDPtrQRr : FSTDLDD<1,
+                        (outs),
+                        (ins memri:$memri, GPR8:$reg),
+                        "std\t$memri, $reg",
+                        [(store i8:$reg, addr:$memri)]>,
+                Requires<[HasSRAM]>;
+
+// STDW P+q, Rr+1:Rr
+// Stores the value of Rr into the location addressed by pointer P with a
+// displacement of q. Does not modify P.
+//
+// Expands to:
+// std P+q,   Rr
+// std P+q+1, Rr+1
+def STDWPtrQRr : Pseudo<(outs),
+                        (ins memri:$memri, DREGS:$src),
+                        "stdw\t$memri, $src",
+                        [(store i16:$src, addr:$memri)]>,
+                 Requires<[HasSRAM]>;
+
+
+// Load program memory operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1,
+hasSideEffects = 0 in
+{
+  let Defs = [R0],
+      Uses = [R31R30] in
+  def LPM : F16<0b1001010111001000,
+                (outs),
+                (ins),
+                "lpm",
+                []>,
+            Requires<[HasLPM]>;
+
+  def LPMRdZ : FLPMX<0,
+                     0,
+                     (outs GPR8:$dst),
+                     (ins ZREGS:$z),
+                     "lpm\t$dst, $z",
+                     []>,
+               Requires<[HasLPMX]>;
+
+  def LPMWRdZ : Pseudo<(outs DREGS:$dst),
+                       (ins ZREGS:$z),
+                       "lpmw\t$dst, $z",
+                       []>,
+                Requires<[HasLPMX]>;
+
+  // Load program memory, while postincrementing the Z register.
+  let mayLoad = 1,
+  Defs = [R31R30] in
+  {
+    def LPMRdZPi : FLPMX<0,
+                         1,
+                         (outs GPR8:$dst),
+                         (ins ZREGS:$z),
+                         "lpm\t$dst, $z+",
+                         []>,
+                   Requires<[HasLPMX]>;
+
+    def LPMWRdZPi : Pseudo<(outs DREGS:$dst),
+                           (ins ZREGS:$z),
+                           "lpmw\t$dst, $z+",
+                           []>,
+                    Requires<[HasLPMX]>;
+  }
+}
+
+// Extended load program memory operations.
+let mayLoad = 1,
+hasSideEffects = 0 in
+{
+  let Defs = [R0],
+      Uses = [R31R30] in
+  def ELPM : F16<0b1001010111011000,
+                 (outs),
+                 (ins),
+                 "elpm",
+                 []>,
+             Requires<[HasELPM]>;
+
+  def ELPMRdZ : FLPMX<1,
+                      0,
+                      (outs GPR8:$dst),
+                      (ins ZREGS:$z),
+                      "elpm\t$dst, $z",
+                      []>,
+                Requires<[HasELPMX]>;
+
+  let Defs = [R31R30] in
+  def ELPMRdZPi : FLPMX<1,
+                        1,
+                        (outs GPR8:$dst),
+                        (ins ZREGS: $z),
+                        "elpm\t$dst, $z+",
+                        []>,
+                  Requires<[HasELPMX]>;
+}
+
+// Store program memory operations.
+let Uses = [R1, R0] in
+{
+  let Uses = [R31R30, R1, R0] in
+  def SPM : F16<0b1001010111101000,
+                (outs),
+                (ins),
+                "spm",
+                []>,
+            Requires<[HasSPM]>;
+
+  let Defs = [R31R30] in
+  def SPMZPi : F16<0b1001010111111000,
+                   (outs),
+                   (ins ZREGS:$z),
+                   "spm $z+",
+                   []>,
+               Requires<[HasSPMX]>;
+}
+
+// Read data from IO location operations.
+let canFoldAsLoad = 1,
+isReMaterializable = 1 in
+{
+  def INRdA : FIORdA<(outs GPR8:$dst),
+                     (ins imm_port6:$src),
+                     "in\t$dst, $src",
+                     [(set i8:$dst, (load ioaddr8:$src))]>;
+
+  def INWRdA : Pseudo<(outs DREGS:$dst),
+                      (ins imm_port6:$src),
+                      "inw\t$dst, $src",
+                      [(set i16:$dst, (load ioaddr16:$src))]>;
+}
+
+// Write data to IO location operations.
+def OUTARr : FIOARr<(outs),
+                    (ins imm_port6:$dst, GPR8:$src),
+                    "out\t$dst, $src",
+                    [(store i8:$src, ioaddr8:$dst)]>;
+
+def OUTWARr : Pseudo<(outs),
+                     (ins imm_port6:$dst, DREGS:$src),
+                     "outw\t$dst, $src",
+                     [(store i16:$src, ioaddr16:$dst)]>;
+
+// Stack push/pop operations.
+let Defs = [SP],
+Uses = [SP],
+hasSideEffects = 0 in
+{
+  // Stack push operations.
+  let mayStore = 1 in
+  {
+    def PUSHRr : FRd<0b1001,
+                     0b0011111,
+                     (outs),
+                     (ins GPR8:$reg),
+                     "push\t$reg",
+                     []>,
+                 Requires<[HasSRAM]>;
+
+    def PUSHWRr : Pseudo<(outs),
+                         (ins DREGS:$reg),
+                         "pushw\t$reg",
+                         []>,
+                  Requires<[HasSRAM]>;
+  }
+
+  // Stack pop operations.
+  let mayLoad = 1 in
+  {
+    def POPRd : FRd<0b1001,
+                    0b0001111,
+                    (outs GPR8:$reg),
+                    (ins),
+                    "pop\t$reg",
+                    []>,
+                Requires<[HasSRAM]>;
+
+    def POPWRd : Pseudo<(outs DREGS:$reg),
+                        (ins),
+                        "popw\t$reg",
+                        []>,
+                 Requires<[HasSRAM]>;
+  }
+}
+
+// Read-Write-Modify (RMW) instructions.
+def XCHZRd : FZRd<0b100,
+                  (outs GPR8:$rd),
+                  (ins ZREGS:$z),
+                  "xch\t$z, $rd",
+                  []>,
+             Requires<[SupportsRMW]>;
+
+def LASZRd : FZRd<0b101,
+                  (outs GPR8:$rd),
+                  (ins ZREGS:$z),
+                  "las\t$z, $rd",
+                  []>,
+             Requires<[SupportsRMW]>;
+
+def LACZRd : FZRd<0b110,
+                  (outs GPR8:$rd),
+                  (ins ZREGS:$z),
+                  "lac\t$z, $rd",
+                  []>,
+             Requires<[SupportsRMW]>;
+
+def LATZRd : FZRd<0b111,
+                  (outs GPR8:$rd),
+                  (ins ZREGS:$z),
+                  "lat\t$z, $rd",
+                  []>,
+             Requires<[SupportsRMW]>;
+
+//===----------------------------------------------------------------------===//
+// Bit and bit-test instructions
+//===----------------------------------------------------------------------===//
+
+// Bit shift/rotate operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  def LSLRd : FRdRr<0b0000,
+                    0b11,
+                    (outs GPR8:$rd),
+                    (ins GPR8:$src),
+                    "lsl\t$rd",
+                    [(set i8:$rd, (AVRlsl i8:$src)), (implicit SREG)]>;
+
+  def LSLWRd : Pseudo<(outs DREGS:$rd),
+                      (ins DREGS:$src),
+                      "lslw\t$rd",
+                      [(set i16:$rd, (AVRlsl i16:$src)), (implicit SREG)]>;
+
+  def LSRRd : FRd<0b1001,
+                  0b0100110,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "lsr\t$rd",
+                  [(set i8:$rd, (AVRlsr i8:$src)), (implicit SREG)]>;
+
+  def LSRWRd : Pseudo<(outs DREGS:$rd),
+                      (ins DREGS:$src),
+                      "lsrw\t$rd",
+                      [(set i16:$rd, (AVRlsr i16:$src)), (implicit SREG)]>;
+
+  def ASRRd : FRd<0b1001,
+                  0b0100101,
+                  (outs GPR8:$rd),
+                  (ins GPR8:$src),
+                  "asr\t$rd",
+                  [(set i8:$rd, (AVRasr i8:$src)), (implicit SREG)]>;
+
+  def ASRWRd : Pseudo<(outs DREGS:$rd),
+                      (ins DREGS:$src),
+                      "asrw\t$rd",
+                      [(set i16:$rd, (AVRasr i16:$src)), (implicit SREG)]>;
+
+  // Bit rotate operations.
+  let Uses = [SREG] in
+  {
+    def ROLRd : FRdRr<0b0001,
+                      0b11,
+                      (outs GPR8:$rd),
+                      (ins GPR8:$src),
+                      "rol\t$rd",
+                      [(set i8:$rd, (AVRrol i8:$src)), (implicit SREG)]>;
+
+    def ROLWRd : Pseudo<(outs DREGS:$rd),
+                        (ins DREGS:$src),
+                        "rolw\t$rd",
+                        [(set i16:$rd, (AVRrol i16:$src)), (implicit SREG)]>;
+
+    def RORRd : FRd<0b1001,
+                    0b0100111,
+                    (outs GPR8:$rd),
+                    (ins GPR8:$src),
+                    "ror\t$rd",
+                    [(set i8:$rd, (AVRror i8:$src)), (implicit SREG)]>;
+
+    def RORWRd : Pseudo<(outs DREGS:$rd),
+                        (ins DREGS:$src),
+                        "rorw\t$rd",
+                        [(set i16:$rd, (AVRror i16:$src)), (implicit SREG)]>;
+  }
+}
+
+// SWAP Rd
+// Swaps the high and low nibbles in a register.
+let Constraints = "$src = $rd" in
+def SWAPRd : FRd<0b1001,
+                 0b0100010,
+                 (outs GPR8:$rd),
+                 (ins GPR8:$src),
+                 "swap\t$rd",
+                 [(set i8:$rd, (bswap i8:$src))]>;
+
+// IO register bit set/clear operations.
+//:TODO: add patterns when popcount(imm)==2 to be expanded with 2 sbi/cbi
+// instead of in+ori+out which requires one more instr.
+def SBIAb : FIOBIT<0b10,
+                   (outs),
+                   (ins imm_port5:$addr, i8imm:$bit),
+                   "sbi\t$addr, $bit",
+                   [(store (or (i8 (load lowioaddr8:$addr)), iobitpos8:$bit),
+                     lowioaddr8:$addr)]>;
+
+def CBIAb : FIOBIT<0b00,
+                   (outs),
+                   (ins imm_port5:$addr, i8imm:$bit),
+                   "cbi\t$addr, $bit",
+                   [(store (and (i8 (load lowioaddr8:$addr)), iobitposn8:$bit),
+                     lowioaddr8:$addr)]>;
+
+// Status register bit load/store operations.
+let Defs = [SREG] in
+def BST : FRdB<0b01,
+               (outs),
+               (ins GPR8:$rd, i8imm:$b),
+               "bst\t$rd, $b",
+               []>;
+
+let Uses = [SREG] in
+def BLD : FRdB<0b00,
+               (outs),
+               (ins GPR8:$rd, i8imm:$b),
+               "bld\t$rd, $b",
+               []>;
+
+// Set/clear bit in register operations.
+let Constraints = "$src = $rd",
+Defs = [SREG] in
+{
+  // SBR Rd, K
+  // Alias for ORI Rd, K
+  def SBRRdK : FRdK<0b0110,
+                    (outs LD8:$rd),
+                    (ins LD8:$src, imm_ldi8:$k),
+                    "sbr\t$rd, $k",
+                    [(set i8:$rd, (or i8:$src, imm:$k)),
+                     (implicit SREG)]>;
+
+  // CBR Rd, K
+  // Alias for `ANDI Rd, COM(K)` where COM(K) is the compliment of K.
+  // FIXME: This uses the 'complement' encoder. We need it to also use the
+  // imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
+  def CBRRdK : FRdK<0b0111,
+                    (outs LD8:$rd),
+                    (ins LD8:$src, imm_com8:$k),
+                    "cbr\t$rd, $k",
+                    []>;
+}
+
+// CLR Rd
+// Alias for EOR Rd, Rd
+// -------------
+// Clears all bits in a register.
+def CLR : InstAlias<"clr\t$rd", (EORRdRr GPR8:$rd, GPR8:$rd)>;
+
+// SER Rd
+// Alias for LDI Rd, 0xff
+// ---------
+// Sets all bits in a register.
+def : InstAlias<"ser\t$rd", (LDIRdK LD8:$rd, 0xff), 0>;
+
+let Defs = [SREG] in
+def BSETs : FS<0,
+               (outs),
+               (ins i8imm:$s),
+               "bset\t$s",
+               []>;
+
+let Defs = [SREG] in
+def BCLRs : FS<1,
+               (outs),
+               (ins i8imm:$s),
+               "bclr\t$s",
+               []>;
+
+// Set/clear aliases for the carry (C) status flag (bit 0).
+def : InstAlias<"sec", (BSETs 0)>;
+def : InstAlias<"clc", (BCLRs 0)>;
+
+// Set/clear aliases for the zero (Z) status flag (bit 1).
+def : InstAlias<"sez", (BSETs 1)>;
+def : InstAlias<"clz", (BCLRs 1)>;
+
+// Set/clear aliases for the negative (N) status flag (bit 2).
+def : InstAlias<"sen", (BSETs 2)>;
+def : InstAlias<"cln", (BCLRs 2)>;
+
+// Set/clear aliases for the overflow (V) status flag (bit 3).
+def : InstAlias<"sev", (BSETs 3)>;
+def : InstAlias<"clv", (BCLRs 3)>;
+
+// Set/clear aliases for the signed (S) status flag (bit 4).
+def : InstAlias<"ses", (BSETs 4)>;
+def : InstAlias<"cls", (BCLRs 4)>;
+
+// Set/clear aliases for the half-carry (H) status flag (bit 5).
+def : InstAlias<"seh", (BSETs 5)>;
+def : InstAlias<"clh", (BCLRs 5)>;
+
+// Set/clear aliases for the T status flag (bit 6).
+def : InstAlias<"set", (BSETs 6)>;
+def : InstAlias<"clt", (BCLRs 6)>;
+
+// Set/clear aliases for the interrupt (I) status flag (bit 7).
+def : InstAlias<"sei", (BSETs 7)>;
+def : InstAlias<"cli", (BCLRs 7)>;
+
+//===----------------------------------------------------------------------===//
+// Special/Control instructions
+//===----------------------------------------------------------------------===//
+
+// BREAK
+// Breakpoint instruction
+// ---------
+// <|1001|0101|1001|1000>
+def BREAK : F16<0b1001010110011000,
+                (outs),
+                (ins),
+                "break",
+                []>,
+            Requires<[HasBREAK]>;
+
+// NOP
+// No-operation instruction
+// ---------
+// <|0000|0000|0000|0000>
+def NOP : F16<0b0000000000000000,
+              (outs),
+              (ins),
+              "nop",
+              []>;
+
+// SLEEP
+// Sleep instruction
+// ---------
+// <|1001|0101|1000|1000>
+def SLEEP : F16<0b1001010110001000,
+                (outs),
+                (ins),
+                "sleep",
+                []>;
+
+// WDR
+// Watchdog reset
+// ---------
+// <|1001|0101|1010|1000>
+def WDR : F16<0b1001010110101000,
+              (outs),
+              (ins),
+              "wdr",
+              []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions for later expansion
+//===----------------------------------------------------------------------===//
+
+//:TODO: Optimize this for wider types AND optimize the following code
+//       compile int foo(char a, char b, char c, char d) {return d+b;}
+//       looks like a missed sext_inreg opportunity.
+def SEXT : ExtensionPseudo<
+  (outs DREGS:$dst),
+  (ins GPR8:$src),
+  "sext\t$dst, $src",
+  [(set i16:$dst, (sext i8:$src)), (implicit SREG)]
+>;
+
+def ZEXT : ExtensionPseudo<
+  (outs DREGS:$dst),
+  (ins GPR8:$src),
+  "zext\t$dst, $src",
+  [(set i16:$dst, (zext i8:$src)), (implicit SREG)]
+>;
+
+// This pseudo gets expanded into a movw+adiw thus it clobbers SREG.
+let Defs = [SREG],
+    hasSideEffects = 0 in
+def FRMIDX : Pseudo<(outs DLDREGS:$dst),
+                    (ins DLDREGS:$src, i16imm:$src2),
+                    "frmidx\t$dst, $src, $src2",
+                    []>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDSPQRr : StorePseudo<
+  (outs),
+  (ins memspi:$dst, GPR8:$src),
+  "stdstk\t$dst, $src",
+  [(store i8:$src, addr:$dst)]
+>;
+
+// This pseudo is either converted to a regular store or a push which clobbers
+// SP.
+def STDWSPQRr : StorePseudo<
+  (outs),
+  (ins memspi:$dst, DREGS:$src),
+  "stdwstk\t$dst, $src",
+  [(store i16:$src, addr:$dst)]
+>;
+
+// SP read/write pseudos.
+let hasSideEffects = 0 in
+{
+  let Uses = [SP] in
+  def SPREAD : Pseudo<
+    (outs DREGS:$dst),
+    (ins GPRSP:$src),
+    "spread\t$dst, $src",
+    []
+  >;
+
+  let Defs = [SP] in
+  def SPWRITE : Pseudo<
+    (outs GPRSP:$dst),
+    (ins DREGS:$src),
+    "spwrite\t$dst, $src",
+    []>;
+}
+
+def Select8 : SelectPseudo<
+  (outs GPR8:$dst),
+  (ins GPR8:$src, GPR8:$src2, i8imm:$cc),
+  "# Select8 PSEUDO",
+  [(set i8:$dst, (AVRselectcc i8:$src, i8:$src2, imm:$cc))]
+>;
+
+def Select16 : SelectPseudo<
+  (outs DREGS:$dst),
+  (ins DREGS:$src, DREGS:$src2, i8imm:$cc),
+  "# Select16 PSEUDO",
+  [(set i16:$dst, (AVRselectcc i16:$src, i16:$src2, imm:$cc))]
+>;
+
+def Lsl8 : ShiftPseudo<
+  (outs GPR8:$dst),
+  (ins GPR8:$src, GPR8:$cnt),
+  "# Lsl8 PSEUDO",
+  [(set i8:$dst, (AVRlslLoop i8:$src, i8:$cnt))]
+>;
+
+def Lsl16 : ShiftPseudo<
+  (outs DREGS:$dst),
+  (ins DREGS:$src, GPR8:$cnt),
+  "# Lsl16 PSEUDO",
+  [(set i16:$dst, (AVRlslLoop i16:$src, i8:$cnt))]
+>;
+
+def Lsr8 : ShiftPseudo<
+  (outs GPR8:$dst),
+  (ins GPR8:$src, GPR8:$cnt),
+  "# Lsr8 PSEUDO",
+  [(set i8:$dst, (AVRlsrLoop i8:$src, i8:$cnt))]
+>;
+
+
+def Lsr16 : ShiftPseudo<
+  (outs DREGS:$dst),
+   (ins DREGS:$src, GPR8:$cnt),
+   "# Lsr16 PSEUDO",
+   [(set i16:$dst, (AVRlsrLoop i16:$src, i8:$cnt))]
+>;
+
+def Asr8 : ShiftPseudo<
+  (outs GPR8:$dst),
+  (ins GPR8:$src, GPR8:$cnt),
+  "# Asr8 PSEUDO",
+  [(set i8:$dst, (AVRasrLoop i8:$src, i8:$cnt))]
+>;
+
+def Asr16 : ShiftPseudo<
+  (outs DREGS:$dst),
+   (ins DREGS:$src, GPR8:$cnt),
+   "# Asr16 PSEUDO",
+   [(set i16:$dst, (AVRasrLoop i16:$src, i8:$cnt))]
+>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+//:TODO: look in x86InstrCompiler.td for odd encoding trick related to
+// add x, 128 -> sub x, -128. Clang is emitting an eor for this (ldi+eor)
+
+// the add instruction always writes the carry flag
+def : Pat<(addc i8:$src, i8:$src2),
+          (ADDRdRr i8:$src, i8:$src2)>;
+def : Pat<(addc DREGS:$src, DREGS:$src2),
+          (ADDWRdRr DREGS:$src, DREGS:$src2)>;
+
+// all sub instruction variants always writes the carry flag
+def : Pat<(subc i8:$src, i8:$src2),
+          (SUBRdRr i8:$src, i8:$src2)>;
+def : Pat<(subc i16:$src, i16:$src2),
+          (SUBWRdRr i16:$src, i16:$src2)>;
+def : Pat<(subc i8:$src, imm:$src2),
+          (SUBIRdK i8:$src, imm:$src2)>;
+def : Pat<(subc i16:$src, imm:$src2),
+          (SUBIWRdK i16:$src, imm:$src2)>;
+
+// These patterns convert add (x, -imm) to sub (x, imm) since we dont have
+// any add with imm instructions. Also take care of the adiw/sbiw instructions.
+def : Pat<(add i16:$src1, imm0_63_neg:$src2),
+          (SBIWRdK i16:$src1, (imm0_63_neg:$src2))>;
+def : Pat<(add i16:$src1, imm:$src2),
+          (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(addc i16:$src1, imm:$src2),
+          (SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+def : Pat<(adde i16:$src1, imm:$src2),
+          (SBCIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
+
+def : Pat<(add i8:$src1, imm:$src2),
+          (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(addc i8:$src1, imm:$src2),
+          (SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+def : Pat<(adde i8:$src1, imm:$src2),
+          (SBCIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
+
+// Calls.
+def : Pat<(AVRcall (i16 tglobaladdr:$dst)),
+          (CALLk tglobaladdr:$dst)>;
+def : Pat<(AVRcall (i16 texternalsym:$dst)),
+          (CALLk texternalsym:$dst)>;
+
+// `anyext`
+def : Pat<(i16 (anyext i8:$src)),
+          (INSERT_SUBREG (i16 (IMPLICIT_DEF)), i8:$src, sub_lo)>;
+
+// `trunc`
+def : Pat<(i8 (trunc i16:$src)),
+          (EXTRACT_SUBREG i16:$src, sub_lo)>;
+
+// sext_inreg
+def : Pat<(sext_inreg i16:$src, i8),
+          (SEXT (i8 (EXTRACT_SUBREG i16:$src, sub_lo)))>;
+
+// GlobalAddress
+def : Pat<(i16 (AVRWrapper tglobaladdr:$dst)),
+          (LDIWRdK tglobaladdr:$dst)>;
+def : Pat<(add i16:$src, (AVRWrapper tglobaladdr:$src2)),
+          (SUBIWRdK i16:$src, tglobaladdr:$src2)>;
+def : Pat<(i8 (load (AVRWrapper tglobaladdr:$dst))),
+          (LDSRdK tglobaladdr:$dst)>;
+def : Pat<(i16 (load (AVRWrapper tglobaladdr:$dst))),
+          (LDSWRdK tglobaladdr:$dst)>;
+def : Pat<(store i8:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+          (STSKRr tglobaladdr:$dst, i8:$src)>;
+def : Pat<(store i16:$src, (i16 (AVRWrapper tglobaladdr:$dst))),
+          (STSWKRr tglobaladdr:$dst, i16:$src)>;
+
+// BlockAddress
+def : Pat<(i16 (AVRWrapper tblockaddress:$dst)),
+          (LDIWRdK tblockaddress:$dst)>;
+
+// hi-reg truncation : trunc(int16 >> 8)
+//:FIXME: i think it's better to emit an extract subreg node in the DAG than
+// all this mess once we get optimal shift code
+// lol... I think so, too. [@agnat]
+def : Pat<(i8 (trunc (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr (AVRlsr
+                     (AVRlsr DREGS:$src)))))))))),
+          (EXTRACT_SUBREG DREGS:$src, sub_hi)>;
+
+// :FIXME: DAGCombiner produces an shl node after legalization from these seq:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+def : Pat<(shl i16:$src1, (i8 1)),
+          (LSLWRd i16:$src1)>;
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
new file mode 100644
index 000000000000..5553dc2da31b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
@@ -0,0 +1,222 @@
+//===-- AVRInstrumentFunctions.cpp - Insert instrumentation for testing ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass takes a function and inserts calls to hook functions which are
+// told the name, arguments, and results of function calls.
+//
+// The hooks can do anything with the information given. It is possible to
+// send the data through a serial connection in order to runs tests on
+// bare metal.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+
+using namespace llvm;
+
+#define AVR_INSTRUMENT_FUNCTIONS_NAME "AVR function instrumentation pass"
+
+namespace {
+
+// External symbols that we emit calls to.
+namespace symbols {
+
+#define SYMBOL_PREFIX "avr_instrumentation"
+
+  const StringRef PREFIX = SYMBOL_PREFIX;
+
+  // void (i16 argCount);
+  const StringRef BEGIN_FUNCTION_SIGNATURE = SYMBOL_PREFIX "_begin_signature";
+  // void(i16 argCount);
+  const StringRef END_FUNCTION_SIGNATURE = SYMBOL_PREFIX "_end_signature";
+
+#undef SYMBOL_PREFIX
+}
+
+class AVRInstrumentFunctions : public FunctionPass {
+public:
+  static char ID;
+
+  AVRInstrumentFunctions() : FunctionPass(ID) {
+    initializeAVRInstrumentFunctionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return AVR_INSTRUMENT_FUNCTIONS_NAME; }
+};
+
+char AVRInstrumentFunctions::ID = 0;
+
+/// Creates a pointer to a string.
+static Value *CreateStringPtr(BasicBlock &BB, StringRef Str) {
+  LLVMContext &Ctx = BB.getContext();
+  IntegerType *I8 = Type::getInt8Ty(Ctx);
+
+  Constant *ConstantStr = ConstantDataArray::getString(Ctx, Str);
+  GlobalVariable *GlobalStr = new GlobalVariable(*BB.getParent()->getParent(),
+                                                 ConstantStr->getType(),
+                                                 true, /* is a constant */
+                                                 GlobalValue::PrivateLinkage,
+                                                 ConstantStr);
+  return GetElementPtrInst::CreateInBounds(GlobalStr,
+    {ConstantInt::get(I8, 0), ConstantInt::get(I8, 0)}, "", &BB);
+}
+
+static std::string GetTypeName(Type &Ty) {
+  if (auto *IntTy = dyn_cast<IntegerType>(&Ty)) {
+    return std::string("i") + std::to_string(IntTy->getBitWidth());
+  }
+
+  if (Ty.isFloatingPointTy()) {
+    return std::string("f") + std::to_string(Ty.getPrimitiveSizeInBits());
+  }
+
+  llvm_unreachable("unknown return type");
+}
+
+/// Builds a call to one of the signature begin/end hooks.
+static void BuildSignatureCall(StringRef SymName, BasicBlock &BB, Function &F) {
+  LLVMContext &Ctx = F.getContext();
+  IntegerType *I16 = Type::getInt16Ty(Ctx);
+
+  FunctionType *FnType = FunctionType::get(Type::getVoidTy(Ctx),
+    {Type::getInt8PtrTy(Ctx), I16}, false);
+
+  Constant *Fn = F.getParent()->getOrInsertFunction(SymName, FnType);
+  Value *FunctionName = CreateStringPtr(BB, F.getName());
+
+  Value *Args[] = {FunctionName,
+                   ConstantInt::get(I16, F.getArgumentList().size())};
+  CallInst::Create(Fn, Args, "", &BB);
+}
+
+/// Builds instructions to call into an external function to
+/// notify about a function signature beginning.
+static void BuildBeginSignature(BasicBlock &BB, Function &F) {
+  return BuildSignatureCall(symbols::BEGIN_FUNCTION_SIGNATURE, BB, F);
+}
+
+/// Builds instructions to call into an external function to
+/// notify about a function signature ending.
+static void BuildEndSignature(BasicBlock &BB, Function &F) {
+  return BuildSignatureCall(symbols::END_FUNCTION_SIGNATURE, BB, F);
+}
+
+/// Get the name of the external symbol that we need to call
+/// to notify about this argument.
+static std::string GetArgumentSymbolName(Argument &Arg) {
+  return (symbols::PREFIX + "_argument_" + GetTypeName(*Arg.getType())).str();
+}
+
+/// Builds a call to one of the argument hooks.
+static void BuildArgument(BasicBlock &BB, Argument &Arg) {
+  Function &F = *Arg.getParent();
+  LLVMContext &Ctx = F.getContext();
+
+  Type *I8 = Type::getInt8Ty(Ctx);
+
+  FunctionType *FnType = FunctionType::get(Type::getVoidTy(Ctx),
+    {Type::getInt8PtrTy(Ctx), I8, Arg.getType()}, false);
+
+  Constant *Fn = F.getParent()->getOrInsertFunction(
+    GetArgumentSymbolName(Arg), FnType);
+  Value *ArgName = CreateStringPtr(BB, Arg.getName());
+
+  Value *Args[] = {ArgName, ConstantInt::get(I8, Arg.getArgNo()), &Arg};
+  CallInst::Create(Fn, Args, "", &BB);
+}
+
+/// Builds a call to all of the function signature hooks.
+static void BuildSignature(BasicBlock &BB, Function &F) {
+  BuildBeginSignature(BB, F);
+  for (Argument &Arg : F.args()) { BuildArgument(BB, Arg); }
+  BuildEndSignature(BB, F);
+}
+
+/// Builds the instrumentation entry block.
+static void BuildEntryBlock(Function &F) {
+  BasicBlock &EntryBlock = F.getEntryBlock();
+
+  // Create a new basic block at the start of the existing entry block.
+  BasicBlock *BB = BasicBlock::Create(F.getContext(),
+                                      "instrumentation_entry",
+                                      &F, &EntryBlock);
+
+  BuildSignature(*BB, F);
+
+  // Jump to the actual entry block.
+  BranchInst::Create(&EntryBlock, BB);
+}
+
+static std::string GetReturnSymbolName(Value &Val) {
+  return (symbols::PREFIX + "_result_" + GetTypeName(*Val.getType())).str();
+}
+
+static void BuildExitHook(Instruction &I) {
+  Function &F = *I.getParent()->getParent();
+  LLVMContext &Ctx = F.getContext();
+
+  if (auto *Ret = dyn_cast<ReturnInst>(&I)) {
+    Value *RetVal = Ret->getReturnValue();
+    assert(RetVal && "should only be instrumenting functions with return values");
+
+    FunctionType *FnType = FunctionType::get(Type::getVoidTy(Ctx),
+      {RetVal->getType()}, false);
+
+    Constant *Fn = F.getParent()->getOrInsertFunction(
+      GetReturnSymbolName(*RetVal), FnType);
+
+    // Call the result hook just before the return.
+    CallInst::Create(Fn, {RetVal}, "", &I);
+  }
+}
+
+/// Runs return hooks before all returns in a function.
+static void BuildExitHooks(Function &F) {
+  for (BasicBlock &BB : F) {
+    auto BBI = BB.begin(), E = BB.end();
+    while (BBI != E) {
+      auto NBBI = std::next(BBI);
+
+      BuildExitHook(*BBI);
+
+      // Modified |= expandMI(BB, MBBI);
+      BBI = NBBI;
+    }
+  }
+}
+
+static bool ShouldInstrument(Function &F) {
+  // No point reporting results if there are none.
+  return !F.getReturnType()->isVoidTy();
+}
+
+bool AVRInstrumentFunctions::runOnFunction(Function &F) {
+  if (ShouldInstrument(F)) {
+    BuildEntryBlock(F);
+    BuildExitHooks(F);
+  }
+
+  return true;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(AVRInstrumentFunctions, "avr-instrument-functions",
+                AVR_INSTRUMENT_FUNCTIONS_NAME, false, false)
+
+namespace llvm {
+
+FunctionPass *createAVRInstrumentFunctionsPass() { return new AVRInstrumentFunctions(); }
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
new file mode 100644
index 000000000000..342fe558813a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -0,0 +1,100 @@
+//===-- AVRMCInstLower.cpp - Convert AVR MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AVR MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCInstLower.h"
+
+#include "AVRInstrInfo.h"
+#include "MCTargetDesc/AVRMCExpr.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  unsigned char TF = MO.getTargetFlags();
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+
+  bool IsNegated = false;
+  if (TF & AVRII::MO_NEG) { IsNegated = true; }
+
+  if (!MO.isJTI() && MO.getOffset()) {
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+  }
+
+  if (TF & AVRII::MO_LO) {
+    Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx);
+  } else if (TF & AVRII::MO_HI) {
+    Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx);
+  } else if (TF != 0) {
+    llvm_unreachable("Unknown target flag on symbol operand");
+  }
+
+  return MCOperand::createExpr(Expr);
+}
+
+void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI.getOpcode());
+
+  for (MachineOperand const &MO : MI.operands()) {
+    MCOperand MCOp;
+
+    switch (MO.getType()) {
+    default:
+      MI.dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = lowerSymbolOperand(MO, Printer.getSymbol(MO.getGlobal()));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = lowerSymbolOperand(
+          MO, Printer.GetExternalSymbolSymbol(MO.getSymbolName()));
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = lowerSymbolOperand(
+          MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = lowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = lowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h
new file mode 100644
index 000000000000..2e2d1014485e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.h
@@ -0,0 +1,43 @@
+//===-- AVRMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCINST_LOWER_H
+#define LLVM_AVR_MCINST_LOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineInstr;
+class MachineOperand;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+
+/// Lowers `MachineInstr` objects into `MCInst` objects.
+class AVRMCInstLower {
+public:
+  AVRMCInstLower(MCContext &Ctx, AsmPrinter &Printer)
+      : Ctx(Ctx), Printer(Printer) {}
+
+  /// Lowers a `MachineInstr` into a `MCInst`.
+  void lowerInstruction(const MachineInstr &MI, MCInst &OutMI) const;
+  MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_MCINST_LOWER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
new file mode 100644
index 000000000000..cf0c73576301
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -0,0 +1,69 @@
+//===-- AVRMachineFuctionInfo.h - AVR machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AVR-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H
+#define LLVM_AVR_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// Contains AVR-specific information for each MachineFunction.
+class AVRMachineFunctionInfo : public MachineFunctionInfo {
+  /// Indicates if a register has been spilled by the register
+  /// allocator.
+  bool HasSpills;
+
+  /// Indicates if there are any fixed size allocas present.
+  /// Note that if there are only variable sized allocas this is set to false.
+  bool HasAllocas;
+
+  /// Indicates if arguments passed using the stack are being
+  /// used inside the function.
+  bool HasStackArgs;
+
+  /// Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  AVRMachineFunctionInfo()
+      : HasSpills(false), HasAllocas(false), HasStackArgs(false),
+        CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
+
+  explicit AVRMachineFunctionInfo(MachineFunction &MF)
+      : HasSpills(false), HasAllocas(false), HasStackArgs(false),
+        CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
+
+  bool getHasSpills() const { return HasSpills; }
+  void setHasSpills(bool B) { HasSpills = B; }
+
+  bool getHasAllocas() const { return HasAllocas; }
+  void setHasAllocas(bool B) { HasAllocas = B; }
+
+  bool getHasStackArgs() const { return HasStackArgs; }
+  void setHasStackArgs(bool B) { HasStackArgs = B; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+};
+
+} // end llvm namespace
+
+#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
new file mode 100644
index 000000000000..48798bd4a1da
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -0,0 +1,266 @@
+//===-- AVRRegisterInfo.cpp - AVR Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRRegisterInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+AVRRegisterInfo::AVRRegisterInfo() : AVRGenRegisterInfo(0) {}
+
+const uint16_t *
+AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  CallingConv::ID CC = MF->getFunction()->getCallingConv();
+
+  return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+              ? CSR_Interrupts_SaveList
+              : CSR_Normal_SaveList);
+}
+
+const uint32_t *
+AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+              ? CSR_Interrupts_RegMask
+              : CSR_Normal_RegMask);
+}
+
+BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
+  const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+
+  // Reserve the intermediate result registers r1 and r2
+  // The result of instructions like 'mul' is always stored here.
+  Reserved.set(AVR::R0);
+  Reserved.set(AVR::R1);
+  Reserved.set(AVR::R1R0);
+
+  //  Reserve the stack pointer.
+  Reserved.set(AVR::SPL);
+  Reserved.set(AVR::SPH);
+  Reserved.set(AVR::SP);
+
+  // Reserve the frame pointer registers r28 and r29 if the function requires one.
+  if (TFI->hasFP(MF)) {
+    Reserved.set(AVR::R28);
+    Reserved.set(AVR::R29);
+    Reserved.set(AVR::R29R28);
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass *
+AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
+  if (RC->hasType(MVT::i16)) {
+    return &AVR::DREGSRegClass;
+  }
+
+  if (RC->hasType(MVT::i8)) {
+    return &AVR::GPR8RegClass;
+  }
+
+  llvm_unreachable("Invalid register size");
+}
+
+/// Fold a frame offset shared between two add instructions into a single one.
+static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) {
+  int Opcode = MI.getOpcode();
+
+  // Don't bother trying if the next instruction is not an add or a sub.
+  if ((Opcode != AVR::SUBIWRdK) && (Opcode != AVR::ADIWRdK)) {
+    return;
+  }
+
+  // Check that DstReg matches with next instruction, otherwise the instruction
+  // is not related to stack address manipulation.
+  if (DstReg != MI.getOperand(0).getReg()) {
+    return;
+  }
+
+  // Add the offset in the next instruction to our offset.
+  switch (Opcode) {
+  case AVR::SUBIWRdK:
+    Offset += -MI.getOperand(2).getImm();
+    break;
+  case AVR::ADIWRdK:
+    Offset += MI.getOperand(2).getImm();
+    break;
+  }
+
+  // Finally remove the instruction.
+  MI.eraseFromParent();
+}
+
+void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, unsigned FIOperandNum,
+                                          RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected SPAdj value");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction &MF = *MBB.getParent();
+  const AVRTargetMachine &TM = (const AVRTargetMachine &)MF.getTarget();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = MFI.getObjectOffset(FrameIndex);
+
+  // Add one to the offset because SP points to an empty slot.
+  Offset += MFI.getStackSize() - TFI->getOffsetOfLocalArea() + 1;
+  // Fold incoming offset.
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  // This is actually "load effective address" of the stack slot
+  // instruction. We have only two-address instructions, thus we need to
+  // expand it into move + add.
+  if (MI.getOpcode() == AVR::FRMIDX) {
+    MI.setDesc(TII.get(AVR::MOVWRdRr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+
+    assert(Offset > 0 && "Invalid offset");
+
+    // We need to materialize the offset via an add instruction.
+    unsigned Opcode;
+    unsigned DstReg = MI.getOperand(0).getReg();
+    assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer");
+
+    // Generally, to load a frame address two add instructions are emitted that
+    // could get folded into a single one:
+    //  movw    r31:r30, r29:r28
+    //  adiw    r31:r30, 29
+    //  adiw    r31:r30, 16
+    // to:
+    //  movw    r31:r30, r29:r28
+    //  adiw    r31:r30, 45
+    foldFrameOffset(*std::next(II), Offset, DstReg);
+
+    // Select the best opcode based on DstReg and the offset size.
+    switch (DstReg) {
+    case AVR::R25R24:
+    case AVR::R27R26:
+    case AVR::R31R30: {
+      if (isUInt<6>(Offset)) {
+        Opcode = AVR::ADIWRdK;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    }
+    default: {
+      // This opcode will get expanded into a pair of subi/sbci.
+      Opcode = AVR::SUBIWRdK;
+      Offset = -Offset;
+      break;
+    }
+    }
+
+    MachineInstr *New = BuildMI(MBB, std::next(II), dl, TII.get(Opcode), DstReg)
+                            .addReg(DstReg, RegState::Kill)
+                            .addImm(Offset);
+    New->getOperand(3).setIsDead();
+
+    return;
+  }
+
+  // If the offset is too big we have to adjust and restore the frame pointer
+  // to materialize a valid load/store with displacement.
+  //:TODO: consider using only one adiw/sbiw chain for more than one frame index
+  if (Offset > 63) {
+    unsigned AddOpc = AVR::ADIWRdK, SubOpc = AVR::SBIWRdK;
+    int AddOffset = Offset - 63 + 1;
+
+    // For huge offsets where adiw/sbiw cannot be used use a pair of subi/sbci.
+    if ((Offset - 63 + 1) > 63) {
+      AddOpc = AVR::SUBIWRdK;
+      SubOpc = AVR::SUBIWRdK;
+      AddOffset = -AddOffset;
+    }
+
+    // It is possible that the spiller places this frame instruction in between
+    // a compare and branch, invalidating the contents of SREG set by the
+    // compare instruction because of the add/sub pairs. Conservatively save and
+    // restore SREG before and after each add/sub pair.
+    BuildMI(MBB, II, dl, TII.get(AVR::INRdA), AVR::R0).addImm(0x3f);
+
+    MachineInstr *New = BuildMI(MBB, II, dl, TII.get(AddOpc), AVR::R29R28)
+                            .addReg(AVR::R29R28, RegState::Kill)
+                            .addImm(AddOffset);
+    New->getOperand(3).setIsDead();
+
+    // Restore SREG.
+    BuildMI(MBB, std::next(II), dl, TII.get(AVR::OUTARr))
+        .addImm(0x3f)
+        .addReg(AVR::R0, RegState::Kill);
+
+    // No need to set SREG as dead here otherwise if the next instruction is a
+    // cond branch it will be using a dead register.
+    New = BuildMI(MBB, std::next(II), dl, TII.get(SubOpc), AVR::R29R28)
+              .addReg(AVR::R29R28, RegState::Kill)
+              .addImm(Offset - 63 + 1);
+
+    Offset = 62;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+  assert(isUInt<6>(Offset) && "Offset is out of range");
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned AVRRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (TFI->hasFP(MF)) {
+    // The Y pointer register
+    return AVR::R28;
+  }
+
+  return AVR::SP;
+}
+
+const TargetRegisterClass *
+AVRRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                    unsigned Kind) const {
+  // FIXME: Currently we're using avr-gcc as reference, so we restrict
+  // ptrs to Y and Z regs. Though avr-gcc has buggy implementation
+  // of memory constraint, so we can fix it and bit avr-gcc here ;-)
+  return &AVR::PTRDISPREGSRegClass;
+}
+
+void AVRRegisterInfo::splitReg(unsigned Reg,
+                               unsigned &LoReg,
+                               unsigned &HiReg) const {
+    assert(AVR::DREGSRegClass.contains(Reg) && "can only split 16-bit registers");
+
+    LoReg = getSubReg(Reg, AVR::sub_lo);
+    HiReg = getSubReg(Reg, AVR::sub_hi);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
new file mode 100644
index 000000000000..b97e32ea203f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -0,0 +1,58 @@
+//===-- AVRRegisterInfo.h - AVR Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AVR implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_REGISTER_INFO_H
+#define LLVM_AVR_REGISTER_INFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AVRGenRegisterInfo.inc"
+
+namespace llvm {
+
+/// Utilities relating to AVR registers.
+class AVRRegisterInfo : public AVRGenRegisterInfo {
+public:
+  AVRRegisterInfo();
+
+public:
+  const uint16_t *
+  getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID CC) const override;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// Splits a 16-bit `DREGS` register into the lo/hi register pair.
+  /// \param Reg A 16-bit register to split.
+  void splitReg(unsigned Reg, unsigned &LoReg, unsigned &HiReg) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_REGISTER_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
new file mode 100644
index 000000000000..32650fc66751
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -0,0 +1,216 @@
+//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the AVR register file
+//===----------------------------------------------------------------------===//
+
+// 8-bit General purpose register definition.
+class AVRReg<bits<16> num,
+             string name,
+             list<Register> subregs = [],
+             list<string> altNames = []>
+  : RegisterWithSubRegs<name, subregs>
+{
+  field bits<16> Num = num;
+
+  let HWEncoding = num;
+  let Namespace = "AVR";
+  let SubRegs = subregs;
+  let AltNames = altNames;
+}
+
+// Subregister indices.
+let Namespace = "AVR" in
+{
+  def sub_lo : SubRegIndex<8>;
+  def sub_hi : SubRegIndex<8, 8>;
+}
+
+let Namespace = "AVR" in {
+  def ptr : RegAltNameIndex;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  8-bit general purpose registers
+//===----------------------------------------------------------------------===//
+
+def R0  : AVRReg<0,  "r0">,  DwarfRegNum<[0]>;
+def R1  : AVRReg<1,  "r1">,  DwarfRegNum<[1]>;
+def R2  : AVRReg<2,  "r2">,  DwarfRegNum<[2]>;
+def R3  : AVRReg<3,  "r3">,  DwarfRegNum<[3]>;
+def R4  : AVRReg<4,  "r4">,  DwarfRegNum<[4]>;
+def R5  : AVRReg<5,  "r5">,  DwarfRegNum<[5]>;
+def R6  : AVRReg<6,  "r6">,  DwarfRegNum<[6]>;
+def R7  : AVRReg<7,  "r7">,  DwarfRegNum<[7]>;
+def R8  : AVRReg<8,  "r8">,  DwarfRegNum<[8]>;
+def R9  : AVRReg<9,  "r9">,  DwarfRegNum<[9]>;
+def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>;
+def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>;
+def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>;
+def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>;
+def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>;
+def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>;
+def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>;
+def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>;
+def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>;
+def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>;
+def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>;
+def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>;
+def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>;
+def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>;
+
+let SubRegIndices = [sub_lo, sub_hi],
+CoveredBySubRegs = 1 in
+{
+  // 16 bit GPR pairs.
+  def SP     : AVRReg<32, "SP",      [SPL, SPH]>, DwarfRegNum<[32]>;
+
+  // The pointer registers (X,Y,Z) are a special case because they
+  // are printed as a `high:low` pair when a DREG is expected,
+  // but printed using `X`, `Y`, `Z` when a pointer register is expected.
+  let RegAltNameIndices = [ptr] in {
+      def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>;
+      def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>;
+      def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>;
+  }
+  def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>;
+  def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>;
+  def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>;
+  def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>;
+  def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>;
+  def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>;
+  def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>;
+  def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>;
+  def R9R8   : AVRReg<8,  "r9:r8",   [R8, R9]>,   DwarfRegNum<[8]>;
+  def R7R6   : AVRReg<6,  "r7:r6",   [R6, R7]>,   DwarfRegNum<[6]>;
+  def R5R4   : AVRReg<4,  "r5:r4",   [R4, R5]>,   DwarfRegNum<[4]>;
+  def R3R2   : AVRReg<2,  "r3:r2",   [R2, R3]>,   DwarfRegNum<[2]>;
+  def R1R0   : AVRReg<0,  "r1:r0",   [R0, R1]>,   DwarfRegNum<[0]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+//:TODO: use proper set instructions instead of using always "add"
+
+// Main 8-bit register class.
+def GPR8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and argument registers.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16, R15, R14, R13, R12, R11, R10,
+    R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// Simple lower registers r0..r15
+def GPR8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1
+  )>;
+
+// 8-bit register class for instructions which take immediates.
+def LD8 : RegisterClass<"AVR", [i8], 8,
+  (
+    // Return value and arguments.
+    add R24, R25, R18, R19, R20, R21, R22, R23,
+    // Scratch registers.
+    R30, R31, R26, R27,
+    // Callee saved registers.
+    R28, R29, R17, R16
+  )>;
+
+// Simple lower registers r16..r23
+def LD8lo : RegisterClass<"AVR", [i8], 8,
+  (
+    add R23, R22, R21, R20, R19, R18, R17, R16
+  )>;
+
+// Main 16-bit pair register class.
+def DREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16, R15R14, R13R12, R11R10,
+    R9R8, R7R6, R5R4, R3R2, R1R0
+  )>;
+
+// 16-bit register class for immediate instructions.
+def DLDREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28, R17R16
+  )>;
+
+// 16-bit register class for the adiw/sbiw instructions.
+def IWREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24,
+    // Scratch registers.
+    R31R30, R27R26,
+    // Callee saved registers.
+    R29R28
+  )>;
+
+// 16-bit register class for the ld and st instructions.
+// AKA X,Y, and Z
+def PTRREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    add R27R26, // X
+        R29R28, // Y
+        R31R30  // Z
+  ), ptr>;
+
+// 16-bit register class for the ldd and std instructions.
+// AKA Y and Z.
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8,
+  (
+    add R31R30, R29R28
+  ), ptr>;
+
+// We have a bunch of instructions with an explicit Z register argument. We
+// model this using a register class containing only the Z register.
+// :TODO: Rename to 'ZREG'.
+def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
+
+// Register class used for the stack read pseudo instruction.
+def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>;
+
+//:TODO: if we remove this we get an error in tablegen
+//:TODO: this is just a hack, remove it once add16 works!
+// Status register.
+def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>;
+def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)>
+{
+  let CopyCost = -1;      // Don't allow copying of status registers
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp b/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
new file mode 100644
index 000000000000..26dbcf77b452
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRRelaxMemOperations.cpp
@@ -0,0 +1,149 @@
+//===-- AVRRelaxMemOperations.cpp - Relax out of range loads/stores -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass which relaxes out of range memory operations into
+// equivalent operations which handle bigger addresses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRInstrInfo.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define AVR_RELAX_MEM_OPS_NAME "AVR memory operation relaxation pass"
+
+namespace {
+
+class AVRRelaxMem : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AVRRelaxMem() : MachineFunctionPass(ID) {
+    initializeAVRRelaxMemPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return AVR_RELAX_MEM_OPS_NAME; }
+
+private:
+  typedef MachineBasicBlock Block;
+  typedef Block::iterator BlockIt;
+
+  const TargetInstrInfo *TII;
+
+  template <unsigned OP> bool relax(Block &MBB, BlockIt MBBI);
+
+  bool runOnBasicBlock(Block &MBB);
+  bool runOnInstruction(Block &MBB, BlockIt MBBI);
+
+  MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode) {
+    return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode));
+  }
+};
+
+char AVRRelaxMem::ID = 0;
+
+bool AVRRelaxMem::runOnMachineFunction(MachineFunction &MF) {
+  bool Modified = false;
+
+  const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
+  TII = STI.getInstrInfo();
+
+  for (Block &MBB : MF) {
+    bool BlockModified = runOnBasicBlock(MBB);
+    Modified |= BlockModified;
+  }
+
+  return Modified;
+}
+
+bool AVRRelaxMem::runOnBasicBlock(Block &MBB) {
+  bool Modified = false;
+
+  BlockIt MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    BlockIt NMBBI = std::next(MBBI);
+    Modified |= runOnInstruction(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+template <>
+bool AVRRelaxMem::relax<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+
+  MachineOperand &Ptr = MI.getOperand(0);
+  MachineOperand &Src = MI.getOperand(2);
+  int64_t Imm = MI.getOperand(1).getImm();
+
+  // We can definitely optimise this better.
+  if (Imm > 63) {
+    // Push the previous state of the pointer register.
+    // This instruction must preserve the value.
+    buildMI(MBB, MBBI, AVR::PUSHWRr)
+      .addReg(Ptr.getReg());
+
+    // Add the immediate to the pointer register.
+    buildMI(MBB, MBBI, AVR::SBCIWRdK)
+      .addReg(Ptr.getReg(), RegState::Define)
+      .addReg(Ptr.getReg())
+      .addImm(-Imm);
+
+    // Store the value in the source register to the address
+    // pointed to by the pointer register.
+    buildMI(MBB, MBBI, AVR::STWPtrRr)
+      .addReg(Ptr.getReg())
+      .addReg(Src.getReg(), getKillRegState(Src.isKill()));
+
+    // Pop the original state of the pointer register.
+    buildMI(MBB, MBBI, AVR::POPWRd)
+      .addReg(Ptr.getReg(), getKillRegState(Ptr.isKill()));
+
+    MI.removeFromParent();
+  }
+
+  return false;
+}
+
+bool AVRRelaxMem::runOnInstruction(Block &MBB, BlockIt MBBI) {
+  MachineInstr &MI = *MBBI;
+  int Opcode = MBBI->getOpcode();
+
+#define RELAX(Op)                \
+  case Op:                       \
+    return relax<Op>(MBB, MI)
+
+  switch (Opcode) {
+    RELAX(AVR::STDWPtrQRr);
+  }
+#undef RELAX
+  return false;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(AVRRelaxMem, "avr-relax-mem",
+                AVR_RELAX_MEM_OPS_NAME, false, false)
+
+namespace llvm {
+
+FunctionPass *createAVRRelaxMemPass() { return new AVRRelaxMem(); }
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h
new file mode 100644
index 000000000000..6474c8779330
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h
@@ -0,0 +1,28 @@
+//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVR subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_SELECTION_DAG_INFO_H
+#define LLVM_AVR_SELECTION_DAG_INFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+/// Holds information about the AVR instruction selection DAG.
+class AVRSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_SELECTION_DAG_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
new file mode 100644
index 000000000000..c228d051d771
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -0,0 +1,47 @@
+//===-- AVRSubtarget.cpp - AVR Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRSubtarget.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVR.h"
+#include "AVRTargetMachine.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#define DEBUG_TYPE "avr-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
+                           const std::string &FS, AVRTargetMachine &TM)
+    : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
+      TLInfo(TM), TSInfo(),
+
+      // Subtarget features
+      m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
+      m_hasEIJMPCALL(false), m_hasADDSUBIW(false), m_hasSmallStack(false),
+      m_hasMOVW(false), m_hasLPM(false), m_hasLPMX(false),  m_hasELPM(false),
+      m_hasELPMX(false), m_hasSPM(false), m_hasSPMX(false), m_hasDES(false),
+      m_supportsRMW(false), m_supportsMultiplication(false), m_hasBREAK(false),
+      m_hasTinyEncoding(false), ELFArch(false), m_FeatureSetDummy(false) {
+  // Parse features string.
+  ParseSubtargetFeatures(CPU, FS);
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.h b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
new file mode 100644
index 000000000000..a37849c3f3f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -0,0 +1,119 @@
+//===-- AVRSubtarget.h - Define Subtarget for the AVR -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_SUBTARGET_H
+#define LLVM_AVR_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AVRGenSubtargetInfo.inc"
+
+namespace llvm {
+
+/// A specific AVR target MCU.
+class AVRSubtarget : public AVRGenSubtargetInfo {
+public:
+  //! Creates an AVR subtarget.
+  //! \param TT  The target triple.
+  //! \param CPU The CPU to target.
+  //! \param FS  The feature string.
+  //! \param TM  The target machine.
+  AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+               AVRTargetMachine &TM);
+
+  const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
+  const AVRTargetLowering *getTargetLowering() const override { return &TLInfo; }
+  const AVRSelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; }
+  const AVRRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); }
+
+  /// Parses a subtarget feature string, setting appropriate options.
+  /// \note Definition of function is auto generated by `tblgen`.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  // Subtarget feature getters.
+  // See AVR.td for details.
+  bool hasSRAM() const { return m_hasSRAM; }
+  bool hasJMPCALL() const { return m_hasJMPCALL; }
+  bool hasIJMPCALL() const { return m_hasIJMPCALL; }
+  bool hasEIJMPCALL() const { return m_hasEIJMPCALL; }
+  bool hasADDSUBIW() const { return m_hasADDSUBIW; }
+  bool hasSmallStack() const { return m_hasSmallStack; }
+  bool hasMOVW() const { return m_hasMOVW; }
+  bool hasLPM() const { return m_hasLPM; }
+  bool hasLPMX() const { return m_hasLPMX; }
+  bool hasELPM() const { return m_hasELPM; }
+  bool hasELPMX() const { return m_hasELPMX; }
+  bool hasSPM() const { return m_hasSPM; }
+  bool hasSPMX() const { return m_hasSPMX; }
+  bool hasDES() const { return m_hasDES; }
+  bool supportsRMW() const { return m_supportsRMW; }
+  bool supportsMultiplication() const { return m_supportsMultiplication; }
+  bool hasBREAK() const { return m_hasBREAK; }
+  bool hasTinyEncoding() const { return m_hasTinyEncoding; }
+
+  /// Gets the ELF architecture for the e_flags field
+  /// of an ELF object file.
+  unsigned getELFArch() const {
+    assert(ELFArch != 0 &&
+           "every device must have an associate ELF architecture");
+    return ELFArch;
+  }
+
+private:
+  AVRInstrInfo InstrInfo;
+  AVRFrameLowering FrameLowering;
+  AVRTargetLowering TLInfo;
+  AVRSelectionDAGInfo TSInfo;
+
+  // Subtarget feature settings
+  // See AVR.td for details.
+  bool m_hasSRAM;
+  bool m_hasJMPCALL;
+  bool m_hasIJMPCALL;
+  bool m_hasEIJMPCALL;
+  bool m_hasADDSUBIW;
+  bool m_hasSmallStack;
+  bool m_hasMOVW;
+  bool m_hasLPM;
+  bool m_hasLPMX;
+  bool m_hasELPM;
+  bool m_hasELPMX;
+  bool m_hasSPM;
+  bool m_hasSPMX;
+  bool m_hasDES;
+  bool m_supportsRMW;
+  bool m_supportsMultiplication;
+  bool m_hasBREAK;
+  bool m_hasTinyEncoding;
+
+  /// The ELF e_flags architecture.
+  unsigned ELFArch;
+
+  // Dummy member, used by FeatureSet's. We cannot have a SubtargetFeature with
+  // no variable, so we instead bind pseudo features to this variable.
+  bool m_FeatureSetDummy;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
new file mode 100644
index 000000000000..fb3262916b4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -0,0 +1,118 @@
+//===-- AVRTargetMachine.cpp - Define TargetMachine for AVR ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetMachine.h"
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include "AVRTargetObjectFile.h"
+#include "AVR.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+static const char *AVRDataLayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-n8";
+
+/// Processes a CPU name.
+static StringRef getCPU(StringRef CPU) {
+  if (CPU.empty() || CPU == "generic") {
+    return "avr2";
+  }
+
+  return CPU;
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  return RM.hasValue() ? *RM : Reloc::Static;
+}
+
+AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+    : LLVMTargetMachine(
+          T, AVRDataLayout, TT,
+          getCPU(CPU), FS, Options, getEffectiveRelocModel(RM), CM, OL),
+      SubTarget(TT, getCPU(CPU), FS, *this) {
+  this->TLOF = make_unique<AVRTargetObjectFile>();
+  initAsmInfo();
+}
+
+namespace {
+/// AVR Code Generator Pass Configuration Options.
+class AVRPassConfig : public TargetPassConfig {
+public:
+  AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  AVRTargetMachine &getAVRTargetMachine() const {
+    return getTM<AVRTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+  void addPreSched2() override;
+  void addPreRegAlloc() override;
+};
+} // namespace
+
+TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AVRPassConfig(this, PM);
+}
+
+extern "C" void LLVMInitializeAVRTarget() {
+  // Register the target.
+  RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
+
+  auto &PR = *PassRegistry::getPassRegistry();
+  initializeAVRExpandPseudoPass(PR);
+  initializeAVRInstrumentFunctionsPass(PR);
+  initializeAVRRelaxMemPass(PR);
+}
+
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl() const {
+  return &SubTarget;
+}
+
+const AVRSubtarget *AVRTargetMachine::getSubtargetImpl(const Function &) const {
+  return &SubTarget;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool AVRPassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createAVRISelDag(getAVRTargetMachine(), getOptLevel()));
+  // Create the frame analyzer pass used by the PEI pass.
+  addPass(createAVRFrameAnalyzerPass());
+
+  return false;
+}
+
+void AVRPassConfig::addPreRegAlloc() {
+  // Create the dynalloc SP save/restore pass to handle variable sized allocas.
+  addPass(createAVRDynAllocaSRPass());
+}
+
+void AVRPassConfig::addPreSched2() {
+  addPass(createAVRRelaxMemPass());
+  addPass(createAVRExpandPseudoPass());
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
new file mode 100644
index 000000000000..10345193d14a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
@@ -0,0 +1,51 @@
+//===-- AVRTargetMachine.h - Define TargetMachine for AVR -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AVR specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_MACHINE_H
+#define LLVM_AVR_TARGET_MACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "AVRFrameLowering.h"
+#include "AVRISelLowering.h"
+#include "AVRInstrInfo.h"
+#include "AVRSelectionDAGInfo.h"
+#include "AVRSubtarget.h"
+
+namespace llvm {
+
+/// A generic AVR implementation.
+class AVRTargetMachine : public LLVMTargetMachine {
+public:
+  AVRTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM,
+                   CodeModel::Model CM, CodeGenOpt::Level OL);
+
+  const AVRSubtarget *getSubtargetImpl() const;
+  const AVRSubtarget *getSubtargetImpl(const Function &) const override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return this->TLOF.get();
+  }
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+private:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  AVRSubtarget SubTarget;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_MACHINE_H
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
new file mode 100644
index 000000000000..af14d9292f27
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -0,0 +1,41 @@
+//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetObjectFile.h"
+
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+
+#include "AVR.h"
+
+namespace llvm {
+void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  Base::Initialize(Ctx, TM);
+  ProgmemDataSection =
+      Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+}
+
+MCSection *
+AVRTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
+                                            SectionKind Kind,
+                                            const TargetMachine &TM) const {
+  // Global values in flash memory are placed in the progmem.data section
+  // unless they already have a user assigned section.
+  if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection())
+    return ProgmemDataSection;
+
+  // Otherwise, we work the same way as ELF.
+  return Base::SelectSectionForGlobal(GO, Kind, TM);
+}
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h
new file mode 100644
index 000000000000..ba91036fd64c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h
@@ -0,0 +1,33 @@
+//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H
+#define LLVM_AVR_TARGET_OBJECT_FILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+/// Lowering for an AVR ELF32 object file.
+class AVRTargetObjectFile : public TargetLoweringObjectFileELF {
+  typedef TargetLoweringObjectFileELF Base;
+
+public:
+  void Initialize(MCContext &ctx, const TargetMachine &TM) override;
+
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override;
+
+private:
+  MCSection *ProgmemDataSection;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_OBJECT_FILE_H
diff --git a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
new file mode 100644
index 000000000000..5b0398c0ca34
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -0,0 +1,631 @@
+//===---- AVRAsmParser.cpp - Parse AVR assembly to MCInst instructions ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRRegisterInfo.h"
+#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <sstream>
+
+#define DEBUG_TYPE "avr-asm-parser"
+
+namespace llvm {
+
+/// Parses AVR assembly from a stream.
+class AVRAsmParser : public MCTargetAsmParser {
+  const MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCRegisterInfo *MRI;
+
+#define GET_ASSEMBLER_HEADER
+#include "AVRGenAsmMatcher.inc"
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken directiveID) override;
+
+  OperandMatchResultTy parseMemriOperand(OperandVector &Operands);
+
+  bool parseOperand(OperandVector &Operands);
+  int parseRegisterName(unsigned (*matchFn)(StringRef));
+  int parseRegisterName();
+  int parseRegister();
+  bool tryParseRegisterOperand(OperandVector &Operands);
+  bool tryParseExpression(OperandVector &Operands);
+  bool tryParseRelocExpression(OperandVector &Operands);
+  void eatComma();
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  unsigned toDREG(unsigned Reg, unsigned From = AVR::sub_lo) {
+    MCRegisterClass const *Class = &AVRMCRegisterClasses[AVR::DREGSRegClassID];
+    return MRI->getMatchingSuperReg(Reg, From, Class);
+  }
+
+  bool emit(MCInst &Instruction, SMLoc const &Loc, MCStreamer &Out) const;
+  bool invalidOperand(SMLoc const &Loc, OperandVector const &Operands,
+                      uint64_t const &ErrorInfo);
+  bool missingFeature(SMLoc const &Loc, uint64_t const &ErrorInfo);
+
+public:
+  AVRAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI), STI(STI), Parser(Parser) {
+    MCAsmParserExtension::Initialize(Parser);
+    MRI = getContext().getRegisterInfo();
+
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+};
+
+/// An parsed AVR assembly operand.
+class AVROperand : public MCParsedAsmOperand {
+  typedef MCParsedAsmOperand Base;
+  enum KindTy { k_Immediate, k_Register, k_Token, k_Memri } Kind;
+
+public:
+  AVROperand(StringRef Tok, SMLoc const &S)
+      : Base(), Kind(k_Token), Tok(Tok), Start(S), End(S) {}
+  AVROperand(unsigned Reg, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {}
+  AVROperand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {}
+  AVROperand(unsigned Reg, MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {}
+
+  struct RegisterImmediate {
+    unsigned Reg;
+    MCExpr const *Imm;
+  };
+  union {
+    StringRef Tok;
+    RegisterImmediate RegImm;
+  };
+
+  SMLoc Start, End;
+
+public:
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Register && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Immediate && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  /// Adds the contained reg+imm operand to an instruction.
+  void addMemriOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Memri && "Unexpected operand kind");
+    assert(N == 2 && "Invalid number of operands");
+
+    Inst.addOperand(MCOperand::createReg(getReg()));
+    addExpr(Inst, getImm());
+  }
+
+  bool isReg() const { return Kind == k_Register; }
+  bool isImm() const { return Kind == k_Immediate; }
+  bool isToken() const { return Kind == k_Token; }
+  bool isMem() const { return Kind == k_Memri; }
+  bool isMemri() const { return Kind == k_Memri; }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return Tok;
+  }
+
+  unsigned getReg() const {
+    assert((Kind == k_Register || Kind == k_Memri) && "Invalid access!");
+
+    return RegImm.Reg;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate || Kind == k_Memri) && "Invalid access!");
+    return RegImm.Imm;
+  }
+
+  static std::unique_ptr<AVROperand> CreateToken(StringRef Str, SMLoc S) {
+    return make_unique<AVROperand>(Str, S);
+  }
+
+  static std::unique_ptr<AVROperand> CreateReg(unsigned RegNum, SMLoc S,
+                                               SMLoc E) {
+    return make_unique<AVROperand>(RegNum, S, E);
+  }
+
+  static std::unique_ptr<AVROperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                               SMLoc E) {
+    return make_unique<AVROperand>(Val, S, E);
+  }
+
+  static std::unique_ptr<AVROperand>
+  CreateMemri(unsigned RegNum, const MCExpr *Val, SMLoc S, SMLoc E) {
+    return make_unique<AVROperand>(RegNum, Val, S, E);
+  }
+
+  void makeToken(StringRef Token) {
+    Kind = k_Token;
+    Tok = Token;
+  }
+
+  void makeReg(unsigned RegNo) {
+    Kind = k_Register;
+    RegImm = {RegNo, nullptr};
+  }
+
+  void makeImm(MCExpr const *Ex) {
+    Kind = k_Immediate;
+    RegImm = {0, Ex};
+  }
+
+  void makeMemri(unsigned RegNo, MCExpr const *Imm) {
+    Kind = k_Memri;
+    RegImm = {RegNo, Imm};
+  }
+
+  SMLoc getStartLoc() const { return Start; }
+  SMLoc getEndLoc() const { return End; }
+
+  virtual void print(raw_ostream &O) const {
+    switch (Kind) {
+    case k_Token:
+      O << "Token: \"" << getToken() << "\"";
+      break;
+    case k_Register:
+      O << "Register: " << getReg();
+      break;
+    case k_Immediate:
+      O << "Immediate: \"" << *getImm() << "\"";
+      break;
+    case k_Memri: {
+      // only manually print the size for non-negative values,
+      // as the sign is inserted automatically.
+      O << "Memri: \"" << getReg() << '+' << *getImm() << "\"";
+      break;
+    }
+    }
+    O << "\n";
+  }
+};
+
+// Auto-generated Match Functions
+
+/// Maps from the set of all register names to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterName(StringRef Name);
+
+/// Maps from the set of all alternative registernames to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterAltName(StringRef Name);
+
+bool AVRAsmParser::invalidOperand(SMLoc const &Loc,
+                                  OperandVector const &Operands,
+                                  uint64_t const &ErrorInfo) {
+  SMLoc ErrorLoc = Loc;
+  char const *Diag = 0;
+
+  if (ErrorInfo != ~0U) {
+    if (ErrorInfo >= Operands.size()) {
+      Diag = "too few operands for instruction.";
+    } else {
+      AVROperand const &Op = (AVROperand const &)*Operands[ErrorInfo];
+
+      // TODO: See if we can do a better error than just "invalid ...".
+      if (Op.getStartLoc() != SMLoc()) {
+        ErrorLoc = Op.getStartLoc();
+      }
+    }
+  }
+
+  if (!Diag) {
+    Diag = "invalid operand for instruction";
+  }
+
+  return Error(ErrorLoc, Diag);
+}
+
+bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc,
+                                  uint64_t const &ErrorInfo) {
+  return Error(Loc, "instruction requires a CPU feature not currently enabled");
+}
+
+bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const {
+  Inst.setLoc(Loc);
+  Out.EmitInstruction(Inst, STI);
+
+  return false;
+}
+
+bool AVRAsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  switch (MatchResult) {
+  case Match_Success:        return emit(Inst, Loc, Out);
+  case Match_MissingFeature: return missingFeature(Loc, ErrorInfo);
+  case Match_InvalidOperand: return invalidOperand(Loc, Operands, ErrorInfo);
+  case Match_MnemonicFail:   return Error(Loc, "invalid instruction");
+  default:                   return true;
+  }
+}
+
+/// Parses a register name using a given matching function.
+/// Checks for lowercase or uppercase if necessary.
+int AVRAsmParser::parseRegisterName(unsigned (*matchFn)(StringRef)) {
+  StringRef Name = Parser.getTok().getString();
+
+  int RegNum = matchFn(Name);
+
+  // GCC supports case insensitive register names. Some of the AVR registers
+  // are all lower case, some are all upper case but non are mixed. We prefer
+  // to use the original names in the register definitions. That is why we
+  // have to test both upper and lower case here.
+  if (RegNum == AVR::NoRegister) {
+    RegNum = matchFn(Name.lower());
+  }
+  if (RegNum == AVR::NoRegister) {
+    RegNum = matchFn(Name.upper());
+  }
+
+  return RegNum;
+}
+
+int AVRAsmParser::parseRegisterName() {
+  int RegNum = parseRegisterName(&MatchRegisterName);
+
+  if (RegNum == AVR::NoRegister)
+    RegNum = parseRegisterName(&MatchRegisterAltName);
+
+  return RegNum;
+}
+
+int AVRAsmParser::parseRegister() {
+  int RegNum = AVR::NoRegister;
+
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    // Check for register pair syntax
+    if (Parser.getLexer().peekTok().is(AsmToken::Colon)) {
+      Parser.Lex();
+      Parser.Lex(); // Eat high (odd) register and colon
+
+      if (Parser.getTok().is(AsmToken::Identifier)) {
+        // Convert lower (even) register to DREG
+        RegNum = toDREG(parseRegisterName());
+      }
+    } else {
+      RegNum = parseRegisterName();
+    }
+  }
+  return RegNum;
+}
+
+bool AVRAsmParser::tryParseRegisterOperand(OperandVector &Operands) {
+  int RegNo = parseRegister();
+
+  if (RegNo == AVR::NoRegister)
+    return true;
+
+  AsmToken const &T = Parser.getTok();
+  Operands.push_back(AVROperand::CreateReg(RegNo, T.getLoc(), T.getEndLoc()));
+  Parser.Lex(); // Eat register token.
+
+  return false;
+}
+
+bool AVRAsmParser::tryParseExpression(OperandVector &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (!tryParseRelocExpression(Operands))
+    return false;
+
+  if ((Parser.getTok().getKind() == AsmToken::Plus ||
+       Parser.getTok().getKind() == AsmToken::Minus) &&
+      Parser.getLexer().peekTok().getKind() == AsmToken::Identifier) {
+    // Don't handle this case - it should be split into two
+    // separate tokens.
+    return true;
+  }
+
+  // Parse (potentially inner) expression
+  MCExpr const *Expression;
+  if (getParser().parseExpression(Expression))
+    return true;
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(AVROperand::CreateImm(Expression, S, E));
+  return false;
+}
+
+bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
+  bool isNegated = false;
+  AVRMCExpr::VariantKind ModifierKind = AVRMCExpr::VK_AVR_None;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  // Check for sign
+  AsmToken tokens[2];
+  size_t ReadCount = Parser.getLexer().peekTokens(tokens);
+
+  if (ReadCount == 2) {
+    if (tokens[0].getKind() == AsmToken::Identifier &&
+        tokens[1].getKind() == AsmToken::LParen) {
+
+      AsmToken::TokenKind CurTok = Parser.getLexer().getKind();
+      if (CurTok == AsmToken::Minus) {
+        isNegated = true;
+      } else {
+        assert(CurTok == AsmToken::Plus);
+        isNegated = false;
+      }
+
+      // Eat the sign
+      Parser.Lex();
+    }
+  }
+
+  // Check if we have a target specific modifier (lo8, hi8, &c)
+  if (Parser.getTok().getKind() != AsmToken::Identifier ||
+      Parser.getLexer().peekTok().getKind() != AsmToken::LParen) {
+    // Not a reloc expr
+    return true;
+  }
+  StringRef ModifierName = Parser.getTok().getString();
+  ModifierKind = AVRMCExpr::getKindByName(ModifierName.str().c_str());
+
+  if (ModifierKind != AVRMCExpr::VK_AVR_None) {
+    Parser.Lex();
+    Parser.Lex(); // Eat modifier name and parenthesis
+  } else {
+    return Error(Parser.getTok().getLoc(), "unknown modifier");
+  }
+
+  MCExpr const *InnerExpression;
+  if (getParser().parseExpression(InnerExpression))
+    return true;
+
+  // If we have a modifier wrap the inner expression
+  assert(Parser.getTok().getKind() == AsmToken::RParen);
+  Parser.Lex(); // Eat closing parenthesis
+
+  MCExpr const *Expression = AVRMCExpr::create(ModifierKind, InnerExpression,
+                                               isNegated, getContext());
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(AVROperand::CreateImm(Expression, S, E));
+
+  return false;
+}
+
+bool AVRAsmParser::parseOperand(OperandVector &Operands) {
+  DEBUG(dbgs() << "parseOperand\n");
+
+  switch (getLexer().getKind()) {
+  default:
+    return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+
+  case AsmToken::Identifier:
+    // Try to parse a register, if it fails,
+    // fall through to the next case.
+    if (!tryParseRegisterOperand(Operands)) {
+      return false;
+    }
+  case AsmToken::LParen:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+    return tryParseExpression(Operands);
+  case AsmToken::Plus:
+  case AsmToken::Minus: {
+    // If the sign preceeds a number, parse the number,
+    // otherwise treat the sign a an independent token.
+    switch (getLexer().peekTok().getKind()) {
+    case AsmToken::Integer:
+    case AsmToken::BigNum:
+    case AsmToken::Identifier:
+    case AsmToken::Real:
+      if (!tryParseExpression(Operands))
+        return false;
+    default:
+      break;
+    }
+    // Treat the token as an independent token.
+    Operands.push_back(AVROperand::CreateToken(Parser.getTok().getString(),
+                                               Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the token.
+    return false;
+  }
+  }
+
+  // Could not parse operand
+  return true;
+}
+
+OperandMatchResultTy
+AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
+  DEBUG(dbgs() << "parseMemriOperand()\n");
+
+  SMLoc E, S;
+  MCExpr const *Expression;
+  int RegNo;
+
+  // Parse register.
+  {
+    RegNo = parseRegister();
+
+    if (RegNo == AVR::NoRegister)
+      return MatchOperand_ParseFail;
+
+    S = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Parser.Lex(); // Eat register token.
+  }
+
+  // Parse immediate;
+  {
+    if (getParser().parseExpression(Expression))
+      return MatchOperand_ParseFail;
+
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  }
+
+  Operands.push_back(AVROperand::CreateMemri(RegNo, Expression, S, E));
+
+  return MatchOperand_Success;
+}
+
+bool AVRAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                 SMLoc &EndLoc) {
+  StartLoc = Parser.getTok().getLoc();
+  RegNo = parseRegister();
+  EndLoc = Parser.getTok().getLoc();
+
+  return (RegNo == AVR::NoRegister);
+}
+
+void AVRAsmParser::eatComma() {
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex();
+  } else {
+    // GCC allows commas to be omitted.
+  }
+}
+
+bool AVRAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                    StringRef Mnemonic, SMLoc NameLoc,
+                                    OperandVector &Operands) {
+  Operands.push_back(AVROperand::CreateToken(Mnemonic, NameLoc));
+
+  bool first = true;
+  while (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (!first) eatComma();
+
+    first = false;
+
+    auto MatchResult = MatchOperandParserImpl(Operands, Mnemonic);
+
+    if (MatchResult == MatchOperand_Success) {
+      continue;
+    }
+
+    if (MatchResult == MatchOperand_ParseFail) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+
+      return Error(Loc, "failed to parse register and immediate pair");
+    }
+
+    if (parseOperand(Operands)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool AVRAsmParser::ParseDirective(llvm::AsmToken DirectiveID) { return true; }
+
+extern "C" void LLVMInitializeAVRAsmParser() {
+  RegisterMCAsmParser<AVRAsmParser> X(getTheAVRTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AVRGenAsmMatcher.inc"
+
+// Uses enums defined in AVRGenAsmMatcher.inc
+unsigned AVRAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned ExpectedKind) {
+  AVROperand &Op = static_cast<AVROperand &>(AsmOp);
+  MatchClassKind Expected = static_cast<MatchClassKind>(ExpectedKind);
+
+  // If need be, GCC converts bare numbers to register names
+  // It's ugly, but GCC supports it.
+  if (Op.isImm()) {
+    if (MCConstantExpr const *Const = dyn_cast<MCConstantExpr>(Op.getImm())) {
+      int64_t RegNum = Const->getValue();
+      std::ostringstream RegName;
+      RegName << "r" << RegNum;
+      RegNum = MatchRegisterName(RegName.str().c_str());
+      if (RegNum != AVR::NoRegister) {
+        Op.makeReg(RegNum);
+        if (validateOperandClass(Op, Expected) == Match_Success) {
+          return Match_Success;
+        }
+      }
+      // Let the other quirks try their magic.
+    }
+  }
+
+  if (Op.isReg()) {
+    // If the instruction uses a register pair but we got a single, lower
+    // register we perform a "class cast".
+    if (isSubclass(Expected, MCK_DREGS)) {
+      unsigned correspondingDREG = toDREG(Op.getReg());
+
+      if (correspondingDREG != AVR::NoRegister) {
+        Op.makeReg(correspondingDREG);
+        return validateOperandClass(Op, Expected);
+      }
+    }
+  }
+  return Match_InvalidOperand;
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
new file mode 100644
index 000000000000..d2a21fb64635
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -0,0 +1,156 @@
+//===- AVRDisassembler.cpp - Disassembler for AVR ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the AVR Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVR.h"
+#include "AVRRegisterInfo.h"
+#include "AVRSubtarget.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "avr-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// A disassembler class for AVR.
+class AVRDisassembler : public MCDisassembler {
+public:
+  AVRDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+  virtual ~AVRDisassembler() {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+static MCDisassembler *createAVRDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new AVRDisassembler(STI, Ctx);
+}
+
+
+extern "C" void LLVMInitializeAVRDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheAVRTarget(),
+                                         createAVRDisassembler);
+}
+
+static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address, const void *Decoder) {
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder) {
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePTRREGSRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address, const void *Decoder) {
+  return MCDisassembler::Success;
+}
+
+#include "AVRGenDisassemblerTables.inc"
+
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn) {
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Size = 2;
+  Insn = (Bytes[0] << 0) | (Bytes[1] << 8);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn) {
+
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Size = 4;
+  Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | (Bytes[3] << 24);
+
+  return MCDisassembler::Success;
+}
+
+static const uint8_t *getDecoderTable(uint64_t Size) {
+
+  switch (Size) {
+    case 2: return DecoderTable16;
+    case 4: return DecoderTable32;
+    default: llvm_unreachable("instructions must be 16 or 32-bits");
+  }
+}
+
+DecodeStatus AVRDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address,
+                                             raw_ostream &VStream,
+                                             raw_ostream &CStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result;
+
+  // Try decode a 16-bit instruction.
+  {
+    Result = readInstruction16(Bytes, Address, Size, Insn);
+
+    if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+    // Try to auto-decode a 16-bit instruction.
+    Result = decodeInstruction(getDecoderTable(Size), Instr,
+                               Insn, Address, this, STI);
+
+    if (Result != MCDisassembler::Fail)
+      return Result;
+  }
+
+  // Try decode a 32-bit instruction.
+  {
+    Result = readInstruction32(Bytes, Address, Size, Insn);
+
+    if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+    Result = decodeInstruction(getDecoderTable(Size), Instr, Insn,
+                               Address, this, STI);
+
+    if (Result != MCDisassembler::Fail) {
+      return Result;
+    }
+
+    return MCDisassembler::Fail;
+  }
+}
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+                                   const void *Decoder);
+
diff --git a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
new file mode 100644
index 000000000000..316b7836df0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
@@ -0,0 +1,171 @@
+//===-- AVRInstPrinter.cpp - Convert AVR MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AVR MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRInstPrinter.h"
+
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstring>
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace llvm {
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "AVRGenAsmWriter.inc"
+
+void AVRInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  unsigned Opcode = MI->getOpcode();
+
+  // First handle load and store instructions with postinc or predec
+  // of the form "ld reg, X+".
+  // TODO: We should be able to rewrite this using TableGen data.
+  switch (Opcode) {
+  case AVR::LDRdPtr:
+  case AVR::LDRdPtrPi:
+  case AVR::LDRdPtrPd:
+    O << "\tld\t";
+    printOperand(MI, 0, O);
+    O << ", ";
+
+    if (Opcode == AVR::LDRdPtrPd)
+      O << '-';
+
+    printOperand(MI, 1, O);
+
+    if (Opcode == AVR::LDRdPtrPi)
+      O << '+';
+    break;
+  case AVR::STPtrRr:
+    O << "\tst\t";
+    printOperand(MI, 0, O);
+    O << ", ";
+    printOperand(MI, 1, O);
+    break;
+  case AVR::STPtrPiRr:
+  case AVR::STPtrPdRr:
+    O << "\tst\t";
+
+    if (Opcode == AVR::STPtrPdRr)
+      O << '-';
+
+    printOperand(MI, 1, O);
+
+    if (Opcode == AVR::STPtrPiRr)
+      O << '+';
+
+    O << ", ";
+    printOperand(MI, 2, O);
+    break;
+  default:
+    if (!printAliasInstr(MI, O))
+      printInstruction(MI, O);
+
+    printAnnotation(O, Annot);
+    break;
+  }
+}
+
+const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum,
+                                                  MCRegisterInfo const &MRI) {
+  // GCC prints register pairs by just printing the lower register
+  // If the register contains a subregister, print it instead
+  if (MRI.getNumSubRegIndices() > 0) {
+    unsigned RegLoNum = MRI.getSubReg(RegNum, AVR::sub_lo);
+    RegNum = (RegLoNum != AVR::NoRegister) ? RegLoNum : RegNum;
+  }
+
+  return getRegisterName(RegNum);
+}
+
+void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).OpInfo[OpNo];
+
+  if (Op.isReg()) {
+    bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) ||
+                    (MOI.RegClass == AVR::PTRDISPREGSRegClassID) ||
+                    (MOI.RegClass == AVR::ZREGSRegClassID);
+
+    if (isPtrReg) {
+      O << getRegisterName(Op.getReg(), AVR::ptr);
+    } else {
+      O << getPrettyRegisterName(Op.getReg(), MRI);
+    }
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+/// This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm();
+    O << '.';
+
+    // Print a position sign if needed.
+    // Negative values have their sign printed automatically.
+    if (Imm >= 0)
+      O << '+';
+
+    O << Imm;
+  } else {
+    assert(Op.isExpr() && "Unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+void AVRInstPrinter::printMemri(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) {
+  assert(MI->getOperand(OpNo).isReg() && "Expected a register for the first operand");
+
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+
+  // Print the register.
+  printOperand(MI, OpNo, O);
+
+  // Print the {+,-}offset.
+  if (OffsetOp.isImm()) {
+    int64_t Offset = OffsetOp.getImm();
+
+    if (Offset >= 0)
+      O << '+';
+
+    O << Offset;
+  } else if (OffsetOp.isExpr()) {
+    O << *OffsetOp.getExpr();
+  } else {
+    llvm_unreachable("unknown type for offset");
+  }
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h
new file mode 100644
index 000000000000..c9f65b922745
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.h
@@ -0,0 +1,54 @@
+//===- AVRInstPrinter.h - Convert AVR MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AVR MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_INST_PRINTER_H
+#define LLVM_AVR_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+namespace llvm {
+
+/// Prints AVR instructions to a textual stream.
+class AVRInstPrinter : public MCInstPrinter {
+public:
+  AVRInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  static const char *getPrettyRegisterName(unsigned RegNo,
+                                           MCRegisterInfo const &MRI);
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+private:
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AVR::NoRegAltName);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  // Autogenerated by TableGen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_INST_PRINTER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
new file mode 100644
index 000000000000..081d8b5740ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -0,0 +1,473 @@
+//===-- AVRAsmBackend.cpp - AVR Asm Backend  ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVRAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AVRAsmBackend.h"
+#include "MCTargetDesc/AVRFixupKinds.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+// FIXME: we should be doing checks to make sure asm operands
+// are not out of bounds.
+
+namespace adjust {
+
+using namespace llvm;
+
+void signed_width(unsigned Width, uint64_t Value, std::string Description,
+                  const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+  if (!isIntN(Width, Value)) {
+    std::string Diagnostic = "out of range " + Description;
+
+    int64_t Min = minIntN(Width);
+    int64_t Max = maxIntN(Width);
+
+    Diagnostic += " (expected an integer in the range " + std::to_string(Min) +
+      " to " + std::to_string(Max) + ")";
+
+    if (Ctx) {
+      Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+    } else {
+      llvm_unreachable(Diagnostic.c_str());
+    }
+  }
+}
+
+void unsigned_width(unsigned Width, uint64_t Value, std::string Description,
+                    const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+  if (!isUIntN(Width, Value)) {
+    std::string Diagnostic = "out of range " + Description;
+
+    int64_t Max = maxUIntN(Width);
+
+    Diagnostic += " (expected an integer in the range 0 to " +
+      std::to_string(Max) + ")";
+
+    if (Ctx) {
+      Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+    } else {
+      llvm_unreachable(Diagnostic.c_str());
+    }
+  }
+}
+
+/// Adjusts the value of a branch target before fixup application.
+void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                  MCContext *Ctx = nullptr) {
+  // We have one extra bit of precision because the value is rightshifted by
+  // one.
+  unsigned_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
+
+  // Rightshifts the value by one.
+  AVR::fixups::adjustBranchTarget(Value);
+}
+
+/// Adjusts the value of a relative branch target before fixup application.
+void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                          MCContext *Ctx = nullptr) {
+  // We have one extra bit of precision because the value is rightshifted by
+  // one.
+  signed_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
+
+  Value -= 2;
+
+  // Rightshifts the value by one.
+  AVR::fixups::adjustBranchTarget(Value);
+}
+
+/// 22-bit absolute fixup.
+///
+/// Resolves to:
+/// 1001 kkkk 010k kkkk kkkk kkkk 111k kkkk
+///
+/// Offset of 0 (so the result is left shifted by 3 bits before application).
+void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                MCContext *Ctx = nullptr) {
+  adjustBranch(Size, Fixup, Value, Ctx);
+
+  auto top = Value & (0xf00000 << 6);   // the top four bits
+  auto middle = Value & (0x1ffff << 5); // the middle 13 bits
+  auto bottom = Value & 0x1f;           // end bottom 5 bits
+
+  Value = (top << 6) | (middle << 3) | (bottom << 0);
+}
+
+/// 7-bit PC-relative fixup.
+///
+/// Resolves to:
+/// 0000 00kk kkkk k000
+/// Offset of 0 (so the result is left shifted by 3 bits before application).
+void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                   MCContext *Ctx = nullptr) {
+  adjustRelativeBranch(Size, Fixup, Value, Ctx);
+
+  // Because the value may be negative, we must mask out the sign bits
+  Value &= 0x7f;
+}
+
+/// 12-bit PC-relative fixup.
+/// Yes, the fixup is 12 bits even though the name says otherwise.
+///
+/// Resolves to:
+/// 0000 kkkk kkkk kkkk
+/// Offset of 0 (so the result isn't left-shifted before application).
+void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+                    MCContext *Ctx = nullptr) {
+  adjustRelativeBranch(Size, Fixup, Value, Ctx);
+
+  // Because the value may be negative, we must mask out the sign bits
+  Value &= 0xfff;
+}
+
+/// 6-bit fixup for the immediate operand of the ADIW family of
+/// instructions.
+///
+/// Resolves to:
+/// 0000 0000 kk00 kkkk
+void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
+                  MCContext *Ctx = nullptr) {
+  unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
+
+  Value = ((Value & 0x30) << 2) | (Value & 0x0f);
+}
+
+/// 5-bit port number fixup on the SBIC family of instructions.
+///
+/// Resolves to:
+/// 0000 0000 AAAA A000
+void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
+                 MCContext *Ctx = nullptr) {
+  unsigned_width(5, Value, std::string("port number"), Fixup, Ctx);
+
+  Value &= 0x1f;
+
+  Value <<= 3;
+}
+
+/// 6-bit port number fixup on the `IN` family of instructions.
+///
+/// Resolves to:
+/// 1011 0AAd dddd AAAA
+void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
+                 MCContext *Ctx = nullptr) {
+  unsigned_width(6, Value, std::string("port number"), Fixup, Ctx);
+
+  Value = ((Value & 0x30) << 5) | (Value & 0x0f);
+}
+
+/// Adjusts a program memory address.
+/// This is a simple right-shift.
+void pm(uint64_t &Value) {
+  Value >>= 1;
+}
+
+/// Fixups relating to the LDI instruction.
+namespace ldi {
+
+/// Adjusts a value to fix up the immediate of an `LDI Rd, K` instruction.
+///
+/// Resolves to:
+/// 0000 KKKK 0000 KKKK
+/// Offset of 0 (so the result isn't left-shifted before application).
+void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+           MCContext *Ctx = nullptr) {
+  uint64_t upper = Value & 0xf0;
+  uint64_t lower = Value & 0x0f;
+
+  Value = (upper << 4) | lower;
+}
+
+void neg(uint64_t &Value) { Value *= -1; }
+
+void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value &= 0xff;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value & 0xff00) >> 8;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value & 0xff0000) >> 16;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+         MCContext *Ctx = nullptr) {
+  Value = (Value & 0xff000000) >> 24;
+  ldi::fixup(Size, Fixup, Value, Ctx);
+}
+
+} // end of ldi namespace
+} // end of adjust namespace
+
+namespace llvm {
+
+// Prepare value for the target space for it
+void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+                                     MCContext *Ctx) const {
+  // The size of the fixup in bits.
+  uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
+
+  unsigned Kind = Fixup.getKind();
+
+  switch (Kind) {
+  default:
+    llvm_unreachable("unhandled fixup");
+  case AVR::fixup_7_pcrel:
+    adjust::fixup_7_pcrel(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_13_pcrel:
+    adjust::fixup_13_pcrel(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_call:
+    adjust::fixup_call(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_ldi:
+    adjust::ldi::fixup(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_lo8_ldi:
+  case AVR::fixup_lo8_ldi_pm:
+    if (Kind == AVR::fixup_lo8_ldi_pm) adjust::pm(Value);
+
+    adjust::ldi::lo8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_hi8_ldi:
+  case AVR::fixup_hi8_ldi_pm:
+    if (Kind == AVR::fixup_hi8_ldi_pm) adjust::pm(Value);
+
+    adjust::ldi::hi8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_hh8_ldi:
+  case AVR::fixup_hh8_ldi_pm:
+    if (Kind == AVR::fixup_hh8_ldi_pm) adjust::pm(Value);
+
+    adjust::ldi::hh8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_ms8_ldi:
+    adjust::ldi::ms8(Size, Fixup, Value, Ctx);
+    break;
+
+  case AVR::fixup_lo8_ldi_neg:
+  case AVR::fixup_lo8_ldi_pm_neg:
+    if (Kind == AVR::fixup_lo8_ldi_pm_neg) adjust::pm(Value);
+
+    adjust::ldi::neg(Value);
+    adjust::ldi::lo8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_hi8_ldi_neg:
+  case AVR::fixup_hi8_ldi_pm_neg:
+    if (Kind == AVR::fixup_hi8_ldi_pm_neg) adjust::pm(Value);
+
+    adjust::ldi::neg(Value);
+    adjust::ldi::hi8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_hh8_ldi_neg:
+  case AVR::fixup_hh8_ldi_pm_neg:
+    if (Kind == AVR::fixup_hh8_ldi_pm_neg) adjust::pm(Value);
+
+    adjust::ldi::neg(Value);
+    adjust::ldi::hh8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_ms8_ldi_neg:
+    adjust::ldi::neg(Value);
+    adjust::ldi::ms8(Size, Fixup, Value, Ctx);
+    break;
+  case AVR::fixup_16:
+    adjust::unsigned_width(16, Value, std::string("port number"), Fixup, Ctx);
+
+    Value &= 0xffff;
+    break;
+  case AVR::fixup_6_adiw:
+    adjust::fixup_6_adiw(Fixup, Value, Ctx);
+    break;
+
+  case AVR::fixup_port5:
+    adjust::fixup_port5(Fixup, Value, Ctx);
+    break;
+
+  case AVR::fixup_port6:
+    adjust::fixup_port6(Fixup, Value, Ctx);
+    break;
+
+  // Fixups which do not require adjustments.
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    break;
+
+  case FK_GPRel_4:
+    llvm_unreachable("don't know how to adjust this fixup");
+    break;
+  }
+}
+
+MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createAVRELFObjectWriter(OS,
+                                  MCELFObjectTargetWriter::getOSABI(OSType));
+}
+
+void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+  if (Value == 0)
+    return; // Doesn't change encoding.
+
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+  // The number of bits in the fixup mask
+  auto NumBits = Info.TargetSize + Info.TargetOffset;
+  auto NumBytes = (NumBits / 8) + ((NumBits % 8) == 0 ? 0 : 1);
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i < NumBytes; ++i) {
+    uint8_t mask = (((Value >> (i * 8)) & 0xff));
+    Data[Offset + i] |= mask;
+  }
+}
+
+MCFixupKindInfo const &AVRAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  // NOTE: Many AVR fixups work on sets of non-contignous bits. We work around
+  // this by saying that the fixup is the size of the entire instruction.
+  const static MCFixupKindInfo Infos[AVR::NumTargetFixupKinds] = {
+      // This table *must* be in same the order of fixup_* kinds in
+      // AVRFixupKinds.h.
+      //
+      // name                    offset  bits  flags
+      {"fixup_32", 0, 32, 0},
+
+      {"fixup_7_pcrel", 3, 7, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_13_pcrel", 0, 12, MCFixupKindInfo::FKF_IsPCRel},
+
+      {"fixup_16", 0, 16, 0},
+      {"fixup_16_pm", 0, 16, 0},
+
+      {"fixup_ldi", 0, 8, 0},
+
+      {"fixup_lo8_ldi", 0, 8, 0},
+      {"fixup_hi8_ldi", 0, 8, 0},
+      {"fixup_hh8_ldi", 0, 8, 0},
+      {"fixup_ms8_ldi", 0, 8, 0},
+
+      {"fixup_lo8_ldi_neg", 0, 8, 0},
+      {"fixup_hi8_ldi_neg", 0, 8, 0},
+      {"fixup_hh8_ldi_neg", 0, 8, 0},
+      {"fixup_ms8_ldi_neg", 0, 8, 0},
+
+      {"fixup_lo8_ldi_pm", 0, 8, 0},
+      {"fixup_hi8_ldi_pm", 0, 8, 0},
+      {"fixup_hh8_ldi_pm", 0, 8, 0},
+
+      {"fixup_lo8_ldi_pm_neg", 0, 8, 0},
+      {"fixup_hi8_ldi_pm_neg", 0, 8, 0},
+      {"fixup_hh8_ldi_pm_neg", 0, 8, 0},
+
+      {"fixup_call", 0, 22, 0},
+
+      {"fixup_6", 0, 16, 0}, // non-contiguous
+      {"fixup_6_adiw", 0, 6, 0},
+
+      {"fixup_lo8_ldi_gs", 0, 8, 0},
+      {"fixup_hi8_ldi_gs", 0, 8, 0},
+
+      {"fixup_8", 0, 8, 0},
+      {"fixup_8_lo8", 0, 8, 0},
+      {"fixup_8_hi8", 0, 8, 0},
+      {"fixup_8_hlo8", 0, 8, 0},
+
+      {"fixup_sym_diff", 0, 32, 0},
+      {"fixup_16_ldst", 0, 16, 0},
+
+      {"fixup_lds_sts_16", 0, 16, 0},
+
+      {"fixup_port6", 0, 16, 0}, // non-contiguous
+      {"fixup_port5", 3, 5, 0},
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // If the count is not 2-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  assert((Count % 2) == 0 && "NOP instructions must be 2 bytes");
+
+  OW->WriteZeros(Count);
+  return true;
+}
+
+void AVRAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                      const MCAsmLayout &Layout,
+                                      const MCFixup &Fixup,
+                                      const MCFragment *DF,
+                                      const MCValue &Target, uint64_t &Value,
+                                      bool &IsResolved) {
+  switch ((unsigned) Fixup.getKind()) {
+  // Fixups which should always be recorded as relocations.
+  case AVR::fixup_7_pcrel:
+  case AVR::fixup_13_pcrel:
+  case AVR::fixup_call:
+    IsResolved = false;
+    break;
+  default:
+    // Parsed LLVM-generated temporary labels are already
+    // adjusted for instruction size, but normal labels aren't.
+    //
+    // To handle both cases, we simply un-adjust the temporary label
+    // case so it acts like all other labels.
+    if (Target.getSymA()->getSymbol().isTemporary())
+      Value += 2;
+
+    adjustFixupValue(Fixup, Value, &Asm.getContext());
+    break;
+  }
+}
+
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const llvm::MCTargetOptions &TO) {
+  return new AVRAsmBackend(TT.getOS());
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
new file mode 100644
index 000000000000..7ff4b8f350f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -0,0 +1,78 @@
+//===-- AVRAsmBackend.h - AVR Asm Backend  --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file The AVR assembly backend implementation.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_AVR_ASM_BACKEND_H
+#define LLVM_AVR_ASM_BACKEND_H
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+
+namespace llvm {
+
+class MCAssembler;
+class MCObjectWriter;
+class Target;
+
+struct MCFixupKindInfo;
+
+/// Utilities for manipulating generated AVR machine code.
+class AVRAsmBackend : public MCAsmBackend {
+public:
+
+  AVRAsmBackend(Triple::OSType OSType)
+      : MCAsmBackend(), OSType(OSType) {}
+
+  void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+                        MCContext *Ctx = nullptr) const;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  unsigned getNumFixupKinds() const override {
+    return AVR::NumTargetFixupKinds;
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    llvm_unreachable("RelaxInstruction() unimplemented");
+    return false;
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+private:
+  Triple::OSType OSType;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ASM_BACKEND_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
new file mode 100644
index 000000000000..161f305fd014
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -0,0 +1,127 @@
+//===-- AVRELFObjectWriter.cpp - AVR ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// Writes AVR machine code into an ELF32 object file.
+class AVRELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AVRELFObjectWriter(uint8_t OSABI);
+
+  virtual ~AVRELFObjectWriter() {}
+
+  unsigned getRelocType(MCContext &Ctx,
+                        const MCValue &Target,
+                        const MCFixup &Fixup,
+                        bool IsPCRel) const override;
+};
+
+AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(false, OSABI, ELF::EM_AVR, true, false) {}
+
+unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx,
+                                          const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  switch ((unsigned) Fixup.getKind()) {
+  case FK_Data_1:
+  case FK_Data_4:
+    llvm_unreachable("unsupported relocation type");
+  case FK_Data_2:
+    return ELF::R_AVR_16_PM;
+  case AVR::fixup_32:
+    return ELF::R_AVR_32;
+  case AVR::fixup_7_pcrel:
+    return ELF::R_AVR_7_PCREL;
+  case AVR::fixup_13_pcrel:
+    return ELF::R_AVR_13_PCREL;
+  case AVR::fixup_16:
+    return ELF::R_AVR_16;
+  case AVR::fixup_16_pm:
+    return ELF::R_AVR_16_PM;
+  case AVR::fixup_lo8_ldi:
+    return ELF::R_AVR_LO8_LDI;
+  case AVR::fixup_hi8_ldi:
+    return ELF::R_AVR_HI8_LDI;
+  case AVR::fixup_hh8_ldi:
+    return ELF::R_AVR_HH8_LDI;
+  case AVR::fixup_lo8_ldi_neg:
+    return ELF::R_AVR_LO8_LDI_NEG;
+  case AVR::fixup_hi8_ldi_neg:
+    return ELF::R_AVR_HI8_LDI_NEG;
+  case AVR::fixup_hh8_ldi_neg:
+    return ELF::R_AVR_HH8_LDI_NEG;
+  case AVR::fixup_lo8_ldi_pm:
+    return ELF::R_AVR_LO8_LDI_PM;
+  case AVR::fixup_hi8_ldi_pm:
+    return ELF::R_AVR_HI8_LDI_PM;
+  case AVR::fixup_hh8_ldi_pm:
+    return ELF::R_AVR_HH8_LDI_PM;
+  case AVR::fixup_lo8_ldi_pm_neg:
+    return ELF::R_AVR_LO8_LDI_PM_NEG;
+  case AVR::fixup_hi8_ldi_pm_neg:
+    return ELF::R_AVR_HI8_LDI_PM_NEG;
+  case AVR::fixup_hh8_ldi_pm_neg:
+    return ELF::R_AVR_HH8_LDI_PM_NEG;
+  case AVR::fixup_call:
+    return ELF::R_AVR_CALL;
+  case AVR::fixup_ldi:
+    return ELF::R_AVR_LDI;
+  case AVR::fixup_6:
+    return ELF::R_AVR_6;
+  case AVR::fixup_6_adiw:
+    return ELF::R_AVR_6_ADIW;
+  case AVR::fixup_ms8_ldi:
+    return ELF::R_AVR_MS8_LDI;
+  case AVR::fixup_ms8_ldi_neg:
+    return ELF::R_AVR_MS8_LDI_NEG;
+  case AVR::fixup_lo8_ldi_gs:
+    return ELF::R_AVR_LO8_LDI_GS;
+  case AVR::fixup_hi8_ldi_gs:
+    return ELF::R_AVR_HI8_LDI_GS;
+  case AVR::fixup_8:
+    return ELF::R_AVR_8;
+  case AVR::fixup_8_lo8:
+    return ELF::R_AVR_8_LO8;
+  case AVR::fixup_8_hi8:
+    return ELF::R_AVR_8_HI8;
+  case AVR::fixup_8_hlo8:
+    return ELF::R_AVR_8_HLO8;
+  case AVR::fixup_sym_diff:
+    return ELF::R_AVR_SYM_DIFF;
+  case AVR::fixup_16_ldst:
+    return ELF::R_AVR_16_LDST;
+  case AVR::fixup_lds_sts_16:
+    return ELF::R_AVR_LDS_STS_16;
+  case AVR::fixup_port6:
+    return ELF::R_AVR_PORT6;
+  case AVR::fixup_port5:
+    return ELF::R_AVR_PORT5;
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  }
+}
+
+MCObjectWriter *createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new AVRELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, true);
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
new file mode 100644
index 000000000000..481de320b22f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -0,0 +1,66 @@
+#include "AVRELFStreamer.h"
+
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include "AVRMCTargetDesc.h"
+
+namespace llvm {
+
+static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
+  unsigned EFlags = 0;
+
+  // Set architecture
+  if (Features[AVR::ELFArchAVR1])
+    EFlags |= ELF::EF_AVR_ARCH_AVR1;
+  else if (Features[AVR::ELFArchAVR2])
+    EFlags |= ELF::EF_AVR_ARCH_AVR2;
+  else if (Features[AVR::ELFArchAVR25])
+    EFlags |= ELF::EF_AVR_ARCH_AVR25;
+  else if (Features[AVR::ELFArchAVR3])
+    EFlags |= ELF::EF_AVR_ARCH_AVR3;
+  else if (Features[AVR::ELFArchAVR31])
+    EFlags |= ELF::EF_AVR_ARCH_AVR31;
+  else if (Features[AVR::ELFArchAVR35])
+    EFlags |= ELF::EF_AVR_ARCH_AVR35;
+  else if (Features[AVR::ELFArchAVR4])
+    EFlags |= ELF::EF_AVR_ARCH_AVR4;
+  else if (Features[AVR::ELFArchAVR5])
+    EFlags |= ELF::EF_AVR_ARCH_AVR5;
+  else if (Features[AVR::ELFArchAVR51])
+    EFlags |= ELF::EF_AVR_ARCH_AVR51;
+  else if (Features[AVR::ELFArchAVR6])
+    EFlags |= ELF::EF_AVR_ARCH_AVR6;
+  else if (Features[AVR::ELFArchAVRTiny])
+    EFlags |= ELF::EF_AVR_ARCH_AVRTINY;
+  else if (Features[AVR::ELFArchXMEGA1])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA1;
+  else if (Features[AVR::ELFArchXMEGA2])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA2;
+  else if (Features[AVR::ELFArchXMEGA3])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA3;
+  else if (Features[AVR::ELFArchXMEGA4])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA4;
+  else if (Features[AVR::ELFArchXMEGA5])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA5;
+  else if (Features[AVR::ELFArchXMEGA6])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA6;
+  else if (Features[AVR::ELFArchXMEGA7])
+    EFlags |= ELF::EF_AVR_ARCH_XMEGA7;
+
+  return EFlags;
+}
+
+AVRELFStreamer::AVRELFStreamer(MCStreamer &S,
+                               const MCSubtargetInfo &STI)
+    : AVRTargetStreamer(S) {
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+
+  EFlags |= getEFlagsForFeatureSet(STI.getFeatureBits());
+
+  MCA.setELFHeaderEFlags(EFlags);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
new file mode 100644
index 000000000000..e5df6cc34e40
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.h
@@ -0,0 +1,29 @@
+//===----- AVRELFStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ELF_STREAMER_H
+#define LLVM_AVR_ELF_STREAMER_H
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+/// A target streamer for an AVR ELF object file.
+class AVRELFStreamer : public AVRTargetStreamer {
+public:
+  AVRELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
new file mode 100644
index 000000000000..d3bd52d343fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
@@ -0,0 +1,149 @@
+//===-- AVRFixupKinds.h - AVR Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_FIXUP_KINDS_H
+#define LLVM_AVR_FIXUP_KINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AVR {
+
+/// The set of supported fixups.
+///
+/// Although most of the current fixup types reflect a unique relocation
+/// one can have multiple fixup types for a given relocation and thus need
+/// to be uniquely named.
+///
+/// \note This table *must* be in the same order of
+///       MCFixupKindInfo Infos[AVR::NumTargetFixupKinds]
+///       in `AVRAsmBackend.cpp`.
+enum Fixups {
+  /// A 32-bit AVR fixup.
+  fixup_32 = FirstTargetFixupKind,
+
+  /// A 7-bit PC-relative fixup for the family of conditional
+  /// branches which take 7-bit targets (BRNE,BRGT,etc).
+  fixup_7_pcrel,
+  /// A 12-bit PC-relative fixup for the family of branches
+  /// which take 12-bit targets (RJMP,RCALL,etc).
+  /// \note Although the fixup is labelled as 13 bits, it
+  ///       is actually only encoded in 12. The reason for
+  ///       The nonmenclature is that AVR branch targets are
+  ///       rightshifted by 1, because instructions are always
+  ///       aligned to 2 bytes, so the 0'th bit is always 0.
+  ///       This way there is 13-bits of precision.
+  fixup_13_pcrel,
+
+  /// A 16-bit address.
+  fixup_16,
+  /// A 16-bit program memory address.
+  fixup_16_pm,
+
+  /// Replaces the 8-bit immediate with another value.
+  fixup_ldi,
+
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the lower 8 bits of a 16-bit value (bits 0-7).
+  fixup_lo8_ldi,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a 16-bit value (bits 8-15).
+  fixup_hi8_ldi,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a 24-bit value (bits 16-23).
+  fixup_hh8_ldi,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a 32-bit value (bits 24-31).
+  fixup_ms8_ldi,
+
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the lower 8 bits of a negated 16-bit value (bits 0-7).
+  fixup_lo8_ldi_neg,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a negated 16-bit value (bits 8-15).
+  fixup_hi8_ldi_neg,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a negated negated 24-bit value (bits 16-23).
+  fixup_hh8_ldi_neg,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a negated negated 32-bit value (bits 24-31).
+  fixup_ms8_ldi_neg,
+
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the lower 8 bits of a 16-bit program memory address value (bits 0-7).
+  fixup_lo8_ldi_pm,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a 16-bit program memory address value (bits
+  /// 8-15).
+  fixup_hi8_ldi_pm,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a 24-bit program memory address value (bits
+  /// 16-23).
+  fixup_hh8_ldi_pm,
+
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the lower 8 bits of a negated 16-bit program memory address value
+  /// (bits 0-7).
+  fixup_lo8_ldi_pm_neg,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a negated 16-bit program memory address value
+  /// (bits 8-15).
+  fixup_hi8_ldi_pm_neg,
+  /// Replaces the immediate operand of a 16-bit `Rd, K` instruction
+  /// with the upper 8 bits of a negated 24-bit program memory address value
+  /// (bits 16-23).
+  fixup_hh8_ldi_pm_neg,
+
+  /// A 22-bit fixup for the target of a `CALL k` or `JMP k` instruction.
+  fixup_call,
+
+  fixup_6,
+  /// A symbol+addr fixup for the `LDD <x>+<n>, <r>" family of instructions.
+  fixup_6_adiw,
+
+  fixup_lo8_ldi_gs,
+  fixup_hi8_ldi_gs,
+
+  fixup_8,
+  fixup_8_lo8,
+  fixup_8_hi8,
+  fixup_8_hlo8,
+
+  /// Fixup to calculate the difference between two symbols.
+  /// Is the only stateful fixup. We do not support it yet.
+  fixup_sym_diff,
+  fixup_16_ldst,
+
+  fixup_lds_sts_16,
+
+  /// A 6-bit port address.
+  fixup_port6,
+  /// A 5-bit port address.
+  fixup_port5,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+namespace fixups {
+
+/// Adjusts the value of a branch target.
+/// All branch targets in AVR are rightshifted by 1 to take advantage
+/// of the fact that all instructions are aligned to addresses of size
+/// 2, so bit 0 of an address is always 0. This gives us another bit
+/// of precision.
+/// \param[in,out] The target to adjust.
+template <typename T> inline void adjustBranchTarget(T &val) { val >>= 1; }
+
+} // end of namespace fixups
+}
+} // end of namespace llvm::AVR
+
+#endif // LLVM_AVR_FIXUP_KINDS_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
new file mode 100644
index 000000000000..cca3bcc4968a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -0,0 +1,28 @@
+//===-- AVRMCAsmInfo.cpp - AVR asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AVRMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+namespace llvm {
+
+AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
+  PointerSize = 2;
+  CalleeSaveStackSlotSize = 2;
+  CommentString = ";";
+  PrivateGlobalPrefix = ".L";
+  UsesELFSectionDirectiveForBSS = true;
+}
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
new file mode 100644
index 000000000000..cc2207a3cfae
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- AVRMCAsmInfo.h - AVR asm properties ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AVRMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_ASM_INFO_H
+#define LLVM_AVR_ASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+
+class Triple;
+
+/// Specifies the format of AVR assembly files.
+class AVRMCAsmInfo : public MCAsmInfo {
+public:
+  explicit AVRMCAsmInfo(const Triple &TT);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_ASM_INFO_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
new file mode 100644
index 000000000000..e6dc8868c705
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -0,0 +1,304 @@
+//===-- AVRMCCodeEmitter.cpp - Convert AVR Code to Machine Code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AVRMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCCodeEmitter.h"
+
+#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCTargetDesc.h"
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+#define GET_INSTRMAP_INFO
+#include "AVRGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+
+/// Performs a post-encoding step on a `LD` or `ST` instruction.
+///
+/// The encoding of the LD/ST family of instructions is inconsistent w.r.t
+/// the pointer register and the addressing mode.
+///
+/// The permutations of the format are as followed:
+/// ld Rd, X    `1001 000d dddd 1100`
+/// ld Rd, X+   `1001 000d dddd 1101`
+/// ld Rd, -X   `1001 000d dddd 1110`
+///
+/// ld Rd, Y    `1000 000d dddd 1000`
+/// ld Rd, Y+   `1001 000d dddd 1001`
+/// ld Rd, -Y   `1001 000d dddd 1010`
+///
+/// ld Rd, Z    `1000 000d dddd 0000`
+/// ld Rd, Z+   `1001 000d dddd 0001`
+/// ld Rd, -Z   `1001 000d dddd 0010`
+///                 ^
+///                 |
+/// Note this one inconsistent bit - it is 1 sometimes and 0 at other times.
+/// There is no logical pattern. Looking at a truth table, the following
+/// formula can be derived to fit the pattern:
+//
+/// ```
+/// inconsistent_bit = is_predec OR is_postinc OR is_reg_x
+/// ```
+//
+/// We manually set this bit in this post encoder method.
+unsigned
+AVRMCCodeEmitter::loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue,
+                                       const MCSubtargetInfo &STI) const {
+
+  assert(MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
+         "the load/store operands must be registers");
+
+  unsigned Opcode = MI.getOpcode();
+
+  // check whether either of the registers are the X pointer register.
+  bool IsRegX = MI.getOperand(0).getReg() == AVR::R27R26 ||
+                  MI.getOperand(1).getReg() == AVR::R27R26;
+
+  bool IsPredec = Opcode == AVR::LDRdPtrPd || Opcode == AVR::STPtrPdRr;
+  bool IsPostinc = Opcode == AVR::LDRdPtrPi || Opcode == AVR::STPtrPiRr;
+
+  // Check if we need to set the inconsistent bit
+  if (IsRegX || IsPredec || IsPostinc) {
+    EncodedValue |= (1 << 12);
+  }
+
+  return EncodedValue;
+}
+
+template <AVR::Fixups Fixup>
+unsigned
+AVRMCCodeEmitter::encodeRelCondBrTarget(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                     MCFixupKind(Fixup), MI.getLoc()));
+    return 0;
+  }
+
+  assert(MO.isImm());
+
+  // Take the size of the current instruction away.
+  // With labels, this is implicitly done.
+  auto target = MO.getImm();
+  AVR::fixups::adjustBranchTarget(target);
+  return target;
+}
+
+unsigned AVRMCCodeEmitter::encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  auto MO = MI.getOperand(OpNo);
+
+  // The operand should be a pointer register.
+  assert(MO.isReg());
+
+  switch (MO.getReg()) {
+  case AVR::R27R26: return 0x03; // X: 0b11
+  case AVR::R29R28: return 0x02; // Y: 0b10
+  case AVR::R31R30: return 0x00; // Z: 0b00
+  default:
+    llvm_unreachable("invalid pointer register");
+  }
+}
+
+/// Encodes a `memri` operand.
+/// The operand is 7-bits.
+/// * The lower 6 bits is the immediate
+/// * The upper bit is the pointer register bit (Z=0,Y=1)
+unsigned AVRMCCodeEmitter::encodeMemri(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  auto RegOp = MI.getOperand(OpNo);
+  auto OffsetOp = MI.getOperand(OpNo + 1);
+
+  assert(RegOp.isReg() && "Expected register operand");
+
+  uint8_t RegBit = 0;
+
+  switch (RegOp.getReg()) {
+  default:
+    llvm_unreachable("Expected either Y or Z register");
+  case AVR::R31R30:
+    RegBit = 0;
+    break; // Z register
+  case AVR::R29R28:
+    RegBit = 1;
+    break; // Y register
+  }
+
+  int8_t OffsetBits;
+
+  if (OffsetOp.isImm()) {
+    OffsetBits = OffsetOp.getImm();
+  } else if (OffsetOp.isExpr()) {
+    OffsetBits = 0;
+    Fixups.push_back(MCFixup::create(0, OffsetOp.getExpr(),
+                     MCFixupKind(AVR::fixup_6), MI.getLoc()));
+  } else {
+    llvm_unreachable("invalid value for offset");
+  }
+
+  return (RegBit << 6) | OffsetBits;
+}
+
+unsigned AVRMCCodeEmitter::encodeComplement(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  // The operand should be an immediate.
+  assert(MI.getOperand(OpNo).isImm());
+
+  auto Imm = MI.getOperand(OpNo).getImm();
+  return (~0) - Imm;
+}
+
+template <AVR::Fixups Fixup>
+unsigned AVRMCCodeEmitter::encodeImm(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  auto MO = MI.getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    if (isa<AVRMCExpr>(MO.getExpr())) {
+      // If the expression is already an AVRMCExpr (i.e. a lo8(symbol),
+      // we shouldn't perform any more fixups. Without this check, we would
+      // instead create a fixup to the symbol named 'lo8(symbol)' which
+      // is not correct.
+      return getExprOpValue(MO.getExpr(), Fixups, STI);
+    }
+
+    MCFixupKind FixupKind = static_cast<MCFixupKind>(Fixup);
+    Fixups.push_back(MCFixup::create(0, MO.getExpr(), FixupKind, MI.getLoc()));
+
+    return 0;
+  }
+
+  assert(MO.isImm());
+  return MO.getImm();
+}
+
+unsigned AVRMCCodeEmitter::encodeCallTarget(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  auto MO = MI.getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    MCFixupKind FixupKind = static_cast<MCFixupKind>(AVR::fixup_call);
+    Fixups.push_back(MCFixup::create(0, MO.getExpr(), FixupKind, MI.getLoc()));
+    return 0;
+  }
+
+  assert(MO.isImm());
+
+  auto Target = MO.getImm();
+  AVR::fixups::adjustBranchTarget(Target);
+  return Target;
+}
+
+unsigned AVRMCCodeEmitter::getExprOpValue(const MCExpr *Expr,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+
+  MCExpr::ExprKind Kind = Expr->getKind();
+
+  if (Kind == MCExpr::Binary) {
+    Expr = static_cast<const MCBinaryExpr *>(Expr)->getLHS();
+    Kind = Expr->getKind();
+  }
+
+  if (Kind == MCExpr::Target) {
+    AVRMCExpr const *AVRExpr = cast<AVRMCExpr>(Expr);
+    int64_t Result;
+    if (AVRExpr->evaluateAsConstant(Result)) {
+      return Result;
+    }
+
+    MCFixupKind FixupKind = static_cast<MCFixupKind>(AVRExpr->getFixupKind());
+    Fixups.push_back(MCFixup::create(0, AVRExpr, FixupKind));
+    return 0;
+  }
+
+  assert(Kind == MCExpr::SymbolRef);
+  return 0;
+}
+
+unsigned AVRMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  if (MO.isImm()) return static_cast<unsigned>(MO.getImm());
+
+  if (MO.isFPImm())
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                                     .bitcastToAPInt()
+                                     .getHiBits(32)
+                                     .getLimitedValue());
+
+  // MO must be an Expr.
+  assert(MO.isExpr());
+
+  return getExprOpValue(MO.getExpr(), Fixups, STI);
+}
+
+void AVRMCCodeEmitter::emitInstruction(uint64_t Val, unsigned Size,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &OS) const {
+  const uint16_t *Words = reinterpret_cast<uint16_t const *>(&Val);
+  size_t WordCount = Size / 2;
+
+  for (int64_t i = WordCount - 1; i >= 0; --i) {
+    uint16_t Word = Words[i];
+
+    OS << (uint8_t) ((Word & 0x00ff) >> 0);
+    OS << (uint8_t) ((Word & 0xff00) >> 8);
+  }
+}
+
+void AVRMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+
+  // Get byte count of instruction
+  unsigned Size = Desc.getSize();
+
+  assert(Size > 0 && "Instruction size cannot be zero");
+
+  uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI);
+  emitInstruction(BinaryOpCode, Size, STI, OS);
+}
+
+MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      MCContext &Ctx) {
+  return new AVRMCCodeEmitter(MCII, Ctx);
+}
+
+#include "AVRGenMCCodeEmitter.inc"
+
+} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
new file mode 100644
index 000000000000..5fa425c296a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -0,0 +1,115 @@
+//===-- AVRMCCodeEmitter.h - Convert AVR Code to Machine Code -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AVRMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_AVR_CODE_EMITTER_H
+#define LLVM_AVR_CODE_EMITTER_H
+
+#include "AVRFixupKinds.h"
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+#define GET_INSTRINFO_OPERAND_TYPES_ENUM
+#include "AVRGenInstrInfo.inc"
+
+namespace llvm {
+
+class MCContext;
+class MCExpr;
+class MCFixup;
+class MCInst;
+class MCInstrInfo;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+/// Writes AVR machine code to a stream.
+class AVRMCCodeEmitter : public MCCodeEmitter {
+public:
+  AVRMCCodeEmitter(const MCInstrInfo &MCII, MCContext &Ctx)
+      : MCII(MCII), Ctx(Ctx) {}
+
+private:
+  /// Finishes up encoding an LD/ST instruction.
+  /// The purpose of this function is to set an bit in the instruction
+  /// which follows no logical pattern. See the implementation for details.
+  unsigned loadStorePostEncoder(const MCInst &MI, unsigned EncodedValue,
+                                const MCSubtargetInfo &STI) const;
+
+  /// Gets the encoding for a conditional branch target.
+  template <AVR::Fixups Fixup>
+  unsigned encodeRelCondBrTarget(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// Encodes the `PTRREGS` operand to a load or store instruction.
+  unsigned encodeLDSTPtrReg(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  /// Encodes a `register+immediate` operand for `LDD`/`STD`.
+  unsigned encodeMemri(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const;
+
+  /// Takes the compliment of a number (~0 - val).
+  unsigned encodeComplement(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  /// Encodes an immediate value with a given fixup.
+  template <AVR::Fixups Fixup>
+  unsigned encodeImm(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const;
+
+  /// Gets the encoding of the target for the `CALL k` instruction.
+  unsigned encodeCallTarget(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  /// TableGen'ed function to get the binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+
+  /// Returns the binary encoding of operand.
+  ///
+  /// If the machine operand requires relocation, the relocation is recorded
+  /// and zero is returned.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  void emitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+                       raw_ostream &OS) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  AVRMCCodeEmitter(const AVRMCCodeEmitter &) = delete;
+  void operator=(const AVRMCCodeEmitter &) = delete;
+
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+};
+
+} // end namespace of llvm.
+
+#endif // LLVM_AVR_CODE_EMITTER_H
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
new file mode 100644
index 000000000000..400296b8409b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -0,0 +1,189 @@
+//===-- AVRMCExpr.cpp - AVR specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRMCExpr.h"
+
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
+
+namespace llvm {
+
+namespace {
+
+const struct ModifierEntry {
+  const char * const Spelling;
+  AVRMCExpr::VariantKind VariantKind;
+} ModifierNames[] = {
+    {"lo8", AVRMCExpr::VK_AVR_LO8},       {"hi8", AVRMCExpr::VK_AVR_HI8},
+    {"hh8", AVRMCExpr::VK_AVR_HH8}, // synonym with hlo8
+    {"hlo8", AVRMCExpr::VK_AVR_HH8},      {"hhi8", AVRMCExpr::VK_AVR_HHI8},
+
+    {"pm_lo8", AVRMCExpr::VK_AVR_PM_LO8}, {"pm_hi8", AVRMCExpr::VK_AVR_PM_HI8},
+    {"pm_hh8", AVRMCExpr::VK_AVR_PM_HH8},
+};
+
+} // end of anonymous namespace
+
+const AVRMCExpr *AVRMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                                   bool Negated, MCContext &Ctx) {
+  return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
+}
+
+void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  assert(Kind != VK_AVR_None);
+
+  if (isNegated())
+    OS << '-';
+
+  OS << getName() << '(';
+  getSubExpr()->print(OS, MAI);
+  OS << ')';
+}
+
+bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
+  MCValue Value;
+
+  bool isRelocatable =
+      getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr);
+
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = evaluateAsInt64(Value.getConstant());
+    return true;
+  }
+
+  return false;
+}
+
+bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+                                          const MCAsmLayout *Layout,
+                                          const MCFixup *Fixup) const {
+  MCValue Value;
+  bool isRelocatable = SubExpr->evaluateAsRelocatable(Value, Layout, Fixup);
+
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+  } else {
+    if (!Layout) return false;
+
+    MCContext &Context = Layout->getAssembler().getContext();
+    const MCSymbolRefExpr *Sym = Value.getSymA();
+    MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+    if (Modifier != MCSymbolRefExpr::VK_None)
+      return false;
+
+    Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context);
+    Result = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+  }
+
+  return true;
+}
+
+int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
+  if (Negated)
+    Value *= -1;
+
+  switch (Kind) {
+  case AVRMCExpr::VK_AVR_LO8:
+    break;
+  case AVRMCExpr::VK_AVR_HI8:
+    Value >>= 8;
+    break;
+  case AVRMCExpr::VK_AVR_HH8:
+    Value >>= 16;
+    break;
+  case AVRMCExpr::VK_AVR_HHI8:
+    Value >>= 24;
+    break;
+  case AVRMCExpr::VK_AVR_PM_LO8:
+    Value >>= 1;
+    break;
+  case AVRMCExpr::VK_AVR_PM_HI8:
+    Value >>= 9;
+    break;
+  case AVRMCExpr::VK_AVR_PM_HH8:
+    Value >>= 17;
+    break;
+
+  case AVRMCExpr::VK_AVR_None:
+    llvm_unreachable("Uninitialized expression.");
+  }
+  return static_cast<uint64_t>(Value) & 0xff;
+}
+
+AVR::Fixups AVRMCExpr::getFixupKind() const {
+  AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
+
+  switch (getKind()) {
+  case VK_AVR_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
+    break;
+  case VK_AVR_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
+    break;
+  case VK_AVR_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
+    break;
+  case VK_AVR_HHI8:
+    Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
+    break;
+
+  case VK_AVR_PM_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
+    break;
+  case VK_AVR_PM_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
+    break;
+  case VK_AVR_PM_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
+    break;
+
+  case VK_AVR_None:
+    llvm_unreachable("Uninitialized expression");
+  }
+
+  return Kind;
+}
+
+void AVRMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
+
+const char *AVRMCExpr::getName() const {
+  const auto &Modifier = std::find_if(
+      std::begin(ModifierNames), std::end(ModifierNames),
+      [this](ModifierEntry const &Mod) { return Mod.VariantKind == Kind; });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->Spelling;
+  }
+  return nullptr;
+}
+
+AVRMCExpr::VariantKind AVRMCExpr::getKindByName(StringRef Name) {
+  const auto &Modifier = std::find_if(
+      std::begin(ModifierNames), std::end(ModifierNames),
+      [&Name](ModifierEntry const &Mod) { return Mod.Spelling == Name; });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->VariantKind;
+  }
+  return VK_AVR_None;
+}
+
+} // end of namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
new file mode 100644
index 000000000000..be565a8be340
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -0,0 +1,88 @@
+//===-- AVRMCExpr.h - AVR specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCEXPR_H
+#define LLVM_AVR_MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+#include "MCTargetDesc/AVRFixupKinds.h"
+
+namespace llvm {
+
+/// A expression in AVR machine code.
+class AVRMCExpr : public MCTargetExpr {
+public:
+  /// Specifies the type of an expression.
+  enum VariantKind {
+    VK_AVR_None,
+
+    VK_AVR_HI8,  ///< Corresponds to `hi8()`.
+    VK_AVR_LO8,  ///< Corresponds to `lo8()`.
+    VK_AVR_HH8,  ///< Corresponds to `hlo8() and hh8()`.
+    VK_AVR_HHI8, ///< Corresponds to `hhi8()`.
+
+    VK_AVR_PM_LO8, ///< Corresponds to `pm_lo8()`.
+    VK_AVR_PM_HI8, ///< Corresponds to `pm_hi8()`.
+    VK_AVR_PM_HH8  ///< Corresponds to `pm_hh8()`.
+  };
+
+public:
+  /// Creates an AVR machine code expression.
+  static const AVRMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                 bool isNegated, MCContext &Ctx);
+
+  /// Gets the type of the expression.
+  VariantKind getKind() const { return Kind; }
+  /// Gets the name of the expression.
+  const char *getName() const;
+  const MCExpr *getSubExpr() const { return SubExpr; }
+  /// Gets the fixup which corresponds to the expression.
+  AVR::Fixups getFixupKind() const;
+  /// Evaluates the fixup as a constant value.
+  bool evaluateAsConstant(int64_t &Result) const;
+
+  bool isNegated() const { return Negated; }
+  void setNegated(bool negated = true) { Negated = negated; }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+
+  void visitUsedExpr(MCStreamer &streamer) const override;
+
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+public:
+  static VariantKind getKindByName(StringRef Name);
+
+private:
+  int64_t evaluateAsInt64(int64_t Value) const;
+
+  const VariantKind Kind;
+  const MCExpr *SubExpr;
+  bool Negated;
+
+private:
+  explicit AVRMCExpr(VariantKind Kind, const MCExpr *Expr, bool Negated)
+      : Kind(Kind), SubExpr(Expr), Negated(Negated) {}
+  ~AVRMCExpr() {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_MCEXPR_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
new file mode 100644
index 000000000000..a4fa5c0a9310
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -0,0 +1,121 @@
+//===-- AVRMCTargetDesc.cpp - AVR Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRELFStreamer.h"
+#include "AVRMCAsmInfo.h"
+#include "AVRMCTargetDesc.h"
+#include "AVRTargetStreamer.h"
+#include "InstPrinter/AVRInstPrinter.h"
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "AVRGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AVRGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AVRGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createAVRMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAVRMCInstrInfo(X);
+
+  return X;
+}
+
+static MCRegisterInfo *createAVRMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAVRMCRegisterInfo(X, 0);
+
+  return X;
+}
+
+static MCSubtargetInfo *createAVRMCSubtargetInfo(const Triple &TT,
+                                                 StringRef CPU, StringRef FS) {
+  return createAVRMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createAVRMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0) {
+    return new AVRInstPrinter(MAI, MII, MRI);
+  }
+
+  return nullptr;
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCTargetStreamer *
+createAVRObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  return new AVRELFStreamer(S, STI);
+}
+
+static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS,
+                                                   MCInstPrinter *InstPrint,
+                                                   bool isVerboseAsm) {
+  return new AVRTargetAsmStreamer(S);
+}
+
+extern "C" void LLVMInitializeAVRTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<AVRMCAsmInfo> X(getTheAVRTarget());
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(getTheAVRTarget(), createAVRMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(getTheAVRTarget(), createAVRMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheAVRTarget(),
+                                          createAVRMCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(getTheAVRTarget(),
+                                        createAVRMCInstPrinter);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(getTheAVRTarget(), createAVRMCCodeEmitter);
+
+  // Register the ELF streamer
+  TargetRegistry::RegisterELFStreamer(getTheAVRTarget(), createMCStreamer);
+
+  // Register the obj target streamer.
+  TargetRegistry::RegisterObjectTargetStreamer(getTheAVRTarget(),
+                                               createAVRObjectTargetStreamer);
+
+  // Register the asm target streamer.
+  TargetRegistry::RegisterAsmTargetStreamer(getTheAVRTarget(),
+                                            createMCAsmTargetStreamer);
+
+  // Register the asm backend (as little endian).
+  TargetRegistry::RegisterMCAsmBackend(getTheAVRTarget(), createAVRAsmBackend);
+}
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
new file mode 100644
index 000000000000..41a574767910
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -0,0 +1,59 @@
+//===-- AVRMCTargetDesc.h - AVR Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_MCTARGET_DESC_H
+#define LLVM_AVR_MCTARGET_DESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheAVRTarget();
+
+/// Creates a machine code emitter for AVR.
+MCCodeEmitter *createAVRMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      MCContext &Ctx);
+
+/// Creates an assembly backend for AVR.
+MCAsmBackend *createAVRAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const llvm::MCTargetOptions &TO);
+
+/// Creates an ELF object writer for AVR.
+MCObjectWriter *createAVRELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+
+} // end namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "AVRGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AVRGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AVRGenSubtargetInfo.inc"
+
+#endif // LLVM_AVR_MCTARGET_DESC_H
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
new file mode 100644
index 000000000000..a2d8c16eeb8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -0,0 +1,24 @@
+//===-- AVRTargetStreamer.cpp - AVR Target Streamer Methods ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AVR specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AVRTargetStreamer.h"
+
+namespace llvm {
+
+AVRTargetStreamer::AVRTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+AVRTargetAsmStreamer::AVRTargetAsmStreamer(MCStreamer &S)
+    : AVRTargetStreamer(S) {}
+
+} // end namespace llvm
+
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
new file mode 100644
index 000000000000..99a536699ae9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.h
@@ -0,0 +1,32 @@
+//===-- AVRTargetStreamer.h - AVR Target Streamer --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AVR_TARGET_STREAMER_H
+#define LLVM_AVR_TARGET_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCStreamer;
+
+/// A generic AVR target output stream.
+class AVRTargetStreamer : public MCTargetStreamer {
+public:
+  explicit AVRTargetStreamer(MCStreamer &S);
+};
+
+/// A target streamer for textual AVR assembly code.
+class AVRTargetAsmStreamer : public AVRTargetStreamer {
+public:
+  explicit AVRTargetAsmStreamer(MCStreamer &S);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_AVR_TARGET_STREAMER_H
diff --git a/contrib/llvm/lib/Target/AVR/README.md b/contrib/llvm/lib/Target/AVR/README.md
new file mode 100644
index 000000000000..bd8b453aa81e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/README.md
@@ -0,0 +1,8 @@
+# AVR backend
+
+This experimental backend is for the 8-bit Atmel [AVR](https://en.wikipedia.org/wiki/Atmel_AVR) microcontroller.
+
+## Useful links
+
+* [Unresolved bugs](https://llvm.org/bugs/buglist.cgi?product=libraries&component=Backend%3A%20AVR&resolution=---&list_id=109466)
+* [Architecture notes](https://github.com/avr-llvm/architecture)
diff --git a/contrib/llvm/lib/Target/AVR/TODO.md b/contrib/llvm/lib/Target/AVR/TODO.md
new file mode 100644
index 000000000000..3a333355646d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/TODO.md
@@ -0,0 +1,7 @@
+# Write an XFAIL test for this `FIXME` in `AVRInstrInfo.td`
+
+```
+// :FIXME: DAGCombiner produces an shl node after legalization from these seq:
+// BR_JT -> (mul x, 2) -> (shl x, 1)
+```
+
diff --git a/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
new file mode 100644
index 000000000000..36cecaa7ac7a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+namespace llvm {
+Target &getTheAVRTarget() {
+  static Target TheAVRTarget;
+  return TheAVRTarget;
+}
+}
+
+extern "C" void LLVMInitializeAVRTargetInfo() {
+  llvm::RegisterTarget<llvm::Triple::avr> X(llvm::getTheAVRTarget(), "avr",
+                                            "Atmel AVR Microcontroller");
+}
+
diff --git a/contrib/llvm/lib/Target/BPF/BPF.h b/contrib/llvm/lib/Target/BPF/BPF.h
new file mode 100644
index 000000000000..4a0cb20357c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPF.h
@@ -0,0 +1,22 @@
+//===-- BPF.h - Top-level interface for BPF representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPF_H
+#define LLVM_LIB_TARGET_BPF_BPF_H
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class BPFTargetMachine;
+
+FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td
new file mode 100644
index 000000000000..11abe520c506
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPF.td
@@ -0,0 +1,44 @@
+//===-- BPF.td - Describe the BPF Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "BPFRegisterInfo.td"
+include "BPFCallingConv.td"
+include "BPFInstrInfo.td"
+
+def BPFInstrInfo : InstrInfo;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+def BPFInstPrinter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def BPFAsmParser : AsmParser {
+  bit HasMnemonicFirst = 0;
+}
+
+def BPFAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string Name = "BPF";
+  string BreakCharacters = ".";
+  string TokenizingCharacters = "#()[]=:.<>!+*";
+}
+
+def BPF : Target {
+  let InstructionSet = BPFInstrInfo;
+  let AssemblyWriters = [BPFInstPrinter];
+  let AssemblyParsers = [BPFAsmParser];
+  let AssemblyParserVariants = [BPFAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
new file mode 100644
index 000000000000..c5201465e074
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -0,0 +1,61 @@
+//===-- BPFAsmPrinter.cpp - BPF LLVM assembly writer ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the BPF assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFMCInstLower.h"
+#include "BPFTargetMachine.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+class BPFAsmPrinter : public AsmPrinter {
+public:
+  explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
+
+  StringRef getPassName() const override { return "BPF Assembly Printer"; }
+
+  void EmitInstruction(const MachineInstr *MI) override;
+};
+}
+
+void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+
+  BPFMCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeBPFAsmPrinter() {
+  RegisterAsmPrinter<BPFAsmPrinter> X(getTheBPFleTarget());
+  RegisterAsmPrinter<BPFAsmPrinter> Y(getTheBPFbeTarget());
+  RegisterAsmPrinter<BPFAsmPrinter> Z(getTheBPFTarget());
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFCallingConv.td b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
new file mode 100644
index 000000000000..8cec6fa54698
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFCallingConv.td
@@ -0,0 +1,29 @@
+//===-- BPFCallingConv.td - Calling Conventions BPF --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the BPF architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// BPF 64-bit C return-value convention.
+def RetCC_BPF64 : CallingConv<[CCIfType<[i64], CCAssignToReg<[R0]>>]>;
+
+// BPF 64-bit C Calling convention.
+def CC_BPF64 : CallingConv<[
+  // Promote i8/i16/i32 args to i64
+  CCIfType<[ i8, i16, i32 ], CCPromoteToType<i64>>,
+
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[i64], CCAssignToReg<[ R1, R2, R3, R4, R5 ]>>,
+
+  // Could be assigned to the stack in 8-byte aligned units, but unsupported
+  CCAssignToStack<8, 8>
+]>;
+
+def CSR : CalleeSavedRegs<(add R6, R7, R8, R9, R10)>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp
new file mode 100644
index 000000000000..c2806c85f24f
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.cpp
@@ -0,0 +1,40 @@
+//===-- BPFFrameLowering.cpp - BPF Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFFrameLowering.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+bool BPFFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void BPFFrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {}
+
+void BPFFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {}
+
+void BPFFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  SavedRegs.reset(BPF::R6);
+  SavedRegs.reset(BPF::R7);
+  SavedRegs.reset(BPF::R8);
+  SavedRegs.reset(BPF::R9);
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h
new file mode 100644
index 000000000000..5db963f518b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFFrameLowering.h
@@ -0,0 +1,41 @@
+//===-- BPFFrameLowering.h - Define frame lowering for BPF -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements BPF-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class BPFSubtarget;
+
+class BPFFrameLowering : public TargetFrameLowering {
+public:
+  explicit BPFFrameLowering(const BPFSubtarget &sti)
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {}
+
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override {
+    return MBB.erase(MI);
+  }
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
new file mode 100644
index 000000000000..12091449cc11
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -0,0 +1,186 @@
+//===-- BPFISelDAGToDAG.cpp - A dag to dag inst selector for BPF ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for BPF,
+// converting from a legalized dag to a BPF dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-isel"
+
+// Instruction Selector Implementation
+namespace {
+
+class BPFDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit BPFDAGToDAGISel(BPFTargetMachine &TM) : SelectionDAGISel(TM) {}
+
+  StringRef getPassName() const override {
+    return "BPF DAG->DAG Pattern Instruction Selection";
+  }
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "BPFGenDAGISel.inc"
+
+  void Select(SDNode *N) override;
+
+  // Complex Pattern for address selection.
+  bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+};
+}
+
+// ComplexPattern used on BPF Load/Store instructions
+bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+  // if Address is FI, get the TargetFrameIndex.
+  SDLoc DL(Addr);
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+    return true;
+  }
+
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;
+
+  // Addresses of the form Addr+const or Addr|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<32>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
+      return true;
+    }
+  }
+
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+  return true;
+}
+
+// ComplexPattern used on BPF FI instruction
+bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+  SDLoc DL(Addr);
+
+  if (!CurDAG->isBaseWithConstantOffset(Addr))
+    return false;
+
+  // Addresses of the form Addr+const or Addr|const
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+  if (isInt<32>(CN->getSExtValue())) {
+
+    // If the first operand is a FI, get the TargetFI Node
+    if (FrameIndexSDNode *FIN =
+            dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+    else
+      return false;
+
+    Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i64);
+    return true;
+  }
+
+  return false;
+}
+
+void BPFDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+    return;
+  }
+
+  // tablegen selection should be handled here.
+  switch (Opcode) {
+  default: break;
+  case ISD::SDIV: {
+    DebugLoc Empty;
+    const DebugLoc &DL = Node->getDebugLoc();
+    if (DL != Empty)
+      errs() << "Error at line " << DL.getLine() << ": ";
+    else
+      errs() << "Error: ";
+    errs() << "Unsupport signed division for DAG: ";
+    Node->dump(CurDAG);
+    errs() << "Please convert to unsigned div/mod.\n";
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    case Intrinsic::bpf_load_byte:
+    case Intrinsic::bpf_load_half:
+    case Intrinsic::bpf_load_word: {
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+      SDValue N1 = Node->getOperand(1);
+      SDValue Skb = Node->getOperand(2);
+      SDValue N3 = Node->getOperand(3);
+
+      SDValue R6Reg = CurDAG->getRegister(BPF::R6, MVT::i64);
+      Chain = CurDAG->getCopyToReg(Chain, DL, R6Reg, Skb, SDValue());
+      Node = CurDAG->UpdateNodeOperands(Node, Chain, N1, R6Reg, N3);
+      break;
+    }
+    }
+    break;
+  }
+
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    EVT VT = Node->getValueType(0);
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+    unsigned Opc = BPF::MOV_rr;
+    if (Node->hasOneUse()) {
+      CurDAG->SelectNodeTo(Node, Opc, VT, TFI);
+      return;
+    }
+    ReplaceNode(Node, CurDAG->getMachineNode(Opc, SDLoc(Node), VT, TFI));
+    return;
+  }
+  }
+
+  // Select the default instruction
+  SelectCode(Node);
+}
+
+FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
+  return new BPFDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
new file mode 100644
index 000000000000..cca3492a1992
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -0,0 +1,596 @@
+//===-- BPFISelLowering.cpp - BPF DAG Lowering Implementation  ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFISelLowering.h"
+#include "BPF.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-lower"
+
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  DAG.getContext()->diagnose(
+      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, DL.getDebugLoc()));
+}
+
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg,
+                 SDValue Val) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  std::string Str;
+  raw_string_ostream OS(Str);
+  OS << Msg;
+  Val->print(OS);
+  OS.flush();
+  DAG.getContext()->diagnose(
+      DiagnosticInfoUnsupported(*MF.getFunction(), Str, DL.getDebugLoc()));
+}
+
+BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
+                                     const BPFSubtarget &STI)
+    : TargetLowering(TM) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i64, &BPF::GPRRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  setStackPointerRegisterToSaveRestore(BPF::R11);
+
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::SETCC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+  setOperationAction(ISD::SUBC, MVT::i64, Expand);
+  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+
+  setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+  setOperationAction(ISD::CTTZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+
+  // Extended load operations for i1 types must be promoted
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+  }
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Function alignments (log2)
+  setMinFunctionAlignment(3);
+  setPrefFunctionAlignment(3);
+
+  // inline memcpy() for kernel to see explicit copy
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 128;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 128;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
+}
+
+SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+// Calling Convention Implementation
+#include "BPFGenCallingConv.inc"
+
+SDValue BPFTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    break;
+  }
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_BPF64);
+
+  for (auto &VA : ArgLocs) {
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default: {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getEVTString() << '\n';
+        llvm_unreachable(0);
+      }
+      case MVT::i64:
+        unsigned VReg = RegInfo.createVirtualRegister(&BPF::GPRRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+        // If this is an 8/16/32-bit value, it is really passed promoted to 64
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      fail(DL, DAG, "defined with too many args");
+      InVals.push_back(DAG.getConstant(0, DL, VA.getLocVT()));
+    }
+  }
+
+  if (IsVarArg || MF.getFunction()->hasStructRetAttr()) {
+    fail(DL, DAG, "functions with VarArgs or StructRet are not supported");
+  }
+
+  return Chain;
+}
+
+const unsigned BPFTargetLowering::MaxArgs = 5;
+
+SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  auto &Outs = CLI.Outs;
+  auto &OutVals = CLI.OutVals;
+  auto &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // BPF target does not support tail call optimization.
+  IsTailCall = false;
+
+  switch (CallConv) {
+  default:
+    report_fatal_error("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    break;
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_BPF64);
+
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (Outs.size() > MaxArgs)
+    fail(CLI.DL, DAG, "too many args to ", Callee);
+
+  for (auto &Arg : Outs) {
+    ISD::ArgFlagsTy Flags = Arg.Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    fail(CLI.DL, DAG, "pass by value not supported ", Callee);
+  }
+
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+  Chain = DAG.getCALLSEQ_START(
+      Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+
+  SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
+
+  // Walk arg assignments
+  for (unsigned i = 0,
+                e = std::min(static_cast<unsigned>(ArgLocs.size()), MaxArgs);
+       i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, CLI.DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // Push arguments into RegsToPass vector
+    if (VA.isRegLoc())
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    else
+      llvm_unreachable("call arg pass bug");
+  }
+
+  SDValue InFlag;
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  for (auto &Reg : RegsToPass) {
+    Chain = DAG.getCopyToReg(Chain, CLI.DL, Reg.first, Reg.second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT,
+                                        G->getOffset(), 0);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (auto &Reg : RegsToPass)
+    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(BPFISD::CALL, CLI.DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(
+      Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true),
+      DAG.getConstant(0, CLI.DL, PtrVT, true), InFlag, CLI.DL);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, CLI.DL, DAG,
+                         InVals);
+}
+
+SDValue
+BPFTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &DL, SelectionDAG &DAG) const {
+  unsigned Opc = BPFISD::RET_FLAG;
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+  if (MF.getFunction()->getReturnType()->isAggregateType()) {
+    fail(DL, DAG, "only integer returns supported");
+    return DAG.getNode(Opc, DL, MVT::Other, Chain);
+  }
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_BPF64);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain; // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
+}
+
+SDValue BPFTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+  if (Ins.size() >= 2) {
+    fail(DL, DAG, "only small returns supported");
+    for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+      InVals.push_back(DAG.getConstant(0, DL, Ins[i].VT));
+    return DAG.getCopyFromReg(Chain, DL, 1, Ins[0].VT, InFlag).getValue(1);
+  }
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_BPF64);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (auto &Val : RVLocs) {
+    Chain = DAG.getCopyFromReg(Chain, DL, Val.getLocReg(),
+                               Val.getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+static void NegateCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
+  switch (CC) {
+  default:
+    break;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+    CC = ISD::getSetCCSwappedOperands(CC);
+    std::swap(LHS, RHS);
+    break;
+  }
+}
+
+SDValue BPFTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  NegateCC(LHS, RHS, CC);
+
+  return DAG.getNode(BPFISD::BR_CC, DL, Op.getValueType(), Chain, LHS, RHS,
+                     DAG.getConstant(CC, DL, MVT::i64), Dest);
+}
+
+SDValue BPFTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TrueV = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  NegateCC(LHS, RHS, CC);
+
+  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i64);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
+
+  return DAG.getNode(BPFISD::SELECT_CC, DL, VTs, Ops);
+}
+
+const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((BPFISD::NodeType)Opcode) {
+  case BPFISD::FIRST_NUMBER:
+    break;
+  case BPFISD::RET_FLAG:
+    return "BPFISD::RET_FLAG";
+  case BPFISD::CALL:
+    return "BPFISD::CALL";
+  case BPFISD::SELECT_CC:
+    return "BPFISD::SELECT_CC";
+  case BPFISD::BR_CC:
+    return "BPFISD::BR_CC";
+  case BPFISD::Wrapper:
+    return "BPFISD::Wrapper";
+  }
+  return nullptr;
+}
+
+SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64);
+
+  return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
+}
+
+MachineBasicBlock *
+BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = ++BB->getIterator();
+
+  // ThisMBB:
+  // ...
+  //  TrueVal = ...
+  //  jmp_XX r1, r2 goto Copy1MBB
+  //  fallthrough --> Copy0MBB
+  MachineBasicBlock *ThisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *Copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, Copy0MBB);
+  F->insert(I, Copy1MBB);
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  Copy1MBB->splice(Copy1MBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  Copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(Copy0MBB);
+  BB->addSuccessor(Copy1MBB);
+
+  // Insert Branch if Flag
+  unsigned LHS = MI.getOperand(1).getReg();
+  unsigned RHS = MI.getOperand(2).getReg();
+  int CC = MI.getOperand(3).getImm();
+  switch (CC) {
+  case ISD::SETGT:
+    BuildMI(BB, DL, TII.get(BPF::JSGT_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETUGT:
+    BuildMI(BB, DL, TII.get(BPF::JUGT_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETGE:
+    BuildMI(BB, DL, TII.get(BPF::JSGE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETUGE:
+    BuildMI(BB, DL, TII.get(BPF::JUGE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETEQ:
+    BuildMI(BB, DL, TII.get(BPF::JEQ_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  case ISD::SETNE:
+    BuildMI(BB, DL, TII.get(BPF::JNE_rr))
+        .addReg(LHS)
+        .addReg(RHS)
+        .addMBB(Copy1MBB);
+    break;
+  default:
+    report_fatal_error("unimplemented select CondCode " + Twine(CC));
+  }
+
+  // Copy0MBB:
+  //  %FalseValue = ...
+  //  # fallthrough to Copy1MBB
+  BB = Copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(Copy1MBB);
+
+  // Copy1MBB:
+  //  %Result = phi [ %FalseValue, Copy0MBB ], [ %TrueValue, ThisMBB ]
+  // ...
+  BB = Copy1MBB;
+  BuildMI(*BB, BB->begin(), DL, TII.get(BPF::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(5).getReg())
+      .addMBB(Copy0MBB)
+      .addReg(MI.getOperand(4).getReg())
+      .addMBB(ThisMBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
new file mode 100644
index 000000000000..3d1726be286e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -0,0 +1,93 @@
+//===-- BPFISelLowering.h - BPF DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that BPF uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+#define LLVM_LIB_TARGET_BPF_BPFISELLOWERING_H
+
+#include "BPF.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class BPFSubtarget;
+namespace BPFISD {
+enum NodeType : unsigned {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  RET_FLAG,
+  CALL,
+  SELECT_CC,
+  BR_CC,
+  Wrapper
+};
+}
+
+class BPFTargetLowering : public TargetLowering {
+public:
+  explicit BPFTargetLowering(const TargetMachine &TM, const BPFSubtarget &STI);
+
+  // Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  // This method returns the name of a target specific DAG node.
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *BB) const override;
+
+private:
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+  // Lower the result values of a call, copying them out of physregs into vregs
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool IsVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          const SDLoc &DL, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals) const;
+
+  // Maximum number of arguments to a call
+  static const unsigned MaxArgs;
+
+  // Lower a call into CALLSEQ_START - BPFISD:CALL - CALLSEQ_END chain
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  // Lower incoming arguments, copy physregs into vregs
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                          MachineFunction &MF) const override {
+    return Size >= 8 ? MVT::i64 : MVT::i32;
+  }
+
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override {
+    return true;
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td b/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td
new file mode 100644
index 000000000000..53f3ad623587
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -0,0 +1,33 @@
+//===-- BPFInstrFormats.td - BPF Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstBPF<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<64> Inst;
+  field bits<64> SoftFail = 0;
+  let Size = 8;
+
+  let Namespace = "BPF";
+  let DecoderNamespace = "BPF";
+
+  bits<3> BPFClass;
+  let Inst{58-56} = BPFClass;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstBPF<outs, ins, asmstr, pattern> {
+  let Inst{63-0} = 0;
+  let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
new file mode 100644
index 000000000000..cbe4466164f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -0,0 +1,174 @@
+//===-- BPFInstrInfo.cpp - BPF Instruction Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstrInfo.h"
+#include "BPFSubtarget.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "BPFGenInstrInfo.inc"
+
+using namespace llvm;
+
+BPFInstrInfo::BPFInstrInfo()
+    : BPFGenInstrInfo(BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {}
+
+void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  if (BPF::GPRRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(BPF::MOV_rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned SrcReg, bool IsKill, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &BPF::GPRRegClass)
+    BuildMI(MBB, I, DL, get(BPF::STD))
+        .addReg(SrcReg, getKillRegState(IsKill))
+        .addFrameIndex(FI)
+        .addImm(0);
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned DestReg, int FI,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
+
+  if (RC == &BPF::GPRRegClass)
+    BuildMI(MBB, I, DL, get(BPF::LDD), DestReg).addFrameIndex(FI).addImm(0);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(*I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == BPF::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a J, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the J if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+    // Cannot handle conditional branches
+    return true;
+  }
+
+  return false;
+}
+
+unsigned BPFInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+
+  if (Cond.empty()) {
+    // Unconditional branch
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(BPF::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  llvm_unreachable("Unexpected conditional branch");
+}
+
+unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != BPF::JMP)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
new file mode 100644
index 000000000000..c7048ab979b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -0,0 +1,61 @@
+//===-- BPFInstrInfo.h - BPF Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFINSTRINFO_H
+
+#include "BPFRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "BPFGenInstrInfo.inc"
+
+namespace llvm {
+
+class BPFInstrInfo : public BPFGenInstrInfo {
+  const BPFRegisterInfo RI;
+
+public:
+  BPFInstrInfo();
+
+  const BPFRegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
new file mode 100644
index 000000000000..a7910dea98de
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -0,0 +1,578 @@
+//===-- BPFInstrInfo.td - Target Description for BPF Target ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the BPF instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "BPFInstrFormats.td"
+
+// Instruction Operands and Patterns
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_BPFCall         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_BPFSetFlag      : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
+def SDT_BPFSelectCC     : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>,
+                                               SDTCisSameAs<0, 4>,
+                                               SDTCisSameAs<4, 5>]>;
+def SDT_BPFBrCC         : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>,
+                                               SDTCisVT<3, OtherVT>]>;
+def SDT_BPFWrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                               SDTCisPtrTy<0>]>;
+
+def BPFcall         : SDNode<"BPFISD::CALL", SDT_BPFCall,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                              SDNPVariadic]>;
+def BPFretflag      : SDNode<"BPFISD::RET_FLAG", SDTNone,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def BPFcallseq_start: SDNode<"ISD::CALLSEQ_START", SDT_BPFCallSeqStart,
+                             [SDNPHasChain, SDNPOutGlue]>;
+def BPFcallseq_end  : SDNode<"ISD::CALLSEQ_END",   SDT_BPFCallSeqEnd,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def BPFbrcc         : SDNode<"BPFISD::BR_CC", SDT_BPFBrCC,
+                             [SDNPHasChain, SDNPOutGlue, SDNPInGlue]>;
+
+def BPFselectcc     : SDNode<"BPFISD::SELECT_CC", SDT_BPFSelectCC, [SDNPInGlue]>;
+def BPFWrapper      : SDNode<"BPFISD::Wrapper", SDT_BPFWrapper>;
+
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i64>;
+
+def u64imm   : Operand<i64> {
+  let PrintMethod = "printImm64Operand";
+}
+
+def i64immSExt32 : PatLeaf<(imm),
+                [{return isInt<32>(N->getSExtValue()); }]>;
+
+// Addressing modes.
+def ADDRri : ComplexPattern<i64, 2, "SelectAddr", [], []>;
+def FIri : ComplexPattern<i64, 2, "SelectFIAddr", [add, or], []>;
+
+// Address operands
+def MEMri : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let EncoderMethod = "getMemoryOpValue";
+  let DecoderMethod = "decodeMemoryOpValue";
+  let MIOperandInfo = (ops GPR, i16imm);
+}
+
+// Conditional code predicates - used for pattern matching for jump instructions
+def BPF_CC_EQ  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETEQ);}]>;
+def BPF_CC_NE  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETNE);}]>;
+def BPF_CC_GE  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETGE);}]>;
+def BPF_CC_GT  : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETGT);}]>;
+def BPF_CC_GTU : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETUGT);}]>;
+def BPF_CC_GEU : PatLeaf<(imm),
+                         [{return (N->getZExtValue() == ISD::SETUGE);}]>;
+
+// jump instructions
+class JMP_RR<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+    : InstBPF<(outs), (ins GPR:$dst, GPR:$src, brtarget:$BrDst),
+              "if $dst "#OpcodeStr#" $src goto $BrDst",
+              [(BPFbrcc i64:$dst, i64:$src, Cond, bb:$BrDst)]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+  bits<16> BrDst;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+  let Inst{47-32} = BrDst;
+
+  let op = Opc;
+  let BPFSrc = 1;
+  let BPFClass = 5; // BPF_JMP
+}
+
+class JMP_RI<bits<4> Opc, string OpcodeStr, PatLeaf Cond>
+    : InstBPF<(outs), (ins GPR:$dst, i64imm:$imm, brtarget:$BrDst),
+              "if $dst "#OpcodeStr#" $imm goto $BrDst",
+              [(BPFbrcc i64:$dst, i64immSExt32:$imm, Cond, bb:$BrDst)]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<16> BrDst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{47-32} = BrDst;
+  let Inst{31-0} = imm;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+multiclass J<bits<4> Opc, string OpcodeStr, PatLeaf Cond> {
+  def _rr : JMP_RR<Opc, OpcodeStr, Cond>;
+  def _ri : JMP_RI<Opc, OpcodeStr, Cond>;
+}
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0 in {
+// cmp+goto instructions
+defm JEQ  : J<0x1, "==",  BPF_CC_EQ>;
+defm JUGT : J<0x2, ">", BPF_CC_GTU>;
+defm JUGE : J<0x3, ">=", BPF_CC_GEU>;
+defm JNE  : J<0x5, "!=",  BPF_CC_NE>;
+defm JSGT : J<0x6, "s>", BPF_CC_GT>;
+defm JSGE : J<0x7, "s>=", BPF_CC_GE>;
+}
+
+// ALU instructions
+class ALU_RI<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src2, i64imm:$imm),
+              "$dst "#OpcodeStr#" $imm",
+              [(set GPR:$dst, (OpNode GPR:$src2, i64immSExt32:$imm))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{31-0} = imm;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 7; // BPF_ALU64
+}
+
+class ALU_RR<bits<4> Opc, string OpcodeStr, SDNode OpNode>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src2, GPR:$src),
+              "$dst "#OpcodeStr#" $src",
+              [(set GPR:$dst, (OpNode i64:$src2, i64:$src))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = Opc;
+  let BPFSrc = 1;
+  let BPFClass = 7; // BPF_ALU64
+}
+
+multiclass ALU<bits<4> Opc, string OpcodeStr, SDNode OpNode> {
+  def _rr : ALU_RR<Opc, OpcodeStr, OpNode>;
+  def _ri : ALU_RI<Opc, OpcodeStr, OpNode>;
+}
+
+let Constraints = "$dst = $src2" in {
+let isAsCheapAsAMove = 1 in {
+  defm ADD : ALU<0x0, "+=", add>;
+  defm SUB : ALU<0x1, "-=", sub>;
+  defm OR  : ALU<0x4, "|=", or>;
+  defm AND : ALU<0x5, "&=", and>;
+  defm SLL : ALU<0x6, "<<=", shl>;
+  defm SRL : ALU<0x7, ">>=", srl>;
+  defm XOR : ALU<0xa, "^=", xor>;
+  defm SRA : ALU<0xc, "s>>=", sra>;
+}
+  defm MUL : ALU<0x2, "*=", mul>;
+  defm DIV : ALU<0x3, "/=", udiv>;
+}
+
+class MOV_RR<string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src),
+              "$dst "#OpcodeStr#" $src",
+              []> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 1;   // BPF_X
+  let BPFClass = 7; // BPF_ALU64
+}
+
+class MOV_RI<string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins i64imm:$imm),
+              "$dst "#OpcodeStr#" $imm",
+              [(set GPR:$dst, (i64 i64immSExt32:$imm))]> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{31-0} = imm;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 0;   // BPF_K
+  let BPFClass = 7; // BPF_ALU64
+}
+
+class LD_IMM64<bits<4> Pseudo, string OpcodeStr>
+    : InstBPF<(outs GPR:$dst), (ins u64imm:$imm),
+              "$dst "#OpcodeStr#" ${imm}ll",
+              [(set GPR:$dst, (i64 imm:$imm))]> {
+
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<64> imm;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = dst;
+  let Inst{55-52} = Pseudo;
+  let Inst{47-32} = 0;
+  let Inst{31-0} = imm{31-0};
+
+  let mode = 0;     // BPF_IMM
+  let size = 3;     // BPF_DW
+  let BPFClass = 0; // BPF_LD
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def LD_imm64 : LD_IMM64<0, "=">;
+def MOV_rr : MOV_RR<"=">;
+def MOV_ri : MOV_RI<"=">;
+}
+
+def FI_ri
+    : InstBPF<(outs GPR:$dst), (ins MEMri:$addr),
+               "lea\t$dst, $addr",
+               [(set i64:$dst, FIri:$addr)]> {
+  // This is a tentative instruction, and will be replaced
+  // with MOV_rr and ADD_ri in PEI phase
+  let Inst{63-61} = 0;
+  let Inst{60-59} = 3;
+  let Inst{51-48} = 0;
+  let Inst{55-52} = 2;
+  let Inst{47-32} = 0;
+  let Inst{31-0} = 0;
+  let BPFClass = 0;
+}
+
+
+def LD_pseudo
+    : InstBPF<(outs GPR:$dst), (ins i64imm:$pseudo, u64imm:$imm),
+              "ld_pseudo\t$dst, $pseudo, $imm",
+              [(set GPR:$dst, (int_bpf_pseudo imm:$pseudo, imm:$imm))]> {
+
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<64> imm;
+  bits<4> pseudo;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = dst;
+  let Inst{55-52} = pseudo;
+  let Inst{47-32} = 0;
+  let Inst{31-0} = imm{31-0};
+
+  let mode = 0;     // BPF_IMM
+  let size = 3;     // BPF_DW
+  let BPFClass = 0; // BPF_LD
+}
+
+// STORE instructions
+class STORE<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs), (ins GPR:$src, MEMri:$addr),
+              "*("#OpcodeStr#" *)($addr) = $src", Pattern> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> src;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = addr{19-16}; // base reg
+  let Inst{55-52} = src;
+  let Inst{47-32} = addr{15-0}; // offset
+
+  let mode = 3;     // BPF_MEM
+  let size = SizeOp;
+  let BPFClass = 3; // BPF_STX
+}
+
+class STOREi64<bits<2> Opc, string OpcodeStr, PatFrag OpNode>
+    : STORE<Opc, OpcodeStr, [(OpNode i64:$src, ADDRri:$addr)]>;
+
+def STW : STOREi64<0x0, "u32", truncstorei32>;
+def STH : STOREi64<0x1, "u16", truncstorei16>;
+def STB : STOREi64<0x2, "u8", truncstorei8>;
+def STD : STOREi64<0x3, "u64", store>;
+
+// LOAD instructions
+class LOAD<bits<2> SizeOp, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs GPR:$dst), (ins MEMri:$addr),
+              "$dst = *("#OpcodeStr#" *)($addr)", Pattern> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = dst;
+  let Inst{55-52} = addr{19-16};
+  let Inst{47-32} = addr{15-0};
+
+  let mode = 3;     // BPF_MEM
+  let size = SizeOp;
+  let BPFClass = 1; // BPF_LDX
+}
+
+class LOADi64<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+    : LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+
+def LDW : LOADi64<0x0, "u32", zextloadi32>;
+def LDH : LOADi64<0x1, "u16", zextloadi16>;
+def LDB : LOADi64<0x2, "u8", zextloadi8>;
+def LDD : LOADi64<0x3, "u64", load>;
+
+class BRANCH<bits<4> Opc, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs), (ins brtarget:$BrDst),
+              !strconcat(OpcodeStr, " $BrDst"), Pattern> {
+  bits<4> op;
+  bits<16> BrDst;
+  bits<1> BPFSrc;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{47-32} = BrDst;
+
+  let op = Opc;
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+class CALL<string OpcodeStr>
+    : InstBPF<(outs), (ins calltarget:$BrDst),
+              !strconcat(OpcodeStr, " $BrDst"), []> {
+  bits<4> op;
+  bits<32> BrDst;
+  bits<1> BPFSrc;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{31-0} = BrDst;
+
+  let op = 8;       // BPF_CALL
+  let BPFSrc = 0;
+  let BPFClass = 5; // BPF_JMP
+}
+
+// Jump always
+let isBranch = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1 in {
+  def JMP : BRANCH<0x0, "goto", [(br bb:$BrDst)]>;
+}
+
+// Jump and link
+let isCall=1, hasDelaySlot=0, Uses = [R11],
+    // Potentially clobbered registers
+    Defs = [R0, R1, R2, R3, R4, R5] in {
+  def JAL  : CALL<"call">;
+}
+
+class NOP_I<string OpcodeStr>
+    : InstBPF<(outs), (ins i32imm:$imm),
+              !strconcat(OpcodeStr, "\t$imm"), []> {
+  // mov r0, r0 == nop
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<4> src;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{55-52} = src;
+  let Inst{51-48} = dst;
+
+  let op = 0xb;     // BPF_MOV
+  let BPFSrc = 1;   // BPF_X
+  let BPFClass = 7; // BPF_ALU64
+  let src = 0;      // R0
+  let dst = 0;      // R0
+}
+
+let hasSideEffects = 0 in
+  def NOP : NOP_I<"nop">;
+
+class RET<string OpcodeStr>
+    : InstBPF<(outs), (ins),
+              !strconcat(OpcodeStr, ""), [(BPFretflag)]> {
+  bits<4> op;
+
+  let Inst{63-60} = op;
+  let Inst{59} = 0;
+  let Inst{31-0} = 0;
+
+  let op = 9;       // BPF_EXIT
+  let BPFClass = 5; // BPF_JMP
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
+    isNotDuplicable = 1 in {
+  def RET : RET<"exit">;
+}
+
+// ADJCALLSTACKDOWN/UP pseudo insns
+let Defs = [R11], Uses = [R11] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+                              "#ADJCALLSTACKDOWN $amt",
+                              [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                              "#ADJCALLSTACKUP $amt1 $amt2",
+                              [(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def Select : Pseudo<(outs GPR:$dst),
+                      (ins GPR:$lhs, GPR:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
+                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                      [(set i64:$dst,
+                       (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>;
+}
+
+// load 64-bit global addr into register
+def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>;
+
+// 0xffffFFFF doesn't fit into simm32, optimize common case
+def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
+          (SRL_ri (SLL_ri (i64 GPR:$src), 32), 32)>;
+
+// Calls
+def : Pat<(BPFcall tglobaladdr:$dst), (JAL tglobaladdr:$dst)>;
+def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
+
+// Loads
+def : Pat<(extloadi8  ADDRri:$src), (i64 (LDB ADDRri:$src))>;
+def : Pat<(extloadi16 ADDRri:$src), (i64 (LDH ADDRri:$src))>;
+def : Pat<(extloadi32 ADDRri:$src), (i64 (LDW ADDRri:$src))>;
+
+// Atomics
+class XADD<bits<2> SizeOp, string OpcodeStr, PatFrag OpNode>
+    : InstBPF<(outs GPR:$dst), (ins MEMri:$addr, GPR:$val),
+              "lock *("#OpcodeStr#" *)($addr) += $val",
+              [(set GPR:$dst, (OpNode ADDRri:$addr, GPR:$val))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> dst;
+  bits<20> addr;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{51-48} = addr{19-16}; // base reg
+  let Inst{55-52} = dst;
+  let Inst{47-32} = addr{15-0}; // offset
+
+  let mode = 6;     // BPF_XADD
+  let size = SizeOp;
+  let BPFClass = 3; // BPF_STX
+}
+
+let Constraints = "$dst = $val" in {
+def XADD32 : XADD<0, "u32", atomic_load_add_32>;
+def XADD64 : XADD<3, "u64", atomic_load_add_64>;
+// undefined def XADD16 : XADD<1, "xadd16", atomic_load_add_16>;
+// undefined def XADD8  : XADD<2, "xadd8", atomic_load_add_8>;
+}
+
+// bswap16, bswap32, bswap64
+class BSWAP<bits<32> SizeOp, string OpcodeStr, list<dag> Pattern>
+    : InstBPF<(outs GPR:$dst), (ins GPR:$src),
+              !strconcat(OpcodeStr, "\t$dst"),
+              Pattern> {
+  bits<4> op;
+  bits<1> BPFSrc;
+  bits<4> dst;
+  bits<32> imm;
+
+  let Inst{63-60} = op;
+  let Inst{59} = BPFSrc;
+  let Inst{51-48} = dst;
+  let Inst{31-0} = imm;
+
+  let op = 0xd;     // BPF_END
+  let BPFSrc = 1;   // BPF_TO_BE (TODO: use BPF_TO_LE for big-endian target)
+  let BPFClass = 4; // BPF_ALU
+  let imm = SizeOp;
+}
+
+let Constraints = "$dst = $src" in {
+def BSWAP16 : BSWAP<16, "bswap16", [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
+def BSWAP32 : BSWAP<32, "bswap32", [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
+def BSWAP64 : BSWAP<64, "bswap64", [(set GPR:$dst, (bswap GPR:$src))]>;
+}
+
+let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
+    hasExtraDefRegAllocReq = 1, hasExtraSrcRegAllocReq = 1, mayLoad = 1 in {
+class LOAD_ABS<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+    : InstBPF<(outs), (ins GPR:$skb, i64imm:$imm),
+              "r0 = *("#OpcodeStr#" *)skb[$imm]",
+              [(set R0, (OpNode GPR:$skb, i64immSExt32:$imm))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<32> imm;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{31-0} = imm;
+
+  let mode = 1;     // BPF_ABS
+  let size = SizeOp;
+  let BPFClass = 0; // BPF_LD
+}
+
+class LOAD_IND<bits<2> SizeOp, string OpcodeStr, Intrinsic OpNode>
+    : InstBPF<(outs), (ins GPR:$skb, GPR:$val),
+              "r0 = *("#OpcodeStr#" *)skb[$val]",
+              [(set R0, (OpNode GPR:$skb, GPR:$val))]> {
+  bits<3> mode;
+  bits<2> size;
+  bits<4> val;
+
+  let Inst{63-61} = mode;
+  let Inst{60-59} = size;
+  let Inst{55-52} = val;
+
+  let mode = 2;     // BPF_IND
+  let size = SizeOp;
+  let BPFClass = 0; // BPF_LD
+}
+}
+
+def LD_ABS_B : LOAD_ABS<2, "u8", int_bpf_load_byte>;
+def LD_ABS_H : LOAD_ABS<1, "u16", int_bpf_load_half>;
+def LD_ABS_W : LOAD_ABS<0, "u32", int_bpf_load_word>;
+
+def LD_IND_B : LOAD_IND<2, "u8", int_bpf_load_byte>;
+def LD_IND_H : LOAD_IND<1, "u16", int_bpf_load_half>;
+def LD_IND_W : LOAD_IND<0, "u32", int_bpf_load_word>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
new file mode 100644
index 000000000000..f64defecf3cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
@@ -0,0 +1,76 @@
+//=-- BPFMCInstLower.cpp - Convert BPF MachineInstr to an MCInst ------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower BPF MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSymbol *
+BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    llvm_unreachable("unknown symbol op");
+
+  return MCOperand::createExpr(Expr);
+}
+
+void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
new file mode 100644
index 000000000000..054e89407db2
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
@@ -0,0 +1,43 @@
+//===-- BPFMCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+#define LLVM_LIB_TARGET_BPF_BPFMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// BPFMCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY BPFMCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+
+public:
+  BPFMCInstLower(MCContext &ctx, AsmPrinter &printer)
+      : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
new file mode 100644
index 000000000000..71846e3e92c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -0,0 +1,103 @@
+//===-- BPFRegisterInfo.cpp - BPF Register Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "BPFGenRegisterInfo.inc"
+using namespace llvm;
+
+BPFRegisterInfo::BPFRegisterInfo()
+    : BPFGenRegisterInfo(BPF::R0) {}
+
+const MCPhysReg *
+BPFRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_SaveList;
+}
+
+BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(BPF::R10); // R10 is read only frame pointer
+  Reserved.set(BPF::R11); // R11 is pseudo stack pointer
+  return Reserved;
+}
+
+void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, unsigned FIOperandNum,
+                                          RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned FrameReg = getFrameRegister(MF);
+  int FrameIndex = MI.getOperand(i).getIndex();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  if (MI.getOpcode() == BPF::MOV_rr) {
+    int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+    MI.getOperand(i).ChangeToRegister(FrameReg, false);
+    unsigned reg = MI.getOperand(i - 1).getReg();
+    BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
+        .addReg(reg)
+        .addImm(Offset);
+    return;
+  }
+
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+               MI.getOperand(i + 1).getImm();
+
+  if (!isInt<32>(Offset))
+    llvm_unreachable("bug in frame offset");
+
+  if (MI.getOpcode() == BPF::FI_ri) {
+    // architecture does not really support FI_ri, replace it with
+    //    MOV_rr <target_reg>, frame_reg
+    //    ADD_ri <target_reg>, imm
+    unsigned reg = MI.getOperand(i - 1).getReg();
+
+    BuildMI(MBB, ++II, DL, TII.get(BPF::MOV_rr), reg)
+        .addReg(FrameReg);
+    BuildMI(MBB, II, DL, TII.get(BPF::ADD_ri), reg)
+        .addReg(reg)
+        .addImm(Offset);
+
+    // Remove FI_ri instruction
+    MI.eraseFromParent();
+  } else {
+    MI.getOperand(i).ChangeToRegister(FrameReg, false);
+    MI.getOperand(i + 1).ChangeToImmediate(Offset);
+  }
+}
+
+unsigned BPFRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return BPF::R10;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
new file mode 100644
index 000000000000..7072dd0bde1a
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.h
@@ -0,0 +1,40 @@
+//===-- BPFRegisterInfo.h - BPF Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the BPF implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+#define LLVM_LIB_TARGET_BPF_BPFREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "BPFGenRegisterInfo.inc"
+
+namespace llvm {
+
+struct BPFRegisterInfo : public BPFGenRegisterInfo {
+
+  BPFRegisterInfo();
+
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td
new file mode 100644
index 000000000000..c8e24f810310
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.td
@@ -0,0 +1,41 @@
+//===-- BPFRegisterInfo.td - BPF Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the BPF register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 64-bit integer registers
+class Ri<bits<16> Enc, string n> : Register<n> {
+  let Namespace = "BPF";
+  let HWEncoding = Enc;
+}
+
+// Integer registers
+def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+
+// Register classes.
+def GPR : RegisterClass<"BPF", [i64], 64, (add R1, R2, R3, R4, R5,
+                                           R6, R7, R8, R9, // callee saved
+                                           R0,  // return value
+                                           R11, // stack ptr
+                                           R10  // frame ptr
+                                          )>;
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
new file mode 100644
index 000000000000..c3a8b1caa63d
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -0,0 +1,31 @@
+//===-- BPFSubtarget.cpp - BPF Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPFSubtarget.h"
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "BPFGenSubtargetInfo.inc"
+
+void BPFSubtarget::anchor() {}
+
+BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
+                           const std::string &FS, const TargetMachine &TM)
+    : BPFGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+      TLInfo(TM, *this) {}
diff --git a/contrib/llvm/lib/Target/BPF/BPFSubtarget.h b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
new file mode 100644
index 000000000000..27cc9a262fc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -0,0 +1,64 @@
+//===-- BPFSubtarget.h - Define Subtarget for the BPF -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+#define LLVM_LIB_TARGET_BPF_BPFSUBTARGET_H
+
+#include "BPFFrameLowering.h"
+#include "BPFISelLowering.h"
+#include "BPFInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "BPFGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class BPFSubtarget : public BPFGenSubtargetInfo {
+  virtual void anchor();
+  BPFInstrInfo InstrInfo;
+  BPFFrameLowering FrameLowering;
+  BPFTargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  // This constructor initializes the data members to match that
+  // of the specified triple.
+  BPFSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+               const TargetMachine &TM);
+
+  // ParseSubtargetFeatures - Parses features string setting specified
+  // subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const BPFFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const BPFTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
new file mode 100644
index 000000000000..897695633e46
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -0,0 +1,82 @@
+//===-- BPFTargetMachine.cpp - Define TargetMachine for BPF ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about BPF target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeBPFTarget() {
+  // Register the target.
+  RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
+  RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
+  RegisterTargetMachine<BPFTargetMachine> Z(getTheBPFTarget());
+}
+
+// DataLayout: little or big endian
+static std::string computeDataLayout(const Triple &TT) {
+  if (TT.getArch() == Triple::bpfeb)
+    return "E-m:e-p:64:64-i64:64-n32:64-S128";
+  else
+    return "e-m:e-p:64:64-i64:64-n32:64-S128";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::PIC_;
+  return *RM;
+}
+
+BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM), CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+namespace {
+// BPF Code Generator Pass Configuration Options.
+class BPFPassConfig : public TargetPassConfig {
+public:
+  BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  BPFTargetMachine &getBPFTargetMachine() const {
+    return getTM<BPFTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+};
+}
+
+TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new BPFPassConfig(this, PM);
+}
+
+// Install an instruction selector pass using
+// the ISelDag to gen BPF code.
+bool BPFPassConfig::addInstSelector() {
+  addPass(createBPFISelDag(getBPFTargetMachine()));
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h
new file mode 100644
index 000000000000..644481446883
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.h
@@ -0,0 +1,44 @@
+//===-- BPFTargetMachine.h - Define TargetMachine for BPF --- C++ ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BPF specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+#define LLVM_LIB_TARGET_BPF_BPFTARGETMACHINE_H
+
+#include "BPFSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class BPFTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  BPFSubtarget Subtarget;
+
+public:
+  BPFTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options,
+                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  const BPFSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const BPFSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
new file mode 100644
index 000000000000..b0037fbc16ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -0,0 +1,154 @@
+//===- BPFDisassembler.cpp - Disassembler for BPF ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the BPF Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFRegisterInfo.h"
+#include "BPFSubtarget.h"
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bpf-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// A disassembler class for BPF.
+class BPFDisassembler : public MCDisassembler {
+public:
+  BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+  virtual ~BPFDisassembler() {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+static MCDisassembler *createBPFDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new BPFDisassembler(STI, Ctx);
+}
+
+
+extern "C" void LLVMInitializeBPFDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheBPFTarget(),
+                                         createBPFDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheBPFleTarget(),
+                                         createBPFDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheBPFbeTarget(),
+                                         createBPFDisassembler);
+}
+
+static const unsigned GPRDecoderTable[] = {
+    BPF::R0,  BPF::R1,  BPF::R2,  BPF::R3,  BPF::R4,  BPF::R5,
+    BPF::R6,  BPF::R7,  BPF::R8,  BPF::R9,  BPF::R10, BPF::R11};
+
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t /*Address*/,
+                                           const void * /*Decoder*/) {
+  if (RegNo > 11)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  unsigned Register = (Insn >> 16) & 0xf;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+#include "BPFGenDisassemblerTables.inc"
+
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint64_t &Insn) {
+  uint64_t Lo, Hi;
+
+  if (Bytes.size() < 8) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Size = 8;
+  Hi = (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 0) | (Bytes[3] << 8);
+  Lo = (Bytes[4] << 0) | (Bytes[5] << 8) | (Bytes[6] << 16) | (Bytes[7] << 24);
+  Insn = Make_64(Hi, Lo);
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address,
+                                             raw_ostream &VStream,
+                                             raw_ostream &CStream) const {
+  uint64_t Insn;
+  DecodeStatus Result;
+
+  Result = readInstruction64(Bytes, Address, Size, Insn);
+  if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+  Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
+                             Address, this, STI);
+  if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
+
+  switch (Instr.getOpcode()) {
+  case BPF::LD_imm64: {
+    if (Bytes.size() < 16) {
+      Size = 0;
+      return MCDisassembler::Fail;
+    }
+    Size = 16;
+    uint64_t Hi = (Bytes[12] << 0) | (Bytes[13] << 8) | (Bytes[14] << 16) | (Bytes[15] << 24);
+    auto& Op = Instr.getOperand(1);
+    Op.setImm(Make_64(Hi, Op.getImm()));
+    break;
+  }
+  case BPF::LD_ABS_B:
+  case BPF::LD_ABS_H:
+  case BPF::LD_ABS_W:
+  case BPF::LD_IND_B:
+  case BPF::LD_IND_H:
+  case BPF::LD_IND_W: {
+    auto Op = Instr.getOperand(0);
+    Instr.clear();
+    Instr.addOperand(MCOperand::createReg(BPF::R6));
+    Instr.addOperand(Op);
+    break;
+  }
+  }
+
+  return Result;
+}
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+                                   const void *Decoder);
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
new file mode 100644
index 000000000000..ffd29f3ea991
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -0,0 +1,94 @@
+//===-- BPFInstPrinter.cpp - Convert BPF MCInst to asm syntax -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#include "BPFGenAsmWriter.inc"
+
+void BPFInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &O) {
+#ifndef NDEBUG
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr))
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  else
+    SRE = dyn_cast<MCSymbolRefExpr>(Expr);
+  assert(SRE && "Unexpected MCExpr type.");
+
+  MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
+
+  assert(Kind == MCSymbolRefExpr::VK_None);
+#endif
+  O << *Expr;
+}
+
+void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << (int32_t)Op.getImm();
+  } else {
+    assert(Op.isExpr() && "Expected an expression");
+    printExpr(Op.getExpr(), O);
+  }
+}
+
+void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                                     const char *Modifier) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+
+  // register
+  assert(RegOp.isReg() && "Register operand not a register");
+  O << getRegisterName(RegOp.getReg());
+
+  // offset
+  if (OffsetOp.isImm()) {
+    auto Imm = OffsetOp.getImm();
+    if (Imm >= 0)
+      O << " + " << formatDec(Imm);
+    else
+      O << " - " << formatDec(-Imm);
+  } else {
+    assert(0 && "Expected an immediate");
+  }
+}
+
+void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << (uint64_t)Op.getImm();
+  else
+    O << Op;
+}
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
new file mode 100644
index 000000000000..4276d0858c2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h
@@ -0,0 +1,40 @@
+//===-- BPFInstPrinter.h - Convert BPF MCInst to asm syntax -------*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a BPF MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+#define LLVM_LIB_TARGET_BPF_INSTPRINTER_BPFINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class BPFInstPrinter : public MCInstPrinter {
+public:
+  BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void printImm64Operand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
new file mode 100644
index 000000000000..a6cd2002c12c
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -0,0 +1,109 @@
+//===-- BPFAsmBackend.cpp - BPF Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class BPFAsmBackend : public MCAsmBackend {
+public:
+  bool IsLittleEndian;
+
+  BPFAsmBackend(bool IsLittleEndian)
+    : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
+  ~BPFAsmBackend() override {}
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  // No instruction requires relaxation
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override { return 1; }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if ((Count % 8) != 0)
+    return false;
+
+  for (uint64_t i = 0; i < Count; i += 8)
+    OW->write64(0x15000000);
+
+  return true;
+}
+
+void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value,
+                               bool IsPCRel) const {
+
+  if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
+    assert(Value == 0);
+  } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
+    unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
+
+    for (unsigned i = 0; i != Size; ++i) {
+      unsigned Idx = IsLittleEndian ? i : Size - i;
+      Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
+    }
+  } else {
+    assert(Fixup.getKind() == FK_PCRel_2);
+    Value = (uint16_t)((Value - 8) / 8);
+    if (IsLittleEndian) {
+      Data[Fixup.getOffset() + 2] = Value & 0xFF;
+      Data[Fixup.getOffset() + 3] = Value >> 8;
+    } else {
+      Data[Fixup.getOffset() + 2] = Value >> 8;
+      Data[Fixup.getOffset() + 3] = Value & 0xFF;
+    }
+  }
+}
+
+MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
+}
+}
+
+MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions&) {
+  return new BPFAsmBackend(/*IsLittleEndian=*/true);
+}
+
+MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions&) {
+  return new BPFAsmBackend(/*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
new file mode 100644
index 000000000000..3d1c0eb55afa
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -0,0 +1,58 @@
+//===-- BPFELFObjectWriter.cpp - BPF ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class BPFELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  BPFELFObjectWriter(uint8_t OSABI);
+
+  ~BPFELFObjectWriter() override;
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+}
+
+BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF,
+                              /*HasRelocationAddend*/ false) {}
+
+BPFELFObjectWriter::~BPFELFObjectWriter() {}
+
+unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  // determine the type of the relocation
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_SecRel_8:
+    return ELF::R_BPF_64_64;
+  case FK_SecRel_4:
+    return ELF::R_BPF_64_32;
+  case FK_Data_8:
+    return ELF::R_BPF_64_64;
+  case FK_Data_4:
+    return ELF::R_BPF_64_32;
+  }
+}
+
+MCObjectWriter *llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS,
+                                               uint8_t OSABI, bool IsLittleEndian) {
+  MCELFObjectTargetWriter *MOTW = new BPFELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
new file mode 100644
index 000000000000..559ac291a79e
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -0,0 +1,50 @@
+//===-- BPFMCAsmInfo.h - BPF asm properties -------------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the BPFMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+
+class BPFMCAsmInfo : public MCAsmInfo {
+public:
+  explicit BPFMCAsmInfo(const Triple &TT) {
+    if (TT.getArch() == Triple::bpfeb)
+      IsLittleEndian = false;
+
+    PrivateGlobalPrefix = ".L";
+    WeakRefDirective = "\t.weak\t";
+
+    UsesELFSectionDirectiveForBSS = true;
+    HasSingleParameterDotFile = false;
+    HasDotTypeDotSizeDirective = false;
+
+    SupportsDebugInformation = true;
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+    MinInstAlignment = 8;
+
+    // the default is 4 and it only affects dwarf elf output
+    // so if not set correctly, the dwarf data will be
+    // messed up in random places by 4 bytes. .debug_line
+    // section will be parsable, but with odd offsets and
+    // line numbers, etc.
+    PointerSize = 8;
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
new file mode 100644
index 000000000000..47f16512a397
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -0,0 +1,179 @@
+//===-- BPFMCCodeEmitter.cpp - Convert BPF code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BPFMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class BPFMCCodeEmitter : public MCCodeEmitter {
+  BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
+  void operator=(const BPFMCCodeEmitter &) = delete;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+  bool IsLittleEndian;
+
+public:
+  BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+                   bool IsLittleEndian)
+      : MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {}
+
+  ~BPFMCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machin
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+}
+
+MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            MCContext &Ctx) {
+  return new BPFMCCodeEmitter(MCII, MRI, true);
+}
+
+MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new BPFMCCodeEmitter(MCII, MRI, false);
+}
+
+unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  assert(MO.isExpr());
+
+  const MCExpr *Expr = MO.getExpr();
+
+  assert(Expr->getKind() == MCExpr::SymbolRef);
+
+  if (MI.getOpcode() == BPF::JAL)
+    // func call name
+    Fixups.push_back(MCFixup::create(0, Expr, FK_SecRel_4));
+  else if (MI.getOpcode() == BPF::LD_imm64)
+    Fixups.push_back(MCFixup::create(0, Expr, FK_SecRel_8));
+  else
+    // bb label
+    Fixups.push_back(MCFixup::create(0, Expr, FK_PCRel_2));
+
+  return 0;
+}
+
+static uint8_t SwapBits(uint8_t Val)
+{
+  return (Val & 0x0F) << 4 | (Val & 0xF0) >> 4;
+}
+
+void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  unsigned Opcode = MI.getOpcode();
+  support::endian::Writer<support::little> LE(OS);
+  support::endian::Writer<support::big> BE(OS);
+
+  if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
+    uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+    LE.write<uint8_t>(Value >> 56);
+    if (IsLittleEndian)
+      LE.write<uint8_t>((Value >> 48) & 0xff);
+    else
+      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
+    LE.write<uint16_t>(0);
+    if (IsLittleEndian)
+      LE.write<uint32_t>(Value & 0xffffFFFF);
+    else
+      BE.write<uint32_t>(Value & 0xffffFFFF);
+
+    const MCOperand &MO = MI.getOperand(1);
+    uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
+    LE.write<uint8_t>(0);
+    LE.write<uint8_t>(0);
+    LE.write<uint16_t>(0);
+    if (IsLittleEndian)
+      LE.write<uint32_t>(Imm >> 32);
+    else
+      BE.write<uint32_t>(Imm >> 32);
+  } else {
+    // Get instruction encoding and emit it
+    uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
+    LE.write<uint8_t>(Value >> 56);
+    if (IsLittleEndian) {
+      LE.write<uint8_t>((Value >> 48) & 0xff);
+      LE.write<uint16_t>((Value >> 32) & 0xffff);
+      LE.write<uint32_t>(Value & 0xffffFFFF);
+    } else {
+      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff));
+      BE.write<uint16_t>((Value >> 32) & 0xffff);
+      BE.write<uint32_t>(Value & 0xffffFFFF);
+    }
+  }
+}
+
+// Encode BPF Memory Operand
+uint64_t BPFMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  uint64_t Encoding;
+  const MCOperand Op1 = MI.getOperand(1);
+  assert(Op1.isReg() && "First operand is not register.");
+  Encoding = MRI.getEncodingValue(Op1.getReg());
+  Encoding <<= 16;
+  MCOperand Op2 = MI.getOperand(2);
+  assert(Op2.isImm() && "Second operand is not immediate.");
+  Encoding |= Op2.getImm() & 0xffff;
+  return Encoding;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "BPFGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
new file mode 100644
index 000000000000..55415f97396b
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -0,0 +1,116 @@
+//===-- BPFMCTargetDesc.cpp - BPF Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFMCTargetDesc.h"
+#include "BPFMCAsmInfo.h"
+#include "InstPrinter/BPFInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "BPFGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "BPFGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createBPFMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitBPFMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createBPFMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitBPFMCRegisterInfo(X, BPF::R11 /* RAReg doesn't exist */);
+  return X;
+}
+
+static MCSubtargetInfo *createBPFMCSubtargetInfo(const Triple &TT,
+                                                 StringRef CPU, StringRef FS) {
+  return createBPFMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCStreamer *createBPFMCStreamer(const Triple &T,
+                                       MCContext &Ctx, MCAsmBackend &MAB,
+                                       raw_pwrite_stream &OS, MCCodeEmitter *Emitter,
+                                       bool RelaxAll) {
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new BPFInstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+extern "C" void LLVMInitializeBPFTargetMC() {
+  for (Target *T :
+       {&getTheBPFleTarget(), &getTheBPFbeTarget(), &getTheBPFTarget()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfo<BPFMCAsmInfo> X(*T);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createBPFMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createBPFMCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T,
+                                            createBPFMCSubtargetInfo);
+
+    // Register the object streamer
+    TargetRegistry::RegisterELFStreamer(*T, createBPFMCStreamer);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createBPFMCInstPrinter);
+  }
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(getTheBPFleTarget(),
+                                        createBPFMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(getTheBPFbeTarget(),
+                                        createBPFbeMCCodeEmitter);
+
+  // Register the ASM Backend
+  TargetRegistry::RegisterMCAsmBackend(getTheBPFleTarget(),
+                                       createBPFAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheBPFbeTarget(),
+                                       createBPFbeAsmBackend);
+
+  if (sys::IsLittleEndianHost) {
+    TargetRegistry::RegisterMCCodeEmitter(getTheBPFTarget(),
+                                          createBPFMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
+                                         createBPFAsmBackend);
+  } else {
+    TargetRegistry::RegisterMCCodeEmitter(getTheBPFTarget(),
+                                          createBPFbeMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
+                                         createBPFbeAsmBackend);
+  }
+}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
new file mode 100644
index 000000000000..3df673eaeb4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -0,0 +1,71 @@
+//===-- BPFMCTargetDesc.h - BPF Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides BPF specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheBPFleTarget();
+Target &getTheBPFbeTarget();
+Target &getTheBPFTarget();
+
+MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      MCContext &Ctx);
+MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const MCTargetOptions &Options);
+MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU,
+                                    const MCTargetOptions &Options);
+
+MCObjectWriter *createBPFELFObjectWriter(raw_pwrite_stream &OS,
+                                         uint8_t OSABI, bool IsLittleEndian);
+}
+
+// Defines symbolic names for BPF registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "BPFGenRegisterInfo.inc"
+
+// Defines symbolic names for the BPF instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "BPFGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "BPFGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
new file mode 100644
index 000000000000..265180b99876
--- /dev/null
+++ b/contrib/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -0,0 +1,36 @@
+//===-- BPFTargetInfo.cpp - BPF Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+Target &getTheBPFleTarget() {
+  static Target TheBPFleTarget;
+  return TheBPFleTarget;
+}
+Target &getTheBPFbeTarget() {
+  static Target TheBPFbeTarget;
+  return TheBPFbeTarget;
+}
+Target &getTheBPFTarget() {
+  static Target TheBPFTarget;
+  return TheBPFTarget;
+}
+} // namespace llvm
+
+extern "C" void LLVMInitializeBPFTargetInfo() {
+  TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)",
+                                 [](Triple::ArchType) { return false; }, true);
+  RegisterTarget<Triple::bpfel, /*HasJIT=*/true> X(getTheBPFleTarget(), "bpfel",
+                                                   "BPF (little endian)");
+  RegisterTarget<Triple::bpfeb, /*HasJIT=*/true> Y(getTheBPFbeTarget(), "bpfeb",
+                                                   "BPF (big endian)");
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
new file mode 100644
index 000000000000..becc086c81b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -0,0 +1,2154 @@
+//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcasmparser"
+
+#include "Hexagon.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCExpr.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonShuffler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableFutureRegs("mfuture-regs",
+                                      cl::desc("Enable future registers"));
+
+static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
+cl::desc("Warn for missing parenthesis around predicate registers"),
+cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
+cl::desc("Error for missing parenthesis around predicate registers"),
+cl::init(false));
+static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
+cl::desc("Warn for mismatching a signed and unsigned value"),
+cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
+cl::desc("Warn for register names that arent contigious"),
+cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
+cl::desc("Error for register names that aren't contigious"),
+cl::init(false));
+
+namespace {
+
+struct HexagonOperand;
+
+class HexagonAsmParser : public MCTargetAsmParser {
+
+  HexagonTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+    return static_cast<HexagonTargetStreamer &>(TS);
+  }
+
+  MCAsmParser &Parser;
+  MCAssembler *Assembler;
+  MCInstrInfo const &MCII;
+  MCInst MCB;
+  bool InBrackets;
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAssembler *getAssembler() const { return Assembler; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool equalIsAsmAssignment() override { return false; }
+  bool isLabel(AsmToken &Token) override;
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  bool ParseDirectiveFalign(unsigned Size, SMLoc L);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseDirectiveSubsection(SMLoc L);
+  bool ParseDirectiveValue(unsigned Size, SMLoc L);
+  bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+  bool RegisterMatchesArch(unsigned MatchNum) const;
+
+  bool matchBundleOptions();
+  bool handleNoncontigiousRegister(bool Contigious, SMLoc &Loc);
+  bool finishBundle(SMLoc IDLoc, MCStreamer &Out);
+  void canonicalizeImmediates(MCInst &MCI);
+  bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc,
+                           OperandVector &InstOperands, uint64_t &ErrorInfo,
+                           bool MatchingInlineAsm);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+  bool OutOfRange(SMLoc IDLoc, long long Val, long long Max);
+  int processInstruction(MCInst &Inst, OperandVector const &Operands,
+                         SMLoc IDLoc);
+
+  // Check if we have an assembler and, if so, set the ELF e_header flags.
+  void chksetELFHeaderEFlags(unsigned flags) {
+    if (getAssembler())
+      getAssembler()->setELFHeaderEFlags(flags);
+  }
+
+  unsigned matchRegister(StringRef Name);
+
+/// @name Auto-generated Match Functions
+/// {
+
+#define GET_ASSEMBLER_HEADER
+#include "HexagonGenAsmMatcher.inc"
+
+  /// }
+
+public:
+  HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+                   const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, _STI), Parser(_Parser),
+      MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) {
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+    MCAsmParserExtension::Initialize(_Parser);
+
+    Assembler = nullptr;
+    // FIXME: need better way to detect AsmStreamer (upstream removed getKind())
+    if (!Parser.getStreamer().hasRawTextSupport()) {
+      MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer());
+      Assembler = &MES->getAssembler();
+    }
+  }
+
+  bool splitIdentifier(OperandVector &Operands);
+  bool parseOperand(OperandVector &Operands);
+  bool parseInstruction(OperandVector &Operands);
+  bool implicitExpressionLocation(OperandVector &Operands);
+  bool parseExpressionOrOperand(OperandVector &Operands);
+  bool parseExpression(MCExpr const *& Expr);
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override
+  {
+    llvm_unreachable("Unimplemented");
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, AsmToken ID,
+                        OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+};
+
+/// HexagonOperand - Instances of this class represent a parsed Hexagon machine
+/// instruction.
+struct HexagonOperand : public MCParsedAsmOperand {
+  enum KindTy { Token, Immediate, Register } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct TokTy {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegTy {
+    unsigned RegNum;
+  };
+
+  struct ImmTy {
+    const MCExpr *Val;
+  };
+
+  struct InstTy {
+    OperandVector *SubInsts;
+  };
+
+  union {
+    struct TokTy Tok;
+    struct RegTy Reg;
+    struct ImmTy Imm;
+  };
+
+  HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+public:
+  HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case Register:
+      Reg = o.Reg;
+      break;
+    case Immediate:
+      Imm = o.Imm;
+      break;
+    case Token:
+      Tok = o.Tok;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  unsigned getReg() const override {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override { return Kind == Immediate; }
+  bool isMem() const override { llvm_unreachable("No isMem"); }
+  bool isReg() const override { return Kind == Register; }
+
+  bool CheckImmRange(int immBits, int zeroBits, bool isSigned,
+                     bool isRelocatable, bool Extendable) const {
+    if (Kind == Immediate) {
+      const MCExpr *myMCExpr = &HexagonMCInstrInfo::getExpr(*getImm());
+      if (HexagonMCInstrInfo::mustExtend(*Imm.Val) && !Extendable)
+        return false;
+      int64_t Res;
+      if (myMCExpr->evaluateAsAbsolute(Res)) {
+        int bits = immBits + zeroBits;
+        // Field bit range is zerobits + bits
+        // zeroBits must be 0
+        if (Res & ((1 << zeroBits) - 1))
+          return false;
+        if (isSigned) {
+          if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1)))
+            return true;
+        } else {
+          if (bits == 64)
+            return true;
+          if (Res >= 0)
+            return ((uint64_t)Res < (uint64_t)(1ULL << bits));
+          else {
+            const int64_t high_bit_set = 1ULL << 63;
+            const uint64_t mask = (high_bit_set >> (63 - bits));
+            return (((uint64_t)Res & mask) == mask);
+          }
+        }
+      } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable)
+        return true;
+      else if (myMCExpr->getKind() == MCExpr::Binary ||
+               myMCExpr->getKind() == MCExpr::Unary)
+        return true;
+    }
+    return false;
+  }
+
+  bool isf32Ext() const { return false; }
+  bool iss32_0Imm() const { return CheckImmRange(32, 0, true, true, false); }
+  bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
+  bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
+  bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
+  bool iss7_0Imm() const { return CheckImmRange(7, 0, true, false, false); }
+  bool iss6_0Imm() const { return CheckImmRange(6, 0, true, false, false); }
+  bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
+  bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
+  bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
+  bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
+  bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
+  bool iss3_0Imm() const { return CheckImmRange(3, 0, true, false, false); }
+
+  bool isu64_0Imm() const { return CheckImmRange(64, 0, false, true, true); }
+  bool isu32_0Imm() const { return CheckImmRange(32, 0, false, true, false); }
+  bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
+  bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
+  bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
+  bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
+  bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
+  bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
+  bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+  bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+  bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
+  bool isu10_0Imm() const { return CheckImmRange(10, 0, false, false, false); }
+  bool isu9_0Imm() const { return CheckImmRange(9, 0, false, false, false); }
+  bool isu8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
+  bool isu7_0Imm() const { return CheckImmRange(7, 0, false, false, false); }
+  bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+  bool isu5_0Imm() const { return CheckImmRange(5, 0, false, false, false); }
+  bool isu4_0Imm() const { return CheckImmRange(4, 0, false, false, false); }
+  bool isu3_0Imm() const { return CheckImmRange(3, 0, false, false, false); }
+  bool isu2_0Imm() const { return CheckImmRange(2, 0, false, false, false); }
+  bool isu1_0Imm() const { return CheckImmRange(1, 0, false, false, false); }
+
+  bool ism6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+  bool isn8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
+  bool isn1Const() const {
+    if (!isImm())
+      return false;
+    int64_t Value;
+    if (!getImm()->evaluateAsAbsolute(Value))
+      return false;
+    return Value == -1;
+  }
+
+  bool iss16_0Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); }
+  bool iss12_0Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
+  bool iss10_0Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
+  bool iss9_0Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
+  bool iss8_0Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
+  bool iss7_0Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
+  bool iss6_0Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
+  bool iss11_0Ext() const {
+    return CheckImmRange(11 + 26, 0, true, true, true);
+  }
+  bool iss11_1Ext() const {
+    return CheckImmRange(11 + 26, 1, true, true, true);
+  }
+  bool iss11_2Ext() const {
+    return CheckImmRange(11 + 26, 2, true, true, true);
+  }
+  bool iss11_3Ext() const {
+    return CheckImmRange(11 + 26, 3, true, true, true);
+  }
+
+  bool isu7_0Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); }
+  bool isu8_0Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
+  bool isu9_0Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
+  bool isu10_0Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
+  bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
+  bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
+  bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
+  bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
+  bool isu32_0MustExt() const { return isImm(); }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createExpr(getImm()));
+  }
+
+  void addSignedImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    HexagonMCExpr *Expr =
+        const_cast<HexagonMCExpr *>(cast<HexagonMCExpr>(getImm()));
+    int64_t Value;
+    if (!Expr->evaluateAsAbsolute(Value)) {
+      Inst.addOperand(MCOperand::createExpr(Expr));
+      return;
+    }
+    int64_t Extended = SignExtend64(Value, 32);
+    if ((Extended < 0) != (Value < 0))
+      Expr->setSignMismatch();
+    Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addf32ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void adds32_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds8_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds8_0Imm64Operands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds6_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds3_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+
+  void addu64_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu32_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu10_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu9_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu8_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu7_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu5_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu4_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu3_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu2_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu1_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void addm6_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addn8_0ImmOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void adds16_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds12_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds10_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds9_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds8_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds6_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
+    addSignedImmOperands(Inst, N);
+  }
+  void addn1ConstOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void addu7_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu8_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu9_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu10_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+  void addu32_0MustExtOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
+  }
+
+  void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE =
+        dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
+    Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+  }
+
+  void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE =
+        dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
+    Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
+  }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) {
+    HexagonOperand *Op = new HexagonOperand(Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return std::unique_ptr<HexagonOperand>(Op);
+  }
+
+  static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S,
+                                                   SMLoc E) {
+    HexagonOperand *Op = new HexagonOperand(Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return std::unique_ptr<HexagonOperand>(Op);
+  }
+
+  static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                   SMLoc E) {
+    HexagonOperand *Op = new HexagonOperand(Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return std::unique_ptr<HexagonOperand>(Op);
+  }
+};
+
+} // end anonymous namespace
+
+void HexagonOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Immediate:
+    getImm()->print(OS, nullptr);
+    break;
+  case Register:
+    OS << "<register R";
+    OS << getReg() << ">";
+    break;
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  }
+}
+
+bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
+  DEBUG(dbgs() << "Bundle:");
+  DEBUG(MCB.dump_pretty(dbgs()));
+  DEBUG(dbgs() << "--\n");
+
+  // Check the bundle for errors.
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI);
+
+  bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(),
+                                                        getContext(), MCB,
+                                                        &Check);
+
+  while (Check.getNextErrInfo()) {
+    unsigned Reg = Check.getErrRegister();
+    Twine R(RI->getName(Reg));
+
+    uint64_t Err = Check.getError();
+    if (Err != HexagonMCErrInfo::CHECK_SUCCESS) {
+      if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err)
+        return Error(
+            IDLoc,
+            "unconditional branch cannot precede another branch in packet");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err ||
+          HexagonMCErrInfo::CHECK_ERROR_NEWV & Err)
+        return Error(IDLoc, "register `" + R +
+                                "' used with `.new' "
+                                "but not validly modified in the same packet");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err)
+        return Error(IDLoc, "register `" + R + "' modified more than once");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err)
+        return Error(IDLoc, "cannot write to read-only register `" + R + "'");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err)
+        return Error(IDLoc, "loop-setup and some branch instructions "
+                            "cannot be in the same packet");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) {
+        Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? '0' : '1');
+        return Error(IDLoc,
+                     "packet marked with `:endloop" + N + "' " +
+                         "cannot contain instructions that modify register " +
+                         "`" + R + "'");
+      }
+
+      if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err)
+        return Error(
+            IDLoc,
+            "instruction cannot appear in packet with other instructions");
+
+      if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err)
+        return Error(IDLoc, "too many slots used in packet");
+
+      if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) {
+        uint64_t Erm = Check.getShuffleError();
+
+        if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm)
+          return Error(IDLoc, "invalid instruction packet");
+        else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm)
+          return Error(IDLoc, "invalid instruction packet: too many stores");
+        else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm)
+          return Error(IDLoc, "invalid instruction packet: too many loads");
+        else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm)
+          return Error(IDLoc, "too many branches in packet");
+        else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm)
+          return Error(IDLoc, "invalid instruction packet: out of slots");
+        else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm)
+          return Error(IDLoc, "invalid instruction packet: slot error");
+        else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm)
+          return Error(IDLoc, "v60 packet violation");
+        else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm)
+          return Error(IDLoc, "slot 0 instruction does not allow slot 1 store");
+        else
+          return Error(IDLoc, "unknown error in instruction packet");
+      }
+    }
+
+    unsigned Warn = Check.getWarning();
+    if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) {
+      if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn)
+        Warning(IDLoc, "register `" + R + "' used with `.cur' "
+                                          "but not used in the same packet");
+      else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn)
+        Warning(IDLoc, "register `" + R + "' used with `.tmp' "
+                                          "but not used in the same packet");
+    }
+  }
+
+  if (CheckOk) {
+    MCB.setLoc(IDLoc);
+    if (HexagonMCInstrInfo::bundleSize(MCB) == 0) {
+      assert(!HexagonMCInstrInfo::isInnerLoop(MCB));
+      assert(!HexagonMCInstrInfo::isOuterLoop(MCB));
+      // Empty packets are valid yet aren't emitted
+      return false;
+    }
+    Out.EmitInstruction(MCB, getSTI());
+  } else {
+    // If compounding and duplexing didn't reduce the size below
+    // 4 or less we have a packet that is too big.
+    if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
+      Error(IDLoc, "invalid instruction packet: out of slots");
+      return true; // Error
+    }
+  }
+
+  return false; // No error
+}
+
+bool HexagonAsmParser::matchBundleOptions() {
+  MCAsmParser &Parser = getParser();
+  while (true) {
+    if (!Parser.getTok().is(AsmToken::Colon))
+      return false;
+    Lex();
+    StringRef Option = Parser.getTok().getString();
+    if (Option.compare_lower("endloop0") == 0)
+      HexagonMCInstrInfo::setInnerLoop(MCB);
+    else if (Option.compare_lower("endloop1") == 0)
+      HexagonMCInstrInfo::setOuterLoop(MCB);
+    else if (Option.compare_lower("mem_noshuf") == 0)
+      HexagonMCInstrInfo::setMemReorderDisabled(MCB);
+    else if (Option.compare_lower("mem_shuf") == 0)
+      HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
+    else
+      return true;
+    Lex();
+  }
+}
+
+// For instruction aliases, immediates are generated rather than
+// MCConstantExpr.  Convert them for uniform MCExpr.
+// Also check for signed/unsigned mismatches and warn
+void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
+  MCInst NewInst;
+  NewInst.setOpcode(MCI.getOpcode());
+  for (MCOperand &I : MCI)
+    if (I.isImm()) {
+      int64_t Value (I.getImm());
+      NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+          MCConstantExpr::create(Value, getContext()), getContext())));
+    }
+    else {
+      if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
+          WarnSignedMismatch)
+        Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
+      NewInst.addOperand(I);
+    }
+  MCI = NewInst;
+}
+
+bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
+                                           OperandVector &InstOperands,
+                                           uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  // Perform matching with tablegen asmmatcher generated function
+  int result =
+      MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm);
+  if (result == Match_Success) {
+    MCI.setLoc(IDLoc);
+    canonicalizeImmediates(MCI);
+    result = processInstruction(MCI, InstOperands, IDLoc);
+
+    DEBUG(dbgs() << "Insn:");
+    DEBUG(MCI.dump_pretty(dbgs()));
+    DEBUG(dbgs() << "\n\n");
+
+    MCI.setLoc(IDLoc);
+  }
+
+  // Create instruction operand for bundle instruction
+  //   Break this into a separate function Code here is less readable
+  //   Think about how to get an instruction error to report correctly.
+  //   SMLoc will return the "{"
+  switch (result) {
+  default:
+    break;
+  case Match_Success:
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "invalid instruction");
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction");
+  case Match_InvalidOperand:
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= InstOperands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get()))
+                     ->getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               uint64_t &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  if (!InBrackets) {
+    MCB.clear();
+    MCB.addOperand(MCOperand::createImm(0));
+  }
+  HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]);
+  if (FirstOperand.isToken() && FirstOperand.getToken() == "{") {
+    assert(Operands.size() == 1 && "Brackets should be by themselves");
+    if (InBrackets) {
+      getParser().Error(IDLoc, "Already in a packet");
+      return true;
+    }
+    InBrackets = true;
+    return false;
+  }
+  if (FirstOperand.isToken() && FirstOperand.getToken() == "}") {
+    assert(Operands.size() == 1 && "Brackets should be by themselves");
+    if (!InBrackets) {
+      getParser().Error(IDLoc, "Not in a packet");
+      return true;
+    }
+    InBrackets = false;
+    if (matchBundleOptions())
+      return true;
+    return finishBundle(IDLoc, Out);
+  }
+  MCInst *SubInst = new (getParser().getContext()) MCInst;
+  if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo,
+                          MatchingInlineAsm))
+    return true;
+  HexagonMCInstrInfo::extendIfNeeded(
+      getParser().getContext(), MCII, MCB, *SubInst);
+  MCB.addOperand(MCOperand::createInst(SubInst));
+  if (!InBrackets)
+    return finishBundle(IDLoc, Out);
+  return false;
+}
+
+/// ParseDirective parses the Hexagon specific directives
+bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte"))
+    return ParseDirectiveValue(4, DirectiveID.getLoc());
+  if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" ||
+      IDVal.lower() == ".half")
+    return ParseDirectiveValue(2, DirectiveID.getLoc());
+  if (IDVal.lower() == ".falign")
+    return ParseDirectiveFalign(256, DirectiveID.getLoc());
+  if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon"))
+    return ParseDirectiveComm(true, DirectiveID.getLoc());
+  if ((IDVal.lower() == ".comm") || (IDVal.lower() == ".common"))
+    return ParseDirectiveComm(false, DirectiveID.getLoc());
+  if (IDVal.lower() == ".subsection")
+    return ParseDirectiveSubsection(DirectiveID.getLoc());
+
+  return true;
+}
+bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) {
+  const MCExpr *Subsection = nullptr;
+  int64_t Res;
+
+  assert((getLexer().isNot(AsmToken::EndOfStatement)) &&
+         "Invalid subsection directive");
+  getParser().parseExpression(Subsection);
+
+  if (!Subsection->evaluateAsAbsolute(Res))
+    return Error(L, "Cannot evaluate subsection number");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  // 0-8192 is the hard-coded range in MCObjectStreamper.cpp, this keeps the
+  // negative subsections together and in the same order but at the opposite
+  // end of the section.  Only legacy hexagon-gcc created assembly code
+  // used negative subsections.
+  if ((Res < 0) && (Res > -8193))
+    Subsection = HexagonMCExpr::create(
+        MCConstantExpr::create(8192 + Res, getContext()), getContext());
+
+  getStreamer().SubSection(Subsection);
+  return false;
+}
+
+///  ::= .falign [expression]
+bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) {
+
+  int64_t MaxBytesToFill = 15;
+
+  // if there is an argument
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    const MCExpr *Value;
+    SMLoc ExprLoc = L;
+
+    // Make sure we have a number (false is returned if expression is a number)
+    if (!getParser().parseExpression(Value)) {
+      // Make sure this is a number that is in range
+      const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+      uint64_t IntValue = MCE->getValue();
+      if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue))
+        return Error(ExprLoc, "literal value out of range (256) for falign");
+      MaxBytesToFill = IntValue;
+      Lex();
+    } else {
+      return Error(ExprLoc, "not a valid expression for falign directive");
+    }
+  }
+
+  getTargetStreamer().emitFAlign(16, MaxBytesToFill);
+  Lex();
+
+  return false;
+}
+
+///  ::= .word [ expression (, expression)* ]
+bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    while (true) {
+      const MCExpr *Value;
+      SMLoc ExprLoc = L;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      // Special case constant expressions to match code generator.
+      if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+        assert(Size <= 8 && "Invalid size");
+        uint64_t IntValue = MCE->getValue();
+        if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+          return Error(ExprLoc, "literal value out of range for directive");
+        getStreamer().EmitIntValue(IntValue, Size);
+      } else
+        getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+// This is largely a copy of AsmParser's ParseDirectiveComm extended to
+// accept a 3rd argument, AccessAlignment which indicates the smallest
+// memory access made to the symbol, expressed in bytes.  If no
+// AccessAlignment is specified it defaults to the Alignment Value.
+// Hexagon's .lcomm:
+//   .lcomm Symbol, Length, Alignment, AccessAlignment
+bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) {
+  // FIXME: need better way to detect if AsmStreamer (upstream removed
+  // getKind())
+  if (getStreamer().hasRawTextSupport())
+    return true; // Only object file output requires special treatment.
+
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Size;
+  SMLoc SizeLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Size))
+    return true;
+
+  int64_t ByteAlignment = 1;
+  SMLoc ByteAlignmentLoc;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    ByteAlignmentLoc = getLexer().getLoc();
+    if (getParser().parseAbsoluteExpression(ByteAlignment))
+      return true;
+    if (!isPowerOf2_64(ByteAlignment))
+      return Error(ByteAlignmentLoc, "alignment must be a power of 2");
+  }
+
+  int64_t AccessAlignment = 0;
+  if (getLexer().is(AsmToken::Comma)) {
+    // The optional access argument specifies the size of the smallest memory
+    //   access to be made to the symbol, expressed in bytes.
+    SMLoc AccessAlignmentLoc;
+    Lex();
+    AccessAlignmentLoc = getLexer().getLoc();
+    if (getParser().parseAbsoluteExpression(AccessAlignment))
+      return true;
+
+    if (!isPowerOf2_64(AccessAlignment))
+      return Error(AccessAlignmentLoc, "access alignment must be a power of 2");
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.comm' or '.lcomm' directive");
+
+  Lex();
+
+  // NOTE: a size of zero for a .comm should create a undefined symbol
+  // but a size of .lcomm creates a bss symbol of size zero.
+  if (Size < 0)
+    return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+                          "be less than zero");
+
+  // NOTE: The alignment in the directive is a power of 2 value, the assembler
+  // may internally end up wanting an alignment in bytes.
+  // FIXME: Diagnose overflow.
+  if (ByteAlignment < 0)
+    return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+                                   "alignment, can't be less than zero");
+
+  if (!Sym->isUndefined())
+    return Error(Loc, "invalid symbol redefinition");
+
+  HexagonMCELFStreamer &HexagonELFStreamer =
+      static_cast<HexagonMCELFStreamer &>(getStreamer());
+  if (IsLocal) {
+    HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment,
+                                                      AccessAlignment);
+    return false;
+  }
+
+  HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment,
+                                               AccessAlignment);
+  return false;
+}
+
+// validate register against architecture
+bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+  return true;
+}
+
+// extern "C" void LLVMInitializeHexagonAsmLexer();
+
+/// Force static initialization.
+extern "C" void LLVMInitializeHexagonAsmParser() {
+  RegisterMCAsmParser<HexagonAsmParser> X(getTheHexagonTarget());
+}
+
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_REGISTER_MATCHER
+#include "HexagonGenAsmMatcher.inc"
+
+static bool previousEqual(OperandVector &Operands, size_t Index,
+                          StringRef String) {
+  if (Index >= Operands.size())
+    return false;
+  MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1];
+  if (!Operand.isToken())
+    return false;
+  return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String);
+}
+
+static bool previousIsLoop(OperandVector &Operands, size_t Index) {
+  return previousEqual(Operands, Index, "loop0") ||
+         previousEqual(Operands, Index, "loop1") ||
+         previousEqual(Operands, Index, "sp1loop0") ||
+         previousEqual(Operands, Index, "sp2loop0") ||
+         previousEqual(Operands, Index, "sp3loop0");
+}
+
+bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) {
+  AsmToken const &Token = getParser().getTok();
+  StringRef String = Token.getString();
+  SMLoc Loc = Token.getLoc();
+  Lex();
+  do {
+    std::pair<StringRef, StringRef> HeadTail = String.split('.');
+    if (!HeadTail.first.empty())
+      Operands.push_back(HexagonOperand::CreateToken(HeadTail.first, Loc));
+    if (!HeadTail.second.empty())
+      Operands.push_back(HexagonOperand::CreateToken(
+          String.substr(HeadTail.first.size(), 1), Loc));
+    String = HeadTail.second;
+  } while (!String.empty());
+  return false;
+}
+
+bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
+  unsigned Register;
+  SMLoc Begin;
+  SMLoc End;
+  MCAsmLexer &Lexer = getLexer();
+  if (!ParseRegister(Register, Begin, End)) {
+    if (!ErrorMissingParenthesis)
+      switch (Register) {
+      default:
+        break;
+      case Hexagon::P0:
+      case Hexagon::P1:
+      case Hexagon::P2:
+      case Hexagon::P3:
+        if (previousEqual(Operands, 0, "if")) {
+          if (WarnMissingParenthesis)
+            Warning (Begin, "Missing parenthesis around predicate register");
+          static char const *LParen = "(";
+          static char const *RParen = ")";
+          Operands.push_back(HexagonOperand::CreateToken(LParen, Begin));
+          Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+          const AsmToken &MaybeDotNew = Lexer.getTok();
+          if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+              MaybeDotNew.getString().equals_lower(".new"))
+            splitIdentifier(Operands);
+          Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+          return false;
+        }
+        if (previousEqual(Operands, 0, "!") &&
+            previousEqual(Operands, 1, "if")) {
+          if (WarnMissingParenthesis)
+            Warning (Begin, "Missing parenthesis around predicate register");
+          static char const *LParen = "(";
+          static char const *RParen = ")";
+          Operands.insert(Operands.end () - 1,
+                          HexagonOperand::CreateToken(LParen, Begin));
+          Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End));
+          const AsmToken &MaybeDotNew = Lexer.getTok();
+          if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) &&
+              MaybeDotNew.getString().equals_lower(".new"))
+            splitIdentifier(Operands);
+          Operands.push_back(HexagonOperand::CreateToken(RParen, Begin));
+          return false;
+        }
+        break;
+      }
+    Operands.push_back(HexagonOperand::CreateReg(
+        Register, Begin, End));
+    return false;
+  }
+  return splitIdentifier(Operands);
+}
+
+bool HexagonAsmParser::isLabel(AsmToken &Token) {
+  MCAsmLexer &Lexer = getLexer();
+  AsmToken const &Second = Lexer.getTok();
+  AsmToken Third = Lexer.peekTok();  
+  StringRef String = Token.getString();
+  if (Token.is(AsmToken::TokenKind::LCurly) ||
+      Token.is(AsmToken::TokenKind::RCurly))
+    return false;
+  if (!Token.is(AsmToken::TokenKind::Identifier))
+    return true;
+  if (!matchRegister(String.lower()))
+    return true;
+  (void)Second;
+  assert(Second.is(AsmToken::Colon));
+  StringRef Raw (String.data(), Third.getString().data() - String.data() +
+                 Third.getString().size());
+  std::string Collapsed = Raw;
+  Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end());
+  StringRef Whole = Collapsed;
+  std::pair<StringRef, StringRef> DotSplit = Whole.split('.');
+  if (!matchRegister(DotSplit.first.lower()))
+    return true;
+  return false;
+}
+
+bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) {
+  if (!Contigious && ErrorNoncontigiousRegister) {
+    Error(Loc, "Register name is not contigious");
+    return true;
+  }
+  if (!Contigious && WarnNoncontigiousRegister)
+    Warning(Loc, "Register name is not contigious");
+  return false;
+}
+
+bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmLexer &Lexer = getLexer();
+  StartLoc = getLexer().getLoc();
+  SmallVector<AsmToken, 5> Lookahead;
+  StringRef RawString(Lexer.getTok().getString().data(), 0);
+  bool Again = Lexer.is(AsmToken::Identifier);
+  bool NeededWorkaround = false;
+  while (Again) {
+    AsmToken const &Token = Lexer.getTok();
+    RawString = StringRef(RawString.data(),
+                          Token.getString().data() - RawString.data () +
+                          Token.getString().size());
+    Lookahead.push_back(Token);
+    Lexer.Lex();
+    bool Contigious = Lexer.getTok().getString().data() ==
+                      Lookahead.back().getString().data() +
+                      Lookahead.back().getString().size();
+    bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) ||
+                Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) ||
+                Lexer.is(AsmToken::Colon);
+    bool Workaround = Lexer.is(AsmToken::Colon) ||
+                      Lookahead.back().is(AsmToken::Colon);
+    Again = (Contigious && Type) || (Workaround && Type);
+    NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type));
+  }
+  std::string Collapsed = RawString;
+  Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end());
+  StringRef FullString = Collapsed;
+  std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
+  unsigned DotReg = matchRegister(DotSplit.first.lower());
+  if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
+    if (DotSplit.second.empty()) {
+      RegNo = DotReg;
+      EndLoc = Lexer.getLoc();
+      if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+        return true;
+      return false;
+    } else {
+      RegNo = DotReg;
+      size_t First = RawString.find('.');
+      StringRef DotString (RawString.data() + First, RawString.size() - First);
+      Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString));
+      EndLoc = Lexer.getLoc();
+      if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+        return true;
+      return false;
+    }
+  }
+  std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':');
+  unsigned ColonReg = matchRegister(ColonSplit.first.lower());
+  if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+    RegNo = ColonReg;
+    EndLoc = Lexer.getLoc();
+    if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
+      return true;
+    return false;
+  }
+  while (!Lookahead.empty()) {
+    Lexer.UnLex(Lookahead.back());
+    Lookahead.pop_back();
+  }
+  return true;
+}
+
+bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) {
+  if (previousEqual(Operands, 0, "call"))
+    return true;
+  if (previousEqual(Operands, 0, "jump"))
+    if (!getLexer().getTok().is(AsmToken::Colon))
+      return true;
+  if (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1))
+    return true;
+  if (previousEqual(Operands, 1, ":") && previousEqual(Operands, 2, "jump") &&
+      (previousEqual(Operands, 0, "nt") || previousEqual(Operands, 0, "t")))
+    return true;
+  return false;
+}
+
+bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) {
+  SmallVector<AsmToken, 4> Tokens;
+  MCAsmLexer &Lexer = getLexer();
+  bool Done = false;
+  static char const * Comma = ",";
+  do {
+    Tokens.emplace_back (Lexer.getTok());
+    Lex();
+    switch (Tokens.back().getKind())
+    {
+    case AsmToken::TokenKind::Hash:
+      if (Tokens.size () > 1)
+        if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) {
+          Tokens.insert(Tokens.end() - 2,
+                        AsmToken(AsmToken::TokenKind::Comma, Comma));
+          Done = true;
+        }
+      break;
+    case AsmToken::TokenKind::RCurly:
+    case AsmToken::TokenKind::EndOfStatement:
+    case AsmToken::TokenKind::Eof:
+      Done = true;
+      break;
+    default:
+      break;
+    }
+  } while (!Done);
+  while (!Tokens.empty()) {
+    Lexer.UnLex(Tokens.back());
+    Tokens.pop_back();
+  }
+  return getParser().parseExpression(Expr);
+}
+
+bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) {
+  if (implicitExpressionLocation(Operands)) {
+    MCAsmParser &Parser = getParser();
+    SMLoc Loc = Parser.getLexer().getLoc();
+    MCExpr const *Expr = nullptr;
+    bool Error = parseExpression(Expr);
+    Expr = HexagonMCExpr::create(Expr, getContext());
+    if (!Error)
+      Operands.push_back(HexagonOperand::CreateImm(Expr, Loc, Loc));
+    return Error;
+  }
+  return parseOperand(Operands);
+}
+
+/// Parse an instruction.
+bool HexagonAsmParser::parseInstruction(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  while (true) {
+    AsmToken const &Token = Parser.getTok();
+    switch (Token.getKind()) {
+    case AsmToken::EndOfStatement: {
+      Lex();
+      return false;
+    }
+    case AsmToken::LCurly: {
+      if (!Operands.empty())
+        return true;
+      Operands.push_back(
+          HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+      Lex();
+      return false;
+    }
+    case AsmToken::RCurly: {
+      if (Operands.empty()) {
+        Operands.push_back(
+            HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+        Lex();
+      }
+      return false;
+    }
+    case AsmToken::Comma: {
+      Lex();
+      continue;
+    }
+    case AsmToken::EqualEqual:
+    case AsmToken::ExclaimEqual:
+    case AsmToken::GreaterEqual:
+    case AsmToken::GreaterGreater:
+    case AsmToken::LessEqual:
+    case AsmToken::LessLess: {
+      Operands.push_back(HexagonOperand::CreateToken(
+          Token.getString().substr(0, 1), Token.getLoc()));
+      Operands.push_back(HexagonOperand::CreateToken(
+          Token.getString().substr(1, 1), Token.getLoc()));
+      Lex();
+      continue;
+    }
+    case AsmToken::Hash: {
+      bool MustNotExtend = false;
+      bool ImplicitExpression = implicitExpressionLocation(Operands);
+      SMLoc ExprLoc = Lexer.getLoc();
+      if (!ImplicitExpression)
+        Operands.push_back(
+          HexagonOperand::CreateToken(Token.getString(), Token.getLoc()));
+      Lex();
+      bool MustExtend = false;
+      bool HiOnly = false;
+      bool LoOnly = false;
+      if (Lexer.is(AsmToken::Hash)) {
+        Lex();
+        MustExtend = true;
+      } else if (ImplicitExpression)
+        MustNotExtend = true;
+      AsmToken const &Token = Parser.getTok();
+      if (Token.is(AsmToken::Identifier)) {
+        StringRef String = Token.getString();
+        if (String.lower() == "hi") {
+          HiOnly = true;
+        } else if (String.lower() == "lo") {
+          LoOnly = true;
+        }
+        if (HiOnly || LoOnly) {
+          AsmToken LParen = Lexer.peekTok();
+          if (!LParen.is(AsmToken::LParen)) {
+            HiOnly = false;
+            LoOnly = false;
+          } else {
+            Lex();
+          }
+        }
+      }
+      MCExpr const *Expr = nullptr;
+      if (parseExpression(Expr))
+        return true;
+      int64_t Value;
+      MCContext &Context = Parser.getContext();
+      assert(Expr != nullptr);
+      if (Expr->evaluateAsAbsolute(Value)) {
+        if (HiOnly)
+          Expr = MCBinaryExpr::createLShr(
+              Expr,  MCConstantExpr::create(16, Context), Context);
+        if (HiOnly || LoOnly)
+          Expr = MCBinaryExpr::createAnd(Expr,
+              MCConstantExpr::create(0xffff, Context),
+                                    Context);
+      } else {
+        MCValue Value;
+        if (Expr->evaluateAsRelocatable(Value, nullptr, nullptr)) {
+          if (!Value.isAbsolute()) {
+            switch(Value.getAccessVariant()) {
+            case MCSymbolRefExpr::VariantKind::VK_TPREL:
+            case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+              // Don't lazy extend these expression variants
+              MustNotExtend = !MustExtend;
+              break;
+            default:
+              break;
+            }
+          }
+        }
+      }
+      Expr = HexagonMCExpr::create(Expr, Context);
+      HexagonMCInstrInfo::setMustNotExtend(*Expr, MustNotExtend);
+      HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+      std::unique_ptr<HexagonOperand> Operand =
+          HexagonOperand::CreateImm(Expr, ExprLoc, ExprLoc);
+      Operands.push_back(std::move(Operand));
+      continue;
+    }
+    default:
+      break;
+    }
+    if (parseExpressionOrOperand(Operands))
+      return true;
+  }
+}
+
+bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name,
+                                        AsmToken ID,
+                                        OperandVector &Operands) {
+  getLexer().UnLex(ID);
+  return parseInstruction(Operands);
+}
+
+static MCInst makeCombineInst(int opCode, MCOperand &Rdd,
+                              MCOperand &MO1, MCOperand &MO2) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(opCode);
+  TmpInst.addOperand(Rdd);
+  TmpInst.addOperand(MO1);
+  TmpInst.addOperand(MO2);
+
+  return TmpInst;
+}
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                      unsigned Kind) {
+  HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp);
+
+  switch (Kind) {
+  case MCK_0: {
+    int64_t Value;
+    return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0
+               ? Match_Success
+               : Match_InvalidOperand;
+  }
+  case MCK_1: {
+    int64_t Value;
+    return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1
+               ? Match_Success
+               : Match_InvalidOperand;
+  }
+  }
+  if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) {
+    StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length);
+    if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind)
+      return Match_Success;
+    if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind)
+      return Match_Success;
+  }
+
+  DEBUG(dbgs() << "Unmatched Operand:");
+  DEBUG(Op->dump());
+  DEBUG(dbgs() << "\n");
+
+  return Match_InvalidOperand;
+}
+
+// FIXME: Calls to OutOfRange shoudl propagate failure up to parseStatement.
+bool HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) {
+  std::string errStr;
+  raw_string_ostream ES(errStr);
+  ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: ";
+  if (Max >= 0)
+    ES << "0-" << Max;
+  else
+    ES << Max << "-" << (-Max - 1);
+  return Parser.printError(IDLoc, ES.str());
+}
+
+int HexagonAsmParser::processInstruction(MCInst &Inst,
+                                         OperandVector const &Operands,
+                                         SMLoc IDLoc) {
+  MCContext &Context = getParser().getContext();
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  std::string r = "r";
+  std::string v = "v";
+  std::string Colon = ":";
+
+  bool is32bit = false; // used to distinguish between CONST32 and CONST64
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+
+  case Hexagon::A2_iconst: {
+    Inst.setOpcode(Hexagon::A2_addi);
+    MCOperand Reg = Inst.getOperand(0);
+    MCOperand S16 = Inst.getOperand(1);
+    HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+    HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+    Inst.clear();
+    Inst.addOperand(Reg);
+    Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+    Inst.addOperand(S16);
+    break;
+  }
+  case Hexagon::M4_mpyrr_addr:
+  case Hexagon::S4_addi_asl_ri:
+  case Hexagon::S4_addi_lsr_ri:
+  case Hexagon::S4_andi_asl_ri:
+  case Hexagon::S4_andi_lsr_ri:
+  case Hexagon::S4_ori_asl_ri:
+  case Hexagon::S4_ori_lsr_ri:
+  case Hexagon::S4_or_andix:
+  case Hexagon::S4_subi_asl_ri:
+  case Hexagon::S4_subi_lsr_ri: {
+    MCOperand &Ry = Inst.getOperand(0);
+    MCOperand &src = Inst.getOperand(2);
+    if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg()))
+      return Match_InvalidOperand;
+    break;
+  }
+
+  case Hexagon::C2_cmpgei: {
+    MCOperand &MO = Inst.getOperand(2);
+    MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+        MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+    Inst.setOpcode(Hexagon::C2_cmpgti);
+    break;
+  }
+
+  case Hexagon::C2_cmpgeui: {
+    MCOperand &MO = Inst.getOperand(2);
+    int64_t Value;
+    bool Success = MO.getExpr()->evaluateAsAbsolute(Value);
+    (void)Success;
+    assert(Success && "Assured by matcher");
+    if (Value == 0) {
+      MCInst TmpInst;
+      MCOperand &Pd = Inst.getOperand(0);
+      MCOperand &Rt = Inst.getOperand(1);
+      TmpInst.setOpcode(Hexagon::C2_cmpeq);
+      TmpInst.addOperand(Pd);
+      TmpInst.addOperand(Rt);
+      TmpInst.addOperand(Rt);
+      Inst = TmpInst;
+    } else {
+      MO.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+          MO.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+      Inst.setOpcode(Hexagon::C2_cmpgtui);
+    }
+    break;
+  }
+
+  // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+  case Hexagon::A2_tfrp: {
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+    std::string R1 = r + utostr(RegPairNum + 1);
+    StringRef Reg1(R1);
+    MO.setReg(matchRegister(Reg1));
+    // Add a new operand for the second register in the pair.
+    std::string R2 = r + utostr(RegPairNum);
+    StringRef Reg2(R2);
+    Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+    Inst.setOpcode(Hexagon::A2_combinew);
+    break;
+  }
+
+  case Hexagon::A2_tfrpt:
+  case Hexagon::A2_tfrpf: {
+    MCOperand &MO = Inst.getOperand(2);
+    unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+    std::string R1 = r + utostr(RegPairNum + 1);
+    StringRef Reg1(R1);
+    MO.setReg(matchRegister(Reg1));
+    // Add a new operand for the second register in the pair.
+    std::string R2 = r + utostr(RegPairNum);
+    StringRef Reg2(R2);
+    Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+    Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+                       ? Hexagon::C2_ccombinewt
+                       : Hexagon::C2_ccombinewf);
+    break;
+  }
+  case Hexagon::A2_tfrptnew:
+  case Hexagon::A2_tfrpfnew: {
+    MCOperand &MO = Inst.getOperand(2);
+    unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+    std::string R1 = r + utostr(RegPairNum + 1);
+    StringRef Reg1(R1);
+    MO.setReg(matchRegister(Reg1));
+    // Add a new operand for the second register in the pair.
+    std::string R2 = r + utostr(RegPairNum);
+    StringRef Reg2(R2);
+    Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+    Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+                       ? Hexagon::C2_ccombinewnewt
+                       : Hexagon::C2_ccombinewnewf);
+    break;
+  }
+
+  // Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)"
+  case Hexagon::V6_vassignp: {
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
+    std::string R1 = v + utostr(RegPairNum + 1);
+    MO.setReg(MatchRegisterName(R1));
+    // Add a new operand for the second register in the pair.
+    std::string R2 = v + utostr(RegPairNum);
+    Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2)));
+    Inst.setOpcode(Hexagon::V6_vcombine);
+    break;
+  }
+
+  // Translate a "$Rx =  CONST32(#imm)" to "$Rx = memw(gp+#LABEL) "
+  case Hexagon::CONST32:
+    is32bit = true;
+  // Translate a "$Rx:y =  CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) "
+  case Hexagon::CONST64:
+    // FIXME: need better way to detect AsmStreamer (upstream removed getKind())
+    if (!Parser.getStreamer().hasRawTextSupport()) {
+      MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer());
+      MCOperand &MO_1 = Inst.getOperand(1);
+      MCOperand &MO_0 = Inst.getOperand(0);
+
+      // push section onto section stack
+      MES->PushSection();
+
+      std::string myCharStr;
+      MCSectionELF *mySection;
+
+      // check if this as an immediate or a symbol
+      int64_t Value;
+      bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value);
+      if (Absolute) {
+        // Create a new section - one for each constant
+        // Some or all of the zeros are replaced with the given immediate.
+        if (is32bit) {
+          std::string myImmStr = utohexstr(static_cast<uint32_t>(Value));
+          myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000")
+                          .drop_back(myImmStr.size())
+                          .str() +
+                      myImmStr;
+        } else {
+          std::string myImmStr = utohexstr(Value);
+          myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000")
+                          .drop_back(myImmStr.size())
+                          .str() +
+                      myImmStr;
+        }
+
+        mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+                                               ELF::SHF_ALLOC | ELF::SHF_WRITE);
+      } else if (MO_1.isExpr()) {
+        // .lita - for expressions
+        myCharStr = ".lita";
+        mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS,
+                                               ELF::SHF_ALLOC | ELF::SHF_WRITE);
+      } else
+        llvm_unreachable("unexpected type of machine operand!");
+
+      MES->SwitchSection(mySection);
+      unsigned byteSize = is32bit ? 4 : 8;
+      getStreamer().EmitCodeAlignment(byteSize, byteSize);
+
+      MCSymbol *Sym;
+
+      // for symbols, get rid of prepended ".gnu.linkonce.lx."
+
+      // emit symbol if needed
+      if (Absolute) {
+        Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16));
+        if (Sym->isUndefined()) {
+          getStreamer().EmitLabel(Sym);
+          getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+          getStreamer().EmitIntValue(Value, byteSize);
+        }
+      } else if (MO_1.isExpr()) {
+        const char *StringStart = nullptr;
+        const char *StringEnd = nullptr;
+        if (*Operands[4]->getStartLoc().getPointer() == '#') {
+          StringStart = Operands[5]->getStartLoc().getPointer();
+          StringEnd = Operands[6]->getStartLoc().getPointer();
+        } else { // no pound
+          StringStart = Operands[4]->getStartLoc().getPointer();
+          StringEnd = Operands[5]->getStartLoc().getPointer();
+        }
+
+        unsigned size = StringEnd - StringStart;
+        std::string DotConst = ".CONST_";
+        Sym = getContext().getOrCreateSymbol(DotConst +
+                                             StringRef(StringStart, size));
+
+        if (Sym->isUndefined()) {
+          // case where symbol is not yet defined: emit symbol
+          getStreamer().EmitLabel(Sym);
+          getStreamer().EmitSymbolAttribute(Sym, MCSA_Local);
+          getStreamer().EmitValue(MO_1.getExpr(), 4);
+        }
+      } else
+        llvm_unreachable("unexpected type of machine operand!");
+
+      MES->PopSection();
+
+      if (Sym) {
+        MCInst TmpInst;
+        if (is32bit) // 32 bit
+          TmpInst.setOpcode(Hexagon::L2_loadrigp);
+        else // 64 bit
+          TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+
+        TmpInst.addOperand(MO_0);
+        TmpInst.addOperand(
+            MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+        Inst = TmpInst;
+      }
+    }
+    break;
+
+  // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)"
+  case Hexagon::A2_tfrpi: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &MO = Inst.getOperand(1);
+    int64_t Value;
+    int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? -1 : 0;
+    MCOperand imm(MCOperand::createExpr(
+        HexagonMCExpr::create(MCConstantExpr::create(sVal, Context), Context)));
+    Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO);
+    break;
+  }
+
+  // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)"
+  case Hexagon::TFRI64_V4: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &MO = Inst.getOperand(1);
+    int64_t Value;
+    if (MO.getExpr()->evaluateAsAbsolute(Value)) {
+      int s8 = Hi_32(Value);
+      if (!isInt<8>(s8))
+        OutOfRange(IDLoc, s8, -128);
+      MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+          MCConstantExpr::create(s8, Context), Context))); // upper 32
+      auto Expr = HexagonMCExpr::create(
+          MCConstantExpr::create(Lo_32(Value), Context), Context);
+      HexagonMCInstrInfo::setMustExtend(*Expr, HexagonMCInstrInfo::mustExtend(*MO.getExpr()));
+      MCOperand imm2(MCOperand::createExpr(Expr)); // lower 32
+      Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2);
+    } else {
+      MCOperand imm(MCOperand::createExpr(HexagonMCExpr::create(
+          MCConstantExpr::create(0, Context), Context))); // upper 32
+      Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO);
+    }
+    break;
+  }
+
+  // Handle $Rdd = combine(##imm, #imm)"
+  case Hexagon::TFRI64_V2_ext: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &MO1 = Inst.getOperand(1);
+    MCOperand &MO2 = Inst.getOperand(2);
+    int64_t Value;
+    if (MO2.getExpr()->evaluateAsAbsolute(Value)) {
+      int s8 = Value;
+      if (s8 < -128 || s8 > 127)
+        OutOfRange(IDLoc, s8, -128);
+    }
+    Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2);
+    break;
+  }
+
+  // Handle $Rdd = combine(#imm, ##imm)"
+  case Hexagon::A4_combineii: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &MO1 = Inst.getOperand(1);
+    int64_t Value;
+    if (MO1.getExpr()->evaluateAsAbsolute(Value)) {
+      int s8 = Value;
+      if (s8 < -128 || s8 > 127)
+        OutOfRange(IDLoc, s8, -128);
+    }
+    MCOperand &MO2 = Inst.getOperand(2);
+    Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2);
+    break;
+  }
+
+  case Hexagon::S2_tableidxb_goodsyntax:
+    Inst.setOpcode(Hexagon::S2_tableidxb);
+    break;
+
+  case Hexagon::S2_tableidxh_goodsyntax: {
+    MCInst TmpInst;
+    MCOperand &Rx = Inst.getOperand(0);
+    MCOperand &_dst_ = Inst.getOperand(1);
+    MCOperand &Rs = Inst.getOperand(2);
+    MCOperand &Imm4 = Inst.getOperand(3);
+    MCOperand &Imm6 = Inst.getOperand(4);
+    Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+        Imm6.getExpr(), MCConstantExpr::create(1, Context), Context), Context));
+    TmpInst.setOpcode(Hexagon::S2_tableidxh);
+    TmpInst.addOperand(Rx);
+    TmpInst.addOperand(_dst_);
+    TmpInst.addOperand(Rs);
+    TmpInst.addOperand(Imm4);
+    TmpInst.addOperand(Imm6);
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::S2_tableidxw_goodsyntax: {
+    MCInst TmpInst;
+    MCOperand &Rx = Inst.getOperand(0);
+    MCOperand &_dst_ = Inst.getOperand(1);
+    MCOperand &Rs = Inst.getOperand(2);
+    MCOperand &Imm4 = Inst.getOperand(3);
+    MCOperand &Imm6 = Inst.getOperand(4);
+    Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+        Imm6.getExpr(), MCConstantExpr::create(2, Context), Context), Context));
+    TmpInst.setOpcode(Hexagon::S2_tableidxw);
+    TmpInst.addOperand(Rx);
+    TmpInst.addOperand(_dst_);
+    TmpInst.addOperand(Rs);
+    TmpInst.addOperand(Imm4);
+    TmpInst.addOperand(Imm6);
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::S2_tableidxd_goodsyntax: {
+    MCInst TmpInst;
+    MCOperand &Rx = Inst.getOperand(0);
+    MCOperand &_dst_ = Inst.getOperand(1);
+    MCOperand &Rs = Inst.getOperand(2);
+    MCOperand &Imm4 = Inst.getOperand(3);
+    MCOperand &Imm6 = Inst.getOperand(4);
+    Imm6.setExpr(HexagonMCExpr::create(MCBinaryExpr::createSub(
+        Imm6.getExpr(), MCConstantExpr::create(3, Context), Context), Context));
+    TmpInst.setOpcode(Hexagon::S2_tableidxd);
+    TmpInst.addOperand(Rx);
+    TmpInst.addOperand(_dst_);
+    TmpInst.addOperand(Rs);
+    TmpInst.addOperand(Imm4);
+    TmpInst.addOperand(Imm6);
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::M2_mpyui:
+    Inst.setOpcode(Hexagon::M2_mpyi);
+    break;
+  case Hexagon::M2_mpysmi: {
+    MCInst TmpInst;
+    MCOperand &Rd = Inst.getOperand(0);
+    MCOperand &Rs = Inst.getOperand(1);
+    MCOperand &Imm = Inst.getOperand(2);
+    int64_t Value;
+    MCExpr const &Expr = *Imm.getExpr();
+    bool Absolute = Expr.evaluateAsAbsolute(Value);
+    assert(Absolute);
+    (void)Absolute;
+    if (!HexagonMCInstrInfo::mustExtend(Expr)) {
+      if (Value < 0 && Value > -256) {
+        Imm.setExpr(HexagonMCExpr::create(
+            MCConstantExpr::create(Value * -1, Context), Context));
+        TmpInst.setOpcode(Hexagon::M2_mpysin);
+      } else if (Value < 256 && Value >= 0)
+        TmpInst.setOpcode(Hexagon::M2_mpysip);
+      else
+        return Match_InvalidOperand;
+    } else {
+      if (Value >= 0)
+        TmpInst.setOpcode(Hexagon::M2_mpysip);
+      else
+        return Match_InvalidOperand;
+    }
+    TmpInst.addOperand(Rd);
+    TmpInst.addOperand(Rs);
+    TmpInst.addOperand(Imm);
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+    MCOperand &Imm = Inst.getOperand(2);
+    MCInst TmpInst;
+    int64_t Value;
+    bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);
+    (void)Absolute;
+    if (Value == 0) { // convert to $Rd = $Rs
+      TmpInst.setOpcode(Hexagon::A2_tfr);
+      MCOperand &Rd = Inst.getOperand(0);
+      MCOperand &Rs = Inst.getOperand(1);
+      TmpInst.addOperand(Rd);
+      TmpInst.addOperand(Rs);
+    } else {
+      Imm.setExpr(HexagonMCExpr::create(
+          MCBinaryExpr::createSub(Imm.getExpr(),
+                                  MCConstantExpr::create(1, Context), Context),
+          Context));
+      TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+      MCOperand &Rd = Inst.getOperand(0);
+      MCOperand &Rs = Inst.getOperand(1);
+      TmpInst.addOperand(Rd);
+      TmpInst.addOperand(Rs);
+      TmpInst.addOperand(Imm);
+    }
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &Rss = Inst.getOperand(1);
+    MCOperand &Imm = Inst.getOperand(2);
+    int64_t Value;
+    bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);
+    (void)Absolute;
+    if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1])
+      MCInst TmpInst;
+      unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+      std::string R1 = r + utostr(RegPairNum + 1);
+      StringRef Reg1(R1);
+      Rss.setReg(matchRegister(Reg1));
+      // Add a new operand for the second register in the pair.
+      std::string R2 = r + utostr(RegPairNum);
+      StringRef Reg2(R2);
+      TmpInst.setOpcode(Hexagon::A2_combinew);
+      TmpInst.addOperand(Rdd);
+      TmpInst.addOperand(Rss);
+      TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+      Inst = TmpInst;
+    } else {
+      Imm.setExpr(HexagonMCExpr::create(
+          MCBinaryExpr::createSub(Imm.getExpr(),
+                                  MCConstantExpr::create(1, Context), Context),
+          Context));
+      Inst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+    }
+    break;
+  }
+
+  case Hexagon::A4_boundscheck: {
+    MCOperand &Rs = Inst.getOperand(1);
+    unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+    if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+      Inst.setOpcode(Hexagon::A4_boundscheck_hi);
+      std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+      StringRef RegPair = Name;
+      Rs.setReg(matchRegister(RegPair));
+    } else { // raw:lo
+      Inst.setOpcode(Hexagon::A4_boundscheck_lo);
+      std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+      StringRef RegPair = Name;
+      Rs.setReg(matchRegister(RegPair));
+    }
+    break;
+  }
+
+  case Hexagon::A2_addsp: {
+    MCOperand &Rs = Inst.getOperand(1);
+    unsigned int RegNum = RI->getEncodingValue(Rs.getReg());
+    if (RegNum & 1) { // Odd mapped to raw:hi
+      Inst.setOpcode(Hexagon::A2_addsph);
+      std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+      StringRef RegPair = Name;
+      Rs.setReg(matchRegister(RegPair));
+    } else { // Even mapped raw:lo
+      Inst.setOpcode(Hexagon::A2_addspl);
+      std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+      StringRef RegPair = Name;
+      Rs.setReg(matchRegister(RegPair));
+    }
+    break;
+  }
+
+  case Hexagon::M2_vrcmpys_s1: {
+    MCOperand &Rt = Inst.getOperand(2);
+    unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+    if (RegNum & 1) { // Odd mapped to sat:raw:hi
+      Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+      std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    } else { // Even mapped sat:raw:lo
+      Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+      std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    }
+    break;
+  }
+
+  case Hexagon::M2_vrcmpys_acc_s1: {
+    MCInst TmpInst;
+    MCOperand &Rxx = Inst.getOperand(0);
+    MCOperand &Rss = Inst.getOperand(2);
+    MCOperand &Rt = Inst.getOperand(3);
+    unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+    if (RegNum & 1) { // Odd mapped to sat:raw:hi
+      TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+      std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    } else { // Even mapped sat:raw:lo
+      TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+      std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    }
+    // Registers are in different positions
+    TmpInst.addOperand(Rxx);
+    TmpInst.addOperand(Rxx);
+    TmpInst.addOperand(Rss);
+    TmpInst.addOperand(Rt);
+    Inst = TmpInst;
+    break;
+  }
+
+  case Hexagon::M2_vrcmpys_s1rp: {
+    MCOperand &Rt = Inst.getOperand(2);
+    unsigned int RegNum = RI->getEncodingValue(Rt.getReg());
+    if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi
+      Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+      std::string Name = r + utostr(RegNum) + Colon + utostr(RegNum - 1);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    } else { // Even mapped rnd:sat:raw:lo
+      Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+      std::string Name = r + utostr(RegNum + 1) + Colon + utostr(RegNum);
+      StringRef RegPair = Name;
+      Rt.setReg(matchRegister(RegPair));
+    }
+    break;
+  }
+
+  case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+    MCOperand &Imm = Inst.getOperand(2);
+    int64_t Value;
+    bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);
+    (void)Absolute;
+    if (Value == 0)
+      Inst.setOpcode(Hexagon::S2_vsathub);
+    else {
+      Imm.setExpr(HexagonMCExpr::create(
+          MCBinaryExpr::createSub(Imm.getExpr(),
+                                  MCConstantExpr::create(1, Context), Context),
+          Context));
+      Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+    }
+    break;
+  }
+
+  case Hexagon::S5_vasrhrnd_goodsyntax: {
+    MCOperand &Rdd = Inst.getOperand(0);
+    MCOperand &Rss = Inst.getOperand(1);
+    MCOperand &Imm = Inst.getOperand(2);
+    int64_t Value;
+    bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);
+    (void)Absolute;
+    if (Value == 0) {
+      MCInst TmpInst;
+      unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg());
+      std::string R1 = r + utostr(RegPairNum + 1);
+      StringRef Reg1(R1);
+      Rss.setReg(matchRegister(Reg1));
+      // Add a new operand for the second register in the pair.
+      std::string R2 = r + utostr(RegPairNum);
+      StringRef Reg2(R2);
+      TmpInst.setOpcode(Hexagon::A2_combinew);
+      TmpInst.addOperand(Rdd);
+      TmpInst.addOperand(Rss);
+      TmpInst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+      Inst = TmpInst;
+    } else {
+      Imm.setExpr(HexagonMCExpr::create(
+          MCBinaryExpr::createSub(Imm.getExpr(),
+                                  MCConstantExpr::create(1, Context), Context),
+          Context));
+      Inst.setOpcode(Hexagon::S5_vasrhrnd);
+    }
+    break;
+  }
+
+  case Hexagon::A2_not: {
+    MCInst TmpInst;
+    MCOperand &Rd = Inst.getOperand(0);
+    MCOperand &Rs = Inst.getOperand(1);
+    TmpInst.setOpcode(Hexagon::A2_subri);
+    TmpInst.addOperand(Rd);
+    TmpInst.addOperand(MCOperand::createExpr(
+        HexagonMCExpr::create(MCConstantExpr::create(-1, Context), Context)));
+    TmpInst.addOperand(Rs);
+    Inst = TmpInst;
+    break;
+  }
+  } // switch
+
+  return Match_Success;
+}
+
+unsigned HexagonAsmParser::matchRegister(StringRef Name) {
+  if (unsigned Reg = MatchRegisterName(Name))
+    return Reg;
+  return MatchRegisterAltName(Name);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
new file mode 100644
index 000000000000..c0591c332dea
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -0,0 +1,1144 @@
+//===--- BitTracker.cpp ---------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// SSA-based bit propagation.
+//
+// The purpose of this code is, for a given virtual register, to provide
+// information about the value of each bit in the register. The values
+// of bits are represented by the class BitValue, and take one of four
+// cases: 0, 1, "ref" and "bottom". The 0 and 1 are rather clear, the
+// "ref" value means that the bit is a copy of another bit (which itself
+// cannot be a copy of yet another bit---such chains are not allowed).
+// A "ref" value is associated with a BitRef structure, which indicates
+// which virtual register, and which bit in that register is the origin
+// of the value. For example, given an instruction
+//   vreg2 = ASL vreg1, 1
+// assuming that nothing is known about bits of vreg1, bit 1 of vreg2
+// will be a "ref" to (vreg1, 0). If there is a subsequent instruction
+//   vreg3 = ASL vreg2, 2
+// then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well.
+// The "bottom" case means that the bit's value cannot be determined,
+// and that this virtual register actually defines it. The "bottom" case
+// is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref
+// to self", so for the vreg1 above, the bit 0 of it will be a "ref" to
+// (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc.
+//
+// The tracker implements the Wegman-Zadeck algorithm, originally developed
+// for SSA-based constant propagation. Each register is represented as
+// a sequence of bits, with the convention that bit 0 is the least signi-
+// ficant bit. Each bit is propagated individually. The class RegisterCell
+// implements the register's representation, and is also the subject of
+// the lattice operations in the tracker.
+//
+// The intended usage of the bit tracker is to create a target-specific
+// machine instruction evaluator, pass the evaluator to the BitTracker
+// object, and run the tracker. The tracker will then collect the bit
+// value information for a given machine function. After that, it can be
+// queried for the cells for each virtual register.
+// Sample code:
+//   const TargetSpecificEvaluator TSE(TRI, MRI);
+//   BitTracker BT(TSE, MF);
+//   BT.run();
+//   ...
+//   unsigned Reg = interestingRegister();
+//   RegisterCell RC = BT.get(Reg);
+//   if (RC[3].is(1))
+//      Reg0bit3 = 1;
+//
+// The code below is intended to be fully target-independent.
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include "BitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+namespace {
+  // Local trickery to pretty print a register (without the whole "%vreg"
+  // business).
+  struct printv {
+    printv(unsigned r) : R(r) {}
+    unsigned R;
+  };
+  raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
+    if (PV.R)
+      OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
+    else
+      OS << 's';
+    return OS;
+  }
+}
+
+namespace llvm {
+  raw_ostream &operator<<(raw_ostream &OS, const BT::BitValue &BV) {
+    switch (BV.Type) {
+      case BT::BitValue::Top:
+        OS << 'T';
+        break;
+      case BT::BitValue::Zero:
+        OS << '0';
+        break;
+      case BT::BitValue::One:
+        OS << '1';
+        break;
+      case BT::BitValue::Ref:
+        OS << printv(BV.RefI.Reg) << '[' << BV.RefI.Pos << ']';
+        break;
+    }
+    return OS;
+  }
+
+  raw_ostream &operator<<(raw_ostream &OS, const BT::RegisterCell &RC) {
+    unsigned n = RC.Bits.size();
+    OS << "{ w:" << n;
+    // Instead of printing each bit value individually, try to group them
+    // into logical segments, such as sequences of 0 or 1 bits or references
+    // to consecutive bits (e.g. "bits 3-5 are same as bits 7-9 of reg xyz").
+    // "Start" will be the index of the beginning of the most recent segment.
+    unsigned Start = 0;
+    bool SeqRef = false;    // A sequence of refs to consecutive bits.
+    bool ConstRef = false;  // A sequence of refs to the same bit.
+
+    for (unsigned i = 1, n = RC.Bits.size(); i < n; ++i) {
+      const BT::BitValue &V = RC[i];
+      const BT::BitValue &SV = RC[Start];
+      bool IsRef = (V.Type == BT::BitValue::Ref);
+      // If the current value is the same as Start, skip to the next one.
+      if (!IsRef && V == SV)
+        continue;
+      if (IsRef && SV.Type == BT::BitValue::Ref && V.RefI.Reg == SV.RefI.Reg) {
+        if (Start+1 == i) {
+          SeqRef = (V.RefI.Pos == SV.RefI.Pos+1);
+          ConstRef = (V.RefI.Pos == SV.RefI.Pos);
+        }
+        if (SeqRef && V.RefI.Pos == SV.RefI.Pos+(i-Start))
+          continue;
+        if (ConstRef && V.RefI.Pos == SV.RefI.Pos)
+          continue;
+      }
+
+      // The current value is different. Print the previous one and reset
+      // the Start.
+      OS << " [" << Start;
+      unsigned Count = i - Start;
+      if (Count == 1) {
+        OS << "]:" << SV;
+      } else {
+        OS << '-' << i-1 << "]:";
+        if (SV.Type == BT::BitValue::Ref && SeqRef)
+          OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+             << SV.RefI.Pos+(Count-1) << ']';
+        else
+          OS << SV;
+      }
+      Start = i;
+      SeqRef = ConstRef = false;
+    }
+
+    OS << " [" << Start;
+    unsigned Count = n - Start;
+    if (n-Start == 1) {
+      OS << "]:" << RC[Start];
+    } else {
+      OS << '-' << n-1 << "]:";
+      const BT::BitValue &SV = RC[Start];
+      if (SV.Type == BT::BitValue::Ref && SeqRef)
+        OS << printv(SV.RefI.Reg) << '[' << SV.RefI.Pos << '-'
+           << SV.RefI.Pos+(Count-1) << ']';
+      else
+        OS << SV;
+    }
+    OS << " }";
+
+    return OS;
+  }
+}
+
+void BitTracker::print_cells(raw_ostream &OS) const {
+  for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
+    dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
+}
+
+
+BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
+    : Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
+
+BitTracker::~BitTracker() {
+  delete &Map;
+}
+
+
+// If we were allowed to update a cell for a part of a register, the meet
+// operation would need to be parametrized by the register number and the
+// exact part of the register, so that the computer BitRefs correspond to
+// the actual bits of the "self" register.
+// While this cannot happen in the current implementation, I'm not sure
+// if this should be ruled out in the future.
+bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
+  // An example when "meet" can be invoked with SelfR == 0 is a phi node
+  // with a physical register as an operand.
+  assert(SelfR == 0 || TargetRegisterInfo::isVirtualRegister(SelfR));
+  bool Changed = false;
+  for (uint16_t i = 0, n = Bits.size(); i < n; ++i) {
+    const BitValue &RCV = RC[i];
+    Changed |= Bits[i].meet(RCV, BitRef(SelfR, i));
+  }
+  return Changed;
+}
+
+
+// Insert the entire cell RC into the current cell at position given by M.
+BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
+      const BitMask &M) {
+  uint16_t B = M.first(), E = M.last(), W = width();
+  // Sanity: M must be a valid mask for *this.
+  assert(B < W && E < W);
+  // Sanity: the masked part of *this must have the same number of bits
+  // as the source.
+  assert(B > E || E-B+1 == RC.width());      // B <= E  =>  E-B+1 = |RC|.
+  assert(B <= E || E+(W-B)+1 == RC.width()); // E < B   =>  E+(W-B)+1 = |RC|.
+  if (B <= E) {
+    for (uint16_t i = 0; i <= E-B; ++i)
+      Bits[i+B] = RC[i];
+  } else {
+    for (uint16_t i = 0; i < W-B; ++i)
+      Bits[i+B] = RC[i];
+    for (uint16_t i = 0; i <= E; ++i)
+      Bits[i] = RC[i+(W-B)];
+  }
+  return *this;
+}
+
+
+BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
+  uint16_t B = M.first(), E = M.last(), W = width();
+  assert(B < W && E < W);
+  if (B <= E) {
+    RegisterCell RC(E-B+1);
+    for (uint16_t i = B; i <= E; ++i)
+      RC.Bits[i-B] = Bits[i];
+    return RC;
+  }
+
+  RegisterCell RC(E+(W-B)+1);
+  for (uint16_t i = 0; i < W-B; ++i)
+    RC.Bits[i] = Bits[i+B];
+  for (uint16_t i = 0; i <= E; ++i)
+    RC.Bits[i+(W-B)] = Bits[i];
+  return RC;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
+  // Rotate left (i.e. towards increasing bit indices).
+  // Swap the two parts:  [0..W-Sh-1] [W-Sh..W-1]
+  uint16_t W = width();
+  Sh = Sh % W;
+  if (Sh == 0)
+    return *this;
+
+  RegisterCell Tmp(W-Sh);
+  // Tmp = [0..W-Sh-1].
+  for (uint16_t i = 0; i < W-Sh; ++i)
+    Tmp[i] = Bits[i];
+  // Shift [W-Sh..W-1] to [0..Sh-1].
+  for (uint16_t i = 0; i < Sh; ++i)
+    Bits[i] = Bits[W-Sh+i];
+  // Copy Tmp to [Sh..W-1].
+  for (uint16_t i = 0; i < W-Sh; ++i)
+    Bits[i+Sh] = Tmp.Bits[i];
+  return *this;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
+      const BitValue &V) {
+  assert(B <= E);
+  while (B < E)
+    Bits[B++] = V;
+  return *this;
+}
+
+
+BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
+  // Append the cell given as the argument to the "this" cell.
+  // Bit 0 of RC becomes bit W of the result, where W is this->width().
+  uint16_t W = width(), WRC = RC.width();
+  Bits.resize(W+WRC);
+  for (uint16_t i = 0; i < WRC; ++i)
+    Bits[i+W] = RC.Bits[i];
+  return *this;
+}
+
+
+uint16_t BT::RegisterCell::ct(bool B) const {
+  uint16_t W = width();
+  uint16_t C = 0;
+  BitValue V = B;
+  while (C < W && Bits[C] == V)
+    C++;
+  return C;
+}
+
+
+uint16_t BT::RegisterCell::cl(bool B) const {
+  uint16_t W = width();
+  uint16_t C = 0;
+  BitValue V = B;
+  while (C < W && Bits[W-(C+1)] == V)
+    C++;
+  return C;
+}
+
+
+bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
+  uint16_t W = Bits.size();
+  if (RC.Bits.size() != W)
+    return false;
+  for (uint16_t i = 0; i < W; ++i)
+    if (Bits[i] != RC[i])
+      return false;
+  return true;
+}
+
+
+uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
+  // The general problem is with finding a register class that corresponds
+  // to a given reference reg:sub. There can be several such classes, and
+  // since we only care about the register size, it does not matter which
+  // such class we would find.
+  // The easiest way to accomplish what we want is to
+  // 1. find a physical register PhysR from the same class as RR.Reg,
+  // 2. find a physical register PhysS that corresponds to PhysR:RR.Sub,
+  // 3. find a register class that contains PhysS.
+  unsigned PhysR;
+  if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) {
+    const TargetRegisterClass *VC = MRI.getRegClass(RR.Reg);
+    assert(VC->begin() != VC->end() && "Empty register class");
+    PhysR = *VC->begin();
+  } else {
+    assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg));
+    PhysR = RR.Reg;
+  }
+
+  unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub);
+  const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS);
+  uint16_t BW = RC->getSize()*8;
+  return BW;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
+      const CellMapType &M) const {
+  uint16_t BW = getRegBitWidth(RR);
+
+  // Physical registers are assumed to be present in the map with an unknown
+  // value. Don't actually insert anything in the map, just return the cell.
+  if (TargetRegisterInfo::isPhysicalRegister(RR.Reg))
+    return RegisterCell::self(0, BW);
+
+  assert(TargetRegisterInfo::isVirtualRegister(RR.Reg));
+  // For virtual registers that belong to a class that is not tracked,
+  // generate an "unknown" value as well.
+  const TargetRegisterClass *C = MRI.getRegClass(RR.Reg);
+  if (!track(C))
+    return RegisterCell::self(0, BW);
+
+  CellMapType::const_iterator F = M.find(RR.Reg);
+  if (F != M.end()) {
+    if (!RR.Sub)
+      return F->second;
+    BitMask M = mask(RR.Reg, RR.Sub);
+    return F->second.extract(M);
+  }
+  // If not found, create a "top" entry, but do not insert it in the map.
+  return RegisterCell::top(BW);
+}
+
+
+void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
+      CellMapType &M) const {
+  // While updating the cell map can be done in a meaningful way for
+  // a part of a register, it makes little sense to implement it as the
+  // SSA representation would never contain such "partial definitions".
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return;
+  assert(RR.Sub == 0 && "Unexpected sub-register in definition");
+  // Eliminate all ref-to-reg-0 bit values: replace them with "self".
+  for (unsigned i = 0, n = RC.width(); i < n; ++i) {
+    const BitValue &V = RC[i];
+    if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+      RC[i].RefI = BitRef(RR.Reg, i);
+  }
+  M[RR.Reg] = RC;
+}
+
+
+// Check if the cell represents a compile-time integer value.
+bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i)
+    if (!A[i].is(0) && !A[i].is(1))
+      return false;
+  return true;
+}
+
+
+// Convert a cell to the integer value. The result must fit in uint64_t.
+uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
+  assert(isInt(A));
+  uint64_t Val = 0;
+  uint16_t W = A.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    Val <<= 1;
+    Val |= A[i].is(1);
+  }
+  return Val;
+}
+
+
+// Evaluator helper functions. These implement some common operation on
+// register cells that can be used to implement target-specific instructions
+// in a target-specific evaluator.
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
+  RegisterCell Res(W);
+  // For bits beyond the 63rd, this will generate the sign bit of V.
+  for (uint16_t i = 0; i < W; ++i) {
+    Res[i] = BitValue(V & 1);
+    V >>= 1;
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
+  const APInt &A = CI->getValue();
+  uint16_t BW = A.getBitWidth();
+  assert((unsigned)BW == A.getBitWidth() && "BitWidth overflow");
+  RegisterCell Res(BW);
+  for (uint16_t i = 0; i < BW; ++i)
+    Res[i] = A[i];
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Carry = false;
+  uint16_t I;
+  for (I = 0; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (!V1.num() || !V2.num())
+      break;
+    unsigned S = bool(V1) + bool(V2) + Carry;
+    Res[I] = BitValue(S & 1);
+    Carry = (S > 1);
+  }
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    // If the next bit is same as Carry, the result will be 0 plus the
+    // other bit. The Carry bit will remain unchanged.
+    if (V1.is(Carry))
+      Res[I] = BitValue::ref(V2);
+    else if (V2.is(Carry))
+      Res[I] = BitValue::ref(V1);
+    else
+      break;
+  }
+  for (; I < W; ++I)
+    Res[I] = BitValue::self();
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  bool Borrow = false;
+  uint16_t I;
+  for (I = 0; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (!V1.num() || !V2.num())
+      break;
+    unsigned S = bool(V1) - bool(V2) - Borrow;
+    Res[I] = BitValue(S & 1);
+    Borrow = (S > 1);
+  }
+  for (; I < W; ++I) {
+    const BitValue &V1 = A1[I];
+    const BitValue &V2 = A2[I];
+    if (V1.is(Borrow)) {
+      Res[I] = BitValue::ref(V2);
+      break;
+    }
+    if (V2.is(Borrow))
+      Res[I] = BitValue::ref(V1);
+    else
+      break;
+  }
+  for (; I < W; ++I)
+    Res[I] = BitValue::self();
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width() + A2.width();
+  uint16_t Z = A1.ct(0) + A2.ct(0);
+  RegisterCell Res(W);
+  Res.fill(0, Z, BitValue::Zero);
+  Res.fill(Z, W, BitValue::self());
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width() + A2.width();
+  uint16_t Z = A1.ct(0) + A2.ct(0);
+  RegisterCell Res(W);
+  Res.fill(0, Z, BitValue::Zero);
+  Res.fill(Z, W, BitValue::self());
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
+      uint16_t Sh) const {
+  assert(Sh <= A1.width());
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res.rol(Sh);
+  Res.fill(0, Sh, BitValue::Zero);
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
+      uint16_t Sh) const {
+  uint16_t W = A1.width();
+  assert(Sh <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res.rol(W-Sh);
+  Res.fill(W-Sh, W, BitValue::Zero);
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
+      uint16_t Sh) const {
+  uint16_t W = A1.width();
+  assert(Sh <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  BitValue Sign = Res[W-1];
+  Res.rol(W-Sh);
+  Res.fill(W-Sh, W, Sign);
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    if (V1.is(1))
+      Res[i] = BitValue::ref(V2);
+    else if (V2.is(1))
+      Res[i] = BitValue::ref(V1);
+    else if (V1.is(0) || V2.is(0))
+      Res[i] = BitValue::Zero;
+    else if (V1 == V2)
+      Res[i] = V1;
+    else
+      Res[i] = BitValue::self();
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    if (V1.is(1) || V2.is(1))
+      Res[i] = BitValue::One;
+    else if (V1.is(0))
+      Res[i] = BitValue::ref(V2);
+    else if (V2.is(0))
+      Res[i] = BitValue::ref(V1);
+    else if (V1 == V2)
+      Res[i] = V1;
+    else
+      Res[i] = BitValue::self();
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
+      const RegisterCell &A2) const {
+  uint16_t W = A1.width();
+  assert(W == A2.width());
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V1 = A1[i];
+    const BitValue &V2 = A2[i];
+    if (V1.is(0))
+      Res[i] = BitValue::ref(V2);
+    else if (V2.is(0))
+      Res[i] = BitValue::ref(V1);
+    else if (V1 == V2)
+      Res[i] = BitValue::Zero;
+    else
+      Res[i] = BitValue::self();
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
+  uint16_t W = A1.width();
+  RegisterCell Res(W);
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitValue &V = A1[i];
+    if (V.is(0))
+      Res[i] = BitValue::One;
+    else if (V.is(1))
+      Res[i] = BitValue::Zero;
+    else
+      Res[i] = BitValue::self();
+  }
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
+      uint16_t BitN) const {
+  assert(BitN < A1.width());
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res[BitN] = BitValue::One;
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
+      uint16_t BitN) const {
+  assert(BitN < A1.width());
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res[BitN] = BitValue::Zero;
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
+      uint16_t W) const {
+  uint16_t C = A1.cl(B), AW = A1.width();
+  // If the last leading non-B bit is not a constant, then we don't know
+  // the real count.
+  if ((C < AW && A1[AW-1-C].num()) || C == AW)
+    return eIMM(C, W);
+  return RegisterCell::self(0, W);
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
+      uint16_t W) const {
+  uint16_t C = A1.ct(B), AW = A1.width();
+  // If the last trailing non-B bit is not a constant, then we don't know
+  // the real count.
+  if ((C < AW && A1[C].num()) || C == AW)
+    return eIMM(C, W);
+  return RegisterCell::self(0, W);
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
+      uint16_t FromN) const {
+  uint16_t W = A1.width();
+  assert(FromN <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  BitValue Sign = Res[FromN-1];
+  // Sign-extend "inreg".
+  Res.fill(FromN, W, Sign);
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
+      uint16_t FromN) const {
+  uint16_t W = A1.width();
+  assert(FromN <= W);
+  RegisterCell Res = RegisterCell::ref(A1);
+  Res.fill(FromN, W, BitValue::Zero);
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
+      uint16_t B, uint16_t E) const {
+  uint16_t W = A1.width();
+  assert(B < W && E <= W);
+  if (B == E)
+    return RegisterCell(0);
+  uint16_t Last = (E > 0) ? E-1 : W-1;
+  RegisterCell Res = RegisterCell::ref(A1).extract(BT::BitMask(B, Last));
+  // Return shorter cell.
+  return Res;
+}
+
+
+BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
+      const RegisterCell &A2, uint16_t AtN) const {
+  uint16_t W1 = A1.width(), W2 = A2.width();
+  (void)W1;
+  assert(AtN < W1 && AtN+W2 <= W1);
+  // Copy bits from A1, insert A2 at position AtN.
+  RegisterCell Res = RegisterCell::ref(A1);
+  if (W2 > 0)
+    Res.insert(RegisterCell::ref(A2), BT::BitMask(AtN, AtN+W2-1));
+  return Res;
+}
+
+
+BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
+  assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
+  uint16_t W = getRegBitWidth(Reg);
+  assert(W > 0 && "Cannot generate mask for empty register");
+  return BitMask(0, W-1);
+}
+
+bool BT::MachineEvaluator::evaluate(const MachineInstr &MI,
+                                    const CellMapType &Inputs,
+                                    CellMapType &Outputs) const {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case TargetOpcode::REG_SEQUENCE: {
+      RegisterRef RD = MI.getOperand(0);
+      assert(RD.Sub == 0);
+      RegisterRef RS = MI.getOperand(1);
+      unsigned SS = MI.getOperand(2).getImm();
+      RegisterRef RT = MI.getOperand(3);
+      unsigned ST = MI.getOperand(4).getImm();
+      assert(SS != ST);
+
+      uint16_t W = getRegBitWidth(RD);
+      RegisterCell Res(W);
+      Res.insert(RegisterCell::ref(getCell(RS, Inputs)), mask(RD.Reg, SS));
+      Res.insert(RegisterCell::ref(getCell(RT, Inputs)), mask(RD.Reg, ST));
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    case TargetOpcode::COPY: {
+      // COPY can transfer a smaller register into a wider one.
+      // If that is the case, fill the remaining high bits with 0.
+      RegisterRef RD = MI.getOperand(0);
+      RegisterRef RS = MI.getOperand(1);
+      assert(RD.Sub == 0);
+      uint16_t WD = getRegBitWidth(RD);
+      uint16_t WS = getRegBitWidth(RS);
+      assert(WD >= WS);
+      RegisterCell Src = getCell(RS, Inputs);
+      RegisterCell Res(WD);
+      Res.insert(Src, BitMask(0, WS-1));
+      Res.fill(WS, WD, BitValue::Zero);
+      putCell(RD, Res, Outputs);
+      break;
+    }
+
+    default:
+      return false;
+  }
+
+  return true;
+}
+
+
+// Main W-Z implementation.
+
+void BT::visitPHI(const MachineInstr &PI) {
+  int ThisN = PI.getParent()->getNumber();
+  if (Trace)
+    dbgs() << "Visit FI(BB#" << ThisN << "): " << PI;
+
+  const MachineOperand &MD = PI.getOperand(0);
+  assert(MD.getSubReg() == 0 && "Unexpected sub-register in definition");
+  RegisterRef DefRR(MD);
+  uint16_t DefBW = ME.getRegBitWidth(DefRR);
+
+  RegisterCell DefC = ME.getCell(DefRR, Map);
+  if (DefC == RegisterCell::self(DefRR.Reg, DefBW))    // XXX slow
+    return;
+
+  bool Changed = false;
+
+  for (unsigned i = 1, n = PI.getNumOperands(); i < n; i += 2) {
+    const MachineBasicBlock *PB = PI.getOperand(i + 1).getMBB();
+    int PredN = PB->getNumber();
+    if (Trace)
+      dbgs() << "  edge BB#" << PredN << "->BB#" << ThisN;
+    if (!EdgeExec.count(CFGEdge(PredN, ThisN))) {
+      if (Trace)
+        dbgs() << " not executable\n";
+      continue;
+    }
+
+    RegisterRef RU = PI.getOperand(i);
+    RegisterCell ResC = ME.getCell(RU, Map);
+    if (Trace)
+      dbgs() << " input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+             << " cell: " << ResC << "\n";
+    Changed |= DefC.meet(ResC, DefRR.Reg);
+  }
+
+  if (Changed) {
+    if (Trace)
+      dbgs() << "Output: " << PrintReg(DefRR.Reg, &ME.TRI, DefRR.Sub)
+             << " cell: " << DefC << "\n";
+    ME.putCell(DefRR, DefC, Map);
+    visitUsesOf(DefRR.Reg);
+  }
+}
+
+void BT::visitNonBranch(const MachineInstr &MI) {
+  if (Trace) {
+    int ThisN = MI.getParent()->getNumber();
+    dbgs() << "Visit MI(BB#" << ThisN << "): " << MI;
+  }
+  if (MI.isDebugValue())
+    return;
+  assert(!MI.isBranch() && "Unexpected branch instruction");
+
+  CellMapType ResMap;
+  bool Eval = ME.evaluate(MI, Map, ResMap);
+
+  if (Trace && Eval) {
+    for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      RegisterRef RU(MO);
+      dbgs() << "  input reg: " << PrintReg(RU.Reg, &ME.TRI, RU.Sub)
+             << " cell: " << ME.getCell(RU, Map) << "\n";
+    }
+    dbgs() << "Outputs:\n";
+    for (CellMapType::iterator I = ResMap.begin(), E = ResMap.end();
+         I != E; ++I) {
+      RegisterRef RD(I->first);
+      dbgs() << "  " << PrintReg(I->first, &ME.TRI) << " cell: "
+             << ME.getCell(RD, ResMap) << "\n";
+    }
+  }
+
+  // Iterate over all definitions of the instruction, and update the
+  // cells accordingly.
+  for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    // Visit register defs only.
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    RegisterRef RD(MO);
+    assert(RD.Sub == 0 && "Unexpected sub-register in definition");
+    if (!TargetRegisterInfo::isVirtualRegister(RD.Reg))
+      continue;
+
+    bool Changed = false;
+    if (!Eval || ResMap.count(RD.Reg) == 0) {
+      // Set to "ref" (aka "bottom").
+      uint16_t DefBW = ME.getRegBitWidth(RD);
+      RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW);
+      if (RefC != ME.getCell(RD, Map)) {
+        ME.putCell(RD, RefC, Map);
+        Changed = true;
+      }
+    } else {
+      RegisterCell DefC = ME.getCell(RD, Map);
+      RegisterCell ResC = ME.getCell(RD, ResMap);
+      // This is a non-phi instruction, so the values of the inputs come
+      // from the same registers each time this instruction is evaluated.
+      // During the propagation, the values of the inputs can become lowered
+      // in the sense of the lattice operation, which may cause different
+      // results to be calculated in subsequent evaluations. This should
+      // not cause the bottoming of the result in the map, since the new
+      // result is already reflecting the lowered inputs.
+      for (uint16_t i = 0, w = DefC.width(); i < w; ++i) {
+        BitValue &V = DefC[i];
+        // Bits that are already "bottom" should not be updated.
+        if (V.Type == BitValue::Ref && V.RefI.Reg == RD.Reg)
+          continue;
+        // Same for those that are identical in DefC and ResC.
+        if (V == ResC[i])
+          continue;
+        V = ResC[i];
+        Changed = true;
+      }
+      if (Changed)
+        ME.putCell(RD, DefC, Map);
+    }
+    if (Changed)
+      visitUsesOf(RD.Reg);
+  }
+}
+
+void BT::visitBranchesFrom(const MachineInstr &BI) {
+  const MachineBasicBlock &B = *BI.getParent();
+  MachineBasicBlock::const_iterator It = BI, End = B.end();
+  BranchTargetList Targets, BTs;
+  bool FallsThrough = true, DefaultToAll = false;
+  int ThisN = B.getNumber();
+
+  do {
+    BTs.clear();
+    const MachineInstr &MI = *It;
+    if (Trace)
+      dbgs() << "Visit BR(BB#" << ThisN << "): " << MI;
+    assert(MI.isBranch() && "Expecting branch instruction");
+    InstrExec.insert(&MI);
+    bool Eval = ME.evaluate(MI, Map, BTs, FallsThrough);
+    if (!Eval) {
+      // If the evaluation failed, we will add all targets. Keep going in
+      // the loop to mark all executable branches as such.
+      DefaultToAll = true;
+      FallsThrough = true;
+      if (Trace)
+        dbgs() << "  failed to evaluate: will add all CFG successors\n";
+    } else if (!DefaultToAll) {
+      // If evaluated successfully add the targets to the cumulative list.
+      if (Trace) {
+        dbgs() << "  adding targets:";
+        for (unsigned i = 0, n = BTs.size(); i < n; ++i)
+          dbgs() << " BB#" << BTs[i]->getNumber();
+        if (FallsThrough)
+          dbgs() << "\n  falls through\n";
+        else
+          dbgs() << "\n  does not fall through\n";
+      }
+      Targets.insert(BTs.begin(), BTs.end());
+    }
+    ++It;
+  } while (FallsThrough && It != End);
+
+  typedef MachineBasicBlock::const_succ_iterator succ_iterator;
+  if (!DefaultToAll) {
+    // Need to add all CFG successors that lead to EH landing pads.
+    // There won't be explicit branches to these blocks, but they must
+    // be processed.
+    for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) {
+      const MachineBasicBlock *SB = *I;
+      if (SB->isEHPad())
+        Targets.insert(SB);
+    }
+    if (FallsThrough) {
+      MachineFunction::const_iterator BIt = B.getIterator();
+      MachineFunction::const_iterator Next = std::next(BIt);
+      if (Next != MF.end())
+        Targets.insert(&*Next);
+    }
+  } else {
+    for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I)
+      Targets.insert(*I);
+  }
+
+  for (unsigned i = 0, n = Targets.size(); i < n; ++i) {
+    int TargetN = Targets[i]->getNumber();
+    FlowQ.push(CFGEdge(ThisN, TargetN));
+  }
+}
+
+
+void BT::visitUsesOf(unsigned Reg) {
+  if (Trace)
+    dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n";
+
+  typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+  use_iterator End = MRI.use_nodbg_end();
+  for (use_iterator I = MRI.use_nodbg_begin(Reg); I != End; ++I) {
+    MachineInstr *UseI = I->getParent();
+    if (!InstrExec.count(UseI))
+      continue;
+    if (UseI->isPHI())
+      visitPHI(*UseI);
+    else if (!UseI->isBranch())
+      visitNonBranch(*UseI);
+    else
+      visitBranchesFrom(*UseI);
+  }
+}
+
+
+BT::RegisterCell BT::get(RegisterRef RR) const {
+  return ME.getCell(RR, Map);
+}
+
+
+void BT::put(RegisterRef RR, const RegisterCell &RC) {
+  ME.putCell(RR, RC, Map);
+}
+
+
+// Replace all references to bits from OldRR with the corresponding bits
+// in NewRR.
+void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
+  assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map");
+  BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub);
+  BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub);
+  uint16_t OMB = OM.first(), OME = OM.last();
+  uint16_t NMB = NM.first(), NME = NM.last();
+  (void)NME;
+  assert((OME-OMB == NME-NMB) &&
+         "Substituting registers of different lengths");
+  for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
+    RegisterCell &RC = I->second;
+    for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+      BitValue &V = RC[i];
+      if (V.Type != BitValue::Ref || V.RefI.Reg != OldRR.Reg)
+        continue;
+      if (V.RefI.Pos < OMB || V.RefI.Pos > OME)
+        continue;
+      V.RefI.Reg = NewRR.Reg;
+      V.RefI.Pos += NMB-OMB;
+    }
+  }
+}
+
+
+// Check if the block has been "executed" during propagation. (If not, the
+// block is dead, but it may still appear to be reachable.)
+bool BT::reached(const MachineBasicBlock *B) const {
+  int BN = B->getNumber();
+  assert(BN >= 0);
+  for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end();
+       I != E; ++I) {
+    if (I->second == BN)
+      return true;
+  }
+  return false;
+}
+
+
+// Visit an individual instruction. This could be a newly added instruction,
+// or one that has been modified by an optimization.
+void BT::visit(const MachineInstr &MI) {
+  assert(!MI.isBranch() && "Only non-branches are allowed");
+  InstrExec.insert(&MI);
+  visitNonBranch(MI);
+  // The call to visitNonBranch could propagate the changes until a branch
+  // is actually visited. This could result in adding CFG edges to the flow
+  // queue. Since the queue won't be processed, clear it.
+  while (!FlowQ.empty())
+    FlowQ.pop();
+}
+
+
+void BT::reset() {
+  EdgeExec.clear();
+  InstrExec.clear();
+  Map.clear();
+}
+
+
+void BT::run() {
+  reset();
+  assert(FlowQ.empty());
+
+  typedef GraphTraits<const MachineFunction*> MachineFlowGraphTraits;
+  const MachineBasicBlock *Entry = MachineFlowGraphTraits::getEntryNode(&MF);
+
+  unsigned MaxBN = 0;
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    assert(I->getNumber() >= 0 && "Disconnected block");
+    unsigned BN = I->getNumber();
+    if (BN > MaxBN)
+      MaxBN = BN;
+  }
+
+  // Keep track of visited blocks.
+  BitVector BlockScanned(MaxBN+1);
+
+  int EntryN = Entry->getNumber();
+  // Generate a fake edge to get something to start with.
+  FlowQ.push(CFGEdge(-1, EntryN));
+
+  while (!FlowQ.empty()) {
+    CFGEdge Edge = FlowQ.front();
+    FlowQ.pop();
+
+    if (EdgeExec.count(Edge))
+      continue;
+    EdgeExec.insert(Edge);
+
+    const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
+    MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
+    // Visit PHI nodes first.
+    while (It != End && It->isPHI()) {
+      const MachineInstr &PI = *It++;
+      InstrExec.insert(&PI);
+      visitPHI(PI);
+    }
+
+    // If this block has already been visited through a flow graph edge,
+    // then the instructions have already been processed. Any updates to
+    // the cells would now only happen through visitUsesOf...
+    if (BlockScanned[Edge.second])
+      continue;
+    BlockScanned[Edge.second] = true;
+
+    // Visit non-branch instructions.
+    while (It != End && !It->isBranch()) {
+      const MachineInstr &MI = *It++;
+      InstrExec.insert(&MI);
+      visitNonBranch(MI);
+    }
+    // If block end has been reached, add the fall-through edge to the queue.
+    if (It == End) {
+      MachineFunction::const_iterator BIt = B.getIterator();
+      MachineFunction::const_iterator Next = std::next(BIt);
+      if (Next != MF.end() && B.isSuccessor(&*Next)) {
+        int ThisN = B.getNumber();
+        int NextN = Next->getNumber();
+        FlowQ.push(CFGEdge(ThisN, NextN));
+      }
+    } else {
+      // Handle the remaining sequence of branches. This function will update
+      // the work queue.
+      visitBranchesFrom(*It);
+    }
+  } // while (!FlowQ->empty())
+
+  if (Trace)
+    print_cells(dbgs() << "Cells after propagation:\n");
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
new file mode 100644
index 000000000000..74cafcd00b60
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
@@ -0,0 +1,438 @@
+//===--- BitTracker.h -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITTRACKER_H
+#define BITTRACKER_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+  class ConstantInt;
+  class MachineRegisterInfo;
+  class MachineBasicBlock;
+  class MachineInstr;
+  class MachineOperand;
+  class raw_ostream;
+
+struct BitTracker {
+  struct BitRef;
+  struct RegisterRef;
+  struct BitValue;
+  struct BitMask;
+  struct RegisterCell;
+  struct MachineEvaluator;
+
+  typedef SetVector<const MachineBasicBlock *> BranchTargetList;
+
+  typedef std::map<unsigned, RegisterCell> CellMapType;
+
+  BitTracker(const MachineEvaluator &E, MachineFunction &F);
+  ~BitTracker();
+
+  void run();
+  void trace(bool On = false) { Trace = On; }
+  bool has(unsigned Reg) const;
+  const RegisterCell &lookup(unsigned Reg) const;
+  RegisterCell get(RegisterRef RR) const;
+  void put(RegisterRef RR, const RegisterCell &RC);
+  void subst(RegisterRef OldRR, RegisterRef NewRR);
+  bool reached(const MachineBasicBlock *B) const;
+  void visit(const MachineInstr &MI);
+
+  void print_cells(raw_ostream &OS) const;
+
+private:
+  void visitPHI(const MachineInstr &PI);
+  void visitNonBranch(const MachineInstr &MI);
+  void visitBranchesFrom(const MachineInstr &BI);
+  void visitUsesOf(unsigned Reg);
+  void reset();
+
+  typedef std::pair<int,int> CFGEdge;
+  typedef std::set<CFGEdge> EdgeSetType;
+  typedef std::set<const MachineInstr *> InstrSetType;
+  typedef std::queue<CFGEdge> EdgeQueueType;
+
+  EdgeSetType EdgeExec;       // Executable flow graph edges.
+  InstrSetType InstrExec;     // Executable instructions.
+  EdgeQueueType FlowQ;        // Work queue of CFG edges.
+  bool Trace;                 // Enable tracing for debugging.
+
+  const MachineEvaluator &ME;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CellMapType &Map;
+};
+
+
+// Abstraction of a reference to bit at position Pos from a register Reg.
+struct BitTracker::BitRef {
+  BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+  bool operator== (const BitRef &BR) const {
+    // If Reg is 0, disregard Pos.
+    return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
+  }
+  unsigned Reg;
+  uint16_t Pos;
+};
+
+
+// Abstraction of a register reference in MachineOperand.  It contains the
+// register number and the subregister index.
+struct BitTracker::RegisterRef {
+  RegisterRef(unsigned R = 0, unsigned S = 0)
+    : Reg(R), Sub(S) {}
+  RegisterRef(const MachineOperand &MO)
+      : Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+  unsigned Reg, Sub;
+};
+
+
+// Value that a single bit can take.  This is outside of the context of
+// any register, it is more of an abstraction of the two-element set of
+// possible bit values.  One extension here is the "Ref" type, which
+// indicates that this bit takes the same value as the bit described by
+// RefInfo.
+struct BitTracker::BitValue {
+  enum ValueType {
+    Top,    // Bit not yet defined.
+    Zero,   // Bit = 0.
+    One,    // Bit = 1.
+    Ref     // Bit value same as the one described in RefI.
+    // Conceptually, there is no explicit "bottom" value: the lattice's
+    // bottom will be expressed as a "ref to itself", which, in the context
+    // of registers, could be read as "this value of this bit is defined by
+    // this bit".
+    // The ordering is:
+    //   x <= Top,
+    //   Self <= x, where "Self" is "ref to itself".
+    // This makes the value lattice different for each virtual register
+    // (even for each bit in the same virtual register), since the "bottom"
+    // for one register will be a simple "ref" for another register.
+    // Since we do not store the "Self" bit and register number, the meet
+    // operation will need to take it as a parameter.
+    //
+    // In practice there is a special case for values that are not associa-
+    // ted with any specific virtual register. An example would be a value
+    // corresponding to a bit of a physical register, or an intermediate
+    // value obtained in some computation (such as instruction evaluation).
+    // Such cases are identical to the usual Ref type, but the register
+    // number is 0. In such case the Pos field of the reference is ignored.
+    //
+    // What is worthy of notice is that in value V (that is a "ref"), as long
+    // as the RefI.Reg is not 0, it may actually be the same register as the
+    // one in which V will be contained.  If the RefI.Pos refers to the posi-
+    // tion of V, then V is assumed to be "bottom" (as a "ref to itself"),
+    // otherwise V is taken to be identical to the referenced bit of the
+    // same register.
+    // If RefI.Reg is 0, however, such a reference to the same register is
+    // not possible.  Any value V that is a "ref", and whose RefI.Reg is 0
+    // is treated as "bottom".
+  };
+  ValueType Type;
+  BitRef RefI;
+
+  BitValue(ValueType T = Top) : Type(T) {}
+  BitValue(bool B) : Type(B ? One : Zero) {}
+  BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {}
+
+  bool operator== (const BitValue &V) const {
+    if (Type != V.Type)
+      return false;
+    if (Type == Ref && !(RefI == V.RefI))
+      return false;
+    return true;
+  }
+  bool operator!= (const BitValue &V) const {
+    return !operator==(V);
+  }
+  bool is(unsigned T) const {
+    assert(T == 0 || T == 1);
+    return T == 0 ? Type == Zero
+                  : (T == 1 ? Type == One : false);
+  }
+
+  // The "meet" operation is the "." operation in a semilattice (L, ., T, B):
+  // (1)  x.x = x
+  // (2)  x.y = y.x
+  // (3)  x.(y.z) = (x.y).z
+  // (4)  x.T = x  (i.e. T = "top")
+  // (5)  x.B = B  (i.e. B = "bottom")
+  //
+  // This "meet" function will update the value of the "*this" object with
+  // the newly calculated one, and return "true" if the value of *this has
+  // changed, and "false" otherwise.
+  // To prove that it satisfies the conditions (1)-(5), it is sufficient
+  // to show that a relation
+  //   x <= y  <=>  x.y = x
+  // defines a partial order (i.e. that "meet" is same as "infimum").
+  bool meet(const BitValue &V, const BitRef &Self) {
+    // First, check the cases where there is nothing to be done.
+    if (Type == Ref && RefI == Self)    // Bottom.meet(V) = Bottom (i.e. This)
+      return false;
+    if (V.Type == Top)                  // This.meet(Top) = This
+      return false;
+    if (*this == V)                     // This.meet(This) = This
+      return false;
+
+    // At this point, we know that the value of "this" will change.
+    // If it is Top, it will become the same as V, otherwise it will
+    // become "bottom" (i.e. Self).
+    if (Type == Top) {
+      Type = V.Type;
+      RefI = V.RefI;  // This may be irrelevant, but copy anyway.
+      return true;
+    }
+    // Become "bottom".
+    Type = Ref;
+    RefI = Self;
+    return true;
+  }
+
+  // Create a reference to the bit value V.
+  static BitValue ref(const BitValue &V);
+  // Create a "self".
+  static BitValue self(const BitRef &Self = BitRef());
+
+  bool num() const {
+    return Type == Zero || Type == One;
+  }
+  operator bool() const {
+    assert(Type == Zero || Type == One);
+    return Type == One;
+  }
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV);
+};
+
+
+// This operation must be idempotent, i.e. ref(ref(V)) == ref(V).
+inline BitTracker::BitValue
+BitTracker::BitValue::ref(const BitValue &V) {
+  if (V.Type != Ref)
+    return BitValue(V.Type);
+  if (V.RefI.Reg != 0)
+    return BitValue(V.RefI.Reg, V.RefI.Pos);
+  return self();
+}
+
+
+inline BitTracker::BitValue
+BitTracker::BitValue::self(const BitRef &Self) {
+  return BitValue(Self.Reg, Self.Pos);
+}
+
+
+// A sequence of bits starting from index B up to and including index E.
+// If E < B, the mask represents two sections: [0..E] and [B..W) where
+// W is the width of the register.
+struct BitTracker::BitMask {
+  BitMask() : B(0), E(0) {}
+  BitMask(uint16_t b, uint16_t e) : B(b), E(e) {}
+  uint16_t first() const { return B; }
+  uint16_t last() const { return E; }
+private:
+  uint16_t B, E;
+};
+
+
+// Representation of a register: a list of BitValues.
+struct BitTracker::RegisterCell {
+  RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {}
+
+  uint16_t width() const {
+    return Bits.size();
+  }
+  const BitValue &operator[](uint16_t BitN) const {
+    assert(BitN < Bits.size());
+    return Bits[BitN];
+  }
+  BitValue &operator[](uint16_t BitN) {
+    assert(BitN < Bits.size());
+    return Bits[BitN];
+  }
+
+  bool meet(const RegisterCell &RC, unsigned SelfR);
+  RegisterCell &insert(const RegisterCell &RC, const BitMask &M);
+  RegisterCell extract(const BitMask &M) const;  // Returns a new cell.
+  RegisterCell &rol(uint16_t Sh);    // Rotate left.
+  RegisterCell &fill(uint16_t B, uint16_t E, const BitValue &V);
+  RegisterCell &cat(const RegisterCell &RC);  // Concatenate.
+  uint16_t cl(bool B) const;
+  uint16_t ct(bool B) const;
+
+  bool operator== (const RegisterCell &RC) const;
+  bool operator!= (const RegisterCell &RC) const {
+    return !operator==(RC);
+  }
+
+  // Generate a "ref" cell for the corresponding register. In the resulting
+  // cell each bit will be described as being the same as the corresponding
+  // bit in register Reg (i.e. the cell is "defined" by register Reg).
+  static RegisterCell self(unsigned Reg, uint16_t Width);
+  // Generate a "top" cell of given size.
+  static RegisterCell top(uint16_t Width);
+  // Generate a cell that is a "ref" to another cell.
+  static RegisterCell ref(const RegisterCell &C);
+
+private:
+  // The DefaultBitN is here only to avoid frequent reallocation of the
+  // memory in the vector.
+  static const unsigned DefaultBitN = 32;
+  typedef SmallVector<BitValue, DefaultBitN> BitValueList;
+  BitValueList Bits;
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC);
+};
+
+
+inline bool BitTracker::has(unsigned Reg) const {
+  return Map.find(Reg) != Map.end();
+}
+
+
+inline const BitTracker::RegisterCell&
+BitTracker::lookup(unsigned Reg) const {
+  CellMapType::const_iterator F = Map.find(Reg);
+  assert(F != Map.end());
+  return F->second;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
+  RegisterCell RC(Width);
+  for (uint16_t i = 0; i < Width; ++i)
+    RC.Bits[i] = BitValue::self(BitRef(Reg, i));
+  return RC;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::top(uint16_t Width) {
+  RegisterCell RC(Width);
+  for (uint16_t i = 0; i < Width; ++i)
+    RC.Bits[i] = BitValue(BitValue::Top);
+  return RC;
+}
+
+
+inline BitTracker::RegisterCell
+BitTracker::RegisterCell::ref(const RegisterCell &C) {
+  uint16_t W = C.width();
+  RegisterCell RC(W);
+  for (unsigned i = 0; i < W; ++i)
+    RC[i] = BitValue::ref(C[i]);
+  return RC;
+}
+
+// A class to evaluate target's instructions and update the cell maps.
+// This is used internally by the bit tracker.  A target that wants to
+// utilize this should implement the evaluation functions (noted below)
+// in a subclass of this class.
+struct BitTracker::MachineEvaluator {
+  MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M)
+      : TRI(T), MRI(M) {}
+  virtual ~MachineEvaluator() {}
+
+  uint16_t getRegBitWidth(const RegisterRef &RR) const;
+
+  RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const;
+  void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const;
+  // A result of any operation should use refs to the source cells, not
+  // the cells directly. This function is a convenience wrapper to quickly
+  // generate a ref for a cell corresponding to a register reference.
+  RegisterCell getRef(const RegisterRef &RR, const CellMapType &M) const {
+    RegisterCell RC = getCell(RR, M);
+    return RegisterCell::ref(RC);
+  }
+
+  // Helper functions.
+  // Check if a cell is an immediate value (i.e. all bits are either 0 or 1).
+  bool isInt(const RegisterCell &A) const;
+  // Convert cell to an immediate value.
+  uint64_t toInt(const RegisterCell &A) const;
+
+  // Generate cell from an immediate value.
+  RegisterCell eIMM(int64_t V, uint16_t W) const;
+  RegisterCell eIMM(const ConstantInt *CI) const;
+
+  // Arithmetic.
+  RegisterCell eADD(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eSUB(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eMLS(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eMLU(const RegisterCell &A1, const RegisterCell &A2) const;
+
+  // Shifts.
+  RegisterCell eASL(const RegisterCell &A1, uint16_t Sh) const;
+  RegisterCell eLSR(const RegisterCell &A1, uint16_t Sh) const;
+  RegisterCell eASR(const RegisterCell &A1, uint16_t Sh) const;
+
+  // Logical.
+  RegisterCell eAND(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eORL(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eXOR(const RegisterCell &A1, const RegisterCell &A2) const;
+  RegisterCell eNOT(const RegisterCell &A1) const;
+
+  // Set bit, clear bit.
+  RegisterCell eSET(const RegisterCell &A1, uint16_t BitN) const;
+  RegisterCell eCLR(const RegisterCell &A1, uint16_t BitN) const;
+
+  // Count leading/trailing bits (zeros/ones).
+  RegisterCell eCLB(const RegisterCell &A1, bool B, uint16_t W) const;
+  RegisterCell eCTB(const RegisterCell &A1, bool B, uint16_t W) const;
+
+  // Sign/zero extension.
+  RegisterCell eSXT(const RegisterCell &A1, uint16_t FromN) const;
+  RegisterCell eZXT(const RegisterCell &A1, uint16_t FromN) const;
+
+  // Extract/insert
+  // XTR R,b,e:  extract bits from A1 starting at bit b, ending at e-1.
+  // INS R,S,b:  take R and replace bits starting from b with S.
+  RegisterCell eXTR(const RegisterCell &A1, uint16_t B, uint16_t E) const;
+  RegisterCell eINS(const RegisterCell &A1, const RegisterCell &A2,
+                    uint16_t AtN) const;
+
+  // User-provided functions for individual targets:
+
+  // Return a sub-register mask that indicates which bits in Reg belong
+  // to the subregister Sub. These bits are assumed to be contiguous in
+  // the super-register, and have the same ordering in the sub-register
+  // as in the super-register. It is valid to call this function with
+  // Sub == 0, in this case, the function should return a mask that spans
+  // the entire register Reg (which is what the default implementation
+  // does).
+  virtual BitMask mask(unsigned Reg, unsigned Sub) const;
+  // Indicate whether a given register class should be tracked.
+  virtual bool track(const TargetRegisterClass *RC) const { return true; }
+  // Evaluate a non-branching machine instruction, given the cell map with
+  // the input values. Place the results in the Outputs map. Return "true"
+  // if evaluation succeeded, "false" otherwise.
+  virtual bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
+                        CellMapType &Outputs) const;
+  // Evaluate a branch, given the cell map with the input values. Fill out
+  // a list of all possible branch targets and indicate (through a flag)
+  // whether the branch could fall-through. Return "true" if this information
+  // has been successfully computed, "false" otherwise.
+  virtual bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
+                        BranchTargetList &Targets, bool &FallsThru) const = 0;
+
+  const TargetRegisterInfo &TRI;
+  MachineRegisterInfo &MRI;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
new file mode 100644
index 000000000000..c05fbc1d7756
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -0,0 +1,1624 @@
+//===-- HexagonDisassembler.cpp - Disassembler for Hexagon ISA ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-disassembler"
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+using namespace llvm;
+using namespace Hexagon;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// \brief Hexagon disassembler for all Hexagon platforms.
+class HexagonDisassembler : public MCDisassembler {
+public:
+  std::unique_ptr<MCInstrInfo const> const MCII;
+  std::unique_ptr<MCInst *> CurrentBundle;
+
+  HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                      MCInstrInfo const *MCII)
+      : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {}
+
+  DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
+                                    ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                    raw_ostream &VStream, raw_ostream &CStream,
+                                    bool &Complete) const;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+  void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
+  void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the auto-generated code will reference them.
+// Definitions are further down.
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+                                 void const *Decoder);
+
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+                                 raw_ostream &os);
+
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
+
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+                                       uint64_t Address, const void *Decoder);
+static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder);
+static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder);
+static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder);
+static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder);
+static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                 const void *Decoder);
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                   const void *Decoder);
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder);
+
+#include "HexagonGenDisassemblerTables.inc"
+
+static MCDisassembler *createHexagonDisassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo());
+}
+
+extern "C" void LLVMInitializeHexagonDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheHexagonTarget(),
+                                         createHexagonDisassembler);
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  DecodeStatus Result = DecodeStatus::Success;
+  bool Complete = false;
+  Size = 0;
+
+  *CurrentBundle = &MI;
+  MI = HexagonMCInstrInfo::createBundle();
+  while (Result == Success && !Complete) {
+    if (Bytes.size() < HEXAGON_INSTR_SIZE)
+      return MCDisassembler::Fail;
+    MCInst *Inst = new (getContext()) MCInst;
+    Result = getSingleInstruction(*Inst, MI, Bytes, Address, os, cs, Complete);
+    MI.addOperand(MCOperand::createInst(Inst));
+    Size += HEXAGON_INSTR_SIZE;
+    Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
+  }
+  if(Result == MCDisassembler::Fail)
+    return Result;
+  HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
+  if(!Checker.check())
+    return MCDisassembler::Fail;
+  return MCDisassembler::Success;
+}
+
+static HexagonDisassembler const &disassembler(void const *Decoder) {
+  return *static_cast<HexagonDisassembler const *>(Decoder);
+}
+
+static MCContext &contextFromDecoder(void const *Decoder) {
+  return disassembler(Decoder).getContext();
+}
+
+DecodeStatus HexagonDisassembler::getSingleInstruction(
+    MCInst &MI, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &os, raw_ostream &cs, bool &Complete) const {
+  assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
+
+  uint32_t Instruction =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+
+  auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
+  if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_LOOP_END) {
+    if (BundleSize == 0)
+      HexagonMCInstrInfo::setInnerLoop(MCB);
+    else if (BundleSize == 1)
+      HexagonMCInstrInfo::setOuterLoop(MCB);
+    else
+      return DecodeStatus::Fail;
+  }
+
+  DecodeStatus Result = DecodeStatus::Success;
+  if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_DUPLEX) {
+    // Determine the instruction class of each instruction in the duplex.
+    unsigned duplexIClass, IClassLow, IClassHigh;
+
+    duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
+    switch (duplexIClass) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      IClassLow = HexagonII::HSIG_L1;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 1:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 2:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 3:
+      IClassLow = HexagonII::HSIG_A;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 4:
+      IClassLow = HexagonII::HSIG_L1;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 5:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 6:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 7:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 8:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 9:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 10:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_S1;
+      break;
+    case 11:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_S1;
+      break;
+    case 12:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 13:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 14:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_S2;
+      break;
+    }
+
+    // Set the MCInst to be a duplex instruction. Which one doesn't matter.
+    MI.setOpcode(Hexagon::DuplexIClass0);
+
+    // Decode each instruction in the duplex.
+    // Create an MCInst for each instruction.
+    unsigned instLow = Instruction & 0x1fff;
+    unsigned instHigh = (Instruction >> 16) & 0x1fff;
+    unsigned opLow;
+    if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
+        MCDisassembler::Success)
+      return MCDisassembler::Fail;
+    unsigned opHigh;
+    if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
+        MCDisassembler::Success)
+      return MCDisassembler::Fail;
+    MCInst *MILow = new (getContext()) MCInst;
+    MILow->setOpcode(opLow);
+    MCInst *MIHigh = new (getContext()) MCInst;
+    MIHigh->setOpcode(opHigh);
+    addSubinstOperands(MILow, opLow, instLow);
+    addSubinstOperands(MIHigh, opHigh, instHigh);
+    // see ConvertToSubInst() in
+    // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+
+    // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+    MCOperand OPLow = MCOperand::createInst(MILow);
+    MCOperand OPHigh = MCOperand::createInst(MIHigh);
+    MI.addOperand(OPLow);
+    MI.addOperand(OPHigh);
+    Complete = true;
+  } else {
+    if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+        HexagonII::INST_PARSE_PACKET_END)
+      Complete = true;
+    // Calling the auto-generated decoder function.
+    Result =
+        decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
+
+    // If a, "standard" insn isn't found check special cases.
+    if (MCDisassembler::Success != Result ||
+        MI.getOpcode() == Hexagon::A4_ext) {
+      Result = decodeImmext(MI, Instruction, this);
+      if (MCDisassembler::Success != Result) {
+        Result = decodeSpecial(MI, Instruction);
+      }
+    } else {
+      // If the instruction is a compound instruction, register values will
+      // follow the duplex model, so the register values in the MCInst are
+      // incorrect. If the instruction is a compound, loop through the
+      // operands and change registers appropriately.
+      if (HexagonMCInstrInfo::getType(*MCII, MI) == HexagonII::TypeCOMPOUND) {
+        for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
+          if (i->isReg()) {
+            unsigned reg = i->getReg() - Hexagon::R0;
+            i->setReg(getRegFromSubinstEncoding(reg));
+          }
+        }
+      }
+    }
+  }
+
+  switch(MI.getOpcode()) {
+  case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
+  case Hexagon::J4_cmpeqn1_f_jumpnv_t:
+  case Hexagon::J4_cmpeqn1_fp0_jump_nt:
+  case Hexagon::J4_cmpeqn1_fp0_jump_t:
+  case Hexagon::J4_cmpeqn1_fp1_jump_nt:
+  case Hexagon::J4_cmpeqn1_fp1_jump_t:
+  case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
+  case Hexagon::J4_cmpeqn1_t_jumpnv_t:
+  case Hexagon::J4_cmpeqn1_tp0_jump_nt:
+  case Hexagon::J4_cmpeqn1_tp0_jump_t:
+  case Hexagon::J4_cmpeqn1_tp1_jump_nt:
+  case Hexagon::J4_cmpeqn1_tp1_jump_t:
+  case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
+  case Hexagon::J4_cmpgtn1_f_jumpnv_t:
+  case Hexagon::J4_cmpgtn1_fp0_jump_nt:
+  case Hexagon::J4_cmpgtn1_fp0_jump_t:
+  case Hexagon::J4_cmpgtn1_fp1_jump_nt:
+  case Hexagon::J4_cmpgtn1_fp1_jump_t:
+  case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
+  case Hexagon::J4_cmpgtn1_t_jumpnv_t:
+  case Hexagon::J4_cmpgtn1_tp0_jump_nt:
+  case Hexagon::J4_cmpgtn1_tp0_jump_t:
+  case Hexagon::J4_cmpgtn1_tp1_jump_nt:
+  case Hexagon::J4_cmpgtn1_tp1_jump_t:
+    MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
+    break;
+  default:
+    break;
+  }
+
+  if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) {
+    unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI);
+    MCOperand &MCO = MI.getOperand(OpIndex);
+    assert(MCO.isReg() && "New value consumers must be registers");
+    unsigned Register =
+        getContext().getRegisterInfo()->getEncodingValue(MCO.getReg());
+    if ((Register & 0x6) == 0)
+      // HexagonPRM 10.11 Bit 1-2 == 0 is reserved
+      return MCDisassembler::Fail;
+    unsigned Lookback = (Register & 0x6) >> 1;
+    unsigned Offset = 1;
+    bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto i = Instructions.end() - 1;
+    for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
+      if (i == n)
+        // Couldn't find producer
+        return MCDisassembler::Fail;
+      if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst()))
+        // Skip scalars when calculating distances for vectors
+        ++Lookback;
+      if (HexagonMCInstrInfo::isImmext(*i->getInst()))
+        ++Lookback;
+      if (Offset == Lookback)
+        break;
+    }
+    auto const &Inst = *i->getInst();
+    bool SubregBit = (Register & 0x1) != 0;
+    if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) {
+      // If subreg bit is set we're selecting the second produced newvalue
+      unsigned Producer =
+          HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg();
+      assert(Producer != Hexagon::NoRegister);
+      MCO.setReg(Producer);
+    } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) {
+      unsigned Producer =
+          HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg();
+      if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+        Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0;
+      else if (SubregBit)
+        // Hexagon PRM 10.11 New-value operands
+        // Nt[0] is reserved and should always be encoded as zero.
+        return MCDisassembler::Fail;
+      assert(Producer != Hexagon::NoRegister);
+      MCO.setReg(Producer);
+    } else
+      return MCDisassembler::Fail;
+  }
+
+  adjustExtendedInstructions(MI, MCB);
+  MCInst const *Extender =
+    HexagonMCInstrInfo::extenderForIndex(MCB,
+                                         HexagonMCInstrInfo::bundleSize(MCB));
+  if(Extender != nullptr) {
+    MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
+                          *MI.getOperand(1).getInst() : MI;
+    if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
+        !HexagonMCInstrInfo::isExtended(*MCII, Inst))
+      return MCDisassembler::Fail;
+  }
+  return Result;
+}
+
+void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
+                                                     MCInst const &MCB) const {
+  if (!HexagonMCInstrInfo::hasExtenderForIndex(
+          MCB, HexagonMCInstrInfo::bundleSize(MCB))) {
+    unsigned opcode;
+    // This code is used by the disassembler to disambiguate between GP
+    // relative and absolute addressing instructions since they both have
+    // same encoding bits. However, an absolute addressing instruction must
+    // follow an immediate extender. Disassembler alwaus select absolute
+    // addressing instructions first and uses this code to change them into
+    // GP relative instruction in the absence of the corresponding immediate
+    // extender.
+    switch (MCI.getOpcode()) {
+    case Hexagon::PS_storerbabs:
+      opcode = Hexagon::S2_storerbgp;
+      break;
+    case Hexagon::PS_storerhabs:
+      opcode = Hexagon::S2_storerhgp;
+      break;
+    case Hexagon::PS_storerfabs:
+      opcode = Hexagon::S2_storerfgp;
+      break;
+    case Hexagon::PS_storeriabs:
+      opcode = Hexagon::S2_storerigp;
+      break;
+    case Hexagon::PS_storerbnewabs:
+      opcode = Hexagon::S2_storerbnewgp;
+      break;
+    case Hexagon::PS_storerhnewabs:
+      opcode = Hexagon::S2_storerhnewgp;
+      break;
+    case Hexagon::PS_storerinewabs:
+      opcode = Hexagon::S2_storerinewgp;
+      break;
+    case Hexagon::PS_storerdabs:
+      opcode = Hexagon::S2_storerdgp;
+      break;
+    case Hexagon::PS_loadrbabs:
+      opcode = Hexagon::L2_loadrbgp;
+      break;
+    case Hexagon::PS_loadrubabs:
+      opcode = Hexagon::L2_loadrubgp;
+      break;
+    case Hexagon::PS_loadrhabs:
+      opcode = Hexagon::L2_loadrhgp;
+      break;
+    case Hexagon::PS_loadruhabs:
+      opcode = Hexagon::L2_loadruhgp;
+      break;
+    case Hexagon::PS_loadriabs:
+      opcode = Hexagon::L2_loadrigp;
+      break;
+    case Hexagon::PS_loadrdabs:
+      opcode = Hexagon::L2_loadrdgp;
+      break;
+    default:
+      opcode = MCI.getOpcode();
+    }
+    MCI.setOpcode(opcode);
+  }
+}
+
+static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
+                                        ArrayRef<MCPhysReg> Table) {
+  if (RegNo < Table.size()) {
+    Inst.addOperand(MCOperand::createReg(Table[RegNo]));
+    return MCDisassembler::Success;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  static const MCPhysReg IntRegDecoderTable[] = {
+      Hexagon::R0,  Hexagon::R1,  Hexagon::R2,  Hexagon::R3,  Hexagon::R4,
+      Hexagon::R5,  Hexagon::R6,  Hexagon::R7,  Hexagon::R8,  Hexagon::R9,
+      Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
+      Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+      Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
+      Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
+      Hexagon::R30, Hexagon::R31};
+
+  return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable);
+}
+
+static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg VecRegDecoderTable[] = {
+      Hexagon::V0,  Hexagon::V1,  Hexagon::V2,  Hexagon::V3,  Hexagon::V4,
+      Hexagon::V5,  Hexagon::V6,  Hexagon::V7,  Hexagon::V8,  Hexagon::V9,
+      Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
+      Hexagon::V15, Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19,
+      Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23, Hexagon::V24,
+      Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29,
+      Hexagon::V30, Hexagon::V31};
+
+  return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable);
+}
+
+static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg DoubleRegDecoderTable[] = {
+      Hexagon::D0,  Hexagon::D1,  Hexagon::D2,  Hexagon::D3,
+      Hexagon::D4,  Hexagon::D5,  Hexagon::D6,  Hexagon::D7,
+      Hexagon::D8,  Hexagon::D9,  Hexagon::D10, Hexagon::D11,
+      Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15};
+
+  return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable);
+}
+
+static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
+  static const MCPhysReg VecDblRegDecoderTable[] = {
+      Hexagon::W0,  Hexagon::W1,  Hexagon::W2,  Hexagon::W3,
+      Hexagon::W4,  Hexagon::W5,  Hexagon::W6,  Hexagon::W7,
+      Hexagon::W8,  Hexagon::W9,  Hexagon::W10, Hexagon::W11,
+      Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15};
+
+  return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable));
+}
+
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                uint64_t /*Address*/,
+                                                const void *Decoder) {
+  static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1,
+                                                  Hexagon::P2, Hexagon::P3};
+
+  return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable);
+}
+
+static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                   uint64_t /*Address*/,
+                                                   const void *Decoder) {
+  static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1,
+                                                     Hexagon::Q2, Hexagon::Q3};
+
+  return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable);
+}
+
+static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
+  static const MCPhysReg CtrlRegDecoderTable[] = {
+    Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
+    Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7,
+    Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
+    Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
+  };
+
+  if (RegNo >= array_lengthof(CtrlRegDecoderTable))
+    return MCDisassembler::Fail;
+
+  if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  unsigned Register = CtrlRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t /*Address*/,
+                                                 const void *Decoder) {
+  static const MCPhysReg CtrlReg64DecoderTable[] = {
+      Hexagon::C1_0,   Hexagon::NoRegister,
+      Hexagon::C3_2,   Hexagon::NoRegister,
+      Hexagon::C7_6,   Hexagon::NoRegister,
+      Hexagon::C9_8,   Hexagon::NoRegister,
+      Hexagon::C11_10, Hexagon::NoRegister,
+      Hexagon::CS,     Hexagon::NoRegister,
+      Hexagon::UPC,    Hexagon::NoRegister
+  };
+
+  if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
+    return MCDisassembler::Fail;
+
+  if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+    return MCDisassembler::Fail;
+
+  unsigned Register = CtrlReg64DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+  case 0:
+    Register = Hexagon::M0;
+    break;
+  case 1:
+    Register = Hexagon::M1;
+    break;
+  default:
+    return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::createReg(Register));
+  return MCDisassembler::Success;
+}
+
+static uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
+                          int64_t Value) {
+  MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+    MCB, HexagonMCInstrInfo::bundleSize(MCB));
+  if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+    return Value;
+  unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+  uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
+  int64_t Bits;
+  bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
+  assert(Success);(void)Success;
+  uint32_t Upper26 = static_cast<uint32_t>(Bits);
+  uint32_t Operand = Upper26 | Lower6;
+  return Operand;
+}
+
+template <size_t T>
+static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+  HexagonDisassembler const &Disassembler = disassembler(Decoder);
+  int64_t FullValue = fullValue(*Disassembler.MCII,
+                                **Disassembler.CurrentBundle,
+                                MI, SignExtend64<T>(tmp));
+  int64_t Extended = SignExtend64<32>(FullValue);
+  HexagonMCInstrInfo::addConstant(MI, Extended,
+                                  Disassembler.getContext());
+}
+
+static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
+                                       uint64_t /*Address*/,
+                                       const void *Decoder) {
+  HexagonDisassembler const &Disassembler = disassembler(Decoder);
+  int64_t FullValue = fullValue(*Disassembler.MCII,
+                                **Disassembler.CurrentBundle,
+                                MI, tmp);
+  assert(FullValue >= 0 && "Negative in unsigned decoder");
+  HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext());
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                  uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<16>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                  uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<12>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                    uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<11>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
+                                    uint64_t /*Address*/, const void *Decoder) {
+  HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
+                                    uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<13>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
+                                    uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<14>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                  uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<10>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
+                                 const void *Decoder) {
+  signedDecoder<8>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<4>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<5>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<7>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<10>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
+                                   uint64_t /*Address*/, const void *Decoder) {
+  signedDecoder<19>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+
+// custom decoder for various jump/call immediates
+static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+                                    const void *Decoder) {
+  HexagonDisassembler const &Disassembler = disassembler(Decoder);
+  unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI);
+  // r13_2 is not extendable, so if there are no extent bits, it's r13_2
+  if (Bits == 0)
+    Bits = 15;
+  uint32_t FullValue = fullValue(*Disassembler.MCII,
+                                **Disassembler.CurrentBundle,
+                                MI, SignExtend64(tmp, Bits));
+  int64_t Extended = SignExtend64<32>(FullValue) + Address;
+  if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true,
+                                              0, 4))
+    HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
+  return MCDisassembler::Success;
+}
+
+// Addressing mode dependent load store opcode map.
+//   - If an insn is preceded by an extender the address is absolute.
+//      - memw(##symbol) = r0
+//   - If an insn is not preceded by an extender the address is GP relative.
+//      - memw(gp + #symbol) = r0
+// Please note that the instructions must be ordered in the descending order
+// of their opcode.
+// HexagonII::INST_ICLASS_ST
+static const unsigned int StoreConditionalOpcodeData[][2] = {
+    {S4_pstorerdfnew_abs, 0xafc02084},
+    {S4_pstorerdtnew_abs, 0xafc02080},
+    {S4_pstorerdf_abs, 0xafc00084},
+    {S4_pstorerdt_abs, 0xafc00080},
+    {S4_pstorerinewfnew_abs, 0xafa03084},
+    {S4_pstorerinewtnew_abs, 0xafa03080},
+    {S4_pstorerhnewfnew_abs, 0xafa02884},
+    {S4_pstorerhnewtnew_abs, 0xafa02880},
+    {S4_pstorerbnewfnew_abs, 0xafa02084},
+    {S4_pstorerbnewtnew_abs, 0xafa02080},
+    {S4_pstorerinewf_abs, 0xafa01084},
+    {S4_pstorerinewt_abs, 0xafa01080},
+    {S4_pstorerhnewf_abs, 0xafa00884},
+    {S4_pstorerhnewt_abs, 0xafa00880},
+    {S4_pstorerbnewf_abs, 0xafa00084},
+    {S4_pstorerbnewt_abs, 0xafa00080},
+    {S4_pstorerifnew_abs, 0xaf802084},
+    {S4_pstoreritnew_abs, 0xaf802080},
+    {S4_pstorerif_abs, 0xaf800084},
+    {S4_pstorerit_abs, 0xaf800080},
+    {S4_pstorerhfnew_abs, 0xaf402084},
+    {S4_pstorerhtnew_abs, 0xaf402080},
+    {S4_pstorerhf_abs, 0xaf400084},
+    {S4_pstorerht_abs, 0xaf400080},
+    {S4_pstorerbfnew_abs, 0xaf002084},
+    {S4_pstorerbtnew_abs, 0xaf002080},
+    {S4_pstorerbf_abs, 0xaf000084},
+    {S4_pstorerbt_abs, 0xaf000080}};
+// HexagonII::INST_ICLASS_LD
+
+// HexagonII::INST_ICLASS_LD_ST_2
+static unsigned int LoadStoreOpcodeData[][2] = {{PS_loadrdabs, 0x49c00000},
+                                                {PS_loadriabs, 0x49800000},
+                                                {PS_loadruhabs, 0x49600000},
+                                                {PS_loadrhabs, 0x49400000},
+                                                {PS_loadrubabs, 0x49200000},
+                                                {PS_loadrbabs, 0x49000000},
+                                                {PS_storerdabs, 0x48c00000},
+                                                {PS_storerinewabs, 0x48a01000},
+                                                {PS_storerhnewabs, 0x48a00800},
+                                                {PS_storerbnewabs, 0x48a00000},
+                                                {PS_storeriabs, 0x48800000},
+                                                {PS_storerfabs, 0x48600000},
+                                                {PS_storerhabs, 0x48400000},
+                                                {PS_storerbabs, 0x48000000}};
+static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
+static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
+
+static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
+  unsigned MachineOpcode = 0;
+  unsigned LLVMOpcode = 0;
+
+  if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
+    for (size_t i = 0; i < NumCondS; ++i) {
+      if ((insn & StoreConditionalOpcodeData[i][1]) ==
+          StoreConditionalOpcodeData[i][1]) {
+        MachineOpcode = StoreConditionalOpcodeData[i][1];
+        LLVMOpcode = StoreConditionalOpcodeData[i][0];
+        break;
+      }
+    }
+  }
+  if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
+    for (size_t i = 0; i < NumLS; ++i) {
+      if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
+        MachineOpcode = LoadStoreOpcodeData[i][1];
+        LLVMOpcode = LoadStoreOpcodeData[i][0];
+        break;
+      }
+    }
+  }
+
+  if (MachineOpcode) {
+    unsigned Value = 0;
+    unsigned shift = 0;
+    MI.setOpcode(LLVMOpcode);
+    // Remove the parse bits from the insn.
+    insn &= ~HexagonII::INST_PARSE_MASK;
+
+    switch (LLVMOpcode) {
+    default:
+      return MCDisassembler::Fail;
+      break;
+
+    case Hexagon::S4_pstorerdf_abs:
+    case Hexagon::S4_pstorerdt_abs:
+    case Hexagon::S4_pstorerdfnew_abs:
+    case Hexagon::S4_pstorerdtnew_abs:
+      // op: Pv
+      Value = insn & UINT64_C(3);
+      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: u6
+      Value = (insn >> 12) & UINT64_C(48);
+      Value |= (insn >> 3) & UINT64_C(15);
+      MI.addOperand(MCOperand::createImm(Value));
+      // op: Rtt
+      Value = (insn >> 8) & UINT64_C(31);
+      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+
+    case Hexagon::S4_pstorerbnewf_abs:
+    case Hexagon::S4_pstorerbnewt_abs:
+    case Hexagon::S4_pstorerbnewfnew_abs:
+    case Hexagon::S4_pstorerbnewtnew_abs:
+    case Hexagon::S4_pstorerhnewf_abs:
+    case Hexagon::S4_pstorerhnewt_abs:
+    case Hexagon::S4_pstorerhnewfnew_abs:
+    case Hexagon::S4_pstorerhnewtnew_abs:
+    case Hexagon::S4_pstorerinewf_abs:
+    case Hexagon::S4_pstorerinewt_abs:
+    case Hexagon::S4_pstorerinewfnew_abs:
+    case Hexagon::S4_pstorerinewtnew_abs:
+      // op: Pv
+      Value = insn & UINT64_C(3);
+      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: u6
+      Value = (insn >> 12) & UINT64_C(48);
+      Value |= (insn >> 3) & UINT64_C(15);
+      MI.addOperand(MCOperand::createImm(Value));
+      // op: Nt
+      Value = (insn >> 8) & UINT64_C(7);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+
+    case Hexagon::S4_pstorerbf_abs:
+    case Hexagon::S4_pstorerbt_abs:
+    case Hexagon::S4_pstorerbfnew_abs:
+    case Hexagon::S4_pstorerbtnew_abs:
+    case Hexagon::S4_pstorerhf_abs:
+    case Hexagon::S4_pstorerht_abs:
+    case Hexagon::S4_pstorerhfnew_abs:
+    case Hexagon::S4_pstorerhtnew_abs:
+    case Hexagon::S4_pstorerif_abs:
+    case Hexagon::S4_pstorerit_abs:
+    case Hexagon::S4_pstorerifnew_abs:
+    case Hexagon::S4_pstoreritnew_abs:
+      // op: Pv
+      Value = insn & UINT64_C(3);
+      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: u6
+      Value = (insn >> 12) & UINT64_C(48);
+      Value |= (insn >> 3) & UINT64_C(15);
+      MI.addOperand(MCOperand::createImm(Value));
+      // op: Rt
+      Value = (insn >> 8) & UINT64_C(31);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+
+    case Hexagon::L4_ploadrdf_abs:
+    case Hexagon::L4_ploadrdt_abs:
+    case Hexagon::L4_ploadrdfnew_abs:
+    case Hexagon::L4_ploadrdtnew_abs:
+      // op: Rdd
+      Value = insn & UINT64_C(31);
+      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: Pt
+      Value = ((insn >> 9) & UINT64_C(3));
+      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: u6
+      Value = ((insn >> 15) & UINT64_C(62));
+      Value |= ((insn >> 8) & UINT64_C(1));
+      MI.addOperand(MCOperand::createImm(Value));
+      break;
+
+    case Hexagon::L4_ploadrbf_abs:
+    case Hexagon::L4_ploadrbt_abs:
+    case Hexagon::L4_ploadrbfnew_abs:
+    case Hexagon::L4_ploadrbtnew_abs:
+    case Hexagon::L4_ploadrhf_abs:
+    case Hexagon::L4_ploadrht_abs:
+    case Hexagon::L4_ploadrhfnew_abs:
+    case Hexagon::L4_ploadrhtnew_abs:
+    case Hexagon::L4_ploadrubf_abs:
+    case Hexagon::L4_ploadrubt_abs:
+    case Hexagon::L4_ploadrubfnew_abs:
+    case Hexagon::L4_ploadrubtnew_abs:
+    case Hexagon::L4_ploadruhf_abs:
+    case Hexagon::L4_ploadruht_abs:
+    case Hexagon::L4_ploadruhfnew_abs:
+    case Hexagon::L4_ploadruhtnew_abs:
+    case Hexagon::L4_ploadrif_abs:
+    case Hexagon::L4_ploadrit_abs:
+    case Hexagon::L4_ploadrifnew_abs:
+    case Hexagon::L4_ploadritnew_abs:
+      // op: Rd
+      Value = insn & UINT64_C(31);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: Pt
+      Value = (insn >> 9) & UINT64_C(3);
+      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
+      // op: u6
+      Value = (insn >> 15) & UINT64_C(62);
+      Value |= (insn >> 8) & UINT64_C(1);
+      MI.addOperand(MCOperand::createImm(Value));
+      break;
+
+    // op: g16_2
+    case (Hexagon::PS_loadriabs):
+      ++shift;
+    // op: g16_1
+    case Hexagon::PS_loadrhabs:
+    case Hexagon::PS_loadruhabs:
+      ++shift;
+    // op: g16_0
+    case Hexagon::PS_loadrbabs:
+    case Hexagon::PS_loadrubabs:
+      // op: Rd
+      Value |= insn & UINT64_C(31);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      Value = (insn >> 11) & UINT64_C(49152);
+      Value |= (insn >> 7) & UINT64_C(15872);
+      Value |= (insn >> 5) & UINT64_C(511);
+      MI.addOperand(MCOperand::createImm(Value << shift));
+      break;
+
+    case Hexagon::PS_loadrdabs:
+      Value = insn & UINT64_C(31);
+      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+      Value = (insn >> 11) & UINT64_C(49152);
+      Value |= (insn >> 7) & UINT64_C(15872);
+      Value |= (insn >> 5) & UINT64_C(511);
+      MI.addOperand(MCOperand::createImm(Value << 3));
+      break;
+
+    case Hexagon::PS_storerdabs:
+      // op: g16_3
+      Value = (insn >> 11) & UINT64_C(49152);
+      Value |= (insn >> 7) & UINT64_C(15872);
+      Value |= (insn >> 5) & UINT64_C(256);
+      Value |= insn & UINT64_C(255);
+      MI.addOperand(MCOperand::createImm(Value << 3));
+      // op: Rtt
+      Value = (insn >> 8) & UINT64_C(31);
+      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+
+    // op: g16_2
+    case Hexagon::PS_storerinewabs:
+      ++shift;
+    // op: g16_1
+    case Hexagon::PS_storerhnewabs:
+      ++shift;
+    // op: g16_0
+    case Hexagon::PS_storerbnewabs:
+      Value = (insn >> 11) & UINT64_C(49152);
+      Value |= (insn >> 7) & UINT64_C(15872);
+      Value |= (insn >> 5) & UINT64_C(256);
+      Value |= insn & UINT64_C(255);
+      MI.addOperand(MCOperand::createImm(Value << shift));
+      // op: Nt
+      Value = (insn >> 8) & UINT64_C(7);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+
+    // op: g16_2
+    case Hexagon::PS_storeriabs:
+      ++shift;
+    // op: g16_1
+    case Hexagon::PS_storerhabs:
+    case Hexagon::PS_storerfabs:
+      ++shift;
+    // op: g16_0
+    case Hexagon::PS_storerbabs:
+      Value = (insn >> 11) & UINT64_C(49152);
+      Value |= (insn >> 7) & UINT64_C(15872);
+      Value |= (insn >> 5) & UINT64_C(256);
+      Value |= insn & UINT64_C(255);
+      MI.addOperand(MCOperand::createImm(Value << shift));
+      // op: Rt
+      Value = (insn >> 8) & UINT64_C(31);
+      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
+      break;
+    }
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
+                                 void const *Decoder) {
+  // Instruction Class for a constant a extender: bits 31:28 = 0x0000
+  if ((~insn & 0xf0000000) == 0xf0000000) {
+    unsigned Value;
+    // 27:16 High 12 bits of 26-bit extender.
+    Value = (insn & 0x0fff0000) << 4;
+    // 13:0 Low 14 bits of 26-bit extender.
+    Value |= ((insn & 0x3fff) << 6);
+    MI.setOpcode(Hexagon::A4_ext);
+    HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder));
+    return MCDisassembler::Success;
+  }
+  return MCDisassembler::Fail;
+}
+
+// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
+enum subInstBinaryValues {
+  SA1_addi_BITS = 0x0000,
+  SA1_addi_MASK = 0x1800,
+  SA1_addrx_BITS = 0x1800,
+  SA1_addrx_MASK = 0x1f00,
+  SA1_addsp_BITS = 0x0c00,
+  SA1_addsp_MASK = 0x1c00,
+  SA1_and1_BITS = 0x1200,
+  SA1_and1_MASK = 0x1f00,
+  SA1_clrf_BITS = 0x1a70,
+  SA1_clrf_MASK = 0x1e70,
+  SA1_clrfnew_BITS = 0x1a50,
+  SA1_clrfnew_MASK = 0x1e70,
+  SA1_clrt_BITS = 0x1a60,
+  SA1_clrt_MASK = 0x1e70,
+  SA1_clrtnew_BITS = 0x1a40,
+  SA1_clrtnew_MASK = 0x1e70,
+  SA1_cmpeqi_BITS = 0x1900,
+  SA1_cmpeqi_MASK = 0x1f00,
+  SA1_combine0i_BITS = 0x1c00,
+  SA1_combine0i_MASK = 0x1d18,
+  SA1_combine1i_BITS = 0x1c08,
+  SA1_combine1i_MASK = 0x1d18,
+  SA1_combine2i_BITS = 0x1c10,
+  SA1_combine2i_MASK = 0x1d18,
+  SA1_combine3i_BITS = 0x1c18,
+  SA1_combine3i_MASK = 0x1d18,
+  SA1_combinerz_BITS = 0x1d08,
+  SA1_combinerz_MASK = 0x1d08,
+  SA1_combinezr_BITS = 0x1d00,
+  SA1_combinezr_MASK = 0x1d08,
+  SA1_dec_BITS = 0x1300,
+  SA1_dec_MASK = 0x1f00,
+  SA1_inc_BITS = 0x1100,
+  SA1_inc_MASK = 0x1f00,
+  SA1_seti_BITS = 0x0800,
+  SA1_seti_MASK = 0x1c00,
+  SA1_setin1_BITS = 0x1a00,
+  SA1_setin1_MASK = 0x1e40,
+  SA1_sxtb_BITS = 0x1500,
+  SA1_sxtb_MASK = 0x1f00,
+  SA1_sxth_BITS = 0x1400,
+  SA1_sxth_MASK = 0x1f00,
+  SA1_tfr_BITS = 0x1000,
+  SA1_tfr_MASK = 0x1f00,
+  SA1_zxtb_BITS = 0x1700,
+  SA1_zxtb_MASK = 0x1f00,
+  SA1_zxth_BITS = 0x1600,
+  SA1_zxth_MASK = 0x1f00,
+  SL1_loadri_io_BITS = 0x0000,
+  SL1_loadri_io_MASK = 0x1000,
+  SL1_loadrub_io_BITS = 0x1000,
+  SL1_loadrub_io_MASK = 0x1000,
+  SL2_deallocframe_BITS = 0x1f00,
+  SL2_deallocframe_MASK = 0x1fc0,
+  SL2_jumpr31_BITS = 0x1fc0,
+  SL2_jumpr31_MASK = 0x1fc4,
+  SL2_jumpr31_f_BITS = 0x1fc5,
+  SL2_jumpr31_f_MASK = 0x1fc7,
+  SL2_jumpr31_fnew_BITS = 0x1fc7,
+  SL2_jumpr31_fnew_MASK = 0x1fc7,
+  SL2_jumpr31_t_BITS = 0x1fc4,
+  SL2_jumpr31_t_MASK = 0x1fc7,
+  SL2_jumpr31_tnew_BITS = 0x1fc6,
+  SL2_jumpr31_tnew_MASK = 0x1fc7,
+  SL2_loadrb_io_BITS = 0x1000,
+  SL2_loadrb_io_MASK = 0x1800,
+  SL2_loadrd_sp_BITS = 0x1e00,
+  SL2_loadrd_sp_MASK = 0x1f00,
+  SL2_loadrh_io_BITS = 0x0000,
+  SL2_loadrh_io_MASK = 0x1800,
+  SL2_loadri_sp_BITS = 0x1c00,
+  SL2_loadri_sp_MASK = 0x1e00,
+  SL2_loadruh_io_BITS = 0x0800,
+  SL2_loadruh_io_MASK = 0x1800,
+  SL2_return_BITS = 0x1f40,
+  SL2_return_MASK = 0x1fc4,
+  SL2_return_f_BITS = 0x1f45,
+  SL2_return_f_MASK = 0x1fc7,
+  SL2_return_fnew_BITS = 0x1f47,
+  SL2_return_fnew_MASK = 0x1fc7,
+  SL2_return_t_BITS = 0x1f44,
+  SL2_return_t_MASK = 0x1fc7,
+  SL2_return_tnew_BITS = 0x1f46,
+  SL2_return_tnew_MASK = 0x1fc7,
+  SS1_storeb_io_BITS = 0x1000,
+  SS1_storeb_io_MASK = 0x1000,
+  SS1_storew_io_BITS = 0x0000,
+  SS1_storew_io_MASK = 0x1000,
+  SS2_allocframe_BITS = 0x1c00,
+  SS2_allocframe_MASK = 0x1e00,
+  SS2_storebi0_BITS = 0x1200,
+  SS2_storebi0_MASK = 0x1f00,
+  SS2_storebi1_BITS = 0x1300,
+  SS2_storebi1_MASK = 0x1f00,
+  SS2_stored_sp_BITS = 0x0a00,
+  SS2_stored_sp_MASK = 0x1e00,
+  SS2_storeh_io_BITS = 0x0000,
+  SS2_storeh_io_MASK = 0x1800,
+  SS2_storew_sp_BITS = 0x0800,
+  SS2_storew_sp_MASK = 0x1e00,
+  SS2_storewi0_BITS = 0x1000,
+  SS2_storewi0_MASK = 0x1f00,
+  SS2_storewi1_BITS = 0x1100,
+  SS2_storewi1_MASK = 0x1f00
+};
+
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+                                 raw_ostream &os) {
+  switch (IClass) {
+  case HexagonII::HSIG_L1:
+    if ((inst & SL1_loadri_io_MASK) == SL1_loadri_io_BITS)
+      op = Hexagon::SL1_loadri_io;
+    else if ((inst & SL1_loadrub_io_MASK) == SL1_loadrub_io_BITS)
+      op = Hexagon::SL1_loadrub_io;
+    else {
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_L2:
+    if ((inst & SL2_deallocframe_MASK) == SL2_deallocframe_BITS)
+      op = Hexagon::SL2_deallocframe;
+    else if ((inst & SL2_jumpr31_MASK) == SL2_jumpr31_BITS)
+      op = Hexagon::SL2_jumpr31;
+    else if ((inst & SL2_jumpr31_f_MASK) == SL2_jumpr31_f_BITS)
+      op = Hexagon::SL2_jumpr31_f;
+    else if ((inst & SL2_jumpr31_fnew_MASK) == SL2_jumpr31_fnew_BITS)
+      op = Hexagon::SL2_jumpr31_fnew;
+    else if ((inst & SL2_jumpr31_t_MASK) == SL2_jumpr31_t_BITS)
+      op = Hexagon::SL2_jumpr31_t;
+    else if ((inst & SL2_jumpr31_tnew_MASK) == SL2_jumpr31_tnew_BITS)
+      op = Hexagon::SL2_jumpr31_tnew;
+    else if ((inst & SL2_loadrb_io_MASK) == SL2_loadrb_io_BITS)
+      op = Hexagon::SL2_loadrb_io;
+    else if ((inst & SL2_loadrd_sp_MASK) == SL2_loadrd_sp_BITS)
+      op = Hexagon::SL2_loadrd_sp;
+    else if ((inst & SL2_loadrh_io_MASK) == SL2_loadrh_io_BITS)
+      op = Hexagon::SL2_loadrh_io;
+    else if ((inst & SL2_loadri_sp_MASK) == SL2_loadri_sp_BITS)
+      op = Hexagon::SL2_loadri_sp;
+    else if ((inst & SL2_loadruh_io_MASK) == SL2_loadruh_io_BITS)
+      op = Hexagon::SL2_loadruh_io;
+    else if ((inst & SL2_return_MASK) == SL2_return_BITS)
+      op = Hexagon::SL2_return;
+    else if ((inst & SL2_return_f_MASK) == SL2_return_f_BITS)
+      op = Hexagon::SL2_return_f;
+    else if ((inst & SL2_return_fnew_MASK) == SL2_return_fnew_BITS)
+      op = Hexagon::SL2_return_fnew;
+    else if ((inst & SL2_return_t_MASK) == SL2_return_t_BITS)
+      op = Hexagon::SL2_return_t;
+    else if ((inst & SL2_return_tnew_MASK) == SL2_return_tnew_BITS)
+      op = Hexagon::SL2_return_tnew;
+    else {
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_A:
+    if ((inst & SA1_addi_MASK) == SA1_addi_BITS)
+      op = Hexagon::SA1_addi;
+    else if ((inst & SA1_addrx_MASK) == SA1_addrx_BITS)
+      op = Hexagon::SA1_addrx;
+    else if ((inst & SA1_addsp_MASK) == SA1_addsp_BITS)
+      op = Hexagon::SA1_addsp;
+    else if ((inst & SA1_and1_MASK) == SA1_and1_BITS)
+      op = Hexagon::SA1_and1;
+    else if ((inst & SA1_clrf_MASK) == SA1_clrf_BITS)
+      op = Hexagon::SA1_clrf;
+    else if ((inst & SA1_clrfnew_MASK) == SA1_clrfnew_BITS)
+      op = Hexagon::SA1_clrfnew;
+    else if ((inst & SA1_clrt_MASK) == SA1_clrt_BITS)
+      op = Hexagon::SA1_clrt;
+    else if ((inst & SA1_clrtnew_MASK) == SA1_clrtnew_BITS)
+      op = Hexagon::SA1_clrtnew;
+    else if ((inst & SA1_cmpeqi_MASK) == SA1_cmpeqi_BITS)
+      op = Hexagon::SA1_cmpeqi;
+    else if ((inst & SA1_combine0i_MASK) == SA1_combine0i_BITS)
+      op = Hexagon::SA1_combine0i;
+    else if ((inst & SA1_combine1i_MASK) == SA1_combine1i_BITS)
+      op = Hexagon::SA1_combine1i;
+    else if ((inst & SA1_combine2i_MASK) == SA1_combine2i_BITS)
+      op = Hexagon::SA1_combine2i;
+    else if ((inst & SA1_combine3i_MASK) == SA1_combine3i_BITS)
+      op = Hexagon::SA1_combine3i;
+    else if ((inst & SA1_combinerz_MASK) == SA1_combinerz_BITS)
+      op = Hexagon::SA1_combinerz;
+    else if ((inst & SA1_combinezr_MASK) == SA1_combinezr_BITS)
+      op = Hexagon::SA1_combinezr;
+    else if ((inst & SA1_dec_MASK) == SA1_dec_BITS)
+      op = Hexagon::SA1_dec;
+    else if ((inst & SA1_inc_MASK) == SA1_inc_BITS)
+      op = Hexagon::SA1_inc;
+    else if ((inst & SA1_seti_MASK) == SA1_seti_BITS)
+      op = Hexagon::SA1_seti;
+    else if ((inst & SA1_setin1_MASK) == SA1_setin1_BITS)
+      op = Hexagon::SA1_setin1;
+    else if ((inst & SA1_sxtb_MASK) == SA1_sxtb_BITS)
+      op = Hexagon::SA1_sxtb;
+    else if ((inst & SA1_sxth_MASK) == SA1_sxth_BITS)
+      op = Hexagon::SA1_sxth;
+    else if ((inst & SA1_tfr_MASK) == SA1_tfr_BITS)
+      op = Hexagon::SA1_tfr;
+    else if ((inst & SA1_zxtb_MASK) == SA1_zxtb_BITS)
+      op = Hexagon::SA1_zxtb;
+    else if ((inst & SA1_zxth_MASK) == SA1_zxth_BITS)
+      op = Hexagon::SA1_zxth;
+    else {
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_S1:
+    if ((inst & SS1_storeb_io_MASK) == SS1_storeb_io_BITS)
+      op = Hexagon::SS1_storeb_io;
+    else if ((inst & SS1_storew_io_MASK) == SS1_storew_io_BITS)
+      op = Hexagon::SS1_storew_io;
+    else {
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_S2:
+    if ((inst & SS2_allocframe_MASK) == SS2_allocframe_BITS)
+      op = Hexagon::SS2_allocframe;
+    else if ((inst & SS2_storebi0_MASK) == SS2_storebi0_BITS)
+      op = Hexagon::SS2_storebi0;
+    else if ((inst & SS2_storebi1_MASK) == SS2_storebi1_BITS)
+      op = Hexagon::SS2_storebi1;
+    else if ((inst & SS2_stored_sp_MASK) == SS2_stored_sp_BITS)
+      op = Hexagon::SS2_stored_sp;
+    else if ((inst & SS2_storeh_io_MASK) == SS2_storeh_io_BITS)
+      op = Hexagon::SS2_storeh_io;
+    else if ((inst & SS2_storew_sp_MASK) == SS2_storew_sp_BITS)
+      op = Hexagon::SS2_storew_sp;
+    else if ((inst & SS2_storewi0_MASK) == SS2_storewi0_BITS)
+      op = Hexagon::SS2_storewi0;
+    else if ((inst & SS2_storewi1_MASK) == SS2_storewi1_BITS)
+      op = Hexagon::SS2_storewi1;
+    else {
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  default:
+    os << "<unknown>";
+    return MCDisassembler::Fail;
+  }
+  return MCDisassembler::Success;
+}
+
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
+  if (encoded_reg < 8)
+    return Hexagon::R0 + encoded_reg;
+  else if (encoded_reg < 16)
+    return Hexagon::R0 + encoded_reg + 8;
+
+  // patently false value
+  return Hexagon::NoRegister;
+}
+
+static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
+  if (encoded_dreg < 4)
+    return Hexagon::D0 + encoded_dreg;
+  else if (encoded_dreg < 8)
+    return Hexagon::D0 + encoded_dreg + 4;
+
+  // patently false value
+  return Hexagon::NoRegister;
+}
+
+void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
+                                             unsigned inst) const {
+  int64_t operand;
+  MCOperand Op;
+  switch (opcode) {
+  case Hexagon::SL2_deallocframe:
+  case Hexagon::SL2_jumpr31:
+  case Hexagon::SL2_jumpr31_f:
+  case Hexagon::SL2_jumpr31_fnew:
+  case Hexagon::SL2_jumpr31_t:
+  case Hexagon::SL2_jumpr31_tnew:
+  case Hexagon::SL2_return:
+  case Hexagon::SL2_return_f:
+  case Hexagon::SL2_return_fnew:
+  case Hexagon::SL2_return_t:
+  case Hexagon::SL2_return_tnew:
+    // no operands for these instructions
+    break;
+  case Hexagon::SS2_allocframe:
+    // u 8-4{5_3}
+    operand = ((inst & 0x1f0) >> 4) << 3;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL1_loadri_io:
+    // Rd 3-0, Rs 7-4, u 11-8{4_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 6;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL1_loadrub_io:
+    // Rd 3-0, Rs 7-4, u 11-8
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 8;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL2_loadrb_io:
+    // Rd 3-0, Rs 7-4, u 10-8
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x700) >> 8;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL2_loadrh_io:
+  case Hexagon::SL2_loadruh_io:
+    // Rd 3-0, Rs 7-4, u 10-8{3_1}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x700) >> 8) << 1;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL2_loadrd_sp:
+    // Rdd 2-0, u 7-3{5_3}
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x0f8) >> 3) << 3;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SL2_loadri_sp:
+    // Rd 3-0, u 8-4{5_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x1f0) >> 4) << 2;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_addi:
+    // Rx 3-0 (x2), s7 10-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    MI->addOperand(Op);
+    operand = SignExtend64<7>((inst & 0x7f0) >> 4);
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_addrx:
+    // Rx 3-0 (x2), Rs 7-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SA1_and1:
+  case Hexagon::SA1_dec:
+  case Hexagon::SA1_inc:
+  case Hexagon::SA1_sxtb:
+  case Hexagon::SA1_sxth:
+  case Hexagon::SA1_tfr:
+  case Hexagon::SA1_zxtb:
+  case Hexagon::SA1_zxth:
+    // Rd 3-0, Rs 7-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SA1_addsp:
+    // Rd 3-0, u 9-4{6_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x3f0) >> 4) << 2;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_seti:
+    // Rd 3-0, u 9-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x3f0) >> 4;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_clrf:
+  case Hexagon::SA1_clrfnew:
+  case Hexagon::SA1_clrt:
+  case Hexagon::SA1_clrtnew:
+  case Hexagon::SA1_setin1:
+    // Rd 3-0
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    if (opcode == Hexagon::SA1_setin1)
+      break;
+    MI->addOperand(MCOperand::createReg(Hexagon::P0));
+    break;
+  case Hexagon::SA1_cmpeqi:
+    // Rs 7-4, u 1-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = inst & 0x3;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_combine0i:
+  case Hexagon::SA1_combine1i:
+  case Hexagon::SA1_combine2i:
+  case Hexagon::SA1_combine3i:
+    // Rdd 2-0, u 6-5
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x060) >> 5;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SA1_combinerz:
+  case Hexagon::SA1_combinezr:
+    // Rdd 2-0, Rs 7-4
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SS1_storeb_io:
+    // Rs 7-4, u 11-8, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 8;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SS1_storew_io:
+    // Rs 7-4, u 11-8{4_2}, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0xf00) >> 8) << 2;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SS2_storebi0:
+  case Hexagon::SS2_storebi1:
+    // Rs 7-4, u 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = inst & 0xf;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SS2_storewi0:
+  case Hexagon::SS2_storewi1:
+    // Rs 7-4, u 3-0{4_2}
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf) << 2;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    break;
+  case Hexagon::SS2_stored_sp:
+    // s 8-3{6_3}, Rtt 2-0
+    operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SS2_storeh_io:
+    // Rs 7-4, u 10-8{3_1}, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x700) >> 8) << 1;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::SS2_storew_sp:
+    // u 8-4{5_2}, Rd 3-0
+    operand = ((inst & 0x1f0) >> 4) << 2;
+    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  default:
+    // don't crash with an invalid subinstruction
+    // llvm_unreachable("Invalid subinstruction in duplex instruction");
+    break;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
new file mode 100644
index 000000000000..ed7d9578902e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
@@ -0,0 +1,56 @@
+//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Hexagon back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
+
+#define Hexagon_POINTER_SIZE 4
+
+#define Hexagon_PointerSize (Hexagon_POINTER_SIZE)
+#define Hexagon_PointerSize_Bits (Hexagon_POINTER_SIZE * 8)
+#define Hexagon_WordSize Hexagon_PointerSize
+#define Hexagon_WordSize_Bits Hexagon_PointerSize_Bits
+
+// allocframe saves LR and FP on stack before allocating
+// a new stack frame. This takes 8 bytes.
+#define HEXAGON_LRFP_SIZE 8
+
+// Normal instruction size (in bytes).
+#define HEXAGON_INSTR_SIZE 4
+
+// Maximum number of words and instructions in a packet.
+#define HEXAGON_PACKET_SIZE 4
+#define HEXAGON_MAX_PACKET_SIZE (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE)
+// Minimum number of instructions in an end-loop packet.
+#define HEXAGON_PACKET_INNER_SIZE 2
+#define HEXAGON_PACKET_OUTER_SIZE 3
+// Maximum number of instructions in a packet before shuffling,
+// including a compound one or a duplex or an extender.
+#define HEXAGON_PRESHUFFLE_PACKET_SIZE (HEXAGON_PACKET_SIZE + 3)
+
+// Name of the global offset table as defined by the Hexagon ABI
+#define HEXAGON_GOT_SYM_NAME "_GLOBAL_OFFSET_TABLE_"
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class HexagonTargetMachine;
+
+  /// \brief Creates a Hexagon-specific Target Transformation Info pass.
+  ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM);
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
new file mode 100644
index 000000000000..0b2b46387b6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -0,0 +1,295 @@
+//===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Hexagon Subtarget features.
+//===----------------------------------------------------------------------===//
+
+// Hexagon Architectures
+def ArchV4:  SubtargetFeature<"v4",  "HexagonArchVersion", "V4",  "Hexagon V4">;
+def ArchV5:  SubtargetFeature<"v5",  "HexagonArchVersion", "V5",  "Hexagon V5">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+
+def FeatureHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
+      "Hexagon HVX instructions">;
+def FeatureHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
+      "Hexagon HVX Double instructions">;
+def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
+      "Use constant-extended calls">;
+
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasV5T             : Predicate<"HST->hasV5TOps()">;
+def NoV5T              : Predicate<"!HST->hasV5TOps()">;
+def HasV55T            : Predicate<"HST->hasV55TOps()">,
+                         AssemblerPredicate<"ArchV55">;
+def HasV60T            : Predicate<"HST->hasV60TOps()">,
+                         AssemblerPredicate<"ArchV60">;
+def UseMEMOP           : Predicate<"HST->useMemOps()">;
+def IEEERndNearV5T     : Predicate<"HST->modeIEEERndNear()">;
+def UseHVXDbl          : Predicate<"HST->useHVXDblOps()">,
+                         AssemblerPredicate<"FeatureHVXDbl">;
+def UseHVXSgl          : Predicate<"HST->useHVXSglOps()">;
+def UseHVX             : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">,
+                         AssemblerPredicate<"FeatureHVX">;
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+
+class ImmRegShl;
+// PredRel - Filter class used to relate non-predicated instructions with their
+// predicated forms.
+class PredRel;
+// PredNewRel - Filter class used to relate predicated instructions with their
+// predicate-new forms.
+class PredNewRel: PredRel;
+// ImmRegRel - Filter class used to relate instructions having reg-reg form
+// with their reg-imm counterparts.
+class ImmRegRel;
+// NewValueRel - Filter class used to relate regular store instructions with
+// their new-value store form.
+class NewValueRel: PredNewRel;
+// NewValueRel - Filter class used to relate load/store instructions having
+// different addressing modes with each other.
+class AddrModeRel: NewValueRel;
+class IntrinsicsRel;
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate non-predicate instructions with their
+// predicated formats - true and false.
+//
+
+def getPredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["PredSense"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = [""];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["true"], ["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-true instructions with their
+// predicate-false forms
+//
+def getFalsePredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
+  let ColFields = ["PredSense"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicate-false instructions with their
+// predicate-true forms
+//
+def getTruePredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"];
+  let ColFields = ["PredSense"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicated instructions with their .new
+// format.
+//
+def getPredNewOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = [""];
+  let ValueCols = [["new"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate .new predicated instructions with their old
+// format.
+//
+def getPredOldOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = ["new"];
+  let ValueCols = [[""]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate store instructions with their new-value
+// format.
+//
+def getNewValueOpcode : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate new-value store instructions with their old
+// format.
+//
+def getNonNVStore : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
+}
+
+def getBaseWithImmOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+                   "isFloat"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["Absolute"];
+  let ValueCols = [["BaseImmOffset"]];
+}
+
+def getAbsoluteForm : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+                   "isFloat"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseImmOffset"];
+  let ValueCols = [["Absolute"]];
+}
+
+def getBaseWithRegOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseImmOffset"];
+  let ValueCols = [["BaseRegOffset"]];
+}
+
+def xformRegToImmOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseRegOffset"];
+  let ValueCols = [["BaseImmOffset"]];
+}
+
+def getBaseWithLongOffset : InstrMapping {
+  let FilterClass = "ImmRegShl";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseRegOffset"];
+  let ValueCols = [["BaseLongOffset"]];
+}
+
+def getRegForm : InstrMapping {
+  let FilterClass = "ImmRegRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["InputType"];
+  let KeyCol = ["imm"];
+  let ValueCols = [["reg"]];
+}
+
+def getRegShlForm : InstrMapping {
+  let FilterClass = "ImmRegShl";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["InputType"];
+  let KeyCol = ["imm"];
+  let ValueCols = [["reg"]];
+}
+
+def notTakenBranchPrediction : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue",  "PredSense", "isBranch", "isPredicated"];
+  let ColFields = ["isBrTaken"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
+}
+
+def takenBranchPrediction : InstrMapping {
+  let FilterClass = "PredRel";
+  let RowFields = ["BaseOpcode", "PNewValue",  "PredSense", "isBranch", "isPredicated"];
+  let ColFields = ["isBrTaken"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+def getRealHWInstr : InstrMapping {
+  let FilterClass = "IntrinsicsRel";
+  let RowFields = ["BaseOpcode"];
+  let ColFields = ["InstrType"];
+  let KeyCol = ["Pseudo"];
+  let ValueCols = [["Pseudo"], ["Real"]];
+}
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+include "HexagonSchedule.td"
+include "HexagonRegisterInfo.td"
+include "HexagonCallingConv.td"
+include "HexagonInstrInfo.td"
+include "HexagonPatterns.td"
+include "HexagonIntrinsics.td"
+include "HexagonIntrinsicsDerived.td"
+
+def HexagonInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Hexagon processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, SchedMachineModel Model,
+           list<SubtargetFeature> Features>
+ : ProcessorModel<Name, Model, Features>;
+
+def : Proc<"hexagonv4",  HexagonModelV4,
+           [ArchV4]>;
+def : Proc<"hexagonv5",  HexagonModelV4,
+           [ArchV4, ArchV5]>;
+def : Proc<"hexagonv55", HexagonModelV55,
+           [ArchV4, ArchV5, ArchV55]>;
+def : Proc<"hexagonv60", HexagonModelV60,
+           [ArchV4, ArchV5, ArchV55, ArchV60, FeatureHVX]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def HexagonAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterAltName = 1;
+  bit HasMnemonicFirst = 0;
+}
+
+def HexagonAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string TokenizingCharacters = "#()=:.<>!+*-|^&";
+}
+
+def Hexagon : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = HexagonInstrInfo;
+  let AssemblyParsers = [HexagonAsmParser];
+  let AssemblyParserVariants = [HexagonAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
new file mode 100644
index 000000000000..54db5ad4374b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -0,0 +1,604 @@
+//===-- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Hexagon assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                        MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
+#define DEBUG_TYPE "asm-printer"
+
+static cl::opt<bool> AlignCalls(
+         "hexagon-align-calls", cl::Hidden, cl::init(true),
+          cl::desc("Insert falign after call instruction for Hexagon target"));
+
+// Given a scalar register return its pair.
+inline static unsigned getHexagonRegisterPair(unsigned Reg,
+      const MCRegisterInfo *RI) {
+  assert(Hexagon::IntRegsRegClass.contains(Reg));
+  MCSuperRegIterator SR(Reg, RI, false);
+  unsigned Pair = *SR;
+  assert(Hexagon::DoubleRegsRegClass.contains(Pair));
+  return Pair;
+}
+
+HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM,
+                                     std::unique_ptr<MCStreamer> Streamer)
+    : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {}
+
+void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MO.getType()) {
+  default: llvm_unreachable ("<unknown operand type>");
+  case MachineOperand::MO_Register:
+    O << HexagonInstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    GetCPISymbol(MO.getIndex())->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    // Computing the address of a global symbol, not calling it.
+    getSymbol(MO.getGlobal())->print(O, MAI);
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+}
+
+//
+// isBlockOnlyReachableByFallthrough - We need to override this since the
+// default AsmPrinter does not print labels for any basic block that
+// is only reachable by a fall through. That works for all cases except
+// for the case in which the basic block is reachable by a fall through but
+// through an indirect from a jump table. In this case, the jump table
+// will contain a label not defined by AsmPrinter.
+//
+bool HexagonAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  if (MBB->hasAddressTaken())
+    return false;
+  return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
+}
+
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+    case 'c': // Don't print "$" before a global var name or constant.
+      // Hexagon never has a prefix.
+      printOperand(MI, OpNo, OS);
+      return false;
+    case 'L':
+    case 'H': { // The highest-numbered register of a pair.
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+      if (!MO.isReg())
+        return true;
+      unsigned RegNumber = MO.getReg();
+      // This should be an assert in the frontend.
+      if (Hexagon::DoubleRegsRegClass.contains(RegNumber))
+        RegNumber = TRI->getSubReg(RegNumber, ExtraCode[0] == 'L' ?
+                                              Hexagon::isub_lo :
+                                              Hexagon::isub_hi);
+      OS << HexagonInstPrinter::getRegisterName(RegNumber);
+      return false;
+    }
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        OS << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, OS);
+  return false;
+}
+
+bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNo, unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &Base  = MI->getOperand(OpNo);
+  const MachineOperand &Offset = MI->getOperand(OpNo+1);
+
+  if (Base.isReg())
+    printOperand(MI, OpNo, O);
+  else
+    llvm_unreachable("Unimplemented");
+
+  if (Offset.isImm()) {
+    if (Offset.getImm())
+      O << " + #" << Offset.getImm();
+  }
+  else
+    llvm_unreachable("Unimplemented");
+
+  return false;
+}
+
+static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
+                           MCStreamer &OutStreamer, const MCOperand &Imm,
+                           int AlignSize) {
+  MCSymbol *Sym;
+  int64_t Value;
+  if (Imm.getExpr()->evaluateAsAbsolute(Value)) {
+    StringRef sectionPrefix;
+    std::string ImmString;
+    StringRef Name;
+    if (AlignSize == 8) {
+       Name = ".CONST_0000000000000000";
+       sectionPrefix = ".gnu.linkonce.l8";
+       ImmString = utohexstr(Value);
+    } else {
+       Name = ".CONST_00000000";
+       sectionPrefix = ".gnu.linkonce.l4";
+       ImmString = utohexstr(static_cast<uint32_t>(Value));
+    }
+
+    std::string symbolName =   // Yes, leading zeros are kept.
+      Name.drop_back(ImmString.size()).str() + ImmString;
+    std::string sectionName = sectionPrefix.str() + symbolName;
+
+    MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+        sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+    OutStreamer.SwitchSection(Section);
+
+    Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName));
+    if (Sym->isUndefined()) {
+      OutStreamer.EmitLabel(Sym);
+      OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+      OutStreamer.EmitIntValue(Value, AlignSize);
+      OutStreamer.EmitCodeAlignment(AlignSize);
+    }
+  } else {
+    assert(Imm.isExpr() && "Expected expression and found none");
+    const MachineOperand &MO = MI.getOperand(1);
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = AP.getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = AP.GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = AP.GetJTISymbol(MO.getIndex());
+    else
+      llvm_unreachable("Unknown operand type!");
+
+    StringRef SymbolName = MOSymbol->getName();
+    std::string LitaName = ".CONST_" + SymbolName.str();
+
+    MCSectionELF *Section = OutStreamer.getContext().getELFSection(
+        ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+
+    OutStreamer.SwitchSection(Section);
+    Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName));
+    if (Sym->isUndefined()) {
+      OutStreamer.EmitLabel(Sym);
+      OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local);
+      OutStreamer.EmitValue(Imm.getExpr(), AlignSize);
+      OutStreamer.EmitCodeAlignment(AlignSize);
+    }
+  }
+  return Sym;
+}
+
+void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
+                                                  const MachineInstr &MI) {
+  MCInst &MappedInst = static_cast <MCInst &>(Inst);
+  const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+
+  switch (Inst.getOpcode()) {
+  default: return;
+
+  case Hexagon::A2_iconst: {
+    Inst.setOpcode(Hexagon::A2_addi);
+    MCOperand Reg = Inst.getOperand(0);
+    MCOperand S16 = Inst.getOperand(1);
+    HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
+    HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+    Inst.clear();
+    Inst.addOperand(Reg);
+    Inst.addOperand(MCOperand::createReg(Hexagon::R0));
+    Inst.addOperand(S16);
+    break;
+  }
+
+  // "$dst = CONST64(#$src1)",
+  case Hexagon::CONST64:
+    if (!OutStreamer->hasRawTextSupport()) {
+      const MCOperand &Imm = MappedInst.getOperand(1);
+      MCSectionSubPair Current = OutStreamer->getCurrentSection();
+
+      MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8);
+
+      OutStreamer->SwitchSection(Current.first, Current.second);
+      MCInst TmpInst;
+      MCOperand &Reg = MappedInst.getOperand(0);
+      TmpInst.setOpcode(Hexagon::L2_loadrdgp);
+      TmpInst.addOperand(Reg);
+      TmpInst.addOperand(MCOperand::createExpr(
+                         MCSymbolRefExpr::create(Sym, OutContext)));
+      MappedInst = TmpInst;
+
+    }
+    break;
+  case Hexagon::CONST32:
+    if (!OutStreamer->hasRawTextSupport()) {
+      MCOperand &Imm = MappedInst.getOperand(1);
+      MCSectionSubPair Current = OutStreamer->getCurrentSection();
+      MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4);
+      OutStreamer->SwitchSection(Current.first, Current.second);
+      MCInst TmpInst;
+      MCOperand &Reg = MappedInst.getOperand(0);
+      TmpInst.setOpcode(Hexagon::L2_loadrigp);
+      TmpInst.addOperand(Reg);
+      TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+          MCSymbolRefExpr::create(Sym, OutContext), OutContext)));
+      MappedInst = TmpInst;
+    }
+    break;
+
+  // C2_pxfer_map maps to C2_or instruction. Though, it's possible to use
+  // C2_or during instruction selection itself but it results
+  // into suboptimal code.
+  case Hexagon::C2_pxfer_map: {
+    MCOperand &Ps = Inst.getOperand(1);
+    MappedInst.setOpcode(Hexagon::C2_or);
+    MappedInst.addOperand(Ps);
+    return;
+  }
+
+  // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo
+  // The insn is mapped from the 4 operand to the 3 operand raw form taking
+  // 3 register pairs.
+  case Hexagon::M2_vrcmpys_acc_s1: {
+    MCOperand &Rt = Inst.getOperand(3);
+    assert (Rt.isReg() && "Expected register and none was found");
+    unsigned Reg = RI->getEncodingValue(Rt.getReg());
+    if (Reg & 1)
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h);
+    else
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l);
+    Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+    return;
+  }
+  case Hexagon::M2_vrcmpys_s1: {
+    MCOperand &Rt = Inst.getOperand(2);
+    assert (Rt.isReg() && "Expected register and none was found");
+    unsigned Reg = RI->getEncodingValue(Rt.getReg());
+    if (Reg & 1)
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h);
+    else
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l);
+    Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+    return;
+  }
+
+  case Hexagon::M2_vrcmpys_s1rp: {
+    MCOperand &Rt = Inst.getOperand(2);
+    assert (Rt.isReg() && "Expected register and none was found");
+    unsigned Reg = RI->getEncodingValue(Rt.getReg());
+    if (Reg & 1)
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h);
+    else
+      MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l);
+    Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+    return;
+  }
+
+  case Hexagon::A4_boundscheck: {
+    MCOperand &Rs = Inst.getOperand(1);
+    assert (Rs.isReg() && "Expected register and none was found");
+    unsigned Reg = RI->getEncodingValue(Rs.getReg());
+    if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2
+      MappedInst.setOpcode(Hexagon::A4_boundscheck_hi);
+    else         // raw:lo
+      MappedInst.setOpcode(Hexagon::A4_boundscheck_lo);
+    Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
+    return;
+  }
+  case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
+    MCOperand &MO = MappedInst.getOperand(2);
+    int64_t Imm;
+    MCExpr const *Expr = MO.getExpr();
+    bool Success = Expr->evaluateAsAbsolute(Imm);
+    assert (Success && "Expected immediate and none was found");
+    (void)Success;
+    MCInst TmpInst;
+    if (Imm == 0) {
+      TmpInst.setOpcode(Hexagon::S2_vsathub);
+      TmpInst.addOperand(MappedInst.getOperand(0));
+      TmpInst.addOperand(MappedInst.getOperand(1));
+      MappedInst = TmpInst;
+      return;
+    }
+    TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat);
+    TmpInst.addOperand(MappedInst.getOperand(0));
+    TmpInst.addOperand(MappedInst.getOperand(1));
+    const MCExpr *One = MCConstantExpr::create(1, OutContext);
+    const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+    TmpInst.addOperand(
+        MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+    MappedInst = TmpInst;
+    return;
+  }
+  case Hexagon::S5_vasrhrnd_goodsyntax:
+  case Hexagon::S2_asr_i_p_rnd_goodsyntax: {
+    MCOperand &MO2 = MappedInst.getOperand(2);
+    MCExpr const *Expr = MO2.getExpr();
+    int64_t Imm;
+    bool Success = Expr->evaluateAsAbsolute(Imm);
+    assert (Success && "Expected immediate and none was found");
+    (void)Success;
+    MCInst TmpInst;
+    if (Imm == 0) {
+      TmpInst.setOpcode(Hexagon::A2_combinew);
+      TmpInst.addOperand(MappedInst.getOperand(0));
+      MCOperand &MO1 = MappedInst.getOperand(1);
+      unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::isub_hi);
+      unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::isub_lo);
+      // Add a new operand for the second register in the pair.
+      TmpInst.addOperand(MCOperand::createReg(High));
+      TmpInst.addOperand(MCOperand::createReg(Low));
+      MappedInst = TmpInst;
+      return;
+    }
+
+    if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax)
+      TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd);
+    else
+      TmpInst.setOpcode(Hexagon::S5_vasrhrnd);
+    TmpInst.addOperand(MappedInst.getOperand(0));
+    TmpInst.addOperand(MappedInst.getOperand(1));
+    const MCExpr *One = MCConstantExpr::create(1, OutContext);
+    const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+    TmpInst.addOperand(
+        MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+    MappedInst = TmpInst;
+    return;
+  }
+  // if ("#u5==0") Assembler mapped to: "Rd=Rs"; else Rd=asr(Rs,#u5-1):rnd
+  case Hexagon::S2_asr_i_r_rnd_goodsyntax: {
+    MCOperand &MO = Inst.getOperand(2);
+    MCExpr const *Expr = MO.getExpr();
+    int64_t Imm;
+    bool Success = Expr->evaluateAsAbsolute(Imm);
+    assert (Success && "Expected immediate and none was found");
+    (void)Success;
+    MCInst TmpInst;
+    if (Imm == 0) {
+      TmpInst.setOpcode(Hexagon::A2_tfr);
+      TmpInst.addOperand(MappedInst.getOperand(0));
+      TmpInst.addOperand(MappedInst.getOperand(1));
+      MappedInst = TmpInst;
+      return;
+    }
+    TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd);
+    TmpInst.addOperand(MappedInst.getOperand(0));
+    TmpInst.addOperand(MappedInst.getOperand(1));
+    const MCExpr *One = MCConstantExpr::create(1, OutContext);
+    const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext);
+    TmpInst.addOperand(
+        MCOperand::createExpr(HexagonMCExpr::create(Sub, OutContext)));
+    MappedInst = TmpInst;
+    return;
+  }
+
+  // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)"
+  case Hexagon::A2_tfrpi: {
+    MCInst TmpInst;
+    MCOperand &Rdd = MappedInst.getOperand(0);
+    MCOperand &MO = MappedInst.getOperand(1);
+
+    TmpInst.setOpcode(Hexagon::A2_combineii);
+    TmpInst.addOperand(Rdd);
+    int64_t Imm;
+    bool Success = MO.getExpr()->evaluateAsAbsolute(Imm);
+    if (Success && Imm < 0) {
+      const MCExpr *MOne = MCConstantExpr::create(-1, OutContext);
+      TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(MOne, OutContext)));
+    } else {
+      const MCExpr *Zero = MCConstantExpr::create(0, OutContext);
+      TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(Zero, OutContext)));
+    }
+    TmpInst.addOperand(MO);
+    MappedInst = TmpInst;
+    return;
+  }
+  // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
+  case Hexagon::A2_tfrp: {
+    MCOperand &MO = MappedInst.getOperand(1);
+    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MO.setReg(High);
+    // Add a new operand for the second register in the pair.
+    MappedInst.addOperand(MCOperand::createReg(Low));
+    MappedInst.setOpcode(Hexagon::A2_combinew);
+    return;
+  }
+
+  case Hexagon::A2_tfrpt:
+  case Hexagon::A2_tfrpf: {
+    MCOperand &MO = MappedInst.getOperand(2);
+    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MO.setReg(High);
+    // Add a new operand for the second register in the pair.
+    MappedInst.addOperand(MCOperand::createReg(Low));
+    MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
+                          ? Hexagon::C2_ccombinewt
+                          : Hexagon::C2_ccombinewf);
+    return;
+  }
+  case Hexagon::A2_tfrptnew:
+  case Hexagon::A2_tfrpfnew: {
+    MCOperand &MO = MappedInst.getOperand(2);
+    unsigned High = RI->getSubReg(MO.getReg(), Hexagon::isub_hi);
+    unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::isub_lo);
+    MO.setReg(High);
+    // Add a new operand for the second register in the pair.
+    MappedInst.addOperand(MCOperand::createReg(Low));
+    MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
+                          ? Hexagon::C2_ccombinewnewt
+                          : Hexagon::C2_ccombinewnewf);
+    return;
+  }
+
+  case Hexagon::M2_mpysmi: {
+    MCOperand &Imm = MappedInst.getOperand(2);
+    MCExpr const *Expr = Imm.getExpr();
+    int64_t Value;
+    bool Success = Expr->evaluateAsAbsolute(Value);
+    assert(Success);
+    (void)Success;
+    if (Value < 0 && Value > -256) {
+      MappedInst.setOpcode(Hexagon::M2_mpysin);
+      Imm.setExpr(HexagonMCExpr::create(
+          MCUnaryExpr::createMinus(Expr, OutContext), OutContext));
+    } else
+      MappedInst.setOpcode(Hexagon::M2_mpysip);
+    return;
+  }
+
+  case Hexagon::A2_addsp: {
+    MCOperand &Rt = Inst.getOperand(1);
+    assert (Rt.isReg() && "Expected register and none was found");
+    unsigned Reg = RI->getEncodingValue(Rt.getReg());
+    if (Reg & 1)
+      MappedInst.setOpcode(Hexagon::A2_addsph);
+    else
+      MappedInst.setOpcode(Hexagon::A2_addspl);
+    Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI));
+    return;
+  }
+  case Hexagon::V6_vd0:
+  case Hexagon::V6_vd0_128B: {
+    MCInst TmpInst;
+    assert (Inst.getOperand(0).isReg() &&
+            "Expected register and none was found");
+
+    TmpInst.setOpcode(Hexagon::V6_vxor);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    MappedInst = TmpInst;
+    return;
+  }
+
+  }
+}
+
+
+/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
+/// the current output stream.
+///
+void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst MCB = HexagonMCInstrInfo::createBundle();
+  const MCInstrInfo &MCII = *Subtarget->getInstrInfo();
+
+  if (MI->isBundle()) {
+    const MachineBasicBlock* MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
+    unsigned IgnoreCount = 0;
+
+    for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
+      if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
+          MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+        ++IgnoreCount;
+      else
+        HexagonLowerToMC(MCII, &*MII, MCB, *this);
+  }
+  else
+    HexagonLowerToMC(MCII, MI, MCB, *this);
+
+  bool Ok = HexagonMCInstrInfo::canonicalizePacket(
+      MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr);
+  assert(Ok);
+  (void)Ok;
+  if(HexagonMCInstrInfo::bundleSize(MCB) == 0)
+    return;
+  OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
+}
+
+extern "C" void LLVMInitializeHexagonAsmPrinter() {
+  RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget());
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
new file mode 100644
index 000000000000..775da03e0f8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -0,0 +1,62 @@
+//===-- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class HexagonAsmPrinter : public AsmPrinter {
+    const HexagonSubtarget *Subtarget;
+
+  public:
+    explicit HexagonAsmPrinter(TargetMachine &TM,
+                               std::unique_ptr<MCStreamer> Streamer);
+
+    bool runOnMachineFunction(MachineFunction &Fn) override {
+      Subtarget = &Fn.getSubtarget<HexagonSubtarget>();
+      return AsmPrinter::runOnMachineFunction(Fn);
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon Assembly Printer";
+    }
+
+    bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock *MBB) const override;
+
+    void EmitInstruction(const MachineInstr *MI) override;
+
+    void HexagonProcessInstruction(MCInst &Inst,
+                                   const MachineInstr &MBB);
+
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &OS) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &OS) override;
+
+    static const char *getRegisterName(unsigned RegNo);
+  };
+
+} // end of llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
new file mode 100644
index 000000000000..fe7278fde1b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -0,0 +1,2881 @@
+//===--- HexagonBitSimplify.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexbit"
+
+#include "HexagonBitTracker.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
+  cl::init(true), cl::desc("Preserve subregisters in tied operands"));
+
+namespace llvm {
+
+  void initializeHexagonBitSimplifyPass(PassRegistry& Registry);
+  FunctionPass *createHexagonBitSimplify();
+
+} // end namespace llvm
+
+namespace {
+
+  // Set of virtual registers, based on BitVector.
+  struct RegisterSet : private BitVector {
+    RegisterSet() = default;
+    explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+    RegisterSet(const RegisterSet &RS) = default;
+
+    using BitVector::clear;
+    using BitVector::count;
+
+    unsigned find_first() const {
+      int First = BitVector::find_first();
+      if (First < 0)
+        return 0;
+      return x2v(First);
+    }
+
+    unsigned find_next(unsigned Prev) const {
+      int Next = BitVector::find_next(v2x(Prev));
+      if (Next < 0)
+        return 0;
+      return x2v(Next);
+    }
+
+    RegisterSet &insert(unsigned R) {
+      unsigned Idx = v2x(R);
+      ensure(Idx);
+      return static_cast<RegisterSet&>(BitVector::set(Idx));
+    }
+    RegisterSet &remove(unsigned R) {
+      unsigned Idx = v2x(R);
+      if (Idx >= size())
+        return *this;
+      return static_cast<RegisterSet&>(BitVector::reset(Idx));
+    }
+
+    RegisterSet &insert(const RegisterSet &Rs) {
+      return static_cast<RegisterSet&>(BitVector::operator|=(Rs));
+    }
+    RegisterSet &remove(const RegisterSet &Rs) {
+      return static_cast<RegisterSet&>(BitVector::reset(Rs));
+    }
+
+    reference operator[](unsigned R) {
+      unsigned Idx = v2x(R);
+      ensure(Idx);
+      return BitVector::operator[](Idx);
+    }
+    bool operator[](unsigned R) const {
+      unsigned Idx = v2x(R);
+      assert(Idx < size());
+      return BitVector::operator[](Idx);
+    }
+    bool has(unsigned R) const {
+      unsigned Idx = v2x(R);
+      if (Idx >= size())
+        return false;
+      return BitVector::test(Idx);
+    }
+
+    bool empty() const {
+      return !BitVector::any();
+    }
+    bool includes(const RegisterSet &Rs) const {
+      // A.BitVector::test(B)  <=>  A-B != {}
+      return !Rs.BitVector::test(*this);
+    }
+    bool intersects(const RegisterSet &Rs) const {
+      return BitVector::anyCommon(Rs);
+    }
+
+  private:
+    void ensure(unsigned Idx) {
+      if (size() <= Idx)
+        resize(std::max(Idx+1, 32U));
+    }
+
+    static inline unsigned v2x(unsigned v) {
+      return TargetRegisterInfo::virtReg2Index(v);
+    }
+
+    static inline unsigned x2v(unsigned x) {
+      return TargetRegisterInfo::index2VirtReg(x);
+    }
+  };
+
+  struct PrintRegSet {
+    PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+      : RS(S), TRI(RI) {}
+
+    friend raw_ostream &operator<< (raw_ostream &OS,
+          const PrintRegSet &P);
+
+  private:
+    const RegisterSet &RS;
+    const TargetRegisterInfo *TRI;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+    OS << '{';
+    for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R))
+      OS << ' ' << PrintReg(R, P.TRI);
+    OS << " }";
+    return OS;
+  }
+
+  class Transformation;
+
+  class HexagonBitSimplify : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonBitSimplify() : MachineFunctionPass(ID), MDT(nullptr) {
+      initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon bit simplification";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs);
+    static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses);
+    static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1,
+        const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W);
+    static bool isZero(const BitTracker::RegisterCell &RC, uint16_t B,
+        uint16_t W);
+    static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B,
+        uint16_t W, uint64_t &U);
+    static bool replaceReg(unsigned OldR, unsigned NewR,
+        MachineRegisterInfo &MRI);
+    static bool getSubregMask(const BitTracker::RegisterRef &RR,
+        unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI);
+    static bool replaceRegWithSub(unsigned OldR, unsigned NewR,
+        unsigned NewSR, MachineRegisterInfo &MRI);
+    static bool replaceSubWithSub(unsigned OldR, unsigned OldSR,
+        unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI);
+    static bool parseRegSequence(const MachineInstr &I,
+        BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH,
+        const MachineRegisterInfo &MRI);
+
+    static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+        uint16_t Begin);
+    static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits,
+        uint16_t Begin, const HexagonInstrInfo &HII);
+
+    static const TargetRegisterClass *getFinalVRegClass(
+        const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI);
+    static bool isTransparentCopy(const BitTracker::RegisterRef &RD,
+        const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI);
+
+  private:
+    MachineDominatorTree *MDT;
+
+    bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs);
+    static bool hasTiedUse(unsigned Reg, MachineRegisterInfo &MRI,
+        unsigned NewSub = Hexagon::NoSubRegister);
+  };
+
+  char HexagonBitSimplify::ID = 0;
+  typedef HexagonBitSimplify HBS;
+
+  // The purpose of this class is to provide a common facility to traverse
+  // the function top-down or bottom-up via the dominator tree, and keep
+  // track of the available registers.
+  class Transformation {
+  public:
+    bool TopDown;
+
+    Transformation(bool TD) : TopDown(TD) {}
+    virtual ~Transformation() = default;
+
+    virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0;
+  };
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit",
+      "Hexagon bit simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
+      "Hexagon bit simplification", false, false)
+
+bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
+      RegisterSet &AVs) {
+  MachineDomTreeNode *N = MDT->getNode(&B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  bool Changed = false;
+
+  if (T.TopDown)
+    Changed = T.processBlock(B, AVs);
+
+  RegisterSet Defs;
+  for (auto &I : B)
+    getInstrDefs(I, Defs);
+  RegisterSet NewAVs = AVs;
+  NewAVs.insert(Defs);
+
+  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    Changed |= visitBlock(*SB, T, NewAVs);
+  }
+  if (!T.TopDown)
+    Changed |= T.processBlock(B, AVs);
+
+  return Changed;
+}
+
+//
+// Utility functions:
+//
+void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI,
+      RegisterSet &Defs) {
+  for (auto &Op : MI.operands()) {
+    if (!Op.isReg() || !Op.isDef())
+      continue;
+    unsigned R = Op.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    Defs.insert(R);
+  }
+}
+
+void HexagonBitSimplify::getInstrUses(const MachineInstr &MI,
+      RegisterSet &Uses) {
+  for (auto &Op : MI.operands()) {
+    if (!Op.isReg() || !Op.isUse())
+      continue;
+    unsigned R = Op.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    Uses.insert(R);
+  }
+}
+
+// Check if all the bits in range [B, E) in both cells are equal.
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1,
+      uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2,
+      uint16_t W) {
+  for (uint16_t i = 0; i < W; ++i) {
+    // If RC1[i] is "bottom", it cannot be proven equal to RC2[i].
+    if (RC1[B1+i].Type == BitTracker::BitValue::Ref && RC1[B1+i].RefI.Reg == 0)
+      return false;
+    // Same for RC2[i].
+    if (RC2[B2+i].Type == BitTracker::BitValue::Ref && RC2[B2+i].RefI.Reg == 0)
+      return false;
+    if (RC1[B1+i] != RC2[B2+i])
+      return false;
+  }
+  return true;
+}
+
+bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC,
+      uint16_t B, uint16_t W) {
+  assert(B < RC.width() && B+W <= RC.width());
+  for (uint16_t i = B; i < B+W; ++i)
+    if (!RC[i].is(0))
+      return false;
+  return true;
+}
+
+bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC,
+        uint16_t B, uint16_t W, uint64_t &U) {
+  assert(B < RC.width() && B+W <= RC.width());
+  int64_t T = 0;
+  for (uint16_t i = B+W; i > B; --i) {
+    const BitTracker::BitValue &BV = RC[i-1];
+    T <<= 1;
+    if (BV.is(1))
+      T |= 1;
+    else if (!BV.is(0))
+      return false;
+  }
+  U = T;
+  return true;
+}
+
+bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR,
+      MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+      !TargetRegisterInfo::isVirtualRegister(NewR))
+    return false;
+  auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+  decltype(End) NextI;
+  for (auto I = Begin; I != End; I = NextI) {
+    NextI = std::next(I);
+    I->setReg(NewR);
+  }
+  return Begin != End;
+}
+
+bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR,
+      unsigned NewSR, MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+      !TargetRegisterInfo::isVirtualRegister(NewR))
+    return false;
+  if (hasTiedUse(OldR, MRI, NewSR))
+    return false;
+  auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+  decltype(End) NextI;
+  for (auto I = Begin; I != End; I = NextI) {
+    NextI = std::next(I);
+    I->setReg(NewR);
+    I->setSubReg(NewSR);
+  }
+  return Begin != End;
+}
+
+bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR,
+      unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(OldR) ||
+      !TargetRegisterInfo::isVirtualRegister(NewR))
+    return false;
+  if (OldSR != NewSR && hasTiedUse(OldR, MRI, NewSR))
+    return false;
+  auto Begin = MRI.use_begin(OldR), End = MRI.use_end();
+  decltype(End) NextI;
+  for (auto I = Begin; I != End; I = NextI) {
+    NextI = std::next(I);
+    if (I->getSubReg() != OldSR)
+      continue;
+    I->setReg(NewR);
+    I->setSubReg(NewSR);
+  }
+  return Begin != End;
+}
+
+// For a register ref (pair Reg:Sub), set Begin to the position of the LSB
+// of Sub in Reg, and set Width to the size of Sub in bits. Return true,
+// if this succeeded, otherwise return false.
+bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR,
+      unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) {
+  const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg);
+  if (RR.Sub == 0) {
+    Begin = 0;
+    Width = RC->getSize()*8;
+    return true;
+  }
+
+  Begin = 0;
+
+  switch (RC->getID()) {
+    case Hexagon::DoubleRegsRegClassID:
+    case Hexagon::VecDblRegsRegClassID:
+    case Hexagon::VecDblRegs128BRegClassID:
+      Width = RC->getSize()*8 / 2;
+      if (RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi)
+        Begin = Width;
+      break;
+    default:
+      return false;
+  }
+  return true;
+}
+
+
+// For a REG_SEQUENCE, set SL to the low subregister and SH to the high
+// subregister.
+bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I,
+      BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH,
+      const MachineRegisterInfo &MRI) {
+  assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE);
+  unsigned Sub1 = I.getOperand(2).getImm(), Sub2 = I.getOperand(4).getImm();
+  auto *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+  auto &HRI = static_cast<const HexagonRegisterInfo&>(
+                  *MRI.getTargetRegisterInfo());
+  unsigned SubLo = HRI.getHexagonSubRegIndex(DstRC, Hexagon::ps_sub_lo);
+  unsigned SubHi = HRI.getHexagonSubRegIndex(DstRC, Hexagon::ps_sub_hi);
+  assert((Sub1 == SubLo && Sub2 == SubHi) || (Sub1 == SubHi && Sub2 == SubLo));
+  if (Sub1 == SubLo && Sub2 == SubHi) {
+    SL = I.getOperand(1);
+    SH = I.getOperand(3);
+    return true;
+  }
+  if (Sub1 == SubHi && Sub2 == SubLo) {
+    SH = I.getOperand(1);
+    SL = I.getOperand(3);
+    return true;
+  }
+  return false;
+}
+
+// All stores (except 64-bit stores) take a 32-bit register as the source
+// of the value to be stored. If the instruction stores into a location
+// that is shorter than 32 bits, some bits of the source register are not
+// used. For each store instruction, calculate the set of used bits in
+// the source register, and set appropriate bits in Bits. Return true if
+// the bits are calculated, false otherwise.
+bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits,
+      uint16_t Begin) {
+  using namespace Hexagon;
+
+  switch (Opc) {
+    // Store byte
+    case S2_storerb_io:           // memb(Rs32+#s11:0)=Rt32
+    case S2_storerbnew_io:        // memb(Rs32+#s11:0)=Nt8.new
+    case S2_pstorerbt_io:         // if (Pv4) memb(Rs32+#u6:0)=Rt32
+    case S2_pstorerbf_io:         // if (!Pv4) memb(Rs32+#u6:0)=Rt32
+    case S4_pstorerbtnew_io:      // if (Pv4.new) memb(Rs32+#u6:0)=Rt32
+    case S4_pstorerbfnew_io:      // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32
+    case S2_pstorerbnewt_io:      // if (Pv4) memb(Rs32+#u6:0)=Nt8.new
+    case S2_pstorerbnewf_io:      // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new
+    case S4_pstorerbnewtnew_io:   // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+    case S4_pstorerbnewfnew_io:   // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new
+    case S2_storerb_pi:           // memb(Rx32++#s4:0)=Rt32
+    case S2_storerbnew_pi:        // memb(Rx32++#s4:0)=Nt8.new
+    case S2_pstorerbt_pi:         // if (Pv4) memb(Rx32++#s4:0)=Rt32
+    case S2_pstorerbf_pi:         // if (!Pv4) memb(Rx32++#s4:0)=Rt32
+    case S2_pstorerbtnew_pi:      // if (Pv4.new) memb(Rx32++#s4:0)=Rt32
+    case S2_pstorerbfnew_pi:      // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32
+    case S2_pstorerbnewt_pi:      // if (Pv4) memb(Rx32++#s4:0)=Nt8.new
+    case S2_pstorerbnewf_pi:      // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new
+    case S2_pstorerbnewtnew_pi:   // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+    case S2_pstorerbnewfnew_pi:   // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new
+    case S4_storerb_ap:           // memb(Re32=#U6)=Rt32
+    case S4_storerbnew_ap:        // memb(Re32=#U6)=Nt8.new
+    case S2_storerb_pr:           // memb(Rx32++Mu2)=Rt32
+    case S2_storerbnew_pr:        // memb(Rx32++Mu2)=Nt8.new
+    case S4_storerb_ur:           // memb(Ru32<<#u2+#U6)=Rt32
+    case S4_storerbnew_ur:        // memb(Ru32<<#u2+#U6)=Nt8.new
+    case S2_storerb_pbr:          // memb(Rx32++Mu2:brev)=Rt32
+    case S2_storerbnew_pbr:       // memb(Rx32++Mu2:brev)=Nt8.new
+    case S2_storerb_pci:          // memb(Rx32++#s4:0:circ(Mu2))=Rt32
+    case S2_storerbnew_pci:       // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new
+    case S2_storerb_pcr:          // memb(Rx32++I:circ(Mu2))=Rt32
+    case S2_storerbnew_pcr:       // memb(Rx32++I:circ(Mu2))=Nt8.new
+    case S4_storerb_rr:           // memb(Rs32+Ru32<<#u2)=Rt32
+    case S4_storerbnew_rr:        // memb(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerbt_rr:         // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerbf_rr:         // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerbtnew_rr:      // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerbfnew_rr:      // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerbnewt_rr:      // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerbnewf_rr:      // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerbnewtnew_rr:   // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerbnewfnew_rr:   // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new
+    case S2_storerbgp:            // memb(gp+#u16:0)=Rt32
+    case S2_storerbnewgp:         // memb(gp+#u16:0)=Nt8.new
+    case S4_pstorerbt_abs:        // if (Pv4) memb(#u6)=Rt32
+    case S4_pstorerbf_abs:        // if (!Pv4) memb(#u6)=Rt32
+    case S4_pstorerbtnew_abs:     // if (Pv4.new) memb(#u6)=Rt32
+    case S4_pstorerbfnew_abs:     // if (!Pv4.new) memb(#u6)=Rt32
+    case S4_pstorerbnewt_abs:     // if (Pv4) memb(#u6)=Nt8.new
+    case S4_pstorerbnewf_abs:     // if (!Pv4) memb(#u6)=Nt8.new
+    case S4_pstorerbnewtnew_abs:  // if (Pv4.new) memb(#u6)=Nt8.new
+    case S4_pstorerbnewfnew_abs:  // if (!Pv4.new) memb(#u6)=Nt8.new
+      Bits.set(Begin, Begin+8);
+      return true;
+
+    // Store low half
+    case S2_storerh_io:           // memh(Rs32+#s11:1)=Rt32
+    case S2_storerhnew_io:        // memh(Rs32+#s11:1)=Nt8.new
+    case S2_pstorerht_io:         // if (Pv4) memh(Rs32+#u6:1)=Rt32
+    case S2_pstorerhf_io:         // if (!Pv4) memh(Rs32+#u6:1)=Rt32
+    case S4_pstorerhtnew_io:      // if (Pv4.new) memh(Rs32+#u6:1)=Rt32
+    case S4_pstorerhfnew_io:      // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32
+    case S2_pstorerhnewt_io:      // if (Pv4) memh(Rs32+#u6:1)=Nt8.new
+    case S2_pstorerhnewf_io:      // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new
+    case S4_pstorerhnewtnew_io:   // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+    case S4_pstorerhnewfnew_io:   // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new
+    case S2_storerh_pi:           // memh(Rx32++#s4:1)=Rt32
+    case S2_storerhnew_pi:        // memh(Rx32++#s4:1)=Nt8.new
+    case S2_pstorerht_pi:         // if (Pv4) memh(Rx32++#s4:1)=Rt32
+    case S2_pstorerhf_pi:         // if (!Pv4) memh(Rx32++#s4:1)=Rt32
+    case S2_pstorerhtnew_pi:      // if (Pv4.new) memh(Rx32++#s4:1)=Rt32
+    case S2_pstorerhfnew_pi:      // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32
+    case S2_pstorerhnewt_pi:      // if (Pv4) memh(Rx32++#s4:1)=Nt8.new
+    case S2_pstorerhnewf_pi:      // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new
+    case S2_pstorerhnewtnew_pi:   // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+    case S2_pstorerhnewfnew_pi:   // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new
+    case S4_storerh_ap:           // memh(Re32=#U6)=Rt32
+    case S4_storerhnew_ap:        // memh(Re32=#U6)=Nt8.new
+    case S2_storerh_pr:           // memh(Rx32++Mu2)=Rt32
+    case S2_storerhnew_pr:        // memh(Rx32++Mu2)=Nt8.new
+    case S4_storerh_ur:           // memh(Ru32<<#u2+#U6)=Rt32
+    case S4_storerhnew_ur:        // memh(Ru32<<#u2+#U6)=Nt8.new
+    case S2_storerh_pbr:          // memh(Rx32++Mu2:brev)=Rt32
+    case S2_storerhnew_pbr:       // memh(Rx32++Mu2:brev)=Nt8.new
+    case S2_storerh_pci:          // memh(Rx32++#s4:1:circ(Mu2))=Rt32
+    case S2_storerhnew_pci:       // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new
+    case S2_storerh_pcr:          // memh(Rx32++I:circ(Mu2))=Rt32
+    case S2_storerhnew_pcr:       // memh(Rx32++I:circ(Mu2))=Nt8.new
+    case S4_storerh_rr:           // memh(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerht_rr:         // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerhf_rr:         // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerhtnew_rr:      // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+    case S4_pstorerhfnew_rr:      // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32
+    case S4_storerhnew_rr:        // memh(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerhnewt_rr:      // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerhnewf_rr:      // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerhnewtnew_rr:   // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+    case S4_pstorerhnewfnew_rr:   // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new
+    case S2_storerhgp:            // memh(gp+#u16:1)=Rt32
+    case S2_storerhnewgp:         // memh(gp+#u16:1)=Nt8.new
+    case S4_pstorerht_abs:        // if (Pv4) memh(#u6)=Rt32
+    case S4_pstorerhf_abs:        // if (!Pv4) memh(#u6)=Rt32
+    case S4_pstorerhtnew_abs:     // if (Pv4.new) memh(#u6)=Rt32
+    case S4_pstorerhfnew_abs:     // if (!Pv4.new) memh(#u6)=Rt32
+    case S4_pstorerhnewt_abs:     // if (Pv4) memh(#u6)=Nt8.new
+    case S4_pstorerhnewf_abs:     // if (!Pv4) memh(#u6)=Nt8.new
+    case S4_pstorerhnewtnew_abs:  // if (Pv4.new) memh(#u6)=Nt8.new
+    case S4_pstorerhnewfnew_abs:  // if (!Pv4.new) memh(#u6)=Nt8.new
+      Bits.set(Begin, Begin+16);
+      return true;
+
+    // Store high half
+    case S2_storerf_io:           // memh(Rs32+#s11:1)=Rt.H32
+    case S2_pstorerft_io:         // if (Pv4) memh(Rs32+#u6:1)=Rt.H32
+    case S2_pstorerff_io:         // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32
+    case S4_pstorerftnew_io:      // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+    case S4_pstorerffnew_io:      // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32
+    case S2_storerf_pi:           // memh(Rx32++#s4:1)=Rt.H32
+    case S2_pstorerft_pi:         // if (Pv4) memh(Rx32++#s4:1)=Rt.H32
+    case S2_pstorerff_pi:         // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32
+    case S2_pstorerftnew_pi:      // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+    case S2_pstorerffnew_pi:      // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32
+    case S4_storerf_ap:           // memh(Re32=#U6)=Rt.H32
+    case S2_storerf_pr:           // memh(Rx32++Mu2)=Rt.H32
+    case S4_storerf_ur:           // memh(Ru32<<#u2+#U6)=Rt.H32
+    case S2_storerf_pbr:          // memh(Rx32++Mu2:brev)=Rt.H32
+    case S2_storerf_pci:          // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32
+    case S2_storerf_pcr:          // memh(Rx32++I:circ(Mu2))=Rt.H32
+    case S4_storerf_rr:           // memh(Rs32+Ru32<<#u2)=Rt.H32
+    case S4_pstorerft_rr:         // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+    case S4_pstorerff_rr:         // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32
+    case S4_pstorerftnew_rr:      // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+    case S4_pstorerffnew_rr:      // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32
+    case S2_storerfgp:            // memh(gp+#u16:1)=Rt.H32
+    case S4_pstorerft_abs:        // if (Pv4) memh(#u6)=Rt.H32
+    case S4_pstorerff_abs:        // if (!Pv4) memh(#u6)=Rt.H32
+    case S4_pstorerftnew_abs:     // if (Pv4.new) memh(#u6)=Rt.H32
+    case S4_pstorerffnew_abs:     // if (!Pv4.new) memh(#u6)=Rt.H32
+      Bits.set(Begin+16, Begin+32);
+      return true;
+  }
+
+  return false;
+}
+
+// For an instruction with opcode Opc, calculate the set of bits that it
+// uses in a register in operand OpN. This only calculates the set of used
+// bits for cases where it does not depend on any operands (as is the case
+// in shifts, for example). For concrete instructions from a program, the
+// operand may be a subregister of a larger register, while Bits would
+// correspond to the larger register in its entirety. Because of that,
+// the parameter Begin can be used to indicate which bit of Bits should be
+// considered the LSB of of the operand.
+bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN,
+      BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) {
+  using namespace Hexagon;
+
+  const MCInstrDesc &D = HII.get(Opc);
+  if (D.mayStore()) {
+    if (OpN == D.getNumOperands()-1)
+      return getUsedBitsInStore(Opc, Bits, Begin);
+    return false;
+  }
+
+  switch (Opc) {
+    // One register source. Used bits: R1[0-7].
+    case A2_sxtb:
+    case A2_zxtb:
+    case A4_cmpbeqi:
+    case A4_cmpbgti:
+    case A4_cmpbgtui:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+8);
+        return true;
+      }
+      break;
+
+    // One register source. Used bits: R1[0-15].
+    case A2_aslh:
+    case A2_sxth:
+    case A2_zxth:
+    case A4_cmpheqi:
+    case A4_cmphgti:
+    case A4_cmphgtui:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // One register source. Used bits: R1[16-31].
+    case A2_asrh:
+      if (OpN == 1) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-7], R2[0-7].
+    case A4_cmpbeq:
+    case A4_cmpbgt:
+    case A4_cmpbgtu:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+8);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-15], R2[0-15].
+    case A4_cmpheq:
+    case A4_cmphgt:
+    case A4_cmphgtu:
+    case A2_addh_h16_ll:
+    case A2_addh_h16_sat_ll:
+    case A2_addh_l16_ll:
+    case A2_addh_l16_sat_ll:
+    case A2_combine_ll:
+    case A2_subh_h16_ll:
+    case A2_subh_h16_sat_ll:
+    case A2_subh_l16_ll:
+    case A2_subh_l16_sat_ll:
+    case M2_mpy_acc_ll_s0:
+    case M2_mpy_acc_ll_s1:
+    case M2_mpy_acc_sat_ll_s0:
+    case M2_mpy_acc_sat_ll_s1:
+    case M2_mpy_ll_s0:
+    case M2_mpy_ll_s1:
+    case M2_mpy_nac_ll_s0:
+    case M2_mpy_nac_ll_s1:
+    case M2_mpy_nac_sat_ll_s0:
+    case M2_mpy_nac_sat_ll_s1:
+    case M2_mpy_rnd_ll_s0:
+    case M2_mpy_rnd_ll_s1:
+    case M2_mpy_sat_ll_s0:
+    case M2_mpy_sat_ll_s1:
+    case M2_mpy_sat_rnd_ll_s0:
+    case M2_mpy_sat_rnd_ll_s1:
+    case M2_mpyd_acc_ll_s0:
+    case M2_mpyd_acc_ll_s1:
+    case M2_mpyd_ll_s0:
+    case M2_mpyd_ll_s1:
+    case M2_mpyd_nac_ll_s0:
+    case M2_mpyd_nac_ll_s1:
+    case M2_mpyd_rnd_ll_s0:
+    case M2_mpyd_rnd_ll_s1:
+    case M2_mpyu_acc_ll_s0:
+    case M2_mpyu_acc_ll_s1:
+    case M2_mpyu_ll_s0:
+    case M2_mpyu_ll_s1:
+    case M2_mpyu_nac_ll_s0:
+    case M2_mpyu_nac_ll_s1:
+    case M2_mpyud_acc_ll_s0:
+    case M2_mpyud_acc_ll_s1:
+    case M2_mpyud_ll_s0:
+    case M2_mpyud_ll_s1:
+    case M2_mpyud_nac_ll_s0:
+    case M2_mpyud_nac_ll_s1:
+      if (OpN == 1 || OpN == 2) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // Two register sources. Used bits: R1[0-15], R2[16-31].
+    case A2_addh_h16_lh:
+    case A2_addh_h16_sat_lh:
+    case A2_combine_lh:
+    case A2_subh_h16_lh:
+    case A2_subh_h16_sat_lh:
+    case M2_mpy_acc_lh_s0:
+    case M2_mpy_acc_lh_s1:
+    case M2_mpy_acc_sat_lh_s0:
+    case M2_mpy_acc_sat_lh_s1:
+    case M2_mpy_lh_s0:
+    case M2_mpy_lh_s1:
+    case M2_mpy_nac_lh_s0:
+    case M2_mpy_nac_lh_s1:
+    case M2_mpy_nac_sat_lh_s0:
+    case M2_mpy_nac_sat_lh_s1:
+    case M2_mpy_rnd_lh_s0:
+    case M2_mpy_rnd_lh_s1:
+    case M2_mpy_sat_lh_s0:
+    case M2_mpy_sat_lh_s1:
+    case M2_mpy_sat_rnd_lh_s0:
+    case M2_mpy_sat_rnd_lh_s1:
+    case M2_mpyd_acc_lh_s0:
+    case M2_mpyd_acc_lh_s1:
+    case M2_mpyd_lh_s0:
+    case M2_mpyd_lh_s1:
+    case M2_mpyd_nac_lh_s0:
+    case M2_mpyd_nac_lh_s1:
+    case M2_mpyd_rnd_lh_s0:
+    case M2_mpyd_rnd_lh_s1:
+    case M2_mpyu_acc_lh_s0:
+    case M2_mpyu_acc_lh_s1:
+    case M2_mpyu_lh_s0:
+    case M2_mpyu_lh_s1:
+    case M2_mpyu_nac_lh_s0:
+    case M2_mpyu_nac_lh_s1:
+    case M2_mpyud_acc_lh_s0:
+    case M2_mpyud_acc_lh_s1:
+    case M2_mpyud_lh_s0:
+    case M2_mpyud_lh_s1:
+    case M2_mpyud_nac_lh_s0:
+    case M2_mpyud_nac_lh_s1:
+    // These four are actually LH.
+    case A2_addh_l16_hl:
+    case A2_addh_l16_sat_hl:
+    case A2_subh_l16_hl:
+    case A2_subh_l16_sat_hl:
+      if (OpN == 1) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      if (OpN == 2) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+
+    // Two register sources, used bits: R1[16-31], R2[0-15].
+    case A2_addh_h16_hl:
+    case A2_addh_h16_sat_hl:
+    case A2_combine_hl:
+    case A2_subh_h16_hl:
+    case A2_subh_h16_sat_hl:
+    case M2_mpy_acc_hl_s0:
+    case M2_mpy_acc_hl_s1:
+    case M2_mpy_acc_sat_hl_s0:
+    case M2_mpy_acc_sat_hl_s1:
+    case M2_mpy_hl_s0:
+    case M2_mpy_hl_s1:
+    case M2_mpy_nac_hl_s0:
+    case M2_mpy_nac_hl_s1:
+    case M2_mpy_nac_sat_hl_s0:
+    case M2_mpy_nac_sat_hl_s1:
+    case M2_mpy_rnd_hl_s0:
+    case M2_mpy_rnd_hl_s1:
+    case M2_mpy_sat_hl_s0:
+    case M2_mpy_sat_hl_s1:
+    case M2_mpy_sat_rnd_hl_s0:
+    case M2_mpy_sat_rnd_hl_s1:
+    case M2_mpyd_acc_hl_s0:
+    case M2_mpyd_acc_hl_s1:
+    case M2_mpyd_hl_s0:
+    case M2_mpyd_hl_s1:
+    case M2_mpyd_nac_hl_s0:
+    case M2_mpyd_nac_hl_s1:
+    case M2_mpyd_rnd_hl_s0:
+    case M2_mpyd_rnd_hl_s1:
+    case M2_mpyu_acc_hl_s0:
+    case M2_mpyu_acc_hl_s1:
+    case M2_mpyu_hl_s0:
+    case M2_mpyu_hl_s1:
+    case M2_mpyu_nac_hl_s0:
+    case M2_mpyu_nac_hl_s1:
+    case M2_mpyud_acc_hl_s0:
+    case M2_mpyud_acc_hl_s1:
+    case M2_mpyud_hl_s0:
+    case M2_mpyud_hl_s1:
+    case M2_mpyud_nac_hl_s0:
+    case M2_mpyud_nac_hl_s1:
+      if (OpN == 1) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      if (OpN == 2) {
+        Bits.set(Begin, Begin+16);
+        return true;
+      }
+      break;
+
+    // Two register sources, used bits: R1[16-31], R2[16-31].
+    case A2_addh_h16_hh:
+    case A2_addh_h16_sat_hh:
+    case A2_combine_hh:
+    case A2_subh_h16_hh:
+    case A2_subh_h16_sat_hh:
+    case M2_mpy_acc_hh_s0:
+    case M2_mpy_acc_hh_s1:
+    case M2_mpy_acc_sat_hh_s0:
+    case M2_mpy_acc_sat_hh_s1:
+    case M2_mpy_hh_s0:
+    case M2_mpy_hh_s1:
+    case M2_mpy_nac_hh_s0:
+    case M2_mpy_nac_hh_s1:
+    case M2_mpy_nac_sat_hh_s0:
+    case M2_mpy_nac_sat_hh_s1:
+    case M2_mpy_rnd_hh_s0:
+    case M2_mpy_rnd_hh_s1:
+    case M2_mpy_sat_hh_s0:
+    case M2_mpy_sat_hh_s1:
+    case M2_mpy_sat_rnd_hh_s0:
+    case M2_mpy_sat_rnd_hh_s1:
+    case M2_mpyd_acc_hh_s0:
+    case M2_mpyd_acc_hh_s1:
+    case M2_mpyd_hh_s0:
+    case M2_mpyd_hh_s1:
+    case M2_mpyd_nac_hh_s0:
+    case M2_mpyd_nac_hh_s1:
+    case M2_mpyd_rnd_hh_s0:
+    case M2_mpyd_rnd_hh_s1:
+    case M2_mpyu_acc_hh_s0:
+    case M2_mpyu_acc_hh_s1:
+    case M2_mpyu_hh_s0:
+    case M2_mpyu_hh_s1:
+    case M2_mpyu_nac_hh_s0:
+    case M2_mpyu_nac_hh_s1:
+    case M2_mpyud_acc_hh_s0:
+    case M2_mpyud_acc_hh_s1:
+    case M2_mpyud_hh_s0:
+    case M2_mpyud_hh_s1:
+    case M2_mpyud_nac_hh_s0:
+    case M2_mpyud_nac_hh_s1:
+      if (OpN == 1 || OpN == 2) {
+        Bits.set(Begin+16, Begin+32);
+        return true;
+      }
+      break;
+  }
+
+  return false;
+}
+
+// Calculate the register class that matches Reg:Sub. For example, if
+// vreg1 is a double register, then vreg1:isub_hi would match the "int"
+// register class.
+const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
+      const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return nullptr;
+  auto *RC = MRI.getRegClass(RR.Reg);
+  if (RR.Sub == 0)
+    return RC;
+  auto &HRI = static_cast<const HexagonRegisterInfo&>(
+                  *MRI.getTargetRegisterInfo());
+
+  auto VerifySR = [&HRI] (const TargetRegisterClass *RC, unsigned Sub) -> void {
+    assert(Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo) ||
+           Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi));
+  };
+
+  switch (RC->getID()) {
+    case Hexagon::DoubleRegsRegClassID:
+      VerifySR(RC, RR.Sub);
+      return &Hexagon::IntRegsRegClass;
+    case Hexagon::VecDblRegsRegClassID:
+      VerifySR(RC, RR.Sub);
+      return &Hexagon::VectorRegsRegClass;
+    case Hexagon::VecDblRegs128BRegClassID:
+      VerifySR(RC, RR.Sub);
+      return &Hexagon::VectorRegs128BRegClass;
+  }
+  return nullptr;
+}
+
+// Check if RD could be replaced with RS at any possible use of RD.
+// For example a predicate register cannot be replaced with a integer
+// register, but a 64-bit register with a subregister can be replaced
+// with a 32-bit register.
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD,
+      const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) ||
+      !TargetRegisterInfo::isVirtualRegister(RS.Reg))
+    return false;
+  // Return false if one (or both) classes are nullptr.
+  auto *DRC = getFinalVRegClass(RD, MRI);
+  if (!DRC)
+    return false;
+
+  return DRC == getFinalVRegClass(RS, MRI);
+}
+
+bool HexagonBitSimplify::hasTiedUse(unsigned Reg, MachineRegisterInfo &MRI,
+      unsigned NewSub) {
+  if (!PreserveTiedOps)
+    return false;
+  return llvm::any_of(MRI.use_operands(Reg),
+                      [NewSub] (const MachineOperand &Op) -> bool {
+                        return Op.getSubReg() != NewSub && Op.isTied();
+                      });
+}
+
+namespace {
+
+  class DeadCodeElimination {
+  public:
+    DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt)
+      : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()),
+        MDT(mdt), MRI(mf.getRegInfo()) {}
+
+    bool run() {
+      return runOnNode(MDT.getRootNode());
+    }
+
+  private:
+    bool isDead(unsigned R) const;
+    bool runOnNode(MachineDomTreeNode *N);
+
+    MachineFunction &MF;
+    const HexagonInstrInfo &HII;
+    MachineDominatorTree &MDT;
+    MachineRegisterInfo &MRI;
+  };
+
+} // end anonymous namespace
+
+bool DeadCodeElimination::isDead(unsigned R) const {
+  for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) {
+    MachineInstr *UseI = I->getParent();
+    if (UseI->isDebugValue())
+      continue;
+    if (UseI->isPHI()) {
+      assert(!UseI->getOperand(0).getSubReg());
+      unsigned DR = UseI->getOperand(0).getReg();
+      if (DR == R)
+        continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
+  bool Changed = false;
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+    Changed |= runOnNode(*I);
+
+  MachineBasicBlock *B = N->getBlock();
+  std::vector<MachineInstr*> Instrs;
+  for (auto I = B->rbegin(), E = B->rend(); I != E; ++I)
+    Instrs.push_back(&*I);
+
+  for (auto MI : Instrs) {
+    unsigned Opc = MI->getOpcode();
+    // Do not touch lifetime markers. This is why the target-independent DCE
+    // cannot be used.
+    if (Opc == TargetOpcode::LIFETIME_START ||
+        Opc == TargetOpcode::LIFETIME_END)
+      continue;
+    bool Store = false;
+    if (MI->isInlineAsm())
+      continue;
+    // Delete PHIs if possible.
+    if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store))
+      continue;
+
+    bool AllDead = true;
+    SmallVector<unsigned,2> Regs;
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+      unsigned R = Op.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) {
+        AllDead = false;
+        break;
+      }
+      Regs.push_back(R);
+    }
+    if (!AllDead)
+      continue;
+
+    B->erase(MI);
+    for (unsigned i = 0, n = Regs.size(); i != n; ++i)
+      MRI.markUsesInDebugValueAsUndef(Regs[i]);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+
+// Eliminate redundant instructions
+//
+// This transformation will identify instructions where the output register
+// is the same as one of its input registers. This only works on instructions
+// that define a single register (unlike post-increment loads, for example).
+// The equality check is actually more detailed: the code calculates which
+// bits of the output are used, and only compares these bits with the input
+// registers.
+// If the output matches an input, the instruction is replaced with COPY.
+// The copies will be removed by another transformation.
+  class RedundantInstrElimination : public Transformation {
+  public:
+    RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii,
+          MachineRegisterInfo &mri)
+        : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+  private:
+    bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN,
+          unsigned &LostB, unsigned &LostE);
+    bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN,
+          unsigned &LostB, unsigned &LostE);
+    bool computeUsedBits(unsigned Reg, BitVector &Bits);
+    bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits,
+          uint16_t Begin);
+    bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS);
+
+    const HexagonInstrInfo &HII;
+    MachineRegisterInfo &MRI;
+    BitTracker &BT;
+  };
+
+} // end anonymous namespace
+
+// Check if the instruction is a lossy shift left, where the input being
+// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range
+// of bit indices that are lost.
+bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI,
+      unsigned OpN, unsigned &LostB, unsigned &LostE) {
+  using namespace Hexagon;
+
+  unsigned Opc = MI.getOpcode();
+  unsigned ImN, RegN, Width;
+  switch (Opc) {
+    case S2_asl_i_p:
+      ImN = 2;
+      RegN = 1;
+      Width = 64;
+      break;
+    case S2_asl_i_p_acc:
+    case S2_asl_i_p_and:
+    case S2_asl_i_p_nac:
+    case S2_asl_i_p_or:
+    case S2_asl_i_p_xacc:
+      ImN = 3;
+      RegN = 2;
+      Width = 64;
+      break;
+    case S2_asl_i_r:
+      ImN = 2;
+      RegN = 1;
+      Width = 32;
+      break;
+    case S2_addasl_rrri:
+    case S4_andi_asl_ri:
+    case S4_ori_asl_ri:
+    case S4_addi_asl_ri:
+    case S4_subi_asl_ri:
+    case S2_asl_i_r_acc:
+    case S2_asl_i_r_and:
+    case S2_asl_i_r_nac:
+    case S2_asl_i_r_or:
+    case S2_asl_i_r_sat:
+    case S2_asl_i_r_xacc:
+      ImN = 3;
+      RegN = 2;
+      Width = 32;
+      break;
+    default:
+      return false;
+  }
+
+  if (RegN != OpN)
+    return false;
+
+  assert(MI.getOperand(ImN).isImm());
+  unsigned S = MI.getOperand(ImN).getImm();
+  if (S == 0)
+    return false;
+  LostB = Width-S;
+  LostE = Width;
+  return true;
+}
+
+// Check if the instruction is a lossy shift right, where the input being
+// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range
+// of bit indices that are lost.
+bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI,
+      unsigned OpN, unsigned &LostB, unsigned &LostE) {
+  using namespace Hexagon;
+
+  unsigned Opc = MI.getOpcode();
+  unsigned ImN, RegN;
+  switch (Opc) {
+    case S2_asr_i_p:
+    case S2_lsr_i_p:
+      ImN = 2;
+      RegN = 1;
+      break;
+    case S2_asr_i_p_acc:
+    case S2_asr_i_p_and:
+    case S2_asr_i_p_nac:
+    case S2_asr_i_p_or:
+    case S2_lsr_i_p_acc:
+    case S2_lsr_i_p_and:
+    case S2_lsr_i_p_nac:
+    case S2_lsr_i_p_or:
+    case S2_lsr_i_p_xacc:
+      ImN = 3;
+      RegN = 2;
+      break;
+    case S2_asr_i_r:
+    case S2_lsr_i_r:
+      ImN = 2;
+      RegN = 1;
+      break;
+    case S4_andi_lsr_ri:
+    case S4_ori_lsr_ri:
+    case S4_addi_lsr_ri:
+    case S4_subi_lsr_ri:
+    case S2_asr_i_r_acc:
+    case S2_asr_i_r_and:
+    case S2_asr_i_r_nac:
+    case S2_asr_i_r_or:
+    case S2_lsr_i_r_acc:
+    case S2_lsr_i_r_and:
+    case S2_lsr_i_r_nac:
+    case S2_lsr_i_r_or:
+    case S2_lsr_i_r_xacc:
+      ImN = 3;
+      RegN = 2;
+      break;
+
+    default:
+      return false;
+  }
+
+  if (RegN != OpN)
+    return false;
+
+  assert(MI.getOperand(ImN).isImm());
+  unsigned S = MI.getOperand(ImN).getImm();
+  LostB = 0;
+  LostE = S;
+  return true;
+}
+
+// Calculate the bit vector that corresponds to the used bits of register Reg.
+// The vector Bits has the same size, as the size of Reg in bits. If the cal-
+// culation fails (i.e. the used bits are unknown), it returns false. Other-
+// wise, it returns true and sets the corresponding bits in Bits.
+bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) {
+  BitVector Used(Bits.size());
+  RegisterSet Visited;
+  std::vector<unsigned> Pending;
+  Pending.push_back(Reg);
+
+  for (unsigned i = 0; i < Pending.size(); ++i) {
+    unsigned R = Pending[i];
+    if (Visited.has(R))
+      continue;
+    Visited.insert(R);
+    for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) {
+      BitTracker::RegisterRef UR = *I;
+      unsigned B, W;
+      if (!HBS::getSubregMask(UR, B, W, MRI))
+        return false;
+      MachineInstr &UseI = *I->getParent();
+      if (UseI.isPHI() || UseI.isCopy()) {
+        unsigned DefR = UseI.getOperand(0).getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(DefR))
+          return false;
+        Pending.push_back(DefR);
+      } else {
+        if (!computeUsedBits(UseI, I.getOperandNo(), Used, B))
+          return false;
+      }
+    }
+  }
+  Bits |= Used;
+  return true;
+}
+
+// Calculate the bits used by instruction MI in a register in operand OpN.
+// Return true/false if the calculation succeeds/fails. If is succeeds, set
+// used bits in Bits. This function does not reset any bits in Bits, so
+// subsequent calls over different instructions will result in the union
+// of the used bits in all these instructions.
+// The register in question may be used with a sub-register, whereas Bits
+// holds the bits for the entire register. To keep track of that, the
+// argument Begin indicates where in Bits is the lowest-significant bit
+// of the register used in operand OpN. For example, in instruction:
+//   vreg1 = S2_lsr_i_r vreg2:isub_hi, 10
+// the operand 1 is a 32-bit register, which happens to be a subregister
+// of the 64-bit register vreg2, and that subregister starts at position 32.
+// In this case Begin=32, since Bits[32] would be the lowest-significant bit
+// of vreg2:isub_hi.
+bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI,
+      unsigned OpN, BitVector &Bits, uint16_t Begin) {
+  unsigned Opc = MI.getOpcode();
+  BitVector T(Bits.size());
+  bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII);
+  // Even if we don't have bits yet, we could still provide some information
+  // if the instruction is a lossy shift: the lost bits will be marked as
+  // not used.
+  unsigned LB, LE;
+  if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) {
+    assert(MI.getOperand(OpN).isReg());
+    BitTracker::RegisterRef RR = MI.getOperand(OpN);
+    const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI);
+    uint16_t Width = RC->getSize()*8;
+
+    if (!GotBits)
+      T.set(Begin, Begin+Width);
+    assert(LB <= LE && LB < Width && LE <= Width);
+    T.reset(Begin+LB, Begin+LE);
+    GotBits = true;
+  }
+  if (GotBits)
+    Bits |= T;
+  return GotBits;
+}
+
+// Calculates the used bits in RD ("defined register"), and checks if these
+// bits in RS ("used register") and RD are identical.
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD,
+      BitTracker::RegisterRef RS) {
+  const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg);
+  const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+
+  unsigned DB, DW;
+  if (!HBS::getSubregMask(RD, DB, DW, MRI))
+    return false;
+  unsigned SB, SW;
+  if (!HBS::getSubregMask(RS, SB, SW, MRI))
+    return false;
+  if (SW != DW)
+    return false;
+
+  BitVector Used(DC.width());
+  if (!computeUsedBits(RD.Reg, Used))
+    return false;
+
+  for (unsigned i = 0; i != DW; ++i)
+    if (Used[i+DB] && DC[DB+i] != SC[SB+i])
+      return false;
+  return true;
+}
+
+bool RedundantInstrElimination::processBlock(MachineBasicBlock &B,
+      const RegisterSet&) {
+  if (!BT.reached(&B))
+    return false;
+  bool Changed = false;
+
+  for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) {
+    NextI = std::next(I);
+    MachineInstr *MI = &*I;
+
+    if (MI->getOpcode() == TargetOpcode::COPY)
+      continue;
+    if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+      continue;
+    unsigned NumD = MI->getDesc().getNumDefs();
+    if (NumD != 1)
+      continue;
+
+    BitTracker::RegisterRef RD = MI->getOperand(0);
+    if (!BT.has(RD.Reg))
+      continue;
+    const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg);
+    auto At = MI->isPHI() ? B.getFirstNonPHI()
+                          : MachineBasicBlock::iterator(MI);
+
+    // Find a source operand that is equal to the result.
+    for (auto &Op : MI->uses()) {
+      if (!Op.isReg())
+        continue;
+      BitTracker::RegisterRef RS = Op;
+      if (!BT.has(RS.Reg))
+        continue;
+      if (!HBS::isTransparentCopy(RD, RS, MRI))
+        continue;
+
+      unsigned BN, BW;
+      if (!HBS::getSubregMask(RS, BN, BW, MRI))
+        continue;
+
+      const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+      if (!usedBitsEqual(RD, RS) && !HBS::isEqual(DC, 0, SC, BN, BW))
+        continue;
+
+      // If found, replace the instruction with a COPY.
+      const DebugLoc &DL = MI->getDebugLoc();
+      const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+      unsigned NewR = MRI.createVirtualRegister(FRC);
+      MachineInstr *CopyI =
+          BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+            .addReg(RS.Reg, 0, RS.Sub);
+      HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+      // This pass can create copies between registers that don't have the
+      // exact same values. Updating the tracker has to involve updating
+      // all dependent cells. Example:
+      //   vreg1 = inst vreg2     ; vreg1 != vreg2, but used bits are equal
+      //
+      //   vreg3 = copy vreg2     ; <- inserted
+      //     ... = vreg3          ; <- replaced from vreg2
+      // Indirectly, we can create a "copy" between vreg1 and vreg2 even
+      // though their exact values do not match.
+      BT.visit(*CopyI);
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+namespace {
+
+// Recognize instructions that produce constant values known at compile-time.
+// Replace them with register definitions that load these constants directly.
+  class ConstGeneration : public Transformation {
+  public:
+    ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+        MachineRegisterInfo &mri)
+      : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+    static bool isTfrConst(const MachineInstr &MI);
+
+  private:
+    unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C,
+        MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL);
+
+    const HexagonInstrInfo &HII;
+    MachineRegisterInfo &MRI;
+    BitTracker &BT;
+  };
+
+} // end anonymous namespace
+
+bool ConstGeneration::isTfrConst(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineii:
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::PS_true:
+    case Hexagon::PS_false:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64:
+      return true;
+  }
+  return false;
+}
+
+// Generate a transfer-immediate instruction that is appropriate for the
+// register class and the actual value being transferred.
+unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
+      MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) {
+  unsigned Reg = MRI.createVirtualRegister(RC);
+  if (RC == &Hexagon::IntRegsRegClass) {
+    BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg)
+        .addImm(int32_t(C));
+    return Reg;
+  }
+
+  if (RC == &Hexagon::DoubleRegsRegClass) {
+    if (isInt<8>(C)) {
+      BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg)
+          .addImm(C);
+      return Reg;
+    }
+
+    unsigned Lo = Lo_32(C), Hi = Hi_32(C);
+    if (isInt<8>(Lo) || isInt<8>(Hi)) {
+      unsigned Opc = isInt<8>(Lo) ? Hexagon::A2_combineii
+                                  : Hexagon::A4_combineii;
+      BuildMI(B, At, DL, HII.get(Opc), Reg)
+          .addImm(int32_t(Hi))
+          .addImm(int32_t(Lo));
+      return Reg;
+    }
+
+    BuildMI(B, At, DL, HII.get(Hexagon::CONST64), Reg)
+        .addImm(C);
+    return Reg;
+  }
+
+  if (RC == &Hexagon::PredRegsRegClass) {
+    unsigned Opc;
+    if (C == 0)
+      Opc = Hexagon::PS_false;
+    else if ((C & 0xFF) == 0xFF)
+      Opc = Hexagon::PS_true;
+    else
+      return 0;
+    BuildMI(B, At, DL, HII.get(Opc), Reg);
+    return Reg;
+  }
+
+  return 0;
+}
+
+bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+  if (!BT.reached(&B))
+    return false;
+  bool Changed = false;
+  RegisterSet Defs;
+
+  for (auto I = B.begin(), E = B.end(); I != E; ++I) {
+    if (isTfrConst(*I))
+      continue;
+    Defs.clear();
+    HBS::getInstrDefs(*I, Defs);
+    if (Defs.count() != 1)
+      continue;
+    unsigned DR = Defs.find_first();
+    if (!TargetRegisterInfo::isVirtualRegister(DR))
+      continue;
+    uint64_t U;
+    const BitTracker::RegisterCell &DRC = BT.lookup(DR);
+    if (HBS::getConst(DRC, 0, DRC.width(), U)) {
+      int64_t C = U;
+      DebugLoc DL = I->getDebugLoc();
+      auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+      unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL);
+      if (ImmReg) {
+        HBS::replaceReg(DR, ImmReg, MRI);
+        BT.put(ImmReg, DRC);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+namespace {
+
+// Identify pairs of available registers which hold identical values.
+// In such cases, only one of them needs to be calculated, the other one
+// will be defined as a copy of the first.
+  class CopyGeneration : public Transformation {
+  public:
+    CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii,
+        const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+      : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+  private:
+    bool findMatch(const BitTracker::RegisterRef &Inp,
+        BitTracker::RegisterRef &Out, const RegisterSet &AVs);
+
+    const HexagonInstrInfo &HII;
+    const HexagonRegisterInfo &HRI;
+    MachineRegisterInfo &MRI;
+    BitTracker &BT;
+    RegisterSet Forbidden;
+  };
+
+// Eliminate register copies RD = RS, by replacing the uses of RD with
+// with uses of RS.
+  class CopyPropagation : public Transformation {
+  public:
+    CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+        : Transformation(false), HRI(hri), MRI(mri) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+    static bool isCopyReg(unsigned Opc, bool NoConv);
+
+  private:
+    bool propagateRegCopy(MachineInstr &MI);
+
+    const HexagonRegisterInfo &HRI;
+    MachineRegisterInfo &MRI;
+  };
+
+} // end anonymous namespace
+
+/// Check if there is a register in AVs that is identical to Inp. If so,
+/// set Out to the found register. The output may be a pair Reg:Sub.
+bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp,
+      BitTracker::RegisterRef &Out, const RegisterSet &AVs) {
+  if (!BT.has(Inp.Reg))
+    return false;
+  const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg);
+  auto *FRC = HBS::getFinalVRegClass(Inp, MRI);
+  unsigned B, W;
+  if (!HBS::getSubregMask(Inp, B, W, MRI))
+    return false;
+
+  for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) {
+    if (!BT.has(R) || Forbidden[R])
+      continue;
+    const BitTracker::RegisterCell &RC = BT.lookup(R);
+    unsigned RW = RC.width();
+    if (W == RW) {
+      if (FRC != MRI.getRegClass(R))
+        continue;
+      if (!HBS::isTransparentCopy(R, Inp, MRI))
+        continue;
+      if (!HBS::isEqual(InpRC, B, RC, 0, W))
+        continue;
+      Out.Reg = R;
+      Out.Sub = 0;
+      return true;
+    }
+    // Check if there is a super-register, whose part (with a subregister)
+    // is equal to the input.
+    // Only do double registers for now.
+    if (W*2 != RW)
+      continue;
+    if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass)
+      continue;
+
+    if (HBS::isEqual(InpRC, B, RC, 0, W))
+      Out.Sub = Hexagon::isub_lo;
+    else if (HBS::isEqual(InpRC, B, RC, W, W))
+      Out.Sub = Hexagon::isub_hi;
+    else
+      continue;
+    Out.Reg = R;
+    if (HBS::isTransparentCopy(Out, Inp, MRI))
+      return true;
+  }
+  return false;
+}
+
+bool CopyGeneration::processBlock(MachineBasicBlock &B,
+      const RegisterSet &AVs) {
+  if (!BT.reached(&B))
+    return false;
+  RegisterSet AVB(AVs);
+  bool Changed = false;
+  RegisterSet Defs;
+
+  for (auto I = B.begin(), E = B.end(), NextI = I; I != E;
+       ++I, AVB.insert(Defs)) {
+    NextI = std::next(I);
+    Defs.clear();
+    HBS::getInstrDefs(*I, Defs);
+
+    unsigned Opc = I->getOpcode();
+    if (CopyPropagation::isCopyReg(Opc, false) ||
+        ConstGeneration::isTfrConst(*I))
+      continue;
+
+    DebugLoc DL = I->getDebugLoc();
+    auto At = I->isPHI() ? B.getFirstNonPHI() : I;
+
+    for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) {
+      BitTracker::RegisterRef MR;
+      auto *FRC = HBS::getFinalVRegClass(R, MRI);
+
+      if (findMatch(R, MR, AVB)) {
+        unsigned NewR = MRI.createVirtualRegister(FRC);
+        BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+          .addReg(MR.Reg, 0, MR.Sub);
+        BT.put(BitTracker::RegisterRef(NewR), BT.get(MR));
+        HBS::replaceReg(R, NewR, MRI);
+        Forbidden.insert(R);
+        continue;
+      }
+
+      if (FRC == &Hexagon::DoubleRegsRegClass ||
+          FRC == &Hexagon::VecDblRegsRegClass ||
+          FRC == &Hexagon::VecDblRegs128BRegClass) {
+        // Try to generate REG_SEQUENCE.
+        unsigned SubLo = HRI.getHexagonSubRegIndex(FRC, Hexagon::ps_sub_lo);
+        unsigned SubHi = HRI.getHexagonSubRegIndex(FRC, Hexagon::ps_sub_hi);
+        BitTracker::RegisterRef TL = { R, SubLo };
+        BitTracker::RegisterRef TH = { R, SubHi };
+        BitTracker::RegisterRef ML, MH;
+        if (findMatch(TL, ML, AVB) && findMatch(TH, MH, AVB)) {
+          auto *FRC = HBS::getFinalVRegClass(R, MRI);
+          unsigned NewR = MRI.createVirtualRegister(FRC);
+          BuildMI(B, At, DL, HII.get(TargetOpcode::REG_SEQUENCE), NewR)
+            .addReg(ML.Reg, 0, ML.Sub)
+            .addImm(SubLo)
+            .addReg(MH.Reg, 0, MH.Sub)
+            .addImm(SubHi);
+          BT.put(BitTracker::RegisterRef(NewR), BT.get(R));
+          HBS::replaceReg(R, NewR, MRI);
+          Forbidden.insert(R);
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool CopyPropagation::isCopyReg(unsigned Opc, bool NoConv) {
+  switch (Opc) {
+    case TargetOpcode::COPY:
+    case TargetOpcode::REG_SEQUENCE:
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineri:
+      return true;
+    case Hexagon::A2_tfr:
+    case Hexagon::A2_tfrp:
+    case Hexagon::A2_combinew:
+    case Hexagon::V6_vcombine:
+    case Hexagon::V6_vcombine_128B:
+      return NoConv;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool CopyPropagation::propagateRegCopy(MachineInstr &MI) {
+  bool Changed = false;
+  unsigned Opc = MI.getOpcode();
+  BitTracker::RegisterRef RD = MI.getOperand(0);
+  assert(MI.getOperand(0).getSubReg() == 0);
+
+  switch (Opc) {
+    case TargetOpcode::COPY:
+    case Hexagon::A2_tfr:
+    case Hexagon::A2_tfrp: {
+      BitTracker::RegisterRef RS = MI.getOperand(1);
+      if (!HBS::isTransparentCopy(RD, RS, MRI))
+        break;
+      if (RS.Sub != 0)
+        Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI);
+      else
+        Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI);
+      break;
+    }
+    case TargetOpcode::REG_SEQUENCE: {
+      BitTracker::RegisterRef SL, SH;
+      if (HBS::parseRegSequence(MI, SL, SH, MRI)) {
+        const TargetRegisterClass *RC = MRI.getRegClass(RD.Reg);
+        unsigned SubLo = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo);
+        unsigned SubHi = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi);
+        Changed  = HBS::replaceSubWithSub(RD.Reg, SubLo, SL.Reg, SL.Sub, MRI);
+        Changed |= HBS::replaceSubWithSub(RD.Reg, SubHi, SH.Reg, SH.Sub, MRI);
+      }
+      break;
+    }
+    case Hexagon::A2_combinew:
+    case Hexagon::V6_vcombine:
+    case Hexagon::V6_vcombine_128B: {
+      const TargetRegisterClass *RC = MRI.getRegClass(RD.Reg);
+      unsigned SubLo = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo);
+      unsigned SubHi = HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi);
+      BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2);
+      Changed  = HBS::replaceSubWithSub(RD.Reg, SubLo, RL.Reg, RL.Sub, MRI);
+      Changed |= HBS::replaceSubWithSub(RD.Reg, SubHi, RH.Reg, RH.Sub, MRI);
+      break;
+    }
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineri: {
+      unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1;
+      unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::isub_lo
+                                                    : Hexagon::isub_hi;
+      BitTracker::RegisterRef RS = MI.getOperand(SrcX);
+      Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI);
+      break;
+    }
+  }
+  return Changed;
+}
+
+bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) {
+  std::vector<MachineInstr*> Instrs;
+  for (auto I = B.rbegin(), E = B.rend(); I != E; ++I)
+    Instrs.push_back(&*I);
+
+  bool Changed = false;
+  for (auto I : Instrs) {
+    unsigned Opc = I->getOpcode();
+    if (!CopyPropagation::isCopyReg(Opc, true))
+      continue;
+    Changed |= propagateRegCopy(*I);
+  }
+
+  return Changed;
+}
+
+namespace {
+
+// Recognize patterns that can be simplified and replace them with the
+// simpler forms.
+// This is by no means complete
+  class BitSimplification : public Transformation {
+  public:
+    BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
+        const HexagonRegisterInfo &hri, MachineRegisterInfo &mri,
+        MachineFunction &mf)
+      : Transformation(true), HII(hii), HRI(hri), MRI(mri), MF(mf), BT(bt) {}
+
+    bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
+
+  private:
+    struct RegHalf : public BitTracker::RegisterRef {
+      bool Low;  // Low/High halfword.
+    };
+
+    bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC,
+          unsigned B, RegHalf &RH);
+    bool validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum);
+
+    bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC,
+          BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt);
+    unsigned getCombineOpcode(bool HLow, bool LLow);
+
+    bool genStoreUpperHalf(MachineInstr *MI);
+    bool genStoreImmediate(MachineInstr *MI);
+    bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC);
+    bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC);
+    bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC);
+    bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC);
+    bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC);
+
+    const HexagonInstrInfo &HII;
+    const HexagonRegisterInfo &HRI;
+    MachineRegisterInfo &MRI;
+    MachineFunction &MF;
+    BitTracker &BT;
+  };
+
+} // end anonymous namespace
+
+// Check if the bits [B..B+16) in register cell RC form a valid halfword,
+// i.e. [0..16), [16..32), etc. of some register. If so, return true and
+// set the information about the found register in RH.
+bool BitSimplification::matchHalf(unsigned SelfR,
+      const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) {
+  // XXX This could be searching in the set of available registers, in case
+  // the match is not exact.
+
+  // Match 16-bit chunks, where the RC[B..B+15] references exactly one
+  // register and all the bits B..B+15 match between RC and the register.
+  // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... },
+  // and RC = { [0]:0 [1-15]:v1[1-15]... }.
+  bool Low = false;
+  unsigned I = B;
+  while (I < B+16 && RC[I].num())
+    I++;
+  if (I == B+16)
+    return false;
+
+  unsigned Reg = RC[I].RefI.Reg;
+  unsigned P = RC[I].RefI.Pos;    // The RefI.Pos will be advanced by I-B.
+  if (P < I-B)
+    return false;
+  unsigned Pos = P - (I-B);
+
+  if (Reg == 0 || Reg == SelfR)    // Don't match "self".
+    return false;
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return false;
+  if (!BT.has(Reg))
+    return false;
+
+  const BitTracker::RegisterCell &SC = BT.lookup(Reg);
+  if (Pos+16 > SC.width())
+    return false;
+
+  for (unsigned i = 0; i < 16; ++i) {
+    const BitTracker::BitValue &RV = RC[i+B];
+    if (RV.Type == BitTracker::BitValue::Ref) {
+      if (RV.RefI.Reg != Reg)
+        return false;
+      if (RV.RefI.Pos != i+Pos)
+        return false;
+      continue;
+    }
+    if (RC[i+B] != SC[i+Pos])
+      return false;
+  }
+
+  unsigned Sub = 0;
+  switch (Pos) {
+    case 0:
+      Sub = Hexagon::isub_lo;
+      Low = true;
+      break;
+    case 16:
+      Sub = Hexagon::isub_lo;
+      Low = false;
+      break;
+    case 32:
+      Sub = Hexagon::isub_hi;
+      Low = true;
+      break;
+    case 48:
+      Sub = Hexagon::isub_hi;
+      Low = false;
+      break;
+    default:
+      return false;
+  }
+
+  RH.Reg = Reg;
+  RH.Sub = Sub;
+  RH.Low = Low;
+  // If the subregister is not valid with the register, set it to 0.
+  if (!HBS::getFinalVRegClass(RH, MRI))
+    RH.Sub = 0;
+
+  return true;
+}
+
+bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc,
+      unsigned OpNum) {
+  auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI, MF);
+  auto *RRC = HBS::getFinalVRegClass(R, MRI);
+  return OpRC->hasSubClassEq(RRC);
+}
+
+// Check if RC matches the pattern of a S2_packhl. If so, return true and
+// set the inputs Rs and Rt.
+bool BitSimplification::matchPackhl(unsigned SelfR,
+      const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs,
+      BitTracker::RegisterRef &Rt) {
+  RegHalf L1, H1, L2, H2;
+
+  if (!matchHalf(SelfR, RC, 0, L2)  || !matchHalf(SelfR, RC, 16, L1))
+    return false;
+  if (!matchHalf(SelfR, RC, 32, H2) || !matchHalf(SelfR, RC, 48, H1))
+    return false;
+
+  // Rs = H1.L1, Rt = H2.L2
+  if (H1.Reg != L1.Reg || H1.Sub != L1.Sub || H1.Low || !L1.Low)
+    return false;
+  if (H2.Reg != L2.Reg || H2.Sub != L2.Sub || H2.Low || !L2.Low)
+    return false;
+
+  Rs = H1;
+  Rt = H2;
+  return true;
+}
+
+unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) {
+  return HLow ? LLow ? Hexagon::A2_combine_ll
+                     : Hexagon::A2_combine_lh
+              : LLow ? Hexagon::A2_combine_hl
+                     : Hexagon::A2_combine_hh;
+}
+
+// If MI stores the upper halfword of a register (potentially obtained via
+// shifts or extracts), replace it with a storerf instruction. This could
+// cause the "extraction" code to become dead.
+bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc != Hexagon::S2_storerh_io)
+    return false;
+
+  MachineOperand &ValOp = MI->getOperand(2);
+  BitTracker::RegisterRef RS = ValOp;
+  if (!BT.has(RS.Reg))
+    return false;
+  const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
+  RegHalf H;
+  if (!matchHalf(0, RC, 0, H))
+    return false;
+  if (H.Low)
+    return false;
+  MI->setDesc(HII.get(Hexagon::S2_storerf_io));
+  ValOp.setReg(H.Reg);
+  ValOp.setSubReg(H.Sub);
+  return true;
+}
+
+// If MI stores a value known at compile-time, and the value is within a range
+// that avoids using constant-extenders, replace it with a store-immediate.
+bool BitSimplification::genStoreImmediate(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  unsigned Align = 0;
+  switch (Opc) {
+    case Hexagon::S2_storeri_io:
+      Align++;
+    case Hexagon::S2_storerh_io:
+      Align++;
+    case Hexagon::S2_storerb_io:
+      break;
+    default:
+      return false;
+  }
+
+  // Avoid stores to frame-indices (due to an unknown offset).
+  if (!MI->getOperand(0).isReg())
+    return false;
+  MachineOperand &OffOp = MI->getOperand(1);
+  if (!OffOp.isImm())
+    return false;
+
+  int64_t Off = OffOp.getImm();
+  // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x).
+  if (!isUIntN(6+Align, Off) || (Off & ((1<<Align)-1)))
+    return false;
+  // Source register:
+  BitTracker::RegisterRef RS = MI->getOperand(2);
+  if (!BT.has(RS.Reg))
+    return false;
+  const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg);
+  uint64_t U;
+  if (!HBS::getConst(RC, 0, RC.width(), U))
+    return false;
+
+  // Only consider 8-bit values to avoid constant-extenders.
+  int V;
+  switch (Opc) {
+    case Hexagon::S2_storerb_io:
+      V = int8_t(U);
+      break;
+    case Hexagon::S2_storerh_io:
+      V = int16_t(U);
+      break;
+    case Hexagon::S2_storeri_io:
+      V = int32_t(U);
+      break;
+  }
+  if (!isInt<8>(V))
+    return false;
+
+  MI->RemoveOperand(2);
+  switch (Opc) {
+    case Hexagon::S2_storerb_io:
+      MI->setDesc(HII.get(Hexagon::S4_storeirb_io));
+      break;
+    case Hexagon::S2_storerh_io:
+      MI->setDesc(HII.get(Hexagon::S4_storeirh_io));
+      break;
+    case Hexagon::S2_storeri_io:
+      MI->setDesc(HII.get(Hexagon::S4_storeiri_io));
+      break;
+  }
+  MI->addOperand(MachineOperand::CreateImm(V));
+  return true;
+}
+
+// If MI is equivalent o S2_packhl, generate the S2_packhl. MI could be the
+// last instruction in a sequence that results in something equivalent to
+// the pack-halfwords. The intent is to cause the entire sequence to become
+// dead.
+bool BitSimplification::genPackhl(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == Hexagon::S2_packhl)
+    return false;
+  BitTracker::RegisterRef Rs, Rt;
+  if (!matchPackhl(RD.Reg, RC, Rs, Rt))
+    return false;
+  if (!validateReg(Rs, Hexagon::S2_packhl, 1) ||
+      !validateReg(Rt, Hexagon::S2_packhl, 2))
+    return false;
+
+  MachineBasicBlock &B = *MI->getParent();
+  unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+  DebugLoc DL = MI->getDebugLoc();
+  auto At = MI->isPHI() ? B.getFirstNonPHI()
+                        : MachineBasicBlock::iterator(MI);
+  BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR)
+      .addReg(Rs.Reg, 0, Rs.Sub)
+      .addReg(Rt.Reg, 0, Rt.Sub);
+  HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+  BT.put(BitTracker::RegisterRef(NewR), RC);
+  return true;
+}
+
+// If MI produces halfword of the input in the low half of the output,
+// replace it with zero-extend or extractu.
+bool BitSimplification::genExtractHalf(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  RegHalf L;
+  // Check for halfword in low 16 bits, zeros elsewhere.
+  if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16))
+    return false;
+
+  unsigned Opc = MI->getOpcode();
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Prefer zxth, since zxth can go in any slot, while extractu only in
+  // slots 2 and 3.
+  unsigned NewR = 0;
+  auto At = MI->isPHI() ? B.getFirstNonPHI()
+                        : MachineBasicBlock::iterator(MI);
+  if (L.Low && Opc != Hexagon::A2_zxth) {
+    if (validateReg(L, Hexagon::A2_zxth, 1)) {
+      NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR)
+          .addReg(L.Reg, 0, L.Sub);
+    }
+  } else if (!L.Low && Opc != Hexagon::S2_lsr_i_r) {
+    if (validateReg(L, Hexagon::S2_lsr_i_r, 1)) {
+      NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(B, MI, DL, HII.get(Hexagon::S2_lsr_i_r), NewR)
+          .addReg(L.Reg, 0, L.Sub)
+          .addImm(16);
+    }
+  }
+  if (NewR == 0)
+    return false;
+  HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+  BT.put(BitTracker::RegisterRef(NewR), RC);
+  return true;
+}
+
+// If MI is equivalent to a combine(.L/.H, .L/.H) replace with with the
+// combine.
+bool BitSimplification::genCombineHalf(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  RegHalf L, H;
+  // Check for combine h/l
+  if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H))
+    return false;
+  // Do nothing if this is just a reg copy.
+  if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low)
+    return false;
+
+  unsigned Opc = MI->getOpcode();
+  unsigned COpc = getCombineOpcode(H.Low, L.Low);
+  if (COpc == Opc)
+    return false;
+  if (!validateReg(H, COpc, 1) || !validateReg(L, COpc, 2))
+    return false;
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  auto At = MI->isPHI() ? B.getFirstNonPHI()
+                        : MachineBasicBlock::iterator(MI);
+  BuildMI(B, At, DL, HII.get(COpc), NewR)
+      .addReg(H.Reg, 0, H.Sub)
+      .addReg(L.Reg, 0, L.Sub);
+  HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+  BT.put(BitTracker::RegisterRef(NewR), RC);
+  return true;
+}
+
+// If MI resets high bits of a register and keeps the lower ones, replace it
+// with zero-extend byte/half, and-immediate, or extractu, as appropriate.
+bool BitSimplification::genExtractLow(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case Hexagon::A2_zxtb:
+    case Hexagon::A2_zxth:
+    case Hexagon::S2_extractu:
+      return false;
+  }
+  if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) {
+    int32_t Imm = MI->getOperand(2).getImm();
+    if (isInt<10>(Imm))
+      return false;
+  }
+
+  if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm())
+    return false;
+  unsigned W = RC.width();
+  while (W > 0 && RC[W-1].is(0))
+    W--;
+  if (W == 0 || W == RC.width())
+    return false;
+  unsigned NewOpc = (W == 8)  ? Hexagon::A2_zxtb
+                  : (W == 16) ? Hexagon::A2_zxth
+                  : (W < 10)  ? Hexagon::A2_andir
+                  : Hexagon::S2_extractu;
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  for (auto &Op : MI->uses()) {
+    if (!Op.isReg())
+      continue;
+    BitTracker::RegisterRef RS = Op;
+    if (!BT.has(RS.Reg))
+      continue;
+    const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+    unsigned BN, BW;
+    if (!HBS::getSubregMask(RS, BN, BW, MRI))
+      continue;
+    if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W))
+      continue;
+    if (!validateReg(RS, NewOpc, 1))
+      continue;
+
+    unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+    auto At = MI->isPHI() ? B.getFirstNonPHI()
+                          : MachineBasicBlock::iterator(MI);
+    auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR)
+                  .addReg(RS.Reg, 0, RS.Sub);
+    if (NewOpc == Hexagon::A2_andir)
+      MIB.addImm((1 << W) - 1);
+    else if (NewOpc == Hexagon::S2_extractu)
+      MIB.addImm(W).addImm(0);
+    HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI);
+    BT.put(BitTracker::RegisterRef(NewR), RC);
+    return true;
+  }
+  return false;
+}
+
+// Check for tstbit simplification opportunity, where the bit being checked
+// can be tracked back to another register. For example:
+//   vreg2 = S2_lsr_i_r  vreg1, 5
+//   vreg3 = S2_tstbit_i vreg2, 0
+// =>
+//   vreg3 = S2_tstbit_i vreg1, 5
+bool BitSimplification::simplifyTstbit(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc != Hexagon::S2_tstbit_i)
+    return false;
+
+  unsigned BN = MI->getOperand(2).getImm();
+  BitTracker::RegisterRef RS = MI->getOperand(1);
+  unsigned F, W;
+  DebugLoc DL = MI->getDebugLoc();
+  if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI))
+    return false;
+  MachineBasicBlock &B = *MI->getParent();
+  auto At = MI->isPHI() ? B.getFirstNonPHI()
+                        : MachineBasicBlock::iterator(MI);
+
+  const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg);
+  const BitTracker::BitValue &V = SC[F+BN];
+  if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) {
+    const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg);
+    // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is
+    // a double register, need to use a subregister and adjust bit
+    // number.
+    unsigned P = std::numeric_limits<unsigned>::max();
+    BitTracker::RegisterRef RR(V.RefI.Reg, 0);
+    if (TC == &Hexagon::DoubleRegsRegClass) {
+      P = V.RefI.Pos;
+      RR.Sub = Hexagon::isub_lo;
+      if (P >= 32) {
+        P -= 32;
+        RR.Sub = Hexagon::isub_hi;
+      }
+    } else if (TC == &Hexagon::IntRegsRegClass) {
+      P = V.RefI.Pos;
+    }
+    if (P != std::numeric_limits<unsigned>::max()) {
+      unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+      BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR)
+          .addReg(RR.Reg, 0, RR.Sub)
+          .addImm(P);
+      HBS::replaceReg(RD.Reg, NewR, MRI);
+      BT.put(NewR, RC);
+      return true;
+    }
+  } else if (V.is(0) || V.is(1)) {
+    unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+    unsigned NewOpc = V.is(0) ? Hexagon::PS_false : Hexagon::PS_true;
+    BuildMI(B, At, DL, HII.get(NewOpc), NewR);
+    HBS::replaceReg(RD.Reg, NewR, MRI);
+    return true;
+  }
+
+  return false;
+}
+
+bool BitSimplification::processBlock(MachineBasicBlock &B,
+      const RegisterSet &AVs) {
+  if (!BT.reached(&B))
+    return false;
+  bool Changed = false;
+  RegisterSet AVB = AVs;
+  RegisterSet Defs;
+
+  for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) {
+    MachineInstr *MI = &*I;
+    Defs.clear();
+    HBS::getInstrDefs(*MI, Defs);
+
+    unsigned Opc = MI->getOpcode();
+    if (Opc == TargetOpcode::COPY || Opc == TargetOpcode::REG_SEQUENCE)
+      continue;
+
+    if (MI->mayStore()) {
+      bool T = genStoreUpperHalf(MI);
+      T = T || genStoreImmediate(MI);
+      Changed |= T;
+      continue;
+    }
+
+    if (Defs.count() != 1)
+      continue;
+    const MachineOperand &Op0 = MI->getOperand(0);
+    if (!Op0.isReg() || !Op0.isDef())
+      continue;
+    BitTracker::RegisterRef RD = Op0;
+    if (!BT.has(RD.Reg))
+      continue;
+    const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+    const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg);
+
+    if (FRC->getID() == Hexagon::DoubleRegsRegClassID) {
+      bool T = genPackhl(MI, RD, RC);
+      Changed |= T;
+      continue;
+    }
+
+    if (FRC->getID() == Hexagon::IntRegsRegClassID) {
+      bool T = genExtractHalf(MI, RD, RC);
+      T = T || genCombineHalf(MI, RD, RC);
+      T = T || genExtractLow(MI, RD, RC);
+      Changed |= T;
+      continue;
+    }
+
+    if (FRC->getID() == Hexagon::PredRegsRegClassID) {
+      bool T = simplifyTstbit(MI, RD, RC);
+      Changed |= T;
+      continue;
+    }
+  }
+  return Changed;
+}
+
+bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HRI = *HST.getRegisterInfo();
+  auto &HII = *HST.getInstrInfo();
+
+  MDT = &getAnalysis<MachineDominatorTree>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool Changed;
+
+  Changed = DeadCodeElimination(MF, *MDT).run();
+
+  const HexagonEvaluator HE(HRI, MRI, HII, MF);
+  BitTracker BT(HE, MF);
+  DEBUG(BT.trace(true));
+  BT.run();
+
+  MachineBasicBlock &Entry = MF.front();
+
+  RegisterSet AIG;  // Available registers for IG.
+  ConstGeneration ImmG(BT, HII, MRI);
+  Changed |= visitBlock(Entry, ImmG, AIG);
+
+  RegisterSet ARE;  // Available registers for RIE.
+  RedundantInstrElimination RIE(BT, HII, MRI);
+  bool Ried = visitBlock(Entry, RIE, ARE);
+  if (Ried) {
+    Changed = true;
+    BT.run();
+  }
+
+  RegisterSet ACG;  // Available registers for CG.
+  CopyGeneration CopyG(BT, HII, HRI, MRI);
+  Changed |= visitBlock(Entry, CopyG, ACG);
+
+  RegisterSet ACP;  // Available registers for CP.
+  CopyPropagation CopyP(HRI, MRI);
+  Changed |= visitBlock(Entry, CopyP, ACP);
+
+  Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
+
+  BT.run();
+  RegisterSet ABS;  // Available registers for BS.
+  BitSimplification BitS(BT, HII, HRI, MRI, MF);
+  Changed |= visitBlock(Entry, BitS, ABS);
+
+  Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
+
+  if (Changed) {
+    for (auto &B : MF)
+      for (auto &I : B)
+        I.clearKillInfo();
+    DeadCodeElimination(MF, *MDT).run();
+  }
+  return Changed;
+}
+
+// Recognize loops where the code at the end of the loop matches the code
+// before the entry of the loop, and the matching code is such that is can
+// be simplified. This pass relies on the bit simplification above and only
+// prepares code in a way that can be handled by the bit simplifcation.
+//
+// This is the motivating testcase (and explanation):
+//
+// {
+//   loop0(.LBB0_2, r1)      // %for.body.preheader
+//   r5:4 = memd(r0++#8)
+// }
+// {
+//   r3 = lsr(r4, #16)
+//   r7:6 = combine(r5, r5)
+// }
+// {
+//   r3 = insert(r5, #16, #16)
+//   r7:6 = vlsrw(r7:6, #16)
+// }
+// .LBB0_2:
+// {
+//   memh(r2+#4) = r5
+//   memh(r2+#6) = r6            # R6 is really R5.H
+// }
+// {
+//   r2 = add(r2, #8)
+//   memh(r2+#0) = r4
+//   memh(r2+#2) = r3            # R3 is really R4.H
+// }
+// {
+//   r5:4 = memd(r0++#8)
+// }
+// {                             # "Shuffling" code that sets up R3 and R6
+//   r3 = lsr(r4, #16)           # so that their halves can be stored in the
+//   r7:6 = combine(r5, r5)      # next iteration. This could be folded into
+// }                             # the stores if the code was at the beginning
+// {                             # of the loop iteration. Since the same code
+//   r3 = insert(r5, #16, #16)   # precedes the loop, it can actually be moved
+//   r7:6 = vlsrw(r7:6, #16)     # there.
+// }:endloop0
+//
+//
+// The outcome:
+//
+// {
+//   loop0(.LBB0_2, r1)
+//   r5:4 = memd(r0++#8)
+// }
+// .LBB0_2:
+// {
+//   memh(r2+#4) = r5
+//   memh(r2+#6) = r5.h
+// }
+// {
+//   r2 = add(r2, #8)
+//   memh(r2+#0) = r4
+//   memh(r2+#2) = r4.h
+// }
+// {
+//   r5:4 = memd(r0++#8)
+// }:endloop0
+
+namespace llvm {
+
+  FunctionPass *createHexagonLoopRescheduling();
+  void initializeHexagonLoopReschedulingPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonLoopRescheduling : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonLoopRescheduling() : MachineFunctionPass(ID),
+        HII(nullptr), HRI(nullptr), MRI(nullptr), BTP(nullptr) {
+      initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    const HexagonInstrInfo *HII;
+    const HexagonRegisterInfo *HRI;
+    MachineRegisterInfo *MRI;
+    BitTracker *BTP;
+
+    struct LoopCand {
+      LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb,
+            MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {}
+      MachineBasicBlock *LB, *PB, *EB;
+    };
+    typedef std::vector<MachineInstr*> InstrList;
+    struct InstrGroup {
+      BitTracker::RegisterRef Inp, Out;
+      InstrList Ins;
+    };
+    struct PhiInfo {
+      PhiInfo(MachineInstr &P, MachineBasicBlock &B);
+      unsigned DefR;
+      BitTracker::RegisterRef LR, PR; // Loop Register, Preheader Register
+      MachineBasicBlock *LB, *PB;     // Loop Block, Preheader Block
+    };
+
+    static unsigned getDefReg(const MachineInstr *MI);
+    bool isConst(unsigned Reg) const;
+    bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const;
+    bool isStoreInput(const MachineInstr *MI, unsigned DefR) const;
+    bool isShuffleOf(unsigned OutR, unsigned InpR) const;
+    bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2,
+        unsigned &InpR2) const;
+    void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB,
+        MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR);
+    bool processLoop(LoopCand &C);
+  };
+
+} // end anonymous namespace
+
+char HexagonLoopRescheduling::ID = 0;
+
+INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched",
+  "Hexagon Loop Rescheduling", false, false)
+
+HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P,
+      MachineBasicBlock &B) {
+  DefR = HexagonLoopRescheduling::getDefReg(&P);
+  LB = &B;
+  PB = nullptr;
+  for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) {
+    const MachineOperand &OpB = P.getOperand(i+1);
+    if (OpB.getMBB() == &B) {
+      LR = P.getOperand(i);
+      continue;
+    }
+    PB = OpB.getMBB();
+    PR = P.getOperand(i);
+  }
+}
+
+unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) {
+  RegisterSet Defs;
+  HBS::getInstrDefs(*MI, Defs);
+  if (Defs.count() != 1)
+    return 0;
+  return Defs.find_first();
+}
+
+bool HexagonLoopRescheduling::isConst(unsigned Reg) const {
+  if (!BTP->has(Reg))
+    return false;
+  const BitTracker::RegisterCell &RC = BTP->lookup(Reg);
+  for (unsigned i = 0, w = RC.width(); i < w; ++i) {
+    const BitTracker::BitValue &V = RC[i];
+    if (!V.is(0) && !V.is(1))
+      return false;
+  }
+  return true;
+}
+
+bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI,
+      unsigned DefR) const {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::COPY:
+    case Hexagon::S2_lsr_i_r:
+    case Hexagon::S2_asr_i_r:
+    case Hexagon::S2_asl_i_r:
+    case Hexagon::S2_lsr_i_p:
+    case Hexagon::S2_asr_i_p:
+    case Hexagon::S2_asl_i_p:
+    case Hexagon::S2_insert:
+    case Hexagon::A2_or:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_and:
+    case Hexagon::A2_andp:
+    case Hexagon::A2_combinew:
+    case Hexagon::A4_combineri:
+    case Hexagon::A4_combineir:
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineii:
+    case Hexagon::A2_combine_ll:
+    case Hexagon::A2_combine_lh:
+    case Hexagon::A2_combine_hl:
+    case Hexagon::A2_combine_hh:
+      return true;
+  }
+  return false;
+}
+
+bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI,
+      unsigned InpR) const {
+  for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+    if (!Op.isReg())
+      continue;
+    if (Op.getReg() == InpR)
+      return i == n-1;
+  }
+  return false;
+}
+
+bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const {
+  if (!BTP->has(OutR) || !BTP->has(InpR))
+    return false;
+  const BitTracker::RegisterCell &OutC = BTP->lookup(OutR);
+  for (unsigned i = 0, w = OutC.width(); i < w; ++i) {
+    const BitTracker::BitValue &V = OutC[i];
+    if (V.Type != BitTracker::BitValue::Ref)
+      continue;
+    if (V.RefI.Reg != InpR)
+      return false;
+  }
+  return true;
+}
+
+bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1,
+      unsigned OutR2, unsigned &InpR2) const {
+  if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2))
+    return false;
+  const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1);
+  const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2);
+  unsigned W = OutC1.width();
+  unsigned MatchR = 0;
+  if (W != OutC2.width())
+    return false;
+  for (unsigned i = 0; i < W; ++i) {
+    const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i];
+    if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One)
+      return false;
+    if (V1.Type != BitTracker::BitValue::Ref)
+      continue;
+    if (V1.RefI.Pos != V2.RefI.Pos)
+      return false;
+    if (V1.RefI.Reg != InpR1)
+      return false;
+    if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2)
+      return false;
+    if (!MatchR)
+      MatchR = V2.RefI.Reg;
+    else if (V2.RefI.Reg != MatchR)
+      return false;
+  }
+  InpR2 = MatchR;
+  return true;
+}
+
+void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
+      MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR,
+      unsigned NewPredR) {
+  DenseMap<unsigned,unsigned> RegMap;
+
+  const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR);
+  unsigned PhiR = MRI->createVirtualRegister(PhiRC);
+  BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR)
+    .addReg(NewPredR)
+    .addMBB(&PB)
+    .addReg(G.Inp.Reg)
+    .addMBB(&LB);
+  RegMap.insert(std::make_pair(G.Inp.Reg, PhiR));
+
+  for (unsigned i = G.Ins.size(); i > 0; --i) {
+    const MachineInstr *SI = G.Ins[i-1];
+    unsigned DR = getDefReg(SI);
+    const TargetRegisterClass *RC = MRI->getRegClass(DR);
+    unsigned NewDR = MRI->createVirtualRegister(RC);
+    DebugLoc DL = SI->getDebugLoc();
+
+    auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR);
+    for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) {
+      const MachineOperand &Op = SI->getOperand(j);
+      if (!Op.isReg()) {
+        MIB.addOperand(Op);
+        continue;
+      }
+      if (!Op.isUse())
+        continue;
+      unsigned UseR = RegMap[Op.getReg()];
+      MIB.addReg(UseR, 0, Op.getSubReg());
+    }
+    RegMap.insert(std::make_pair(DR, NewDR));
+  }
+
+  HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI);
+}
+
+bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
+  DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n");
+  std::vector<PhiInfo> Phis;
+  for (auto &I : *C.LB) {
+    if (!I.isPHI())
+      break;
+    unsigned PR = getDefReg(&I);
+    if (isConst(PR))
+      continue;
+    bool BadUse = false, GoodUse = false;
+    for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) {
+      MachineInstr *UseI = UI->getParent();
+      if (UseI->getParent() != C.LB) {
+        BadUse = true;
+        break;
+      }
+      if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR))
+        GoodUse = true;
+    }
+    if (BadUse || !GoodUse)
+      continue;
+
+    Phis.push_back(PhiInfo(I, *C.LB));
+  }
+
+  DEBUG({
+    dbgs() << "Phis: {";
+    for (auto &I : Phis) {
+      dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi("
+             << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber()
+             << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b"
+             << I.LB->getNumber() << ')';
+    }
+    dbgs() << " }\n";
+  });
+
+  if (Phis.empty())
+    return false;
+
+  bool Changed = false;
+  InstrList ShufIns;
+
+  // Go backwards in the block: for each bit shuffling instruction, check
+  // if that instruction could potentially be moved to the front of the loop:
+  // the output of the loop cannot be used in a non-shuffling instruction
+  // in this loop.
+  for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) {
+    if (I->isTerminator())
+      continue;
+    if (I->isPHI())
+      break;
+
+    RegisterSet Defs;
+    HBS::getInstrDefs(*I, Defs);
+    if (Defs.count() != 1)
+      continue;
+    unsigned DefR = Defs.find_first();
+    if (!TargetRegisterInfo::isVirtualRegister(DefR))
+      continue;
+    if (!isBitShuffle(&*I, DefR))
+      continue;
+
+    bool BadUse = false;
+    for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) {
+      MachineInstr *UseI = UI->getParent();
+      if (UseI->getParent() == C.LB) {
+        if (UseI->isPHI()) {
+          // If the use is in a phi node in this loop, then it should be
+          // the value corresponding to the back edge.
+          unsigned Idx = UI.getOperandNo();
+          if (UseI->getOperand(Idx+1).getMBB() != C.LB)
+            BadUse = true;
+        } else {
+          auto F = find(ShufIns, UseI);
+          if (F == ShufIns.end())
+            BadUse = true;
+        }
+      } else {
+        // There is a use outside of the loop, but there is no epilog block
+        // suitable for a copy-out.
+        if (C.EB == nullptr)
+          BadUse = true;
+      }
+      if (BadUse)
+        break;
+    }
+
+    if (BadUse)
+      continue;
+    ShufIns.push_back(&*I);
+  }
+
+  // Partition the list of shuffling instructions into instruction groups,
+  // where each group has to be moved as a whole (i.e. a group is a chain of
+  // dependent instructions). A group produces a single live output register,
+  // which is meant to be the input of the loop phi node (although this is
+  // not checked here yet). It also uses a single register as its input,
+  // which is some value produced in the loop body. After moving the group
+  // to the beginning of the loop, that input register would need to be
+  // the loop-carried register (through a phi node) instead of the (currently
+  // loop-carried) output register.
+  typedef std::vector<InstrGroup> InstrGroupList;
+  InstrGroupList Groups;
+
+  for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) {
+    MachineInstr *SI = ShufIns[i];
+    if (SI == nullptr)
+      continue;
+
+    InstrGroup G;
+    G.Ins.push_back(SI);
+    G.Out.Reg = getDefReg(SI);
+    RegisterSet Inputs;
+    HBS::getInstrUses(*SI, Inputs);
+
+    for (unsigned j = i+1; j < n; ++j) {
+      MachineInstr *MI = ShufIns[j];
+      if (MI == nullptr)
+        continue;
+      RegisterSet Defs;
+      HBS::getInstrDefs(*MI, Defs);
+      // If this instruction does not define any pending inputs, skip it.
+      if (!Defs.intersects(Inputs))
+        continue;
+      // Otherwise, add it to the current group and remove the inputs that
+      // are defined by MI.
+      G.Ins.push_back(MI);
+      Inputs.remove(Defs);
+      // Then add all registers used by MI.
+      HBS::getInstrUses(*MI, Inputs);
+      ShufIns[j] = nullptr;
+    }
+
+    // Only add a group if it requires at most one register.
+    if (Inputs.count() > 1)
+      continue;
+    auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
+      return G.Out.Reg == P.LR.Reg;
+    };
+    if (llvm::find_if(Phis, LoopInpEq) == Phis.end())
+      continue;
+
+    G.Inp.Reg = Inputs.find_first();
+    Groups.push_back(G);
+  }
+
+  DEBUG({
+    for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+      InstrGroup &G = Groups[i];
+      dbgs() << "Group[" << i << "] inp: "
+             << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub)
+             << "  out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n";
+      for (unsigned j = 0, m = G.Ins.size(); j < m; ++j)
+        dbgs() << "  " << *G.Ins[j];
+    }
+  });
+
+  for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
+    InstrGroup &G = Groups[i];
+    if (!isShuffleOf(G.Out.Reg, G.Inp.Reg))
+      continue;
+    auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
+      return G.Out.Reg == P.LR.Reg;
+    };
+    auto F = llvm::find_if(Phis, LoopInpEq);
+    if (F == Phis.end())
+      continue;
+    unsigned PrehR = 0;
+    if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PrehR)) {
+      const MachineInstr *DefPrehR = MRI->getVRegDef(F->PR.Reg);
+      unsigned Opc = DefPrehR->getOpcode();
+      if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi)
+        continue;
+      if (!DefPrehR->getOperand(1).isImm())
+        continue;
+      if (DefPrehR->getOperand(1).getImm() != 0)
+        continue;
+      const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg);
+      if (RC != MRI->getRegClass(F->PR.Reg)) {
+        PrehR = MRI->createVirtualRegister(RC);
+        unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi
+                                                          : Hexagon::A2_tfrpi;
+        auto T = C.PB->getFirstTerminator();
+        DebugLoc DL = (T != C.PB->end()) ? T->getDebugLoc() : DebugLoc();
+        BuildMI(*C.PB, T, DL, HII->get(TfrI), PrehR)
+          .addImm(0);
+      } else {
+        PrehR = F->PR.Reg;
+      }
+    }
+    // isSameShuffle could match with PrehR being of a wider class than
+    // G.Inp.Reg, for example if G shuffles the low 32 bits of its input,
+    // it would match for the input being a 32-bit register, and PrehR
+    // being a 64-bit register (where the low 32 bits match). This could
+    // be handled, but for now skip these cases.
+    if (MRI->getRegClass(PrehR) != MRI->getRegClass(G.Inp.Reg))
+      continue;
+    moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PrehR);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  HII = HST.getInstrInfo();
+  HRI = HST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+  BitTracker BT(HE, MF);
+  DEBUG(BT.trace(true));
+  BT.run();
+  BTP = &BT;
+
+  std::vector<LoopCand> Cand;
+
+  for (auto &B : MF) {
+    if (B.pred_size() != 2 || B.succ_size() != 2)
+      continue;
+    MachineBasicBlock *PB = nullptr;
+    bool IsLoop = false;
+    for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) {
+      if (*PI != &B)
+        PB = *PI;
+      else
+        IsLoop = true;
+    }
+    if (!IsLoop)
+      continue;
+
+    MachineBasicBlock *EB = nullptr;
+    for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) {
+      if (*SI == &B)
+        continue;
+      // Set EP to the epilog block, if it has only 1 predecessor (i.e. the
+      // edge from B to EP is non-critical.
+      if ((*SI)->pred_size() == 1)
+        EB = *SI;
+      break;
+    }
+
+    Cand.push_back(LoopCand(&B, PB, EB));
+  }
+
+  bool Changed = false;
+  for (auto &C : Cand)
+    Changed |= processLoop(C);
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonLoopRescheduling() {
+  return new HexagonLoopRescheduling();
+}
+
+FunctionPass *llvm::createHexagonBitSimplify() {
+  return new HexagonBitSimplify();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
new file mode 100644
index 000000000000..b78c4126e0b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -0,0 +1,1191 @@
+//===--- HexagonBitTracker.cpp --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonBitTracker.h"
+
+using namespace llvm;
+
+typedef BitTracker BT;
+
+HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
+                                   MachineRegisterInfo &mri,
+                                   const HexagonInstrInfo &tii,
+                                   MachineFunction &mf)
+    : MachineEvaluator(tri, mri), MF(mf), MFI(mf.getFrameInfo()), TII(tii) {
+  // Populate the VRX map (VR to extension-type).
+  // Go over all the formal parameters of the function. If a given parameter
+  // P is sign- or zero-extended, locate the virtual register holding that
+  // parameter and create an entry in the VRX map indicating the type of ex-
+  // tension (and the source type).
+  // This is a bit complicated to do accurately, since the memory layout in-
+  // formation is necessary to precisely determine whether an aggregate para-
+  // meter will be passed in a register or in memory. What is given in MRI
+  // is the association between the physical register that is live-in (i.e.
+  // holds an argument), and the virtual register that this value will be
+  // copied into. This, by itself, is not sufficient to map back the virtual
+  // register to a formal parameter from Function (since consecutive live-ins
+  // from MRI may not correspond to consecutive formal parameters from Func-
+  // tion). To avoid the complications with in-memory arguments, only consi-
+  // der the initial sequence of formal parameters that are known to be
+  // passed via registers.
+  unsigned AttrIdx = 0;
+  unsigned InVirtReg, InPhysReg = 0;
+  const Function &F = *MF.getFunction();
+  typedef Function::const_arg_iterator arg_iterator;
+  for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+    AttrIdx++;
+    const Argument &Arg = *I;
+    Type *ATy = Arg.getType();
+    unsigned Width = 0;
+    if (ATy->isIntegerTy())
+      Width = ATy->getIntegerBitWidth();
+    else if (ATy->isPointerTy())
+      Width = 32;
+    // If pointer size is not set through target data, it will default to
+    // Module::AnyPointerSize.
+    if (Width == 0 || Width > 64)
+      break;
+    AttributeSet Attrs = F.getAttributes();
+    if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal))
+      continue;
+    InPhysReg = getNextPhysReg(InPhysReg, Width);
+    if (!InPhysReg)
+      break;
+    InVirtReg = getVirtRegFor(InPhysReg);
+    if (!InVirtReg)
+      continue;
+    if (Attrs.hasAttribute(AttrIdx, Attribute::SExt))
+      VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width)));
+    else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt))
+      VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width)));
+  }
+}
+
+
+BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
+  if (Sub == 0)
+    return MachineEvaluator::mask(Reg, 0);
+  using namespace Hexagon;
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  unsigned ID = RC->getID();
+  uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
+  auto &HRI = static_cast<const HexagonRegisterInfo&>(TRI);
+  bool IsSubLo = (Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo));
+  switch (ID) {
+    case DoubleRegsRegClassID:
+    case VecDblRegsRegClassID:
+    case VecDblRegs128BRegClassID:
+      return IsSubLo ? BT::BitMask(0, RW-1)
+                     : BT::BitMask(RW, 2*RW-1);
+    default:
+      break;
+  }
+#ifndef NDEBUG
+  dbgs() << PrintReg(Reg, &TRI, Sub) << '\n';
+#endif
+  llvm_unreachable("Unexpected register/subregister");
+}
+
+namespace {
+class RegisterRefs {
+  std::vector<BT::RegisterRef> Vector;
+
+public:
+  RegisterRefs(const MachineInstr &MI) : Vector(MI.getNumOperands()) {
+    for (unsigned i = 0, n = Vector.size(); i < n; ++i) {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg())
+        Vector[i] = BT::RegisterRef(MO);
+      // For indices that don't correspond to registers, the entry will
+      // remain constructed via the default constructor.
+    }
+  }
+
+  size_t size() const { return Vector.size(); }
+  const BT::RegisterRef &operator[](unsigned n) const {
+    // The main purpose of this operator is to assert with bad argument.
+    assert(n < Vector.size());
+    return Vector[n];
+  }
+};
+}
+
+bool HexagonEvaluator::evaluate(const MachineInstr &MI,
+                                const CellMapType &Inputs,
+                                CellMapType &Outputs) const {
+  unsigned NumDefs = 0;
+
+  // Sanity verification: there should not be any defs with subregisters.
+  for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    NumDefs++;
+    assert(MO.getSubReg() == 0);
+  }
+
+  if (NumDefs == 0)
+    return false;
+
+  using namespace Hexagon;
+  unsigned Opc = MI.getOpcode();
+
+  if (MI.mayLoad()) {
+    switch (Opc) {
+      // These instructions may be marked as mayLoad, but they are generating
+      // immediate values, so skip them.
+      case CONST32:
+      case CONST64:
+        break;
+      default:
+        return evaluateLoad(MI, Inputs, Outputs);
+    }
+  }
+
+  // Check COPY instructions that copy formal parameters into virtual
+  // registers. Such parameters can be sign- or zero-extended at the
+  // call site, and we should take advantage of this knowledge. The MRI
+  // keeps a list of pairs of live-in physical and virtual registers,
+  // which provides information about which virtual registers will hold
+  // the argument values. The function will still contain instructions
+  // defining those virtual registers, and in practice those are COPY
+  // instructions from a physical to a virtual register. In such cases,
+  // applying the argument extension to the virtual register can be seen
+  // as simply mirroring the extension that had already been applied to
+  // the physical register at the call site. If the defining instruction
+  // was not a COPY, it would not be clear how to mirror that extension
+  // on the callee's side. For that reason, only check COPY instructions
+  // for potential extensions.
+  if (MI.isCopy()) {
+    if (evaluateFormalCopy(MI, Inputs, Outputs))
+      return true;
+  }
+
+  // Beyond this point, if any operand is a global, skip that instruction.
+  // The reason is that certain instructions that can take an immediate
+  // operand can also have a global symbol in that operand. To avoid
+  // checking what kind of operand a given instruction has individually
+  // for each instruction, do it here. Global symbols as operands gene-
+  // rally do not provide any useful information.
+  for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (MO.isGlobal() || MO.isBlockAddress() || MO.isSymbol() || MO.isJTI() ||
+        MO.isCPI())
+      return false;
+  }
+
+  RegisterRefs Reg(MI);
+#define op(i) MI.getOperand(i)
+#define rc(i) RegisterCell::ref(getCell(Reg[i], Inputs))
+#define im(i) MI.getOperand(i).getImm()
+
+  // If the instruction has no register operands, skip it.
+  if (Reg.size() == 0)
+    return false;
+
+  // Record result for register in operand 0.
+  auto rr0 = [this,Reg] (const BT::RegisterCell &Val, CellMapType &Outputs)
+        -> bool {
+    putCell(Reg[0], Val, Outputs);
+    return true;
+  };
+  // Get the cell corresponding to the N-th operand.
+  auto cop = [this, &Reg, &MI, &Inputs](unsigned N,
+                                        uint16_t W) -> BT::RegisterCell {
+    const MachineOperand &Op = MI.getOperand(N);
+    if (Op.isImm())
+      return eIMM(Op.getImm(), W);
+    if (!Op.isReg())
+      return RegisterCell::self(0, W);
+    assert(getRegBitWidth(Reg[N]) == W && "Register width mismatch");
+    return rc(N);
+  };
+  // Extract RW low bits of the cell.
+  auto lo = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    assert(RW <= RC.width());
+    return eXTR(RC, 0, RW);
+  };
+  // Extract RW high bits of the cell.
+  auto hi = [this] (const BT::RegisterCell &RC, uint16_t RW)
+        -> BT::RegisterCell {
+    uint16_t W = RC.width();
+    assert(RW <= W);
+    return eXTR(RC, W-RW, W);
+  };
+  // Extract N-th halfword (counting from the least significant position).
+  auto half = [this] (const BT::RegisterCell &RC, unsigned N)
+        -> BT::RegisterCell {
+    assert(N*16+16 <= RC.width());
+    return eXTR(RC, N*16, N*16+16);
+  };
+  // Shuffle bits (pick even/odd from cells and merge into result).
+  auto shuffle = [this] (const BT::RegisterCell &Rs, const BT::RegisterCell &Rt,
+                         uint16_t BW, bool Odd) -> BT::RegisterCell {
+    uint16_t I = Odd, Ws = Rs.width();
+    assert(Ws == Rt.width());
+    RegisterCell RC = eXTR(Rt, I*BW, I*BW+BW).cat(eXTR(Rs, I*BW, I*BW+BW));
+    I += 2;
+    while (I*BW < Ws) {
+      RC.cat(eXTR(Rt, I*BW, I*BW+BW)).cat(eXTR(Rs, I*BW, I*BW+BW));
+      I += 2;
+    }
+    return RC;
+  };
+
+  // The bitwidth of the 0th operand. In most (if not all) of the
+  // instructions below, the 0th operand is the defined register.
+  // Pre-compute the bitwidth here, because it is needed in many cases
+  // cases below.
+  uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
+
+  switch (Opc) {
+    // Transfer immediate:
+
+    case A2_tfrsi:
+    case A2_tfrpi:
+    case CONST32:
+    case CONST64:
+      return rr0(eIMM(im(1), W0), Outputs);
+    case PS_false:
+      return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::Zero), Outputs);
+    case PS_true:
+      return rr0(RegisterCell(W0).fill(0, W0, BT::BitValue::One), Outputs);
+    case PS_fi: {
+      int FI = op(1).getIndex();
+      int Off = op(2).getImm();
+      unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
+      unsigned L = Log2_32(A);
+      RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+      RC.fill(0, L, BT::BitValue::Zero);
+      return rr0(RC, Outputs);
+    }
+
+    // Transfer register:
+
+    case A2_tfr:
+    case A2_tfrp:
+    case C2_pxfer_map:
+      return rr0(rc(1), Outputs);
+    case C2_tfrpr: {
+      uint16_t RW = W0;
+      uint16_t PW = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+      assert(PW <= RW);
+      RegisterCell PC = eXTR(rc(1), 0, PW);
+      RegisterCell RC = RegisterCell(RW).insert(PC, BT::BitMask(0, PW-1));
+      RC.fill(PW, RW, BT::BitValue::Zero);
+      return rr0(RC, Outputs);
+    }
+    case C2_tfrrp: {
+      RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
+      W0 = 8; // XXX Pred size
+      return rr0(eINS(RC, eXTR(rc(1), 0, W0), 0), Outputs);
+    }
+
+    // Arithmetic:
+
+    case A2_abs:
+    case A2_absp:
+      // TODO
+      break;
+
+    case A2_addsp: {
+      uint16_t W1 = getRegBitWidth(Reg[1]);
+      assert(W0 == 64 && W1 == 32);
+      RegisterCell CW = RegisterCell(W0).insert(rc(1), BT::BitMask(0, W1-1));
+      RegisterCell RC = eADD(eSXT(CW, W1), rc(2));
+      return rr0(RC, Outputs);
+    }
+    case A2_add:
+    case A2_addp:
+      return rr0(eADD(rc(1), rc(2)), Outputs);
+    case A2_addi:
+      return rr0(eADD(rc(1), eIMM(im(2), W0)), Outputs);
+    case S4_addi_asl_ri: {
+      RegisterCell RC = eADD(eIMM(im(1), W0), eASL(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case S4_addi_lsr_ri: {
+      RegisterCell RC = eADD(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case S4_addaddi: {
+      RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+      return rr0(RC, Outputs);
+    }
+    case M4_mpyri_addi: {
+      RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+      RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M4_mpyrr_addi: {
+      RegisterCell M = eMLS(rc(2), rc(3));
+      RegisterCell RC = eADD(eIMM(im(1), W0), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M4_mpyri_addr_u2: {
+      RegisterCell M = eMLS(eIMM(im(2), W0), rc(3));
+      RegisterCell RC = eADD(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M4_mpyri_addr: {
+      RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+      RegisterCell RC = eADD(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M4_mpyrr_addr: {
+      RegisterCell M = eMLS(rc(2), rc(3));
+      RegisterCell RC = eADD(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case S4_subaddi: {
+      RegisterCell RC = eADD(rc(1), eSUB(eIMM(im(2), W0), rc(3)));
+      return rr0(RC, Outputs);
+    }
+    case M2_accii: {
+      RegisterCell RC = eADD(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+      return rr0(RC, Outputs);
+    }
+    case M2_acci: {
+      RegisterCell RC = eADD(rc(1), eADD(rc(2), rc(3)));
+      return rr0(RC, Outputs);
+    }
+    case M2_subacc: {
+      RegisterCell RC = eADD(rc(1), eSUB(rc(2), rc(3)));
+      return rr0(RC, Outputs);
+    }
+    case S2_addasl_rrri: {
+      RegisterCell RC = eADD(rc(1), eASL(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case C4_addipc: {
+      RegisterCell RPC = RegisterCell::self(Reg[0].Reg, W0);
+      RPC.fill(0, 2, BT::BitValue::Zero);
+      return rr0(eADD(RPC, eIMM(im(2), W0)), Outputs);
+    }
+    case A2_sub:
+    case A2_subp:
+      return rr0(eSUB(rc(1), rc(2)), Outputs);
+    case A2_subri:
+      return rr0(eSUB(eIMM(im(1), W0), rc(2)), Outputs);
+    case S4_subi_asl_ri: {
+      RegisterCell RC = eSUB(eIMM(im(1), W0), eASL(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case S4_subi_lsr_ri: {
+      RegisterCell RC = eSUB(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case M2_naccii: {
+      RegisterCell RC = eSUB(rc(1), eADD(rc(2), eIMM(im(3), W0)));
+      return rr0(RC, Outputs);
+    }
+    case M2_nacci: {
+      RegisterCell RC = eSUB(rc(1), eADD(rc(2), rc(3)));
+      return rr0(RC, Outputs);
+    }
+    // 32-bit negation is done by "Rd = A2_subri 0, Rs"
+    case A2_negp:
+      return rr0(eSUB(eIMM(0, W0), rc(1)), Outputs);
+
+    case M2_mpy_up: {
+      RegisterCell M = eMLS(rc(1), rc(2));
+      return rr0(hi(M, W0), Outputs);
+    }
+    case M2_dpmpyss_s0:
+      return rr0(eMLS(rc(1), rc(2)), Outputs);
+    case M2_dpmpyss_acc_s0:
+      return rr0(eADD(rc(1), eMLS(rc(2), rc(3))), Outputs);
+    case M2_dpmpyss_nac_s0:
+      return rr0(eSUB(rc(1), eMLS(rc(2), rc(3))), Outputs);
+    case M2_mpyi: {
+      RegisterCell M = eMLS(rc(1), rc(2));
+      return rr0(lo(M, W0), Outputs);
+    }
+    case M2_macsip: {
+      RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+      RegisterCell RC = eADD(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M2_macsin: {
+      RegisterCell M = eMLS(rc(2), eIMM(im(3), W0));
+      RegisterCell RC = eSUB(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M2_maci: {
+      RegisterCell M = eMLS(rc(2), rc(3));
+      RegisterCell RC = eADD(rc(1), lo(M, W0));
+      return rr0(RC, Outputs);
+    }
+    case M2_mpysmi: {
+      RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+      return rr0(lo(M, 32), Outputs);
+    }
+    case M2_mpysin: {
+      RegisterCell M = eMLS(rc(1), eIMM(-im(2), W0));
+      return rr0(lo(M, 32), Outputs);
+    }
+    case M2_mpysip: {
+      RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
+      return rr0(lo(M, 32), Outputs);
+    }
+    case M2_mpyu_up: {
+      RegisterCell M = eMLU(rc(1), rc(2));
+      return rr0(hi(M, W0), Outputs);
+    }
+    case M2_dpmpyuu_s0:
+      return rr0(eMLU(rc(1), rc(2)), Outputs);
+    case M2_dpmpyuu_acc_s0:
+      return rr0(eADD(rc(1), eMLU(rc(2), rc(3))), Outputs);
+    case M2_dpmpyuu_nac_s0:
+      return rr0(eSUB(rc(1), eMLU(rc(2), rc(3))), Outputs);
+    //case M2_mpysu_up:
+
+    // Logical/bitwise:
+
+    case A2_andir:
+      return rr0(eAND(rc(1), eIMM(im(2), W0)), Outputs);
+    case A2_and:
+    case A2_andp:
+      return rr0(eAND(rc(1), rc(2)), Outputs);
+    case A4_andn:
+    case A4_andnp:
+      return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+    case S4_andi_asl_ri: {
+      RegisterCell RC = eAND(eIMM(im(1), W0), eASL(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case S4_andi_lsr_ri: {
+      RegisterCell RC = eAND(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case M4_and_and:
+      return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+    case M4_and_andn:
+      return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+    case M4_and_or:
+      return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+    case M4_and_xor:
+      return rr0(eAND(rc(1), eXOR(rc(2), rc(3))), Outputs);
+    case A2_orir:
+      return rr0(eORL(rc(1), eIMM(im(2), W0)), Outputs);
+    case A2_or:
+    case A2_orp:
+      return rr0(eORL(rc(1), rc(2)), Outputs);
+    case A4_orn:
+    case A4_ornp:
+      return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+    case S4_ori_asl_ri: {
+      RegisterCell RC = eORL(eIMM(im(1), W0), eASL(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case S4_ori_lsr_ri: {
+      RegisterCell RC = eORL(eIMM(im(1), W0), eLSR(rc(2), im(3)));
+      return rr0(RC, Outputs);
+    }
+    case M4_or_and:
+      return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+    case M4_or_andn:
+      return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+    case S4_or_andi:
+    case S4_or_andix: {
+      RegisterCell RC = eORL(rc(1), eAND(rc(2), eIMM(im(3), W0)));
+      return rr0(RC, Outputs);
+    }
+    case S4_or_ori: {
+      RegisterCell RC = eORL(rc(1), eORL(rc(2), eIMM(im(3), W0)));
+      return rr0(RC, Outputs);
+    }
+    case M4_or_or:
+      return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+    case M4_or_xor:
+      return rr0(eORL(rc(1), eXOR(rc(2), rc(3))), Outputs);
+    case A2_xor:
+    case A2_xorp:
+      return rr0(eXOR(rc(1), rc(2)), Outputs);
+    case M4_xor_and:
+      return rr0(eXOR(rc(1), eAND(rc(2), rc(3))), Outputs);
+    case M4_xor_andn:
+      return rr0(eXOR(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+    case M4_xor_or:
+      return rr0(eXOR(rc(1), eORL(rc(2), rc(3))), Outputs);
+    case M4_xor_xacc:
+      return rr0(eXOR(rc(1), eXOR(rc(2), rc(3))), Outputs);
+    case A2_not:
+    case A2_notp:
+      return rr0(eNOT(rc(1)), Outputs);
+
+    case S2_asl_i_r:
+    case S2_asl_i_p:
+      return rr0(eASL(rc(1), im(2)), Outputs);
+    case A2_aslh:
+      return rr0(eASL(rc(1), 16), Outputs);
+    case S2_asl_i_r_acc:
+    case S2_asl_i_p_acc:
+      return rr0(eADD(rc(1), eASL(rc(2), im(3))), Outputs);
+    case S2_asl_i_r_nac:
+    case S2_asl_i_p_nac:
+      return rr0(eSUB(rc(1), eASL(rc(2), im(3))), Outputs);
+    case S2_asl_i_r_and:
+    case S2_asl_i_p_and:
+      return rr0(eAND(rc(1), eASL(rc(2), im(3))), Outputs);
+    case S2_asl_i_r_or:
+    case S2_asl_i_p_or:
+      return rr0(eORL(rc(1), eASL(rc(2), im(3))), Outputs);
+    case S2_asl_i_r_xacc:
+    case S2_asl_i_p_xacc:
+      return rr0(eXOR(rc(1), eASL(rc(2), im(3))), Outputs);
+    case S2_asl_i_vh:
+    case S2_asl_i_vw:
+      // TODO
+      break;
+
+    case S2_asr_i_r:
+    case S2_asr_i_p:
+      return rr0(eASR(rc(1), im(2)), Outputs);
+    case A2_asrh:
+      return rr0(eASR(rc(1), 16), Outputs);
+    case S2_asr_i_r_acc:
+    case S2_asr_i_p_acc:
+      return rr0(eADD(rc(1), eASR(rc(2), im(3))), Outputs);
+    case S2_asr_i_r_nac:
+    case S2_asr_i_p_nac:
+      return rr0(eSUB(rc(1), eASR(rc(2), im(3))), Outputs);
+    case S2_asr_i_r_and:
+    case S2_asr_i_p_and:
+      return rr0(eAND(rc(1), eASR(rc(2), im(3))), Outputs);
+    case S2_asr_i_r_or:
+    case S2_asr_i_p_or:
+      return rr0(eORL(rc(1), eASR(rc(2), im(3))), Outputs);
+    case S2_asr_i_r_rnd: {
+      // The input is first sign-extended to 64 bits, then the output
+      // is truncated back to 32 bits.
+      assert(W0 == 32);
+      RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+      RegisterCell RC = eASR(eADD(eASR(XC, im(2)), eIMM(1, 2*W0)), 1);
+      return rr0(eXTR(RC, 0, W0), Outputs);
+    }
+    case S2_asr_i_r_rnd_goodsyntax: {
+      int64_t S = im(2);
+      if (S == 0)
+        return rr0(rc(1), Outputs);
+      // Result: S2_asr_i_r_rnd Rs, u5-1
+      RegisterCell XC = eSXT(rc(1).cat(eIMM(0, W0)), W0);
+      RegisterCell RC = eLSR(eADD(eASR(XC, S-1), eIMM(1, 2*W0)), 1);
+      return rr0(eXTR(RC, 0, W0), Outputs);
+    }
+    case S2_asr_r_vh:
+    case S2_asr_i_vw:
+    case S2_asr_i_svw_trun:
+      // TODO
+      break;
+
+    case S2_lsr_i_r:
+    case S2_lsr_i_p:
+      return rr0(eLSR(rc(1), im(2)), Outputs);
+    case S2_lsr_i_r_acc:
+    case S2_lsr_i_p_acc:
+      return rr0(eADD(rc(1), eLSR(rc(2), im(3))), Outputs);
+    case S2_lsr_i_r_nac:
+    case S2_lsr_i_p_nac:
+      return rr0(eSUB(rc(1), eLSR(rc(2), im(3))), Outputs);
+    case S2_lsr_i_r_and:
+    case S2_lsr_i_p_and:
+      return rr0(eAND(rc(1), eLSR(rc(2), im(3))), Outputs);
+    case S2_lsr_i_r_or:
+    case S2_lsr_i_p_or:
+      return rr0(eORL(rc(1), eLSR(rc(2), im(3))), Outputs);
+    case S2_lsr_i_r_xacc:
+    case S2_lsr_i_p_xacc:
+      return rr0(eXOR(rc(1), eLSR(rc(2), im(3))), Outputs);
+
+    case S2_clrbit_i: {
+      RegisterCell RC = rc(1);
+      RC[im(2)] = BT::BitValue::Zero;
+      return rr0(RC, Outputs);
+    }
+    case S2_setbit_i: {
+      RegisterCell RC = rc(1);
+      RC[im(2)] = BT::BitValue::One;
+      return rr0(RC, Outputs);
+    }
+    case S2_togglebit_i: {
+      RegisterCell RC = rc(1);
+      uint16_t BX = im(2);
+      RC[BX] = RC[BX].is(0) ? BT::BitValue::One
+                            : RC[BX].is(1) ? BT::BitValue::Zero
+                                           : BT::BitValue::self();
+      return rr0(RC, Outputs);
+    }
+
+    case A4_bitspliti: {
+      uint16_t W1 = getRegBitWidth(Reg[1]);
+      uint16_t BX = im(2);
+      // Res.uw[1] = Rs[bx+1:], Res.uw[0] = Rs[0:bx]
+      const BT::BitValue Zero = BT::BitValue::Zero;
+      RegisterCell RZ = RegisterCell(W0).fill(BX, W1, Zero)
+                                        .fill(W1+(W1-BX), W0, Zero);
+      RegisterCell BF1 = eXTR(rc(1), 0, BX), BF2 = eXTR(rc(1), BX, W1);
+      RegisterCell RC = eINS(eINS(RZ, BF1, 0), BF2, W1);
+      return rr0(RC, Outputs);
+    }
+    case S4_extract:
+    case S4_extractp:
+    case S2_extractu:
+    case S2_extractup: {
+      uint16_t Wd = im(2), Of = im(3);
+      assert(Wd <= W0);
+      if (Wd == 0)
+        return rr0(eIMM(0, W0), Outputs);
+      // If the width extends beyond the register size, pad the register
+      // with 0 bits.
+      RegisterCell Pad = (Wd+Of > W0) ? rc(1).cat(eIMM(0, Wd+Of-W0)) : rc(1);
+      RegisterCell Ext = eXTR(Pad, Of, Wd+Of);
+      // Ext is short, need to extend it with 0s or sign bit.
+      RegisterCell RC = RegisterCell(W0).insert(Ext, BT::BitMask(0, Wd-1));
+      if (Opc == S2_extractu || Opc == S2_extractup)
+        return rr0(eZXT(RC, Wd), Outputs);
+      return rr0(eSXT(RC, Wd), Outputs);
+    }
+    case S2_insert:
+    case S2_insertp: {
+      uint16_t Wd = im(3), Of = im(4);
+      assert(Wd < W0 && Of < W0);
+      // If Wd+Of exceeds W0, the inserted bits are truncated.
+      if (Wd+Of > W0)
+        Wd = W0-Of;
+      if (Wd == 0)
+        return rr0(rc(1), Outputs);
+      return rr0(eINS(rc(1), eXTR(rc(2), 0, Wd), Of), Outputs);
+    }
+
+    // Bit permutations:
+
+    case A2_combineii:
+    case A4_combineii:
+    case A4_combineir:
+    case A4_combineri:
+    case A2_combinew:
+    case V6_vcombine:
+    case V6_vcombine_128B:
+      assert(W0 % 2 == 0);
+      return rr0(cop(2, W0/2).cat(cop(1, W0/2)), Outputs);
+    case A2_combine_ll:
+    case A2_combine_lh:
+    case A2_combine_hl:
+    case A2_combine_hh: {
+      assert(W0 == 32);
+      assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+      // Low half in the output is 0 for _ll and _hl, 1 otherwise:
+      unsigned LoH = !(Opc == A2_combine_ll || Opc == A2_combine_hl);
+      // High half in the output is 0 for _ll and _lh, 1 otherwise:
+      unsigned HiH = !(Opc == A2_combine_ll || Opc == A2_combine_lh);
+      RegisterCell R1 = rc(1);
+      RegisterCell R2 = rc(2);
+      RegisterCell RC = half(R2, LoH).cat(half(R1, HiH));
+      return rr0(RC, Outputs);
+    }
+    case S2_packhl: {
+      assert(W0 == 64);
+      assert(getRegBitWidth(Reg[1]) == 32 && getRegBitWidth(Reg[2]) == 32);
+      RegisterCell R1 = rc(1);
+      RegisterCell R2 = rc(2);
+      RegisterCell RC = half(R2, 0).cat(half(R1, 0)).cat(half(R2, 1))
+                                   .cat(half(R1, 1));
+      return rr0(RC, Outputs);
+    }
+    case S2_shuffeb: {
+      RegisterCell RC = shuffle(rc(1), rc(2), 8, false);
+      return rr0(RC, Outputs);
+    }
+    case S2_shuffeh: {
+      RegisterCell RC = shuffle(rc(1), rc(2), 16, false);
+      return rr0(RC, Outputs);
+    }
+    case S2_shuffob: {
+      RegisterCell RC = shuffle(rc(1), rc(2), 8, true);
+      return rr0(RC, Outputs);
+    }
+    case S2_shuffoh: {
+      RegisterCell RC = shuffle(rc(1), rc(2), 16, true);
+      return rr0(RC, Outputs);
+    }
+    case C2_mask: {
+      uint16_t WR = W0;
+      uint16_t WP = 8; // XXX Pred size: getRegBitWidth(Reg[1]);
+      assert(WR == 64 && WP == 8);
+      RegisterCell R1 = rc(1);
+      RegisterCell RC(WR);
+      for (uint16_t i = 0; i < WP; ++i) {
+        const BT::BitValue &V = R1[i];
+        BT::BitValue F = (V.is(0) || V.is(1)) ? V : BT::BitValue::self();
+        RC.fill(i*8, i*8+8, F);
+      }
+      return rr0(RC, Outputs);
+    }
+
+    // Mux:
+
+    case C2_muxii:
+    case C2_muxir:
+    case C2_muxri:
+    case C2_mux: {
+      BT::BitValue PC0 = rc(1)[0];
+      RegisterCell R2 = cop(2, W0);
+      RegisterCell R3 = cop(3, W0);
+      if (PC0.is(0) || PC0.is(1))
+        return rr0(RegisterCell::ref(PC0 ? R2 : R3), Outputs);
+      R2.meet(R3, Reg[0].Reg);
+      return rr0(R2, Outputs);
+    }
+    case C2_vmux:
+      // TODO
+      break;
+
+    // Sign- and zero-extension:
+
+    case A2_sxtb:
+      return rr0(eSXT(rc(1), 8), Outputs);
+    case A2_sxth:
+      return rr0(eSXT(rc(1), 16), Outputs);
+    case A2_sxtw: {
+      uint16_t W1 = getRegBitWidth(Reg[1]);
+      assert(W0 == 64 && W1 == 32);
+      RegisterCell RC = eSXT(rc(1).cat(eIMM(0, W1)), W1);
+      return rr0(RC, Outputs);
+    }
+    case A2_zxtb:
+      return rr0(eZXT(rc(1), 8), Outputs);
+    case A2_zxth:
+      return rr0(eZXT(rc(1), 16), Outputs);
+
+    // Bit count:
+
+    case S2_cl0:
+    case S2_cl0p:
+      // Always produce a 32-bit result.
+      return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs);
+    case S2_cl1:
+    case S2_cl1p:
+      return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs);
+    case S2_clb:
+    case S2_clbp: {
+      uint16_t W1 = getRegBitWidth(Reg[1]);
+      RegisterCell R1 = rc(1);
+      BT::BitValue TV = R1[W1-1];
+      if (TV.is(0) || TV.is(1))
+        return rr0(eCLB(R1, TV, 32), Outputs);
+      break;
+    }
+    case S2_ct0:
+    case S2_ct0p:
+      return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs);
+    case S2_ct1:
+    case S2_ct1p:
+      return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs);
+    case S5_popcountp:
+      // TODO
+      break;
+
+    case C2_all8: {
+      RegisterCell P1 = rc(1);
+      bool Has0 = false, All1 = true;
+      for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+        if (!P1[i].is(1))
+          All1 = false;
+        if (!P1[i].is(0))
+          continue;
+        Has0 = true;
+        break;
+      }
+      if (!Has0 && !All1)
+        break;
+      RegisterCell RC(W0);
+      RC.fill(0, W0, (All1 ? BT::BitValue::One : BT::BitValue::Zero));
+      return rr0(RC, Outputs);
+    }
+    case C2_any8: {
+      RegisterCell P1 = rc(1);
+      bool Has1 = false, All0 = true;
+      for (uint16_t i = 0; i < 8/*XXX*/; ++i) {
+        if (!P1[i].is(0))
+          All0 = false;
+        if (!P1[i].is(1))
+          continue;
+        Has1 = true;
+        break;
+      }
+      if (!Has1 && !All0)
+        break;
+      RegisterCell RC(W0);
+      RC.fill(0, W0, (Has1 ? BT::BitValue::One : BT::BitValue::Zero));
+      return rr0(RC, Outputs);
+    }
+    case C2_and:
+      return rr0(eAND(rc(1), rc(2)), Outputs);
+    case C2_andn:
+      return rr0(eAND(rc(1), eNOT(rc(2))), Outputs);
+    case C2_not:
+      return rr0(eNOT(rc(1)), Outputs);
+    case C2_or:
+      return rr0(eORL(rc(1), rc(2)), Outputs);
+    case C2_orn:
+      return rr0(eORL(rc(1), eNOT(rc(2))), Outputs);
+    case C2_xor:
+      return rr0(eXOR(rc(1), rc(2)), Outputs);
+    case C4_and_and:
+      return rr0(eAND(rc(1), eAND(rc(2), rc(3))), Outputs);
+    case C4_and_andn:
+      return rr0(eAND(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+    case C4_and_or:
+      return rr0(eAND(rc(1), eORL(rc(2), rc(3))), Outputs);
+    case C4_and_orn:
+      return rr0(eAND(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+    case C4_or_and:
+      return rr0(eORL(rc(1), eAND(rc(2), rc(3))), Outputs);
+    case C4_or_andn:
+      return rr0(eORL(rc(1), eAND(rc(2), eNOT(rc(3)))), Outputs);
+    case C4_or_or:
+      return rr0(eORL(rc(1), eORL(rc(2), rc(3))), Outputs);
+    case C4_or_orn:
+      return rr0(eORL(rc(1), eORL(rc(2), eNOT(rc(3)))), Outputs);
+    case C2_bitsclr:
+    case C2_bitsclri:
+    case C2_bitsset:
+    case C4_nbitsclr:
+    case C4_nbitsclri:
+    case C4_nbitsset:
+      // TODO
+      break;
+    case S2_tstbit_i:
+    case S4_ntstbit_i: {
+      BT::BitValue V = rc(1)[im(2)];
+      if (V.is(0) || V.is(1)) {
+        // If instruction is S2_tstbit_i, test for 1, otherwise test for 0.
+        bool TV = (Opc == S2_tstbit_i);
+        BT::BitValue F = V.is(TV) ? BT::BitValue::One : BT::BitValue::Zero;
+        return rr0(RegisterCell(W0).fill(0, W0, F), Outputs);
+      }
+      break;
+    }
+
+    default:
+      return MachineEvaluator::evaluate(MI, Inputs, Outputs);
+  }
+  #undef im
+  #undef rc
+  #undef op
+  return false;
+}
+
+bool HexagonEvaluator::evaluate(const MachineInstr &BI,
+                                const CellMapType &Inputs,
+                                BranchTargetList &Targets,
+                                bool &FallsThru) const {
+  // We need to evaluate one branch at a time. TII::analyzeBranch checks
+  // all the branches in a basic block at once, so we cannot use it.
+  unsigned Opc = BI.getOpcode();
+  bool SimpleBranch = false;
+  bool Negated = false;
+  switch (Opc) {
+    case Hexagon::J2_jumpf:
+    case Hexagon::J2_jumpfpt:
+    case Hexagon::J2_jumpfnew:
+    case Hexagon::J2_jumpfnewpt:
+      Negated = true;
+    case Hexagon::J2_jumpt:
+    case Hexagon::J2_jumptpt:
+    case Hexagon::J2_jumptnew:
+    case Hexagon::J2_jumptnewpt:
+      // Simple branch:  if([!]Pn) jump ...
+      // i.e. Op0 = predicate, Op1 = branch target.
+      SimpleBranch = true;
+      break;
+    case Hexagon::J2_jump:
+      Targets.insert(BI.getOperand(0).getMBB());
+      FallsThru = false;
+      return true;
+    default:
+      // If the branch is of unknown type, assume that all successors are
+      // executable.
+      return false;
+  }
+
+  if (!SimpleBranch)
+    return false;
+
+  // BI is a conditional branch if we got here.
+  RegisterRef PR = BI.getOperand(0);
+  RegisterCell PC = getCell(PR, Inputs);
+  const BT::BitValue &Test = PC[0];
+
+  // If the condition is neither true nor false, then it's unknown.
+  if (!Test.is(0) && !Test.is(1))
+    return false;
+
+  // "Test.is(!Negated)" means "branch condition is true".
+  if (!Test.is(!Negated)) {
+    // Condition known to be false.
+    FallsThru = true;
+    return true;
+  }
+
+  Targets.insert(BI.getOperand(1).getMBB());
+  FallsThru = false;
+  return true;
+}
+
+bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
+                                    const CellMapType &Inputs,
+                                    CellMapType &Outputs) const {
+  if (TII.isPredicated(MI))
+    return false;
+  assert(MI.mayLoad() && "A load that mayn't?");
+  unsigned Opc = MI.getOpcode();
+
+  uint16_t BitNum;
+  bool SignEx;
+  using namespace Hexagon;
+
+  switch (Opc) {
+    default:
+      return false;
+
+#if 0
+    // memb_fifo
+    case L2_loadalignb_pbr:
+    case L2_loadalignb_pcr:
+    case L2_loadalignb_pi:
+    // memh_fifo
+    case L2_loadalignh_pbr:
+    case L2_loadalignh_pcr:
+    case L2_loadalignh_pi:
+    // membh
+    case L2_loadbsw2_pbr:
+    case L2_loadbsw2_pci:
+    case L2_loadbsw2_pcr:
+    case L2_loadbsw2_pi:
+    case L2_loadbsw4_pbr:
+    case L2_loadbsw4_pci:
+    case L2_loadbsw4_pcr:
+    case L2_loadbsw4_pi:
+    // memubh
+    case L2_loadbzw2_pbr:
+    case L2_loadbzw2_pci:
+    case L2_loadbzw2_pcr:
+    case L2_loadbzw2_pi:
+    case L2_loadbzw4_pbr:
+    case L2_loadbzw4_pci:
+    case L2_loadbzw4_pcr:
+    case L2_loadbzw4_pi:
+#endif
+
+    case L2_loadrbgp:
+    case L2_loadrb_io:
+    case L2_loadrb_pbr:
+    case L2_loadrb_pci:
+    case L2_loadrb_pcr:
+    case L2_loadrb_pi:
+    case PS_loadrbabs:
+    case L4_loadrb_ap:
+    case L4_loadrb_rr:
+    case L4_loadrb_ur:
+      BitNum = 8;
+      SignEx = true;
+      break;
+
+    case L2_loadrubgp:
+    case L2_loadrub_io:
+    case L2_loadrub_pbr:
+    case L2_loadrub_pci:
+    case L2_loadrub_pcr:
+    case L2_loadrub_pi:
+    case PS_loadrubabs:
+    case L4_loadrub_ap:
+    case L4_loadrub_rr:
+    case L4_loadrub_ur:
+      BitNum = 8;
+      SignEx = false;
+      break;
+
+    case L2_loadrhgp:
+    case L2_loadrh_io:
+    case L2_loadrh_pbr:
+    case L2_loadrh_pci:
+    case L2_loadrh_pcr:
+    case L2_loadrh_pi:
+    case PS_loadrhabs:
+    case L4_loadrh_ap:
+    case L4_loadrh_rr:
+    case L4_loadrh_ur:
+      BitNum = 16;
+      SignEx = true;
+      break;
+
+    case L2_loadruhgp:
+    case L2_loadruh_io:
+    case L2_loadruh_pbr:
+    case L2_loadruh_pci:
+    case L2_loadruh_pcr:
+    case L2_loadruh_pi:
+    case L4_loadruh_rr:
+    case PS_loadruhabs:
+    case L4_loadruh_ap:
+    case L4_loadruh_ur:
+      BitNum = 16;
+      SignEx = false;
+      break;
+
+    case L2_loadrigp:
+    case L2_loadri_io:
+    case L2_loadri_pbr:
+    case L2_loadri_pci:
+    case L2_loadri_pcr:
+    case L2_loadri_pi:
+    case L2_loadw_locked:
+    case PS_loadriabs:
+    case L4_loadri_ap:
+    case L4_loadri_rr:
+    case L4_loadri_ur:
+    case LDriw_pred:
+      BitNum = 32;
+      SignEx = true;
+      break;
+
+    case L2_loadrdgp:
+    case L2_loadrd_io:
+    case L2_loadrd_pbr:
+    case L2_loadrd_pci:
+    case L2_loadrd_pcr:
+    case L2_loadrd_pi:
+    case L4_loadd_locked:
+    case PS_loadrdabs:
+    case L4_loadrd_ap:
+    case L4_loadrd_rr:
+    case L4_loadrd_ur:
+      BitNum = 64;
+      SignEx = true;
+      break;
+  }
+
+  const MachineOperand &MD = MI.getOperand(0);
+  assert(MD.isReg() && MD.isDef());
+  RegisterRef RD = MD;
+
+  uint16_t W = getRegBitWidth(RD);
+  assert(W >= BitNum && BitNum > 0);
+  RegisterCell Res(W);
+
+  for (uint16_t i = 0; i < BitNum; ++i)
+    Res[i] = BT::BitValue::self(BT::BitRef(RD.Reg, i));
+
+  if (SignEx) {
+    const BT::BitValue &Sign = Res[BitNum-1];
+    for (uint16_t i = BitNum; i < W; ++i)
+      Res[i] = BT::BitValue::ref(Sign);
+  } else {
+    for (uint16_t i = BitNum; i < W; ++i)
+      Res[i] = BT::BitValue::Zero;
+  }
+
+  putCell(RD, Res, Outputs);
+  return true;
+}
+
+bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
+                                          const CellMapType &Inputs,
+                                          CellMapType &Outputs) const {
+  // If MI defines a formal parameter, but is not a copy (loads are handled
+  // in evaluateLoad), then it's not clear what to do.
+  assert(MI.isCopy());
+
+  RegisterRef RD = MI.getOperand(0);
+  RegisterRef RS = MI.getOperand(1);
+  assert(RD.Sub == 0);
+  if (!TargetRegisterInfo::isPhysicalRegister(RS.Reg))
+    return false;
+  RegExtMap::const_iterator F = VRX.find(RD.Reg);
+  if (F == VRX.end())
+    return false;
+
+  uint16_t EW = F->second.Width;
+  // Store RD's cell into the map. This will associate the cell with a virtual
+  // register, and make zero-/sign-extends possible (otherwise we would be ex-
+  // tending "self" bit values, which will have no effect, since "self" values
+  // cannot be references to anything).
+  putCell(RD, getCell(RS, Inputs), Outputs);
+
+  RegisterCell Res;
+  // Read RD's cell from the outputs instead of RS's cell from the inputs:
+  if (F->second.Type == ExtType::SExt)
+    Res = eSXT(getCell(RD, Outputs), EW);
+  else if (F->second.Type == ExtType::ZExt)
+    Res = eZXT(getCell(RD, Outputs), EW);
+
+  putCell(RD, Res, Outputs);
+  return true;
+}
+
+
+unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
+  using namespace Hexagon;
+  bool Is64 = DoubleRegsRegClass.contains(PReg);
+  assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg));
+
+  static const unsigned Phys32[] = { R0, R1, R2, R3, R4, R5 };
+  static const unsigned Phys64[] = { D0, D1, D2 };
+  const unsigned Num32 = sizeof(Phys32)/sizeof(unsigned);
+  const unsigned Num64 = sizeof(Phys64)/sizeof(unsigned);
+
+  // Return the first parameter register of the required width.
+  if (PReg == 0)
+    return (Width <= 32) ? Phys32[0] : Phys64[0];
+
+  // Set Idx32, Idx64 in such a way that Idx+1 would give the index of the
+  // next register.
+  unsigned Idx32 = 0, Idx64 = 0;
+  if (!Is64) {
+    while (Idx32 < Num32) {
+      if (Phys32[Idx32] == PReg)
+        break;
+      Idx32++;
+    }
+    Idx64 = Idx32/2;
+  } else {
+    while (Idx64 < Num64) {
+      if (Phys64[Idx64] == PReg)
+        break;
+      Idx64++;
+    }
+    Idx32 = Idx64*2+1;
+  }
+
+  if (Width <= 32)
+    return (Idx32+1 < Num32) ? Phys32[Idx32+1] : 0;
+  return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0;
+}
+
+
+unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const {
+  typedef MachineRegisterInfo::livein_iterator iterator;
+  for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+    if (I->first == PReg)
+      return I->second;
+  }
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h
new file mode 100644
index 000000000000..9e7b1dbe298f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.h
@@ -0,0 +1,64 @@
+//===--- HexagonBitTracker.h ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONBITTRACKER_H
+#define HEXAGONBITTRACKER_H
+
+#include "BitTracker.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+  class HexagonInstrInfo;
+  class HexagonRegisterInfo;
+
+struct HexagonEvaluator : public BitTracker::MachineEvaluator {
+  typedef BitTracker::CellMapType CellMapType;
+  typedef BitTracker::RegisterRef RegisterRef;
+  typedef BitTracker::RegisterCell RegisterCell;
+  typedef BitTracker::BranchTargetList BranchTargetList;
+
+  HexagonEvaluator(const HexagonRegisterInfo &tri, MachineRegisterInfo &mri,
+                   const HexagonInstrInfo &tii, MachineFunction &mf);
+
+  bool evaluate(const MachineInstr &MI, const CellMapType &Inputs,
+                CellMapType &Outputs) const override;
+  bool evaluate(const MachineInstr &BI, const CellMapType &Inputs,
+                BranchTargetList &Targets, bool &FallsThru) const override;
+
+  BitTracker::BitMask mask(unsigned Reg, unsigned Sub) const override;
+
+  MachineFunction &MF;
+  MachineFrameInfo &MFI;
+  const HexagonInstrInfo &TII;
+
+private:
+  bool evaluateLoad(const MachineInstr &MI, const CellMapType &Inputs,
+                    CellMapType &Outputs) const;
+  bool evaluateFormalCopy(const MachineInstr &MI, const CellMapType &Inputs,
+                          CellMapType &Outputs) const;
+
+  unsigned getNextPhysReg(unsigned PReg, unsigned Width) const;
+  unsigned getVirtRegFor(unsigned PReg) const;
+
+  // Type of formal parameter extension.
+  struct ExtType {
+    enum { SExt, ZExt };
+    char Type;
+    uint16_t Width;
+    ExtType() : Type(0), Width(0) {}
+    ExtType(char t, uint16_t w) : Type(t), Width(w) {}
+  };
+  // Map VR -> extension type.
+  typedef DenseMap<unsigned, ExtType> RegExtMap;
+  RegExtMap VRX;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
new file mode 100644
index 000000000000..adc213c3d438
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -0,0 +1,483 @@
+//===--- HexagonBlockRanges.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hbr"
+
+#include "HexagonBlockRanges.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <map>
+
+using namespace llvm;
+
+bool HexagonBlockRanges::IndexRange::overlaps(const IndexRange &A) const {
+  // If A contains start(), or "this" contains A.start(), then overlap.
+  IndexType S = start(), E = end(), AS = A.start(), AE = A.end();
+  if (AS == S)
+    return true;
+  bool SbAE = (S < AE) || (S == AE && A.TiedEnd);  // S-before-AE.
+  bool ASbE = (AS < E) || (AS == E && TiedEnd);    // AS-before-E.
+  if ((AS < S && SbAE) || (S < AS && ASbE))
+    return true;
+  // Otherwise no overlap.
+  return false;
+}
+
+bool HexagonBlockRanges::IndexRange::contains(const IndexRange &A) const {
+  if (start() <= A.start()) {
+    // Treat "None" in the range end as equal to the range start.
+    IndexType E = (end() != IndexType::None) ? end() : start();
+    IndexType AE = (A.end() != IndexType::None) ? A.end() : A.start();
+    if (AE <= E)
+      return true;
+  }
+  return false;
+}
+
+void HexagonBlockRanges::IndexRange::merge(const IndexRange &A) {
+  // Allow merging adjacent ranges.
+  assert(end() == A.start() || overlaps(A));
+  IndexType AS = A.start(), AE = A.end();
+  if (AS < start() || start() == IndexType::None)
+    setStart(AS);
+  if (end() < AE || end() == IndexType::None) {
+    setEnd(AE);
+    TiedEnd = A.TiedEnd;
+  } else {
+    if (end() == AE)
+      TiedEnd |= A.TiedEnd;
+  }
+  if (A.Fixed)
+    Fixed = true;
+}
+
+void HexagonBlockRanges::RangeList::include(const RangeList &RL) {
+  for (auto &R : RL)
+    if (!is_contained(*this, R))
+      push_back(R);
+}
+
+// Merge all overlapping ranges in the list, so that all that remains
+// is a list of disjoint ranges.
+void HexagonBlockRanges::RangeList::unionize(bool MergeAdjacent) {
+  if (empty())
+    return;
+
+  std::sort(begin(), end());
+  iterator Iter = begin();
+
+  while (Iter != end()-1) {
+    iterator Next = std::next(Iter);
+    // If MergeAdjacent is true, merge ranges A and B, where A.end == B.start.
+    // This allows merging dead ranges, but is not valid for live ranges.
+    bool Merge = MergeAdjacent && (Iter->end() == Next->start());
+    if (Merge || Iter->overlaps(*Next)) {
+      Iter->merge(*Next);
+      erase(Next);
+      continue;
+    }
+    ++Iter;
+  }
+}
+
+// Compute a range A-B and add it to the list.
+void HexagonBlockRanges::RangeList::addsub(const IndexRange &A,
+      const IndexRange &B) {
+  // Exclusion of non-overlapping ranges makes some checks simpler
+  // later in this function.
+  if (!A.overlaps(B)) {
+    // A - B = A.
+    add(A);
+    return;
+  }
+
+  IndexType AS = A.start(), AE = A.end();
+  IndexType BS = B.start(), BE = B.end();
+
+  // If AE is None, then A is included in B, since A and B overlap.
+  // The result of subtraction if empty, so just return.
+  if (AE == IndexType::None)
+    return;
+
+  if (AS < BS) {
+    // A starts before B.
+    // AE cannot be None since A and B overlap.
+    assert(AE != IndexType::None);
+    // Add the part of A that extends on the "less" side of B.
+    add(AS, BS, A.Fixed, false);
+  }
+
+  if (BE < AE) {
+    // BE cannot be Exit here.
+    if (BE == IndexType::None)
+      add(BS, AE, A.Fixed, false);
+    else
+      add(BE, AE, A.Fixed, false);
+  }
+}
+
+// Subtract a given range from each element in the list.
+void HexagonBlockRanges::RangeList::subtract(const IndexRange &Range) {
+  // Cannot assume that the list is unionized (i.e. contains only non-
+  // overlapping ranges.
+  RangeList T;
+  for (iterator Next, I = begin(); I != end(); I = Next) {
+    IndexRange &Rg = *I;
+    if (Rg.overlaps(Range)) {
+      T.addsub(Rg, Range);
+      Next = this->erase(I);
+    } else {
+      Next = std::next(I);
+    }
+  }
+  include(T);
+}
+
+HexagonBlockRanges::InstrIndexMap::InstrIndexMap(MachineBasicBlock &B)
+    : Block(B) {
+  IndexType Idx = IndexType::First;
+  First = Idx;
+  for (auto &In : B) {
+    if (In.isDebugValue())
+      continue;
+    assert(getIndex(&In) == IndexType::None && "Instruction already in map");
+    Map.insert(std::make_pair(Idx, &In));
+    ++Idx;
+  }
+  Last = B.empty() ? IndexType::None : unsigned(Idx)-1;
+}
+
+MachineInstr *HexagonBlockRanges::InstrIndexMap::getInstr(IndexType Idx) const {
+  auto F = Map.find(Idx);
+  return (F != Map.end()) ? F->second : nullptr;
+}
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getIndex(
+      MachineInstr *MI) const {
+  for (auto &I : Map)
+    if (I.second == MI)
+      return I.first;
+  return IndexType::None;
+}
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getPrevIndex(
+      IndexType Idx) const {
+  assert (Idx != IndexType::None);
+  if (Idx == IndexType::Entry)
+    return IndexType::None;
+  if (Idx == IndexType::Exit)
+    return Last;
+  if (Idx == First)
+    return IndexType::Entry;
+  return unsigned(Idx)-1;
+}
+
+HexagonBlockRanges::IndexType HexagonBlockRanges::InstrIndexMap::getNextIndex(
+      IndexType Idx) const {
+  assert (Idx != IndexType::None);
+  if (Idx == IndexType::Entry)
+    return IndexType::First;
+  if (Idx == IndexType::Exit || Idx == Last)
+    return IndexType::None;
+  return unsigned(Idx)+1;
+}
+
+void HexagonBlockRanges::InstrIndexMap::replaceInstr(MachineInstr *OldMI,
+      MachineInstr *NewMI) {
+  for (auto &I : Map) {
+    if (I.second != OldMI)
+      continue;
+    if (NewMI != nullptr)
+      I.second = NewMI;
+    else
+      Map.erase(I.first);
+    break;
+  }
+}
+
+HexagonBlockRanges::HexagonBlockRanges(MachineFunction &mf)
+  : MF(mf), HST(mf.getSubtarget<HexagonSubtarget>()),
+    TII(*HST.getInstrInfo()), TRI(*HST.getRegisterInfo()),
+    Reserved(TRI.getReservedRegs(mf)) {
+  // Consider all non-allocatable registers as reserved.
+  for (auto I = TRI.regclass_begin(), E = TRI.regclass_end(); I != E; ++I) {
+    auto *RC = *I;
+    if (RC->isAllocatable())
+      continue;
+    for (unsigned R : *RC)
+      Reserved[R] = true;
+  }
+}
+
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::getLiveIns(
+      const MachineBasicBlock &B, const MachineRegisterInfo &MRI,
+      const TargetRegisterInfo &TRI) {
+  RegisterSet LiveIns;
+  RegisterSet Tmp;
+  for (auto I : B.liveins()) {
+    if (I.LaneMask.all()) {
+      Tmp.insert({I.PhysReg,0});
+      continue;
+    }
+    for (MCSubRegIndexIterator S(I.PhysReg, &TRI); S.isValid(); ++S) {
+      LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+      if ((M & I.LaneMask).any())
+        Tmp.insert({S.getSubReg(), 0});
+    }
+  }
+
+  for (auto R : Tmp) {
+    if (!Reserved[R.Reg])
+      LiveIns.insert(R);
+    for (auto S : expandToSubRegs(R, MRI, TRI))
+      if (!Reserved[S.Reg])
+        LiveIns.insert(S);
+  }
+  return LiveIns;
+}
+
+HexagonBlockRanges::RegisterSet HexagonBlockRanges::expandToSubRegs(
+      RegisterRef R, const MachineRegisterInfo &MRI,
+      const TargetRegisterInfo &TRI) {
+  RegisterSet SRs;
+
+  if (R.Sub != 0) {
+    SRs.insert(R);
+    return SRs;
+  }
+
+  if (TargetRegisterInfo::isPhysicalRegister(R.Reg)) {
+    MCSubRegIterator I(R.Reg, &TRI);
+    if (!I.isValid())
+      SRs.insert({R.Reg, 0});
+    for (; I.isValid(); ++I)
+      SRs.insert({*I, 0});
+  } else {
+    assert(TargetRegisterInfo::isVirtualRegister(R.Reg));
+    auto &RC = *MRI.getRegClass(R.Reg);
+    unsigned PReg = *RC.begin();
+    MCSubRegIndexIterator I(PReg, &TRI);
+    if (!I.isValid())
+      SRs.insert({R.Reg, 0});
+    for (; I.isValid(); ++I)
+      SRs.insert({R.Reg, I.getSubRegIndex()});
+  }
+  return SRs;
+}
+
+void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
+      RegToRangeMap &LiveMap) {
+  std::map<RegisterRef,IndexType> LastDef, LastUse;
+  RegisterSet LiveOnEntry;
+  MachineBasicBlock &B = IndexMap.getBlock();
+  MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+
+  for (auto R : getLiveIns(B, MRI, TRI))
+    LiveOnEntry.insert(R);
+
+  for (auto R : LiveOnEntry)
+    LastDef[R] = IndexType::Entry;
+
+  auto closeRange = [&LastUse,&LastDef,&LiveMap] (RegisterRef R) -> void {
+    auto LD = LastDef[R], LU = LastUse[R];
+    if (LD == IndexType::None)
+      LD = IndexType::Entry;
+    if (LU == IndexType::None)
+      LU = IndexType::Exit;
+    LiveMap[R].add(LD, LU, false, false);
+    LastUse[R] = LastDef[R] = IndexType::None;
+  };
+
+  for (auto &In : B) {
+    if (In.isDebugValue())
+      continue;
+    IndexType Index = IndexMap.getIndex(&In);
+    // Process uses first.
+    for (auto &Op : In.operands()) {
+      if (!Op.isReg() || !Op.isUse() || Op.isUndef())
+        continue;
+      RegisterRef R = { Op.getReg(), Op.getSubReg() };
+      if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+        continue;
+      bool IsKill = Op.isKill();
+      for (auto S : expandToSubRegs(R, MRI, TRI)) {
+        LastUse[S] = Index;
+        if (IsKill)
+          closeRange(S);
+      }
+    }
+    // Process defs.
+    for (auto &Op : In.operands()) {
+      if (!Op.isReg() || !Op.isDef() || Op.isUndef())
+        continue;
+      RegisterRef R = { Op.getReg(), Op.getSubReg() };
+      if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
+        continue;
+      for (auto S : expandToSubRegs(R, MRI, TRI)) {
+        if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+          closeRange(S);
+        LastDef[S] = Index;
+      }
+    }
+  }
+
+  // Collect live-on-exit.
+  RegisterSet LiveOnExit;
+  for (auto *SB : B.successors())
+    for (auto R : getLiveIns(*SB, MRI, TRI))
+      LiveOnExit.insert(R);
+
+  for (auto R : LiveOnExit)
+    LastUse[R] = IndexType::Exit;
+
+  // Process remaining registers.
+  RegisterSet Left;
+  for (auto &I : LastUse)
+    if (I.second != IndexType::None)
+      Left.insert(I.first);
+  for (auto &I : LastDef)
+    if (I.second != IndexType::None)
+      Left.insert(I.first);
+  for (auto R : Left)
+    closeRange(R);
+
+  // Finalize the live ranges.
+  for (auto &P : LiveMap)
+    P.second.unionize();
+}
+
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeLiveMap(
+      InstrIndexMap &IndexMap) {
+  RegToRangeMap LiveMap;
+  DEBUG(dbgs() << __func__ << ": index map\n" << IndexMap << '\n');
+  computeInitialLiveRanges(IndexMap, LiveMap);
+  DEBUG(dbgs() << __func__ << ": live map\n"
+               << PrintRangeMap(LiveMap, TRI) << '\n');
+  return LiveMap;
+}
+
+HexagonBlockRanges::RegToRangeMap HexagonBlockRanges::computeDeadMap(
+      InstrIndexMap &IndexMap, RegToRangeMap &LiveMap) {
+  RegToRangeMap DeadMap;
+
+  auto addDeadRanges = [&IndexMap,&LiveMap,&DeadMap] (RegisterRef R) -> void {
+    auto F = LiveMap.find(R);
+    if (F == LiveMap.end() || F->second.empty()) {
+      DeadMap[R].add(IndexType::Entry, IndexType::Exit, false, false);
+      return;
+    }
+
+    RangeList &RL = F->second;
+    RangeList::iterator A = RL.begin(), Z = RL.end()-1;
+
+    // Try to create the initial range.
+    if (A->start() != IndexType::Entry) {
+      IndexType DE = IndexMap.getPrevIndex(A->start());
+      if (DE != IndexType::Entry)
+        DeadMap[R].add(IndexType::Entry, DE, false, false);
+    }
+
+    while (A != Z) {
+      // Creating a dead range that follows A.  Pay attention to empty
+      // ranges (i.e. those ending with "None").
+      IndexType AE = (A->end() == IndexType::None) ? A->start() : A->end();
+      IndexType DS = IndexMap.getNextIndex(AE);
+      ++A;
+      IndexType DE = IndexMap.getPrevIndex(A->start());
+      if (DS < DE)
+        DeadMap[R].add(DS, DE, false, false);
+    }
+
+    // Try to create the final range.
+    if (Z->end() != IndexType::Exit) {
+      IndexType ZE = (Z->end() == IndexType::None) ? Z->start() : Z->end();
+      IndexType DS = IndexMap.getNextIndex(ZE);
+      if (DS < IndexType::Exit)
+        DeadMap[R].add(DS, IndexType::Exit, false, false);
+    }
+  };
+
+  MachineFunction &MF = *IndexMap.getBlock().getParent();
+  auto &MRI = MF.getRegInfo();
+  unsigned NumRegs = TRI.getNumRegs();
+  BitVector Visited(NumRegs);
+  for (unsigned R = 1; R < NumRegs; ++R) {
+    for (auto S : expandToSubRegs({R,0}, MRI, TRI)) {
+      if (Reserved[S.Reg] || Visited[S.Reg])
+        continue;
+      addDeadRanges(S);
+      Visited[S.Reg] = true;
+    }
+  }
+  for (auto &P : LiveMap)
+    if (TargetRegisterInfo::isVirtualRegister(P.first.Reg))
+      addDeadRanges(P.first);
+
+  DEBUG(dbgs() << __func__ << ": dead map\n"
+               << PrintRangeMap(DeadMap, TRI) << '\n');
+  return DeadMap;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              HexagonBlockRanges::IndexType Idx) {
+  if (Idx == HexagonBlockRanges::IndexType::None)
+    return OS << '-';
+  if (Idx == HexagonBlockRanges::IndexType::Entry)
+    return OS << 'n';
+  if (Idx == HexagonBlockRanges::IndexType::Exit)
+    return OS << 'x';
+  return OS << unsigned(Idx)-HexagonBlockRanges::IndexType::First+1;
+}
+
+// A mapping to translate between instructions and their indices.
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              const HexagonBlockRanges::IndexRange &IR) {
+  OS << '[' << IR.start() << ':' << IR.end() << (IR.TiedEnd ? '}' : ']');
+  if (IR.Fixed)
+    OS << '!';
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              const HexagonBlockRanges::RangeList &RL) {
+  for (auto &R : RL)
+    OS << R << " ";
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              const HexagonBlockRanges::InstrIndexMap &M) {
+  for (auto &In : M.Block) {
+    HexagonBlockRanges::IndexType Idx = M.getIndex(&In);
+    OS << Idx << (Idx == M.Last ? ". " : "  ") << In;
+  }
+  return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+                              const HexagonBlockRanges::PrintRangeMap &P) {
+  for (auto &I : P.Map) {
+    const HexagonBlockRanges::RangeList &RL = I.second;
+    OS << PrintReg(I.first.Reg, &P.TRI, I.first.Sub) << " -> " << RL << "\n";
+  }
+  return OS;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
new file mode 100644
index 000000000000..717480314d16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -0,0 +1,244 @@
+//===--- HexagonBlockRanges.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef HEXAGON_BLOCK_RANGES_H
+#define HEXAGON_BLOCK_RANGES_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <cassert>
+#include <map>
+#include <set>
+#include <vector>
+#include <utility>
+
+namespace llvm {
+
+class HexagonSubtarget;
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class raw_ostream;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+struct HexagonBlockRanges {
+  HexagonBlockRanges(MachineFunction &MF);
+
+  struct RegisterRef {
+    unsigned Reg, Sub;
+    bool operator<(RegisterRef R) const {
+      return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
+    }
+  };
+  typedef std::set<RegisterRef> RegisterSet;
+
+  // This is to represent an "index", which is an abstraction of a position
+  // of an instruction within a basic block.
+  class IndexType {
+  public:
+    enum : unsigned {
+      None  = 0,
+      Entry = 1,
+      Exit  = 2,
+      First = 11  // 10th + 1st
+    };
+
+    IndexType() : Index(None) {}
+    IndexType(unsigned Idx) : Index(Idx) {}
+
+    static bool isInstr(IndexType X) { return X.Index >= First; }
+
+    operator unsigned() const;
+    bool operator== (unsigned x) const;
+    bool operator== (IndexType Idx) const;
+    bool operator!= (unsigned x) const;
+    bool operator!= (IndexType Idx) const;
+    IndexType operator++ ();
+    bool operator< (unsigned Idx) const;
+    bool operator< (IndexType Idx) const;
+    bool operator<= (IndexType Idx) const;
+
+  private:
+    bool operator>  (IndexType Idx) const;
+    bool operator>= (IndexType Idx) const;
+
+    unsigned Index;
+  };
+
+  // A range of indices, essentially a representation of a live range.
+  // This is also used to represent "dead ranges", i.e. ranges where a
+  // register is dead.
+  class IndexRange : public std::pair<IndexType,IndexType> {
+  public:
+    IndexRange() = default;
+    IndexRange(IndexType Start, IndexType End, bool F = false, bool T = false)
+      : std::pair<IndexType,IndexType>(Start, End), Fixed(F), TiedEnd(T) {}
+
+    IndexType start() const { return first; }
+    IndexType end() const   { return second; }
+
+    bool operator< (const IndexRange &A) const {
+      return start() < A.start();
+    }
+
+    bool overlaps(const IndexRange &A) const;
+    bool contains(const IndexRange &A) const;
+    void merge(const IndexRange &A);
+
+    bool Fixed = false;   // Can be renamed?  "Fixed" means "no".
+    bool TiedEnd = false; // The end is not a use, but a dead def tied to a use.
+
+  private:
+    void setStart(const IndexType &S) { first = S; }
+    void setEnd(const IndexType &E)   { second = E; }
+  };
+
+  // A list of index ranges. This represents liveness of a register
+  // in a basic block.
+  class RangeList : public std::vector<IndexRange> {
+  public:
+    void add(IndexType Start, IndexType End, bool Fixed, bool TiedEnd) {
+      push_back(IndexRange(Start, End, Fixed, TiedEnd));
+    }
+    void add(const IndexRange &Range) {
+      push_back(Range);
+    }
+
+    void include(const RangeList &RL);
+    void unionize(bool MergeAdjacent = false);
+    void subtract(const IndexRange &Range);
+
+  private:
+    void addsub(const IndexRange &A, const IndexRange &B);
+  };
+
+  class InstrIndexMap {
+  public:
+    InstrIndexMap(MachineBasicBlock &B);
+
+    MachineInstr *getInstr(IndexType Idx) const;
+    IndexType getIndex(MachineInstr *MI) const;
+    MachineBasicBlock &getBlock() const { return Block; }
+    IndexType getPrevIndex(IndexType Idx) const;
+    IndexType getNextIndex(IndexType Idx) const;
+    void replaceInstr(MachineInstr *OldMI, MachineInstr *NewMI);
+
+    friend raw_ostream &operator<< (raw_ostream &OS, const InstrIndexMap &Map);
+
+    IndexType First, Last;
+
+  private:
+    MachineBasicBlock &Block;
+    std::map<IndexType,MachineInstr*> Map;
+  };
+
+  typedef std::map<RegisterRef,RangeList> RegToRangeMap;
+  RegToRangeMap computeLiveMap(InstrIndexMap &IndexMap);
+  RegToRangeMap computeDeadMap(InstrIndexMap &IndexMap, RegToRangeMap &LiveMap);
+  static RegisterSet expandToSubRegs(RegisterRef R,
+      const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI);
+
+  struct PrintRangeMap {
+    PrintRangeMap(const RegToRangeMap &M, const TargetRegisterInfo &I)
+        : Map(M), TRI(I) {}
+
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintRangeMap &P);
+
+  private:
+    const RegToRangeMap &Map;
+    const TargetRegisterInfo &TRI;
+  };
+
+private:
+  RegisterSet getLiveIns(const MachineBasicBlock &B,
+      const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI);
+
+  void computeInitialLiveRanges(InstrIndexMap &IndexMap,
+      RegToRangeMap &LiveMap);
+
+  MachineFunction &MF;
+  const HexagonSubtarget &HST;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  BitVector Reserved;
+};
+
+inline HexagonBlockRanges::IndexType::operator unsigned() const {
+  assert(Index >= First);
+  return Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (unsigned x) const {
+  return Index == x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator== (IndexType Idx) const {
+  return Index == Idx.Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (unsigned x) const {
+  return Index != x;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator!= (IndexType Idx) const {
+  return Index != Idx.Index;
+}
+
+inline
+HexagonBlockRanges::IndexType HexagonBlockRanges::IndexType::operator++ () {
+  assert(Index != None);
+  assert(Index != Exit);
+  if (Index == Entry)
+    Index = First;
+  else
+    ++Index;
+  return *this;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (unsigned Idx) const {
+  return operator< (IndexType(Idx));
+}
+
+inline bool HexagonBlockRanges::IndexType::operator< (IndexType Idx) const {
+  // !(x < x).
+  if (Index == Idx.Index)
+    return false;
+  // !(None < x) for all x.
+  // !(x < None) for all x.
+  if (Index == None || Idx.Index == None)
+    return false;
+  // !(Exit < x) for all x.
+  // !(x < Entry) for all x.
+  if (Index == Exit || Idx.Index == Entry)
+    return false;
+  // Entry < x for all x != Entry.
+  // x < Exit for all x != Exit.
+  if (Index == Entry || Idx.Index == Exit)
+    return true;
+
+  return Index < Idx.Index;
+}
+
+inline bool HexagonBlockRanges::IndexType::operator<= (IndexType Idx) const {
+  return operator==(Idx) || operator<(Idx);
+}
+
+raw_ostream &operator<< (raw_ostream &OS, HexagonBlockRanges::IndexType Idx);
+raw_ostream &operator<< (raw_ostream &OS,
+      const HexagonBlockRanges::IndexRange &IR);
+raw_ostream &operator<< (raw_ostream &OS,
+      const HexagonBlockRanges::RangeList &RL);
+raw_ostream &operator<< (raw_ostream &OS,
+      const HexagonBlockRanges::InstrIndexMap &M);
+raw_ostream &operator<< (raw_ostream &OS,
+      const HexagonBlockRanges::PrintRangeMap &P);
+
+} // end namespace llvm
+
+#endif // HEXAGON_BLOCK_RANGES_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
new file mode 100644
index 000000000000..84af4b14b9f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -0,0 +1,219 @@
+//===--- HexagonBranchRelaxation.cpp - Identify and relax long jumps ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-brelax"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+
+using namespace llvm;
+
+// Since we have no exact knowledge of code layout, allow some safety buffer
+// for jump target. This is measured in bytes.
+static cl::opt<uint32_t> BranchRelaxSafetyBuffer("branch-relax-safety-buffer",
+  cl::init(200), cl::Hidden, cl::ZeroOrMore, cl::desc("safety buffer size"));
+
+namespace llvm {
+
+  FunctionPass *createHexagonBranchRelaxation();
+  void initializeHexagonBranchRelaxationPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  struct HexagonBranchRelaxation : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonBranchRelaxation() : MachineFunctionPass(ID) {
+      initializeHexagonBranchRelaxationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    StringRef getPassName() const override {
+      return "Hexagon Branch Relaxation";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    const HexagonInstrInfo *HII;
+    const HexagonRegisterInfo *HRI;
+
+    bool relaxBranches(MachineFunction &MF);
+    void computeOffset(MachineFunction &MF,
+          DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+    bool reGenerateBranch(MachineFunction &MF,
+          DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+    bool isJumpOutOfRange(MachineInstr &MI,
+          DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset);
+  };
+
+  char HexagonBranchRelaxation::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonBranchRelaxation, "hexagon-brelax",
+                "Hexagon Branch Relaxation", false, false)
+
+FunctionPass *llvm::createHexagonBranchRelaxation() {
+  return new HexagonBranchRelaxation();
+}
+
+bool HexagonBranchRelaxation::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "****** Hexagon Branch Relaxation ******\n");
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  HII = HST.getInstrInfo();
+  HRI = HST.getRegisterInfo();
+
+  bool Changed = false;
+  Changed = relaxBranches(MF);
+  return Changed;
+}
+
+void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
+      DenseMap<MachineBasicBlock*, unsigned> &OffsetMap) {
+  // offset of the current instruction from the start.
+  unsigned InstOffset = 0;
+  for (auto &B : MF) {
+    if (B.getAlignment()) {
+      // Although we don't know the exact layout of the final code, we need
+      // to account for alignment padding somehow. This heuristic pads each
+      // aligned basic block according to the alignment value.
+      int ByteAlign = (1u << B.getAlignment()) - 1;
+      InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
+    }
+    OffsetMap[&B] = InstOffset;
+    for (auto &MI : B.instrs())
+      InstOffset += HII->getSize(MI);
+  }
+}
+
+/// relaxBranches - For Hexagon, if the jump target/loop label is too far from
+/// the jump/loop instruction then, we need to make sure that we have constant
+/// extenders set for jumps and loops.
+
+/// There are six iterations in this phase. It's self explanatory below.
+bool HexagonBranchRelaxation::relaxBranches(MachineFunction &MF) {
+  // Compute the offset of each basic block
+  // offset of the current instruction from the start.
+  // map for each instruction to the beginning of the function
+  DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset;
+  computeOffset(MF, BlockToInstOffset);
+
+  return reGenerateBranch(MF, BlockToInstOffset);
+}
+
+/// Check if a given instruction is:
+/// - a jump to a distant target
+/// - that exceeds its immediate range
+/// If both conditions are true, it requires constant extension.
+bool HexagonBranchRelaxation::isJumpOutOfRange(MachineInstr &MI,
+      DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+  MachineBasicBlock &B = *MI.getParent();
+  auto FirstTerm = B.getFirstInstrTerminator();
+  if (FirstTerm == B.instr_end())
+    return false;
+
+  unsigned InstOffset = BlockToInstOffset[&B];
+  unsigned Distance = 0;
+
+  // To save time, estimate exact position of a branch instruction
+  // as one at the end of the MBB.
+  // Number of instructions times typical instruction size.
+  InstOffset += HII->nonDbgBBSize(&B) * HEXAGON_INSTR_SIZE;
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+
+  // Try to analyze this branch.
+  if (HII->analyzeBranch(B, TBB, FBB, Cond, false)) {
+    // Could not analyze it. See if this is something we can recognize.
+    // If it is a NVJ, it should always have its target in
+    // a fixed location.
+    if (HII->isNewValueJump(*FirstTerm))
+      TBB = FirstTerm->getOperand(HII->getCExtOpNum(*FirstTerm)).getMBB();
+  }
+  if (TBB && &MI == &*FirstTerm) {
+    Distance = std::abs((long long)InstOffset - BlockToInstOffset[TBB])
+                + BranchRelaxSafetyBuffer;
+    return !HII->isJumpWithinBranchRange(*FirstTerm, Distance);
+  }
+  if (FBB) {
+    // Look for second terminator.
+    auto SecondTerm = std::next(FirstTerm);
+    assert(SecondTerm != B.instr_end() &&
+          (SecondTerm->isBranch() || SecondTerm->isCall()) &&
+          "Bad second terminator");
+    if (&MI != &*SecondTerm)
+      return false;
+    // Analyze the second branch in the BB.
+    Distance = std::abs((long long)InstOffset - BlockToInstOffset[FBB])
+                + BranchRelaxSafetyBuffer;
+    return !HII->isJumpWithinBranchRange(*SecondTerm, Distance);
+  }
+  return false;
+}
+
+bool HexagonBranchRelaxation::reGenerateBranch(MachineFunction &MF,
+      DenseMap<MachineBasicBlock*, unsigned> &BlockToInstOffset) {
+  bool Changed = false;
+
+  for (auto &B : MF) {
+    for (auto &MI : B) {
+      if (!MI.isBranch() || !isJumpOutOfRange(MI, BlockToInstOffset))
+        continue;
+      DEBUG(dbgs() << "Long distance jump. isExtendable("
+                   << HII->isExtendable(MI) << ") isConstExtended("
+                   << HII->isConstExtended(MI) << ") " << MI);
+
+      // Since we have not merged HW loops relaxation into
+      // this code (yet), soften our approach for the moment.
+      if (!HII->isExtendable(MI) && !HII->isExtended(MI)) {
+        DEBUG(dbgs() << "\tUnderimplemented relax branch instruction.\n");
+      } else {
+        // Find which operand is expandable.
+        int ExtOpNum = HII->getCExtOpNum(MI);
+        MachineOperand &MO = MI.getOperand(ExtOpNum);
+        // This need to be something we understand. So far we assume all
+        // branches have only MBB address as expandable field.
+        // If it changes, this will need to be expanded.
+        assert(MO.isMBB() && "Branch with unknown expandable field type");
+        // Mark given operand as extended.
+        MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
new file mode 100644
index 000000000000..2f8fe6e087f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -0,0 +1,253 @@
+//===-- HexagonCFGOptimizer.cpp - CFG optimizations -----------------------===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon_cfg"
+
+namespace llvm {
+  FunctionPass *createHexagonCFGOptimizer();
+  void initializeHexagonCFGOptimizerPass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCFGOptimizer : public MachineFunctionPass {
+
+private:
+  void InvertAndChangeJumpTarget(MachineInstr &, MachineBasicBlock *);
+
+public:
+  static char ID;
+  HexagonCFGOptimizer() : MachineFunctionPass(ID) {
+    initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Hexagon CFG Optimizer"; }
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+};
+
+
+char HexagonCFGOptimizer::ID = 0;
+
+static bool IsConditionalBranch(int Opc) {
+  switch (Opc) {
+    case Hexagon::J2_jumpt:
+    case Hexagon::J2_jumptpt:
+    case Hexagon::J2_jumpf:
+    case Hexagon::J2_jumpfpt:
+    case Hexagon::J2_jumptnew:
+    case Hexagon::J2_jumpfnew:
+    case Hexagon::J2_jumptnewpt:
+    case Hexagon::J2_jumpfnewpt:
+      return true;
+  }
+  return false;
+}
+
+
+static bool IsUnconditionalJump(int Opc) {
+  return (Opc == Hexagon::J2_jump);
+}
+
+void HexagonCFGOptimizer::InvertAndChangeJumpTarget(
+    MachineInstr &MI, MachineBasicBlock *NewTarget) {
+  const TargetInstrInfo *TII =
+      MI.getParent()->getParent()->getSubtarget().getInstrInfo();
+  int NewOpcode = 0;
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_jumpt:
+    NewOpcode = Hexagon::J2_jumpf;
+    break;
+
+  case Hexagon::J2_jumpf:
+    NewOpcode = Hexagon::J2_jumpt;
+    break;
+
+  case Hexagon::J2_jumptnewpt:
+    NewOpcode = Hexagon::J2_jumpfnewpt;
+    break;
+
+  case Hexagon::J2_jumpfnewpt:
+    NewOpcode = Hexagon::J2_jumptnewpt;
+    break;
+
+  default:
+    llvm_unreachable("Cannot handle this case");
+  }
+
+  MI.setDesc(TII->get(NewOpcode));
+  MI.getOperand(1).setMBB(NewTarget);
+}
+
+
+bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  if (skipFunction(*Fn.getFunction()))
+    return false;
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock *MBB = &*MBBb;
+
+    // Traverse the basic block.
+    MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
+    if (MII != MBB->end()) {
+      MachineInstr &MI = *MII;
+      int Opc = MI.getOpcode();
+      if (IsConditionalBranch(Opc)) {
+
+        //
+        // (Case 1) Transform the code if the following condition occurs:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...next block in layout is BB3...
+        //   BB3: ...
+        //
+        //  Transform this to:
+        //  BB1: if (!p0) jump BB4
+        //  Remove BB2
+        //  BB3: ...
+        //
+        // (Case 2) A variation occurs when BB3 contains a JMP to BB4:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...other basic blocks ...
+        //   BB4:
+        //   ...not a fall-thru
+        //   BB3: ...
+        //     jump BB4
+        //
+        // Transform this to:
+        //   BB1: if (!p0) jump BB4
+        //   Remove BB2
+        //   BB3: ...
+        //   BB4: ...
+        //
+        unsigned NumSuccs = MBB->succ_size();
+        MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
+        MachineBasicBlock* FirstSucc = *SI;
+        MachineBasicBlock* SecondSucc = *(++SI);
+        MachineBasicBlock* LayoutSucc = nullptr;
+        MachineBasicBlock* JumpAroundTarget = nullptr;
+
+        if (MBB->isLayoutSuccessor(FirstSucc)) {
+          LayoutSucc = FirstSucc;
+          JumpAroundTarget = SecondSucc;
+        } else if (MBB->isLayoutSuccessor(SecondSucc)) {
+          LayoutSucc = SecondSucc;
+          JumpAroundTarget = FirstSucc;
+        } else {
+          // Odd case...cannot handle.
+        }
+
+        // The target of the unconditional branch must be JumpAroundTarget.
+        // TODO: If not, we should not invert the unconditional branch.
+        MachineBasicBlock* CondBranchTarget = nullptr;
+        if (MI.getOpcode() == Hexagon::J2_jumpt ||
+            MI.getOpcode() == Hexagon::J2_jumpf) {
+          CondBranchTarget = MI.getOperand(1).getMBB();
+        }
+
+        if (!LayoutSucc || (CondBranchTarget != JumpAroundTarget)) {
+          continue;
+        }
+
+        if ((NumSuccs == 2) && LayoutSucc && (LayoutSucc->pred_size() == 1)) {
+
+          // Ensure that BB2 has one instruction -- an unconditional jump.
+          if ((LayoutSucc->size() == 1) &&
+              IsUnconditionalJump(LayoutSucc->front().getOpcode())) {
+            assert(JumpAroundTarget && "jump target is needed to process second basic block");
+            MachineBasicBlock* UncondTarget =
+              LayoutSucc->front().getOperand(0).getMBB();
+            // Check if the layout successor of BB2 is BB3.
+            bool case1 = LayoutSucc->isLayoutSuccessor(JumpAroundTarget);
+            bool case2 = JumpAroundTarget->isSuccessor(UncondTarget) &&
+              JumpAroundTarget->size() >= 1 &&
+              IsUnconditionalJump(JumpAroundTarget->back().getOpcode()) &&
+              JumpAroundTarget->pred_size() == 1 &&
+              JumpAroundTarget->succ_size() == 1;
+
+            if (case1 || case2) {
+              InvertAndChangeJumpTarget(MI, UncondTarget);
+              MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
+
+              // Remove the unconditional branch in LayoutSucc.
+              LayoutSucc->erase(LayoutSucc->begin());
+              LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget);
+
+              // This code performs the conversion for case 2, which moves
+              // the block to the fall-thru case (BB3 in the code above).
+              if (case2 && !case1) {
+                JumpAroundTarget->moveAfter(LayoutSucc);
+                // only move a block if it doesn't have a fall-thru. otherwise
+                // the CFG will be incorrect.
+                if (!UncondTarget->canFallThrough()) {
+                  UncondTarget->moveAfter(JumpAroundTarget);
+                }
+              }
+
+              //
+              // Correct live-in information. Is used by post-RA scheduler
+              // The live-in to LayoutSucc is now all values live-in to
+              // JumpAroundTarget.
+              //
+              std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn(
+                  LayoutSucc->livein_begin(), LayoutSucc->livein_end());
+              std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn(
+                  JumpAroundTarget->livein_begin(),
+                  JumpAroundTarget->livein_end());
+              for (const auto &OrigLI : OrigLiveIn)
+                LayoutSucc->removeLiveIn(OrigLI.PhysReg);
+              for (const auto &NewLI : NewLiveIn)
+                LayoutSucc->addLiveIn(NewLI);
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS(HexagonCFGOptimizer, "hexagon-cfg", "Hexagon CFG Optimizer",
+                false, false)
+
+FunctionPass *llvm::createHexagonCFGOptimizer() {
+  return new HexagonCFGOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 000000000000..e61b2a7a58ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,35 @@
+//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Hexagon architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Hexagon 32-bit C return-value convention.
+def RetCC_Hexagon32 : CallingConv<[
+  CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Hexagon 32-bit C Calling convention.
+def CC_Hexagon32 : CallingConv<[
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
new file mode 100644
index 000000000000..489da6be923d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -0,0 +1,1304 @@
+//===--- HexagonCommonGEP.cpp ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "commgep"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> OptSpeculate("commgep-speculate", cl::init(true),
+  cl::Hidden, cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableInv("commgep-inv", cl::init(true), cl::Hidden,
+  cl::ZeroOrMore);
+
+static cl::opt<bool> OptEnableConst("commgep-const", cl::init(true),
+  cl::Hidden, cl::ZeroOrMore);
+
+namespace llvm {
+
+  void initializeHexagonCommonGEPPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  struct GepNode;
+  typedef std::set<GepNode*> NodeSet;
+  typedef std::map<GepNode*,Value*> NodeToValueMap;
+  typedef std::vector<GepNode*> NodeVect;
+  typedef std::map<GepNode*,NodeVect> NodeChildrenMap;
+  typedef std::set<Use*> UseSet;
+  typedef std::map<GepNode*,UseSet> NodeToUsesMap;
+
+  // Numbering map for gep nodes. Used to keep track of ordering for
+  // gep nodes.
+  struct NodeOrdering {
+    NodeOrdering() = default;
+
+    void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); }
+    void clear() { Map.clear(); }
+
+    bool operator()(const GepNode *N1, const GepNode *N2) const {
+      auto F1 = Map.find(N1), F2 = Map.find(N2);
+      assert(F1 != Map.end() && F2 != Map.end());
+      return F1->second < F2->second;
+    }
+
+  private:
+    std::map<const GepNode *, unsigned> Map;
+    unsigned LastNum = 0;
+  };
+
+  class HexagonCommonGEP : public FunctionPass {
+  public:
+    static char ID;
+
+    HexagonCommonGEP() : FunctionPass(ID) {
+      initializeHexagonCommonGEPPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+    StringRef getPassName() const override { return "Hexagon Common GEP"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<PostDominatorTreeWrapperPass>();
+      AU.addPreserved<PostDominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    typedef std::map<Value*,GepNode*> ValueToNodeMap;
+    typedef std::vector<Value*> ValueVect;
+    typedef std::map<GepNode*,ValueVect> NodeToValuesMap;
+
+    void getBlockTraversalOrder(BasicBlock *Root, ValueVect &Order);
+    bool isHandledGepForm(GetElementPtrInst *GepI);
+    void processGepInst(GetElementPtrInst *GepI, ValueToNodeMap &NM);
+    void collect();
+    void common();
+
+    BasicBlock *recalculatePlacement(GepNode *Node, NodeChildrenMap &NCM,
+                                     NodeToValueMap &Loc);
+    BasicBlock *recalculatePlacementRec(GepNode *Node, NodeChildrenMap &NCM,
+                                        NodeToValueMap &Loc);
+    bool isInvariantIn(Value *Val, Loop *L);
+    bool isInvariantIn(GepNode *Node, Loop *L);
+    bool isInMainPath(BasicBlock *B, Loop *L);
+    BasicBlock *adjustForInvariance(GepNode *Node, NodeChildrenMap &NCM,
+                                    NodeToValueMap &Loc);
+    void separateChainForNode(GepNode *Node, Use *U, NodeToValueMap &Loc);
+    void separateConstantChains(GepNode *Node, NodeChildrenMap &NCM,
+                                NodeToValueMap &Loc);
+    void computeNodePlacement(NodeToValueMap &Loc);
+
+    Value *fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+                        BasicBlock *LocB);
+    void getAllUsersForNode(GepNode *Node, ValueVect &Values,
+                            NodeChildrenMap &NCM);
+    void materialize(NodeToValueMap &Loc);
+
+    void removeDeadCode();
+
+    NodeVect Nodes;
+    NodeToUsesMap Uses;
+    NodeOrdering NodeOrder;   // Node ordering, for deterministic behavior.
+    SpecificBumpPtrAllocator<GepNode> *Mem;
+    LLVMContext *Ctx;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    PostDominatorTree *PDT;
+    Function *Fn;
+  };
+
+} // end anonymous namespace
+
+char HexagonCommonGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(HexagonCommonGEP, "hcommgep", "Hexagon Common GEP",
+      false, false)
+
+namespace {
+
+  struct GepNode {
+    enum {
+      None      = 0,
+      Root      = 0x01,
+      Internal  = 0x02,
+      Used      = 0x04
+    };
+
+    uint32_t Flags;
+    union {
+      GepNode *Parent;
+      Value *BaseVal;
+    };
+    Value *Idx;
+    Type *PTy;  // Type of the pointer operand.
+
+    GepNode() : Flags(0), Parent(nullptr), Idx(nullptr), PTy(nullptr) {}
+    GepNode(const GepNode *N) : Flags(N->Flags), Idx(N->Idx), PTy(N->PTy) {
+      if (Flags & Root)
+        BaseVal = N->BaseVal;
+      else
+        Parent = N->Parent;
+    }
+
+    friend raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN);
+  };
+
+  Type *next_type(Type *Ty, Value *Idx) {
+    if (auto *PTy = dyn_cast<PointerType>(Ty))
+      return PTy->getElementType();
+    // Advance the type.
+    if (!Ty->isStructTy()) {
+      Type *NexTy = cast<SequentialType>(Ty)->getElementType();
+      return NexTy;
+    }
+    // Otherwise it is a struct type.
+    ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+    assert(CI && "Struct type with non-constant index");
+    int64_t i = CI->getValue().getSExtValue();
+    Type *NextTy = cast<StructType>(Ty)->getElementType(i);
+    return NextTy;
+  }
+
+  raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) {
+    OS << "{ {";
+    bool Comma = false;
+    if (GN.Flags & GepNode::Root) {
+      OS << "root";
+      Comma = true;
+    }
+    if (GN.Flags & GepNode::Internal) {
+      if (Comma)
+        OS << ',';
+      OS << "internal";
+      Comma = true;
+    }
+    if (GN.Flags & GepNode::Used) {
+      if (Comma)
+        OS << ',';
+      OS << "used";
+    }
+    OS << "} ";
+    if (GN.Flags & GepNode::Root)
+      OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')';
+    else
+      OS << "Parent:" << GN.Parent;
+
+    OS << " Idx:";
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GN.Idx))
+      OS << CI->getValue().getSExtValue();
+    else if (GN.Idx->hasName())
+      OS << GN.Idx->getName();
+    else
+      OS << "<anon> =" << *GN.Idx;
+
+    OS << " PTy:";
+    if (GN.PTy->isStructTy()) {
+      StructType *STy = cast<StructType>(GN.PTy);
+      if (!STy->isLiteral())
+        OS << GN.PTy->getStructName();
+      else
+        OS << "<anon-struct>:" << *STy;
+    }
+    else
+      OS << *GN.PTy;
+    OS << " }";
+    return OS;
+  }
+
+  template <typename NodeContainer>
+  void dump_node_container(raw_ostream &OS, const NodeContainer &S) {
+    typedef typename NodeContainer::const_iterator const_iterator;
+    for (const_iterator I = S.begin(), E = S.end(); I != E; ++I)
+      OS << *I << ' ' << **I << '\n';
+  }
+
+  raw_ostream &operator<< (raw_ostream &OS,
+                           const NodeVect &S) LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<< (raw_ostream &OS, const NodeVect &S) {
+    dump_node_container(OS, S);
+    return OS;
+  }
+
+  raw_ostream &operator<< (raw_ostream &OS,
+                           const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){
+    typedef NodeToUsesMap::const_iterator const_iterator;
+    for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
+      const UseSet &Us = I->second;
+      OS << I->first << " -> #" << Us.size() << '{';
+      for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+        User *R = (*J)->getUser();
+        if (R->hasName())
+          OS << ' ' << R->getName();
+        else
+          OS << " <?>(" << *R << ')';
+      }
+      OS << " }\n";
+    }
+    return OS;
+  }
+
+  struct in_set {
+    in_set(const NodeSet &S) : NS(S) {}
+    bool operator() (GepNode *N) const {
+      return NS.find(N) != NS.end();
+    }
+
+  private:
+    const NodeSet &NS;
+  };
+
+} // end anonymous namespace
+
+inline void *operator new(size_t, SpecificBumpPtrAllocator<GepNode> &A) {
+  return A.Allocate();
+}
+
+void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
+      ValueVect &Order) {
+  // Compute block ordering for a typical DT-based traversal of the flow
+  // graph: "before visiting a block, all of its dominators must have been
+  // visited".
+
+  Order.push_back(Root);
+  DomTreeNode *DTN = DT->getNode(Root);
+  typedef GraphTraits<DomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType Iter;
+  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+    getBlockTraversalOrder((*I)->getBlock(), Order);
+}
+
+bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
+  // No vector GEPs.
+  if (!GepI->getType()->isPointerTy())
+    return false;
+  // No GEPs without any indices.  (Is this possible?)
+  if (GepI->idx_begin() == GepI->idx_end())
+    return false;
+  return true;
+}
+
+void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
+      ValueToNodeMap &NM) {
+  DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
+  GepNode *N = new (*Mem) GepNode;
+  Value *PtrOp = GepI->getPointerOperand();
+  ValueToNodeMap::iterator F = NM.find(PtrOp);
+  if (F == NM.end()) {
+    N->BaseVal = PtrOp;
+    N->Flags |= GepNode::Root;
+  } else {
+    // If PtrOp was a GEP instruction, it must have already been processed.
+    // The ValueToNodeMap entry for it is the last gep node in the generated
+    // chain. Link to it here.
+    N->Parent = F->second;
+  }
+  N->PTy = PtrOp->getType();
+  N->Idx = *GepI->idx_begin();
+
+  // Collect the list of users of this GEP instruction. Will add it to the
+  // last node created for it.
+  UseSet Us;
+  for (Value::user_iterator UI = GepI->user_begin(), UE = GepI->user_end();
+       UI != UE; ++UI) {
+    // Check if this gep is used by anything other than other geps that
+    // we will process.
+    if (isa<GetElementPtrInst>(*UI)) {
+      GetElementPtrInst *UserG = cast<GetElementPtrInst>(*UI);
+      if (isHandledGepForm(UserG))
+        continue;
+    }
+    Us.insert(&UI.getUse());
+  }
+  Nodes.push_back(N);
+  NodeOrder.insert(N);
+
+  // Skip the first index operand, since we only handle 0. This dereferences
+  // the pointer operand.
+  GepNode *PN = N;
+  Type *PtrTy = cast<PointerType>(PtrOp->getType())->getElementType();
+  for (User::op_iterator OI = GepI->idx_begin()+1, OE = GepI->idx_end();
+       OI != OE; ++OI) {
+    Value *Op = *OI;
+    GepNode *Nx = new (*Mem) GepNode;
+    Nx->Parent = PN;  // Link Nx to the previous node.
+    Nx->Flags |= GepNode::Internal;
+    Nx->PTy = PtrTy;
+    Nx->Idx = Op;
+    Nodes.push_back(Nx);
+    NodeOrder.insert(Nx);
+    PN = Nx;
+
+    PtrTy = next_type(PtrTy, Op);
+  }
+
+  // After last node has been created, update the use information.
+  if (!Us.empty()) {
+    PN->Flags |= GepNode::Used;
+    Uses[PN].insert(Us.begin(), Us.end());
+  }
+
+  // Link the last node with the originating GEP instruction. This is to
+  // help with linking chained GEP instructions.
+  NM.insert(std::make_pair(GepI, PN));
+}
+
+void HexagonCommonGEP::collect() {
+  // Establish depth-first traversal order of the dominator tree.
+  ValueVect BO;
+  getBlockTraversalOrder(&Fn->front(), BO);
+
+  // The creation of gep nodes requires DT-traversal. When processing a GEP
+  // instruction that uses another GEP instruction as the base pointer, the
+  // gep node for the base pointer should already exist.
+  ValueToNodeMap NM;
+  for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) {
+    BasicBlock *B = cast<BasicBlock>(*I);
+    for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) {
+      if (!isa<GetElementPtrInst>(J))
+        continue;
+      GetElementPtrInst *GepI = cast<GetElementPtrInst>(J);
+      if (isHandledGepForm(GepI))
+        processGepInst(GepI, NM);
+    }
+  }
+
+  DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
+}
+
+static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
+                              NodeVect &Roots) {
+    typedef NodeVect::const_iterator const_iterator;
+    for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+      GepNode *N = *I;
+      if (N->Flags & GepNode::Root) {
+        Roots.push_back(N);
+        continue;
+      }
+      GepNode *PN = N->Parent;
+      NCM[PN].push_back(N);
+    }
+}
+
+static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM,
+                           NodeSet &Nodes) {
+    NodeVect Work;
+    Work.push_back(Root);
+    Nodes.insert(Root);
+
+    while (!Work.empty()) {
+      NodeVect::iterator First = Work.begin();
+      GepNode *N = *First;
+      Work.erase(First);
+      NodeChildrenMap::iterator CF = NCM.find(N);
+      if (CF != NCM.end()) {
+        Work.insert(Work.end(), CF->second.begin(), CF->second.end());
+        Nodes.insert(CF->second.begin(), CF->second.end());
+      }
+    }
+}
+
+namespace {
+
+  typedef std::set<NodeSet> NodeSymRel;
+  typedef std::pair<GepNode*,GepNode*> NodePair;
+  typedef std::set<NodePair> NodePairSet;
+
+} // end anonymous namespace
+
+static const NodeSet *node_class(GepNode *N, NodeSymRel &Rel) {
+    for (NodeSymRel::iterator I = Rel.begin(), E = Rel.end(); I != E; ++I)
+      if (I->count(N))
+        return &*I;
+    return nullptr;
+}
+
+  // Create an ordered pair of GepNode pointers. The pair will be used in
+  // determining equality. The only purpose of the ordering is to eliminate
+  // duplication due to the commutativity of equality/non-equality.
+static NodePair node_pair(GepNode *N1, GepNode *N2) {
+    uintptr_t P1 = uintptr_t(N1), P2 = uintptr_t(N2);
+    if (P1 <= P2)
+      return std::make_pair(N1, N2);
+    return std::make_pair(N2, N1);
+}
+
+static unsigned node_hash(GepNode *N) {
+    // Include everything except flags and parent.
+    FoldingSetNodeID ID;
+    ID.AddPointer(N->Idx);
+    ID.AddPointer(N->PTy);
+    return ID.ComputeHash();
+}
+
+static bool node_eq(GepNode *N1, GepNode *N2, NodePairSet &Eq,
+                    NodePairSet &Ne) {
+    // Don't cache the result for nodes with different hashes. The hash
+    // comparison is fast enough.
+    if (node_hash(N1) != node_hash(N2))
+      return false;
+
+    NodePair NP = node_pair(N1, N2);
+    NodePairSet::iterator FEq = Eq.find(NP);
+    if (FEq != Eq.end())
+      return true;
+    NodePairSet::iterator FNe = Ne.find(NP);
+    if (FNe != Ne.end())
+      return false;
+    // Not previously compared.
+    bool Root1 = N1->Flags & GepNode::Root;
+    bool Root2 = N2->Flags & GepNode::Root;
+    NodePair P = node_pair(N1, N2);
+    // If the Root flag has different values, the nodes are different.
+    // If both nodes are root nodes, but their base pointers differ,
+    // they are different.
+    if (Root1 != Root2 || (Root1 && N1->BaseVal != N2->BaseVal)) {
+      Ne.insert(P);
+      return false;
+    }
+    // Here the root flags are identical, and for root nodes the
+    // base pointers are equal, so the root nodes are equal.
+    // For non-root nodes, compare their parent nodes.
+    if (Root1 || node_eq(N1->Parent, N2->Parent, Eq, Ne)) {
+      Eq.insert(P);
+      return true;
+    }
+    return false;
+}
+
+void HexagonCommonGEP::common() {
+  // The essence of this commoning is finding gep nodes that are equal.
+  // To do this we need to compare all pairs of nodes. To save time,
+  // first, partition the set of all nodes into sets of potentially equal
+  // nodes, and then compare pairs from within each partition.
+  typedef std::map<unsigned,NodeSet> NodeSetMap;
+  NodeSetMap MaybeEq;
+
+  for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    GepNode *N = *I;
+    unsigned H = node_hash(N);
+    MaybeEq[H].insert(N);
+  }
+
+  // Compute the equivalence relation for the gep nodes.  Use two caches,
+  // one for equality and the other for non-equality.
+  NodeSymRel EqRel;  // Equality relation (as set of equivalence classes).
+  NodePairSet Eq, Ne;  // Caches.
+  for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
+       I != E; ++I) {
+    NodeSet &S = I->second;
+    for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
+      GepNode *N = *NI;
+      // If node already has a class, then the class must have been created
+      // in a prior iteration of this loop. Since equality is transitive,
+      // nothing more will be added to that class, so skip it.
+      if (node_class(N, EqRel))
+        continue;
+
+      // Create a new class candidate now.
+      NodeSet C;
+      for (NodeSet::iterator NJ = std::next(NI); NJ != NE; ++NJ)
+        if (node_eq(N, *NJ, Eq, Ne))
+          C.insert(*NJ);
+      // If Tmp is empty, N would be the only element in it. Don't bother
+      // creating a class for it then.
+      if (!C.empty()) {
+        C.insert(N);  // Finalize the set before adding it to the relation.
+        std::pair<NodeSymRel::iterator, bool> Ins = EqRel.insert(C);
+        (void)Ins;
+        assert(Ins.second && "Cannot add a class");
+      }
+    }
+  }
+
+  DEBUG({
+    dbgs() << "Gep node equality:\n";
+    for (NodePairSet::iterator I = Eq.begin(), E = Eq.end(); I != E; ++I)
+      dbgs() << "{ " << I->first << ", " << I->second << " }\n";
+
+    dbgs() << "Gep equivalence classes:\n";
+    for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+      dbgs() << '{';
+      const NodeSet &S = *I;
+      for (NodeSet::const_iterator J = S.begin(), F = S.end(); J != F; ++J) {
+        if (J != S.begin())
+          dbgs() << ',';
+        dbgs() << ' ' << *J;
+      }
+      dbgs() << " }\n";
+    }
+  });
+
+  // Create a projection from a NodeSet to the minimal element in it.
+  typedef std::map<const NodeSet*,GepNode*> ProjMap;
+  ProjMap PM;
+  for (NodeSymRel::iterator I = EqRel.begin(), E = EqRel.end(); I != E; ++I) {
+    const NodeSet &S = *I;
+    GepNode *Min = *std::min_element(S.begin(), S.end(), NodeOrder);
+    std::pair<ProjMap::iterator,bool> Ins = PM.insert(std::make_pair(&S, Min));
+    (void)Ins;
+    assert(Ins.second && "Cannot add minimal element");
+
+    // Update the min element's flags, and user list.
+    uint32_t Flags = 0;
+    UseSet &MinUs = Uses[Min];
+    for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
+      GepNode *N = *J;
+      uint32_t NF = N->Flags;
+      // If N is used, append all original values of N to the list of
+      // original values of Min.
+      if (NF & GepNode::Used)
+        MinUs.insert(Uses[N].begin(), Uses[N].end());
+      Flags |= NF;
+    }
+    if (MinUs.empty())
+      Uses.erase(Min);
+
+    // The collected flags should include all the flags from the min element.
+    assert((Min->Flags & Flags) == Min->Flags);
+    Min->Flags = Flags;
+  }
+
+  // Commoning: for each non-root gep node, replace "Parent" with the
+  // selected (minimum) node from the corresponding equivalence class.
+  // If a given parent does not have an equivalence class, leave it
+  // unchanged (it means that it's the only element in its class).
+  for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    GepNode *N = *I;
+    if (N->Flags & GepNode::Root)
+      continue;
+    const NodeSet *PC = node_class(N->Parent, EqRel);
+    if (!PC)
+      continue;
+    ProjMap::iterator F = PM.find(PC);
+    if (F == PM.end())
+      continue;
+    // Found a replacement, use it.
+    GepNode *Rep = F->second;
+    N->Parent = Rep;
+  }
+
+  DEBUG(dbgs() << "Gep nodes after commoning:\n" << Nodes);
+
+  // Finally, erase the nodes that are no longer used.
+  NodeSet Erase;
+  for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
+    GepNode *N = *I;
+    const NodeSet *PC = node_class(N, EqRel);
+    if (!PC)
+      continue;
+    ProjMap::iterator F = PM.find(PC);
+    if (F == PM.end())
+      continue;
+    if (N == F->second)
+      continue;
+    // Node for removal.
+    Erase.insert(*I);
+  }
+  NodeVect::iterator NewE = remove_if(Nodes, in_set(Erase));
+  Nodes.resize(std::distance(Nodes.begin(), NewE));
+
+  DEBUG(dbgs() << "Gep nodes after post-commoning cleanup:\n" << Nodes);
+}
+
+template <typename T>
+static BasicBlock *nearest_common_dominator(DominatorTree *DT, T &Blocks) {
+    DEBUG({
+      dbgs() << "NCD of {";
+      for (typename T::iterator I = Blocks.begin(), E = Blocks.end();
+           I != E; ++I) {
+        if (!*I)
+          continue;
+        BasicBlock *B = cast<BasicBlock>(*I);
+        dbgs() << ' ' << B->getName();
+      }
+      dbgs() << " }\n";
+    });
+
+    // Allow null basic blocks in Blocks.  In such cases, return nullptr.
+    typename T::iterator I = Blocks.begin(), E = Blocks.end();
+    if (I == E || !*I)
+      return nullptr;
+    BasicBlock *Dom = cast<BasicBlock>(*I);
+    while (++I != E) {
+      BasicBlock *B = cast_or_null<BasicBlock>(*I);
+      Dom = B ? DT->findNearestCommonDominator(Dom, B) : nullptr;
+      if (!Dom)
+        return nullptr;
+    }
+    DEBUG(dbgs() << "computed:" << Dom->getName() << '\n');
+    return Dom;
+}
+
+template <typename T>
+static BasicBlock *nearest_common_dominatee(DominatorTree *DT, T &Blocks) {
+    // If two blocks, A and B, dominate a block C, then A dominates B,
+    // or B dominates A.
+    typename T::iterator I = Blocks.begin(), E = Blocks.end();
+    // Find the first non-null block.
+    while (I != E && !*I)
+      ++I;
+    if (I == E)
+      return DT->getRoot();
+    BasicBlock *DomB = cast<BasicBlock>(*I);
+    while (++I != E) {
+      if (!*I)
+        continue;
+      BasicBlock *B = cast<BasicBlock>(*I);
+      if (DT->dominates(B, DomB))
+        continue;
+      if (!DT->dominates(DomB, B))
+        return nullptr;
+      DomB = B;
+    }
+    return DomB;
+}
+
+// Find the first use in B of any value from Values. If no such use,
+// return B->end().
+template <typename T>
+static BasicBlock::iterator first_use_of_in_block(T &Values, BasicBlock *B) {
+    BasicBlock::iterator FirstUse = B->end(), BEnd = B->end();
+    typedef typename T::iterator iterator;
+    for (iterator I = Values.begin(), E = Values.end(); I != E; ++I) {
+      Value *V = *I;
+      // If V is used in a PHI node, the use belongs to the incoming block,
+      // not the block with the PHI node. In the incoming block, the use
+      // would be considered as being at the end of it, so it cannot
+      // influence the position of the first use (which is assumed to be
+      // at the end to start with).
+      if (isa<PHINode>(V))
+        continue;
+      if (!isa<Instruction>(V))
+        continue;
+      Instruction *In = cast<Instruction>(V);
+      if (In->getParent() != B)
+        continue;
+      BasicBlock::iterator It = In->getIterator();
+      if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd))
+        FirstUse = It;
+    }
+    return FirstUse;
+}
+
+static bool is_empty(const BasicBlock *B) {
+    return B->empty() || (&*B->begin() == B->getTerminator());
+}
+
+BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  DEBUG(dbgs() << "Loc for node:" << Node << '\n');
+  // Recalculate the placement for Node, assuming that the locations of
+  // its children in Loc are valid.
+  // Return nullptr if there is no valid placement for Node (for example, it
+  // uses an index value that is not available at the location required
+  // to dominate all children, etc.).
+
+  // Find the nearest common dominator for:
+  // - all users, if the node is used, and
+  // - all children.
+  ValueVect Bs;
+  if (Node->Flags & GepNode::Used) {
+    // Append all blocks with uses of the original values to the
+    // block vector Bs.
+    NodeToUsesMap::iterator UF = Uses.find(Node);
+    assert(UF != Uses.end() && "Used node with no use information");
+    UseSet &Us = UF->second;
+    for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
+      Use *U = *I;
+      User *R = U->getUser();
+      if (!isa<Instruction>(R))
+        continue;
+      BasicBlock *PB = isa<PHINode>(R)
+          ? cast<PHINode>(R)->getIncomingBlock(*U)
+          : cast<Instruction>(R)->getParent();
+      Bs.push_back(PB);
+    }
+  }
+  // Append the location of each child.
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end()) {
+    NodeVect &Cs = CF->second;
+    for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
+      GepNode *CN = *I;
+      NodeToValueMap::iterator LF = Loc.find(CN);
+      // If the child is only used in GEP instructions (i.e. is not used in
+      // non-GEP instructions), the nearest dominator computed for it may
+      // have been null. In such case it won't have a location available.
+      if (LF == Loc.end())
+        continue;
+      Bs.push_back(LF->second);
+    }
+  }
+
+  BasicBlock *DomB = nearest_common_dominator(DT, Bs);
+  if (!DomB)
+    return nullptr;
+  // Check if the index used by Node dominates the computed dominator.
+  Instruction *IdxI = dyn_cast<Instruction>(Node->Idx);
+  if (IdxI && !DT->dominates(IdxI->getParent(), DomB))
+    return nullptr;
+
+  // Avoid putting nodes into empty blocks.
+  while (is_empty(DomB)) {
+    DomTreeNode *N = (*DT)[DomB]->getIDom();
+    if (!N)
+      break;
+    DomB = N->getBlock();
+  }
+
+  // Otherwise, DomB is fine. Update the location map.
+  Loc[Node] = DomB;
+  return DomB;
+}
+
+BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  DEBUG(dbgs() << "LocRec begin for node:" << Node << '\n');
+  // Recalculate the placement of Node, after recursively recalculating the
+  // placements of all its children.
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end()) {
+    NodeVect &Cs = CF->second;
+    for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+      recalculatePlacementRec(*I, NCM, Loc);
+  }
+  BasicBlock *LB = recalculatePlacement(Node, NCM, Loc);
+  DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
+  return LB;
+}
+
+bool HexagonCommonGEP::isInvariantIn(Value *Val, Loop *L) {
+  if (isa<Constant>(Val) || isa<Argument>(Val))
+    return true;
+  Instruction *In = dyn_cast<Instruction>(Val);
+  if (!In)
+    return false;
+  BasicBlock *HdrB = L->getHeader(), *DefB = In->getParent();
+  return DT->properlyDominates(DefB, HdrB);
+}
+
+bool HexagonCommonGEP::isInvariantIn(GepNode *Node, Loop *L) {
+  if (Node->Flags & GepNode::Root)
+    if (!isInvariantIn(Node->BaseVal, L))
+      return false;
+  return isInvariantIn(Node->Idx, L);
+}
+
+bool HexagonCommonGEP::isInMainPath(BasicBlock *B, Loop *L) {
+  BasicBlock *HB = L->getHeader();
+  BasicBlock *LB = L->getLoopLatch();
+  // B must post-dominate the loop header or dominate the loop latch.
+  if (PDT->dominates(B, HB))
+    return true;
+  if (LB && DT->dominates(B, LB))
+    return true;
+  return false;
+}
+
+static BasicBlock *preheader(DominatorTree *DT, Loop *L) {
+  if (BasicBlock *PH = L->getLoopPreheader())
+    return PH;
+  if (!OptSpeculate)
+    return nullptr;
+  DomTreeNode *DN = DT->getNode(L->getHeader());
+  if (!DN)
+    return nullptr;
+  return DN->getIDom()->getBlock();
+}
+
+BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  // Find the "topmost" location for Node: it must be dominated by both,
+  // its parent (or the BaseVal, if it's a root node), and by the index
+  // value.
+  ValueVect Bs;
+  if (Node->Flags & GepNode::Root) {
+    if (Instruction *PIn = dyn_cast<Instruction>(Node->BaseVal))
+      Bs.push_back(PIn->getParent());
+  } else {
+    Bs.push_back(Loc[Node->Parent]);
+  }
+  if (Instruction *IIn = dyn_cast<Instruction>(Node->Idx))
+    Bs.push_back(IIn->getParent());
+  BasicBlock *TopB = nearest_common_dominatee(DT, Bs);
+
+  // Traverse the loop nest upwards until we find a loop in which Node
+  // is no longer invariant, or until we get to the upper limit of Node's
+  // placement. The traversal will also stop when a suitable "preheader"
+  // cannot be found for a given loop. The "preheader" may actually be
+  // a regular block outside of the loop (i.e. not guarded), in which case
+  // the Node will be speculated.
+  // For nodes that are not in the main path of the containing loop (i.e.
+  // are not executed in each iteration), do not move them out of the loop.
+  BasicBlock *LocB = cast_or_null<BasicBlock>(Loc[Node]);
+  if (LocB) {
+    Loop *Lp = LI->getLoopFor(LocB);
+    while (Lp) {
+      if (!isInvariantIn(Node, Lp) || !isInMainPath(LocB, Lp))
+        break;
+      BasicBlock *NewLoc = preheader(DT, Lp);
+      if (!NewLoc || !DT->dominates(TopB, NewLoc))
+        break;
+      Lp = Lp->getParentLoop();
+      LocB = NewLoc;
+    }
+  }
+  Loc[Node] = LocB;
+
+  // Recursively compute the locations of all children nodes.
+  NodeChildrenMap::iterator CF = NCM.find(Node);
+  if (CF != NCM.end()) {
+    NodeVect &Cs = CF->second;
+    for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
+      adjustForInvariance(*I, NCM, Loc);
+  }
+  return LocB;
+}
+
+namespace {
+
+  struct LocationAsBlock {
+    LocationAsBlock(const NodeToValueMap &L) : Map(L) {}
+
+    const NodeToValueMap &Map;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS,
+                           const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ;
+  raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
+    for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
+         I != E; ++I) {
+      OS << I->first << " -> ";
+      BasicBlock *B = cast<BasicBlock>(I->second);
+      OS << B->getName() << '(' << B << ')';
+      OS << '\n';
+    }
+    return OS;
+  }
+
+  inline bool is_constant(GepNode *N) {
+    return isa<ConstantInt>(N->Idx);
+  }
+
+} // end anonymous namespace
+
+void HexagonCommonGEP::separateChainForNode(GepNode *Node, Use *U,
+      NodeToValueMap &Loc) {
+  User *R = U->getUser();
+  DEBUG(dbgs() << "Separating chain for node (" << Node << ") user: "
+               << *R << '\n');
+  BasicBlock *PB = cast<Instruction>(R)->getParent();
+
+  GepNode *N = Node;
+  GepNode *C = nullptr, *NewNode = nullptr;
+  while (is_constant(N) && !(N->Flags & GepNode::Root)) {
+    // XXX if (single-use) dont-replicate;
+    GepNode *NewN = new (*Mem) GepNode(N);
+    Nodes.push_back(NewN);
+    Loc[NewN] = PB;
+
+    if (N == Node)
+      NewNode = NewN;
+    NewN->Flags &= ~GepNode::Used;
+    if (C)
+      C->Parent = NewN;
+    C = NewN;
+    N = N->Parent;
+  }
+  if (!NewNode)
+    return;
+
+  // Move over all uses that share the same user as U from Node to NewNode.
+  NodeToUsesMap::iterator UF = Uses.find(Node);
+  assert(UF != Uses.end());
+  UseSet &Us = UF->second;
+  UseSet NewUs;
+  for (UseSet::iterator I = Us.begin(); I != Us.end(); ) {
+    User *S = (*I)->getUser();
+    UseSet::iterator Nx = std::next(I);
+    if (S == R) {
+      NewUs.insert(*I);
+      Us.erase(I);
+    }
+    I = Nx;
+  }
+  if (Us.empty()) {
+    Node->Flags &= ~GepNode::Used;
+    Uses.erase(UF);
+  }
+
+  // Should at least have U in NewUs.
+  NewNode->Flags |= GepNode::Used;
+  DEBUG(dbgs() << "new node: " << NewNode << "  " << *NewNode << '\n');
+  assert(!NewUs.empty());
+  Uses[NewNode] = NewUs;
+}
+
+void HexagonCommonGEP::separateConstantChains(GepNode *Node,
+      NodeChildrenMap &NCM, NodeToValueMap &Loc) {
+  // First approximation: extract all chains.
+  NodeSet Ns;
+  nodes_for_root(Node, NCM, Ns);
+
+  DEBUG(dbgs() << "Separating constant chains for node: " << Node << '\n');
+  // Collect all used nodes together with the uses from loads and stores,
+  // where the GEP node could be folded into the load/store instruction.
+  NodeToUsesMap FNs; // Foldable nodes.
+  for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) {
+    GepNode *N = *I;
+    if (!(N->Flags & GepNode::Used))
+      continue;
+    NodeToUsesMap::iterator UF = Uses.find(N);
+    assert(UF != Uses.end());
+    UseSet &Us = UF->second;
+    // Loads/stores that use the node N.
+    UseSet LSs;
+    for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
+      Use *U = *J;
+      User *R = U->getUser();
+      // We're interested in uses that provide the address. It can happen
+      // that the value may also be provided via GEP, but we won't handle
+      // those cases here for now.
+      if (LoadInst *Ld = dyn_cast<LoadInst>(R)) {
+        unsigned PtrX = LoadInst::getPointerOperandIndex();
+        if (&Ld->getOperandUse(PtrX) == U)
+          LSs.insert(U);
+      } else if (StoreInst *St = dyn_cast<StoreInst>(R)) {
+        unsigned PtrX = StoreInst::getPointerOperandIndex();
+        if (&St->getOperandUse(PtrX) == U)
+          LSs.insert(U);
+      }
+    }
+    // Even if the total use count is 1, separating the chain may still be
+    // beneficial, since the constant chain may be longer than the GEP alone
+    // would be (e.g. if the parent node has a constant index and also has
+    // other children).
+    if (!LSs.empty())
+      FNs.insert(std::make_pair(N, LSs));
+  }
+
+  DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
+
+  for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) {
+    GepNode *N = I->first;
+    UseSet &Us = I->second;
+    for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J)
+      separateChainForNode(N, *J, Loc);
+  }
+}
+
+void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
+  // Compute the inverse of the Node.Parent links. Also, collect the set
+  // of root nodes.
+  NodeChildrenMap NCM;
+  NodeVect Roots;
+  invert_find_roots(Nodes, NCM, Roots);
+
+  // Compute the initial placement determined by the users' locations, and
+  // the locations of the child nodes.
+  for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+    recalculatePlacementRec(*I, NCM, Loc);
+
+  DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
+
+  if (OptEnableInv) {
+    for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+      adjustForInvariance(*I, NCM, Loc);
+
+    DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
+                 << LocationAsBlock(Loc));
+  }
+  if (OptEnableConst) {
+    for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
+      separateConstantChains(*I, NCM, Loc);
+  }
+  DEBUG(dbgs() << "Node use information:\n" << Uses);
+
+  // At the moment, there is no further refinement of the initial placement.
+  // Such a refinement could include splitting the nodes if they are placed
+  // too far from some of its users.
+
+  DEBUG(dbgs() << "Final node placement:\n" << LocationAsBlock(Loc));
+}
+
+Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
+      BasicBlock *LocB) {
+  DEBUG(dbgs() << "Fabricating GEP in " << LocB->getName()
+               << " for nodes:\n" << NA);
+  unsigned Num = NA.size();
+  GepNode *RN = NA[0];
+  assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
+
+  Value *NewInst = nullptr;
+  Value *Input = RN->BaseVal;
+  Value **IdxList = new Value*[Num+1];
+  unsigned nax = 0;
+  do {
+    unsigned IdxC = 0;
+    // If the type of the input of the first node is not a pointer,
+    // we need to add an artificial i32 0 to the indices (because the
+    // actual input in the IR will be a pointer).
+    if (!NA[nax]->PTy->isPointerTy()) {
+      Type *Int32Ty = Type::getInt32Ty(*Ctx);
+      IdxList[IdxC++] = ConstantInt::get(Int32Ty, 0);
+    }
+
+    // Keep adding indices from NA until we have to stop and generate
+    // an "intermediate" GEP.
+    while (++nax <= Num) {
+      GepNode *N = NA[nax-1];
+      IdxList[IdxC++] = N->Idx;
+      if (nax < Num) {
+        // We have to stop, if the expected type of the output of this node
+        // is not the same as the input type of the next node.
+        Type *NextTy = next_type(N->PTy, N->Idx);
+        if (NextTy != NA[nax]->PTy)
+          break;
+      }
+    }
+    ArrayRef<Value*> A(IdxList, IdxC);
+    Type *InpTy = Input->getType();
+    Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
+    NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
+    DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
+    Input = NewInst;
+  } while (nax <= Num);
+
+  delete[] IdxList;
+  return NewInst;
+}
+
+void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
+      NodeChildrenMap &NCM) {
+  NodeVect Work;
+  Work.push_back(Node);
+
+  while (!Work.empty()) {
+    NodeVect::iterator First = Work.begin();
+    GepNode *N = *First;
+    Work.erase(First);
+    if (N->Flags & GepNode::Used) {
+      NodeToUsesMap::iterator UF = Uses.find(N);
+      assert(UF != Uses.end() && "No use information for used node");
+      UseSet &Us = UF->second;
+      for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I)
+        Values.push_back((*I)->getUser());
+    }
+    NodeChildrenMap::iterator CF = NCM.find(N);
+    if (CF != NCM.end()) {
+      NodeVect &Cs = CF->second;
+      Work.insert(Work.end(), Cs.begin(), Cs.end());
+    }
+  }
+}
+
+void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
+  DEBUG(dbgs() << "Nodes before materialization:\n" << Nodes << '\n');
+  NodeChildrenMap NCM;
+  NodeVect Roots;
+  // Compute the inversion again, since computing placement could alter
+  // "parent" relation between nodes.
+  invert_find_roots(Nodes, NCM, Roots);
+
+  while (!Roots.empty()) {
+    NodeVect::iterator First = Roots.begin();
+    GepNode *Root = *First, *Last = *First;
+    Roots.erase(First);
+
+    NodeVect NA;  // Nodes to assemble.
+    // Append to NA all child nodes up to (and including) the first child
+    // that:
+    // (1) has more than 1 child, or
+    // (2) is used, or
+    // (3) has a child located in a different block.
+    bool LastUsed = false;
+    unsigned LastCN = 0;
+    // The location may be null if the computation failed (it can legitimately
+    // happen for nodes created from dead GEPs).
+    Value *LocV = Loc[Last];
+    if (!LocV)
+      continue;
+    BasicBlock *LastB = cast<BasicBlock>(LocV);
+    do {
+      NA.push_back(Last);
+      LastUsed = (Last->Flags & GepNode::Used);
+      if (LastUsed)
+        break;
+      NodeChildrenMap::iterator CF = NCM.find(Last);
+      LastCN = (CF != NCM.end()) ? CF->second.size() : 0;
+      if (LastCN != 1)
+        break;
+      GepNode *Child = CF->second.front();
+      BasicBlock *ChildB = cast_or_null<BasicBlock>(Loc[Child]);
+      if (ChildB != nullptr && LastB != ChildB)
+        break;
+      Last = Child;
+    } while (true);
+
+    BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator();
+    if (LastUsed || LastCN > 0) {
+      ValueVect Urs;
+      getAllUsersForNode(Root, Urs, NCM);
+      BasicBlock::iterator FirstUse = first_use_of_in_block(Urs, LastB);
+      if (FirstUse != LastB->end())
+        InsertAt = FirstUse;
+    }
+
+    // Generate a new instruction for NA.
+    Value *NewInst = fabricateGEP(NA, InsertAt, LastB);
+
+    // Convert all the children of Last node into roots, and append them
+    // to the Roots list.
+    if (LastCN > 0) {
+      NodeVect &Cs = NCM[Last];
+      for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
+        GepNode *CN = *I;
+        CN->Flags &= ~GepNode::Internal;
+        CN->Flags |= GepNode::Root;
+        CN->BaseVal = NewInst;
+        Roots.push_back(CN);
+      }
+    }
+
+    // Lastly, if the Last node was used, replace all uses with the new GEP.
+    // The uses reference the original GEP values.
+    if (LastUsed) {
+      NodeToUsesMap::iterator UF = Uses.find(Last);
+      assert(UF != Uses.end() && "No use information found");
+      UseSet &Us = UF->second;
+      for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
+        Use *U = *I;
+        U->set(NewInst);
+      }
+    }
+  }
+}
+
+void HexagonCommonGEP::removeDeadCode() {
+  ValueVect BO;
+  BO.push_back(&Fn->front());
+
+  for (unsigned i = 0; i < BO.size(); ++i) {
+    BasicBlock *B = cast<BasicBlock>(BO[i]);
+    DomTreeNode *N = DT->getNode(B);
+    typedef GraphTraits<DomTreeNode*> GTN;
+    typedef GTN::ChildIteratorType Iter;
+    for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+      BO.push_back((*I)->getBlock());
+  }
+
+  for (unsigned i = BO.size(); i > 0; --i) {
+    BasicBlock *B = cast<BasicBlock>(BO[i-1]);
+    BasicBlock::InstListType &IL = B->getInstList();
+    typedef BasicBlock::InstListType::reverse_iterator reverse_iterator;
+    ValueVect Ins;
+    for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I)
+      Ins.push_back(&*I);
+    for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) {
+      Instruction *In = cast<Instruction>(*I);
+      if (isInstructionTriviallyDead(In))
+        In->eraseFromParent();
+    }
+  }
+}
+
+bool HexagonCommonGEP::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  // For now bail out on C++ exception handling.
+  for (Function::iterator A = F.begin(), Z = F.end(); A != Z; ++A)
+    for (BasicBlock::iterator I = A->begin(), E = A->end(); I != E; ++I)
+      if (isa<InvokeInst>(I) || isa<LandingPadInst>(I))
+        return false;
+
+  Fn = &F;
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  Ctx = &F.getContext();
+
+  Nodes.clear();
+  Uses.clear();
+  NodeOrder.clear();
+
+  SpecificBumpPtrAllocator<GepNode> Allocator;
+  Mem = &Allocator;
+
+  collect();
+  common();
+
+  NodeToValueMap Loc;
+  computeNodePlacement(Loc);
+  materialize(Loc);
+  removeDeadCode();
+
+#ifdef EXPENSIVE_CHECKS
+  // Run this only when expensive checks are enabled.
+  verifyFunction(F);
+#endif
+  return true;
+}
+
+namespace llvm {
+
+  FunctionPass *createHexagonCommonGEP() {
+    return new HexagonCommonGEP();
+  }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
new file mode 100644
index 000000000000..783b916e04b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -0,0 +1,3149 @@
+//===--- HexagonConstPropagation.cpp --------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hcp"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+  // Properties of a value that are tracked by the propagation.
+  // A property that is marked as present (i.e. bit is set) dentes that the
+  // value is known (proven) to have this property. Not all combinations
+  // of bits make sense, for example Zero and NonZero are mutually exclusive,
+  // but on the other hand, Zero implies Finite. In this case, whenever
+  // the Zero property is present, Finite should also be present.
+  class ConstantProperties {
+  public:
+    enum {
+      Unknown   = 0x0000,
+      Zero      = 0x0001,
+      NonZero   = 0x0002,
+      Finite    = 0x0004,
+      Infinity  = 0x0008,
+      NaN       = 0x0010,
+      SignedZero = 0x0020,
+      NumericProperties = (Zero|NonZero|Finite|Infinity|NaN|SignedZero),
+      PosOrZero       = 0x0100,
+      NegOrZero       = 0x0200,
+      SignProperties  = (PosOrZero|NegOrZero),
+      Everything      = (NumericProperties|SignProperties)
+    };
+
+    // For a given constant, deduce the set of trackable properties that this
+    // constant has.
+    static uint32_t deduce(const Constant *C);
+  };
+
+  // A representation of a register as it can appear in a MachineOperand,
+  // i.e. a pair register:subregister.
+  struct Register {
+    unsigned Reg, SubReg;
+
+    explicit Register(unsigned R, unsigned SR = 0) : Reg(R), SubReg(SR) {}
+    explicit Register(const MachineOperand &MO)
+      : Reg(MO.getReg()), SubReg(MO.getSubReg()) {}
+
+    void print(const TargetRegisterInfo *TRI = nullptr) const {
+      dbgs() << PrintReg(Reg, TRI, SubReg);
+    }
+
+    bool operator== (const Register &R) const {
+      return (Reg == R.Reg) && (SubReg == R.SubReg);
+    }
+  };
+
+  // Lattice cell, based on that was described in the W-Z paper on constant
+  // propagation.
+  // Latice cell will be allowed to hold multiple constant values. While
+  // multiple values would normally indicate "bottom", we can still derive
+  // some useful information from them. For example, comparison X > 0
+  // could be folded if all the values in the cell associated with X are
+  // positive.
+  class LatticeCell {
+  private:
+    enum { Normal, Top, Bottom };
+
+    static const unsigned MaxCellSize = 4;
+
+    unsigned Kind:2;
+    unsigned Size:3;
+    unsigned IsSpecial:1;
+    unsigned :0;
+
+  public:
+    union {
+      uint32_t Properties;
+      const Constant *Value;
+      const Constant *Values[MaxCellSize];
+    };
+
+    LatticeCell() : Kind(Top), Size(0), IsSpecial(false) {
+      for (unsigned i = 0; i < MaxCellSize; ++i)
+        Values[i] = nullptr;
+    }
+
+    bool meet(const LatticeCell &L);
+    bool add(const Constant *C);
+    bool add(uint32_t Property);
+    uint32_t properties() const;
+    unsigned size() const { return Size; }
+
+    LatticeCell &operator= (const LatticeCell &L) {
+      if (this != &L) {
+        // This memcpy also copies Properties (when L.Size == 0).
+        uint32_t N = L.IsSpecial ? sizeof L.Properties
+                                 : L.Size*sizeof(const Constant*);
+        memcpy(Values, L.Values, N);
+        Kind = L.Kind;
+        Size = L.Size;
+        IsSpecial = L.IsSpecial;
+      }
+      return *this;
+    }
+
+    bool isSingle() const { return size() == 1; }
+    bool isProperty() const { return IsSpecial; }
+    bool isTop() const { return Kind == Top; }
+    bool isBottom() const { return Kind == Bottom; }
+
+    bool setBottom() {
+      bool Changed = (Kind != Bottom);
+      Kind = Bottom;
+      Size = 0;
+      IsSpecial = false;
+      return Changed;
+    }
+
+    void print(raw_ostream &os) const;
+
+  private:
+    void setProperty() {
+      IsSpecial = true;
+      Size = 0;
+      Kind = Normal;
+    }
+
+    bool convertToProperty();
+  };
+
+  raw_ostream &operator<< (raw_ostream &os, const LatticeCell &L) {
+    L.print(os);
+    return os;
+  }
+
+  class MachineConstEvaluator;
+
+  class MachineConstPropagator {
+  public:
+    MachineConstPropagator(MachineConstEvaluator &E) : MCE(E) {
+      Bottom.setBottom();
+    }
+
+    // Mapping: vreg -> cell
+    // The keys are registers _without_ subregisters. This won't allow
+    // definitions in the form of "vreg:subreg<def> = ...". Such definitions
+    // would be questionable from the point of view of SSA, since the "vreg"
+    // could not be initialized in its entirety (specifically, an instruction
+    // defining the "other part" of "vreg" would also count as a definition
+    // of "vreg", which would violate the SSA).
+    // If a value of a pair vreg:subreg needs to be obtained, the cell for
+    // "vreg" needs to be looked up, and then the value of subregister "subreg"
+    // needs to be evaluated.
+    class CellMap {
+    public:
+      CellMap() {
+        assert(Top.isTop());
+        Bottom.setBottom();
+      }
+
+      void clear() { Map.clear(); }
+
+      bool has(unsigned R) const {
+        // All non-virtual registers are considered "bottom".
+        if (!TargetRegisterInfo::isVirtualRegister(R))
+          return true;
+        MapType::const_iterator F = Map.find(R);
+        return F != Map.end();
+      }
+
+      const LatticeCell &get(unsigned R) const {
+        if (!TargetRegisterInfo::isVirtualRegister(R))
+          return Bottom;
+        MapType::const_iterator F = Map.find(R);
+        if (F != Map.end())
+          return F->second;
+        return Top;
+      }
+
+      // Invalidates any const references.
+      void update(unsigned R, const LatticeCell &L) {
+        Map[R] = L;
+      }
+
+      void print(raw_ostream &os, const TargetRegisterInfo &TRI) const;
+
+    private:
+      typedef std::map<unsigned,LatticeCell> MapType;
+      MapType Map;
+      // To avoid creating "top" entries, return a const reference to
+      // this cell in "get". Also, have a "Bottom" cell to return from
+      // get when a value of a physical register is requested.
+      LatticeCell Top, Bottom;
+
+    public:
+      typedef MapType::const_iterator const_iterator;
+      const_iterator begin() const { return Map.begin(); }
+      const_iterator end() const { return Map.end(); }
+    };
+
+    bool run(MachineFunction &MF);
+
+  private:
+    void visitPHI(const MachineInstr &PN);
+    void visitNonBranch(const MachineInstr &MI);
+    void visitBranchesFrom(const MachineInstr &BrI);
+    void visitUsesOf(unsigned R);
+    bool computeBlockSuccessors(const MachineBasicBlock *MB,
+          SetVector<const MachineBasicBlock*> &Targets);
+    void removeCFGEdge(MachineBasicBlock *From, MachineBasicBlock *To);
+
+    void propagate(MachineFunction &MF);
+    bool rewrite(MachineFunction &MF);
+
+    MachineRegisterInfo      *MRI;
+    MachineConstEvaluator    &MCE;
+
+    typedef std::pair<unsigned,unsigned> CFGEdge;
+    typedef std::set<CFGEdge> SetOfCFGEdge;
+    typedef std::set<const MachineInstr*> SetOfInstr;
+    typedef std::queue<CFGEdge> QueueOfCFGEdge;
+
+    LatticeCell     Bottom;
+    CellMap         Cells;
+    SetOfCFGEdge    EdgeExec;
+    SetOfInstr      InstrExec;
+    QueueOfCFGEdge  FlowQ;
+  };
+
+  // The "evaluator/rewriter" of machine instructions. This is an abstract
+  // base class that provides the interface that the propagator will use,
+  // as well as some helper functions that are target-independent.
+  class MachineConstEvaluator {
+  public:
+    MachineConstEvaluator(MachineFunction &Fn)
+      : TRI(*Fn.getSubtarget().getRegisterInfo()),
+        MF(Fn), CX(Fn.getFunction()->getContext()) {}
+    virtual ~MachineConstEvaluator() = default;
+
+    // The required interface:
+    // - A set of three "evaluate" functions. Each returns "true" if the
+    //       computation succeeded, "false" otherwise.
+    //   (1) Given an instruction MI, and the map with input values "Inputs",
+    //       compute the set of output values "Outputs". An example of when
+    //       the computation can "fail" is if MI is not an instruction that
+    //       is recognized by the evaluator.
+    //   (2) Given a register R (as reg:subreg), compute the cell that
+    //       corresponds to the "subreg" part of the given register.
+    //   (3) Given a branch instruction BrI, compute the set of target blocks.
+    //       If the branch can fall-through, add null (0) to the list of
+    //       possible targets.
+    // - A function "rewrite", that given the cell map after propagation,
+    //   could rewrite instruction MI in a more beneficial form. Return
+    //   "true" if a change has been made, "false" otherwise.
+    typedef MachineConstPropagator::CellMap CellMap;
+    virtual bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
+                          CellMap &Outputs) = 0;
+    virtual bool evaluate(const Register &R, const LatticeCell &SrcC,
+                          LatticeCell &Result) = 0;
+    virtual bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
+                          SetVector<const MachineBasicBlock*> &Targets,
+                          bool &CanFallThru) = 0;
+    virtual bool rewrite(MachineInstr &MI, const CellMap &Inputs) = 0;
+
+    const TargetRegisterInfo &TRI;
+
+  protected:
+    MachineFunction &MF;
+    LLVMContext     &CX;
+
+    struct Comparison {
+      enum {
+        Unk = 0x00,
+        EQ  = 0x01,
+        NE  = 0x02,
+        L   = 0x04, // Less-than property.
+        G   = 0x08, // Greater-than property.
+        U   = 0x40, // Unsigned property.
+        LTs = L,
+        LEs = L | EQ,
+        GTs = G,
+        GEs = G | EQ,
+        LTu = L      | U,
+        LEu = L | EQ | U,
+        GTu = G      | U,
+        GEu = G | EQ | U
+      };
+
+      static uint32_t negate(uint32_t Cmp) {
+        if (Cmp == EQ)
+          return NE;
+        if (Cmp == NE)
+          return EQ;
+        assert((Cmp & (L|G)) != (L|G));
+        return Cmp ^ (L|G);
+      }
+    };
+
+    // Helper functions.
+
+    bool getCell(const Register &R, const CellMap &Inputs, LatticeCell &RC);
+    bool constToInt(const Constant *C, APInt &Val) const;
+    bool constToFloat(const Constant *C, APFloat &Val) const;
+    const ConstantInt *intToConst(const APInt &Val) const;
+
+    // Compares.
+    bool evaluateCMPrr(uint32_t Cmp, const Register &R1, const Register &R2,
+          const CellMap &Inputs, bool &Result);
+    bool evaluateCMPri(uint32_t Cmp, const Register &R1, const APInt &A2,
+          const CellMap &Inputs, bool &Result);
+    bool evaluateCMPrp(uint32_t Cmp, const Register &R1, uint64_t Props2,
+          const CellMap &Inputs, bool &Result);
+    bool evaluateCMPii(uint32_t Cmp, const APInt &A1, const APInt &A2,
+          bool &Result);
+    bool evaluateCMPpi(uint32_t Cmp, uint32_t Props, const APInt &A2,
+          bool &Result);
+    bool evaluateCMPpp(uint32_t Cmp, uint32_t Props1, uint32_t Props2,
+          bool &Result);
+
+    bool evaluateCOPY(const Register &R1, const CellMap &Inputs,
+          LatticeCell &Result);
+
+    // Logical operations.
+    bool evaluateANDrr(const Register &R1, const Register &R2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateANDri(const Register &R1, const APInt &A2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateANDii(const APInt &A1, const APInt &A2, APInt &Result);
+    bool evaluateORrr(const Register &R1, const Register &R2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateORri(const Register &R1, const APInt &A2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateORii(const APInt &A1, const APInt &A2, APInt &Result);
+    bool evaluateXORrr(const Register &R1, const Register &R2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateXORri(const Register &R1, const APInt &A2,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateXORii(const APInt &A1, const APInt &A2, APInt &Result);
+
+    // Extensions.
+    bool evaluateZEXTr(const Register &R1, unsigned Width, unsigned Bits,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateZEXTi(const APInt &A1, unsigned Width, unsigned Bits,
+          APInt &Result);
+    bool evaluateSEXTr(const Register &R1, unsigned Width, unsigned Bits,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateSEXTi(const APInt &A1, unsigned Width, unsigned Bits,
+          APInt &Result);
+
+    // Leading/trailing bits.
+    bool evaluateCLBr(const Register &R1, bool Zeros, bool Ones,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateCLBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
+    bool evaluateCTBr(const Register &R1, bool Zeros, bool Ones,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateCTBi(const APInt &A1, bool Zeros, bool Ones, APInt &Result);
+
+    // Bitfield extract.
+    bool evaluateEXTRACTr(const Register &R1, unsigned Width, unsigned Bits,
+          unsigned Offset, bool Signed, const CellMap &Inputs,
+          LatticeCell &Result);
+    bool evaluateEXTRACTi(const APInt &A1, unsigned Bits, unsigned Offset,
+          bool Signed, APInt &Result);
+    // Vector operations.
+    bool evaluateSplatr(const Register &R1, unsigned Bits, unsigned Count,
+          const CellMap &Inputs, LatticeCell &Result);
+    bool evaluateSplati(const APInt &A1, unsigned Bits, unsigned Count,
+          APInt &Result);
+  };
+
+} // end anonymous namespace
+
+uint32_t ConstantProperties::deduce(const Constant *C) {
+  if (isa<ConstantInt>(C)) {
+    const ConstantInt *CI = cast<ConstantInt>(C);
+    if (CI->isZero())
+      return Zero | PosOrZero | NegOrZero | Finite;
+    uint32_t Props = (NonZero | Finite);
+    if (CI->isNegative())
+      return Props | NegOrZero;
+    return Props | PosOrZero;
+  }
+
+  if (isa<ConstantFP>(C)) {
+    const ConstantFP *CF = cast<ConstantFP>(C);
+    uint32_t Props = CF->isNegative() ? (NegOrZero|NonZero)
+                                      : PosOrZero;
+    if (CF->isZero())
+      return (Props & ~NumericProperties) | (Zero|Finite);
+    Props = (Props & ~NumericProperties) | NonZero;
+    if (CF->isNaN())
+      return (Props & ~NumericProperties) | NaN;
+    const APFloat &Val = CF->getValueAPF();
+    if (Val.isInfinity())
+      return (Props & ~NumericProperties) | Infinity;
+    Props |= Finite;
+    return Props;
+  }
+
+  return Unknown;
+}
+
+// Convert a cell from a set of specific values to a cell that tracks
+// properties.
+bool LatticeCell::convertToProperty() {
+  if (isProperty())
+    return false;
+  // Corner case: converting a fresh (top) cell to "special".
+  // This can happen, when adding a property to a top cell.
+  uint32_t Everything = ConstantProperties::Everything;
+  uint32_t Ps = !isTop() ? properties()
+                         : Everything;
+  if (Ps != ConstantProperties::Unknown) {
+    Properties = Ps;
+    setProperty();
+  } else {
+    setBottom();
+  }
+  return true;
+}
+
+void LatticeCell::print(raw_ostream &os) const {
+  if (isProperty()) {
+    os << "{ ";
+    uint32_t Ps = properties();
+    if (Ps & ConstantProperties::Zero)
+      os << "zero ";
+    if (Ps & ConstantProperties::NonZero)
+      os << "nonzero ";
+    if (Ps & ConstantProperties::Finite)
+      os << "finite ";
+    if (Ps & ConstantProperties::Infinity)
+      os << "infinity ";
+    if (Ps & ConstantProperties::NaN)
+      os << "nan ";
+    if (Ps & ConstantProperties::PosOrZero)
+      os << "poz ";
+    if (Ps & ConstantProperties::NegOrZero)
+      os << "nez ";
+    os << '}';
+    return;
+  }
+
+  os << "{ ";
+  if (isBottom()) {
+    os << "bottom";
+  } else if (isTop()) {
+    os << "top";
+  } else {
+    for (unsigned i = 0; i < size(); ++i) {
+      const Constant *C = Values[i];
+      if (i != 0)
+        os << ", ";
+      C->print(os);
+    }
+  }
+  os << " }";
+}
+
+// "Meet" operation on two cells. This is the key of the propagation
+// algorithm.
+bool LatticeCell::meet(const LatticeCell &L) {
+  bool Changed = false;
+  if (L.isBottom())
+    Changed = setBottom();
+  if (isBottom() || L.isTop())
+    return Changed;
+  if (isTop()) {
+    *this = L;
+    // L can be neither Top nor Bottom, so *this must have changed.
+    return true;
+  }
+
+  // Top/bottom cases covered. Need to integrate L's set into ours.
+  if (L.isProperty())
+    return add(L.properties());
+  for (unsigned i = 0; i < L.size(); ++i) {
+    const Constant *LC = L.Values[i];
+    Changed |= add(LC);
+  }
+  return Changed;
+}
+
+// Add a new constant to the cell. This is actually where the cell update
+// happens. If a cell has room for more constants, the new constant is added.
+// Otherwise, the cell is converted to a "property" cell (i.e. a cell that
+// will track properties of the associated values, and not the values
+// themselves. Care is taken to handle special cases, like "bottom", etc.
+bool LatticeCell::add(const Constant *LC) {
+  assert(LC);
+  if (isBottom())
+    return false;
+
+  if (!isProperty()) {
+    // Cell is not special. Try to add the constant here first,
+    // if there is room.
+    unsigned Index = 0;
+    while (Index < Size) {
+      const Constant *C = Values[Index];
+      // If the constant is already here, no change is needed.
+      if (C == LC)
+        return false;
+      Index++;
+    }
+    if (Index < MaxCellSize) {
+      Values[Index] = LC;
+      Kind = Normal;
+      Size++;
+      return true;
+    }
+  }
+
+  bool Changed = false;
+
+  // This cell is special, or is not special, but is full. After this
+  // it will be special.
+  Changed = convertToProperty();
+  uint32_t Ps = properties();
+  uint32_t NewPs = Ps & ConstantProperties::deduce(LC);
+  if (NewPs == ConstantProperties::Unknown) {
+    setBottom();
+    return true;
+  }
+  if (Ps != NewPs) {
+    Properties = NewPs;
+    Changed = true;
+  }
+  return Changed;
+}
+
+// Add a property to the cell. This will force the cell to become a property-
+// tracking cell.
+bool LatticeCell::add(uint32_t Property) {
+  bool Changed = convertToProperty();
+  uint32_t Ps = properties();
+  if (Ps == (Ps & Property))
+    return Changed;
+  Properties = Property & Ps;
+  return true;
+}
+
+// Return the properties of the values in the cell. This is valid for any
+// cell, and does not alter the cell itself.
+uint32_t LatticeCell::properties() const {
+  if (isProperty())
+    return Properties;
+  assert(!isTop() && "Should not call this for a top cell");
+  if (isBottom())
+    return ConstantProperties::Unknown;
+
+  assert(size() > 0 && "Empty cell");
+  uint32_t Ps = ConstantProperties::deduce(Values[0]);
+  for (unsigned i = 1; i < size(); ++i) {
+    if (Ps == ConstantProperties::Unknown)
+      break;
+    Ps &= ConstantProperties::deduce(Values[i]);
+  }
+  return Ps;
+}
+
+void MachineConstPropagator::CellMap::print(raw_ostream &os,
+      const TargetRegisterInfo &TRI) const {
+  for (auto &I : Map)
+    dbgs() << "  " << PrintReg(I.first, &TRI) << " -> " << I.second << '\n';
+}
+
+void MachineConstPropagator::visitPHI(const MachineInstr &PN) {
+  const MachineBasicBlock *MB = PN.getParent();
+  unsigned MBN = MB->getNumber();
+  DEBUG(dbgs() << "Visiting FI(BB#" << MBN << "): " << PN);
+
+  const MachineOperand &MD = PN.getOperand(0);
+  Register DefR(MD);
+  assert(TargetRegisterInfo::isVirtualRegister(DefR.Reg));
+
+  bool Changed = false;
+
+  // If the def has a sub-register, set the corresponding cell to "bottom".
+  if (DefR.SubReg) {
+Bottomize:
+    const LatticeCell &T = Cells.get(DefR.Reg);
+    Changed = !T.isBottom();
+    Cells.update(DefR.Reg, Bottom);
+    if (Changed)
+      visitUsesOf(DefR.Reg);
+    return;
+  }
+
+  LatticeCell DefC = Cells.get(DefR.Reg);
+
+  for (unsigned i = 1, n = PN.getNumOperands(); i < n; i += 2) {
+    const MachineBasicBlock *PB = PN.getOperand(i+1).getMBB();
+    unsigned PBN = PB->getNumber();
+    if (!EdgeExec.count(CFGEdge(PBN, MBN))) {
+      DEBUG(dbgs() << "  edge BB#" << PBN << "->BB#" << MBN
+                   << " not executable\n");
+      continue;
+    }
+    const MachineOperand &SO = PN.getOperand(i);
+    Register UseR(SO);
+    // If the input is not a virtual register, we don't really know what
+    // value it holds.
+    if (!TargetRegisterInfo::isVirtualRegister(UseR.Reg))
+      goto Bottomize;
+    // If there is no cell for an input register, it means top.
+    if (!Cells.has(UseR.Reg))
+      continue;
+
+    LatticeCell SrcC;
+    bool Eval = MCE.evaluate(UseR, Cells.get(UseR.Reg), SrcC);
+    DEBUG(dbgs() << "  edge from BB#" << PBN << ": "
+                 << PrintReg(UseR.Reg, &MCE.TRI, UseR.SubReg)
+                 << SrcC << '\n');
+    Changed |= Eval ? DefC.meet(SrcC)
+                    : DefC.setBottom();
+    Cells.update(DefR.Reg, DefC);
+    if (DefC.isBottom())
+      break;
+  }
+  if (Changed)
+    visitUsesOf(DefR.Reg);
+}
+
+void MachineConstPropagator::visitNonBranch(const MachineInstr &MI) {
+  DEBUG(dbgs() << "Visiting MI(BB#" << MI.getParent()->getNumber()
+               << "): " << MI);
+  CellMap Outputs;
+  bool Eval = MCE.evaluate(MI, Cells, Outputs);
+  DEBUG({
+    if (Eval) {
+      dbgs() << "  outputs:";
+      for (auto &I : Outputs)
+        dbgs() << ' ' << I.second;
+      dbgs() << '\n';
+    }
+  });
+
+  // Update outputs. If the value was not computed, set all the
+  // def cells to bottom.
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    Register DefR(MO);
+    // Only track virtual registers.
+    if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
+      continue;
+    bool Changed = false;
+    // If the evaluation failed, set cells for all output registers to bottom.
+    if (!Eval) {
+      const LatticeCell &T = Cells.get(DefR.Reg);
+      Changed = !T.isBottom();
+      Cells.update(DefR.Reg, Bottom);
+    } else {
+      // Find the corresponding cell in the computed outputs.
+      // If it's not there, go on to the next def.
+      if (!Outputs.has(DefR.Reg))
+        continue;
+      LatticeCell RC = Cells.get(DefR.Reg);
+      Changed = RC.meet(Outputs.get(DefR.Reg));
+      Cells.update(DefR.Reg, RC);
+    }
+    if (Changed)
+      visitUsesOf(DefR.Reg);
+  }
+}
+
+// \brief Starting at a given branch, visit remaining branches in the block.
+// Traverse over the subsequent branches for as long as the preceding one
+// can fall through. Add all the possible targets to the flow work queue,
+// including the potential fall-through to the layout-successor block.
+void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
+  const MachineBasicBlock &B = *BrI.getParent();
+  unsigned MBN = B.getNumber();
+  MachineBasicBlock::const_iterator It = BrI.getIterator();
+  MachineBasicBlock::const_iterator End = B.end();
+
+  SetVector<const MachineBasicBlock*> Targets;
+  bool EvalOk = true, FallsThru = true;
+  while (It != End) {
+    const MachineInstr &MI = *It;
+    InstrExec.insert(&MI);
+    DEBUG(dbgs() << "Visiting " << (EvalOk ? "BR" : "br") << "(BB#"
+                 << MBN << "): " << MI);
+    // Do not evaluate subsequent branches if the evaluation of any of the
+    // previous branches failed. Keep iterating over the branches only
+    // to mark them as executable.
+    EvalOk = EvalOk && MCE.evaluate(MI, Cells, Targets, FallsThru);
+    if (!EvalOk)
+      FallsThru = true;
+    if (!FallsThru)
+      break;
+    ++It;
+  }
+
+  if (EvalOk) {
+    // Need to add all CFG successors that lead to EH landing pads.
+    // There won't be explicit branches to these blocks, but they must
+    // be processed.
+    for (const MachineBasicBlock *SB : B.successors()) {
+      if (SB->isEHPad())
+        Targets.insert(SB);
+    }
+    if (FallsThru) {
+      const MachineFunction &MF = *B.getParent();
+      MachineFunction::const_iterator BI = B.getIterator();
+      MachineFunction::const_iterator Next = std::next(BI);
+      if (Next != MF.end())
+        Targets.insert(&*Next);
+    }
+  } else {
+    // If the evaluation of the branches failed, make "Targets" to be the
+    // set of all successors of the block from the CFG.
+    // If the evaluation succeeded for all visited branches, then if the
+    // last one set "FallsThru", then add an edge to the layout successor
+    // to the targets.
+    Targets.clear();
+    DEBUG(dbgs() << "  failed to evaluate a branch...adding all CFG "
+                    "successors\n");
+    for (const MachineBasicBlock *SB : B.successors())
+      Targets.insert(SB);
+  }
+
+  for (const MachineBasicBlock *TB : Targets) {
+    unsigned TBN = TB->getNumber();
+    DEBUG(dbgs() << "  pushing edge BB#" << MBN << " -> BB#" << TBN << "\n");
+    FlowQ.push(CFGEdge(MBN, TBN));
+  }
+}
+
+void MachineConstPropagator::visitUsesOf(unsigned Reg) {
+  DEBUG(dbgs() << "Visiting uses of " << PrintReg(Reg, &MCE.TRI)
+               << Cells.get(Reg) << '\n');
+  for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+    // Do not process non-executable instructions. They can become exceutable
+    // later (via a flow-edge in the work queue). In such case, the instruc-
+    // tion will be visited at that time.
+    if (!InstrExec.count(&MI))
+      continue;
+    if (MI.isPHI())
+      visitPHI(MI);
+    else if (!MI.isBranch())
+      visitNonBranch(MI);
+    else
+      visitBranchesFrom(MI);
+  }
+}
+
+bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
+      SetVector<const MachineBasicBlock*> &Targets) {
+  MachineBasicBlock::const_iterator FirstBr = MB->end();
+  for (const MachineInstr &MI : *MB) {
+    if (MI.isDebugValue())
+      continue;
+    if (MI.isBranch()) {
+      FirstBr = MI.getIterator();
+      break;
+    }
+  }
+
+  Targets.clear();
+  MachineBasicBlock::const_iterator End = MB->end();
+
+  bool DoNext = true;
+  for (MachineBasicBlock::const_iterator I = FirstBr; I != End; ++I) {
+    const MachineInstr &MI = *I;
+    // Can there be debug instructions between branches?
+    if (MI.isDebugValue())
+      continue;
+    if (!InstrExec.count(&MI))
+      continue;
+    bool Eval = MCE.evaluate(MI, Cells, Targets, DoNext);
+    if (!Eval)
+      return false;
+    if (!DoNext)
+      break;
+  }
+  // If the last branch could fall-through, add block's layout successor.
+  if (DoNext) {
+    MachineFunction::const_iterator BI = MB->getIterator();
+    MachineFunction::const_iterator NextI = std::next(BI);
+    if (NextI != MB->getParent()->end())
+      Targets.insert(&*NextI);
+  }
+
+  // Add all the EH landing pads.
+  for (const MachineBasicBlock *SB : MB->successors())
+    if (SB->isEHPad())
+      Targets.insert(SB);
+
+  return true;
+}
+
+void MachineConstPropagator::removeCFGEdge(MachineBasicBlock *From,
+      MachineBasicBlock *To) {
+  // First, remove the CFG successor/predecessor information.
+  From->removeSuccessor(To);
+  // Remove all corresponding PHI operands in the To block.
+  for (auto I = To->begin(), E = To->getFirstNonPHI(); I != E; ++I) {
+    MachineInstr *PN = &*I;
+    // reg0 = PHI reg1, bb2, reg3, bb4, ...
+    int N = PN->getNumOperands()-2;
+    while (N > 0) {
+      if (PN->getOperand(N+1).getMBB() == From) {
+        PN->RemoveOperand(N+1);
+        PN->RemoveOperand(N);
+      }
+      N -= 2;
+    }
+  }
+}
+
+void MachineConstPropagator::propagate(MachineFunction &MF) {
+  MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&MF);
+  unsigned EntryNum = Entry->getNumber();
+
+  // Start with a fake edge, just to process the entry node.
+  FlowQ.push(CFGEdge(EntryNum, EntryNum));
+
+  while (!FlowQ.empty()) {
+    CFGEdge Edge = FlowQ.front();
+    FlowQ.pop();
+
+    DEBUG(dbgs() << "Picked edge BB#" << Edge.first << "->BB#"
+                 << Edge.second << '\n');
+    if (Edge.first != EntryNum)
+      if (EdgeExec.count(Edge))
+        continue;
+    EdgeExec.insert(Edge);
+    MachineBasicBlock *SB = MF.getBlockNumbered(Edge.second);
+
+    // Process the block in three stages:
+    // - visit all PHI nodes,
+    // - visit all non-branch instructions,
+    // - visit block branches.
+    MachineBasicBlock::const_iterator It = SB->begin(), End = SB->end();
+
+    // Visit PHI nodes in the successor block.
+    while (It != End && It->isPHI()) {
+      InstrExec.insert(&*It);
+      visitPHI(*It);
+      ++It;
+    }
+
+    // If the successor block just became executable, visit all instructions.
+    // To see if this is the first time we're visiting it, check the first
+    // non-debug instruction to see if it is executable.
+    while (It != End && It->isDebugValue())
+      ++It;
+    assert(It == End || !It->isPHI());
+    // If this block has been visited, go on to the next one.
+    if (It != End && InstrExec.count(&*It))
+      continue;
+    // For now, scan all non-branch instructions. Branches require different
+    // processing.
+    while (It != End && !It->isBranch()) {
+      if (!It->isDebugValue()) {
+        InstrExec.insert(&*It);
+        visitNonBranch(*It);
+      }
+      ++It;
+    }
+
+    // Time to process the end of the block. This is different from
+    // processing regular (non-branch) instructions, because there can
+    // be multiple branches in a block, and they can cause the block to
+    // terminate early.
+    if (It != End) {
+      visitBranchesFrom(*It);
+    } else {
+      // If the block didn't have a branch, add all successor edges to the
+      // work queue. (There should really be only one successor in such case.)
+      unsigned SBN = SB->getNumber();
+      for (const MachineBasicBlock *SSB : SB->successors())
+        FlowQ.push(CFGEdge(SBN, SSB->getNumber()));
+    }
+  } // while (FlowQ)
+
+  DEBUG({
+    dbgs() << "Cells after propagation:\n";
+    Cells.print(dbgs(), MCE.TRI);
+    dbgs() << "Dead CFG edges:\n";
+    for (const MachineBasicBlock &B : MF) {
+      unsigned BN = B.getNumber();
+      for (const MachineBasicBlock *SB : B.successors()) {
+        unsigned SN = SB->getNumber();
+        if (!EdgeExec.count(CFGEdge(BN, SN)))
+          dbgs() << "  BB#" << BN << " -> BB#" << SN << '\n';
+      }
+    }
+  });
+}
+
+bool MachineConstPropagator::rewrite(MachineFunction &MF) {
+  bool Changed = false;
+  // Rewrite all instructions based on the collected cell information.
+  //
+  // Traverse the instructions in a post-order, so that rewriting an
+  // instruction can make changes "downstream" in terms of control-flow
+  // without affecting the rewriting process. (We should not change
+  // instructions that have not yet been visited by the rewriter.)
+  // The reason for this is that the rewriter can introduce new vregs,
+  // and replace uses of old vregs (which had corresponding cells
+  // computed during propagation) with these new vregs (which at this
+  // point would not have any cells, and would appear to be "top").
+  // If an attempt was made to evaluate an instruction with a fresh
+  // "top" vreg, it would cause an error (abend) in the evaluator.
+
+  // Collect the post-order-traversal block ordering. The subsequent
+  // traversal/rewrite will update block successors, so it's safer
+  // if the visiting order it computed ahead of time.
+  std::vector<MachineBasicBlock*> POT;
+  for (MachineBasicBlock *B : post_order(&MF))
+    if (!B->empty())
+      POT.push_back(B);
+
+  for (MachineBasicBlock *B : POT) {
+    // Walk the block backwards (which usually begin with the branches).
+    // If any branch is rewritten, we may need to update the successor
+    // information for this block. Unless the block's successors can be
+    // precisely determined (which may not be the case for indirect
+    // branches), we cannot modify any branch.
+
+    // Compute the successor information.
+    SetVector<const MachineBasicBlock*> Targets;
+    bool HaveTargets = computeBlockSuccessors(B, Targets);
+    // Rewrite the executable instructions. Skip branches if we don't
+    // have block successor information.
+    for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (InstrExec.count(&MI)) {
+        if (MI.isBranch() && !HaveTargets)
+          continue;
+        Changed |= MCE.rewrite(MI, Cells);
+      }
+    }
+    // The rewriting could rewrite PHI nodes to non-PHI nodes, causing
+    // regular instructions to appear in between PHI nodes. Bring all
+    // the PHI nodes to the beginning of the block.
+    for (auto I = B->begin(), E = B->end(); I != E; ++I) {
+      if (I->isPHI())
+        continue;
+      // I is not PHI. Find the next PHI node P.
+      auto P = I;
+      while (++P != E)
+        if (P->isPHI())
+          break;
+      // Not found.
+      if (P == E)
+        break;
+      // Splice P right before I.
+      B->splice(I, B, P);
+      // Reset I to point at the just spliced PHI node.
+      --I;
+    }
+    // Update the block successor information: remove unnecessary successors.
+    if (HaveTargets) {
+      SmallVector<MachineBasicBlock*,2> ToRemove;
+      for (MachineBasicBlock *SB : B->successors()) {
+        if (!Targets.count(SB))
+          ToRemove.push_back(const_cast<MachineBasicBlock*>(SB));
+        Targets.remove(SB);
+      }
+      for (unsigned i = 0, n = ToRemove.size(); i < n; ++i)
+        removeCFGEdge(B, ToRemove[i]);
+      // If there are any blocks left in the computed targets, it means that
+      // we think that the block could go somewhere, but the CFG does not.
+      // This could legitimately happen in blocks that have non-returning
+      // calls---we would think that the execution can continue, but the
+      // CFG will not have a successor edge.
+    }
+  }
+  // Need to do some final post-processing.
+  // If a branch was not executable, it will not get rewritten, but should
+  // be removed (or replaced with something equivalent to a A2_nop). We can't
+  // erase instructions during rewriting, so this needs to be delayed until
+  // now.
+  for (MachineBasicBlock &B : MF) {
+    MachineBasicBlock::iterator I = B.begin(), E = B.end();
+    while (I != E) {
+      auto Next = std::next(I);
+      if (I->isBranch() && !InstrExec.count(&*I))
+        B.erase(I);
+      I = Next;
+    }
+  }
+  return Changed;
+}
+
+// This is the constant propagation algorithm as described by Wegman-Zadeck.
+// Most of the terminology comes from there.
+bool MachineConstPropagator::run(MachineFunction &MF) {
+  DEBUG(MF.print(dbgs() << "Starting MachineConstPropagator\n", 0));
+
+  MRI = &MF.getRegInfo();
+
+  Cells.clear();
+  EdgeExec.clear();
+  InstrExec.clear();
+  assert(FlowQ.empty());
+
+  propagate(MF);
+  bool Changed = rewrite(MF);
+
+  DEBUG({
+    dbgs() << "End of MachineConstPropagator (Changed=" << Changed << ")\n";
+    if (Changed)
+      MF.print(dbgs(), 0);
+  });
+  return Changed;
+}
+
+// --------------------------------------------------------------------
+// Machine const evaluator.
+
+bool MachineConstEvaluator::getCell(const Register &R, const CellMap &Inputs,
+      LatticeCell &RC) {
+  if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+    return false;
+  const LatticeCell &L = Inputs.get(R.Reg);
+  if (!R.SubReg) {
+    RC = L;
+    return !RC.isBottom();
+  }
+  bool Eval = evaluate(R, L, RC);
+  return Eval && !RC.isBottom();
+}
+
+bool MachineConstEvaluator::constToInt(const Constant *C,
+      APInt &Val) const {
+  const ConstantInt *CI = dyn_cast<ConstantInt>(C);
+  if (!CI)
+    return false;
+  Val = CI->getValue();
+  return true;
+}
+
+const ConstantInt *MachineConstEvaluator::intToConst(const APInt &Val) const {
+  return ConstantInt::get(CX, Val);
+}
+
+bool MachineConstEvaluator::evaluateCMPrr(uint32_t Cmp, const Register &R1,
+      const Register &R2, const CellMap &Inputs, bool &Result) {
+  assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+  LatticeCell LS1, LS2;
+  if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2))
+    return false;
+
+  bool IsProp1 = LS1.isProperty();
+  bool IsProp2 = LS2.isProperty();
+  if (IsProp1) {
+    uint32_t Prop1 = LS1.properties();
+    if (IsProp2)
+      return evaluateCMPpp(Cmp, Prop1, LS2.properties(), Result);
+    uint32_t NegCmp = Comparison::negate(Cmp);
+    return evaluateCMPrp(NegCmp, R2, Prop1, Inputs, Result);
+  }
+  if (IsProp2) {
+    uint32_t Prop2 = LS2.properties();
+    return evaluateCMPrp(Cmp, R1, Prop2, Inputs, Result);
+  }
+
+  APInt A;
+  bool IsTrue = true, IsFalse = true;
+  for (unsigned i = 0; i < LS2.size(); ++i) {
+    bool Res;
+    bool Computed = constToInt(LS2.Values[i], A) &&
+                    evaluateCMPri(Cmp, R1, A, Inputs, Res);
+    if (!Computed)
+      return false;
+    IsTrue &= Res;
+    IsFalse &= !Res;
+  }
+  assert(!IsTrue || !IsFalse);
+  // The actual logical value of the comparison is same as IsTrue.
+  Result = IsTrue;
+  // Return true if the result was proven to be true or proven to be false.
+  return IsTrue || IsFalse;
+}
+
+bool MachineConstEvaluator::evaluateCMPri(uint32_t Cmp, const Register &R1,
+      const APInt &A2, const CellMap &Inputs, bool &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS;
+  if (!getCell(R1, Inputs, LS))
+    return false;
+  if (LS.isProperty())
+    return evaluateCMPpi(Cmp, LS.properties(), A2, Result);
+
+  APInt A;
+  bool IsTrue = true, IsFalse = true;
+  for (unsigned i = 0; i < LS.size(); ++i) {
+    bool Res;
+    bool Computed = constToInt(LS.Values[i], A) &&
+                    evaluateCMPii(Cmp, A, A2, Res);
+    if (!Computed)
+      return false;
+    IsTrue &= Res;
+    IsFalse &= !Res;
+  }
+  assert(!IsTrue || !IsFalse);
+  // The actual logical value of the comparison is same as IsTrue.
+  Result = IsTrue;
+  // Return true if the result was proven to be true or proven to be false.
+  return IsTrue || IsFalse;
+}
+
+bool MachineConstEvaluator::evaluateCMPrp(uint32_t Cmp, const Register &R1,
+      uint64_t Props2, const CellMap &Inputs, bool &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS;
+  if (!getCell(R1, Inputs, LS))
+    return false;
+  if (LS.isProperty())
+    return evaluateCMPpp(Cmp, LS.properties(), Props2, Result);
+
+  APInt A;
+  uint32_t NegCmp = Comparison::negate(Cmp);
+  bool IsTrue = true, IsFalse = true;
+  for (unsigned i = 0; i < LS.size(); ++i) {
+    bool Res;
+    bool Computed = constToInt(LS.Values[i], A) &&
+                    evaluateCMPpi(NegCmp, Props2, A, Res);
+    if (!Computed)
+      return false;
+    IsTrue &= Res;
+    IsFalse &= !Res;
+  }
+  assert(!IsTrue || !IsFalse);
+  Result = IsTrue;
+  return IsTrue || IsFalse;
+}
+
+bool MachineConstEvaluator::evaluateCMPii(uint32_t Cmp, const APInt &A1,
+      const APInt &A2, bool &Result) {
+  // NE is a special kind of comparison (not composed of smaller properties).
+  if (Cmp == Comparison::NE) {
+    Result = !APInt::isSameValue(A1, A2);
+    return true;
+  }
+  if (Cmp == Comparison::EQ) {
+    Result = APInt::isSameValue(A1, A2);
+    return true;
+  }
+  if (Cmp & Comparison::EQ) {
+    if (APInt::isSameValue(A1, A2))
+      return (Result = true);
+  }
+  assert((Cmp & (Comparison::L | Comparison::G)) && "Malformed comparison");
+  Result = false;
+
+  unsigned W1 = A1.getBitWidth();
+  unsigned W2 = A2.getBitWidth();
+  unsigned MaxW = (W1 >= W2) ? W1 : W2;
+  if (Cmp & Comparison::U) {
+    const APInt Zx1 = A1.zextOrSelf(MaxW);
+    const APInt Zx2 = A2.zextOrSelf(MaxW);
+    if (Cmp & Comparison::L)
+      Result = Zx1.ult(Zx2);
+    else if (Cmp & Comparison::G)
+      Result = Zx2.ult(Zx1);
+    return true;
+  }
+
+  // Signed comparison.
+  const APInt Sx1 = A1.sextOrSelf(MaxW);
+  const APInt Sx2 = A2.sextOrSelf(MaxW);
+  if (Cmp & Comparison::L)
+    Result = Sx1.slt(Sx2);
+  else if (Cmp & Comparison::G)
+    Result = Sx2.slt(Sx1);
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateCMPpi(uint32_t Cmp, uint32_t Props,
+      const APInt &A2, bool &Result) {
+  if (Props == ConstantProperties::Unknown)
+    return false;
+
+  // Should never see NaN here, but check for it for completeness.
+  if (Props & ConstantProperties::NaN)
+    return false;
+  // Infinity could theoretically be compared to a number, but the
+  // presence of infinity here would be very suspicious. If we don't
+  // know for sure that the number is finite, bail out.
+  if (!(Props & ConstantProperties::Finite))
+    return false;
+
+  // Let X be a number that has properties Props.
+
+  if (Cmp & Comparison::U) {
+    // In case of unsigned comparisons, we can only compare against 0.
+    if (A2 == 0) {
+      // Any x!=0 will be considered >0 in an unsigned comparison.
+      if (Props & ConstantProperties::Zero)
+        Result = (Cmp & Comparison::EQ);
+      else if (Props & ConstantProperties::NonZero)
+        Result = (Cmp & Comparison::G) || (Cmp == Comparison::NE);
+      else
+        return false;
+      return true;
+    }
+    // A2 is not zero. The only handled case is if X = 0.
+    if (Props & ConstantProperties::Zero) {
+      Result = (Cmp & Comparison::L) || (Cmp == Comparison::NE);
+      return true;
+    }
+    return false;
+  }
+
+  // Signed comparisons are different.
+  if (Props & ConstantProperties::Zero) {
+    if (A2 == 0)
+      Result = (Cmp & Comparison::EQ);
+    else
+      Result = (Cmp == Comparison::NE) ||
+               ((Cmp & Comparison::L) && !A2.isNegative()) ||
+               ((Cmp & Comparison::G) &&  A2.isNegative());
+    return true;
+  }
+  if (Props & ConstantProperties::PosOrZero) {
+    // X >= 0 and !(A2 < 0) => cannot compare
+    if (!A2.isNegative())
+      return false;
+    // X >= 0 and A2 < 0
+    Result = (Cmp & Comparison::G) || (Cmp == Comparison::NE);
+    return true;
+  }
+  if (Props & ConstantProperties::NegOrZero) {
+    // X <= 0 and Src1 < 0 => cannot compare
+    if (A2 == 0 || A2.isNegative())
+      return false;
+    // X <= 0 and A2 > 0
+    Result = (Cmp & Comparison::L) || (Cmp == Comparison::NE);
+    return true;
+  }
+
+  return false;
+}
+
+bool MachineConstEvaluator::evaluateCMPpp(uint32_t Cmp, uint32_t Props1,
+      uint32_t Props2, bool &Result) {
+  typedef ConstantProperties P;
+  if ((Props1 & P::NaN) && (Props2 & P::NaN))
+    return false;
+  if (!(Props1 & P::Finite) || !(Props2 & P::Finite))
+    return false;
+
+  bool Zero1 = (Props1 & P::Zero), Zero2 = (Props2 & P::Zero);
+  bool NonZero1 = (Props1 & P::NonZero), NonZero2 = (Props2 & P::NonZero);
+  if (Zero1 && Zero2) {
+    Result = (Cmp & Comparison::EQ);
+    return true;
+  }
+  if (Cmp == Comparison::NE) {
+    if ((Zero1 && NonZero2) || (NonZero1 && Zero2))
+      return (Result = true);
+    return false;
+  }
+
+  if (Cmp & Comparison::U) {
+    // In unsigned comparisons, we can only compare against a known zero,
+    // or a known non-zero.
+    if (Zero1 && NonZero2) {
+      Result = (Cmp & Comparison::L);
+      return true;
+    }
+    if (NonZero1 && Zero2) {
+      Result = (Cmp & Comparison::G);
+      return true;
+    }
+    return false;
+  }
+
+  // Signed comparison. The comparison is not NE.
+  bool Poz1 = (Props1 & P::PosOrZero), Poz2 = (Props2 & P::PosOrZero);
+  bool Nez1 = (Props1 & P::NegOrZero), Nez2 = (Props2 & P::NegOrZero);
+  if (Nez1 && Poz2) {
+    if (NonZero1 || NonZero2) {
+      Result = (Cmp & Comparison::L);
+      return true;
+    }
+    // Either (or both) could be zero. Can only say that X <= Y.
+    if ((Cmp & Comparison::EQ) && (Cmp & Comparison::L))
+      return (Result = true);
+  }
+  if (Poz1 && Nez2) {
+    if (NonZero1 || NonZero2) {
+      Result = (Cmp & Comparison::G);
+      return true;
+    }
+    // Either (or both) could be zero. Can only say that X >= Y.
+    if ((Cmp & Comparison::EQ) && (Cmp & Comparison::G))
+      return (Result = true);
+  }
+
+  return false;
+}
+
+bool MachineConstEvaluator::evaluateCOPY(const Register &R1,
+      const CellMap &Inputs, LatticeCell &Result) {
+  return getCell(R1, Inputs, Result);
+}
+
+bool MachineConstEvaluator::evaluateANDrr(const Register &R1,
+      const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+  const LatticeCell &L1 = Inputs.get(R2.Reg);
+  const LatticeCell &L2 = Inputs.get(R2.Reg);
+  // If both sources are bottom, exit. Otherwise try to evaluate ANDri
+  // with the non-bottom argument passed as the immediate. This is to
+  // catch cases of ANDing with 0.
+  if (L2.isBottom()) {
+    if (L1.isBottom())
+      return false;
+    return evaluateANDrr(R2, R1, Inputs, Result);
+  }
+  LatticeCell LS2;
+  if (!evaluate(R2, L2, LS2))
+    return false;
+  if (LS2.isBottom() || LS2.isProperty())
+    return false;
+
+  APInt A;
+  for (unsigned i = 0; i < LS2.size(); ++i) {
+    LatticeCell RC;
+    bool Eval = constToInt(LS2.Values[i], A) &&
+                evaluateANDri(R1, A, Inputs, RC);
+    if (!Eval)
+      return false;
+    Result.meet(RC);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateANDri(const Register &R1,
+      const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  if (A2 == -1)
+    return getCell(R1, Inputs, Result);
+  if (A2 == 0) {
+    LatticeCell RC;
+    RC.add(intToConst(A2));
+    // Overwrite Result.
+    Result = RC;
+    return true;
+  }
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, ResA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateANDii(A, A2, ResA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(ResA);
+    Result.add(C);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateANDii(const APInt &A1,
+      const APInt &A2, APInt &Result) {
+  Result = A1 & A2;
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateORrr(const Register &R1,
+      const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+  const LatticeCell &L1 = Inputs.get(R2.Reg);
+  const LatticeCell &L2 = Inputs.get(R2.Reg);
+  // If both sources are bottom, exit. Otherwise try to evaluate ORri
+  // with the non-bottom argument passed as the immediate. This is to
+  // catch cases of ORing with -1.
+  if (L2.isBottom()) {
+    if (L1.isBottom())
+      return false;
+    return evaluateORrr(R2, R1, Inputs, Result);
+  }
+  LatticeCell LS2;
+  if (!evaluate(R2, L2, LS2))
+    return false;
+  if (LS2.isBottom() || LS2.isProperty())
+    return false;
+
+  APInt A;
+  for (unsigned i = 0; i < LS2.size(); ++i) {
+    LatticeCell RC;
+    bool Eval = constToInt(LS2.Values[i], A) &&
+                evaluateORri(R1, A, Inputs, RC);
+    if (!Eval)
+      return false;
+    Result.meet(RC);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateORri(const Register &R1,
+      const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  if (A2 == 0)
+    return getCell(R1, Inputs, Result);
+  if (A2 == -1) {
+    LatticeCell RC;
+    RC.add(intToConst(A2));
+    // Overwrite Result.
+    Result = RC;
+    return true;
+  }
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, ResA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateORii(A, A2, ResA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(ResA);
+    Result.add(C);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateORii(const APInt &A1,
+      const APInt &A2, APInt &Result) {
+  Result = A1 | A2;
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateXORrr(const Register &R1,
+      const Register &R2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+  LatticeCell LS1, LS2;
+  if (!getCell(R1, Inputs, LS1) || !getCell(R2, Inputs, LS2))
+    return false;
+  if (LS1.isProperty()) {
+    if (LS1.properties() & ConstantProperties::Zero)
+      return !(Result = LS2).isBottom();
+    return false;
+  }
+  if (LS2.isProperty()) {
+    if (LS2.properties() & ConstantProperties::Zero)
+      return !(Result = LS1).isBottom();
+    return false;
+  }
+
+  APInt A;
+  for (unsigned i = 0; i < LS2.size(); ++i) {
+    LatticeCell RC;
+    bool Eval = constToInt(LS2.Values[i], A) &&
+                evaluateXORri(R1, A, Inputs, RC);
+    if (!Eval)
+      return false;
+    Result.meet(RC);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateXORri(const Register &R1,
+      const APInt &A2, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isProperty()) {
+    if (LS1.properties() & ConstantProperties::Zero) {
+      const Constant *C = intToConst(A2);
+      Result.add(C);
+      return !Result.isBottom();
+    }
+    return false;
+  }
+
+  APInt A, XA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateXORii(A, A2, XA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(XA);
+    Result.add(C);
+  }
+  return !Result.isBottom();
+}
+
+bool MachineConstEvaluator::evaluateXORii(const APInt &A1,
+      const APInt &A2, APInt &Result) {
+  Result = A1 ^ A2;
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateZEXTr(const Register &R1, unsigned Width,
+      unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isProperty())
+    return false;
+
+  APInt A, XA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateZEXTi(A, Width, Bits, XA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(XA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateZEXTi(const APInt &A1, unsigned Width,
+      unsigned Bits, APInt &Result) {
+  unsigned BW = A1.getBitWidth();
+  (void)BW;
+  assert(Width >= Bits && BW >= Bits);
+  APInt Mask = APInt::getLowBitsSet(Width, Bits);
+  Result = A1.zextOrTrunc(Width) & Mask;
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateSEXTr(const Register &R1, unsigned Width,
+      unsigned Bits, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, XA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateSEXTi(A, Width, Bits, XA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(XA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateSEXTi(const APInt &A1, unsigned Width,
+      unsigned Bits, APInt &Result) {
+  unsigned BW = A1.getBitWidth();
+  assert(Width >= Bits && BW >= Bits);
+  // Special case to make things faster for smaller source widths.
+  // Sign extension of 0 bits generates 0 as a result. This is consistent
+  // with what the HW does.
+  if (Bits == 0) {
+    Result = APInt(Width, 0);
+    return true;
+  }
+  // In C, shifts by 64 invoke undefined behavior: handle that case in APInt.
+  if (BW <= 64 && Bits != 0) {
+    int64_t V = A1.getSExtValue();
+    switch (Bits) {
+      case 8:
+        V = static_cast<int8_t>(V);
+        break;
+      case 16:
+        V = static_cast<int16_t>(V);
+        break;
+      case 32:
+        V = static_cast<int32_t>(V);
+        break;
+      default:
+        // Shift left to lose all bits except lower "Bits" bits, then shift
+        // the value back, replicating what was a sign bit after the first
+        // shift.
+        V = (V << (64-Bits)) >> (64-Bits);
+        break;
+    }
+    // V is a 64-bit sign-extended value. Convert it to APInt of desired
+    // width.
+    Result = APInt(Width, V, true);
+    return true;
+  }
+  // Slow case: the value doesn't fit in int64_t.
+  if (Bits < BW)
+    Result = A1.trunc(Bits).sext(Width);
+  else // Bits == BW
+    Result = A1.sext(Width);
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateCLBr(const Register &R1, bool Zeros,
+      bool Ones, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, CA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateCLBi(A, Zeros, Ones, CA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(CA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateCLBi(const APInt &A1, bool Zeros,
+      bool Ones, APInt &Result) {
+  unsigned BW = A1.getBitWidth();
+  if (!Zeros && !Ones)
+    return false;
+  unsigned Count = 0;
+  if (Zeros && (Count == 0))
+    Count = A1.countLeadingZeros();
+  if (Ones && (Count == 0))
+    Count = A1.countLeadingOnes();
+  Result = APInt(BW, static_cast<uint64_t>(Count), false);
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateCTBr(const Register &R1, bool Zeros,
+      bool Ones, const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, CA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateCTBi(A, Zeros, Ones, CA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(CA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateCTBi(const APInt &A1, bool Zeros,
+      bool Ones, APInt &Result) {
+  unsigned BW = A1.getBitWidth();
+  if (!Zeros && !Ones)
+    return false;
+  unsigned Count = 0;
+  if (Zeros && (Count == 0))
+    Count = A1.countTrailingZeros();
+  if (Ones && (Count == 0))
+    Count = A1.countTrailingOnes();
+  Result = APInt(BW, static_cast<uint64_t>(Count), false);
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateEXTRACTr(const Register &R1,
+      unsigned Width, unsigned Bits, unsigned Offset, bool Signed,
+      const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  assert(Bits+Offset <= Width);
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom())
+    return false;
+  if (LS1.isProperty()) {
+    uint32_t Ps = LS1.properties();
+    if (Ps & ConstantProperties::Zero) {
+      const Constant *C = intToConst(APInt(Width, 0, false));
+      Result.add(C);
+      return true;
+    }
+    return false;
+  }
+
+  APInt A, CA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateEXTRACTi(A, Bits, Offset, Signed, CA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(CA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateEXTRACTi(const APInt &A1, unsigned Bits,
+      unsigned Offset, bool Signed, APInt &Result) {
+  unsigned BW = A1.getBitWidth();
+  assert(Bits+Offset <= BW);
+  // Extracting 0 bits generates 0 as a result (as indicated by the HW people).
+  if (Bits == 0) {
+    Result = APInt(BW, 0);
+    return true;
+  }
+  if (BW <= 64) {
+    int64_t V = A1.getZExtValue();
+    V <<= (64-Bits-Offset);
+    if (Signed)
+      V >>= (64-Bits);
+    else
+      V = static_cast<uint64_t>(V) >> (64-Bits);
+    Result = APInt(BW, V, Signed);
+    return true;
+  }
+  if (Signed)
+    Result = A1.shl(BW-Bits-Offset).ashr(BW-Bits);
+  else
+    Result = A1.shl(BW-Bits-Offset).lshr(BW-Bits);
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateSplatr(const Register &R1,
+      unsigned Bits, unsigned Count, const CellMap &Inputs,
+      LatticeCell &Result) {
+  assert(Inputs.has(R1.Reg));
+  LatticeCell LS1;
+  if (!getCell(R1, Inputs, LS1))
+    return false;
+  if (LS1.isBottom() || LS1.isProperty())
+    return false;
+
+  APInt A, SA;
+  for (unsigned i = 0; i < LS1.size(); ++i) {
+    bool Eval = constToInt(LS1.Values[i], A) &&
+                evaluateSplati(A, Bits, Count, SA);
+    if (!Eval)
+      return false;
+    const Constant *C = intToConst(SA);
+    Result.add(C);
+  }
+  return true;
+}
+
+bool MachineConstEvaluator::evaluateSplati(const APInt &A1, unsigned Bits,
+      unsigned Count, APInt &Result) {
+  assert(Count > 0);
+  unsigned BW = A1.getBitWidth(), SW = Count*Bits;
+  APInt LoBits = (Bits < BW) ? A1.trunc(Bits) : A1.zextOrSelf(Bits);
+  if (Count > 1)
+    LoBits = LoBits.zext(SW);
+
+  APInt Res(SW, 0, false);
+  for (unsigned i = 0; i < Count; ++i) {
+    Res <<= Bits;
+    Res |= LoBits;
+  }
+  Result = Res;
+  return true;
+}
+
+// ----------------------------------------------------------------------
+// Hexagon-specific code.
+
+namespace llvm {
+
+  FunctionPass *createHexagonConstPropagationPass();
+  void initializeHexagonConstPropagationPass(PassRegistry &Registry);
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonConstEvaluator : public MachineConstEvaluator {
+  public:
+    HexagonConstEvaluator(MachineFunction &Fn);
+
+    bool evaluate(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs) override;
+    bool evaluate(const Register &R, const LatticeCell &SrcC,
+          LatticeCell &Result) override;
+    bool evaluate(const MachineInstr &BrI, const CellMap &Inputs,
+          SetVector<const MachineBasicBlock*> &Targets, bool &FallsThru)
+          override;
+    bool rewrite(MachineInstr &MI, const CellMap &Inputs) override;
+
+  private:
+    unsigned getRegBitWidth(unsigned Reg) const;
+
+    static uint32_t getCmp(unsigned Opc);
+    static APInt getCmpImm(unsigned Opc, unsigned OpX,
+          const MachineOperand &MO);
+    void replaceWithNop(MachineInstr &MI);
+
+    bool evaluateHexRSEQ32(Register RL, Register RH, const CellMap &Inputs,
+          LatticeCell &Result);
+    bool evaluateHexCompare(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+    // This is suitable to be called for compare-and-jump instructions.
+    bool evaluateHexCompare2(uint32_t Cmp, const MachineOperand &Src1,
+          const MachineOperand &Src2, const CellMap &Inputs, bool &Result);
+    bool evaluateHexLogical(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+    bool evaluateHexCondMove(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+    bool evaluateHexExt(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+    bool evaluateHexVector1(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+    bool evaluateHexVector2(const MachineInstr &MI, const CellMap &Inputs,
+          CellMap &Outputs);
+
+    void replaceAllRegUsesWith(unsigned FromReg, unsigned ToReg);
+    bool rewriteHexBranch(MachineInstr &BrI, const CellMap &Inputs);
+    bool rewriteHexConstDefs(MachineInstr &MI, const CellMap &Inputs,
+          bool &AllDefs);
+    bool rewriteHexConstUses(MachineInstr &MI, const CellMap &Inputs);
+
+    MachineRegisterInfo *MRI;
+    const HexagonInstrInfo &HII;
+    const HexagonRegisterInfo &HRI;
+  };
+
+  class HexagonConstPropagation : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonConstPropagation() : MachineFunctionPass(ID) {
+      PassRegistry &Registry = *PassRegistry::getPassRegistry();
+      initializeHexagonConstPropagationPass(Registry);
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon Constant Propagation";
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      const Function *F = MF.getFunction();
+      if (!F)
+        return false;
+      if (skipFunction(*F))
+        return false;
+
+      HexagonConstEvaluator HCE(MF);
+      return MachineConstPropagator(HCE).run(MF);
+    }
+  };
+
+  char HexagonConstPropagation::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonConstPropagation, "hcp", "Hexagon Constant Propagation",
+                false, false)
+
+HexagonConstEvaluator::HexagonConstEvaluator(MachineFunction &Fn)
+  : MachineConstEvaluator(Fn),
+    HII(*Fn.getSubtarget<HexagonSubtarget>().getInstrInfo()),
+    HRI(*Fn.getSubtarget<HexagonSubtarget>().getRegisterInfo()) {
+  MRI = &Fn.getRegInfo();
+}
+
+bool HexagonConstEvaluator::evaluate(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  if (MI.isCall())
+    return false;
+  if (MI.getNumOperands() == 0 || !MI.getOperand(0).isReg())
+    return false;
+  const MachineOperand &MD = MI.getOperand(0);
+  if (!MD.isDef())
+    return false;
+
+  unsigned Opc = MI.getOpcode();
+  Register DefR(MD);
+  assert(!DefR.SubReg);
+  if (!TargetRegisterInfo::isVirtualRegister(DefR.Reg))
+    return false;
+
+  if (MI.isCopy()) {
+    LatticeCell RC;
+    Register SrcR(MI.getOperand(1));
+    bool Eval = evaluateCOPY(SrcR, Inputs, RC);
+    if (!Eval)
+      return false;
+    Outputs.update(DefR.Reg, RC);
+    return true;
+  }
+  if (MI.isRegSequence()) {
+    unsigned Sub1 = MI.getOperand(2).getImm();
+    unsigned Sub2 = MI.getOperand(4).getImm();
+    const TargetRegisterClass *DefRC = MRI->getRegClass(DefR.Reg);
+    unsigned SubLo = HRI.getHexagonSubRegIndex(DefRC, Hexagon::ps_sub_lo);
+    unsigned SubHi = HRI.getHexagonSubRegIndex(DefRC, Hexagon::ps_sub_hi);
+    if (Sub1 != SubLo && Sub1 != SubHi)
+      return false;
+    if (Sub2 != SubLo && Sub2 != SubHi)
+      return false;
+    assert(Sub1 != Sub2);
+    bool LoIs1 = (Sub1 == SubLo);
+    const MachineOperand &OpLo = LoIs1 ? MI.getOperand(1) : MI.getOperand(3);
+    const MachineOperand &OpHi = LoIs1 ? MI.getOperand(3) : MI.getOperand(1);
+    LatticeCell RC;
+    Register SrcRL(OpLo), SrcRH(OpHi);
+    bool Eval = evaluateHexRSEQ32(SrcRL, SrcRH, Inputs, RC);
+    if (!Eval)
+      return false;
+    Outputs.update(DefR.Reg, RC);
+    return true;
+  }
+  if (MI.isCompare()) {
+    bool Eval = evaluateHexCompare(MI, Inputs, Outputs);
+    return Eval;
+  }
+
+  switch (Opc) {
+    default:
+      return false;
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64:
+    {
+      const MachineOperand &VO = MI.getOperand(1);
+      // The operand of CONST32 can be a blockaddress, e.g.
+      //   %vreg0<def> = CONST32 <blockaddress(@eat, %L)>
+      // Do this check for all instructions for safety.
+      if (!VO.isImm())
+        return false;
+      int64_t V = MI.getOperand(1).getImm();
+      unsigned W = getRegBitWidth(DefR.Reg);
+      if (W != 32 && W != 64)
+        return false;
+      IntegerType *Ty = (W == 32) ? Type::getInt32Ty(CX)
+                                  : Type::getInt64Ty(CX);
+      const ConstantInt *CI = ConstantInt::get(Ty, V, true);
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      RC.add(CI);
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::PS_true:
+    case Hexagon::PS_false:
+    {
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      bool NonZero = (Opc == Hexagon::PS_true);
+      uint32_t P = NonZero ? ConstantProperties::NonZero
+                           : ConstantProperties::Zero;
+      RC.add(P);
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::A2_and:
+    case Hexagon::A2_andir:
+    case Hexagon::A2_andp:
+    case Hexagon::A2_or:
+    case Hexagon::A2_orir:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_xor:
+    case Hexagon::A2_xorp:
+    {
+      bool Eval = evaluateHexLogical(MI, Inputs, Outputs);
+      if (!Eval)
+        return false;
+      break;
+    }
+
+    case Hexagon::A2_combineii:  // combine(#s8Ext, #s8)
+    case Hexagon::A4_combineii:  // combine(#s8, #u6Ext)
+    {
+      uint64_t Hi = MI.getOperand(1).getImm();
+      uint64_t Lo = MI.getOperand(2).getImm();
+      uint64_t Res = (Hi << 32) | (Lo & 0xFFFFFFFF);
+      IntegerType *Ty = Type::getInt64Ty(CX);
+      const ConstantInt *CI = ConstantInt::get(Ty, Res, false);
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      RC.add(CI);
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::S2_setbit_i:
+    {
+      int64_t B = MI.getOperand(2).getImm();
+      assert(B >=0 && B < 32);
+      APInt A(32, (1ull << B), false);
+      Register R(MI.getOperand(1));
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      bool Eval = evaluateORri(R, A, Inputs, RC);
+      if (!Eval)
+        return false;
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::C2_mux:
+    case Hexagon::C2_muxir:
+    case Hexagon::C2_muxri:
+    case Hexagon::C2_muxii:
+    {
+      bool Eval = evaluateHexCondMove(MI, Inputs, Outputs);
+      if (!Eval)
+        return false;
+      break;
+    }
+
+    case Hexagon::A2_sxtb:
+    case Hexagon::A2_sxth:
+    case Hexagon::A2_sxtw:
+    case Hexagon::A2_zxtb:
+    case Hexagon::A2_zxth:
+    {
+      bool Eval = evaluateHexExt(MI, Inputs, Outputs);
+      if (!Eval)
+        return false;
+      break;
+    }
+
+    case Hexagon::S2_ct0:
+    case Hexagon::S2_ct0p:
+    case Hexagon::S2_ct1:
+    case Hexagon::S2_ct1p:
+    {
+      using namespace Hexagon;
+
+      bool Ones = (Opc == S2_ct1) || (Opc == S2_ct1p);
+      Register R1(MI.getOperand(1));
+      assert(Inputs.has(R1.Reg));
+      LatticeCell T;
+      bool Eval = evaluateCTBr(R1, !Ones, Ones, Inputs, T);
+      if (!Eval)
+        return false;
+      // All of these instructions return a 32-bit value. The evaluate
+      // will generate the same type as the operand, so truncate the
+      // result if necessary.
+      APInt C;
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      for (unsigned i = 0; i < T.size(); ++i) {
+        const Constant *CI = T.Values[i];
+        if (constToInt(CI, C) && C.getBitWidth() > 32)
+          CI = intToConst(C.trunc(32));
+        RC.add(CI);
+      }
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::S2_cl0:
+    case Hexagon::S2_cl0p:
+    case Hexagon::S2_cl1:
+    case Hexagon::S2_cl1p:
+    case Hexagon::S2_clb:
+    case Hexagon::S2_clbp:
+    {
+      using namespace Hexagon;
+
+      bool OnlyZeros = (Opc == S2_cl0) || (Opc == S2_cl0p);
+      bool OnlyOnes =  (Opc == S2_cl1) || (Opc == S2_cl1p);
+      Register R1(MI.getOperand(1));
+      assert(Inputs.has(R1.Reg));
+      LatticeCell T;
+      bool Eval = evaluateCLBr(R1, !OnlyOnes, !OnlyZeros, Inputs, T);
+      if (!Eval)
+        return false;
+      // All of these instructions return a 32-bit value. The evaluate
+      // will generate the same type as the operand, so truncate the
+      // result if necessary.
+      APInt C;
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      for (unsigned i = 0; i < T.size(); ++i) {
+        const Constant *CI = T.Values[i];
+        if (constToInt(CI, C) && C.getBitWidth() > 32)
+          CI = intToConst(C.trunc(32));
+        RC.add(CI);
+      }
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::S4_extract:
+    case Hexagon::S4_extractp:
+    case Hexagon::S2_extractu:
+    case Hexagon::S2_extractup:
+    {
+      bool Signed = (Opc == Hexagon::S4_extract) ||
+                    (Opc == Hexagon::S4_extractp);
+      Register R1(MI.getOperand(1));
+      unsigned BW = getRegBitWidth(R1.Reg);
+      unsigned Bits = MI.getOperand(2).getImm();
+      unsigned Offset = MI.getOperand(3).getImm();
+      LatticeCell RC = Outputs.get(DefR.Reg);
+      if (Offset >= BW) {
+        APInt Zero(BW, 0, false);
+        RC.add(intToConst(Zero));
+        break;
+      }
+      if (Offset+Bits > BW) {
+        // If the requested bitfield extends beyond the most significant bit,
+        // the extra bits are treated as 0s. To emulate this behavior, reduce
+        // the number of requested bits, and make the extract unsigned.
+        Bits = BW-Offset;
+        Signed = false;
+      }
+      bool Eval = evaluateEXTRACTr(R1, BW, Bits, Offset, Signed, Inputs, RC);
+      if (!Eval)
+        return false;
+      Outputs.update(DefR.Reg, RC);
+      break;
+    }
+
+    case Hexagon::S2_vsplatrb:
+    case Hexagon::S2_vsplatrh:
+    // vabsh, vabsh:sat
+    // vabsw, vabsw:sat
+    // vconj:sat
+    // vrndwh, vrndwh:sat
+    // vsathb, vsathub, vsatwuh
+    // vsxtbh, vsxthw
+    // vtrunehb, vtrunohb
+    // vzxtbh, vzxthw
+    {
+      bool Eval = evaluateHexVector1(MI, Inputs, Outputs);
+      if (!Eval)
+        return false;
+      break;
+    }
+
+    // TODO:
+    // A2_vaddh
+    // A2_vaddhs
+    // A2_vaddw
+    // A2_vaddws
+  }
+
+  return true;
+}
+
+bool HexagonConstEvaluator::evaluate(const Register &R,
+      const LatticeCell &Input, LatticeCell &Result) {
+  if (!R.SubReg) {
+    Result = Input;
+    return true;
+  }
+  const TargetRegisterClass *RC = MRI->getRegClass(R.Reg);
+  if (RC != &Hexagon::DoubleRegsRegClass)
+    return false;
+  if (R.SubReg != Hexagon::isub_lo && R.SubReg != Hexagon::isub_hi)
+    return false;
+
+  assert(!Input.isTop());
+  if (Input.isBottom())
+    return false;
+
+  typedef ConstantProperties P;
+  if (Input.isProperty()) {
+    uint32_t Ps = Input.properties();
+    if (Ps & (P::Zero|P::NaN)) {
+      uint32_t Ns = (Ps & (P::Zero|P::NaN|P::SignProperties));
+      Result.add(Ns);
+      return true;
+    }
+    if (R.SubReg == Hexagon::isub_hi) {
+      uint32_t Ns = (Ps & P::SignProperties);
+      Result.add(Ns);
+      return true;
+    }
+    return false;
+  }
+
+  // The Input cell contains some known values. Pick the word corresponding
+  // to the subregister.
+  APInt A;
+  for (unsigned i = 0; i < Input.size(); ++i) {
+    const Constant *C = Input.Values[i];
+    if (!constToInt(C, A))
+      return false;
+    if (!A.isIntN(64))
+      return false;
+    uint64_t U = A.getZExtValue();
+    if (R.SubReg == Hexagon::isub_hi)
+      U >>= 32;
+    U &= 0xFFFFFFFFULL;
+    uint32_t U32 = Lo_32(U);
+    int32_t V32;
+    memcpy(&V32, &U32, sizeof V32);
+    IntegerType *Ty = Type::getInt32Ty(CX);
+    const ConstantInt *C32 = ConstantInt::get(Ty, static_cast<int64_t>(V32));
+    Result.add(C32);
+  }
+  return true;
+}
+
+bool HexagonConstEvaluator::evaluate(const MachineInstr &BrI,
+      const CellMap &Inputs, SetVector<const MachineBasicBlock*> &Targets,
+      bool &FallsThru) {
+  // We need to evaluate one branch at a time. TII::analyzeBranch checks
+  // all the branches in a basic block at once, so we cannot use it.
+  unsigned Opc = BrI.getOpcode();
+  bool SimpleBranch = false;
+  bool Negated = false;
+  switch (Opc) {
+    case Hexagon::J2_jumpf:
+    case Hexagon::J2_jumpfnew:
+    case Hexagon::J2_jumpfnewpt:
+      Negated = true;
+    case Hexagon::J2_jumpt:
+    case Hexagon::J2_jumptnew:
+    case Hexagon::J2_jumptnewpt:
+      // Simple branch:  if([!]Pn) jump ...
+      // i.e. Op0 = predicate, Op1 = branch target.
+      SimpleBranch = true;
+      break;
+    case Hexagon::J2_jump:
+      Targets.insert(BrI.getOperand(0).getMBB());
+      FallsThru = false;
+      return true;
+    default:
+Undetermined:
+      // If the branch is of unknown type, assume that all successors are
+      // executable.
+      FallsThru = !BrI.isUnconditionalBranch();
+      return false;
+  }
+
+  if (SimpleBranch) {
+    const MachineOperand &MD = BrI.getOperand(0);
+    Register PR(MD);
+    // If the condition operand has a subregister, this is not something
+    // we currently recognize.
+    if (PR.SubReg)
+      goto Undetermined;
+    assert(Inputs.has(PR.Reg));
+    const LatticeCell &PredC = Inputs.get(PR.Reg);
+    if (PredC.isBottom())
+      goto Undetermined;
+
+    uint32_t Props = PredC.properties();
+    bool CTrue = false, CFalse = false;;
+    if (Props & ConstantProperties::Zero)
+      CFalse = true;
+    else if (Props & ConstantProperties::NonZero)
+      CTrue = true;
+    // If the condition is not known to be either, bail out.
+    if (!CTrue && !CFalse)
+      goto Undetermined;
+
+    const MachineBasicBlock *BranchTarget = BrI.getOperand(1).getMBB();
+
+    FallsThru = false;
+    if ((!Negated && CTrue) || (Negated && CFalse))
+      Targets.insert(BranchTarget);
+    else if ((!Negated && CFalse) || (Negated && CTrue))
+      FallsThru = true;
+    else
+      goto Undetermined;
+  }
+
+  return true;
+}
+
+bool HexagonConstEvaluator::rewrite(MachineInstr &MI, const CellMap &Inputs) {
+  if (MI.isBranch())
+    return rewriteHexBranch(MI, Inputs);
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    default:
+      break;
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64:
+    case Hexagon::PS_true:
+    case Hexagon::PS_false:
+      return false;
+  }
+
+  unsigned NumOp = MI.getNumOperands();
+  if (NumOp == 0)
+    return false;
+
+  bool AllDefs, Changed;
+  Changed = rewriteHexConstDefs(MI, Inputs, AllDefs);
+  // If not all defs have been rewritten (i.e. the instruction defines
+  // a register that is not compile-time constant), then try to rewrite
+  // register operands that are known to be constant with immediates.
+  if (!AllDefs)
+    Changed |= rewriteHexConstUses(MI, Inputs);
+
+  return Changed;
+}
+
+unsigned HexagonConstEvaluator::getRegBitWidth(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC))
+    return 32;
+  if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC))
+    return 64;
+  if (Hexagon::PredRegsRegClass.hasSubClassEq(RC))
+    return 8;
+  llvm_unreachable("Invalid register");
+  return 0;
+}
+
+uint32_t HexagonConstEvaluator::getCmp(unsigned Opc) {
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::J4_cmpeqn1_t_jumpnv_nt:
+    case Hexagon::J4_cmpeqn1_t_jumpnv_t:
+    case Hexagon::J4_cmpeqi_t_jumpnv_nt:
+    case Hexagon::J4_cmpeqi_t_jumpnv_t:
+    case Hexagon::J4_cmpeq_t_jumpnv_nt:
+    case Hexagon::J4_cmpeq_t_jumpnv_t:
+      return Comparison::EQ;
+
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
+    case Hexagon::J4_cmpeqn1_f_jumpnv_t:
+    case Hexagon::J4_cmpeqi_f_jumpnv_nt:
+    case Hexagon::J4_cmpeqi_f_jumpnv_t:
+    case Hexagon::J4_cmpeq_f_jumpnv_nt:
+    case Hexagon::J4_cmpeq_f_jumpnv_t:
+      return Comparison::NE;
+
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::J4_cmpgtn1_t_jumpnv_nt:
+    case Hexagon::J4_cmpgtn1_t_jumpnv_t:
+    case Hexagon::J4_cmpgti_t_jumpnv_nt:
+    case Hexagon::J4_cmpgti_t_jumpnv_t:
+    case Hexagon::J4_cmpgt_t_jumpnv_nt:
+    case Hexagon::J4_cmpgt_t_jumpnv_t:
+      return Comparison::GTs;
+
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::J4_cmpgtn1_f_jumpnv_nt:
+    case Hexagon::J4_cmpgtn1_f_jumpnv_t:
+    case Hexagon::J4_cmpgti_f_jumpnv_nt:
+    case Hexagon::J4_cmpgti_f_jumpnv_t:
+    case Hexagon::J4_cmpgt_f_jumpnv_nt:
+    case Hexagon::J4_cmpgt_f_jumpnv_t:
+      return Comparison::LEs;
+
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmphgtui:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::J4_cmpgtui_t_jumpnv_nt:
+    case Hexagon::J4_cmpgtui_t_jumpnv_t:
+    case Hexagon::J4_cmpgtu_t_jumpnv_nt:
+    case Hexagon::J4_cmpgtu_t_jumpnv_t:
+      return Comparison::GTu;
+
+    case Hexagon::J4_cmpltu_f_jumpnv_nt:
+    case Hexagon::J4_cmpltu_f_jumpnv_t:
+      return Comparison::GEu;
+
+    case Hexagon::J4_cmpltu_t_jumpnv_nt:
+    case Hexagon::J4_cmpltu_t_jumpnv_t:
+      return Comparison::LTu;
+
+    case Hexagon::J4_cmplt_f_jumpnv_nt:
+    case Hexagon::J4_cmplt_f_jumpnv_t:
+      return Comparison::GEs;
+
+    case Hexagon::C4_cmplteu:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::J4_cmpgtui_f_jumpnv_nt:
+    case Hexagon::J4_cmpgtui_f_jumpnv_t:
+    case Hexagon::J4_cmpgtu_f_jumpnv_nt:
+    case Hexagon::J4_cmpgtu_f_jumpnv_t:
+      return Comparison::LEu;
+
+    case Hexagon::J4_cmplt_t_jumpnv_nt:
+    case Hexagon::J4_cmplt_t_jumpnv_t:
+      return Comparison::LTs;
+
+    default:
+      break;
+  }
+  return Comparison::Unk;
+}
+
+APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX,
+      const MachineOperand &MO) {
+  bool Signed = false;
+  switch (Opc) {
+    case Hexagon::A4_cmpbgtui:   // u7
+    case Hexagon::A4_cmphgtui:   // u7
+      break;
+    case Hexagon::A4_cmpheqi:    // s8
+    case Hexagon::C4_cmpneqi:   // s8
+      Signed = true;
+    case Hexagon::A4_cmpbeqi:    // u8
+      break;
+    case Hexagon::C2_cmpgtui:      // u9
+    case Hexagon::C4_cmplteui:  // u9
+      break;
+    case Hexagon::C2_cmpeqi:       // s10
+    case Hexagon::C2_cmpgti:       // s10
+    case Hexagon::C4_cmpltei:   // s10
+      Signed = true;
+      break;
+    case Hexagon::J4_cmpeqi_f_jumpnv_nt:   // u5
+    case Hexagon::J4_cmpeqi_f_jumpnv_t:    // u5
+    case Hexagon::J4_cmpeqi_t_jumpnv_nt:   // u5
+    case Hexagon::J4_cmpeqi_t_jumpnv_t:    // u5
+    case Hexagon::J4_cmpgti_f_jumpnv_nt:   // u5
+    case Hexagon::J4_cmpgti_f_jumpnv_t:    // u5
+    case Hexagon::J4_cmpgti_t_jumpnv_nt:   // u5
+    case Hexagon::J4_cmpgti_t_jumpnv_t:    // u5
+    case Hexagon::J4_cmpgtui_f_jumpnv_nt:  // u5
+    case Hexagon::J4_cmpgtui_f_jumpnv_t:   // u5
+    case Hexagon::J4_cmpgtui_t_jumpnv_nt:  // u5
+    case Hexagon::J4_cmpgtui_t_jumpnv_t:   // u5
+      break;
+    default:
+      llvm_unreachable("Unhandled instruction");
+      break;
+  }
+
+  uint64_t Val = MO.getImm();
+  return APInt(32, Val, Signed);
+}
+
+void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) {
+  MI.setDesc(HII.get(Hexagon::A2_nop));
+  while (MI.getNumOperands() > 0)
+    MI.RemoveOperand(0);
+}
+
+bool HexagonConstEvaluator::evaluateHexRSEQ32(Register RL, Register RH,
+      const CellMap &Inputs, LatticeCell &Result) {
+  assert(Inputs.has(RL.Reg) && Inputs.has(RH.Reg));
+  LatticeCell LSL, LSH;
+  if (!getCell(RL, Inputs, LSL) || !getCell(RH, Inputs, LSH))
+    return false;
+  if (LSL.isProperty() || LSH.isProperty())
+    return false;
+
+  unsigned LN = LSL.size(), HN = LSH.size();
+  SmallVector<APInt,4> LoVs(LN), HiVs(HN);
+  for (unsigned i = 0; i < LN; ++i) {
+    bool Eval = constToInt(LSL.Values[i], LoVs[i]);
+    if (!Eval)
+      return false;
+    assert(LoVs[i].getBitWidth() == 32);
+  }
+  for (unsigned i = 0; i < HN; ++i) {
+    bool Eval = constToInt(LSH.Values[i], HiVs[i]);
+    if (!Eval)
+      return false;
+    assert(HiVs[i].getBitWidth() == 32);
+  }
+
+  for (unsigned i = 0; i < HiVs.size(); ++i) {
+    APInt HV = HiVs[i].zextOrSelf(64) << 32;
+    for (unsigned j = 0; j < LoVs.size(); ++j) {
+      APInt LV = LoVs[j].zextOrSelf(64);
+      const Constant *C = intToConst(HV | LV);
+      Result.add(C);
+      if (Result.isBottom())
+        return false;
+    }
+  }
+  return !Result.isBottom();
+}
+
+bool HexagonConstEvaluator::evaluateHexCompare(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  unsigned Opc = MI.getOpcode();
+  bool Classic = false;
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+      // Classic compare:  Dst0 = CMP Src1, Src2
+      Classic = true;
+      break;
+    default:
+      // Not handling other compare instructions now.
+      return false;
+  }
+
+  if (Classic) {
+    const MachineOperand &Src1 = MI.getOperand(1);
+    const MachineOperand &Src2 = MI.getOperand(2);
+
+    bool Result;
+    unsigned Opc = MI.getOpcode();
+    bool Computed = evaluateHexCompare2(Opc, Src1, Src2, Inputs, Result);
+    if (Computed) {
+      // Only create a zero/non-zero cell. At this time there isn't really
+      // much need for specific values.
+      Register DefR(MI.getOperand(0));
+      LatticeCell L = Outputs.get(DefR.Reg);
+      uint32_t P = Result ? ConstantProperties::NonZero
+                          : ConstantProperties::Zero;
+      L.add(P);
+      Outputs.update(DefR.Reg, L);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool HexagonConstEvaluator::evaluateHexCompare2(unsigned Opc,
+      const MachineOperand &Src1, const MachineOperand &Src2,
+      const CellMap &Inputs, bool &Result) {
+  uint32_t Cmp = getCmp(Opc);
+  bool Reg1 = Src1.isReg(), Reg2 = Src2.isReg();
+  bool Imm1 = Src1.isImm(), Imm2 = Src2.isImm();
+  if (Reg1) {
+    Register R1(Src1);
+    if (Reg2) {
+      Register R2(Src2);
+      return evaluateCMPrr(Cmp, R1, R2, Inputs, Result);
+    } else if (Imm2) {
+      APInt A2 = getCmpImm(Opc, 2, Src2);
+      return evaluateCMPri(Cmp, R1, A2, Inputs, Result);
+    }
+  } else if (Imm1) {
+    APInt A1 = getCmpImm(Opc, 1, Src1);
+    if (Reg2) {
+      Register R2(Src2);
+      uint32_t NegCmp = Comparison::negate(Cmp);
+      return evaluateCMPri(NegCmp, R2, A1, Inputs, Result);
+    } else if (Imm2) {
+      APInt A2 = getCmpImm(Opc, 2, Src2);
+      return evaluateCMPii(Cmp, A1, A2, Result);
+    }
+  }
+  // Unknown kind of comparison.
+  return false;
+}
+
+bool HexagonConstEvaluator::evaluateHexLogical(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  unsigned Opc = MI.getOpcode();
+  if (MI.getNumOperands() != 3)
+    return false;
+  const MachineOperand &Src1 = MI.getOperand(1);
+  const MachineOperand &Src2 = MI.getOperand(2);
+  Register R1(Src1);
+  bool Eval = false;
+  LatticeCell RC;
+  switch (Opc) {
+    default:
+      return false;
+    case Hexagon::A2_and:
+    case Hexagon::A2_andp:
+      Eval = evaluateANDrr(R1, Register(Src2), Inputs, RC);
+      break;
+    case Hexagon::A2_andir: {
+      APInt A(32, Src2.getImm(), true);
+      Eval = evaluateANDri(R1, A, Inputs, RC);
+      break;
+    }
+    case Hexagon::A2_or:
+    case Hexagon::A2_orp:
+      Eval = evaluateORrr(R1, Register(Src2), Inputs, RC);
+      break;
+    case Hexagon::A2_orir: {
+      APInt A(32, Src2.getImm(), true);
+      Eval = evaluateORri(R1, A, Inputs, RC);
+      break;
+    }
+    case Hexagon::A2_xor:
+    case Hexagon::A2_xorp:
+      Eval = evaluateXORrr(R1, Register(Src2), Inputs, RC);
+      break;
+  }
+  if (Eval) {
+    Register DefR(MI.getOperand(0));
+    Outputs.update(DefR.Reg, RC);
+  }
+  return Eval;
+}
+
+bool HexagonConstEvaluator::evaluateHexCondMove(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  // Dst0 = Cond1 ? Src2 : Src3
+  Register CR(MI.getOperand(1));
+  assert(Inputs.has(CR.Reg));
+  LatticeCell LS;
+  if (!getCell(CR, Inputs, LS))
+    return false;
+  uint32_t Ps = LS.properties();
+  unsigned TakeOp;
+  if (Ps & ConstantProperties::Zero)
+    TakeOp = 3;
+  else if (Ps & ConstantProperties::NonZero)
+    TakeOp = 2;
+  else
+    return false;
+
+  const MachineOperand &ValOp = MI.getOperand(TakeOp);
+  Register DefR(MI.getOperand(0));
+  LatticeCell RC = Outputs.get(DefR.Reg);
+
+  if (ValOp.isImm()) {
+    int64_t V = ValOp.getImm();
+    unsigned W = getRegBitWidth(DefR.Reg);
+    APInt A(W, V, true);
+    const Constant *C = intToConst(A);
+    RC.add(C);
+    Outputs.update(DefR.Reg, RC);
+    return true;
+  }
+  if (ValOp.isReg()) {
+    Register R(ValOp);
+    const LatticeCell &LR = Inputs.get(R.Reg);
+    LatticeCell LSR;
+    if (!evaluate(R, LR, LSR))
+      return false;
+    RC.meet(LSR);
+    Outputs.update(DefR.Reg, RC);
+    return true;
+  }
+  return false;
+}
+
+bool HexagonConstEvaluator::evaluateHexExt(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  // Dst0 = ext R1
+  Register R1(MI.getOperand(1));
+  assert(Inputs.has(R1.Reg));
+
+  unsigned Opc = MI.getOpcode();
+  unsigned Bits;
+  switch (Opc) {
+    case Hexagon::A2_sxtb:
+    case Hexagon::A2_zxtb:
+      Bits = 8;
+      break;
+    case Hexagon::A2_sxth:
+    case Hexagon::A2_zxth:
+      Bits = 16;
+      break;
+    case Hexagon::A2_sxtw:
+      Bits = 32;
+      break;
+  }
+
+  bool Signed = false;
+  switch (Opc) {
+    case Hexagon::A2_sxtb:
+    case Hexagon::A2_sxth:
+    case Hexagon::A2_sxtw:
+      Signed = true;
+      break;
+  }
+
+  Register DefR(MI.getOperand(0));
+  unsigned BW = getRegBitWidth(DefR.Reg);
+  LatticeCell RC = Outputs.get(DefR.Reg);
+  bool Eval = Signed ? evaluateSEXTr(R1, BW, Bits, Inputs, RC)
+                     : evaluateZEXTr(R1, BW, Bits, Inputs, RC);
+  if (!Eval)
+    return false;
+  Outputs.update(DefR.Reg, RC);
+  return true;
+}
+
+bool HexagonConstEvaluator::evaluateHexVector1(const MachineInstr &MI,
+      const CellMap &Inputs, CellMap &Outputs) {
+  // DefR = op R1
+  Register DefR(MI.getOperand(0));
+  Register R1(MI.getOperand(1));
+  assert(Inputs.has(R1.Reg));
+  LatticeCell RC = Outputs.get(DefR.Reg);
+  bool Eval;
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::S2_vsplatrb:
+      // Rd = 4 times Rs:0..7
+      Eval = evaluateSplatr(R1, 8, 4, Inputs, RC);
+      break;
+    case Hexagon::S2_vsplatrh:
+      // Rdd = 4 times Rs:0..15
+      Eval = evaluateSplatr(R1, 16, 4, Inputs, RC);
+      break;
+    default:
+      return false;
+  }
+
+  if (!Eval)
+    return false;
+  Outputs.update(DefR.Reg, RC);
+  return true;
+}
+
+bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
+      const CellMap &Inputs, bool &AllDefs) {
+  AllDefs = false;
+
+  // Some diagnostics.
+  // DEBUG({...}) gets confused with all this code as an argument.
+#ifndef NDEBUG
+  bool Debugging = DebugFlag && isCurrentDebugType(DEBUG_TYPE);
+  if (Debugging) {
+    bool Const = true, HasUse = false;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
+        continue;
+      Register R(MO);
+      if (!TargetRegisterInfo::isVirtualRegister(R.Reg))
+        continue;
+      HasUse = true;
+      // PHIs can legitimately have "top" cells after propagation.
+      if (!MI.isPHI() && !Inputs.has(R.Reg)) {
+        dbgs() << "Top " << PrintReg(R.Reg, &HRI, R.SubReg)
+               << " in MI: " << MI;
+        continue;
+      }
+      const LatticeCell &L = Inputs.get(R.Reg);
+      Const &= L.isSingle();
+      if (!Const)
+        break;
+    }
+    if (HasUse && Const) {
+      if (!MI.isCopy()) {
+        dbgs() << "CONST: " << MI;
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.isUse() || MO.isImplicit())
+            continue;
+          unsigned R = MO.getReg();
+          dbgs() << PrintReg(R, &TRI) << ": " << Inputs.get(R) << "\n";
+        }
+      }
+    }
+  }
+#endif
+
+  // Avoid generating TFRIs for register transfers---this will keep the
+  // coalescing opportunities.
+  if (MI.isCopy())
+    return false;
+
+  // Collect all virtual register-def operands.
+  SmallVector<unsigned,2> DefRegs;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    assert(!MO.getSubReg());
+    assert(Inputs.has(R));
+    DefRegs.push_back(R);
+  }
+
+  MachineBasicBlock &B = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  unsigned ChangedNum = 0;
+#ifndef NDEBUG
+  SmallVector<const MachineInstr*,4> NewInstrs;
+#endif
+
+  // For each defined register, if it is a constant, create an instruction
+  //   NewR = const
+  // and replace all uses of the defined register with NewR.
+  for (unsigned i = 0, n = DefRegs.size(); i < n; ++i) {
+    unsigned R = DefRegs[i];
+    const LatticeCell &L = Inputs.get(R);
+    if (L.isBottom())
+      continue;
+    const TargetRegisterClass *RC = MRI->getRegClass(R);
+    MachineBasicBlock::iterator At = MI.getIterator();
+
+    if (!L.isSingle()) {
+      // If this a zero/non-zero cell, we can fold a definition
+      // of a predicate register.
+      typedef ConstantProperties P;
+      uint64_t Ps = L.properties();
+      if (!(Ps & (P::Zero|P::NonZero)))
+        continue;
+      const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+      if (RC != PredRC)
+        continue;
+      const MCInstrDesc *NewD = (Ps & P::Zero) ?
+        &HII.get(Hexagon::PS_false) :
+        &HII.get(Hexagon::PS_true);
+      unsigned NewR = MRI->createVirtualRegister(PredRC);
+      const MachineInstrBuilder &MIB = BuildMI(B, At, DL, *NewD, NewR);
+      (void)MIB;
+#ifndef NDEBUG
+      NewInstrs.push_back(&*MIB);
+#endif
+      replaceAllRegUsesWith(R, NewR);
+    } else {
+      // This cell has a single value.
+      APInt A;
+      if (!constToInt(L.Value, A) || !A.isSignedIntN(64))
+        continue;
+      const TargetRegisterClass *NewRC;
+      const MCInstrDesc *NewD;
+
+      unsigned W = getRegBitWidth(R);
+      int64_t V = A.getSExtValue();
+      assert(W == 32 || W == 64);
+      if (W == 32)
+        NewRC = &Hexagon::IntRegsRegClass;
+      else
+        NewRC = &Hexagon::DoubleRegsRegClass;
+      unsigned NewR = MRI->createVirtualRegister(NewRC);
+      const MachineInstr *NewMI;
+
+      if (W == 32) {
+        NewD = &HII.get(Hexagon::A2_tfrsi);
+        NewMI = BuildMI(B, At, DL, *NewD, NewR)
+                  .addImm(V);
+      } else {
+        if (A.isSignedIntN(8)) {
+          NewD = &HII.get(Hexagon::A2_tfrpi);
+          NewMI = BuildMI(B, At, DL, *NewD, NewR)
+                    .addImm(V);
+        } else {
+          int32_t Hi = V >> 32;
+          int32_t Lo = V & 0xFFFFFFFFLL;
+          if (isInt<8>(Hi) && isInt<8>(Lo)) {
+            NewD = &HII.get(Hexagon::A2_combineii);
+            NewMI = BuildMI(B, At, DL, *NewD, NewR)
+                      .addImm(Hi)
+                      .addImm(Lo);
+          } else {
+            NewD = &HII.get(Hexagon::CONST64);
+            NewMI = BuildMI(B, At, DL, *NewD, NewR)
+                      .addImm(V);
+          }
+        }
+      }
+      (void)NewMI;
+#ifndef NDEBUG
+      NewInstrs.push_back(NewMI);
+#endif
+      replaceAllRegUsesWith(R, NewR);
+    }
+    ChangedNum++;
+  }
+
+  DEBUG({
+    if (!NewInstrs.empty()) {
+      MachineFunction &MF = *MI.getParent()->getParent();
+      dbgs() << "In function: " << MF.getFunction()->getName() << "\n";
+      dbgs() << "Rewrite: for " << MI << "  created " << *NewInstrs[0];
+      for (unsigned i = 1; i < NewInstrs.size(); ++i)
+        dbgs() << "          " << *NewInstrs[i];
+    }
+  });
+
+  AllDefs = (ChangedNum == DefRegs.size());
+  return ChangedNum > 0;
+}
+
+bool HexagonConstEvaluator::rewriteHexConstUses(MachineInstr &MI,
+      const CellMap &Inputs) {
+  bool Changed = false;
+  unsigned Opc = MI.getOpcode();
+  MachineBasicBlock &B = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator At = MI.getIterator();
+  MachineInstr *NewMI = nullptr;
+
+  switch (Opc) {
+    case Hexagon::M2_maci:
+    // Convert DefR += mpyi(R2, R3)
+    //   to   DefR += mpyi(R, #imm),
+    //   or   DefR -= mpyi(R, #imm).
+    {
+      Register DefR(MI.getOperand(0));
+      assert(!DefR.SubReg);
+      Register R2(MI.getOperand(2));
+      Register R3(MI.getOperand(3));
+      assert(Inputs.has(R2.Reg) && Inputs.has(R3.Reg));
+      LatticeCell LS2, LS3;
+      // It is enough to get one of the input cells, since we will only try
+      // to replace one argument---whichever happens to be a single constant.
+      bool HasC2 = getCell(R2, Inputs, LS2), HasC3 = getCell(R3, Inputs, LS3);
+      if (!HasC2 && !HasC3)
+        return false;
+      bool Zero = ((HasC2 && (LS2.properties() & ConstantProperties::Zero)) ||
+                   (HasC3 && (LS3.properties() & ConstantProperties::Zero)));
+      // If one of the operands is zero, eliminate the multiplication.
+      if (Zero) {
+        // DefR == R1 (tied operands).
+        MachineOperand &Acc = MI.getOperand(1);
+        Register R1(Acc);
+        unsigned NewR = R1.Reg;
+        if (R1.SubReg) {
+          // Generate COPY. FIXME: Replace with the register:subregister.
+          const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+          NewR = MRI->createVirtualRegister(RC);
+          NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+                    .addReg(R1.Reg, getRegState(Acc), R1.SubReg);
+        }
+        replaceAllRegUsesWith(DefR.Reg, NewR);
+        MRI->clearKillFlags(NewR);
+        Changed = true;
+        break;
+      }
+
+      bool Swap = false;
+      if (!LS3.isSingle()) {
+        if (!LS2.isSingle())
+          return false;
+        Swap = true;
+      }
+      const LatticeCell &LI = Swap ? LS2 : LS3;
+      const MachineOperand &OpR2 = Swap ? MI.getOperand(3)
+                                        : MI.getOperand(2);
+      // LI is single here.
+      APInt A;
+      if (!constToInt(LI.Value, A) || !A.isSignedIntN(8))
+        return false;
+      int64_t V = A.getSExtValue();
+      const MCInstrDesc &D = (V >= 0) ? HII.get(Hexagon::M2_macsip)
+                                      : HII.get(Hexagon::M2_macsin);
+      if (V < 0)
+        V = -V;
+      const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+      unsigned NewR = MRI->createVirtualRegister(RC);
+      const MachineOperand &Src1 = MI.getOperand(1);
+      NewMI = BuildMI(B, At, DL, D, NewR)
+                .addReg(Src1.getReg(), getRegState(Src1), Src1.getSubReg())
+                .addReg(OpR2.getReg(), getRegState(OpR2), OpR2.getSubReg())
+                .addImm(V);
+      replaceAllRegUsesWith(DefR.Reg, NewR);
+      Changed = true;
+      break;
+    }
+
+    case Hexagon::A2_and:
+    {
+      Register R1(MI.getOperand(1));
+      Register R2(MI.getOperand(2));
+      assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+      LatticeCell LS1, LS2;
+      unsigned CopyOf = 0;
+      // Check if any of the operands is -1 (i.e. all bits set).
+      if (getCell(R1, Inputs, LS1) && LS1.isSingle()) {
+        APInt M1;
+        if (constToInt(LS1.Value, M1) && !~M1)
+          CopyOf = 2;
+      }
+      else if (getCell(R2, Inputs, LS2) && LS2.isSingle()) {
+        APInt M1;
+        if (constToInt(LS2.Value, M1) && !~M1)
+          CopyOf = 1;
+      }
+      if (!CopyOf)
+        return false;
+      MachineOperand &SO = MI.getOperand(CopyOf);
+      Register SR(SO);
+      Register DefR(MI.getOperand(0));
+      unsigned NewR = SR.Reg;
+      if (SR.SubReg) {
+        const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+        NewR = MRI->createVirtualRegister(RC);
+        NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+                  .addReg(SR.Reg, getRegState(SO), SR.SubReg);
+      }
+      replaceAllRegUsesWith(DefR.Reg, NewR);
+      MRI->clearKillFlags(NewR);
+      Changed = true;
+    }
+    break;
+
+    case Hexagon::A2_or:
+    {
+      Register R1(MI.getOperand(1));
+      Register R2(MI.getOperand(2));
+      assert(Inputs.has(R1.Reg) && Inputs.has(R2.Reg));
+      LatticeCell LS1, LS2;
+      unsigned CopyOf = 0;
+      typedef ConstantProperties P;
+      if (getCell(R1, Inputs, LS1) && (LS1.properties() & P::Zero))
+        CopyOf = 2;
+      else if (getCell(R2, Inputs, LS2) && (LS2.properties() & P::Zero))
+        CopyOf = 1;
+      if (!CopyOf)
+        return false;
+      MachineOperand &SO = MI.getOperand(CopyOf);
+      Register SR(SO);
+      Register DefR(MI.getOperand(0));
+      unsigned NewR = SR.Reg;
+      if (SR.SubReg) {
+        const TargetRegisterClass *RC = MRI->getRegClass(DefR.Reg);
+        NewR = MRI->createVirtualRegister(RC);
+        NewMI = BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR)
+                  .addReg(SR.Reg, getRegState(SO), SR.SubReg);
+      }
+      replaceAllRegUsesWith(DefR.Reg, NewR);
+      MRI->clearKillFlags(NewR);
+      Changed = true;
+    }
+    break;
+  }
+
+  if (NewMI) {
+    // clear all the kill flags of this new instruction.
+    for (MachineOperand &MO : NewMI->operands())
+      if (MO.isReg() && MO.isUse())
+        MO.setIsKill(false);
+  }
+
+  DEBUG({
+    if (NewMI) {
+      dbgs() << "Rewrite: for " << MI;
+      if (NewMI != &MI)
+        dbgs() << "  created " << *NewMI;
+      else
+        dbgs() << "  modified the instruction itself and created:" << *NewMI;
+    }
+  });
+
+  return Changed;
+}
+
+void HexagonConstEvaluator::replaceAllRegUsesWith(unsigned FromReg,
+      unsigned ToReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(FromReg));
+  assert(TargetRegisterInfo::isVirtualRegister(ToReg));
+  for (auto I = MRI->use_begin(FromReg), E = MRI->use_end(); I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+    O.setReg(ToReg);
+  }
+}
+
+bool HexagonConstEvaluator::rewriteHexBranch(MachineInstr &BrI,
+      const CellMap &Inputs) {
+  MachineBasicBlock &B = *BrI.getParent();
+  unsigned NumOp = BrI.getNumOperands();
+  if (!NumOp)
+    return false;
+
+  bool FallsThru;
+  SetVector<const MachineBasicBlock*> Targets;
+  bool Eval = evaluate(BrI, Inputs, Targets, FallsThru);
+  unsigned NumTargets = Targets.size();
+  if (!Eval || NumTargets > 1 || (NumTargets == 1 && FallsThru))
+    return false;
+  if (BrI.getOpcode() == Hexagon::J2_jump)
+    return false;
+
+  DEBUG(dbgs() << "Rewrite(BB#" << B.getNumber() << "):" << BrI);
+  bool Rewritten = false;
+  if (NumTargets > 0) {
+    assert(!FallsThru && "This should have been checked before");
+    // MIB.addMBB needs non-const pointer.
+    MachineBasicBlock *TargetB = const_cast<MachineBasicBlock*>(Targets[0]);
+    bool Moot = B.isLayoutSuccessor(TargetB);
+    if (!Moot) {
+      // If we build a branch here, we must make sure that it won't be
+      // erased as "non-executable". We can't mark any new instructions
+      // as executable here, so we need to overwrite the BrI, which we
+      // know is executable.
+      const MCInstrDesc &JD = HII.get(Hexagon::J2_jump);
+      auto NI = BuildMI(B, BrI.getIterator(), BrI.getDebugLoc(), JD)
+                  .addMBB(TargetB);
+      BrI.setDesc(JD);
+      while (BrI.getNumOperands() > 0)
+        BrI.RemoveOperand(0);
+      // This ensures that all implicit operands (e.g. %R31<imp-def>, etc)
+      // are present in the rewritten branch.
+      for (auto &Op : NI->operands())
+        BrI.addOperand(Op);
+      NI->eraseFromParent();
+      Rewritten = true;
+    }
+  }
+
+  // Do not erase instructions. A newly created instruction could get
+  // the same address as an instruction marked as executable during the
+  // propagation.
+  if (!Rewritten)
+    replaceWithNop(BrI);
+  return true;
+}
+
+FunctionPass *llvm::createHexagonConstPropagationPass() {
+  return new HexagonConstPropagation();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
new file mode 100644
index 000000000000..36080997ec6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -0,0 +1,887 @@
+//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces transfer instructions by combine instructions.
+// We walk along a basic block and look for two combinable instructions and try
+// to move them together. If we can move them next to each other we do so and
+// replace them with a combine instruction.
+//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-copy-combine"
+
+static
+cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
+                                 cl::Hidden, cl::ZeroOrMore,
+                                 cl::init(false),
+                                 cl::desc("Disable merging into combines"));
+static
+cl::opt<bool> IsConst64Disabled("disable-const64",
+                                 cl::Hidden, cl::ZeroOrMore,
+                                 cl::init(false),
+                                 cl::desc("Disable generation of const64"));
+static
+cl::opt<unsigned>
+MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
+                   cl::Hidden, cl::init(4),
+                   cl::desc("Maximum distance between a tfr feeding a store we "
+                            "consider the store still to be newifiable"));
+
+namespace llvm {
+  FunctionPass *createHexagonCopyToCombine();
+  void initializeHexagonCopyToCombinePass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCopyToCombine : public MachineFunctionPass  {
+  const HexagonInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const HexagonSubtarget *ST;
+  bool ShouldCombineAggressively;
+
+  DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+  SmallVector<MachineInstr *, 8> DbgMItoMove;
+
+public:
+  static char ID;
+
+  HexagonCopyToCombine() : MachineFunctionPass(ID) {
+    initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "Hexagon Copy-To-Combine Pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  MachineInstr *findPairable(MachineInstr &I1, bool &DoInsertAtI1,
+                             bool AllowC64);
+
+  void findPotentialNewifiableTFRs(MachineBasicBlock &);
+
+  void combine(MachineInstr &I1, MachineInstr &I2,
+               MachineBasicBlock::iterator &MI, bool DoInsertAtI1,
+               bool OptForSize);
+
+  bool isSafeToMoveTogether(MachineInstr &I1, MachineInstr &I2,
+                            unsigned I1DestReg, unsigned I2DestReg,
+                            bool &DoInsertAtI1);
+
+  void emitCombineRR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineRI(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineIR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitConst64(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                   MachineOperand &HiOperand, MachineOperand &LoOperand);
+};
+
+} // End anonymous namespace.
+
+char HexagonCopyToCombine::ID = 0;
+
+INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
+                "Hexagon Copy-To-Combine Pass", false, false)
+
+static bool isCombinableInstType(MachineInstr &MI, const HexagonInstrInfo *TII,
+                                 bool ShouldCombineAggressively) {
+  switch (MI.getOpcode()) {
+  case Hexagon::A2_tfr: {
+    // A COPY instruction can be combined if its arguments are IntRegs (32bit).
+    const MachineOperand &Op0 = MI.getOperand(0);
+    const MachineOperand &Op1 = MI.getOperand(1);
+    assert(Op0.isReg() && Op1.isReg());
+
+    unsigned DestReg = Op0.getReg();
+    unsigned SrcReg = Op1.getReg();
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+           Hexagon::IntRegsRegClass.contains(SrcReg);
+  }
+
+  case Hexagon::A2_tfrsi: {
+    // A transfer-immediate can be combined if its argument is a signed 8bit
+    // value.
+    const MachineOperand &Op0 = MI.getOperand(0);
+    const MachineOperand &Op1 = MI.getOperand(1);
+    assert(Op0.isReg());
+
+    unsigned DestReg = Op0.getReg();
+    // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
+    // workaround for an ABI bug that prevents GOT relocations on combine
+    // instructions
+    if (!Op1.isImm() && Op1.getTargetFlags() != HexagonII::MO_NO_FLAG)
+      return false;
+
+    // Only combine constant extended A2_tfrsi if we are in aggressive mode.
+    bool NotExt = Op1.isImm() && isInt<8>(Op1.getImm());
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+           (ShouldCombineAggressively || NotExt);
+  }
+
+  case Hexagon::V6_vassign:
+  case Hexagon::V6_vassign_128B:
+    return true;
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
+template <unsigned N> static bool isGreaterThanNBitTFRI(const MachineInstr &I) {
+  if (I.getOpcode() == Hexagon::TFRI64_V4 ||
+      I.getOpcode() == Hexagon::A2_tfrsi) {
+    const MachineOperand &Op = I.getOperand(1);
+    return !Op.isImm() || !isInt<N>(Op.getImm());
+  }
+  return false;
+}
+
+/// areCombinableOperations - Returns true if the two instruction can be merge
+/// into a combine (ignoring register constraints).
+static bool areCombinableOperations(const TargetRegisterInfo *TRI,
+                                    MachineInstr &HighRegInst,
+                                    MachineInstr &LowRegInst, bool AllowC64) {
+  unsigned HiOpc = HighRegInst.getOpcode();
+  unsigned LoOpc = LowRegInst.getOpcode();
+
+  auto verifyOpc = [](unsigned Opc) -> void {
+    switch (Opc) {
+      case Hexagon::A2_tfr:
+      case Hexagon::A2_tfrsi:
+      case Hexagon::V6_vassign:
+        break;
+      default:
+        llvm_unreachable("Unexpected opcode");
+    }
+  };
+  verifyOpc(HiOpc);
+  verifyOpc(LoOpc);
+
+  if (HiOpc == Hexagon::V6_vassign || LoOpc == Hexagon::V6_vassign)
+    return HiOpc == LoOpc;
+
+  if (!AllowC64) {
+    // There is no combine of two constant extended values.
+    if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
+        isGreaterThanNBitTFRI<6>(LowRegInst))
+      return false;
+  }
+
+  // There is a combine of two constant extended values into CONST64,
+  // provided both constants are true immediates.
+  if (isGreaterThanNBitTFRI<16>(HighRegInst) &&
+      isGreaterThanNBitTFRI<16>(LowRegInst))
+    return (HighRegInst.getOperand(1).isImm() &&
+            LowRegInst.getOperand(1).isImm());
+
+  // There is no combine of two constant extended values, unless handled above
+  // Make both 8-bit size checks to allow both combine (#,##) and combine(##,#)
+  if (isGreaterThanNBitTFRI<8>(HighRegInst) &&
+      isGreaterThanNBitTFRI<8>(LowRegInst))
+    return false;
+
+  return true;
+}
+
+static bool isEvenReg(unsigned Reg) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  if (Hexagon::IntRegsRegClass.contains(Reg))
+    return (Reg - Hexagon::R0) % 2 == 0;
+  if (Hexagon::VectorRegsRegClass.contains(Reg) ||
+      Hexagon::VectorRegs128BRegClass.contains(Reg))
+    return (Reg - Hexagon::V0) % 2 == 0;
+  llvm_unreachable("Invalid register");
+}
+
+static void removeKillInfo(MachineInstr &MI, unsigned RegNotKilled) {
+  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+    MachineOperand &Op = MI.getOperand(I);
+    if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill())
+      continue;
+    Op.setIsKill(false);
+  }
+}
+
+/// Returns true if it is unsafe to move a copy instruction from \p UseReg to
+/// \p DestReg over the instruction \p MI.
+static bool isUnsafeToMoveAcross(MachineInstr &MI, unsigned UseReg,
+                                 unsigned DestReg,
+                                 const TargetRegisterInfo *TRI) {
+  return (UseReg && (MI.modifiesRegister(UseReg, TRI))) ||
+         MI.modifiesRegister(DestReg, TRI) || MI.readsRegister(DestReg, TRI) ||
+         MI.hasUnmodeledSideEffects() || MI.isInlineAsm() || MI.isDebugValue();
+}
+
+static unsigned UseReg(const MachineOperand& MO) {
+  return MO.isReg() ? MO.getReg() : 0;
+}
+
+/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
+/// that the two instructions can be paired in a combine.
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
+                                                MachineInstr &I2,
+                                                unsigned I1DestReg,
+                                                unsigned I2DestReg,
+                                                bool &DoInsertAtI1) {
+  unsigned I2UseReg = UseReg(I2.getOperand(1));
+
+  // It is not safe to move I1 and I2 into one combine if I2 has a true
+  // dependence on I1.
+  if (I2UseReg && I1.modifiesRegister(I2UseReg, TRI))
+    return false;
+
+  bool isSafe = true;
+
+  // First try to move I2 towards I1.
+  {
+    // A reverse_iterator instantiated like below starts before I2, and I1
+    // respectively.
+    // Look at instructions I in between I2 and (excluding) I1.
+    MachineBasicBlock::reverse_iterator I(I2),
+      End = --(MachineBasicBlock::reverse_iterator(I1));
+    // At 03 we got better results (dhrystone!) by being more conservative.
+    if (!ShouldCombineAggressively)
+      End = MachineBasicBlock::reverse_iterator(I1);
+    // If I2 kills its operand and we move I2 over an instruction that also
+    // uses I2's use reg we need to modify that (first) instruction to now kill
+    // this reg.
+    unsigned KilledOperand = 0;
+    if (I2.killsRegister(I2UseReg))
+      KilledOperand = I2UseReg;
+    MachineInstr *KillingInstr = nullptr;
+
+    for (; I != End; ++I) {
+      // If the intervening instruction I:
+      //   * modifies I2's use reg
+      //   * modifies I2's def reg
+      //   * reads I2's def reg
+      //   * or has unmodelled side effects
+      // we can't move I2 across it.
+      if (I->isDebugValue())
+        continue;
+
+      if (isUnsafeToMoveAcross(*I, I2UseReg, I2DestReg, TRI)) {
+        isSafe = false;
+        break;
+      }
+
+      // Update first use of the killed operand.
+      if (!KillingInstr && KilledOperand &&
+          I->readsRegister(KilledOperand, TRI))
+        KillingInstr = &*I;
+    }
+    if (isSafe) {
+      // Update the intermediate instruction to with the kill flag.
+      if (KillingInstr) {
+        bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
+        (void)Added; // suppress compiler warning
+        assert(Added && "Must successfully update kill flag");
+        removeKillInfo(I2, KilledOperand);
+      }
+      DoInsertAtI1 = true;
+      return true;
+    }
+  }
+
+  // Try to move I1 towards I2.
+  {
+    // Look at instructions I in between I1 and (excluding) I2.
+    MachineBasicBlock::iterator I(I1), End(I2);
+    // At O3 we got better results (dhrystone) by being more conservative here.
+    if (!ShouldCombineAggressively)
+      End = std::next(MachineBasicBlock::iterator(I2));
+    unsigned I1UseReg = UseReg(I1.getOperand(1));
+    // Track killed operands. If we move across an instruction that kills our
+    // operand, we need to update the kill information on the moved I1. It kills
+    // the operand now.
+    MachineInstr *KillingInstr = nullptr;
+    unsigned KilledOperand = 0;
+
+    while(++I != End) {
+      MachineInstr &MI = *I;
+      // If the intervening instruction MI:
+      //   * modifies I1's use reg
+      //   * modifies I1's def reg
+      //   * reads I1's def reg
+      //   * or has unmodelled side effects
+      //   We introduce this special case because llvm has no api to remove a
+      //   kill flag for a register (a removeRegisterKilled() analogous to
+      //   addRegisterKilled) that handles aliased register correctly.
+      //   * or has a killed aliased register use of I1's use reg
+      //           %D4<def> = A2_tfrpi 16
+      //           %R6<def> = A2_tfr %R9
+      //           %R8<def> = KILL %R8, %D4<imp-use,kill>
+      //      If we want to move R6 = across the KILL instruction we would have
+      //      to remove the %D4<imp-use,kill> operand. For now, we are
+      //      conservative and disallow the move.
+      // we can't move I1 across it.
+      if (MI.isDebugValue()) {
+        if (MI.readsRegister(I1DestReg, TRI)) // Move this instruction after I2.
+          DbgMItoMove.push_back(&MI);
+        continue;
+      }
+
+      if (isUnsafeToMoveAcross(MI, I1UseReg, I1DestReg, TRI) ||
+          // Check for an aliased register kill. Bail out if we see one.
+          (!MI.killsRegister(I1UseReg) && MI.killsRegister(I1UseReg, TRI)))
+        return false;
+
+      // Check for an exact kill (registers match).
+      if (I1UseReg && MI.killsRegister(I1UseReg)) {
+        assert(!KillingInstr && "Should only see one killing instruction");
+        KilledOperand = I1UseReg;
+        KillingInstr = &MI;
+      }
+    }
+    if (KillingInstr) {
+      removeKillInfo(*KillingInstr, KilledOperand);
+      // Update I1 to set the kill flag. This flag will later be picked up by
+      // the new COMBINE instruction.
+      bool Added = I1.addRegisterKilled(KilledOperand, TRI);
+      (void)Added; // suppress compiler warning
+      assert(Added && "Must successfully update kill flag");
+    }
+    DoInsertAtI1 = false;
+  }
+
+  return true;
+}
+
+/// findPotentialNewifiableTFRs - Finds tranfers that feed stores that could be
+/// newified. (A use of a 64 bit register define can not be newified)
+void
+HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
+  DenseMap<unsigned, MachineInstr *> LastDef;
+  for (MachineInstr &MI : BB) {
+    if (MI.isDebugValue())
+      continue;
+
+    // Mark TFRs that feed a potential new value store as such.
+    if (TII->mayBeNewStore(MI)) {
+      // Look for uses of TFR instructions.
+      for (unsigned OpdIdx = 0, OpdE = MI.getNumOperands(); OpdIdx != OpdE;
+           ++OpdIdx) {
+        MachineOperand &Op = MI.getOperand(OpdIdx);
+
+        // Skip over anything except register uses.
+        if (!Op.isReg() || !Op.isUse() || !Op.getReg())
+          continue;
+
+        // Look for the defining instruction.
+        unsigned Reg = Op.getReg();
+        MachineInstr *DefInst = LastDef[Reg];
+        if (!DefInst)
+          continue;
+        if (!isCombinableInstType(*DefInst, TII, ShouldCombineAggressively))
+          continue;
+
+        // Only close newifiable stores should influence the decision.
+        // Ignore the debug instructions in between.
+        MachineBasicBlock::iterator It(DefInst);
+        unsigned NumInstsToDef = 0;
+        while (&*It != &MI) {
+          if (!It->isDebugValue())
+            ++NumInstsToDef;
+          ++It;
+        }
+
+        if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
+          continue;
+
+        PotentiallyNewifiableTFR.insert(DefInst);
+      }
+      // Skip to next instruction.
+      continue;
+    }
+
+    // Put instructions that last defined integer or double registers into the
+    // map.
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg() || !Op.isDef() || !Op.getReg())
+        continue;
+      unsigned Reg = Op.getReg();
+      if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+          LastDef[*SubRegs] = &MI;
+        }
+      } else if (Hexagon::IntRegsRegClass.contains(Reg))
+        LastDef[Reg] = &MI;
+    }
+  }
+}
+
+bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+
+  if (IsCombinesDisabled) return false;
+
+  bool HasChanged = false;
+
+  // Get target info.
+  ST = &MF.getSubtarget<HexagonSubtarget>();
+  TRI = ST->getRegisterInfo();
+  TII = ST->getInstrInfo();
+
+  const Function *F = MF.getFunction();
+  bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
+
+  // Combine aggressively (for code size)
+  ShouldCombineAggressively =
+    MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+
+  // Traverse basic blocks.
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    PotentiallyNewifiableTFR.clear();
+    findPotentialNewifiableTFRs(*BI);
+
+    // Traverse instructions in basic block.
+    for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
+        MI != End;) {
+      MachineInstr &I1 = *MI++;
+
+      if (I1.isDebugValue())
+        continue;
+
+      // Don't combine a TFR whose user could be newified (instructions that
+      // define double registers can not be newified - Programmer's Ref Manual
+      // 5.4.2 New-value stores).
+      if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&I1))
+        continue;
+
+      // Ignore instructions that are not combinable.
+      if (!isCombinableInstType(I1, TII, ShouldCombineAggressively))
+        continue;
+
+      // Find a second instruction that can be merged into a combine
+      // instruction. In addition, also find all the debug instructions that
+      // need to be moved along with it.
+      bool DoInsertAtI1 = false;
+      DbgMItoMove.clear();
+      MachineInstr *I2 = findPairable(I1, DoInsertAtI1, OptForSize);
+      if (I2) {
+        HasChanged = true;
+        combine(I1, *I2, MI, DoInsertAtI1, OptForSize);
+      }
+    }
+  }
+
+  return HasChanged;
+}
+
+/// findPairable - Returns an instruction that can be merged with \p I1 into a
+/// COMBINE instruction or 0 if no such instruction can be found. Returns true
+/// in \p DoInsertAtI1 if the combine must be inserted at instruction \p I1
+/// false if the combine must be inserted at the returned instruction.
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
+                                                 bool &DoInsertAtI1,
+                                                 bool AllowC64) {
+  MachineBasicBlock::iterator I2 = std::next(MachineBasicBlock::iterator(I1));
+  while (I2 != I1.getParent()->end() && I2->isDebugValue())
+    ++I2;
+
+  unsigned I1DestReg = I1.getOperand(0).getReg();
+
+  for (MachineBasicBlock::iterator End = I1.getParent()->end(); I2 != End;
+       ++I2) {
+    // Bail out early if we see a second definition of I1DestReg.
+    if (I2->modifiesRegister(I1DestReg, TRI))
+      break;
+
+    // Ignore non-combinable instructions.
+    if (!isCombinableInstType(*I2, TII, ShouldCombineAggressively))
+      continue;
+
+    // Don't combine a TFR whose user could be newified.
+    if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(&*I2))
+      continue;
+
+    unsigned I2DestReg = I2->getOperand(0).getReg();
+
+    // Check that registers are adjacent and that the first destination register
+    // is even.
+    bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
+    bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
+    unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+    if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
+      continue;
+
+    // Check that the two instructions are combinable. V4 allows more
+    // instructions to be merged into a combine.
+    // The order matters because in a A2_tfrsi we might can encode a int8 as
+    // the hi reg operand but only a uint6 as the low reg operand.
+    if ((IsI2LowReg && !areCombinableOperations(TRI, I1, *I2, AllowC64)) ||
+        (IsI1LowReg && !areCombinableOperations(TRI, *I2, I1, AllowC64)))
+      break;
+
+    if (isSafeToMoveTogether(I1, *I2, I1DestReg, I2DestReg, DoInsertAtI1))
+      return &*I2;
+
+    // Not safe. Stop searching.
+    break;
+  }
+  return nullptr;
+}
+
+void HexagonCopyToCombine::combine(MachineInstr &I1, MachineInstr &I2,
+                                   MachineBasicBlock::iterator &MI,
+                                   bool DoInsertAtI1, bool OptForSize) {
+  // We are going to delete I2. If MI points to I2 advance it to the next
+  // instruction.
+  if (MI == I2.getIterator())
+    ++MI;
+
+  // Figure out whether I1 or I2 goes into the lowreg part.
+  unsigned I1DestReg = I1.getOperand(0).getReg();
+  unsigned I2DestReg = I2.getOperand(0).getReg();
+  bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
+  unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
+  unsigned SubLo;
+
+  const TargetRegisterClass *SuperRC = nullptr;
+  if (Hexagon::IntRegsRegClass.contains(LoRegDef)) {
+    SuperRC = &Hexagon::DoubleRegsRegClass;
+    SubLo = Hexagon::isub_lo;
+  } else if (Hexagon::VectorRegsRegClass.contains(LoRegDef)) {
+    assert(ST->useHVXOps());
+    if (ST->useHVXSglOps())
+      SuperRC = &Hexagon::VecDblRegsRegClass;
+    else
+      SuperRC = &Hexagon::VecDblRegs128BRegClass;
+    SubLo = Hexagon::vsub_lo;
+  } else
+    llvm_unreachable("Unexpected register class");
+
+  // Get the double word register.
+  unsigned DoubleRegDest = TRI->getMatchingSuperReg(LoRegDef, SubLo, SuperRC);
+  assert(DoubleRegDest != 0 && "Expect a valid register");
+
+  // Setup source operands.
+  MachineOperand &LoOperand = IsI1Loreg ? I1.getOperand(1) : I2.getOperand(1);
+  MachineOperand &HiOperand = IsI1Loreg ? I2.getOperand(1) : I1.getOperand(1);
+
+  // Figure out which source is a register and which a constant.
+  bool IsHiReg = HiOperand.isReg();
+  bool IsLoReg = LoOperand.isReg();
+
+  // There is a combine of two constant extended values into CONST64.
+  bool IsC64 = OptForSize && LoOperand.isImm() && HiOperand.isImm() &&
+               isGreaterThanNBitTFRI<16>(I1) && isGreaterThanNBitTFRI<16>(I2);
+
+  MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
+  // Emit combine.
+  if (IsHiReg && IsLoReg)
+    emitCombineRR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsHiReg)
+    emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsLoReg)
+    emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsC64 && !IsConst64Disabled)
+    emitConst64(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else
+    emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+
+  // Move debug instructions along with I1 if it's being
+  // moved towards I2.
+  if (!DoInsertAtI1 && DbgMItoMove.size() != 0) {
+    // Insert debug instructions at the new location before I2.
+    MachineBasicBlock *BB = InsertPt->getParent();
+    for (auto NewMI : DbgMItoMove) {
+      // If iterator MI is pointing to DEBUG_VAL, make sure
+      // MI now points to next relevant instruction.
+      if (NewMI == MI)
+        ++MI;
+      BB->splice(InsertPt, BB, NewMI);
+    }
+  }
+
+  I1.eraseFromParent();
+  I2.eraseFromParent();
+}
+
+void HexagonCopyToCombine::emitConst64(MachineBasicBlock::iterator &InsertPt,
+                                       unsigned DoubleDestReg,
+                                       MachineOperand &HiOperand,
+                                       MachineOperand &LoOperand) {
+  DEBUG(dbgs() << "Found a CONST64\n");
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+  assert(LoOperand.isImm() && HiOperand.isImm() &&
+         "Both operands must be immediate");
+
+  int64_t V = HiOperand.getImm();
+  V = (V << 32) | (0x0ffffffffLL & LoOperand.getImm());
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::CONST64), DoubleDestReg)
+    .addImm(V);
+}
+
+void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle globals.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Handle block addresses.
+  if (HiOperand.isBlockAddress()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+      .addBlockAddress(HiOperand.getBlockAddress(), HiOperand.getOffset(),
+                       HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isBlockAddress()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addBlockAddress(LoOperand.getBlockAddress(), LoOperand.getOffset(),
+                       LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Handle jump tables.
+  if (HiOperand.isJTI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+      .addJumpTableIndex(HiOperand.getIndex(), HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isJTI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addJumpTableIndex(LoOperand.getIndex(), LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Handle constant pools.
+  if (HiOperand.isCPI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+      .addConstantPoolIndex(HiOperand.getIndex(), HiOperand.getOffset(),
+                            HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isCPI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addConstantPoolIndex(LoOperand.getIndex(), LoOperand.getOffset(),
+                            LoOperand.getTargetFlags());
+    return;
+  }
+
+  // First preference should be given to Hexagon::A2_combineii instruction
+  // as it can include U6 (in Hexagon::A4_combineii) as well.
+  // In this instruction, HiOperand is const extended, if required.
+  if (isInt<8>(LoOperand.getImm())) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+      return;
+  }
+
+  // In this instruction, LoOperand is const extended, if required.
+  if (isInt<8>(HiOperand.getImm())) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A2_combineii), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoReg = LoOperand.getReg();
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle globals.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Handle block addresses.
+  if (HiOperand.isBlockAddress()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
+      .addBlockAddress(HiOperand.getBlockAddress(), HiOperand.getOffset(),
+                       HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Handle jump tables.
+  if (HiOperand.isJTI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
+      .addJumpTableIndex(HiOperand.getIndex(), HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Handle constant pools.
+  if (HiOperand.isCPI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
+      .addConstantPoolIndex(HiOperand.getIndex(), HiOperand.getOffset(),
+                            HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, LoReg
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineir), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle global.
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
+      .addReg(HiReg, HiRegKillFlag)
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+  // Handle block addresses.
+  if (LoOperand.isBlockAddress()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
+      .addReg(HiReg, HiRegKillFlag)
+      .addBlockAddress(LoOperand.getBlockAddress(), LoOperand.getOffset(),
+                       LoOperand.getTargetFlags());
+    return;
+  }
+  // Handle jump tables.
+  if (LoOperand.isJTI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
+      .addReg(HiOperand.getReg(), HiRegKillFlag)
+      .addJumpTableIndex(LoOperand.getIndex(), LoOperand.getTargetFlags());
+    return;
+  }
+  // Handle constant pools.
+  if (LoOperand.isCPI()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
+      .addReg(HiOperand.getReg(), HiRegKillFlag)
+      .addConstantPoolIndex(LoOperand.getIndex(), LoOperand.getOffset(),
+                            LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::A4_combineri), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned LoReg = LoOperand.getReg();
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, LoReg
+  unsigned NewOpc;
+  if (Hexagon::DoubleRegsRegClass.contains(DoubleDestReg)) {
+    NewOpc = Hexagon::A2_combinew;
+  } else if (Hexagon::VecDblRegsRegClass.contains(DoubleDestReg)) {
+    assert(ST->useHVXOps());
+    if (ST->useHVXSglOps())
+      NewOpc = Hexagon::V6_vcombine;
+    else
+      NewOpc = Hexagon::V6_vcombine_128B;
+  } else
+    llvm_unreachable("Unexpected register");
+
+  BuildMI(*BB, InsertPt, DL, TII->get(NewOpc), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+FunctionPass *llvm::createHexagonCopyToCombine() {
+  return new HexagonCopyToCombine();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
new file mode 100644
index 000000000000..a5351cd08da5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -0,0 +1,1034 @@
+//===--- HexagonEarlyIfConv.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a Hexagon-specific if-conversion pass that runs on the
+// SSA form.
+// In SSA it is not straightforward to represent instructions that condi-
+// tionally define registers, since a conditionally-defined register may
+// only be used under the same condition on which the definition was based.
+// To avoid complications of this nature, this patch will only generate
+// predicated stores, and speculate other instructions from the "if-conver-
+// ted" block.
+// The code will recognize CFG patterns where a block with a conditional
+// branch "splits" into a "true block" and a "false block". Either of these
+// could be omitted (in case of a triangle, for example).
+// If after conversion of the side block(s) the CFG allows it, the resul-
+// ting blocks may be merged. If the "join" block contained PHI nodes, they
+// will be replaced with MUX (or MUX-like) instructions to maintain the
+// semantics of the PHI.
+//
+// Example:
+//
+//         %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+//         %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+//         J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead>
+//         J2_jump <BB#4>, %PC<imp-def,dead>
+//     Successors according to CFG: BB#4(62) BB#5(62)
+//
+// BB#4: derived from LLVM BB %if.then
+//     Predecessors according to CFG: BB#3
+//         %vreg11<def> = A2_addp %vreg6, %vreg10
+//         S2_storerd_io %vreg32, 16, %vreg11
+//     Successors according to CFG: BB#5
+//
+// BB#5: derived from LLVM BB %if.end
+//     Predecessors according to CFG: BB#3 BB#4
+//         %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4>
+//         %vreg13<def> = A2_addp %vreg7, %vreg12
+//         %vreg42<def> = C2_cmpeqi %vreg9, 10
+//         J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+//         J2_jump <BB#6>, %PC<imp-def,dead>
+//     Successors according to CFG: BB#6(4) BB#3(124)
+//
+// would become:
+//
+//         %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1
+//         %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0
+// spec->  %vreg11<def> = A2_addp %vreg6, %vreg10
+// pred->  S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11
+//         %vreg46<def> = PS_pselect %vreg41, %vreg6, %vreg11
+//         %vreg13<def> = A2_addp %vreg7, %vreg46
+//         %vreg42<def> = C2_cmpeqi %vreg9, 10
+//         J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead>
+//         J2_jump <BB#6>, %PC<imp-def,dead>
+//     Successors according to CFG: BB#6 BB#3
+
+#define DEBUG_TYPE "hexagon-eif"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+
+using namespace llvm;
+
+namespace llvm {
+
+  FunctionPass *createHexagonEarlyIfConversion();
+  void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry);
+
+} // end namespace llvm
+
+namespace {
+
+  cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden,
+    cl::init(false), cl::desc("Enable branch probability info"));
+  cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
+    cl::desc("Size limit in Hexagon early if-conversion"));
+
+  struct PrintMB {
+    PrintMB(const MachineBasicBlock *B) : MB(B) {}
+    const MachineBasicBlock *MB;
+  };
+  raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) {
+    if (!P.MB)
+      return OS << "<none>";
+    return OS << '#' << P.MB->getNumber();
+  }
+
+  struct FlowPattern {
+    FlowPattern() = default;
+    FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB,
+          MachineBasicBlock *FB, MachineBasicBlock *JB)
+      : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {}
+
+    MachineBasicBlock *SplitB = nullptr;
+    MachineBasicBlock *TrueB = nullptr;
+    MachineBasicBlock *FalseB = nullptr;
+    MachineBasicBlock *JoinB = nullptr;
+    unsigned PredR = 0;
+  };
+
+  struct PrintFP {
+    PrintFP(const FlowPattern &P, const TargetRegisterInfo &T)
+      : FP(P), TRI(T) {}
+
+    const FlowPattern &FP;
+    const TargetRegisterInfo &TRI;
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P);
+  };
+  raw_ostream &operator<<(raw_ostream &OS,
+                          const PrintFP &P) LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
+    OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
+       << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI)
+       << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:"
+       << PrintMB(P.FP.FalseB)
+       << ", JoinB:" << PrintMB(P.FP.JoinB) << " }";
+    return OS;
+  }
+
+  class HexagonEarlyIfConversion : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonEarlyIfConversion() : MachineFunctionPass(ID),
+        HII(nullptr), TRI(nullptr), MFN(nullptr), MRI(nullptr), MDT(nullptr),
+        MLI(nullptr) {
+      initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon early if conversion";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    typedef DenseSet<MachineBasicBlock*> BlockSetType;
+
+    bool isPreheader(const MachineBasicBlock *B) const;
+    bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L,
+          FlowPattern &FP);
+    bool visitBlock(MachineBasicBlock *B, MachineLoop *L);
+    bool visitLoop(MachineLoop *L);
+
+    bool hasEHLabel(const MachineBasicBlock *B) const;
+    bool hasUncondBranch(const MachineBasicBlock *B) const;
+    bool isValidCandidate(const MachineBasicBlock *B) const;
+    bool usesUndefVReg(const MachineInstr *MI) const;
+    bool isValid(const FlowPattern &FP) const;
+    unsigned countPredicateDefs(const MachineBasicBlock *B) const;
+    unsigned computePhiCost(MachineBasicBlock *B) const;
+    bool isProfitable(const FlowPattern &FP) const;
+    bool isPredicableStore(const MachineInstr *MI) const;
+    bool isSafeToSpeculate(const MachineInstr *MI) const;
+
+    unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const;
+    void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At,
+          MachineInstr *MI, unsigned PredR, bool IfTrue);
+    void predicateBlockNB(MachineBasicBlock *ToB,
+          MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+          unsigned PredR, bool IfTrue);
+
+    void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
+    void convert(const FlowPattern &FP);
+
+    void removeBlock(MachineBasicBlock *B);
+    void eliminatePhis(MachineBasicBlock *B);
+    void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB);
+    void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB);
+    void simplifyFlowGraph(const FlowPattern &FP);
+
+    const HexagonInstrInfo *HII;
+    const TargetRegisterInfo *TRI;
+    MachineFunction *MFN;
+    MachineRegisterInfo *MRI;
+    MachineDominatorTree *MDT;
+    MachineLoopInfo *MLI;
+    BlockSetType Deleted;
+    const MachineBranchProbabilityInfo *MBPI;
+  };
+
+  char HexagonEarlyIfConversion::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif",
+  "Hexagon early if conversion", false, false)
+
+bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
+  if (B->succ_size() != 1)
+    return false;
+  MachineBasicBlock *SB = *B->succ_begin();
+  MachineLoop *L = MLI->getLoopFor(SB);
+  return L && SB == L->getHeader();
+}
+
+bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
+    MachineLoop *L, FlowPattern &FP) {
+  DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n");
+
+  // Interested only in conditional branches, no .new, no new-value, etc.
+  // Check the terminators directly, it's easier than handling all responses
+  // from AnalyzeBranch.
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  MachineBasicBlock::const_iterator T1I = B->getFirstTerminator();
+  if (T1I == B->end())
+    return false;
+  unsigned Opc = T1I->getOpcode();
+  if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf)
+    return false;
+  unsigned PredR = T1I->getOperand(0).getReg();
+
+  // Get the layout successor, or 0 if B does not have one.
+  MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B));
+  MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : nullptr;
+
+  MachineBasicBlock *T1B = T1I->getOperand(1).getMBB();
+  MachineBasicBlock::const_iterator T2I = std::next(T1I);
+  // The second terminator should be an unconditional branch.
+  assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump);
+  MachineBasicBlock *T2B = (T2I == B->end()) ? NextB
+                                             : T2I->getOperand(0).getMBB();
+  if (T1B == T2B) {
+    // XXX merge if T1B == NextB, or convert branch to unconditional.
+    // mark as diamond with both sides equal?
+    return false;
+  }
+  // Loop could be null for both.
+  if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
+    return false;
+
+  // Record the true/false blocks in such a way that "true" means "if (PredR)",
+  // and "false" means "if (!PredR)".
+  if (Opc == Hexagon::J2_jumpt)
+    TB = T1B, FB = T2B;
+  else
+    TB = T2B, FB = T1B;
+
+  if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB))
+    return false;
+
+  // Detect triangle first. In case of a triangle, one of the blocks TB/FB
+  // can fall through into the other, in other words, it will be executed
+  // in both cases. We only want to predicate the block that is executed
+  // conditionally.
+  unsigned TNP = TB->pred_size(), FNP = FB->pred_size();
+  unsigned TNS = TB->succ_size(), FNS = FB->succ_size();
+
+  // A block is predicable if it has one predecessor (it must be B), and
+  // it has a single successor. In fact, the block has to end either with
+  // an unconditional branch (which can be predicated), or with a fall-
+  // through.
+  bool TOk = (TNP == 1) && (TNS == 1);
+  bool FOk = (FNP == 1) && (FNS == 1);
+
+  // If neither is predicable, there is nothing interesting.
+  if (!TOk && !FOk)
+    return false;
+
+  MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : nullptr;
+  MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : nullptr;
+  MachineBasicBlock *JB = nullptr;
+
+  if (TOk) {
+    if (FOk) {
+      if (TSB == FSB)
+        JB = TSB;
+      // Diamond: "if (P) then TB; else FB;".
+    } else {
+      // TOk && !FOk
+      if (TSB == FB) {
+        JB = FB;
+        FB = nullptr;
+      }
+    }
+  } else {
+    // !TOk && FOk  (at least one must be true by now).
+    if (FSB == TB) {
+      JB = TB;
+      TB = nullptr;
+    }
+  }
+  // Don't try to predicate loop preheaders.
+  if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
+    DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB)
+                 << " is a loop preheader. Skipping.\n");
+    return false;
+  }
+
+  FP = FlowPattern(B, PredR, TB, FB, JB);
+  DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n");
+  return true;
+}
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that
+// contains EH_LABEL.
+bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const {
+  for (auto &I : *B)
+    if (I.isEHLabel())
+      return true;
+  return false;
+}
+
+// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize
+// that a block can never fall-through.
+bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B)
+      const {
+  MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end();
+  while (I != E) {
+    if (I->isBarrier())
+      return true;
+    ++I;
+  }
+  return false;
+}
+
+bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
+      const {
+  if (!B)
+    return true;
+  if (B->isEHPad() || B->hasAddressTaken())
+    return false;
+  if (B->succ_size() == 0)
+    return false;
+
+  for (auto &MI : *B) {
+    if (MI.isDebugValue())
+      continue;
+    if (MI.isConditionalBranch())
+      return false;
+    unsigned Opc = MI.getOpcode();
+    bool IsJMP = (Opc == Hexagon::J2_jump);
+    if (!isPredicableStore(&MI) && !IsJMP && !isSafeToSpeculate(&MI))
+      return false;
+    // Look for predicate registers defined by this instruction. It's ok
+    // to speculate such an instruction, but the predicate register cannot
+    // be used outside of this block (or else it won't be possible to
+    // update the use of it after predication). PHI uses will be updated
+    // to use a result of a MUX, and a MUX cannot be created for predicate
+    // registers.
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned R = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R))
+        continue;
+      if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass)
+        continue;
+      for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
+        if (U->getParent()->isPHI())
+          return false;
+    }
+  }
+  return true;
+}
+
+bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const {
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned R = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    const MachineInstr *DefI = MRI->getVRegDef(R);
+    // "Undefined" virtual registers are actually defined via IMPLICIT_DEF.
+    assert(DefI && "Expecting a reaching def in MRI");
+    if (DefI->isImplicitDef())
+      return true;
+  }
+  return false;
+}
+
+bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
+  if (hasEHLabel(FP.SplitB))  // KLUDGE: see function definition
+    return false;
+  if (FP.TrueB && !isValidCandidate(FP.TrueB))
+    return false;
+  if (FP.FalseB && !isValidCandidate(FP.FalseB))
+    return false;
+  // Check the PHIs in the join block. If any of them use a register
+  // that is defined as IMPLICIT_DEF, do not convert this. This can
+  // legitimately happen if one side of the split never executes, but
+  // the compiler is unable to prove it. That side may then seem to
+  // provide an "undef" value to the join block, however it will never
+  // execute at run-time. If we convert this case, the "undef" will
+  // be used in a MUX instruction, and that may seem like actually
+  // using an undefined value to other optimizations. This could lead
+  // to trouble further down the optimization stream, cause assertions
+  // to fail, etc.
+  if (FP.JoinB) {
+    const MachineBasicBlock &B = *FP.JoinB;
+    for (auto &MI : B) {
+      if (!MI.isPHI())
+        break;
+      if (usesUndefVReg(&MI))
+        return false;
+      unsigned DefR = MI.getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+      if (RC == &Hexagon::PredRegsRegClass)
+        return false;
+    }
+  }
+  return true;
+}
+
+unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
+  assert(B->pred_size() <= 2);
+  if (B->pred_size() < 2)
+    return 0;
+
+  unsigned Cost = 0;
+  MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
+  for (I = B->begin(); I != E; ++I) {
+    const MachineOperand &RO1 = I->getOperand(1);
+    const MachineOperand &RO3 = I->getOperand(3);
+    assert(RO1.isReg() && RO3.isReg());
+    // Must have a MUX if the phi uses a subregister.
+    if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+      Cost++;
+      continue;
+    }
+    MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
+    MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+    if (!HII->isPredicable(*Def1) || !HII->isPredicable(*Def3))
+      Cost++;
+  }
+  return Cost;
+}
+
+unsigned HexagonEarlyIfConversion::countPredicateDefs(
+      const MachineBasicBlock *B) const {
+  unsigned PredDefs = 0;
+  for (auto &MI : *B) {
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned R = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R))
+        continue;
+      if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass)
+        PredDefs++;
+    }
+  }
+  return PredDefs;
+}
+
+bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
+  if (FP.TrueB && FP.FalseB) {
+
+    // Do not IfCovert if the branch is one sided.
+    if (MBPI) {
+      BranchProbability Prob(9, 10);
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob)
+        return false;
+      if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob)
+        return false;
+    }
+
+    // If both sides are predicable, convert them if they join, and the
+    // join block has no other predecessors.
+    MachineBasicBlock *TSB = *FP.TrueB->succ_begin();
+    MachineBasicBlock *FSB = *FP.FalseB->succ_begin();
+    if (TSB != FSB)
+      return false;
+    if (TSB->pred_size() != 2)
+      return false;
+  }
+
+  // Calculate the total size of the predicated blocks.
+  // Assume instruction counts without branches to be the approximation of
+  // the code size. If the predicated blocks are smaller than a packet size,
+  // approximate the spare room in the packet that could be filled with the
+  // predicated/speculated instructions.
+  unsigned TS = 0, FS = 0, Spare = 0;
+  if (FP.TrueB) {
+    TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
+    if (TS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-TS;
+  }
+  if (FP.FalseB) {
+    FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
+    if (FS < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-TS;
+  }
+  unsigned TotalIn = TS+FS;
+  DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
+               << TotalIn << ", spare room: " << Spare << "\n");
+  if (TotalIn >= SizeLimit+Spare)
+    return false;
+
+  // Count the number of PHI nodes that will need to be updated (converted
+  // to MUX). Those can be later converted to predicated instructions, so
+  // they aren't always adding extra cost.
+  // KLUDGE: Also, count the number of predicate register definitions in
+  // each block. The scheduler may increase the pressure of these and cause
+  // expensive spills (e.g. bitmnp01).
+  unsigned TotalPh = 0;
+  unsigned PredDefs = countPredicateDefs(FP.SplitB);
+  if (FP.JoinB) {
+    TotalPh = computePhiCost(FP.JoinB);
+    PredDefs += countPredicateDefs(FP.JoinB);
+  } else {
+    if (FP.TrueB && FP.TrueB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.TrueB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+    if (FP.FalseB && FP.FalseB->succ_size() > 0) {
+      MachineBasicBlock *SB = *FP.FalseB->succ_begin();
+      TotalPh += computePhiCost(SB);
+      PredDefs += countPredicateDefs(SB);
+    }
+  }
+  DEBUG(dbgs() << "Total number of extra muxes from converted phis: "
+               << TotalPh << "\n");
+  if (TotalIn+TotalPh >= SizeLimit+Spare)
+    return false;
+
+  DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n");
+  if (PredDefs > 4)
+    return false;
+
+  return true;
+}
+
+bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
+      MachineLoop *L) {
+  bool Changed = false;
+
+  // Visit all dominated blocks from the same loop first, then process B.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  // We will change CFG/DT during this traversal, so take precautions to
+  // avoid problems related to invalidated iterators. In fact, processing
+  // a child C of B cannot cause another child to be removed, but it can
+  // cause a new child to be added (which was a child of C before C itself
+  // was removed. This new child C, however, would have been processed
+  // prior to processing B, so there is no need to process it again.
+  // Simply keep a list of children of B, and traverse that list.
+  typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+  DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+  for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    if (!Deleted.count(SB))
+      Changed |= visitBlock(SB, L);
+  }
+  // When walking down the dominator tree, we want to traverse through
+  // blocks from nested (other) loops, because they can dominate blocks
+  // that are in L. Skip the non-L blocks only after the tree traversal.
+  if (MLI->getLoopFor(B) != L)
+    return Changed;
+
+  FlowPattern FP;
+  if (!matchFlowPattern(B, L, FP))
+    return Changed;
+
+  if (!isValid(FP)) {
+    DEBUG(dbgs() << "Conversion is not valid\n");
+    return Changed;
+  }
+  if (!isProfitable(FP)) {
+    DEBUG(dbgs() << "Conversion is not profitable\n");
+    return Changed;
+  }
+
+  convert(FP);
+  simplifyFlowGraph(FP);
+  return true;
+}
+
+bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
+  MachineBasicBlock *HB = L ? L->getHeader() : nullptr;
+  DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB)
+           : dbgs() << "Visiting function") << "\n");
+  bool Changed = false;
+  if (L) {
+    for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+      Changed |= visitLoop(*I);
+  }
+
+  MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN);
+  Changed |= visitBlock(L ? HB : EntryB, L);
+  return Changed;
+}
+
+bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI)
+      const {
+  // HexagonInstrInfo::isPredicable will consider these stores are non-
+  // -predicable if the offset would become constant-extended after
+  // predication.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case Hexagon::S2_storerb_io:
+    case Hexagon::S2_storerbnew_io:
+    case Hexagon::S2_storerh_io:
+    case Hexagon::S2_storerhnew_io:
+    case Hexagon::S2_storeri_io:
+    case Hexagon::S2_storerinew_io:
+    case Hexagon::S2_storerd_io:
+    case Hexagon::S4_storeirb_io:
+    case Hexagon::S4_storeirh_io:
+    case Hexagon::S4_storeiri_io:
+      return true;
+  }
+
+  // TargetInstrInfo::isPredicable takes a non-const pointer.
+  return MI->mayStore() && HII->isPredicable(const_cast<MachineInstr&>(*MI));
+}
+
+bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI)
+      const {
+  if (MI->mayLoad() || MI->mayStore())
+    return false;
+  if (MI->isCall() || MI->isBarrier() || MI->isBranch())
+    return false;
+  if (MI->hasUnmodeledSideEffects())
+    return false;
+
+  return true;
+}
+
+unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc,
+      bool IfTrue) const {
+  return HII->getCondOpcode(Opc, !IfTrue);
+}
+
+void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
+      MachineBasicBlock::iterator At, MachineInstr *MI,
+      unsigned PredR, bool IfTrue) {
+  DebugLoc DL;
+  if (At != ToB->end())
+    DL = At->getDebugLoc();
+  else if (!ToB->empty())
+    DL = ToB->back().getDebugLoc();
+
+  unsigned Opc = MI->getOpcode();
+
+  if (isPredicableStore(MI)) {
+    unsigned COpc = getCondStoreOpcode(Opc, IfTrue);
+    assert(COpc);
+    MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, HII->get(COpc));
+    MachineInstr::mop_iterator MOI = MI->operands_begin();
+    if (HII->isPostIncrement(*MI)) {
+      MIB.addOperand(*MOI);
+      ++MOI;
+    }
+    MIB.addReg(PredR);
+    for (const MachineOperand &MO : make_range(MOI, MI->operands_end()))
+      MIB.addOperand(MO);
+
+    // Set memory references.
+    MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+    MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+
+    MI->eraseFromParent();
+    return;
+  }
+
+  if (Opc == Hexagon::J2_jump) {
+    MachineBasicBlock *TB = MI->getOperand(0).getMBB();
+    const MCInstrDesc &D = HII->get(IfTrue ? Hexagon::J2_jumpt
+                                           : Hexagon::J2_jumpf);
+    BuildMI(*ToB, At, DL, D)
+      .addReg(PredR)
+      .addMBB(TB);
+    MI->eraseFromParent();
+    return;
+  }
+
+  // Print the offending instruction unconditionally as we are about to
+  // abort.
+  dbgs() << *MI;
+  llvm_unreachable("Unexpected instruction");
+}
+
+// Predicate/speculate non-branch instructions from FromB into block ToB.
+// Leave the branches alone, they will be handled later. Btw, at this point
+// FromB should have at most one branch, and it should be unconditional.
+void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
+      MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
+      unsigned PredR, bool IfTrue) {
+  DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n");
+  MachineBasicBlock::iterator End = FromB->getFirstTerminator();
+  MachineBasicBlock::iterator I, NextI;
+
+  for (I = FromB->begin(); I != End; I = NextI) {
+    assert(!I->isPHI());
+    NextI = std::next(I);
+    if (isSafeToSpeculate(&*I))
+      ToB->splice(At, FromB, I);
+    else
+      predicateInstr(ToB, At, &*I, PredR, IfTrue);
+  }
+}
+
+void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
+      const FlowPattern &FP) {
+  // Visit all PHI nodes in the WhereB block and generate MUX instructions
+  // in the split block. Update the PHI nodes with the values of the MUX.
+  auto NonPHI = WhereB->getFirstNonPHI();
+  for (auto I = WhereB->begin(); I != NonPHI; ++I) {
+    MachineInstr *PN = &*I;
+    // Registers and subregisters corresponding to TrueB, FalseB and SplitB.
+    unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0;
+    for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+      const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1);
+      if (BO.getMBB() == FP.SplitB)
+        SR = RO.getReg(), SSR = RO.getSubReg();
+      else if (BO.getMBB() == FP.TrueB)
+        TR = RO.getReg(), TSR = RO.getSubReg();
+      else if (BO.getMBB() == FP.FalseB)
+        FR = RO.getReg(), FSR = RO.getSubReg();
+      else
+        continue;
+      PN->RemoveOperand(i+1);
+      PN->RemoveOperand(i);
+    }
+    if (TR == 0)
+      TR = SR, TSR = SSR;
+    else if (FR == 0)
+      FR = SR, FSR = SSR;
+    assert(TR && FR);
+
+    using namespace Hexagon;
+
+    unsigned DR = PN->getOperand(0).getReg();
+    const TargetRegisterClass *RC = MRI->getRegClass(DR);
+    unsigned Opc = 0;
+    if (RC == &IntRegsRegClass)
+      Opc = C2_mux;
+    else if (RC == &DoubleRegsRegClass)
+      Opc = PS_pselect;
+    else if (RC == &VectorRegsRegClass)
+      Opc = PS_vselect;
+    else if (RC == &VecDblRegsRegClass)
+      Opc = PS_wselect;
+    else if (RC == &VectorRegs128BRegClass)
+      Opc = PS_vselect_128B;
+    else if (RC == &VecDblRegs128BRegClass)
+      Opc = PS_wselect_128B;
+    else
+      llvm_unreachable("unexpected register type");
+    const MCInstrDesc &D = HII->get(Opc);
+
+    MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
+    DebugLoc DL;
+    if (MuxAt != FP.SplitB->end())
+      DL = MuxAt->getDebugLoc();
+    unsigned MuxR = MRI->createVirtualRegister(RC);
+    BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
+      .addReg(FP.PredR)
+      .addReg(TR, 0, TSR)
+      .addReg(FR, 0, FSR);
+
+    PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+    PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
+  }
+}
+
+void HexagonEarlyIfConversion::convert(const FlowPattern &FP) {
+  MachineBasicBlock *TSB = nullptr, *FSB = nullptr;
+  MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator();
+  assert(OldTI != FP.SplitB->end());
+  DebugLoc DL = OldTI->getDebugLoc();
+
+  if (FP.TrueB) {
+    TSB = *FP.TrueB->succ_begin();
+    predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true);
+  }
+  if (FP.FalseB) {
+    FSB = *FP.FalseB->succ_begin();
+    MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator();
+    predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false);
+  }
+
+  // Regenerate new terminators in the split block and update the successors.
+  // First, remember any information that may be needed later and remove the
+  // existing terminators/successors from the split block.
+  MachineBasicBlock *SSB = nullptr;
+  FP.SplitB->erase(OldTI, FP.SplitB->end());
+  while (FP.SplitB->succ_size() > 0) {
+    MachineBasicBlock *T = *FP.SplitB->succ_begin();
+    // It's possible that the split block had a successor that is not a pre-
+    // dicated block. This could only happen if there was only one block to
+    // be predicated. Example:
+    //   split_b:
+    //     if (p) jump true_b
+    //     jump unrelated2_b
+    //   unrelated1_b:
+    //     ...
+    //   unrelated2_b:  ; can have other predecessors, so it's not "false_b"
+    //     jump other_b
+    //   true_b:        ; only reachable from split_b, can be predicated
+    //     ...
+    //
+    // Find this successor (SSB) if it exists.
+    if (T != FP.TrueB && T != FP.FalseB) {
+      assert(!SSB);
+      SSB = T;
+    }
+    FP.SplitB->removeSuccessor(FP.SplitB->succ_begin());
+  }
+
+  // Insert new branches and update the successors of the split block. This
+  // may create unconditional branches to the layout successor, etc., but
+  // that will be cleaned up later. For now, make sure that correct code is
+  // generated.
+  if (FP.JoinB) {
+    assert(!SSB || SSB == FP.JoinB);
+    BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jump))
+      .addMBB(FP.JoinB);
+    FP.SplitB->addSuccessor(FP.JoinB);
+  } else {
+    bool HasBranch = false;
+    if (TSB) {
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jumpt))
+        .addReg(FP.PredR)
+        .addMBB(TSB);
+      FP.SplitB->addSuccessor(TSB);
+      HasBranch = true;
+    }
+    if (FSB) {
+      const MCInstrDesc &D = HasBranch ? HII->get(Hexagon::J2_jump)
+                                       : HII->get(Hexagon::J2_jumpf);
+      MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D);
+      if (!HasBranch)
+        MIB.addReg(FP.PredR);
+      MIB.addMBB(FSB);
+      FP.SplitB->addSuccessor(FSB);
+    }
+    if (SSB) {
+      // This cannot happen if both TSB and FSB are set. [TF]SB are the
+      // successor blocks of the TrueB and FalseB (or null of the TrueB
+      // or FalseB block is null). SSB is the potential successor block
+      // of the SplitB that is neither TrueB nor FalseB.
+      BuildMI(*FP.SplitB, FP.SplitB->end(), DL, HII->get(Hexagon::J2_jump))
+        .addMBB(SSB);
+      FP.SplitB->addSuccessor(SSB);
+    }
+  }
+
+  // What is left to do is to update the PHI nodes that could have entries
+  // referring to predicated blocks.
+  if (FP.JoinB) {
+    updatePhiNodes(FP.JoinB, FP);
+  } else {
+    if (TSB)
+      updatePhiNodes(TSB, FP);
+    if (FSB)
+      updatePhiNodes(FSB, FP);
+    // Nothing to update in SSB, since SSB's predecessors haven't changed.
+  }
+}
+
+void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n");
+
+  // Transfer the immediate dominator information from B to its descendants.
+  MachineDomTreeNode *N = MDT->getNode(B);
+  MachineDomTreeNode *IDN = N->getIDom();
+  if (IDN) {
+    MachineBasicBlock *IDB = IDN->getBlock();
+    typedef GraphTraits<MachineDomTreeNode*> GTN;
+    typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType;
+    DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
+    for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
+      MachineBasicBlock *SB = (*I)->getBlock();
+      MDT->changeImmediateDominator(SB, IDB);
+    }
+  }
+
+  while (B->succ_size() > 0)
+    B->removeSuccessor(B->succ_begin());
+
+  for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
+    (*I)->removeSuccessor(B, true);
+
+  Deleted.insert(B);
+  MDT->eraseNode(B);
+  MFN->erase(B->getIterator());
+}
+
+void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) {
+  DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n");
+  MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI();
+  for (I = B->begin(); I != NonPHI; I = NextI) {
+    NextI = std::next(I);
+    MachineInstr *PN = &*I;
+    assert(PN->getNumOperands() == 3 && "Invalid phi node");
+    MachineOperand &UO = PN->getOperand(1);
+    unsigned UseR = UO.getReg(), UseSR = UO.getSubReg();
+    unsigned DefR = PN->getOperand(0).getReg();
+    unsigned NewR = UseR;
+    if (UseSR) {
+      // MRI.replaceVregUsesWith does not allow to update the subregister,
+      // so instead of doing the use-iteration here, create a copy into a
+      // "non-subregistered" register.
+      const DebugLoc &DL = PN->getDebugLoc();
+      const TargetRegisterClass *RC = MRI->getRegClass(DefR);
+      NewR = MRI->createVirtualRegister(RC);
+      NonPHI = BuildMI(*B, NonPHI, DL, HII->get(TargetOpcode::COPY), NewR)
+        .addReg(UseR, 0, UseSR);
+    }
+    MRI->replaceRegWith(DefR, NewR);
+    B->erase(I);
+  }
+}
+
+void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB,
+      MachineBasicBlock *NewB) {
+  for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) {
+    MachineBasicBlock *SB = *I;
+    MachineBasicBlock::iterator P, N = SB->getFirstNonPHI();
+    for (P = SB->begin(); P != N; ++P) {
+      MachineInstr &PN = *P;
+      for (MachineOperand &MO : PN.operands())
+        if (MO.isMBB() && MO.getMBB() == OldB)
+          MO.setMBB(NewB);
+    }
+  }
+}
+
+void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
+      MachineBasicBlock *SuccB) {
+  DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and "
+               << PrintMB(SuccB) << "\n");
+  bool TermOk = hasUncondBranch(SuccB);
+  eliminatePhis(SuccB);
+  HII->removeBranch(*PredB);
+  PredB->removeSuccessor(SuccB);
+  PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
+  MachineBasicBlock::succ_iterator I, E = SuccB->succ_end();
+  for (I = SuccB->succ_begin(); I != E; ++I)
+    PredB->addSuccessor(*I);
+  PredB->normalizeSuccProbs();
+  replacePhiEdges(SuccB, PredB);
+  removeBlock(SuccB);
+  if (!TermOk)
+    PredB->updateTerminator();
+}
+
+void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
+  if (FP.TrueB)
+    removeBlock(FP.TrueB);
+  if (FP.FalseB)
+    removeBlock(FP.FalseB);
+
+  FP.SplitB->updateTerminator();
+  if (FP.SplitB->succ_size() != 1)
+    return;
+
+  MachineBasicBlock *SB = *FP.SplitB->succ_begin();
+  if (SB->pred_size() != 1)
+    return;
+
+  // By now, the split block has only one successor (SB), and SB has only
+  // one predecessor. We can try to merge them. We will need to update ter-
+  // minators in FP.Split+SB, and that requires working AnalyzeBranch, which
+  // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends
+  // with an unconditional branch, we won't need to touch the terminators.
+  if (!hasEHLabel(SB) || hasUncondBranch(SB))
+    mergeBlocks(FP.SplitB, SB);
+}
+
+bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  auto &ST = MF.getSubtarget<HexagonSubtarget>();
+  HII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+  MFN = &MF;
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() :
+    nullptr;
+
+  Deleted.clear();
+  bool Changed = false;
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I)
+    Changed |= visitLoop(*I);
+  Changed |= visitLoop(nullptr);
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+FunctionPass *llvm::createHexagonEarlyIfConversion() {
+  return new HexagonEarlyIfConversion();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
new file mode 100644
index 000000000000..8f070d842b8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -0,0 +1,1283 @@
+//===--- HexagonExpandCondsets.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Replace mux instructions with the corresponding legal instructions.
+// It is meant to work post-SSA, but still on virtual registers. It was
+// originally placed between register coalescing and machine instruction
+// scheduler.
+// In this place in the optimization sequence, live interval analysis had
+// been performed, and the live intervals should be preserved. A large part
+// of the code deals with preserving the liveness information.
+//
+// Liveness tracking aside, the main functionality of this pass is divided
+// into two steps. The first step is to replace an instruction
+//   vreg0 = C2_mux vreg1, vreg2, vreg3
+// with a pair of conditional transfers
+//   vreg0 = A2_tfrt vreg1, vreg2
+//   vreg0 = A2_tfrf vreg1, vreg3
+// It is the intention that the execution of this pass could be terminated
+// after this step, and the code generated would be functionally correct.
+//
+// If the uses of the source values vreg1 and vreg2 are kills, and their
+// definitions are predicable, then in the second step, the conditional
+// transfers will then be rewritten as predicated instructions. E.g.
+//   vreg0 = A2_or vreg1, vreg2
+//   vreg3 = A2_tfrt vreg99, vreg0<kill>
+// will be rewritten as
+//   vreg3 = A2_port vreg99, vreg1, vreg2
+//
+// This replacement has two variants: "up" and "down". Consider this case:
+//   vreg0 = A2_or vreg1, vreg2
+//   ... [intervening instructions] ...
+//   vreg3 = A2_tfrt vreg99, vreg0<kill>
+// variant "up":
+//   vreg3 = A2_port vreg99, vreg1, vreg2
+//   ... [intervening instructions, vreg0->vreg3] ...
+//   [deleted]
+// variant "down":
+//   [deleted]
+//   ... [intervening instructions] ...
+//   vreg3 = A2_port vreg99, vreg1, vreg2
+//
+// Both, one or none of these variants may be valid, and checks are made
+// to rule out inapplicable variants.
+//
+// As an additional optimization, before either of the two steps above is
+// executed, the pass attempts to coalesce the target register with one of
+// the source registers, e.g. given an instruction
+//   vreg3 = C2_mux vreg0, vreg1, vreg2
+// vreg3 will be coalesced with either vreg1 or vreg2. If this succeeds,
+// the instruction would then be (for example)
+//   vreg3 = C2_mux vreg0, vreg3, vreg2
+// and, under certain circumstances, this could result in only one predicated
+// instruction:
+//   vreg3 = A2_tfrf vreg0, vreg2
+//
+
+// Splitting a definition of a register into two predicated transfers
+// creates a complication in liveness tracking. Live interval computation
+// will see both instructions as actual definitions, and will mark the
+// first one as dead. The definition is not actually dead, and this
+// situation will need to be fixed. For example:
+//   vreg1<def,dead> = A2_tfrt ...  ; marked as dead
+//   vreg1<def> = A2_tfrf ...
+//
+// Since any of the individual predicated transfers may end up getting
+// removed (in case it is an identity copy), some pre-existing def may
+// be marked as dead after live interval recomputation:
+//   vreg1<def,dead> = ...          ; marked as dead
+//   ...
+//   vreg1<def> = A2_tfrf ...       ; if A2_tfrt is removed
+// This case happens if vreg1 was used as a source in A2_tfrt, which means
+// that is it actually live at the A2_tfrf, and so the now dead definition
+// of vreg1 will need to be updated to non-dead at some point.
+//
+// This issue could be remedied by adding implicit uses to the predicated
+// transfers, but this will create a problem with subsequent predication,
+// since the transfers will no longer be possible to reorder. To avoid
+// that, the initial splitting will not add any implicit uses. These
+// implicit uses will be added later, after predication. The extra price,
+// however, is that finding the locations where the implicit uses need
+// to be added, and updating the live ranges will be more involved.
+
+#define DEBUG_TYPE "expand-condsets"
+
+#include "HexagonInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+
+static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit",
+  cl::init(~0U), cl::Hidden, cl::desc("Max number of mux expansions"));
+static cl::opt<unsigned> OptCoaLimit("expand-condsets-coa-limit",
+  cl::init(~0U), cl::Hidden, cl::desc("Max number of segment coalescings"));
+
+namespace llvm {
+
+  void initializeHexagonExpandCondsetsPass(PassRegistry&);
+  FunctionPass *createHexagonExpandCondsets();
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonExpandCondsets : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonExpandCondsets() :
+        MachineFunctionPass(ID), HII(nullptr), TRI(nullptr), MRI(nullptr),
+        LIS(nullptr), CoaLimitActive(false),
+        TfrLimitActive(false), CoaCounter(0), TfrCounter(0) {
+      if (OptCoaLimit.getPosition())
+        CoaLimitActive = true, CoaLimit = OptCoaLimit;
+      if (OptTfrLimit.getPosition())
+        TfrLimitActive = true, TfrLimit = OptTfrLimit;
+      initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override { return "Hexagon Expand Condsets"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    const HexagonInstrInfo *HII;
+    const TargetRegisterInfo *TRI;
+    MachineDominatorTree *MDT;
+    MachineRegisterInfo *MRI;
+    LiveIntervals *LIS;
+
+    bool CoaLimitActive, TfrLimitActive;
+    unsigned CoaLimit, TfrLimit, CoaCounter, TfrCounter;
+
+    struct RegisterRef {
+      RegisterRef(const MachineOperand &Op) : Reg(Op.getReg()),
+          Sub(Op.getSubReg()) {}
+      RegisterRef(unsigned R = 0, unsigned S = 0) : Reg(R), Sub(S) {}
+
+      bool operator== (RegisterRef RR) const {
+        return Reg == RR.Reg && Sub == RR.Sub;
+      }
+      bool operator!= (RegisterRef RR) const { return !operator==(RR); }
+      bool operator< (RegisterRef RR) const {
+        return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub);
+      }
+
+      unsigned Reg, Sub;
+    };
+
+    typedef DenseMap<unsigned,unsigned> ReferenceMap;
+    enum { Sub_Low = 0x1, Sub_High = 0x2, Sub_None = (Sub_Low | Sub_High) };
+    enum { Exec_Then = 0x10, Exec_Else = 0x20 };
+    unsigned getMaskForSub(unsigned Sub);
+    bool isCondset(const MachineInstr &MI);
+    LaneBitmask getLaneMask(unsigned Reg, unsigned Sub);
+
+    void addRefToMap(RegisterRef RR, ReferenceMap &Map, unsigned Exec);
+    bool isRefInMap(RegisterRef, ReferenceMap &Map, unsigned Exec);
+
+    void updateDeadsInRange(unsigned Reg, LaneBitmask LM, LiveRange &Range);
+    void updateKillFlags(unsigned Reg);
+    void updateDeadFlags(unsigned Reg);
+    void recalculateLiveInterval(unsigned Reg);
+    void removeInstr(MachineInstr &MI);
+    void updateLiveness(std::set<unsigned> &RegSet, bool Recalc,
+        bool UpdateKills, bool UpdateDeads);
+
+    unsigned getCondTfrOpcode(const MachineOperand &SO, bool Cond);
+    MachineInstr *genCondTfrFor(MachineOperand &SrcOp,
+        MachineBasicBlock::iterator At, unsigned DstR,
+        unsigned DstSR, const MachineOperand &PredOp, bool PredSense,
+        bool ReadUndef, bool ImpUse);
+    bool split(MachineInstr &MI, std::set<unsigned> &UpdRegs);
+
+    bool isPredicable(MachineInstr *MI);
+    MachineInstr *getReachingDefForPred(RegisterRef RD,
+        MachineBasicBlock::iterator UseIt, unsigned PredR, bool Cond);
+    bool canMoveOver(MachineInstr &MI, ReferenceMap &Defs, ReferenceMap &Uses);
+    bool canMoveMemTo(MachineInstr &MI, MachineInstr &ToI, bool IsDown);
+    void predicateAt(const MachineOperand &DefOp, MachineInstr &MI,
+                     MachineBasicBlock::iterator Where,
+                     const MachineOperand &PredOp, bool Cond,
+                     std::set<unsigned> &UpdRegs);
+    void renameInRange(RegisterRef RO, RegisterRef RN, unsigned PredR,
+        bool Cond, MachineBasicBlock::iterator First,
+        MachineBasicBlock::iterator Last);
+    bool predicate(MachineInstr &TfrI, bool Cond, std::set<unsigned> &UpdRegs);
+    bool predicateInBlock(MachineBasicBlock &B,
+        std::set<unsigned> &UpdRegs);
+
+    bool isIntReg(RegisterRef RR, unsigned &BW);
+    bool isIntraBlocks(LiveInterval &LI);
+    bool coalesceRegisters(RegisterRef R1, RegisterRef R2);
+    bool coalesceSegments(const SmallVectorImpl<MachineInstr*> &Condsets,
+                          std::set<unsigned> &UpdRegs);
+  };
+
+} // end anonymous namespace
+
+char HexagonExpandCondsets::ID = 0;
+
+namespace llvm {
+
+  char &HexagonExpandCondsetsID = HexagonExpandCondsets::ID;
+
+} // end namespace llvm
+
+INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets",
+  "Hexagon Expand Condsets", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets",
+  "Hexagon Expand Condsets", false, false)
+
+unsigned HexagonExpandCondsets::getMaskForSub(unsigned Sub) {
+  switch (Sub) {
+    case Hexagon::isub_lo:
+    case Hexagon::vsub_lo:
+      return Sub_Low;
+    case Hexagon::isub_hi:
+    case Hexagon::vsub_hi:
+      return Sub_High;
+    case Hexagon::NoSubRegister:
+      return Sub_None;
+  }
+  llvm_unreachable("Invalid subregister");
+}
+
+bool HexagonExpandCondsets::isCondset(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::C2_mux:
+    case Hexagon::C2_muxii:
+    case Hexagon::C2_muxir:
+    case Hexagon::C2_muxri:
+    case Hexagon::PS_pselect:
+        return true;
+      break;
+  }
+  return false;
+}
+
+LaneBitmask HexagonExpandCondsets::getLaneMask(unsigned Reg, unsigned Sub) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  return Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub)
+                  : MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+void HexagonExpandCondsets::addRefToMap(RegisterRef RR, ReferenceMap &Map,
+      unsigned Exec) {
+  unsigned Mask = getMaskForSub(RR.Sub) | Exec;
+  ReferenceMap::iterator F = Map.find(RR.Reg);
+  if (F == Map.end())
+    Map.insert(std::make_pair(RR.Reg, Mask));
+  else
+    F->second |= Mask;
+}
+
+bool HexagonExpandCondsets::isRefInMap(RegisterRef RR, ReferenceMap &Map,
+      unsigned Exec) {
+  ReferenceMap::iterator F = Map.find(RR.Reg);
+  if (F == Map.end())
+    return false;
+  unsigned Mask = getMaskForSub(RR.Sub) | Exec;
+  if (Mask & F->second)
+    return true;
+  return false;
+}
+
+void HexagonExpandCondsets::updateKillFlags(unsigned Reg) {
+  auto KillAt = [this,Reg] (SlotIndex K, LaneBitmask LM) -> void {
+    // Set the <kill> flag on a use of Reg whose lane mask is contained in LM.
+    MachineInstr *MI = LIS->getInstructionFromIndex(K);
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg() || !Op.isUse() || Op.getReg() != Reg)
+        continue;
+      LaneBitmask SLM = getLaneMask(Reg, Op.getSubReg());
+      if ((SLM & LM) == SLM) {
+        // Only set the kill flag on the first encountered use of Reg in this
+        // instruction.
+        Op.setIsKill(true);
+        break;
+      }
+    }
+  };
+
+  LiveInterval &LI = LIS->getInterval(Reg);
+  for (auto I = LI.begin(), E = LI.end(); I != E; ++I) {
+    if (!I->end.isRegister())
+      continue;
+    // Do not mark the end of the segment as <kill>, if the next segment
+    // starts with a predicated instruction.
+    auto NextI = std::next(I);
+    if (NextI != E && NextI->start.isRegister()) {
+      MachineInstr *DefI = LIS->getInstructionFromIndex(NextI->start);
+      if (HII->isPredicated(*DefI))
+        continue;
+    }
+    bool WholeReg = true;
+    if (LI.hasSubRanges()) {
+      auto EndsAtI = [I] (LiveInterval::SubRange &S) -> bool {
+        LiveRange::iterator F = S.find(I->end);
+        return F != S.end() && I->end == F->end;
+      };
+      // Check if all subranges end at I->end. If so, make sure to kill
+      // the whole register.
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        if (EndsAtI(S))
+          KillAt(I->end, S.LaneMask);
+        else
+          WholeReg = false;
+      }
+    }
+    if (WholeReg)
+      KillAt(I->end, MRI->getMaxLaneMaskForVReg(Reg));
+  }
+}
+
+void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
+      LiveRange &Range) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  if (Range.empty())
+    return;
+
+  auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+    if (!Op.isReg() || !Op.isDef())
+      return false;
+    unsigned DR = Op.getReg(), DSR = Op.getSubReg();
+    if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
+      return false;
+    LaneBitmask SLM = getLaneMask(DR, DSR);
+    return (SLM & LM).any();
+  };
+
+  // The splitting step will create pairs of predicated definitions without
+  // any implicit uses (since implicit uses would interfere with predication).
+  // This can cause the reaching defs to become dead after live range
+  // recomputation, even though they are not really dead.
+  // We need to identify predicated defs that need implicit uses, and
+  // dead defs that are not really dead, and correct both problems.
+
+  auto Dominate = [this] (SetVector<MachineBasicBlock*> &Defs,
+                          MachineBasicBlock *Dest) -> bool {
+    for (MachineBasicBlock *D : Defs)
+      if (D != Dest && MDT->dominates(D, Dest))
+        return true;
+
+    MachineBasicBlock *Entry = &Dest->getParent()->front();
+    SetVector<MachineBasicBlock*> Work(Dest->pred_begin(), Dest->pred_end());
+    for (unsigned i = 0; i < Work.size(); ++i) {
+      MachineBasicBlock *B = Work[i];
+      if (Defs.count(B))
+        continue;
+      if (B == Entry)
+        return false;
+      for (auto *P : B->predecessors())
+        Work.insert(P);
+    }
+    return true;
+  };
+
+  // First, try to extend live range within individual basic blocks. This
+  // will leave us only with dead defs that do not reach any predicated
+  // defs in the same block.
+  SetVector<MachineBasicBlock*> Defs;
+  SmallVector<SlotIndex,4> PredDefs;
+  for (auto &Seg : Range) {
+    if (!Seg.start.isRegister())
+      continue;
+    MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+    Defs.insert(DefI->getParent());
+    if (HII->isPredicated(*DefI))
+      PredDefs.push_back(Seg.start);
+  }
+
+  SmallVector<SlotIndex,8> Undefs;
+  LiveInterval &LI = LIS->getInterval(Reg);
+  LI.computeSubRangeUndefs(Undefs, LM, *MRI, *LIS->getSlotIndexes());
+
+  for (auto &SI : PredDefs) {
+    MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+    auto P = Range.extendInBlock(Undefs, LIS->getMBBStartIdx(BB), SI);
+    if (P.first != nullptr || P.second)
+      SI = SlotIndex();
+  }
+
+  // Calculate reachability for those predicated defs that were not handled
+  // by the in-block extension.
+  SmallVector<SlotIndex,4> ExtTo;
+  for (auto &SI : PredDefs) {
+    if (!SI.isValid())
+      continue;
+    MachineBasicBlock *BB = LIS->getMBBFromIndex(SI);
+    if (BB->pred_empty())
+      continue;
+    // If the defs from this range reach SI via all predecessors, it is live.
+    // It can happen that SI is reached by the defs through some paths, but
+    // not all. In the IR coming into this optimization, SI would not be
+    // considered live, since the defs would then not jointly dominate SI.
+    // That means that SI is an overwriting def, and no implicit use is
+    // needed at this point. Do not add SI to the extension points, since
+    // extendToIndices will abort if there is no joint dominance.
+    // If the abort was avoided by adding extra undefs added to Undefs,
+    // extendToIndices could actually indicate that SI is live, contrary
+    // to the original IR.
+    if (Dominate(Defs, BB))
+      ExtTo.push_back(SI);
+  }
+
+  if (!ExtTo.empty())
+    LIS->extendToIndices(Range, ExtTo, Undefs);
+
+  // Remove <dead> flags from all defs that are not dead after live range
+  // extension, and collect all def operands. They will be used to generate
+  // the necessary implicit uses.
+  std::set<RegisterRef> DefRegs;
+  for (auto &Seg : Range) {
+    if (!Seg.start.isRegister())
+      continue;
+    MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+    for (auto &Op : DefI->operands()) {
+      if (Seg.start.isDead() || !IsRegDef(Op))
+        continue;
+      DefRegs.insert(Op);
+      Op.setIsDead(false);
+    }
+  }
+
+  // Finally, add implicit uses to each predicated def that is reached
+  // by other defs.
+  for (auto &Seg : Range) {
+    if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
+      continue;
+    MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
+    if (!HII->isPredicated(*DefI))
+      continue;
+    // Construct the set of all necessary implicit uses, based on the def
+    // operands in the instruction.
+    std::set<RegisterRef> ImpUses;
+    for (auto &Op : DefI->operands())
+      if (Op.isReg() && Op.isDef() && DefRegs.count(Op))
+        ImpUses.insert(Op);
+    if (ImpUses.empty())
+      continue;
+    MachineFunction &MF = *DefI->getParent()->getParent();
+    for (RegisterRef R : ImpUses)
+      MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
+  }
+}
+
+void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
+  LiveInterval &LI = LIS->getInterval(Reg);
+  if (LI.hasSubRanges()) {
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      updateDeadsInRange(Reg, S.LaneMask, S);
+      LIS->shrinkToUses(S, Reg);
+    }
+    LI.clear();
+    LIS->constructMainRangeFromSubranges(LI);
+  } else {
+    updateDeadsInRange(Reg, MRI->getMaxLaneMaskForVReg(Reg), LI);
+  }
+}
+
+void HexagonExpandCondsets::recalculateLiveInterval(unsigned Reg) {
+  LIS->removeInterval(Reg);
+  LIS->createAndComputeVirtRegInterval(Reg);
+}
+
+void HexagonExpandCondsets::removeInstr(MachineInstr &MI) {
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+}
+
+void HexagonExpandCondsets::updateLiveness(std::set<unsigned> &RegSet,
+      bool Recalc, bool UpdateKills, bool UpdateDeads) {
+  UpdateKills |= UpdateDeads;
+  for (auto R : RegSet) {
+    if (Recalc)
+      recalculateLiveInterval(R);
+    if (UpdateKills)
+      MRI->clearKillFlags(R);
+    if (UpdateDeads)
+      updateDeadFlags(R);
+    // Fixing <dead> flags may extend live ranges, so reset <kill> flags
+    // after that.
+    if (UpdateKills)
+      updateKillFlags(R);
+    LIS->getInterval(R).verify();
+  }
+}
+
+/// Get the opcode for a conditional transfer of the value in SO (source
+/// operand). The condition (true/false) is given in Cond.
+unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
+      bool IfTrue) {
+  using namespace Hexagon;
+
+  if (SO.isReg()) {
+    unsigned PhysR;
+    RegisterRef RS = SO;
+    if (TargetRegisterInfo::isVirtualRegister(RS.Reg)) {
+      const TargetRegisterClass *VC = MRI->getRegClass(RS.Reg);
+      assert(VC->begin() != VC->end() && "Empty register class");
+      PhysR = *VC->begin();
+    } else {
+      assert(TargetRegisterInfo::isPhysicalRegister(RS.Reg));
+      PhysR = RS.Reg;
+    }
+    unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
+    switch (RC->getSize()) {
+      case 4:
+        return IfTrue ? A2_tfrt : A2_tfrf;
+      case 8:
+        return IfTrue ? A2_tfrpt : A2_tfrpf;
+    }
+    llvm_unreachable("Invalid register operand");
+  }
+  if (SO.isImm() || SO.isFPImm())
+    return IfTrue ? C2_cmoveit : C2_cmoveif;
+  llvm_unreachable("Unexpected source operand");
+}
+
+/// Generate a conditional transfer, copying the value SrcOp to the
+/// destination register DstR:DstSR, and using the predicate register from
+/// PredOp. The Cond argument specifies whether the predicate is to be
+/// if(PredOp), or if(!PredOp).
+MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
+      MachineBasicBlock::iterator At,
+      unsigned DstR, unsigned DstSR, const MachineOperand &PredOp,
+      bool PredSense, bool ReadUndef, bool ImpUse) {
+  MachineInstr *MI = SrcOp.getParent();
+  MachineBasicBlock &B = *At->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  // Don't avoid identity copies here (i.e. if the source and the destination
+  // are the same registers). It is actually better to generate them here,
+  // since this would cause the copy to potentially be predicated in the next
+  // step. The predication will remove such a copy if it is unable to
+  /// predicate.
+
+  unsigned Opc = getCondTfrOpcode(SrcOp, PredSense);
+  unsigned DstState = RegState::Define | (ReadUndef ? RegState::Undef : 0);
+  unsigned PredState = getRegState(PredOp) & ~RegState::Kill;
+  MachineInstrBuilder MIB;
+
+  if (SrcOp.isReg()) {
+    unsigned SrcState = getRegState(SrcOp);
+    if (RegisterRef(SrcOp) == RegisterRef(DstR, DstSR))
+      SrcState &= ~RegState::Kill;
+    MIB = BuildMI(B, At, DL, HII->get(Opc))
+          .addReg(DstR, DstState, DstSR)
+          .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+          .addReg(SrcOp.getReg(), SrcState, SrcOp.getSubReg());
+  } else {
+    MIB = BuildMI(B, At, DL, HII->get(Opc))
+          .addReg(DstR, DstState, DstSR)
+          .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+          .addOperand(SrcOp);
+  }
+
+  DEBUG(dbgs() << "created an initial copy: " << *MIB);
+  return &*MIB;
+}
+
+/// Replace a MUX instruction MI with a pair A2_tfrt/A2_tfrf. This function
+/// performs all necessary changes to complete the replacement.
+bool HexagonExpandCondsets::split(MachineInstr &MI,
+                                  std::set<unsigned> &UpdRegs) {
+  if (TfrLimitActive) {
+    if (TfrCounter >= TfrLimit)
+      return false;
+    TfrCounter++;
+  }
+  DEBUG(dbgs() << "\nsplitting BB#" << MI.getParent()->getNumber() << ": "
+               << MI);
+  MachineOperand &MD = MI.getOperand(0);  // Definition
+  MachineOperand &MP = MI.getOperand(1);  // Predicate register
+  assert(MD.isDef());
+  unsigned DR = MD.getReg(), DSR = MD.getSubReg();
+  bool ReadUndef = MD.isUndef();
+  MachineBasicBlock::iterator At = MI;
+
+  // If this is a mux of the same register, just replace it with COPY.
+  // Ideally, this would happen earlier, so that register coalescing would
+  // see it.
+  MachineOperand &ST = MI.getOperand(2);
+  MachineOperand &SF = MI.getOperand(3);
+  if (ST.isReg() && SF.isReg()) {
+    RegisterRef RT(ST);
+    if (RT == RegisterRef(SF)) {
+      MI.setDesc(HII->get(TargetOpcode::COPY));
+      unsigned S = getRegState(ST);
+      while (MI.getNumOperands() > 1)
+        MI.RemoveOperand(MI.getNumOperands()-1);
+      MachineFunction &MF = *MI.getParent()->getParent();
+      MachineInstrBuilder(MF, MI).addReg(RT.Reg, S, RT.Sub);
+      return true;
+    }
+  }
+
+  // First, create the two invididual conditional transfers, and add each
+  // of them to the live intervals information. Do that first and then remove
+  // the old instruction from live intervals.
+  MachineInstr *TfrT =
+      genCondTfrFor(ST, At, DR, DSR, MP, true, ReadUndef, false);
+  MachineInstr *TfrF =
+      genCondTfrFor(SF, At, DR, DSR, MP, false, ReadUndef, true);
+  LIS->InsertMachineInstrInMaps(*TfrT);
+  LIS->InsertMachineInstrInMaps(*TfrF);
+
+  // Will need to recalculate live intervals for all registers in MI.
+  for (auto &Op : MI.operands())
+    if (Op.isReg())
+      UpdRegs.insert(Op.getReg());
+
+  removeInstr(MI);
+  return true;
+}
+
+bool HexagonExpandCondsets::isPredicable(MachineInstr *MI) {
+  if (HII->isPredicated(*MI) || !HII->isPredicable(*MI))
+    return false;
+  if (MI->hasUnmodeledSideEffects() || MI->mayStore())
+    return false;
+  // Reject instructions with multiple defs (e.g. post-increment loads).
+  bool HasDef = false;
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg() || !Op.isDef())
+      continue;
+    if (HasDef)
+      return false;
+    HasDef = true;
+  }
+  for (auto &Mo : MI->memoperands())
+    if (Mo->isVolatile())
+      return false;
+  return true;
+}
+
+/// Find the reaching definition for a predicated use of RD. The RD is used
+/// under the conditions given by PredR and Cond, and this function will ignore
+/// definitions that set RD under the opposite conditions.
+MachineInstr *HexagonExpandCondsets::getReachingDefForPred(RegisterRef RD,
+      MachineBasicBlock::iterator UseIt, unsigned PredR, bool Cond) {
+  MachineBasicBlock &B = *UseIt->getParent();
+  MachineBasicBlock::iterator I = UseIt, S = B.begin();
+  if (I == S)
+    return nullptr;
+
+  bool PredValid = true;
+  do {
+    --I;
+    MachineInstr *MI = &*I;
+    // Check if this instruction can be ignored, i.e. if it is predicated
+    // on the complementary condition.
+    if (PredValid && HII->isPredicated(*MI)) {
+      if (MI->readsRegister(PredR) && (Cond != HII->isPredicatedTrue(*MI)))
+        continue;
+    }
+
+    // Check the defs. If the PredR is defined, invalidate it. If RD is
+    // defined, return the instruction or 0, depending on the circumstances.
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg() || !Op.isDef())
+        continue;
+      RegisterRef RR = Op;
+      if (RR.Reg == PredR) {
+        PredValid = false;
+        continue;
+      }
+      if (RR.Reg != RD.Reg)
+        continue;
+      // If the "Reg" part agrees, there is still the subregister to check.
+      // If we are looking for vreg1:loreg, we can skip vreg1:hireg, but
+      // not vreg1 (w/o subregisters).
+      if (RR.Sub == RD.Sub)
+        return MI;
+      if (RR.Sub == 0 || RD.Sub == 0)
+        return nullptr;
+      // We have different subregisters, so we can continue looking.
+    }
+  } while (I != S);
+
+  return nullptr;
+}
+
+/// Check if the instruction MI can be safely moved over a set of instructions
+/// whose side-effects (in terms of register defs and uses) are expressed in
+/// the maps Defs and Uses. These maps reflect the conditional defs and uses
+/// that depend on the same predicate register to allow moving instructions
+/// over instructions predicated on the opposite condition.
+bool HexagonExpandCondsets::canMoveOver(MachineInstr &MI, ReferenceMap &Defs,
+                                        ReferenceMap &Uses) {
+  // In order to be able to safely move MI over instructions that define
+  // "Defs" and use "Uses", no def operand from MI can be defined or used
+  // and no use operand can be defined.
+  for (auto &Op : MI.operands()) {
+    if (!Op.isReg())
+      continue;
+    RegisterRef RR = Op;
+    // For physical register we would need to check register aliases, etc.
+    // and we don't want to bother with that. It would be of little value
+    // before the actual register rewriting (from virtual to physical).
+    if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+      return false;
+    // No redefs for any operand.
+    if (isRefInMap(RR, Defs, Exec_Then))
+      return false;
+    // For defs, there cannot be uses.
+    if (Op.isDef() && isRefInMap(RR, Uses, Exec_Then))
+      return false;
+  }
+  return true;
+}
+
+/// Check if the instruction accessing memory (TheI) can be moved to the
+/// location ToI.
+bool HexagonExpandCondsets::canMoveMemTo(MachineInstr &TheI, MachineInstr &ToI,
+                                         bool IsDown) {
+  bool IsLoad = TheI.mayLoad(), IsStore = TheI.mayStore();
+  if (!IsLoad && !IsStore)
+    return true;
+  if (HII->areMemAccessesTriviallyDisjoint(TheI, ToI))
+    return true;
+  if (TheI.hasUnmodeledSideEffects())
+    return false;
+
+  MachineBasicBlock::iterator StartI = IsDown ? TheI : ToI;
+  MachineBasicBlock::iterator EndI = IsDown ? ToI : TheI;
+  bool Ordered = TheI.hasOrderedMemoryRef();
+
+  // Search for aliased memory reference in (StartI, EndI).
+  for (MachineBasicBlock::iterator I = std::next(StartI); I != EndI; ++I) {
+    MachineInstr *MI = &*I;
+    if (MI->hasUnmodeledSideEffects())
+      return false;
+    bool L = MI->mayLoad(), S = MI->mayStore();
+    if (!L && !S)
+      continue;
+    if (Ordered && MI->hasOrderedMemoryRef())
+      return false;
+
+    bool Conflict = (L && IsStore) || S;
+    if (Conflict)
+      return false;
+  }
+  return true;
+}
+
+/// Generate a predicated version of MI (where the condition is given via
+/// PredR and Cond) at the point indicated by Where.
+void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
+                                        MachineInstr &MI,
+                                        MachineBasicBlock::iterator Where,
+                                        const MachineOperand &PredOp, bool Cond,
+                                        std::set<unsigned> &UpdRegs) {
+  // The problem with updating live intervals is that we can move one def
+  // past another def. In particular, this can happen when moving an A2_tfrt
+  // over an A2_tfrf defining the same register. From the point of view of
+  // live intervals, these two instructions are two separate definitions,
+  // and each one starts another live segment. LiveIntervals's "handleMove"
+  // does not allow such moves, so we need to handle it ourselves. To avoid
+  // invalidating liveness data while we are using it, the move will be
+  // implemented in 4 steps: (1) add a clone of the instruction MI at the
+  // target location, (2) update liveness, (3) delete the old instruction,
+  // and (4) update liveness again.
+
+  MachineBasicBlock &B = *MI.getParent();
+  DebugLoc DL = Where->getDebugLoc();  // "Where" points to an instruction.
+  unsigned Opc = MI.getOpcode();
+  unsigned PredOpc = HII->getCondOpcode(Opc, !Cond);
+  MachineInstrBuilder MB = BuildMI(B, Where, DL, HII->get(PredOpc));
+  unsigned Ox = 0, NP = MI.getNumOperands();
+  // Skip all defs from MI first.
+  while (Ox < NP) {
+    MachineOperand &MO = MI.getOperand(Ox);
+    if (!MO.isReg() || !MO.isDef())
+      break;
+    Ox++;
+  }
+  // Add the new def, then the predicate register, then the rest of the
+  // operands.
+  MB.addReg(DefOp.getReg(), getRegState(DefOp), DefOp.getSubReg());
+  MB.addReg(PredOp.getReg(), PredOp.isUndef() ? RegState::Undef : 0,
+            PredOp.getSubReg());
+  while (Ox < NP) {
+    MachineOperand &MO = MI.getOperand(Ox);
+    if (!MO.isReg() || !MO.isImplicit())
+      MB.addOperand(MO);
+    Ox++;
+  }
+
+  MachineFunction &MF = *B.getParent();
+  MachineInstr::mmo_iterator I = MI.memoperands_begin();
+  unsigned NR = std::distance(I, MI.memoperands_end());
+  MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(NR);
+  for (unsigned i = 0; i < NR; ++i)
+    MemRefs[i] = *I++;
+  MB.setMemRefs(MemRefs, MemRefs+NR);
+
+  MachineInstr *NewI = MB;
+  NewI->clearKillInfo();
+  LIS->InsertMachineInstrInMaps(*NewI);
+
+  for (auto &Op : NewI->operands())
+    if (Op.isReg())
+      UpdRegs.insert(Op.getReg());
+}
+
+/// In the range [First, Last], rename all references to the "old" register RO
+/// to the "new" register RN, but only in instructions predicated on the given
+/// condition.
+void HexagonExpandCondsets::renameInRange(RegisterRef RO, RegisterRef RN,
+      unsigned PredR, bool Cond, MachineBasicBlock::iterator First,
+      MachineBasicBlock::iterator Last) {
+  MachineBasicBlock::iterator End = std::next(Last);
+  for (MachineBasicBlock::iterator I = First; I != End; ++I) {
+    MachineInstr *MI = &*I;
+    // Do not touch instructions that are not predicated, or are predicated
+    // on the opposite condition.
+    if (!HII->isPredicated(*MI))
+      continue;
+    if (!MI->readsRegister(PredR) || (Cond != HII->isPredicatedTrue(*MI)))
+      continue;
+
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg() || RO != RegisterRef(Op))
+        continue;
+      Op.setReg(RN.Reg);
+      Op.setSubReg(RN.Sub);
+      // In practice, this isn't supposed to see any defs.
+      assert(!Op.isDef() && "Not expecting a def");
+    }
+  }
+}
+
+/// For a given conditional copy, predicate the definition of the source of
+/// the copy under the given condition (using the same predicate register as
+/// the copy).
+bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
+                                      std::set<unsigned> &UpdRegs) {
+  // TfrI - A2_tfr[tf] Instruction (not A2_tfrsi).
+  unsigned Opc = TfrI.getOpcode();
+  (void)Opc;
+  assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
+  DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
+               << ": " << TfrI);
+
+  MachineOperand &MD = TfrI.getOperand(0);
+  MachineOperand &MP = TfrI.getOperand(1);
+  MachineOperand &MS = TfrI.getOperand(2);
+  // The source operand should be a <kill>. This is not strictly necessary,
+  // but it makes things a lot simpler. Otherwise, we would need to rename
+  // some registers, which would complicate the transformation considerably.
+  if (!MS.isKill())
+    return false;
+  // Avoid predicating instructions that define a subregister if subregister
+  // liveness tracking is not enabled.
+  if (MD.getSubReg() && !MRI->shouldTrackSubRegLiveness(MD.getReg()))
+    return false;
+
+  RegisterRef RT(MS);
+  unsigned PredR = MP.getReg();
+  MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond);
+  if (!DefI || !isPredicable(DefI))
+    return false;
+
+  DEBUG(dbgs() << "Source def: " << *DefI);
+
+  // Collect the information about registers defined and used between the
+  // DefI and the TfrI.
+  // Map: reg -> bitmask of subregs
+  ReferenceMap Uses, Defs;
+  MachineBasicBlock::iterator DefIt = DefI, TfrIt = TfrI;
+
+  // Check if the predicate register is valid between DefI and TfrI.
+  // If it is, we can then ignore instructions predicated on the negated
+  // conditions when collecting def and use information.
+  bool PredValid = true;
+  for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
+    if (!I->modifiesRegister(PredR, nullptr))
+      continue;
+    PredValid = false;
+    break;
+  }
+
+  for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
+    MachineInstr *MI = &*I;
+    // If this instruction is predicated on the same register, it could
+    // potentially be ignored.
+    // By default assume that the instruction executes on the same condition
+    // as TfrI (Exec_Then), and also on the opposite one (Exec_Else).
+    unsigned Exec = Exec_Then | Exec_Else;
+    if (PredValid && HII->isPredicated(*MI) && MI->readsRegister(PredR))
+      Exec = (Cond == HII->isPredicatedTrue(*MI)) ? Exec_Then : Exec_Else;
+
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg())
+        continue;
+      // We don't want to deal with physical registers. The reason is that
+      // they can be aliased with other physical registers. Aliased virtual
+      // registers must share the same register number, and can only differ
+      // in the subregisters, which we are keeping track of. Physical
+      // registers ters no longer have subregisters---their super- and
+      // subregisters are other physical registers, and we are not checking
+      // that.
+      RegisterRef RR = Op;
+      if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+        return false;
+
+      ReferenceMap &Map = Op.isDef() ? Defs : Uses;
+      if (Op.isDef() && Op.isUndef()) {
+        assert(RR.Sub && "Expecting a subregister on <def,read-undef>");
+        // If this is a <def,read-undef>, then it invalidates the non-written
+        // part of the register. For the purpose of checking the validity of
+        // the move, assume that it modifies the whole register.
+        RR.Sub = 0;
+      }
+      addRefToMap(RR, Map, Exec);
+    }
+  }
+
+  // The situation:
+  //   RT = DefI
+  //   ...
+  //   RD = TfrI ..., RT
+
+  // If the register-in-the-middle (RT) is used or redefined between
+  // DefI and TfrI, we may not be able proceed with this transformation.
+  // We can ignore a def that will not execute together with TfrI, and a
+  // use that will. If there is such a use (that does execute together with
+  // TfrI), we will not be able to move DefI down. If there is a use that
+  // executed if TfrI's condition is false, then RT must be available
+  // unconditionally (cannot be predicated).
+  // Essentially, we need to be able to rename RT to RD in this segment.
+  if (isRefInMap(RT, Defs, Exec_Then) || isRefInMap(RT, Uses, Exec_Else))
+    return false;
+  RegisterRef RD = MD;
+  // If the predicate register is defined between DefI and TfrI, the only
+  // potential thing to do would be to move the DefI down to TfrI, and then
+  // predicate. The reaching def (DefI) must be movable down to the location
+  // of the TfrI.
+  // If the target register of the TfrI (RD) is not used or defined between
+  // DefI and TfrI, consider moving TfrI up to DefI.
+  bool CanUp =   canMoveOver(TfrI, Defs, Uses);
+  bool CanDown = canMoveOver(*DefI, Defs, Uses);
+  // The TfrI does not access memory, but DefI could. Check if it's safe
+  // to move DefI down to TfrI.
+  if (DefI->mayLoad() || DefI->mayStore())
+    if (!canMoveMemTo(*DefI, TfrI, true))
+      CanDown = false;
+
+  DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
+               << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
+  MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
+  if (CanUp)
+    predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
+  else if (CanDown)
+    predicateAt(MD, *DefI, TfrIt, MP, Cond, UpdRegs);
+  else
+    return false;
+
+  if (RT != RD) {
+    renameInRange(RT, RD, PredR, Cond, PastDefIt, TfrIt);
+    UpdRegs.insert(RT.Reg);
+  }
+
+  removeInstr(TfrI);
+  removeInstr(*DefI);
+  return true;
+}
+
+/// Predicate all cases of conditional copies in the specified block.
+bool HexagonExpandCondsets::predicateInBlock(MachineBasicBlock &B,
+      std::set<unsigned> &UpdRegs) {
+  bool Changed = false;
+  MachineBasicBlock::iterator I, E, NextI;
+  for (I = B.begin(), E = B.end(); I != E; I = NextI) {
+    NextI = std::next(I);
+    unsigned Opc = I->getOpcode();
+    if (Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf) {
+      bool Done = predicate(*I, (Opc == Hexagon::A2_tfrt), UpdRegs);
+      if (!Done) {
+        // If we didn't predicate I, we may need to remove it in case it is
+        // an "identity" copy, e.g.  vreg1 = A2_tfrt vreg2, vreg1.
+        if (RegisterRef(I->getOperand(0)) == RegisterRef(I->getOperand(2))) {
+          for (auto &Op : I->operands())
+            if (Op.isReg())
+              UpdRegs.insert(Op.getReg());
+          removeInstr(*I);
+        }
+      }
+      Changed |= Done;
+    }
+  }
+  return Changed;
+}
+
+bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
+  if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClass(RR.Reg);
+  if (RC == &Hexagon::IntRegsRegClass) {
+    BW = 32;
+    return true;
+  }
+  if (RC == &Hexagon::DoubleRegsRegClass) {
+    BW = (RR.Sub != 0) ? 32 : 64;
+    return true;
+  }
+  return false;
+}
+
+bool HexagonExpandCondsets::isIntraBlocks(LiveInterval &LI) {
+  for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
+    LiveRange::Segment &LR = *I;
+    // Range must start at a register...
+    if (!LR.start.isRegister())
+      return false;
+    // ...and end in a register or in a dead slot.
+    if (!LR.end.isRegister() && !LR.end.isDead())
+      return false;
+  }
+  return true;
+}
+
+bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
+  if (CoaLimitActive) {
+    if (CoaCounter >= CoaLimit)
+      return false;
+    CoaCounter++;
+  }
+  unsigned BW1, BW2;
+  if (!isIntReg(R1, BW1) || !isIntReg(R2, BW2) || BW1 != BW2)
+    return false;
+  if (MRI->isLiveIn(R1.Reg))
+    return false;
+  if (MRI->isLiveIn(R2.Reg))
+    return false;
+
+  LiveInterval &L1 = LIS->getInterval(R1.Reg);
+  LiveInterval &L2 = LIS->getInterval(R2.Reg);
+  if (L2.empty())
+    return false;
+  if (L1.hasSubRanges() || L2.hasSubRanges())
+    return false;
+  bool Overlap = L1.overlaps(L2);
+
+  DEBUG(dbgs() << "compatible registers: ("
+               << (Overlap ? "overlap" : "disjoint") << ")\n  "
+               << PrintReg(R1.Reg, TRI, R1.Sub) << "  " << L1 << "\n  "
+               << PrintReg(R2.Reg, TRI, R2.Sub) << "  " << L2 << "\n");
+  if (R1.Sub || R2.Sub)
+    return false;
+  if (Overlap)
+    return false;
+
+  // Coalescing could have a negative impact on scheduling, so try to limit
+  // to some reasonable extent. Only consider coalescing segments, when one
+  // of them does not cross basic block boundaries.
+  if (!isIntraBlocks(L1) && !isIntraBlocks(L2))
+    return false;
+
+  MRI->replaceRegWith(R2.Reg, R1.Reg);
+
+  // Move all live segments from L2 to L1.
+  typedef DenseMap<VNInfo*,VNInfo*> ValueInfoMap;
+  ValueInfoMap VM;
+  for (LiveInterval::iterator I = L2.begin(), E = L2.end(); I != E; ++I) {
+    VNInfo *NewVN, *OldVN = I->valno;
+    ValueInfoMap::iterator F = VM.find(OldVN);
+    if (F == VM.end()) {
+      NewVN = L1.getNextValue(I->valno->def, LIS->getVNInfoAllocator());
+      VM.insert(std::make_pair(OldVN, NewVN));
+    } else {
+      NewVN = F->second;
+    }
+    L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN));
+  }
+  while (L2.begin() != L2.end())
+    L2.removeSegment(*L2.begin());
+  LIS->removeInterval(R2.Reg);
+
+  updateKillFlags(R1.Reg);
+  DEBUG(dbgs() << "coalesced: " << L1 << "\n");
+  L1.verify();
+
+  return true;
+}
+
+/// Attempt to coalesce one of the source registers to a MUX instruction with
+/// the destination register. This could lead to having only one predicated
+/// instruction in the end instead of two.
+bool HexagonExpandCondsets::coalesceSegments(
+      const SmallVectorImpl<MachineInstr*> &Condsets,
+      std::set<unsigned> &UpdRegs) {
+  SmallVector<MachineInstr*,16> TwoRegs;
+  for (MachineInstr *MI : Condsets) {
+    MachineOperand &S1 = MI->getOperand(2), &S2 = MI->getOperand(3);
+    if (!S1.isReg() && !S2.isReg())
+      continue;
+    TwoRegs.push_back(MI);
+  }
+
+  bool Changed = false;
+  for (MachineInstr *CI : TwoRegs) {
+    RegisterRef RD = CI->getOperand(0);
+    RegisterRef RP = CI->getOperand(1);
+    MachineOperand &S1 = CI->getOperand(2), &S2 = CI->getOperand(3);
+    bool Done = false;
+    // Consider this case:
+    //   vreg1 = instr1 ...
+    //   vreg2 = instr2 ...
+    //   vreg0 = C2_mux ..., vreg1, vreg2
+    // If vreg0 was coalesced with vreg1, we could end up with the following
+    // code:
+    //   vreg0 = instr1 ...
+    //   vreg2 = instr2 ...
+    //   vreg0 = A2_tfrf ..., vreg2
+    // which will later become:
+    //   vreg0 = instr1 ...
+    //   vreg0 = instr2_cNotPt ...
+    // i.e. there will be an unconditional definition (instr1) of vreg0
+    // followed by a conditional one. The output dependency was there before
+    // and it unavoidable, but if instr1 is predicable, we will no longer be
+    // able to predicate it here.
+    // To avoid this scenario, don't coalesce the destination register with
+    // a source register that is defined by a predicable instruction.
+    if (S1.isReg()) {
+      RegisterRef RS = S1;
+      MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, true);
+      if (!RDef || !HII->isPredicable(*RDef)) {
+        Done = coalesceRegisters(RD, RegisterRef(S1));
+        if (Done) {
+          UpdRegs.insert(RD.Reg);
+          UpdRegs.insert(S1.getReg());
+        }
+      }
+    }
+    if (!Done && S2.isReg()) {
+      RegisterRef RS = S2;
+      MachineInstr *RDef = getReachingDefForPred(RS, CI, RP.Reg, false);
+      if (!RDef || !HII->isPredicable(*RDef)) {
+        Done = coalesceRegisters(RD, RegisterRef(S2));
+        if (Done) {
+          UpdRegs.insert(RD.Reg);
+          UpdRegs.insert(S2.getReg());
+        }
+      }
+    }
+    Changed |= Done;
+  }
+  return Changed;
+}
+
+bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo());
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  LIS = &getAnalysis<LiveIntervals>();
+  MRI = &MF.getRegInfo();
+
+  DEBUG(LIS->print(dbgs() << "Before expand-condsets\n",
+                   MF.getFunction()->getParent()));
+
+  bool Changed = false;
+  std::set<unsigned> CoalUpd, PredUpd;
+
+  SmallVector<MachineInstr*,16> Condsets;
+  for (auto &B : MF)
+    for (auto &I : B)
+      if (isCondset(I))
+        Condsets.push_back(&I);
+
+  // Try to coalesce the target of a mux with one of its sources.
+  // This could eliminate a register copy in some circumstances.
+  Changed |= coalesceSegments(Condsets, CoalUpd);
+
+  // Update kill flags on all source operands. This is done here because
+  // at this moment (when expand-condsets runs), there are no kill flags
+  // in the IR (they have been removed by live range analysis).
+  // Updating them right before we split is the easiest, because splitting
+  // adds definitions which would interfere with updating kills afterwards.
+  std::set<unsigned> KillUpd;
+  for (MachineInstr *MI : Condsets)
+    for (MachineOperand &Op : MI->operands())
+      if (Op.isReg() && Op.isUse())
+        if (!CoalUpd.count(Op.getReg()))
+          KillUpd.insert(Op.getReg());
+  updateLiveness(KillUpd, false, true, false);
+  DEBUG(LIS->print(dbgs() << "After coalescing\n",
+                   MF.getFunction()->getParent()));
+
+  // First, simply split all muxes into a pair of conditional transfers
+  // and update the live intervals to reflect the new arrangement. The
+  // goal is to update the kill flags, since predication will rely on
+  // them.
+  for (MachineInstr *MI : Condsets)
+    Changed |= split(*MI, PredUpd);
+  Condsets.clear(); // The contents of Condsets are invalid here anyway.
+
+  // Do not update live ranges after splitting. Recalculation of live
+  // intervals removes kill flags, which were preserved by splitting on
+  // the source operands of condsets. These kill flags are needed by
+  // predication, and after splitting they are difficult to recalculate
+  // (because of predicated defs), so make sure they are left untouched.
+  // Predication does not use live intervals.
+  DEBUG(LIS->print(dbgs() << "After splitting\n",
+                   MF.getFunction()->getParent()));
+
+  // Traverse all blocks and collapse predicable instructions feeding
+  // conditional transfers into predicated instructions.
+  // Walk over all the instructions again, so we may catch pre-existing
+  // cases that were not created in the previous step.
+  for (auto &B : MF)
+    Changed |= predicateInBlock(B, PredUpd);
+  DEBUG(LIS->print(dbgs() << "After predicating\n",
+                   MF.getFunction()->getParent()));
+
+  PredUpd.insert(CoalUpd.begin(), CoalUpd.end());
+  updateLiveness(PredUpd, true, true, true);
+
+  DEBUG({
+    if (Changed)
+      LIS->print(dbgs() << "After expand-condsets\n",
+                 MF.getFunction()->getParent());
+  });
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonExpandCondsets() {
+  return new HexagonExpandCondsets();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
new file mode 100644
index 000000000000..dfd1f1d4f886
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -0,0 +1,194 @@
+//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// The loop start address in the LOOPn instruction is encoded as a distance
+// from the LOOPn instruction itself. If the start address is too far from
+// the LOOPn instruction, the instruction needs to use a constant extender.
+// This pass will identify and convert such LOOPn instructions to a proper
+// form.
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/ADT/DenseMap.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> MaxLoopRange(
+    "hexagon-loop-range", cl::Hidden, cl::init(200),
+    cl::desc("Restrict range of loopN instructions (testing only)"));
+
+namespace llvm {
+  FunctionPass *createHexagonFixupHwLoops();
+  void initializeHexagonFixupHwLoopsPass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonFixupHwLoops : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonFixupHwLoops() : MachineFunctionPass(ID) {
+      initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon Hardware Loop Fixup";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// \brief Check the offset between each loop instruction and
+    /// the loop basic block to determine if we can use the LOOP instruction
+    /// or if we need to set the LC/SA registers explicitly.
+    bool fixupLoopInstrs(MachineFunction &MF);
+
+    /// \brief Replace loop instruction with the constant extended
+    /// version if the loop label is too far from the loop instruction.
+    void useExtLoopInstr(MachineFunction &MF,
+                         MachineBasicBlock::iterator &MII);
+  };
+
+  char HexagonFixupHwLoops::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup",
+                "Hexagon Hardware Loops Fixup", false, false)
+
+FunctionPass *llvm::createHexagonFixupHwLoops() {
+  return new HexagonFixupHwLoops();
+}
+
+/// \brief Returns true if the instruction is a hardware loop instruction.
+static bool isHardwareLoop(const MachineInstr &MI) {
+  return MI.getOpcode() == Hexagon::J2_loop0r ||
+         MI.getOpcode() == Hexagon::J2_loop0i ||
+         MI.getOpcode() == Hexagon::J2_loop1r ||
+         MI.getOpcode() == Hexagon::J2_loop1i;
+}
+
+bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  return fixupLoopInstrs(MF);
+}
+
+/// \brief For Hexagon, if the loop label is to far from the
+/// loop instruction then we need to set the LC0 and SA0 registers
+/// explicitly instead of using LOOP(start,count).  This function
+/// checks the distance, and generates register assignments if needed.
+///
+/// This function makes two passes over the basic blocks.  The first
+/// pass computes the offset of the basic block from the start.
+/// The second pass checks all the loop instructions.
+bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
+
+  // Offset of the current instruction from the start.
+  unsigned InstOffset = 0;
+  // Map for each basic block to it's first instruction.
+  DenseMap<const MachineBasicBlock *, unsigned> BlockToInstOffset;
+
+  const HexagonInstrInfo *HII =
+      static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // First pass - compute the offset of each basic block.
+  for (const MachineBasicBlock &MBB : MF) {
+    if (MBB.getAlignment()) {
+      // Although we don't know the exact layout of the final code, we need
+      // to account for alignment padding somehow. This heuristic pads each
+      // aligned basic block according to the alignment value.
+      int ByteAlign = (1u << MBB.getAlignment()) - 1;
+      InstOffset = (InstOffset + ByteAlign) & ~(ByteAlign);
+    }
+
+    BlockToInstOffset[&MBB] = InstOffset;
+    for (const MachineInstr &MI : MBB)
+      InstOffset += HII->getSize(MI);
+  }
+
+  // Second pass - check each loop instruction to see if it needs to be
+  // converted.
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    InstOffset = BlockToInstOffset[&MBB];
+
+    // Loop over all the instructions.
+    MachineBasicBlock::iterator MII = MBB.begin();
+    MachineBasicBlock::iterator MIE = MBB.end();
+    while (MII != MIE) {
+      InstOffset += HII->getSize(*MII);
+      if (MII->isDebugValue()) {
+        ++MII;
+        continue;
+      }
+      if (isHardwareLoop(*MII)) {
+        assert(MII->getOperand(0).isMBB() &&
+               "Expect a basic block as loop operand");
+        int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
+        if ((unsigned)abs(diff) > MaxLoopRange) {
+          useExtLoopInstr(MF, MII);
+          MII = MBB.erase(MII);
+          Changed = true;
+        } else {
+          ++MII;
+        }
+      } else {
+        ++MII;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+/// \brief Replace loop instructions with the constant extended version.
+void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
+                                          MachineBasicBlock::iterator &MII) {
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  MachineBasicBlock *MBB = MII->getParent();
+  DebugLoc DL = MII->getDebugLoc();
+  MachineInstrBuilder MIB;
+  unsigned newOp;
+  switch (MII->getOpcode()) {
+  case Hexagon::J2_loop0r:
+    newOp = Hexagon::J2_loop0rext;
+    break;
+  case Hexagon::J2_loop0i:
+    newOp = Hexagon::J2_loop0iext;
+    break;
+  case Hexagon::J2_loop1r:
+    newOp = Hexagon::J2_loop1rext;
+    break;
+  case Hexagon::J2_loop1i:
+    newOp = Hexagon::J2_loop1iext;
+    break;
+  default:
+    llvm_unreachable("Invalid Hardware Loop Instruction.");
+  }
+  MIB = BuildMI(*MBB, MII, DL, TII->get(newOp));
+
+  for (unsigned i = 0; i < MII->getNumOperands(); ++i)
+    MIB.addOperand(MII->getOperand(i));
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
new file mode 100644
index 000000000000..a3f6273f9f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -0,0 +1,2441 @@
+//===-- HexagonFrameLowering.cpp - Define frame lowering ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-pei"
+
+#include "HexagonBlockRanges.h"
+#include "HexagonFrameLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <new>
+#include <utility>
+#include <vector>
+
+// Hexagon stack frame layout as defined by the ABI:
+//
+//                                                       Incoming arguments
+//                                                       passed via stack
+//                                                                      |
+//                                                                      |
+//        SP during function's                 FP during function's     |
+//    +-- runtime (top of stack)               runtime (bottom) --+     |
+//    |                                                           |     |
+// --++---------------------+------------------+-----------------++-+-------
+//   |  parameter area for  |  variable-size   |   fixed-size    |LR|  arg
+//   |   called functions   |  local objects   |  local objects  |FP|
+// --+----------------------+------------------+-----------------+--+-------
+//    <-    size known    -> <- size unknown -> <- size known  ->
+//
+// Low address                                                 High address
+//
+// <--- stack growth
+//
+//
+// - In any circumstances, the outgoing function arguments are always accessi-
+//   ble using the SP, and the incoming arguments are accessible using the FP.
+// - If the local objects are not aligned, they can always be accessed using
+//   the FP.
+// - If there are no variable-sized objects, the local objects can always be
+//   accessed using the SP, regardless whether they are aligned or not. (The
+//   alignment padding will be at the bottom of the stack (highest address),
+//   and so the offset with respect to the SP will be known at the compile-
+//   -time.)
+//
+// The only complication occurs if there are both, local aligned objects, and
+// dynamically allocated (variable-sized) objects. The alignment pad will be
+// placed between the FP and the local objects, thus preventing the use of the
+// FP to access the local objects. At the same time, the variable-sized objects
+// will be between the SP and the local objects, thus introducing an unknown
+// distance from the SP to the locals.
+//
+// To avoid this problem, a new register is created that holds the aligned
+// address of the bottom of the stack, referred in the sources as AP (aligned
+// pointer). The AP will be equal to "FP-p", where "p" is the smallest pad
+// that aligns AP to the required boundary (a maximum of the alignments of
+// all stack objects, fixed- and variable-sized). All local objects[1] will
+// then use AP as the base pointer.
+// [1] The exception is with "fixed" stack objects. "Fixed" stack objects get
+// their name from being allocated at fixed locations on the stack, relative
+// to the FP. In the presence of dynamic allocation and local alignment, such
+// objects can only be accessed through the FP.
+//
+// Illustration of the AP:
+//                                                                FP --+
+//                                                                     |
+// ---------------+---------------------+-----+-----------------------++-+--
+//   Rest of the  | Local stack objects | Pad |  Fixed stack objects  |LR|
+//   stack frame  | (aligned)           |     |  (CSR, spills, etc.)  |FP|
+// ---------------+---------------------+-----+-----------------+-----+--+--
+//                                      |<-- Multiple of the -->|
+//                                           stack alignment    +-- AP
+//
+// The AP is set up at the beginning of the function. Since it is not a dedi-
+// cated (reserved) register, it needs to be kept live throughout the function
+// to be available as the base register for local object accesses.
+// Normally, an address of a stack objects is obtained by a pseudo-instruction
+// PS_fi. To access local objects with the AP register present, a different
+// pseudo-instruction needs to be used: PS_fia. The PS_fia takes one extra
+// argument compared to PS_fi: the first input register is the AP register.
+// This keeps the register live between its definition and its uses.
+
+// The AP register is originally set up using pseudo-instruction PS_aligna:
+//   AP = PS_aligna A
+// where
+//   A  - required stack alignment
+// The alignment value must be the maximum of all alignments required by
+// any stack object.
+
+// The dynamic allocation uses a pseudo-instruction PS_alloca:
+//   Rd = PS_alloca Rs, A
+// where
+//   Rd - address of the allocated space
+//   Rs - minimum size (the actual allocated can be larger to accommodate
+//        alignment)
+//   A  - required alignment
+
+using namespace llvm;
+
+static cl::opt<bool> DisableDeallocRet("disable-hexagon-dealloc-ret",
+    cl::Hidden, cl::desc("Disable Dealloc Return for Hexagon target"));
+
+static cl::opt<unsigned> NumberScavengerSlots("number-scavenger-slots",
+    cl::Hidden, cl::desc("Set the number of scavenger slots"), cl::init(2),
+    cl::ZeroOrMore);
+
+static cl::opt<int> SpillFuncThreshold("spill-func-threshold",
+    cl::Hidden, cl::desc("Specify O2(not Os) spill func threshold"),
+    cl::init(6), cl::ZeroOrMore);
+
+static cl::opt<int> SpillFuncThresholdOs("spill-func-threshold-Os",
+    cl::Hidden, cl::desc("Specify Os spill func threshold"),
+    cl::init(1), cl::ZeroOrMore);
+
+static cl::opt<bool> EnableStackOVFSanitizer("enable-stackovf-sanitizer",
+    cl::Hidden, cl::desc("Enable runtime checks for stack overflow."),
+    cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> EnableShrinkWrapping("hexagon-shrink-frame",
+    cl::init(true), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Enable stack frame shrink wrapping"));
+
+static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit",
+    cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Max count of stack frame shrink-wraps"));
+
+static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long",
+    cl::Hidden, cl::desc("Enable long calls for save-restore stubs."),
+    cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
+    cl::Hidden, cl::desc("Use allocframe more conservatively"));
+
+static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden,
+    cl::init(true), cl::desc("Optimize spill slots"));
+
+#ifndef NDEBUG
+static cl::opt<unsigned> SpillOptMax("spill-opt-max", cl::Hidden,
+    cl::init(std::numeric_limits<unsigned>::max()));
+static unsigned SpillOptCount = 0;
+#endif
+
+namespace llvm {
+
+  void initializeHexagonCallFrameInformationPass(PassRegistry&);
+  FunctionPass *createHexagonCallFrameInformation();
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonCallFrameInformation : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonCallFrameInformation() : MachineFunctionPass(ID) {
+      PassRegistry &PR = *PassRegistry::getPassRegistry();
+      initializeHexagonCallFrameInformationPass(PR);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+  };
+
+  char HexagonCallFrameInformation::ID = 0;
+
+} // end anonymous namespace
+
+bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) {
+  auto &HFI = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering();
+  bool NeedCFI = MF.getMMI().hasDebugInfo() ||
+                 MF.getFunction()->needsUnwindTableEntry();
+
+  if (!NeedCFI)
+    return false;
+  HFI.insertCFIInstructions(MF);
+  return true;
+}
+
+INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi",
+                "Hexagon call frame information", false, false)
+
+FunctionPass *llvm::createHexagonCallFrameInformation() {
+  return new HexagonCallFrameInformation();
+}
+
+/// Map a register pair Reg to the subregister that has the greater "number",
+/// i.e. D3 (aka R7:6) will be mapped to R7, etc.
+static unsigned getMax32BitSubRegister(unsigned Reg,
+                                       const TargetRegisterInfo &TRI,
+                                       bool hireg = true) {
+    if (Reg < Hexagon::D0 || Reg > Hexagon::D15)
+      return Reg;
+
+    unsigned RegNo = 0;
+    for (MCSubRegIterator SubRegs(Reg, &TRI); SubRegs.isValid(); ++SubRegs) {
+      if (hireg) {
+        if (*SubRegs > RegNo)
+          RegNo = *SubRegs;
+      } else {
+        if (!RegNo || *SubRegs < RegNo)
+          RegNo = *SubRegs;
+      }
+    }
+    return RegNo;
+}
+
+/// Returns the callee saved register with the largest id in the vector.
+static unsigned getMaxCalleeSavedReg(const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo &TRI) {
+    static_assert(Hexagon::R1 > 0,
+                  "Assume physical registers are encoded as positive integers");
+    if (CSI.empty())
+      return 0;
+
+    unsigned Max = getMax32BitSubRegister(CSI[0].getReg(), TRI);
+    for (unsigned I = 1, E = CSI.size(); I < E; ++I) {
+      unsigned Reg = getMax32BitSubRegister(CSI[I].getReg(), TRI);
+      if (Reg > Max)
+        Max = Reg;
+    }
+    return Max;
+}
+
+/// Checks if the basic block contains any instruction that needs a stack
+/// frame to be already in place.
+static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
+                            const HexagonRegisterInfo &HRI) {
+    for (auto &I : MBB) {
+      const MachineInstr *MI = &I;
+      if (MI->isCall())
+        return true;
+      unsigned Opc = MI->getOpcode();
+      switch (Opc) {
+        case Hexagon::PS_alloca:
+        case Hexagon::PS_aligna:
+          return true;
+        default:
+          break;
+      }
+      // Check individual operands.
+      for (const MachineOperand &MO : MI->operands()) {
+        // While the presence of a frame index does not prove that a stack
+        // frame will be required, all frame indexes should be within alloc-
+        // frame/deallocframe. Otherwise, the code that translates a frame
+        // index into an offset would have to be aware of the placement of
+        // the frame creation/destruction instructions.
+        if (MO.isFI())
+          return true;
+        if (!MO.isReg())
+          continue;
+        unsigned R = MO.getReg();
+        // Virtual registers will need scavenging, which then may require
+        // a stack slot.
+        if (TargetRegisterInfo::isVirtualRegister(R))
+          return true;
+        for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
+          if (CSR[*S])
+            return true;
+      }
+    }
+    return false;
+}
+
+  /// Returns true if MBB has a machine instructions that indicates a tail call
+  /// in the block.
+static bool hasTailCall(const MachineBasicBlock &MBB) {
+    MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+    unsigned RetOpc = I->getOpcode();
+    return RetOpc == Hexagon::PS_tailcall_i || RetOpc == Hexagon::PS_tailcall_r;
+}
+
+/// Returns true if MBB contains an instruction that returns.
+static bool hasReturn(const MachineBasicBlock &MBB) {
+    for (auto I = MBB.getFirstTerminator(), E = MBB.end(); I != E; ++I)
+      if (I->isReturn())
+        return true;
+    return false;
+}
+
+/// Returns the "return" instruction from this block, or nullptr if there
+/// isn't any.
+static MachineInstr *getReturn(MachineBasicBlock &MBB) {
+    for (auto &I : MBB)
+      if (I.isReturn())
+        return &I;
+    return nullptr;
+}
+
+static bool isRestoreCall(unsigned Opc) {
+    switch (Opc) {
+      case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+      case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+      case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT:
+      case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4:
+      case Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC:
+        return true;
+    }
+    return false;
+}
+
+static inline bool isOptNone(const MachineFunction &MF) {
+    return MF.getFunction()->hasFnAttribute(Attribute::OptimizeNone) ||
+           MF.getTarget().getOptLevel() == CodeGenOpt::None;
+}
+
+static inline bool isOptSize(const MachineFunction &MF) {
+    const Function &F = *MF.getFunction();
+    return F.optForSize() && !F.optForMinSize();
+}
+
+static inline bool isMinSize(const MachineFunction &MF) {
+    return MF.getFunction()->optForMinSize();
+}
+
+/// Implements shrink-wrapping of the stack frame. By default, stack frame
+/// is created in the function entry block, and is cleaned up in every block
+/// that returns. This function finds alternate blocks: one for the frame
+/// setup (prolog) and one for the cleanup (epilog).
+void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
+      MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const {
+  static unsigned ShrinkCounter = 0;
+
+  if (ShrinkLimit.getPosition()) {
+    if (ShrinkCounter >= ShrinkLimit)
+      return;
+    ShrinkCounter++;
+  }
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HRI = *HST.getRegisterInfo();
+
+  MachineDominatorTree MDT;
+  MDT.runOnMachineFunction(MF);
+  MachinePostDominatorTree MPT;
+  MPT.runOnMachineFunction(MF);
+
+  typedef DenseMap<unsigned,unsigned> UnsignedMap;
+  UnsignedMap RPO;
+  typedef ReversePostOrderTraversal<const MachineFunction*> RPOTType;
+  RPOTType RPOT(&MF);
+  unsigned RPON = 0;
+  for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    RPO[(*I)->getNumber()] = RPON++;
+
+  // Don't process functions that have loops, at least for now. Placement
+  // of prolog and epilog must take loop structure into account. For simpli-
+  // city don't do it right now.
+  for (auto &I : MF) {
+    unsigned BN = RPO[I.getNumber()];
+    for (auto SI = I.succ_begin(), SE = I.succ_end(); SI != SE; ++SI) {
+      // If found a back-edge, return.
+      if (RPO[(*SI)->getNumber()] <= BN)
+        return;
+    }
+  }
+
+  // Collect the set of blocks that need a stack frame to execute. Scan
+  // each block for uses/defs of callee-saved registers, calls, etc.
+  SmallVector<MachineBasicBlock*,16> SFBlocks;
+  BitVector CSR(Hexagon::NUM_TARGET_REGS);
+  for (const MCPhysReg *P = HRI.getCalleeSavedRegs(&MF); *P; ++P)
+    for (MCSubRegIterator S(*P, &HRI, true); S.isValid(); ++S)
+      CSR[*S] = true;
+
+  for (auto &I : MF)
+    if (needsStackFrame(I, CSR, HRI))
+      SFBlocks.push_back(&I);
+
+  DEBUG({
+    dbgs() << "Blocks needing SF: {";
+    for (auto &B : SFBlocks)
+      dbgs() << " BB#" << B->getNumber();
+    dbgs() << " }\n";
+  });
+  // No frame needed?
+  if (SFBlocks.empty())
+    return;
+
+  // Pick a common dominator and a common post-dominator.
+  MachineBasicBlock *DomB = SFBlocks[0];
+  for (unsigned i = 1, n = SFBlocks.size(); i < n; ++i) {
+    DomB = MDT.findNearestCommonDominator(DomB, SFBlocks[i]);
+    if (!DomB)
+      break;
+  }
+  MachineBasicBlock *PDomB = SFBlocks[0];
+  for (unsigned i = 1, n = SFBlocks.size(); i < n; ++i) {
+    PDomB = MPT.findNearestCommonDominator(PDomB, SFBlocks[i]);
+    if (!PDomB)
+      break;
+  }
+  DEBUG({
+    dbgs() << "Computed dom block: BB#";
+    if (DomB) dbgs() << DomB->getNumber();
+    else      dbgs() << "<null>";
+    dbgs() << ", computed pdom block: BB#";
+    if (PDomB) dbgs() << PDomB->getNumber();
+    else       dbgs() << "<null>";
+    dbgs() << "\n";
+  });
+  if (!DomB || !PDomB)
+    return;
+
+  // Make sure that DomB dominates PDomB and PDomB post-dominates DomB.
+  if (!MDT.dominates(DomB, PDomB)) {
+    DEBUG(dbgs() << "Dom block does not dominate pdom block\n");
+    return;
+  }
+  if (!MPT.dominates(PDomB, DomB)) {
+    DEBUG(dbgs() << "PDom block does not post-dominate dom block\n");
+    return;
+  }
+
+  // Finally, everything seems right.
+  PrologB = DomB;
+  EpilogB = PDomB;
+}
+
+/// Perform most of the PEI work here:
+/// - saving/restoring of the callee-saved registers,
+/// - stack frame creation and destruction.
+/// Normally, this work is distributed among various functions, but doing it
+/// in one place allows shrink-wrapping of the stack frame.
+void HexagonFrameLowering::emitPrologue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HRI = *HST.getRegisterInfo();
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  MachineBasicBlock *PrologB = &MF.front(), *EpilogB = nullptr;
+  if (EnableShrinkWrapping)
+    findShrunkPrologEpilog(MF, PrologB, EpilogB);
+
+  bool PrologueStubs = false;
+  insertCSRSpillsInBlock(*PrologB, CSI, HRI, PrologueStubs);
+  insertPrologueInBlock(*PrologB, PrologueStubs);
+  updateEntryPaths(MF, *PrologB);
+
+  if (EpilogB) {
+    insertCSRRestoresInBlock(*EpilogB, CSI, HRI);
+    insertEpilogueInBlock(*EpilogB);
+  } else {
+    for (auto &B : MF)
+      if (B.isReturnBlock())
+        insertCSRRestoresInBlock(B, CSI, HRI);
+
+    for (auto &B : MF)
+      if (B.isReturnBlock())
+        insertEpilogueInBlock(B);
+
+    for (auto &B : MF) {
+      if (B.empty())
+        continue;
+      MachineInstr *RetI = getReturn(B);
+      if (!RetI || isRestoreCall(RetI->getOpcode()))
+        continue;
+      for (auto &R : CSI)
+        RetI->addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+    }
+  }
+
+  if (EpilogB) {
+    // If there is an epilog block, it may not have a return instruction.
+    // In such case, we need to add the callee-saved registers as live-ins
+    // in all blocks on all paths from the epilog to any return block.
+    unsigned MaxBN = MF.getNumBlockIDs();
+    BitVector DoneT(MaxBN+1), DoneF(MaxBN+1), Path(MaxBN+1);
+    updateExitPaths(*EpilogB, *EpilogB, DoneT, DoneF, Path);
+  }
+}
+
+void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
+      bool PrologueStubs) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  DebugLoc dl;
+
+  unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment());
+
+  // Calculate the total stack frame size.
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned FrameSize = MFI.getStackSize();
+  // Round up the max call frame size to the max alignment on the stack.
+  unsigned MaxCFA = alignTo(MFI.getMaxCallFrameSize(), MaxAlign);
+  MFI.setMaxCallFrameSize(MaxCFA);
+
+  FrameSize = MaxCFA + alignTo(FrameSize, MaxAlign);
+  MFI.setStackSize(FrameSize);
+
+  bool AlignStack = (MaxAlign > getStackAlignment());
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned NumBytes = MFI.getStackSize();
+  unsigned SP = HRI.getStackRegister();
+  unsigned MaxCF = MFI.getMaxCallFrameSize();
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+
+  SmallVector<MachineInstr *, 4> AdjustRegs;
+  for (auto &MBB : MF)
+    for (auto &MI : MBB)
+      if (MI.getOpcode() == Hexagon::PS_alloca)
+        AdjustRegs.push_back(&MI);
+
+  for (auto MI : AdjustRegs) {
+    assert((MI->getOpcode() == Hexagon::PS_alloca) && "Expected alloca");
+    expandAlloca(MI, HII, SP, MaxCF);
+    MI->eraseFromParent();
+  }
+
+  if (!hasFP(MF))
+    return;
+
+  // Check for overflow.
+  // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+  const unsigned int ALLOCFRAME_MAX = 16384;
+
+  // Create a dummy memory operand to avoid allocframe from being treated as
+  // a volatile memory reference.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+                            4, 4);
+
+  if (NumBytes >= ALLOCFRAME_MAX) {
+    // Emit allocframe(#0).
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+      .addImm(0)
+      .addMemOperand(MMO);
+
+    // Subtract offset from frame pointer.
+    // We use a caller-saved non-parameter register for that.
+    unsigned CallerSavedReg = HRI.getFirstCallerSavedNonParamReg();
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32),
+            CallerSavedReg).addImm(NumBytes);
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP)
+      .addReg(SP)
+      .addReg(CallerSavedReg);
+  } else {
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+      .addImm(NumBytes)
+      .addMemOperand(MMO);
+  }
+
+  if (AlignStack) {
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
+        .addReg(SP)
+        .addImm(-int64_t(MaxAlign));
+  }
+
+  // If the stack-checking is enabled, and we spilled the callee-saved
+  // registers inline (i.e. did not use a spill function), then call
+  // the stack checker directly.
+  if (EnableStackOVFSanitizer && !PrologueStubs)
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
+           .addExternalSymbol("__runtime_stack_check");
+}
+
+void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
+  MachineFunction &MF = *MBB.getParent();
+  if (!hasFP(MF))
+    return;
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  unsigned SP = HRI.getStackRegister();
+
+  MachineInstr *RetI = getReturn(MBB);
+  unsigned RetOpc = RetI ? RetI->getOpcode() : 0;
+
+  MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
+  DebugLoc DL;
+  if (InsertPt != MBB.end())
+    DL = InsertPt->getDebugLoc();
+  else if (!MBB.empty())
+    DL = std::prev(MBB.end())->getDebugLoc();
+
+  // Handle EH_RETURN.
+  if (RetOpc == Hexagon::EH_RETURN_JMPR) {
+    BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+    BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP)
+        .addReg(SP)
+        .addReg(Hexagon::R28);
+    return;
+  }
+
+  // Check for RESTORE_DEALLOC_RET* tail call. Don't emit an extra dealloc-
+  // frame instruction if we encounter it.
+  if (RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4 ||
+      RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC ||
+      RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT ||
+      RetOpc == Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC) {
+    MachineBasicBlock::iterator It = RetI;
+    ++It;
+    // Delete all instructions after the RESTORE (except labels).
+    while (It != MBB.end()) {
+      if (!It->isLabel())
+        It = MBB.erase(It);
+      else
+        ++It;
+    }
+    return;
+  }
+
+  // It is possible that the restoring code is a call to a library function.
+  // All of the restore* functions include "deallocframe", so we need to make
+  // sure that we don't add an extra one.
+  bool NeedsDeallocframe = true;
+  if (!MBB.empty() && InsertPt != MBB.begin()) {
+    MachineBasicBlock::iterator PrevIt = std::prev(InsertPt);
+    unsigned COpc = PrevIt->getOpcode();
+    if (COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4 ||
+        COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC ||
+        COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT ||
+        COpc == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC ||
+        COpc == Hexagon::PS_call_nr || COpc == Hexagon::PS_callr_nr)
+      NeedsDeallocframe = false;
+  }
+
+  if (!NeedsDeallocframe)
+    return;
+  // If the returning instruction is PS_jmpret, replace it with dealloc_return,
+  // otherwise just add deallocframe. The function could be returning via a
+  // tail call.
+  if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
+    BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+    return;
+  }
+  unsigned NewOpc = Hexagon::L4_return;
+  MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc));
+  // Transfer the function live-out registers.
+  NewI->copyImplicitOps(MF, *RetI);
+  MBB.erase(RetI);
+}
+
+void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF,
+      MachineBasicBlock &SaveB) const {
+  SetVector<unsigned> Worklist;
+
+  MachineBasicBlock &EntryB = MF.front();
+  Worklist.insert(EntryB.getNumber());
+
+  unsigned SaveN = SaveB.getNumber();
+  auto &CSI = MF.getFrameInfo().getCalleeSavedInfo();
+
+  for (unsigned i = 0; i < Worklist.size(); ++i) {
+    unsigned BN = Worklist[i];
+    MachineBasicBlock &MBB = *MF.getBlockNumbered(BN);
+    for (auto &R : CSI)
+      if (!MBB.isLiveIn(R.getReg()))
+        MBB.addLiveIn(R.getReg());
+    if (BN != SaveN)
+      for (auto &SB : MBB.successors())
+        Worklist.insert(SB->getNumber());
+  }
+}
+
+bool HexagonFrameLowering::updateExitPaths(MachineBasicBlock &MBB,
+      MachineBasicBlock &RestoreB, BitVector &DoneT, BitVector &DoneF,
+      BitVector &Path) const {
+  assert(MBB.getNumber() >= 0);
+  unsigned BN = MBB.getNumber();
+  if (Path[BN] || DoneF[BN])
+    return false;
+  if (DoneT[BN])
+    return true;
+
+  auto &CSI = MBB.getParent()->getFrameInfo().getCalleeSavedInfo();
+
+  Path[BN] = true;
+  bool ReachedExit = false;
+  for (auto &SB : MBB.successors())
+    ReachedExit |= updateExitPaths(*SB, RestoreB, DoneT, DoneF, Path);
+
+  if (!MBB.empty() && MBB.back().isReturn()) {
+    // Add implicit uses of all callee-saved registers to the reached
+    // return instructions. This is to prevent the anti-dependency breaker
+    // from renaming these registers.
+    MachineInstr &RetI = MBB.back();
+    if (!isRestoreCall(RetI.getOpcode()))
+      for (auto &R : CSI)
+        RetI.addOperand(MachineOperand::CreateReg(R.getReg(), false, true));
+    ReachedExit = true;
+  }
+
+  // We don't want to add unnecessary live-ins to the restore block: since
+  // the callee-saved registers are being defined in it, the entry of the
+  // restore block cannot be on the path from the definitions to any exit.
+  if (ReachedExit && &MBB != &RestoreB) {
+    for (auto &R : CSI)
+      if (!MBB.isLiveIn(R.getReg()))
+        MBB.addLiveIn(R.getReg());
+    DoneT[BN] = true;
+  }
+  if (!ReachedExit)
+    DoneF[BN] = true;
+
+  Path[BN] = false;
+  return ReachedExit;
+}
+
+static Optional<MachineBasicBlock::iterator>
+findCFILocation(MachineBasicBlock &B) {
+    // The CFI instructions need to be inserted right after allocframe.
+    // An exception to this is a situation where allocframe is bundled
+    // with a call: then the CFI instructions need to be inserted before
+    // the packet with the allocframe+call (in case the call throws an
+    // exception).
+    auto End = B.instr_end();
+
+    for (MachineInstr &I : B) {
+      MachineBasicBlock::iterator It = I.getIterator();
+      if (!I.isBundle()) {
+        if (I.getOpcode() == Hexagon::S2_allocframe)
+          return std::next(It);
+        continue;
+      }
+      // I is a bundle.
+      bool HasCall = false, HasAllocFrame = false;
+      auto T = It.getInstrIterator();
+      while (++T != End && T->isBundled()) {
+        if (T->getOpcode() == Hexagon::S2_allocframe)
+          HasAllocFrame = true;
+        else if (T->isCall())
+          HasCall = true;
+      }
+      if (HasAllocFrame)
+        return HasCall ? It : std::next(It);
+    }
+    return None;
+}
+
+void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const {
+  for (auto &B : MF) {
+    auto At = findCFILocation(B);
+    if (At.hasValue())
+      insertCFIInstructionsAt(B, At.getValue());
+  }
+}
+
+void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator At) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+
+  // If CFI instructions have debug information attached, something goes
+  // wrong with the final assembly generation: the prolog_end is placed
+  // in a wrong location.
+  DebugLoc DL;
+  const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION);
+
+  MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+  bool HasFP = hasFP(MF);
+
+  if (HasFP) {
+    unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true);
+    unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true);
+
+    // Define CFA via an offset from the value of FP.
+    //
+    //  -8   -4    0 (SP)
+    // --+----+----+---------------------
+    //   | FP | LR |          increasing addresses -->
+    // --+----+----+---------------------
+    //   |         +-- Old SP (before allocframe)
+    //   +-- New FP (after allocframe)
+    //
+    // MCCFIInstruction::createDefCfa subtracts the offset from the register.
+    // MCCFIInstruction::createOffset takes the offset without sign change.
+    auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MF.addFrameInst(DefCfa));
+    // R31 (return addr) = CFA - 4
+    auto OffR31 = MCCFIInstruction::createOffset(FrameLabel, DwRAReg, -4);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MF.addFrameInst(OffR31));
+    // R30 (frame ptr) = CFA - 8
+    auto OffR30 = MCCFIInstruction::createOffset(FrameLabel, DwFPReg, -8);
+    BuildMI(MBB, At, DL, CFID)
+        .addCFIIndex(MF.addFrameInst(OffR30));
+  }
+
+  static unsigned int RegsToMove[] = {
+    Hexagon::R1,  Hexagon::R0,  Hexagon::R3,  Hexagon::R2,
+    Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18,
+    Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22,
+    Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26,
+    Hexagon::D0,  Hexagon::D1,  Hexagon::D8,  Hexagon::D9,
+    Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13,
+    Hexagon::NoRegister
+  };
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) {
+    unsigned Reg = RegsToMove[i];
+    auto IfR = [Reg] (const CalleeSavedInfo &C) -> bool {
+      return C.getReg() == Reg;
+    };
+    auto F = find_if(CSI, IfR);
+    if (F == CSI.end())
+      continue;
+
+    int64_t Offset;
+    if (HasFP) {
+      // If the function has a frame pointer (i.e. has an allocframe),
+      // then the CFA has been defined in terms of FP. Any offsets in
+      // the following CFI instructions have to be defined relative
+      // to FP, which points to the bottom of the stack frame.
+      // The function getFrameIndexReference can still choose to use SP
+      // for the offset calculation, so we cannot simply call it here.
+      // Instead, get the offset (relative to the FP) directly.
+      Offset = MFI.getObjectOffset(F->getFrameIdx());
+    } else {
+      unsigned FrameReg;
+      Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg);
+    }
+    // Subtract 8 to make room for R30 and R31, which are added above.
+    Offset -= 8;
+
+    if (Reg < Hexagon::D0 || Reg > Hexagon::D15) {
+      unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true);
+      auto OffReg = MCCFIInstruction::createOffset(FrameLabel, DwarfReg,
+                                                   Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MF.addFrameInst(OffReg));
+    } else {
+      // Split the double regs into subregs, and generate appropriate
+      // cfi_offsets.
+      // The only reason, we are split double regs is, llvm-mc does not
+      // understand paired registers for cfi_offset.
+      // Eg .cfi_offset r1:0, -64
+
+      unsigned HiReg = HRI.getSubReg(Reg, Hexagon::isub_hi);
+      unsigned LoReg = HRI.getSubReg(Reg, Hexagon::isub_lo);
+      unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true);
+      unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true);
+      auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg,
+                                                  Offset+4);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MF.addFrameInst(OffHi));
+      auto OffLo = MCCFIInstruction::createOffset(FrameLabel, LoDwarfReg,
+                                                  Offset);
+      BuildMI(MBB, At, DL, CFID)
+          .addCFIIndex(MF.addFrameInst(OffLo));
+    }
+  }
+}
+
+bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  bool HasFixed = MFI.getNumFixedObjects();
+  bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
+                        .getLocalFrameObjectCount();
+  bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  bool HasAlloca = MFI.hasVarSizedObjects();
+
+  // Insert ALLOCFRAME if we need to or at -O0 for the debugger.  Think
+  // that this shouldn't be required, but doing so now because gcc does and
+  // gdb can't break at the start of the function without it.  Will remove if
+  // this turns out to be a gdb bug.
+  //
+  if (MF.getTarget().getOptLevel() == CodeGenOpt::None)
+    return true;
+
+  // By default we want to use SP (since it's always there). FP requires
+  // some setup (i.e. ALLOCFRAME).
+  // Fixed and preallocated objects need FP if the distance from them to
+  // the SP is unknown (as is with alloca or aligna).
+  if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign))
+    return true;
+
+  if (MFI.getStackSize() > 0) {
+    if (EnableStackOVFSanitizer || UseAllocframe)
+      return true;
+  }
+
+  if (MFI.hasCalls() ||
+      MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR())
+    return true;
+
+  return false;
+}
+
+enum SpillKind {
+  SK_ToMem,
+  SK_FromMem,
+  SK_FromMemTailcall
+};
+
+static const char *getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType,
+      bool Stkchk = false) {
+  const char * V4SpillToMemoryFunctions[] = {
+    "__save_r16_through_r17",
+    "__save_r16_through_r19",
+    "__save_r16_through_r21",
+    "__save_r16_through_r23",
+    "__save_r16_through_r25",
+    "__save_r16_through_r27" };
+
+  const char * V4SpillToMemoryStkchkFunctions[] = {
+    "__save_r16_through_r17_stkchk",
+    "__save_r16_through_r19_stkchk",
+    "__save_r16_through_r21_stkchk",
+    "__save_r16_through_r23_stkchk",
+    "__save_r16_through_r25_stkchk",
+    "__save_r16_through_r27_stkchk" };
+
+  const char * V4SpillFromMemoryFunctions[] = {
+    "__restore_r16_through_r17_and_deallocframe",
+    "__restore_r16_through_r19_and_deallocframe",
+    "__restore_r16_through_r21_and_deallocframe",
+    "__restore_r16_through_r23_and_deallocframe",
+    "__restore_r16_through_r25_and_deallocframe",
+    "__restore_r16_through_r27_and_deallocframe" };
+
+  const char * V4SpillFromMemoryTailcallFunctions[] = {
+    "__restore_r16_through_r17_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r19_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r21_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r23_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r25_and_deallocframe_before_tailcall",
+    "__restore_r16_through_r27_and_deallocframe_before_tailcall"
+  };
+
+  const char **SpillFunc = nullptr;
+
+  switch(SpillType) {
+  case SK_ToMem:
+    SpillFunc = Stkchk ? V4SpillToMemoryStkchkFunctions
+                       : V4SpillToMemoryFunctions;
+    break;
+  case SK_FromMem:
+    SpillFunc = V4SpillFromMemoryFunctions;
+    break;
+  case SK_FromMemTailcall:
+    SpillFunc = V4SpillFromMemoryTailcallFunctions;
+    break;
+  }
+  assert(SpillFunc && "Unknown spill kind");
+
+  // Spill all callee-saved registers up to the highest register used.
+  switch (MaxReg) {
+  case Hexagon::R17:
+    return SpillFunc[0];
+  case Hexagon::R19:
+    return SpillFunc[1];
+  case Hexagon::R21:
+    return SpillFunc[2];
+  case Hexagon::R23:
+    return SpillFunc[3];
+  case Hexagon::R25:
+    return SpillFunc[4];
+  case Hexagon::R27:
+    return SpillFunc[5];
+  default:
+    llvm_unreachable("Unhandled maximum callee save register");
+  }
+  return nullptr;
+}
+
+int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+      int FI, unsigned &FrameReg) const {
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  int Offset = MFI.getObjectOffset(FI);
+  bool HasAlloca = MFI.hasVarSizedObjects();
+  bool HasExtraAlign = HRI.needsStackRealignment(MF);
+  bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
+
+  unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
+  auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+  unsigned AP = HMFI.getStackAlignBasePhysReg();
+  unsigned FrameSize = MFI.getStackSize();
+
+  bool UseFP = false, UseAP = false;  // Default: use SP (except at -O0).
+  // Use FP at -O0, except when there are objects with extra alignment.
+  // That additional alignment requirement may cause a pad to be inserted,
+  // which will make it impossible to use FP to access objects located
+  // past the pad.
+  if (NoOpt && !HasExtraAlign)
+    UseFP = true;
+  if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) {
+    // Fixed and preallocated objects will be located before any padding
+    // so FP must be used to access them.
+    UseFP |= (HasAlloca || HasExtraAlign);
+  } else {
+    if (HasAlloca) {
+      if (HasExtraAlign)
+        UseAP = true;
+      else
+        UseFP = true;
+    }
+  }
+
+  // If FP was picked, then there had better be FP.
+  bool HasFP = hasFP(MF);
+  assert((HasFP || !UseFP) && "This function must have frame pointer");
+
+  // Having FP implies allocframe. Allocframe will store extra 8 bytes:
+  // FP/LR. If the base register is used to access an object across these
+  // 8 bytes, then the offset will need to be adjusted by 8.
+  //
+  // After allocframe:
+  //                    HexagonISelLowering adds 8 to ---+
+  //                    the offsets of all stack-based   |
+  //                    arguments (*)                    |
+  //                                                     |
+  //   getObjectOffset < 0   0     8  getObjectOffset >= 8
+  // ------------------------+-----+------------------------> increasing
+  //     <local objects>     |FP/LR|    <input arguments>     addresses
+  // -----------------+------+-----+------------------------>
+  //                  |      |
+  //    SP/AP point --+      +-- FP points here (**)
+  //    somewhere on
+  //    this side of FP/LR
+  //
+  // (*) See LowerFormalArguments. The FP/LR is assumed to be present.
+  // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR.
+
+  // The lowering assumes that FP/LR is present, and so the offsets of
+  // the formal arguments start at 8. If FP/LR is not there we need to
+  // reduce the offset by 8.
+  if (Offset > 0 && !HasFP)
+    Offset -= 8;
+
+  if (UseFP)
+    FrameReg = FP;
+  else if (UseAP)
+    FrameReg = AP;
+  else
+    FrameReg = SP;
+
+  // Calculate the actual offset in the instruction. If there is no FP
+  // (in other words, no allocframe), then SP will not be adjusted (i.e.
+  // there will be no SP -= FrameSize), so the frame size should not be
+  // added to the calculated offset.
+  int RealOffset = Offset;
+  if (!UseFP && !UseAP && HasFP)
+    RealOffset = FrameSize+Offset;
+  return RealOffset;
+}
+
+bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
+      const CSIVect &CSI, const HexagonRegisterInfo &HRI,
+      bool &PrologueStubs) const {
+  if (CSI.empty())
+    return true;
+
+  MachineBasicBlock::iterator MI = MBB.begin();
+  PrologueStubs = false;
+  MachineFunction &MF = *MBB.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+
+  if (useSpillFunction(MF, CSI)) {
+    PrologueStubs = true;
+    unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI);
+    bool StkOvrFlowEnabled = EnableStackOVFSanitizer;
+    const char *SpillFun = getSpillFunctionFor(MaxReg, SK_ToMem,
+                                               StkOvrFlowEnabled);
+    auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+    bool IsPIC = HTM.isPositionIndependent();
+    bool LongCalls = HST.useLongCalls() || EnableSaveRestoreLong;
+
+    // Call spill function.
+    DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+    unsigned SpillOpc;
+    if (StkOvrFlowEnabled) {
+      if (LongCalls)
+        SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4STK_EXT_PIC
+                         : Hexagon::SAVE_REGISTERS_CALL_V4STK_EXT;
+      else
+        SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4STK_PIC
+                         : Hexagon::SAVE_REGISTERS_CALL_V4STK;
+    } else {
+      if (LongCalls)
+        SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4_EXT_PIC
+                         : Hexagon::SAVE_REGISTERS_CALL_V4_EXT;
+      else
+        SpillOpc = IsPIC ? Hexagon::SAVE_REGISTERS_CALL_V4_PIC
+                         : Hexagon::SAVE_REGISTERS_CALL_V4;
+    }
+
+    MachineInstr *SaveRegsCall =
+        BuildMI(MBB, MI, DL, HII.get(SpillOpc))
+          .addExternalSymbol(SpillFun);
+
+    // Add callee-saved registers as use.
+    addCalleeSaveRegistersAsImpOperand(SaveRegsCall, CSI, false, true);
+    // Add live in registers.
+    for (unsigned I = 0; I < CSI.size(); ++I)
+      MBB.addLiveIn(CSI[I].getReg());
+    return true;
+  }
+
+  for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    // Add live in registers. We treat eh_return callee saved register r0 - r3
+    // specially. They are not really callee saved registers as they are not
+    // supposed to be killed.
+    bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg);
+    int FI = CSI[i].getFrameIdx();
+    const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
+    HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI);
+    if (IsKill)
+      MBB.addLiveIn(Reg);
+  }
+  return true;
+}
+
+bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
+      const CSIVect &CSI, const HexagonRegisterInfo &HRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineBasicBlock::iterator MI = MBB.getFirstTerminator();
+  MachineFunction &MF = *MBB.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+
+  if (useRestoreFunction(MF, CSI)) {
+    bool HasTC = hasTailCall(MBB) || !hasReturn(MBB);
+    unsigned MaxR = getMaxCalleeSavedReg(CSI, HRI);
+    SpillKind Kind = HasTC ? SK_FromMemTailcall : SK_FromMem;
+    const char *RestoreFn = getSpillFunctionFor(MaxR, Kind);
+    auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget());
+    bool IsPIC = HTM.isPositionIndependent();
+    bool LongCalls = HST.useLongCalls() || EnableSaveRestoreLong;
+
+    // Call spill function.
+    DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc()
+                                  : MBB.getLastNonDebugInstr()->getDebugLoc();
+    MachineInstr *DeallocCall = nullptr;
+
+    if (HasTC) {
+      unsigned RetOpc;
+      if (LongCalls)
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC
+                       : Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT;
+      else
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC
+                       : Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4;
+      DeallocCall = BuildMI(MBB, MI, DL, HII.get(RetOpc))
+          .addExternalSymbol(RestoreFn);
+    } else {
+      // The block has a return.
+      MachineBasicBlock::iterator It = MBB.getFirstTerminator();
+      assert(It->isReturn() && std::next(It) == MBB.end());
+      unsigned RetOpc;
+      if (LongCalls)
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC
+                       : Hexagon::RESTORE_DEALLOC_RET_JMP_V4_EXT;
+      else
+        RetOpc = IsPIC ? Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC
+                       : Hexagon::RESTORE_DEALLOC_RET_JMP_V4;
+      DeallocCall = BuildMI(MBB, It, DL, HII.get(RetOpc))
+          .addExternalSymbol(RestoreFn);
+      // Transfer the function live-out registers.
+      DeallocCall->copyImplicitOps(MF, *It);
+    }
+    addCalleeSaveRegistersAsImpOperand(DeallocCall, CSI, true, false);
+    return true;
+  }
+
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+    const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
+    int FI = CSI[i].getFrameIdx();
+    HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
+  }
+
+  return true;
+}
+
+MachineBasicBlock::iterator HexagonFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+  unsigned Opc = MI.getOpcode();
+  (void)Opc; // Silence compiler warning.
+  assert((Opc == Hexagon::ADJCALLSTACKDOWN || Opc == Hexagon::ADJCALLSTACKUP) &&
+         "Cannot handle this call frame pseudo instruction");
+  return MBB.erase(I);
+}
+
+void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  // If this function has uses aligned stack and also has variable sized stack
+  // objects, then we need to map all spill slots to fixed positions, so that
+  // they can be accessed through FP. Otherwise they would have to be accessed
+  // via AP, which may not be available at the particular place in the program.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool HasAlloca = MFI.hasVarSizedObjects();
+  bool NeedsAlign = (MFI.getMaxAlignment() > getStackAlignment());
+
+  if (!HasAlloca || !NeedsAlign)
+    return;
+
+  unsigned LFS = MFI.getLocalFrameSize();
+  for (int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (!MFI.isSpillSlotObjectIndex(i) || MFI.isDeadObjectIndex(i))
+      continue;
+    unsigned S = MFI.getObjectSize(i);
+    // Reduce the alignment to at most 8. This will require unaligned vector
+    // stores if they happen here.
+    unsigned A = std::max(MFI.getObjectAlignment(i), 8U);
+    MFI.setObjectAlignment(i, 8);
+    LFS = alignTo(LFS+S, A);
+    MFI.mapLocalFrameObject(i, -LFS);
+  }
+
+  MFI.setLocalFrameSize(LFS);
+  unsigned A = MFI.getLocalFrameMaxAlign();
+  assert(A <= 8 && "Unexpected local frame alignment");
+  if (A == 0)
+    MFI.setLocalFrameMaxAlign(8);
+  MFI.setUseLocalStackAllocationBlock(true);
+
+  // Set the physical aligned-stack base address register.
+  unsigned AP = 0;
+  if (const MachineInstr *AI = getAlignaInstr(MF))
+    AP = AI->getOperand(0).getReg();
+  auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+  HMFI.setStackAlignBasePhysReg(AP);
+}
+
+/// Returns true if there are no caller-saved registers available in class RC.
+static bool needToReserveScavengingSpillSlots(MachineFunction &MF,
+      const HexagonRegisterInfo &HRI, const TargetRegisterClass *RC) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  auto IsUsed = [&HRI,&MRI] (unsigned Reg) -> bool {
+    for (MCRegAliasIterator AI(Reg, &HRI, true); AI.isValid(); ++AI)
+      if (MRI.isPhysRegUsed(*AI))
+        return true;
+    return false;
+  };
+
+  // Check for an unused caller-saved register. Callee-saved registers
+  // have become pristine by now.
+  for (const MCPhysReg *P = HRI.getCallerSavedRegs(&MF, RC); *P; ++P)
+    if (!IsUsed(*P))
+      return false;
+
+  // All caller-saved registers are used.
+  return true;
+}
+
+#ifndef NDEBUG
+static void dump_registers(BitVector &Regs, const TargetRegisterInfo &TRI) {
+  dbgs() << '{';
+  for (int x = Regs.find_first(); x >= 0; x = Regs.find_next(x)) {
+    unsigned R = x;
+    dbgs() << ' ' << PrintReg(R, &TRI);
+  }
+  dbgs() << " }";
+}
+#endif
+
+bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
+      const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const {
+  DEBUG(dbgs() << __func__ << " on "
+               << MF.getFunction()->getName() << '\n');
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  BitVector SRegs(Hexagon::NUM_TARGET_REGS);
+
+  // Generate a set of unique, callee-saved registers (SRegs), where each
+  // register in the set is maximal in terms of sub-/super-register relation,
+  // i.e. for each R in SRegs, no proper super-register of R is also in SRegs.
+
+  // (1) For each callee-saved register, add that register and all of its
+  // sub-registers to SRegs.
+  DEBUG(dbgs() << "Initial CS registers: {");
+  for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
+    unsigned R = CSI[i].getReg();
+    DEBUG(dbgs() << ' ' << PrintReg(R, TRI));
+    for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
+      SRegs[*SR] = true;
+  }
+  DEBUG(dbgs() << " }\n");
+  DEBUG(dbgs() << "SRegs.1: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (2) For each reserved register, remove that register and all of its
+  // sub- and super-registers from SRegs.
+  BitVector Reserved = TRI->getReservedRegs(MF);
+  for (int x = Reserved.find_first(); x >= 0; x = Reserved.find_next(x)) {
+    unsigned R = x;
+    for (MCSuperRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
+      SRegs[*SR] = false;
+  }
+  DEBUG(dbgs() << "Res:     "; dump_registers(Reserved, *TRI); dbgs() << "\n");
+  DEBUG(dbgs() << "SRegs.2: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (3) Collect all registers that have at least one sub-register in SRegs,
+  // and also have no sub-registers that are reserved. These will be the can-
+  // didates for saving as a whole instead of their individual sub-registers.
+  // (Saving R17:16 instead of R16 is fine, but only if R17 was not reserved.)
+  BitVector TmpSup(Hexagon::NUM_TARGET_REGS);
+  for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) {
+    unsigned R = x;
+    for (MCSuperRegIterator SR(R, TRI); SR.isValid(); ++SR)
+      TmpSup[*SR] = true;
+  }
+  for (int x = TmpSup.find_first(); x >= 0; x = TmpSup.find_next(x)) {
+    unsigned R = x;
+    for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR) {
+      if (!Reserved[*SR])
+        continue;
+      TmpSup[R] = false;
+      break;
+    }
+  }
+  DEBUG(dbgs() << "TmpSup:  "; dump_registers(TmpSup, *TRI); dbgs() << "\n");
+
+  // (4) Include all super-registers found in (3) into SRegs.
+  SRegs |= TmpSup;
+  DEBUG(dbgs() << "SRegs.4: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // (5) For each register R in SRegs, if any super-register of R is in SRegs,
+  // remove R from SRegs.
+  for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) {
+    unsigned R = x;
+    for (MCSuperRegIterator SR(R, TRI); SR.isValid(); ++SR) {
+      if (!SRegs[*SR])
+        continue;
+      SRegs[R] = false;
+      break;
+    }
+  }
+  DEBUG(dbgs() << "SRegs.5: "; dump_registers(SRegs, *TRI); dbgs() << "\n");
+
+  // Now, for each register that has a fixed stack slot, create the stack
+  // object for it.
+  CSI.clear();
+
+  typedef TargetFrameLowering::SpillSlot SpillSlot;
+  unsigned NumFixed;
+  int MinOffset = 0;  // CS offsets are negative.
+  const SpillSlot *FixedSlots = getCalleeSavedSpillSlots(NumFixed);
+  for (const SpillSlot *S = FixedSlots; S != FixedSlots+NumFixed; ++S) {
+    if (!SRegs[S->Reg])
+      continue;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg);
+    int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), S->Offset);
+    MinOffset = std::min(MinOffset, S->Offset);
+    CSI.push_back(CalleeSavedInfo(S->Reg, FI));
+    SRegs[S->Reg] = false;
+  }
+
+  // There can be some registers that don't have fixed slots. For example,
+  // we need to store R0-R3 in functions with exception handling. For each
+  // such register, create a non-fixed stack object.
+  for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) {
+    unsigned R = x;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
+    int Off = MinOffset - RC->getSize();
+    unsigned Align = std::min(RC->getAlignment(), getStackAlignment());
+    assert(isPowerOf2_32(Align));
+    Off &= -Align;
+    int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off);
+    MinOffset = std::min(MinOffset, Off);
+    CSI.push_back(CalleeSavedInfo(R, FI));
+    SRegs[R] = false;
+  }
+
+  DEBUG({
+    dbgs() << "CS information: {";
+    for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
+      int FI = CSI[i].getFrameIdx();
+      int Off = MFI.getObjectOffset(FI);
+      dbgs() << ' ' << PrintReg(CSI[i].getReg(), TRI) << ":fi#" << FI << ":sp";
+      if (Off >= 0)
+        dbgs() << '+';
+      dbgs() << Off;
+    }
+    dbgs() << " }\n";
+  });
+
+#ifndef NDEBUG
+  // Verify that all registers were handled.
+  bool MissedReg = false;
+  for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) {
+    unsigned R = x;
+    dbgs() << PrintReg(R, TRI) << ' ';
+    MissedReg = true;
+  }
+  if (MissedReg)
+    llvm_unreachable("...there are unhandled callee-saved registers!");
+#endif
+
+  return true;
+}
+
+bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineInstr *MI = &*It;
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned DstR = MI->getOperand(0).getReg();
+  unsigned SrcR = MI->getOperand(1).getReg();
+  if (!Hexagon::ModRegsRegClass.contains(DstR) ||
+      !Hexagon::ModRegsRegClass.contains(SrcR))
+    return false;
+
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR)
+    .addOperand(MI->getOperand(1));
+  BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
+    .addReg(TmpR, RegState::Kill);
+
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreInt(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(0).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Opc = MI->getOpcode();
+  unsigned SrcR = MI->getOperand(2).getReg();
+  bool IsKill = MI->getOperand(2).isKill();
+  int FI = MI->getOperand(0).getIndex();
+
+  // TmpR = C2_tfrpr SrcR   if SrcR is a predicate register
+  // TmpR = A2_tfrcrr SrcR  if SrcR is a modifier register
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TfrOpc = (Opc == Hexagon::STriw_pred) ? Hexagon::C2_tfrpr
+                                                 : Hexagon::A2_tfrcrr;
+  BuildMI(B, It, DL, HII.get(TfrOpc), TmpR)
+    .addReg(SrcR, getKillRegState(IsKill));
+
+  // S2_storeri_io FI, 0, TmpR
+  BuildMI(B, It, DL, HII.get(Hexagon::S2_storeri_io))
+    .addFrameIndex(FI)
+    .addImm(0)
+    .addReg(TmpR, RegState::Kill)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadInt(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(1).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Opc = MI->getOpcode();
+  unsigned DstR = MI->getOperand(0).getReg();
+  int FI = MI->getOperand(1).getIndex();
+
+  // TmpR = L2_loadri_io FI, 0
+  unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  BuildMI(B, It, DL, HII.get(Hexagon::L2_loadri_io), TmpR)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // DstR = C2_tfrrp TmpR   if DstR is a predicate register
+  // DstR = A2_tfrrcr TmpR  if DstR is a modifier register
+  unsigned TfrOpc = (Opc == Hexagon::LDriw_pred) ? Hexagon::C2_tfrrp
+                                                 : Hexagon::A2_tfrrcr;
+  BuildMI(B, It, DL, HII.get(TfrOpc), DstR)
+    .addReg(TmpR, RegState::Kill);
+
+  NewRegs.push_back(TmpR);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(0).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned SrcR = MI->getOperand(2).getReg();
+  bool IsKill = MI->getOperand(2).isKill();
+  int FI = MI->getOperand(0).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+
+  // Insert transfer to general vector register.
+  //   TmpR0 = A2_tfrsi 0x01010101
+  //   TmpR1 = V6_vandqrt Qx, TmpR0
+  //   store FI, 0, TmpR1
+  unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+  BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+    .addImm(0x01010101);
+
+  unsigned VandOpc = !Is128B ? Hexagon::V6_vandqrt : Hexagon::V6_vandqrt_128B;
+  BuildMI(B, It, DL, HII.get(VandOpc), TmpR1)
+    .addReg(SrcR, getKillRegState(IsKill))
+    .addReg(TmpR0, RegState::Kill);
+
+  auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI);
+  expandStoreVec(B, std::prev(It), MRI, HII, NewRegs);
+
+  NewRegs.push_back(TmpR0);
+  NewRegs.push_back(TmpR1);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  auto &HST = B.getParent()->getSubtarget<HexagonSubtarget>();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(1).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned DstR = MI->getOperand(0).getReg();
+  int FI = MI->getOperand(1).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+
+  // TmpR0 = A2_tfrsi 0x01010101
+  // TmpR1 = load FI, 0
+  // DstR = V6_vandvrt TmpR1, TmpR0
+  unsigned TmpR0 = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+  unsigned TmpR1 = MRI.createVirtualRegister(RC);
+
+  BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0)
+    .addImm(0x01010101);
+  auto *HRI = B.getParent()->getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI);
+  expandLoadVec(B, std::prev(It), MRI, HII, NewRegs);
+
+  unsigned VandOpc = !Is128B ? Hexagon::V6_vandvrt : Hexagon::V6_vandvrt_128B;
+  BuildMI(B, It, DL, HII.get(VandOpc), DstR)
+    .addReg(TmpR1, RegState::Kill)
+    .addReg(TmpR0, RegState::Kill);
+
+  NewRegs.push_back(TmpR0);
+  NewRegs.push_back(TmpR1);
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(0).isFI())
+    return false;
+
+  // It is possible that the double vector being stored is only partially
+  // defined. From the point of view of the liveness tracking, it is ok to
+  // store it as a whole, but if we break it up we may end up storing a
+  // register that is entirely undefined.
+  LivePhysRegs LPR(&HRI);
+  LPR.addLiveIns(B);
+  SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
+  for (auto R = B.begin(); R != It; ++R)
+    LPR.stepForward(*R, Clobbers);
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned SrcR = MI->getOperand(2).getReg();
+  unsigned SrcLo = HRI.getSubReg(SrcR, Hexagon::vsub_lo);
+  unsigned SrcHi = HRI.getSubReg(SrcR, Hexagon::vsub_hi);
+  bool IsKill = MI->getOperand(2).isKill();
+  int FI = MI->getOperand(0).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+  unsigned Size = RC->getSize();
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+  unsigned StoreOpc;
+
+  // Store low part.
+  if (LPR.contains(SrcLo)) {
+    if (NeedAlign <= HasAlign)
+      StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai  : Hexagon::V6_vS32b_ai_128B;
+    else
+      StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+    BuildMI(B, It, DL, HII.get(StoreOpc))
+      .addFrameIndex(FI)
+      .addImm(0)
+      .addReg(SrcLo, getKillRegState(IsKill))
+      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  }
+
+  // Store high part.
+  if (LPR.contains(SrcHi)) {
+    if (NeedAlign <= MinAlign(HasAlign, Size))
+      StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai  : Hexagon::V6_vS32b_ai_128B;
+    else
+      StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+    BuildMI(B, It, DL, HII.get(StoreOpc))
+      .addFrameIndex(FI)
+      .addImm(Size)
+      .addReg(SrcHi, getKillRegState(IsKill))
+      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  }
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(1).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned DstR = MI->getOperand(0).getReg();
+  unsigned DstHi = HRI.getSubReg(DstR, Hexagon::vsub_hi);
+  unsigned DstLo = HRI.getSubReg(DstR, Hexagon::vsub_lo);
+  int FI = MI->getOperand(1).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+  unsigned Size = RC->getSize();
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+  unsigned LoadOpc;
+
+  // Load low part.
+  if (NeedAlign <= HasAlign)
+    LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai  : Hexagon::V6_vL32b_ai_128B;
+  else
+    LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+  BuildMI(B, It, DL, HII.get(LoadOpc), DstLo)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Load high part.
+  if (NeedAlign <= MinAlign(HasAlign, Size))
+    LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai  : Hexagon::V6_vL32b_ai_128B;
+  else
+    LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+  BuildMI(B, It, DL, HII.get(LoadOpc), DstHi)
+    .addFrameIndex(FI)
+    .addImm(Size)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(0).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned SrcR = MI->getOperand(2).getReg();
+  bool IsKill = MI->getOperand(2).isKill();
+  int FI = MI->getOperand(0).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+  unsigned StoreOpc;
+
+  if (NeedAlign <= HasAlign)
+    StoreOpc = !Is128B ? Hexagon::V6_vS32b_ai : Hexagon::V6_vS32b_ai_128B;
+  else
+    StoreOpc = !Is128B ? Hexagon::V6_vS32Ub_ai : Hexagon::V6_vS32Ub_ai_128B;
+
+  BuildMI(B, It, DL, HII.get(StoreOpc))
+    .addFrameIndex(FI)
+    .addImm(0)
+    .addReg(SrcR, getKillRegState(IsKill))
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
+      MachineBasicBlock::iterator It, MachineRegisterInfo &MRI,
+      const HexagonInstrInfo &HII, SmallVectorImpl<unsigned> &NewRegs) const {
+  MachineFunction &MF = *B.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MFI = MF.getFrameInfo();
+  MachineInstr *MI = &*It;
+  if (!MI->getOperand(1).isFI())
+    return false;
+
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned DstR = MI->getOperand(0).getReg();
+  int FI = MI->getOperand(1).getIndex();
+
+  bool Is128B = HST.useHVXDblOps();
+  auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
+                     : &Hexagon::VectorRegs128BRegClass;
+
+  unsigned NeedAlign = RC->getAlignment();
+  unsigned HasAlign = MFI.getObjectAlignment(FI);
+  unsigned LoadOpc;
+
+  if (NeedAlign <= HasAlign)
+    LoadOpc = !Is128B ? Hexagon::V6_vL32b_ai : Hexagon::V6_vL32b_ai_128B;
+  else
+    LoadOpc = !Is128B ? Hexagon::V6_vL32Ub_ai : Hexagon::V6_vL32Ub_ai_128B;
+
+  BuildMI(B, It, DL, HII.get(LoadOpc), DstR)
+    .addFrameIndex(FI)
+    .addImm(0)
+    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  B.erase(It);
+  return true;
+}
+
+bool HexagonFrameLowering::expandSpillMacros(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &NewRegs) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool Changed = false;
+
+  for (auto &B : MF) {
+    // Traverse the basic block.
+    MachineBasicBlock::iterator NextI;
+    for (auto I = B.begin(), E = B.end(); I != E; I = NextI) {
+      MachineInstr *MI = &*I;
+      NextI = std::next(I);
+      unsigned Opc = MI->getOpcode();
+
+      switch (Opc) {
+        case TargetOpcode::COPY:
+          Changed |= expandCopy(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::STriw_pred:
+        case Hexagon::STriw_mod:
+          Changed |= expandStoreInt(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::LDriw_pred:
+        case Hexagon::LDriw_mod:
+          Changed |= expandLoadInt(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vstorerq_ai:
+        case Hexagon::PS_vstorerq_ai_128B:
+          Changed |= expandStoreVecPred(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vloadrq_ai:
+        case Hexagon::PS_vloadrq_ai_128B:
+          Changed |= expandLoadVecPred(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vloadrw_ai:
+        case Hexagon::PS_vloadrwu_ai:
+        case Hexagon::PS_vloadrw_ai_128B:
+        case Hexagon::PS_vloadrwu_ai_128B:
+          Changed |= expandLoadVec2(B, I, MRI, HII, NewRegs);
+          break;
+        case Hexagon::PS_vstorerw_ai:
+        case Hexagon::PS_vstorerwu_ai:
+        case Hexagon::PS_vstorerw_ai_128B:
+        case Hexagon::PS_vstorerwu_ai_128B:
+          Changed |= expandStoreVec2(B, I, MRI, HII, NewRegs);
+          break;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                BitVector &SavedRegs,
+                                                RegScavenger *RS) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HRI = *HST.getRegisterInfo();
+
+  SavedRegs.resize(HRI.getNumRegs());
+
+  // If we have a function containing __builtin_eh_return we want to spill and
+  // restore all callee saved registers. Pretend that they are used.
+  if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
+    for (const MCPhysReg *R = HRI.getCalleeSavedRegs(&MF); *R; ++R)
+      SavedRegs.set(*R);
+
+  // Replace predicate register pseudo spill code.
+  SmallVector<unsigned,8> NewRegs;
+  expandSpillMacros(MF, NewRegs);
+  if (OptimizeSpillSlots && !isOptNone(MF))
+    optimizeSpillSlots(MF, NewRegs);
+
+  // We need to reserve a a spill slot if scavenging could potentially require
+  // spilling a scavenged register.
+  if (!NewRegs.empty() || mayOverflowFrameOffset(MF)) {
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    SetVector<const TargetRegisterClass*> SpillRCs;
+    // Reserve an int register in any case, because it could be used to hold
+    // the stack offset in case it does not fit into a spill instruction.
+    SpillRCs.insert(&Hexagon::IntRegsRegClass);
+
+    for (unsigned VR : NewRegs)
+      SpillRCs.insert(MRI.getRegClass(VR));
+
+    for (auto *RC : SpillRCs) {
+      if (!needToReserveScavengingSpillSlots(MF, HRI, RC))
+        continue;
+      unsigned Num = RC == &Hexagon::IntRegsRegClass ? NumberScavengerSlots : 1;
+      unsigned S = RC->getSize(), A = RC->getAlignment();
+      for (unsigned i = 0; i < Num; i++) {
+        int NewFI = MFI.CreateSpillStackObject(S, A);
+        RS->addScavengingFrameIndex(NewFI);
+      }
+    }
+  }
+
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+}
+
+unsigned HexagonFrameLowering::findPhysReg(MachineFunction &MF,
+      HexagonBlockRanges::IndexRange &FIR,
+      HexagonBlockRanges::InstrIndexMap &IndexMap,
+      HexagonBlockRanges::RegToRangeMap &DeadMap,
+      const TargetRegisterClass *RC) const {
+  auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+
+  auto isDead = [&FIR,&DeadMap] (unsigned Reg) -> bool {
+    auto F = DeadMap.find({Reg,0});
+    if (F == DeadMap.end())
+      return false;
+    for (auto &DR : F->second)
+      if (DR.contains(FIR))
+        return true;
+    return false;
+  };
+
+  for (unsigned Reg : RC->getRawAllocationOrder(MF)) {
+    bool Dead = true;
+    for (auto R : HexagonBlockRanges::expandToSubRegs({Reg,0}, MRI, HRI)) {
+      if (isDead(R.Reg))
+        continue;
+      Dead = false;
+      break;
+    }
+    if (Dead)
+      return Reg;
+  }
+  return 0;
+}
+
+void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &VRegs) const {
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HRI = *HST.getRegisterInfo();
+  auto &MRI = MF.getRegInfo();
+  HexagonBlockRanges HBR(MF);
+
+  typedef std::map<MachineBasicBlock*,HexagonBlockRanges::InstrIndexMap>
+      BlockIndexMap;
+  typedef std::map<MachineBasicBlock*,HexagonBlockRanges::RangeList>
+      BlockRangeMap;
+  typedef HexagonBlockRanges::IndexType IndexType;
+
+  struct SlotInfo {
+    BlockRangeMap Map;
+    unsigned Size = 0;
+    const TargetRegisterClass *RC = nullptr;
+
+    SlotInfo() = default;
+  };
+
+  BlockIndexMap BlockIndexes;
+  SmallSet<int,4> BadFIs;
+  std::map<int,SlotInfo> FIRangeMap;
+
+  // Accumulate register classes: get a common class for a pre-existing
+  // class HaveRC and a new class NewRC. Return nullptr if a common class
+  // cannot be found, otherwise return the resulting class. If HaveRC is
+  // nullptr, assume that it is still unset.
+  auto getCommonRC = [&HRI] (const TargetRegisterClass *HaveRC,
+                             const TargetRegisterClass *NewRC)
+        -> const TargetRegisterClass* {
+    if (HaveRC == nullptr || HaveRC == NewRC)
+      return NewRC;
+    // Different classes, both non-null. Pick the more general one.
+    if (HaveRC->hasSubClassEq(NewRC))
+      return HaveRC;
+    if (NewRC->hasSubClassEq(HaveRC))
+      return NewRC;
+    return nullptr;
+  };
+
+  // Scan all blocks in the function. Check all occurrences of frame indexes,
+  // and collect relevant information.
+  for (auto &B : MF) {
+    std::map<int,IndexType> LastStore, LastLoad;
+    // Emplace appears not to be supported in gcc 4.7.2-4.
+    //auto P = BlockIndexes.emplace(&B, HexagonBlockRanges::InstrIndexMap(B));
+    auto P = BlockIndexes.insert(
+                std::make_pair(&B, HexagonBlockRanges::InstrIndexMap(B)));
+    auto &IndexMap = P.first->second;
+    DEBUG(dbgs() << "Index map for BB#" << B.getNumber() << "\n"
+                 << IndexMap << '\n');
+
+    for (auto &In : B) {
+      int LFI, SFI;
+      bool Load = HII.isLoadFromStackSlot(In, LFI) && !HII.isPredicated(In);
+      bool Store = HII.isStoreToStackSlot(In, SFI) && !HII.isPredicated(In);
+      if (Load && Store) {
+        // If it's both a load and a store, then we won't handle it.
+        BadFIs.insert(LFI);
+        BadFIs.insert(SFI);
+        continue;
+      }
+      // Check for register classes of the register used as the source for
+      // the store, and the register used as the destination for the load.
+      // Also, only accept base+imm_offset addressing modes. Other addressing
+      // modes can have side-effects (post-increments, etc.). For stack
+      // slots they are very unlikely, so there is not much loss due to
+      // this restriction.
+      if (Load || Store) {
+        int TFI = Load ? LFI : SFI;
+        unsigned AM = HII.getAddrMode(In);
+        SlotInfo &SI = FIRangeMap[TFI];
+        bool Bad = (AM != HexagonII::BaseImmOffset);
+        if (!Bad) {
+          // If the addressing mode is ok, check the register class.
+          unsigned OpNum = Load ? 0 : 2;
+          auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI, MF);
+          RC = getCommonRC(SI.RC, RC);
+          if (RC == nullptr)
+            Bad = true;
+          else
+            SI.RC = RC;
+        }
+        if (!Bad) {
+          // Check sizes.
+          unsigned S = (1U << (HII.getMemAccessSize(In) - 1));
+          if (SI.Size != 0 && SI.Size != S)
+            Bad = true;
+          else
+            SI.Size = S;
+        }
+        if (!Bad) {
+          for (auto *Mo : In.memoperands()) {
+            if (!Mo->isVolatile())
+              continue;
+            Bad = true;
+            break;
+          }
+        }
+        if (Bad)
+          BadFIs.insert(TFI);
+      }
+
+      // Locate uses of frame indices.
+      for (unsigned i = 0, n = In.getNumOperands(); i < n; ++i) {
+        const MachineOperand &Op = In.getOperand(i);
+        if (!Op.isFI())
+          continue;
+        int FI = Op.getIndex();
+        // Make sure that the following operand is an immediate and that
+        // it is 0. This is the offset in the stack object.
+        if (i+1 >= n || !In.getOperand(i+1).isImm() ||
+            In.getOperand(i+1).getImm() != 0)
+          BadFIs.insert(FI);
+        if (BadFIs.count(FI))
+          continue;
+
+        IndexType Index = IndexMap.getIndex(&In);
+        if (Load) {
+          if (LastStore[FI] == IndexType::None)
+            LastStore[FI] = IndexType::Entry;
+          LastLoad[FI] = Index;
+        } else if (Store) {
+          HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+          if (LastStore[FI] != IndexType::None)
+            RL.add(LastStore[FI], LastLoad[FI], false, false);
+          else if (LastLoad[FI] != IndexType::None)
+            RL.add(IndexType::Entry, LastLoad[FI], false, false);
+          LastLoad[FI] = IndexType::None;
+          LastStore[FI] = Index;
+        } else {
+          BadFIs.insert(FI);
+        }
+      }
+    }
+
+    for (auto &I : LastLoad) {
+      IndexType LL = I.second;
+      if (LL == IndexType::None)
+        continue;
+      auto &RL = FIRangeMap[I.first].Map[&B];
+      IndexType &LS = LastStore[I.first];
+      if (LS != IndexType::None)
+        RL.add(LS, LL, false, false);
+      else
+        RL.add(IndexType::Entry, LL, false, false);
+      LS = IndexType::None;
+    }
+    for (auto &I : LastStore) {
+      IndexType LS = I.second;
+      if (LS == IndexType::None)
+        continue;
+      auto &RL = FIRangeMap[I.first].Map[&B];
+      RL.add(LS, IndexType::None, false, false);
+    }
+  }
+
+  DEBUG({
+    for (auto &P : FIRangeMap) {
+      dbgs() << "fi#" << P.first;
+      if (BadFIs.count(P.first))
+        dbgs() << " (bad)";
+      dbgs() << "  RC: ";
+      if (P.second.RC != nullptr)
+        dbgs() << HRI.getRegClassName(P.second.RC) << '\n';
+      else
+        dbgs() << "<null>\n";
+      for (auto &R : P.second.Map)
+        dbgs() << "  BB#" << R.first->getNumber() << " { " << R.second << "}\n";
+    }
+  });
+
+  // When a slot is loaded from in a block without being stored to in the
+  // same block, it is live-on-entry to this block. To avoid CFG analysis,
+  // consider this slot to be live-on-exit from all blocks.
+  SmallSet<int,4> LoxFIs;
+
+  std::map<MachineBasicBlock*,std::vector<int>> BlockFIMap;
+
+  for (auto &P : FIRangeMap) {
+    // P = pair(FI, map: BB->RangeList)
+    if (BadFIs.count(P.first))
+      continue;
+    for (auto &B : MF) {
+      auto F = P.second.Map.find(&B);
+      // F = pair(BB, RangeList)
+      if (F == P.second.Map.end() || F->second.empty())
+        continue;
+      HexagonBlockRanges::IndexRange &IR = F->second.front();
+      if (IR.start() == IndexType::Entry)
+        LoxFIs.insert(P.first);
+      BlockFIMap[&B].push_back(P.first);
+    }
+  }
+
+  DEBUG({
+    dbgs() << "Block-to-FI map (* -- live-on-exit):\n";
+    for (auto &P : BlockFIMap) {
+      auto &FIs = P.second;
+      if (FIs.empty())
+        continue;
+      dbgs() << "  BB#" << P.first->getNumber() << ": {";
+      for (auto I : FIs) {
+        dbgs() << " fi#" << I;
+        if (LoxFIs.count(I))
+          dbgs() << '*';
+      }
+      dbgs() << " }\n";
+    }
+  });
+
+#ifndef NDEBUG
+  bool HasOptLimit = SpillOptMax.getPosition();
+#endif
+
+  // eliminate loads, when all loads eliminated, eliminate all stores.
+  for (auto &B : MF) {
+    auto F = BlockIndexes.find(&B);
+    assert(F != BlockIndexes.end());
+    HexagonBlockRanges::InstrIndexMap &IM = F->second;
+    HexagonBlockRanges::RegToRangeMap LM = HBR.computeLiveMap(IM);
+    HexagonBlockRanges::RegToRangeMap DM = HBR.computeDeadMap(IM, LM);
+    DEBUG(dbgs() << "BB#" << B.getNumber() << " dead map\n"
+                 << HexagonBlockRanges::PrintRangeMap(DM, HRI));
+
+    for (auto FI : BlockFIMap[&B]) {
+      if (BadFIs.count(FI))
+        continue;
+      DEBUG(dbgs() << "Working on fi#" << FI << '\n');
+      HexagonBlockRanges::RangeList &RL = FIRangeMap[FI].Map[&B];
+      for (auto &Range : RL) {
+        DEBUG(dbgs() << "--Examining range:" << RL << '\n');
+        if (!IndexType::isInstr(Range.start()) ||
+            !IndexType::isInstr(Range.end()))
+          continue;
+        MachineInstr &SI = *IM.getInstr(Range.start());
+        MachineInstr &EI = *IM.getInstr(Range.end());
+        assert(SI.mayStore() && "Unexpected start instruction");
+        assert(EI.mayLoad() && "Unexpected end instruction");
+        MachineOperand &SrcOp = SI.getOperand(2);
+
+        HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(),
+                                                  SrcOp.getSubReg() };
+        auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI, MF);
+        // The this-> is needed to unconfuse MSVC.
+        unsigned FoundR = this->findPhysReg(MF, Range, IM, DM, RC);
+        DEBUG(dbgs() << "Replacement reg:" << PrintReg(FoundR, &HRI) << '\n');
+        if (FoundR == 0)
+          continue;
+#ifndef NDEBUG
+        if (HasOptLimit) {
+          if (SpillOptCount >= SpillOptMax)
+            return;
+          SpillOptCount++;
+        }
+#endif
+
+        // Generate the copy-in: "FoundR = COPY SrcR" at the store location.
+        MachineBasicBlock::iterator StartIt = SI.getIterator(), NextIt;
+        MachineInstr *CopyIn = nullptr;
+        if (SrcRR.Reg != FoundR || SrcRR.Sub != 0) {
+          const DebugLoc &DL = SI.getDebugLoc();
+          CopyIn = BuildMI(B, StartIt, DL, HII.get(TargetOpcode::COPY), FoundR)
+                      .addOperand(SrcOp);
+        }
+
+        ++StartIt;
+        // Check if this is a last store and the FI is live-on-exit.
+        if (LoxFIs.count(FI) && (&Range == &RL.back())) {
+          // Update store's source register.
+          if (unsigned SR = SrcOp.getSubReg())
+            SrcOp.setReg(HRI.getSubReg(FoundR, SR));
+          else
+            SrcOp.setReg(FoundR);
+          SrcOp.setSubReg(0);
+          // We are keeping this register live.
+          SrcOp.setIsKill(false);
+        } else {
+          B.erase(&SI);
+          IM.replaceInstr(&SI, CopyIn);
+        }
+
+        auto EndIt = std::next(EI.getIterator());
+        for (auto It = StartIt; It != EndIt; It = NextIt) {
+          MachineInstr &MI = *It;
+          NextIt = std::next(It);
+          int TFI;
+          if (!HII.isLoadFromStackSlot(MI, TFI) || TFI != FI)
+            continue;
+          unsigned DstR = MI.getOperand(0).getReg();
+          assert(MI.getOperand(0).getSubReg() == 0);
+          MachineInstr *CopyOut = nullptr;
+          if (DstR != FoundR) {
+            DebugLoc DL = MI.getDebugLoc();
+            unsigned MemSize = (1U << (HII.getMemAccessSize(MI) - 1));
+            assert(HII.getAddrMode(MI) == HexagonII::BaseImmOffset);
+            unsigned CopyOpc = TargetOpcode::COPY;
+            if (HII.isSignExtendingLoad(MI))
+              CopyOpc = (MemSize == 1) ? Hexagon::A2_sxtb : Hexagon::A2_sxth;
+            else if (HII.isZeroExtendingLoad(MI))
+              CopyOpc = (MemSize == 1) ? Hexagon::A2_zxtb : Hexagon::A2_zxth;
+            CopyOut = BuildMI(B, It, DL, HII.get(CopyOpc), DstR)
+                        .addReg(FoundR, getKillRegState(&MI == &EI));
+          }
+          IM.replaceInstr(&MI, CopyOut);
+          B.erase(It);
+        }
+
+        // Update the dead map.
+        HexagonBlockRanges::RegisterRef FoundRR = { FoundR, 0 };
+        for (auto RR : HexagonBlockRanges::expandToSubRegs(FoundRR, MRI, HRI))
+          DM[RR].subtract(Range);
+      } // for Range in range list
+    }
+  }
+}
+
+void HexagonFrameLowering::expandAlloca(MachineInstr *AI,
+      const HexagonInstrInfo &HII, unsigned SP, unsigned CF) const {
+  MachineBasicBlock &MB = *AI->getParent();
+  DebugLoc DL = AI->getDebugLoc();
+  unsigned A = AI->getOperand(2).getImm();
+
+  // Have
+  //    Rd  = alloca Rs, #A
+  //
+  // If Rs and Rd are different registers, use this sequence:
+  //    Rd  = sub(r29, Rs)
+  //    r29 = sub(r29, Rs)
+  //    Rd  = and(Rd, #-A)    ; if necessary
+  //    r29 = and(r29, #-A)   ; if necessary
+  //    Rd  = add(Rd, #CF)    ; CF size aligned to at most A
+  // otherwise, do
+  //    Rd  = sub(r29, Rs)
+  //    Rd  = and(Rd, #-A)    ; if necessary
+  //    r29 = Rd
+  //    Rd  = add(Rd, #CF)    ; CF size aligned to at most A
+
+  MachineOperand &RdOp = AI->getOperand(0);
+  MachineOperand &RsOp = AI->getOperand(1);
+  unsigned Rd = RdOp.getReg(), Rs = RsOp.getReg();
+
+  // Rd = sub(r29, Rs)
+  BuildMI(MB, AI, DL, HII.get(Hexagon::A2_sub), Rd)
+      .addReg(SP)
+      .addReg(Rs);
+  if (Rs != Rd) {
+    // r29 = sub(r29, Rs)
+    BuildMI(MB, AI, DL, HII.get(Hexagon::A2_sub), SP)
+        .addReg(SP)
+        .addReg(Rs);
+  }
+  if (A > 8) {
+    // Rd  = and(Rd, #-A)
+    BuildMI(MB, AI, DL, HII.get(Hexagon::A2_andir), Rd)
+        .addReg(Rd)
+        .addImm(-int64_t(A));
+    if (Rs != Rd)
+      BuildMI(MB, AI, DL, HII.get(Hexagon::A2_andir), SP)
+          .addReg(SP)
+          .addImm(-int64_t(A));
+  }
+  if (Rs == Rd) {
+    // r29 = Rd
+    BuildMI(MB, AI, DL, HII.get(TargetOpcode::COPY), SP)
+        .addReg(Rd);
+  }
+  if (CF > 0) {
+    // Rd = add(Rd, #CF)
+    BuildMI(MB, AI, DL, HII.get(Hexagon::A2_addi), Rd)
+        .addReg(Rd)
+        .addImm(CF);
+  }
+}
+
+bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (!MFI.hasVarSizedObjects())
+    return false;
+  unsigned MaxA = MFI.getMaxAlignment();
+  if (MaxA <= getStackAlignment())
+    return false;
+  return true;
+}
+
+const MachineInstr *HexagonFrameLowering::getAlignaInstr(
+      const MachineFunction &MF) const {
+  for (auto &B : MF)
+    for (auto &I : B)
+      if (I.getOpcode() == Hexagon::PS_aligna)
+        return &I;
+  return nullptr;
+}
+
+/// Adds all callee-saved registers as implicit uses or defs to the
+/// instruction.
+void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
+      const CSIVect &CSI, bool IsDef, bool IsKill) const {
+  // Add the callee-saved registers as implicit uses.
+  for (auto &R : CSI)
+    MI->addOperand(MachineOperand::CreateReg(R.getReg(), IsDef, true, IsKill));
+}
+
+/// Determine whether the callee-saved register saves and restores should
+/// be generated via inline code. If this function returns "true", inline
+/// code will be generated. If this function returns "false", additional
+/// checks are performed, which may still lead to the inline code.
+bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
+    return true;
+  if (!isOptSize(MF) && !isMinSize(MF))
+    if (MF.getTarget().getOptLevel() > CodeGenOpt::Default)
+      return true;
+
+  // Check if CSI only has double registers, and if the registers form
+  // a contiguous block starting from D8.
+  BitVector Regs(Hexagon::NUM_TARGET_REGS);
+  for (unsigned i = 0, n = CSI.size(); i < n; ++i) {
+    unsigned R = CSI[i].getReg();
+    if (!Hexagon::DoubleRegsRegClass.contains(R))
+      return true;
+    Regs[R] = true;
+  }
+  int F = Regs.find_first();
+  if (F != Hexagon::D8)
+    return true;
+  while (F >= 0) {
+    int N = Regs.find_next(F);
+    if (N >= 0 && N != F+1)
+      return true;
+    F = N;
+  }
+
+  return false;
+}
+
+bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (shouldInlineCSR(MF, CSI))
+    return false;
+  unsigned NumCSI = CSI.size();
+  if (NumCSI <= 1)
+    return false;
+
+  unsigned Threshold = isOptSize(MF) ? SpillFuncThresholdOs
+                                     : SpillFuncThreshold;
+  return Threshold < NumCSI;
+}
+
+bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
+      const CSIVect &CSI) const {
+  if (shouldInlineCSR(MF, CSI))
+    return false;
+  // The restore functions do a bit more than just restoring registers.
+  // The non-returning versions will go back directly to the caller's
+  // caller, others will clean up the stack frame in preparation for
+  // a tail call. Using them can still save code size even if only one
+  // register is getting restores. Make the decision based on -Oz:
+  // using -Os will use inline restore for a single register.
+  if (isMinSize(MF))
+    return true;
+  unsigned NumCSI = CSI.size();
+  if (NumCSI <= 1)
+    return false;
+
+  unsigned Threshold = isOptSize(MF) ? SpillFuncThresholdOs-1
+                                     : SpillFuncThreshold;
+  return Threshold < NumCSI;
+}
+
+bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
+  unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF);
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  // A fairly simplistic guess as to whether a potential load/store to a
+  // stack location could require an extra register. It does not account
+  // for store-immediate instructions.
+  if (HST.useHVXOps())
+    return StackSize > 256;
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
new file mode 100644
index 000000000000..529a61d4a5b5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -0,0 +1,159 @@
+//=- HexagonFrameLowering.h - Define frame lowering for Hexagon --*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
+
+#include "Hexagon.h"
+#include "HexagonBlockRanges.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <vector>
+
+namespace llvm {
+
+class HexagonInstrInfo;
+class HexagonRegisterInfo;
+
+class HexagonFrameLowering : public TargetFrameLowering {
+public:
+  explicit HexagonFrameLowering()
+      : TargetFrameLowering(StackGrowsDown, 8, 0, 1, true) {}
+
+  // All of the prolog/epilog functionality, including saving and restoring
+  // callee-saved registers is handled in emitPrologue. This is to have the
+  // logic for shrink-wrapping in one place.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const
+      override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+      override {}
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
+      const TargetRegisterInfo *TRI) const override {
+    return true;
+  }
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
+      const TargetRegisterInfo *TRI) const override {
+    return true;
+  }
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+      RegScavenger *RS = nullptr) const override;
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+      RegScavenger *RS) const override;
+
+  bool targetHandlesStackFrameRounding() const override {
+    return true;
+  }
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+      unsigned &FrameReg) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+
+  const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
+      const override {
+    static const SpillSlot Offsets[] = {
+      { Hexagon::R17, -4 }, { Hexagon::R16, -8 }, { Hexagon::D8, -8 },
+      { Hexagon::R19, -12 }, { Hexagon::R18, -16 }, { Hexagon::D9, -16 },
+      { Hexagon::R21, -20 }, { Hexagon::R20, -24 }, { Hexagon::D10, -24 },
+      { Hexagon::R23, -28 }, { Hexagon::R22, -32 }, { Hexagon::D11, -32 },
+      { Hexagon::R25, -36 }, { Hexagon::R24, -40 }, { Hexagon::D12, -40 },
+      { Hexagon::R27, -44 }, { Hexagon::R26, -48 }, { Hexagon::D13, -48 }
+    };
+    NumEntries = array_lengthof(Offsets);
+    return Offsets;
+  }
+
+  bool assignCalleeSavedSpillSlots(MachineFunction &MF,
+      const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI)
+      const override;
+
+  bool needsAligna(const MachineFunction &MF) const;
+  const MachineInstr *getAlignaInstr(const MachineFunction &MF) const;
+
+  void insertCFIInstructions(MachineFunction &MF) const;
+
+private:
+  typedef std::vector<CalleeSavedInfo> CSIVect;
+
+  void expandAlloca(MachineInstr *AI, const HexagonInstrInfo &TII,
+      unsigned SP, unsigned CF) const;
+  void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const;
+  void insertEpilogueInBlock(MachineBasicBlock &MBB) const;
+  bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
+      const HexagonRegisterInfo &HRI, bool &PrologueStubs) const;
+  bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
+      const HexagonRegisterInfo &HRI) const;
+  void updateEntryPaths(MachineFunction &MF, MachineBasicBlock &SaveB) const;
+  bool updateExitPaths(MachineBasicBlock &MBB, MachineBasicBlock &RestoreB,
+      BitVector &DoneT, BitVector &DoneF, BitVector &Path) const;
+  void insertCFIInstructionsAt(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator At) const;
+
+  void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const;
+
+  bool expandCopy(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandStoreInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandLoadInt(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandStoreVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandLoadVecPred(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandStoreVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandLoadVec2(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandStoreVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandLoadVec(MachineBasicBlock &B, MachineBasicBlock::iterator It,
+      MachineRegisterInfo &MRI, const HexagonInstrInfo &HII,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+  bool expandSpillMacros(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &NewRegs) const;
+
+  unsigned findPhysReg(MachineFunction &MF, HexagonBlockRanges::IndexRange &FIR,
+      HexagonBlockRanges::InstrIndexMap &IndexMap,
+      HexagonBlockRanges::RegToRangeMap &DeadMap,
+      const TargetRegisterClass *RC) const;
+  void optimizeSpillSlots(MachineFunction &MF,
+      SmallVectorImpl<unsigned> &VRegs) const;
+
+  void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB,
+      MachineBasicBlock *&EpilogB) const;
+
+  void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI,
+      bool IsDef, bool IsKill) const;
+  bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const;
+  bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
+  bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
+  bool mayOverflowFrameOffset(MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
new file mode 100644
index 000000000000..bb5e379ce014
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -0,0 +1,271 @@
+//===--- HexagonGenExtract.cpp --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+static cl::opt<unsigned> ExtractCutoff("extract-cutoff", cl::init(~0U),
+  cl::Hidden, cl::desc("Cutoff for generating \"extract\""
+  " instructions"));
+
+// This prevents generating extract instructions that have the offset of 0.
+// One of the reasons for "extract" is to put a sequence of bits in a regis-
+// ter, starting at offset 0 (so that these bits can then be used by an
+// "insert"). If the bits are already at offset 0, it is better not to gene-
+// rate "extract", since logical bit operations can be merged into compound
+// instructions (as opposed to "extract").
+static cl::opt<bool> NoSR0("extract-nosr0", cl::init(true), cl::Hidden,
+  cl::desc("No extract instruction with offset 0"));
+
+static cl::opt<bool> NeedAnd("extract-needand", cl::init(true), cl::Hidden,
+  cl::desc("Require & in extract patterns"));
+
+namespace llvm {
+
+  void initializeHexagonGenExtractPass(PassRegistry&);
+  FunctionPass *createHexagonGenExtract();
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonGenExtract : public FunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenExtract() : FunctionPass(ID), ExtractCount(0) {
+      initializeHexagonGenExtractPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate \"extract\" instructions";
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool visitBlock(BasicBlock *B);
+    bool convert(Instruction *In);
+
+    unsigned ExtractCount;
+    DominatorTree *DT;
+  };
+
+  char HexagonGenExtract::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonGenExtract, "hextract", "Hexagon generate "
+  "\"extract\" instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(HexagonGenExtract, "hextract", "Hexagon generate "
+  "\"extract\" instructions", false, false)
+
+bool HexagonGenExtract::convert(Instruction *In) {
+  using namespace PatternMatch;
+
+  Value *BF = nullptr;
+  ConstantInt *CSL = nullptr, *CSR = nullptr, *CM = nullptr;
+  BasicBlock *BB = In->getParent();
+  LLVMContext &Ctx = BB->getContext();
+  bool LogicalSR;
+
+  // (and (shl (lshr x, #sr), #sl), #m)
+  LogicalSR = true;
+  bool Match = match(In, m_And(m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+                               m_ConstantInt(CSL)),
+                         m_ConstantInt(CM)));
+
+  if (!Match) {
+    // (and (shl (ashr x, #sr), #sl), #m)
+    LogicalSR = false;
+    Match = match(In, m_And(m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+                            m_ConstantInt(CSL)),
+                      m_ConstantInt(CM)));
+  }
+  if (!Match) {
+    // (and (shl x, #sl), #m)
+    LogicalSR = true;
+    CSR = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+    Match = match(In, m_And(m_Shl(m_Value(BF), m_ConstantInt(CSL)),
+                      m_ConstantInt(CM)));
+    if (Match && NoSR0)
+      return false;
+  }
+  if (!Match) {
+    // (and (lshr x, #sr), #m)
+    LogicalSR = true;
+    CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+    Match = match(In, m_And(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+                            m_ConstantInt(CM)));
+  }
+  if (!Match) {
+    // (and (ashr x, #sr), #m)
+    LogicalSR = false;
+    CSL = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+    Match = match(In, m_And(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+                            m_ConstantInt(CM)));
+  }
+  if (!Match) {
+    CM = nullptr;
+    // (shl (lshr x, #sr), #sl)
+    LogicalSR = true;
+    Match = match(In, m_Shl(m_LShr(m_Value(BF), m_ConstantInt(CSR)),
+                            m_ConstantInt(CSL)));
+  }
+  if (!Match) {
+    CM = nullptr;
+    // (shl (ashr x, #sr), #sl)
+    LogicalSR = false;
+    Match = match(In, m_Shl(m_AShr(m_Value(BF), m_ConstantInt(CSR)),
+                            m_ConstantInt(CSL)));
+  }
+  if (!Match)
+    return false;
+
+  Type *Ty = BF->getType();
+  if (!Ty->isIntegerTy())
+    return false;
+  unsigned BW = Ty->getPrimitiveSizeInBits();
+  if (BW != 32 && BW != 64)
+    return false;
+
+  uint32_t SR = CSR->getZExtValue();
+  uint32_t SL = CSL->getZExtValue();
+
+  if (!CM) {
+    // If there was no and, and the shift left did not remove all potential
+    // sign bits created by the shift right, then extractu cannot reproduce
+    // this value.
+    if (!LogicalSR && (SR > SL))
+      return false;
+    APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
+    CM = ConstantInt::get(Ctx, A);
+  }
+
+  // CM is the shifted-left mask. Shift it back right to remove the zero
+  // bits on least-significant positions.
+  APInt M = CM->getValue().lshr(SL);
+  uint32_t T = M.countTrailingOnes();
+
+  // During the shifts some of the bits will be lost. Calculate how many
+  // of the original value will remain after shift right and then left.
+  uint32_t U = BW - std::max(SL, SR);
+  // The width of the extracted field is the minimum of the original bits
+  // that remain after the shifts and the number of contiguous 1s in the mask.
+  uint32_t W = std::min(U, T);
+  if (W == 0)
+    return false;
+
+  // Check if the extracted bits are contained within the mask that it is
+  // and-ed with. The extract operation will copy these bits, and so the
+  // mask cannot any holes in it that would clear any of the bits of the
+  // extracted field.
+  if (!LogicalSR) {
+    // If the shift right was arithmetic, it could have included some 1 bits.
+    // It is still ok to generate extract, but only if the mask eliminates
+    // those bits (i.e. M does not have any bits set beyond U).
+    APInt C = APInt::getHighBitsSet(BW, BW-U);
+    if (M.intersects(C) || !APIntOps::isMask(W, M))
+      return false;
+  } else {
+    // Check if M starts with a contiguous sequence of W times 1 bits. Get
+    // the low U bits of M (which eliminates the 0 bits shifted in on the
+    // left), and check if the result is APInt's "mask":
+    if (!APIntOps::isMask(W, M.getLoBits(U)))
+      return false;
+  }
+
+  IRBuilder<> IRB(In);
+  Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
+                                   : Intrinsic::hexagon_S2_extractup;
+  Module *Mod = BB->getParent()->getParent();
+  Value *ExtF = Intrinsic::getDeclaration(Mod, IntId);
+  Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+  if (SL != 0)
+    NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
+  In->replaceAllUsesWith(NewIn);
+  return true;
+}
+
+bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+  // Depth-first, bottom-up traversal.
+  DomTreeNode *DTN = DT->getNode(B);
+  typedef GraphTraits<DomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType Iter;
+  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
+    visitBlock((*I)->getBlock());
+
+  // Allow limiting the number of generated extracts for debugging purposes.
+  bool HasCutoff = ExtractCutoff.getPosition();
+  unsigned Cutoff = ExtractCutoff;
+
+  bool Changed = false;
+  BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
+  while (true) {
+    if (HasCutoff && (ExtractCount >= Cutoff))
+      return Changed;
+    bool Last = (I == Begin);
+    if (!Last)
+      NextI = std::prev(I);
+    Instruction *In = &*I;
+    bool Done = convert(In);
+    if (HasCutoff && Done)
+      ExtractCount++;
+    Changed |= Done;
+    if (Last)
+      break;
+    I = NextI;
+  }
+  return Changed;
+}
+
+bool HexagonGenExtract::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  bool Changed;
+
+  // Traverse the function bottom-up, to see super-expressions before their
+  // sub-expressions.
+  BasicBlock *Entry = GraphTraits<Function*>::getEntryNode(&F);
+  Changed = visitBlock(Entry);
+
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonGenExtract() {
+  return new HexagonGenExtract();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
new file mode 100644
index 000000000000..5a8e392d1275
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -0,0 +1,1601 @@
+//===--- HexagonGenInsert.cpp ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexinsert"
+
+#include "BitTracker.h"
+#include "HexagonBitTracker.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg# cutoff for insert generation."));
+// The distance cutoff is selected based on the precheckin-perf results:
+// cutoffs 20, 25, 35, and 40 are worse than 30.
+static cl::opt<unsigned> VRegDistCutoff("insert-dist-cutoff", cl::init(30U),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Vreg distance cutoff for insert "
+  "generation."));
+
+static cl::opt<bool> OptTiming("insert-timing", cl::init(false), cl::Hidden,
+  cl::ZeroOrMore, cl::desc("Enable timing of insert generation"));
+static cl::opt<bool> OptTimingDetail("insert-timing-detail", cl::init(false),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Enable detailed timing of insert "
+  "generation"));
+
+static cl::opt<bool> OptSelectAll0("insert-all0", cl::init(false), cl::Hidden,
+  cl::ZeroOrMore);
+static cl::opt<bool> OptSelectHas0("insert-has0", cl::init(false), cl::Hidden,
+  cl::ZeroOrMore);
+// Whether to construct constant values via "insert". Could eliminate constant
+// extenders, but often not practical.
+static cl::opt<bool> OptConst("insert-const", cl::init(false), cl::Hidden,
+  cl::ZeroOrMore);
+
+// The preprocessor gets confused when the DEBUG macro is passed larger
+// chunks of code. Use this function to detect debugging.
+inline static bool isDebug() {
+#ifndef NDEBUG
+  return DebugFlag && isCurrentDebugType(DEBUG_TYPE);
+#else
+  return false;
+#endif
+}
+
+namespace {
+
+  // Set of virtual registers, based on BitVector.
+  struct RegisterSet : private BitVector {
+    RegisterSet() = default;
+    explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {}
+
+    using BitVector::clear;
+
+    unsigned find_first() const {
+      int First = BitVector::find_first();
+      if (First < 0)
+        return 0;
+      return x2v(First);
+    }
+
+    unsigned find_next(unsigned Prev) const {
+      int Next = BitVector::find_next(v2x(Prev));
+      if (Next < 0)
+        return 0;
+      return x2v(Next);
+    }
+
+    RegisterSet &insert(unsigned R) {
+      unsigned Idx = v2x(R);
+      ensure(Idx);
+      return static_cast<RegisterSet&>(BitVector::set(Idx));
+    }
+    RegisterSet &remove(unsigned R) {
+      unsigned Idx = v2x(R);
+      if (Idx >= size())
+        return *this;
+      return static_cast<RegisterSet&>(BitVector::reset(Idx));
+    }
+
+    RegisterSet &insert(const RegisterSet &Rs) {
+      return static_cast<RegisterSet&>(BitVector::operator|=(Rs));
+    }
+    RegisterSet &remove(const RegisterSet &Rs) {
+      return static_cast<RegisterSet&>(BitVector::reset(Rs));
+    }
+
+    reference operator[](unsigned R) {
+      unsigned Idx = v2x(R);
+      ensure(Idx);
+      return BitVector::operator[](Idx);
+    }
+    bool operator[](unsigned R) const {
+      unsigned Idx = v2x(R);
+      assert(Idx < size());
+      return BitVector::operator[](Idx);
+    }
+    bool has(unsigned R) const {
+      unsigned Idx = v2x(R);
+      if (Idx >= size())
+        return false;
+      return BitVector::test(Idx);
+    }
+
+    bool empty() const {
+      return !BitVector::any();
+    }
+    bool includes(const RegisterSet &Rs) const {
+      // A.BitVector::test(B)  <=>  A-B != {}
+      return !Rs.BitVector::test(*this);
+    }
+    bool intersects(const RegisterSet &Rs) const {
+      return BitVector::anyCommon(Rs);
+    }
+
+  private:
+    void ensure(unsigned Idx) {
+      if (size() <= Idx)
+        resize(std::max(Idx+1, 32U));
+    }
+
+    static inline unsigned v2x(unsigned v) {
+      return TargetRegisterInfo::virtReg2Index(v);
+    }
+
+    static inline unsigned x2v(unsigned x) {
+      return TargetRegisterInfo::index2VirtReg(x);
+    }
+  };
+
+  struct PrintRegSet {
+    PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI)
+      : RS(S), TRI(RI) {}
+
+    friend raw_ostream &operator<< (raw_ostream &OS,
+          const PrintRegSet &P);
+
+  private:
+    const RegisterSet &RS;
+    const TargetRegisterInfo *TRI;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) {
+    OS << '{';
+    for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R))
+      OS << ' ' << PrintReg(R, P.TRI);
+    OS << " }";
+    return OS;
+  }
+
+  // A convenience class to associate unsigned numbers (such as virtual
+  // registers) with unsigned numbers.
+  struct UnsignedMap : public DenseMap<unsigned,unsigned> {
+    UnsignedMap() = default;
+
+  private:
+    typedef DenseMap<unsigned,unsigned> BaseType;
+  };
+
+  // A utility to establish an ordering between virtual registers:
+  // VRegA < VRegB  <=>  RegisterOrdering[VRegA] < RegisterOrdering[VRegB]
+  // This is meant as a cache for the ordering of virtual registers defined
+  // by a potentially expensive comparison function, or obtained by a proce-
+  // dure that should not be repeated each time two registers are compared.
+  struct RegisterOrdering : public UnsignedMap {
+    RegisterOrdering() = default;
+
+    unsigned operator[](unsigned VR) const {
+      const_iterator F = find(VR);
+      assert(F != end());
+      return F->second;
+    }
+
+    // Add operator(), so that objects of this class can be used as
+    // comparators in std::sort et al.
+    bool operator() (unsigned VR1, unsigned VR2) const {
+      return operator[](VR1) < operator[](VR2);
+    }
+  };
+
+  // Ordering of bit values. This class does not have operator[], but
+  // is supplies a comparison operator() for use in std:: algorithms.
+  // The order is as follows:
+  // - 0 < 1 < ref
+  // - ref1 < ref2, if ord(ref1.Reg) < ord(ref2.Reg),
+  //   or ord(ref1.Reg) == ord(ref2.Reg), and ref1.Pos < ref2.Pos.
+  struct BitValueOrdering {
+    BitValueOrdering(const RegisterOrdering &RB) : BaseOrd(RB) {}
+
+    bool operator() (const BitTracker::BitValue &V1,
+          const BitTracker::BitValue &V2) const;
+
+    const RegisterOrdering &BaseOrd;
+  };
+
+} // end anonymous namespace
+
+bool BitValueOrdering::operator() (const BitTracker::BitValue &V1,
+      const BitTracker::BitValue &V2) const {
+  if (V1 == V2)
+    return false;
+  // V1==0 => true, V2==0 => false
+  if (V1.is(0) || V2.is(0))
+    return V1.is(0);
+  // Neither of V1,V2 is 0, and V1!=V2.
+  // V2==1 => false, V1==1 => true
+  if (V2.is(1) || V1.is(1))
+    return !V2.is(1);
+  // Both V1,V2 are refs.
+  unsigned Ind1 = BaseOrd[V1.RefI.Reg], Ind2 = BaseOrd[V2.RefI.Reg];
+  if (Ind1 != Ind2)
+    return Ind1 < Ind2;
+  // If V1.Pos==V2.Pos
+  assert(V1.RefI.Pos != V2.RefI.Pos && "Bit values should be different");
+  return V1.RefI.Pos < V2.RefI.Pos;
+}
+
+namespace {
+
+  // Cache for the BitTracker's cell map. Map lookup has a logarithmic
+  // complexity, this class will memoize the lookup results to reduce
+  // the access time for repeated lookups of the same cell.
+  struct CellMapShadow {
+    CellMapShadow(const BitTracker &T) : BT(T) {}
+
+    const BitTracker::RegisterCell &lookup(unsigned VR) {
+      unsigned RInd = TargetRegisterInfo::virtReg2Index(VR);
+      // Grow the vector to at least 32 elements.
+      if (RInd >= CVect.size())
+        CVect.resize(std::max(RInd+16, 32U), nullptr);
+      const BitTracker::RegisterCell *CP = CVect[RInd];
+      if (CP == nullptr)
+        CP = CVect[RInd] = &BT.lookup(VR);
+      return *CP;
+    }
+
+    const BitTracker &BT;
+
+  private:
+    typedef std::vector<const BitTracker::RegisterCell*> CellVectType;
+    CellVectType CVect;
+  };
+
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the corresponding BitTracker::RegisterCell objects.
+  struct RegisterCellLexCompare {
+    RegisterCellLexCompare(const BitValueOrdering &BO, CellMapShadow &M)
+      : BitOrd(BO), CM(M) {}
+
+    bool operator() (unsigned VR1, unsigned VR2) const;
+
+  private:
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+
+  // Comparator class for lexicographic ordering of virtual registers
+  // according to the specified bits of the corresponding BitTracker::
+  // RegisterCell objects.
+  // Specifically, this class will be used to compare bit B of a register
+  // cell for a selected virtual register R with bit N of any register
+  // other than R.
+  struct RegisterCellBitCompareSel {
+    RegisterCellBitCompareSel(unsigned R, unsigned B, unsigned N,
+          const BitValueOrdering &BO, CellMapShadow &M)
+      : SelR(R), SelB(B), BitN(N), BitOrd(BO), CM(M) {}
+
+    bool operator() (unsigned VR1, unsigned VR2) const;
+
+  private:
+    const unsigned SelR, SelB;
+    const unsigned BitN;
+    const BitValueOrdering &BitOrd;
+    CellMapShadow &CM;
+  };
+
+} // end anonymous namespace
+
+bool RegisterCellLexCompare::operator() (unsigned VR1, unsigned VR2) const {
+  // Ordering of registers, made up from two given orderings:
+  // - the ordering of the register numbers, and
+  // - the ordering of register cells.
+  // Def. R1 < R2 if:
+  // - cell(R1) < cell(R2), or
+  // - cell(R1) == cell(R2), and index(R1) < index(R2).
+  //
+  // For register cells, the ordering is lexicographic, with index 0 being
+  // the most significant.
+  if (VR1 == VR2)
+    return false;
+
+  const BitTracker::RegisterCell &RC1 = CM.lookup(VR1), &RC2 = CM.lookup(VR2);
+  uint16_t W1 = RC1.width(), W2 = RC2.width();
+  for (uint16_t i = 0, w = std::min(W1, W2); i < w; ++i) {
+    const BitTracker::BitValue &V1 = RC1[i], &V2 = RC2[i];
+    if (V1 != V2)
+      return BitOrd(V1, V2);
+  }
+  // Cells are equal up until the common length.
+  if (W1 != W2)
+    return W1 < W2;
+
+  return BitOrd.BaseOrd[VR1] < BitOrd.BaseOrd[VR2];
+}
+
+bool RegisterCellBitCompareSel::operator() (unsigned VR1, unsigned VR2) const {
+  if (VR1 == VR2)
+    return false;
+  const BitTracker::RegisterCell &RC1 = CM.lookup(VR1);
+  const BitTracker::RegisterCell &RC2 = CM.lookup(VR2);
+  uint16_t W1 = RC1.width(), W2 = RC2.width();
+  uint16_t Bit1 = (VR1 == SelR) ? SelB : BitN;
+  uint16_t Bit2 = (VR2 == SelR) ? SelB : BitN;
+  // If Bit1 exceeds the width of VR1, then:
+  // - return false, if at the same time Bit2 exceeds VR2, or
+  // - return true, otherwise.
+  // (I.e. "a bit value that does not exist is less than any bit value
+  // that does exist".)
+  if (W1 <= Bit1)
+    return Bit2 < W2;
+  // If Bit1 is within VR1, but Bit2 is not within VR2, return false.
+  if (W2 <= Bit2)
+    return false;
+
+  const BitTracker::BitValue &V1 = RC1[Bit1], V2 = RC2[Bit2];
+  if (V1 != V2)
+    return BitOrd(V1, V2);
+  return false;
+}
+
+namespace {
+
+  class OrderedRegisterList {
+    typedef std::vector<unsigned> ListType;
+
+  public:
+    OrderedRegisterList(const RegisterOrdering &RO) : Ord(RO) {}
+
+    void insert(unsigned VR);
+    void remove(unsigned VR);
+
+    unsigned operator[](unsigned Idx) const {
+      assert(Idx < Seq.size());
+      return Seq[Idx];
+    }
+
+    unsigned size() const {
+      return Seq.size();
+    }
+
+    typedef ListType::iterator iterator;
+    typedef ListType::const_iterator const_iterator;
+    iterator begin() { return Seq.begin(); }
+    iterator end() { return Seq.end(); }
+    const_iterator begin() const { return Seq.begin(); }
+    const_iterator end() const { return Seq.end(); }
+
+    // Convenience function to convert an iterator to the corresponding index.
+    unsigned idx(iterator It) const { return It-begin(); }
+
+  private:
+    ListType Seq;
+    const RegisterOrdering &Ord;
+  };
+
+  struct PrintORL {
+    PrintORL(const OrderedRegisterList &L, const TargetRegisterInfo *RI)
+      : RL(L), TRI(RI) {}
+
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P);
+
+  private:
+    const OrderedRegisterList &RL;
+    const TargetRegisterInfo *TRI;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PrintORL &P) {
+    OS << '(';
+    OrderedRegisterList::const_iterator B = P.RL.begin(), E = P.RL.end();
+    for (OrderedRegisterList::const_iterator I = B; I != E; ++I) {
+      if (I != B)
+        OS << ", ";
+      OS << PrintReg(*I, P.TRI);
+    }
+    OS << ')';
+    return OS;
+  }
+
+} // end anonymous namespace
+
+void OrderedRegisterList::insert(unsigned VR) {
+  iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+  if (L == Seq.end())
+    Seq.push_back(VR);
+  else
+    Seq.insert(L, VR);
+}
+
+void OrderedRegisterList::remove(unsigned VR) {
+  iterator L = std::lower_bound(Seq.begin(), Seq.end(), VR, Ord);
+  assert(L != Seq.end());
+  Seq.erase(L);
+}
+
+namespace {
+
+  // A record of the insert form. The fields correspond to the operands
+  // of the "insert" instruction:
+  // ... = insert(SrcR, InsR, #Wdh, #Off)
+  struct IFRecord {
+    IFRecord(unsigned SR = 0, unsigned IR = 0, uint16_t W = 0, uint16_t O = 0)
+      : SrcR(SR), InsR(IR), Wdh(W), Off(O) {}
+
+    unsigned SrcR, InsR;
+    uint16_t Wdh, Off;
+  };
+
+  struct PrintIFR {
+    PrintIFR(const IFRecord &R, const TargetRegisterInfo *RI)
+      : IFR(R), TRI(RI) {}
+
+  private:
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P);
+
+    const IFRecord &IFR;
+    const TargetRegisterInfo *TRI;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PrintIFR &P) {
+    unsigned SrcR = P.IFR.SrcR, InsR = P.IFR.InsR;
+    OS << '(' << PrintReg(SrcR, P.TRI) << ',' << PrintReg(InsR, P.TRI)
+       << ",#" << P.IFR.Wdh << ",#" << P.IFR.Off << ')';
+    return OS;
+  }
+
+  typedef std::pair<IFRecord,RegisterSet> IFRecordWithRegSet;
+
+} // end anonymous namespace
+
+namespace llvm {
+
+  void initializeHexagonGenInsertPass(PassRegistry&);
+  FunctionPass *createHexagonGenInsert();
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonGenInsert : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenInsert() : MachineFunctionPass(ID), HII(nullptr), HRI(nullptr) {
+      initializeHexagonGenInsertPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate \"insert\" instructions";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    typedef DenseMap<std::pair<unsigned,unsigned>,unsigned> PairMapType;
+
+    void buildOrderingMF(RegisterOrdering &RO) const;
+    void buildOrderingBT(RegisterOrdering &RB, RegisterOrdering &RO) const;
+    bool isIntClass(const TargetRegisterClass *RC) const;
+    bool isConstant(unsigned VR) const;
+    bool isSmallConstant(unsigned VR) const;
+    bool isValidInsertForm(unsigned DstR, unsigned SrcR, unsigned InsR,
+          uint16_t L, uint16_t S) const;
+    bool findSelfReference(unsigned VR) const;
+    bool findNonSelfReference(unsigned VR) const;
+    void getInstrDefs(const MachineInstr *MI, RegisterSet &Defs) const;
+    void getInstrUses(const MachineInstr *MI, RegisterSet &Uses) const;
+    unsigned distance(const MachineBasicBlock *FromB,
+          const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+          PairMapType &M) const;
+    unsigned distance(MachineBasicBlock::const_iterator FromI,
+          MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+          PairMapType &M) const;
+    bool findRecordInsertForms(unsigned VR, OrderedRegisterList &AVs);
+    void collectInBlock(MachineBasicBlock *B, OrderedRegisterList &AVs);
+    void findRemovableRegisters(unsigned VR, IFRecord IF,
+          RegisterSet &RMs) const;
+    void computeRemovableRegisters();
+
+    void pruneEmptyLists();
+    void pruneCoveredSets(unsigned VR);
+    void pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO, PairMapType &M);
+    void pruneRegCopies(unsigned VR);
+    void pruneCandidates();
+    void selectCandidates();
+    bool generateInserts();
+
+    bool removeDeadCode(MachineDomTreeNode *N);
+
+    // IFRecord coupled with a set of potentially removable registers:
+    typedef std::vector<IFRecordWithRegSet> IFListType;
+    typedef DenseMap<unsigned,IFListType> IFMapType;  // vreg -> IFListType
+
+    void dump_map() const;
+
+    const HexagonInstrInfo *HII;
+    const HexagonRegisterInfo *HRI;
+
+    MachineFunction *MFN;
+    MachineRegisterInfo *MRI;
+    MachineDominatorTree *MDT;
+    CellMapShadow *CMS;
+
+    RegisterOrdering BaseOrd;
+    RegisterOrdering CellOrd;
+    IFMapType IFMap;
+  };
+
+  char HexagonGenInsert::ID = 0;
+
+} // end anonymous namespace
+
+void HexagonGenInsert::dump_map() const {
+  typedef IFMapType::const_iterator iterator;
+  for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    dbgs() << "  " << PrintReg(I->first, HRI) << ":\n";
+    const IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      dbgs() << "    " << PrintIFR(LL[i].first, HRI) << ", "
+             << PrintRegSet(LL[i].second, HRI) << '\n';
+  }
+}
+
+void HexagonGenInsert::buildOrderingMF(RegisterOrdering &RO) const {
+  unsigned Index = 0;
+  typedef MachineFunction::const_iterator mf_iterator;
+  for (mf_iterator A = MFN->begin(), Z = MFN->end(); A != Z; ++A) {
+    const MachineBasicBlock &B = *A;
+    if (!CMS->BT.reached(&B))
+      continue;
+    typedef MachineBasicBlock::const_iterator mb_iterator;
+    for (mb_iterator I = B.begin(), E = B.end(); I != E; ++I) {
+      const MachineInstr *MI = &*I;
+      for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isDef()) {
+          unsigned R = MO.getReg();
+          assert(MO.getSubReg() == 0 && "Unexpected subregister in definition");
+          if (TargetRegisterInfo::isVirtualRegister(R))
+            RO.insert(std::make_pair(R, Index++));
+        }
+      }
+    }
+  }
+  // Since some virtual registers may have had their def and uses eliminated,
+  // they are no longer referenced in the code, and so they will not appear
+  // in the map.
+}
+
+void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
+      RegisterOrdering &RO) const {
+  // Create a vector of all virtual registers (collect them from the base
+  // ordering RB), and then sort it using the RegisterCell comparator.
+  BitValueOrdering BVO(RB);
+  RegisterCellLexCompare LexCmp(BVO, *CMS);
+  typedef std::vector<unsigned> SortableVectorType;
+  SortableVectorType VRs;
+  for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
+    VRs.push_back(I->first);
+  std::sort(VRs.begin(), VRs.end(), LexCmp);
+  // Transfer the results to the outgoing register ordering.
+  for (unsigned i = 0, n = VRs.size(); i < n; ++i)
+    RO.insert(std::make_pair(VRs[i], i));
+}
+
+inline bool HexagonGenInsert::isIntClass(const TargetRegisterClass *RC) const {
+  return RC == &Hexagon::IntRegsRegClass || RC == &Hexagon::DoubleRegsRegClass;
+}
+
+bool HexagonGenInsert::isConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(0) || BV.is(1))
+      continue;
+    return false;
+  }
+  return true;
+}
+
+bool HexagonGenInsert::isSmallConstant(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+  if (W > 64)
+    return false;
+  uint64_t V = 0, B = 1;
+  for (uint16_t i = 0; i < W; ++i) {
+    const BitTracker::BitValue &BV = RC[i];
+    if (BV.is(1))
+      V |= B;
+    else if (!BV.is(0))
+      return false;
+    B <<= 1;
+  }
+
+  // For 32-bit registers, consider: Rd = #s16.
+  if (W == 32)
+    return isInt<16>(V);
+
+  // For 64-bit registers, it's Rdd = #s8 or Rdd = combine(#s8,#s8)
+  return isInt<8>(Lo_32(V)) && isInt<8>(Hi_32(V));
+}
+
+bool HexagonGenInsert::isValidInsertForm(unsigned DstR, unsigned SrcR,
+      unsigned InsR, uint16_t L, uint16_t S) const {
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstR);
+  const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcR);
+  const TargetRegisterClass *InsRC = MRI->getRegClass(InsR);
+  // Only integet (32-/64-bit) register classes.
+  if (!isIntClass(DstRC) || !isIntClass(SrcRC) || !isIntClass(InsRC))
+    return false;
+  // The "source" register must be of the same class as DstR.
+  if (DstRC != SrcRC)
+    return false;
+  if (DstRC == InsRC)
+    return true;
+  // A 64-bit register can only be generated from other 64-bit registers.
+  if (DstRC == &Hexagon::DoubleRegsRegClass)
+    return false;
+  // Otherwise, the L and S cannot span 32-bit word boundary.
+  if (S < 32 && S+L > 32)
+    return false;
+  return true;
+}
+
+bool HexagonGenInsert::findSelfReference(unsigned VR) const {
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+    const BitTracker::BitValue &V = RC[i];
+    if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == VR)
+      return true;
+  }
+  return false;
+}
+
+bool HexagonGenInsert::findNonSelfReference(unsigned VR) const {
+  BitTracker::RegisterCell RC = CMS->lookup(VR);
+  for (uint16_t i = 0, w = RC.width(); i < w; ++i) {
+    const BitTracker::BitValue &V = RC[i];
+    if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != VR)
+      return true;
+  }
+  return false;
+}
+
+void HexagonGenInsert::getInstrDefs(const MachineInstr *MI,
+      RegisterSet &Defs) const {
+  for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    Defs.insert(R);
+  }
+}
+
+void HexagonGenInsert::getInstrUses(const MachineInstr *MI,
+      RegisterSet &Uses) const {
+  for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned R = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    Uses.insert(R);
+  }
+}
+
+unsigned HexagonGenInsert::distance(const MachineBasicBlock *FromB,
+      const MachineBasicBlock *ToB, const UnsignedMap &RPO,
+      PairMapType &M) const {
+  // Forward distance from the end of a block to the beginning of it does
+  // not make sense. This function should not be called with FromB == ToB.
+  assert(FromB != ToB);
+
+  unsigned FromN = FromB->getNumber(), ToN = ToB->getNumber();
+  // If we have already computed it, return the cached result.
+  PairMapType::iterator F = M.find(std::make_pair(FromN, ToN));
+  if (F != M.end())
+    return F->second;
+  unsigned ToRPO = RPO.lookup(ToN);
+
+  unsigned MaxD = 0;
+  typedef MachineBasicBlock::const_pred_iterator pred_iterator;
+  for (pred_iterator I = ToB->pred_begin(), E = ToB->pred_end(); I != E; ++I) {
+    const MachineBasicBlock *PB = *I;
+    // Skip back edges. Also, if FromB is a predecessor of ToB, the distance
+    // along that path will be 0, and we don't need to do any calculations
+    // on it.
+    if (PB == FromB || RPO.lookup(PB->getNumber()) >= ToRPO)
+      continue;
+    unsigned D = PB->size() + distance(FromB, PB, RPO, M);
+    if (D > MaxD)
+      MaxD = D;
+  }
+
+  // Memoize the result for later lookup.
+  M.insert(std::make_pair(std::make_pair(FromN, ToN), MaxD));
+  return MaxD;
+}
+
+unsigned HexagonGenInsert::distance(MachineBasicBlock::const_iterator FromI,
+      MachineBasicBlock::const_iterator ToI, const UnsignedMap &RPO,
+      PairMapType &M) const {
+  const MachineBasicBlock *FB = FromI->getParent(), *TB = ToI->getParent();
+  if (FB == TB)
+    return std::distance(FromI, ToI);
+  unsigned D1 = std::distance(TB->begin(), ToI);
+  unsigned D2 = distance(FB, TB, RPO, M);
+  unsigned D3 = std::distance(FromI, FB->end());
+  return D1+D2+D3;
+}
+
+bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
+      OrderedRegisterList &AVs) {
+  if (isDebug()) {
+    dbgs() << __func__ << ": " << PrintReg(VR, HRI)
+           << "  AVs: " << PrintORL(AVs, HRI) << "\n";
+  }
+  if (AVs.size() == 0)
+    return false;
+
+  typedef OrderedRegisterList::iterator iterator;
+  BitValueOrdering BVO(BaseOrd);
+  const BitTracker::RegisterCell &RC = CMS->lookup(VR);
+  uint16_t W = RC.width();
+
+  typedef std::pair<unsigned,uint16_t> RSRecord;  // (reg,shift)
+  typedef std::vector<RSRecord> RSListType;
+  // Have a map, with key being the matching prefix length, and the value
+  // being the list of pairs (R,S), where R's prefix matches VR at S.
+  // (DenseMap<uint16_t,RSListType> fails to instantiate.)
+  typedef DenseMap<unsigned,RSListType> LRSMapType;
+  LRSMapType LM;
+
+  // Conceptually, rotate the cell RC right (i.e. towards the LSB) by S,
+  // and find matching prefixes from AVs with the rotated RC. Such a prefix
+  // would match a string of bits (of length L) in RC starting at S.
+  for (uint16_t S = 0; S < W; ++S) {
+    iterator B = AVs.begin(), E = AVs.end();
+    // The registers in AVs are ordered according to the lexical order of
+    // the corresponding register cells. This means that the range of regis-
+    // ters in AVs that match a prefix of length L+1 will be contained in
+    // the range that matches a prefix of length L. This means that we can
+    // keep narrowing the search space as the prefix length goes up. This
+    // helps reduce the overall complexity of the search.
+    uint16_t L;
+    for (L = 0; L < W-S; ++L) {
+      // Compare against VR's bits starting at S, which emulates rotation
+      // of VR by S.
+      RegisterCellBitCompareSel RCB(VR, S+L, L, BVO, *CMS);
+      iterator NewB = std::lower_bound(B, E, VR, RCB);
+      iterator NewE = std::upper_bound(NewB, E, VR, RCB);
+      // For the registers that are eliminated from the next range, L is
+      // the longest prefix matching VR at position S (their prefixes
+      // differ from VR at S+L). If L>0, record this information for later
+      // use.
+      if (L > 0) {
+        for (iterator I = B; I != NewB; ++I)
+          LM[L].push_back(std::make_pair(*I, S));
+        for (iterator I = NewE; I != E; ++I)
+          LM[L].push_back(std::make_pair(*I, S));
+      }
+      B = NewB, E = NewE;
+      if (B == E)
+        break;
+    }
+    // Record the final register range. If this range is non-empty, then
+    // L=W-S.
+    assert(B == E || L == W-S);
+    if (B != E) {
+      for (iterator I = B; I != E; ++I)
+        LM[L].push_back(std::make_pair(*I, S));
+      // If B!=E, then we found a range of registers whose prefixes cover the
+      // rest of VR from position S. There is no need to further advance S.
+      break;
+    }
+  }
+
+  if (isDebug()) {
+    dbgs() << "Prefixes matching register " << PrintReg(VR, HRI) << "\n";
+    for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) {
+      dbgs() << "  L=" << I->first << ':';
+      const RSListType &LL = I->second;
+      for (unsigned i = 0, n = LL.size(); i < n; ++i)
+        dbgs() << " (" << PrintReg(LL[i].first, HRI) << ",@"
+               << LL[i].second << ')';
+      dbgs() << '\n';
+    }
+  }
+
+  bool Recorded = false;
+
+  for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) {
+    unsigned SrcR = *I;
+    int FDi = -1, LDi = -1;   // First/last different bit.
+    const BitTracker::RegisterCell &AC = CMS->lookup(SrcR);
+    uint16_t AW = AC.width();
+    for (uint16_t i = 0, w = std::min(W, AW); i < w; ++i) {
+      if (RC[i] == AC[i])
+        continue;
+      if (FDi == -1)
+        FDi = i;
+      LDi = i;
+    }
+    if (FDi == -1)
+      continue;  // TODO (future): Record identical registers.
+    // Look for a register whose prefix could patch the range [FD..LD]
+    // where VR and SrcR differ.
+    uint16_t FD = FDi, LD = LDi;  // Switch to unsigned type.
+    uint16_t MinL = LD-FD+1;
+    for (uint16_t L = MinL; L < W; ++L) {
+      LRSMapType::iterator F = LM.find(L);
+      if (F == LM.end())
+        continue;
+      RSListType &LL = F->second;
+      for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+        uint16_t S = LL[i].second;
+        // MinL is the minimum length of the prefix. Any length above MinL
+        // allows some flexibility as to where the prefix can start:
+        // given the extra length EL=L-MinL, the prefix must start between
+        // max(0,FD-EL) and FD.
+        if (S > FD)   // Starts too late.
+          continue;
+        uint16_t EL = L-MinL;
+        uint16_t LowS = (EL < FD) ? FD-EL : 0;
+        if (S < LowS) // Starts too early.
+          continue;
+        unsigned InsR = LL[i].first;
+        if (!isValidInsertForm(VR, SrcR, InsR, L, S))
+          continue;
+        if (isDebug()) {
+          dbgs() << PrintReg(VR, HRI) << " = insert(" << PrintReg(SrcR, HRI)
+                 << ',' << PrintReg(InsR, HRI) << ",#" << L << ",#"
+                 << S << ")\n";
+        }
+        IFRecordWithRegSet RR(IFRecord(SrcR, InsR, L, S), RegisterSet());
+        IFMap[VR].push_back(RR);
+        Recorded = true;
+      }
+    }
+  }
+
+  return Recorded;
+}
+
+void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
+      OrderedRegisterList &AVs) {
+  if (isDebug())
+    dbgs() << "visiting block BB#" << B->getNumber() << "\n";
+
+  // First, check if this block is reachable at all. If not, the bit tracker
+  // will not have any information about registers in it.
+  if (!CMS->BT.reached(B))
+    return;
+
+  bool DoConst = OptConst;
+  // Keep a separate set of registers defined in this block, so that we
+  // can remove them from the list of available registers once all DT
+  // successors have been processed.
+  RegisterSet BlockDefs, InsDefs;
+  for (MachineBasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    InsDefs.clear();
+    getInstrDefs(MI, InsDefs);
+    // Leave those alone. They are more transparent than "insert".
+    bool Skip = MI->isCopy() || MI->isRegSequence();
+
+    if (!Skip) {
+      // Visit all defined registers, and attempt to find the corresponding
+      // "insert" representations.
+      for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR)) {
+        // Do not collect registers that are known to be compile-time cons-
+        // tants, unless requested.
+        if (!DoConst && isConstant(VR))
+          continue;
+        // If VR's cell contains a reference to VR, then VR cannot be defined
+        // via "insert". If VR is a constant that can be generated in a single
+        // instruction (without constant extenders), generating it via insert
+        // makes no sense.
+        if (findSelfReference(VR) || isSmallConstant(VR))
+          continue;
+
+        findRecordInsertForms(VR, AVs);
+      }
+    }
+
+    // Insert the defined registers into the list of available registers
+    // after they have been processed.
+    for (unsigned VR = InsDefs.find_first(); VR; VR = InsDefs.find_next(VR))
+      AVs.insert(VR);
+    BlockDefs.insert(InsDefs);
+  }
+
+  MachineDomTreeNode *N = MDT->getNode(B);
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  typedef GTN::ChildIteratorType ChildIter;
+  for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
+    MachineBasicBlock *SB = (*I)->getBlock();
+    collectInBlock(SB, AVs);
+  }
+
+  for (unsigned VR = BlockDefs.find_first(); VR; VR = BlockDefs.find_next(VR))
+    AVs.remove(VR);
+}
+
+void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
+      RegisterSet &RMs) const {
+  // For a given register VR and a insert form, find the registers that are
+  // used by the current definition of VR, and which would no longer be
+  // needed for it after the definition of VR is replaced with the insert
+  // form. These are the registers that could potentially become dead.
+  RegisterSet Regs[2];
+
+  unsigned S = 0;  // Register set selector.
+  Regs[S].insert(VR);
+
+  while (!Regs[S].empty()) {
+    // Breadth-first search.
+    unsigned OtherS = 1-S;
+    Regs[OtherS].clear();
+    for (unsigned R = Regs[S].find_first(); R; R = Regs[S].find_next(R)) {
+      Regs[S].remove(R);
+      if (R == IF.SrcR || R == IF.InsR)
+        continue;
+      // Check if a given register has bits that are references to any other
+      // registers. This is to detect situations where the instruction that
+      // defines register R takes register Q as an operand, but R itself does
+      // not contain any bits from Q. Loads are examples of how this could
+      // happen:
+      //   R = load Q
+      // In this case (assuming we do not have any knowledge about the loaded
+      // value), we must not treat R as a "conveyance" of the bits from Q.
+      // (The information in BT about R's bits would have them as constants,
+      // in case of zero-extending loads, or refs to R.)
+      if (!findNonSelfReference(R))
+        continue;
+      RMs.insert(R);
+      const MachineInstr *DefI = MRI->getVRegDef(R);
+      assert(DefI);
+      // Do not iterate past PHI nodes to avoid infinite loops. This can
+      // make the final set a bit less accurate, but the removable register
+      // sets are an approximation anyway.
+      if (DefI->isPHI())
+        continue;
+      getInstrUses(DefI, Regs[OtherS]);
+    }
+    S = OtherS;
+  }
+  // The register VR is added to the list as a side-effect of the algorithm,
+  // but it is not "potentially removable". A potentially removable register
+  // is one that may become unused (dead) after conversion to the insert form
+  // IF, and obviously VR (or its replacement) will not become dead by apply-
+  // ing IF.
+  RMs.remove(VR);
+}
+
+void HexagonGenInsert::computeRemovableRegisters() {
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    IFListType &LL = I->second;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      findRemovableRegisters(I->first, LL[i].first, LL[i].second);
+  }
+}
+
+void HexagonGenInsert::pruneEmptyLists() {
+  // Remove all entries from the map, where the register has no insert forms
+  // associated with it.
+  typedef SmallVector<IFMapType::iterator,16> IterListType;
+  IterListType Prune;
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    if (I->second.empty())
+      Prune.push_back(I);
+  }
+  for (unsigned i = 0, n = Prune.size(); i < n; ++i)
+    IFMap.erase(Prune[i]);
+}
+
+void HexagonGenInsert::pruneCoveredSets(unsigned VR) {
+  IFMapType::iterator F = IFMap.find(VR);
+  assert(F != IFMap.end());
+  IFListType &LL = F->second;
+
+  // First, examine the IF candidates for register VR whose removable-regis-
+  // ter sets are empty. This means that a given candidate will not help eli-
+  // minate any registers, but since "insert" is not a constant-extendable
+  // instruction, using such a candidate may reduce code size if the defini-
+  // tion of VR is constant-extended.
+  // If there exists a candidate with a non-empty set, the ones with empty
+  // sets will not be used and can be removed.
+  MachineInstr *DefVR = MRI->getVRegDef(VR);
+  bool DefEx = HII->isConstExtended(*DefVR);
+  bool HasNE = false;
+  for (unsigned i = 0, n = LL.size(); i < n; ++i) {
+    if (LL[i].second.empty())
+      continue;
+    HasNE = true;
+    break;
+  }
+  if (!DefEx || HasNE) {
+    // The definition of VR is not constant-extended, or there is a candidate
+    // with a non-empty set. Remove all candidates with empty sets.
+    auto IsEmpty = [] (const IFRecordWithRegSet &IR) -> bool {
+      return IR.second.empty();
+    };
+    auto End = llvm::remove_if(LL, IsEmpty);
+    if (End != LL.end())
+      LL.erase(End, LL.end());
+  } else {
+    // The definition of VR is constant-extended, and all candidates have
+    // empty removable-register sets. Pick the maximum candidate, and remove
+    // all others. The "maximum" does not have any special meaning here, it
+    // is only so that the candidate that will remain on the list is selec-
+    // ted deterministically.
+    IFRecord MaxIF = LL[0].first;
+    for (unsigned i = 1, n = LL.size(); i < n; ++i) {
+      // If LL[MaxI] < LL[i], then MaxI = i.
+      const IFRecord &IF = LL[i].first;
+      unsigned M0 = BaseOrd[MaxIF.SrcR], M1 = BaseOrd[MaxIF.InsR];
+      unsigned R0 = BaseOrd[IF.SrcR], R1 = BaseOrd[IF.InsR];
+      if (M0 > R0)
+        continue;
+      if (M0 == R0) {
+        if (M1 > R1)
+          continue;
+        if (M1 == R1) {
+          if (MaxIF.Wdh > IF.Wdh)
+            continue;
+          if (MaxIF.Wdh == IF.Wdh && MaxIF.Off >= IF.Off)
+            continue;
+        }
+      }
+      // MaxIF < IF.
+      MaxIF = IF;
+    }
+    // Remove everything except the maximum candidate. All register sets
+    // are empty, so no need to preserve anything.
+    LL.clear();
+    LL.push_back(std::make_pair(MaxIF, RegisterSet()));
+  }
+
+  // Now, remove those whose sets of potentially removable registers are
+  // contained in another IF candidate for VR. For example, given these
+  // candidates for vreg45,
+  //   %vreg45:
+  //     (%vreg44,%vreg41,#9,#8), { %vreg42 }
+  //     (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 }
+  // remove the first one, since it is contained in the second one.
+  for (unsigned i = 0, n = LL.size(); i < n; ) {
+    const RegisterSet &RMi = LL[i].second;
+    unsigned j = 0;
+    while (j < n) {
+      if (j != i && LL[j].second.includes(RMi))
+        break;
+      j++;
+    }
+    if (j == n) {   // RMi not contained in anything else.
+      i++;
+      continue;
+    }
+    LL.erase(LL.begin()+i);
+    n = LL.size();
+  }
+}
+
+void HexagonGenInsert::pruneUsesTooFar(unsigned VR, const UnsignedMap &RPO,
+      PairMapType &M) {
+  IFMapType::iterator F = IFMap.find(VR);
+  assert(F != IFMap.end());
+  IFListType &LL = F->second;
+  unsigned Cutoff = VRegDistCutoff;
+  const MachineInstr *DefV = MRI->getVRegDef(VR);
+
+  for (unsigned i = LL.size(); i > 0; --i) {
+    unsigned SR = LL[i-1].first.SrcR, IR = LL[i-1].first.InsR;
+    const MachineInstr *DefS = MRI->getVRegDef(SR);
+    const MachineInstr *DefI = MRI->getVRegDef(IR);
+    unsigned DSV = distance(DefS, DefV, RPO, M);
+    if (DSV < Cutoff) {
+      unsigned DIV = distance(DefI, DefV, RPO, M);
+      if (DIV < Cutoff)
+        continue;
+    }
+    LL.erase(LL.begin()+(i-1));
+  }
+}
+
+void HexagonGenInsert::pruneRegCopies(unsigned VR) {
+  IFMapType::iterator F = IFMap.find(VR);
+  assert(F != IFMap.end());
+  IFListType &LL = F->second;
+
+  auto IsCopy = [] (const IFRecordWithRegSet &IR) -> bool {
+    return IR.first.Wdh == 32 && (IR.first.Off == 0 || IR.first.Off == 32);
+  };
+  auto End = llvm::remove_if(LL, IsCopy);
+  if (End != LL.end())
+    LL.erase(End, LL.end());
+}
+
+void HexagonGenInsert::pruneCandidates() {
+  // Remove candidates that are not beneficial, regardless of the final
+  // selection method.
+  // First, remove candidates whose potentially removable set is a subset
+  // of another candidate's set.
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+    pruneCoveredSets(I->first);
+
+  UnsignedMap RPO;
+  typedef ReversePostOrderTraversal<const MachineFunction*> RPOTType;
+  RPOTType RPOT(MFN);
+  unsigned RPON = 0;
+  for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    RPO[(*I)->getNumber()] = RPON++;
+
+  PairMapType Memo; // Memoization map for distance calculation.
+  // Remove candidates that would use registers defined too far away.
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+    pruneUsesTooFar(I->first, RPO, Memo);
+
+  pruneEmptyLists();
+
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
+    pruneRegCopies(I->first);
+}
+
+namespace {
+
+  // Class for comparing IF candidates for registers that have multiple of
+  // them. The smaller the candidate, according to this ordering, the better.
+  // First, compare the number of zeros in the associated potentially remova-
+  // ble register sets. "Zero" indicates that the register is very likely to
+  // become dead after this transformation.
+  // Second, compare "averages", i.e. use-count per size. The lower wins.
+  // After that, it does not really matter which one is smaller. Resolve
+  // the tie in some deterministic way.
+  struct IFOrdering {
+    IFOrdering(const UnsignedMap &UC, const RegisterOrdering &BO)
+      : UseC(UC), BaseOrd(BO) {}
+
+    bool operator() (const IFRecordWithRegSet &A,
+          const IFRecordWithRegSet &B) const;
+
+  private:
+    void stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+          unsigned &Sum) const;
+
+    const UnsignedMap &UseC;
+    const RegisterOrdering &BaseOrd;
+  };
+
+} // end anonymous namespace
+
+bool IFOrdering::operator() (const IFRecordWithRegSet &A,
+      const IFRecordWithRegSet &B) const {
+  unsigned SizeA = 0, ZeroA = 0, SumA = 0;
+  unsigned SizeB = 0, ZeroB = 0, SumB = 0;
+  stats(A.second, SizeA, ZeroA, SumA);
+  stats(B.second, SizeB, ZeroB, SumB);
+
+  // We will pick the minimum element. The more zeros, the better.
+  if (ZeroA != ZeroB)
+    return ZeroA > ZeroB;
+  // Compare SumA/SizeA with SumB/SizeB, lower is better.
+  uint64_t AvgA = SumA*SizeB, AvgB = SumB*SizeA;
+  if (AvgA != AvgB)
+    return AvgA < AvgB;
+
+  // The sets compare identical so far. Resort to comparing the IF records.
+  // The actual values don't matter, this is only for determinism.
+  unsigned OSA = BaseOrd[A.first.SrcR], OSB = BaseOrd[B.first.SrcR];
+  if (OSA != OSB)
+    return OSA < OSB;
+  unsigned OIA = BaseOrd[A.first.InsR], OIB = BaseOrd[B.first.InsR];
+  if (OIA != OIB)
+    return OIA < OIB;
+  if (A.first.Wdh != B.first.Wdh)
+    return A.first.Wdh < B.first.Wdh;
+  return A.first.Off < B.first.Off;
+}
+
+void IFOrdering::stats(const RegisterSet &Rs, unsigned &Size, unsigned &Zero,
+      unsigned &Sum) const {
+  for (unsigned R = Rs.find_first(); R; R = Rs.find_next(R)) {
+    UnsignedMap::const_iterator F = UseC.find(R);
+    assert(F != UseC.end());
+    unsigned UC = F->second;
+    if (UC == 0)
+      Zero++;
+    Sum += UC;
+    Size++;
+  }
+}
+
+void HexagonGenInsert::selectCandidates() {
+  // Some registers may have multiple valid candidates. Pick the best one
+  // (or decide not to use any).
+
+  // Compute the "removability" measure of R:
+  // For each potentially removable register R, record the number of regis-
+  // ters with IF candidates, where R appears in at least one set.
+  RegisterSet AllRMs;
+  UnsignedMap UseC, RemC;
+  IFMapType::iterator End = IFMap.end();
+
+  for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+    const IFListType &LL = I->second;
+    RegisterSet TT;
+    for (unsigned i = 0, n = LL.size(); i < n; ++i)
+      TT.insert(LL[i].second);
+    for (unsigned R = TT.find_first(); R; R = TT.find_next(R))
+      RemC[R]++;
+    AllRMs.insert(TT);
+  }
+
+  for (unsigned R = AllRMs.find_first(); R; R = AllRMs.find_next(R)) {
+    typedef MachineRegisterInfo::use_nodbg_iterator use_iterator;
+    typedef SmallSet<const MachineInstr*,16> InstrSet;
+    InstrSet UIs;
+    // Count as the number of instructions in which R is used, not the
+    // number of operands.
+    use_iterator E = MRI->use_nodbg_end();
+    for (use_iterator I = MRI->use_nodbg_begin(R); I != E; ++I)
+      UIs.insert(I->getParent());
+    unsigned C = UIs.size();
+    // Calculate a measure, which is the number of instructions using R,
+    // minus the "removability" count computed earlier.
+    unsigned D = RemC[R];
+    UseC[R] = (C > D) ? C-D : 0;  // doz
+  }
+
+  bool SelectAll0 = OptSelectAll0, SelectHas0 = OptSelectHas0;
+  if (!SelectAll0 && !SelectHas0)
+    SelectAll0 = true;
+
+  // The smaller the number UseC for a given register R, the "less used"
+  // R is aside from the opportunities for removal offered by generating
+  // "insert" instructions.
+  // Iterate over the IF map, and for those registers that have multiple
+  // candidates, pick the minimum one according to IFOrdering.
+  IFOrdering IFO(UseC, BaseOrd);
+  for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+    IFListType &LL = I->second;
+    if (LL.empty())
+      continue;
+    // Get the minimum element, remember it and clear the list. If the
+    // element found is adequate, we will put it back on the list, other-
+    // wise the list will remain empty, and the entry for this register
+    // will be removed (i.e. this register will not be replaced by insert).
+    IFListType::iterator MinI = std::min_element(LL.begin(), LL.end(), IFO);
+    assert(MinI != LL.end());
+    IFRecordWithRegSet M = *MinI;
+    LL.clear();
+
+    // We want to make sure that this replacement will have a chance to be
+    // beneficial, and that means that we want to have indication that some
+    // register will be removed. The most likely registers to be eliminated
+    // are the use operands in the definition of I->first. Accept/reject a
+    // candidate based on how many of its uses it can potentially eliminate.
+
+    RegisterSet Us;
+    const MachineInstr *DefI = MRI->getVRegDef(I->first);
+    getInstrUses(DefI, Us);
+    bool Accept = false;
+
+    if (SelectAll0) {
+      bool All0 = true;
+      for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+        if (UseC[R] == 0)
+          continue;
+        All0 = false;
+        break;
+      }
+      Accept = All0;
+    } else if (SelectHas0) {
+      bool Has0 = false;
+      for (unsigned R = Us.find_first(); R; R = Us.find_next(R)) {
+        if (UseC[R] != 0)
+          continue;
+        Has0 = true;
+        break;
+      }
+      Accept = Has0;
+    }
+    if (Accept)
+      LL.push_back(M);
+  }
+
+  // Remove candidates that add uses of removable registers, unless the
+  // removable registers are among replacement candidates.
+  // Recompute the removable registers, since some candidates may have
+  // been eliminated.
+  AllRMs.clear();
+  for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+    const IFListType &LL = I->second;
+    if (!LL.empty())
+      AllRMs.insert(LL[0].second);
+  }
+  for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
+    IFListType &LL = I->second;
+    if (LL.empty())
+      continue;
+    unsigned SR = LL[0].first.SrcR, IR = LL[0].first.InsR;
+    if (AllRMs[SR] || AllRMs[IR])
+      LL.clear();
+  }
+
+  pruneEmptyLists();
+}
+
+bool HexagonGenInsert::generateInserts() {
+  // Create a new register for each one from IFMap, and store them in the
+  // map.
+  UnsignedMap RegMap;
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    unsigned VR = I->first;
+    const TargetRegisterClass *RC = MRI->getRegClass(VR);
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    RegMap[VR] = NewVR;
+  }
+
+  // We can generate the "insert" instructions using potentially stale re-
+  // gisters: SrcR and InsR for a given VR may be among other registers that
+  // are also replaced. This is fine, we will do the mass "rauw" a bit later.
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    MachineInstr *MI = MRI->getVRegDef(I->first);
+    MachineBasicBlock &B = *MI->getParent();
+    DebugLoc DL = MI->getDebugLoc();
+    unsigned NewR = RegMap[I->first];
+    bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass;
+    const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert)
+                               : HII->get(Hexagon::S2_insertp);
+    IFRecord IF = I->second[0].first;
+    unsigned Wdh = IF.Wdh, Off = IF.Off;
+    unsigned InsS = 0;
+    if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) {
+      InsS = Hexagon::isub_lo;
+      if (Off >= 32) {
+        InsS = Hexagon::isub_hi;
+        Off -= 32;
+      }
+    }
+    // Advance to the proper location for inserting instructions. This could
+    // be B.end().
+    MachineBasicBlock::iterator At = MI;
+    if (MI->isPHI())
+      At = B.getFirstNonPHI();
+
+    BuildMI(B, At, DL, D, NewR)
+      .addReg(IF.SrcR)
+      .addReg(IF.InsR, 0, InsS)
+      .addImm(Wdh)
+      .addImm(Off);
+
+    MRI->clearKillFlags(IF.SrcR);
+    MRI->clearKillFlags(IF.InsR);
+  }
+
+  for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+    MachineInstr *DefI = MRI->getVRegDef(I->first);
+    MRI->replaceRegWith(I->first, RegMap[I->first]);
+    DefI->eraseFromParent();
+  }
+
+  return true;
+}
+
+bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
+  bool Changed = false;
+  typedef GraphTraits<MachineDomTreeNode*> GTN;
+  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
+    Changed |= removeDeadCode(*I);
+
+  MachineBasicBlock *B = N->getBlock();
+  std::vector<MachineInstr*> Instrs;
+  for (auto I = B->rbegin(), E = B->rend(); I != E; ++I)
+    Instrs.push_back(&*I);
+
+  for (auto I = Instrs.begin(), E = Instrs.end(); I != E; ++I) {
+    MachineInstr *MI = *I;
+    unsigned Opc = MI->getOpcode();
+    // Do not touch lifetime markers. This is why the target-independent DCE
+    // cannot be used.
+    if (Opc == TargetOpcode::LIFETIME_START ||
+        Opc == TargetOpcode::LIFETIME_END)
+      continue;
+    bool Store = false;
+    if (MI->isInlineAsm() || !MI->isSafeToMove(nullptr, Store))
+      continue;
+
+    bool AllDead = true;
+    SmallVector<unsigned,2> Regs;
+    for (const MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned R = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(R) ||
+          !MRI->use_nodbg_empty(R)) {
+        AllDead = false;
+        break;
+      }
+      Regs.push_back(R);
+    }
+    if (!AllDead)
+      continue;
+
+    B->erase(MI);
+    for (unsigned I = 0, N = Regs.size(); I != N; ++I)
+      MRI->markUsesInDebugValueAsUndef(Regs[I]);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  bool Timing = OptTiming, TimingDetail = Timing && OptTimingDetail;
+  bool Changed = false;
+
+  // Sanity check: one, but not both.
+  assert(!OptSelectAll0 || !OptSelectHas0);
+
+  IFMap.clear();
+  BaseOrd.clear();
+  CellOrd.clear();
+
+  const auto &ST = MF.getSubtarget<HexagonSubtarget>();
+  HII = ST.getInstrInfo();
+  HRI = ST.getRegisterInfo();
+  MFN = &MF;
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+
+  // Clean up before any further processing, so that dead code does not
+  // get used in a newly generated "insert" instruction. Have a custom
+  // version of DCE that preserves lifetime markers. Without it, merging
+  // of stack objects can fail to recognize and merge disjoint objects
+  // leading to unnecessary stack growth.
+  Changed = removeDeadCode(MDT->getRootNode());
+
+  const HexagonEvaluator HE(*HRI, *MRI, *HII, MF);
+  BitTracker BTLoc(HE, MF);
+  BTLoc.trace(isDebug());
+  BTLoc.run();
+  CellMapShadow MS(BTLoc);
+  CMS = &MS;
+
+  buildOrderingMF(BaseOrd);
+  buildOrderingBT(BaseOrd, CellOrd);
+
+  if (isDebug()) {
+    dbgs() << "Cell ordering:\n";
+    for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end();
+        I != E; ++I) {
+      unsigned VR = I->first, Pos = I->second;
+      dbgs() << PrintReg(VR, HRI) << " -> " << Pos << "\n";
+    }
+  }
+
+  // Collect candidates for conversion into the insert forms.
+  MachineBasicBlock *RootB = MDT->getRoot();
+  OrderedRegisterList AvailR(CellOrd);
+
+  const char *const TGName = "hexinsert";
+  const char *const TGDesc = "Generate Insert Instructions";
+
+  {
+    NamedRegionTimer _T("collection", "collection", TGName, TGDesc,
+                        TimingDetail);
+    collectInBlock(RootB, AvailR);
+    // Complete the information gathered in IFMap.
+    computeRemovableRegisters();
+  }
+
+  if (isDebug()) {
+    dbgs() << "Candidates after collection:\n";
+    dump_map();
+  }
+
+  if (IFMap.empty())
+    return Changed;
+
+  {
+    NamedRegionTimer _T("pruning", "pruning", TGName, TGDesc, TimingDetail);
+    pruneCandidates();
+  }
+
+  if (isDebug()) {
+    dbgs() << "Candidates after pruning:\n";
+    dump_map();
+  }
+
+  if (IFMap.empty())
+    return Changed;
+
+  {
+    NamedRegionTimer _T("selection", "selection", TGName, TGDesc, TimingDetail);
+    selectCandidates();
+  }
+
+  if (isDebug()) {
+    dbgs() << "Candidates after selection:\n";
+    dump_map();
+  }
+
+  // Filter out vregs beyond the cutoff.
+  if (VRegIndexCutoff.getPosition()) {
+    unsigned Cutoff = VRegIndexCutoff;
+    typedef SmallVector<IFMapType::iterator,16> IterListType;
+    IterListType Out;
+    for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
+      unsigned Idx = TargetRegisterInfo::virtReg2Index(I->first);
+      if (Idx >= Cutoff)
+        Out.push_back(I);
+    }
+    for (unsigned i = 0, n = Out.size(); i < n; ++i)
+      IFMap.erase(Out[i]);
+  }
+  if (IFMap.empty())
+    return Changed;
+
+  {
+    NamedRegionTimer _T("generation", "generation", TGName, TGDesc,
+                        TimingDetail);
+    generateInserts();
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createHexagonGenInsert() {
+  return new HexagonGenInsert();
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert",
+  "Hexagon generate \"insert\" instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert",
+  "Hexagon generate \"insert\" instructions", false, false)
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
new file mode 100644
index 000000000000..a718df9c70ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -0,0 +1,351 @@
+//===--- HexagonGenMux.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// During instruction selection, MUX instructions are generated for
+// conditional assignments. Since such assignments often present an
+// opportunity to predicate instructions, HexagonExpandCondsets
+// expands MUXes into pairs of conditional transfers, and then proceeds
+// with predication of the producers/consumers of the registers involved.
+// This happens after exiting from the SSA form, but before the machine
+// instruction scheduler. After the scheduler and after the register
+// allocation there can be cases of pairs of conditional transfers
+// resulting from a MUX where neither of them was further predicated. If
+// these transfers are now placed far enough from the instruction defining
+// the predicate register, they cannot use the .new form. In such cases it
+// is better to collapse them back to a single MUX instruction.
+
+#define DEBUG_TYPE "hexmux"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <limits>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+namespace llvm {
+
+  FunctionPass *createHexagonGenMux();
+  void initializeHexagonGenMuxPass(PassRegistry& Registry);
+
+} // end namespace llvm
+
+namespace {
+
+  class HexagonGenMux : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenMux() : MachineFunctionPass(ID), HII(nullptr), HRI(nullptr) {
+      initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate mux instructions";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    const HexagonInstrInfo *HII;
+    const HexagonRegisterInfo *HRI;
+
+    struct CondsetInfo {
+      unsigned PredR = 0;
+      unsigned TrueX = std::numeric_limits<unsigned>::max();
+      unsigned FalseX = std::numeric_limits<unsigned>::max();
+
+      CondsetInfo() = default;
+    };
+
+    struct DefUseInfo {
+      BitVector Defs, Uses;
+
+      DefUseInfo() = default;
+      DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {}
+    };
+
+    struct MuxInfo {
+      MachineBasicBlock::iterator At;
+      unsigned DefR, PredR;
+      MachineOperand *SrcT, *SrcF;
+      MachineInstr *Def1, *Def2;
+
+      MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR,
+              MachineOperand *TOp, MachineOperand *FOp, MachineInstr &D1,
+              MachineInstr &D2)
+          : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(&D1),
+            Def2(&D2) {}
+    };
+
+    typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap;
+    typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap;
+    typedef SmallVector<MuxInfo,4> MuxInfoList;
+
+    bool isRegPair(unsigned Reg) const {
+      return Hexagon::DoubleRegsRegClass.contains(Reg);
+    }
+
+    void getSubRegs(unsigned Reg, BitVector &SRs) const;
+    void expandReg(unsigned Reg, BitVector &Set) const;
+    void getDefsUses(const MachineInstr *MI, BitVector &Defs,
+          BitVector &Uses) const;
+    void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+          DefUseInfoMap &DUM);
+    bool isCondTransfer(unsigned Opc) const;
+    unsigned getMuxOpcode(const MachineOperand &Src1,
+          const MachineOperand &Src2) const;
+    bool genMuxInBlock(MachineBasicBlock &B);
+  };
+
+  char HexagonGenMux::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonGenMux, "hexagon-mux",
+  "Hexagon generate mux instructions", false, false)
+
+void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const {
+  for (MCSubRegIterator I(Reg, HRI); I.isValid(); ++I)
+    SRs[*I] = true;
+}
+
+void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const {
+  if (isRegPair(Reg))
+    getSubRegs(Reg, Set);
+  else
+    Set[Reg] = true;
+}
+
+void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs,
+      BitVector &Uses) const {
+  // First, get the implicit defs and uses for this instruction.
+  unsigned Opc = MI->getOpcode();
+  const MCInstrDesc &D = HII->get(Opc);
+  if (const MCPhysReg *R = D.ImplicitDefs)
+    while (*R)
+      expandReg(*R++, Defs);
+  if (const MCPhysReg *R = D.ImplicitUses)
+    while (*R)
+      expandReg(*R++, Uses);
+
+  // Look over all operands, and collect explicit defs and uses.
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned R = MO.getReg();
+    BitVector &Set = MO.isDef() ? Defs : Uses;
+    expandReg(R, Set);
+  }
+}
+
+void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X,
+      DefUseInfoMap &DUM) {
+  unsigned Index = 0;
+  unsigned NR = HRI->getNumRegs();
+  BitVector Defs(NR), Uses(NR);
+
+  for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    I2X.insert(std::make_pair(MI, Index));
+    Defs.reset();
+    Uses.reset();
+    getDefsUses(MI, Defs, Uses);
+    DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses)));
+    Index++;
+  }
+}
+
+bool HexagonGenMux::isCondTransfer(unsigned Opc) const {
+  switch (Opc) {
+    case Hexagon::A2_tfrt:
+    case Hexagon::A2_tfrf:
+    case Hexagon::C2_cmoveit:
+    case Hexagon::C2_cmoveif:
+      return true;
+  }
+  return false;
+}
+
+unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1,
+      const MachineOperand &Src2) const {
+  bool IsReg1 = Src1.isReg(), IsReg2 = Src2.isReg();
+  if (IsReg1)
+    return IsReg2 ? Hexagon::C2_mux : Hexagon::C2_muxir;
+  if (IsReg2)
+    return Hexagon::C2_muxri;
+
+  // Neither is a register. The first source is extendable, but the second
+  // is not (s8).
+  if (Src2.isImm() && isInt<8>(Src2.getImm()))
+    return Hexagon::C2_muxii;
+
+  return 0;
+}
+
+bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
+  bool Changed = false;
+  InstrIndexMap I2X;
+  DefUseInfoMap DUM;
+  buildMaps(B, I2X, DUM);
+
+  typedef DenseMap<unsigned,CondsetInfo> CondsetMap;
+  CondsetMap CM;
+  MuxInfoList ML;
+
+  MachineBasicBlock::iterator NextI, End = B.end();
+  for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) {
+    MachineInstr *MI = &*I;
+    NextI = std::next(I);
+    unsigned Opc = MI->getOpcode();
+    if (!isCondTransfer(Opc))
+      continue;
+    unsigned DR = MI->getOperand(0).getReg();
+    if (isRegPair(DR))
+      continue;
+
+    unsigned PR = MI->getOperand(1).getReg();
+    unsigned Idx = I2X.lookup(MI);
+    CondsetMap::iterator F = CM.find(DR);
+    bool IfTrue = HII->isPredicatedTrue(Opc);
+
+    // If there is no record of a conditional transfer for this register,
+    // or the predicate register differs, create a new record for it.
+    if (F != CM.end() && F->second.PredR != PR) {
+      CM.erase(F);
+      F = CM.end();
+    }
+    if (F == CM.end()) {
+      auto It = CM.insert(std::make_pair(DR, CondsetInfo()));
+      F = It.first;
+      F->second.PredR = PR;
+    }
+    CondsetInfo &CI = F->second;
+    if (IfTrue)
+      CI.TrueX = Idx;
+    else
+      CI.FalseX = Idx;
+    if (CI.TrueX == std::numeric_limits<unsigned>::max() ||
+        CI.FalseX == std::numeric_limits<unsigned>::max())
+      continue;
+
+    // There is now a complete definition of DR, i.e. we have the predicate
+    // register, the definition if-true, and definition if-false.
+
+    // First, check if both definitions are far enough from the definition
+    // of the predicate register.
+    unsigned MinX = std::min(CI.TrueX, CI.FalseX);
+    unsigned MaxX = std::max(CI.TrueX, CI.FalseX);
+    unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0;
+    bool NearDef = false;
+    for (unsigned X = SearchX; X < MaxX; ++X) {
+      const DefUseInfo &DU = DUM.lookup(X);
+      if (!DU.Defs[PR])
+        continue;
+      NearDef = true;
+      break;
+    }
+    if (NearDef)
+      continue;
+
+    // The predicate register is not defined in the last few instructions.
+    // Check if the conversion to MUX is possible (either "up", i.e. at the
+    // place of the earlier partial definition, or "down", where the later
+    // definition is located). Examine all defs and uses between these two
+    // definitions.
+    // SR1, SR2 - source registers from the first and the second definition.
+    MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin();
+    std::advance(It1, MinX);
+    std::advance(It2, MaxX);
+    MachineInstr &Def1 = *It1, &Def2 = *It2;
+    MachineOperand *Src1 = &Def1.getOperand(2), *Src2 = &Def2.getOperand(2);
+    unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0;
+    unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0;
+    bool Failure = false, CanUp = true, CanDown = true;
+    for (unsigned X = MinX+1; X < MaxX; X++) {
+      const DefUseInfo &DU = DUM.lookup(X);
+      if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) {
+        Failure = true;
+        break;
+      }
+      if (CanDown && DU.Defs[SR1])
+        CanDown = false;
+      if (CanUp && DU.Defs[SR2])
+        CanUp = false;
+    }
+    if (Failure || (!CanUp && !CanDown))
+      continue;
+
+    MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2;
+    MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2;
+    // Prefer "down", since this will move the MUX farther away from the
+    // predicate definition.
+    MachineBasicBlock::iterator At = CanDown ? Def2 : Def1;
+    ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
+  }
+
+  for (unsigned I = 0, N = ML.size(); I < N; ++I) {
+    MuxInfo &MX = ML[I];
+    MachineBasicBlock &B = *MX.At->getParent();
+    DebugLoc DL = MX.At->getDebugLoc();
+    unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF);
+    if (!MxOpc)
+      continue;
+    BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
+      .addReg(MX.PredR)
+      .addOperand(*MX.SrcT)
+      .addOperand(*MX.SrcF);
+    B.erase(MX.Def1);
+    B.erase(MX.Def2);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  bool Changed = false;
+  for (auto &I : MF)
+    Changed |= genMuxInBlock(I);
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonGenMux() {
+  return new HexagonGenMux();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
new file mode 100644
index 000000000000..f14c733dcf51
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -0,0 +1,538 @@
+//===--- HexagonGenPredicate.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gen-pred"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+
+using namespace llvm;
+
+namespace llvm {
+
+  void initializeHexagonGenPredicatePass(PassRegistry& Registry);
+  FunctionPass *createHexagonGenPredicate();
+
+} // end namespace llvm
+
+namespace {
+
+  struct Register {
+    unsigned R, S;
+
+    Register(unsigned r = 0, unsigned s = 0) : R(r), S(s) {}
+    Register(const MachineOperand &MO) : R(MO.getReg()), S(MO.getSubReg()) {}
+
+    bool operator== (const Register &Reg) const {
+      return R == Reg.R && S == Reg.S;
+    }
+
+    bool operator< (const Register &Reg) const {
+      return R < Reg.R || (R == Reg.R && S < Reg.S);
+    }
+  };
+
+  struct PrintRegister {
+    friend raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR);
+
+    PrintRegister(Register R, const TargetRegisterInfo &I) : Reg(R), TRI(I) {}
+
+  private:
+    Register Reg;
+    const TargetRegisterInfo &TRI;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<< (raw_ostream &OS, const PrintRegister &PR) {
+    return OS << PrintReg(PR.Reg.R, &PR.TRI, PR.Reg.S);
+  }
+
+  class HexagonGenPredicate : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonGenPredicate() : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr),
+        MRI(nullptr) {
+      initializeHexagonGenPredicatePass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon generate predicate operations";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    typedef SetVector<MachineInstr*> VectOfInst;
+    typedef std::set<Register> SetOfReg;
+    typedef std::map<Register,Register> RegToRegMap;
+
+    const HexagonInstrInfo *TII;
+    const HexagonRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+    SetOfReg PredGPRs;
+    VectOfInst PUsers;
+    RegToRegMap G2P;
+
+    bool isPredReg(unsigned R);
+    void collectPredicateGPR(MachineFunction &MF);
+    void processPredicateGPR(const Register &Reg);
+    unsigned getPredForm(unsigned Opc);
+    bool isConvertibleToPredForm(const MachineInstr *MI);
+    bool isScalarCmp(unsigned Opc);
+    bool isScalarPred(Register PredReg);
+    Register getPredRegFor(const Register &Reg);
+    bool convertToPredForm(MachineInstr *MI);
+    bool eliminatePredCopies(MachineFunction &MF);
+  };
+
+  char HexagonGenPredicate::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred",
+  "Hexagon generate predicate operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
+  "Hexagon generate predicate operations", false, false)
+
+bool HexagonGenPredicate::isPredReg(unsigned R) {
+  if (!TargetRegisterInfo::isVirtualRegister(R))
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  return RC == &Hexagon::PredRegsRegClass;
+}
+
+unsigned HexagonGenPredicate::getPredForm(unsigned Opc) {
+  using namespace Hexagon;
+
+  switch (Opc) {
+    case A2_and:
+    case A2_andp:
+      return C2_and;
+    case A4_andn:
+    case A4_andnp:
+      return C2_andn;
+    case M4_and_and:
+      return C4_and_and;
+    case M4_and_andn:
+      return C4_and_andn;
+    case M4_and_or:
+      return C4_and_or;
+
+    case A2_or:
+    case A2_orp:
+      return C2_or;
+    case A4_orn:
+    case A4_ornp:
+      return C2_orn;
+    case M4_or_and:
+      return C4_or_and;
+    case M4_or_andn:
+      return C4_or_andn;
+    case M4_or_or:
+      return C4_or_or;
+
+    case A2_xor:
+    case A2_xorp:
+      return C2_xor;
+
+    case C2_tfrrp:
+      return COPY;
+  }
+  // The opcode corresponding to 0 is TargetOpcode::PHI. We can use 0 here
+  // to denote "none", but we need to make sure that none of the valid opcodes
+  // that we return will ever be 0.
+  static_assert(PHI == 0, "Use different value for <none>");
+  return 0;
+}
+
+bool HexagonGenPredicate::isConvertibleToPredForm(const MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  if (getPredForm(Opc) != 0)
+    return true;
+
+  // Comparisons against 0 are also convertible. This does not apply to
+  // A4_rcmpeqi or A4_rcmpneqi, since they produce values 0 or 1, which
+  // may not match the value that the predicate register would have if
+  // it was converted to a predicate form.
+  switch (Opc) {
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C4_cmpneqi:
+      if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0)
+        return true;
+      break;
+  }
+  return false;
+}
+
+void HexagonGenPredicate::collectPredicateGPR(MachineFunction &MF) {
+  for (MachineFunction::iterator A = MF.begin(), Z = MF.end(); A != Z; ++A) {
+    MachineBasicBlock &B = *A;
+    for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) {
+      MachineInstr *MI = &*I;
+      unsigned Opc = MI->getOpcode();
+      switch (Opc) {
+        case Hexagon::C2_tfrpr:
+        case TargetOpcode::COPY:
+          if (isPredReg(MI->getOperand(1).getReg())) {
+            Register RD = MI->getOperand(0);
+            if (TargetRegisterInfo::isVirtualRegister(RD.R))
+              PredGPRs.insert(RD);
+          }
+          break;
+      }
+    }
+  }
+}
+
+void HexagonGenPredicate::processPredicateGPR(const Register &Reg) {
+  DEBUG(dbgs() << __func__ << ": "
+               << PrintReg(Reg.R, TRI, Reg.S) << "\n");
+  typedef MachineRegisterInfo::use_iterator use_iterator;
+  use_iterator I = MRI->use_begin(Reg.R), E = MRI->use_end();
+  if (I == E) {
+    DEBUG(dbgs() << "Dead reg: " << PrintReg(Reg.R, TRI, Reg.S) << '\n');
+    MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+    DefI->eraseFromParent();
+    return;
+  }
+
+  for (; I != E; ++I) {
+    MachineInstr *UseI = I->getParent();
+    if (isConvertibleToPredForm(UseI))
+      PUsers.insert(UseI);
+  }
+}
+
+Register HexagonGenPredicate::getPredRegFor(const Register &Reg) {
+  // Create a predicate register for a given Reg. The newly created register
+  // will have its value copied from Reg, so that it can be later used as
+  // an operand in other instructions.
+  assert(TargetRegisterInfo::isVirtualRegister(Reg.R));
+  RegToRegMap::iterator F = G2P.find(Reg);
+  if (F != G2P.end())
+    return F->second;
+
+  DEBUG(dbgs() << __func__ << ": " << PrintRegister(Reg, *TRI));
+  MachineInstr *DefI = MRI->getVRegDef(Reg.R);
+  assert(DefI);
+  unsigned Opc = DefI->getOpcode();
+  if (Opc == Hexagon::C2_tfrpr || Opc == TargetOpcode::COPY) {
+    assert(DefI->getOperand(0).isDef() && DefI->getOperand(1).isUse());
+    Register PR = DefI->getOperand(1);
+    G2P.insert(std::make_pair(Reg, PR));
+    DEBUG(dbgs() << " -> " << PrintRegister(PR, *TRI) << '\n');
+    return PR;
+  }
+
+  MachineBasicBlock &B = *DefI->getParent();
+  DebugLoc DL = DefI->getDebugLoc();
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  unsigned NewPR = MRI->createVirtualRegister(PredRC);
+
+  // For convertible instructions, do not modify them, so that they can
+  // be converted later.  Generate a copy from Reg to NewPR.
+  if (isConvertibleToPredForm(DefI)) {
+    MachineBasicBlock::iterator DefIt = DefI;
+    BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR)
+      .addReg(Reg.R, 0, Reg.S);
+    G2P.insert(std::make_pair(Reg, Register(NewPR)));
+    DEBUG(dbgs() << " -> !" << PrintRegister(Register(NewPR), *TRI) << '\n');
+    return Register(NewPR);
+  }
+
+  llvm_unreachable("Invalid argument");
+}
+
+bool HexagonGenPredicate::isScalarCmp(unsigned Opc) {
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgei:
+    case Hexagon::C2_cmpgeui:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      return true;
+  }
+  return false;
+}
+
+bool HexagonGenPredicate::isScalarPred(Register PredReg) {
+  std::queue<Register> WorkQ;
+  WorkQ.push(PredReg);
+
+  while (!WorkQ.empty()) {
+    Register PR = WorkQ.front();
+    WorkQ.pop();
+    const MachineInstr *DefI = MRI->getVRegDef(PR.R);
+    if (!DefI)
+      return false;
+    unsigned DefOpc = DefI->getOpcode();
+    switch (DefOpc) {
+      case TargetOpcode::COPY: {
+        const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+        if (MRI->getRegClass(PR.R) != PredRC)
+          return false;
+        // If it is a copy between two predicate registers, fall through.
+      }
+      case Hexagon::C2_and:
+      case Hexagon::C2_andn:
+      case Hexagon::C4_and_and:
+      case Hexagon::C4_and_andn:
+      case Hexagon::C4_and_or:
+      case Hexagon::C2_or:
+      case Hexagon::C2_orn:
+      case Hexagon::C4_or_and:
+      case Hexagon::C4_or_andn:
+      case Hexagon::C4_or_or:
+      case Hexagon::C4_or_orn:
+      case Hexagon::C2_xor:
+        // Add operands to the queue.
+        for (const MachineOperand &MO : DefI->operands())
+          if (MO.isReg() && MO.isUse())
+            WorkQ.push(Register(MO.getReg()));
+        break;
+
+      // All non-vector compares are ok, everything else is bad.
+      default:
+        return isScalarCmp(DefOpc);
+    }
+  }
+
+  return true;
+}
+
+bool HexagonGenPredicate::convertToPredForm(MachineInstr *MI) {
+  DEBUG(dbgs() << __func__ << ": " << MI << " " << *MI);
+
+  unsigned Opc = MI->getOpcode();
+  assert(isConvertibleToPredForm(MI));
+  unsigned NumOps = MI->getNumOperands();
+  for (unsigned i = 0; i < NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    Register Reg(MO);
+    if (Reg.S && Reg.S != Hexagon::isub_lo)
+      return false;
+    if (!PredGPRs.count(Reg))
+      return false;
+  }
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned NewOpc = getPredForm(Opc);
+  // Special case for comparisons against 0.
+  if (NewOpc == 0) {
+    switch (Opc) {
+      case Hexagon::C2_cmpeqi:
+        NewOpc = Hexagon::C2_not;
+        break;
+      case Hexagon::C4_cmpneqi:
+        NewOpc = TargetOpcode::COPY;
+        break;
+      default:
+        return false;
+    }
+
+    // If it's a scalar predicate register, then all bits in it are
+    // the same. Otherwise, to determine whether all bits are 0 or not
+    // we would need to use any8.
+    Register PR = getPredRegFor(MI->getOperand(1));
+    if (!isScalarPred(PR))
+      return false;
+    // This will skip the immediate argument when creating the predicate
+    // version instruction.
+    NumOps = 2;
+  }
+
+  // Some sanity: check that def is in operand #0.
+  MachineOperand &Op0 = MI->getOperand(0);
+  assert(Op0.isDef());
+  Register OutR(Op0);
+
+  // Don't use getPredRegFor, since it will create an association between
+  // the argument and a created predicate register (i.e. it will insert a
+  // copy if a new predicate register is created).
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  Register NewPR = MRI->createVirtualRegister(PredRC);
+  MachineInstrBuilder MIB = BuildMI(B, MI, DL, TII->get(NewOpc), NewPR.R);
+
+  // Add predicate counterparts of the GPRs.
+  for (unsigned i = 1; i < NumOps; ++i) {
+    Register GPR = MI->getOperand(i);
+    Register Pred = getPredRegFor(GPR);
+    MIB.addReg(Pred.R, 0, Pred.S);
+  }
+  DEBUG(dbgs() << "generated: " << *MIB);
+
+  // Generate a copy-out: NewGPR = NewPR, and replace all uses of OutR
+  // with NewGPR.
+  const TargetRegisterClass *RC = MRI->getRegClass(OutR.R);
+  unsigned NewOutR = MRI->createVirtualRegister(RC);
+  BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), NewOutR)
+    .addReg(NewPR.R, 0, NewPR.S);
+  MRI->replaceRegWith(OutR.R, NewOutR);
+  MI->eraseFromParent();
+
+  // If the processed instruction was C2_tfrrp (i.e. Rn = Pm; Pk = Rn),
+  // then the output will be a predicate register.  Do not visit the
+  // users of it.
+  if (!isPredReg(NewOutR)) {
+    Register R(NewOutR);
+    PredGPRs.insert(R);
+    processPredicateGPR(R);
+  }
+  return true;
+}
+
+bool HexagonGenPredicate::eliminatePredCopies(MachineFunction &MF) {
+  DEBUG(dbgs() << __func__ << "\n");
+  const TargetRegisterClass *PredRC = &Hexagon::PredRegsRegClass;
+  bool Changed = false;
+  VectOfInst Erase;
+
+  // First, replace copies
+  //   IntR = PredR1
+  //   PredR2 = IntR
+  // with
+  //   PredR2 = PredR1
+  // Such sequences can be generated when a copy-into-pred is generated from
+  // a gpr register holding a result of a convertible instruction. After
+  // the convertible instruction is converted, its predicate result will be
+  // copied back into the original gpr.
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != TargetOpcode::COPY)
+        continue;
+      Register DR = MI.getOperand(0);
+      Register SR = MI.getOperand(1);
+      if (!TargetRegisterInfo::isVirtualRegister(DR.R))
+        continue;
+      if (!TargetRegisterInfo::isVirtualRegister(SR.R))
+        continue;
+      if (MRI->getRegClass(DR.R) != PredRC)
+        continue;
+      if (MRI->getRegClass(SR.R) != PredRC)
+        continue;
+      assert(!DR.S && !SR.S && "Unexpected subregister");
+      MRI->replaceRegWith(DR.R, SR.R);
+      Erase.insert(&MI);
+      Changed = true;
+    }
+  }
+
+  for (VectOfInst::iterator I = Erase.begin(), E = Erase.end(); I != E; ++I)
+    (*I)->eraseFromParent();
+
+  return Changed;
+}
+
+bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  PredGPRs.clear();
+  PUsers.clear();
+  G2P.clear();
+
+  bool Changed = false;
+  collectPredicateGPR(MF);
+  for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I)
+    processPredicateGPR(*I);
+
+  bool Again;
+  do {
+    Again = false;
+    VectOfInst Processed, Copy;
+
+    typedef VectOfInst::iterator iterator;
+    Copy = PUsers;
+    for (iterator I = Copy.begin(), E = Copy.end(); I != E; ++I) {
+      MachineInstr *MI = *I;
+      bool Done = convertToPredForm(MI);
+      if (Done) {
+        Processed.insert(MI);
+        Again = true;
+      }
+    }
+    Changed |= Again;
+
+    auto Done = [Processed] (MachineInstr *MI) -> bool {
+      return Processed.count(MI);
+    };
+    PUsers.remove_if(Done);
+  } while (Again);
+
+  Changed |= eliminatePredCopies(MF);
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonGenPredicate() {
+  return new HexagonGenPredicate();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
new file mode 100644
index 000000000000..e477dcc0f64a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -0,0 +1,1980 @@
+//===-- HexagonHardwareLoops.cpp - Identify and generate hardware loops ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the Hexagon hardware
+// loop instruction.  The hardware loop can perform loop branches with a
+// zero-cycle overhead.
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for hardware loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Assumes loops are normalized by IndVarSimplify
+//  - Try inner-most loops first
+//  - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwloops"
+
+#ifndef NDEBUG
+static cl::opt<int> HWLoopLimit("hexagon-max-hwloop", cl::Hidden, cl::init(-1));
+
+// Option to create preheader only for a specific function.
+static cl::opt<std::string> PHFn("hexagon-hwloop-phfn", cl::Hidden,
+                                 cl::init(""));
+#endif
+
+// Option to create a preheader if one doesn't exist.
+static cl::opt<bool> HWCreatePreheader("hexagon-hwloop-preheader",
+    cl::Hidden, cl::init(true),
+    cl::desc("Add a preheader to a hardware loop if one doesn't exist"));
+
+// Turn it off by default. If a preheader block is not created here, the
+// software pipeliner may be unable to find a block suitable to serve as
+// a preheader. In that case SWP will not run.
+static cl::opt<bool> SpecPreheader("hwloop-spec-preheader", cl::init(false),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Allow speculation of preheader "
+  "instructions"));
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace llvm {
+
+  FunctionPass *createHexagonHardwareLoops();
+  void initializeHexagonHardwareLoopsPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  class CountValue;
+
+  struct HexagonHardwareLoops : public MachineFunctionPass {
+    MachineLoopInfo            *MLI;
+    MachineRegisterInfo        *MRI;
+    MachineDominatorTree       *MDT;
+    const HexagonInstrInfo     *TII;
+#ifndef NDEBUG
+    static int Counter;
+#endif
+
+  public:
+    static char ID;
+
+    HexagonHardwareLoops() : MachineFunctionPass(ID) {
+      initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    StringRef getPassName() const override { return "Hexagon Hardware Loops"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    typedef std::map<unsigned, MachineInstr *> LoopFeederMap;
+
+    /// Kinds of comparisons in the compare instructions.
+    struct Comparison {
+      enum Kind {
+        EQ  = 0x01,
+        NE  = 0x02,
+        L   = 0x04,
+        G   = 0x08,
+        U   = 0x40,
+        LTs = L,
+        LEs = L | EQ,
+        GTs = G,
+        GEs = G | EQ,
+        LTu = L      | U,
+        LEu = L | EQ | U,
+        GTu = G      | U,
+        GEu = G | EQ | U
+      };
+
+      static Kind getSwappedComparison(Kind Cmp) {
+        assert ((!((Cmp & L) && (Cmp & G))) && "Malformed comparison operator");
+        if ((Cmp & L) || (Cmp & G))
+          return (Kind)(Cmp ^ (L|G));
+        return Cmp;
+      }
+
+      static Kind getNegatedComparison(Kind Cmp) {
+        if ((Cmp & L) || (Cmp & G))
+          return (Kind)((Cmp ^ (L | G)) ^ EQ);
+        if ((Cmp & NE) || (Cmp & EQ))
+          return (Kind)(Cmp ^ (EQ | NE));
+        return (Kind)0;
+      }
+
+      static bool isSigned(Kind Cmp) {
+        return (Cmp & (L | G) && !(Cmp & U));
+      }
+
+      static bool isUnsigned(Kind Cmp) {
+        return (Cmp & U);
+      }
+    };
+
+    /// \brief Find the register that contains the loop controlling
+    /// induction variable.
+    /// If successful, it will return true and set the \p Reg, \p IVBump
+    /// and \p IVOp arguments.  Otherwise it will return false.
+    /// The returned induction register is the register R that follows the
+    /// following induction pattern:
+    /// loop:
+    ///   R = phi ..., [ R.next, LatchBlock ]
+    ///   R.next = R + #bump
+    ///   if (R.next < #N) goto loop
+    /// IVBump is the immediate value added to R, and IVOp is the instruction
+    /// "R.next = R + #bump".
+    bool findInductionRegister(MachineLoop *L, unsigned &Reg,
+                               int64_t &IVBump, MachineInstr *&IVOp) const;
+
+    /// \brief Return the comparison kind for the specified opcode.
+    Comparison::Kind getComparisonKind(unsigned CondOpc,
+                                       MachineOperand *InitialValue,
+                                       const MachineOperand *Endvalue,
+                                       int64_t IVBump) const;
+
+    /// \brief Analyze the statements in a loop to determine if the loop
+    /// has a computable trip count and, if so, return a value that represents
+    /// the trip count expression.
+    CountValue *getLoopTripCount(MachineLoop *L,
+                                 SmallVectorImpl<MachineInstr *> &OldInsts);
+
+    /// \brief Return the expression that represents the number of times
+    /// a loop iterates.  The function takes the operands that represent the
+    /// loop start value, loop end value, and induction value.  Based upon
+    /// these operands, the function attempts to compute the trip count.
+    /// If the trip count is not directly available (as an immediate value,
+    /// or a register), the function will attempt to insert computation of it
+    /// to the loop's preheader.
+    CountValue *computeCount(MachineLoop *Loop, const MachineOperand *Start,
+                             const MachineOperand *End, unsigned IVReg,
+                             int64_t IVBump, Comparison::Kind Cmp) const;
+
+    /// \brief Return true if the instruction is not valid within a hardware
+    /// loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI,
+                                bool IsInnerHWLoop) const;
+
+    /// \brief Return true if the loop contains an instruction that inhibits
+    /// using the hardware loop.
+    bool containsInvalidInstruction(MachineLoop *L, bool IsInnerHWLoop) const;
+
+    /// \brief Given a loop, check if we can convert it to a hardware loop.
+    /// If so, then perform the conversion and return true.
+    bool convertToHardwareLoop(MachineLoop *L, bool &L0used, bool &L1used);
+
+    /// \brief Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVectorImpl<MachineInstr *> &DeadPhis) const;
+
+    /// \brief Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+
+    /// \brief Make sure that the "bump" instruction executes before the
+    /// compare.  We need that for the IV fixup, so that the compare
+    /// instruction would not use a bumped value that has not yet been
+    /// defined.  If the instructions are out of order, try to reorder them.
+    bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
+
+    /// \brief Return true if MO and MI pair is visited only once. If visited
+    /// more than once, this indicates there is recursion. In such a case,
+    /// return false.
+    bool isLoopFeeder(MachineLoop *L, MachineBasicBlock *A, MachineInstr *MI,
+                      const MachineOperand *MO,
+                      LoopFeederMap &LoopFeederPhi) const;
+
+    /// \brief Return true if the Phi may generate a value that may underflow,
+    /// or may wrap.
+    bool phiMayWrapOrUnderflow(MachineInstr *Phi, const MachineOperand *EndVal,
+                               MachineBasicBlock *MBB, MachineLoop *L,
+                               LoopFeederMap &LoopFeederPhi) const;
+
+    /// \brief Return true if the induction variable may underflow an unsigned
+    /// value in the first iteration.
+    bool loopCountMayWrapOrUnderFlow(const MachineOperand *InitVal,
+                                     const MachineOperand *EndVal,
+                                     MachineBasicBlock *MBB, MachineLoop *L,
+                                     LoopFeederMap &LoopFeederPhi) const;
+
+    /// \brief Check if the given operand has a compile-time known constant
+    /// value. Return true if yes, and false otherwise. When returning true, set
+    /// Val to the corresponding constant value.
+    bool checkForImmediate(const MachineOperand &MO, int64_t &Val) const;
+
+    /// \brief Check if the operand has a compile-time known constant value.
+    bool isImmediate(const MachineOperand &MO) const {
+      int64_t V;
+      return checkForImmediate(MO, V);
+    }
+
+    /// \brief Return the immediate for the specified operand.
+    int64_t getImmediate(const MachineOperand &MO) const {
+      int64_t V;
+      if (!checkForImmediate(MO, V))
+        llvm_unreachable("Invalid operand");
+      return V;
+    }
+
+    /// \brief Reset the given machine operand to now refer to a new immediate
+    /// value.  Assumes that the operand was already referencing an immediate
+    /// value, either directly, or via a register.
+    void setImmediate(MachineOperand &MO, int64_t Val);
+
+    /// \brief Fix the data flow of the induction varible.
+    /// The desired flow is: phi ---> bump -+-> comparison-in-latch.
+    ///                                     |
+    ///                                     +-> back to phi
+    /// where "bump" is the increment of the induction variable:
+    ///   iv = iv + #const.
+    /// Due to some prior code transformations, the actual flow may look
+    /// like this:
+    ///   phi -+-> bump ---> back to phi
+    ///        |
+    ///        +-> comparison-in-latch (against upper_bound-bump),
+    /// i.e. the comparison that controls the loop execution may be using
+    /// the value of the induction variable from before the increment.
+    ///
+    /// Return true if the loop's flow is the desired one (i.e. it's
+    /// either been fixed, or no fixing was necessary).
+    /// Otherwise, return false.  This can happen if the induction variable
+    /// couldn't be identified, or if the value in the latch's comparison
+    /// cannot be adjusted to reflect the post-bump value.
+    bool fixupInductionVariable(MachineLoop *L);
+
+    /// \brief Given a loop, if it does not have a preheader, create one.
+    /// Return the block that is the preheader.
+    MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
+  };
+
+  char HexagonHardwareLoops::ID = 0;
+#ifndef NDEBUG
+  int HexagonHardwareLoops::Counter = 0;
+#endif
+
+  /// \brief Abstraction for a trip count of a loop. A smaller version
+  /// of the MachineOperand class without the concerns of changing the
+  /// operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+
+  private:
+    CountValueType Kind;
+    union Values {
+      struct {
+        unsigned Reg;
+        unsigned Sub;
+      } R;
+      unsigned ImmVal;
+    } Contents;
+
+  public:
+    explicit CountValue(CountValueType t, unsigned v, unsigned u = 0) {
+      Kind = t;
+      if (Kind == CV_Register) {
+        Contents.R.Reg = v;
+        Contents.R.Sub = u;
+      } else {
+        Contents.ImmVal = v;
+      }
+    }
+
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Reg;
+    }
+    unsigned getSubReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Sub;
+    }
+    unsigned getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      return Contents.ImmVal;
+    }
+
+    void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const {
+      if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
+      if (isImm()) { OS << Contents.ImmVal; }
+    }
+  };
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
+                      "Hexagon Hardware Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
+                    "Hexagon Hardware Loops", false, false)
+
+FunctionPass *llvm::createHexagonHardwareLoops() {
+  return new HexagonHardwareLoops();
+}
+
+bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  bool Changed = false;
+
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
+  for (auto &L : *MLI)
+    if (!L->getParentLoop()) {
+      bool L0Used = false;
+      bool L1Used = false;
+      Changed |= convertToHardwareLoop(L, L0Used, L1Used);
+    }
+
+  return Changed;
+}
+
+bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
+                                                 unsigned &Reg,
+                                                 int64_t &IVBump,
+                                                 MachineInstr *&IVOp
+                                                 ) const {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  if (!Header || !Preheader || !Latch || !ExitingBlock)
+    return false;
+
+  // This pair represents an induction register together with an immediate
+  // value that will be added to it in each loop iteration.
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+
+  // Mapping:  R.next -> (R, bump), where R, R.next and bump are derived
+  // from an induction operation
+  //   R.next = R + bump
+  // where bump is an immediate value.
+  typedef std::map<unsigned,RegisterBump> InductionMap;
+
+  InductionMap IndMap;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.  Get the operand that corresponds to the
+    // latch block, and see if is a result of an addition of form "reg+imm",
+    // where the "reg" is defined by the PHI node we are looking at.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiOpReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiOpReg);
+
+      if (DI->getDesc().isAdd()) {
+        // If the register operand to the add is the PHI we're looking at, this
+        // meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        MachineOperand &Opnd2 = DI->getOperand(2);
+        int64_t V;
+        if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return false;
+
+  unsigned PredR, PredPos, PredRegFlags;
+  if (!TII->getPredReg(Cond, PredR, PredPos, PredRegFlags))
+    return false;
+
+  MachineInstr *PredI = MRI->getVRegDef(PredR);
+  if (!PredI->isCompare())
+    return false;
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int CmpImm = 0, CmpMask = 0;
+  bool CmpAnalyzed =
+      TII->analyzeCompare(*PredI, CmpReg1, CmpReg2, CmpMask, CmpImm);
+  // Fail if the compare was not analyzed, or it's not comparing a register
+  // with an immediate value.  Not checking the mask here, since we handle
+  // the individual compare opcodes (including A4_cmpb*) later on.
+  if (!CmpAnalyzed)
+    return false;
+
+  // Exactly one of the input registers to the comparison should be among
+  // the induction registers.
+  InductionMap::iterator IndMapEnd = IndMap.end();
+  InductionMap::iterator F = IndMapEnd;
+  if (CmpReg1 != 0) {
+    InductionMap::iterator F1 = IndMap.find(CmpReg1);
+    if (F1 != IndMapEnd)
+      F = F1;
+  }
+  if (CmpReg2 != 0) {
+    InductionMap::iterator F2 = IndMap.find(CmpReg2);
+    if (F2 != IndMapEnd) {
+      if (F != IndMapEnd)
+        return false;
+      F = F2;
+    }
+  }
+  if (F == IndMapEnd)
+    return false;
+
+  Reg = F->second.first;
+  IVBump = F->second.second;
+  IVOp = MRI->getVRegDef(F->first);
+  return true;
+}
+
+// Return the comparison kind for the specified opcode.
+HexagonHardwareLoops::Comparison::Kind
+HexagonHardwareLoops::getComparisonKind(unsigned CondOpc,
+                                        MachineOperand *InitialValue,
+                                        const MachineOperand *EndValue,
+                                        int64_t IVBump) const {
+  Comparison::Kind Cmp = (Comparison::Kind)0;
+  switch (CondOpc) {
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqp:
+    Cmp = Comparison::EQ;
+    break;
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmpneqi:
+    Cmp = Comparison::NE;
+    break;
+  case Hexagon::C4_cmplte:
+    Cmp = Comparison::LEs;
+    break;
+  case Hexagon::C4_cmplteu:
+    Cmp = Comparison::LEu;
+    break;
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtup:
+    Cmp = Comparison::GTu;
+    break;
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtp:
+    Cmp = Comparison::GTs;
+    break;
+  default:
+    return (Comparison::Kind)0;
+  }
+  return Cmp;
+}
+
+/// \brief Analyze the statements in a loop to determine if the loop has
+/// a computable trip count and, if so, return a value that represents
+/// the trip count expression.
+///
+/// This function iterates over the phi nodes in the loop to check for
+/// induction variable patterns that are used in the calculation for
+/// the number of time the loop is executed.
+CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
+    SmallVectorImpl<MachineInstr *> &OldInsts) {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end())  // dead loop?
+    return nullptr;
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end())  // multiple backedges?
+    return nullptr;
+
+  // Make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return nullptr;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return nullptr;
+
+  // Look for the cmp instruction to determine if we can get a useful trip
+  // count.  The trip count can be either a register or an immediate.  The
+  // location of the value depends upon the type (reg or imm).
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  if (!ExitingBlock)
+    return nullptr;
+
+  unsigned IVReg = 0;
+  int64_t IVBump = 0;
+  MachineInstr *IVOp;
+  bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp);
+  if (!FoundIV)
+    return nullptr;
+
+  MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+
+  MachineOperand *InitialValue = nullptr;
+  MachineInstr *IV_Phi = MRI->getVRegDef(IVReg);
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) {
+    MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB();
+    if (MBB == Preheader)
+      InitialValue = &IV_Phi->getOperand(i);
+    else if (MBB == Latch)
+      IVReg = IV_Phi->getOperand(i).getReg();  // Want IV reg after bump.
+  }
+  if (!InitialValue)
+    return nullptr;
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return nullptr;
+
+  MachineBasicBlock *Header = L->getHeader();
+  // TB must be non-null.  If FB is also non-null, one of them must be
+  // the header.  Otherwise, branch to TB could be exiting the loop, and
+  // the fall through can go to the header.
+  assert (TB && "Exit block without a branch?");
+  if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) {
+    MachineBasicBlock *LTB = nullptr, *LFB = nullptr;
+    SmallVector<MachineOperand,2> LCond;
+    bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
+    if (NotAnalyzed)
+      return nullptr;
+    if (TB == Latch)
+      TB = (LTB == Header) ? LTB : LFB;
+    else
+      FB = (LTB == Header) ? LTB: LFB;
+  }
+  assert ((!FB || TB == Header || FB == Header) && "Branches not to header?");
+  if (!TB || (FB && TB != Header && FB != Header))
+    return nullptr;
+
+  // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch
+  // to put imm(0), followed by P in the vector Cond.
+  // If TB is not the header, it means that the "not-taken" path must lead
+  // to the header.
+  bool Negated = TII->predOpcodeHasNot(Cond) ^ (TB != Header);
+  unsigned PredReg, PredPos, PredRegFlags;
+  if (!TII->getPredReg(Cond, PredReg, PredPos, PredRegFlags))
+    return nullptr;
+  MachineInstr *CondI = MRI->getVRegDef(PredReg);
+  unsigned CondOpc = CondI->getOpcode();
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int Mask = 0, ImmValue = 0;
+  bool AnalyzedCmp =
+      TII->analyzeCompare(*CondI, CmpReg1, CmpReg2, Mask, ImmValue);
+  if (!AnalyzedCmp)
+    return nullptr;
+
+  // The comparison operator type determines how we compute the loop
+  // trip count.
+  OldInsts.push_back(CondI);
+  OldInsts.push_back(IVOp);
+
+  // Sadly, the following code gets information based on the position
+  // of the operands in the compare instruction.  This has to be done
+  // this way, because the comparisons check for a specific relationship
+  // between the operands (e.g. is-less-than), rather than to find out
+  // what relationship the operands are in (as on PPC).
+  Comparison::Kind Cmp;
+  bool isSwapped = false;
+  const MachineOperand &Op1 = CondI->getOperand(1);
+  const MachineOperand &Op2 = CondI->getOperand(2);
+  const MachineOperand *EndValue = nullptr;
+
+  if (Op1.isReg()) {
+    if (Op2.isImm() || Op1.getReg() == IVReg)
+      EndValue = &Op2;
+    else {
+      EndValue = &Op1;
+      isSwapped = true;
+    }
+  }
+
+  if (!EndValue)
+    return nullptr;
+
+  Cmp = getComparisonKind(CondOpc, InitialValue, EndValue, IVBump);
+  if (!Cmp)
+    return nullptr;
+  if (Negated)
+    Cmp = Comparison::getNegatedComparison(Cmp);
+  if (isSwapped)
+    Cmp = Comparison::getSwappedComparison(Cmp);
+
+  if (InitialValue->isReg()) {
+    unsigned R = InitialValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return nullptr;
+    OldInsts.push_back(MRI->getVRegDef(R));
+  }
+  if (EndValue->isReg()) {
+    unsigned R = EndValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return nullptr;
+    OldInsts.push_back(MRI->getVRegDef(R));
+  }
+
+  return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
+}
+
+/// \brief Helper function that returns the expression that represents the
+/// number of times a loop iterates.  The function takes the operands that
+/// represent the loop start value, loop end value, and induction value.
+/// Based upon these operands, the function attempts to compute the trip count.
+CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
+                                               const MachineOperand *Start,
+                                               const MachineOperand *End,
+                                               unsigned IVReg,
+                                               int64_t IVBump,
+                                               Comparison::Kind Cmp) const {
+  // Cannot handle comparison EQ, i.e. while (A == B).
+  if (Cmp == Comparison::EQ)
+    return nullptr;
+
+  // Check if either the start or end values are an assignment of an immediate.
+  // If so, use the immediate value rather than the register.
+  if (Start->isReg()) {
+    const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg());
+    if (StartValInstr && (StartValInstr->getOpcode() == Hexagon::A2_tfrsi ||
+                          StartValInstr->getOpcode() == Hexagon::A2_tfrpi))
+      Start = &StartValInstr->getOperand(1);
+  }
+  if (End->isReg()) {
+    const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+    if (EndValInstr && (EndValInstr->getOpcode() == Hexagon::A2_tfrsi ||
+                        EndValInstr->getOpcode() == Hexagon::A2_tfrpi))
+      End = &EndValInstr->getOperand(1);
+  }
+
+  if (!Start->isReg() && !Start->isImm())
+    return nullptr;
+  if (!End->isReg() && !End->isImm())
+    return nullptr;
+
+  bool CmpLess =     Cmp & Comparison::L;
+  bool CmpGreater =  Cmp & Comparison::G;
+  bool CmpHasEqual = Cmp & Comparison::EQ;
+
+  // Avoid certain wrap-arounds.  This doesn't detect all wrap-arounds.
+  if (CmpLess && IVBump < 0)
+    // Loop going while iv is "less" with the iv value going down.  Must wrap.
+    return nullptr;
+
+  if (CmpGreater && IVBump > 0)
+    // Loop going while iv is "greater" with the iv value going up.  Must wrap.
+    return nullptr;
+
+  // Phis that may feed into the loop.
+  LoopFeederMap LoopFeederPhi;
+
+  // Check if the initial value may be zero and can be decremented in the first
+  // iteration. If the value is zero, the endloop instruction will not decrement
+  // the loop counter, so we shouldn't generate a hardware loop in this case.
+  if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop,
+                                  LoopFeederPhi))
+      return nullptr;
+
+  if (Start->isImm() && End->isImm()) {
+    // Both, start and end are immediates.
+    int64_t StartV = Start->getImm();
+    int64_t EndV = End->getImm();
+    int64_t Dist = EndV - StartV;
+    if (Dist == 0)
+      return nullptr;
+
+    bool Exact = (Dist % IVBump) == 0;
+
+    if (Cmp == Comparison::NE) {
+      if (!Exact)
+        return nullptr;
+      if ((Dist < 0) ^ (IVBump < 0))
+        return nullptr;
+    }
+
+    // For comparisons that include the final value (i.e. include equality
+    // with the final value), we need to increase the distance by 1.
+    if (CmpHasEqual)
+      Dist = Dist > 0 ? Dist+1 : Dist-1;
+
+    // For the loop to iterate, CmpLess should imply Dist > 0.  Similarly,
+    // CmpGreater should imply Dist < 0.  These conditions could actually
+    // fail, for example, in unreachable code (which may still appear to be
+    // reachable in the CFG).
+    if ((CmpLess && Dist < 0) || (CmpGreater && Dist > 0))
+      return nullptr;
+
+    // "Normalized" distance, i.e. with the bump set to +-1.
+    int64_t Dist1 = (IVBump > 0) ? (Dist +  (IVBump - 1)) / IVBump
+                                 : (-Dist + (-IVBump - 1)) / (-IVBump);
+    assert (Dist1 > 0 && "Fishy thing.  Both operands have the same sign.");
+
+    uint64_t Count = Dist1;
+
+    if (Count > 0xFFFFFFFFULL)
+      return nullptr;
+
+    return new CountValue(CountValue::CV_Immediate, Count);
+  }
+
+  // A general case: Start and End are some values, but the actual
+  // iteration count may not be available.  If it is not, insert
+  // a computation of it into the preheader.
+
+  // If the induction variable bump is not a power of 2, quit.
+  // Othwerise we'd need a general integer division.
+  if (!isPowerOf2_64(std::abs(IVBump)))
+    return nullptr;
+
+  MachineBasicBlock *PH = MLI->findLoopPreheader(Loop, SpecPreheader);
+  assert (PH && "Should have a preheader by now");
+  MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator();
+  DebugLoc DL;
+  if (InsertPos != PH->end())
+    DL = InsertPos->getDebugLoc();
+
+  // If Start is an immediate and End is a register, the trip count
+  // will be "reg - imm".  Hexagon's "subtract immediate" instruction
+  // is actually "reg + -imm".
+
+  // If the loop IV is going downwards, i.e. if the bump is negative,
+  // then the iteration count (computed as End-Start) will need to be
+  // negated.  To avoid the negation, just swap Start and End.
+  if (IVBump < 0) {
+    std::swap(Start, End);
+    IVBump = -IVBump;
+  }
+  // Cmp may now have a wrong direction, e.g.  LEs may now be GEs.
+  // Signedness, and "including equality" are preserved.
+
+  bool RegToImm = Start->isReg() && End->isImm(); // for (reg..imm)
+  bool RegToReg = Start->isReg() && End->isReg(); // for (reg..reg)
+
+  int64_t StartV = 0, EndV = 0;
+  if (Start->isImm())
+    StartV = Start->getImm();
+  if (End->isImm())
+    EndV = End->getImm();
+
+  int64_t AdjV = 0;
+  // To compute the iteration count, we would need this computation:
+  //   Count = (End - Start + (IVBump-1)) / IVBump
+  // or, when CmpHasEqual:
+  //   Count = (End - Start + (IVBump-1)+1) / IVBump
+  // The "IVBump-1" part is the adjustment (AdjV).  We can avoid
+  // generating an instruction specifically to add it if we can adjust
+  // the immediate values for Start or End.
+
+  if (CmpHasEqual) {
+    // Need to add 1 to the total iteration count.
+    if (Start->isImm())
+      StartV--;
+    else if (End->isImm())
+      EndV++;
+    else
+      AdjV += 1;
+  }
+
+  if (Cmp != Comparison::NE) {
+    if (Start->isImm())
+      StartV -= (IVBump-1);
+    else if (End->isImm())
+      EndV += (IVBump-1);
+    else
+      AdjV += (IVBump-1);
+  }
+
+  unsigned R = 0, SR = 0;
+  if (Start->isReg()) {
+    R = Start->getReg();
+    SR = Start->getSubReg();
+  } else {
+    R = End->getReg();
+    SR = End->getSubReg();
+  }
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  // Hardware loops cannot handle 64-bit registers.  If it's a double
+  // register, it has to have a subregister.
+  if (!SR && RC == &Hexagon::DoubleRegsRegClass)
+    return nullptr;
+  const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+
+  // Compute DistR (register with the distance between Start and End).
+  unsigned DistR, DistSR;
+
+  // Avoid special case, where the start value is an imm(0).
+  if (Start->isImm() && StartV == 0) {
+    DistR = End->getReg();
+    DistSR = End->getSubReg();
+  } else {
+    const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::A2_sub) :
+                              (RegToImm ? TII->get(Hexagon::A2_subri) :
+                                          TII->get(Hexagon::A2_addi));
+    if (RegToReg || RegToImm) {
+      unsigned SubR = MRI->createVirtualRegister(IntRC);
+      MachineInstrBuilder SubIB =
+        BuildMI(*PH, InsertPos, DL, SubD, SubR);
+
+      if (RegToReg)
+        SubIB.addReg(End->getReg(), 0, End->getSubReg())
+          .addReg(Start->getReg(), 0, Start->getSubReg());
+      else
+        SubIB.addImm(EndV)
+          .addReg(Start->getReg(), 0, Start->getSubReg());
+      DistR = SubR;
+    } else {
+      // If the loop has been unrolled, we should use the original loop count
+      // instead of recalculating the value. This will avoid additional
+      // 'Add' instruction.
+      const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+      if (EndValInstr->getOpcode() == Hexagon::A2_addi &&
+          EndValInstr->getOperand(2).getImm() == StartV) {
+        DistR = EndValInstr->getOperand(1).getReg();
+      } else {
+        unsigned SubR = MRI->createVirtualRegister(IntRC);
+        MachineInstrBuilder SubIB =
+          BuildMI(*PH, InsertPos, DL, SubD, SubR);
+        SubIB.addReg(End->getReg(), 0, End->getSubReg())
+             .addImm(-StartV);
+        DistR = SubR;
+      }
+    }
+    DistSR = 0;
+  }
+
+  // From DistR, compute AdjR (register with the adjusted distance).
+  unsigned AdjR, AdjSR;
+
+  if (AdjV == 0) {
+    AdjR = DistR;
+    AdjSR = DistSR;
+  } else {
+    // Generate CountR = ADD DistR, AdjVal
+    unsigned AddR = MRI->createVirtualRegister(IntRC);
+    MCInstrDesc const &AddD = TII->get(Hexagon::A2_addi);
+    BuildMI(*PH, InsertPos, DL, AddD, AddR)
+      .addReg(DistR, 0, DistSR)
+      .addImm(AdjV);
+
+    AdjR = AddR;
+    AdjSR = 0;
+  }
+
+  // From AdjR, compute CountR (register with the final count).
+  unsigned CountR, CountSR;
+
+  if (IVBump == 1) {
+    CountR = AdjR;
+    CountSR = AdjSR;
+  } else {
+    // The IV bump is a power of two. Log_2(IV bump) is the shift amount.
+    unsigned Shift = Log2_32(IVBump);
+
+    // Generate NormR = LSR DistR, Shift.
+    unsigned LsrR = MRI->createVirtualRegister(IntRC);
+    const MCInstrDesc &LsrD = TII->get(Hexagon::S2_lsr_i_r);
+    BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
+      .addReg(AdjR, 0, AdjSR)
+      .addImm(Shift);
+
+    CountR = LsrR;
+    CountSR = 0;
+  }
+
+  return new CountValue(CountValue::CV_Register, CountR, CountSR);
+}
+
+/// \brief Return true if the operation is invalid within hardware loop.
+bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
+                                                  bool IsInnerHWLoop) const {
+
+  // Call is not allowed because the callee may use a hardware loop except for
+  // the case when the call never returns.
+  if (MI->getDesc().isCall())
+    return !TII->doesNotReturn(*MI);
+
+  // Check if the instruction defines a hardware loop register.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (IsInnerHWLoop && (R == Hexagon::LC0 || R == Hexagon::SA0 ||
+                          R == Hexagon::LC1 || R == Hexagon::SA1))
+      return true;
+    if (!IsInnerHWLoop && (R == Hexagon::LC1 || R == Hexagon::SA1))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Return true if the loop contains an instruction that inhibits
+/// the use of the hardware loop instruction.
+bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L,
+    bool IsInnerHWLoop) const {
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
+  DEBUG(dbgs() << "\nhw_loop head, BB#" << Blocks[0]->getNumber(););
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    for (MachineBasicBlock::iterator
+           MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
+      const MachineInstr *MI = &*MII;
+      if (isInvalidLoopOperation(MI, IsInnerHWLoop)) {
+        DEBUG(dbgs()<< "\nCannot convert to hw_loop due to:"; MI->dump(););
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// \brief Returns true if the instruction is dead.  This was essentially
+/// copied from DeadMachineInstructionElim::isDead, but with special cases
+/// for inline asm, physical registers and instructions with side effects
+/// removed.
+bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
+                              SmallVectorImpl<MachineInstr *> &DeadPhis) const {
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (MRI->use_nodbg_empty(Reg))
+      continue;
+
+    typedef MachineRegisterInfo::use_nodbg_iterator use_nodbg_iterator;
+
+    // This instruction has users, but if the only user is the phi node for the
+    // parent block, and the only use of that phi node is this instruction, then
+    // this instruction is dead: both it (and the phi node) can be removed.
+    use_nodbg_iterator I = MRI->use_nodbg_begin(Reg);
+    use_nodbg_iterator End = MRI->use_nodbg_end();
+    if (std::next(I) != End || !I->getParent()->isPHI())
+      return false;
+
+    MachineInstr *OnePhi = I->getParent();
+    for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
+      const MachineOperand &OPO = OnePhi->getOperand(j);
+      if (!OPO.isReg() || !OPO.isDef())
+        continue;
+
+      unsigned OPReg = OPO.getReg();
+      use_nodbg_iterator nextJ;
+      for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg);
+           J != End; J = nextJ) {
+        nextJ = std::next(J);
+        MachineOperand &Use = *J;
+        MachineInstr *UseMI = Use.getParent();
+
+        // If the phi node has a user that is not MI, bail.
+        if (MI != UseMI)
+          return false;
+      }
+    }
+    DeadPhis.push_back(OnePhi);
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
+  // This procedure was essentially copied from DeadMachineInstructionElim.
+
+  SmallVector<MachineInstr*, 1> DeadPhis;
+  if (isDead(MI, DeadPhis)) {
+    DEBUG(dbgs() << "HW looping will remove: " << *MI);
+
+    // It is possible that some DBG_VALUE instructions refer to this
+    // instruction.  Examine each def operand for such references;
+    // if found, mark the DBG_VALUE as undef (but don't delete it).
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I != E; I = nextI) {
+        nextI = std::next(I);  // I is invalidated by the setReg
+        MachineOperand &Use = *I;
+        MachineInstr *UseMI = I->getParent();
+        if (UseMI == MI)
+          continue;
+        if (Use.isDebug())
+          UseMI->getOperand(0).setReg(0U);
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i)
+      DeadPhis[i]->eraseFromParent();
+  }
+}
+
+/// \brief Check if the loop is a candidate for converting to a hardware
+/// loop.  If so, then perform the transformation.
+///
+/// This function works on innermost loops first.  A loop can be converted
+/// if it is a counting loop; either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation of the loop
+/// in llvm.
+bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
+                                                 bool &RecL0used,
+                                                 bool &RecL1used) {
+  // This is just for sanity.
+  assert(L->getHeader() && "Loop without a header?");
+
+  bool Changed = false;
+  bool L0Used = false;
+  bool L1Used = false;
+
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    Changed |= convertToHardwareLoop(*I, RecL0used, RecL1used);
+    L0Used |= RecL0used;
+    L1Used |= RecL1used;
+  }
+
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed && L0Used && L1Used)
+    return Changed;
+
+  unsigned LOOP_i;
+  unsigned LOOP_r;
+  unsigned ENDLOOP;
+
+  // Flag used to track loopN instruction:
+  // 1 - Hardware loop is being generated for the inner most loop.
+  // 0 - Hardware loop is being generated for the outer loop.
+  unsigned IsInnerHWLoop = 1;
+
+  if (L0Used) {
+    LOOP_i = Hexagon::J2_loop1i;
+    LOOP_r = Hexagon::J2_loop1r;
+    ENDLOOP = Hexagon::ENDLOOP1;
+    IsInnerHWLoop = 0;
+  } else {
+    LOOP_i = Hexagon::J2_loop0i;
+    LOOP_r = Hexagon::J2_loop0r;
+    ENDLOOP = Hexagon::ENDLOOP0;
+  }
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = HWLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= HWLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L, IsInnerHWLoop))
+    return false;
+
+  MachineBasicBlock *LastMBB = L->findLoopControlBlock();
+  // Don't generate hw loop if the loop has more than one exit.
+  if (!LastMBB)
+    return false;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI == LastMBB->end())
+    return false;
+
+  // Is the induction variable bump feeding the latch condition?
+  if (!fixupInductionVariable(L))
+    return false;
+
+  // Ensure the loop has a preheader: the loop instruction will be
+  // placed there.
+  MachineBasicBlock *Preheader = MLI->findLoopPreheader(L, SpecPreheader);
+  if (!Preheader) {
+    Preheader = createPreheaderForLoop(L);
+    if (!Preheader)
+      return false;
+  }
+
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  SmallVector<MachineInstr*, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getLoopTripCount(L, OldInsts);
+  if (!TripCount)
+    return false;
+
+  // Is the trip count available in the preheader?
+  if (TripCount->isReg()) {
+    // There will be a use of the register inserted into the preheader,
+    // so make sure that the register is actually defined at that point.
+    MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg());
+    MachineBasicBlock *BBDef = TCDef->getParent();
+    if (!MDT->dominates(BBDef, Preheader))
+      return false;
+  }
+
+  // Determine the loop start.
+  MachineBasicBlock *TopBlock = L->getTopBlock();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  MachineBasicBlock *LoopStart = nullptr;
+  if (ExitingBlock !=  L->getLoopLatch()) {
+    MachineBasicBlock *TB = nullptr, *FB = nullptr;
+    SmallVector<MachineOperand, 2> Cond;
+
+    if (TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false))
+      return false;
+
+    if (L->contains(TB))
+      LoopStart = TB;
+    else if (L->contains(FB))
+      LoopStart = FB;
+    else
+      return false;
+  }
+  else
+    LoopStart = TopBlock;
+
+  // Convert the loop to a hardware loop.
+  DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+  DebugLoc DL;
+  if (InsertPos != Preheader->end())
+    DL = InsertPos->getDebugLoc();
+
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
+    unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+    BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
+      .addReg(TripCount->getReg(), 0, TripCount->getSubReg());
+    // Add the Loop instruction to the beginning of the loop.
+    BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)).addMBB(LoopStart)
+      .addReg(CountReg);
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Add the Loop immediate instruction to the beginning of the loop,
+    // if the immediate fits in the instructions.  Otherwise, we need to
+    // create a new virtual register.
+    int64_t CountImm = TripCount->getImm();
+    if (!TII->isValidOffset(LOOP_i, CountImm)) {
+      unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg)
+        .addImm(CountImm);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r))
+        .addMBB(LoopStart).addReg(CountReg);
+    } else
+      BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_i))
+        .addMBB(LoopStart).addImm(CountImm);
+  }
+
+  // Make sure the loop start always has a reference in the CFG.  We need
+  // to create a BlockAddress operand to get this mechanism to work both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with an endloop instruction.
+  DebugLoc LastIDL = LastI->getDebugLoc();
+  BuildMI(*LastMBB, LastI, LastIDL, TII->get(ENDLOOP)).addMBB(LoopStart);
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  if (LastI->getOpcode() == Hexagon::J2_jumpt ||
+      LastI->getOpcode() == Hexagon::J2_jumpf) {
+    // Delete one and change/add an uncond. branch to out of the loop.
+    MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB();
+    LastI = LastMBB->erase(LastI);
+    if (!L->contains(BranchTarget)) {
+      if (LastI != LastMBB->end())
+        LastI = LastMBB->erase(LastI);
+      SmallVector<MachineOperand, 0> Cond;
+      TII->insertBranch(*LastMBB, BranchTarget, nullptr, Cond, LastIDL);
+    }
+  } else {
+    // Conditional branch to loop start; just delete it.
+    LastMBB->erase(LastI);
+  }
+  delete TripCount;
+
+  // The induction operation and the comparison may now be
+  // unneeded. If these are unneeded, then remove them.
+  for (unsigned i = 0; i < OldInsts.size(); ++i)
+    removeIfDead(OldInsts[i]);
+
+  ++NumHWLoops;
+
+  // Set RecL1used and RecL0used only after hardware loop has been
+  // successfully generated. Doing it earlier can cause wrong loop instruction
+  // to be used.
+  if (L0Used) // Loop0 was already used. So, the correct loop must be loop1.
+    RecL1used = true;
+  else
+    RecL0used = true;
+
+  return true;
+}
+
+bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
+                                            MachineInstr *CmpI) {
+  assert (BumpI != CmpI && "Bump and compare in the same instruction?");
+
+  MachineBasicBlock *BB = BumpI->getParent();
+  if (CmpI->getParent() != BB)
+    return false;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  // Check if things are in order to begin with.
+  for (instr_iterator I(BumpI), E = BB->instr_end(); I != E; ++I)
+    if (&*I == CmpI)
+      return true;
+
+  // Out of order.
+  unsigned PredR = CmpI->getOperand(0).getReg();
+  bool FoundBump = false;
+  instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt);
+  for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
+    MachineInstr *In = &*I;
+    for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) {
+      MachineOperand &MO = In->getOperand(i);
+      if (MO.isReg() && MO.isUse()) {
+        if (MO.getReg() == PredR)  // Found an intervening use of PredR.
+          return false;
+      }
+    }
+
+    if (In == BumpI) {
+      BB->splice(++BumpI->getIterator(), BB, CmpI->getIterator());
+      FoundBump = true;
+      break;
+    }
+  }
+  assert (FoundBump && "Cannot determine instruction order");
+  return FoundBump;
+}
+
+/// This function is required to break recursion. Visiting phis in a loop may
+/// result in recursion during compilation. We break the recursion by making
+/// sure that we visit a MachineOperand and its definition in a
+/// MachineInstruction only once. If we attempt to visit more than once, then
+/// there is recursion, and will return false.
+bool HexagonHardwareLoops::isLoopFeeder(MachineLoop *L, MachineBasicBlock *A,
+                                        MachineInstr *MI,
+                                        const MachineOperand *MO,
+                                        LoopFeederMap &LoopFeederPhi) const {
+  if (LoopFeederPhi.find(MO->getReg()) == LoopFeederPhi.end()) {
+    const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
+    DEBUG(dbgs() << "\nhw_loop head, BB#" << Blocks[0]->getNumber(););
+    // Ignore all BBs that form Loop.
+    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+      MachineBasicBlock *MBB = Blocks[i];
+      if (A == MBB)
+        return false;
+    }
+    MachineInstr *Def = MRI->getVRegDef(MO->getReg());
+    LoopFeederPhi.insert(std::make_pair(MO->getReg(), Def));
+    return true;
+  } else
+    // Already visited node.
+    return false;
+}
+
+/// Return true if a Phi may generate a value that can underflow.
+/// This function calls loopCountMayWrapOrUnderFlow for each Phi operand.
+bool HexagonHardwareLoops::phiMayWrapOrUnderflow(
+    MachineInstr *Phi, const MachineOperand *EndVal, MachineBasicBlock *MBB,
+    MachineLoop *L, LoopFeederMap &LoopFeederPhi) const {
+  assert(Phi->isPHI() && "Expecting a Phi.");
+  // Walk through each Phi, and its used operands. Make sure that
+  // if there is recursion in Phi, we won't generate hardware loops.
+  for (int i = 1, n = Phi->getNumOperands(); i < n; i += 2)
+    if (isLoopFeeder(L, MBB, Phi, &(Phi->getOperand(i)), LoopFeederPhi))
+      if (loopCountMayWrapOrUnderFlow(&(Phi->getOperand(i)), EndVal,
+                                      Phi->getParent(), L, LoopFeederPhi))
+        return true;
+  return false;
+}
+
+/// Return true if the induction variable can underflow in the first iteration.
+/// An example, is an initial unsigned value that is 0 and is decrement in the
+/// first itertion of a do-while loop.  In this case, we cannot generate a
+/// hardware loop because the endloop instruction does not decrement the loop
+/// counter if it is <= 1. We only need to perform this analysis if the
+/// initial value is a register.
+///
+/// This function assumes the initial value may underfow unless proven
+/// otherwise. If the type is signed, then we don't care because signed
+/// underflow is undefined. We attempt to prove the initial value is not
+/// zero by perfoming a crude analysis of the loop counter. This function
+/// checks if the initial value is used in any comparison prior to the loop
+/// and, if so, assumes the comparison is a range check. This is inexact,
+/// but will catch the simple cases.
+bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
+    const MachineOperand *InitVal, const MachineOperand *EndVal,
+    MachineBasicBlock *MBB, MachineLoop *L,
+    LoopFeederMap &LoopFeederPhi) const {
+  // Only check register values since they are unknown.
+  if (!InitVal->isReg())
+    return false;
+
+  if (!EndVal->isImm())
+    return false;
+
+  // A register value that is assigned an immediate is a known value, and it
+  // won't underflow in the first iteration.
+  int64_t Imm;
+  if (checkForImmediate(*InitVal, Imm))
+    return (EndVal->getImm() == Imm);
+
+  unsigned Reg = InitVal->getReg();
+
+  // We don't know the value of a physical register.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return true;
+
+  MachineInstr *Def = MRI->getVRegDef(Reg);
+  if (!Def)
+    return true;
+
+  // If the initial value is a Phi or copy and the operands may not underflow,
+  // then the definition cannot be underflow either.
+  if (Def->isPHI() && !phiMayWrapOrUnderflow(Def, EndVal, Def->getParent(),
+                                             L, LoopFeederPhi))
+    return false;
+  if (Def->isCopy() && !loopCountMayWrapOrUnderFlow(&(Def->getOperand(1)),
+                                                    EndVal, Def->getParent(),
+                                                    L, LoopFeederPhi))
+    return false;
+
+  // Iterate over the uses of the initial value. If the initial value is used
+  // in a compare, then we assume this is a range check that ensures the loop
+  // doesn't underflow. This is not an exact test and should be improved.
+  for (MachineRegisterInfo::use_instr_nodbg_iterator I = MRI->use_instr_nodbg_begin(Reg),
+         E = MRI->use_instr_nodbg_end(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    unsigned CmpReg1 = 0, CmpReg2 = 0;
+    int CmpMask = 0, CmpValue = 0;
+
+    if (!TII->analyzeCompare(*MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
+      continue;
+
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    SmallVector<MachineOperand, 2> Cond;
+    if (TII->analyzeBranch(*MI->getParent(), TBB, FBB, Cond, false))
+      continue;
+
+    Comparison::Kind Cmp =
+        getComparisonKind(MI->getOpcode(), nullptr, nullptr, 0);
+    if (Cmp == 0)
+      continue;
+    if (TII->predOpcodeHasNot(Cond) ^ (TBB != MBB))
+      Cmp = Comparison::getNegatedComparison(Cmp);
+    if (CmpReg2 != 0 && CmpReg2 == Reg)
+      Cmp = Comparison::getSwappedComparison(Cmp);
+
+    // Signed underflow is undefined.
+    if (Comparison::isSigned(Cmp))
+      return false;
+
+    // Check if there is a comparison of the initial value. If the initial value
+    // is greater than or not equal to another value, then assume this is a
+    // range check.
+    if ((Cmp & Comparison::G) || Cmp == Comparison::NE)
+      return false;
+  }
+
+  // OK - this is a hack that needs to be improved. We really need to analyze
+  // the instructions performed on the initial value. This works on the simplest
+  // cases only.
+  if (!Def->isCopy() && !Def->isPHI())
+    return false;
+
+  return true;
+}
+
+bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
+                                             int64_t &Val) const {
+  if (MO.isImm()) {
+    Val = MO.getImm();
+    return true;
+  }
+  if (!MO.isReg())
+    return false;
+
+  // MO is a register. Check whether it is defined as an immediate value,
+  // and if so, get the value of it in TV. That value will then need to be
+  // processed to handle potential subregisters in MO.
+  int64_t TV;
+
+  unsigned R = MO.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(R))
+    return false;
+  MachineInstr *DI = MRI->getVRegDef(R);
+  unsigned DOpc = DI->getOpcode();
+  switch (DOpc) {
+    case TargetOpcode::COPY:
+    case Hexagon::A2_tfrsi:
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST32:
+    case Hexagon::CONST64: {
+      // Call recursively to avoid an extra check whether operand(1) is
+      // indeed an immediate (it could be a global address, for example),
+      // plus we can handle COPY at the same time.
+      if (!checkForImmediate(DI->getOperand(1), TV))
+        return false;
+      break;
+    }
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineii:
+    case Hexagon::A4_combineri:
+    case Hexagon::A2_combinew: {
+      const MachineOperand &S1 = DI->getOperand(1);
+      const MachineOperand &S2 = DI->getOperand(2);
+      int64_t V1, V2;
+      if (!checkForImmediate(S1, V1) || !checkForImmediate(S2, V2))
+        return false;
+      TV = V2 | (V1 << 32);
+      break;
+    }
+    case TargetOpcode::REG_SEQUENCE: {
+      const MachineOperand &S1 = DI->getOperand(1);
+      const MachineOperand &S3 = DI->getOperand(3);
+      int64_t V1, V3;
+      if (!checkForImmediate(S1, V1) || !checkForImmediate(S3, V3))
+        return false;
+      unsigned Sub2 = DI->getOperand(2).getImm();
+      unsigned Sub4 = DI->getOperand(4).getImm();
+      if (Sub2 == Hexagon::isub_lo && Sub4 == Hexagon::isub_hi)
+        TV = V1 | (V3 << 32);
+      else if (Sub2 == Hexagon::isub_hi && Sub4 == Hexagon::isub_lo)
+        TV = V3 | (V1 << 32);
+      else
+        llvm_unreachable("Unexpected form of REG_SEQUENCE");
+      break;
+    }
+
+    default:
+      return false;
+  }
+
+  // By now, we should have successfully obtained the immediate value defining
+  // the register referenced in MO. Handle a potential use of a subregister.
+  switch (MO.getSubReg()) {
+    case Hexagon::isub_lo:
+      Val = TV & 0xFFFFFFFFULL;
+      break;
+    case Hexagon::isub_hi:
+      Val = (TV >> 32) & 0xFFFFFFFFULL;
+      break;
+    default:
+      Val = TV;
+      break;
+  }
+  return true;
+}
+
+void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
+  if (MO.isImm()) {
+    MO.setImm(Val);
+    return;
+  }
+
+  assert(MO.isReg());
+  unsigned R = MO.getReg();
+  MachineInstr *DI = MRI->getVRegDef(R);
+
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  unsigned NewR = MRI->createVirtualRegister(RC);
+  MachineBasicBlock &B = *DI->getParent();
+  DebugLoc DL = DI->getDebugLoc();
+  BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR).addImm(Val);
+  MO.setReg(NewR);
+}
+
+static bool isImmValidForOpcode(unsigned CmpOpc, int64_t Imm) {
+  // These two instructions are not extendable.
+  if (CmpOpc == Hexagon::A4_cmpbeqi)
+    return isUInt<8>(Imm);
+  if (CmpOpc == Hexagon::A4_cmpbgti)
+    return isInt<8>(Imm);
+  // The rest of the comparison-with-immediate instructions are extendable.
+  return true;
+}
+
+bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+
+  if (!(Header && Latch && ExitingBlock))
+    return false;
+
+  // These data structures follow the same concept as the corresponding
+  // ones in findInductionRegister (where some comments are).
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+  typedef std::pair<unsigned,RegisterBump> RegisterInduction;
+  typedef std::set<RegisterInduction> RegisterInductionSet;
+
+  // Register candidates for induction variables, with their associated bumps.
+  RegisterInductionSet IndRegs;
+
+  // Look for induction patterns:
+  //   vreg1 = PHI ..., [ latch, vreg2 ]
+  //   vreg2 = ADD vreg1, imm
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiReg);
+
+      if (DI->getDesc().isAdd()) {
+        // If the register operand to the add/sub is the PHI we are looking
+        // at, this meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        MachineOperand &Opnd2 = DI->getOperand(2);
+        int64_t V;
+        if (MRI->getVRegDef(IndReg) == Phi && checkForImmediate(Opnd2, V)) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  if (IndRegs.empty())
+    return false;
+
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  SmallVector<MachineOperand,2> Cond;
+  // AnalyzeBranch returns true if it fails to analyze branch.
+  bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
+  if (NotAnalyzed || Cond.empty())
+    return false;
+
+  if (ExitingBlock != Latch && (TB == Latch || FB == Latch)) {
+    MachineBasicBlock *LTB = nullptr, *LFB = nullptr;
+    SmallVector<MachineOperand,2> LCond;
+    bool NotAnalyzed = TII->analyzeBranch(*Latch, LTB, LFB, LCond, false);
+    if (NotAnalyzed)
+      return false;
+
+    // Since latch is not the exiting block, the latch branch should be an
+    // unconditional branch to the loop header.
+    if (TB == Latch)
+      TB = (LTB == Header) ? LTB : LFB;
+    else
+      FB = (LTB == Header) ? LTB : LFB;
+  }
+  if (TB != Header) {
+    if (FB != Header) {
+      // The latch/exit block does not go back to the header.
+      return false;
+    }
+    // FB is the header (i.e., uncond. jump to branch header)
+    // In this case, the LoopBody -> TB should not be a back edge otherwise
+    // it could result in an infinite loop after conversion to hw_loop.
+    // This case can happen when the Latch has two jumps like this:
+    // Jmp_c OuterLoopHeader <-- TB
+    // Jmp   InnerLoopHeader <-- FB
+    if (MDT->dominates(TB, FB))
+      return false;
+  }
+
+  // Expecting a predicate register as a condition.  It won't be a hardware
+  // predicate register at this point yet, just a vreg.
+  // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0)
+  // into Cond, followed by the predicate register.  For non-negated branches
+  // it's just the register.
+  unsigned CSz = Cond.size();
+  if (CSz != 1 && CSz != 2)
+    return false;
+
+  if (!Cond[CSz-1].isReg())
+    return false;
+
+  unsigned P = Cond[CSz-1].getReg();
+  MachineInstr *PredDef = MRI->getVRegDef(P);
+
+  if (!PredDef->isCompare())
+    return false;
+
+  SmallSet<unsigned,2> CmpRegs;
+  MachineOperand *CmpImmOp = nullptr;
+
+  // Go over all operands to the compare and look for immediate and register
+  // operands.  Assume that if the compare has a single register use and a
+  // single immediate operand, then the register is being compared with the
+  // immediate value.
+  for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+    MachineOperand &MO = PredDef->getOperand(i);
+    if (MO.isReg()) {
+      // Skip all implicit references.  In one case there was:
+      //   %vreg140<def> = FCMPUGT32_rr %vreg138, %vreg139, %USR<imp-use>
+      if (MO.isImplicit())
+        continue;
+      if (MO.isUse()) {
+        if (!isImmediate(MO)) {
+          CmpRegs.insert(MO.getReg());
+          continue;
+        }
+        // Consider the register to be the "immediate" operand.
+        if (CmpImmOp)
+          return false;
+        CmpImmOp = &MO;
+      }
+    } else if (MO.isImm()) {
+      if (CmpImmOp)    // A second immediate argument?  Confusing.  Bail out.
+        return false;
+      CmpImmOp = &MO;
+    }
+  }
+
+  if (CmpRegs.empty())
+    return false;
+
+  // Check if the compared register follows the order we want.  Fix if needed.
+  for (RegisterInductionSet::iterator I = IndRegs.begin(), E = IndRegs.end();
+       I != E; ++I) {
+    // This is a success.  If the register used in the comparison is one that
+    // we have identified as a bumped (updated) induction register, there is
+    // nothing to do.
+    if (CmpRegs.count(I->first))
+      return true;
+
+    // Otherwise, if the register being compared comes out of a PHI node,
+    // and has been recognized as following the induction pattern, and is
+    // compared against an immediate, we can fix it.
+    const RegisterBump &RB = I->second;
+    if (CmpRegs.count(RB.first)) {
+      if (!CmpImmOp) {
+        // If both operands to the compare instruction are registers, see if
+        // it can be changed to use induction register as one of the operands.
+        MachineInstr *IndI = nullptr;
+        MachineInstr *nonIndI = nullptr;
+        MachineOperand *IndMO = nullptr;
+        MachineOperand *nonIndMO = nullptr;
+
+        for (unsigned i = 1, n = PredDef->getNumOperands(); i < n; ++i) {
+          MachineOperand &MO = PredDef->getOperand(i);
+          if (MO.isReg() && MO.getReg() == RB.first) {
+            DEBUG(dbgs() << "\n DefMI(" << i << ") = "
+                         << *(MRI->getVRegDef(I->first)));
+            if (IndI)
+              return false;
+
+            IndI = MRI->getVRegDef(I->first);
+            IndMO = &MO;
+          } else if (MO.isReg()) {
+            DEBUG(dbgs() << "\n DefMI(" << i << ") = "
+                         << *(MRI->getVRegDef(MO.getReg())));
+            if (nonIndI)
+              return false;
+
+            nonIndI = MRI->getVRegDef(MO.getReg());
+            nonIndMO = &MO;
+          }
+        }
+        if (IndI && nonIndI &&
+            nonIndI->getOpcode() == Hexagon::A2_addi &&
+            nonIndI->getOperand(2).isImm() &&
+            nonIndI->getOperand(2).getImm() == - RB.second) {
+          bool Order = orderBumpCompare(IndI, PredDef);
+          if (Order) {
+            IndMO->setReg(I->first);
+            nonIndMO->setReg(nonIndI->getOperand(1).getReg());
+            return true;
+          }
+        }
+        return false;
+      }
+
+      // It is not valid to do this transformation on an unsigned comparison
+      // because it may underflow.
+      Comparison::Kind Cmp =
+          getComparisonKind(PredDef->getOpcode(), nullptr, nullptr, 0);
+      if (!Cmp || Comparison::isUnsigned(Cmp))
+        return false;
+
+      // If the register is being compared against an immediate, try changing
+      // the compare instruction to use induction register and adjust the
+      // immediate operand.
+      int64_t CmpImm = getImmediate(*CmpImmOp);
+      int64_t V = RB.second;
+      // Handle Overflow (64-bit).
+      if (((V > 0) && (CmpImm > INT64_MAX - V)) ||
+          ((V < 0) && (CmpImm < INT64_MIN - V)))
+        return false;
+      CmpImm += V;
+      // Most comparisons of register against an immediate value allow
+      // the immediate to be constant-extended. There are some exceptions
+      // though. Make sure the new combination will work.
+      if (CmpImmOp->isImm())
+        if (!isImmValidForOpcode(PredDef->getOpcode(), CmpImm))
+          return false;
+
+      // Make sure that the compare happens after the bump.  Otherwise,
+      // after the fixup, the compare would use a yet-undefined register.
+      MachineInstr *BumpI = MRI->getVRegDef(I->first);
+      bool Order = orderBumpCompare(BumpI, PredDef);
+      if (!Order)
+        return false;
+
+      // Finally, fix the compare instruction.
+      setImmediate(*CmpImmOp, CmpImm);
+      for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+        MachineOperand &MO = PredDef->getOperand(i);
+        if (MO.isReg() && MO.getReg() == RB.first) {
+          MO.setReg(I->first);
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+/// createPreheaderForLoop - Create a preheader for a given loop.
+MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
+      MachineLoop *L) {
+  if (MachineBasicBlock *TmpPH = MLI->findLoopPreheader(L, SpecPreheader))
+    return TmpPH;
+  if (!HWCreatePreheader)
+    return nullptr;
+
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineBasicBlock *ExitingBlock = L->findLoopControlBlock();
+  MachineFunction *MF = Header->getParent();
+  DebugLoc DL;
+
+#ifndef NDEBUG
+  if ((PHFn != "") && (PHFn != MF->getName()))
+    return nullptr;
+#endif
+
+  if (!Latch || !ExitingBlock || Header->hasAddressTaken())
+    return nullptr;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+
+  // Verify that all existing predecessors have analyzable branches
+  // (or no branches at all).
+  typedef std::vector<MachineBasicBlock*> MBBVector;
+  MBBVector Preds(Header->pred_begin(), Header->pred_end());
+  SmallVector<MachineOperand,2> Tmp1;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+
+  if (TII->analyzeBranch(*ExitingBlock, TB, FB, Tmp1, false))
+    return nullptr;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp1, false);
+    if (NotAnalyzed)
+      return nullptr;
+  }
+
+  MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock();
+  MF->insert(Header->getIterator(), NewPH);
+
+  if (Header->pred_size() > 2) {
+    // Ensure that the header has only two predecessors: the preheader and
+    // the loop latch.  Any additional predecessors of the header should
+    // join at the newly created preheader. Inspect all PHI nodes from the
+    // header and create appropriate corresponding PHI nodes in the preheader.
+
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+
+      const MCInstrDesc &PD = TII->get(TargetOpcode::PHI);
+      MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL);
+      NewPH->insert(NewPH->end(), NewPN);
+
+      unsigned PR = PN->getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(PR);
+      unsigned NewPR = MRI->createVirtualRegister(RC);
+      NewPN->addOperand(MachineOperand::CreateReg(NewPR, true));
+
+      // Copy all non-latch operands of a header's PHI node to the newly
+      // created PHI node in the preheader.
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        unsigned PredR = PN->getOperand(i).getReg();
+        unsigned PredRSub = PN->getOperand(i).getSubReg();
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB == Latch)
+          continue;
+
+        MachineOperand MO = MachineOperand::CreateReg(PredR, false);
+        MO.setSubReg(PredRSub);
+        NewPN->addOperand(MO);
+        NewPN->addOperand(MachineOperand::CreateMBB(PredB));
+      }
+
+      // Remove copied operands from the old PHI node and add the value
+      // coming from the preheader's PHI.
+      for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB != Latch) {
+          PN->RemoveOperand(i+1);
+          PN->RemoveOperand(i);
+        }
+      }
+      PN->addOperand(MachineOperand::CreateReg(NewPR, false));
+      PN->addOperand(MachineOperand::CreateMBB(NewPH));
+    }
+  } else {
+    assert(Header->pred_size() == 2);
+
+    // The header has only two predecessors, but the non-latch predecessor
+    // is not a preheader (e.g. it has other successors, etc.)
+    // In such a case we don't need any extra PHI nodes in the new preheader,
+    // all we need is to adjust existing PHIs in the header to now refer to
+    // the new preheader.
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        MachineOperand &MO = PN->getOperand(i+1);
+        if (MO.getMBB() != Latch)
+          MO.setMBB(NewPH);
+      }
+    }
+  }
+
+  // "Reroute" the CFG edges to link in the new preheader.
+  // If any of the predecessors falls through to the header, insert a branch
+  // to the new preheader in that place.
+  SmallVector<MachineOperand,1> Tmp2;
+  SmallVector<MachineOperand,1> EmptyCond;
+
+  TB = FB = nullptr;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    if (PB != Latch) {
+      Tmp2.clear();
+      bool NotAnalyzed = TII->analyzeBranch(*PB, TB, FB, Tmp2, false);
+      (void)NotAnalyzed; // suppress compiler warning
+      assert (!NotAnalyzed && "Should be analyzable!");
+      if (TB != Header && (Tmp2.empty() || FB != Header))
+        TII->insertBranch(*PB, NewPH, nullptr, EmptyCond, DL);
+      PB->ReplaceUsesOfBlockWith(Header, NewPH);
+    }
+  }
+
+  // It can happen that the latch block will fall through into the header.
+  // Insert an unconditional branch to the header.
+  TB = FB = nullptr;
+  bool LatchNotAnalyzed = TII->analyzeBranch(*Latch, TB, FB, Tmp2, false);
+  (void)LatchNotAnalyzed; // suppress compiler warning
+  assert (!LatchNotAnalyzed && "Should be analyzable!");
+  if (!TB && !FB)
+    TII->insertBranch(*Latch, Header, nullptr, EmptyCond, DL);
+
+  // Finally, the branch from the preheader to the header.
+  TII->insertBranch(*NewPH, Header, nullptr, EmptyCond, DL);
+  NewPH->addSuccessor(Header);
+
+  MachineLoop *ParentLoop = L->getParentLoop();
+  if (ParentLoop)
+    ParentLoop->addBasicBlockToLoop(NewPH, MLI->getBase());
+
+  // Update the dominator information with the new preheader.
+  if (MDT) {
+    if (MachineDomTreeNode *HN = MDT->getNode(Header)) {
+      if (MachineDomTreeNode *DHN = HN->getIDom()) {
+        MDT->addNewBlock(NewPH, DHN->getBlock());
+        MDT->changeImmediateDominator(Header, NewPH);
+      }
+    }
+  }
+
+  return NewPH;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
new file mode 100644
index 000000000000..036b18678709
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -0,0 +1,140 @@
+//===-- HexagonHazardRecognizer.cpp - Hexagon Post RA Hazard Recognizer ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the hazard recognizer for scheduling on Hexagon.
+// Use a DFA based hazard recognizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonHazardRecognizer.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+void HexagonHazardRecognizer::Reset() {
+  DEBUG(dbgs() << "Reset hazard recognizer\n");
+  Resources->clearResources();
+  PacketNum = 0;
+  UsesDotCur = nullptr;
+  DotCurPNum = -1;
+  RegDefs.clear();
+}
+
+ScheduleHazardRecognizer::HazardType
+HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
+  MachineInstr *MI = SU->getInstr();
+  if (!MI || TII->isZeroCost(MI->getOpcode()))
+    return NoHazard;
+
+  if (!Resources->canReserveResources(*MI)) {
+    DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
+    HazardType RetVal = Hazard;
+    if (TII->mayBeNewStore(*MI)) {
+      // Make sure the register to be stored is defined by an instruction in the
+      // packet.
+      MachineOperand &MO = MI->getOperand(MI->getNumOperands() - 1);
+      if (!MO.isReg() || RegDefs.count(MO.getReg()) == 0)
+        return Hazard;
+      // The .new store version uses different resources so check if it
+      // causes a hazard.
+      MachineFunction *MF = MI->getParent()->getParent();
+      MachineInstr *NewMI =
+        MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)),
+                               MI->getDebugLoc());
+      if (Resources->canReserveResources(*NewMI))
+        RetVal = NoHazard;
+      DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard) << "\n");
+      MF->DeleteMachineInstr(NewMI);
+    }
+    return RetVal;
+  }
+
+  if (SU == UsesDotCur && DotCurPNum != (int)PacketNum) {
+    DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", " << *MI);
+    return Hazard;
+  }
+
+  return NoHazard;
+}
+
+void HexagonHazardRecognizer::AdvanceCycle() {
+  DEBUG(dbgs() << "Advance cycle, clear state\n");
+  Resources->clearResources();
+  if (DotCurPNum != -1 && DotCurPNum != (int)PacketNum) {
+    UsesDotCur = nullptr;
+    DotCurPNum = -1;
+  }
+  PacketNum++;
+  RegDefs.clear();
+}
+
+/// If a packet contains a dot cur instruction, then we may prefer the
+/// instruction that can use the dot cur result. Or, if the use
+/// isn't scheduled in the same packet, then prefer other instructions
+/// in the subsequent packet.
+bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+  return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
+}
+
+void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+  if (!MI)
+    return;
+
+  // Keep the set of definitions for each packet, which is used to determine
+  // if a .new can be used.
+  for (const MachineOperand &MO : MI->operands())
+    if (MO.isReg() && MO.isDef() && !MO.isImplicit())
+      RegDefs.insert(MO.getReg());
+
+  if (TII->isZeroCost(MI->getOpcode()))
+    return;
+
+  if (!Resources->canReserveResources(*MI)) {
+    // It must be a .new store since other instructions must be able to be
+    // reserved at this point.
+    assert(TII->mayBeNewStore(*MI) && "Expecting .new store");
+    MachineFunction *MF = MI->getParent()->getParent();
+    MachineInstr *NewMI =
+        MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)),
+                               MI->getDebugLoc());
+    assert(Resources->canReserveResources(*NewMI));
+    Resources->reserveResources(*NewMI);
+    MF->DeleteMachineInstr(NewMI);
+  }
+  else
+    Resources->reserveResources(*MI);
+  DEBUG(dbgs() << " Add instruction " << *MI);
+
+  // When scheduling a dot cur instruction, check if there is an instruction
+  // that can use the dot cur in the same packet. If so, we'll attempt to
+  // schedule it before other instructions. We only do this if the use has
+  // the same height as the dot cur. Otherwise, we may miss scheduling an
+  // instruction with a greater height, which is more important.
+  if (TII->mayBeCurLoad(*MI))
+    for (auto &S : SU->Succs)
+      if (S.isAssignedRegDep() && S.getLatency() == 0 &&
+          SU->getHeight() == S.getSUnit()->getHeight()) {
+        UsesDotCur = S.getSUnit();
+        DotCurPNum = PacketNum;
+        break;
+      }
+  if (SU == UsesDotCur) {
+    UsesDotCur = nullptr;
+    DotCurPNum = -1;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
new file mode 100644
index 000000000000..70efcb7a9f76
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -0,0 +1,78 @@
+//===--- HexagonHazardRecognizer.h - Hexagon Post RA Hazard Recognizer ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines the hazard recognizer for scheduling on Hexagon.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+
+namespace llvm {
+
+class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
+  DFAPacketizer *Resources;
+  const HexagonInstrInfo *TII;
+  unsigned PacketNum;
+  // If the packet contains a potential dot cur instruction. This is
+  // used for the scheduling priority function.
+  SUnit *UsesDotCur;
+  // The packet number when a dor cur is emitted. If its use is not generated
+  // in the same packet, then try to wait another cycle before emitting.
+  int DotCurPNum;
+  // The set of registers defined by instructions in the current packet.
+  SmallSet<unsigned, 8> RegDefs;
+
+public:
+  HexagonHazardRecognizer(const InstrItineraryData *II,
+                          const HexagonInstrInfo *HII,
+                          const HexagonSubtarget &ST)
+    : Resources(ST.createDFAPacketizer(II)), TII(HII), PacketNum(0),
+    UsesDotCur(nullptr), DotCurPNum(-1) { }
+
+  ~HexagonHazardRecognizer() override {
+    if (Resources)
+      delete Resources;
+  }
+
+  /// This callback is invoked when a new block of instructions is about to be
+  /// scheduled. The hazard state is set to an initialized state.
+  void Reset() override;
+
+  /// Return the hazard type of emitting this node.  There are three
+  /// possible results.  Either:
+  ///  * NoHazard: it is legal to issue this instruction on this cycle.
+  ///  * Hazard: issuing this instruction would stall the machine.  If some
+  ///     other instruction is available, issue it first.
+  HazardType getHazardType(SUnit *SU, int stalls) override;
+
+  /// This callback is invoked when an instruction is emitted to be scheduled,
+  /// to advance the hazard state.
+  void EmitInstruction(SUnit *) override;
+
+  /// This callback may be invoked if getHazardType returns NoHazard. If, even
+  /// though there is no hazard, it would be better to schedule another
+  /// available instruction, this callback should return true.
+  bool ShouldPreferAnother(SUnit *) override;
+
+  /// This callback is invoked whenever the next top-down instruction to be
+  /// scheduled cannot issue in the current cycle, either because of latency
+  /// or resource conflicts.  This should increment the internal state of the
+  /// hazard recognizer so that previously "Hazard" instructions will now not
+  /// be hazards.
+  void AdvanceCycle() override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONPROFITRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
new file mode 100644
index 000000000000..f6012d29d422
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -0,0 +1,2002 @@
+//===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-isel"
+
+static
+cl::opt<bool>
+EnableAddressRebalancing("isel-rebalance-addr", cl::Hidden, cl::init(true),
+  cl::desc("Rebalance address calculation trees to improve "
+          "instruction selection"));
+
+// Rebalance only if this allows e.g. combining a GA with an offset or
+// factoring out a shift.
+static
+cl::opt<bool>
+RebalanceOnlyForOptimizations("rebalance-only-opt", cl::Hidden, cl::init(false),
+  cl::desc("Rebalance address tree only if this allows optimizations"));
+
+static
+cl::opt<bool>
+RebalanceOnlyImbalancedTrees("rebalance-only-imbal", cl::Hidden,
+  cl::init(false), cl::desc("Rebalance address tree only if it is imbalanced"));
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class HexagonDAGToDAGISel : public SelectionDAGISel {
+  const HexagonSubtarget *HST;
+  const HexagonInstrInfo *HII;
+  const HexagonRegisterInfo *HRI;
+public:
+  explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm,
+                               CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel), HST(nullptr), HII(nullptr),
+        HRI(nullptr) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Reset the subtarget each time through.
+    HST = &MF.getSubtarget<HexagonSubtarget>();
+    HII = HST->getInstrInfo();
+    HRI = HST->getRegisterInfo();
+    SelectionDAGISel::runOnMachineFunction(MF);
+    return true;
+  }
+
+  void PreprocessISelDAG() override;
+  void EmitFunctionEntryCode() override;
+
+  void Select(SDNode *N) override;
+
+  // Complex Pattern Selectors.
+  inline bool SelectAddrGA(SDValue &N, SDValue &R);
+  inline bool SelectAddrGP(SDValue &N, SDValue &R);
+  bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP);
+  bool SelectAddrFI(SDValue &N, SDValue &R);
+
+  StringRef getPassName() const override {
+    return "Hexagon DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Generate a machine instruction node corresponding to the circ/brev
+  // load intrinsic.
+  MachineSDNode *LoadInstrForLoadIntrinsic(SDNode *IntN);
+  // Given the circ/brev load intrinsic and the already generated machine
+  // instruction, generate the appropriate store (that is a part of the
+  // intrinsic's functionality).
+  SDNode *StoreInstrForLoadIntrinsic(MachineSDNode *LoadN, SDNode *IntN);
+
+  void SelectFrameIndex(SDNode *N);
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+  bool tryLoadOfLoadIntrinsic(LoadSDNode *N);
+  void SelectLoad(SDNode *N);
+  void SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl);
+  void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
+  void SelectStore(SDNode *N);
+  void SelectSHL(SDNode *N);
+  void SelectMul(SDNode *N);
+  void SelectZeroExtend(SDNode *N);
+  void SelectIntrinsicWChain(SDNode *N);
+  void SelectIntrinsicWOChain(SDNode *N);
+  void SelectConstant(SDNode *N);
+  void SelectConstantFP(SDNode *N);
+  void SelectBitcast(SDNode *N);
+
+  // Include the pieces autogenerated from the target description.
+  #include "HexagonGenDAGISel.inc"
+
+private:
+  bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src);
+  bool isOrEquivalentToAdd(const SDNode *N) const;
+  bool isAlignedMemNode(const MemSDNode *N) const;
+  bool isPositiveHalfWord(const SDNode *N) const;
+
+  SmallDenseMap<SDNode *,int> RootWeights;
+  SmallDenseMap<SDNode *,int> RootHeights;
+  SmallDenseMap<const Value *,int> GAUsesInFunction;
+  int getWeight(SDNode *N);
+  int getHeight(SDNode *N);
+  SDValue getMultiplierForSHL(SDNode *N);
+  SDValue factorOutPowerOf2(SDValue V, unsigned Power);
+  unsigned getUsesInFunction(const Value *V);
+  SDValue balanceSubTree(SDNode *N, bool Factorize = false);
+  void rebalanceAddressTrees();
+}; // end HexagonDAGToDAGISel
+}  // end anonymous namespace
+
+
+/// createHexagonISelDag - This pass converts a legalized DAG into a
+/// Hexagon-specific DAG, ready for instruction scheduling.
+///
+namespace llvm {
+FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel) {
+  return new HexagonDAGToDAGISel(TM, OptLevel);
+}
+}
+
+// Intrinsics that return a a predicate.
+static bool doesIntrinsicReturnPredicate(unsigned ID) {
+  switch (ID) {
+    default:
+      return false;
+    case Intrinsic::hexagon_C2_cmpeq:
+    case Intrinsic::hexagon_C2_cmpgt:
+    case Intrinsic::hexagon_C2_cmpgtu:
+    case Intrinsic::hexagon_C2_cmpgtup:
+    case Intrinsic::hexagon_C2_cmpgtp:
+    case Intrinsic::hexagon_C2_cmpeqp:
+    case Intrinsic::hexagon_C2_bitsset:
+    case Intrinsic::hexagon_C2_bitsclr:
+    case Intrinsic::hexagon_C2_cmpeqi:
+    case Intrinsic::hexagon_C2_cmpgti:
+    case Intrinsic::hexagon_C2_cmpgtui:
+    case Intrinsic::hexagon_C2_cmpgei:
+    case Intrinsic::hexagon_C2_cmpgeui:
+    case Intrinsic::hexagon_C2_cmplt:
+    case Intrinsic::hexagon_C2_cmpltu:
+    case Intrinsic::hexagon_C2_bitsclri:
+    case Intrinsic::hexagon_C2_and:
+    case Intrinsic::hexagon_C2_or:
+    case Intrinsic::hexagon_C2_xor:
+    case Intrinsic::hexagon_C2_andn:
+    case Intrinsic::hexagon_C2_not:
+    case Intrinsic::hexagon_C2_orn:
+    case Intrinsic::hexagon_C2_pxfer_map:
+    case Intrinsic::hexagon_C2_any8:
+    case Intrinsic::hexagon_C2_all8:
+    case Intrinsic::hexagon_A2_vcmpbeq:
+    case Intrinsic::hexagon_A2_vcmpbgtu:
+    case Intrinsic::hexagon_A2_vcmpheq:
+    case Intrinsic::hexagon_A2_vcmphgt:
+    case Intrinsic::hexagon_A2_vcmphgtu:
+    case Intrinsic::hexagon_A2_vcmpweq:
+    case Intrinsic::hexagon_A2_vcmpwgt:
+    case Intrinsic::hexagon_A2_vcmpwgtu:
+    case Intrinsic::hexagon_C2_tfrrp:
+    case Intrinsic::hexagon_S2_tstbit_i:
+    case Intrinsic::hexagon_S2_tstbit_r:
+      return true;
+  }
+}
+
+void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+  EVT LoadedVT = LD->getMemoryVT();
+  unsigned Opcode = 0;
+
+  // Check for zero extended loads. Treat any-extend loads as zero extended
+  // loads.
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD);
+  bool IsValidInc = HII->isValidAutoIncImm(LoadedVT, Inc);
+
+  assert(LoadedVT.isSimple());
+  switch (LoadedVT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    if (IsZeroExt)
+      Opcode = IsValidInc ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrub_io;
+    else
+      Opcode = IsValidInc ? Hexagon::L2_loadrb_pi : Hexagon::L2_loadrb_io;
+    break;
+  case MVT::i16:
+    if (IsZeroExt)
+      Opcode = IsValidInc ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadruh_io;
+    else
+      Opcode = IsValidInc ? Hexagon::L2_loadrh_pi : Hexagon::L2_loadrh_io;
+    break;
+  case MVT::i32:
+    Opcode = IsValidInc ? Hexagon::L2_loadri_pi : Hexagon::L2_loadri_io;
+    break;
+  case MVT::i64:
+    Opcode = IsValidInc ? Hexagon::L2_loadrd_pi : Hexagon::L2_loadrd_io;
+    break;
+  // 64B
+  case MVT::v64i8:
+  case MVT::v32i16:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (isAlignedMemNode(LD))
+      Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai;
+    else
+      Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai;
+    break;
+  // 128B
+  case MVT::v128i8:
+  case MVT::v64i16:
+  case MVT::v32i32:
+  case MVT::v16i64:
+    if (isAlignedMemNode(LD))
+      Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B
+                          : Hexagon::V6_vL32b_ai_128B;
+    else
+      Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B
+                          : Hexagon::V6_vL32Ub_ai_128B;
+    break;
+  default:
+    llvm_unreachable("Unexpected memory type in indexed load");
+  }
+
+  SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = LD->getMemOperand();
+
+  auto getExt64 = [this,ExtType] (MachineSDNode *N, const SDLoc &dl)
+        -> MachineSDNode* {
+    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD) {
+      SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+      return CurDAG->getMachineNode(Hexagon::A4_combineir, dl, MVT::i64,
+                                    Zero, SDValue(N, 0));
+    }
+    if (ExtType == ISD::SEXTLOAD)
+      return CurDAG->getMachineNode(Hexagon::A2_sxtw, dl, MVT::i64,
+                                    SDValue(N, 0));
+    return N;
+  };
+
+  //                  Loaded value   Next address   Chain
+  SDValue From[3] = { SDValue(LD,0), SDValue(LD,1), SDValue(LD,2) };
+  SDValue To[3];
+
+  EVT ValueVT = LD->getValueType(0);
+  if (ValueVT == MVT::i64 && ExtType != ISD::NON_EXTLOAD) {
+    // A load extending to i64 will actually produce i32, which will then
+    // need to be extended to i64.
+    assert(LoadedVT.getSizeInBits() <= 32);
+    ValueVT = MVT::i32;
+  }
+
+  if (IsValidInc) {
+    MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT,
+                                              MVT::i32, MVT::Other, Base,
+                                              IncV, Chain);
+    L->setMemRefs(MemOp, MemOp+1);
+    To[1] = SDValue(L, 1); // Next address.
+    To[2] = SDValue(L, 2); // Chain.
+    // Handle special case for extension to i64.
+    if (LD->getValueType(0) == MVT::i64)
+      L = getExt64(L, dl);
+    To[0] = SDValue(L, 0); // Loaded (extended) value.
+  } else {
+    SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+    MachineSDNode *L = CurDAG->getMachineNode(Opcode, dl, ValueVT, MVT::Other,
+                                              Base, Zero, Chain);
+    L->setMemRefs(MemOp, MemOp+1);
+    To[2] = SDValue(L, 1); // Chain.
+    MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+                                              Base, IncV);
+    To[1] = SDValue(A, 0); // Next address.
+    // Handle special case for extension to i64.
+    if (LD->getValueType(0) == MVT::i64)
+      L = getExt64(L, dl);
+    To[0] = SDValue(L, 0); // Loaded (extended) value.
+  }
+  ReplaceUses(From, To, 3);
+  CurDAG->RemoveDeadNode(LD);
+}
+
+
+MachineSDNode *HexagonDAGToDAGISel::LoadInstrForLoadIntrinsic(SDNode *IntN) {
+  if (IntN->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return nullptr;
+
+  SDLoc dl(IntN);
+  unsigned IntNo = cast<ConstantSDNode>(IntN->getOperand(1))->getZExtValue();
+
+  static std::map<unsigned,unsigned> LoadPciMap = {
+    { Intrinsic::hexagon_circ_ldb,  Hexagon::L2_loadrb_pci  },
+    { Intrinsic::hexagon_circ_ldub, Hexagon::L2_loadrub_pci },
+    { Intrinsic::hexagon_circ_ldh,  Hexagon::L2_loadrh_pci  },
+    { Intrinsic::hexagon_circ_lduh, Hexagon::L2_loadruh_pci },
+    { Intrinsic::hexagon_circ_ldw,  Hexagon::L2_loadri_pci  },
+    { Intrinsic::hexagon_circ_ldd,  Hexagon::L2_loadrd_pci  },
+  };
+  auto FLC = LoadPciMap.find(IntNo);
+  if (FLC != LoadPciMap.end()) {
+    SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+          IntN->getOperand(4));
+    EVT ValTy = (IntNo == Intrinsic::hexagon_circ_ldd) ? MVT::i64 : MVT::i32;
+    EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+    // Operands: { Base, Increment, Modifier, Chain }
+    auto Inc = cast<ConstantSDNode>(IntN->getOperand(5));
+    SDValue I = CurDAG->getTargetConstant(Inc->getSExtValue(), dl, MVT::i32);
+    MachineSDNode *Res = CurDAG->getMachineNode(FLC->second, dl, RTys,
+          { IntN->getOperand(2), I, SDValue(Mod,0), IntN->getOperand(0) });
+    return Res;
+  }
+
+  static std::map<unsigned,unsigned> LoadPbrMap = {
+    { Intrinsic::hexagon_brev_ldb,  Hexagon::L2_loadrb_pbr  },
+    { Intrinsic::hexagon_brev_ldub, Hexagon::L2_loadrub_pbr },
+    { Intrinsic::hexagon_brev_ldh,  Hexagon::L2_loadrh_pbr  },
+    { Intrinsic::hexagon_brev_lduh, Hexagon::L2_loadruh_pbr },
+    { Intrinsic::hexagon_brev_ldw,  Hexagon::L2_loadri_pbr  },
+    { Intrinsic::hexagon_brev_ldd,  Hexagon::L2_loadrd_pbr  },
+  };
+  auto FLB = LoadPbrMap.find(IntNo);
+  if (FLB != LoadPbrMap.end()) {
+    SDNode *Mod = CurDAG->getMachineNode(Hexagon::A2_tfrrcr, dl, MVT::i32,
+            IntN->getOperand(4));
+    EVT ValTy = (IntNo == Intrinsic::hexagon_brev_ldd) ? MVT::i64 : MVT::i32;
+    EVT RTys[] = { ValTy, MVT::i32, MVT::Other };
+    // Operands: { Base, Modifier, Chain }
+    MachineSDNode *Res = CurDAG->getMachineNode(FLB->second, dl, RTys,
+          { IntN->getOperand(2), SDValue(Mod,0), IntN->getOperand(0) });
+    return Res;
+  }
+
+  return nullptr;
+}
+
+SDNode *HexagonDAGToDAGISel::StoreInstrForLoadIntrinsic(MachineSDNode *LoadN,
+      SDNode *IntN) {
+  // The "LoadN" is just a machine load instruction. The intrinsic also
+  // involves storing it. Generate an appropriate store to the location
+  // given in the intrinsic's operand(3).
+  uint64_t F = HII->get(LoadN->getMachineOpcode()).TSFlags;
+  unsigned SizeBits = (F >> HexagonII::MemAccessSizePos) &
+                      HexagonII::MemAccesSizeMask;
+  unsigned Size = 1U << (SizeBits-1);
+
+  SDLoc dl(IntN);
+  MachinePointerInfo PI;
+  SDValue TS;
+  SDValue Loc = IntN->getOperand(3);
+
+  if (Size >= 4)
+    TS = CurDAG->getStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc, PI,
+                          Size);
+  else
+    TS = CurDAG->getTruncStore(SDValue(LoadN, 2), dl, SDValue(LoadN, 0), Loc,
+                               PI, MVT::getIntegerVT(Size * 8), Size);
+
+  SDNode *StoreN;
+  {
+    HandleSDNode Handle(TS);
+    SelectStore(TS.getNode());
+    StoreN = Handle.getValue().getNode();
+  }
+
+  // Load's results are { Loaded value, Updated pointer, Chain }
+  ReplaceUses(SDValue(IntN, 0), SDValue(LoadN, 1));
+  ReplaceUses(SDValue(IntN, 1), SDValue(StoreN, 0));
+  return StoreN;
+}
+
+bool HexagonDAGToDAGISel::tryLoadOfLoadIntrinsic(LoadSDNode *N) {
+  // The intrinsics for load circ/brev perform two operations:
+  // 1. Load a value V from the specified location, using the addressing
+  //    mode corresponding to the intrinsic.
+  // 2. Store V into a specified location. This location is typically a
+  //    local, temporary object.
+  // In many cases, the program using these intrinsics will immediately
+  // load V again from the local object. In those cases, when certain
+  // conditions are met, the last load can be removed.
+  // This function identifies and optimizes this pattern. If the pattern
+  // cannot be optimized, it returns nullptr, which will cause the load
+  // to be selected separately from the intrinsic (which will be handled
+  // in SelectIntrinsicWChain).
+
+  SDValue Ch = N->getOperand(0);
+  SDValue Loc = N->getOperand(1);
+
+  // Assume that the load and the intrinsic are connected directly with a
+  // chain:
+  //   t1: i32,ch = int.load ..., ..., ..., Loc, ...    // <-- C
+  //   t2: i32,ch = load t1:1, Loc, ...
+  SDNode *C = Ch.getNode();
+
+  if (C->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+
+  // The second load can only be eliminated if its extension type matches
+  // that of the load instruction corresponding to the intrinsic. The user
+  // can provide an address of an unsigned variable to store the result of
+  // a sign-extending intrinsic into (or the other way around).
+  ISD::LoadExtType IntExt;
+  switch (cast<ConstantSDNode>(C->getOperand(1))->getZExtValue()) {
+    case Intrinsic::hexagon_brev_ldub:
+    case Intrinsic::hexagon_brev_lduh:
+    case Intrinsic::hexagon_circ_ldub:
+    case Intrinsic::hexagon_circ_lduh:
+      IntExt = ISD::ZEXTLOAD;
+      break;
+    case Intrinsic::hexagon_brev_ldw:
+    case Intrinsic::hexagon_brev_ldd:
+    case Intrinsic::hexagon_circ_ldw:
+    case Intrinsic::hexagon_circ_ldd:
+      IntExt = ISD::NON_EXTLOAD;
+      break;
+    default:
+      IntExt = ISD::SEXTLOAD;
+      break;
+  }
+  if (N->getExtensionType() != IntExt)
+    return false;
+
+  // Make sure the target location for the loaded value in the load intrinsic
+  // is the location from which LD (or N) is loading.
+  if (C->getNumOperands() < 4 || Loc.getNode() != C->getOperand(3).getNode())
+    return false;
+
+  if (MachineSDNode *L = LoadInstrForLoadIntrinsic(C)) {
+    SDNode *S = StoreInstrForLoadIntrinsic(L, C);
+    SDValue F[] = { SDValue(N,0), SDValue(N,1), SDValue(C,0), SDValue(C,1) };
+    SDValue T[] = { SDValue(L,0), SDValue(S,0), SDValue(L,1), SDValue(S,0) };
+    ReplaceUses(F, T, array_lengthof(T));
+    // This transformation will leave the intrinsic dead. If it remains in
+    // the DAG, the selection code will see it again, but without the load,
+    // and it will generate a store that is normally required for it.
+    CurDAG->RemoveDeadNode(C);
+    return true;
+  }
+
+  return false;
+}
+
+void HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
+  SDLoc dl(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+
+  // Handle indexed loads.
+  if (AM != ISD::UNINDEXED) {
+    SelectIndexedLoad(LD, dl);
+    return;
+  }
+
+  // Handle patterns using circ/brev load intrinsics.
+  if (tryLoadOfLoadIntrinsic(LD))
+    return;
+
+  SelectCode(LD);
+}
+
+void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
+  SDValue Chain = ST->getChain();
+  SDValue Base = ST->getBasePtr();
+  SDValue Offset = ST->getOffset();
+  SDValue Value = ST->getValue();
+  // Get the constant value.
+  int32_t Inc = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+  EVT StoredVT = ST->getMemoryVT();
+  EVT ValueVT = Value.getValueType();
+
+  bool IsValidInc = HII->isValidAutoIncImm(StoredVT, Inc);
+  unsigned Opcode = 0;
+
+  assert(StoredVT.isSimple());
+  switch (StoredVT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    Opcode = IsValidInc ? Hexagon::S2_storerb_pi : Hexagon::S2_storerb_io;
+    break;
+  case MVT::i16:
+    Opcode = IsValidInc ? Hexagon::S2_storerh_pi : Hexagon::S2_storerh_io;
+    break;
+  case MVT::i32:
+    Opcode = IsValidInc ? Hexagon::S2_storeri_pi : Hexagon::S2_storeri_io;
+    break;
+  case MVT::i64:
+    Opcode = IsValidInc ? Hexagon::S2_storerd_pi : Hexagon::S2_storerd_io;
+    break;
+  // 64B
+  case MVT::v64i8:
+  case MVT::v32i16:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (isAlignedMemNode(ST))
+      Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai;
+    else
+      Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai;
+    break;
+  // 128B
+  case MVT::v128i8:
+  case MVT::v64i16:
+  case MVT::v32i32:
+  case MVT::v16i64:
+    if (isAlignedMemNode(ST))
+      Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B
+                          : Hexagon::V6_vS32b_ai_128B;
+    else
+      Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B
+                          : Hexagon::V6_vS32Ub_ai_128B;
+    break;
+  default:
+    llvm_unreachable("Unexpected memory type in indexed store");
+  }
+
+  if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) {
+    assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store");
+    Value = CurDAG->getTargetExtractSubreg(Hexagon::isub_lo,
+                                           dl, MVT::i32, Value);
+  }
+
+  SDValue IncV = CurDAG->getTargetConstant(Inc, dl, MVT::i32);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = ST->getMemOperand();
+
+  //                  Next address   Chain
+  SDValue From[2] = { SDValue(ST,0), SDValue(ST,1) };
+  SDValue To[2];
+
+  if (IsValidInc) {
+    // Build post increment store.
+    SDValue Ops[] = { Base, IncV, Value, Chain };
+    MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Other,
+                                              Ops);
+    S->setMemRefs(MemOp, MemOp + 1);
+    To[0] = SDValue(S, 0);
+    To[1] = SDValue(S, 1);
+  } else {
+    SDValue Zero = CurDAG->getTargetConstant(0, dl, MVT::i32);
+    SDValue Ops[] = { Base, Zero, Value, Chain };
+    MachineSDNode *S = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+    S->setMemRefs(MemOp, MemOp + 1);
+    To[1] = SDValue(S, 0);
+    MachineSDNode *A = CurDAG->getMachineNode(Hexagon::A2_addi, dl, MVT::i32,
+                                              Base, IncV);
+    To[0] = SDValue(A, 0);
+  }
+
+  ReplaceUses(From, To, 2);
+  CurDAG->RemoveDeadNode(ST);
+}
+
+void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
+  SDLoc dl(N);
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  ISD::MemIndexedMode AM = ST->getAddressingMode();
+
+  // Handle indexed stores.
+  if (AM != ISD::UNINDEXED) {
+    SelectIndexedStore(ST, dl);
+    return;
+  }
+
+  SelectCode(ST);
+}
+
+void HexagonDAGToDAGISel::SelectMul(SDNode *N) {
+  SDLoc dl(N);
+
+  // %conv.i = sext i32 %tmp1 to i64
+  // %conv2.i = sext i32 %add to i64
+  // %mul.i = mul nsw i64 %conv2.i, %conv.i
+  //
+  //   --- match with the following ---
+  //
+  // %mul.i = mpy (%tmp1, %add)
+  //
+
+  if (N->getValueType(0) == MVT::i64) {
+    // Shifting a i64 signed multiply.
+    SDValue MulOp0 = N->getOperand(0);
+    SDValue MulOp1 = N->getOperand(1);
+
+    SDValue OP0;
+    SDValue OP1;
+
+    // Handle sign_extend and sextload.
+    if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext0 = MulOp0.getOperand(0);
+      if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+        SelectCode(N);
+        return;
+      }
+      OP0 = Sext0;
+    } else if (MulOp0.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        SelectCode(N);
+        return;
+      }
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+      OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      SelectCode(N);
+      return;
+    }
+
+    // Same goes for the second operand.
+    if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext1 = MulOp1.getOperand(0);
+      if (Sext1.getNode()->getValueType(0) != MVT::i32) {
+        SelectCode(N);
+        return;
+      }
+      OP1 = Sext1;
+    } else if (MulOp1.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        SelectCode(N);
+        return;
+      }
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+      OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      SelectCode(N);
+      return;
+    }
+
+    // Generate a mpy instruction.
+    SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl,
+                                            MVT::i64, OP0, OP1);
+    ReplaceNode(N, Result);
+    return;
+  }
+
+  SelectCode(N);
+}
+
+void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Shl_0 = N->getOperand(0);
+  SDValue Shl_1 = N->getOperand(1);
+
+  auto Default = [this,N] () -> void { SelectCode(N); };
+
+  if (N->getValueType(0) != MVT::i32 || Shl_1.getOpcode() != ISD::Constant)
+    return Default();
+
+  // RHS is const.
+  int32_t ShlConst = cast<ConstantSDNode>(Shl_1)->getSExtValue();
+
+  if (Shl_0.getOpcode() == ISD::MUL) {
+    SDValue Mul_0 = Shl_0.getOperand(0); // Val
+    SDValue Mul_1 = Shl_0.getOperand(1); // Const
+    // RHS of mul is const.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mul_1)) {
+      int32_t ValConst = C->getSExtValue() << ShlConst;
+      if (isInt<9>(ValConst)) {
+        SDValue Val = CurDAG->getTargetConstant(ValConst, dl, MVT::i32);
+        SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
+                                                MVT::i32, Mul_0, Val);
+        ReplaceNode(N, Result);
+        return;
+      }
+    }
+    return Default();
+  }
+
+  if (Shl_0.getOpcode() == ISD::SUB) {
+    SDValue Sub_0 = Shl_0.getOperand(0); // Const 0
+    SDValue Sub_1 = Shl_0.getOperand(1); // Val
+    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Sub_0)) {
+      if (C1->getSExtValue() != 0 || Sub_1.getOpcode() != ISD::SHL)
+        return Default();
+      SDValue Shl2_0 = Sub_1.getOperand(0); // Val
+      SDValue Shl2_1 = Sub_1.getOperand(1); // Const
+      if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(Shl2_1)) {
+        int32_t ValConst = 1 << (ShlConst + C2->getSExtValue());
+        if (isInt<9>(-ValConst)) {
+          SDValue Val = CurDAG->getTargetConstant(-ValConst, dl, MVT::i32);
+          SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl,
+                                                  MVT::i32, Shl2_0, Val);
+          ReplaceNode(N, Result);
+          return;
+        }
+      }
+    }
+  }
+
+  return Default();
+}
+
+
+//
+// If there is an zero_extend followed an intrinsic in DAG (this means - the
+// result of the intrinsic is predicate); convert the zero_extend to
+// transfer instruction.
+//
+// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be
+// converted into a MUX as predicate registers defined as 1 bit in the
+// compiler. Architecture defines them as 8-bit registers.
+// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
+//
+void HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
+  SDLoc dl(N);
+
+  SDValue Op0 = N->getOperand(0);
+  EVT OpVT = Op0.getValueType();
+  unsigned OpBW = OpVT.getSizeInBits();
+
+  // Special handling for zero-extending a vector of booleans.
+  if (OpVT.isVector() && OpVT.getVectorElementType() == MVT::i1 && OpBW <= 64) {
+    SDNode *Mask = CurDAG->getMachineNode(Hexagon::C2_mask, dl, MVT::i64, Op0);
+    unsigned NE = OpVT.getVectorNumElements();
+    EVT ExVT = N->getValueType(0);
+    unsigned ES = ExVT.getScalarSizeInBits();
+    uint64_t MV = 0, Bit = 1;
+    for (unsigned i = 0; i < NE; ++i) {
+      MV |= Bit;
+      Bit <<= ES;
+    }
+    SDValue Ones = CurDAG->getTargetConstant(MV, dl, MVT::i64);
+    SDNode *OnesReg = CurDAG->getMachineNode(Hexagon::CONST64, dl,
+                                             MVT::i64, Ones);
+    if (ExVT.getSizeInBits() == 32) {
+      SDNode *And = CurDAG->getMachineNode(Hexagon::A2_andp, dl, MVT::i64,
+                                           SDValue(Mask,0), SDValue(OnesReg,0));
+      SDValue SubR = CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32);
+      ReplaceNode(N, CurDAG->getMachineNode(Hexagon::EXTRACT_SUBREG, dl, ExVT,
+                                            SDValue(And, 0), SubR));
+      return;
+    }
+    ReplaceNode(N,
+                CurDAG->getMachineNode(Hexagon::A2_andp, dl, ExVT,
+                                       SDValue(Mask, 0), SDValue(OnesReg, 0)));
+    return;
+  }
+
+  SDNode *Int = N->getOperand(0).getNode();
+  if ((Int->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
+    unsigned ID = cast<ConstantSDNode>(Int->getOperand(0))->getZExtValue();
+    if (doesIntrinsicReturnPredicate(ID)) {
+      // Now we need to differentiate target data types.
+      if (N->getValueType(0) == MVT::i64) {
+        // Convert the zero_extend to Rs = Pd followed by A2_combinew(0,Rs).
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+        SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
+                                                  MVT::i32, SDValue(Int, 0));
+        SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl,
+                                                  MVT::i32, TargetConst0);
+        SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::A2_combinew, dl,
+                                                  MVT::i64, MVT::Other,
+                                                  SDValue(Result_2, 0),
+                                                  SDValue(Result_1, 0));
+        ReplaceNode(N, Result_3);
+        return;
+      }
+      if (N->getValueType(0) == MVT::i32) {
+        // Convert the zero_extend to Rs = Pd
+        SDNode* RsPd = CurDAG->getMachineNode(Hexagon::C2_tfrpr, dl,
+                                              MVT::i32, SDValue(Int, 0));
+        ReplaceNode(N, RsPd);
+        return;
+      }
+      llvm_unreachable("Unexpected value type");
+    }
+  }
+  SelectCode(N);
+}
+
+
+//
+// Handling intrinsics for circular load and bitreverse load.
+//
+void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
+  if (MachineSDNode *L = LoadInstrForLoadIntrinsic(N)) {
+    StoreInstrForLoadIntrinsic(L, N);
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
+  SelectCode(N);
+}
+
+void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned Bits;
+  switch (IID) {
+  case Intrinsic::hexagon_S2_vsplatrb:
+    Bits = 8;
+    break;
+  case Intrinsic::hexagon_S2_vsplatrh:
+    Bits = 16;
+    break;
+  default:
+    SelectCode(N);
+    return;
+  }
+
+  SDValue V = N->getOperand(1);
+  SDValue U;
+  if (isValueExtension(V, Bits, U)) {
+    SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+                                N->getOperand(0), U);
+    ReplaceNode(N, R.getNode());
+    SelectCode(R.getNode());
+    return;
+  }
+  SelectCode(N);
+}
+
+//
+// Map floating point constant values.
+//
+void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+  SDLoc dl(N);
+  ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+  APInt A = CN->getValueAPF().bitcastToAPInt();
+  if (N->getValueType(0) == MVT::f32) {
+    SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i32);
+    ReplaceNode(N, CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::f32, V));
+    return;
+  }
+  if (N->getValueType(0) == MVT::f64) {
+    SDValue V = CurDAG->getTargetConstant(A.getZExtValue(), dl, MVT::i64);
+    ReplaceNode(N, CurDAG->getMachineNode(Hexagon::CONST64, dl, MVT::f64, V));
+    return;
+  }
+
+  SelectCode(N);
+}
+
+//
+// Map boolean values.
+//
+void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
+  if (N->getValueType(0) == MVT::i1) {
+    assert(!(cast<ConstantSDNode>(N)->getZExtValue() >> 1));
+    unsigned Opc = (cast<ConstantSDNode>(N)->getSExtValue() != 0)
+                      ? Hexagon::PS_true
+                      : Hexagon::PS_false;
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i1));
+    return;
+  }
+
+  SelectCode(N);
+}
+
+
+void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  const HexagonFrameLowering *HFI = HST->getFrameLowering();
+  int FX = cast<FrameIndexSDNode>(N)->getIndex();
+  unsigned StkA = HFI->getStackAlignment();
+  unsigned MaxA = MFI.getMaxAlignment();
+  SDValue FI = CurDAG->getTargetFrameIndex(FX, MVT::i32);
+  SDLoc DL(N);
+  SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  SDNode *R = nullptr;
+
+  // Use PS_fi when:
+  // - the object is fixed, or
+  // - there are no objects with higher-than-default alignment, or
+  // - there are no dynamically allocated objects.
+  // Otherwise, use PS_fia.
+  if (FX < 0 || MaxA <= StkA || !MFI.hasVarSizedObjects()) {
+    R = CurDAG->getMachineNode(Hexagon::PS_fi, DL, MVT::i32, FI, Zero);
+  } else {
+    auto &HMFI = *MF->getInfo<HexagonMachineFunctionInfo>();
+    unsigned AR = HMFI.getStackAlignBaseVReg();
+    SDValue CH = CurDAG->getEntryNode();
+    SDValue Ops[] = { CurDAG->getCopyFromReg(CH, DL, AR, MVT::i32), FI, Zero };
+    R = CurDAG->getMachineNode(Hexagon::PS_fia, DL, MVT::i32, Ops);
+  }
+
+  ReplaceNode(N, R);
+}
+
+
+void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT DVT = N->getValueType(0);
+  if (!SVT.isVector() || !DVT.isVector() ||
+      SVT.getVectorElementType() == MVT::i1 ||
+      DVT.getVectorElementType() == MVT::i1 ||
+      SVT.getSizeInBits() != DVT.getSizeInBits()) {
+    SelectCode(N);
+    return;
+  }
+
+  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
+  CurDAG->RemoveDeadNode(N);
+}
+
+
+void HexagonDAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  case ISD::Constant:
+    SelectConstant(N);
+    return;
+
+  case ISD::ConstantFP:
+    SelectConstantFP(N);
+    return;
+
+  case ISD::FrameIndex:
+    SelectFrameIndex(N);
+    return;
+
+  case ISD::BITCAST:
+    SelectBitcast(N);
+    return;
+
+  case ISD::SHL:
+    SelectSHL(N);
+    return;
+
+  case ISD::LOAD:
+    SelectLoad(N);
+    return;
+
+  case ISD::STORE:
+    SelectStore(N);
+    return;
+
+  case ISD::MUL:
+    SelectMul(N);
+    return;
+
+  case ISD::ZERO_EXTEND:
+    SelectZeroExtend(N);
+    return;
+
+  case ISD::INTRINSIC_W_CHAIN:
+    SelectIntrinsicWChain(N);
+    return;
+
+  case ISD::INTRINSIC_WO_CHAIN:
+    SelectIntrinsicWOChain(N);
+    return;
+  }
+
+  SelectCode(N);
+}
+
+bool HexagonDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Inp = Op, Res;
+
+  switch (ConstraintID) {
+  default:
+    return true;
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_o: // Offsetable.
+  case InlineAsm::Constraint_v: // Not offsetable.
+  case InlineAsm::Constraint_m: // Memory.
+    if (SelectAddrFI(Inp, Res))
+      OutOps.push_back(Res);
+    else
+      OutOps.push_back(Inp);
+    break;
+  }
+
+  OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+  return false;
+}
+
+
+void HexagonDAGToDAGISel::PreprocessISelDAG() {
+  SelectionDAG &DAG = *CurDAG;
+  std::vector<SDNode*> Nodes;
+  for (SDNode &Node : DAG.allnodes())
+    Nodes.push_back(&Node);
+
+  // Simplify: (or (select c x 0) z)  ->  (select c (or x z) z)
+  //           (or (select c 0 y) z)  ->  (select c z (or y z))
+  // This may not be the right thing for all targets, so do it here.
+  for (auto I : Nodes) {
+    if (I->getOpcode() != ISD::OR)
+      continue;
+
+    auto IsZero = [] (const SDValue &V) -> bool {
+      if (ConstantSDNode *SC = dyn_cast<ConstantSDNode>(V.getNode()))
+        return SC->isNullValue();
+      return false;
+    };
+    auto IsSelect0 = [IsZero] (const SDValue &Op) -> bool {
+      if (Op.getOpcode() != ISD::SELECT)
+        return false;
+      return IsZero(Op.getOperand(1)) || IsZero(Op.getOperand(2));
+    };
+
+    SDValue N0 = I->getOperand(0), N1 = I->getOperand(1);
+    EVT VT = I->getValueType(0);
+    bool SelN0 = IsSelect0(N0);
+    SDValue SOp = SelN0 ? N0 : N1;
+    SDValue VOp = SelN0 ? N1 : N0;
+
+    if (SOp.getOpcode() == ISD::SELECT && SOp.getNode()->hasOneUse()) {
+      SDValue SC = SOp.getOperand(0);
+      SDValue SX = SOp.getOperand(1);
+      SDValue SY = SOp.getOperand(2);
+      SDLoc DLS = SOp;
+      if (IsZero(SY)) {
+        SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SX, VOp);
+        SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, NewOr, VOp);
+        DAG.ReplaceAllUsesWith(I, NewSel.getNode());
+      } else if (IsZero(SX)) {
+        SDValue NewOr = DAG.getNode(ISD::OR, DLS, VT, SY, VOp);
+        SDValue NewSel = DAG.getNode(ISD::SELECT, DLS, VT, SC, VOp, NewOr);
+        DAG.ReplaceAllUsesWith(I, NewSel.getNode());
+      }
+    }
+  }
+
+  // Transform: (store ch addr (add x (add (shl y c) e)))
+  //        to: (store ch addr (add x (shl (add y d) c))),
+  // where e = (shl d c) for some integer d.
+  // The purpose of this is to enable generation of loads/stores with
+  // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+  // value c must be 0, 1 or 2.
+  for (auto I : Nodes) {
+    if (I->getOpcode() != ISD::STORE)
+      continue;
+
+    // I matched: (store ch addr Off)
+    SDValue Off = I->getOperand(2);
+    // Off needs to match: (add x (add (shl y c) (shl d c))))
+    if (Off.getOpcode() != ISD::ADD)
+      continue;
+    // Off matched: (add x T0)
+    SDValue T0 = Off.getOperand(1);
+    // T0 needs to match: (add T1 T2):
+    if (T0.getOpcode() != ISD::ADD)
+      continue;
+    // T0 matched: (add T1 T2)
+    SDValue T1 = T0.getOperand(0);
+    SDValue T2 = T0.getOperand(1);
+    // T1 needs to match: (shl y c)
+    if (T1.getOpcode() != ISD::SHL)
+      continue;
+    SDValue C = T1.getOperand(1);
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(C.getNode());
+    if (CN == nullptr)
+      continue;
+    unsigned CV = CN->getZExtValue();
+    if (CV > 2)
+      continue;
+    // T2 needs to match e, where e = (shl d c) for some d.
+    ConstantSDNode *EN = dyn_cast<ConstantSDNode>(T2.getNode());
+    if (EN == nullptr)
+      continue;
+    unsigned EV = EN->getZExtValue();
+    if (EV % (1 << CV) != 0)
+      continue;
+    unsigned DV = EV / (1 << CV);
+
+    // Replace T0 with: (shl (add y d) c)
+    SDLoc DL = SDLoc(I);
+    EVT VT = T0.getValueType();
+    SDValue D = DAG.getConstant(DV, DL, VT);
+    // NewAdd = (add y d)
+    SDValue NewAdd = DAG.getNode(ISD::ADD, DL, VT, T1.getOperand(0), D);
+    // NewShl = (shl NewAdd c)
+    SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
+    ReplaceNode(T0.getNode(), NewShl.getNode());
+  }
+
+  if (EnableAddressRebalancing) {
+    rebalanceAddressTrees();
+
+    DEBUG(
+      dbgs() << "************* SelectionDAG after preprocessing: ***********\n";
+      CurDAG->dump();
+      dbgs() << "************* End SelectionDAG after preprocessing ********\n";
+    );
+  }
+}
+
+void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
+  auto &HST = static_cast<const HexagonSubtarget&>(MF->getSubtarget());
+  auto &HFI = *HST.getFrameLowering();
+  if (!HFI.needsAligna(*MF))
+    return;
+
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineBasicBlock *EntryBB = &MF->front();
+  unsigned AR = FuncInfo->CreateReg(MVT::i32);
+  unsigned MaxA = MFI.getMaxAlignment();
+  BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::PS_aligna), AR)
+      .addImm(MaxA);
+  MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR);
+}
+
+// Match a frame index that can be used in an addressing mode.
+bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) {
+  if (N.getOpcode() != ISD::FrameIndex)
+    return false;
+  auto &HFI = *HST->getFrameLowering();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  int FX = cast<FrameIndexSDNode>(N)->getIndex();
+  if (!MFI.isFixedObjectIndex(FX) && HFI.needsAligna(*MF))
+    return false;
+  R = CurDAG->getTargetFrameIndex(FX, MVT::i32);
+  return true;
+}
+
+inline bool HexagonDAGToDAGISel::SelectAddrGA(SDValue &N, SDValue &R) {
+  return SelectGlobalAddress(N, R, false);
+}
+
+inline bool HexagonDAGToDAGISel::SelectAddrGP(SDValue &N, SDValue &R) {
+  return SelectGlobalAddress(N, R, true);
+}
+
+bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
+                                              bool UseGP) {
+  switch (N.getOpcode()) {
+  case ISD::ADD: {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    unsigned GAOpc = N0.getOpcode();
+    if (UseGP && GAOpc != HexagonISD::CONST32_GP)
+      return false;
+    if (!UseGP && GAOpc != HexagonISD::CONST32)
+      return false;
+    if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1)) {
+      SDValue Addr = N0.getOperand(0);
+      if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Addr)) {
+        if (GA->getOpcode() == ISD::TargetGlobalAddress) {
+          uint64_t NewOff = GA->getOffset() + (uint64_t)Const->getSExtValue();
+          R = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(Const),
+                                             N.getValueType(), NewOff);
+          return true;
+        }
+      }
+    }
+    break;
+  }
+  case HexagonISD::CONST32:
+    // The operand(0) of CONST32 is TargetGlobalAddress, which is what we
+    // want in the instruction.
+    if (!UseGP)
+      R = N.getOperand(0);
+    return !UseGP;
+  case HexagonISD::CONST32_GP:
+    if (UseGP)
+      R = N.getOperand(0);
+    return UseGP;
+  default:
+    return false;
+  }
+
+  return false;
+}
+
+bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
+      unsigned FromBits, SDValue &Src) {
+  unsigned Opc = Val.getOpcode();
+  switch (Opc) {
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    SDValue const &Op0 = Val.getOperand(0);
+    EVT T = Op0.getValueType();
+    if (T.isInteger() && T.getSizeInBits() == FromBits) {
+      Src = Op0;
+      return true;
+    }
+    break;
+  }
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+    if (Val.getOperand(0).getValueType().isInteger()) {
+      VTSDNode *T = cast<VTSDNode>(Val.getOperand(1));
+      if (T->getVT().getSizeInBits() == FromBits) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check if this is an AND with "FromBits" of lower bits set to 1.
+    uint64_t FromMask = (1 << FromBits) - 1;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if (C->getZExtValue() == FromMask) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+    break;
+  }
+  case ISD::OR:
+  case ISD::XOR: {
+    // OR/XOR with the lower "FromBits" bits set to 0.
+    uint64_t FromMask = (1 << FromBits) - 1;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(1);
+        return true;
+      }
+    }
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
+      if ((C->getZExtValue() & FromMask) == 0) {
+        Src = Val.getOperand(0);
+        return true;
+      }
+    }
+  }
+  default:
+    break;
+  }
+  return false;
+}
+
+
+bool HexagonDAGToDAGISel::isOrEquivalentToAdd(const SDNode *N) const {
+  assert(N->getOpcode() == ISD::OR);
+  auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  assert(C);
+
+  // Detect when "or" is used to add an offset to a stack object.
+  if (auto *FN = dyn_cast<FrameIndexSDNode>(N->getOperand(0))) {
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    unsigned A = MFI.getObjectAlignment(FN->getIndex());
+    assert(isPowerOf2_32(A));
+    int32_t Off = C->getSExtValue();
+    // If the alleged offset fits in the zero bits guaranteed by
+    // the alignment, then this or is really an add.
+    return (Off >= 0) && (((A-1) & Off) == unsigned(Off));
+  }
+  return false;
+}
+
+bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const {
+  return N->getAlignment() >= N->getMemoryVT().getStoreSize();
+}
+
+// Return true when the given node fits in a positive half word.
+bool HexagonDAGToDAGISel::isPositiveHalfWord(const SDNode *N) const {
+  if (const ConstantSDNode *CN = dyn_cast<const ConstantSDNode>(N)) {
+    int64_t V = CN->getSExtValue();
+    return V > 0 && isInt<16>(V);
+  }
+  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    const VTSDNode *VN = dyn_cast<const VTSDNode>(N->getOperand(1));
+    return VN->getVT().getSizeInBits() <= 16;
+  }
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Rebalancing of address calculation trees
+
+static bool isOpcodeHandled(const SDNode *N) {
+  switch (N->getOpcode()) {
+    case ISD::ADD:
+    case ISD::MUL:
+      return true;
+    case ISD::SHL:
+      // We only handle constant shifts because these can be easily flattened
+      // into multiplications by 2^Op1.
+      return isa<ConstantSDNode>(N->getOperand(1).getNode());
+    default:
+      return false;
+  }
+}
+
+/// \brief Return the weight of an SDNode
+int HexagonDAGToDAGISel::getWeight(SDNode *N) {
+  if (!isOpcodeHandled(N))
+    return 1;
+  assert(RootWeights.count(N) && "Cannot get weight of unseen root!");
+  assert(RootWeights[N] != -1 && "Cannot get weight of unvisited root!");
+  assert(RootWeights[N] != -2 && "Cannot get weight of RAWU'd root!");
+  return RootWeights[N];
+}
+
+int HexagonDAGToDAGISel::getHeight(SDNode *N) {
+  if (!isOpcodeHandled(N))
+    return 0;
+  assert(RootWeights.count(N) && RootWeights[N] >= 0 &&
+      "Cannot query height of unvisited/RAUW'd node!");
+  return RootHeights[N];
+}
+
+namespace {
+struct WeightedLeaf {
+  SDValue Value;
+  int Weight;
+  int InsertionOrder;
+
+  WeightedLeaf() : Value(SDValue()) { }
+
+  WeightedLeaf(SDValue Value, int Weight, int InsertionOrder) :
+    Value(Value), Weight(Weight), InsertionOrder(InsertionOrder) {
+    assert(Weight >= 0 && "Weight must be >= 0");
+  }
+
+  static bool Compare(const WeightedLeaf &A, const WeightedLeaf &B) {
+    assert(A.Value.getNode() && B.Value.getNode());
+    return A.Weight == B.Weight ?
+            (A.InsertionOrder > B.InsertionOrder) :
+            (A.Weight > B.Weight);
+  }
+};
+
+/// A specialized priority queue for WeigthedLeaves. It automatically folds
+/// constants and allows removal of non-top elements while maintaining the
+/// priority order.
+class LeafPrioQueue {
+  SmallVector<WeightedLeaf, 8> Q;
+  bool HaveConst;
+  WeightedLeaf ConstElt;
+  unsigned Opcode;
+
+public:
+  bool empty() {
+    return (!HaveConst && Q.empty());
+  }
+
+  size_t size() {
+    return Q.size() + HaveConst;
+  }
+
+  bool hasConst() {
+    return HaveConst;
+  }
+
+  const WeightedLeaf &top() {
+    if (HaveConst)
+      return ConstElt;
+    return Q.front();
+  }
+
+  WeightedLeaf pop() {
+    if (HaveConst) {
+      HaveConst = false;
+      return ConstElt;
+    }
+    std::pop_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+    return Q.pop_back_val();
+  }
+
+  void push(WeightedLeaf L, bool SeparateConst=true) {
+    if (!HaveConst && SeparateConst && isa<ConstantSDNode>(L.Value)) {
+      if (Opcode == ISD::MUL &&
+          cast<ConstantSDNode>(L.Value)->getSExtValue() == 1)
+        return;
+      if (Opcode == ISD::ADD &&
+          cast<ConstantSDNode>(L.Value)->getSExtValue() == 0)
+        return;
+
+      HaveConst = true;
+      ConstElt = L;
+    } else {
+      Q.push_back(L);
+      std::push_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+    }
+  }
+
+  /// Push L to the bottom of the queue regardless of its weight. If L is
+  /// constant, it will not be folded with other constants in the queue.
+  void pushToBottom(WeightedLeaf L) {
+    L.Weight = 1000;
+    push(L, false);
+  }
+
+  /// Search for a SHL(x, [<=MaxAmount]) subtree in the queue, return the one of
+  /// lowest weight and remove it from the queue.
+  WeightedLeaf findSHL(uint64_t MaxAmount);
+
+  WeightedLeaf findMULbyConst();
+
+  LeafPrioQueue(unsigned Opcode) :
+    HaveConst(false), Opcode(Opcode) { }
+};
+} // end anonymous namespace
+
+WeightedLeaf LeafPrioQueue::findSHL(uint64_t MaxAmount) {
+  int ResultPos;
+  WeightedLeaf Result;
+
+  for (int Pos = 0, End = Q.size(); Pos != End; ++Pos) {
+    const WeightedLeaf &L = Q[Pos];
+    const SDValue &Val = L.Value;
+    if (Val.getOpcode() != ISD::SHL ||
+        !isa<ConstantSDNode>(Val.getOperand(1)) ||
+        Val.getConstantOperandVal(1) > MaxAmount)
+      continue;
+    if (!Result.Value.getNode() || Result.Weight > L.Weight ||
+        (Result.Weight == L.Weight && Result.InsertionOrder > L.InsertionOrder))
+    {
+      Result = L;
+      ResultPos = Pos;
+    }
+  }
+
+  if (Result.Value.getNode()) {
+    Q.erase(&Q[ResultPos]);
+    std::make_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+  }
+
+  return Result;
+}
+
+WeightedLeaf LeafPrioQueue::findMULbyConst() {
+  int ResultPos;
+  WeightedLeaf Result;
+
+  for (int Pos = 0, End = Q.size(); Pos != End; ++Pos) {
+    const WeightedLeaf &L = Q[Pos];
+    const SDValue &Val = L.Value;
+    if (Val.getOpcode() != ISD::MUL ||
+        !isa<ConstantSDNode>(Val.getOperand(1)) ||
+        Val.getConstantOperandVal(1) > 127)
+      continue;
+    if (!Result.Value.getNode() || Result.Weight > L.Weight ||
+        (Result.Weight == L.Weight && Result.InsertionOrder > L.InsertionOrder))
+    {
+      Result = L;
+      ResultPos = Pos;
+    }
+  }
+
+  if (Result.Value.getNode()) {
+    Q.erase(&Q[ResultPos]);
+    std::make_heap(Q.begin(), Q.end(), WeightedLeaf::Compare);
+  }
+
+  return Result;
+}
+
+SDValue HexagonDAGToDAGISel::getMultiplierForSHL(SDNode *N) {
+  uint64_t MulFactor = 1ull << N->getConstantOperandVal(1);
+  return CurDAG->getConstant(MulFactor, SDLoc(N),
+                             N->getOperand(1).getValueType());
+}
+
+/// @returns the value x for which 2^x is a factor of Val
+static unsigned getPowerOf2Factor(SDValue Val) {
+  if (Val.getOpcode() == ISD::MUL) {
+    unsigned MaxFactor = 0;
+    for (int i = 0; i < 2; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(i));
+      if (!C)
+        continue;
+      const APInt &CInt = C->getAPIntValue();
+      if (CInt.getBoolValue())
+        MaxFactor = CInt.countTrailingZeros();
+    }
+    return MaxFactor;
+  }
+  if (Val.getOpcode() == ISD::SHL) {
+    if (!isa<ConstantSDNode>(Val.getOperand(1).getNode()))
+      return 0;
+    return (unsigned) Val.getConstantOperandVal(1);
+  }
+
+  return 0;
+}
+
+/// @returns true if V>>Amount will eliminate V's operation on its child
+static bool willShiftRightEliminate(SDValue V, unsigned Amount) {
+  if (V.getOpcode() == ISD::MUL) {
+    SDValue Ops[] = { V.getOperand(0), V.getOperand(1) };
+    for (int i = 0; i < 2; ++i)
+      if (isa<ConstantSDNode>(Ops[i].getNode()) &&
+          V.getConstantOperandVal(i) % (1ULL << Amount) == 0) {
+        uint64_t NewConst = V.getConstantOperandVal(i) >> Amount;
+        return (NewConst == 1);
+      }
+  } else if (V.getOpcode() == ISD::SHL) {
+    return (Amount == V.getConstantOperandVal(1));
+  }
+
+  return false;
+}
+
+SDValue HexagonDAGToDAGISel::factorOutPowerOf2(SDValue V, unsigned Power) {
+  SDValue Ops[] = { V.getOperand(0), V.getOperand(1) };
+  if (V.getOpcode() == ISD::MUL) {
+    for (int i=0; i < 2; ++i) {
+      if (isa<ConstantSDNode>(Ops[i].getNode()) &&
+          V.getConstantOperandVal(i) % ((uint64_t)1 << Power) == 0) {
+        uint64_t NewConst = V.getConstantOperandVal(i) >> Power;
+        if (NewConst == 1)
+          return Ops[!i];
+        Ops[i] = CurDAG->getConstant(NewConst,
+                                     SDLoc(V), V.getValueType());
+        break;
+      }
+    }
+  } else if (V.getOpcode() == ISD::SHL) {
+    uint64_t ShiftAmount = V.getConstantOperandVal(1);
+    if (ShiftAmount == Power)
+      return Ops[0];
+    Ops[1] = CurDAG->getConstant(ShiftAmount - Power,
+                                 SDLoc(V), V.getValueType());
+  }
+
+  return CurDAG->getNode(V.getOpcode(), SDLoc(V), V.getValueType(), Ops);
+}
+
+static bool isTargetConstant(const SDValue &V) {
+  return V.getOpcode() == HexagonISD::CONST32 ||
+         V.getOpcode() == HexagonISD::CONST32_GP;
+}
+
+unsigned HexagonDAGToDAGISel::getUsesInFunction(const Value *V) {
+  if (GAUsesInFunction.count(V))
+    return GAUsesInFunction[V];
+
+  unsigned Result = 0;
+  const Function *CurF = CurDAG->getMachineFunction().getFunction();
+  for (const User *U : V->users()) {
+    if (isa<Instruction>(U) &&
+        cast<Instruction>(U)->getParent()->getParent() == CurF)
+      ++Result;
+  }
+
+  GAUsesInFunction[V] = Result;
+
+  return Result;
+}
+
+/// Note - After calling this, N may be dead. It may have been replaced by a
+/// new node, so always use the returned value in place of N.
+///
+/// @returns The SDValue taking the place of N (which could be N if it is
+/// unchanged)
+SDValue HexagonDAGToDAGISel::balanceSubTree(SDNode *N, bool TopLevel) {
+  assert(RootWeights.count(N) && "Cannot balance non-root node.");
+  assert(RootWeights[N] != -2 && "This node was RAUW'd!");
+  assert(!TopLevel || N->getOpcode() == ISD::ADD);
+
+  // Return early if this node was already visited
+  if (RootWeights[N] != -1)
+    return SDValue(N, 0);
+
+  assert(isOpcodeHandled(N));
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Return early if the operands will remain unchanged or are all roots
+  if ((!isOpcodeHandled(Op0.getNode()) || RootWeights.count(Op0.getNode())) &&
+      (!isOpcodeHandled(Op1.getNode()) || RootWeights.count(Op1.getNode()))) {
+    SDNode *Op0N = Op0.getNode();
+    int Weight;
+    if (isOpcodeHandled(Op0N) && RootWeights[Op0N] == -1) {
+      Weight = getWeight(balanceSubTree(Op0N).getNode());
+      // Weight = calculateWeight(Op0N);
+    } else
+      Weight = getWeight(Op0N);
+
+    SDNode *Op1N = N->getOperand(1).getNode(); // Op1 may have been RAUWd
+    if (isOpcodeHandled(Op1N) && RootWeights[Op1N] == -1) {
+      Weight += getWeight(balanceSubTree(Op1N).getNode());
+      // Weight += calculateWeight(Op1N);
+    } else
+      Weight += getWeight(Op1N);
+
+    RootWeights[N] = Weight;
+    RootHeights[N] = std::max(getHeight(N->getOperand(0).getNode()),
+                              getHeight(N->getOperand(1).getNode())) + 1;
+
+    DEBUG(dbgs() << "--> No need to balance root (Weight=" << Weight
+                 << " Height=" << RootHeights[N] << "): ");
+    DEBUG(N->dump());
+
+    return SDValue(N, 0);
+  }
+
+  DEBUG(dbgs() << "** Balancing root node: ");
+  DEBUG(N->dump());
+
+  unsigned NOpcode = N->getOpcode();
+
+  LeafPrioQueue Leaves(NOpcode);
+  SmallVector<SDValue, 4> Worklist;
+  Worklist.push_back(SDValue(N, 0));
+
+  // SHL nodes will be converted to MUL nodes
+  if (NOpcode == ISD::SHL)
+    NOpcode = ISD::MUL;
+
+  bool CanFactorize = false;
+  WeightedLeaf Mul1, Mul2;
+  unsigned MaxPowerOf2 = 0;
+  WeightedLeaf GA;
+
+  // Do not try to factor out a shift if there is already a shift at the tip of
+  // the tree.
+  bool HaveTopLevelShift = false;
+  if (TopLevel &&
+      ((isOpcodeHandled(Op0.getNode()) && Op0.getOpcode() == ISD::SHL &&
+                        Op0.getConstantOperandVal(1) < 4) ||
+       (isOpcodeHandled(Op1.getNode()) && Op1.getOpcode() == ISD::SHL &&
+                        Op1.getConstantOperandVal(1) < 4)))
+    HaveTopLevelShift = true;
+
+  // Flatten the subtree into an ordered list of leaves; at the same time
+  // determine whether the tree is already balanced.
+  int InsertionOrder = 0;
+  SmallDenseMap<SDValue, int> NodeHeights;
+  bool Imbalanced = false;
+  int CurrentWeight = 0;
+  while (!Worklist.empty()) {
+    SDValue Child = Worklist.pop_back_val();
+
+    if (Child.getNode() != N && RootWeights.count(Child.getNode())) {
+      // CASE 1: Child is a root note
+
+      int Weight = RootWeights[Child.getNode()];
+      if (Weight == -1) {
+        Child = balanceSubTree(Child.getNode());
+        // calculateWeight(Child.getNode());
+        Weight = getWeight(Child.getNode());
+      } else if (Weight == -2) {
+        // Whoops, this node was RAUWd by one of the balanceSubTree calls we
+        // made. Our worklist isn't up to date anymore.
+        // Restart the whole process.
+        DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+        return balanceSubTree(N, TopLevel);
+      }
+
+      NodeHeights[Child] = 1;
+      CurrentWeight += Weight;
+
+      unsigned PowerOf2;
+      if (TopLevel && !CanFactorize && !HaveTopLevelShift &&
+          (Child.getOpcode() == ISD::MUL || Child.getOpcode() == ISD::SHL) &&
+          Child.hasOneUse() && (PowerOf2 = getPowerOf2Factor(Child))) {
+        // Try to identify two factorizable MUL/SHL children greedily. Leave
+        // them out of the priority queue for now so we can deal with them
+        // after.
+        if (!Mul1.Value.getNode()) {
+          Mul1 = WeightedLeaf(Child, Weight, InsertionOrder++);
+          MaxPowerOf2 = PowerOf2;
+        } else {
+          Mul2 = WeightedLeaf(Child, Weight, InsertionOrder++);
+          MaxPowerOf2 = std::min(MaxPowerOf2, PowerOf2);
+
+          // Our addressing modes can only shift by a maximum of 3
+          if (MaxPowerOf2 > 3)
+            MaxPowerOf2 = 3;
+
+          CanFactorize = true;
+        }
+      } else
+        Leaves.push(WeightedLeaf(Child, Weight, InsertionOrder++));
+    } else if (!isOpcodeHandled(Child.getNode())) {
+      // CASE 2: Child is an unhandled kind of node (e.g. constant)
+      int Weight = getWeight(Child.getNode());
+
+      NodeHeights[Child] = getHeight(Child.getNode());
+      CurrentWeight += Weight;
+
+      if (isTargetConstant(Child) && !GA.Value.getNode())
+        GA = WeightedLeaf(Child, Weight, InsertionOrder++);
+      else
+        Leaves.push(WeightedLeaf(Child, Weight, InsertionOrder++));
+    } else {
+      // CASE 3: Child is a subtree of same opcode
+      // Visit children first, then flatten.
+      unsigned ChildOpcode = Child.getOpcode();
+      assert(ChildOpcode == NOpcode ||
+             (NOpcode == ISD::MUL && ChildOpcode == ISD::SHL));
+
+      // Convert SHL to MUL
+      SDValue Op1;
+      if (ChildOpcode == ISD::SHL)
+        Op1 = getMultiplierForSHL(Child.getNode());
+      else
+        Op1 = Child->getOperand(1);
+
+      if (!NodeHeights.count(Op1) || !NodeHeights.count(Child->getOperand(0))) {
+        assert(!NodeHeights.count(Child) && "Parent visited before children?");
+        // Visit children first, then re-visit this node
+        Worklist.push_back(Child);
+        Worklist.push_back(Op1);
+        Worklist.push_back(Child->getOperand(0));
+      } else {
+        // Back at this node after visiting the children
+        if (std::abs(NodeHeights[Op1] - NodeHeights[Child->getOperand(0)]) > 1)
+          Imbalanced = true;
+
+        NodeHeights[Child] = std::max(NodeHeights[Op1],
+                                      NodeHeights[Child->getOperand(0)]) + 1;
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "--> Current height=" << NodeHeights[SDValue(N, 0)]
+               << " weight=" << CurrentWeight << " imbalanced="
+               << Imbalanced << "\n");
+
+  // Transform MUL(x, C * 2^Y) + SHL(z, Y) -> SHL(ADD(MUL(x, C), z), Y)
+  //  This factors out a shift in order to match memw(a<<Y+b).
+  if (CanFactorize && (willShiftRightEliminate(Mul1.Value, MaxPowerOf2) ||
+                       willShiftRightEliminate(Mul2.Value, MaxPowerOf2))) {
+    DEBUG(dbgs() << "--> Found common factor for two MUL children!\n");
+    int Weight = Mul1.Weight + Mul2.Weight;
+    int Height = std::max(NodeHeights[Mul1.Value], NodeHeights[Mul2.Value]) + 1;
+    SDValue Mul1Factored = factorOutPowerOf2(Mul1.Value, MaxPowerOf2);
+    SDValue Mul2Factored = factorOutPowerOf2(Mul2.Value, MaxPowerOf2);
+    SDValue Sum = CurDAG->getNode(ISD::ADD, SDLoc(N), Mul1.Value.getValueType(),
+                                  Mul1Factored, Mul2Factored);
+    SDValue Const = CurDAG->getConstant(MaxPowerOf2, SDLoc(N),
+                                        Mul1.Value.getValueType());
+    SDValue New = CurDAG->getNode(ISD::SHL, SDLoc(N), Mul1.Value.getValueType(),
+                                  Sum, Const);
+    NodeHeights[New] = Height;
+    Leaves.push(WeightedLeaf(New, Weight, Mul1.InsertionOrder));
+  } else if (Mul1.Value.getNode()) {
+    // We failed to factorize two MULs, so now the Muls are left outside the
+    // queue... add them back.
+    Leaves.push(Mul1);
+    if (Mul2.Value.getNode())
+      Leaves.push(Mul2);
+    CanFactorize = false;
+  }
+
+  // Combine GA + Constant -> GA+Offset, but only if GA is not used elsewhere
+  // and the root node itself is not used more than twice. This reduces the
+  // amount of additional constant extenders introduced by this optimization.
+  bool CombinedGA = false;
+  if (NOpcode == ISD::ADD && GA.Value.getNode() && Leaves.hasConst() &&
+      GA.Value.hasOneUse() && N->use_size() < 3) {
+    GlobalAddressSDNode *GANode =
+      cast<GlobalAddressSDNode>(GA.Value.getOperand(0));
+    ConstantSDNode *Offset = cast<ConstantSDNode>(Leaves.top().Value);
+
+    if (getUsesInFunction(GANode->getGlobal()) == 1 && Offset->hasOneUse() &&
+        getTargetLowering()->isOffsetFoldingLegal(GANode)) {
+      DEBUG(dbgs() << "--> Combining GA and offset (" << Offset->getSExtValue()
+          << "): ");
+      DEBUG(GANode->dump());
+
+      SDValue NewTGA =
+        CurDAG->getTargetGlobalAddress(GANode->getGlobal(), SDLoc(GA.Value),
+            GANode->getValueType(0),
+            GANode->getOffset() + (uint64_t)Offset->getSExtValue());
+      GA.Value = CurDAG->getNode(GA.Value.getOpcode(), SDLoc(GA.Value),
+          GA.Value.getValueType(), NewTGA);
+      GA.Weight += Leaves.top().Weight;
+
+      NodeHeights[GA.Value] = getHeight(GA.Value.getNode());
+      CombinedGA = true;
+
+      Leaves.pop(); // Remove the offset constant from the queue
+    }
+  }
+
+  if ((RebalanceOnlyForOptimizations && !CanFactorize && !CombinedGA) ||
+      (RebalanceOnlyImbalancedTrees && !Imbalanced)) {
+    RootWeights[N] = CurrentWeight;
+    RootHeights[N] = NodeHeights[SDValue(N, 0)];
+
+    return SDValue(N, 0);
+  }
+
+  // Combine GA + SHL(x, C<=31) so we will match Rx=add(#u8,asl(Rx,#U5))
+  if (NOpcode == ISD::ADD && GA.Value.getNode()) {
+    WeightedLeaf SHL = Leaves.findSHL(31);
+    if (SHL.Value.getNode()) {
+      int Height = std::max(NodeHeights[GA.Value], NodeHeights[SHL.Value]) + 1;
+      GA.Value = CurDAG->getNode(ISD::ADD, SDLoc(GA.Value),
+                                 GA.Value.getValueType(),
+                                 GA.Value, SHL.Value);
+      GA.Weight = SHL.Weight; // Specifically ignore the GA weight here
+      NodeHeights[GA.Value] = Height;
+    }
+  }
+
+  if (GA.Value.getNode())
+    Leaves.push(GA);
+
+  // If this is the top level and we haven't factored out a shift, we should try
+  // to move a constant to the bottom to match addressing modes like memw(rX+C)
+  if (TopLevel && !CanFactorize && Leaves.hasConst()) {
+    DEBUG(dbgs() << "--> Pushing constant to tip of tree.");
+    Leaves.pushToBottom(Leaves.pop());
+  }
+
+  const DataLayout &DL = CurDAG->getDataLayout();
+  const TargetLowering &TLI = *getTargetLowering();
+
+  // Rebuild the tree using Huffman's algorithm
+  while (Leaves.size() > 1) {
+    WeightedLeaf L0 = Leaves.pop();
+
+    // See whether we can grab a MUL to form an add(Rx,mpyi(Ry,#u6)),
+    // otherwise just get the next leaf
+    WeightedLeaf L1 = Leaves.findMULbyConst();
+    if (!L1.Value.getNode())
+      L1 = Leaves.pop();
+
+    assert(L0.Weight <= L1.Weight && "Priority queue is broken!");
+
+    SDValue V0 = L0.Value;
+    int V0Weight = L0.Weight;
+    SDValue V1 = L1.Value;
+    int V1Weight = L1.Weight;
+
+    // Make sure that none of these nodes have been RAUW'd
+    if ((RootWeights.count(V0.getNode()) && RootWeights[V0.getNode()] == -2) ||
+        (RootWeights.count(V1.getNode()) && RootWeights[V1.getNode()] == -2)) {
+      DEBUG(dbgs() << "--> Subtree was RAUWd. Restarting...\n");
+      return balanceSubTree(N, TopLevel);
+    }
+
+    ConstantSDNode *V0C = dyn_cast<ConstantSDNode>(V0);
+    ConstantSDNode *V1C = dyn_cast<ConstantSDNode>(V1);
+    EVT VT = N->getValueType(0);
+    SDValue NewNode;
+
+    if (V0C && !V1C) {
+      std::swap(V0, V1);
+      std::swap(V0C, V1C);
+    }
+
+    // Calculate height of this node
+    assert(NodeHeights.count(V0) && NodeHeights.count(V1) &&
+           "Children must have been visited before re-combining them!");
+    int Height = std::max(NodeHeights[V0], NodeHeights[V1]) + 1;
+
+    // Rebuild this node (and restore SHL from MUL if needed)
+    if (V1C && NOpcode == ISD::MUL && V1C->getAPIntValue().isPowerOf2())
+      NewNode = CurDAG->getNode(
+          ISD::SHL, SDLoc(V0), VT, V0,
+          CurDAG->getConstant(
+              V1C->getAPIntValue().logBase2(), SDLoc(N),
+              TLI.getScalarShiftAmountTy(DL, V0.getValueType())));
+    else
+      NewNode = CurDAG->getNode(NOpcode, SDLoc(N), VT, V0, V1);
+
+    NodeHeights[NewNode] = Height;
+
+    int Weight = V0Weight + V1Weight;
+    Leaves.push(WeightedLeaf(NewNode, Weight, L0.InsertionOrder));
+
+    DEBUG(dbgs() << "--> Built new node (Weight=" << Weight << ",Height="
+                 << Height << "):\n");
+    DEBUG(NewNode.dump());
+  }
+
+  assert(Leaves.size() == 1);
+  SDValue NewRoot = Leaves.top().Value;
+
+  assert(NodeHeights.count(NewRoot));
+  int Height = NodeHeights[NewRoot];
+
+  // Restore SHL if we earlier converted it to a MUL
+  if (NewRoot.getOpcode() == ISD::MUL) {
+    ConstantSDNode *V1C = dyn_cast<ConstantSDNode>(NewRoot.getOperand(1));
+    if (V1C && V1C->getAPIntValue().isPowerOf2()) {
+      EVT VT = NewRoot.getValueType();
+      SDValue V0 = NewRoot.getOperand(0);
+      NewRoot = CurDAG->getNode(
+          ISD::SHL, SDLoc(NewRoot), VT, V0,
+          CurDAG->getConstant(
+              V1C->getAPIntValue().logBase2(), SDLoc(NewRoot),
+              TLI.getScalarShiftAmountTy(DL, V0.getValueType())));
+    }
+  }
+
+  if (N != NewRoot.getNode()) {
+    DEBUG(dbgs() << "--> Root is now: ");
+    DEBUG(NewRoot.dump());
+
+    // Replace all uses of old root by new root
+    CurDAG->ReplaceAllUsesWith(N, NewRoot.getNode());
+    // Mark that we have RAUW'd N
+    RootWeights[N] = -2;
+  } else {
+    DEBUG(dbgs() << "--> Root unchanged.\n");
+  }
+
+  RootWeights[NewRoot.getNode()] = Leaves.top().Weight;
+  RootHeights[NewRoot.getNode()] = Height;
+
+  return NewRoot;
+}
+
+void HexagonDAGToDAGISel::rebalanceAddressTrees() {
+  for (auto I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) {
+    SDNode *N = &*I++;
+    if (N->getOpcode() != ISD::LOAD && N->getOpcode() != ISD::STORE)
+      continue;
+
+    SDValue BasePtr = cast<MemSDNode>(N)->getBasePtr();
+    if (BasePtr.getOpcode() != ISD::ADD)
+      continue;
+
+    // We've already processed this node
+    if (RootWeights.count(BasePtr.getNode()))
+      continue;
+
+    DEBUG(dbgs() << "** Rebalancing address calculation in node: ");
+    DEBUG(N->dump());
+
+    // FindRoots
+    SmallVector<SDNode *, 4> Worklist;
+
+    Worklist.push_back(BasePtr.getOperand(0).getNode());
+    Worklist.push_back(BasePtr.getOperand(1).getNode());
+
+    while (!Worklist.empty()) {
+      SDNode *N = Worklist.pop_back_val();
+      unsigned Opcode = N->getOpcode();
+
+      if (!isOpcodeHandled(N))
+        continue;
+
+      Worklist.push_back(N->getOperand(0).getNode());
+      Worklist.push_back(N->getOperand(1).getNode());
+
+      // Not a root if it has only one use and same opcode as its parent
+      if (N->hasOneUse() && Opcode == N->use_begin()->getOpcode())
+        continue;
+
+      // This root node has already been processed
+      if (RootWeights.count(N))
+        continue;
+
+      RootWeights[N] = -1;
+    }
+
+    // Balance node itself
+    RootWeights[BasePtr.getNode()] = -1;
+    SDValue NewBasePtr = balanceSubTree(BasePtr.getNode(), /*TopLevel=*/ true);
+
+    if (N->getOpcode() == ISD::LOAD)
+      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
+            NewBasePtr, N->getOperand(2));
+    else
+      N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
+            NewBasePtr, N->getOperand(3));
+
+    DEBUG(dbgs() << "--> Final node: ");
+    DEBUG(N->dump());
+  }
+
+  CurDAG->RemoveDeadNodes();
+  GAUsesInFunction.clear();
+  RootHeights.clear();
+  RootWeights.clear();
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
new file mode 100644
index 000000000000..e87e1e6a7e0f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -0,0 +1,3323 @@
+//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Hexagon uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-lowering"
+
+static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables",
+  cl::init(true), cl::Hidden,
+  cl::desc("Control jump table emission on Hexagon target"));
+
+static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enable Hexagon SDNode scheduling"));
+
+static cl::opt<bool> EnableFastMath("ffast-math",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enable Fast Math processing"));
+
+static cl::opt<int> MinimumJumpTables("minimum-jump-tables",
+  cl::Hidden, cl::ZeroOrMore, cl::init(5),
+  cl::desc("Set minimum jump tables"));
+
+static cl::opt<int> MaxStoresPerMemcpyCL("max-store-memcpy",
+  cl::Hidden, cl::ZeroOrMore, cl::init(6),
+  cl::desc("Max #stores to inline memcpy"));
+
+static cl::opt<int> MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os",
+  cl::Hidden, cl::ZeroOrMore, cl::init(4),
+  cl::desc("Max #stores to inline memcpy"));
+
+static cl::opt<int> MaxStoresPerMemmoveCL("max-store-memmove",
+  cl::Hidden, cl::ZeroOrMore, cl::init(6),
+  cl::desc("Max #stores to inline memmove"));
+
+static cl::opt<int> MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os",
+  cl::Hidden, cl::ZeroOrMore, cl::init(4),
+  cl::desc("Max #stores to inline memmove"));
+
+static cl::opt<int> MaxStoresPerMemsetCL("max-store-memset",
+  cl::Hidden, cl::ZeroOrMore, cl::init(8),
+  cl::desc("Max #stores to inline memset"));
+
+static cl::opt<int> MaxStoresPerMemsetOptSizeCL("max-store-memset-Os",
+  cl::Hidden, cl::ZeroOrMore, cl::init(4),
+  cl::desc("Max #stores to inline memset"));
+
+
+namespace {
+
+  class HexagonCCState : public CCState {
+    unsigned NumNamedVarArgParams;
+
+  public:
+    HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                   SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+                   int NumNamedVarArgParams)
+        : CCState(CC, isVarArg, MF, locs, C),
+          NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+    unsigned getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+  };
+
+  enum StridedLoadKind {
+    Even = 0,
+    Odd,
+    NoPattern
+  };
+
+} // end anonymous namespace
+
+// Implement calling convention for Hexagon.
+
+static bool isHvxVectorType(MVT ty);
+
+static bool
+CC_Hexagon(unsigned ValNo, MVT ValVT,
+           MVT LocVT, CCValAssign::LocInfo LocInfo,
+           ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon32(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon64(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_HexagonVector(unsigned ValNo, MVT ValVT,
+                 MVT LocVT, CCValAssign::LocInfo LocInfo,
+                 ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+              MVT LocVT, CCValAssign::LocInfo LocInfo,
+              ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
+                    MVT LocVT, CCValAssign::LocInfo LocInfo,
+                    ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
+            MVT LocVT, CCValAssign::LocInfo LocInfo,
+            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  HexagonCCState &HState = static_cast<HexagonCCState &>(State);
+
+  if (ValNo < HState.getNumNamedVarArgParams()) {
+    // Deal with named arguments.
+    return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+  }
+
+  // Deal with un-named arguments.
+  unsigned Offset;
+  if (ArgFlags.isByVal()) {
+    // If pass-by-value, the size allocated on stack is decided
+    // by ArgFlags.getByValSize(), not by the size of LocVT.
+    Offset = State.AllocateStack(ArgFlags.getByValSize(),
+                                 ArgFlags.getByValAlign());
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    Offset = State.AllocateStack(4, 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    Offset = State.AllocateStack(8, 8);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 ||
+      LocVT == MVT::v16i8) {
+    Offset = State.AllocateStack(16, 16);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 ||
+      LocVT == MVT::v32i8) {
+    Offset = State.AllocateStack(32, 32);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
+      LocVT == MVT::v64i8 || LocVT == MVT::v512i1) {
+    Offset = State.AllocateStack(64, 64);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+      LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) {
+    Offset = State.AllocateStack(128, 128);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
+      LocVT == MVT::v256i8) {
+    Offset = State.AllocateStack(256, 256);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+
+  llvm_unreachable(nullptr);
+}
+
+static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
+      CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (ArgFlags.isByVal()) {
+    // Passed on stack.
+    unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(),
+                                          ArgFlags.getByValAlign());
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::BCvt;
+  } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::BCvt;
+  }
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) {
+    unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+
+  if (isHvxVectorType(LocVT)) {
+    if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  return true;  // CC didn't match.
+}
+
+
+static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg RegList[] = {
+    Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+    Hexagon::R5
+  };
+  if (unsigned Reg = State.AllocateReg(RegList)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  static const MCPhysReg RegList1[] = {
+    Hexagon::D1, Hexagon::D2
+  };
+  static const MCPhysReg RegList2[] = {
+    Hexagon::R1, Hexagon::R3
+  };
+  if (unsigned Reg = State.AllocateReg(RegList1, RegList2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool CC_HexagonVector(unsigned ValNo, MVT ValVT,
+                             MVT LocVT, CCValAssign::LocInfo LocInfo,
+                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg VecLstS[] = {
+      Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4,
+      Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9,
+      Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14,
+      Hexagon::V15
+  };
+  static const MCPhysReg VecLstD[] = {
+      Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, Hexagon::W4,
+      Hexagon::W5, Hexagon::W6, Hexagon::W7
+  };
+  auto &MF = State.getMachineFunction();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  bool UseHVX = HST.useHVXOps();
+  bool UseHVXDbl = HST.useHVXDblOps();
+
+  if ((UseHVX && !UseHVXDbl) &&
+      (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 ||
+       LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) {
+    if (unsigned Reg = State.AllocateReg(VecLstS)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    unsigned Offset = State.AllocateStack(64, 64);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if ((UseHVX && !UseHVXDbl) &&
+      (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+       LocVT == MVT::v128i8)) {
+    if (unsigned Reg = State.AllocateReg(VecLstD)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    unsigned Offset = State.AllocateStack(128, 128);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  // 128B Mode
+  if ((UseHVX && UseHVXDbl) &&
+      (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 ||
+       LocVT == MVT::v256i8)) {
+    if (unsigned Reg = State.AllocateReg(VecLstD)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    unsigned Offset = State.AllocateStack(256, 256);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  if ((UseHVX && UseHVXDbl) &&
+      (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 ||
+       LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) {
+    if (unsigned Reg = State.AllocateReg(VecLstS)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    unsigned Offset = State.AllocateStack(128, 128);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+  return true;
+}
+
+static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  auto &MF = State.getMachineFunction();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  bool UseHVX = HST.useHVXOps();
+  bool UseHVXDbl = HST.useHVXDblOps();
+
+  if (LocVT == MVT::i1) {
+    // Return values of type MVT::i1 still need to be assigned to R0, but
+    // the value type needs to remain i1. LowerCallResult will deal with it,
+    // but it needs to recognize i1 as the value type.
+    LocVT = MVT::i32;
+  } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  } else if (LocVT == MVT::v4i8 || LocVT == MVT::v2i16) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::BCvt;
+  } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) {
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::BCvt;
+  } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 ||
+             LocVT == MVT::v16i32 || LocVT == MVT::v8i64 ||
+             LocVT == MVT::v512i1) {
+    LocVT = MVT::v16i32;
+    ValVT = MVT::v16i32;
+    LocInfo = CCValAssign::Full;
+  } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 ||
+             LocVT == MVT::v32i32 || LocVT == MVT::v16i64 ||
+             (LocVT == MVT::v1024i1 && UseHVX && UseHVXDbl)) {
+    LocVT = MVT::v32i32;
+    ValVT = MVT::v32i32;
+    LocInfo = CCValAssign::Full;
+  } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 ||
+             LocVT == MVT::v64i32 || LocVT == MVT::v32i64) {
+    LocVT = MVT::v64i32;
+    ValVT = MVT::v64i32;
+    LocInfo = CCValAssign::Full;
+  }
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+  if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) {
+    if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+  return true;  // CC didn't match.
+}
+
+static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    // Note that use of registers beyond R1 is not ABI compliant. However there
+    // are (experimental) IR passes which generate internal functions that
+    // return structs using these additional registers.
+    static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1,
+                                        Hexagon::R2, Hexagon::R3,
+                                        Hexagon::R4, Hexagon::R5 };
+    if (unsigned Reg = State.AllocateReg(RegList)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
+                                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                                ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  auto &MF = State.getMachineFunction();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  bool UseHVX = HST.useHVXOps();
+  bool UseHVXDbl = HST.useHVXDblOps();
+
+  unsigned OffSiz = 64;
+  if (LocVT == MVT::v16i32) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  } else if (LocVT == MVT::v32i32) {
+    unsigned Req = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0;
+    if (unsigned Reg = State.AllocateReg(Req)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    OffSiz = 128;
+  } else if (LocVT == MVT::v64i32) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+    OffSiz = 256;
+  }
+
+  unsigned Offset = State.AllocateStack(OffSiz, OffSiz);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType(ISD::LOAD, VT, PromotedLdStVT);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType(ISD::STORE, VT, PromotedLdStVT);
+  }
+}
+
+SDValue
+HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
+      const {
+  return SDValue();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.  Sometimes what we are copying is the end of a
+/// larger object, the part that does not fit in registers.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile=*/false, /*AlwaysInline=*/false,
+                       /*isTailCall=*/false,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+static bool isHvxVectorType(MVT Ty) {
+  switch (Ty.SimpleTy) {
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8:
+  case MVT::v16i64:
+  case MVT::v32i32:
+  case MVT::v64i16:
+  case MVT::v128i8:
+  case MVT::v32i64:
+  case MVT::v64i32:
+  case MVT::v128i16:
+  case MVT::v256i8:
+  case MVT::v512i1:
+  case MVT::v1024i1:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
+// passed by value, the function prototype is modified to return void and
+// the value is stored in memory pointed by a pointer passed by caller.
+SDValue
+HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   const SDLoc &dl, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  // If either no tail call or told not to tail call at all, don't.
+  auto Attr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+    return false;
+
+  return true;
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns a SDNode with the same number of values as the
+/// ISD::CALL.
+SDValue HexagonTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue RetVal;
+    if (RVLocs[i].getValVT() == MVT::i1) {
+      // Return values of type MVT::i1 require special handling. The reason
+      // is that MVT::i1 is associated with the PredRegs register class, but
+      // values of that type are still returned in R0. Generate an explicit
+      // copy into a predicate register from R0, and treat the value of the
+      // predicate register as the call result.
+      auto &MRI = DAG.getMachineFunction().getRegInfo();
+      SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                                       MVT::i32, InFlag);
+      // FR0 = (Value, Chain, Glue)
+      unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
+      SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
+                                     FR0.getValue(0), FR0.getValue(2));
+      // TPR = (Chain, Glue)
+      RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1,
+                                  TPR.getValue(1));
+    } else {
+      RetVal = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                                  RVLocs[i].getValVT(), InFlag);
+    }
+    InVals.push_back(RetVal.getValue(0));
+    Chain = RetVal.getValue(1);
+    InFlag = RetVal.getValue(2);
+  }
+
+  return Chain;
+}
+
+/// LowerCall - Functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &IsTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool IsVarArg                         = CLI.IsVarArg;
+  bool DoesNotReturn                    = CLI.DoesNotReturn;
+
+  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Check for varargs.
+  unsigned NumNamedVarArgParams = -1U;
+  if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = GAN->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+    if (const Function* F = dyn_cast<Function>(GV)) {
+      // If a function has zero args and is a vararg function, that's
+      // disallowed so it must be an undeclared function.  Do not assume
+      // varargs if the callee is undefined.
+      if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0)
+        NumNamedVarArgParams = F->getFunctionType()->getNumParams();
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  HexagonCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                        *DAG.getContext(), NumNamedVarArgParams);
+
+  if (IsVarArg)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+  else
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
+
+  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+  if (Attr.getValueAsString() == "true")
+    IsTailCall = false;
+
+  if (IsTailCall) {
+    bool StructAttrFlag = MF.getFunction()->hasStructRetAttr();
+    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                                                   IsVarArg, IsStructRet,
+                                                   StructAttrFlag,
+                                                   Outs, OutVals, Ins, DAG);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isMemLoc()) {
+        IsTailCall = false;
+        break;
+      }
+    }
+    DEBUG(dbgs() << (IsTailCall ? "Eligible for Tail Call\n"
+                                : "Argument must be passed on stack. "
+                                  "Not eligible for Tail Call\n"));
+  }
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  auto &HRI = *Subtarget.getRegisterInfo();
+  SDValue StackPtr =
+      DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
+
+  bool NeedsArgAlign = false;
+  unsigned LargestAlignSeen = 0;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    // Record if we need > 8 byte alignment on an argument.
+    bool ArgAlign = isHvxVectorType(VA.getValVT());
+    NeedsArgAlign |= ArgAlign;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default:
+        // Loc info must be one of Full, SExt, ZExt, or AExt.
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::BCvt:
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    if (VA.isMemLoc()) {
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      SDValue MemAddr = DAG.getConstant(LocMemOffset, dl,
+                                        StackPtr.getValueType());
+      MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr);
+      if (ArgAlign)
+        LargestAlignSeen = std::max(LargestAlignSeen,
+                                    VA.getLocVT().getStoreSizeInBits() >> 3);
+      if (Flags.isByVal()) {
+        // The argument is a struct passed by value. According to LLVM, "Arg"
+        // is is pointer.
+        MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain,
+                                                        Flags, DAG, dl));
+      } else {
+        MachinePointerInfo LocPI = MachinePointerInfo::getStack(
+            DAG.getMachineFunction(), LocMemOffset);
+        SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI);
+        MemOpChains.push_back(S);
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector.
+    if (VA.isRegLoc())
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+  }
+
+  if (NeedsArgAlign && Subtarget.hasV60TOps()) {
+    DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    // V6 vectors passed by value have 64 or 128 byte alignment depending
+    // on whether we are 64 byte vector mode or 128 byte.
+    bool UseHVXDbl = Subtarget.useHVXDblOps();
+    assert(Subtarget.useHVXOps());
+    const unsigned ObjAlign = UseHVXDbl ? 128 : 64;
+    LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign);
+    MFI.ensureMaxAlignment(LargestAlignSeen);
+  }
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  if (!IsTailCall) {
+    SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
+    Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The Glue is necessary since all emitted instructions must be
+  // stuck together.
+  SDValue Glue;
+  if (!IsTailCall) {
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, Glue);
+      Glue = Chain.getValue(1);
+    }
+  } else {
+    // For tail calls lower the arguments to the 'real' stack slot.
+    //
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    //
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    Glue = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, Glue);
+      Glue = Chain.getValue(1);
+    }
+    Glue = SDValue();
+  }
+
+  bool LongCalls = MF.getSubtarget<HexagonSubtarget>().useLongCalls();
+  unsigned Flags = LongCalls ? HexagonII::HMOTF_ConstExtended : 0;
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT, 0, Flags);
+  } else if (ExternalSymbolSDNode *S =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, Flags);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+  }
+
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  if (IsTailCall) {
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
+  }
+
+  unsigned OpCode = DoesNotReturn ? HexagonISD::CALLnr : HexagonISD::CALL;
+  Chain = DAG.getNode(OpCode, dl, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), Glue, dl);
+  Glue = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, Glue, CallConv, IsVarArg, Ins, dl, DAG,
+                         InVals, OutVals, Callee);
+}
+
+static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                   SDValue &Base, SDValue &Offset,
+                                   bool &IsInc, SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD)
+    return false;
+
+  auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget());
+  bool UseHVX = HST.useHVXOps();
+  bool UseHVXDbl = HST.useHVXDblOps();
+
+  bool ValidHVXDblType =
+    (UseHVX && UseHVXDbl) && (VT == MVT::v32i32 || VT == MVT::v16i64 ||
+                              VT == MVT::v64i16 || VT == MVT::v128i8);
+  bool ValidHVXType =
+    UseHVX && !UseHVXDbl && (VT == MVT::v16i32 || VT == MVT::v8i64 ||
+                             VT == MVT::v32i16 || VT == MVT::v64i8);
+
+  if (ValidHVXDblType || ValidHVXType ||
+      VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+    IsInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    // Ensure that Offset is a constant.
+    return isa<ConstantSDNode>(Offset);
+  }
+
+  return false;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                       SDValue &Base,
+                                                       SDValue &Offset,
+                                                       ISD::MemIndexedMode &AM,
+                                                       SelectionDAG &DAG) const
+{
+  EVT VT;
+  SDValue Ptr;
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore())
+      return false;
+  } else {
+    return false;
+  }
+
+  bool IsInc = false;
+  bool isLegal = getIndexedAddressParts(Op, VT, Base, Offset, IsInc, DAG);
+  if (isLegal) {
+    auto &HII = *Subtarget.getInstrInfo();
+    int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue();
+    if (HII.isValidAutoIncImm(VT, OffsetVal)) {
+      AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+SDValue
+HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+  switch (Node->getOpcode()) {
+    case ISD::INLINEASM: {
+      unsigned NumOps = Node->getNumOperands();
+      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+        --NumOps;  // Ignore the flag operand.
+
+      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+        if (FuncInfo.hasClobberLR())
+          break;
+        unsigned Flags =
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+        ++i;  // Skip the ID value.
+
+        switch (InlineAsm::getKind(Flags)) {
+        default: llvm_unreachable("Bad flags!");
+          case InlineAsm::Kind_RegDef:
+          case InlineAsm::Kind_RegUse:
+          case InlineAsm::Kind_Imm:
+          case InlineAsm::Kind_Clobber:
+          case InlineAsm::Kind_Mem: {
+            for (; NumVals; --NumVals, ++i) {}
+            break;
+          }
+          case InlineAsm::Kind_RegDefEarlyClobber: {
+            for (; NumVals; --NumVals, ++i) {
+              unsigned Reg =
+                cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+
+              // Check it to be lr
+              const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo();
+              if (Reg == QRI->getRARegister()) {
+                FuncInfo.setHasClobberLR(true);
+                break;
+              }
+            }
+            break;
+          }
+        }
+      }
+    }
+  } // Node->getOpcode
+  return Op;
+}
+
+// Need to transform ISD::PREFETCH into something that doesn't inherit
+// all of the properties of ISD::PREFETCH, specifically SDNPMayLoad and
+// SDNPMayStore.
+SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  // Lower it to DCFETCH($reg, #0).  A "pat" will try to merge the offset in,
+  // if the "reg" is fed by an "add".
+  SDLoc DL(Op);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+  return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
+}
+
+SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+      SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  // Lower the hexagon_prefetch builtin to DCFETCH, as above.
+  if (IntNo == Intrinsic::hexagon_prefetch) {
+    SDValue Addr = Op.getOperand(2);
+    SDLoc DL(Op);
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
+    return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
+  }
+  return SDValue();
+}
+
+SDValue
+HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  ConstantSDNode *AlignConst = dyn_cast<ConstantSDNode>(Align);
+  assert(AlignConst && "Non-constant Align in LowerDYNAMIC_STACKALLOC");
+
+  unsigned A = AlignConst->getSExtValue();
+  auto &HFI = *Subtarget.getFrameLowering();
+  // "Zero" means natural stack alignment.
+  if (A == 0)
+    A = HFI.getStackAlignment();
+
+  DEBUG({
+    dbgs () << __func__ << " Align: " << A << " Size: ";
+    Size.getNode()->dump(&DAG);
+    dbgs() << "\n";
+  });
+
+  SDValue AC = DAG.getConstant(A, dl, MVT::i32);
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+  SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC);
+
+  DAG.ReplaceAllUsesOfValueWith(Op, AA);
+  return AA;
+}
+
+SDValue HexagonTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+
+  // For LLVM, in the case when returning a struct by value (>8byte),
+  // the first argument is a pointer that points to the location on caller's
+  // stack where the return value will be stored. For Hexagon, the location on
+  // caller's stack is passed only when the struct size is smaller than (and
+  // equal to) 8 bytes. If not, no address will be passed into callee and
+  // callee return the result direclty through R0/R1.
+
+  SmallVector<SDValue, 8> MemOps;
+  bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    unsigned ObjSize;
+    unsigned StackLocation;
+    int FI;
+
+    if (   (VA.isRegLoc() && !Flags.isByVal())
+        || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
+      // Arguments passed in registers
+      // 1. int, long long, ptr args that get allocated in register.
+      // 2. Large struct that gets an register to put its address in.
+      EVT RegVT = VA.getLocVT();
+      if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
+          RegVT == MVT::i32 || RegVT == MVT::f32) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+      // Single Vector
+      } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 ||
+                  RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+    } else if (UseHVX && UseHVXDbl &&
+               ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+                 RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+
+      // Double Vector
+      } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 ||
+                  RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else if (UseHVX && UseHVXDbl &&
+                ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 ||
+                  RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) {
+        assert(0 && "need to support VecPred regs");
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else {
+        assert (0);
+      }
+    } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
+      assert (0 && "ByValSize must be bigger than 8 bytes");
+    } else {
+      // Sanity check.
+      assert(VA.isMemLoc());
+
+      if (Flags.isByVal()) {
+        // If it's a byval parameter, then we need to compute the
+        // "real" size, not the size of the pointer.
+        ObjSize = Flags.getByValSize();
+      } else {
+        ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
+      }
+
+      StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+      // Create the frame index object for this incoming parameter...
+      FI = MFI.CreateFixedObject(ObjSize, StackLocation, true);
+
+      // Create the SelectionDAG nodes cordl, responding to a load
+      // from this parameter.
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+
+      if (Flags.isByVal()) {
+        // If it's a pass-by-value aggregate, then do not dereference the stack
+        // location. Instead, we should generate a reference to the stack
+        // location.
+        InVals.push_back(FIN);
+      } else {
+        InVals.push_back(
+            DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo()));
+      }
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  if (isVarArg) {
+    // This will point to the next argument passed via stack.
+    int FrameIndex = MFI.CreateFixedObject(Hexagon_PointerSize,
+                                           HEXAGON_LRFP_SIZE +
+                                           CCInfo.getNextStackOffset(),
+                                           true);
+    FuncInfo.setVarArgsFrameIndex(FrameIndex);
+  }
+
+  return Chain;
+}
+
+SDValue
+HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // VASTART stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  MachineFunction &MF = DAG.getMachineFunction();
+  HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+// Creates a SPLAT instruction for a constant value VAL.
+static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+                           SDValue Val) {
+  if (VT.getSimpleVT() == MVT::v4i8)
+    return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val);
+
+  if (VT.getSimpleVT() == MVT::v4i16)
+    return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val);
+
+  return SDValue();
+}
+
+static bool isSExtFree(SDValue N) {
+  // A sign-extend of a truncate of a sign-extend is free.
+  if (N.getOpcode() == ISD::TRUNCATE &&
+      N.getOperand(0).getOpcode() == ISD::AssertSext)
+    return true;
+  // We have sign-extended loads.
+  if (N.getOpcode() == ISD::LOAD)
+    return true;
+  return false;
+}
+
+SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue InpVal = Op.getOperand(0);
+  if (isa<ConstantSDNode>(InpVal)) {
+    uint64_t V = cast<ConstantSDNode>(InpVal)->getZExtValue();
+    return DAG.getTargetConstant(countPopulation(V), dl, MVT::i64);
+  }
+  SDValue PopOut = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, InpVal);
+  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, PopOut);
+}
+
+SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Cmp = Op.getOperand(2);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cmp)->get();
+
+  EVT VT = Op.getValueType();
+  EVT LHSVT = LHS.getValueType();
+  EVT RHSVT = RHS.getValueType();
+
+  if (LHSVT == MVT::v2i16) {
+    assert(ISD::isSignedIntSetCC(CC) || ISD::isUnsignedIntSetCC(CC));
+    unsigned ExtOpc = ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND
+                                                : ISD::ZERO_EXTEND;
+    SDValue LX = DAG.getNode(ExtOpc, dl, MVT::v2i32, LHS);
+    SDValue RX = DAG.getNode(ExtOpc, dl, MVT::v2i32, RHS);
+    SDValue SC = DAG.getNode(ISD::SETCC, dl, MVT::v2i1, LX, RX, Cmp);
+    return SC;
+  }
+
+  // Treat all other vector types as legal.
+  if (VT.isVector())
+    return Op;
+
+  // Equals and not equals should use sign-extend, not zero-extend, since
+  // we can represent small negative values in the compare instructions.
+  // The LLVM default is to use zero-extend arbitrarily in these cases.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+      (RHSVT == MVT::i8 || RHSVT == MVT::i16) &&
+      (LHSVT == MVT::i8 || LHSVT == MVT::i16)) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+    if (C && C->getAPIntValue().isNegative()) {
+      LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
+      RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
+      return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
+                         LHS, RHS, Op.getOperand(2));
+    }
+    if (isSExtFree(LHS) || isSExtFree(RHS)) {
+      LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, LHS);
+      RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, RHS);
+      return DAG.getNode(ISD::SETCC, dl, Op.getValueType(),
+                         LHS, RHS, Op.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+SDValue
+HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue PredOp = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2);
+  EVT OpVT = Op1.getValueType();
+  SDLoc DL(Op);
+
+  if (OpVT == MVT::v2i16) {
+    SDValue X1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op1);
+    SDValue X2 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i32, Op2);
+    SDValue SL = DAG.getNode(ISD::VSELECT, DL, MVT::v2i32, PredOp, X1, X2);
+    SDValue TR = DAG.getNode(ISD::TRUNCATE, DL, MVT::v2i16, SL);
+    return TR;
+  }
+
+  return SDValue();
+}
+
+// Handle only specific vector loads.
+SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = LoadNode->getChain();
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+  SDValue Result;
+  SDValue Base = LoadNode->getBasePtr();
+  ISD::LoadExtType Ext = LoadNode->getExtensionType();
+  unsigned Alignment = LoadNode->getAlignment();
+  SDValue LoadChain;
+
+  if(Ext == ISD::NON_EXTLOAD)
+    Ext = ISD::ZEXTLOAD;
+
+  if (VT == MVT::v4i16) {
+    if (Alignment == 2) {
+      SDValue Loads[4];
+      // Base load.
+      Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base,
+                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
+                                LoadNode->getMemOperand()->getFlags());
+      // Base+2 load.
+      SDValue Increment = DAG.getConstant(2, DL, MVT::i32);
+      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
+      Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
+                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
+                                LoadNode->getMemOperand()->getFlags());
+      // SHL 16, then OR base and base+2.
+      SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32);
+      SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
+      SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]);
+      // Base + 4.
+      Increment = DAG.getConstant(4, DL, MVT::i32);
+      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
+      Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
+                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
+                                LoadNode->getMemOperand()->getFlags());
+      // Base + 6.
+      Increment = DAG.getConstant(6, DL, MVT::i32);
+      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
+      Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
+                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
+                                LoadNode->getMemOperand()->getFlags());
+      // SHL 16, then OR base+4 and base+6.
+      Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
+      SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]);
+      // Combine to i64. This could be optimised out later if we can
+      // affect reg allocation of this code.
+      Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2);
+      LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                              Loads[0].getValue(1), Loads[1].getValue(1),
+                              Loads[2].getValue(1), Loads[3].getValue(1));
+    } else {
+      // Perform default type expansion.
+      Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
+                           LoadNode->getAlignment(),
+                           LoadNode->getMemOperand()->getFlags());
+      LoadChain = Result.getValue(1);
+    }
+  } else
+    llvm_unreachable("Custom lowering unsupported load");
+
+  Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
+  // Since we pretend to lower a load, we need the original chain
+  // info attached to the result.
+  SDValue Ops[] = { Result, LoadChain };
+
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue
+HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValTy = Op.getValueType();
+  ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op);
+  unsigned Align = CPN->getAlignment();
+  bool IsPositionIndependent = isPositionIndependent();
+  unsigned char TF = IsPositionIndependent ? HexagonII::MO_PCREL : 0;
+
+  unsigned Offset = 0;
+  SDValue T;
+  if (CPN->isMachineConstantPoolEntry())
+    T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, Offset,
+                                  TF);
+  else
+    T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset,
+                                  TF);
+
+  assert(cast<ConstantPoolSDNode>(T)->getTargetFlags() == TF &&
+         "Inconsistent target flag encountered");
+
+  if (IsPositionIndependent)
+    return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T);
+  return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T);
+}
+
+SDValue
+HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int Idx = cast<JumpTableSDNode>(Op)->getIndex();
+  if (isPositionIndependent()) {
+    SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL);
+    return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T);
+  }
+
+  SDValue T = DAG.getTargetJumpTable(Idx, VT);
+  return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, T);
+}
+
+SDValue
+HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue
+HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         HRI.getFrameRegister(), VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+SDValue
+HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const {
+  SDLoc dl(Op);
+  return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+SDValue
+HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  auto *GAN = cast<GlobalAddressSDNode>(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  auto *GV = GAN->getGlobal();
+  int64_t Offset = GAN->getOffset();
+
+  auto &HLOF = *HTM.getObjFileLowering();
+  Reloc::Model RM = HTM.getRelocationModel();
+
+  if (RM == Reloc::Static) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+    const GlobalObject *GO = GV->getBaseObject();
+    if (GO && HLOF.isGlobalInSmallSection(GO, HTM))
+      return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA);
+    return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
+  }
+
+  bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+  if (UsePCRel) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
+                                            HexagonII::MO_PCREL);
+    return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, GA);
+  }
+
+  // Use GOT index.
+  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, HexagonII::MO_GOT);
+  SDValue Off = DAG.getConstant(Offset, dl, MVT::i32);
+  return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, GA, Off);
+}
+
+// Specifies that for loads and stores VT can be promoted to PromotedLdStVT.
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDLoc dl(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  Reloc::Model RM = HTM.getRelocationModel();
+  if (RM == Reloc::Static) {
+    SDValue A = DAG.getTargetBlockAddress(BA, PtrVT);
+    return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A);
+  }
+
+  SDValue A = DAG.getTargetBlockAddress(BA, PtrVT, 0, HexagonII::MO_PCREL);
+  return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, A);
+}
+
+SDValue
+HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
+      const {
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue GOTSym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT,
+                                               HexagonII::MO_PCREL);
+  return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym);
+}
+
+SDValue
+HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+      GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+      unsigned char OperandFlags) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDLoc dl(GA);
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+  // Create Operands for the call.The Operands should have the following:
+  // 1. Chain SDValue
+  // 2. Callee which in this case is the Global address value.
+  // 3. Registers live into the call.In this case its R0, as we
+  //    have just one argument to be passed.
+  // 4. InFlag if there is any.
+  // Note: The order is important.
+
+  if (InFlag) {
+    SDValue Ops[] = { Chain, TGA,
+                      DAG.getRegister(Hexagon::R0, PtrVT), *InFlag };
+    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
+  } else {
+    SDValue Ops[]  = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT)};
+    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
+  }
+
+  // Inform MFI that function has calls.
+  MFI.setAdjustsStack(true);
+
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+//
+// Lower using the intial executable model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  int64_t Offset = GA->getOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Get the thread pointer.
+  SDValue TP = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+
+  bool IsPositionIndependent = isPositionIndependent();
+  unsigned char TF =
+      IsPositionIndependent ? HexagonII::MO_IEGOT : HexagonII::MO_IE;
+
+  // First generate the TLS symbol address
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT,
+                                           Offset, TF);
+
+  SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+  if (IsPositionIndependent) {
+    // Generate the GOT pointer in case of position independent code
+    SDValue GOT = LowerGLOBAL_OFFSET_TABLE(Sym, DAG);
+
+    // Add the TLS Symbol address to GOT pointer.This gives
+    // GOT relative relocation for the symbol.
+    Sym = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, Sym);
+  }
+
+  // Load the offset value for TLS symbol.This offset is relative to
+  // thread pointer.
+  SDValue LoadOffset =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Sym, MachinePointerInfo());
+
+  // Address of the thread local variable is the add of thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, TP, LoadOffset);
+}
+
+//
+// Lower using the local executable model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  int64_t Offset = GA->getOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Get the thread pointer.
+  SDValue TP = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Hexagon::UGP, PtrVT);
+  // Generate the TLS symbol address
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+                                           HexagonII::MO_TPREL);
+  SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+
+  // Address of the thread local variable is the add of thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, TP, Sym);
+}
+
+//
+// Lower using the general dynamic model for TLS addresses
+//
+SDValue
+HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+      SelectionDAG &DAG) const {
+  SDLoc dl(GA);
+  int64_t Offset = GA->getOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // First generate the TLS symbol address
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, PtrVT, Offset,
+                                           HexagonII::MO_GDGOT);
+
+  // Then, generate the GOT pointer
+  SDValue GOT = LowerGLOBAL_OFFSET_TABLE(TGA, DAG);
+
+  // Add the TLS symbol and the GOT pointer
+  SDValue Sym = DAG.getNode(HexagonISD::CONST32, dl, PtrVT, TGA);
+  SDValue Chain = DAG.getNode(ISD::ADD, dl, PtrVT, GOT, Sym);
+
+  // Copy over the argument to R0
+  SDValue InFlag;
+  Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag);
+  InFlag = Chain.getValue(1);
+
+  return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
+                           Hexagon::R0, HexagonII::MO_GDPLT);
+}
+
+//
+// Lower TLS addresses.
+//
+// For now for dynamic models, we only support the general dynamic model.
+//
+SDValue
+HexagonTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+      SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  switch (HTM.getTLSModel(GA->getGlobal())) {
+    case TLSModel::GeneralDynamic:
+    case TLSModel::LocalDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+    case TLSModel::InitialExec:
+      return LowerToTLSInitialExecModel(GA, DAG);
+    case TLSModel::LocalExec:
+      return LowerToTLSLocalExecModel(GA, DAG);
+  }
+  llvm_unreachable("Bogus TLS model");
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
+                                             const HexagonSubtarget &ST)
+    : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
+      Subtarget(ST) {
+  bool IsV4 = !Subtarget.hasV5TOps();
+  auto &HRI = *Subtarget.getRegisterInfo();
+  bool UseHVX = Subtarget.useHVXOps();
+  bool UseHVXSgl = Subtarget.useHVXSglOps();
+  bool UseHVXDbl = Subtarget.useHVXDblOps();
+
+  setPrefLoopAlignment(4);
+  setPrefFunctionAlignment(4);
+  setMinFunctionAlignment(2);
+  setStackPointerRegisterToSaveRestore(HRI.getStackRegister());
+
+  setMaxAtomicSizeInBitsSupported(64);
+  setMinCmpXchgSizeInBits(32);
+
+  if (EnableHexSDNodeSched)
+    setSchedulingPreference(Sched::VLIW);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  // Limits for inline expansion of memcpy/memmove
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyCL;
+  MaxStoresPerMemcpyOptSize = MaxStoresPerMemcpyOptSizeCL;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveCL;
+  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmoveOptSizeCL;
+  MaxStoresPerMemset = MaxStoresPerMemsetCL;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemsetOptSizeCL;
+
+  //
+  // Set up register classes.
+  //
+
+  addRegisterClass(MVT::i1,    &Hexagon::PredRegsRegClass);
+  addRegisterClass(MVT::v2i1,  &Hexagon::PredRegsRegClass);  // bbbbaaaa
+  addRegisterClass(MVT::v4i1,  &Hexagon::PredRegsRegClass);  // ddccbbaa
+  addRegisterClass(MVT::v8i1,  &Hexagon::PredRegsRegClass);  // hgfedcba
+  addRegisterClass(MVT::i32,   &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::v4i8,  &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::i64,   &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v8i8,  &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
+  addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
+
+  if (Subtarget.hasV5TOps()) {
+    addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+    addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+  }
+
+  if (Subtarget.hasV60TOps()) {
+    if (Subtarget.useHVXSglOps()) {
+      addRegisterClass(MVT::v64i8,  &Hexagon::VectorRegsRegClass);
+      addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass);
+      addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass);
+      addRegisterClass(MVT::v8i64,  &Hexagon::VectorRegsRegClass);
+      addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass);
+      addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass);
+      addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass);
+      addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass);
+      addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass);
+    } else if (Subtarget.useHVXDblOps()) {
+      addRegisterClass(MVT::v128i8,  &Hexagon::VectorRegs128BRegClass);
+      addRegisterClass(MVT::v64i16,  &Hexagon::VectorRegs128BRegClass);
+      addRegisterClass(MVT::v32i32,  &Hexagon::VectorRegs128BRegClass);
+      addRegisterClass(MVT::v16i64,  &Hexagon::VectorRegs128BRegClass);
+      addRegisterClass(MVT::v256i8,  &Hexagon::VecDblRegs128BRegClass);
+      addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass);
+      addRegisterClass(MVT::v64i32,  &Hexagon::VecDblRegs128BRegClass);
+      addRegisterClass(MVT::v32i64,  &Hexagon::VecDblRegs128BRegClass);
+      addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass);
+    }
+  }
+
+  //
+  // Handling of scalar operations.
+  //
+  // All operations default to "legal", except:
+  // - indexed loads and stores (pre-/post-incremented),
+  // - ANY_EXTEND_VECTOR_INREG, ATOMIC_CMP_SWAP_WITH_SUCCESS, CONCAT_VECTORS,
+  //   ConstantFP, DEBUGTRAP, FCEIL, FCOPYSIGN, FEXP, FEXP2, FFLOOR, FGETSIGN,
+  //   FLOG, FLOG2, FLOG10, FMAXNUM, FMINNUM, FNEARBYINT, FRINT, FROUND, TRAP,
+  //   FTRUNC, PREFETCH, SIGN_EXTEND_VECTOR_INREG, ZERO_EXTEND_VECTOR_INREG,
+  // which default to "expand" for at least one type.
+
+  // Misc operations.
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal); // Default: expand
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand
+
+  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+  // Custom legalize GlobalAddress nodes into CONST32.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i8,  Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i32, Custom);
+
+  // Hexagon needs to optimize cases with negative constants.
+  setOperationAction(ISD::SETCC, MVT::i8,  Custom);
+  setOperationAction(ISD::SETCC, MVT::i16, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+  setOperationAction(ISD::VAARG,   MVT::Other, Expand);
+
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+  if (EmitJumpTables)
+    setMinimumJumpTableEntries(MinimumJumpTables);
+  else
+    setMinimumJumpTableEntries(std::numeric_limits<int>::max());
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  // Hexagon has instructions for add/sub with carry. The problem with
+  // modeling these instructions is that they produce 2 results: Rdd and Px.
+  // To model the update of Px, we will have to use Defs[p0..p3] which will
+  // cause any predicate live range to spill. So, we pretend we dont't have
+  // these instructions.
+  setOperationAction(ISD::ADDE, MVT::i8,  Expand);
+  setOperationAction(ISD::ADDE, MVT::i16, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+  setOperationAction(ISD::SUBE, MVT::i8,  Expand);
+  setOperationAction(ISD::SUBE, MVT::i16, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i64, Expand);
+  setOperationAction(ISD::ADDC, MVT::i8,  Expand);
+  setOperationAction(ISD::ADDC, MVT::i16, Expand);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::SUBC, MVT::i8,  Expand);
+  setOperationAction(ISD::SUBC, MVT::i16, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+  // Only add and sub that detect overflow are the saturating ones.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::UADDO, VT, Expand);
+    setOperationAction(ISD::SADDO, VT, Expand);
+    setOperationAction(ISD::USUBO, VT, Expand);
+    setOperationAction(ISD::SSUBO, VT, Expand);
+  }
+
+  setOperationAction(ISD::CTLZ, MVT::i8,  Promote);
+  setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+  setOperationAction(ISD::CTTZ, MVT::i8,  Promote);
+  setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+
+  // In V5, popcount can count # of 1s in i64 but returns i32.
+  // On V4 it will be expanded (set later).
+  setOperationAction(ISD::CTPOP, MVT::i8,  Promote);
+  setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+  setOperationAction(ISD::CTPOP, MVT::i32, Promote);
+  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+
+  // We custom lower i64 to i64 mul, so that it is not considered as a legal
+  // operation. There is a pattern that will match i64 mul and transform it
+  // to a series of instructions.
+  setOperationAction(ISD::MUL,   MVT::i64, Expand);
+
+  for (unsigned IntExpOp :
+       { ISD::SDIV,      ISD::UDIV,      ISD::SREM,      ISD::UREM,
+         ISD::SDIVREM,   ISD::UDIVREM,   ISD::ROTL,      ISD::ROTR,
+         ISD::BSWAP,     ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+         ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
+    setOperationAction(IntExpOp, MVT::i32, Expand);
+    setOperationAction(IntExpOp, MVT::i64, Expand);
+  }
+
+  for (unsigned FPExpOp :
+       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
+        ISD::FPOW, ISD::FCOPYSIGN}) {
+    setOperationAction(FPExpOp, MVT::f32, Expand);
+    setOperationAction(FPExpOp, MVT::f64, Expand);
+  }
+
+  // No extending loads from i32.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i32, Expand);
+  }
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  // Turn FP extload into load/fpextend.
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+
+  // Expand BR_CC and SELECT_CC for all integer and fp types.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setOperationAction(ISD::BR_CC,     VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+  }
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setOperationAction(ISD::BR_CC,     VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+  }
+  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+
+  //
+  // Handling of vector operations.
+  //
+
+  // Custom lower v4i16 load only. Let v4i16 store to be
+  // promoted for now.
+  promoteLdStType(MVT::v4i8,  MVT::i32);
+  promoteLdStType(MVT::v2i16, MVT::i32);
+  promoteLdStType(MVT::v8i8,  MVT::i64);
+  promoteLdStType(MVT::v2i32, MVT::i64);
+
+  setOperationAction(ISD::LOAD,  MVT::v4i16, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+  AddPromotedToType(ISD::LOAD,  MVT::v4i16, MVT::i64);
+  AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
+
+  // Set the action for vector operations to "expand", then override it with
+  // either "custom" or "legal" for specific cases.
+  static const unsigned VectExpOps[] = {
+    // Integer arithmetic:
+    ISD::ADD,     ISD::SUB,     ISD::MUL,     ISD::SDIV,    ISD::UDIV,
+    ISD::SREM,    ISD::UREM,    ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC,
+    ISD::SUBC,    ISD::SADDO,   ISD::UADDO,   ISD::SSUBO,   ISD::USUBO,
+    ISD::SMUL_LOHI,             ISD::UMUL_LOHI,
+    // Logical/bit:
+    ISD::AND,     ISD::OR,      ISD::XOR,     ISD::ROTL,    ISD::ROTR,
+    ISD::CTPOP,   ISD::CTLZ,    ISD::CTTZ,
+    // Floating point arithmetic/math functions:
+    ISD::FADD,    ISD::FSUB,    ISD::FMUL,    ISD::FMA,     ISD::FDIV,
+    ISD::FREM,    ISD::FNEG,    ISD::FABS,    ISD::FSQRT,   ISD::FSIN,
+    ISD::FCOS,    ISD::FPOWI,   ISD::FPOW,    ISD::FLOG,    ISD::FLOG2,
+    ISD::FLOG10,  ISD::FEXP,    ISD::FEXP2,   ISD::FCEIL,   ISD::FTRUNC,
+    ISD::FRINT,   ISD::FNEARBYINT,            ISD::FROUND,  ISD::FFLOOR,
+    ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS,
+    // Misc:
+    ISD::BR_CC,   ISD::SELECT_CC,             ISD::ConstantPool,
+    // Vector:
+    ISD::BUILD_VECTOR,          ISD::SCALAR_TO_VECTOR,
+    ISD::EXTRACT_VECTOR_ELT,    ISD::INSERT_VECTOR_ELT,
+    ISD::EXTRACT_SUBVECTOR,     ISD::INSERT_SUBVECTOR,
+    ISD::CONCAT_VECTORS,        ISD::VECTOR_SHUFFLE
+  };
+
+  for (MVT VT : MVT::vector_valuetypes()) {
+    for (unsigned VectExpOp : VectExpOps)
+      setOperationAction(VectExpOp, VT, Expand);
+
+    // Expand all extending loads and truncating stores:
+    for (MVT TargetVT : MVT::vector_valuetypes()) {
+      if (TargetVT == VT)
+        continue;
+      setLoadExtAction(ISD::EXTLOAD, TargetVT, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, TargetVT, VT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, TargetVT, VT, Expand);
+      setTruncStoreAction(VT, TargetVT, Expand);
+    }
+
+    // Normalize all inputs to SELECT to be vectors of i32.
+    if (VT.getVectorElementType() != MVT::i32) {
+      MVT VT32 = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType(ISD::SELECT, VT, VT32);
+    }
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+  }
+
+  // Types natively supported:
+  for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1,
+                       MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32,
+                       MVT::v2i32, MVT::v1i64}) {
+    setOperationAction(ISD::BUILD_VECTOR,       NativeVT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  NativeVT, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR,  NativeVT, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   NativeVT, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     NativeVT, Custom);
+
+    setOperationAction(ISD::ADD, NativeVT, Legal);
+    setOperationAction(ISD::SUB, NativeVT, Legal);
+    setOperationAction(ISD::MUL, NativeVT, Legal);
+    setOperationAction(ISD::AND, NativeVT, Legal);
+    setOperationAction(ISD::OR,  NativeVT, Legal);
+    setOperationAction(ISD::XOR, NativeVT, Legal);
+  }
+
+  setOperationAction(ISD::SETCC,          MVT::v2i16, Custom);
+  setOperationAction(ISD::VSELECT,        MVT::v2i16, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8,  Custom);
+
+  if (UseHVX) {
+    if (UseHVXSgl) {
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8,  Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16,  Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32,  Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64,  Custom);
+      // We try to generate the vpack{e/o} instructions. If we fail
+      // we fall back upon ExpandOp.
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8,  Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i8, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
+    } else if (UseHVXDbl) {
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8,  Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32,  Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64,  Custom);
+      // We try to generate the vpack{e/o} instructions. If we fail
+      // we fall back upon ExpandOp.
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v128i8,  Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i16,  Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v128i8, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+    } else {
+      llvm_unreachable("Unrecognized HVX mode");
+    }
+  }
+  // Subtarget-specific operation actions.
+  //
+  if (Subtarget.hasV5TOps()) {
+    setOperationAction(ISD::FMA,  MVT::f64, Expand);
+    setOperationAction(ISD::FADD, MVT::f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+    setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i1,  Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i1,  Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+  } else { // V4
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+    setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::f32, Expand);
+    setOperationAction(ISD::FP_ROUND,   MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+
+    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
+    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
+    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+    // Expand these operations for both f32 and f64:
+    for (unsigned FPExpOpV4 :
+         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FABS, ISD::FNEG, ISD::FMA}) {
+      setOperationAction(FPExpOpV4, MVT::f32, Expand);
+      setOperationAction(FPExpOpV4, MVT::f64, Expand);
+    }
+
+    for (ISD::CondCode FPExpCCV4 :
+         {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE,
+          ISD::SETUO,  ISD::SETO}) {
+      setCondCodeAction(FPExpCCV4, MVT::f32, Expand);
+      setCondCodeAction(FPExpCCV4, MVT::f64, Expand);
+    }
+  }
+
+  // Handling of indexed loads/stores: default is "expand".
+  //
+  for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+    setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+    setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+  }
+
+  if (UseHVXSgl) {
+    for (MVT VT : {MVT::v64i8,  MVT::v32i16, MVT::v16i32, MVT::v8i64,
+                   MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) {
+      setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+      setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+    }
+  } else if (UseHVXDbl) {
+    for (MVT VT : {MVT::v128i8, MVT::v64i16,  MVT::v32i32, MVT::v16i64,
+                   MVT::v256i8, MVT::v128i16, MVT::v64i32, MVT::v32i64}) {
+      setIndexedLoadAction(ISD::POST_INC, VT, Legal);
+      setIndexedStoreAction(ISD::POST_INC, VT, Legal);
+    }
+  }
+
+  computeRegisterProperties(&HRI);
+
+  //
+  // Library calls for unsupported operations
+  //
+  bool FastMath  = EnableFastMath;
+
+  setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
+  setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
+  setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
+  setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
+  setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+  setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
+  setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
+  setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
+
+  setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
+  setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
+  setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
+  setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
+  setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
+  setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
+
+  if (IsV4) {
+    // Handle single-precision floating point operations on V4.
+    if (FastMath) {
+      setLibcallName(RTLIB::ADD_F32, "__hexagon_fast_addsf3");
+      setLibcallName(RTLIB::SUB_F32, "__hexagon_fast_subsf3");
+      setLibcallName(RTLIB::MUL_F32, "__hexagon_fast_mulsf3");
+      setLibcallName(RTLIB::OGT_F32, "__hexagon_fast_gtsf2");
+      setLibcallName(RTLIB::OLT_F32, "__hexagon_fast_ltsf2");
+      // Double-precision compares.
+      setLibcallName(RTLIB::OGT_F64, "__hexagon_fast_gtdf2");
+      setLibcallName(RTLIB::OLT_F64, "__hexagon_fast_ltdf2");
+    } else {
+      setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+      setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+      setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+      setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+      setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+      // Double-precision compares.
+      setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+      setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+    }
+  }
+
+  // This is the only fast library function for sqrtd.
+  if (FastMath)
+    setLibcallName(RTLIB::SQRT_F64, "__hexagon_fast2_sqrtdf2");
+
+  // Prefix is: nothing  for "slow-math",
+  //            "fast2_" for V4 fast-math and V5+ fast-math double-precision
+  // (actually, keep fast-math and fast-math2 separate for now)
+  if (FastMath) {
+    setLibcallName(RTLIB::ADD_F64, "__hexagon_fast_adddf3");
+    setLibcallName(RTLIB::SUB_F64, "__hexagon_fast_subdf3");
+    setLibcallName(RTLIB::MUL_F64, "__hexagon_fast_muldf3");
+    setLibcallName(RTLIB::DIV_F64, "__hexagon_fast_divdf3");
+    // Calling __hexagon_fast2_divsf3 with fast-math on V5 (ok).
+    setLibcallName(RTLIB::DIV_F32, "__hexagon_fast_divsf3");
+  } else {
+    setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+    setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+    setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+    setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
+    setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
+  }
+
+  if (Subtarget.hasV5TOps()) {
+    if (FastMath)
+      setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
+    else
+      setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
+  } else {
+    // V4
+    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
+    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
+    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
+    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
+    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
+    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+    setLibcallName(RTLIB::FPEXT_F32_F64,    "__hexagon_extendsfdf2");
+    setLibcallName(RTLIB::FPROUND_F64_F32,  "__hexagon_truncdfsf2");
+    setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+    setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+    setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+    setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+    setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+    setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+    setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+    setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+    setLibcallName(RTLIB::UO_F32,  "__hexagon_unordsf2");
+    setLibcallName(RTLIB::UO_F64,  "__hexagon_unorddf2");
+    setLibcallName(RTLIB::O_F32,   "__hexagon_unordsf2");
+    setLibcallName(RTLIB::O_F64,   "__hexagon_unorddf2");
+  }
+
+  // These cause problems when the shift amount is non-constant.
+  setLibcallName(RTLIB::SHL_I128, nullptr);
+  setLibcallName(RTLIB::SRL_I128, nullptr);
+  setLibcallName(RTLIB::SRA_I128, nullptr);
+}
+
+const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((HexagonISD::NodeType)Opcode) {
+  case HexagonISD::ALLOCA:        return "HexagonISD::ALLOCA";
+  case HexagonISD::AT_GOT:        return "HexagonISD::AT_GOT";
+  case HexagonISD::AT_PCREL:      return "HexagonISD::AT_PCREL";
+  case HexagonISD::BARRIER:       return "HexagonISD::BARRIER";
+  case HexagonISD::CALL:          return "HexagonISD::CALL";
+  case HexagonISD::CALLnr:        return "HexagonISD::CALLnr";
+  case HexagonISD::CALLR:         return "HexagonISD::CALLR";
+  case HexagonISD::COMBINE:       return "HexagonISD::COMBINE";
+  case HexagonISD::CONST32_GP:    return "HexagonISD::CONST32_GP";
+  case HexagonISD::CONST32:       return "HexagonISD::CONST32";
+  case HexagonISD::CP:            return "HexagonISD::CP";
+  case HexagonISD::DCFETCH:       return "HexagonISD::DCFETCH";
+  case HexagonISD::EH_RETURN:     return "HexagonISD::EH_RETURN";
+  case HexagonISD::EXTRACTU:      return "HexagonISD::EXTRACTU";
+  case HexagonISD::EXTRACTURP:    return "HexagonISD::EXTRACTURP";
+  case HexagonISD::INSERT:        return "HexagonISD::INSERT";
+  case HexagonISD::INSERTRP:      return "HexagonISD::INSERTRP";
+  case HexagonISD::JT:            return "HexagonISD::JT";
+  case HexagonISD::PACKHL:        return "HexagonISD::PACKHL";
+  case HexagonISD::POPCOUNT:      return "HexagonISD::POPCOUNT";
+  case HexagonISD::RET_FLAG:      return "HexagonISD::RET_FLAG";
+  case HexagonISD::SHUFFEB:       return "HexagonISD::SHUFFEB";
+  case HexagonISD::SHUFFEH:       return "HexagonISD::SHUFFEH";
+  case HexagonISD::SHUFFOB:       return "HexagonISD::SHUFFOB";
+  case HexagonISD::SHUFFOH:       return "HexagonISD::SHUFFOH";
+  case HexagonISD::TC_RETURN:     return "HexagonISD::TC_RETURN";
+  case HexagonISD::VCMPBEQ:       return "HexagonISD::VCMPBEQ";
+  case HexagonISD::VCMPBGT:       return "HexagonISD::VCMPBGT";
+  case HexagonISD::VCMPBGTU:      return "HexagonISD::VCMPBGTU";
+  case HexagonISD::VCMPHEQ:       return "HexagonISD::VCMPHEQ";
+  case HexagonISD::VCMPHGT:       return "HexagonISD::VCMPHGT";
+  case HexagonISD::VCMPHGTU:      return "HexagonISD::VCMPHGTU";
+  case HexagonISD::VCMPWEQ:       return "HexagonISD::VCMPWEQ";
+  case HexagonISD::VCMPWGT:       return "HexagonISD::VCMPWGT";
+  case HexagonISD::VCMPWGTU:      return "HexagonISD::VCMPWGTU";
+  case HexagonISD::VCOMBINE:      return "HexagonISD::VCOMBINE";
+  case HexagonISD::VPACK:         return "HexagonISD::VPACK";
+  case HexagonISD::VSHLH:         return "HexagonISD::VSHLH";
+  case HexagonISD::VSHLW:         return "HexagonISD::VSHLW";
+  case HexagonISD::VSPLATB:       return "HexagonISD::VSPLTB";
+  case HexagonISD::VSPLATH:       return "HexagonISD::VSPLATH";
+  case HexagonISD::VSRAH:         return "HexagonISD::VSRAH";
+  case HexagonISD::VSRAW:         return "HexagonISD::VSRAW";
+  case HexagonISD::VSRLH:         return "HexagonISD::VSRLH";
+  case HexagonISD::VSRLW:         return "HexagonISD::VSRLW";
+  case HexagonISD::VSXTBH:        return "HexagonISD::VSXTBH";
+  case HexagonISD::VSXTBW:        return "HexagonISD::VSXTBW";
+  case HexagonISD::OP_END:        break;
+  }
+  return nullptr;
+}
+
+bool HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  EVT MTy1 = EVT::getEVT(Ty1);
+  EVT MTy2 = EVT::getEVT(Ty2);
+  if (!MTy1.isSimple() || !MTy2.isSimple())
+    return false;
+  return (MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32);
+}
+
+bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isSimple() || !VT2.isSimple())
+    return false;
+  return (VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32);
+}
+
+bool HexagonTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  return isOperationLegalOrCustom(ISD::FMA, VT);
+}
+
+// Should we expand the build vector with shuffles?
+bool HexagonTargetLowering::shouldExpandBuildVectorWithShuffles(EVT VT,
+      unsigned DefinedValues) const {
+  // Hexagon vector shuffle operates on element sizes of bytes or halfwords
+  EVT EltVT = VT.getVectorElementType();
+  int EltBits = EltVT.getSizeInBits();
+  if ((EltBits != 8) && (EltBits != 16))
+    return false;
+
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
+static StridedLoadKind isStridedLoad(const ArrayRef<int> &Mask) {
+  int even_start = -2;
+  int odd_start = -1;
+  size_t mask_len = Mask.size();
+  for (auto idx : Mask) {
+    if ((idx - even_start) == 2)
+      even_start = idx;
+    else
+      break;
+  }
+  if (even_start == (int)(mask_len * 2) - 2)
+    return StridedLoadKind::Even;
+  for (auto idx : Mask) {
+    if ((idx - odd_start) == 2)
+      odd_start = idx;
+    else
+      break;
+  }
+  if (odd_start == (int)(mask_len * 2) - 1)
+    return StridedLoadKind::Odd;
+
+  return StridedLoadKind::NoPattern;
+}
+
+bool HexagonTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+      EVT VT) const {
+  if (Subtarget.useHVXOps())
+    return isStridedLoad(Mask) != StridedLoadKind::NoPattern;
+  return true;
+}
+
+// Lower a vector shuffle (V1, V2, V3).  V1 and V2 are the two vectors
+// to select data from, V3 is the permutation.
+SDValue
+HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
+      const {
+  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  bool UseHVX = Subtarget.useHVXOps();
+
+  if (V2.isUndef())
+    V2 = V1;
+
+  if (SVN->isSplat()) {
+    int Lane = SVN->getSplatIndex();
+    if (Lane == -1) Lane = 0;
+
+    // Test if V1 is a SCALAR_TO_VECTOR.
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return createSplat(DAG, dl, VT, V1.getOperand(0));
+
+    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+    // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+    // reaches it).
+    if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(0))) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) {
+        if (!V1.getOperand(i).isUndef()) {
+          IsScalarToVector = false;
+          break;
+        }
+      }
+      if (IsScalarToVector)
+        return createSplat(DAG, dl, VT, V1.getOperand(0));
+    }
+    return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32));
+  }
+
+  if (UseHVX) {
+    ArrayRef<int> Mask = SVN->getMask();
+    size_t MaskLen = Mask.size();
+    int ElemSizeInBits = VT.getScalarSizeInBits();
+    if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) ||
+        (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) {
+      // Return 1 for odd and 2 of even
+      StridedLoadKind Pattern = isStridedLoad(Mask);
+
+      if (Pattern == StridedLoadKind::NoPattern)
+        return SDValue();
+
+      SDValue Vec0 = Op.getOperand(0);
+      SDValue Vec1 = Op.getOperand(1);
+      SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32);
+      SDValue Ops[] = { Vec1, Vec0, StridePattern };
+      return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops);
+    }
+    // We used to assert in the "else" part here, but that is bad for Halide
+    // Halide creates intermediate double registers by interleaving two
+    // concatenated vector registers. The interleaving requires vector_shuffle
+    // nodes and we shouldn't barf on a double register result of a
+    // vector_shuffle because it is most likely an intermediate result.
+  }
+  // FIXME: We need to support more general vector shuffles.  See
+  // below the comment from the ARM backend that deals in the general
+  // case with the vector shuffles.  For now, let expand handle these.
+  return SDValue();
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+}
+
+// If BUILD_VECTOR has same base element repeated several times,
+// report true.
+static bool isCommonSplatElement(BuildVectorSDNode *BVN) {
+  unsigned NElts = BVN->getNumOperands();
+  SDValue V0 = BVN->getOperand(0);
+
+  for (unsigned i = 1, e = NElts; i != e; ++i) {
+    if (BVN->getOperand(i) != V0)
+      return false;
+  }
+  return true;
+}
+
+// Lower a vector shift. Try to convert
+// <VT> = SHL/SRA/SRL <VT> by <VT> to Hexagon specific
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>.
+SDValue
+HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN = nullptr;
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDValue V3;
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if ((BVN = dyn_cast<BuildVectorSDNode>(V1.getNode())) &&
+      isCommonSplatElement(BVN))
+    V3 = V2;
+  else if ((BVN = dyn_cast<BuildVectorSDNode>(V2.getNode())) &&
+           isCommonSplatElement(BVN))
+    V3 = V1;
+  else
+    return SDValue();
+
+  SDValue CommonSplat = BVN->getOperand(0);
+  SDValue Result;
+
+  if (VT.getSimpleVT() == MVT::v4i16) {
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, V3, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, V3, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, V3, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else if (VT.getSimpleVT() == MVT::v2i32) {
+    switch (Op.getOpcode()) {
+    case ISD::SRA:
+      Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, V3, CommonSplat);
+      break;
+    case ISD::SHL:
+      Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, V3, CommonSplat);
+      break;
+    case ISD::SRL:
+      Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, V3, CommonSplat);
+      break;
+    default:
+      return SDValue();
+    }
+  } else {
+    return SDValue();
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+}
+
+SDValue
+HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  unsigned Size = VT.getSizeInBits();
+
+  // Only handle vectors of 64 bits or shorter.
+  if (Size > 64)
+    return SDValue();
+
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  unsigned NElts = BVN->getNumOperands();
+
+  // Try to generate a SPLAT instruction.
+  if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) &&
+      (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, 0, true) && SplatBitSize <= 16)) {
+    unsigned SplatBits = APSplatBits.getZExtValue();
+    int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >>
+                       (32 - SplatBitSize));
+    return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32));
+  }
+
+  // Try to generate COMBINE to build v2i32 vectors.
+  if (VT.getSimpleVT() == MVT::v2i32) {
+    SDValue V0 = BVN->getOperand(0);
+    SDValue V1 = BVN->getOperand(1);
+
+    if (V0.isUndef())
+      V0 = DAG.getConstant(0, dl, MVT::i32);
+    if (V1.isUndef())
+      V1 = DAG.getConstant(0, dl, MVT::i32);
+
+    ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(V0);
+    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(V1);
+    // If the element isn't a constant, it is in a register:
+    // generate a COMBINE Register Register instruction.
+    if (!C0 || !C1)
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+
+    // If one of the operands is an 8 bit integer constant, generate
+    // a COMBINE Immediate Immediate instruction.
+    if (isInt<8>(C0->getSExtValue()) ||
+        isInt<8>(C1->getSExtValue()))
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, V1, V0);
+  }
+
+  // Try to generate a S2_packhl to build v2i16 vectors.
+  if (VT.getSimpleVT() == MVT::v2i16) {
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      if (BVN->getOperand(i).isUndef())
+        continue;
+      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(BVN->getOperand(i));
+      // If the element isn't a constant, it is in a register:
+      // generate a S2_packhl instruction.
+      if (!Cst) {
+        SDValue pack = DAG.getNode(HexagonISD::PACKHL, dl, MVT::v4i16,
+                                   BVN->getOperand(1), BVN->getOperand(0));
+
+        return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::v2i16,
+                                          pack);
+      }
+    }
+  }
+
+  // In the general case, generate a CONST32 or a CONST64 for constant vectors,
+  // and insert_vector_elt for all the other cases.
+  uint64_t Res = 0;
+  unsigned EltSize = Size / NElts;
+  SDValue ConstVal;
+  uint64_t Mask = ~uint64_t(0ULL) >> (64 - EltSize);
+  bool HasNonConstantElements = false;
+
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon's
+    // combine, const64, etc. are Big Endian.
+    unsigned OpIdx = NElts - i - 1;
+    SDValue Operand = BVN->getOperand(OpIdx);
+    if (Operand.isUndef())
+      continue;
+
+    int64_t Val = 0;
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Operand))
+      Val = Cst->getSExtValue();
+    else
+      HasNonConstantElements = true;
+
+    Val &= Mask;
+    Res = (Res << EltSize) | Val;
+  }
+
+  if (Size > 64)
+    return SDValue();
+
+  if (Size == 64)
+    ConstVal = DAG.getConstant(Res, dl, MVT::i64);
+  else
+    ConstVal = DAG.getConstant(Res, dl, MVT::i32);
+
+  // When there are non constant operands, add them with INSERT_VECTOR_ELT to
+  // ConstVal, the constant part of the vector.
+  if (HasNonConstantElements) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue Width = DAG.getConstant(EltVT.getSizeInBits(), dl, MVT::i64);
+    SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                  DAG.getConstant(32, dl, MVT::i64));
+
+    for (unsigned i = 0, e = NElts; i != e; ++i) {
+      // LLVM's BUILD_VECTOR operands are in Little Endian mode, whereas Hexagon
+      // is Big Endian.
+      unsigned OpIdx = NElts - i - 1;
+      SDValue Operand = BVN->getOperand(OpIdx);
+      if (isa<ConstantSDNode>(Operand))
+        // This operand is already in ConstVal.
+        continue;
+
+      if (VT.getSizeInBits() == 64 &&
+          Operand.getValueSizeInBits() == 32) {
+        SDValue C = DAG.getConstant(0, dl, MVT::i32);
+        Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand);
+      }
+
+      SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64);
+      SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width);
+      SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+      const SDValue Ops[] = {ConstVal, Operand, Combined};
+
+      if (VT.getSizeInBits() == 32)
+        ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops);
+      else
+        ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops);
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal);
+}
+
+SDValue
+HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  bool UseHVX = Subtarget.useHVXOps();
+  EVT VT = Op.getValueType();
+  unsigned NElts = Op.getNumOperands();
+  SDValue Vec0 = Op.getOperand(0);
+  EVT VecVT = Vec0.getValueType();
+  unsigned Width = VecVT.getSizeInBits();
+
+  if (NElts == 2) {
+    MVT ST = VecVT.getSimpleVT();
+    // We are trying to concat two v2i16 to a single v4i16, or two v4i8
+    // into a single v8i8.
+    if (ST == MVT::v2i16 || ST == MVT::v4i8)
+      return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0);
+
+    if (UseHVX) {
+      assert((Width ==  64*8 && Subtarget.useHVXSglOps()) ||
+             (Width == 128*8 && Subtarget.useHVXDblOps()));
+      SDValue Vec1 = Op.getOperand(1);
+      MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32;
+      MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32;
+      SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0);
+      SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1);
+      SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0);
+      return DAG.getNode(ISD::BITCAST, dl, VT, VC);
+    }
+  }
+
+  if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64)
+    return SDValue();
+
+  SDValue C0 = DAG.getConstant(0, dl, MVT::i64);
+  SDValue C32 = DAG.getConstant(32, dl, MVT::i64);
+  SDValue W = DAG.getConstant(Width, dl, MVT::i64);
+  // Create the "width" part of the argument to insert_rp/insertp_rp.
+  SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32);
+  SDValue V = C0;
+
+  for (unsigned i = 0, e = NElts; i != e; ++i) {
+    unsigned N = NElts-i-1;
+    SDValue OpN = Op.getOperand(N);
+
+    if (VT.getSizeInBits() == 64 && OpN.getValueSizeInBits() == 32) {
+      SDValue C = DAG.getConstant(0, dl, MVT::i32);
+      OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN);
+    }
+    SDValue Idx = DAG.getConstant(N, dl, MVT::i64);
+    SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W);
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset);
+    if (VT.getSizeInBits() == 32)
+      V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or});
+    else if (VT.getSizeInBits() == 64)
+      V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or});
+    else
+      return SDValue();
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+SDValue
+HexagonTargetLowering::LowerEXTRACT_SUBVECTOR_HVX(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  EVT VT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+  bool UseHVX = Subtarget.useHVXOps();
+  bool UseHVXSgl = Subtarget.useHVXSglOps();
+  // Just in case...
+
+  if (!VT.isVector() || !UseHVX)
+    return SDValue();
+
+  EVT ResVT = Op.getValueType();
+  unsigned ResSize = ResVT.getSizeInBits();
+  unsigned VectorSizeInBits = UseHVXSgl ? (64 * 8) : (128 * 8);
+  unsigned OpSize = VT.getSizeInBits();
+
+  // We deal only with cases where the result is the vector size
+  // and the vector operand is a double register.
+  if (!(ResVT.isByteSized() && ResSize == VectorSizeInBits) ||
+      !(VT.isByteSized() && OpSize == 2 * VectorSizeInBits))
+    return SDValue();
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Cst)
+    return SDValue();
+  unsigned Val = Cst->getZExtValue();
+
+  // These two will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+  if (Val == 0) {
+    SDValue Vec = Op.getOperand(0);
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResVT, Vec);
+  }
+
+  if (ResVT.getVectorNumElements() == Val) {
+    SDValue Vec = Op.getOperand(0);
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResVT, Vec);
+  }
+
+  return SDValue();
+}
+
+SDValue
+HexagonTargetLowering::LowerEXTRACT_VECTOR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // If we are dealing with EXTRACT_SUBVECTOR on a HVX type, we may
+  // be able to simplify it to an EXTRACT_SUBREG.
+  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Subtarget.useHVXOps() &&
+      isHvxVectorType(Op.getValueType().getSimpleVT()))
+    return LowerEXTRACT_SUBVECTOR_HVX(Op, DAG);
+
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Idx = Op.getOperand(1);
+  SDValue Vec = Op.getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, dl, MVT::i64);
+
+  // Constant element number.
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Idx)) {
+    uint64_t X = CI->getZExtValue();
+    SDValue Offset = DAG.getConstant(X * EltSize, dl, MVT::i32);
+    const SDValue Ops[] = {Vec, Width, Offset};
+
+    ConstantSDNode *CW = dyn_cast<ConstantSDNode>(Width);
+    assert(CW && "Non constant width in LowerEXTRACT_VECTOR");
+
+    SDValue N;
+    MVT SVT = VecVT.getSimpleVT();
+    uint64_t W = CW->getZExtValue();
+
+    if (W == 32) {
+      // Translate this node into EXTRACT_SUBREG.
+      unsigned Subreg = (X == 0) ? Hexagon::isub_lo : 0;
+
+      if (X == 0)
+        Subreg = Hexagon::isub_lo;
+      else if (SVT == MVT::v2i32 && X == 1)
+        Subreg = Hexagon::isub_hi;
+      else if (SVT == MVT::v4i16 && X == 2)
+        Subreg = Hexagon::isub_hi;
+      else if (SVT == MVT::v8i8 && X == 4)
+        Subreg = Hexagon::isub_hi;
+      else
+        llvm_unreachable("Bad offset");
+      N = DAG.getTargetExtractSubreg(Subreg, dl, MVT::i32, Vec);
+
+    } else if (SVT.getSizeInBits() == 32) {
+      N = DAG.getNode(HexagonISD::EXTRACTU, dl, MVT::i32, Ops);
+    } else if (SVT.getSizeInBits() == 64) {
+      N = DAG.getNode(HexagonISD::EXTRACTU, dl, MVT::i64, Ops);
+      if (VT.getSizeInBits() == 32)
+        N = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, N);
+    } else
+      return SDValue();
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, dl, MVT::i32));
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, dl, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+
+  const SDValue Ops[] = {Vec, Combined};
+
+  SDValue N;
+  if (VecVT.getSizeInBits() == 32) {
+    N = DAG.getNode(HexagonISD::EXTRACTURP, dl, MVT::i32, Ops);
+  } else {
+    N = DAG.getNode(HexagonISD::EXTRACTURP, dl, MVT::i64, Ops);
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, N);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
+SDValue
+HexagonTargetLowering::LowerINSERT_VECTOR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int VTN = VT.isVector() ? VT.getVectorNumElements() : 1;
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Val = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  int EltSize = EltVT.getSizeInBits();
+  SDValue Width = DAG.getConstant(Op.getOpcode() == ISD::INSERT_VECTOR_ELT ?
+                                  EltSize : VTN * EltSize, dl, MVT::i64);
+
+  if (ConstantSDNode *C = cast<ConstantSDNode>(Idx)) {
+    SDValue Offset = DAG.getConstant(C->getSExtValue() * EltSize, dl, MVT::i32);
+    const SDValue Ops[] = {Vec, Val, Width, Offset};
+
+    SDValue N;
+    if (VT.getSizeInBits() == 32)
+      N = DAG.getNode(HexagonISD::INSERT, dl, MVT::i32, Ops);
+    else if (VT.getSizeInBits() == 64)
+      N = DAG.getNode(HexagonISD::INSERT, dl, MVT::i64, Ops);
+    else
+      return SDValue();
+
+    return DAG.getNode(ISD::BITCAST, dl, VT, N);
+  }
+
+  // Variable element number.
+  SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i32, Idx,
+                               DAG.getConstant(EltSize, dl, MVT::i32));
+  SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width,
+                                DAG.getConstant(32, dl, MVT::i64));
+  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset);
+
+  if (VT.getSizeInBits() == 64 && Val.getValueSizeInBits() == 32) {
+    SDValue C = DAG.getConstant(0, dl, MVT::i32);
+    Val = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Val);
+  }
+
+  const SDValue Ops[] = {Vec, Val, Combined};
+
+  SDValue N;
+  if (VT.getSizeInBits() == 32)
+    N = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops);
+  else if (VT.getSizeInBits() == 64)
+    N = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops);
+  else
+    return SDValue();
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, N);
+}
+
+bool
+HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  // Assuming the caller does not have either a signext or zeroext modifier, and
+  // only one value is accepted, any reasonable truncation is allowed.
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  // FIXME: in principle up to 64-bit could be made safe, but it would be very
+  // fragile at the moment: any support for multiple value returns would be
+  // liable to disallow tail calls involving i64 -> iN truncation in many cases.
+  return Ty1->getPrimitiveSizeInBits() <= 32;
+}
+
+SDValue
+HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Mark function as containing a call to EH_RETURN.
+  HexagonMachineFunctionInfo *FuncInfo =
+    DAG.getMachineFunction().getInfo<HexagonMachineFunctionInfo>();
+  FuncInfo->setHasEHReturn();
+
+  unsigned OffsetReg = Hexagon::R28;
+
+  SDValue StoreAddr =
+      DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getRegister(Hexagon::R30, PtrVT),
+                  DAG.getIntPtrConstant(4, dl));
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
+  Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset);
+
+  // Not needed we already use it as explict input to EH_RETURN.
+  // MF.getRegInfo().addLiveOut(OffsetReg);
+
+  return DAG.getNode(HexagonISD::EH_RETURN, dl, MVT::Other, Chain);
+}
+
+SDValue
+HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+    default:
+#ifndef NDEBUG
+      Op.getNode()->dumpr(&DAG);
+      if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END)
+        errs() << "Check for a non-legal type in this operation\n";
+#endif
+      llvm_unreachable("Should not custom lower this!");
+    case ISD::CONCAT_VECTORS:       return LowerCONCAT_VECTORS(Op, DAG);
+    case ISD::INSERT_SUBVECTOR:     return LowerINSERT_VECTOR(Op, DAG);
+    case ISD::INSERT_VECTOR_ELT:    return LowerINSERT_VECTOR(Op, DAG);
+    case ISD::EXTRACT_SUBVECTOR:    return LowerEXTRACT_VECTOR(Op, DAG);
+    case ISD::EXTRACT_VECTOR_ELT:   return LowerEXTRACT_VECTOR(Op, DAG);
+    case ISD::BUILD_VECTOR:         return LowerBUILD_VECTOR(Op, DAG);
+    case ISD::VECTOR_SHUFFLE:       return LowerVECTOR_SHUFFLE(Op, DAG);
+    case ISD::SRA:
+    case ISD::SHL:
+    case ISD::SRL:                  return LowerVECTOR_SHIFT(Op, DAG);
+    case ISD::ConstantPool:         return LowerConstantPool(Op, DAG);
+    case ISD::JumpTable:            return LowerJumpTable(Op, DAG);
+    case ISD::EH_RETURN:            return LowerEH_RETURN(Op, DAG);
+      // Frame & Return address. Currently unimplemented.
+    case ISD::RETURNADDR:           return LowerRETURNADDR(Op, DAG);
+    case ISD::FRAMEADDR:            return LowerFRAMEADDR(Op, DAG);
+    case ISD::GlobalTLSAddress:     return LowerGlobalTLSAddress(Op, DAG);
+    case ISD::ATOMIC_FENCE:         return LowerATOMIC_FENCE(Op, DAG);
+    case ISD::GlobalAddress:        return LowerGLOBALADDRESS(Op, DAG);
+    case ISD::BlockAddress:         return LowerBlockAddress(Op, DAG);
+    case ISD::GLOBAL_OFFSET_TABLE:  return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+    case ISD::VASTART:              return LowerVASTART(Op, DAG);
+    // Custom lower some vector loads.
+    case ISD::LOAD:                 return LowerLOAD(Op, DAG);
+    case ISD::DYNAMIC_STACKALLOC:   return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    case ISD::SETCC:                return LowerSETCC(Op, DAG);
+    case ISD::VSELECT:              return LowerVSELECT(Op, DAG);
+    case ISD::CTPOP:                return LowerCTPOP(Op, DAG);
+    case ISD::INTRINSIC_WO_CHAIN:   return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+    case ISD::INTRINSIC_VOID:       return LowerINTRINSIC_VOID(Op, DAG);
+    case ISD::INLINEASM:            return LowerINLINEASM(Op, DAG);
+    case ISD::PREFETCH:             return LowerPREFETCH(Op, DAG);
+  }
+}
+
+/// Returns relocation base for the given PIC jumptable.
+SDValue
+HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                SelectionDAG &DAG) const {
+  int Idx = cast<JumpTableSDNode>(Table)->getIndex();
+  EVT VT = Table.getValueType();
+  SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL);
+  return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T);
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+HexagonTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      case 'q':
+      case 'v':
+        if (Subtarget.useHVXOps())
+          return C_Register;
+        break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+HexagonTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps();
+
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':   // R0-R31
+      switch (VT.SimpleTy) {
+      default:
+        llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+      case MVT::f32:
+        return std::make_pair(0U, &Hexagon::IntRegsRegClass);
+      case MVT::i64:
+      case MVT::f64:
+        return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
+      }
+    case 'q': // q0-q3
+      switch (VT.SimpleTy) {
+      default:
+        llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+      case MVT::v1024i1:
+      case MVT::v512i1:
+      case MVT::v32i16:
+      case MVT::v16i32:
+      case MVT::v64i8:
+      case MVT::v8i64:
+        return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+      }
+    case 'v': // V0-V31
+      switch (VT.SimpleTy) {
+      default:
+        llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+      case MVT::v16i32:
+      case MVT::v32i16:
+      case MVT::v64i8:
+      case MVT::v8i64:
+        return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
+      case MVT::v32i32:
+      case MVT::v64i16:
+      case MVT::v16i64:
+      case MVT::v128i8:
+        if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
+          return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
+        return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
+      case MVT::v256i8:
+      case MVT::v128i16:
+      case MVT::v64i32:
+      case MVT::v32i64:
+        return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
+      }
+
+    default:
+      llvm_unreachable("Unknown asm register class");
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  return Subtarget.hasV5TOps();
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented by
+/// AM is legal for this target, for a load/store of the specified type.
+bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
+  if (Ty->isSized()) {
+    // When LSR detects uses of the same base address to access different
+    // types (e.g. unions), it will assume a conservative type for these
+    // uses:
+    //   LSR Use: Kind=Address of void in addrspace(4294967295), ...
+    // The type Ty passed here would then be "void". Skip the alignment
+    // checks, but do not return false right away, since that confuses
+    // LSR into crashing.
+    unsigned A = DL.getABITypeAlignment(Ty);
+    // The base offset must be a multiple of the alignment.
+    if ((AM.BaseOffs % A) != 0)
+      return false;
+    // The shifted offset must fit in 11 bits.
+    if (!isInt<11>(AM.BaseOffs >> Log2_32(A)))
+      return false;
+  }
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  int Scale = AM.Scale;
+  if (Scale < 0)
+    Scale = -Scale;
+  switch (Scale) {
+  case 0:  // No scale reg, "r+i", "r", or just "i".
+    break;
+  default: // No scaled addressing mode.
+    return false;
+  }
+  return true;
+}
+
+/// Return true if folding a constant offset with the given GlobalAddress is
+/// legal.  It is frequently not legal in PIC relocation models.
+bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA)
+      const {
+  return HTM.getRelocationModel() == Reloc::Static;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return Imm >= -512 && Imm <= 511;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
+                                 SDValue Callee,
+                                 CallingConv::ID CalleeCC,
+                                 bool isVarArg,
+                                 bool isCalleeStructRet,
+                                 bool isCallerStructRet,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // ***************************************************************************
+  //  Look for obvious safe cases to perform tail call optimization that do not
+  //  require ABI changes.
+  // ***************************************************************************
+
+  // If this is a tail call via a function pointer, then don't do it!
+  if (!isa<GlobalAddressSDNode>(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee)) {
+    return false;
+  }
+
+  // Do not optimize if the calling conventions do not match and the conventions
+  // used are not C or Fast.
+  if (!CCMatch) {
+    bool R = (CallerCC == CallingConv::C || CallerCC == CallingConv::Fast);
+    bool E = (CalleeCC == CallingConv::C || CalleeCC == CallingConv::Fast);
+    // If R & E, then ok.
+    if (!R || !E)
+      return false;
+  }
+
+  // Do not tail call optimize vararg calls.
+  if (isVarArg)
+    return false;
+
+  // Also avoid tail call optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // In addition to the cases above, we also disable Tail Call Optimization if
+  // the calling convention code that at least one outgoing argument needs to
+  // go on the stack. We cannot check that here because at this point that
+  // information is not available.
+  return true;
+}
+
+/// Returns the target specific optimal type for load and store operations as
+/// a result of memset, memcpy, and memmove lowering.
+///
+/// If DstAlign is zero that means it's safe to destination alignment can
+/// satisfy any constraint. Similarly if SrcAlign is zero it means there isn't
+/// a need to check it against alignment requirement, probably because the
+/// source does not need to be loaded. If 'IsMemset' is true, that means it's
+/// expanding a memset. If 'ZeroMemset' is true, that means it's a memset of
+/// zero. 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
+/// does not need to be loaded.  It returns EVT::Other if the type should be
+/// determined using generic target-independent logic.
+EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size,
+      unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset,
+      bool MemcpyStrSrc, MachineFunction &MF) const {
+
+  auto Aligned = [](unsigned GivenA, unsigned MinA) -> bool {
+    return (GivenA % MinA) == 0;
+  };
+
+  if (Size >= 8 && Aligned(DstAlign, 8) && (IsMemset || Aligned(SrcAlign, 8)))
+    return MVT::i64;
+  if (Size >= 4 && Aligned(DstAlign, 4) && (IsMemset || Aligned(SrcAlign, 4)))
+    return MVT::i32;
+  if (Size >= 2 && Aligned(DstAlign, 2) && (IsMemset || Aligned(SrcAlign, 2)))
+    return MVT::i16;
+
+  return MVT::Other;
+}
+
+bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+      unsigned AS, unsigned Align, bool *Fast) const {
+  if (Fast)
+    *Fast = false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::v64i8:
+  case MVT::v128i8:
+  case MVT::v256i8:
+  case MVT::v32i16:
+  case MVT::v64i16:
+  case MVT::v128i16:
+  case MVT::v16i32:
+  case MVT::v32i32:
+  case MVT::v64i32:
+  case MVT::v8i64:
+  case MVT::v16i64:
+  case MVT::v32i64:
+    return true;
+  }
+  return false;
+}
+
+std::pair<const TargetRegisterClass*, uint8_t>
+HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+      MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  case MVT::v64i8:
+  case MVT::v32i16:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    RRC = &Hexagon::VectorRegsRegClass;
+    break;
+  case MVT::v128i8:
+  case MVT::v64i16:
+  case MVT::v32i32:
+  case MVT::v16i64:
+    if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() &&
+        Subtarget.useHVXDblOps())
+      RRC = &Hexagon::VectorRegs128BRegClass;
+    else
+      RRC = &Hexagon::VecDblRegsRegClass;
+    break;
+  case MVT::v256i8:
+  case MVT::v128i16:
+  case MVT::v64i32:
+  case MVT::v32i64:
+    RRC = &Hexagon::VecDblRegs128BRegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+      AtomicOrdering Ord) const {
+  BasicBlock *BB = Builder.GetInsertBlock();
+  Module *M = BB->getParent()->getParent();
+  Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
+  unsigned SZ = Ty->getPrimitiveSizeInBits();
+  assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported");
+  Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked
+                                   : Intrinsic::hexagon_L4_loadd_locked;
+  Value *Fn = Intrinsic::getDeclaration(M, IntID);
+  return Builder.CreateCall(Fn, Addr, "larx");
+}
+
+/// Perform a store-conditional operation to Addr. Return the status of the
+/// store. This should be 0 if the store succeeded, non-zero otherwise.
+Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+      Value *Val, Value *Addr, AtomicOrdering Ord) const {
+  BasicBlock *BB = Builder.GetInsertBlock();
+  Module *M = BB->getParent()->getParent();
+  Type *Ty = Val->getType();
+  unsigned SZ = Ty->getPrimitiveSizeInBits();
+  assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported");
+  Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked
+                                   : Intrinsic::hexagon_S4_stored_locked;
+  Value *Fn = Intrinsic::getDeclaration(M, IntID);
+  Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx");
+  Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), "");
+  Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext()));
+  return Ext;
+}
+
+TargetLowering::AtomicExpansionKind
+HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  // Do not expand loads and stores that don't exceed 64 bits.
+  return LI->getType()->getPrimitiveSizeInBits() > 64
+             ? AtomicExpansionKind::LLOnly
+             : AtomicExpansionKind::None;
+}
+
+bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  // Do not expand loads and stores that don't exceed 64 bits.
+  return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64;
+}
+
+bool HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR(
+      AtomicCmpXchgInst *AI) const {
+  const DataLayout &DL = AI->getModule()->getDataLayout();
+  unsigned Size = DL.getTypeStoreSize(AI->getCompareOperand()->getType());
+  return Size >= 4 && Size <= 8;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
new file mode 100644
index 000000000000..a8ed29e585d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -0,0 +1,294 @@
+//===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Hexagon uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
+
+#include "Hexagon.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Target/TargetLowering.h"
+#include <cstdint>
+#include <utility>
+
+namespace llvm {
+
+namespace HexagonISD {
+
+    enum NodeType : unsigned {
+      OP_BEGIN = ISD::BUILTIN_OP_END,
+
+      CONST32 = OP_BEGIN,
+      CONST32_GP,  // For marking data present in GP.
+      ALLOCA,
+
+      AT_GOT,      // Index in GOT.
+      AT_PCREL,    // Offset relative to PC.
+
+      CALL,        // Function call.
+      CALLnr,      // Function call that does not return.
+      CALLR,
+
+      RET_FLAG,    // Return with a flag operand.
+      BARRIER,     // Memory barrier.
+      JT,          // Jump table.
+      CP,          // Constant pool.
+
+      POPCOUNT,
+      COMBINE,
+      PACKHL,
+      VSPLATB,
+      VSPLATH,
+      SHUFFEB,
+      SHUFFEH,
+      SHUFFOB,
+      SHUFFOH,
+      VSXTBH,
+      VSXTBW,
+      VSRAW,
+      VSRAH,
+      VSRLW,
+      VSRLH,
+      VSHLW,
+      VSHLH,
+      VCMPBEQ,
+      VCMPBGT,
+      VCMPBGTU,
+      VCMPHEQ,
+      VCMPHGT,
+      VCMPHGTU,
+      VCMPWEQ,
+      VCMPWGT,
+      VCMPWGTU,
+
+      INSERT,
+      INSERTRP,
+      EXTRACTU,
+      EXTRACTURP,
+      VCOMBINE,
+      VPACK,
+      TC_RETURN,
+      EH_RETURN,
+      DCFETCH,
+
+      OP_END
+    };
+
+} // end namespace HexagonISD
+
+  class HexagonSubtarget;
+
+  class HexagonTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+    const HexagonTargetMachine &HTM;
+    const HexagonSubtarget &Subtarget;
+
+    bool CanReturnSmallStruct(const Function* CalleeFn, unsigned& RetSize)
+        const;
+    void promoteLdStType(MVT VT, MVT PromotedLdStVT);
+
+  public:
+    explicit HexagonTargetLowering(const TargetMachine &TM,
+                                   const HexagonSubtarget &ST);
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+        CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet,
+        bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs,
+        const SmallVectorImpl<SDValue> &OutVals,
+        const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const;
+
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// Return true if an FMA operation is faster than a pair of mul and add
+    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+    /// method returns true (and FMAs are legal), otherwise fmuladd is
+    /// expanded to mul + add.
+    bool isFMAFasterThanFMulAndFAdd(EVT) const override;
+
+    // Should we expand the build vector with shuffles?
+    bool shouldExpandBuildVectorWithShuffles(EVT VT,
+        unsigned DefinedValues) const override;
+
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT)
+        const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+    const char *getTargetNodeName(unsigned Opcode) const override;
+    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_SUBVECTOR_HVX(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+        SelectionDAG &DAG) const;
+    SDValue LowerToTLSInitialExecModel(GlobalAddressSDNode *GA,
+        SelectionDAG &DAG) const;
+    SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
+        SelectionDAG &DAG) const;
+    SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
+        GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+        unsigned ReturnReg, unsigned char OperandFlags) const;
+    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+        SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SmallVectorImpl<SDValue> &OutVals,
+                            SDValue Callee) const;
+
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+      return Hexagon::R0;
+    }
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+      return Hexagon::R1;
+    }
+
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+
+    EVT getSetCCResultType(const DataLayout &, LLVMContext &C,
+                           EVT VT) const override {
+      if (!VT.isVector())
+        return MVT::i1;
+      else
+        return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+    }
+
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base, SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "o")
+        return InlineAsm::Constraint_o;
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    // Intrinsics
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    /// The type may be VoidTy, in which case only return true if the addressing
+    /// mode is legal for a load/store of any legal type.
+    /// TODO: Handle pre/postinc as well.
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+    /// Return true if folding a constant offset with the given GlobalAddress
+    /// is legal.  It is frequently not legal in PIC relocation models.
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+        unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+        MachineFunction &MF) const override;
+
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+        unsigned Align, bool *Fast) const override;
+
+    /// Returns relocation base for the given PIC jumptable.
+    SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
+                                     const override;
+
+    // Handling of atomic RMW instructions.
+    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+        AtomicOrdering Ord) const override;
+    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+        Value *Addr, AtomicOrdering Ord) const override;
+    AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+    bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+    AtomicExpansionKind
+    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override {
+      return AtomicExpansionKind::LLSC;
+    }
+
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
+        const override;
+  };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
new file mode 100644
index 000000000000..7283d94ee759
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
@@ -0,0 +1,652 @@
+//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//                     Hexagon Instruction Mappings
+//===----------------------------------------------------------------------===//
+
+
+def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
+                (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
+                (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
+                (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memb({GP}+#$addr) = $Nt",
+                (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt",
+                (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
+                (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memw({GP}+#$addr) = $Nt",
+                (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
+def : InstAlias<"memd({GP}+#$addr) = $Nt",
+                (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
+
+def : InstAlias<"$Nt = memb({GP}+#$addr)",
+                (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memub({GP}+#$addr)",
+                (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
+def : InstAlias<"$Nt = memh({GP}+#$addr)",
+                (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memuh({GP}+#$addr)",
+                (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
+def : InstAlias<"$Nt = memw({GP}+#$addr)",
+                (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
+def : InstAlias<"$Nt = memd({GP}+#$addr)",
+                (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
+
+// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
+def : InstAlias<"memb($Rs) = $Rt",
+      (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt",
+      (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.h",
+      (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt",
+      (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = $Rt.new",
+      (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memh($Rs) = $Rt.new",
+      (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memw($Rs) = $Rt.new",
+      (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"memb($Rs) = #$S8",
+      (S4_storeirb_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memh($Rs) = #$S8",
+      (S4_storeirh_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memw($Rs) = #$S8",
+      (S4_storeiri_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
+
+def : InstAlias<"memd($Rs) = $Rtt",
+      (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"memb($Rs) = setbit(#$U5)",
+      (L4_ior_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = setbit(#$U5)",
+      (L4_ior_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = setbit(#$U5)",
+      (L4_ior_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memb($Rs) = clrbit(#$U5)",
+      (L4_iand_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memh($Rs) = clrbit(#$U5)",
+      (L4_iand_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+def : InstAlias<"memw($Rs) = clrbit(#$U5)",
+      (L4_iand_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
+
+// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
+def : InstAlias<"$Rd = memb($Rs)",
+      (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memub($Rs)",
+      (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memh($Rs)",
+      (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memuh($Rs)",
+      (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memw($Rs)",
+      (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memd($Rs)",
+      (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = memubh($Rs)",
+      (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memubh($Rs)",
+      (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rd = membh($Rs)",
+      (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = membh($Rs)",
+      (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memb_fifo($Rs)",
+      (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"$Rdd = memh_fifo($Rs)",
+      (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
+//       to: if ($Pt) $Rd = memXX($Rs)
+def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
+      (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
+      (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
+      (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
+      (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
+      (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
+      (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
+//       to: if ($Pt) memXX($Rs) = $Rt
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
+      (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
+      (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
+      (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
+      (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
+      (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
+      (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
+      (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
+      (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
+      (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
+      (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
+      (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+
+// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
+//       to: if (!$Pt) $Rd = memXX($Rs)
+def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
+      (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
+      (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
+      (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
+      (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
+      (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
+      (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
+//       to: if (!$Pt) memXX($Rs) = $Rt
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
+      (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
+      (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
+      (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
+      (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
+      (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
+      (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
+      (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
+      (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
+      (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
+      (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
+      (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
+      (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
+      (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
+      (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
+      (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
+      (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
+      (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
+      (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
+      (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
+      (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
+      (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
+      (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
+      (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
+
+// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -=
+//       to: memXX($Rs) |= $Rt
+def : InstAlias<"memb($Rs) &= $Rt",
+      (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) |= $Rt",
+      (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += $Rt",
+      (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= $Rt",
+      (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) += #$U5",
+      (L4_iadd_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memb($Rs) -= #$U5",
+      (L4_isub_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) &= $Rt",
+      (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) |= $Rt",
+      (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += $Rt",
+      (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= $Rt",
+      (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) += #$U5",
+      (L4_iadd_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memh($Rs) -= #$U5",
+      (L4_isub_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) &= $Rt",
+      (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) |= $Rt",
+      (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += $Rt",
+      (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= $Rt",
+      (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) += #$U5",
+      (L4_iadd_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+def : InstAlias<"memw($Rs) -= #$U5",
+      (L4_isub_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
+      Requires<[UseMEMOP]>;
+
+//
+// Alias of: if ($Pv.new) memX($Rs) = $Rt
+//       to: if (p3.new) memX(r17 + #0) = $Rt
+def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
+      (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
+      (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
+      (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
+      (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
+      (S4_pstorerdtnew_io
+       PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
+      (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
+      (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
+      (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
+      (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
+
+def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
+      (S4_pstorerdfnew_io
+       PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
+
+//
+// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ...
+//       to: if ($Pt.new) $Rd = memub($Rs + #$u6_0)
+def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
+      (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
+      (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
+      (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
+      (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
+      (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
+      (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
+      (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
+      (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
+      (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
+      (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
+      (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
+      (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
+
+def : InstAlias<"dcfetch($Rs)",
+      (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
+
+// Alias of some insn mappings, others must be handled by the parser
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+      (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+      (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
+// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
+def : InstAlias<"$Rd = neg($Rs)",
+      (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
+
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
+
+def : InstAlias<"$Pd = $Ps",
+      (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
+
+def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
+      (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
+
+def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
+      (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
+
+def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
+      (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
+
+// Assembler mapped insns: cmp.lt(a,b) -> cmp.gt(b,a)
+def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
+      (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
+      (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
+
+// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
+def : InstAlias<"if (!$Pu) jumpr $Rs",
+      (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
+      Requires<[HasV60T]>;
+
+// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
+def : InstAlias<"if ($Pu) jumpr $Rs",
+      (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
+      Requires<[HasV60T]>;
+
+// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
+def : InstAlias<"if (!$Pu) jump $r15_2",
+      (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
+      Requires<[HasV60T]>;
+
+// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
+def : InstAlias<"if ($Pu) jump $r15_2",
+     (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
+     Requires<[HasV60T]>;
+
+def : InstAlias<"if ($src) jump $r15_2",
+      (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if (!$src) jump $r15_2",
+      (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
+
+def : InstAlias<"if ($src1) jumpr $src2",
+      (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
+
+def : InstAlias<"if (!$src1) jumpr $src2",
+      (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
+
+// maps Vdd = Vss to Vdd = V6_vassignp(Vss)
+def : InstAlias<"$Vdd = $Vss",
+      (V6_vassignp VecDblRegs:$Vdd, VecDblRegs:$Vss)>,
+      Requires<[HasV60T]>;
+
+// maps Vd = #0 to Vd = vxor(Vd, Vd)
+def : InstAlias<"$Vd = #0",
+      (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
+      Requires<[HasV60T]>;
+
+// maps Vdd  = #0 to Vdd = vsub(Vdd, Vdd)
+def : InstAlias<"$Vdd = #0",
+      (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
+      (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
+      (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
+      (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
+      (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
+      (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
+      (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
+      (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
+      (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
+      (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
+      (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
+      (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
+def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
+      (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
+      Requires<[HasV60T]>;
+
+// maps   "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
+def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
+      (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
+      Requires<[HasV60T]>;
+
+// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
+def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
+      (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs)",
+      (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmem($Rs):nt",
+      (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt",
+      (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt",
+      (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs)=$Vt.new",
+      (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"vmem($Rs):nt=$Vt.new",
+      (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
+      (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
+      (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
+      (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
+      (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
+      (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
+      (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
+      (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
+      (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"$Vt=vmemu($Rs)",
+      (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"vmemu($Rs)=$Vt",
+      (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
+      (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
+      (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
+      Requires<[HasV60T]>;
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
new file mode 100644
index 000000000000..280832fd167f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
@@ -0,0 +1,1019 @@
+class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { opc{14-4}, src2};
+  let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst};
+}
+
+class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
+class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
+class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
+class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
+class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
+class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
+class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
+class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
+class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
+class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
+class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
+class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
+class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
+class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
+class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
+class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
+class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
+class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
+class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
+class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
+class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
+class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
+class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
+class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
+class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
+class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
+class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
+class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
+class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
+class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
+class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
+class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
+class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
+class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
+class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
+class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
+class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
+class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
+class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
+class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
+class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
+class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
+class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
+class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
+class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
+class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
+class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
+class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
+class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
+class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
+class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
+class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
+class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
+class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
+class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
+class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
+class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
+class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
+class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
+class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
+class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
+class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
+class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
+class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
+class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
+class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
+class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
+class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
+class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
+class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
+class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
+class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
+class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
+class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
+class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
+class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
+class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
+class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
+class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
+class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
+class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
+class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
+class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
+class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
+class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
+class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
+class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
+class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
+class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
+class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
+class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
+class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
+class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
+class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
+class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
+class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
+class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
+class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
+class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
+class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
+class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
+class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
+class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
+class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
+class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
+class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
+class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
+class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
+class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
+class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
+class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
+class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
+class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
+class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
+class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
+class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
+class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
+class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
+class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
+class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
+class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
+class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
+class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
+class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
+class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
+class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
+class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
+class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
+class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
+class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
+class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
+class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
+class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
+class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
+class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
+class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
+class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
+class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
+class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
+class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
+class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
+class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
+class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
+class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
+class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
+class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
+class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
+class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
+class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
+class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
+class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
+class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
+class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
+class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
+class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
+class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
+class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
+class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
+class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
+class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
+class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
+class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
+class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
+class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
+class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
+class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
+class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
+class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
+class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
+class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
+class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
+class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
+class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
+class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
+class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
+class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
+class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
+class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
+class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
+class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
+class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
+class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
+class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
+class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
+class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
+
+class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
+  bits<2> dst;
+  bits<5> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
+  let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
+}
+
+class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
+class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
+class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
+class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
+class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
+class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
+class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
+class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
+class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
+class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
+class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
+class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
+class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
+class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
+class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
+class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
+class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
+class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
+class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
+class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
+class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
+class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
+class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
+class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
+class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
+class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
+class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
+class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
+class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
+class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
+class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
+class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
+class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
+class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
+class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
+class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
+class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
+class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
+
+class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> dst;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
+  let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
+class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
+class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
+class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
+class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
+class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
+class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
+class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
+class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
+class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
+class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
+class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
+
+class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+
+  let Inst{31-16} = { 0b00011110000000, opc{5-4} };
+  let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
+class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
+class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
+class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
+class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
+class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
+class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
+class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
+class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
+class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
+class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
+class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
+class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
+class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
+class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
+class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
+class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
+class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
+class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
+class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
+class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
+class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
+class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
+class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
+class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
+
+class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<10> src2;
+  bits<4> src2_vector;
+
+  let src2_vector = src2{9-6};
+  let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
+class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
+class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
+class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
+class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
+class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
+class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<11> src2;
+  bits<4> src2_vector;
+
+  let src2_vector = src2{10-7};
+  let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
+class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
+class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
+class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
+class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
+class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
+class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
+
+class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<4> src2_vector;
+  bits<5> src3;
+
+  let src2_vector = src2{9-6};
+  let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<11> src2;
+  bits<4> src2_vector;
+  bits<5> src3;
+
+  let src2_vector = src2{10-7};
+  let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
+class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
+class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
+
+class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
+class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
+class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<4> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{9-6};
+  let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
+class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<11> src2;
+  bits<4> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{10-7};
+  let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>;
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>;
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<10> src3;
+  bits<4> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{9-6};
+  let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<11> src3;
+  bits<4> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{10-7};
+  let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>;
+class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
+class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
+class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
+class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
+class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
+class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>;
+class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
+class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
+class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
+
+class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>;
+class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
+class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
+class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
+class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
+class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
+class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>;
+class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
+class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
+class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<10> src3;
+  bits<4> src3_vector;
+  bits<3> src4;
+
+  let src3_vector = src3{9-6};
+  let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
+  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>;
+class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
+class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
+class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<11> src3;
+  bits<4> src3_vector;
+  bits<3> src4;
+
+  let src3_vector = src3{10-7};
+  let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
+  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>;
+class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
+class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
+
+// TODO: Change script to generate dst, src1, src2 instead of
+// dst, dst2, src1.
+class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<9> src2;
+  bits<3> src2_vector;
+
+  let src2_vector = src2{8-6};
+  let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>;
+class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
+class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
+class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
+class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
+class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
+class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
+
+class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<10> src2;
+  bits<3> src2_vector;
+
+  let src2_vector = src2{9-7};
+  let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>;
+class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
+class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
+class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
+class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
+class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
+class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
+
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<9> src2;
+  bits<3> src2_vector;
+  bits<5> src3;
+
+  let src2_vector = src2{8-6};
+  let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
+  let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>;
+class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
+class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
+
+class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<3> src2_vector;
+  bits<5> src3;
+
+  let src2_vector = src2{9-7};
+  let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
+  let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>;
+class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
+class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
+
+// TODO: Change script to generate src1, src2 and src3 instead of
+// dst, src1, src2.
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<9> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{8-6};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
+class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<10> src2;
+  bits<3> src2_vector;
+  bits<3> src3;
+
+  let src2_vector = src2{9-7};
+  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
+}
+
+class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
+class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<9> src3;
+  bits<3> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{8-6};
+  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
+class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
+class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
+class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
+class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
+class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
+class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
+class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
+class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
+class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
+
+// TODO: Change script to generate src1, src2,src3 and src4 instead of
+// dst, src1, src2, src3.
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<10> src3;
+  bits<3> src3_vector;
+  bits<5> src4;
+
+  let src3_vector = src3{9-7};
+  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>;
+class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
+class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
+class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
+class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
+class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
+class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>;
+class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
+class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
+class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<9> src3;
+  bits<3> src3_vector;
+  bits<3> src4;
+
+  let src3_vector = src3{8-6};
+  let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>;
+class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
+class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
+class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<10> src3;
+  bits<3> src3_vector;
+  bits<3> src4;
+
+  let src3_vector = src3{9-7};
+  let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
+  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>;
+class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
+class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
+class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
+
+class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<1> src2;
+
+  let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} };
+  let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} };
+}
+
+class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>;
+class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
+class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
+class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
+class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>;
+class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
+class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
+
+class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<1> src2;
+  bits<5> src3;
+
+  let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} };
+  let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} };
+}
+
+class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>;
+class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
+class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
+
+class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<1> src2;
+  bits<3> src3;
+
+  let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} };
+  let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} };
+}
+
+class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>;
+class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>;
+
+class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<1> src3;
+  bits<5> src4;
+
+  let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} };
+  let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} };
+}
+
+class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>;
+class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
+class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
+class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
+class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
+class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
+class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>;
+class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
+class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
+class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
+
+class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> src2;
+  bits<1> src3;
+  bits<3> src4;
+
+  let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} };
+  let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} };
+}
+
+class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>;
+class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
+class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
+class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
+
+
+class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<1> src3;
+
+  let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} };
+  let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} };
+}
+
+class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>;
+class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
+class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
+class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
+class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
+class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
+
+class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<2> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} };
+  let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} };
+}
+
+class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>;
+class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>;
+
+class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon {
+  bits<5> src1;
+  bits<5> src2;
+  bits<5> src3;
+
+  let Inst{31-16} = { 0b00011001111, src3{4-0} };
+  let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} };
+}
+
+class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>;
+class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>;
+
+
+class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> dst;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 };
+  let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} };
+}
+
+class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>;
+class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>;
+
+class Enc_X_p3op<bits<8> opc> : OpcodeHexagon {
+  bits<2> src1;
+  bits<5> dst;
+  bits<5> src2;
+  bits<5> src3;
+
+  let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} };
+  let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} };
+}
+
+class V6_vnccombine_enc : Enc_X_p3op<0b00001000>;
+class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
+
+class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<3> src3;
+
+  let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} };
+  let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>;
+class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
+class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
+class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
+class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
+class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
+class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
+class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
+class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
+class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
+class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>;
+class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
+class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
+class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
+class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
+
+class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<3> src3;
+
+  let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} };
+  let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} };
+}
+
+class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>;
+class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
+class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
+class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
+class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
+class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
+class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>;
+class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
+
+class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon {
+  bits<2> dst;
+  bits<2> src1;
+  bits<2> src2;
+
+  let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 };
+  let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} };
+}
+
+class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>;
+class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
+class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
+class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
+class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
+
+class V6_pred_not_enc : OpcodeHexagon {
+  bits<2> dst;
+  bits<2> src1;
+
+  let Inst{31-16} = { 0b0001111000000011 };
+  let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} };
+}
+
+class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<2> src1;
+  bits<5> src2;
+  bits<5> src3;
+
+  let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} };
+  let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} };
+}
+
+class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>;
+class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>;
+
+class Enc_X_2op<bits<16> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+
+  let Inst{31-16} = { opc{15-5}, src1{4-0} };
+  let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} };
+}
+
+class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>;
+class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
+class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
+
+
+class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon {
+  bits<2> dst;
+  bits<5> src1;
+
+  let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} };
+  let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} };
+}
+
+class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>;
+class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
+
+class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<6> src2;
+
+  let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} };
+  let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} };
+}
+
+class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>;
+class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
+class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
+class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
+class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
+class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
+
+class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { opc{14-4}, src1{4-0} };
+  let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} };
+}
+
+class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>;
+class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
+class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
+class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
+class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
+class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
+class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
+class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
+
+class Enc_no_operands<bits<25> opc> : OpcodeHexagon {
+
+  let Inst{31-16} = { opc{24-10}, 0 };
+  let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 };
+}
+
+class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>;
+class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
+class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
+class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
+
+class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon {
+  bits<5> src1;
+
+  let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} };
+  let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 };
+}
+
+class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>;
+class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
+
+class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon {
+  bits<5> src1;
+
+  let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 };
+  let Inst{13-0} = { 0, src1{4-0}, 0b00000000 };
+}
+
+class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>;
+class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>;
+
+class A5_ACS_enc : OpcodeHexagon {
+  bits<5> dst1;
+  bits<2> dst2;
+  bits<5> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b11101010101, src1{4-0} };
+  let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} };
+}
+
+class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon {
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<2> src3;
+
+  let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} };
+  let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} };
+}
+
+class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>;
+class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
+class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
+
+class V6_vhistq_enc : OpcodeHexagon {
+  bits<2> src1;
+
+  let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 };
+  let Inst{13-0} = { 0b10000010000000 };
+}
+
+// TODO: Change script to generate dst1 instead of dst.
+class A6_vminub_RdP_enc : OpcodeHexagon {
+  bits<5> dst1;
+  bits<2> dst2;
+  bits<5> src1;
+  bits<5> src2;
+
+  let Inst{31-16} = { 0b11101010111, src2{4-0} };
+  let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} };
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
new file mode 100644
index 000000000000..fa3cccbd0879
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -0,0 +1,445 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                         Hexagon Instruction Flags +
+//
+//                    *** Must match HexagonBaseInfo.h ***
+//===----------------------------------------------------------------------===//
+
+class IType<bits<5> t> {
+  bits<5> Value = t;
+}
+def TypePSEUDO : IType<0>;
+def TypeALU32  : IType<1>;
+def TypeCR     : IType<2>;
+def TypeJR     : IType<3>;
+def TypeJ      : IType<4>;
+def TypeLD     : IType<5>;
+def TypeST     : IType<6>;
+def TypeSYSTEM : IType<7>;
+def TypeXTYPE  : IType<8>;
+def TypeENDLOOP: IType<31>;
+
+// Maintain list of valid subtargets for each instruction.
+class SubTarget<bits<6> value> {
+  bits<6> Value = value;
+}
+
+def HasAnySubT    : SubTarget<0x3f>;  // 111111
+def HasV5SubT     : SubTarget<0x3e>;  // 111110
+def HasV55SubT    : SubTarget<0x3c>;  // 111100
+def HasV60SubT    : SubTarget<0x38>;  // 111000
+
+// Addressing modes for load/store instructions
+class AddrModeType<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoAddrMode     : AddrModeType<0>;  // No addressing mode
+def Absolute       : AddrModeType<1>;  // Absolute addressing mode
+def AbsoluteSet    : AddrModeType<2>;  // Absolute set addressing mode
+def BaseImmOffset  : AddrModeType<3>;  // Indirect with offset
+def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
+def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
+def PostInc        : AddrModeType<6>;  // Post increment addressing mode
+
+class MemAccessSize<bits<4> value> {
+  bits<4> Value = value;
+}
+
+def NoMemAccess      : MemAccessSize<0>;// Not a memory access instruction.
+def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
+def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
+def WordAccess       : MemAccessSize<3>;// Word access instruction (memw).
+def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
+def Vector64Access   : MemAccessSize<7>;// Vector access instruction (memv)
+def Vector128Access  : MemAccessSize<8>;// Vector access instruction (memv)
+
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Class Declaration +
+//===----------------------------------------------------------------------===//
+
+class OpcodeHexagon {
+  field bits<32> Inst = ?; // Default to an invalid insn.
+  bits<4> IClass = 0; // ICLASS
+
+  let Inst{31-28} = IClass;
+
+  bits<1> zero = 0;
+}
+
+class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
+                  string cstr, InstrItinClass itin, IType type>
+  : Instruction {
+  let Namespace = "Hexagon";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+  let Constraints = cstr;
+  let Itinerary = itin;
+  let Size = 4;
+
+  // SoftFail is a field the disassembler can use to provide a way for
+  // instructions to not match without killing the whole decode process. It is
+  // mainly used for ARM, but Tablegen expects this field to exist or it fails
+  // to build the decode table.
+  field bits<32> SoftFail = 0;
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+  // Instruction type according to the ISA.
+  IType Type = type;
+  let TSFlags{4-0} = Type.Value;
+
+  // Solo instructions, i.e., those that cannot be in a packet with others.
+  bits<1> isSolo = 0;
+  let TSFlags{5} = isSolo;
+  // Packed only with A or X-type instructions.
+  bits<1> isSoloAX = 0;
+  let TSFlags{6} = isSoloAX;
+  // Only A-type instruction in first slot or nothing.
+  bits<1> isSoloAin1 = 0;
+  let TSFlags{7} = isSoloAin1;
+
+  // Predicated instructions.
+  bits<1> isPredicated = 0;
+  let TSFlags{8} = isPredicated;
+  bits<1> isPredicatedFalse = 0;
+  let TSFlags{9} = isPredicatedFalse;
+  bits<1> isPredicatedNew = 0;
+  let TSFlags{10} = isPredicatedNew;
+  bits<1> isPredicateLate = 0;
+  let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+
+  // New-value insn helper fields.
+  bits<1> isNewValue = 0;
+  let TSFlags{12} = isNewValue; // New-value consumer insn.
+  bits<1> hasNewValue = 0;
+  let TSFlags{13} = hasNewValue; // New-value producer insn.
+  bits<3> opNewValue = 0;
+  let TSFlags{16-14} = opNewValue; // New-value produced operand.
+  bits<1> isNVStorable = 0;
+  let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+  bits<1> isNVStore = 0;
+  let TSFlags{18} = isNVStore; // New-value store insn.
+  bits<1> isCVLoadable = 0;
+  let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+  bits<1> isCVLoad = 0;
+  let TSFlags{20} = isCVLoad; // Cur-value load insn.
+
+  // Immediate extender helper fields.
+  bits<1> isExtendable = 0;
+  let TSFlags{21} = isExtendable; // Insn may be extended.
+  bits<1> isExtended = 0;
+  let TSFlags{22} = isExtended; // Insn must be extended.
+  bits<3> opExtendable = 0;
+  let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+  bits<1> isExtentSigned = 0;
+  let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+  bits<5> opExtentBits = 0;
+  let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+  bits<2> opExtentAlign = 0;
+  let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
+
+  // If an instruction is valid on a subtarget, set the corresponding
+  // bit from validSubTargets.
+  // By default, instruction is valid on all subtargets.
+  SubTarget validSubTargets = HasAnySubT;
+  let TSFlags{39-34} = validSubTargets.Value;
+
+  // Addressing mode for load/store instructions.
+  AddrModeType addrMode = NoAddrMode;
+  let TSFlags{42-40} = addrMode.Value;
+
+  // Memory access size for mem access instructions (load/store)
+  MemAccessSize accessSize = NoMemAccess;
+  let TSFlags{46-43} = accessSize.Value;
+
+  bits<1> isTaken = 0;
+  let TSFlags {47} = isTaken; // Branch prediction.
+
+  bits<1> isFP = 0;
+  let TSFlags {48} = isFP; // Floating-point.
+
+  bits<1> hasNewValue2 = 0;
+  let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+  bits<3> opNewValue2 = 0;
+  let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+
+  bits<1> isAccumulator = 0;
+  let TSFlags{54} = isAccumulator;
+
+  bit cofMax1 = 0;
+  let TSFlags{60} = cofMax1;
+
+  // Fields used for relation models.
+  bit isNonTemporal = 0;
+  string isNT = ""; // set to "true" for non-temporal vector stores.
+  string BaseOpcode = "";
+  string CextOpcode = "";
+  string PredSense = "";
+  string PNewValue = "";
+  string NValueST  = "";    // Set to "true" for new-value stores.
+  string InputType = "";    // Input is "imm" or "reg" type.
+  string isFloat = "false"; // Set to "true" for the floating-point load/store.
+  string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions
+
+  let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
+                                    "");
+  let PNewValue = !if(isPredicatedNew, "new", "");
+  let NValueST = !if(isNVStore, "true", "false");
+  let isNT = !if(isNonTemporal, "true", "false");
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+}
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Classes Definitions +
+//===----------------------------------------------------------------------===//
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+let mayLoad = 1 in
+class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
+
+let mayLoad = 1 in
+class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                  string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
+
+let mayLoad = 1 in
+class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+let mayStore = 1 in
+class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+
+class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayStore = 1 in
+class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
+
+// Same as ST0Inst but doesn't derive from OpcodeHexagon.
+let mayStore = 1 in
+class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : STInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// SYSTEM Instruction Class in V4 can take SLOT0 only
+// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
+class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "",  InstrItinClass itin = ST_tc_3stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>,
+    OpcodeHexagon;
+
+// ALU32 Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon;
+
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+     OpcodeHexagon;
+
+class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+    OpcodeHexagon;
+
+// Same as above but doesn't derive from OpcodeHexagon
+class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = M_tc_2_SLOT23>
+    : MInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+    OpcodeHexagon;
+
+class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
+  : SInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// J Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
+
+class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
+
+// JR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon;
+
+// CR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>, OpcodeHexagon;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>,
+    OpcodeHexagon;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>,
+    OpcodeHexagon;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr="">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>,
+    OpcodeHexagon;
+
+//===----------------------------------------------------------------------===//
+//                         Instruction Classes Definitions -
+//===----------------------------------------------------------------------===//
+
+
+//
+// ALU32 patterns
+//.
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+//
+// ALU64 patterns.
+//
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction.
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+// Post increment LD Instruction.
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
new file mode 100644
index 000000000000..493d04703da9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -0,0 +1,160 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                         Hexagon Instruction Flags
+//
+//                        *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeV4LDST    : IType<9>;
+def TypeNV       : IType<10>;
+def TypeDUPLEX   : IType<11>;
+def TypeCOMPOUND : IType<12>;
+def TypePREFIX   : IType<30>;
+
+//                      Duplex Instruction Class Declaration
+//===----------------------------------------------------------------------===//
+
+class OpcodeDuplex {
+  field bits<32> Inst = ?; // Default to an invalid insn.
+  bits<4> IClass = 0; // ICLASS
+  bits<13> ISubHi = 0; // Low sub-insn
+  bits<13> ISubLo = 0; // High sub-insn
+
+  let Inst{31-29} = IClass{3-1};
+  let Inst{13}    = IClass{0};
+  let Inst{15-14} = 0;
+  let Inst{28-16} = ISubHi;
+  let Inst{12-0}  = ISubLo;
+}
+
+class InstDuplex<bits<4> iClass, list<dag> pattern = [],
+                 string cstr = "">
+  : Instruction, OpcodeDuplex {
+  let Namespace = "Hexagon";
+  IType Type = TypeDUPLEX;  // uses slot 0,1
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 0;
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins);
+  let IClass = iClass;
+  let Constraints = cstr;
+  let Itinerary = DUPLEX;
+  let Size = 4;
+
+  // SoftFail is a field the disassembler can use to provide a way for
+  // instructions to not match without killing the whole decode process. It is
+  // mainly used for ARM, but Tablegen expects this field to exist or it fails
+  // to build the decode table.
+  field bits<32> SoftFail = 0;
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+  let TSFlags{4-0} = Type.Value;
+
+  // Predicated instructions.
+  bits<1> isPredicated = 0;
+  let TSFlags{6} = isPredicated;
+  bits<1> isPredicatedFalse = 0;
+  let TSFlags{7} = isPredicatedFalse;
+  bits<1> isPredicatedNew = 0;
+  let TSFlags{8} = isPredicatedNew;
+
+  // New-value insn helper fields.
+  bits<1> isNewValue = 0;
+  let TSFlags{9} = isNewValue; // New-value consumer insn.
+  bits<1> hasNewValue = 0;
+  let TSFlags{10} = hasNewValue; // New-value producer insn.
+  bits<3> opNewValue = 0;
+  let TSFlags{13-11} = opNewValue; // New-value produced operand.
+  bits<1> isNVStorable = 0;
+  let TSFlags{14} = isNVStorable; // Store that can become new-value store.
+  bits<1> isNVStore = 0;
+  let TSFlags{15} = isNVStore; // New-value store insn.
+
+  // Immediate extender helper fields.
+  bits<1> isExtendable = 0;
+  let TSFlags{16} = isExtendable; // Insn may be extended.
+  bits<1> isExtended = 0;
+  let TSFlags{17} = isExtended; // Insn must be extended.
+  bits<3> opExtendable = 0;
+  let TSFlags{20-18} = opExtendable; // Which operand may be extended.
+  bits<1> isExtentSigned = 0;
+  let TSFlags{21} = isExtentSigned; // Signed or unsigned range.
+  bits<5> opExtentBits = 0;
+  let TSFlags{26-22} = opExtentBits; //Number of bits of range before extending.
+  bits<2> opExtentAlign = 0;
+  let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending.
+}
+
+//----------------------------------------------------------------------------//
+//                         Instruction Classes Definitions
+//----------------------------------------------------------------------------//
+
+//
+// NV type instructions.
+//
+class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon;
+
+class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Definition of Post increment new value store.
+class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// Post increment ST Instruction.
+let mayStore = 1 in
+class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+// New-value conditional branch.
+class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1, mayStore = 1 in
+class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeV4LDST>,
+    OpcodeHexagon;
+
+class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : MEMInst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
+  : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
+                TypePREFIX>, OpcodeHexagon;
+
+class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypeDUPLEX>,
+    OpcodeHexagon;
+
+class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
+    OpcodeHexagon;
+
+class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
+    OpcodeHexagon;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
new file mode 100644
index 000000000000..b9f4373a0b79
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -0,0 +1,238 @@
+//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                         Hexagon Instruction Flags +
+//
+//                        *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeCVI_VA         : IType<13>;
+def TypeCVI_VA_DV      : IType<14>;
+def TypeCVI_VX         : IType<15>;
+def TypeCVI_VX_DV      : IType<16>;
+def TypeCVI_VP         : IType<17>;
+def TypeCVI_VP_VS      : IType<18>;
+def TypeCVI_VS         : IType<19>;
+def TypeCVI_VINLANESAT : IType<20>;
+def TypeCVI_VM_LD      : IType<21>;
+def TypeCVI_VM_TMP_LD  : IType<22>;
+def TypeCVI_VM_CUR_LD  : IType<23>;
+def TypeCVI_VM_VP_LDU  : IType<24>;
+def TypeCVI_VM_ST      : IType<25>;
+def TypeCVI_VM_NEW_ST  : IType<26>;
+def TypeCVI_VM_STU     : IType<27>;
+def TypeCVI_HIST       : IType<28>;
+//----------------------------------------------------------------------------//
+//                         Instruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VA>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VA_DV>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VX_LONG>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VX_LATE>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+     Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_Resource<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VX>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VX_DV>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VX_DV_SLOT2>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VX_DV_LONG>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VP_LONG>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VP_VS_EARLY>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VP_VS_LONG>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VP_VS_LONG_EARLY>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VS>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VINLANESAT>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VS_Resource_long<dag outs, dag ins, string asmstr,
+                           list<dag> pattern = [], string cstr = "",
+                           InstrItinClass itin = CVI_VS>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VM_LD>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr,
+                              list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VM_LD>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr,
+                             list<dag> pattern = [], string cstr = "",
+                             InstrItinClass itin = CVI_VM_TMP_LD>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr,
+                                  list<dag> pattern = [], string cstr = "",
+                                  InstrItinClass itin = CVI_VM_TMP_LD>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr,
+                             list<dag> pattern = [], string cstr = "",
+                             InstrItinClass itin = CVI_VM_CUR_LD>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr,
+                             list<dag> pattern = [], string cstr = "",
+                             InstrItinClass itin = CVI_VM_VP_LDU>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr,
+                                  list<dag> pattern = [], string cstr = "",
+                                  InstrItinClass itin = CVI_VM_VP_LDU>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VM_ST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr,
+                              list<dag> pattern = [], string cstr = "",
+                              InstrItinClass itin = CVI_VM_ST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr,
+                             list<dag> pattern = [], string cstr = "",
+                             InstrItinClass itin = CVI_VM_NEW_ST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr,
+                                  list<dag> pattern = [], string cstr = "",
+                                  InstrItinClass itin = CVI_VM_NEW_ST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr,
+                          list<dag> pattern = [], string cstr = "",
+                          InstrItinClass itin = CVI_VM_STU>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr,
+                               list<dag> pattern = [], string cstr = "",
+                               InstrItinClass itin = CVI_VM_STU>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource<dag outs, dag ins, string asmstr,
+                        list<dag> pattern = [], string cstr = "",
+                        InstrItinClass itin = CVI_HIST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+     OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
+}
+
+let validSubTargets = HasV60SubT in
+{
+class CVI_VA_Resource1<dag outs, dag ins, string asmstr,
+                       list<dag> pattern = [], string cstr = "",
+                       InstrItinClass itin = CVI_VA>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
+     Requires<[HasV60T, UseHVX]>;
+
+class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr,
+                         list<dag> pattern = [], string cstr = "",
+                         InstrItinClass itin = CVI_VX_DV>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
+     Requires<[HasV60T, UseHVX]>;
+
+class CVI_HIST_Resource1<dag outs, dag ins, string asmstr,
+                        list<dag> pattern = [], string cstr = "",
+                        InstrItinClass itin = CVI_HIST>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
+     Requires<[HasV60T, UseHVX]>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
new file mode 100644
index 000000000000..34ce3e652995
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -0,0 +1,4283 @@
+//===-- HexagonInstrInfo.cpp - Hexagon Instruction Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonHazardRecognizer.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "HexagonGenInstrInfo.inc"
+#include "HexagonGenDFAPacketizer.inc"
+
+cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden,
+  cl::init(false), cl::desc("Do not consider inline-asm a scheduling/"
+                            "packetization boundary."));
+
+static cl::opt<bool> EnableBranchPrediction("hexagon-enable-branch-prediction",
+  cl::Hidden, cl::init(true), cl::desc("Enable branch prediction"));
+
+static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable schedule adjustment for new value stores."));
+
+static cl::opt<bool> EnableTimingClassLatency(
+  "enable-timing-class-latency", cl::Hidden, cl::init(false),
+  cl::desc("Enable timing class latency"));
+
+static cl::opt<bool> EnableALUForwarding(
+  "enable-alu-forwarding", cl::Hidden, cl::init(true),
+  cl::desc("Enable vec alu forwarding"));
+
+static cl::opt<bool> EnableACCForwarding(
+  "enable-acc-forwarding", cl::Hidden, cl::init(true),
+  cl::desc("Enable vec acc forwarding"));
+
+static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large",
+  cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm"));
+
+static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec",
+  cl::init(true), cl::Hidden, cl::ZeroOrMore,
+  cl::desc("Use the DFA based hazard recognizer."));
+
+///
+/// Constants for Hexagon instructions.
+///
+const int Hexagon_MEMV_OFFSET_MAX_128B = 896;   // #s4: -8*128...7*128
+const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4
+const int Hexagon_MEMV_OFFSET_MAX = 448;  // #s4: -8*64...7*64
+const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4
+const int Hexagon_MEMW_OFFSET_MAX = 4095;
+const int Hexagon_MEMW_OFFSET_MIN = -4096;
+const int Hexagon_MEMD_OFFSET_MAX = 8191;
+const int Hexagon_MEMD_OFFSET_MIN = -8192;
+const int Hexagon_MEMH_OFFSET_MAX = 2047;
+const int Hexagon_MEMH_OFFSET_MIN = -2048;
+const int Hexagon_MEMB_OFFSET_MAX = 1023;
+const int Hexagon_MEMB_OFFSET_MIN = -1024;
+const int Hexagon_ADDI_OFFSET_MAX = 32767;
+const int Hexagon_ADDI_OFFSET_MIN = -32768;
+const int Hexagon_MEMD_AUTOINC_MAX = 56;
+const int Hexagon_MEMD_AUTOINC_MIN = -64;
+const int Hexagon_MEMW_AUTOINC_MAX = 28;
+const int Hexagon_MEMW_AUTOINC_MIN = -32;
+const int Hexagon_MEMH_AUTOINC_MAX = 14;
+const int Hexagon_MEMH_AUTOINC_MIN = -16;
+const int Hexagon_MEMB_AUTOINC_MAX = 7;
+const int Hexagon_MEMB_AUTOINC_MIN = -8;
+const int Hexagon_MEMV_AUTOINC_MAX = 192;   // #s3
+const int Hexagon_MEMV_AUTOINC_MIN = -256;  // #s3
+const int Hexagon_MEMV_AUTOINC_MAX_128B = 384;  // #s3
+const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // #s3
+
+// Pin the vtable to this file.
+void HexagonInstrInfo::anchor() {}
+
+HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
+    : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
+      RI() {}
+
+
+static bool isIntRegForSubInst(unsigned Reg) {
+  return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
+         (Reg >= Hexagon::R16 && Reg <= Hexagon::R23);
+}
+
+
+static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) {
+  return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::isub_lo)) &&
+         isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::isub_hi));
+}
+
+
+/// Calculate number of instructions excluding the debug instructions.
+static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
+                              MachineBasicBlock::const_instr_iterator MIE) {
+  unsigned Count = 0;
+  for (; MIB != MIE; ++MIB) {
+    if (!MIB->isDebugValue())
+      ++Count;
+  }
+  return Count;
+}
+
+
+/// Find the hardware loop instruction used to set-up the specified loop.
+/// On Hexagon, we have two instructions used to set-up the hardware loop
+/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
+/// to indicate the end of a loop.
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+      SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
+  int LOOPi;
+  int LOOPr;
+  if (EndLoopOp == Hexagon::ENDLOOP0) {
+    LOOPi = Hexagon::J2_loop0i;
+    LOOPr = Hexagon::J2_loop0r;
+  } else { // EndLoopOp == Hexagon::EndLOOP1
+    LOOPi = Hexagon::J2_loop1i;
+    LOOPr = Hexagon::J2_loop1r;
+  }
+
+  // The loop set-up instruction will be in a predecessor block
+  for (MachineBasicBlock::pred_iterator PB = BB->pred_begin(),
+         PE = BB->pred_end(); PB != PE; ++PB) {
+    // If this has been visited, already skip it.
+    if (!Visited.insert(*PB).second)
+      continue;
+    if (*PB == BB)
+      continue;
+    for (MachineBasicBlock::reverse_instr_iterator I = (*PB)->instr_rbegin(),
+           E = (*PB)->instr_rend(); I != E; ++I) {
+      int Opc = I->getOpcode();
+      if (Opc == LOOPi || Opc == LOOPr)
+        return &*I;
+      // We've reached a different loop, which means the loop0 has been removed.
+      if (Opc == EndLoopOp)
+        return 0;
+    }
+    // Check the predecessors for the LOOP instruction.
+    MachineInstr *loop = findLoopInstr(*PB, EndLoopOp, Visited);
+    if (loop)
+      return loop;
+  }
+  return 0;
+}
+
+
+/// Gather register def/uses from MI.
+/// This treats possible (predicated) defs as actually happening ones
+/// (conservatively).
+static inline void parseOperands(const MachineInstr &MI,
+      SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) {
+  Defs.clear();
+  Uses.clear();
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (MO.isUse())
+      Uses.push_back(MO.getReg());
+
+    if (MO.isDef())
+      Defs.push_back(MO.getReg());
+  }
+}
+
+
+// Position dependent, so check twice for swap.
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+  switch (Ga) {
+  case HexagonII::HSIG_None:
+  default:
+    return false;
+  case HexagonII::HSIG_L1:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_L2:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_S1:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_S2:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 ||
+            Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_A:
+    return (Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_Compound:
+    return (Gb == HexagonII::HSIG_Compound);
+  }
+  return false;
+}
+
+
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Hexagon::L2_loadri_io:
+  case Hexagon::L2_loadrd_io:
+  case Hexagon::V6_vL32b_ai:
+  case Hexagon::V6_vL32b_ai_128B:
+  case Hexagon::V6_vL32Ub_ai:
+  case Hexagon::V6_vL32Ub_ai_128B:
+  case Hexagon::LDriw_pred:
+  case Hexagon::LDriw_mod:
+  case Hexagon::PS_vloadrq_ai:
+  case Hexagon::PS_vloadrw_ai:
+  case Hexagon::PS_vloadrq_ai_128B:
+  case Hexagon::PS_vloadrw_ai_128B: {
+    const MachineOperand OpFI = MI.getOperand(1);
+    if (!OpFI.isFI())
+      return 0;
+    const MachineOperand OpOff = MI.getOperand(2);
+    if (!OpOff.isImm() || OpOff.getImm() != 0)
+      return 0;
+    FrameIndex = OpFI.getIndex();
+    return MI.getOperand(0).getReg();
+  }
+
+  case Hexagon::L2_ploadrit_io:
+  case Hexagon::L2_ploadrif_io:
+  case Hexagon::L2_ploadrdt_io:
+  case Hexagon::L2_ploadrdf_io: {
+    const MachineOperand OpFI = MI.getOperand(2);
+    if (!OpFI.isFI())
+      return 0;
+    const MachineOperand OpOff = MI.getOperand(3);
+    if (!OpOff.isImm() || OpOff.getImm() != 0)
+      return 0;
+    FrameIndex = OpFI.getIndex();
+    return MI.getOperand(0).getReg();
+  }
+  }
+
+  return 0;
+}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Hexagon::S2_storerb_io:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerd_io:
+  case Hexagon::V6_vS32b_ai:
+  case Hexagon::V6_vS32b_ai_128B:
+  case Hexagon::V6_vS32Ub_ai:
+  case Hexagon::V6_vS32Ub_ai_128B:
+  case Hexagon::STriw_pred:
+  case Hexagon::STriw_mod:
+  case Hexagon::PS_vstorerq_ai:
+  case Hexagon::PS_vstorerw_ai:
+  case Hexagon::PS_vstorerq_ai_128B:
+  case Hexagon::PS_vstorerw_ai_128B: {
+    const MachineOperand &OpFI = MI.getOperand(0);
+    if (!OpFI.isFI())
+      return 0;
+    const MachineOperand &OpOff = MI.getOperand(1);
+    if (!OpOff.isImm() || OpOff.getImm() != 0)
+      return 0;
+    FrameIndex = OpFI.getIndex();
+    return MI.getOperand(2).getReg();
+  }
+
+  case Hexagon::S2_pstorerbt_io:
+  case Hexagon::S2_pstorerbf_io:
+  case Hexagon::S2_pstorerht_io:
+  case Hexagon::S2_pstorerhf_io:
+  case Hexagon::S2_pstorerit_io:
+  case Hexagon::S2_pstorerif_io:
+  case Hexagon::S2_pstorerdt_io:
+  case Hexagon::S2_pstorerdf_io: {
+    const MachineOperand &OpFI = MI.getOperand(1);
+    if (!OpFI.isFI())
+      return 0;
+    const MachineOperand &OpOff = MI.getOperand(2);
+    if (!OpOff.isImm() || OpOff.getImm() != 0)
+      return 0;
+    FrameIndex = OpFI.getIndex();
+    return MI.getOperand(3).getReg();
+  }
+  }
+
+  return 0;
+}
+
+
+/// This function can analyze one/two way branching only and should (mostly) be
+/// called by target independent side.
+/// First entry is always the opcode of the branching instruction, except when
+/// the Cond vector is supposed to be empty, e.g., when AnalyzeBranch fails, a
+/// BB with only unconditional jump. Subsequent entries depend upon the opcode,
+/// e.g. Jump_c p will have
+/// Cond[0] = Jump_c
+/// Cond[1] = p
+/// HW-loop ENDLOOP:
+/// Cond[0] = ENDLOOP
+/// Cond[1] = MBB
+/// New value jump:
+/// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode
+/// Cond[1] = R
+/// Cond[2] = Imm
+///
+bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  TBB = nullptr;
+  FBB = nullptr;
+  Cond.clear();
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  if (I == MBB.instr_begin())
+    return false;
+
+  // A basic block may looks like this:
+  //
+  //  [   insn
+  //     EH_LABEL
+  //      insn
+  //      insn
+  //      insn
+  //     EH_LABEL
+  //      insn     ]
+  //
+  // It has two succs but does not have a terminator
+  // Don't know how to handle it.
+  do {
+    --I;
+    if (I->isEHLabel())
+      // Don't analyze EH branches.
+      return true;
+  } while (I != MBB.instr_begin());
+
+  I = MBB.instr_end();
+  --I;
+
+  while (I->isDebugValue()) {
+    if (I == MBB.instr_begin())
+      return false;
+    --I;
+  }
+
+  bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump &&
+                     I->getOperand(0).isMBB();
+  // Delete the J2_jump if it's equivalent to a fall-through.
+  if (AllowModify && JumpToBlock &&
+      MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+    DEBUG(dbgs()<< "\nErasing the jump to successor block\n";);
+    I->eraseFromParent();
+    I = MBB.instr_end();
+    if (I == MBB.instr_begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(*I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+  MachineInstr *SecondLastInst = nullptr;
+  // Find one more terminator if present.
+  for (;;) {
+    if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
+      if (!SecondLastInst)
+        SecondLastInst = &*I;
+      else
+        // This is a third branch.
+        return true;
+    }
+    if (I == MBB.instr_begin())
+      break;
+    --I;
+  }
+
+  int LastOpcode = LastInst->getOpcode();
+  int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0;
+  // If the branch target is not a basic block, it could be a tail call.
+  // (It is, if the target is a function.)
+  if (LastOpcode == Hexagon::J2_jump && !LastInst->getOperand(0).isMBB())
+    return true;
+  if (SecLastOpcode == Hexagon::J2_jump &&
+      !SecondLastInst->getOperand(0).isMBB())
+    return true;
+
+  bool LastOpcodeHasJMP_c = PredOpcodeHasJMP_c(LastOpcode);
+  bool LastOpcodeHasNVJump = isNewValueJump(*LastInst);
+
+  if (LastOpcodeHasJMP_c && !LastInst->getOperand(1).isMBB())
+    return true;
+
+  // If there is only one terminator instruction, process it.
+  if (LastInst && !SecondLastInst) {
+    if (LastOpcode == Hexagon::J2_jump) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isEndLoopN(LastOpcode)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    if (LastOpcodeHasJMP_c) {
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Only supporting rr/ri versions of new-value jumps.
+    if (LastOpcodeHasNVJump && (LastInst->getNumExplicitOperands() == 3)) {
+      TBB = LastInst->getOperand(2).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    }
+    DEBUG(dbgs() << "\nCant analyze BB#" << MBB.getNumber()
+                 << " with one jump\n";);
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  bool SecLastOpcodeHasJMP_c = PredOpcodeHasJMP_c(SecLastOpcode);
+  bool SecLastOpcodeHasNVJump = isNewValueJump(*SecondLastInst);
+  if (SecLastOpcodeHasJMP_c && (LastOpcode == Hexagon::J2_jump)) {
+    if (!SecondLastInst->getOperand(1).isMBB())
+      return true;
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // Only supporting rr/ri versions of new-value jumps.
+  if (SecLastOpcodeHasNVJump &&
+      (SecondLastInst->getNumExplicitOperands() == 3) &&
+      (LastOpcode == Hexagon::J2_jump)) {
+    TBB = SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two Hexagon:JMPs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst->getIterator();
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // If the block ends with an ENDLOOP, and J2_jump, handle it.
+  if (isEndLoopN(SecLastOpcode) && LastOpcode == Hexagon::J2_jump) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  DEBUG(dbgs() << "\nCant analyze BB#" << MBB.getNumber()
+               << " with two jumps";);
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+unsigned HexagonInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber());
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    // Only removing branches from end of MBB.
+    if (!I->isBranch())
+      return Count;
+    if (Count && (I->getOpcode() == Hexagon::J2_jump))
+      llvm_unreachable("Malformed basic block: unconditional branch not last");
+    MBB.erase(&MBB.back());
+    I = MBB.end();
+    ++Count;
+  }
+  return Count;
+}
+
+unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  unsigned BOpc   = Hexagon::J2_jump;
+  unsigned BccOpc = Hexagon::J2_jumpt;
+  assert(validateBranchCond(Cond) && "Invalid branching condition");
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // Check if reverseBranchCondition has asked to reverse this branch
+  // If we want to reverse the branch an odd number of times, we want
+  // J2_jumpf.
+  if (!Cond.empty() && Cond[0].isImm())
+    BccOpc = Cond[0].getImm();
+
+  if (!FBB) {
+    if (Cond.empty()) {
+      // Due to a bug in TailMerging/CFG Optimization, we need to add a
+      // special case handling of a predicated jump followed by an
+      // unconditional jump. If not, Tail Merging and CFG Optimization go
+      // into an infinite loop.
+      MachineBasicBlock *NewTBB, *NewFBB;
+      SmallVector<MachineOperand, 4> Cond;
+      auto Term = MBB.getFirstTerminator();
+      if (Term != MBB.end() && isPredicated(*Term) &&
+          !analyzeBranch(MBB, NewTBB, NewFBB, Cond, false) &&
+          MachineFunction::iterator(NewTBB) == ++MBB.getIterator()) {
+        reverseBranchCondition(Cond);
+        removeBranch(MBB);
+        return insertBranch(MBB, TBB, nullptr, Cond, DL);
+      }
+      BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else if (isEndLoopN(Cond[0].getImm())) {
+      int EndLoopOp = Cond[0].getImm();
+      assert(Cond[1].isMBB());
+      // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+      // Check for it, and change the BB target if needed.
+      SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+      MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+      assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
+      Loop->getOperand(0).setMBB(TBB);
+      // Add the ENDLOOP after the finding the LOOP0.
+      BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+    } else if (isNewValueJump(Cond[0].getImm())) {
+      assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump");
+      // New value jump
+      // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset)
+      // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset)
+      unsigned Flags1 = getUndefRegState(Cond[1].isUndef());
+      DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber(););
+      if (Cond[2].isReg()) {
+        unsigned Flags2 = getUndefRegState(Cond[2].isUndef());
+        BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+          addReg(Cond[2].getReg(), Flags2).addMBB(TBB);
+      } else if(Cond[2].isImm()) {
+        BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1).
+          addImm(Cond[2].getImm()).addMBB(TBB);
+      } else
+        llvm_unreachable("Invalid condition for branching");
+    } else {
+      assert((Cond.size() == 2) && "Malformed cond vector");
+      const MachineOperand &RO = Cond[1];
+      unsigned Flags = getUndefRegState(RO.isUndef());
+      BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+    }
+    return 1;
+  }
+  assert((!Cond.empty()) &&
+         "Cond. cannot be empty when multiple branchings are required");
+  assert((!isNewValueJump(Cond[0].getImm())) &&
+         "NV-jump cannot be inserted with another branch");
+  // Special case for hardware loops.  The condition is a basic block.
+  if (isEndLoopN(Cond[0].getImm())) {
+    int EndLoopOp = Cond[0].getImm();
+    assert(Cond[1].isMBB());
+    // Since we're adding an ENDLOOP, there better be a LOOP instruction.
+    // Check for it, and change the BB target if needed.
+    SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+    MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+    assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
+    Loop->getOperand(0).setMBB(TBB);
+    // Add the ENDLOOP after the finding the LOOP0.
+    BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB);
+  } else {
+    const MachineOperand &RO = Cond[1];
+    unsigned Flags = getUndefRegState(RO.isUndef());
+    BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB);
+  }
+  BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+
+  return 2;
+}
+
+/// Analyze the loop code to find the loop induction variable and compare used
+/// to compute the number of iterations. Currently, we analyze loop that are
+/// controlled using hardware loops.  In this case, the induction variable
+/// instruction is null.  For all other cases, this function returns true, which
+/// means we're unable to analyze it.
+bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
+                                   MachineInstr *&IndVarInst,
+                                   MachineInstr *&CmpInst) const {
+
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  // We really "analyze" only hardware loops right now.
+  if (I != LoopEnd->end() && isEndLoopN(I->getOpcode())) {
+    IndVarInst = nullptr;
+    CmpInst = &*I;
+    return false;
+  }
+  return true;
+}
+
+/// Generate code to reduce the loop iteration by one and check if the loop is
+/// finished. Return the value/register of the new loop count. this function
+/// assumes the nth iteration is peeled first.
+unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+      MachineInstr *IndVar, MachineInstr &Cmp,
+      SmallVectorImpl<MachineOperand> &Cond,
+      SmallVectorImpl<MachineInstr *> &PrevInsts,
+      unsigned Iter, unsigned MaxIter) const {
+  // We expect a hardware loop currently. This means that IndVar is set
+  // to null, and the compare is the ENDLOOP instruction.
+  assert((!IndVar) && isEndLoopN(Cmp.getOpcode())
+                   && "Expecting a hardware loop");
+  MachineFunction *MF = MBB.getParent();
+  DebugLoc DL = Cmp.getDebugLoc();
+  SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+  MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), VisitedBBs);
+  if (!Loop)
+    return 0;
+  // If the loop trip count is a compile-time value, then just change the
+  // value.
+  if (Loop->getOpcode() == Hexagon::J2_loop0i ||
+      Loop->getOpcode() == Hexagon::J2_loop1i) {
+    int64_t Offset = Loop->getOperand(1).getImm();
+    if (Offset <= 1)
+      Loop->eraseFromParent();
+    else
+      Loop->getOperand(1).setImm(Offset - 1);
+    return Offset - 1;
+  }
+  // The loop trip count is a run-time value. We generate code to subtract
+  // one from the trip count, and update the loop instruction.
+  assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction");
+  unsigned LoopCount = Loop->getOperand(1).getReg();
+  // Check if we're done with the loop.
+  unsigned LoopEnd = createVR(MF, MVT::i1);
+  MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd).
+    addReg(LoopCount).addImm(1);
+  unsigned NewLoopCount = createVR(MF, MVT::i32);
+  MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount).
+    addReg(LoopCount).addImm(-1);
+  // Update the previously generated instructions with the new loop counter.
+  for (SmallVectorImpl<MachineInstr *>::iterator I = PrevInsts.begin(),
+         E = PrevInsts.end(); I != E; ++I)
+    (*I)->substituteRegister(LoopCount, NewLoopCount, 0, getRegisterInfo());
+  PrevInsts.clear();
+  PrevInsts.push_back(NewCmp);
+  PrevInsts.push_back(NewAdd);
+  // Insert the new loop instruction if this is the last time the loop is
+  // decremented.
+  if (Iter == MaxIter)
+    BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)).
+      addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount);
+  // Delete the old loop instruction.
+  if (Iter == 0)
+    Loop->eraseFromParent();
+  Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
+  Cond.push_back(NewCmp->getOperand(0));
+  return NewLoopCount;
+}
+
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+      unsigned NumCycles, unsigned ExtraPredCycles,
+      BranchProbability Probability) const {
+  return nonDbgBBSize(&MBB) <= 3;
+}
+
+
+bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+      unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB,
+      unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability)
+      const {
+  return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3;
+}
+
+
+bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+      unsigned NumInstrs, BranchProbability Probability) const {
+  return NumInstrs <= 4;
+}
+
+void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  auto &HRI = getRegisterInfo();
+  unsigned KillFlag = getKillRegState(KillSrc);
+
+  if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrp), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
+    // Map Pd = Ps to Pd = or(Ps, Ps).
+    BuildMI(MBB, I, DL, get(Hexagon::C2_or), DestReg)
+      .addReg(SrcReg).addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::CtrRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(DestReg) &&
+      Hexagon::CtrRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrcrr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::ModRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::A2_tfrrcr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+      Hexagon::IntRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+      Hexagon::PredRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrrp), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+      Hexagon::IntRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg).
+      addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) {
+    unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+    unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg)
+      .addReg(HiSrc, KillFlag)
+      .addReg(LoSrc, KillFlag);
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg, KillFlag);
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(SrcReg) &&
+      Hexagon::VectorRegsRegClass.contains(DestReg)) {
+    llvm_unreachable("Unimplemented pred to vec");
+    return;
+  }
+  if (Hexagon::VecPredRegsRegClass.contains(DestReg) &&
+      Hexagon::VectorRegsRegClass.contains(SrcReg)) {
+    llvm_unreachable("Unimplemented vec to pred");
+    return;
+  }
+  if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) {
+    unsigned HiDst = HRI.getSubReg(DestReg, Hexagon::vsub_hi);
+    unsigned LoDst = HRI.getSubReg(DestReg, Hexagon::vsub_lo);
+    unsigned HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+    unsigned LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), HiDst)
+      .addReg(HiSrc, KillFlag);
+    BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), LoDst)
+      .addReg(LoSrc, KillFlag);
+    return;
+  }
+
+#ifndef NDEBUG
+  // Show the invalid registers to ease debugging.
+  dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber()
+         << ": " << PrintReg(DestReg, &HRI)
+         << " = " << PrintReg(SrcReg, &HRI) << '\n';
+#endif
+  llvm_unreachable("Unimplemented");
+}
+
+
+void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI,
+      const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  unsigned KillFlag = getKillRegState(isKill);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), Align);
+
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::S2_storerd_io))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw_pred))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw_mod))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::PS_vstorerq_ai_128B))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::PS_vstorerq_ai))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 128 ? Hexagon::V6_vS32Ub_ai_128B
+                               : Hexagon::V6_vS32b_ai_128B;
+    BuildMI(MBB, I, DL, get(Opc))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 64 ? Hexagon::V6_vS32Ub_ai
+                              : Hexagon::V6_vS32b_ai;
+    BuildMI(MBB, I, DL, get(Opc))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 64 ? Hexagon::PS_vstorerwu_ai
+                              : Hexagon::PS_vstorerw_ai;
+    BuildMI(MBB, I, DL, get(Opc))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 128 ? Hexagon::PS_vstorerwu_ai_128B
+                               : Hexagon::PS_vstorerw_ai_128B;
+    BuildMI(MBB, I, DL, get(Opc))
+      .addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, KillFlag).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Unimplemented");
+  }
+}
+
+void HexagonInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg,
+    int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), Align);
+
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::L2_loadrd_io), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::ModRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw_mod), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VecPredRegs128BRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::PS_vloadrq_ai_128B), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VecPredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::PS_vloadrq_ai), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 128 ? Hexagon::PS_vloadrwu_ai_128B
+                               : Hexagon::PS_vloadrw_ai_128B;
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 128 ? Hexagon::V6_vL32Ub_ai_128B
+                               : Hexagon::V6_vL32b_ai_128B;
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 64 ? Hexagon::V6_vL32Ub_ai
+                              : Hexagon::V6_vL32b_ai;
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Align < 64 ? Hexagon::PS_vloadrwu_ai
+                              : Hexagon::PS_vloadrw_ai;
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+      .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Can't store this register to stack slot");
+  }
+}
+
+
+static void getLiveRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
+  const MachineBasicBlock &B = *MI.getParent();
+  Regs.addLiveOuts(B);
+  auto E = ++MachineBasicBlock::const_iterator(MI.getIterator()).getReverse();
+  for (auto I = B.rbegin(); I != E; ++I)
+    Regs.stepBackward(*I);
+}
+
+/// expandPostRAPseudo - This function is called for all pseudo instructions
+/// that remain after register allocation. Many pseudo instructions are
+/// created to help register allocation. This is the place to convert them
+/// into real instructions. The target can edit MI in place, or it can insert
+/// new instructions and erase MI. The function should return true if
+/// anything was changed.
+bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const HexagonRegisterInfo &HRI = getRegisterInfo();
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Opc = MI.getOpcode();
+  const unsigned VecOffset = 1;
+
+  switch (Opc) {
+    case TargetOpcode::COPY: {
+      MachineOperand &MD = MI.getOperand(0);
+      MachineOperand &MS = MI.getOperand(1);
+      MachineBasicBlock::iterator MBBI = MI.getIterator();
+      if (MD.getReg() != MS.getReg() && !MS.isUndef()) {
+        copyPhysReg(MBB, MI, DL, MD.getReg(), MS.getReg(), MS.isKill());
+        std::prev(MBBI)->copyImplicitOps(*MBB.getParent(), MI);
+      }
+      MBB.erase(MBBI);
+      return true;
+    }
+    case Hexagon::PS_aligna:
+      BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI.getOperand(0).getReg())
+          .addReg(HRI.getFrameRegister())
+          .addImm(-MI.getOperand(1).getImm());
+      MBB.erase(MI);
+      return true;
+    case Hexagon::V6_vassignp_128B:
+    case Hexagon::V6_vassignp: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Kill = getKillRegState(MI.getOperand(1).isKill());
+      BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg)
+        .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill)
+        .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_lo), Kill);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::V6_lo_128B:
+    case Hexagon::V6_lo: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+      copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI.getOperand(1).isKill());
+      MBB.erase(MI);
+      MRI.clearKillFlags(SrcSubLo);
+      return true;
+    }
+    case Hexagon::V6_hi_128B:
+    case Hexagon::V6_hi: {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+      copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI.getOperand(1).isKill());
+      MBB.erase(MI);
+      MRI.clearKillFlags(SrcSubHi);
+      return true;
+    }
+    case Hexagon::PS_vstorerw_ai:
+    case Hexagon::PS_vstorerwu_ai:
+    case Hexagon::PS_vstorerw_ai_128B:
+    case Hexagon::PS_vstorerwu_ai_128B: {
+      bool Is128B = (Opc == Hexagon::PS_vstorerw_ai_128B ||
+                     Opc == Hexagon::PS_vstorerwu_ai_128B);
+      bool Aligned = (Opc == Hexagon::PS_vstorerw_ai ||
+                      Opc == Hexagon::PS_vstorerw_ai_128B);
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+      unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+      unsigned NewOpc;
+      if (Aligned)
+        NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B
+                        : Hexagon::V6_vS32b_ai;
+      else
+        NewOpc = Is128B ? Hexagon::V6_vS32Ub_ai_128B
+                        : Hexagon::V6_vS32Ub_ai;
+
+      unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+      MachineInstr *MI1New =
+          BuildMI(MBB, MI, DL, get(NewOpc))
+              .addOperand(MI.getOperand(0))
+              .addImm(MI.getOperand(1).getImm())
+              .addReg(SrcSubLo)
+              .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI1New->getOperand(0).setIsKill(false);
+      BuildMI(MBB, MI, DL, get(NewOpc))
+          .addOperand(MI.getOperand(0))
+          // The Vectors are indexed in multiples of vector size.
+          .addImm(MI.getOperand(1).getImm() + Offset)
+          .addReg(SrcSubHi)
+          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vloadrw_ai:
+    case Hexagon::PS_vloadrwu_ai:
+    case Hexagon::PS_vloadrw_ai_128B:
+    case Hexagon::PS_vloadrwu_ai_128B: {
+      bool Is128B = (Opc == Hexagon::PS_vloadrw_ai_128B ||
+                     Opc == Hexagon::PS_vloadrwu_ai_128B);
+      bool Aligned = (Opc == Hexagon::PS_vloadrw_ai ||
+                      Opc == Hexagon::PS_vloadrw_ai_128B);
+      unsigned NewOpc;
+      if (Aligned)
+        NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B
+                        : Hexagon::V6_vL32b_ai;
+      else
+        NewOpc = Is128B ? Hexagon::V6_vL32Ub_ai_128B
+                        : Hexagon::V6_vL32Ub_ai;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
+      MachineInstr *MI1New =
+          BuildMI(MBB, MI, DL, get(NewOpc),
+                  HRI.getSubReg(DstReg, Hexagon::vsub_lo))
+              .addOperand(MI.getOperand(1))
+              .addImm(MI.getOperand(2).getImm());
+      MI1New->getOperand(1).setIsKill(false);
+      BuildMI(MBB, MI, DL, get(NewOpc),
+              HRI.getSubReg(DstReg, Hexagon::vsub_hi))
+          .addOperand(MI.getOperand(1))
+          // The Vectors are indexed in multiples of vector size.
+          .addImm(MI.getOperand(2).getImm() + Offset)
+          .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_true: {
+      unsigned Reg = MI.getOperand(0).getReg();
+      BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_false: {
+      unsigned Reg = MI.getOperand(0).getReg();
+      BuildMI(MBB, MI, DL, get(Hexagon::C2_andn), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vmulw: {
+      // Expand a 64-bit vector multiply into 2 32-bit scalar multiplies.
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Src1Reg = MI.getOperand(1).getReg();
+      unsigned Src2Reg = MI.getOperand(2).getReg();
+      unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+      unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+      unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+      unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+              HRI.getSubReg(DstReg, Hexagon::isub_hi))
+          .addReg(Src1SubHi)
+          .addReg(Src2SubHi);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_mpyi),
+              HRI.getSubReg(DstReg, Hexagon::isub_lo))
+          .addReg(Src1SubLo)
+          .addReg(Src2SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      return true;
+    }
+    case Hexagon::PS_vmulw_acc: {
+      // Expand 64-bit vector multiply with addition into 2 scalar multiplies.
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned Src1Reg = MI.getOperand(1).getReg();
+      unsigned Src2Reg = MI.getOperand(2).getReg();
+      unsigned Src3Reg = MI.getOperand(3).getReg();
+      unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::isub_hi);
+      unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::isub_lo);
+      unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::isub_hi);
+      unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::isub_lo);
+      unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::isub_hi);
+      unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::isub_lo);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+              HRI.getSubReg(DstReg, Hexagon::isub_hi))
+          .addReg(Src1SubHi)
+          .addReg(Src2SubHi)
+          .addReg(Src3SubHi);
+      BuildMI(MBB, MI, MI.getDebugLoc(), get(Hexagon::M2_maci),
+              HRI.getSubReg(DstReg, Hexagon::isub_lo))
+          .addReg(Src1SubLo)
+          .addReg(Src2SubLo)
+          .addReg(Src3SubLo);
+      MBB.erase(MI);
+      MRI.clearKillFlags(Src1SubHi);
+      MRI.clearKillFlags(Src1SubLo);
+      MRI.clearKillFlags(Src2SubHi);
+      MRI.clearKillFlags(Src2SubLo);
+      MRI.clearKillFlags(Src3SubHi);
+      MRI.clearKillFlags(Src3SubLo);
+      return true;
+    }
+    case Hexagon::PS_pselect: {
+      const MachineOperand &Op0 = MI.getOperand(0);
+      const MachineOperand &Op1 = MI.getOperand(1);
+      const MachineOperand &Op2 = MI.getOperand(2);
+      const MachineOperand &Op3 = MI.getOperand(3);
+      unsigned Rd = Op0.getReg();
+      unsigned Pu = Op1.getReg();
+      unsigned Rs = Op2.getReg();
+      unsigned Rt = Op3.getReg();
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned K1 = getKillRegState(Op1.isKill());
+      unsigned K2 = getKillRegState(Op2.isKill());
+      unsigned K3 = getKillRegState(Op3.isKill());
+      if (Rd != Rs)
+        BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd)
+          .addReg(Pu, (Rd == Rt) ? K1 : 0)
+          .addReg(Rs, K2);
+      if (Rd != Rt)
+        BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd)
+          .addReg(Pu, K1)
+          .addReg(Rt, K3);
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_vselect:
+    case Hexagon::PS_vselect_128B: {
+      const MachineOperand &Op0 = MI.getOperand(0);
+      const MachineOperand &Op1 = MI.getOperand(1);
+      const MachineOperand &Op2 = MI.getOperand(2);
+      const MachineOperand &Op3 = MI.getOperand(3);
+      LivePhysRegs LiveAtMI(&HRI);
+      getLiveRegsAt(LiveAtMI, MI);
+      bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+      if (Op0.getReg() != Op2.getReg()) {
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
+                    .addOperand(Op0)
+                    .addOperand(Op1)
+                    .addOperand(Op2);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+        IsDestLive = true;
+      }
+      if (Op0.getReg() != Op3.getReg()) {
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
+                    .addOperand(Op0)
+                    .addOperand(Op1)
+                    .addOperand(Op3);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+      }
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_wselect:
+    case Hexagon::PS_wselect_128B: {
+      MachineOperand &Op0 = MI.getOperand(0);
+      MachineOperand &Op1 = MI.getOperand(1);
+      MachineOperand &Op2 = MI.getOperand(2);
+      MachineOperand &Op3 = MI.getOperand(3);
+      LivePhysRegs LiveAtMI(&HRI);
+      getLiveRegsAt(LiveAtMI, MI);
+      bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+
+      if (Op0.getReg() != Op2.getReg()) {
+        unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
+        unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
+                    .addOperand(Op0)
+                    .addOperand(Op1)
+                    .addReg(SrcHi)
+                    .addReg(SrcLo);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+        IsDestLive = true;
+      }
+      if (Op0.getReg() != Op3.getReg()) {
+        unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
+        unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
+        auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
+                    .addOperand(Op0)
+                    .addOperand(Op1)
+                    .addReg(SrcHi)
+                    .addReg(SrcLo);
+        if (IsDestLive)
+          T.addReg(Op0.getReg(), RegState::Implicit);
+      }
+      MBB.erase(MI);
+      return true;
+    }
+    case Hexagon::PS_tailcall_i:
+      MI.setDesc(get(Hexagon::J2_jump));
+      return true;
+    case Hexagon::PS_tailcall_r:
+    case Hexagon::PS_jmpret:
+      MI.setDesc(get(Hexagon::J2_jumpr));
+      return true;
+    case Hexagon::PS_jmprett:
+      MI.setDesc(get(Hexagon::J2_jumprt));
+      return true;
+    case Hexagon::PS_jmpretf:
+      MI.setDesc(get(Hexagon::J2_jumprf));
+      return true;
+    case Hexagon::PS_jmprettnewpt:
+      MI.setDesc(get(Hexagon::J2_jumprtnewpt));
+      return true;
+    case Hexagon::PS_jmpretfnewpt:
+      MI.setDesc(get(Hexagon::J2_jumprfnewpt));
+      return true;
+    case Hexagon::PS_jmprettnew:
+      MI.setDesc(get(Hexagon::J2_jumprtnew));
+      return true;
+    case Hexagon::PS_jmpretfnew:
+      MI.setDesc(get(Hexagon::J2_jumprfnew));
+      return true;
+  }
+
+  return false;
+}
+
+
+// We indicate that we want to reverse the branch by
+// inserting the reversed branching opcode.
+bool HexagonInstrInfo::reverseBranchCondition(
+      SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond.empty())
+    return true;
+  assert(Cond[0].isImm() && "First entry in the cond vector not imm-val");
+  unsigned opcode = Cond[0].getImm();
+  //unsigned temp;
+  assert(get(opcode).isBranch() && "Should be a branching condition.");
+  if (isEndLoopN(opcode))
+    return true;
+  unsigned NewOpcode = getInvertedPredicatedOpcode(opcode);
+  Cond[0].setImm(NewOpcode);
+  return false;
+}
+
+
+void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator MI) const {
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Hexagon::A2_nop));
+}
+
+
+bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
+  return getAddrMode(MI) == HexagonII::PostInc;
+}
+
+
+// Returns true if an instruction is predicated irrespective of the predicate
+// sense. For example, all of the following will return true.
+// if (p0) R1 = add(R2, R3)
+// if (!p0) R1 = add(R2, R3)
+// if (p0.new) R1 = add(R2, R3)
+// if (!p0.new) R1 = add(R2, R3)
+// Note: New-value stores are not included here as in the current
+// implementation, we don't need to check their predicate sense.
+bool HexagonInstrInfo::isPredicated(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
+}
+
+
+bool HexagonInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
+  if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
+      isEndLoopN(Cond[0].getImm())) {
+    DEBUG(dbgs() << "\nCannot predicate:"; MI.dump(););
+    return false;
+  }
+  int Opc = MI.getOpcode();
+  assert (isPredicable(MI) && "Expected predicable instruction");
+  bool invertJump = predOpcodeHasNot(Cond);
+
+  // We have to predicate MI "in place", i.e. after this function returns,
+  // MI will need to be transformed into a predicated form. To avoid com-
+  // plicated manipulations with the operands (handling tied operands,
+  // etc.), build a new temporary instruction, then overwrite MI with it.
+
+  MachineBasicBlock &B = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned PredOpc = getCondOpcode(Opc, invertJump);
+  MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc));
+  unsigned NOp = 0, NumOps = MI.getNumOperands();
+  while (NOp < NumOps) {
+    MachineOperand &Op = MI.getOperand(NOp);
+    if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+      break;
+    T.addOperand(Op);
+    NOp++;
+  }
+
+  unsigned PredReg, PredRegPos, PredRegFlags;
+  bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags);
+  (void)GotPredReg;
+  assert(GotPredReg);
+  T.addReg(PredReg, PredRegFlags);
+  while (NOp < NumOps)
+    T.addOperand(MI.getOperand(NOp++));
+
+  MI.setDesc(get(PredOpc));
+  while (unsigned n = MI.getNumOperands())
+    MI.RemoveOperand(n-1);
+  for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i)
+    MI.addOperand(T->getOperand(i));
+
+  MachineBasicBlock::instr_iterator TI = T->getIterator();
+  B.erase(TI);
+
+  MachineRegisterInfo &MRI = B.getParent()->getRegInfo();
+  MRI.clearKillFlags(PredReg);
+  return true;
+}
+
+
+bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+      ArrayRef<MachineOperand> Pred2) const {
+  // TODO: Fix this
+  return false;
+}
+
+
+bool HexagonInstrInfo::DefinesPredicate(
+    MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
+  auto &HRI = getRegisterInfo();
+  for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
+    MachineOperand MO = MI.getOperand(oper);
+    if (MO.isReg() && MO.isDef()) {
+      const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
+      if (RC == &Hexagon::PredRegsRegClass) {
+        Pred.push_back(MO);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
+  return MI.getDesc().isPredicable();
+}
+
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI.isDebugValue())
+    return false;
+
+  // Throwing call is a boundary.
+  if (MI.isCall()) {
+    // Don't mess around with no return calls.
+    if (doesNotReturn(MI))
+      return true;
+    // If any of the block's successors is a landing pad, this could be a
+    // throwing call.
+    for (auto I : MBB->successors())
+      if (I->isEHPad())
+        return true;
+  }
+
+  // Terminators and labels can't be scheduled around.
+  if (MI.getDesc().isTerminator() || MI.isPosition())
+    return true;
+
+  if (MI.isInlineAsm() && !ScheduleInlineAsm)
+    return true;
+
+  return false;
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// Hexagon counts the number of ##'s and adjust for that many
+/// constant exenders.
+unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
+      const MCAsmInfo &MAI) const {
+  StringRef AStr(Str);
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString().data(),
+                               MAI.getCommentString().size()) == 0)
+      atInsnStart = false;
+  }
+
+  // Add to size number of constant extenders seen * 4.
+  StringRef Occ("##");
+  Length += AStr.count(Occ)*4;
+  return Length;
+}
+
+
+ScheduleHazardRecognizer*
+HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
+      const InstrItineraryData *II, const ScheduleDAG *DAG) const {
+  if (UseDFAHazardRec) {
+    auto &HST = DAG->MF.getSubtarget<HexagonSubtarget>();
+    return new HexagonHazardRecognizer(II, this, HST);
+  }
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &Mask,
+                                      int &Value) const {
+  unsigned Opc = MI.getOpcode();
+
+  // Set mask and the first source register.
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpltei:
+      SrcReg = MI.getOperand(1).getReg();
+      Mask = ~0;
+      break;
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpbgtui:
+      SrcReg = MI.getOperand(1).getReg();
+      Mask = 0xFF;
+      break;
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      SrcReg = MI.getOperand(1).getReg();
+      Mask = 0xFFFF;
+      break;
+  }
+
+  // Set the value/second source register.
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+      SrcReg2 = MI.getOperand(2).getReg();
+      return true;
+
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      SrcReg2 = 0;
+      Value = MI.getOperand(2).getImm();
+      return true;
+  }
+
+  return false;
+}
+
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr &MI,
+                                           unsigned *PredCost) const {
+  return getInstrTimingClassLatency(ItinData, MI);
+}
+
+
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+    const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
+}
+
+
+// Inspired by this pair:
+//  %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+//  S2_storeri_io %R29, 132, %R1<kill>; flags:  mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
+  int OffsetA = 0, OffsetB = 0;
+  unsigned SizeA = 0, SizeB = 0;
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Instructions that are pure loads, not loads and stores like memops are not
+  // dependent.
+  if (MIa.mayLoad() && !isMemOp(MIa) && MIb.mayLoad() && !isMemOp(MIb))
+    return true;
+
+  // Get base, offset, and access size in MIa.
+  unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+  if (!BaseRegA || !SizeA)
+    return false;
+
+  // Get base, offset, and access size in MIb.
+  unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+  if (!BaseRegB || !SizeB)
+    return false;
+
+  if (BaseRegA != BaseRegB)
+    return false;
+
+  // This is a mem access with the same base register and known offsets from it.
+  // Reason about it.
+  if (OffsetA > OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetA - (int64_t)OffsetB);
+    return (SizeB <= offDiff);
+  } else if (OffsetA < OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetB - (int64_t)OffsetA);
+    return (SizeA <= offDiff);
+  }
+
+  return false;
+}
+
+
+/// If the instruction is an increment of a constant value, return the amount.
+bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
+      int &Value) const {
+  if (isPostIncrement(MI)) {
+    unsigned AccessSize;
+    return getBaseAndOffset(MI, Value, AccessSize);
+  }
+  if (MI.getOpcode() == Hexagon::A2_addi) {
+    Value = MI.getOperand(2).getImm();
+    return true;
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = MRI.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr &MI) const {
+  return (getAddrMode(MI) == HexagonII::AbsoluteSet);
+}
+
+
+bool HexagonInstrInfo::isAccumulator(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
+
+bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (!(isTC1(MI))
+      && !(QII->isTC2Early(MI))
+      && !(MI.getDesc().mayLoad())
+      && !(MI.getDesc().mayStore())
+      && (MI.getDesc().getOpcode() != Hexagon::S2_allocframe)
+      && (MI.getDesc().getOpcode() != Hexagon::L2_deallocframe)
+      && !(QII->isMemOp(MI))
+      && !(MI.isBranch())
+      && !(MI.isReturn())
+      && !MI.isCall())
+    return true;
+
+  return false;
+}
+
+
+// Return true if the instruction is a compund branch instruction.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
+  return (getType(MI) == HexagonII::TypeCOMPOUND && MI.isBranch());
+}
+
+
+bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
+  return (MI.isBranch() && isPredicated(MI)) ||
+         isConditionalTransfer(MI) ||
+         isConditionalALU32(MI)    ||
+         isConditionalLoad(MI)     ||
+         // Predicated stores which don't have a .new on any operands.
+         (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
+          !isPredicatedNew(MI));
+}
+
+
+bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_paddf:
+    case Hexagon::A2_paddfnew:
+    case Hexagon::A2_paddif:
+    case Hexagon::A2_paddifnew:
+    case Hexagon::A2_paddit:
+    case Hexagon::A2_padditnew:
+    case Hexagon::A2_paddt:
+    case Hexagon::A2_paddtnew:
+    case Hexagon::A2_pandf:
+    case Hexagon::A2_pandfnew:
+    case Hexagon::A2_pandt:
+    case Hexagon::A2_pandtnew:
+    case Hexagon::A2_porf:
+    case Hexagon::A2_porfnew:
+    case Hexagon::A2_port:
+    case Hexagon::A2_portnew:
+    case Hexagon::A2_psubf:
+    case Hexagon::A2_psubfnew:
+    case Hexagon::A2_psubt:
+    case Hexagon::A2_psubtnew:
+    case Hexagon::A2_pxorf:
+    case Hexagon::A2_pxorfnew:
+    case Hexagon::A2_pxort:
+    case Hexagon::A2_pxortnew:
+    case Hexagon::A4_paslhf:
+    case Hexagon::A4_paslhfnew:
+    case Hexagon::A4_paslht:
+    case Hexagon::A4_paslhtnew:
+    case Hexagon::A4_pasrhf:
+    case Hexagon::A4_pasrhfnew:
+    case Hexagon::A4_pasrht:
+    case Hexagon::A4_pasrhtnew:
+    case Hexagon::A4_psxtbf:
+    case Hexagon::A4_psxtbfnew:
+    case Hexagon::A4_psxtbt:
+    case Hexagon::A4_psxtbtnew:
+    case Hexagon::A4_psxthf:
+    case Hexagon::A4_psxthfnew:
+    case Hexagon::A4_psxtht:
+    case Hexagon::A4_psxthtnew:
+    case Hexagon::A4_pzxtbf:
+    case Hexagon::A4_pzxtbfnew:
+    case Hexagon::A4_pzxtbt:
+    case Hexagon::A4_pzxtbtnew:
+    case Hexagon::A4_pzxthf:
+    case Hexagon::A4_pzxthfnew:
+    case Hexagon::A4_pzxtht:
+    case Hexagon::A4_pzxthtnew:
+    case Hexagon::C2_ccombinewf:
+    case Hexagon::C2_ccombinewt:
+      return true;
+  }
+  return false;
+}
+
+
+// FIXME - Function name and it's functionality don't match.
+// It should be renamed to hasPredNewOpcode()
+bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
+  if (!MI.getDesc().mayLoad() || !isPredicated(MI))
+    return false;
+
+  int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
+  // Instruction with valid predicated-new opcode can be promoted to .new.
+  return PNewOpcode >= 0;
+}
+
+
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    default: return false;
+    case Hexagon::S4_storeirbt_io:
+    case Hexagon::S4_storeirbf_io:
+    case Hexagon::S4_pstorerbt_rr:
+    case Hexagon::S4_pstorerbf_rr:
+    case Hexagon::S2_pstorerbt_io:
+    case Hexagon::S2_pstorerbf_io:
+    case Hexagon::S2_pstorerbt_pi:
+    case Hexagon::S2_pstorerbf_pi:
+    case Hexagon::S2_pstorerdt_io:
+    case Hexagon::S2_pstorerdf_io:
+    case Hexagon::S4_pstorerdt_rr:
+    case Hexagon::S4_pstorerdf_rr:
+    case Hexagon::S2_pstorerdt_pi:
+    case Hexagon::S2_pstorerdf_pi:
+    case Hexagon::S2_pstorerht_io:
+    case Hexagon::S2_pstorerhf_io:
+    case Hexagon::S4_storeirht_io:
+    case Hexagon::S4_storeirhf_io:
+    case Hexagon::S4_pstorerht_rr:
+    case Hexagon::S4_pstorerhf_rr:
+    case Hexagon::S2_pstorerht_pi:
+    case Hexagon::S2_pstorerhf_pi:
+    case Hexagon::S2_pstorerit_io:
+    case Hexagon::S2_pstorerif_io:
+    case Hexagon::S4_storeirit_io:
+    case Hexagon::S4_storeirif_io:
+    case Hexagon::S4_pstorerit_rr:
+    case Hexagon::S4_pstorerif_rr:
+    case Hexagon::S2_pstorerit_pi:
+    case Hexagon::S2_pstorerif_pi:
+
+    // V4 global address store before promoting to dot new.
+    case Hexagon::S4_pstorerdt_abs:
+    case Hexagon::S4_pstorerdf_abs:
+    case Hexagon::S4_pstorerbt_abs:
+    case Hexagon::S4_pstorerbf_abs:
+    case Hexagon::S4_pstorerht_abs:
+    case Hexagon::S4_pstorerhf_abs:
+    case Hexagon::S4_pstorerit_abs:
+    case Hexagon::S4_pstorerif_abs:
+      return true;
+
+    // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+    // from the "Conditional Store" list. Because a predicated new value store
+    // would NOT be promoted to a double dot new store.
+    // This function returns yes for those stores that are predicated but not
+    // yet promoted to predicate dot new instructions.
+  }
+}
+
+
+bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    case Hexagon::A2_tfrt:
+    case Hexagon::A2_tfrf:
+    case Hexagon::C2_cmoveit:
+    case Hexagon::C2_cmoveif:
+    case Hexagon::A2_tfrtnew:
+    case Hexagon::A2_tfrfnew:
+    case Hexagon::C2_cmovenewit:
+    case Hexagon::C2_cmovenewif:
+    case Hexagon::A2_tfrpt:
+    case Hexagon::A2_tfrpf:
+      return true;
+
+    default:
+      return false;
+  }
+  return false;
+}
+
+
+// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
+// isFPImm and later getFPImm as well.
+bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable =
+    (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  if (MI.isCall())
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI.getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
+      MO.isJTI() || MO.isCPI() || MO.isFPImm())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+  case Hexagon::L4_return_fnew_pt :
+   return true;
+  }
+  return false;
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) const {
+  if (!ProdMI.getDesc().getNumDefs())
+    return false;
+
+  auto &HRI = getRegisterInfo();
+
+  SmallVector<unsigned, 4> DefsA;
+  SmallVector<unsigned, 4> DefsB;
+  SmallVector<unsigned, 8> UsesA;
+  SmallVector<unsigned, 8> UsesB;
+
+  parseOperands(ProdMI, DefsA, UsesA);
+  parseOperands(ConsMI, DefsB, UsesB);
+
+  for (auto &RegA : DefsA)
+    for (auto &RegB : UsesB) {
+      // True data dependency.
+      if (RegA == RegB)
+        return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegA))
+        for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegB == *SubRegs)
+            return true;
+
+      if (Hexagon::DoubleRegsRegClass.contains(RegB))
+        for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
+          if (RegA == *SubRegs)
+            return true;
+    }
+
+  return false;
+}
+
+
+// Returns true if the instruction is alread a .cur.
+bool HexagonInstrInfo::isDotCurInst(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::V6_vL32b_cur_pi:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_cur_pi_128B:
+  case Hexagon::V6_vL32b_cur_ai_128B:
+    return true;
+  }
+  return false;
+}
+
+
+// Returns true, if any one of the operands is a dot new
+// insn, whether it is predicated dot new or register dot new.
+bool HexagonInstrInfo::isDotNewInst(const MachineInstr &MI) const {
+  if (isNewValueInst(MI) || (isPredicated(MI) && isPredicatedNew(MI)))
+    return true;
+
+  return false;
+}
+
+
+/// Symmetrical. See if these two instructions are fit for duplex pair.
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr &MIa,
+      const MachineInstr &MIb) const {
+  HexagonII::SubInstructionGroup MIaG = getDuplexCandidateGroup(MIa);
+  HexagonII::SubInstructionGroup MIbG = getDuplexCandidateGroup(MIb);
+  return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
+}
+
+
+bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const {
+  if (MI.mayLoad() || MI.mayStore() || MI.isCompare())
+    return true;
+
+  // Multiply
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+  if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23)
+    return true;
+  return false;
+}
+
+
+bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
+  return (Opcode == Hexagon::ENDLOOP0 ||
+          Opcode == Hexagon::ENDLOOP1);
+}
+
+
+bool HexagonInstrInfo::isExpr(unsigned OpType) const {
+  switch(OpType) {
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr &MI) const {
+  const MCInstrDesc &MID = MI.getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch (MI.getOpcode()) {
+    // PS_fi and PS_fia remain special cases.
+    case Hexagon::PS_fi:
+    case Hexagon::PS_fia:
+      return true;
+    default:
+      return false;
+  }
+  return  false;
+}
+
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr &MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI.getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return  false;
+}
+
+
+bool HexagonInstrInfo::isFloat(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::FPPos) & HexagonII::FPMask;
+}
+
+
+// No V60 HVX VMEM with A_INDIRECT.
+bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr &I,
+      const MachineInstr &J) const {
+  if (!isV60VectorInstruction(I))
+    return false;
+  if (!I.mayLoad() && !I.mayStore())
+    return false;
+  return J.isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J);
+}
+
+
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_callr :
+  case Hexagon::J2_callrf :
+  case Hexagon::J2_callrt :
+  case Hexagon::PS_call_nr :
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_fnew_pt :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isJumpR(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_jumpr :
+  case Hexagon::J2_jumprt :
+  case Hexagon::J2_jumprf :
+  case Hexagon::J2_jumprtnewpt :
+  case Hexagon::J2_jumprfnewpt  :
+  case Hexagon::J2_jumprtnew :
+  case Hexagon::J2_jumprfnew :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true if a given MI can accommodate given offset.
+// Use abs estimate as oppose to the exact number.
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr &MI,
+      unsigned offset) const {
+  // This selection of jump instructions matches to that what
+  // AnalyzeBranch can parse, plus NVJ.
+  if (isNewValueJump(MI)) // r9:2
+    return isInt<11>(offset);
+
+  switch (MI.getOpcode()) {
+  // Still missing Jump to address condition on register value.
+  default:
+    return false;
+  case Hexagon::J2_jump: // bits<24> dst; // r22:2
+  case Hexagon::J2_call:
+  case Hexagon::PS_call_nr:
+    return isInt<24>(offset);
+  case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+  case Hexagon::J2_jumpf:
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumpfnewpt:
+  case Hexagon::J2_callt:
+  case Hexagon::J2_callf:
+    return isInt<17>(offset);
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop0iext:
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0rext:
+  case Hexagon::J2_loop1i:
+  case Hexagon::J2_loop1iext:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1rext:
+    return isInt<9>(offset);
+  // TODO: Add all the compound branches here. Can we do this in Relation model?
+  case Hexagon::J4_cmpeqi_tp0_jump_nt:
+  case Hexagon::J4_cmpeqi_tp1_jump_nt:
+    return isInt<11>(offset);
+  }
+}
+
+
+bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
+      const MachineInstr &ESMI) const {
+  bool isLate = isLateResultInstr(LRMI);
+  bool isEarly = isEarlySourceInstr(ESMI);
+
+  DEBUG(dbgs() << "V60" <<  (isLate ? "-LR  " : " --  "));
+  DEBUG(LRMI.dump());
+  DEBUG(dbgs() << "V60" <<  (isEarly ? "-ES  " : " --  "));
+  DEBUG(ESMI.dump());
+
+  if (isLate && isEarly) {
+    DEBUG(dbgs() << "++Is Late Result feeding Early Source\n");
+    return true;
+  }
+
+  return false;
+}
+
+
+bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+  case TargetOpcode::PHI:
+    return false;
+  default:
+    break;
+  }
+
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_1_SLOT23:
+  case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+  case Hexagon::Sched::S_2op_tc_1_SLOT23:
+  case Hexagon::Sched::S_3op_tc_1_SLOT23:
+  case Hexagon::Sched::V2LDST_tc_ld_SLOT01:
+  case Hexagon::Sched::V2LDST_tc_st_SLOT0:
+  case Hexagon::Sched::V2LDST_tc_st_SLOT01:
+  case Hexagon::Sched::V4LDST_tc_ld_SLOT01:
+  case Hexagon::Sched::V4LDST_tc_st_SLOT0:
+  case Hexagon::Sched::V4LDST_tc_st_SLOT01:
+    return false;
+  }
+  return true;
+}
+
+
+bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr &MI) const {
+  // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply
+  // resource, but all operands can be received late like an ALU instruction.
+  return MI.getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE;
+}
+
+
+bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  return Opcode == Hexagon::J2_loop0i    ||
+         Opcode == Hexagon::J2_loop0r    ||
+         Opcode == Hexagon::J2_loop0iext ||
+         Opcode == Hexagon::J2_loop0rext ||
+         Opcode == Hexagon::J2_loop1i    ||
+         Opcode == Hexagon::J2_loop1r    ||
+         Opcode == Hexagon::J2_loop1iext ||
+         Opcode == Hexagon::J2_loop1rext;
+}
+
+
+bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+    default: return false;
+    case Hexagon::L4_iadd_memopw_io :
+    case Hexagon::L4_isub_memopw_io :
+    case Hexagon::L4_add_memopw_io :
+    case Hexagon::L4_sub_memopw_io :
+    case Hexagon::L4_and_memopw_io :
+    case Hexagon::L4_or_memopw_io :
+    case Hexagon::L4_iadd_memoph_io :
+    case Hexagon::L4_isub_memoph_io :
+    case Hexagon::L4_add_memoph_io :
+    case Hexagon::L4_sub_memoph_io :
+    case Hexagon::L4_and_memoph_io :
+    case Hexagon::L4_or_memoph_io :
+    case Hexagon::L4_iadd_memopb_io :
+    case Hexagon::L4_isub_memopb_io :
+    case Hexagon::L4_add_memopb_io :
+    case Hexagon::L4_sub_memopb_io :
+    case Hexagon::L4_and_memopb_io :
+    case Hexagon::L4_or_memopb_io :
+    case Hexagon::L4_ior_memopb_io:
+    case Hexagon::L4_ior_memoph_io:
+    case Hexagon::L4_ior_memopw_io:
+    case Hexagon::L4_iand_memopb_io:
+    case Hexagon::L4_iand_memoph_io:
+    case Hexagon::L4_iand_memopw_io:
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::isNewValue(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
+}
+
+
+bool HexagonInstrInfo::isNewValue(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
+}
+
+
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr &MI) const {
+  return isNewValueJump(MI) || isNewValueStore(MI);
+}
+
+
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr &MI) const {
+  return isNewValue(MI) && MI.isBranch();
+}
+
+
+bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const {
+  return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode);
+}
+
+
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+}
+
+
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+}
+
+
+// Returns true if a particular operand is extendable for an instruction.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr &MI,
+    unsigned OperandNum) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+
+bool HexagonInstrInfo::isPredicatedNew(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  assert(isPredicated(MI));
+  return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
+}
+
+
+bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  assert(isPredicated(Opcode));
+  return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
+}
+
+
+bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return !((F >> HexagonII::PredicatedFalsePos) &
+           HexagonII::PredicatedFalseMask);
+}
+
+
+bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  // Make sure that the instruction is predicated.
+  assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+  return !((F >> HexagonII::PredicatedFalsePos) &
+           HexagonII::PredicatedFalseMask);
+}
+
+
+bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
+}
+
+
+bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask;
+}
+
+
+bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  assert(get(Opcode).isBranch() &&
+         (isPredicatedNew(Opcode) || isNewValue(Opcode)));
+  return (F >> HexagonII::TakenPos) & HexagonII::TakenMask;
+}
+
+
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr &MI) const {
+  return MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 ||
+         MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT ||
+         MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_PIC ||
+         MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT_PIC;
+}
+
+bool HexagonInstrInfo::isSignExtendingLoad(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  // Byte
+  case Hexagon::L2_loadrb_io:
+  case Hexagon::L4_loadrb_ur:
+  case Hexagon::L4_loadrb_ap:
+  case Hexagon::L2_loadrb_pr:
+  case Hexagon::L2_loadrb_pbr:
+  case Hexagon::L2_loadrb_pi:
+  case Hexagon::L2_loadrb_pci:
+  case Hexagon::L2_loadrb_pcr:
+  case Hexagon::L2_loadbsw2_io:
+  case Hexagon::L4_loadbsw2_ur:
+  case Hexagon::L4_loadbsw2_ap:
+  case Hexagon::L2_loadbsw2_pr:
+  case Hexagon::L2_loadbsw2_pbr:
+  case Hexagon::L2_loadbsw2_pi:
+  case Hexagon::L2_loadbsw2_pci:
+  case Hexagon::L2_loadbsw2_pcr:
+  case Hexagon::L2_loadbsw4_io:
+  case Hexagon::L4_loadbsw4_ur:
+  case Hexagon::L4_loadbsw4_ap:
+  case Hexagon::L2_loadbsw4_pr:
+  case Hexagon::L2_loadbsw4_pbr:
+  case Hexagon::L2_loadbsw4_pi:
+  case Hexagon::L2_loadbsw4_pci:
+  case Hexagon::L2_loadbsw4_pcr:
+  case Hexagon::L4_loadrb_rr:
+  case Hexagon::L2_ploadrbt_io:
+  case Hexagon::L2_ploadrbt_pi:
+  case Hexagon::L2_ploadrbf_io:
+  case Hexagon::L2_ploadrbf_pi:
+  case Hexagon::L2_ploadrbtnew_io:
+  case Hexagon::L2_ploadrbfnew_io:
+  case Hexagon::L4_ploadrbt_rr:
+  case Hexagon::L4_ploadrbf_rr:
+  case Hexagon::L4_ploadrbtnew_rr:
+  case Hexagon::L4_ploadrbfnew_rr:
+  case Hexagon::L2_ploadrbtnew_pi:
+  case Hexagon::L2_ploadrbfnew_pi:
+  case Hexagon::L4_ploadrbt_abs:
+  case Hexagon::L4_ploadrbf_abs:
+  case Hexagon::L4_ploadrbtnew_abs:
+  case Hexagon::L4_ploadrbfnew_abs:
+  case Hexagon::L2_loadrbgp:
+  // Half
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L4_loadrh_ur:
+  case Hexagon::L4_loadrh_ap:
+  case Hexagon::L2_loadrh_pr:
+  case Hexagon::L2_loadrh_pbr:
+  case Hexagon::L2_loadrh_pi:
+  case Hexagon::L2_loadrh_pci:
+  case Hexagon::L2_loadrh_pcr:
+  case Hexagon::L4_loadrh_rr:
+  case Hexagon::L2_ploadrht_io:
+  case Hexagon::L2_ploadrht_pi:
+  case Hexagon::L2_ploadrhf_io:
+  case Hexagon::L2_ploadrhf_pi:
+  case Hexagon::L2_ploadrhtnew_io:
+  case Hexagon::L2_ploadrhfnew_io:
+  case Hexagon::L4_ploadrht_rr:
+  case Hexagon::L4_ploadrhf_rr:
+  case Hexagon::L4_ploadrhtnew_rr:
+  case Hexagon::L4_ploadrhfnew_rr:
+  case Hexagon::L2_ploadrhtnew_pi:
+  case Hexagon::L2_ploadrhfnew_pi:
+  case Hexagon::L4_ploadrht_abs:
+  case Hexagon::L4_ploadrhf_abs:
+  case Hexagon::L4_ploadrhtnew_abs:
+  case Hexagon::L4_ploadrhfnew_abs:
+  case Hexagon::L2_loadrhgp:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isSolo(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::SoloPos) & HexagonII::SoloMask;
+}
+
+
+bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::STriw_pred :
+  case Hexagon::LDriw_pred :
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isTailCall(const MachineInstr &MI) const {
+  if (!MI.isBranch())
+    return false;
+
+  for (auto &Op : MI.operands())
+    if (Op.isGlobal() || Op.isSymbol())
+      return true;
+  return false;
+}
+
+
+// Returns true when SU has a timing class TC1.
+bool HexagonInstrInfo::isTC1(const MachineInstr &MI) const {
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_1_SLOT23:
+  case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
+  //case Hexagon::Sched::M_tc_1_SLOT23:
+  case Hexagon::Sched::S_2op_tc_1_SLOT23:
+  case Hexagon::Sched::S_3op_tc_1_SLOT23:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isTC2(const MachineInstr &MI) const {
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2_SLOT23:
+  case Hexagon::Sched::CR_tc_2_SLOT3:
+  case Hexagon::Sched::M_tc_2_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2_SLOT23:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isTC2Early(const MachineInstr &MI) const {
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123:
+  case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2early_SLOT23:
+  case Hexagon::Sched::CR_tc_2early_SLOT23:
+  case Hexagon::Sched::CR_tc_2early_SLOT3:
+  case Hexagon::Sched::J_tc_2early_SLOT0123:
+  case Hexagon::Sched::J_tc_2early_SLOT2:
+  case Hexagon::Sched::J_tc_2early_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2early_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2early_SLOT23:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonInstrInfo::isTC4x(const MachineInstr &MI) const {
+  unsigned SchedClass = MI.getDesc().getSchedClass();
+  return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23;
+}
+
+
+// Schedule this ASAP.
+bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
+      const MachineInstr &MI2) const {
+  if (mayBeCurLoad(MI1)) {
+    // if (result of SU is used in Next) return true;
+    unsigned DstReg = MI1.getOperand(0).getReg();
+    int N = MI2.getNumOperands();
+    for (int I = 0; I < N; I++)
+      if (MI2.getOperand(I).isReg() && DstReg == MI2.getOperand(I).getReg())
+        return true;
+  }
+  if (mayBeNewStore(MI2))
+    if (MI2.getOpcode() == Hexagon::V6_vS32b_pi)
+      if (MI1.getOperand(0).isReg() && MI2.getOperand(3).isReg() &&
+          MI1.getOperand(0).getReg() == MI2.getOperand(3).getReg())
+        return true;
+  return false;
+}
+
+
+bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr &MI) const {
+  const uint64_t V = getType(MI);
+  return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST;
+}
+
+
+// Check if the Offset is a valid auto-inc imm by Load/Store Type.
+//
+bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const {
+  if (VT == MVT::v16i32 || VT == MVT::v8i64 ||
+      VT == MVT::v32i16 || VT == MVT::v64i8) {
+      return (Offset >= Hexagon_MEMV_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMV_AUTOINC_MAX &&
+              (Offset & 0x3f) == 0);
+  }
+  // 128B
+  if (VT == MVT::v32i32 || VT == MVT::v16i64 ||
+      VT == MVT::v64i16 || VT == MVT::v128i8) {
+      return (Offset >= Hexagon_MEMV_AUTOINC_MIN_128B &&
+              Offset <= Hexagon_MEMV_AUTOINC_MAX_128B &&
+              (Offset & 0x7f) == 0);
+  }
+  if (VT == MVT::i64) {
+      return (Offset >= Hexagon_MEMD_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMD_AUTOINC_MAX &&
+              (Offset & 0x7) == 0);
+  }
+  if (VT == MVT::i32) {
+      return (Offset >= Hexagon_MEMW_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMW_AUTOINC_MAX &&
+              (Offset & 0x3) == 0);
+  }
+  if (VT == MVT::i16) {
+      return (Offset >= Hexagon_MEMH_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMH_AUTOINC_MAX &&
+              (Offset & 0x1) == 0);
+  }
+  if (VT == MVT::i8) {
+      return (Offset >= Hexagon_MEMB_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMB_AUTOINC_MAX);
+  }
+  llvm_unreachable("Not an auto-inc opc!");
+}
+
+
+bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
+      bool Extend) const {
+  // This function is to check whether the "Offset" is in the correct range of
+  // the given "Opcode". If "Offset" is not in the correct range, "A2_addi" is
+  // inserted to calculate the final address. Due to this reason, the function
+  // assumes that the "Offset" has correct alignment.
+  // We used to assert if the offset was not properly aligned, however,
+  // there are cases where a misaligned pointer recast can cause this
+  // problem, and we need to allow for it. The front end warns of such
+  // misaligns with respect to load size.
+
+  switch (Opcode) {
+  case Hexagon::PS_vstorerq_ai:
+  case Hexagon::PS_vstorerw_ai:
+  case Hexagon::PS_vloadrq_ai:
+  case Hexagon::PS_vloadrw_ai:
+  case Hexagon::V6_vL32b_ai:
+  case Hexagon::V6_vS32b_ai:
+  case Hexagon::V6_vL32Ub_ai:
+  case Hexagon::V6_vS32Ub_ai:
+    return (Offset >= Hexagon_MEMV_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMV_OFFSET_MAX);
+
+  case Hexagon::PS_vstorerq_ai_128B:
+  case Hexagon::PS_vstorerw_ai_128B:
+  case Hexagon::PS_vloadrq_ai_128B:
+  case Hexagon::PS_vloadrw_ai_128B:
+  case Hexagon::V6_vL32b_ai_128B:
+  case Hexagon::V6_vS32b_ai_128B:
+  case Hexagon::V6_vL32Ub_ai_128B:
+  case Hexagon::V6_vS32Ub_ai_128B:
+    return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) &&
+      (Offset <= Hexagon_MEMV_OFFSET_MAX_128B);
+
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop1i:
+    return isUInt<10>(Offset);
+
+  case Hexagon::S4_storeirb_io:
+  case Hexagon::S4_storeirbt_io:
+  case Hexagon::S4_storeirbf_io:
+    return isUInt<6>(Offset);
+
+  case Hexagon::S4_storeirh_io:
+  case Hexagon::S4_storeirht_io:
+  case Hexagon::S4_storeirhf_io:
+    return isShiftedUInt<6,1>(Offset);
+
+  case Hexagon::S4_storeiri_io:
+  case Hexagon::S4_storeirit_io:
+  case Hexagon::S4_storeirif_io:
+    return isShiftedUInt<6,2>(Offset);
+  }
+
+  if (Extend)
+    return true;
+
+  switch (Opcode) {
+  case Hexagon::L2_loadri_io:
+  case Hexagon::S2_storeri_io:
+    return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMW_OFFSET_MAX);
+
+  case Hexagon::L2_loadrd_io:
+  case Hexagon::S2_storerd_io:
+    return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMD_OFFSET_MAX);
+
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
+  case Hexagon::S2_storerh_io:
+    return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMH_OFFSET_MAX);
+
+  case Hexagon::L2_loadrb_io:
+  case Hexagon::L2_loadrub_io:
+  case Hexagon::S2_storerb_io:
+    return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMB_OFFSET_MAX);
+
+  case Hexagon::A2_addi:
+    return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
+      (Offset <= Hexagon_ADDI_OFFSET_MAX);
+
+  case Hexagon::L4_iadd_memopw_io :
+  case Hexagon::L4_isub_memopw_io :
+  case Hexagon::L4_add_memopw_io :
+  case Hexagon::L4_sub_memopw_io :
+  case Hexagon::L4_and_memopw_io :
+  case Hexagon::L4_or_memopw_io :
+    return (0 <= Offset && Offset <= 255);
+
+  case Hexagon::L4_iadd_memoph_io :
+  case Hexagon::L4_isub_memoph_io :
+  case Hexagon::L4_add_memoph_io :
+  case Hexagon::L4_sub_memoph_io :
+  case Hexagon::L4_and_memoph_io :
+  case Hexagon::L4_or_memoph_io :
+    return (0 <= Offset && Offset <= 127);
+
+  case Hexagon::L4_iadd_memopb_io :
+  case Hexagon::L4_isub_memopb_io :
+  case Hexagon::L4_add_memopb_io :
+  case Hexagon::L4_sub_memopb_io :
+  case Hexagon::L4_and_memopb_io :
+  case Hexagon::L4_or_memopb_io :
+    return (0 <= Offset && Offset <= 63);
+
+  // LDriw_xxx and STriw_xxx are pseudo operations, so it has to take offset of
+  // any size. Later pass knows how to handle it.
+  case Hexagon::STriw_pred:
+  case Hexagon::LDriw_pred:
+  case Hexagon::STriw_mod:
+  case Hexagon::LDriw_mod:
+    return true;
+
+  case Hexagon::PS_fi:
+  case Hexagon::PS_fia:
+  case Hexagon::INLINEASM:
+    return true;
+
+  case Hexagon::L2_ploadrbt_io:
+  case Hexagon::L2_ploadrbf_io:
+  case Hexagon::L2_ploadrubt_io:
+  case Hexagon::L2_ploadrubf_io:
+  case Hexagon::S2_pstorerbt_io:
+  case Hexagon::S2_pstorerbf_io:
+    return isUInt<6>(Offset);
+
+  case Hexagon::L2_ploadrht_io:
+  case Hexagon::L2_ploadrhf_io:
+  case Hexagon::L2_ploadruht_io:
+  case Hexagon::L2_ploadruhf_io:
+  case Hexagon::S2_pstorerht_io:
+  case Hexagon::S2_pstorerhf_io:
+    return isShiftedUInt<6,1>(Offset);
+
+  case Hexagon::L2_ploadrit_io:
+  case Hexagon::L2_ploadrif_io:
+  case Hexagon::S2_pstorerit_io:
+  case Hexagon::S2_pstorerif_io:
+    return isShiftedUInt<6,2>(Offset);
+
+  case Hexagon::L2_ploadrdt_io:
+  case Hexagon::L2_ploadrdf_io:
+  case Hexagon::S2_pstorerdt_io:
+  case Hexagon::S2_pstorerdf_io:
+    return isShiftedUInt<6,3>(Offset);
+  } // switch
+
+  llvm_unreachable("No offset range is defined for this opcode. "
+                   "Please define it in the above switch statement!");
+}
+
+
+bool HexagonInstrInfo::isVecAcc(const MachineInstr &MI) const {
+  return isV60VectorInstruction(MI) && isAccumulator(MI);
+}
+
+
+bool HexagonInstrInfo::isVecALU(const MachineInstr &MI) const {
+  const uint64_t F = get(MI.getOpcode()).TSFlags;
+  const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+  return
+    V == HexagonII::TypeCVI_VA         ||
+    V == HexagonII::TypeCVI_VA_DV;
+}
+
+
+bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) const {
+  if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI))
+    return true;
+
+  if (EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI)))
+    return true;
+
+  if (mayBeNewStore(ConsMI))
+    return true;
+
+  return false;
+}
+
+bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  // Byte
+  case Hexagon::L2_loadrub_io:
+  case Hexagon::L4_loadrub_ur:
+  case Hexagon::L4_loadrub_ap:
+  case Hexagon::L2_loadrub_pr:
+  case Hexagon::L2_loadrub_pbr:
+  case Hexagon::L2_loadrub_pi:
+  case Hexagon::L2_loadrub_pci:
+  case Hexagon::L2_loadrub_pcr:
+  case Hexagon::L2_loadbzw2_io:
+  case Hexagon::L4_loadbzw2_ur:
+  case Hexagon::L4_loadbzw2_ap:
+  case Hexagon::L2_loadbzw2_pr:
+  case Hexagon::L2_loadbzw2_pbr:
+  case Hexagon::L2_loadbzw2_pi:
+  case Hexagon::L2_loadbzw2_pci:
+  case Hexagon::L2_loadbzw2_pcr:
+  case Hexagon::L2_loadbzw4_io:
+  case Hexagon::L4_loadbzw4_ur:
+  case Hexagon::L4_loadbzw4_ap:
+  case Hexagon::L2_loadbzw4_pr:
+  case Hexagon::L2_loadbzw4_pbr:
+  case Hexagon::L2_loadbzw4_pi:
+  case Hexagon::L2_loadbzw4_pci:
+  case Hexagon::L2_loadbzw4_pcr:
+  case Hexagon::L4_loadrub_rr:
+  case Hexagon::L2_ploadrubt_io:
+  case Hexagon::L2_ploadrubt_pi:
+  case Hexagon::L2_ploadrubf_io:
+  case Hexagon::L2_ploadrubf_pi:
+  case Hexagon::L2_ploadrubtnew_io:
+  case Hexagon::L2_ploadrubfnew_io:
+  case Hexagon::L4_ploadrubt_rr:
+  case Hexagon::L4_ploadrubf_rr:
+  case Hexagon::L4_ploadrubtnew_rr:
+  case Hexagon::L4_ploadrubfnew_rr:
+  case Hexagon::L2_ploadrubtnew_pi:
+  case Hexagon::L2_ploadrubfnew_pi:
+  case Hexagon::L4_ploadrubt_abs:
+  case Hexagon::L4_ploadrubf_abs:
+  case Hexagon::L4_ploadrubtnew_abs:
+  case Hexagon::L4_ploadrubfnew_abs:
+  case Hexagon::L2_loadrubgp:
+  // Half
+  case Hexagon::L2_loadruh_io:
+  case Hexagon::L4_loadruh_ur:
+  case Hexagon::L4_loadruh_ap:
+  case Hexagon::L2_loadruh_pr:
+  case Hexagon::L2_loadruh_pbr:
+  case Hexagon::L2_loadruh_pi:
+  case Hexagon::L2_loadruh_pci:
+  case Hexagon::L2_loadruh_pcr:
+  case Hexagon::L4_loadruh_rr:
+  case Hexagon::L2_ploadruht_io:
+  case Hexagon::L2_ploadruht_pi:
+  case Hexagon::L2_ploadruhf_io:
+  case Hexagon::L2_ploadruhf_pi:
+  case Hexagon::L2_ploadruhtnew_io:
+  case Hexagon::L2_ploadruhfnew_io:
+  case Hexagon::L4_ploadruht_rr:
+  case Hexagon::L4_ploadruhf_rr:
+  case Hexagon::L4_ploadruhtnew_rr:
+  case Hexagon::L4_ploadruhfnew_rr:
+  case Hexagon::L2_ploadruhtnew_pi:
+  case Hexagon::L2_ploadruhfnew_pi:
+  case Hexagon::L4_ploadruht_abs:
+  case Hexagon::L4_ploadruhf_abs:
+  case Hexagon::L4_ploadruhtnew_abs:
+  case Hexagon::L4_ploadruhfnew_abs:
+  case Hexagon::L2_loadruhgp:
+    return true;
+  default:
+    return false;
+  }
+}
+
+
+// Add latency to instruction.
+bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
+      const MachineInstr &MI2) const {
+  if (isV60VectorInstruction(MI1) && isV60VectorInstruction(MI2))
+    if (!isVecUsableNextPacket(MI1, MI2))
+      return true;
+  return false;
+}
+
+
+/// \brief Get the base register and byte offset of a load/store instr.
+bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
+      unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
+      const {
+  unsigned AccessSize = 0;
+  int OffsetVal = 0;
+  BaseReg = getBaseAndOffset(LdSt, OffsetVal, AccessSize);
+  Offset = OffsetVal;
+  return BaseReg != 0;
+}
+
+
+/// \brief Can these instructions execute at the same time in a bundle.
+bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
+      const MachineInstr &Second) const {
+  if (DisableNVSchedule)
+    return false;
+  if (mayBeNewStore(Second)) {
+    // Make sure the definition of the first instruction is the value being
+    // stored.
+    const MachineOperand &Stored =
+      Second.getOperand(Second.getNumOperands() - 1);
+    if (!Stored.isReg())
+      return false;
+    for (unsigned i = 0, e = First.getNumOperands(); i < e; ++i) {
+      const MachineOperand &Op = First.getOperand(i);
+      if (Op.isReg() && Op.isDef() && Op.getReg() == Stored.getReg())
+        return true;
+    }
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::doesNotReturn(const MachineInstr &CallMI) const {
+  unsigned Opc = CallMI.getOpcode();
+  return Opc == Hexagon::PS_call_nr || Opc == Hexagon::PS_callr_nr;
+}
+
+
+bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const {
+  for (auto &I : *B)
+    if (I.isEHLabel())
+      return true;
+  return false;
+}
+
+
+// Returns true if an instruction can be converted into a non-extended
+// equivalent instruction.
+bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr &MI) const {
+  short NonExtOpcode;
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  if (Hexagon::getRegForm(MI.getOpcode()) >= 0)
+    return true;
+
+  if (MI.getDesc().mayLoad() || MI.getDesc().mayStore()) {
+    // Check addressing mode and retrieve non-ext equivalent instruction.
+
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      // Load/store with absolute addressing mode can be converted into
+      // base+offset mode.
+      NonExtOpcode = Hexagon::getBaseWithImmOffset(MI.getOpcode());
+      break;
+    case HexagonII::BaseImmOffset :
+      // Load/store with base+offset addressing mode can be converted into
+      // base+register offset addressing mode. However left shift operand should
+      // be set to 0.
+      NonExtOpcode = Hexagon::getBaseWithRegOffset(MI.getOpcode());
+      break;
+    case HexagonII::BaseLongOffset:
+      NonExtOpcode = Hexagon::getRegShlForm(MI.getOpcode());
+      break;
+    default:
+      return false;
+    }
+    if (NonExtOpcode < 0)
+      return false;
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr &MI) const {
+  return Hexagon::getRealHWInstr(MI.getOpcode(),
+                                 Hexagon::InstrType_Pseudo) >= 0;
+}
+
+
+bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
+      const {
+  MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end();
+  while (I != E) {
+    if (I->isBarrier())
+      return true;
+    ++I;
+  }
+  return false;
+}
+
+
+// Returns true, if a LD insn can be promoted to a cur load.
+bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
+  auto &HST = MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>();
+  const uint64_t F = MI.getDesc().TSFlags;
+  return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) &&
+         HST.hasV60TOps();
+}
+
+
+// Returns true, if a ST insn can be promoted to a new-value store.
+bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
+}
+
+
+bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) const {
+  // There is no stall when ProdMI is not a V60 vector.
+  if (!isV60VectorInstruction(ProdMI))
+    return false;
+
+  // There is no stall when ProdMI and ConsMI are not dependent.
+  if (!isDependent(ProdMI, ConsMI))
+    return false;
+
+  // When Forward Scheduling is enabled, there is no stall if ProdMI and ConsMI
+  // are scheduled in consecutive packets.
+  if (isVecUsableNextPacket(ProdMI, ConsMI))
+    return false;
+
+  return true;
+}
+
+
+bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
+      MachineBasicBlock::const_instr_iterator BII) const {
+  // There is no stall when I is not a V60 vector.
+  if (!isV60VectorInstruction(MI))
+    return false;
+
+  MachineBasicBlock::const_instr_iterator MII = BII;
+  MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end();
+
+  if (!MII->isBundle()) {
+    const MachineInstr &J = *MII;
+    if (!isV60VectorInstruction(J))
+      return false;
+    else if (isVecUsableNextPacket(J, MI))
+      return false;
+    return true;
+  }
+
+  for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) {
+    const MachineInstr &J = *MII;
+    if (producesStall(J, MI))
+      return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
+      unsigned PredReg) const {
+  for (unsigned opNum = 0; opNum < MI.getNumOperands(); opNum++) {
+    const MachineOperand &MO = MI.getOperand(opNum);
+    if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg))
+      return false; // Predicate register must be explicitly defined.
+  }
+
+  // Hexagon Programmer's Reference says that decbin, memw_locked, and
+  // memd_locked cannot be used as .new as well,
+  // but we don't seem to have these instructions defined.
+  return MI.getOpcode() != Hexagon::A4_tlbmatch;
+}
+
+
+bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
+  return (Opcode == Hexagon::J2_jumpt)      ||
+         (Opcode == Hexagon::J2_jumpf)      ||
+         (Opcode == Hexagon::J2_jumptnew)   ||
+         (Opcode == Hexagon::J2_jumpfnew)   ||
+         (Opcode == Hexagon::J2_jumptnewpt) ||
+         (Opcode == Hexagon::J2_jumpfnewpt);
+}
+
+
+bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
+  if (Cond.empty() || !isPredicated(Cond[0].getImm()))
+    return false;
+  return !isPredicatedTrue(Cond[0].getImm());
+}
+
+
+short HexagonInstrInfo::getAbsoluteForm(const MachineInstr &MI) const {
+  return Hexagon::getAbsoluteForm(MI.getOpcode());
+}
+
+
+unsigned HexagonInstrInfo::getAddrMode(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
+}
+
+
+// Returns the base register in a memory access (load/store). The offset is
+// returned in Offset and the access size is returned in AccessSize.
+unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
+      int &Offset, unsigned &AccessSize) const {
+  // Return if it is not a base+offset type instruction or a MemOp.
+  if (getAddrMode(MI) != HexagonII::BaseImmOffset &&
+      getAddrMode(MI) != HexagonII::BaseLongOffset &&
+      !isMemOp(MI) && !isPostIncrement(MI))
+    return 0;
+
+  // Since it is a memory access instruction, getMemAccessSize() should never
+  // return 0.
+  assert (getMemAccessSize(MI) &&
+          "BaseImmOffset or BaseLongOffset or MemOp without accessSize");
+
+  // Return Values of getMemAccessSize() are
+  // 0 - Checked in the assert above.
+  // 1, 2, 3, 4 & 7, 8 - The statement below is correct for all these.
+  // MemAccessSize is represented as 1+log2(N) where N is size in bits.
+  AccessSize = (1U << (getMemAccessSize(MI) - 1));
+
+  unsigned basePos = 0, offsetPos = 0;
+  if (!getBaseAndOffsetPosition(MI, basePos, offsetPos))
+    return 0;
+
+  // Post increment updates its EA after the mem access,
+  // so we need to treat its offset as zero.
+  if (isPostIncrement(MI))
+    Offset = 0;
+  else {
+    Offset = MI.getOperand(offsetPos).getImm();
+  }
+
+  return MI.getOperand(basePos).getReg();
+}
+
+
+/// Return the position of the base and offset operands for this instruction.
+bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
+      unsigned &BasePos, unsigned &OffsetPos) const {
+  // Deal with memops first.
+  if (isMemOp(MI)) {
+    BasePos = 0;
+    OffsetPos = 1;
+  } else if (MI.mayStore()) {
+    BasePos = 0;
+    OffsetPos = 1;
+  } else if (MI.mayLoad()) {
+    BasePos = 1;
+    OffsetPos = 2;
+  } else
+    return false;
+
+  if (isPredicated(MI)) {
+    BasePos++;
+    OffsetPos++;
+  }
+  if (isPostIncrement(MI)) {
+    BasePos++;
+    OffsetPos++;
+  }
+
+  if (!MI.getOperand(BasePos).isReg() || !MI.getOperand(OffsetPos).isImm())
+    return false;
+
+  return true;
+}
+
+
+// Inserts branching instructions in reverse order of their occurrence.
+// e.g. jump_t t1 (i1)
+// jump t2        (i2)
+// Jumpers = {i2, i1}
+SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
+      MachineBasicBlock& MBB) const {
+  SmallVector<MachineInstr*, 2> Jumpers;
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  if (I == MBB.instr_begin())
+    return Jumpers;
+
+  // A basic block may looks like this:
+  //
+  //  [   insn
+  //     EH_LABEL
+  //      insn
+  //      insn
+  //      insn
+  //     EH_LABEL
+  //      insn     ]
+  //
+  // It has two succs but does not have a terminator
+  // Don't know how to handle it.
+  do {
+    --I;
+    if (I->isEHLabel())
+      return Jumpers;
+  } while (I != MBB.instr_begin());
+
+  I = MBB.instr_end();
+  --I;
+
+  while (I->isDebugValue()) {
+    if (I == MBB.instr_begin())
+      return Jumpers;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(*I))
+    return Jumpers;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+  Jumpers.push_back(LastInst);
+  MachineInstr *SecondLastInst = nullptr;
+  // Find one more terminator if present.
+  do {
+    if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
+      if (!SecondLastInst) {
+        SecondLastInst = &*I;
+        Jumpers.push_back(SecondLastInst);
+      } else // This is a third branch.
+        return Jumpers;
+    }
+    if (I == MBB.instr_begin())
+      break;
+    --I;
+  } while (true);
+  return Jumpers;
+}
+
+
+short HexagonInstrInfo::getBaseWithLongOffset(short Opcode) const {
+  if (Opcode < 0)
+    return -1;
+  return Hexagon::getBaseWithLongOffset(Opcode);
+}
+
+
+short HexagonInstrInfo::getBaseWithLongOffset(const MachineInstr &MI) const {
+  return Hexagon::getBaseWithLongOffset(MI.getOpcode());
+}
+
+
+short HexagonInstrInfo::getBaseWithRegOffset(const MachineInstr &MI) const {
+  return Hexagon::getBaseWithRegOffset(MI.getOpcode());
+}
+
+
+// Returns Operand Index for the constant extended instruction.
+unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask;
+}
+
+// See if instruction could potentially be a duplex candidate.
+// If so, return its group. Zero otherwise.
+HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup(
+      const MachineInstr &MI) const {
+  unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
+  switch (MI.getOpcode()) {
+  default:
+    return HexagonII::HCG_None;
+  //
+  // Compound pairs.
+  // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+  // "Rd16=#U6 ; jump #r9:2"
+  // "Rd16=Rs16 ; jump #r9:2"
+  //
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtu:
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+        (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+        (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        isIntRegForSubInst(SrcReg) && MI.getOperand(2).isImm() &&
+        ((isUInt<5>(MI.getOperand(2).getImm())) ||
+         (MI.getOperand(2).getImm() == -1)))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfr:
+    // Rd = Rs
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfrsi:
+    // Rd = #u6
+    // Do not test for #u6 size since the const is getting extended
+    // regardless and compound could be formed.
+    DstReg = MI.getOperand(0).getReg();
+    if (isIntRegForSubInst(DstReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::S2_tstbit_i:
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+        (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        MI.getOperand(2).isImm() &&
+        isIntRegForSubInst(Src1Reg) && (MI.getOperand(2).getImm() == 0))
+      return HexagonII::HCG_A;
+    break;
+  // The fact that .new form is used pretty much guarantees
+  // that predicate register will match. Nevertheless,
+  // there could be some false positives without additional
+  // checking.
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnewpt:
+    Src1Reg = MI.getOperand(0).getReg();
+    if (Hexagon::PredRegsRegClass.contains(Src1Reg) &&
+        (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg))
+      return HexagonII::HCG_B;
+    break;
+  // Transfer and jump:
+  // Rd=#U6 ; jump #r9:2
+  // Rd=Rs ; jump #r9:2
+  // Do not test for jump range here.
+  case Hexagon::J2_jump:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+    return HexagonII::HCG_C;
+    break;
+  }
+
+  return HexagonII::HCG_None;
+}
+
+
+// Returns -1 when there is no opcode found.
+unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
+      const MachineInstr &GB) const {
+  assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A);
+  assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B);
+  if ((GA.getOpcode() != Hexagon::C2_cmpeqi) ||
+      (GB.getOpcode() != Hexagon::J2_jumptnew))
+    return -1;
+  unsigned DestReg = GA.getOperand(0).getReg();
+  if (!GB.readsRegister(DestReg))
+    return -1;
+  if (DestReg == Hexagon::P0)
+    return Hexagon::J4_cmpeqi_tp0_jump_nt;
+  if (DestReg == Hexagon::P1)
+    return Hexagon::J4_cmpeqi_tp1_jump_nt;
+  return -1;
+}
+
+
+int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
+  enum Hexagon::PredSense inPredSense;
+  inPredSense = invertPredicate ? Hexagon::PredSense_false :
+                                  Hexagon::PredSense_true;
+  int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
+  if (CondOpcode >= 0) // Valid Conditional opcode/instruction
+    return CondOpcode;
+
+  llvm_unreachable("Unexpected predicable instruction");
+}
+
+
+// Return the cur value instruction for a given store.
+int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unknown .cur type");
+  case Hexagon::V6_vL32b_pi:
+    return Hexagon::V6_vL32b_cur_pi;
+  case Hexagon::V6_vL32b_ai:
+    return Hexagon::V6_vL32b_cur_ai;
+  //128B
+  case Hexagon::V6_vL32b_pi_128B:
+    return Hexagon::V6_vL32b_cur_pi_128B;
+  case Hexagon::V6_vL32b_ai_128B:
+    return Hexagon::V6_vL32b_cur_ai_128B;
+  }
+  return 0;
+}
+
+
+
+// The diagram below shows the steps involved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
+//
+//               p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+//                ^           ^
+//               /             \ (not OK. it will cause new-value store to be
+//              /               X conditional on p0.new while R2 producer is
+//             /                 \ on p0)
+//            /                   \.
+//     p.new store                 p.old NV store
+// [if(p0.new)memw(R0+#0)=R2]    [if(p0)memw(R0+#0)=R2.new]
+//            ^                  ^
+//             \                /
+//              \              /
+//               \            /
+//                 p.old store
+//             [if (p0)memw(R0+#0)=R2]
+//
+//
+// The following set of instructions further explains the scenario where
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+//   2) p0 = cmp.eq(r3, #0) }
+//
+//   3) if (p0) memb(r1+#0) = r0  --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on old value
+// of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which
+// is not valid for new-value stores.
+// Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+// from the "Conditional Store" list. Because a predicated new value store
+// would NOT be promoted to a double dot new store. See diagram below:
+// This function returns yes for those stores that are predicated but not
+// yet promoted to predicate dot new instructions.
+//
+//                          +---------------------+
+//                    /-----| if (p0) memw(..)=r0 |---------\~
+//                   ||     +---------------------+         ||
+//          promote  ||       /\       /\                   ||  promote
+//                   ||      /||\     /||\                  ||
+//                  \||/    demote     ||                  \||/
+//                   \/       ||       ||                   \/
+//       +-------------------------+   ||   +-------------------------+
+//       | if (p0.new) memw(..)=r0 |   ||   | if (p0) memw(..)=r0.new |
+//       +-------------------------+   ||   +-------------------------+
+//                        ||           ||         ||
+//                        ||         demote      \||/
+//                      promote        ||         \/ NOT possible
+//                        ||           ||         /\~
+//                       \||/          ||        /||\~
+//                        \/           ||         ||
+//                      +-----------------------------+
+//                      | if (p0.new) memw(..)=r0.new |
+//                      +-----------------------------+
+//                           Double Dot New Store
+//
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new  --->
+// 2) if (p0) memw(R0+#0)= R1.new      -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1      --->
+//
+// To understand the translation of instruction 1 to its original form, consider
+// a packet with 3 instructions.
+// { p0 = cmp.eq(R0,R1)
+//   if (p0.new) R2 = add(R3, R4)
+//   R5 = add (R3, R1)
+// }
+// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
+//
+// This instruction can be part of the previous packet only if both p0 and R2
+// are promoted to .new values. This promotion happens in steps, first
+// predicate register is promoted to .new and in the next iteration R2 is
+// promoted. Therefore, in case of dependence check failure (due to R5) during
+// next iteration, it should be converted back to its most basic form.
+
+
+// Return the new value instruction for a given store.
+int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
+  int NVOpcode = Hexagon::getNewValueOpcode(MI.getOpcode());
+  if (NVOpcode >= 0) // Valid new-value store instruction.
+    return NVOpcode;
+
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unknown .new type");
+  case Hexagon::S4_storerb_ur:
+    return Hexagon::S4_storerbnew_ur;
+
+  case Hexagon::S2_storerb_pci:
+    return Hexagon::S2_storerb_pci;
+
+  case Hexagon::S2_storeri_pci:
+    return Hexagon::S2_storeri_pci;
+
+  case Hexagon::S2_storerh_pci:
+    return Hexagon::S2_storerh_pci;
+
+  case Hexagon::S2_storerd_pci:
+    return Hexagon::S2_storerd_pci;
+
+  case Hexagon::S2_storerf_pci:
+    return Hexagon::S2_storerf_pci;
+
+  case Hexagon::V6_vS32b_ai:
+    return Hexagon::V6_vS32b_new_ai;
+
+  case Hexagon::V6_vS32b_pi:
+    return Hexagon::V6_vS32b_new_pi;
+
+  // 128B
+  case Hexagon::V6_vS32b_ai_128B:
+    return Hexagon::V6_vS32b_new_ai_128B;
+
+  case Hexagon::V6_vS32b_pi_128B:
+    return Hexagon::V6_vS32b_new_pi_128B;
+  }
+  return 0;
+}
+
+
+// Returns the opcode to use when converting MI, which is a conditional jump,
+// into a conditional instruction which uses the .new value of the predicate.
+// We also use branch probabilities to add a hint to the jump.
+int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
+      const MachineBranchProbabilityInfo *MBPI) const {
+  // We assume that block can have at most two successors.
+  bool taken = false;
+  const MachineBasicBlock *Src = MI.getParent();
+  const MachineOperand &BrTarget = MI.getOperand(1);
+  const MachineBasicBlock *Dst = BrTarget.getMBB();
+
+  const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
+  if (Prediction >= BranchProbability(1,2))
+    taken = true;
+
+  switch (MI.getOpcode()) {
+  case Hexagon::J2_jumpt:
+    return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+  case Hexagon::J2_jumpf:
+    return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
+
+  default:
+    llvm_unreachable("Unexpected jump instruction.");
+  }
+}
+
+
+// Return .new predicate version for an instruction.
+int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
+      const MachineBranchProbabilityInfo *MBPI) const {
+  int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
+  if (NewOpcode >= 0) // Valid predicate new instruction
+    return NewOpcode;
+
+  switch (MI.getOpcode()) {
+  // Condtional Jumps
+  case Hexagon::J2_jumpt:
+  case Hexagon::J2_jumpf:
+    return getDotNewPredJumpOp(MI, MBPI);
+
+  default:
+    assert(0 && "Unknown .new type");
+  }
+  return 0;
+}
+
+
+int HexagonInstrInfo::getDotOldOp(const int opc) const {
+  int NewOp = opc;
+  if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
+    NewOp = Hexagon::getPredOldOpcode(NewOp);
+    assert(NewOp >= 0 &&
+           "Couldn't change predicate new instruction to its old form.");
+  }
+
+  if (isNewValueStore(NewOp)) { // Convert into non-new-value format
+    NewOp = Hexagon::getNonNVStore(NewOp);
+    assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
+  }
+  return NewOp;
+}
+
+
+// See if instruction could potentially be a duplex candidate.
+// If so, return its group. Zero otherwise.
+HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
+      const MachineInstr &MI) const {
+  unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+  auto &HRI = getRegisterInfo();
+
+  switch (MI.getOpcode()) {
+  default:
+    return HexagonII::HSIG_None;
+  //
+  // Group L1:
+  //
+  // Rd = memw(Rs+#u4:2)
+  // Rd = memub(Rs+#u4:0)
+  case Hexagon::L2_loadri_io:
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    // Special case this one from Group L2.
+    // Rd = memw(r29+#u5:2)
+    if (isIntRegForSubInst(DstReg)) {
+      if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+          HRI.getStackRegister() == SrcReg &&
+          MI.getOperand(2).isImm() &&
+          isShiftedUInt<5,2>(MI.getOperand(2).getImm()))
+        return HexagonII::HSIG_L2;
+      // Rd = memw(Rs+#u4:2)
+      if (isIntRegForSubInst(SrcReg) &&
+          (MI.getOperand(2).isImm() &&
+          isShiftedUInt<4,2>(MI.getOperand(2).getImm())))
+        return HexagonII::HSIG_L1;
+    }
+    break;
+  case Hexagon::L2_loadrub_io:
+    // Rd = memub(Rs+#u4:0)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() && isUInt<4>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_L1;
+    break;
+  //
+  // Group L2:
+  //
+  // Rd = memh/memuh(Rs+#u3:1)
+  // Rd = memb(Rs+#u3:0)
+  // Rd = memw(r29+#u5:2) - Handled above.
+  // Rdd = memd(r29+#u5:3)
+  // deallocframe
+  // [if ([!]p0[.new])] dealloc_return
+  // [if ([!]p0[.new])] jumpr r31
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
+    // Rd = memh/memuh(Rs+#u3:1)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() &&
+        isShiftedUInt<3,1>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_L2;
+    break;
+  case Hexagon::L2_loadrb_io:
+    // Rd = memb(Rs+#u3:0)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() &&
+        isUInt<3>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_L2;
+    break;
+  case Hexagon::L2_loadrd_io:
+    // Rdd = memd(r29+#u5:3)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isDblRegForSubInst(DstReg, HRI) &&
+        Hexagon::IntRegsRegClass.contains(SrcReg) &&
+        HRI.getStackRegister() == SrcReg &&
+        MI.getOperand(2).isImm() &&
+        isShiftedUInt<5,3>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_L2;
+    break;
+  // dealloc_return is not documented in Hexagon Manual, but marked
+  // with A_SUBINSN attribute in iset_v4classic.py.
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
+  case Hexagon::L4_return:
+  case Hexagon::L2_deallocframe:
+    return HexagonII::HSIG_L2;
+  case Hexagon::EH_RETURN_JMPR:
+  case Hexagon::PS_jmpret:
+    // jumpr r31
+    // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+    DstReg = MI.getOperand(0).getReg();
+    if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))
+      return HexagonII::HSIG_L2;
+    break;
+  case Hexagon::PS_jmprett:
+  case Hexagon::PS_jmpretf:
+  case Hexagon::PS_jmprettnewpt:
+  case Hexagon::PS_jmpretfnewpt:
+  case Hexagon::PS_jmprettnew:
+  case Hexagon::PS_jmpretfnew:
+    DstReg = MI.getOperand(1).getReg();
+    SrcReg = MI.getOperand(0).getReg();
+    // [if ([!]p0[.new])] jumpr r31
+    if ((Hexagon::PredRegsRegClass.contains(SrcReg) &&
+        (Hexagon::P0 == SrcReg)) &&
+        (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)))
+      return HexagonII::HSIG_L2;
+    break;
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+  case Hexagon::L4_return_fnew_pt :
+    // [if ([!]p0[.new])] dealloc_return
+    SrcReg = MI.getOperand(0).getReg();
+    if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg))
+      return HexagonII::HSIG_L2;
+    break;
+  //
+  // Group S1:
+  //
+  // memw(Rs+#u4:2) = Rt
+  // memb(Rs+#u4:0) = Rt
+  case Hexagon::S2_storeri_io:
+    // Special case this one from Group S2.
+    // memw(r29+#u5:2) = Rt
+    Src1Reg = MI.getOperand(0).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+        isIntRegForSubInst(Src2Reg) &&
+        HRI.getStackRegister() == Src1Reg && MI.getOperand(1).isImm() &&
+        isShiftedUInt<5,2>(MI.getOperand(1).getImm()))
+      return HexagonII::HSIG_S2;
+    // memw(Rs+#u4:2) = Rt
+    if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+        MI.getOperand(1).isImm() &&
+        isShiftedUInt<4,2>(MI.getOperand(1).getImm()))
+      return HexagonII::HSIG_S1;
+    break;
+  case Hexagon::S2_storerb_io:
+    // memb(Rs+#u4:0) = Rt
+    Src1Reg = MI.getOperand(0).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+        MI.getOperand(1).isImm() && isUInt<4>(MI.getOperand(1).getImm()))
+      return HexagonII::HSIG_S1;
+    break;
+  //
+  // Group S2:
+  //
+  // memh(Rs+#u3:1) = Rt
+  // memw(r29+#u5:2) = Rt
+  // memd(r29+#s6:3) = Rtt
+  // memw(Rs+#u4:2) = #U1
+  // memb(Rs+#u4) = #U1
+  // allocframe(#u5:3)
+  case Hexagon::S2_storerh_io:
+    // memh(Rs+#u3:1) = Rt
+    Src1Reg = MI.getOperand(0).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) &&
+        MI.getOperand(1).isImm() &&
+        isShiftedUInt<3,1>(MI.getOperand(1).getImm()))
+      return HexagonII::HSIG_S1;
+    break;
+  case Hexagon::S2_storerd_io:
+    // memd(r29+#s6:3) = Rtt
+    Src1Reg = MI.getOperand(0).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (isDblRegForSubInst(Src2Reg, HRI) &&
+        Hexagon::IntRegsRegClass.contains(Src1Reg) &&
+        HRI.getStackRegister() == Src1Reg && MI.getOperand(1).isImm() &&
+        isShiftedInt<6,3>(MI.getOperand(1).getImm()))
+      return HexagonII::HSIG_S2;
+    break;
+  case Hexagon::S4_storeiri_io:
+    // memw(Rs+#u4:2) = #U1
+    Src1Reg = MI.getOperand(0).getReg();
+    if (isIntRegForSubInst(Src1Reg) && MI.getOperand(1).isImm() &&
+        isShiftedUInt<4,2>(MI.getOperand(1).getImm()) &&
+        MI.getOperand(2).isImm() && isUInt<1>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_S2;
+    break;
+  case Hexagon::S4_storeirb_io:
+    // memb(Rs+#u4) = #U1
+    Src1Reg = MI.getOperand(0).getReg();
+    if (isIntRegForSubInst(Src1Reg) &&
+        MI.getOperand(1).isImm() && isUInt<4>(MI.getOperand(1).getImm()) &&
+        MI.getOperand(2).isImm() && isUInt<1>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_S2;
+    break;
+  case Hexagon::S2_allocframe:
+    if (MI.getOperand(0).isImm() &&
+        isShiftedUInt<5,3>(MI.getOperand(0).getImm()))
+      return HexagonII::HSIG_S1;
+    break;
+  //
+  // Group A:
+  //
+  // Rx = add(Rx,#s7)
+  // Rd = Rs
+  // Rd = #u6
+  // Rd = #-1
+  // if ([!]P0[.new]) Rd = #0
+  // Rd = add(r29,#u6:2)
+  // Rx = add(Rx,Rs)
+  // P0 = cmp.eq(Rs,#u2)
+  // Rdd = combine(#0,Rs)
+  // Rdd = combine(Rs,#0)
+  // Rdd = combine(#u2,#U2)
+  // Rd = add(Rs,#1)
+  // Rd = add(Rs,#-1)
+  // Rd = sxth/sxtb/zxtb/zxth(Rs)
+  // Rd = and(Rs,#1)
+  case Hexagon::A2_addi:
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg)) {
+      // Rd = add(r29,#u6:2)
+      if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+        HRI.getStackRegister() == SrcReg && MI.getOperand(2).isImm() &&
+        isShiftedUInt<6,2>(MI.getOperand(2).getImm()))
+        return HexagonII::HSIG_A;
+      // Rx = add(Rx,#s7)
+      if ((DstReg == SrcReg) && MI.getOperand(2).isImm() &&
+          isInt<7>(MI.getOperand(2).getImm()))
+        return HexagonII::HSIG_A;
+      // Rd = add(Rs,#1)
+      // Rd = add(Rs,#-1)
+      if (isIntRegForSubInst(SrcReg) && MI.getOperand(2).isImm() &&
+          ((MI.getOperand(2).getImm() == 1) ||
+          (MI.getOperand(2).getImm() == -1)))
+        return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_add:
+    // Rx = add(Rx,Rs)
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+        isIntRegForSubInst(Src2Reg))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A2_andir:
+    // Same as zxtb.
+    // Rd16=and(Rs16,#255)
+    // Rd16=and(Rs16,#1)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() &&
+        ((MI.getOperand(2).getImm() == 1) ||
+        (MI.getOperand(2).getImm() == 255)))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A2_tfr:
+    // Rd = Rs
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A2_tfrsi:
+    // Rd = #u6
+    // Do not test for #u6 size since the const is getting extended
+    // regardless and compound could be formed.
+    // Rd = #-1
+    DstReg = MI.getOperand(0).getReg();
+    if (isIntRegForSubInst(DstReg))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::C2_cmoveit:
+  case Hexagon::C2_cmovenewit:
+  case Hexagon::C2_cmoveif:
+  case Hexagon::C2_cmovenewif:
+    // if ([!]P0[.new]) Rd = #0
+    // Actual form:
+    // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) &&
+        Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg &&
+        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0)
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::C2_cmpeqi:
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (Hexagon::PredRegsRegClass.contains(DstReg) &&
+        Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() && isUInt<2>(MI.getOperand(2).getImm()))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A2_combineii:
+  case Hexagon::A4_combineii:
+    // Rdd = combine(#u2,#U2)
+    DstReg = MI.getOperand(0).getReg();
+    if (isDblRegForSubInst(DstReg, HRI) &&
+        ((MI.getOperand(1).isImm() && isUInt<2>(MI.getOperand(1).getImm())) ||
+        (MI.getOperand(1).isGlobal() &&
+        isUInt<2>(MI.getOperand(1).getOffset()))) &&
+        ((MI.getOperand(2).isImm() && isUInt<2>(MI.getOperand(2).getImm())) ||
+        (MI.getOperand(2).isGlobal() &&
+        isUInt<2>(MI.getOperand(2).getOffset()))))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A4_combineri:
+    // Rdd = combine(Rs,#0)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+        ((MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) ||
+        (MI.getOperand(2).isGlobal() && MI.getOperand(2).getOffset() == 0)))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A4_combineir:
+    // Rdd = combine(#0,Rs)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(2).getReg();
+    if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) &&
+        ((MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) ||
+        (MI.getOperand(1).isGlobal() && MI.getOperand(1).getOffset() == 0)))
+      return HexagonII::HSIG_A;
+    break;
+  case Hexagon::A2_sxtb:
+  case Hexagon::A2_sxth:
+  case Hexagon::A2_zxtb:
+  case Hexagon::A2_zxth:
+    // Rd = sxth/sxtb/zxtb/zxth(Rs)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg))
+      return HexagonII::HSIG_A;
+    break;
+  }
+
+  return HexagonII::HSIG_None;
+}
+
+
+short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr &MI) const {
+  return Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Real);
+}
+
+
+// Return first non-debug instruction in the basic block.
+MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
+      const {
+  for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) {
+    MachineInstr &MI = *MII;
+    if (MI.isDebugValue())
+      continue;
+    return &MI;
+  }
+  return nullptr;
+}
+
+
+unsigned HexagonInstrInfo::getInstrTimingClassLatency(
+      const InstrItineraryData *ItinData, const MachineInstr &MI) const {
+  // Default to one cycle for no itinerary. However, an "empty" itinerary may
+  // still have a MinLatency property, which getStageLatency checks.
+  if (!ItinData)
+    return getInstrLatency(ItinData, MI);
+
+  // Get the latency embedded in the itinerary. If we're not using timing class
+  // latencies or if we using BSB scheduling, then restrict the maximum latency
+  // to 1 (that is, either 0 or 1).
+  if (MI.isTransient())
+    return 0;
+  unsigned Latency = ItinData->getStageLatency(MI.getDesc().getSchedClass());
+  if (!EnableTimingClassLatency ||
+      MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>().
+      useBSBScheduling())
+    if (Latency > 1)
+      Latency = 1;
+  return Latency;
+}
+
+
+// inverts the predication logic.
+// p -> NotP
+// NotP -> P
+bool HexagonInstrInfo::getInvertedPredSense(
+      SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond.empty())
+    return false;
+  unsigned Opc = getInvertedPredicatedOpcode(Cond[0].getImm());
+  Cond[0].setImm(Opc);
+  return true;
+}
+
+
+unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+  int InvPredOpcode;
+  InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
+                                        : Hexagon::getTruePredOpcode(Opc);
+  if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate.
+    return InvPredOpcode;
+
+  llvm_unreachable("Unexpected predicated instruction");
+}
+
+
+// Returns the max value that doesn't need to be extended.
+int HexagonInstrInfo::getMaxValue(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1U << (bits - 1));
+  else
+    return ~(-1U << bits);
+}
+
+
+unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask;
+}
+
+
+// Returns the min value that doesn't need to be extended.
+int HexagonInstrInfo::getMinValue(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1U << (bits - 1);
+  else
+    return 0;
+}
+
+
+// Returns opcode of the non-extended equivalent instruction.
+short HexagonInstrInfo::getNonExtOpcode(const MachineInstr &MI) const {
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  short NonExtOpcode = Hexagon::getRegForm(MI.getOpcode());
+    if (NonExtOpcode >= 0)
+      return NonExtOpcode;
+
+  if (MI.getDesc().mayLoad() || MI.getDesc().mayStore()) {
+    // Check addressing mode and retrieve non-ext equivalent instruction.
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      return Hexagon::getBaseWithImmOffset(MI.getOpcode());
+    case HexagonII::BaseImmOffset :
+      return Hexagon::getBaseWithRegOffset(MI.getOpcode());
+    case HexagonII::BaseLongOffset:
+      return Hexagon::getRegShlForm(MI.getOpcode());
+
+    default:
+      return -1;
+    }
+  }
+  return -1;
+}
+
+
+bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
+      unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const {
+  if (Cond.empty())
+    return false;
+  assert(Cond.size() == 2);
+  if (isNewValueJump(Cond[0].getImm()) || Cond[1].isMBB()) {
+    DEBUG(dbgs() << "No predregs for new-value jumps/endloop");
+    return false;
+  }
+  PredReg = Cond[1].getReg();
+  PredRegPos = 1;
+  // See IfConversion.cpp why we add RegState::Implicit | RegState::Undef
+  PredRegFlags = 0;
+  if (Cond[1].isImplicit())
+    PredRegFlags = RegState::Implicit;
+  if (Cond[1].isUndef())
+    PredRegFlags |= RegState::Undef;
+  return true;
+}
+
+
+short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr &MI) const {
+  return Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Pseudo);
+}
+
+
+short HexagonInstrInfo::getRegForm(const MachineInstr &MI) const {
+  return Hexagon::getRegForm(MI.getOpcode());
+}
+
+
+// Return the number of bytes required to encode the instruction.
+// Hexagon instructions are fixed length, 4 bytes, unless they
+// use a constant extender, which requires another 4 bytes.
+// For debug instructions and prolog labels, return 0.
+unsigned HexagonInstrInfo::getSize(const MachineInstr &MI) const {
+  if (MI.isDebugValue() || MI.isPosition())
+    return 0;
+
+  unsigned Size = MI.getDesc().getSize();
+  if (!Size)
+    // Assume the default insn size in case it cannot be determined
+    // for whatever reason.
+    Size = HEXAGON_INSTR_SIZE;
+
+  if (isConstExtended(MI) || isExtended(MI))
+    Size += HEXAGON_INSTR_SIZE;
+
+  // Try and compute number of instructions in asm.
+  if (BranchRelaxAsmLarge && MI.getOpcode() == Hexagon::INLINEASM) {
+    const MachineBasicBlock &MBB = *MI.getParent();
+    const MachineFunction *MF = MBB.getParent();
+    const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+    // Count the number of register definitions to find the asm string.
+    unsigned NumDefs = 0;
+    for (; MI.getOperand(NumDefs).isReg() && MI.getOperand(NumDefs).isDef();
+         ++NumDefs)
+      assert(NumDefs != MI.getNumOperands()-2 && "No asm string?");
+
+    assert(MI.getOperand(NumDefs).isSymbol() && "No asm string?");
+    // Disassemble the AsmStr and approximate number of instructions.
+    const char *AsmStr = MI.getOperand(NumDefs).getSymbolName();
+    Size = getInlineAsmLength(AsmStr, *MAI);
+  }
+
+  return Size;
+}
+
+
+uint64_t HexagonInstrInfo::getType(const MachineInstr &MI) const {
+  const uint64_t F = MI.getDesc().TSFlags;
+  return (F >> HexagonII::TypePos) & HexagonII::TypeMask;
+}
+
+
+unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
+  const TargetSubtargetInfo &ST = MI.getParent()->getParent()->getSubtarget();
+  const InstrItineraryData &II = *ST.getInstrItineraryData();
+  const InstrStage &IS = *II.beginStage(MI.getDesc().getSchedClass());
+
+  return IS.getUnits();
+}
+
+
+unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+  return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask;
+}
+
+
+// Calculate size of the basic block without debug instructions.
+unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const {
+  return nonDbgMICount(BB->instr_begin(), BB->instr_end());
+}
+
+
+unsigned HexagonInstrInfo::nonDbgBundleSize(
+      MachineBasicBlock::const_iterator BundleHead) const {
+  assert(BundleHead->isBundle() && "Not a bundle header");
+  auto MII = BundleHead.getInstrIterator();
+  // Skip the bundle header.
+  return nonDbgMICount(++MII, getBundleEnd(BundleHead.getInstrIterator()));
+}
+
+
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender.
+void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
+  assert((isExtendable(MI)||isConstExtended(MI)) &&
+                               "Instruction must be extendable");
+  // Find which operand is extendable.
+  short ExtOpNum = getCExtOpNum(MI);
+  MachineOperand &MO = MI.getOperand(ExtOpNum);
+  // This needs to be something we understand.
+  assert((MO.isMBB() || MO.isImm()) &&
+         "Branch with unknown extendable field type");
+  // Mark given operand as extended.
+  MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
+
+
+bool HexagonInstrInfo::invertAndChangeJumpTarget(
+      MachineInstr &MI, MachineBasicBlock *NewTarget) const {
+  DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#"
+               << NewTarget->getNumber(); MI.dump(););
+  assert(MI.isBranch());
+  unsigned NewOpcode = getInvertedPredicatedOpcode(MI.getOpcode());
+  int TargetPos = MI.getNumOperands() - 1;
+  // In general branch target is the last operand,
+  // but some implicit defs added at the end might change it.
+  while ((TargetPos > -1) && !MI.getOperand(TargetPos).isMBB())
+    --TargetPos;
+  assert((TargetPos >= 0) && MI.getOperand(TargetPos).isMBB());
+  MI.getOperand(TargetPos).setMBB(NewTarget);
+  if (EnableBranchPrediction && isPredicatedNew(MI)) {
+    NewOpcode = reversePrediction(NewOpcode);
+  }
+  MI.setDesc(get(NewOpcode));
+  return true;
+}
+
+
+void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
+  /* +++ The code below is used to generate complete set of Hexagon Insn +++ */
+  MachineFunction::iterator A = MF.begin();
+  MachineBasicBlock &B = *A;
+  MachineBasicBlock::iterator I = B.begin();
+  DebugLoc DL = I->getDebugLoc();
+  MachineInstr *NewMI;
+
+  for (unsigned insn = TargetOpcode::GENERIC_OP_END+1;
+       insn < Hexagon::INSTRUCTION_LIST_END; ++insn) {
+    NewMI = BuildMI(B, I, DL, get(insn));
+    DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) <<
+          "  Class: " << NewMI->getDesc().getSchedClass());
+    NewMI->eraseFromParent();
+  }
+  /* --- The code above is used to generate complete set of Hexagon Insn --- */
+}
+
+
+// inverts the predication logic.
+// p -> NotP
+// NotP -> P
+bool HexagonInstrInfo::reversePredSense(MachineInstr &MI) const {
+  DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI.dump());
+  MI.setDesc(get(getInvertedPredicatedOpcode(MI.getOpcode())));
+  return true;
+}
+
+
+// Reverse the branch prediction.
+unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const {
+  int PredRevOpcode = -1;
+  if (isPredictedTaken(Opcode))
+    PredRevOpcode = Hexagon::notTakenBranchPrediction(Opcode);
+  else
+    PredRevOpcode = Hexagon::takenBranchPrediction(Opcode);
+  assert(PredRevOpcode > 0);
+  return PredRevOpcode;
+}
+
+
+// TODO: Add more rigorous validation.
+bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond)
+      const {
+  return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1));
+}
+
+
+short HexagonInstrInfo::xformRegToImmOffset(const MachineInstr &MI) const {
+  return Hexagon::xformRegToImmOffset(MI.getOpcode());
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
new file mode 100644
index 000000000000..2d184d1484e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -0,0 +1,443 @@
+//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
+
+#include "HexagonRegisterInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "HexagonGenInstrInfo.inc"
+
+namespace llvm {
+
+struct EVT;
+class HexagonSubtarget;
+
+class HexagonInstrInfo : public HexagonGenInstrInfo {
+  virtual void anchor();
+  const HexagonRegisterInfo RI;
+
+public:
+  explicit HexagonInstrInfo(HexagonSubtarget &ST);
+
+  /// TargetInstrInfo overrides.
+  ///
+
+  /// If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  /// If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  /// Analyze the branching code at the end of MBB, returning
+  /// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+  /// implemented for a target).  Upon success, this returns false and returns
+  /// with the following information in various cases:
+  ///
+  /// 1. If this block ends with no branches (it just falls through to its succ)
+  ///    just return false, leaving TBB/FBB null.
+  /// 2. If this block ends with only an unconditional branch, it sets TBB to be
+  ///    the destination block.
+  /// 3. If this block ends with a conditional branch and it falls through to a
+  ///    successor block, it sets TBB to be the branch destination block and a
+  ///    list of operands that evaluate the condition. These operands can be
+  ///    passed to other TargetInstrInfo methods to create new branches.
+  /// 4. If this block ends with a conditional branch followed by an
+  ///    unconditional branch, it returns the 'true' destination in TBB, the
+  ///    'false' destination in FBB, and a list of operands that evaluate the
+  ///    condition.  These operands can be passed to other TargetInstrInfo
+  ///    methods to create new branches.
+  ///
+  /// Note that removeBranch and insertBranch must be implemented to support
+  /// cases where this method returns success.
+  ///
+  /// If AllowModify is true, then this routine is allowed to modify the basic
+  /// block (e.g. delete instructions after the unconditional branch).
+  ///
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  /// Remove the branching code at the end of the specific MBB.
+  /// This is only invoked in cases where AnalyzeBranch returns success. It
+  /// returns the number of instructions that were removed.
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  /// Insert branch code into the end of the specified MachineBasicBlock.
+  /// The operands to this method are the same as those
+  /// returned by AnalyzeBranch.  This is only invoked in cases where
+  /// AnalyzeBranch returns success. It returns the number of instructions
+  /// inserted.
+  ///
+  /// It is also invoked by tail merging to add unconditional branches in
+  /// cases where AnalyzeBranch doesn't apply because there was no original
+  /// branch to analyze.  At least this much must be implemented, else tail
+  /// merging needs to be disabled.
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  /// Analyze the loop code, return true if it cannot be understood. Upon
+  /// success, this function returns false and returns information about the
+  /// induction variable and compare instruction used at the end.
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  /// Generate code to reduce the loop iteration by one and check if the loop is
+  /// finished.  Return the value/register of the the new loop count.  We need
+  /// this function when peeling off one or more iterations of a loop. This
+  /// function assumes the nth iteration is peeled first.
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr &Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
+  /// Return true if it's profitable to predicate
+  /// instructions with accumulated instruction latency of "NumCycles"
+  /// of the specified basic block, where the probability of the instructions
+  /// being executed is given by Probability, and Confidence is a measure
+  /// of our confidence that it will be properly predicted.
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           BranchProbability Probability) const override;
+
+  /// Second variant of isProfitableToIfCvt. This one
+  /// checks for the case where two basic blocks from true and false path
+  /// of a if-then-else (diamond) are predicated on mutally exclusive
+  /// predicates, where the probability of the true path being taken is given
+  /// by Probability, and Confidence is a measure of our confidence that it
+  /// will be properly predicted.
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumTCycles, unsigned ExtraTCycles,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumFCycles, unsigned ExtraFCycles,
+                           BranchProbability Probability) const override;
+
+  /// Return true if it's profitable for if-converter to duplicate instructions
+  /// of specified accumulated instruction latencies in the specified MBB to
+  /// enable if-conversion.
+  /// The probability of the instructions being executed is given by
+  /// Probability, and Confidence is a measure of our confidence that it
+  /// will be properly predicted.
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                 BranchProbability Probability) const override;
+
+  /// Emit instructions to copy a pair of physical registers.
+  ///
+  /// This function should support copies within any legal register class as
+  /// well as any cross-class copies created during instruction selection.
+  ///
+  /// The source and destination registers may overlap, which may require a
+  /// careful implementation when multiple copy instructions are required for
+  /// large registers. See for example the ARM target.
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  /// Store the specified register of the given register class to the specified
+  /// stack frame index. The store instruction is to be added to the given
+  /// machine basic block before the specified machine instruction. If isKill
+  /// is true, the register operand is the last use and must be marked kill.
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  /// Load the specified register of the given register class from the specified
+  /// stack frame index. The load instruction is to be added to the given
+  /// machine basic block before the specified machine instruction.
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  /// This function is called for all pseudo instructions
+  /// that remain after register allocation. Many pseudo instructions are
+  /// created to help register allocation. This is the place to convert them
+  /// into real instructions. The target can edit MI in place, or it can insert
+  /// new instructions and erase MI. The function should return true if
+  /// anything was changed.
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  /// \brief Get the base register and byte offset of a load/store instr.
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+
+  /// Reverses the branch condition of the specified condition list,
+  /// returning false on success and true if it cannot be reversed.
+  bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+        const override;
+
+  /// Insert a noop into the instruction stream at the specified point.
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+  /// Returns true if the instruction is already predicated.
+  bool isPredicated(const MachineInstr &MI) const override;
+
+  /// Return true for post-incremented instructions.
+  bool isPostIncrement(const MachineInstr &MI) const override;
+
+  /// Convert the instruction into a predicated instruction.
+  /// It returns true if the operation was successful.
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Cond) const override;
+
+  /// Returns true if the first specified predicate
+  /// subsumes the second, e.g. GE subsumes GT.
+  bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                         ArrayRef<MachineOperand> Pred2) const override;
+
+  /// If the specified instruction defines any predicate
+  /// or condition code register(s) used for predication, returns true as well
+  /// as the definition predicate(s) by reference.
+  bool DefinesPredicate(MachineInstr &MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  /// Return true if the specified instruction can be predicated.
+  /// By default, this returns true for every instruction with a
+  /// PredicateOperand.
+  bool isPredicable(MachineInstr &MI) const override;
+
+  /// Test if the given instruction should be considered a scheduling boundary.
+  /// This primarily includes labels and terminators.
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
+  /// Measure the specified inline asm to determine an approximation of its
+  /// length.
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const override;
+
+  /// Allocate and return a hazard recognizer to use for this target when
+  /// scheduling the machine instructions after register allocation.
+  ScheduleHazardRecognizer*
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
+                                     const ScheduleDAG *DAG) const override;
+
+  /// For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const override;
+
+  /// Compute the instruction latency of a given instruction.
+  /// If the instruction has higher cost when predicated, it's returned via
+  /// PredCost.
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr &MI,
+                           unsigned *PredCost = 0) const override;
+
+  /// Create machine specific model for scheduling.
+  DFAPacketizer *
+  CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override;
+
+  // Sometimes, it is possible for the target
+  // to tell, even without aliasing information, that two MIs access different
+  // memory addresses. This function returns true if two MIs access different
+  // memory addresses and false otherwise.
+  bool
+  areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                  AliasAnalysis *AA = nullptr) const override;
+
+  /// For instructions with a base and offset, return the position of the
+  /// base register and offset operands.
+  bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos,
+                                unsigned &OffsetPos) const override;
+
+  /// If the instruction is an increment of a constant value, return the amount.
+  bool getIncrementValue(const MachineInstr &MI, int &Value) const override;
+
+  bool isTailCall(const MachineInstr &MI) const override;
+
+  /// HexagonInstrInfo specifics.
+  ///
+
+  const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+
+  unsigned createVR(MachineFunction* MF, MVT VT) const;
+
+  bool isAbsoluteSet(const MachineInstr &MI) const;
+  bool isAccumulator(const MachineInstr &MI) const;
+  bool isComplex(const MachineInstr &MI) const;
+  bool isCompoundBranchInstr(const MachineInstr &MI) const;
+  bool isCondInst(const MachineInstr &MI) const;
+  bool isConditionalALU32 (const MachineInstr &MI) const;
+  bool isConditionalLoad(const MachineInstr &MI) const;
+  bool isConditionalStore(const MachineInstr &MI) const;
+  bool isConditionalTransfer(const MachineInstr &MI) const;
+  bool isConstExtended(const MachineInstr &MI) const;
+  bool isDeallocRet(const MachineInstr &MI) const;
+  bool isDependent(const MachineInstr &ProdMI,
+                   const MachineInstr &ConsMI) const;
+  bool isDotCurInst(const MachineInstr &MI) const;
+  bool isDotNewInst(const MachineInstr &MI) const;
+  bool isDuplexPair(const MachineInstr &MIa, const MachineInstr &MIb) const;
+  bool isEarlySourceInstr(const MachineInstr &MI) const;
+  bool isEndLoopN(unsigned Opcode) const;
+  bool isExpr(unsigned OpType) const;
+  bool isExtendable(const MachineInstr &MI) const;
+  bool isExtended(const MachineInstr &MI) const;
+  bool isFloat(const MachineInstr &MI) const;
+  bool isHVXMemWithAIndirect(const MachineInstr &I,
+                             const MachineInstr &J) const;
+  bool isIndirectCall(const MachineInstr &MI) const;
+  bool isIndirectL4Return(const MachineInstr &MI) const;
+  bool isJumpR(const MachineInstr &MI) const;
+  bool isJumpWithinBranchRange(const MachineInstr &MI, unsigned offset) const;
+  bool isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
+                                  const MachineInstr &ESMI) const;
+  bool isLateResultInstr(const MachineInstr &MI) const;
+  bool isLateSourceInstr(const MachineInstr &MI) const;
+  bool isLoopN(const MachineInstr &MI) const;
+  bool isMemOp(const MachineInstr &MI) const;
+  bool isNewValue(const MachineInstr &MI) const;
+  bool isNewValue(unsigned Opcode) const;
+  bool isNewValueInst(const MachineInstr &MI) const;
+  bool isNewValueJump(const MachineInstr &MI) const;
+  bool isNewValueJump(unsigned Opcode) const;
+  bool isNewValueStore(const MachineInstr &MI) const;
+  bool isNewValueStore(unsigned Opcode) const;
+  bool isOperandExtended(const MachineInstr &MI, unsigned OperandNum) const;
+  bool isPredicatedNew(const MachineInstr &MI) const;
+  bool isPredicatedNew(unsigned Opcode) const;
+  bool isPredicatedTrue(const MachineInstr &MI) const;
+  bool isPredicatedTrue(unsigned Opcode) const;
+  bool isPredicated(unsigned Opcode) const;
+  bool isPredicateLate(unsigned Opcode) const;
+  bool isPredictedTaken(unsigned Opcode) const;
+  bool isSaveCalleeSavedRegsCall(const MachineInstr &MI) const;
+  bool isSignExtendingLoad(const MachineInstr &MI) const;
+  bool isSolo(const MachineInstr &MI) const;
+  bool isSpillPredRegOp(const MachineInstr &MI) const;
+  bool isTC1(const MachineInstr &MI) const;
+  bool isTC2(const MachineInstr &MI) const;
+  bool isTC2Early(const MachineInstr &MI) const;
+  bool isTC4x(const MachineInstr &MI) const;
+  bool isToBeScheduledASAP(const MachineInstr &MI1,
+                           const MachineInstr &MI2) const;
+  bool isV60VectorInstruction(const MachineInstr &MI) const;
+  bool isValidAutoIncImm(const EVT VT, const int Offset) const;
+  bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
+  bool isVecAcc(const MachineInstr &MI) const;
+  bool isVecALU(const MachineInstr &MI) const;
+  bool isVecUsableNextPacket(const MachineInstr &ProdMI,
+                             const MachineInstr &ConsMI) const;
+  bool isZeroExtendingLoad(const MachineInstr &MI) const;
+
+  bool addLatencyToSchedule(const MachineInstr &MI1,
+                            const MachineInstr &MI2) const;
+  bool canExecuteInBundle(const MachineInstr &First,
+                          const MachineInstr &Second) const;
+  bool doesNotReturn(const MachineInstr &CallMI) const;
+  bool hasEHLabel(const MachineBasicBlock *B) const;
+  bool hasNonExtEquivalent(const MachineInstr &MI) const;
+  bool hasPseudoInstrPair(const MachineInstr &MI) const;
+  bool hasUncondBranch(const MachineBasicBlock *B) const;
+  bool mayBeCurLoad(const MachineInstr &MI) const;
+  bool mayBeNewStore(const MachineInstr &MI) const;
+  bool producesStall(const MachineInstr &ProdMI,
+                     const MachineInstr &ConsMI) const;
+  bool producesStall(const MachineInstr &MI,
+                     MachineBasicBlock::const_instr_iterator MII) const;
+  bool predCanBeUsedAsDotNew(const MachineInstr &MI, unsigned PredReg) const;
+  bool PredOpcodeHasJMP_c(unsigned Opcode) const;
+  bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
+
+
+  short getAbsoluteForm(const MachineInstr &MI) const;
+  unsigned getAddrMode(const MachineInstr &MI) const;
+  unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset,
+                            unsigned &AccessSize) const;
+  short getBaseWithLongOffset(short Opcode) const;
+  short getBaseWithLongOffset(const MachineInstr &MI) const;
+  short getBaseWithRegOffset(const MachineInstr &MI) const;
+  SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const;
+  unsigned getCExtOpNum(const MachineInstr &MI) const;
+  HexagonII::CompoundGroup
+  getCompoundCandidateGroup(const MachineInstr &MI) const;
+  unsigned getCompoundOpcode(const MachineInstr &GA,
+                             const MachineInstr &GB) const;
+  int getCondOpcode(int Opc, bool sense) const;
+  int getDotCurOp(const MachineInstr &MI) const;
+  int getDotNewOp(const MachineInstr &MI) const;
+  int getDotNewPredJumpOp(const MachineInstr &MI,
+                          const MachineBranchProbabilityInfo *MBPI) const;
+  int getDotNewPredOp(const MachineInstr &MI,
+                      const MachineBranchProbabilityInfo *MBPI) const;
+  int getDotOldOp(const int opc) const;
+  HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr &MI)
+                                                         const;
+  short getEquivalentHWInstr(const MachineInstr &MI) const;
+  MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const;
+  unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData,
+                                      const MachineInstr &MI) const;
+  bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const;
+  unsigned getInvertedPredicatedOpcode(const int Opc) const;
+  int getMaxValue(const MachineInstr &MI) const;
+  unsigned getMemAccessSize(const MachineInstr &MI) const;
+  int getMinValue(const MachineInstr &MI) const;
+  short getNonExtOpcode(const MachineInstr &MI) const;
+  bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg,
+                  unsigned &PredRegPos, unsigned &PredRegFlags) const;
+  short getPseudoInstrPair(const MachineInstr &MI) const;
+  short getRegForm(const MachineInstr &MI) const;
+  unsigned getSize(const MachineInstr &MI) const;
+  uint64_t getType(const MachineInstr &MI) const;
+  unsigned getUnits(const MachineInstr &MI) const;
+  unsigned getValidSubTargets(const unsigned Opcode) const;
+
+
+  /// getInstrTimingClassLatency - Compute the instruction latency of a given
+  /// instruction using Timing Class information, if available.
+  unsigned nonDbgBBSize(const MachineBasicBlock *BB) const;
+  unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const;
+
+
+  void immediateExtend(MachineInstr &MI) const;
+  bool invertAndChangeJumpTarget(MachineInstr &MI,
+                                 MachineBasicBlock* NewTarget) const;
+  void genAllInsnTimingClasses(MachineFunction &MF) const;
+  bool reversePredSense(MachineInstr &MI) const;
+  unsigned reversePrediction(unsigned Opcode) const;
+  bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const;
+  short xformRegToImmOffset(const MachineInstr &MI) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
new file mode 100644
index 000000000000..c5719ad5b6d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -0,0 +1,4799 @@
+//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormats.td"
+include "HexagonOperands.td"
+include "HexagonInstrEnc.td"
+
+//===----------------------------------------------------------------------===//
+// Compare
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
+    opExtendable = 2 in
+class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
+  : ALU32Inst <(outs PredRegs:$dst),
+               (ins IntRegs:$src1, ImmOp:$src2),
+  "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)",
+  [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
+    bits<2> dst;
+    bits<5> src1;
+    bits<10> src2;
+    let CextOpcode = mnemonic;
+    let opExtentBits  = !if(!eq(mnemonic, "cmp.gtu"), 9, 10);
+    let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1);
+
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9});
+    let Inst{20-16} = src1;
+    let Inst{13-5}  = src2{8-0};
+    let Inst{4}     = isNot;
+    let Inst{3-2}   = 0b00;
+    let Inst{1-0}   = dst;
+  }
+
+def C2_cmpeqi   : T_CMP <"cmp.eq",  0b00, 0, s10_0Ext>;
+def C2_cmpgti   : T_CMP <"cmp.gt",  0b01, 0, s10_0Ext>;
+def C2_cmpgtui  : T_CMP <"cmp.gtu", 0b10, 0, u9_0Ext>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU +
+//===----------------------------------------------------------------------===//
+// Add.
+
+let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
+class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
+                  bit IsComm>
+  : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+             "$Rd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
+  let isCommutable = IsComm;
+  let BaseOpcode = mnemonic#_rr;
+  let CextOpcode = mnemonic;
+
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1111;
+  let Inst{27} = 0b0;
+  let Inst{26-24} = MajOp;
+  let Inst{23-21} = MinOp;
+  let Inst{20-16} = !if(OpsRev,Rt,Rs);
+  let Inst{12-8} = !if(OpsRev,Rs,Rt);
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                       bit OpsRev, bit PredNot, bit PredNew>
+  : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+             "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "#
+             "$Rd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+  let isPredicated = 1;
+  let isPredicatedFalse = PredNot;
+  let isPredicatedNew = PredNew;
+  let BaseOpcode = mnemonic#_rr;
+  let CextOpcode = mnemonic;
+
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1111;
+  let Inst{27} = 0b1;
+  let Inst{26-24} = MajOp;
+  let Inst{23-21} = MinOp;
+  let Inst{20-16} = !if(OpsRev,Rt,Rs);
+  let Inst{13} = PredNew;
+  let Inst{12-8} = !if(OpsRev,Rs,Rt);
+  let Inst{7} = PredNot;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
+                      bit OpsRev>
+  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> {
+  let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")";
+}
+
+def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>;
+def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>;
+def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>;
+def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>;
+
+class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
+                      bits<3> MinOp, bit OpsRev, bit IsComm>
+  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> {
+  let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix;
+}
+
+def A2_svaddh   : T_ALU32_3op<"vaddh",   0b110, 0b000, 0, 1>;
+def A2_svsubh   : T_ALU32_3op<"vsubh",   0b110, 0b100, 1, 0>;
+
+let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in {
+  def A2_svaddhs  : T_ALU32_3op_sfx<"vaddh",  ":sat", 0b110, 0b001, 0, 1>;
+  def A2_addsat   : T_ALU32_3op_sfx<"add",    ":sat", 0b110, 0b010, 0, 1>;
+  def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
+  def A2_svsubhs  : T_ALU32_3op_sfx<"vsubh",  ":sat", 0b110, 0b101, 1, 0>;
+  def A2_subsat   : T_ALU32_3op_sfx<"sub",    ":sat", 0b110, 0b110, 1, 0>;
+  def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
+}
+
+let Itinerary = ALU32_3op_tc_2_SLOT0123 in
+def A2_svavghs  : T_ALU32_3op_sfx<"vavgh",  ":rnd", 0b111, 0b001, 0, 1>;
+
+def A2_svavgh   : T_ALU32_3op<"vavgh",   0b111, 0b000, 0, 1>;
+def A2_svnavgh  : T_ALU32_3op<"vnavgh",  0b111, 0b011, 1, 0>;
+
+multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                         bit OpsRev> {
+  def t    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
+  def f    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>;
+  def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>;
+  def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>;
+}
+
+multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                          bit OpsRev, bit IsComm> {
+  let isPredicable = 1 in
+  def  A2_#NAME  : T_ALU32_3op  <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
+  defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
+}
+
+defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
+defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
+defm or  : T_ALU32_3op_A2<"or",  0b001, 0b001, 0, 1>;
+defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>;
+defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
+
+// A few special cases producing register pairs:
+let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
+  def S2_packhl    : T_ALU32_3op  <"packhl",  0b101, 0b100, 0, 0>;
+
+  let isPredicable = 1 in
+    def A2_combinew  : T_ALU32_3op  <"combine", 0b101, 0b000, 0, 0>;
+
+  // Conditional combinew uses "newt/f" instead of "t/fnew".
+  def C2_ccombinewt    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>;
+  def C2_ccombinewf    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>;
+  def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>;
+  def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>;
+}
+
+let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg"  in
+class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
+  : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+             "$Pd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+  let CextOpcode = mnemonic;
+  let isCommutable = IsComm;
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<2> Pd;
+
+  let IClass = 0b1111;
+  let Inst{27-24} = 0b0010;
+  let Inst{22-21} = MinOp;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4} = IsNeg;
+  let Inst{3-2} = 0b00;
+  let Inst{1-0} = Pd;
+}
+
+let Itinerary = ALU32_3op_tc_2early_SLOT0123 in {
+  def C2_cmpeq   : T_ALU32_3op_cmp< "cmp.eq",  0b00, 0, 1>;
+  def C2_cmpgt   : T_ALU32_3op_cmp< "cmp.gt",  0b10, 0, 0>;
+  def C2_cmpgtu  : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>;
+}
+
+let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
+def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
+                     (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let CextOpcode = "mux";
+  let InputType = "reg";
+  let hasSideEffects = 0;
+  let IClass = 0b1111;
+
+  let Inst{27-24} = 0b0100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+// Combines the two immediates into a double register.
+// Increase complexity to make it greater than any complexity of a combine
+// that involves a register.
+
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1,
+    AddedComplexity = 75 in
+def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8_0Ext:$s8, s8_0Imm:$S8),
+  "$Rdd = combine(#$s8, #$S8)",
+  []> {
+    bits<5> Rdd;
+    bits<8> s8;
+    bits<8> S8;
+
+    let IClass = 0b0111;
+    let Inst{27-23} = 0b11000;
+    let Inst{22-16} = S8{7-1};
+    let Inst{13}    = S8{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated ADD of a reg and an Immediate value.
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri_Pred <bit PredNot, bit PredNew>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
+  !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ",
+  ") $Rd = ")#"add($Rs, #$s8)"> {
+    bits<5> Rd;
+    bits<2> Pu;
+    bits<5> Rs;
+    bits<8> s8;
+
+    let isPredicatedNew = PredNew;
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23}    = PredNot;
+    let Inst{22-21} = Pu;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = PredNew;
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rd;
+  }
+
+//===----------------------------------------------------------------------===//
+// A2_addi: Add a signed immediate to a register.
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_Addri <Operand immOp>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins IntRegs:$Rs, immOp:$s16),
+  "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<16> s16;
+
+    let IClass = 0b1011;
+
+    let Inst{27-21} = s16{15-9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s16{8-0};
+    let Inst{4-0}   = Rd;
+  }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for ADD of a register and an immediate value.
+//===----------------------------------------------------------------------===//
+multiclass Addri_Pred<string mnemonic, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    def NAME     : T_Addri_Pred<PredNot, 0>;
+    // Predicate new
+    def NAME#new : T_Addri_Pred<PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
+multiclass Addri_base<string mnemonic, SDNode OpNode> {
+  let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
+    let opExtendable = 2, opExtentBits = 16, isPredicable = 1, isAdd = 1 in
+    def A2_#NAME : T_Addri<s16_0Ext>;
+
+    let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in {
+      defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
+      defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
+
+let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in
+def A2_iconst
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins s23_2Imm:$s23_2),
+  "$Rd = iconst(#$s23_2)"> {}
+
+//===----------------------------------------------------------------------===//
+// Template class used for the following ALU32 instructions.
+// Rd=and(Rs,#s10)
+// Rd=or(Rs,#s10)
+//===----------------------------------------------------------------------===//
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+InputType = "imm", hasNewValue = 1 in
+class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : ALU32_ri <(outs IntRegs:$Rd),
+              (ins IntRegs:$Rs, s10_0Ext:$s10),
+  "$Rd = "#mnemonic#"($Rs, #$s10)" ,
+  []> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<10> s10;
+    let CextOpcode = mnemonic;
+
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0110;
+    let Inst{23-22} = MinOp;
+    let Inst{21}    = s10{9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rd;
+  }
+
+def A2_orir  : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel;
+def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel;
+
+// Subtract register from immediate
+// Rd32=sub(#s10,Rs32)
+let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1,
+    opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
+def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10_0Ext:$s10, IntRegs:$Rs),
+  "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
+    bits<5> Rd;
+    bits<10> s10;
+    bits<5> Rs;
+
+    let IClass = 0b0111;
+
+    let Inst{27-22} = 0b011001;
+    let Inst{21}    = s10{9};
+    let Inst{20-16} = Rs;
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rd;
+  }
+
+// Nop.
+let hasSideEffects = 0 in
+def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1111;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_tfr16<bit isHi>
+  : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16_0Imm:$u16),
+  "$Rx"#!if(isHi, ".h", ".l")#" = #$u16",
+  [], "$src1 = $Rx" > {
+    bits<5> Rx;
+    bits<16> u16;
+
+    let IClass = 0b0111;
+    let Inst{27-26} = 0b00;
+    let Inst{25-24} = !if(isHi, 0b10, 0b01);
+    let Inst{23-22} = u16{15-14};
+    let Inst{21}    = 0b1;
+    let Inst{20-16} = Rx;
+    let Inst{13-0}  = u16{13-0};
+  }
+
+def A2_tfril: T_tfr16<0>;
+def A2_tfrih: T_tfr16<1>;
+
+// Conditional transfer is an alias to conditional "Rd = add(Rs, #0)".
+let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
+class T_tfr_pred<bit isPredNot, bit isPredNew>
+  : ALU32Inst<(outs IntRegs:$dst),
+              (ins PredRegs:$src1, IntRegs:$src2),
+              "if ("#!if(isPredNot, "!", "")#
+              "$src1"#!if(isPredNew, ".new", "")#
+              ") $dst = $src2"> {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+
+    let isPredicatedFalse = isPredNot;
+    let isPredicatedNew = isPredNew;
+    let IClass = 0b0111;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23} = isPredNot;
+    let Inst{13} = isPredNew;
+    let Inst{12-5} = 0;
+    let Inst{4-0} = dst;
+    let Inst{22-21} = src1;
+    let Inst{20-16} = src2;
+  }
+
+let isPredicable = 1 in
+class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = $src"> {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b0111;
+
+    let Inst{27-21} = 0b0000011;
+    let Inst{20-16} = src;
+    let Inst{13}    = 0b0;
+    let Inst{4-0}   = dst;
+  }
+
+let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
+multiclass tfr_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    def NAME : T_tfr;
+
+    // Predicate
+    def t : T_tfr_pred<0, 0>;
+    def f : T_tfr_pred<1, 0>;
+    // Predicate new
+    def tnew : T_tfr_pred<0, 1>;
+    def fnew : T_tfr_pred<1, 1>;
+  }
+}
+
+// Assembler mapped to C2_ccombinew[t|f|newt|newf].
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+let isPredicated = 1 in
+class T_tfrp_pred<bit PredNot, bit PredNew>
+  : ALU32_rr <(outs DoubleRegs:$dst),
+              (ins PredRegs:$src1, DoubleRegs:$src2),
+  "if ("#!if(PredNot, "!", "")#"$src1"
+        #!if(PredNew, ".new", "")#") $dst = $src2" > {
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = PredNew;
+  }
+
+// Assembler mapped to A2_combinew.
+// Please don't add bits to this instruction as it'll be converted into
+// 'combine' before object code emission.
+class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst),
+               (ins DoubleRegs:$src),
+    "$dst = $src">;
+
+let hasSideEffects = 0 in
+multiclass TFR64_base<string BaseName> {
+  let BaseOpcode = BaseName in {
+    let isPredicable = 1 in
+    def NAME : T_tfrp;
+    // Predicate
+    def t : T_tfrp_pred <0, 0>;
+    def f : T_tfrp_pred <1, 0>;
+    // Predicate new
+    def tnew : T_tfrp_pred <0, 1>;
+    def fnew : T_tfrp_pred <1, 1>;
+  }
+}
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
+    isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR",
+    hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
+class T_TFRI_Pred<bit PredNot, bit PredNew>
+  : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12_0Ext:$s12),
+    "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
+    [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+  let isPredicatedFalse = PredNot;
+  let isPredicatedNew = PredNew;
+
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<12> s12;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1110;
+  let Inst{23} = PredNot;
+  let Inst{22-21} = Pu;
+  let Inst{20} = 0b0;
+  let Inst{19-16,12-5} = s12;
+  let Inst{13} = PredNew;
+  let Inst{4-0} = Rd;
+}
+
+def C2_cmoveit    : T_TFRI_Pred<0, 0>;
+def C2_cmoveif    : T_TFRI_Pred<1, 0>;
+def C2_cmovenewit : T_TFRI_Pred<0, 1>;
+def C2_cmovenewif : T_TFRI_Pred<1, 1>;
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
+    CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0,
+    isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
+    isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
+def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16_0Ext:$s16), "$Rd = #$s16",
+    [], "", ALU32_2op_tc_1_SLOT0123>,
+    ImmRegRel, PredRel {
+  bits<5> Rd;
+  bits<16> s16;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b1000;
+  let Inst{23-22,20-16,13-5} = s16;
+  let Inst{4-0} = Rd;
+}
+
+defm A2_tfr  : tfr_base<"TFR">, ImmRegRel, PredNewRel;
+let isAsmParserOnly = 1 in
+defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
+
+// Assembler mapped
+let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
+    isAsmParserOnly = 1 in
+def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8_0Imm64:$src1),
+                      "$dst = #$src1",
+                      []>;
+
+// TODO: see if this instruction can be deleted..
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+    isAsmParserOnly = 1 in {
+def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
+                         "$dst = #$src1">;
+def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
+                             (ins s8_0Ext:$src1, s8_0Imm:$src2),
+                             "$dst = combine(##$src1, #$src2)">;
+}
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+// Scalar mux register immediate.
+let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
+    InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
+class T_MUX1 <bit MajOp, dag ins, string AsmStr>
+      : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<8> s8;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b0011;
+  let Inst{23} = MajOp;
+  let Inst{22-21} = Pu;
+  let Inst{20-16} = Rs;
+  let Inst{13}    = 0b0;
+  let Inst{12-5}  = s8;
+  let Inst{4-0}   = Rd;
+}
+
+let opExtendable = 2 in
+def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8_0Ext:$s8, IntRegs:$Rs),
+                           "$Rd = mux($Pu, #$s8, $Rs)">;
+
+let opExtendable = 3 in
+def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
+                           "$Rd = mux($Pu, $Rs, #$s8)">;
+
+// C2_muxii: Scalar mux immediates.
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 8, opExtendable = 2 in
+def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
+                         (ins PredRegs:$Pu, s8_0Ext:$s8, s8_0Imm:$S8),
+  "$Rd = mux($Pu, #$s8, #$S8)" ,
+  []> {
+    bits<5> Rd;
+    bits<2> Pu;
+    bits<8> s8;
+    bits<8> S8;
+
+    let IClass = 0b0111;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-23} = Pu;
+    let Inst{22-16} = S8{7-1};
+    let Inst{13}    = S8{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rd;
+  }
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
+      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+      ".error \"should not emit\" ", []>;
+
+
+//===----------------------------------------------------------------------===//
+// template class for non-predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxth
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op <string mnemonic, bits<3> minOp> :
+  ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+             "$Rd = "#mnemonic#"($Rs)", [] > {
+  bits<5> Rd;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
+
+  let Inst{27-24} = 0b0000;
+  let Inst{23-21} = minOp;
+  let Inst{13} = 0b0;
+  let Inst{4-0} = Rd;
+  let Inst{20-16} = Rs;
+}
+
+//===----------------------------------------------------------------------===//
+// template class for predicated alu32_2op instructions
+// - aslh, asrh, sxtb, sxth, zxtb, zxth
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
+                        bit isPredNew > :
+  ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
+             !if(isPredNot, "if (!$Pu", "if ($Pu")
+             #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+
+  let IClass = 0b0111;
+
+  let Inst{27-24} = 0b0000;
+  let Inst{23-21} = minOp;
+  let Inst{13} = 0b1;
+  let Inst{11} = isPredNot;
+  let Inst{10} = isPredNew;
+  let Inst{4-0} = Rd;
+  let Inst{9-8} = Pu;
+  let Inst{20-16} = Rs;
+}
+
+multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
+
+    // Predicate new
+    let isPredicatedNew = 1 in
+    def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
+  }
+}
+
+multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> {
+  let BaseOpcode = mnemonic in {
+    let isPredicable = 1, hasSideEffects = 0 in
+    def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
+
+    let isPredicated = 1, hasSideEffects = 0 in {
+      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
+      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
+    }
+  }
+}
+
+defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel;
+defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
+defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
+defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
+defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel;
+
+// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
+// Compiler would want to generate 'zxtb' instead of 'and' because 'zxtb' has
+// predicated forms while 'and' doesn't. Since integrated assembler can't
+// handle 'mapped' instructions, we need to encode 'zxtb' same as 'and' where
+// immediate operand is set to '255'.
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
+  "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<10> s10 = 255;
+
+    let IClass = 0b0111;
+
+    let Inst{27-22} = 0b011000;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{21} = s10{9};
+    let Inst{13-5} = s10{8-0};
+}
+
+//Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)
+multiclass ZXTB_base <string mnemonic, bits<3> minOp> {
+  let BaseOpcode = mnemonic in {
+    let isPredicable = 1, hasSideEffects = 0 in
+    def A2_#NAME : T_ZXTB;
+
+    let isPredicated = 1, hasSideEffects = 0 in {
+      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
+      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
+    }
+  }
+}
+
+defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
+
+//===----------------------------------------------------------------------===//
+// Template class for vector add and avg
+//===----------------------------------------------------------------------===//
+
+class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
+                   bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
+  : ALU64_rr < (outs DoubleRegs:$Rdd),
+                (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "")
+                             #!if(isCrnd,":crnd","")
+                             #!if(isSat, ":sat", ""),
+  [], "", ALU64_tc_2_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b0011;
+    let Inst{23-21} = majOp;
+    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
+    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+    let Inst{7-5} = minOp;
+    let Inst{4-0} = Rdd;
+  }
+
+// ALU64 - Vector add
+// Rdd=vadd[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vaddub  : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
+  def A2_vaddh   : T_VectALU_64 < "vaddh",  0b000, 0b010, 0, 0, 0, 0>;
+  def A2_vaddw   : T_VectALU_64 < "vaddw",  0b000, 0b101, 0, 0, 0, 0>;
+}
+
+// Rdd=vadd[u][bhw](Rss,Rtt):sat
+let Defs = [USR_OVF] in {
+  def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
+  def A2_vaddhs  : T_VectALU_64 < "vaddh",  0b000, 0b011, 1, 0, 0, 0>;
+  def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
+  def A2_vaddws  : T_VectALU_64 < "vaddw",  0b000, 0b110, 1, 0, 0, 0>;
+}
+
+// ALU64 - Vector average
+// Rdd=vavg[u][bhw](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
+  def A2_vavgh  : T_VectALU_64 < "vavgh",  0b010, 0b010, 0, 0, 0, 0>;
+  def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
+  def A2_vavgw  : T_VectALU_64 < "vavgw",  0b011, 0b000, 0, 0, 0, 0>;
+  def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
+}
+
+// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd]
+def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
+def A2_vavghr  : T_VectALU_64 < "vavgh",  0b010, 0b011, 0, 1, 0, 0>;
+def A2_vavghcr : T_VectALU_64 < "vavgh",  0b010, 0b100, 0, 0, 1, 0>;
+def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
+
+def A2_vavgwr  : T_VectALU_64 < "vavgw",  0b011, 0b001, 0, 1, 0, 0>;
+def A2_vavgwcr : T_VectALU_64 < "vavgw",  0b011, 0b010, 0, 0, 1, 0>;
+def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
+
+// Rdd=vnavg[bh](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vnavgh   : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
+  def A2_vnavgw   : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
+}
+
+// Rdd=vnavg[bh](Rss,Rtt)[:rnd|:crnd]:sat
+let Defs = [USR_OVF] in {
+  def A2_vnavghr  : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>;
+  def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>;
+  def A2_vnavgwr  : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>;
+  def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>;
+}
+
+// Rdd=vsub[u][bh](Rss,Rtt)
+let Itinerary = ALU64_tc_1_SLOT23 in {
+  def A2_vsubub  : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
+  def A2_vsubh   : T_VectALU_64 < "vsubh",  0b001, 0b010, 0, 0, 0, 1>;
+  def A2_vsubw   : T_VectALU_64 < "vsubw",  0b001, 0b101, 0, 0, 0, 1>;
+}
+
+// Rdd=vsub[u][bh](Rss,Rtt):sat
+let Defs = [USR_OVF] in {
+  def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
+  def A2_vsubhs  : T_VectALU_64 < "vsubh",  0b001, 0b011, 1, 0, 0, 1>;
+  def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
+  def A2_vsubws  : T_VectALU_64 < "vsubw",  0b001, 0b110, 1, 0, 0, 1>;
+}
+
+// Rdd=vmax[u][bhw](Rss,Rtt)
+def A2_vmaxb  : T_VectALU_64 < "vmaxb",  0b110, 0b110, 0, 0, 0, 1>;
+def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
+def A2_vmaxh  : T_VectALU_64 < "vmaxh",  0b110, 0b001, 0, 0, 0, 1>;
+def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
+def A2_vmaxw  : T_VectALU_64 < "vmaxw",  0b110, 0b011, 0, 0, 0, 1>;
+def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>;
+
+// Rdd=vmin[u][bhw](Rss,Rtt)
+def A2_vminb  : T_VectALU_64 < "vminb",  0b110, 0b111, 0, 0, 0, 1>;
+def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
+def A2_vminh  : T_VectALU_64 < "vminh",  0b101, 0b001, 0, 0, 0, 1>;
+def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
+def A2_vminw  : T_VectALU_64 < "vminw",  0b101, 0b011, 0, 0, 0, 1>;
+def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for vector compare
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_vcmp <string Str, bits<4> minOp>
+  : ALU64_rr <(outs PredRegs:$Pd),
+              (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = "#Str#"($Rss, $Rtt)", [],
+  "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = minOp{3};
+    let Inst{7-5} = minOp{2-0};
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector compare bytes
+def A2_vcmpbeq  : T_vcmp <"vcmpb.eq",  0b0110>;
+def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
+
+// Vector compare halfwords
+def A2_vcmpheq  : T_vcmp <"vcmph.eq",  0b0011>;
+def A2_vcmphgt  : T_vcmp <"vcmph.gt",  0b0100>;
+def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
+
+// Vector compare words
+def A2_vcmpweq  : T_vcmp <"vcmpw.eq",  0b0000>;
+def A2_vcmpwgt  : T_vcmp <"vcmpw.gt",  0b0001>;
+def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED +
+//===----------------------------------------------------------------------===//
+// No bits needed.  If cmp.ge is found the assembler parser will
+// transform it to cmp.gt subtracting 1 from the immediate.
+let isPseudo = 1 in {
+def C2_cmpgei: ALU32Inst <
+  (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8_0Ext:$s8),
+  "$Pd = cmp.ge($Rs, #$s8)">;
+def C2_cmpgeui: ALU32Inst <
+  (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8_0Ext:$s8),
+  "$Pd = cmp.geu($Rs, #$s8)">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+// Add.
+//===----------------------------------------------------------------------===//
+// Template Class
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
+//===----------------------------------------------------------------------===//
+
+let  hasNewValue = 1, opNewValue = 0 in
+class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
+  : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+  "$Rd = "#!if(isSub,"sub","add")#"($Rt."
+          #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs."
+          #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)"))
+          #!if(isSat,":sat","")
+          #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01010;
+    let Inst{22} = hasShift;
+    let Inst{21} = isSub;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rd;
+    let Inst{12-8} = Rt;
+    let Inst{20-16} = Rs;
+  }
+
+//Rd=sub(Rt.L,Rs.[LH])
+def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>;
+def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>;
+
+//Rd=add(Rt.L,Rs.[LH])
+def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>;
+def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>;
+
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+  //Rd=sub(Rt.L,Rs.[LH]):sat
+  def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
+  def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
+
+  //Rd=add(Rt.L,Rs.[LH]):sat
+  def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
+  def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
+}
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>;
+def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>;
+def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>;
+def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>;
+def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>;
+def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>;
+def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>;
+
+let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
+  //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+  def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
+  def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
+  def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
+  def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
+
+  //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+  def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
+  def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
+  def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
+  def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
+      (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0000;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
+  : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
+  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01011;
+    let Inst{22-21} = !if(isMax, 0b10, 0b01);
+    let Inst{7} = isUnsigned;
+    let Inst{4-0} = Rd;
+    let Inst{12-8} = !if(isMax, Rs, Rt);
+    let Inst{20-16} = !if(isMax, Rt, Rs);
+  }
+
+def A2_min  : T_XTYPE_MIN_MAX < 0, 0 >;
+def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
+def A2_max  : T_XTYPE_MIN_MAX < 1, 0 >;
+def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
+
+class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
+  : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+             "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0010100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = MinOp;
+  let Inst{1-0} = Pd;
+}
+
+def C2_cmpeqp  : T_cmp64_rr<"cmp.eq",  0b000, 1>;
+def C2_cmpgtp  : T_cmp64_rr<"cmp.gt",  0b010, 0>;
+def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
+
+def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
+      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+      "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+  let hasSideEffects = 0;
+
+  bits<5> Rd;
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0001;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
+                 bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
+                 string Op2Pfx>
+  : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
+             "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
+             "", ALU64_tc_1_SLOT23> {
+  let hasSideEffects = 0;
+  let isCommutable = IsComm;
+
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = RegType;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = !if (OpsRev,Rt,Rs);
+  let Inst{12-8} = !if (OpsRev,Rs,Rt);
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
+
+class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
+                    bit OpsRev, bit IsComm>
+  : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
+               IsComm, "">;
+
+let isAdd = 1 in
+def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
+def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>;
+
+class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
+                      bit IsNeg>
+  : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
+               !if(IsNeg,"~","")>;
+
+def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
+def A2_orp  : T_ALU64_logical<"or",  0b010, 0, 1, 0>;
+def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/BIT +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/PERM +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+// Logical reductions on predicates.
+
+// Looping instructions.
+
+// Pipelined looping instructions.
+
+// Logical operations on predicates.
+let hasSideEffects = 0 in
+class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
+    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
+             "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+
+  let IClass = 0b0110;
+  let Inst{27-23} = 0b10111;
+  let Inst{22-21} = OpBits;
+  let Inst{20} = 0b0;
+  let Inst{17-16} = Ps;
+  let Inst{13} = 0b0;
+  let Inst{1-0} = Pd;
+}
+
+def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
+def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
+def C2_not  : T_LOGICAL_1OP<"not",  0b10>;
+
+let hasSideEffects = 0 in
+class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
+    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
+             "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
+             [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+  bits<2> Pt;
+
+  let IClass = 0b0110;
+  let Inst{27-24} = 0b1011;
+  let Inst{23-21} = OpBits;
+  let Inst{20} = 0b0;
+  let Inst{17-16} = !if(Rev,Pt,Ps);  // Rs and Rt are reversed for some
+  let Inst{13} = 0b0;                // instructions.
+  let Inst{9-8} = !if(Rev,Ps,Pt);
+  let Inst{1-0} = Pd;
+}
+
+def C2_and  : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
+def C2_or   : T_LOGICAL_2OP<"or",  0b001, 0, 1>;
+def C2_xor  : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
+def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
+def C2_orn  : T_LOGICAL_2OP<"or",  0b111, 1, 1>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
+      "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Ps;
+  bits<2> Pt;
+
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1001;
+  let Inst{22-21} = 0b00;
+  let Inst{17-16} = Ps;
+  let Inst{9-8} = Pt;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
+      "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Pt;
+
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0110;
+  let Inst{9-8} = Pt;
+  let Inst{4-0} = Rd;
+}
+
+// User control register transfer.
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+
+class CondStr<string CReg, bit True, bit New> {
+  string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+  string S = Mnemonic # !if(Taken, ":t", ":nt");
+}
+
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+    isPredicable = 1,
+    isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+    opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP<string ExtStr>
+  : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
+      "jump " # ExtStr # "$dst",
+      [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
+    bits<24> dst;
+    let IClass = 0b0101;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-16} = dst{23-15};
+    let Inst{13-1} = dst{14-2};
+}
+
+let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
+    isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
+    opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
+class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
+  : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
+      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+        JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
+        ExtStr # "$dst",
+      [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
+    let isTaken = isTak;
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = isPredNew;
+    bits<2> src;
+    bits<17> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{21} = PredNot;
+    let Inst{12} = isTak;
+    let Inst{11} = isPredNew;
+    let Inst{9-8} = src;
+    let Inst{23-22} = dst{16-15};
+    let Inst{20-16} = dst{14-10};
+    let Inst{13} = dst{9};
+    let Inst{7-1} = dst{8-2};
+  }
+
+multiclass JMP_Pred<bit PredNot, string ExtStr> {
+  def NAME       : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
+  // Predicate new
+  def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
+  def NAME#new   : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
+}
+
+multiclass JMP_base<string BaseOp, string ExtStr> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMP<ExtStr>;
+    defm t : JMP_Pred<0, ExtStr>;
+    defm f : JMP_Pred<1, ExtStr>;
+  }
+}
+
+// Jumps to address stored in a register, JUMPR_MISC
+// if ([[!]P[.new]]) jumpr[:t/nt] Rs
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+    isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
+class T_JMPr
+  : JRInst<(outs), (ins IntRegs:$dst),
+      "jumpr $dst", [], "", J_tc_2early_SLOT2> {
+    bits<5> dst;
+
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0010100;
+    let Inst{20-16} = dst;
+}
+
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+    hasSideEffects = 0, InputType = "reg" in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
+  : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
+      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+        JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
+      "", J_tc_2early_SLOT2> {
+
+    let isTaken = isTak;
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = isPredNew;
+    bits<2> src;
+    bits<5> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-22} = 0b001101;
+    let Inst{21} = PredNot;
+    let Inst{20-16} = dst;
+    let Inst{12} = isTak;
+    let Inst{11} = isPredNew;
+    let Inst{9-8} = src;
+}
+
+multiclass JMPR_Pred<bit PredNot> {
+  def NAME        : T_JMPr_c<PredNot, 0, 0>; // not taken
+  // Predicate new
+  def NAME#newpt  : T_JMPr_c<PredNot, 1, 1>; // taken
+  def NAME#new    : T_JMPr_c<PredNot, 1, 0>; // not taken
+}
+
+multiclass JMPR_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMPr;
+    defm t : JMPR_Pred<0>;
+    defm f : JMPR_Pred<1>;
+  }
+}
+
+let isCall = 1, hasSideEffects = 1 in
+class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
+               dag InputDag = (ins IntRegs:$Rs)>
+  : JRInst<(outs), InputDag,
+      !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
+                                 "if ($Pu) callr $Rs"),
+                                 "callr $Rs"),
+      [], "", J_tc_2early_SLOT2> {
+    bits<5> Rs;
+    bits<2> Pu;
+    let isPredicated = isPred;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b0101;
+    let Inst{27-25} = 0b000;
+    let Inst{24-23} = !if (isPred, 0b10, 0b01);
+    let Inst{22} = 0;
+    let Inst{21} = isPredNot;
+    let Inst{9-8} = !if (isPred, Pu, 0b00);
+    let Inst{20-16} = Rs;
+
+  }
+
+let Defs = VolatileV3.Regs in {
+  def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
+  def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
+}
+
+let isTerminator = 1, hasSideEffects = 0 in {
+  defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
+
+  defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
+
+  let isReturn = 1, isPseudo = 1, isCodeGenOnly = 1 in
+  defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
+}
+
+let validSubTargets  = HasV60SubT in
+multiclass JMPpt_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
+    def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
+  }
+}
+
+let validSubTargets  = HasV60SubT in
+multiclass JMPRpt_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
+    def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
+  }
+}
+
+defm J2_jumpr : JMPRpt_base<"JMPr">;
+defm J2_jump  : JMPpt_base<"JMP">;
+
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+    isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+
+// Load - Base with Immediate offset addressing mode
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
+class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
+                 Operand ImmOp>
+  : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
+    bits<4> name;
+    bits<5> dst;
+    bits<5> src1;
+    bits<14> offset;
+    bits<11> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
+                     !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
+                     !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
+                                      /* s11_0Ext */ offset{10-0})));
+    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+                                        /* s11_0Ext */ 11)));
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27}    = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
+  }
+
+let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
+class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
+                  Operand ImmOp, bit isNot, bit isPredNew>
+  : LDInst<(outs RC:$dst),
+           (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "if ("#!if(isNot, "!$src1", "$src1")
+       #!if(isPredNew, ".new", "")
+       #") $dst = "#mnemonic#"($src2 + #$offset)",
+  [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> offset;
+    bits<6> offsetBits;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
+                     !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
+                     !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
+                                      /* u6_0Ext */ offset{5-0})));
+    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+                                        /* u6_0Ext */ 6)));
+    let hasNewValue = !if (!eq(ImmOpStr, "u6_3Ext"), 0, 1);
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isNot;
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b0;
+    let Inst{27}    = 0b0;
+    let Inst{26}    = isNot;
+    let Inst{25}    = isPredNew;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13}    = 0b0;
+    let Inst{12-11} = src1;
+    let Inst{10-5}  = offsetBits;
+    let Inst{4-0}   = dst;
+  }
+
+let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
+multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let isPredicable = 1 in
+    def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
+
+    // Predicated
+    def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
+    def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
+
+    // Predicated new
+    def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
+    def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
+  }
+}
+
+let accessSize = ByteAccess in {
+  defm loadrb:  LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
+  defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
+}
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  defm loadrh:  LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
+  defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
+}
+
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
+
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  def L2_loadbsw2_io:   T_load_io<"membh",  IntRegs, 0b0001, s11_1Ext>;
+  def L2_loadbzw2_io:   T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
+}
+
+let accessSize = WordAccess, opExtentAlign = 2 in {
+  def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
+  def L2_loadbsw4_io: T_load_io<"membh",  DoubleRegs, 0b0111, s11_2Ext>;
+}
+
+let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
+    opExtendable = 3, isExtentSigned = 1  in
+class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
+  : LDInst<(outs DoubleRegs:$dst),
+           (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "$dst = "#str#"($src2 + #$offset)", [],
+  "$src1 = $dst">, AddrModeRel {
+    bits<4> name;
+    bits<5> dst;
+    bits<5> src2;
+    bits<12> offset;
+    bits<11> offsetBits;
+
+    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
+                                                  /* s11_0Ext */ offset{10-0});
+    let IClass = 0b1001;
+
+    let Inst{27}    = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
+  }
+
+let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
+def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
+
+let accessSize = ByteAccess, opExtentBits = 11 in
+def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
+
+//===----------------------------------------------------------------------===//
+// Post increment load
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                     bits<4> MajOp >
+  : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
+  (ins IntRegs:$src1, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src1++#$offset)" ,
+  [],
+  "$src1 = $dst2" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<5> src1;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13-12} = 0b00;
+    let Inst{8-5} = offsetBits;
+    let Inst{4-0}   = dst;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                          bits<4> MajOp, bit isPredNot, bit isPredNew >
+  : LDInst <(outs RC:$dst, IntRegs:$dst2),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
+  [] ,
+  "$src2 = $dst2" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12} = isPredNew;
+    let Inst{11} = isPredNot;
+    let Inst{10-9} = src1;
+    let Inst{8-5}  = offsetBits;
+    let Inst{4-0}  = dst;
+  }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for post increment loads with immediate offset.
+//===----------------------------------------------------------------------===//
+
+multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
+                       Operand ImmOp, bits<4> MajOp> {
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
+
+    // Predicated
+    def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
+    def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
+
+    // Predicated new
+    def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
+    def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
+  }
+}
+
+// post increment byte loads with immediate offset
+let accessSize = ByteAccess in {
+  defm loadrb  : LD_PostInc <"memb",  "LDrib", IntRegs, s4_0Imm, 0b1000>;
+  defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
+}
+
+// post increment halfword loads with immediate offset
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  defm loadrh  : LD_PostInc <"memh",  "LDrih", IntRegs, s4_1Imm, 0b1010>;
+  defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
+}
+
+// post increment word loads with immediate offset
+let accessSize = WordAccess, opExtentAlign = 2 in
+defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
+
+// post increment doubleword loads with immediate offset
+let accessSize = DoubleWordAccess, opExtentAlign = 3 in
+defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
+
+// Rd=memb[u]h(Rx++#s4:1)
+// Rdd=memb[u]h(Rx++#s4:2)
+let accessSize = HalfWordAccess, opExtentAlign = 1 in {
+  def L2_loadbsw2_pi   : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
+  def L2_loadbzw2_pi   : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
+}
+let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
+  def L2_loadbsw4_pi   : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
+  def L2_loadbzw4_pi   : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment fifo loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
+  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
+  (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+  "$dst = "#mnemonic#"($src2++#$offset)" ,
+  [], "$src2 = $dst2, $src1 = $dst" > ,
+  PredNewRel {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> offset;
+    bits<4> offsetBits;
+
+    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
+                                                  /* s4_0Imm */ offset{3-0});
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13-12} = 0b00;
+    let Inst{8-5} = offsetBits;
+    let Inst{4-0}   = dst;
+  }
+
+// Ryy=memh_fifo(Rx++#s4:1)
+// Ryy=memb_fifo(Rx++#s4:0)
+let accessSize = ByteAccess in
+def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment loads with register offset.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
+                       MemAccessSize AccessSz>
+  : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
+              (ins IntRegs:$src1, ModRegs:$src2),
+  "$dst = "#mnemonic#"($src1++$src2)" ,
+  [], "$src1 = $_dst_" > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<1> src2;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2;
+    let Inst{12}    = 0b0;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
+  }
+
+let hasNewValue = 1 in {
+  def L2_loadrb_pr  : T_load_pr <"memb",  IntRegs, 0b1000, ByteAccess>;
+  def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
+  def L2_loadrh_pr  : T_load_pr <"memh",  IntRegs, 0b1010, HalfWordAccess>;
+  def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
+  def L2_loadri_pr  : T_load_pr <"memw",  IntRegs, 0b1100, WordAccess>;
+
+  def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
+}
+
+def L2_loadrd_pr   : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+                        (ins IntRegs:$addr, s11_2Ext:$off),
+                        ".error \"should not emit\"", []>;
+// Load modifier.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+                        (ins IntRegs:$addr, s11_2Ext:$off),
+                        ".error \"should not emit\"", []>;
+
+let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
+  def L2_deallocframe : LDInst<(outs), (ins),
+                     "deallocframe",
+                     []> {
+    let IClass = 0b1001;
+
+    let Inst{27-16} = 0b000000011110;
+    let Inst{13} = 0b0;
+    let Inst{4-0} = 0b11110;
+}
+
+// Load / Post increment circular addressing mode.
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
+class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
+  : LDInst <(outs RC:$dst, IntRegs:$_dst_),
+            (ins IntRegs:$Rz, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+  "$Rz = $_dst_" > {
+    bits<5> dst;
+    bits<5> Rz;
+    bit Mu;
+
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12} = 0b0;
+    let Inst{9} = 0b1;
+    let Inst{7} = 0b0;
+    let Inst{4-0} = dst;
+ }
+
+let accessSize = ByteAccess in {
+  def L2_loadrb_pcr  : T_load_pcr <"memb",  IntRegs, 0b1000>;
+  def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
+}
+
+let accessSize = HalfWordAccess in {
+  def L2_loadrh_pcr   : T_load_pcr <"memh",   IntRegs, 0b1010>;
+  def L2_loadruh_pcr  : T_load_pcr <"memuh",  IntRegs, 0b1011>;
+  def L2_loadbsw2_pcr : T_load_pcr <"membh",  IntRegs, 0b0001>;
+  def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
+}
+
+let accessSize = WordAccess in {
+  def  L2_loadri_pcr  : T_load_pcr <"memw", IntRegs, 0b1100>;
+  let hasNewValue = 0 in {
+    def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
+    def L2_loadbsw4_pcr : T_load_pcr <"membh",  DoubleRegs, 0b0111>;
+  }
+}
+
+let accessSize = DoubleWordAccess in
+def L2_loadrd_pcr  : T_load_pcr <"memd", DoubleRegs, 0b1110>;
+
+// Load / Post increment circular addressing mode.
+let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
+  : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+            (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
+  "$Rz = $_dst_, $dst = $_src_" > {
+    bits<5> dst;
+    bits<5> Rz;
+    bit Mu;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13}    = Mu;
+    let Inst{12}    = 0b0;
+    let Inst{9}     = 0b1;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
+ }
+
+def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_load_pci <string mnemonic, RegisterClass RC,
+                  Operand ImmOp, bits<4> MajOp>
+  : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
+             (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
+  "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
+  "$Rz = $_dst_"> {
+    bits<5> dst;
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    let IClass      = 0b1001;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13}    = Mu;
+    let Inst{12}    = 0b0;
+    let Inst{9}     = 0b0;
+    let Inst{8-5}   = offsetBits;
+    let Inst{4-0}   = dst;
+  }
+
+// Byte variants of circ load
+let accessSize = ByteAccess in {
+  def L2_loadrb_pci  : T_load_pci <"memb",  IntRegs, s4_0Imm, 0b1000>;
+  def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
+}
+
+// Half word variants of circ load
+let accessSize = HalfWordAccess in {
+  def L2_loadrh_pci   : T_load_pci <"memh",   IntRegs, s4_1Imm, 0b1010>;
+  def L2_loadruh_pci  : T_load_pci <"memuh",  IntRegs, s4_1Imm, 0b1011>;
+  def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
+  def L2_loadbsw2_pci : T_load_pci <"membh",  IntRegs, s4_1Imm, 0b0001>;
+}
+
+// Word variants of circ load
+let accessSize = WordAccess in
+def L2_loadri_pci   : T_load_pci <"memw",   IntRegs,    s4_2Imm, 0b1100>;
+
+let accessSize = WordAccess, hasNewValue = 0 in {
+  def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
+  def L2_loadbsw4_pci : T_load_pci <"membh",  DoubleRegs, s4_2Imm, 0b0111>;
+}
+
+let accessSize = DoubleWordAccess, hasNewValue = 0 in
+def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
+
+
+// TODO: memb_fifo and memh_fifo must take destination register as input.
+// One-off circ loads - not enough in common to break into a class.
+let accessSize = ByteAccess in
+def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
+
+let accessSize = HalfWordAccess, opExtentAlign = 1 in
+def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
+
+// L[24]_load[wd]_locked: Load word/double with lock.
+let isSoloAX = 1 in
+class T_load_locked <string mnemonic, RegisterClass RC>
+  : LD0Inst <(outs RC:$dst),
+             (ins IntRegs:$src),
+    "$dst = "#mnemonic#"($src)"> {
+    bits<5> dst;
+    bits<5> src;
+    let IClass = 0b1001;
+    let Inst{27-21} = 0b0010000;
+    let Inst{20-16} = src;
+    let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
+    let Inst{5}   = 0;
+    let Inst{4-0} = dst;
+}
+let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
+  def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
+let accessSize = DoubleWordAccess in
+  def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
+
+// S[24]_store[wd]_locked: Store word/double conditionally.
+let isSoloAX = 1, isPredicateLate = 1 in
+class T_store_locked <string mnemonic, RegisterClass RC>
+  : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
+    mnemonic#"($Rs, $Pd) = $Rt"> {
+    bits<2> Pd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1010;
+    let Inst{27-23} = 0b00001;
+    let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
+    let Inst{21} = 0b1;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{1-0} = Pd;
+}
+
+let accessSize = WordAccess in
+def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
+
+let accessSize = DoubleWordAccess in
+def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed loads with auto-increment register
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_load_pbr<string mnemonic, RegisterClass RC,
+                            MemAccessSize addrSize, bits<4> majOp>
+  : LDInst
+    <(outs RC:$dst, IntRegs:$_dst_),
+     (ins IntRegs:$Rz, ModRegs:$Mu),
+     "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
+      [] , "$Rz = $_dst_" > {
+
+      let accessSize = addrSize;
+
+      bits<5> dst;
+      bits<5> Rz;
+      bits<1> Mu;
+
+      let IClass = 0b1001;
+
+      let Inst{27-25} = 0b111;
+      let Inst{24-21} = majOp;
+      let Inst{20-16} = Rz;
+      let Inst{13} = Mu;
+      let Inst{12} = 0b0;
+      let Inst{7} = 0b0;
+      let Inst{4-0} = dst;
+  }
+
+let hasNewValue =1, opNewValue = 0 in {
+  def L2_loadrb_pbr   : T_load_pbr <"memb",  IntRegs, ByteAccess, 0b1000>;
+  def L2_loadrub_pbr  : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
+  def L2_loadrh_pbr   : T_load_pbr <"memh",  IntRegs, HalfWordAccess, 0b1010>;
+  def L2_loadruh_pbr  : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
+  def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
+  def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
+  def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
+}
+
+def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
+def L2_loadbsw4_pbr : T_load_pbr <"membh",  DoubleRegs, WordAccess, 0b0111>;
+def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
+
+def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
+def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
+                                   HalfWordAccess, 0b0010>;
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
+                 bit hasShift, bit isUnsigned>
+  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+                                       #", $Rt."#!if(LHbits{0},"h)","l)")
+                                       #!if(hasShift,":<<1","")
+                                       #!if(isRnd,":rnd","")
+                                       #!if(isSat,":sat",""),
+  [], "", M_tc_3x_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isRnd;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
+def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
+def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
+def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
+def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
+def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
+def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
+def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
+
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
+def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
+def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
+def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
+def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
+def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
+def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
+def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
+def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
+def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
+def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
+def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
+def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
+def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
+def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
+def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
+
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+let Defs = [USR_OVF] in {
+  def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
+  def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
+  def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
+  def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
+  def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
+  def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
+  def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
+  def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
+
+  def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
+  def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
+  def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
+                 bit hasShift, bit isUnsigned >
+  : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+                              #"($Rs."#!if(LHbits{1},"h","l")
+                              #", $Rt."#!if(LHbits{0},"h)","l)")
+                              #!if(hasShift,":<<1","")
+                              #!if(isSat,":sat",""),
+  [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+    let Inst{27-24} = 0b1110;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isNac;
+    let Inst{7} = isSat;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
+def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
+def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
+def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
+def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
+def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
+def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
+def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
+
+//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
+def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
+def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
+def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
+def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
+def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
+def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
+def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
+
+//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
+def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
+def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
+def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
+def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
+def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
+def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
+def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
+
+//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
+def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
+def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
+def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
+def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
+def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
+def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
+def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
+def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
+
+//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
+def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
+def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
+
+//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
+def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
+def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
+def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
+  : MInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
+                                #"($Rs."#!if(LHbits{1},"h","l")
+                                #", $Rt."#!if(LHbits{0},"h)","l)")
+                                #!if(hasShift,":<<1",""),
+  [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0110;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isNac;
+    let Inst{7} = 0;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
+def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
+def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
+def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
+
+def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
+def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
+def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
+def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
+
+def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
+def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
+def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
+def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
+
+def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
+def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
+def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
+def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
+
+def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
+def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
+def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
+def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
+
+def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
+def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
+def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
+def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
+
+def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
+def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
+def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
+def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
+
+def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
+def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
+def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
+def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multipy
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//===----------------------------------------------------------------------===//
+class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
+                  bit isRnd, bit isSat >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                              #!if(isRnd,":rnd","")
+                              #!if(isSat,":sat",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+let Defs = [USR_OVF] in {
+def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
+def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
+def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
+def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyh_s0:  T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
+def M2_mmpyh_s1:  T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
+def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
+def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyl_s0:  T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
+def M2_mmpyl_s1:  T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
+def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
+def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyuh_s0:  T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
+def M2_mmpyuh_s1:  T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
+def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
+def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmpyul_s0:  T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
+def M2_mmpyul_s1:  T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
+def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
+def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
+}
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
+                   bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
+                   string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
+  : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
+  "$dst = "#mnemonic
+           #"($src1, $src2"#op2Suffix#")"
+           #!if(MajOp{2}, ":<<1", "")
+           #!if(isRnd, ":rnd", "")
+           #!if(isSat, ":sat", "")
+           #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src2;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = dst;
+  }
+
+class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
+  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
+
+class T_MType_dd  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0 >
+  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
+
+class T_MType_rr1  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                    bit isSat = 0, bit isRnd = 0 >
+  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
+
+class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0, string op2str = "" >
+  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
+
+def M2_vradduh    : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
+def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy",   0b000, 0b000, 1, 1>;
+def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy",   0b100, 0b000, 1, 1>;
+
+let CextOpcode = "mpyi", InputType = "reg" in
+def M2_mpyi    : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
+
+def M2_mpy_up  : T_MType_rr1 <"mpy",  0b000, 0b001>;
+def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
+
+def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
+
+def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
+def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
+
+def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
+def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
+
+def M2_cmpyrs_s0  : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
+def M2_cmpyrs_s1  : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
+def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
+def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
+
+// V4 Instructions
+def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
+def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
+def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
+def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
+
+def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
+def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
+  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
+  "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
+   pattern, "", M_tc_3x_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<8> u8;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0000;
+    let Inst{23} = isNeg;
+    let Inst{13} = 0b0;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rs;
+    let Inst{12-5} = u8;
+  }
+
+let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
+def M2_mpysip : T_MType_mpy_ri <0, u8_0Ext, []>;
+
+def M2_mpysin :  T_MType_mpy_ri <1, u8_0Imm, []>;
+
+// Assember mapped to M2_mpyi
+let isAsmParserOnly = 1 in
+def M2_mpyui : MInst<(outs IntRegs:$dst),
+                     (ins IntRegs:$src1, IntRegs:$src2),
+  "$dst = mpyui($src1, $src2)">;
+
+// Rd=mpyi(Rs,#m9)
+// s9 is NOT the same as m9 - but it works.. so far.
+// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
+// depending on the value of m9. See Arch Spec.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
+    CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
+    isAsmParserOnly = 1 in
+def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9_0Ext:$src2),
+    "$dst = mpyi($src1, #$src2)", []>, ImmRegRel;
+
+let hasNewValue = 1, isExtendable = 1,  opExtentBits = 8, opExtendable = 3,
+    InputType = "imm" in
+class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
+                      list<dag> pattern = []>
+ : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
+  "$dst "#mnemonic#"($src2, #$src3)",
+  pattern, "$src1 = $dst", M_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src2;
+    bits<8> src3;
+
+    let IClass = 0b1110;
+
+    let Inst{27-26} = 0b00;
+    let Inst{25-23} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b0;
+    let Inst{12-5} = src3;
+    let Inst{4-0} = dst;
+  }
+
+let InputType = "reg", hasNewValue = 1 in
+class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                      bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
+                      bit isSat = 0, bit isShift = 0>
+  : MInst < (outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+  "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
+                          #!if(isShift, ":<<1", "")
+                          #!if(isSat, ":sat", ""),
+  pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> src3;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = !if(isSwap, src3, src2);
+    let Inst{13} = 0b0;
+    let Inst{12-8} = !if(isSwap, src2, src3);
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
+  def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8_0Ext, []>, ImmRegRel;
+
+  def M2_maci   : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, []>, ImmRegRel;
+}
+
+let CextOpcode = "ADD_acc" in {
+  let isExtentSigned = 1 in
+  def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8_0Ext, []>, ImmRegRel;
+
+  def M2_acci  : T_MType_acc_rr <"+= add",  0b000, 0b001, 0, []>, ImmRegRel;
+}
+
+let CextOpcode = "SUB_acc" in {
+  let isExtentSigned = 1 in
+  def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8_0Ext>, ImmRegRel;
+
+  def M2_nacci  : T_MType_acc_rr <"-= add",  0b100, 0b001, 0>, ImmRegRel;
+}
+
+let Itinerary = M_tc_3x_SLOT23 in
+def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8_0Ext>;
+
+def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
+def M2_subacc : T_MType_acc_rr <"+= sub",  0b000, 0b011, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- XType Vector Instructions
+//===----------------------------------------------------------------------===//
+class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
+  : MInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
+  [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
+  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
+  "$Rdd = "#opc#"($Rtt, $Rss)",
+  [], "",M_tc_2_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = 0b000;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector reduce add unsigned bytes: Rdd32=vrmpybu(Rss32,Rtt32)
+def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
+def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
+def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
+def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
+
+// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
+def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+def M2_vrcmpyi_s0:  T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
+def M2_vrcmaci_s0:  T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
+def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
+
+def M2_vrcmpyr_s0:  T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
+def M2_vrcmacr_s0:  T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
+def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
+
+// Vector reduce halfwords:
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
+def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Vector Multipy with accumulation.
+// Used for complex multiply real or imaginary, dual multiply and even halfwords
+//===----------------------------------------------------------------------===//
+let Defs = [USR_OVF] in
+class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
+                          bit hasShift, bit isRnd >
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                               #!if(isRnd,":rnd","")#":sat",
+  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
+                      bit hasShift, bit isRnd >
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
+                               #!if(isRnd,":rnd",""),
+  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmacls_s1:  T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
+def M2_mmacls_s0:  T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
+def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
+def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
+
+def M2_mmachs_s1:  T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
+def M2_mmachs_s0:  T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
+def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
+def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def M2_mmaculs_s1:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
+def M2_mmaculs_s0:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
+def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
+def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
+
+def M2_mmacuhs_s1:  T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
+def M2_mmacuhs_s0:  T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
+def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
+def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+def M2_vmac2es:    T_M2_vmpy_acc     <"vmpyeh", 0b001, 0b010, 0, 0>;
+def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
+def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:sat]
+def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
+def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
+def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template Class -- Multiply signed/unsigned halfwords with and without
+// saturation and rounding
+//===----------------------------------------------------------------------===//
+class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
+  : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
+                                       #", $Rt."#!if(LHbits{0},"h)","l)")
+                                       #!if(hasShift,":<<1","")
+                                       #!if(isRnd,":rnd",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0100;
+    let Inst{23} = hasShift;
+    let Inst{22} = isUnsigned;
+    let Inst{21} = isRnd;
+    let Inst{6-5} = LHbits;
+    let Inst{4-0} = Rdd;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+}
+
+def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
+def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
+def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
+def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
+
+def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
+def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
+def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
+def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
+
+def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
+def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
+def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
+def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
+
+def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
+def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
+def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
+def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
+
+//Rdd=mpyu(Rs.[HL],Rt.[HL])[:<<1]
+def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
+def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
+def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
+def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
+
+def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
+def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
+def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
+def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                     bit isSat, bit hasShift, bit isConj>
+   : MInst <(outs DoubleRegs:$Rdd),
+            (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
+                                #!if(hasShift,":<<1","")
+                                #!if(isSat,":sat",""),
+  [] > {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rdd;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template Class for xtype mpy with accumulation into 64-bit:
+// Vector multiply
+// Complex multiply
+// multiply 32X32 and use full result
+//===----------------------------------------------------------------------===//
+class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
+                         bit isSat, bit hasShift, bit isConj>
+  : MInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
+                                   #!if(hasShift,":<<1","")
+                                   #!if(isSat,":sat",""),
+
+  [] , "$dst2 = $Rxx" > {
+    bits<5> Rxx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b0111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rxx;
+  }
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs,Rt)
+def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy",  "+", 0b000, 0b000, 0, 0, 0>;
+def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy",  "-", 0b001, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
+def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
+
+// Complex multiply real or imaginary
+// Rxx=cmpy[ir](Rs,Rt)
+def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
+def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
+def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
+
+// Complex multiply
+// Rdd=cmpy(Rs,Rt)[:<<]:sat
+def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
+def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
+
+// Rdd=cmpy(Rs,Rt*)[:<<]:sat
+def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
+def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def M2_cmacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
+def M2_cnacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
+def M2_cmacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
+def M2_cnacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
+def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
+def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
+def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<]:sat
+//let Defs = [USR_OVF] in {
+  def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
+  def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
+//}
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def M2_vmac2     : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
+def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
+def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VB +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VB -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+// Store doubleword.
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<4> MajOp, bit isHalf >
+  : STInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+  mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
+  [], "$src1 = $_dst_" >,
+  AddrModeRel {
+    bits<5> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
+
+    let IClass = 0b1010;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src2;
+    let Inst{7}     = 0b0;
+    let Inst{6-3}   = offsetBits;
+    let Inst{1}     = 0b0;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
+class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
+                   bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
+  : STInst <(outs IntRegs:$_dst_),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
+  [], "$src2 = $_dst_" >,
+  AddrModeRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<7> offset;
+    bits<5> src3;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
+                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0})));
+
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-25} = 0b101;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12-8} = src3;
+    let Inst{7} = isPredNew;
+    let Inst{6-3} = offsetBits;
+    let Inst{2} = isPredNot;
+    let Inst{1-0} = src1;
+  }
+
+multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
+
+  let BaseOpcode = "POST_"#BaseOp in {
+    def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
+
+    // Predicated
+    def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
+    def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
+
+    // Predicated new
+    def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+                                          isHalf, 0, 1>;
+    def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
+                                          isHalf, 1, 1>;
+  }
+}
+
+let accessSize = ByteAccess in
+defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
+
+let accessSize = HalfWordAccess in
+defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
+
+let accessSize = WordAccess in
+defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
+
+let accessSize = DoubleWordAccess in
+defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
+
+let accessSize = HalfWordAccess, isNVStorable = 0 in
+defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment stores with register offset.
+//===----------------------------------------------------------------------===//
+class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                     MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
+  mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
+  [], "$src1 = $_dst_" > {
+    bits<5> src1;
+    bits<1> src2;
+    bits<5> src3;
+    let accessSize = AccessSz;
+
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13} = src2;
+    let Inst{12-8} = src3;
+    let Inst{7} = 0b0;
+  }
+
+def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
+def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
+def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
+def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
+def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
+
+let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+                  bits<3> MajOp, bit isH = 0>
+  : STInst <(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+  mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
+  AddrModeRel, ImmRegRel {
+    bits<5> src1;
+    bits<14> src2; // Actual address offset
+    bits<5> src3;
+    bits<11> offsetBits; // Represents offset encoding
+
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
+                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
+                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
+                                        /* s11_0Ext */ 11)));
+    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
+                     !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
+                     !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
+                                      /* s11_0Ext */ src2{10-0})));
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+    let IClass = 0b1010;
+
+    let Inst{27} = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24} = 0b1;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13} = offsetBits{8};
+    let Inst{12-8} = src3;
+    let Inst{7-0} = offsetBits{7-0};
+  }
+
+let opExtendable = 2, isPredicated = 1 in
+class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
+                   bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
+  : STInst <(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
+  [],"",V2LDST_tc_st_SLOT01 >,
+   AddrModeRel, ImmRegRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> src3; // Actual address offset
+    bits<5> src4;
+    bits<6> offsetBits; // Represents offset encoding
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = PredNot;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
+                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
+                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
+                                        /* u6_0Ext */ 6)));
+    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
+                     !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
+                     !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
+                                      /* u6_0Ext */ src3{5-0})));
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+    let IClass = 0b0100;
+
+    let Inst{27} = 0b0;
+    let Inst{26} = PredNot;
+    let Inst{25} = isPredNew;
+    let Inst{24} = 0b0;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13} = offsetBits{5};
+    let Inst{12-8} = src4;
+    let Inst{7-3} = offsetBits{4-0};
+    let Inst{1-0} = src1;
+  }
+
+let isExtendable = 1, hasSideEffects = 0 in
+multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                 Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
+
+    // Predicated
+    def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
+    def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
+
+    // Predicated new
+    def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+                                         MajOp, 0, 1, isH>;
+    def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
+                                         MajOp, 1, 1, isH>;
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "imm" in {
+  let accessSize = ByteAccess in
+    defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
+
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+    defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
+
+  let accessSize = WordAccess, opExtentAlign = 2 in
+    defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
+    defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+                            u6_3Ext, 0b110>;
+
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+    defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
+                            u6_1Ext, 0b011, 1>;
+}
+
+// Store predicate.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+      (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
+      ".error \"should not emit\"", []>;
+// Store modifier.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+      (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
+      ".error \"should not emit\"", []>;
+
+// S2_allocframe: Allocate stack frame.
+let Defs = [R29, R30], Uses = [R29, R31, R30],
+    hasSideEffects = 0, accessSize = DoubleWordAccess in
+def S2_allocframe: ST0Inst <
+  (outs), (ins u11_3Imm:$u11_3),
+  "allocframe(#$u11_3)" > {
+    bits<14> u11_3;
+
+    let IClass = 0b1010;
+    let Inst{27-16} = 0b000010011101;
+    let Inst{13-11} = 0b000;
+    let Inst{10-0} = u11_3{13-3};
+  }
+
+// S2_storer[bhwdf]_pci: Store byte/half/word/double.
+// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
+let Uses = [CS], addrMode = PostInc in
+class T_store_pci <string mnemonic, RegisterClass RC,
+                         Operand Imm, bits<4>MajOp,
+                         MemAccessSize AlignSize, string RegSrc = "Rt">
+  : STInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
+  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
+  [] ,
+  "$Rz = $_dst_" > {
+    bits<5> Rz;
+    bits<7> offset;
+    bits<1> Mu;
+    bits<5> Rt;
+    let accessSize = AlignSize;
+    let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+                       !if(!eq(RegSrc,"Rt.h"), 0, 1));
+
+    let IClass = 0b1010;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-8} = Rt;
+    let Inst{7} = 0b0;
+    let Inst{6-3} =
+      !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
+      !if (!eq(!cast<string>(AlignSize), "WordAccess"),       offset{5-2},
+      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"),   offset{4-1},
+                                       /* ByteAccess */       offset{3-0})));
+    let Inst{1} = 0b0;
+  }
+
+def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
+                                 ByteAccess>;
+def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
+                                 HalfWordAccess>;
+def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
+                                 HalfWordAccess, "Rt.h">;
+def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
+                                 WordAccess>;
+def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
+                                 DoubleWordAccess>;
+
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
+    addrMode = PostInc in
+class T_storenew_pci <string mnemonic, Operand Imm,
+                             bits<2>MajOp, MemAccessSize AlignSize>
+  : NVInst < (outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
+  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
+  [],
+  "$Rz = $_dst_"> {
+    bits<5> Rz;
+    bits<6> offset;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-21} = 0b1001101;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = Nt;
+    let Inst{7} = 0b0;
+    let Inst{6-3} =
+      !if (!eq(!cast<string>(AlignSize), "WordAccess"),     offset{5-2},
+      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
+                                       /* ByteAccess */     offset{3-0}));
+    let Inst{1} = 0b0;
+  }
+
+def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
+def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
+def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Circular stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let Uses = [CS], addrMode = PostInc in
+class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
+                               MemAccessSize AlignSize, string RegSrc = "Rt">
+  : STInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
+  #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
+  [],
+  "$Rz = $_dst_" > {
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<5> Rt;
+
+    let accessSize = AlignSize;
+    let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
+                       !if(!eq(RegSrc,"Rt.h"), 0, 1));
+
+    let IClass = 0b1010;
+    let Inst{27-25} = 0b100;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-8} = Rt;
+    let Inst{7} = 0b0;
+    let Inst{1} = 0b1;
+  }
+
+def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
+def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
+def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
+def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
+def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
+                                 HalfWordAccess, "Rt.h">;
+
+//===----------------------------------------------------------------------===//
+// Circular .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+    addrMode = PostInc in
+class T_storenew_pcr <string mnemonic, bits<2>MajOp,
+                                   MemAccessSize AlignSize>
+  : NVInst <(outs IntRegs:$_dst_),
+  (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+  #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new" ,
+  [] ,
+  "$Rz = $_dst_"> {
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let accessSize = AlignSize;
+
+    let IClass = 0b1010;
+    let Inst{27-21} = 0b1001101;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = Nt;
+    let Inst{7} = 0b0;
+    let Inst{1} = 0b1;
+  }
+
+def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = PostInc in
+class T_store_pbr<string mnemonic, RegisterClass RC,
+                            MemAccessSize addrSize, bits<3> majOp,
+                            bit isHalf = 0>
+  : STInst
+    <(outs IntRegs:$_dst_),
+     (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
+     #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if (!eq(isHalf, 1), ".h", ""),
+     [], "$Rz = $_dst_" > {
+
+      let accessSize = addrSize;
+
+      bits<5> Rz;
+      bits<1> Mu;
+      bits<5> src;
+
+      let IClass = 0b1010;
+
+      let Inst{27-24} = 0b1111;
+      let Inst{23-21} = majOp;
+      let Inst{7} = 0b0;
+      let Inst{20-16} = Rz;
+      let Inst{13} = Mu;
+      let Inst{12-8} = src;
+    }
+
+let isNVStorable = 1 in {
+  let BaseOpcode = "S2_storerb_pbr" in
+  def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
+                                             0b000>, NewValueRel;
+  let BaseOpcode = "S2_storerh_pbr" in
+  def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
+                                             0b010>, NewValueRel;
+  let BaseOpcode = "S2_storeri_pbr" in
+  def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
+                                             0b100>, NewValueRel;
+}
+
+def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
+def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// Bit-reversed .new stores with auto-increment register
+//===----------------------------------------------------------------------===//
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
+    hasSideEffects = 0, addrMode = PostInc in
+class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
+  : NVInst <(outs IntRegs:$_dst_),
+            (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
+     #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
+     "$Rz = $_dst_">, NewValueRel {
+    let accessSize = addrSize;
+    bits<5> Rz;
+    bits<1> Mu;
+    bits<3> Nt;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1111101;
+    let Inst{12-11} = majOp;
+    let Inst{7} = 0b0;
+    let Inst{20-16} = Rz;
+    let Inst{13} = Mu;
+    let Inst{10-8} = Nt;
+  }
+
+let BaseOpcode = "S2_storerb_pbr" in
+def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
+
+let BaseOpcode = "S2_storerh_pbr" in
+def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
+
+let BaseOpcode = "S2_storeri_pbr" in
+def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template class for S_2op instructions.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+                RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
+  : SInst <(outs RCOut:$dst), (ins RCIn:$src),
+  "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
+  [], "", S_2op_tc_1_SLOT23 > {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-22} = MajOp;
+    let Inst{21} = 0b0;
+    let Inst{20-16} = src;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
+  : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+  : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
+
+let hasNewValue = 1 in
+class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
+  : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
+
+// Vector sign/zero extend
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
+  def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
+  def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
+  def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
+}
+
+// Vector splat bytes/halfwords
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
+  def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
+}
+
+// Sign extend word to doubleword
+def A2_sxtw   : T_S2op_1_di <"sxtw", 0b01, 0b000>;
+
+// Vector saturate and pack
+let Defs = [USR_OVF] in {
+  def S2_svsathb  : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
+  def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
+  def S2_vsathb   : T_S2op_1_id <"vsathb", 0b00, 0b110>;
+  def S2_vsathub  : T_S2op_1_id <"vsathub", 0b00, 0b000>;
+  def S2_vsatwh   : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
+  def S2_vsatwuh  : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
+}
+
+// Vector truncate
+def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
+def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
+
+// Swizzle the bytes of a word
+def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
+
+// Saturate
+let Defs = [USR_OVF] in {
+  def A2_sat   : T_S2op_1_id <"sat", 0b11, 0b000>;
+  def A2_satb  : T_S2op_1_ii <"satb", 0b11, 0b111>;
+  def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
+  def A2_sath  : T_S2op_1_ii <"sath", 0b11, 0b100>;
+  def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
+  def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
+}
+
+let Itinerary = S_2op_tc_2_SLOT23 in {
+  // Vector round and pack
+  def S2_vrndpackwh   : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
+
+  let Defs = [USR_OVF] in
+  def S2_vrndpackwhs  : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
+
+  // Bit reverse
+  def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
+
+  // Absolute value word
+  def A2_abs    : T_S2op_1_ii <"abs", 0b10, 0b100>;
+
+  let Defs = [USR_OVF] in
+  def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
+
+  // Negate with saturation
+  let Defs = [USR_OVF] in
+  def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
+}
+
+class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
+                RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
+                bit isSat, bit isRnd, list<dag> pattern = []>
+  : SInst <(outs RCOut:$dst),
+  (ins RCIn:$src, u5_0Imm:$u5),
+  "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
+                                   #!if(isRnd, ":rnd", ""),
+  pattern, "", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src;
+    bits<5> u5;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src;
+    let Inst{13} = 0b0;
+    let Inst{12-8} = u5;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
+
+let hasNewValue = 1 in
+class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                   bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
+  : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
+              isSat, isRnd, pattern>;
+
+class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
+  : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0, []>;
+
+// Vector arithmetic shift right by immediate with truncate and pack
+def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
+
+// Arithmetic/logical shift right/left by immediate
+let Itinerary = S_2op_tc_1_SLOT23 in {
+  def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
+  def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
+  def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
+}
+
+// Shift left by immediate with saturation
+let Defs = [USR_OVF] in
+def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
+
+// Shift right with round
+def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
+
+let isAsmParserOnly = 1 in
+def S2_asr_i_r_rnd_goodsyntax
+  : SInst <(outs IntRegs:$dst), (ins  IntRegs:$src, u5_0Imm:$u5),
+  "$dst = asrrnd($src, #$u5)",
+  [], "", S_2op_tc_1_SLOT23>;
+
+let isAsmParserOnly = 1 in
+def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
+  "$dst = not($src)">;
+
+class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
+  : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+           "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
+  bits<5> Rss;
+  bits<5> Rdd;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0;
+  let Inst{23-22} = MajOp;
+  let Inst{20-16} = Rss;
+  let Inst{7-5} = minOp;
+  let Inst{4-0} = Rdd;
+}
+
+def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
+def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
+def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
+
+// Innterleave/deinterleave
+def S2_interleave   : T_S2op_3 <"interleave",   0b11, 0b101>;
+def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
+
+// Vector Complex conjugate
+def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
+
+// Vector saturate without pack
+def S2_vsathb_nopack  : T_S2op_3 <"vsathb",  0b00, 0b111>;
+def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
+def S2_vsatwh_nopack  : T_S2op_3 <"vsatwh",  0b00, 0b110>;
+def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
+
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def A2_vabsh    : T_S2op_3 <"vabsh", 0b01, 0b100>;
+def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
+
+// Vector absolute value words with and without saturation
+def A2_vabsw    : T_S2op_3 <"vabsw", 0b01, 0b110>;
+def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT +
+//===----------------------------------------------------------------------===//
+// Bit count
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
+                dag Out, dag Inp>
+    : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  let IClass = 0b1000;
+  let Inst{27} = 0b1;
+  let Inst{26} = Is32;
+  let Inst{25-24} = 0b00;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
+
+class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
+    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
+                      (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
+
+class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
+    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
+                      (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
+
+def S2_cl0     : T_COUNT_LEADING_32<"cl0",     0b000, 0b101>;
+def S2_cl1     : T_COUNT_LEADING_32<"cl1",     0b000, 0b110>;
+def S2_ct0     : T_COUNT_LEADING_32<"ct0",     0b010, 0b100>;
+def S2_ct1     : T_COUNT_LEADING_32<"ct1",     0b010, 0b101>;
+def S2_cl0p    : T_COUNT_LEADING_64<"cl0",     0b010, 0b010>;
+def S2_cl1p    : T_COUNT_LEADING_64<"cl1",     0b010, 0b100>;
+def S2_clb     : T_COUNT_LEADING_32<"clb",     0b000, 0b100>;
+def S2_clbp    : T_COUNT_LEADING_64<"clb",     0b010, 0b000>;
+def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
+
+// The 64-bit counts leading/trailing are defined in HexagonInstrInfoV4.td.
+
+// Bit set/clear/toggle
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp>
+    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5_0Imm:$u5),
+            "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> u5;
+  let IClass = 0b1000;
+  let Inst{27-21} = 0b1100110;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b0;
+  let Inst{12-8} = u5;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_SCT_BIT_REG<string MnOp, bits<2> MinOp>
+    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-22} = 0b011010;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-6} = MinOp;
+  let Inst{4-0} = Rd;
+}
+
+def S2_clrbit_i    : T_SCT_BIT_IMM<"clrbit",    0b001>;
+def S2_setbit_i    : T_SCT_BIT_IMM<"setbit",    0b000>;
+def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
+def S2_clrbit_r    : T_SCT_BIT_REG<"clrbit",    0b01>;
+def S2_setbit_r    : T_SCT_BIT_REG<"setbit",    0b00>;
+def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
+
+// Bit test
+
+let hasSideEffects = 0 in
+class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5_0Imm:$u5),
+            "$Pd = "#MnOp#"($Rs, #$u5)",
+            [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> u5;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0101;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0;
+  let Inst{12-8} = u5;
+  let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0 in
+class T_TEST_BIT_REG<string MnOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Pd = "#MnOp#"($Rs, $Rt)",
+            [], "", S_3op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-22} = 0b011100;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{1-0} = Pd;
+}
+
+def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>;
+def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>;
+
+let hasSideEffects = 0 in
+class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6_0Imm:$u6),
+            "$Pd = "#MnOp#"($Rs, #$u6)",
+            [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<6> u6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b0101;
+  let Inst{23-22} = MajOp;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = u6;
+  let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0 in
+class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg>
+    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+            "$Pd = "#MnOp#"($Rs, $Rt)",
+            [], "", S_3op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+  let IClass = 0b1100;
+  let Inst{27-24} = 0b0111;
+  let Inst{23-22} = MajOp;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{1-0} = Pd;
+}
+
+def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>;
+def C2_bitsclr  : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
+def C2_bitsset  : T_TEST_BITS_REG<"bitsset", 0b01, 0>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PERM +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Predicate transfer.
+let hasSideEffects = 0, hasNewValue = 1 in
+def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
+      "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<2> Ps;
+
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1001;
+  let Inst{22} = 0b1;
+  let Inst{17-16} = Ps;
+  let Inst{4-0} = Rd;
+}
+
+// Transfer general register to predicate.
+let hasSideEffects = 0 in
+def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
+      "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<5> Rs;
+
+  let IClass = 0b1000;
+  let Inst{27-21} = 0b0101010;
+  let Inst{20-16} = Rs;
+  let Inst{1-0} = Pd;
+}
+
+let hasSideEffects = 0, isCodeGenOnly = 1 in
+def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
+     "$dst = $src">;
+
+//===----------------------------------------------------------------------===//
+// STYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp,
+                   Operand Imm, list<dag> pattern = [], bit isRnd = 0>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
+           "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
+           pattern> {
+  bits<5> src1;
+  bits<5> dst;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0;
+  let Inst{23-21} = MajOp;
+  let Inst{20-16} = src1;
+  let Inst{7-5} = MinOp;
+  let Inst{4-0} = dst;
+}
+
+class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp>
+  : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6_0Imm, []> {
+  bits<6> src2;
+  let Inst{13-8} = src2;
+}
+
+// Shift by immediate.
+def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
+def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
+def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
+
+// Shift left by small amount and add.
+let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
+def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
+                           (ins IntRegs:$Rt, IntRegs:$Rs, u3_0Imm:$u3),
+  "$Rd = addasl($Rt, $Rs, #$u3)" , [],
+  "", S_3op_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rt;
+    bits<5> Rs;
+    bits<3> u3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b0100000;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = u3;
+    let Inst{4-0}   = Rd;
+  }
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VW +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VW -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/USER +
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 1, isSoloAX = 1 in
+def Y2_barrier : SYSInst<(outs), (ins), "barrier", [],"",ST_tc_st_SLOT0> {
+  let Inst{31-28} = 0b1010;
+  let Inst{27-21} = 0b1000000;
+}
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER -
+//===----------------------------------------------------------------------===//
+
+// Generate frameindex addresses. The main reason for the offset operand is
+// that every instruction that is allowed to have frame index as an operand
+// will then have that operand followed by an immediate operand (the offset).
+// This simplifies the frame-index elimination code.
+//
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+    isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+  def PS_fi  : ALU32_ri<(outs IntRegs:$Rd),
+                        (ins IntRegs:$fi, s32_0Imm:$off), "">;
+  def PS_fia : ALU32_ri<(outs IntRegs:$Rd),
+                        (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
+}
+
+//===----------------------------------------------------------------------===//
+// CRUSER - Type.
+//===----------------------------------------------------------------------===//
+// HW loop
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
+           #mnemonic#"($offset, #$src2)",
+           [], "" , CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<10> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b100100;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2{9-5};
+    let Inst{12-8} = offset{8-4};
+    let Inst{7-5} = src2{4-2};
+    let Inst{4-3} = offset{3-2};
+    let Inst{1-0} = src2{1-0};
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
+           #mnemonic#"($offset, $src2)",
+           [], "" ,CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<5> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b000000;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2;
+    let Inst{12-8} = offset{8-4};
+    let Inst{4-3} = offset{3-2};
+  }
+
+multiclass LOOP_ri<string mnemonic> {
+  def i : LOOP_iBase<mnemonic, brtarget>;
+  def r : LOOP_rBase<mnemonic, brtarget>;
+
+  let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
+    def iext: LOOP_iBase<mnemonic, brtargetExt, 1>;
+    def rext: LOOP_rBase<mnemonic, brtargetExt, 1>;
+  }
+}
+
+
+let Defs = [SA0, LC0, USR] in
+defm J2_loop0 : LOOP_ri<"loop0">;
+
+// Interestingly only loop0's appear to set usr.lpcfg
+let Defs = [SA1, LC1] in
+defm J2_loop1 : LOOP_ri<"loop1">;
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop0",
+                       []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC1], Uses = [SA1, LC1] in {
+def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop1",
+                       []>;
+}
+
+// Pipelined loop instructions, sp[123]loop0
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_iBase<string SP, bits<2> op>
+  : CRInst <(outs), (ins brtarget:$r7_2, u10_0Imm:$U10),
+  "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
+    bits<9> r7_2;
+    bits<10> U10;
+
+    let IClass = 0b0110;
+
+    let Inst{22-21} = op;
+    let Inst{27-23} = 0b10011;
+    let Inst{20-16} = U10{9-5};
+    let Inst{12-8} = r7_2{8-4};
+    let Inst{7-5} = U10{4-2};
+    let Inst{4-3} = r7_2{3-2};
+    let Inst{1-0} = U10{1-0};
+  }
+
+let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
+    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, isPredicateLate = 1 in
+class SPLOOP_rBase<string SP, bits<2> op>
+  : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
+  "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
+    bits<9> r7_2;
+    bits<5> Rs;
+
+    let IClass = 0b0110;
+
+    let Inst{22-21} = op;
+    let Inst{27-23} = 0b00001;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = r7_2{8-4};
+    let Inst{4-3} = r7_2{3-2};
+  }
+
+multiclass SPLOOP_ri<string mnemonic, bits<2> op> {
+  def i : SPLOOP_iBase<mnemonic, op>;
+  def r : SPLOOP_rBase<mnemonic, op>;
+}
+
+defm J2_ploop1s : SPLOOP_ri<"1", 0b01>;
+defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
+defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
+
+// if (Rs[!>=<]=#0) jump:[t/nt]
+let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0,
+    hasSideEffects = 0 in
+class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
+  : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
+  "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
+    bits<5> Rs;
+    bits<15> r13_2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-24} = 0b0001;
+    let Inst{23-22} = op;
+    let Inst{12} = isTak;
+    let Inst{21} = r13_2{14};
+    let Inst{20-16} = Rs;
+    let Inst{11-1} = r13_2{12-2};
+    let Inst{13} = r13_2{13};
+  }
+
+multiclass J2_jump_compare_0<string compare, bits<2> op> {
+  def NAME    : J2_jump_0_Base<compare, 0, op>;
+  def NAME#pt : J2_jump_0_Base<compare, 1, op>;
+}
+
+defm J2_jumprz    : J2_jump_compare_0<"!=", 0b00>;
+defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
+defm J2_jumprnz   : J2_jump_compare_0<"==", 0b10>;
+defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
+
+// Transfer to/from Control/GPR Guest/GPR
+let hasSideEffects = 0 in
+class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
+  : CRInst <(outs CTRC:$dst), (ins RC:$src),
+  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b0110;
+
+    let Inst{27-25} = 0b001;
+    let Inst{24} = isDouble;
+    let Inst{23-21} = 0b001;
+    let Inst{20-16} = src;
+    let Inst{4-0} = dst;
+  }
+
+def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>;
+def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>;
+def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
+def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
+
+let hasSideEffects = 0 in
+class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle>
+  : CRInst <(outs RC:$dst), (ins CTRC:$src),
+  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
+    bits<5> dst;
+    bits<5> src;
+
+    let IClass = 0b0110;
+
+    let Inst{27-26} = 0b10;
+    let Inst{25} = isSingle;
+    let Inst{24-21} = 0b0000;
+    let Inst{20-16} = src;
+    let Inst{4-0} = dst;
+  }
+
+let hasNewValue = 1, opNewValue = 0 in
+def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
+def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
+def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
+def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
+
+// Y4_trace: Send value to etm trace.
+let isSoloAX = 1, hasSideEffects = 0 in
+def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
+  "trace($Rs)"> {
+    bits<5> Rs;
+
+    let IClass = 0b0110;
+    let Inst{27-21} = 0b0010010;
+    let Inst{20-16} = Rs;
+  }
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
+  : ALU32_ri<(outs IntRegs:$dst),
+              (ins u16_0Imm:$imm_value),
+              "$dst"#RegHalf#" = $imm_value", []> {
+    bits<5> dst;
+    bits<32> imm_value;
+    let IClass = 0b0111;
+
+    let Inst{27} = Rs;
+    let Inst{26-24} = MajOp;
+    let Inst{21} = MinOp;
+    let Inst{20-16} = dst;
+    let Inst{23-22} = imm_value{15-14};
+    let Inst{13-0} = imm_value{13-0};
+}
+
+let isAsmParserOnly = 1 in {
+  def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
+  def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
+}
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
+  def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
+                "$Rd = CONST32(#$v)", []>;
+  def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
+                "$Rd = CONST64(#$v)", []>;
+}
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              ".error \"should not emit\" ", []>;
+
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                             ".error \"should not emit\" ", []>;
+
+// Call subroutine indirectly.
+let Defs = VolatileV3.Regs in
+def J2_callr : JUMPR_MISC_CALLR<0, 1>;
+
+// Indirect tail-call.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+    isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_r : T_JMPr;
+
+// Direct tail-calls.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+    isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_i : JInst<(outs), (ins calltarget:$dst), "", []>;
+
+// The reason for the custom inserter is to record all ALLOCA instructions
+// in MachineFunctionInfo.
+let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
+def PS_alloca: ALU32Inst<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
+def PS_aligna : ALU32Inst<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
+
+// XTYPE/SHIFT
+//
+//===----------------------------------------------------------------------===//
+// Template Class
+// Shift by immediate/register and accumulate/logical
+//===----------------------------------------------------------------------===//
+
+// Rx[+-&|]=asr(Rs,#u5)
+// Rx[+-&|^]=lsr(Rs,#u5)
+// Rx[+-&|^]=asl(Rs,#u5)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs IntRegs:$Rx),
+              (ins IntRegs:$src1, IntRegs:$Rs, u5_0Imm:$u5),
+  "$Rx "#opc2#opc1#"($Rs, #$u5)", [],
+  "$src1 = $Rx", S_2op_tc_2_SLOT23> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> u5;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b1110;
+    let Inst{23-22} = majOp{2-1};
+    let Inst{13} = 0b0;
+    let Inst{7} = majOp{0};
+    let Inst{6-5} = minOp;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = u5;
+  }
+
+// Rx[+-&|]=asr(Rs,Rt)
+// Rx[+-&|^]=lsr(Rs,Rt)
+// Rx[+-&|^]=asl(Rs,Rt)
+
+let hasNewValue = 1, opNewValue = 0 in
+class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<2> majOp, bits<2> minOp>
+  : SInst_acc<(outs IntRegs:$Rx),
+              (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#opc2#opc1#"($Rs, $Rt)", [],
+  "$src1 = $Rx", S_3op_tc_2_SLOT23 > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{23-22} = majOp;
+    let Inst{7-6} = minOp;
+    let Inst{4-0} = Rx;
+    let Inst{20-16} = Rs;
+    let Inst{12-8} = Rt;
+  }
+
+// Rxx[+-&|]=asr(Rss,#u6)
+// Rxx[+-&|^]=lsr(Rss,#u6)
+// Rxx[+-&|^]=asl(Rss,#u6)
+
+class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6_0Imm:$u6),
+  "$Rxx "#opc2#opc1#"($Rss, #$u6)", [],
+  "$src1 = $Rxx", S_2op_tc_2_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<6> u6;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-22} = majOp{2-1};
+    let Inst{7} = majOp{0};
+    let Inst{6-5} = minOp;
+    let Inst{4-0} = Rxx;
+    let Inst{20-16} = Rss;
+    let Inst{13-8} = u6;
+  }
+
+
+// Rxx[+-&|]=asr(Rss,Rt)
+// Rxx[+-&|^]=lsr(Rss,Rt)
+// Rxx[+-&|^]=asl(Rss,Rt)
+// Rxx[+-&|^]=lsl(Rss,Rt)
+
+class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
+  : SInst_acc<(outs DoubleRegs:$Rxx),
+              (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rxx "#opc2#opc1#"($Rss, $Rt)", [],
+  "$src1 = $Rxx", S_3op_tc_2_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = majOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rt;
+    let Inst{7-6} = minOp;
+    let Inst{4-0} = Rxx;
+  }
+
+//===----------------------------------------------------------------------===//
+// Multi-class for the shift instructions with logical/arithmetic operators.
+//===----------------------------------------------------------------------===//
+
+multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1,
+                         SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
+  def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1,
+                                     OpNode2, majOp, minOp >;
+  def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
+                                     OpNode2, majOp, minOp >;
+}
+
+multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  defm _acc  : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
+
+  defm _nac  : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
+  defm _and  : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
+  defm _or   : xtype_imm_base< opc1, "|= ", OpNode,  or, 0b011, minOp>;
+}
+
+multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
+let AddedComplexity = 100 in
+  defm _xacc  : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
+}
+
+defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>;
+
+defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>,
+              xtype_xor_imm_acc<"lsr", srl, 0b01>;
+
+defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
+              xtype_xor_imm_acc<"asl", shl, 0b10>;
+
+multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
+
+  def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
+  def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
+  def _or  : T_shift_reg_acc_r <opc1, "|= ", OpNode,  or, 0b00, minOp>;
+}
+
+multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> {
+  let AddedComplexity = 100 in
+  def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
+
+  def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
+  def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
+  def _or  : T_shift_reg_acc_p <opc1, "|= ", OpNode,  or, 0b000, minOp>;
+  def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>;
+}
+
+multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > {
+  defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
+  defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
+}
+
+defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
+defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
+defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
+defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>;
+
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp,
+                bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
+  : SInst <(outs RC:$dst),
+           (ins DoubleRegs:$src1, DoubleRegs:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
+                                     #!if(hasShift,":>>1","")
+                                     #!if(isSat, ":sat", ""),
+  [], "", S_3op_tc_2_SLOT23 > {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0001;
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = !if (SwapOps, src2, src1);
+    let Inst{12-8}  = !if (SwapOps, src1, src2);
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = dst;
+  }
+
+class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps,
+                 bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
+  : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
+              isSat, isRnd, hasShift>;
+
+let Itinerary = S_3op_tc_1_SLOT23 in {
+  def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
+  def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
+  def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>;
+  def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
+
+  def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
+  def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
+}
+
+def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>;
+
+let hasSideEffects = 0 in
+class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps>
+  : SInst < (outs DoubleRegs:$Rdd),
+            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<2> Pu;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
+    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
+    let Inst{6-5} = Pu;
+    let Inst{4-0} = Rdd;
+  }
+
+def S2_valignrb  : T_S3op_2 < "valignb",  0b000, 1>;
+def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
+
+//===----------------------------------------------------------------------===//
+// Template class used by vector shift, vector rotate, vector neg,
+// 32-bit shift, 64-bit shifts, etc.
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in
+class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp,
+                 bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
+  : SInst <(outs RC:$dst),
+           (ins RC:$src1, IntRegs:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
+  pattern, "", S_3op_tc_1_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011);
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{12-8} = src2;
+    let Inst{7-6} = MinOp;
+    let Inst{4-0} = dst;
+  }
+
+let hasNewValue = 1 in
+class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0, []>;
+
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in
+class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
+
+
+class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0, []>;
+
+
+class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp>
+  : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
+
+
+// Shift by register
+// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
+
+def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
+def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
+def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
+def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
+
+// Rd=[asr|lsr|asl|lsl](Rs,Rt)
+
+def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>;
+def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
+def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
+def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
+
+// Shift by register with saturation
+// Rd=asr(Rs,Rt):sat
+// Rd=asl(Rs,Rt):sat
+
+let Defs = [USR_OVF] in {
+  def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
+  def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0>
+  : SInst < (outs IntRegs:$Rd),
+            (ins DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
+                           #!if(hasShift, ":<<1", "")
+                           #!if(isRnd, ":rnd", "")
+                           #!if(isSat, ":sat", ""),
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0101;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = Rd;
+  }
+
+def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>;
+
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
+
+let hasSideEffects = 0 in
+class T_S3op_7 <string mnemonic, bit MajOp >
+  : SInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3_0Imm:$u3),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
+  [], "", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<3> u3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0000;
+    let Inst{23}    = MajOp;
+    let Inst{20-16} = !if(MajOp, Rss, Rtt);
+    let Inst{12-8}  =  !if(MajOp, Rtt, Rss);
+    let Inst{7-5}   = u3;
+    let Inst{4-0}   = Rdd;
+  }
+
+def S2_valignib  : T_S3op_7 < "valignb", 0>;
+def S2_vspliceib : T_S3op_7 < "vspliceb", 1>;
+
+//===----------------------------------------------------------------------===//
+// Template class for 'insert bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in
+class T_S3op_insert <string mnemonic, RegisterClass RC>
+  : SInst <(outs RC:$dst),
+           (ins RC:$src1, RC:$src2, DoubleRegs:$src3),
+  "$dst = "#mnemonic#"($src2, $src3)" ,
+  [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
+    bits<5> dst;
+    bits<5> src2;
+    bits<5> src3;
+
+    let IClass = 0b1100;
+
+    let Inst{27-26} = 0b10;
+    let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10);
+    let Inst{23}    = 0b0;
+    let Inst{20-16} = src2;
+    let Inst{12-8}  = src3;
+    let Inst{4-0}   = dst;
+  }
+
+let hasSideEffects = 0 in
+class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp>
+  : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
+  "$dst = insert($src1, #$src2, #$src3)",
+  [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<6> src2;
+    bits<6> src3;
+    bit bit23;
+    bit bit13;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, 0);
+    let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23}    = bit23;
+    let Inst{22-21} = src3{4-3};
+    let Inst{20-16} = src1;
+    let Inst{13}    = bit13;
+    let Inst{12-8}  = src2{4-0};
+    let Inst{7-5}   = src3{2-0};
+    let Inst{4-0}   = dst;
+  }
+
+// Rx=insert(Rs,Rtt)
+// Rx=insert(Rs,#u5,#U5)
+let hasNewValue = 1 in {
+  def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
+  def S2_insert    : T_S2op_insert <0b1111, IntRegs, u5_0Imm>;
+}
+
+// Rxx=insert(Rss,Rtt)
+// Rxx=insert(Rss,#u6,#U6)
+def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
+def S2_insertp    : T_S2op_insert <0b0011, DoubleRegs, u6_0Imm>;
+
+
+//===----------------------------------------------------------------------===//
+// Template class for 'extract bitfield' instructions
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_S3op_extract <string mnemonic, bits<2> MinOp>
+  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+  "$Rd = "#mnemonic#"($Rs, $Rtt)",
+  [], "", S_3op_tc_2_SLOT23 > {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rtt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b100100;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-6}   = MinOp;
+    let Inst{4-0}   = Rd;
+  }
+
+let hasSideEffects = 0 in
+class T_S2op_extract <string mnemonic, bits<4> RegTyBits,
+                      RegisterClass RC, Operand ImmOp>
+  : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
+  "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
+  [], "", S_2op_tc_2_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<6> src2;
+    bits<6> src3;
+    bit bit23;
+    bit bit13;
+    string ImmOpStr = !cast<string>(ImmOp);
+
+    let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5},
+                !if (!eq(mnemonic, "extractu"), 0, 1));
+
+    let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = RegTyBits;
+    let Inst{23}    = bit23;
+    let Inst{22-21} = src3{4-3};
+    let Inst{20-16} = src1;
+    let Inst{13}    = bit13;
+    let Inst{12-8}  = src2{4-0};
+    let Inst{7-5}   = src3{2-0};
+    let Inst{4-0}   = dst;
+  }
+
+// Extract bitfield
+
+// Rdd=extractu(Rss,Rtt)
+// Rdd=extractu(Rss,#u6,#U6)
+def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
+def S2_extractup    : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6_0Imm>;
+
+// Rd=extractu(Rs,Rtt)
+// Rd=extractu(Rs,#u5,#U5)
+let hasNewValue = 1 in {
+  def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
+  def S2_extractu    : T_S2op_extract <"extractu", 0b1101, IntRegs, u5_0Imm>;
+}
+
+//===----------------------------------------------------------------------===//
+// :raw for of tableindx[bdhw] insns
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class tableidxRaw<string OpStr, bits<2>MinOp>
+  : SInst <(outs IntRegs:$Rx),
+           (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, s6_0Imm:$S6),
+           "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
+    [], "$Rx = $_dst_" > {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<4> u4;
+    bits<6> S6;
+
+    let IClass = 0b1000;
+
+    let Inst{27-24} = 0b0111;
+    let Inst{23-22} = MinOp;
+    let Inst{21}    = u4{3};
+    let Inst{20-16} = Rs;
+    let Inst{13-8}  = S6;
+    let Inst{7-5}   = u4{2-0};
+    let Inst{4-0}   = Rx;
+  }
+
+def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
+def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
+def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
+def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
+
+//===----------------------------------------------------------------------===//
+// Template class for 'table index' instructions which are assembler mapped
+// to their :raw format.
+//===----------------------------------------------------------------------===//
+let isPseudo = 1 in
+class tableidx_goodsyntax <string mnemonic>
+  : SInst <(outs IntRegs:$Rx),
+           (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, u5_0Imm:$u5),
+           "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)",
+           [], "$Rx = $_dst_" >;
+
+def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">;
+def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">;
+def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">;
+def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">;
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV3.td"
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV5.td"
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V60 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV60.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU32/64/Vector +
+//===----------------------------------------------------------------------===///
+
+include "HexagonInstrInfoVector.td"
+
+include "HexagonInstrAlias.td"
+include "HexagonSystemInst.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
new file mode 100644
index 000000000000..225f94405076
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -0,0 +1,215 @@
+//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// J +
+//===----------------------------------------------------------------------===//
+// Call subroutine.
+let isCall = 1, hasSideEffects = 1, isPredicable = 1,
+    isExtended = 0, isExtendable = 1, opExtendable = 0,
+    isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<bit CSR, string ExtStr>
+  : JInst<(outs), (ins calltarget:$dst),
+      "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
+  let BaseOpcode = "call";
+  bits<24> dst;
+
+  let Defs = !if (CSR, VolatileV3.Regs, []);
+  let IClass = 0b0101;
+  let Inst{27-25} = 0b101;
+  let Inst{24-16,13-1} = dst{23-2};
+  let Inst{0} = 0b0;
+}
+
+let isCall = 1, hasSideEffects = 1, isPredicated = 1,
+    isExtended = 0, isExtendable = 1, opExtendable = 1,
+    isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
+class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
+  : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
+      CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
+      [], "", J_tc_2early_SLOT23> {
+  let BaseOpcode = "call";
+  let isPredicatedFalse = !if(IfTrue,0,1);
+  bits<2> Pu;
+  bits<17> dst;
+
+  let Defs = !if (CSR, VolatileV3.Regs, []);
+  let IClass = 0b0101;
+  let Inst{27-24} = 0b1101;
+  let Inst{23-22,20-16,13,7-1} = dst{16-2};
+  let Inst{21} = !if(IfTrue,0,1);
+  let Inst{11} = 0b0;
+  let Inst{9-8} = Pu;
+}
+
+multiclass T_Calls<bit CSR, string ExtStr> {
+  def NAME : T_Call<CSR, ExtStr>;
+  def t    : T_CallPred<CSR, 1, ExtStr>;
+  def f    : T_CallPred<CSR, 0, ExtStr>;
+}
+
+defm J2_call: T_Calls<1, "">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+    Defs = VolatileV3.Regs in
+def PS_call_nr : T_Call<1, "">, PredRel;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+    Defs = [PC, R31, R6, R7, P0] in
+def PS_call_stk :  T_Call<0, "">, PredRel;
+
+//===----------------------------------------------------------------------===//
+// J -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+// Call subroutine from register.
+
+let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
+  def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
+}
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+
+let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
+def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
+
+class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
+  : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
+
+def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
+def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
+
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
+  (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)", [],
+  "", ALU64_tc_1_SLOT23>;
+
+
+let hasSideEffects = 0 in
+class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
+  : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
+  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
+          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+
+  let Inst{27-23} = 0b00111;
+  let Inst{22-21} = !if(isMax, 0b10, 0b01);
+  let Inst{20-16} = !if(isMax, Rt, Rs);
+  let Inst{12-8} = !if(isMax, Rs, Rt);
+  let Inst{7} = 0b1;
+  let Inst{6} = !if(isMax, 0b0, 0b1);
+  let Inst{5} = isUnsigned;
+  let Inst{4-0} = Rd;
+}
+
+def A2_minp  : T_XTYPE_MIN_MAX_P<0, 0>;
+def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
+def A2_maxp  : T_XTYPE_MIN_MAX_P<1, 0>;
+def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// :raw form of vrcmpys:hi/lo insns
+//===----------------------------------------------------------------------===//
+// Vector reduce complex multiply by scalar.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
+  MInst<(outs DoubleRegs:$Rdd),
+         (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+         "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b100;
+    let Inst{4-0}   = Rdd;
+}
+
+def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
+def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def M2_vrcmpys_s1
+ : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+ "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
+
+// Vector reduce complex multiply by scalar with accumulation.
+let Defs = [USR_OVF], hasSideEffects = 0 in
+class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
+  MInst <(outs DoubleRegs:$Rxx),
+         (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
+  "$Rxx = $_src_"> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b100;
+    let Inst{4-0}   = Rxx;
+  }
+
+def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
+def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
+
+// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
+
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_acc_s1
+  : MInst <(outs DoubleRegs:$dst),
+           (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
+           "$dst += vrcmpys($src1, $src2):<<1:sat", [],
+           "$dst2 = $dst">;
+
+def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
+def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
+
+// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
+let isAsmParserOnly = 1 in
+def M2_vrcmpys_s1rp
+  : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
+
+
+// S2_cabacdecbin: Cabac decode bin.
+let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
+def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
new file mode 100644
index 000000000000..18943a082d28
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -0,0 +1,3301 @@
+//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def DuplexIClass0:  InstDuplex < 0 >;
+def DuplexIClass1:  InstDuplex < 1 >;
+def DuplexIClass2:  InstDuplex < 2 >;
+let isExtendable = 1 in {
+  def DuplexIClass3:  InstDuplex < 3 >;
+  def DuplexIClass4:  InstDuplex < 4 >;
+  def DuplexIClass5:  InstDuplex < 5 >;
+  def DuplexIClass6:  InstDuplex < 6 >;
+  def DuplexIClass7:  InstDuplex < 7 >;
+}
+def DuplexIClass8:  InstDuplex < 8 >;
+def DuplexIClass9:  InstDuplex < 9 >;
+def DuplexIClassA:  InstDuplex < 0xA >;
+def DuplexIClassB:  InstDuplex < 0xB >;
+def DuplexIClassC:  InstDuplex < 0xC >;
+def DuplexIClassD:  InstDuplex < 0xD >;
+def DuplexIClassE:  InstDuplex < 0xE >;
+def DuplexIClassF:  InstDuplex < 0xF >;
+
+let hasSideEffects = 0 in
+class T_Immext<Operand ImmType>
+  : EXTENDERInst<(outs), (ins ImmType:$imm),
+                 "immext(#$imm)", []> {
+    bits<32> imm;
+    let IClass = 0b0000;
+
+    let Inst{27-16} = imm{31-20};
+    let Inst{13-0} = imm{19-6};
+  }
+
+def A4_ext : T_Immext<u26_6Imm>;
+let isCodeGenOnly = 1 in {
+  let isBranch = 1 in
+    def A4_ext_b : T_Immext<brtarget>;
+  let isCall = 1 in
+    def A4_ext_c : T_Immext<calltarget>;
+  def A4_ext_g : T_Immext<globaladdress>;
+}
+
+// Hexagon V4 Architecture spec defines 8 instruction classes:
+// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the
+// compiler)
+
+// LD Instructions:
+// ========================================
+// Loads (8/16/32/64 bit)
+// Deallocframe
+
+// ST Instructions:
+// ========================================
+// Stores (8/16/32/64 bit)
+// Allocframe
+
+// ALU32 Instructions:
+// ========================================
+// Arithmetic / Logical (32 bit)
+// Vector Halfword
+
+// XTYPE Instructions (32/64 bit):
+// ========================================
+// Arithmetic, Logical, Bit Manipulation
+// Multiply (Integer, Fractional, Complex)
+// Permute / Vector Permute Operations
+// Predicate Operations
+// Shift / Shift with Add/Sub/Logical
+// Vector Byte ALU
+// Vector Halfword (ALU, Shift, Multiply)
+// Vector Word (ALU, Shift)
+
+// J Instructions:
+// ========================================
+// Jump/Call PC-relative
+
+// JR Instructions:
+// ========================================
+// Jump/Call Register
+
+// MEMOP Instructions:
+// ========================================
+// Operation on memory (8/16/32 bit)
+
+// NV Instructions:
+// ========================================
+// New-value Jumps
+// New-value Stores
+
+// CR Instructions:
+// ========================================
+// Control-Register Transfers
+// Hardware Loop Setup
+// Predicate Logicals & Reductions
+
+// SYSTEM Instructions (not implemented in the compiler):
+// ========================================
+// Prefetch
+// Cache Maintenance
+// Bus Operations
+
+
+//===----------------------------------------------------------------------===//
+// ALU32 +
+//===----------------------------------------------------------------------===//
+
+class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                      bit OpsRev>
+  : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
+  let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
+}
+
+let BaseOpcode = "andn_rr", CextOpcode = "andn" in
+def A4_andn    : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
+let BaseOpcode = "orn_rr", CextOpcode = "orn" in
+def A4_orn     : T_ALU32_3op_not<"or",  0b001, 0b101, 1>;
+
+let CextOpcode = "rcmp.eq" in
+def A4_rcmpeq  : T_ALU32_3op<"cmp.eq",  0b011, 0b010, 0, 1>;
+let CextOpcode = "!rcmp.eq" in
+def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
+
+def C4_cmpneq  : T_ALU32_3op_cmp<"!cmp.eq",  0b00, 1, 1>;
+def C4_cmplte  : T_ALU32_3op_cmp<"!cmp.gt",  0b10, 1, 0>;
+def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
+
+class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
+  : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
+    "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
+    ImmRegRel {
+  let InputType = "reg";
+  let CextOpcode = mnemonic;
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1100;
+  let Inst{27-21} = 0b0111110;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = MinOp;
+  let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeq  : T_CMP_rrbh<"cmpb.eq",  0b110, 1>;
+def A4_cmpbgt  : T_CMP_rrbh<"cmpb.gt",  0b010, 0>;
+def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
+def A4_cmpheq  : T_CMP_rrbh<"cmph.eq",  0b011, 1>;
+def A4_cmphgt  : T_CMP_rrbh<"cmph.gt",  0b100, 0>;
+def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
+
+class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
+                 Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
+  : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
+    "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
+    ImmRegRel {
+  let InputType = "imm";
+  let CextOpcode = mnemonic;
+  let isCompare = 1;
+  let isCommutable = IsComm;
+  let hasSideEffects = 0;
+  let isExtendable = IsImmExt;
+  let opExtendable = !if (IsImmExt, 2, 0);
+  let isExtentSigned = IsImmSigned;
+  let opExtentBits = ImmBits;
+
+  bits<2> Pd;
+  bits<5> Rs;
+  bits<8> Imm;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b1101;
+  let Inst{22-21} = MajOp;
+  let Inst{20-16} = Rs;
+  let Inst{12-5} = Imm;
+  let Inst{4} = 0b0;
+  let Inst{3} = IsHalf;
+  let Inst{1-0} = Pd;
+}
+
+def A4_cmpbeqi  : T_CMP_ribh<"cmpb.eq",  0b00, 0, 1, u8_0Imm, 0, 0, 8>;
+def A4_cmpbgti  : T_CMP_ribh<"cmpb.gt",  0b01, 0, 0, s8_0Imm, 0, 1, 8>;
+def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7_0Ext, 1, 0, 7>;
+def A4_cmpheqi  : T_CMP_ribh<"cmph.eq",  0b00, 1, 1, s8_0Ext, 1, 1, 8>;
+def A4_cmphgti  : T_CMP_ribh<"cmph.gt",  0b01, 1, 0, s8_0Ext, 1, 1, 8>;
+def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7_0Ext, 1, 0, 7>;
+
+class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
+  : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8_0Ext:$s8),
+    "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
+    ImmRegRel {
+  let InputType = "imm";
+  let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
+  let isExtendable = 1;
+  let opExtendable = 2;
+  let isExtentSigned = 1;
+  let opExtentBits = 8;
+  let hasNewValue = 1;
+
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<8> s8;
+
+  let IClass = 0b0111;
+  let Inst{27-24} = 0b0011;
+  let Inst{22} = 0b1;
+  let Inst{21} = IsNeg;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b1;
+  let Inst{12-5} = s8;
+  let Inst{4-0} = Rd;
+}
+
+def A4_rcmpeqi  : T_RCMP_EQ_ri<"cmp.eq",  0>;
+def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
+
+//===----------------------------------------------------------------------===//
+// ALU32 -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine a word and an immediate into a register pair.
+let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
+    opExtentBits = 8 in
+class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
+  : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
+    bits<5> Rdd;
+    bits<5> Rs;
+    bits<8> s8;
+
+    let IClass      = 0b0111;
+    let Inst{27-24} = 0b0011;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b1;
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
+
+let opExtendable = 2 in
+def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8_0Ext:$s8),
+                                    "$Rdd = combine($Rs, #$s8)">;
+
+let opExtendable = 1 in
+def A4_combineir : T_Combine1<0b01, (ins s8_0Ext:$s8, IntRegs:$Rs),
+                                    "$Rdd = combine(#$s8, $Rs)">;
+
+// A4_combineii: Set two small immediates.
+let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
+def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8_0Imm:$s8, u6_0Ext:$U6),
+  "$Rdd = combine(#$s8, #$U6)"> {
+    bits<5> Rdd;
+    bits<8> s8;
+    bits<6> U6;
+
+    let IClass = 0b0111;
+    let Inst{27-23} = 0b11001;
+    let Inst{20-16} = U6{5-1};
+    let Inst{13}    = U6{0};
+    let Inst{12-5}  = s8;
+    let Inst{4-0}   = Rdd;
+  }
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Template class for load instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
+    hasSideEffects = 0 in
+class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
+            LDInst<(outs RC:$dst1, IntRegs:$dst2),
+            (ins u6_0Ext:$addr),
+            "$dst1 = "#mnemonic#"($dst2 = #$addr)",
+            []> {
+  bits<7> name;
+  bits<5> dst1;
+  bits<5> dst2;
+  bits<6> addr;
+
+  let IClass = 0b1001;
+  let Inst{27-25} = 0b101;
+  let Inst{24-21} = MajOp;
+  let Inst{13-12} = 0b01;
+  let Inst{4-0}   = dst1;
+  let Inst{20-16} = dst2;
+  let Inst{11-8}  = addr{5-2};
+  let Inst{6-5}   = addr{1-0};
+}
+
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  def L4_loadrb_ap   : T_LD_abs_set <"memb",   IntRegs, 0b1000>;
+  def L4_loadrub_ap  : T_LD_abs_set <"memub",  IntRegs, 0b1001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  def L4_loadrh_ap  : T_LD_abs_set <"memh",  IntRegs, 0b1010>;
+  def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
+  def L4_loadbsw2_ap : T_LD_abs_set <"membh",  IntRegs, 0b0001>;
+  def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+  def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
+
+let accessSize = WordAccess in {
+  def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
+  def L4_loadbsw4_ap : T_LD_abs_set <"membh",  DoubleRegs, 0b0111>;
+}
+
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
+
+let accessSize = ByteAccess in
+  def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
+
+let accessSize = HalfWordAccess in
+def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
+
+// Load - Indirect with long offset
+let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
+opExtentBits = 6, opExtendable = 3 in
+class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+                    bits<4> MajOp>
+  : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3),
+  "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
+  [] >, ImmRegShl {
+    bits<5> dst;
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    let CextOpcode = CextOp;
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12}    = 0b1;
+    let Inst{11-8}  = src3{5-2};
+    let Inst{7}     = src2{0};
+    let Inst{6-5}   = src3{1-0};
+    let Inst{4-0}   = dst;
+  }
+
+let accessSize = ByteAccess in {
+  def L4_loadrb_ur  : T_LoadAbsReg<"memb",  "LDrib", IntRegs, 0b1000>;
+  def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
+  def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
+                                      DoubleRegs, 0b0100>;
+}
+
+let accessSize = HalfWordAccess in {
+  def L4_loadrh_ur   : T_LoadAbsReg<"memh",   "LDrih",    IntRegs, 0b1010>;
+  def L4_loadruh_ur  : T_LoadAbsReg<"memuh",  "LDriuh",   IntRegs, 0b1011>;
+  def L4_loadbsw2_ur : T_LoadAbsReg<"membh",  "LDribh2",  IntRegs, 0b0001>;
+  def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
+  def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
+                                      DoubleRegs, 0b0010>;
+}
+
+let accessSize = WordAccess in {
+  def L4_loadri_ur   : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
+  def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
+  def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
+}
+
+let accessSize = DoubleWordAccess in
+def L4_loadrd_ur  : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
+
+
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
+   LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2_0Imm:$u2),
+  "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
+  [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
+    bits<5> dst;
+    bits<5> src1;
+    bits<5> src2;
+    bits<2> u2;
+
+    let IClass = 0b0011;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{12-8}  = src2;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-0}   = dst;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated load instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated =  1 in
+class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                  bit isNot, bit isPredNew>:
+   LDInst <(outs RC:$dst),
+           (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2_0Imm:$u2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
+  [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+    bits<5> src3;
+    bits<2> u2;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
+
+    let IClass = 0b0011;
+
+    let Inst{27-26} = 0b00;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{12-8}  = src3;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = src1;
+    let Inst{4-0}   = dst;
+  }
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with base + register offset
+// addressing mode
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, addrMode = BaseRegOffset in
+multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
+                        bits<3> MajOp > {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
+      InputType = "reg" in {
+    let isPredicable = 1 in
+    def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
+
+    // Predicated
+    def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
+    def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
+
+    // Predicated new
+    def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
+    def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
+  }
+}
+
+let hasNewValue = 1, accessSize = ByteAccess in {
+  defm loadrb  : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
+  defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
+}
+
+let hasNewValue = 1, accessSize = HalfWordAccess in {
+  defm loadrh  : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
+  defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
+}
+
+let hasNewValue = 1, accessSize = WordAccess in
+defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
+
+let accessSize = DoubleWordAccess in
+defm loadrd  : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+//===----------------------------------------------------------------------===//
+// Template class for store instructions with Absolute set addressing mode.
+//===----------------------------------------------------------------------===//
+let isExtended = 1, opExtendable = 1, opExtentBits = 6,
+    addrMode = AbsoluteSet in
+class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
+                   bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst<(outs IntRegs:$dst),
+           (ins u6_0Ext:$addr, RC:$src),
+    mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
+    bits<5> dst;
+    bits<6> addr;
+    bits<5> src;
+    let accessSize = AccessSz;
+    let BaseOpcode = BaseOp#"_AbsSet";
+
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = dst;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = src;
+    let Inst{7}     = 0b1;
+    let Inst{5-0}   = addr;
+  }
+
+def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
+                                 HalfWordAccess>;
+def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
+
+let isNVStorable = 0 in {
+  def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
+                                   0b011, HalfWordAccess, 1>;
+  def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
+                                   0b110, DoubleWordAccess>;
+}
+
+let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
+isExtended = 1, opExtentBits= 6 in
+class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
+                      MemAccessSize AccessSz >
+  : NVInst <(outs IntRegs:$dst),
+            (ins u6_0Ext:$addr, IntRegs:$src),
+    mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
+    bits<5> dst;
+    bits<6> addr;
+    bits<3> src;
+    let accessSize = AccessSz;
+    let BaseOpcode = BaseOp#"_AbsSet";
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = dst;
+    let Inst{13-11} = 0b000;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src;
+    let Inst{7}     = 0b1;
+    let Inst{5-0}   = addr;
+  }
+
+let mayStore = 1, addrMode = AbsoluteSet in {
+  def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
+  def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
+  def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
+}
+
+let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
+    addrMode = BaseLongOffset, AddedComplexity = 40 in
+class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
+                     bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
+  : STInst<(outs),
+           (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, RC:$src4),
+   mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
+   []>, ImmRegShl, NewValueRel {
+
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    bits<5> src4;
+
+    let accessSize = AccessSz;
+    let CextOpcode = CextOp;
+    let BaseOpcode = CextOp#"_shl";
+
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} =0b1101;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12-8}  = src4;
+    let Inst{7}     = 0b1;
+    let Inst{6}     = src2{0};
+    let Inst{5-0}   = src3;
+}
+
+def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
+def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
+                                   HalfWordAccess>;
+def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
+                                   HalfWordAccess, 1>;
+def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
+def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
+                                   DoubleWordAccess>;
+
+let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
+    opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
+class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
+                       MemAccessSize AccessSz>
+  : NVInst <(outs ),
+            (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, IntRegs:$src4),
+  mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
+    bits<5> src1;
+    bits<2> src2;
+    bits<6> src3;
+    bits<3> src4;
+
+    let CextOpcode  = CextOp;
+    let BaseOpcode  = CextOp#"_shl";
+    let IClass      = 0b1010;
+
+    let Inst{27-21} = 0b1101101;
+    let Inst{12-11} = 0b00;
+    let Inst{7}     = 0b1;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2{1};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src4;
+    let Inst{6}     = src2{0};
+    let Inst{5-0}   = src3;
+  }
+
+def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
+def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
+def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Template classes for the non-predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicable = 1 in
+class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
+  : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
+  mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+  [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
+
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<5> Rt;
+
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+    let IClass = 0b0011;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-0}   = Rt;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated = 1 in
+class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                   bit isNot, bit isPredNew, bit isH>
+  : STInst <(outs),
+            (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
+
+  !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
+  [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<5> Rt;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
+
+    let IClass = 0b0011;
+
+    let Inst{27-26} = 0b01;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = Pv;
+    let Inst{4-0}   = Rt;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, isNewValue = 1, opNewValue = 3 in
+class T_store_new_rr <string mnemonic, bits<2> MajOp> :
+  NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
+  mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
+  [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
+
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<3> Nt;
+
+    let IClass = 0b0011;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{4-3}   = MajOp;
+    let Inst{2-0}   = Nt;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template classes for the predicated new-value store instructions with
+// base + register offset addressing mode
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, isNewValue = 1, opNewValue = 4 in
+class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
+  : NVInst<(outs),
+           (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
+   !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+   ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
+   [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<2> u2;
+    bits<3> Nt;
+
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
+
+    let IClass = 0b0011;
+    let Inst{27-26} = 0b01;
+    let Inst{25}    = isPredNew;
+    let Inst{24}    = isNot;
+    let Inst{23-21} = 0b101;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Ru;
+    let Inst{13}    = u2{1};
+    let Inst{7}     = u2{0};
+    let Inst{6-5}   = Pv;
+    let Inst{4-3}   = MajOp;
+    let Inst{2-0}   = Nt;
+  }
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + register offset addressing
+// mode
+//===----------------------------------------------------------------------===//
+let isNVStorable = 1 in
+multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
+                       bits<3> MajOp, bit isH = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
+
+    // Predicated
+    def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
+    def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
+
+    // Predicated new
+    def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
+    def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass for new-value store instructions with base + register offset
+// addressing mode.
+//===----------------------------------------------------------------------===//
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
+                           bits<2> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
+
+    // Predicated
+    def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
+    def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
+
+    // Predicated new
+    def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
+  }
+}
+
+let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
+  let accessSize = ByteAccess in
+  defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
+                ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
+
+  let accessSize = HalfWordAccess in
+  defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
+                ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
+
+  let accessSize = WordAccess in
+  defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
+                ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
+
+  let isNVStorable = 0, accessSize = DoubleWordAccess in
+  defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
+
+  let isNVStorable = 0, accessSize = HalfWordAccess in
+  defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
+    opExtendable = 2 in
+class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
+  : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8_0Ext:$S8),
+  mnemonic#"($Rs+#$offset)=#$S8",
+  [], "", V4LDST_tc_st_SLOT01>,
+  ImmRegRel, PredNewRel {
+    bits<5> Rs;
+    bits<8> S8;
+    bits<8> offset;
+    bits<6> offsetBits;
+
+    string OffsetOpStr = !cast<string>(OffsetOp);
+    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+                                         /* u6_0Imm */ offset{5-0}));
+
+    let IClass = 0b0011;
+
+    let Inst{27-25} = 0b110;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{12-7}  = offsetBits;
+    let Inst{13}    = S8{7};
+    let Inst{6-0}   = S8{6-0};
+  }
+
+let isPredicated = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 6,
+    opExtendable = 3 in
+class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+                       bit isPredNot, bit isPredNew >
+  : STInst <(outs ),
+            (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6_0Ext:$S6),
+  !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($Rs+#$offset)=#$S6",
+  [], "", V4LDST_tc_st_SLOT01>,
+  ImmRegRel, PredNewRel {
+    bits<2> Pv;
+    bits<5> Rs;
+    bits<6> S6;
+    bits<8> offset;
+    bits<6> offsetBits;
+
+    string OffsetOpStr = !cast<string>(OffsetOp);
+    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
+                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
+                                         /* u6_0Imm */ offset{5-0}));
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b0011;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24}    = isPredNew;
+    let Inst{23}    = isPredNot;
+    let Inst{22-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = S6{5};
+    let Inst{12-7}  = offsetBits;
+    let Inst{6-5}   = Pv;
+    let Inst{4-0}   = S6{4-0};
+  }
+
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + immediate offset
+// addressing mode and immediate stored value.
+// mem[bhw](Rx++#s4:3)=#s8
+// if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6
+//===----------------------------------------------------------------------===//
+
+multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
+                        bit PredNot> {
+  def _io    : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
+  // Predicate new
+  def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
+}
+
+multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
+                   bits<2> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
+    def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
+
+    defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
+    defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
+  }
+}
+
+let hasSideEffects = 0, addrMode = BaseImmOffset,
+    InputType = "imm" in {
+  let accessSize = ByteAccess in
+  defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
+
+  let accessSize = HalfWordAccess in
+  defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
+
+  let accessSize = WordAccess in
+  defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
+}
+
+//===----------------------------------------------------------------------===
+// ST -
+//===----------------------------------------------------------------------===
+
+
+//===----------------------------------------------------------------------===//
+// NV/ST +
+//===----------------------------------------------------------------------===//
+
+let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
+class T_store_io_nv <string mnemonic, RegisterClass RC,
+                    Operand ImmOp, bits<2>MajOp>
+  : NVInst_V4 <(outs),
+               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+  mnemonic#"($src1+#$src2) = $src3.new",
+  [],"",ST_tc_st_SLOT0> {
+    bits<5> src1;
+    bits<13> src2; // Actual address offset
+    bits<3> src3;
+    bits<11> offsetBits; // Represents offset encoding
+
+    let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
+                       !if (!eq(mnemonic, "memh"), 12,
+                       !if (!eq(mnemonic, "memw"), 13, 0)));
+
+    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+                        !if (!eq(mnemonic, "memh"), 1,
+                        !if (!eq(mnemonic, "memw"), 2, 0)));
+
+    let offsetBits = !if (!eq(mnemonic, "memb"),  src2{10-0},
+                     !if (!eq(mnemonic, "memh"),  src2{11-1},
+                     !if (!eq(mnemonic, "memw"),  src2{12-2}, 0)));
+
+    let IClass = 0b1010;
+
+    let Inst{27} = 0b0;
+    let Inst{26-25} = offsetBits{10-9};
+    let Inst{24-21} = 0b1101;
+    let Inst{20-16} = src1;
+    let Inst{13} = offsetBits{8};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src3;
+    let Inst{7-0} = offsetBits{7-0};
+  }
+
+let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
+class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
+                         bits<2>MajOp, bit PredNot, bit isPredNew>
+  : NVInst_V4 <(outs),
+               (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
+  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2+#$src3) = $src4.new",
+  [],"",V2LDST_tc_st_SLOT0> {
+    bits<2> src1;
+    bits<5> src2;
+    bits<9> src3;
+    bits<3> src4;
+    bits<6> offsetBits; // Represents offset encoding
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = PredNot;
+    let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
+                       !if (!eq(mnemonic, "memh"), 7,
+                       !if (!eq(mnemonic, "memw"), 8, 0)));
+
+    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
+                        !if (!eq(mnemonic, "memh"), 1,
+                        !if (!eq(mnemonic, "memw"), 2, 0)));
+
+    let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
+                     !if (!eq(mnemonic, "memh"), src3{6-1},
+                     !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b0;
+    let Inst{26}    = PredNot;
+    let Inst{25}    = isPredNew;
+    let Inst{24-21} = 0b0101;
+    let Inst{20-16} = src2;
+    let Inst{13}    = offsetBits{5};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src4;
+    let Inst{7-3}   = offsetBits{4-0};
+    let Inst{2}     = 0b0;
+    let Inst{1-0}   = src1;
+  }
+
+// multiclass for new-value store instructions with base + immediate offset.
+//
+let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
+    isExtendable = 1 in
+multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
+    // Predicated
+    def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
+    def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
+    // Predicated new
+    def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+                                              MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
+                                              MajOp, 1, 1>;
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "imm" in {
+  let accessSize = ByteAccess in
+  defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+                           u6_0Ext, 0b00>, AddrModeRel;
+
+  let accessSize = HalfWordAccess, opExtentAlign = 1 in
+  defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+                           u6_1Ext, 0b01>, AddrModeRel;
+
+  let accessSize = WordAccess, opExtentAlign = 2 in
+  defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+                           u6_2Ext, 0b10>, AddrModeRel;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment loads with register offset.
+//===----------------------------------------------------------------------===//
+
+let hasNewValue = 1 in
+def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
+
+def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
+
+let hasSideEffects = 0, addrMode = PostInc in
+class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
+  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
+              (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
+  "$dst = "#mnemonic#"($src2++$src3)", [],
+  "$src1 = $dst, $src2 = $_dst_"> {
+    bits<5> dst;
+    bits<5> src2;
+    bits<1> src3;
+
+    let accessSize = AccessSz;
+    let IClass = 0b1001;
+
+    let Inst{27-25} = 0b110;
+    let Inst{24-21} = MajOp;
+    let Inst{20-16} = src2;
+    let Inst{13}    = src3;
+    let Inst{12}    = 0b0;
+    let Inst{7}     = 0b0;
+    let Inst{4-0}   = dst;
+  }
+
+def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
+def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
+
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated post increment .new stores
+// mem[bhwd](Rx++#s4:[0123])=Nt.new
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
+    isNewValue = 1, opNewValue = 3 in
+class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
+  mnemonic#"($src1++#$offset) = $src2.new",
+  [], "$src1 = $_dst_">,
+  AddrModeRel {
+    bits<5> src1;
+    bits<3> src2;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0}));
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = src1;
+    let Inst{13} = 0b0;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src2;
+    let Inst{7} = 0b0;
+    let Inst{6-3} = offsetBits;
+    let Inst{1} = 0b0;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated post increment .new stores
+// if([!]Pv[.new]) mem[bhwd](Rx++#s4:[0123])=Nt.new
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
+    isNewValue = 1, opNewValue = 4 in
+class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
+                         bits<2> MajOp, bit isPredNot, bit isPredNew >
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins PredRegs:$src1, IntRegs:$src2,
+                      ImmOp:$offset, IntRegs:$src3),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#mnemonic#"($src2++#$offset) = $src3.new",
+  [], "$src2 = $_dst_">,
+  AddrModeRel {
+    bits<2> src1;
+    bits<5> src2;
+    bits<3> src3;
+    bits<7> offset;
+    bits<4> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
+                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
+                                      /* s4_0Imm */ offset{3-0}));
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = src2;
+    let Inst{13} = 0b1;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8} = src3;
+    let Inst{7} = isPredNew;
+    let Inst{6-3} = offsetBits;
+    let Inst{2} = isPredNot;
+    let Inst{1-0} = src1;
+  }
+
+multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
+                              bits<2> MajOp, bit PredNot> {
+  def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
+
+  // Predicate new
+  def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
+}
+
+multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
+                         bits<2> MajOp> {
+  let BaseOpcode = "POST_"#BaseOp in {
+    def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
+
+    // Predicated
+    defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
+    defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
+  }
+}
+
+let accessSize = ByteAccess in
+defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
+
+let accessSize = HalfWordAccess in
+defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
+
+let accessSize = WordAccess in
+defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
+
+//===----------------------------------------------------------------------===//
+// Template class for post increment .new stores with register offset
+//===----------------------------------------------------------------------===//
+let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
+class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
+  : NVInstPI_V4 <(outs IntRegs:$_dst_),
+                 (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
+  #mnemonic#"($src1++$src2) = $src3.new",
+  [], "$src1 = $_dst_"> {
+    bits<5> src1;
+    bits<1> src2;
+    bits<3> src3;
+    let accessSize = AccessSz;
+
+    let IClass = 0b1010;
+
+    let Inst{27-21} = 0b1101101;
+    let Inst{20-16} = src1;
+    let Inst{13}    = src2;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src3;
+    let Inst{7}     = 0b0;
+  }
+
+def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
+def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
+def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
+
+// memb(Rx++#s4:0:circ(Mu))=Nt.new
+// memb(Rx++I:circ(Mu))=Nt.new
+// memb(Rx++Mu:brev)=Nt.new
+// memh(Rx++#s4:1:circ(Mu))=Nt.new
+// memh(Rx++I:circ(Mu))=Nt.new
+// memh(Rx++Mu)=Nt.new
+// memh(Rx++Mu:brev)=Nt.new
+
+// memw(Rx++#s4:2:circ(Mu))=Nt.new
+// memw(Rx++I:circ(Mu))=Nt.new
+// memw(Rx++Mu)=Nt.new
+// memw(Rx++Mu:brev)=Nt.new
+
+//===----------------------------------------------------------------------===//
+// NV/ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NV/J +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps with the register
+// operands.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
+class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
+                      bit isNegCond, bit isTak>
+  : NVInst_V4<(outs),
+    (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+    "if ("#!if(isNegCond, "!","")#mnemonic#
+    "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
+    "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
+    #!if(isTak, "t","nt")#" $offset", []> {
+
+      bits<5> src1;
+      bits<5> src2;
+      bits<3> Ns;    // New-Value Operand
+      bits<5> RegOp; // Non-New-Value Operand
+      bits<11> offset;
+
+      let isTaken = isTak;
+      let isPredicatedFalse = isNegCond;
+      let opNewValue{0} = NvOpNum;
+
+      let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
+      let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
+
+      let IClass = 0b0010;
+      let Inst{27-26} = 0b00;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = Ns;
+      let Inst{13} = isTak;
+      let Inst{12-8} = RegOp;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+
+multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
+                       bit isNegCond> {
+  // Branch not taken:
+  def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
+  // Branch taken:
+  def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
+}
+
+// NvOpNum = 0 -> First Operand is a new-value Register
+// NvOpNum = 1 -> Second Operand is a new-value Register
+
+multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
+                       bit NvOpNum> {
+  let BaseOpcode = BaseOp#_NVJ in {
+    defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
+    defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
+  }
+}
+
+// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_cmpeq  : NVJrr_base<"cmp.eq",  "CMPEQ",  0b000, 0>, PredRel;
+  defm J4_cmpgt  : NVJrr_base<"cmp.gt",  "CMPGT",  0b001, 0>, PredRel;
+  defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
+  defm J4_cmplt  : NVJrr_base<"cmp.gt",  "CMPLT",  0b011, 1>, PredRel;
+  defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and an unsigned immediate (U5) operand.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
+class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
+                         bit isTak>
+  : NVInst_V4<(outs),
+    (ins IntRegs:$src1, u5_0Imm:$src2, brtarget:$offset),
+    "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
+    #!if(isTak, "t","nt")#" $offset", []> {
+
+      let isTaken = isTak;
+      let isPredicatedFalse = isNegCond;
+      let isTaken = isTak;
+
+      bits<3> src1;
+      bits<5> src2;
+      bits<11> offset;
+
+      let IClass = 0b0010;
+      let Inst{26} = 0b1;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = src1;
+      let Inst{13} = isTak;
+      let Inst{12-8} = src2;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
+  // Branch not taken:
+  def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
+  // Branch taken:
+  def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
+}
+
+multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
+  let BaseOpcode = BaseOp#_NVJri in {
+    defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
+    defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
+  }
+}
+
+// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
+// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_cmpeqi  : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
+  defm J4_cmpgti  : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
+  defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass/template class for the new-value compare jumps instruction
+// with a register and an hardcoded 0/-1 immediate value.
+//===----------------------------------------------------------------------===//
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
+    opExtentAlign = 2 in
+class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
+                            bit isNegCond, bit isTak>
+  : NVInst_V4<(outs),
+    !if(!eq(ImmVal, "{-1}"),
+        (ins IntRegs:$src1, n1Const:$n1, brtarget:$offset),
+        (ins IntRegs:$src1, brtarget:$offset)),
+    "if ("#!if(isNegCond, "!","")#mnemonic
+    #"($src1.new, #" # !if(!eq(ImmVal, "{-1}"), "$n1", ImmVal) # ")) jump:"
+    #!if(isTak, "t","nt")#" $offset", []> {
+
+      let isTaken = isTak;
+      let isPredicatedFalse = isNegCond;
+      let isTaken = isTak;
+      let opExtendable = !if(!eq(ImmVal, "{-1}"), 2, 1);
+
+      bits<3> src1;
+      bits<11> offset;
+      let IClass = 0b0010;
+      let Inst{26} = 0b1;
+      let Inst{25-23} = majOp;
+      let Inst{22} = isNegCond;
+      let Inst{18-16} = src1;
+      let Inst{13} = isTak;
+      let Inst{21-20} = offset{10-9};
+      let Inst{7-1} = offset{8-2};
+}
+
+multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
+                             bit isNegCond> {
+  // Branch not taken:
+  def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
+  // Branch taken:
+  def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
+}
+
+multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
+                             string ImmVal> {
+  let BaseOpcode = BaseOp#_NVJ_ConstImm in {
+    defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
+    defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
+  }
+}
+
+// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
+// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
+// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
+
+let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
+    Defs = [PC], hasSideEffects = 0 in {
+  defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
+  defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ",  0b100, "{-1}">, PredRel;
+  defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT",  0b101, "{-1}">, PredRel;
+}
+
+// J4_hintjumpr: Hint indirect conditional jump.
+let isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def J4_hintjumpr: JRInst <
+  (outs),
+  (ins IntRegs:$Rs),
+  "hintjr($Rs)"> {
+    bits<5> Rs;
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0010101;
+    let Inst{20-16} = Rs;
+  }
+
+//===----------------------------------------------------------------------===//
+// NV/J -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+
+// PC-relative add
+let hasNewValue = 1, isExtendable = 1, opExtendable = 1,
+    isExtentSigned = 0, opExtentBits = 6, hasSideEffects = 0, Uses = [PC] in
+def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6_0Ext:$u6),
+  "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
+    bits<5> Rd;
+    bits<6> u6;
+
+    let IClass = 0b0110;
+    let Inst{27-16} = 0b101001001001;
+    let Inst{12-7} = u6;
+    let Inst{4-0} = Rd;
+  }
+
+
+
+let hasSideEffects = 0 in
+class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
+    : CRInst<(outs PredRegs:$Pd),
+             (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
+             "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
+                   !if (IsNeg,"!","") # "$Pu))",
+             [], "", CR_tc_2early_SLOT23> {
+  bits<2> Pd;
+  bits<2> Ps;
+  bits<2> Pt;
+  bits<2> Pu;
+
+  let IClass = 0b0110;
+  let Inst{27-24} = 0b1011;
+  let Inst{23} = IsNeg;
+  let Inst{22-21} = OpBits;
+  let Inst{20} = 0b1;
+  let Inst{17-16} = Ps;
+  let Inst{13} = 0b0;
+  let Inst{9-8} = Pt;
+  let Inst{7-6} = Pu;
+  let Inst{1-0} = Pd;
+}
+
+def C4_and_and  : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
+def C4_and_or   : T_LOGICAL_3OP<"and", "or",  0b01, 0>;
+def C4_or_and   : T_LOGICAL_3OP<"or",  "and", 0b10, 0>;
+def C4_or_or    : T_LOGICAL_3OP<"or",  "or",  0b11, 0>;
+def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
+def C4_and_orn  : T_LOGICAL_3OP<"and", "or",  0b01, 1>;
+def C4_or_andn  : T_LOGICAL_3OP<"or",  "and", 0b10, 1>;
+def C4_or_orn   : T_LOGICAL_3OP<"or",  "or",  0b11, 1>;
+
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU +
+//===----------------------------------------------------------------------===//
+
+// Logical with-not instructions.
+def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
+def A4_ornp  : T_ALU64_logical<"or",  0b011, 1, 0, 1>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101111;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+//  Add and accumulate.
+//  Rd=add(Rs,add(Ru,#s6))
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 6,
+    opExtendable = 3 in
+def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
+                            (ins IntRegs:$Rs, IntRegs:$Ru, s6_0Ext:$s6),
+  "$Rd = add($Rs, add($Ru, #$s6))" , [],
+  "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Ru;
+    bits<6> s6;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b10110;
+    let Inst{22-21} = s6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = s6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = s6{2-0};
+    let Inst{4-0}   = Ru;
+  }
+
+let isExtentSigned = 1, hasSideEffects = 0, hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 6, opExtendable = 2 in
+def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
+                           (ins IntRegs:$Rs, s6_0Ext:$s6, IntRegs:$Ru),
+  "$Rd = add($Rs, sub(#$s6, $Ru))",
+  [], "", ALU64_tc_2_SLOT23> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<6> s6;
+    bits<5> Ru;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b10111;
+    let Inst{22-21} = s6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = s6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = s6{2-0};
+    let Inst{4-0}   = Ru;
+  }
+
+def S4_extractp_rp : T_S3op_64 < "extract",  0b11, 0b100, 0>;
+def S4_extractp    : T_S2op_extract <"extract",  0b1010, DoubleRegs, u6_0Imm>;
+
+let hasNewValue = 1 in {
+  def S4_extract_rp : T_S3op_extract<"extract",  0b01>;
+  def S4_extract    : T_S2op_extract <"extract",  0b1101, IntRegs, u5_0Imm>;
+}
+
+// Complex add/sub halfwords/words
+let Defs = [USR_OVF] in {
+  def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
+  def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
+  def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
+  def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
+}
+
+let Defs = [USR_OVF] in {
+  def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
+  def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
+}
+
+let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
+  def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
+  def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
+}
+
+// Logical xor with xor accumulation.
+// Rxx^=xor(Rss,Rtt)
+let hasSideEffects = 0 in
+def M4_xor_xacc
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Rxx ^= xor($Rss, $Rtt)", [],
+  "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b101010;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rxx;
+  }
+
+// Rotate and reduce bytes
+// Rdd=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate
+  : SInst <(outs DoubleRegs:$Rdd),
+           (ins DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
+  "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
+  [], "", S_3op_tc_3x_SLOT23> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rt;
+    bits<2> u2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b001111;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = u2{1};
+    let Inst{12-8}  = Rt;
+    let Inst{7-6}   = 0b11;
+    let Inst{5}     = u2{0};
+    let Inst{4-0}   = Rdd;
+  }
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+let hasSideEffects = 0 in
+def S4_vrcrotate_acc
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
+  "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
+  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+    bits<2> u2;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = u2{1};
+    let Inst{12-8}  = Rt;
+    let Inst{5}     = u2{0};
+    let Inst{4-0}   = Rxx;
+  }
+
+// Vector reduce conditional negate halfwords
+let hasSideEffects = 0 in
+def S2_vrcnegh
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
+  "$Rxx += vrcnegh($Rss, $Rt)", [],
+  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011001;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = 0b1;
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = 0b111;
+    let Inst{4-0}   = Rxx;
+  }
+
+// Split bitfield
+def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
+
+// Arithmetic/Convergent round
+def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
+
+def A4_round_ri  : T_S2op_2_ii <"round", 0b111, 0b100>;
+
+let Defs = [USR_OVF] in
+def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
+
+// Logical-logical words.
+// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
+let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 10,
+    opExtendable = 3 in
+def S4_or_andix:
+  ALU64Inst<(outs IntRegs:$Rx),
+            (ins IntRegs:$Ru, IntRegs:$_src_, s10_0Ext:$s10),
+  "$Rx = or($Ru, and($_src_, #$s10))" , [] ,
+  "$_src_ = $Rx", ALU64_tc_2_SLOT23> {
+    bits<5> Rx;
+    bits<5> Ru;
+    bits<10> s10;
+
+    let IClass = 0b1101;
+
+    let Inst{27-22} = 0b101001;
+    let Inst{20-16} = Rx;
+    let Inst{21}    = s10{9};
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Ru;
+  }
+
+// Miscellaneous ALU64 instructions.
+//
+let hasNewValue = 1, hasSideEffects = 0 in
+def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0011111;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7-5} = 0b111;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0100;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0 in
+def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b0100;
+  let Inst{21} = 0b0;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101100;
+  let Inst{20-16} = Rs;
+  let Inst{12-8} = Rt;
+  let Inst{7} = 0b0;
+  let Inst{4-0} = Rd;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, IntRegs:$Rt),
+      "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
+  bits<5> Rd;
+  bits<5> Rs;
+  bits<5> Rt;
+
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0101100;
+  let Inst{20-16} = Rt;
+  let Inst{12-8} = Rs;
+  let Inst{7} = 0b1;
+  let Inst{4-0} = Rd;
+}
+
+// Rx[&|]=xor(Rs,Rt)
+def M4_or_xor   : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
+def M4_and_xor  : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
+
+// Rx[&|^]=or(Rs,Rt)
+def M4_xor_or   : T_MType_acc_rr < "^= or",  0b110, 0b011, 0>;
+
+let CextOpcode = "ORr_ORr" in
+def M4_or_or    : T_MType_acc_rr < "|= or",  0b110, 0b000, 0>;
+def M4_and_or   : T_MType_acc_rr < "&= or",  0b010, 0b001, 0>;
+
+// Rx[&|^]=and(Rs,Rt)
+def M4_xor_and  : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
+
+let CextOpcode = "ORr_ANDr" in
+def M4_or_and   : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
+def M4_and_and  : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
+
+// Rx[&|^]=and(Rs,~Rt)
+def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
+def M4_or_andn  : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
+def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
+
+// Compound or-or and or-and
+let isExtentSigned = 1, InputType = "imm", hasNewValue = 1, isExtendable = 1,
+    opExtentBits = 10, opExtendable = 3 in
+class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
+  : MInst_acc <(outs IntRegs:$Rx),
+               (ins IntRegs:$src1, IntRegs:$Rs, s10_0Ext:$s10),
+  "$Rx |= "#mnemonic#"($Rs, #$s10)", [],
+  "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<10> s10;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1010;
+    let Inst{23-22} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{21}    = s10{9};
+    let Inst{13-5}  = s10{8-0};
+    let Inst{4-0}   = Rx;
+  }
+
+let CextOpcode = "ORr_ANDr" in
+def S4_or_andi : T_CompOR <"and", 0b00, and>;
+
+let CextOpcode = "ORr_ORr" in
+def S4_or_ori : T_CompOR <"or", 0b10, or>;
+
+//    Modulo wrap
+//        Rd=modwrap(Rs,Rt)
+//    Round
+//        Rd=cround(Rs,#u5)
+//        Rd=cround(Rs,Rt)
+//        Rd=round(Rs,#u5)[:sat]
+//        Rd=round(Rs,Rt)[:sat]
+//    Vector reduce add unsigned halfwords
+//        Rd=vraddh(Rss,Rtt)
+//    Vector add bytes
+//        Rdd=vaddb(Rss,Rtt)
+//    Vector conditional negate
+//        Rdd=vcnegh(Rss,Rt)
+//        Rxx+=vrcnegh(Rss,Rt)
+//    Vector maximum bytes
+//        Rdd=vmaxb(Rtt,Rss)
+//    Vector reduce maximum halfwords
+//        Rxx=vrmaxh(Rss,Ru)
+//        Rxx=vrmaxuh(Rss,Ru)
+//    Vector reduce maximum words
+//        Rxx=vrmaxuw(Rss,Ru)
+//        Rxx=vrmaxw(Rss,Ru)
+//    Vector minimum bytes
+//        Rdd=vminb(Rtt,Rss)
+//    Vector reduce minimum halfwords
+//        Rxx=vrminh(Rss,Ru)
+//        Rxx=vrminuh(Rss,Ru)
+//    Vector reduce minimum words
+//        Rxx=vrminuw(Rss,Ru)
+//        Rxx=vrminw(Rss,Ru)
+//    Vector subtract bytes
+//        Rdd=vsubb(Rss,Rtt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT +
+//===----------------------------------------------------------------------===//
+
+// Bit reverse
+def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
+
+// Bit count
+def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
+def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
+def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6_0Imm:$s6),
+    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  bits<6> s6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = s6;
+  let Inst{7-5} = 0b000;
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6_0Imm:$s6),
+    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
+  bits<5> Rs;
+  bits<5> Rd;
+  bits<6> s6;
+  let IClass = 0b1000;
+  let Inst{27-24} = 0b1000;
+  let Inst{23-21} = 0b011;
+  let Inst{20-16} = Rs;
+  let Inst{13-8} = s6;
+  let Inst{7-5} = 0b010;
+  let Inst{4-0} = Rd;
+}
+
+
+// Bit test/set/clear
+def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
+def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
+
+def C4_nbitsset  : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
+def C4_nbitsclr  : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
+def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY +
+//===----------------------------------------------------------------------===//
+
+// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immed and add immed.
+
+let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
+  (ins u6_0Ext:$u6, IntRegs:$Rs, u6_0Imm:$U6),
+  "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [],"",ALU64_tc_3x_SLOT23> {
+    bits<5> Rd;
+    bits<6> u6;
+    bits<5> Rs;
+    bits<6> U6;
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1000;
+    let Inst{23}    = U6{5};
+    let Inst{22-21} = u6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = u6{3};
+    let Inst{12-8}  = Rd;
+    let Inst{7-5}   = u6{2-0};
+    let Inst{4-0}   = U6{4-0};
+  }
+
+// Rd=add(#u6,mpyi(Rs,Rt))
+let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
+    isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
+def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
+  (ins u6_0Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [], "", ALU64_tc_3x_SLOT23>, ImmRegRel {
+    bits<5> Rd;
+    bits<6> u6;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b01110;
+    let Inst{22-21} = u6{5-4};
+    let Inst{20-16} = Rs;
+    let Inst{13}    = u6{3};
+    let Inst{12-8}  = Rt;
+    let Inst{7-5}   = u6{2-0};
+    let Inst{4-0}   = Rd;
+  }
+
+let hasNewValue = 1 in
+class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins>
+  : ALU64Inst <(outs IntRegs:$dst), ins,
+  "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))",
+                                      "#$src2, $src3))"), [],
+  "", ALU64_tc_3x_SLOT23> {
+    bits<5> dst;
+    bits<5> src1;
+    bits<8> src2;
+    bits<5> src3;
+
+    let IClass = 0b1101;
+
+    bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2});
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23}    = MajOp;
+    let Inst{22-21} = ImmValue{5-4};
+    let Inst{20-16} = src3;
+    let Inst{13}    = ImmValue{3};
+    let Inst{12-8}  = dst;
+    let Inst{7-5}   = ImmValue{2-0};
+    let Inst{4-0}   = src1;
+  }
+
+def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
+                       (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
+
+let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
+    CextOpcode = "ADD_MPY", InputType = "imm" in
+def M4_mpyri_addr : T_AddMpy<0b1, u32_0ImmPred,
+                    (ins IntRegs:$src1, IntRegs:$src3, u6_0Ext:$src2)>, ImmRegRel;
+
+// Rx=add(Ru,mpyi(Rx,Rs))
+let CextOpcode = "ADD_MPY", InputType = "reg", hasNewValue = 1 in
+def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
+                              (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
+  "$Rx = add($Ru, mpyi($_src_, $Rs))", [],
+  "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel {
+    bits<5> Rx;
+    bits<5> Ru;
+    bits<5> Rs;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b0011000;
+    let Inst{12-8} = Rx;
+    let Inst{4-0} = Ru;
+    let Inst{20-16} = Rs;
+  }
+
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
+def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
+def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
+def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
+def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<]:sat
+def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
+def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
+def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
+
+// Rxx^=vpmpyh(Rs,Rt)
+def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
+
+// Rxx^=pmpyw(Rs,Rt)
+def M4_pmpyw_acc  : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/Vector compare
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Template class for vector compare
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in
+class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
+  : ALU64_rr <(outs PredRegs:$Pd),
+              (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
+  "$Pd = "#Str#"($Rss, #$Imm)",
+  [], "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<32> Imm;
+    bits<8> ImmBits;
+    let ImmBits{6-0} = Imm{6-0};
+    let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
+
+    let IClass = 0b1101;
+
+    let Inst{27-24} = 0b1100;
+    let Inst{22-21} = cmpOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-5} = ImmBits;
+    let Inst{4-3} = minOp;
+    let Inst{1-0} = Pd;
+  }
+
+// Vector compare bytes
+def A4_vcmpbgt   : T_vcmp <"vcmpb.gt", 0b1010>;
+
+let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in
+def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.gt", 0b1000>;
+
+def A4_vcmpbeqi  : T_vcmpImm <"vcmpb.eq",  0b00, 0b00, u8_0Imm>;
+def A4_vcmpbgti  : T_vcmpImm <"vcmpb.gt",  0b01, 0b00, s8_0Imm>;
+def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7_0Imm>;
+
+// Vector compare halfwords
+def A4_vcmpheqi  : T_vcmpImm <"vcmph.eq",  0b00, 0b01, s8_0Imm>;
+def A4_vcmphgti  : T_vcmpImm <"vcmph.gt",  0b01, 0b01, s8_0Imm>;
+def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7_0Imm>;
+
+// Vector compare words
+def A4_vcmpweqi  : T_vcmpImm <"vcmpw.eq",  0b00, 0b10, s8_0Imm>;
+def A4_vcmpwgti  : T_vcmpImm <"vcmpw.gt",  0b01, 0b10, s8_0Imm>;
+def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7_0Imm>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+// Shift by immediate and accumulate/logical.
+// Rx=add(#u8,asl(Rx,#U5))  Rx=add(#u8,lsr(Rx,#U5))
+// Rx=sub(#u8,asl(Rx,#U5))  Rx=sub(#u8,lsr(Rx,#U5))
+// Rx=and(#u8,asl(Rx,#U5))  Rx=and(#u8,lsr(Rx,#U5))
+// Rx=or(#u8,asl(Rx,#U5))   Rx=or(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+    hasNewValue = 1, opNewValue = 0 in
+class T_S4_ShiftOperate<string MnOp, string MnSh, bit asl_lsr,
+                        bits<2> MajOp, InstrItinClass Itin>
+  : MInst_acc<(outs IntRegs:$Rd), (ins u8_0Ext:$u8, IntRegs:$Rx, u5_0Imm:$U5),
+      "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))",
+      [], "$Rd = $Rx", Itin> {
+
+  bits<5> Rd;
+  bits<8> u8;
+  bits<5> Rx;
+  bits<5> U5;
+
+  let IClass = 0b1101;
+  let Inst{27-24} = 0b1110;
+  let Inst{23-21} = u8{7-5};
+  let Inst{20-16} = Rd;
+  let Inst{13} = u8{4};
+  let Inst{12-8} = U5;
+  let Inst{7-5} = u8{3-1};
+  let Inst{4} = asl_lsr;
+  let Inst{3} = u8{0};
+  let Inst{2-1} = MajOp;
+}
+
+multiclass T_ShiftOperate<string mnemonic, bits<2> MajOp, InstrItinClass Itin> {
+  def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", 0, MajOp, Itin>;
+  def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", 1, MajOp, Itin>;
+}
+
+defm S4_addi : T_ShiftOperate<"add", 0b10, ALU64_tc_2_SLOT23>;
+defm S4_andi : T_ShiftOperate<"and", 0b00, ALU64_tc_2_SLOT23>;
+defm S4_ori  : T_ShiftOperate<"or",  0b01, ALU64_tc_1_SLOT23>;
+defm S4_subi : T_ShiftOperate<"sub", 0b11, ALU64_tc_1_SLOT23>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def S2_vcnegh   : T_S3op_shiftVect < "vcnegh",   0b11, 0b01>;
+
+// Rd=[cround|round](Rs,Rt)
+let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
+  def A4_cround_rr    : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
+  def A4_round_rr     : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
+}
+
+// Rd=round(Rs,Rt):sat
+let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
+def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
+
+// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+  def M4_cmpyi_wh     : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
+  def M4_cmpyr_wh     : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
+}
+
+// Rdd=[add|sub](Rss,Rtt,Px):carry
+let isPredicateLate = 1, hasSideEffects = 0 in
+class T_S3op_carry <string mnemonic, bits<3> MajOp>
+  : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
+            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
+  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
+  [], "$Px = $Pu", S_3op_tc_1_SLOT23 > {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<5> Rtt;
+    bits<2> Pu;
+
+    let IClass = 0b1100;
+
+    let Inst{27-24} = 0b0010;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rss;
+    let Inst{12-8}  = Rtt;
+    let Inst{6-5}   = Pu;
+    let Inst{4-0}   = Rdd;
+  }
+
+def A4_addp_c : T_S3op_carry < "add", 0b110 >;
+def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
+
+let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in
+class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
+  : SInst <(outs DoubleRegs:$Rxx),
+           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
+  "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
+  [] , "$dst2 = $Rxx"> {
+    bits<5> Rxx;
+    bits<5> Rss;
+    bits<5> Ru;
+
+    let IClass = 0b1100;
+
+    let Inst{27-21} = 0b1011001;
+    let Inst{20-16} = Rss;
+    let Inst{13}    = isUnsigned;
+    let Inst{12-8}  = Rxx;
+    let Inst{7-5}   = MinOp;
+    let Inst{4-0}   = Ru;
+  }
+
+// Vector reduce maximum halfwords
+// Rxx=vrmax[u]h(Rss,Ru)
+def A4_vrmaxh  : T_S3op_6 < "vrmaxh",  0b001, 0>;
+def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
+
+// Vector reduce maximum words
+// Rxx=vrmax[u]w(Rss,Ru)
+def A4_vrmaxw  : T_S3op_6 < "vrmaxw",  0b010, 0>;
+def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
+
+// Vector reduce minimum halfwords
+// Rxx=vrmin[u]h(Rss,Ru)
+def A4_vrminh  : T_S3op_6 < "vrminh",  0b101, 0>;
+def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
+
+// Vector reduce minimum words
+// Rxx=vrmin[u]w(Rss,Ru)
+def A4_vrminw  : T_S3op_6 < "vrminw",  0b110, 0>;
+def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
+
+// Shift an immediate left by register amount.
+let hasNewValue = 1, hasSideEffects = 0 in
+def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6_0Imm:$s6, IntRegs:$Rt),
+  "$Rd = lsl(#$s6, $Rt)" , [], "", S_3op_tc_1_SLOT23> {
+    bits<5> Rd;
+    bits<6> s6;
+    bits<5> Rt;
+
+    let IClass = 0b1100;
+
+    let Inst{27-22} = 0b011010;
+    let Inst{20-16} = s6{5-1};
+    let Inst{12-8}  = Rt;
+    let Inst{7-6}   = 0b11;
+    let Inst{4-0}   = Rd;
+    let Inst{5}     = s6{0};
+  }
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MEMOP
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the register value.
+//===----------------------------------------------------------------------===//
+class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4<(outs),
+                 (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
+                 opc#"($base+#$offset)"#memOp#"$delta",
+                 []>,
+                 Requires<[UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let opExtentAlign = opcBits;
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1110;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the immediate value.
+//===----------------------------------------------------------------------===//
+class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4 <(outs),
+                  (ins IntRegs:$base, ImmOp:$offset, u5_0Imm:$delta),
+                  opc#"($base+#$offset)"#memOp#"#$delta"
+                  #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
+                  []>,
+                  Requires<[UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let opExtentAlign = opcBits;
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1111;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+// multiclass to define MemOp instructions with register operand.
+multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
+  def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
+  def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
+  def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
+  def L4_or#NAME  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
+}
+
+// multiclass to define MemOp instructions with immediate Operand.
+multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
+  def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
+  def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
+  def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>;
+  def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>;
+}
+
+multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
+  defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
+  defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
+}
+
+// Define MemOp instructions.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
+  let opExtentBits = 6, accessSize = ByteAccess in
+  defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
+
+  let opExtentBits = 7, accessSize = HalfWordAccess in
+  defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
+
+  let opExtentBits = 8, accessSize = WordAccess in
+  defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Hexagon V4 only supports these flavors of byte/half compare instructions:
+// EQ/GT/GTU. Other flavors like GE/GEU/LT/LTU/LE/LEU are not supported by
+// hardware. However, compiler can still implement these patterns through
+// appropriate patterns combinations based on current implemented patterns.
+// The implemented patterns are: EQ/GT/GTU.
+// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
+
+// Following instruction is not being extended as it results into the
+// incorrect code for negative numbers.
+// Pd=cmpb.eq(Rs,#u8)
+
+// p=!cmp.eq(r1,#s10)
+def C4_cmpneqi  : T_CMP <"cmp.eq",  0b00, 1, s10_0Ext>;
+def C4_cmpltei  : T_CMP <"cmp.gt",  0b01, 1, s10_0Ext>;
+def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9_0Ext>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Multiclass for DeallocReturn
+//===----------------------------------------------------------------------===//
+class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
+  : LD0Inst<(outs), (ins PredRegs:$src),
+  !if(isNot, "if (!$src", "if ($src")#
+  !if(isPredNew, ".new) ", ") ")#mnemonic#
+  !if(isPredNew, #!if(isTak,":t", ":nt"),""),
+  [], "", LD_tc_3or4stall_SLOT0> {
+
+    bits<2> src;
+    let BaseOpcode = "L4_RETURN";
+    let isPredicatedFalse = isNot;
+    let isPredicatedNew = isPredNew;
+    let isTaken = isTak;
+    let IClass = 0b1001;
+
+    let Inst{27-16} = 0b011000011110;
+
+    let Inst{13} = isNot;
+    let Inst{12} = isTak;
+    let Inst{11} = isPredNew;
+    let Inst{10} = 0b0;
+    let Inst{9-8} = src;
+    let Inst{4-0} = 0b11110;
+  }
+
+// Produce all predicated forms, p, !p, p.new, !p.new, :t, :nt
+multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
+  let isPredicated = 1 in {
+    def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>;
+    def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>;
+    def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>;
+  }
+}
+
+multiclass LD_MISC_L4_RETURN<string mnemonic> {
+  let isBarrier = 1, isPredicable = 1 in
+    def NAME : LD0Inst <(outs), (ins), mnemonic, [], "",
+                        LD_tc_3or4stall_SLOT0> {
+      let BaseOpcode = "L4_RETURN";
+      let IClass = 0b1001;
+      let Inst{27-16} = 0b011000011110;
+      let Inst{13-10} = 0b0000;
+      let Inst{4-0} = 0b11110;
+    }
+  defm t : L4_RETURN_PRED<mnemonic, 0 >;
+  defm f : L4_RETURN_PRED<mnemonic, 1 >;
+}
+
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
+defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
+  def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+
+  let isExtended = 1, opExtendable = 0 in
+  def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+
+  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+    def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
+
+    let isExtended = 1, opExtendable = 0 in
+    def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
+  }
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
+
+  let isExtended = 1, opExtendable = 0 in
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
+
+  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
+
+    let isExtended = 1, opExtendable = 0 in
+    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+  }
+}
+
+// Save registers function call.
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
+  def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
+
+  let isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
+
+  let Defs = [P0] in
+  def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
+
+  let Defs = [P0], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
+
+  let Defs = [R14, R15, R28] in
+  def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
+
+  let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
+
+  let Defs = [R14, R15, R28, P0] in
+  def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
+
+  let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1 in
+class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+                    bits<2>MajOp, bit isAbs, bit isHalf>
+  : STInst<(outs), (ins ImmOp:$addr, RC:$src),
+  mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""),
+  [], "", V2LDST_tc_st_SLOT01> {
+    bits<19> addr;
+    bits<5> src;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+    let Uses = !if (isAbs, [], [GP]);
+
+    let IClass = 0b0100;
+    let Inst{27} = 1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24}    = 0b0;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = isHalf;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13}    = offsetBits{8};
+    let Inst{12-8}  = src;
+    let Inst{7-0}   = offsetBits{7-0};
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
+class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
+                       bit isHalf, bit isNot, bit isNew>
+  : STInst<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, RC: $src2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+  ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
+  [], "", ST_tc_st_SLOT01>, AddrModeRel {
+    bits<2> src1;
+    bits<6> absaddr;
+    bits<5> src2;
+
+    let isPredicatedNew = isNew;
+    let isPredicatedFalse = isNot;
+    // Store upper-half and store doubleword cannot be NV.
+    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-22} = MajOp;
+    let Inst{21}    = isHalf;
+    let Inst{17-16} = absaddr{5-4};
+    let Inst{13}    = isNew;
+    let Inst{12-8}  = src2;
+    let Inst{7}     = 0b1;
+    let Inst{6-3}   = absaddr{3-0};
+    let Inst{2}     = isNot;
+    let Inst{1-0}   = src1;
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<2> MajOp, bit isHalf>
+  : T_StoreAbsGP <mnemonic, RC, u32_0MustExt, MajOp, 1, isHalf>,
+                  AddrModeRel {
+  string ImmOpStr = !cast<string>(ImmOp);
+  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                      /* u16_0Imm */ 16)));
+
+  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                       /* u16_0Imm */ 0)));
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass for store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1 in
+multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
+                  Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def PS_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
+
+    // Predicated
+    def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
+    def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
+
+    // .new Predicated
+    def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
+    def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated new-value store instructions with
+// GP-Relative or absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
+    isNewValue = 1, opNewValue = 1 in
+class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
+  : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
+  mnemonic #"(#$addr) = $src.new",
+  [], "", V2LDST_tc_st_SLOT0> {
+    bits<19> addr;
+    bits<3> src;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+    let IClass = 0b0100;
+
+    let Inst{27} = 1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24-21} = 0b0101;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13}    = offsetBits{8};
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src;
+    let Inst{7-0}   = offsetBits{7-0};
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
+    isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
+class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
+  : NVInst_V4<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, IntRegs:$src2),
+  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
+  ") ")#mnemonic#"(#$absaddr) = $src2.new",
+  [], "", ST_tc_st_SLOT0>, AddrModeRel {
+    bits<2> src1;
+    bits<6> absaddr;
+    bits<3> src2;
+
+    let isPredicatedNew = isNew;
+    let isPredicatedFalse = isNot;
+
+    let IClass = 0b1010;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = 0b101;
+    let Inst{17-16} = absaddr{5-4};
+    let Inst{13}    = isNew;
+    let Inst{12-11} = MajOp;
+    let Inst{10-8}  = src2;
+    let Inst{7}     = 0b1;
+    let Inst{6-3}   = absaddr{3-0};
+    let Inst{2}     = isNot;
+    let Inst{1-0}   = src1;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for non-predicated new-value store instructions with
+// absolute addressing.
+//===----------------------------------------------------------------------===//
+class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
+  : T_StoreAbsGP_NV <mnemonic, u32_0MustExt, MajOp>, AddrModeRel {
+
+  string ImmOpStr = !cast<string>(ImmOp);
+  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                      /* u16_0Imm */ 16)));
+
+  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                       /* u16_0Imm */ 0)));
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass for new-value store instructions with absolute addressing.
+//===----------------------------------------------------------------------===//
+let addrMode = Absolute, isExtended = 1  in
+multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
+                   bits<2> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def PS_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
+
+    // Predicated
+    def S4_p#NAME#newt_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
+    def S4_p#NAME#newf_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
+
+    // .new Predicated
+    def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
+    def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Stores with absolute addressing
+//===----------------------------------------------------------------------===//
+let accessSize = ByteAccess in
+defm storerb : ST_Abs    <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
+               ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
+
+let accessSize = HalfWordAccess in
+defm storerh : ST_Abs    <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
+               ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
+
+let accessSize = WordAccess in
+defm storeri : ST_Abs    <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
+               ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
+
+let isNVStorable = 0, accessSize = DoubleWordAccess in
+defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
+
+let isNVStorable = 0, accessSize = HalfWordAccess in
+defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
+
+//===----------------------------------------------------------------------===//
+// GP-relative stores.
+// mem[bhwd](#global)=Rt
+// Once predicated, these instructions map to absolute addressing mode.
+// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
+//===----------------------------------------------------------------------===//
+
+let Uses = [GP], isAsmParserOnly = 1 in
+class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
+                 Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
+  : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
+    // Set BaseOpcode same as absolute addressing instructions so that
+    // non-predicated GP-Rel instructions can have relate with predicated
+    // Absolute instruction.
+    let BaseOpcode = BaseOp#_abs;
+  }
+
+let Uses = [GP], isAsmParserOnly = 1 in
+multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
+                  bits<2> MajOp, bit isHalf = 0> {
+  // Set BaseOpcode same as absolute addressing instructions so that
+  // non-predicated GP-Rel instructions can have relate with predicated
+  // Absolute instruction.
+  let BaseOpcode = BaseOp#_abs in {
+    def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
+                                0, isHalf>;
+    // New-value store
+    def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp> ;
+  }
+}
+
+let accessSize = ByteAccess in
+defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
+
+let accessSize = HalfWordAccess in
+defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
+
+let accessSize = WordAccess in
+defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
+
+let isNVStorable = 0, accessSize = DoubleWordAccess in
+def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
+                              u16_3Imm, 0b11>, PredNewRel;
+
+let isNVStorable = 0, accessSize = HalfWordAccess in
+def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
+                              u16_1Imm, 0b01, 1>, PredNewRel;
+
+//===----------------------------------------------------------------------===//
+// Template class for non predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicable = 1, hasSideEffects = 0 in
+class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
+                   bits<3> MajOp>
+  : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
+  "$dst = "#mnemonic# "(#$addr)",
+  [], "", V2LDST_tc_ld_SLOT01> {
+    bits<5> dst;
+    bits<19> addr;
+    bits<16> offsetBits;
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
+                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
+                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
+                                      /* u16_0Imm */ addr{15-0})));
+
+    let IClass = 0b0100;
+
+    let Inst{27}    = 0b1;
+    let Inst{26-25} = offsetBits{15-14};
+    let Inst{24}    = 0b1;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = offsetBits{13-9};
+    let Inst{13-5}  = offsetBits{8-0};
+    let Inst{4-0}   = dst;
+  }
+
+class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
+                 bits<3> MajOp>
+  : T_LoadAbsGP <mnemonic, RC, u32_0MustExt, MajOp>, AddrModeRel {
+
+    string ImmOpStr = !cast<string>(ImmOp);
+    let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
+                       !if (!eq(ImmOpStr, "u16_2Imm"), 18,
+                       !if (!eq(ImmOpStr, "u16_1Imm"), 17,
+                                        /* u16_0Imm */ 16)));
+
+    let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
+                        !if (!eq(ImmOpStr, "u16_2Imm"), 2,
+                        !if (!eq(ImmOpStr, "u16_1Imm"), 1,
+                                        /* u16_0Imm */ 0)));
+  }
+
+//===----------------------------------------------------------------------===//
+// Template class for predicated load instructions with
+// absolute addressing mode.
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
+    opExtendable = 2 in
+class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
+                      bit isPredNot, bit isPredNew>
+  : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32_0MustExt:$absaddr),
+  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+  ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
+    bits<5> dst;
+    bits<2> src1;
+    bits<6> absaddr;
+
+    let isPredicatedNew = isPredNew;
+    let isPredicatedFalse = isPredNot;
+    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
+
+    let IClass = 0b1001;
+
+    let Inst{27-24} = 0b1111;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = absaddr{5-1};
+    let Inst{13} = 0b1;
+    let Inst{12} = isPredNew;
+    let Inst{11} = isPredNot;
+    let Inst{10-9} = src1;
+    let Inst{8} = absaddr{0};
+    let Inst{7} = 0b1;
+    let Inst{4-0} = dst;
+  }
+
+//===----------------------------------------------------------------------===//
+// Multiclass for the load instructions with absolute addressing mode.
+//===----------------------------------------------------------------------===//
+multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
+                       bit PredNot> {
+  def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
+  // Predicate new
+  def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
+}
+
+let addrMode = Absolute, isExtended = 1 in
+multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
+                  Operand ImmOp, bits<3> MajOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 1, isPredicable = 1 in
+    def PS_#NAME#abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
+
+    // Predicated
+    defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>;
+    defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>;
+  }
+}
+
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  defm loadrb  : LD_Abs<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
+  defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  defm loadrh  : LD_Abs<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
+  defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+defm loadri  : LD_Abs<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
+
+let accessSize = DoubleWordAccess in
+defm loadrd  : LD_Abs<"memd",  "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with GP-relative addressing mode.
+// Rx=mem[bhwd](##global)
+// Once predicated, these instructions map to absolute addressing mode.
+// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
+//===----------------------------------------------------------------------===//
+
+let isAsmParserOnly = 1, Uses = [GP] in
+class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
+                bits<3> MajOp>
+  : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
+    let BaseOpcode = BaseOp#_abs;
+  }
+
+let accessSize = ByteAccess, hasNewValue = 1 in {
+  def L2_loadrbgp  : T_LoadGP<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
+  def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
+}
+
+let accessSize = HalfWordAccess, hasNewValue = 1 in {
+  def L2_loadrhgp  : T_LoadGP<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
+  def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
+}
+
+let accessSize = WordAccess, hasNewValue = 1 in
+def L2_loadrigp  : T_LoadGP<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
+
+let accessSize = DoubleWordAccess in
+def L2_loadrdgp  : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
+
+//===----------------------------------------------------------------------===//
+// :raw for of boundscheck:hi:lo insns
+//===----------------------------------------------------------------------===//
+
+// A4_boundscheck_lo: Detect if a register is within bounds.
+let hasSideEffects = 0 in
+def A4_boundscheck_lo: ALU64Inst <
+  (outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = 0b1;
+    let Inst{7-5} = 0b100;
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+// A4_boundscheck_hi: Detect if a register is within bounds.
+let hasSideEffects = 0 in
+def A4_boundscheck_hi: ALU64Inst <
+  (outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
+  "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> Rtt;
+
+    let IClass = 0b1101;
+
+    let Inst{27-23} = 0b00100;
+    let Inst{13} = 0b1;
+    let Inst{7-5} = 0b101;
+    let Inst{1-0} = Pd;
+    let Inst{20-16} = Rss;
+    let Inst{12-8} = Rtt;
+  }
+
+let hasSideEffects = 0, isAsmParserOnly = 1 in
+def A4_boundscheck : MInst <
+  (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
+  "$Pd=boundscheck($Rs,$Rtt)">;
+
+// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
+let isPredicateLate = 1, hasSideEffects = 0 in
+def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
+  (ins DoubleRegs:$Rs, IntRegs:$Rt),
+  "$Pd = tlbmatch($Rs, $Rt)",
+  [], "", ALU64_tc_2early_SLOT23> {
+    bits<2> Pd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1101;
+    let Inst{27-23} = 0b00100;
+    let Inst{20-16} = Rs;
+    let Inst{13} = 0b1;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = 0b011;
+    let Inst{1-0} = Pd;
+  }
+
+// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
+// really do a load.
+let hasSideEffects = 1, mayLoad = 0 in
+def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
+      "dcfetch($Rs + #$u11_3)",
+      [], "", LD_tc_ld_SLOT0> {
+  bits<5> Rs;
+  bits<14> u11_3;
+
+  let IClass = 0b1001;
+  let Inst{27-21} = 0b0100000;
+  let Inst{20-16} = Rs;
+  let Inst{13} = 0b0;
+  let Inst{10-0} = u11_3{13-3};
+}
+
+
+//===----------------------------------------------------------------------===//
+// Compound instructions
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
+    opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
+    isTerminator = 1 in
+class CJInst_tstbit_R0<string px, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
+  ""#px#" = tstbit($Rs, #0); if ("
+    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+  bits<4> Rs;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{24-23} = 0b11;
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{9-8} = 0b11;
+  let Inst{7-1} = r9_2{8-2};
+}
+
+let Defs = [PC, P0], Uses = [P0] in {
+  def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
+  def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
+  def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
+  def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
+}
+
+let Defs = [PC, P1], Uses = [P1] in {
+  def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
+  def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
+  def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
+  def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
+}
+
+
+let isBranch = 1, hasSideEffects = 0,
+    isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
+    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
+    opExtendable = 2, isTerminator = 1 in
+class CJInst_RR<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs, $Rt); if ("
+   #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+  bits<4> Rs;
+  bits<4> Rt;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-23} = !if (!eq(op, "eq"),  0b01000,
+                    !if (!eq(op, "gt"),  0b01001,
+                    !if (!eq(op, "gtu"), 0b01010, 0)));
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  // px: Predicate reg 0/1
+  let Inst{12} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{11-8} = Rt;
+  let Inst{7-1} = r9_2{8-2};
+}
+
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_RR<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RR<string op>{
+  defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>;
+  defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>;
+}
+// TypeCJ Instructions compare RR and jump
+defm eq : T_pnp_CJInst_RR<"eq">;
+defm gt : T_pnp_CJInst_RR<"gt">;
+defm gtu : T_pnp_CJInst_RR<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
+class CJInst_RU5<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, u5_0Imm:$U5, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs, #$U5); if ("
+    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+  bits<4> Rs;
+  bits<5> U5;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  // px: Predicate reg 0/1
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+  let Inst{24-23} = !if (!eq(op, "eq"),  0b00,
+                    !if (!eq(op, "gt"),  0b01,
+                    !if (!eq(op, "gtu"), 0b10, 0)));
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{12-8} = U5;
+  let Inst{7-1} = r9_2{8-2};
+}
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_RU5<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_RU5<string op>{
+  defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>;
+  defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>;
+}
+// TypeCJ Instructions compare RI and jump
+defm eq : T_pnp_CJInst_RU5<"eq">;
+defm gt : T_pnp_CJInst_RU5<"gt">;
+defm gtu : T_pnp_CJInst_RU5<"gtu">;
+
+let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
+    isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1,
+    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 2,
+    isTerminator = 1 in
+class CJInst_Rn1<string px, string op, bit np, string tnt>
+  : InstHexagon<(outs), (ins IntRegs:$Rs, n1Const:$n1, brtarget:$r9_2),
+  ""#px#" = cmp."#op#"($Rs,#$n1); if ("
+  #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
+  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
+  bits<4> Rs;
+  bits<11> r9_2;
+
+  // np: !p[01]
+  let isPredicatedFalse = np;
+  // tnt: Taken/Not Taken
+  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
+  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
+
+  let IClass = 0b0001;
+  let Inst{27-26} = 0b00;
+  let Inst{25} = !if (!eq(px, "!p1"), 1,
+                 !if (!eq(px,  "p1"), 1, 0));
+
+  let Inst{24-23} = 0b11;
+  let Inst{22} = np;
+  let Inst{21-20} = r9_2{10-9};
+  let Inst{19-16} = Rs;
+  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
+  let Inst{9-8} = !if (!eq(op, "eq"),  0b00,
+                  !if (!eq(op, "gt"),  0b01, 0));
+  let Inst{7-1} = r9_2{8-2};
+}
+
+// P[10] taken/not taken.
+multiclass T_tnt_CJInst_Rn1<string op, bit np> {
+  let Defs = [PC, P0], Uses = [P0] in {
+    def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
+    def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
+  }
+  let Defs = [PC, P1], Uses = [P1] in {
+    def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
+    def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
+  }
+}
+// Predicate / !Predicate
+multiclass T_pnp_CJInst_Rn1<string op>{
+  defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>;
+  defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>;
+}
+// TypeCJ Instructions compare -1 and jump
+defm eq : T_pnp_CJInst_Rn1<"eq">;
+defm gt : T_pnp_CJInst_Rn1<"gt">;
+
+// J4_jumpseti: Direct unconditional jump and set register to immediate.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpseti: CJInst_JMPSET <
+  (outs IntRegs:$Rd),
+  (ins u6_0Imm:$U6, brtarget:$r9_2),
+  "$Rd = #$U6 ; jump $r9_2"> {
+    bits<4> Rd;
+    bits<6> U6;
+    bits<11> r9_2;
+
+    let IClass = 0b0001;
+    let Inst{27-24} = 0b0110;
+    let Inst{21-20} = r9_2{10-9};
+    let Inst{19-16} = Rd;
+    let Inst{13-8} = U6;
+    let Inst{7-1} = r9_2{8-2};
+  }
+
+// J4_jumpsetr: Direct unconditional jump and transfer register.
+let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
+    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
+    opExtentAlign = 2, opExtendable = 2 in
+def J4_jumpsetr: CJInst_JMPSET <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, brtarget:$r9_2),
+  "$Rd = $Rs ; jump $r9_2"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<11> r9_2;
+
+    let IClass = 0b0001;
+    let Inst{27-24} = 0b0111;
+    let Inst{21-20} = r9_2{10-9};
+    let Inst{11-8} = Rd;
+    let Inst{19-16} = Rs;
+    let Inst{7-1} = r9_2{8-2};
+  }
+
+// Duplex instructions
+//===----------------------------------------------------------------------===//
+include "HexagonIsetDx.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
new file mode 100644
index 000000000000..cd19b6916f21
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -0,0 +1,497 @@
+//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V5 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY
+//===----------------------------------------------------------------------===//
+
+  //Rdd[+]=vrmpybsu(Rss,Rtt)
+let Predicates = [HasV5T] in {
+  def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
+  def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
+
+  //Rdd[+]=vrmpybu(Rss,Rtt)
+  def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
+  def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
+
+  def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
+  def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
+}
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+let Predicates = [HasV5T] in {
+  def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
+  def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu",  0b100, 0b001, 0, 0, 0>;
+
+  // Rxx+=vmpyb[s]u(Rs,Rt)
+  def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
+  def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
+
+  // Rd=vaddhub(Rss,Rtt):sat
+  let hasNewValue = 1, opNewValue = 0 in
+    def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
+}
+
+def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6_0Imm, [], 1>,
+      Requires<[HasV5T]> {
+  bits<6> src2;
+  let Inst{13-8} = src2;
+}
+
+let isAsmParserOnly = 1 in
+def S2_asr_i_p_rnd_goodsyntax
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6_0Imm:$src2),
+    "$dst = asrrnd($src1, #$src2)">;
+
+def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
+  Requires<[HasV5T]> {
+  let Inst{13,7,4} = 0b111;
+}
+
+def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
+  Requires<[HasV5T]> {
+  let Inst{20,13,7,4} = 0b1111;
+}
+
+let hasNewValue = 1, validSubTargets = HasV5SubT in
+def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+  "$Rd = popcount($Rss)", [], "", S_2op_tc_2_SLOT23>,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rss;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1000011;
+    let Inst{7-5} = 0b011;
+    let Inst{4-0} = Rd;
+    let Inst{20-16} = Rss;
+  }
+
+let isFP = 1, hasNewValue = 1, opNewValue = 0 in
+class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
+  : MInst<(outs IntRegs:$Rd),
+          (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd = "#mnemonic#"($Rs, $Rt)", [],
+  "" , M_tc_3or4x_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-24} = 0b1011;
+    let Inst{23-21} = MajOp;
+    let Inst{20-16} = Rs;
+    let Inst{13} = 0b0;
+    let Inst{12-8} = Rt;
+    let Inst{7-5} = MinOp;
+    let Inst{4-0} = Rd;
+  }
+
+let isCommutable = 1 in {
+  def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
+  def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
+}
+
+def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>;
+
+let Itinerary = M_tc_3x_SLOT23 in {
+  def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
+  def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
+}
+
+let Itinerary = M_tc_3or4x_SLOT23 in {
+def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
+def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
+}
+
+// F2_sfrecipa: Reciprocal approximation for division.
+let Uses = [USR], isPredicateLate = 1, isFP = 1,
+    hasSideEffects = 0, hasNewValue = 1, Itinerary = M_tc_3or4x_SLOT23 in
+def F2_sfrecipa: MInst <
+  (outs IntRegs:$Rd, PredRegs:$Pe),
+  (ins IntRegs:$Rs, IntRegs:$Rt),
+  "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<2> Pe;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+    let Inst{27-21} = 0b1011111;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6-5}   = Pe;
+    let Inst{4-0}   = Rd;
+  }
+
+// F2_dfcmpeq: Floating point compare for equal.
+let Uses = [USR], isCompare = 1, isFP = 1 in
+class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
+              list<dag> pattern = [] >
+  : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
+  "$dst = "#mnemonic#"($src1, $src2)", pattern,
+  "" , ALU64_tc_2early_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<2> dst;
+    bits<5> src1;
+    bits<5> src2;
+
+    let IClass = 0b1101;
+
+    let Inst{27-21} = 0b0010111;
+    let Inst{20-16} = src1;
+    let Inst{12-8}  = src2;
+    let Inst{7-5}   = MinOp;
+    let Inst{1-0}   = dst;
+  }
+
+class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
+  : T_fcmp <mnemonic, DoubleRegs, MinOp, []> {
+  let IClass = 0b1101;
+  let Inst{27-21} = 0b0010111;
+}
+
+class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
+  : T_fcmp <mnemonic, IntRegs, MinOp, []> {
+  let IClass = 0b1100;
+  let Inst{27-21} = 0b0111111;
+}
+
+def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>;
+def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
+def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
+def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo,  0b011>;
+
+def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>;
+def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo,  0b001>;
+def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
+def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
+
+// F2 convert template classes:
+let Uses = [USR], isFP = 1 in
+class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+                         string chop ="">
+  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
+   "$Rdd = "#mnemonic#"($Rss)"#chop, [], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rdd;
+     bits<5> Rss;
+
+     let IClass = 0b1000;
+
+     let Inst{27-21} = 0b0000111;
+     let Inst{20-16} = Rss;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rdd;
+  }
+
+let Uses = [USR], isFP = 1 in
+class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
+                        string chop ="">
+  : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
+   "$Rdd = "#mnemonic#"($Rs)"#chop, [], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rdd;
+     bits<5> Rs;
+
+     let IClass = 0b1000;
+
+     let Inst{27-21} = 0b0100100;
+     let Inst{20-16} = Rs;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rdd;
+  }
+
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
+                        string chop ="">
+  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
+   "$Rd = "#mnemonic#"($Rss)"#chop, [], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rd;
+     bits<5> Rss;
+
+     let IClass = 0b1000;
+
+     let Inst{27-24} = 0b1000;
+     let Inst{23-21} = MinOp;
+     let Inst{20-16} = Rss;
+     let Inst{7-5} = 0b001;
+     let Inst{4-0} = Rd;
+  }
+
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                        string chop ="">
+  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+   "$Rd = "#mnemonic#"($Rs)"#chop, [], "",
+   S_2op_tc_3or4x_SLOT23> {
+     bits<5> Rd;
+     bits<5> Rs;
+
+     let IClass = 0b1000;
+
+     let Inst{27-24} = 0b1011;
+     let Inst{23-21} = MajOp;
+     let Inst{20-16} = Rs;
+     let Inst{7-5} = MinOp;
+     let Inst{4-0} = Rd;
+  }
+
+// Convert single precision to double precision and vice-versa.
+def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000>;
+def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000>;
+
+// Convert Integer to Floating Point.
+def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010>;
+def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001>;
+def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000>;
+def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000>;
+def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011>;
+def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010>;
+def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001>;
+def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010>;
+
+// Convert Floating Point to Integer.
+def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101, ":chop">;
+def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111, ":chop">;
+def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001,
+                                           ":chop">;
+def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001,
+                                          ":chop">;
+def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110, ":chop">;
+def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111, ":chop">;
+def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110, ":chop">;
+def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101, ":chop">;
+
+// Convert Floating Point to Integer: non-chopped.
+let AddedComplexity = 20, Predicates = [HasV5T] in {
+  def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000>;
+  def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001>;
+  def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011>;
+  def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100>;
+  def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011>;
+  def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100>;
+  def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000>;
+  def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000>;
+}
+
+// Fix up radicand.
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
+  "$Rd = sffixupr($Rs)",
+  [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rs;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1011101;
+    let Inst{20-16} = Rs;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rd;
+  }
+
+// F2_sffma: Floating-point fused multiply add.
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+class T_sfmpy_acc <bit isSub, bit isLib>
+  : MInst<(outs IntRegs:$Rx),
+          (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
+  "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""),
+  [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b1111000;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6}     = isLib;
+    let Inst{5}     = isSub;
+    let Inst{4-0}   = Rx;
+  }
+
+def F2_sffma: T_sfmpy_acc <0, 0>;
+def F2_sffms: T_sfmpy_acc <1, 0>;
+def F2_sffma_lib: T_sfmpy_acc <0, 1>;
+def F2_sffms_lib: T_sfmpy_acc <1, 1>;
+
+// Floating-point fused multiply add w/ additional scaling (2**pu).
+let Uses = [USR], isFP = 1, hasNewValue = 1 in
+def F2_sffma_sc: MInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu),
+  "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
+  [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
+  Requires<[HasV5T]> {
+    bits<5> Rx;
+    bits<5> Rs;
+    bits<5> Rt;
+    bits<2> Pu;
+
+    let IClass = 0b1110;
+
+    let Inst{27-21} = 0b1111011;
+    let Inst{20-16} = Rs;
+    let Inst{13}    = 0b0;
+    let Inst{12-8}  = Rt;
+    let Inst{7}     = 0b1;
+    let Inst{6-5}   = Pu;
+    let Inst{4-0}   = Rx;
+  }
+
+//===----------------------------------------------------------------------===//
+// :natural forms of vasrh and vasrhub insns
+//===----------------------------------------------------------------------===//
+// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
+// saturate, and pack.
+let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_ASRHUB<bit isSat>
+  : SInst <(outs IntRegs:$Rd),
+  (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+  "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"),
+  [], "", S_2op_tc_2_SLOT23>,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<5> Rss;
+    bits<4> u4;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1000011;
+    let Inst{20-16} = Rss;
+    let Inst{13-12} = 0b00;
+    let Inst{11-8} = u4;
+    let Inst{7-6} = 0b10;
+    let Inst{5} = isSat;
+    let Inst{4-0} = Rd;
+  }
+
+def S5_asrhub_rnd_sat : T_ASRHUB <0>;
+def S5_asrhub_sat : T_ASRHUB <1>;
+
+let isAsmParserOnly = 1 in
+def S5_asrhub_rnd_sat_goodsyntax
+  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+  "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
+
+// S5_vasrhrnd: Vector arithmetic shift right by immediate with round.
+let hasSideEffects = 0 in
+def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
+                         (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+  "$Rdd = vasrh($Rss, #$u4):raw">,
+  Requires<[HasV5T]> {
+    bits<5> Rdd;
+    bits<5> Rss;
+    bits<4> u4;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b0000001;
+    let Inst{20-16} = Rss;
+    let Inst{13-12} = 0b00;
+    let Inst{11-8}  = u4;
+    let Inst{7-5}   = 0b000;
+    let Inst{4-0}   = Rdd;
+  }
+
+let isAsmParserOnly = 1 in
+def S5_vasrhrnd_goodsyntax
+  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
+  "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
+
+// Floating point reciprocal square root approximation
+let Uses = [USR], isPredicateLate = 1, isFP = 1,
+    hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
+    validSubTargets = HasV5SubT in
+def F2_sfinvsqrta: SInst <
+  (outs IntRegs:$Rd, PredRegs:$Pe),
+  (ins IntRegs:$Rs),
+  "$Rd, $Pe = sfinvsqrta($Rs)" > ,
+  Requires<[HasV5T]> {
+    bits<5> Rd;
+    bits<2> Pe;
+    bits<5> Rs;
+
+    let IClass = 0b1000;
+
+    let Inst{27-21} = 0b1011111;
+    let Inst{20-16} = Rs;
+    let Inst{7} = 0b0;
+    let Inst{6-5} = Pe;
+    let Inst{4-0} = Rd;
+  }
+
+// Complex multiply 32x16
+let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
+  def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
+  def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
+}
+
+// Classify floating-point value
+let Uses = [USR], isFP = 1 in
+def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>, Requires<[HasV5T]>;
+
+let Uses = [USR], isFP = 1 in
+def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5_0Imm:$u5),
+  "$Pd = dfclass($Rss, #$u5)",
+  [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
+    bits<2> Pd;
+    bits<5> Rss;
+    bits<5> u5;
+
+    let IClass = 0b1101;
+    let Inst{27-21} = 0b1100100;
+    let Inst{20-16} = Rss;
+    let Inst{12-10} = 0b000;
+    let Inst{9-5}   = u5;
+    let Inst{4-3}   = 0b10;
+    let Inst{1-0}   = Pd;
+  }
+
+// Instructions to create floating point constant
+class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
+  : ALU64Inst<(outs RC:$dst), (ins u10_0Imm:$src),
+  "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
+  [], "", ALU64_tc_2_SLOT23>, Requires<[HasV5T]> {
+    bits<5> dst;
+    bits<10> src;
+
+    let IClass = 0b1101;
+    let Inst{27-24} = RegType;
+    let Inst{23}    = 0b0;
+    let Inst{22}    = isNeg;
+    let Inst{21}    = src{9};
+    let Inst{13-5}  = src{8-0};
+    let Inst{4-0}   = dst;
+  }
+
+let hasNewValue = 1, opNewValue = 0 in {
+  def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>;
+  def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>;
+}
+
+def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
+def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
new file mode 100644
index 000000000000..c50141b18ead
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
@@ -0,0 +1,2068 @@
+//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+// Vector load
+let Predicates = [HasV60T, UseHVX] in
+let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+  class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                  string cstr = "", InstrItinClass itin = CVI_VM_LD,
+                  IType type = TypeCVI_VM_LD>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+// Vector store
+let Predicates = [HasV60T, UseHVX] in
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = CVI_VM_ST,
+                IType type = TypeCVI_VM_ST>
+: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+//===----------------------------------------------------------------------===//
+// Vector loads with base + immediate offset
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access in
+class T_vload_ai<string asmStr>
+  : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
+                asmStr>;
+
+let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in
+class T_vload_ai_128B<string asmStr>
+  : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
+                asmStr>;
+
+let isCVLoadable = 1, hasNewValue = 1 in {
+  def V6_vL32b_ai         : T_vload_ai <"$dst = vmem($src1+#$src2)">,
+                            V6_vL32b_ai_enc;
+  def V6_vL32b_nt_ai      : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
+                            V6_vL32b_nt_ai_enc;
+  // 128B
+  def V6_vL32b_ai_128B    : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
+                            V6_vL32b_ai_128B_enc;
+  def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
+                            V6_vL32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in {
+  def V6_vL32Ub_ai      : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
+                          V6_vL32Ub_ai_enc;
+  def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
+                          V6_vL32Ub_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1,
+    hasNewValue = 1 in {
+  def V6_vL32b_cur_ai    : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
+                           V6_vL32b_cur_ai_enc;
+  def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
+                           V6_vL32b_nt_cur_ai_enc;
+  // 128B
+  def V6_vL32b_cur_ai_128B    : T_vload_ai_128B
+                                <"$dst.cur = vmem($src1+#$src2)">,
+                                V6_vL32b_cur_ai_128B_enc;
+  def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
+                                <"$dst.cur = vmem($src1+#$src2):nt">,
+                                V6_vL32b_nt_cur_ai_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
+  def V6_vL32b_tmp_ai    : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
+                           V6_vL32b_tmp_ai_enc;
+  def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
+                           V6_vL32b_nt_tmp_ai_enc;
+  // 128B
+  def V6_vL32b_tmp_ai_128B    : T_vload_ai_128B
+                                <"$dst.tmp = vmem($src1+#$src2)">,
+                                V6_vL32b_tmp_ai_128B_enc;
+  def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
+                                <"$dst.tmp = vmem($src1+#$src2)">,
+                                V6_vL32b_nt_tmp_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
+class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
+                   RegisterClass RC, bit isNT>
+  : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+    mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel {
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
+  : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
+  : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
+
+let isNVStorable = 1 in {
+  def V6_vS32b_ai         : T_vstore_ai_64B <"vmem", "vS32b_ai">,
+                            V6_vS32b_ai_enc;
+  def V6_vS32b_ai_128B    : T_vstore_ai_128B <"vmem", "vS32b_ai">,
+                            V6_vS32b_ai_128B_enc;
+}
+
+let isNVStorable = 1, isNonTemporal = 1 in {
+  def V6_vS32b_nt_ai      : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
+                            V6_vS32b_nt_ai_enc;
+  def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
+                            V6_vS32b_nt_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+  def V6_vS32Ub_ai      : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
+                          V6_vS32Ub_ai_enc;
+  def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
+                          V6_vS32Ub_ai_128B_enc;
+}
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - unconditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
+    isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
+class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+  : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+    "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
+  : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
+  : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
+
+def V6_vS32b_new_ai      : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc;
+def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
+                           V6_vS32b_new_ai_128B_enc;
+
+let isNonTemporal = 1 in {
+  def V6_vS32b_nt_new_ai      : T_vstore_new_ai_64B<"vS32b_ai", 1>,
+                                V6_vS32b_nt_new_ai_enc;
+  def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
+                                V6_vS32b_nt_new_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1 in
+class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
+                        RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
+  : V6_STInst <(outs),
+               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) "
+     #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
+  let isPredicatedFalse = isPredNot;
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
+                            bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
+                             bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
+                      isPredNot, isNT>;
+
+let isNVStorable = 1 in {
+  def V6_vS32b_pred_ai     : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
+                             V6_vS32b_pred_ai_enc;
+  def V6_vS32b_npred_ai    : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
+                             V6_vS32b_npred_ai_enc;
+  // 128B
+  def V6_vS32b_pred_ai_128B    : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
+                                 V6_vS32b_pred_ai_128B_enc;
+  def V6_vS32b_npred_ai_128B   : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
+                                 V6_vS32b_npred_ai_128B_enc;
+}
+
+
+let isNVStorable = 1, isNonTemporal = 1 in {
+  def V6_vS32b_nt_pred_ai  : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
+                             V6_vS32b_nt_pred_ai_enc;
+  def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
+                             V6_vS32b_nt_npred_ai_enc;
+  // 128B
+  def V6_vS32b_nt_pred_ai_128B  : T_vstore_pred_ai_128B
+                                  <"vmem", "vS32b_ai", 0, 1>,
+                                  V6_vS32b_nt_pred_ai_128B_enc;
+  def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
+                                  <"vmem", "vS32b_ai", 1, 1>,
+                                  V6_vS32b_nt_npred_ai_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+  def V6_vS32Ub_pred_ai  : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
+                           V6_vS32Ub_pred_ai_enc;
+  def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
+                           V6_vS32Ub_npred_ai_enc;
+  // 128B
+  def V6_vS32Ub_pred_ai_128B  :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
+                               V6_vS32Ub_pred_ai_128B_enc;
+  def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
+                               V6_vS32Ub_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset in
+class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
+                         bit isPredNot = 0, bit isNT = 0>
+  : V6_STInst <(outs),
+               (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
+          #!if(isNT, ":nt", "")#" = $src4"> {
+  let isPredicatedFalse = isPredNot;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_ai  : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc;
+def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
+                         V6_vS32b_nqpred_ai_enc;
+def V6_vS32b_nt_qpred_ai  : T_vstore_qpred_ai_64B <0, 1>,
+                            V6_vS32b_nt_qpred_ai_enc;
+def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
+                            V6_vS32b_nt_nqpred_ai_enc;
+// 128B
+def V6_vS32b_qpred_ai_128B  : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
+def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
+                              V6_vS32b_nqpred_ai_128B_enc;
+def V6_vS32b_nt_qpred_ai_128B  : T_vstore_qpred_ai_128B<0, 1>,
+                                 V6_vS32b_nt_qpred_ai_128B_enc;
+def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
+                                 V6_vS32b_nt_nqpred_ai_128B_enc;
+
+
+//===----------------------------------------------------------------------===//
+// Vector stores with base + immediate offset - conditional new
+//===----------------------------------------------------------------------===//
+let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3,
+    isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
+class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
+                            bit isPredNot, bit isNT>
+  : V6_STInst <(outs),
+               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
+         #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
+  let isPredicatedFalse = isPredNot;
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
+                          isPredNot, isNT>;
+
+
+def V6_vS32b_new_pred_ai     : T_vstore_new_pred_ai_64B <"vS32b_ai">,
+                               V6_vS32b_new_pred_ai_enc;
+def V6_vS32b_new_npred_ai    : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
+                               V6_vS32b_new_npred_ai_enc;
+// 128B
+def V6_vS32b_new_pred_ai_128B     : T_vstore_new_pred_ai_128B <"vS32b_ai">,
+                                    V6_vS32b_new_pred_ai_128B_enc;
+def V6_vS32b_new_npred_ai_128B    : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
+                                    V6_vS32b_new_npred_ai_128B_enc;
+let isNonTemporal = 1 in {
+  def V6_vS32b_nt_new_pred_ai  : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
+                                 V6_vS32b_nt_new_pred_ai_enc;
+  def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
+                                 V6_vS32b_nt_new_npred_ai_enc;
+  // 128B
+  def V6_vS32b_nt_new_pred_ai_128B  : T_vstore_new_pred_ai_128B
+                                      <"vS32b_ai", 0, 1>,
+                                      V6_vS32b_nt_new_pred_ai_128B_enc;
+  def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
+                                      <"vS32b_ai", 1, 1>,
+                                      V6_vS32b_nt_new_npred_ai_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, hasNewValue = 1 in
+class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
+  : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_),
+               (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
+    "$src1 = $_dst_">;
+
+let accessSize = Vector64Access in
+class T_vload_pi_64B <string asmStr>
+  : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vload_pi_128B <string asmStr>
+  : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
+
+let isCVLoadable = 1 in {
+  def V6_vL32b_pi    : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
+                       V6_vL32b_pi_enc;
+  def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
+                       V6_vL32b_nt_pi_enc;
+  // 128B
+  def V6_vL32b_pi_128B    : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
+                            V6_vL32b_pi_128B_enc;
+  def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
+                            V6_vL32b_nt_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in {
+  def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
+                     V6_vL32Ub_pi_enc;
+  // 128B
+  def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
+                          V6_vL32Ub_pi_128B_enc;
+}
+
+let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in {
+  def V6_vL32b_cur_pi    : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
+                           V6_vL32b_cur_pi_enc;
+  def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
+                           V6_vL32b_nt_cur_pi_enc;
+  // 128B
+  def V6_vL32b_cur_pi_128B    : T_vload_pi_128B
+                                <"$dst.cur = vmem($src1++#$src2)">,
+                                V6_vL32b_cur_pi_128B_enc;
+  def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
+                                <"$dst.cur = vmem($src1++#$src2):nt">,
+                                V6_vL32b_nt_cur_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
+  def V6_vL32b_tmp_pi    : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
+                           V6_vL32b_tmp_pi_enc;
+  def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
+                           V6_vL32b_nt_tmp_pi_enc;
+  //128B
+  def V6_vL32b_tmp_pi_128B    : T_vload_pi_128B
+                                <"$dst.tmp = vmem($src1++#$src2)">,
+                                V6_vL32b_tmp_pi_128B_enc;
+  def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
+                                <"$dst.tmp = vmem($src1++#$src2):nt">,
+                                V6_vL32b_nt_tmp_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, isPredicable = 1 in
+class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
+                   RegisterClass RC, bit isNT>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+    mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
+    "$src1 = $_dst_">, NewValueRel {
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
+  : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
+  : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+let isNVStorable = 1 in {
+  def V6_vS32b_pi      : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
+  def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
+                         V6_vS32b_pi_128B_enc;
+}
+
+let isNVStorable = 1 , isNonTemporal = 1  in {
+  def V6_vS32b_nt_pi      : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
+                            V6_vS32b_nt_pi_enc;
+  def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
+                            V6_vS32b_nt_pi_128B_enc;
+}
+
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+  def V6_vS32Ub_pi      : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
+                          V6_vS32Ub_pi_enc;
+  def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
+                          V6_vS32Ub_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment unconditional .new vector stores with immediate offset.
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc, isNVStore = 1 in
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
+    isPredicable = 1, opNewValue = 3, isNVStore = 1 in
+class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+    "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
+    "$src1 = $_dst_">, NewValueRel {
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
+  : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
+  : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
+
+
+def V6_vS32b_new_pi      : T_vstore_new_pi_64B <"vS32b_pi">,
+                           V6_vS32b_new_pi_enc;
+def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
+                           V6_vS32b_new_pi_128B_enc;
+
+let isNonTemporal = 1 in {
+  def V6_vS32b_nt_new_pi      : T_vstore_new_pi_64B <"vS32b_pi", 1>,
+                                V6_vS32b_nt_new_pi_enc;
+  def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
+                                V6_vS32b_nt_new_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1, addrMode = PostInc in
+class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
+                        RegisterClass RC, bit isPredNot, bit isNT>
+  : V6_STInst<(outs IntRegs:$_dst_),
+             (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
+          #!if(isNT, ":nt", "")#" = $src4", [],
+    "$src2 = $_dst_">, NewValueRel {
+  let isPredicatedFalse = isPredNot;
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
+                            bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
+                             bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
+                      isPredNot, isNT>;
+
+let isNVStorable = 1 in {
+  def V6_vS32b_pred_pi     : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
+                             V6_vS32b_pred_pi_enc;
+  def V6_vS32b_npred_pi    : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
+                             V6_vS32b_npred_pi_enc;
+  // 128B
+  def V6_vS32b_pred_pi_128B  : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
+                               V6_vS32b_pred_pi_128B_enc;
+  def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
+                               V6_vS32b_npred_pi_128B_enc;
+}
+let isNVStorable = 1, isNonTemporal = 1 in {
+  def V6_vS32b_nt_pred_pi  : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
+                             V6_vS32b_nt_pred_pi_enc;
+  def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
+                             V6_vS32b_nt_npred_pi_enc;
+  // 128B
+  def V6_vS32b_nt_pred_pi_128B  : T_vstore_pred_pi_128B
+                                  <"vmem", "vS32b_pi", 0, 1>,
+                                  V6_vS32b_nt_pred_pi_128B_enc;
+  def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
+                                  <"vmem", "vS32b_pi", 1, 1>,
+                                  V6_vS32b_nt_npred_pi_128B_enc;
+}
+
+let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
+  def V6_vS32Ub_pred_pi  : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
+                           V6_vS32Ub_pred_pi_enc;
+  def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
+                           V6_vS32Ub_npred_pi_enc;
+  // 128B
+  def V6_vS32Ub_pred_pi_128B  : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
+                                V6_vS32Ub_pred_pi_128B_enc;
+  def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
+                                V6_vS32Ub_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with immediate offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+let addrMode = PostInc in
+class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
+                         bit isNT = 0>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
+          #!if(isNT, ":nt", "")#" = $src4", [],
+    "$src2 = $_dst_">;
+
+let accessSize = Vector64Access in
+class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
+
+def V6_vS32b_qpred_pi  : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc;
+def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
+// 128B
+def V6_vS32b_qpred_pi_128B  : T_vstore_qpred_pi_128B,
+                              V6_vS32b_qpred_pi_128B_enc;
+def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
+                              V6_vS32b_nqpred_pi_128B_enc;
+
+let isNonTemporal = 1 in {
+  def V6_vS32b_nt_qpred_pi  : T_vstore_qpred_pi_64B <0, 1>,
+                              V6_vS32b_nt_qpred_pi_enc;
+  def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
+                              V6_vS32b_nt_nqpred_pi_enc;
+  // 128B
+  def V6_vS32b_nt_qpred_pi_128B  : T_vstore_qpred_pi_128B<0, 1>,
+                                   V6_vS32b_nt_qpred_pi_128B_enc;
+  def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
+                                   V6_vS32b_nt_nqpred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with immediate offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+    isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in
+class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
+                            bit isPredNot, bit isNT>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
+    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
+         #!if(isNT, ":nt", "")#" = $src4.new", [],
+    "$src2 = $_dst_"> , NewValueRel {
+  let isPredicatedFalse = isPredNot;
+  let BaseOpcode = baseOp;
+}
+
+let accessSize = Vector64Access in
+class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
+
+let isCodeGenOnly = 1, accessSize = Vector128Access in
+class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
+  : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B,
+                          isPredNot, isNT>;
+
+def V6_vS32b_new_pred_pi     : T_vstore_new_pred_pi_64B <"vS32b_pi">,
+                               V6_vS32b_new_pred_pi_enc;
+def V6_vS32b_new_npred_pi    : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>,
+                               V6_vS32b_new_npred_pi_enc;
+// 128B
+def V6_vS32b_new_pred_pi_128B    : T_vstore_new_pred_pi_128B <"vS32b_pi">,
+                                   V6_vS32b_new_pred_pi_128B_enc;
+def V6_vS32b_new_npred_pi_128B   : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
+                                   V6_vS32b_new_npred_pi_128B_enc;
+let isNonTemporal = 1 in {
+  def V6_vS32b_nt_new_pred_pi  : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
+                                 V6_vS32b_nt_new_pred_pi_enc;
+  def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
+                                 V6_vS32b_nt_new_npred_pi_enc;
+  // 128B
+  def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
+                                     <"vS32b_pi", 0, 1>,
+                                     V6_vS32b_nt_new_pred_pi_128B_enc;
+  def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
+                                      <"vS32b_pi", 1, 1>,
+                                      V6_vS32b_nt_new_npred_pi_128B_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector loads with register offset
+//===----------------------------------------------------------------------===//
+let hasNewValue = 1 in
+class T_vload_ppu<string asmStr>
+  : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_),
+               (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
+    "$src1 = $_dst_">, NewValueRel;
+
+let isCVLoadable = 1 in {
+  def V6_vL32b_ppu    : T_vload_ppu <"$dst = vmem($src1++$src2)">,
+                        V6_vL32b_ppu_enc;
+  def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
+                        V6_vL32b_nt_ppu_enc;
+}
+
+let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in
+def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
+                     V6_vL32Ub_ppu_enc;
+
+let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in {
+  def V6_vL32b_cur_ppu    : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
+                             V6_vL32b_cur_ppu_enc;
+  def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
+                             V6_vL32b_nt_cur_ppu_enc;
+}
+
+let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
+  def V6_vL32b_tmp_ppu    : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
+                             V6_vL32b_tmp_ppu_enc;
+  def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
+                             V6_vL32b_nt_tmp_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset
+//===----------------------------------------------------------------------===//
+let isPredicable = 1 in
+class T_vstore_ppu <string mnemonic, bit isNT = 0>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+    mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
+    "$src1 = $_dst_">, NewValueRel;
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
+  def V6_vS32b_ppu    : T_vstore_ppu <"vmem">,
+                        V6_vS32b_ppu_enc;
+  let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in
+  def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
+                        V6_vS32b_nt_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in
+def V6_vS32Ub_ppu   : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
+    isPredicable = 1, opNewValue = 3, isNVStore = 1 in
+class T_vstore_new_ppu <bit isNT = 0>
+  : V6_STInst <(outs IntRegs:$_dst_),
+               (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
+    "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
+    "$src1 = $_dst_">, NewValueRel;
+
+let BaseOpcode = "vS32b_ppu" in
+def V6_vS32b_new_ppu    : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in
+def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let isPredicated = 1 in
+class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0>
+  : V6_STInst<(outs IntRegs:$_dst_),
+           (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
+          #!if(isNT, ":nt", "")#" = $src4", [],
+    "$src2 = $_dst_">, NewValueRel {
+  let isPredicatedFalse = isPredNot;
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
+  def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
+  def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
+}
+
+let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
+  def V6_vS32b_nt_pred_ppu  : T_vstore_pred_ppu <"vmem", 0, 1>,
+                              V6_vS32b_nt_pred_ppu_enc;
+  def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
+                              V6_vS32b_nt_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
+    Type = TypeCVI_VM_STU in {
+  def V6_vS32Ub_pred_ppu  : T_vstore_pred_ppu <"vmemu">,
+                            V6_vS32Ub_pred_ppu_enc;
+  def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
+                            V6_vS32Ub_npred_ppu_enc;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment vector stores with register offset - byte-enabled aligned
+//===----------------------------------------------------------------------===//
+class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0>
+  : V6_STInst <(outs IntRegs:$_dst_),
+        (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+          #!if(isNT, ":nt", "")#" = $src4", [],
+    "$src2 = $_dst_">, NewValueRel;
+
+def V6_vS32b_qpred_ppu  : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc;
+def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc;
+def V6_vS32b_nt_qpred_ppu  : T_vstore_qpred_ppu<0, 1>,
+                             V6_vS32b_nt_qpred_ppu_enc;
+def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>,
+                             V6_vS32b_nt_nqpred_ppu_enc;
+
+//===----------------------------------------------------------------------===//
+// Post increment conditional .new vector stores with register offset
+//===----------------------------------------------------------------------===//
+let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
+    isNewValue = 1, opNewValue = 4, isNVStore = 1 in
+class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
+  : V6_STInst <(outs IntRegs:$_dst_),
+           (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
+    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
+         #!if(isNT, ":nt", "")#" = $src4.new", [],
+    "$src2 = $_dst_">, NewValueRel {
+  let isPredicatedFalse = isPredNot;
+}
+
+let BaseOpcode = "vS32b_ppu" in {
+  def V6_vS32b_new_pred_ppu  : T_vstore_new_pred_ppu,
+                               V6_vS32b_new_pred_ppu_enc;
+  def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
+                               V6_vS32b_new_npred_ppu_enc;
+}
+
+let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
+def V6_vS32b_nt_new_pred_ppu :  T_vstore_new_pred_ppu<0, 1>,
+                                V6_vS32b_nt_new_pred_ppu_enc;
+def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
+                                V6_vS32b_nt_new_npred_ppu_enc;
+}
+
+
+// Vector load/store pseudos
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class STrivv_template<RegisterClass RC>
+  : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
+
+def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class LDrivv_template<RegisterClass RC>
+  : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
+
+def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+
+// Store vector predicate pseudo.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+  def PS_vstorerq_ai : STInst<(outs),
+              (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
+              ".error \"should not emit\"", []>,
+              Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vstorerq_ai_128B : STInst<(outs),
+              (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
+              ".error \"should not emit\"", []>,
+              Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Load vector predicate pseudo.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+  def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
+              (ins IntRegs:$base, s32_0Imm:$offset),
+              ".error \"should not emit\"", []>,
+              Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst),
+              (ins IntRegs:$base, s32_0Imm:$offset),
+              ".error \"should not emit\"", []>,
+              Requires<[HasV60T,UseHVXDbl]>;
+}
+
+class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = CVI_VA_DV,
+              IType type = TypeCVI_VA_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+  def PS_vselect: VSELInst<(outs VectorRegs:$dst),
+        (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
+        (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+        "", []>, Requires<[HasV60T,UseHVXDbl]>;
+  def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
+        (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
+        (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
+        "", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+let hasNewValue = 1 in
+class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
+    asmString >;
+
+multiclass T_vmpy <string asmString, RegisterClass RCout,
+                        RegisterClass RCin> {
+  def NAME : T_vmpy <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"),
+                                      !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_vmpy_VV <string asmString>:
+  T_vmpy <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_vmpy_WW <string asmString>:
+  T_vmpy <asmString, VecDblRegs, VecDblRegs>;
+
+multiclass T_vmpy_VW <string asmString>:
+  T_vmpy <asmString, VectorRegs, VecDblRegs>;
+
+multiclass T_vmpy_WV <string asmString>:
+  T_vmpy <asmString, VecDblRegs, VectorRegs>;
+
+defm V6_vtmpyb   :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc;
+defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
+defm V6_vdsaduh  :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
+defm V6_vmpybus  :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
+defm V6_vmpabus  :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
+defm V6_vmpahb   :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
+defm V6_vmpyh    :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
+defm V6_vmpyuh   :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
+defm V6_vmpyiwh  :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
+defm V6_vtmpyhb  :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
+defm V6_vmpyub   :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
+
+let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in
+defm V6_vmpyihb  :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
+
+defm V6_vdmpybus_dv :
+     T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
+defm V6_vdmpyhsusat :
+     T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
+defm V6_vdmpyhsuisat :
+     T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
+defm V6_vdmpyhsat :
+     T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
+defm V6_vdmpyhisat :
+     T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
+defm V6_vdmpyhb_dv :
+     T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
+defm V6_vmpyhss :
+     T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
+defm V6_vmpyhsrs :
+     T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in
+defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
+
+let Itinerary = CVI_VX, Type = TypeCVI_VX in {
+defm V6_vdmpyhb  : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
+defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
+defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
+defm V6_vmpyiwb  : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
+defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vasrw  : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
+defm V6_vasrh  : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
+defm V6_vaslw  : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
+defm V6_vaslh  : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
+defm V6_vlsrw  : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
+defm V6_vlsrh  : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
+}
+
+let hasNewValue = 1 in
+class T_HVX_alu <string asmString, InstrItinClass itin,
+                 RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+    asmString >{
+  let Itinerary = itin;
+  let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_alu <string asmString, RegisterClass RCout,
+           RegisterClass RCin, InstrItinClass itin> {
+  def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_alu <asmString, itin,
+                              !cast<RegisterClass>(RCout#"128B"),
+                              !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_alu_VV <string asmString>:
+  T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_WW <string asmString>:
+  T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
+
+multiclass T_HVX_alu_WV <string asmString>:
+  T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
+
+
+let Itinerary  =  CVI_VX, Type  =  TypeCVI_VX in {
+defm V6_vrmpyubv :
+     T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
+defm V6_vrmpybv :
+     T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
+defm V6_vrmpybusv :
+     T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
+defm V6_vabsdiffub :
+     T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
+defm V6_vabsdiffh :
+     T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
+defm V6_vabsdiffuh :
+     T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
+defm V6_vabsdiffw :
+     T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
+}
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vdmpyhvsat :
+     T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
+defm V6_vmpyhvsrs :
+     T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
+defm V6_vmpyih :
+     T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
+}
+
+defm V6_vand :
+     T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
+defm V6_vor :
+     T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
+defm V6_vxor :
+     T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
+defm V6_vaddw :
+     T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
+defm V6_vaddubsat :
+     T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
+defm V6_vadduhsat :
+     T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
+defm V6_vaddhsat :
+     T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
+defm V6_vaddwsat :
+     T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
+defm V6_vsubb :
+     T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
+defm V6_vsubh :
+     T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
+defm V6_vsubw :
+     T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
+defm V6_vsububsat :
+     T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
+defm V6_vsubuhsat :
+     T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
+defm V6_vsubhsat :
+     T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
+defm V6_vsubwsat :
+     T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
+defm V6_vavgub :
+     T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
+defm V6_vavguh :
+     T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
+defm V6_vavgh :
+     T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
+defm V6_vavgw :
+     T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
+defm V6_vnavgub :
+     T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
+defm V6_vnavgh :
+     T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
+defm V6_vnavgw :
+     T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
+defm V6_vavgubrnd :
+     T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
+defm V6_vavguhrnd :
+     T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
+defm V6_vavghrnd :
+     T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
+defm V6_vavgwrnd :
+     T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
+
+defm V6_vmpybv :
+     T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
+defm V6_vmpyubv :
+     T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
+defm V6_vmpybusv :
+     T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
+defm V6_vmpyhv :
+     T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
+defm V6_vmpyuhv :
+     T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
+defm V6_vmpyhus :
+     T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
+defm V6_vaddubh :
+     T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
+defm V6_vadduhw :
+     T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
+defm V6_vaddhw :
+     T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
+defm V6_vsububh :
+     T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
+defm V6_vsubuhw :
+     T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
+defm V6_vsubhw :
+     T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
+
+defm V6_vaddb_dv :
+     T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
+defm V6_vaddh_dv :
+     T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
+defm V6_vaddw_dv :
+     T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
+defm V6_vaddubsat_dv :
+     T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
+defm V6_vadduhsat_dv :
+     T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
+defm V6_vaddhsat_dv :
+     T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
+defm V6_vaddwsat_dv :
+     T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
+defm V6_vsubb_dv :
+     T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
+defm V6_vsubh_dv :
+     T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
+defm V6_vsubw_dv :
+     T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
+defm V6_vsububsat_dv :
+     T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
+defm V6_vsubuhsat_dv :
+     T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
+defm V6_vsubhsat_dv :
+     T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
+defm V6_vsubwsat_dv :
+     T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in {
+defm V6_vmpabusv :
+     T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
+defm V6_vmpabuuv :
+     T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1 in
+class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
+                     RegisterClass RCin1, RegisterClass RCin2>
+  : CVI_VA_Resource1 <(outs RCout:$dst),
+                      (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString,
+                      [], "$dst = $_src_" > {
+  let Itinerary = itin;
+  let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout,
+           RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
+  def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
+                   !cast<RegisterClass>(RCout#"128B"),
+                   !cast<RegisterClass>(RCin1#"128B"),
+                   !cast<RegisterClass>(RCin2#
+                   !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
+}
+
+multiclass T_HVX_vmpyacc_VVR <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
+
+multiclass T_HVX_vmpyacc_VWR <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVR <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WWR <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_VVV <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+multiclass T_HVX_vmpyacc_WVV <string asmString>:
+  T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
+
+
+defm V6_vtmpyb_acc :
+     T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
+     V6_vtmpyb_acc_enc;
+defm V6_vtmpybus_acc :
+     T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
+     V6_vtmpybus_acc_enc;
+defm V6_vtmpyhb_acc :
+     T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
+     V6_vtmpyhb_acc_enc;
+defm V6_vdmpyhb_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+     V6_vdmpyhb_acc_enc;
+defm V6_vrmpyub_acc :
+     T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+     V6_vrmpyub_acc_enc;
+defm V6_vrmpybus_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+     V6_vrmpybus_acc_enc;
+defm V6_vdmpybus_acc :
+     T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+     V6_vdmpybus_acc_enc;
+defm V6_vdmpybus_dv_acc :
+     T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
+     V6_vdmpybus_dv_acc_enc;
+defm V6_vdmpyhsuisat_acc :
+     T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
+     V6_vdmpyhsuisat_acc_enc;
+defm V6_vdmpyhisat_acc :
+     T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+     V6_vdmpyhisat_acc_enc;
+defm V6_vdmpyhb_dv_acc :
+     T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
+     V6_vdmpyhb_dv_acc_enc;
+defm V6_vmpybus_acc :
+     T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
+     V6_vmpybus_acc_enc;
+defm V6_vmpabus_acc :
+     T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
+     V6_vmpabus_acc_enc;
+defm V6_vmpahb_acc :
+     T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
+     V6_vmpahb_acc_enc;
+defm V6_vmpyhsat_acc :
+     T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
+     V6_vmpyhsat_acc_enc;
+defm V6_vmpyuh_acc :
+     T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+     V6_vmpyuh_acc_enc;
+defm V6_vmpyiwb_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
+     V6_vmpyiwb_acc_enc;
+defm V6_vdsaduh_acc :
+     T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
+     V6_vdsaduh_acc_enc;
+defm V6_vmpyihb_acc :
+     T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
+     V6_vmpyihb_acc_enc;
+defm V6_vmpyub_acc :
+     T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+     V6_vmpyub_acc_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vdmpyhsusat_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
+     V6_vdmpyhsusat_acc_enc;
+defm V6_vdmpyhsat_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+     V6_vdmpyhsat_acc_enc;
+defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
+     <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vaslw_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
+defm V6_vasrw_acc :
+     T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
+}
+
+defm V6_vdmpyhvsat_acc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
+     V6_vdmpyhvsat_acc_enc;
+defm V6_vmpybusv_acc :
+     T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
+     V6_vmpybusv_acc_enc;
+defm V6_vmpybv_acc :
+     T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
+defm V6_vmpyhus_acc :
+     T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
+defm V6_vmpyhv_acc :
+     T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
+defm V6_vmpyiewh_acc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
+     V6_vmpyiewh_acc_enc;
+defm V6_vmpyiewuh_acc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
+     V6_vmpyiewuh_acc_enc;
+defm V6_vmpyih_acc :
+     T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
+defm V6_vmpyowh_rnd_sacc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
+     V6_vmpyowh_rnd_sacc_enc;
+defm V6_vmpyowh_sacc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
+     V6_vmpyowh_sacc_enc;
+defm V6_vmpyubv_acc :
+     T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
+     V6_vmpyubv_acc_enc;
+defm V6_vmpyuhv_acc :
+     T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
+     V6_vmpyuhv_acc_enc;
+defm V6_vrmpybusv_acc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
+     V6_vrmpybusv_acc_enc;
+defm V6_vrmpybv_acc :
+     T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
+defm V6_vrmpyubv_acc :
+     T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
+     V6_vrmpyubv_acc_enc;
+
+
+class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1 <(outs RCout:$dst),
+                      (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString,
+                      [], "$dst = $_src_" > {
+  let Itinerary = CVI_VA;
+  let Type = TypeCVI_VA;
+}
+
+multiclass T_HVX_vcmp <string asmString> {
+  def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_veqb_and :
+     T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
+defm V6_veqh_and :
+     T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
+defm V6_veqw_and :
+     T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
+defm V6_vgtb_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
+defm V6_vgth_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
+defm V6_vgtw_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
+defm V6_vgtub_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
+defm V6_vgtuh_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
+defm V6_vgtuw_and :
+     T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
+defm V6_veqb_or :
+     T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
+defm V6_veqh_or :
+     T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
+defm V6_veqw_or :
+     T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
+defm V6_vgtb_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
+defm V6_vgth_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
+defm V6_vgtw_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
+defm V6_vgtub_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
+defm V6_vgtuh_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
+defm V6_vgtuw_or :
+     T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
+defm V6_veqb_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
+defm V6_veqh_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
+defm V6_veqw_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
+defm V6_vgtb_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
+defm V6_vgth_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
+defm V6_vgtw_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
+defm V6_vgtub_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
+defm V6_vgtuh_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
+defm V6_vgtuw_xor :
+     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
+
+defm V6_vminub :
+     T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
+defm V6_vminuh :
+     T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
+defm V6_vminh :
+     T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
+defm V6_vminw :
+     T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
+defm V6_vmaxub :
+     T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
+defm V6_vmaxuh :
+     T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
+defm V6_vmaxh :
+     T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
+defm V6_vmaxw :
+     T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
+defm V6_vshuffeb :
+     T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
+defm V6_vshuffob :
+     T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
+defm V6_vshufeh :
+     T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
+defm V6_vshufoh :
+     T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
+
+let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
+defm V6_vmpyowh_rnd :
+     T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
+     V6_vmpyowh_rnd_enc;
+defm V6_vmpyiewuh :
+     T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
+defm V6_vmpyewuh :
+     T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
+defm V6_vmpyowh :
+     T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
+defm V6_vmpyiowh :
+     T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
+}
+let Itinerary = CVI_VX, Type = TypeCVI_VX in
+defm V6_vmpyieoh :
+     T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
+defm V6_vshufoeh :
+     T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
+defm V6_vshufoeb :
+     T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
+}
+
+let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+defm V6_vcombine :
+     T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
+
+let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
+defm V6_vsathub :
+     T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
+defm V6_vsatwh :
+     T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vroundwh :
+     T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
+defm V6_vroundwuh :
+     T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
+defm V6_vroundhb :
+     T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
+defm V6_vroundhub :
+     T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
+defm V6_vasrwv :
+     T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
+defm V6_vlsrwv :
+     T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
+defm V6_vlsrhv :
+     T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
+defm V6_vasrhv :
+     T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
+defm V6_vaslwv :
+     T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
+defm V6_vaslhv :
+     T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
+}
+
+defm V6_vaddb :
+     T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
+defm V6_vaddh :
+     T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdelta :
+     T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
+defm V6_vrdelta :
+     T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
+defm V6_vdealb4w :
+     T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
+defm V6_vpackeb :
+     T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
+defm V6_vpackeh :
+     T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
+defm V6_vpackhub_sat :
+     T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
+defm V6_vpackhb_sat :
+     T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
+defm V6_vpackwuh_sat :
+     T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
+defm V6_vpackwh_sat :
+     T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
+defm V6_vpackob :
+     T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
+defm V6_vpackoh :
+     T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
+}
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2>
+  : CVI_VA_Resource1 <(outs RC2:$dst),
+                      (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
+                      [], "$dst = $_src_" > {
+  let Itinerary = CVI_VA;
+  let Type = TypeCVI_VA;
+}
+
+multiclass T_HVX_condALU <string asmString> {
+  def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_vaddbq  : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
+                  V6_vaddbq_enc;
+defm V6_vaddhq  : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
+                  V6_vaddhq_enc;
+defm V6_vaddwq  : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
+                  V6_vaddwq_enc;
+defm V6_vsubbq  : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
+                  V6_vsubbq_enc;
+defm V6_vsubhq  : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
+                  V6_vsubhq_enc;
+defm V6_vsubwq  : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
+                  V6_vsubwq_enc;
+defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
+                  V6_vaddbnq_enc;
+defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
+                  V6_vaddhnq_enc;
+defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
+                  V6_vaddwnq_enc;
+defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
+                  V6_vsubbnq_enc;
+defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
+                  V6_vsubhnq_enc;
+defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
+                  V6_vsubwnq_enc;
+
+let hasNewValue = 1 in
+class T_HVX_alu_2op <string asmString, InstrItinClass itin,
+                 RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
+    asmString >{
+  let Itinerary = itin;
+  let Type = !cast<IType>("Type"#itin);
+}
+
+multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
+           RegisterClass RCin, InstrItinClass itin> {
+  def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_alu_2op <asmString, itin,
+                              !cast<RegisterClass>(RCout#"128B"),
+                              !cast<RegisterClass>(RCin#"128B")>;
+}
+
+let hasNewValue = 1 in
+multiclass T_HVX_alu_2op_VV <string asmString>:
+  T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
+
+multiclass T_HVX_alu_2op_WV <string asmString>:
+  T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
+
+
+defm V6_vabsh     : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
+                    V6_vabsh_enc;
+defm V6_vabsw     : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
+                    V6_vabsw_enc;
+defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
+                    V6_vabsh_sat_enc;
+defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
+                    V6_vabsw_sat_enc;
+defm V6_vnot      : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
+                    V6_vnot_enc;
+defm V6_vassign   : T_HVX_alu_2op_VV <"$dst = $src1">,
+                    V6_vassign_enc;
+
+defm V6_vzb       : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
+                    V6_vzb_enc;
+defm V6_vzh       : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
+                    V6_vzh_enc;
+defm V6_vsb       : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
+                    V6_vsb_enc;
+defm V6_vsh       : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
+                    V6_vsh_enc;
+
+let Itinerary = CVI_VP, Type = TypeCVI_VP in {
+defm V6_vdealh    : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
+                    V6_vdealh_enc;
+defm V6_vdealb    : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
+                    V6_vdealb_enc;
+defm V6_vshuffh   : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
+                    V6_vshuffh_enc;
+defm V6_vshuffb   : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
+                    V6_vshuffb_enc;
+}
+
+let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
+defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
+                    V6_vunpackub_enc;
+defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
+                    V6_vunpackuh_enc;
+defm V6_vunpackb  : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
+                    V6_vunpackb_enc;
+defm V6_vunpackh  : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
+                    V6_vunpackh_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vcl0w     : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
+                    V6_vcl0w_enc;
+defm V6_vcl0h     : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
+                    V6_vcl0h_enc;
+defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
+                    V6_vnormamtw_enc;
+defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
+                    V6_vnormamth_enc;
+defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
+                     V6_vpopcounth_enc;
+}
+
+let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
+    Type = TypeCVI_VX_DV in
+class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$dst),
+                      (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
+    asmString, [], "$dst = $_src_" > ;
+
+
+multiclass T_HVX_vmpyacc2 <string asmString> {
+  def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi_acc :
+     T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
+     V6_vrmpybusi_acc_enc;
+defm V6_vrsadubi_acc :
+     T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
+     V6_vrsadubi_acc_enc;
+defm V6_vrmpyubi_acc :
+     T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
+     V6_vrmpyubi_acc_enc;
+
+
+let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
+class T_HVX_vmpy2 <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
+    asmString>;
+
+
+multiclass T_HVX_vmpy2 <string asmString> {
+  def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
+}
+
+defm V6_vrmpybusi :
+     T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
+defm V6_vrsadubi :
+     T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
+defm V6_vrmpyubi :
+     T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
+
+
+let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
+    hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
+class T_HVX_perm <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
+                      (ins RC:$src1, RC:$src2, IntRegs:$src3),
+    asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
+
+multiclass T_HVX_perm <string asmString> {
+  def NAME : T_HVX_perm <asmString, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
+}
+
+let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
+  defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
+  defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
+}
+
+// Conditional vector move.
+let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_cmov <bit isPredNot, RegisterClass RC>
+  : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
+    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
+  let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_cmov <bit isPredNot = 0> {
+  def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
+}
+
+defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
+defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
+
+// Conditional vector combine.
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
+    hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1 < (outs RCout:$dst),
+    (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
+    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
+  let isPredicatedFalse = isPredNot;
+}
+
+multiclass T_HVX_ccombine <bit isPredNot = 0> {
+  def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
+defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
+
+let hasNewValue = 1 in
+class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_DV_Resource1<(outs RCout:$dst),
+    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+    asmString >;
+
+multiclass T_HVX_shift <string asmString, RegisterClass RCout,
+                        RegisterClass RCin> {
+  def NAME : T_HVX_shift <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
+                                           !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_shift_VV <string asmString>:
+  T_HVX_shift <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_shift_WV <string asmString>:
+  T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
+defm V6_valignb :
+     T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
+defm V6_vlalignb :
+     T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
+}
+
+let Itinerary = CVI_VS, Type = TypeCVI_VS in {
+defm V6_vasrwh :
+     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
+defm V6_vasrwhsat :
+     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
+     V6_vasrwhsat_enc;
+defm V6_vasrwhrndsat :
+     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
+     V6_vasrwhrndsat_enc;
+defm V6_vasrwuhsat :
+     T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
+     V6_vasrwuhsat_enc;
+defm V6_vasrhubsat :
+     T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
+     V6_vasrhubsat_enc;
+defm V6_vasrhubrndsat :
+     T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+     V6_vasrhubrndsat_enc;
+defm V6_vasrhbrndsat :
+     T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
+     V6_vasrhbrndsat_enc;
+}
+
+// Assembler mapped -- alias?
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
+defm V6_vshuffvdd :
+     T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
+defm V6_vdealvdd :
+     T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
+}
+
+let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
+class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
+    asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_unpack <string asmString> {
+  def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
+}
+
+defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
+defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
+    hasSideEffects = 0 in
+class T_HVX_valign <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3_0Imm:$src3),
+    asmString>;
+
+multiclass T_HVX_valign <string asmString> {
+  def NAME : T_HVX_valign <asmString, VectorRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
+}
+
+defm V6_valignbi :
+     T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
+defm V6_vlalignbi :
+     T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
+class T_HVX_predAlu <string asmString, RegisterClass RC>
+  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
+    asmString>;
+
+multiclass T_HVX_predAlu <string asmString> {
+  def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
+
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
+}
+
+defm V6_pred_and  : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
+defm V6_pred_or   : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
+defm V6_pred_xor  : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
+defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
+defm V6_pred_and_n :
+     T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_prednot <RegisterClass RC>
+  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
+    "$dst = not($src1)">, V6_pred_not_enc;
+
+def V6_pred_not : T_HVX_prednot <VecPredRegs>;
+let isCodeGenOnly =  1 in
+def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA in
+class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
+    asmString >;
+
+multiclass T_HVX_vcmp2 <string asmString> {
+  def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
+}
+
+defm V6_veqb : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
+defm V6_veqh : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
+defm V6_veqw : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
+defm V6_vgtb : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
+defm V6_vgth : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
+defm V6_vgtw : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
+defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
+defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
+defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
+
+let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_Resource_late<(outs RCout:$dst),
+    (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+    "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
+
+def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
+
+let isAccumulator = 1 in
+class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_Resource_late<(outs RCout:$dst),
+    (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
+    "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
+
+def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
+
+let hasNewValue =  1, hasSideEffects = 0 in
+class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_Resource_late<(outs RCout:$dst),
+    (ins RCin:$src1, IntRegs:$src2),
+    "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
+
+def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
+
+let hasNewValue = 1, hasSideEffects = 0 in
+class T_V6_lvsplatw <RegisterClass RC>
+  : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
+    "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
+
+def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
+
+
+let hasNewValue = 1 in
+class T_V6_vinsertwr <RegisterClass RC>
+  : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
+    "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
+    V6_vinsertwr_enc;
+
+def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
+class T_V6_pred_scalar2 <RegisterClass RC>
+  : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
+    "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
+
+def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
+let isCodeGenOnly = 1 in
+def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
+
+class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
+  : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
+    "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
+
+def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
+  : SInst2 <(outs RC:$dst), (ins  RC:$src1, ImmOp:$src2), asmString>;
+
+class T_HVX_rol_R <string asmString>
+  : T_HVX_rol <asmString, IntRegs, u5_0Imm>;
+class T_HVX_rol_P <string asmString>
+  : T_HVX_rol <asmString, DoubleRegs, u6_0Imm>;
+
+def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
+let hasNewValue = 1, opNewValue = 0 in
+def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
+
+let validSubTargets = HasV60SubT in
+class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
+  : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
+    asmString, [], "$dst = $_src_" >;
+
+class T_HVX_rol_acc_P <string asmString>
+  : T_HVX_rol_acc <asmString, DoubleRegs, u6_0Imm>;
+
+class T_HVX_rol_acc_R <string asmString>
+  : T_HVX_rol_acc <asmString, IntRegs, u5_0Imm>;
+
+def S6_rol_i_p_nac :
+    T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
+def S6_rol_i_p_acc :
+    T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
+def S6_rol_i_p_and :
+    T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
+def S6_rol_i_p_or  :
+    T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
+def S6_rol_i_p_xacc :
+    T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
+
+let hasNewValue = 1, opNewValue = 0 in {
+def S6_rol_i_r_nac :
+    T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
+def S6_rol_i_r_acc :
+    T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
+def S6_rol_i_r_and :
+    T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
+def S6_rol_i_r_or :
+    T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
+def S6_rol_i_r_xacc :
+    T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
+}
+
+let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
+class T_V6_extractw <RegisterClass RC>
+  : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
+    "$dst = vextract($src1,$src2)">, V6_extractw_enc;
+
+def V6_extractw : T_V6_extractw <VectorRegs>;
+let isCodeGenOnly = 1 in
+def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
+
+let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT  in
+class T_sys0op <string asmString>
+  : ST1Inst <(outs), (ins), asmString>;
+
+let isSolo = 1, validSubTargets = HasV55SubT in {
+def Y5_l2gunlock   : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
+def Y5_l2gclean    : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
+def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
+}
+
+class T_sys1op <string asmString, RegisterClass RC>
+  : ST1Inst <(outs), (ins RC:$src1), asmString>;
+
+class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
+class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
+
+let isSoloAX = 1, validSubTargets = HasV55SubT in
+def Y5_l2unlocka     : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
+
+let isSolo = 1, validSubTargets = HasV60SubT in {
+def Y6_l2gcleanpa    : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
+def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
+}
+
+let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
+    validSubTargets = HasV55SubT in
+def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
+  "$dst = l2locka($src1)">, Y5_l2locka_enc;
+
+// not defined on etc side. why?
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
+
+let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
+    hasSideEffects = 0,
+validSubTargets = HasV55SubT in
+def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
+  (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
+  "$dst1,$dst2 = vacsh($src1,$src2)", [],
+  "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
+
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
+    hasSideEffects = 0 in
+class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
+                  RegisterClass RCin2>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
+
+multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
+  def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
+                               VecPredRegs128B, VectorRegs128B>;
+}
+
+multiclass T_HVX_alu2_V <string asmString> :
+  T_HVX_alu2 <asmString, VectorRegs>;
+
+multiclass T_HVX_alu2_W <string asmString> :
+  T_HVX_alu2 <asmString, VecDblRegs>;
+
+defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
+
+let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
+    hasSideEffects = 0 in
+defm V6_vmux  : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
+
+class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
+
+multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
+                        RegisterClass RCin> {
+  def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
+                                           !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_V <string asmString> :
+  T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_W <string asmString> :
+  T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
+class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                       RegisterClass RCin>
+  : CVI_VA_Resource1<(outs RCout:$dst),
+    (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
+    asmString, [], "$dst = $_src_">;
+
+multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
+                            RegisterClass RCin> {
+  def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
+  let isCodeGenOnly = 1 in
+  def NAME#_128B : T_HVX_vlutb_acc<asmString,
+                                   !cast<RegisterClass>(RCout#"128B"),
+                                   !cast<RegisterClass>(RCin#"128B")>;
+}
+
+multiclass T_HVX_vlutb_acc_V <string asmString> :
+  T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
+
+multiclass T_HVX_vlutb_acc_W <string asmString> :
+  T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
+
+
+let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
+defm V6_vlutvvb:
+     T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
+
+let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
+defm V6_vlutvwh:
+     T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
+
+let hasNewValue = 1 in {
+  defm V6_vlutvvb_oracc:
+       T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
+       V6_vlutvvb_oracc_enc;
+  defm V6_vlutvwh_oracc:
+       T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
+       V6_vlutvwh_oracc_enc;
+}
+
+// It's a fake instruction and should not be defined?
+def S2_cabacencbin
+  : SInst2<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+    "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
+
+// Vhist instructions
+def V6_vhistq
+  : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
+    "vhist($src1)">, V6_vhistq_enc;
+
+def V6_vhist
+  : CVI_HIST_Resource1 <(outs), (ins),
+    "vhist" >, V6_vhist_enc;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+  def V6_vd0: CVI_VA_Resource<(outs VectorRegs:$dst), (ins), "$dst = #0", []>;
+  def V6_vd0_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst), (ins),
+      "$dst = #0", []>;
+
+  def V6_vassignp: CVI_VA_Resource<(outs VecDblRegs:$dst),
+      (ins VecDblRegs:$src), "", []>;
+  def V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
+      (ins VecDblRegs128B:$src), "", []>;
+
+  def V6_lo: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
+      "", []>;
+  def V6_lo_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
+      (ins VecDblRegs128B:$src1), "", []>;
+
+  def V6_hi: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
+      "", []>;
+  def V6_hi_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
+      (ins VecDblRegs128B:$src1), "", []>;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
new file mode 100644
index 000000000000..e3520bd6e515
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
@@ -0,0 +1,69 @@
+//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon Vector instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Vector shift support. Vector shifting in Hexagon is rather different
+// from internal representation of LLVM.
+// LLVM assumes all shifts (in vector case) will have the form
+// <VT> = SHL/SRA/SRL <VT> by <VT>
+// while Hexagon has the following format:
+// <VT> = SHL/SRA/SRL <VT> by <IT/i32>
+// As a result, special care is needed to guarantee correctness and
+// performance.
+class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
+  : S_2OpInstImm<Str, MajOp, MinOp, u4_0Imm, []> {
+  bits<4> src2;
+  let Inst{11-8} = src2;
+}
+
+class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
+  : S_2OpInstImm<Str, MajOp, MinOp, u5_0Imm, []> {
+  bits<5> src2;
+  let Inst{12-8} = src2;
+}
+
+def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
+def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
+def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
+
+def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
+def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
+def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
+
+// Vector shift words by register
+def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
+def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
+def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
+def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
+
+// Vector shift halfwords by register
+def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
+def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
+def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
+def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
+
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that gets expaneded into two
+// scalar MPYI instructions.
+// This is expanded by ExpandPostRAPseudos.
+let isPseudo = 1 in
+def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
+
+let isPseudo = 1 in
+def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
+      "$Rd = $Rx">;
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
new file mode 100644
index 000000000000..d4f303bf6ff0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -0,0 +1,1353 @@
+//===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V2 Architecture
+// Application-Level Specification
+// 80-V9418-8 Rev. B
+// March 4, 2008
+//===----------------------------------------------------------------------===//
+
+class T_I_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$Is),
+         (MI imm:$Is)>;
+
+class T_R_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs),
+         (MI I32:$Rs)>;
+
+class T_P_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs),
+         (MI I64:$Rs)>;
+
+class T_II_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+  : Pat<(IntID Imm1:$Is, Imm2:$It),
+        (MI Imm1:$Is, Imm2:$It)>;
+
+class T_RI_pat <InstHexagon MI, Intrinsic IntID,
+                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID I32:$Rs, ImmPred:$It),
+        (MI I32:$Rs, ImmPred:$It)>;
+
+class T_IR_pat <InstHexagon MI, Intrinsic IntID,
+                PatFrag ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID ImmPred:$Is, I32:$Rt),
+        (MI ImmPred:$Is, I32:$Rt)>;
+
+class T_PI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID I64:$Rs, imm:$It),
+        (MI I64:$Rs, imm:$It)>;
+
+class T_RP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID I32:$Rs, I64:$Rt),
+        (MI I32:$Rs, I64:$Rt)>;
+
+class T_RR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt),
+         (MI I32:$Rs, I32:$Rt)>;
+
+class T_PP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt),
+         (MI I64:$Rs, I64:$Rt)>;
+
+class T_QQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt),
+         (MI (C2_tfrrp I32:$Rs), (C2_tfrrp I32:$Rt))>;
+
+class T_QII_pat <InstHexagon MI, Intrinsic IntID, PatFrag Imm1, PatFrag Imm2>
+  : Pat <(IntID I32:$Rp, Imm1:$Is, Imm2:$It),
+         (MI (C2_tfrrp I32:$Rp), Imm1:$Is, Imm2:$It)>;
+
+class T_QRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rp, I32:$Rs, I32:$Rt),
+         (MI (C2_tfrrp I32:$Rp), I32:$Rs, I32:$Rt)>;
+
+class T_QRI_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+  : Pat <(IntID I32:$Rp, I32:$Rs, ImmPred:$Is),
+         (MI (C2_tfrrp I32:$Rp), I32:$Rs, ImmPred:$Is)>;
+
+class T_QIR_pat <InstHexagon MI, Intrinsic IntID, PatFrag ImmPred>
+  : Pat <(IntID I32:$Rp, ImmPred:$Is, I32:$Rs),
+         (MI (C2_tfrrp I32:$Rp), ImmPred:$Is, I32:$Rs)>;
+
+class T_QPP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rp, I64:$Rs, I64:$Rt),
+         (MI (C2_tfrrp I32:$Rp), I64:$Rs, I64:$Rt)>;
+
+class T_RRI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt, imm:$Iu),
+         (MI I32:$Rs, I32:$Rt, imm:$Iu)>;
+
+class T_RII_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, imm:$It, imm:$Iu),
+         (MI I32:$Rs, imm:$It, imm:$Iu)>;
+
+class T_IRI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$It, I32:$Rs, imm:$Iu),
+         (MI imm:$It, I32:$Rs, imm:$Iu)>;
+
+class T_IRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID imm:$Is, I32:$Rs, I32:$Rt),
+         (MI imm:$Is, I32:$Rs, I32:$Rt)>;
+
+class T_RIR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, imm:$Is, I32:$Rt),
+         (MI I32:$Rs, imm:$Is, I32:$Rt)>;
+
+class T_RRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt, I32:$Ru),
+         (MI I32:$Rs, I32:$Rt, I32:$Ru)>;
+
+class T_PPI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, imm:$Iu),
+         (MI I64:$Rs, I64:$Rt, imm:$Iu)>;
+
+class T_PII_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, imm:$It, imm:$Iu),
+         (MI I64:$Rs, imm:$It, imm:$Iu)>;
+
+class T_PPP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, I64:$Ru),
+         (MI I64:$Rs, I64:$Rt, I64:$Ru)>;
+
+class T_PPR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Ru),
+         (MI I64:$Rs, I64:$Rt, I32:$Ru)>;
+
+class T_PRR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I32:$Rt, I32:$Ru),
+         (MI I64:$Rs, I32:$Rt, I32:$Ru)>;
+
+class T_PPQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt, I32:$Rp),
+         (MI I64:$Rs, I64:$Rt, (C2_tfrrp I32:$Rp))>;
+
+class T_PR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I32:$Rt),
+         (MI I64:$Rs, I32:$Rt)>;
+
+class T_D_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID (F64:$Rs)),
+        (MI (F64:$Rs))>;
+
+class T_DI_pat <InstHexagon MI, Intrinsic IntID,
+                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID F64:$Rs, ImmPred:$It),
+        (MI F64:$Rs, ImmPred:$It)>;
+
+class T_F_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs),
+        (MI F32:$Rs)>;
+
+class T_FI_pat <InstHexagon MI, Intrinsic IntID,
+                PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID F32:$Rs, ImmPred:$It),
+        (MI F32:$Rs, ImmPred:$It)>;
+
+class T_FF_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, F32:$Rt),
+        (MI F32:$Rs, F32:$Rt)>;
+
+class T_DD_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F64:$Rs, F64:$Rt),
+        (MI F64:$Rs, F64:$Rt)>;
+
+class T_FFF_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, F32:$Rt, F32:$Ru),
+        (MI F32:$Rs, F32:$Rt, F32:$Ru)>;
+
+class T_FFFQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID F32:$Rs, F32:$Rt, F32:$Ru, I32:$Rp),
+         (MI F32:$Rs, F32:$Rt, F32:$Ru, (C2_tfrrp I32:$Rp))>;
+
+class T_Q_RI_pat <InstHexagon MI, Intrinsic IntID,
+                  PatLeaf ImmPred = PatLeaf<(i32 imm)>>
+  : Pat<(IntID I32:$Rs, ImmPred:$It),
+        (C2_tfrpr (MI I32:$Rs, ImmPred:$It))>;
+
+class T_Q_RR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I32:$Rt),
+         (C2_tfrpr (MI I32:$Rs, I32:$Rt))>;
+
+class T_Q_RP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rs, I64:$Rt),
+         (C2_tfrpr (MI I32:$Rs, I64:$Rt))>;
+
+class T_Q_PR_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I32:$Rt),
+         (C2_tfrpr (MI I64:$Rs, I32:$Rt))>;
+
+class T_Q_PI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID I64:$Rs, imm:$It),
+        (C2_tfrpr (MI I64:$Rs, imm:$It))>;
+
+class T_Q_PP_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I64:$Rs, I64:$Rt),
+         (C2_tfrpr (MI I64:$Rs, I64:$Rt))>;
+
+class T_Q_Q_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rp),
+         (C2_tfrpr (MI (C2_tfrrp I32:$Rp)))>;
+
+class T_Q_QQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rp, I32:$Rq),
+         (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq)))>;
+
+class T_Q_FF_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, F32:$Rt),
+        (C2_tfrpr (MI F32:$Rs, F32:$Rt))>;
+
+class T_Q_DD_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F64:$Rs, F64:$Rt),
+        (C2_tfrpr (MI F64:$Rs, F64:$Rt))>;
+
+class T_Q_FI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F32:$Rs, imm:$It),
+        (C2_tfrpr (MI F32:$Rs, imm:$It))>;
+
+class T_Q_DI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID F64:$Rs, imm:$It),
+        (C2_tfrpr (MI F64:$Rs, imm:$It))>;
+
+class T_Q_QQQ_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat <(IntID I32:$Rp, I32:$Rq, I32:$Rs),
+         (C2_tfrpr (MI (C2_tfrrp I32:$Rp), (C2_tfrrp I32:$Rq),
+                       (C2_tfrrp I32:$Rs)))>;
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords
+//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_RR_pat <M2_mpy_ll_s1, int_hexagon_M2_mpy_ll_s1>;
+def : T_RR_pat <M2_mpy_ll_s0, int_hexagon_M2_mpy_ll_s0>;
+def : T_RR_pat <M2_mpy_lh_s1, int_hexagon_M2_mpy_lh_s1>;
+def : T_RR_pat <M2_mpy_lh_s0, int_hexagon_M2_mpy_lh_s0>;
+def : T_RR_pat <M2_mpy_hl_s1, int_hexagon_M2_mpy_hl_s1>;
+def : T_RR_pat <M2_mpy_hl_s0, int_hexagon_M2_mpy_hl_s0>;
+def : T_RR_pat <M2_mpy_hh_s1, int_hexagon_M2_mpy_hh_s1>;
+def : T_RR_pat <M2_mpy_hh_s0, int_hexagon_M2_mpy_hh_s0>;
+
+def : T_RR_pat <M2_mpyu_ll_s1, int_hexagon_M2_mpyu_ll_s1>;
+def : T_RR_pat <M2_mpyu_ll_s0, int_hexagon_M2_mpyu_ll_s0>;
+def : T_RR_pat <M2_mpyu_lh_s1, int_hexagon_M2_mpyu_lh_s1>;
+def : T_RR_pat <M2_mpyu_lh_s0, int_hexagon_M2_mpyu_lh_s0>;
+def : T_RR_pat <M2_mpyu_hl_s1, int_hexagon_M2_mpyu_hl_s1>;
+def : T_RR_pat <M2_mpyu_hl_s0, int_hexagon_M2_mpyu_hl_s0>;
+def : T_RR_pat <M2_mpyu_hh_s1, int_hexagon_M2_mpyu_hh_s1>;
+def : T_RR_pat <M2_mpyu_hh_s0, int_hexagon_M2_mpyu_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_ll_s1, int_hexagon_M2_mpy_sat_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_ll_s0, int_hexagon_M2_mpy_sat_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_lh_s1, int_hexagon_M2_mpy_sat_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_lh_s0, int_hexagon_M2_mpy_sat_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_hl_s1, int_hexagon_M2_mpy_sat_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_hl_s0, int_hexagon_M2_mpy_sat_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_hh_s1, int_hexagon_M2_mpy_sat_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_hh_s0, int_hexagon_M2_mpy_sat_hh_s0>;
+
+def : T_RR_pat <M2_mpy_rnd_ll_s1, int_hexagon_M2_mpy_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_rnd_ll_s0, int_hexagon_M2_mpy_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_rnd_lh_s1, int_hexagon_M2_mpy_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_rnd_lh_s0, int_hexagon_M2_mpy_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_rnd_hl_s1, int_hexagon_M2_mpy_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_rnd_hl_s0, int_hexagon_M2_mpy_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_rnd_hh_s1, int_hexagon_M2_mpy_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_rnd_hh_s0, int_hexagon_M2_mpy_rnd_hh_s0>;
+
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s1, int_hexagon_M2_mpy_sat_rnd_ll_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_ll_s0, int_hexagon_M2_mpy_sat_rnd_ll_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s1, int_hexagon_M2_mpy_sat_rnd_lh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_lh_s0, int_hexagon_M2_mpy_sat_rnd_lh_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s1, int_hexagon_M2_mpy_sat_rnd_hl_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hl_s0, int_hexagon_M2_mpy_sat_rnd_hl_s0>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s1, int_hexagon_M2_mpy_sat_rnd_hh_s1>;
+def : T_RR_pat <M2_mpy_sat_rnd_hh_s0, int_hexagon_M2_mpy_sat_rnd_hh_s0>;
+
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the accumulator.
+//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_RRR_pat <M2_mpy_acc_ll_s1, int_hexagon_M2_mpy_acc_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_ll_s0, int_hexagon_M2_mpy_acc_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_lh_s1, int_hexagon_M2_mpy_acc_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_lh_s0, int_hexagon_M2_mpy_acc_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_hl_s1, int_hexagon_M2_mpy_acc_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_hl_s0, int_hexagon_M2_mpy_acc_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_hh_s1, int_hexagon_M2_mpy_acc_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_hh_s0, int_hexagon_M2_mpy_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_acc_ll_s1, int_hexagon_M2_mpyu_acc_ll_s1>;
+def : T_RRR_pat <M2_mpyu_acc_ll_s0, int_hexagon_M2_mpyu_acc_ll_s0>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s1, int_hexagon_M2_mpyu_acc_lh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_lh_s0, int_hexagon_M2_mpyu_acc_lh_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s1, int_hexagon_M2_mpyu_acc_hl_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hl_s0, int_hexagon_M2_mpyu_acc_hl_s0>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s1, int_hexagon_M2_mpyu_acc_hh_s1>;
+def : T_RRR_pat <M2_mpyu_acc_hh_s0, int_hexagon_M2_mpyu_acc_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_ll_s1, int_hexagon_M2_mpy_nac_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_ll_s0, int_hexagon_M2_mpy_nac_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_lh_s1, int_hexagon_M2_mpy_nac_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_lh_s0, int_hexagon_M2_mpy_nac_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_hl_s1, int_hexagon_M2_mpy_nac_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_hl_s0, int_hexagon_M2_mpy_nac_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_hh_s1, int_hexagon_M2_mpy_nac_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_hh_s0, int_hexagon_M2_mpy_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpyu_nac_ll_s1, int_hexagon_M2_mpyu_nac_ll_s1>;
+def : T_RRR_pat <M2_mpyu_nac_ll_s0, int_hexagon_M2_mpyu_nac_ll_s0>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s1, int_hexagon_M2_mpyu_nac_lh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_lh_s0, int_hexagon_M2_mpyu_nac_lh_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s1, int_hexagon_M2_mpyu_nac_hl_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hl_s0, int_hexagon_M2_mpyu_nac_hl_s0>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s1, int_hexagon_M2_mpyu_nac_hh_s1>;
+def : T_RRR_pat <M2_mpyu_nac_hh_s0, int_hexagon_M2_mpyu_nac_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s1, int_hexagon_M2_mpy_acc_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_ll_s0, int_hexagon_M2_mpy_acc_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s1, int_hexagon_M2_mpy_acc_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_lh_s0, int_hexagon_M2_mpy_acc_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s1, int_hexagon_M2_mpy_acc_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hl_s0, int_hexagon_M2_mpy_acc_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s1, int_hexagon_M2_mpy_acc_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_acc_sat_hh_s0, int_hexagon_M2_mpy_acc_sat_hh_s0>;
+
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s1, int_hexagon_M2_mpy_nac_sat_ll_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_ll_s0, int_hexagon_M2_mpy_nac_sat_ll_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s1, int_hexagon_M2_mpy_nac_sat_lh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_lh_s0, int_hexagon_M2_mpy_nac_sat_lh_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s1, int_hexagon_M2_mpy_nac_sat_hl_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hl_s0, int_hexagon_M2_mpy_nac_sat_hl_s0>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s1, int_hexagon_M2_mpy_nac_sat_hh_s1>;
+def : T_RRR_pat <M2_mpy_nac_sat_hh_s0, int_hexagon_M2_mpy_nac_sat_hh_s0>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiply signed/unsigned halfwords with and without saturation and rounding
+// into a 64-bits destination register.
+//===----------------------------------------------------------------------===//
+
+def : T_RR_pat <M2_mpyd_hh_s0, int_hexagon_M2_mpyd_hh_s0>;
+def : T_RR_pat <M2_mpyd_hl_s0, int_hexagon_M2_mpyd_hl_s0>;
+def : T_RR_pat <M2_mpyd_lh_s0, int_hexagon_M2_mpyd_lh_s0>;
+def : T_RR_pat <M2_mpyd_ll_s0, int_hexagon_M2_mpyd_ll_s0>;
+def : T_RR_pat <M2_mpyd_hh_s1, int_hexagon_M2_mpyd_hh_s1>;
+def : T_RR_pat <M2_mpyd_hl_s1, int_hexagon_M2_mpyd_hl_s1>;
+def : T_RR_pat <M2_mpyd_lh_s1, int_hexagon_M2_mpyd_lh_s1>;
+def : T_RR_pat <M2_mpyd_ll_s1, int_hexagon_M2_mpyd_ll_s1>;
+
+def : T_RR_pat <M2_mpyd_rnd_hh_s0, int_hexagon_M2_mpyd_rnd_hh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s0, int_hexagon_M2_mpyd_rnd_hl_s0>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s0, int_hexagon_M2_mpyd_rnd_lh_s0>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s0, int_hexagon_M2_mpyd_rnd_ll_s0>;
+def : T_RR_pat <M2_mpyd_rnd_hh_s1, int_hexagon_M2_mpyd_rnd_hh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_hl_s1, int_hexagon_M2_mpyd_rnd_hl_s1>;
+def : T_RR_pat <M2_mpyd_rnd_lh_s1, int_hexagon_M2_mpyd_rnd_lh_s1>;
+def : T_RR_pat <M2_mpyd_rnd_ll_s1, int_hexagon_M2_mpyd_rnd_ll_s1>;
+
+def : T_RR_pat <M2_mpyud_hh_s0, int_hexagon_M2_mpyud_hh_s0>;
+def : T_RR_pat <M2_mpyud_hl_s0, int_hexagon_M2_mpyud_hl_s0>;
+def : T_RR_pat <M2_mpyud_lh_s0, int_hexagon_M2_mpyud_lh_s0>;
+def : T_RR_pat <M2_mpyud_ll_s0, int_hexagon_M2_mpyud_ll_s0>;
+def : T_RR_pat <M2_mpyud_hh_s1, int_hexagon_M2_mpyud_hh_s1>;
+def : T_RR_pat <M2_mpyud_hl_s1, int_hexagon_M2_mpyud_hl_s1>;
+def : T_RR_pat <M2_mpyud_lh_s1, int_hexagon_M2_mpyud_lh_s1>;
+def : T_RR_pat <M2_mpyud_ll_s1, int_hexagon_M2_mpyud_ll_s1>;
+
+//===----------------------------------------------------------------------===//
+// MPYS / Multipy signed/unsigned halfwords and add/subtract the
+// result from the 64-bit destination register.
+//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
+//===----------------------------------------------------------------------===//
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s0, int_hexagon_M2_mpyd_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s0, int_hexagon_M2_mpyd_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s0, int_hexagon_M2_mpyd_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s0, int_hexagon_M2_mpyd_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_acc_hh_s1, int_hexagon_M2_mpyd_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_hl_s1, int_hexagon_M2_mpyd_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyd_acc_lh_s1, int_hexagon_M2_mpyd_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyd_acc_ll_s1, int_hexagon_M2_mpyd_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s0, int_hexagon_M2_mpyd_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s0, int_hexagon_M2_mpyd_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s0, int_hexagon_M2_mpyd_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s0, int_hexagon_M2_mpyd_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyd_nac_hh_s1, int_hexagon_M2_mpyd_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_hl_s1, int_hexagon_M2_mpyd_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyd_nac_lh_s1, int_hexagon_M2_mpyd_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyd_nac_ll_s1, int_hexagon_M2_mpyd_nac_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s0, int_hexagon_M2_mpyud_acc_hh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s0, int_hexagon_M2_mpyud_acc_hl_s0>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s0, int_hexagon_M2_mpyud_acc_lh_s0>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s0, int_hexagon_M2_mpyud_acc_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_acc_hh_s1, int_hexagon_M2_mpyud_acc_hh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_hl_s1, int_hexagon_M2_mpyud_acc_hl_s1>;
+def : T_PRR_pat <M2_mpyud_acc_lh_s1, int_hexagon_M2_mpyud_acc_lh_s1>;
+def : T_PRR_pat <M2_mpyud_acc_ll_s1, int_hexagon_M2_mpyud_acc_ll_s1>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s0, int_hexagon_M2_mpyud_nac_hh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s0, int_hexagon_M2_mpyud_nac_hl_s0>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s0, int_hexagon_M2_mpyud_nac_lh_s0>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s0, int_hexagon_M2_mpyud_nac_ll_s0>;
+
+def : T_PRR_pat <M2_mpyud_nac_hh_s1, int_hexagon_M2_mpyud_nac_hh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_hl_s1, int_hexagon_M2_mpyud_nac_hl_s1>;
+def : T_PRR_pat <M2_mpyud_nac_lh_s1, int_hexagon_M2_mpyud_nac_lh_s1>;
+def : T_PRR_pat <M2_mpyud_nac_ll_s1, int_hexagon_M2_mpyud_nac_ll_s1>;
+
+// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_i, int_hexagon_M2_vcmpy_s1_sat_i>;
+def : T_PP_pat <M2_vcmpy_s0_sat_i, int_hexagon_M2_vcmpy_s0_sat_i>;
+
+// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vcmpy_s1_sat_r, int_hexagon_M2_vcmpy_s1_sat_r>;
+def : T_PP_pat <M2_vcmpy_s0_sat_r, int_hexagon_M2_vcmpy_s0_sat_r>;
+
+// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vdmpys_s1, int_hexagon_M2_vdmpys_s1>;
+def : T_PP_pat <M2_vdmpys_s0, int_hexagon_M2_vdmpys_s0>;
+
+// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
+def : T_PP_pat <M2_vmpy2es_s1, int_hexagon_M2_vmpy2es_s1>;
+def : T_PP_pat <M2_vmpy2es_s0, int_hexagon_M2_vmpy2es_s0>;
+
+//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyh_s0,  int_hexagon_M2_mmpyh_s0>;
+def : T_PP_pat <M2_mmpyh_s1,  int_hexagon_M2_mmpyh_s1>;
+def : T_PP_pat <M2_mmpyh_rs0, int_hexagon_M2_mmpyh_rs0>;
+def : T_PP_pat <M2_mmpyh_rs1, int_hexagon_M2_mmpyh_rs1>;
+
+//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyl_s0,  int_hexagon_M2_mmpyl_s0>;
+def : T_PP_pat <M2_mmpyl_s1,  int_hexagon_M2_mmpyl_s1>;
+def : T_PP_pat <M2_mmpyl_rs0, int_hexagon_M2_mmpyl_rs0>;
+def : T_PP_pat <M2_mmpyl_rs1, int_hexagon_M2_mmpyl_rs1>;
+
+//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyuh_s0,  int_hexagon_M2_mmpyuh_s0>;
+def : T_PP_pat <M2_mmpyuh_s1,  int_hexagon_M2_mmpyuh_s1>;
+def : T_PP_pat <M2_mmpyuh_rs0, int_hexagon_M2_mmpyuh_rs0>;
+def : T_PP_pat <M2_mmpyuh_rs1, int_hexagon_M2_mmpyuh_rs1>;
+
+//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PP_pat <M2_mmpyul_s0,  int_hexagon_M2_mmpyul_s0>;
+def : T_PP_pat <M2_mmpyul_s1,  int_hexagon_M2_mmpyul_s1>;
+def : T_PP_pat <M2_mmpyul_rs0, int_hexagon_M2_mmpyul_rs0>;
+def : T_PP_pat <M2_mmpyul_rs1, int_hexagon_M2_mmpyul_rs1>;
+
+// Vector reduce add unsigned bytes: Rdd32[+]=vrmpybu(Rss32,Rtt32)
+def : T_PP_pat  <A2_vraddub,     int_hexagon_A2_vraddub>;
+def : T_PPP_pat <A2_vraddub_acc, int_hexagon_A2_vraddub_acc>;
+
+// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
+def : T_PP_pat  <A2_vrsadub,     int_hexagon_A2_vrsadub>;
+def : T_PPP_pat <A2_vrsadub_acc, int_hexagon_A2_vrsadub_acc>;
+
+// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffh, int_hexagon_M2_vabsdiffh>;
+
+// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
+def : T_PP_pat <M2_vabsdiffw, int_hexagon_M2_vabsdiffw>;
+
+// Vector reduce complex multiply real or imaginary:
+// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
+def : T_PP_pat  <M2_vrcmpyi_s0,  int_hexagon_M2_vrcmpyi_s0>;
+def : T_PP_pat  <M2_vrcmpyi_s0c, int_hexagon_M2_vrcmpyi_s0c>;
+def : T_PPP_pat <M2_vrcmaci_s0,  int_hexagon_M2_vrcmaci_s0>;
+def : T_PPP_pat <M2_vrcmaci_s0c, int_hexagon_M2_vrcmaci_s0c>;
+
+def : T_PP_pat  <M2_vrcmpyr_s0,  int_hexagon_M2_vrcmpyr_s0>;
+def : T_PP_pat  <M2_vrcmpyr_s0c, int_hexagon_M2_vrcmpyr_s0c>;
+def : T_PPP_pat <M2_vrcmacr_s0,  int_hexagon_M2_vrcmacr_s0>;
+def : T_PPP_pat <M2_vrcmacr_s0c, int_hexagon_M2_vrcmacr_s0c>;
+
+// Vector reduce halfwords
+// Rdd[+]=vrmpyh(Rss,Rtt)
+def : T_PP_pat  <M2_vrmpy_s0, int_hexagon_M2_vrmpy_s0>;
+def : T_PPP_pat <M2_vrmac_s0, int_hexagon_M2_vrmac_s0>;
+
+//===----------------------------------------------------------------------===//
+// Vector Multipy with accumulation
+//===----------------------------------------------------------------------===//
+
+// Vector multiply word by signed half with accumulation
+// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmacls_s1, int_hexagon_M2_mmacls_s1>;
+def : T_PPP_pat <M2_mmacls_s0, int_hexagon_M2_mmacls_s0>;
+def : T_PPP_pat <M2_mmacls_rs1, int_hexagon_M2_mmacls_rs1>;
+def : T_PPP_pat <M2_mmacls_rs0, int_hexagon_M2_mmacls_rs0>;
+def : T_PPP_pat <M2_mmachs_s1, int_hexagon_M2_mmachs_s1>;
+def : T_PPP_pat <M2_mmachs_s0, int_hexagon_M2_mmachs_s0>;
+def : T_PPP_pat <M2_mmachs_rs1, int_hexagon_M2_mmachs_rs1>;
+def : T_PPP_pat <M2_mmachs_rs0, int_hexagon_M2_mmachs_rs0>;
+
+// Vector multiply word by unsigned half with accumulation
+// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
+def : T_PPP_pat <M2_mmaculs_s1, int_hexagon_M2_mmaculs_s1>;
+def : T_PPP_pat <M2_mmaculs_s0, int_hexagon_M2_mmaculs_s0>;
+def : T_PPP_pat <M2_mmaculs_rs1, int_hexagon_M2_mmaculs_rs1>;
+def : T_PPP_pat <M2_mmaculs_rs0, int_hexagon_M2_mmaculs_rs0>;
+def : T_PPP_pat <M2_mmacuhs_s1, int_hexagon_M2_mmacuhs_s1>;
+def : T_PPP_pat <M2_mmacuhs_s0, int_hexagon_M2_mmacuhs_s0>;
+def : T_PPP_pat <M2_mmacuhs_rs1, int_hexagon_M2_mmacuhs_rs1>;
+def : T_PPP_pat <M2_mmacuhs_rs0, int_hexagon_M2_mmacuhs_rs0>;
+
+// Vector multiply even halfwords with accumulation
+// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
+def : T_PPP_pat <M2_vmac2es, int_hexagon_M2_vmac2es>;
+def : T_PPP_pat <M2_vmac2es_s1, int_hexagon_M2_vmac2es_s1>;
+def : T_PPP_pat <M2_vmac2es_s0, int_hexagon_M2_vmac2es_s0>;
+
+// Vector dual multiply with accumulation
+// Rxx+=vdmpy(Rss,Rtt)[:sat]
+def : T_PPP_pat <M2_vdmacs_s1, int_hexagon_M2_vdmacs_s1>;
+def : T_PPP_pat <M2_vdmacs_s0, int_hexagon_M2_vdmacs_s0>;
+
+// Vector complex multiply real or imaginary with accumulation
+// Rxx+=vcmpy[ir](Rss,Rtt):sat
+def : T_PPP_pat <M2_vcmac_s0_sat_r, int_hexagon_M2_vcmac_s0_sat_r>;
+def : T_PPP_pat <M2_vcmac_s0_sat_i, int_hexagon_M2_vcmac_s0_sat_i>;
+
+//===----------------------------------------------------------------------===//
+// Add/Subtract halfword
+// Rd=add(Rt.L,Rs.[HL])[:sat]
+// Rd=sub(Rt.L,Rs.[HL])[:sat]
+// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
+// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
+//===----------------------------------------------------------------------===//
+
+//Rd=add(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_addh_l16_ll,     int_hexagon_A2_addh_l16_ll>;
+def : T_RR_pat <A2_addh_l16_hl,     int_hexagon_A2_addh_l16_hl>;
+
+//Rd=add(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_addh_l16_sat_ll, int_hexagon_A2_addh_l16_sat_ll>;
+def : T_RR_pat <A2_addh_l16_sat_hl, int_hexagon_A2_addh_l16_sat_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH])
+def : T_RR_pat <A2_subh_l16_ll,     int_hexagon_A2_subh_l16_ll>;
+def : T_RR_pat <A2_subh_l16_hl,     int_hexagon_A2_subh_l16_hl>;
+
+//Rd=sub(Rt.L,Rs.[LH]):sat
+def : T_RR_pat <A2_subh_l16_sat_ll, int_hexagon_A2_subh_l16_sat_ll>;
+def : T_RR_pat <A2_subh_l16_sat_hl, int_hexagon_A2_subh_l16_sat_hl>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_addh_h16_ll,     int_hexagon_A2_addh_h16_ll>;
+def : T_RR_pat <A2_addh_h16_lh,     int_hexagon_A2_addh_h16_lh>;
+def : T_RR_pat <A2_addh_h16_hl,     int_hexagon_A2_addh_h16_hl>;
+def : T_RR_pat <A2_addh_h16_hh,     int_hexagon_A2_addh_h16_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):<<16
+def : T_RR_pat <A2_subh_h16_ll,     int_hexagon_A2_subh_h16_ll>;
+def : T_RR_pat <A2_subh_h16_lh,     int_hexagon_A2_subh_h16_lh>;
+def : T_RR_pat <A2_subh_h16_hl,     int_hexagon_A2_subh_h16_hl>;
+def : T_RR_pat <A2_subh_h16_hh,     int_hexagon_A2_subh_h16_hh>;
+
+//Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_addh_h16_sat_ll, int_hexagon_A2_addh_h16_sat_ll>;
+def : T_RR_pat <A2_addh_h16_sat_lh, int_hexagon_A2_addh_h16_sat_lh>;
+def : T_RR_pat <A2_addh_h16_sat_hl, int_hexagon_A2_addh_h16_sat_hl>;
+def : T_RR_pat <A2_addh_h16_sat_hh, int_hexagon_A2_addh_h16_sat_hh>;
+
+//Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
+def : T_RR_pat <A2_subh_h16_sat_ll, int_hexagon_A2_subh_h16_sat_ll>;
+def : T_RR_pat <A2_subh_h16_sat_lh, int_hexagon_A2_subh_h16_sat_lh>;
+def : T_RR_pat <A2_subh_h16_sat_hl, int_hexagon_A2_subh_h16_sat_hl>;
+def : T_RR_pat <A2_subh_h16_sat_hh, int_hexagon_A2_subh_h16_sat_hh>;
+
+// ALU64 / ALU / min max
+def : T_RR_pat<A2_max,  int_hexagon_A2_max>;
+def : T_RR_pat<A2_min,  int_hexagon_A2_min>;
+def : T_RR_pat<A2_maxu, int_hexagon_A2_maxu>;
+def : T_RR_pat<A2_minu, int_hexagon_A2_minu>;
+
+// Shift and accumulate
+def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
+
+def : T_RRI_pat <S2_asr_i_r_nac,  int_hexagon_S2_asr_i_r_nac>;
+def : T_RRI_pat <S2_lsr_i_r_nac,  int_hexagon_S2_lsr_i_r_nac>;
+def : T_RRI_pat <S2_asl_i_r_nac,  int_hexagon_S2_asl_i_r_nac>;
+def : T_RRI_pat <S2_asr_i_r_acc,  int_hexagon_S2_asr_i_r_acc>;
+def : T_RRI_pat <S2_lsr_i_r_acc,  int_hexagon_S2_lsr_i_r_acc>;
+def : T_RRI_pat <S2_asl_i_r_acc,  int_hexagon_S2_asl_i_r_acc>;
+
+def : T_RRI_pat <S2_asr_i_r_and,  int_hexagon_S2_asr_i_r_and>;
+def : T_RRI_pat <S2_lsr_i_r_and,  int_hexagon_S2_lsr_i_r_and>;
+def : T_RRI_pat <S2_asl_i_r_and,  int_hexagon_S2_asl_i_r_and>;
+def : T_RRI_pat <S2_asr_i_r_or,   int_hexagon_S2_asr_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_or,   int_hexagon_S2_lsr_i_r_or>;
+def : T_RRI_pat <S2_asl_i_r_or,   int_hexagon_S2_asl_i_r_or>;
+def : T_RRI_pat <S2_lsr_i_r_xacc, int_hexagon_S2_lsr_i_r_xacc>;
+def : T_RRI_pat <S2_asl_i_r_xacc, int_hexagon_S2_asl_i_r_xacc>;
+
+def : T_PPI_pat <S2_asr_i_p_nac,  int_hexagon_S2_asr_i_p_nac>;
+def : T_PPI_pat <S2_lsr_i_p_nac,  int_hexagon_S2_lsr_i_p_nac>;
+def : T_PPI_pat <S2_asl_i_p_nac,  int_hexagon_S2_asl_i_p_nac>;
+def : T_PPI_pat <S2_asr_i_p_acc,  int_hexagon_S2_asr_i_p_acc>;
+def : T_PPI_pat <S2_lsr_i_p_acc,  int_hexagon_S2_lsr_i_p_acc>;
+def : T_PPI_pat <S2_asl_i_p_acc,  int_hexagon_S2_asl_i_p_acc>;
+
+def : T_PPI_pat <S2_asr_i_p_and,  int_hexagon_S2_asr_i_p_and>;
+def : T_PPI_pat <S2_lsr_i_p_and,  int_hexagon_S2_lsr_i_p_and>;
+def : T_PPI_pat <S2_asl_i_p_and,  int_hexagon_S2_asl_i_p_and>;
+def : T_PPI_pat <S2_asr_i_p_or,   int_hexagon_S2_asr_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_or,   int_hexagon_S2_lsr_i_p_or>;
+def : T_PPI_pat <S2_asl_i_p_or,   int_hexagon_S2_asl_i_p_or>;
+def : T_PPI_pat <S2_lsr_i_p_xacc, int_hexagon_S2_lsr_i_p_xacc>;
+def : T_PPI_pat <S2_asl_i_p_xacc, int_hexagon_S2_asl_i_p_xacc>;
+
+def : T_RRR_pat <S2_asr_r_r_nac,  int_hexagon_S2_asr_r_r_nac>;
+def : T_RRR_pat <S2_lsr_r_r_nac,  int_hexagon_S2_lsr_r_r_nac>;
+def : T_RRR_pat <S2_asl_r_r_nac,  int_hexagon_S2_asl_r_r_nac>;
+def : T_RRR_pat <S2_lsl_r_r_nac,  int_hexagon_S2_lsl_r_r_nac>;
+def : T_RRR_pat <S2_asr_r_r_acc,  int_hexagon_S2_asr_r_r_acc>;
+def : T_RRR_pat <S2_lsr_r_r_acc,  int_hexagon_S2_lsr_r_r_acc>;
+def : T_RRR_pat <S2_asl_r_r_acc,  int_hexagon_S2_asl_r_r_acc>;
+def : T_RRR_pat <S2_lsl_r_r_acc,  int_hexagon_S2_lsl_r_r_acc>;
+
+def : T_RRR_pat <S2_asr_r_r_and,  int_hexagon_S2_asr_r_r_and>;
+def : T_RRR_pat <S2_lsr_r_r_and,  int_hexagon_S2_lsr_r_r_and>;
+def : T_RRR_pat <S2_asl_r_r_and,  int_hexagon_S2_asl_r_r_and>;
+def : T_RRR_pat <S2_lsl_r_r_and,  int_hexagon_S2_lsl_r_r_and>;
+def : T_RRR_pat <S2_asr_r_r_or,   int_hexagon_S2_asr_r_r_or>;
+def : T_RRR_pat <S2_lsr_r_r_or,   int_hexagon_S2_lsr_r_r_or>;
+def : T_RRR_pat <S2_asl_r_r_or,   int_hexagon_S2_asl_r_r_or>;
+def : T_RRR_pat <S2_lsl_r_r_or,   int_hexagon_S2_lsl_r_r_or>;
+
+def : T_PPR_pat <S2_asr_r_p_nac,  int_hexagon_S2_asr_r_p_nac>;
+def : T_PPR_pat <S2_lsr_r_p_nac,  int_hexagon_S2_lsr_r_p_nac>;
+def : T_PPR_pat <S2_asl_r_p_nac,  int_hexagon_S2_asl_r_p_nac>;
+def : T_PPR_pat <S2_lsl_r_p_nac,  int_hexagon_S2_lsl_r_p_nac>;
+def : T_PPR_pat <S2_asr_r_p_acc,  int_hexagon_S2_asr_r_p_acc>;
+def : T_PPR_pat <S2_lsr_r_p_acc,  int_hexagon_S2_lsr_r_p_acc>;
+def : T_PPR_pat <S2_asl_r_p_acc,  int_hexagon_S2_asl_r_p_acc>;
+def : T_PPR_pat <S2_lsl_r_p_acc,  int_hexagon_S2_lsl_r_p_acc>;
+
+def : T_PPR_pat <S2_asr_r_p_and,  int_hexagon_S2_asr_r_p_and>;
+def : T_PPR_pat <S2_lsr_r_p_and,  int_hexagon_S2_lsr_r_p_and>;
+def : T_PPR_pat <S2_asl_r_p_and,  int_hexagon_S2_asl_r_p_and>;
+def : T_PPR_pat <S2_lsl_r_p_and,  int_hexagon_S2_lsl_r_p_and>;
+def : T_PPR_pat <S2_asr_r_p_or,   int_hexagon_S2_asr_r_p_or>;
+def : T_PPR_pat <S2_lsr_r_p_or,   int_hexagon_S2_lsr_r_p_or>;
+def : T_PPR_pat <S2_asl_r_p_or,   int_hexagon_S2_asl_r_p_or>;
+def : T_PPR_pat <S2_lsl_r_p_or,   int_hexagon_S2_lsl_r_p_or>;
+
+//*******************************************************************
+//           ALU32/ALU
+//*******************************************************************
+def : T_RR_pat<A2_add,      int_hexagon_A2_add>;
+def : T_RI_pat<A2_addi,     int_hexagon_A2_addi>;
+def : T_RR_pat<A2_sub,      int_hexagon_A2_sub>;
+def : T_IR_pat<A2_subri,    int_hexagon_A2_subri>;
+def : T_RR_pat<A2_and,      int_hexagon_A2_and>;
+def : T_RI_pat<A2_andir,    int_hexagon_A2_andir>;
+def : T_RR_pat<A2_or,       int_hexagon_A2_or>;
+def : T_RI_pat<A2_orir,     int_hexagon_A2_orir>;
+def : T_RR_pat<A2_xor,      int_hexagon_A2_xor>;
+def : T_RR_pat<A2_combinew, int_hexagon_A2_combinew>;
+
+// Assembler mapped from Rd32=not(Rs32) to Rd32=sub(#-1,Rs32)
+def : Pat <(int_hexagon_A2_not I32:$Rs),
+           (A2_subri -1, I32:$Rs)>;
+
+// Assembler mapped from Rd32=neg(Rs32) to Rd32=sub(#0,Rs32)
+def : Pat <(int_hexagon_A2_neg I32:$Rs),
+           (A2_subri 0, I32:$Rs)>;
+
+// Transfer immediate
+def  : Pat <(int_hexagon_A2_tfril I32:$Rs, u16_0ImmPred:$Is),
+            (A2_tfril I32:$Rs, u16_0ImmPred:$Is)>;
+def  : Pat <(int_hexagon_A2_tfrih I32:$Rs, u16_0ImmPred:$Is),
+            (A2_tfrih I32:$Rs, u16_0ImmPred:$Is)>;
+
+//  Transfer Register/immediate.
+def : T_R_pat <A2_tfr, int_hexagon_A2_tfr>;
+def : T_I_pat <A2_tfrsi, int_hexagon_A2_tfrsi>;
+
+def ImmExt64: SDNodeXForm<imm, [{
+  int64_t V = N->getSExtValue();
+  return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i64);
+}]>;
+
+// A2_tfrpi has an operand of type i64. This is necessary, since it is
+// generated from "(set I64:$Rd, imm)". That pattern would not appear
+// in the DAG, if the immediate was not a 64-bit value.
+// The builtin for A2_tfrpi, on the other hand, takes a 32-bit value,
+// which makes it impossible to simply replace it with the instruction.
+// To connect the builtin with the instruction, the builtin's operand
+// needs to be extended to the right type.
+
+def : Pat<(int_hexagon_A2_tfrpi imm:$Is),
+          (A2_tfrpi (ImmExt64 $Is))>;
+
+// Assembler mapped from Rdd32=Rss32 to Rdd32=combine(Rss.H32,Rss.L32)
+def : Pat<(int_hexagon_A2_tfrp I64:$src),
+          (A2_combinew (HiReg I64:$src), (LoReg I64:$src))>;
+
+//*******************************************************************
+//           ALU32/PERM
+//*******************************************************************
+// Combine
+def: T_RR_pat<A2_combine_hh, int_hexagon_A2_combine_hh>;
+def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>;
+def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>;
+def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>;
+
+def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32_0ImmPred, s8_0ImmPred>;
+
+// Mux
+def : T_QRR_pat<C2_mux,   int_hexagon_C2_mux>;
+def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32_0ImmPred>;
+def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32_0ImmPred>;
+def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32_0ImmPred, s8_0ImmPred>;
+
+// Shift halfword
+def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>;
+def : T_R_pat<A2_asrh, int_hexagon_A2_asrh>;
+def : T_R_pat<A2_asrh, int_hexagon_SI_to_SXTHI_asrh>;
+
+// Sign/zero extend
+def : T_R_pat<A2_sxth, int_hexagon_A2_sxth>;
+def : T_R_pat<A2_sxtb, int_hexagon_A2_sxtb>;
+def : T_R_pat<A2_zxth, int_hexagon_A2_zxth>;
+def : T_R_pat<A2_zxtb, int_hexagon_A2_zxtb>;
+
+//*******************************************************************
+//           ALU32/PRED
+//*******************************************************************
+// Compare
+def : T_Q_RR_pat<C2_cmpeq,  int_hexagon_C2_cmpeq>;
+def : T_Q_RR_pat<C2_cmpgt,  int_hexagon_C2_cmpgt>;
+def : T_Q_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>;
+
+def : T_Q_RI_pat<C2_cmpeqi,  int_hexagon_C2_cmpeqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C2_cmpgti,  int_hexagon_C2_cmpgti, s32_0ImmPred>;
+def : T_Q_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32_0ImmPred>;
+
+def : Pat <(int_hexagon_C2_cmpgei I32:$src1, s32_0ImmPred:$src2),
+           (C2_tfrpr (C2_cmpgti I32:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src1, u32_0ImmPred:$src2),
+           (C2_tfrpr (C2_cmpgtui I32:$src1, (UDEC1 u32_0ImmPred:$src2)))>;
+
+def : Pat <(int_hexagon_C2_cmpgeui I32:$src, 0),
+           (C2_tfrpr (C2_cmpeq I32:$src, I32:$src))>;
+def : Pat <(int_hexagon_C2_cmplt I32:$src1, I32:$src2),
+           (C2_tfrpr (C2_cmpgt I32:$src2, I32:$src1))>;
+def : Pat <(int_hexagon_C2_cmpltu I32:$src1, I32:$src2),
+           (C2_tfrpr (C2_cmpgtu I32:$src2, I32:$src1))>;
+
+//*******************************************************************
+//           ALU32/VH
+//*******************************************************************
+// Vector add, subtract, average halfwords
+def: T_RR_pat<A2_svaddh,   int_hexagon_A2_svaddh>;
+def: T_RR_pat<A2_svaddhs,  int_hexagon_A2_svaddhs>;
+def: T_RR_pat<A2_svadduhs, int_hexagon_A2_svadduhs>;
+
+def: T_RR_pat<A2_svsubh,   int_hexagon_A2_svsubh>;
+def: T_RR_pat<A2_svsubhs,  int_hexagon_A2_svsubhs>;
+def: T_RR_pat<A2_svsubuhs, int_hexagon_A2_svsubuhs>;
+
+def: T_RR_pat<A2_svavgh,   int_hexagon_A2_svavgh>;
+def: T_RR_pat<A2_svavghs,  int_hexagon_A2_svavghs>;
+def: T_RR_pat<A2_svnavgh,  int_hexagon_A2_svnavgh>;
+
+//*******************************************************************
+//           ALU64/ALU
+//*******************************************************************
+def: T_RR_pat<A2_addsat,     int_hexagon_A2_addsat>;
+def: T_RR_pat<A2_subsat,     int_hexagon_A2_subsat>;
+def: T_PP_pat<A2_addp,       int_hexagon_A2_addp>;
+def: T_PP_pat<A2_subp,       int_hexagon_A2_subp>;
+
+def: T_PP_pat<A2_andp,       int_hexagon_A2_andp>;
+def: T_PP_pat<A2_orp,        int_hexagon_A2_orp>;
+def: T_PP_pat<A2_xorp,       int_hexagon_A2_xorp>;
+
+def: T_Q_PP_pat<C2_cmpeqp,   int_hexagon_C2_cmpeqp>;
+def: T_Q_PP_pat<C2_cmpgtp,   int_hexagon_C2_cmpgtp>;
+def: T_Q_PP_pat<C2_cmpgtup,  int_hexagon_C2_cmpgtup>;
+
+def: T_PP_pat<S2_parityp,    int_hexagon_S2_parityp>;
+def: T_RR_pat<S2_packhl,     int_hexagon_S2_packhl>;
+
+//*******************************************************************
+//           ALU64/VB
+//*******************************************************************
+// ALU64 - Vector add
+def : T_PP_pat <A2_vaddub,   int_hexagon_A2_vaddub>;
+def : T_PP_pat <A2_vaddubs,  int_hexagon_A2_vaddubs>;
+def : T_PP_pat <A2_vaddh,    int_hexagon_A2_vaddh>;
+def : T_PP_pat <A2_vaddhs,   int_hexagon_A2_vaddhs>;
+def : T_PP_pat <A2_vadduhs,  int_hexagon_A2_vadduhs>;
+def : T_PP_pat <A2_vaddw,    int_hexagon_A2_vaddw>;
+def : T_PP_pat <A2_vaddws,   int_hexagon_A2_vaddws>;
+
+// ALU64 - Vector average
+def : T_PP_pat <A2_vavgub,   int_hexagon_A2_vavgub>;
+def : T_PP_pat <A2_vavgubr,  int_hexagon_A2_vavgubr>;
+def : T_PP_pat <A2_vavgh,    int_hexagon_A2_vavgh>;
+def : T_PP_pat <A2_vavghr,   int_hexagon_A2_vavghr>;
+def : T_PP_pat <A2_vavghcr,  int_hexagon_A2_vavghcr>;
+def : T_PP_pat <A2_vavguh,   int_hexagon_A2_vavguh>;
+def : T_PP_pat <A2_vavguhr,  int_hexagon_A2_vavguhr>;
+
+def : T_PP_pat <A2_vavgw,    int_hexagon_A2_vavgw>;
+def : T_PP_pat <A2_vavgwr,   int_hexagon_A2_vavgwr>;
+def : T_PP_pat <A2_vavgwcr,  int_hexagon_A2_vavgwcr>;
+def : T_PP_pat <A2_vavguw,   int_hexagon_A2_vavguw>;
+def : T_PP_pat <A2_vavguwr,  int_hexagon_A2_vavguwr>;
+
+// ALU64 - Vector negative average
+def : T_PP_pat <A2_vnavgh,   int_hexagon_A2_vnavgh>;
+def : T_PP_pat <A2_vnavghr,  int_hexagon_A2_vnavghr>;
+def : T_PP_pat <A2_vnavghcr, int_hexagon_A2_vnavghcr>;
+def : T_PP_pat <A2_vnavgw,   int_hexagon_A2_vnavgw>;
+def : T_PP_pat <A2_vnavgwr,  int_hexagon_A2_vnavgwr>;
+def : T_PP_pat <A2_vnavgwcr, int_hexagon_A2_vnavgwcr>;
+
+// ALU64 - Vector max
+def : T_PP_pat <A2_vmaxh,    int_hexagon_A2_vmaxh>;
+def : T_PP_pat <A2_vmaxw,    int_hexagon_A2_vmaxw>;
+def : T_PP_pat <A2_vmaxub,   int_hexagon_A2_vmaxub>;
+def : T_PP_pat <A2_vmaxuh,   int_hexagon_A2_vmaxuh>;
+def : T_PP_pat <A2_vmaxuw,   int_hexagon_A2_vmaxuw>;
+
+// ALU64 - Vector min
+def : T_PP_pat <A2_vminh,    int_hexagon_A2_vminh>;
+def : T_PP_pat <A2_vminw,    int_hexagon_A2_vminw>;
+def : T_PP_pat <A2_vminub,   int_hexagon_A2_vminub>;
+def : T_PP_pat <A2_vminuh,   int_hexagon_A2_vminuh>;
+def : T_PP_pat <A2_vminuw,   int_hexagon_A2_vminuw>;
+
+// ALU64 - Vector sub
+def : T_PP_pat <A2_vsubub,   int_hexagon_A2_vsubub>;
+def : T_PP_pat <A2_vsububs,  int_hexagon_A2_vsububs>;
+def : T_PP_pat <A2_vsubh,    int_hexagon_A2_vsubh>;
+def : T_PP_pat <A2_vsubhs,   int_hexagon_A2_vsubhs>;
+def : T_PP_pat <A2_vsubuhs,  int_hexagon_A2_vsubuhs>;
+def : T_PP_pat <A2_vsubw,    int_hexagon_A2_vsubw>;
+def : T_PP_pat <A2_vsubws,   int_hexagon_A2_vsubws>;
+
+// ALU64 - Vector compare bytes
+def : T_Q_PP_pat <A2_vcmpbeq,  int_hexagon_A2_vcmpbeq>;
+def : T_Q_PP_pat <A4_vcmpbgt,  int_hexagon_A4_vcmpbgt>;
+def : T_Q_PP_pat <A2_vcmpbgtu, int_hexagon_A2_vcmpbgtu>;
+
+// ALU64 - Vector compare halfwords
+def : T_Q_PP_pat <A2_vcmpheq,  int_hexagon_A2_vcmpheq>;
+def : T_Q_PP_pat <A2_vcmphgt,  int_hexagon_A2_vcmphgt>;
+def : T_Q_PP_pat <A2_vcmphgtu, int_hexagon_A2_vcmphgtu>;
+
+// ALU64 - Vector compare words
+def : T_Q_PP_pat <A2_vcmpweq,  int_hexagon_A2_vcmpweq>;
+def : T_Q_PP_pat <A2_vcmpwgt,  int_hexagon_A2_vcmpwgt>;
+def : T_Q_PP_pat <A2_vcmpwgtu, int_hexagon_A2_vcmpwgtu>;
+
+// ALU64 / VB / Vector mux.
+def : T_QPP_pat <C2_vmux,      int_hexagon_C2_vmux>;
+
+// MPY - Multiply and use full result
+// Rdd = mpy[u](Rs, Rt)
+def : T_RR_pat <M2_dpmpyss_s0, int_hexagon_M2_dpmpyss_s0>;
+def : T_RR_pat <M2_dpmpyuu_s0, int_hexagon_M2_dpmpyuu_s0>;
+
+// Complex multiply real or imaginary
+def : T_RR_pat <M2_cmpyi_s0,   int_hexagon_M2_cmpyi_s0>;
+def : T_RR_pat <M2_cmpyr_s0,   int_hexagon_M2_cmpyr_s0>;
+
+// Complex multiply
+def : T_RR_pat <M2_cmpys_s0,   int_hexagon_M2_cmpys_s0>;
+def : T_RR_pat <M2_cmpysc_s0,  int_hexagon_M2_cmpysc_s0>;
+def : T_RR_pat <M2_cmpys_s1,   int_hexagon_M2_cmpys_s1>;
+def : T_RR_pat <M2_cmpysc_s1,  int_hexagon_M2_cmpysc_s1>;
+
+// Vector multiply halfwords
+// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2s_s0,  int_hexagon_M2_vmpy2s_s0>;
+def : T_RR_pat <M2_vmpy2s_s1,  int_hexagon_M2_vmpy2s_s1>;
+
+// Rxx[+-]= mpy[u](Rs,Rt)
+def : T_PRR_pat <M2_dpmpyss_acc_s0, int_hexagon_M2_dpmpyss_acc_s0>;
+def : T_PRR_pat <M2_dpmpyss_nac_s0, int_hexagon_M2_dpmpyss_nac_s0>;
+def : T_PRR_pat <M2_dpmpyuu_acc_s0, int_hexagon_M2_dpmpyuu_acc_s0>;
+def : T_PRR_pat <M2_dpmpyuu_nac_s0, int_hexagon_M2_dpmpyuu_nac_s0>;
+
+// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_cmacs_s0, int_hexagon_M2_cmacs_s0>;
+def : T_PRR_pat <M2_cnacs_s0, int_hexagon_M2_cnacs_s0>;
+def : T_PRR_pat <M2_cmacs_s1, int_hexagon_M2_cmacs_s1>;
+def : T_PRR_pat <M2_cnacs_s1, int_hexagon_M2_cnacs_s1>;
+
+// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
+def : T_PRR_pat <M2_cmacsc_s0, int_hexagon_M2_cmacsc_s0>;
+def : T_PRR_pat <M2_cnacsc_s0, int_hexagon_M2_cnacsc_s0>;
+def : T_PRR_pat <M2_cmacsc_s1, int_hexagon_M2_cmacsc_s1>;
+def : T_PRR_pat <M2_cnacsc_s1, int_hexagon_M2_cnacsc_s1>;
+
+// Rxx+=cmpy[ir](Rs,Rt)
+def : T_PRR_pat <M2_cmaci_s0, int_hexagon_M2_cmaci_s0>;
+def : T_PRR_pat <M2_cmacr_s0, int_hexagon_M2_cmacr_s0>;
+
+// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
+def : T_PRR_pat <M2_vmac2, int_hexagon_M2_vmac2>;
+def : T_PRR_pat <M2_vmac2s_s0, int_hexagon_M2_vmac2s_s0>;
+def : T_PRR_pat <M2_vmac2s_s1, int_hexagon_M2_vmac2s_s1>;
+
+//*******************************************************************
+//           CR
+//*******************************************************************
+def: T_Q_Q_pat<C2_not,       int_hexagon_C2_not>;
+def: T_Q_Q_pat<C2_all8,      int_hexagon_C2_all8>;
+def: T_Q_Q_pat<C2_any8,      int_hexagon_C2_any8>;
+def: T_Q_Q_pat<C2_pxfer_map, int_hexagon_C2_pxfer_map>;
+
+def: T_Q_QQ_pat<C2_and,      int_hexagon_C2_and>;
+def: T_Q_QQ_pat<C2_andn,     int_hexagon_C2_andn>;
+def: T_Q_QQ_pat<C2_or,       int_hexagon_C2_or>;
+def: T_Q_QQ_pat<C2_orn,      int_hexagon_C2_orn>;
+def: T_Q_QQ_pat<C2_xor,      int_hexagon_C2_xor>;
+
+// Multiply 32x32 and use lower result
+def : T_RRI_pat <M2_macsip, int_hexagon_M2_macsip>;
+def : T_RRI_pat <M2_macsin, int_hexagon_M2_macsin>;
+def : T_RRR_pat <M2_maci,   int_hexagon_M2_maci>;
+
+// Subtract and accumulate
+def : T_RRR_pat <M2_subacc, int_hexagon_M2_subacc>;
+
+// Add and accumulate
+def : T_RRR_pat <M2_acci,   int_hexagon_M2_acci>;
+def : T_RRR_pat <M2_nacci,  int_hexagon_M2_nacci>;
+def : T_RRI_pat <M2_accii,  int_hexagon_M2_accii>;
+def : T_RRI_pat <M2_naccii, int_hexagon_M2_naccii>;
+
+// XOR and XOR with destination
+def : T_RRR_pat <M2_xor_xacc, int_hexagon_M2_xor_xacc>;
+
+// Vector dual multiply with round and pack
+def : T_PP_pat <M2_vdmpyrs_s0, int_hexagon_M2_vdmpyrs_s0>;
+def : T_PP_pat <M2_vdmpyrs_s1, int_hexagon_M2_vdmpyrs_s1>;
+
+// Vector multiply halfwords with round and pack
+def : T_RR_pat <M2_vmpy2s_s0pack, int_hexagon_M2_vmpy2s_s0pack>;
+def : T_RR_pat <M2_vmpy2s_s1pack, int_hexagon_M2_vmpy2s_s1pack>;
+
+// Multiply and use lower result
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyi>;
+def : T_RI_pat <M2_mpysmi, int_hexagon_M2_mpysmi>;
+
+// Assembler mapped from Rd32=mpyui(Rs32,Rt32) to Rd32=mpyi(Rs32,Rt32)
+def : T_RR_pat <M2_mpyi, int_hexagon_M2_mpyui>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpy_up, int_hexagon_M2_mpy_up>;
+def : T_RR_pat <M2_mpyu_up, int_hexagon_M2_mpyu_up>;
+def : T_RR_pat <M2_hmmpyh_rs1, int_hexagon_M2_hmmpyh_rs1>;
+def : T_RR_pat <M2_hmmpyl_rs1, int_hexagon_M2_hmmpyl_rs1>;
+def : T_RR_pat <M2_dpmpyss_rnd_s0, int_hexagon_M2_dpmpyss_rnd_s0>;
+
+// Complex multiply with round and pack
+// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
+def : T_RR_pat <M2_cmpyrs_s0, int_hexagon_M2_cmpyrs_s0>;
+def : T_RR_pat <M2_cmpyrs_s1, int_hexagon_M2_cmpyrs_s1>;
+def : T_RR_pat <M2_cmpyrsc_s0, int_hexagon_M2_cmpyrsc_s0>;
+def : T_RR_pat <M2_cmpyrsc_s1, int_hexagon_M2_cmpyrsc_s1>;
+
+//*******************************************************************
+//           STYPE/ALU
+//*******************************************************************
+def : T_P_pat <A2_absp, int_hexagon_A2_absp>;
+def : T_P_pat <A2_negp, int_hexagon_A2_negp>;
+def : T_P_pat <A2_notp, int_hexagon_A2_notp>;
+
+//*******************************************************************
+//           STYPE/BIT
+//*******************************************************************
+
+// Count leading/trailing
+def: T_R_pat<S2_cl0,     int_hexagon_S2_cl0>;
+def: T_P_pat<S2_cl0p,    int_hexagon_S2_cl0p>;
+def: T_R_pat<S2_cl1,     int_hexagon_S2_cl1>;
+def: T_P_pat<S2_cl1p,    int_hexagon_S2_cl1p>;
+def: T_R_pat<S2_clb,     int_hexagon_S2_clb>;
+def: T_P_pat<S2_clbp,    int_hexagon_S2_clbp>;
+def: T_R_pat<S2_clbnorm, int_hexagon_S2_clbnorm>;
+def: T_R_pat<S2_ct0,     int_hexagon_S2_ct0>;
+def: T_R_pat<S2_ct1,     int_hexagon_S2_ct1>;
+
+// Compare bit mask
+def: T_RR_pat<C2_bitsclr,  int_hexagon_C2_bitsclr>;
+def: T_RI_pat<C2_bitsclri, int_hexagon_C2_bitsclri>;
+def: T_RR_pat<C2_bitsset,  int_hexagon_C2_bitsset>;
+
+// Vector shuffle
+def : T_PP_pat <S2_shuffeb, int_hexagon_S2_shuffeb>;
+def : T_PP_pat <S2_shuffob, int_hexagon_S2_shuffob>;
+def : T_PP_pat <S2_shuffeh, int_hexagon_S2_shuffeh>;
+def : T_PP_pat <S2_shuffoh, int_hexagon_S2_shuffoh>;
+
+// Vector truncate
+def : T_PP_pat <S2_vtrunewh, int_hexagon_S2_vtrunewh>;
+def : T_PP_pat <S2_vtrunowh, int_hexagon_S2_vtrunowh>;
+
+// Linear feedback-shift Iteration.
+def : T_PP_pat <S2_lfsp, int_hexagon_S2_lfsp>;
+
+// Vector align
+// Need custom lowering
+def : T_PPQ_pat <S2_valignrb, int_hexagon_S2_valignrb>;
+def : T_PPI_pat <S2_valignib, int_hexagon_S2_valignib>;
+
+// Vector splice
+def : T_PPQ_pat <S2_vsplicerb, int_hexagon_S2_vsplicerb>;
+def : T_PPI_pat <S2_vspliceib, int_hexagon_S2_vspliceib>;
+
+// Shift by immediate and add
+def : T_RRI_pat<S2_addasl_rrri, int_hexagon_S2_addasl_rrri>;
+
+// Extract bitfield
+def : T_PII_pat<S2_extractup,    int_hexagon_S2_extractup>;
+def : T_RII_pat<S2_extractu,     int_hexagon_S2_extractu>;
+def : T_RP_pat <S2_extractu_rp,  int_hexagon_S2_extractu_rp>;
+def : T_PP_pat <S2_extractup_rp, int_hexagon_S2_extractup_rp>;
+
+// Insert bitfield
+def : Pat <(int_hexagon_S2_insert_rp I32:$src1, I32:$src2, I64:$src3),
+           (S2_insert_rp I32:$src1, I32:$src2, I64:$src3)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp_rp I64:$src1, I64:$src2, I64:$src3)),
+          (i64 (S2_insertp_rp I64:$src1, I64:$src2, I64:$src3))>;
+
+def : Pat<(int_hexagon_S2_insert I32:$src1, I32:$src2,
+                                 u5_0ImmPred:$src3, u5_0ImmPred:$src4),
+          (S2_insert I32:$src1, I32:$src2,
+                     u5_0ImmPred:$src3, u5_0ImmPred:$src4)>;
+
+def : Pat<(i64 (int_hexagon_S2_insertp I64:$src1, I64:$src2,
+                                       u6_0ImmPred:$src3, u6_0ImmPred:$src4)),
+          (i64 (S2_insertp I64:$src1, I64:$src2,
+                           u6_0ImmPred:$src3, u6_0ImmPred:$src4))>;
+
+// Innterleave/deinterleave
+def : T_P_pat <S2_interleave, int_hexagon_S2_interleave>;
+def : T_P_pat <S2_deinterleave, int_hexagon_S2_deinterleave>;
+
+// Set/Clear/Toggle Bit
+def: T_RI_pat<S2_setbit_i,    int_hexagon_S2_setbit_i>;
+def: T_RI_pat<S2_clrbit_i,    int_hexagon_S2_clrbit_i>;
+def: T_RI_pat<S2_togglebit_i, int_hexagon_S2_togglebit_i>;
+
+def: T_RR_pat<S2_setbit_r,    int_hexagon_S2_setbit_r>;
+def: T_RR_pat<S2_clrbit_r,    int_hexagon_S2_clrbit_r>;
+def: T_RR_pat<S2_togglebit_r, int_hexagon_S2_togglebit_r>;
+
+// Test Bit
+def: T_Q_RI_pat<S2_tstbit_i,  int_hexagon_S2_tstbit_i>;
+def: T_Q_RR_pat<S2_tstbit_r,  int_hexagon_S2_tstbit_r>;
+
+//*******************************************************************
+//           STYPE/COMPLEX
+//*******************************************************************
+// Vector Complex conjugate
+def : T_P_pat <A2_vconj, int_hexagon_A2_vconj>;
+
+// Vector Complex rotate
+def : T_PR_pat <S2_vcrotate, int_hexagon_S2_vcrotate>;
+
+//*******************************************************************
+//           STYPE/PERM
+//*******************************************************************
+
+// Vector saturate without pack
+def : T_P_pat <S2_vsathb_nopack, int_hexagon_S2_vsathb_nopack>;
+def : T_P_pat <S2_vsathub_nopack, int_hexagon_S2_vsathub_nopack>;
+def : T_P_pat <S2_vsatwh_nopack, int_hexagon_S2_vsatwh_nopack>;
+def : T_P_pat <S2_vsatwuh_nopack, int_hexagon_S2_vsatwuh_nopack>;
+
+//*******************************************************************
+//           STYPE/PRED
+//*******************************************************************
+
+// Predicate transfer
+def: Pat<(i32 (int_hexagon_C2_tfrpr I32:$Rs)),
+         (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
+def: Pat<(i32 (int_hexagon_C2_tfrrp I32:$Rs)),
+         (i32 (C2_tfrpr (C2_tfrrp I32:$Rs)))>;
+
+// Mask generate from predicate
+def: Pat<(i64 (int_hexagon_C2_mask I32:$Rs)),
+         (i64 (C2_mask (C2_tfrrp I32:$Rs)))>;
+
+// Viterbi pack even and odd predicate bits
+def: T_QQ_pat<C2_vitpack, int_hexagon_C2_vitpack>;
+
+//*******************************************************************
+//           STYPE/SHIFT
+//*******************************************************************
+
+def : T_PI_pat <S2_asr_i_p, int_hexagon_S2_asr_i_p>;
+def : T_PI_pat <S2_lsr_i_p, int_hexagon_S2_lsr_i_p>;
+def : T_PI_pat <S2_asl_i_p, int_hexagon_S2_asl_i_p>;
+
+def : T_PR_pat <S2_asr_r_p, int_hexagon_S2_asr_r_p>;
+def : T_PR_pat <S2_lsr_r_p, int_hexagon_S2_lsr_r_p>;
+def : T_PR_pat <S2_asl_r_p, int_hexagon_S2_asl_r_p>;
+def : T_PR_pat <S2_lsl_r_p, int_hexagon_S2_lsl_r_p>;
+
+def : T_RR_pat <S2_asr_r_r, int_hexagon_S2_asr_r_r>;
+def : T_RR_pat <S2_lsr_r_r, int_hexagon_S2_lsr_r_r>;
+def : T_RR_pat <S2_asl_r_r, int_hexagon_S2_asl_r_r>;
+def : T_RR_pat <S2_lsl_r_r, int_hexagon_S2_lsl_r_r>;
+
+def : T_RR_pat <S2_asr_r_r_sat, int_hexagon_S2_asr_r_r_sat>;
+def : T_RR_pat <S2_asl_r_r_sat, int_hexagon_S2_asl_r_r_sat>;
+
+def : T_R_pat <S2_vsxtbh,   int_hexagon_S2_vsxtbh>;
+def : T_R_pat <S2_vzxtbh,   int_hexagon_S2_vzxtbh>;
+def : T_R_pat <S2_vsxthw,   int_hexagon_S2_vsxthw>;
+def : T_R_pat <S2_vzxthw,   int_hexagon_S2_vzxthw>;
+def : T_R_pat <S2_vsplatrh, int_hexagon_S2_vsplatrh>;
+def : T_R_pat <A2_sxtw,     int_hexagon_A2_sxtw>;
+
+// Vector saturate and pack
+def : T_R_pat <S2_svsathb,  int_hexagon_S2_svsathb>;
+def : T_R_pat <S2_svsathub, int_hexagon_S2_svsathub>;
+def : T_P_pat <S2_vsathub,  int_hexagon_S2_vsathub>;
+def : T_P_pat <S2_vsatwh,   int_hexagon_S2_vsatwh>;
+def : T_P_pat <S2_vsatwuh,  int_hexagon_S2_vsatwuh>;
+def : T_P_pat <S2_vsathb,   int_hexagon_S2_vsathb>;
+
+def : T_P_pat <S2_vtrunohb,    int_hexagon_S2_vtrunohb>;
+def : T_P_pat <S2_vtrunehb,    int_hexagon_S2_vtrunehb>;
+def : T_P_pat <S2_vrndpackwh,  int_hexagon_S2_vrndpackwh>;
+def : T_P_pat <S2_vrndpackwhs, int_hexagon_S2_vrndpackwhs>;
+def : T_R_pat <S2_brev,        int_hexagon_S2_brev>;
+def : T_R_pat <S2_vsplatrb,    int_hexagon_S2_vsplatrb>;
+
+def : T_R_pat <A2_abs,    int_hexagon_A2_abs>;
+def : T_R_pat <A2_abssat, int_hexagon_A2_abssat>;
+def : T_R_pat <A2_negsat, int_hexagon_A2_negsat>;
+
+def : T_R_pat <A2_swiz,   int_hexagon_A2_swiz>;
+
+def : T_P_pat <A2_sat,    int_hexagon_A2_sat>;
+def : T_R_pat <A2_sath,   int_hexagon_A2_sath>;
+def : T_R_pat <A2_satuh,  int_hexagon_A2_satuh>;
+def : T_R_pat <A2_satub,  int_hexagon_A2_satub>;
+def : T_R_pat <A2_satb,   int_hexagon_A2_satb>;
+
+// Vector arithmetic shift right by immediate with truncate and pack.
+def : T_PI_pat<S2_asr_i_svw_trun, int_hexagon_S2_asr_i_svw_trun>;
+
+def : T_RI_pat <S2_asr_i_r,     int_hexagon_S2_asr_i_r>;
+def : T_RI_pat <S2_lsr_i_r,     int_hexagon_S2_lsr_i_r>;
+def : T_RI_pat <S2_asl_i_r,     int_hexagon_S2_asl_i_r>;
+def : T_RI_pat <S2_asr_i_r_rnd, int_hexagon_S2_asr_i_r_rnd>;
+def : T_RI_pat <S2_asr_i_r_rnd_goodsyntax,
+                int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
+
+// Shift left by immediate with saturation.
+def : T_RI_pat <S2_asl_i_r_sat, int_hexagon_S2_asl_i_r_sat>;
+
+//===----------------------------------------------------------------------===//
+// Template 'def pat' to map tableidx[bhwd] intrinsics to :raw instructions.
+//===----------------------------------------------------------------------===//
+class S2op_tableidx_pat <Intrinsic IntID, InstHexagon OutputInst,
+                         SDNodeXForm XformImm>
+  : Pat <(IntID I32:$src1, I32:$src2, u4_0ImmPred:$src3, u5_0ImmPred:$src4),
+         (OutputInst I32:$src1, I32:$src2, u4_0ImmPred:$src3,
+                     (XformImm u5_0ImmPred:$src4))>;
+
+def SDEC2 : SDNodeXForm<imm, [{
+  int32_t V = N->getSExtValue();
+  return CurDAG->getTargetConstant(V-2, SDLoc(N), MVT::i32);
+}]>;
+
+def SDEC3 : SDNodeXForm<imm, [{
+  int32_t V = N->getSExtValue();
+  return CurDAG->getTargetConstant(V-3, SDLoc(N), MVT::i32);
+}]>;
+
+// Table Index : Extract and insert bits.
+// Map to the real hardware instructions after subtracting appropriate
+// values from the 4th input operand. Please note that subtraction is not
+// needed for int_hexagon_S2_tableidxb_goodsyntax.
+
+def : Pat <(int_hexagon_S2_tableidxb_goodsyntax I32:$src1, I32:$src2,
+                                              u4_0ImmPred:$src3, u5_0ImmPred:$src4),
+           (S2_tableidxb I32:$src1, I32:$src2,
+                         u4_0ImmPred:$src3, u5_0ImmPred:$src4)>;
+
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxh_goodsyntax, S2_tableidxh,
+                         SDEC1>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxw_goodsyntax, S2_tableidxw,
+                         SDEC2>;
+def : S2op_tableidx_pat <int_hexagon_S2_tableidxd_goodsyntax, S2_tableidxd,
+                         SDEC3>;
+
+//*******************************************************************
+//           STYPE/VH
+//*******************************************************************
+
+// Vector absolute value halfwords with and without saturation
+// Rdd64=vabsh(Rss64)[:sat]
+def : T_P_pat <A2_vabsh, int_hexagon_A2_vabsh>;
+def : T_P_pat <A2_vabshsat, int_hexagon_A2_vabshsat>;
+
+// Vector shift halfwords by immediate
+// Rdd64=[vaslh/vasrh/vlsrh](Rss64,u4)
+def : T_PI_pat <S2_asr_i_vh, int_hexagon_S2_asr_i_vh>;
+def : T_PI_pat <S2_lsr_i_vh, int_hexagon_S2_lsr_i_vh>;
+def : T_PI_pat <S2_asl_i_vh, int_hexagon_S2_asl_i_vh>;
+
+// Vector shift halfwords by register
+// Rdd64=[vaslw/vasrw/vlslw/vlsrw](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vh, int_hexagon_S2_asr_r_vh>;
+def : T_PR_pat <S2_lsr_r_vh, int_hexagon_S2_lsr_r_vh>;
+def : T_PR_pat <S2_asl_r_vh, int_hexagon_S2_asl_r_vh>;
+def : T_PR_pat <S2_lsl_r_vh, int_hexagon_S2_lsl_r_vh>;
+
+//*******************************************************************
+//           STYPE/VW
+//*******************************************************************
+
+// Vector absolute value words with and without saturation
+def : T_P_pat <A2_vabsw, int_hexagon_A2_vabsw>;
+def : T_P_pat <A2_vabswsat, int_hexagon_A2_vabswsat>;
+
+// Vector shift words by immediate.
+// Rdd64=[vasrw/vlsrw|vaslw](Rss64,u5)
+def : T_PI_pat <S2_asr_i_vw, int_hexagon_S2_asr_i_vw>;
+def : T_PI_pat <S2_lsr_i_vw, int_hexagon_S2_lsr_i_vw>;
+def : T_PI_pat <S2_asl_i_vw, int_hexagon_S2_asl_i_vw>;
+
+// Vector shift words by register.
+// Rdd64=[vasrw/vlsrw|vaslw|vlslw](Rss64,Rt32)
+def : T_PR_pat <S2_asr_r_vw, int_hexagon_S2_asr_r_vw>;
+def : T_PR_pat <S2_lsr_r_vw, int_hexagon_S2_lsr_r_vw>;
+def : T_PR_pat <S2_asl_r_vw, int_hexagon_S2_asl_r_vw>;
+def : T_PR_pat <S2_lsl_r_vw, int_hexagon_S2_lsl_r_vw>;
+
+// Vector shift words with truncate and pack
+def : T_PR_pat <S2_asr_r_svw_trun, int_hexagon_S2_asr_r_svw_trun>;
+
+// Load/store locked.
+def : T_R_pat<L2_loadw_locked, int_hexagon_L2_loadw_locked>;
+def : T_R_pat<L4_loadd_locked, int_hexagon_L4_loadd_locked>;
+
+def : Pat<(int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt),
+          (C2_tfrpr (S2_storew_locked I32:$Rs, I32:$Rt))>;
+def : Pat<(int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt),
+          (C2_tfrpr (S4_stored_locked I32:$Rs, I64:$Rt))>;
+
+//*******************************************************************
+//           ST
+//*******************************************************************
+
+class T_stb_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Val>
+  : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru),
+        (MI I32:$Rs, I32:$Ru, Val:$Rt)>;
+
+def : T_stb_pat <S2_storerh_pbr, int_hexagon_brev_sth,   I32>;
+def : T_stb_pat <S2_storerb_pbr, int_hexagon_brev_stb,   I32>;
+def : T_stb_pat <S2_storeri_pbr, int_hexagon_brev_stw,   I32>;
+def : T_stb_pat <S2_storerf_pbr, int_hexagon_brev_sthhi, I32>;
+def : T_stb_pat <S2_storerd_pbr, int_hexagon_brev_std,   I64>;
+
+class T_stc_pat <InstHexagon MI, Intrinsic IntID, PatLeaf Imm, PatLeaf Val>
+  : Pat<(IntID I32:$Rs, Val:$Rt, I32:$Ru, Imm:$s),
+        (MI I32:$Rs, Imm:$s, I32:$Ru, Val:$Rt)>;
+
+def: T_stc_pat<S2_storerb_pci, int_hexagon_circ_stb,   s4_0ImmPred, I32>;
+def: T_stc_pat<S2_storerh_pci, int_hexagon_circ_sth,   s4_1ImmPred, I32>;
+def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw,   s4_2ImmPred, I32>;
+def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std,   s4_3ImmPred, I64>;
+def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
+
+include "HexagonIntrinsicsV3.td"
+include "HexagonIntrinsicsV4.td"
+include "HexagonIntrinsicsV5.td"
+include "HexagonIntrinsicsV60.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
new file mode 100644
index 000000000000..400c17333f73
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -0,0 +1,40 @@
+//===-- HexagonIntrinsicsDerived.td - Derived intrinsics ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Multiply 64-bit and use lower result
+//
+// Optimized with intrinisics accumulates
+//
+def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
+      (i64
+       (A2_combinew
+        (M2_maci
+         (M2_maci
+          (i32
+           (EXTRACT_SUBREG
+            (i64
+             (M2_dpmpyuu_s0 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                          isub_lo)),
+                     (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                          isub_lo)))),
+            isub_hi)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_lo)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), isub_hi))),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), isub_lo)),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_hi))),
+        (i32
+         (EXTRACT_SUBREG
+          (i64
+           (M2_dpmpyuu_s0 
+             (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), isub_lo)),
+                   (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                        isub_lo)))), isub_lo))))>;
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
new file mode 100644
index 000000000000..6152cb098825
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
@@ -0,0 +1,27 @@
+//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Vector reduce complex multiply real or imaginary
+def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
+def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
+def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
+
+def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
+def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
+def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
+def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
+def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
+def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
new file mode 100644
index 000000000000..2affe531515d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -0,0 +1,305 @@
+//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V4 Architecture Extensions
+// Application-Level Specification
+// 80-V9418-12 Rev. A
+// June 15, 2010
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
+def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
+def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
+def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
+def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
+// Rxx[^]=vpmpyh(Rs,Rt)
+def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
+// Rxx^=pmpyw(Rs,Rt)
+def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
+
+//Rxx^=asr(Rss,Rt)
+def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
+//Rxx^=asl(Rss,Rt)
+def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
+//Rxx^=lsr(Rss,Rt)
+def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
+//Rxx^=lsl(Rss,Rt)
+def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
+def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
+def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
+def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
+def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
+
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
+
+def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
+def: T_P_pat<S2_ct0p,  int_hexagon_S2_ct0p>;
+def: T_P_pat<S2_ct1p,  int_hexagon_S2_ct1p>;
+
+def: T_Q_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
+def: T_Q_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
+def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+def : T_Q_PI_pat<A4_vcmpbeqi,     int_hexagon_A4_vcmpbeqi>;
+def : T_Q_PI_pat<A4_vcmpbgti,     int_hexagon_A4_vcmpbgti>;
+def : T_Q_PI_pat<A4_vcmpbgtui,    int_hexagon_A4_vcmpbgtui>;
+def : T_Q_PI_pat<A4_vcmpheqi,     int_hexagon_A4_vcmpheqi>;
+def : T_Q_PI_pat<A4_vcmphgti,     int_hexagon_A4_vcmphgti>;
+def : T_Q_PI_pat<A4_vcmphgtui,    int_hexagon_A4_vcmphgtui>;
+def : T_Q_PI_pat<A4_vcmpweqi,     int_hexagon_A4_vcmpweqi>;
+def : T_Q_PI_pat<A4_vcmpwgti,     int_hexagon_A4_vcmpwgti>;
+def : T_Q_PI_pat<A4_vcmpwgtui,    int_hexagon_A4_vcmpwgtui>;
+def : T_Q_PP_pat<A4_vcmpbeq_any,  int_hexagon_A4_vcmpbeq_any>;
+
+def : T_Q_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
+def : T_Q_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
+def : T_Q_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
+def : T_Q_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
+def : T_Q_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
+def : T_Q_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
+
+def : T_Q_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
+def : T_Q_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
+def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_Q_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
+def : T_Q_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
+def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
+def : T_Q_PR_pat<A4_tlbmatch,    int_hexagon_A4_tlbmatch>;
+
+def : T_RRR_pat <M4_mpyrr_addr,    int_hexagon_M4_mpyrr_addr>;
+def : T_IRR_pat <M4_mpyrr_addi,    int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi,    int_hexagon_M4_mpyri_addi>;
+def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
+def : T_RRI_pat <M4_mpyri_addr,    int_hexagon_M4_mpyri_addr>;
+def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
+def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
+
+// Complex multiply 32x16
+def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
+def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
+
+def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
+def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
+
+def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
+def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
+
+// Complex add/sub halfwords/words
+def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
+def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
+def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
+def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
+
+def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
+def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
+
+// Extract bitfield
+def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
+def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
+def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
+def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
+
+// Shift an immediate left by register amount
+def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
+
+// Vector reduce maximum halfwords
+def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
+def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
+
+// Vector reduce maximum words
+def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
+def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
+
+// Vector reduce minimum halfwords
+def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
+def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
+
+// Vector reduce minimum words
+def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
+def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
+
+// Rotate and reduce bytes
+def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
+                                     u2_0ImmPred:$src3),
+           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                                         IntRegs:$src3, u2_0ImmPred:$src4),
+           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                             IntRegs:$src3, u2_0ImmPred:$src4)>;
+
+// Vector conditional negate
+def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
+
+// Logical xor with xor accumulation
+def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
+
+// ALU64 - Vector min/max byte
+def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
+def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
+
+// Shift and add/sub/and/or
+def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
+def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
+def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
+def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
+def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
+def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
+def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
+def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
+
+// Split bitfield
+def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
+def : T_RR_pat <A4_bitsplit,  int_hexagon_A4_bitsplit>;
+
+def: T_RR_pat<S4_parity,      int_hexagon_S4_parity>;
+
+def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
+def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
+
+def: T_RI_pat<S4_clbaddi,     int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi,    int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm,    int_hexagon_S4_clbpnorm>;
+
+//*******************************************************************
+//            ALU32/ALU
+//*******************************************************************
+
+// ALU32 / ALU / Logical Operations.
+def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
+def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
+
+//*******************************************************************
+//            ALU32/PERM
+//*******************************************************************
+
+// Combine Words Into Doublewords.
+def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
+def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
+
+//*******************************************************************
+//           ALU32/PRED
+//*******************************************************************
+
+// Compare
+def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
+
+// Compare To General Register.
+def: T_Q_RR_pat<C4_cmpneq,  int_hexagon_C4_cmpneq>;
+def: T_Q_RR_pat<C4_cmplte,  int_hexagon_C4_cmplte>;
+def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
+
+def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
+def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
+
+def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
+def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
+
+//*******************************************************************
+//           CR
+//*******************************************************************
+
+// CR / Logical Operations On Predicates.
+def: T_Q_QQQ_pat<C4_and_and,  int_hexagon_C4_and_and>;
+def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
+def: T_Q_QQQ_pat<C4_and_or,   int_hexagon_C4_and_or>;
+def: T_Q_QQQ_pat<C4_and_orn,  int_hexagon_C4_and_orn>;
+def: T_Q_QQQ_pat<C4_or_and,   int_hexagon_C4_or_and>;
+def: T_Q_QQQ_pat<C4_or_andn,  int_hexagon_C4_or_andn>;
+def: T_Q_QQQ_pat<C4_or_or,    int_hexagon_C4_or_or>;
+def: T_Q_QQQ_pat<C4_or_orn,   int_hexagon_C4_or_orn>;
+
+//*******************************************************************
+//           XTYPE/ALU
+//*******************************************************************
+
+// Add And Accumulate.
+
+def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
+def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
+
+
+// XTYPE / ALU / Logical-logical Words.
+def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
+def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
+def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
+def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
+def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
+def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
+def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
+def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
+def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
+def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
+def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
+
+def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
+def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
+def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
+
+// Modulo wrap.
+def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
+
+// Arithmetic/Convergent round
+// Rd=[cround|round](Rs,Rt)[:sat]
+// Rd=[cround|round](Rs,#u5)[:sat]
+def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
+def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
+
+def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
+def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
+
+def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
+def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
+
+def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
new file mode 100644
index 000000000000..f27a63e20e61
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -0,0 +1,111 @@
+//===- HexagonIntrinsicsV5.td - V5 Instruction intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//Rdd[+]=vrmpybsu(Rss,Rtt)
+//Rdd[+]=vrmpybuu(Rss,Rtt)
+let Predicates = [HasV5T]  in {
+def : T_PP_pat  <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
+def : T_PP_pat  <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
+
+def : T_PP_pat <M5_vdmpybsu, int_hexagon_M5_vdmpybsu>;
+
+def : T_PPP_pat <M5_vrmacbsu, int_hexagon_M5_vrmacbsu>;
+def : T_PPP_pat <M5_vrmacbuu, int_hexagon_M5_vrmacbuu>;
+//Rxx+=vdmpybsu(Rss,Rtt):sat
+def : T_PPP_pat <M5_vdmacbsu, int_hexagon_M5_vdmacbsu>;
+
+// Vector multiply bytes
+// Rdd=vmpyb[s]u(Rs,Rt)
+def : T_RR_pat <M5_vmpybsu, int_hexagon_M5_vmpybsu>;
+def : T_RR_pat <M5_vmpybuu, int_hexagon_M5_vmpybuu>;
+
+// Rxx+=vmpyb[s]u(Rs,Rt)
+def : T_PRR_pat <M5_vmacbsu, int_hexagon_M5_vmacbsu>;
+def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
+
+// Rd=vaddhub(Rss,Rtt):sat
+def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
+}
+
+def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
+def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
+def : T_FF_pat<F2_sfmpy, int_hexagon_F2_sfmpy>;
+def : T_FF_pat<F2_sfmax, int_hexagon_F2_sfmax>;
+def : T_FF_pat<F2_sfmin, int_hexagon_F2_sfmin>;
+
+def : T_FF_pat<F2_sffixupn, int_hexagon_F2_sffixupn>;
+def : T_FF_pat<F2_sffixupd, int_hexagon_F2_sffixupd>;
+def : T_F_pat <F2_sffixupr, int_hexagon_F2_sffixupr>;
+
+def : T_Q_QQ_pat<C4_fastcorner9,     int_hexagon_C4_fastcorner9>;
+def : T_Q_QQ_pat<C4_fastcorner9_not, int_hexagon_C4_fastcorner9_not>;
+
+def : T_P_pat <S5_popcountp, int_hexagon_S5_popcountp>;
+def : T_PI_pat <S5_asrhub_sat, int_hexagon_S5_asrhub_sat>;
+
+def : T_PI_pat <S2_asr_i_p_rnd, int_hexagon_S2_asr_i_p_rnd>;
+def : T_PI_pat <S2_asr_i_p_rnd_goodsyntax,
+                int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+
+def : T_PI_pat <S5_asrhub_rnd_sat_goodsyntax,
+                int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+
+def : T_PI_pat <S5_vasrhrnd_goodsyntax, int_hexagon_S5_vasrhrnd_goodsyntax>;
+
+def : T_FFF_pat <F2_sffma, int_hexagon_F2_sffma>;
+def : T_FFF_pat <F2_sffms, int_hexagon_F2_sffms>;
+def : T_FFF_pat <F2_sffma_lib, int_hexagon_F2_sffma_lib>;
+def : T_FFF_pat <F2_sffms_lib, int_hexagon_F2_sffms_lib>;
+def : T_FFFQ_pat <F2_sffma_sc, int_hexagon_F2_sffma_sc>;
+
+// Compare floating-point value
+def : T_Q_FF_pat <F2_sfcmpge, int_hexagon_F2_sfcmpge>;
+def : T_Q_FF_pat <F2_sfcmpuo, int_hexagon_F2_sfcmpuo>;
+def : T_Q_FF_pat <F2_sfcmpeq, int_hexagon_F2_sfcmpeq>;
+def : T_Q_FF_pat <F2_sfcmpgt, int_hexagon_F2_sfcmpgt>;
+
+def : T_Q_DD_pat <F2_dfcmpeq, int_hexagon_F2_dfcmpeq>;
+def : T_Q_DD_pat <F2_dfcmpgt, int_hexagon_F2_dfcmpgt>;
+def : T_Q_DD_pat <F2_dfcmpge, int_hexagon_F2_dfcmpge>;
+def : T_Q_DD_pat <F2_dfcmpuo, int_hexagon_F2_dfcmpuo>;
+
+// Create floating-point value
+def : T_I_pat <F2_sfimm_p, int_hexagon_F2_sfimm_p>;
+def : T_I_pat <F2_sfimm_n, int_hexagon_F2_sfimm_n>;
+def : T_I_pat <F2_dfimm_p, int_hexagon_F2_dfimm_p>;
+def : T_I_pat <F2_dfimm_n, int_hexagon_F2_dfimm_n>;
+
+def : T_Q_DI_pat <F2_dfclass, int_hexagon_F2_dfclass>;
+def : T_Q_FI_pat <F2_sfclass, int_hexagon_F2_sfclass>;
+def : T_F_pat <F2_conv_sf2df, int_hexagon_F2_conv_sf2df>;
+def : T_D_pat <F2_conv_df2sf, int_hexagon_F2_conv_df2sf>;
+def : T_R_pat <F2_conv_uw2sf, int_hexagon_F2_conv_uw2sf>;
+def : T_R_pat <F2_conv_uw2df, int_hexagon_F2_conv_uw2df>;
+def : T_R_pat <F2_conv_w2sf,  int_hexagon_F2_conv_w2sf>;
+def : T_R_pat <F2_conv_w2df,  int_hexagon_F2_conv_w2df>;
+def : T_P_pat <F2_conv_ud2sf, int_hexagon_F2_conv_ud2sf>;
+def : T_P_pat <F2_conv_ud2df, int_hexagon_F2_conv_ud2df>;
+def : T_P_pat <F2_conv_d2sf,  int_hexagon_F2_conv_d2sf>;
+def : T_P_pat <F2_conv_d2df,  int_hexagon_F2_conv_d2df>;
+def : T_F_pat <F2_conv_sf2uw, int_hexagon_F2_conv_sf2uw>;
+def : T_F_pat <F2_conv_sf2w,  int_hexagon_F2_conv_sf2w>;
+def : T_F_pat <F2_conv_sf2ud, int_hexagon_F2_conv_sf2ud>;
+def : T_F_pat <F2_conv_sf2d,  int_hexagon_F2_conv_sf2d>;
+def : T_D_pat <F2_conv_df2uw, int_hexagon_F2_conv_df2uw>;
+def : T_D_pat <F2_conv_df2w,  int_hexagon_F2_conv_df2w>;
+def : T_D_pat <F2_conv_df2ud, int_hexagon_F2_conv_df2ud>;
+def : T_D_pat <F2_conv_df2d,  int_hexagon_F2_conv_df2d>;
+def : T_F_pat <F2_conv_sf2uw_chop, int_hexagon_F2_conv_sf2uw_chop>;
+def : T_F_pat <F2_conv_sf2w_chop,  int_hexagon_F2_conv_sf2w_chop>;
+def : T_F_pat <F2_conv_sf2ud_chop, int_hexagon_F2_conv_sf2ud_chop>;
+def : T_F_pat <F2_conv_sf2d_chop,  int_hexagon_F2_conv_sf2d_chop>;
+def : T_D_pat <F2_conv_df2uw_chop, int_hexagon_F2_conv_df2uw_chop>;
+def : T_D_pat <F2_conv_df2w_chop,  int_hexagon_F2_conv_df2w_chop>;
+def : T_D_pat <F2_conv_df2ud_chop, int_hexagon_F2_conv_df2ud_chop>;
+def : T_D_pat <F2_conv_df2d_chop,  int_hexagon_F2_conv_df2d_chop>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
new file mode 100644
index 000000000000..a45e1c9d7be4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -0,0 +1,803 @@
+//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+let AddedComplexity = 100 in {
+def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))),
+            (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), vsub_lo)) >,
+            Requires<[UseHVXSgl]>;
+
+def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))),
+            (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), vsub_hi)) >,
+            Requires<[UseHVXSgl]>;
+
+def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))),
+            (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), vsub_lo)) >,
+            Requires<[UseHVXDbl]>;
+
+def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))),
+            (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), vsub_hi)) >,
+            Requires<[UseHVXDbl]>;
+}
+
+def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))),
+           (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))),
+           (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v64i8  VectorRegs:$src1))),
+           (v512i1 (V6_vandvrt(v64i8  VectorRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (bitconvert (v8i64  VectorRegs:$src1))),
+           (v512i1 (V6_vandvrt(v8i64  VectorRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))),
+           (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))),
+           (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v64i8  (bitconvert (v512i1 VecPredRegs:$src1))),
+           (v64i8  (V6_vandqrt(v512i1 VecPredRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v8i64  (bitconvert (v512i1 VecPredRegs:$src1))),
+           (v8i64  (V6_vandqrt(v512i1 VecPredRegs:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))),
+           (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))),
+           (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v128i8  VectorRegs128B:$src1))),
+           (v1024i1 (V6_vandvrt_128B(v128i8  VectorRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (bitconvert (v16i64  VectorRegs128B:$src1))),
+           (v1024i1 (V6_vandvrt_128B(v16i64  VectorRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+           (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+           (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v128i8  (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+           (v128i8  (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v16i64  (bitconvert (v1024i1 VecPredRegs128B:$src1))),
+           (v16i64  (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1),
+                                              (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+
+let AddedComplexity = 140 in {
+def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)),
+           (V6_vS32b_ai IntRegs:$addr, 0,
+           (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1),
+                                       (A2_tfrsi 0x01010101))))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(v512i1 (load (i32 IntRegs:$addr))),
+           (v512i1 (V6_vandvrt
+           (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXSgl]>;
+
+def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)),
+           (V6_vS32b_ai_128B IntRegs:$addr, 0,
+           (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1),
+                                       (A2_tfrsi 0x01010101))))>,
+            Requires<[UseHVXDbl]>;
+
+def : Pat <(v1024i1 (load (i32 IntRegs:$addr))),
+           (v1024i1 (V6_vandvrt_128B
+           (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)),
+                                       (A2_tfrsi 0x01010101)))>,
+            Requires<[UseHVXDbl]>;
+}
+
+multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+           (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1),
+           (MI    VectorRegs:$src1)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_W_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1),
+           (MI    VecDblRegs:$src1)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1),
+           (MI    VecPredRegs:$src1)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1),
+           (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+           (MI    VecDblRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+           (MI    VectorRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2),
+           (MI    VecDblRegs:$src1, VectorRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+           (MI    VecDblRegs:$src1, VecDblRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+           (MI    VectorRegs:$src1, VectorRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+           (MI    VecPredRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+                                            IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B")  VecPredRegs128B:$src1,
+                                            IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+           (MI    VecPredRegs:$src1, VecPredRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+                                            VecPredRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B")  VecPredRegs128B:$src1,
+                                            VecPredRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+           (MI    VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+           (MI    VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+           (MI    VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+           (MI    VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+           (MI    VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+           (MI    VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+           (MI    VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecPredRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+           (MI    VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VecPredRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VecPredRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+
+multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+           (MI    VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecPredRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+           (MI    VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2, imm:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2, imm:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WRI_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3),
+           (MI    VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            IntRegs:$src2, imm:$src3),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            IntRegs:$src2, imm:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4),
+           (MI   VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3, imm:$src4),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VecDblRegs128B:$src2,
+                                            IntRegs:$src3, imm:$src4)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+                  IntRegs:$src4),
+           (MI    VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+                  IntRegs:$src4)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3,
+                                            IntRegs:$src4),
+           (!cast<InstHexagon>(MI#"_128B")  VectorRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3,
+                                            IntRegs:$src4)>,
+       Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+                  IntRegs:$src4),
+           (MI    VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3,
+                  IntRegs:$src4)>,
+       Requires<[UseHVXSgl]>;
+
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3,
+                                            IntRegs:$src4),
+           (!cast<InstHexagon>(MI#"_128B")  VecDblRegs128B:$src1,
+                                            VectorRegs128B:$src2,
+                                            VectorRegs128B:$src3,
+                                            IntRegs:$src4)>,
+       Requires<[UseHVXDbl]>;
+}
+
+defm : T_WR_pat <V6_vtmpyb, int_hexagon_V6_vtmpyb>;
+defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>;
+defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>;
+defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>;
+defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>;
+defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>;
+defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>;
+defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>;
+defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>;
+defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>;
+defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>;
+defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>;
+defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>;
+defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>;
+defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>;
+defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>;
+defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>;
+defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>;
+defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>;
+defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>;
+defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>;
+defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>;
+defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>;
+defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>;
+defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>;
+defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>;
+defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>;
+defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>;
+defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>;
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>;
+defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>;
+defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>;
+
+defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>;
+defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>;
+defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>;
+defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>;
+defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>;
+defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>;
+defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>;
+defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>;
+defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>;
+defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>;
+defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>;
+defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>;
+defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>;
+defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>;
+defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>;
+defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>;
+defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>;
+defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>;
+defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>;
+defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>;
+defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>;
+defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>;
+defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>;
+defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>;
+defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>;
+defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>;
+defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>;
+defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>;
+defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>;
+defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>;
+defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>;
+defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>;
+defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>;
+defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>;
+defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>;
+defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>;
+defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>;
+defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>;
+defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>;
+defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>;
+defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>;
+defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>;
+defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>;
+defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>;
+defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>;
+defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>;
+defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>;
+defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>;
+defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>;
+defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>;
+defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>;
+defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>;
+defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>;
+defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>;
+defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>;
+defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>;
+defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>;
+defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>;
+defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>;
+defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>;
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>;
+defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>;
+defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>;
+defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>;
+
+defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>;
+defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>;
+defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>;
+defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>;
+defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>;
+defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>;
+defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>;
+defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>;
+defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>;
+defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>;
+
+defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>;
+defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>;
+
+defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>;
+defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>;
+defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>;
+defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>;
+
+defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>;
+defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>;
+defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>;
+defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>;
+defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>;
+defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>;
+defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>;
+defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>;
+
+defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>;
+defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>;
+defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>;
+defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>;
+defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>;
+defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>;
+defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>;
+defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>;
+defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>;
+defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>;
+defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>;
+defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>;
+defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>;
+defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>;
+defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>;
+
+// Compare instructions
+defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>;
+defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>;
+defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>;
+defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>;
+defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>;
+defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>;
+defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>;
+defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>;
+defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>;
+defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>;
+defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>;
+defm : T_QVV_pat <V6_veqw_or, int_hexagon_V6_veqw_or>;
+defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>;
+defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>;
+defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>;
+defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>;
+defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>;
+defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>;
+defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>;
+defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>;
+defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>;
+defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>;
+defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>;
+defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>;
+defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>;
+defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>;
+defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>;
+
+defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>;
+defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>;
+defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>;
+defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>;
+defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>;
+defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>;
+defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>;
+defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>;
+defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>;
+defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>;
+defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>;
+defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>;
+defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>;
+defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>;
+defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>;
+defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>;
+defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>;
+defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>;
+defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>;
+defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>;
+defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>;
+defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>;
+defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>;
+defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>;
+defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>;
+defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>;
+defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>;
+defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>;
+defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>;
+defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>;
+defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>;
+defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>;
+defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>;
+defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>;
+defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>;
+defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>;
+defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>;
+defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>;
+defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>;
+defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>;
+defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>;
+defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>;
+defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>;
+defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>;
+defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>;
+defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>;
+
+defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>;
+defm : T_QVV_pat <V6_vaddhq, int_hexagon_V6_vaddhq>;
+defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>;
+defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>;
+defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>;
+defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>;
+defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>;
+defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>;
+defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>;
+defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>;
+defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>;
+defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>;
+
+defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>;
+defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>;
+defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>;
+defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>;
+defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>;
+defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>;
+defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>;
+defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>;
+defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>;
+defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>;
+defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>;
+defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>;
+defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>;
+defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>;
+defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>;
+defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>;
+defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>;
+defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>;
+defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>;
+defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>;
+defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>;
+defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>;
+defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>;
+
+defm : T_W_pat <V6_lo, int_hexagon_V6_lo>;
+defm : T_W_pat <V6_hi, int_hexagon_V6_hi>;
+defm : T_W_pat <V6_vassignp, int_hexagon_V6_vassignp>;
+
+defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>;
+defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>;
+defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>;
+
+defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>;
+defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>;
+defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>;
+
+// assembler mapped.
+//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>;
+// not present earlier.. need to add intrinsic
+defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>;
+defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>;
+defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>;
+defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>;
+defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>;
+defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>;
+defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>;
+defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>;
+defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>;
+
+defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>;
+defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>;
+
+defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>;
+defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>;
+defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>;
+defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>;
+
+defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>;
+defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>;
+defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>;
+defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>;
+defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>;
+defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>;
+defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>;
+defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>;
+defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>;
+defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>;
+defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>;
+defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>;
+defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>;
+defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>;
+defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>;
+defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>;
+defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>;
+
+defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>;
+defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
+defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>;
+defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>;
+defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>;
+defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>;
+
+defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>;
+defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>;
+defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>;
+defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>;
+
+defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>;
+def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>;
+def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>;
+def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>;
+def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>;
+def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>;
+def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>;
+def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>;
+def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>;
+def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>;
+def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>;
+def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>;
+def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
+
+defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
+defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
+
+def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
+
+def: Pat<(v64i16 (trunc v64i32:$Vdd)),
+         (v64i16 (V6_vpackwh_sat_128B
+                 (v32i32 (V6_hi_128B VecDblRegs128B:$Vdd)),
+                 (v32i32 (V6_lo_128B VecDblRegs128B:$Vdd))))>,
+     Requires<[UseHVXDbl]>;
+
+def: Pat<(int_hexagon_V6_vd0),      (V6_vd0)>;
+def: Pat<(int_hexagon_V6_vd0_128B), (V6_vd0_128B)>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td b/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
new file mode 100644
index 000000000000..ebedf2cbaf17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
@@ -0,0 +1,728 @@
+//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon duplex instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// SA1_combine1i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine1i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2_0Imm:$u2),
+  "$Rdd = combine(#1, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b01;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SL2_jumpr31_f: Indirect conditional jump if false.
+// SL2_jumpr31_f -> SL2_jumpr31_fnew
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_f: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0) jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b101;
+  }
+
+// SL2_deallocframe: Deallocate stack frame.
+let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def SL2_deallocframe: SUBInst <
+  (outs ),
+  (ins ),
+  "deallocframe"> {
+    let Inst{12-6} = 0b1111100;
+    let Inst{2} = 0b0;
+  }
+
+// SL2_return_f: Deallocate stack frame and return.
+// SL2_return_f -> SL2_return_fnew
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_f: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0) dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b101;
+  }
+
+// SA1_combine3i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine3i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2_0Imm:$u2),
+  "$Rdd = combine(#3, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b11;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SS2_storebi0: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS2_storebi0: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "memb($Rs + #$u4_0)=#0"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12-8} = 0b10010;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_0;
+  }
+
+// SA1_clrtnew: Clear if true.
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrtnew: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins PredRegs:$Pu),
+  "if ($Pu.new) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b100;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_loadruh_io: Load half.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadruh_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+  "$Rd = memuh($Rs + #$u3_1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u3_1;
+
+    let Inst{12-11} = 0b01;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+  }
+
+// SL2_jumpr31_tnew: Indirect conditional jump if true.
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_tnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0.new) jumpr:nt r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b110;
+  }
+
+// SA1_addi: Add.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
+def SA1_addi: SUBInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$_src_, s7_0Ext:$s7),
+  "$Rx = add($_src_, #$s7)" ,
+  [] ,
+  "$_src_ = $Rx"> {
+    bits<4> Rx;
+    bits<7> s7;
+
+    let Inst{12-11} = 0b00;
+    let Inst{3-0} = Rx;
+    let Inst{10-4} = s7;
+  }
+
+// SL1_loadrub_io: Load byte.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def SL1_loadrub_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "$Rd = memub($Rs + #$u4_0)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12} = 0b1;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_0;
+  }
+
+// SL1_loadri_io: Load word.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL1_loadri_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "$Rd = memw($Rs + #$u4_2)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12} = 0b0;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_2{5-2};
+  }
+
+// SA1_cmpeqi: Compareimmed.
+let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_cmpeqi: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u2_0Imm:$u2),
+  "p0 = cmp.eq($Rs, #$u2)"> {
+    bits<4> Rs;
+    bits<2> u2;
+
+    let Inst{12-8} = 0b11001;
+    let Inst{7-4} = Rs;
+    let Inst{1-0} = u2;
+  }
+
+// SA1_combinerz: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combinerz: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins IntRegs:$Rs),
+  "$Rdd = combine($Rs, #0)"> {
+    bits<3> Rdd;
+    bits<4> Rs;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b1;
+    let Inst{3} = 0b1;
+    let Inst{2-0} = Rdd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_return_t: Deallocate stack frame and return.
+// SL2_return_t -> SL2_return_tnew
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_t: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0) dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b100;
+  }
+
+// SS2_allocframe: Allocate stack frame.
+let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def SS2_allocframe: SUBInst <
+  (outs ),
+  (ins u5_3Imm:$u5_3),
+  "allocframe(#$u5_3)"> {
+    bits<8> u5_3;
+
+    let Inst{12-9} = 0b1110;
+    let Inst{8-4} = u5_3{7-3};
+  }
+
+// SS2_storeh_io: Store half.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
+def SS2_storeh_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
+  "memh($Rs + #$u3_1) = $Rt"> {
+    bits<4> Rs;
+    bits<4> u3_1;
+    bits<4> Rt;
+
+    let Inst{12-11} = 0b00;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+    let Inst{3-0} = Rt;
+  }
+
+// SS2_storewi0: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storewi0: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "memw($Rs + #$u4_2)=#0"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12-8} = 0b10000;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_2{5-2};
+  }
+
+// SS2_storewi1: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storewi1: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "memw($Rs + #$u4_2)=#1"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12-8} = 0b10001;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_2{5-2};
+  }
+
+// SL2_jumpr31: Indirect conditional jump if true.
+let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31: SUBInst <
+  (outs ),
+  (ins ),
+  "jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2} = 0b0;
+  }
+
+// SA1_combinezr: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combinezr: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins IntRegs:$Rs),
+  "$Rdd = combine(#0, $Rs)"> {
+    bits<3> Rdd;
+    bits<4> Rs;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b1;
+    let Inst{3} = 0b0;
+    let Inst{2-0} = Rdd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_loadrh_io: Load half.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadrh_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+  "$Rd = memh($Rs + #$u3_1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u3_1;
+
+    let Inst{12-11} = 0b00;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+  }
+
+// SA1_addrx: Add.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_addrx: SUBInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$_src_, IntRegs:$Rs),
+  "$Rx = add($_src_, $Rs)" ,
+  [] ,
+  "$_src_ = $Rx"> {
+    bits<4> Rx;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b11000;
+    let Inst{3-0} = Rx;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_setin1: Set to -1.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_setin1: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "$Rd = #{-1}"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6} = 0b0;
+    let Inst{3-0} = Rd;
+  }
+
+// SA1_sxth: Sxth.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_sxth: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = sxth($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10100;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_combine0i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine0i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2_0Imm:$u2),
+  "$Rdd = combine(#0, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b00;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SA1_combine2i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def SA1_combine2i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2_0Imm:$u2),
+  "$Rdd = combine(#2, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b10;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SA1_sxtb: Sxtb.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_sxtb: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = sxtb($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10101;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_clrf: Clear if false.
+// SA1_clrf -> SA1_clrfnew
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrf: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins PredRegs:$Pu),
+  "if (!$Pu) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b111;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_loadrb_io: Load byte.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadrb_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_0Imm:$u3_0),
+  "$Rd = memb($Rs + #$u3_0)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<3> u3_0;
+
+    let Inst{12-11} = 0b10;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_0;
+  }
+
+// SA1_tfr: Tfr.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_tfr: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = $Rs"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10000;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_loadrd_sp: Load dword.
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def SL2_loadrd_sp: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u5_3Imm:$u5_3),
+  "$Rdd = memd(r29 + #$u5_3)"> {
+    bits<3> Rdd;
+    bits<8> u5_3;
+
+    let Inst{12-8} = 0b11110;
+    let Inst{2-0} = Rdd;
+    let Inst{7-3} = u5_3{7-3};
+  }
+
+// SA1_and1: And #1.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_and1: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = and($Rs, #1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10010;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SS2_storebi1: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS2_storebi1: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "memb($Rs + #$u4_0)=#1"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12-8} = 0b10011;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_0;
+  }
+
+// SA1_inc: Inc.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_inc: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = add($Rs, #1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10001;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SS2_stored_sp: Store dword.
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def SS2_stored_sp: SUBInst <
+  (outs ),
+  (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
+  "memd(r29 + #$s6_3) = $Rtt"> {
+    bits<9> s6_3;
+    bits<3> Rtt;
+
+    let Inst{12-9} = 0b0101;
+    let Inst{8-3} = s6_3{8-3};
+    let Inst{2-0} = Rtt;
+  }
+
+// SS2_storew_sp: Store word.
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS2_storew_sp: SUBInst <
+  (outs ),
+  (ins u5_2Imm:$u5_2, IntRegs:$Rt),
+  "memw(r29 + #$u5_2) = $Rt"> {
+    bits<7> u5_2;
+    bits<4> Rt;
+
+    let Inst{12-9} = 0b0100;
+    let Inst{8-4} = u5_2{6-2};
+    let Inst{3-0} = Rt;
+  }
+
+// SL2_jumpr31_fnew: Indirect conditional jump if false.
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_fnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0.new) jumpr:nt r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b111;
+  }
+
+// SA1_clrt: Clear if true.
+// SA1_clrt -> SA1_clrtnew
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrt: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins PredRegs:$Pu),
+  "if ($Pu) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b110;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_return: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return: SUBInst <
+  (outs ),
+  (ins ),
+  "dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2} = 0b0;
+  }
+
+// SA1_dec: Dec.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_dec: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = add($Rs,#{-1})"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10011;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_seti: Set immed.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
+def SA1_seti: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u6_0Ext:$u6),
+  "$Rd = #$u6"> {
+    bits<4> Rd;
+    bits<6> u6;
+
+    let Inst{12-10} = 0b010;
+    let Inst{3-0} = Rd;
+    let Inst{9-4} = u6;
+  }
+
+// SL2_jumpr31_t: Indirect conditional jump if true.
+// SL2_jumpr31_t -> SL2_jumpr31_tnew
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def SL2_jumpr31_t: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0) jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b100;
+  }
+
+// SA1_clrfnew: Clear if false.
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_clrfnew: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins PredRegs:$Pu),
+  "if (!$Pu.new) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b101;
+    let Inst{3-0} = Rd;
+  }
+
+// SS1_storew_io: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def SS1_storew_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
+  "memw($Rs + #$u4_2) = $Rt"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+    bits<4> Rt;
+
+    let Inst{12} = 0b0;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_2{5-2};
+    let Inst{3-0} = Rt;
+  }
+
+// SA1_zxtb: Zxtb.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_zxtb: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = and($Rs, #255)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10111;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_addsp: Add.
+let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_addsp: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u6_2Imm:$u6_2),
+  "$Rd = add(r29, #$u6_2)"> {
+    bits<4> Rd;
+    bits<8> u6_2;
+
+    let Inst{12-10} = 0b011;
+    let Inst{3-0} = Rd;
+    let Inst{9-4} = u6_2{7-2};
+  }
+
+// SL2_loadri_sp: Load word.
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def SL2_loadri_sp: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u5_2Imm:$u5_2),
+  "$Rd = memw(r29 + #$u5_2)"> {
+    bits<4> Rd;
+    bits<7> u5_2;
+
+    let Inst{12-9} = 0b1110;
+    let Inst{3-0} = Rd;
+    let Inst{8-4} = u5_2{6-2};
+  }
+
+// SS1_storeb_io: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def SS1_storeb_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
+  "memb($Rs + #$u4_0) = $Rt"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+    bits<4> Rt;
+
+    let Inst{12} = 0b1;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_0;
+    let Inst{3-0} = Rt;
+  }
+
+// SL2_return_tnew: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_tnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0.new) dealloc_return:nt"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b110;
+  }
+
+// SL2_return_fnew: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def SL2_return_fnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0.new) dealloc_return:nt"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b111;
+  }
+
+// SA1_zxth: Zxth.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def SA1_zxth: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = zxth($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10110;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
new file mode 100644
index 000000000000..a5dc002642c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -0,0 +1,170 @@
+//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Hexagon MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                        MCInst &MCB, HexagonAsmPrinter &AP);
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              HexagonAsmPrinter &Printer, bool MustExtend) {
+  MCContext &MC = Printer.OutContext;
+  const MCExpr *ME;
+
+  // Populate the relocation type based on Hexagon target flags
+  // set on an operand
+  MCSymbolRefExpr::VariantKind RelocationType;
+  switch (MO.getTargetFlags()) {
+  default:
+    RelocationType = MCSymbolRefExpr::VK_None;
+    break;
+  case HexagonII::MO_PCREL:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL;
+    break;
+  case HexagonII::MO_GOT:
+    RelocationType = MCSymbolRefExpr::VK_GOT;
+    break;
+  case HexagonII::MO_LO16:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16;
+    break;
+  case HexagonII::MO_HI16:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16;
+    break;
+  case HexagonII::MO_GPREL:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL;
+    break;
+  case HexagonII::MO_GDGOT:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_GOT;
+    break;
+  case HexagonII::MO_GDPLT:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_GD_PLT;
+    break;
+  case HexagonII::MO_IE:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_IE;
+    break;
+  case HexagonII::MO_IEGOT:
+    RelocationType = MCSymbolRefExpr::VK_Hexagon_IE_GOT;
+    break;
+  case HexagonII::MO_TPREL:
+    RelocationType = MCSymbolRefExpr::VK_TPREL;
+    break;
+  }
+
+  ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC);
+
+  if (!MO.isJTI() && MO.getOffset())
+    ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
+                                 MC);
+
+  ME = HexagonMCExpr::create(ME, MC);
+  HexagonMCInstrInfo::setMustExtend(*ME, MustExtend);
+  return MCOperand::createExpr(ME);
+}
+
+// Create an MCInst from a MachineInstr
+void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
+                            MCInst &MCB, HexagonAsmPrinter &AP) {
+  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+    HexagonMCInstrInfo::setInnerLoop(MCB);
+    return;
+  }
+  if (MI->getOpcode() == Hexagon::ENDLOOP1) {
+    HexagonMCInstrInfo::setOuterLoop(MCB);
+    return;
+  }
+  MCInst *MCI = new (AP.OutContext) MCInst;
+  MCI->setOpcode(MI->getOpcode());
+  assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
+         "MCI opcode should have been set on construction");
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCO;
+    bool MustExtend = MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended;
+
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCO = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      APFloat Val = MO.getFPImm()->getValueAPF();
+      // FP immediates are used only when setting GPRs, so they may be dealt
+      // with like regular immediates from this point on.
+      auto Expr = HexagonMCExpr::create(
+          MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(),
+                                 AP.OutContext),
+          AP.OutContext);
+      HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+      MCO = MCOperand::createExpr(Expr);
+      break;
+    }
+    case MachineOperand::MO_Immediate: {
+      auto Expr = HexagonMCExpr::create(
+          MCConstantExpr::create(MO.getImm(), AP.OutContext), AP.OutContext);
+      HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+      MCO = MCOperand::createExpr(Expr);
+      break;
+    }
+    case MachineOperand::MO_MachineBasicBlock: {
+      MCExpr const *Expr = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(),
+                                                   AP.OutContext);
+      Expr = HexagonMCExpr::create(Expr, AP.OutContext);
+      HexagonMCInstrInfo::setMustExtend(*Expr, MustExtend);
+      MCO = MCOperand::createExpr(Expr);
+      break;
+    }
+    case MachineOperand::MO_GlobalAddress:
+      MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP, MustExtend);
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
+                         AP, MustExtend);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, MustExtend);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, MustExtend);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
+                         MustExtend);
+      break;
+    }
+
+    MCI->addOperand(MCO);
+  }
+  AP.HexagonProcessInstruction(*MCI, *MI);
+  HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI);
+  MCB.addOperand(MCOperand::createInst(MCI));
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..9579c8b6df16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
new file mode 100644
index 000000000000..371b52108b9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -0,0 +1,80 @@
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
+
+namespace llvm {
+
+  namespace Hexagon {
+    const unsigned int StartPacket = 0x1;
+    const unsigned int EndPacket = 0x2;
+  }
+
+
+/// Hexagon target-specific information for each MachineFunction.
+class HexagonMachineFunctionInfo : public MachineFunctionInfo {
+  // SRetReturnReg - Some subtargets require that sret lowering includes
+  // returning the value of the returned struct in a register. This field
+  // holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+  unsigned StackAlignBaseVReg;    // Aligned-stack base register (virtual)
+  unsigned StackAlignBasePhysReg; //                             (physical)
+  int VarArgsFrameIndex;
+  bool HasClobberLR;
+  bool HasEHReturn;
+  std::map<const MachineInstr*, unsigned> PacketInfo;
+  virtual void anchor();
+
+public:
+  HexagonMachineFunctionInfo() : SRetReturnReg(0), StackAlignBaseVReg(0),
+      StackAlignBasePhysReg(0), HasClobberLR(0), HasEHReturn(false) {}
+
+  HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
+      StackAlignBaseVReg(0), StackAlignBasePhysReg(0), HasClobberLR(0),
+      HasEHReturn(false) {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
+  int getVarArgsFrameIndex() { return VarArgsFrameIndex; }
+
+  void setStartPacket(MachineInstr* MI) {
+    PacketInfo[MI] |= Hexagon::StartPacket;
+  }
+  void setEndPacket(MachineInstr* MI)   {
+    PacketInfo[MI] |= Hexagon::EndPacket;
+  }
+  bool isStartPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::StartPacket));
+  }
+  bool isEndPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::EndPacket));
+  }
+  void setHasClobberLR(bool v) { HasClobberLR = v;  }
+  bool hasClobberLR() const { return HasClobberLR; }
+
+  bool hasEHReturn() const { return HasEHReturn; };
+  void setHasEHReturn(bool H = true) { HasEHReturn = H; };
+
+  void setStackAlignBaseVReg(unsigned R) { StackAlignBaseVReg = R; }
+  unsigned getStackAlignBaseVReg() const { return StackAlignBaseVReg; }
+
+  void setStackAlignBasePhysReg(unsigned R) { StackAlignBasePhysReg = R; }
+  unsigned getStackAlignBasePhysReg() const { return StackAlignBasePhysReg; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 000000000000..9ff9d93ea0c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,1031 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineScheduler.h"
+#include "HexagonSubtarget.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/IR/Function.h"
+
+#include <iomanip>
+#include <sstream>
+
+static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> SchedPredsCloser("sched-preds-closer",
+    cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
+    cl::Hidden, cl::ZeroOrMore, cl::init(1));
+
+static cl::opt<bool> TopUseShorterTie("top-use-shorter-tie",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> BotUseShorterTie("bot-use-shorter-tie",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> DisableTCTie("disable-tc-tie",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization",
+    cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+// Check if the scheduler should penalize instructions that are available to
+// early due to a zero-latency dependence.
+static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
+    cl::ZeroOrMore, cl::init(true));
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace {
+class HexagonCallMutation : public ScheduleDAGMutation {
+public:
+  void apply(ScheduleDAGInstrs *DAG) override;
+private:
+  bool shouldTFRICallBind(const HexagonInstrInfo &HII,
+                          const SUnit &Inst1, const SUnit &Inst2) const;
+};
+} // end anonymous namespace
+
+// Check if a call and subsequent A2_tfrpi instructions should maintain
+// scheduling affinity. We are looking for the TFRI to be consumed in
+// the next instruction. This should help reduce the instances of
+// double register pairs being allocated and scheduled before a call
+// when not used until after the call. This situation is exacerbated
+// by the fact that we allocate the pair from the callee saves list,
+// leading to excess spills and restores.
+bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
+      const SUnit &Inst1, const SUnit &Inst2) const {
+  if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi)
+    return false;
+
+  // TypeXTYPE are 64 bit operations.
+  if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE)
+    return true;
+  return false;
+}
+
+void HexagonCallMutation::apply(ScheduleDAGInstrs *DAG) {
+  SUnit* LastSequentialCall = nullptr;
+  unsigned VRegHoldingRet = 0;
+  unsigned RetRegister;
+  SUnit* LastUseOfRet = nullptr;
+  auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
+  auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
+  // Currently we only catch the situation when compare gets scheduled
+  // before preceding call.
+  for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) {
+    // Remember the call.
+    if (DAG->SUnits[su].getInstr()->isCall())
+      LastSequentialCall = &DAG->SUnits[su];
+    // Look for a compare that defines a predicate.
+    else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+      DAG->SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+    // Look for call and tfri* instructions.
+    else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
+             shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
+      DAG->SUnits[su].addPred(SDep(&DAG->SUnits[su-1], SDep::Barrier));
+    // Prevent redundant register copies between two calls, which are caused by
+    // both the return value and the argument for the next call being in %R0.
+    // Example:
+    //   1: <call1>
+    //   2: %VregX = COPY %R0
+    //   3: <use of %VregX>
+    //   4: %R0 = ...
+    //   5: <call2>
+    // The scheduler would often swap 3 and 4, so an additional register is
+    // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
+    // this. The same applies for %D0 and %V0/%W0, which are also handled.
+    else if (SchedRetvalOptimization) {
+      const MachineInstr *MI = DAG->SUnits[su].getInstr();
+      if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
+                           MI->readsRegister(Hexagon::V0, &TRI)))  {
+        // %vregX = COPY %R0
+        VRegHoldingRet = MI->getOperand(0).getReg();
+        RetRegister = MI->getOperand(1).getReg();
+        LastUseOfRet = nullptr;
+      } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
+        // <use of %vregX>
+        LastUseOfRet = &DAG->SUnits[su];
+      else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
+        // %R0 = ...
+        DAG->SUnits[su].addPred(SDep(LastUseOfRet, SDep::Barrier));
+    }
+  }
+}
+
+
+/// Save the last formed packet
+void VLIWResourceModel::savePacket() {
+  OldPacket = Packet;
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (statefull), it is more like
+/// another heuristic. Many corner cases are figured
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+  if (!SU || !SU->getInstr())
+    return false;
+
+  // First see if the pipeline could receive this instruction
+  // in the current cycle.
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    if (!ResourcesModel->canReserveResources(*SU->getInstr()))
+      return false;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+
+  MachineFunction &MF = *SU->getInstr()->getParent()->getParent();
+  auto &QII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+
+  // Now see if there are no other dependencies to instructions already
+  // in the packet.
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    if (Packet[i]->Succs.size() == 0)
+      continue;
+
+    // Enable .cur formation.
+    if (QII.mayBeCurLoad(*Packet[i]->getInstr()))
+      continue;
+
+    for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+         E = Packet[i]->Succs.end(); I != E; ++I) {
+      // Since we do not add pseudos to packets, might as well
+      // ignore order dependencies.
+      if (I->isCtrl())
+        continue;
+
+      if (I->getSUnit() == SU)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+  bool startNewCycle = false;
+  // Artificially reset state.
+  if (!SU) {
+    ResourcesModel->clearResources();
+    savePacket();
+    Packet.clear();
+    TotalPackets++;
+    return false;
+  }
+  // If this SU does not fit in the packet
+  // start a new one.
+  if (!isResourceAvailable(SU)) {
+    ResourcesModel->clearResources();
+    savePacket();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    ResourcesModel->reserveResources(*SU->getInstr());
+    break;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+  Packet.push_back(SU);
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    DEBUG(dbgs() << "\t[" << i << "] SU(");
+    DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+    DEBUG(Packet[i]->getInstr()->dump());
+  }
+#endif
+
+  // If packet is now full, reset the state so in the next cycle
+  // we start fresh.
+  if (Packet.size() >= SchedModel->getIssueWidth()) {
+    ResourcesModel->clearResources();
+    savePacket();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  return startNewCycle;
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+  DEBUG(dbgs()
+        << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+        << " " << BB->getName()
+        << " in_func " << BB->getParent()->getFunction()->getName()
+        << " at loop depth "  << MLI->getLoopDepth(BB)
+        << " \n");
+
+  buildDAGWithRegPressure();
+
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+
+  // Initialize the strategy before modifying the DAG.
+  SchedImpl->initialize(this);
+
+  DEBUG(unsigned maxH = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getHeight() > maxH)
+            maxH = SUnits[su].getHeight();
+        dbgs() << "Max Height " << maxH << "\n";);
+  DEBUG(unsigned maxD = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getDepth() > maxD)
+            maxD = SUnits[su].getDepth();
+        dbgs() << "Max Depth " << maxD << "\n";);
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  initQueues(TopRoots, BotRoots);
+
+  bool IsTopNode = false;
+  while (true) {
+    DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+    SUnit *SU = SchedImpl->pickNode(IsTopNode);
+    if (!SU) break;
+
+    if (!checkSchedLimit())
+      break;
+
+    scheduleMI(SU, IsTopNode);
+
+    updateQueues(SU, IsTopNode);
+
+    // Notify the scheduling strategy after updating the DAG.
+    SchedImpl->schedNode(SU, IsTopNode);
+  }
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+
+  DEBUG({
+    unsigned BBNum = begin()->getParent()->getNumber();
+    dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+    dumpSchedule();
+    dbgs() << '\n';
+  });
+}
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = static_cast<VLIWMachineScheduler*>(dag);
+  SchedModel = DAG->getSchedModel();
+
+  Top.init(DAG, SchedModel);
+  Bot.init(DAG, SchedModel);
+
+  // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+  // are disabled, then these HazardRecs will be disabled.
+  const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+  const TargetSubtargetInfo &STI = DAG->MF.getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  delete Top.HazardRec;
+  delete Bot.HazardRec;
+  Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  delete Top.ResourceModel;
+  delete Bot.ResourceModel;
+  Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+  Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel());
+
+  assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+
+  DAG->addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+  DAG->addMutation(make_unique<HexagonCallMutation>());
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (const SDep &PI : SU->Preds) {
+    unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle;
+    unsigned MinLatency = PI.getLatency();
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+  }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned MinLatency = I->getLatency();
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+  if (IssueCount + uops > SchedModel->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
+  unsigned Width = SchedModel->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  } else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+               << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
+  bool startNewCycle = false;
+
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+
+  // Update DFA model.
+  startNewCycle = ResourceModel->reserveResources(SU);
+
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+  if (startNewCycle) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+  else
+    DEBUG(dbgs() << "*** IssueCount " << IssueCount
+          << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    ResourceModel->reserveResources(nullptr);
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return nullptr;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+      const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+           << P.getUnitInc() << " ";
+  else
+    dbgs() << "     ";
+  dbgs() << "cost(" << Cost << ")\t";
+  SU->dump(DAG);
+}
+
+// Very detailed queue dump, to be used with higher verbosity levels.
+void ConvergingVLIWScheduler::readyQueueVerboseDump(
+      const RegPressureTracker &RPTracker, SchedCandidate &Candidate,
+      ReadyQueue &Q) {
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+
+  dbgs() << ">>> " << Q.getName() << "\n";
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+    std::stringstream dbgstr;
+    dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")";
+    dbgs() << dbgstr.str();
+    SchedulingCost(Q, *I, Candidate, RPDelta, true);
+    dbgs() << "\t";
+    (*I)->getInstr()->dump();
+  }
+  dbgs() << "\n";
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = nullptr;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return nullptr;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+  return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+  SUnit *OnlyAvailableSucc = nullptr;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    SUnit &Succ = *I->getSUnit();
+    if (!Succ.isScheduled) {
+      // We found an available, but not scheduled, successor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+        return nullptr;
+      OnlyAvailableSucc = &Succ;
+    }
+  }
+  return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 75;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+                                            SchedCandidate &Candidate,
+                                            RegPressureDelta &Delta,
+                                            bool verbose) {
+  // Initial trivial priority.
+  int ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (!SU || SU->isScheduled)
+    return ResCount;
+
+  MachineInstr &Instr = *SU->getInstr();
+
+  DEBUG(if (verbose) dbgs() << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
+  // Forced priority is high.
+  if (SU->isScheduleHigh) {
+    ResCount += PriorityOne;
+    DEBUG(dbgs() << "H|");
+  }
+
+  // Critical path first.
+  if (Q.getID() == TopQID) {
+    ResCount += (SU->getHeight() * ScaleTwo);
+
+    DEBUG(if (verbose) {
+      std::stringstream dbgstr;
+      dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
+      dbgs() << dbgstr.str();
+    });
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Top.ResourceModel->isResourceAvailable(SU)) {
+      ResCount <<= FactorOne;
+      ResCount += PriorityThree;
+      DEBUG(if (verbose) dbgs() << "A|");
+    } else
+      DEBUG(if (verbose) dbgs() << " |");
+  } else {
+    ResCount += (SU->getDepth() * ScaleTwo);
+
+    DEBUG(if (verbose) {
+      std::stringstream dbgstr;
+      dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
+      dbgs() << dbgstr.str();
+    });
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Bot.ResourceModel->isResourceAvailable(SU)) {
+      ResCount <<= FactorOne;
+      ResCount += PriorityThree;
+      DEBUG(if (verbose) dbgs() << "A|");
+    } else
+      DEBUG(if (verbose) dbgs() << " |");
+  }
+
+  unsigned NumNodesBlocking = 0;
+  if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node.
+    // Count the number of nodes that
+    // this node is the sole unscheduled node for.
+    for (const SDep &SI : SU->Succs)
+      if (getSingleUnscheduledPred(SI.getSUnit()) == SU)
+        ++NumNodesBlocking;
+  } else {
+    // How many unscheduled predecessors block this node?
+    for (const SDep &PI : SU->Preds)
+      if (getSingleUnscheduledSucc(PI.getSUnit()) == SU)
+        ++NumNodesBlocking;
+  }
+  ResCount += (NumNodesBlocking * ScaleTwo);
+
+  DEBUG(if (verbose) {
+    std::stringstream dbgstr;
+    dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
+    dbgs() << dbgstr.str();
+  });
+
+  // Factor in reg pressure as a heuristic.
+  if (!IgnoreBBRegPressure) {
+    // Decrease priority by the amount that register pressure exceeds the limit.
+    ResCount -= (Delta.Excess.getUnitInc()*PriorityOne);
+    // Decrease priority if register pressure exceeds the limit.
+    ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne);
+    // Decrease priority slightly if register pressure would increase over the
+    // current maximum.
+    ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo);
+    DEBUG(if (verbose) {
+        dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+               << Delta.CriticalMax.getUnitInc() <<"/"
+               << Delta.CurrentMax.getUnitInc() << ")|";
+    });
+  }
+
+  // Give a little extra priority to a .cur instruction if there is a resource
+  // available for it.
+  auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+  auto &QII = *QST.getInstrInfo();
+  if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) {
+    if (Q.getID() == TopQID && Top.ResourceModel->isResourceAvailable(SU)) {
+      ResCount += PriorityTwo;
+      DEBUG(if (verbose) dbgs() << "C|");
+    } else if (Q.getID() == BotQID &&
+               Bot.ResourceModel->isResourceAvailable(SU)) {
+      ResCount += PriorityTwo;
+      DEBUG(if (verbose) dbgs() << "C|");
+    }
+  }
+
+  // Give preference to a zero latency instruction if the dependent
+  // instruction is in the current packet.
+  if (Q.getID() == TopQID) {
+    for (const SDep &PI : SU->Preds) {
+      if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
+          PI.getLatency() == 0 &&
+          Top.ResourceModel->isInPacket(PI.getSUnit())) {
+        ResCount += PriorityThree;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  } else {
+    for (const SDep &SI : SU->Succs) {
+      if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
+          SI.getLatency() == 0 &&
+          Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+        ResCount += PriorityThree;
+        DEBUG(if (verbose) dbgs() << "Z|");
+      }
+    }
+  }
+
+  // Give less preference to an instruction that will cause a stall with
+  // an instruction in the previous packet.
+  if (QII.isV60VectorInstruction(Instr)) {
+    // Check for stalls in the previous packet.
+    if (Q.getID() == TopQID) {
+      for (auto J : Top.ResourceModel->OldPacket)
+        if (QII.producesStall(*J->getInstr(), Instr))
+          ResCount -= PriorityOne;
+    } else {
+      for (auto J : Bot.ResourceModel->OldPacket)
+        if (QII.producesStall(Instr, *J->getInstr()))
+          ResCount -= PriorityOne;
+    }
+  }
+
+  // If the instruction has a non-zero latency dependence with an instruction in
+  // the current packet, then it should not be scheduled yet. The case occurs
+  // when the dependent instruction is scheduled in a new packet, so the
+  // scheduler updates the current cycle and pending instructions become
+  // available.
+  if (CheckEarlyAvail) {
+    if (Q.getID() == TopQID) {
+      for (const auto &PI : SU->Preds) {
+        if (PI.getLatency() > 0 &&
+            Top.ResourceModel->isInPacket(PI.getSUnit())) {
+          ResCount -= PriorityOne;
+          DEBUG(if (verbose) dbgs() << "D|");
+        }
+      }
+    } else {
+      for (const auto &SI : SU->Succs) {
+        if (SI.getLatency() > 0 &&
+            Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+          ResCount -= PriorityOne;
+          DEBUG(if (verbose) dbgs() << "D|");
+        }
+      }
+    }
+  }
+
+  DEBUG(if (verbose) {
+    std::stringstream dbgstr;
+    dbgstr << "Total " << std::setw(4) << ResCount << ")";
+    dbgs() << dbgstr.str();
+  });
+
+  return ResCount;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(if (SchedDebugVerboseLevel > 1)
+        readyQueueVerboseDump(RPTracker, Candidate, Q);
+        else Q.dump(););
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+    // Initialize the candidate if needed.
+    if (!Candidate.SU) {
+      DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = NodeOrder;
+      continue;
+    }
+
+    // Best cost.
+    if (CurrentCost > Candidate.SCost) {
+      DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = BestCost;
+      continue;
+    }
+
+    // Tie breaker using Timing Class.
+    if (!DisableTCTie) {
+      auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>();
+      auto &QII = *QST.getInstrInfo();
+
+      const MachineInstr *MI = (*I)->getInstr();
+      const MachineInstr *CandI = Candidate.SU->getInstr();
+      const InstrItineraryData *InstrItins = QST.getInstrItineraryData();
+
+      unsigned InstrLatency = QII.getInstrTimingClassLatency(InstrItins, *MI);
+      unsigned CandLatency = QII.getInstrTimingClassLatency(InstrItins, *CandI);
+      DEBUG(dbgs() << "TC Tie Breaker Cand: "
+                   << CandLatency << " Instr:" << InstrLatency << "\n"
+                   << *MI << *CandI << "\n");
+      if (Q.getID() == TopQID && CurrentCost == Candidate.SCost) {
+        if (InstrLatency < CandLatency && TopUseShorterTie) {
+          Candidate.SU = *I;
+          Candidate.RPDelta = RPDelta;
+          Candidate.SCost = CurrentCost;
+          FoundCandidate = BestCost;
+          DEBUG(dbgs() << "Used top shorter tie breaker\n");
+          continue;
+        } else if (InstrLatency > CandLatency && !TopUseShorterTie) {
+          Candidate.SU = *I;
+          Candidate.RPDelta = RPDelta;
+          Candidate.SCost = CurrentCost;
+          FoundCandidate = BestCost;
+          DEBUG(dbgs() << "Used top longer tie breaker\n");
+          continue;
+        }
+      } else if (Q.getID() == BotQID && CurrentCost == Candidate.SCost) {
+        if (InstrLatency < CandLatency && BotUseShorterTie) {
+          Candidate.SU = *I;
+          Candidate.RPDelta = RPDelta;
+          Candidate.SCost = CurrentCost;
+          FoundCandidate = BestCost;
+          DEBUG(dbgs() << "Used Bot shorter tie breaker\n");
+          continue;
+        } else if (InstrLatency > CandLatency && !BotUseShorterTie) {
+          Candidate.SU = *I;
+          Candidate.RPDelta = RPDelta;
+          Candidate.SCost = CurrentCost;
+          FoundCandidate = BestCost;
+          DEBUG(dbgs() << "Used Bot longer tie breaker\n");
+          continue;
+        }
+      }
+    }
+
+    if (CurrentCost == Candidate.SCost) {
+      if ((Q.getID() == TopQID &&
+           (*I)->Succs.size() > Candidate.SU->Succs.size()) ||
+          (Q.getID() == BotQID &&
+           (*I)->Preds.size() < Candidate.SU->Preds.size())) {
+        DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
+        Candidate.SU = *I;
+        Candidate.RPDelta = RPDelta;
+        Candidate.SCost = CurrentCost;
+        FoundCandidate = BestCost;
+        continue;
+      }
+    }
+
+    // Fall through to original instruction order.
+    // Only consider node order if Candidate was chosen from this Q.
+    if (FoundCandidate == NoCand)
+      continue;
+  }
+  return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    DEBUG(dbgs() << "Picked only Bottom\n");
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    DEBUG(dbgs() << "Picked only Top\n");
+    IsTopNode = true;
+    return SU;
+  }
+  SchedCandidate BotCand;
+  // Prefer bottom scheduling when heuristics are silent.
+  CandResult BotResult = pickNodeFromQueue(Bot.Available,
+                                           DAG->getBotRPTracker(), BotCand);
+  assert(BotResult != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    DEBUG(dbgs() << "Prefered Bottom Node\n");
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  // Check if the top Q has a better candidate.
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    DEBUG(dbgs() << "Prefered Top Node\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n");
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    DEBUG(dbgs() << "Prefered Top Node SingleMax\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  if (TopCand.SCost > BotCand.SCost) {
+    DEBUG(dbgs() << "Prefered Top Node Cost\n");
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  DEBUG(dbgs() << "Prefered Bottom in Node order\n");
+  IsTopNode = false;
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+  SUnit *SU;
+  if (llvm::ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
+  } else if (llvm::ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  } else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update it's state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  } else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 000000000000..dc10028c0424
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,254 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler.      ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
+
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per say, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const TargetSchedModel *SchedModel;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI schedulre at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+  /// Save the last formed packet.
+  std::vector<SUnit*> OldPacket;
+
+public:
+  VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM)
+      : SchedModel(SM), TotalPackets(0) {
+  ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(SchedModel->getIssueWidth());
+    Packet.clear();
+    OldPacket.resize(SchedModel->getIssueWidth());
+    OldPacket.clear();
+    ResourcesModel->clearResources();
+  }
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  void savePacket();
+  unsigned getTotalPackets() const { return TotalPackets; }
+
+  bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); }
+};
+
+/// Extend the standard ScheduleDAGMI to provide more context and override the
+/// top-level schedule() driver.
+class VLIWMachineScheduler : public ScheduleDAGMILive {
+public:
+  VLIWMachineScheduler(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S)
+      : ScheduleDAGMILive(C, std::move(S)) {}
+
+  /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+  /// time to do some work.
+  void schedule() override;
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingVLIWScheduler heuristics, required
+  ///  for the lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Best scheduling cost.
+    int SCost;
+
+    SchedCandidate(): SU(nullptr), SCost(0) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost};
+
+  /// Each Scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction at has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct VLIWSchedBoundary {
+    VLIWMachineScheduler *DAG;
+    const TargetSchedModel *SchedModel;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+    VLIWResourceModel *ResourceModel;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    VLIWSchedBoundary(unsigned ID, const Twine &Name):
+      DAG(nullptr), SchedModel(nullptr), Available(ID, Name+".A"),
+      Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(nullptr), ResourceModel(nullptr),
+      CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~VLIWSchedBoundary() {
+      delete ResourceModel;
+      delete HazardRec;
+    }
+
+    void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
+      DAG = dag;
+      SchedModel = smodel;
+      IssueCount = 0;
+    }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingVLIWScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+  VLIWMachineScheduler *DAG;
+  const TargetSchedModel *SchedModel;
+
+  // State of the top and bottom scheduled instruction boundaries.
+  VLIWSchedBoundary Top;
+  VLIWSchedBoundary Bot;
+
+public:
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingVLIWScheduler()
+    : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
+      Bot(BotQID, "BotQ") {}
+
+  void initialize(ScheduleDAGMI *dag) override;
+
+  SUnit *pickNode(bool &IsTopNode) override;
+
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+
+  void releaseTopNode(SUnit *SU) override;
+
+  void releaseBottomNode(SUnit *SU) override;
+
+  unsigned ReportPackets() {
+    return Top.ResourceModel->getTotalPackets() +
+           Bot.ResourceModel->getTotalPackets();
+  }
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+  int SchedulingCost(ReadyQueue &Q,
+                     SUnit *SU, SchedCandidate &Candidate,
+                     RegPressureDelta &Delta, bool verbose);
+
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      int Cost, PressureChange P = PressureChange());
+
+  void readyQueueVerboseDump(const RegPressureTracker &RPTracker,
+                             SchedCandidate &Candidate, ReadyQueue &Q);
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
new file mode 100644
index 000000000000..72d8011277e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -0,0 +1,701 @@
+//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements NewValueJump pass in Hexagon.
+// Ideally, we should merge this as a Peephole pass prior to register
+// allocation, but because we have a spill in between the feeder and new value
+// jump instructions, we are forced to write after register allocation.
+// Having said that, we should re-attempt to pull this earlier at some point
+// in future.
+
+// The basic approach looks for sequence of predicated jump, compare instruciton
+// that genereates the predicate and, the feeder to the predicate. Once it finds
+// all, it collapses compare and jump instruction into a new valu jump
+// intstructions.
+//
+//
+//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-nvj"
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+  "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable New Value Jumps"));
+
+namespace llvm {
+  FunctionPass *createHexagonNewValueJump();
+  void initializeHexagonNewValueJumpPass(PassRegistry&);
+}
+
+
+namespace {
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) {
+      initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    StringRef getPassName() const override { return "Hexagon NewValueJump"; }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    /// \brief A handle to the branch probability pass.
+    const MachineBranchProbabilityInfo *MBPI;
+
+    bool isNewValueJumpCandidate(const MachineInstr &MI) const;
+  };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonNewValueJump, "hexagon-nvj",
+                      "Hexagon NewValueJump", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(HexagonNewValueJump, "hexagon-nvj",
+                    "Hexagon NewValueJump", false, false)
+
+
+// We have identified this II could be feeder to NVJ,
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // Predicated instruction can not be feeder to NVJ.
+  if (QII->isPredicated(*II))
+    return false;
+
+  // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we can check to see if a given
+  // register cmpReg1 or cmpReg2 is a sub register of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1) logic
+  // before the callsite of this function
+  // But we can not as it comes in the following fashion.
+  //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+  //    %P0<def> = CMPEQri %R0<kill>, 0
+  // Hence, we need to check if it's a KILL instruction.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+
+  // Make sure there there is no 'def' or 'use' of any of the uses of
+  // feeder insn between it's definition, this MI and jump, jmpInst
+  // skipping compare, cmpInst.
+  // Here's the example.
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    r4=memub(r3+r21<<#0)
+  //    if (p0.new) jump:t .LBB29_45
+  // Without this check, it will be converted into
+  //    r4=memub(r3+r21<<#0)
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    if (p0.new) jump:t .LBB29_45
+  // and result WAR hazards if converted to New Value Jump.
+
+  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+    if (II->getOperand(i).isReg() &&
+        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+      MachineBasicBlock::iterator localII = II;
+      ++localII;
+      unsigned Reg = II->getOperand(i).getReg();
+      for (MachineBasicBlock::iterator localBegin = localII;
+                        localBegin != end; ++localBegin) {
+        if (localBegin == skip ) continue;
+        // Check for Subregisters too.
+        if (localBegin->modifiesRegister(Reg, TRI) ||
+            localBegin->readsRegister(Reg, TRI))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// These are the common checks that need to performed
+// to determine if
+// 1. compare instruction can be moved before jump.
+// 2. feeder to the compare instruction can be moved before jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                          MachineBasicBlock::iterator MII) {
+
+  // If store in path, bail out.
+  if (MII->getDesc().mayStore())
+    return false;
+
+  // if call in path, bail out.
+  if (MII->isCall())
+    return false;
+
+  // if NVJ is running prior to RA, do the following checks.
+  if (!afterRA) {
+    // The following Target Opcode instructions are spurious
+    // to new value jump. If they are in the path, bail out.
+    // KILL sets kill flag on the opcode. It also sets up a
+    // single register, out of pair.
+    //    %D0<def> = S2_lsr_r_p %D0<kill>, %R2<kill>
+    //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+    //    %P0<def> = C2_cmpeqi %R0<kill>, 0
+    // PHI can be anything after RA.
+    // COPY can remateriaze things in between feeder, compare and nvj.
+    if (MII->getOpcode() == TargetOpcode::KILL ||
+        MII->getOpcode() == TargetOpcode::PHI  ||
+        MII->getOpcode() == TargetOpcode::COPY)
+      return false;
+
+    // The following pseudo Hexagon instructions sets "use" and "def"
+    // of registers by individual passes in the backend. At this time,
+    // we don't know the scope of usage and definitions of these
+    // instructions.
+    if (MII->getOpcode() == Hexagon::LDriw_pred ||
+        MII->getOpcode() == Hexagon::STriw_pred)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr &MI = *II;
+
+  // If the second operand of the compare is an imm, make sure it's in the
+  // range specified by the arch.
+  if (!secondReg) {
+    int64_t v = MI.getOperand(2).getImm();
+    bool Valid = false;
+
+    switch (MI.getOpcode()) {
+      case Hexagon::C2_cmpeqi:
+      case Hexagon::C2_cmpgti:
+        Valid = (isUInt<5>(v) || v == -1);
+        break;
+      case Hexagon::C2_cmpgtui:
+        Valid = isUInt<5>(v);
+        break;
+      case Hexagon::S2_tstbit_i:
+      case Hexagon::S4_ntstbit_i:
+        Valid = (v == 0);
+        break;
+    }
+
+    if (!Valid)
+      return false;
+  }
+
+  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
+  cmpReg1 = MI.getOperand(1).getReg();
+
+  if (secondReg) {
+    cmpOp2 = MI.getOperand(2).getReg();
+
+    // If the same register appears as both operands, we cannot generate a new
+    // value compare. Only one operand may use the .new suffix.
+    if (cmpReg1 == cmpOp2)
+      return false;
+
+    // Make sure that that second register is not from COPY
+    // At machine code level, we don't need this, but if we decide
+    // to move new value jump prior to RA, we would be needing this.
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MRI.getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and satisfy the following conditions.
+  ++II ;
+  for (MachineBasicBlock::iterator localII = II; localII != end;
+       ++localII) {
+    if (localII->isDebugValue())
+      continue;
+
+    // Check 1.
+    // If "common" checks fail, bail out.
+    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+      return false;
+
+    // Check 2.
+    // If there is a def or use of predicate (result of compare), bail out.
+    if (localII->modifiesRegister(pReg, TRI) ||
+        localII->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3.
+    // If there is a def of any of the use of the compare (operands of compare),
+    // bail out.
+    // Eg.
+    //    p0 = cmp.eq(r2, r0)
+    //    r2 = r4
+    //    if (p0.new) jump:t .LBB28_3
+    if (localII->modifiesRegister(cmpReg1, TRI) ||
+        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+      return false;
+  }
+  return true;
+}
+
+
+// Given a compare operator, return a matching New Value Jump compare operator.
+// Make sure that MI here is included in isNewValueJumpCandidate.
+static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg,
+                                      bool secondRegNewified,
+                                      MachineBasicBlock *jmpTarget,
+                                      const MachineBranchProbabilityInfo
+                                      *MBPI) {
+  bool taken = false;
+  MachineBasicBlock *Src = MI->getParent();
+  const BranchProbability Prediction =
+    MBPI->getEdgeProbability(Src, jmpTarget);
+
+  if (Prediction >= BranchProbability(1,2))
+    taken = true;
+
+  switch (MI->getOpcode()) {
+    case Hexagon::C2_cmpeq:
+      return taken ? Hexagon::J4_cmpeq_t_jumpnv_t
+                   : Hexagon::J4_cmpeq_t_jumpnv_nt;
+
+    case Hexagon::C2_cmpeqi: {
+      if (reg >= 0)
+        return taken ? Hexagon::J4_cmpeqi_t_jumpnv_t
+                     : Hexagon::J4_cmpeqi_t_jumpnv_nt;
+      else
+        return taken ? Hexagon::J4_cmpeqn1_t_jumpnv_t
+                     : Hexagon::J4_cmpeqn1_t_jumpnv_nt;
+    }
+
+    case Hexagon::C2_cmpgt: {
+      if (secondRegNewified)
+        return taken ? Hexagon::J4_cmplt_t_jumpnv_t
+                     : Hexagon::J4_cmplt_t_jumpnv_nt;
+      else
+        return taken ? Hexagon::J4_cmpgt_t_jumpnv_t
+                     : Hexagon::J4_cmpgt_t_jumpnv_nt;
+    }
+
+    case Hexagon::C2_cmpgti: {
+      if (reg >= 0)
+        return taken ? Hexagon::J4_cmpgti_t_jumpnv_t
+                     : Hexagon::J4_cmpgti_t_jumpnv_nt;
+      else
+        return taken ? Hexagon::J4_cmpgtn1_t_jumpnv_t
+                     : Hexagon::J4_cmpgtn1_t_jumpnv_nt;
+    }
+
+    case Hexagon::C2_cmpgtu: {
+      if (secondRegNewified)
+        return taken ? Hexagon::J4_cmpltu_t_jumpnv_t
+                     : Hexagon::J4_cmpltu_t_jumpnv_nt;
+      else
+        return taken ? Hexagon::J4_cmpgtu_t_jumpnv_t
+                     : Hexagon::J4_cmpgtu_t_jumpnv_nt;
+    }
+
+    case Hexagon::C2_cmpgtui:
+      return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t
+                   : Hexagon::J4_cmpgtui_t_jumpnv_nt;
+
+    case Hexagon::C4_cmpneq:
+      return taken ? Hexagon::J4_cmpeq_f_jumpnv_t
+                   : Hexagon::J4_cmpeq_f_jumpnv_nt;
+
+    case Hexagon::C4_cmplte:
+      if (secondRegNewified)
+        return taken ? Hexagon::J4_cmplt_f_jumpnv_t
+                     : Hexagon::J4_cmplt_f_jumpnv_nt;
+      return taken ? Hexagon::J4_cmpgt_f_jumpnv_t
+                   : Hexagon::J4_cmpgt_f_jumpnv_nt;
+
+    case Hexagon::C4_cmplteu:
+      if (secondRegNewified)
+        return taken ? Hexagon::J4_cmpltu_f_jumpnv_t
+                     : Hexagon::J4_cmpltu_f_jumpnv_nt;
+      return taken ? Hexagon::J4_cmpgtu_f_jumpnv_t
+                   : Hexagon::J4_cmpgtu_f_jumpnv_nt;
+
+    default:
+       llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // return *some value* to avoid compiler warning
+  return 0;
+}
+
+bool HexagonNewValueJump::isNewValueJumpCandidate(
+    const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtu:
+  case Hexagon::C2_cmpgtui:
+  case Hexagon::C4_cmpneq:
+  case Hexagon::C4_cmplte:
+  case Hexagon::C4_cmplteu:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+               << "********** Function: "
+               << MF.getName() << "\n");
+
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  // If we move NewValueJump before register allocation we'll need live variable
+  // analysis here too.
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  QRI = static_cast<const HexagonRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  if (DisableNewValueJumps) {
+    return false;
+  }
+
+  int nvjCount = DbgNVJCount;
+  int nvjGenerated = 0;
+
+  // Loop through all the bb's of the function
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+        MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock *MBB = &*MBBb;
+
+    DEBUG(dbgs() << "** dumping bb ** "
+                 << MBB->getNumber() << "\n");
+    DEBUG(MBB->dump());
+    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+    bool foundJump    = false;
+    bool foundCompare = false;
+    bool invertPredicate = false;
+    unsigned predReg = 0; // predicate reg of the jump.
+    unsigned cmpReg1 = 0;
+    int cmpOp2 = 0;
+    bool MO1IsKill = false;
+    bool MO2IsKill = false;
+    MachineBasicBlock::iterator jmpPos;
+    MachineBasicBlock::iterator cmpPos;
+    MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
+    MachineBasicBlock *jmpTarget = nullptr;
+    bool afterRA = false;
+    bool isSecondOpReg = false;
+    bool isSecondOpNewified = false;
+    // Traverse the basic block - bottom up
+    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+             MII != E;) {
+      MachineInstr &MI = *--MII;
+      if (MI.isDebugValue()) {
+        continue;
+      }
+
+      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+        break;
+
+      DEBUG(dbgs() << "Instr: "; MI.dump(); dbgs() << "\n");
+
+      if (!foundJump && (MI.getOpcode() == Hexagon::J2_jumpt ||
+                         MI.getOpcode() == Hexagon::J2_jumptpt ||
+                         MI.getOpcode() == Hexagon::J2_jumpf ||
+                         MI.getOpcode() == Hexagon::J2_jumpfpt ||
+                         MI.getOpcode() == Hexagon::J2_jumptnewpt ||
+                         MI.getOpcode() == Hexagon::J2_jumptnew ||
+                         MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+                         MI.getOpcode() == Hexagon::J2_jumpfnew)) {
+        // This is where you would insert your compare and
+        // instr that feeds compare
+        jmpPos = MII;
+        jmpInstr = &MI;
+        predReg = MI.getOperand(0).getReg();
+        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If ifconverter had not messed up with the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+        // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // This predicate register is live out out of BB
+        // this would only work if we can actually use Live
+        // variable analysis on phy regs - but LLVM does not
+        // provide LV analysis on phys regs.
+        //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - which will always
+        // be 2. Check if the predicate register is live-in in those
+        // successor. If yes, we can not delete the predicate -
+        // I am doing this only because LLVM does not provide LiveOut
+        // at the BB level.
+        bool predLive = false;
+        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+                            SIE = MBB->succ_end(); SI != SIE; ++SI) {
+          MachineBasicBlock* succMBB = *SI;
+         if (succMBB->isLiveIn(predReg)) {
+            predLive = true;
+          }
+        }
+        if (predLive)
+          break;
+
+        if (!MI.getOperand(1).isMBB())
+          continue;
+        jmpTarget = MI.getOperand(1).getMBB();
+        foundJump = true;
+        if (MI.getOpcode() == Hexagon::J2_jumpf ||
+            MI.getOpcode() == Hexagon::J2_jumpfnewpt ||
+            MI.getOpcode() == Hexagon::J2_jumpfnew) {
+          invertPredicate = true;
+        }
+        continue;
+      }
+
+      // No new value jump if there is a barrier. A barrier has to be in its
+      // own packet. A barrier has zero operands. We conservatively bail out
+      // here if we see any instruction with zero operands.
+      if (foundJump && MI.getNumOperands() == 0)
+        break;
+
+      if (foundJump && !foundCompare && MI.getOperand(0).isReg() &&
+          MI.getOperand(0).getReg() == predReg) {
+
+        // Not all compares can be new value compare. Arch Spec: 7.6.1.1
+        if (isNewValueJumpCandidate(MI)) {
+
+          assert(
+              (MI.getDesc().isCompare()) &&
+              "Only compare instruction can be collapsed into New Value Jump");
+          isSecondOpReg = MI.getOperand(2).isReg();
+
+          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+                                        afterRA, jmpPos, MF))
+            break;
+
+          cmpInstr = &MI;
+          cmpPos = MII;
+          foundCompare = true;
+
+          // We need cmpReg1 and cmpOp2(imm or reg) while building
+          // new value jump instruction.
+          cmpReg1 = MI.getOperand(1).getReg();
+          if (MI.getOperand(1).isKill())
+            MO1IsKill = true;
+
+          if (isSecondOpReg) {
+            cmpOp2 = MI.getOperand(2).getReg();
+            if (MI.getOperand(2).isKill())
+              MO2IsKill = true;
+          } else
+            cmpOp2 = MI.getOperand(2).getImm();
+          continue;
+        }
+      }
+
+      if (foundCompare && foundJump) {
+
+        // If "common" checks fail, bail out on this BB.
+        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+          break;
+
+        bool foundFeeder = false;
+        MachineBasicBlock::iterator feederPos = MII;
+        if (MI.getOperand(0).isReg() && MI.getOperand(0).isDef() &&
+            (MI.getOperand(0).getReg() == cmpReg1 ||
+             (isSecondOpReg &&
+              MI.getOperand(0).getReg() == (unsigned)cmpOp2))) {
+
+          unsigned feederReg = MI.getOperand(0).getReg();
+
+          // First try to see if we can get the feeder from the first operand
+          // of the compare. If we can not, and if secondOpReg is true
+          // (second operand of the compare is also register), try that one.
+          // TODO: Try to come up with some heuristic to figure out which
+          // feeder would benefit.
+
+          if (feederReg == cmpReg1) {
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+              if (!isSecondOpReg)
+                break;
+              else
+                continue;
+            } else
+              foundFeeder = true;
+          }
+
+          if (!foundFeeder &&
+               isSecondOpReg &&
+               feederReg == (unsigned) cmpOp2)
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+              break;
+
+          if (isSecondOpReg) {
+            // In case of CMPLT, or CMPLTU, or EQ with the second register
+            // to newify, swap the operands.
+            unsigned COp = cmpInstr->getOpcode();
+            if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) &&
+                (feederReg == (unsigned) cmpOp2)) {
+              unsigned tmp = cmpReg1;
+              bool tmpIsKill = MO1IsKill;
+              cmpReg1 = cmpOp2;
+              MO1IsKill = MO2IsKill;
+              cmpOp2 = tmp;
+              MO2IsKill = tmpIsKill;
+            }
+
+            // Now we have swapped the operands, all we need to check is,
+            // if the second operand (after swap) is the feeder.
+            // And if it is, make a note.
+            if (feederReg == (unsigned)cmpOp2)
+              isSecondOpNewified = true;
+          }
+
+          // Now that we are moving feeder close the jump,
+          // make sure we are respecting the kill values of
+          // the operands of the feeder.
+
+          bool updatedIsKill = false;
+          for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+            MachineOperand &MO = MI.getOperand(i);
+            if (MO.isReg() && MO.isUse()) {
+              unsigned feederReg = MO.getReg();
+              for (MachineBasicBlock::iterator localII = feederPos,
+                   end = jmpPos; localII != end; localII++) {
+                MachineInstr &localMI = *localII;
+                for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
+                  MachineOperand &localMO = localMI.getOperand(j);
+                  if (localMO.isReg() && localMO.isUse() &&
+                      localMO.isKill() && feederReg == localMO.getReg()) {
+                    // We found that there is kill of a use register
+                    // Set up a kill flag on the register
+                    localMO.setIsKill(false);
+                    MO.setIsKill();
+                    updatedIsKill = true;
+                    break;
+                  }
+                }
+                if (updatedIsKill) break;
+              }
+            }
+            if (updatedIsKill) break;
+          }
+
+          MBB->splice(jmpPos, MI.getParent(), MI);
+          MBB->splice(jmpPos, MI.getParent(), cmpInstr);
+          DebugLoc dl = MI.getDebugLoc();
+          MachineInstr *NewMI;
+
+          assert((isNewValueJumpCandidate(*cmpInstr)) &&
+                 "This compare is not a New Value Jump candidate.");
+          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+                                               isSecondOpNewified,
+                                               jmpTarget, MBPI);
+          if (invertPredicate)
+            opc = QII->getInvertedPredicatedOpcode(opc);
+
+          if (isSecondOpReg)
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addReg(cmpOp2, getKillRegState(MO2IsKill))
+                                    .addMBB(jmpTarget);
+
+          else
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addImm(cmpOp2)
+                                    .addMBB(jmpTarget);
+
+          assert(NewMI && "New Value Jump Instruction Not created!");
+          (void)NewMI;
+          if (cmpInstr->getOperand(0).isReg() &&
+              cmpInstr->getOperand(0).isKill())
+            cmpInstr->getOperand(0).setIsKill(false);
+          if (cmpInstr->getOperand(1).isReg() &&
+              cmpInstr->getOperand(1).isKill())
+            cmpInstr->getOperand(1).setIsKill(false);
+          cmpInstr->eraseFromParent();
+          jmpInstr->eraseFromParent();
+          ++nvjGenerated;
+          ++NumNVJGenerated;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+  return new HexagonNewValueJump();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
new file mode 100644
index 000000000000..983310571563
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -0,0 +1,332 @@
+//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illnois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; }
+def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; }
+def s8_0Imm64Operand : AsmOperandClass { let Name = "s8_0Imm64"; }
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; }
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
+def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
+def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
+def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; }
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; }
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
+def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
+def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
+def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; }
+def u9_0ImmOperand : AsmOperandClass { let Name = "u9_0Imm"; }
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; }
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; }
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
+def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; }
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; }
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; }
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; }
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; }
+def n8_0ImmOperand : AsmOperandClass { let Name = "n8_0Imm"; }
+// Immediate operands.
+
+let OperandType = "OPERAND_IMMEDIATE",
+    DecoderMethod = "unsignedImmDecoder" in {
+  def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand;
+                                let DecoderMethod = "s32_0ImmDecoder"; }
+  def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
+  def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand;
+                               let DecoderMethod = "s8_0ImmDecoder"; }
+  def s8_0Imm64 : Operand<i64>  { let ParserMatchClass = s8_0Imm64Operand;
+                                  let DecoderMethod = "s8_0ImmDecoder"; }
+  def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand;
+                             let DecoderMethod = "s6_0ImmDecoder"; }
+  def s6_3Imm : Operand<i32>;
+  def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
+                               let DecoderMethod = "s4_0ImmDecoder"; }
+  def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
+                               let DecoderMethod = "s4_1ImmDecoder"; }
+  def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
+                               let DecoderMethod = "s4_2ImmDecoder"; }
+  def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
+                               let DecoderMethod = "s4_3ImmDecoder"; }
+  def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
+  def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; }
+  def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
+  def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
+  def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
+  def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
+  def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
+  def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
+  def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; }
+  def u9_0Imm : Operand<i32> { let ParserMatchClass = u9_0ImmOperand; }
+  def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; }
+  def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; }
+  def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
+  def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
+  def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
+  def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
+  def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; }
+  def u5_1Imm : Operand<i32>;
+  def u5_2Imm : Operand<i32>;
+  def u5_3Imm : Operand<i32>;
+  def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; }
+  def u4_1Imm : Operand<i32>;
+  def u4_2Imm : Operand<i32>;
+  def u4_3Imm : Operand<i32>;
+  def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; }
+  def u3_1Imm : Operand<i32>;
+  def u3_2Imm : Operand<i32>;
+  def u3_3Imm : Operand<i32>;
+  def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; }
+  def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; }
+  def n8_0Imm : Operand<i32> { let ParserMatchClass = n8_0ImmOperand; }
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+  def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
+                               let PrintMethod = "prints4_6ImmOperand";
+                               let DecoderMethod = "s4_6ImmDecoder";}
+  def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
+                               let DecoderMethod = "s4_6ImmDecoder";}
+  def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
+                               let PrintMethod = "prints3_6ImmOperand";
+                               let DecoderMethod = "s3_6ImmDecoder";}
+  def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
+                               let DecoderMethod = "s3_6ImmDecoder";}
+}
+def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
+def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
+
+//
+// Immediate predicates
+//
+def s32_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<32>(v);
+}]>;
+
+def s31_1ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<31,1>(v);
+}]>;
+
+def s30_2ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<30,2>(v);
+}]>;
+
+def s29_3ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<29,3>(v);
+}]>;
+
+def s10_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<10>(v);
+}]>;
+
+def s8_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+def s8_0Imm64Pred  : PatLeaf<(i64 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+def s6_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<6>(v);
+}]>;
+
+def s4_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<4>(v);
+}]>;
+
+def s4_1ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,1>(v);
+}]>;
+
+def s4_2ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,2>(v);
+}]>;
+
+def s4_3ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,3>(v);
+}]>;
+
+def u32_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<32>(v);
+}]>;
+
+def u16_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<16>(v);
+}]>;
+
+def u11_3ImmPred : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<11,3>(v);
+}]>;
+
+def u9_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<9>(v);
+}]>;
+
+def u8_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<8>(v);
+}]>;
+
+def u6_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}]>;
+
+def u6_1ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,1>(v);
+}]>;
+
+def u6_2ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,2>(v);
+}]>;
+
+def u5_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<5>(v);
+}]>;
+
+def u4_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<4>(v);
+}]>;
+
+def u3_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<3>(v);
+}]>;
+
+def u2_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<2>(v);
+}]>;
+
+// Extendable immediate operands.
+def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
+def s16_0ExtOperand : AsmOperandClass { let Name = "s16_0Ext"; }
+def s12_0ExtOperand : AsmOperandClass { let Name = "s12_0Ext"; }
+def s10_0ExtOperand : AsmOperandClass { let Name = "s10_0Ext"; }
+def s9_0ExtOperand : AsmOperandClass { let Name = "s9_0Ext"; }
+def s8_0ExtOperand : AsmOperandClass { let Name = "s8_0Ext"; }
+def s7_0ExtOperand : AsmOperandClass { let Name = "s7_0Ext"; }
+def s6_0ExtOperand : AsmOperandClass { let Name = "s6_0Ext"; }
+def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
+def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
+def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
+def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
+def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
+def u7_0ExtOperand : AsmOperandClass { let Name = "u7_0Ext"; }
+def u8_0ExtOperand : AsmOperandClass { let Name = "u8_0Ext"; }
+def u9_0ExtOperand : AsmOperandClass { let Name = "u9_0Ext"; }
+def u10_0ExtOperand : AsmOperandClass { let Name = "u10_0Ext"; }
+def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
+def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
+def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
+def u32_0MustExtOperand : AsmOperandClass { let Name = "u32_0MustExt"; }
+
+
+
+let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
+    DecoderMethod = "unsignedImmDecoder" in {
+  def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
+  def s16_0Ext : Operand<i32> { let ParserMatchClass = s16_0ExtOperand;
+                                let DecoderMethod = "s16_0ImmDecoder"; }
+  def s12_0Ext : Operand<i32> { let ParserMatchClass = s12_0ExtOperand;
+                                let DecoderMethod = "s12_0ImmDecoder"; }
+  def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
+                                let DecoderMethod = "s11_0ImmDecoder"; }
+  def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
+                                let DecoderMethod = "s11_1ImmDecoder"; }
+  def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
+                                let DecoderMethod = "s11_2ImmDecoder"; }
+  def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
+                                let DecoderMethod = "s11_3ImmDecoder"; }
+  def s10_0Ext : Operand<i32> { let ParserMatchClass = s10_0ExtOperand;
+                                let DecoderMethod = "s10_0ImmDecoder"; }
+  def s9_0Ext : Operand<i32> { let ParserMatchClass = s9_0ExtOperand;
+                               let DecoderMethod = "s9_0ImmDecoder"; }
+  def s8_0Ext : Operand<i32> { let ParserMatchClass = s8_0ExtOperand;
+                               let DecoderMethod = "s8_0ImmDecoder"; }
+  def s7_0Ext : Operand<i32> { let ParserMatchClass = s7_0ExtOperand; }
+  def s6_0Ext : Operand<i32> { let ParserMatchClass = s6_0ExtOperand;
+                               let DecoderMethod = "s6_0ImmDecoder"; }
+  def u7_0Ext : Operand<i32> { let ParserMatchClass = u7_0ExtOperand; }
+  def u8_0Ext : Operand<i32> { let ParserMatchClass = u8_0ExtOperand; }
+  def u9_0Ext : Operand<i32> { let ParserMatchClass = u9_0ExtOperand; }
+  def u10_0Ext : Operand<i32> { let ParserMatchClass = u10_0ExtOperand; }
+  def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
+  def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
+  def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
+  def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
+  def u32_0MustExt : Operand<i32> { let ParserMatchClass = u32_0MustExtOperand; }
+}
+
+
+// This complex pattern exists only to create a machine instruction operand
+// of type "frame index". There doesn't seem to be a way to do that directly
+// in the patterns.
+def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
+
+// These complex patterns are not strictly necessary, since global address
+// folding will happen during DAG combining. For distinguishing between GA
+// and GP, pat frags with HexagonCONST32 and HexagonCONST32_GP can be used.
+def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
+def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
+
+// Address operands.
+
+let PrintMethod = "printGlobalOperand" in {
+  def globaladdress : Operand<i32>;
+  def globaladdressExt : Operand<i32>;
+}
+
+let PrintMethod = "printJumpTable" in
+def jumptablebase : Operand<i32>;
+
+def brtarget : Operand<OtherVT> {
+  let DecoderMethod = "brtargetDecoder";
+  let PrintMethod = "printBrtarget";
+}
+def brtargetExt : Operand<OtherVT> {
+  let DecoderMethod = "brtargetDecoder";
+  let PrintMethod = "printBrtarget";
+}
+def calltarget : Operand<i32> {
+  let DecoderMethod = "brtargetDecoder";
+  let PrintMethod = "printBrtarget";
+}
+
+def bblabel : Operand<i32>;
+def bbl     : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
new file mode 100644
index 000000000000..89db46799cb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -0,0 +1,678 @@
+//===--- HexagonOptAddrMode.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This implements a Hexagon-specific pass to optimize addressing mode for
+// load/store instructions.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "opt-addr-mode"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+
+static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
+  cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
+  "optimization"));
+
+using namespace llvm;
+using namespace rdf;
+
+namespace llvm {
+
+  FunctionPass *createHexagonOptAddrMode();
+  void initializeHexagonOptAddrModePass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+class HexagonOptAddrMode : public MachineFunctionPass {
+public:
+  static char ID;
+
+  HexagonOptAddrMode()
+      : MachineFunctionPass(ID), HII(nullptr), MDT(nullptr), DFG(nullptr),
+        LV(nullptr) {
+    PassRegistry &R = *PassRegistry::getPassRegistry();
+    initializeHexagonOptAddrModePass(R);
+  }
+
+  StringRef getPassName() const override {
+    return "Optimize addressing mode of load/store";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominanceFrontier>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  typedef DenseSet<MachineInstr *> MISetType;
+  typedef DenseMap<MachineInstr *, bool> InstrEvalMap;
+  const HexagonInstrInfo *HII;
+  MachineDominatorTree *MDT;
+  DataFlowGraph *DFG;
+  DataFlowGraph::DefStackMap DefM;
+  std::map<RegisterRef, std::map<NodeId, NodeId>> RDefMap;
+  Liveness *LV;
+  MISetType Deleted;
+
+  bool processBlock(NodeAddr<BlockNode *> BA);
+  bool xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+                  NodeAddr<UseNode *> UseN, unsigned UseMOnum);
+  bool analyzeUses(unsigned DefR, const NodeList &UNodeList,
+                   InstrEvalMap &InstrEvalResult, short &SizeInc);
+  bool hasRepForm(MachineInstr &MI, unsigned TfrDefR);
+  bool canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN, MachineInstr &MI,
+                       const NodeList &UNodeList);
+  void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
+  bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
+  short getBaseWithLongOffset(const MachineInstr &MI) const;
+  void updateMap(NodeAddr<InstrNode *> IA);
+  bool constructDefMap(MachineBasicBlock *B);
+  bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+                   unsigned ImmOpNum);
+  bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
+  bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
+                    const MachineOperand &ImmOp, unsigned ImmOpNum);
+};
+
+} // end anonymous namespace
+
+char HexagonOptAddrMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "opt-amode",
+                      "Optimize addressing mode", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(HexagonOptAddrMode, "opt-amode", "Optimize addressing mode",
+                    false, false)
+
+bool HexagonOptAddrMode::hasRepForm(MachineInstr &MI, unsigned TfrDefR) {
+  const MCInstrDesc &MID = MI.getDesc();
+
+  if ((!MID.mayStore() && !MID.mayLoad()) || HII->isPredicated(MI))
+    return false;
+
+  if (MID.mayStore()) {
+    MachineOperand StOp = MI.getOperand(MI.getNumOperands() - 1);
+    if (StOp.isReg() && StOp.getReg() == TfrDefR)
+      return false;
+  }
+
+  if (HII->getAddrMode(MI) == HexagonII::BaseRegOffset)
+    // Tranform to Absolute plus register offset.
+    return (HII->getBaseWithLongOffset(MI) >= 0);
+  else if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset)
+    // Tranform to absolute addressing mode.
+    return (HII->getAbsoluteForm(MI) >= 0);
+
+  return false;
+}
+
+// Check if addasl instruction can be removed. This is possible only
+// if it's feeding to only load/store instructions with base + register
+// offset as these instruction can be tranformed to use 'absolute plus
+// shifted register offset'.
+// ex:
+// Rs = ##foo
+// Rx = addasl(Rs, Rt, #2)
+// Rd = memw(Rx + #28)
+// Above three instructions can be replaced with Rd = memw(Rt<<#2 + ##foo+28)
+
+bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
+                                         MachineInstr &MI,
+                                         const NodeList &UNodeList) {
+  // check offset size in addasl. if 'offset > 3' return false
+  const MachineOperand &OffsetOp = MI.getOperand(3);
+  if (!OffsetOp.isImm() || OffsetOp.getImm() > 3)
+    return false;
+
+  unsigned OffsetReg = MI.getOperand(2).getReg();
+  RegisterRef OffsetRR;
+  NodeId OffsetRegRD = 0;
+  for (NodeAddr<UseNode *> UA : AddAslSN.Addr->members_if(DFG->IsUse, *DFG)) {
+    RegisterRef RR = UA.Addr->getRegRef(*DFG);
+    if (OffsetReg == RR.Reg) {
+      OffsetRR = RR;
+      OffsetRegRD = UA.Addr->getReachingDef();
+    }
+  }
+
+  for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+    NodeAddr<UseNode *> UA = *I;
+    NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
+    if ((UA.Addr->getFlags() & NodeAttrs::PhiRef) ||
+        RDefMap[OffsetRR][IA.Id] != OffsetRegRD)
+      return false;
+
+    MachineInstr &UseMI = *NodeAddr<StmtNode *>(IA).Addr->getCode();
+    NodeAddr<DefNode *> OffsetRegDN = DFG->addr<DefNode *>(OffsetRegRD);
+    // Reaching Def to an offset register can't be a phi.
+    if ((OffsetRegDN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+        MI.getParent() != UseMI.getParent())
+    return false;
+
+    const MCInstrDesc &UseMID = UseMI.getDesc();
+    if ((!UseMID.mayLoad() && !UseMID.mayStore()) ||
+        HII->getAddrMode(UseMI) != HexagonII::BaseImmOffset ||
+        getBaseWithLongOffset(UseMI) < 0)
+      return false;
+
+    // Addasl output can't be a store value.
+    if (UseMID.mayStore() && UseMI.getOperand(2).isReg() &&
+        UseMI.getOperand(2).getReg() == MI.getOperand(0).getReg())
+      return false;
+
+    for (auto &Mo : UseMI.operands())
+      if (Mo.isFI())
+        return false;
+  }
+  return true;
+}
+
+bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
+                                            NodeList &UNodeList) {
+  for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+    NodeAddr<UseNode *> UN = *I;
+    RegisterRef UR = UN.Addr->getRegRef(*DFG);
+    NodeSet Visited, Defs;
+    const auto &ReachingDefs = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+    if (ReachingDefs.size() > 1) {
+      DEBUG({
+        dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
+        for (auto DI : ReachingDefs) {
+          NodeAddr<UseNode *> DA = DFG->addr<UseNode *>(DI);
+          NodeAddr<StmtNode *> TempIA = DA.Addr->getOwner(*DFG);
+          dbgs() << "\t\t[Reaching Def]: "
+                 << Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
+        }
+      });
+      return false;
+    }
+  }
+  return true;
+}
+
+void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
+                                        NodeList &UNodeList) {
+  for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
+    DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
+                 << "\n");
+    RegisterRef DR = DFG->normalizeRef(DA.Addr->getRegRef(*DFG));
+
+    auto UseSet = LV->getAllReachedUses(DR, DA);
+
+    for (auto UI : UseSet) {
+      NodeAddr<UseNode *> UA = DFG->addr<UseNode *>(UI);
+      DEBUG({
+        NodeAddr<StmtNode *> TempIA = UA.Addr->getOwner(*DFG);
+        dbgs() << "\t\t\t[Reached Use]: "
+               << Print<NodeAddr<InstrNode *>>(TempIA, *DFG) << "\n";
+      });
+
+      if (UA.Addr->getFlags() & NodeAttrs::PhiRef) {
+        NodeAddr<PhiNode *> PA = UA.Addr->getOwner(*DFG);
+        NodeId id = PA.Id;
+        const Liveness::RefMap &phiUse = LV->getRealUses(id);
+        DEBUG(dbgs() << "\t\t\t\tphi real Uses"
+                     << Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
+        if (!phiUse.empty()) {
+          for (auto I : phiUse) {
+            if (DR.Reg != I.first)
+              continue;
+            auto phiUseSet = I.second;
+            for (auto phiUI : phiUseSet) {
+              NodeAddr<UseNode *> phiUA = DFG->addr<UseNode *>(phiUI.first);
+              UNodeList.push_back(phiUA);
+            }
+          }
+        }
+      } else
+        UNodeList.push_back(UA);
+    }
+  }
+}
+
+bool HexagonOptAddrMode::analyzeUses(unsigned tfrDefR,
+                                     const NodeList &UNodeList,
+                                     InstrEvalMap &InstrEvalResult,
+                                     short &SizeInc) {
+  bool KeepTfr = false;
+  bool HasRepInstr = false;
+  InstrEvalResult.clear();
+
+  for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+    bool CanBeReplaced = false;
+    NodeAddr<UseNode *> UN = *I;
+    NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
+    MachineInstr &MI = *SN.Addr->getCode();
+    const MCInstrDesc &MID = MI.getDesc();
+    if ((MID.mayLoad() || MID.mayStore())) {
+      if (!hasRepForm(MI, tfrDefR)) {
+        KeepTfr = true;
+        continue;
+      }
+      SizeInc++;
+      CanBeReplaced = true;
+    } else if (MI.getOpcode() == Hexagon::S2_addasl_rrri) {
+      NodeList AddaslUseList;
+
+      DEBUG(dbgs() << "\nGetting ReachedUses for === " << MI << "\n");
+      getAllRealUses(SN, AddaslUseList);
+      // Process phi nodes.
+      if (allValidCandidates(SN, AddaslUseList) &&
+          canRemoveAddasl(SN, MI, AddaslUseList)) {
+        SizeInc += AddaslUseList.size();
+        SizeInc -= 1; // Reduce size by 1 as addasl itself can be removed.
+        CanBeReplaced = true;
+      } else
+        SizeInc++;
+    } else
+      // Currently, only load/store and addasl are handled.
+      // Some other instructions to consider -
+      // A2_add -> A2_addi
+      // M4_mpyrr_addr -> M4_mpyrr_addi
+      KeepTfr = true;
+
+    InstrEvalResult[&MI] = CanBeReplaced;
+    HasRepInstr |= CanBeReplaced;
+  }
+
+  // Reduce total size by 2 if original tfr can be deleted.
+  if (!KeepTfr)
+    SizeInc -= 2;
+
+  return HasRepInstr;
+}
+
+bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
+                                    unsigned ImmOpNum) {
+  bool Changed = false;
+  MachineBasicBlock *BB = OldMI->getParent();
+  auto UsePos = MachineBasicBlock::iterator(OldMI);
+  MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+  ++InsertPt;
+  unsigned OpStart;
+  unsigned OpEnd = OldMI->getNumOperands();
+  MachineInstrBuilder MIB;
+
+  if (ImmOpNum == 1) {
+    if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
+      short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      MIB.addOperand(OldMI->getOperand(0));
+      MIB.addOperand(OldMI->getOperand(2));
+      MIB.addOperand(OldMI->getOperand(3));
+      MIB.addOperand(ImmOp);
+      OpStart = 4;
+      Changed = true;
+    } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
+      short NewOpCode = HII->getAbsoluteForm(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
+                .addOperand(OldMI->getOperand(0));
+      const GlobalValue *GV = ImmOp.getGlobal();
+      int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
+
+      MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+      OpStart = 3;
+      Changed = true;
+    } else
+      Changed = false;
+
+    DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+    DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+  } else if (ImmOpNum == 2 && OldMI->getOperand(3).getImm() == 0) {
+    short NewOpCode = HII->xformRegToImmOffset(*OldMI);
+    assert(NewOpCode >= 0 && "Invalid New opcode\n");
+    MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+    MIB.addOperand(OldMI->getOperand(0));
+    MIB.addOperand(OldMI->getOperand(1));
+    MIB.addOperand(ImmOp);
+    OpStart = 4;
+    Changed = true;
+    DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+    DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+  }
+
+  if (Changed)
+    for (unsigned i = OpStart; i < OpEnd; ++i)
+      MIB.addOperand(OldMI->getOperand(i));
+
+  return Changed;
+}
+
+bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
+                                     unsigned ImmOpNum) {
+  bool Changed = false;
+  unsigned OpStart;
+  unsigned OpEnd = OldMI->getNumOperands();
+  MachineBasicBlock *BB = OldMI->getParent();
+  auto UsePos = MachineBasicBlock::iterator(OldMI);
+  MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+  ++InsertPt;
+  MachineInstrBuilder MIB;
+  if (ImmOpNum == 0) {
+    if (HII->getAddrMode(*OldMI) == HexagonII::BaseRegOffset) {
+      short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      MIB.addOperand(OldMI->getOperand(1));
+      MIB.addOperand(OldMI->getOperand(2));
+      MIB.addOperand(ImmOp);
+      MIB.addOperand(OldMI->getOperand(3));
+      OpStart = 4;
+    } else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
+      short NewOpCode = HII->getAbsoluteForm(*OldMI);
+      assert(NewOpCode >= 0 && "Invalid New opcode\n");
+      MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+      const GlobalValue *GV = ImmOp.getGlobal();
+      int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
+      MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
+      MIB.addOperand(OldMI->getOperand(2));
+      OpStart = 3;
+    }
+    Changed = true;
+    DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+    DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+  } else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
+    short NewOpCode = HII->xformRegToImmOffset(*OldMI);
+    assert(NewOpCode >= 0 && "Invalid New opcode\n");
+    MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
+    MIB.addOperand(OldMI->getOperand(0));
+    MIB.addOperand(ImmOp);
+    MIB.addOperand(OldMI->getOperand(1));
+    OpStart = 2;
+    Changed = true;
+    DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
+    DEBUG(dbgs() << "[TO]: " << MIB << "\n");
+  }
+  if (Changed)
+    for (unsigned i = OpStart; i < OpEnd; ++i)
+      MIB.addOperand(OldMI->getOperand(i));
+
+  return Changed;
+}
+
+short HexagonOptAddrMode::getBaseWithLongOffset(const MachineInstr &MI) const {
+  if (HII->getAddrMode(MI) == HexagonII::BaseImmOffset) {
+    short TempOpCode = HII->getBaseWithRegOffset(MI);
+    return HII->getBaseWithLongOffset(TempOpCode);
+  } else
+    return HII->getBaseWithLongOffset(MI);
+}
+
+bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
+                                      MachineInstr *AddAslMI,
+                                      const MachineOperand &ImmOp,
+                                      unsigned ImmOpNum) {
+  NodeAddr<StmtNode *> SA = AddAslUN.Addr->getOwner(*DFG);
+
+  DEBUG(dbgs() << "Processing addasl :" << *AddAslMI << "\n");
+
+  NodeList UNodeList;
+  getAllRealUses(SA, UNodeList);
+
+  for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+    NodeAddr<UseNode *> UseUN = *I;
+    assert(!(UseUN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+           "Can't transform this 'AddAsl' instruction!");
+
+    NodeAddr<StmtNode *> UseIA = UseUN.Addr->getOwner(*DFG);
+    DEBUG(dbgs() << "[InstrNode]: " << Print<NodeAddr<InstrNode *>>(UseIA, *DFG)
+                 << "\n");
+    MachineInstr *UseMI = UseIA.Addr->getCode();
+    DEBUG(dbgs() << "[MI <BB#" << UseMI->getParent()->getNumber()
+                 << ">]: " << *UseMI << "\n");
+    const MCInstrDesc &UseMID = UseMI->getDesc();
+    assert(HII->getAddrMode(*UseMI) == HexagonII::BaseImmOffset);
+
+    auto UsePos = MachineBasicBlock::iterator(UseMI);
+    MachineBasicBlock::instr_iterator InsertPt = UsePos.getInstrIterator();
+    short NewOpCode = getBaseWithLongOffset(*UseMI);
+    assert(NewOpCode >= 0 && "Invalid New opcode\n");
+
+    unsigned OpStart;
+    unsigned OpEnd = UseMI->getNumOperands();
+
+    MachineBasicBlock *BB = UseMI->getParent();
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
+    // change mem(Rs + # ) -> mem(Rt << # + ##)
+    if (UseMID.mayLoad()) {
+      MIB.addOperand(UseMI->getOperand(0));
+      MIB.addOperand(AddAslMI->getOperand(2));
+      MIB.addOperand(AddAslMI->getOperand(3));
+      const GlobalValue *GV = ImmOp.getGlobal();
+      MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm(),
+                           ImmOp.getTargetFlags());
+      OpStart = 3;
+    } else if (UseMID.mayStore()) {
+      MIB.addOperand(AddAslMI->getOperand(2));
+      MIB.addOperand(AddAslMI->getOperand(3));
+      const GlobalValue *GV = ImmOp.getGlobal();
+      MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm(),
+                           ImmOp.getTargetFlags());
+      MIB.addOperand(UseMI->getOperand(2));
+      OpStart = 3;
+    } else
+      llvm_unreachable("Unhandled instruction");
+
+    for (unsigned i = OpStart; i < OpEnd; ++i)
+      MIB.addOperand(UseMI->getOperand(i));
+
+    Deleted.insert(UseMI);
+  }
+
+  return true;
+}
+
+bool HexagonOptAddrMode::xformUseMI(MachineInstr *TfrMI, MachineInstr *UseMI,
+                                    NodeAddr<UseNode *> UseN,
+                                    unsigned UseMOnum) {
+  const MachineOperand ImmOp = TfrMI->getOperand(1);
+  const MCInstrDesc &MID = UseMI->getDesc();
+  unsigned Changed = false;
+  if (MID.mayLoad())
+    Changed = changeLoad(UseMI, ImmOp, UseMOnum);
+  else if (MID.mayStore())
+    Changed = changeStore(UseMI, ImmOp, UseMOnum);
+  else if (UseMI->getOpcode() == Hexagon::S2_addasl_rrri)
+    Changed = changeAddAsl(UseN, UseMI, ImmOp, UseMOnum);
+
+  if (Changed)
+    Deleted.insert(UseMI);
+
+  return Changed;
+}
+
+bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
+  bool Changed = false;
+
+  for (auto IA : BA.Addr->members(*DFG)) {
+    if (!DFG->IsCode<NodeAttrs::Stmt>(IA))
+      continue;
+
+    NodeAddr<StmtNode *> SA = IA;
+    MachineInstr *MI = SA.Addr->getCode();
+    if (MI->getOpcode() != Hexagon::A2_tfrsi ||
+        !MI->getOperand(1).isGlobal())
+      continue;
+
+    DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n");
+    DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG)
+                 << "\n");
+
+    NodeList UNodeList;
+    getAllRealUses(SA, UNodeList);
+
+    if (!allValidCandidates(SA, UNodeList))
+      continue;
+
+    short SizeInc = 0;
+    unsigned DefR = MI->getOperand(0).getReg();
+    InstrEvalMap InstrEvalResult;
+
+    // Analyze all uses and calculate increase in size. Perform the optimization
+    // only if there is no increase in size.
+    if (!analyzeUses(DefR, UNodeList, InstrEvalResult, SizeInc))
+      continue;
+    if (SizeInc > CodeGrowthLimit)
+      continue;
+
+    bool KeepTfr = false;
+
+    DEBUG(dbgs() << "\t[Total reached uses] : " << UNodeList.size() << "\n");
+    DEBUG(dbgs() << "\t[Processing Reached Uses] ===\n");
+    for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
+      NodeAddr<UseNode *> UseN = *I;
+      assert(!(UseN.Addr->getFlags() & NodeAttrs::PhiRef) &&
+             "Found a PhiRef node as a real reached use!!");
+
+      NodeAddr<StmtNode *> OwnerN = UseN.Addr->getOwner(*DFG);
+      MachineInstr *UseMI = OwnerN.Addr->getCode();
+      DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
+                   << ">]: " << *UseMI << "\n");
+
+      int UseMOnum = -1;
+      unsigned NumOperands = UseMI->getNumOperands();
+      for (unsigned j = 0; j < NumOperands - 1; ++j) {
+        const MachineOperand &op = UseMI->getOperand(j);
+        if (op.isReg() && op.isUse() && DefR == op.getReg())
+          UseMOnum = j;
+      }
+      assert(UseMOnum >= 0 && "Invalid reached use!");
+
+      if (InstrEvalResult[UseMI])
+        // Change UseMI if replacement is possible.
+        Changed |= xformUseMI(MI, UseMI, UseN, UseMOnum);
+      else
+        KeepTfr = true;
+    }
+    if (!KeepTfr)
+      Deleted.insert(MI);
+  }
+  return Changed;
+}
+
+void HexagonOptAddrMode::updateMap(NodeAddr<InstrNode *> IA) {
+  RegisterSet RRs;
+  for (NodeAddr<RefNode *> RA : IA.Addr->members(*DFG))
+    RRs.insert(RA.Addr->getRegRef(*DFG));
+  bool Common = false;
+  for (auto &R : RDefMap) {
+    if (!RRs.count(R.first))
+      continue;
+    Common = true;
+    break;
+  }
+  if (!Common)
+    return;
+
+  for (auto &R : RDefMap) {
+    auto F = DefM.find(R.first.Reg);
+    if (F == DefM.end() || F->second.empty())
+      continue;
+    R.second[IA.Id] = F->second.top()->Id;
+  }
+}
+
+bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
+  bool Changed = false;
+  auto BA = DFG->getFunc().Addr->findBlock(B, *DFG);
+  DFG->markBlock(BA.Id, DefM);
+
+  for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
+    updateMap(IA);
+    DFG->pushDefs(IA, DefM);
+  }
+
+  MachineDomTreeNode *N = MDT->getNode(B);
+  for (auto I : *N)
+    Changed |= constructDefMap(I->getBlock());
+
+  DFG->releaseBlock(BA.Id, DefM);
+  return Changed;
+}
+
+bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &MRI = MF.getRegInfo();
+  HII = HST.getInstrInfo();
+  const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  const auto &TRI = *MF.getSubtarget().getRegisterInfo();
+  const TargetOperandInfo TOI(*HII);
+
+  DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
+  G.build();
+  DFG = &G;
+
+  Liveness L(MRI, *DFG);
+  L.computePhiInfo();
+  LV = &L;
+
+  constructDefMap(&DFG->getMF().front());
+
+  Deleted.clear();
+  NodeAddr<FuncNode *> FA = DFG->getFunc();
+  DEBUG(dbgs() << "==== [RefMap#]=====:\n "
+               << Print<NodeAddr<FuncNode *>>(FA, *DFG) << "\n");
+
+  for (NodeAddr<BlockNode *> BA : FA.Addr->members(*DFG))
+    Changed |= processBlock(BA);
+
+  for (auto MI : Deleted)
+    MI->eraseFromParent();
+
+  if (Changed) {
+    G.build();
+    L.computeLiveIns();
+    L.resetLiveIns();
+    L.resetKills();
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonOptAddrMode() {
+  return new HexagonOptAddrMode();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
new file mode 100644
index 000000000000..101de3d8fbee
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp
@@ -0,0 +1,148 @@
+//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass that removes sign extends for function parameters. These parameters
+// are already sign extended by the caller per Hexagon's ABI
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include "Hexagon.h"
+
+using namespace llvm;
+
+namespace llvm {
+  FunctionPass *createHexagonOptimizeSZextends();
+  void initializeHexagonOptimizeSZextendsPass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonOptimizeSZextends : public FunctionPass {
+  public:
+    static char ID;
+    HexagonOptimizeSZextends() : FunctionPass(ID) {
+      initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnFunction(Function &F) override;
+
+    StringRef getPassName() const override { return "Remove sign extends"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<StackProtector>();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool intrinsicAlreadySextended(Intrinsic::ID IntID);
+  };
+}
+
+char HexagonOptimizeSZextends::ID = 0;
+
+INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs",
+                "Remove Sign and Zero Extends for Args", false, false)
+
+bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) {
+  switch(IntID) {
+    case llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool HexagonOptimizeSZextends::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  unsigned Idx = 1;
+  // Try to optimize sign extends in formal parameters. It's relying on
+  // callee already sign extending the values. I'm not sure if our ABI
+  // requires callee to sign extend though.
+  for (auto &Arg : F.args()) {
+    if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+      if (!isa<PointerType>(Arg.getType())) {
+        for (auto UI = Arg.use_begin(); UI != Arg.use_end();) {
+          if (isa<SExtInst>(*UI)) {
+            Instruction* Use = cast<Instruction>(*UI);
+            SExtInst* SI = new SExtInst(&Arg, Use->getType());
+            assert (EVT::getEVT(SI->getType()) ==
+                    (EVT::getEVT(Use->getType())));
+            ++UI;
+            Use->replaceAllUsesWith(SI);
+            Instruction* First = &F.getEntryBlock().front();
+            SI->insertBefore(First);
+            Use->eraseFromParent();
+          } else {
+            ++UI;
+          }
+        }
+      }
+    }
+    ++Idx;
+  }
+
+  // Try to remove redundant sext operations on Hexagon. The hardware
+  // already sign extends many 16 bit intrinsic operations to 32 bits.
+  // For example:
+  // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y)
+  // %sext233 = shl i32 %34, 16
+  // %conv52 = ashr exact i32 %sext233, 16
+  for (auto &B : F) {
+    for (auto &I : B) {
+      // Look for arithmetic shift right by 16.
+      BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I);
+      if (!(Ashr && Ashr->getOpcode() == Instruction::AShr))
+        continue;
+      Value *AshrOp1 = Ashr->getOperand(1);
+      ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1);
+      // Right shifted by 16.
+      if (!(C && C->getSExtValue() == 16))
+        continue;
+
+      // The first operand of Ashr comes from logical shift left.
+      Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0));
+      if (!(Shl && Shl->getOpcode() == Instruction::Shl))
+        continue;
+      Value *Intr = Shl->getOperand(0);
+      Value *ShlOp1 = Shl->getOperand(1);
+      C = dyn_cast<ConstantInt>(ShlOp1);
+      // Left shifted by 16.
+      if (!(C && C->getSExtValue() == 16))
+        continue;
+
+      // The first operand of Shl comes from an intrinsic.
+      if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) {
+        if (!intrinsicAlreadySextended(I->getIntrinsicID()))
+          continue;
+        // All is well. Replace all uses of AShr with I.
+        for (auto UI = Ashr->user_begin(), UE = Ashr->user_end();
+             UI != UE; ++UI) {
+          const Use &TheUse = UI.getUse();
+          if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) {
+            J->replaceUsesOfWith(Ashr, I);
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+
+FunctionPass *llvm::createHexagonOptimizeSZextends() {
+  return new HexagonOptimizeSZextends();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
new file mode 100644
index 000000000000..ad81287007e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -0,0 +1,3347 @@
+// Pattern fragment that combines the value type and the register class
+// into a single parameter.
+// The pat frags in the definitions below need to have a named register,
+// otherwise i32 will be assumed regardless of the register class. The
+// name of the register does not matter.
+def I1  : PatLeaf<(i1 PredRegs:$R)>;
+def I32 : PatLeaf<(i32 IntRegs:$R)>;
+def I64 : PatLeaf<(i64 DoubleRegs:$R)>;
+def F32 : PatLeaf<(f32 IntRegs:$R)>;
+def F64 : PatLeaf<(f64 DoubleRegs:$R)>;
+
+// Pattern fragments to extract the low and high subregisters from a
+// 64-bit value.
+def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
+def HiReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_hi)>;
+
+def IsOrAdd: PatFrag<(ops node:$Addr, node:$off),
+    (or node:$Addr, node:$off), [{ return isOrEquivalentToAdd(N); }]>;
+
+def IsPow2_32 : PatLeaf<(i32 imm), [{
+  uint32_t V = N->getZExtValue();
+  return isPowerOf2_32(V);
+}]>;
+
+def IsPow2_64 : PatLeaf<(i64 imm), [{
+  uint64_t V = N->getZExtValue();
+  return isPowerOf2_64(V);
+}]>;
+
+def IsNPow2_32 : PatLeaf<(i32 imm), [{
+  uint32_t NV = ~N->getZExtValue();
+  return isPowerOf2_32(NV);
+}]>;
+
+def IsPow2_64L : PatLeaf<(i64 imm), [{
+  uint64_t V = N->getZExtValue();
+  return isPowerOf2_64(V) && Log2_64(V) < 32;
+}]>;
+
+def IsPow2_64H : PatLeaf<(i64 imm), [{
+  uint64_t V = N->getZExtValue();
+  return isPowerOf2_64(V) && Log2_64(V) >= 32;
+}]>;
+
+def IsNPow2_64L : PatLeaf<(i64 imm), [{
+  uint64_t NV = ~N->getZExtValue();
+  return isPowerOf2_64(NV) && Log2_64(NV) < 32;
+}]>;
+
+def IsNPow2_64H : PatLeaf<(i64 imm), [{
+  uint64_t NV = ~N->getZExtValue();
+  return isPowerOf2_64(NV) && Log2_64(NV) >= 32;
+}]>;
+
+def SDEC1 : SDNodeXForm<imm, [{
+  int32_t V = N->getSExtValue();
+  return CurDAG->getTargetConstant(V-1, SDLoc(N), MVT::i32);
+}]>;
+
+def UDEC1 : SDNodeXForm<imm, [{
+  uint32_t V = N->getZExtValue();
+  assert(V >= 1);
+  return CurDAG->getTargetConstant(V-1, SDLoc(N), MVT::i32);
+}]>;
+
+def UDEC32 : SDNodeXForm<imm, [{
+  uint32_t V = N->getZExtValue();
+  assert(V >= 32);
+  return CurDAG->getTargetConstant(V-32, SDLoc(N), MVT::i32);
+}]>;
+
+def Log2_32 : SDNodeXForm<imm, [{
+  uint32_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32);
+}]>;
+
+def Log2_64 : SDNodeXForm<imm, [{
+  uint64_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_64(V), SDLoc(N), MVT::i32);
+}]>;
+
+def LogN2_32 : SDNodeXForm<imm, [{
+  uint32_t NV = ~N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(NV), SDLoc(N), MVT::i32);
+}]>;
+
+def LogN2_64 : SDNodeXForm<imm, [{
+  uint64_t NV = ~N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_64(NV), SDLoc(N), MVT::i32);
+}]>;
+
+
+class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred>
+  : Pat<(i1 (OpNode I32:$src1, ImmPred:$src2)),
+        (MI IntRegs:$src1, ImmPred:$src2)>;
+
+def : T_CMP_pat <C2_cmpeqi,  seteq,  s10_0ImmPred>;
+def : T_CMP_pat <C2_cmpgti,  setgt,  s10_0ImmPred>;
+def : T_CMP_pat <C2_cmpgtui, setugt, u9_0ImmPred>;
+
+def SDTHexagonI64I32I32 : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+
+def HexagonCOMBINE : SDNode<"HexagonISD::COMBINE", SDTHexagonI64I32I32>;
+def HexagonPACKHL  : SDNode<"HexagonISD::PACKHL",  SDTHexagonI64I32I32>;
+
+// Pats for instruction selection.
+class BinOp32_pat<SDNode Op, InstHexagon MI, ValueType ResT>
+  : Pat<(ResT (Op I32:$Rs, I32:$Rt)),
+        (ResT (MI IntRegs:$Rs, IntRegs:$Rt))>;
+
+def: BinOp32_pat<add, A2_add, i32>;
+def: BinOp32_pat<and, A2_and, i32>;
+def: BinOp32_pat<or,  A2_or,  i32>;
+def: BinOp32_pat<sub, A2_sub, i32>;
+def: BinOp32_pat<xor, A2_xor, i32>;
+
+def: BinOp32_pat<HexagonCOMBINE, A2_combinew, i64>;
+def: BinOp32_pat<HexagonPACKHL,  S2_packhl,   i64>;
+
+// Patfrag to convert the usual comparison patfrags (e.g. setlt) to ones
+// that reverse the order of the operands.
+class RevCmp<PatFrag F> : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment>;
+
+// Pats for compares. They use PatFrags as operands, not SDNodes,
+// since seteq/setgt/etc. are defined as ParFrags.
+class T_cmp32_rr_pat<InstHexagon MI, PatFrag Op, ValueType VT>
+  : Pat<(VT (Op I32:$Rs, I32:$Rt)),
+        (MI IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: T_cmp32_rr_pat<C2_cmpeq,  seteq,  i1>;
+def: T_cmp32_rr_pat<C2_cmpgt,  setgt,  i1>;
+def: T_cmp32_rr_pat<C2_cmpgtu, setugt, i1>;
+
+def: T_cmp32_rr_pat<C2_cmpgt,  RevCmp<setlt>,  i1>;
+def: T_cmp32_rr_pat<C2_cmpgtu, RevCmp<setult>, i1>;
+
+def: Pat<(select I1:$Pu, I32:$Rs, I32:$Rt),
+         (C2_mux PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(add I32:$Rs, s32_0ImmPred:$s16),
+         (A2_addi I32:$Rs, imm:$s16)>;
+
+def: Pat<(or I32:$Rs, s32_0ImmPred:$s10),
+         (A2_orir IntRegs:$Rs, imm:$s10)>;
+def: Pat<(and I32:$Rs, s32_0ImmPred:$s10),
+         (A2_andir IntRegs:$Rs, imm:$s10)>;
+
+def: Pat<(sub s32_0ImmPred:$s10, IntRegs:$Rs),
+         (A2_subri imm:$s10, IntRegs:$Rs)>;
+
+// Rd = not(Rs) gets mapped to Rd=sub(#-1, Rs).
+def: Pat<(not I32:$src1),
+         (A2_subri -1, IntRegs:$src1)>;
+
+def: Pat<(s32_0ImmPred:$s16), (A2_tfrsi imm:$s16)>;
+def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi imm:$s8)>;
+
+def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, I32:$Rs),
+          (C2_muxri I1:$Pu, imm:$s8, I32:$Rs)>;
+
+def : Pat<(select I1:$Pu, I32:$Rs, s32_0ImmPred:$s8),
+          (C2_muxir I1:$Pu, I32:$Rs, imm:$s8)>;
+
+def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, s8_0ImmPred:$S8),
+          (C2_muxii I1:$Pu, imm:$s8, imm:$S8)>;
+
+def: Pat<(shl I32:$src1, (i32 16)),   (A2_aslh I32:$src1)>;
+def: Pat<(sra I32:$src1, (i32 16)),   (A2_asrh I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i8),  (A2_sxtb I32:$src1)>;
+def: Pat<(sext_inreg I32:$src1, i16), (A2_sxth I32:$src1)>;
+
+class T_vcmp_pat<InstHexagon MI, PatFrag Op, ValueType T>
+  : Pat<(i1 (Op (T DoubleRegs:$Rss), (T DoubleRegs:$Rtt))),
+        (i1 (MI DoubleRegs:$Rss, DoubleRegs:$Rtt))>;
+
+def: T_vcmp_pat<A2_vcmpbeq,  seteq,  v8i8>;
+def: T_vcmp_pat<A2_vcmpbgtu, setugt, v8i8>;
+def: T_vcmp_pat<A2_vcmpheq,  seteq,  v4i16>;
+def: T_vcmp_pat<A2_vcmphgt,  setgt,  v4i16>;
+def: T_vcmp_pat<A2_vcmphgtu, setugt, v4i16>;
+def: T_vcmp_pat<A2_vcmpweq,  seteq,  v2i32>;
+def: T_vcmp_pat<A2_vcmpwgt,  setgt,  v2i32>;
+def: T_vcmp_pat<A2_vcmpwgtu, setugt, v2i32>;
+
+// Add halfword.
+def: Pat<(sext_inreg (add I32:$src1, I32:$src2), i16),
+         (A2_addh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(sra (add (shl I32:$src1, (i32 16)), I32:$src2), (i32 16)),
+         (A2_addh_l16_hl I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (add I32:$src1, I32:$src2), (i32 16)),
+         (A2_addh_h16_ll I32:$src1, I32:$src2)>;
+
+// Subtract halfword.
+def: Pat<(sext_inreg (sub I32:$src1, I32:$src2), i16),
+         (A2_subh_l16_ll I32:$src1, I32:$src2)>;
+
+def: Pat<(shl (sub I32:$src1, I32:$src2), (i32 16)),
+         (A2_subh_h16_ll I32:$src1, I32:$src2)>;
+
+// Here, depending on  the operand being selected, we'll either generate a
+// min or max instruction.
+// Ex:
+// (a>b)?a:b --> max(a,b) => Here check performed is '>' and the value selected
+// is the larger of two. So, the corresponding HexagonInst is passed in 'Inst'.
+// (a>b)?b:a --> min(a,b) => Here check performed is '>' but the smaller value
+// is selected and the corresponding HexagonInst is passed in 'SwapInst'.
+
+multiclass T_MinMax_pats <PatFrag Op, PatLeaf Val,
+                          InstHexagon Inst, InstHexagon SwapInst> {
+  def: Pat<(select (i1 (Op Val:$src1, Val:$src2)), Val:$src1, Val:$src2),
+           (Inst Val:$src1, Val:$src2)>;
+  def: Pat<(select (i1 (Op Val:$src1, Val:$src2)), Val:$src2, Val:$src1),
+           (SwapInst Val:$src1, Val:$src2)>;
+}
+
+def IsPosHalf : PatLeaf<(i32 IntRegs:$a), [{
+  return isPositiveHalfWord(N);
+}]>;
+
+multiclass MinMax_pats <PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
+  defm: T_MinMax_pats<Op, I32, Inst, SwapInst>;
+
+  def: Pat<(sext_inreg (select (i1 (Op IsPosHalf:$src1, IsPosHalf:$src2)),
+                               IsPosHalf:$src1, IsPosHalf:$src2),
+                       i16),
+           (Inst IntRegs:$src1, IntRegs:$src2)>;
+
+  def: Pat<(sext_inreg (select (i1 (Op IsPosHalf:$src1, IsPosHalf:$src2)),
+                               IsPosHalf:$src2, IsPosHalf:$src1),
+                       i16),
+           (SwapInst IntRegs:$src1, IntRegs:$src2)>;
+}
+
+let AddedComplexity = 200 in {
+  defm: MinMax_pats<setge,  A2_max,  A2_min>;
+  defm: MinMax_pats<setgt,  A2_max,  A2_min>;
+  defm: MinMax_pats<setle,  A2_min,  A2_max>;
+  defm: MinMax_pats<setlt,  A2_min,  A2_max>;
+  defm: MinMax_pats<setuge, A2_maxu, A2_minu>;
+  defm: MinMax_pats<setugt, A2_maxu, A2_minu>;
+  defm: MinMax_pats<setule, A2_minu, A2_maxu>;
+  defm: MinMax_pats<setult, A2_minu, A2_maxu>;
+}
+
+class T_cmp64_rr_pat<InstHexagon MI, PatFrag CmpOp>
+  : Pat<(i1 (CmpOp I64:$Rs, I64:$Rt)),
+        (i1 (MI DoubleRegs:$Rs, DoubleRegs:$Rt))>;
+
+def: T_cmp64_rr_pat<C2_cmpeqp,  seteq>;
+def: T_cmp64_rr_pat<C2_cmpgtp,  setgt>;
+def: T_cmp64_rr_pat<C2_cmpgtup, setugt>;
+def: T_cmp64_rr_pat<C2_cmpgtp,  RevCmp<setlt>>;
+def: T_cmp64_rr_pat<C2_cmpgtup, RevCmp<setult>>;
+
+def: Pat<(i64 (add I64:$Rs, I64:$Rt)), (A2_addp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (sub I64:$Rs, I64:$Rt)), (A2_subp I64:$Rs, I64:$Rt)>;
+
+def: Pat<(i64 (and I64:$Rs, I64:$Rt)), (A2_andp I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (or  I64:$Rs, I64:$Rt)), (A2_orp  I64:$Rs, I64:$Rt)>;
+def: Pat<(i64 (xor I64:$Rs, I64:$Rt)), (A2_xorp I64:$Rs, I64:$Rt)>;
+
+def: Pat<(i1 (not I1:$Ps)), (C2_not PredRegs:$Ps)>;
+
+def: Pat<(i1 (and I1:$Ps, I1:$Pt)),       (C2_and  I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or  I1:$Ps, I1:$Pt)),       (C2_or   I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (xor I1:$Ps, I1:$Pt)),       (C2_xor  I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (and I1:$Ps, (not I1:$Pt))), (C2_andn I1:$Ps, I1:$Pt)>;
+def: Pat<(i1 (or  I1:$Ps, (not I1:$Pt))), (C2_orn  I1:$Ps, I1:$Pt)>;
+
+def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
+
+def: Pat<(br bb:$dst),                  (J2_jump brtarget:$dst)>;
+def: Pat<(brcond I1:$src1, bb:$block),  (J2_jumpt PredRegs:$src1, bb:$block)>;
+def: Pat<(brind I32:$dst),              (J2_jumpr IntRegs:$dst)>;
+
+def: Pat<(retflag),   (PS_jmpret (i32 R31))>;
+def: Pat<(eh_return), (EH_RETURN_JMPR (i32 R31))>;
+
+// Patterns to select load-indexed (i.e. load from base+offset).
+multiclass Loadx_pat<PatFrag Load, ValueType VT, PatLeaf ImmPred,
+                     InstHexagon MI> {
+  def: Pat<(VT (Load AddrFI:$fi)), (VT (MI AddrFI:$fi, 0))>;
+  def: Pat<(VT (Load (add (i32 AddrFI:$fi), ImmPred:$Off))),
+           (VT (MI AddrFI:$fi, imm:$Off))>;
+  def: Pat<(VT (Load (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off))),
+           (VT (MI AddrFI:$fi, imm:$Off))>;
+  def: Pat<(VT (Load (add I32:$Rs, ImmPred:$Off))),
+           (VT (MI IntRegs:$Rs, imm:$Off))>;
+  def: Pat<(VT (Load I32:$Rs)), (VT (MI IntRegs:$Rs, 0))>;
+}
+
+let AddedComplexity = 20 in {
+  defm: Loadx_pat<load,           i32, s30_2ImmPred, L2_loadri_io>;
+  defm: Loadx_pat<load,           i64, s29_3ImmPred, L2_loadrd_io>;
+  defm: Loadx_pat<atomic_load_8 , i32, s32_0ImmPred, L2_loadrub_io>;
+  defm: Loadx_pat<atomic_load_16, i32, s31_1ImmPred, L2_loadruh_io>;
+  defm: Loadx_pat<atomic_load_32, i32, s30_2ImmPred, L2_loadri_io>;
+  defm: Loadx_pat<atomic_load_64, i64, s29_3ImmPred, L2_loadrd_io>;
+
+  defm: Loadx_pat<extloadi1,      i32, s32_0ImmPred, L2_loadrub_io>;
+  defm: Loadx_pat<extloadi8,      i32, s32_0ImmPred, L2_loadrub_io>;
+  defm: Loadx_pat<extloadi16,     i32, s31_1ImmPred, L2_loadruh_io>;
+  defm: Loadx_pat<sextloadi8,     i32, s32_0ImmPred, L2_loadrb_io>;
+  defm: Loadx_pat<sextloadi16,    i32, s31_1ImmPred, L2_loadrh_io>;
+  defm: Loadx_pat<zextloadi1,     i32, s32_0ImmPred, L2_loadrub_io>;
+  defm: Loadx_pat<zextloadi8,     i32, s32_0ImmPred, L2_loadrub_io>;
+  defm: Loadx_pat<zextloadi16,    i32, s31_1ImmPred, L2_loadruh_io>;
+  // No sextloadi1.
+}
+
+// Sign-extending loads of i1 need to replicate the lowest bit throughout
+// the 32-bit value. Since the loaded value can only be 0 or 1, 0-v should
+// do the trick.
+let AddedComplexity = 20 in
+def: Pat<(i32 (sextloadi1 I32:$Rs)),
+         (A2_subri 0, (L2_loadrub_io IntRegs:$Rs, 0))>;
+
+def: Pat<(i32 (mul   I32:$src1, I32:$src2)), (M2_mpyi    I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhs I32:$src1, I32:$src2)), (M2_mpy_up  I32:$src1, I32:$src2)>;
+def: Pat<(i32 (mulhu I32:$src1, I32:$src2)), (M2_mpyu_up I32:$src1, I32:$src2)>;
+
+def: Pat<(mul IntRegs:$Rs, u32_0ImmPred:$u8),
+         (M2_mpysip IntRegs:$Rs, imm:$u8)>;
+def: Pat<(ineg (mul IntRegs:$Rs, u8_0ImmPred:$u8)),
+         (M2_mpysin IntRegs:$Rs, imm:$u8)>;
+def: Pat<(mul IntRegs:$src1, s32_0ImmPred:$src2),
+         (M2_mpysmi IntRegs:$src1, imm:$src2)>;
+def: Pat<(add (mul IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
+         (M2_macsip IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
+def: Pat<(add (mul I32:$src2, I32:$src3), I32:$src1),
+         (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add (add IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
+         (M2_accii IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
+def: Pat<(add (add I32:$src2, I32:$src3), I32:$src1),
+         (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+class T_MType_acc_pat1 <InstHexagon MI, SDNode firstOp, SDNode secOp,
+                        PatLeaf ImmPred>
+  : Pat <(secOp IntRegs:$src1, (firstOp IntRegs:$src2, ImmPred:$src3)),
+         (MI IntRegs:$src1, IntRegs:$src2, ImmPred:$src3)>;
+
+class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp>
+  : Pat <(i32 (secOp IntRegs:$src1, (firstOp IntRegs:$src2, IntRegs:$src3))),
+         (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>;
+def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32_0ImmPred>;
+
+def : T_MType_acc_pat1 <M2_naccii, add, sub, s32_0ImmPred>;
+def : T_MType_acc_pat2 <M2_nacci, add, sub>;
+
+def: T_MType_acc_pat2 <M4_or_xor, xor, or>;
+def: T_MType_acc_pat2 <M4_and_xor, xor, and>;
+def: T_MType_acc_pat2 <M4_or_and, and, or>;
+def: T_MType_acc_pat2 <M4_and_and, and, and>;
+def: T_MType_acc_pat2 <M4_xor_and, and, xor>;
+def: T_MType_acc_pat2 <M4_or_or, or, or>;
+def: T_MType_acc_pat2 <M4_and_or, or, and>;
+def: T_MType_acc_pat2 <M4_xor_or, or, xor>;
+
+class T_MType_acc_pat3 <InstHexagon MI, SDNode firstOp, SDNode secOp>
+  : Pat <(secOp I32:$src1, (firstOp I32:$src2, (not I32:$src3))),
+         (MI IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: T_MType_acc_pat3 <M4_or_andn, and, or>;
+def: T_MType_acc_pat3 <M4_and_andn, and, and>;
+def: T_MType_acc_pat3 <M4_xor_andn, and, xor>;
+
+def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>;
+def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>;
+def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
+
+// Return true if for a 32 to 64-bit sign-extended load.
+def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+  if (!LD)
+    return false;
+  return LD->getExtensionType() == ISD::SEXTLOAD &&
+         LD->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)),
+         (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)),
+         (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2),
+         (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>;
+
+// Multiply and accumulate, use full result.
+// Rxx[+-]=mpy(Rs,Rt)
+
+def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
+         (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
+         (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
+         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
+         (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
+         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
+         (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+
+class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset,
+                  InstHexagon MI>
+  : Pat<(Store Value:$src1, I32:$src2, Offset:$offset),
+        (MI I32:$src2, imm:$offset, Value:$src1)>;
+
+def: Storepi_pat<post_truncsti8,  I32, s4_0ImmPred, S2_storerb_pi>;
+def: Storepi_pat<post_truncsti16, I32, s4_1ImmPred, S2_storerh_pi>;
+def: Storepi_pat<post_store,      I32, s4_2ImmPred, S2_storeri_pi>;
+def: Storepi_pat<post_store,      I64, s4_3ImmPred, S2_storerd_pi>;
+
+// Patterns for generating stores, where the address takes different forms:
+// - frameindex,
+// - frameindex + offset,
+// - base + offset,
+// - simple (base address without offset).
+// These would usually be used together (via Storex_pat defined below), but
+// in some cases one may want to apply different properties (such as
+// AddedComplexity) to the individual patterns.
+class Storex_fi_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Rs, AddrFI:$fi), (MI AddrFI:$fi, 0, Value:$Rs)>;
+multiclass Storex_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                             InstHexagon MI> {
+  def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+           (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+  def: Pat<(Store Value:$Rs, (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off)),
+           (MI AddrFI:$fi, imm:$Off, Value:$Rs)>;
+}
+multiclass Storex_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                          InstHexagon MI> {
+  def: Pat<(Store Value:$Rt, (add I32:$Rs, ImmPred:$Off)),
+           (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+  def: Pat<(Store Value:$Rt, (IsOrAdd I32:$Rs, ImmPred:$Off)),
+           (MI IntRegs:$Rs, imm:$Off, Value:$Rt)>;
+}
+class Storex_simple_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Rt, I32:$Rs),
+        (MI IntRegs:$Rs, 0, Value:$Rt)>;
+
+// Patterns for generating stores, where the address takes different forms,
+// and where the value being stored is transformed through the value modifier
+// ValueMod.  The address forms are same as above.
+class Storexm_fi_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
+                     InstHexagon MI>
+  : Pat<(Store Value:$Rs, AddrFI:$fi),
+        (MI AddrFI:$fi, 0, (ValueMod Value:$Rs))>;
+multiclass Storexm_fi_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                              PatFrag ValueMod, InstHexagon MI> {
+  def: Pat<(Store Value:$Rs, (add (i32 AddrFI:$fi), ImmPred:$Off)),
+           (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+  def: Pat<(Store Value:$Rs, (IsOrAdd (i32 AddrFI:$fi), ImmPred:$Off)),
+           (MI AddrFI:$fi, imm:$Off, (ValueMod Value:$Rs))>;
+}
+multiclass Storexm_add_pat<PatFrag Store, PatFrag Value, PatFrag ImmPred,
+                           PatFrag ValueMod, InstHexagon MI> {
+  def: Pat<(Store Value:$Rt, (add I32:$Rs, ImmPred:$Off)),
+           (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+  def: Pat<(Store Value:$Rt, (IsOrAdd I32:$Rs, ImmPred:$Off)),
+           (MI IntRegs:$Rs, imm:$Off, (ValueMod Value:$Rt))>;
+}
+class Storexm_simple_pat<PatFrag Store, PatFrag Value, PatFrag ValueMod,
+                         InstHexagon MI>
+  : Pat<(Store Value:$Rt, I32:$Rs),
+        (MI IntRegs:$Rs, 0, (ValueMod Value:$Rt))>;
+
+multiclass Storex_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
+                      InstHexagon MI> {
+  def:  Storex_fi_pat     <Store, Value,          MI>;
+  defm: Storex_fi_add_pat <Store, Value, ImmPred, MI>;
+  defm: Storex_add_pat    <Store, Value, ImmPred, MI>;
+}
+
+multiclass Storexm_pat<PatFrag Store, PatFrag Value, PatLeaf ImmPred,
+                       PatFrag ValueMod, InstHexagon MI> {
+  def:  Storexm_fi_pat     <Store, Value,          ValueMod, MI>;
+  defm: Storexm_fi_add_pat <Store, Value, ImmPred, ValueMod, MI>;
+  defm: Storexm_add_pat    <Store, Value, ImmPred, ValueMod, MI>;
+}
+
+// Regular stores in the DAG have two operands: value and address.
+// Atomic stores also have two, but they are reversed: address, value.
+// To use atomic stores with the patterns, they need to have their operands
+// swapped. This relies on the knowledge that the F.Fragment uses names
+// "ptr" and "val".
+class SwapSt<PatFrag F>
+  : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode,
+            F.OperandTransform>;
+
+let AddedComplexity = 20 in {
+  defm: Storex_pat<truncstorei8,    I32, s32_0ImmPred, S2_storerb_io>;
+  defm: Storex_pat<truncstorei16,   I32, s31_1ImmPred, S2_storerh_io>;
+  defm: Storex_pat<store,           I32, s30_2ImmPred, S2_storeri_io>;
+  defm: Storex_pat<store,           I64, s29_3ImmPred, S2_storerd_io>;
+
+  defm: Storex_pat<SwapSt<atomic_store_8>,  I32, s32_0ImmPred, S2_storerb_io>;
+  defm: Storex_pat<SwapSt<atomic_store_16>, I32, s31_1ImmPred, S2_storerh_io>;
+  defm: Storex_pat<SwapSt<atomic_store_32>, I32, s30_2ImmPred, S2_storeri_io>;
+  defm: Storex_pat<SwapSt<atomic_store_64>, I64, s29_3ImmPred, S2_storerd_io>;
+}
+
+// Simple patterns should be tried with the least priority.
+def: Storex_simple_pat<truncstorei8,    I32, S2_storerb_io>;
+def: Storex_simple_pat<truncstorei16,   I32, S2_storerh_io>;
+def: Storex_simple_pat<store,           I32, S2_storeri_io>;
+def: Storex_simple_pat<store,           I64, S2_storerd_io>;
+
+def: Storex_simple_pat<SwapSt<atomic_store_8>,  I32, S2_storerb_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_16>, I32, S2_storerh_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_32>, I32, S2_storeri_io>;
+def: Storex_simple_pat<SwapSt<atomic_store_64>, I64, S2_storerd_io>;
+
+let AddedComplexity = 20 in {
+  defm: Storexm_pat<truncstorei8,  I64, s32_0ImmPred, LoReg, S2_storerb_io>;
+  defm: Storexm_pat<truncstorei16, I64, s31_1ImmPred, LoReg, S2_storerh_io>;
+  defm: Storexm_pat<truncstorei32, I64, s30_2ImmPred, LoReg, S2_storeri_io>;
+}
+
+def: Storexm_simple_pat<truncstorei8,  I64, LoReg, S2_storerb_io>;
+def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>;
+def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>;
+
+def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>;
+
+def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src),
+         (A2_abs IntRegs:$src)>;
+
+let AddedComplexity = 50 in
+def: Pat<(xor (add (sra I32:$src, (i32 31)),
+                   I32:$src),
+              (sra I32:$src, (i32 31))),
+         (A2_abs IntRegs:$src)>;
+
+def: Pat<(sra I32:$src, u5_0ImmPred:$u5),
+         (S2_asr_i_r IntRegs:$src, imm:$u5)>;
+def: Pat<(srl I32:$src, u5_0ImmPred:$u5),
+         (S2_lsr_i_r IntRegs:$src, imm:$u5)>;
+def: Pat<(shl I32:$src, u5_0ImmPred:$u5),
+         (S2_asl_i_r IntRegs:$src, imm:$u5)>;
+
+def: Pat<(sra (add (sra I32:$src1, u5_0ImmPred:$src2), 1), (i32 1)),
+         (S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred:$src2)>;
+
+def : Pat<(not I64:$src1),
+          (A2_notp DoubleRegs:$src1)>;
+
+// Count leading zeros.
+def: Pat<(ctlz I32:$Rs), (S2_cl0 I32:$Rs)>;
+def: Pat<(i32 (trunc (ctlz I64:$Rss))), (S2_cl0p I64:$Rss)>;
+
+// Count trailing zeros: 32-bit.
+def: Pat<(cttz I32:$Rs), (S2_ct0 I32:$Rs)>;
+
+// Count leading ones.
+def: Pat<(ctlz (not I32:$Rs)), (S2_cl1 I32:$Rs)>;
+def: Pat<(i32 (trunc (ctlz (not I64:$Rss)))), (S2_cl1p I64:$Rss)>;
+
+// Count trailing ones: 32-bit.
+def: Pat<(cttz (not I32:$Rs)), (S2_ct1 I32:$Rs)>;
+
+let AddedComplexity = 20 in { // Complexity greater than and/or/xor
+  def: Pat<(and I32:$Rs, IsNPow2_32:$V),
+           (S2_clrbit_i IntRegs:$Rs, (LogN2_32 $V))>;
+  def: Pat<(or I32:$Rs, IsPow2_32:$V),
+           (S2_setbit_i IntRegs:$Rs, (Log2_32 $V))>;
+  def: Pat<(xor I32:$Rs, IsPow2_32:$V),
+           (S2_togglebit_i IntRegs:$Rs, (Log2_32 $V))>;
+
+  def: Pat<(and I32:$Rs, (not (shl 1, I32:$Rt))),
+           (S2_clrbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(or I32:$Rs, (shl 1, I32:$Rt)),
+           (S2_setbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(xor I32:$Rs, (shl 1, I32:$Rt)),
+           (S2_togglebit_r IntRegs:$Rs, IntRegs:$Rt)>;
+}
+
+// Clr/set/toggle bit for 64-bit values with immediate bit index.
+let AddedComplexity = 20 in { // Complexity greater than and/or/xor
+  def: Pat<(and I64:$Rss, IsNPow2_64L:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (i32 (HiReg $Rss)), isub_hi,
+                (S2_clrbit_i (LoReg $Rss), (LogN2_64 $V)), isub_lo)>;
+  def: Pat<(and I64:$Rss, IsNPow2_64H:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (S2_clrbit_i (HiReg $Rss), (UDEC32 (i32 (LogN2_64 $V)))),
+                isub_hi,
+                (i32 (LoReg $Rss)), isub_lo)>;
+
+  def: Pat<(or I64:$Rss, IsPow2_64L:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (i32 (HiReg $Rss)), isub_hi,
+                (S2_setbit_i (LoReg $Rss), (Log2_64 $V)), isub_lo)>;
+  def: Pat<(or I64:$Rss, IsPow2_64H:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (S2_setbit_i (HiReg $Rss), (UDEC32 (i32 (Log2_64 $V)))),
+                isub_hi,
+                (i32 (LoReg $Rss)), isub_lo)>;
+
+  def: Pat<(xor I64:$Rss, IsPow2_64L:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (i32 (HiReg $Rss)), isub_hi,
+                (S2_togglebit_i (LoReg $Rss), (Log2_64 $V)), isub_lo)>;
+  def: Pat<(xor I64:$Rss, IsPow2_64H:$V),
+           (REG_SEQUENCE DoubleRegs,
+                (S2_togglebit_i (HiReg $Rss), (UDEC32 (i32 (Log2_64 $V)))),
+                isub_hi,
+                (i32 (LoReg $Rss)), isub_lo)>;
+}
+
+let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
+  def: Pat<(i1 (setne (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
+           (S2_tstbit_i IntRegs:$Rs, u5_0ImmPred:$u5)>;
+  def: Pat<(i1 (setne (and (shl 1, I32:$Rt), I32:$Rs), 0)),
+           (S2_tstbit_r IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (trunc I32:$Rs)),
+           (S2_tstbit_i IntRegs:$Rs, 0)>;
+  def: Pat<(i1 (trunc I64:$Rs)),
+           (S2_tstbit_i (LoReg DoubleRegs:$Rs), 0)>;
+}
+
+let AddedComplexity = 20 in { // Complexity greater than compare reg-imm.
+  def: Pat<(i1 (seteq (and I32:$Rs, u6_0ImmPred:$u6), 0)),
+           (C2_bitsclri IntRegs:$Rs, u6_0ImmPred:$u6)>;
+  def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), 0)),
+           (C2_bitsclr IntRegs:$Rs, IntRegs:$Rt)>;
+}
+
+let AddedComplexity = 10 in   // Complexity greater than compare reg-reg.
+def: Pat<(i1 (seteq (and I32:$Rs, I32:$Rt), IntRegs:$Rt)),
+         (C2_bitsset IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(or (or (shl (or (shl (i32 (extloadi8 (add I32:$b, 3))),
+                               (i32 8)),
+                          (i32 (zextloadi8 (add I32:$b, 2)))),
+                      (i32 16)),
+                 (shl (i32 (zextloadi8 (add I32:$b, 1))), (i32 8))),
+             (zextloadi8 I32:$b)),
+         (A2_swiz (L2_loadri_io IntRegs:$b, 0))>;
+
+// Patterns for loads of i1:
+def: Pat<(i1 (load AddrFI:$fi)),
+         (C2_tfrrp (L2_loadrub_io AddrFI:$fi, 0))>;
+def: Pat<(i1 (load (add I32:$Rs, s32_0ImmPred:$Off))),
+         (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, imm:$Off))>;
+def: Pat<(i1 (load I32:$Rs)),
+         (C2_tfrrp (L2_loadrub_io IntRegs:$Rs, 0))>;
+
+def I1toI32: OutPatFrag<(ops node:$Rs),
+                        (C2_muxii (i1 $Rs), 1, 0)>;
+
+def I32toI1: OutPatFrag<(ops node:$Rs),
+                        (i1 (C2_tfrrp (i32 $Rs)))>;
+
+defm: Storexm_pat<store, I1, s32_0ImmPred, I1toI32, S2_storerb_io>;
+def: Storexm_simple_pat<store, I1, I1toI32, S2_storerb_io>;
+
+def: Pat<(sra I64:$src, u6_0ImmPred:$u6),
+         (S2_asr_i_p DoubleRegs:$src, imm:$u6)>;
+def: Pat<(srl I64:$src, u6_0ImmPred:$u6),
+         (S2_lsr_i_p DoubleRegs:$src, imm:$u6)>;
+def: Pat<(shl I64:$src, u6_0ImmPred:$u6),
+         (S2_asl_i_p DoubleRegs:$src, imm:$u6)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$Rt, (shl I32:$Rs, u3_0ImmPred:$u3)),
+         (S2_addasl_rrri IntRegs:$Rt, IntRegs:$Rs, imm:$u3)>;
+
+def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>;
+def: Pat<(HexagonBARRIER), (Y2_barrier)>;
+
+def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
+         (PS_fi (i32 AddrFI:$Rs), s32_0ImmPred:$off)>;
+
+
+// Support for generating global address.
+// Taken from X86InstrInfo.td.
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisPtrTy<0>]>;
+def HexagonCONST32    : SDNode<"HexagonISD::CONST32",    SDTHexagonCONST32>;
+def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
+
+// Map TLS addressses to A2_tfrsi.
+def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16_0Ext:$addr)>;
+def: Pat<(HexagonCONST32 bbl:$label),           (A2_tfrsi s16_0Ext:$label)>;
+
+def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
+def: Pat<(i1 0), (PS_false)>;
+def: Pat<(i1 1), (PS_true)>;
+
+// Pseudo instructions.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                    [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                    [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_SPCall  : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+// For tailcalls a HexagonTCRet SDNode has 3 SDNode Properties - a chain,
+// Optional Flag and Variable Arguments.
+// Its 1 Operand has pointer type.
+def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+                          [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+
+def: Pat<(callseq_start timm:$amt),
+          (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_end timm:$amt1, timm:$amt2),
+         (ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
+
+//Tail calls.
+def: Pat<(HexagonTCRet tglobaladdr:$dst),
+         (PS_tailcall_i tglobaladdr:$dst)>;
+def: Pat<(HexagonTCRet texternalsym:$dst),
+         (PS_tailcall_i texternalsym:$dst)>;
+def: Pat<(HexagonTCRet I32:$dst),
+         (PS_tailcall_r I32:$dst)>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1)
+def: Pat<(and I32:$src1, 65535),
+         (A2_zxth IntRegs:$src1)>;
+
+// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
+def: Pat<(and I32:$src1, 255),
+         (A2_zxtb IntRegs:$src1)>;
+
+// Map Add(p1, true) to p1 = not(p1).
+//     Add(p1, false) should never be produced,
+//     if it does, it got to be mapped to NOOP.
+def: Pat<(add I1:$src1, -1),
+         (C2_not PredRegs:$src1)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def: Pat<(select (not I1:$src1), s8_0ImmPred:$src2, s32_0ImmPred:$src3),
+         (C2_muxii PredRegs:$src1, s32_0ImmPred:$src3, s8_0ImmPred:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = C2_muxir(p0, r1, #i)
+def: Pat<(select (not I1:$src1), s32_0ImmPred:$src2,
+                 I32:$src3),
+         (C2_muxir PredRegs:$src1, IntRegs:$src3, s32_0ImmPred:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = C2_muxri (p0, #i, r1)
+def: Pat<(select (not I1:$src1), IntRegs:$src2, s32_0ImmPred:$src3),
+         (C2_muxri PredRegs:$src1, s32_0ImmPred:$src3, IntRegs:$src2)>;
+
+// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
+def: Pat<(brcond (not I1:$src1), bb:$offset),
+         (J2_jumpf PredRegs:$src1, bb:$offset)>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = A2_sxtw(Rss.lo).
+def: Pat<(i64 (sext_inreg I64:$src1, i32)),
+         (A2_sxtw (LoReg DoubleRegs:$src1))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = A2_sxtw(A2_sxth(Rss.lo)).
+def: Pat<(i64 (sext_inreg I64:$src1, i16)),
+         (A2_sxtw (A2_sxth (LoReg DoubleRegs:$src1)))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = A2_sxtw(A2_sxtb(Rss.lo)).
+def: Pat<(i64 (sext_inreg I64:$src1, i8)),
+         (A2_sxtw (A2_sxtb (LoReg DoubleRegs:$src1)))>;
+
+// We want to prevent emitting pnot's as much as possible.
+// Map brcond with an unsupported setcc to a J2_jumpf.
+def : Pat <(brcond (i1 (setne I32:$src1, I32:$src2)),
+                        bb:$offset),
+      (J2_jumpf (C2_cmpeq I32:$src1, I32:$src2),
+                bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne I32:$src1, s10_0ImmPred:$src2)),
+                        bb:$offset),
+      (J2_jumpf (C2_cmpeqi I32:$src1, s10_0ImmPred:$src2), bb:$offset)>;
+
+def: Pat<(brcond (i1 (setne I1:$src1, (i1 -1))), bb:$offset),
+         (J2_jumpf PredRegs:$src1, bb:$offset)>;
+
+def: Pat<(brcond (i1 (setne I1:$src1, (i1 0))), bb:$offset),
+         (J2_jumpt PredRegs:$src1, bb:$offset)>;
+
+// cmp.lt(Rs, Imm) -> !cmp.ge(Rs, Imm) -> !cmp.gt(Rs, Imm-1)
+def: Pat<(brcond (i1 (setlt I32:$src1, s8_0ImmPred:$src2)), bb:$offset),
+        (J2_jumpf (C2_cmpgti IntRegs:$src1, (SDEC1 s8_0ImmPred:$src2)),
+                  bb:$offset)>;
+
+// Map from a 64-bit select to an emulated 64-bit mux.
+// Hexagon does not support 64-bit MUXes; so emulate with combines.
+def: Pat<(select I1:$src1, I64:$src2,
+                 I64:$src3),
+         (A2_combinew (C2_mux PredRegs:$src1, (HiReg DoubleRegs:$src2),
+                                              (HiReg DoubleRegs:$src3)),
+                      (C2_mux PredRegs:$src1, (LoReg DoubleRegs:$src2),
+                                              (LoReg DoubleRegs:$src3)))>;
+
+// Map from a 1-bit select to logical ops.
+// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
+def: Pat<(select I1:$src1, I1:$src2, I1:$src3),
+         (C2_or (C2_and PredRegs:$src1, PredRegs:$src2),
+                (C2_and (C2_not PredRegs:$src1), PredRegs:$src3))>;
+
+// Map for truncating from 64 immediates to 32 bit immediates.
+def: Pat<(i32 (trunc I64:$src)),
+         (LoReg DoubleRegs:$src)>;
+
+// Map for truncating from i64 immediates to i1 bit immediates.
+def: Pat<(i1 (trunc I64:$src)),
+         (C2_tfrrp (LoReg DoubleRegs:$src))>;
+
+// rs <= rt -> !(rs > rt).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setle I32:$src1, s32_0ImmPred:$src2)),
+         (C2_not (C2_cmpgti IntRegs:$src1, s32_0ImmPred:$src2))>;
+
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle I32:$src1, I32:$src2)),
+      (i1 (C2_not (C2_cmpgt I32:$src1, I32:$src2)))>;
+
+// Rss <= Rtt -> !(Rss > Rtt).
+def: Pat<(i1 (setle I64:$src1, I64:$src2)),
+         (C2_not (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Map cmpne -> cmpeq.
+// Hexagon_TODO: We should improve on this.
+// rs != rt -> !(rs == rt).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setne I32:$src1, s32_0ImmPred:$src2)),
+         (C2_not (C2_cmpeqi IntRegs:$src1, s32_0ImmPred:$src2))>;
+
+// Convert setne back to xor for hexagon since we compute w/ pred registers.
+def: Pat<(i1 (setne I1:$src1, I1:$src2)),
+         (C2_xor PredRegs:$src1, PredRegs:$src2)>;
+
+// Map cmpne(Rss) -> !cmpew(Rss).
+// rs != rt -> !(rs == rt).
+def: Pat<(i1 (setne I64:$src1, I64:$src2)),
+         (C2_not (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Map cmpge(Rs, Rt) -> !cmpgt(Rs, Rt).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setge I32:$src1, I32:$src2)),
+      (i1 (C2_not (i1 (C2_cmpgt I32:$src2, I32:$src1))))>;
+
+// cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1)
+let AddedComplexity = 30 in
+def: Pat<(i1 (setge I32:$src1, s32_0ImmPred:$src2)),
+         (C2_cmpgti IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2))>;
+
+// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
+// rss >= rtt -> !(rtt > rss).
+def: Pat<(i1 (setge I64:$src1, I64:$src2)),
+         (C2_not (C2_cmpgtp DoubleRegs:$src2, DoubleRegs:$src1))>;
+
+// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
+// !cmpge(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+// rs < rt -> !(rs >= rt).
+let AddedComplexity = 30 in
+def: Pat<(i1 (setlt I32:$src1, s32_0ImmPred:$src2)),
+         (C2_not (C2_cmpgti IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2)))>;
+
+// Generate cmpgeu(Rs, #0) -> cmpeq(Rs, Rs)
+def: Pat<(i1 (setuge I32:$src1, 0)),
+         (C2_cmpeq IntRegs:$src1, IntRegs:$src1)>;
+
+// Generate cmpgeu(Rs, #u8) -> cmpgtu(Rs, #u8 -1)
+def: Pat<(i1 (setuge I32:$src1, u32_0ImmPred:$src2)),
+         (C2_cmpgtui IntRegs:$src1, (UDEC1 u32_0ImmPred:$src2))>;
+
+// Generate cmpgtu(Rs, #u9)
+def: Pat<(i1 (setugt I32:$src1, u32_0ImmPred:$src2)),
+         (C2_cmpgtui IntRegs:$src1, u32_0ImmPred:$src2)>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def: Pat<(i1 (setuge I64:$src1, I64:$src2)),
+         (C2_not (C2_cmpgtup DoubleRegs:$src2, DoubleRegs:$src1))>;
+
+// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def: Pat<(i1 (setule I64:$src1, I64:$src2)),
+         (C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Sign extends.
+// i1 -> i32
+def: Pat<(i32 (sext I1:$src1)),
+         (C2_muxii PredRegs:$src1, -1, 0)>;
+
+// i1 -> i64
+def: Pat<(i64 (sext I1:$src1)),
+         (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>;
+
+// Zero extends.
+// i1 -> i32
+def: Pat<(i32 (zext I1:$src1)),
+         (C2_muxii PredRegs:$src1, 1, 0)>;
+
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def: Pat<(i32 (anyext I1:$src1)),
+         (C2_muxii PredRegs:$src1, 1, 0)>;
+
+// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
+def: Pat<(i64 (anyext I1:$src1)),
+         (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
+
+// Clear the sign bit in a 64-bit register.
+def ClearSign : OutPatFrag<(ops node:$Rss),
+  (A2_combinew (S2_clrbit_i (HiReg $Rss), 31), (LoReg $Rss))>;
+
+def MulHU : OutPatFrag<(ops node:$Rss, node:$Rtt),
+  (A2_addp
+    (M2_dpmpyuu_acc_s0
+      (S2_lsr_i_p
+        (A2_addp
+          (M2_dpmpyuu_acc_s0
+            (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $Rss), (LoReg $Rtt)), 32),
+            (HiReg $Rss),
+            (LoReg $Rtt)),
+          (A2_combinew (A2_tfrsi 0),
+                       (LoReg (M2_dpmpyuu_s0 (LoReg $Rss), (HiReg $Rtt))))),
+        32),
+      (HiReg $Rss),
+      (HiReg $Rtt)),
+    (S2_lsr_i_p (M2_dpmpyuu_s0 (LoReg $Rss), (HiReg $Rtt)), 32))>;
+
+// Multiply 64-bit unsigned and use upper result.
+def : Pat <(mulhu I64:$Rss, I64:$Rtt), (MulHU $Rss, $Rtt)>;
+
+// Multiply 64-bit signed and use upper result.
+//
+// For two signed 64-bit integers A and B, let A' and B' denote A and B
+// with the sign bit cleared. Then A = -2^63*s(A) + A', where s(A) is the
+// sign bit of A (and identically for B). With this notation, the signed
+// product A*B can be written as:
+//   AB = (-2^63 s(A) + A') * (-2^63 s(B) + B')
+//      = 2^126 s(A)s(B) - 2^63 [s(A)B'+s(B)A'] + A'B'
+//      = 2^126 s(A)s(B) + 2^63 [s(A)B'+s(B)A'] + A'B' - 2*2^63 [s(A)B'+s(B)A']
+//      = (unsigned product AB) - 2^64 [s(A)B'+s(B)A']
+
+def : Pat <(mulhs I64:$Rss, I64:$Rtt),
+  (A2_subp
+    (MulHU $Rss, $Rtt),
+    (A2_addp
+      (A2_andp (S2_asr_i_p $Rss, 63), (ClearSign $Rtt)),
+      (A2_andp (S2_asr_i_p $Rtt, 63), (ClearSign $Rss))))>;
+
+// Hexagon specific ISD nodes.
+def SDTHexagonALLOCA : SDTypeProfile<1, 2,
+      [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def HexagonALLOCA : SDNode<"HexagonISD::ALLOCA", SDTHexagonALLOCA,
+      [SDNPHasChain]>;
+
+
+def: Pat<(HexagonALLOCA I32:$Rs, (i32 imm:$A)),
+         (PS_alloca IntRegs:$Rs, imm:$A)>;
+
+def HexagonJT:     SDNode<"HexagonISD::JT", SDTIntUnaryOp>;
+def HexagonCP:     SDNode<"HexagonISD::CP", SDTIntUnaryOp>;
+
+def: Pat<(HexagonJT tjumptable:$dst), (A2_tfrsi imm:$dst)>;
+def: Pat<(HexagonCP tconstpool:$dst), (A2_tfrsi imm:$dst)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (sra I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (sra I64:$Rs, u6_0ImmPred:$u5)), (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I32:$src1, (srl I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I64:$src1, (srl I64:$Rs, u6_0ImmPred:$u5)), (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_and IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_or IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I32:$src1, (shl I32:$Rs, u5_0ImmPred:$u5)), (S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$Rs, u5_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+let AddedComplexity = 100 in
+def: Pat<(xor I64:$src1, (shl I64:$Rs, u6_0ImmPred:$u5)), (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$Rs, u6_0ImmPred:$u5)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_asl_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (sra I32:$Rs, I32:$Rt)), (S2_asr_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (sra I64:$Rs, I32:$Rt)), (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (srl I32:$Rs, I32:$Rt)), (S2_lsr_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (srl I64:$Rs, I32:$Rt)), (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+let AddedComplexity = 100 in
+def: Pat<(add I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I32:$src1, (shl I32:$Rs, I32:$Rt)), (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt)>;
+let AddedComplexity = 100 in
+def: Pat<(add I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(sub I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(and I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(or I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+def: Pat<(xor I64:$src1, (shl I64:$Rs, I32:$Rt)), (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(sra I64:$src1, I32:$src2), (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(srl I64:$src1, I32:$src2), (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I64:$src1, I32:$src2), (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I64:$src1, I32:$src2), (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>;
+
+def: Pat<(sra I32:$src1, I32:$src2), (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(srl I32:$src1, I32:$src2), (S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I32:$src1, I32:$src2), (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(shl I32:$src1, I32:$src2), (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>;
+
+def SDTHexagonINSERT:
+  SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                       SDTCisInt<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTHexagonINSERTRP:
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                       SDTCisInt<0>, SDTCisVT<3, i64>]>;
+
+def HexagonINSERT   : SDNode<"HexagonISD::INSERT",   SDTHexagonINSERT>;
+def HexagonINSERTRP : SDNode<"HexagonISD::INSERTRP", SDTHexagonINSERTRP>;
+
+def: Pat<(HexagonINSERT I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2),
+         (S2_insert I32:$Rs, I32:$Rt, u5_0ImmPred:$u1, u5_0ImmPred:$u2)>;
+def: Pat<(HexagonINSERT I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2),
+         (S2_insertp I64:$Rs, I64:$Rt, u6_0ImmPred:$u1, u6_0ImmPred:$u2)>;
+def: Pat<(HexagonINSERTRP I32:$Rs, I32:$Rt, I64:$Ru),
+         (S2_insert_rp I32:$Rs, I32:$Rt, I64:$Ru)>;
+def: Pat<(HexagonINSERTRP I64:$Rs, I64:$Rt, I64:$Ru),
+         (S2_insertp_rp I64:$Rs, I64:$Rt, I64:$Ru)>;
+
+let AddedComplexity = 100 in
+def: Pat<(or (or (shl (HexagonINSERT (i32 (zextloadi8 (add I32:$b, 2))),
+                                     (i32 (extloadi8  (add I32:$b, 3))),
+                                     24, 8),
+                      (i32 16)),
+                 (shl (i32 (zextloadi8 (add I32:$b, 1))), (i32 8))),
+             (zextloadi8 I32:$b)),
+         (A2_swiz (L2_loadri_io I32:$b, 0))>;
+
+def SDTHexagonEXTRACTU:
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
+                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTHexagonEXTRACTURP:
+  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<1>,
+                       SDTCisVT<2, i64>]>;
+
+def HexagonEXTRACTU   : SDNode<"HexagonISD::EXTRACTU",   SDTHexagonEXTRACTU>;
+def HexagonEXTRACTURP : SDNode<"HexagonISD::EXTRACTURP", SDTHexagonEXTRACTURP>;
+
+def: Pat<(HexagonEXTRACTU I32:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3),
+         (S2_extractu I32:$src1, u5_0ImmPred:$src2, u5_0ImmPred:$src3)>;
+def: Pat<(HexagonEXTRACTU I64:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3),
+         (S2_extractup I64:$src1, u6_0ImmPred:$src2, u6_0ImmPred:$src3)>;
+def: Pat<(HexagonEXTRACTURP I32:$src1, I64:$src2),
+         (S2_extractu_rp I32:$src1, I64:$src2)>;
+def: Pat<(HexagonEXTRACTURP I64:$src1, I64:$src2),
+         (S2_extractup_rp I64:$src1, I64:$src2)>;
+
+def n8_0ImmPred: PatLeaf<(i32 imm), [{
+  int64_t V = N->getSExtValue();
+  return -255 <= V && V <= 0;
+}]>;
+
+// Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
+def: Pat<(mul I32:$src1, (ineg n8_0ImmPred:$src2)),
+         (M2_mpysin IntRegs:$src1, u8_0ImmPred:$src2)>;
+
+multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
+  defm: T_MinMax_pats<Op, I64, Inst, SwapInst>;
+}
+
+def: Pat<(add (Sext64 I32:$Rs), I64:$Rt),
+         (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>;
+
+let AddedComplexity = 200 in {
+  defm: MinMax_pats_p<setge,  A2_maxp,  A2_minp>;
+  defm: MinMax_pats_p<setgt,  A2_maxp,  A2_minp>;
+  defm: MinMax_pats_p<setle,  A2_minp,  A2_maxp>;
+  defm: MinMax_pats_p<setlt,  A2_minp,  A2_maxp>;
+  defm: MinMax_pats_p<setuge, A2_maxup, A2_minup>;
+  defm: MinMax_pats_p<setugt, A2_maxup, A2_minup>;
+  defm: MinMax_pats_p<setule, A2_minup, A2_maxup>;
+  defm: MinMax_pats_p<setult, A2_minup, A2_maxup>;
+}
+
+def callv3 : SDNode<"HexagonISD::CALL", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+def callv3nr : SDNode<"HexagonISD::CALLnr", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+
+// Map call instruction
+def : Pat<(callv3 I32:$dst),
+          (J2_callr I32:$dst)>;
+def : Pat<(callv3 tglobaladdr:$dst),
+          (J2_call tglobaladdr:$dst)>;
+def : Pat<(callv3 texternalsym:$dst),
+          (J2_call texternalsym:$dst)>;
+def : Pat<(callv3 tglobaltlsaddr:$dst),
+          (J2_call tglobaltlsaddr:$dst)>;
+
+def : Pat<(callv3nr I32:$dst),
+          (PS_callr_nr I32:$dst)>;
+def : Pat<(callv3nr tglobaladdr:$dst),
+          (PS_call_nr tglobaladdr:$dst)>;
+def : Pat<(callv3nr texternalsym:$dst),
+          (PS_call_nr texternalsym:$dst)>;
+
+
+def addrga: PatLeaf<(i32 AddrGA:$Addr)>;
+def addrgp: PatLeaf<(i32 AddrGP:$Addr)>;
+
+
+// Pats for instruction selection.
+
+// A class to embed the usual comparison patfrags within a zext to i32.
+// The seteq/setne frags use "lhs" and "rhs" as operands, so use the same
+// names, or else the frag's "body" won't match the operands.
+class CmpInReg<PatFrag Op>
+  : PatFrag<(ops node:$lhs, node:$rhs),(i32 (zext (i1 Op.Fragment)))>;
+
+def: T_cmp32_rr_pat<A4_rcmpeq,  CmpInReg<seteq>, i32>;
+def: T_cmp32_rr_pat<A4_rcmpneq, CmpInReg<setne>, i32>;
+
+def: T_cmp32_rr_pat<C4_cmpneq,  setne,  i1>;
+def: T_cmp32_rr_pat<C4_cmplte,  setle,  i1>;
+def: T_cmp32_rr_pat<C4_cmplteu, setule, i1>;
+
+def: T_cmp32_rr_pat<C4_cmplte,  RevCmp<setge>,  i1>;
+def: T_cmp32_rr_pat<C4_cmplteu, RevCmp<setuge>, i1>;
+
+let AddedComplexity = 100 in {
+  def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt),
+                       255), 0)),
+           (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (setne (and (xor I32:$Rs, I32:$Rt),
+                       255), 0)),
+           (C2_not (A4_cmpbeq IntRegs:$Rs, IntRegs:$Rt))>;
+  def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt),
+                           65535), 0)),
+           (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt)>;
+  def: Pat<(i1 (setne (and (xor I32:$Rs, I32:$Rt),
+                           65535), 0)),
+           (C2_not (A4_cmpheq IntRegs:$Rs, IntRegs:$Rt))>;
+}
+
+def: Pat<(i32 (zext (i1 (seteq I32:$Rs, s32_0ImmPred:$s8)))),
+         (A4_rcmpeqi IntRegs:$Rs, s32_0ImmPred:$s8)>;
+def: Pat<(i32 (zext (i1 (setne I32:$Rs, s32_0ImmPred:$s8)))),
+         (A4_rcmpneqi IntRegs:$Rs, s32_0ImmPred:$s8)>;
+
+// Preserve the S2_tstbit_r generation
+def: Pat<(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, I32:$src2)),
+                                         I32:$src1)), 0)))),
+         (C2_muxii (S2_tstbit_r IntRegs:$src1, IntRegs:$src2), 1, 0)>;
+
+// The complexity of the combines involving immediates should be greater
+// than the complexity of the combine with two registers.
+let AddedComplexity = 50 in {
+def: Pat<(HexagonCOMBINE IntRegs:$r, s32_0ImmPred:$i),
+         (A4_combineri IntRegs:$r, s32_0ImmPred:$i)>;
+
+def: Pat<(HexagonCOMBINE s32_0ImmPred:$i, IntRegs:$r),
+         (A4_combineir s32_0ImmPred:$i, IntRegs:$r)>;
+}
+
+// The complexity of the combine with two immediates should be greater than
+// the complexity of a combine involving a register.
+let AddedComplexity = 75 in {
+def: Pat<(HexagonCOMBINE s8_0ImmPred:$s8, u32_0ImmPred:$u6),
+         (A4_combineii imm:$s8, imm:$u6)>;
+def: Pat<(HexagonCOMBINE s32_0ImmPred:$s8, s8_0ImmPred:$S8),
+         (A2_combineii imm:$s8, imm:$S8)>;
+}
+
+
+def ToZext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A4_combineir 0, (i32 $Rs)))>;
+def ToSext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A2_sxtw (i32 $Rs)))>;
+
+// Patterns to generate indexed loads with different forms of the address:
+// - frameindex,
+// - base + offset,
+// - base (without offset).
+multiclass Loadxm_pat<PatFrag Load, ValueType VT, PatFrag ValueMod,
+                      PatLeaf ImmPred, InstHexagon MI> {
+  def: Pat<(VT (Load AddrFI:$fi)),
+           (VT (ValueMod (MI AddrFI:$fi, 0)))>;
+  def: Pat<(VT (Load (add AddrFI:$fi, ImmPred:$Off))),
+           (VT (ValueMod (MI AddrFI:$fi, imm:$Off)))>;
+  def: Pat<(VT (Load (add IntRegs:$Rs, ImmPred:$Off))),
+           (VT (ValueMod (MI IntRegs:$Rs, imm:$Off)))>;
+  def: Pat<(VT (Load I32:$Rs)),
+           (VT (ValueMod (MI IntRegs:$Rs, 0)))>;
+}
+
+defm: Loadxm_pat<extloadi1,   i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi8,   i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<extloadi16,  i64, ToZext64, s31_1ImmPred, L2_loadruh_io>;
+defm: Loadxm_pat<zextloadi1,  i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi8,  i64, ToZext64, s32_0ImmPred, L2_loadrub_io>;
+defm: Loadxm_pat<zextloadi16, i64, ToZext64, s31_1ImmPred, L2_loadruh_io>;
+defm: Loadxm_pat<sextloadi8,  i64, ToSext64, s32_0ImmPred, L2_loadrb_io>;
+defm: Loadxm_pat<sextloadi16, i64, ToSext64, s31_1ImmPred, L2_loadrh_io>;
+
+// Map Rdd = anyext(Rs) -> Rdd = combine(#0, Rs).
+def: Pat<(Aext64 I32:$src1), (ToZext64 IntRegs:$src1)>;
+
+multiclass T_LoadAbsReg_Pat <PatFrag ldOp, InstHexagon MI, ValueType VT = i32> {
+  def  : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+                             (HexagonCONST32 tglobaladdr:$src3)))),
+              (MI IntRegs:$src1, u2_0ImmPred:$src2, tglobaladdr:$src3)>;
+  def  : Pat <(VT (ldOp (add IntRegs:$src1,
+                             (HexagonCONST32 tglobaladdr:$src2)))),
+              (MI IntRegs:$src1, 0, tglobaladdr:$src2)>;
+
+  def  : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+                             (HexagonCONST32 tconstpool:$src3)))),
+              (MI IntRegs:$src1, u2_0ImmPred:$src2, tconstpool:$src3)>;
+  def  : Pat <(VT (ldOp (add IntRegs:$src1,
+                             (HexagonCONST32 tconstpool:$src2)))),
+              (MI IntRegs:$src1, 0, tconstpool:$src2)>;
+
+  def  : Pat <(VT (ldOp (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+                             (HexagonCONST32 tjumptable:$src3)))),
+              (MI IntRegs:$src1, u2_0ImmPred:$src2, tjumptable:$src3)>;
+  def  : Pat <(VT (ldOp (add IntRegs:$src1,
+                             (HexagonCONST32 tjumptable:$src2)))),
+              (MI IntRegs:$src1, 0, tjumptable:$src2)>;
+}
+
+let AddedComplexity  = 60 in {
+defm : T_LoadAbsReg_Pat <sextloadi8, L4_loadrb_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi8, L4_loadrub_ur>;
+defm : T_LoadAbsReg_Pat <extloadi8,  L4_loadrub_ur>;
+
+defm : T_LoadAbsReg_Pat <sextloadi16, L4_loadrh_ur>;
+defm : T_LoadAbsReg_Pat <zextloadi16, L4_loadruh_ur>;
+defm : T_LoadAbsReg_Pat <extloadi16,  L4_loadruh_ur>;
+
+defm : T_LoadAbsReg_Pat <load, L4_loadri_ur>;
+defm : T_LoadAbsReg_Pat <load, L4_loadrd_ur, i64>;
+}
+
+// 'def pats' for load instructions with base + register offset and non-zero
+// immediate value. Immediate value is used to left-shift the second
+// register operand.
+class Loadxs_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs,
+                       (i32 (shl I32:$Rt, u2_0ImmPred:$u2))))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2))>;
+
+let AddedComplexity = 40 in {
+  def: Loadxs_pat<extloadi8,   i32, L4_loadrub_rr>;
+  def: Loadxs_pat<zextloadi8,  i32, L4_loadrub_rr>;
+  def: Loadxs_pat<sextloadi8,  i32, L4_loadrb_rr>;
+  def: Loadxs_pat<extloadi16,  i32, L4_loadruh_rr>;
+  def: Loadxs_pat<zextloadi16, i32, L4_loadruh_rr>;
+  def: Loadxs_pat<sextloadi16, i32, L4_loadrh_rr>;
+  def: Loadxs_pat<load,        i32, L4_loadri_rr>;
+  def: Loadxs_pat<load,        i64, L4_loadrd_rr>;
+}
+
+// 'def pats' for load instruction base + register offset and
+// zero immediate value.
+class Loadxs_simple_pat<PatFrag Load, ValueType VT, InstHexagon MI>
+  : Pat<(VT (Load (add I32:$Rs, I32:$Rt))),
+        (VT (MI IntRegs:$Rs, IntRegs:$Rt, 0))>;
+
+let AddedComplexity = 20 in {
+  def: Loadxs_simple_pat<extloadi8,   i32, L4_loadrub_rr>;
+  def: Loadxs_simple_pat<zextloadi8,  i32, L4_loadrub_rr>;
+  def: Loadxs_simple_pat<sextloadi8,  i32, L4_loadrb_rr>;
+  def: Loadxs_simple_pat<extloadi16,  i32, L4_loadruh_rr>;
+  def: Loadxs_simple_pat<zextloadi16, i32, L4_loadruh_rr>;
+  def: Loadxs_simple_pat<sextloadi16, i32, L4_loadrh_rr>;
+  def: Loadxs_simple_pat<load,        i32, L4_loadri_rr>;
+  def: Loadxs_simple_pat<load,        i64, L4_loadrd_rr>;
+}
+
+// zext i1->i64
+def: Pat<(i64 (zext I1:$src1)),
+         (ToZext64 (C2_muxii PredRegs:$src1, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(Zext64 I32:$src1),
+         (ToZext64 IntRegs:$src1)>;
+
+let AddedComplexity = 40 in
+multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
+                           PatFrag stOp> {
+ def : Pat<(stOp (VT RC:$src4),
+                 (add (shl I32:$src1, u2_0ImmPred:$src2),
+                      u32_0ImmPred:$src3)),
+          (MI IntRegs:$src1, u2_0ImmPred:$src2, u32_0ImmPred:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+                 (add (shl IntRegs:$src1, u2_0ImmPred:$src2),
+                      (HexagonCONST32 tglobaladdr:$src3))),
+           (MI IntRegs:$src1, u2_0ImmPred:$src2, tglobaladdr:$src3, RC:$src4)>;
+
+ def : Pat<(stOp (VT RC:$src4),
+                 (add IntRegs:$src1, (HexagonCONST32 tglobaladdr:$src3))),
+           (MI IntRegs:$src1, 0, tglobaladdr:$src3, RC:$src4)>;
+}
+
+defm : T_StoreAbsReg_Pats <S4_storerd_ur, DoubleRegs, i64, store>;
+defm : T_StoreAbsReg_Pats <S4_storeri_ur, IntRegs, i32, store>;
+defm : T_StoreAbsReg_Pats <S4_storerb_ur, IntRegs, i32, truncstorei8>;
+defm : T_StoreAbsReg_Pats <S4_storerh_ur, IntRegs, i32, truncstorei16>;
+
+class Storexs_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Ru, (add I32:$Rs,
+                               (i32 (shl I32:$Rt, u2_0ImmPred:$u2)))),
+        (MI IntRegs:$Rs, IntRegs:$Rt, imm:$u2, Value:$Ru)>;
+
+let AddedComplexity = 40 in {
+  def: Storexs_pat<truncstorei8,  I32, S4_storerb_rr>;
+  def: Storexs_pat<truncstorei16, I32, S4_storerh_rr>;
+  def: Storexs_pat<store,         I32, S4_storeri_rr>;
+  def: Storexs_pat<store,         I64, S4_storerd_rr>;
+}
+
+def s30_2ProperPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<30,2>(v) && !isShiftedInt<29,3>(v);
+}]>;
+def RoundTo8 : SDNodeXForm<imm, [{
+  int32_t Imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(Imm & -8, SDLoc(N), MVT::i32);
+}]>;
+
+let AddedComplexity = 40 in
+def: Pat<(store I64:$Ru, (add I32:$Rs, s30_2ProperPred:$Off)),
+         (S2_storerd_io (A2_addi I32:$Rs, 4), (RoundTo8 $Off), I64:$Ru)>;
+
+class Store_rr_pat<PatFrag Store, PatFrag Value, InstHexagon MI>
+  : Pat<(Store Value:$Ru, (add I32:$Rs, I32:$Rt)),
+        (MI IntRegs:$Rs, IntRegs:$Rt, 0, Value:$Ru)>;
+
+let AddedComplexity = 20 in {
+  def: Store_rr_pat<truncstorei8,  I32, S4_storerb_rr>;
+  def: Store_rr_pat<truncstorei16, I32, S4_storerh_rr>;
+  def: Store_rr_pat<store,         I32, S4_storeri_rr>;
+  def: Store_rr_pat<store,         I64, S4_storerd_rr>;
+}
+
+
+def IMM_BYTE : SDNodeXForm<imm, [{
+  // -1 etc is  represented as 255 etc
+  // assigning to a byte restores our desired signed value.
+  int8_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def IMM_HALF : SDNodeXForm<imm, [{
+  // -1 etc is  represented as 65535 etc
+  // assigning to a short restores our desired signed value.
+  int16_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def IMM_WORD : SDNodeXForm<imm, [{
+  // -1 etc can be represented as 4294967295 etc
+  // Currently, it's not doing this. But some optimization
+  // might convert -1 to a large +ve number.
+  // assigning to a word restores our desired signed value.
+  int32_t imm = N->getSExtValue();
+  return CurDAG->getTargetConstant(imm, SDLoc(N), MVT::i32);
+}]>;
+
+def ToImmByte : OutPatFrag<(ops node:$R), (IMM_BYTE $R)>;
+def ToImmHalf : OutPatFrag<(ops node:$R), (IMM_HALF $R)>;
+def ToImmWord : OutPatFrag<(ops node:$R), (IMM_WORD $R)>;
+
+// Emit store-immediate, but only when the stored value will not be constant-
+// extended. The reason for that is that there is no pass that can optimize
+// constant extenders in store-immediate instructions. In some cases we can
+// end up will a number of such stores, all of which store the same extended
+// value (e.g. after unrolling a loop that initializes floating point array).
+
+// Predicates to determine if the 16-bit immediate is expressible as a sign-
+// extended 8-bit immediate. Store-immediate-halfword will ignore any bits
+// beyond 0..15, so we don't care what is in there.
+
+def i16in8ImmPred: PatLeaf<(i32 imm), [{
+  int64_t v = (int16_t)N->getSExtValue();
+  return v == (int64_t)(int8_t)v;
+}]>;
+
+// Predicates to determine if the 32-bit immediate is expressible as a sign-
+// extended 8-bit immediate.
+def i32in8ImmPred: PatLeaf<(i32 imm), [{
+  int64_t v = (int32_t)N->getSExtValue();
+  return v == (int64_t)(int8_t)v;
+}]>;
+
+
+let AddedComplexity = 40 in {
+  // Even though the offset is not extendable in the store-immediate, we
+  // can still generate the fi# in the base address. If the final offset
+  // is not valid for the instruction, we will replace it with a scratch
+  // register.
+//  def: Storexm_fi_pat <truncstorei8, s32_0ImmPred, ToImmByte, S4_storeirb_io>;
+//  def: Storexm_fi_pat <truncstorei16, i16in8ImmPred, ToImmHalf,
+//                       S4_storeirh_io>;
+//  def: Storexm_fi_pat <store, i32in8ImmPred, ToImmWord, S4_storeiri_io>;
+
+//  defm: Storexm_fi_add_pat <truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
+//                            S4_storeirb_io>;
+//  defm: Storexm_fi_add_pat <truncstorei16, i16in8ImmPred, u6_1ImmPred,
+//                            ToImmHalf, S4_storeirh_io>;
+//  defm: Storexm_fi_add_pat <store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+//                            S4_storeiri_io>;
+
+  defm: Storexm_add_pat<truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
+                        S4_storeirb_io>;
+  defm: Storexm_add_pat<truncstorei16, i16in8ImmPred, u6_1ImmPred, ToImmHalf,
+                        S4_storeirh_io>;
+  defm: Storexm_add_pat<store, i32in8ImmPred, u6_2ImmPred, ToImmWord,
+                        S4_storeiri_io>;
+}
+
+def: Storexm_simple_pat<truncstorei8,  s32_0ImmPred, ToImmByte, S4_storeirb_io>;
+def: Storexm_simple_pat<truncstorei16, s32_0ImmPred, ToImmHalf, S4_storeirh_io>;
+def: Storexm_simple_pat<store,         s32_0ImmPred, ToImmWord, S4_storeiri_io>;
+
+// op(Ps, op(Pt, Pu))
+class LogLog_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+  : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, I1:$Pu))),
+        (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+// op(Ps, op(Pt, ~Pu))
+class LogLogNot_pat<SDNode Op1, SDNode Op2, InstHexagon MI>
+  : Pat<(i1 (Op1 I1:$Ps, (Op2 I1:$Pt, (not I1:$Pu)))),
+        (MI I1:$Ps, I1:$Pt, I1:$Pu)>;
+
+def: LogLog_pat<and, and, C4_and_and>;
+def: LogLog_pat<and, or,  C4_and_or>;
+def: LogLog_pat<or,  and, C4_or_and>;
+def: LogLog_pat<or,  or,  C4_or_or>;
+
+def: LogLogNot_pat<and, and, C4_and_andn>;
+def: LogLogNot_pat<and, or,  C4_and_orn>;
+def: LogLogNot_pat<or,  and, C4_or_andn>;
+def: LogLogNot_pat<or,  or,  C4_or_orn>;
+
+//===----------------------------------------------------------------------===//
+// PIC: Support for PIC compilations. The patterns and SD nodes defined
+// below are needed to support code generation for PIC
+//===----------------------------------------------------------------------===//
+
+def SDT_HexagonAtGot
+  : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def SDT_HexagonAtPcrel
+  : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// AT_GOT address-of-GOT, address-of-global, offset-in-global
+def HexagonAtGot       : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>;
+// AT_PCREL address-of-global
+def HexagonAtPcrel     : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>;
+
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)),
+         (L2_loadri_io I32:$got, imm:$addr)>;
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off),
+         (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>;
+def: Pat<(HexagonAtPcrel I32:$addr),
+         (C4_addipc imm:$addr)>;
+
+def: Pat<(i64 (and I64:$Rs, (i64 (not I64:$Rt)))),
+         (A4_andnp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(i64 (or  I64:$Rs, (i64 (not I64:$Rt)))),
+         (A4_ornp DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+def: Pat<(add I32:$Rs, (add I32:$Ru, s32_0ImmPred:$s6)),
+         (S4_addaddi IntRegs:$Rs, IntRegs:$Ru, imm:$s6)>;
+
+// Rd=add(Rs,sub(#s6,Ru))
+def: Pat<(add I32:$src1, (sub s32_0ImmPred:$src2,
+                                        I32:$src3)),
+         (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+// Rd=sub(add(Rs,#s6),Ru)
+def: Pat<(sub (add I32:$src1, s32_0ImmPred:$src2),
+                   I32:$src3),
+         (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+// Rd=add(sub(Rs,Ru),#s6)
+def: Pat<(add (sub I32:$src1, I32:$src3),
+                   (s32_0ImmPred:$src2)),
+         (S4_subaddi IntRegs:$src1, s32_0ImmPred:$src2, IntRegs:$src3)>;
+
+def: Pat<(xor I64:$dst2,
+              (xor I64:$Rss, I64:$Rtt)),
+         (M4_xor_xacc DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt)>;
+def: Pat<(or I32:$Ru, (and (i32 IntRegs:$_src_), s32_0ImmPred:$s10)),
+         (S4_or_andix IntRegs:$Ru, IntRegs:$_src_, imm:$s10)>;
+
+def: Pat<(or I32:$src1, (and I32:$Rs, s32_0ImmPred:$s10)),
+         (S4_or_andi IntRegs:$src1, IntRegs:$Rs, imm:$s10)>;
+
+def: Pat<(or I32:$src1, (or I32:$Rs, s32_0ImmPred:$s10)),
+         (S4_or_ori IntRegs:$src1, IntRegs:$Rs, imm:$s10)>;
+
+
+
+// Count trailing zeros: 64-bit.
+def: Pat<(i32 (trunc (cttz I64:$Rss))), (S2_ct0p I64:$Rss)>;
+
+// Count trailing ones: 64-bit.
+def: Pat<(i32 (trunc (cttz (not I64:$Rss)))), (S2_ct1p I64:$Rss)>;
+
+// Define leading/trailing patterns that require zero-extensions to 64 bits.
+def: Pat<(i64 (ctlz I64:$Rss)), (ToZext64 (S2_cl0p I64:$Rss))>;
+def: Pat<(i64 (cttz I64:$Rss)), (ToZext64 (S2_ct0p I64:$Rss))>;
+def: Pat<(i64 (ctlz (not I64:$Rss))), (ToZext64 (S2_cl1p I64:$Rss))>;
+def: Pat<(i64 (cttz (not I64:$Rss))), (ToZext64 (S2_ct1p I64:$Rss))>;
+
+
+let AddedComplexity = 20 in {   // Complexity greater than cmp reg-imm.
+  def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
+           (S4_ntstbit_i I32:$Rs, u5_0ImmPred:$u5)>;
+  def: Pat<(i1 (seteq (and (shl 1, I32:$Rt), I32:$Rs), 0)),
+           (S4_ntstbit_r I32:$Rs, I32:$Rt)>;
+}
+
+// Add extra complexity to prefer these instructions over bitsset/bitsclr.
+// The reason is that tstbit/ntstbit can be folded into a compound instruction:
+//   if ([!]tstbit(...)) jump ...
+let AddedComplexity = 100 in
+def: Pat<(i1 (setne (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
+         (S2_tstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+
+let AddedComplexity = 100 in
+def: Pat<(i1 (seteq (and I32:$Rs, (i32 IsPow2_32:$u5)), (i32 0))),
+         (S4_ntstbit_i I32:$Rs, (Log2_32 imm:$u5))>;
+
+// Do not increase complexity of these patterns. In the DAG, "cmp i8" may be
+// represented as a compare against "value & 0xFF", which is an exact match
+// for cmpb (same for cmph). The patterns below do not contain any additional
+// complexity that would make them preferable, and if they were actually used
+// instead of cmpb/cmph, they would result in a compare against register that
+// is loaded with the byte/half mask (i.e. 0xFF or 0xFFFF).
+def: Pat<(i1 (setne (and I32:$Rs, u6_0ImmPred:$u6), 0)),
+         (C4_nbitsclri I32:$Rs, u6_0ImmPred:$u6)>;
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), 0)),
+         (C4_nbitsclr I32:$Rs, I32:$Rt)>;
+def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)),
+         (C4_nbitsset I32:$Rs, I32:$Rt)>;
+
+
+def: Pat<(add (mul I32:$Rs, u6_0ImmPred:$U6), u32_0ImmPred:$u6),
+         (M4_mpyri_addi imm:$u6, IntRegs:$Rs, imm:$U6)>;
+def: Pat<(add (mul I32:$Rs, I32:$Rt), u32_0ImmPred:$u6),
+         (M4_mpyrr_addi imm:$u6, IntRegs:$Rs, IntRegs:$Rt)>;
+
+def: Pat<(add I32:$src1, (mul I32:$src3, u6_2ImmPred:$src2)),
+         (M4_mpyri_addr_u2 IntRegs:$src1, imm:$src2, IntRegs:$src3)>;
+def: Pat<(add I32:$src1, (mul I32:$src3, u32_0ImmPred:$src2)),
+         (M4_mpyri_addr IntRegs:$src1, IntRegs:$src3, imm:$src2)>;
+
+def: Pat<(add I32:$Ru, (mul (i32 IntRegs:$_src_), I32:$Rs)),
+         (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs)>;
+
+def: T_vcmp_pat<A4_vcmpbgt, setgt, v8i8>;
+
+class T_Shift_CommOp_pat<InstHexagon MI, SDNode Op, SDNode ShOp>
+  : Pat<(Op (ShOp IntRegs:$Rx, u5_0ImmPred:$U5), u32_0ImmPred:$u8),
+        (MI u32_0ImmPred:$u8, IntRegs:$Rx, u5_0ImmPred:$U5)>;
+
+let AddedComplexity = 200 in {
+  def : T_Shift_CommOp_pat <S4_addi_asl_ri, add, shl>;
+  def : T_Shift_CommOp_pat <S4_addi_lsr_ri, add, srl>;
+  def : T_Shift_CommOp_pat <S4_andi_asl_ri, and, shl>;
+  def : T_Shift_CommOp_pat <S4_andi_lsr_ri, and, srl>;
+}
+
+let AddedComplexity = 30 in {
+  def : T_Shift_CommOp_pat <S4_ori_asl_ri,  or,  shl>;
+  def : T_Shift_CommOp_pat <S4_ori_lsr_ri,  or,  srl>;
+}
+
+class T_Shift_Op_pat<InstHexagon MI, SDNode Op, SDNode ShOp>
+  : Pat<(Op u32_0ImmPred:$u8, (ShOp IntRegs:$Rx, u5_0ImmPred:$U5)),
+        (MI u32_0ImmPred:$u8, IntRegs:$Rx, u5_0ImmPred:$U5)>;
+
+def : T_Shift_Op_pat <S4_subi_asl_ri, sub, shl>;
+def : T_Shift_Op_pat <S4_subi_lsr_ri, sub, srl>;
+
+let AddedComplexity = 200 in {
+  def: Pat<(add addrga:$addr, (shl I32:$src2, u5_0ImmPred:$src3)),
+           (S4_addi_asl_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+  def: Pat<(add addrga:$addr, (srl I32:$src2, u5_0ImmPred:$src3)),
+           (S4_addi_lsr_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+  def: Pat<(sub addrga:$addr, (shl I32:$src2, u5_0ImmPred:$src3)),
+           (S4_subi_asl_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+  def: Pat<(sub addrga:$addr, (srl I32:$src2, u5_0ImmPred:$src3)),
+           (S4_subi_lsr_ri addrga:$addr, IntRegs:$src2, u5_0ImmPred:$src3)>;
+}
+
+def: Pat<(shl s6_0ImmPred:$s6, I32:$Rt),
+         (S4_lsli imm:$s6, IntRegs:$Rt)>;
+
+
+//===----------------------------------------------------------------------===//
+// MEMOP
+//===----------------------------------------------------------------------===//
+
+def m5_0Imm8Pred : PatLeaf<(i32 imm), [{
+  int8_t V = N->getSExtValue();
+  return -32 < V && V <= -1;
+}]>;
+
+def m5_0Imm16Pred : PatLeaf<(i32 imm), [{
+  int16_t V = N->getSExtValue();
+  return -32 < V && V <= -1;
+}]>;
+
+def m5_0ImmPred  : PatLeaf<(i32 imm), [{
+  int64_t V = N->getSExtValue();
+  return -31 <= V && V <= -1;
+}]>;
+
+def IsNPow2_8 : PatLeaf<(i32 imm), [{
+  uint8_t NV = ~N->getZExtValue();
+  return isPowerOf2_32(NV);
+}]>;
+
+def IsNPow2_16 : PatLeaf<(i32 imm), [{
+  uint16_t NV = ~N->getZExtValue();
+  return isPowerOf2_32(NV);
+}]>;
+
+def Log2_8 : SDNodeXForm<imm, [{
+  uint8_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32);
+}]>;
+
+def Log2_16 : SDNodeXForm<imm, [{
+  uint16_t V = N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32);
+}]>;
+
+def LogN2_8 : SDNodeXForm<imm, [{
+  uint8_t NV = ~N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(NV), SDLoc(N), MVT::i32);
+}]>;
+
+def LogN2_16 : SDNodeXForm<imm, [{
+  uint16_t NV = ~N->getZExtValue();
+  return CurDAG->getTargetConstant(Log2_32(NV), SDLoc(N), MVT::i32);
+}]>;
+
+def NegImm8 : SDNodeXForm<imm, [{
+  int8_t NV = -N->getSExtValue();
+  return CurDAG->getTargetConstant(NV, SDLoc(N), MVT::i32);
+}]>;
+
+def NegImm16 : SDNodeXForm<imm, [{
+  int16_t NV = -N->getSExtValue();
+  return CurDAG->getTargetConstant(NV, SDLoc(N), MVT::i32);
+}]>;
+
+def NegImm32 : SDNodeXForm<imm, [{
+  int32_t NV = -N->getSExtValue();
+  return CurDAG->getTargetConstant(NV, SDLoc(N), MVT::i32);
+}]>;
+
+def IdImm : SDNodeXForm<imm, [{ return SDValue(N, 0); }]>;
+
+multiclass Memopxr_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+                              InstHexagon MI> {
+  // Addr: i32
+  def: Pat<(Store (Oper (Load I32:$Rs), I32:$A), I32:$Rs),
+           (MI I32:$Rs, 0, I32:$A)>;
+  // Addr: fi
+  def: Pat<(Store (Oper (Load AddrFI:$Rs), I32:$A), AddrFI:$Rs),
+           (MI AddrFI:$Rs, 0, I32:$A)>;
+}
+
+multiclass Memopxr_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+                           SDNode Oper, InstHexagon MI> {
+  // Addr: i32
+  def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), I32:$A),
+                  (add I32:$Rs, ImmPred:$Off)),
+           (MI I32:$Rs, imm:$Off, I32:$A)>;
+  def: Pat<(Store (Oper (Load (IsOrAdd I32:$Rs, ImmPred:$Off)), I32:$A),
+                  (IsOrAdd I32:$Rs, ImmPred:$Off)),
+           (MI I32:$Rs, imm:$Off, I32:$A)>;
+  // Addr: fi
+  def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+                  (add AddrFI:$Rs, ImmPred:$Off)),
+           (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+  def: Pat<(Store (Oper (Load (IsOrAdd AddrFI:$Rs, ImmPred:$Off)), I32:$A),
+                  (IsOrAdd AddrFI:$Rs, ImmPred:$Off)),
+           (MI AddrFI:$Rs, imm:$Off, I32:$A)>;
+}
+
+multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+                       SDNode Oper, InstHexagon MI> {
+  defm: Memopxr_simple_pat <Load, Store,          Oper, MI>;
+  defm: Memopxr_add_pat    <Load, Store, ImmPred, Oper, MI>;
+}
+
+let AddedComplexity = 180 in {
+  // add reg
+  defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, add,
+        /*anyext*/  L4_add_memopb_io>;
+  defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, add,
+        /*sext*/    L4_add_memopb_io>;
+  defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, add,
+        /*zext*/    L4_add_memopb_io>;
+  defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, add,
+        /*anyext*/  L4_add_memoph_io>;
+  defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, add,
+        /*sext*/    L4_add_memoph_io>;
+  defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, add,
+        /*zext*/    L4_add_memoph_io>;
+  defm: Memopxr_pat<load, store, u6_2ImmPred, add, L4_add_memopw_io>;
+
+  // sub reg
+  defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, sub,
+        /*anyext*/  L4_sub_memopb_io>;
+  defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub,
+        /*sext*/    L4_sub_memopb_io>;
+  defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub,
+        /*zext*/    L4_sub_memopb_io>;
+  defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, sub,
+        /*anyext*/  L4_sub_memoph_io>;
+  defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub,
+        /*sext*/    L4_sub_memoph_io>;
+  defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub,
+        /*zext*/    L4_sub_memoph_io>;
+  defm: Memopxr_pat<load, store, u6_2ImmPred, sub, L4_sub_memopw_io>;
+
+  // and reg
+  defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, and,
+        /*anyext*/  L4_and_memopb_io>;
+  defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, and,
+        /*sext*/    L4_and_memopb_io>;
+  defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, and,
+        /*zext*/    L4_and_memopb_io>;
+  defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, and,
+        /*anyext*/  L4_and_memoph_io>;
+  defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, and,
+        /*sext*/    L4_and_memoph_io>;
+  defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, and,
+        /*zext*/    L4_and_memoph_io>;
+  defm: Memopxr_pat<load, store, u6_2ImmPred, and, L4_and_memopw_io>;
+
+  // or reg
+  defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, or,
+        /*anyext*/  L4_or_memopb_io>;
+  defm: Memopxr_pat<sextloadi8, truncstorei8, u6_0ImmPred, or,
+        /*sext*/    L4_or_memopb_io>;
+  defm: Memopxr_pat<zextloadi8, truncstorei8, u6_0ImmPred, or,
+        /*zext*/    L4_or_memopb_io>;
+  defm: Memopxr_pat<extloadi16, truncstorei16, u6_1ImmPred, or,
+        /*anyext*/  L4_or_memoph_io>;
+  defm: Memopxr_pat<sextloadi16, truncstorei16, u6_1ImmPred, or,
+        /*sext*/    L4_or_memoph_io>;
+  defm: Memopxr_pat<zextloadi16, truncstorei16, u6_1ImmPred, or,
+        /*zext*/    L4_or_memoph_io>;
+  defm: Memopxr_pat<load, store, u6_2ImmPred, or, L4_or_memopw_io>;
+}
+
+
+multiclass Memopxi_simple_pat<PatFrag Load, PatFrag Store, SDNode Oper,
+                              PatFrag Arg, SDNodeXForm ArgMod,
+                              InstHexagon MI> {
+  // Addr: i32
+  def: Pat<(Store (Oper (Load I32:$Rs), Arg:$A), I32:$Rs),
+           (MI I32:$Rs, 0, (ArgMod Arg:$A))>;
+  // Addr: fi
+  def: Pat<(Store (Oper (Load AddrFI:$Rs), Arg:$A), AddrFI:$Rs),
+           (MI AddrFI:$Rs, 0, (ArgMod Arg:$A))>;
+}
+
+multiclass Memopxi_add_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+                           SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+                           InstHexagon MI> {
+  // Addr: i32
+  def: Pat<(Store (Oper (Load (add I32:$Rs, ImmPred:$Off)), Arg:$A),
+                  (add I32:$Rs, ImmPred:$Off)),
+           (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+  def: Pat<(Store (Oper (Load (IsOrAdd I32:$Rs, ImmPred:$Off)), Arg:$A),
+                  (IsOrAdd I32:$Rs, ImmPred:$Off)),
+           (MI I32:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+  // Addr: fi
+  def: Pat<(Store (Oper (Load (add AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+                  (add AddrFI:$Rs, ImmPred:$Off)),
+           (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+  def: Pat<(Store (Oper (Load (IsOrAdd AddrFI:$Rs, ImmPred:$Off)), Arg:$A),
+                  (IsOrAdd AddrFI:$Rs, ImmPred:$Off)),
+           (MI AddrFI:$Rs, imm:$Off, (ArgMod Arg:$A))>;
+}
+
+multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
+                       SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
+                       InstHexagon MI> {
+  defm: Memopxi_simple_pat <Load, Store,          Oper, Arg, ArgMod, MI>;
+  defm: Memopxi_add_pat    <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+}
+
+
+let AddedComplexity = 200 in {
+  // add imm
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+        /*anyext*/  IdImm, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+        /*sext*/    IdImm, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, u5_0ImmPred,
+        /*zext*/    IdImm, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+        /*anyext*/  IdImm, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+        /*sext*/    IdImm, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, u5_0ImmPred,
+        /*zext*/    IdImm, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, add, u5_0ImmPred, IdImm,
+                    L4_iadd_memopw_io>;
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+        /*anyext*/  NegImm8, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+        /*sext*/    NegImm8, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, m5_0Imm8Pred,
+        /*zext*/    NegImm8, L4_iadd_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+        /*anyext*/  NegImm16, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+        /*sext*/    NegImm16, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, m5_0Imm16Pred,
+        /*zext*/    NegImm16, L4_iadd_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, sub, m5_0ImmPred, NegImm32,
+                    L4_iadd_memopw_io>;
+
+  // sub imm
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+        /*anyext*/  IdImm, L4_isub_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+        /*sext*/    IdImm, L4_isub_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, sub, u5_0ImmPred,
+        /*zext*/    IdImm, L4_isub_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+        /*anyext*/  IdImm, L4_isub_memoph_io>;
+  defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+        /*sext*/    IdImm, L4_isub_memoph_io>;
+  defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, sub, u5_0ImmPred,
+        /*zext*/    IdImm, L4_isub_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, sub, u5_0ImmPred, IdImm,
+                    L4_isub_memopw_io>;
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+        /*anyext*/  NegImm8, L4_isub_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+        /*sext*/    NegImm8, L4_isub_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, add, m5_0Imm8Pred,
+        /*zext*/    NegImm8, L4_isub_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+        /*anyext*/  NegImm16, L4_isub_memoph_io>;
+  defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+        /*sext*/    NegImm16, L4_isub_memoph_io>;
+  defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, add, m5_0Imm16Pred,
+        /*zext*/    NegImm16, L4_isub_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, add, m5_0ImmPred, NegImm32,
+                    L4_isub_memopw_io>;
+
+  // clrbit imm
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+        /*anyext*/  LogN2_8, L4_iand_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+        /*sext*/    LogN2_8, L4_iand_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, and, IsNPow2_8,
+        /*zext*/    LogN2_8, L4_iand_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+        /*anyext*/  LogN2_16, L4_iand_memoph_io>;
+  defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+        /*sext*/    LogN2_16, L4_iand_memoph_io>;
+  defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, and, IsNPow2_16,
+        /*zext*/    LogN2_16, L4_iand_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, and, IsNPow2_32,
+		    LogN2_32, L4_iand_memopw_io>;
+
+  // setbit imm
+  defm: Memopxi_pat<extloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+        /*anyext*/  Log2_8, L4_ior_memopb_io>;
+  defm: Memopxi_pat<sextloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+        /*sext*/    Log2_8, L4_ior_memopb_io>;
+  defm: Memopxi_pat<zextloadi8, truncstorei8, u6_0ImmPred, or, IsPow2_32,
+        /*zext*/    Log2_8, L4_ior_memopb_io>;
+  defm: Memopxi_pat<extloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+        /*anyext*/  Log2_16, L4_ior_memoph_io>;
+  defm: Memopxi_pat<sextloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+        /*sext*/    Log2_16, L4_ior_memoph_io>;
+  defm: Memopxi_pat<zextloadi16, truncstorei16, u6_1ImmPred, or, IsPow2_32,
+        /*zext*/    Log2_16, L4_ior_memoph_io>;
+  defm: Memopxi_pat<load, store, u6_2ImmPred, or, IsPow2_32,
+		    Log2_32, L4_ior_memopw_io>;
+}
+
+def : T_CMP_pat <C4_cmpneqi,  setne,  s32_0ImmPred>;
+def : T_CMP_pat <C4_cmpltei,  setle,  s32_0ImmPred>;
+def : T_CMP_pat <C4_cmplteui, setule, u9_0ImmPred>;
+
+// Map cmplt(Rs, Imm) -> !cmpgt(Rs, Imm-1).
+def: Pat<(i1 (setlt I32:$src1, s32_0ImmPred:$src2)),
+         (C4_cmpltei IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2))>;
+
+// rs != rt -> !(rs == rt).
+def: Pat<(i1 (setne I32:$src1, s32_0ImmPred:$src2)),
+         (C4_cmpneqi IntRegs:$src1, s32_0ImmPred:$src2)>;
+
+// For the sequence
+//   zext( setult ( and(Rs, 255), u8))
+// Use the isdigit transformation below
+
+
+def u7_0PosImmPred : ImmLeaf<i32, [{
+  // True if the immediate fits in an 7-bit unsigned field and
+  // is strictly greater than 0.
+  return Imm > 0 && isUInt<7>(Imm);
+}]>;
+
+
+// Generate code of the form 'C2_muxii(cmpbgtui(Rdd, C-1),0,1)'
+// for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;.
+// The isdigit transformation relies on two 'clever' aspects:
+// 1) The data type is unsigned which allows us to eliminate a zero test after
+//    biasing the expression by 48. We are depending on the representation of
+//    the unsigned types, and semantics.
+// 2) The front end has converted <= 9 into < 10 on entry to LLVM
+//
+// For the C code:
+//   retval = ((c>='0') & (c<='9')) ? 1 : 0;
+// The code is transformed upstream of llvm into
+//   retval = (c-48) < 10 ? 1 : 0;
+
+let AddedComplexity = 139 in
+def: Pat<(i32 (zext (i1 (setult (and I32:$src1, 255), u7_0PosImmPred:$src2)))),
+         (C2_muxii (A4_cmpbgtui IntRegs:$src1, (UDEC1 imm:$src2)), 0, 1)>;
+
+class Loada_pat<PatFrag Load, ValueType VT, PatFrag Addr, InstHexagon MI>
+  : Pat<(VT (Load Addr:$addr)), (MI Addr:$addr)>;
+
+class Loadam_pat<PatFrag Load, ValueType VT, PatFrag Addr, PatFrag ValueMod,
+                 InstHexagon MI>
+  : Pat<(VT (Load Addr:$addr)), (ValueMod (MI Addr:$addr))>;
+
+class Storea_pat<PatFrag Store, PatFrag Value, PatFrag Addr, InstHexagon MI>
+  : Pat<(Store Value:$val, Addr:$addr), (MI Addr:$addr, Value:$val)>;
+
+class Stoream_pat<PatFrag Store, PatFrag Value, PatFrag Addr, PatFrag ValueMod,
+                  InstHexagon MI>
+  : Pat<(Store Value:$val, Addr:$addr),
+        (MI Addr:$addr, (ValueMod Value:$val))>;
+
+let AddedComplexity = 30 in {
+  def: Storea_pat<truncstorei8,  I32, addrga, PS_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, addrga, PS_storerhabs>;
+  def: Storea_pat<store,         I32, addrga, PS_storeriabs>;
+  def: Storea_pat<store,         I64, addrga, PS_storerdabs>;
+
+  def: Stoream_pat<truncstorei8,  I64, addrga, LoReg, PS_storerbabs>;
+  def: Stoream_pat<truncstorei16, I64, addrga, LoReg, PS_storerhabs>;
+  def: Stoream_pat<truncstorei32, I64, addrga, LoReg, PS_storeriabs>;
+}
+
+def: Storea_pat<SwapSt<atomic_store_8>,  I32, addrgp, S2_storerbgp>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, S2_storerhgp>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, S2_storerigp>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, S2_storerdgp>;
+
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrgp, S2_storerbgp>;
+  def: Storea_pat<truncstorei16, I32, addrgp, S2_storerhgp>;
+  def: Storea_pat<store,         I32, addrgp, S2_storerigp>;
+  def: Storea_pat<store,         I64, addrgp, S2_storerdgp>;
+
+  // Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+  //       to "r0 = 1; memw(#foo) = r0"
+  let AddedComplexity = 100 in
+  def: Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+           (S2_storerbgp tglobaladdr:$global, (A2_tfrsi 1))>;
+}
+
+class LoadAbs_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+  : Pat <(VT (ldOp (HexagonCONST32 tglobaladdr:$absaddr))),
+         (VT (MI tglobaladdr:$absaddr))>;
+
+let AddedComplexity  = 30 in {
+  def: LoadAbs_pats <load,        PS_loadriabs>;
+  def: LoadAbs_pats <zextloadi1,  PS_loadrubabs>;
+  def: LoadAbs_pats <sextloadi8,  PS_loadrbabs>;
+  def: LoadAbs_pats <extloadi8,   PS_loadrubabs>;
+  def: LoadAbs_pats <zextloadi8,  PS_loadrubabs>;
+  def: LoadAbs_pats <sextloadi16, PS_loadrhabs>;
+  def: LoadAbs_pats <extloadi16,  PS_loadruhabs>;
+  def: LoadAbs_pats <zextloadi16, PS_loadruhabs>;
+  def: LoadAbs_pats <load,        PS_loadrdabs, i64>;
+}
+
+let AddedComplexity  = 30 in
+def: Pat<(i64 (zextloadi1 (HexagonCONST32 tglobaladdr:$absaddr))),
+         (ToZext64 (PS_loadrubabs tglobaladdr:$absaddr))>;
+
+def: Loada_pat<atomic_load_8,  i32, addrgp, L2_loadrubgp>;
+def: Loada_pat<atomic_load_16, i32, addrgp, L2_loadruhgp>;
+def: Loada_pat<atomic_load_32, i32, addrgp, L2_loadrigp>;
+def: Loada_pat<atomic_load_64, i64, addrgp, L2_loadrdgp>;
+
+def: Loadam_pat<load, i1, addrga, I32toI1, PS_loadrubabs>;
+def: Loadam_pat<load, i1, addrgp, I32toI1, L2_loadrubgp>;
+
+def: Stoream_pat<store, I1, addrga, I1toI32, PS_storerbabs>;
+def: Stoream_pat<store, I1, addrgp, I1toI32, S2_storerbgp>;
+
+// Map from load(globaladdress) -> mem[u][bhwd](#foo)
+class LoadGP_pats <PatFrag ldOp, InstHexagon MI, ValueType VT = i32>
+  : Pat <(VT (ldOp (HexagonCONST32_GP tglobaladdr:$global))),
+         (VT (MI tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in {
+  def: LoadGP_pats <extloadi8,   L2_loadrubgp>;
+  def: LoadGP_pats <sextloadi8,  L2_loadrbgp>;
+  def: LoadGP_pats <zextloadi8,  L2_loadrubgp>;
+  def: LoadGP_pats <extloadi16,  L2_loadruhgp>;
+  def: LoadGP_pats <sextloadi16, L2_loadrhgp>;
+  def: LoadGP_pats <zextloadi16, L2_loadruhgp>;
+  def: LoadGP_pats <load,        L2_loadrigp>;
+  def: LoadGP_pats <load,        L2_loadrdgp, i64>;
+}
+
+// When the Interprocedural Global Variable optimizer realizes that a certain
+// global variable takes only two constant values, it shrinks the global to
+// a boolean. Catch those loads here in the following 3 patterns.
+let AddedComplexity = 100 in {
+  def: LoadGP_pats <extloadi1, L2_loadrubgp>;
+  def: LoadGP_pats <zextloadi1, L2_loadrubgp>;
+}
+
+// Transfer global address into a register
+def: Pat<(HexagonCONST32 tglobaladdr:$Rs),      (A2_tfrsi imm:$Rs)>;
+def: Pat<(HexagonCONST32_GP tblockaddress:$Rs), (A2_tfrsi imm:$Rs)>;
+def: Pat<(HexagonCONST32_GP tglobaladdr:$Rs),   (A2_tfrsi imm:$Rs)>;
+
+let AddedComplexity  = 30 in {
+  def: Storea_pat<truncstorei8,  I32, u32_0ImmPred, PS_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, u32_0ImmPred, PS_storerhabs>;
+  def: Storea_pat<store,         I32, u32_0ImmPred, PS_storeriabs>;
+}
+
+let AddedComplexity  = 30 in {
+  def: Loada_pat<load,        i32, u32_0ImmPred, PS_loadriabs>;
+  def: Loada_pat<sextloadi8,  i32, u32_0ImmPred, PS_loadrbabs>;
+  def: Loada_pat<zextloadi8,  i32, u32_0ImmPred, PS_loadrubabs>;
+  def: Loada_pat<sextloadi16, i32, u32_0ImmPred, PS_loadrhabs>;
+  def: Loada_pat<zextloadi16, i32, u32_0ImmPred, PS_loadruhabs>;
+}
+
+// Indexed store word - global address.
+// memw(Rs+#u6:2)=#S8
+let AddedComplexity = 100 in
+defm: Storex_add_pat<store, addrga, u6_2ImmPred, S4_storeiri_io>;
+
+// Load from a global address that has only one use in the current basic block.
+let AddedComplexity = 100 in {
+  def: Loada_pat<extloadi8,   i32, addrga, PS_loadrubabs>;
+  def: Loada_pat<sextloadi8,  i32, addrga, PS_loadrbabs>;
+  def: Loada_pat<zextloadi8,  i32, addrga, PS_loadrubabs>;
+
+  def: Loada_pat<extloadi16,  i32, addrga, PS_loadruhabs>;
+  def: Loada_pat<sextloadi16, i32, addrga, PS_loadrhabs>;
+  def: Loada_pat<zextloadi16, i32, addrga, PS_loadruhabs>;
+
+  def: Loada_pat<load,        i32, addrga, PS_loadriabs>;
+  def: Loada_pat<load,        i64, addrga, PS_loadrdabs>;
+}
+
+// Store to a global address that has only one use in the current basic block.
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrga, PS_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, addrga, PS_storerhabs>;
+  def: Storea_pat<store,         I32, addrga, PS_storeriabs>;
+  def: Storea_pat<store,         I64, addrga, PS_storerdabs>;
+
+  def: Stoream_pat<truncstorei32, I64, addrga, LoReg, PS_storeriabs>;
+}
+
+// i8/i16/i32 -> i64 loads
+// We need a complexity of 120 here to override preceding handling of
+// zextload.
+let AddedComplexity = 120 in {
+  def: Loadam_pat<extloadi8,   i64, addrga, ToZext64, PS_loadrubabs>;
+  def: Loadam_pat<sextloadi8,  i64, addrga, ToSext64, PS_loadrbabs>;
+  def: Loadam_pat<zextloadi8,  i64, addrga, ToZext64, PS_loadrubabs>;
+
+  def: Loadam_pat<extloadi16,  i64, addrga, ToZext64, PS_loadruhabs>;
+  def: Loadam_pat<sextloadi16, i64, addrga, ToSext64, PS_loadrhabs>;
+  def: Loadam_pat<zextloadi16, i64, addrga, ToZext64, PS_loadruhabs>;
+
+  def: Loadam_pat<extloadi32,  i64, addrga, ToZext64, PS_loadriabs>;
+  def: Loadam_pat<sextloadi32, i64, addrga, ToSext64, PS_loadriabs>;
+  def: Loadam_pat<zextloadi32, i64, addrga, ToZext64, PS_loadriabs>;
+}
+
+let AddedComplexity = 100 in {
+  def: Loada_pat<extloadi8,   i32, addrgp, PS_loadrubabs>;
+  def: Loada_pat<sextloadi8,  i32, addrgp, PS_loadrbabs>;
+  def: Loada_pat<zextloadi8,  i32, addrgp, PS_loadrubabs>;
+
+  def: Loada_pat<extloadi16,  i32, addrgp, PS_loadruhabs>;
+  def: Loada_pat<sextloadi16, i32, addrgp, PS_loadrhabs>;
+  def: Loada_pat<zextloadi16, i32, addrgp, PS_loadruhabs>;
+
+  def: Loada_pat<load,        i32, addrgp, PS_loadriabs>;
+  def: Loada_pat<load,        i64, addrgp, PS_loadrdabs>;
+}
+
+let AddedComplexity = 100 in {
+  def: Storea_pat<truncstorei8,  I32, addrgp, PS_storerbabs>;
+  def: Storea_pat<truncstorei16, I32, addrgp, PS_storerhabs>;
+  def: Storea_pat<store,         I32, addrgp, PS_storeriabs>;
+  def: Storea_pat<store,         I64, addrgp, PS_storerdabs>;
+}
+
+def: Loada_pat<atomic_load_8,  i32, addrgp, PS_loadrubabs>;
+def: Loada_pat<atomic_load_16, i32, addrgp, PS_loadruhabs>;
+def: Loada_pat<atomic_load_32, i32, addrgp, PS_loadriabs>;
+def: Loada_pat<atomic_load_64, i64, addrgp, PS_loadrdabs>;
+
+def: Storea_pat<SwapSt<atomic_store_8>,  I32, addrgp, PS_storerbabs>;
+def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, PS_storerhabs>;
+def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, PS_storeriabs>;
+def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, PS_storerdabs>;
+
+def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)),
+                     (i64 (zext (i32 (and I32:$a, (i32 65535)))))),
+                 (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))),
+             (shl (Aext64 I32:$d), (i32 48))),
+         (A2_combinew (A2_combine_ll I32:$d, I32:$c),
+                      (A2_combine_ll I32:$b, I32:$a))>;
+
+// We need custom lowering of ISD::PREFETCH into HexagonISD::DCFETCH
+// because the SDNode ISD::PREFETCH has properties MayLoad and MayStore.
+// We don't really want either one here.
+def SDTHexagonDCFETCH : SDTypeProfile<0, 2, [SDTCisPtrTy<0>,SDTCisInt<1>]>;
+def HexagonDCFETCH : SDNode<"HexagonISD::DCFETCH", SDTHexagonDCFETCH,
+                            [SDNPHasChain]>;
+
+def: Pat<(HexagonDCFETCH IntRegs:$Rs, u11_3ImmPred:$u11_3),
+         (Y2_dcfetchbo IntRegs:$Rs, imm:$u11_3)>;
+def: Pat<(HexagonDCFETCH (i32 (add IntRegs:$Rs, u11_3ImmPred:$u11_3)), (i32 0)),
+         (Y2_dcfetchbo IntRegs:$Rs, imm:$u11_3)>;
+
+def f32ImmPred : PatLeaf<(f32 fpimm:$F)>;
+def f64ImmPred : PatLeaf<(f64 fpimm:$F)>;
+
+def ftoi : SDNodeXForm<fpimm, [{
+  APInt I = N->getValueAPF().bitcastToAPInt();
+  return CurDAG->getTargetConstant(I.getZExtValue(), SDLoc(N),
+                                   MVT::getIntegerVT(I.getBitWidth()));
+}]>;
+
+
+def: Pat<(sra (i64 (add (sra I64:$src1, u6_0ImmPred:$src2), 1)), (i32 1)),
+         (S2_asr_i_p_rnd I64:$src1, imm:$src2)>;
+
+def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+                                           SDTCisVT<1, i64>]>;
+def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
+
+def: Pat<(HexagonPOPCOUNT I64:$Rss), (S5_popcountp I64:$Rss)>;
+
+let AddedComplexity = 20 in {
+  defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>;
+  defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>;
+}
+
+let AddedComplexity = 60 in {
+  defm : T_LoadAbsReg_Pat <load, L4_loadri_ur, f32>;
+  defm : T_LoadAbsReg_Pat <load, L4_loadrd_ur, f64>;
+}
+
+let AddedComplexity = 40 in {
+  def: Loadxs_pat<load, f32, L4_loadri_rr>;
+  def: Loadxs_pat<load, f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity = 20 in {
+  def: Loadxs_simple_pat<load, f32, L4_loadri_rr>;
+  def: Loadxs_simple_pat<load, f64, L4_loadrd_rr>;
+}
+
+let AddedComplexity  = 80 in {
+  def: Loada_pat<load, f32, u32_0ImmPred, PS_loadriabs>;
+  def: Loada_pat<load, f32, addrga, PS_loadriabs>;
+  def: Loada_pat<load, f64, addrga, PS_loadrdabs>;
+}
+
+let AddedComplexity = 100 in {
+  def: LoadGP_pats <load, L2_loadrigp, f32>;
+  def: LoadGP_pats <load, L2_loadrdgp, f64>;
+}
+
+let AddedComplexity = 20 in {
+  defm: Storex_pat<store, F32, s30_2ImmPred, S2_storeri_io>;
+  defm: Storex_pat<store, F64, s29_3ImmPred, S2_storerd_io>;
+}
+
+// Simple patterns should be tried with the least priority.
+def: Storex_simple_pat<store, F32, S2_storeri_io>;
+def: Storex_simple_pat<store, F64, S2_storerd_io>;
+
+let AddedComplexity = 60 in {
+  defm : T_StoreAbsReg_Pats <S4_storeri_ur, IntRegs, f32, store>;
+  defm : T_StoreAbsReg_Pats <S4_storerd_ur, DoubleRegs, f64, store>;
+}
+
+let AddedComplexity = 40 in {
+  def: Storexs_pat<store, F32, S4_storeri_rr>;
+  def: Storexs_pat<store, F64, S4_storerd_rr>;
+}
+
+let AddedComplexity = 20 in {
+  def: Store_rr_pat<store, F32, S4_storeri_rr>;
+  def: Store_rr_pat<store, F64, S4_storerd_rr>;
+}
+
+let AddedComplexity = 80 in {
+  def: Storea_pat<store, F32, addrga, PS_storeriabs>;
+  def: Storea_pat<store, F64, addrga, PS_storerdabs>;
+}
+
+let AddedComplexity = 100 in {
+  def: Storea_pat<store, F32, addrgp, S2_storerigp>;
+  def: Storea_pat<store, F64, addrgp, S2_storerdgp>;
+}
+
+defm: Storex_pat<store, F32, s30_2ImmPred, S2_storeri_io>;
+defm: Storex_pat<store, F64, s29_3ImmPred, S2_storerd_io>;
+def: Storex_simple_pat<store, F32, S2_storeri_io>;
+def: Storex_simple_pat<store, F64, S2_storerd_io>;
+
+def: Pat<(fadd F32:$src1, F32:$src2),
+         (F2_sfadd F32:$src1, F32:$src2)>;
+
+def: Pat<(fsub F32:$src1, F32:$src2),
+         (F2_sfsub F32:$src1, F32:$src2)>;
+
+def: Pat<(fmul F32:$src1, F32:$src2),
+         (F2_sfmpy F32:$src1, F32:$src2)>;
+
+let Predicates = [HasV5T] in {
+  def: Pat<(f32 (fminnum F32:$Rs, F32:$Rt)), (F2_sfmin F32:$Rs, F32:$Rt)>;
+  def: Pat<(f32 (fmaxnum F32:$Rs, F32:$Rt)), (F2_sfmax F32:$Rs, F32:$Rt)>;
+}
+
+let AddedComplexity = 100, Predicates = [HasV5T] in {
+  class SfSel12<PatFrag Cmp, InstHexagon MI>
+    : Pat<(select (i1 (Cmp F32:$Rs, F32:$Rt)), F32:$Rs, F32:$Rt),
+          (MI F32:$Rs, F32:$Rt)>;
+  class SfSel21<PatFrag Cmp, InstHexagon MI>
+    : Pat<(select (i1 (Cmp F32:$Rs, F32:$Rt)), F32:$Rt, F32:$Rs),
+          (MI F32:$Rs, F32:$Rt)>;
+
+  def: SfSel12<setolt, F2_sfmin>;
+  def: SfSel12<setole, F2_sfmin>;
+  def: SfSel12<setogt, F2_sfmax>;
+  def: SfSel12<setoge, F2_sfmax>;
+  def: SfSel21<setolt, F2_sfmax>;
+  def: SfSel21<setole, F2_sfmax>;
+  def: SfSel21<setogt, F2_sfmin>;
+  def: SfSel21<setoge, F2_sfmin>;
+}
+
+class T_fcmp32_pat<PatFrag OpNode, InstHexagon MI>
+  : Pat<(i1 (OpNode F32:$src1, F32:$src2)),
+        (MI F32:$src1, F32:$src2)>;
+class T_fcmp64_pat<PatFrag OpNode, InstHexagon MI>
+  : Pat<(i1 (OpNode F64:$src1, F64:$src2)),
+        (MI F64:$src1, F64:$src2)>;
+
+def: T_fcmp32_pat<setoge, F2_sfcmpge>;
+def: T_fcmp32_pat<setuo,  F2_sfcmpuo>;
+def: T_fcmp32_pat<setoeq, F2_sfcmpeq>;
+def: T_fcmp32_pat<setogt, F2_sfcmpgt>;
+
+def: T_fcmp64_pat<setoge, F2_dfcmpge>;
+def: T_fcmp64_pat<setuo,  F2_dfcmpuo>;
+def: T_fcmp64_pat<setoeq, F2_dfcmpeq>;
+def: T_fcmp64_pat<setogt, F2_dfcmpgt>;
+
+let Predicates = [HasV5T] in
+multiclass T_fcmp_pats<PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+           (IntMI F32:$src1, F32:$src2)>;
+  // DoubleRegs
+  def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+           (DoubleMI F64:$src1, F64:$src2)>;
+}
+
+defm : T_fcmp_pats <seteq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : T_fcmp_pats <setgt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : T_fcmp_pats <setge, F2_sfcmpge, F2_dfcmpge>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for unordered gt, ge, eq operations.
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass unord_Pats <PatFrag cmpOp, InstHexagon IntMI, InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (cmpOp F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (IntMI F32:$src1, F32:$src2))>;
+
+  // DoubleRegs
+  def: Pat<(i1 (cmpOp F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (DoubleMI F64:$src1, F64:$src2))>;
+}
+
+defm : unord_Pats <setuge, F2_sfcmpge, F2_dfcmpge>;
+defm : unord_Pats <setugt, F2_sfcmpgt, F2_dfcmpgt>;
+defm : unord_Pats <setueq, F2_sfcmpeq, F2_dfcmpeq>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setoeq(op1, op2), 0) -> not(setoeq(op1, op2))
+// seteq(setoeq(op1, op2), 1) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 0) -> setoeq(op1, op2)
+// setne(setoeq(op1, op2), 1) -> not(setoeq(op1, op2))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordgePats <PatFrag cmpOp, InstHexagon IntMI,
+                         InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (C2_not (IntMI F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (IntMI F32:$src1, F32:$src2)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (IntMI F32:$src1, F32:$src2)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (C2_not (IntMI F32:$src1, F32:$src2))>;
+
+  // DoubleRegs
+  def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+            (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+  def : Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+            (DoubleMI F64:$src1, F64:$src2)>;
+  def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+            (DoubleMI F64:$src1, F64:$src2)>;
+  def : Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+            (C2_not (DoubleMI F64:$src1, F64:$src2))>;
+}
+
+defm : eq_ordgePats<setoeq, F2_sfcmpeq, F2_dfcmpeq>;
+defm : eq_ordgePats<setoge, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordgePats<setogt, F2_sfcmpgt, F2_dfcmpgt>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for the following dags:
+// seteq(setolt(op1, op2), 0) -> not(setogt(op2, op1))
+// seteq(setolt(op1, op2), 1) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 0) -> setogt(op2, op1)
+// setne(setolt(op1, op2), 1) -> not(setogt(op2, op1))
+//===----------------------------------------------------------------------===//
+let Predicates = [HasV5T] in
+multiclass eq_ordltPats <PatFrag cmpOp, InstHexagon IntMI,
+                         InstHexagon DoubleMI> {
+  // IntRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (C2_not (IntMI F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (IntMI F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 0)),
+           (IntMI F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F32:$src1, F32:$src2)), 1)),
+           (C2_not (IntMI F32:$src2, F32:$src1))>;
+
+  // DoubleRegs
+  def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (seteq (i1 (cmpOp F64:$src1, F64:$src2)), 1)),
+           (DoubleMI F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (DoubleMI F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setne (i1 (cmpOp F64:$src1, F64:$src2)), 0)),
+           (C2_not (DoubleMI F64:$src2, F64:$src1))>;
+}
+
+defm : eq_ordltPats<setole, F2_sfcmpge, F2_dfcmpge>;
+defm : eq_ordltPats<setolt, F2_sfcmpgt, F2_dfcmpgt>;
+
+
+// o. seto inverse of setuo. http://llvm.org/docs/LangRef.html#i_fcmp
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (seto F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpuo F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (seto F32:$src1, f32ImmPred:$src2)),
+           (C2_not (F2_sfcmpuo (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+  def: Pat<(i1 (seto F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpuo F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (seto F64:$src1, f64ImmPred:$src2)),
+           (C2_not (F2_dfcmpuo (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered lt.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setolt F32:$src1, F32:$src2)),
+           (F2_sfcmpgt F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setolt F32:$src1, f32ImmPred:$src2)),
+           (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+  def: Pat<(i1 (setolt F64:$src1, F64:$src2)),
+           (F2_dfcmpgt F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setolt F64:$src1, f64ImmPred:$src2)),
+           (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1)>;
+}
+
+// Unordered lt.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setult F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (F2_sfcmpgt F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (setult F32:$src1, f32ImmPred:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+                  (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+  def: Pat<(i1 (setult F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (F2_dfcmpgt F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (setult F64:$src1, f64ImmPred:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+                  (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered le.
+let Predicates = [HasV5T] in {
+  // rs <= rt -> rt >= rs.
+  def: Pat<(i1 (setole F32:$src1, F32:$src2)),
+           (F2_sfcmpge F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setole F32:$src1, f32ImmPred:$src2)),
+           (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+
+  // Rss <= Rtt -> Rtt >= Rss.
+  def: Pat<(i1 (setole F64:$src1, F64:$src2)),
+           (F2_dfcmpge F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setole F64:$src1, f64ImmPred:$src2)),
+           (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1)>;
+}
+
+// Unordered le.
+let Predicates = [HasV5T] in {
+// rs <= rt -> rt >= rs.
+  def: Pat<(i1 (setule F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (F2_sfcmpge F32:$src2, F32:$src1))>;
+  def: Pat<(i1 (setule F32:$src1, f32ImmPred:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+                  (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1))>;
+  def: Pat<(i1 (setule F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (F2_dfcmpge F64:$src2, F64:$src1))>;
+  def: Pat<(i1 (setule F64:$src1, f64ImmPred:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+                  (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1))>;
+}
+
+// Ordered ne.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setone F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (setone F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+  def: Pat<(i1 (setone F32:$src1, f32ImmPred:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))))>;
+  def: Pat<(i1 (setone F64:$src1, f64ImmPred:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, (CONST64 (ftoi $src2))))>;
+}
+
+// Unordered ne.
+let Predicates = [HasV5T] in {
+  def: Pat<(i1 (setune F32:$src1, F32:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, F32:$src2),
+                  (C2_not (F2_sfcmpeq F32:$src1, F32:$src2)))>;
+  def: Pat<(i1 (setune F64:$src1, F64:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, F64:$src2),
+                  (C2_not (F2_dfcmpeq F64:$src1, F64:$src2)))>;
+  def: Pat<(i1 (setune F32:$src1, f32ImmPred:$src2)),
+           (C2_or (F2_sfcmpuo F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))),
+                  (C2_not (F2_sfcmpeq F32:$src1,
+                                      (f32 (A2_tfrsi (ftoi $src2))))))>;
+  def: Pat<(i1 (setune F64:$src1, f64ImmPred:$src2)),
+           (C2_or (F2_dfcmpuo F64:$src1, (CONST64 (ftoi $src2))),
+                  (C2_not (F2_dfcmpeq F64:$src1,
+                                      (CONST64 (ftoi $src2)))))>;
+}
+
+// Besides set[o|u][comparions], we also need set[comparisons].
+let Predicates = [HasV5T] in {
+  // lt.
+  def: Pat<(i1 (setlt F32:$src1, F32:$src2)),
+           (F2_sfcmpgt F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setlt F32:$src1, f32ImmPred:$src2)),
+           (F2_sfcmpgt (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+  def: Pat<(i1 (setlt F64:$src1, F64:$src2)),
+           (F2_dfcmpgt F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setlt F64:$src1, f64ImmPred:$src2)),
+           (F2_dfcmpgt (CONST64 (ftoi $src2)), F64:$src1)>;
+
+  // le.
+  // rs <= rt -> rt >= rs.
+  def: Pat<(i1 (setle F32:$src1, F32:$src2)),
+           (F2_sfcmpge F32:$src2, F32:$src1)>;
+  def: Pat<(i1 (setle F32:$src1, f32ImmPred:$src2)),
+           (F2_sfcmpge (f32 (A2_tfrsi (ftoi $src2))), F32:$src1)>;
+
+  // Rss <= Rtt -> Rtt >= Rss.
+  def: Pat<(i1 (setle F64:$src1, F64:$src2)),
+           (F2_dfcmpge F64:$src2, F64:$src1)>;
+  def: Pat<(i1 (setle F64:$src1, f64ImmPred:$src2)),
+           (F2_dfcmpge (CONST64 (ftoi $src2)), F64:$src1)>;
+
+  // ne.
+  def: Pat<(i1 (setne F32:$src1, F32:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, F32:$src2))>;
+  def: Pat<(i1 (setne F64:$src1, F64:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, F64:$src2))>;
+  def: Pat<(i1 (setne F32:$src1, f32ImmPred:$src2)),
+           (C2_not (F2_sfcmpeq F32:$src1, (f32 (A2_tfrsi (ftoi $src2)))))>;
+  def: Pat<(i1 (setne F64:$src1, f64ImmPred:$src2)),
+           (C2_not (F2_dfcmpeq F64:$src1, (CONST64 (ftoi $src2))))>;
+}
+
+
+def: Pat<(f64 (fpextend F32:$Rs)), (F2_conv_sf2df F32:$Rs)>;
+def: Pat<(f32 (fpround F64:$Rs)), (F2_conv_df2sf F64:$Rs)>;
+
+def: Pat<(f32 (sint_to_fp I32:$Rs)), (F2_conv_w2sf I32:$Rs)>;
+def: Pat<(f32 (sint_to_fp I64:$Rs)), (F2_conv_d2sf I64:$Rs)>;
+def: Pat<(f64 (sint_to_fp I32:$Rs)), (F2_conv_w2df I32:$Rs)>;
+def: Pat<(f64 (sint_to_fp I64:$Rs)), (F2_conv_d2df I64:$Rs)>;
+
+def: Pat<(f32 (uint_to_fp I32:$Rs)), (F2_conv_uw2sf I32:$Rs)>;
+def: Pat<(f32 (uint_to_fp I64:$Rs)), (F2_conv_ud2sf I64:$Rs)>;
+def: Pat<(f64 (uint_to_fp I32:$Rs)), (F2_conv_uw2df I32:$Rs)>;
+def: Pat<(f64 (uint_to_fp I64:$Rs)), (F2_conv_ud2df I64:$Rs)>;
+
+def: Pat<(i32 (fp_to_sint F32:$Rs)), (F2_conv_sf2w_chop F32:$Rs)>;
+def: Pat<(i32 (fp_to_sint F64:$Rs)), (F2_conv_df2w_chop F64:$Rs)>;
+def: Pat<(i64 (fp_to_sint F32:$Rs)), (F2_conv_sf2d_chop F32:$Rs)>;
+def: Pat<(i64 (fp_to_sint F64:$Rs)), (F2_conv_df2d_chop F64:$Rs)>;
+
+def: Pat<(i32 (fp_to_uint F32:$Rs)), (F2_conv_sf2uw_chop F32:$Rs)>;
+def: Pat<(i32 (fp_to_uint F64:$Rs)), (F2_conv_df2uw_chop F64:$Rs)>;
+def: Pat<(i64 (fp_to_uint F32:$Rs)), (F2_conv_sf2ud_chop F32:$Rs)>;
+def: Pat<(i64 (fp_to_uint F64:$Rs)), (F2_conv_df2ud_chop F64:$Rs)>;
+
+// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
+let Predicates = [HasV5T] in {
+  def: Pat <(i32 (bitconvert F32:$src)), (I32:$src)>;
+  def: Pat <(f32 (bitconvert I32:$src)), (F32:$src)>;
+  def: Pat <(i64 (bitconvert F64:$src)), (I64:$src)>;
+  def: Pat <(f64 (bitconvert I64:$src)), (F64:$src)>;
+}
+
+def : Pat <(fma F32:$src2, F32:$src3, F32:$src1),
+           (F2_sffma F32:$src1, F32:$src2, F32:$src3)>;
+
+def : Pat <(fma (fneg F32:$src2), F32:$src3, F32:$src1),
+           (F2_sffms F32:$src1, F32:$src2, F32:$src3)>;
+
+def : Pat <(fma F32:$src2, (fneg F32:$src3), F32:$src1),
+           (F2_sffms F32:$src1, F32:$src2, F32:$src3)>;
+
+def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$imm),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $imm))>,
+    Requires<[HasV5T]>;
+
+def: Pat<(select I1:$Pu, f32ImmPred:$imm, F32:$Rt),
+         (C2_muxri I1:$Pu, (ftoi $imm), F32:$Rt)>,
+    Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F32:$src2, F32:$src3),
+         (C2_mux I1:$src1, F32:$src2, F32:$src3)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(select (i1 (setult F32:$src1, F32:$src2)), F32:$src3, F32:$src4),
+         (C2_mux (F2_sfcmpgt F32:$src2, F32:$src1), F32:$src4, F32:$src3)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(select I1:$src1, F64:$src2, F64:$src3),
+         (C2_vmux I1:$src1, F64:$src2, F64:$src3)>,
+    Requires<[HasV5T]>;
+
+def: Pat<(select (i1 (setult F64:$src1, F64:$src2)), F64:$src3, F64:$src4),
+         (C2_vmux (F2_dfcmpgt F64:$src2, F64:$src1), F64:$src3, F64:$src4)>,
+     Requires<[HasV5T]>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = mux(p0, #i, r1)
+def: Pat<(select (not I1:$src1), f32ImmPred:$src2, F32:$src3),
+         (C2_muxir I1:$src1, F32:$src3, (ftoi $src2))>,
+     Requires<[HasV5T]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = mux(p0, r1, #i)
+def: Pat<(select (not I1:$src1), F32:$src2, f32ImmPred:$src3),
+         (C2_muxri I1:$src1, (ftoi $src3), F32:$src2)>,
+     Requires<[HasV5T]>;
+
+def: Pat<(i32 (fp_to_sint F64:$src1)),
+         (LoReg (F2_conv_df2d_chop F64:$src1))>,
+     Requires<[HasV5T]>;
+
+def : Pat <(fabs F32:$src1),
+           (S2_clrbit_i F32:$src1, 31)>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fneg F32:$src1),
+           (S2_togglebit_i F32:$src1, 31)>,
+          Requires<[HasV5T]>;
+
+def: Pat<(fabs F64:$Rs),
+         (REG_SEQUENCE DoubleRegs,
+              (S2_clrbit_i (HiReg $Rs), 31), isub_hi,
+              (i32 (LoReg $Rs)), isub_lo)>;
+
+def: Pat<(fneg F64:$Rs),
+         (REG_SEQUENCE DoubleRegs,
+              (S2_togglebit_i (HiReg $Rs), 31), isub_hi,
+              (i32 (LoReg $Rs)), isub_lo)>;
+
+def alignedload : PatFrag<(ops node:$addr), (load $addr), [{
+  return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedload : PatFrag<(ops node:$addr), (load $addr), [{
+  return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def alignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+  return isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [{
+  return !isAlignedMemNode(dyn_cast<MemSDNode>(N));
+}]>;
+
+
+def s4_6ImmPred: PatLeaf<(i32 imm), [{
+  int64_t V = N->getSExtValue();
+  return isShiftedInt<4,6>(V);
+}]>;
+
+def s4_7ImmPred: PatLeaf<(i32 imm), [{
+  int64_t V = N->getSExtValue();
+  return isShiftedInt<4,7>(V);
+}]>;
+
+
+multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+  // Aligned stores
+  def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+            (V6_vS32b_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
+            Requires<[UseHVXSgl]>;
+  def : Pat<(unalignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+            (V6_vS32Ub_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
+            Requires<[UseHVXSgl]>;
+
+  // 128B Aligned stores
+  def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+            (V6_vS32b_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
+            Requires<[UseHVXDbl]>;
+  def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+            (V6_vS32Ub_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
+            Requires<[UseHVXDbl]>;
+
+  // Fold Add R+OFF into vector store.
+  let AddedComplexity = 10 in {
+    def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
+                     (add IntRegs:$src2, s4_6ImmPred:$offset)),
+              (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+                           (VTSgl VectorRegs:$src1))>,
+              Requires<[UseHVXSgl]>;
+    def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
+                     (add IntRegs:$src2, s4_6ImmPred:$offset)),
+              (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+                           (VTSgl VectorRegs:$src1))>,
+              Requires<[UseHVXSgl]>;
+
+    // Fold Add R+OFF into vector store 128B.
+    def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
+                     (add IntRegs:$src2, s4_7ImmPred:$offset)),
+              (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+                                (VTDbl VectorRegs128B:$src1))>,
+              Requires<[UseHVXDbl]>;
+    def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
+                     (add IntRegs:$src2, s4_7ImmPred:$offset)),
+              (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+                                (VTDbl VectorRegs128B:$src1))>,
+              Requires<[UseHVXDbl]>;
+  }
+}
+
+defm : vS32b_ai_pats <v64i8,  v128i8>;
+defm : vS32b_ai_pats <v32i16, v64i16>;
+defm : vS32b_ai_pats <v16i32, v32i32>;
+defm : vS32b_ai_pats <v8i64,  v16i64>;
+
+
+multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
+  // Aligned loads
+  def : Pat < (VTSgl (alignedload IntRegs:$addr)),
+              (V6_vL32b_ai IntRegs:$addr, 0) >,
+              Requires<[UseHVXSgl]>;
+  def : Pat < (VTSgl (unalignedload IntRegs:$addr)),
+              (V6_vL32Ub_ai IntRegs:$addr, 0) >,
+              Requires<[UseHVXSgl]>;
+
+  // 128B Load
+  def : Pat < (VTDbl (alignedload IntRegs:$addr)),
+              (V6_vL32b_ai_128B IntRegs:$addr, 0) >,
+              Requires<[UseHVXDbl]>;
+  def : Pat < (VTDbl (unalignedload IntRegs:$addr)),
+              (V6_vL32Ub_ai_128B IntRegs:$addr, 0) >,
+              Requires<[UseHVXDbl]>;
+
+  // Fold Add R+OFF into vector load.
+  let AddedComplexity = 10 in {
+    def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+              (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+               Requires<[UseHVXDbl]>;
+    def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
+              (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+               Requires<[UseHVXDbl]>;
+
+    def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+              (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+              Requires<[UseHVXSgl]>;
+    def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
+              (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+              Requires<[UseHVXSgl]>;
+  }
+}
+
+defm : vL32b_ai_pats <v64i8,  v128i8>;
+defm : vL32b_ai_pats <v32i16, v64i16>;
+defm : vL32b_ai_pats <v16i32, v32i32>;
+defm : vL32b_ai_pats <v8i64,  v16i64>;
+
+multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+  def : Pat<(alignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+            (PS_vstorerw_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
+           Requires<[UseHVXSgl]>;
+  def : Pat<(unalignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+            (PS_vstorerwu_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
+           Requires<[UseHVXSgl]>;
+
+  def : Pat<(alignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+            (PS_vstorerw_ai_128B IntRegs:$addr, 0,
+                  (VTDbl VecDblRegs128B:$src1))>,
+            Requires<[UseHVXDbl]>;
+  def : Pat<(unalignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+            (PS_vstorerwu_ai_128B IntRegs:$addr, 0,
+                  (VTDbl VecDblRegs128B:$src1))>,
+            Requires<[UseHVXDbl]>;
+}
+
+defm : STrivv_pats <v128i8, v256i8>;
+defm : STrivv_pats <v64i16, v128i16>;
+defm : STrivv_pats <v32i32, v64i32>;
+defm : STrivv_pats <v16i64, v32i64>;
+
+multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+  def : Pat<(VTSgl (alignedload I32:$addr)),
+            (PS_vloadrw_ai I32:$addr, 0)>,
+           Requires<[UseHVXSgl]>;
+  def : Pat<(VTSgl (unalignedload I32:$addr)),
+            (PS_vloadrwu_ai I32:$addr, 0)>,
+           Requires<[UseHVXSgl]>;
+
+  def : Pat<(VTDbl (alignedload I32:$addr)),
+            (PS_vloadrw_ai_128B I32:$addr, 0)>,
+           Requires<[UseHVXDbl]>;
+  def : Pat<(VTDbl (unalignedload I32:$addr)),
+            (PS_vloadrwu_ai_128B I32:$addr, 0)>,
+           Requires<[UseHVXDbl]>;
+}
+
+defm : LDrivv_pats <v128i8, v256i8>;
+defm : LDrivv_pats <v64i16, v128i16>;
+defm : LDrivv_pats <v32i32, v64i32>;
+defm : LDrivv_pats <v16i64, v32i64>;
+
+let Predicates = [HasV60T,UseHVXSgl] in {
+  def: Pat<(select I1:$Pu, (v16i32 VectorRegs:$Vs), VectorRegs:$Vt),
+           (PS_vselect I1:$Pu, VectorRegs:$Vs, VectorRegs:$Vt)>;
+  def: Pat<(select I1:$Pu, (v32i32 VecDblRegs:$Vs), VecDblRegs:$Vt),
+           (PS_wselect I1:$Pu, VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+}
+let Predicates = [HasV60T,UseHVXDbl] in {
+  def: Pat<(select I1:$Pu, (v32i32 VectorRegs128B:$Vs), VectorRegs128B:$Vt),
+           (PS_vselect_128B I1:$Pu, VectorRegs128B:$Vs, VectorRegs128B:$Vt)>;
+  def: Pat<(select I1:$Pu, (v64i32 VecDblRegs128B:$Vs), VecDblRegs128B:$Vt),
+           (PS_wselect_128B I1:$Pu, VecDblRegs128B:$Vs, VecDblRegs128B:$Vt)>;
+}
+
+
+def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>,
+      SDTCisSubVecOfVec<1, 0>]>;
+
+def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>;
+
+def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs),
+                                  (v16i32 VectorRegs:$Vt))),
+         (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>,
+         Requires<[UseHVXSgl]>;
+def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs),
+                                  (v32i32 VecDblRegs:$Vt))),
+         (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+         Requires<[UseHVXDbl]>;
+
+def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>,
+                                          SDTCisInt<3>]>;
+
+def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>;
+
+// 0 as the last argument denotes vpacke. 1 denotes vpacko
+def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
+                              (v64i8 VectorRegs:$Vt), (i32 0))),
+         (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>,
+         Requires<[UseHVXSgl]>;
+def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
+                              (v64i8 VectorRegs:$Vt), (i32 1))),
+         (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>,
+         Requires<[UseHVXSgl]>;
+def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
+                               (v32i16 VectorRegs:$Vt), (i32 0))),
+         (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>,
+         Requires<[UseHVXSgl]>;
+def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
+                             (v32i16 VectorRegs:$Vt), (i32 1))),
+         (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>,
+         Requires<[UseHVXSgl]>;
+
+def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
+                             (v128i8 VecDblRegs:$Vt), (i32 0))),
+         (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+         Requires<[UseHVXDbl]>;
+def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
+                             (v128i8 VecDblRegs:$Vt), (i32 1))),
+         (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+         Requires<[UseHVXDbl]>;
+def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
+                             (v64i16 VecDblRegs:$Vt), (i32 0))),
+         (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+         Requires<[UseHVXDbl]>;
+def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
+                            (v64i16 VecDblRegs:$Vt), (i32 1))),
+        (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
+        Requires<[UseHVXDbl]>;
+
+def V2I1:  PatLeaf<(v2i1  PredRegs:$R)>;
+def V4I1:  PatLeaf<(v4i1  PredRegs:$R)>;
+def V8I1:  PatLeaf<(v8i1  PredRegs:$R)>;
+def V4I8:  PatLeaf<(v4i8  IntRegs:$R)>;
+def V2I16: PatLeaf<(v2i16 IntRegs:$R)>;
+def V8I8:  PatLeaf<(v8i8  DoubleRegs:$R)>;
+def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>;
+def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>;
+
+
+multiclass bitconvert_32<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a IntRegs:$src))),
+             (b IntRegs:$src)>;
+  def : Pat <(a (bitconvert (b IntRegs:$src))),
+             (a IntRegs:$src)>;
+}
+
+multiclass bitconvert_64<ValueType a, ValueType b> {
+  def : Pat <(b (bitconvert (a DoubleRegs:$src))),
+             (b DoubleRegs:$src)>;
+  def : Pat <(a (bitconvert (b DoubleRegs:$src))),
+             (a DoubleRegs:$src)>;
+}
+
+// Bit convert vector types to integers.
+defm : bitconvert_32<v4i8,  i32>;
+defm : bitconvert_32<v2i16, i32>;
+defm : bitconvert_64<v8i8,  i64>;
+defm : bitconvert_64<v4i16, i64>;
+defm : bitconvert_64<v2i32, i64>;
+
+def: Pat<(sra (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+         (S2_asr_i_vh DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(srl (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+         (S2_lsr_i_vh DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(shl (v4i16 DoubleRegs:$src1), u4_0ImmPred:$src2),
+         (S2_asl_i_vh DoubleRegs:$src1, imm:$src2)>;
+
+def: Pat<(sra (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+         (S2_asr_i_vw DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(srl (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+         (S2_lsr_i_vw DoubleRegs:$src1, imm:$src2)>;
+def: Pat<(shl (v2i32 DoubleRegs:$src1), u5_0ImmPred:$src2),
+         (S2_asl_i_vw DoubleRegs:$src1, imm:$src2)>;
+
+def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+          (A2_svaddh IntRegs:$src1, IntRegs:$src2)>;
+
+def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
+          (A2_svsubh IntRegs:$src1, IntRegs:$src2)>;
+
+def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>;
+def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>;
+
+// Replicate the low 8-bits from 32-bits input register into each of the
+// four bytes of 32-bits destination register.
+def: Pat<(v4i8  (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
+
+// Replicate the low 16-bits from 32-bits input register into each of the
+// four halfwords of 64-bits destination register.
+def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+
+
+class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type>
+  : Pat <(Op Type:$Rss, Type:$Rtt),
+         (MI Type:$Rss, Type:$Rtt)>;
+
+def: VArith_pat <A2_vaddub, add, V8I8>;
+def: VArith_pat <A2_vaddh,  add, V4I16>;
+def: VArith_pat <A2_vaddw,  add, V2I32>;
+def: VArith_pat <A2_vsubub, sub, V8I8>;
+def: VArith_pat <A2_vsubh,  sub, V4I16>;
+def: VArith_pat <A2_vsubw,  sub, V2I32>;
+
+def: VArith_pat <A2_and,    and, V2I16>;
+def: VArith_pat <A2_xor,    xor, V2I16>;
+def: VArith_pat <A2_or,     or,  V2I16>;
+
+def: VArith_pat <A2_andp,   and, V8I8>;
+def: VArith_pat <A2_andp,   and, V4I16>;
+def: VArith_pat <A2_andp,   and, V2I32>;
+def: VArith_pat <A2_orp,    or,  V8I8>;
+def: VArith_pat <A2_orp,    or,  V4I16>;
+def: VArith_pat <A2_orp,    or,  V2I32>;
+def: VArith_pat <A2_xorp,   xor, V8I8>;
+def: VArith_pat <A2_xorp,   xor, V4I16>;
+def: VArith_pat <A2_xorp,   xor, V2I32>;
+
+def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+                                                    (i32 u5_0ImmPred:$c))))),
+         (S2_asr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+                                                    (i32 u5_0ImmPred:$c))))),
+         (S2_lsr_i_vw V2I32:$b, imm:$c)>;
+def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
+                                                    (i32 u5_0ImmPred:$c))))),
+         (S2_asl_i_vw V2I32:$b, imm:$c)>;
+
+def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+         (S2_asr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+         (S2_lsr_i_vh V4I16:$b, imm:$c)>;
+def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+         (S2_asl_i_vh V4I16:$b, imm:$c)>;
+
+
+def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>;
+def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>;
+
+def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>;
+def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>;
+
+def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5_0ImmPred:$u5)),
+         (S2_asr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4_0ImmPred:$u4)),
+         (S2_asr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5_0ImmPred:$u5)),
+         (S2_lsr_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4_0ImmPred:$u4)),
+         (S2_lsr_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5_0ImmPred:$u5)),
+         (S2_asl_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4_0ImmPred:$u4)),
+         (S2_asl_i_vh V4I16:$Rs, imm:$u4)>;
+
+class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+  : Pat <(Op Value:$Rs, I32:$Rt),
+         (MI Value:$Rs, I32:$Rt)>;
+
+def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>;
+def: vshift_rr_pat <S2_asr_r_vh, HexagonVSRAH, V4I16>;
+def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>;
+def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>;
+def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>;
+def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>;
+
+
+def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>;
+def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>;
+def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>;
+
+def HexagonVCMPBEQ:  SDNode<"HexagonISD::VCMPBEQ",  SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGT:  SDNode<"HexagonISD::VCMPBGT",  SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>;
+def HexagonVCMPHEQ:  SDNode<"HexagonISD::VCMPHEQ",  SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGT:  SDNode<"HexagonISD::VCMPHGT",  SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>;
+def HexagonVCMPWEQ:  SDNode<"HexagonISD::VCMPWEQ",  SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGT:  SDNode<"HexagonISD::VCMPWGT",  SDTHexagonVecCompare_v2i32>;
+def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>;
+
+
+class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value>
+  : Pat <(i1 (Op Value:$Rs, Value:$Rt)),
+         (MI Value:$Rs, Value:$Rt)>;
+
+def: vcmp_i1_pat<A2_vcmpbeq,  HexagonVCMPBEQ,  V8I8>;
+def: vcmp_i1_pat<A4_vcmpbgt,  HexagonVCMPBGT,  V8I8>;
+def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>;
+
+def: vcmp_i1_pat<A2_vcmpheq,  HexagonVCMPHEQ,  V4I16>;
+def: vcmp_i1_pat<A2_vcmphgt,  HexagonVCMPHGT,  V4I16>;
+def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>;
+
+def: vcmp_i1_pat<A2_vcmpweq,  HexagonVCMPWEQ,  V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgt,  HexagonVCMPWGT,  V2I32>;
+def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>;
+
+
+class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy>
+  : Pat <(OutTy (Op InVal:$Rs, InVal:$Rt)),
+         (MI InVal:$Rs, InVal:$Rt)>;
+
+def: vcmp_vi1_pat<A2_vcmpweq,  seteq,  V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgt,  setgt,  V2I32, v2i1>;
+def: vcmp_vi1_pat<A2_vcmpwgtu, setugt, V2I32, v2i1>;
+
+def: vcmp_vi1_pat<A2_vcmpheq,  seteq,  V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgt,  setgt,  V4I16, v4i1>;
+def: vcmp_vi1_pat<A2_vcmphgtu, setugt, V4I16, v4i1>;
+
+def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
+         (PS_vmulw DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)),
+         (PS_vmulw_acc DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt)>;
+
+
+// Adds two v4i8: Hexagon does not have an insn for this one, so we
+// use the double add v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (add (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+         (LoReg (A2_vaddub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+// Subtract two v4i8: Hexagon does not have an insn for this one, so we
+// use the double sub v8i8, and use only the low part of the result.
+def: Pat<(v4i8 (sub (v4i8 IntRegs:$Rs), (v4i8 IntRegs:$Rt))),
+         (LoReg (A2_vsubub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+//
+// No 32 bit vector mux.
+//
+def: Pat<(v4i8 (select I1:$Pu, V4I8:$Rs, V4I8:$Rt)),
+         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+def: Pat<(v2i16 (select I1:$Pu, V2I16:$Rs, V2I16:$Rt)),
+         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+
+//
+// 64-bit vector mux.
+//
+def: Pat<(v8i8 (vselect V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)),
+         (C2_vmux V8I1:$Pu, V8I8:$Rs, V8I8:$Rt)>;
+def: Pat<(v4i16 (vselect V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)),
+         (C2_vmux V4I1:$Pu, V4I16:$Rs, V4I16:$Rt)>;
+def: Pat<(v2i32 (vselect V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)),
+         (C2_vmux V2I1:$Pu, V2I32:$Rs, V2I32:$Rt)>;
+
+//
+// No 32 bit vector compare.
+//
+def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
+         (A2_vcmpbeq (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
+         (A4_vcmpbgt (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
+         (A2_vcmpbgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
+
+def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmpheq (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmphgt (ToZext64 $Rs), (ToZext64 $Rt))>;
+def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
+         (A2_vcmphgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
+
+
+class InvertCmp_pat<InstHexagon InvMI, PatFrag CmpOp, PatFrag Value,
+                    ValueType CmpTy>
+  : Pat<(CmpTy (CmpOp Value:$Rs, Value:$Rt)),
+        (InvMI Value:$Rt, Value:$Rs)>;
+
+// Map from a compare operation to the corresponding instruction with the
+// order of operands reversed, e.g.  x > y --> cmp.lt(y,x).
+def: InvertCmp_pat<A4_vcmpbgt,  setlt,  V8I8,  i1>;
+def: InvertCmp_pat<A4_vcmpbgt,  setlt,  V8I8,  v8i1>;
+def: InvertCmp_pat<A2_vcmphgt,  setlt,  V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgt,  setlt,  V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgt,  setlt,  V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgt,  setlt,  V2I32, v2i1>;
+
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8,  i1>;
+def: InvertCmp_pat<A2_vcmpbgtu, setult, V8I8,  v8i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, i1>;
+def: InvertCmp_pat<A2_vcmphgtu, setult, V4I16, v4i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, i1>;
+def: InvertCmp_pat<A2_vcmpwgtu, setult, V2I32, v2i1>;
+
+// Map from vcmpne(Rss) -> !vcmpew(Rss).
+// rs != rt -> !(rs == rt).
+def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
+         (C2_not (v2i1 (A2_vcmpbeq V2I32:$Rs, V2I32:$Rt)))>;
+
+
+// Truncate: from vector B copy all 'E'ven 'B'yte elements:
+// A[0] = B[0];  A[1] = B[2];  A[2] = B[4];  A[3] = B[6];
+def: Pat<(v4i8 (trunc V4I16:$Rs)),
+         (S2_vtrunehb V4I16:$Rs)>;
+
+// Truncate: from vector B copy all 'O'dd 'B'yte elements:
+// A[0] = B[1];  A[1] = B[3];  A[2] = B[5];  A[3] = B[7];
+// S2_vtrunohb
+
+// Truncate: from vectors B and C copy all 'E'ven 'H'alf-word elements:
+// A[0] = B[0];  A[1] = B[2];  A[2] = C[0];  A[3] = C[2];
+// S2_vtruneh
+
+def: Pat<(v2i16 (trunc V2I32:$Rs)),
+         (LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>;
+
+
+def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>;
+def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>;
+
+def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>;
+def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>;
+
+def: Pat<(v4i16 (zext   V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (zext   V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (anyext V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+def: Pat<(v4i16 (sext   V4I8:$Rs)),  (S2_vsxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (sext   V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>;
+
+// Sign extends a v2i8 into a v2i32.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i8)),
+         (A2_combinew (A2_sxtb (HiReg $Rs)), (A2_sxtb (LoReg $Rs)))>;
+
+// Sign extends a v2i16 into a v2i32.
+def: Pat<(v2i32 (sext_inreg V2I32:$Rs, v2i16)),
+         (A2_combinew (A2_sxth (HiReg $Rs)), (A2_sxth (LoReg $Rs)))>;
+
+
+// Multiplies two v2i16 and returns a v2i32.  We are using here the
+// saturating multiply, as hexagon does not provide a non saturating
+// vector multiply, and saturation does not impact the result that is
+// in double precision of the operands.
+
+// Multiplies two v2i16 vectors: as Hexagon does not have a multiply
+// with the C semantics for this one, this pattern uses the half word
+// multiply vmpyh that takes two v2i16 and returns a v2i32.  This is
+// then truncated to fit this back into a v2i16 and to simulate the
+// wrap around semantics for unsigned in C.
+def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
+                      (M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
+
+def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
+         (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
+                             (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+
+// Multiplies two v4i16 vectors.
+def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
+         (S2_vtrunewh (vmpyh (HiReg $Rs), (HiReg $Rt)),
+                      (vmpyh (LoReg $Rs), (LoReg $Rt)))>;
+
+def VMPYB_no_V5: OutPatFrag<(ops node:$Rs, node:$Rt),
+  (S2_vtrunewh (vmpyh (HiReg (S2_vsxtbh $Rs)), (HiReg (S2_vsxtbh $Rt))),
+               (vmpyh (LoReg (S2_vsxtbh $Rs)), (LoReg (S2_vsxtbh $Rt))))>;
+
+// Multiplies two v4i8 vectors.
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+         (S2_vtrunehb (M5_vmpybsu V4I8:$Rs, V4I8:$Rt))>,
+     Requires<[HasV5T]>;
+
+def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
+         (S2_vtrunehb (VMPYB_no_V5 V4I8:$Rs, V4I8:$Rt))>;
+
+// Multiplies two v8i8 vectors.
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+         (A2_combinew (S2_vtrunehb (M5_vmpybsu (HiReg $Rs), (HiReg $Rt))),
+                      (S2_vtrunehb (M5_vmpybsu (LoReg $Rs), (LoReg $Rt))))>,
+     Requires<[HasV5T]>;
+
+def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
+         (A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))),
+                      (S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>;
+
+def SDTHexagonBinOp64 : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>;
+
+def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>;
+def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>;
+def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>;
+def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>;
+
+class ShufflePat<InstHexagon MI, SDNode Op>
+  : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)),
+        (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>;
+
+// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b
+def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>;
+
+// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b
+def: ShufflePat<S2_shuffob, HexagonSHUFFOB>;
+
+// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h
+def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>;
+
+// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h
+def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>;
+
+
+// Truncated store from v4i16 to v4i8.
+def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr),
+    [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8; }]>;
+
+// Truncated store from v2i32 to v2i16.
+def truncstorev2i16: PatFrag<(ops node:$val, node:$ptr),
+                             (truncstore node:$val, node:$ptr),
+    [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16; }]>;
+
+def: Pat<(truncstorev2i16 V2I32:$Rs, I32:$Rt),
+         (S2_storeri_io I32:$Rt, 0, (LoReg (S2_packhl (HiReg $Rs),
+                                                      (LoReg $Rs))))>;
+
+def: Pat<(truncstorev4i8 V4I16:$Rs, I32:$Rt),
+         (S2_storeri_io I32:$Rt, 0, (S2_vtrunehb V4I16:$Rs))>;
+
+
+// Zero and sign extended load from v2i8 into v2i16.
+def zextloadv2i8: PatFrag<(ops node:$ptr), (zextload node:$ptr),
+    [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+def sextloadv2i8: PatFrag<(ops node:$ptr), (sextload node:$ptr),
+    [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v2i8; }]>;
+
+def: Pat<(v2i16 (zextloadv2i8 I32:$Rs)),
+         (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0))))>;
+
+def: Pat<(v2i16 (sextloadv2i8 I32:$Rs)),
+         (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0))))>;
+
+def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
+         (S2_vzxthw (LoReg (v4i16 (S2_vzxtbh (L2_loadruh_io I32:$Rs, 0)))))>;
+
+def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
+         (S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
new file mode 100644
index 000000000000..ee3209354688
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -0,0 +1,338 @@
+//===-- HexagonPeephole.cpp - Hexagon Peephole Optimiztions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This peephole pass optimizes in the following cases.
+// 1. Optimizes redundant sign extends for the following case
+//    Transform the following pattern
+//    %vreg170<def> = SXTW %vreg166
+//    ...
+//    %vreg176<def> = COPY %vreg170:isub_lo
+//
+//    Into
+//    %vreg176<def> = COPY vreg166
+//
+//  2. Optimizes redundant negation of predicates.
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2
+//     ...
+//     %vreg16<def> = NOT_p %vreg15<kill>
+//     ...
+//     JMP_c %vreg16<kill>, <BB#1>, %PC<imp-def,dead>
+//
+//     Into
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2;
+//     ...
+//     JMP_cNot %vreg15<kill>, <BB#1>, %PC<imp-def,dead>;
+//
+// Note: The peephole pass makes the instrucstions like
+// %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
+// redundant and relies on some form of dead removal instructions, like
+// DCE or DIE to actually eliminate them.
+
+
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-peephole"
+
+static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Peephole Optimization"));
+
+static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of PNotP"));
+
+static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
+    cl::Hidden, cl::ZeroOrMore, cl::init(true),
+    cl::desc("Disable Optimization of Sign/Zero Extends"));
+
+static cl::opt<bool> DisableOptExtTo64("disable-hexagon-opt-ext-to-64",
+    cl::Hidden, cl::ZeroOrMore, cl::init(true),
+    cl::desc("Disable Optimization of extensions to i64."));
+
+namespace llvm {
+  FunctionPass *createHexagonPeephole();
+  void initializeHexagonPeepholePass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonPeephole : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+    const MachineRegisterInfo *MRI;
+
+  public:
+    static char ID;
+    HexagonPeephole() : MachineFunctionPass(ID) {
+      initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    StringRef getPassName() const override {
+      return "Hexagon optimize redundant zero and size extends";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
+  };
+}
+
+char HexagonPeephole::ID = 0;
+
+INITIALIZE_PASS(HexagonPeephole, "hexagon-peephole", "Hexagon Peephole",
+                false, false)
+
+bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  QRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  DenseMap<unsigned, unsigned> PeepholeMap;
+  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
+
+  if (DisableHexagonPeephole) return false;
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock *MBB = &*MBBb;
+    PeepholeMap.clear();
+    PeepholeDoubleRegsMap.clear();
+
+    // Traverse the basic block.
+    for (MachineInstr &MI : *MBB) {
+      // Look for sign extends:
+      // %vreg170<def> = SXTW %vreg166
+      if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
+        assert(MI.getNumOperands() == 2);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src = MI.getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = SXTW %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for  %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
+      // %vreg170:DoublRegs, %vreg169:IntRegs
+      if (!DisableOptExtTo64 && MI.getOpcode() == Hexagon::A4_combineir) {
+        assert(MI.getNumOperands() == 3);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src1 = MI.getOperand(1);
+        MachineOperand &Src2 = MI.getOperand(2);
+        if (Src1.getImm() != 0)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src2.getReg();
+        PeepholeMap[DstReg] = SrcReg;
+      }
+
+      // Look for this sequence below
+      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+      // %vregIntReg = COPY %vregDoubleReg1:isub_lo.
+      // and convert into
+      // %vregIntReg = COPY %vregDoubleReg0:isub_hi.
+      if (MI.getOpcode() == Hexagon::S2_lsr_i_p) {
+        assert(MI.getNumOperands() == 3);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src1 = MI.getOperand(1);
+        MachineOperand &Src2 = MI.getOperand(2);
+        if (Src2.getImm() != 32)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src1.getReg();
+        PeepholeDoubleRegsMap[DstReg] =
+          std::make_pair(*&SrcReg, Hexagon::isub_hi);
+      }
+
+      // Look for P=NOT(P).
+      if (!DisablePNotP && MI.getOpcode() == Hexagon::C2_not) {
+        assert(MI.getNumOperands() == 2);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src = MI.getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = NOT_xx %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for copy:
+      // %vreg176<def> = COPY %vreg170:isub_lo
+      if (!DisableOptSZExt && MI.isCopy()) {
+        assert(MI.getNumOperands() == 2);
+        MachineOperand &Dst = MI.getOperand(0);
+        MachineOperand &Src = MI.getOperand(1);
+
+        // Make sure we are copying the lower 32 bits.
+        if (Src.getSubReg() != Hexagon::isub_lo)
+          continue;
+
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Try to find in the map.
+          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
+            // Change the 1st operand.
+            MI.RemoveOperand(1);
+            MI.addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+          } else  {
+            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+              PeepholeDoubleRegsMap.find(SrcReg);
+            if (DI != PeepholeDoubleRegsMap.end()) {
+              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+              MI.RemoveOperand(1);
+              MI.addOperand(MachineOperand::CreateReg(
+                  PeepholeSrc.first, false /*isDef*/, false /*isImp*/,
+                  false /*isKill*/, false /*isDead*/, false /*isUndef*/,
+                  false /*isEarlyClobber*/, PeepholeSrc.second));
+            }
+          }
+        }
+      }
+
+      // Look for Predicated instructions.
+      if (!DisablePNotP) {
+        bool Done = false;
+        if (QII->isPredicated(MI)) {
+          MachineOperand &Op0 = MI.getOperand(0);
+          unsigned Reg0 = Op0.getReg();
+          const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
+          if (RC0->getID() == Hexagon::PredRegsRegClassID) {
+            // Handle instructions that have a prediate register in op0
+            // (most cases of predicable instructions).
+            if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
+              // Try to find in the map.
+              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
+                // Change the 1st operand and, flip the opcode.
+                MI.getOperand(0).setReg(PeepholeSrc);
+                MRI->clearKillFlags(PeepholeSrc);
+                int NewOp = QII->getInvertedPredicatedOpcode(MI.getOpcode());
+                MI.setDesc(QII->get(NewOp));
+                Done = true;
+              }
+            }
+          }
+        }
+
+        if (!Done) {
+          // Handle special instructions.
+          unsigned Op = MI.getOpcode();
+          unsigned NewOp = 0;
+          unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.
+
+          switch (Op) {
+            case Hexagon::C2_mux:
+            case Hexagon::C2_muxii:
+              NewOp = Op;
+              break;
+            case Hexagon::C2_muxri:
+              NewOp = Hexagon::C2_muxir;
+              break;
+            case Hexagon::C2_muxir:
+              NewOp = Hexagon::C2_muxri;
+              break;
+          }
+          if (NewOp) {
+            unsigned PSrc = MI.getOperand(PR).getReg();
+            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
+              MI.getOperand(PR).setReg(POrig);
+              MRI->clearKillFlags(POrig);
+              MI.setDesc(QII->get(NewOp));
+              // Swap operands S1 and S2.
+              MachineOperand Op1 = MI.getOperand(S1);
+              MachineOperand Op2 = MI.getOperand(S2);
+              ChangeOpInto(MI.getOperand(S1), Op2);
+              ChangeOpInto(MI.getOperand(S2), Op1);
+            }
+          } // if (NewOp)
+        } // if (!Done)
+
+      } // if (!DisablePNotP)
+
+    } // Instruction
+  } // Basic Block
+  return true;
+}
+
+void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
+  assert (&Dst != &Src && "Cannot duplicate into itself");
+  switch (Dst.getType()) {
+    case MachineOperand::MO_Register:
+      if (Src.isReg()) {
+        Dst.setReg(Src.getReg());
+        Dst.setSubReg(Src.getSubReg());
+        MRI->clearKillFlags(Src.getReg());
+      } else if (Src.isImm()) {
+        Dst.ChangeToImmediate(Src.getImm());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    case MachineOperand::MO_Immediate:
+      if (Src.isImm()) {
+        Dst.setImm(Src.getImm());
+      } else if (Src.isReg()) {
+        Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
+                             false, Src.isDead(), Src.isUndef(),
+                             Src.isDebug());
+        Dst.setSubReg(Src.getSubReg());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    default:
+      llvm_unreachable("Unexpected dst operand type");
+      break;
+  }
+}
+
+FunctionPass *llvm::createHexagonPeephole() {
+  return new HexagonPeephole();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
new file mode 100644
index 000000000000..30640e19ebac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -0,0 +1,330 @@
+//===--- HexagonRDFOpt.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "RDFCopy.h"
+#include "RDFDeadCode.h"
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+namespace llvm {
+  void initializeHexagonRDFOptPass(PassRegistry&);
+  FunctionPass *createHexagonRDFOpt();
+}
+
+namespace {
+  unsigned RDFCount = 0;
+  cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX));
+  cl::opt<bool> RDFDump("rdf-dump", cl::init(false));
+
+  class HexagonRDFOpt : public MachineFunctionPass {
+  public:
+    HexagonRDFOpt() : MachineFunctionPass(ID) {
+      initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry());
+    }
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominanceFrontier>();
+      AU.setPreservesAll();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    StringRef getPassName() const override {
+      return "Hexagon RDF optimizations";
+    }
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    static char ID;
+
+  private:
+    MachineDominatorTree *MDT;
+    MachineRegisterInfo *MRI;
+  };
+
+  char HexagonRDFOpt::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false)
+
+
+namespace {
+struct HexagonCP : public CopyPropagation {
+  HexagonCP(DataFlowGraph &G) : CopyPropagation(G) {}
+  bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) override;
+};
+
+
+struct HexagonDCE : public DeadCodeElimination {
+  HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI)
+    : DeadCodeElimination(G, MRI) {}
+  bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove);
+  void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum);
+
+  bool run();
+};
+} // end anonymous namespace
+
+
+bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
+  auto mapRegs = [MI,&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
+    EM.insert(std::make_pair(DstR, SrcR));
+  };
+
+  DataFlowGraph &DFG = getDFG();
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case Hexagon::A2_combinew: {
+      const MachineOperand &DstOp = MI->getOperand(0);
+      const MachineOperand &HiOp = MI->getOperand(1);
+      const MachineOperand &LoOp = MI->getOperand(2);
+      assert(DstOp.getSubReg() == 0 && "Unexpected subregister");
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_hi),
+              DFG.makeRegRef(HiOp.getReg(),  HiOp.getSubReg()));
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), Hexagon::isub_lo),
+              DFG.makeRegRef(LoOp.getReg(), LoOp.getSubReg()));
+      return true;
+    }
+    case Hexagon::A2_addi: {
+      const MachineOperand &A = MI->getOperand(2);
+      if (!A.isImm() || A.getImm() != 0)
+        return false;
+      LLVM_FALLTHROUGH;
+    }
+    case Hexagon::A2_tfr: {
+      const MachineOperand &DstOp = MI->getOperand(0);
+      const MachineOperand &SrcOp = MI->getOperand(1);
+      mapRegs(DFG.makeRegRef(DstOp.getReg(), DstOp.getSubReg()),
+              DFG.makeRegRef(SrcOp.getReg(), SrcOp.getSubReg()));
+      return true;
+    }
+  }
+
+  return CopyPropagation::interpretAsCopy(MI, EM);
+}
+
+
+bool HexagonDCE::run() {
+  bool Collected = collect();
+  if (!Collected)
+    return false;
+
+  const SetVector<NodeId> &DeadNodes = getDeadNodes();
+  const SetVector<NodeId> &DeadInstrs = getDeadInstrs();
+
+  typedef DenseMap<NodeId,NodeId> RefToInstrMap;
+  RefToInstrMap R2I;
+  SetVector<NodeId> PartlyDead;
+  DataFlowGraph &DFG = getDFG();
+
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+    for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) {
+      NodeAddr<StmtNode*> SA = TA;
+      for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) {
+        R2I.insert(std::make_pair(RA.Id, SA.Id));
+        if (DFG.IsDef(RA) && DeadNodes.count(RA.Id))
+          if (!DeadInstrs.count(SA.Id))
+            PartlyDead.insert(SA.Id);
+      }
+    }
+  }
+
+
+  // Nodes to remove.
+  SetVector<NodeId> Remove = DeadInstrs;
+
+  bool Changed = false;
+  for (NodeId N : PartlyDead) {
+    auto SA = DFG.addr<StmtNode*>(N);
+    if (trace())
+      dbgs() << "Partly dead: " << *SA.Addr->getCode();
+    Changed |= rewrite(SA, Remove);
+  }
+
+  return erase(Remove) || Changed;
+}
+
+
+void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) {
+  MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+
+  auto getOpNum = [MI] (MachineOperand &Op) -> unsigned {
+    for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i)
+      if (&MI->getOperand(i) == &Op)
+        return i;
+    llvm_unreachable("Invalid operand");
+  };
+  DenseMap<NodeId,unsigned> OpMap;
+  DataFlowGraph &DFG = getDFG();
+  NodeList Refs = IA.Addr->members(DFG);
+  for (NodeAddr<RefNode*> RA : Refs)
+    OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp())));
+
+  MI->RemoveOperand(OpNum);
+
+  for (NodeAddr<RefNode*> RA : Refs) {
+    unsigned N = OpMap[RA.Id];
+    if (N < OpNum)
+      RA.Addr->setRegRef(&MI->getOperand(N), DFG);
+    else if (N > OpNum)
+      RA.Addr->setRegRef(&MI->getOperand(N-1), DFG);
+  }
+}
+
+
+bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) {
+  if (!getDFG().IsCode<NodeAttrs::Stmt>(IA))
+    return false;
+  DataFlowGraph &DFG = getDFG();
+  MachineInstr &MI = *NodeAddr<StmtNode*>(IA).Addr->getCode();
+  auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII());
+  if (HII.getAddrMode(MI) != HexagonII::PostInc)
+    return false;
+  unsigned Opc = MI.getOpcode();
+  unsigned OpNum, NewOpc;
+  switch (Opc) {
+    case Hexagon::L2_loadri_pi:
+      NewOpc = Hexagon::L2_loadri_io;
+      OpNum = 1;
+      break;
+    case Hexagon::L2_loadrd_pi:
+      NewOpc = Hexagon::L2_loadrd_io;
+      OpNum = 1;
+      break;
+    case Hexagon::V6_vL32b_pi:
+      NewOpc = Hexagon::V6_vL32b_ai;
+      OpNum = 1;
+      break;
+    case Hexagon::S2_storeri_pi:
+      NewOpc = Hexagon::S2_storeri_io;
+      OpNum = 0;
+      break;
+    case Hexagon::S2_storerd_pi:
+      NewOpc = Hexagon::S2_storerd_io;
+      OpNum = 0;
+      break;
+    case Hexagon::V6_vS32b_pi:
+      NewOpc = Hexagon::V6_vS32b_ai;
+      OpNum = 0;
+      break;
+    default:
+      return false;
+  }
+  auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool {
+    return getDeadNodes().count(DA.Id);
+  };
+  NodeList Defs;
+  MachineOperand &Op = MI.getOperand(OpNum);
+  for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) {
+    if (&DA.Addr->getOp() != &Op)
+      continue;
+    Defs = DFG.getRelatedRefs(IA, DA);
+    if (!all_of(Defs, IsDead))
+      return false;
+    break;
+  }
+
+  // Mark all nodes in Defs for removal.
+  for (auto D : Defs)
+    Remove.insert(D.Id);
+
+  if (trace())
+    dbgs() << "Rewriting: " << MI;
+  MI.setDesc(HII.get(NewOpc));
+  MI.getOperand(OpNum+2).setImm(0);
+  removeOperand(IA, OpNum);
+  if (trace())
+    dbgs() << "       to: " << MI;
+
+  return true;
+}
+
+
+bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  if (RDFLimit.getPosition()) {
+    if (RDFCount >= RDFLimit)
+      return false;
+    RDFCount++;
+  }
+
+  MDT = &getAnalysis<MachineDominatorTree>();
+  const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+  const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed;
+
+  if (RDFDump)
+    MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr);
+
+  TargetOperandInfo TOI(HII);
+  DataFlowGraph G(MF, HII, HRI, *MDT, MDF, TOI);
+  // Dead phi nodes are necessary for copy propagation: we can add a use
+  // of a register in a block where it would need a phi node, but which
+  // was dead (and removed) during the graph build time.
+  G.build(BuildOptions::KeepDeadPhis);
+
+  if (RDFDump)
+    dbgs() << "Starting copy propagation on: " << MF.getName() << '\n'
+           << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
+  HexagonCP CP(G);
+  CP.trace(RDFDump);
+  Changed = CP.run();
+
+  if (RDFDump)
+    dbgs() << "Starting dead code elimination on: " << MF.getName() << '\n'
+           << PrintNode<FuncNode*>(G.getFunc(), G) << '\n';
+  HexagonDCE DCE(G, *MRI);
+  DCE.trace(RDFDump);
+  Changed |= DCE.run();
+
+  if (Changed) {
+    if (RDFDump)
+      dbgs() << "Starting liveness recomputation on: " << MF.getName() << '\n';
+    Liveness LV(*MRI, G);
+    LV.trace(RDFDump);
+    LV.computeLiveIns();
+    LV.resetLiveIns();
+    LV.resetKills();
+  }
+
+  if (RDFDump)
+    MF.print(dbgs() << "After " << getPassName() << "\n", nullptr);
+
+  return false;
+}
+
+
+FunctionPass *llvm::createHexagonRDFOpt() {
+  return new HexagonRDFOpt();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
new file mode 100644
index 000000000000..d3f230d3f8a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -0,0 +1,272 @@
+//===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+HexagonRegisterInfo::HexagonRegisterInfo()
+    : HexagonGenRegisterInfo(Hexagon::R31) {}
+
+
+bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const {
+  return R == Hexagon::R0 || R == Hexagon::R1 || R == Hexagon::R2 ||
+         R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1;
+}
+
+bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
+  return Hexagon::R16 <= Reg && Reg <= Hexagon::R27;
+}
+
+
+const MCPhysReg *
+HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
+      const TargetRegisterClass *RC) const {
+  using namespace Hexagon;
+
+  static const MCPhysReg Int32[] = {
+    R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, 0
+  };
+  static const MCPhysReg Int64[] = {
+    D0, D1, D2, D3, D4, D5, D6, D7, 0
+  };
+  static const MCPhysReg Pred[] = {
+    P0, P1, P2, P3, 0
+  };
+  static const MCPhysReg VecSgl[] = {
+     V0,  V1,  V2,  V3,  V4,  V5,  V6,  V7,  V8,  V9, V10, V11, V12, V13,
+    V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27,
+    V28, V29, V30, V31,   0
+  };
+  static const MCPhysReg VecDbl[] = {
+    W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, 0
+  };
+
+  switch (RC->getID()) {
+    case IntRegsRegClassID:
+      return Int32;
+    case DoubleRegsRegClassID:
+      return Int64;
+    case PredRegsRegClassID:
+      return Pred;
+    case VectorRegsRegClassID:
+    case VectorRegs128BRegClassID:
+      return VecSgl;
+    case VecDblRegsRegClassID:
+    case VecDblRegs128BRegClassID:
+      return VecDbl;
+    default:
+      break;
+  }
+
+  static const MCPhysReg Empty[] = { 0 };
+#ifndef NDEBUG
+  dbgs() << "Register class: " << getRegClassName(RC) << "\n";
+#endif
+  llvm_unreachable("Unexpected register class");
+  return Empty;
+}
+
+
+const MCPhysReg *
+HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegsV3[] = {
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  // Functions that contain a call to __builtin_eh_return also save the first 4
+  // parameter registers.
+  static const MCPhysReg CalleeSavedRegsV3EHReturn[] = {
+    Hexagon::R0,    Hexagon::R1,    Hexagon::R2,    Hexagon::R3,
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  bool HasEHReturn = MF->getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
+
+  switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
+  case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
+  case HexagonSubtarget::V55:
+  case HexagonSubtarget::V60:
+    return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
+  }
+
+  llvm_unreachable("Callee saved registers requested for unknown architecture "
+                   "version");
+}
+
+
+BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
+  const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(Hexagon::R29);
+  Reserved.set(Hexagon::R30);
+  Reserved.set(Hexagon::R31);
+  Reserved.set(Hexagon::PC);
+  Reserved.set(Hexagon::D14);
+  Reserved.set(Hexagon::D15);
+  Reserved.set(Hexagon::LC0);
+  Reserved.set(Hexagon::LC1);
+  Reserved.set(Hexagon::SA0);
+  Reserved.set(Hexagon::SA1);
+  Reserved.set(Hexagon::UGP);
+  Reserved.set(Hexagon::GP);
+  Reserved.set(Hexagon::CS0);
+  Reserved.set(Hexagon::CS1);
+  Reserved.set(Hexagon::CS);
+  Reserved.set(Hexagon::USR);
+  return Reserved;
+}
+
+
+void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOp,
+                                              RegScavenger *RS) const {
+  //
+  // Hexagon_TODO: Do we need to enforce this for Hexagon?
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MB = *MI.getParent();
+  MachineFunction &MF = *MB.getParent();
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto &HFI = *HST.getFrameLowering();
+
+  unsigned BP = 0;
+  int FI = MI.getOperand(FIOp).getIndex();
+  // Select the base pointer (BP) and calculate the actual offset from BP
+  // to the beginning of the object at index FI.
+  int Offset = HFI.getFrameIndexReference(MF, FI, BP);
+  // Add the offset from the instruction.
+  int RealOffset = Offset + MI.getOperand(FIOp+1).getImm();
+  bool IsKill = false;
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::PS_fia:
+      MI.setDesc(HII.get(Hexagon::A2_addi));
+      MI.getOperand(FIOp).ChangeToImmediate(RealOffset);
+      MI.RemoveOperand(FIOp+1);
+      return;
+    case Hexagon::PS_fi:
+      // Set up the instruction for updating below.
+      MI.setDesc(HII.get(Hexagon::A2_addi));
+      break;
+  }
+
+  if (!HII.isValidOffset(Opc, RealOffset)) {
+    // If the offset is not valid, calculate the address in a temporary
+    // register and use it with offset 0.
+    auto &MRI = MF.getRegInfo();
+    unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
+    const DebugLoc &DL = MI.getDebugLoc();
+    BuildMI(MB, II, DL, HII.get(Hexagon::A2_addi), TmpR)
+      .addReg(BP)
+      .addImm(RealOffset);
+    BP = TmpR;
+    RealOffset = 0;
+    IsKill = true;
+  }
+
+  MI.getOperand(FIOp).ChangeToRegister(BP, false, false, IsKill);
+  MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset);
+}
+
+
+unsigned HexagonRegisterInfo::getRARegister() const {
+  return Hexagon::R31;
+}
+
+
+unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
+                                               &MF) const {
+  const HexagonFrameLowering *TFI = getFrameLowering(MF);
+  if (TFI->hasFP(MF))
+    return getFrameRegister();
+  return getStackRegister();
+}
+
+
+unsigned HexagonRegisterInfo::getFrameRegister() const {
+  return Hexagon::R30;
+}
+
+
+unsigned HexagonRegisterInfo::getStackRegister() const {
+  return Hexagon::R29;
+}
+
+
+unsigned HexagonRegisterInfo::getHexagonSubRegIndex(
+      const TargetRegisterClass *RC, unsigned GenIdx) const {
+  assert(GenIdx == Hexagon::ps_sub_lo || GenIdx == Hexagon::ps_sub_hi);
+
+  static const unsigned ISub[] = { Hexagon::isub_lo, Hexagon::isub_hi };
+  static const unsigned VSub[] = { Hexagon::vsub_lo, Hexagon::vsub_hi };
+
+  switch (RC->getID()) {
+    case Hexagon::CtrRegs64RegClassID:
+    case Hexagon::DoubleRegsRegClassID:
+      return ISub[GenIdx];
+    case Hexagon::VecDblRegsRegClassID:
+    case Hexagon::VecDblRegs128BRegClassID:
+      return VSub[GenIdx];
+  }
+
+  if (const TargetRegisterClass *SuperRC = *RC->getSuperClasses())
+    return getHexagonSubRegIndex(SuperRC, GenIdx);
+
+  llvm_unreachable("Invalid register class");
+}
+
+bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF)
+      const {
+  return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF);
+}
+
+
+unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
+  return Hexagon::R6;
+}
+
+
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
new file mode 100644
index 000000000000..1fb295b5bd8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -0,0 +1,84 @@
+//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
+
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "HexagonGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace Hexagon {
+  // Generic (pseudo) subreg indices for use with getHexagonSubRegIndex.
+  enum { ps_sub_lo = 0, ps_sub_hi = 1 };
+}
+
+class HexagonRegisterInfo : public HexagonGenRegisterInfo {
+public:
+  HexagonRegisterInfo();
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF)
+        const override;
+
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+        unsigned FIOperandNum, RegScavenger *RS = nullptr) const override;
+
+  /// Returns true since we may need scavenging for a temporary register
+  /// when generating hardware loop instructions.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  /// Returns true. Spill code for predicate registers might need an extra
+  /// register.
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  /// Returns true if the frame pointer is valid.
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getFrameRegister() const;
+  unsigned getStackRegister() const;
+
+  unsigned getHexagonSubRegIndex(const TargetRegisterClass *RC,
+        unsigned GenIdx) const;
+
+  const MCPhysReg *getCallerSavedRegs(const MachineFunction *MF,
+        const TargetRegisterClass *RC) const;
+
+  unsigned getFirstCallerSavedNonParamReg() const;
+
+  bool isEHReturnCalleeSaveReg(unsigned Reg) const;
+  bool isCalleeSaveReg(unsigned Reg) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
new file mode 100644
index 000000000000..a75f3514dbd2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -0,0 +1,286 @@
+//===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Hexagon register file.
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Hexagon" in {
+
+  class HexagonReg<bits<5> num, string n, list<string> alt = [],
+                   list<Register> alias = []> : Register<n, alt> {
+    field bits<5> Num;
+    let Aliases = alias;
+    let HWEncoding{4-0} = num;
+  }
+
+  class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs,
+                         list<string> alt = []> :
+        RegisterWithSubRegs<n, subregs> {
+    field bits<5> Num;
+
+    let AltNames = alt;
+    let HWEncoding{4-0} = num;
+  }
+
+  // Registers are identified with 5-bit ID numbers.
+  // Ri - 32-bit integer registers.
+  class Ri<bits<5> num, string n, list<string> alt = []> :
+        HexagonReg<num, n, alt> {
+    let Num = num;
+  }
+
+  // Rf - 32-bit floating-point registers.
+  class Rf<bits<5> num, string n> : HexagonReg<num, n> {
+    let Num = num;
+  }
+
+
+  // Rd - 64-bit registers.
+  class Rd<bits<5> num, string n, list<Register> subregs,
+           list<string> alt = []> :
+        HexagonDoubleReg<num, n, subregs, alt> {
+    let Num = num;
+    let SubRegs = subregs;
+  }
+
+  // Rp - predicate registers
+  class Rp<bits<5> num, string n> : HexagonReg<num, n> {
+    let Num = num;
+  }
+
+
+  // Rq - vector predicate registers
+  class Rq<bits<3> num, string n> : Register<n, []> {
+    let HWEncoding{2-0} = num;
+  }
+
+  // Rc - control registers
+  class Rc<bits<5> num, string n,
+           list<string> alt = [], list<Register> alias = []> : 
+        HexagonReg<num, n, alt, alias> {
+    let Num = num;
+  }
+
+  // Rcc - 64-bit control registers.
+  class Rcc<bits<5> num, string n, list<Register> subregs,
+            list<string> alt = []> :
+        HexagonDoubleReg<num, n, subregs, alt> {
+    let Num = num;
+    let SubRegs = subregs;
+  }
+
+  // Mx - address modifier registers
+  class Mx<bits<1> num, string n> : HexagonReg<{0b0000, num}, n> {
+    let Num = !cast<bits<5>>(num);
+  }
+
+  def isub_lo  : SubRegIndex<32>;
+  def isub_hi  : SubRegIndex<32, 32>;
+  def vsub_lo  : SubRegIndex<512>;
+  def vsub_hi  : SubRegIndex<512, 512>;
+  def subreg_overflow : SubRegIndex<1, 0>;
+
+  // Integer registers.
+  foreach i = 0-28 in {
+    def R#i  : Ri<i, "r"#i>,  DwarfRegNum<[i]>;
+  }
+
+  def R29 : Ri<29, "r29", ["sp"]>, DwarfRegNum<[29]>;
+  def R30 : Ri<30, "r30", ["fp"]>, DwarfRegNum<[30]>;
+  def R31 : Ri<31, "r31", ["lr"]>, DwarfRegNum<[31]>;
+
+  // Aliases of the R* registers used to hold 64-bit int values (doubles).
+  let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+  def D0  : Rd< 0,  "r1:0",  [R0,  R1]>,  DwarfRegNum<[32]>;
+  def D1  : Rd< 2,  "r3:2",  [R2,  R3]>,  DwarfRegNum<[34]>;
+  def D2  : Rd< 4,  "r5:4",  [R4,  R5]>,  DwarfRegNum<[36]>;
+  def D3  : Rd< 6,  "r7:6",  [R6,  R7]>,  DwarfRegNum<[38]>;
+  def D4  : Rd< 8,  "r9:8",  [R8,  R9]>,  DwarfRegNum<[40]>;
+  def D5  : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>;
+  def D6  : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>;
+  def D7  : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>;
+  def D8  : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>;
+  def D9  : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>;
+  def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>;
+  def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>;
+  def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>;
+  def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>;
+  def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>;
+  def D15 : Rd<30, "r31:30", [R30, R31], ["lr:fp"]>, DwarfRegNum<[62]>;
+  }
+
+  // Predicate registers.
+  def P0 : Rp<0, "p0">, DwarfRegNum<[63]>;
+  def P1 : Rp<1, "p1">, DwarfRegNum<[64]>;
+  def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
+  def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
+
+  // Modifier registers.
+  // C6 and C7 can also be M0 and M1, but register names must be unique, even
+  // if belonging to different register classes.
+  def M0 : Mx<0, "m0">, DwarfRegNum<[72]>;
+  def M1 : Mx<1, "m1">, DwarfRegNum<[73]>;
+
+  // Fake register to represent USR.OVF bit. Artihmetic/saturating instruc-
+  // tions modify this bit, and multiple such instructions are allowed in the
+  // same packet. We need to ignore output dependencies on this bit, but not
+  // on the entire USR.
+  def USR_OVF : Rc<?, "usr.ovf">;
+
+  def USR  : Rc<8,  "usr",       ["c8"]>,   DwarfRegNum<[75]> {
+    let SubRegIndices = [subreg_overflow];
+    let SubRegs = [USR_OVF];
+  }
+
+  // Control registers.
+  def SA0  : Rc<0,  "sa0",       ["c0"]>,   DwarfRegNum<[67]>;
+  def LC0  : Rc<1,  "lc0",       ["c1"]>,   DwarfRegNum<[68]>;
+  def SA1  : Rc<2,  "sa1",       ["c2"]>,   DwarfRegNum<[69]>;
+  def LC1  : Rc<3,  "lc1",       ["c3"]>,   DwarfRegNum<[70]>;
+  def P3_0 : Rc<4,  "p3:0",      ["c4"], [P0, P1, P2, P3]>,
+                                            DwarfRegNum<[71]>;
+  def C5   : Rc<5,  "c5",        ["c5"]>,   DwarfRegNum<[72]>; // future use
+  def C6   : Rc<6,  "c6",        [], [M0]>, DwarfRegNum<[73]>;
+  def C7   : Rc<7,  "c7",        [], [M1]>, DwarfRegNum<[74]>;
+  // Define C8 separately and make it aliased with USR.
+  // The problem is that USR has subregisters (e.g. overflow). If USR was
+  // specified as a subregister of C9_8, it would imply that subreg_overflow
+  // and isub_lo can be composed, which leads to all kinds of issues
+  // with lane masks.
+  def C8   : Rc<8,  "c8",       [], [USR]>, DwarfRegNum<[75]>;
+  def PC   : Rc<9,  "pc">,                  DwarfRegNum<[76]>;
+  def UGP  : Rc<10, "ugp",       ["c10"]>,  DwarfRegNum<[77]>;
+  def GP   : Rc<11, "gp",        ["c11"]>,  DwarfRegNum<[78]>;
+  def CS0  : Rc<12, "cs0",       ["c12"]>,  DwarfRegNum<[79]>;
+  def CS1  : Rc<13, "cs1",       ["c13"]>,  DwarfRegNum<[80]>;
+  def UPCL : Rc<14, "upcyclelo", ["c14"]>,  DwarfRegNum<[81]>;
+  def UPCH : Rc<15, "upcyclehi", ["c15"]>,  DwarfRegNum<[82]>;
+}
+
+  // Control registers pairs.
+  let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
+    def C1_0   : Rcc<0,   "c1:0",  [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+    def C3_2   : Rcc<2,   "c3:2",  [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+    def C5_4   : Rcc<4,   "c5:4",  [P3_0, C5]>,              DwarfRegNum<[71]>;
+    def C7_6   : Rcc<6,   "c7:6",  [C6, C7],   ["m1:0"]>,    DwarfRegNum<[72]>;
+    // Use C8 instead of USR as a subregister of C9_8.
+    def C9_8   : Rcc<8,   "c9:8",  [C8, PC]>,                DwarfRegNum<[74]>;
+    def C11_10 : Rcc<10, "c11:10", [UGP, GP]>,               DwarfRegNum<[76]>;
+    def CS     : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>,   DwarfRegNum<[78]>;
+    def UPC    : Rcc<14, "c15:14", [UPCL, UPCH]>,            DwarfRegNum<[80]>;
+  }
+
+  foreach i = 0-31 in {
+    def V#i  : Ri<i, "v"#i>,  DwarfRegNum<[!add(i, 99)]>;
+  }
+
+  // Aliases of the V* registers used to hold double vec values.
+  let SubRegIndices = [vsub_lo, vsub_hi], CoveredBySubRegs = 1 in {
+  def W0  : Rd< 0,  "v1:0",  [V0,  V1]>,  DwarfRegNum<[99]>;
+  def W1  : Rd< 2,  "v3:2",  [V2,  V3]>,  DwarfRegNum<[101]>;
+  def W2  : Rd< 4,  "v5:4",  [V4,  V5]>,  DwarfRegNum<[103]>;
+  def W3  : Rd< 6,  "v7:6",  [V6,  V7]>,  DwarfRegNum<[105]>;
+  def W4  : Rd< 8,  "v9:8",  [V8,  V9]>,  DwarfRegNum<[107]>;
+  def W5  : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>;
+  def W6  : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>;
+  def W7  : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>;
+  def W8  : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>;
+  def W9  : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>;
+  def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>;
+  def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>;
+  def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>;
+  def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>;
+  def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>;
+  def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>;
+  }
+
+  // Vector Predicate registers.
+  def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>;
+  def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>;
+  def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>;
+  def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
+                            (add (sequence "R%u", 0, 9),
+                                 (sequence "R%u", 12, 28),
+                                 R10, R11, R29, R30, R31)> {
+}
+
+// Registers are listed in reverse order for allocation preference reasons.
+def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
+                                (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
+
+def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
+                               (add (sequence "D%u", 0, 4),
+                                    (sequence "D%u", 6, 13), D5, D14, D15)>;
+
+def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
+                               (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs : RegisterClass<"Hexagon",
+                         [v128i8, v64i16, v32i32, v16i64], 1024,
+                               (add (sequence "W%u", 0, 15))>;
+
+def VectorRegs128B : RegisterClass<"Hexagon",
+                         [v128i8, v64i16, v32i32, v16i64], 1024,
+                               (add (sequence "V%u", 0, 31))>;
+
+def VecDblRegs128B : RegisterClass<"Hexagon",
+                         [v256i8,v128i16,v64i32,v32i64], 2048,
+                               (add (sequence "W%u", 0, 15))>;
+
+def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512,
+                                (add (sequence "Q%u", 0, 3))>;
+
+def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024,
+                                   (add (sequence "Q%u", 0, 3))>;
+
+def PredRegs : RegisterClass<"Hexagon", 
+                             [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32,
+                             (add (sequence "P%u", 0, 3))>
+{
+  let Size = 32;
+}
+
+let Size = 32 in
+def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
+
+let Size = 32, isAllocatable = 0 in
+def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
+                            (add LC0, SA0, LC1, SA1,
+                                 P3_0, C5,
+                                 M0, M1, C6, C7, C8, CS0, CS1, UPCL, UPCH,
+                                 USR, UGP, GP, PC)>;
+
+let isAllocatable = 0 in
+def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
+
+let Size = 64, isAllocatable = 0 in
+def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
+                              (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
+
+def VolatileV3 {
+  list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
+                         R28, R31,
+                         P0, P1, P2, P3,
+                         M0, M1,
+                         LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
+                         V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
+                         V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
+                         V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
+                         W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
+                         W12, W13, W14, W15,
+                         Q0, Q1, Q2, Q3];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
new file mode 100644
index 000000000000..6e4987b7e4e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -0,0 +1,24 @@
+//===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV4.td"
+
+// V55 Machine Info +
+include "HexagonScheduleV55.td"
+
+//===----------------------------------------------------------------------===//
+// V60 Machine Info -
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV60.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
new file mode 100644
index 000000000000..7416baab392c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -0,0 +1,203 @@
+//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
+// This file describes that machine information.
+
+//
+//    |===========|==================================================|
+//    | PIPELINE  |              Instruction Classes                 |
+//    |===========|==================================================|
+//    | SLOT0     |  LD       ST    ALU32     MEMOP     NV    SYSTEM |
+//    |-----------|--------------------------------------------------|
+//    | SLOT1     |  LD       ST    ALU32                            |
+//    |-----------|--------------------------------------------------|
+//    | SLOT2     |  XTYPE          ALU32     J         JR           |
+//    |-----------|--------------------------------------------------|
+//    | SLOT3     |  XTYPE          ALU32     J         CR           |
+//    |===========|==================================================|
+
+// Functional Units.
+def SLOT0       : FuncUnit;
+def SLOT1       : FuncUnit;
+def SLOT2       : FuncUnit;
+def SLOT3       : FuncUnit;
+// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
+// rather than taking an execution slot. This special unit is needed
+// to schedule an ENDLOOP with 4 other instructions.
+def SLOT_ENDLOOP: FuncUnit;
+
+// Itinerary classes.
+def PSEUDO      : InstrItinClass;
+def PSEUDOM     : InstrItinClass;
+// ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4.
+def DUPLEX      : InstrItinClass;
+def PREFIX      : InstrItinClass;
+def COMPOUND_CJ_ARCHDEPSLOT    : InstrItinClass;
+def COMPOUND    : InstrItinClass;
+
+def ALU32_2op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_2op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2_SLOT0123  : InstrItinClass;
+def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
+def ALU64_tc_1_SLOT23        : InstrItinClass;
+def ALU64_tc_2_SLOT23        : InstrItinClass;
+def ALU64_tc_2early_SLOT23   : InstrItinClass;
+def ALU64_tc_3x_SLOT23       : InstrItinClass;
+def CR_tc_2_SLOT3            : InstrItinClass;
+def CR_tc_2early_SLOT23      : InstrItinClass;
+def CR_tc_2early_SLOT3       : InstrItinClass;
+def CR_tc_3x_SLOT23          : InstrItinClass;
+def CR_tc_3x_SLOT3           : InstrItinClass;
+def J_tc_2early_SLOT23       : InstrItinClass;
+def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT       : InstrItinClass;
+def J_tc_2early_SLOT2        : InstrItinClass;
+def LD_tc_ld_SLOT01          : InstrItinClass;
+def LD_tc_ld_SLOT0           : InstrItinClass;
+def LD_tc_3or4stall_SLOT0    : InstrItinClass;
+def M_tc_2_SLOT23            : InstrItinClass;
+def M_tc_3_SLOT23            : InstrItinClass;
+def M_tc_1_SLOT23            : InstrItinClass;
+def M_tc_3x_SLOT23           : InstrItinClass;
+def M_tc_3or4x_SLOT23        : InstrItinClass;
+def ST_tc_st_SLOT01          : InstrItinClass;
+def ST_tc_st_SLOT0           : InstrItinClass;
+def ST_tc_ld_SLOT0           : InstrItinClass;
+def ST_tc_3stall_SLOT0       : InstrItinClass;
+def S_2op_tc_1_SLOT23        : InstrItinClass;
+def S_2op_tc_2_SLOT23        : InstrItinClass;
+def S_2op_tc_2early_SLOT23   : InstrItinClass;
+def S_2op_tc_3or4x_SLOT23    : InstrItinClass;
+def S_3op_tc_1_SLOT23        : InstrItinClass;
+def S_3op_tc_2_SLOT23        : InstrItinClass;
+def S_3op_tc_2early_SLOT23   : InstrItinClass;
+def S_3op_tc_3_SLOT23        : InstrItinClass;
+def S_3op_tc_3x_SLOT23       : InstrItinClass;
+def NCJ_tc_3or4stall_SLOT0   : InstrItinClass;
+def V2LDST_tc_ld_SLOT01      : InstrItinClass;
+def V2LDST_tc_st_SLOT0       : InstrItinClass;
+def V2LDST_tc_st_SLOT01      : InstrItinClass;
+def V4LDST_tc_ld_SLOT01      : InstrItinClass;
+def V4LDST_tc_st_SLOT0       : InstrItinClass;
+def V4LDST_tc_st_SLOT01      : InstrItinClass;
+def J_tc_2early_SLOT0123     : InstrItinClass;
+def EXTENDER_tc_1_SLOT0123   : InstrItinClass;
+def S_3op_tc_3stall_SLOT23   : InstrItinClass;
+
+def HexagonItinerariesV4 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // CR -> System
+        InstrItinData<CR_tc_2_SLOT3          , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_2early_SLOT3     , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT3         , [InstrStage<1, [SLOT3]>]>,
+
+        // Jump (conditional/unconditional/return etc)
+        // CR
+        InstrItinData<CR_tc_2early_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT23        , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // J
+        InstrItinData<J_tc_2early_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // JR
+        InstrItinData<J_tc_2early_SLOT2      , [InstrStage<1, [SLOT2]>]>,
+
+        //Load
+        InstrItinData<LD_tc_ld_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<LD_tc_3or4stall_SLOT0  , [InstrStage<1, [SLOT0]>]>,
+
+        // M
+        InstrItinData<M_tc_1_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_SLOT23         , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // Store
+        // ST
+        InstrItinData<ST_tc_st_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        // ST0
+        InstrItinData<ST_tc_st_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ST_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+
+        // S
+        InstrItinData<S_2op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_3or4x_SLOT23  , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3stall_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+        // SYS
+        InstrItinData<ST_tc_3stall_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+
+        // New Value Compare Jump
+        InstrItinData<NCJ_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+        // Mem ops - MEM_V4
+        InstrItinData<V2LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V2LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V2LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+
+        // ENDLOOP
+        InstrItinData<J_tc_2early_SLOT0123   , [InstrStage<1, [SLOT_ENDLOOP]>]>,
+
+        // Extender/PREFIX
+        InstrItinData<EXTENDER_tc_1_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>]>
+      ]>;
+
+def HexagonModelV4 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV4;
+  let LoadLatency = 1;
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V4 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
new file mode 100644
index 000000000000..b2a75f7200d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -0,0 +1,194 @@
+//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
+// This file describes that machine information.
+
+//
+//    |===========|==================================================|
+//    | PIPELINE  |              Instruction Classes                 |
+//    |===========|==================================================|
+//    | SLOT0     |  LD       ST    ALU32     MEMOP     NV    SYSTEM |
+//    |-----------|--------------------------------------------------|
+//    | SLOT1     |  LD       ST    ALU32                            |
+//    |-----------|--------------------------------------------------|
+//    | SLOT2     |  XTYPE          ALU32     J         JR           |
+//    |-----------|--------------------------------------------------|
+//    | SLOT3     |  XTYPE          ALU32     J         CR           |
+//    |===========|==================================================|
+
+def CJ_tc_1_SLOT23              : InstrItinClass;
+def CJ_tc_2early_SLOT23         : InstrItinClass;
+def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
+def COPROC_VX_vtc_long_SLOT23   : InstrItinClass;
+def COPROC_VX_vtc_SLOT23        : InstrItinClass;
+def J_tc_3stall_SLOT2           : InstrItinClass;
+def MAPPING_tc_1_SLOT0123       : InstrItinClass;
+def M_tc_3stall_SLOT23          : InstrItinClass;
+
+def HexagonItinerariesV55 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123     ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123     ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123     ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123    ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [1, 1, 1]>,
+        InstrItinData<ALU64_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<ALU64_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [3, 1, 1]>,
+
+        // CR -> System
+        InstrItinData<CR_tc_2_SLOT3      , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+        InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+        InstrItinData<CR_tc_3x_SLOT3     , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
+
+        // Jump (conditional/unconditional/return etc)
+        InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                           [2, 1, 1, 1]>,
+        InstrItinData<CR_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                           [3, 1, 1, 1]>,
+        InstrItinData<CJ_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                           [1, 1, 1, 1]>,
+        InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                           [2, 1, 1, 1]>,
+        InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                           [2, 1, 1, 1]>,
+        InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
+                                 [InstrStage<1, [SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+        // JR
+        InstrItinData<J_tc_2early_SLOT2  , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
+        InstrItinData<J_tc_3stall_SLOT2  , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
+
+        // Extender
+        InstrItinData<EXTENDER_tc_1_SLOT0123,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+        // Load
+        InstrItinData<LD_tc_ld_SLOT01      , [InstrStage<1, [SLOT0, SLOT1]>],
+                                             [2, 1]>,
+        InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
+        InstrItinData<LD_tc_ld_SLOT0       , [InstrStage<1, [SLOT0]>], [2, 1]>,
+
+        // M
+        InstrItinData<M_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [1, 1, 1]>,
+        InstrItinData<M_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+        InstrItinData<M_tc_3_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [1, 1, 1]>,
+        InstrItinData<M_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+        InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+        InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+
+        // Store
+        InstrItinData<ST_tc_st_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>],
+                                          [1, 1, 1]>,
+        InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+        InstrItinData<ST_tc_ld_SLOT0    , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
+        InstrItinData<ST_tc_st_SLOT0    , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+
+        // S
+        InstrItinData<S_2op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [1, 1, 1]>,
+        InstrItinData<S_2op_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [3, 1, 1]>,
+        InstrItinData<S_3op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [1, 1, 1]>,
+        InstrItinData<S_3op_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [2, 1, 1]>,
+        InstrItinData<S_3op_tc_3_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [3, 1, 1]>,
+        InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [3, 1, 1]>,
+        InstrItinData<S_3op_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                              [3, 1, 1]>,
+
+        // New Value Compare Jump
+        InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
+                                              [3, 1, 1, 1]>,
+
+        // Mem ops
+        InstrItinData<V2LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>],
+                                            [1, 1, 1, 1]>,
+        InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                            [2, 1, 1, 1]>,
+        InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                            [1, 1, 1, 1]>,
+        InstrItinData<V4LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>],
+                                            [1, 1, 1, 1]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                            [3, 1, 1, 1]>,
+        InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                            [1, 1, 1, 1]>,
+
+        // Endloop
+        InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
+                                            [2]>,
+
+        // Vector
+        InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+                      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 1]>,
+        InstrItinData<COPROC_VX_vtc_long_SLOT23  ,
+                      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
+        InstrItinData<COPROC_VX_vtc_SLOT23 ,
+                      [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
+        InstrItinData<MAPPING_tc_1_SLOT0123      ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                      [1, 1, 1, 1]>,
+
+        // Misc
+        InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>],
+                                                [1, 1, 1]>,
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>],
+                                 [1, 1, 1]>,
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                               [1, 1, 1]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                               [1, 1, 1]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>
+      ]>;
+
+def HexagonModelV55 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV55;
+  let LoadLatency = 1;
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V4 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
new file mode 100644
index 000000000000..dc2ce43b0579
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -0,0 +1,301 @@
+//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+def CVI_ST     : FuncUnit;
+def CVI_XLANE  : FuncUnit;
+def CVI_SHIFT  : FuncUnit;
+def CVI_MPY0   : FuncUnit;
+def CVI_MPY1   : FuncUnit;
+def CVI_LD     : FuncUnit;
+
+// Combined functional units.
+def CVI_XLSHF  : FuncUnit;
+def CVI_MPY01  : FuncUnit;
+def CVI_ALL    : FuncUnit;
+
+// Combined functional unit data.
+def HexagonComboFuncsV60 :
+    ComboFuncUnits<[
+      ComboFuncData<CVI_XLSHF    , [CVI_XLANE, CVI_SHIFT]>,
+      ComboFuncData<CVI_MPY01    , [CVI_MPY0, CVI_MPY1]>,
+      ComboFuncData<CVI_ALL      , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+                                    CVI_MPY0, CVI_MPY1, CVI_LD]>
+    ]>;
+
+// Note: When adding additional vector scheduling classes, add the
+// corresponding methods to the class HexagonInstrInfo.
+def CVI_VA           : InstrItinClass;
+def CVI_VA_DV        : InstrItinClass;
+def CVI_VX_LONG      : InstrItinClass;
+def CVI_VX_LATE      : InstrItinClass;
+def CVI_VX           : InstrItinClass;
+def CVI_VX_DV_LONG   : InstrItinClass;
+def CVI_VX_DV        : InstrItinClass;
+def CVI_VX_DV_SLOT2  : InstrItinClass;
+def CVI_VP           : InstrItinClass;
+def CVI_VP_LONG      : InstrItinClass;
+def CVI_VP_VS_EARLY  : InstrItinClass;
+def CVI_VP_VS_LONG_EARLY   : InstrItinClass;
+def CVI_VP_VS_LONG   : InstrItinClass;
+def CVI_VP_VS   : InstrItinClass;
+def CVI_VP_DV        : InstrItinClass;
+def CVI_VS           : InstrItinClass;
+def CVI_VINLANESAT   : InstrItinClass;
+def CVI_VM_LD        : InstrItinClass;
+def CVI_VM_TMP_LD    : InstrItinClass;
+def CVI_VM_CUR_LD    : InstrItinClass;
+def CVI_VM_VP_LDU    : InstrItinClass;
+def CVI_VM_ST        : InstrItinClass;
+def CVI_VM_NEW_ST    : InstrItinClass;
+def CVI_VM_STU       : InstrItinClass;
+def CVI_HIST         : InstrItinClass;
+def CVI_VA_EXT       : InstrItinClass;
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine.
+// This file describes that machine information.
+//
+//    |===========|==================================================|
+//    | PIPELINE  |              Instruction Classes                 |
+//    |===========|==================================================|
+//    | SLOT0     |  LD       ST    ALU32     MEMOP     NV    SYSTEM |
+//    |-----------|--------------------------------------------------|
+//    | SLOT1     |  LD       ST    ALU32                            |
+//    |-----------|--------------------------------------------------|
+//    | SLOT2     |  XTYPE          ALU32     J         JR           |
+//    |-----------|--------------------------------------------------|
+//    | SLOT3     |  XTYPE          ALU32     J         CR           |
+//    |===========|==================================================|
+//
+//
+// In addition to using the above SLOTS, there are also six vector pipelines
+// in the CVI co-processor in the Hexagon V60 machine.
+//
+//      |=========| |=========| |=========| |=========| |=========| |=========|
+// SLOT | CVI_LD  | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST  |
+// ==== |=========| |=========| |=========| |=========| |=========| |=========|
+// S0-3 |         | | CVI_VA  | | CVI_VA  | | CVI_VA  | | CVI_VA  | |         |
+// S2-3 |         | | CVI_VX  | | CVI_VX  | |         | |         | |         |
+// S0-3 |         | |         | |         | |         | | CVI_VP  | |         |
+// S0-3 |         | |         | |         | | CVI_VS  | |         | |         |
+// S0-1 |(CVI_LD) | | CVI_LD  | | CVI_LD  | | CVI_LD  | | CVI_LD  | |         |
+// S0-1 |(C*TMP_LD) |         | |         | |         | |         | |         |
+// S01  |(C*_LDU) | |         | |         | |         | | C*_LDU  | |         |
+// S0   |         | | CVI_ST  | | CVI_ST  | | CVI_ST  | | CVI_ST  | |(CVI_ST) |
+// S0   |         | |         | |         | |         | |         | |(C*TMP_ST)
+// S01  |         | |         | |         | |         | | VSTU    | |(C*_STU) |
+//      |=========| |=========| |=========| |=========| |=========| |=========|
+//                  |=====================| |=====================|
+//                  | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT|
+//                  |=====================| |=====================|
+// S0-3             | CVI_VA_DV           | | CVI_VA_DV           |
+// S0-3             |                     | | CVI_VP_DV           |
+// S2-3             | CVI_VX_DV           | |                     |
+//                  |=====================| |=====================|
+//      |=====================================================================|
+// S0-3 | CVI_HIST   Histogram                                                |
+// S0123| CVI_VA_EXT Extract                                                  |
+//      |=====================================================================|
+
+def HexagonItinerariesV60 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+                            CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+                            CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123     ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123     ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123     ,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                      [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123    ,
+                      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2_SLOT23     , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_3x_SLOT23    , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+        // CR -> System
+        InstrItinData<CR_tc_2_SLOT3      , [InstrStage<2, [SLOT3]>]>,
+        InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT3     , [InstrStage<3, [SLOT3]>]>,
+
+        // Jump (conditional/unconditional/return etc)
+        InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT23    , [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<CJ_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // JR
+        InstrItinData<J_tc_2early_SLOT2  , [InstrStage<2, [SLOT2]>]>,
+        InstrItinData<J_tc_3stall_SLOT2  , [InstrStage<3, [SLOT2]>]>,
+
+        // Extender
+        InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
+                              [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // Load
+        InstrItinData<LD_tc_ld_SLOT01      , [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+        InstrItinData<LD_tc_ld_SLOT0       , [InstrStage<3, [SLOT0]>]>,
+
+        // M
+        InstrItinData<M_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_SLOT23     , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3_SLOT23     , [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_SLOT23    , [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+        // Store
+        InstrItinData<ST_tc_st_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
+        InstrItinData<ST_tc_ld_SLOT0    , [InstrStage<3, [SLOT0]>]>,
+        InstrItinData<ST_tc_st_SLOT0    , [InstrStage<1, [SLOT0]>]>,
+
+        // S
+        InstrItinData<S_2op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2_SLOT23     , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+        // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
+        InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2_SLOT23     , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3_SLOT23     , [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3x_SLOT23    , [InstrStage<3, [SLOT2, SLOT3]>]>,
+
+        // New Value Compare Jump
+        InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
+
+        // Mem ops
+        InstrItinData<V2LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
+        InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        // Endloop
+        InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
+
+        // Vector
+        InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+                             [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<COPROC_VX_vtc_long_SLOT23  ,
+                             [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<COPROC_VX_vtc_SLOT23 ,
+                             [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<MAPPING_tc_1_SLOT0123      ,
+                             [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // Duplex and Compound
+        InstrItinData<DUPLEX     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<COMPOUND_CJ_ARCHDEPSLOT   , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // Misc
+        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM    , [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // Latest CVI spec definitions.
+        InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE,CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VA_DV,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>,
+        InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VX_DV_LONG,
+                                   [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY01]>]>,
+        InstrItinData<CVI_VX_DV,
+                                   [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY01]>]>,
+        InstrItinData<CVI_VX_DV_SLOT2,
+                                   [InstrStage<1, [SLOT2], 0>,
+                                    InstrStage<1, [CVI_MPY01]>]>,
+        InstrItinData<CVI_VP,      [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE]>]>,
+        InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE]>]>,
+        InstrItinData<CVI_VP_VS_EARLY,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>]>,
+        InstrItinData<CVI_VP_VS_LONG,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>]>,
+        InstrItinData<CVI_VP_VS,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>]>,
+        InstrItinData<CVI_VP_VS_LONG_EARLY,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>]>,
+        InstrItinData<CVI_VP_DV  , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>]>,
+        InstrItinData<CVI_VS,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_SHIFT]>]>,
+        InstrItinData<CVI_VINLANESAT,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_SHIFT]>]>,
+        InstrItinData<CVI_VM_LD  , [InstrStage<1, [SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD]>]>,
+        InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>,
+                                    InstrStage<1, [SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE]>]>,
+        InstrItinData<CVI_VM_ST  , [InstrStage<1, [SLOT0], 0>,
+                                    InstrStage<1, [CVI_ST], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>]>,
+        InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>,
+                                    InstrStage<1, [CVI_ST]>]>,
+        InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>,
+                                    InstrStage<1, [SLOT1], 0>,
+                                    InstrStage<1, [CVI_ST], 0>,
+                                    InstrStage<1, [CVI_XLANE]>]>,
+        InstrItinData<CVI_HIST   , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_ALL]>]>
+      ]>;
+
+def HexagonModelV60 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV60;
+  let LoadLatency = 1;
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V60 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..10730536080e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -0,0 +1,63 @@
+//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HexagonSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-selectiondag-info"
+
+SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize)
+    return SDValue();
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (SizeVal < 32 || (SizeVal % 8) != 0)
+    return SDValue();
+
+  // Special case aligned memcpys with size >= 32 bytes and a multiple of 8.
+  //
+  const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Args.push_back(Entry);
+  Entry.Node = Src;
+  Args.push_back(Entry);
+  Entry.Node = Size;
+  Args.push_back(Entry);
+
+  const char *SpecialMemcpyName =
+      "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
+  const MachineFunction &MF = DAG.getMachineFunction();
+  bool LongCalls = MF.getSubtarget<HexagonSubtarget>().useLongCalls();
+  unsigned Flags = LongCalls ? HexagonII::HMOTF_ConstExtended : 0;
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(Chain)
+      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                 Type::getVoidTy(*DAG.getContext()),
+                 DAG.getTargetExternalSymbol(SpecialMemcpyName,
+                      TLI.getPointerTy(DAG.getDataLayout()), Flags),
+                 std::move(Args))
+      .setDiscardResult();
+
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+  return CallResult.second;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
new file mode 100644
index 000000000000..a83a8efb7588
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -0,0 +1,35 @@
+//===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Hexagon subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class HexagonSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  explicit HexagonSelectionDAGInfo() = default;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
new file mode 100644
index 000000000000..68484344fded
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -0,0 +1,115 @@
+//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When the compiler is invoked with no small data, for instance, with the -G0
+// command line option, then all CONST* opcodes should be broken down into
+// appropriate LO and HI instructions. This splitting is done by this pass.
+// The only reason this is not done in the DAG lowering itself is that there
+// is no simple way of getting the register allocator to allot the same hard
+// register to the result of LO and HI instructions. This pass is always
+// scheduled after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xfer"
+
+namespace llvm {
+  FunctionPass *createHexagonSplitConst32AndConst64();
+  void initializeHexagonSplitConst32AndConst64Pass(PassRegistry&);
+}
+
+namespace {
+  class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
+  public:
+    static char ID;
+    HexagonSplitConst32AndConst64() : MachineFunctionPass(ID) {
+      PassRegistry &R = *PassRegistry::getPassRegistry();
+      initializeHexagonSplitConst32AndConst64Pass(R);
+    }
+    StringRef getPassName() const override {
+      return "Hexagon Split Const32s and Const64s";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+  };
+}
+
+char HexagonSplitConst32AndConst64::ID = 0;
+
+INITIALIZE_PASS(HexagonSplitConst32AndConst64, "split-const-for-sdata",
+      "Hexagon Split Const32s and Const64s", false, false)
+
+bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
+  const HexagonTargetObjectFile &TLOF =
+      *static_cast<const HexagonTargetObjectFile *>(
+          Fn.getTarget().getObjFileLowering());
+  if (TLOF.isSmallDataEnabled())
+    return true;
+
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+
+  // Loop over all of the basic blocks
+  for (MachineBasicBlock &B : Fn) {
+    for (auto I = B.begin(), E = B.end(); I != E; ) {
+      MachineInstr &MI = *I;
+      ++I;
+      unsigned Opc = MI.getOpcode();
+
+      if (Opc == Hexagon::CONST32) {
+        unsigned DestReg = MI.getOperand(0).getReg();
+        uint64_t ImmValue = MI.getOperand(1).getImm();
+        const DebugLoc &DL = MI.getDebugLoc();
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestReg)
+            .addImm(ImmValue);
+        B.erase(&MI);
+      } else if (Opc == Hexagon::CONST64) {
+        unsigned DestReg = MI.getOperand(0).getReg();
+        int64_t ImmValue = MI.getOperand(1).getImm();
+        const DebugLoc &DL = MI.getDebugLoc();
+        unsigned DestLo = TRI->getSubReg(DestReg, Hexagon::isub_lo);
+        unsigned DestHi = TRI->getSubReg(DestReg, Hexagon::isub_hi);
+
+        int32_t LowWord = (ImmValue & 0xFFFFFFFF);
+        int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
+
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestLo)
+            .addImm(LowWord);
+        BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), DestHi)
+            .addImm(HighWord);
+        B.erase(&MI);
+      }
+    }
+  }
+
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonSplitConst32AndConst64() {
+  return new HexagonSplitConst32AndConst64();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
new file mode 100644
index 000000000000..2c937216d463
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -0,0 +1,1205 @@
+//===--- HexagonSplitDouble.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hsdr"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+  FunctionPass *createHexagonSplitDoubleRegs();
+  void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1),
+      cl::desc("Maximum number of split partitions"));
+  static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true),
+      cl::desc("Do not split loads or stores"));
+
+  class HexagonSplitDoubleRegs : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr),
+        TII(nullptr) {
+      initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry());
+    }
+
+    StringRef getPassName() const override {
+      return "Hexagon Split Double Registers";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    static const TargetRegisterClass *const DoubleRC;
+
+    const HexagonRegisterInfo *TRI;
+    const HexagonInstrInfo *TII;
+    const MachineLoopInfo *MLI;
+    MachineRegisterInfo *MRI;
+
+    typedef std::set<unsigned> USet;
+    typedef std::map<unsigned,USet> UUSetMap;
+    typedef std::pair<unsigned,unsigned> UUPair;
+    typedef std::map<unsigned,UUPair> UUPairMap;
+    typedef std::map<const MachineLoop*,USet> LoopRegMap;
+
+    bool isInduction(unsigned Reg, LoopRegMap &IRM) const;
+    bool isVolatileInstr(const MachineInstr *MI) const;
+    bool isFixedInstr(const MachineInstr *MI) const;
+    void partitionRegisters(UUSetMap &P2Rs);
+    int32_t profit(const MachineInstr *MI) const;
+    bool isProfitable(const USet &Part, LoopRegMap &IRM) const;
+
+    void collectIndRegsForLoop(const MachineLoop *L, USet &Rs);
+    void collectIndRegs(LoopRegMap &IRM);
+
+    void createHalfInstr(unsigned Opc, MachineInstr *MI,
+        const UUPairMap &PairMap, unsigned SubR);
+    void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap);
+    void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap);
+    void splitCombine(MachineInstr *MI, const UUPairMap &PairMap);
+    void splitExt(MachineInstr *MI, const UUPairMap &PairMap);
+    void splitShift(MachineInstr *MI, const UUPairMap &PairMap);
+    void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap);
+    bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap);
+    void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap);
+    void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap);
+    bool splitPartition(const USet &Part);
+
+    static int Counter;
+    static void dump_partition(raw_ostream&, const USet&,
+       const TargetRegisterInfo&);
+  };
+
+  char HexagonSplitDoubleRegs::ID;
+  int HexagonSplitDoubleRegs::Counter = 0;
+  const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC
+      = &Hexagon::DoubleRegsRegClass;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
+  "Hexagon Split Double Registers", false, false)
+
+void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
+      const USet &Part, const TargetRegisterInfo &TRI) {
+  dbgs() << '{';
+  for (auto I : Part)
+    dbgs() << ' ' << PrintReg(I, &TRI);
+  dbgs() << " }";
+}
+
+bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
+  for (auto I : IRM) {
+    const USet &Rs = I.second;
+    if (Rs.find(Reg) != Rs.end())
+      return true;
+  }
+  return false;
+}
+
+bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const {
+  for (auto &I : MI->memoperands())
+    if (I->isVolatile())
+      return true;
+  return false;
+}
+
+bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const {
+  if (MI->mayLoad() || MI->mayStore())
+    if (MemRefsFixed || isVolatileInstr(MI))
+      return true;
+  if (MI->isDebugValue())
+    return false;
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    default:
+      return true;
+
+    case TargetOpcode::PHI:
+    case TargetOpcode::COPY:
+      break;
+
+    case Hexagon::L2_loadrd_io:
+      // Not handling stack stores (only reg-based addresses).
+      if (MI->getOperand(1).isReg())
+        break;
+      return true;
+    case Hexagon::S2_storerd_io:
+      // Not handling stack stores (only reg-based addresses).
+      if (MI->getOperand(0).isReg())
+        break;
+      return true;
+    case Hexagon::L2_loadrd_pi:
+    case Hexagon::S2_storerd_pi:
+
+    case Hexagon::A2_tfrpi:
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineir:
+    case Hexagon::A4_combineii:
+    case Hexagon::A4_combineri:
+    case Hexagon::A2_combinew:
+    case Hexagon::CONST64:
+
+    case Hexagon::A2_sxtw:
+
+    case Hexagon::A2_andp:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_xorp:
+    case Hexagon::S2_asl_i_p_or:
+    case Hexagon::S2_asl_i_p:
+    case Hexagon::S2_asr_i_p:
+    case Hexagon::S2_lsr_i_p:
+      break;
+  }
+
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg())
+      continue;
+    unsigned R = Op.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      return true;
+  }
+  return false;
+}
+
+void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
+  typedef std::map<unsigned,unsigned> UUMap;
+  typedef std::vector<unsigned> UVect;
+
+  unsigned NumRegs = MRI->getNumVirtRegs();
+  BitVector DoubleRegs(NumRegs);
+  for (unsigned i = 0; i < NumRegs; ++i) {
+    unsigned R = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->getRegClass(R) == DoubleRC)
+      DoubleRegs.set(i);
+  }
+
+  BitVector FixedRegs(NumRegs);
+  for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+    unsigned R = TargetRegisterInfo::index2VirtReg(x);
+    MachineInstr *DefI = MRI->getVRegDef(R);
+    // In some cases a register may exist, but never be defined or used.
+    // It should never appear anywhere, but mark it as "fixed", just to be
+    // safe.
+    if (!DefI || isFixedInstr(DefI))
+      FixedRegs.set(x);
+  }
+
+  UUSetMap AssocMap;
+  for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+    if (FixedRegs[x])
+      continue;
+    unsigned R = TargetRegisterInfo::index2VirtReg(x);
+    DEBUG(dbgs() << PrintReg(R, TRI) << " ~~");
+    USet &Asc = AssocMap[R];
+    for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
+         U != Z; ++U) {
+      MachineOperand &Op = *U;
+      MachineInstr *UseI = Op.getParent();
+      if (isFixedInstr(UseI))
+        continue;
+      for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) {
+        MachineOperand &MO = UseI->getOperand(i);
+        // Skip non-registers or registers with subregisters.
+        if (&MO == &Op || !MO.isReg() || MO.getSubReg())
+          continue;
+        unsigned T = MO.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(T)) {
+          FixedRegs.set(x);
+          continue;
+        }
+        if (MRI->getRegClass(T) != DoubleRC)
+          continue;
+        unsigned u = TargetRegisterInfo::virtReg2Index(T);
+        if (FixedRegs[u])
+          continue;
+        DEBUG(dbgs() << ' ' << PrintReg(T, TRI));
+        Asc.insert(T);
+        // Make it symmetric.
+        AssocMap[T].insert(R);
+      }
+    }
+    DEBUG(dbgs() << '\n');
+  }
+
+  UUMap R2P;
+  unsigned NextP = 1;
+  USet Visited;
+  for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
+    unsigned R = TargetRegisterInfo::index2VirtReg(x);
+    if (Visited.count(R))
+      continue;
+    // Create a new partition for R.
+    unsigned ThisP = FixedRegs[x] ? 0 : NextP++;
+    UVect WorkQ;
+    WorkQ.push_back(R);
+    for (unsigned i = 0; i < WorkQ.size(); ++i) {
+      unsigned T = WorkQ[i];
+      if (Visited.count(T))
+        continue;
+      R2P[T] = ThisP;
+      Visited.insert(T);
+      // Add all registers associated with T.
+      USet &Asc = AssocMap[T];
+      for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J)
+        WorkQ.push_back(*J);
+    }
+  }
+
+  for (auto I : R2P)
+    P2Rs[I.second].insert(I.first);
+}
+
+static inline int32_t profitImm(unsigned Lo, unsigned Hi) {
+  int32_t P = 0;
+  bool LoZ1 = false, HiZ1 = false;
+  if (Lo == 0 || Lo == 0xFFFFFFFF)
+    P += 10, LoZ1 = true;
+  if (Hi == 0 || Hi == 0xFFFFFFFF)
+    P += 10, HiZ1 = true;
+  if (!LoZ1 && !HiZ1 && Lo == Hi)
+    P += 3;
+  return P;
+}
+
+int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
+  unsigned ImmX = 0;
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::PHI:
+      for (const auto &Op : MI->operands())
+        if (!Op.getSubReg())
+          return 0;
+      return 10;
+    case TargetOpcode::COPY:
+      if (MI->getOperand(1).getSubReg() != 0)
+        return 10;
+      return 0;
+
+    case Hexagon::L2_loadrd_io:
+    case Hexagon::S2_storerd_io:
+      return -1;
+    case Hexagon::L2_loadrd_pi:
+    case Hexagon::S2_storerd_pi:
+      return 2;
+
+    case Hexagon::A2_tfrpi:
+    case Hexagon::CONST64: {
+      uint64_t D = MI->getOperand(1).getImm();
+      unsigned Lo = D & 0xFFFFFFFFULL;
+      unsigned Hi = D >> 32;
+      return profitImm(Lo, Hi);
+    }
+    case Hexagon::A2_combineii:
+    case Hexagon::A4_combineii:
+      return profitImm(MI->getOperand(1).getImm(),
+                       MI->getOperand(2).getImm());
+    case Hexagon::A4_combineri:
+      ImmX++;
+    case Hexagon::A4_combineir: {
+      ImmX++;
+      int64_t V = MI->getOperand(ImmX).getImm();
+      if (V == 0 || V == -1)
+        return 10;
+      // Fall through into A2_combinew.
+      LLVM_FALLTHROUGH;
+    }
+    case Hexagon::A2_combinew:
+      return 2;
+
+    case Hexagon::A2_sxtw:
+      return 3;
+
+    case Hexagon::A2_andp:
+    case Hexagon::A2_orp:
+    case Hexagon::A2_xorp:
+      return 1;
+
+    case Hexagon::S2_asl_i_p_or: {
+      unsigned S = MI->getOperand(3).getImm();
+      if (S == 0 || S == 32)
+        return 10;
+      return -1;
+    }
+    case Hexagon::S2_asl_i_p:
+    case Hexagon::S2_asr_i_p:
+    case Hexagon::S2_lsr_i_p:
+      unsigned S = MI->getOperand(2).getImm();
+      if (S == 0 || S == 32)
+        return 10;
+      if (S == 16)
+        return 5;
+      if (S == 48)
+        return 7;
+      return -10;
+  }
+
+  return 0;
+}
+
+bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
+      const {
+  unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+  int32_t TotalP = 0;
+
+  for (unsigned DR : Part) {
+    MachineInstr *DefI = MRI->getVRegDef(DR);
+    int32_t P = profit(DefI);
+    if (P == std::numeric_limits<int>::min())
+      return false;
+    TotalP += P;
+    // Reduce the profitability of splitting induction registers.
+    if (isInduction(DR, IRM))
+      TotalP -= 30;
+
+    for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+         U != W; ++U) {
+      MachineInstr *UseI = U->getParent();
+      if (isFixedInstr(UseI)) {
+        FixedNum++;
+        // Calculate the cost of generating REG_SEQUENCE instructions.
+        for (auto &Op : UseI->operands()) {
+          if (Op.isReg() && Part.count(Op.getReg()))
+            if (Op.getSubReg())
+              TotalP -= 2;
+        }
+        continue;
+      }
+      // If a register from this partition is used in a fixed instruction,
+      // and there is also a register in this partition that is used in
+      // a loop phi node, then decrease the splitting profit as this can
+      // confuse the modulo scheduler.
+      if (UseI->isPHI()) {
+        const MachineBasicBlock *PB = UseI->getParent();
+        const MachineLoop *L = MLI->getLoopFor(PB);
+        if (L && L->getHeader() == PB)
+          LoopPhiNum++;
+      }
+      // Splittable instruction.
+      SplitNum++;
+      int32_t P = profit(UseI);
+      if (P == std::numeric_limits<int>::min())
+        return false;
+      TotalP += P;
+    }
+  }
+
+  if (FixedNum > 0 && LoopPhiNum > 0)
+    TotalP -= 20*LoopPhiNum;
+
+  DEBUG(dbgs() << "Partition profit: " << TotalP << '\n');
+  return TotalP > 0;
+}
+
+void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
+      USet &Rs) {
+  const MachineBasicBlock *HB = L->getHeader();
+  const MachineBasicBlock *LB = L->getLoopLatch();
+  if (!HB || !LB)
+    return;
+
+  // Examine the latch branch. Expect it to be a conditional branch to
+  // the header (either "br-cond header" or "br-cond exit; br header").
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
+  MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB);
+  SmallVector<MachineOperand,2> Cond;
+  bool BadLB = TII->analyzeBranch(*TmpLB, TB, FB, Cond, false);
+  // Only analyzable conditional branches. HII::analyzeBranch will put
+  // the branch opcode as the first element of Cond, and the predicate
+  // operand as the second.
+  if (BadLB || Cond.size() != 2)
+    return;
+  // Only simple jump-conditional (with or without negation).
+  if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm()))
+    return;
+  // Must go to the header.
+  if (TB != HB && FB != HB)
+    return;
+  assert(Cond[1].isReg() && "Unexpected Cond vector from analyzeBranch");
+  // Expect a predicate register.
+  unsigned PR = Cond[1].getReg();
+  assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass);
+
+  // Get the registers on which the loop controlling compare instruction
+  // depends.
+  unsigned CmpR1 = 0, CmpR2 = 0;
+  const MachineInstr *CmpI = MRI->getVRegDef(PR);
+  while (CmpI->getOpcode() == Hexagon::C2_not)
+    CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
+
+  int Mask = 0, Val = 0;
+  bool OkCI = TII->analyzeCompare(*CmpI, CmpR1, CmpR2, Mask, Val);
+  if (!OkCI)
+    return;
+  // Eliminate non-double input registers.
+  if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC)
+    CmpR1 = 0;
+  if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC)
+    CmpR2 = 0;
+  if (!CmpR1 && !CmpR2)
+    return;
+
+  // Now examine the top of the loop: the phi nodes that could poten-
+  // tially define loop induction registers. The registers defined by
+  // such a phi node would be used in a 64-bit add, which then would
+  // be used in the loop compare instruction.
+
+  // Get the set of all double registers defined by phi nodes in the
+  // loop header.
+  typedef std::vector<unsigned> UVect;
+  UVect DP;
+  for (auto &MI : *HB) {
+    if (!MI.isPHI())
+      break;
+    const MachineOperand &MD = MI.getOperand(0);
+    unsigned R = MD.getReg();
+    if (MRI->getRegClass(R) == DoubleRC)
+      DP.push_back(R);
+  }
+  if (DP.empty())
+    return;
+
+  auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool {
+    for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end();
+         I != E; ++I) {
+      const MachineInstr *UseI = I->getParent();
+      if (UseI->getOpcode() != Hexagon::A2_addp)
+        continue;
+      // Get the output from the add. If it is one of the inputs to the
+      // loop-controlling compare instruction, then R is likely an induc-
+      // tion register.
+      unsigned T = UseI->getOperand(0).getReg();
+      if (T == CmpR1 || T == CmpR2)
+        return false;
+    }
+    return true;
+  };
+  UVect::iterator End = llvm::remove_if(DP, NoIndOp);
+  Rs.insert(DP.begin(), End);
+  Rs.insert(CmpR1);
+  Rs.insert(CmpR2);
+
+  DEBUG({
+    dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: ";
+    dump_partition(dbgs(), Rs, *TRI);
+    dbgs() << '\n';
+  });
+}
+
+void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) {
+  typedef std::vector<MachineLoop*> LoopVector;
+  LoopVector WorkQ;
+
+  for (auto I : *MLI)
+    WorkQ.push_back(I);
+  for (unsigned i = 0; i < WorkQ.size(); ++i) {
+    for (auto I : *WorkQ[i])
+      WorkQ.push_back(I);
+  }
+
+  USet Rs;
+  for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) {
+    MachineLoop *L = WorkQ[i];
+    Rs.clear();
+    collectIndRegsForLoop(L, Rs);
+    if (!Rs.empty())
+      IRM.insert(std::make_pair(L, Rs));
+  }
+}
+
+void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI,
+      const UUPairMap &PairMap, unsigned SubR) {
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstr *NewI = BuildMI(B, MI, DL, TII->get(Opc));
+
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg()) {
+      NewI->addOperand(Op);
+      continue;
+    }
+    // For register operands, set the subregister.
+    unsigned R = Op.getReg();
+    unsigned SR = Op.getSubReg();
+    bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R);
+    bool isKill = Op.isKill();
+    if (isVirtReg && MRI->getRegClass(R) == DoubleRC) {
+      isKill = false;
+      UUPairMap::const_iterator F = PairMap.find(R);
+      if (F == PairMap.end()) {
+        SR = SubR;
+      } else {
+        const UUPair &P = F->second;
+        R = (SubR == Hexagon::isub_lo) ? P.first : P.second;
+        SR = 0;
+      }
+    }
+    auto CO = MachineOperand::CreateReg(R, Op.isDef(), Op.isImplicit(), isKill,
+          Op.isDead(), Op.isUndef(), Op.isEarlyClobber(), SR, Op.isDebug(),
+          Op.isInternalRead());
+    NewI->addOperand(CO);
+  }
+}
+
+void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  bool Load = MI->mayLoad();
+  unsigned OrigOpc = MI->getOpcode();
+  bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi ||
+                  OrigOpc == Hexagon::S2_storerd_pi);
+  MachineInstr *LowI, *HighI;
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Index of the base-address-register operand.
+  unsigned AdrX = PostInc ? (Load ? 2 : 1)
+                          : (Load ? 1 : 0);
+  MachineOperand &AdrOp = MI->getOperand(AdrX);
+  unsigned RSA = getRegState(AdrOp);
+  MachineOperand &ValOp = Load ? MI->getOperand(0)
+                               : (PostInc ? MI->getOperand(3)
+                                          : MI->getOperand(2));
+  UUPairMap::const_iterator F = PairMap.find(ValOp.getReg());
+  assert(F != PairMap.end());
+
+  if (Load) {
+    const UUPair &P = F->second;
+    int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm();
+    LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first)
+             .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+             .addImm(Off);
+    HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second)
+              .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+              .addImm(Off+4);
+  } else {
+    const UUPair &P = F->second;
+    int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm();
+    LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+             .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+             .addImm(Off)
+             .addReg(P.first);
+    HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io))
+              .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg())
+              .addImm(Off+4)
+              .addReg(P.second);
+  }
+
+  if (PostInc) {
+    // Create the increment of the address register.
+    int64_t Inc = Load ? MI->getOperand(3).getImm()
+                       : MI->getOperand(2).getImm();
+    MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0);
+    const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg());
+    unsigned NewR = MRI->createVirtualRegister(RC);
+    assert(!UpdOp.getSubReg() && "Def operand with subreg");
+    BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR)
+      .addReg(AdrOp.getReg(), RSA)
+      .addImm(Inc);
+    MRI->replaceRegWith(UpdOp.getReg(), NewR);
+    // The original instruction will be deleted later.
+  }
+
+  // Generate a new pair of memory-operands.
+  MachineFunction &MF = *B.getParent();
+  for (auto &MO : MI->memoperands()) {
+    const MachinePointerInfo &Ptr = MO->getPointerInfo();
+    MachineMemOperand::Flags F = MO->getFlags();
+    int A = MO->getAlignment();
+
+    auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A);
+    LowI->addMemOperand(MF, Tmp1);
+    auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4));
+    HighI->addMemOperand(MF, Tmp2);
+  }
+}
+
+void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  assert(Op0.isReg() && Op1.isImm());
+  uint64_t V = Op1.getImm();
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &P = F->second;
+
+  // The operand to A2_tfrsi can only have 32 significant bits. Immediate
+  // values in MachineOperand are stored as 64-bit integers, and so the
+  // value -1 may be represented either as 64-bit -1, or 4294967295. Both
+  // will have the 32 higher bits truncated in the end, but -1 will remain
+  // as -1, while the latter may appear to be a large unsigned value
+  // requiring a constant extender. The casting to int32_t will select the
+  // former representation. (The same reasoning applies to all 32-bit
+  // values.)
+  BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
+    .addImm(int32_t(V & 0xFFFFFFFFULL));
+  BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
+    .addImm(int32_t(V >> 32));
+}
+
+void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  MachineOperand &Op2 = MI->getOperand(2);
+  assert(Op0.isReg());
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &P = F->second;
+
+  if (Op1.isImm()) {
+    BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second)
+      .addImm(Op1.getImm());
+  } else if (Op1.isReg()) {
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second)
+      .addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg());
+  } else
+    llvm_unreachable("Unexpected operand");
+
+  if (Op2.isImm()) {
+    BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first)
+      .addImm(Op2.getImm());
+  } else if (Op2.isReg()) {
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
+      .addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg());
+  } else
+    llvm_unreachable("Unexpected operand");
+}
+
+void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  assert(Op0.isReg() && Op1.isReg());
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &P = F->second;
+  unsigned RS = getRegState(Op1);
+
+  BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first)
+    .addReg(Op1.getReg(), RS & ~RegState::Kill, Op1.getSubReg());
+  BuildMI(B, MI, DL, TII->get(Hexagon::S2_asr_i_r), P.second)
+    .addReg(Op1.getReg(), RS, Op1.getSubReg())
+    .addImm(31);
+}
+
+void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  using namespace Hexagon;
+
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  MachineOperand &Op2 = MI->getOperand(2);
+  assert(Op0.isReg() && Op1.isReg() && Op2.isImm());
+  int64_t Sh64 = Op2.getImm();
+  assert(Sh64 >= 0 && Sh64 < 64);
+  unsigned S = Sh64;
+
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &P = F->second;
+  unsigned LoR = P.first;
+  unsigned HiR = P.second;
+
+  unsigned Opc = MI->getOpcode();
+  bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p);
+  bool Left = !Right;
+  bool Signed = (Opc == S2_asr_i_p);
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned RS = getRegState(Op1);
+  unsigned ShiftOpc = Left ? S2_asl_i_r
+                           : (Signed ? S2_asr_i_r : S2_lsr_i_r);
+  unsigned LoSR = isub_lo;
+  unsigned HiSR = isub_hi;
+
+  if (S == 0) {
+    // No shift, subregister copy.
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+      .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR)
+      .addReg(Op1.getReg(), RS, HiSR);
+  } else if (S < 32) {
+    const TargetRegisterClass *IntRC = &IntRegsRegClass;
+    unsigned TmpR = MRI->createVirtualRegister(IntRC);
+    // Expansion:
+    // Shift left:    DR = shl R, #s
+    //   LoR  = shl R.lo, #s
+    //   TmpR = extractu R.lo, #s, #32-s
+    //   HiR  = or (TmpR, asl(R.hi, #s))
+    // Shift right:   DR = shr R, #s
+    //   HiR  = shr R.hi, #s
+    //   TmpR = shr R.lo, #s
+    //   LoR  = insert TmpR, R.hi, #s, #32-s
+
+    // Shift left:
+    //   LoR  = shl R.lo, #s
+    // Shift right:
+    //   TmpR = shr R.lo, #s
+
+    // Make a special case for A2_aslh and A2_asrh (they are predicable as
+    // opposed to S2_asl_i_r/S2_asr_i_r).
+    if (S == 16 && Left)
+      BuildMI(B, MI, DL, TII->get(A2_aslh), LoR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+    else if (S == 16 && Signed)
+      BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+    else
+      BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR))
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+        .addImm(S);
+
+    if (Left) {
+      // TmpR = extractu R.lo, #s, #32-s
+      BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR)
+        .addImm(S)
+        .addImm(32-S);
+      // HiR  = or (TmpR, asl(R.hi, #s))
+      BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+        .addReg(TmpR)
+        .addReg(Op1.getReg(), RS, HiSR)
+        .addImm(S);
+    } else {
+      // HiR  = shr R.hi, #s
+      BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR)
+        .addImm(S);
+      // LoR  = insert TmpR, R.hi, #s, #32-s
+      BuildMI(B, MI, DL, TII->get(S2_insert), LoR)
+        .addReg(TmpR)
+        .addReg(Op1.getReg(), RS, HiSR)
+        .addImm(S)
+        .addImm(32-S);
+    }
+  } else if (S == 32) {
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR))
+      .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR));
+    if (!Signed)
+      BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+        .addImm(0);
+    else  // Must be right shift.
+      BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+        .addReg(Op1.getReg(), RS, HiSR)
+        .addImm(31);
+  } else if (S < 64) {
+    S -= 32;
+    if (S == 16 && Left)
+      BuildMI(B, MI, DL, TII->get(A2_aslh), HiR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR);
+    else if (S == 16 && Signed)
+      BuildMI(B, MI, DL, TII->get(A2_asrh), LoR)
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR);
+    else
+      BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR))
+        .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR))
+        .addImm(S);
+
+    if (Signed)
+      BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR)
+        .addReg(Op1.getReg(), RS, HiSR)
+        .addImm(31);
+    else
+      BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR))
+        .addImm(0);
+  }
+}
+
+void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  using namespace Hexagon;
+
+  MachineOperand &Op0 = MI->getOperand(0);
+  MachineOperand &Op1 = MI->getOperand(1);
+  MachineOperand &Op2 = MI->getOperand(2);
+  MachineOperand &Op3 = MI->getOperand(3);
+  assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm());
+  int64_t Sh64 = Op3.getImm();
+  assert(Sh64 >= 0 && Sh64 < 64);
+  unsigned S = Sh64;
+
+  UUPairMap::const_iterator F = PairMap.find(Op0.getReg());
+  assert(F != PairMap.end());
+  const UUPair &P = F->second;
+  unsigned LoR = P.first;
+  unsigned HiR = P.second;
+
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned RS1 = getRegState(Op1);
+  unsigned RS2 = getRegState(Op2);
+  const TargetRegisterClass *IntRC = &IntRegsRegClass;
+
+  unsigned LoSR = isub_lo;
+  unsigned HiSR = isub_hi;
+
+  // Op0 = S2_asl_i_p_or Op1, Op2, Op3
+  // means:  Op0 = or (Op1, asl(Op2, Op3))
+
+  // Expansion of
+  //   DR = or (R1, asl(R2, #s))
+  //
+  //   LoR  = or (R1.lo, asl(R2.lo, #s))
+  //   Tmp1 = extractu R2.lo, #s, #32-s
+  //   Tmp2 = or R1.hi, Tmp1
+  //   HiR  = or (Tmp2, asl(R2.hi, #s))
+
+  if (S == 0) {
+    // DR  = or (R1, asl(R2, #0))
+    //    -> or (R1, R2)
+    // i.e. LoR = or R1.lo, R2.lo
+    //      HiR = or R1.hi, R2.hi
+    BuildMI(B, MI, DL, TII->get(A2_or), LoR)
+      .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+      .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR);
+    BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+      .addReg(Op1.getReg(), RS1, HiSR)
+      .addReg(Op2.getReg(), RS2, HiSR);
+  } else if (S < 32) {
+    BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR)
+      .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR)
+      .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+      .addImm(S);
+    unsigned TmpR1 = MRI->createVirtualRegister(IntRC);
+    BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1)
+      .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR)
+      .addImm(S)
+      .addImm(32-S);
+    unsigned TmpR2 = MRI->createVirtualRegister(IntRC);
+    BuildMI(B, MI, DL, TII->get(A2_or), TmpR2)
+      .addReg(Op1.getReg(), RS1, HiSR)
+      .addReg(TmpR1);
+    BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+      .addReg(TmpR2)
+      .addReg(Op2.getReg(), RS2, HiSR)
+      .addImm(S);
+  } else if (S == 32) {
+    // DR  = or (R1, asl(R2, #32))
+    //    -> or R1, R2.lo
+    // LoR = R1.lo
+    // HiR = or R1.hi, R2.lo
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+      .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+    BuildMI(B, MI, DL, TII->get(A2_or), HiR)
+      .addReg(Op1.getReg(), RS1, HiSR)
+      .addReg(Op2.getReg(), RS2, LoSR);
+  } else if (S < 64) {
+    // DR  = or (R1, asl(R2, #s))
+    //
+    // LoR = R1:lo
+    // HiR = or (R1:hi, asl(R2:lo, #s-32))
+    S -= 32;
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR)
+      .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR);
+    BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR)
+      .addReg(Op1.getReg(), RS1, HiSR)
+      .addReg(Op2.getReg(), RS2, LoSR)
+      .addImm(S);
+  }
+}
+
+bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  using namespace Hexagon;
+
+  DEBUG(dbgs() << "Splitting: " << *MI);
+  bool Split = false;
+  unsigned Opc = MI->getOpcode();
+
+  switch (Opc) {
+    case TargetOpcode::PHI:
+    case TargetOpcode::COPY: {
+      unsigned DstR = MI->getOperand(0).getReg();
+      if (MRI->getRegClass(DstR) == DoubleRC) {
+        createHalfInstr(Opc, MI, PairMap, isub_lo);
+        createHalfInstr(Opc, MI, PairMap, isub_hi);
+        Split = true;
+      }
+      break;
+    }
+    case A2_andp:
+      createHalfInstr(A2_and, MI, PairMap, isub_lo);
+      createHalfInstr(A2_and, MI, PairMap, isub_hi);
+      Split = true;
+      break;
+    case A2_orp:
+      createHalfInstr(A2_or, MI, PairMap, isub_lo);
+      createHalfInstr(A2_or, MI, PairMap, isub_hi);
+      Split = true;
+      break;
+    case A2_xorp:
+      createHalfInstr(A2_xor, MI, PairMap, isub_lo);
+      createHalfInstr(A2_xor, MI, PairMap, isub_hi);
+      Split = true;
+      break;
+
+    case L2_loadrd_io:
+    case L2_loadrd_pi:
+    case S2_storerd_io:
+    case S2_storerd_pi:
+      splitMemRef(MI, PairMap);
+      Split = true;
+      break;
+
+    case A2_tfrpi:
+    case CONST64:
+      splitImmediate(MI, PairMap);
+      Split = true;
+      break;
+
+    case A2_combineii:
+    case A4_combineir:
+    case A4_combineii:
+    case A4_combineri:
+    case A2_combinew:
+      splitCombine(MI, PairMap);
+      Split = true;
+      break;
+
+    case A2_sxtw:
+      splitExt(MI, PairMap);
+      Split = true;
+      break;
+
+    case S2_asl_i_p:
+    case S2_asr_i_p:
+    case S2_lsr_i_p:
+      splitShift(MI, PairMap);
+      Split = true;
+      break;
+
+    case S2_asl_i_p_or:
+      splitAslOr(MI, PairMap);
+      Split = true;
+      break;
+
+    default:
+      llvm_unreachable("Instruction not splitable");
+      return false;
+  }
+
+  return Split;
+}
+
+void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg() || !Op.isUse() || !Op.getSubReg())
+      continue;
+    unsigned R = Op.getReg();
+    UUPairMap::const_iterator F = PairMap.find(R);
+    if (F == PairMap.end())
+      continue;
+    const UUPair &P = F->second;
+    switch (Op.getSubReg()) {
+      case Hexagon::isub_lo:
+        Op.setReg(P.first);
+        break;
+      case Hexagon::isub_hi:
+        Op.setReg(P.second);
+        break;
+    }
+    Op.setSubReg(0);
+  }
+}
+
+void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI,
+      const UUPairMap &PairMap) {
+  MachineBasicBlock &B = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+
+  for (auto &Op : MI->operands()) {
+    if (!Op.isReg() || !Op.isUse())
+      continue;
+    unsigned R = Op.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(R))
+      continue;
+    if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg())
+      continue;
+    UUPairMap::const_iterator F = PairMap.find(R);
+    if (F == PairMap.end())
+      continue;
+    const UUPair &Pr = F->second;
+    unsigned NewDR = MRI->createVirtualRegister(DoubleRC);
+    BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR)
+      .addReg(Pr.first)
+      .addImm(Hexagon::isub_lo)
+      .addReg(Pr.second)
+      .addImm(Hexagon::isub_hi);
+    Op.setReg(NewDR);
+  }
+}
+
+bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) {
+  const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+  typedef std::set<MachineInstr*> MISet;
+  bool Changed = false;
+
+  DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI);
+        dbgs() << '\n');
+
+  UUPairMap PairMap;
+
+  MISet SplitIns;
+  for (unsigned DR : Part) {
+    MachineInstr *DefI = MRI->getVRegDef(DR);
+    SplitIns.insert(DefI);
+
+    // Collect all instructions, including fixed ones.  We won't split them,
+    // but we need to visit them again to insert the REG_SEQUENCE instructions.
+    for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+         U != W; ++U)
+      SplitIns.insert(U->getParent());
+
+    unsigned LoR = MRI->createVirtualRegister(IntRC);
+    unsigned HiR = MRI->createVirtualRegister(IntRC);
+    DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> "
+                 << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n');
+    PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR)));
+  }
+
+  MISet Erase;
+  for (auto MI : SplitIns) {
+    if (isFixedInstr(MI)) {
+      collapseRegPairs(MI, PairMap);
+    } else {
+      bool Done = splitInstr(MI, PairMap);
+      if (Done)
+        Erase.insert(MI);
+      Changed |= Done;
+    }
+  }
+
+  for (unsigned DR : Part) {
+    // Before erasing "double" instructions, revisit all uses of the double
+    // registers in this partition, and replace all uses of them with subre-
+    // gisters, with the corresponding single registers.
+    MISet Uses;
+    for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end();
+         U != W; ++U)
+      Uses.insert(U->getParent());
+    for (auto M : Uses)
+      replaceSubregUses(M, PairMap);
+  }
+
+  for (auto MI : Erase) {
+    MachineBasicBlock *B = MI->getParent();
+    B->erase(MI);
+  }
+
+  return Changed;
+}
+
+bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "Splitting double registers in function: "
+        << MF.getName() << '\n');
+
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  auto &ST = MF.getSubtarget<HexagonSubtarget>();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+
+  UUSetMap P2Rs;
+  LoopRegMap IRM;
+
+  collectIndRegs(IRM);
+  partitionRegisters(P2Rs);
+
+  DEBUG({
+    dbgs() << "Register partitioning: (partition #0 is fixed)\n";
+    for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
+      dbgs() << '#' << I->first << " -> ";
+      dump_partition(dbgs(), I->second, *TRI);
+      dbgs() << '\n';
+    }
+  });
+
+  bool Changed = false;
+  int Limit = MaxHSDR;
+
+  for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) {
+    if (I->first == 0)
+      continue;
+    if (Limit >= 0 && Counter >= Limit)
+      break;
+    USet &Part = I->second;
+    DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n');
+    if (!isProfitable(Part, IRM))
+      continue;
+    Counter++;
+    Changed |= splitPartition(Part);
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonSplitDoubleRegs() {
+  return new HexagonSplitDoubleRegs();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
new file mode 100644
index 000000000000..af1bf48b6320
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -0,0 +1,609 @@
+//===--- HexagonStoreWidening.cpp------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Replace sequences of "narrow" stores to adjacent memory locations with
+// a fewer "wide" stores that have the same effect.
+// For example, replace:
+//   S4_storeirb_io  %vreg100, 0, 0   ; store-immediate-byte
+//   S4_storeirb_io  %vreg100, 1, 0   ; store-immediate-byte
+// with
+//   S4_storeirh_io  %vreg100, 0, 0   ; store-immediate-halfword
+// The above is the general idea.  The actual cases handled by the code
+// may be a bit more complex.
+// The purpose of this pass is to reduce the number of outstanding stores,
+// or as one could say, "reduce store queue pressure".  Also, wide stores
+// mean fewer stores, and since there are only two memory instructions allowed
+// per packet, it also means fewer packets, and ultimately fewer cycles.
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-widen-stores"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+  FunctionPass *createHexagonStoreWidening();
+  void initializeHexagonStoreWideningPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  struct HexagonStoreWidening : public MachineFunctionPass {
+    const HexagonInstrInfo      *TII;
+    const HexagonRegisterInfo   *TRI;
+    const MachineRegisterInfo   *MRI;
+    AliasAnalysis               *AA;
+    MachineFunction             *MF;
+
+  public:
+    static char ID;
+
+    HexagonStoreWidening() : MachineFunctionPass(ID) {
+      initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    StringRef getPassName() const override { return "Hexagon Store Widening"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addPreserved<AAResultsWrapperPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    static bool handledStoreType(const MachineInstr *MI);
+
+  private:
+    static const int MaxWideSize = 4;
+
+    typedef std::vector<MachineInstr*> InstrGroup;
+    typedef std::vector<InstrGroup> InstrGroupList;
+
+    bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO);
+    bool instrAliased(InstrGroup &Stores, const MachineInstr *MI);
+    void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin,
+        InstrGroup::iterator End, InstrGroup &Group);
+    void createStoreGroups(MachineBasicBlock &MBB,
+        InstrGroupList &StoreGroups);
+    bool processBasicBlock(MachineBasicBlock &MBB);
+    bool processStoreGroup(InstrGroup &Group);
+    bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End,
+        InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize);
+    bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize);
+    bool replaceStores(InstrGroup &OG, InstrGroup &NG);
+    bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2);
+  };
+
+char HexagonStoreWidening::ID = 0;
+
+} // end anonymous namespace
+
+// Some local helper functions...
+static unsigned getBaseAddressRegister(const MachineInstr *MI) {
+  const MachineOperand &MO = MI->getOperand(0);
+  assert(MO.isReg() && "Expecting register operand");
+  return MO.getReg();
+}
+
+static int64_t getStoreOffset(const MachineInstr *MI) {
+  unsigned OpC = MI->getOpcode();
+  assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode");
+
+  switch (OpC) {
+    case Hexagon::S4_storeirb_io:
+    case Hexagon::S4_storeirh_io:
+    case Hexagon::S4_storeiri_io: {
+      const MachineOperand &MO = MI->getOperand(1);
+      assert(MO.isImm() && "Expecting immediate offset");
+      return MO.getImm();
+    }
+  }
+  dbgs() << *MI;
+  llvm_unreachable("Store offset calculation missing for a handled opcode");
+  return 0;
+}
+
+static const MachineMemOperand &getStoreTarget(const MachineInstr *MI) {
+  assert(!MI->memoperands_empty() && "Expecting memory operands");
+  return **MI->memoperands_begin();
+}
+
+INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores",
+                "Hexason Store Widening", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores",
+                "Hexagon Store Widening", false, false)
+
+// Filtering function: any stores whose opcodes are not "approved" of by
+// this function will not be subjected to widening.
+inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) {
+  // For now, only handle stores of immediate values.
+  // Also, reject stores to stack slots.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case Hexagon::S4_storeirb_io:
+    case Hexagon::S4_storeirh_io:
+    case Hexagon::S4_storeiri_io:
+      // Base address must be a register. (Implement FI later.)
+      return MI->getOperand(0).isReg();
+    default:
+      return false;
+  }
+}
+
+// Check if the machine memory operand MMO is aliased with any of the
+// stores in the store group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+      const MachineMemOperand &MMO) {
+  if (!MMO.getValue())
+    return true;
+
+  MemoryLocation L(MMO.getValue(), MMO.getSize(), MMO.getAAInfo());
+
+  for (auto SI : Stores) {
+    const MachineMemOperand &SMO = getStoreTarget(SI);
+    if (!SMO.getValue())
+      return true;
+
+    MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo());
+    if (AA->alias(L, SL))
+      return true;
+  }
+
+  return false;
+}
+
+// Check if the machine instruction MI accesses any storage aliased with
+// any store in the group Stores.
+bool HexagonStoreWidening::instrAliased(InstrGroup &Stores,
+      const MachineInstr *MI) {
+  for (auto &I : MI->memoperands())
+    if (instrAliased(Stores, *I))
+      return true;
+  return false;
+}
+
+// Inspect a machine basic block, and generate store groups out of stores
+// encountered in the block.
+//
+// A store group is a group of stores that use the same base register,
+// and which can be reordered within that group without altering the
+// semantics of the program.  A single store group could be widened as
+// a whole, if there existed a single store instruction with the same
+// semantics as the entire group.  In many cases, a single store group
+// may need more than one wide store.
+void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB,
+      InstrGroupList &StoreGroups) {
+  InstrGroup AllInsns;
+
+  // Copy all instruction pointers from the basic block to a temporary
+  // list.  This will allow operating on the list, and modifying its
+  // elements without affecting the basic block.
+  for (auto &I : MBB)
+    AllInsns.push_back(&I);
+
+  // Traverse all instructions in the AllInsns list, and if we encounter
+  // a store, then try to create a store group starting at that instruction
+  // i.e. a sequence of independent stores that can be widened.
+  for (auto I = AllInsns.begin(), E = AllInsns.end(); I != E; ++I) {
+    MachineInstr *MI = *I;
+    // Skip null pointers (processed instructions).
+    if (!MI || !handledStoreType(MI))
+      continue;
+
+    // Found a store.  Try to create a store group.
+    InstrGroup G;
+    createStoreGroup(MI, I+1, E, G);
+    if (G.size() > 1)
+      StoreGroups.push_back(G);
+  }
+}
+
+// Create a single store group.  The stores need to be independent between
+// themselves, and also there cannot be other instructions between them
+// that could read or modify storage being stored into.
+void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore,
+      InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) {
+  assert(handledStoreType(BaseStore) && "Unexpected instruction");
+  unsigned BaseReg = getBaseAddressRegister(BaseStore);
+  InstrGroup Other;
+
+  Group.push_back(BaseStore);
+
+  for (auto I = Begin; I != End; ++I) {
+    MachineInstr *MI = *I;
+    if (!MI)
+      continue;
+
+    if (handledStoreType(MI)) {
+      // If this store instruction is aliased with anything already in the
+      // group, terminate the group now.
+      if (instrAliased(Group, getStoreTarget(MI)))
+        return;
+      // If this store is aliased to any of the memory instructions we have
+      // seen so far (that are not a part of this group), terminate the group.
+      if (instrAliased(Other, getStoreTarget(MI)))
+        return;
+
+      unsigned BR = getBaseAddressRegister(MI);
+      if (BR == BaseReg) {
+        Group.push_back(MI);
+        *I = nullptr;
+        continue;
+      }
+    }
+
+    // Assume calls are aliased to everything.
+    if (MI->isCall() || MI->hasUnmodeledSideEffects())
+      return;
+
+    if (MI->mayLoad() || MI->mayStore()) {
+      if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI))
+        return;
+      Other.push_back(MI);
+    }
+  } // for
+}
+
+// Check if store instructions S1 and S2 are adjacent.  More precisely,
+// S2 has to access memory immediately following that accessed by S1.
+bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1,
+      const MachineInstr *S2) {
+  if (!handledStoreType(S1) || !handledStoreType(S2))
+    return false;
+
+  const MachineMemOperand &S1MO = getStoreTarget(S1);
+
+  // Currently only handling immediate stores.
+  int Off1 = S1->getOperand(1).getImm();
+  int Off2 = S2->getOperand(1).getImm();
+
+  return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2)
+                     : int(Off1+S1MO.getSize()) == Off2;
+}
+
+/// Given a sequence of adjacent stores, and a maximum size of a single wide
+/// store, pick a group of stores that  can be replaced by a single store
+/// of size not exceeding MaxSize.  The selected sequence will be recorded
+/// in OG ("old group" of instructions).
+/// OG should be empty on entry, and should be left empty if the function
+/// fails.
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
+      InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize,
+      unsigned MaxSize) {
+  assert(Begin != End && "No instructions to analyze");
+  assert(OG.empty() && "Old group not empty on entry");
+
+  if (std::distance(Begin, End) <= 1)
+    return false;
+
+  MachineInstr *FirstMI = *Begin;
+  assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
+  const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
+  unsigned Alignment = FirstMMO.getAlignment();
+  unsigned SizeAccum = FirstMMO.getSize();
+  unsigned FirstOffset = getStoreOffset(FirstMI);
+
+  // The initial value of SizeAccum should always be a power of 2.
+  assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2");
+
+  // If the size of the first store equals to or exceeds the limit, do nothing.
+  if (SizeAccum >= MaxSize)
+    return false;
+
+  // If the size of the first store is greater than or equal to the address
+  // stored to, then the store cannot be made any wider.
+  if (SizeAccum >= Alignment)
+    return false;
+
+  // The offset of a store will put restrictions on how wide the store can be.
+  // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0.
+  // If the first store already exhausts the offset limits, quit.  Test this
+  // by checking if the next wider size would exceed the limit.
+  if ((2*SizeAccum-1) & FirstOffset)
+    return false;
+
+  OG.push_back(FirstMI);
+  MachineInstr *S1 = FirstMI, *S2 = *(Begin+1);
+  InstrGroup::iterator I = Begin+1;
+
+  // Pow2Num will be the largest number of elements in OG such that the sum
+  // of sizes of stores 0...Pow2Num-1 will be a power of 2.
+  unsigned Pow2Num = 1;
+  unsigned Pow2Size = SizeAccum;
+
+  // Be greedy: keep accumulating stores as long as they are to adjacent
+  // memory locations, and as long as the total number of bytes stored
+  // does not exceed the limit (MaxSize).
+  // Keep track of when the total size covered is a power of 2, since
+  // this is a size a single store can cover.
+  while (I != End) {
+    S2 = *I;
+    // Stores are sorted, so if S1 and S2 are not adjacent, there won't be
+    // any other store to fill the "hole".
+    if (!storesAreAdjacent(S1, S2))
+      break;
+
+    unsigned S2Size = getStoreTarget(S2).getSize();
+    if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
+      break;
+
+    OG.push_back(S2);
+    SizeAccum += S2Size;
+    if (isPowerOf2_32(SizeAccum)) {
+      Pow2Num = OG.size();
+      Pow2Size = SizeAccum;
+    }
+    if ((2*Pow2Size-1) & FirstOffset)
+      break;
+
+    S1 = S2;
+    ++I;
+  }
+
+  // The stores don't add up to anything that can be widened.  Clean up.
+  if (Pow2Num <= 1) {
+    OG.clear();
+    return false;
+  }
+
+  // Only leave the stored being widened.
+  OG.resize(Pow2Num);
+  TotalSize = Pow2Size;
+  return true;
+}
+
+/// Given an "old group" OG of stores, create a "new group" NG of instructions
+/// to replace them.  Ideally, NG would only have a single instruction in it,
+/// but that may only be possible for store-immediate.
+bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
+      unsigned TotalSize) {
+  // XXX Current limitations:
+  // - only expect stores of immediate values in OG,
+  // - only handle a TotalSize of up to 4.
+
+  if (TotalSize > 4)
+    return false;
+
+  unsigned Acc = 0;  // Value accumulator.
+  unsigned Shift = 0;
+
+  for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) {
+    MachineInstr *MI = *I;
+    const MachineMemOperand &MMO = getStoreTarget(MI);
+    MachineOperand &SO = MI->getOperand(2);  // Source.
+    assert(SO.isImm() && "Expecting an immediate operand");
+
+    unsigned NBits = MMO.getSize()*8;
+    unsigned Mask = (0xFFFFFFFFU >> (32-NBits));
+    unsigned Val = (SO.getImm() & Mask) << Shift;
+    Acc |= Val;
+    Shift += NBits;
+  }
+
+  MachineInstr *FirstSt = OG.front();
+  DebugLoc DL = OG.back()->getDebugLoc();
+  const MachineMemOperand &OldM = getStoreTarget(FirstSt);
+  MachineMemOperand *NewM =
+    MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
+                             TotalSize, OldM.getAlignment(),
+                             OldM.getAAInfo());
+
+  if (Acc < 0x10000) {
+    // Create mem[hw] = #Acc
+    unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io :
+                    (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0;
+    assert(WOpc && "Unexpected size");
+
+    int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc);
+    const MCInstrDesc &StD = TII->get(WOpc);
+    MachineOperand &MR = FirstSt->getOperand(0);
+    int64_t Off = FirstSt->getOperand(1).getImm();
+    MachineInstr *StI = BuildMI(*MF, DL, StD)
+                          .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+                          .addImm(Off)
+                          .addImm(Val);
+    StI->addMemOperand(*MF, NewM);
+    NG.push_back(StI);
+  } else {
+    // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg
+    const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi);
+    const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF);
+    unsigned VReg = MF->getRegInfo().createVirtualRegister(RC);
+    MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg)
+                           .addImm(int(Acc));
+    NG.push_back(TfrI);
+
+    unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io :
+                    (TotalSize == 4) ? Hexagon::S2_storeri_io : 0;
+    assert(WOpc && "Unexpected size");
+
+    const MCInstrDesc &StD = TII->get(WOpc);
+    MachineOperand &MR = FirstSt->getOperand(0);
+    int64_t Off = FirstSt->getOperand(1).getImm();
+    MachineInstr *StI = BuildMI(*MF, DL, StD)
+                          .addReg(MR.getReg(), getKillRegState(MR.isKill()))
+                          .addImm(Off)
+                          .addReg(VReg, RegState::Kill);
+    StI->addMemOperand(*MF, NewM);
+    NG.push_back(StI);
+  }
+
+  return true;
+}
+
+// Replace instructions from the old group OG with instructions from the
+// new group NG.  Conceptually, remove all instructions in OG, and then
+// insert all instructions in NG, starting at where the first instruction
+// from OG was (in the order in which they appeared in the basic block).
+// (The ordering in OG does not have to match the order in the basic block.)
+bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) {
+  DEBUG({
+    dbgs() << "Replacing:\n";
+    for (auto I : OG)
+      dbgs() << "  " << *I;
+    dbgs() << "with\n";
+    for (auto I : NG)
+      dbgs() << "  " << *I;
+  });
+
+  MachineBasicBlock *MBB = OG.back()->getParent();
+  MachineBasicBlock::iterator InsertAt = MBB->end();
+
+  // Need to establish the insertion point.  The best one is right before
+  // the first store in the OG, but in the order in which the stores occur
+  // in the program list.  Since the ordering in OG does not correspond
+  // to the order in the program list, we need to do some work to find
+  // the insertion point.
+
+  // Create a set of all instructions in OG (for quick lookup).
+  SmallPtrSet<MachineInstr*, 4> InstrSet;
+  for (auto I : OG)
+    InstrSet.insert(I);
+
+  // Traverse the block, until we hit an instruction from OG.
+  for (auto &I : *MBB) {
+    if (InstrSet.count(&I)) {
+      InsertAt = I;
+      break;
+    }
+  }
+
+  assert((InsertAt != MBB->end()) && "Cannot locate any store from the group");
+
+  bool AtBBStart = false;
+
+  // InsertAt points at the first instruction that will be removed.  We need
+  // to move it out of the way, so it remains valid after removing all the
+  // old stores, and so we are able to recover it back to the proper insertion
+  // position.
+  if (InsertAt != MBB->begin())
+    --InsertAt;
+  else
+    AtBBStart = true;
+
+  for (auto I : OG)
+    I->eraseFromParent();
+
+  if (!AtBBStart)
+    ++InsertAt;
+  else
+    InsertAt = MBB->begin();
+
+  for (auto I : NG)
+    MBB->insert(InsertAt, I);
+
+  return true;
+}
+
+// Break up the group into smaller groups, each of which can be replaced by
+// a single wide store.  Widen each such smaller group and replace the old
+// instructions with the widened ones.
+bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) {
+  bool Changed = false;
+  InstrGroup::iterator I = Group.begin(), E = Group.end();
+  InstrGroup OG, NG;   // Old and new groups.
+  unsigned CollectedSize;
+
+  while (I != E) {
+    OG.clear();
+    NG.clear();
+
+    bool Succ = selectStores(I++, E, OG, CollectedSize, MaxWideSize) &&
+                createWideStores(OG, NG, CollectedSize)              &&
+                replaceStores(OG, NG);
+    if (!Succ)
+      continue;
+
+    assert(OG.size() > 1 && "Created invalid group");
+    assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements");
+    I += OG.size()-1;
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// Process a single basic block: create the store groups, and replace them
+// with the widened stores, if possible.  Processing of each basic block
+// is independent from processing of any other basic block.  This transfor-
+// mation could be stopped after having processed any basic block without
+// any ill effects (other than not having performed widening in the unpro-
+// cessed blocks).  Also, the basic blocks can be processed in any order.
+bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) {
+  InstrGroupList SGs;
+  bool Changed = false;
+
+  createStoreGroups(MBB, SGs);
+
+  auto Less = [] (const MachineInstr *A, const MachineInstr *B) -> bool {
+    return getStoreOffset(A) < getStoreOffset(B);
+  };
+  for (auto &G : SGs) {
+    assert(G.size() > 1 && "Store group with fewer than 2 elements");
+    std::sort(G.begin(), G.end(), Less);
+
+    Changed |= processStoreGroup(G);
+  }
+
+  return Changed;
+}
+
+bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) {
+  if (skipFunction(*MFn.getFunction()))
+    return false;
+
+  MF = &MFn;
+  auto &ST = MFn.getSubtarget<HexagonSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+  MRI = &MFn.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool Changed = false;
+
+  for (auto &B : MFn)
+    Changed |= processBasicBlock(B);
+
+  return Changed;
+}
+
+FunctionPass *llvm::createHexagonStoreWidening() {
+  return new HexagonStoreWidening();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
new file mode 100644
index 000000000000..8c23a2465dd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -0,0 +1,393 @@
+//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+static cl::opt<bool> EnableMemOps("enable-hexagon-memops",
+  cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
+  cl::desc("Generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool> DisableMemOps("disable-hexagon-memops",
+  cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
+  cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Generate non-chopped conversion from fp to int."));
+
+static cl::opt<bool> EnableBSBSched("enable-bsb-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enable Hexagon Double Vector eXtensions"));
+
+static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enable Hexagon Vector eXtensions"));
+
+static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false));
+
+static cl::opt<bool> EnableDotCurSched("enable-cur-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true),
+  cl::desc("Enable the scheduler to generate .cur"));
+
+static cl::opt<bool> EnableVecFrwdSched("enable-evec-frwd-sched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true));
+
+static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable Hexagon MI Scheduling"));
+
+static cl::opt<bool> EnableSubregLiveness("hexagon-subreg-liveness",
+  cl::Hidden, cl::ZeroOrMore, cl::init(true),
+  cl::desc("Enable subregister liveness tracking for Hexagon"));
+
+static cl::opt<bool> OverrideLongCalls("hexagon-long-calls",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("If present, forces/disables the use of long calls"));
+
+void HexagonSubtarget::initializeEnvironment() {
+  UseMemOps = false;
+  ModeIEEERndNear = false;
+  UseBSBScheduling = false;
+}
+
+HexagonSubtarget &
+HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  CPUString = Hexagon_MC::selectHexagonCPU(getTargetTriple(), CPU);
+
+  static std::map<StringRef, HexagonArchEnum> CpuTable {
+    { "hexagonv4", V4 },
+    { "hexagonv5", V5 },
+    { "hexagonv55", V55 },
+    { "hexagonv60", V60 },
+  };
+
+  auto foundIt = CpuTable.find(CPUString);
+  if (foundIt != CpuTable.end())
+    HexagonArchVersion = foundIt->second;
+  else
+    llvm_unreachable("Unrecognized Hexagon processor version");
+
+  UseHVXOps = false;
+  UseHVXDblOps = false;
+  UseLongCalls = false;
+  ParseSubtargetFeatures(CPUString, FS);
+
+  if (EnableHexagonHVX.getPosition())
+    UseHVXOps = EnableHexagonHVX;
+  if (EnableHexagonHVXDouble.getPosition())
+    UseHVXDblOps = EnableHexagonHVXDouble;
+  if (OverrideLongCalls.getPosition())
+    UseLongCalls = OverrideLongCalls;
+
+  return *this;
+}
+
+HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
+                                   StringRef FS, const TargetMachine &TM)
+    : HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      FrameLowering() {
+
+  initializeEnvironment();
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // UseMemOps on by default unless disabled explicitly
+  if (DisableMemOps)
+    UseMemOps = false;
+  else if (EnableMemOps)
+    UseMemOps = true;
+  else
+    UseMemOps = false;
+
+  if (EnableIEEERndNear)
+    ModeIEEERndNear = true;
+  else
+    ModeIEEERndNear = false;
+
+  UseBSBScheduling = hasV60TOps() && EnableBSBSched;
+}
+
+
+void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
+  for (auto &SU : DAG->SUnits) {
+    if (!SU.isInstr())
+      continue;
+    SmallVector<SDep, 4> Erase;
+    for (auto &D : SU.Preds)
+      if (D.getKind() == SDep::Output && D.getReg() == Hexagon::USR_OVF)
+        Erase.push_back(D);
+    for (auto &E : Erase)
+      SU.removePred(E);
+  }
+
+  for (auto &SU : DAG->SUnits) {
+    // Update the latency of chain edges between v60 vector load or store
+    // instructions to be 1. These instructions cannot be scheduled in the
+    // same packet.
+    MachineInstr &MI1 = *SU.getInstr();
+    auto *QII = static_cast<const HexagonInstrInfo*>(DAG->TII);
+    bool IsStoreMI1 = MI1.mayStore();
+    bool IsLoadMI1 = MI1.mayLoad();
+    if (!QII->isV60VectorInstruction(MI1) || !(IsStoreMI1 || IsLoadMI1))
+      continue;
+    for (auto &SI : SU.Succs) {
+      if (SI.getKind() != SDep::Order || SI.getLatency() != 0)
+        continue;
+      MachineInstr &MI2 = *SI.getSUnit()->getInstr();
+      if (!QII->isV60VectorInstruction(MI2))
+        continue;
+      if ((IsStoreMI1 && MI2.mayStore()) || (IsLoadMI1 && MI2.mayLoad())) {
+        SI.setLatency(1);
+        SU.setHeightDirty();
+        // Change the dependence in the opposite direction too.
+        for (auto &PI : SI.getSUnit()->Preds) {
+          if (PI.getSUnit() != &SU || PI.getKind() != SDep::Order)
+            continue;
+          PI.setLatency(1);
+          SI.getSUnit()->setDepthDirty();
+        }
+      }
+    }
+  }
+}
+
+
+void HexagonSubtarget::getPostRAMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+void HexagonSubtarget::getSMSMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+  Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+
+// Pin the vtable to this file.
+void HexagonSubtarget::anchor() {}
+
+bool HexagonSubtarget::enableMachineScheduler() const {
+  if (DisableHexagonMISched.getNumOccurrences())
+    return !DisableHexagonMISched;
+  return true;
+}
+
+bool HexagonSubtarget::enableSubRegLiveness() const {
+  return EnableSubregLiveness;
+}
+
+// This helper function is responsible for increasing the latency only.
+void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
+      MachineInstr &DstInst, SDep &Dep) const {
+  if (!hasV60TOps())
+    return;
+
+  auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
+
+  if (EnableVecFrwdSched && QII.addLatencyToSchedule(SrcInst, DstInst)) {
+    // Vec frwd scheduling.
+    Dep.setLatency(Dep.getLatency() + 1);
+  } else if (useBSBScheduling() &&
+             QII.isLateInstrFeedsEarlyInstr(SrcInst, DstInst)) {
+    // BSB scheduling.
+    Dep.setLatency(Dep.getLatency() + 1);
+  } else if (EnableTCLatencySched) {
+    // TClass latency scheduling.
+    // Check if SrcInst produces in 2C an operand of DstInst taken in stage 2B.
+    if (QII.isTC1(SrcInst) || QII.isTC2(SrcInst))
+      if (!QII.isTC1(DstInst) && !QII.isTC2(DstInst))
+        Dep.setLatency(Dep.getLatency() + 1);
+  }
+}
+
+/// If the SUnit has a zero latency edge, return the other SUnit.
+static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
+  for (auto &I : Deps)
+    if (I.isAssignedRegDep() && I.getLatency() == 0 &&
+        !I.getSUnit()->getInstr()->isPseudo())
+      return I.getSUnit();
+  return nullptr;
+}
+
+/// Change the latency between the two SUnits.
+void HexagonSubtarget::changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps,
+      SUnit *Dst, unsigned Lat) const {
+  MachineInstr &SrcI = *Src->getInstr();
+  for (auto &I : Deps) {
+    if (I.getSUnit() != Dst)
+      continue;
+    I.setLatency(Lat);
+    SUnit *UpdateDst = I.getSUnit();
+    updateLatency(SrcI, *UpdateDst->getInstr(), I);
+    // Update the latency of opposite edge too.
+    for (auto &PI : UpdateDst->Preds) {
+      if (PI.getSUnit() != Src || !PI.isAssignedRegDep())
+        continue;
+      PI.setLatency(Lat);
+      updateLatency(SrcI, *UpdateDst->getInstr(), PI);
+    }
+  }
+}
+
+// Return true if these are the best two instructions to schedule
+// together with a zero latency. Only one dependence should have a zero
+// latency. If there are multiple choices, choose the best, and change
+// ther others, if needed.
+bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
+      const HexagonInstrInfo *TII) const {
+  MachineInstr &SrcInst = *Src->getInstr();
+  MachineInstr &DstInst = *Dst->getInstr();
+
+  // Ignore Boundary SU nodes as these have null instructions.
+  if (Dst->isBoundaryNode())
+    return false;
+
+  if (SrcInst.isPHI() || DstInst.isPHI())
+    return false;
+
+  // Check if the Dst instruction is the best candidate first.
+  SUnit *Best = nullptr;
+  SUnit *DstBest = nullptr;
+  SUnit *SrcBest = getZeroLatency(Dst, Dst->Preds);
+  if (SrcBest == nullptr || Src->NodeNum >= SrcBest->NodeNum) {
+    // Check that Src doesn't have a better candidate.
+    DstBest = getZeroLatency(Src, Src->Succs);
+    if (DstBest == nullptr || Dst->NodeNum <= DstBest->NodeNum)
+      Best = Dst;
+  }
+  if (Best != Dst)
+    return false;
+
+  // The caller frequents adds the same dependence twice. If so, then
+  // return true for this case too.
+  if (Src == SrcBest && Dst == DstBest)
+    return true;
+
+  // Reassign the latency for the previous bests, which requires setting
+  // the dependence edge in both directions.
+  if (SrcBest != nullptr)
+    changeLatency(SrcBest, SrcBest->Succs, Dst, 1);
+  if (DstBest != nullptr)
+    changeLatency(Src, Src->Succs, DstBest, 1);
+  // If there is an edge from SrcBest to DstBst, then try to change that
+  // to 0 now.
+  if (SrcBest && DstBest)
+    changeLatency(SrcBest, SrcBest->Succs, DstBest, 0);
+
+  return true;
+}
+
+// Update the latency of a Phi when the Phi bridges two instructions that
+// require a multi-cycle latency.
+void HexagonSubtarget::changePhiLatency(MachineInstr &SrcInst, SUnit *Dst,
+      SDep &Dep) const {
+  if (!SrcInst.isPHI() || Dst->NumPreds == 0 || Dep.getLatency() != 0)
+    return;
+
+  for (const SDep &PI : Dst->Preds) {
+    if (PI.getLatency() != 0)
+      continue;
+    Dep.setLatency(2);
+    break;
+  }
+}
+
+/// \brief Perform target specific adjustments to the latency of a schedule
+/// dependency.
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+                                             SDep &Dep) const {
+  MachineInstr *SrcInst = Src->getInstr();
+  MachineInstr *DstInst = Dst->getInstr();
+  if (!Src->isInstr() || !Dst->isInstr())
+    return;
+
+  const HexagonInstrInfo *QII = static_cast<const HexagonInstrInfo *>(getInstrInfo());
+
+  // Instructions with .new operands have zero latency.
+  if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
+      isBestZeroLatency(Src, Dst, QII)) {
+    Dep.setLatency(0);
+    return;
+  }
+
+  if (!hasV60TOps())
+    return;
+
+  // Don't adjust the latency of post-increment part of the instruction.
+  if (QII->isPostIncrement(*SrcInst) && Dep.isAssignedRegDep()) {
+    if (SrcInst->mayStore())
+      return;
+    if (Dep.getReg() != SrcInst->getOperand(0).getReg())
+      return;
+  } else if (QII->isPostIncrement(*DstInst) && Dep.getKind() == SDep::Anti) {
+    if (DstInst->mayStore())
+      return;
+    if (Dep.getReg() != DstInst->getOperand(0).getReg())
+      return;
+  } else if (QII->isPostIncrement(*DstInst) && DstInst->mayStore() &&
+             Dep.isAssignedRegDep()) {
+    MachineOperand &Op = DstInst->getOperand(DstInst->getNumOperands() - 1);
+    if (Op.isReg() && Dep.getReg() != Op.getReg())
+      return;
+  }
+
+  // Check if we need to change any the latency values when Phis are added.
+  if (useBSBScheduling() && SrcInst->isPHI()) {
+    changePhiLatency(*SrcInst, Dst, Dep);
+    return;
+  }
+
+  // If it's a REG_SEQUENCE, use its destination instruction to determine
+  // the correct latency.
+  if (DstInst->isRegSequence() && Dst->NumSuccs == 1)
+    DstInst = Dst->Succs[0].getSUnit()->getInstr();
+
+  // Try to schedule uses near definitions to generate .cur.
+  if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
+      isBestZeroLatency(Src, Dst, QII)) {
+    Dep.setLatency(0);
+    return;
+  }
+
+  updateLatency(*SrcInst, *DstInst, Dep);
+}
+
+unsigned HexagonSubtarget::getL1CacheLineSize() const {
+  return 32;
+}
+
+unsigned HexagonSubtarget::getL1PrefetchDistance() const {
+  return 32;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
new file mode 100644
index 000000000000..f2b9cdaad1ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -0,0 +1,157 @@
+//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
+
+#include "HexagonFrameLowering.h"
+#include "HexagonISelLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "HexagonGenSubtargetInfo.inc"
+
+#define Hexagon_SMALL_DATA_THRESHOLD 8
+#define Hexagon_SLOTS 4
+
+namespace llvm {
+
+class HexagonSubtarget : public HexagonGenSubtargetInfo {
+  virtual void anchor();
+
+  bool UseMemOps, UseHVXOps, UseHVXDblOps;
+  bool UseLongCalls;
+  bool ModeIEEERndNear;
+
+public:
+  enum HexagonArchEnum {
+    V4, V5, V55, V60
+  };
+
+  HexagonArchEnum HexagonArchVersion;
+  /// True if the target should use Back-Skip-Back scheduling. This is the
+  /// default for V60.
+  bool UseBSBScheduling;
+
+  class HexagonDAGMutation : public ScheduleDAGMutation {
+  public:
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
+private:
+  std::string CPUString;
+  HexagonInstrInfo InstrInfo;
+  HexagonTargetLowering TLInfo;
+  HexagonSelectionDAGInfo TSInfo;
+  HexagonFrameLowering FrameLowering;
+  InstrItineraryData InstrItins;
+  void initializeEnvironment();
+
+public:
+  HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+                   const TargetMachine &TM);
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
+  /// selection.
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+  const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const HexagonRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const HexagonTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const HexagonFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const HexagonSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool useMemOps() const { return UseMemOps; }
+  bool hasV5TOps() const { return getHexagonArchVersion() >= V5; }
+  bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; }
+  bool hasV55TOps() const { return getHexagonArchVersion() >= V55; }
+  bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
+  bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
+  bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
+  bool modeIEEERndNear() const { return ModeIEEERndNear; }
+  bool useHVXOps() const { return UseHVXOps; }
+  bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; }
+  bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; }
+  bool useLongCalls() const { return UseLongCalls; }
+
+  bool useBSBScheduling() const { return UseBSBScheduling; }
+  bool enableMachineScheduler() const override;
+  // Always use the TargetLowering default scheduler.
+  // FIXME: This will use the vliw scheduler which is probably just hurting
+  // compiler time and will be removed eventually anyway.
+  bool enableMachineSchedDefaultSched() const override { return false; }
+
+  AntiDepBreakMode getAntiDepBreakMode() const override { return ANTIDEP_ALL; }
+  bool enablePostRAScheduler() const override { return true; }
+
+  bool enableSubRegLiveness() const override;
+
+  const std::string &getCPUString () const { return CPUString; }
+
+  // Threshold for small data section
+  unsigned getSmallDataThreshold() const {
+    return Hexagon_SMALL_DATA_THRESHOLD;
+  }
+  const HexagonArchEnum &getHexagonArchVersion() const {
+    return HexagonArchVersion;
+  }
+
+  void getPostRAMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+      const override;
+
+  void getSMSMutations(
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
+      const override;
+
+  /// \brief Perform target specific adjustments to the latency of a schedule
+  /// dependency.
+  void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
+
+  unsigned getL1CacheLineSize() const;
+  unsigned getL1PrefetchDistance() const;
+
+private:
+  // Helper function responsible for increasing the latency only.
+  void updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, SDep &Dep)
+      const;
+  void changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps, SUnit *Dst,
+      unsigned Lat) const;
+  bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII)
+      const;
+  void changePhiLatency(MachineInstr &SrcInst, SUnit *Dst, SDep &Dep) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
new file mode 100644
index 000000000000..629a98749ee9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
@@ -0,0 +1,134 @@
+//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                     Cache manipulation instructions.
+//===----------------------------------------------------------------------===//
+let mayStore = 1 in
+class ST_MISC_CACHEOP<dag outs, dag ins,
+              string asmstr, list<dag> pattern = [],
+              bits<3> amode, bits<3> type, bits<1> un>
+  : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
+
+    bits<5> Rs;
+    bits<5> Rt;
+    bits<5> Rd;
+    let Inst{31-28} = 0b1010;
+    let Inst{27-25} = amode;
+    let Inst{24-22} = type;
+    let Inst{21}    = un;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Rt;
+    let Inst{4-0}   = Rd;
+}
+
+let mayStore = 1 in
+class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
+              string asmstr, list<dag> pattern = [],
+              bits<3> amode, bits<3> type, bits<1> un>
+  : SYSInst<outs, ins, asmstr, pattern, ""> {
+
+    bits<5> Rs;
+    bits<5> Rt;
+    bits<5> Rd;
+    let Inst{31-28} = 0b1010;
+    let Inst{27-25} = amode;
+    let Inst{24-22} = type;
+    let Inst{21}    = un;
+    let Inst{20-16} = Rs;
+    let Inst{12-8}  = Rt;
+    let Inst{4-0}   = Rd;
+}
+
+
+let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
+def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
+    "syncht" , [], 0b100, 0b001, 0b0>;
+}
+
+let Rt = 0, Rd = 0 in {
+let isSoloAin1 = 1 in {
+  def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+      "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
+  def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+      "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
+  def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+      "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
+  }
+}
+
+let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
+  def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
+      "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
+  def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
+      "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
+}
+
+let hasSideEffects = 0, isSolo = 1 in
+class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
+  : JRInst <
+  (outs), (ins IntRegs:$Rs),
+  #mnemonic#"($Rs)" > {
+    bits<5> Rs;
+
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0110110;
+    let Inst{20-16} = Rs;
+    let Inst{13-12} = 0b00;
+    let Inst{11} = MajOp;
+  }
+// Instruction cache invalidate
+def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
+
+// Zero an aligned 32-byte cacheline.
+let isSoloAin1 = 1 in
+def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
+  "dczeroa($Rs)"> {
+    bits<5> Rs;
+    let IClass = 0b1010;
+    let Inst{27-21} = 0b0000110;
+    let Inst{13} = 0b0;
+    let Inst{20-16} = Rs;
+  }
+
+// Memory synchronization.
+let hasSideEffects = 0, isSolo = 1 in
+def Y2_isync: JRInst <(outs), (ins),
+  "isync"> {
+    let IClass = 0b0101;
+    let Inst{27-16} = 0b011111000000;
+    let Inst{13} = 0b0;
+    let Inst{9-0} = 0b0000000010;
+  }
+
+//===----------------------------------------------------------------------===//
+//                     System/User instructions.
+//===----------------------------------------------------------------------===//
+// traps and pause
+let hasSideEffects = 0, isSolo = 1 in
+class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
+  : JRInst
+  <(outs), (ins u8_0Imm:$u8),
+   #mnemonic#"(#$u8)"> {
+    bits<8> u8;
+
+    let IClass = 0b0101;
+    let Inst{27-24} = 0b0100;
+    let Inst{23-22} = MajOp;
+    let Inst{12-8} = u8{7-3};
+    let Inst{4-2} = u8{2-0};
+  }
+def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
+def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
+def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
new file mode 100644
index 000000000000..132d12a66d46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -0,0 +1,345 @@
+//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Hexagon target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
+#include "HexagonTargetObjectFile.h"
+#include "HexagonTargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore,
+  cl::init(true), cl::desc("Enable RDF-based optimizations"));
+
+static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
+  cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+
+static cl::opt<bool> DisableAModeOpt("disable-hexagon-amodeopt",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable Hexagon Addressing Mode Optimization"));
+
+static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable Hexagon CFG Optimization"));
+
+static cl::opt<bool> DisableHCP("disable-hcp", cl::init(false), cl::Hidden,
+  cl::ZeroOrMore, cl::desc("Disable Hexagon constant propagation"));
+
+static cl::opt<bool> DisableStoreWidening("disable-store-widen",
+  cl::Hidden, cl::init(false), cl::desc("Disable store widening"));
+
+static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
+  cl::init(true), cl::Hidden, cl::ZeroOrMore,
+  cl::desc("Early expansion of MUX"));
+
+static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
+  cl::ZeroOrMore, cl::desc("Enable early if-conversion"));
+
+static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
+  cl::Hidden, cl::desc("Generate \"insert\" instructions"));
+
+static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Enable commoning of GEP instructions"));
+
+static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
+  cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+
+static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden,
+  cl::desc("Enable converting conditional transfers into MUX instructions"));
+
+static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
+  cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
+  "predicate instructions"));
+
+static cl::opt<bool> EnableLoopPrefetch("hexagon-loop-prefetch",
+  cl::init(false), cl::Hidden, cl::ZeroOrMore,
+  cl::desc("Enable loop data prefetch on Hexagon"));
+
+static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
+  cl::desc("Disable splitting double registers"));
+
+static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
+  cl::Hidden, cl::desc("Bit simplification"));
+
+static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
+  cl::Hidden, cl::desc("Loop rescheduling"));
+
+static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false),
+  cl::Hidden, cl::desc("Disable backend optimizations"));
+
+static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enable Hexagon Vector print instr pass"));
+
+/// HexagonTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library.  In particular, it seems that it is not possible to get
+/// things to work on Win32 without this.  Though it is unused, do not
+/// remove it.
+extern "C" int HexagonTargetMachineModule;
+int HexagonTargetMachineModule = 0;
+
+extern "C" void LLVMInitializeHexagonTarget() {
+  // Register the target.
+  RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
+}
+
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+  return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+                    createVLIWMachineSched);
+
+namespace llvm {
+  extern char &HexagonExpandCondsetsID;
+  void initializeHexagonExpandCondsetsPass(PassRegistry&);
+
+  FunctionPass *createHexagonBitSimplify();
+  FunctionPass *createHexagonBranchRelaxation();
+  FunctionPass *createHexagonCallFrameInformation();
+  FunctionPass *createHexagonCFGOptimizer();
+  FunctionPass *createHexagonCommonGEP();
+  FunctionPass *createHexagonConstPropagationPass();
+  FunctionPass *createHexagonCopyToCombine();
+  FunctionPass *createHexagonEarlyIfConversion();
+  FunctionPass *createHexagonFixupHwLoops();
+  FunctionPass *createHexagonGenExtract();
+  FunctionPass *createHexagonGenInsert();
+  FunctionPass *createHexagonGenMux();
+  FunctionPass *createHexagonGenPredicate();
+  FunctionPass *createHexagonHardwareLoops();
+  FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel);
+  FunctionPass *createHexagonLoopRescheduling();
+  FunctionPass *createHexagonNewValueJump();
+  FunctionPass *createHexagonOptimizeSZextends();
+  FunctionPass *createHexagonOptAddrMode();
+  FunctionPass *createHexagonPacketizer();
+  FunctionPass *createHexagonPeephole();
+  FunctionPass *createHexagonRDFOpt();
+  FunctionPass *createHexagonSplitConst32AndConst64();
+  FunctionPass *createHexagonSplitDoubleRegs();
+  FunctionPass *createHexagonStoreWidening();
+  FunctionPass *createHexagonVectorPrint();
+} // end namespace llvm;
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    // Specify the vector alignment explicitly. For v512x1, the calculated
+    // alignment would be 512*alignment(i1), which is 512 bytes, instead of
+    // the required minimum of 64 bytes.
+    : LLVMTargetMachine(
+          T, "e-m:e-p:32:32:32-a:0-n16:32-"
+             "i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
+             "v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048",
+          TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM,
+          (HexagonNoOpt ? CodeGenOpt::None : OL)),
+      TLOF(make_unique<HexagonTargetObjectFile>()) {
+  initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
+  initAsmInfo();
+}
+
+const HexagonSubtarget *
+HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this);
+  }
+  return I.get();
+}
+
+TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(HexagonTTIImpl(this, F));
+  });
+}
+
+
+HexagonTargetMachine::~HexagonTargetMachine() {}
+
+namespace {
+/// Hexagon Code Generator Pass Configuration Options.
+class HexagonPassConfig : public TargetPassConfig {
+public:
+  HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  HexagonTargetMachine &getHexagonTargetMachine() const {
+    return getTM<HexagonTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    return createVLIWMachineSched(C);
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new HexagonPassConfig(this, PM);
+}
+
+void HexagonPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+  addPass(createAtomicExpandPass(TM));
+  if (!NoOpt) {
+    if (EnableLoopPrefetch)
+      addPass(createLoopDataPrefetchPass());
+    if (EnableCommGEP)
+      addPass(createHexagonCommonGEP());
+    // Replace certain combinations of shifts and ands with extracts.
+    if (EnableGenExtract)
+      addPass(createHexagonGenExtract());
+  }
+}
+
+bool HexagonPassConfig::addInstSelector() {
+  HexagonTargetMachine &TM = getHexagonTargetMachine();
+  bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+  if (!NoOpt)
+    addPass(createHexagonOptimizeSZextends());
+
+  addPass(createHexagonISelDag(TM, getOptLevel()));
+
+  if (!NoOpt) {
+    // Create logical operations on predicate registers.
+    if (EnableGenPred)
+      addPass(createHexagonGenPredicate(), false);
+    // Rotate loops to expose bit-simplification opportunities.
+    if (EnableLoopResched)
+      addPass(createHexagonLoopRescheduling(), false);
+    // Split double registers.
+    if (!DisableHSDR)
+      addPass(createHexagonSplitDoubleRegs());
+    // Bit simplification.
+    if (EnableBitSimplify)
+      addPass(createHexagonBitSimplify(), false);
+    addPass(createHexagonPeephole());
+    printAndVerify("After hexagon peephole pass");
+    // Constant propagation.
+    if (!DisableHCP) {
+      addPass(createHexagonConstPropagationPass(), false);
+      addPass(&UnreachableMachineBlockElimID, false);
+    }
+    if (EnableGenInsert)
+      addPass(createHexagonGenInsert(), false);
+    if (EnableEarlyIf)
+      addPass(createHexagonEarlyIfConversion(), false);
+  }
+
+  return false;
+}
+
+void HexagonPassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (EnableExpandCondsets)
+      insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+    if (!DisableStoreWidening)
+      addPass(createHexagonStoreWidening(), false);
+    if (!DisableHardwareLoops)
+      addPass(createHexagonHardwareLoops(), false);
+  }
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
+}
+
+void HexagonPassConfig::addPostRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (EnableRDFOpt)
+      addPass(createHexagonRDFOpt());
+    if (!DisableHexagonCFGOpt)
+      addPass(createHexagonCFGOptimizer(), false);
+    if (!DisableAModeOpt)
+      addPass(createHexagonOptAddrMode(), false);
+  }
+}
+
+void HexagonPassConfig::addPreSched2() {
+  addPass(createHexagonCopyToCombine(), false);
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&IfConverterID, false);
+  addPass(createHexagonSplitConst32AndConst64());
+}
+
+void HexagonPassConfig::addPreEmitPass() {
+  bool NoOpt = (getOptLevel() == CodeGenOpt::None);
+
+  if (!NoOpt)
+    addPass(createHexagonNewValueJump(), false);
+
+  addPass(createHexagonBranchRelaxation(), false);
+
+  // Create Packets.
+  if (!NoOpt) {
+    if (!DisableHardwareLoops)
+      addPass(createHexagonFixupHwLoops(), false);
+    // Generate MUX from pairs of conditional transfers.
+    if (EnableGenMux)
+      addPass(createHexagonGenMux(), false);
+
+    addPass(createHexagonPacketizer(), false);
+  }
+  if (EnableVectorPrint)
+    addPass(createHexagonVectorPrint(), false);
+
+  // Add CFI instructions if necessary.
+  addPass(createHexagonCallFrameInformation(), false);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
new file mode 100644
index 000000000000..70835c0d4ac5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -0,0 +1,50 @@
+//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class Module;
+
+class HexagonTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;
+
+public:
+  HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+  ~HexagonTargetMachine() override;
+  const HexagonSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  static unsigned getModuleMatchQuality(const Module &M);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  HexagonTargetObjectFile *getObjFileLowering() const override {
+    return static_cast<HexagonTargetObjectFile*>(TLOF.get());
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
new file mode 100644
index 000000000000..e902f600e881
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -0,0 +1,382 @@
+//===-- HexagonTargetObjectFile.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-sdata"
+
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> SmallDataThreshold("hexagon-small-data-threshold",
+  cl::init(8), cl::Hidden,
+  cl::desc("The maximum size of an object in the sdata section"));
+
+static cl::opt<bool> NoSmallDataSorting("mno-sort-sda", cl::init(false),
+  cl::Hidden, cl::desc("Disable small data sections sorting"));
+
+static cl::opt<bool> StaticsInSData("hexagon-statics-in-small-data",
+  cl::init(false), cl::Hidden, cl::ZeroOrMore,
+  cl::desc("Allow static variables in .sdata"));
+
+static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
+  cl::Hidden, cl::init(false),
+  cl::desc("Trace global value placement"));
+
+// TraceGVPlacement controls messages for all builds. For builds with assertions
+// (debug or release), messages are also controlled by the usual debug flags
+// (e.g. -debug and -debug-only=globallayout)
+#define TRACE_TO(s, X) s << X
+#ifdef NDEBUG
+#define TRACE(X) do { if (TraceGVPlacement) { TRACE_TO(errs(), X); } } while (0)
+#else
+#define TRACE(X) \
+  do { \
+    if (TraceGVPlacement) { TRACE_TO(errs(), X); } \
+    else { DEBUG( TRACE_TO(dbgs(), X) ); } \
+  } while (0)
+#endif
+
+// Returns true if the section name is such that the symbol will be put
+// in a small data section.
+// For instance, global variables with section attributes such as ".sdata"
+// ".sdata.*", ".sbss", and ".sbss.*" will go into small data.
+static bool isSmallDataSection(StringRef Sec) {
+  // sectionName is either ".sdata" or ".sbss". Looking for an exact match
+  // obviates the need for checks for section names such as ".sdatafoo".
+  if (Sec.equals(".sdata") || Sec.equals(".sbss") || Sec.equals(".scommon"))
+    return true;
+  // If either ".sdata." or ".sbss." is a substring of the section name
+  // then put the symbol in small data.
+  return Sec.find(".sdata.") != StringRef::npos ||
+         Sec.find(".sbss.") != StringRef::npos ||
+         Sec.find(".scommon.") != StringRef::npos;
+}
+
+
+static const char *getSectionSuffixForSize(unsigned Size) {
+  switch (Size) {
+  default:
+    return "";
+  case 1:
+    return ".1";
+  case 2:
+    return ".2";
+  case 4:
+    return ".4";
+  case 8:
+    return ".8";
+  }
+}
+
+void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
+      const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC |
+                               ELF::SHF_HEX_GPREL);
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC |
+                               ELF::SHF_HEX_GPREL);
+}
+
+MCSection *HexagonTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  TRACE("[SelectSectionForGlobal] GO(" << GO->getName() << ") ");
+  TRACE("input section(" << GO->getSection() << ") ");
+
+  TRACE((GO->hasPrivateLinkage() ? "private_linkage " : "")
+         << (GO->hasLocalLinkage() ? "local_linkage " : "")
+         << (GO->hasInternalLinkage() ? "internal " : "")
+         << (GO->hasExternalLinkage() ? "external " : "")
+         << (GO->hasCommonLinkage() ? "common_linkage " : "")
+         << (GO->hasCommonLinkage() ? "common " : "" )
+         << (Kind.isCommon() ? "kind_common " : "" )
+         << (Kind.isBSS() ? "kind_bss " : "" )
+         << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+  if (isGlobalInSmallSection(GO, TM))
+    return selectSmallSectionForGlobal(GO, Kind, TM);
+
+  if (Kind.isCommon()) {
+    // This is purely for LTO+Linker Script because commons don't really have a
+    // section. However, the BitcodeSectionWriter pass will query for the
+    // sections of commons (and the linker expects us to know their section) so
+    // we'll return one here.
+    return BSSSection;
+  }
+
+  TRACE("default_ELF_section\n");
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  TRACE("[getExplicitSectionGlobal] GO(" << GO->getName() << ") from("
+        << GO->getSection() << ") ");
+  TRACE((GO->hasPrivateLinkage() ? "private_linkage " : "")
+         << (GO->hasLocalLinkage() ? "local_linkage " : "")
+         << (GO->hasInternalLinkage() ? "internal " : "")
+         << (GO->hasExternalLinkage() ? "external " : "")
+         << (GO->hasCommonLinkage() ? "common_linkage " : "")
+         << (GO->hasCommonLinkage() ? "common " : "" )
+         << (Kind.isCommon() ? "kind_common " : "" )
+         << (Kind.isBSS() ? "kind_bss " : "" )
+         << (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+
+  if (GO->hasSection()) {
+    StringRef Section = GO->getSection();
+    if (Section.find(".access.text.group") != StringRef::npos)
+      return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
+                                        ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+    if (Section.find(".access.data.group") != StringRef::npos)
+      return getContext().getELFSection(GO->getSection(), ELF::SHT_PROGBITS,
+                                        ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  }
+
+  if (isGlobalInSmallSection(GO, TM))
+    return selectSmallSectionForGlobal(GO, Kind, TM);
+
+  // Otherwise, we work the same as ELF.
+  TRACE("default_ELF_section\n");
+  return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, Kind, TM);
+}
+
+
+/// Return true if this global value should be placed into small data/bss
+/// section.
+bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
+      const TargetMachine &TM) const {
+  // Only global variables, not functions.
+  DEBUG(dbgs() << "Checking if value is in small-data, -G"
+               << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO);
+  if (!GVar) {
+    DEBUG(dbgs() << "no, not a global variable\n");
+    return false;
+  }
+
+  // Globals with external linkage that have an original section set must be
+  // emitted to that section, regardless of whether we would put them into
+  // small data or not. This is how we can support mixing -G0/-G8 in LTO.
+  if (GVar->hasSection()) {
+    bool IsSmall = isSmallDataSection(GVar->getSection());
+    DEBUG(dbgs() << (IsSmall ? "yes" : "no") << ", has section: "
+                 << GVar->getSection() << '\n');
+    return IsSmall;
+  }
+
+  if (GVar->isConstant()) {
+    DEBUG(dbgs() << "no, is a constant\n");
+    return false;
+  }
+
+  bool IsLocal = GVar->hasLocalLinkage();
+  if (!StaticsInSData && IsLocal) {
+    DEBUG(dbgs() << "no, is static\n");
+    return false;
+  }
+
+  Type *GType = GVar->getType();
+  if (PointerType *PT = dyn_cast<PointerType>(GType))
+    GType = PT->getElementType();
+
+  if (isa<ArrayType>(GType)) {
+    DEBUG(dbgs() << "no, is an array\n");
+    return false;
+  }
+
+  // If the type is a struct with no body provided, treat is conservatively.
+  // There cannot be actual definitions of object of such a type in this CU
+  // (only references), so assuming that they are not in sdata is safe. If
+  // these objects end up in the sdata, the references will still be valid.
+  if (StructType *ST = dyn_cast<StructType>(GType)) {
+    if (ST->isOpaque()) {
+      DEBUG(dbgs() << "no, has opaque type\n");
+      return false;
+    }
+  }
+
+  unsigned Size = GVar->getParent()->getDataLayout().getTypeAllocSize(GType);
+  if (Size == 0) {
+    DEBUG(dbgs() << "no, has size 0\n");
+    return false;
+  }
+  if (Size > SmallDataThreshold) {
+    DEBUG(dbgs() << "no, size exceeds sdata threshold: " << Size << '\n');
+    return false;
+  }
+
+  DEBUG(dbgs() << "yes\n");
+  return true;
+}
+
+
+bool HexagonTargetObjectFile::isSmallDataEnabled() const {
+  return SmallDataThreshold > 0;
+}
+
+
+unsigned HexagonTargetObjectFile::getSmallDataSize() const {
+  return SmallDataThreshold;
+}
+
+
+/// Descends any type down to "elementary" components,
+/// discovering the smallest addressable one.
+/// If zero is returned, declaration will not be modified.
+unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
+      const GlobalValue *GV, const TargetMachine &TM) const {
+  // Assign the smallest element access size to the highest
+  // value which assembler can handle.
+  unsigned SmallestElement = 8;
+
+  if (!Ty)
+    return 0;
+  switch (Ty->getTypeID()) {
+  case Type::StructTyID: {
+    const StructType *STy = cast<const StructType>(Ty);
+    for (auto &E : STy->elements()) {
+      unsigned AtomicSize = getSmallestAddressableSize(E, GV, TM);
+      if (AtomicSize < SmallestElement)
+        SmallestElement = AtomicSize;
+    }
+    return (STy->getNumElements() == 0) ? 0 : SmallestElement;
+  }
+  case Type::ArrayTyID: {
+    const ArrayType *ATy = cast<const ArrayType>(Ty);
+    return getSmallestAddressableSize(ATy->getElementType(), GV, TM);
+  }
+  case Type::VectorTyID: {
+    const VectorType *PTy = cast<const VectorType>(Ty);
+    return getSmallestAddressableSize(PTy->getElementType(), GV, TM);
+  }
+  case Type::PointerTyID:
+  case Type::HalfTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::IntegerTyID: {
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+    // It is unfortunate that DL's function take non-const Type*.
+    return DL.getTypeAllocSize(const_cast<Type*>(Ty));
+  }
+  case Type::FunctionTyID:
+  case Type::VoidTyID:
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+  case Type::MetadataTyID:
+  case Type::X86_MMXTyID:
+  case Type::TokenTyID:
+    return 0;
+  }
+
+  return 0;
+}
+
+MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  const Type *GTy = GO->getType()->getElementType();
+  unsigned Size = getSmallestAddressableSize(GTy, GO, TM);
+
+  // If we have -ffunction-section or -fdata-section then we should emit the
+  // global value to a unique section specifically for it... even for sdata.
+  bool EmitUniquedSection = TM.getDataSections();
+
+  TRACE("Small data. Size(" << Size << ")");
+  // Handle Small Section classification here.
+  if (Kind.isBSS() || Kind.isBSSLocal()) {
+    // If -mno-sort-sda is not set, find out smallest accessible entity in
+    // declaration and add it to the section name string.
+    // Note. It does not track the actual usage of the value, only its de-
+    // claration. Also, compiler adds explicit pad fields to some struct
+    // declarations - they are currently counted towards smallest addres-
+    // sable entity.
+    if (NoSmallDataSorting) {
+      TRACE(" default sbss\n");
+      return SmallBSSSection;
+    }
+
+    StringRef Prefix(".sbss");
+    SmallString<128> Name(Prefix);
+    Name.append(getSectionSuffixForSize(Size));
+
+    if (EmitUniquedSection) {
+      Name.append(".");
+      Name.append(GO->getName());
+    }
+    TRACE(" unique sbss(" << Name << ")\n");
+    return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+                ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+  }
+
+  if (Kind.isCommon()) {
+    // This is purely for LTO+Linker Script because commons don't really have a
+    // section. However, the BitcodeSectionWriter pass will query for the
+    // sections of commons (and the linker expects us to know their section) so
+    // we'll return one here.
+    if (NoSmallDataSorting)
+      return BSSSection;
+
+    Twine Name = Twine(".scommon") + getSectionSuffixForSize(Size);
+    TRACE(" small COMMON (" << Name << ")\n");
+
+    return getContext().getELFSection(Name.str(), ELF::SHT_NOBITS,
+                                      ELF::SHF_WRITE | ELF::SHF_ALLOC |
+                                      ELF::SHF_HEX_GPREL);
+  }
+
+  // We could have changed sdata object to a constant... in this
+  // case the Kind could be wrong for it.
+  if (Kind.isMergeableConst()) {
+    TRACE(" const_object_as_data ");
+    const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO);
+    if (GVar->hasSection() && isSmallDataSection(GVar->getSection()))
+      Kind = SectionKind::getData();
+  }
+
+  if (Kind.isData()) {
+    if (NoSmallDataSorting) {
+      TRACE(" default sdata\n");
+      return SmallDataSection;
+    }
+
+    StringRef Prefix(".sdata");
+    SmallString<128> Name(Prefix);
+    Name.append(getSectionSuffixForSize(Size));
+
+    if (EmitUniquedSection) {
+      Name.append(".");
+      Name.append(GO->getName());
+    }
+    TRACE(" unique sdata(" << Name << ")\n");
+    return getContext().getELFSection(Name.str(), ELF::SHT_PROGBITS,
+                ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_HEX_GPREL);
+  }
+
+  TRACE("default ELF section\n");
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
new file mode 100644
index 000000000000..58dff2b95e19
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -0,0 +1,50 @@
+//===-- HexagonTargetObjectFile.h -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+  class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+    MCSection *getExplicitSectionGlobal(const GlobalObject *GO,
+                                        SectionKind Kind,
+                                        const TargetMachine &TM) const override;
+
+    bool isGlobalInSmallSection(const GlobalObject *GO,
+                                const TargetMachine &TM) const;
+
+    bool isSmallDataEnabled() const;
+
+    unsigned getSmallDataSize() const;
+
+  private:
+    MCSectionELF *SmallDataSection;
+    MCSectionELF *SmallBSSSection;
+
+    unsigned getSmallestAddressableSize(const Type *Ty, const GlobalValue *GV,
+        const TargetMachine &TM) const;
+
+    MCSection *selectSmallSectionForGlobal(const GlobalObject *GO,
+                                           SectionKind Kind,
+                                           const TargetMachine &TM) const;
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
new file mode 100644
index 000000000000..e19c404450e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -0,0 +1,31 @@
+//===-- HexagonTargetStreamer.h - Hexagon Target Streamer ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONTARGETSTREAMER_H
+#define HEXAGONTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class HexagonTargetStreamer : public MCTargetStreamer {
+public:
+  HexagonTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+  virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                 unsigned MaxBytesToEmit = 0){};
+  virtual void emitFAlign(unsigned Size, unsigned MaxBytesToEmit){};
+  virtual void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                                      unsigned ByteAlignment,
+                                      unsigned AccessGranularity){};
+  virtual void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                                           unsigned ByteAlign,
+                                           unsigned AccessGranularity){};
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
new file mode 100644
index 000000000000..d578bfab3658
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -0,0 +1,71 @@
+//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetTransformInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagontti"
+
+TargetTransformInfo::PopcntSupportKind
+HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
+  // Return Fast Hardware support as every input  < 64 bits will be promoted
+  // to 64 bits.
+  return TargetTransformInfo::PSK_FastHardware;
+}
+
+// The Hexagon target can unroll loops with run-time trip counts.
+void HexagonTTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  UP.Runtime = UP.Partial = true;
+}
+
+unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const {
+  return vector ? 0 : 32;
+}
+
+unsigned HexagonTTIImpl::getPrefetchDistance() const {
+  return getST()->getL1PrefetchDistance();
+}
+
+unsigned HexagonTTIImpl::getCacheLineSize() const {
+  return getST()->getL1CacheLineSize();
+}
+
+int HexagonTTIImpl::getUserCost(const User *U) {
+  auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool {
+    if (!CI->isIntegerCast())
+      return false;
+    const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
+    // Technically, this code could allow multiple uses of the load, and
+    // check if all the uses are the same extension operation, but this
+    // should be sufficient for most cases.
+    if (!LI || !LI->hasOneUse())
+      return false;
+
+    // Only extensions from an integer type shorter than 32-bit to i32
+    // can be folded into the load.
+    unsigned SBW = CI->getSrcTy()->getIntegerBitWidth();
+    unsigned DBW = CI->getDestTy()->getIntegerBitWidth();
+    return DBW == 32 && (SBW < DBW);
+  };
+
+  if (const CastInst *CI = dyn_cast<const CastInst>(U))
+    if (isCastFoldedIntoLoad(CI))
+      return TargetTransformInfo::TCC_Free;
+  return BaseT::getUserCost(U);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
new file mode 100644
index 000000000000..8414bfc4e197
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -0,0 +1,69 @@
+//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// Hexagon target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
+  typedef BasicTTIImplBase<HexagonTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const HexagonSubtarget *ST;
+  const HexagonTargetLowering *TLI;
+
+  const HexagonSubtarget *getST() const { return ST; }
+  const HexagonTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
+
+  // The Hexagon target can unroll loops with run-time trip counts.
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  // L1 cache prefetch.
+  unsigned getPrefetchDistance() const;
+  unsigned getCacheLineSize() const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool vector) const;
+
+  /// @}
+
+  int getUserCost(const User *U);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 000000000000..7b1247d815a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,1683 @@
+//===----- HexagonPacketizer.cpp - vliw packetizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using DFA. The packetizer works on
+// machine basic blocks. For each instruction I in BB, the packetizer consults
+// the DFA to see if machine resources are available to execute I. If so, the
+// packetizer checks if I depends on any instruction J in the current packet.
+// If no dependency is found, I is added to current packet and machine resource
+// is marked as taken. If any dependency is found, a target API call is made to
+// prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonVLIWPacketizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden,
+  cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable Hexagon packetizer pass"));
+
+static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
+  cl::ZeroOrMore, cl::Hidden, cl::init(true),
+  cl::desc("Allow non-solo packetization of volatile memory references"));
+
+static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false),
+  cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC"));
+
+static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores",
+  cl::init(false), cl::Hidden, cl::ZeroOrMore,
+  cl::desc("Disable vector double new-value-stores"));
+
+extern cl::opt<bool> ScheduleInlineAsm;
+
+namespace llvm {
+  FunctionPass *createHexagonPacketizer();
+  void initializeHexagonPacketizerPass(PassRegistry&);
+}
+
+
+namespace {
+  class HexagonPacketizer : public MachineFunctionPass {
+  public:
+    static char ID;
+    HexagonPacketizer() : MachineFunctionPass(ID) {
+      initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    StringRef getPassName() const override { return "Hexagon Packetizer"; }
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    const HexagonInstrInfo *HII;
+    const HexagonRegisterInfo *HRI;
+  };
+
+  char HexagonPacketizer::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
+                    false, false)
+
+HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
+      MachineLoopInfo &MLI, AliasAnalysis *AA,
+      const MachineBranchProbabilityInfo *MBPI)
+    : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) {
+  HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+
+  addMutation(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+}
+
+// Check if FirstI modifies a register that SecondI reads.
+static bool hasWriteToReadDep(const MachineInstr &FirstI,
+                              const MachineInstr &SecondI,
+                              const TargetRegisterInfo *TRI) {
+  for (auto &MO : FirstI.operands()) {
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (SecondI.readsRegister(R, TRI))
+      return true;
+  }
+  return false;
+}
+
+
+static MachineBasicBlock::iterator moveInstrOut(MachineInstr &MI,
+      MachineBasicBlock::iterator BundleIt, bool Before) {
+  MachineBasicBlock::instr_iterator InsertPt;
+  if (Before)
+    InsertPt = BundleIt.getInstrIterator();
+  else
+    InsertPt = std::next(BundleIt).getInstrIterator();
+
+  MachineBasicBlock &B = *MI.getParent();
+  // The instruction should at least be bundled with the preceding instruction
+  // (there will always be one, i.e. BUNDLE, if nothing else).
+  assert(MI.isBundledWithPred());
+  if (MI.isBundledWithSucc()) {
+    MI.clearFlag(MachineInstr::BundledSucc);
+    MI.clearFlag(MachineInstr::BundledPred);
+  } else {
+    // If it's not bundled with the successor (i.e. it is the last one
+    // in the bundle), then we can simply unbundle it from the predecessor,
+    // which will take care of updating the predecessor's flag.
+    MI.unbundleFromPred();
+  }
+  B.splice(InsertPt, &B, MI.getIterator());
+
+  // Get the size of the bundle without asserting.
+  MachineBasicBlock::const_instr_iterator I = BundleIt.getInstrIterator();
+  MachineBasicBlock::const_instr_iterator E = B.instr_end();
+  unsigned Size = 0;
+  for (++I; I != E && I->isBundledWithPred(); ++I)
+    ++Size;
+
+  // If there are still two or more instructions, then there is nothing
+  // else to be done.
+  if (Size > 1)
+    return BundleIt;
+
+  // Otherwise, extract the single instruction out and delete the bundle.
+  MachineBasicBlock::iterator NextIt = std::next(BundleIt);
+  MachineInstr &SingleI = *BundleIt->getNextNode();
+  SingleI.unbundleFromPred();
+  assert(!SingleI.isBundledWithSucc());
+  BundleIt->eraseFromParent();
+  return NextIt;
+}
+
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
+  if (DisablePacketizer || skipFunction(*MF.getFunction()))
+    return false;
+
+  HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+  auto &MLI = getAnalysis<MachineLoopInfo>();
+  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+  if (EnableGenAllInsnClass)
+    HII->genAllInsnTimingClasses(MF);
+
+  // Instantiate the packetizer.
+  HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions
+  // These instructions confuse the dependence analysis. Consider:
+  // D0 = ...   (Insn 0)
+  // R0 = KILL R0, D0 (Insn 1)
+  // R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization
+  //
+  for (auto &MB : MF) {
+    auto End = MB.end();
+    auto MI = MB.begin();
+    while (MI != End) {
+      auto NextI = std::next(MI);
+      if (MI->isKill()) {
+        MB.erase(MI);
+        End = MB.end();
+      }
+      MI = NextI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (auto &MB : MF) {
+    auto Begin = MB.begin(), End = MB.end();
+    while (Begin != End) {
+      // First the first non-boundary starting from the end of the last
+      // scheduling region.
+      MachineBasicBlock::iterator RB = Begin;
+      while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
+        ++RB;
+      // First the first boundary starting from the beginning of the new
+      // region.
+      MachineBasicBlock::iterator RE = RB;
+      while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF))
+        ++RE;
+      // Add the scheduling boundary if it's not block end.
+      if (RE != End)
+        ++RE;
+      // If RB == End, then RE == End.
+      if (RB != End)
+        Packetizer.PacketizeMIs(&MB, RB, RE);
+
+      Begin = RE;
+    }
+  }
+
+  Packetizer.unpacketizeSoloInstrs(MF);
+  return true;
+}
+
+
+// Reserve resources for a constant extender. Trigger an assertion if the
+// reservation fails.
+void HexagonPacketizerList::reserveResourcesForConstExt() {
+  if (!tryAllocateResourcesForConstExt(true))
+    llvm_unreachable("Resources not available");
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt() {
+  return tryAllocateResourcesForConstExt(false);
+}
+
+// Allocate resources (i.e. 4 bytes) for constant extender. If succeeded,
+// return true, otherwise, return false.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) {
+  auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc());
+  bool Avail = ResourceTracker->canReserveResources(*ExtMI);
+  if (Reserve && Avail)
+    ResourceTracker->reserveResources(*ExtMI);
+  MF.DeleteMachineInstr(ExtMI);
+  return Avail;
+}
+
+
+bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI,
+      SDep::Kind DepType, unsigned DepReg) {
+  // Check for LR dependence.
+  if (DepReg == HRI->getRARegister())
+    return true;
+
+  if (HII->isDeallocRet(MI))
+    if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister())
+      return true;
+
+  // Check if this is a predicate dependence.
+  const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg);
+  if (RC == &Hexagon::PredRegsRegClass)
+    return true;
+
+  // Assumes that the first operand of the CALLr is the function address.
+  if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) {
+    const MachineOperand MO = MI.getOperand(0);
+    if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg))
+      return true;
+  }
+
+  if (HII->isJumpR(MI)) {
+    const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1)
+                                                     : MI.getOperand(0);
+    assert(MO.isReg() && MO.isUse());
+    if (MO.getReg() == DepReg)
+      return true;
+  }
+  return false;
+}
+
+static bool isRegDependence(const SDep::Kind DepType) {
+  return DepType == SDep::Data || DepType == SDep::Anti ||
+         DepType == SDep::Output;
+}
+
+static bool isDirectJump(const MachineInstr &MI) {
+  return MI.getOpcode() == Hexagon::J2_jump;
+}
+
+static bool isSchedBarrier(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case Hexagon::Y2_barrier:
+    return true;
+  }
+  return false;
+}
+
+static bool isControlFlow(const MachineInstr &MI) {
+  return MI.getDesc().isTerminator() || MI.getDesc().isCall();
+}
+
+
+/// Returns true if the instruction modifies a callee-saved register.
+static bool doesModifyCalleeSavedReg(const MachineInstr &MI,
+                                     const TargetRegisterInfo *TRI) {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  for (auto *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+    if (MI.modifiesRegister(*CSR, TRI))
+      return true;
+  return false;
+}
+
+// Returns true if an instruction can be promoted to .new predicate or
+// new-value store.
+bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI,
+      const TargetRegisterClass *NewRC) {
+  // Vector stores can be predicated, and can be new-value stores, but
+  // they cannot be predicated on a .new predicate value.
+  if (NewRC == &Hexagon::PredRegsRegClass)
+    if (HII->isV60VectorInstruction(MI) && MI.mayStore())
+      return false;
+  return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() ||
+         HII->mayBeNewStore(MI);
+}
+
+// Promote an instructiont to its .cur form.
+// At this time, we have already made a call to canPromoteToDotCur and made
+// sure that it can *indeed* be promoted.
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr &MI,
+      SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+      const TargetRegisterClass* RC) {
+  assert(DepType == SDep::Data);
+  int CurOpcode = HII->getDotCurOp(MI);
+  MI.setDesc(HII->get(CurOpcode));
+  return true;
+}
+
+void HexagonPacketizerList::cleanUpDotCur() {
+  MachineInstr *MI = nullptr;
+  for (auto BI : CurrentPacketMIs) {
+    DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
+    if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) {
+      MI = BI;
+      continue;
+    }
+    if (MI) {
+      for (auto &MO : BI->operands())
+        if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg())
+          return;
+    }
+  }
+  if (!MI)
+    return;
+  // We did not find a use of the CUR, so de-cur it.
+  MI->setDesc(HII->get(Hexagon::V6_vL32b_ai));
+  DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
+}
+
+// Check to see if an instruction can be dot cur.
+bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
+      const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+      const TargetRegisterClass *RC) {
+  if (!HII->isV60VectorInstruction(MI))
+    return false;
+  if (!HII->isV60VectorInstruction(*MII))
+    return false;
+
+  // Already a dot new instruction.
+  if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI))
+    return false;
+
+  if (!HII->mayBeCurLoad(MI))
+    return false;
+
+  // The "cur value" cannot come from inline asm.
+  if (PacketSU->getInstr()->isInlineAsm())
+    return false;
+
+  // Make sure candidate instruction uses cur.
+  DEBUG(dbgs() << "Can we DOT Cur Vector MI\n";
+        MI.dump();
+        dbgs() << "in packet\n";);
+  MachineInstr &MJ = *MII;
+  DEBUG({
+    dbgs() << "Checking CUR against ";
+    MJ.dump();
+  });
+  unsigned DestReg = MI.getOperand(0).getReg();
+  bool FoundMatch = false;
+  for (auto &MO : MJ.operands())
+    if (MO.isReg() && MO.getReg() == DestReg)
+      FoundMatch = true;
+  if (!FoundMatch)
+    return false;
+
+  // Check for existing uses of a vector register within the packet which
+  // would be affected by converting a vector load into .cur formt.
+  for (auto BI : CurrentPacketMIs) {
+    DEBUG(dbgs() << "packet has "; BI->dump(););
+    if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo()))
+      return false;
+  }
+
+  DEBUG(dbgs() << "Can Dot CUR MI\n"; MI.dump(););
+  // We can convert the opcode into a .cur.
+  return true;
+}
+
+// Promote an instruction to its .new form. At this time, we have already
+// made a call to canPromoteToDotNew and made sure that it can *indeed* be
+// promoted.
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr &MI,
+      SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+      const TargetRegisterClass* RC) {
+  assert (DepType == SDep::Data);
+  int NewOpcode;
+  if (RC == &Hexagon::PredRegsRegClass)
+    NewOpcode = HII->getDotNewPredOp(MI, MBPI);
+  else
+    NewOpcode = HII->getDotNewOp(MI);
+  MI.setDesc(HII->get(NewOpcode));
+  return true;
+}
+
+bool HexagonPacketizerList::demoteToDotOld(MachineInstr &MI) {
+  int NewOpcode = HII->getDotOldOp(MI.getOpcode());
+  MI.setDesc(HII->get(NewOpcode));
+  return true;
+}
+
+bool HexagonPacketizerList::useCallersSP(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::S2_storerd_io:
+    case Hexagon::S2_storeri_io:
+    case Hexagon::S2_storerh_io:
+    case Hexagon::S2_storerb_io:
+      break;
+    default:
+      llvm_unreachable("Unexpected instruction");
+  }
+  unsigned FrameSize = MF.getFrameInfo().getStackSize();
+  MachineOperand &Off = MI.getOperand(1);
+  int64_t NewOff = Off.getImm() - (FrameSize + HEXAGON_LRFP_SIZE);
+  if (HII->isValidOffset(Opc, NewOff)) {
+    Off.setImm(NewOff);
+    return true;
+  }
+  return false;
+}
+
+void HexagonPacketizerList::useCalleesSP(MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::S2_storerd_io:
+    case Hexagon::S2_storeri_io:
+    case Hexagon::S2_storerh_io:
+    case Hexagon::S2_storerb_io:
+      break;
+    default:
+      llvm_unreachable("Unexpected instruction");
+  }
+  unsigned FrameSize = MF.getFrameInfo().getStackSize();
+  MachineOperand &Off = MI.getOperand(1);
+  Off.setImm(Off.getImm() + FrameSize + HEXAGON_LRFP_SIZE);
+}
+
+enum PredicateKind {
+  PK_False,
+  PK_True,
+  PK_Unknown
+};
+
+/// Returns true if an instruction is predicated on p0 and false if it's
+/// predicated on !p0.
+static PredicateKind getPredicateSense(const MachineInstr &MI,
+                                       const HexagonInstrInfo *HII) {
+  if (!HII->isPredicated(MI))
+    return PK_Unknown;
+  if (HII->isPredicatedTrue(MI))
+    return PK_True;
+  return PK_False;
+}
+
+static const MachineOperand &getPostIncrementOperand(const MachineInstr &MI,
+      const HexagonInstrInfo *HII) {
+  assert(HII->isPostIncrement(MI) && "Not a post increment operation.");
+#ifndef NDEBUG
+  // Post Increment means duplicates. Use dense map to find duplicates in the
+  // list. Caution: Densemap initializes with the minimum of 64 buckets,
+  // whereas there are at most 5 operands in the post increment.
+  DenseSet<unsigned> DefRegsSet;
+  for (auto &MO : MI.operands())
+    if (MO.isReg() && MO.isDef())
+      DefRegsSet.insert(MO.getReg());
+
+  for (auto &MO : MI.operands())
+    if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg()))
+      return MO;
+#else
+  if (MI.mayLoad()) {
+    const MachineOperand &Op1 = MI.getOperand(1);
+    // The 2nd operand is always the post increment operand in load.
+    assert(Op1.isReg() && "Post increment operand has be to a register.");
+    return Op1;
+  }
+  if (MI.getDesc().mayStore()) {
+    const MachineOperand &Op0 = MI.getOperand(0);
+    // The 1st operand is always the post increment operand in store.
+    assert(Op0.isReg() && "Post increment operand has be to a register.");
+    return Op0;
+  }
+#endif
+  // we should never come here.
+  llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
+}
+
+// Get the value being stored.
+static const MachineOperand& getStoreValueOperand(const MachineInstr &MI) {
+  // value being stored is always the last operand.
+  return MI.getOperand(MI.getNumOperands()-1);
+}
+
+static bool isLoadAbsSet(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::L4_loadrd_ap:
+    case Hexagon::L4_loadrb_ap:
+    case Hexagon::L4_loadrh_ap:
+    case Hexagon::L4_loadrub_ap:
+    case Hexagon::L4_loadruh_ap:
+    case Hexagon::L4_loadri_ap:
+      return true;
+  }
+  return false;
+}
+
+static const MachineOperand &getAbsSetOperand(const MachineInstr &MI) {
+  assert(isLoadAbsSet(MI));
+  return MI.getOperand(1);
+}
+
+
+// Can be new value store?
+// Following restrictions are to be respected in convert a store into
+// a new value store.
+// 1. If an instruction uses auto-increment, its address register cannot
+//    be a new-value register. Arch Spec 5.4.2.1
+// 2. If an instruction uses absolute-set addressing mode, its address
+//    register cannot be a new-value register. Arch Spec 5.4.2.1.
+// 3. If an instruction produces a 64-bit result, its registers cannot be used
+//    as new-value registers. Arch Spec 5.4.2.2.
+// 4. If the instruction that sets the new-value register is conditional, then
+//    the instruction that uses the new-value register must also be conditional,
+//    and both must always have their predicates evaluate identically.
+//    Arch Spec 5.4.2.3.
+// 5. There is an implied restriction that a packet cannot have another store,
+//    if there is a new value store in the packet. Corollary: if there is
+//    already a store in a packet, there can not be a new value store.
+//    Arch Spec: 3.4.4.2
+bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
+      const MachineInstr &PacketMI, unsigned DepReg) {
+  // Make sure we are looking at the store, that can be promoted.
+  if (!HII->mayBeNewStore(MI))
+    return false;
+
+  // Make sure there is dependency and can be new value'd.
+  const MachineOperand &Val = getStoreValueOperand(MI);
+  if (Val.isReg() && Val.getReg() != DepReg)
+    return false;
+
+  const MCInstrDesc& MCID = PacketMI.getDesc();
+
+  // First operand is always the result.
+  const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF);
+  // Double regs can not feed into new value store: PRM section: 5.4.2.2.
+  if (PacketRC == &Hexagon::DoubleRegsRegClass)
+    return false;
+
+  // New-value stores are of class NV (slot 0), dual stores require class ST
+  // in slot 0 (PRM 5.5).
+  for (auto I : CurrentPacketMIs) {
+    SUnit *PacketSU = MIToSUnit.find(I)->second;
+    if (PacketSU->getInstr()->mayStore())
+      return false;
+  }
+
+  // Make sure it's NOT the post increment register that we are going to
+  // new value.
+  if (HII->isPostIncrement(MI) &&
+      getPostIncrementOperand(MI, HII).getReg() == DepReg) {
+    return false;
+  }
+
+  if (HII->isPostIncrement(PacketMI) && PacketMI.mayLoad() &&
+      getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) {
+    // If source is post_inc, or absolute-set addressing, it can not feed
+    // into new value store
+    //   r3 = memw(r2++#4)
+    //   memw(r30 + #-1404) = r2.new -> can not be new value store
+    // arch spec section: 5.4.2.1.
+    return false;
+  }
+
+  if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg)
+    return false;
+
+  // If the source that feeds the store is predicated, new value store must
+  // also be predicated.
+  if (HII->isPredicated(PacketMI)) {
+    if (!HII->isPredicated(MI))
+      return false;
+
+    // Check to make sure that they both will have their predicates
+    // evaluate identically.
+    unsigned predRegNumSrc = 0;
+    unsigned predRegNumDst = 0;
+    const TargetRegisterClass* predRegClass = nullptr;
+
+    // Get predicate register used in the source instruction.
+    for (auto &MO : PacketMI.operands()) {
+      if (!MO.isReg())
+        continue;
+      predRegNumSrc = MO.getReg();
+      predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc);
+      if (predRegClass == &Hexagon::PredRegsRegClass)
+        break;
+    }
+    assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+        "predicate register not found in a predicated PacketMI instruction");
+
+    // Get predicate register used in new-value store instruction.
+    for (auto &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      predRegNumDst = MO.getReg();
+      predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst);
+      if (predRegClass == &Hexagon::PredRegsRegClass)
+        break;
+    }
+    assert((predRegClass == &Hexagon::PredRegsRegClass) &&
+           "predicate register not found in a predicated MI instruction");
+
+    // New-value register producer and user (store) need to satisfy these
+    // constraints:
+    // 1) Both instructions should be predicated on the same register.
+    // 2) If producer of the new-value register is .new predicated then store
+    // should also be .new predicated and if producer is not .new predicated
+    // then store should not be .new predicated.
+    // 3) Both new-value register producer and user should have same predicate
+    // sense, i.e, either both should be negated or both should be non-negated.
+    if (predRegNumDst != predRegNumSrc ||
+        HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) ||
+        getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII))
+      return false;
+  }
+
+  // Make sure that other than the new-value register no other store instruction
+  // register has been modified in the same packet. Predicate registers can be
+  // modified by they should not be modified between the producer and the store
+  // instruction as it will make them both conditional on different values.
+  // We already know this to be true for all the instructions before and
+  // including PacketMI. Howerver, we need to perform the check for the
+  // remaining instructions in the packet.
+
+  unsigned StartCheck = 0;
+
+  for (auto I : CurrentPacketMIs) {
+    SUnit *TempSU = MIToSUnit.find(I)->second;
+    MachineInstr &TempMI = *TempSU->getInstr();
+
+    // Following condition is true for all the instructions until PacketMI is
+    // reached (StartCheck is set to 0 before the for loop).
+    // StartCheck flag is 1 for all the instructions after PacketMI.
+    if (&TempMI != &PacketMI && !StartCheck) // Start processing only after
+      continue;                              // encountering PacketMI.
+
+    StartCheck = 1;
+    if (&TempMI == &PacketMI) // We don't want to check PacketMI for dependence.
+      continue;
+
+    for (auto &MO : MI.operands())
+      if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI))
+        return false;
+  }
+
+  // Make sure that for non-POST_INC stores:
+  // 1. The only use of reg is DepReg and no other registers.
+  //    This handles V4 base+index registers.
+  //    The following store can not be dot new.
+  //    Eg.   r0 = add(r0, #3)
+  //          memw(r1+r0<<#2) = r0
+  if (!HII->isPostIncrement(MI)) {
+    for (unsigned opNum = 0; opNum < MI.getNumOperands()-1; opNum++) {
+      const MachineOperand &MO = MI.getOperand(opNum);
+      if (MO.isReg() && MO.getReg() == DepReg)
+        return false;
+    }
+  }
+
+  // If data definition is because of implicit definition of the register,
+  // do not newify the store. Eg.
+  // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+  // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+  for (auto &MO : PacketMI.operands()) {
+    if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
+      continue;
+    unsigned R = MO.getReg();
+    if (R == DepReg || HRI->isSuperRegister(DepReg, R))
+      return false;
+  }
+
+  // Handle imp-use of super reg case. There is a target independent side
+  // change that should prevent this situation but I am handling it for
+  // just-in-case. For example, we cannot newify R2 in the following case:
+  // %R3<def> = A2_tfrsi 0;
+  // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>;
+  for (auto &MO : MI.operands()) {
+    if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg)
+      return false;
+  }
+
+  // Can be dot new store.
+  return true;
+}
+
+// Can this MI to promoted to either new value store or new value jump.
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
+      const SUnit *PacketSU, unsigned DepReg,
+      MachineBasicBlock::iterator &MII) {
+  if (!HII->mayBeNewStore(MI))
+    return false;
+
+  // Check to see the store can be new value'ed.
+  MachineInstr &PacketMI = *PacketSU->getInstr();
+  if (canPromoteToNewValueStore(MI, PacketMI, DepReg))
+    return true;
+
+  // Check to see the compare/jump can be new value'ed.
+  // This is done as a pass on its own. Don't need to check it here.
+  return false;
+}
+
+static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
+  for (auto &MO : I.operands())
+    if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
+      return true;
+  return false;
+}
+
+// Check to see if an instruction can be dot new
+// There are three kinds.
+// 1. dot new on predicate - V2/V3/V4
+// 2. dot new on stores NV/ST - V4
+// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
+      const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
+      const TargetRegisterClass* RC) {
+  // Already a dot new instruction.
+  if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI))
+    return false;
+
+  if (!isNewifiable(MI, RC))
+    return false;
+
+  const MachineInstr &PI = *PacketSU->getInstr();
+
+  // The "new value" cannot come from inline asm.
+  if (PI.isInlineAsm())
+    return false;
+
+  // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no
+  // sense.
+  if (PI.isImplicitDef())
+    return false;
+
+  // If dependency is trough an implicitly defined register, we should not
+  // newify the use.
+  if (isImplicitDependency(PI, DepReg))
+    return false;
+
+  const MCInstrDesc& MCID = PI.getDesc();
+  const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF);
+  if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass)
+    return false;
+
+  // predicate .new
+  if (RC == &Hexagon::PredRegsRegClass)
+    if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn())
+      return HII->predCanBeUsedAsDotNew(PI, DepReg);
+
+  if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI))
+    return false;
+
+  // Create a dot new machine instruction to see if resources can be
+  // allocated. If not, bail out now.
+  int NewOpcode = HII->getDotNewOp(MI);
+  const MCInstrDesc &D = HII->get(NewOpcode);
+  MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc());
+  bool ResourcesAvailable = ResourceTracker->canReserveResources(*NewMI);
+  MF.DeleteMachineInstr(NewMI);
+  if (!ResourcesAvailable)
+    return false;
+
+  // New Value Store only. New Value Jump generated as a separate pass.
+  if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII))
+    return false;
+
+  return true;
+}
+
+// Go through the packet instructions and search for an anti dependency between
+// them and DepReg from MI. Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+//   b) %P0<def> = C2_or %P3<kill>, %P0<kill>
+//   c) %P3<def> = C2_tfrrp %R23
+//   d) %R1<def> = C2_cmovenewit %P3, 4
+//  }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form
+// Anti-dep between c) and b) is irrelevant for this case
+bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr &MI,
+                                                        unsigned DepReg) {
+  SUnit *PacketSUDep = MIToSUnit.find(&MI)->second;
+
+  for (auto I : CurrentPacketMIs) {
+    // We only care for dependencies to predicated instructions
+    if (!HII->isPredicated(*I))
+      continue;
+
+    // Scheduling Unit for current insn in the packet
+    SUnit *PacketSU = MIToSUnit.find(I)->second;
+
+    // Look at dependencies between current members of the packet and
+    // predicate defining instruction MI. Make sure that dependency is
+    // on the exact register we care about.
+    if (PacketSU->isSucc(PacketSUDep)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        auto &Dep = PacketSU->Succs[i];
+        if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti &&
+            Dep.getReg() == DepReg)
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+
+/// Gets the predicate register of a predicated instruction.
+static unsigned getPredicatedRegister(MachineInstr &MI,
+                                      const HexagonInstrInfo *QII) {
+  /// We use the following rule: The first predicate register that is a use is
+  /// the predicate register of a predicated instruction.
+  assert(QII->isPredicated(MI) && "Must be predicated instruction");
+
+  for (auto &Op : MI.operands()) {
+    if (Op.isReg() && Op.getReg() && Op.isUse() &&
+        Hexagon::PredRegsRegClass.contains(Op.getReg()))
+      return Op.getReg();
+  }
+
+  llvm_unreachable("Unknown instruction operand layout");
+  return 0;
+}
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements.
+bool HexagonPacketizerList::arePredicatesComplements(MachineInstr &MI1,
+                                                     MachineInstr &MI2) {
+  // If we don't know the predicate sense of the instructions bail out early, we
+  // need it later.
+  if (getPredicateSense(MI1, HII) == PK_Unknown ||
+      getPredicateSense(MI2, HII) == PK_Unknown)
+    return false;
+
+  // Scheduling unit for candidate.
+  SUnit *SU = MIToSUnit[&MI1];
+
+  // One corner case deals with the following scenario:
+  // Trying to add
+  // a) %R24<def> = A2_tfrt %P0, %R25
+  // to this packet:
+  // {
+  //   b) %R25<def> = A2_tfrf %P0, %R24
+  //   c) %P0<def> = C2_cmpeqi %R26, 1
+  // }
+  //
+  // On general check a) and b) are complements, but presence of c) will
+  // convert a) to .new form, and then it is not a complement.
+  // We attempt to detect it by analyzing existing dependencies in the packet.
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for Anti dependecy on the same predicate reg as used in the
+  // candidate.
+  for (auto I : CurrentPacketMIs) {
+    // Scheduling Unit for current insn in the packet.
+    SUnit *PacketSU = MIToSUnit.find(I)->second;
+
+    // If this instruction in the packet is succeeded by the candidate...
+    if (PacketSU->isSucc(SU)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        auto Dep = PacketSU->Succs[i];
+        // The corner case exist when there is true data dependency between
+        // candidate and one of current packet members, this dep is on
+        // predicate reg, and there already exist anti dep on the same pred in
+        // the packet.
+        if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data &&
+            Hexagon::PredRegsRegClass.contains(Dep.getReg())) {
+          // Here I know that I is predicate setting instruction with true
+          // data dep to candidate on the register we care about - c) in the
+          // above example. Now I need to see if there is an anti dependency
+          // from c) to any other instruction in the same packet on the pred
+          // reg of interest.
+          if (restrictingDepExistInPacket(*I, Dep.getReg()))
+            return false;
+        }
+      }
+    }
+  }
+
+  // If the above case does not apply, check regular complement condition.
+  // Check that the predicate register is the same and that the predicate
+  // sense is different We also need to differentiate .old vs. .new: !p0
+  // is not complementary to p0.new.
+  unsigned PReg1 = getPredicatedRegister(MI1, HII);
+  unsigned PReg2 = getPredicatedRegister(MI2, HII);
+  return PReg1 == PReg2 &&
+         Hexagon::PredRegsRegClass.contains(PReg1) &&
+         Hexagon::PredRegsRegClass.contains(PReg2) &&
+         getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) &&
+         HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2);
+}
+
+// Initialize packetizer flags.
+void HexagonPacketizerList::initPacketizerState() {
+  Dependence = false;
+  PromotedToDotNew = false;
+  GlueToNewValueJump = false;
+  GlueAllocframeStore = false;
+  FoundSequentialDependence = false;
+}
+
+// Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
+                                                    const MachineBasicBlock *) {
+  if (MI.isDebugValue())
+    return true;
+
+  if (MI.isCFIInstruction())
+    return false;
+
+  // We must print out inline assembly.
+  if (MI.isInlineAsm())
+    return false;
+
+  if (MI.isImplicitDef())
+    return false;
+
+  // We check if MI has any functional units mapped to it. If it doesn't,
+  // we ignore the instruction.
+  const MCInstrDesc& TID = MI.getDesc();
+  auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass());
+  unsigned FuncUnits = IS->getUnits();
+  return !FuncUnits;
+}
+
+bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
+  if (MI.isEHLabel() || MI.isCFIInstruction())
+    return true;
+
+  // Consider inline asm to not be a solo instruction by default.
+  // Inline asm will be put in a packet temporarily, but then it will be
+  // removed, and placed outside of the packet (before or after, depending
+  // on dependencies).  This is to reduce the impact of inline asm as a
+  // "packet splitting" instruction.
+  if (MI.isInlineAsm() && !ScheduleInlineAsm)
+    return true;
+
+  // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
+  // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
+  // They must not be grouped with other instructions in a packet.
+  if (isSchedBarrier(MI))
+    return true;
+
+  if (HII->isSolo(MI))
+    return true;
+
+  if (MI.getOpcode() == Hexagon::A2_nop)
+    return true;
+
+  return false;
+}
+
+
+// Quick check if instructions MI and MJ cannot coexist in the same packet.
+// Limit the tests to be "one-way", e.g.  "if MI->isBranch and MJ->isInlineAsm",
+// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm".
+// For full test call this function twice:
+//   cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI)
+// Doing the test only one way saves the amount of code in this function,
+// since every test would need to be repeated with the MI and MJ reversed.
+static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
+      const HexagonInstrInfo &HII) {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() &&
+      HII.isHVXMemWithAIndirect(MI, MJ))
+    return true;
+
+  // An inline asm cannot be together with a branch, because we may not be
+  // able to remove the asm out after packetizing (i.e. if the asm must be
+  // moved past the bundle).  Similarly, two asms cannot be together to avoid
+  // complications when determining their relative order outside of a bundle.
+  if (MI.isInlineAsm())
+    return MJ.isInlineAsm() || MJ.isBranch() || MJ.isBarrier() ||
+           MJ.isCall() || MJ.isTerminator();
+
+  switch (MI.getOpcode()) {
+  case (Hexagon::S2_storew_locked):
+  case (Hexagon::S4_stored_locked):
+  case (Hexagon::L2_loadw_locked):
+  case (Hexagon::L4_loadd_locked):
+  case (Hexagon::Y4_l2fetch): {
+    // These instructions can only be grouped with ALU32 or non-floating-point
+    // XTYPE instructions.  Since there is no convenient way of identifying fp
+    // XTYPE instructions, only allow grouping with ALU32 for now.
+    unsigned TJ = HII.getType(MJ);
+    if (TJ != HexagonII::TypeALU32)
+      return true;
+    break;
+  }
+  default:
+    break;
+  }
+
+  // "False" really means that the quick check failed to determine if
+  // I and J cannot coexist.
+  return false;
+}
+
+
+// Full, symmetric check.
+bool HexagonPacketizerList::cannotCoexist(const MachineInstr &MI,
+      const MachineInstr &MJ) {
+  return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII);
+}
+
+void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) {
+  for (auto &B : MF) {
+    MachineBasicBlock::iterator BundleIt;
+    MachineBasicBlock::instr_iterator NextI;
+    for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) {
+      NextI = std::next(I);
+      MachineInstr &MI = *I;
+      if (MI.isBundle())
+        BundleIt = I;
+      if (!MI.isInsideBundle())
+        continue;
+
+      // Decide on where to insert the instruction that we are pulling out.
+      // Debug instructions always go before the bundle, but the placement of
+      // INLINE_ASM depends on potential dependencies.  By default, try to
+      // put it before the bundle, but if the asm writes to a register that
+      // other instructions in the bundle read, then we need to place it
+      // after the bundle (to preserve the bundle semantics).
+      bool InsertBeforeBundle;
+      if (MI.isInlineAsm())
+        InsertBeforeBundle = !hasWriteToReadDep(MI, *BundleIt, HRI);
+      else if (MI.isDebugValue())
+        InsertBeforeBundle = true;
+      else
+        continue;
+
+      BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle);
+    }
+  }
+}
+
+// Check if a given instruction is of class "system".
+static bool isSystemInstr(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    case Hexagon::Y2_barrier:
+    case Hexagon::Y2_dcfetchbo:
+      return true;
+  }
+  return false;
+}
+
+bool HexagonPacketizerList::hasDeadDependence(const MachineInstr &I,
+                                              const MachineInstr &J) {
+  // The dependence graph may not include edges between dead definitions,
+  // so without extra checks, we could end up packetizing two instruction
+  // defining the same (dead) register.
+  if (I.isCall() || J.isCall())
+    return false;
+  if (HII->isPredicated(I) || HII->isPredicated(J))
+    return false;
+
+  BitVector DeadDefs(Hexagon::NUM_TARGET_REGS);
+  for (auto &MO : I.operands()) {
+    if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+      continue;
+    DeadDefs[MO.getReg()] = true;
+  }
+
+  for (auto &MO : J.operands()) {
+    if (!MO.isReg() || !MO.isDef() || !MO.isDead())
+      continue;
+    unsigned R = MO.getReg();
+    if (R != Hexagon::USR_OVF && DeadDefs[R])
+      return true;
+  }
+  return false;
+}
+
+bool HexagonPacketizerList::hasControlDependence(const MachineInstr &I,
+                                                 const MachineInstr &J) {
+  // A save callee-save register function call can only be in a packet
+  // with instructions that don't write to the callee-save registers.
+  if ((HII->isSaveCalleeSavedRegsCall(I) &&
+       doesModifyCalleeSavedReg(J, HRI)) ||
+      (HII->isSaveCalleeSavedRegsCall(J) &&
+       doesModifyCalleeSavedReg(I, HRI)))
+    return true;
+
+  // Two control flow instructions cannot go in the same packet.
+  if (isControlFlow(I) && isControlFlow(J))
+    return true;
+
+  // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot
+  // contain a speculative indirect jump,
+  // a new-value compare jump or a dealloc_return.
+  auto isBadForLoopN = [this] (const MachineInstr &MI) -> bool {
+    if (MI.isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI))
+      return true;
+    if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI))
+      return true;
+    return false;
+  };
+
+  if (HII->isLoopN(I) && isBadForLoopN(J))
+    return true;
+  if (HII->isLoopN(J) && isBadForLoopN(I))
+    return true;
+
+  // dealloc_return cannot appear in the same packet as a conditional or
+  // unconditional jump.
+  return HII->isDeallocRet(I) &&
+         (J.isBranch() || J.isCall() || J.isBarrier());
+}
+
+bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
+                                                    const MachineInstr &J) {
+  bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
+  bool StoreI = I.mayStore(), StoreJ = J.mayStore();
+  if ((SysI && StoreJ) || (SysJ && StoreI))
+    return true;
+
+  if (StoreI && StoreJ) {
+    if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I))
+      return true;
+  } else {
+    // A memop cannot be in the same packet with another memop or a store.
+    // Two stores can be together, but here I and J cannot both be stores.
+    bool MopStI = HII->isMemOp(I) || StoreI;
+    bool MopStJ = HII->isMemOp(J) || StoreJ;
+    if (MopStI && MopStJ)
+      return true;
+  }
+
+  return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J));
+}
+
+// SUI is the current instruction that is out side of the current packet.
+// SUJ is the current instruction inside the current packet against which that
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  assert(SUI->getInstr() && SUJ->getInstr());
+  MachineInstr &I = *SUI->getInstr();
+  MachineInstr &J = *SUJ->getInstr();
+
+  // Clear IgnoreDepMIs when Packet starts.
+  if (CurrentPacketMIs.size() == 1)
+    IgnoreDepMIs.clear();
+
+  MachineBasicBlock::iterator II = I.getIterator();
+
+  // Solo instructions cannot go in the packet.
+  assert(!isSoloInstruction(I) && "Unexpected solo instr!");
+
+  if (cannotCoexist(I, J))
+    return false;
+
+  Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J);
+  if (Dependence)
+    return false;
+
+  // V4 allows dual stores. It does not allow second store, if the first
+  // store is not in SLOT0. New value store, new value jump, dealloc_return
+  // and memop always take SLOT0. Arch spec 3.4.4.2.
+  Dependence = hasV4SpecificDependence(I, J);
+  if (Dependence)
+    return false;
+
+  // If an instruction feeds new value jump, glue it.
+  MachineBasicBlock::iterator NextMII = I.getIterator();
+  ++NextMII;
+  if (NextMII != I.getParent()->end() && HII->isNewValueJump(*NextMII)) {
+    MachineInstr &NextMI = *NextMII;
+
+    bool secondRegMatch = false;
+    const MachineOperand &NOp0 = NextMI.getOperand(0);
+    const MachineOperand &NOp1 = NextMI.getOperand(1);
+
+    if (NOp1.isReg() && I.getOperand(0).getReg() == NOp1.getReg())
+      secondRegMatch = true;
+
+    for (auto T : CurrentPacketMIs) {
+      SUnit *PacketSU = MIToSUnit.find(T)->second;
+      MachineInstr &PI = *PacketSU->getInstr();
+      // NVJ can not be part of the dual jump - Arch Spec: section 7.8.
+      if (PI.isCall()) {
+        Dependence = true;
+        break;
+      }
+      // Validate:
+      // 1. Packet does not have a store in it.
+      // 2. If the first operand of the nvj is newified, and the second
+      //    operand is also a reg, it (second reg) is not defined in
+      //    the same packet.
+      // 3. If the second operand of the nvj is newified, (which means
+      //    first operand is also a reg), first reg is not defined in
+      //    the same packet.
+      if (PI.getOpcode() == Hexagon::S2_allocframe || PI.mayStore() ||
+          HII->isLoopN(PI)) {
+        Dependence = true;
+        break;
+      }
+      // Check #2/#3.
+      const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1;
+      if (OpR.isReg() && PI.modifiesRegister(OpR.getReg(), HRI)) {
+        Dependence = true;
+        break;
+      }
+    }
+
+    if (Dependence)
+      return false;
+    GlueToNewValueJump = true;
+  }
+
+  // There no dependency between a prolog instruction and its successor.
+  if (!SUJ->isSucc(SUI))
+    return true;
+
+  for (unsigned i = 0; i < SUJ->Succs.size(); ++i) {
+    if (FoundSequentialDependence)
+      break;
+
+    if (SUJ->Succs[i].getSUnit() != SUI)
+      continue;
+
+    SDep::Kind DepType = SUJ->Succs[i].getKind();
+    // For direct calls:
+    // Ignore register dependences for call instructions for packetization
+    // purposes except for those due to r31 and predicate registers.
+    //
+    // For indirect calls:
+    // Same as direct calls + check for true dependences to the register
+    // used in the indirect call.
+    //
+    // We completely ignore Order dependences for call instructions.
+    //
+    // For returns:
+    // Ignore register dependences for return instructions like jumpr,
+    // dealloc return unless we have dependencies on the explicit uses
+    // of the registers used by jumpr (like r31) or dealloc return
+    // (like r29 or r30).
+    unsigned DepReg = 0;
+    const TargetRegisterClass *RC = nullptr;
+    if (DepType == SDep::Data) {
+      DepReg = SUJ->Succs[i].getReg();
+      RC = HRI->getMinimalPhysRegClass(DepReg);
+    }
+
+    if (I.isCall() || HII->isJumpR(I) || I.isReturn() || HII->isTailCall(I)) {
+      if (!isRegDependence(DepType))
+        continue;
+      if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg()))
+        continue;
+    }
+
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotCur(J, SUJ, DepReg, II, RC))
+        if (promoteToDotCur(J, DepType, II, RC))
+          continue;
+    }
+
+    // Data dpendence ok if we have load.cur.
+    if (DepType == SDep::Data && HII->isDotCurInst(J)) {
+      if (HII->isV60VectorInstruction(I))
+        continue;
+    }
+
+    // For instructions that can be promoted to dot-new, try to promote.
+    if (DepType == SDep::Data) {
+      if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
+        if (promoteToDotNew(I, DepType, II, RC)) {
+          PromotedToDotNew = true;
+          continue;
+        }
+      }
+      if (HII->isNewValueJump(I))
+        continue;
+    }
+
+    // For predicated instructions, if the predicates are complements then
+    // there can be no dependence.
+    if (HII->isPredicated(I) && HII->isPredicated(J) &&
+        arePredicatesComplements(I, J)) {
+      // Not always safe to do this translation.
+      // DAG Builder attempts to reduce dependence edges using transitive
+      // nature of dependencies. Here is an example:
+      //
+      // r0 = tfr_pt ... (1)
+      // r0 = tfr_pf ... (2)
+      // r0 = tfr_pt ... (3)
+      //
+      // There will be an output dependence between (1)->(2) and (2)->(3).
+      // However, there is no dependence edge between (1)->(3). This results
+      // in all 3 instructions going in the same packet. We ignore dependce
+      // only once to avoid this situation.
+      auto Itr = find(IgnoreDepMIs, &J);
+      if (Itr != IgnoreDepMIs.end()) {
+        Dependence = true;
+        return false;
+      }
+      IgnoreDepMIs.push_back(&I);
+      continue;
+    }
+
+    // Ignore Order dependences between unconditional direct branches
+    // and non-control-flow instructions.
+    if (isDirectJump(I) && !J.isBranch() && !J.isCall() &&
+        DepType == SDep::Order)
+      continue;
+
+    // Ignore all dependences for jumps except for true and output
+    // dependences.
+    if (I.isConditionalBranch() && DepType != SDep::Data &&
+        DepType != SDep::Output)
+      continue;
+
+    // Ignore output dependences due to superregs. We can write to two
+    // different subregisters of R1:0 for instance in the same cycle.
+
+    // If neither I nor J defines DepReg, then this is a superfluous output
+    // dependence. The dependence must be of the form:
+    //   R0 = ...
+    //   R1 = ...
+    // and there is an output dependence between the two instructions with
+    // DepReg = D0.
+    // We want to ignore these dependences. Ideally, the dependence
+    // constructor should annotate such dependences. We can then avoid this
+    // relatively expensive check.
+    //
+    if (DepType == SDep::Output) {
+      // DepReg is the register that's responsible for the dependence.
+      unsigned DepReg = SUJ->Succs[i].getReg();
+
+      // Check if I and J really defines DepReg.
+      if (!I.definesRegister(DepReg) && !J.definesRegister(DepReg))
+        continue;
+      FoundSequentialDependence = true;
+      break;
+    }
+
+    // For Order dependences:
+    // 1. On V4 or later, volatile loads/stores can be packetized together,
+    //    unless other rules prevent is.
+    // 2. Store followed by a load is not allowed.
+    // 3. Store followed by a store is only valid on V4 or later.
+    // 4. Load followed by any memory operation is allowed.
+    if (DepType == SDep::Order) {
+      if (!PacketizeVolatiles) {
+        bool OrdRefs = I.hasOrderedMemoryRef() || J.hasOrderedMemoryRef();
+        if (OrdRefs) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+      // J is first, I is second.
+      bool LoadJ = J.mayLoad(), StoreJ = J.mayStore();
+      bool LoadI = I.mayLoad(), StoreI = I.mayStore();
+      if (StoreJ) {
+        // Two stores are only allowed on V4+. Load following store is never
+        // allowed.
+        if (LoadI) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      } else if (!LoadJ || (!LoadI && !StoreI)) {
+        // If J is neither load nor store, assume a dependency.
+        // If J is a load, but I is neither, also assume a dependency.
+        FoundSequentialDependence = true;
+        break;
+      }
+      // Store followed by store: not OK on V2.
+      // Store followed by load: not OK on all.
+      // Load followed by store: OK on all.
+      // Load followed by load: OK on all.
+      continue;
+    }
+
+    // For V4, special case ALLOCFRAME. Even though there is dependency
+    // between ALLOCFRAME and subsequent store, allow it to be packetized
+    // in a same packet. This implies that the store is using the caller's
+    // SP. Hence, offset needs to be updated accordingly.
+    if (DepType == SDep::Data && J.getOpcode() == Hexagon::S2_allocframe) {
+      unsigned Opc = I.getOpcode();
+      switch (Opc) {
+        case Hexagon::S2_storerd_io:
+        case Hexagon::S2_storeri_io:
+        case Hexagon::S2_storerh_io:
+        case Hexagon::S2_storerb_io:
+          if (I.getOperand(0).getReg() == HRI->getStackRegister()) {
+            // Since this store is to be glued with allocframe in the same
+            // packet, it will use SP of the previous stack frame, i.e.
+            // caller's SP. Therefore, we need to recalculate offset
+            // according to this change.
+            GlueAllocframeStore = useCallersSP(I);
+            if (GlueAllocframeStore)
+              continue;
+          }
+        default:
+          break;
+      }
+    }
+
+    // There are certain anti-dependencies that cannot be ignored.
+    // Specifically:
+    //   J2_call ... %R0<imp-def>   ; SUJ
+    //   R0 = ...                   ; SUI
+    // Those cannot be packetized together, since the call will observe
+    // the effect of the assignment to R0.
+    if (DepType == SDep::Anti && J.isCall()) {
+      // Check if I defines any volatile register. We should also check
+      // registers that the call may read, but these happen to be a
+      // subset of the volatile register set.
+      for (const MCPhysReg *P = J.getDesc().ImplicitDefs; P && *P; ++P) {
+        if (!I.modifiesRegister(*P, HRI))
+          continue;
+        FoundSequentialDependence = true;
+        break;
+      }
+    }
+
+    // Skip over remaining anti-dependences. Two instructions that are
+    // anti-dependent can share a packet, since in most such cases all
+    // operands are read before any modifications take place.
+    // The exceptions are branch and call instructions, since they are
+    // executed after all other instructions have completed (at least
+    // conceptually).
+    if (DepType != SDep::Anti) {
+      FoundSequentialDependence = true;
+      break;
+    }
+  }
+
+  if (FoundSequentialDependence) {
+    Dependence = true;
+    return false;
+  }
+
+  return true;
+}
+
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+  assert(SUI->getInstr() && SUJ->getInstr());
+  MachineInstr &I = *SUI->getInstr();
+  MachineInstr &J = *SUJ->getInstr();
+
+  if (cannotCoexist(I, J))
+    return false;
+
+  if (!Dependence)
+    return true;
+
+  // Check if the instruction was promoted to a dot-new. If so, demote it
+  // back into a dot-old.
+  if (PromotedToDotNew)
+    demoteToDotOld(I);
+
+  cleanUpDotCur();
+  // Check if the instruction (must be a store) was glued with an allocframe
+  // instruction. If so, restore its offset to its original value, i.e. use
+  // current SP instead of caller's SP.
+  if (GlueAllocframeStore) {
+    useCalleesSP(I);
+    GlueAllocframeStore = false;
+  }
+  return false;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr &MI) {
+  MachineBasicBlock::iterator MII = MI.getIterator();
+  MachineBasicBlock *MBB = MI.getParent();
+  if (MI.isImplicitDef()) {
+    unsigned R = MI.getOperand(0).getReg();
+    if (Hexagon::IntRegsRegClass.contains(R)) {
+      MCSuperRegIterator S(R, HRI, false);
+      MI.addOperand(MachineOperand::CreateReg(*S, true, true));
+    }
+    return MII;
+  }
+  assert(ResourceTracker->canReserveResources(MI));
+
+  bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
+  bool Good = true;
+
+  if (GlueToNewValueJump) {
+    MachineInstr &NvjMI = *++MII;
+    // We need to put both instructions in the same packet: MI and NvjMI.
+    // Either of them can require a constant extender. Try to add both to
+    // the current packet, and if that fails, end the packet and start a
+    // new one.
+    ResourceTracker->reserveResources(MI);
+    if (ExtMI)
+      Good = tryAllocateResourcesForConstExt(true);
+
+    bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI);
+    if (Good) {
+      if (ResourceTracker->canReserveResources(NvjMI))
+        ResourceTracker->reserveResources(NvjMI);
+      else
+        Good = false;
+    }
+    if (Good && ExtNvjMI)
+      Good = tryAllocateResourcesForConstExt(true);
+
+    if (!Good) {
+      endPacket(MBB, MI);
+      assert(ResourceTracker->canReserveResources(MI));
+      ResourceTracker->reserveResources(MI);
+      if (ExtMI) {
+        assert(canReserveResourcesForConstExt());
+        tryAllocateResourcesForConstExt(true);
+      }
+      assert(ResourceTracker->canReserveResources(NvjMI));
+      ResourceTracker->reserveResources(NvjMI);
+      if (ExtNvjMI) {
+        assert(canReserveResourcesForConstExt());
+        reserveResourcesForConstExt();
+      }
+    }
+    CurrentPacketMIs.push_back(&MI);
+    CurrentPacketMIs.push_back(&NvjMI);
+    return MII;
+  }
+
+  ResourceTracker->reserveResources(MI);
+  if (ExtMI && !tryAllocateResourcesForConstExt(true)) {
+    endPacket(MBB, MI);
+    if (PromotedToDotNew)
+      demoteToDotOld(MI);
+    if (GlueAllocframeStore) {
+      useCalleesSP(MI);
+      GlueAllocframeStore = false;
+    }
+    ResourceTracker->reserveResources(MI);
+    reserveResourcesForConstExt();
+  }
+
+  CurrentPacketMIs.push_back(&MI);
+  return MII;
+}
+
+void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator MI) {
+  OldPacketMIs = CurrentPacketMIs;
+  VLIWPacketizerList::endPacket(MBB, MI);
+}
+
+bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
+  return !producesStall(MI);
+}
+
+
+// Return true when ConsMI uses a register defined by ProdMI.
+static bool isDependent(const MachineInstr &ProdMI,
+      const MachineInstr &ConsMI) {
+  if (!ProdMI.getOperand(0).isReg())
+    return false;
+  unsigned DstReg = ProdMI.getOperand(0).getReg();
+
+  for (auto &Op : ConsMI.operands())
+    if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg)
+      // The MIs depend on each other.
+      return true;
+
+  return false;
+}
+
+// V60 forward scheduling.
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+  // Check whether the previous packet is in a different loop. If this is the
+  // case, there is little point in trying to avoid a stall because that would
+  // favor the rare case (loop entry) over the common case (loop iteration).
+  //
+  // TODO: We should really be able to check all the incoming edges if this is
+  // the first packet in a basic block, so we can avoid stalls from the loop
+  // backedge.
+  if (!OldPacketMIs.empty()) {
+    auto *OldBB = OldPacketMIs.front()->getParent();
+    auto *ThisBB = I.getParent();
+    if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
+      return false;
+  }
+
+  // Check for stall between two vector instructions.
+  if (HII->isV60VectorInstruction(I)) {
+    for (auto J : OldPacketMIs) {
+      if (!HII->isV60VectorInstruction(*J))
+        continue;
+      if (isDependent(*J, I) && !HII->isVecUsableNextPacket(*J, I))
+        return true;
+    }
+    return false;
+  }
+
+  // Check for stall between two scalar instructions. First, check that
+  // there is no definition of a use in the current packet, because it
+  // may be a candidate for .new.
+  for (auto J : CurrentPacketMIs)
+    if (!HII->isV60VectorInstruction(*J) && isDependent(*J, I))
+      return false;
+
+  // Check for stall between I and instructions in the previous packet.
+  if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
+    for (auto J : OldPacketMIs) {
+      if (HII->isV60VectorInstruction(*J))
+        continue;
+      if (!HII->isLateInstrFeedsEarlyInstr(*J, I))
+        continue;
+      if (isDependent(*J, I) && !HII->canExecuteInBundle(*J, I))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+  return new HexagonPacketizer();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
new file mode 100644
index 000000000000..b28b926ec300
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -0,0 +1,117 @@
+#ifndef HEXAGONVLIWPACKETIZER_H
+#define HEXAGONVLIWPACKETIZER_H
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+namespace llvm {
+class HexagonPacketizerList : public VLIWPacketizerList {
+  // Vector of instructions assigned to the packet that has just been created.
+  std::vector<MachineInstr*> OldPacketMIs;
+
+  // Has the instruction been promoted to a dot-new instruction.
+  bool PromotedToDotNew;
+
+  // Has the instruction been glued to allocframe.
+  bool GlueAllocframeStore;
+
+  // Has the feeder instruction been glued to new value jump.
+  bool GlueToNewValueJump;
+
+  // Check if there is a dependence between some instruction already in this
+  // packet and this instruction.
+  bool Dependence;
+
+  // Only check for dependence if there are resources available to
+  // schedule this instruction.
+  bool FoundSequentialDependence;
+
+  // Track MIs with ignored dependence.
+  std::vector<MachineInstr*> IgnoreDepMIs;
+
+protected:
+  /// \brief A handle to the branch probability pass.
+  const MachineBranchProbabilityInfo *MBPI;
+  const MachineLoopInfo *MLI;
+
+private:
+  const HexagonInstrInfo *HII;
+  const HexagonRegisterInfo *HRI;
+
+public:
+  // Ctor.
+  HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                        AliasAnalysis *AA,
+                        const MachineBranchProbabilityInfo *MBPI);
+
+  // initPacketizerState - initialize some internal flags.
+  void initPacketizerState() override;
+
+  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+  bool ignorePseudoInstruction(const MachineInstr &MI,
+                               const MachineBasicBlock *MBB) override;
+
+  // isSoloInstruction - return true if instruction MI can not be packetized
+  // with any other instruction, which means that MI itself is a packet.
+  bool isSoloInstruction(const MachineInstr &MI) override;
+
+  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+  // together.
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
+
+  // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+  // and SUJ.
+  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
+
+  MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override;
+  void endPacket(MachineBasicBlock *MBB,
+                 MachineBasicBlock::iterator MI) override;
+  bool shouldAddToPacket(const MachineInstr &MI) override;
+
+  void unpacketizeSoloInstrs(MachineFunction &MF);
+
+protected:
+  bool isCallDependent(const MachineInstr &MI, SDep::Kind DepType,
+                       unsigned DepReg);
+  bool promoteToDotCur(MachineInstr &MI, SDep::Kind DepType,
+                       MachineBasicBlock::iterator &MII,
+                       const TargetRegisterClass *RC);
+  bool canPromoteToDotCur(const MachineInstr &MI, const SUnit *PacketSU,
+                          unsigned DepReg, MachineBasicBlock::iterator &MII,
+                          const TargetRegisterClass *RC);
+  void cleanUpDotCur();
+
+  bool promoteToDotNew(MachineInstr &MI, SDep::Kind DepType,
+                       MachineBasicBlock::iterator &MII,
+                       const TargetRegisterClass *RC);
+  bool canPromoteToDotNew(const MachineInstr &MI, const SUnit *PacketSU,
+                          unsigned DepReg, MachineBasicBlock::iterator &MII,
+                          const TargetRegisterClass *RC);
+  bool canPromoteToNewValue(const MachineInstr &MI, const SUnit *PacketSU,
+                            unsigned DepReg, MachineBasicBlock::iterator &MII);
+  bool canPromoteToNewValueStore(const MachineInstr &MI,
+                                 const MachineInstr &PacketMI, unsigned DepReg);
+  bool demoteToDotOld(MachineInstr &MI);
+  bool useCallersSP(MachineInstr &MI);
+  void useCalleesSP(MachineInstr &MI);
+  bool arePredicatesComplements(MachineInstr &MI1, MachineInstr &MI2);
+  bool restrictingDepExistInPacket(MachineInstr&, unsigned);
+  bool isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC);
+  bool isCurifiable(MachineInstr &MI);
+  bool cannotCoexist(const MachineInstr &MI, const MachineInstr &MJ);
+  inline bool isPromotedToDotNew() const {
+    return PromotedToDotNew;
+  }
+  bool tryAllocateResourcesForConstExt(bool Reserve);
+  bool canReserveResourcesForConstExt();
+  void reserveResourcesForConstExt();
+  bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
+  bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
+  bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
+  bool producesStall(const MachineInstr &MI);
+};
+} // namespace llvm
+#endif // HEXAGONVLIWPACKETIZER_H
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
new file mode 100644
index 000000000000..085d4645df06
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -0,0 +1,209 @@
+//===-- HexagonVectorPrint.cpp - Generate vector printing instructions -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds the capability to generate pseudo vector/predicate register
+// printing instructions. These pseudo instructions should be used with the
+// simulator, NEVER on hardware.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-vector-print"
+
+#include "HexagonInstrInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> TraceHexVectorStoresOnly("trace-hex-vector-stores-only",
+  cl::Hidden, cl::ZeroOrMore, cl::init(false),
+  cl::desc("Enables tracing of vector stores"));
+
+namespace llvm {
+
+  FunctionPass *createHexagonVectorPrint();
+  void initializeHexagonVectorPrintPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+class HexagonVectorPrint : public MachineFunctionPass {
+  const HexagonSubtarget *QST;
+  const HexagonInstrInfo *QII;
+  const HexagonRegisterInfo *QRI;
+
+public:
+  static char ID;
+
+  HexagonVectorPrint()
+      : MachineFunctionPass(ID), QST(nullptr), QII(nullptr), QRI(nullptr) {
+    initializeHexagonVectorPrintPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Hexagon VectorPrint pass"; }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+};
+
+char HexagonVectorPrint::ID = 0;
+
+} // end anonymous namespace
+
+static bool isVecReg(unsigned Reg) {
+  return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31)
+      || (Reg >= Hexagon::W0 && Reg <= Hexagon::W15)
+      || (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3);
+}
+
+static std::string getStringReg(unsigned R) {
+  if (R >= Hexagon::V0 && R <= Hexagon::V31) {
+    static const char* S[] = { "20", "21", "22", "23", "24", "25", "26", "27",
+                        "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
+                        "30", "31", "32", "33", "34", "35", "36", "37",
+                        "38", "39", "3a", "3b", "3c", "3d", "3e", "3f"};
+    return S[R-Hexagon::V0];
+  }
+  if (R >= Hexagon::Q0 && R <= Hexagon::Q3) {
+    static const char* S[] = { "00", "01", "02", "03"};
+    return S[R-Hexagon::Q0];
+
+  }
+  llvm_unreachable("valid vreg");
+}
+
+static void addAsmInstr(MachineBasicBlock *MBB, unsigned Reg,
+                        MachineBasicBlock::instr_iterator I,
+                        const DebugLoc &DL, const HexagonInstrInfo *QII,
+                        MachineFunction &Fn) {
+
+  std::string VDescStr = ".long 0x1dffe0" + getStringReg(Reg);
+  const char *cstr = Fn.createExternalSymbolName(VDescStr);
+  unsigned ExtraInfo = InlineAsm::Extra_HasSideEffects;
+  BuildMI(*MBB, I, DL, QII->get(TargetOpcode::INLINEASM))
+    .addExternalSymbol(cstr)
+    .addImm(ExtraInfo);
+}
+
+static bool getInstrVecReg(const MachineInstr &MI, unsigned &Reg) {
+  if (MI.getNumOperands() < 1) return false;
+  // Vec load or compute.
+  if (MI.getOperand(0).isReg() && MI.getOperand(0).isDef()) {
+    Reg = MI.getOperand(0).getReg();
+    if (isVecReg(Reg))
+      return !TraceHexVectorStoresOnly;
+  }
+  // Vec store.
+  if (MI.mayStore() && MI.getNumOperands() >= 3 && MI.getOperand(2).isReg()) {
+    Reg = MI.getOperand(2).getReg();
+    if (isVecReg(Reg))
+      return true;
+  }
+  // Vec store post increment.
+  if (MI.mayStore() && MI.getNumOperands() >= 4 && MI.getOperand(3).isReg()) {
+    Reg = MI.getOperand(3).getReg();
+    if (isVecReg(Reg))
+      return true;
+  }
+  return false;
+}
+
+bool HexagonVectorPrint::runOnMachineFunction(MachineFunction &Fn) {
+  bool Changed = false;
+  QST = &Fn.getSubtarget<HexagonSubtarget>();
+  QRI = QST->getRegisterInfo();
+  QII = QST->getInstrInfo();
+  std::vector<MachineInstr *> VecPrintList;
+  for (auto &MBB : Fn)
+    for (auto &MI : MBB) {
+      if (MI.isBundle()) {
+        MachineBasicBlock::instr_iterator MII = MI.getIterator();
+        for (++MII; MII != MBB.instr_end() && MII->isInsideBundle(); ++MII) {
+          if (MII->getNumOperands() < 1)
+            continue;
+          unsigned Reg = 0;
+          if (getInstrVecReg(*MII, Reg)) {
+            VecPrintList.push_back((&*MII));
+            DEBUG(dbgs() << "Found vector reg inside bundle \n"; MII->dump());
+          }
+        }
+      } else {
+        unsigned Reg = 0;
+        if (getInstrVecReg(MI, Reg)) {
+          VecPrintList.push_back(&MI);
+          DEBUG(dbgs() << "Found vector reg \n"; MI.dump());
+        }
+      }
+    }
+
+  Changed = !VecPrintList.empty();
+  if (!Changed)
+    return Changed;
+
+  for (auto *I : VecPrintList) {
+    DebugLoc DL = I->getDebugLoc();
+    MachineBasicBlock *MBB = I->getParent();
+    DEBUG(dbgs() << "Evaluating V MI\n"; I->dump());
+    unsigned Reg = 0;
+    if (!getInstrVecReg(*I, Reg))
+      llvm_unreachable("Need a vector reg");
+    MachineBasicBlock::instr_iterator MII = I->getIterator();
+    if (I->isInsideBundle()) {
+      DEBUG(dbgs() << "add to end of bundle\n"; I->dump());
+      while (MBB->instr_end() != MII && MII->isInsideBundle())
+        MII++;
+    } else {
+      DEBUG(dbgs() << "add after instruction\n"; I->dump());
+      MII++;
+    }
+    if (MBB->instr_end() == MII)
+      continue;
+
+    if (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) {
+      DEBUG(dbgs() << "adding dump for V" << Reg-Hexagon::V0 << '\n');
+      addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
+    } else if (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) {
+      DEBUG(dbgs() << "adding dump for W" << Reg-Hexagon::W0 << '\n');
+      addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2 + 1,
+                  MII, DL, QII, Fn);
+      addAsmInstr(MBB, Hexagon::V0 + (Reg - Hexagon::W0) * 2,
+                   MII, DL, QII, Fn);
+    } else if (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3) {
+      DEBUG(dbgs() << "adding dump for Q" << Reg-Hexagon::Q0 << '\n');
+      addAsmInstr(MBB, Reg, MII, DL, QII, Fn);
+    } else
+      llvm_unreachable("Bad Vector reg");
+  }
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+INITIALIZE_PASS(HexagonVectorPrint, "hexagon-vector-print",
+  "Hexagon VectorPrint pass", false, false)
+
+FunctionPass *llvm::createHexagonVectorPrint() {
+  return new HexagonVectorPrint();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
new file mode 100644
index 000000000000..c140bd1d7ee2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -0,0 +1,753 @@
+//===-- HexagonAsmBackend.cpp - Hexagon Assembler Backend -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonFixupKinds.h"
+#include "HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <sstream>
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-asm-backend"
+
+static cl::opt<bool> DisableFixup
+  ("mno-fixup", cl::desc("Disable fixing up resolved relocations for Hexagon"));
+
+namespace {
+
+class HexagonAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+  StringRef CPU;
+  mutable uint64_t relaxedCnt;
+  std::unique_ptr <MCInstrInfo> MCII;
+  std::unique_ptr <MCInst *> RelaxTarget;
+  MCInst * Extender;
+
+  void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF,
+                          MCInst &HMB) const {
+    SmallVector<MCFixup, 4> Fixups;
+    SmallString<256> Code;
+    raw_svector_ostream VecOS(Code);
+    E.encodeInstruction(HMB, VecOS, Fixups, RF.getSubtargetInfo());
+
+    // Update the fragment.
+    RF.setInst(HMB);
+    RF.getContents() = Code;
+    RF.getFixups() = Fixups;
+  }
+public:
+  HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
+    OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
+    Extender(nullptr) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createHexagonELFObjectWriter(OS, OSABI, CPU);
+  }
+
+  void setExtender(MCContext &Context) const {
+    if (Extender == nullptr)
+      const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst;
+  }
+
+  MCInst *takeExtender() const {
+    assert(Extender != nullptr);
+    MCInst * Result = Extender;
+    const_cast<HexagonAsmBackend *>(this)->Extender = nullptr;
+    return Result;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return Hexagon::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[Hexagon::NumTargetFixupKinds] = {
+      // This table *must* be in same the order of fixup_* kinds in
+      // HexagonFixupKinds.h.
+      //
+      // namei                          offset  bits  flags
+      { "fixup_Hexagon_B22_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B15_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B7_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_LO16",             0,    32,   0 },
+      { "fixup_Hexagon_HI16",             0,    32,   0 },
+      { "fixup_Hexagon_32",               0,    32,   0 },
+      { "fixup_Hexagon_16",               0,    32,   0 },
+      { "fixup_Hexagon_8",                0,    32,   0 },
+      { "fixup_Hexagon_GPREL16_0",        0,    32,   0 },
+      { "fixup_Hexagon_GPREL16_1",        0,    32,   0 },
+      { "fixup_Hexagon_GPREL16_2",        0,    32,   0 },
+      { "fixup_Hexagon_GPREL16_3",        0,    32,   0 },
+      { "fixup_Hexagon_HL16",             0,    32,   0 },
+      { "fixup_Hexagon_B13_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B9_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B32_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_32_6_X",           0,    32,   0 },
+      { "fixup_Hexagon_B22_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B15_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B13_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B9_PCREL_X",       0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B7_PCREL_X",       0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_16_X",             0,    32,   0 },
+      { "fixup_Hexagon_12_X",             0,    32,   0 },
+      { "fixup_Hexagon_11_X",             0,    32,   0 },
+      { "fixup_Hexagon_10_X",             0,    32,   0 },
+      { "fixup_Hexagon_9_X",              0,    32,   0 },
+      { "fixup_Hexagon_8_X",              0,    32,   0 },
+      { "fixup_Hexagon_7_X",              0,    32,   0 },
+      { "fixup_Hexagon_6_X",              0,    32,   0 },
+      { "fixup_Hexagon_32_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_COPY",             0,    32,   0 },
+      { "fixup_Hexagon_GLOB_DAT",         0,    32,   0 },
+      { "fixup_Hexagon_JMP_SLOT",         0,    32,   0 },
+      { "fixup_Hexagon_RELATIVE",         0,    32,   0 },
+      { "fixup_Hexagon_PLT_B22_PCREL",    0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GOTREL_LO16",      0,    32,   0 },
+      { "fixup_Hexagon_GOTREL_HI16",      0,    32,   0 },
+      { "fixup_Hexagon_GOTREL_32",        0,    32,   0 },
+      { "fixup_Hexagon_GOT_LO16",         0,    32,   0 },
+      { "fixup_Hexagon_GOT_HI16",         0,    32,   0 },
+      { "fixup_Hexagon_GOT_32",           0,    32,   0 },
+      { "fixup_Hexagon_GOT_16",           0,    32,   0 },
+      { "fixup_Hexagon_DTPMOD_32",        0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_LO16",      0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_HI16",      0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_32",        0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_16",        0,    32,   0 },
+      { "fixup_Hexagon_GD_PLT_B22_PCREL", 0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_LD_PLT_B22_PCREL", 0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GD_GOT_LO16",      0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_HI16",      0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_32",        0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_16",        0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_LO16",      0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_HI16",      0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_32",        0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_16",        0,    32,   0 },
+      { "fixup_Hexagon_IE_LO16",          0,    32,   0 },
+      { "fixup_Hexagon_IE_HI16",          0,    32,   0 },
+      { "fixup_Hexagon_IE_32",            0,    32,   0 },
+      { "fixup_Hexagon_IE_16",            0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_LO16",      0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_HI16",      0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_32",        0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_16",        0,    32,   0 },
+      { "fixup_Hexagon_TPREL_LO16",       0,    32,   0 },
+      { "fixup_Hexagon_TPREL_HI16",       0,    32,   0 },
+      { "fixup_Hexagon_TPREL_32",         0,    32,   0 },
+      { "fixup_Hexagon_TPREL_16",         0,    32,   0 },
+      { "fixup_Hexagon_6_PCREL_X",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GOTREL_32_6_X",    0,    32,   0 },
+      { "fixup_Hexagon_GOTREL_16_X",      0,    32,   0 },
+      { "fixup_Hexagon_GOTREL_11_X",      0,    32,   0 },
+      { "fixup_Hexagon_GOT_32_6_X",       0,    32,   0 },
+      { "fixup_Hexagon_GOT_16_X",         0,    32,   0 },
+      { "fixup_Hexagon_GOT_11_X",         0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_32_6_X",    0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_16_X",      0,    32,   0 },
+      { "fixup_Hexagon_DTPREL_11_X",      0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_32_6_X",    0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_16_X",      0,    32,   0 },
+      { "fixup_Hexagon_GD_GOT_11_X",      0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_32_6_X",    0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_16_X",      0,    32,   0 },
+      { "fixup_Hexagon_LD_GOT_11_X",      0,    32,   0 },
+      { "fixup_Hexagon_IE_32_6_X",        0,    32,   0 },
+      { "fixup_Hexagon_IE_16_X",          0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_32_6_X",    0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_16_X",      0,    32,   0 },
+      { "fixup_Hexagon_IE_GOT_11_X",      0,    32,   0 },
+      { "fixup_Hexagon_TPREL_32_6_X",     0,    32,   0 },
+      { "fixup_Hexagon_TPREL_16_X",       0,    32,   0 },
+      { "fixup_Hexagon_TPREL_11_X",       0,    32,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  /// processFixupValue - Target hook to adjust the literal value of a fixup
+  /// if necessary. IsResolved signals whether the caller believes a relocation
+  /// is needed; the target can modify the value. The default does nothing.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    MCFixupKind Kind = Fixup.getKind();
+
+    switch((unsigned)Kind) {
+      default:
+        llvm_unreachable("Unknown Fixup Kind!");
+
+      case fixup_Hexagon_LO16:
+      case fixup_Hexagon_HI16:
+      case fixup_Hexagon_16:
+      case fixup_Hexagon_8:
+      case fixup_Hexagon_GPREL16_0:
+      case fixup_Hexagon_GPREL16_1:
+      case fixup_Hexagon_GPREL16_2:
+      case fixup_Hexagon_GPREL16_3:
+      case fixup_Hexagon_HL16:
+      case fixup_Hexagon_32_6_X:
+      case fixup_Hexagon_16_X:
+      case fixup_Hexagon_12_X:
+      case fixup_Hexagon_11_X:
+      case fixup_Hexagon_10_X:
+      case fixup_Hexagon_9_X:
+      case fixup_Hexagon_8_X:
+      case fixup_Hexagon_7_X:
+      case fixup_Hexagon_6_X:
+      case fixup_Hexagon_COPY:
+      case fixup_Hexagon_GLOB_DAT:
+      case fixup_Hexagon_JMP_SLOT:
+      case fixup_Hexagon_RELATIVE:
+      case fixup_Hexagon_PLT_B22_PCREL:
+      case fixup_Hexagon_GOTREL_LO16:
+      case fixup_Hexagon_GOTREL_HI16:
+      case fixup_Hexagon_GOTREL_32:
+      case fixup_Hexagon_GOT_LO16:
+      case fixup_Hexagon_GOT_HI16:
+      case fixup_Hexagon_GOT_32:
+      case fixup_Hexagon_GOT_16:
+      case fixup_Hexagon_DTPMOD_32:
+      case fixup_Hexagon_DTPREL_LO16:
+      case fixup_Hexagon_DTPREL_HI16:
+      case fixup_Hexagon_DTPREL_32:
+      case fixup_Hexagon_DTPREL_16:
+      case fixup_Hexagon_GD_PLT_B22_PCREL:
+      case fixup_Hexagon_LD_PLT_B22_PCREL:
+      case fixup_Hexagon_GD_GOT_LO16:
+      case fixup_Hexagon_GD_GOT_HI16:
+      case fixup_Hexagon_GD_GOT_32:
+      case fixup_Hexagon_GD_GOT_16:
+      case fixup_Hexagon_LD_GOT_LO16:
+      case fixup_Hexagon_LD_GOT_HI16:
+      case fixup_Hexagon_LD_GOT_32:
+      case fixup_Hexagon_LD_GOT_16:
+      case fixup_Hexagon_IE_LO16:
+      case fixup_Hexagon_IE_HI16:
+      case fixup_Hexagon_IE_32:
+      case fixup_Hexagon_IE_16:
+      case fixup_Hexagon_IE_GOT_LO16:
+      case fixup_Hexagon_IE_GOT_HI16:
+      case fixup_Hexagon_IE_GOT_32:
+      case fixup_Hexagon_IE_GOT_16:
+      case fixup_Hexagon_TPREL_LO16:
+      case fixup_Hexagon_TPREL_HI16:
+      case fixup_Hexagon_TPREL_32:
+      case fixup_Hexagon_TPREL_16:
+      case fixup_Hexagon_GOTREL_32_6_X:
+      case fixup_Hexagon_GOTREL_16_X:
+      case fixup_Hexagon_GOTREL_11_X:
+      case fixup_Hexagon_GOT_32_6_X:
+      case fixup_Hexagon_GOT_16_X:
+      case fixup_Hexagon_GOT_11_X:
+      case fixup_Hexagon_DTPREL_32_6_X:
+      case fixup_Hexagon_DTPREL_16_X:
+      case fixup_Hexagon_DTPREL_11_X:
+      case fixup_Hexagon_GD_GOT_32_6_X:
+      case fixup_Hexagon_GD_GOT_16_X:
+      case fixup_Hexagon_GD_GOT_11_X:
+      case fixup_Hexagon_LD_GOT_32_6_X:
+      case fixup_Hexagon_LD_GOT_16_X:
+      case fixup_Hexagon_LD_GOT_11_X:
+      case fixup_Hexagon_IE_32_6_X:
+      case fixup_Hexagon_IE_16_X:
+      case fixup_Hexagon_IE_GOT_32_6_X:
+      case fixup_Hexagon_IE_GOT_16_X:
+      case fixup_Hexagon_IE_GOT_11_X:
+      case fixup_Hexagon_TPREL_32_6_X:
+      case fixup_Hexagon_TPREL_16_X:
+      case fixup_Hexagon_TPREL_11_X:
+      case fixup_Hexagon_32_PCREL:
+      case fixup_Hexagon_6_PCREL_X:
+      case fixup_Hexagon_23_REG:
+        // These relocations should always have a relocation recorded
+        IsResolved = false;
+        return;
+
+      case fixup_Hexagon_B22_PCREL:
+        //IsResolved = false;
+        break;
+
+      case fixup_Hexagon_B13_PCREL:
+      case fixup_Hexagon_B13_PCREL_X:
+      case fixup_Hexagon_B32_PCREL_X:
+      case fixup_Hexagon_B22_PCREL_X:
+      case fixup_Hexagon_B15_PCREL:
+      case fixup_Hexagon_B15_PCREL_X:
+      case fixup_Hexagon_B9_PCREL:
+      case fixup_Hexagon_B9_PCREL_X:
+      case fixup_Hexagon_B7_PCREL:
+      case fixup_Hexagon_B7_PCREL_X:
+        if (DisableFixup)
+          IsResolved = false;
+        break;
+
+      case FK_Data_1:
+      case FK_Data_2:
+      case FK_Data_4:
+      case FK_PCRel_4:
+      case fixup_Hexagon_32:
+        // Leave these relocations alone as they are used for EH.
+        return;
+    }
+  }
+
+  /// getFixupKindNumBytes - The number of bytes the fixup may change.
+  static unsigned getFixupKindNumBytes(unsigned Kind) {
+    switch (Kind) {
+    default:
+        return 0;
+
+      case FK_Data_1:
+        return 1;
+      case FK_Data_2:
+        return 2;
+      case FK_Data_4:         // this later gets mapped to R_HEX_32
+      case FK_PCRel_4:        // this later gets mapped to R_HEX_32_PCREL
+      case fixup_Hexagon_32:
+      case fixup_Hexagon_B32_PCREL_X:
+      case fixup_Hexagon_B22_PCREL:
+      case fixup_Hexagon_B22_PCREL_X:
+      case fixup_Hexagon_B15_PCREL:
+      case fixup_Hexagon_B15_PCREL_X:
+      case fixup_Hexagon_B13_PCREL:
+      case fixup_Hexagon_B13_PCREL_X:
+      case fixup_Hexagon_B9_PCREL:
+      case fixup_Hexagon_B9_PCREL_X:
+      case fixup_Hexagon_B7_PCREL:
+      case fixup_Hexagon_B7_PCREL_X:
+        return 4;
+    }
+  }
+
+  // Make up for left shift when encoding the operand.
+  static uint64_t adjustFixupValue(MCFixupKind Kind, uint64_t Value) {
+    switch((unsigned)Kind) {
+      default:
+        break;
+
+      case fixup_Hexagon_B7_PCREL:
+      case fixup_Hexagon_B9_PCREL:
+      case fixup_Hexagon_B13_PCREL:
+      case fixup_Hexagon_B15_PCREL:
+      case fixup_Hexagon_B22_PCREL:
+        Value >>= 2;
+        break;
+
+      case fixup_Hexagon_B7_PCREL_X:
+      case fixup_Hexagon_B9_PCREL_X:
+      case fixup_Hexagon_B13_PCREL_X:
+      case fixup_Hexagon_B15_PCREL_X:
+      case fixup_Hexagon_B22_PCREL_X:
+        Value &= 0x3f;
+        break;
+
+      case fixup_Hexagon_B32_PCREL_X:
+        Value >>= 6;
+        break;
+    }
+    return (Value);
+  }
+
+  void HandleFixupError(const int bits, const int align_bits,
+    const int64_t FixupValue, const char *fixupStr) const {
+    // Error: value 1124 out of range: -1024-1023 when resolving
+    // symbol in file xprtsock.S
+    const APInt IntMin = APInt::getSignedMinValue(bits+align_bits);
+    const APInt IntMax = APInt::getSignedMaxValue(bits+align_bits);
+    std::stringstream errStr;
+    errStr << "\nError: value " <<
+      FixupValue <<
+      " out of range: " <<
+      IntMin.getSExtValue() <<
+      "-" <<
+      IntMax.getSExtValue() <<
+      " when resolving " <<
+      fixupStr <<
+      " fixup\n";
+    llvm_unreachable(errStr.str().c_str());
+  }
+
+  /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
+  /// data fragment, at the offset specified by the fixup and following the
+  /// fixup kind as appropriate.
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t FixupValue, bool IsPCRel) const override {
+
+    // When FixupValue is 0 the relocation is external and there
+    // is nothing for us to do.
+    if (!FixupValue) return;
+
+    MCFixupKind Kind = Fixup.getKind();
+    uint64_t Value;
+    uint32_t InstMask;
+    uint32_t Reloc;
+
+    // LLVM gives us an encoded value, we have to convert it back
+    // to a real offset before we can use it.
+    uint32_t Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Kind);
+    assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+    char *InstAddr = Data + Offset;
+
+    Value = adjustFixupValue(Kind, FixupValue);
+    if(!Value)
+      return;
+    int sValue = (int)Value;
+
+    switch((unsigned)Kind) {
+      default:
+        return;
+
+      case fixup_Hexagon_B7_PCREL:
+        if (!(isIntN(7, sValue)))
+          HandleFixupError(7, 2, (int64_t)FixupValue, "B7_PCREL");
+      case fixup_Hexagon_B7_PCREL_X:
+        InstMask = 0x00001f18;  // Word32_B7
+        Reloc = (((Value >> 2) & 0x1f) << 8) |    // Value 6-2 = Target 12-8
+                ((Value & 0x3) << 3);             // Value 1-0 = Target 4-3
+        break;
+
+      case fixup_Hexagon_B9_PCREL:
+        if (!(isIntN(9, sValue)))
+          HandleFixupError(9, 2, (int64_t)FixupValue, "B9_PCREL");
+      case fixup_Hexagon_B9_PCREL_X:
+        InstMask = 0x003000fe;  // Word32_B9
+        Reloc = (((Value >> 7) & 0x3) << 20) |    // Value 8-7 = Target 21-20
+                ((Value & 0x7f) << 1);            // Value 6-0 = Target 7-1
+        break;
+
+        // Since the existing branches that use this relocation cannot be
+        // extended, they should only be fixed up if the target is within range.
+      case fixup_Hexagon_B13_PCREL:
+        if (!(isIntN(13, sValue)))
+          HandleFixupError(13, 2, (int64_t)FixupValue, "B13_PCREL");
+      case fixup_Hexagon_B13_PCREL_X:
+        InstMask = 0x00202ffe;  // Word32_B13
+        Reloc = (((Value >> 12) & 0x1) << 21) |    // Value 12   = Target 21
+                (((Value >> 11) & 0x1) << 13) |    // Value 11   = Target 13
+                ((Value & 0x7ff) << 1);            // Value 10-0 = Target 11-1
+        break;
+
+      case fixup_Hexagon_B15_PCREL:
+        if (!(isIntN(15, sValue)))
+          HandleFixupError(15, 2, (int64_t)FixupValue, "B15_PCREL");
+      case fixup_Hexagon_B15_PCREL_X:
+        InstMask = 0x00df20fe;  // Word32_B15
+        Reloc = (((Value >> 13) & 0x3) << 22) |    // Value 14-13 = Target 23-22
+                (((Value >> 8) & 0x1f) << 16) |    // Value 12-8  = Target 20-16
+                (((Value >> 7) & 0x1)  << 13) |    // Value 7     = Target 13
+                ((Value & 0x7f) << 1);             // Value 6-0   = Target 7-1
+        break;
+
+      case fixup_Hexagon_B22_PCREL:
+        if (!(isIntN(22, sValue)))
+          HandleFixupError(22, 2, (int64_t)FixupValue, "B22_PCREL");
+      case fixup_Hexagon_B22_PCREL_X:
+        InstMask = 0x01ff3ffe;  // Word32_B22
+        Reloc = (((Value >> 13) & 0x1ff) << 16) |  // Value 21-13 = Target 24-16
+                ((Value & 0x1fff) << 1);           // Value 12-0  = Target 13-1
+        break;
+
+      case fixup_Hexagon_B32_PCREL_X:
+        InstMask = 0x0fff3fff;  // Word32_X26
+        Reloc = (((Value >> 14) & 0xfff) << 16) |  // Value 25-14 = Target 27-16
+                (Value & 0x3fff);                  // Value 13-0  = Target 13-0
+        break;
+
+      case FK_Data_1:
+      case FK_Data_2:
+      case FK_Data_4:
+      case fixup_Hexagon_32:
+        InstMask = 0xffffffff;  // Word32
+        Reloc = Value;
+        break;
+    }
+
+    DEBUG(dbgs() << "Name=" << getFixupKindInfo(Kind).Name << "(" <<
+          (unsigned)Kind << ")\n");
+    DEBUG(uint32_t OldData = 0;
+          for (unsigned i = 0; i < NumBytes; i++)
+            OldData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+          dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
+            ": AValue=0x"; dbgs().write_hex(FixupValue) <<
+            ": Offset=" << Offset <<
+            ": Size=" << DataSize <<
+            ": OInst=0x"; dbgs().write_hex(OldData) <<
+            ": Reloc=0x"; dbgs().write_hex(Reloc););
+
+    // For each byte of the fragment that the fixup touches, mask in the
+    // bits from the fixup value. The Value has been "split up" into the
+    // appropriate bitfields above.
+    for (unsigned i = 0; i < NumBytes; i++){
+      InstAddr[i] &= uint8_t(~InstMask >> (i * 8)) & 0xff; // Clear reloc bits
+      InstAddr[i] |= uint8_t(Reloc >> (i * 8)) & 0xff;     // Apply new reloc
+    }
+
+    DEBUG(uint32_t NewData = 0;
+          for (unsigned i = 0; i < NumBytes; i++)
+            NewData |= (InstAddr[i] << (i * 8)) & (0xff << (i * 8));
+          dbgs() << ": NInst=0x"; dbgs().write_hex(NewData) << "\n";);
+  }
+
+  bool isInstRelaxable(MCInst const &HMI) const {
+    const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI);
+    bool Relaxable = false;
+    // Branches and loop-setup insns are handled as necessary by relaxation.
+    if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) ==
+             HexagonII::TypeCOMPOUND &&
+         MCID.isBranch()) ||
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
+         MCID.isBranch()) ||
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
+         HMI.getOpcode() != Hexagon::C4_addipc))
+      if (HexagonMCInstrInfo::isExtendable(*MCII, HMI)) {
+        Relaxable = true;
+        MCOperand const &Operand =
+            HMI.getOperand(HexagonMCInstrInfo::getExtendableOp(*MCII, HMI));
+        if (HexagonMCInstrInfo::mustNotExtend(*Operand.getExpr()))
+          Relaxable = false;
+      }
+
+    return Relaxable;
+  }
+
+  /// MayNeedRelaxation - Check whether the given instruction may need
+  /// relaxation.
+  ///
+  /// \param Inst - The instruction to test.
+  bool mayNeedRelaxation(MCInst const &Inst) const override {
+    return true;
+  }
+
+  /// fixupNeedsRelaxation - Target specific predicate for whether a given
+  /// fixup requires the associated instruction to be relaxed.
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout) const override {
+    MCInst const &MCB = DF->getInst();
+    assert(HexagonMCInstrInfo::isBundle(MCB));
+
+    *RelaxTarget = nullptr;
+    MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction(
+        MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE));
+    bool Relaxable = isInstRelaxable(MCI);
+    if (Relaxable == false)
+      return false;
+    // If we cannot resolve the fixup value, it requires relaxation.
+    if (!Resolved) {
+      switch ((unsigned)Fixup.getKind()) {
+      case fixup_Hexagon_B22_PCREL:
+        // GetFixupCount assumes B22 won't relax
+        LLVM_FALLTHROUGH;
+      default:
+        return false;
+        break;
+      case fixup_Hexagon_B13_PCREL:
+      case fixup_Hexagon_B15_PCREL:
+      case fixup_Hexagon_B9_PCREL:
+      case fixup_Hexagon_B7_PCREL: {
+        if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+          ++relaxedCnt;
+          *RelaxTarget = &MCI;
+          setExtender(Layout.getAssembler().getContext());
+          return true;
+        } else {
+          return false;
+        }
+        break;
+      }
+      }
+    }
+
+    MCFixupKind Kind = Fixup.getKind();
+    int64_t sValue = Value;
+    int64_t maxValue;
+
+    switch ((unsigned)Kind) {
+    case fixup_Hexagon_B7_PCREL:
+      maxValue = 1 << 8;
+      break;
+    case fixup_Hexagon_B9_PCREL:
+      maxValue = 1 << 10;
+      break;
+    case fixup_Hexagon_B15_PCREL:
+      maxValue = 1 << 16;
+      break;
+    case fixup_Hexagon_B22_PCREL:
+      maxValue = 1 << 23;
+      break;
+    default:
+      maxValue = INT64_MAX;
+      break;
+    }
+
+    bool isFarAway = -maxValue > sValue || sValue > maxValue - 1;
+
+    if (isFarAway) {
+      if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+        ++relaxedCnt;
+        *RelaxTarget = &MCI;
+        setExtender(Layout.getAssembler().getContext());
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /// Simple predicate for targets where !Resolved implies requiring relaxation
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    assert(HexagonMCInstrInfo::isBundle(Inst) &&
+           "Hexagon relaxInstruction only works on bundles");
+
+    Res = HexagonMCInstrInfo::createBundle();
+    // Copy the results into the bundle.
+    bool Update = false;
+    for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+      MCInst &CrntHMI = const_cast<MCInst &>(*I.getInst());
+
+      // if immediate extender needed, add it in
+      if (*RelaxTarget == &CrntHMI) {
+        Update = true;
+        assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) &&
+               "No room to insert extender for relaxation");
+
+        MCInst *HMIx = takeExtender();
+        *HMIx = HexagonMCInstrInfo::deriveExtender(
+                *MCII, CrntHMI,
+                HexagonMCInstrInfo::getExtendableOperand(*MCII, CrntHMI));
+        Res.addOperand(MCOperand::createInst(HMIx));
+        *RelaxTarget = nullptr;
+      }
+      // now copy over the original instruction(the one we may have extended)
+      Res.addOperand(MCOperand::createInst(I.getInst()));
+    }
+    (void)Update;
+    assert(Update && "Didn't find relaxation target");
+  }
+
+  bool writeNopData(uint64_t Count,
+                    MCObjectWriter * OW) const override {
+    static const uint32_t Nopcode  = 0x7f000000, // Hard-coded NOP.
+                          ParseIn  = 0x00004000, // In packet parse-bits.
+                          ParseEnd = 0x0000c000; // End of packet parse-bits.
+
+    while(Count % HEXAGON_INSTR_SIZE) {
+      DEBUG(dbgs() << "Alignment not a multiple of the instruction size:" <<
+          Count % HEXAGON_INSTR_SIZE << "/" << HEXAGON_INSTR_SIZE << "\n");
+      --Count;
+      OW->write8(0);
+    }
+
+    while(Count) {
+      Count -= HEXAGON_INSTR_SIZE;
+      // Close the packet whenever a multiple of the maximum packet size remains
+      uint32_t ParseBits = (Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE))?
+                           ParseIn: ParseEnd;
+      OW->write32(Nopcode | ParseBits);
+    }
+    return true;
+  }
+
+  void finishLayout(MCAssembler const &Asm,
+                    MCAsmLayout &Layout) const override {
+    for (auto I : Layout.getSectionOrder()) {
+      auto &Fragments = I->getFragmentList();
+      for (auto &J : Fragments) {
+        switch (J.getKind()) {
+        default:
+          break;
+        case MCFragment::FT_Align: {
+          auto Size = Asm.computeFragmentSize(Layout, J);
+          for (auto K = J.getIterator();
+               K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) {
+            --K;
+            switch (K->getKind()) {
+            default:
+              break;
+            case MCFragment::FT_Align: {
+              // Don't pad before other alignments
+              Size = 0;
+              break;
+            }
+            case MCFragment::FT_Relaxable: {
+              auto &RF = cast<MCRelaxableFragment>(*K);
+              auto &Inst = const_cast<MCInst &>(RF.getInst());
+              while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < 4) {
+                MCInst *Nop = new (Asm.getContext()) MCInst;
+                Nop->setOpcode(Hexagon::A2_nop);
+                Inst.addOperand(MCOperand::createInst(Nop));
+                Size -= 4;
+                if (!HexagonMCChecker(
+                           *MCII, RF.getSubtargetInfo(), Inst, Inst,
+                           *Asm.getContext().getRegisterInfo()).check()) {
+                  Inst.erase(Inst.end() - 1);
+                  Size = 0;
+                }
+              }
+              bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst);
+              //assert(!Error);
+              (void)Error;
+              ReplaceInstruction(Asm.getEmitter(), RF, Inst);
+              Layout.invalidateFragmentsFrom(&RF);
+              Size = 0; // Only look back one instruction
+              break;
+            }
+            }
+          }
+        }
+        }
+      }
+    }
+  }
+};
+} // end anonymous namespace
+
+namespace llvm {
+MCAsmBackend *createHexagonAsmBackend(Target const &T,
+                                      MCRegisterInfo const & /*MRI*/,
+                                      const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options) {
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  return new HexagonAsmBackend(T, OSABI, CPU);
+}
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
new file mode 100644
index 000000000000..4292f6b3faa4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -0,0 +1,308 @@
+//===-- HexagonBaseInfo.h - Top level definitions for Hexagon --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Hexagon target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+
+#include "HexagonMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <stdint.h>
+
+namespace llvm {
+
+/// HexagonII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace HexagonII {
+  // *** The code below must match HexagonInstrFormat*.td *** //
+
+  // Insn types.
+  // *** Must match HexagonInstrFormat*.td ***
+  enum Type {
+    TypePSEUDO  = 0,
+    TypeALU32   = 1,
+    TypeCR      = 2,
+    TypeJR      = 3,
+    TypeJ       = 4,
+    TypeLD      = 5,
+    TypeST      = 6,
+    TypeSYSTEM  = 7,
+    TypeXTYPE   = 8,
+    TypeV4LDST  = 9,
+    TypeNV      = 10,
+    TypeDUPLEX  = 11,
+    TypeCOMPOUND = 12,
+    TypeCVI_FIRST     = 13,
+    TypeCVI_VA        = TypeCVI_FIRST,
+    TypeCVI_VA_DV     = 14,
+    TypeCVI_VX        = 15,
+    TypeCVI_VX_DV     = 16,
+    TypeCVI_VP        = 17,
+    TypeCVI_VP_VS     = 18,
+    TypeCVI_VS        = 19,
+    TypeCVI_VINLANESAT= 20,
+    TypeCVI_VM_LD     = 21,
+    TypeCVI_VM_TMP_LD = 22,
+    TypeCVI_VM_CUR_LD = 23,
+    TypeCVI_VM_VP_LDU = 24,
+    TypeCVI_VM_ST     = 25,
+    TypeCVI_VM_NEW_ST = 26,
+    TypeCVI_VM_STU    = 27,
+    TypeCVI_HIST      = 28,
+    TypeCVI_LAST      = TypeCVI_HIST,
+    TypePREFIX  = 30, // Such as extenders.
+    TypeENDLOOP = 31  // Such as end of a HW loop.
+  };
+
+  enum SubTarget {
+    HasV2SubT     = 0xf,
+    HasV2SubTOnly = 0x1,
+    NoV2SubT      = 0x0,
+    HasV3SubT     = 0xe,
+    HasV3SubTOnly = 0x2,
+    NoV3SubT      = 0x1,
+    HasV4SubT     = 0xc,
+    NoV4SubT      = 0x3,
+    HasV5SubT     = 0x8,
+    NoV5SubT      = 0x7
+  };
+
+  enum AddrMode {
+    NoAddrMode     = 0,  // No addressing mode
+    Absolute       = 1,  // Absolute addressing mode
+    AbsoluteSet    = 2,  // Absolute set addressing mode
+    BaseImmOffset  = 3,  // Indirect with offset
+    BaseLongOffset = 4,  // Indirect with long offset
+    BaseRegOffset  = 5,  // Indirect with register offset
+    PostInc        = 6   // Post increment addressing mode
+  };
+
+  // MemAccessSize is represented as 1+log2(N) where N is size in bits.
+  enum class MemAccessSize {
+    NoMemAccess = 0,            // Not a memory access instruction.
+    ByteAccess = 1,             // Byte access instruction (memb).
+    HalfWordAccess = 2,         // Half word access instruction (memh).
+    WordAccess = 3,             // Word access instruction (memw).
+    DoubleWordAccess = 4,       // Double word access instruction (memd)
+                    // 5,       // We do not have a 16 byte vector access.
+    Vector64Access = 7,         // 64 Byte vector access instruction (vmem).
+    Vector128Access = 8         // 128 Byte vector access instruction (vmem).
+  };
+
+  // MCInstrDesc TSFlags
+  // *** Must match HexagonInstrFormat*.td ***
+  enum {
+    // This 5-bit field describes the insn type.
+    TypePos  = 0,
+    TypeMask = 0x1f,
+
+    // Solo instructions.
+    SoloPos  = 5,
+    SoloMask = 0x1,
+    // Packed only with A or X-type instructions.
+    SoloAXPos  = 6,
+    SoloAXMask = 0x1,
+    // Only A-type instruction in first slot or nothing.
+    SoloAin1Pos  = 7,
+    SoloAin1Mask = 0x1,
+
+    // Predicated instructions.
+    PredicatedPos  = 8,
+    PredicatedMask = 0x1,
+    PredicatedFalsePos  = 9,
+    PredicatedFalseMask = 0x1,
+    PredicatedNewPos  = 10,
+    PredicatedNewMask = 0x1,
+    PredicateLatePos  = 11,
+    PredicateLateMask = 0x1,
+
+    // New-Value consumer instructions.
+    NewValuePos  = 12,
+    NewValueMask = 0x1,
+    // New-Value producer instructions.
+    hasNewValuePos  = 13,
+    hasNewValueMask = 0x1,
+    // Which operand consumes or produces a new value.
+    NewValueOpPos  = 14,
+    NewValueOpMask = 0x7,
+    // Stores that can become new-value stores.
+    mayNVStorePos  = 17,
+    mayNVStoreMask = 0x1,
+    // New-value store instructions.
+    NVStorePos  = 18,
+    NVStoreMask = 0x1,
+    // Loads that can become current-value loads.
+    mayCVLoadPos  = 19,
+    mayCVLoadMask = 0x1,
+    // Current-value load instructions.
+    CVLoadPos  = 20,
+    CVLoadMask = 0x1,
+
+    // Extendable insns.
+    ExtendablePos  = 21,
+    ExtendableMask = 0x1,
+    // Insns must be extended.
+    ExtendedPos  = 22,
+    ExtendedMask = 0x1,
+    // Which operand may be extended.
+    ExtendableOpPos  = 23,
+    ExtendableOpMask = 0x7,
+    // Signed or unsigned range.
+    ExtentSignedPos  = 26,
+    ExtentSignedMask = 0x1,
+    // Number of bits of range before extending operand.
+    ExtentBitsPos  = 27,
+    ExtentBitsMask = 0x1f,
+    // Alignment power-of-two before extending operand.
+    ExtentAlignPos  = 32,
+    ExtentAlignMask = 0x3,
+
+    // Valid subtargets
+    validSubTargetPos  = 34,
+    validSubTargetMask = 0xf,
+
+    // Addressing mode for load/store instructions.
+    AddrModePos  = 40,
+    AddrModeMask = 0x7,
+    // Access size for load/store instructions.
+    MemAccessSizePos = 43,
+    MemAccesSizeMask = 0xf,
+
+    // Branch predicted taken.
+    TakenPos = 47,
+    TakenMask = 0x1,
+
+    // Floating-point instructions.
+    FPPos  = 48,
+    FPMask = 0x1,
+
+    // New-Value producer-2 instructions.
+    hasNewValuePos2  = 50,
+    hasNewValueMask2 = 0x1,
+
+    // Which operand consumes or produces a new value.
+    NewValueOpPos2  = 51,
+    NewValueOpMask2 = 0x7,
+
+    // Accumulator instructions.
+    AccumulatorPos = 54,
+    AccumulatorMask = 0x1,
+
+    // Complex XU, prevent xu competition by preferring slot3
+    PrefersSlot3Pos = 55,
+    PrefersSlot3Mask = 0x1,
+
+    CofMax1Pos = 60,
+    CofMax1Mask = 0x1
+  };
+
+  // *** The code above must match HexagonInstrFormat*.td *** //
+
+  // Hexagon specific MO operand flag mask.
+  enum HexagonMOTargetFlagVal {
+    //===------------------------------------------------------------------===//
+    // Hexagon Specific MachineOperand flags.
+    MO_NO_FLAG,
+
+    HMOTF_ConstExtended = 1,
+
+    /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
+    /// Used for computing a global address for PIC compilations
+    MO_PCREL,
+
+    /// MO_GOT - Indicates a GOT-relative relocation
+    MO_GOT,
+
+    // Low or high part of a symbol.
+    MO_LO16, MO_HI16,
+
+    // Offset from the base of the SDA.
+    MO_GPREL,
+
+    // MO_GDGOT - indicates GOT relative relocation for TLS
+    // GeneralDynamic method
+    MO_GDGOT,
+
+    // MO_GDPLT - indicates PLT relative relocation for TLS
+    // GeneralDynamic method
+    MO_GDPLT,
+
+    // MO_IE - indicates non PIC relocation for TLS
+    // Initial Executable method
+    MO_IE,
+
+    // MO_IEGOT - indicates PIC relocation for TLS
+    // Initial Executable method
+    MO_IEGOT,
+
+    // MO_TPREL - indicates relocation for TLS
+    // local Executable method
+    MO_TPREL
+  };
+
+  // Hexagon Sub-instruction classes.
+  enum SubInstructionGroup {
+    HSIG_None = 0,
+    HSIG_L1,
+    HSIG_L2,
+    HSIG_S1,
+    HSIG_S2,
+    HSIG_A,
+    HSIG_Compound
+  };
+
+  // Hexagon Compound classes.
+  enum CompoundGroup {
+    HCG_None = 0,
+    HCG_A,
+    HCG_B,
+    HCG_C
+  };
+
+  enum InstParseBits {
+    INST_PARSE_MASK       = 0x0000c000,
+    INST_PARSE_PACKET_END = 0x0000c000,
+    INST_PARSE_LOOP_END   = 0x00008000,
+    INST_PARSE_NOT_END    = 0x00004000,
+    INST_PARSE_DUPLEX     = 0x00000000,
+    INST_PARSE_EXTENDER   = 0x00000000
+  };
+
+  enum InstIClassBits : unsigned {
+    INST_ICLASS_MASK      = 0xf0000000,
+    INST_ICLASS_EXTENDER  = 0x00000000,
+    INST_ICLASS_J_1       = 0x10000000,
+    INST_ICLASS_J_2       = 0x20000000,
+    INST_ICLASS_LD_ST_1   = 0x30000000,
+    INST_ICLASS_LD_ST_2   = 0x40000000,
+    INST_ICLASS_J_3       = 0x50000000,
+    INST_ICLASS_CR        = 0x60000000,
+    INST_ICLASS_ALU32_1   = 0x70000000,
+    INST_ICLASS_XTYPE_1   = 0x80000000,
+    INST_ICLASS_LD        = 0x90000000,
+    INST_ICLASS_ST        = 0xa0000000,
+    INST_ICLASS_ALU32_2   = 0xb0000000,
+    INST_ICLASS_XTYPE_2   = 0xc0000000,
+    INST_ICLASS_XTYPE_3   = 0xd0000000,
+    INST_ICLASS_XTYPE_4   = 0xe0000000,
+    INST_ICLASS_ALU32_3   = 0xf0000000
+  };
+
+} // End namespace HexagonII.
+
+} // End namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
new file mode 100644
index 000000000000..944e235e72f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -0,0 +1,295 @@
+//===-- HexagonELFObjectWriter.cpp - Hexagon Target Descriptions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonFixupKinds.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "hexagon-elf-writer"
+
+using namespace llvm;
+using namespace Hexagon;
+
+namespace {
+
+class HexagonELFObjectWriter : public MCELFObjectTargetWriter {
+private:
+  StringRef CPU;
+
+public:
+  HexagonELFObjectWriter(uint8_t OSABI, StringRef C);
+
+  unsigned getRelocType(MCContext &Ctx, MCValue const &Target,
+                        MCFixup const &Fixup, bool IsPCRel) const override;
+};
+}
+
+HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C)
+    : MCELFObjectTargetWriter(/*Is64bit*/ false, OSABI, ELF::EM_HEXAGON,
+                              /*HasRelocationAddend*/ true),
+      CPU(C) {}
+
+unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
+                                              MCValue const &Target,
+                                              MCFixup const &Fixup,
+                                              bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Variant = Target.getAccessVariant();
+  switch ((unsigned)Fixup.getKind()) {
+  default:
+    report_fatal_error("Unrecognized relocation type");
+    break;
+  case FK_Data_4:
+    switch(Variant) {
+    case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+      return ELF::R_HEX_DTPREL_32;
+    case MCSymbolRefExpr::VariantKind::VK_GOT:
+      return ELF::R_HEX_GOT_32;
+    case MCSymbolRefExpr::VariantKind::VK_GOTREL:
+      return ELF::R_HEX_GOTREL_32;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+      return ELF::R_HEX_GD_GOT_32;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE:
+      return ELF::R_HEX_IE_32;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+      return ELF::R_HEX_IE_GOT_32;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+      return ELF::R_HEX_LD_GOT_32;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_PCREL:
+      return ELF::R_HEX_32_PCREL;
+    case MCSymbolRefExpr::VariantKind::VK_TPREL:
+      return ELF::R_HEX_TPREL_32;
+    case MCSymbolRefExpr::VariantKind::VK_None:
+      return IsPCRel ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
+    default:
+      report_fatal_error("Unrecognized variant type");
+    };
+  case FK_PCRel_4:
+    return ELF::R_HEX_32_PCREL;
+  case FK_Data_2:
+    switch(Variant) {
+    case MCSymbolRefExpr::VariantKind::VK_DTPREL:
+      return ELF::R_HEX_DTPREL_16;
+    case MCSymbolRefExpr::VariantKind::VK_GOT:
+      return ELF::R_HEX_GOT_16;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_GD_GOT:
+      return ELF::R_HEX_GD_GOT_16;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_IE_GOT:
+      return ELF::R_HEX_IE_GOT_16;
+    case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
+      return ELF::R_HEX_LD_GOT_16;
+    case MCSymbolRefExpr::VariantKind::VK_TPREL:
+      return ELF::R_HEX_TPREL_16;
+    case MCSymbolRefExpr::VariantKind::VK_None:
+      return ELF::R_HEX_16;
+    default:
+      report_fatal_error("Unrecognized variant type");
+    };
+  case FK_Data_1:
+    return ELF::R_HEX_8;
+  case fixup_Hexagon_B22_PCREL:
+    return ELF::R_HEX_B22_PCREL;
+  case fixup_Hexagon_B15_PCREL:
+    return ELF::R_HEX_B15_PCREL;
+  case fixup_Hexagon_B7_PCREL:
+    return ELF::R_HEX_B7_PCREL;
+  case fixup_Hexagon_LO16:
+    return ELF::R_HEX_LO16;
+  case fixup_Hexagon_HI16:
+    return ELF::R_HEX_HI16;
+  case fixup_Hexagon_32:
+    return ELF::R_HEX_32;
+  case fixup_Hexagon_16:
+    return ELF::R_HEX_16;
+  case fixup_Hexagon_8:
+    return ELF::R_HEX_8;
+  case fixup_Hexagon_GPREL16_0:
+    return ELF::R_HEX_GPREL16_0;
+  case fixup_Hexagon_GPREL16_1:
+    return ELF::R_HEX_GPREL16_1;
+  case fixup_Hexagon_GPREL16_2:
+    return ELF::R_HEX_GPREL16_2;
+  case fixup_Hexagon_GPREL16_3:
+    return ELF::R_HEX_GPREL16_3;
+  case fixup_Hexagon_HL16:
+    return ELF::R_HEX_HL16;
+  case fixup_Hexagon_B13_PCREL:
+    return ELF::R_HEX_B13_PCREL;
+  case fixup_Hexagon_B9_PCREL:
+    return ELF::R_HEX_B9_PCREL;
+  case fixup_Hexagon_B32_PCREL_X:
+    return ELF::R_HEX_B32_PCREL_X;
+  case fixup_Hexagon_32_6_X:
+    return ELF::R_HEX_32_6_X;
+  case fixup_Hexagon_B22_PCREL_X:
+    return ELF::R_HEX_B22_PCREL_X;
+  case fixup_Hexagon_B15_PCREL_X:
+    return ELF::R_HEX_B15_PCREL_X;
+  case fixup_Hexagon_B13_PCREL_X:
+    return ELF::R_HEX_B13_PCREL_X;
+  case fixup_Hexagon_B9_PCREL_X:
+    return ELF::R_HEX_B9_PCREL_X;
+  case fixup_Hexagon_B7_PCREL_X:
+    return ELF::R_HEX_B7_PCREL_X;
+  case fixup_Hexagon_16_X:
+    return ELF::R_HEX_16_X;
+  case fixup_Hexagon_12_X:
+    return ELF::R_HEX_12_X;
+  case fixup_Hexagon_11_X:
+    return ELF::R_HEX_11_X;
+  case fixup_Hexagon_10_X:
+    return ELF::R_HEX_10_X;
+  case fixup_Hexagon_9_X:
+    return ELF::R_HEX_9_X;
+  case fixup_Hexagon_8_X:
+    return ELF::R_HEX_8_X;
+  case fixup_Hexagon_7_X:
+    return ELF::R_HEX_7_X;
+  case fixup_Hexagon_6_X:
+    return ELF::R_HEX_6_X;
+  case fixup_Hexagon_32_PCREL:
+    return ELF::R_HEX_32_PCREL;
+  case fixup_Hexagon_COPY:
+    return ELF::R_HEX_COPY;
+  case fixup_Hexagon_GLOB_DAT:
+    return ELF::R_HEX_GLOB_DAT;
+  case fixup_Hexagon_JMP_SLOT:
+    return ELF::R_HEX_JMP_SLOT;
+  case fixup_Hexagon_RELATIVE:
+    return ELF::R_HEX_RELATIVE;
+  case fixup_Hexagon_PLT_B22_PCREL:
+    return ELF::R_HEX_PLT_B22_PCREL;
+  case fixup_Hexagon_GOTREL_LO16:
+    return ELF::R_HEX_GOTREL_LO16;
+  case fixup_Hexagon_GOTREL_HI16:
+    return ELF::R_HEX_GOTREL_HI16;
+  case fixup_Hexagon_GOTREL_32:
+    return ELF::R_HEX_GOTREL_32;
+  case fixup_Hexagon_GOT_LO16:
+    return ELF::R_HEX_GOT_LO16;
+  case fixup_Hexagon_GOT_HI16:
+    return ELF::R_HEX_GOT_HI16;
+  case fixup_Hexagon_GOT_32:
+    return ELF::R_HEX_GOT_32;
+  case fixup_Hexagon_GOT_16:
+    return ELF::R_HEX_GOT_16;
+  case fixup_Hexagon_DTPMOD_32:
+    return ELF::R_HEX_DTPMOD_32;
+  case fixup_Hexagon_DTPREL_LO16:
+    return ELF::R_HEX_DTPREL_LO16;
+  case fixup_Hexagon_DTPREL_HI16:
+    return ELF::R_HEX_DTPREL_HI16;
+  case fixup_Hexagon_DTPREL_32:
+    return ELF::R_HEX_DTPREL_32;
+  case fixup_Hexagon_DTPREL_16:
+    return ELF::R_HEX_DTPREL_16;
+  case fixup_Hexagon_GD_PLT_B22_PCREL:
+    return ELF::R_HEX_GD_PLT_B22_PCREL;
+  case fixup_Hexagon_LD_PLT_B22_PCREL:
+    return ELF::R_HEX_LD_PLT_B22_PCREL;
+  case fixup_Hexagon_GD_GOT_LO16:
+    return ELF::R_HEX_GD_GOT_LO16;
+  case fixup_Hexagon_GD_GOT_HI16:
+    return ELF::R_HEX_GD_GOT_HI16;
+  case fixup_Hexagon_GD_GOT_32:
+    return ELF::R_HEX_GD_GOT_32;
+  case fixup_Hexagon_GD_GOT_16:
+    return ELF::R_HEX_GD_GOT_16;
+  case fixup_Hexagon_LD_GOT_LO16:
+    return ELF::R_HEX_LD_GOT_LO16;
+  case fixup_Hexagon_LD_GOT_HI16:
+    return ELF::R_HEX_LD_GOT_HI16;
+  case fixup_Hexagon_LD_GOT_32:
+    return ELF::R_HEX_LD_GOT_32;
+  case fixup_Hexagon_LD_GOT_16:
+    return ELF::R_HEX_LD_GOT_16;
+  case fixup_Hexagon_IE_LO16:
+    return ELF::R_HEX_IE_LO16;
+  case fixup_Hexagon_IE_HI16:
+    return ELF::R_HEX_IE_HI16;
+  case fixup_Hexagon_IE_32:
+    return ELF::R_HEX_IE_32;
+  case fixup_Hexagon_IE_GOT_LO16:
+    return ELF::R_HEX_IE_GOT_LO16;
+  case fixup_Hexagon_IE_GOT_HI16:
+    return ELF::R_HEX_IE_GOT_HI16;
+  case fixup_Hexagon_IE_GOT_32:
+    return ELF::R_HEX_IE_GOT_32;
+  case fixup_Hexagon_IE_GOT_16:
+    return ELF::R_HEX_IE_GOT_16;
+  case fixup_Hexagon_TPREL_LO16:
+    return ELF::R_HEX_TPREL_LO16;
+  case fixup_Hexagon_TPREL_HI16:
+    return ELF::R_HEX_TPREL_HI16;
+  case fixup_Hexagon_TPREL_32:
+    return ELF::R_HEX_TPREL_32;
+  case fixup_Hexagon_TPREL_16:
+    return ELF::R_HEX_TPREL_16;
+  case fixup_Hexagon_6_PCREL_X:
+    return ELF::R_HEX_6_PCREL_X;
+  case fixup_Hexagon_GOTREL_32_6_X:
+    return ELF::R_HEX_GOTREL_32_6_X;
+  case fixup_Hexagon_GOTREL_16_X:
+    return ELF::R_HEX_GOTREL_16_X;
+  case fixup_Hexagon_GOTREL_11_X:
+    return ELF::R_HEX_GOTREL_11_X;
+  case fixup_Hexagon_GOT_32_6_X:
+    return ELF::R_HEX_GOT_32_6_X;
+  case fixup_Hexagon_GOT_16_X:
+    return ELF::R_HEX_GOT_16_X;
+  case fixup_Hexagon_GOT_11_X:
+    return ELF::R_HEX_GOT_11_X;
+  case fixup_Hexagon_DTPREL_32_6_X:
+    return ELF::R_HEX_DTPREL_32_6_X;
+  case fixup_Hexagon_DTPREL_16_X:
+    return ELF::R_HEX_DTPREL_16_X;
+  case fixup_Hexagon_DTPREL_11_X:
+    return ELF::R_HEX_DTPREL_11_X;
+  case fixup_Hexagon_GD_GOT_32_6_X:
+    return ELF::R_HEX_GD_GOT_32_6_X;
+  case fixup_Hexagon_GD_GOT_16_X:
+    return ELF::R_HEX_GD_GOT_16_X;
+  case fixup_Hexagon_GD_GOT_11_X:
+    return ELF::R_HEX_GD_GOT_11_X;
+  case fixup_Hexagon_LD_GOT_32_6_X:
+    return ELF::R_HEX_LD_GOT_32_6_X;
+  case fixup_Hexagon_LD_GOT_16_X:
+    return ELF::R_HEX_LD_GOT_16_X;
+  case fixup_Hexagon_LD_GOT_11_X:
+    return ELF::R_HEX_LD_GOT_11_X;
+  case fixup_Hexagon_IE_32_6_X:
+    return ELF::R_HEX_IE_32_6_X;
+  case fixup_Hexagon_IE_16_X:
+    return ELF::R_HEX_IE_16_X;
+  case fixup_Hexagon_IE_GOT_32_6_X:
+    return ELF::R_HEX_IE_GOT_32_6_X;
+  case fixup_Hexagon_IE_GOT_16_X:
+    return ELF::R_HEX_IE_GOT_16_X;
+  case fixup_Hexagon_IE_GOT_11_X:
+    return ELF::R_HEX_IE_GOT_11_X;
+  case fixup_Hexagon_TPREL_32_6_X:
+    return ELF::R_HEX_TPREL_32_6_X;
+  case fixup_Hexagon_TPREL_16_X:
+    return ELF::R_HEX_TPREL_16_X;
+  case fixup_Hexagon_TPREL_11_X:
+    return ELF::R_HEX_TPREL_11_X;
+  case fixup_Hexagon_23_REG:
+    return ELF::R_HEX_23_REG;
+  }
+}
+
+MCObjectWriter *llvm::createHexagonELFObjectWriter(raw_pwrite_stream &OS,
+                                                   uint8_t OSABI,
+                                                   StringRef CPU) {
+  MCELFObjectTargetWriter *MOTW = new HexagonELFObjectWriter(OSABI, CPU);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian*/ true);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
new file mode 100644
index 000000000000..4c97ebbdd346
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
@@ -0,0 +1,138 @@
+//===-- HexagonFixupKinds.h - Hexagon Specific Fixup Entries --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
+#define LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Hexagon {
+enum Fixups {
+  // Branch fixups for R_HEX_B{22,15,7}_PCREL.
+  fixup_Hexagon_B22_PCREL = FirstTargetFixupKind,
+  fixup_Hexagon_B15_PCREL,
+  fixup_Hexagon_B7_PCREL,
+  fixup_Hexagon_LO16,
+  fixup_Hexagon_HI16,
+  fixup_Hexagon_32,
+  fixup_Hexagon_16,
+  fixup_Hexagon_8,
+  fixup_Hexagon_GPREL16_0,
+  fixup_Hexagon_GPREL16_1,
+  fixup_Hexagon_GPREL16_2,
+  fixup_Hexagon_GPREL16_3,
+  fixup_Hexagon_HL16,
+  fixup_Hexagon_B13_PCREL,
+  fixup_Hexagon_B9_PCREL,
+  fixup_Hexagon_B32_PCREL_X,
+  fixup_Hexagon_32_6_X,
+  fixup_Hexagon_B22_PCREL_X,
+  fixup_Hexagon_B15_PCREL_X,
+  fixup_Hexagon_B13_PCREL_X,
+  fixup_Hexagon_B9_PCREL_X,
+  fixup_Hexagon_B7_PCREL_X,
+  fixup_Hexagon_16_X,
+  fixup_Hexagon_12_X,
+  fixup_Hexagon_11_X,
+  fixup_Hexagon_10_X,
+  fixup_Hexagon_9_X,
+  fixup_Hexagon_8_X,
+  fixup_Hexagon_7_X,
+  fixup_Hexagon_6_X,
+  fixup_Hexagon_32_PCREL,
+  fixup_Hexagon_COPY,
+  fixup_Hexagon_GLOB_DAT,
+  fixup_Hexagon_JMP_SLOT,
+  fixup_Hexagon_RELATIVE,
+  fixup_Hexagon_PLT_B22_PCREL,
+  fixup_Hexagon_GOTREL_LO16,
+  fixup_Hexagon_GOTREL_HI16,
+  fixup_Hexagon_GOTREL_32,
+  fixup_Hexagon_GOT_LO16,
+  fixup_Hexagon_GOT_HI16,
+  fixup_Hexagon_GOT_32,
+  fixup_Hexagon_GOT_16,
+  fixup_Hexagon_DTPMOD_32,
+  fixup_Hexagon_DTPREL_LO16,
+  fixup_Hexagon_DTPREL_HI16,
+  fixup_Hexagon_DTPREL_32,
+  fixup_Hexagon_DTPREL_16,
+  fixup_Hexagon_GD_PLT_B22_PCREL,
+  fixup_Hexagon_LD_PLT_B22_PCREL,
+  fixup_Hexagon_GD_GOT_LO16,
+  fixup_Hexagon_GD_GOT_HI16,
+  fixup_Hexagon_GD_GOT_32,
+  fixup_Hexagon_GD_GOT_16,
+  fixup_Hexagon_LD_GOT_LO16,
+  fixup_Hexagon_LD_GOT_HI16,
+  fixup_Hexagon_LD_GOT_32,
+  fixup_Hexagon_LD_GOT_16,
+  fixup_Hexagon_IE_LO16,
+  fixup_Hexagon_IE_HI16,
+  fixup_Hexagon_IE_32,
+  fixup_Hexagon_IE_16,
+  fixup_Hexagon_IE_GOT_LO16,
+  fixup_Hexagon_IE_GOT_HI16,
+  fixup_Hexagon_IE_GOT_32,
+  fixup_Hexagon_IE_GOT_16,
+  fixup_Hexagon_TPREL_LO16,
+  fixup_Hexagon_TPREL_HI16,
+  fixup_Hexagon_TPREL_32,
+  fixup_Hexagon_TPREL_16,
+  fixup_Hexagon_6_PCREL_X,
+  fixup_Hexagon_GOTREL_32_6_X,
+  fixup_Hexagon_GOTREL_16_X,
+  fixup_Hexagon_GOTREL_11_X,
+  fixup_Hexagon_GOT_32_6_X,
+  fixup_Hexagon_GOT_16_X,
+  fixup_Hexagon_GOT_11_X,
+  fixup_Hexagon_DTPREL_32_6_X,
+  fixup_Hexagon_DTPREL_16_X,
+  fixup_Hexagon_DTPREL_11_X,
+  fixup_Hexagon_GD_GOT_32_6_X,
+  fixup_Hexagon_GD_GOT_16_X,
+  fixup_Hexagon_GD_GOT_11_X,
+  fixup_Hexagon_LD_GOT_32_6_X,
+  fixup_Hexagon_LD_GOT_16_X,
+  fixup_Hexagon_LD_GOT_11_X,
+  fixup_Hexagon_IE_32_6_X,
+  fixup_Hexagon_IE_16_X,
+  fixup_Hexagon_IE_GOT_32_6_X,
+  fixup_Hexagon_IE_GOT_16_X,
+  fixup_Hexagon_IE_GOT_11_X,
+  fixup_Hexagon_TPREL_32_6_X,
+  fixup_Hexagon_TPREL_16_X,
+  fixup_Hexagon_TPREL_11_X,
+  fixup_Hexagon_23_REG,
+
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+enum FixupBitmaps : unsigned {
+  Word8 = 0xff,
+  Word16 = 0xffff,
+  Word32 = 0xffffffff,
+  Word32_LO = 0x00c03fff,
+  Word32_HL = 0x0, // Not Implemented
+  Word32_GP = 0x0, // Not Implemented
+  Word32_B7 = 0x00001f18,
+  Word32_B9 = 0x003000fe,
+  Word32_B13 = 0x00202ffe,
+  Word32_B15 = 0x00df20fe,
+  Word32_B22 = 0x01ff3ffe,
+  Word32_R6 = 0x000007e0,
+  Word32_U6 = 0x0,  // Not Implemented
+  Word32_U16 = 0x0, // Not Implemented
+  Word32_X26 = 0x0fff3fff
+};
+} // namespace Hexagon
+} // namespace llvm
+
+#endif // LLVM_HEXAGON_HEXAGONFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
new file mode 100644
index 000000000000..42fcc5a6aa89
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -0,0 +1,227 @@
+//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonAsmPrinter.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#include "HexagonGenAsmWriter.inc"
+
+HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI,
+                                       MCInstrInfo const &MII,
+                                       MCRegisterInfo const &MRI)
+    : MCInstPrinter(MAI, MII, MRI), MII(MII), HasExtender(false) {
+}
+
+StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return MII.getName(Opcode);
+}
+
+void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  O << getRegName(RegNo);
+}
+
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+  return getRegisterName(RegNo);
+}
+
+void HexagonInstPrinter::setExtender(MCInst const &MCI) {
+  HasExtender = HexagonMCInstrInfo::isImmext(MCI);
+}
+
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                   StringRef Annot, const MCSubtargetInfo &STI) {
+  assert(HexagonMCInstrInfo::isBundle(*MI));
+  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
+  assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
+  HasExtender = false;
+  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
+    MCInst const &MCI = *I.getInst();
+    if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
+      printInstruction(MCI.getOperand(1).getInst(), OS);
+      OS << '\v';
+      HasExtender = false;
+      printInstruction(MCI.getOperand(0).getInst(), OS);
+    } else
+      printInstruction(&MCI, OS);
+    setExtender(MCI);
+    OS << "\n";
+  }
+
+  auto Separator = "";
+  if (HexagonMCInstrInfo::isInnerLoop(*MI)) {
+    OS << Separator;
+    Separator = " ";
+    MCInst ME;
+    ME.setOpcode(Hexagon::ENDLOOP0);
+    printInstruction(&ME, OS);
+  }
+  if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
+    OS << Separator;
+    MCInst ME;
+    ME.setOpcode(Hexagon::ENDLOOP1);
+    printInstruction(&ME, OS);
+  }
+}
+
+void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
+                                      raw_ostream &O) const {
+  if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo &&
+      (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)))
+    O << "#";
+  MCOperand const &MO = MI->getOperand(OpNo);
+  if (MO.isReg()) {
+    O << getRegisterName(MO.getReg());
+  } else if (MO.isExpr()) {
+    int64_t Value;
+    if (MO.getExpr()->evaluateAsAbsolute(Value))
+      O << formatImm(Value);
+    else
+      O << *MO.getExpr();
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI,
+                                                 unsigned OpNo,
+                                                 raw_ostream &O) const {
+  O << MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  O << -MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  O << -1;
+}
+
+void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  int64_t Imm;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  Imm = SignExtend64<9>(Imm);
+  assert(Success); (void)Success;
+  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+  O << formatImm(Imm/64);
+}
+
+void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  int64_t Imm;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  Imm = SignExtend64<10>(Imm);
+  assert(Success); (void)Success;
+  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+  O << formatImm(Imm/128);
+}
+
+void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  int64_t Imm;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  Imm = SignExtend64<10>(Imm);
+  assert(Success); (void)Success;
+  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
+  O << formatImm(Imm/64);
+}
+
+void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  int64_t Imm;
+  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
+  Imm = SignExtend64<11>(Imm);
+  assert(Success); (void)Success;
+  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
+  O << formatImm(Imm/128);
+}
+
+void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo,
+                                        raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print $+8, an eight byte displacement from the PC.
+  llvm_unreachable("Unknown branch operand.");
+}
+
+void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo,
+                                          raw_ostream &O) const {}
+
+void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+                                             raw_ostream &O) const {}
+
+void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo,
+                                               raw_ostream &O) const {}
+
+void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo,
+                                     raw_ostream &O, bool hi) const {
+  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
+
+  O << '#' << (hi ? "HI" : "LO") << '(';
+  O << '#';
+  printOperand(MI, OpNo, O);
+  O << ')';
+}
+
+void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo,
+                                       raw_ostream &O) const {
+  MCOperand const &MO = MI->getOperand(OpNo);
+  assert (MO.isExpr());
+  MCExpr const &Expr = *MO.getExpr();
+  int64_t Value;
+  if (Expr.evaluateAsAbsolute(Value))
+    O << format("0x%" PRIx64, Value);
+  else {
+    if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))
+      if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo)
+        O << "##";
+    O << Expr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
new file mode 100644
index 000000000000..5f421184b20a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -0,0 +1,92 @@
+//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+/// Prints bundles as a newline separated list of individual instructions
+/// Duplexes are separated by a vertical tab \v character
+/// A trailing line includes bundle properties such as endloop0/1
+///
+/// r0 = add(r1, r2)
+/// r0 = #0 \v jump 0x0
+/// :endloop0 :endloop1
+class HexagonInstPrinter : public MCInstPrinter {
+public:
+  explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII,
+                              MCRegisterInfo const &MRI);
+  void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  virtual StringRef getOpcodeName(unsigned Opcode) const;
+  void printInstruction(MCInst const *MI, raw_ostream &O);
+
+  StringRef getRegName(unsigned RegNo) const;
+  static char const *getRegisterName(unsigned RegNo);
+  void printRegName(raw_ostream &O, unsigned RegNo) const override;
+
+  void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+  void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+  void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo,
+                               raw_ostream &O) const;
+  void printNegImmOperand(MCInst const *MI, unsigned OpNo,
+                          raw_ostream &O) const;
+  void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void printBranchOperand(MCInst const *MI, unsigned OpNo,
+                          raw_ostream &O) const;
+  void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+  void printAbsAddrOperand(MCInst const *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+  void printPredicateOperand(MCInst const *MI, unsigned OpNo,
+                             raw_ostream &O) const;
+  void printGlobalOperand(MCInst const *MI, unsigned OpNo,
+                          raw_ostream &O) const;
+  void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+  void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+
+  void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
+
+  void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+    printSymbol(MI, OpNo, O, true);
+  }
+  void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const {
+    printSymbol(MI, OpNo, O, false);
+  }
+
+  MCAsmInfo const &getMAI() const { return MAI; }
+  MCInstrInfo const &getMII() const { return MII; }
+
+protected:
+  void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O,
+                   bool hi) const;
+
+private:
+  MCInstrInfo const &MII;
+
+  bool HasExtender;
+  void setExtender(MCInst const &MCI);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
new file mode 100644
index 000000000000..c619c36164cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -0,0 +1,37 @@
+//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCAsmInfo.h"
+
+using namespace llvm;
+
+// Pin the vtable to this file.
+void HexagonMCAsmInfo::anchor() {}
+
+HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = nullptr;  // .xword is only supported by V9.
+  CommentString = "//";
+
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+  InlineAsmStart = "# InlineAsm Start";
+  InlineAsmEnd = "# InlineAsm End";
+  ZeroDirective = "\t.space\t";
+  AscizDirective = "\t.string\t";
+
+  SupportsDebugInformation = true;
+  MinInstAlignment = 4;
+  UsesELFSectionDirectiveForBSS  = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
new file mode 100644
index 000000000000..efeff2436234
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the HexagonMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class HexagonMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit HexagonMCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
new file mode 100644
index 000000000000..07c9ad96a0d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -0,0 +1,585 @@
+//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCChecker.h"
+
+#include "HexagonBaseInfo.h"
+
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
+  cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+
+const HexagonMCChecker::PredSense
+  HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+
+void HexagonMCChecker::init() {
+  // Initialize read-only registers set.
+  ReadOnly.insert(Hexagon::PC);
+
+  // Figure out the loop-registers definitions.
+  if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
+    Defs[Hexagon::SA0].insert(Unconditional); // FIXME: define or change SA0?
+    Defs[Hexagon::LC0].insert(Unconditional);
+  }
+  if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
+    Defs[Hexagon::SA1].insert(Unconditional); // FIXME: define or change SA0?
+    Defs[Hexagon::LC1].insert(Unconditional);
+  }
+
+  if (HexagonMCInstrInfo::isBundle(MCB))
+    // Unfurl a bundle.
+    for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      init(*I.getInst());
+    }
+  else
+    init(MCB);
+}
+
+void HexagonMCChecker::init(MCInst const& MCI) {
+  const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+  unsigned PredReg = Hexagon::NoRegister;
+  bool isTrue = false;
+
+  // Get used registers.
+  for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+    if (MCI.getOperand(i).isReg()) {
+      unsigned R = MCI.getOperand(i).getReg();
+
+      if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+        // Note an used predicate register.
+        PredReg = R;
+        isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+        // Note use of new predicate register.
+        if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          NewPreds.insert(PredReg);
+      }
+      else
+        // Note register use.  Super-registers are not tracked directly,
+        // but their components.
+        for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+           SRI.isValid();
+           ++SRI)
+         if (!MCSubRegIterator(*SRI, &RI).isValid())
+           // Skip super-registers used indirectly.
+           Uses.insert(*SRI);
+    }
+
+  // Get implicit register definitions.
+  if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
+    for (; *ImpDef; ++ImpDef) {
+      unsigned R = *ImpDef;
+
+      if (Hexagon::R31 != R && MCID.isCall())
+        // Any register other than the LR and the PC are actually volatile ones
+        // as defined by the ABI, not modified implicitly by the call insn.
+        continue;
+      if (Hexagon::PC == R)
+        // Branches are the only insns that can change the PC,
+        // otherwise a read-only register.
+        continue;
+
+      if (Hexagon::USR_OVF == R)
+        // Many insns change the USR implicitly, but only one or another flag.
+        // The instruction table models the USR.OVF flag, which can be implicitly
+        // modified more than once, but cannot be modified in the same packet
+        // with an instruction that modifies is explicitly. Deal with such situ-
+        // ations individually.
+        SoftDefs.insert(R);
+      else if (isPredicateRegister(R) &&
+               HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
+        // Include implicit late predicates.
+        LatePreds.insert(R);
+      else
+        Defs[R].insert(PredSense(PredReg, isTrue));
+    }
+
+  // Figure out explicit register definitions.
+  for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
+    unsigned R = MCI.getOperand(i).getReg(),
+             S = Hexagon::NoRegister;
+    // USR has subregisters (while C8 does not for technical reasons), so
+    // reset R to USR, since we know how to handle multiple defs of USR,
+    // taking into account its subregisters.
+    if (R == Hexagon::C8)
+      R = Hexagon::USR;
+
+    // Note register definitions, direct ones as well as indirect side-effects.
+    // Super-registers are not tracked directly, but their components.
+    for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+        SRI.isValid();
+        ++SRI) {
+      if (MCSubRegIterator(*SRI, &RI).isValid())
+        // Skip super-registers defined indirectly.
+        continue;
+
+      if (R == *SRI) {
+        if (S == R)
+          // Avoid scoring the defined register multiple times.
+          continue;
+        else
+          // Note that the defined register has already been scored.
+          S = R;
+      }
+
+      if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI)
+        // P3:0 is a special case, since multiple predicate register definitions
+        // in a packet is allowed as the equivalent of their logical "and".
+        // Only an explicit definition of P3:0 is noted as such; if a
+        // side-effect, then note as a soft definition.
+        SoftDefs.insert(*SRI);
+      else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && isPredicateRegister(*SRI))
+        // Some insns produce predicates too late to be used in the same packet.
+        LatePreds.insert(*SRI);
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_CUR_LD)
+        // Current loads should be used in the same packet.
+        // TODO: relies on the impossibility of a current and a temporary loads
+        // in the same packet.
+        CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
+      else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_TMP_LD)
+        // Temporary loads should be used in the same packet, but don't commit
+        // results, so it should be disregarded if another insn changes the same
+        // register.
+        // TODO: relies on the impossibility of a current and a temporary loads
+        // in the same packet.
+        TmpDefs.insert(*SRI);
+      else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI) )
+        // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
+        // destination registers with this instruction. same for vdeal(Vx,Vy,Rx)
+        Uses.insert(*SRI);
+      else
+        Defs[*SRI].insert(PredSense(PredReg, isTrue));
+    }
+  }
+
+  // Figure out register definitions that produce new values.
+  if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) {
+    unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (HexagonMCInstrInfo::isCompound(MCII, MCI))
+      compoundRegisterMap(R); // Compound insns have a limited register range.
+
+    for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+        SRI.isValid();
+        ++SRI)
+      if (!MCSubRegIterator(*SRI, &RI).isValid())
+        // No super-registers defined indirectly.
+        NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+                                              HexagonMCInstrInfo::isFloat(MCII, MCI)));
+
+    // For fairly unique 2-dot-new producers, example:
+    // vdeal(V1, V9, R0) V1.new and V9.new can be used by consumers.
+    if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
+      unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
+
+      for(MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
+          SRI.isValid();
+          ++SRI)
+        if (!MCSubRegIterator(*SRI, &RI).isValid())
+          NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+                                                HexagonMCInstrInfo::isFloat(MCII, MCI)));
+    }
+  }
+
+  // Figure out definitions of new predicate registers.
+  if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+    for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
+      if (MCI.getOperand(i).isReg()) {
+        unsigned P = MCI.getOperand(i).getReg();
+
+        if (isPredicateRegister(P))
+          NewPreds.insert(P);
+      }
+
+  // Figure out uses of new values.
+  if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) {
+    unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg();
+
+    if (!MCSubRegIterator(N, &RI).isValid()) {
+      // Super-registers cannot use new values.
+      if (MCID.isBranch())
+        NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+      else
+        NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+    }
+  }
+}
+
+HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCInst &mcbdx,
+                                   MCRegisterInfo const &ri)
+    : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
+      bLoadErrInfo(false) {
+  init();
+}
+
+bool HexagonMCChecker::check() {
+  bool chkB = checkBranches();
+  bool chkP = checkPredicates();
+  bool chkNV = checkNewValues();
+  bool chkR = checkRegisters();
+  bool chkS = checkSolo();
+  bool chkSh = checkShuffle();
+  bool chkSl = checkSlots();
+  bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
+
+  return chk;
+}
+
+bool HexagonMCChecker::checkSlots()
+
+{
+  unsigned slotsUsed = 0;
+  for (auto HMI: HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
+    MCInst const& MCI = *HMI.getInst();
+    if (HexagonMCInstrInfo::isImmext(MCI))
+      continue;
+    if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
+      slotsUsed += 2;
+    else
+      ++slotsUsed;
+  }
+
+  if (slotsUsed > HEXAGON_PACKET_SIZE) {
+    HexagonMCErrInfo errInfo;
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
+
+// Check legal use of branches.
+bool HexagonMCChecker::checkBranches() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    bool hasConditional = false;
+    unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
+             NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+             Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
+
+    for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
+         i < MCB.size(); ++i) {
+      MCInst const &MCI = *MCB.begin()[i].getInst();
+
+      if (HexagonMCInstrInfo::isImmext(MCI))
+        continue;
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
+          HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
+        ++Branches;
+        if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+          ++NewIndirectBranches;
+        if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
+          ++NewValueBranches;
+
+        if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
+            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
+          hasConditional = true;
+          Conditional = i; // Record the position of the conditional branch.
+        } else {
+          Unconditional = i; // Record the position of the unconditional branch.
+        }
+      }
+      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
+          HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
+        ++Returns;
+    }
+
+    if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
+      if (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+          HexagonMCInstrInfo::isOuterLoop(MCB)) {
+        // Error out if there's any branch in a loop-end packet.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
+        addErrInfo(errInfo);
+        return false;
+      }
+    if (Branches > 1)
+      if (!hasConditional || Conditional > Unconditional) {
+        // Error out if more than one unconditional branch or
+        // the conditional branch appears after the unconditional one.
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
+        addErrInfo(errInfo);
+        return false;
+      }
+  }
+
+  return true;
+}
+
+// Check legal use of predicate registers.
+bool HexagonMCChecker::checkPredicates() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper use of new predicate registers.
+  for (const auto& I : NewPreds) {
+    unsigned P = I;
+
+    if (!Defs.count(P) || LatePreds.count(P)) {
+      // Error out if the new predicate register is not defined,
+      // or defined "late"
+      // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  // Check for proper use of auto-anded of predicate registers.
+  for (const auto& I : LatePreds) {
+    unsigned P = I;
+
+    if (LatePreds.count(P) > 1 || Defs.count(P)) {
+      // Error out if predicate register defined "late" multiple times or
+      // defined late and regularly defined
+      // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }".
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check legal use of new values.
+bool HexagonMCChecker::checkNewValues() {
+  HexagonMCErrInfo errInfo;
+  memset(&errInfo, 0, sizeof(errInfo));
+  for (auto& I : NewUses) {
+    unsigned R = I.first;
+    NewSense &US = I.second;
+
+    if (!hasValidNewValueDef(US, NewDefs[R])) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Check for legal register uses and definitions.
+bool HexagonMCChecker::checkRegisters() {
+  HexagonMCErrInfo errInfo;
+  // Check for proper register definitions.
+  for (const auto& I : Defs) {
+    unsigned R = I.first;
+
+    if (ReadOnly.count(R)) {
+      // Error out for definitions of read-only registers.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (isLoopRegister(R) && Defs.count(R) > 1 &&
+        (HexagonMCInstrInfo::isInnerLoop(MCB) ||
+         HexagonMCInstrInfo::isOuterLoop(MCB))) {
+      // Error out for definitions of loop registers at the end of a loop.
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (SoftDefs.count(R)) {
+      // Error out for explicit changes to registers also weakly defined
+      // (e.g., "{ usr = r0; r0 = sfadd(...) }").
+      unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
+      unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+      addErrInfo(errInfo);
+      return false;
+    }
+    if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+      // Check for multiple register definitions.
+      PredSet &PM = Defs[R];
+
+      // Check for multiple unconditional register definitions.
+      if (PM.count(Unconditional)) {
+        // Error out on an unconditional change when there are any other
+        // changes, conditional or not.
+        unsigned UsrR = Hexagon::USR;
+        unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
+        addErrInfo(errInfo);
+        return false;
+      }
+      // Check for multiple conditional register definitions.
+      for (const auto& J : PM) {
+        PredSense P = J;
+
+        // Check for multiple uses of the same condition.
+        if (PM.count(P) > 1) {
+          // Error out on conditional changes based on the same predicate
+          // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+        // Check for the use of the complementary condition.
+        P.second = !P.second;
+        if (PM.count(P) && PM.size() > 2) {
+          // Error out on conditional changes based on the same predicate
+          // multiple times
+          // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =... }; if (!p0) r0 =... }").
+          errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
+          addErrInfo(errInfo);
+          return false;
+        }
+      }
+    }
+  }
+
+  // Check for use of current definitions.
+  for (const auto& I : CurDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // Warn on an unused current definition.
+      errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
+      addErrInfo(errInfo);
+      return true;
+    }
+  }
+
+  // Check for use of temporary definitions.
+  for (const auto& I : TmpDefs) {
+    unsigned R = I;
+
+    if (!Uses.count(R)) {
+      // special case for vhist
+      bool vHistFound = false;
+      for (auto const&HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+        if(llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) == HexagonII::TypeCVI_HIST) {
+          vHistFound = true;  // vhist() implicitly uses ALL REGxx.tmp
+          break;
+        }
+      }
+      // Warn on an unused temporary definition.
+      if (vHistFound == false) {
+        errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
+        addErrInfo(errInfo);
+        return true;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Check for legal use of solo insns.
+bool HexagonMCChecker::checkSolo() {
+  HexagonMCErrInfo errInfo;
+  if (HexagonMCInstrInfo::isBundle(MCB) &&
+      HexagonMCInstrInfo::bundleSize(MCB) > 1) {
+    for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) {
+        errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
+        addErrInfo(errInfo);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool HexagonMCChecker::checkShuffle() {
+  HexagonMCErrInfo errInfo;
+  // Branch info is lost when duplexing. The unduplexed insns must be
+  // checked and only branch errors matter for this case.
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+  if (!MCS.check()) {
+    if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
+      errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+      errInfo.setShuffleError(MCS.getError());
+      addErrInfo(errInfo);
+      return false;
+    }
+  }
+  HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+  if (!MCSDX.check()) {
+    errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
+    errInfo.setShuffleError(MCSDX.getError());
+    addErrInfo(errInfo);
+    return false;
+  }
+  return true;
+}
+
+void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+  switch (Register) {
+  default:
+    break;
+  case Hexagon::R15:
+    Register = Hexagon::R23;
+    break;
+  case Hexagon::R14:
+    Register = Hexagon::R22;
+    break;
+  case Hexagon::R13:
+    Register = Hexagon::R21;
+    break;
+  case Hexagon::R12:
+    Register = Hexagon::R20;
+    break;
+  case Hexagon::R11:
+    Register = Hexagon::R19;
+    break;
+  case Hexagon::R10:
+    Register = Hexagon::R18;
+    break;
+  case Hexagon::R9:
+    Register = Hexagon::R17;
+    break;
+  case Hexagon::R8:
+    Register = Hexagon::R16;
+    break;
+  }
+}
+
+bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
+      const NewSenseList &Defs) const {
+  bool Strict = !RelaxNVChecks;
+
+  for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
+    const NewSense &Def = Defs[i];
+    // NVJ cannot use a new FP value [7.6.1]
+    if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0))
+      continue;
+    // If the definition was not predicated, then it does not matter if
+    // the use is.
+    if (Def.PredReg == 0)
+      return true;
+    // With the strict checks, both the definition and the use must be
+    // predicated on the same register and condition.
+    if (Strict) {
+      if (Def.PredReg == Use.PredReg && Def.Cond == Use.Cond)
+        return true;
+    } else {
+      // With the relaxed checks, if the definition was predicated, the only
+      // detectable violation is if the use is predicated on the opposing
+      // condition, otherwise, it's ok.
+      if (Def.PredReg != Use.PredReg || Def.Cond == Use.Cond)
+        return true;
+    }
+  }
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
new file mode 100644
index 000000000000..33e22798c954
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -0,0 +1,217 @@
+//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the checking of insns inside a bundle according to the
+// packet constraint rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCCHECKER_H
+#define HEXAGONMCCHECKER_H
+
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+
+namespace llvm {
+class MCOperandInfo;
+
+typedef struct {
+  unsigned Error, Warning, ShuffleError;
+  unsigned Register;
+} ErrInfo_T;
+
+class HexagonMCErrInfo {
+public:
+  enum {
+    CHECK_SUCCESS         = 0,
+    // Errors.
+    CHECK_ERROR_BRANCHES  = 0x00001,
+    CHECK_ERROR_NEWP      = 0x00002,
+    CHECK_ERROR_NEWV      = 0x00004,
+    CHECK_ERROR_REGISTERS = 0x00008,
+    CHECK_ERROR_READONLY  = 0x00010,
+    CHECK_ERROR_LOOP      = 0x00020,
+    CHECK_ERROR_ENDLOOP   = 0x00040,
+    CHECK_ERROR_SOLO      = 0x00080,
+    CHECK_ERROR_SHUFFLE   = 0x00100,
+    CHECK_ERROR_NOSLOTS   = 0x00200,
+    CHECK_ERROR_UNKNOWN   = 0x00400,
+    // Warnings.
+    CHECK_WARN_CURRENT    = 0x10000,
+    CHECK_WARN_TEMPORARY  = 0x20000
+  };
+  ErrInfo_T s;
+
+  void reset() {
+    s.Error = CHECK_SUCCESS;
+    s.Warning = CHECK_SUCCESS;
+    s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS;
+    s.Register = Hexagon::NoRegister;
+  };
+  HexagonMCErrInfo() {
+    reset();
+  };
+
+  void setError(unsigned e, unsigned r = Hexagon::NoRegister)
+    { s.Error = e; s.Register = r; };
+  void setWarning(unsigned w, unsigned r = Hexagon::NoRegister)
+    { s.Warning = w; s.Register = r; };
+  void setShuffleError(unsigned e) { s.ShuffleError = e; };
+};
+
+/// Check for a valid bundle.
+class HexagonMCChecker {
+  /// Insn bundle.
+  MCInst& MCB;
+  MCInst& MCBDX;
+  const MCRegisterInfo& RI;
+  MCInstrInfo const &MCII;
+  MCSubtargetInfo const &STI;
+  bool bLoadErrInfo;
+
+  /// Set of definitions: register #, if predicated, if predicated true.
+  typedef std::pair<unsigned, bool> PredSense;
+  static const PredSense Unconditional;
+  typedef std::multiset<PredSense> PredSet;
+  typedef std::multiset<PredSense>::iterator PredSetIterator;
+
+  typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator;
+  llvm::DenseMap<unsigned, PredSet> Defs;
+
+  /// Information about how a new-value register is defined or used:
+  ///   PredReg = predicate register, 0 if use/def not predicated,
+  ///   Cond    = true/false for if(PredReg)/if(!PredReg) respectively,
+  ///   IsFloat = true if definition produces a floating point value
+  ///             (not valid for uses),
+  ///   IsNVJ   = true if the use is a new-value branch (not valid for
+  ///             definitions).
+  struct NewSense {
+    unsigned PredReg;
+    bool IsFloat, IsNVJ, Cond;
+    // The special-case "constructors":
+    static NewSense Jmp(bool isNVJ) {
+      NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ,
+                      /*Cond=*/ false };
+      return NS;
+    }
+    static NewSense Use(unsigned PR, bool True) {
+      NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false,
+                      /*Cond=*/ True };
+      return NS;
+    }
+    static NewSense Def(unsigned PR, bool True, bool Float) {
+      NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false,
+                      /*Cond=*/ True };
+      return NS;
+    }
+  };
+  /// Set of definitions that produce new register:
+  typedef llvm::SmallVector<NewSense,2> NewSenseList;
+  typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator;
+  llvm::DenseMap<unsigned, NewSenseList> NewDefs;
+
+  /// Set of weak definitions whose clashes should be enforced selectively.
+  typedef std::set<unsigned>::iterator SoftDefsIterator;
+  std::set<unsigned> SoftDefs;
+
+  /// Set of current definitions committed to the register file.
+  typedef std::set<unsigned>::iterator CurDefsIterator;
+  std::set<unsigned> CurDefs;
+
+  /// Set of temporary definitions not committed to the register file.
+  typedef std::set<unsigned>::iterator TmpDefsIterator;
+  std::set<unsigned> TmpDefs;
+
+  /// Set of new predicates used.
+  typedef std::set<unsigned>::iterator NewPredsIterator;
+  std::set<unsigned> NewPreds;
+
+  /// Set of predicates defined late.
+  typedef std::multiset<unsigned>::iterator LatePredsIterator;
+  std::multiset<unsigned> LatePreds;
+
+  /// Set of uses.
+  typedef std::set<unsigned>::iterator UsesIterator;
+  std::set<unsigned> Uses;
+
+  /// Set of new values used: new register, if new-value jump.
+  typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator;
+  llvm::DenseMap<unsigned, NewSense> NewUses;
+
+  /// Pre-defined set of read-only registers.
+  typedef std::set<unsigned>::iterator ReadOnlyIterator;
+  std::set<unsigned> ReadOnly;
+
+  std::queue<ErrInfo_T> ErrInfoQ;
+  HexagonMCErrInfo CrntErrInfo;
+
+  void getErrInfo() {
+    if (bLoadErrInfo == true) {
+      if (ErrInfoQ.empty()) {
+        CrntErrInfo.reset();
+      } else {
+        CrntErrInfo.s = ErrInfoQ.front();
+        ErrInfoQ.pop();
+      }
+    }
+    bLoadErrInfo = false;
+  }
+
+  void init();
+  void init(MCInst const&);
+
+  // Checks performed.
+  bool checkBranches();
+  bool checkPredicates();
+  bool checkNewValues();
+  bool checkRegisters();
+  bool checkSolo();
+  bool checkShuffle();
+  bool checkSlots();
+
+  static void compoundRegisterMap(unsigned&);
+
+  bool isPredicateRegister(unsigned R) const {
+    return (Hexagon::P0 == R || Hexagon::P1 == R ||
+            Hexagon::P2 == R || Hexagon::P3 == R);
+  };
+  bool isLoopRegister(unsigned R) const {
+    return (Hexagon::SA0 == R || Hexagon::LC0 == R ||
+            Hexagon::SA1 == R || Hexagon::LC1 == R);
+  };
+
+  bool hasValidNewValueDef(const NewSense &Use,
+                           const NewSenseList &Defs) const;
+
+  public:
+  explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
+                            const MCRegisterInfo& ri);
+
+  bool check();
+
+  /// add a new error/warning
+  void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
+
+  /// Return the error code for the last operation in the insn bundle.
+  unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; };
+  unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; };
+  unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; };
+  unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; };
+  bool getNextErrInfo() {
+    bLoadErrInfo = true;
+    return (ErrInfoQ.empty()) ? false : (getErrInfo(), true);
+  }
+};
+
+}
+
+#endif // HEXAGONMCCHECKER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
new file mode 100644
index 000000000000..2645a17b9bd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -0,0 +1,824 @@
+//===-- HexagonMCCodeEmitter.cpp - Hexagon Target Descriptions ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonFixupKinds.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+using namespace llvm;
+using namespace Hexagon;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
+                                           MCContext &aMCT)
+    : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
+      Extended(new bool(false)), CurrentBundle(new MCInst const *) {}
+
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+                                         MCInst const &MCB,
+                                         MCInst const &MCI) const {
+  bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
+  if (Instruction == 0) {
+    if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
+      assert(!Duplex);
+      assert(Instruction != Last);
+      return HexagonII::INST_PARSE_LOOP_END;
+    }
+  }
+  if (Instruction == 1) {
+    if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
+      assert(!Duplex);
+      assert(Instruction != Last);
+      return HexagonII::INST_PARSE_LOOP_END;
+    }
+  }
+  if (Duplex) {
+    assert(Instruction == Last);
+    return HexagonII::INST_PARSE_DUPLEX;
+  }
+  if(Instruction == Last)
+    return HexagonII::INST_PARSE_PACKET_END;
+  return HexagonII::INST_PARSE_NOT_END;
+}
+
+void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             MCSubtargetInfo const &STI) const {
+  MCInst &HMB = const_cast<MCInst &>(MI);
+
+  assert(HexagonMCInstrInfo::isBundle(HMB));
+  DEBUG(dbgs() << "Encoding bundle\n";);
+  *Addend = 0;
+  *Extended = false;
+  *CurrentBundle = &MI;
+  size_t Instruction = 0;
+  size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
+  for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
+    MCInst &HMI = const_cast<MCInst &>(*I.getInst());
+    verifyInstructionPredicates(HMI,
+                                computeAvailableFeatures(STI.getFeatureBits()));
+
+    EncodeSingleInstruction(HMI, OS, Fixups, STI,
+                            parseBits(Instruction, Last, HMB, HMI),
+                            Instruction);
+    *Extended = HexagonMCInstrInfo::isImmext(HMI);
+    *Addend += HEXAGON_INSTR_SIZE;
+    ++Instruction;
+  }
+  return;
+}
+
+static bool RegisterMatches(unsigned Consumer, unsigned Producer,
+                            unsigned Producer2) {
+  if (Consumer == Producer)
+    return true;
+  if (Consumer == Producer2)
+    return true;
+  // Calculate if we're a single vector consumer referencing a double producer
+  if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+    if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
+      return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0);
+  return false;
+}
+
+/// EncodeSingleInstruction - Emit a single
+void HexagonMCCodeEmitter::EncodeSingleInstruction(
+    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
+  MCInst HMB = MI;
+  assert(!HexagonMCInstrInfo::isBundle(HMB));
+  uint64_t Binary;
+
+  // Compound instructions are limited to using registers 0-7 and 16-23
+  // and here we make a map 16-23 to 8-15 so they can be correctly encoded.
+  static unsigned RegMap[8] = {Hexagon::R8,  Hexagon::R9,  Hexagon::R10,
+                               Hexagon::R11, Hexagon::R12, Hexagon::R13,
+                               Hexagon::R14, Hexagon::R15};
+
+  // Pseudo instructions don't get encoded and shouldn't be here
+  // in the first place!
+  assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+         "pseudo-instruction found");
+  DEBUG(dbgs() << "Encoding insn"
+                  " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                                                                    "\n");
+
+  if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
+    for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
+      if (HMB.getOperand(i).isReg()) {
+        unsigned Reg =
+            MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
+        if ((Reg <= 23) && (Reg >= 16))
+          HMB.getOperand(i).setReg(RegMap[Reg - 16]);
+      }
+  }
+
+  if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
+    // Calculate the new value distance to the associated producer
+    MCOperand &MCO =
+        HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
+    unsigned SOffset = 0;
+    unsigned VOffset = 0;
+    unsigned Register = MCO.getReg();
+    unsigned Register1;
+    unsigned Register2;
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto i = Instructions.begin() + Index - 1;
+    for (;; --i) {
+      assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+      MCInst const &Inst = *i->getInst();
+      if (HexagonMCInstrInfo::isImmext(Inst))
+        continue;
+      ++SOffset;
+      if (HexagonMCInstrInfo::isVector(MCII, Inst))
+        // Vector instructions don't count scalars
+        ++VOffset;
+      Register1 =
+          HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+              ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+              : static_cast<unsigned>(Hexagon::NoRegister);
+      Register2 =
+          HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+              ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+              : static_cast<unsigned>(Hexagon::NoRegister);
+      if (!RegisterMatches(Register, Register1, Register2))
+        // This isn't the register we're looking for
+        continue;
+      if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+        // Producer is unpredicated
+        break;
+      assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
+             "Unpredicated consumer depending on predicated producer");
+      if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+          HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
+        // Producer predicate sense matched ours
+        break;
+    }
+    // Hexagon PRM 10.11 Construct Nt from distance
+    unsigned Offset =
+        HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
+    Offset <<= 1;
+    Offset |=
+        HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+    MCO.setReg(Offset + Hexagon::R0);
+  }
+
+  Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+  // Check for unimplemented instructions. Immediate extenders
+  // are encoded as zero, so they need to be accounted for.
+  if ((!Binary) &&
+      ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
+       (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
+       (HMB.getOpcode() != A4_ext_g))) {
+    DEBUG(dbgs() << "Unimplemented inst: "
+                    " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                                                                      "\n");
+    llvm_unreachable("Unimplemented Instruction");
+  }
+  Binary |= Parse;
+
+  // if we need to emit a duplexed instruction
+  if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
+      HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+    assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
+           "Emitting duplex without duplex parse bits");
+    unsigned dupIClass;
+    switch (HMB.getOpcode()) {
+    case Hexagon::DuplexIClass0:
+      dupIClass = 0;
+      break;
+    case Hexagon::DuplexIClass1:
+      dupIClass = 1;
+      break;
+    case Hexagon::DuplexIClass2:
+      dupIClass = 2;
+      break;
+    case Hexagon::DuplexIClass3:
+      dupIClass = 3;
+      break;
+    case Hexagon::DuplexIClass4:
+      dupIClass = 4;
+      break;
+    case Hexagon::DuplexIClass5:
+      dupIClass = 5;
+      break;
+    case Hexagon::DuplexIClass6:
+      dupIClass = 6;
+      break;
+    case Hexagon::DuplexIClass7:
+      dupIClass = 7;
+      break;
+    case Hexagon::DuplexIClass8:
+      dupIClass = 8;
+      break;
+    case Hexagon::DuplexIClass9:
+      dupIClass = 9;
+      break;
+    case Hexagon::DuplexIClassA:
+      dupIClass = 10;
+      break;
+    case Hexagon::DuplexIClassB:
+      dupIClass = 11;
+      break;
+    case Hexagon::DuplexIClassC:
+      dupIClass = 12;
+      break;
+    case Hexagon::DuplexIClassD:
+      dupIClass = 13;
+      break;
+    case Hexagon::DuplexIClassE:
+      dupIClass = 14;
+      break;
+    case Hexagon::DuplexIClassF:
+      dupIClass = 15;
+      break;
+    default:
+      llvm_unreachable("Unimplemented DuplexIClass");
+      break;
+    }
+    // 29 is the bit position.
+    // 0b1110 =0xE bits are masked off and down shifted by 1 bit.
+    // Last bit is moved to bit position 13
+    Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
+
+    const MCInst *subInst0 = HMB.getOperand(0).getInst();
+    const MCInst *subInst1 = HMB.getOperand(1).getInst();
+
+    // get subinstruction slot 0
+    unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
+    // get subinstruction slot 1
+    unsigned subInstSlot1Bits = getBinaryCodeForInstr(*subInst1, Fixups, STI);
+
+    Binary |= subInstSlot0Bits | (subInstSlot1Bits << 16);
+  }
+  support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
+  ++MCNumEmitted;
+}
+
+namespace {
+void raise_relocation_error(unsigned bits, unsigned kind) {
+  std::string Text;
+  {
+    llvm::raw_string_ostream Stream(Text);
+    Stream << "Unrecognized relocation combination bits: " << bits
+           << " kind: " << kind;
+  }
+  report_fatal_error(Text);
+}
+}
+
+/// getFixupNoBits - Some insns are not extended and thus have no
+/// bits.  These cases require a more brute force method for determining
+/// the correct relocation.
+namespace {
+Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+                                      const MCOperand &MO,
+                                      const MCSymbolRefExpr::VariantKind kind) {
+  const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
+  unsigned insnType = llvm::HexagonMCInstrInfo::getType(MCII, MI);
+
+  if (insnType == HexagonII::TypePREFIX) {
+    switch (kind) {
+    case MCSymbolRefExpr::VK_GOTREL:
+      return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
+    case MCSymbolRefExpr::VK_GOT:
+      return Hexagon::fixup_Hexagon_GOT_32_6_X;
+    case MCSymbolRefExpr::VK_TPREL:
+      return Hexagon::fixup_Hexagon_TPREL_32_6_X;
+    case MCSymbolRefExpr::VK_DTPREL:
+      return Hexagon::fixup_Hexagon_DTPREL_32_6_X;
+    case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+      return Hexagon::fixup_Hexagon_GD_GOT_32_6_X;
+    case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+      return Hexagon::fixup_Hexagon_LD_GOT_32_6_X;
+    case MCSymbolRefExpr::VK_Hexagon_IE:
+      return Hexagon::fixup_Hexagon_IE_32_6_X;
+    case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+      return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
+    case MCSymbolRefExpr::VK_Hexagon_PCREL:
+    case MCSymbolRefExpr::VK_None:
+      if (MCID.isBranch())
+        return Hexagon::fixup_Hexagon_B32_PCREL_X;
+      else
+        return Hexagon::fixup_Hexagon_32_6_X;
+    default:
+      raise_relocation_error(0, kind);
+    }
+  } else if (MCID.isBranch())
+    return Hexagon::fixup_Hexagon_B13_PCREL;
+
+  switch (MCID.getOpcode()) {
+  case Hexagon::HI:
+  case Hexagon::A2_tfrih:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_GOT:
+      return Hexagon::fixup_Hexagon_GOT_HI16;
+    case MCSymbolRefExpr::VK_GOTREL:
+      return Hexagon::fixup_Hexagon_GOTREL_HI16;
+    case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+      return Hexagon::fixup_Hexagon_GD_GOT_HI16;
+    case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+      return Hexagon::fixup_Hexagon_LD_GOT_HI16;
+    case MCSymbolRefExpr::VK_Hexagon_IE:
+      return Hexagon::fixup_Hexagon_IE_HI16;
+    case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+      return Hexagon::fixup_Hexagon_IE_GOT_HI16;
+    case MCSymbolRefExpr::VK_TPREL:
+      return Hexagon::fixup_Hexagon_TPREL_HI16;
+    case MCSymbolRefExpr::VK_DTPREL:
+      return Hexagon::fixup_Hexagon_DTPREL_HI16;
+    case MCSymbolRefExpr::VK_None:
+      return Hexagon::fixup_Hexagon_HI16;
+    default:
+      raise_relocation_error(0, kind);
+    }
+
+  case Hexagon::LO:
+  case Hexagon::A2_tfril:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_GOT:
+      return Hexagon::fixup_Hexagon_GOT_LO16;
+    case MCSymbolRefExpr::VK_GOTREL:
+      return Hexagon::fixup_Hexagon_GOTREL_LO16;
+    case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+      return Hexagon::fixup_Hexagon_GD_GOT_LO16;
+    case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+      return Hexagon::fixup_Hexagon_LD_GOT_LO16;
+    case MCSymbolRefExpr::VK_Hexagon_IE:
+      return Hexagon::fixup_Hexagon_IE_LO16;
+    case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+      return Hexagon::fixup_Hexagon_IE_GOT_LO16;
+    case MCSymbolRefExpr::VK_TPREL:
+      return Hexagon::fixup_Hexagon_TPREL_LO16;
+    case MCSymbolRefExpr::VK_DTPREL:
+      return Hexagon::fixup_Hexagon_DTPREL_LO16;
+    case MCSymbolRefExpr::VK_None:
+      return Hexagon::fixup_Hexagon_LO16;
+    default:
+      raise_relocation_error(0, kind);
+    }
+
+  // The only relocs left should be GP relative:
+  default:
+    if (MCID.mayStore() || MCID.mayLoad()) {
+      for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
+           ++ImpUses) {
+        if (*ImpUses != Hexagon::GP)
+          continue;
+        switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
+        case HexagonII::MemAccessSize::ByteAccess:
+          return fixup_Hexagon_GPREL16_0;
+        case HexagonII::MemAccessSize::HalfWordAccess:
+          return fixup_Hexagon_GPREL16_1;
+        case HexagonII::MemAccessSize::WordAccess:
+          return fixup_Hexagon_GPREL16_2;
+        case HexagonII::MemAccessSize::DoubleWordAccess:
+          return fixup_Hexagon_GPREL16_3;
+        default:
+          raise_relocation_error(0, kind);
+        }
+      }
+    }
+    raise_relocation_error(0, kind);
+  }
+  llvm_unreachable("Relocation exit not taken");
+}
+}
+
+namespace llvm {
+extern const MCInstrDesc HexagonInsts[];
+}
+
+namespace {
+  bool isPCRel (unsigned Kind) {
+    switch(Kind){
+    case fixup_Hexagon_B22_PCREL:
+    case fixup_Hexagon_B15_PCREL:
+    case fixup_Hexagon_B7_PCREL:
+    case fixup_Hexagon_B13_PCREL:
+    case fixup_Hexagon_B9_PCREL:
+    case fixup_Hexagon_B32_PCREL_X:
+    case fixup_Hexagon_B22_PCREL_X:
+    case fixup_Hexagon_B15_PCREL_X:
+    case fixup_Hexagon_B13_PCREL_X:
+    case fixup_Hexagon_B9_PCREL_X:
+    case fixup_Hexagon_B7_PCREL_X:
+    case fixup_Hexagon_32_PCREL:
+    case fixup_Hexagon_PLT_B22_PCREL:
+    case fixup_Hexagon_GD_PLT_B22_PCREL:
+    case fixup_Hexagon_LD_PLT_B22_PCREL:
+    case fixup_Hexagon_6_PCREL_X:
+      return true;
+    default:
+      return false;
+    }
+  }
+}
+
+unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
+                                              const MCOperand &MO,
+                                              const MCExpr *ME,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const
+
+{
+  if (isa<HexagonMCExpr>(ME))
+    ME = &HexagonMCInstrInfo::getExpr(*ME);
+  int64_t Value;
+  if (ME->evaluateAsAbsolute(Value))
+    return Value;
+  assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+  if (ME->getKind() == MCExpr::Binary) {
+    MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
+    getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
+    getExprOpValue(MI, MO, Binary->getRHS(), Fixups, STI);
+    return 0;
+  }
+  Hexagon::Fixups FixupKind =
+      Hexagon::Fixups(Hexagon::fixup_Hexagon_TPREL_LO16);
+  const MCSymbolRefExpr *MCSRE = static_cast<const MCSymbolRefExpr *>(ME);
+  const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
+  unsigned bits = HexagonMCInstrInfo::getExtentBits(MCII, MI) -
+                  HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+  const MCSymbolRefExpr::VariantKind kind = MCSRE->getKind();
+
+  DEBUG(dbgs() << "----------------------------------------\n");
+  DEBUG(dbgs() << "Opcode Name: " << HexagonMCInstrInfo::getName(MCII, MI)
+               << "\n");
+  DEBUG(dbgs() << "Opcode: " << MCID.getOpcode() << "\n");
+  DEBUG(dbgs() << "Relocation bits: " << bits << "\n");
+  DEBUG(dbgs() << "Addend: " << *Addend << "\n");
+  DEBUG(dbgs() << "----------------------------------------\n");
+
+  switch (bits) {
+  default:
+    raise_relocation_error(bits, kind);
+  case 32:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_DTPREL:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_DTPREL_32_6_X
+                            : Hexagon::fixup_Hexagon_DTPREL_32;
+      break;
+    case MCSymbolRefExpr::VK_GOT:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOT_32_6_X
+                            : Hexagon::fixup_Hexagon_GOT_32;
+      break;
+    case MCSymbolRefExpr::VK_GOTREL:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_GOTREL_32_6_X
+                            : Hexagon::fixup_Hexagon_GOTREL_32;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_GOT_32_6_X
+                            : Hexagon::fixup_Hexagon_GD_GOT_32;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_IE:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_32_6_X
+                            : Hexagon::fixup_Hexagon_IE_32;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_IE_GOT_32_6_X
+                            : Hexagon::fixup_Hexagon_IE_GOT_32;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_GOT_32_6_X
+                            : Hexagon::fixup_Hexagon_LD_GOT_32;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_PCREL:
+      FixupKind = Hexagon::fixup_Hexagon_32_PCREL;
+      break;
+    case MCSymbolRefExpr::VK_None:
+      FixupKind =
+          *Extended ? Hexagon::fixup_Hexagon_32_6_X : Hexagon::fixup_Hexagon_32;
+      break;
+    case MCSymbolRefExpr::VK_TPREL:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_TPREL_32_6_X
+                            : Hexagon::fixup_Hexagon_TPREL_32;
+      break;
+    default:
+      raise_relocation_error(bits, kind);
+    }
+    break;
+
+  case 22:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+      FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
+      break;
+    case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+      FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
+      break;
+    case MCSymbolRefExpr::VK_None:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
+                            : Hexagon::fixup_Hexagon_B22_PCREL;
+      break;
+    case MCSymbolRefExpr::VK_PLT:
+      FixupKind = Hexagon::fixup_Hexagon_PLT_B22_PCREL;
+      break;
+    default:
+      raise_relocation_error(bits, kind);
+    }
+    break;
+
+  case 16:
+    if (*Extended) {
+      switch (kind) {
+      case MCSymbolRefExpr::VK_DTPREL:
+        FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
+        break;
+      case MCSymbolRefExpr::VK_GOTREL:
+        FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_IE:
+        FixupKind = Hexagon::fixup_Hexagon_IE_16_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16_X;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        FixupKind = Hexagon::fixup_Hexagon_16_X;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    } else
+      switch (kind) {
+      case MCSymbolRefExpr::VK_None: {
+        if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
+          FixupKind = Hexagon::fixup_Hexagon_23_REG;
+        else
+          raise_relocation_error(bits, kind);
+        break;
+      }
+      case MCSymbolRefExpr::VK_DTPREL:
+        FixupKind = Hexagon::fixup_Hexagon_DTPREL_16;
+        break;
+      case MCSymbolRefExpr::VK_GOTREL:
+        if (MCID.getOpcode() == Hexagon::HI)
+          FixupKind = Hexagon::fixup_Hexagon_GOTREL_HI16;
+        else
+          FixupKind = Hexagon::fixup_Hexagon_GOTREL_LO16;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GD_GOT_16;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_GPREL:
+        FixupKind = Hexagon::fixup_Hexagon_GPREL16_0;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_HI16:
+        FixupKind = Hexagon::fixup_Hexagon_HI16;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_IE_GOT_16;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_LD_GOT_16;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_LO16:
+        FixupKind = Hexagon::fixup_Hexagon_LO16;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        FixupKind = Hexagon::fixup_Hexagon_TPREL_16;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    break;
+
+  case 15:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_None:
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_B15_PCREL_X
+                            : Hexagon::fixup_Hexagon_B15_PCREL;
+      break;
+    default:
+      raise_relocation_error(bits, kind);
+    }
+    break;
+
+  case 13:
+    switch (kind) {
+    case MCSymbolRefExpr::VK_None:
+      FixupKind = Hexagon::fixup_Hexagon_B13_PCREL;
+      break;
+    default:
+      raise_relocation_error(bits, kind);
+    }
+    break;
+
+  case 12:
+    if (*Extended)
+      switch (kind) {
+      // There isn't a GOT_12_X, both 11_X and 16_X resolve to 6/26
+      case MCSymbolRefExpr::VK_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GOT_16_X;
+        break;
+      case MCSymbolRefExpr::VK_GOTREL:
+        FixupKind = Hexagon::fixup_Hexagon_GOTREL_16_X;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        FixupKind = Hexagon::fixup_Hexagon_12_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 11:
+    if (*Extended)
+      switch (kind) {
+      case MCSymbolRefExpr::VK_DTPREL:
+        FixupKind = Hexagon::fixup_Hexagon_DTPREL_11_X;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
+        break;
+      case MCSymbolRefExpr::VK_GOTREL:
+        FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GD_GOT_11_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_IE_GOT_11_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        FixupKind = Hexagon::fixup_Hexagon_11_X;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    else {
+      switch (kind) {
+      case MCSymbolRefExpr::VK_TPREL:
+        FixupKind = Hexagon::fixup_Hexagon_TPREL_11_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    }
+    break;
+
+  case 10:
+    if (*Extended) {
+      switch (kind) {
+      case MCSymbolRefExpr::VK_None:
+        FixupKind = Hexagon::fixup_Hexagon_10_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    } else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 9:
+    if (MCID.isBranch() ||
+        (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_B9_PCREL_X
+                            : Hexagon::fixup_Hexagon_B9_PCREL;
+    else if (*Extended)
+      FixupKind = Hexagon::fixup_Hexagon_9_X;
+    else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 8:
+    if (*Extended)
+      FixupKind = Hexagon::fixup_Hexagon_8_X;
+    else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 7:
+    if (MCID.isBranch() ||
+        (HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCR))
+      FixupKind = *Extended ? Hexagon::fixup_Hexagon_B7_PCREL_X
+                            : Hexagon::fixup_Hexagon_B7_PCREL;
+    else if (*Extended)
+      FixupKind = Hexagon::fixup_Hexagon_7_X;
+    else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 6:
+    if (*Extended) {
+      switch (kind) {
+      case MCSymbolRefExpr::VK_DTPREL:
+        FixupKind = Hexagon::fixup_Hexagon_DTPREL_16_X;
+        break;
+      // This is part of an extender, GOT_11 is a
+      // Word32_U6 unsigned/truncated reloc.
+      case MCSymbolRefExpr::VK_GOT:
+        FixupKind = Hexagon::fixup_Hexagon_GOT_11_X;
+        break;
+      case MCSymbolRefExpr::VK_GOTREL:
+        FixupKind = Hexagon::fixup_Hexagon_GOTREL_11_X;
+        break;
+      case MCSymbolRefExpr::VK_Hexagon_PCREL:
+        FixupKind = Hexagon::fixup_Hexagon_6_PCREL_X;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        FixupKind = Hexagon::fixup_Hexagon_TPREL_16_X;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        FixupKind = Hexagon::fixup_Hexagon_6_X;
+        break;
+      default:
+        raise_relocation_error(bits, kind);
+      }
+    } else
+      raise_relocation_error(bits, kind);
+    break;
+
+  case 0:
+    FixupKind = getFixupNoBits(MCII, MI, MO, kind);
+    break;
+  }
+
+  MCExpr const *FixupExpression =
+      (*Addend > 0 && isPCRel(FixupKind))
+          ? MCBinaryExpr::createAdd(MO.getExpr(),
+                                    MCConstantExpr::create(*Addend, MCT), MCT)
+          : MO.getExpr();
+
+  MCFixup fixup = MCFixup::create(*Addend, FixupExpression,
+                                  MCFixupKind(FixupKind), MI.getLoc());
+  Fixups.push_back(fixup);
+  // All of the information is in the fixup.
+  return 0;
+}
+
+unsigned
+HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        MCSubtargetInfo const &STI) const {
+  assert(!MO.isImm());
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    if (HexagonMCInstrInfo::isSubInstruction(MI))
+      return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
+    switch(MI.getOpcode()){
+    case Hexagon::A2_tfrrcr:
+    case Hexagon::A2_tfrcrr:
+      if(Reg == Hexagon::M0)
+        Reg = Hexagon::C6;
+      if(Reg == Hexagon::M1)
+        Reg = Hexagon::C7;
+    }
+    return MCT.getRegisterInfo()->getEncodingValue(Reg);
+  }
+
+  return getExprOpValue(MI, MO, MO.getExpr(), Fixups, STI);
+}
+
+MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII,
+                                                MCRegisterInfo const &MRI,
+                                                MCContext &MCT) {
+  return new HexagonMCCodeEmitter(MII, MCT);
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "HexagonGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
new file mode 100644
index 000000000000..8e0667d9ac8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -0,0 +1,75 @@
+//===-- HexagonMCCodeEmitter.h - Hexagon Target Descriptions ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Definition for classes that emit Hexagon machine code from MCInsts
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCCODEEMITTER_H
+#define HEXAGONMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class HexagonMCCodeEmitter : public MCCodeEmitter {
+  MCContext &MCT;
+  MCInstrInfo const &MCII;
+  std::unique_ptr<unsigned> Addend;
+  std::unique_ptr<bool> Extended;
+  std::unique_ptr<MCInst const *> CurrentBundle;
+
+  // helper routine for getMachineOpValue()
+  unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
+                          const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+
+public:
+  HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
+
+  // Return parse bits for instruction `MCI' inside bundle `MCB'
+  uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
+                    MCInst const &MCI) const;
+
+  void encodeInstruction(MCInst const &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         MCSubtargetInfo const &STI) const override;
+
+  void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI,
+                               uint32_t Parse, size_t Index) const;
+
+  // \brief TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(MCInst const &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 MCSubtargetInfo const &STI) const;
+
+  /// \brief Return binary encoding of operand.
+  unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             MCSubtargetInfo const &STI) const;
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+}; // class HexagonMCCodeEmitter
+
+} // namespace llvm
+
+#endif /* HEXAGONMCCODEEMITTER_H */
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
new file mode 100644
index 000000000000..5feaffe6efb9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -0,0 +1,425 @@
+
+//=== HexagonMCCompound.cpp - Hexagon Compound checker  -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is looks at a packet and tries to form compound insns
+//
+//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mccompound"
+
+enum OpcodeIndex {
+  fp0_jump_nt = 0,
+  fp0_jump_t,
+  fp1_jump_nt,
+  fp1_jump_t,
+  tp0_jump_nt,
+  tp0_jump_t,
+  tp1_jump_nt,
+  tp1_jump_t
+};
+
+static const unsigned tstBitOpcode[8] = {
+    J4_tstbit0_fp0_jump_nt, J4_tstbit0_fp0_jump_t,  J4_tstbit0_fp1_jump_nt,
+    J4_tstbit0_fp1_jump_t,  J4_tstbit0_tp0_jump_nt, J4_tstbit0_tp0_jump_t,
+    J4_tstbit0_tp1_jump_nt, J4_tstbit0_tp1_jump_t};
+static const unsigned cmpeqBitOpcode[8] = {
+    J4_cmpeq_fp0_jump_nt, J4_cmpeq_fp0_jump_t,  J4_cmpeq_fp1_jump_nt,
+    J4_cmpeq_fp1_jump_t,  J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t,
+    J4_cmpeq_tp1_jump_nt, J4_cmpeq_tp1_jump_t};
+static const unsigned cmpgtBitOpcode[8] = {
+    J4_cmpgt_fp0_jump_nt, J4_cmpgt_fp0_jump_t,  J4_cmpgt_fp1_jump_nt,
+    J4_cmpgt_fp1_jump_t,  J4_cmpgt_tp0_jump_nt, J4_cmpgt_tp0_jump_t,
+    J4_cmpgt_tp1_jump_nt, J4_cmpgt_tp1_jump_t};
+static const unsigned cmpgtuBitOpcode[8] = {
+    J4_cmpgtu_fp0_jump_nt, J4_cmpgtu_fp0_jump_t,  J4_cmpgtu_fp1_jump_nt,
+    J4_cmpgtu_fp1_jump_t,  J4_cmpgtu_tp0_jump_nt, J4_cmpgtu_tp0_jump_t,
+    J4_cmpgtu_tp1_jump_nt, J4_cmpgtu_tp1_jump_t};
+static const unsigned cmpeqiBitOpcode[8] = {
+    J4_cmpeqi_fp0_jump_nt, J4_cmpeqi_fp0_jump_t,  J4_cmpeqi_fp1_jump_nt,
+    J4_cmpeqi_fp1_jump_t,  J4_cmpeqi_tp0_jump_nt, J4_cmpeqi_tp0_jump_t,
+    J4_cmpeqi_tp1_jump_nt, J4_cmpeqi_tp1_jump_t};
+static const unsigned cmpgtiBitOpcode[8] = {
+    J4_cmpgti_fp0_jump_nt, J4_cmpgti_fp0_jump_t,  J4_cmpgti_fp1_jump_nt,
+    J4_cmpgti_fp1_jump_t,  J4_cmpgti_tp0_jump_nt, J4_cmpgti_tp0_jump_t,
+    J4_cmpgti_tp1_jump_nt, J4_cmpgti_tp1_jump_t};
+static const unsigned cmpgtuiBitOpcode[8] = {
+    J4_cmpgtui_fp0_jump_nt, J4_cmpgtui_fp0_jump_t,  J4_cmpgtui_fp1_jump_nt,
+    J4_cmpgtui_fp1_jump_t,  J4_cmpgtui_tp0_jump_nt, J4_cmpgtui_tp0_jump_t,
+    J4_cmpgtui_tp1_jump_nt, J4_cmpgtui_tp1_jump_t};
+static const unsigned cmpeqn1BitOpcode[8] = {
+    J4_cmpeqn1_fp0_jump_nt, J4_cmpeqn1_fp0_jump_t,  J4_cmpeqn1_fp1_jump_nt,
+    J4_cmpeqn1_fp1_jump_t,  J4_cmpeqn1_tp0_jump_nt, J4_cmpeqn1_tp0_jump_t,
+    J4_cmpeqn1_tp1_jump_nt, J4_cmpeqn1_tp1_jump_t};
+static const unsigned cmpgtn1BitOpcode[8] = {
+    J4_cmpgtn1_fp0_jump_nt, J4_cmpgtn1_fp0_jump_t,  J4_cmpgtn1_fp1_jump_nt,
+    J4_cmpgtn1_fp1_jump_t,  J4_cmpgtn1_tp0_jump_nt, J4_cmpgtn1_tp0_jump_t,
+    J4_cmpgtn1_tp1_jump_nt, J4_cmpgtn1_tp1_jump_t,
+};
+
+// enum HexagonII::CompoundGroup
+namespace {
+unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
+  unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
+  switch (MI.getOpcode()) {
+  default:
+    return HexagonII::HCG_None;
+  //
+  // Compound pairs.
+  // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+  // "Rd16=#U6 ; jump #r9:2"
+  // "Rd16=Rs16 ; jump #r9:2"
+  //
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtu:
+    if (IsExtended)
+      return false;
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+    if (IsExtended)
+      return false;
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        (HexagonMCInstrInfo::inRange<5>(MI, 2) ||
+         HexagonMCInstrInfo::minConstant(MI, 2) == -1))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfr:
+    if (IsExtended)
+      return false;
+    // Rd = Rs
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfrsi:
+    if (IsExtended)
+      return false;
+    // Rd = #u6
+    DstReg = MI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 &&
+        HexagonMCInstrInfo::minConstant(MI, 1) >= 0 &&
+        HexagonMCInstrInfo::isIntRegForSubInst(DstReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::S2_tstbit_i:
+    if (IsExtended)
+      return false;
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::minConstant(MI, 2) == 0)
+      return HexagonII::HCG_A;
+    break;
+  // The fact that .new form is used pretty much guarantees
+  // that predicate register will match. Nevertheless,
+  // there could be some false positives without additional
+  // checking.
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnewpt:
+    Src1Reg = MI.getOperand(0).getReg();
+    if (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)
+      return HexagonII::HCG_B;
+    break;
+  // Transfer and jump:
+  // Rd=#U6 ; jump #r9:2
+  // Rd=Rs ; jump #r9:2
+  // Do not test for jump range here.
+  case Hexagon::J2_jump:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+    return HexagonII::HCG_C;
+    break;
+  }
+
+  return HexagonII::HCG_None;
+}
+}
+
+/// getCompoundOp - Return the index from 0-7 into the above opcode lists.
+namespace {
+unsigned getCompoundOp(MCInst const &HMCI) {
+  const MCOperand &Predicate = HMCI.getOperand(0);
+  unsigned PredReg = Predicate.getReg();
+
+  assert((PredReg == Hexagon::P0) || (PredReg == Hexagon::P1) ||
+         (PredReg == Hexagon::P2) || (PredReg == Hexagon::P3));
+
+  switch (HMCI.getOpcode()) {
+  default:
+    llvm_unreachable("Expected match not found.\n");
+    break;
+  case Hexagon::J2_jumpfnew:
+    return (PredReg == Hexagon::P0) ? fp0_jump_nt : fp1_jump_nt;
+  case Hexagon::J2_jumpfnewpt:
+    return (PredReg == Hexagon::P0) ? fp0_jump_t : fp1_jump_t;
+  case Hexagon::J2_jumptnew:
+    return (PredReg == Hexagon::P0) ? tp0_jump_nt : tp1_jump_nt;
+  case Hexagon::J2_jumptnewpt:
+    return (PredReg == Hexagon::P0) ? tp0_jump_t : tp1_jump_t;
+  }
+}
+}
+
+namespace {
+MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
+  MCInst *CompoundInsn = 0;
+  unsigned compoundOpcode;
+  MCOperand Rs, Rt;
+  int64_t Value;
+  bool Success;
+
+  switch (L.getOpcode()) {
+  default:
+    DEBUG(dbgs() << "Possible compound ignored\n");
+    return CompoundInsn;
+
+  case Hexagon::A2_tfrsi:
+    Rt = L.getOperand(0);
+    compoundOpcode = J4_jumpseti;
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(L.getOperand(1)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target
+    break;
+
+  case Hexagon::A2_tfr:
+    Rt = L.getOperand(0);
+    Rs = L.getOperand(1);
+
+    compoundOpcode = J4_jumpsetr;
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target.
+
+    break;
+
+  case Hexagon::C2_cmpeq:
+    DEBUG(dbgs() << "CX: C2_cmpeq\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpeqBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgt:
+    DEBUG(dbgs() << "CX: C2_cmpgt\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpgtBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgtu:
+    DEBUG(dbgs() << "CX: C2_cmpgtu\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpgtuBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpeqi:
+    DEBUG(dbgs() << "CX: C2_cmpeqi\n");
+    Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    (void)Success;
+    assert(Success);
+    if (Value == -1)
+      compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)];
+    else
+      compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)];
+
+    Rs = L.getOperand(1);
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgti:
+    DEBUG(dbgs() << "CX: C2_cmpgti\n");
+    Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    (void)Success;
+    assert(Success);
+    if (Value == -1)
+      compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)];
+    else
+      compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)];
+
+    Rs = L.getOperand(1);
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgtui:
+    DEBUG(dbgs() << "CX: C2_cmpgtui\n");
+    Rs = L.getOperand(1);
+    compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::S2_tstbit_i:
+    DEBUG(dbgs() << "CX: S2_tstbit_i\n");
+    Rs = L.getOperand(1);
+    compoundOpcode = tstBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+  }
+
+  return CompoundInsn;
+}
+}
+
+/// Non-Symmetrical. See if these two instructions are fit for compound pair.
+namespace {
+bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
+                           MCInst const &MIb, bool IsExtendedB) {
+  unsigned MIaG = getCompoundCandidateGroup(MIa, IsExtendedA);
+  unsigned MIbG = getCompoundCandidateGroup(MIb, IsExtendedB);
+  // We have two candidates - check that this is the same register
+  // we are talking about.
+  unsigned Opca = MIa.getOpcode();
+  if (MIaG == HexagonII::HCG_A && MIbG == HexagonII::HCG_C &&
+      (Opca == Hexagon::A2_tfr || Opca == Hexagon::A2_tfrsi))
+    return true;
+  return ((MIaG == HexagonII::HCG_A && MIbG == HexagonII::HCG_B) &&
+          (MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg()));
+}
+}
+
+namespace {
+bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCI));
+  bool JExtended = false;
+  for (MCInst::iterator J =
+           MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+       J != MCI.end(); ++J) {
+    MCInst const *JumpInst = J->getInst();
+    if (HexagonMCInstrInfo::isImmext(*JumpInst)) {
+      JExtended = true;
+      continue;
+    }
+    if (llvm::HexagonMCInstrInfo::getType(MCII, *JumpInst) ==
+        HexagonII::TypeJ) {
+      // Try to pair with another insn (B)undled with jump.
+      bool BExtended = false;
+      for (MCInst::iterator B =
+               MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+           B != MCI.end(); ++B) {
+        MCInst const *Inst = B->getInst();
+        if (JumpInst == Inst)
+          continue;
+        if (HexagonMCInstrInfo::isImmext(*Inst)) {
+          BExtended = true;
+          continue;
+        }
+        DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
+                     << Inst->getOpcode() << "\n");
+        if (isOrderedCompoundPair(*Inst, BExtended, *JumpInst, JExtended)) {
+          MCInst *CompoundInsn = getCompoundInsn(Context, *Inst, *JumpInst);
+          if (CompoundInsn) {
+            DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
+                         << JumpInst->getOpcode() << " Compounds to "
+                         << CompoundInsn->getOpcode() << "\n");
+            J->setInst(CompoundInsn);
+            MCI.erase(B);
+            return true;
+          }
+        }
+        BExtended = false;
+      }
+    }
+    JExtended = false;
+  }
+  return false;
+}
+}
+
+/// tryCompound - Given a bundle check for compound insns when one
+/// is found update the contents fo the bundle with the compound insn.
+/// If a compound instruction is found then the bundle will have one
+/// additional slot.
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+                                     MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCI) &&
+         "Non-Bundle where Bundle expected");
+
+  // By definition a compound must have 2 insn.
+  if (MCI.size() < 2)
+    return;
+
+  // Look for compounds until none are found, only update the bundle when
+  // a compound is found.
+  while (lookForCompound(MCII, Context, MCI))
+    ;
+
+  return;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
new file mode 100644
index 000000000000..413f052aa4bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -0,0 +1,1081 @@
+//===----- HexagonMCDuplexInfo.cpp - Instruction bundle checking ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements duplexing of instructions to reduce code size
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <map>
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mcduplex-info"
+
+// pair table of subInstructions with opcodes
+static const std::pair<unsigned, unsigned> opcodeData[] = {
+    std::make_pair((unsigned)SA1_addi, 0),
+    std::make_pair((unsigned)SA1_addrx, 6144),
+    std::make_pair((unsigned)SA1_addsp, 3072),
+    std::make_pair((unsigned)SA1_and1, 4608),
+    std::make_pair((unsigned)SA1_clrf, 6768),
+    std::make_pair((unsigned)SA1_clrfnew, 6736),
+    std::make_pair((unsigned)SA1_clrt, 6752),
+    std::make_pair((unsigned)SA1_clrtnew, 6720),
+    std::make_pair((unsigned)SA1_cmpeqi, 6400),
+    std::make_pair((unsigned)SA1_combine0i, 7168),
+    std::make_pair((unsigned)SA1_combine1i, 7176),
+    std::make_pair((unsigned)SA1_combine2i, 7184),
+    std::make_pair((unsigned)SA1_combine3i, 7192),
+    std::make_pair((unsigned)SA1_combinerz, 7432),
+    std::make_pair((unsigned)SA1_combinezr, 7424),
+    std::make_pair((unsigned)SA1_dec, 4864),
+    std::make_pair((unsigned)SA1_inc, 4352),
+    std::make_pair((unsigned)SA1_seti, 2048),
+    std::make_pair((unsigned)SA1_setin1, 6656),
+    std::make_pair((unsigned)SA1_sxtb, 5376),
+    std::make_pair((unsigned)SA1_sxth, 5120),
+    std::make_pair((unsigned)SA1_tfr, 4096),
+    std::make_pair((unsigned)SA1_zxtb, 5888),
+    std::make_pair((unsigned)SA1_zxth, 5632),
+    std::make_pair((unsigned)SL1_loadri_io, 0),
+    std::make_pair((unsigned)SL1_loadrub_io, 4096),
+    std::make_pair((unsigned)SL2_deallocframe, 7936),
+    std::make_pair((unsigned)SL2_jumpr31, 8128),
+    std::make_pair((unsigned)SL2_jumpr31_f, 8133),
+    std::make_pair((unsigned)SL2_jumpr31_fnew, 8135),
+    std::make_pair((unsigned)SL2_jumpr31_t, 8132),
+    std::make_pair((unsigned)SL2_jumpr31_tnew, 8134),
+    std::make_pair((unsigned)SL2_loadrb_io, 4096),
+    std::make_pair((unsigned)SL2_loadrd_sp, 7680),
+    std::make_pair((unsigned)SL2_loadrh_io, 0),
+    std::make_pair((unsigned)SL2_loadri_sp, 7168),
+    std::make_pair((unsigned)SL2_loadruh_io, 2048),
+    std::make_pair((unsigned)SL2_return, 8000),
+    std::make_pair((unsigned)SL2_return_f, 8005),
+    std::make_pair((unsigned)SL2_return_fnew, 8007),
+    std::make_pair((unsigned)SL2_return_t, 8004),
+    std::make_pair((unsigned)SL2_return_tnew, 8006),
+    std::make_pair((unsigned)SS1_storeb_io, 4096),
+    std::make_pair((unsigned)SS1_storew_io, 0),
+    std::make_pair((unsigned)SS2_allocframe, 7168),
+    std::make_pair((unsigned)SS2_storebi0, 4608),
+    std::make_pair((unsigned)SS2_storebi1, 4864),
+    std::make_pair((unsigned)SS2_stored_sp, 2560),
+    std::make_pair((unsigned)SS2_storeh_io, 0),
+    std::make_pair((unsigned)SS2_storew_sp, 2048),
+    std::make_pair((unsigned)SS2_storewi0, 4096),
+    std::make_pair((unsigned)SS2_storewi1, 4352)};
+
+bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+  switch (Ga) {
+  case HexagonII::HSIG_None:
+  default:
+    return false;
+  case HexagonII::HSIG_L1:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_L2:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_S1:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_S2:
+    return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 ||
+            Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 ||
+            Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_A:
+    return (Gb == HexagonII::HSIG_A);
+  case HexagonII::HSIG_Compound:
+    return (Gb == HexagonII::HSIG_Compound);
+  }
+  return false;
+}
+
+unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
+  switch (Ga) {
+  case HexagonII::HSIG_None:
+  default:
+    break;
+  case HexagonII::HSIG_L1:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0;
+    case HexagonII::HSIG_A:
+      return 0x4;
+    }
+  case HexagonII::HSIG_L2:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0x1;
+    case HexagonII::HSIG_L2:
+      return 0x2;
+    case HexagonII::HSIG_A:
+      return 0x5;
+    }
+  case HexagonII::HSIG_S1:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0x8;
+    case HexagonII::HSIG_L2:
+      return 0x9;
+    case HexagonII::HSIG_S1:
+      return 0xA;
+    case HexagonII::HSIG_A:
+      return 0x6;
+    }
+  case HexagonII::HSIG_S2:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_L1:
+      return 0xC;
+    case HexagonII::HSIG_L2:
+      return 0xD;
+    case HexagonII::HSIG_S1:
+      return 0xB;
+    case HexagonII::HSIG_S2:
+      return 0xE;
+    case HexagonII::HSIG_A:
+      return 0x7;
+    }
+  case HexagonII::HSIG_A:
+    switch (Gb) {
+    default:
+      break;
+    case HexagonII::HSIG_A:
+      return 0x3;
+    }
+  case HexagonII::HSIG_Compound:
+    switch (Gb) {
+    case HexagonII::HSIG_Compound:
+      return 0xFFFFFFFF;
+    }
+  }
+  return 0xFFFFFFFF;
+}
+
+unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
+  unsigned DstReg, PredReg, SrcReg, Src1Reg, Src2Reg;
+
+  switch (MCI.getOpcode()) {
+  default:
+    return HexagonII::HSIG_None;
+  //
+  // Group L1:
+  //
+  // Rd = memw(Rs+#u4:2)
+  // Rd = memub(Rs+#u4:0)
+  case Hexagon::L2_loadri_io:
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    // Special case this one from Group L2.
+    // Rd = memw(r29+#u5:2)
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      if (HexagonMCInstrInfo::isIntReg(SrcReg) &&
+          Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) {
+        return HexagonII::HSIG_L2;
+      }
+      // Rd = memw(Rs+#u4:2)
+      if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+          inRange<4, 2>(MCI, 2)) {
+        return HexagonII::HSIG_L1;
+      }
+    }
+    break;
+  case Hexagon::L2_loadrub_io:
+    // Rd = memub(Rs+#u4:0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        inRange<4>(MCI, 2)) {
+      return HexagonII::HSIG_L1;
+    }
+    break;
+  //
+  // Group L2:
+  //
+  // Rd = memh/memuh(Rs+#u3:1)
+  // Rd = memb(Rs+#u3:0)
+  // Rd = memw(r29+#u5:2) - Handled above.
+  // Rdd = memd(r29+#u5:3)
+  // deallocframe
+  // [if ([!]p0[.new])] dealloc_return
+  // [if ([!]p0[.new])] jumpr r31
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
+    // Rd = memh/memuh(Rs+#u3:1)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        inRange<3, 1>(MCI, 2)) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L2_loadrb_io:
+    // Rd = memb(Rs+#u3:0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        inRange<3>(MCI, 2)) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L2_loadrd_io:
+    // Rdd = memd(r29+#u5:3)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+        inRange<5, 3>(MCI, 2)) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+
+  case Hexagon::L4_return:
+
+  case Hexagon::L2_deallocframe:
+
+    return HexagonII::HSIG_L2;
+  case Hexagon::EH_RETURN_JMPR:
+
+  case Hexagon::J2_jumpr:
+    // jumpr r31
+    // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+    DstReg = MCI.getOperand(0).getReg();
+    if (Hexagon::R31 == DstReg)
+      return HexagonII::HSIG_L2;
+    break;
+
+  case Hexagon::J2_jumprt:
+  case Hexagon::J2_jumprf:
+  case Hexagon::J2_jumprtnew:
+  case Hexagon::J2_jumprfnew:
+  case Hexagon::J2_jumprtnewpt:
+  case Hexagon::J2_jumprfnewpt:
+    DstReg = MCI.getOperand(1).getReg();
+    SrcReg = MCI.getOperand(0).getReg();
+    // [if ([!]p0[.new])] jumpr r31
+    if ((HexagonMCInstrInfo::isPredReg(SrcReg) && (Hexagon::P0 == SrcReg)) &&
+        (Hexagon::R31 == DstReg)) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L4_return_t:
+
+  case Hexagon::L4_return_f:
+
+  case Hexagon::L4_return_tnew_pnt:
+
+  case Hexagon::L4_return_fnew_pnt:
+
+  case Hexagon::L4_return_tnew_pt:
+
+  case Hexagon::L4_return_fnew_pt:
+    // [if ([!]p0[.new])] dealloc_return
+    SrcReg = MCI.getOperand(0).getReg();
+    if (Hexagon::P0 == SrcReg) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  //
+  // Group S1:
+  //
+  // memw(Rs+#u4:2) = Rt
+  // memb(Rs+#u4:0) = Rt
+  case Hexagon::S2_storeri_io:
+    // Special case this one from Group S2.
+    // memw(r29+#u5:2) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntReg(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) {
+      return HexagonII::HSIG_S2;
+    }
+    // memw(Rs+#u4:2) = Rt
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        inRange<4, 2>(MCI, 1)) {
+      return HexagonII::HSIG_S1;
+    }
+    break;
+  case Hexagon::S2_storerb_io:
+    // memb(Rs+#u4:0) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        inRange<4>(MCI, 1)) {
+      return HexagonII::HSIG_S1;
+    }
+    break;
+  //
+  // Group S2:
+  //
+  // memh(Rs+#u3:1) = Rt
+  // memw(r29+#u5:2) = Rt
+  // memd(r29+#s6:3) = Rtt
+  // memw(Rs+#u4:2) = #U1
+  // memb(Rs+#u4) = #U1
+  // allocframe(#u5:3)
+  case Hexagon::S2_storerh_io:
+    // memh(Rs+#u3:1) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        inRange<3, 1>(MCI, 1)) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S2_storerd_io:
+    // memd(r29+#s6:3) = Rtt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) &&
+        HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg &&
+        inSRange<6, 3>(MCI, 1)) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S4_storeiri_io:
+    // memw(Rs+#u4:2) = #U1
+    Src1Reg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S4_storeirb_io:
+    // memb(Rs+#u4) = #U1
+    Src1Reg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S2_allocframe:
+    if (inRange<5, 3>(MCI, 0))
+      return HexagonII::HSIG_S2;
+    break;
+  //
+  // Group A:
+  //
+  // Rx = add(Rx,#s7)
+  // Rd = Rs
+  // Rd = #u6
+  // Rd = #-1
+  // if ([!]P0[.new]) Rd = #0
+  // Rd = add(r29,#u6:2)
+  // Rx = add(Rx,Rs)
+  // P0 = cmp.eq(Rs,#u2)
+  // Rdd = combine(#0,Rs)
+  // Rdd = combine(Rs,#0)
+  // Rdd = combine(#u2,#U2)
+  // Rd = add(Rs,#1)
+  // Rd = add(Rs,#-1)
+  // Rd = sxth/sxtb/zxtb/zxth(Rs)
+  // Rd = and(Rs,#1)
+  case Hexagon::A2_addi:
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      // Rd = add(r29,#u6:2)
+      if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+          inRange<6, 2>(MCI, 2)) {
+        return HexagonII::HSIG_A;
+      }
+      // Rx = add(Rx,#s7)
+      if (DstReg == SrcReg) {
+        return HexagonII::HSIG_A;
+      }
+      // Rd = add(Rs,#1)
+      // Rd = add(Rs,#-1)
+      if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+          (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) {
+        return HexagonII::HSIG_A;
+      }
+    }
+    break;
+  case Hexagon::A2_add:
+    // Rx = add(Rx,Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    Src1Reg = MCI.getOperand(1).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_andir:
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_tfr:
+    // Rd = Rs
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_tfrsi:
+    DstReg = MCI.getOperand(0).getReg();
+
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::C2_cmoveit:
+  case Hexagon::C2_cmovenewit:
+  case Hexagon::C2_cmoveif:
+  case Hexagon::C2_cmovenewif:
+    // if ([!]P0[.new]) Rd = #0
+    // Actual form:
+    // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+    DstReg = MCI.getOperand(0).getReg();  // Rd
+    PredReg = MCI.getOperand(1).getReg(); // P0
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::C2_cmpeqi:
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (Hexagon::P0 == DstReg &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        inRange<2>(MCI, 2)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_combineii:
+  case Hexagon::A4_combineii:
+    // Rdd = combine(#u2,#U2)
+    DstReg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A4_combineri:
+    // Rdd = combine(Rs,#0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        minConstant(MCI, 2) == 0) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A4_combineir:
+    // Rdd = combine(#0,Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        minConstant(MCI, 1) == 0) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_sxtb:
+  case Hexagon::A2_sxth:
+  case Hexagon::A2_zxtb:
+  case Hexagon::A2_zxth:
+    // Rd = sxth/sxtb/zxtb/zxth(Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  }
+
+  return HexagonII::HSIG_None;
+}
+
+bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
+  unsigned DstReg, SrcReg;
+  switch (potentialDuplex.getOpcode()) {
+  case Hexagon::A2_addi:
+    // testing for case of: Rx = add(Rx,#s7)
+    DstReg = potentialDuplex.getOperand(0).getReg();
+    SrcReg = potentialDuplex.getOperand(1).getReg();
+    if (DstReg == SrcReg && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      int64_t Value;
+      if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Value))
+        return true;
+      if (!isShiftedInt<7, 0>(Value))
+        return true;
+    }
+    break;
+  case Hexagon::A2_tfrsi:
+    DstReg = potentialDuplex.getOperand(0).getReg();
+
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      int64_t Value;
+      if (!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Value))
+        return true;
+      // Check for case of Rd = #-1.
+      if (Value == -1)
+        return false;
+      // Check for case of Rd = #u6.
+      if (!isShiftedUInt<6, 0>(Value))
+        return true;
+    }
+    break;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// non-Symmetrical. See if these two instructions are fit for duplex pair.
+bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
+                                             MCInst const &MIa, bool ExtendedA,
+                                             MCInst const &MIb, bool ExtendedB,
+                                             bool bisReversable) {
+  // Slot 1 cannot be extended in duplexes PRM 10.5
+  if (ExtendedA)
+    return false;
+  // Only A2_addi and A2_tfrsi can be extended in duplex form PRM 10.5
+  if (ExtendedB) {
+    unsigned Opcode = MIb.getOpcode();
+    if ((Opcode != Hexagon::A2_addi) && (Opcode != Hexagon::A2_tfrsi))
+      return false;
+  }
+  unsigned MIaG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIa),
+           MIbG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIb);
+
+  static std::map<unsigned, unsigned> subinstOpcodeMap(std::begin(opcodeData),
+                                                       std::end(opcodeData));
+
+  // If a duplex contains 2 insns in the same group, the insns must be
+  // ordered such that the numerically smaller opcode is in slot 1.
+  if ((MIaG != HexagonII::HSIG_None) && (MIaG == MIbG) && bisReversable) {
+    MCInst SubInst0 = HexagonMCInstrInfo::deriveSubInst(MIa);
+    MCInst SubInst1 = HexagonMCInstrInfo::deriveSubInst(MIb);
+
+    unsigned zeroedSubInstS0 =
+        subinstOpcodeMap.find(SubInst0.getOpcode())->second;
+    unsigned zeroedSubInstS1 =
+        subinstOpcodeMap.find(SubInst1.getOpcode())->second;
+
+    if (zeroedSubInstS0 < zeroedSubInstS1)
+      // subinstS0 (maps to slot 0) must be greater than
+      // subinstS1 (maps to slot 1)
+      return false;
+  }
+
+  // allocframe must always be in slot 0
+  if (MIb.getOpcode() == Hexagon::S2_allocframe)
+    return false;
+
+  if ((MIaG != HexagonII::HSIG_None) && (MIbG != HexagonII::HSIG_None)) {
+    // Prevent 2 instructions with extenders from duplexing
+    // Note that MIb (slot1) can be extended and MIa (slot0)
+    //   can never be extended
+    if (subInstWouldBeExtended(MIa))
+      return false;
+
+    // If duplexing produces an extender, but the original did not
+    //   have an extender, do not duplex.
+    if (subInstWouldBeExtended(MIb) && !ExtendedB)
+      return false;
+  }
+
+  // If jumpr r31 appears, it must be in slot 0, and never slot 1 (MIb).
+  if (MIbG == HexagonII::HSIG_L2) {
+    if ((MIb.getNumOperands() > 1) && MIb.getOperand(1).isReg() &&
+        (MIb.getOperand(1).getReg() == Hexagon::R31))
+      return false;
+    if ((MIb.getNumOperands() > 0) && MIb.getOperand(0).isReg() &&
+        (MIb.getOperand(0).getReg() == Hexagon::R31))
+      return false;
+  }
+
+  // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
+  //   therefore, not duplexable if slot 1 is a store, and slot 0 is not.
+  if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
+    if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
+      return false;
+  }
+
+  return (isDuplexPairMatch(MIaG, MIbG));
+}
+
+/// Symmetrical. See if these two instructions are fit for duplex pair.
+bool HexagonMCInstrInfo::isDuplexPair(MCInst const &MIa, MCInst const &MIb) {
+  unsigned MIaG = getDuplexCandidateGroup(MIa),
+           MIbG = getDuplexCandidateGroup(MIb);
+  return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
+}
+
+inline static void addOps(MCInst &subInstPtr, MCInst const &Inst,
+                          unsigned opNum) {
+  if (Inst.getOperand(opNum).isReg()) {
+    switch (Inst.getOperand(opNum).getReg()) {
+    default:
+      llvm_unreachable("Not Duplexable Register");
+      break;
+    case Hexagon::R0:
+    case Hexagon::R1:
+    case Hexagon::R2:
+    case Hexagon::R3:
+    case Hexagon::R4:
+    case Hexagon::R5:
+    case Hexagon::R6:
+    case Hexagon::R7:
+    case Hexagon::D0:
+    case Hexagon::D1:
+    case Hexagon::D2:
+    case Hexagon::D3:
+    case Hexagon::R16:
+    case Hexagon::R17:
+    case Hexagon::R18:
+    case Hexagon::R19:
+    case Hexagon::R20:
+    case Hexagon::R21:
+    case Hexagon::R22:
+    case Hexagon::R23:
+    case Hexagon::D8:
+    case Hexagon::D9:
+    case Hexagon::D10:
+    case Hexagon::D11:
+    case Hexagon::P0:
+      subInstPtr.addOperand(Inst.getOperand(opNum));
+      break;
+    }
+  } else
+    subInstPtr.addOperand(Inst.getOperand(opNum));
+}
+
+MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
+  MCInst Result;
+  bool Absolute;
+  int64_t Value;
+  switch (Inst.getOpcode()) {
+  default:
+    // dbgs() << "opcode: "<< Inst->getOpcode() << "\n";
+    llvm_unreachable("Unimplemented subinstruction \n");
+    break;
+  case Hexagon::A2_addi:
+    Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);(void)Absolute;
+    if (Value == 1) {
+      Result.setOpcode(Hexagon::SA1_inc);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break;
+    } //  1,2 SUBInst $Rd = add($Rs, #1)
+    else if (Value == -1) {
+      Result.setOpcode(Hexagon::SA1_dec);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break;
+    } //  1,2 SUBInst $Rd = add($Rs,#-1)
+    else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::SA1_addsp);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break;
+    } //  1,3 SUBInst $Rd = add(r29, #$u6_2)
+    else {
+      Result.setOpcode(Hexagon::SA1_addi);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break;
+    } //    1,2,3 SUBInst $Rx = add($Rx, #$s7)
+  case Hexagon::A2_add:
+    Result.setOpcode(Hexagon::SA1_addrx);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rx = add($_src_, $Rs)
+  case Hexagon::S2_allocframe:
+    Result.setOpcode(Hexagon::SS2_allocframe);
+    addOps(Result, Inst, 0);
+    break; //    1 SUBInst allocframe(#$u5_3)
+  case Hexagon::A2_andir:
+    if (minConstant(Inst, 2) == 255) {
+      Result.setOpcode(Hexagon::SA1_zxtb);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2    $Rd = and($Rs, #255)
+    } else {
+      Result.setOpcode(Hexagon::SA1_and1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst $Rd = and($Rs, #1)
+    }
+  case Hexagon::C2_cmpeqi:
+    Result.setOpcode(Hexagon::SA1_cmpeqi);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    2,3 SUBInst p0 = cmp.eq($Rs, #$u2)
+  case Hexagon::A4_combineii:
+  case Hexagon::A2_combineii:
+    Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);(void)Absolute;
+    if (Value == 1) {
+      Result.setOpcode(Hexagon::SA1_combine1i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#1, #$u2)
+    }
+    if (Value == 3) {
+      Result.setOpcode(Hexagon::SA1_combine3i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#3, #$u2)
+    }
+    if (Value == 0) {
+      Result.setOpcode(Hexagon::SA1_combine0i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#0, #$u2)
+    }
+    if (Value == 2) {
+      Result.setOpcode(Hexagon::SA1_combine2i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#2, #$u2)
+    }
+  case Hexagon::A4_combineir:
+    Result.setOpcode(Hexagon::SA1_combinezr);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 2);
+    break; //    1,3 SUBInst $Rdd = combine(#0, $Rs)
+
+  case Hexagon::A4_combineri:
+    Result.setOpcode(Hexagon::SA1_combinerz);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2 SUBInst $Rdd = combine($Rs, #0)
+  case Hexagon::L4_return_tnew_pnt:
+  case Hexagon::L4_return_tnew_pt:
+    Result.setOpcode(Hexagon::SL2_return_tnew);
+    break; //    none  SUBInst if (p0.new) dealloc_return:nt
+  case Hexagon::L4_return_fnew_pnt:
+  case Hexagon::L4_return_fnew_pt:
+    Result.setOpcode(Hexagon::SL2_return_fnew);
+    break; //    none  SUBInst if (!p0.new) dealloc_return:nt
+  case Hexagon::L4_return_f:
+    Result.setOpcode(Hexagon::SL2_return_f);
+    break; //    none  SUBInst if (!p0) dealloc_return
+  case Hexagon::L4_return_t:
+    Result.setOpcode(Hexagon::SL2_return_t);
+    break; //    none  SUBInst if (p0) dealloc_return
+  case Hexagon::L4_return:
+    Result.setOpcode(Hexagon::SL2_return);
+    break; //    none  SUBInst dealloc_return
+  case Hexagon::L2_deallocframe:
+    Result.setOpcode(Hexagon::SL2_deallocframe);
+    break; //    none  SUBInst deallocframe
+  case Hexagon::EH_RETURN_JMPR:
+  case Hexagon::J2_jumpr:
+    Result.setOpcode(Hexagon::SL2_jumpr31);
+    break; //    none  SUBInst jumpr r31
+  case Hexagon::J2_jumprf:
+    Result.setOpcode(Hexagon::SL2_jumpr31_f);
+    break; //    none  SUBInst if (!p0) jumpr r31
+  case Hexagon::J2_jumprfnew:
+  case Hexagon::J2_jumprfnewpt:
+    Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
+    break; //    none  SUBInst if (!p0.new) jumpr:nt r31
+  case Hexagon::J2_jumprt:
+    Result.setOpcode(Hexagon::SL2_jumpr31_t);
+    break; //    none  SUBInst if (p0) jumpr r31
+  case Hexagon::J2_jumprtnew:
+  case Hexagon::J2_jumprtnewpt:
+    Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
+    break; //    none  SUBInst if (p0.new) jumpr:nt r31
+  case Hexagon::L2_loadrb_io:
+    Result.setOpcode(Hexagon::SL2_loadrb_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memb($Rs + #$u3_0)
+  case Hexagon::L2_loadrd_io:
+    Result.setOpcode(Hexagon::SL2_loadrd_sp);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 2);
+    break; //    1,3 SUBInst $Rdd = memd(r29 + #$u5_3)
+  case Hexagon::L2_loadrh_io:
+    Result.setOpcode(Hexagon::SL2_loadrh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memh($Rs + #$u3_1)
+  case Hexagon::L2_loadrub_io:
+    Result.setOpcode(Hexagon::SL1_loadrub_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memub($Rs + #$u4_0)
+  case Hexagon::L2_loadruh_io:
+    Result.setOpcode(Hexagon::SL2_loadruh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memuh($Rs + #$u3_1)
+  case Hexagon::L2_loadri_io:
+    if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::SL2_loadri_sp);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  2 1,3 SUBInst $Rd = memw(r29 + #$u5_2)
+    } else {
+      Result.setOpcode(Hexagon::SL1_loadri_io);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break; //    1,2,3 SUBInst $Rd = memw($Rs + #$u4_2)
+    }
+  case Hexagon::S4_storeirb_io:
+    Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);(void)Absolute;
+    if (Value == 0) {
+      Result.setOpcode(Hexagon::SS2_storebi0);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst memb($Rs + #$u4_0)=#0
+    } else if (Value == 1) {
+      Result.setOpcode(Hexagon::SS2_storebi1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  2 1,2 SUBInst memb($Rs + #$u4_0)=#1
+    }
+  case Hexagon::S2_storerb_io:
+    Result.setOpcode(Hexagon::SS1_storeb_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt
+  case Hexagon::S2_storerd_io:
+    Result.setOpcode(Hexagon::SS2_stored_sp);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    2,3 SUBInst memd(r29 + #$s6_3) = $Rtt
+  case Hexagon::S2_storerh_io:
+    Result.setOpcode(Hexagon::SS2_storeh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt
+  case Hexagon::S4_storeiri_io:
+    Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
+    assert(Absolute);(void)Absolute;
+    if (Value == 0) {
+      Result.setOpcode(Hexagon::SS2_storewi0);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  3 1,2 SUBInst memw($Rs + #$u4_2)=#0
+    } else if (Value == 1) {
+      Result.setOpcode(Hexagon::SS2_storewi1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  3 1,2 SUBInst memw($Rs + #$u4_2)=#1
+    } else if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::SS2_storew_sp);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break; //  1 2,3 SUBInst memw(r29 + #$u5_2) = $Rt
+    }
+  case Hexagon::S2_storeri_io:
+    if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::SS2_storew_sp);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2); //  1,2,3 SUBInst memw(sp + #$u5_2) = $Rt
+    } else {
+      Result.setOpcode(Hexagon::SS1_storew_io);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2); //  1,2,3 SUBInst memw($Rs + #$u4_2) = $Rt
+    }
+    break;
+  case Hexagon::A2_sxtb:
+    Result.setOpcode(Hexagon::SA1_sxtb);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = sxtb($Rs)
+  case Hexagon::A2_sxth:
+    Result.setOpcode(Hexagon::SA1_sxth);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = sxth($Rs)
+  case Hexagon::A2_tfr:
+    Result.setOpcode(Hexagon::SA1_tfr);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = $Rs
+  case Hexagon::C2_cmovenewif:
+    Result.setOpcode(Hexagon::SA1_clrfnew);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  2 SUBInst if (!p0.new) $Rd = #0
+  case Hexagon::C2_cmovenewit:
+    Result.setOpcode(Hexagon::SA1_clrtnew);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  2 SUBInst if (p0.new) $Rd = #0
+  case Hexagon::C2_cmoveif:
+    Result.setOpcode(Hexagon::SA1_clrf);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  2 SUBInst if (!p0) $Rd = #0
+  case Hexagon::C2_cmoveit:
+    Result.setOpcode(Hexagon::SA1_clrt);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  2 SUBInst if (p0) $Rd = #0
+  case Hexagon::A2_tfrsi:
+    Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value);
+    if (Absolute && Value == -1) {
+      Result.setOpcode(Hexagon::SA1_setin1);
+      addOps(Result, Inst, 0);
+      break; //  2 1 SUBInst $Rd = #-1
+    } else {
+      Result.setOpcode(Hexagon::SA1_seti);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst $Rd = #$u6
+    }
+  case Hexagon::A2_zxtb:
+    Result.setOpcode(Hexagon::SA1_zxtb);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2    $Rd = and($Rs, #255)
+
+  case Hexagon::A2_zxth:
+    Result.setOpcode(Hexagon::SA1_zxth);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2 SUBInst $Rd = zxth($Rs)
+  }
+  return Result;
+}
+
+static bool isStoreInst(unsigned opCode) {
+  switch (opCode) {
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerb_io:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storerd_io:
+  case Hexagon::S4_storeiri_io:
+  case Hexagon::S4_storeirb_io:
+  case Hexagon::S2_allocframe:
+    return true;
+  default:
+    return false;
+  }
+}
+
+SmallVector<DuplexCandidate, 8>
+HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+                                          MCInst const &MCB) {
+  assert(isBundle(MCB));
+  SmallVector<DuplexCandidate, 8> duplexToTry;
+  // Use an "order matters" version of isDuplexPair.
+  unsigned numInstrInPacket = MCB.getNumOperands();
+
+  for (unsigned distance = 1; distance < numInstrInPacket; ++distance) {
+    for (unsigned j = HexagonMCInstrInfo::bundleInstructionsOffset,
+                  k = j + distance;
+         (j < numInstrInPacket) && (k < numInstrInPacket); ++j, ++k) {
+
+      // Check if reversible.
+      bool bisReversable = true;
+      if (isStoreInst(MCB.getOperand(j).getInst()->getOpcode()) &&
+          isStoreInst(MCB.getOperand(k).getInst()->getOpcode())) {
+        DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
+                     << "\n");
+        bisReversable = false;
+      }
+      if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf
+        bisReversable = false;
+
+      // Try in order.
+      if (isOrderedDuplexPair(
+              MCII, *MCB.getOperand(k).getInst(),
+              HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
+              *MCB.getOperand(j).getInst(),
+              HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
+              bisReversable)) {
+        // Get iClass.
+        unsigned iClass = iClassOfDuplexPair(
+            getDuplexCandidateGroup(*MCB.getOperand(k).getInst()),
+            getDuplexCandidateGroup(*MCB.getOperand(j).getInst()));
+
+        // Save off pairs for duplex checking.
+        duplexToTry.push_back(DuplexCandidate(j, k, iClass));
+        DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
+                     << MCB.getOperand(j).getInst()->getOpcode() << ","
+                     << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        continue;
+      } else {
+        DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
+                     << MCB.getOperand(j).getInst()->getOpcode() << ","
+                     << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+      }
+
+      // Try reverse.
+      if (bisReversable) {
+        if (isOrderedDuplexPair(
+                MCII, *MCB.getOperand(j).getInst(),
+                HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
+                *MCB.getOperand(k).getInst(),
+                HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
+                bisReversable)) {
+          // Get iClass.
+          unsigned iClass = iClassOfDuplexPair(
+              getDuplexCandidateGroup(*MCB.getOperand(j).getInst()),
+              getDuplexCandidateGroup(*MCB.getOperand(k).getInst()));
+
+          // Save off pairs for duplex checking.
+          duplexToTry.push_back(DuplexCandidate(k, j, iClass));
+          DEBUG(dbgs() << "adding pair:" << k << "," << j << ":"
+                       << MCB.getOperand(j).getInst()->getOpcode() << ","
+                       << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        } else {
+          DEBUG(dbgs() << "skipping pair: " << k << "," << j << ":"
+                       << MCB.getOperand(j).getInst()->getOpcode() << ","
+                       << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        }
+      }
+    }
+  }
+  return duplexToTry;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
new file mode 100644
index 000000000000..226470cfbced
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -0,0 +1,164 @@
+//=== HexagonMCELFStreamer.cpp - Hexagon subclass of MCELFStreamer -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a stub that parses a MCInst bundle and passes the
+// instructions on to the real streamer.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagonmcelfstreamer"
+
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+    GPSize("gpsize", cl::NotHidden,
+           cl::desc("Global Pointer Addressing Size.  The default size is 8."),
+           cl::Prefix, cl::init(8));
+
+void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
+                                           const MCSubtargetInfo &STI) {
+  MCInst HMI = HexagonMCInstrInfo::createBundle();
+  MCInst *MCB;
+
+  if (MCK.getOpcode() != Hexagon::BUNDLE) {
+    HMI.addOperand(MCOperand::createInst(&MCK));
+    MCB = &HMI;
+  } else
+    MCB = const_cast<MCInst *>(&MCK);
+
+  // Examines packet and pad the packet, if needed, when an
+  // end-loop is in the bundle.
+  HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
+  HexagonMCShuffle(*MCII, STI, *MCB);
+
+  assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
+  bool Extended = false;
+  for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+    MCInst *MCI = const_cast<MCInst *>(I.getInst());
+    if (Extended) {
+      if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
+        MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst());
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst);
+      } else {
+        HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI);
+      }
+      Extended = false;
+    } else {
+      Extended = HexagonMCInstrInfo::isImmext(*MCI);
+    }
+  }
+
+  // At this point, MCB is a bundle
+  // Iterate through the bundle and assign addends for the instructions
+  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+    MCInst *MCI = const_cast<MCInst *>(I.getInst());
+    EmitSymbol(*MCI);
+  }
+  MCObjectStreamer::EmitInstruction(*MCB, STI);
+}
+
+void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
+  // Scan for values.
+  for (unsigned i = Inst.getNumOperands(); i--;)
+    if (Inst.getOperand(i).isExpr())
+      visitUsedExpr(*Inst.getOperand(i).getExpr());
+}
+
+// EmitCommonSymbol and EmitLocalCommonSymbol are extended versions of the
+// functions found in MCELFStreamer.cpp taking AccessSize as an additional
+// parameter.
+void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
+                                                     uint64_t Size,
+                                                     unsigned ByteAlignment,
+                                                     unsigned AccessSize) {
+  getAssembler().registerSymbol(*Symbol);
+  StringRef sbss[4] = {".sbss.1", ".sbss.2", ".sbss.4", ".sbss.8"};
+
+  auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+  if (!ELFSymbol->isBindingSet()) {
+    ELFSymbol->setBinding(ELF::STB_GLOBAL);
+    ELFSymbol->setExternal(true);
+  }
+
+  ELFSymbol->setType(ELF::STT_OBJECT);
+
+  if (ELFSymbol->getBinding() == ELF::STB_LOCAL) {
+    StringRef SectionName =
+        ((AccessSize == 0) || (Size == 0) || (Size > GPSize))
+            ? ".bss"
+            : sbss[(Log2_64(AccessSize))];
+    MCSection &Section = *getAssembler().getContext().getELFSection(
+        SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+    MCSectionSubPair P = getCurrentSection();
+    SwitchSection(&Section);
+
+    EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+    EmitLabel(Symbol);
+    EmitZeros(Size);
+
+    // Update the maximum alignment of the section if necessary.
+    if (ByteAlignment > Section.getAlignment())
+      Section.setAlignment(ByteAlignment);
+
+    SwitchSection(P.first, P.second);
+  } else {
+    if (ELFSymbol->declareCommon(Size, ByteAlignment))
+      report_fatal_error("Symbol: " + Symbol->getName() +
+                         " redeclared as different type");
+    if ((AccessSize) && (Size <= GPSize)) {
+      uint64_t SectionIndex =
+          (AccessSize <= GPSize)
+              ? ELF::SHN_HEXAGON_SCOMMON + (Log2_64(AccessSize) + 1)
+              : (unsigned)ELF::SHN_HEXAGON_SCOMMON;
+      ELFSymbol->setIndex(SectionIndex);
+    }
+  }
+
+  ELFSymbol->setSize(MCConstantExpr::create(Size, getContext()));
+}
+
+void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
+    MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment,
+    unsigned AccessSize) {
+  getAssembler().registerSymbol(*Symbol);
+  auto ELFSymbol = cast<MCSymbolELF>(Symbol);
+  ELFSymbol->setBinding(ELF::STB_LOCAL);
+  ELFSymbol->setExternal(false);
+  HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
+}
+
+namespace llvm {
+
+MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS, MCCodeEmitter *CE) {
+  return new HexagonMCELFStreamer(Context, MAB, OS, CE);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
new file mode 100644
index 000000000000..0ac1a68d4ef9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -0,0 +1,44 @@
+//===- HexagonMCELFStreamer.h - Hexagon subclass of MCElfStreamer ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
+#include <memory>
+
+namespace llvm {
+
+class HexagonMCELFStreamer : public MCELFStreamer {
+  std::unique_ptr<MCInstrInfo> MCII;
+
+public:
+  HexagonMCELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                       raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, TAB, OS, Emitter),
+        MCII(createHexagonMCInstrInfo()) {}
+
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+  void EmitSymbol(const MCInst &Inst);
+  void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                      unsigned ByteAlignment,
+                                      unsigned AccessSize);
+  void HexagonMCEmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                 unsigned ByteAlignment, unsigned AccessSize);
+};
+
+MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS, MCCodeEmitter *CE);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCELFSTREAMER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
new file mode 100644
index 000000000000..e93906a0a396
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -0,0 +1,78 @@
+//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes
+//----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-mcexpr"
+
+HexagonMCExpr *HexagonMCExpr::create(MCExpr const *Expr, MCContext &Ctx) {
+  return new (Ctx) HexagonMCExpr(Expr);
+}
+
+bool HexagonMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                              MCAsmLayout const *Layout,
+                                              MCFixup const *Fixup) const {
+  return Expr->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+void HexagonMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*Expr);
+}
+
+MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
+  return Expr->findAssociatedFragment();
+}
+
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+MCExpr const *HexagonMCExpr::getExpr() const { return Expr; }
+
+void HexagonMCExpr::setMustExtend(bool Val) {
+  assert((!Val || !MustNotExtend) && "Extension contradiction");
+  MustExtend = Val;
+}
+
+bool HexagonMCExpr::mustExtend() const { return MustExtend; }
+void HexagonMCExpr::setMustNotExtend(bool Val) {
+  assert((!Val || !MustExtend) && "Extension contradiction");
+  MustNotExtend = Val;
+}
+bool HexagonMCExpr::mustNotExtend() const { return MustNotExtend; }
+
+bool HexagonMCExpr::s23_2_reloc() const { return S23_2_reloc; }
+void HexagonMCExpr::setS23_2_reloc(bool Val) {
+  S23_2_reloc = Val;
+}
+
+bool HexagonMCExpr::classof(MCExpr const *E) {
+  return E->getKind() == MCExpr::Target;
+}
+
+HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
+    : Expr(Expr), MustNotExtend(false), MustExtend(false), S23_2_reloc(false),
+      SignMismatch(false) {}
+
+void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  Expr->print(OS, MAI);
+}
+
+void HexagonMCExpr::setSignMismatch(bool Val) {
+  SignMismatch = Val;
+}
+
+bool HexagonMCExpr::signMismatch() const {
+  return SignMismatch;
+}
+\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
new file mode 100644
index 000000000000..bca40cfaf6f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -0,0 +1,47 @@
+//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+class MCInst;
+class HexagonMCExpr : public MCTargetExpr {
+public:
+  static HexagonMCExpr *create(MCExpr const *Expr, MCContext &Ctx);
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override;
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+  static bool classof(MCExpr const *E);
+  MCExpr const *getExpr() const;
+  void setMustExtend(bool Val = true);
+  bool mustExtend() const;
+  void setMustNotExtend(bool Val = true);
+  bool mustNotExtend() const;
+  void setS23_2_reloc(bool Val = true);
+  bool s23_2_reloc() const;
+  void setSignMismatch(bool Val = true);
+  bool signMismatch() const;
+
+private:
+  HexagonMCExpr(MCExpr const *Expr);
+  MCExpr const *Expr;
+  bool MustNotExtend;
+  bool MustExtend;
+  bool S23_2_reloc;
+  bool SignMismatch;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
new file mode 100644
index 000000000000..e627f026c8ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -0,0 +1,809 @@
+//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInstrInfo to allow Hexagon specific MCInstr queries
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCInstrInfo.h"
+
+#include "Hexagon.h"
+#include "HexagonBaseInfo.h"
+#include "HexagonMCChecker.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value,
+                                     MCContext &Context) {
+  MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context)));
+}
+
+void HexagonMCInstrInfo::addConstExtender(MCContext &Context,
+                                          MCInstrInfo const &MCII, MCInst &MCB,
+                                          MCInst const &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCB));
+  MCOperand const &exOp =
+      MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
+
+  // Create the extender.
+  MCInst *XMCI =
+      new (Context) MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, exOp));
+
+  MCB.addOperand(MCOperand::createInst(XMCI));
+}
+
+iterator_range<MCInst::const_iterator>
+HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end());
+}
+
+size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
+  if (HexagonMCInstrInfo::isBundle(MCI))
+    return (MCI.size() - bundleInstructionsOffset);
+  else
+    return (1);
+}
+
+bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
+                                            MCSubtargetInfo const &STI,
+                                            MCContext &Context, MCInst &MCB,
+                                            HexagonMCChecker *Check) {
+  // Examine the packet and convert pairs of instructions to compound
+  // instructions when possible.
+  if (!HexagonDisableCompound)
+    HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
+  // Check the bundle for errors.
+  bool CheckOk = Check ? Check->check() : true;
+  if (!CheckOk)
+    return false;
+  HexagonMCShuffle(MCII, STI, MCB);
+  // Examine the packet and convert pairs of instructions to duplex
+  // instructions when possible.
+  MCInst InstBundlePreDuplex = MCInst(MCB);
+  if (!HexagonDisableDuplex) {
+    SmallVector<DuplexCandidate, 8> possibleDuplexes;
+    possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+    HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
+  }
+  // Examines packet and pad the packet, if needed, when an
+  // end-loop is in the bundle.
+  HexagonMCInstrInfo::padEndloop(Context, MCB);
+  // If compounding and duplexing didn't reduce the size below
+  // 4 or less we have a packet that is too big.
+  if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
+    return false;
+  HexagonMCShuffle(MCII, STI, MCB);
+  return true;
+}
+
+void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII,
+                                       MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+         HexagonMCInstrInfo::isExtended(MCII, MCI));
+  MCOperand &exOp =
+      MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
+  // If the extended value is a constant, then use it for the extended and
+  // for the extender instructions, masking off the lower 6 bits and
+  // including the assumed bits.
+  int64_t Value;
+  if (exOp.getExpr()->evaluateAsAbsolute(Value)) {
+    unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI);
+    exOp.setExpr(HexagonMCExpr::create(
+        MCConstantExpr::create((Value & 0x3f) << Shift, Context), Context));
+  }
+}
+
+MCInst HexagonMCInstrInfo::createBundle() {
+  MCInst Result;
+  Result.setOpcode(Hexagon::BUNDLE);
+  Result.addOperand(MCOperand::createImm(0));
+  return Result;
+}
+
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+                                         MCInst const &inst0,
+                                         MCInst const &inst1) {
+  assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+  MCInst *duplexInst = new (Context) MCInst;
+  duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
+
+  MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
+  MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
+  duplexInst->addOperand(MCOperand::createInst(SubInst0));
+  duplexInst->addOperand(MCOperand::createInst(SubInst1));
+  return duplexInst;
+}
+
+MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
+                                          MCInst const &Inst,
+                                          MCOperand const &MO) {
+  assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
+         HexagonMCInstrInfo::isExtended(MCII, Inst));
+
+  MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
+  MCInst XMI;
+  XMI.setOpcode((Desc.isBranch() || Desc.isCall() ||
+                 HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR)
+                    ? Hexagon::A4_ext_b
+                    : Hexagon::A4_ext);
+  if (MO.isImm())
+    XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
+  else if (MO.isExpr())
+    XMI.addOperand(MCOperand::createExpr(MO.getExpr()));
+  else
+    llvm_unreachable("invalid extendable operand");
+  return XMI;
+}
+
+MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
+                                                   size_t Index) {
+  assert(Index <= bundleSize(MCB));
+  if (Index == 0)
+    return nullptr;
+  MCInst const *Inst =
+      MCB.getOperand(Index + bundleInstructionsOffset - 1).getInst();
+  if (isImmext(*Inst))
+    return Inst;
+  return nullptr;
+}
+
+void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context,
+                                        MCInstrInfo const &MCII, MCInst &MCB,
+                                        MCInst const &MCI) {
+  if (isConstExtended(MCII, MCI))
+    addConstExtender(Context, MCII, MCB, MCI);
+}
+
+HexagonII::MemAccessSize
+HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  return (HexagonII::MemAccessSize((F >> HexagonII::MemAccessSizePos) &
+                                   HexagonII::MemAccesSizeMask));
+}
+
+unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
+}
+
+// Return constant extended operand number.
+unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
+                                                MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
+                                               MCInst const &MCI) {
+  return (MCII.get(MCI.getOpcode()));
+}
+
+unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
+  using namespace Hexagon;
+  switch (Reg) {
+  default:
+    llvm_unreachable("unknown duplex register");
+  // Rs       Rss
+  case R0:
+  case D0:
+    return 0;
+  case R1:
+  case D1:
+    return 1;
+  case R2:
+  case D2:
+    return 2;
+  case R3:
+  case D3:
+    return 3;
+  case R4:
+  case D8:
+    return 4;
+  case R5:
+  case D9:
+    return 5;
+  case R6:
+  case D10:
+    return 6;
+  case R7:
+  case D11:
+    return 7;
+  case R16:
+    return 8;
+  case R17:
+    return 9;
+  case R18:
+    return 10;
+  case R19:
+    return 11;
+  case R20:
+    return 12;
+  case R21:
+    return 13;
+  case R22:
+    return 14;
+  case R23:
+    return 15;
+  }
+}
+
+MCExpr const &HexagonMCInstrInfo::getExpr(MCExpr const &Expr) {
+  const auto &HExpr = cast<HexagonMCExpr>(Expr);
+  assert(HExpr.getExpr());
+  return *HExpr.getExpr();
+}
+
+unsigned short HexagonMCInstrInfo::getExtendableOp(MCInstrInfo const &MCII,
+                                                   MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+MCOperand const &
+HexagonMCInstrInfo::getExtendableOperand(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  unsigned O = HexagonMCInstrInfo::getExtendableOp(MCII, MCI);
+  MCOperand const &MO = MCI.getOperand(O);
+
+  assert((HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+          HexagonMCInstrInfo::isExtended(MCII, MCI)) &&
+         (MO.isImm() || MO.isExpr()));
+  return (MO);
+}
+
+unsigned HexagonMCInstrInfo::getExtentAlignment(MCInstrInfo const &MCII,
+                                                MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtentAlignPos) & HexagonII::ExtentAlignMask);
+}
+
+unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
+                                           MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
+}
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned isSigned =
+      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1U << (bits - 1));
+  else
+    return ~(-1U << bits);
+}
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned isSigned =
+      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1U << (bits - 1);
+  else
+    return 0;
+}
+
+StringRef HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
+                                        MCInst const &MCI) {
+  return MCII.getName(MCI.getOpcode());
+}
+
+unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
+                                                 MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask);
+}
+
+MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
+                                                        MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  unsigned const O =
+      (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+  MCOperand const &MCO = MCI.getOperand(O);
+
+  assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+          HexagonMCInstrInfo::hasNewValue(MCII, MCI)) &&
+         MCO.isReg());
+  return (MCO);
+}
+
+/// Return the new value or the newly produced value.
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII,
+                                                  MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2);
+}
+
+MCOperand const &
+HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
+                                        MCInst const &MCI) {
+  unsigned O = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI);
+  MCOperand const &MCO = MCI.getOperand(O);
+
+  assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
+          HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) &&
+         MCO.isReg());
+  return (MCO);
+}
+
+int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  HexagonII::SubTarget Target = static_cast<HexagonII::SubTarget>(
+      (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask);
+
+  switch (Target) {
+  default:
+    return Hexagon::ArchV4;
+  case HexagonII::HasV5SubT:
+    return Hexagon::ArchV5;
+  }
+}
+
+// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
+unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
+                                      MCSubtargetInfo const &STI,
+                                      MCInst const &MCI) {
+
+  const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+  int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  return ((II[SchedClass].FirstStage + HexagonStages)->getUnits());
+}
+
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+  if (!HexagonMCInstrInfo::isBundle(MCI))
+    return false;
+
+  for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+    auto MI = I.getInst();
+    if (isImmext(*MI))
+      return true;
+  }
+
+  return false;
+}
+
+bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
+  return extenderForIndex(MCB, Index) != nullptr;
+}
+
+// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
+}
+
+/// Return whether the insn produces a second value.
+bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::hasNewValuePos2) & HexagonII::hasNewValueMask2);
+}
+
+MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
+  assert(isBundle(MCB));
+  assert(Index < HEXAGON_PACKET_SIZE);
+  return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
+}
+
+bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
+  auto Result = Hexagon::BUNDLE == MCI.getOpcode();
+  assert(!Result || (MCI.size() > 0 && MCI.getOperand(0).isImm()));
+  return Result;
+}
+
+// Return whether the insn is an actual insn.
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
+          !HexagonMCInstrInfo::isPrefix(MCII, MCI) &&
+          HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
+}
+
+bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
+}
+
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND);
+}
+
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+  return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
+          (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
+}
+
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
+// Return whether the instruction needs to be constant extended.
+// 1) Always return true if the instruction has 'isExtended' flag set.
+//
+// isExtendable:
+// 2) For immediate extended operands, return true only if the value is
+//    out-of-range.
+// 3) For global address, always return true.
+
+bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  if (HexagonMCInstrInfo::isExtended(MCII, MCI))
+    return true;
+  if (!HexagonMCInstrInfo::isExtendable(MCII, MCI))
+    return false;
+  MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI);
+  if (isa<HexagonMCExpr>(MO.getExpr()) &&
+      HexagonMCInstrInfo::mustExtend(*MO.getExpr()))
+    return true;
+  // Branch insns are handled as necessary by relaxation.
+  if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
+      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
+       HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) ||
+      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV &&
+       HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()))
+    return false;
+  // Otherwise loop instructions and other CR insts are handled by relaxation
+  else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) &&
+           (MCI.getOpcode() != Hexagon::C4_addipc))
+    return false;
+
+  assert(!MO.isImm());
+  if (isa<HexagonMCExpr>(MO.getExpr()) &&
+      HexagonMCInstrInfo::mustNotExtend(*MO.getExpr()))
+    return false;
+  int64_t Value;
+  if (!MO.getExpr()->evaluateAsAbsolute(Value))
+    return true;
+  int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI);
+  int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI);
+  return (MinValue > Value || Value > MaxValue);
+}
+
+bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+}
+
+bool HexagonMCInstrInfo::isExtended(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+}
+
+bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::FPPos) & HexagonII::FPMask);
+}
+
+bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
+  auto Op = MCI.getOpcode();
+  return (Op == Hexagon::A4_ext_b || Op == Hexagon::A4_ext_c ||
+          Op == Hexagon::A4_ext_g || Op == Hexagon::A4_ext);
+}
+
+bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t Flags = MCI.getOperand(0).getImm();
+  return (Flags & innerLoopMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isIntReg(unsigned Reg) {
+  return (Reg >= Hexagon::R0 && Reg <= Hexagon::R31);
+}
+
+bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
+  return ((Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
+          (Reg >= Hexagon::R16 && Reg <= Hexagon::R23));
+}
+
+// Return whether the insn is a new-value consumer.
+bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
+// Return whether the operand can be constant extended.
+bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
+                                           MCInst const &MCI,
+                                           unsigned short OperandNum) {
+  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) ==
+         OperandNum;
+}
+
+bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t Flags = MCI.getOperand(0).getImm();
+  return (Flags & outerLoopMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::PredicateLatePos & HexagonII::PredicateLateMask);
+}
+
+/// Return whether the insn is newly predicated.
+bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
+bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII,
+                                          MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (
+      !((F >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask));
+}
+
+bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
+  return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
+}
+
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
+}
+
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
+}
+
+bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  auto Flags = MCI.getOperand(0).getImm();
+  return (Flags & memReorderDisabledMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  auto Flags = MCI.getOperand(0).getImm();
+  return (Flags & memStoreReorderEnabledMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
+  switch (MCI.getOpcode()) {
+  default:
+    return false;
+  case Hexagon::SA1_addi:
+  case Hexagon::SA1_addrx:
+  case Hexagon::SA1_addsp:
+  case Hexagon::SA1_and1:
+  case Hexagon::SA1_clrf:
+  case Hexagon::SA1_clrfnew:
+  case Hexagon::SA1_clrt:
+  case Hexagon::SA1_clrtnew:
+  case Hexagon::SA1_cmpeqi:
+  case Hexagon::SA1_combine0i:
+  case Hexagon::SA1_combine1i:
+  case Hexagon::SA1_combine2i:
+  case Hexagon::SA1_combine3i:
+  case Hexagon::SA1_combinerz:
+  case Hexagon::SA1_combinezr:
+  case Hexagon::SA1_dec:
+  case Hexagon::SA1_inc:
+  case Hexagon::SA1_seti:
+  case Hexagon::SA1_setin1:
+  case Hexagon::SA1_sxtb:
+  case Hexagon::SA1_sxth:
+  case Hexagon::SA1_tfr:
+  case Hexagon::SA1_zxtb:
+  case Hexagon::SA1_zxth:
+  case Hexagon::SL1_loadri_io:
+  case Hexagon::SL1_loadrub_io:
+  case Hexagon::SL2_deallocframe:
+  case Hexagon::SL2_jumpr31:
+  case Hexagon::SL2_jumpr31_f:
+  case Hexagon::SL2_jumpr31_fnew:
+  case Hexagon::SL2_jumpr31_t:
+  case Hexagon::SL2_jumpr31_tnew:
+  case Hexagon::SL2_loadrb_io:
+  case Hexagon::SL2_loadrd_sp:
+  case Hexagon::SL2_loadrh_io:
+  case Hexagon::SL2_loadri_sp:
+  case Hexagon::SL2_loadruh_io:
+  case Hexagon::SL2_return:
+  case Hexagon::SL2_return_f:
+  case Hexagon::SL2_return_fnew:
+  case Hexagon::SL2_return_t:
+  case Hexagon::SL2_return_tnew:
+  case Hexagon::SS1_storeb_io:
+  case Hexagon::SS1_storew_io:
+  case Hexagon::SS2_allocframe:
+  case Hexagon::SS2_storebi0:
+  case Hexagon::SS2_storebi1:
+  case Hexagon::SS2_stored_sp:
+  case Hexagon::SS2_storeh_io:
+  case Hexagon::SS2_storew_sp:
+  case Hexagon::SS2_storewi0:
+  case Hexagon::SS2_storewi1:
+    return true;
+  }
+}
+
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
+}
+
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
+}
+
+bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
+  if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
+      (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
+    return true;
+  return false;
+}
+
+int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
+  auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max())
+                  << 8;
+  if (MCI.size() <= Index)
+    return Sentinal;
+  MCOperand const &MCO = MCI.getOperand(Index);
+  if (!MCO.isExpr())
+    return Sentinal;
+  int64_t Value;
+  if (!MCO.getExpr()->evaluateAsAbsolute(Value))
+    return Sentinal;
+  return Value;
+}
+
+void HexagonMCInstrInfo::setMustExtend(MCExpr const &Expr, bool Val) {
+  HexagonMCExpr &HExpr = const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+  HExpr.setMustExtend(Val);
+}
+
+bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
+  HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
+  return HExpr.mustExtend();
+}
+void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
+  HexagonMCExpr &HExpr =
+      const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+  HExpr.setMustNotExtend(Val);
+}
+bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
+  HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
+  return HExpr.mustNotExtend();
+}
+
+void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
+  MCInst Nop;
+  Nop.setOpcode(Hexagon::A2_nop);
+  assert(isBundle(MCB));
+  while ((HexagonMCInstrInfo::isInnerLoop(MCB) &&
+          (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) ||
+         ((HexagonMCInstrInfo::isOuterLoop(MCB) &&
+           (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE))))
+    MCB.addOperand(MCOperand::createInst(new (Context) MCInst(Nop)));
+}
+
+bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
+    return false;
+
+  unsigned SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2_SLOT23:
+  case Hexagon::Sched::ALU64_tc_3x_SLOT23:
+  case Hexagon::Sched::M_tc_2_SLOT23:
+  case Hexagon::Sched::M_tc_3x_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_3x_SLOT23:
+    return true;
+  }
+  return false;
+}
+
+void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
+                                       DuplexCandidate Candidate) {
+  assert(Candidate.packetIndexI < MCB.size());
+  assert(Candidate.packetIndexJ < MCB.size());
+  assert(isBundle(MCB));
+  MCInst *Duplex =
+      deriveDuplex(Context, Candidate.iClass,
+                   *MCB.getOperand(Candidate.packetIndexJ).getInst(),
+                   *MCB.getOperand(Candidate.packetIndexI).getInst());
+  assert(Duplex != nullptr);
+  MCB.getOperand(Candidate.packetIndexI).setInst(Duplex);
+  MCB.erase(MCB.begin() + Candidate.packetIndexJ);
+}
+
+void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | innerLoopMask);
+}
+
+void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | memReorderDisabledMask);
+  assert(isMemReorderDisabled(MCI));
+}
+
+void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
+  assert(isMemStoreReorderEnabled(MCI));
+}
+void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
+  HexagonMCExpr &HExpr =
+      const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
+  HExpr.setS23_2_reloc(Val);
+}
+bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
+  HexagonMCExpr const &HExpr = *llvm::cast<HexagonMCExpr>(&Expr);
+  return HExpr.s23_2_reloc();
+}
+
+void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | outerLoopMask);
+}
+
+unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
+                                            unsigned Producer,
+                                            unsigned Producer2) {
+  // If we're a single vector consumer of a double producer, set subreg bit
+  // based on if we're accessing the lower or upper register component
+  if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
+    if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
+      return (Consumer - Hexagon::V0) & 0x1;
+  if (Consumer == Producer2)
+    return 0x1;
+  return 0;
+}
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
new file mode 100644
index 000000000000..d701c3ade69e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -0,0 +1,302 @@
+//===- HexagonMCInstrInfo.cpp - Utility functions on Hexagon MCInsts ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions for Hexagon specific MCInst queries
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
+
+#include "HexagonMCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+namespace llvm {
+class HexagonMCChecker;
+class MCContext;
+class MCInstrDesc;
+class MCInstrInfo;
+class MCInst;
+class MCOperand;
+class MCSubtargetInfo;
+namespace HexagonII {
+enum class MemAccessSize;
+}
+class DuplexCandidate {
+public:
+  unsigned packetIndexI, packetIndexJ, iClass;
+  DuplexCandidate(unsigned i, unsigned j, unsigned iClass)
+      : packetIndexI(i), packetIndexJ(j), iClass(iClass) {}
+};
+namespace HexagonMCInstrInfo {
+size_t const innerLoopOffset = 0;
+int64_t const innerLoopMask = 1 << innerLoopOffset;
+
+size_t const outerLoopOffset = 1;
+int64_t const outerLoopMask = 1 << outerLoopOffset;
+
+// do not reorder memory load/stores by default load/stores are re-ordered
+// and by default loads can be re-ordered
+size_t const memReorderDisabledOffset = 2;
+int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset;
+
+// allow re-ordering of memory stores by default stores cannot be re-ordered
+size_t const memStoreReorderEnabledOffset = 3;
+int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset;
+
+size_t const bundleInstructionsOffset = 1;
+
+void addConstant(MCInst &MI, uint64_t Value, MCContext &Context);
+void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+                      MCInst const &MCI);
+
+// Returns a iterator range of instructions in this bundle
+iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI);
+
+// Returns the number of instructions in the bundle
+size_t bundleSize(MCInst const &MCI);
+
+// Put the packet in to canonical form, compound, duplex, pad, and shuffle
+bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                        MCContext &Context, MCInst &MCB,
+                        HexagonMCChecker *Checker);
+
+// Clamp off upper 26 bits of extendable operand for emission
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+                    MCInst const &MCI);
+
+// Create a duplex instruction given the two subinsts
+MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
+                     MCInst const &inst1);
+MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
+                      MCOperand const &MO);
+
+// Convert this instruction in to a duplex subinst
+MCInst deriveSubInst(MCInst const &Inst);
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+
+// Return memory access size
+HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
+                                       MCInst const &MCI);
+
+// Return number of bits in the constant extended operand.
+unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return constant extended operand number.
+unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
+
+MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return which duplex group this instruction belongs to
+unsigned getDuplexCandidateGroup(MCInst const &MI);
+
+// Return a list of all possible instruction duplex combinations
+SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
+                                                      MCInst const &MCB);
+unsigned getDuplexRegisterNumbering(unsigned Reg);
+
+MCExpr const &getExpr(MCExpr const &Expr);
+
+// Return the index of the extendable operand
+unsigned short getExtendableOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return a reference to the extendable operand
+MCOperand const &getExtendableOperand(MCInstrInfo const &MCII,
+                                      MCInst const &MCI);
+
+// Return the implicit alignment of the extendable operand
+unsigned getExtentAlignment(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the number of logical bits of the extendable operand
+unsigned getExtentBits(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int getMinValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return instruction name
+StringRef getName(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the operand index for the new value.
+unsigned short getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the operand that consumes or produces a new value.
+MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
+unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
+MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
+                                     MCInst const &MCI);
+
+int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the Hexagon ISA class for the insn.
+unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return the slots used by the insn.
+unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                  MCInst const &MCI);
+
+// Does the packet have an extender for the instruction at Index
+bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
+
+bool hasImmExt(MCInst const &MCI);
+
+// Return whether the instruction is a legal new-value producer.
+bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return the duplex iclass given the two duplex classes
+unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
+
+int64_t minConstant(MCInst const &MCI, size_t Index);
+template <unsigned N, unsigned S>
+bool inRange(MCInst const &MCI, size_t Index) {
+  return isShiftedUInt<N, S>(minConstant(MCI, Index));
+}
+template <unsigned N, unsigned S>
+bool inSRange(MCInst const &MCI, size_t Index) {
+  return isShiftedInt<N, S>(minConstant(MCI, Index));
+}
+template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
+  return isUInt<N>(minConstant(MCI, Index));
+}
+
+// Return whether the instruction needs to be constant extended.
+bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Is this double register suitable for use in a duplex subinst
+bool isDblRegForSubInst(unsigned Reg);
+
+// Is this a duplex instruction
+bool isDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Can these instructions be duplexed
+bool isDuplexPair(MCInst const &MIa, MCInst const &MIb);
+
+// Can these duplex classes be combine in to a duplex instruction
+bool isDuplexPairMatch(unsigned Ga, unsigned Gb);
+
+// Return true if the insn may be extended based on the operand value.
+bool isExtendable(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the instruction must be always extended.
+bool isExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether it is a floating-point insn.
+bool isFloat(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this instruction is an immediate extender
+bool isImmext(MCInst const &MCI);
+
+// Returns whether this bundle is an endloop0
+bool isInnerLoop(MCInst const &MCI);
+
+// Is this an integer register
+bool isIntReg(unsigned Reg);
+
+// Is this register suitable for use in a duplex subinst
+bool isIntRegForSubInst(unsigned Reg);
+bool isMemReorderDisabled(MCInst const &MCI);
+bool isMemStoreReorderEnabled(MCInst const &MCI);
+
+// Return whether the insn is a new-value consumer.
+bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return true if the operand can be constant extended.
+bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
+                       unsigned short OperandNum);
+
+// Can these two instructions be duplexed
+bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
+                         bool ExtendedA, MCInst const &MIb, bool ExtendedB,
+                         bool bisReversable);
+
+// Returns whether this bundle is an endloop1
+bool isOuterLoop(MCInst const &MCI);
+
+// Return whether this instruction is predicated
+bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the predicate sense is true
+bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Is this a predicate register
+bool isPredReg(unsigned Reg);
+
+// Return whether the insn is a prefix.
+bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool isSolo(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI);
+
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isSubInstruction(MCInst const &MCI);
+bool isVector(MCInstrInfo const &MCII, MCInst const &MCI);
+bool mustExtend(MCExpr const &Expr);
+bool mustNotExtend(MCExpr const &Expr);
+
+// Pad the bundle with nops to satisfy endloop requirements
+void padEndloop(MCContext &Context, MCInst &MCI);
+
+bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Replace the instructions inside MCB, represented by Candidate
+void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+
+bool s23_2_reloc(MCExpr const &Expr);
+// Marks a bundle as endloop0
+void setInnerLoop(MCInst &MCI);
+void setMemReorderDisabled(MCInst &MCI);
+void setMemStoreReorderEnabled(MCInst &MCI);
+void setMustExtend(MCExpr const &Expr, bool Val = true);
+void setMustNotExtend(MCExpr const &Expr, bool Val = true);
+void setS23_2_reloc(MCExpr const &Expr, bool Val = true);
+
+// Marks a bundle as endloop1
+void setOuterLoop(MCInst &MCI);
+
+// Would duplexing this instruction create a requirement to extend
+bool subInstWouldBeExtended(MCInst const &potentialDuplex);
+unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
+                        unsigned Producer2);
+
+// Attempt to find and replace compound pairs
+void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+}
+}
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
new file mode 100644
index 000000000000..7f8e7a4edb0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -0,0 +1,236 @@
+//===----- HexagonMCShuffler.cpp - MC bundle shuffling --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableShuffle("disable-hexagon-shuffle", cl::Hidden, cl::init(false),
+                   cl::desc("Disable Hexagon instruction shuffling"));
+
+void HexagonMCShuffler::init(MCInst &MCB) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    MCInst const *Extender = nullptr;
+    // Copy the bundle for the shuffling.
+    for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
+      MCInst *MI = const_cast<MCInst *>(I.getInst());
+
+      if (!HexagonMCInstrInfo::isImmext(*MI)) {
+        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
+               false);
+        Extender = nullptr;
+      } else
+        Extender = MI;
+    }
+  }
+
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+                             bool bInsertAtFront) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    if (bInsertAtFront && AddMI)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+    MCInst const *Extender = nullptr;
+    // Copy the bundle for the shuffling.
+    for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
+      MCInst *MI = const_cast<MCInst *>(I.getInst());
+      if (!HexagonMCInstrInfo::isImmext(*MI)) {
+        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
+               false);
+        Extender = nullptr;
+      } else
+        Extender = MI;
+    }
+    if (!bInsertAtFront && AddMI)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+  }
+
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+void HexagonMCShuffler::copyTo(MCInst &MCB) {
+  MCB.clear();
+  MCB.addOperand(MCOperand::createImm(BundleFlags));
+  // Copy the results into the bundle.
+  for (HexagonShuffler::iterator I = begin(); I != end(); ++I) {
+
+    MCInst const *MI = I->getDesc();
+    MCInst const *Extender = I->getExtender();
+    if (Extender)
+      MCB.addOperand(MCOperand::createInst(Extender));
+    MCB.addOperand(MCOperand::createInst(MI));
+  }
+}
+
+bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
+  if (shuffle()) {
+    // Copy the results into the bundle.
+    copyTo(MCB);
+  } else
+    DEBUG(MCB.dump());
+
+  return (!getError());
+}
+
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB) {
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+
+  if (DisableShuffle)
+    // Ignore if user chose so.
+    return false;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return false;
+  } else if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return false;
+  }
+
+  // Reorder the bundle and copy the result.
+  if (!MCS.reshuffleTo(MCB)) {
+    // Unless there is any error, which should not happen at this point.
+    unsigned shuffleError = MCS.getError();
+    switch (shuffleError) {
+    default:
+      llvm_unreachable("unknown error");
+    case HexagonShuffler::SHUFFLE_ERROR_INVALID:
+      llvm_unreachable("invalid packet");
+    case HexagonShuffler::SHUFFLE_ERROR_STORES:
+      llvm_unreachable("too many stores");
+    case HexagonShuffler::SHUFFLE_ERROR_LOADS:
+      llvm_unreachable("too many loads");
+    case HexagonShuffler::SHUFFLE_ERROR_BRANCHES:
+      llvm_unreachable("too many branches");
+    case HexagonShuffler::SHUFFLE_ERROR_NOSLOTS:
+      llvm_unreachable("no suitable slot");
+    case HexagonShuffler::SHUFFLE_ERROR_SLOTS:
+      llvm_unreachable("over-subscribed slots");
+    case HexagonShuffler::SHUFFLE_SUCCESS: // Single instruction case.
+      return true;
+    }
+  }
+
+  return true;
+}
+
+unsigned
+llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                       MCContext &Context, MCInst &MCB,
+                       SmallVector<DuplexCandidate, 8> possibleDuplexes) {
+
+  if (DisableShuffle)
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  } else if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+
+  bool doneShuffling = false;
+  unsigned shuffleError;
+  while (possibleDuplexes.size() > 0 && (!doneShuffling)) {
+    // case of Duplex Found
+    DuplexCandidate duplexToTry = possibleDuplexes.pop_back_val();
+    MCInst Attempt(MCB);
+    HexagonMCInstrInfo::replaceDuplex(Context, Attempt, duplexToTry);
+    HexagonMCShuffler MCS(MCII, STI, Attempt); // copy packet to the shuffler
+    if (MCS.size() == 1) {                     // case of one duplex
+      // copy the created duplex in the shuffler to the bundle
+      MCS.copyTo(MCB);
+      return HexagonShuffler::SHUFFLE_SUCCESS;
+    }
+    // try shuffle with this duplex
+    doneShuffling = MCS.reshuffleTo(MCB);
+    shuffleError = MCS.getError();
+
+    if (doneShuffling)
+      break;
+  }
+
+  if (doneShuffling == false) {
+    HexagonMCShuffler MCS(MCII, STI, MCB);
+    doneShuffling = MCS.reshuffleTo(MCB); // shuffle
+    shuffleError = MCS.getError();
+  }
+  if (!doneShuffling)
+    return shuffleError;
+
+  return HexagonShuffler::SHUFFLE_SUCCESS;
+}
+
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB, MCInst const *AddMI, int fixupCount) {
+  if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+    return false;
+
+  // if fixups present, make sure we don't insert too many nops that would
+  // later prevent an extender from being inserted.
+  unsigned int bundleSize = HexagonMCInstrInfo::bundleSize(MCB);
+  if (bundleSize >= HEXAGON_PACKET_SIZE)
+    return false;
+  if (fixupCount >= 2) {
+    return false;
+  } else {
+    if (bundleSize == HEXAGON_PACKET_SIZE - 1 && fixupCount)
+      return false;
+  }
+
+  if (DisableShuffle)
+    return false;
+
+  HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
+  if (!MCS.reshuffleTo(MCB)) {
+    unsigned shuffleError = MCS.getError();
+    switch (shuffleError) {
+    default:
+      return false;
+    case HexagonShuffler::SHUFFLE_SUCCESS: // single instruction case
+      return true;
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
new file mode 100644
index 000000000000..a21cce1fc240
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -0,0 +1,65 @@
+//=-- HexagonMCShuffler.h ---------------------------------------------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCSHUFFLER_H
+#define HEXAGONMCSHUFFLER_H
+
+#include "MCTargetDesc/HexagonShuffler.h"
+
+namespace llvm {
+
+class MCInst;
+
+// Insn bundle shuffler.
+class HexagonMCShuffler : public HexagonShuffler {
+  bool immext_present;
+  bool duplex_present;
+
+public:
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB);
+  };
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB, const MCInst *AddMI,
+                    bool bInsertAtFront = false)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB, AddMI, bInsertAtFront);
+  };
+
+  // Copy reordered bundle to another.
+  void copyTo(MCInst &MCB);
+  // Reorder and copy result to another.
+  bool reshuffleTo(MCInst &MCB);
+
+  bool immextPresent() const { return immext_present; };
+  bool duplexPresent() const { return duplex_present; };
+
+private:
+  void init(MCInst &MCB);
+  void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+};
+
+// Invocation of the shuffler.
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                      MCInst &);
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                      MCInst &, const MCInst *, int);
+unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                          MCContext &Context, MCInst &,
+                          SmallVector<DuplexCandidate, 8>);
+}
+
+#endif // HEXAGONMCSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
new file mode 100644
index 000000000000..694cf582f8d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -0,0 +1,278 @@
+//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
+#include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <string>
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "HexagonGenRegisterInfo.inc"
+
+cl::opt<bool> llvm::HexagonDisableCompound
+  ("mno-compound",
+   cl::desc("Disable looking for compound instructions for Hexagon"));
+
+cl::opt<bool> llvm::HexagonDisableDuplex
+  ("mno-pairing",
+   cl::desc("Disable looking for duplex instructions for Hexagon"));
+
+static cl::opt<bool> HexagonV4ArchVariant("mv4", cl::Hidden, cl::init(false),
+  cl::desc("Build for Hexagon V4"));
+
+static cl::opt<bool> HexagonV5ArchVariant("mv5", cl::Hidden, cl::init(false),
+  cl::desc("Build for Hexagon V5"));
+
+static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
+  cl::desc("Build for Hexagon V55"));
+
+static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
+  cl::desc("Build for Hexagon V60"));
+
+static StringRef DefaultArch = "hexagonv60";
+
+static StringRef HexagonGetArchVariant() {
+  if (HexagonV4ArchVariant)
+    return "hexagonv4";
+  if (HexagonV5ArchVariant)
+    return "hexagonv5";
+  if (HexagonV55ArchVariant)
+    return "hexagonv55";
+  if (HexagonV60ArchVariant)
+    return "hexagonv60";
+  return "";
+}
+
+StringRef Hexagon_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
+  StringRef ArchV = HexagonGetArchVariant();
+  if (!ArchV.empty() && !CPU.empty()) {
+    if (ArchV != CPU)
+      report_fatal_error("conflicting architectures specified.");
+    return CPU;
+  }
+  if (ArchV.empty()) {
+    if (CPU.empty())
+      CPU = DefaultArch;
+    return CPU;
+  }
+  return ArchV;
+}
+
+MCInstrInfo *llvm::createHexagonMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitHexagonMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitHexagonMCRegisterInfo(X, Hexagon::R31);
+  return X;
+}
+
+static MCSubtargetInfo *
+createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  CPU = Hexagon_MC::selectHexagonCPU(TT, CPU);
+  return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+namespace {
+
+class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
+public:
+  HexagonTargetAsmStreamer(MCStreamer &S,
+                           formatted_raw_ostream &, bool,
+                           MCInstPrinter &)
+      : HexagonTargetStreamer(S) {}
+
+  void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
+                      const MCInst &Inst, const MCSubtargetInfo &STI) override {
+    assert(HexagonMCInstrInfo::isBundle(Inst));
+    assert(HexagonMCInstrInfo::bundleSize(Inst) <= HEXAGON_PACKET_SIZE);
+    std::string Buffer;
+    {
+      raw_string_ostream TempStream(Buffer);
+      InstPrinter.printInst(&Inst, TempStream, "", STI);
+    }
+    StringRef Contents(Buffer);
+    auto PacketBundle = Contents.rsplit('\n');
+    auto HeadTail = PacketBundle.first.split('\n');
+    StringRef Separator = "\n";
+    StringRef Indent = "\t\t";
+    OS << "\t{\n";
+    while (!HeadTail.first.empty()) {
+      StringRef InstTxt;
+      auto Duplex = HeadTail.first.split('\v');
+      if (!Duplex.second.empty()) {
+        OS << Indent << Duplex.first << Separator;
+        InstTxt = Duplex.second;
+      } else if (!HeadTail.first.trim().startswith("immext")) {
+        InstTxt = Duplex.first;
+      }
+      if (!InstTxt.empty())
+        OS << Indent << InstTxt << Separator;
+      HeadTail = HeadTail.second.split('\n');
+    }
+    OS << "\t}" << PacketBundle.second;
+  }
+};
+
+class HexagonTargetELFStreamer : public HexagonTargetStreamer {
+public:
+  HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
+      : HexagonTargetStreamer(S) {
+    auto Bits = STI.getFeatureBits();
+    unsigned Flags = 0;
+    if (Bits[Hexagon::ArchV60])
+      Flags = ELF::EF_HEXAGON_MACH_V60;
+    else if (Bits[Hexagon::ArchV55])
+      Flags = ELF::EF_HEXAGON_MACH_V55;
+    else if (Bits[Hexagon::ArchV5])
+      Flags = ELF::EF_HEXAGON_MACH_V5;
+    else if (Bits[Hexagon::ArchV4])
+      Flags = ELF::EF_HEXAGON_MACH_V4;
+    getStreamer().getAssembler().setELFHeaderEFlags(Flags);
+  }
+
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
+
+  void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                              unsigned ByteAlignment,
+                              unsigned AccessSize) override {
+    HexagonMCELFStreamer &HexagonELFStreamer =
+        static_cast<HexagonMCELFStreamer &>(getStreamer());
+    HexagonELFStreamer.HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment,
+                                                 AccessSize);
+  }
+
+  void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+                                   unsigned ByteAlignment,
+                                   unsigned AccessSize) override {
+    HexagonMCELFStreamer &HexagonELFStreamer =
+        static_cast<HexagonMCELFStreamer &>(getStreamer());
+    HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(
+        Symbol, Size, ByteAlignment, AccessSize);
+  }
+};
+
+} // end anonymous namespace
+
+static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
+
+  // VirtualFP = (R30 + #0).
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return (new HexagonInstPrinter(MAI, MII, MRI));
+  else
+    return nullptr;
+}
+
+static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS,
+                                                   MCInstPrinter *InstPrint,
+                                                   bool IsVerboseAsm) {
+  return new HexagonTargetAsmStreamer(S,  OS, IsVerboseAsm, *InstPrint);
+}
+
+static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  return createHexagonELFStreamer(Context, MAB, OS, Emitter);
+}
+
+static MCTargetStreamer *
+createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) {
+  return new HexagonTargetELFStreamer(S, STI);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeHexagonTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(getTheHexagonTarget(), createHexagonMCAsmInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(getTheHexagonTarget(),
+                                      createHexagonMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(getTheHexagonTarget(),
+                                    createHexagonMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheHexagonTarget(),
+                                          createHexagonMCSubtargetInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(getTheHexagonTarget(),
+                                        createHexagonMCCodeEmitter);
+
+  // Register the asm backend
+  TargetRegistry::RegisterMCAsmBackend(getTheHexagonTarget(),
+                                       createHexagonAsmBackend);
+
+  // Register the obj streamer
+  TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(), createMCStreamer);
+
+  // Register the asm streamer
+  TargetRegistry::RegisterAsmTargetStreamer(getTheHexagonTarget(),
+                                            createMCAsmTargetStreamer);
+
+  // Register the MC Inst Printer
+  TargetRegistry::RegisterMCInstPrinter(getTheHexagonTarget(),
+                                        createHexagonMCInstPrinter);
+
+  TargetRegistry::RegisterObjectTargetStreamer(
+      getTheHexagonTarget(), createHexagonObjectTargetStreamer);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
new file mode 100644
index 000000000000..6e677e9d9f86
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -0,0 +1,79 @@
+//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+
+#include "llvm/Support/CommandLine.h"
+#include <cstdint>
+
+namespace llvm {
+
+struct InstrItinerary;
+struct InstrStage;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheHexagonTarget();
+extern cl::opt<bool> HexagonDisableCompound;
+extern cl::opt<bool> HexagonDisableDuplex;
+extern const InstrStage HexagonStages[];
+
+MCInstrInfo *createHexagonMCInstrInfo();
+
+MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
+                                          const MCRegisterInfo &MRI,
+                                          MCContext &MCT);
+
+MCAsmBackend *createHexagonAsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options);
+
+MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
+                                             uint8_t OSABI, StringRef CPU);
+
+namespace Hexagon_MC {
+
+  StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+
+} // end namespace Hexagon_MC
+
+} // end namespace llvm
+
+// Define symbolic names for Hexagon registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "HexagonGenRegisterInfo.inc"
+
+// Defines symbolic names for the Hexagon instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "HexagonGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
new file mode 100644
index 000000000000..88f37d620dcf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -0,0 +1,463 @@
+//===----- HexagonShuffler.cpp - Instruction bundle shuffling -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include <algorithm>
+#include <utility>
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "HexagonShuffler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+// Insn shuffling priority.
+class HexagonBid {
+  // The priority is directly proportional to how restricted the insn is based
+  // on its flexibility to run on the available slots.  So, the fewer slots it
+  // may run on, the higher its priority.
+  enum { MAX = 360360 }; // LCD of 1/2, 1/3, 1/4,... 1/15.
+  unsigned Bid;
+
+public:
+  HexagonBid() : Bid(0){};
+  HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; };
+
+  // Check if the insn priority is overflowed.
+  bool isSold() const { return (Bid >= MAX); };
+
+  HexagonBid &operator+=(const HexagonBid &B) {
+    Bid += B.Bid;
+    return *this;
+  };
+};
+
+// Slot shuffling allocation.
+class HexagonUnitAuction {
+  HexagonBid Scores[HEXAGON_PACKET_SIZE];
+  // Mask indicating which slot is unavailable.
+  unsigned isSold : HEXAGON_PACKET_SIZE;
+
+public:
+  HexagonUnitAuction() : isSold(0){};
+
+  // Allocate slots.
+  bool bid(unsigned B) {
+    // Exclude already auctioned slots from the bid.
+    unsigned b = B & ~isSold;
+    if (b) {
+      for (unsigned i = 0; i < HEXAGON_PACKET_SIZE; ++i)
+        if (b & (1 << i)) {
+          // Request candidate slots.
+          Scores[i] += HexagonBid(b);
+          isSold |= Scores[i].isSold() << i;
+        }
+      return true;
+      ;
+    } else
+      // Error if the desired slots are already full.
+      return false;
+  };
+};
+} // end anonymous namespace
+
+unsigned HexagonResource::setWeight(unsigned s) {
+  const unsigned SlotWeight = 8;
+  const unsigned MaskWeight = SlotWeight - 1;
+  bool Key = (1 << s) & getUnits();
+
+  // TODO: Improve this API so that we can prevent misuse statically.
+  assert(SlotWeight * s < 32 && "Argument to setWeight too large.");
+
+  // Calculate relative weight of the insn for the given slot, weighing it the
+  // heavier the more restrictive the insn is and the lowest the slots that the
+  // insn may be executed in.
+  Weight =
+      (Key << (SlotWeight * s)) * ((MaskWeight - countPopulation(getUnits()))
+                                   << countTrailingZeros(getUnits()));
+  return (Weight);
+}
+
+void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
+  (*TUL)[HexagonII::TypeCVI_VA] =
+      UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
+  (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2);
+  (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
+  (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
+  (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
+  (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+  (*TUL)[HexagonII::TypeCVI_VM_LD] =
+      UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
+  (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] =
+      UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1);
+  (*TUL)[HexagonII::TypeCVI_VM_ST] =
+      UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
+  (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0);
+  (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1);
+  (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4);
+}
+
+HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
+                                       MCInstrInfo const &MCII, unsigned s,
+                                       MCInst const *id)
+    : HexagonResource(s), TUL(TUL) {
+  unsigned T = HexagonMCInstrInfo::getType(MCII, *id);
+
+  if (TUL->count(T)) {
+    // For an HVX insn.
+    Valid = true;
+    setUnits((*TUL)[T].first);
+    setLanes((*TUL)[T].second);
+    setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad());
+    setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore());
+  } else {
+    // For core insns.
+    Valid = false;
+    setUnits(0);
+    setLanes(0);
+    setLoad(false);
+    setStore(false);
+  }
+}
+
+HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
+                                 MCSubtargetInfo const &STI)
+    : MCII(MCII), STI(STI) {
+  reset();
+  HexagonCVIResource::SetupTUL(&TUL, STI.getCPU());
+}
+
+void HexagonShuffler::reset() {
+  Packet.clear();
+  BundleFlags = 0;
+  Error = SHUFFLE_SUCCESS;
+}
+
+void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
+                             unsigned S, bool X) {
+  HexagonInstr PI(&TUL, MCII, ID, Extender, S, X);
+
+  Packet.push_back(PI);
+}
+
+/// Check that the packet is legal and enforce relative insn order.
+bool HexagonShuffler::check() {
+  // Descriptive slot masks.
+  const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
+                 slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+                 slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
+  // Highest slots for branches and stores used to keep their original order.
+  unsigned slotJump = slotFirstJump;
+  unsigned slotLoadStore = slotFirstLoadStore;
+  // Number of branches, solo branches, indirect branches.
+  unsigned jumps = 0, jump1 = 0;
+  // Number of memory operations, loads, solo loads, stores, solo stores, single
+  // stores.
+  unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
+  // Number of HVX loads, HVX stores.
+  unsigned CVIloads = 0, CVIstores = 0;
+  // Number of duplex insns, solo insns.
+  unsigned duplex = 0, solo = 0;
+  // Number of insns restricting other insns in the packet to A and X types,
+  // which is neither A or X types.
+  unsigned onlyAX = 0, neitherAnorX = 0;
+  // Number of insns restricting other insns in slot #1 to A type.
+  unsigned onlyAin1 = 0;
+  // Number of insns restricting any insn in slot #1, except A2_nop.
+  unsigned onlyNo1 = 0;
+  unsigned xtypeFloat = 0;
+  unsigned pSlot3Cnt = 0;
+  iterator slot3ISJ = end();
+
+  // Collect information from the insns in the packet.
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+    MCInst const *ID = ISJ->getDesc();
+
+    if (HexagonMCInstrInfo::isSolo(MCII, *ID))
+      solo += !ISJ->isSoloException();
+    else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
+      onlyAX += !ISJ->isSoloException();
+    else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
+      onlyAin1 += !ISJ->isSoloException();
+    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
+        HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
+      ++neitherAnorX;
+    if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+      ++pSlot3Cnt;
+      slot3ISJ = ISJ;
+    }
+    if (HexagonMCInstrInfo::isCofMax1(MCII, *ID))
+      ++jump1;
+
+    switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
+    case HexagonII::TypeXTYPE:
+      if (HexagonMCInstrInfo::isFloat(MCII, *ID))
+        ++xtypeFloat;
+      break;
+    case HexagonII::TypeJR:
+    case HexagonII::TypeJ:
+      ++jumps;
+      break;
+    case HexagonII::TypeCVI_VM_VP_LDU:
+      ++onlyNo1;
+    case HexagonII::TypeCVI_VM_LD:
+    case HexagonII::TypeCVI_VM_TMP_LD:
+    case HexagonII::TypeCVI_VM_CUR_LD:
+      ++CVIloads;
+    case HexagonII::TypeLD:
+      ++loads;
+      ++memory;
+      if (ISJ->Core.getUnits() == slotSingleLoad)
+        ++load0;
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
+        ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+      break;
+    case HexagonII::TypeCVI_VM_STU:
+      ++onlyNo1;
+    case HexagonII::TypeCVI_VM_ST:
+    case HexagonII::TypeCVI_VM_NEW_ST:
+      ++CVIstores;
+    case HexagonII::TypeST:
+      ++stores;
+      ++memory;
+      if (ISJ->Core.getUnits() == slotSingleStore)
+        ++store0;
+      break;
+    case HexagonII::TypeV4LDST:
+      ++loads;
+      ++stores;
+      ++store1;
+      ++memory;
+      break;
+    case HexagonII::TypeNV:
+      ++memory; // NV insns are memory-like.
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
+        ++jumps, ++jump1;
+      break;
+    case HexagonII::TypeCR:
+    // Legacy conditional branch predicated on a register.
+    case HexagonII::TypeSYSTEM:
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
+        ++loads;
+      break;
+    }
+  }
+
+  // Check if the packet is legal.
+  if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) ||
+      (duplex > 1 || (duplex && memory)) || (solo && size() > 1) ||
+      (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) {
+    Error = SHUFFLE_ERROR_INVALID;
+    return false;
+  }
+
+  if (jump1 && jumps > 1) {
+    // Error if single branch with another branch.
+    Error = SHUFFLE_ERROR_BRANCHES;
+    return false;
+  }
+
+  // Modify packet accordingly.
+  // TODO: need to reserve slots #0 and #1 for duplex insns.
+  bool bOnlySlot3 = false;
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+    MCInst const *ID = ISJ->getDesc();
+
+    if (!ISJ->Core.getUnits()) {
+      // Error if insn may not be executed in any slot.
+      Error = SHUFFLE_ERROR_UNKNOWN;
+      return false;
+    }
+
+    // Exclude from slot #1 any insn but A2_nop.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+      if (onlyNo1)
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+    // Exclude from slot #1 any insn but A-type.
+    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+      if (onlyAin1)
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+    // Branches must keep the original order.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
+        HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
+      if (jumps > 1) {
+        if (slotJump < slotLastJump) {
+          // Error if indirect branch with another branch or
+          // no more slots available for branches.
+          Error = SHUFFLE_ERROR_BRANCHES;
+          return false;
+        }
+        // Pin the branch to the highest slot available to it.
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
+        // Update next highest slot available to branches.
+        slotJump >>= 1;
+      }
+
+    // A single load must use slot #0.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
+      if (loads == 1 && loads == memory)
+        // Pin the load to slot #0.
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
+    }
+
+    // A single store must use slot #0.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+      if (!store0) {
+        if (stores == 1)
+          ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
+        else if (stores > 1) {
+          if (slotLoadStore < slotLastLoadStore) {
+            // Error if no more slots available for stores.
+            Error = SHUFFLE_ERROR_STORES;
+            return false;
+          }
+          // Pin the store to the highest slot available to it.
+          ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
+          // Update the next highest slot available to stores.
+          slotLoadStore >>= 1;
+        }
+      }
+      if (store1 && stores > 1) {
+        // Error if a single store with another store.
+        Error = SHUFFLE_ERROR_STORES;
+        return false;
+      }
+    }
+
+    // flag if an instruction can only be executed in slot 3
+    if (ISJ->Core.getUnits() == slotThree)
+      bOnlySlot3 = true;
+
+    if (!ISJ->Core.getUnits()) {
+      // Error if insn may not be executed in any slot.
+      Error = SHUFFLE_ERROR_NOSLOTS;
+      return false;
+    }
+  }
+
+  bool validateSlots = true;
+  if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+    // save off slot mask of instruction marked with A_PREFER_SLOT3
+    // and then pin it to slot #3
+    unsigned saveUnits = slot3ISJ->Core.getUnits();
+    slot3ISJ->Core.setUnits(saveUnits & slotThree);
+
+    HexagonUnitAuction AuctionCore;
+    std::sort(begin(), end(), HexagonInstr::lessCore);
+
+    // see if things ok with that instruction being pinned to slot #3
+    bool bFail = false;
+    for (iterator I = begin(); I != end() && bFail != true; ++I)
+      if (!AuctionCore.bid(I->Core.getUnits()))
+        bFail = true;
+
+    // if yes, great, if not then restore original slot mask
+    if (!bFail)
+      validateSlots = false; // all good, no need to re-do auction
+    else
+      for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+        MCInst const *ID = ISJ->getDesc();
+        if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID))
+          ISJ->Core.setUnits(saveUnits);
+      }
+  }
+
+  // Check if any slot, core, is over-subscribed.
+  // Verify the core slot subscriptions.
+  if (validateSlots) {
+    HexagonUnitAuction AuctionCore;
+
+    std::sort(begin(), end(), HexagonInstr::lessCore);
+
+    for (iterator I = begin(); I != end(); ++I)
+      if (!AuctionCore.bid(I->Core.getUnits())) {
+        Error = SHUFFLE_ERROR_SLOTS;
+        return false;
+      }
+  }
+  // Verify the CVI slot subscriptions.
+  {
+    HexagonUnitAuction AuctionCVI;
+
+    std::sort(begin(), end(), HexagonInstr::lessCVI);
+
+    for (iterator I = begin(); I != end(); ++I)
+      for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
+        if (!AuctionCVI.bid(I->CVI.getUnits() << i)) {
+          Error = SHUFFLE_ERROR_SLOTS;
+          return false;
+        }
+  }
+
+  Error = SHUFFLE_SUCCESS;
+  return true;
+}
+
+bool HexagonShuffler::shuffle() {
+  if (size() > HEXAGON_PACKET_SIZE) {
+    // Ignore a packet with with more than what a packet can hold
+    // or with compound or duplex insns for now.
+    Error = SHUFFLE_ERROR_INVALID;
+    return false;
+  }
+
+  // Check and prepare packet.
+  if (size() > 1 && check())
+    // Reorder the handles for each slot.
+    for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE;
+         ++nSlot) {
+      iterator ISJ, ISK;
+      unsigned slotSkip, slotWeight;
+
+      // Prioritize the handles considering their restrictions.
+      for (ISJ = ISK = Packet.begin(), slotSkip = slotWeight = 0;
+           ISK != Packet.end(); ++ISK, ++slotSkip)
+        if (slotSkip < nSlot - emptySlots)
+          // Note which handle to begin at.
+          ++ISJ;
+        else
+          // Calculate the weight of the slot.
+          slotWeight += ISK->Core.setWeight(HEXAGON_PACKET_SIZE - nSlot - 1);
+
+      if (slotWeight)
+        // Sort the packet, favoring source order,
+        // beginning after the previous slot.
+        std::sort(ISJ, Packet.end());
+      else
+        // Skip unused slot.
+        ++emptySlots;
+    }
+
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
+    DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
+          dbgs() << ':'
+                 << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
+                        .getOpcode();
+          dbgs() << '\n');
+  DEBUG(dbgs() << '\n');
+
+  return (!getError());
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
new file mode 100644
index 000000000000..a093f8545132
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -0,0 +1,190 @@
+//===----- HexagonShuffler.h - Instruction bundle shuffling ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONSHUFFLER_H
+#define HEXAGONSHUFFLER_H
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+// Insn resources.
+class HexagonResource {
+  // Mask of the slots or units that may execute the insn and
+  // the weight or priority that the insn requires to be assigned a slot.
+  unsigned Slots, Weight;
+
+public:
+  HexagonResource(unsigned s) { setUnits(s); };
+
+  void setUnits(unsigned s) {
+    Slots = s & ~(~0U << HEXAGON_PACKET_SIZE);
+  };
+  unsigned setWeight(unsigned s);
+
+  unsigned getUnits() const { return (Slots); };
+  unsigned getWeight() const { return (Weight); };
+
+  // Check if the resources are in ascending slot order.
+  static bool lessUnits(const HexagonResource &A, const HexagonResource &B) {
+    return (countPopulation(A.getUnits()) < countPopulation(B.getUnits()));
+  };
+  // Check if the resources are in ascending weight order.
+  static bool lessWeight(const HexagonResource &A, const HexagonResource &B) {
+    return (A.getWeight() < B.getWeight());
+  };
+};
+
+// HVX insn resources.
+class HexagonCVIResource : public HexagonResource {
+public:
+  typedef std::pair<unsigned, unsigned> UnitsAndLanes;
+  typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes;
+
+private:
+  // Available HVX slots.
+  enum {
+    CVI_NONE = 0,
+    CVI_XLANE = 1 << 0,
+    CVI_SHIFT = 1 << 1,
+    CVI_MPY0 = 1 << 2,
+    CVI_MPY1 = 1 << 3
+  };
+
+  TypeUnitsAndLanes *TUL;
+
+  // Count of adjacent slots that the insn requires to be executed.
+  unsigned Lanes;
+  // Flag whether the insn is a load or a store.
+  bool Load, Store;
+  // Flag whether the HVX resources are valid.
+  bool Valid;
+
+  void setLanes(unsigned l) { Lanes = l; };
+  void setLoad(bool f = true) { Load = f; };
+  void setStore(bool f = true) { Store = f; };
+
+public:
+  HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII,
+                     unsigned s, MCInst const *id);
+  static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU);
+
+  bool isValid() const { return (Valid); };
+  unsigned getLanes() const { return (Lanes); };
+  bool mayLoad() const { return (Load); };
+  bool mayStore() const { return (Store); };
+};
+
+// Handle to an insn used by the shuffling algorithm.
+class HexagonInstr {
+  friend class HexagonShuffler;
+
+  MCInst const *ID;
+  MCInst const *Extender;
+  HexagonResource Core;
+  HexagonCVIResource CVI;
+  bool SoloException;
+
+public:
+  HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
+               MCInstrInfo const &MCII, MCInst const *id,
+               MCInst const *Extender, unsigned s, bool x = false)
+      : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id),
+        SoloException(x) {};
+
+  MCInst const *getDesc() const { return (ID); };
+
+  MCInst const *getExtender() const { return Extender; }
+
+  unsigned isSoloException() const { return (SoloException); };
+
+  // Check if the handles are in ascending order for shuffling purposes.
+  bool operator<(const HexagonInstr &B) const {
+    return (HexagonResource::lessWeight(B.Core, Core));
+  };
+  // Check if the handles are in ascending order by core slots.
+  static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) {
+    return (HexagonResource::lessUnits(A.Core, B.Core));
+  };
+  // Check if the handles are in ascending order by HVX slots.
+  static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) {
+    return (HexagonResource::lessUnits(A.CVI, B.CVI));
+  };
+};
+
+// Bundle shuffler.
+class HexagonShuffler {
+  typedef SmallVector<HexagonInstr, HEXAGON_PRESHUFFLE_PACKET_SIZE>
+      HexagonPacket;
+
+  // Insn handles in a bundle.
+  HexagonPacket Packet;
+
+  // Shuffling error code.
+  unsigned Error;
+
+  HexagonCVIResource::TypeUnitsAndLanes TUL;
+
+protected:
+  int64_t BundleFlags;
+  MCInstrInfo const &MCII;
+  MCSubtargetInfo const &STI;
+
+public:
+  typedef HexagonPacket::iterator iterator;
+
+  enum {
+    SHUFFLE_SUCCESS = 0,    ///< Successful operation.
+    SHUFFLE_ERROR_INVALID,  ///< Invalid bundle.
+    SHUFFLE_ERROR_STORES,   ///< No free slots for store insns.
+    SHUFFLE_ERROR_LOADS,    ///< No free slots for load insns.
+    SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns.
+    SHUFFLE_ERROR_NOSLOTS,  ///< No free slots for other insns.
+    SHUFFLE_ERROR_SLOTS,    ///< Over-subscribed slots.
+    SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60).
+    SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict
+    SHUFFLE_ERROR_UNKNOWN   ///< Unknown error.
+  };
+
+  explicit HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
+
+  // Reset to initial state.
+  void reset();
+  // Check if the bundle may be validly shuffled.
+  bool check();
+  // Reorder the insn handles in the bundle.
+  bool shuffle();
+
+  unsigned size() const { return (Packet.size()); };
+
+  iterator begin() { return (Packet.begin()); };
+  iterator end() { return (Packet.end()); };
+
+  // Add insn handle to the bundle .
+  void append(MCInst const *ID, MCInst const *Extender, unsigned S,
+              bool X = false);
+
+  // Return the error code for the last check or shuffling of the bundle.
+  void setError(unsigned Err) { Error = Err; };
+  unsigned getError() const { return (Error); };
+};
+}
+
+#endif // HEXAGONSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
new file mode 100644
index 000000000000..392871628d98
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -0,0 +1,240 @@
+//===--- RDFCopy.cpp ------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based copy propagation.
+
+#include "RDFCopy.h"
+#include "RDFGraph.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+using namespace rdf;
+
+#ifndef NDEBUG
+static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden);
+static unsigned CpCount = 0;
+#endif
+
+bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+    case TargetOpcode::COPY: {
+      const MachineOperand &Dst = MI->getOperand(0);
+      const MachineOperand &Src = MI->getOperand(1);
+      RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg());
+      RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg());
+      assert(TargetRegisterInfo::isPhysicalRegister(DstR.Reg));
+      assert(TargetRegisterInfo::isPhysicalRegister(SrcR.Reg));
+      const TargetRegisterInfo &TRI = DFG.getTRI();
+      if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
+          TRI.getMinimalPhysRegClass(SrcR.Reg))
+        return false;
+      EM.insert(std::make_pair(DstR, SrcR));
+      return true;
+    }
+    case TargetOpcode::REG_SEQUENCE:
+      llvm_unreachable("Unexpected REG_SEQUENCE");
+  }
+  return false;
+}
+
+
+void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
+  CopyMap.insert(std::make_pair(SA.Id, EM));
+  Copies.push_back(SA.Id);
+
+  for (auto I : EM) {
+    auto FS = DefM.find(I.second.Reg);
+    if (FS == DefM.end() || FS->second.empty())
+      continue; // Undefined source
+    RDefMap[I.second][SA.Id] = FS->second.top()->Id;
+    // Insert DstR into the map.
+    RDefMap[I.first];
+  }
+}
+
+
+void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) {
+  RegisterSet RRs;
+  for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+    RRs.insert(RA.Addr->getRegRef(DFG));
+  bool Common = false;
+  for (auto &R : RDefMap) {
+    if (!RRs.count(R.first))
+      continue;
+    Common = true;
+    break;
+  }
+  if (!Common)
+    return;
+
+  for (auto &R : RDefMap) {
+    if (!RRs.count(R.first))
+      continue;
+    auto F = DefM.find(R.first.Reg);
+    if (F == DefM.end() || F->second.empty())
+      continue;
+    R.second[IA.Id] = F->second.top()->Id;
+  }
+}
+
+
+bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
+  bool Changed = false;
+  auto BA = DFG.getFunc().Addr->findBlock(B, DFG);
+  DFG.markBlock(BA.Id, DefM);
+
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+    if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
+      NodeAddr<StmtNode*> SA = IA;
+      EqualityMap EM;
+      if (interpretAsCopy(SA.Addr->getCode(), EM))
+        recordCopy(SA, EM);
+    }
+
+    updateMap(IA);
+    DFG.pushDefs(IA, DefM);
+  }
+
+  MachineDomTreeNode *N = MDT.getNode(B);
+  for (auto I : *N)
+    Changed |= scanBlock(I->getBlock());
+
+  DFG.releaseBlock(BA.Id, DefM);
+  return Changed;
+}
+
+
+bool CopyPropagation::run() {
+  scanBlock(&DFG.getMF().front());
+
+  if (trace()) {
+    dbgs() << "Copies:\n";
+    for (auto I : Copies) {
+      dbgs() << "Instr: " << *DFG.addr<StmtNode*>(I).Addr->getCode();
+      dbgs() << "   eq: {";
+      for (auto J : CopyMap[I])
+        dbgs() << ' ' << Print<RegisterRef>(J.first, DFG) << '='
+               << Print<RegisterRef>(J.second, DFG);
+      dbgs() << " }\n";
+    }
+    dbgs() << "\nRDef map:\n";
+    for (auto R : RDefMap) {
+      dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
+      for (auto &M : R.second)
+        dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':'
+               << Print<NodeId>(M.second, DFG);
+      dbgs() << " }\n";
+    }
+  }
+
+  bool Changed = false;
+#ifndef NDEBUG
+  bool HasLimit = CpLimit.getNumOccurrences() > 0;
+#endif
+
+  auto MinPhysReg = [this] (RegisterRef RR) -> unsigned {
+    const TargetRegisterInfo &TRI = DFG.getTRI();
+    const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+    if ((RC.LaneMask & RR.Mask) == RC.LaneMask)
+      return RR.Reg;
+    for (MCSubRegIndexIterator S(RR.Reg, &TRI); S.isValid(); ++S)
+      if (RR.Mask == TRI.getSubRegIndexLaneMask(S.getSubRegIndex()))
+        return S.getSubReg();
+    llvm_unreachable("Should have found a register");
+    return 0;
+  };
+
+  for (auto C : Copies) {
+#ifndef NDEBUG
+    if (HasLimit && CpCount >= CpLimit)
+      break;
+#endif
+    auto SA = DFG.addr<InstrNode*>(C);
+    auto FS = CopyMap.find(SA.Id);
+    if (FS == CopyMap.end())
+      continue;
+
+    EqualityMap &EM = FS->second;
+    for (NodeAddr<DefNode*> DA : SA.Addr->members_if(DFG.IsDef, DFG)) {
+      RegisterRef DR = DA.Addr->getRegRef(DFG);
+      auto FR = EM.find(DR);
+      if (FR == EM.end())
+        continue;
+      RegisterRef SR = FR->second;
+      if (DR == SR)
+        continue;
+
+      auto &RDefSR = RDefMap[SR];
+      NodeId RDefSR_SA = RDefSR[SA.Id];
+
+      for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
+        auto UA = DFG.addr<UseNode*>(N);
+        NextN = UA.Addr->getSibling();
+        uint16_t F = UA.Addr->getFlags();
+        if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed))
+          continue;
+        if (UA.Addr->getRegRef(DFG) != DR)
+          continue;
+
+        NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
+        assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
+        if (RDefSR[IA.Id] != RDefSR_SA)
+          continue;
+
+        MachineOperand &Op = UA.Addr->getOp();
+        if (Op.isTied())
+          continue;
+        if (trace()) {
+          dbgs() << "Can replace " << Print<RegisterRef>(DR, DFG)
+                 << " with " << Print<RegisterRef>(SR, DFG) << " in "
+                 << *NodeAddr<StmtNode*>(IA).Addr->getCode();
+        }
+
+        unsigned NewReg = MinPhysReg(SR);
+        Op.setReg(NewReg);
+        Op.setSubReg(0);
+        DFG.unlinkUse(UA, false);
+        if (RDefSR_SA != 0) {
+          UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+        } else {
+          UA.Addr->setReachingDef(0);
+          UA.Addr->setSibling(0);
+        }
+
+        Changed = true;
+  #ifndef NDEBUG
+        if (HasLimit && CpCount >= CpLimit)
+          break;
+        CpCount++;
+  #endif
+
+        auto FC = CopyMap.find(IA.Id);
+        if (FC != CopyMap.end()) {
+          // Update the EM map in the copy's entry.
+          auto &M = FC->second;
+          for (auto &J : M) {
+            if (J.second != DR)
+              continue;
+            J.second = SR;
+            break;
+          }
+        }
+      } // for (N in reached-uses)
+    } // for (DA in defs)
+  } // for (C in Copies)
+
+  return Changed;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
new file mode 100644
index 000000000000..517f17cc9c64
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
@@ -0,0 +1,55 @@
+//===--- RDFCopy.h --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef RDF_COPY_H
+#define RDF_COPY_H
+
+#include "RDFGraph.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+  class MachineBasicBlock;
+  class MachineDominatorTree;
+  class MachineInstr;
+
+namespace rdf {
+  struct CopyPropagation {
+    CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
+        Trace(false) {}
+    virtual ~CopyPropagation() {}
+
+    bool run();
+    void trace(bool On) { Trace = On; }
+    bool trace() const { return Trace; }
+    DataFlowGraph &getDFG() { return DFG; }
+
+    typedef std::map<RegisterRef, RegisterRef> EqualityMap;
+    virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM);
+
+  private:
+    const MachineDominatorTree &MDT;
+    DataFlowGraph &DFG;
+    DataFlowGraph::DefStackMap DefM;
+    bool Trace;
+
+    // map: register -> (map: stmt -> reaching def)
+    std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
+    // map: statement -> (map: dst reg -> src reg)
+    std::map<NodeId, EqualityMap> CopyMap;
+    std::vector<NodeId> Copies;
+
+    void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
+    void updateMap(NodeAddr<InstrNode*> IA);
+    bool scanBlock(MachineBasicBlock *B);
+  };
+} // namespace rdf
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
new file mode 100644
index 000000000000..63177d51cada
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -0,0 +1,232 @@
+//===--- RDFDeadCode.cpp --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based generic dead code elimination.
+
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "RDFDeadCode.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#include <queue>
+
+using namespace llvm;
+using namespace rdf;
+
+// This drastically improves execution time in "collect" over using
+// SetVector as a work queue, and popping the first element from it.
+template<typename T> struct DeadCodeElimination::SetQueue {
+  SetQueue() : Set(), Queue() {}
+
+  bool empty() const {
+    return Queue.empty();
+  }
+  T pop_front() {
+    T V = Queue.front();
+    Queue.pop();
+    Set.erase(V);
+    return V;
+  }
+  void push_back(T V) {
+    if (Set.count(V))
+      return;
+    Queue.push(V);
+    Set.insert(V);
+  }
+
+private:
+  DenseSet<T> Set;
+  std::queue<T> Queue;
+};
+
+
+// Check if the given instruction has observable side-effects, i.e. if
+// it should be considered "live". It is safe for this function to be
+// overly conservative (i.e. return "true" for all instructions), but it
+// is not safe to return "false" for an instruction that should not be
+// considered removable.
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
+  if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn())
+    return true;
+  if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects())
+    return true;
+  if (MI->isPHI())
+    return false;
+  for (auto &Op : MI->operands())
+    if (Op.isReg() && MRI.isReserved(Op.getReg()))
+      return true;
+  return false;
+}
+
+void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA,
+      SetQueue<NodeId> &WorkQ) {
+  if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
+    return;
+  if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
+    return;
+  for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) {
+    if (!LiveNodes.count(RA.Id))
+      WorkQ.push_back(RA.Id);
+  }
+}
+
+void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA,
+      SetQueue<NodeId> &WorkQ) {
+  NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
+  for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
+    if (!LiveNodes.count(UA.Id))
+      WorkQ.push_back(UA.Id);
+  }
+  for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA))
+    LiveNodes.insert(TA.Id);
+}
+
+void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA,
+      SetQueue<NodeId> &WorkQ) {
+  for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) {
+    if (!LiveNodes.count(DA.Id))
+      WorkQ.push_back(DA.Id);
+  }
+}
+
+// Traverse the DFG and collect the set dead RefNodes and the set of
+// dead instructions. Return "true" if any of these sets is non-empty,
+// "false" otherwise.
+bool DeadCodeElimination::collect() {
+  // This function works by first finding all live nodes. The dead nodes
+  // are then the complement of the set of live nodes.
+  //
+  // Assume that all nodes are dead. Identify instructions which must be
+  // considered live, i.e. instructions with observable side-effects, such
+  // as calls and stores. All arguments of such instructions are considered
+  // live. For each live def, all operands used in the corresponding
+  // instruction are considered live. For each live use, all its reaching
+  // defs are considered live.
+  LiveNodes.clear();
+  SetQueue<NodeId> WorkQ;
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG))
+    for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG))
+      scanInstr(IA, WorkQ);
+
+  while (!WorkQ.empty()) {
+    NodeId N = WorkQ.pop_front();
+    LiveNodes.insert(N);
+    auto RA = DFG.addr<RefNode*>(N);
+    if (DFG.IsDef(RA))
+      processDef(RA, WorkQ);
+    else
+      processUse(RA, WorkQ);
+  }
+
+  if (trace()) {
+    dbgs() << "Live nodes:\n";
+    for (NodeId N : LiveNodes) {
+      auto RA = DFG.addr<RefNode*>(N);
+      dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n";
+    }
+  }
+
+  auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool {
+    for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG))
+      if (LiveNodes.count(DA.Id))
+        return false;
+    return true;
+  };
+
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+    for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+      for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+        if (!LiveNodes.count(RA.Id))
+          DeadNodes.insert(RA.Id);
+      if (DFG.IsCode<NodeAttrs::Stmt>(IA))
+        if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode()))
+          continue;
+      if (IsDead(IA)) {
+        DeadInstrs.insert(IA.Id);
+        if (trace())
+          dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n";
+      }
+    }
+  }
+
+  return !DeadNodes.empty();
+}
+
+// Erase the nodes given in the Nodes set from DFG. In addition to removing
+// them from the DFG, if a node corresponds to a statement, the corresponding
+// machine instruction is erased from the function.
+bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) {
+  if (Nodes.empty())
+    return false;
+
+  // Prepare the actual set of ref nodes to remove: ref nodes from Nodes
+  // are included directly, for each InstrNode in Nodes, include the set
+  // of all RefNodes from it.
+  NodeList DRNs, DINs;
+  for (auto I : Nodes) {
+    auto BA = DFG.addr<NodeBase*>(I);
+    uint16_t Type = BA.Addr->getType();
+    if (Type == NodeAttrs::Ref) {
+      DRNs.push_back(DFG.addr<RefNode*>(I));
+      continue;
+    }
+
+    // If it's a code node, add all ref nodes from it.
+    uint16_t Kind = BA.Addr->getKind();
+    if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) {
+      for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG))
+        DRNs.push_back(N);
+      DINs.push_back(DFG.addr<InstrNode*>(I));
+    } else {
+      llvm_unreachable("Unexpected code node");
+      return false;
+    }
+  }
+
+  // Sort the list so that use nodes are removed first. This makes the
+  // "unlink" functions a bit faster.
+  auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool {
+    uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind();
+    if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def)
+      return true;
+    if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use)
+      return false;
+    return A.Id < B.Id;
+  };
+  std::sort(DRNs.begin(), DRNs.end(), UsesFirst);
+
+  if (trace())
+    dbgs() << "Removing dead ref nodes:\n";
+  for (NodeAddr<RefNode*> RA : DRNs) {
+    if (trace())
+      dbgs() << "  " << PrintNode<RefNode*>(RA, DFG) << '\n';
+    if (DFG.IsUse(RA))
+      DFG.unlinkUse(RA, true);
+    else if (DFG.IsDef(RA))
+      DFG.unlinkDef(RA, true);
+  }
+
+  // Now, remove all dead instruction nodes.
+  for (NodeAddr<InstrNode*> IA : DINs) {
+    NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+    BA.Addr->removeMember(IA, DFG);
+    if (!DFG.IsCode<NodeAttrs::Stmt>(IA))
+      continue;
+
+    MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+    if (trace())
+      dbgs() << "erasing: " << *MI;
+    MI->eraseFromParent();
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h
new file mode 100644
index 000000000000..8977e730b855
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h
@@ -0,0 +1,67 @@
+//===--- RDFDeadCode.h ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RDF-based generic dead code elimination.
+//
+// The main interface of this class are functions "collect" and "erase".
+// This allows custom processing of the function being optimized by a
+// particular consumer. The simplest way to use this class would be to
+// instantiate an object, and then simply call "collect" and "erase",
+// passing the result of "getDeadInstrs()" to it.
+// A more complex scenario would be to call "collect" first, then visit
+// all post-increment instructions to see if the address update is dead
+// or not, and if it is, convert the instruction to a non-updating form.
+// After that "erase" can be called with the set of nodes including both,
+// dead defs from the updating instructions and the nodes corresponding
+// to the dead instructions.
+
+#ifndef RDF_DEADCODE_H
+#define RDF_DEADCODE_H
+
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+
+namespace llvm {
+  class MachineRegisterInfo;
+
+namespace rdf {
+  struct DeadCodeElimination {
+    DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri)
+      : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {}
+
+    bool collect();
+    bool erase(const SetVector<NodeId> &Nodes);
+    void trace(bool On) { Trace = On; }
+    bool trace() const { return Trace; }
+
+    SetVector<NodeId> getDeadNodes() { return DeadNodes; }
+    SetVector<NodeId> getDeadInstrs() { return DeadInstrs; }
+    DataFlowGraph &getDFG() { return DFG; }
+
+  private:
+    bool Trace;
+    SetVector<NodeId> LiveNodes;
+    SetVector<NodeId> DeadNodes;
+    SetVector<NodeId> DeadInstrs;
+    DataFlowGraph &DFG;
+    MachineRegisterInfo &MRI;
+    Liveness LV;
+
+    template<typename T> struct SetQueue;
+
+    bool isLiveInstr(const MachineInstr *MI) const;
+    void scanInstr(NodeAddr<InstrNode*> IA, SetQueue<NodeId> &WorkQ);
+    void processDef(NodeAddr<DefNode*> DA, SetQueue<NodeId> &WorkQ);
+    void processUse(NodeAddr<UseNode*> UA, SetQueue<NodeId> &WorkQ);
+  };
+} // namespace rdf
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
new file mode 100644
index 000000000000..33c3f03790f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
@@ -0,0 +1,1950 @@
+//===--- RDFGraph.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF).
+//
+#include "RDFGraph.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+// Printing functions. Have them here first, so that the rest of the code
+// can use them.
+namespace llvm {
+namespace rdf {
+
+raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) {
+  if (!P.Mask.all())
+    OS << ':' << PrintLaneMask(P.Mask);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) {
+  auto &TRI = P.G.getTRI();
+  if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs())
+    OS << TRI.getName(P.Obj.Reg);
+  else
+    OS << '#' << P.Obj.Reg;
+  OS << PrintLaneMaskOpt(P.Obj.Mask);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
+  auto NA = P.G.addr<NodeBase*>(P.Obj);
+  uint16_t Attrs = NA.Addr->getAttrs();
+  uint16_t Kind = NodeAttrs::kind(Attrs);
+  uint16_t Flags = NodeAttrs::flags(Attrs);
+  switch (NodeAttrs::type(Attrs)) {
+    case NodeAttrs::Code:
+      switch (Kind) {
+        case NodeAttrs::Func:   OS << 'f'; break;
+        case NodeAttrs::Block:  OS << 'b'; break;
+        case NodeAttrs::Stmt:   OS << 's'; break;
+        case NodeAttrs::Phi:    OS << 'p'; break;
+        default:                OS << "c?"; break;
+      }
+      break;
+    case NodeAttrs::Ref:
+      if (Flags & NodeAttrs::Undef)
+        OS << '/';
+      if (Flags & NodeAttrs::Dead)
+        OS << '\\';
+      if (Flags & NodeAttrs::Preserving)
+        OS << '+';
+      if (Flags & NodeAttrs::Clobbering)
+        OS << '~';
+      switch (Kind) {
+        case NodeAttrs::Use:    OS << 'u'; break;
+        case NodeAttrs::Def:    OS << 'd'; break;
+        case NodeAttrs::Block:  OS << 'b'; break;
+        default:                OS << "r?"; break;
+      }
+      break;
+    default:
+      OS << '?';
+      break;
+  }
+  OS << P.Obj;
+  if (Flags & NodeAttrs::Shadow)
+    OS << '"';
+  return OS;
+}
+
+namespace {
+  void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
+        const DataFlowGraph &G) {
+    OS << Print<NodeId>(RA.Id, G) << '<'
+       << Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
+    if (RA.Addr->getFlags() & NodeAttrs::Fixed)
+      OS << '!';
+  }
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) {
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  if (NodeId N = P.Obj.Addr->getReachingDef())
+    OS << Print<NodeId>(N, P.G);
+  OS << ',';
+  if (NodeId N = P.Obj.Addr->getReachedDef())
+    OS << Print<NodeId>(N, P.G);
+  OS << ',';
+  if (NodeId N = P.Obj.Addr->getReachedUse())
+    OS << Print<NodeId>(N, P.G);
+  OS << "):";
+  if (NodeId N = P.Obj.Addr->getSibling())
+    OS << Print<NodeId>(N, P.G);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) {
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  if (NodeId N = P.Obj.Addr->getReachingDef())
+    OS << Print<NodeId>(N, P.G);
+  OS << "):";
+  if (NodeId N = P.Obj.Addr->getSibling())
+    OS << Print<NodeId>(N, P.G);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<PhiUseNode*>> &P) {
+  printRefHeader(OS, P.Obj, P.G);
+  OS << '(';
+  if (NodeId N = P.Obj.Addr->getReachingDef())
+    OS << Print<NodeId>(N, P.G);
+  OS << ',';
+  if (NodeId N = P.Obj.Addr->getPredecessor())
+    OS << Print<NodeId>(N, P.G);
+  OS << "):";
+  if (NodeId N = P.Obj.Addr->getSibling())
+    OS << Print<NodeId>(N, P.G);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) {
+  switch (P.Obj.Addr->getKind()) {
+    case NodeAttrs::Def:
+      OS << PrintNode<DefNode*>(P.Obj, P.G);
+      break;
+    case NodeAttrs::Use:
+      if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef)
+        OS << PrintNode<PhiUseNode*>(P.Obj, P.G);
+      else
+        OS << PrintNode<UseNode*>(P.Obj, P.G);
+      break;
+  }
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) {
+  unsigned N = P.Obj.size();
+  for (auto I : P.Obj) {
+    OS << Print<NodeId>(I.Id, P.G);
+    if (--N)
+      OS << ' ';
+  }
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
+  unsigned N = P.Obj.size();
+  for (auto I : P.Obj) {
+    OS << Print<NodeId>(I, P.G);
+    if (--N)
+      OS << ' ';
+  }
+  return OS;
+}
+
+namespace {
+  template <typename T>
+  struct PrintListV {
+    PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
+    typedef T Type;
+    const NodeList &List;
+    const DataFlowGraph &G;
+  };
+
+  template <typename T>
+  raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) {
+    unsigned N = P.List.size();
+    for (NodeAddr<T> A : P.List) {
+      OS << PrintNode<T>(A, P.G);
+      if (--N)
+        OS << ", ";
+    }
+    return OS;
+  }
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi ["
+     << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<StmtNode*>> &P) {
+  const MachineInstr &MI = *P.Obj.Addr->getCode();
+  unsigned Opc = MI.getOpcode();
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc);
+  // Print the target for calls and branches (for readability).
+  if (MI.isCall() || MI.isBranch()) {
+    MachineInstr::const_mop_iterator T =
+          find_if(MI.operands(),
+                  [] (const MachineOperand &Op) -> bool {
+                    return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
+                  });
+    if (T != MI.operands_end()) {
+      OS << ' ';
+      if (T->isMBB())
+        OS << "BB#" << T->getMBB()->getNumber();
+      else if (T->isGlobal())
+        OS << T->getGlobal()->getName();
+      else if (T->isSymbol())
+        OS << T->getSymbolName();
+    }
+  }
+  OS << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']';
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<InstrNode*>> &P) {
+  switch (P.Obj.Addr->getKind()) {
+    case NodeAttrs::Phi:
+      OS << PrintNode<PhiNode*>(P.Obj, P.G);
+      break;
+    case NodeAttrs::Stmt:
+      OS << PrintNode<StmtNode*>(P.Obj, P.G);
+      break;
+    default:
+      OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G);
+      break;
+  }
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<BlockNode*>> &P) {
+  MachineBasicBlock *BB = P.Obj.Addr->getCode();
+  unsigned NP = BB->pred_size();
+  std::vector<int> Ns;
+  auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void {
+    unsigned N = Ns.size();
+    for (int I : Ns) {
+      OS << "BB#" << I;
+      if (--N)
+        OS << ", ";
+    }
+  };
+
+  OS << Print<NodeId>(P.Obj.Id, P.G) << ": --- BB#" << BB->getNumber()
+     << " --- preds(" << NP << "): ";
+  for (MachineBasicBlock *B : BB->predecessors())
+    Ns.push_back(B->getNumber());
+  PrintBBs(Ns);
+
+  unsigned NS = BB->succ_size();
+  OS << "  succs(" << NS << "): ";
+  Ns.clear();
+  for (MachineBasicBlock *B : BB->successors())
+    Ns.push_back(B->getNumber());
+  PrintBBs(Ns);
+  OS << '\n';
+
+  for (auto I : P.Obj.Addr->members(P.G))
+    OS << PrintNode<InstrNode*>(I, P.G) << '\n';
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<NodeAddr<FuncNode*>> &P) {
+  OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: "
+     << P.Obj.Addr->getCode()->getName() << '\n';
+  for (auto I : P.Obj.Addr->members(P.G))
+    OS << PrintNode<BlockNode*>(I, P.G) << '\n';
+  OS << "]\n";
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) {
+  OS << '{';
+  for (auto I : P.Obj)
+    OS << ' ' << Print<RegisterRef>(I, P.G);
+  OS << " }";
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) {
+  P.Obj.print(OS);
+  return OS;
+}
+
+template<>
+raw_ostream &operator<< (raw_ostream &OS,
+      const Print<DataFlowGraph::DefStack> &P) {
+  for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) {
+    OS << Print<NodeId>(I->Id, P.G)
+       << '<' << Print<RegisterRef>(I->Addr->getRegRef(P.G), P.G) << '>';
+    I.down();
+    if (I != E)
+      OS << ' ';
+  }
+  return OS;
+}
+
+} // namespace rdf
+} // namespace llvm
+
+// Node allocation functions.
+//
+// Node allocator is like a slab memory allocator: it allocates blocks of
+// memory in sizes that are multiples of the size of a node. Each block has
+// the same size. Nodes are allocated from the currently active block, and
+// when it becomes full, a new one is created.
+// There is a mapping scheme between node id and its location in a block,
+// and within that block is described in the header file.
+//
+void NodeAllocator::startNewBlock() {
+  void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize);
+  char *P = static_cast<char*>(T);
+  Blocks.push_back(P);
+  // Check if the block index is still within the allowed range, i.e. less
+  // than 2^N, where N is the number of bits in NodeId for the block index.
+  // BitsPerIndex is the number of bits per node index.
+  assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) &&
+         "Out of bits for block index");
+  ActiveEnd = P;
+}
+
+bool NodeAllocator::needNewBlock() {
+  if (Blocks.empty())
+    return true;
+
+  char *ActiveBegin = Blocks.back();
+  uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize;
+  return Index >= NodesPerBlock;
+}
+
+NodeAddr<NodeBase*> NodeAllocator::New() {
+  if (needNewBlock())
+    startNewBlock();
+
+  uint32_t ActiveB = Blocks.size()-1;
+  uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize;
+  NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd),
+                             makeId(ActiveB, Index) };
+  ActiveEnd += NodeMemSize;
+  return NA;
+}
+
+NodeId NodeAllocator::id(const NodeBase *P) const {
+  uintptr_t A = reinterpret_cast<uintptr_t>(P);
+  for (unsigned i = 0, n = Blocks.size(); i != n; ++i) {
+    uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]);
+    if (A < B || A >= B + NodesPerBlock*NodeMemSize)
+      continue;
+    uint32_t Idx = (A-B)/NodeMemSize;
+    return makeId(i, Idx);
+  }
+  llvm_unreachable("Invalid node address");
+}
+
+void NodeAllocator::clear() {
+  MemPool.Reset();
+  Blocks.clear();
+  ActiveEnd = nullptr;
+}
+
+
+// Insert node NA after "this" in the circular chain.
+void NodeBase::append(NodeAddr<NodeBase*> NA) {
+  NodeId Nx = Next;
+  // If NA is already "next", do nothing.
+  if (Next != NA.Id) {
+    Next = NA.Id;
+    NA.Addr->Next = Nx;
+  }
+}
+
+
+// Fundamental node manipulator functions.
+
+// Obtain the register reference from a reference node.
+RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
+    return G.unpack(Ref.PR);
+  assert(Ref.Op != nullptr);
+  return G.makeRegRef(Ref.Op->getReg(), Ref.Op->getSubReg());
+}
+
+// Set the register reference in the reference node directly (for references
+// in phi nodes).
+void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) {
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef);
+  Ref.PR = G.pack(RR);
+}
+
+// Set the register reference in the reference node based on a machine
+// operand (for references in statement nodes).
+void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) {
+  assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+  assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef));
+  (void)G;
+  Ref.Op = Op;
+}
+
+// Get the owner of a given reference node.
+NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
+
+  while (NA.Addr != this) {
+    if (NA.Addr->getType() == NodeAttrs::Code)
+      return NA;
+    NA = G.addr<NodeBase*>(NA.Addr->getNext());
+  }
+  llvm_unreachable("No owner in circular list");
+}
+
+// Connect the def node to the reaching def node.
+void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+  Ref.RD = DA.Id;
+  Ref.Sib = DA.Addr->getReachedDef();
+  DA.Addr->setReachedDef(Self);
+}
+
+// Connect the use node to the reaching def node.
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) {
+  Ref.RD = DA.Id;
+  Ref.Sib = DA.Addr->getReachedUse();
+  DA.Addr->setReachedUse(Self);
+}
+
+// Get the first member of the code node.
+NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const {
+  if (Code.FirstM == 0)
+    return NodeAddr<NodeBase*>();
+  return G.addr<NodeBase*>(Code.FirstM);
+}
+
+// Get the last member of the code node.
+NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const {
+  if (Code.LastM == 0)
+    return NodeAddr<NodeBase*>();
+  return G.addr<NodeBase*>(Code.LastM);
+}
+
+// Add node NA at the end of the member list of the given code node.
+void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> ML = getLastMember(G);
+  if (ML.Id != 0) {
+    ML.Addr->append(NA);
+  } else {
+    Code.FirstM = NA.Id;
+    NodeId Self = G.id(this);
+    NA.Addr->setNext(Self);
+  }
+  Code.LastM = NA.Id;
+}
+
+// Add node NA after member node MA in the given code node.
+void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+      const DataFlowGraph &G) {
+  MA.Addr->append(NA);
+  if (Code.LastM == MA.Id)
+    Code.LastM = NA.Id;
+}
+
+// Remove member node NA from the given code node.
+void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> MA = getFirstMember(G);
+  assert(MA.Id != 0);
+
+  // Special handling if the member to remove is the first member.
+  if (MA.Id == NA.Id) {
+    if (Code.LastM == MA.Id) {
+      // If it is the only member, set both first and last to 0.
+      Code.FirstM = Code.LastM = 0;
+    } else {
+      // Otherwise, advance the first member.
+      Code.FirstM = MA.Addr->getNext();
+    }
+    return;
+  }
+
+  while (MA.Addr != this) {
+    NodeId MX = MA.Addr->getNext();
+    if (MX == NA.Id) {
+      MA.Addr->setNext(NA.Addr->getNext());
+      // If the member to remove happens to be the last one, update the
+      // LastM indicator.
+      if (Code.LastM == NA.Id)
+        Code.LastM = MA.Id;
+      return;
+    }
+    MA = G.addr<NodeBase*>(MX);
+  }
+  llvm_unreachable("No such member");
+}
+
+// Return the list of all members of the code node.
+NodeList CodeNode::members(const DataFlowGraph &G) const {
+  static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; };
+  return members_if(True, G);
+}
+
+// Return the owner of the given instr node.
+NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext());
+
+  while (NA.Addr != this) {
+    assert(NA.Addr->getType() == NodeAttrs::Code);
+    if (NA.Addr->getKind() == NodeAttrs::Block)
+      return NA;
+    NA = G.addr<NodeBase*>(NA.Addr->getNext());
+  }
+  llvm_unreachable("No owner in circular list");
+}
+
+// Add the phi node PA to the given block node.
+void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) {
+  NodeAddr<NodeBase*> M = getFirstMember(G);
+  if (M.Id == 0) {
+    addMember(PA, G);
+    return;
+  }
+
+  assert(M.Addr->getType() == NodeAttrs::Code);
+  if (M.Addr->getKind() == NodeAttrs::Stmt) {
+    // If the first member of the block is a statement, insert the phi as
+    // the first member.
+    Code.FirstM = PA.Id;
+    PA.Addr->setNext(M.Id);
+  } else {
+    // If the first member is a phi, find the last phi, and append PA to it.
+    assert(M.Addr->getKind() == NodeAttrs::Phi);
+    NodeAddr<NodeBase*> MN = M;
+    do {
+      M = MN;
+      MN = G.addr<NodeBase*>(M.Addr->getNext());
+      assert(MN.Addr->getType() == NodeAttrs::Code);
+    } while (MN.Addr->getKind() == NodeAttrs::Phi);
+
+    // M is the last phi.
+    addMemberAfter(M, PA, G);
+  }
+}
+
+// Find the block node corresponding to the machine basic block BB in the
+// given func node.
+NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB,
+      const DataFlowGraph &G) const {
+  auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool {
+    return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB;
+  };
+  NodeList Ms = members_if(EqBB, G);
+  if (!Ms.empty())
+    return Ms[0];
+  return NodeAddr<BlockNode*>();
+}
+
+// Get the block node for the entry block in the given function.
+NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
+  MachineBasicBlock *EntryB = &getCode()->front();
+  return findBlock(EntryB, G);
+}
+
+
+// Target operand information.
+//
+
+// For a given instruction, check if there are any bits of RR that can remain
+// unchanged across this def.
+bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
+      const {
+  return TII.isPredicated(In);
+}
+
+// Check if the definition of RR produces an unspecified value.
+bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
+      const {
+  if (In.isCall())
+    if (In.getOperand(OpNum).isImplicit())
+      return true;
+  return false;
+}
+
+// Check if the given instruction specifically requires
+bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
+      const {
+  if (In.isCall() || In.isReturn() || In.isInlineAsm())
+    return true;
+  // Check for a tail call.
+  if (In.isBranch())
+    for (const MachineOperand &O : In.operands())
+      if (O.isGlobal() || O.isSymbol())
+        return true;
+
+  const MCInstrDesc &D = In.getDesc();
+  if (!D.getImplicitDefs() && !D.getImplicitUses())
+    return false;
+  const MachineOperand &Op = In.getOperand(OpNum);
+  // If there is a sub-register, treat the operand as non-fixed. Currently,
+  // fixed registers are those that are listed in the descriptor as implicit
+  // uses or defs, and those lists do not allow sub-registers.
+  if (Op.getSubReg() != 0)
+    return false;
+  RegisterId Reg = Op.getReg();
+  const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs()
+                                     : D.getImplicitUses();
+  if (!ImpR)
+    return false;
+  while (*ImpR)
+    if (*ImpR++ == Reg)
+      return true;
+  return false;
+}
+
+
+RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
+  RegisterId SuperReg = RR.Reg;
+  while (true) {
+    MCSuperRegIterator SR(SuperReg, &TRI, false);
+    if (!SR.isValid())
+      break;
+    SuperReg = *SR;
+  }
+
+  const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+  LaneBitmask Common = RR.Mask & RC.LaneMask;
+  uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
+  LaneBitmask SuperMask = TRI.composeSubRegIndexLaneMask(Sub, Common);
+  return RegisterRef(SuperReg, SuperMask);
+}
+
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+  RegisterRef NR = normalize(RR);
+  auto F = Masks.find(NR.Reg);
+  if (F != Masks.end()) {
+    if ((F->second & NR.Mask).any())
+      return true;
+  }
+  if (CheckUnits) {
+    for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U)
+      if (ExpAliasUnits.test(*U))
+        return true;
+  }
+  return false;
+}
+
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+  // Always have a cover for empty lane mask.
+  RegisterRef NR = normalize(RR);
+  if (NR.Mask.none())
+    return true;
+  auto F = Masks.find(NR.Reg);
+  if (F == Masks.end())
+    return false;
+  return (NR.Mask & F->second) == NR.Mask;
+}
+
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+  RegisterRef NR = normalize(RR);
+  auto F = Masks.find(NR.Reg);
+  if (F == Masks.end())
+    Masks.insert({NR.Reg, NR.Mask});
+  else
+    F->second |= NR.Mask;
+
+  // Visit all register units to see if there are any that were created
+  // by explicit aliases. Add those that were to the bit vector.
+  for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
+    MCRegUnitRootIterator R(*U, &TRI);
+    ++R;
+    if (!R.isValid())
+      continue;
+    ExpAliasUnits.set(*U);
+    CheckUnits = true;
+  }
+  return *this;
+}
+
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+  for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
+    insert(RegisterRef(P.first, P.second));
+  return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+  RegisterRef NR = normalize(RR);
+  auto F = Masks.find(NR.Reg);
+  if (F == Masks.end())
+    return *this;
+  LaneBitmask NewM = F->second & ~NR.Mask;
+  if (NewM.none())
+    Masks.erase(F);
+  else
+    F->second = NewM;
+  return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+  for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
+    clear(RegisterRef(P.first, P.second));
+  return *this;
+}
+
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+  RegisterAggr T(TRI);
+  T.insert(RR).clear(*this);
+  if (T.empty())
+    return RegisterRef();
+  return RegisterRef(T.begin()->first, T.begin()->second);
+}
+
+void RegisterAggr::print(raw_ostream &OS) const {
+  OS << '{';
+  for (auto I : Masks)
+    OS << ' ' << PrintReg(I.first, &TRI) << PrintLaneMaskOpt(I.second);
+  OS << " }";
+}
+
+
+//
+// The data flow graph construction.
+//
+
+DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+      const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+      const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
+    : LMI(), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
+}
+
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// for a register references "Reg" and "Reg:subreg" will each have their
+// own definition stacks.
+
+// Construct a stack iterator.
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+      bool Top) : DS(S) {
+  if (!Top) {
+    // Initialize to bottom.
+    Pos = 0;
+    return;
+  }
+  // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+  Pos = DS.Stack.size();
+  while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1]))
+    Pos--;
+}
+
+// Return the size of the stack, including block delimiters.
+unsigned DataFlowGraph::DefStack::size() const {
+  unsigned S = 0;
+  for (auto I = top(), E = bottom(); I != E; I.down())
+    S++;
+  return S;
+}
+
+// Remove the top entry from the stack. Remove all intervening delimiters
+// so that after this, the stack is either empty, or the top of the stack
+// is a non-delimiter.
+void DataFlowGraph::DefStack::pop() {
+  assert(!empty());
+  unsigned P = nextDown(Stack.size());
+  Stack.resize(P);
+}
+
+// Push a delimiter for block node N on the stack.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+  assert(N != 0);
+  Stack.push_back(NodeAddr<DefNode*>(nullptr, N));
+}
+
+// Remove all nodes from the top of the stack, until the delimited for
+// block node N is encountered. Remove the delimiter as well. In effect,
+// this will remove from the stack all definitions from block N.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+  assert(N != 0);
+  unsigned P = Stack.size();
+  while (P > 0) {
+    bool Found = isDelimiter(Stack[P-1], N);
+    P--;
+    if (Found)
+      break;
+  }
+  // This will also remove the delimiter, if found.
+  Stack.resize(P);
+}
+
+// Move the stack iterator up by one.
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+  // Get the next valid position after P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  unsigned SS = Stack.size();
+  bool IsDelim;
+  assert(P < SS);
+  do {
+    P++;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P < SS && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+// Move the stack iterator down by one.
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+  // Get the preceding valid position before P (skipping all delimiters).
+  // The input position P does not have to point to a non-delimiter.
+  assert(P > 0 && P <= Stack.size());
+  bool IsDelim = isDelimiter(Stack[P-1]);
+  do {
+    if (--P == 0)
+      break;
+    IsDelim = isDelimiter(Stack[P-1]);
+  } while (P > 0 && IsDelim);
+  assert(!IsDelim);
+  return P;
+}
+
+
+// Register information.
+
+// Get the list of references aliased to RR. Lane masks are ignored.
+RegisterSet DataFlowGraph::getAliasSet(RegisterId Reg) const {
+  // Do not include RR in the alias set.
+  RegisterSet AS;
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+
+  for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
+    AS.insert(RegisterRef(*AI));
+  return AS;
+}
+
+RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
+  RegisterSet LR;
+  const Function &F = *MF.getFunction();
+  const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn()
+                                            : nullptr;
+  const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+  if (RegisterId R = TLI.getExceptionPointerRegister(PF))
+    LR.insert(RegisterRef(R));
+  if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
+    LR.insert(RegisterRef(R));
+  return LR;
+}
+
+// Node management functions.
+
+// Get the pointer to the node with the id N.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+  if (N == 0)
+    return nullptr;
+  return Memory.ptr(N);
+}
+
+// Get the id of the node at the address P.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+  if (P == nullptr)
+    return 0;
+  return Memory.id(P);
+}
+
+// Allocate a new node and set the attributes to Attrs.
+NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) {
+  NodeAddr<NodeBase*> P = Memory.New();
+  P.Addr->init();
+  P.Addr->setAttrs(Attrs);
+  return P;
+}
+
+// Make a copy of the given node B, except for the data-flow links, which
+// are set to 0.
+NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
+  NodeAddr<NodeBase*> NA = newNode(0);
+  memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
+  // Ref nodes need to have the data-flow links reset.
+  if (NA.Addr->getType() == NodeAttrs::Ref) {
+    NodeAddr<RefNode*> RA = NA;
+    RA.Addr->setReachingDef(0);
+    RA.Addr->setSibling(0);
+    if (NA.Addr->getKind() == NodeAttrs::Def) {
+      NodeAddr<DefNode*> DA = NA;
+      DA.Addr->setReachedDef(0);
+      DA.Addr->setReachedUse(0);
+    }
+  }
+  return NA;
+}
+
+
+// Allocation routines for specific node types/kinds.
+
+NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
+      MachineOperand &Op, uint16_t Flags) {
+  NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+  UA.Addr->setRegRef(&Op, *this);
+  return UA;
+}
+
+NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner,
+      RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) {
+  NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+  assert(Flags & NodeAttrs::PhiRef);
+  PUA.Addr->setRegRef(RR, *this);
+  PUA.Addr->setPredecessor(PredB.Id);
+  return PUA;
+}
+
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+      MachineOperand &Op, uint16_t Flags) {
+  NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+  DA.Addr->setRegRef(&Op, *this);
+  return DA;
+}
+
+NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner,
+      RegisterRef RR, uint16_t Flags) {
+  NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+  assert(Flags & NodeAttrs::PhiRef);
+  DA.Addr->setRegRef(RR, *this);
+  return DA;
+}
+
+NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) {
+  NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi);
+  Owner.Addr->addPhi(PA, *this);
+  return PA;
+}
+
+NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner,
+      MachineInstr *MI) {
+  NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt);
+  SA.Addr->setCode(MI);
+  Owner.Addr->addMember(SA, *this);
+  return SA;
+}
+
+NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner,
+      MachineBasicBlock *BB) {
+  NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block);
+  BA.Addr->setCode(BB);
+  Owner.Addr->addMember(BA, *this);
+  return BA;
+}
+
+NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) {
+  NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func);
+  FA.Addr->setCode(MF);
+  return FA;
+}
+
+// Build the data flow graph.
+void DataFlowGraph::build(unsigned Options) {
+  reset();
+  Func = newFunc(&MF);
+
+  if (MF.empty())
+    return;
+
+  for (MachineBasicBlock &B : MF) {
+    NodeAddr<BlockNode*> BA = newBlock(Func, &B);
+    BlockNodes.insert(std::make_pair(&B, BA));
+    for (MachineInstr &I : B) {
+      if (I.isDebugValue())
+        continue;
+      buildStmt(BA, I);
+    }
+  }
+
+  NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this);
+  NodeList Blocks = Func.Addr->members(*this);
+
+  // Collect information about block references.
+  BlockRefsMap RefM;
+  buildBlockRefs(EA, RefM);
+
+  // Add function-entry phi nodes.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+    NodeAddr<PhiNode*> PA = newPhi(EA);
+    RegisterRef RR = RegisterRef(I->first);
+    uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+    NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+    PA.Addr->addMember(DA, *this);
+  }
+
+  // Add phis for landing pads.
+  // Landing pads, unlike usual backs blocks, are not entered through
+  // branches in the program, or fall-throughs from other blocks. They
+  // are entered from the exception handling runtime and target's ABI
+  // may define certain registers as defined on entry to such a block.
+  RegisterSet EHRegs = getLandingPadLiveIns();
+  if (!EHRegs.empty()) {
+    for (NodeAddr<BlockNode*> BA : Blocks) {
+      const MachineBasicBlock &B = *BA.Addr->getCode();
+      if (!B.isEHPad())
+        continue;
+
+      // Prepare a list of NodeIds of the block's predecessors.
+      NodeList Preds;
+      for (MachineBasicBlock *PB : B.predecessors())
+        Preds.push_back(findBlock(PB));
+
+      // Build phi nodes for each live-in.
+      for (RegisterRef RR : EHRegs) {
+        NodeAddr<PhiNode*> PA = newPhi(BA);
+        uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+        // Add def:
+        NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+        PA.Addr->addMember(DA, *this);
+        // Add uses (no reaching defs for phi uses):
+        for (NodeAddr<BlockNode*> PBA : Preds) {
+          NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+          PA.Addr->addMember(PUA, *this);
+        }
+      }
+    }
+  }
+
+  // Build a map "PhiM" which will contain, for each block, the set
+  // of references that will require phi definitions in that block.
+  BlockRefsMap PhiM;
+  for (NodeAddr<BlockNode*> BA : Blocks)
+    recordDefsForDF(PhiM, RefM, BA);
+  for (NodeAddr<BlockNode*> BA : Blocks)
+    buildPhis(PhiM, RefM, BA);
+
+  // Link all the refs. This will recursively traverse the dominator tree.
+  DefStackMap DM;
+  linkBlockRefs(DM, EA);
+
+  // Finally, remove all unused phi nodes.
+  if (!(Options & BuildOptions::KeepDeadPhis))
+    removeUnusedPhis();
+}
+
+RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  if (Sub != 0)
+    Reg = TRI.getSubReg(Reg, Sub);
+  return RegisterRef(Reg);
+}
+
+RegisterRef DataFlowGraph::normalizeRef(RegisterRef RR) const {
+  // FIXME copied from RegisterAggr
+  RegisterId SuperReg = RR.Reg;
+  while (true) {
+    MCSuperRegIterator SR(SuperReg, &TRI, false);
+    if (!SR.isValid())
+      break;
+    SuperReg = *SR;
+  }
+
+  uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
+  const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
+  LaneBitmask SuperMask = RR.Mask &
+                          TRI.composeSubRegIndexLaneMask(Sub, RC.LaneMask);
+  return RegisterRef(SuperReg, SuperMask);
+}
+
+RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
+  if (AR.Reg == BR.Reg) {
+    LaneBitmask M = AR.Mask & BR.Mask;
+    return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
+  }
+#ifndef NDEBUG
+  RegisterRef NAR = normalizeRef(AR);
+  RegisterRef NBR = normalizeRef(BR);
+  assert(NAR.Reg != NBR.Reg);
+#endif
+  // This isn't strictly correct, because the overlap may happen in the
+  // part masked out.
+  if (TRI.regsOverlap(AR.Reg, BR.Reg))
+    return AR;
+  return RegisterRef();
+}
+
+// For each stack in the map DefM, push the delimiter for block B on it.
+void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) {
+  // Push block delimiters.
+  for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
+    I->second.start_block(B);
+}
+
+// Remove all definitions coming from block B from each stack in DefM.
+void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
+  // Pop all defs from this block from the definition stack. Defs that were
+  // added to the map during the traversal of instructions will not have a
+  // delimiter, but for those, the whole stack will be emptied.
+  for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I)
+    I->second.clear_block(B);
+
+  // Finally, remove empty stacks from the map.
+  for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) {
+    NextI = std::next(I);
+    // This preserves the validity of iterators other than I.
+    if (I->second.empty())
+      DefM.erase(I);
+  }
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+  NodeList Defs = IA.Addr->members_if(IsDef, *this);
+  NodeSet Visited;
+#ifndef NDEBUG
+  RegisterSet Defined;
+#endif
+
+  // The important objectives of this function are:
+  // - to be able to handle instructions both while the graph is being
+  //   constructed, and after the graph has been constructed, and
+  // - maintain proper ordering of definitions on the stack for each
+  //   register reference:
+  //   - if there are two or more related defs in IA (i.e. coming from
+  //     the same machine operand), then only push one def on the stack,
+  //   - if there are multiple unrelated defs of non-overlapping
+  //     subregisters of S, then the stack for S will have both (in an
+  //     unspecified order), but the order does not matter from the data-
+  //     -flow perspective.
+
+  for (NodeAddr<DefNode*> DA : Defs) {
+    if (Visited.count(DA.Id))
+      continue;
+
+    NodeList Rel = getRelatedRefs(IA, DA);
+    NodeAddr<DefNode*> PDA = Rel.front();
+    RegisterRef RR = PDA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+    // Assert if the register is defined in two or more unrelated defs.
+    // This could happen if there are two or more def operands defining it.
+    if (!Defined.insert(RR).second) {
+      MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+      dbgs() << "Multiple definitions of register: "
+             << Print<RegisterRef>(RR, *this) << " in\n  " << *MI
+             << "in BB#" << MI->getParent()->getNumber() << '\n';
+      llvm_unreachable(nullptr);
+    }
+#endif
+    // Push the definition on the stack for the register and all aliases.
+    // The def stack traversal in linkNodeUp will check the exact aliasing.
+    DefM[RR.Reg].push(DA);
+    for (RegisterRef A : getAliasSet(RR.Reg /*FIXME? use RegisterRef*/)) {
+      // Check that we don't push the same def twice.
+      assert(A != RR);
+      DefM[A.Reg].push(DA);
+    }
+    // Mark all the related defs as visited.
+    for (NodeAddr<NodeBase*> T : Rel)
+      Visited.insert(T.Id);
+  }
+}
+
+// Return the list of all reference nodes related to RA, including RA itself.
+// See "getNextRelated" for the meaning of a "related reference".
+NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  NodeList Refs;
+  NodeId Start = RA.Id;
+  do {
+    Refs.push_back(RA);
+    RA = getNextRelated(IA, RA);
+  } while (RA.Id != 0 && RA.Id != Start);
+  return Refs;
+}
+
+// Return true if RA and RB overlap, false otherwise.
+bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
+  assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
+  assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+
+  MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
+  MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
+  // Reg units are returned in the numerical order.
+  while (UMA.isValid() && UMB.isValid()) {
+    std::pair<uint32_t,LaneBitmask> PA = *UMA;
+    std::pair<uint32_t,LaneBitmask> PB = *UMB;
+    if (PA.first == PB.first) {
+      // Lane mask of 0 (given by the iterator) should be treated as "full".
+      // This can happen when the register has only one unit, or when the
+      // unit corresponds to explicit aliasing. In such cases, the lane mask
+      // from RegisterRef should be ignored.
+      if (PA.second.none() || PB.second.none())
+        return true;
+
+      // At this point the common unit corresponds to a subregister. The lane
+      // masks correspond to the lane mask of that unit within the original
+      // register, for example assuming register quadruple q0 = r3:0, and
+      // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
+      // while the lane mask of r2 in d1 may be 0b0001.
+      LaneBitmask LA = PA.second & RA.Mask;
+      LaneBitmask LB = PB.second & RB.Mask;
+      if (LA.any() && LB.any()) {
+        unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
+        // If register units were guaranteed to only have 1 bit in any lane
+        // mask, the code below would not be necessary. This is because LA
+        // and LB would have at most 1 bit set each, and that bit would be
+        // guaranteed to correspond to the given register unit.
+        uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
+        uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
+        const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(Root);
+        LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA);
+        LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB);
+        if ((MaskA & MaskB & RC.LaneMask).any())
+          return true;
+      }
+
+      ++UMA;
+      ++UMB;
+      continue;
+    }
+    if (PA.first < PB.first)
+      ++UMA;
+    else if (PB.first < PA.first)
+      ++UMB;
+  }
+  return false;
+}
+
+
+// Clear all information in the graph.
+void DataFlowGraph::reset() {
+  Memory.clear();
+  BlockNodes.clear();
+  Func = NodeAddr<FuncNode*>();
+}
+
+
+// Return the next reference node in the instruction node IA that is related
+// to RA. Conceptually, two reference nodes are related if they refer to the
+// same instance of a register access, but differ in flags or other minor
+// characteristics. Specific examples of related nodes are shadow reference
+// nodes.
+// Return the equivalent of nullptr if there are no more related references.
+NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  auto Related = [this,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (TA.Addr->getKind() != RA.Addr->getKind())
+      return false;
+    if (TA.Addr->getRegRef(*this) != RA.Addr->getRegRef(*this))
+      return false;
+    return true;
+  };
+  auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    return Related(TA) &&
+           &RA.Addr->getOp() == &TA.Addr->getOp();
+  };
+  auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool {
+    if (!Related(TA))
+      return false;
+    if (TA.Addr->getKind() != NodeAttrs::Use)
+      return true;
+    // For phi uses, compare predecessor blocks.
+    const NodeAddr<const PhiUseNode*> TUA = TA;
+    const NodeAddr<const PhiUseNode*> RUA = RA;
+    return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor();
+  };
+
+  RegisterRef RR = RA.Addr->getRegRef(*this);
+  if (IA.Addr->getKind() == NodeAttrs::Stmt)
+    return RA.Addr->getNextRef(RR, RelatedStmt, true, *this);
+  return RA.Addr->getNextRef(RR, RelatedPhi, true, *this);
+}
+
+// Find the next node related to RA in IA that satisfies condition P.
+// If such a node was found, return a pair where the second element is the
+// located node. If such a node does not exist, return a pair where the
+// first element is the element after which such a node should be inserted,
+// and the second element is a null-address.
+template <typename Predicate>
+std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+      Predicate P) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  NodeAddr<RefNode*> NA;
+  NodeId Start = RA.Id;
+  while (true) {
+    NA = getNextRelated(IA, RA);
+    if (NA.Id == 0 || NA.Id == Start)
+      break;
+    if (P(NA))
+      break;
+    RA = NA;
+  }
+
+  if (NA.Id != 0 && NA.Id != Start)
+    return std::make_pair(RA, NA);
+  return std::make_pair(RA, NodeAddr<RefNode*>());
+}
+
+// Get the next shadow node in IA corresponding to RA, and optionally create
+// such a node if it does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA, bool Create) {
+  assert(IA.Id != 0 && RA.Id != 0);
+
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  auto Loc = locateNextRef(IA, RA, IsShadow);
+  if (Loc.second.Id != 0 || !Create)
+    return Loc.second;
+
+  // Create a copy of RA and mark is as shadow.
+  NodeAddr<RefNode*> NA = cloneNode(RA);
+  NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
+  IA.Addr->addMemberAfter(Loc.first, NA, *this);
+  return NA;
+}
+
+// Get the next shadow node in IA corresponding to RA. Return null-address
+// if such a node does not exist.
+NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA,
+      NodeAddr<RefNode*> RA) const {
+  assert(IA.Id != 0 && RA.Id != 0);
+  uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+  auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getFlags() == Flags;
+  };
+  return locateNextRef(IA, RA, IsShadow).second;
+}
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction MI.
+void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
+  NodeAddr<StmtNode*> SA = newStmt(BA, &In);
+
+  auto isCall = [] (const MachineInstr &In) -> bool {
+    if (In.isCall())
+      return true;
+    // Is tail call?
+    if (In.isBranch())
+      for (const MachineOperand &Op : In.operands())
+        if (Op.isGlobal() || Op.isSymbol())
+          return true;
+    return false;
+  };
+
+  auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
+    // This instruction defines DR. Check if there is a use operand that
+    // would make DR live on entry to the instruction.
+    for (const MachineOperand &UseOp : In.operands()) {
+      if (!UseOp.isReg() || !UseOp.isUse() || UseOp.isUndef())
+        continue;
+      RegisterRef UR = makeRegRef(UseOp.getReg(), UseOp.getSubReg());
+      if (alias(DR, UR))
+        return false;
+    }
+    return true;
+  };
+
+  // Collect a set of registers that this instruction implicitly uses
+  // or defines. Implicit operands from an instruction will be ignored
+  // unless they are listed here.
+  RegisterSet ImpUses, ImpDefs;
+  if (const uint16_t *ImpD = In.getDesc().getImplicitDefs())
+    while (uint16_t R = *ImpD++)
+      ImpDefs.insert(RegisterRef(R));
+  if (const uint16_t *ImpU = In.getDesc().getImplicitUses())
+    while (uint16_t R = *ImpU++)
+      ImpUses.insert(RegisterRef(R));
+
+  bool IsCall = isCall(In);
+  bool NeedsImplicit = IsCall || In.isInlineAsm() || In.isReturn();
+  bool IsPredicated = TII.isPredicated(In);
+  unsigned NumOps = In.getNumOperands();
+
+  // Avoid duplicate implicit defs. This will not detect cases of implicit
+  // defs that define registers that overlap, but it is not clear how to
+  // interpret that in the absence of explicit defs. Overlapping explicit
+  // defs are likely illegal already.
+  RegisterSet DoneDefs;
+  // Process explicit defs first.
+  for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+    MachineOperand &Op = In.getOperand(OpN);
+    if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+      continue;
+    RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+    uint16_t Flags = NodeAttrs::None;
+    if (TOI.isPreserving(In, OpN)) {
+      Flags |= NodeAttrs::Preserving;
+      // If the def is preserving, check if it is also undefined.
+      if (isDefUndef(In, RR))
+        Flags |= NodeAttrs::Undef;
+    }
+    if (TOI.isClobbering(In, OpN))
+      Flags |= NodeAttrs::Clobbering;
+    if (TOI.isFixedReg(In, OpN))
+      Flags |= NodeAttrs::Fixed;
+    if (IsCall && Op.isDead())
+      Flags |= NodeAttrs::Dead;
+    NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+    SA.Addr->addMember(DA, *this);
+    DoneDefs.insert(RR);
+  }
+
+  // Process implicit defs, skipping those that have already been added
+  // as explicit.
+  for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+    MachineOperand &Op = In.getOperand(OpN);
+    if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
+      continue;
+    RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+    if (!NeedsImplicit && !ImpDefs.count(RR))
+      continue;
+    if (DoneDefs.count(RR))
+      continue;
+    uint16_t Flags = NodeAttrs::None;
+    if (TOI.isPreserving(In, OpN)) {
+      Flags |= NodeAttrs::Preserving;
+      // If the def is preserving, check if it is also undefined.
+      if (isDefUndef(In, RR))
+        Flags |= NodeAttrs::Undef;
+    }
+    if (TOI.isClobbering(In, OpN))
+      Flags |= NodeAttrs::Clobbering;
+    if (TOI.isFixedReg(In, OpN))
+      Flags |= NodeAttrs::Fixed;
+    if (IsCall && Op.isDead())
+      Flags |= NodeAttrs::Dead;
+    NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+    SA.Addr->addMember(DA, *this);
+    DoneDefs.insert(RR);
+  }
+
+  for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+    MachineOperand &Op = In.getOperand(OpN);
+    if (!Op.isReg() || !Op.isUse())
+      continue;
+    RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+    // Add implicit uses on return and call instructions, and on predicated
+    // instructions regardless of whether or not they appear in the instruction
+    // descriptor's list.
+    bool Implicit = Op.isImplicit();
+    bool TakeImplicit = NeedsImplicit || IsPredicated;
+    if (Implicit && !TakeImplicit && !ImpUses.count(RR))
+      continue;
+    uint16_t Flags = NodeAttrs::None;
+    if (Op.isUndef())
+      Flags |= NodeAttrs::Undef;
+    if (TOI.isFixedReg(In, OpN))
+      Flags |= NodeAttrs::Fixed;
+    NodeAddr<UseNode*> UA = newUse(SA, Op, Flags);
+    SA.Addr->addMember(UA, *this);
+  }
+}
+
+// Build a map that for each block will have the set of all references from
+// that block, and from all blocks dominated by it.
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA,
+      BlockRefsMap &RefM) {
+  RegisterSet &Refs = RefM[BA.Id];
+  MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+  assert(N);
+  for (auto I : *N) {
+    MachineBasicBlock *SB = I->getBlock();
+    NodeAddr<BlockNode*> SBA = findBlock(SB);
+    buildBlockRefs(SBA, RefM);
+    const RegisterSet &RefsS = RefM[SBA.Id];
+    Refs.insert(RefsS.begin(), RefsS.end());
+  }
+
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+    for (NodeAddr<RefNode*> RA : IA.Addr->members(*this))
+      Refs.insert(RA.Addr->getRegRef(*this));
+}
+
+// Scan all defs in the block node BA and record in PhiM the locations of
+// phi nodes corresponding to these defs.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check all defs from block BA and record them in each block in BA's
+  // iterated dominance frontier. This information will later be used to
+  // create phi nodes.
+  MachineBasicBlock *BB = BA.Addr->getCode();
+  assert(BB);
+  auto DFLoc = MDF.find(BB);
+  if (DFLoc == MDF.end() || DFLoc->second.empty())
+    return;
+
+  // Traverse all instructions in the block and collect the set of all
+  // defined references. For each reference there will be a phi created
+  // in the block's iterated dominance frontier.
+  // This is done to make sure that each defined reference gets only one
+  // phi node, even if it is defined multiple times.
+  RegisterSet Defs;
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this))
+    for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this))
+      Defs.insert(RA.Addr->getRegRef(*this));
+
+  // Calculate the iterated dominance frontier of BB.
+  const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
+  SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end());
+  for (unsigned i = 0; i < IDF.size(); ++i) {
+    auto F = MDF.find(IDF[i]);
+    if (F != MDF.end())
+      IDF.insert(F->second.begin(), F->second.end());
+  }
+
+  // Get the register references that are reachable from this block.
+  RegisterSet &Refs = RefM[BA.Id];
+  for (auto DB : IDF) {
+    NodeAddr<BlockNode*> DBA = findBlock(DB);
+    const RegisterSet &RefsD = RefM[DBA.Id];
+    Refs.insert(RefsD.begin(), RefsD.end());
+  }
+
+  // Finally, add the set of defs to each block in the iterated dominance
+  // frontier.
+  for (auto DB : IDF) {
+    NodeAddr<BlockNode*> DBA = findBlock(DB);
+    PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
+  }
+}
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+      NodeAddr<BlockNode*> BA) {
+  // Check if this blocks has any DF defs, i.e. if there are any defs
+  // that this block is in the iterated dominance frontier of.
+  auto HasDF = PhiM.find(BA.Id);
+  if (HasDF == PhiM.end() || HasDF->second.empty())
+    return;
+
+  // First, remove all R in Refs in such that there exists T in Refs
+  // such that T covers R. In other words, only leave those refs that
+  // are not covered by another ref (i.e. maximal with respect to covering).
+
+  auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
+    for (RegisterRef I : RRs)
+      if (I != RR && RegisterAggr::isCoverOf(I, RR, TRI))
+        RR = I;
+    return RR;
+  };
+
+  RegisterSet MaxDF;
+  for (RegisterRef I : HasDF->second)
+    MaxDF.insert(MaxCoverIn(I, HasDF->second));
+
+  std::vector<RegisterRef> MaxRefs;
+  RegisterSet &RefB = RefM[BA.Id];
+  for (RegisterRef I : MaxDF)
+    MaxRefs.push_back(MaxCoverIn(I, RefB));
+
+  // Now, for each R in MaxRefs, get the alias closure of R. If the closure
+  // only has R in it, create a phi a def for R. Otherwise, create a phi,
+  // and add a def for each S in the closure.
+
+  // Sort the refs so that the phis will be created in a deterministic order.
+  std::sort(MaxRefs.begin(), MaxRefs.end());
+  // Remove duplicates.
+  auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end());
+  MaxRefs.erase(NewEnd, MaxRefs.end());
+
+  auto Aliased = [this,&MaxRefs](RegisterRef RR,
+                                 std::vector<unsigned> &Closure) -> bool {
+    for (unsigned I : Closure)
+      if (alias(RR, MaxRefs[I]))
+        return true;
+    return false;
+  };
+
+  // Prepare a list of NodeIds of the block's predecessors.
+  NodeList Preds;
+  const MachineBasicBlock *MBB = BA.Addr->getCode();
+  for (MachineBasicBlock *PB : MBB->predecessors())
+    Preds.push_back(findBlock(PB));
+
+  while (!MaxRefs.empty()) {
+    // Put the first element in the closure, and then add all subsequent
+    // elements from MaxRefs to it, if they alias at least one element
+    // already in the closure.
+    // ClosureIdx: vector of indices in MaxRefs of members of the closure.
+    std::vector<unsigned> ClosureIdx = { 0 };
+    for (unsigned i = 1; i != MaxRefs.size(); ++i)
+      if (Aliased(MaxRefs[i], ClosureIdx))
+        ClosureIdx.push_back(i);
+
+    // Build a phi for the closure.
+    unsigned CS = ClosureIdx.size();
+    NodeAddr<PhiNode*> PA = newPhi(BA);
+
+    // Add defs.
+    for (unsigned X = 0; X != CS; ++X) {
+      RegisterRef RR = MaxRefs[ClosureIdx[X]];
+      uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+      NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
+      PA.Addr->addMember(DA, *this);
+    }
+    // Add phi uses.
+    for (NodeAddr<BlockNode*> PBA : Preds) {
+      for (unsigned X = 0; X != CS; ++X) {
+        RegisterRef RR = MaxRefs[ClosureIdx[X]];
+        NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA);
+        PA.Addr->addMember(PUA, *this);
+      }
+    }
+
+    // Erase from MaxRefs all elements in the closure.
+    auto Begin = MaxRefs.begin();
+    for (unsigned i = ClosureIdx.size(); i != 0; --i)
+      MaxRefs.erase(Begin + ClosureIdx[i-1]);
+  }
+}
+
+// Remove any unneeded phi nodes that were created during the build process.
+void DataFlowGraph::removeUnusedPhis() {
+  // This will remove unused phis, i.e. phis where each def does not reach
+  // any uses or other defs. This will not detect or remove circular phi
+  // chains that are otherwise dead. Unused/dead phis are created during
+  // the build process and this function is intended to remove these cases
+  // that are easily determinable to be unnecessary.
+
+  SetVector<NodeId> PhiQ;
+  for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) {
+    for (auto P : BA.Addr->members_if(IsPhi, *this))
+      PhiQ.insert(P.Id);
+  }
+
+  static auto HasUsedDef = [](NodeList &Ms) -> bool {
+    for (NodeAddr<NodeBase*> M : Ms) {
+      if (M.Addr->getKind() != NodeAttrs::Def)
+        continue;
+      NodeAddr<DefNode*> DA = M;
+      if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+        return true;
+    }
+    return false;
+  };
+
+  // Any phi, if it is removed, may affect other phis (make them dead).
+  // For each removed phi, collect the potentially affected phis and add
+  // them back to the queue.
+  while (!PhiQ.empty()) {
+    auto PA = addr<PhiNode*>(PhiQ[0]);
+    PhiQ.remove(PA.Id);
+    NodeList Refs = PA.Addr->members(*this);
+    if (HasUsedDef(Refs))
+      continue;
+    for (NodeAddr<RefNode*> RA : Refs) {
+      if (NodeId RD = RA.Addr->getReachingDef()) {
+        auto RDA = addr<DefNode*>(RD);
+        NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this);
+        if (IsPhi(OA))
+          PhiQ.insert(OA.Id);
+      }
+      if (RA.Addr->isDef())
+        unlinkDef(RA, true);
+      else
+        unlinkUse(RA, true);
+    }
+    NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this);
+    BA.Addr->removeMember(PA, *this);
+  }
+}
+
+// For a given reference node TA in an instruction node IA, connect the
+// reaching def of TA to the appropriate def node. Create any shadow nodes
+// as appropriate.
+template <typename T>
+void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
+      DefStack &DS) {
+  if (DS.empty())
+    return;
+  RegisterRef RR = TA.Addr->getRegRef(*this);
+  NodeAddr<T> TAP;
+
+  // References from the def stack that have been examined so far.
+  RegisterAggr Defs(TRI);
+
+  for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
+    RegisterRef QR = I->Addr->getRegRef(*this);
+
+    // Skip all defs that are aliased to any of the defs that we have already
+    // seen. If this completes a cover of RR, stop the stack traversal.
+    bool Alias = Defs.hasAliasOf(QR);
+    bool Cover = Defs.insert(QR).hasCoverOf(RR);
+    if (Alias) {
+      if (Cover)
+        break;
+      continue;
+    }
+
+    // The reaching def.
+    NodeAddr<DefNode*> RDA = *I;
+
+    // Pick the reached node.
+    if (TAP.Id == 0) {
+      TAP = TA;
+    } else {
+      // Mark the existing ref as "shadow" and create a new shadow.
+      TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow);
+      TAP = getNextShadow(IA, TAP, true);
+    }
+
+    // Create the link.
+    TAP.Addr->linkToDef(TAP.Id, RDA);
+
+    if (Cover)
+      break;
+  }
+}
+
+// Create data-flow links for all reference nodes in the statement node SA.
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) {
+#ifndef NDEBUG
+  RegisterSet Defs;
+#endif
+
+  // Link all nodes (upwards in the data-flow) with their reaching defs.
+  for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) {
+    uint16_t Kind = RA.Addr->getKind();
+    assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
+    RegisterRef RR = RA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+    // Do not expect multiple defs of the same reference.
+    assert(Kind != NodeAttrs::Def || !Defs.count(RR));
+    Defs.insert(RR);
+#endif
+
+    auto F = DefM.find(RR.Reg);
+    if (F == DefM.end())
+      continue;
+    DefStack &DS = F->second;
+    if (Kind == NodeAttrs::Use)
+      linkRefUp<UseNode*>(SA, RA, DS);
+    else if (Kind == NodeAttrs::Def)
+      linkRefUp<DefNode*>(SA, RA, DS);
+    else
+      llvm_unreachable("Unexpected node in instruction");
+  }
+}
+
+// Create data-flow links for all instructions in the block node BA. This
+// will include updating any phi nodes in BA.
+void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
+  // Push block delimiters.
+  markBlock(BA.Id, DefM);
+
+  assert(BA.Addr && "block node address is needed to create a data-flow link");
+  // For each non-phi instruction in the block, link all the defs and uses
+  // to their reaching defs. For any member of the block (including phis),
+  // push the defs on the corresponding stacks.
+  for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
+    // Ignore phi nodes here. They will be linked part by part from the
+    // predecessors.
+    if (IA.Addr->getKind() == NodeAttrs::Stmt)
+      linkStmtRefs(DefM, IA);
+
+    // Push the definitions on the stack.
+    pushDefs(IA, DefM);
+  }
+
+  // Recursively process all children in the dominator tree.
+  MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+  for (auto I : *N) {
+    MachineBasicBlock *SB = I->getBlock();
+    NodeAddr<BlockNode*> SBA = findBlock(SB);
+    linkBlockRefs(DefM, SBA);
+  }
+
+  // Link the phi uses from the successor blocks.
+  auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool {
+    if (NA.Addr->getKind() != NodeAttrs::Use)
+      return false;
+    assert(NA.Addr->getFlags() & NodeAttrs::PhiRef);
+    NodeAddr<PhiUseNode*> PUA = NA;
+    return PUA.Addr->getPredecessor() == BA.Id;
+  };
+
+  RegisterSet EHLiveIns = getLandingPadLiveIns();
+  MachineBasicBlock *MBB = BA.Addr->getCode();
+
+  for (MachineBasicBlock *SB : MBB->successors()) {
+    bool IsEHPad = SB->isEHPad();
+    NodeAddr<BlockNode*> SBA = findBlock(SB);
+    for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) {
+      // Do not link phi uses for landing pad live-ins.
+      if (IsEHPad) {
+        // Find what register this phi is for.
+        NodeAddr<RefNode*> RA = IA.Addr->getFirstMember(*this);
+        assert(RA.Id != 0);
+        if (EHLiveIns.count(RA.Addr->getRegRef(*this)))
+          continue;
+      }
+      // Go over each phi use associated with MBB, and link it.
+      for (auto U : IA.Addr->members_if(IsUseForBA, *this)) {
+        NodeAddr<PhiUseNode*> PUA = U;
+        RegisterRef RR = PUA.Addr->getRegRef(*this);
+        linkRefUp<UseNode*>(IA, PUA, DefM[RR.Reg]);
+      }
+    }
+  }
+
+  // Pop all defs from this block from the definition stacks.
+  releaseBlock(BA.Id, DefM);
+}
+
+// Remove the use node UA from any data-flow and structural links.
+void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) {
+  NodeId RD = UA.Addr->getReachingDef();
+  NodeId Sib = UA.Addr->getSibling();
+
+  if (RD == 0) {
+    assert(Sib == 0);
+    return;
+  }
+
+  auto RDA = addr<DefNode*>(RD);
+  auto TA = addr<UseNode*>(RDA.Addr->getReachedUse());
+  if (TA.Id == UA.Id) {
+    RDA.Addr->setReachedUse(Sib);
+    return;
+  }
+
+  while (TA.Id != 0) {
+    NodeId S = TA.Addr->getSibling();
+    if (S == UA.Id) {
+      TA.Addr->setSibling(UA.Addr->getSibling());
+      return;
+    }
+    TA = addr<UseNode*>(S);
+  }
+}
+
+// Remove the def node DA from any data-flow and structural links.
+void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) {
+  //
+  //         RD
+  //         | reached
+  //         | def
+  //         :
+  //         .
+  //        +----+
+  // ... -- | DA | -- ... -- 0  : sibling chain of DA
+  //        +----+
+  //         |  | reached
+  //         |  : def
+  //         |  .
+  //         | ...  : Siblings (defs)
+  //         |
+  //         : reached
+  //         . use
+  //        ... : sibling chain of reached uses
+
+  NodeId RD = DA.Addr->getReachingDef();
+
+  // Visit all siblings of the reached def and reset their reaching defs.
+  // Also, defs reached by DA are now "promoted" to being reached by RD,
+  // so all of them will need to be spliced into the sibling chain where
+  // DA belongs.
+  auto getAllNodes = [this] (NodeId N) -> NodeList {
+    NodeList Res;
+    while (N) {
+      auto RA = addr<RefNode*>(N);
+      // Keep the nodes in the exact sibling order.
+      Res.push_back(RA);
+      N = RA.Addr->getSibling();
+    }
+    return Res;
+  };
+  NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef());
+  NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse());
+
+  if (RD == 0) {
+    for (NodeAddr<RefNode*> I : ReachedDefs)
+      I.Addr->setSibling(0);
+    for (NodeAddr<RefNode*> I : ReachedUses)
+      I.Addr->setSibling(0);
+  }
+  for (NodeAddr<DefNode*> I : ReachedDefs)
+    I.Addr->setReachingDef(RD);
+  for (NodeAddr<UseNode*> I : ReachedUses)
+    I.Addr->setReachingDef(RD);
+
+  NodeId Sib = DA.Addr->getSibling();
+  if (RD == 0) {
+    assert(Sib == 0);
+    return;
+  }
+
+  // Update the reaching def node and remove DA from the sibling list.
+  auto RDA = addr<DefNode*>(RD);
+  auto TA = addr<DefNode*>(RDA.Addr->getReachedDef());
+  if (TA.Id == DA.Id) {
+    // If DA is the first reached def, just update the RD's reached def
+    // to the DA's sibling.
+    RDA.Addr->setReachedDef(Sib);
+  } else {
+    // Otherwise, traverse the sibling list of the reached defs and remove
+    // DA from it.
+    while (TA.Id != 0) {
+      NodeId S = TA.Addr->getSibling();
+      if (S == DA.Id) {
+        TA.Addr->setSibling(Sib);
+        break;
+      }
+      TA = addr<DefNode*>(S);
+    }
+  }
+
+  // Splice the DA's reached defs into the RDA's reached def chain.
+  if (!ReachedDefs.empty()) {
+    auto Last = NodeAddr<DefNode*>(ReachedDefs.back());
+    Last.Addr->setSibling(RDA.Addr->getReachedDef());
+    RDA.Addr->setReachedDef(ReachedDefs.front().Id);
+  }
+  // Splice the DA's reached uses into the RDA's reached use chain.
+  if (!ReachedUses.empty()) {
+    auto Last = NodeAddr<UseNode*>(ReachedUses.back());
+    Last.Addr->setSibling(RDA.Addr->getReachedUse());
+    RDA.Addr->setReachedUse(ReachedUses.front().Id);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
new file mode 100644
index 000000000000..871062ff2b05
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
@@ -0,0 +1,997 @@
+//===--- RDFGraph.h -------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF)
+// for a non-SSA program representation (e.g. post-RA machine code).
+//
+//
+// *** Introduction
+//
+// The RDF graph is a collection of nodes, each of which denotes some element
+// of the program. There are two main types of such elements: code and refe-
+// rences. Conceptually, "code" is something that represents the structure
+// of the program, e.g. basic block or a statement, while "reference" is an
+// instance of accessing a register, e.g. a definition or a use. Nodes are
+// connected with each other based on the structure of the program (such as
+// blocks, instructions, etc.), and based on the data flow (e.g. reaching
+// definitions, reached uses, etc.). The single-reaching-definition principle
+// of SSA is generally observed, although, due to the non-SSA representation
+// of the program, there are some differences between the graph and a "pure"
+// SSA representation.
+//
+//
+// *** Implementation remarks
+//
+// Since the graph can contain a large number of nodes, memory consumption
+// was one of the major design considerations. As a result, there is a single
+// base class NodeBase which defines all members used by all possible derived
+// classes. The members are arranged in a union, and a derived class cannot
+// add any data members of its own. Each derived class only defines the
+// functional interface, i.e. member functions. NodeBase must be a POD,
+// which implies that all of its members must also be PODs.
+// Since nodes need to be connected with other nodes, pointers have been
+// replaced with 32-bit identifiers: each node has an id of type NodeId.
+// There are mapping functions in the graph that translate between actual
+// memory addresses and the corresponding identifiers.
+// A node id of 0 is equivalent to nullptr.
+//
+//
+// *** Structure of the graph
+//
+// A code node is always a collection of other nodes. For example, a code
+// node corresponding to a basic block will contain code nodes corresponding
+// to instructions. In turn, a code node corresponding to an instruction will
+// contain a list of reference nodes that correspond to the definitions and
+// uses of registers in that instruction. The members are arranged into a
+// circular list, which is yet another consequence of the effort to save
+// memory: for each member node it should be possible to obtain its owner,
+// and it should be possible to access all other members. There are other
+// ways to accomplish that, but the circular list seemed the most natural.
+//
+// +- CodeNode -+
+// |            | <---------------------------------------------------+
+// +-+--------+-+                                                     |
+//   |FirstM  |LastM                                                  |
+//   |        +-------------------------------------+                 |
+//   |                                              |                 |
+//   V                                              V                 |
+//  +----------+ Next +----------+ Next       Next +----------+ Next  |
+//  |          |----->|          |-----> ... ----->|          |----->-+
+//  +- Member -+      +- Member -+                 +- Member -+
+//
+// The order of members is such that related reference nodes (see below)
+// should be contiguous on the member list.
+//
+// A reference node is a node that encapsulates an access to a register,
+// in other words, data flowing into or out of a register. There are two
+// major kinds of reference nodes: defs and uses. A def node will contain
+// the id of the first reached use, and the id of the first reached def.
+// Each def and use will contain the id of the reaching def, and also the
+// id of the next reached def (for def nodes) or use (for use nodes).
+// The "next node sharing the same reaching def" is denoted as "sibling".
+// In summary:
+// - Def node contains: reaching def, sibling, first reached def, and first
+// reached use.
+// - Use node contains: reaching def and sibling.
+//
+// +-- DefNode --+
+// | R2 = ...    | <---+--------------------+
+// ++---------+--+     |                    |
+//  |Reached  |Reached |                    |
+//  |Def      |Use     |                    |
+//  |         |        |Reaching            |Reaching
+//  |         V        |Def                 |Def
+//  |      +-- UseNode --+ Sib  +-- UseNode --+ Sib       Sib
+//  |      | ... = R2    |----->| ... = R2    |----> ... ----> 0
+//  |      +-------------+      +-------------+
+//  V
+// +-- DefNode --+ Sib
+// | R2 = ...    |----> ...
+// ++---------+--+
+//  |         |
+//  |         |
+// ...       ...
+//
+// To get a full picture, the circular lists connecting blocks within a
+// function, instructions within a block, etc. should be superimposed with
+// the def-def, def-use links shown above.
+// To illustrate this, consider a small example in a pseudo-assembly:
+// foo:
+//   add r2, r0, r1   ; r2 = r0+r1
+//   addi r0, r2, 1   ; r0 = r2+1
+//   ret r0           ; return value in r0
+//
+// The graph (in a format used by the debugging functions) would look like:
+//
+//   DFG dump:[
+//   f1: Function foo
+//   b2: === BB#0 === preds(0), succs(0):
+//   p3: phi [d4<r0>(,d12,u9):]
+//   p5: phi [d6<r1>(,,u10):]
+//   s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):]
+//   s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):]
+//   s14: ret [u15<r0>(d12):]
+//   ]
+//
+// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the
+// kind of the node (i.e. f - function, b - basic block, p - phi, s - state-
+// ment, d - def, u - use).
+// The format of a def node is:
+//   dN<R>(rd,d,u):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being defined
+//   rd  - reaching def,
+//   d   - reached def,
+//   u   - reached use,
+//   sib - sibling.
+// The format of a use node is:
+//   uN<R>[!](rd):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being used,
+//   rd  - reaching def,
+//   sib - sibling.
+// Possible annotations (usually preceding the node id):
+//   +   - preserving def,
+//   ~   - clobbering def,
+//   "   - shadow ref (follows the node id),
+//   !   - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi:   Phi node, members are reference nodes.
+// - Stmt:  Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func:  The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+//   preserve some of the original bits among those that are included in
+//   the register associated with that def. For example, if R0 is a 32-bit
+//   register, but a def can only change the lower 16 bits, then it will
+//   be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+//   defs (see more below).
+// - Clobbering: applied only to defs, indicates that the value generated
+//   by this def is unspecified. A typical example would be volatile registers
+//   after function calls.
+// - Fixed: the register in this def/use cannot be replaced with any other
+//   register. A typical case would be a parameter register to a call, or
+//   the register with the return value from a function.
+// - Undef: the register in this reference the register is assumed to have
+//   no pre-existing value, even if it appears to be reached by some def.
+//   This is typically used to prevent keeping registers artificially live
+//   in cases when they are defined via predicated instructions. For example:
+//     r0 = add-if-true cond, r10, r11                (1)
+//     r0 = add-if-false cond, r12, r13, r0<imp-use>  (2)
+//     ... = r0                                       (3)
+//   Before (1), r0 is not intended to be live, and the use of r0 in (3) is
+//   not meant to be reached by any def preceding (1). However, since the
+//   defs in (1) and (2) are both preserving, these properties alone would
+//   imply that the use in (3) may indeed be reached by some prior def.
+//   Adding Undef flag to the def in (1) prevents that. The Undef flag
+//   may be applied to both defs and uses.
+// - Dead: applies only to defs. The value coming out of a "dead" def is
+//   assumed to be unused, even if the def appears to be reaching other defs
+//   or uses. The motivation for this flag comes from dead defs on function
+//   calls: there is no way to determine if such a def is dead without
+//   analyzing the target's ABI. Hence the graph should contain this info,
+//   as it is unavailable otherwise. On the other hand, a def without any
+//   uses on a typical instruction is not the intended target for this flag.
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
+//   set r0, 1        ; r0 = 1
+//   set r1, 1        ; r1 = 1
+//   addi t1, t0, 1   ; t1 = t0+1
+//
+// The DFG:
+//   s1: set [d2<r0>(,,u9):]
+//   s3: set [d4<r1>(,,u10):]
+//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u9". The quotation
+// mark " indicates that the node is a shadow.
+//
+#ifndef RDF_GRAPH_H
+#define RDF_GRAPH_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <functional>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+// RDF uses uint32_t to refer to registers. This is to ensure that the type
+// size remains specific. In other places, registers are often stored using
+// unsigned.
+static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
+
+namespace llvm {
+  class MachineBasicBlock;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineOperand;
+  class MachineDominanceFrontier;
+  class MachineDominatorTree;
+  class TargetInstrInfo;
+
+namespace rdf {
+  typedef uint32_t NodeId;
+  typedef uint32_t RegisterId;
+
+  struct DataFlowGraph;
+
+  struct NodeAttrs {
+    enum : uint16_t {
+      None          = 0x0000,   // Nothing
+
+      // Types: 2 bits
+      TypeMask      = 0x0003,
+      Code          = 0x0001,   // 01, Container
+      Ref           = 0x0002,   // 10, Reference
+
+      // Kind: 3 bits
+      KindMask      = 0x0007 << 2,
+      Def           = 0x0001 << 2,  // 001
+      Use           = 0x0002 << 2,  // 010
+      Phi           = 0x0003 << 2,  // 011
+      Stmt          = 0x0004 << 2,  // 100
+      Block         = 0x0005 << 2,  // 101
+      Func          = 0x0006 << 2,  // 110
+
+      // Flags: 7 bits for now
+      FlagMask      = 0x007F << 5,
+      Shadow        = 0x0001 << 5,  // 0000001, Has extra reaching defs.
+      Clobbering    = 0x0002 << 5,  // 0000010, Produces unspecified values.
+      PhiRef        = 0x0004 << 5,  // 0000100, Member of PhiNode.
+      Preserving    = 0x0008 << 5,  // 0001000, Def can keep original bits.
+      Fixed         = 0x0010 << 5,  // 0010000, Fixed register.
+      Undef         = 0x0020 << 5,  // 0100000, Has no pre-existing value.
+      Dead          = 0x0040 << 5,  // 1000000, Does not define a value.
+    };
+
+    static uint16_t type(uint16_t T)  { return T & TypeMask; }
+    static uint16_t kind(uint16_t T)  { return T & KindMask; }
+    static uint16_t flags(uint16_t T) { return T & FlagMask; }
+
+    static uint16_t set_type(uint16_t A, uint16_t T) {
+      return (A & ~TypeMask) | T;
+    }
+    static uint16_t set_kind(uint16_t A, uint16_t K) {
+      return (A & ~KindMask) | K;
+    }
+    static uint16_t set_flags(uint16_t A, uint16_t F) {
+      return (A & ~FlagMask) | F;
+    }
+
+    // Test if A contains B.
+    static bool contains(uint16_t A, uint16_t B) {
+      if (type(A) != Code)
+        return false;
+      uint16_t KB = kind(B);
+      switch (kind(A)) {
+        case Func:
+          return KB == Block;
+        case Block:
+          return KB == Phi || KB == Stmt;
+        case Phi:
+        case Stmt:
+          return type(B) == Ref;
+      }
+      return false;
+    }
+  };
+
+  struct BuildOptions {
+    enum : unsigned {
+      None          = 0x00,
+      KeepDeadPhis  = 0x01,   // Do not remove dead phis during build.
+    };
+  };
+
+  template <typename T> struct NodeAddr {
+    NodeAddr() : Addr(nullptr), Id(0) {}
+    NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
+
+    bool operator== (const NodeAddr<T> &NA) const {
+      assert((Addr == NA.Addr) == (Id == NA.Id));
+      return Addr == NA.Addr;
+    }
+    bool operator!= (const NodeAddr<T> &NA) const {
+      return !operator==(NA);
+    }
+    // Type cast (casting constructor). The reason for having this class
+    // instead of std::pair.
+    template <typename S> NodeAddr(const NodeAddr<S> &NA)
+      : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
+
+    T Addr;
+    NodeId Id;
+  };
+
+  struct NodeBase;
+
+  // Fast memory allocation and translation between node id and node address.
+  // This is really the same idea as the one underlying the "bump pointer
+  // allocator", the difference being in the translation. A node id is
+  // composed of two components: the index of the block in which it was
+  // allocated, and the index within the block. With the default settings,
+  // where the number of nodes per block is 4096, the node id (minus 1) is:
+  //
+  // bit position:                11             0
+  // +----------------------------+--------------+
+  // | Index of the block         |Index in block|
+  // +----------------------------+--------------+
+  //
+  // The actual node id is the above plus 1, to avoid creating a node id of 0.
+  //
+  // This method significantly improved the build time, compared to using maps
+  // (std::unordered_map or DenseMap) to translate between pointers and ids.
+  struct NodeAllocator {
+    // Amount of storage for a single node.
+    enum { NodeMemSize = 32 };
+    NodeAllocator(uint32_t NPB = 4096)
+        : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
+          IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) {
+      assert(isPowerOf2_32(NPB));
+    }
+    NodeBase *ptr(NodeId N) const {
+      uint32_t N1 = N-1;
+      uint32_t BlockN = N1 >> BitsPerIndex;
+      uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
+      return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
+    }
+    NodeId id(const NodeBase *P) const;
+    NodeAddr<NodeBase*> New();
+    void clear();
+
+  private:
+    void startNewBlock();
+    bool needNewBlock();
+    uint32_t makeId(uint32_t Block, uint32_t Index) const {
+      // Add 1 to the id, to avoid the id of 0, which is treated as "null".
+      return ((Block << BitsPerIndex) | Index) + 1;
+    }
+
+    const uint32_t NodesPerBlock;
+    const uint32_t BitsPerIndex;
+    const uint32_t IndexMask;
+    char *ActiveEnd;
+    std::vector<char*> Blocks;
+    typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy;
+    AllocatorTy MemPool;
+  };
+
+  struct RegisterRef {
+    RegisterId Reg;
+    LaneBitmask Mask;
+
+    RegisterRef() : RegisterRef(0) {}
+    explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
+      : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
+    operator bool() const { return Reg != 0 && Mask.any(); }
+    bool operator== (const RegisterRef &RR) const {
+      return Reg == RR.Reg && Mask == RR.Mask;
+    }
+    bool operator!= (const RegisterRef &RR) const {
+      return !operator==(RR);
+    }
+    bool operator< (const RegisterRef &RR) const {
+      return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
+    }
+  };
+  typedef std::set<RegisterRef> RegisterSet;
+
+  struct TargetOperandInfo {
+    TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
+    virtual ~TargetOperandInfo() {}
+    virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
+    virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
+    virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
+
+    const TargetInstrInfo &TII;
+  };
+
+
+  // Packed register reference. Only used for storage.
+  struct PackedRegisterRef {
+    RegisterId Reg;
+    uint32_t MaskId;
+  };
+
+  // Template class for a map translating uint32_t into arbitrary types.
+  // The map will act like an indexed set: upon insertion of a new object,
+  // it will automatically assign a new index to it. Index of 0 is treated
+  // as invalid and is never allocated.
+  template <typename T, unsigned N = 32>
+  struct IndexedSet {
+    IndexedSet() : Map() { Map.reserve(N); }
+    T get(uint32_t Idx) const {
+      // Index Idx corresponds to Map[Idx-1].
+      assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
+      return Map[Idx-1];
+    }
+    uint32_t insert(T Val) {
+      // Linear search.
+      auto F = llvm::find(Map, Val);
+      if (F != Map.end())
+        return F - Map.begin() + 1;
+      Map.push_back(Val);
+      return Map.size();  // Return actual_index + 1.
+    }
+    uint32_t find(T Val) const {
+      auto F = llvm::find(Map, Val);
+      assert(F != Map.end());
+      return F - Map.begin();
+    }
+  private:
+    std::vector<T> Map;
+  };
+
+  struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
+    LaneMaskIndex() = default;
+
+    LaneBitmask getLaneMaskForIndex(uint32_t K) const {
+      return K == 0 ? LaneBitmask::getAll() : get(K);
+    }
+    uint32_t getIndexForLaneMask(LaneBitmask LM) {
+      assert(LM.any());
+      return LM.all() ? 0 : insert(LM);
+    }
+    uint32_t getIndexForLaneMask(LaneBitmask LM) const {
+      assert(LM.any());
+      return LM.all() ? 0 : find(LM);
+    }
+    PackedRegisterRef pack(RegisterRef RR) {
+      return { RR.Reg, getIndexForLaneMask(RR.Mask) };
+    }
+    PackedRegisterRef pack(RegisterRef RR) const {
+      return { RR.Reg, getIndexForLaneMask(RR.Mask) };
+    }
+    RegisterRef unpack(PackedRegisterRef PR) const {
+      return RegisterRef(PR.Reg, getLaneMaskForIndex(PR.MaskId));
+    }
+  };
+
+  struct RegisterAggr {
+    RegisterAggr(const TargetRegisterInfo &tri)
+        : Masks(), ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false),
+          TRI(tri) {}
+    RegisterAggr(const RegisterAggr &RG)
+        : Masks(RG.Masks), ExpAliasUnits(RG.ExpAliasUnits),
+          CheckUnits(RG.CheckUnits), TRI(RG.TRI) {}
+
+    bool empty() const { return Masks.empty(); }
+    bool hasAliasOf(RegisterRef RR) const;
+    bool hasCoverOf(RegisterRef RR) const;
+    static bool isCoverOf(RegisterRef RA, RegisterRef RB,
+                          const TargetRegisterInfo &TRI) {
+      return RegisterAggr(TRI).insert(RA).hasCoverOf(RB);
+    }
+
+    RegisterAggr &insert(RegisterRef RR);
+    RegisterAggr &insert(const RegisterAggr &RG);
+    RegisterAggr &clear(RegisterRef RR);
+    RegisterAggr &clear(const RegisterAggr &RG);
+
+    RegisterRef clearIn(RegisterRef RR) const;
+
+    void print(raw_ostream &OS) const;
+
+  private:
+    typedef std::unordered_map<RegisterId, LaneBitmask> MapType;
+
+  public:
+    typedef MapType::const_iterator iterator;
+    iterator begin() const { return Masks.begin(); }
+    iterator end() const { return Masks.end(); }
+    RegisterRef normalize(RegisterRef RR) const;
+
+  private:
+    MapType Masks;
+    BitVector ExpAliasUnits; // Register units for explicit aliases.
+    bool CheckUnits;
+    const TargetRegisterInfo &TRI;
+  };
+
+
+  struct NodeBase {
+  public:
+    // Make sure this is a POD.
+    NodeBase() = default;
+    uint16_t getType()  const { return NodeAttrs::type(Attrs); }
+    uint16_t getKind()  const { return NodeAttrs::kind(Attrs); }
+    uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
+    NodeId   getNext()  const { return Next; }
+
+    uint16_t getAttrs() const { return Attrs; }
+    void setAttrs(uint16_t A) { Attrs = A; }
+    void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); }
+
+    // Insert node NA after "this" in the circular chain.
+    void append(NodeAddr<NodeBase*> NA);
+    // Initialize all members to 0.
+    void init() { memset(this, 0, sizeof *this); }
+    void setNext(NodeId N) { Next = N; }
+
+  protected:
+    uint16_t Attrs;
+    uint16_t Reserved;
+    NodeId Next;                // Id of the next node in the circular chain.
+    // Definitions of nested types. Using anonymous nested structs would make
+    // this class definition clearer, but unnamed structs are not a part of
+    // the standard.
+    struct Def_struct  {
+      NodeId DD, DU;          // Ids of the first reached def and use.
+    };
+    struct PhiU_struct  {
+      NodeId PredB;           // Id of the predecessor block for a phi use.
+    };
+    struct Code_struct {
+      void *CP;               // Pointer to the actual code.
+      NodeId FirstM, LastM;   // Id of the first member and last.
+    };
+    struct Ref_struct {
+      NodeId RD, Sib;         // Ids of the reaching def and the sibling.
+      union {
+        Def_struct Def;
+        PhiU_struct PhiU;
+      };
+      union {
+        MachineOperand *Op;   // Non-phi refs point to a machine operand.
+        PackedRegisterRef PR; // Phi refs store register info directly.
+      };
+    };
+
+    // The actual payload.
+    union {
+      Ref_struct Ref;
+      Code_struct Code;
+    };
+  };
+  // The allocator allocates chunks of 32 bytes for each node. The fact that
+  // each node takes 32 bytes in memory is used for fast translation between
+  // the node id and the node address.
+  static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
+        "NodeBase must be at most NodeAllocator::NodeMemSize bytes");
+
+  typedef std::vector<NodeAddr<NodeBase*>> NodeList;
+  typedef std::set<NodeId> NodeSet;
+
+  struct RefNode : public NodeBase {
+    RefNode() = default;
+    RegisterRef getRegRef(const DataFlowGraph &G) const;
+    MachineOperand &getOp() {
+      assert(!(getFlags() & NodeAttrs::PhiRef));
+      return *Ref.Op;
+    }
+    void setRegRef(RegisterRef RR, DataFlowGraph &G);
+    void setRegRef(MachineOperand *Op, DataFlowGraph &G);
+    NodeId getReachingDef() const {
+      return Ref.RD;
+    }
+    void setReachingDef(NodeId RD) {
+      Ref.RD = RD;
+    }
+    NodeId getSibling() const {
+      return Ref.Sib;
+    }
+    void setSibling(NodeId Sib) {
+      Ref.Sib = Sib;
+    }
+    bool isUse() const {
+      assert(getType() == NodeAttrs::Ref);
+      return getKind() == NodeAttrs::Use;
+    }
+    bool isDef() const {
+      assert(getType() == NodeAttrs::Ref);
+      return getKind() == NodeAttrs::Def;
+    }
+
+    template <typename Predicate>
+    NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly,
+        const DataFlowGraph &G);
+    NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+  };
+
+  struct DefNode : public RefNode {
+    NodeId getReachedDef() const {
+      return Ref.Def.DD;
+    }
+    void setReachedDef(NodeId D) {
+      Ref.Def.DD = D;
+    }
+    NodeId getReachedUse() const {
+      return Ref.Def.DU;
+    }
+    void setReachedUse(NodeId U) {
+      Ref.Def.DU = U;
+    }
+
+    void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+  };
+
+  struct UseNode : public RefNode {
+    void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
+  };
+
+  struct PhiUseNode : public UseNode {
+    NodeId getPredecessor() const {
+      assert(getFlags() & NodeAttrs::PhiRef);
+      return Ref.PhiU.PredB;
+    }
+    void setPredecessor(NodeId B) {
+      assert(getFlags() & NodeAttrs::PhiRef);
+      Ref.PhiU.PredB = B;
+    }
+  };
+
+  struct CodeNode : public NodeBase {
+    template <typename T> T getCode() const {
+      return static_cast<T>(Code.CP);
+    }
+    void setCode(void *C) {
+      Code.CP = C;
+    }
+
+    NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const;
+    NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const;
+    void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+    void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
+        const DataFlowGraph &G);
+    void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
+
+    NodeList members(const DataFlowGraph &G) const;
+    template <typename Predicate>
+    NodeList members_if(Predicate P, const DataFlowGraph &G) const;
+  };
+
+  struct InstrNode : public CodeNode {
+    NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
+  };
+
+  struct PhiNode : public InstrNode {
+    MachineInstr *getCode() const {
+      return nullptr;
+    }
+  };
+
+  struct StmtNode : public InstrNode {
+    MachineInstr *getCode() const {
+      return CodeNode::getCode<MachineInstr*>();
+    }
+  };
+
+  struct BlockNode : public CodeNode {
+    MachineBasicBlock *getCode() const {
+      return CodeNode::getCode<MachineBasicBlock*>();
+    }
+    void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
+  };
+
+  struct FuncNode : public CodeNode {
+    MachineFunction *getCode() const {
+      return CodeNode::getCode<MachineFunction*>();
+    }
+    NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
+        const DataFlowGraph &G) const;
+    NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
+  };
+
+  struct DataFlowGraph {
+    DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+        const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
+        const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi);
+
+    NodeBase *ptr(NodeId N) const;
+    template <typename T> T ptr(NodeId N) const {
+      return static_cast<T>(ptr(N));
+    }
+    NodeId id(const NodeBase *P) const;
+
+    template <typename T> NodeAddr<T> addr(NodeId N) const {
+      return { ptr<T>(N), N };
+    }
+
+    NodeAddr<FuncNode*> getFunc() const { return Func; }
+    MachineFunction &getMF() const { return MF; }
+    const TargetInstrInfo &getTII() const { return TII; }
+    const TargetRegisterInfo &getTRI() const { return TRI; }
+    const MachineDominatorTree &getDT() const { return MDT; }
+    const MachineDominanceFrontier &getDF() const { return MDF; }
+
+    struct DefStack {
+      DefStack() = default;
+      bool empty() const { return Stack.empty() || top() == bottom(); }
+    private:
+      typedef NodeAddr<DefNode*> value_type;
+      struct Iterator {
+        typedef DefStack::value_type value_type;
+        Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
+        Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
+        value_type operator*() const {
+          assert(Pos >= 1);
+          return DS.Stack[Pos-1];
+        }
+        const value_type *operator->() const {
+          assert(Pos >= 1);
+          return &DS.Stack[Pos-1];
+        }
+        bool operator==(const Iterator &It) const { return Pos == It.Pos; }
+        bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
+      private:
+        Iterator(const DefStack &S, bool Top);
+        // Pos-1 is the index in the StorageType object that corresponds to
+        // the top of the DefStack.
+        const DefStack &DS;
+        unsigned Pos;
+        friend struct DefStack;
+      };
+    public:
+      typedef Iterator iterator;
+      iterator top() const { return Iterator(*this, true); }
+      iterator bottom() const { return Iterator(*this, false); }
+      unsigned size() const;
+
+      void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); }
+      void pop();
+      void start_block(NodeId N);
+      void clear_block(NodeId N);
+    private:
+      friend struct Iterator;
+      typedef std::vector<value_type> StorageType;
+      bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
+        return (P.Addr == nullptr) && (N == 0 || P.Id == N);
+      }
+      unsigned nextUp(unsigned P) const;
+      unsigned nextDown(unsigned P) const;
+      StorageType Stack;
+    };
+
+    // Make this std::unordered_map for speed of accessing elements.
+    // Map: Register (physical or virtual) -> DefStack
+    typedef std::unordered_map<RegisterId,DefStack> DefStackMap;
+
+    void build(unsigned Options = BuildOptions::None);
+    void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+    void markBlock(NodeId B, DefStackMap &DefM);
+    void releaseBlock(NodeId B, DefStackMap &DefM);
+
+    PackedRegisterRef pack(RegisterRef RR)       { return LMI.pack(RR); }
+    PackedRegisterRef pack(RegisterRef RR) const { return LMI.pack(RR); }
+    RegisterRef unpack(PackedRegisterRef PR) const { return LMI.unpack(PR); }
+    RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
+    RegisterRef normalizeRef(RegisterRef RR) const;
+    RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
+
+    NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA) const;
+    NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA, bool Create);
+    NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA) const;
+    NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA, bool Create);
+    NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA) const;
+
+    NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
+        NodeAddr<RefNode*> RA) const;
+
+    void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
+      unlinkUseDF(UA);
+      if (RemoveFromOwner)
+        removeFromOwner(UA);
+    }
+    void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
+      unlinkDefDF(DA);
+      if (RemoveFromOwner)
+        removeFromOwner(DA);
+    }
+
+    // Some useful filters.
+    template <uint16_t Kind>
+    static bool IsRef(const NodeAddr<NodeBase*> BA) {
+      return BA.Addr->getType() == NodeAttrs::Ref &&
+             BA.Addr->getKind() == Kind;
+    }
+    template <uint16_t Kind>
+    static bool IsCode(const NodeAddr<NodeBase*> BA) {
+      return BA.Addr->getType() == NodeAttrs::Code &&
+             BA.Addr->getKind() == Kind;
+    }
+    static bool IsDef(const NodeAddr<NodeBase*> BA) {
+      return BA.Addr->getType() == NodeAttrs::Ref &&
+             BA.Addr->getKind() == NodeAttrs::Def;
+    }
+    static bool IsUse(const NodeAddr<NodeBase*> BA) {
+      return BA.Addr->getType() == NodeAttrs::Ref &&
+             BA.Addr->getKind() == NodeAttrs::Use;
+    }
+    static bool IsPhi(const NodeAddr<NodeBase*> BA) {
+      return BA.Addr->getType() == NodeAttrs::Code &&
+             BA.Addr->getKind() == NodeAttrs::Phi;
+    }
+    static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
+      uint16_t Flags = DA.Addr->getFlags();
+      return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
+    }
+
+    // Register aliasing.
+    bool alias(RegisterRef RA, RegisterRef RB) const;
+
+  private:
+    void reset();
+
+    RegisterSet getAliasSet(RegisterId Reg) const;
+    RegisterSet getLandingPadLiveIns() const;
+
+    NodeAddr<NodeBase*> newNode(uint16_t Attrs);
+    NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B);
+    NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner,
+        MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+    NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner,
+        RegisterRef RR, NodeAddr<BlockNode*> PredB,
+        uint16_t Flags = NodeAttrs::PhiRef);
+    NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+        MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
+    NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
+        RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef);
+    NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner);
+    NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner,
+        MachineInstr *MI);
+    NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner,
+        MachineBasicBlock *BB);
+    NodeAddr<FuncNode*> newFunc(MachineFunction *MF);
+
+    template <typename Predicate>
+    std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
+    locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+        Predicate P) const;
+
+    typedef std::map<NodeId,RegisterSet> BlockRefsMap;
+
+    void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In);
+    void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM);
+    void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+        NodeAddr<BlockNode*> BA);
+    void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
+        NodeAddr<BlockNode*> BA);
+    void removeUnusedPhis();
+
+    template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
+        NodeAddr<T> TA, DefStack &DS);
+    void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
+    void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
+
+    void unlinkUseDF(NodeAddr<UseNode*> UA);
+    void unlinkDefDF(NodeAddr<DefNode*> DA);
+    void removeFromOwner(NodeAddr<RefNode*> RA) {
+      NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
+      IA.Addr->removeMember(RA, *this);
+    }
+
+    NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) {
+      return BlockNodes[BB];
+    }
+
+    NodeAddr<FuncNode*> Func;
+    NodeAllocator Memory;
+    // Local map:  MachineBasicBlock -> NodeAddr<BlockNode*>
+    std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
+    // Lane mask map.
+    LaneMaskIndex LMI;
+
+    MachineFunction &MF;
+    const TargetInstrInfo &TII;
+    const TargetRegisterInfo &TRI;
+    const MachineDominatorTree &MDT;
+    const MachineDominanceFrontier &MDF;
+    const TargetOperandInfo &TOI;
+  };  // struct DataFlowGraph
+
+  template <typename Predicate>
+  NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P,
+        bool NextOnly, const DataFlowGraph &G) {
+    // Get the "Next" reference in the circular list that references RR and
+    // satisfies predicate "Pred".
+    auto NA = G.addr<NodeBase*>(getNext());
+
+    while (NA.Addr != this) {
+      if (NA.Addr->getType() == NodeAttrs::Ref) {
+        NodeAddr<RefNode*> RA = NA;
+        if (RA.Addr->getRegRef(G) == RR && P(NA))
+          return NA;
+        if (NextOnly)
+          break;
+        NA = G.addr<NodeBase*>(NA.Addr->getNext());
+      } else {
+        // We've hit the beginning of the chain.
+        assert(NA.Addr->getType() == NodeAttrs::Code);
+        NodeAddr<CodeNode*> CA = NA;
+        NA = CA.Addr->getFirstMember(G);
+      }
+    }
+    // Return the equivalent of "nullptr" if such a node was not found.
+    return NodeAddr<RefNode*>();
+  }
+
+  template <typename Predicate>
+  NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const {
+    NodeList MM;
+    auto M = getFirstMember(G);
+    if (M.Id == 0)
+      return MM;
+
+    while (M.Addr != this) {
+      if (P(M))
+        MM.push_back(M);
+      M = G.addr<NodeBase*>(M.Addr->getNext());
+    }
+    return MM;
+  }
+
+
+  // Optionally print the lane mask, if it is not ~0.
+  struct PrintLaneMaskOpt {
+    PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
+    LaneBitmask Mask;
+  };
+  raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
+
+  template <typename T> struct Print;
+  template <typename T>
+  raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P);
+
+  template <typename T>
+  struct Print {
+    Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
+    const T &Obj;
+    const DataFlowGraph &G;
+  };
+
+  template <typename T>
+  struct PrintNode : Print<NodeAddr<T>> {
+    PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
+      : Print<NodeAddr<T>>(x, g) {}
+  };
+} // namespace rdf
+} // namespace llvm
+
+#endif // RDF_GRAPH_H
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
new file mode 100644
index 000000000000..e74c4bfc1645
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
@@ -0,0 +1,1030 @@
+//===--- RDFLiveness.cpp --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Computation of the liveness information from the data-flow graph.
+//
+// The main functionality of this code is to compute block live-in
+// information. With the live-in information in place, the placement
+// of kill flags can also be recalculated.
+//
+// The block live-in calculation is based on the ideas from the following
+// publication:
+//
+// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin.
+// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs."
+// ACM Transactions on Architecture and Code Optimization, Association for
+// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance
+// and Embedded Architectures and Compilers", 8 (4),
+// <10.1145/2086696.2086706>. <hal-00647369>
+//
+#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+using namespace rdf;
+
+namespace llvm {
+namespace rdf {
+  template<>
+  raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
+    OS << '{';
+    for (auto &I : P.Obj) {
+      OS << ' ' << PrintReg(I.first, &P.G.getTRI()) << '{';
+      for (auto J = I.second.begin(), E = I.second.end(); J != E; ) {
+        OS << Print<NodeId>(J->first, P.G) << PrintLaneMaskOpt(J->second);
+        if (++J != E)
+          OS << ',';
+      }
+      OS << '}';
+    }
+    OS << " }";
+    return OS;
+  }
+} // namespace rdf
+} // namespace llvm
+
+// The order in the returned sequence is the order of reaching defs in the
+// upward traversal: the first def is the closest to the given reference RefA,
+// the next one is further up, and so on.
+// The list ends at a reaching phi def, or when the reference from RefA is
+// covered by the defs in the list (see FullChain).
+// This function provides two modes of operation:
+// (1) Returning the sequence of reaching defs for a particular reference
+// node. This sequence will terminate at the first phi node [1].
+// (2) Returning a partial sequence of reaching defs, where the final goal
+// is to traverse past phi nodes to the actual defs arising from the code
+// itself.
+// In mode (2), the register reference for which the search was started
+// may be different from the reference node RefA, for which this call was
+// made, hence the argument RefRR, which holds the original register.
+// Also, some definitions may have already been encountered in a previous
+// call that will influence register covering. The register references
+// already defined are passed in through DefRRs.
+// In mode (1), the "continuation" considerations do not apply, and the
+// RefRR is the same as the register in RefA, and the set DefRRs is empty.
+//
+// [1] It is possible for multiple phi nodes to be included in the returned
+// sequence:
+//   SubA = phi ...
+//   SubB = phi ...
+//   ...  = SuperAB(rdef:SubA), SuperAB"(rdef:SubB)
+// However, these phi nodes are independent from one another in terms of
+// the data-flow.
+
+NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
+      NodeAddr<RefNode*> RefA, bool FullChain, const RegisterAggr &DefRRs) {
+  NodeList RDefs; // Return value.
+  SetVector<NodeId> DefQ;
+  SetVector<NodeId> Owners;
+
+  // Dead defs will be treated as if they were live, since they are actually
+  // on the data-flow path. They cannot be ignored because even though they
+  // do not generate meaningful values, they still modify registers.
+
+  // If the reference is undefined, there is nothing to do.
+  if (RefA.Addr->getFlags() & NodeAttrs::Undef)
+    return RDefs;
+
+  // The initial queue should not have reaching defs for shadows. The
+  // whole point of a shadow is that it will have a reaching def that
+  // is not aliased to the reaching defs of the related shadows.
+  NodeId Start = RefA.Id;
+  auto SNA = DFG.addr<RefNode*>(Start);
+  if (NodeId RD = SNA.Addr->getReachingDef())
+    DefQ.insert(RD);
+
+  // Collect all the reaching defs, going up until a phi node is encountered,
+  // or there are no more reaching defs. From this set, the actual set of
+  // reaching defs will be selected.
+  // The traversal upwards must go on until a covering def is encountered.
+  // It is possible that a collection of non-covering (individually) defs
+  // will be sufficient, but keep going until a covering one is found.
+  for (unsigned i = 0; i < DefQ.size(); ++i) {
+    auto TA = DFG.addr<DefNode*>(DefQ[i]);
+    if (TA.Addr->getFlags() & NodeAttrs::PhiRef)
+      continue;
+    // Stop at the covering/overwriting def of the initial register reference.
+    RegisterRef RR = TA.Addr->getRegRef(DFG);
+    if (!DFG.IsPreservingDef(TA))
+      if (RegisterAggr::isCoverOf(RR, RefRR, TRI))
+        continue;
+    // Get the next level of reaching defs. This will include multiple
+    // reaching defs for shadows.
+    for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA))
+      if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+        DefQ.insert(RD);
+  }
+
+  // Remove all non-phi defs that are not aliased to RefRR, and collect
+  // the owners of the remaining defs.
+  SetVector<NodeId> Defs;
+  for (NodeId N : DefQ) {
+    auto TA = DFG.addr<DefNode*>(N);
+    bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
+    if (!IsPhi && !DFG.alias(RefRR, TA.Addr->getRegRef(DFG)))
+      continue;
+    Defs.insert(TA.Id);
+    Owners.insert(TA.Addr->getOwner(DFG).Id);
+  }
+
+  // Return the MachineBasicBlock containing a given instruction.
+  auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* {
+    if (IA.Addr->getKind() == NodeAttrs::Stmt)
+      return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent();
+    assert(IA.Addr->getKind() == NodeAttrs::Phi);
+    NodeAddr<PhiNode*> PA = IA;
+    NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG);
+    return BA.Addr->getCode();
+  };
+  // Less(A,B) iff instruction A is further down in the dominator tree than B.
+  auto Less = [&Block,this] (NodeId A, NodeId B) -> bool {
+    if (A == B)
+      return false;
+    auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B);
+    MachineBasicBlock *BA = Block(OA), *BB = Block(OB);
+    if (BA != BB)
+      return MDT.dominates(BB, BA);
+    // They are in the same block.
+    bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt;
+    bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt;
+    if (StmtA) {
+      if (!StmtB)   // OB is a phi and phis dominate statements.
+        return true;
+      MachineInstr *CA = NodeAddr<StmtNode*>(OA).Addr->getCode();
+      MachineInstr *CB = NodeAddr<StmtNode*>(OB).Addr->getCode();
+      // The order must be linear, so tie-break such equalities.
+      if (CA == CB)
+        return A < B;
+      return MDT.dominates(CB, CA);
+    } else {
+      // OA is a phi.
+      if (StmtB)
+        return false;
+      // Both are phis. There is no ordering between phis (in terms of
+      // the data-flow), so tie-break this via node id comparison.
+      return A < B;
+    }
+  };
+
+  std::vector<NodeId> Tmp(Owners.begin(), Owners.end());
+  std::sort(Tmp.begin(), Tmp.end(), Less);
+
+  // The vector is a list of instructions, so that defs coming from
+  // the same instruction don't need to be artificially ordered.
+  // Then, when computing the initial segment, and iterating over an
+  // instruction, pick the defs that contribute to the covering (i.e. is
+  // not covered by previously added defs). Check the defs individually,
+  // i.e. first check each def if is covered or not (without adding them
+  // to the tracking set), and then add all the selected ones.
+
+  // The reason for this is this example:
+  // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes).
+  // *d3<C>              If A \incl BuC, and B \incl AuC, then *d2 would be
+  //                     covered if we added A first, and A would be covered
+  //                     if we added B first.
+
+  RegisterAggr RRs(DefRRs);
+
+  auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool {
+    return TA.Addr->getKind() == NodeAttrs::Def &&
+           Defs.count(TA.Id);
+  };
+  for (NodeId T : Tmp) {
+    if (!FullChain && RRs.hasCoverOf(RefRR))
+      break;
+    auto TA = DFG.addr<InstrNode*>(T);
+    bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA);
+    NodeList Ds;
+    for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) {
+      RegisterRef QR = DA.Addr->getRegRef(DFG);
+      // Add phi defs even if they are covered by subsequent defs. This is
+      // for cases where the reached use is not covered by any of the defs
+      // encountered so far: the phi def is needed to expose the liveness
+      // of that use to the entry of the block.
+      // Example:
+      //   phi d1<R3>(,d2,), ...  Phi def d1 is covered by d2.
+      //   d2<R3>(d1,,u3), ...
+      //   ..., u3<D1>(d2)        This use needs to be live on entry.
+      if (FullChain || IsPhi || !RRs.hasCoverOf(QR))
+        Ds.push_back(DA);
+    }
+    RDefs.insert(RDefs.end(), Ds.begin(), Ds.end());
+    for (NodeAddr<DefNode*> DA : Ds) {
+      // When collecting a full chain of definitions, do not consider phi
+      // defs to actually define a register.
+      uint16_t Flags = DA.Addr->getFlags();
+      if (!FullChain || !(Flags & NodeAttrs::PhiRef))
+        if (!(Flags & NodeAttrs::Preserving)) // Don't care about Undef here.
+          RRs.insert(DA.Addr->getRegRef(DFG));
+    }
+  }
+
+  auto DeadP = [](const NodeAddr<DefNode*> DA) -> bool {
+    return DA.Addr->getFlags() & NodeAttrs::Dead;
+  };
+  RDefs.resize(std::distance(RDefs.begin(), remove_if(RDefs, DeadP)));
+
+  return RDefs;
+}
+
+
+NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
+      NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+  // Collect all defined registers. Do not consider phis to be defining
+  // anything, only collect "real" definitions.
+  RegisterAggr DefRRs(TRI);
+  for (NodeId D : Defs) {
+    const auto DA = DFG.addr<const DefNode*>(D);
+    if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+      DefRRs.insert(DA.Addr->getRegRef(DFG));
+  }
+
+  NodeList RDs = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+  if (RDs.empty())
+    return Defs;
+
+  // Make a copy of the preexisting definitions and add the newly found ones.
+  NodeSet TmpDefs = Defs;
+  for (NodeAddr<NodeBase*> R : RDs)
+    TmpDefs.insert(R.Id);
+
+  NodeSet Result = Defs;
+
+  for (NodeAddr<DefNode*> DA : RDs) {
+    Result.insert(DA.Id);
+    if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+      continue;
+    NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG);
+    if (Visited.count(PA.Id))
+      continue;
+    Visited.insert(PA.Id);
+    // Go over all phi uses and get the reaching defs for each use.
+    for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+      const auto &T = getAllReachingDefsRec(RefRR, U, Visited, TmpDefs);
+      Result.insert(T.begin(), T.end());
+    }
+  }
+
+  return Result;
+}
+
+
+NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
+      NodeAddr<DefNode*> DefA, const RegisterAggr &DefRRs) {
+  NodeSet Uses;
+
+  // If the original register is already covered by all the intervening
+  // defs, no more uses can be reached.
+  if (DefRRs.hasCoverOf(RefRR))
+    return Uses;
+
+  // Add all directly reached uses.
+  // If the def is dead, it does not provide a value for any use.
+  bool IsDead = DefA.Addr->getFlags() & NodeAttrs::Dead;
+  NodeId U = !IsDead ? DefA.Addr->getReachedUse() : 0;
+  while (U != 0) {
+    auto UA = DFG.addr<UseNode*>(U);
+    if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
+      RegisterRef UR = UA.Addr->getRegRef(DFG);
+      if (DFG.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+        Uses.insert(U);
+    }
+    U = UA.Addr->getSibling();
+  }
+
+  // Traverse all reached defs. This time dead defs cannot be ignored.
+  for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) {
+    auto DA = DFG.addr<DefNode*>(D);
+    NextD = DA.Addr->getSibling();
+    RegisterRef DR = DA.Addr->getRegRef(DFG);
+    // If this def is already covered, it cannot reach anything new.
+    // Similarly, skip it if it is not aliased to the interesting register.
+    if (DefRRs.hasCoverOf(DR) || !DFG.alias(RefRR, DR))
+      continue;
+    NodeSet T;
+    if (DFG.IsPreservingDef(DA)) {
+      // If it is a preserving def, do not update the set of intervening defs.
+      T = getAllReachedUses(RefRR, DA, DefRRs);
+    } else {
+      RegisterAggr NewDefRRs = DefRRs;
+      NewDefRRs.insert(DR);
+      T = getAllReachedUses(RefRR, DA, NewDefRRs);
+    }
+    Uses.insert(T.begin(), T.end());
+  }
+  return Uses;
+}
+
+
+void Liveness::computePhiInfo() {
+  RealUseMap.clear();
+
+  NodeList Phis;
+  NodeAddr<FuncNode*> FA = DFG.getFunc();
+  NodeList Blocks = FA.Addr->members(DFG);
+  for (NodeAddr<BlockNode*> BA : Blocks) {
+    auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+    Phis.insert(Phis.end(), Ps.begin(), Ps.end());
+  }
+
+  // phi use -> (map: reaching phi -> set of registers defined in between)
+  std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
+  std::vector<NodeId> PhiUQ;  // Work list of phis for upward propagation.
+
+  // Go over all phis.
+  for (NodeAddr<PhiNode*> PhiA : Phis) {
+    // Go over all defs and collect the reached uses that are non-phi uses
+    // (i.e. the "real uses").
+    RefMap &RealUses = RealUseMap[PhiA.Id];
+    NodeList PhiRefs = PhiA.Addr->members(DFG);
+
+    // Have a work queue of defs whose reached uses need to be found.
+    // For each def, add to the queue all reached (non-phi) defs.
+    SetVector<NodeId> DefQ;
+    NodeSet PhiDefs;
+    for (NodeAddr<RefNode*> R : PhiRefs) {
+      if (!DFG.IsRef<NodeAttrs::Def>(R))
+        continue;
+      DefQ.insert(R.Id);
+      PhiDefs.insert(R.Id);
+    }
+
+    // Collect the super-set of all possible reached uses. This set will
+    // contain all uses reached from this phi, either directly from the
+    // phi defs, or (recursively) via non-phi defs reached by the phi defs.
+    // This set of uses will later be trimmed to only contain these uses that
+    // are actually reached by the phi defs.
+    for (unsigned i = 0; i < DefQ.size(); ++i) {
+      NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]);
+      // Visit all reached uses. Phi defs should not really have the "dead"
+      // flag set, but check it anyway for consistency.
+      bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead;
+      NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0;
+      while (UN != 0) {
+        NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
+        uint16_t F = A.Addr->getFlags();
+        if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
+	  RegisterRef R = DFG.normalizeRef(getRestrictedRegRef(A));
+          RealUses[R.Reg].insert({A.Id,R.Mask});
+	}
+        UN = A.Addr->getSibling();
+      }
+      // Visit all reached defs, and add them to the queue. These defs may
+      // override some of the uses collected here, but that will be handled
+      // later.
+      NodeId DN = DA.Addr->getReachedDef();
+      while (DN != 0) {
+        NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN);
+        for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) {
+          uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags();
+          // Must traverse the reached-def chain. Consider:
+          //   def(D0) -> def(R0) -> def(R0) -> use(D0)
+          // The reachable use of D0 passes through a def of R0.
+          if (!(Flags & NodeAttrs::PhiRef))
+            DefQ.insert(T.Id);
+        }
+        DN = A.Addr->getSibling();
+      }
+    }
+    // Filter out these uses that appear to be reachable, but really
+    // are not. For example:
+    //
+    // R1:0 =          d1
+    //      = R1:0     u2     Reached by d1.
+    //   R0 =          d3
+    //      = R1:0     u4     Still reached by d1: indirectly through
+    //                        the def d3.
+    //   R1 =          d5
+    //      = R1:0     u6     Not reached by d1 (covered collectively
+    //                        by d3 and d5), but following reached
+    //                        defs and uses from d1 will lead here.
+    auto InPhiDefs = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool {
+      return PhiDefs.count(DA.Id);
+    };
+    for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
+      // For each reached register UI->first, there is a set UI->second, of
+      // uses of it. For each such use, check if it is reached by this phi,
+      // i.e. check if the set of its reaching uses intersects the set of
+      // this phi's defs.
+      NodeRefSet &Uses = UI->second;
+      for (auto I = Uses.begin(), E = Uses.end(); I != E; ) {
+        auto UA = DFG.addr<UseNode*>(I->first);
+        // Undef flag is checked above.
+        assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
+	RegisterRef R(UI->first, I->second);
+        NodeList RDs = getAllReachingDefs(R, UA);
+        if (any_of(RDs, InPhiDefs))
+          ++I;
+        else
+          I = Uses.erase(I);
+      }
+      if (Uses.empty())
+        UI = RealUses.erase(UI);
+      else
+        ++UI;
+    }
+
+    // If this phi reaches some "real" uses, add it to the queue for upward
+    // propagation.
+    if (!RealUses.empty())
+      PhiUQ.push_back(PhiA.Id);
+
+    // Go over all phi uses and check if the reaching def is another phi.
+    // Collect the phis that are among the reaching defs of these uses.
+    // While traversing the list of reaching defs for each phi use, accumulate
+    // the set of registers defined between this phi (PhiA) and the owner phi
+    // of the reaching def.
+    NodeSet SeenUses;
+
+    for (auto I : PhiRefs) {
+      if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
+        continue;
+      NodeAddr<UseNode*> UA = I;
+
+      // Given a phi use UA, traverse all related phi uses (including UA).
+      // The related phi uses may reach different phi nodes or may reach the
+      // same phi node. If multiple uses reach the same phi P, the intervening
+      // defs must be accumulated for all such uses. To group all such uses
+      // into one set, map their node ids to the first use id that reaches P.
+      std::map<NodeId,NodeId> FirstUse; // Phi reached up -> first phi use.
+
+      for (NodeAddr<UseNode*> VA : DFG.getRelatedRefs(PhiA, UA)) {
+        SeenUses.insert(VA.Id);
+        RegisterAggr DefRRs(TRI);
+        for (NodeAddr<DefNode*> DA : getAllReachingDefs(VA)) {
+          if (DA.Addr->getFlags() & NodeAttrs::PhiRef) {
+            NodeId RP = DA.Addr->getOwner(DFG).Id;
+            NodeId FU = FirstUse.insert({RP,VA.Id}).first->second;
+            std::map<NodeId,RegisterAggr> &M = PhiUp[FU];
+            auto F = M.find(RP);
+            if (F == M.end())
+              M.insert(std::make_pair(RP, DefRRs));
+            else
+              F->second.insert(DefRRs);
+          }
+          DefRRs.insert(DA.Addr->getRegRef(DFG));
+        }
+      }
+    }
+  }
+
+  if (Trace) {
+    dbgs() << "Phi-up-to-phi map with intervening defs:\n";
+    for (auto I : PhiUp) {
+      dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {";
+      for (auto R : I.second)
+        dbgs() << ' ' << Print<NodeId>(R.first, DFG)
+               << Print<RegisterAggr>(R.second, DFG);
+      dbgs() << " }\n";
+    }
+  }
+
+  // Propagate the reached registers up in the phi chain.
+  //
+  // The following type of situation needs careful handling:
+  //
+  //   phi d1<R1:0>  (1)
+  //        |
+  //   ... d2<R1>
+  //        |
+  //   phi u3<R1:0>  (2)
+  //        |
+  //   ... u4<R1>
+  //
+  // The phi node (2) defines a register pair R1:0, and reaches a "real"
+  // use u4 of just R1. The same phi node is also known to reach (upwards)
+  // the phi node (1). However, the use u4 is not reached by phi (1),
+  // because of the intervening definition d2 of R1. The data flow between
+  // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+  //
+  // When propagating uses up the phi chains, get the all reaching defs
+  // for a given phi use, and traverse the list until the propagated ref
+  // is covered, or until reaching the final phi. Only assume that the
+  // reference reaches the phi in the latter case.
+
+  for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+    auto PA = DFG.addr<PhiNode*>(PhiUQ[i]);
+    NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG);
+    RefMap &RUM = RealUseMap[PA.Id];
+
+    for (NodeAddr<UseNode*> UA : PUs) {
+      std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
+      RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(UA));
+      for (const std::pair<NodeId,RegisterAggr> &P : PUM) {
+        bool Changed = false;
+        const RegisterAggr &MidDefs = P.second;
+
+        // Collect the set PropUp of uses that are reached by the current
+        // phi PA, and are not covered by any intervening def between the
+        // currently visited use UA and the the upward phi P.
+
+        if (MidDefs.hasCoverOf(UR))
+          continue;
+
+        // General algorithm:
+        //   for each (R,U) : U is use node of R, U is reached by PA
+        //     if MidDefs does not cover (R,U)
+        //       then add (R-MidDefs,U) to RealUseMap[P]
+        //
+        for (const std::pair<RegisterId,NodeRefSet> &T : RUM) {
+          RegisterRef R = DFG.restrictRef(RegisterRef(T.first), UR);
+          if (!R)
+            continue;
+          for (std::pair<NodeId,LaneBitmask> V : T.second) {
+            RegisterRef S = DFG.restrictRef(RegisterRef(R.Reg, V.second), R);
+            if (!S)
+              continue;
+            if (RegisterRef SS = MidDefs.clearIn(S)) {
+              NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
+              Changed |= RS.insert({V.first,SS.Mask}).second;
+            }
+          }
+        }
+
+        if (Changed)
+          PhiUQ.push_back(P.first);
+      }
+    }
+  }
+
+  if (Trace) {
+    dbgs() << "Real use map:\n";
+    for (auto I : RealUseMap) {
+      dbgs() << "phi " << Print<NodeId>(I.first, DFG);
+      NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first);
+      NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG);
+      if (!Ds.empty()) {
+        RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(DFG);
+        dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>';
+      } else {
+        dbgs() << "<noreg>";
+      }
+      dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n';
+    }
+  }
+}
+
+
+void Liveness::computeLiveIns() {
+  // Populate the node-to-block map. This speeds up the calculations
+  // significantly.
+  NBMap.clear();
+  for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) {
+    MachineBasicBlock *BB = BA.Addr->getCode();
+    for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
+      for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
+        NBMap.insert(std::make_pair(RA.Id, BB));
+      NBMap.insert(std::make_pair(IA.Id, BB));
+    }
+  }
+
+  MachineFunction &MF = DFG.getMF();
+
+  // Compute IDF first, then the inverse.
+  decltype(IIDF) IDF;
+  for (MachineBasicBlock &B : MF) {
+    auto F1 = MDF.find(&B);
+    if (F1 == MDF.end())
+      continue;
+    SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end());
+    for (unsigned i = 0; i < IDFB.size(); ++i) {
+      auto F2 = MDF.find(IDFB[i]);
+      if (F2 != MDF.end())
+        IDFB.insert(F2->second.begin(), F2->second.end());
+    }
+    // Add B to the IDF(B). This will put B in the IIDF(B).
+    IDFB.insert(&B);
+    IDF[&B].insert(IDFB.begin(), IDFB.end());
+  }
+
+  for (auto I : IDF)
+    for (auto S : I.second)
+      IIDF[S].insert(I.first);
+
+  computePhiInfo();
+
+  NodeAddr<FuncNode*> FA = DFG.getFunc();
+  NodeList Blocks = FA.Addr->members(DFG);
+
+  // Build the phi live-on-entry map.
+  for (NodeAddr<BlockNode*> BA : Blocks) {
+    MachineBasicBlock *MB = BA.Addr->getCode();
+    RefMap &LON = PhiLON[MB];
+    for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG))
+      for (const RefMap::value_type &S : RealUseMap[P.Id])
+        LON[S.first].insert(S.second.begin(), S.second.end());
+  }
+
+  if (Trace) {
+    dbgs() << "Phi live-on-entry map:\n";
+    for (auto &I : PhiLON)
+      dbgs() << "block #" << I.first->getNumber() << " -> "
+             << Print<RefMap>(I.second, DFG) << '\n';
+  }
+
+  // Build the phi live-on-exit map. Each phi node has some set of reached
+  // "real" uses. Propagate this set backwards into the block predecessors
+  // through the reaching defs of the corresponding phi uses.
+  for (NodeAddr<BlockNode*> BA : Blocks) {
+    NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+    for (NodeAddr<PhiNode*> PA : Phis) {
+      RefMap &RUs = RealUseMap[PA.Id];
+      if (RUs.empty())
+        continue;
+
+      for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+        NodeAddr<PhiUseNode*> PUA = U;
+        if (PUA.Addr->getReachingDef() == 0)
+          continue;
+
+        // Mark all reached "real" uses of P as live on exit in the
+        // predecessor.
+        // Remap all the RUs so that they have a correct reaching def.
+        auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
+        RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
+
+        RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(PUA));
+        for (const std::pair<RegisterId,NodeRefSet> &T : RUs) {
+          // Check if T.first aliases UR?
+          LaneBitmask M;
+          for (std::pair<NodeId,LaneBitmask> P : T.second)
+            M |= P.second;
+
+          RegisterRef S = DFG.restrictRef(RegisterRef(T.first, M), UR);
+          if (!S)
+            continue;
+          for (NodeAddr<DefNode*> D : getAllReachingDefs(S, PUA))
+            LOX[S.Reg].insert({D.Id, S.Mask});
+        }
+      }  // for U : phi uses
+    }  // for P : Phis
+  }  // for B : Blocks
+
+  if (Trace) {
+    dbgs() << "Phi live-on-exit map:\n";
+    for (auto &I : PhiLOX)
+      dbgs() << "block #" << I.first->getNumber() << " -> "
+             << Print<RefMap>(I.second, DFG) << '\n';
+  }
+
+  RefMap LiveIn;
+  traverse(&MF.front(), LiveIn);
+
+  // Add function live-ins to the live-in set of the function entry block.
+  auto &EntryIn = LiveMap[&MF.front()];
+  for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
+    EntryIn.insert(RegisterRef(I->first));
+
+  if (Trace) {
+    // Dump the liveness map
+    for (MachineBasicBlock &B : MF) {
+      std::vector<RegisterRef> LV;
+      for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
+        LV.push_back(RegisterRef(I->PhysReg, I->LaneMask));
+      std::sort(LV.begin(), LV.end());
+      dbgs() << "BB#" << B.getNumber() << "\t rec = {";
+      for (auto I : LV)
+        dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+      dbgs() << " }\n";
+      //dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
+
+      LV.clear();
+      for (std::pair<RegisterId,LaneBitmask> P : LiveMap[&B]) {
+        MCSubRegIndexIterator S(P.first, &TRI);
+        if (!S.isValid()) {
+          LV.push_back(RegisterRef(P.first));
+          continue;
+        }
+        do {
+          LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+          if ((M & P.second).any())
+            LV.push_back(RegisterRef(S.getSubReg()));
+          ++S;
+        } while (S.isValid());
+      }
+      std::sort(LV.begin(), LV.end());
+      dbgs() << "\tcomp = {";
+      for (auto I : LV)
+        dbgs() << ' ' << Print<RegisterRef>(I, DFG);
+      dbgs() << " }\n";
+
+    }
+  }
+}
+
+
+void Liveness::resetLiveIns() {
+  for (auto &B : DFG.getMF()) {
+    // Remove all live-ins.
+    std::vector<unsigned> T;
+    for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I)
+      T.push_back(I->PhysReg);
+    for (auto I : T)
+      B.removeLiveIn(I);
+    // Add the newly computed live-ins.
+    auto &LiveIns = LiveMap[&B];
+    for (auto I : LiveIns) {
+      B.addLiveIn({MCPhysReg(I.first), I.second});
+    }
+  }
+}
+
+
+void Liveness::resetKills() {
+  for (auto &B : DFG.getMF())
+    resetKills(&B);
+}
+
+
+void Liveness::resetKills(MachineBasicBlock *B) {
+  auto CopyLiveIns = [this] (MachineBasicBlock *B, BitVector &LV) -> void {
+    for (auto I : B->liveins()) {
+      MCSubRegIndexIterator S(I.PhysReg, &TRI);
+      if (!S.isValid()) {
+        LV.set(I.PhysReg);
+        continue;
+      }
+      do {
+        LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+        if ((M & I.LaneMask).any())
+          LV.set(S.getSubReg());
+        ++S;
+      } while (S.isValid());
+    }
+  };
+
+  BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs());
+  CopyLiveIns(B, LiveIn);
+  for (auto SI : B->successors())
+    CopyLiveIns(SI, Live);
+
+  for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) {
+    MachineInstr *MI = &*I;
+    if (MI->isDebugValue())
+      continue;
+
+    MI->clearKillInfo();
+    for (auto &Op : MI->operands()) {
+      // An implicit def of a super-register may not necessarily start a
+      // live range of it, since an implicit use could be used to keep parts
+      // of it live. Instead of analyzing the implicit operands, ignore
+      // implicit defs.
+      if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+        continue;
+      unsigned R = Op.getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(R))
+        continue;
+      for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+        Live.reset(*SR);
+    }
+    for (auto &Op : MI->operands()) {
+      if (!Op.isReg() || !Op.isUse())
+        continue;
+      unsigned R = Op.getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(R))
+        continue;
+      bool IsLive = false;
+      for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
+        if (!Live[*AR])
+          continue;
+        IsLive = true;
+        break;
+      }
+      if (IsLive)
+        continue;
+      Op.setIsKill(true);
+      for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
+        Live.set(*SR);
+    }
+  }
+}
+
+
+RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const {
+  assert(DFG.IsRef<NodeAttrs::Use>(RA));
+  if (RA.Addr->getFlags() & NodeAttrs::Shadow) {
+    NodeId RD = RA.Addr->getReachingDef();
+    assert(RD);
+    RA = DFG.addr<DefNode*>(RD);
+  }
+  return RA.Addr->getRegRef(DFG);
+}
+
+
+// Helper function to obtain the basic block containing the reaching def
+// of the given use.
+MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
+  auto F = NBMap.find(RN);
+  if (F != NBMap.end())
+    return F->second;
+  llvm_unreachable("Node id not in map");
+}
+
+
+void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
+  // The LiveIn map, for each (physical) register, contains the set of live
+  // reaching defs of that register that are live on entry to the associated
+  // block.
+
+  // The summary of the traversal algorithm:
+  //
+  // R is live-in in B, if there exists a U(R), such that rdef(R) dom B
+  // and (U \in IDF(B) or B dom U).
+  //
+  // for (C : children) {
+  //   LU = {}
+  //   traverse(C, LU)
+  //   LiveUses += LU
+  // }
+  //
+  // LiveUses -= Defs(B);
+  // LiveUses += UpwardExposedUses(B);
+  // for (C : IIDF[B])
+  //   for (U : LiveUses)
+  //     if (Rdef(U) dom C)
+  //       C.addLiveIn(U)
+  //
+
+  // Go up the dominator tree (depth-first).
+  MachineDomTreeNode *N = MDT.getNode(B);
+  for (auto I : *N) {
+    RefMap L;
+    MachineBasicBlock *SB = I->getBlock();
+    traverse(SB, L);
+
+    for (auto S : L)
+      LiveIn[S.first].insert(S.second.begin(), S.second.end());
+  }
+
+  if (Trace) {
+    dbgs() << "\n-- BB#" << B->getNumber() << ": " << __func__
+           << " after recursion into: {";
+    for (auto I : *N)
+      dbgs() << ' ' << I->getBlock()->getNumber();
+    dbgs() << " }\n";
+    dbgs() << "  LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+    dbgs() << "  Local:  " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+  }
+
+  // Add reaching defs of phi uses that are live on exit from this block.
+  RefMap &PUs = PhiLOX[B];
+  for (auto &S : PUs)
+    LiveIn[S.first].insert(S.second.begin(), S.second.end());
+
+  if (Trace) {
+    dbgs() << "after LOX\n";
+    dbgs() << "  LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+    dbgs() << "  Local:  " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+  }
+
+  // The LiveIn map at this point has all defs that are live-on-exit from B,
+  // as if they were live-on-entry to B. First, we need to filter out all
+  // defs that are present in this block. Then we will add reaching defs of
+  // all upward-exposed uses.
+
+  // To filter out the defs, first make a copy of LiveIn, and then re-populate
+  // LiveIn with the defs that should remain.
+  RefMap LiveInCopy = LiveIn;
+  LiveIn.clear();
+
+  for (const std::pair<RegisterId,NodeRefSet> &LE : LiveInCopy) {
+    RegisterRef LRef(LE.first);
+    NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled.
+    const NodeRefSet &OldDefs = LE.second;
+    for (NodeRef OR : OldDefs) {
+      // R is a def node that was live-on-exit
+      auto DA = DFG.addr<DefNode*>(OR.first);
+      NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG);
+      NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+      if (B != BA.Addr->getCode()) {
+        // Defs from a different block need to be preserved. Defs from this
+        // block will need to be processed further, except for phi defs, the
+        // liveness of which is handled through the PhiLON/PhiLOX maps.
+        NewDefs.insert(OR);
+        continue;
+      }
+
+      // Defs from this block need to stop the liveness from being
+      // propagated upwards. This only applies to non-preserving defs,
+      // and to the parts of the register actually covered by those defs.
+      // (Note that phi defs should always be preserving.)
+      RegisterAggr RRs(TRI);
+      LRef.Mask = OR.second;
+
+      if (!DFG.IsPreservingDef(DA)) {
+        assert(!(IA.Addr->getFlags() & NodeAttrs::Phi));
+        // DA is a non-phi def that is live-on-exit from this block, and
+        // that is also located in this block. LRef is a register ref
+        // whose use this def reaches. If DA covers LRef, then no part
+        // of LRef is exposed upwards.A
+        if (RRs.insert(DA.Addr->getRegRef(DFG)).hasCoverOf(LRef))
+          continue;
+      }
+
+      // DA itself was not sufficient to cover LRef. In general, it is
+      // the last in a chain of aliased defs before the exit from this block.
+      // There could be other defs in this block that are a part of that
+      // chain. Check that now: accumulate the registers from these defs,
+      // and if they all together cover LRef, it is not live-on-entry.
+      for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) {
+        // DefNode -> InstrNode -> BlockNode.
+        NodeAddr<InstrNode*> ITA = TA.Addr->getOwner(DFG);
+        NodeAddr<BlockNode*> BTA = ITA.Addr->getOwner(DFG);
+        // Reaching defs are ordered in the upward direction.
+        if (BTA.Addr->getCode() != B) {
+          // We have reached past the beginning of B, and the accumulated
+          // registers are not covering LRef. The first def from the
+          // upward chain will be live.
+          // Subtract all accumulated defs (RRs) from LRef.
+          RegisterAggr L(TRI);
+          L.insert(LRef).clear(RRs);
+          assert(!L.empty());
+          NewDefs.insert({TA.Id,L.begin()->second});
+          break;
+        }
+
+        // TA is in B. Only add this def to the accumulated cover if it is
+        // not preserving.
+        if (!(TA.Addr->getFlags() & NodeAttrs::Preserving))
+          RRs.insert(TA.Addr->getRegRef(DFG));
+        // If this is enough to cover LRef, then stop.
+        if (RRs.hasCoverOf(LRef))
+          break;
+      }
+    }
+  }
+
+  emptify(LiveIn);
+
+  if (Trace) {
+    dbgs() << "after defs in block\n";
+    dbgs() << "  LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+    dbgs() << "  Local:  " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+  }
+
+  // Scan the block for upward-exposed uses and add them to the tracking set.
+  for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) {
+    NodeAddr<InstrNode*> IA = I;
+    if (IA.Addr->getKind() != NodeAttrs::Stmt)
+      continue;
+    for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
+      if (UA.Addr->getFlags() & NodeAttrs::Undef)
+        continue;
+      RegisterRef RR = DFG.normalizeRef(UA.Addr->getRegRef(DFG));
+      for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
+        if (getBlockWithRef(D.Id) != B)
+          LiveIn[RR.Reg].insert({D.Id,RR.Mask});
+    }
+  }
+
+  if (Trace) {
+    dbgs() << "after uses in block\n";
+    dbgs() << "  LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+    dbgs() << "  Local:  " << Print<RegisterAggr>(LiveMap[B], DFG) << '\n';
+  }
+
+  // Phi uses should not be propagated up the dominator tree, since they
+  // are not dominated by their corresponding reaching defs.
+  RegisterAggr &Local = LiveMap[B];
+  RefMap &LON = PhiLON[B];
+  for (auto &R : LON) {
+    LaneBitmask M;
+    for (auto P : R.second)
+      M |= P.second;
+    Local.insert(RegisterRef(R.first,M));
+  }
+
+  if (Trace) {
+    dbgs() << "after phi uses in block\n";
+    dbgs() << "  LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
+    dbgs() << "  Local:  " << Print<RegisterAggr>(Local, DFG) << '\n';
+  }
+
+  for (auto C : IIDF[B]) {
+    RegisterAggr &LiveC = LiveMap[C];
+    for (const std::pair<RegisterId,NodeRefSet> &S : LiveIn)
+      for (auto R : S.second)
+        if (MDT.properlyDominates(getBlockWithRef(R.first), C))
+          LiveC.insert(RegisterRef(S.first, R.second));
+  }
+}
+
+
+void Liveness::emptify(RefMap &M) {
+  for (auto I = M.begin(), E = M.end(); I != E; )
+    I = I->second.empty() ? M.erase(I) : std::next(I);
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
new file mode 100644
index 000000000000..c88396f36bbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
@@ -0,0 +1,134 @@
+//===--- RDFLiveness.h ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Recalculate the liveness information given a data flow graph.
+// This includes block live-ins and kill flags.
+
+#ifndef RDF_LIVENESS_H
+#define RDF_LIVENESS_H
+
+#include "RDFGraph.h"
+#include "llvm/ADT/DenseMap.h"
+#include <map>
+
+using namespace llvm;
+
+namespace llvm {
+  class MachineBasicBlock;
+  class MachineFunction;
+  class MachineRegisterInfo;
+  class TargetRegisterInfo;
+  class MachineDominatorTree;
+  class MachineDominanceFrontier;
+
+namespace rdf {
+  struct Liveness {
+  public:
+    // This is really a std::map, except that it provides a non-trivial
+    // default constructor to the element accessed via [].
+    struct LiveMapType {
+      LiveMapType(const TargetRegisterInfo &tri) : Empty(tri) {}
+
+      RegisterAggr &operator[] (MachineBasicBlock *B) {
+        return Map.emplace(B, Empty).first->second;
+      }
+    private:
+      RegisterAggr Empty;
+      std::map<MachineBasicBlock*,RegisterAggr> Map;
+    };
+
+    typedef std::pair<NodeId,LaneBitmask> NodeRef;
+    typedef std::set<NodeRef> NodeRefSet;
+    // RegisterId in RefMap must be normalized.
+    typedef std::map<RegisterId,NodeRefSet> RefMap;
+
+    Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
+      : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()),
+        MRI(mri), LiveMap(g.getTRI()), Empty(), NoRegs(g.getTRI()),
+        Trace(false) {}
+
+    NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+        bool FullChain, const RegisterAggr &DefRRs);
+    NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
+      return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false, NoRegs);
+    }
+    NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
+      return getAllReachingDefs(RefRR, RefA, false, NoRegs);
+    }
+    NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+        NodeSet &Visited, const NodeSet &Defs);
+    NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
+        const RegisterAggr &DefRRs);
+    NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
+      return getAllReachedUses(RefRR, DefA, NoRegs);
+    }
+
+    LiveMapType &getLiveMap() { return LiveMap; }
+    const LiveMapType &getLiveMap() const { return LiveMap; }
+    const RefMap &getRealUses(NodeId P) const {
+      auto F = RealUseMap.find(P);
+      return F == RealUseMap.end() ? Empty : F->second;
+    }
+
+    void computePhiInfo();
+    void computeLiveIns();
+    void resetLiveIns();
+    void resetKills();
+    void resetKills(MachineBasicBlock *B);
+
+    void trace(bool T) { Trace = T; }
+
+  private:
+    const DataFlowGraph &DFG;
+    const TargetRegisterInfo &TRI;
+    const MachineDominatorTree &MDT;
+    const MachineDominanceFrontier &MDF;
+    MachineRegisterInfo &MRI;
+    LiveMapType LiveMap;
+    const RefMap Empty;
+    const RegisterAggr NoRegs;
+    bool Trace;
+
+    // Cache of mapping from node ids (for RefNodes) to the containing
+    // basic blocks. Not computing it each time for each node reduces
+    // the liveness calculation time by a large fraction.
+    typedef DenseMap<NodeId,MachineBasicBlock*> NodeBlockMap;
+    NodeBlockMap NBMap;
+
+    // Phi information:
+    //
+    // RealUseMap
+    // map: NodeId -> (map: RegisterId -> NodeRefSet)
+    //      phi id -> (map: register -> set of reached non-phi uses)
+    std::map<NodeId, RefMap> RealUseMap;
+
+    // Inverse iterated dominance frontier.
+    std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF;
+
+    // Live on entry.
+    std::map<MachineBasicBlock*,RefMap> PhiLON;
+
+    // Phi uses are considered to be located at the end of the block that
+    // they are associated with. The reaching def of a phi use dominates the
+    // block that the use corresponds to, but not the block that contains
+    // the phi itself. To include these uses in the liveness propagation (up
+    // the dominator tree), create a map: block -> set of uses live on exit.
+    std::map<MachineBasicBlock*,RefMap> PhiLOX;
+
+    bool isRestrictedToRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
+        RegisterRef RR) const;
+    RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const;
+    MachineBasicBlock *getBlockWithRef(NodeId RN) const;
+    void traverse(MachineBasicBlock *B, RefMap &LiveIn);
+    void emptify(RefMap &M);
+  };
+} // namespace rdf
+} // namespace llvm
+
+#endif // RDF_LIVENESS_H
diff --git a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
new file mode 100644
index 000000000000..0554646bb6be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheHexagonTarget() {
+  static Target TheHexagonTarget;
+  return TheHexagonTarget;
+}
+
+extern "C" void LLVMInitializeHexagonTargetInfo() {
+  RegisterTarget<Triple::hexagon, /*HasJIT=*/false> X(getTheHexagonTarget(),
+                                                      "hexagon", "Hexagon");
+}
diff --git a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
new file mode 100644
index 000000000000..903f92a04431
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -0,0 +1,1213 @@
+//===-- LanaiAsmParser.cpp - Parse Lanai assembly to MCInst instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+namespace {
+struct LanaiOperand;
+
+class LanaiAsmParser : public MCTargetAsmParser {
+  // Parse operands
+  std::unique_ptr<LanaiOperand> parseRegister();
+
+  std::unique_ptr<LanaiOperand> parseImmediate();
+
+  std::unique_ptr<LanaiOperand> parseIdentifier();
+
+  unsigned parseAluOperator(bool PreOp, bool PostOp);
+
+  // Split the mnemonic stripping conditional code and quantifiers
+  StringRef splitMnemonic(StringRef Name, SMLoc NameLoc,
+                          OperandVector *Operands);
+
+  bool parsePrePost(StringRef Type, int *OffsetValue);
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseRegister(unsigned &RegNum, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+// Auto-generated instruction matching functions
+#define GET_ASSEMBLER_HEADER
+#include "LanaiGenAsmMatcher.inc"
+
+  OperandMatchResultTy parseOperand(OperandVector *Operands,
+                                    StringRef Mnemonic);
+
+  OperandMatchResultTy parseMemoryOperand(OperandVector &Operands);
+
+public:
+  LanaiAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                 const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI), Parser(Parser),
+        Lexer(Parser.getLexer()), SubtargetInfo(STI) {
+    setAvailableFeatures(
+        ComputeAvailableFeatures(SubtargetInfo.getFeatureBits()));
+  }
+
+private:
+  MCAsmParser &Parser;
+  MCAsmLexer &Lexer;
+
+  const MCSubtargetInfo &SubtargetInfo;
+};
+
+// Auto-generated by TableGen
+static unsigned MatchRegisterName(llvm::StringRef Name);
+
+// LanaiOperand - Instances of this class represented a parsed machine
+// instruction
+struct LanaiOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    TOKEN,
+    REGISTER,
+    IMMEDIATE,
+    MEMORY_IMM,
+    MEMORY_REG_IMM,
+    MEMORY_REG_REG,
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ImmOp {
+    const MCExpr *Value;
+  };
+
+  struct MemOp {
+    unsigned BaseReg;
+    unsigned OffsetReg;
+    unsigned AluOp;
+    const MCExpr *Offset;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  explicit LanaiOperand(KindTy Kind) : MCParsedAsmOperand(), Kind(Kind) {}
+
+public:
+  // The functions below are used by the autogenerated ASM matcher and hence to
+  // be of the form expected.
+
+  // getStartLoc - Gets location of the first token of this operand
+  SMLoc getStartLoc() const override { return StartLoc; }
+
+  // getEndLoc - Gets location of the last token of this operand
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  unsigned getReg() const override {
+    assert(isReg() && "Invalid type access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(isImm() && "Invalid type access!");
+    return Imm.Value;
+  }
+
+  StringRef getToken() const {
+    assert(isToken() && "Invalid type access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getMemBaseReg() const {
+    assert(isMem() && "Invalid type access!");
+    return Mem.BaseReg;
+  }
+
+  unsigned getMemOffsetReg() const {
+    assert(isMem() && "Invalid type access!");
+    return Mem.OffsetReg;
+  }
+
+  const MCExpr *getMemOffset() const {
+    assert(isMem() && "Invalid type access!");
+    return Mem.Offset;
+  }
+
+  unsigned getMemOp() const {
+    assert(isMem() && "Invalid type access!");
+    return Mem.AluOp;
+  }
+
+  // Functions for testing operand type
+  bool isReg() const override { return Kind == REGISTER; }
+
+  bool isImm() const override { return Kind == IMMEDIATE; }
+
+  bool isMem() const override {
+    return isMemImm() || isMemRegImm() || isMemRegReg();
+  }
+
+  bool isMemImm() const { return Kind == MEMORY_IMM; }
+
+  bool isMemRegImm() const { return Kind == MEMORY_REG_IMM; }
+
+  bool isMemRegReg() const { return Kind == MEMORY_REG_REG; }
+
+  bool isMemSpls() const { return isMemRegImm() || isMemRegReg(); }
+
+  bool isToken() const override { return Kind == TOKEN; }
+
+  bool isBrImm() {
+    if (!isImm())
+      return false;
+
+    // Constant case
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (!MCE)
+      return true;
+    int64_t Value = MCE->getValue();
+    // Check if value fits in 25 bits with 2 least significant bits 0.
+    return isShiftedUInt<23, 2>(static_cast<int32_t>(Value));
+  }
+
+  bool isBrTarget() { return isBrImm() || isToken(); }
+
+  bool isCallTarget() { return isImm() || isToken(); }
+
+  bool isHiImm16() {
+    if (!isImm())
+      return false;
+
+    // Constant case
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+      int64_t Value = ConstExpr->getValue();
+      return Value != 0 && isShiftedUInt<16, 16>(Value);
+    }
+
+    // Symbolic reference expression
+    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+      return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+    // Binary expression
+    if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+      if (const LanaiMCExpr *SymbolRefExpr =
+              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI;
+
+    return false;
+  }
+
+  bool isHiImm16And() {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (ConstExpr) {
+      int64_t Value = ConstExpr->getValue();
+      // Check if in the form 0xXYZWffff
+      return (Value != 0) && ((Value & ~0xffff0000) == 0xffff);
+    }
+    return false;
+  }
+
+  bool isLoImm16() {
+    if (!isImm())
+      return false;
+
+    // Constant case
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+      int64_t Value = ConstExpr->getValue();
+      // Check if value fits in 16 bits
+      return isUInt<16>(static_cast<int32_t>(Value));
+    }
+
+    // Symbolic reference expression
+    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+      return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+    // Binary expression
+    if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+      if (const LanaiMCExpr *SymbolRefExpr =
+              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+    return false;
+  }
+
+  bool isLoImm16Signed() {
+    if (!isImm())
+      return false;
+
+    // Constant case
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+      int64_t Value = ConstExpr->getValue();
+      // Check if value fits in 16 bits or value of the form 0xffffxyzw
+      return isInt<16>(static_cast<int32_t>(Value));
+    }
+
+    // Symbolic reference expression
+    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+      return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+    // Binary expression
+    if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
+      if (const LanaiMCExpr *SymbolRefExpr =
+              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO;
+
+    return false;
+  }
+
+  bool isLoImm16And() {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (ConstExpr) {
+      int64_t Value = ConstExpr->getValue();
+      // Check if in the form 0xffffXYZW
+      return ((Value & ~0xffff) == 0xffff0000);
+    }
+    return false;
+  }
+
+  bool isImmShift() {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (!ConstExpr)
+      return false;
+    int64_t Value = ConstExpr->getValue();
+    return (Value >= -31) && (Value <= 31);
+  }
+
+  bool isLoImm21() {
+    if (!isImm())
+      return false;
+
+    // Constant case
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value)) {
+      int64_t Value = ConstExpr->getValue();
+      return isUInt<21>(Value);
+    }
+
+    // Symbolic reference expression
+    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
+      return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+    if (const MCSymbolRefExpr *SymbolRefExpr =
+            dyn_cast<MCSymbolRefExpr>(Imm.Value)) {
+      return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+    }
+
+    // Binary expression
+    if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value)) {
+      if (const LanaiMCExpr *SymbolRefExpr =
+              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+      if (const MCSymbolRefExpr *SymbolRefExpr =
+              dyn_cast<MCSymbolRefExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
+    }
+
+    return false;
+  }
+
+  bool isImm10() {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (!ConstExpr)
+      return false;
+    int64_t Value = ConstExpr->getValue();
+    return isInt<10>(Value);
+  }
+
+  bool isCondCode() {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Value);
+    if (!ConstExpr)
+      return false;
+    uint64_t Value = ConstExpr->getValue();
+    // The condition codes are between 0 (ICC_T) and 15 (ICC_LE). If the
+    // unsigned value of the immediate is less than LPCC::UNKNOWN (16) then
+    // value corresponds to a valid condition code.
+    return Value < LPCC::UNKNOWN;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates where possible. Null MCExpr = 0
+    if (Expr == nullptr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(
+          MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addBrTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addCallTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addMemImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getMemOffset();
+    addExpr(Inst, Expr);
+  }
+
+  void addMemRegImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    const MCExpr *Expr = getMemOffset();
+    addExpr(Inst, Expr);
+    Inst.addOperand(MCOperand::createImm(getMemOp()));
+  }
+
+  void addMemRegRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    assert(getMemOffsetReg() != 0 && "Invalid offset");
+    Inst.addOperand(MCOperand::createReg(getMemOffsetReg()));
+    Inst.addOperand(MCOperand::createImm(getMemOp()));
+  }
+
+  void addMemSplsOperands(MCInst &Inst, unsigned N) const {
+    if (isMemRegImm())
+      addMemRegImmOperands(Inst, N);
+    if (isMemRegReg())
+      addMemRegRegOperands(Inst, N);
+  }
+
+  void addImmShiftOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addImm10Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addLoImm16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+      Inst.addOperand(
+          MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
+    else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+      assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+      const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+      assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+             dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+                 LanaiMCExpr::VK_Lanai_ABS_LO);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else
+      assert(false && "Operand type not supported.");
+  }
+
+  void addLoImm16AndOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+      Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0xffff));
+    else
+      assert(false && "Operand type not supported.");
+  }
+
+  void addHiImm16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+      Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
+    else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+      assert(SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+      const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+      assert(dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()) &&
+             dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS())->getKind() ==
+                 LanaiMCExpr::VK_Lanai_ABS_HI);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else
+      assert(false && "Operand type not supported.");
+  }
+
+  void addHiImm16AndOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+      Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
+    else
+      assert(false && "Operand type not supported.");
+  }
+
+  void addLoImm21Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
+      Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0x1fffff));
+    else if (isa<LanaiMCExpr>(getImm())) {
+#ifndef NDEBUG
+      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
+      assert(SymbolRefExpr &&
+             SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else if (isa<MCSymbolRefExpr>(getImm())) {
+#ifndef NDEBUG
+      const MCSymbolRefExpr *SymbolRefExpr =
+          dyn_cast<MCSymbolRefExpr>(getImm());
+      assert(SymbolRefExpr &&
+             SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else if (isa<MCBinaryExpr>(getImm())) {
+#ifndef NDEBUG
+      const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
+      const LanaiMCExpr *SymbolRefExpr =
+          dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+      assert(SymbolRefExpr &&
+             SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+#endif
+      Inst.addOperand(MCOperand::createExpr(getImm()));
+    } else
+      assert(false && "Operand type not supported.");
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case IMMEDIATE:
+      OS << "Imm: " << getImm() << "\n";
+      break;
+    case TOKEN:
+      OS << "Token: " << getToken() << "\n";
+      break;
+    case REGISTER:
+      OS << "Reg: %r" << getReg() << "\n";
+      break;
+    case MEMORY_IMM:
+      OS << "MemImm: " << *getMemOffset() << "\n";
+      break;
+    case MEMORY_REG_IMM:
+      OS << "MemRegImm: " << getMemBaseReg() << "+" << *getMemOffset() << "\n";
+      break;
+    case MEMORY_REG_REG:
+      assert(getMemOffset() == nullptr);
+      OS << "MemRegReg: " << getMemBaseReg() << "+"
+         << "%r" << getMemOffsetReg() << "\n";
+      break;
+    }
+  }
+
+  static std::unique_ptr<LanaiOperand> CreateToken(StringRef Str, SMLoc Start) {
+    auto Op = make_unique<LanaiOperand>(TOKEN);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = Start;
+    Op->EndLoc = Start;
+    return Op;
+  }
+
+  static std::unique_ptr<LanaiOperand> createReg(unsigned RegNum, SMLoc Start,
+                                                 SMLoc End) {
+    auto Op = make_unique<LanaiOperand>(REGISTER);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = Start;
+    Op->EndLoc = End;
+    return Op;
+  }
+
+  static std::unique_ptr<LanaiOperand> createImm(const MCExpr *Value,
+                                                 SMLoc Start, SMLoc End) {
+    auto Op = make_unique<LanaiOperand>(IMMEDIATE);
+    Op->Imm.Value = Value;
+    Op->StartLoc = Start;
+    Op->EndLoc = End;
+    return Op;
+  }
+
+  static std::unique_ptr<LanaiOperand>
+  MorphToMemImm(std::unique_ptr<LanaiOperand> Op) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = MEMORY_IMM;
+    Op->Mem.BaseReg = 0;
+    Op->Mem.AluOp = LPAC::ADD;
+    Op->Mem.OffsetReg = 0;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+
+  static std::unique_ptr<LanaiOperand>
+  MorphToMemRegReg(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+                   unsigned AluOp) {
+    unsigned OffsetReg = Op->getReg();
+    Op->Kind = MEMORY_REG_REG;
+    Op->Mem.BaseReg = BaseReg;
+    Op->Mem.AluOp = AluOp;
+    Op->Mem.OffsetReg = OffsetReg;
+    Op->Mem.Offset = nullptr;
+    return Op;
+  }
+
+  static std::unique_ptr<LanaiOperand>
+  MorphToMemRegImm(unsigned BaseReg, std::unique_ptr<LanaiOperand> Op,
+                   unsigned AluOp) {
+    const MCExpr *Imm = Op->getImm();
+    Op->Kind = MEMORY_REG_IMM;
+    Op->Mem.BaseReg = BaseReg;
+    Op->Mem.AluOp = AluOp;
+    Op->Mem.OffsetReg = 0;
+    Op->Mem.Offset = Imm;
+    return Op;
+  }
+};
+
+bool LanaiAsmParser::ParseDirective(AsmToken /*DirectiveId*/) { return true; }
+
+bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
+                                             OperandVector &Operands,
+                                             MCStreamer &Out,
+                                             uint64_t &ErrorInfo,
+                                             bool MatchingInlineAsm) {
+  MCInst Inst;
+  SMLoc ErrorLoc;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  case Match_Success:
+    Out.EmitInstruction(Inst, SubtargetInfo);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature:
+    return Error(IdLoc, "Instruction use requires option to be enabled");
+  case Match_MnemonicFail:
+    return Error(IdLoc, "Unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    ErrorLoc = IdLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IdLoc, "Too few operands for instruction");
+
+      ErrorLoc = ((LanaiOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IdLoc;
+    }
+    return Error(ErrorLoc, "Invalid operand for instruction");
+  }
+  default:
+    break;
+  }
+
+  llvm_unreachable("Unknown match type detected!");
+}
+
+// Both '%rN' and 'rN' are parsed as valid registers. This was done to remain
+// backwards compatible with GCC and the different ways inline assembly is
+// handled.
+// TODO: see if there isn't a better way to do this.
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseRegister() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  unsigned RegNum;
+  // Eat the '%'.
+  if (Lexer.getKind() == AsmToken::Percent)
+    Parser.Lex();
+  if (Lexer.getKind() == AsmToken::Identifier) {
+    RegNum = MatchRegisterName(Lexer.getTok().getIdentifier());
+    if (RegNum == 0)
+      return 0;
+    Parser.Lex(); // Eat identifier token
+    return LanaiOperand::createReg(RegNum, Start, End);
+  }
+  return 0;
+}
+
+bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc,
+                                   SMLoc &EndLoc) {
+  const AsmToken &Tok = getParser().getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  std::unique_ptr<LanaiOperand> Op = parseRegister();
+  if (Op != nullptr)
+    RegNum = Op->getReg();
+  return (Op == nullptr);
+}
+
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *Res, *RHS = 0;
+  LanaiMCExpr::VariantKind Kind = LanaiMCExpr::VK_Lanai_None;
+
+  if (Lexer.getKind() != AsmToken::Identifier)
+    return 0;
+
+  StringRef Identifier;
+  if (Parser.parseIdentifier(Identifier))
+    return 0;
+
+  // Check if identifier has a modifier
+  if (Identifier.equals_lower("hi"))
+    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+  else if (Identifier.equals_lower("lo"))
+    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+
+  // If the identifier corresponds to a variant then extract the real
+  // identifier.
+  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+    if (Lexer.getKind() != AsmToken::LParen) {
+      Error(Lexer.getLoc(), "Expected '('");
+      return 0;
+    }
+    Lexer.Lex(); // lex '('
+
+    // Parse identifier
+    if (Parser.parseIdentifier(Identifier))
+      return 0;
+  }
+
+  // If addition parse the RHS.
+  if (Lexer.getKind() == AsmToken::Plus && Parser.parseExpression(RHS))
+    return 0;
+
+  // For variants parse the final ')'
+  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+    if (Lexer.getKind() != AsmToken::RParen) {
+      Error(Lexer.getLoc(), "Expected ')'");
+      return 0;
+    }
+    Lexer.Lex(); // lex ')'
+  }
+
+  End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
+  Res = LanaiMCExpr::create(Kind, Expr, getContext());
+
+  // Nest if this was an addition
+  if (RHS)
+    Res = MCBinaryExpr::createAdd(Res, RHS, getContext());
+
+  return LanaiOperand::createImm(Res, Start, End);
+}
+
+std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() {
+  SMLoc Start = Parser.getTok().getLoc();
+  SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  const MCExpr *ExprVal;
+  switch (Lexer.getKind()) {
+  case AsmToken::Identifier:
+    return parseIdentifier();
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+    if (!Parser.parseExpression(ExprVal))
+      return LanaiOperand::createImm(ExprVal, Start, End);
+  default:
+    return 0;
+  }
+}
+
+static unsigned AluWithPrePost(unsigned AluCode, bool PreOp, bool PostOp) {
+  if (PreOp)
+    return LPAC::makePreOp(AluCode);
+  if (PostOp)
+    return LPAC::makePostOp(AluCode);
+  return AluCode;
+}
+
+unsigned LanaiAsmParser::parseAluOperator(bool PreOp, bool PostOp) {
+  StringRef IdString;
+  Parser.parseIdentifier(IdString);
+  unsigned AluCode = LPAC::stringToLanaiAluCode(IdString);
+  if (AluCode == LPAC::UNKNOWN) {
+    Error(Parser.getTok().getLoc(), "Can't parse ALU operator");
+    return 0;
+  }
+  return AluCode;
+}
+
+static int SizeForSuffix(StringRef T) {
+  return StringSwitch<int>(T).EndsWith(".h", 2).EndsWith(".b", 1).Default(4);
+}
+
+bool LanaiAsmParser::parsePrePost(StringRef Type, int *OffsetValue) {
+  bool PreOrPost = false;
+  if (Lexer.getKind() == Lexer.peekTok(true).getKind()) {
+    PreOrPost = true;
+    if (Lexer.is(AsmToken::Minus))
+      *OffsetValue = -SizeForSuffix(Type);
+    else if (Lexer.is(AsmToken::Plus))
+      *OffsetValue = SizeForSuffix(Type);
+    else
+      return false;
+
+    // Eat the '-' '-' or '+' '+'
+    Parser.Lex();
+    Parser.Lex();
+  } else if (Lexer.is(AsmToken::Star)) {
+    Parser.Lex(); // Eat the '*'
+    PreOrPost = true;
+  }
+
+  return PreOrPost;
+}
+
+bool shouldBeSls(const LanaiOperand &Op) {
+  // The instruction should be encoded as an SLS if the constant is word
+  // aligned and will fit in 21 bits
+  if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(Op.getImm())) {
+    int64_t Value = ConstExpr->getValue();
+    return (Value % 4 == 0) && (Value >= 0) && (Value <= 0x1fffff);
+  }
+  // The instruction should be encoded as an SLS if the operand is a symbolic
+  // reference with no variant.
+  if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Op.getImm()))
+    return SymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None;
+  // The instruction should be encoded as an SLS if the operand is a binary
+  // expression with the left-hand side being a symbolic reference with no
+  // variant.
+  if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Op.getImm())) {
+    const LanaiMCExpr *LHSSymbolRefExpr =
+        dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+    return (LHSSymbolRefExpr &&
+            LHSSymbolRefExpr->getKind() == LanaiMCExpr::VK_Lanai_None);
+  }
+  return false;
+}
+
+// Matches memory operand. Returns true if error encountered.
+OperandMatchResultTy
+LanaiAsmParser::parseMemoryOperand(OperandVector &Operands) {
+  // Try to match a memory operand.
+  // The memory operands are of the form:
+  //  (1)  Register|Immediate|'' '[' '*'? Register '*'? ']' or
+  //                            ^
+  //  (2)  '[' '*'? Register '*'? AluOperator Register ']'
+  //      ^
+  //  (3)  '[' '--'|'++' Register '--'|'++' ']'
+  //
+  //  (4) '[' Immediate ']' (for SLS)
+
+  // Store the type for use in parsing pre/post increment/decrement operators
+  StringRef Type;
+  if (Operands[0]->isToken())
+    Type = static_cast<LanaiOperand *>(Operands[0].get())->getToken();
+
+  // Use 0 if no offset given
+  int OffsetValue = 0;
+  unsigned BaseReg = 0;
+  unsigned AluOp = LPAC::ADD;
+  bool PostOp = false, PreOp = false;
+
+  // Try to parse the offset
+  std::unique_ptr<LanaiOperand> Op = parseRegister();
+  if (!Op)
+    Op = parseImmediate();
+
+  // Only continue if next token is '['
+  if (Lexer.isNot(AsmToken::LBrac)) {
+    if (!Op)
+      return MatchOperand_NoMatch;
+
+    // The start of this custom parsing overlaps with register/immediate so
+    // consider this as a successful match of an operand of that type as the
+    // token stream can't be rewound to allow them to match separately.
+    Operands.push_back(std::move(Op));
+    return MatchOperand_Success;
+  }
+
+  Parser.Lex(); // Eat the '['.
+  std::unique_ptr<LanaiOperand> Offset = nullptr;
+  if (Op)
+    Offset.swap(Op);
+
+  // Determine if a pre operation
+  PreOp = parsePrePost(Type, &OffsetValue);
+
+  Op = parseRegister();
+  if (!Op) {
+    if (!Offset) {
+      if ((Op = parseImmediate()) && Lexer.is(AsmToken::RBrac)) {
+        Parser.Lex(); // Eat the ']'
+
+        // Memory address operations aligned to word boundary are encoded as
+        // SLS, the rest as RM.
+        if (shouldBeSls(*Op)) {
+          Operands.push_back(LanaiOperand::MorphToMemImm(std::move(Op)));
+        } else {
+          if (!Op->isLoImm16Signed()) {
+            Error(Parser.getTok().getLoc(),
+                  "Memory address is not word "
+                  "aligned and larger than class RM can handle");
+            return MatchOperand_ParseFail;
+          }
+          Operands.push_back(LanaiOperand::MorphToMemRegImm(
+              Lanai::R0, std::move(Op), LPAC::ADD));
+        }
+        return MatchOperand_Success;
+      }
+    }
+
+    Error(Parser.getTok().getLoc(),
+          "Unknown operand, expected register or immediate");
+    return MatchOperand_ParseFail;
+  }
+  BaseReg = Op->getReg();
+
+  // Determine if a post operation
+  if (!PreOp)
+    PostOp = parsePrePost(Type, &OffsetValue);
+
+  // If ] match form (1) else match form (2)
+  if (Lexer.is(AsmToken::RBrac)) {
+    Parser.Lex(); // Eat the ']'.
+    if (!Offset) {
+      SMLoc Start = Parser.getTok().getLoc();
+      SMLoc End =
+          SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      const MCConstantExpr *OffsetConstExpr =
+          MCConstantExpr::create(OffsetValue, getContext());
+      Offset = LanaiOperand::createImm(OffsetConstExpr, Start, End);
+    }
+  } else {
+    if (Offset || OffsetValue != 0) {
+      Error(Parser.getTok().getLoc(), "Expected ']'");
+      return MatchOperand_ParseFail;
+    }
+
+    // Parse operator
+    AluOp = parseAluOperator(PreOp, PostOp);
+
+    // Second form requires offset register
+    Offset = parseRegister();
+    if (!BaseReg || Lexer.isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "Expected ']'");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex(); // Eat the ']'.
+  }
+
+  // First form has addition as operator. Add pre- or post-op indicator as
+  // needed.
+  AluOp = AluWithPrePost(AluOp, PreOp, PostOp);
+
+  // Ensure immediate offset is not too large
+  if (Offset->isImm() && !Offset->isLoImm16Signed()) {
+    Error(Parser.getTok().getLoc(),
+          "Memory address is not word "
+          "aligned and larger than class RM can handle");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      Offset->isImm()
+          ? LanaiOperand::MorphToMemRegImm(BaseReg, std::move(Offset), AluOp)
+          : LanaiOperand::MorphToMemRegReg(BaseReg, std::move(Offset), AluOp));
+
+  return MatchOperand_Success;
+}
+
+// Looks at a token type and creates the relevant operand from this
+// information, adding to operands.
+// If operand was parsed, returns false, else true.
+OperandMatchResultTy
+LanaiAsmParser::parseOperand(OperandVector *Operands, StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy Result = MatchOperandParserImpl(*Operands, Mnemonic);
+
+  if (Result == MatchOperand_Success)
+    return Result;
+  if (Result == MatchOperand_ParseFail) {
+    Parser.eatToEndOfStatement();
+    return Result;
+  }
+
+  // Attempt to parse token as register
+  std::unique_ptr<LanaiOperand> Op = parseRegister();
+
+  // Attempt to parse token as immediate
+  if (!Op)
+    Op = parseImmediate();
+
+  // If the token could not be parsed then fail
+  if (!Op) {
+    Error(Parser.getTok().getLoc(), "Unknown operand");
+    Parser.eatToEndOfStatement();
+    return MatchOperand_ParseFail;
+  }
+
+  // Push back parsed operand into list of operands
+  Operands->push_back(std::move(Op));
+
+  return MatchOperand_Success;
+}
+
+// Split the mnemonic into ASM operand, conditional code and instruction
+// qualifier (half-word, byte).
+StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
+                                        OperandVector *Operands) {
+  size_t Next = Name.find('.');
+
+  StringRef Mnemonic = Name;
+
+  bool IsBRR = false;
+  if (Name.endswith(".r")) {
+    Mnemonic = Name.substr(0, Name.size() - 2);
+    IsBRR = true;
+  }
+
+  // Match b?? and s?? (BR, BRR, and SCC instruction classes).
+  if (Mnemonic[0] == 'b' ||
+      (Mnemonic[0] == 's' && !Mnemonic.startswith("sel") &&
+       !Mnemonic.startswith("st"))) {
+    // Parse instructions with a conditional code. For example, 'bne' is
+    // converted into two operands 'b' and 'ne'.
+    LPCC::CondCode CondCode =
+        LPCC::suffixToLanaiCondCode(Mnemonic.substr(1, Next));
+    if (CondCode != LPCC::UNKNOWN) {
+      Mnemonic = Mnemonic.slice(0, 1);
+      Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+      Operands->push_back(LanaiOperand::createImm(
+          MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+      if (IsBRR) {
+        Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+      }
+      return Mnemonic;
+    }
+  }
+
+  // Parse other instructions with condition codes (RR instructions).
+  // We ignore .f here and assume they are flag-setting operations, not
+  // conditional codes (except for select instructions where flag-setting
+  // variants are not yet implemented).
+  if (Mnemonic.startswith("sel") ||
+      (!Mnemonic.endswith(".f") && !Mnemonic.startswith("st"))) {
+    LPCC::CondCode CondCode = LPCC::suffixToLanaiCondCode(Mnemonic);
+    if (CondCode != LPCC::UNKNOWN) {
+      size_t Next = Mnemonic.rfind('.', Name.size());
+      // 'sel' doesn't use a predicate operand whose printer adds the period,
+      // but instead has the period as part of the identifier (i.e., 'sel.' is
+      // expected by the generated matcher). If the mnemonic starts with 'sel'
+      // then include the period as part of the mnemonic, else don't include it
+      // as part of the mnemonic.
+      if (Mnemonic.startswith("sel")) {
+        Mnemonic = Mnemonic.substr(0, Next + 1);
+      } else {
+        Mnemonic = Mnemonic.substr(0, Next);
+      }
+      Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+      Operands->push_back(LanaiOperand::createImm(
+          MCConstantExpr::create(CondCode, getContext()), NameLoc, NameLoc));
+      return Mnemonic;
+    }
+  }
+
+  Operands->push_back(LanaiOperand::CreateToken(Mnemonic, NameLoc));
+  if (IsBRR) {
+    Operands->push_back(LanaiOperand::CreateToken(".r", NameLoc));
+  }
+
+  return Mnemonic;
+}
+
+bool IsMemoryAssignmentError(const OperandVector &Operands) {
+  // Detects if a memory operation has an erroneous base register modification.
+  // Memory operations are detected by matching the types of operands.
+  //
+  // TODO: This test is focussed on one specific instance (ld/st).
+  // Extend it to handle more cases or be more robust.
+  bool Modifies = false;
+
+  int Offset = 0;
+
+  if (Operands.size() < 5)
+    return false;
+  else if (Operands[0]->isToken() && Operands[1]->isReg() &&
+           Operands[2]->isImm() && Operands[3]->isImm() && Operands[4]->isReg())
+    Offset = 0;
+  else if (Operands[0]->isToken() && Operands[1]->isToken() &&
+           Operands[2]->isReg() && Operands[3]->isImm() &&
+           Operands[4]->isImm() && Operands[5]->isReg())
+    Offset = 1;
+  else
+    return false;
+
+  int PossibleAluOpIdx = Offset + 3;
+  int PossibleBaseIdx = Offset + 1;
+  int PossibleDestIdx = Offset + 4;
+  if (LanaiOperand *PossibleAluOp =
+          static_cast<LanaiOperand *>(Operands[PossibleAluOpIdx].get()))
+    if (PossibleAluOp->isImm())
+      if (const MCConstantExpr *ConstExpr =
+              dyn_cast<MCConstantExpr>(PossibleAluOp->getImm()))
+        Modifies = LPAC::modifiesOp(ConstExpr->getValue());
+  return Modifies && Operands[PossibleBaseIdx]->isReg() &&
+         Operands[PossibleDestIdx]->isReg() &&
+         Operands[PossibleBaseIdx]->getReg() ==
+             Operands[PossibleDestIdx]->getReg();
+}
+
+static bool IsRegister(const MCParsedAsmOperand &op) {
+  return static_cast<const LanaiOperand &>(op).isReg();
+}
+
+static bool MaybePredicatedInst(const OperandVector &Operands) {
+  if (Operands.size() < 4 || !IsRegister(*Operands[1]) ||
+      !IsRegister(*Operands[2]))
+    return false;
+  return StringSwitch<bool>(
+             static_cast<const LanaiOperand &>(*Operands[0]).getToken())
+      .StartsWith("addc", true)
+      .StartsWith("add", true)
+      .StartsWith("and", true)
+      .StartsWith("sh", true)
+      .StartsWith("subb", true)
+      .StartsWith("sub", true)
+      .StartsWith("or", true)
+      .StartsWith("xor", true)
+      .Default(false);
+}
+
+bool LanaiAsmParser::ParseInstruction(ParseInstructionInfo & /*Info*/,
+                                      StringRef Name, SMLoc NameLoc,
+                                      OperandVector &Operands) {
+  // First operand is token for instruction
+  StringRef Mnemonic = splitMnemonic(Name, NameLoc, &Operands);
+
+  // If there are no more operands, then finish
+  if (Lexer.is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse first operand
+  if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+    return true;
+
+  // If it is a st instruction with one 1 operand then it is a "store true".
+  // Transform <"st"> to <"s">, <LPCC:ICC_T>
+  if (Lexer.is(AsmToken::EndOfStatement) && Name == "st" &&
+      Operands.size() == 2) {
+    Operands.erase(Operands.begin(), Operands.begin() + 1);
+    Operands.insert(Operands.begin(), LanaiOperand::CreateToken("s", NameLoc));
+    Operands.insert(Operands.begin() + 1,
+                    LanaiOperand::createImm(
+                        MCConstantExpr::create(LPCC::ICC_T, getContext()),
+                        NameLoc, NameLoc));
+  }
+
+  // If the instruction is a bt instruction with 1 operand (in assembly) then it
+  // is an unconditional branch instruction and the first two elements of
+  // operands need to be merged.
+  if (Lexer.is(AsmToken::EndOfStatement) && Name.startswith("bt") &&
+      Operands.size() == 3) {
+    Operands.erase(Operands.begin(), Operands.begin() + 2);
+    Operands.insert(Operands.begin(), LanaiOperand::CreateToken("bt", NameLoc));
+  }
+
+  // Parse until end of statement, consuming commas between operands
+  while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.is(AsmToken::Comma)) {
+    // Consume comma token
+    Lex();
+
+    // Parse next operand
+    if (parseOperand(&Operands, Mnemonic) != MatchOperand_Success)
+      return true;
+  }
+
+  if (IsMemoryAssignmentError(Operands)) {
+    Error(Parser.getTok().getLoc(),
+          "the destination register can't equal the base register in an "
+          "instruction that modifies the base register.");
+    return true;
+  }
+
+  // Insert always true operand for instruction that may be predicated but
+  // are not. Currently the autogenerated parser always expects a predicate.
+  if (MaybePredicatedInst(Operands)) {
+    Operands.insert(Operands.begin() + 1,
+                    LanaiOperand::createImm(
+                        MCConstantExpr::create(LPCC::ICC_T, getContext()),
+                        NameLoc, NameLoc));
+  }
+
+  return false;
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "LanaiGenAsmMatcher.inc"
+} // namespace
+
+extern "C" void LLVMInitializeLanaiAsmParser() {
+  RegisterMCAsmParser<LanaiAsmParser> x(getTheLanaiTarget());
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
new file mode 100644
index 000000000000..609b650e5d32
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -0,0 +1,240 @@
+//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiDisassembler.h"
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace llvm {
+Target &getTheLanaiTarget();
+}
+
+static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new LanaiDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeLanaiDisassembler() {
+  // Register the disassembler
+  TargetRegistry::RegisterMCDisassembler(getTheLanaiTarget(),
+                                         createLanaiDisassembler);
+}
+
+LanaiDisassembler::LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+
+// Forward declare because the autogenerated code will reference this.
+// Definition is further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeBranch(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder);
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder);
+
+#include "LanaiGenDisassemblerTables.inc"
+
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t &Size,
+                                      uint32_t &Insn) {
+  // We want to read exactly 4 bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as big-endian 32-bit word in the stream.
+  Insn =
+      (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
+
+  return MCDisassembler::Success;
+}
+
+static void PostOperandDecodeAdjust(MCInst &Instr, uint32_t Insn) {
+  unsigned AluOp = LPAC::ADD;
+  // Fix up for pre and post operations.
+  int PqShift = -1;
+  if (isRMOpcode(Instr.getOpcode()))
+    PqShift = 16;
+  else if (isSPLSOpcode(Instr.getOpcode()))
+    PqShift = 10;
+  else if (isRRMOpcode(Instr.getOpcode())) {
+    PqShift = 16;
+    // Determine RRM ALU op.
+    AluOp = (Insn >> 8) & 0x7;
+    if (AluOp == 7)
+      // Handle JJJJJ
+      // 0b10000 or 0b11000
+      AluOp |= 0x20 | (((Insn >> 3) & 0xf) << 1);
+  }
+
+  if (PqShift != -1) {
+    unsigned PQ = (Insn >> PqShift) & 0x3;
+    switch (PQ) {
+    case 0x0:
+      if (Instr.getOperand(2).isReg()) {
+        Instr.getOperand(2).setReg(Lanai::R0);
+      }
+      if (Instr.getOperand(2).isImm())
+        Instr.getOperand(2).setImm(0);
+      break;
+    case 0x1:
+      AluOp = LPAC::makePostOp(AluOp);
+      break;
+    case 0x2:
+      break;
+    case 0x3:
+      AluOp = LPAC::makePreOp(AluOp);
+      break;
+    }
+    Instr.addOperand(MCOperand::createImm(AluOp));
+  }
+}
+
+DecodeStatus LanaiDisassembler::getInstruction(
+    MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream & /*VStream*/, raw_ostream & /*CStream*/) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Bytes, Size, Insn);
+
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Call auto-generated decoder function
+  Result =
+      decodeInstruction(DecoderTableLanai32, Instr, Insn, Address, this, STI);
+
+  if (Result != MCDisassembler::Fail) {
+    PostOperandDecodeAdjust(Instr, Insn);
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+static const unsigned GPRDecoderTable[] = {
+    Lanai::R0,  Lanai::R1,  Lanai::PC,  Lanai::R3,  Lanai::SP,  Lanai::FP,
+    Lanai::R6,  Lanai::R7,  Lanai::RV,  Lanai::R9,  Lanai::RR1, Lanai::RR2,
+    Lanai::R12, Lanai::R13, Lanai::R14, Lanai::RCA, Lanai::R16, Lanai::R17,
+    Lanai::R18, Lanai::R19, Lanai::R20, Lanai::R21, Lanai::R22, Lanai::R23,
+    Lanai::R24, Lanai::R25, Lanai::R26, Lanai::R27, Lanai::R28, Lanai::R29,
+    Lanai::R30, Lanai::R31};
+
+DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                    uint64_t /*Address*/,
+                                    const void * /*Decoder*/) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRiMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  // RI memory values encoded using 23 bits:
+  //   5 bit register, 16 bit constant
+  unsigned Register = (Insn >> 18) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeRrMemoryValue(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  // RR memory values encoded using 20 bits:
+  //   5 bit register, 5 bit register, 2 bit PQ, 3 bit ALU operator, 5 bit JJJJJ
+  unsigned Register = (Insn >> 15) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  Register = (Insn >> 10) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeSplsValue(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  // RI memory values encoded using 17 bits:
+  //   5 bit register, 10 bit constant
+  unsigned Register = (Insn >> 12) & 0x1f;
+  Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
+  unsigned Offset = (Insn & 0x3ff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<10>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+static bool tryAddingSymbolicOperand(int64_t Value, bool IsBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler *>(Decoder);
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, IsBranch, Offset,
+                                       Width);
+}
+
+static DecodeStatus decodeBranch(MCInst &MI, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Insn + Address, false, Address, 2, 23, MI,
+                                Decoder))
+    MI.addOperand(MCOperand::createImm(Insn));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeShiftImm(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned Offset = (Insn & 0xffff);
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePredicateOperand(MCInst &Inst, unsigned Val,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (Val >= LPCC::UNKNOWN)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Val));
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
new file mode 100644
index 000000000000..a317cd88ad63
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h
@@ -0,0 +1,41 @@
+//===- LanaiDisassembler.cpp - Disassembler for Lanai -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Lanai Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+#define LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
+
+#define DEBUG_TYPE "lanai-disassembler"
+
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class raw_ostream;
+
+class LanaiDisassembler : public MCDisassembler {
+public:
+  LanaiDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx);
+
+  ~LanaiDisassembler() override {}
+
+  // getInstruction - See MCDisassembler.
+  MCDisassembler::DecodeStatus
+  getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                 uint64_t Address, raw_ostream &VStream,
+                 raw_ostream &CStream) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_DISASSEMBLER_LANAIDISASSEMBLER_H
diff --git a/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
new file mode 100644
index 000000000000..2fa411fcfd87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.cpp
@@ -0,0 +1,305 @@
+//===-- LanaiInstPrinter.cpp - Convert Lanai MCInst to asm syntax ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "LanaiGenAsmWriter.inc"
+
+void LanaiInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << StringRef(getRegisterName(RegNo)).lower();
+}
+
+bool LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Alias, unsigned OpNo0,
+                                 unsigned OpNo1) {
+  OS << "\t" << Alias << " ";
+  printOperand(MI, OpNo0, OS);
+  OS << ", ";
+  printOperand(MI, OpNo1, OS);
+  return true;
+}
+
+static bool usesGivenOffset(const MCInst *MI, int AddOffset) {
+  unsigned AluCode = MI->getOperand(3).getImm();
+  return LPAC::encodeLanaiAluCode(AluCode) == LPAC::ADD &&
+         (MI->getOperand(2).getImm() == AddOffset ||
+          MI->getOperand(2).getImm() == -AddOffset);
+}
+
+static bool isPreIncrementForm(const MCInst *MI, int AddOffset) {
+  unsigned AluCode = MI->getOperand(3).getImm();
+  return LPAC::isPreOp(AluCode) && usesGivenOffset(MI, AddOffset);
+}
+
+static bool isPostIncrementForm(const MCInst *MI, int AddOffset) {
+  unsigned AluCode = MI->getOperand(3).getImm();
+  return LPAC::isPostOp(AluCode) && usesGivenOffset(MI, AddOffset);
+}
+
+static StringRef decIncOperator(const MCInst *MI) {
+  if (MI->getOperand(2).getImm() < 0)
+    return "--";
+  return "++";
+}
+
+bool LanaiInstPrinter::printMemoryLoadIncrement(const MCInst *MI,
+                                                raw_ostream &OS,
+                                                StringRef Opcode,
+                                                int AddOffset) {
+  if (isPreIncrementForm(MI, AddOffset)) {
+    OS << "\t" << Opcode << "\t[" << decIncOperator(MI) << "%"
+       << getRegisterName(MI->getOperand(1).getReg()) << "], %"
+       << getRegisterName(MI->getOperand(0).getReg());
+    return true;
+  }
+  if (isPostIncrementForm(MI, AddOffset)) {
+    OS << "\t" << Opcode << "\t[%"
+       << getRegisterName(MI->getOperand(1).getReg()) << decIncOperator(MI)
+       << "], %" << getRegisterName(MI->getOperand(0).getReg());
+    return true;
+  }
+  return false;
+}
+
+bool LanaiInstPrinter::printMemoryStoreIncrement(const MCInst *MI,
+                                                 raw_ostream &OS,
+                                                 StringRef Opcode,
+                                                 int AddOffset) {
+  if (isPreIncrementForm(MI, AddOffset)) {
+    OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg())
+       << ", [" << decIncOperator(MI) << "%"
+       << getRegisterName(MI->getOperand(1).getReg()) << "]";
+    return true;
+  }
+  if (isPostIncrementForm(MI, AddOffset)) {
+    OS << "\t" << Opcode << "\t%" << getRegisterName(MI->getOperand(0).getReg())
+       << ", [%" << getRegisterName(MI->getOperand(1).getReg())
+       << decIncOperator(MI) << "]";
+    return true;
+  }
+  return false;
+}
+
+bool LanaiInstPrinter::printAlias(const MCInst *MI, raw_ostream &OS) {
+  switch (MI->getOpcode()) {
+  case Lanai::LDW_RI:
+    // ld 4[*%rN], %rX => ld [++imm], %rX
+    // ld -4[*%rN], %rX => ld [--imm], %rX
+    // ld 4[%rN*], %rX => ld [imm++], %rX
+    // ld -4[%rN*], %rX => ld [imm--], %rX
+    return printMemoryLoadIncrement(MI, OS, "ld", 4);
+  case Lanai::LDHs_RI:
+    return printMemoryLoadIncrement(MI, OS, "ld.h", 2);
+  case Lanai::LDHz_RI:
+    return printMemoryLoadIncrement(MI, OS, "uld.h", 2);
+  case Lanai::LDBs_RI:
+    return printMemoryLoadIncrement(MI, OS, "ld.b", 1);
+  case Lanai::LDBz_RI:
+    return printMemoryLoadIncrement(MI, OS, "uld.b", 1);
+  case Lanai::SW_RI:
+    // st %rX, 4[*%rN] => st %rX, [++imm]
+    // st %rX, -4[*%rN] => st %rX, [--imm]
+    // st %rX, 4[%rN*] => st %rX, [imm++]
+    // st %rX, -4[%rN*] => st %rX, [imm--]
+    return printMemoryStoreIncrement(MI, OS, "st", 4);
+  case Lanai::STH_RI:
+    return printMemoryStoreIncrement(MI, OS, "st.h", 2);
+  case Lanai::STB_RI:
+    return printMemoryStoreIncrement(MI, OS, "st.b", 1);
+  default:
+    return false;
+  }
+}
+
+void LanaiInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annotation,
+                                 const MCSubtargetInfo & /*STI*/) {
+  if (!printAlias(MI, OS) && !printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
+  printAnnotation(OS, Annotation);
+}
+
+void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &OS, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg())
+    OS << "%" << getRegisterName(Op.getReg());
+  else if (Op.isImm())
+    OS << formatHex(Op.getImm());
+  else {
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+  }
+}
+
+void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    OS << '[' << formatHex(Op.getImm()) << ']';
+  } else {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    OS << '[';
+    Op.getExpr()->print(OS, &MAI);
+    OS << ']';
+  }
+}
+
+void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    OS << formatHex(Op.getImm() << 16);
+  } else {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+  }
+}
+
+void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    OS << formatHex((Op.getImm() << 16) | 0xffff);
+  } else {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+  }
+}
+
+void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &OS) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm()) {
+    OS << formatHex(0xffff0000 | Op.getImm());
+  } else {
+    // Symbolic operand will be lowered to immediate value by linker
+    assert(Op.isExpr() && "Expected an expression");
+    Op.getExpr()->print(OS, &MAI);
+  }
+}
+
+static void printMemoryBaseRegister(raw_ostream &OS, const unsigned AluCode,
+                                    const MCOperand &RegOp) {
+  assert(RegOp.isReg() && "Register operand expected");
+  OS << "[";
+  if (LPAC::isPreOp(AluCode))
+    OS << "*";
+  OS << "%" << LanaiInstPrinter::getRegisterName(RegOp.getReg());
+  if (LPAC::isPostOp(AluCode))
+    OS << "*";
+  OS << "]";
+}
+
+template <unsigned SizeInBits>
+static void printMemoryImmediateOffset(const MCAsmInfo &MAI,
+                                       const MCOperand &OffsetOp,
+                                       raw_ostream &OS) {
+  assert((OffsetOp.isImm() || OffsetOp.isExpr()) && "Immediate expected");
+  if (OffsetOp.isImm()) {
+    assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated");
+    OS << OffsetOp.getImm();
+  } else
+    OffsetOp.getExpr()->print(OS, &MAI);
+}
+
+void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo,
+                                         raw_ostream &OS,
+                                         const char * /*Modifier*/) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+  const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+  const unsigned AluCode = AluOp.getImm();
+
+  // Offset
+  printMemoryImmediateOffset<16>(MAI, OffsetOp, OS);
+
+  // Register
+  printMemoryBaseRegister(OS, AluCode, RegOp);
+}
+
+void LanaiInstPrinter::printMemRrOperand(const MCInst *MI, int OpNo,
+                                         raw_ostream &OS,
+                                         const char * /*Modifier*/) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+  const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+  const unsigned AluCode = AluOp.getImm();
+  assert(OffsetOp.isReg() && RegOp.isReg() && "Registers expected.");
+
+  // [ Base OP Offset ]
+  OS << "[";
+  if (LPAC::isPreOp(AluCode))
+    OS << "*";
+  OS << "%" << getRegisterName(RegOp.getReg());
+  if (LPAC::isPostOp(AluCode))
+    OS << "*";
+  OS << " " << LPAC::lanaiAluCodeToString(AluCode) << " ";
+  OS << "%" << getRegisterName(OffsetOp.getReg());
+  OS << "]";
+}
+
+void LanaiInstPrinter::printMemSplsOperand(const MCInst *MI, int OpNo,
+                                           raw_ostream &OS,
+                                           const char * /*Modifier*/) {
+  const MCOperand &RegOp = MI->getOperand(OpNo);
+  const MCOperand &OffsetOp = MI->getOperand(OpNo + 1);
+  const MCOperand &AluOp = MI->getOperand(OpNo + 2);
+  const unsigned AluCode = AluOp.getImm();
+
+  // Offset
+  printMemoryImmediateOffset<10>(MAI, OffsetOp, OS);
+
+  // Register
+  printMemoryBaseRegister(OS, AluCode, RegOp);
+}
+
+void LanaiInstPrinter::printCCOperand(const MCInst *MI, int OpNo,
+                                      raw_ostream &OS) {
+  LPCC::CondCode CC =
+      static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+  // Handle the undefined value here for printing so we don't abort().
+  if (CC >= LPCC::UNKNOWN)
+    OS << "<und>";
+  else
+    OS << lanaiCondCodeToString(CC);
+}
+
+void LanaiInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &OS) {
+  LPCC::CondCode CC =
+      static_cast<LPCC::CondCode>(MI->getOperand(OpNo).getImm());
+  // Handle the undefined value here for printing so we don't abort().
+  if (CC >= LPCC::UNKNOWN)
+    OS << "<und>";
+  else if (CC != LPCC::ICC_T)
+    OS << "." << lanaiCondCodeToString(CC);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
new file mode 100644
index 000000000000..1c9d186ad819
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/InstPrinter/LanaiInstPrinter.h
@@ -0,0 +1,65 @@
+//= LanaiInstPrinter.h - Convert Lanai MCInst to asm syntax -------*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Lanai MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+#define LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+class LanaiInstPrinter : public MCInstPrinter {
+public:
+  LanaiInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = 0);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMemRiOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                         const char *Modifier = 0);
+  void printMemRrOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                         const char *Modifier = 0);
+  void printMemSplsOperand(const MCInst *MI, int OpNo, raw_ostream &O,
+                           const char *Modifier = 0);
+  void printCCOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+  void printAluOperand(const MCInst *MI, int OpNo, raw_ostream &O);
+  void printHi16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHi16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLo16AndImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+
+private:
+  bool printAlias(const MCInst *MI, raw_ostream &Ostream);
+  bool printInst(const MCInst *MI, raw_ostream &Ostream, StringRef Alias,
+                 unsigned OpNo0, unsigned OpnNo1);
+  bool printMemoryLoadIncrement(const MCInst *MI, raw_ostream &Ostream,
+                                StringRef Opcode, int AddOffset);
+  bool printMemoryStoreIncrement(const MCInst *MI, raw_ostream &Ostream,
+                                 StringRef Opcode, int AddOffset);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_INSTPRINTER_LANAIINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/Lanai/Lanai.h b/contrib/llvm/lib/Target/Lanai/Lanai.h
new file mode 100644
index 000000000000..c1fdf793305b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Lanai.h
@@ -0,0 +1,51 @@
+//===-- Lanai.h - Top-level interface for Lanai representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Lanai back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAI_H
+#define LLVM_LIB_TARGET_LANAI_LANAI_H
+
+#include "LanaiAluCode.h"
+#include "LanaiCondCode.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class FunctionPass;
+class LanaiTargetMachine;
+class MachineFunctionPass;
+class TargetMachine;
+class formatted_raw_ostream;
+
+// createLanaiISelDag - This pass converts a legalized DAG into a
+// Lanai-specific DAG, ready for instruction scheduling.
+FunctionPass *createLanaiISelDag(LanaiTargetMachine &TM);
+
+// createLanaiDelaySlotFillerPass - This pass fills delay slots
+// with useful instructions or nop's
+FunctionPass *createLanaiDelaySlotFillerPass(const LanaiTargetMachine &TM);
+
+// createLanaiMemAluCombinerPass - This pass combines loads/stores and
+// arithmetic operations.
+FunctionPass *createLanaiMemAluCombinerPass();
+
+// createLanaiSetflagAluCombinerPass - This pass combines SET_FLAG and ALU
+// operations.
+FunctionPass *createLanaiSetflagAluCombinerPass();
+
+Target &getTheLanaiTarget();
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAI_H
diff --git a/contrib/llvm/lib/Target/Lanai/Lanai.td b/contrib/llvm/lib/Target/Lanai/Lanai.td
new file mode 100644
index 000000000000..73d080457034
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/Lanai.td
@@ -0,0 +1,47 @@
+//===- Lanai.td - Describe the Lanai Target Machine --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "LanaiSchedule.td"
+include "LanaiRegisterInfo.td"
+include "LanaiCallingConv.td"
+include "LanaiInstrInfo.td"
+
+def LanaiInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Lanai processors supported.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"generic", LanaiSchedModel, []>;
+def : ProcessorModel<"v11", LanaiSchedModel, []>;
+
+def LanaiInstPrinter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Lanai : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = LanaiInstrInfo;
+  let AssemblyWriters = [LanaiInstPrinter];
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h b/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h
new file mode 100644
index 000000000000..d5145694fe46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiAluCode.h
@@ -0,0 +1,148 @@
+//===-- LanaiAluCode.h - ALU operator encoding ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The encoding for ALU operators used in RM and RRM operands
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+namespace LPAC {
+enum AluCode {
+  ADD = 0x00,
+  ADDC = 0x01,
+  SUB = 0x02,
+  SUBB = 0x03,
+  AND = 0x04,
+  OR = 0x05,
+  XOR = 0x06,
+  SPECIAL = 0x07,
+
+  // Shift instructions are treated as SPECIAL when encoding the machine
+  // instruction, but kept distinct until lowering. The constant values are
+  // chosen to ease lowering.
+  SHL = 0x17,
+  SRL = 0x27,
+  SRA = 0x37,
+
+  // Indicates an unknown/unsupported operator
+  UNKNOWN = 0xFF,
+};
+
+// Bits indicating post- and pre-operators should be tested and set using Is*
+// and Make* utility functions
+const int Lanai_PRE_OP = 0x40;
+const int Lanai_POST_OP = 0x80;
+
+inline static unsigned encodeLanaiAluCode(unsigned AluOp) {
+  unsigned const OP_ENCODING_MASK = 0x07;
+  return AluOp & OP_ENCODING_MASK;
+}
+
+inline static unsigned getAluOp(unsigned AluOp) {
+  unsigned const ALU_MASK = 0x3F;
+  return AluOp & ALU_MASK;
+}
+
+inline static bool isPreOp(unsigned AluOp) { return AluOp & Lanai_PRE_OP; }
+
+inline static bool isPostOp(unsigned AluOp) { return AluOp & Lanai_POST_OP; }
+
+inline static unsigned makePreOp(unsigned AluOp) {
+  assert(!isPostOp(AluOp) && "Operator can't be a post- and pre-op");
+  return AluOp | Lanai_PRE_OP;
+}
+
+inline static unsigned makePostOp(unsigned AluOp) {
+  assert(!isPreOp(AluOp) && "Operator can't be a post- and pre-op");
+  return AluOp | Lanai_POST_OP;
+}
+
+inline static bool modifiesOp(unsigned AluOp) {
+  return isPreOp(AluOp) | isPostOp(AluOp);
+}
+
+inline static const char *lanaiAluCodeToString(unsigned AluOp) {
+  switch (getAluOp(AluOp)) {
+  case ADD:
+    return "add";
+  case ADDC:
+    return "addc";
+  case SUB:
+    return "sub";
+  case SUBB:
+    return "subb";
+  case AND:
+    return "and";
+  case OR:
+    return "or";
+  case XOR:
+    return "xor";
+  case SHL:
+    return "sh";
+  case SRL:
+    return "sh";
+  case SRA:
+    return "sha";
+  default:
+    llvm_unreachable("Invalid ALU code.");
+  }
+}
+
+inline static AluCode stringToLanaiAluCode(StringRef S) {
+  return StringSwitch<AluCode>(S)
+      .Case("add", ADD)
+      .Case("addc", ADDC)
+      .Case("sub", SUB)
+      .Case("subb", SUBB)
+      .Case("and", AND)
+      .Case("or", OR)
+      .Case("xor", XOR)
+      .Case("sh", SHL)
+      .Case("srl", SRL)
+      .Case("sha", SRA)
+      .Default(UNKNOWN);
+}
+
+inline static AluCode isdToLanaiAluCode(ISD::NodeType Node_type) {
+  switch (Node_type) {
+  case ISD::ADD:
+    return AluCode::ADD;
+  case ISD::ADDE:
+    return AluCode::ADDC;
+  case ISD::SUB:
+    return AluCode::SUB;
+  case ISD::SUBE:
+    return AluCode::SUBB;
+  case ISD::AND:
+    return AluCode::AND;
+  case ISD::OR:
+    return AluCode::OR;
+  case ISD::XOR:
+    return AluCode::XOR;
+  case ISD::SHL:
+    return AluCode::SHL;
+  case ISD::SRL:
+    return AluCode::SRL;
+  case ISD::SRA:
+    return AluCode::SRA;
+  default:
+    return AluCode::UNKNOWN;
+  }
+}
+} // namespace LPAC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIALUCODE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
new file mode 100644
index 000000000000..607b2a97b29f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -0,0 +1,243 @@
+//===-- LanaiAsmPrinter.cpp - Lanai LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the Lanai assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMCInstLower.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "asm-printer"
+
+using namespace llvm;
+
+namespace {
+class LanaiAsmPrinter : public AsmPrinter {
+public:
+  explicit LanaiAsmPrinter(TargetMachine &TM,
+                           std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
+
+  StringRef getPassName() const override { return "Lanai Assembly Printer"; }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  void EmitInstruction(const MachineInstr *MI) override;
+  bool isBlockOnlyReachableByFallthrough(
+      const MachineBasicBlock *MBB) const override;
+
+private:
+  void customEmitInstruction(const MachineInstr *MI);
+  void emitCallInstruction(const MachineInstr *MI);
+};
+} // end of anonymous namespace
+
+void LanaiAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                   raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << LanaiInstPrinter::getRegisterName(MO.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_BlockAddress: {
+    MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+    O << BA->getName();
+    break;
+  }
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+    return;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+}
+
+// PrintAsmOperand - Print out an operand for an inline asm expression.
+bool LanaiAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned /*AsmVariant*/,
+                                      const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1])
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    // The highest-numbered register of a pair.
+    case 'H': {
+      if (OpNo == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNo - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = OpNo + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << LanaiInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+    default:
+      return true; // Unknown modifier.
+    }
+  }
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
+  assert((MI->getOpcode() == Lanai::CALL || MI->getOpcode() == Lanai::CALLR) &&
+         "Unsupported call function");
+
+  LanaiMCInstLower MCInstLowering(OutContext, *this);
+  MCSubtargetInfo STI = getSubtargetInfo();
+  // Insert save rca instruction immediately before the call.
+  // TODO: We should generate a pc-relative mov instruction here instead
+  // of pc + 16 (should be mov .+16 %rca).
+  OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_I_LO)
+                                   .addReg(Lanai::RCA)
+                                   .addReg(Lanai::PC)
+                                   .addImm(16),
+                               STI);
+
+  // Push rca onto the stack.
+  //   st %rca, [--%sp]
+  OutStreamer->EmitInstruction(MCInstBuilder(Lanai::SW_RI)
+                                   .addReg(Lanai::RCA)
+                                   .addReg(Lanai::SP)
+                                   .addImm(-4)
+                                   .addImm(LPAC::makePreOp(LPAC::ADD)),
+                               STI);
+
+  // Lower the call instruction.
+  if (MI->getOpcode() == Lanai::CALL) {
+    MCInst TmpInst;
+    MCInstLowering.Lower(MI, TmpInst);
+    TmpInst.setOpcode(Lanai::BT);
+    OutStreamer->EmitInstruction(TmpInst, STI);
+  } else {
+    OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_R)
+                                     .addReg(Lanai::PC)
+                                     .addReg(MI->getOperand(0).getReg())
+                                     .addReg(Lanai::R0)
+                                     .addImm(LPCC::ICC_T),
+                                 STI);
+  }
+}
+
+void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) {
+  LanaiMCInstLower MCInstLowering(OutContext, *this);
+  MCSubtargetInfo STI = getSubtargetInfo();
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer->EmitInstruction(TmpInst, STI);
+}
+
+void LanaiAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  do {
+    if (I->isCall()) {
+      emitCallInstruction(&*I);
+      continue;
+    }
+
+    customEmitInstruction(&*I);
+  } while ((++I != E) && I->isInsideBundle());
+}
+
+// isBlockOnlyReachableByFallthough - Return true if the basic block has
+// exactly one predecessor and the control transfer mechanism between
+// the predecessor and this block is a fall-through.
+// FIXME: could the overridden cases be handled in AnalyzeBranch?
+bool LanaiAsmPrinter::isBlockOnlyReachableByFallthrough(
+    const MachineBasicBlock *MBB) const {
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *B = Pred->getBasicBlock())
+    if (isa<SwitchInst>(B->getTerminator()))
+      return false;
+
+  // Check default implementation
+  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
+    return false;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) {
+  }
+
+  return !I->isBarrier();
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeLanaiAsmPrinter() {
+  RegisterAsmPrinter<LanaiAsmPrinter> X(getTheLanaiTarget());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td b/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td
new file mode 100644
index 000000000000..056b329c33c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiCallingConv.td
@@ -0,0 +1,50 @@
+//===- LanaiCallingConv.td - Calling Conventions Lanai -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Lanai architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Lanai 32-bit C Calling convention.
+def CC_Lanai32 : CallingConv<[
+  // Promote i8/i16 args to i32
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Put argument in registers if marked 'inreg' and not a vararg call.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32],
+                                   CCAssignToReg<[R6, R7, R18, R19]>>>>,
+
+  // Otherwise they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit Fast Calling convention.
+def CC_Lanai32_Fast : CallingConv<[
+  // Promote i8/i16 args to i32
+  CCIfType<[ i8, i16 ], CCPromoteToType<i32>>,
+
+  // Put arguments in registers.
+  CCIfNotVarArg<CCIfType<[i32], CCAssignToReg<[ R6, R7, R18, R19 ]>>>,
+
+  // Otherwise they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Lanai 32-bit C return-value convention.
+def RetCC_Lanai32 : CallingConv<[
+  // Specify two registers to allow returning 64-bit results that have already
+  // been lowered to 2 32-bit values.
+  CCIfType<[i32], CCAssignToReg<[RV, R9]>>
+]>;
+
+def CSR: CalleeSavedRegs<(add)>;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h b/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h
new file mode 100644
index 000000000000..6c5bdefc83dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiCondCode.h
@@ -0,0 +1,100 @@
+// The encoding used for conditional codes used in BR instructions
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+#define LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
+
+#include "llvm/ADT/StringSwitch.h"
+
+namespace llvm {
+namespace LPCC {
+enum CondCode {
+  ICC_T = 0,   //  true
+  ICC_F = 1,   //  false
+  ICC_HI = 2,  //  high
+  ICC_UGT = 2, //  unsigned greater than
+  ICC_LS = 3,  //  low or same
+  ICC_ULE = 3, //  unsigned less than or equal
+  ICC_CC = 4,  //  carry cleared
+  ICC_ULT = 4, //  unsigned less than
+  ICC_CS = 5,  //  carry set
+  ICC_UGE = 5, //  unsigned greater than or equal
+  ICC_NE = 6,  //  not equal
+  ICC_EQ = 7,  //  equal
+  ICC_VC = 8,  //  oVerflow cleared
+  ICC_VS = 9,  //  oVerflow set
+  ICC_PL = 10, //  plus
+  ICC_MI = 11, //  minus
+  ICC_GE = 12, //  greater than or equal
+  ICC_LT = 13, //  less than
+  ICC_GT = 14, //  greater than
+  ICC_LE = 15, //  less than or equal
+  UNKNOWN
+};
+
+inline static StringRef lanaiCondCodeToString(LPCC::CondCode CC) {
+  switch (CC) {
+  case LPCC::ICC_T:
+    return "t"; // true
+  case LPCC::ICC_F:
+    return "f"; // false
+  case LPCC::ICC_NE:
+    return "ne"; // not equal
+  case LPCC::ICC_EQ:
+    return "eq"; // equal
+  case LPCC::ICC_VC:
+    return "vc"; // oVerflow cleared
+  case LPCC::ICC_VS:
+    return "vs"; // oVerflow set
+  case LPCC::ICC_PL:
+    return "pl"; // plus
+  case LPCC::ICC_MI:
+    return "mi"; // minus
+  case LPCC::ICC_GE:
+    return "ge"; // greater than or equal
+  case LPCC::ICC_LT:
+    return "lt"; // less than
+  case LPCC::ICC_GT:
+    return "gt"; // greater than
+  case LPCC::ICC_LE:
+    return "le"; // less than or equal
+  case LPCC::ICC_UGT:
+    return "ugt"; // high | unsigned greater than
+  case LPCC::ICC_ULE:
+    return "ule"; // low or same | unsigned less or equal
+  case LPCC::ICC_ULT:
+    return "ult"; // carry cleared | unsigned less than
+  case LPCC::ICC_UGE:
+    return "uge"; // carry set | unsigned than or equal
+  default:
+    llvm_unreachable("Invalid cond code");
+  }
+}
+
+inline static CondCode suffixToLanaiCondCode(StringRef S) {
+  return StringSwitch<CondCode>(S)
+      .EndsWith("f", LPCC::ICC_F)
+      .EndsWith("hi", LPCC::ICC_HI)
+      .EndsWith("ugt", LPCC::ICC_UGT)
+      .EndsWith("ls", LPCC::ICC_LS)
+      .EndsWith("ule", LPCC::ICC_ULE)
+      .EndsWith("cc", LPCC::ICC_CC)
+      .EndsWith("ult", LPCC::ICC_ULT)
+      .EndsWith("cs", LPCC::ICC_CS)
+      .EndsWith("uge", LPCC::ICC_UGE)
+      .EndsWith("ne", LPCC::ICC_NE)
+      .EndsWith("eq", LPCC::ICC_EQ)
+      .EndsWith("vc", LPCC::ICC_VC)
+      .EndsWith("vs", LPCC::ICC_VS)
+      .EndsWith("pl", LPCC::ICC_PL)
+      .EndsWith("mi", LPCC::ICC_MI)
+      .EndsWith("ge", LPCC::ICC_GE)
+      .EndsWith("lt", LPCC::ICC_LT)
+      .EndsWith("gt", LPCC::ICC_GT)
+      .EndsWith("le", LPCC::ICC_LE)
+      .EndsWith("t", LPCC::ICC_T) // Has to be after others with suffix t
+      .Default(LPCC::UNKNOWN);
+}
+} // namespace LPCC
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAICONDCODE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
new file mode 100644
index 000000000000..802232b05828
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiDelaySlotFiller.cpp
@@ -0,0 +1,262 @@
+//===-- LanaiDelaySlotFiller.cpp - Lanai delay slot filler ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fills delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool>
+    NopDelaySlotFiller("lanai-nop-delay-filler", cl::init(false),
+                       cl::desc("Fill Lanai delay slots with NOPs."),
+                       cl::Hidden);
+
+namespace {
+struct Filler : public MachineFunctionPass {
+  // Target machine description which we query for reg. names, data
+  // layout, etc.
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineBasicBlock::instr_iterator LastFiller;
+
+  static char ID;
+  explicit Filler() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Lanai Delay Slot Filler"; }
+
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    const LanaiSubtarget &Subtarget = MF.getSubtarget<LanaiSubtarget>();
+    TII = Subtarget.getInstrInfo();
+    TRI = Subtarget.getRegisterInfo();
+
+    bool Changed = false;
+    for (MachineFunction::iterator FI = MF.begin(), FE = MF.end(); FI != FE;
+         ++FI)
+      Changed |= runOnMachineBasicBlock(*FI);
+    return Changed;
+  }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  void insertDefsUses(MachineBasicBlock::instr_iterator MI,
+                      SmallSet<unsigned, 32> &RegDefs,
+                      SmallSet<unsigned, 32> &RegUses);
+
+  bool isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg);
+
+  bool delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+                      bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+                      SmallSet<unsigned, 32> &RegUses);
+
+  bool findDelayInstr(MachineBasicBlock &MBB,
+                      MachineBasicBlock::instr_iterator Slot,
+                      MachineBasicBlock::instr_iterator &Filler);
+};
+char Filler::ID = 0;
+} // end of anonymous namespace
+
+// createLanaiDelaySlotFillerPass - Returns a pass that fills in delay
+// slots in Lanai MachineFunctions
+FunctionPass *
+llvm::createLanaiDelaySlotFillerPass(const LanaiTargetMachine & /*tm*/) {
+  return new Filler();
+}
+
+// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+// There is one or two delay slot per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  LastFiller = MBB.instr_end();
+
+  for (MachineBasicBlock::instr_iterator I = MBB.instr_begin();
+       I != MBB.instr_end(); ++I) {
+    if (I->getDesc().hasDelaySlot()) {
+      MachineBasicBlock::instr_iterator InstrWithSlot = I;
+      MachineBasicBlock::instr_iterator J = I;
+
+      // Treat RET specially as it is only instruction with 2 delay slots
+      // generated while all others generated have 1 delay slot.
+      if (I->getOpcode() == Lanai::RET) {
+        // RET is generated as part of epilogue generation and hence we know
+        // what the two instructions preceding it are and that it is safe to
+        // insert RET above them.
+        MachineBasicBlock::reverse_instr_iterator RI = ++I.getReverse();
+        assert(RI->getOpcode() == Lanai::LDW_RI && RI->getOperand(0).isReg() &&
+               RI->getOperand(0).getReg() == Lanai::FP &&
+               RI->getOperand(1).isReg() &&
+               RI->getOperand(1).getReg() == Lanai::FP &&
+               RI->getOperand(2).isImm() && RI->getOperand(2).getImm() == -8);
+        ++RI;
+        assert(RI->getOpcode() == Lanai::ADD_I_LO &&
+               RI->getOperand(0).isReg() &&
+               RI->getOperand(0).getReg() == Lanai::SP &&
+               RI->getOperand(1).isReg() &&
+               RI->getOperand(1).getReg() == Lanai::FP);
+        MachineBasicBlock::instr_iterator FI = RI.getReverse();
+        MBB.splice(std::next(I), &MBB, FI, I);
+        FilledSlots += 2;
+      } else {
+        if (!NopDelaySlotFiller && findDelayInstr(MBB, I, J)) {
+          MBB.splice(std::next(I), &MBB, J);
+        } else {
+          BuildMI(MBB, std::next(I), DebugLoc(), TII->get(Lanai::NOP));
+        }
+        ++FilledSlots;
+      }
+
+      Changed = true;
+      // Record the filler instruction that filled the delay slot.
+      // The instruction after it will be visited in the next iteration.
+      LastFiller = ++I;
+
+      // Bundle the delay slot filler to InstrWithSlot so that the machine
+      // verifier doesn't expect this instruction to be a terminator.
+      MIBundleBuilder(MBB, InstrWithSlot, std::next(LastFiller));
+    }
+  }
+  return Changed;
+}
+
+bool Filler::findDelayInstr(MachineBasicBlock &MBB,
+                            MachineBasicBlock::instr_iterator Slot,
+                            MachineBasicBlock::instr_iterator &Filler) {
+  SmallSet<unsigned, 32> RegDefs;
+  SmallSet<unsigned, 32> RegUses;
+
+  insertDefsUses(Slot, RegDefs, RegUses);
+
+  bool SawLoad = false;
+  bool SawStore = false;
+
+  for (MachineBasicBlock::reverse_instr_iterator I = ++Slot.getReverse();
+       I != MBB.instr_rend(); ++I) {
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    // Convert to forward iterator.
+    MachineBasicBlock::instr_iterator FI = I.getReverse();
+
+    if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() ||
+        FI == LastFiller || I->isPseudo())
+      break;
+
+    if (delayHasHazard(FI, SawLoad, SawStore, RegDefs, RegUses)) {
+      insertDefsUses(FI, RegDefs, RegUses);
+      continue;
+    }
+    Filler = FI;
+    return true;
+  }
+  return false;
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::instr_iterator MI, bool &SawLoad,
+                            bool &SawStore, SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses) {
+  if (MI->isImplicitDef() || MI->isKill())
+    return true;
+
+  // Loads or stores cannot be moved past a store to the delay slot
+  // and stores cannot be moved past a load.
+  if (MI->mayLoad()) {
+    if (SawStore)
+      return true;
+    SawLoad = true;
+  }
+
+  if (MI->mayStore()) {
+    if (SawStore)
+      return true;
+    SawStore = true;
+    if (SawLoad)
+      return true;
+  }
+
+  assert((!MI->isCall() && !MI->isReturn()) &&
+         "Cannot put calls or returns in delay slot.");
+
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    unsigned Reg;
+
+    if (!MO.isReg() || !(Reg = MO.getReg()))
+      continue; // skip
+
+    if (MO.isDef()) {
+      // check whether Reg is defined or used before delay slot.
+      if (isRegInSet(RegDefs, Reg) || isRegInSet(RegUses, Reg))
+        return true;
+    }
+    if (MO.isUse()) {
+      // check whether Reg is defined before delay slot.
+      if (isRegInSet(RegDefs, Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::instr_iterator MI,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses) {
+  // If MI is a call or return, just examine the explicit non-variadic operands.
+  MCInstrDesc MCID = MI->getDesc();
+  unsigned E = MI->isCall() || MI->isReturn() ? MCID.getNumOperands()
+                                              : MI->getNumOperands();
+  for (unsigned I = 0; I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    unsigned Reg;
+
+    if (!MO.isReg() || !(Reg = MO.getReg()))
+      continue;
+
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    else if (MO.isUse())
+      RegUses.insert(Reg);
+  }
+
+  // Call & return instructions defines SP implicitly. Implicit defines are not
+  // included in the RegDefs set of calls but instructions modifying SP cannot
+  // be inserted in the delay slot of a call/return as these instructions are
+  // expanded to multiple instructions with SP modified before the branch that
+  // has the delay slot.
+  if (MI->isCall() || MI->isReturn())
+    RegDefs.insert(Lanai::SP);
+}
+
+// Returns true if the Reg or its alias is in the RegSet.
+bool Filler::isRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) {
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (RegSet.count(*AI))
+      return true;
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
new file mode 100644
index 000000000000..0723668c743e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -0,0 +1,220 @@
+//===-- LanaiFrameLowering.cpp - Lanai Frame Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFrameLowering.h"
+
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+// Determines the size of the frame and maximum call frame size.
+void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const LanaiRegisterInfo *LRI = STI.getRegisterInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned FrameSize = MFI.getStackSize();
+
+  // Get the alignment.
+  unsigned StackAlign = LRI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
+                                                       : getStackAlignment();
+
+  // Get the maximum call frame size of all the calls.
+  unsigned MaxCallFrameSize = MFI.getMaxCallFrameSize();
+
+  // If we have dynamic alloca then MaxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI.hasVarSizedObjects())
+    MaxCallFrameSize = alignTo(MaxCallFrameSize, StackAlign);
+
+  // Update maximum call frame size.
+  MFI.setMaxCallFrameSize(MaxCallFrameSize);
+
+  // Include call frame size in total.
+  if (!(hasReservedCallFrame(MF) && MFI.adjustsStack()))
+    FrameSize += MaxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = alignTo(FrameSize, StackAlign);
+
+  // Update frame info.
+  MFI.setStackSize(FrameSize);
+}
+
+// Iterates through each basic block in a machine function and replaces
+// ADJDYNALLOC pseudo instructions with a Lanai:ADDI with the
+// maximum call frame size as the immediate.
+void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const {
+  const LanaiInstrInfo &LII =
+      *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+  unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize();
+
+  for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E;
+       ++MBB) {
+    MachineBasicBlock::iterator MBBI = MBB->begin();
+    while (MBBI != MBB->end()) {
+      MachineInstr &MI = *MBBI++;
+      if (MI.getOpcode() == Lanai::ADJDYNALLOC) {
+        DebugLoc DL = MI.getDebugLoc();
+        unsigned Dst = MI.getOperand(0).getReg();
+        unsigned Src = MI.getOperand(1).getReg();
+
+        BuildMI(*MBB, MI, DL, LII.get(Lanai::ADD_I_LO), Dst)
+            .addReg(Src)
+            .addImm(MaxCallFrameSize);
+        MI.eraseFromParent();
+      }
+    }
+  }
+}
+
+// Generates the following sequence for function entry:
+//   st %fp,-4[*%sp]        !push old FP
+//   add %sp,8,%fp          !generate new FP
+//   sub %sp,0x4,%sp        !allocate stack space (as needed)
+void LanaiFrameLowering::emitPrologue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const LanaiInstrInfo &LII =
+      *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+
+  // Determine the correct frame layout
+  determineFrameLayout(MF);
+
+  // FIXME: This appears to be overallocating.  Needs investigation.
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned StackSize = MFI.getStackSize();
+
+  // Push old FP
+  // st %fp,-4[*%sp]
+  BuildMI(MBB, MBBI, DL, LII.get(Lanai::SW_RI))
+      .addReg(Lanai::FP)
+      .addReg(Lanai::SP)
+      .addImm(-4)
+      .addImm(LPAC::makePreOp(LPAC::ADD))
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Generate new FP
+  // add %sp,8,%fp
+  BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::FP)
+      .addReg(Lanai::SP)
+      .addImm(8)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Allocate space on the stack if needed
+  // sub %sp,StackSize,%sp
+  if (StackSize != 0) {
+    BuildMI(MBB, MBBI, DL, LII.get(Lanai::SUB_I_LO), Lanai::SP)
+        .addReg(Lanai::SP)
+        .addImm(StackSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Replace ADJDYNANALLOC
+  if (MFI.hasVarSizedObjects())
+    replaceAdjDynAllocPseudo(MF);
+}
+
+MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction & /*MF*/, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  // Discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  return MBB.erase(I);
+}
+
+// The function epilogue should not depend on the current stack pointer!
+// It should use the frame pointer only.  This is mandatory because
+// of alloca; we also take advantage of it to omit stack adjustments
+// before returning.
+//
+// Note that when we go to restore the preserved register values we must
+// not try to address their slots by using offsets from the stack pointer.
+// That's because the stack pointer may have been moved during the function
+// execution due to a call to alloca().  Rather, we must restore all
+// preserved registers via offsets from the frame pointer value.
+//
+// Note also that when the current frame is being "popped" (by adjusting
+// the value of the stack pointer) on function exit, we must (for the
+// sake of alloca) set the new value of the stack pointer based upon
+// the current value of the frame pointer.  We can't just add what we
+// believe to be the (static) frame size to the stack pointer because
+// if we did that, and alloca() had been called during this function,
+// we would end up returning *without* having fully deallocated all of
+// the space grabbed by alloca.  If that happened, and a function
+// containing one or more alloca() calls was called over and over again,
+// then the stack would grow without limit!
+//
+// RET is lowered to
+//      ld -4[%fp],%pc  # modify %pc (two delay slots)
+// as the return address is in the stack frame and mov to pc is allowed.
+// emitEpilogue emits
+//      mov %fp,%sp     # restore the stack pointer
+//      ld -8[%fp],%fp  # restore the caller's frame pointer
+// before RET and the delay slot filler will move RET such that these
+// instructions execute in the delay slots of the load to PC.
+void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/,
+                                      MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const LanaiInstrInfo &LII =
+      *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo());
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  // Restore the stack pointer using the callee's frame pointer value.
+  BuildMI(MBB, MBBI, DL, LII.get(Lanai::ADD_I_LO), Lanai::SP)
+      .addReg(Lanai::FP)
+      .addImm(0);
+
+  // Restore the frame pointer from the stack.
+  BuildMI(MBB, MBBI, DL, LII.get(Lanai::LDW_RI), Lanai::FP)
+      .addReg(Lanai::FP)
+      .addImm(-8)
+      .addImm(LPAC::ADD);
+}
+
+void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                              BitVector &SavedRegs,
+                                              RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const LanaiRegisterInfo *LRI =
+      static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo());
+  int Offset = -4;
+
+  // Reserve 4 bytes for the saved RCA
+  MFI.CreateFixedObject(4, Offset, true);
+  Offset -= 4;
+
+  // Reserve 4 bytes for the saved FP
+  MFI.CreateFixedObject(4, Offset, true);
+  Offset -= 4;
+
+  if (LRI->hasBasePointer(MF)) {
+    MFI.CreateFixedObject(4, Offset, true);
+    SavedRegs.reset(LRI->getBaseRegister());
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h
new file mode 100644
index 000000000000..2f9b6c3c158f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiFrameLowering.h
@@ -0,0 +1,57 @@
+//===-- LanaiFrameLowering.h - Define frame lowering for Lanai --*- C++-*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements Lanai-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
+
+#include "Lanai.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class BitVector;
+class LanaiSubtarget;
+
+class LanaiFrameLowering : public TargetFrameLowering {
+private:
+  void determineFrameLayout(MachineFunction &MF) const;
+  void replaceAdjDynAllocPseudo(MachineFunction &MF) const;
+
+protected:
+  const LanaiSubtarget &STI;
+
+public:
+  explicit LanaiFrameLowering(const LanaiSubtarget &Subtarget)
+      : TargetFrameLowering(StackGrowsDown,
+                            /*StackAlignment=*/8,
+                            /*LocalAreaOffset=*/0),
+        STI(Subtarget) {}
+
+  // emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  // the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool hasFP(const MachineFunction & /*MF*/) const override { return true; }
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ed0c99a76ce4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelDAGToDAG.cpp
@@ -0,0 +1,337 @@
+//===-- LanaiISelDAGToDAG.cpp - A dag to dag inst selector for Lanai ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Lanai target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiRegisterInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lanai-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LanaiDAGToDAGISel - Lanai specific code to select Lanai machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class LanaiDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit LanaiDAGToDAGISel(LanaiTargetMachine &TargetMachine)
+      : SelectionDAGISel(TargetMachine) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  // Pass Name
+  StringRef getPassName() const override {
+    return "Lanai DAG->DAG Pattern Instruction Selection";
+  }
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
+
+private:
+// Include the pieces autogenerated from the target description.
+#include "LanaiGenDAGISel.inc"
+
+  // Instruction Selection not handled by the auto-generated tablgen
+  void Select(SDNode *N) override;
+
+  // Support functions for the opcodes of Instruction Selection
+  // not handled by the auto-generated tablgen
+  void selectFrameIndex(SDNode *N);
+
+  // Complex Pattern for address selection.
+  bool selectAddrRi(SDValue Addr, SDValue &Base, SDValue &Offset,
+                    SDValue &AluOp);
+  bool selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2, SDValue &AluOp);
+  bool selectAddrSls(SDValue Addr, SDValue &Offset);
+  bool selectAddrSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+                      SDValue &AluOp);
+
+  // getI32Imm - Return a target constant with the specified value, of type i32.
+  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+  }
+
+private:
+  bool selectAddrRiSpls(SDValue Addr, SDValue &Base, SDValue &Offset,
+                        SDValue &AluOp, bool RiMode);
+};
+
+bool canBeRepresentedAsSls(const ConstantSDNode &CN) {
+  // Fits in 21-bit signed immediate and two low-order bits are zero.
+  return isInt<21>(CN.getSExtValue()) && ((CN.getSExtValue() & 0x3) == 0);
+}
+
+} // namespace
+
+// Helper functions for ComplexPattern used on LanaiInstrInfo
+// Used on Lanai Load/Store instructions.
+bool LanaiDAGToDAGISel::selectAddrSls(SDValue Addr, SDValue &Offset) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+    SDLoc DL(Addr);
+    // Loading from a constant address.
+    if (canBeRepresentedAsSls(*CN)) {
+      int32_t Imm = CN->getSExtValue();
+      Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+      return true;
+    }
+  }
+  if (Addr.getOpcode() == ISD::OR &&
+      Addr.getOperand(1).getOpcode() == LanaiISD::SMALL) {
+    Offset = Addr.getOperand(1).getOperand(0);
+    return true;
+  }
+  return false;
+}
+
+bool LanaiDAGToDAGISel::selectAddrRiSpls(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset, SDValue &AluOp,
+                                         bool RiMode) {
+  SDLoc DL(Addr);
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+    if (RiMode) {
+      // Fits in 16-bit signed immediate.
+      if (isInt<16>(CN->getSExtValue())) {
+        int16_t Imm = CN->getSExtValue();
+        Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+        Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+        AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+        return true;
+      }
+      // Allow SLS to match if the constant doesn't fit in 16 bits but can be
+      // represented as an SLS.
+      if (canBeRepresentedAsSls(*CN))
+        return false;
+    } else {
+      // Fits in 10-bit signed immediate.
+      if (isInt<10>(CN->getSExtValue())) {
+        int16_t Imm = CN->getSExtValue();
+        Offset = CurDAG->getTargetConstant(Imm, DL, CN->getValueType(0));
+        Base = CurDAG->getRegister(Lanai::R0, CN->getValueType(0));
+        AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(
+        FIN->getIndex(),
+        getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+    AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+    return true;
+  }
+
+  // Skip direct calls
+  if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+       Addr.getOpcode() == ISD::TargetGlobalAddress))
+    return false;
+
+  // Address of the form imm + reg
+  ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+  if (AluOperator == ISD::ADD) {
+    AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+    // Addresses of the form FI+const
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if ((RiMode && isInt<16>(CN->getSExtValue())) ||
+          (!RiMode && isInt<10>(CN->getSExtValue()))) {
+        // If the first operand is a FI, get the TargetFI Node
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          Base = CurDAG->getTargetFrameIndex(
+              FIN->getIndex(),
+              getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+        } else {
+          Base = Addr.getOperand(0);
+        }
+
+        Offset = CurDAG->getTargetConstant(CN->getSExtValue(), DL, MVT::i32);
+        return true;
+      }
+  }
+
+  // Let SLS match SMALL instead of RI.
+  if (AluOperator == ISD::OR && RiMode &&
+      Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+    return false;
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  AluOp = CurDAG->getTargetConstant(LPAC::ADD, DL, MVT::i32);
+  return true;
+}
+
+bool LanaiDAGToDAGISel::selectAddrRi(SDValue Addr, SDValue &Base,
+                                     SDValue &Offset, SDValue &AluOp) {
+  return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/true);
+}
+
+bool LanaiDAGToDAGISel::selectAddrSpls(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset, SDValue &AluOp) {
+  return selectAddrRiSpls(Addr, Base, Offset, AluOp, /*RiMode=*/false);
+}
+
+bool LanaiDAGToDAGISel::selectAddrRr(SDValue Addr, SDValue &R1, SDValue &R2,
+                                     SDValue &AluOp) {
+  // if Address is FI, get the TargetFrameIndex.
+  if (Addr.getOpcode() == ISD::FrameIndex)
+    return false;
+
+  // Skip direct calls
+  if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+       Addr.getOpcode() == ISD::TargetGlobalAddress))
+    return false;
+
+  // Address of the form OP + OP
+  ISD::NodeType AluOperator = static_cast<ISD::NodeType>(Addr.getOpcode());
+  LPAC::AluCode AluCode = LPAC::isdToLanaiAluCode(AluOperator);
+  if (AluCode != LPAC::UNKNOWN) {
+    // Skip addresses of the form FI OP const
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<16>(CN->getSExtValue()))
+        return false;
+
+    // Skip addresses with hi/lo operands
+    if (Addr.getOperand(0).getOpcode() == LanaiISD::HI ||
+        Addr.getOperand(0).getOpcode() == LanaiISD::LO ||
+        Addr.getOperand(0).getOpcode() == LanaiISD::SMALL ||
+        Addr.getOperand(1).getOpcode() == LanaiISD::HI ||
+        Addr.getOperand(1).getOpcode() == LanaiISD::LO ||
+        Addr.getOperand(1).getOpcode() == LanaiISD::SMALL)
+      return false;
+
+    // Addresses of the form register OP register
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    AluOp = CurDAG->getTargetConstant(AluCode, SDLoc(Addr), MVT::i32);
+    return true;
+  }
+
+  // Skip addresses with zero offset
+  return false;
+}
+
+bool LanaiDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintCode, std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1, AluOp;
+  switch (ConstraintCode) {
+  default:
+    return true;
+  case InlineAsm::Constraint_m: // memory
+    if (!selectAddrRr(Op, Op0, Op1, AluOp) &&
+        !selectAddrRi(Op, Op0, Op1, AluOp))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(AluOp);
+  return false;
+}
+
+// Select instructions not customized! Used for
+// expanded, promoted and normal instructions
+void LanaiDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    return;
+  }
+
+  // Instruction Selection not handled by the auto-generated tablegen selection
+  // should be handled here.
+  EVT VT = Node->getValueType(0);
+  switch (Opcode) {
+  case ISD::Constant:
+    if (VT == MVT::i32) {
+      ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+      // Materialize zero constants as copies from R0. This allows the coalescer
+      // to propagate these into other instructions.
+      if (ConstNode->isNullValue()) {
+        SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                             SDLoc(Node), Lanai::R0, MVT::i32);
+        return ReplaceNode(Node, New.getNode());
+      }
+      // Materialize all ones constants as copies from R1. This allows the
+      // coalescer to propagate these into other instructions.
+      if (ConstNode->isAllOnesValue()) {
+        SDValue New = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                             SDLoc(Node), Lanai::R1, MVT::i32);
+        return ReplaceNode(Node, New.getNode());
+      }
+    }
+    break;
+  case ISD::FrameIndex:
+    selectFrameIndex(Node);
+    return;
+  default:
+    break;
+  }
+
+  // Select the default instruction
+  SelectCode(Node);
+}
+
+void LanaiDAGToDAGISel::selectFrameIndex(SDNode *Node) {
+  SDLoc DL(Node);
+  SDValue Imm = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+  EVT VT = Node->getValueType(0);
+  SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+  unsigned Opc = Lanai::ADD_I_LO;
+  if (Node->hasOneUse()) {
+    CurDAG->SelectNodeTo(Node, Opc, VT, TFI, Imm);
+    return;
+  }
+  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VT, TFI, Imm));
+}
+
+// createLanaiISelDag - This pass converts a legalized DAG into a
+// Lanai-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createLanaiISelDag(LanaiTargetMachine &TM) {
+  return new LanaiDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
new file mode 100644
index 000000000000..ae7870e07d42
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -0,0 +1,1488 @@
+//===-- LanaiISelLowering.cpp - Lanai DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiISelLowering.h"
+
+#include "Lanai.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "LanaiTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "lanai-lower"
+
+using namespace llvm;
+
+// Limit on number of instructions the lowered multiplication may have before a
+// call to the library function should be generated instead. The threshold is
+// currently set to 14 as this was the smallest threshold that resulted in all
+// constant multiplications being lowered. A threshold of 5 covered all cases
+// except for one multiplication which required 14. mulsi3 requires 16
+// instructions (including the prologue and epilogue but excluding instructions
+// at call site). Until we can inline mulsi3, generating at most 14 instructions
+// will be faster than invoking mulsi3.
+static cl::opt<int> LanaiLowerConstantMulThreshold(
+    "lanai-constant-mul-threshold", cl::Hidden,
+    cl::desc("Maximum number of instruction to generate when lowering constant "
+             "multiplication instead of calling library function [default=14]"),
+    cl::init(14));
+
+LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,
+                                         const LanaiSubtarget &STI)
+    : TargetLowering(TM) {
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &Lanai::GPRRegClass);
+
+  // Compute derived properties from the register classes
+  TRI = STI.getRegisterInfo();
+  computeRegisterProperties(TRI);
+
+  setStackPointerRegisterToSaveRestore(Lanai::SP);
+
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+  setOperationAction(ISD::MUL, MVT::i32, Custom);
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+  setOperationAction(ISD::CTTZ, MVT::i32, Legal);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+
+  // Extended load operations for i1 types must be promoted
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+  }
+
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+
+  // Function alignments (log2)
+  setMinFunctionAlignment(2);
+  setPrefFunctionAlignment(2);
+
+  setJumpIsExpensive(true);
+
+  // TODO: Setting the minimum jump table entries needed before a
+  // switch is transformed to a jump table to 100 to avoid creating jump tables
+  // as this was causing bad performance compared to a large group of if
+  // statements. Re-evaluate this on new benchmarks.
+  setMinimumJumpTableEntries(100);
+
+  // Use fast calling convention for library functions.
+  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+    setLibcallCallingConv(static_cast<RTLIB::Libcall>(I), CallingConv::Fast);
+  }
+
+  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = 8;
+  MaxStoresPerMemmove = 16; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = 8;
+
+  // Booleans always contain 0 or 1.
+  setBooleanContents(ZeroOrOneBooleanContent);
+}
+
+SDValue LanaiTargetLowering::LowerOperation(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::SETCCE:
+    return LowerSETCCE(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:
+    return LowerSRL_PARTS(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+//===----------------------------------------------------------------------===//
+//                       Lanai Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+unsigned LanaiTargetLowering::getRegisterByName(const char *RegName, EVT /*VT*/,
+                                                SelectionDAG & /*DAG*/) const {
+  // Only unallocatable registers should be matched here.
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                     .Case("pc", Lanai::PC)
+                     .Case("sp", Lanai::SP)
+                     .Case("fp", Lanai::FP)
+                     .Case("rr1", Lanai::RR1)
+                     .Case("r10", Lanai::R10)
+                     .Case("rr2", Lanai::RR2)
+                     .Case("r11", Lanai::R11)
+                     .Case("rca", Lanai::RCA)
+                     .Default(0);
+
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+LanaiTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  StringRef Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1)
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    case 'r': // GENERAL_REGS
+      return std::make_pair(0U, &Lanai::GPRRegClass);
+    default:
+      break;
+    }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+// Examine constraint type and operand type and determine a weight value.
+// This object must already have been set up with the operand type
+// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+LanaiTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &Info, const char *Constraint) const {
+  ConstraintWeight Weight = CW_Invalid;
+  Value *CallOperandVal = Info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  // Look at the constraint type.
+  switch (*Constraint) {
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // immediate in the range 0 to 31
+  case 'M': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // signed 26 bit immediate
+  case 'O': // integer zero
+    if (isa<ConstantInt>(CallOperandVal))
+      Weight = CW_Constant;
+    break;
+  default:
+    Weight = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
+    break;
+  }
+  return Weight;
+}
+
+// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+// vector.  If it is invalid, don't add anything to Ops.
+void LanaiTargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  case 'I': // Signed 16 bit constant
+    // If this fails, the parent routine will give an error
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<16>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(C),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'J': // integer zero
+  case 'O':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() == 0) {
+        Result = DAG.getTargetConstant(0, SDLoc(C), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'K': // unsigned 16 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isUInt<16>(C->getZExtValue())) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(C),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'L': // immediate in the range 0 to 31
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 31) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(C),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'M': // signed 32 bit immediate where lower 16 bits are 0
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Val = C->getSExtValue();
+      if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)) {
+        Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'N': // signed 26 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      int64_t Val = C->getSExtValue();
+      if ((Val >= -33554432) && (Val <= 33554431)) {
+        Result = DAG.getTargetConstant(Val, SDLoc(C), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  default:
+    break; // This will fall through to the generic implementation
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "LanaiGenCallingConv.inc"
+
+static unsigned NumFixedArgs;
+static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  // Handle fixed arguments with default CC.
+  // Note: Both the default and fast CC handle VarArg the same and hence the
+  // calling convention of the function is not considered here.
+  if (ValNo < NumFixedArgs) {
+    return CC_Lanai32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+  }
+
+  // Promote i8/i16 args to i32
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  // VarArgs get passed on stack
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+SDValue LanaiTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  switch (CallConv) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return LowerCCCArguments(Chain, CallConv, IsVarArg, Ins, DL, DAG, InVals);
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  }
+}
+
+SDValue LanaiTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+
+  // Lanai target does not yet support tail call optimization.
+  IsTailCall = false;
+
+  switch (CallConv) {
+  case CallingConv::Fast:
+  case CallingConv::C:
+    return LowerCCCCallTo(Chain, Callee, CallConv, IsVarArg, IsTailCall, Outs,
+                          OutVals, Ins, DL, DAG, InVals);
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  }
+}
+
+// LowerCCCArguments - transform physical registers into virtual registers and
+// generate load operations for arguments places on the stack.
+SDValue LanaiTargetLowering::LowerCCCArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  if (CallConv == CallingConv::Fast) {
+    CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32_Fast);
+  } else {
+    CCInfo.AnalyzeFormalArguments(Ins, CC_Lanai32);
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      case MVT::i32: {
+        unsigned VReg = RegInfo.createVirtualRegister(&Lanai::GPRRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, RegVT);
+
+        // If this is an 8/16-bit value, it is really passed promoted to 32
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+        break;
+      }
+      default:
+        DEBUG(dbgs() << "LowerFormalArguments Unhandled argument type: "
+                     << RegVT.getEVTString() << "\n");
+        llvm_unreachable("unhandled argument type");
+      }
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register
+      unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+      // Check that the argument fits in stack slot
+      if (ObjSize > 4) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << EVT(VA.getLocVT()).getEVTString() << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI.CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(DAG.getLoad(
+          VA.getLocVT(), DL, Chain, FIN,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+    }
+  }
+
+  // The Lanai ABI for returning structs by value requires that we copy
+  // the sret argument into rv for the return. Save the argument into
+  // a virtual register so that we can access it from the return points.
+  if (MF.getFunction()->hasStructRetAttr()) {
+    unsigned Reg = LanaiMFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+      LanaiMFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+  }
+
+  if (IsVarArg) {
+    // Record the frame index of the first variable argument
+    // which is a value necessary to VASTART.
+    int FI = MFI.CreateFixedObject(4, CCInfo.getNextStackOffset(), true);
+    LanaiMFI->setVarArgsFrameIndex(FI);
+  }
+
+  return Chain;
+}
+
+SDValue
+LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool IsVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SDLoc &DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Lanai32);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The Lanai ABI for returning structs by value requires that we copy
+  // the sret argument into rv for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into rv.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
+    unsigned Reg = LanaiMFI->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments().");
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
+
+    Chain = DAG.getCopyToReg(Chain, DL, Lanai::RV, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(
+        DAG.getRegister(Lanai::RV, getPointerTy(DAG.getDataLayout())));
+  }
+
+  RetOps[0] = Chain; // Update chain
+
+  unsigned Opc = LanaiISD::RET_FLAG;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // Return Void
+  return DAG.getNode(Opc, DL, MVT::Other,
+                     ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
+}
+
+// LowerCCCCallTo - functions arguments are copied from virtual regs to
+// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue LanaiTargetLowering::LowerCCCCallTo(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg,
+    bool /*IsTailCall*/, const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  NumFixedArgs = 0;
+  if (IsVarArg && G) {
+    const Function *CalleeFn = dyn_cast<Function>(G->getGlobal());
+    if (CalleeFn)
+      NumFixedArgs = CalleeFn->getFunctionType()->getNumParams();
+  }
+  if (NumFixedArgs)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_VarArg);
+  else {
+    if (CallConv == CallingConv::Fast)
+      CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32_Fast);
+    else
+      CCInfo.AnalyzeCallOperands(Outs, CC_Lanai32);
+  }
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // Create local copies for byval args.
+  SmallVector<SDValue, 8> ByValArgs;
+  for (unsigned I = 0, E = Outs.size(); I != E; ++I) {
+    ISD::ArgFlagsTy Flags = Outs[I].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    SDValue Arg = OutVals[I];
+    unsigned Size = Flags.getByValSize();
+    unsigned Align = Flags.getByValAlign();
+
+    int FI = MFI.CreateStackObject(Size, Align, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    SDValue SizeNode = DAG.getConstant(Size, DL, MVT::i32);
+
+    Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
+                          /*IsVolatile=*/false,
+                          /*AlwaysInline=*/false,
+                          /*isTailCall=*/false, MachinePointerInfo(),
+                          MachinePointerInfo());
+    ByValArgs.push_back(FIPtr);
+  }
+
+  Chain = DAG.getCALLSEQ_START(
+      Chain,
+      DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+      DL);
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned I = 0, J = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    SDValue Arg = OutVals[I];
+    ISD::ArgFlagsTy Flags = Outs[I].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    // Use local copy if it is a byval arg.
+    if (Flags.isByVal())
+      Arg = ByValArgs[J++];
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, DL, Lanai::SP,
+                                      getPointerTy(DAG.getDataLayout()));
+
+      SDValue PtrOff =
+          DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+                      DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+      MemOpChains.push_back(
+          DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                        ArrayRef<SDValue>(&MemOpChains[0], MemOpChains.size()));
+
+  SDValue InFlag;
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+                             RegsToPass[I].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  uint8_t OpFlag = LanaiII::MO_NO_FLAG;
+  if (G) {
+    Callee = DAG.getTargetGlobalAddress(
+        G->getGlobal(), DL, getPointerTy(DAG.getDataLayout()), 0, OpFlag);
+  } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(
+        E->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlag);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add a register mask operand representing the call-preserved registers.
+  // TODO: Should return-twice functions be handled?
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+    Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+                                  RegsToPass[I].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(LanaiISD::CALL, DL, NodeTys,
+                      ArrayRef<SDValue>(&Ops[0], Ops.size()));
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(
+      Chain,
+      DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
+      DAG.getConstant(0, DL, getPointerTy(DAG.getDataLayout()), true), InFlag,
+      DL);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals);
+}
+
+// LowerCallResult - Lower the result values of a call into the
+// appropriate copies out of appropriate physical registers.
+SDValue LanaiTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Lanai32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned I = 0; I != RVLocs.size(); ++I) {
+    Chain = DAG.getCopyFromReg(Chain, DL, RVLocs[I].getLocReg(),
+                               RVLocs[I].getValVT(), InFlag)
+                .getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                      Custom Lowerings
+//===----------------------------------------------------------------------===//
+
+static LPCC::CondCode IntCondCCodeToICC(SDValue CC, const SDLoc &DL,
+                                        SDValue &RHS, SelectionDAG &DAG) {
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+
+  // For integer, only the SETEQ, SETNE, SETLT, SETLE, SETGT, SETGE, SETULT,
+  // SETULE, SETUGT, and SETUGE opcodes are used (see CodeGen/ISDOpcodes.h)
+  // and Lanai only supports integer comparisons, so only provide definitions
+  // for them.
+  switch (SetCCOpcode) {
+  case ISD::SETEQ:
+    return LPCC::ICC_EQ;
+  case ISD::SETGT:
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+      if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+        // X > -1 -> X >= 0 -> is_plus(X)
+        RHS = DAG.getConstant(0, DL, RHS.getValueType());
+        return LPCC::ICC_PL;
+      }
+    return LPCC::ICC_GT;
+  case ISD::SETUGT:
+    return LPCC::ICC_UGT;
+  case ISD::SETLT:
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+      if (RHSC->getZExtValue() == 0)
+        // X < 0 -> is_minus(X)
+        return LPCC::ICC_MI;
+    return LPCC::ICC_LT;
+  case ISD::SETULT:
+    return LPCC::ICC_ULT;
+  case ISD::SETLE:
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+      if (RHSC->getZExtValue() == 0xFFFFFFFF) {
+        // X <= -1 -> X < 0 -> is_minus(X)
+        RHS = DAG.getConstant(0, DL, RHS.getValueType());
+        return LPCC::ICC_MI;
+      }
+    return LPCC::ICC_LE;
+  case ISD::SETULE:
+    return LPCC::ICC_ULE;
+  case ISD::SETGE:
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
+      if (RHSC->getZExtValue() == 0)
+        // X >= 0 -> is_plus(X)
+        return LPCC::ICC_PL;
+    return LPCC::ICC_GE;
+  case ISD::SETUGE:
+    return LPCC::ICC_UGE;
+  case ISD::SETNE:
+    return LPCC::ICC_NE;
+  case ISD::SETONE:
+  case ISD::SETUNE:
+  case ISD::SETOGE:
+  case ISD::SETOLE:
+  case ISD::SETOLT:
+  case ISD::SETOGT:
+  case ISD::SETOEQ:
+  case ISD::SETUEQ:
+  case ISD::SETO:
+  case ISD::SETUO:
+    llvm_unreachable("Unsupported comparison.");
+  default:
+    llvm_unreachable("Unknown integer condition code!");
+  }
+}
+
+SDValue LanaiTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(1);
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+  SDValue Flag =
+      DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+  return DAG.getNode(LanaiISD::BR_CC, DL, Op.getValueType(), Chain, Dest,
+                     TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!C)
+    return SDValue();
+
+  int64_t MulAmt = C->getSExtValue();
+  int32_t HighestOne = -1;
+  uint32_t NonzeroEntries = 0;
+  int SignedDigit[32] = {0};
+
+  // Convert to non-adjacent form (NAF) signed-digit representation.
+  // NAF is a signed-digit form where no adjacent digits are non-zero. It is the
+  // minimal Hamming weight representation of a number (on average 1/3 of the
+  // digits will be non-zero vs 1/2 for regular binary representation). And as
+  // the non-zero digits will be the only digits contributing to the instruction
+  // count, this is desirable. The next loop converts it to NAF (following the
+  // approach in 'Guide to Elliptic Curve Cryptography' [ISBN: 038795273X]) by
+  // choosing the non-zero coefficients such that the resulting quotient is
+  // divisible by 2 which will cause the next coefficient to be zero.
+  int64_t E = std::abs(MulAmt);
+  int S = (MulAmt < 0 ? -1 : 1);
+  int I = 0;
+  while (E > 0) {
+    int ZI = 0;
+    if (E % 2 == 1) {
+      ZI = 2 - (E % 4);
+      if (ZI != 0)
+        ++NonzeroEntries;
+    }
+    SignedDigit[I] = S * ZI;
+    if (SignedDigit[I] == 1)
+      HighestOne = I;
+    E = (E - ZI) / 2;
+    ++I;
+  }
+
+  // Compute number of instructions required. Due to differences in lowering
+  // between the different processors this count is not exact.
+  // Start by assuming a shift and a add/sub for every non-zero entry (hence
+  // every non-zero entry requires 1 shift and 1 add/sub except for the first
+  // entry).
+  int32_t InstrRequired = 2 * NonzeroEntries - 1;
+  // Correct possible over-adding due to shift by 0 (which is not emitted).
+  if (std::abs(MulAmt) % 2 == 1)
+    --InstrRequired;
+  // Return if the form generated would exceed the instruction threshold.
+  if (InstrRequired > LanaiLowerConstantMulThreshold)
+    return SDValue();
+
+  SDValue Res;
+  SDLoc DL(Op);
+  SDValue V = Op->getOperand(0);
+
+  // Initialize the running sum. Set the running sum to the maximal shifted
+  // positive value (i.e., largest i such that zi == 1 and MulAmt has V<<i as a
+  // term NAF).
+  if (HighestOne == -1)
+    Res = DAG.getConstant(0, DL, MVT::i32);
+  else {
+    Res = DAG.getNode(ISD::SHL, DL, VT, V,
+                      DAG.getConstant(HighestOne, DL, MVT::i32));
+    SignedDigit[HighestOne] = 0;
+  }
+
+  // Assemble multiplication from shift, add, sub using NAF form and running
+  // sum.
+  for (unsigned int I = 0; I < sizeof(SignedDigit) / sizeof(SignedDigit[0]);
+       ++I) {
+    if (SignedDigit[I] == 0)
+      continue;
+
+    // Shifted multiplicand (v<<i).
+    SDValue Op =
+        DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(I, DL, MVT::i32));
+    if (SignedDigit[I] == 1)
+      Res = DAG.getNode(ISD::ADD, DL, VT, Res, Op);
+    else if (SignedDigit[I] == -1)
+      Res = DAG.getNode(ISD::SUB, DL, VT, Res, Op);
+  }
+  return Res;
+}
+
+SDValue LanaiTargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Carry = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(3);
+  SDLoc DL(Op);
+
+  LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+  SDValue Flag = DAG.getNode(LanaiISD::SUBBF, DL, MVT::Glue, LHS, RHS, Carry);
+  return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Cond = Op.getOperand(2);
+  SDLoc DL(Op);
+
+  LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+  SDValue Flag =
+      DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+  return DAG.getNode(LanaiISD::SETCC, DL, Op.getValueType(), TargetCC, Flag);
+}
+
+SDValue LanaiTargetLowering::LowerSELECT_CC(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TrueV = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  SDValue Cond = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  LPCC::CondCode CC = IntCondCCodeToICC(Cond, DL, RHS, DAG);
+  SDValue TargetCC = DAG.getConstant(CC, DL, MVT::i32);
+  SDValue Flag =
+      DAG.getNode(LanaiISD::SET_FLAG, DL, MVT::Glue, LHS, RHS, TargetCC);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  return DAG.getNode(LanaiISD::SELECT_CC, DL, VTs, TrueV, FalseV, TargetCC,
+                     Flag);
+}
+
+SDValue LanaiTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  LanaiMachineFunctionInfo *FuncInfo = MF.getInfo<LanaiMachineFunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 getPointerTy(DAG.getDataLayout()));
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue LanaiTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+  // Get a reference to the stack pointer.
+  SDValue StackPointer = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i32);
+
+  // Subtract the dynamic size from the actual stack size to
+  // obtain the new stack size.
+  SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i32, StackPointer, Size);
+
+  // For Lanai, the outgoing memory arguments area should be on top of the
+  // alloca area on the stack i.e., the outgoing memory arguments should be
+  // at a lower address than the alloca area. Move the alloca area down the
+  // stack by adding back the space reserved for outgoing arguments to SP
+  // here.
+  //
+  // We do not know what the size of the outgoing args is at this point.
+  // So, we add a pseudo instruction ADJDYNALLOC that will adjust the
+  // stack pointer. We replace this instruction with on that has the correct,
+  // known offset in emitPrologue().
+  SDValue ArgAdjust = DAG.getNode(LanaiISD::ADJDYNALLOC, DL, MVT::i32, Sub);
+
+  // The Sub result contains the new stack start address, so it
+  // must be placed in the stack pointer register.
+  SDValue CopyChain = DAG.getCopyToReg(Chain, DL, SPReg, Sub);
+
+  SDValue Ops[2] = {ArgAdjust, CopyChain};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    const unsigned Offset = -4;
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+                              DAG.getIntPtrConstant(Offset, DL));
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+  }
+
+  // Return the link register, which contains the return address.
+  // Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+SDValue LanaiTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Lanai::FP, VT);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  while (Depth--) {
+    const unsigned Offset = -8;
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
+                              DAG.getIntPtrConstant(Offset, DL));
+    FrameAddr =
+        DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+  }
+  return FrameAddr;
+}
+
+const char *LanaiTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case LanaiISD::ADJDYNALLOC:
+    return "LanaiISD::ADJDYNALLOC";
+  case LanaiISD::RET_FLAG:
+    return "LanaiISD::RET_FLAG";
+  case LanaiISD::CALL:
+    return "LanaiISD::CALL";
+  case LanaiISD::SELECT_CC:
+    return "LanaiISD::SELECT_CC";
+  case LanaiISD::SETCC:
+    return "LanaiISD::SETCC";
+  case LanaiISD::SUBBF:
+    return "LanaiISD::SUBBF";
+  case LanaiISD::SET_FLAG:
+    return "LanaiISD::SET_FLAG";
+  case LanaiISD::BR_CC:
+    return "LanaiISD::BR_CC";
+  case LanaiISD::Wrapper:
+    return "LanaiISD::Wrapper";
+  case LanaiISD::HI:
+    return "LanaiISD::HI";
+  case LanaiISD::LO:
+    return "LanaiISD::LO";
+  case LanaiISD::SMALL:
+    return "LanaiISD::SMALL";
+  default:
+    return NULL;
+  }
+}
+
+SDValue LanaiTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  const Constant *C = N->getConstVal();
+  const LanaiTargetObjectFile *TLOF =
+      static_cast<const LanaiTargetObjectFile *>(
+          getTargetMachine().getObjFileLowering());
+
+  // If the code model is small or constant will be placed in the small section,
+  // then assume address will fit in 21-bits.
+  if (getTargetMachine().getCodeModel() == CodeModel::Small ||
+      TLOF->isConstantInSmallSection(DAG.getDataLayout(), C)) {
+    SDValue Small = DAG.getTargetConstantPool(
+        C, MVT::i32, N->getAlignment(), N->getOffset(), LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  } else {
+    uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+    uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+    SDValue Hi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+                                           N->getOffset(), OpFlagHi);
+    SDValue Lo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+                                           N->getOffset(), OpFlagLo);
+    Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+    Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+    return Result;
+  }
+}
+
+SDValue LanaiTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+  const LanaiTargetObjectFile *TLOF =
+      static_cast<const LanaiTargetObjectFile *>(
+          getTargetMachine().getObjFileLowering());
+
+  // If the code model is small or global variable will be placed in the small
+  // section, then assume address will fit in 21-bits.
+  const GlobalObject *GO = GV->getBaseObject();
+  if (TLOF->isGlobalInSmallSection(GO, getTargetMachine())) {
+    SDValue Small = DAG.getTargetGlobalAddress(
+        GV, DL, getPointerTy(DAG.getDataLayout()), Offset, LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  } else {
+    uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+    uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+    // Create the TargetGlobalAddress node, folding in the constant offset.
+    SDValue Hi = DAG.getTargetGlobalAddress(
+        GV, DL, getPointerTy(DAG.getDataLayout()), Offset, OpFlagHi);
+    SDValue Lo = DAG.getTargetGlobalAddress(
+        GV, DL, getPointerTy(DAG.getDataLayout()), Offset, OpFlagLo);
+    Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+    Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+    return DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+  }
+}
+
+SDValue LanaiTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+  uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+  SDValue Hi = DAG.getBlockAddress(BA, MVT::i32, true, OpFlagHi);
+  SDValue Lo = DAG.getBlockAddress(BA, MVT::i32, true, OpFlagLo);
+  Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+  SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+  return Result;
+}
+
+SDValue LanaiTargetLowering::LowerJumpTable(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // If the code model is small assume address will fit in 21-bits.
+  if (getTargetMachine().getCodeModel() == CodeModel::Small) {
+    SDValue Small = DAG.getTargetJumpTable(
+        JT->getIndex(), getPointerTy(DAG.getDataLayout()), LanaiII::MO_NO_FLAG);
+    return DAG.getNode(ISD::OR, DL, MVT::i32,
+                       DAG.getRegister(Lanai::R0, MVT::i32),
+                       DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
+  } else {
+    uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
+    uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
+
+    SDValue Hi = DAG.getTargetJumpTable(
+        JT->getIndex(), getPointerTy(DAG.getDataLayout()), OpFlagHi);
+    SDValue Lo = DAG.getTargetJumpTable(
+        JT->getIndex(), getPointerTy(DAG.getDataLayout()), OpFlagLo);
+    Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
+    Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Hi, Lo);
+    return Result;
+  }
+}
+
+SDValue LanaiTargetLowering::LowerSHL_PARTS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  assert(Op.getNumOperands() == 3 && "Unexpected SHL!");
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+
+  // Performs the following for (ShOpLo + (ShOpHi << 32)) << ShAmt:
+  //   LoBitsForHi = (ShAmt == 0) ? 0 : (ShOpLo >> (32-ShAmt))
+  //   HiBitsForHi = ShOpHi << ShAmt
+  //   Hi = (ShAmt >= 32) ? (ShOpLo << (ShAmt-32)) : (LoBitsForHi | HiBitsForHi)
+  //   Lo = (ShAmt >= 32) ? 0 : (ShOpLo << ShAmt)
+  //   return (Hi << 32) | Lo;
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+  SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+  // If ShAmt == 0, we just calculated "(SRL ShOpLo, 32)" which is "undef". We
+  // wanted 0, so CSEL it directly.
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue SetCC = DAG.getSetCC(dl, MVT::i32, ShAmt, Zero, ISD::SETEQ);
+  LoBitsForHi = DAG.getSelect(dl, MVT::i32, SetCC, Zero, LoBitsForHi);
+
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, dl, MVT::i32));
+  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
+
+  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  SetCC = DAG.getSetCC(dl, MVT::i32, ExtraShAmt, Zero, ISD::SETGE);
+  SDValue Hi =
+      DAG.getSelect(dl, MVT::i32, SetCC, HiForBigShift, HiForNormalShift);
+
+  // Lanai shifts of larger than register sizes are wrapped rather than
+  // clamped, so we can't just emit "lo << b" if b is too big.
+  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo = DAG.getSelect(
+      dl, MVT::i32, SetCC, DAG.getConstant(0, dl, MVT::i32), LoForNormalShift);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue LanaiTargetLowering::LowerSRL_PARTS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+
+  // Performs the following for a >> b:
+  //   unsigned r_high = a_high >> b;
+  //   r_high = (32 - b <= 0) ? 0 : r_high;
+  //
+  //   unsigned r_low = a_low >> b;
+  //   r_low = (32 - b <= 0) ? r_high : r_low;
+  //   r_low = (b == 0) ? r_low : r_low | (a_high << (32 - b));
+  //   return (unsigned long long)r_high << 32 | r_low;
+  // Note: This takes advantage of Lanai's shift behavior to avoid needing to
+  // mask the shift amount.
+
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue NegatedPlus32 = DAG.getNode(
+      ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
+  SDValue SetCC = DAG.getSetCC(dl, MVT::i32, NegatedPlus32, Zero, ISD::SETLE);
+
+  SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i32, ShOpHi, ShAmt);
+  Hi = DAG.getSelect(dl, MVT::i32, SetCC, Zero, Hi);
+
+  SDValue Lo = DAG.getNode(ISD::SRL, dl, MVT::i32, ShOpLo, ShAmt);
+  Lo = DAG.getSelect(dl, MVT::i32, SetCC, Hi, Lo);
+  SDValue CarryBits =
+      DAG.getNode(ISD::SHL, dl, MVT::i32, ShOpHi, NegatedPlus32);
+  SDValue ShiftIsZero = DAG.getSetCC(dl, MVT::i32, ShAmt, Zero, ISD::SETEQ);
+  Lo = DAG.getSelect(dl, MVT::i32, ShiftIsZero, Lo,
+                     DAG.getNode(ISD::OR, dl, MVT::i32, Lo, CarryBits));
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Helper function that checks if N is a null or all ones constant.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
+}
+
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+//   (select cc 0, y)   [AllOnes=0]
+//   (select cc y, 0)   [AllOnes=0]
+//   (zext cc)          [AllOnes=0]
+//   (sext cc)          [AllOnes=0/1]
+//   (select cc -1, y)  [AllOnes=1]
+//   (select cc y, -1)  [AllOnes=1]
+//
+// * AllOnes determines whether to check for an all zero (AllOnes false) or an
+//   all ones operand (AllOnes true).
+// * Invert is set when N is the all zero/ones constant when CC is false.
+// * OtherOp is set to the alternative value of N.
+//
+// For example, for (select cc X, Y) and AllOnes = 0 if:
+// * X = 0, Invert = False and OtherOp = Y
+// * Y = 0, Invert = True and OtherOp = X
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC,
+                                       bool &Invert, SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case ISD::SELECT: {
+    CC = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    if (isZeroOrAllOnes(N1, AllOnes)) {
+      Invert = false;
+      OtherOp = N2;
+      return true;
+    }
+    if (isZeroOrAllOnes(N2, AllOnes)) {
+      Invert = true;
+      OtherOp = N1;
+      return true;
+    }
+    return false;
+  }
+  case ISD::ZERO_EXTEND: {
+    // (zext cc) can never be the all ones value.
+    if (AllOnes)
+      return false;
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+    OtherOp = DAG.getConstant(1, dl, VT);
+    Invert = true;
+    return true;
+  }
+  case ISD::SIGN_EXTEND: {
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    SDLoc dl(N);
+    EVT VT = N->getValueType(0);
+    Invert = !AllOnes;
+    if (AllOnes)
+      // When looking for an AllOnes constant, N is an sext, and the 'other'
+      // value is 0.
+      OtherOp = DAG.getConstant(0, dl, VT);
+    else
+      OtherOp =
+          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT);
+    return true;
+  }
+  }
+}
+
+// Combine a constant select operand into its use:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   bool AllOnes) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now know to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal =
+      DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+  if (SwapSelectOps)
+    std::swap(TrueVal, FalseVal);
+
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static SDValue
+combineSelectAndUseCommutative(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                               bool AllOnes) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
+      return Result;
+  if (N1.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
+      return Result;
+  return SDValue();
+}
+
+// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (N1.getNode()->hasOneUse())
+    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, /*AllOnes=*/false))
+      return Result;
+
+  return SDValue();
+}
+
+SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::OR:
+  case ISD::XOR:
+    return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/false);
+  case ISD::AND:
+    return combineSelectAndUseCommutative(N, DCI, /*AllOnes=*/true);
+  case ISD::SUB:
+    return PerformSUBCombine(N, DCI);
+  }
+
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
new file mode 100644
index 000000000000..c2fba4f9d167
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
@@ -0,0 +1,149 @@
+//===-- LanaiISelLowering.h - Lanai DAG Lowering Interface -....-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Lanai uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+#define LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
+
+#include "Lanai.h"
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace LanaiISD {
+enum {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+  ADJDYNALLOC,
+
+  // Return with a flag operand. Operand 0 is the chain operand.
+  RET_FLAG,
+
+  // CALL - These operations represent an abstract call instruction, which
+  // includes a bunch of information.
+  CALL,
+
+  // SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
+  // is condition code and operand 4 is flag operand.
+  SELECT_CC,
+
+  // SETCC - Store the conditional code to a register.
+  SETCC,
+
+  // SET_FLAG - Set flag compare.
+  SET_FLAG,
+
+  // SUBBF - Subtract with borrow that sets flags.
+  SUBBF,
+
+  // BR_CC - Used to glue together a conditional branch and comparison
+  BR_CC,
+
+  // Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+  // and TargetGlobalAddress.
+  Wrapper,
+
+  // Get the Higher/Lower 16 bits from a 32-bit immediate.
+  HI,
+  LO,
+
+  // Small 21-bit immediate in global memory.
+  SMALL
+};
+} // namespace LanaiISD
+
+class LanaiSubtarget;
+
+class LanaiTargetLowering : public TargetLowering {
+public:
+  LanaiTargetLowering(const TargetMachine &TM, const LanaiSubtarget &STI);
+
+  // LowerOperation - Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  // getTargetNodeName - This method returns the name of a target specific
+  // DAG node.
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+  unsigned getRegisterByName(const char *RegName, EVT VT,
+                             SelectionDAG &DAG) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+                                 const char *Constraint) const override;
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+private:
+  SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                         CallingConv::ID CallConv, bool IsVarArg,
+                         bool IsTailCall,
+                         const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         const SmallVectorImpl<SDValue> &OutVals,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const;
+
+  SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+                            bool IsVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &DL, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool IsVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          const SDLoc &DL, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals) const;
+
+  SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+
+  const LanaiRegisterInfo *TRI;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
new file mode 100644
index 000000000000..30289ea4ac0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrFormats.td
@@ -0,0 +1,561 @@
+//===- LanaiInstrFormats.td - Lanai Instruction Formats ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstLanai<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+  let Size = 4;
+
+  let Namespace = "Lanai";
+  let DecoderNamespace = "Lanai";
+
+  bits<4> Opcode;
+  let Inst{31 - 28} = Opcode;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+//------------------------------------------------------------------------------
+// Register Immediate (RI)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |0.A.A.A| . . . . | . . . . |F.H| . . . . . . . . . . . . . . . |
+//           -----------------------------------------------------------------
+//            opcode     Rd        Rs1                constant (16)
+//
+// Action:
+//           Rd <- Rs1 op constant
+//
+// Except for shift instructions, `H' determines whether the constant
+// is in the high (1) or low (0) word.  The other halfword is 0x0000,
+// except for the `AND' instruction (`AAA' = 100), for which the other
+// halfword is 0xFFFF, and shifts (`AAA' = 111), for which the constant is
+// sign extended.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `AAA' specifies the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or `shift'
+// (111).  For the shift, `H' specifies a logical (0) or arithmetic (1)
+// shift.  The amount and direction of the shift are determined by the
+// sign extended constant interpreted as a two's complement number.  The
+// shift operation is defined only for the range of:
+//      31 ... 0 -1 ... -31
+//      \      / \        /
+//        left     right
+//        shift    shift
+//
+// If and only if the `F' bit is 1, RI instructions modify the
+// condition bits, `Z' (Zero), `N' (Negative), `V' (oVerflow), and `C'
+// (Carry), according to the result.  If the flags are updated, they are
+// updated as follows:
+// `Z'
+//      is set if the result is zero and cleared otherwise.
+//
+// `N'
+//      is set to the most significant bit of the result.
+//
+// `V'
+//      For arithmetic instructions (`add', `addc', `sub', `subb') `V' is
+//      set if the sign (most significant) bits of the input operands are
+//      the same but different from the sign bit of the result and cleared
+//      otherwise.  For other RI instructions, `V' is cleared.
+//
+// `C'
+//      For arithmetic instructions, `C' is set/cleared if there is/is_not
+//      a carry generated out of the most significant when performing the
+//      twos-complement addition (`sub(a,b) == a + ~b + 1', `subb(a,b) ==
+//      a + ~b + `C'').  For left shifts, `C' is set to the least
+//      significant bit discarded by the shift operation.  For all other
+//      operations, `C' is cleared.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+//
+// The all-0s word is the instruction `R0 <- R0 + 0', which is a no-op.
+class InstRI<bits<3> op, dag outs, dag ins, string asmstr,
+             list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+  let Itinerary = IIC_ALU;
+  bits<5> Rd;
+  bits<5> Rs1;
+  bit F;
+  bit H;
+  bits<16> imm16;
+
+  let Opcode{3} = 0;
+  let Opcode{2 - 0} = op;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17} = F;
+  let Inst{16} = H;
+  let Inst{15 - 0} = imm16;
+}
+
+//------------------------------------------------------------------------------
+// Register Register (RR)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.0.0| . . . . | . . . . |F.I| . . . . |B.B.B|J.J.J.J.J|D.D.D|
+//           -----------------------------------------------------------------
+//            opcode     Rd        Rs1           Rs2   \       operation     /
+//
+// Action:
+//           `Rd <- Rs1 op Rs2' iff condition DDDI is true.
+//
+// `DDDI' is as described for the BR instruction.
+//
+// `F' determines whether the instruction modifies (1) or does not
+// modify (0) the program flags.
+//
+// `BBB' determines the operation: `add' (000), `addc' (001), `sub'
+// (010), `subb' (011), `and' (100), `or' (101), `xor' (110), or "special"
+// (111).  The `JJJJJ' field is irrelevant except for special.
+//
+// `JJJJJ' determines which special operation is performed.  `10---'
+// is a logical shift, and `11---' is an arithmetic shift, and ‘00000` is
+// the SELECT operation.  The amount and direction of the shift are
+// determined by the contents of `Rs2' interpreted as a two's complement
+// number (in the same way as shifts in the Register-Immediate
+// instructions in *Note RI::).  For the SELECT operation, Rd gets Rs1 if
+// condition DDDI is true, Rs2 otherwise. All other `JJJJJ' combinations
+// are reserved for instructions that may be defined in the future.
+//
+// If the `F' bit is 1, RR instructions modify the condition bits, `Z'
+// (Zero), `N' (Negative), `V' (oVerflow), and `C' (Carry), according to
+// the result.  All RR instructions modify the `Z', `N', and `V' flags.
+// Except for arithmetic instructions (`add', `addc', `sub', `subb'), `V'
+// is cleared.  Only arithmetic instructions and shifts modify `C'. Right
+// shifts clear C.
+//
+// DDDI is as described in the table for the BR instruction and only used for
+// the select instruction.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has one shadow.
+class InstRR<bits<3> op, dag outs, dag ins, string asmstr,
+             list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern>, Sched<[WriteALU]> {
+  let Itinerary = IIC_ALU;
+  bits<5> Rd;
+  bits<5> Rs1;
+  bits<5> Rs2;
+  bit F;
+  bits<4> DDDI;
+  bits<5> JJJJJ;
+
+  let Opcode = 0b1100;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17} = F;
+  let Inst{16} = DDDI{0};
+  let Inst{15 - 11} = Rs2;
+  let Inst{10 - 8} = op;
+  let Inst{7 - 3} = JJJJJ;
+  let Inst{2 - 0} = DDDI{3 - 1};
+}
+
+//------------------------------------------------------------------------------
+// Register Memory (RM)
+//------------------------------------------------------------------------------
+// Encoding:
+//          -----------------------------------------------------------------
+//          |1.0.0.S| . . . . | . . . . |P.Q| . . . . . . . . . . . . . . . |
+//          -----------------------------------------------------------------
+//           opcode     Rd        Rs1                 constant (16)
+//
+// Action:
+//        Rd <- Memory(ea)      (Load)    see below for the
+//        Memory(ea) <- Rd      (Store)   definition of ea.
+//
+// `S' determines whether the instruction is a Load (0) or a Store (1).
+// Loads appear in Rd one cycle after this instruction executes.  If the
+// following instruction reads Rd, that instruction will be delayed by 1
+// clock cycle.
+//
+//   PQ      operation
+//   --      ------------------------------------------
+//   00      ea = Rs1
+//   01      ea = Rs1,             Rs1 <- Rs1 + constant
+//   10      ea = Rs1 + constant
+//   11      ea = Rs1 + constant,  Rs1 <- Rs1 + constant
+//
+// The constant is sign-extended for this instruction.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRM<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<5> Rd;
+  bits<5> Rs1;
+  bit P;
+  bit Q;
+  bits<16> imm16;
+  // Dummy variables to allow multiclass definition of RM and RRM
+  bits<2> YL;
+  bit E;
+
+  let Opcode{3 - 1} = 0b100;
+  let Opcode{0} = S;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17} = P;
+  let Inst{16} = Q;
+  let Inst{15 - 0} = imm16;
+
+  let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Register Register Memory (RRM)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.0.1.S| . . . . | . . . . |P.Q| . . . . |B.B.B|J.J.J.J.J|Y.L.E|
+//           -----------------------------------------------------------------
+//            opcode     Rd        Rs1           Rs2   \       operation     /
+//
+// Action:
+//           Rd <- Memory(ea)      (Load)    see below for the
+//           Memory(ea) <- Rd      (Store)   definition of ea.
+//
+// The RRM instruction is identical to the RM (*note RM::.) instruction
+// except that:
+//
+// 1. `Rs1 + constant' is replaced with `Rs1 op Rs2', where `op' is
+//    determined in the same way as in the RR instruction (*note RR::.)
+//    and
+//
+// 2. part-word memory accesses are allowed as specified below.
+//
+//    If `BBB' != 111 (i.e.: For all but shift operations):
+//        If `YLE' = 01- => fuLl-word memory access
+//        If `YLE' = 00- => half-word memory access
+//        If `YLE' = 10- => bYte memory access
+//        If `YLE' = --1 => loads are zEro extended
+//        If `YLE' = --0 => loads are sign extended
+//
+//    If `BBB' = 111 (For shift operations):
+//        fullword memory access are performed.
+//
+// All part-word loads write the least significant part of the
+// destination register with the higher-order bits zero- or sign-extended.
+// All part-word stores store the least significant part-word of the
+// source register in the destination memory location.
+//
+// A Jump is accomplished by `Rd' being `pc', and it has *two* delay slots.
+class InstRRM<bit S, dag outs, dag ins, string asmstr,
+              list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<5> Rd;
+  bits<5> Rs1;
+  bits<5> Rs2;
+  bit P;
+  bit Q;
+  bits<3> BBB;
+  bits<5> JJJJJ;
+  bits<2> YL;
+  bit E;
+
+  let Opcode{3 - 1} = 0b101;
+  let Opcode{0} = S;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17} = P;
+  let Inst{16} = Q;
+  let Inst{15 - 11} = Rs2;
+  let Inst{10 - 8} = BBB;
+  let Inst{7 - 3} = JJJJJ;
+  let Inst{2 - 1} = YL;
+  let Inst{0} = E;
+
+  let PostEncoderMethod = "adjustPqBitsRmAndRrm";
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch (BR)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.1.0|D.D.D| . . . . . . . . . . . . . . . . . . . . . . |0.I|
+//           -----------------------------------------------------------------
+//            opcode condition                   constant (23)
+//
+// Action:
+//            if (condition) { `pc' <- 4*(zero-extended constant) }
+//
+// The BR instruction is an absolute branch.
+// The constant is scaled as shown by its position in the instruction word such
+// that it specifies word-aligned addresses in the range [0,2^25-4]
+//
+// The `DDDI' field selects the condition that causes the branch to be taken.
+// (the `I' (Invert sense) bit inverts the sense of the condition):
+//
+//   DDDI  logical function                        [code, used for...]
+//   ----  --------------------------------------  ------------------------
+//   0000  1                                       [T, true]
+//   0001  0                                       [F, false]
+//   0010  C AND Z'                                [HI, high]
+//   0011  C' OR Z                                 [LS, low or same]
+//   0100  C'                                      [CC, carry cleared]
+//   0101  C                                       [CS, carry set]
+//   0110  Z'                                      [NE, not equal]
+//   0111  Z                                       [EQ, equal]
+//   1000  V'                                      [VC, oVerflow cleared]
+//   1001  V                                       [VS, oVerflow set]
+//   1010  N'                                      [PL, plus]
+//   1011  N                                       [MI, minus]
+//   1100  (N AND V) OR (N' AND V')                [GE, greater than or equal]
+//   1101  (N AND V') OR (N' AND V)                [LT, less than]
+//   1110  (N AND V AND Z') OR (N' AND V' AND Z')  [GT, greater than]
+//   1111  (Z) OR (N AND V') OR (N' AND V)         [LE, less than or equal]
+//
+// If the branch is not taken, the BR instruction is a no-op.  If the branch is
+// taken, the processor starts executing instructions at the branch target
+// address *after* the processor has executed one more instruction.  That is,
+// the branch has one “branch delay slot”.  Be very careful if you find yourself
+// wanting to put a branch in a branch delays slot!
+class InstBR<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  let Itinerary = IIC_ALU;
+  bits<25> addr;
+  bits<4> DDDI;
+
+  let Opcode = 0b1110;
+  let Inst{27 - 25} = DDDI{3 - 1};
+  let Inst{24 - 0} = addr;
+  // These instructions overwrite the last two address bits (which are assumed
+  // and ensured to be 0).
+  let Inst{1} = 0;
+  let Inst{0} = DDDI{0};
+}
+
+//------------------------------------------------------------------------------
+// Conditional Branch Relative (BRR)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.1.0|D.D.D|1|-| . . . . |-.-| . . . . . . . . . . . . . |1.I|
+//           -----------------------------------------------------------------
+//            opcode condition     Rs1           constant (14)
+// Action:
+//           if (condition) { ‘pc’ <- Rs1 + 4*sign-extended constant) }
+//
+// BRR behaves like BR, except the branch target address is a 16-bit PC relative
+// offset.
+class InstBRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<4> DDDI;
+  bits<5> Rs1;
+  bits<16> imm16;
+
+  let Opcode = 0b1110;
+  let Inst{27 - 25} = DDDI{3 - 1};
+  let Inst{24} = 1;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17 - 16} = 0;
+  let Inst{15 - 0} = imm16;
+  // Overwrite last two bits which have to be zero
+  let Inst{1} = 1;
+  let Inst{0} = DDDI{0};
+
+  // Set don't cares to zero
+  let Inst{23} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Conditional Set (SCC)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.1.0|D.D.D|0.-| . . . . |-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-|1.I|
+//           -----------------------------------------------------------------
+//            opcode condition     Rs1
+//
+// Action:
+//       Rs1 <- logical function result
+//
+// SCC sets dst_reg to the boolean result of computing the logical function
+// specified by DDDI, as described in the table for the BR instruction.
+class InstSCC<dag outs, dag ins, string asmstr,
+              list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  let Itinerary = IIC_ALU;
+  bits<5> Rs1; // dst_reg in documentation
+  bits<4> DDDI;
+
+  let Opcode = 0b1110;
+  let Inst{27 - 25} = DDDI{3 - 1};
+  let Inst{24} = 0;
+  let Inst{22 - 18} = Rs1;
+  let Inst{1} = 1;
+  let Inst{0} = DDDI{0};
+
+  // Set don't cares to zero
+  let Inst{23} = 0;
+  let Inst{17 - 2} = 0;
+}
+
+//------------------------------------------------------------------------------
+// Special Load/Store (SLS)
+//------------------------------------------------------------------------------
+//
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.1.1| . . . . | . . . . |0.S| . . . . . . . . . . . . . . . |
+//           -----------------------------------------------------------------
+//            opcode     Rd    addr 5msb's            address 16 lsb's
+//
+// Action:
+//           If S = 0 (LOAD):   Rd <- Memory(address);
+//           If S = 1 (STORE):  Memory(address) <- Rd
+//
+// The timing is the same as for RM (*note RM::.) and RRM (*note
+// RRM::.) instructions.  The two low-order bits of the 21-bit address are
+// ignored.  The address is zero extended.  Fullword memory accesses are
+// performed.
+class InstSLS<bit S, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<5> Rd;
+  bits<5> msb;
+  bits<16> lsb;
+
+  let Opcode = 0b1111;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = msb;
+  let Inst{17} = 0;
+  let Inst{16} = S;
+  let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Load Immediate (SLI)
+//------------------------------------------------------------------------------
+// Encoding:
+//           -----------------------------------------------------------------
+//           |1.1.1.1| . . . . | . . . . |1.0| . . . . . . . . . . . . . . . |
+//           -----------------------------------------------------------------
+//            opcode     Rd    const 5msb's          constant 16 lsb's
+//
+// Action:
+//           Rd <- constant
+//
+// The 21-bit constant is zero-extended.  The timing is the same as the
+// RM instruction (*note RM::.).
+class InstSLI<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<5> Rd;
+  bits<5> msb;
+  bits<16> lsb;
+
+  let Opcode = 0b1111;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = msb;
+  let Inst{17} = 1;
+  let Inst{16} = 0;
+  let Inst{15 - 0} = lsb;
+}
+
+//------------------------------------------------------------------------------
+// Special Part-Word Load/Store (SPLS)
+//------------------------------------------------------------------------------
+// Encoding:
+//        -----------------------------------------------------------------
+//        |1.1.1.1| . . . . | . . . . |1.1.0.Y.S.E.P.Q| . . . . . . . . . |
+//        -----------------------------------------------------------------
+//         opcode     Rd        Rs1                       constant (10)
+//
+// Action:
+//        If `YS' = 11  (bYte     Store):
+//             Memory(ea) <- (least significant byte of Rr)
+//        If `YS' = 01  (halfword Store):
+//             Memory(ea) <- (least significant half-word of Rr)
+//        If `YS' = 10  (bYte     load):  Rr <- Memory(ea)
+//        If `YS' = 00  (halfword load):  Rr <- Memory(ea)
+//             [Note: here ea is determined as in the the RM instruction. ]
+//        If `SE' = 01 then the value is zEro extended
+//             before being loaded into Rd.
+//        If `SE' = 00 then the value is sign extended
+//             before being loaded into Rd.
+//
+// `P' and `Q' are used to determine `ea' as in the RM instruction. The
+// constant is sign extended.  The timing is the same as the RM and RRM
+// instructions.  *Note RM:: and *Note RRM::.
+//
+// All part-word loads write the part-word into the least significant
+// part of the destination register, with the higher-order bits zero- or
+// sign-extended.  All part-word stores store the least significant
+// part-word of the source register into the destination memory location.
+class InstSPLS<dag outs, dag ins, string asmstr,
+               list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  bits<5> Rd;
+  bits<5> Rs1;
+  bits<5> msb;
+  bit Y;
+  bit S;
+  bit E;
+  bit P;
+  bit Q;
+  bits<10> imm10;
+
+  let Opcode = 0b1111;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17 - 15} = 0b110;
+  let Inst{14} = Y;
+  let Inst{13} = S;
+  let Inst{12} = E;
+  let Inst{11} = P;
+  let Inst{10} = Q;
+  let Inst{9 - 0} = imm10;
+
+  let PostEncoderMethod = "adjustPqBitsSpls";
+}
+
+//------------------------------------------------------------------------------
+// Special instructions (popc, leadz, trailz)
+//------------------------------------------------------------------------------
+// Encoding:
+//         -----------------------------------------------------------------
+//         |1.1.0.1|    Rd   |   Rs1   |F.-| . . . . | . . | . . . . | OP  |
+//         -----------------------------------------------------------------
+//          opcode      Rd       Rs1
+// Action:
+//         Rd <- Perform action encoded in OP on Rs1
+//   OP is one of:
+//      0b001 POPC   Population count;
+//      0b010 LEADZ  Count number of leading zeros;
+//      0b011 TRAILZ Count number of trailing zeros;
+class InstSpecial<bits<3> op, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : InstLanai<outs, ins, asmstr,
+                  pattern>, Sched<[WriteALU]> {
+  let Itinerary = IIC_ALU;
+  bit F;
+  bits<5> Rd;
+  bits<5> Rs1;
+
+  let Opcode = 0b1101;
+  let Inst{27 - 23} = Rd;
+  let Inst{22 - 18} = Rs1;
+  let Inst{17} = F;
+  let Inst{16 - 3} = 0;
+  let Inst{2 - 0} = op;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstLanai<outs, ins, asmstr, pattern> {
+  let Inst{15 - 0} = 0;
+  let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
new file mode 100644
index 000000000000..fcd5da876b15
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -0,0 +1,808 @@
+//===-- LanaiInstrInfo.cpp - Lanai Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiMachineFunctionInfo.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "LanaiGenInstrInfo.inc"
+
+LanaiInstrInfo::LanaiInstrInfo()
+    : LanaiGenInstrInfo(Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP),
+      RegisterInfo() {}
+
+void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator Position,
+                                 const DebugLoc &DL,
+                                 unsigned DestinationRegister,
+                                 unsigned SourceRegister,
+                                 bool KillSource) const {
+  if (!Lanai::GPRRegClass.contains(DestinationRegister, SourceRegister)) {
+    llvm_unreachable("Impossible reg-to-reg copy");
+  }
+
+  BuildMI(MBB, Position, DL, get(Lanai::OR_I_LO), DestinationRegister)
+      .addReg(SourceRegister, getKillRegState(KillSource))
+      .addImm(0);
+}
+
+void LanaiInstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+    unsigned SourceRegister, bool IsKill, int FrameIndex,
+    const TargetRegisterClass *RegisterClass,
+    const TargetRegisterInfo * /*RegisterInfo*/) const {
+  DebugLoc DL;
+  if (Position != MBB.end()) {
+    DL = Position->getDebugLoc();
+  }
+
+  if (!Lanai::GPRRegClass.hasSubClassEq(RegisterClass)) {
+    llvm_unreachable("Can't store this register to stack slot");
+  }
+  BuildMI(MBB, Position, DL, get(Lanai::SW_RI))
+      .addReg(SourceRegister, getKillRegState(IsKill))
+      .addFrameIndex(FrameIndex)
+      .addImm(0)
+      .addImm(LPAC::ADD);
+}
+
+void LanaiInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+    unsigned DestinationRegister, int FrameIndex,
+    const TargetRegisterClass *RegisterClass,
+    const TargetRegisterInfo * /*RegisterInfo*/) const {
+  DebugLoc DL;
+  if (Position != MBB.end()) {
+    DL = Position->getDebugLoc();
+  }
+
+  if (!Lanai::GPRRegClass.hasSubClassEq(RegisterClass)) {
+    llvm_unreachable("Can't load this register from stack slot");
+  }
+  BuildMI(MBB, Position, DL, get(Lanai::LDW_RI), DestinationRegister)
+      .addFrameIndex(FrameIndex)
+      .addImm(0)
+      .addImm(LPAC::ADD);
+}
+
+bool LanaiInstrInfo::areMemAccessesTriviallyDisjoint(
+    MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis * /*AA*/) const {
+  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
+  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
+
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
+      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
+    return false;
+
+  // Retrieve the base register, offset from the base register and width. Width
+  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4).  If
+  // base registers are identical, and the offset of a lower memory access +
+  // the width doesn't overlap the offset of a higher memory access,
+  // then the memory accesses are different.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned BaseRegA = 0, BaseRegB = 0;
+  int64_t OffsetA = 0, OffsetB = 0;
+  unsigned int WidthA = 0, WidthB = 0;
+  if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+      getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+    if (BaseRegA == BaseRegB) {
+      int LowOffset = std::min(OffsetA, OffsetB);
+      int HighOffset = std::max(OffsetA, OffsetB);
+      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+      if (LowOffset + LowWidth <= HighOffset)
+        return true;
+    }
+  }
+  return false;
+}
+
+bool LanaiInstrInfo::expandPostRAPseudo(MachineInstr & /*MI*/) const {
+  return false;
+}
+
+static LPCC::CondCode getOppositeCondition(LPCC::CondCode CC) {
+  switch (CC) {
+  case LPCC::ICC_T: //  true
+    return LPCC::ICC_F;
+  case LPCC::ICC_F: //  false
+    return LPCC::ICC_T;
+  case LPCC::ICC_HI: //  high
+    return LPCC::ICC_LS;
+  case LPCC::ICC_LS: //  low or same
+    return LPCC::ICC_HI;
+  case LPCC::ICC_CC: //  carry cleared
+    return LPCC::ICC_CS;
+  case LPCC::ICC_CS: //  carry set
+    return LPCC::ICC_CC;
+  case LPCC::ICC_NE: //  not equal
+    return LPCC::ICC_EQ;
+  case LPCC::ICC_EQ: //  equal
+    return LPCC::ICC_NE;
+  case LPCC::ICC_VC: //  oVerflow cleared
+    return LPCC::ICC_VS;
+  case LPCC::ICC_VS: //  oVerflow set
+    return LPCC::ICC_VC;
+  case LPCC::ICC_PL: //  plus (note: 0 is "minus" too here)
+    return LPCC::ICC_MI;
+  case LPCC::ICC_MI: //  minus
+    return LPCC::ICC_PL;
+  case LPCC::ICC_GE: //  greater than or equal
+    return LPCC::ICC_LT;
+  case LPCC::ICC_LT: //  less than
+    return LPCC::ICC_GE;
+  case LPCC::ICC_GT: //  greater than
+    return LPCC::ICC_LE;
+  case LPCC::ICC_LE: //  less than or equal
+    return LPCC::ICC_GT;
+  default:
+    llvm_unreachable("Invalid condtional code");
+  }
+}
+
+std::pair<unsigned, unsigned>
+LanaiInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+LanaiInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  using namespace LanaiII;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_ABS_HI, "lanai-hi"},
+      {MO_ABS_LO, "lanai-lo"},
+      {MO_NO_FLAG, "lanai-nf"}};
+  return makeArrayRef(TargetFlags);
+}
+
+bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                    unsigned &SrcReg2, int &CmpMask,
+                                    int &CmpValue) const {
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Lanai::SFSUB_F_RI_LO:
+  case Lanai::SFSUB_F_RI_HI:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI.getOperand(1).getImm();
+    return true;
+  case Lanai::SFSUB_F_RR:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = MI.getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+
+  return false;
+}
+
+// isRedundantFlagInstr - check whether the first instruction, whose only
+// purpose is to update flags, can be made redundant.
+// * SFSUB_F_RR can be made redundant by SUB_RI if the operands are the same.
+// * SFSUB_F_RI can be made redundant by SUB_I if the operands are the same.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if (CmpI->getOpcode() == Lanai::SFSUB_F_RR &&
+      OI->getOpcode() == Lanai::SUB_R &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if (((CmpI->getOpcode() == Lanai::SFSUB_F_RI_LO &&
+        OI->getOpcode() == Lanai::SUB_I_LO) ||
+       (CmpI->getOpcode() == Lanai::SFSUB_F_RI_HI &&
+        OI->getOpcode() == Lanai::SUB_I_HI)) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+inline static unsigned flagSettingOpcodeVariant(unsigned OldOpcode) {
+  switch (OldOpcode) {
+  case Lanai::ADD_I_HI:
+    return Lanai::ADD_F_I_HI;
+  case Lanai::ADD_I_LO:
+    return Lanai::ADD_F_I_LO;
+  case Lanai::ADD_R:
+    return Lanai::ADD_F_R;
+  case Lanai::ADDC_I_HI:
+    return Lanai::ADDC_F_I_HI;
+  case Lanai::ADDC_I_LO:
+    return Lanai::ADDC_F_I_LO;
+  case Lanai::ADDC_R:
+    return Lanai::ADDC_F_R;
+  case Lanai::AND_I_HI:
+    return Lanai::AND_F_I_HI;
+  case Lanai::AND_I_LO:
+    return Lanai::AND_F_I_LO;
+  case Lanai::AND_R:
+    return Lanai::AND_F_R;
+  case Lanai::OR_I_HI:
+    return Lanai::OR_F_I_HI;
+  case Lanai::OR_I_LO:
+    return Lanai::OR_F_I_LO;
+  case Lanai::OR_R:
+    return Lanai::OR_F_R;
+  case Lanai::SL_I:
+    return Lanai::SL_F_I;
+  case Lanai::SRL_R:
+    return Lanai::SRL_F_R;
+  case Lanai::SA_I:
+    return Lanai::SA_F_I;
+  case Lanai::SRA_R:
+    return Lanai::SRA_F_R;
+  case Lanai::SUB_I_HI:
+    return Lanai::SUB_F_I_HI;
+  case Lanai::SUB_I_LO:
+    return Lanai::SUB_F_I_LO;
+  case Lanai::SUB_R:
+    return Lanai::SUB_F_R;
+  case Lanai::SUBB_I_HI:
+    return Lanai::SUBB_F_I_HI;
+  case Lanai::SUBB_I_LO:
+    return Lanai::SUBB_F_I_LO;
+  case Lanai::SUBB_R:
+    return Lanai::SUBB_F_R;
+  case Lanai::XOR_I_HI:
+    return Lanai::XOR_F_I_HI;
+  case Lanai::XOR_I_LO:
+    return Lanai::XOR_F_I_LO;
+  case Lanai::XOR_R:
+    return Lanai::XOR_F_R;
+  default:
+    return Lanai::NOP;
+  }
+}
+
+bool LanaiInstrInfo::optimizeCompareInstr(
+    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int /*CmpMask*/,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI)
+    return false;
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr.getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B)
+    return false;
+
+  // There are two possible candidates which can be changed to set SR:
+  // One is MI, the other is a SUB instruction.
+  // * For SFSUB_F_RR(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  // * For SFSUB_F_RI(r1, CmpValue), we are looking for SUB(r1, CmpValue).
+  MachineInstr *Sub = nullptr;
+  if (SrcReg2 != 0)
+    // MI is not a candidate to transform into a flag setting instruction.
+    MI = nullptr;
+  else if (MI->getParent() != CmpInstr.getParent() || CmpValue != 0) {
+    // Conservatively refuse to convert an instruction which isn't in the same
+    // BB as the comparison. Don't return if SFSUB_F_RI and CmpValue != 0 as Sub
+    // may still be a candidate.
+    if (CmpInstr.getOpcode() == Lanai::SFSUB_F_RI_LO)
+      MI = nullptr;
+    else
+      return false;
+  }
+
+  // Check that SR isn't set between the comparison instruction and the
+  // instruction we want to change while searching for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  for (--I; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(Lanai::SR, TRI) ||
+        Instr.readsRegister(Lanai::SR, TRI))
+      // This instruction modifies or uses SR after the one we want to change.
+      // We can't do this transformation.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (isRedundantFlagInstr(&CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+      Sub = &*I;
+      break;
+    }
+
+    // Don't search outside the containing basic block.
+    if (I == B)
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI)
+    MI = Sub;
+
+  if (flagSettingOpcodeVariant(MI->getOpcode()) != Lanai::NOP) {
+    bool isSafe = false;
+
+    SmallVector<std::pair<MachineOperand *, LPCC::CondCode>, 4>
+        OperandsToUpdate;
+    I = CmpInstr;
+    E = CmpInstr.getParent()->end();
+    while (!isSafe && ++I != E) {
+      const MachineInstr &Instr = *I;
+      for (unsigned IO = 0, EO = Instr.getNumOperands(); !isSafe && IO != EO;
+           ++IO) {
+        const MachineOperand &MO = Instr.getOperand(IO);
+        if (MO.isRegMask() && MO.clobbersPhysReg(Lanai::SR)) {
+          isSafe = true;
+          break;
+        }
+        if (!MO.isReg() || MO.getReg() != Lanai::SR)
+          continue;
+        if (MO.isDef()) {
+          isSafe = true;
+          break;
+        }
+        // Condition code is after the operand before SR.
+        LPCC::CondCode CC;
+        CC = (LPCC::CondCode)Instr.getOperand(IO - 1).getImm();
+
+        if (Sub) {
+          LPCC::CondCode NewCC = getOppositeCondition(CC);
+          if (NewCC == LPCC::ICC_T)
+            return false;
+          // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on
+          // CMP needs to be updated to be based on SUB.  Push the condition
+          // code operands to OperandsToUpdate.  If it is safe to remove
+          // CmpInstr, the condition code of these operands will be modified.
+          if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+              Sub->getOperand(2).getReg() == SrcReg) {
+            OperandsToUpdate.push_back(
+                std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+          }
+        } else {
+          // No Sub, so this is x = <op> y, z; cmp x, 0.
+          switch (CC) {
+          case LPCC::ICC_EQ: // Z
+          case LPCC::ICC_NE: // Z
+          case LPCC::ICC_MI: // N
+          case LPCC::ICC_PL: // N
+          case LPCC::ICC_F:  // none
+          case LPCC::ICC_T:  // none
+            // SR can be used multiple times, we should continue.
+            break;
+          case LPCC::ICC_CS: // C
+          case LPCC::ICC_CC: // C
+          case LPCC::ICC_VS: // V
+          case LPCC::ICC_VC: // V
+          case LPCC::ICC_HI: // C Z
+          case LPCC::ICC_LS: // C Z
+          case LPCC::ICC_GE: // N V
+          case LPCC::ICC_LT: // N V
+          case LPCC::ICC_GT: // Z N V
+          case LPCC::ICC_LE: // Z N V
+            // The instruction uses the V bit or C bit which is not safe.
+            return false;
+          case LPCC::UNKNOWN:
+            return false;
+          }
+        }
+      }
+    }
+
+    // If SR is not killed nor re-defined, we should check whether it is
+    // live-out. If it is live-out, do not optimize.
+    if (!isSafe) {
+      MachineBasicBlock *MBB = CmpInstr.getParent();
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+                                            SE = MBB->succ_end();
+           SI != SE; ++SI)
+        if ((*SI)->isLiveIn(Lanai::SR))
+          return false;
+    }
+
+    // Toggle the optional operand to SR.
+    MI->setDesc(get(flagSettingOpcodeVariant(MI->getOpcode())));
+    MI->addRegisterDefined(Lanai::SR);
+    CmpInstr.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
+bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   unsigned &TrueOp, unsigned &FalseOp,
+                                   bool &Optimizable) const {
+  assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+  // Select operands:
+  // 0: Def.
+  // 1: True use.
+  // 2: False use.
+  // 3: Condition code.
+  TrueOp = 1;
+  FalseOp = 2;
+  Cond.push_back(MI.getOperand(3));
+  Optimizable = true;
+  return false;
+}
+
+// Identify instructions that can be folded into a SELECT instruction, and
+// return the defining instruction.
+static MachineInstr *canFoldIntoSelect(unsigned Reg,
+                                       const MachineRegisterInfo &MRI) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+  if (!MRI.hasOneNonDBGUse(Reg))
+    return nullptr;
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (!MI)
+    return nullptr;
+  // MI is folded into the SELECT by predicating it.
+  if (!MI->isPredicable())
+    return nullptr;
+  // Check if MI has any non-dead defs or physreg uses. This also detects
+  // predicated instructions which will be reading SR.
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    // Reject frame index operands.
+    if (MO.isFI() || MO.isCPI() || MO.isJTI())
+      return nullptr;
+    if (!MO.isReg())
+      continue;
+    // MI can't have any tied operands, that would conflict with predication.
+    if (MO.isTied())
+      return nullptr;
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return nullptr;
+    if (MO.isDef() && !MO.isDead())
+      return nullptr;
+  }
+  bool DontMoveAcrossStores = true;
+  if (!MI->isSafeToMove(/*AliasAnalysis=*/nullptr, DontMoveAcrossStores))
+    return nullptr;
+  return MI;
+}
+
+MachineInstr *
+LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
+                               SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                               bool /*PreferFalse*/) const {
+  assert(MI.getOpcode() == Lanai::SELECT && "unknown select instruction");
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = canFoldIntoSelect(MI.getOperand(1).getReg(), MRI);
+  bool Invert = !DefMI;
+  if (!DefMI)
+    DefMI = canFoldIntoSelect(MI.getOperand(2).getReg(), MRI);
+  if (!DefMI)
+    return nullptr;
+
+  // Find new register class to use.
+  MachineOperand FalseReg = MI.getOperand(Invert ? 1 : 2);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return nullptr;
+
+  // Create a new predicated version of DefMI.
+  MachineInstrBuilder NewMI =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), DefMI->getDesc(), DestReg);
+
+  // Copy all the DefMI operands, excluding its (null) predicate.
+  const MCInstrDesc &DefDesc = DefMI->getDesc();
+  for (unsigned i = 1, e = DefDesc.getNumOperands();
+       i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+    NewMI.addOperand(DefMI->getOperand(i));
+
+  unsigned CondCode = MI.getOperand(3).getImm();
+  if (Invert)
+    NewMI.addImm(getOppositeCondition(LPCC::CondCode(CondCode)));
+  else
+    NewMI.addImm(CondCode);
+  NewMI.copyImplicitOps(MI);
+
+  // The output register value when the predicate is false is an implicit
+  // register operand tied to the first def.  The tie makes the register
+  // allocator ensure the FalseReg is allocated the same register as operand 0.
+  FalseReg.setImplicit();
+  NewMI.addOperand(FalseReg);
+  NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+  // Update SeenMIs set: register newly created MI and erase removed DefMI.
+  SeenMIs.insert(NewMI);
+  SeenMIs.erase(DefMI);
+
+  // If MI is inside a loop, and DefMI is outside the loop, then kill flags on
+  // DefMI would be invalid when transferred inside the loop.  Checking for a
+  // loop is expensive, but at least remove kill flags if they are in different
+  // BBs.
+  if (DefMI->getParent() != MI.getParent())
+    NewMI->clearKillInfo();
+
+  // The caller will erase MI, but not DefMI.
+  DefMI->eraseFromParent();
+  return NewMI;
+}
+
+// The analyzeBranch function is used to examine conditional instructions and
+// remove unnecessary instructions. This method is used by BranchFolder and
+// IfConverter machine function passes to improve the CFG.
+// - TrueBlock is set to the destination if condition evaluates true (it is the
+//   nullptr if the destination is the fall-through branch);
+// - FalseBlock is set to the destination if condition evaluates to false (it
+//   is the nullptr if the branch is unconditional);
+// - condition is populated with machine operands needed to generate the branch
+//   to insert in insertBranch;
+// Returns: false if branch could successfully be analyzed.
+bool LanaiInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TrueBlock,
+                                   MachineBasicBlock *&FalseBlock,
+                                   SmallVectorImpl<MachineOperand> &Condition,
+                                   bool AllowModify) const {
+  // Iterator to current instruction being considered.
+  MachineBasicBlock::iterator Instruction = MBB.end();
+
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  while (Instruction != MBB.begin()) {
+    --Instruction;
+
+    // Skip over debug values.
+    if (Instruction->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(*Instruction))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!Instruction->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (Instruction->getOpcode() == Lanai::BT) {
+      if (!AllowModify) {
+        TrueBlock = Instruction->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a branch, delete them.
+      while (std::next(Instruction) != MBB.end()) {
+        std::next(Instruction)->eraseFromParent();
+      }
+
+      Condition.clear();
+      FalseBlock = nullptr;
+
+      // Delete the jump if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(Instruction->getOperand(0).getMBB())) {
+        TrueBlock = nullptr;
+        Instruction->eraseFromParent();
+        Instruction = MBB.end();
+        continue;
+      }
+
+      // TrueBlock is used to indicate the unconditional destination.
+      TrueBlock = Instruction->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches
+    unsigned Opcode = Instruction->getOpcode();
+    if (Opcode != Lanai::BRCC)
+      return true; // Unknown opcode.
+
+    // Multiple conditional branches are not handled here so only proceed if
+    // there are no conditions enqueued.
+    if (Condition.empty()) {
+      LPCC::CondCode BranchCond =
+          static_cast<LPCC::CondCode>(Instruction->getOperand(1).getImm());
+
+      // TrueBlock is the target of the previously seen unconditional branch.
+      FalseBlock = TrueBlock;
+      TrueBlock = Instruction->getOperand(0).getMBB();
+      Condition.push_back(MachineOperand::CreateImm(BranchCond));
+      continue;
+    }
+
+    // Multiple conditional branches are not handled.
+    return true;
+  }
+
+  // Return false indicating branch successfully analyzed.
+  return false;
+}
+
+// reverseBranchCondition - Reverses the branch condition of the specified
+// condition list, returning false on success and true if it cannot be
+// reversed.
+bool LanaiInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<llvm::MachineOperand> &Condition) const {
+  assert((Condition.size() == 1) &&
+         "Lanai branch conditions should have one component.");
+
+  LPCC::CondCode BranchCond =
+      static_cast<LPCC::CondCode>(Condition[0].getImm());
+  Condition[0].setImm(getOppositeCondition(BranchCond));
+  return false;
+}
+
+// Insert the branch with condition specified in condition and given targets
+// (TrueBlock and FalseBlock). This function returns the number of machine
+// instructions inserted.
+unsigned LanaiInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TrueBlock,
+                                      MachineBasicBlock *FalseBlock,
+                                      ArrayRef<MachineOperand> Condition,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TrueBlock && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // If condition is empty then an unconditional branch is being inserted.
+  if (Condition.empty()) {
+    assert(!FalseBlock && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(TrueBlock);
+    return 1;
+  }
+
+  // Else a conditional branch is inserted.
+  assert((Condition.size() == 1) &&
+         "Lanai branch conditions should have one component.");
+  unsigned ConditionalCode = Condition[0].getImm();
+  BuildMI(&MBB, DL, get(Lanai::BRCC)).addMBB(TrueBlock).addImm(ConditionalCode);
+
+  // If no false block, then false behavior is fall through and no branch needs
+  // to be inserted.
+  if (!FalseBlock)
+    return 1;
+
+  BuildMI(&MBB, DL, get(Lanai::BT)).addMBB(FalseBlock);
+  return 2;
+}
+
+unsigned LanaiInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator Instruction = MBB.end();
+  unsigned Count = 0;
+
+  while (Instruction != MBB.begin()) {
+    --Instruction;
+    if (Instruction->isDebugValue())
+      continue;
+    if (Instruction->getOpcode() != Lanai::BT &&
+        Instruction->getOpcode() != Lanai::BRCC) {
+      break;
+    }
+
+    // Remove the branch.
+    Instruction->eraseFromParent();
+    Instruction = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned LanaiInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  if (MI.getOpcode() == Lanai::LDW_RI)
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  return 0;
+}
+
+unsigned LanaiInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                   int &FrameIndex) const {
+  if (MI.getOpcode() == Lanai::LDW_RI) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+unsigned LanaiInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+  if (MI.getOpcode() == Lanai::SW_RI)
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return MI.getOperand(2).getReg();
+    }
+  return 0;
+}
+
+bool LanaiInstrInfo::getMemOpBaseRegImmOfsWidth(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width,
+    const TargetRegisterInfo * /*TRI*/) const {
+  // Handle only loads/stores with base register followed by immediate offset
+  // and with add as ALU op.
+  if (LdSt.getNumOperands() != 4)
+    return false;
+  if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm() ||
+      !(LdSt.getOperand(3).isImm() && LdSt.getOperand(3).getImm() == LPAC::ADD))
+    return false;
+
+  switch (LdSt.getOpcode()) {
+  default:
+    return false;
+  case Lanai::LDW_RI:
+  case Lanai::LDW_RR:
+  case Lanai::SW_RR:
+  case Lanai::SW_RI:
+    Width = 4;
+    break;
+  case Lanai::LDHs_RI:
+  case Lanai::LDHz_RI:
+  case Lanai::STH_RI:
+    Width = 2;
+    break;
+  case Lanai::LDBs_RI:
+  case Lanai::LDBz_RI:
+  case Lanai::STB_RI:
+    Width = 1;
+    break;
+  }
+
+  BaseReg = LdSt.getOperand(1).getReg();
+  Offset = LdSt.getOperand(2).getImm();
+  return true;
+}
+
+bool LanaiInstrInfo::getMemOpBaseRegImmOfs(
+    MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset,
+    const TargetRegisterInfo *TRI) const {
+  switch (LdSt.getOpcode()) {
+  default:
+    return false;
+  case Lanai::LDW_RI:
+  case Lanai::LDW_RR:
+  case Lanai::SW_RR:
+  case Lanai::SW_RI:
+  case Lanai::LDHs_RI:
+  case Lanai::LDHz_RI:
+  case Lanai::STH_RI:
+  case Lanai::LDBs_RI:
+  case Lanai::LDBz_RI:
+    unsigned Width;
+    return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
new file mode 100644
index 000000000000..4387fe1af3c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -0,0 +1,186 @@
+//===- LanaiInstrInfo.h - Lanai Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "LanaiGenInstrInfo.inc"
+
+namespace llvm {
+
+class LanaiInstrInfo : public LanaiGenInstrInfo {
+  const LanaiRegisterInfo RegisterInfo;
+
+public:
+  LanaiInstrInfo();
+
+  // getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  // such, whenever a client has an instance of instruction info, it should
+  // always be able to get register info as well (through this method).
+  virtual const LanaiRegisterInfo &getRegisterInfo() const {
+    return RegisterInfo;
+  }
+
+  bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                       AliasAnalysis *AA) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                     int &FrameIndex) const override;
+
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+                   const DebugLoc &DL, unsigned DestinationRegister,
+                   unsigned SourceRegister, bool KillSource) const override;
+
+  void
+  storeRegToStackSlot(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator Position,
+                      unsigned SourceRegister, bool IsKill, int FrameIndex,
+                      const TargetRegisterClass *RegisterClass,
+                      const TargetRegisterInfo *RegisterInfo) const override;
+
+  void
+  loadRegFromStackSlot(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator Position,
+                       unsigned DestinationRegister, int FrameIndex,
+                       const TargetRegisterClass *RegisterClass,
+                       const TargetRegisterInfo *RegisterInfo) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+
+  bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg,
+                                  int64_t &Offset, unsigned &Width,
+                                  const TargetRegisterInfo *TRI) const;
+
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TrueBlock,
+                     MachineBasicBlock *&FalseBlock,
+                     SmallVectorImpl<MachineOperand> &Condition,
+                     bool AllowModify) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  // For a comparison instruction, return the source registers in SrcReg and
+  // SrcReg2 if having two register operands, and the value it compares against
+  // in CmpValue. Return true if the comparison instruction can be analyzed.
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+
+  // See if the comparison instruction can be converted into something more
+  // efficient. E.g., on Lanai register-register instructions can set the flag
+  // register, obviating the need for a separate compare.
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+  // Analyze the given select instruction, returning true if it cannot be
+  // understood. It is assumed that MI->isSelect() is true.
+  //
+  // When successful, return the controlling condition and the operands that
+  // determine the true and false result values.
+  //
+  //   Result = SELECT Cond, TrueOp, FalseOp
+  //
+  // Lanai can optimize certain select instructions, for example by predicating
+  // the instruction defining one of the operands and sets Optimizable to true.
+  bool analyzeSelect(const MachineInstr &MI,
+                     SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+                     unsigned &FalseOp, bool &Optimizable) const override;
+
+  // Given a select instruction that was understood by analyzeSelect and
+  // returned Optimizable = true, attempt to optimize MI by merging it with one
+  // of its operands. Returns NULL on failure.
+  //
+  // When successful, returns the new select instruction. The client is
+  // responsible for deleting MI.
+  //
+  // If both sides of the select can be optimized, the TrueOp is modifed.
+  // PreferFalse is not used.
+  MachineInstr *optimizeSelect(MachineInstr &MI,
+                               SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                               bool PreferFalse) const override;
+
+  bool reverseBranchCondition(
+      SmallVectorImpl<MachineOperand> &Condition) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TrueBlock,
+                        MachineBasicBlock *FalseBlock,
+                        ArrayRef<MachineOperand> Condition,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+};
+
+static inline bool isSPLSOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::LDBs_RI:
+  case Lanai::LDBz_RI:
+  case Lanai::LDHs_RI:
+  case Lanai::LDHz_RI:
+  case Lanai::STB_RI:
+  case Lanai::STH_RI:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static inline bool isRMOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::LDW_RI:
+  case Lanai::SW_RI:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static inline bool isRRMOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::LDBs_RR:
+  case Lanai::LDBz_RR:
+  case Lanai::LDHs_RR:
+  case Lanai::LDHz_RR:
+  case Lanai::LDWz_RR:
+  case Lanai::LDW_RR:
+  case Lanai::STB_RR:
+  case Lanai::STH_RR:
+  case Lanai::SW_RR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
new file mode 100644
index 000000000000..285fca11737d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -0,0 +1,884 @@
+//===-- LanaiInstrInfo.td - Target Description for Lanai Target -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Lanai instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "LanaiInstrFormats.td"
+
+// -------------------------------------------------- //
+// Instruction Operands and Patterns
+// -------------------------------------------------- //
+
+//  These are target-independent nodes, but have target-specific formats.
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                          SDTCisVT<1, i32>]>;
+def SDT_LanaiCall         : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def SDT_LanaiSetFlag      : SDTypeProfile<0,  2, [SDTCisSameAs<0, 1>]>;
+def SDT_LanaiSelectCC     : SDTypeProfile<1,  3, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<1, 2>]>;
+def SDT_LanaiSetCC        : SDTypeProfile<1,  1, [SDTCisVT<0, i32>,
+                                                  SDTCisVT<1, i32>]>;
+def SDT_LanaiBrCC         : SDTypeProfile<0,  2, [SDTCisVT<0, OtherVT>,
+                                                  SDTCisVT<1, i32>]>;
+def SDT_LanaiAdjDynAlloc  : SDTypeProfile<1,  1, [SDTCisVT<0, i32>,
+                                                  SDTCisVT<1, i32>]>;
+
+def Call             : SDNode<"LanaiISD::CALL", SDT_LanaiCall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def RetFlag          : SDNode<"LanaiISD::RET_FLAG", SDTNone,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def CallSeqStart     : SDNode<"ISD::CALLSEQ_START", SDT_LanaiCallSeqStart,
+                              [SDNPHasChain, SDNPOutGlue]>;
+def CallSeqEnd       : SDNode<"ISD::CALLSEQ_END", SDT_LanaiCallSeqEnd,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def LanaiSetFlag     : SDNode<"LanaiISD::SET_FLAG", SDT_LanaiSetFlag,
+                              [SDNPOutGlue]>;
+def LanaiSubbF       : SDNode<"LanaiISD::SUBBF", SDT_LanaiSetFlag,
+                              [SDNPOutGlue, SDNPInGlue]>;
+def LanaiBrCC        : SDNode<"LanaiISD::BR_CC", SDT_LanaiBrCC,
+                              [SDNPHasChain, SDNPInGlue]>;
+def LanaiSelectCC    : SDNode<"LanaiISD::SELECT_CC", SDT_LanaiSelectCC,
+                              [SDNPInGlue]>;
+def LanaiSetCC       : SDNode<"LanaiISD::SETCC", SDT_LanaiSetCC,
+                              [SDNPInGlue]>;
+def LanaiHi          : SDNode<"LanaiISD::HI", SDTIntUnaryOp>;
+def LanaiLo          : SDNode<"LanaiISD::LO", SDTIntUnaryOp>;
+def LanaiSmall       : SDNode<"LanaiISD::SMALL", SDTIntUnaryOp>;
+def LanaiAdjDynAlloc : SDNode<"LanaiISD::ADJDYNALLOC", SDT_LanaiAdjDynAlloc>;
+
+// Extract bits 0-15 (low-end) of an immediate value.
+def LO16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0xffff,
+                                   SDLoc(N), MVT::i32);
+}]>;
+
+// Extract bits 16-31 (high-end) of an immediate value.
+// Transformation function: shift the immediate value down into the low bits.
+def HI16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() >> 16, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def NEG : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def LO21 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint64_t)N->getZExtValue() & 0x1fffff,
+                                   SDLoc(N), MVT::i32);
+}]>;
+
+// Branch targets
+def BrTargetAsmOperand : AsmOperandClass {
+  let Name = "BrTarget";
+}
+def BrTarget   : Operand<OtherVT> {
+  let ParserMatchClass = BrTargetAsmOperand;
+  let EncoderMethod = "getBranchTargetOpValue";
+  let DecoderMethod = "decodeBranch";
+}
+
+def CallTargetAsmOperand : AsmOperandClass {
+  let Name = "CallTarget";
+}
+def CallTarget : Operand<i32> {
+  let ParserMatchClass = CallTargetAsmOperand;
+  let EncoderMethod = "getBranchTargetOpValue";
+  let DecoderMethod = "decodeBranch";
+}
+
+def ImmShiftAsmOperand : AsmOperandClass { let Name = "ImmShift"; }
+def immShift : Operand<i32>, PatLeaf<(imm), [{
+    int Imm = N->getSExtValue();
+    return Imm >= -31 && Imm <= 31;}]> {
+  let ParserMatchClass = ImmShiftAsmOperand;
+  let DecoderMethod = "decodeShiftImm";
+}
+
+def Imm10AsmOperand : AsmOperandClass { let Name = "Imm10"; }
+def imm10 : Operand<i32>, PatLeaf<(imm), [{
+    return isInt<10>(N->getSExtValue()); }]> {
+  let ParserMatchClass = Imm10AsmOperand;
+}
+
+def LoImm16AsmOperand : AsmOperandClass { let Name = "LoImm16"; }
+def i32lo16z : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32lo16 predicate - true if the 32-bit immediate has only rightmost 16
+    // bits set.
+    return ((N->getZExtValue() & 0xFFFFUL) == N->getZExtValue());}], LO16> {
+  let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32neg16 : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32neg16 predicate - true if the 32-bit immediate is negative and can
+    // be represented by a 16 bit integer.
+    int Imm = N->getSExtValue();
+    return (Imm < 0) && (isInt<16>(Imm));}], LO16> {
+  let ParserMatchClass = LoImm16AsmOperand;
+}
+def i32lo16s : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32lo16 predicate - true if the 32-bit immediate has only rightmost 16
+    // bits set.
+    return ((int64_t)(N->getSExtValue() & 0xFFFFUL) == N->getSExtValue());}], LO16> {
+  let ParserMatchClass = LoImm16AsmOperand;
+}
+
+def LoImm16AndAsmOperand : AsmOperandClass { let Name = "LoImm16And"; }
+def i32lo16and : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32lo16 predicate - true if the 32-bit immediate has the rightmost 16
+    // bits set and the leftmost 16 bits 1's.
+    return (N->getZExtValue() >= 0xFFFF0000UL);}], LO16> {
+  let ParserMatchClass = LoImm16AndAsmOperand;
+  let PrintMethod = "printLo16AndImmOperand";
+}
+
+def HiImm16AsmOperand : AsmOperandClass { let Name = "HiImm16"; }
+def i32hi16 : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32hi16 predicate - true if the 32-bit immediate has only leftmost 16
+    // bits set.
+    return ((N->getZExtValue() & 0xFFFF0000UL) == N->getZExtValue());}], HI16> {
+  let ParserMatchClass = HiImm16AsmOperand;
+  let PrintMethod = "printHi16ImmOperand";
+}
+
+def HiImm16AndAsmOperand : AsmOperandClass { let Name = "HiImm16And"; }
+def i32hi16and : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32lo16 predicate - true if the 32-bit immediate has the leftmost 16
+    // bits set and the rightmost 16 bits 1's.
+    return ((N->getZExtValue() & 0xFFFFUL) == 0xFFFFUL);}], HI16> {
+  let ParserMatchClass = HiImm16AndAsmOperand;
+  let PrintMethod = "printHi16AndImmOperand";
+}
+
+def LoImm21AsmOperand : AsmOperandClass { let Name = "LoImm21"; }
+def i32lo21 : Operand<i32>, PatLeaf<(i32 imm), [{
+    // i32lo21 predicate - true if the 32-bit immediate has only rightmost 21
+    // bits set.
+    return ((N->getZExtValue() & 0x1FFFFFUL) == N->getZExtValue());}], LO21> {
+  let ParserMatchClass = LoImm21AsmOperand;
+}
+
+def AluOp : Operand<i32> {
+  let PrintMethod = "printAluOperand";
+}
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<i32, 3, "selectAddrRr", [], []>;
+def ADDRri : ComplexPattern<i32, 3, "selectAddrRi", [frameindex], []>;
+def ADDRsls : ComplexPattern<i32, 1, "selectAddrSls", [frameindex], []>;
+def ADDRspls : ComplexPattern<i32, 3, "selectAddrSpls", [frameindex], []>;
+
+// Address operands
+def MemRegImmAsmOperand : AsmOperandClass {
+  let Name = "MemRegImm";
+  let ParserMethod  = "parseMemoryOperand";
+}
+def MEMri : Operand<i32> {
+  let DecoderMethod = "decodeRiMemoryValue";
+  let EncoderMethod = "getRiMemoryOpValue";
+  let MIOperandInfo = (ops GPR:$base, i32lo16s:$offset, AluOp:$Opcode);
+  let ParserMatchClass = MemRegImmAsmOperand;
+  let PrintMethod   = "printMemRiOperand";
+}
+
+def MemRegRegAsmOperand : AsmOperandClass {
+  let Name = "MemRegReg";
+  let ParserMethod  = "parseMemoryOperand";
+}
+def MEMrr : Operand<i32> {
+  let DecoderMethod = "decodeRrMemoryValue";
+  let EncoderMethod = "getRrMemoryOpValue";
+  let MIOperandInfo = (ops GPR:$Op1, GPR:$Op2, AluOp:$Opcode);
+  let ParserMatchClass = MemRegRegAsmOperand;
+  let PrintMethod   = "printMemRrOperand";
+}
+
+def MemImmAsmOperand : AsmOperandClass {
+  let Name = "MemImm";
+  let ParserMethod  = "parseMemoryOperand";
+}
+def MEMi : Operand<i32> {
+  let MIOperandInfo = (ops i32lo21:$offset);
+  let ParserMatchClass = MemImmAsmOperand;
+  let PrintMethod   = "printMemImmOperand";
+}
+
+def MemSplsAsmOperand : AsmOperandClass {
+  let Name = "MemSpls";
+  let ParserMethod  = "parseMemoryOperand";
+}
+def MEMspls : Operand<i32> {
+  let DecoderMethod = "decodeSplsValue";
+  let EncoderMethod = "getSplsOpValue";
+  let MIOperandInfo = (ops GPR:$base, imm10:$offset, AluOp:$Opcode);
+  let ParserMatchClass = MemSplsAsmOperand;
+  let PrintMethod   = "printMemSplsOperand";
+}
+
+def CCOp : Operand<i32> {
+  let PrintMethod = "printCCOperand";
+}
+
+// Predicate operand. Default to 0 = true.
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+
+def pred : PredicateOperand<i32, (ops i32imm), (ops (i32 0))> {
+  let PrintMethod = "printPredicateOperand";
+  let ParserMatchClass = CondCodeOperand;
+  let DecoderMethod = "decodePredicateOperand";
+}
+
+let hasSideEffects = 0, Inst = 0x00000001 in
+  def NOP : InstLanai<(outs), (ins), "nop", []>;
+
+// Special NOPs to change logging level in vlanai.
+let hasSideEffects = 0, Inst = 0x00000002 in
+  def LOG0 : InstLanai<(outs), (ins), "log_0", []>;
+let hasSideEffects = 0, Inst = 0x00000003 in
+  def LOG1 : InstLanai<(outs), (ins), "log_1", []>;
+let hasSideEffects = 0, Inst = 0x00000004 in
+  def LOG2 : InstLanai<(outs), (ins), "log_2", []>;
+let hasSideEffects = 0, Inst = 0x00000005 in
+  def LOG3 : InstLanai<(outs), (ins), "log_3", []>;
+let hasSideEffects = 0, Inst = 0x00000006 in
+  def LOG4 : InstLanai<(outs), (ins), "log_4", []>;
+
+// Map an SPLS instruction onto itself. All other instructions will be mapped
+// onto -1. Used to identify SPLS instructions.
+def splsIdempotent : InstrMapping {
+  let FilterClass = "InstSPLS";
+  let RowFields = ["AsmString"];
+  let ColFields = ["PostEncoderMethod"];
+  let KeyCol = ["adjustPqBitsSpls"];
+  let ValueCols = [["adjustPqBitsSpls"]];
+}
+
+// -------------------------------------------------- //
+// ALU instructions
+// -------------------------------------------------- //
+multiclass ALUbase<bits<3> subOp, string AsmStr, SDNode OpNode,
+                   PatLeaf LoExt, PatLeaf HiExt,
+                   list<dag> loPattern, list<dag> hiPattern> {
+  // Register Immediate
+  let H = 0 in
+    def LO : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, LoExt:$imm16),
+                    !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+                    loPattern>;
+  let H = 1 in
+    def HI : InstRI<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, HiExt:$imm16),
+                    !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"),
+                    hiPattern>;
+
+}
+
+multiclass ALUarith<bits<3> subOp, string AsmStr, SDNode OpNode,
+                    PatLeaf LoExt, PatLeaf HiExt> {
+  defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt, [], []>;
+
+  // Register Register
+  let JJJJJ = 0 in
+    def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+                   !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+                   [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+multiclass ALUlogic<bits<3> subOp, string AsmStr, SDNode OpNode,
+                    PatLeaf LoExt, PatLeaf HiExt> {
+  defm I_ : ALUbase<subOp, AsmStr, OpNode, LoExt, HiExt,
+                    [(set GPR:$Rd, (OpNode GPR:$Rs1, LoExt:$imm16))],
+                    [(set GPR:$Rd, (OpNode GPR:$Rs1, HiExt:$imm16))]>;
+
+  // Register Register
+  let JJJJJ = 0 in
+    def R : InstRR<subOp, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI),
+                   !strconcat(AsmStr, "$DDDI\t$Rs1, $Rs2, $Rd"),
+                   [(set GPR:$Rd, (OpNode GPR:$Rs1, GPR:$Rs2))]>;
+}
+
+// Non flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 0 in {
+  let isCommutable = 1 in {
+    defm ADD_ : ALUarith<0b000, "add", add, i32lo16z, i32hi16>;
+  }
+  defm SUB_ : ALUarith<0b010,   "sub", sub, i32lo16z, i32hi16>;
+  let isCommutable = 1 in {
+    defm AND_ : ALUlogic<0b100, "and", and, i32lo16and, i32hi16and>;
+    defm OR_  : ALUlogic<0b101,  "or",  or, i32lo16z, i32hi16>;
+    defm XOR_ : ALUlogic<0b110, "xor", xor, i32lo16z, i32hi16>;
+  }
+}
+
+def : Pat<(add GPR:$Rs1, i32lo16z:$imm),
+          (ADD_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32lo16z:$imm),
+          (SUB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(add GPR:$Rs1, i32hi16:$imm),
+          (ADD_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sub GPR:$Rs1, i32hi16:$imm),
+          (SUB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(i32 i32lo16and:$imm), (AND_I_LO (i32 R1), i32lo16and:$imm)>;
+def : Pat<(i32 i32hi16and:$imm), (AND_I_HI (i32 R1), i32hi16and:$imm)>;
+
+// Change add/sub with negative number to sub/add
+def : Pat<(add GPR:$Rs1, i32neg16:$imm),
+          (SUB_I_LO GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sub GPR:$Rs1, i32neg16:$imm),
+          (ADD_I_LO GPR:$Rs1, (NEG $imm))>;
+
+// Flag (incl. carry) setting addition and subtraction
+let F = 1, Defs = [SR] in {
+  defm ADD_F_ : ALUarith<0b000, "add.f", addc, i32lo16z, i32hi16>;
+  defm SUB_F_ : ALUarith<0b010, "sub.f", subc, i32lo16z, i32hi16>;
+}
+
+def : Pat<(addc GPR:$Rs1, i32lo16z:$imm),
+          (ADD_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32lo16z:$imm),
+          (SUB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(addc GPR:$Rs1, i32hi16:$imm),
+          (ADD_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(subc GPR:$Rs1, i32hi16:$imm),
+          (SUB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Carry using addition and subtraction
+let F = 0, Uses = [SR] in {
+  defm ADDC_ : ALUarith<0b001, "addc", adde, i32lo16z, i32hi16>;
+  defm SUBB_ : ALUarith<0b011, "subb", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(adde GPR:$Rs1, i32lo16z:$imm),
+          (ADDC_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32lo16z:$imm),
+          (SUBB_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(adde GPR:$Rs1, i32hi16:$imm),
+          (ADDC_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : Pat<(sube GPR:$Rs1, i32hi16:$imm),
+          (SUBB_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+// Flag setting ALU operations
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR] in {
+  let isCommutable = 1 in {
+    defm AND_F_ : ALUlogic<0b100, "and.f",  and, i32lo16and, i32hi16and>;
+    defm OR_F_  : ALUlogic<0b101,  "or.f",   or, i32lo16z, i32hi16>;
+    defm XOR_F_ : ALUlogic<0b110, "xor.f",  xor, i32lo16z, i32hi16>;
+  }
+}
+
+let isAsCheapAsAMove = 1, F = 1, Defs = [SR], Uses = [SR] in {
+  defm ADDC_F_ : ALUarith<0b001, "addc.f", adde, i32lo16z, i32hi16>;
+  defm SUBB_F_ : ALUarith<0b011, "subb.f", sube, i32lo16z, i32hi16>;
+}
+
+def : Pat<(LanaiSubbF GPR:$Rs1, GPR:$Rs2),
+          (SUBB_F_R GPR:$Rs1, GPR:$Rs2)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32lo16z:$imm),
+          (SUBB_F_I_LO GPR:$Rs1, i32lo16z:$imm)>;
+
+def : Pat<(LanaiSubbF GPR:$Rs1, i32hi16:$imm),
+          (SUBB_F_I_HI GPR:$Rs1, i32hi16:$imm)>;
+
+def : InstAlias<"mov $src, $dst", (ADD_R GPR:$dst, GPR:$src, R0, 0)>;
+
+let isAsCheapAsAMove = 1, Rs1 = R0.Num, isCodeGenOnly = 1, H = 1, F = 0,
+  isReMaterializable = 1 in
+  def MOVHI : InstRI<0b000, (outs GPR:$Rd), (ins i32hi16:$imm16),
+                     "mov\t$imm16, $Rd",
+                     [(set GPR:$Rd, i32hi16:$imm16)]>;
+
+def : InstAlias<"mov $imm16, $dst", (ADD_I_LO GPR:$dst, R0, i32lo16z:$imm16)>;
+def : InstAlias<"mov $imm16, $dst", (ADD_I_HI GPR:$dst, R0, i32hi16:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+                (AND_I_LO GPR:$dst, R1, i32lo16and:$imm16)>;
+def : InstAlias<"mov $imm16, $dst",
+                (AND_I_HI GPR:$dst, R1, i32hi16and:$imm16)>;
+
+// Shift instructions
+class ShiftRI<string AsmStr, list<dag> Pattern>
+  : InstRI<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, immShift:$imm16),
+           !strconcat(AsmStr, "\t$Rs1, $imm16, $Rd"), Pattern> {
+  let isReMaterializable = 1;
+}
+
+let F = 0 in {
+  let H = 0 in
+    def SL_I : ShiftRI<"sh", [(set GPR:$Rd, (shl GPR:$Rs1, immShift:$imm16))]>;
+  let H = 1 in
+    def SA_I : ShiftRI<"sha", []>;
+}
+def : Pat<(srl GPR:$Rs1, immShift:$imm), (SL_I GPR:$Rs1, (NEG $imm))>;
+def : Pat<(sra GPR:$Rs1, immShift:$imm), (SA_I GPR:$Rs1, (NEG $imm))>;
+
+let F = 1, Defs = [SR] in {
+  let H = 0 in
+    def SL_F_I : ShiftRI<"sh.f", []>;
+  let H = 1 in
+    def SA_F_I : ShiftRI<"sha.f", []>;
+}
+
+class ShiftRR<string AsmStr, list<dag> Pattern>
+  : InstRR<0b111, (outs GPR:$Rd), (ins GPR:$Rs1, GPR:$Rs2, pred:$DDDI), AsmStr,
+           Pattern>;
+
+let F = 0 in {
+  let JJJJJ = 0b10000 in
+    def SHL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd",
+                        [(set GPR:$Rd, (shl GPR:$Rs1, GPR:$Rs2))]>;
+  let isCodeGenOnly = 1 in {
+    let JJJJJ = 0b10000 in
+      def SRL_R : ShiftRR<"sh$DDDI\t$Rs1, $Rs2, $Rd", []>;
+  }
+  let JJJJJ = 0b11000 in
+    def SRA_R : ShiftRR<"sha$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+let F = 1, Defs = [SR] in {
+  let JJJJJ = 0b10000 in
+    def SHL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+  let isCodeGenOnly = 1 in {
+    let JJJJJ = 0b10000 in
+      def SRL_F_R : ShiftRR<"sh.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+  }
+  let JJJJJ = 0b11000 in
+    def SRA_F_R : ShiftRR<"sha.f$DDDI\t$Rs1, $Rs2, $Rd", []>;
+}
+
+// Expand shift-right operations
+def : Pat<(srl GPR:$Rs1, GPR:$Rs2),
+          (SRL_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+def : Pat<(sra GPR:$Rs1, GPR:$Rs2),
+          (SRA_R GPR:$Rs1, (SUB_R R0, GPR:$Rs2))>;
+
+// -------------------------------------------------- //
+// LOAD instructions
+// -------------------------------------------------- //
+
+class LoadRR<string OpcString, PatFrag OpNode, ValueType Ty>
+  : InstRRM<0b0, (outs GPR:$Rd), (ins MEMrr:$src),
+            !strconcat(OpcString, "\t$src, $Rd"),
+            [(set (Ty GPR:$Rd), (OpNode ADDRrr:$src))]>,
+    Sched<[WriteLD]> {
+  bits<20> src;
+
+  let Rs1 = src{19-15};
+  let Rs2 = src{14-10};
+  let P = src{9};
+  let Q = src{8};
+  let BBB = src{7-5};
+  let JJJJJ = src{4-0};
+  let mayLoad = 1;
+}
+
+class LoadRI<string OpcString, PatFrag OpNode, ValueType Ty>
+  : InstRM<0b0, (outs GPR:$Rd), (ins MEMri:$src),
+           !strconcat(OpcString, "\t$src, $Rd"),
+           [(set (Ty GPR:$Rd), (OpNode ADDRri:$src))]>,
+    Sched<[WriteLD]> {
+  bits<23> src;
+
+  let Itinerary = IIC_LD;
+  let Rs1 = src{22-18};
+  let P = src{17};
+  let Q = src{16};
+  let imm16 = src{15-0};
+  let isReMaterializable = 1;
+  let mayLoad = 1;
+}
+
+let E = 0 in {
+  let YL = 0b01 in {
+    // uld is used here and ld in the alias as the alias is printed out first if
+    // an alias exist
+    def LDW_RI : LoadRI<"uld", load, i32>;
+    def LDW_RR : LoadRR<"ld", load, i32>;
+  }
+}
+
+def : InstAlias<"ld $src, $dst", (LDW_RI GPR:$dst, MEMri:$src)>;
+
+let E = 1 in {
+  let YL = 0b01 in {
+    def LDWz_RR : LoadRR<"uld", zextloadi32, i32>;
+  }
+}
+
+let E = 1 in {
+  let YL = 0b00 in
+    def LDHz_RR : LoadRR<"uld.h", zextloadi16, i32>;
+  let YL = 0b10 in
+    def LDBz_RR : LoadRR<"uld.b", zextloadi8, i32>;
+}
+
+let E = 0 in {
+  let YL = 0b00 in
+    def LDHs_RR : LoadRR<"ld.h", sextloadi16, i32>;
+  let YL = 0b10 in
+    def LDBs_RR : LoadRR<"ld.b", sextloadi8, i32>;
+}
+
+def LDADDR : InstSLS<0x0, (outs GPR:$Rd), (ins MEMi:$src),
+                     "ld\t$src, $Rd",
+                     [(set (i32 GPR:$Rd), (load ADDRsls:$src))]>,
+    Sched<[WriteLD]> {
+  bits<21> src;
+
+  let Itinerary = IIC_LD;
+  let msb = src{20-16};
+  let lsb = src{15-0};
+  let isReMaterializable = 1;
+  let mayLoad = 1;
+}
+
+class LoadSPLS<string asmstring, PatFrag opNode>
+  : InstSPLS<(outs GPR:$Rd), (ins MEMspls:$src),
+             !strconcat(asmstring, "\t$src, $Rd"),
+             [(set (i32 GPR:$Rd), (opNode ADDRspls:$src))]>,
+    Sched<[WriteLDSW]> {
+  bits<17> src;
+  let Itinerary = IIC_LDSW;
+  let Rs1 = src{16-12};
+  let P = src{11};
+  let Q = src{10};
+  let imm10 = src{9-0};
+  let mayLoad = 1;
+  let isReMaterializable = 1;
+}
+
+let Y = 0, S = 0, E = 1 in
+  def LDHz_RI : LoadSPLS<"uld.h", zextloadi16>;
+
+let Y = 0, S = 0, E = 0 in
+  def LDHs_RI : LoadSPLS<"ld.h", sextloadi16>;
+
+let Y = 1, S = 0, E = 1 in
+  def LDBz_RI : LoadSPLS<"uld.b", zextloadi8>;
+
+let Y = 1, S = 0, E = 0 in
+  def LDBs_RI : LoadSPLS<"ld.b", sextloadi8>;
+
+def SLI : InstSLI<(outs GPR:$Rd), (ins i32lo21:$imm),
+                  "mov\t$imm, $Rd",
+                  [(set GPR:$Rd, i32lo21:$imm)]> {
+  bits<21> imm;
+
+  let msb = imm{20-16};
+  let lsb = imm{15-0};
+  let isReMaterializable = 1;
+  let isAsCheapAsAMove = 1;
+}
+
+// -------------------------------------------------- //
+// STORE instructions
+// -------------------------------------------------- //
+
+class StoreRR<string OpcString, PatFrag OpNode, ValueType Ty>
+  : InstRRM<0b1, (outs), (ins GPR:$Rd, MEMrr:$dst),
+            !strconcat(OpcString, "\t$Rd, $dst"),
+            [(OpNode (Ty GPR:$Rd), ADDRrr:$dst)]>,
+    Sched<[WriteST]> {
+  bits<20> dst;
+
+  let Itinerary = IIC_ST;
+  let Rs1 = dst{19-15};
+  let Rs2 = dst{14-10};
+  let P = dst{9};
+  let Q = dst{8};
+  let BBB = dst{7-5};
+  let JJJJJ = dst{4-0};
+  let mayStore = 1;
+}
+
+class StoreRI<string OpcString, PatFrag OpNode, ValueType Ty>
+  : InstRM<0b1, (outs), (ins GPR:$Rd, MEMri:$dst),
+           !strconcat(OpcString, "\t$Rd, $dst"),
+           [(OpNode (Ty GPR:$Rd), ADDRri:$dst)]>,
+    Sched<[WriteST]> {
+  bits<23> dst;
+
+  let Itinerary = IIC_ST;
+  let Rs1 = dst{22-18};
+  let P = dst{17};
+  let Q = dst{16};
+  let imm16 = dst{15-0};
+  let mayStore = 1;
+}
+
+let YL = 0b01, E = 0 in {
+  def SW_RR : StoreRR<"st", store, i32>;
+  def SW_RI : StoreRI<"st", store, i32>;
+}
+
+let E = 0 in {
+  let YL = 0b00 in
+    def STH_RR : StoreRR<"st.h", truncstorei16, i32>;
+  let YL = 0b10 in
+    def STB_RR : StoreRR<"st.b", truncstorei8, i32>;
+}
+
+def STADDR : InstSLS<0x1, (outs), (ins GPR:$Rd, MEMi:$dst),
+                     "st\t$Rd, $dst",
+                     [(store (i32 GPR:$Rd), ADDRsls:$dst)]>,
+    Sched<[WriteST]> {
+  bits<21> dst;
+
+  let Itinerary = IIC_ST;
+  let msb = dst{20-16};
+  let lsb = dst{15-0};
+  let mayStore = 1;
+}
+
+class StoreSPLS<string asmstring, PatFrag opNode>
+  : InstSPLS<(outs), (ins GPR:$Rd, MEMspls:$dst),
+             !strconcat(asmstring, "\t$Rd, $dst"),
+             [(opNode (i32 GPR:$Rd), ADDRspls:$dst)]>,
+    Sched<[WriteSTSW]> {
+  bits<17> dst;
+
+  let Itinerary = IIC_STSW;
+  let Rs1 = dst{16-12};
+  let P = dst{11};
+  let Q = dst{10};
+  let imm10 = dst{9-0};
+  let mayStore = 1;
+}
+
+let Y = 0, S = 1, E = 0 in
+  def STH_RI : StoreSPLS<"st.h", truncstorei16>;
+
+let Y = 1, S = 1, E = 0 in
+  def STB_RI : StoreSPLS<"st.b", truncstorei8>;
+
+// -------------------------------------------------- //
+// BRANCH instructions
+// -------------------------------------------------- //
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1 in {
+  def BT : InstBR<(outs), (ins BrTarget:$addr),
+                  "bt\t$addr",
+                  [(br bb:$addr)]> {
+    let DDDI = 0b0000;
+  }
+  let Uses = [SR] in
+    def BRCC : InstBR<(outs), (ins BrTarget:$addr, CCOp:$DDDI),
+                      "b$DDDI\t$addr",
+                      [(LanaiBrCC bb:$addr, imm:$DDDI)]>;
+
+  let isIndirectBranch = 1 in {
+    def JR : InstRR<0b101, (outs), (ins GPR:$Rs2), "bt\t$Rs2",
+                    [(brind GPR:$Rs2)]> {
+      let Rs1 = R0.Num;
+      let Rd = R2.Num;
+      let F = 0;
+      let JJJJJ = 0;
+      let DDDI = 0;
+    }
+  }
+}
+
+// -------------------------------------------------- //
+// Condition/SF instructions
+// -------------------------------------------------- //
+
+// Instructions to set flags used in lowering comparisons.
+multiclass SF<bits<3> op2Val, string AsmStr> {
+  let F = 1, Rd = R0.Num, JJJJJ = 0, Defs = [SR], DDDI = 0 in
+    def _RR : InstRR<op2Val, (outs), (ins GPR:$Rs1, GPR:$Rs2),
+                     !strconcat(AsmStr, "\t$Rs1, $Rs2, %r0"),
+                     [(LanaiSetFlag (i32 GPR:$Rs1), (i32 GPR:$Rs2))]>;
+  let F = 1, Rd = R0.Num, H = 0, Defs = [SR] in
+    def _RI_LO : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32lo16z:$imm16),
+                     !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+                     [(LanaiSetFlag (i32 GPR:$Rs1), i32lo16z:$imm16)]>;
+  let F = 1, Rd = R0.Num, H = 1, Defs = [SR] in
+    def _RI_HI : InstRI<op2Val, (outs), (ins GPR:$Rs1, i32hi16:$imm16),
+                     !strconcat(AsmStr, "\t$Rs1, $imm16, %r0"),
+                     [(LanaiSetFlag (i32 GPR:$Rs1), i32hi16:$imm16)]>;
+}
+let isCodeGenOnly = 1, isCompare = 1 in {
+  defm SFSUB_F : SF<0b010, "sub.f">;
+}
+
+// Jump and link
+let isCall = 1, hasDelaySlot = 1, isCodeGenOnly = 1, Uses = [SP],
+    Defs = [RCA] in {
+  def CALL : Pseudo<(outs), (ins CallTarget:$addr), "", []>;
+  def CALLR : Pseudo<(outs), (ins GPR:$Rs1), "", [(Call GPR:$Rs1)]>;
+}
+
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+    Uses = [RCA] in {
+  def RET : InstRM<0b0, (outs), (ins),
+                   "ld\t-4[%fp], %pc ! return",
+                   [(RetFlag)]> {
+    let Rd = PC.Num;
+    let Rs1 = FP.Num;
+    let P = 1;
+    let Q = 0;
+    let imm16 = -4;
+
+    // Post encoding is not needed for RET.
+    let PostEncoderMethod = "";
+  }
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SP.
+let Defs = [SP], Uses = [SP] in {
+  def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                                "#ADJCALLSTACKDOWN $amt",
+                                [(CallSeqStart timm:$amt)]>;
+  def ADJCALLSTACKUP   : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                                "#ADJCALLSTACKUP $amt1 $amt2",
+                                [(CallSeqEnd timm:$amt1, timm:$amt2)]>;
+}
+
+let Defs = [SP], Uses = [SP] in {
+  def ADJDYNALLOC : Pseudo<(outs GPR:$dst), (ins GPR:$src),
+                           "#ADJDYNALLOC $dst $src",
+                           [(set GPR:$dst, (LanaiAdjDynAlloc GPR:$src))]>;
+}
+
+let Uses = [SR] in {
+  def SCC : InstSCC<(outs GPR:$Rs1), (ins CCOp:$DDDI),
+                    "s$DDDI\t$Rs1",
+                    [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
+}
+
+// SCC's output is already 1-bit so and'ing with 1 is redundant.
+def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
+
+// Select with hardware support
+let Uses = [SR], isSelect = 1 in {
+  def SELECT : InstRR<0b111, (outs GPR:$Rd),
+                      (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+                      "sel.$DDDI $Rs1, $Rs2, $Rd",
+                      [(set (i32 GPR:$Rd),
+                       (LanaiSelectCC (i32 GPR:$Rs1), (i32 GPR:$Rs2),
+                                      (imm:$DDDI)))]> {
+    let JJJJJ = 0;
+    let F = 0;
+  }
+}
+
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1,
+    isIndirectBranch = 1, Uses = [SR] in {
+  def BRIND_CC : InstRR<0b101, (outs), (ins GPR:$Rs1, CCOp:$DDDI),
+                        "b$DDDI\t$Rs1", []> {
+    let F = 0;
+    let JJJJJ = 0;
+    let Rd = PC.Num;
+    let Rs2 = R0.Num;
+  }
+
+  def BRIND_CCA : InstRR<0b101, (outs), (ins GPR:$Rs1, GPR:$Rs2, CCOp:$DDDI),
+                         "b${DDDI}\t$Rs1 add $Rs2", []> {
+    let F = 0;
+    let Rd = PC.Num;
+    let JJJJJ = 0;
+  }
+}
+
+// TODO: This only considers the case where BROFF is an immediate and not where
+// it is a register. Add support for register relative branching.
+let isBranch = 1, isBarrier = 1, isTerminator = 1, hasDelaySlot = 1, Rs1 = 0,
+    Uses = [SR] in
+  def BRR : InstBRR<(outs), (ins i16imm:$imm16, CCOp:$DDDI),
+                    "b${DDDI}.r\t$imm16", []>;
+
+let F = 0 in {
+// Population Count (POPC)
+def POPC: InstSpecial<0b001, (outs GPR:$Rd), (ins GPR:$Rs1),
+                      "popc\t$Rs1, $Rd",
+                      [(set GPR:$Rd, (ctpop GPR:$Rs1))]>;
+
+// Count Leading Zeros (LEADZ)
+def LEADZ: InstSpecial<0b010, (outs GPR:$Rd), (ins GPR:$Rs1),
+                       "leadz\t$Rs1, $Rd", [(set GPR:$Rd, (ctlz GPR:$Rs1))]>;
+
+// Count Trailing Zeros (TRAILZ)
+def TRAILZ : InstSpecial<0b011, (outs GPR:$Rd), (ins GPR:$Rs1),
+                         "trailz\t$Rs1, $Rd",
+                         [(set GPR:$Rd, (cttz GPR:$Rs1))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// unsigned 16-bit immediate
+def : Pat<(i32 i32lo16z:$imm), (OR_I_LO (i32 R0), imm:$imm)>;
+
+// arbitrary immediate
+def : Pat<(i32 imm:$imm), (OR_I_LO (MOVHI (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Calls
+def : Pat<(Call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(Call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Loads
+def : Pat<(extloadi8  ADDRspls:$src), (i32 (LDBz_RI ADDRspls:$src))>;
+def : Pat<(extloadi16 ADDRspls:$src), (i32 (LDHz_RI ADDRspls:$src))>;
+
+// GlobalAddress, ExternalSymbol, Jumptable, ConstantPool
+def : Pat<(LanaiHi tglobaladdr:$dst), (MOVHI tglobaladdr:$dst)>;
+def : Pat<(LanaiLo tglobaladdr:$dst), (OR_I_LO (i32 R0), tglobaladdr:$dst)>;
+def : Pat<(LanaiSmall tglobaladdr:$dst), (SLI tglobaladdr:$dst)>;
+def : Pat<(LanaiHi texternalsym:$dst), (MOVHI texternalsym:$dst)>;
+def : Pat<(LanaiLo texternalsym:$dst), (OR_I_LO (i32 R0), texternalsym:$dst)>;
+def : Pat<(LanaiSmall texternalsym:$dst), (SLI texternalsym:$dst)>;
+def : Pat<(LanaiHi tblockaddress:$dst), (MOVHI tblockaddress:$dst)>;
+def : Pat<(LanaiLo tblockaddress:$dst), (OR_I_LO (i32 R0), tblockaddress:$dst)>;
+def : Pat<(LanaiSmall tblockaddress:$dst), (SLI tblockaddress:$dst)>;
+def : Pat<(LanaiHi tjumptable:$dst), (MOVHI tjumptable:$dst)>;
+def : Pat<(LanaiLo tjumptable:$dst), (OR_I_LO (i32 R0), tjumptable:$dst)>;
+def : Pat<(LanaiSmall tjumptable:$dst), (SLI tjumptable:$dst)>;
+def : Pat<(LanaiHi tconstpool:$dst), (MOVHI tconstpool:$dst)>;
+def : Pat<(LanaiLo tconstpool:$dst), (OR_I_LO (i32 R0), tconstpool:$dst)>;
+def : Pat<(LanaiSmall tconstpool:$dst), (SLI tconstpool:$dst)>;
+
+def : Pat<(or GPR:$hi, (LanaiLo tglobaladdr:$lo)),
+          (OR_I_LO GPR:$hi, tglobaladdr:$lo)>;
+def : Pat<(or R0, (LanaiSmall tglobaladdr:$small)),
+          (SLI tglobaladdr:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo texternalsym:$lo)),
+          (OR_I_LO GPR:$hi, texternalsym:$lo)>;
+def : Pat<(or R0, (LanaiSmall texternalsym:$small)),
+          (SLI texternalsym:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tblockaddress:$lo)),
+          (OR_I_LO GPR:$hi, tblockaddress:$lo)>;
+def : Pat<(or R0, (LanaiSmall tblockaddress:$small)),
+          (SLI tblockaddress:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tjumptable:$lo)),
+          (OR_I_LO GPR:$hi, tjumptable:$lo)>;
+def : Pat<(or R0, (LanaiSmall tjumptable:$small)),
+          (SLI tjumptable:$small)>;
+def : Pat<(or GPR:$hi, (LanaiLo tconstpool:$lo)),
+          (OR_I_LO GPR:$hi, tconstpool:$lo)>;
+def : Pat<(or R0, (LanaiSmall tconstpool:$small)),
+          (SLI tconstpool:$small)>;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
new file mode 100644
index 000000000000..39c633578d43
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -0,0 +1,139 @@
+//=-- LanaiMCInstLower.cpp - Convert Lanai MachineInstr to an MCInst --------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Lanai MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCInstLower.h"
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSymbol *
+LanaiMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetBlockAddressSymbol(const MachineOperand &MO) const {
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *LanaiMCInstLower::GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+  // Create a symbol for the name.
+  return Ctx.getOrCreateSymbol(Name.str());
+}
+
+MCSymbol *
+LanaiMCInstLower::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+  // Create a symbol for the name.
+  return Ctx.getOrCreateSymbol(Name.str());
+}
+
+MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MCSymbol *Sym) const {
+  LanaiMCExpr::VariantKind Kind;
+
+  switch (MO.getTargetFlags()) {
+  case LanaiII::MO_NO_FLAG:
+    Kind = LanaiMCExpr::VK_Lanai_None;
+    break;
+  case LanaiII::MO_ABS_HI:
+    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    break;
+  case LanaiII::MO_ABS_LO:
+    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    break;
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  }
+
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+  Expr = LanaiMCExpr::create(Kind, Expr, Ctx);
+  return MCOperand::createExpr(Expr);
+}
+
+void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h
new file mode 100644
index 000000000000..6d7818d63d87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.h
@@ -0,0 +1,47 @@
+//===-- LanaiMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCInst;
+class MCOperand;
+class MCSymbol;
+class MachineInstr;
+class MachineModuleInfoMachO;
+class MachineOperand;
+class Mangler;
+
+// LanaiMCInstLower - This class is used to lower an MachineInstr
+// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY LanaiMCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+
+public:
+  LanaiMCInstLower(MCContext &CTX, AsmPrinter &AP) : Ctx(CTX), Printer(AP) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMCINSTLOWER_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..c72271b67790
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
@@ -0,0 +1,23 @@
+//===-- LanaiMachineFuctionInfo.cpp - Lanai machine function info ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void LanaiMachineFunctionInfo::anchor() {}
+
+unsigned LanaiMachineFunctionInfo::getGlobalBaseReg() {
+  // Return if it has already been initialized.
+  if (GlobalBaseReg)
+    return GlobalBaseReg;
+
+  return GlobalBaseReg =
+             MF.getRegInfo().createVirtualRegister(&Lanai::GPRRegClass);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
new file mode 100644
index 000000000000..3bd9112a9e13
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
@@ -0,0 +1,58 @@
+//===- LanaiMachineFuctionInfo.h - Lanai machine func info -------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Lanai-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
+
+#include "LanaiRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+// LanaiMachineFunctionInfo - This class is derived from MachineFunction and
+// contains private Lanai target-specific information for each MachineFunction.
+class LanaiMachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  MachineFunction &MF;
+
+  // SRetReturnReg - Lanai ABI require that sret lowering includes
+  // returning the value of the returned struct in a register. This field
+  // holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  // GlobalBaseReg - keeps track of the virtual register initialized for
+  // use as the global base register. This is used for PIC in some PIC
+  // relocation models.
+  unsigned GlobalBaseReg;
+
+  // VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  explicit LanaiMachineFunctionInfo(MachineFunction &MF)
+      : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0) {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg();
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
new file mode 100644
index 000000000000..7259c02194ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -0,0 +1,425 @@
+//===-- LanaiMemAluCombiner.cpp - Pass to combine memory & ALU operations -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Simple pass to combine memory and ALU operations
+//
+// The Lanai ISA supports instructions where a load/store modifies the base
+// register used in the load/store operation. This pass finds suitable
+// load/store and ALU instructions and combines them into one instruction.
+//
+// For example,
+//   ld [ %r6 -- ], %r12
+// is a supported instruction that is not currently generated by the instruction
+// selection pass of this backend. This pass generates these instructions by
+// merging
+//   add %r6, -4, %r6
+// followed by
+//   ld [ %r6 ], %r12
+// in the same machine basic block into one machine instruction.
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define GET_INSTRMAP_INFO
+#include "LanaiGenInstrInfo.inc"
+
+#define DEBUG_TYPE "lanai-mem-alu-combiner"
+
+STATISTIC(NumLdStAluCombined, "Number of memory and ALU instructions combined");
+
+static llvm::cl::opt<bool> DisableMemAluCombiner(
+    "disable-lanai-mem-alu-combiner", llvm::cl::init(false),
+    llvm::cl::desc("Do not combine ALU and memory operators"),
+    llvm::cl::Hidden);
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+namespace {
+typedef MachineBasicBlock::iterator MbbIterator;
+typedef MachineFunction::iterator MfIterator;
+
+class LanaiMemAluCombiner : public MachineFunctionPass {
+public:
+  static char ID;
+  explicit LanaiMemAluCombiner() : MachineFunctionPass(ID) {
+    initializeLanaiMemAluCombinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Lanai load / store optimization pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  MbbIterator findClosestSuitableAluInstr(MachineBasicBlock *BB,
+                                          const MbbIterator &MemInstr,
+                                          bool Decrement);
+  void insertMergedInstruction(MachineBasicBlock *BB,
+                               const MbbIterator &MemInstr,
+                               const MbbIterator &AluInstr, bool Before);
+  bool combineMemAluInBasicBlock(MachineBasicBlock *BB);
+
+  // Target machine description which we query for register names, data
+  // layout, etc.
+  const TargetInstrInfo *TII;
+};
+} // namespace
+
+char LanaiMemAluCombiner::ID = 0;
+
+INITIALIZE_PASS(LanaiMemAluCombiner, DEBUG_TYPE,
+                "Lanai memory ALU combiner pass", false, false)
+
+namespace {
+bool isSpls(uint16_t Opcode) { return Lanai::splsIdempotent(Opcode) == Opcode; }
+
+// Determine the opcode for the merged instruction created by considering the
+// old memory operation's opcode and whether the merged opcode will have an
+// immediate offset.
+unsigned mergedOpcode(unsigned OldOpcode, bool ImmediateOffset) {
+  switch (OldOpcode) {
+  case Lanai::LDW_RI:
+  case Lanai::LDW_RR:
+    if (ImmediateOffset)
+      return Lanai::LDW_RI;
+    return Lanai::LDW_RR;
+  case Lanai::LDHs_RI:
+  case Lanai::LDHs_RR:
+    if (ImmediateOffset)
+      return Lanai::LDHs_RI;
+    return Lanai::LDHs_RR;
+  case Lanai::LDHz_RI:
+  case Lanai::LDHz_RR:
+    if (ImmediateOffset)
+      return Lanai::LDHz_RI;
+    return Lanai::LDHz_RR;
+  case Lanai::LDBs_RI:
+  case Lanai::LDBs_RR:
+    if (ImmediateOffset)
+      return Lanai::LDBs_RI;
+    return Lanai::LDBs_RR;
+  case Lanai::LDBz_RI:
+  case Lanai::LDBz_RR:
+    if (ImmediateOffset)
+      return Lanai::LDBz_RI;
+    return Lanai::LDBz_RR;
+  case Lanai::SW_RI:
+  case Lanai::SW_RR:
+    if (ImmediateOffset)
+      return Lanai::SW_RI;
+    return Lanai::SW_RR;
+  case Lanai::STB_RI:
+  case Lanai::STB_RR:
+    if (ImmediateOffset)
+      return Lanai::STB_RI;
+    return Lanai::STB_RR;
+  case Lanai::STH_RI:
+  case Lanai::STH_RR:
+    if (ImmediateOffset)
+      return Lanai::STH_RI;
+    return Lanai::STH_RR;
+  default:
+    return 0;
+  }
+}
+
+// Check if the machine instruction has non-volatile memory operands of the type
+// supported for combining with ALU instructions.
+bool isNonVolatileMemoryOp(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  // Determine if the machine instruction is a supported memory operation by
+  // testing if the computed merge opcode is a valid memory operation opcode.
+  if (mergedOpcode(MI.getOpcode(), false) == 0)
+    return false;
+
+  const MachineMemOperand *MemOperand = *MI.memoperands_begin();
+
+  // Don't move volatile memory accesses
+  if (MemOperand->isVolatile())
+    return false;
+
+  return true;
+}
+
+// Test to see if two machine operands are of the same type. This test is less
+// strict than the MachineOperand::isIdenticalTo function.
+bool isSameOperand(const MachineOperand &Op1, const MachineOperand &Op2) {
+  if (Op1.getType() != Op2.getType())
+    return false;
+
+  switch (Op1.getType()) {
+  case MachineOperand::MO_Register:
+    return Op1.getReg() == Op2.getReg();
+  case MachineOperand::MO_Immediate:
+    return Op1.getImm() == Op2.getImm();
+  default:
+    return false;
+  }
+}
+
+bool isZeroOperand(const MachineOperand &Op) {
+  return ((Op.isReg() && Op.getReg() == Lanai::R0) ||
+          (Op.isImm() && Op.getImm() == 0));
+}
+
+// Determines whether a register is used by an instruction.
+bool InstrUsesReg(const MbbIterator &Instr, const MachineOperand *Reg) {
+  for (MachineInstr::const_mop_iterator Mop = Instr->operands_begin();
+       Mop != Instr->operands_end(); ++Mop) {
+    if (isSameOperand(*Mop, *Reg))
+      return true;
+  }
+  return false;
+}
+
+// Converts between machine opcode and AluCode.
+// Flag using/modifying ALU operations should not be considered for merging and
+// are omitted from this list.
+LPAC::AluCode mergedAluCode(unsigned AluOpcode) {
+  switch (AluOpcode) {
+  case Lanai::ADD_I_LO:
+  case Lanai::ADD_R:
+    return LPAC::ADD;
+  case Lanai::SUB_I_LO:
+  case Lanai::SUB_R:
+    return LPAC::SUB;
+  case Lanai::AND_I_LO:
+  case Lanai::AND_R:
+    return LPAC::AND;
+  case Lanai::OR_I_LO:
+  case Lanai::OR_R:
+    return LPAC::OR;
+  case Lanai::XOR_I_LO:
+  case Lanai::XOR_R:
+    return LPAC::XOR;
+  case Lanai::SHL_R:
+    return LPAC::SHL;
+  case Lanai::SRL_R:
+    return LPAC::SRL;
+  case Lanai::SRA_R:
+    return LPAC::SRA;
+  case Lanai::SA_I:
+  case Lanai::SL_I:
+  default:
+    return LPAC::UNKNOWN;
+  }
+}
+
+// Insert a new combined memory and ALU operation instruction.
+//
+// This function builds a new machine instruction using the MachineInstrBuilder
+// class and inserts it before the memory instruction.
+void LanaiMemAluCombiner::insertMergedInstruction(MachineBasicBlock *BB,
+                                                  const MbbIterator &MemInstr,
+                                                  const MbbIterator &AluInstr,
+                                                  bool Before) {
+  // Insert new combined load/store + alu operation
+  MachineOperand Dest = MemInstr->getOperand(0);
+  MachineOperand Base = MemInstr->getOperand(1);
+  MachineOperand MemOffset = MemInstr->getOperand(2);
+  MachineOperand AluOffset = AluInstr->getOperand(2);
+
+  // Abort if ALU offset is not a register or immediate
+  assert((AluOffset.isReg() || AluOffset.isImm()) &&
+         "Unsupported operand type in merge");
+
+  // Determined merged instructions opcode and ALU code
+  LPAC::AluCode AluOpcode = mergedAluCode(AluInstr->getOpcode());
+  unsigned NewOpc = mergedOpcode(MemInstr->getOpcode(), AluOffset.isImm());
+
+  assert(AluOpcode != LPAC::UNKNOWN && "Unknown ALU code in merging");
+  assert(NewOpc != 0 && "Unknown merged node opcode");
+
+  // Build and insert new machine instruction
+  MachineInstrBuilder InstrBuilder =
+      BuildMI(*BB, MemInstr, MemInstr->getDebugLoc(), TII->get(NewOpc));
+  InstrBuilder.addReg(Dest.getReg(), getDefRegState(true));
+  InstrBuilder.addReg(Base.getReg(), getKillRegState(true));
+
+  // Add offset to machine instruction
+  if (AluOffset.isReg())
+    InstrBuilder.addReg(AluOffset.getReg());
+  else if (AluOffset.isImm())
+    InstrBuilder.addImm(AluOffset.getImm());
+  else
+    llvm_unreachable("Unsupported ld/st ALU merge.");
+
+  // Create a pre-op if the ALU operation preceded the memory operation or the
+  // MemOffset is non-zero (i.e. the memory value should be adjusted before
+  // accessing it), else create a post-op.
+  if (Before || !isZeroOperand(MemOffset))
+    InstrBuilder.addImm(LPAC::makePreOp(AluOpcode));
+  else
+    InstrBuilder.addImm(LPAC::makePostOp(AluOpcode));
+
+  // Transfer memory operands.
+  InstrBuilder->setMemRefs(MemInstr->memoperands_begin(),
+                           MemInstr->memoperands_end());
+}
+
+// Function determines if ALU operation (in alu_iter) can be combined with
+// a load/store with base and offset.
+bool isSuitableAluInstr(bool IsSpls, const MbbIterator &AluIter,
+                        const MachineOperand &Base,
+                        const MachineOperand &Offset) {
+  // ALU operations have 3 operands
+  if (AluIter->getNumOperands() != 3)
+    return false;
+
+  MachineOperand &Dest = AluIter->getOperand(0);
+  MachineOperand &Op1 = AluIter->getOperand(1);
+  MachineOperand &Op2 = AluIter->getOperand(2);
+
+  // Only match instructions using the base register as destination and with the
+  // base and first operand equal
+  if (!isSameOperand(Dest, Base) || !isSameOperand(Dest, Op1))
+    return false;
+
+  if (Op2.isImm()) {
+    // It is not a match if the 2nd operand in the ALU operation is an
+    // immediate but the ALU operation is not an addition.
+    if (AluIter->getOpcode() != Lanai::ADD_I_LO)
+      return false;
+
+    if (Offset.isReg() && Offset.getReg() == Lanai::R0)
+      return true;
+
+    if (Offset.isImm() &&
+        ((Offset.getImm() == 0 &&
+          // Check that the Op2 would fit in the immediate field of the
+          // memory operation.
+          ((IsSpls && isInt<10>(Op2.getImm())) ||
+           (!IsSpls && isInt<16>(Op2.getImm())))) ||
+         Offset.getImm() == Op2.getImm()))
+      return true;
+  } else if (Op2.isReg()) {
+    // The Offset and 2nd operand are both registers and equal
+    if (Offset.isReg() && Op2.getReg() == Offset.getReg())
+      return true;
+  } else
+    // Only consider operations with register or immediate values
+    return false;
+
+  return false;
+}
+
+MbbIterator LanaiMemAluCombiner::findClosestSuitableAluInstr(
+    MachineBasicBlock *BB, const MbbIterator &MemInstr, const bool Decrement) {
+  MachineOperand *Base = &MemInstr->getOperand(1);
+  MachineOperand *Offset = &MemInstr->getOperand(2);
+  bool IsSpls = isSpls(MemInstr->getOpcode());
+
+  MbbIterator First = MemInstr;
+  MbbIterator Last = Decrement ? BB->begin() : BB->end();
+
+  while (First != Last) {
+    Decrement ? --First : ++First;
+
+    if (First == Last)
+      break;
+
+    // Skip over debug instructions
+    if (First->isDebugValue())
+      continue;
+
+    if (isSuitableAluInstr(IsSpls, First, *Base, *Offset)) {
+      return First;
+    }
+
+    // Usage of the base or offset register is not a form suitable for merging.
+    if (First != Last) {
+      if (InstrUsesReg(First, Base))
+        break;
+      if (Offset->isReg() && InstrUsesReg(First, Offset))
+        break;
+    }
+  }
+
+  return MemInstr;
+}
+
+bool LanaiMemAluCombiner::combineMemAluInBasicBlock(MachineBasicBlock *BB) {
+  bool Modified = false;
+
+  MbbIterator MBBIter = BB->begin(), End = BB->end();
+  while (MBBIter != End) {
+    bool IsMemOp = isNonVolatileMemoryOp(*MBBIter);
+
+    if (IsMemOp) {
+      MachineOperand AluOperand = MBBIter->getOperand(3);
+      unsigned int DestReg = MBBIter->getOperand(0).getReg(),
+                   BaseReg = MBBIter->getOperand(1).getReg();
+      assert(AluOperand.isImm() && "Unexpected memory operator type");
+      LPAC::AluCode AluOpcode = static_cast<LPAC::AluCode>(AluOperand.getImm());
+
+      // Skip memory operations that already modify the base register or if
+      // the destination and base register are the same
+      if (!LPAC::modifiesOp(AluOpcode) && DestReg != BaseReg) {
+        for (int Inc = 0; Inc <= 1; ++Inc) {
+          MbbIterator AluIter =
+              findClosestSuitableAluInstr(BB, MBBIter, Inc == 0);
+          if (AluIter != MBBIter) {
+            insertMergedInstruction(BB, MBBIter, AluIter, Inc == 0);
+
+            ++NumLdStAluCombined;
+            Modified = true;
+
+            // Erase the matching ALU instruction
+            BB->erase(AluIter);
+            // Erase old load/store instruction
+            BB->erase(MBBIter++);
+            break;
+          }
+        }
+      }
+    }
+    if (MBBIter == End)
+      break;
+    ++MBBIter;
+  }
+
+  return Modified;
+}
+
+// Driver function that iterates over the machine basic building blocks of a
+// machine function
+bool LanaiMemAluCombiner::runOnMachineFunction(MachineFunction &MF) {
+  if (DisableMemAluCombiner)
+    return false;
+
+  TII = MF.getSubtarget<LanaiSubtarget>().getInstrInfo();
+  bool Modified = false;
+  for (MfIterator MFI = MF.begin(); MFI != MF.end(); ++MFI) {
+    Modified |= combineMemAluInBasicBlock(&*MFI);
+  }
+  return Modified;
+}
+} // namespace
+
+FunctionPass *llvm::createLanaiMemAluCombinerPass() {
+  return new LanaiMemAluCombiner();
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
new file mode 100644
index 000000000000..12a2571c28d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -0,0 +1,287 @@
+//===-- LanaiRegisterInfo.cpp - Lanai Register Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiRegisterInfo.h"
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+LanaiRegisterInfo::LanaiRegisterInfo() : LanaiGenRegisterInfo(Lanai::RCA) {}
+
+const uint16_t *
+LanaiRegisterInfo::getCalleeSavedRegs(const MachineFunction * /*MF*/) const {
+  return CSR_SaveList;
+}
+
+BitVector LanaiRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  Reserved.set(Lanai::R0);
+  Reserved.set(Lanai::R1);
+  Reserved.set(Lanai::PC);
+  Reserved.set(Lanai::R2);
+  Reserved.set(Lanai::SP);
+  Reserved.set(Lanai::R4);
+  Reserved.set(Lanai::FP);
+  Reserved.set(Lanai::R5);
+  Reserved.set(Lanai::RR1);
+  Reserved.set(Lanai::R10);
+  Reserved.set(Lanai::RR2);
+  Reserved.set(Lanai::R11);
+  Reserved.set(Lanai::RCA);
+  Reserved.set(Lanai::R15);
+  if (hasBasePointer(MF))
+    Reserved.set(getBaseRegister());
+  return Reserved;
+}
+
+bool LanaiRegisterInfo::requiresRegisterScavenging(
+    const MachineFunction & /*MF*/) const {
+  return true;
+}
+
+bool LanaiRegisterInfo::trackLivenessAfterRegAlloc(
+    const MachineFunction & /*MF*/) const {
+  return true;
+}
+
+static bool isALUArithLoOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::ADD_I_LO:
+  case Lanai::SUB_I_LO:
+  case Lanai::ADD_F_I_LO:
+  case Lanai::SUB_F_I_LO:
+  case Lanai::ADDC_I_LO:
+  case Lanai::SUBB_I_LO:
+  case Lanai::ADDC_F_I_LO:
+  case Lanai::SUBB_F_I_LO:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static unsigned getOppositeALULoOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::ADD_I_LO:
+    return Lanai::SUB_I_LO;
+  case Lanai::SUB_I_LO:
+    return Lanai::ADD_I_LO;
+  case Lanai::ADD_F_I_LO:
+    return Lanai::SUB_F_I_LO;
+  case Lanai::SUB_F_I_LO:
+    return Lanai::ADD_F_I_LO;
+  case Lanai::ADDC_I_LO:
+    return Lanai::SUBB_I_LO;
+  case Lanai::SUBB_I_LO:
+    return Lanai::ADDC_I_LO;
+  case Lanai::ADDC_F_I_LO:
+    return Lanai::SUBB_F_I_LO;
+  case Lanai::SUBB_F_I_LO:
+    return Lanai::ADDC_F_I_LO;
+  default:
+    llvm_unreachable("Invalid ALU lo opcode");
+  }
+}
+
+static unsigned getRRMOpcodeVariant(unsigned Opcode) {
+  switch (Opcode) {
+  case Lanai::LDBs_RI:
+    return Lanai::LDBs_RR;
+  case Lanai::LDBz_RI:
+    return Lanai::LDBz_RR;
+  case Lanai::LDHs_RI:
+    return Lanai::LDHs_RR;
+  case Lanai::LDHz_RI:
+    return Lanai::LDHz_RR;
+  case Lanai::LDW_RI:
+    return Lanai::LDW_RR;
+  case Lanai::STB_RI:
+    return Lanai::STB_RR;
+  case Lanai::STH_RI:
+    return Lanai::STH_RR;
+  case Lanai::SW_RI:
+    return Lanai::SW_RR;
+  default:
+    llvm_unreachable("Opcode has no RRM variant");
+  }
+}
+
+void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  bool HasFP = TFI->hasFP(MF);
+  DebugLoc DL = MI.getDebugLoc();
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+               MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Addressable stack objects are addressed using neg. offsets from fp
+  // or pos. offsets from sp/basepointer
+  if (!HasFP || (needsStackRealignment(MF) && FrameIndex >= 0))
+    Offset += MF.getFrameInfo().getStackSize();
+
+  unsigned FrameReg = getFrameRegister(MF);
+  if (FrameIndex >= 0) {
+    if (hasBasePointer(MF))
+      FrameReg = getBaseRegister();
+    else if (needsStackRealignment(MF))
+      FrameReg = Lanai::SP;
+  }
+
+  // Replace frame index with a frame pointer reference.
+  // If the offset is small enough to fit in the immediate field, directly
+  // encode it.
+  // Otherwise scavenge a register and encode it into a MOVHI, OR_I_LO sequence.
+  if ((isSPLSOpcode(MI.getOpcode()) && !isInt<10>(Offset)) ||
+      !isInt<16>(Offset)) {
+    assert(RS && "Register scavenging must be on");
+    unsigned Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
+    if (!Reg)
+      Reg = RS->scavengeRegister(&Lanai::GPRRegClass, II, SPAdj);
+    assert(Reg && "Register scavenger failed");
+
+    bool HasNegOffset = false;
+    // ALU ops have unsigned immediate values. If the Offset is negative, we
+    // negate it here and reverse the opcode later.
+    if (Offset < 0) {
+      HasNegOffset = true;
+      Offset = -Offset;
+    }
+
+    if (!isInt<16>(Offset)) {
+      // Reg = hi(offset) | lo(offset)
+      BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::MOVHI), Reg)
+          .addImm(static_cast<uint32_t>(Offset) >> 16);
+      BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::OR_I_LO), Reg)
+          .addReg(Reg)
+          .addImm(Offset & 0xffffU);
+    } else {
+      // Reg = mov(offset)
+      BuildMI(*MI.getParent(), II, DL, TII->get(Lanai::ADD_I_LO), Reg)
+          .addImm(0)
+          .addImm(Offset);
+    }
+    // Reg = FrameReg OP Reg
+    if (MI.getOpcode() == Lanai::ADD_I_LO) {
+      BuildMI(*MI.getParent(), II, DL,
+              HasNegOffset ? TII->get(Lanai::SUB_R) : TII->get(Lanai::ADD_R),
+              MI.getOperand(0).getReg())
+          .addReg(FrameReg)
+          .addReg(Reg)
+          .addImm(LPCC::ICC_T);
+      MI.eraseFromParent();
+      return;
+    }
+    if (isSPLSOpcode(MI.getOpcode()) || isRMOpcode(MI.getOpcode())) {
+      MI.setDesc(TII->get(getRRMOpcodeVariant(MI.getOpcode())));
+      if (HasNegOffset) {
+        // Change the ALU op (operand 3) from LPAC::ADD (the default) to
+        // LPAC::SUB with the already negated offset.
+        assert((MI.getOperand(3).getImm() == LPAC::ADD) &&
+               "Unexpected ALU op in RRM instruction");
+        MI.getOperand(3).setImm(LPAC::SUB);
+      }
+    } else
+      llvm_unreachable("Unexpected opcode in frame index operation");
+
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+    MI.getOperand(FIOperandNum + 1)
+        .ChangeToRegister(Reg, /*isDef=*/false, /*isImp=*/false,
+                          /*isKill=*/true);
+    return;
+  }
+
+  // ALU arithmetic ops take unsigned immediates. If the offset is negative,
+  // we replace the instruction with one that inverts the opcode and negates
+  // the immediate.
+  if ((Offset < 0) && isALUArithLoOpcode(MI.getOpcode())) {
+    unsigned NewOpcode = getOppositeALULoOpcode(MI.getOpcode());
+    // We know this is an ALU op, so we know the operands are as follows:
+    // 0: destination register
+    // 1: source register (frame register)
+    // 2: immediate
+    BuildMI(*MI.getParent(), II, DL, TII->get(NewOpcode),
+            MI.getOperand(0).getReg())
+        .addReg(FrameReg)
+        .addImm(-Offset);
+    MI.eraseFromParent();
+  } else {
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+  }
+}
+
+bool LanaiRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  // When we need stack realignment and there are dynamic allocas, we can't
+  // reference off of the stack pointer, so we reserve a base pointer.
+  if (needsStackRealignment(MF) && MFI.hasVarSizedObjects())
+    return true;
+
+  return false;
+}
+
+unsigned LanaiRegisterInfo::getRARegister() const { return Lanai::RCA; }
+
+unsigned
+LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
+  return Lanai::FP;
+}
+
+unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; }
+
+bool LanaiRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+  return true;
+}
+
+unsigned LanaiRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("no exception support");
+  return 0;
+}
+
+unsigned LanaiRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("no exception support");
+  return 0;
+}
+
+const uint32_t *
+LanaiRegisterInfo::getCallPreservedMask(const MachineFunction & /*MF*/,
+                                        CallingConv::ID /*CC*/) const {
+  return CSR_RegMask;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
new file mode 100644
index 000000000000..8b84bbc460e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -0,0 +1,63 @@
+//===- LanaiRegisterInfo.h - Lanai Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Lanai implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "LanaiGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct LanaiRegisterInfo : public LanaiGenRegisterInfo {
+  LanaiRegisterInfo();
+
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+
+  // Code Generation virtual methods.
+  const uint16_t *
+  getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const override;
+
+  bool canRealignStack(const MachineFunction &MF) const override;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getBaseRegister() const;
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+  int getDwarfRegNum(unsigned RegNum, bool IsEH) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAIREGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td
new file mode 100644
index 000000000000..cf8cfe30cce9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.td
@@ -0,0 +1,64 @@
+//===- LanaiRegisterInfo.td - Lanai Register defs ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Lanai register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 5-bit ID numbers.
+class LanaiReg<bits<5> num, string n, list<Register> subregs = [],
+               list<string> altNames = []> : Register<n, altNames> {
+  field bits<5> Num;
+  let Num = num;
+  let Namespace = "Lanai";
+  let SubRegs = subregs;
+}
+
+let Namespace = "Lanai" in {
+  def sub_32 : SubRegIndex<32>;
+}
+
+// Integer registers
+foreach i = 0-31 in {
+  def R#i : LanaiReg<i, "r"#i>, DwarfRegNum<[i]>;
+}
+
+// Register aliases
+let SubRegIndices = [sub_32] in {
+  def PC  : LanaiReg< 2,  "pc",  [R2]>,  DwarfRegAlias<R2>;
+  def SP  : LanaiReg< 4,  "sp",  [R4]>,  DwarfRegAlias<R4>;
+  def FP  : LanaiReg< 5,  "fp",  [R5]>,  DwarfRegAlias<R5>;
+  def RV  : LanaiReg< 8,  "rv",  [R8]>,  DwarfRegAlias<R8>;
+  def RR1 : LanaiReg<10, "rr1", [R10]>, DwarfRegAlias<R10>;
+  def RR2 : LanaiReg<11, "rr2", [R11]>, DwarfRegAlias<R11>;
+  def RCA : LanaiReg<15, "rca", [R15]>, DwarfRegAlias<R15>;
+}
+
+// Define a status register to capture the dependencies between the set flag
+// and setcc instructions
+def SR : LanaiReg< 0, "sw">;
+
+// Register classes.
+def GPR : RegisterClass<"Lanai", [i32], 32,
+    (add R3, R9, R12, R13, R14, R16, R17,
+     (sequence "R%i", 20, 31),
+     R6, R7, R18, R19, // registers for passing arguments
+     R15, RCA, // register for constant addresses
+     R10, RR1, R11, RR2, // programmer controlled registers
+     R8,  RV,  // return value
+     R5,  FP,  // frame pointer
+     R4,  SP,  // stack pointer
+     R2,  PC,  // program counter
+     R1,       // all 1s (0xffffffff)
+     R0        // constant 0
+    )>;
+
+// Condition code register class
+def CCR : RegisterClass<"Lanai", [i32], 32, (add SR)> {
+  let CopyCost = -1; // Don't allow copying of status registers
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td b/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td
new file mode 100644
index 000000000000..7f931c4be8bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSchedule.td
@@ -0,0 +1,70 @@
+//=-LanaiSchedule.td - Lanai Scheduling Definitions --*- tablegen -*-=========//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def ALU_FU  : FuncUnit;
+def LDST_FU : FuncUnit;
+
+def IIC_ALU  : InstrItinClass;
+def IIC_LD   : InstrItinClass;
+def IIC_ST   : InstrItinClass;
+def IIC_LDSW : InstrItinClass;
+def IIC_STSW : InstrItinClass;
+
+def LanaiItinerary : ProcessorItineraries<[ALU_FU, LDST_FU],[],[
+  InstrItinData<IIC_LD,   [InstrStage<1, [LDST_FU]>]>,
+  InstrItinData<IIC_ST,   [InstrStage<1, [LDST_FU]>]>,
+  InstrItinData<IIC_LDSW, [InstrStage<2, [LDST_FU]>]>,
+  InstrItinData<IIC_STSW, [InstrStage<2, [LDST_FU]>]>,
+  InstrItinData<IIC_ALU,  [InstrStage<1, [ALU_FU]>]>
+]>;
+
+def LanaiSchedModel : SchedMachineModel {
+  // Cycles for loads to access the cache [default = -1]
+  let LoadLatency = 2;
+
+  // Max micro-ops that can be buffered for optimized loop dispatch/execution.
+  // [default = -1]
+  let LoopMicroOpBufferSize = 0;
+
+  // Allow scheduler to assign default model to any unrecognized opcodes.
+  // [default = 1]
+  let CompleteModel = 0;
+
+  // Max micro-ops that may be scheduled per cycle. [default = 1]
+  let IssueWidth = 1;
+
+  // Extra cycles for a mispredicted branch. [default = -1]
+  let MispredictPenalty = 10;
+
+  // Enable Post RegAlloc Scheduler pass. [default = 0]
+  let PostRAScheduler = 0;
+
+  // Max micro-ops that can be buffered. [default = -1]
+  let MicroOpBufferSize = 0;
+
+  // Per-cycle resources tables. [default = NoItineraries]
+  let Itineraries = LanaiItinerary;
+}
+
+def ALU : ProcResource<1> { let BufferSize = 0; }
+def LdSt : ProcResource<1> { let BufferSize = 0; }
+
+def WriteLD   : SchedWrite;
+def WriteST   : SchedWrite;
+def WriteLDSW : SchedWrite;
+def WriteSTSW : SchedWrite;
+def WriteALU  : SchedWrite;
+
+let SchedModel = LanaiSchedModel in {
+  def : WriteRes<WriteLD, [LdSt]>   { let Latency = 2; }
+  def : WriteRes<WriteST, [LdSt]>   { let Latency = 2; }
+  def : WriteRes<WriteLDSW, [LdSt]> { let Latency = 2; }
+  def : WriteRes<WriteSTSW, [LdSt]> { let Latency = 4; }
+  def : WriteRes<WriteALU, [ALU]>   { let Latency = 1; }
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..b71c30fe3e05
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
@@ -0,0 +1,35 @@
+//===-- LanaiSelectionDAGInfo.cpp - Lanai SelectionDAG Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSelectionDAGInfo.h"
+
+#include "LanaiTargetMachine.h"
+
+#define DEBUG_TYPE "lanai-selectiondag-info"
+
+namespace llvm {
+
+SDValue LanaiSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG & /*DAG*/, const SDLoc & /*dl*/, SDValue /*Chain*/,
+    SDValue /*Dst*/, SDValue /*Src*/, SDValue Size, unsigned /*Align*/,
+    bool /*isVolatile*/, bool /*AlwaysInline*/,
+    MachinePointerInfo /*DstPtrInfo*/,
+    MachinePointerInfo /*SrcPtrInfo*/) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  return SDValue();
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
new file mode 100644
index 000000000000..bfd2be2ede09
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
@@ -0,0 +1,36 @@
+//===-- LanaiSelectionDAGInfo.h - Lanai SelectionDAG Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Lanai subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class LanaiSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  LanaiSelectionDAGInfo() = default;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISELECTIONDAGINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
new file mode 100644
index 000000000000..0fa5e82a7a66
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -0,0 +1,47 @@
+//===- LanaiSubtarget.cpp - Lanai Subtarget Information -----------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiSubtarget.h"
+
+#include "Lanai.h"
+
+#define DEBUG_TYPE "lanai-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "LanaiGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void LanaiSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  ParseSubtargetFeatures(CPUName, FS);
+}
+
+LanaiSubtarget &LanaiSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+  initSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+                               StringRef FeatureString, const TargetMachine &TM,
+                               const TargetOptions & /*Options*/,
+                               CodeModel::Model /*CodeModel*/,
+                               CodeGenOpt::Level /*OptLevel*/)
+    : LanaiGenSubtargetInfo(TargetTriple, Cpu, FeatureString),
+      FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
+      InstrInfo(), TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h
new file mode 100644
index 000000000000..2732ef3097ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiSubtarget.h
@@ -0,0 +1,76 @@
+//=====-- LanaiSubtarget.h - Define Subtarget for the Lanai -----*- C++ -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+#define LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "LanaiGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class LanaiSubtarget : public LanaiGenSubtargetInfo {
+public:
+  // This constructor initializes the data members to match that
+  // of the specified triple.
+  LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
+                 StringRef FeatureString, const TargetMachine &TM,
+                 const TargetOptions &Options, CodeModel::Model CodeModel,
+                 CodeGenOpt::Level OptLevel);
+
+  // ParseSubtargetFeatures - Parses features string setting specified
+  // subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  LanaiSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool enableMachineScheduler() const override { return true; }
+
+  const LanaiInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+
+  const LanaiRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  const LanaiTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+
+  const LanaiSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+private:
+  LanaiFrameLowering FrameLowering;
+  LanaiInstrInfo InstrInfo;
+  LanaiTargetLowering TLInfo;
+  LanaiSelectionDAGInfo TSInfo;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAISUBTARGET_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
new file mode 100644
index 000000000000..2a9bc25d7fad
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -0,0 +1,113 @@
+//===-- LanaiTargetMachine.cpp - Define TargetMachine for Lanai ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Lanai target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetMachine.h"
+
+#include "Lanai.h"
+#include "LanaiTargetObjectFile.h"
+#include "LanaiTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeLanaiMemAluCombinerPass(PassRegistry &);
+} // namespace llvm
+
+extern "C" void LLVMInitializeLanaiTarget() {
+  // Register the target.
+  RegisterTargetMachine<LanaiTargetMachine> registered_target(
+      getTheLanaiTarget());
+}
+
+static std::string computeDataLayout() {
+  // Data layout (keep in sync with clang/lib/Basic/Targets.cpp)
+  return "E"        // Big endian
+         "-m:e"     // ELF name manging
+         "-p:32:32" // 32-bit pointers, 32 bit aligned
+         "-i64:64"  // 64 bit integers, 64 bit aligned
+         "-a:0:32"  // 32 bit alignment of objects of aggregate type
+         "-n32"     // 32 bit native integer width
+         "-S64";    // 64 bit natural stack alignment
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::PIC_;
+  return *RM;
+}
+
+LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef Cpu, StringRef FeatureString,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CodeModel,
+                                       CodeGenOpt::Level OptLevel)
+    : LLVMTargetMachine(T, computeDataLayout(), TT, Cpu, FeatureString, Options,
+                        getEffectiveRelocModel(RM), CodeModel, OptLevel),
+      Subtarget(TT, Cpu, FeatureString, *this, Options, CodeModel, OptLevel),
+      TLOF(new LanaiTargetObjectFile()) {
+  initAsmInfo();
+}
+
+TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(LanaiTTIImpl(this, F));
+  });
+}
+
+namespace {
+// Lanai Code Generator Pass Configuration Options.
+class LanaiPassConfig : public TargetPassConfig {
+public:
+  LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager)
+      : TargetPassConfig(TM, *PassManager) {}
+
+  LanaiTargetMachine &getLanaiTargetMachine() const {
+    return getTM<LanaiTargetMachine>();
+  }
+
+  bool addInstSelector() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *
+LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
+  return new LanaiPassConfig(this, &PassManager);
+}
+
+// Install an instruction selector pass.
+bool LanaiPassConfig::addInstSelector() {
+  addPass(createLanaiISelDag(getLanaiTargetMachine()));
+  return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted.
+void LanaiPassConfig::addPreEmitPass() {
+  addPass(createLanaiDelaySlotFillerPass(getLanaiTargetMachine()));
+}
+
+// Run passes after prolog-epilog insertion and before the second instruction
+// scheduling pass.
+void LanaiPassConfig::addPreSched2() {
+  addPass(createLanaiMemAluCombinerPass());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
new file mode 100644
index 000000000000..5278c70d909d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
@@ -0,0 +1,55 @@
+//===-- LanaiTargetMachine.h - Define TargetMachine for Lanai --- C++ ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Lanai specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
+
+#include "LanaiFrameLowering.h"
+#include "LanaiISelLowering.h"
+#include "LanaiInstrInfo.h"
+#include "LanaiSelectionDAGInfo.h"
+#include "LanaiSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+
+class LanaiTargetMachine : public LLVMTargetMachine {
+  LanaiSubtarget Subtarget;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+  LanaiTargetMachine(const Target &TheTarget, const Triple &TargetTriple,
+                     StringRef Cpu, StringRef FeatureString,
+                     const TargetOptions &Options,
+                     Optional<Reloc::Model> RelocationModel,
+                     CodeModel::Model CodeModel, CodeGenOpt::Level OptLevel);
+
+  const LanaiSubtarget *
+  getSubtargetImpl(const llvm::Function & /*Fn*/) const override {
+    return &Subtarget;
+  }
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
new file mode 100644
index 000000000000..7475dbd68ae4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -0,0 +1,132 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiTargetObjectFile.h"
+
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> SSThreshold(
+    "lanai-ssection-threshold", cl::Hidden,
+    cl::desc("Small data and bss section threshold size (default=0)"),
+    cl::init(0));
+
+void LanaiTargetObjectFile::Initialize(MCContext &Ctx,
+                                       const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection = getContext().getELFSection(
+      ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+// A address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// gp_rel operator.
+static bool isInSmallSection(uint64_t Size) {
+  // gcc has traditionally not treated zero-sized objects as small data, so this
+  // is effectively part of the ABI.
+  return Size > 0 && Size <= SSThreshold;
+}
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  if (GO == nullptr)
+    return false;
+
+  // We first check the case where global is a declaration, because finding
+  // section kind using getKindForGlobal() is only allowed for global
+  // definitions.
+  if (GO->isDeclaration() || GO->hasAvailableExternallyLinkage())
+    return isGlobalInSmallSectionImpl(GO, TM);
+
+  return isGlobalInSmallSection(GO, TM, getKindForGlobal(GO, TM));
+}
+
+// Return true if this global address should be placed into small data/bss
+// section.
+bool LanaiTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
+                                                   const TargetMachine &TM,
+                                                   SectionKind Kind) const {
+  return (isGlobalInSmallSectionImpl(GO, TM) &&
+          (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
+}
+
+// Return true if this global address should be placed into small data/bss
+// section. This method does all the work, except for checking the section
+// kind.
+bool LanaiTargetObjectFile::isGlobalInSmallSectionImpl(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  // Only global variables, not functions.
+  const auto *GVA = dyn_cast<GlobalVariable>(GO);
+  if (!GVA)
+    return false;
+
+  // Global values placed in sections starting with .ldata do not fit in
+  // 21-bits, so always use large memory access for them. FIXME: This is a
+  // workaround for a tool limitation.
+  if (GVA->getSection().startswith(".ldata"))
+    return false;
+
+  if (TM.getCodeModel() == CodeModel::Small)
+    return true;
+
+  if (GVA->hasLocalLinkage())
+    return false;
+
+  if (((GVA->hasExternalLinkage() && GVA->isDeclaration()) ||
+       GVA->hasCommonLinkage()))
+    return false;
+
+  Type *Ty = GVA->getValueType();
+  return isInSmallSection(
+      GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
+}
+
+MCSection *LanaiTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && isGlobalInSmallSection(GO, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isData() && isGlobalInSmallSection(GO, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Return true if this constant should be placed into small data section.
+bool LanaiTargetObjectFile::isConstantInSmallSection(const DataLayout &DL,
+                                                     const Constant *CN) const {
+  return isInSmallSection(DL.getTypeAllocSize(CN->getType()));
+}
+
+MCSection *LanaiTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+                                                        SectionKind Kind,
+                                                        const Constant *C,
+                                                        unsigned &Align) const {
+  if (isConstantInSmallSection(DL, C))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
new file mode 100644
index 000000000000..99ec1956da4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
@@ -0,0 +1,46 @@
+//===-- LanaiTargetObjectFile.h - Lanai Object Info -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class LanaiTargetMachine;
+class LanaiTargetObjectFile : public TargetLoweringObjectFileELF {
+  MCSection *SmallDataSection;
+  MCSection *SmallBSSSection;
+
+  bool isGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+                              SectionKind Kind) const;
+  bool isGlobalInSmallSectionImpl(const GlobalObject *GO,
+                                  const TargetMachine &TM) const;
+
+public:
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+  /// Return true if this global address should be placed into small data/bss
+  /// section.
+  bool isGlobalInSmallSection(const GlobalObject *GO,
+                              const TargetMachine &TM) const;
+
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override;
+
+  /// Return true if this constant should be placed into small data section.
+  bool isConstantInSmallSection(const DataLayout &DL, const Constant *CN) const;
+
+  MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                   const Constant *C,
+                                   unsigned &Align) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETOBJECTFILE_H
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
new file mode 100644
index 000000000000..7fcb3ce45bbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -0,0 +1,81 @@
+//===-- LanaiTargetTransformInfo.h - Lanai specific TTI ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file a TargetTransformInfo::Concept conforming object specific to the
+// Lanai target machine. It uses the target's detailed information to
+// provide more precise answers to certain TTI queries, while letting the
+// target independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
+
+#include "Lanai.h"
+#include "LanaiSubtarget.h"
+#include "LanaiTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class LanaiTTIImpl : public BasicTTIImplBase<LanaiTTIImpl> {
+  typedef BasicTTIImplBase<LanaiTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const LanaiSubtarget *ST;
+  const LanaiTargetLowering *TLI;
+
+  const LanaiSubtarget *getST() const { return ST; }
+  const LanaiTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit LanaiTTIImpl(const LanaiTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  bool shouldBuildLookupTables() const { return false; }
+
+  TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
+    if (TyWidth == 32)
+      return TTI::PSK_FastHardware;
+    return TTI::PSK_Software;
+  }
+
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+    switch (ISD) {
+    default:
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                           Opd1PropInfo, Opd2PropInfo);
+    case ISD::MUL:
+    case ISD::SDIV:
+    case ISD::UDIV:
+    case ISD::UREM:
+      // This increases the cost associated with multiplication and division
+      // to 64 times what the baseline arithmetic cost is. The arithmetic
+      // instruction cost was arbitrarily chosen to reduce the desirability
+      // of emitting arithmetic instructions that are emulated in software.
+      // TODO: Investigate the performance impact given specialized lowerings.
+      return 64 * BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                                Opd1PropInfo, Opd2PropInfo);
+    }
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_LANAITARGETTRANSFORMINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
new file mode 100644
index 000000000000..a04fe8112fb9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -0,0 +1,172 @@
+//===-- LanaiAsmBackend.cpp - Lanai Assembler Backend ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    return Value;
+  case Lanai::FIXUP_LANAI_21:
+  case Lanai::FIXUP_LANAI_21_F:
+  case Lanai::FIXUP_LANAI_25:
+  case Lanai::FIXUP_LANAI_32:
+  case Lanai::FIXUP_LANAI_HI16:
+  case Lanai::FIXUP_LANAI_LO16:
+    return Value;
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  }
+}
+
+namespace {
+class LanaiAsmBackend : public MCAsmBackend {
+  Triple::OSType OSType;
+
+public:
+  LanaiAsmBackend(const Target &T, Triple::OSType OST)
+      : MCAsmBackend(), OSType(OST) {}
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  // No instruction requires relaxation
+  bool fixupNeedsRelaxation(const MCFixup & /*Fixup*/, uint64_t /*Value*/,
+                            const MCRelaxableFragment * /*DF*/,
+                            const MCAsmLayout & /*Layout*/) const override {
+    return false;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  unsigned getNumFixupKinds() const override {
+    return Lanai::NumTargetFixupKinds;
+  }
+
+  bool mayNeedRelaxation(const MCInst & /*Inst*/) const override {
+    return false;
+  }
+
+  void relaxInstruction(const MCInst & /*Inst*/,
+                        const MCSubtargetInfo & /*STI*/,
+                        MCInst & /*Res*/) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if ((Count % 4) != 0)
+    return false;
+
+  for (uint64_t i = 0; i < Count; i += 4)
+    OW->write32(0x15000000);
+
+  return true;
+}
+
+void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                 unsigned /*DataSize*/, uint64_t Value,
+                                 bool /*IsPCRel*/) const {
+  MCFixupKind Kind = Fixup.getKind();
+  Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
+
+  if (!Value)
+    return; // This value doesn't change the encoding
+
+  // Where in the object and where the number of bytes that need
+  // fixing up
+  unsigned Offset = Fixup.getOffset();
+  unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+  unsigned FullSize = 4;
+
+  // Grab current value, if any, from bits.
+  uint64_t CurVal = 0;
+
+  // Load instruction and apply value
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = (FullSize - 1 - i);
+    CurVal |= static_cast<uint64_t>(static_cast<uint8_t>(Data[Offset + Idx]))
+              << (i * 8);
+  }
+
+  uint64_t Mask =
+      (static_cast<uint64_t>(-1) >> (64 - getFixupKindInfo(Kind).TargetSize));
+  CurVal |= Value & Mask;
+
+  // Write out the fixed up bytes back to the code/data bits.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = (FullSize - 1 - i);
+    Data[Offset + Idx] = static_cast<uint8_t>((CurVal >> (i * 8)) & 0xff);
+  }
+}
+
+MCObjectWriter *
+LanaiAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createLanaiELFObjectWriter(OS,
+                                    MCELFObjectTargetWriter::getOSABI(OSType));
+}
+
+const MCFixupKindInfo &
+LanaiAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  static const MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds] = {
+      // This table *must* be in same the order of fixup_* kinds in
+      // LanaiFixupKinds.h.
+      // Note: The number of bits indicated here are assumed to be contiguous.
+      //   This does not hold true for LANAI_21 and LANAI_21_F which are applied
+      //   to bits 0x7cffff and 0x7cfffc, respectively. Since the 'bits' counts
+      //   here are used only for cosmetic purposes, we set the size to 16 bits
+      //   for these 21-bit relocation as llvm/lib/MC/MCAsmStreamer.cpp checks
+      //   no bits are set in the fixup range.
+      //
+      // name          offset bits flags
+      {"FIXUP_LANAI_NONE", 0, 32, 0},
+      {"FIXUP_LANAI_21", 16, 16 /*21*/, 0},
+      {"FIXUP_LANAI_21_F", 16, 16 /*21*/, 0},
+      {"FIXUP_LANAI_25", 7, 25, 0},
+      {"FIXUP_LANAI_32", 0, 32, 0},
+      {"FIXUP_LANAI_HI16", 16, 16, 0},
+      {"FIXUP_LANAI_LO16", 16, 16, 0}};
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+} // namespace
+
+MCAsmBackend *llvm::createLanaiAsmBackend(const Target &T,
+                                          const MCRegisterInfo & /*MRI*/,
+                                          const Triple &TT, StringRef /*CPU*/,
+                                          const MCTargetOptions & /*Options*/) {
+  if (!TT.isOSBinFormatELF())
+    llvm_unreachable("OS not supported");
+
+  return new LanaiAsmBackend(T, TT.getOS());
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
new file mode 100644
index 000000000000..ce7f83509c9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiBaseInfo.h
@@ -0,0 +1,119 @@
+//===-- LanaiBaseInfo.h - Top level definitions for Lanai MC ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Lanai target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
+
+#include "LanaiMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// LanaiII - This namespace holds all of the target specific flags that
+// instruction info tracks.
+namespace LanaiII {
+// Target Operand Flag enum.
+enum TOF {
+  //===------------------------------------------------------------------===//
+  // Lanai Specific MachineOperand flags.
+  MO_NO_FLAG,
+
+  // MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+  // address.
+  MO_ABS_HI,
+  MO_ABS_LO,
+};
+} // namespace LanaiII
+
+static inline unsigned getLanaiRegisterNumbering(unsigned Reg) {
+  switch (Reg) {
+  case Lanai::R0:
+    return 0;
+  case Lanai::R1:
+    return 1;
+  case Lanai::R2:
+  case Lanai::PC:
+    return 2;
+  case Lanai::R3:
+    return 3;
+  case Lanai::R4:
+  case Lanai::SP:
+    return 4;
+  case Lanai::R5:
+  case Lanai::FP:
+    return 5;
+  case Lanai::R6:
+    return 6;
+  case Lanai::R7:
+    return 7;
+  case Lanai::R8:
+  case Lanai::RV:
+    return 8;
+  case Lanai::R9:
+    return 9;
+  case Lanai::R10:
+  case Lanai::RR1:
+    return 10;
+  case Lanai::R11:
+  case Lanai::RR2:
+    return 11;
+  case Lanai::R12:
+    return 12;
+  case Lanai::R13:
+    return 13;
+  case Lanai::R14:
+    return 14;
+  case Lanai::R15:
+  case Lanai::RCA:
+    return 15;
+  case Lanai::R16:
+    return 16;
+  case Lanai::R17:
+    return 17;
+  case Lanai::R18:
+    return 18;
+  case Lanai::R19:
+    return 19;
+  case Lanai::R20:
+    return 20;
+  case Lanai::R21:
+    return 21;
+  case Lanai::R22:
+    return 22;
+  case Lanai::R23:
+    return 23;
+  case Lanai::R24:
+    return 24;
+  case Lanai::R25:
+    return 25;
+  case Lanai::R26:
+    return 26;
+  case Lanai::R27:
+    return 27;
+  case Lanai::R28:
+    return 28;
+  case Lanai::R29:
+    return 29;
+  case Lanai::R30:
+    return 30;
+  case Lanai::R31:
+    return 31;
+  default:
+    llvm_unreachable("Unknown register number!");
+  }
+}
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIBASEINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
new file mode 100644
index 000000000000..e30d5e9a18eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -0,0 +1,95 @@
+//===-- LanaiELFObjectWriter.cpp - Lanai ELF Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class LanaiELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  explicit LanaiELFObjectWriter(uint8_t OSABI);
+
+  ~LanaiELFObjectWriter() override;
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+  bool needsRelocateWithSymbol(const MCSymbol &SD,
+                               unsigned Type) const override;
+};
+} // namespace
+
+LanaiELFObjectWriter::LanaiELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit_=*/false, OSABI, ELF::EM_LANAI,
+                              /*HasRelocationAddend=*/true) {}
+
+LanaiELFObjectWriter::~LanaiELFObjectWriter() {}
+
+unsigned LanaiELFObjectWriter::getRelocType(MCContext & /*Ctx*/,
+                                            const MCValue & /*Target*/,
+                                            const MCFixup &Fixup,
+                                            bool /*IsPCRel*/) const {
+  unsigned Type;
+  unsigned Kind = static_cast<unsigned>(Fixup.getKind());
+  switch (Kind) {
+  case Lanai::FIXUP_LANAI_21:
+    Type = ELF::R_LANAI_21;
+    break;
+  case Lanai::FIXUP_LANAI_21_F:
+    Type = ELF::R_LANAI_21_F;
+    break;
+  case Lanai::FIXUP_LANAI_25:
+    Type = ELF::R_LANAI_25;
+    break;
+  case Lanai::FIXUP_LANAI_32:
+  case FK_Data_4:
+    Type = ELF::R_LANAI_32;
+    break;
+  case Lanai::FIXUP_LANAI_HI16:
+    Type = ELF::R_LANAI_HI16;
+    break;
+  case Lanai::FIXUP_LANAI_LO16:
+    Type = ELF::R_LANAI_LO16;
+    break;
+  case Lanai::FIXUP_LANAI_NONE:
+    Type = ELF::R_LANAI_NONE;
+    break;
+
+  default:
+    llvm_unreachable("Invalid fixup kind!");
+  }
+  return Type;
+}
+
+bool LanaiELFObjectWriter::needsRelocateWithSymbol(const MCSymbol & /*SD*/,
+                                                   unsigned Type) const {
+  switch (Type) {
+  case ELF::R_LANAI_21:
+  case ELF::R_LANAI_21_F:
+  case ELF::R_LANAI_25:
+  case ELF::R_LANAI_32:
+  case ELF::R_LANAI_HI16:
+    return true;
+  default:
+    return false;
+  }
+}
+
+MCObjectWriter *llvm::createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+                                                 uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new LanaiELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
new file mode 100644
index 000000000000..9ff8340d2922
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiFixupKinds.h
@@ -0,0 +1,43 @@
+//===-- LanaiFixupKinds.h - Lanai Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Lanai {
+// Although most of the current fixup types reflect a unique relocation
+// one can have multiple fixup types for a given relocation and thus need
+// to be uniquely named.
+//
+// This table *must* be in the save order of
+// MCFixupKindInfo Infos[Lanai::NumTargetFixupKinds]
+// in LanaiAsmBackend.cpp.
+//
+enum Fixups {
+  // Results in R_Lanai_NONE
+  FIXUP_LANAI_NONE = FirstTargetFixupKind,
+
+  FIXUP_LANAI_21,   // 21-bit symbol relocation
+  FIXUP_LANAI_21_F, // 21-bit symbol relocation, last two bits masked to 0
+  FIXUP_LANAI_25,   // 25-bit branch targets
+  FIXUP_LANAI_32,   // general 32-bit relocation
+  FIXUP_LANAI_HI16, // upper 16-bits of a symbolic relocation
+  FIXUP_LANAI_LO16, // lower 16-bits of a symbolic relocation
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // namespace Lanai
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
new file mode 100644
index 000000000000..7e2705e67b6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -0,0 +1,43 @@
+//===-- LanaiMCAsmInfo.cpp - Lanai asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the LanaiMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCAsmInfo.h"
+
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void LanaiMCAsmInfo::anchor() {}
+
+LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/) {
+  IsLittleEndian = false;
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // Lanai assembly requires ".section" before ".bss"
+  UsesELFSectionDirectiveForBSS = true;
+
+  // Use the integrated assembler instead of system one.
+  UseIntegratedAssembler = true;
+
+  // Use '!' as comment string to correspond with old toolchain.
+  CommentString = "!";
+
+  // Target supports emission of debugging information.
+  SupportsDebugInformation = true;
+
+  // Set the instruction alignment. Currently used only for address adjustment
+  // in dwarf generation.
+  MinInstAlignment = 4;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
new file mode 100644
index 000000000000..3eef0592d2fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- LanaiMCAsmInfo.h - Lanai asm properties -----------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the LanaiMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class LanaiMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit LanaiMCAsmInfo(const Triple &TheTriple);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
new file mode 100644
index 000000000000..ce68b7e24dba
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -0,0 +1,309 @@
+//===-- LanaiMCCodeEmitter.cpp - Convert Lanai code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LanaiMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "MCTargetDesc/LanaiBaseInfo.h"
+#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace llvm {
+namespace {
+class LanaiMCCodeEmitter : public MCCodeEmitter {
+  LanaiMCCodeEmitter(const LanaiMCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const LanaiMCCodeEmitter &);     // DO NOT IMPLEMENT
+  const MCInstrInfo &InstrInfo;
+  MCContext &Context;
+
+public:
+  LanaiMCCodeEmitter(const MCInstrInfo &MCII, MCContext &C)
+      : InstrInfo(MCII), Context(C) {}
+
+  ~LanaiMCCodeEmitter() override {}
+
+  // The functions below are called by TableGen generated functions for getting
+  // the binary encoding of instructions/opereands.
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &Inst,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &SubtargetInfo) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machine
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &Inst, const MCOperand &MCOp,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &SubtargetInfo) const;
+
+  unsigned getRiMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &SubtargetInfo) const;
+
+  unsigned getRrMemoryOpValue(const MCInst &Inst, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &SubtargetInfo) const;
+
+  unsigned getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &SubtargetInfo) const;
+
+  unsigned getBranchTargetOpValue(const MCInst &Inst, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &SubtargetInfo) const;
+
+  void encodeInstruction(const MCInst &Inst, raw_ostream &Ostream,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &SubtargetInfo) const override;
+
+  unsigned adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+                                const MCSubtargetInfo &STI) const;
+
+  unsigned adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+                            const MCSubtargetInfo &STI) const;
+};
+
+Lanai::Fixups FixupKind(const MCExpr *Expr) {
+  if (isa<MCSymbolRefExpr>(Expr))
+    return Lanai::FIXUP_LANAI_21;
+  if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
+    LanaiMCExpr::VariantKind ExprKind = McExpr->getKind();
+    switch (ExprKind) {
+    case LanaiMCExpr::VK_Lanai_None:
+      return Lanai::FIXUP_LANAI_21;
+    case LanaiMCExpr::VK_Lanai_ABS_HI:
+      return Lanai::FIXUP_LANAI_HI16;
+    case LanaiMCExpr::VK_Lanai_ABS_LO:
+      return Lanai::FIXUP_LANAI_LO16;
+    }
+  }
+  return Lanai::Fixups(0);
+}
+
+// getMachineOpValue - Return binary encoding of operand. If the machine
+// operand requires relocation, record the relocation and return zero.
+unsigned LanaiMCCodeEmitter::getMachineOpValue(
+    const MCInst &Inst, const MCOperand &MCOp, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  if (MCOp.isReg())
+    return getLanaiRegisterNumbering(MCOp.getReg());
+  if (MCOp.isImm())
+    return static_cast<unsigned>(MCOp.getImm());
+
+  // MCOp must be an expression
+  assert(MCOp.isExpr());
+  const MCExpr *Expr = MCOp.getExpr();
+
+  // Extract the symbolic reference side of a binary expression.
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BinaryExpr = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BinaryExpr->getLHS();
+  }
+
+  assert(isa<LanaiMCExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
+  // Push fixup (all info is contained within)
+  Fixups.push_back(
+      MCFixup::create(0, MCOp.getExpr(), MCFixupKind(FixupKind(Expr))));
+  return 0;
+}
+
+// Helper function to adjust P and Q bits on load and store instructions.
+unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
+                      unsigned QBitShift) {
+  const MCOperand AluOp = Inst.getOperand(3);
+  unsigned AluCode = AluOp.getImm();
+
+  // Set the P bit to one iff the immediate is nonzero and not a post-op
+  // instruction.
+  const MCOperand Op2 = Inst.getOperand(2);
+  Value &= ~(1 << PBitShift);
+  if (!LPAC::isPostOp(AluCode) &&
+      ((Op2.isImm() && Op2.getImm() != 0) ||
+       (Op2.isReg() && Op2.getReg() != Lanai::R0) || (Op2.isExpr())))
+    Value |= (1 << PBitShift);
+
+  // Set the Q bit to one iff it is a post- or pre-op instruction.
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+         "Expected register operand.");
+  Value &= ~(1 << QBitShift);
+  if (LPAC::modifiesOp(AluCode) && ((Op2.isImm() && Op2.getImm() != 0) ||
+                                    (Op2.isReg() && Op2.getReg() != Lanai::R0)))
+    Value |= (1 << QBitShift);
+
+  return Value;
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsRmAndRrm(const MCInst &Inst, unsigned Value,
+                                         const MCSubtargetInfo &STI) const {
+  return adjustPqBits(Inst, Value, 17, 16);
+}
+
+unsigned
+LanaiMCCodeEmitter::adjustPqBitsSpls(const MCInst &Inst, unsigned Value,
+                                     const MCSubtargetInfo &STI) const {
+  return adjustPqBits(Inst, Value, 11, 10);
+}
+
+void LanaiMCCodeEmitter::encodeInstruction(
+    const MCInst &Inst, raw_ostream &Ostream, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  // Get instruction encoding and emit it
+  unsigned Value = getBinaryCodeForInstr(Inst, Fixups, SubtargetInfo);
+  ++MCNumEmitted; // Keep track of the number of emitted insns.
+
+  // Emit bytes in big-endian
+  for (int i = (4 - 1) * 8; i >= 0; i -= 8)
+    Ostream << static_cast<char>((Value >> i) & 0xff);
+}
+
+// Encode Lanai Memory Operand
+unsigned LanaiMCCodeEmitter::getRiMemoryOpValue(
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+  const MCOperand AluOp = Inst.getOperand(OpNo + 2);
+
+  assert(Op1.isReg() && "First operand is not register.");
+  assert((Op2.isImm() || Op2.isExpr()) &&
+         "Second operand is neither an immediate nor an expression.");
+  assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+         "Register immediate only supports addition operator");
+
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 18);
+  if (Op2.isImm()) {
+    assert(isInt<16>(Op2.getImm()) &&
+           "Constant value truncated (limited to 16-bit)");
+
+    Encoding |= (Op2.getImm() & 0xffff);
+    if (Op2.getImm() != 0) {
+      if (LPAC::isPreOp(AluOp.getImm()))
+        Encoding |= (0x3 << 16);
+      if (LPAC::isPostOp(AluOp.getImm()))
+        Encoding |= (0x1 << 16);
+    }
+  } else
+    getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo);
+
+  return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getRrMemoryOpValue(
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+  const MCOperand AluMCOp = Inst.getOperand(OpNo + 2);
+
+  assert(Op1.isReg() && "First operand is not register.");
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 15);
+  assert(Op2.isReg() && "Second operand is not register.");
+  Encoding |= (getLanaiRegisterNumbering(Op2.getReg()) << 10);
+
+  assert(AluMCOp.isImm() && "Third operator is not immediate.");
+  // Set BBB
+  unsigned AluOp = AluMCOp.getImm();
+  Encoding |= LPAC::encodeLanaiAluCode(AluOp) << 5;
+  // Set P and Q
+  if (LPAC::isPreOp(AluOp))
+    Encoding |= (0x3 << 8);
+  if (LPAC::isPostOp(AluOp))
+    Encoding |= (0x1 << 8);
+  // Set JJJJ
+  switch (LPAC::getAluOp(AluOp)) {
+  case LPAC::SHL:
+  case LPAC::SRL:
+    Encoding |= 0x10;
+    break;
+  case LPAC::SRA:
+    Encoding |= 0x18;
+    break;
+  default:
+    break;
+  }
+
+  return Encoding;
+}
+
+unsigned
+LanaiMCCodeEmitter::getSplsOpValue(const MCInst &Inst, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &SubtargetInfo) const {
+  unsigned Encoding;
+  const MCOperand Op1 = Inst.getOperand(OpNo + 0);
+  const MCOperand Op2 = Inst.getOperand(OpNo + 1);
+  const MCOperand AluOp = Inst.getOperand(OpNo + 2);
+
+  assert(Op1.isReg() && "First operand is not register.");
+  assert((Op2.isImm() || Op2.isExpr()) &&
+         "Second operand is neither an immediate nor an expression.");
+  assert((LPAC::getAluOp(AluOp.getImm()) == LPAC::ADD) &&
+         "Register immediate only supports addition operator");
+
+  Encoding = (getLanaiRegisterNumbering(Op1.getReg()) << 12);
+  if (Op2.isImm()) {
+    assert(isInt<10>(Op2.getImm()) &&
+           "Constant value truncated (limited to 10-bit)");
+
+    Encoding |= (Op2.getImm() & 0x3ff);
+    if (Op2.getImm() != 0) {
+      if (LPAC::isPreOp(AluOp.getImm()))
+        Encoding |= (0x3 << 10);
+      if (LPAC::isPostOp(AluOp.getImm()))
+        Encoding |= (0x1 << 10);
+    }
+  } else
+    getMachineOpValue(Inst, Op2, Fixups, SubtargetInfo);
+
+  return Encoding;
+}
+
+unsigned LanaiMCCodeEmitter::getBranchTargetOpValue(
+    const MCInst &Inst, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &SubtargetInfo) const {
+  const MCOperand &MCOp = Inst.getOperand(OpNo);
+  if (MCOp.isReg() || MCOp.isImm())
+    return getMachineOpValue(Inst, MCOp, Fixups, SubtargetInfo);
+
+  Fixups.push_back(MCFixup::create(
+      0, MCOp.getExpr(), static_cast<MCFixupKind>(Lanai::FIXUP_LANAI_25)));
+
+  return 0;
+}
+
+#include "LanaiGenMCCodeEmitter.inc"
+} // namespace
+} // namespace llvm
+
+llvm::MCCodeEmitter *
+llvm::createLanaiMCCodeEmitter(const MCInstrInfo &InstrInfo,
+                               const MCRegisterInfo & /*MRI*/,
+                               MCContext &context) {
+  return new LanaiMCCodeEmitter(InstrInfo, context);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
new file mode 100644
index 000000000000..201c95de07f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -0,0 +1,60 @@
+//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lanaimcexpr"
+
+const LanaiMCExpr *LanaiMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                                       MCContext &Ctx) {
+  return new (Ctx) LanaiMCExpr(Kind, Expr);
+}
+
+void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (Kind == VK_Lanai_None) {
+    Expr->print(OS, MAI);
+    return;
+  }
+
+  switch (Kind) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case VK_Lanai_ABS_HI:
+    OS << "hi";
+    break;
+  case VK_Lanai_ABS_LO:
+    OS << "lo";
+    break;
+  }
+
+  OS << '(';
+  const MCExpr *Expr = getSubExpr();
+  Expr->print(OS, MAI);
+  OS << ')';
+}
+
+void LanaiMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
+
+bool LanaiMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                            const MCAsmLayout *Layout,
+                                            const MCFixup *Fixup) const {
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+    return false;
+
+  Res =
+      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
new file mode 100644
index 000000000000..5004d541ff70
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
@@ -0,0 +1,56 @@
+//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class LanaiMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind { VK_Lanai_None, VK_Lanai_ABS_HI, VK_Lanai_ABS_LO };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit LanaiMCExpr(VariantKind Kind, const MCExpr *Expr)
+      : Kind(Kind), Expr(Expr) {}
+
+public:
+  static const LanaiMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                   MCContext &Ctx);
+
+  // Returns the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  // Returns the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  // There are no TLS LanaiMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler & /*Asm*/) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
new file mode 100644
index 000000000000..c2f8c0f7ad50
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -0,0 +1,154 @@
+//===-- LanaiMCTargetDesc.cpp - Lanai Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LanaiMCTargetDesc.h"
+
+#include "InstPrinter/LanaiInstPrinter.h"
+#include "LanaiMCAsmInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "LanaiGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "LanaiGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createLanaiMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitLanaiMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createLanaiMCRegisterInfo(const Triple & /*TT*/) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitLanaiMCRegisterInfo(X, Lanai::RCA, 0, 0, Lanai::PC);
+  return X;
+}
+
+static MCSubtargetInfo *
+createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  return createLanaiMCSubtargetInfoImpl(TT, CPUName, FS);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  if (!T.isOSBinFormatELF())
+    llvm_unreachable("OS not supported");
+
+  return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
+static MCInstPrinter *createLanaiMCInstPrinter(const Triple & /*T*/,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new LanaiInstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+static MCRelocationInfo *createLanaiElfRelocation(const Triple &TheTriple,
+                                                  MCContext &Ctx) {
+  return createMCRelocationInfo(TheTriple, Ctx);
+}
+
+namespace {
+class LanaiMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  explicit LanaiMCInstrAnalysis(const MCInstrInfo *Info)
+      : MCInstrAnalysis(Info) {}
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override {
+    if (Inst.getNumOperands() == 0)
+      return false;
+
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType ==
+        MCOI::OPERAND_PCREL) {
+      int64_t Imm = Inst.getOperand(0).getImm();
+      Target = Addr + Size + Imm;
+      return true;
+    } else {
+      int64_t Imm = Inst.getOperand(0).getImm();
+
+      // Skip case where immediate is 0 as that occurs in file that isn't linked
+      // and the branch target inferred would be wrong.
+      if (Imm == 0)
+        return false;
+
+      Target = Imm;
+      return true;
+    }
+  }
+};
+} // end anonymous namespace
+
+static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) {
+  return new LanaiMCInstrAnalysis(Info);
+}
+
+extern "C" void LLVMInitializeLanaiTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<LanaiMCAsmInfo> X(getTheLanaiTarget());
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(getTheLanaiTarget(),
+                                      createLanaiMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(getTheLanaiTarget(),
+                                    createLanaiMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheLanaiTarget(),
+                                          createLanaiMCSubtargetInfo);
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(getTheLanaiTarget(),
+                                        llvm::createLanaiMCCodeEmitter);
+
+  // Register the ASM Backend
+  TargetRegistry::RegisterMCAsmBackend(getTheLanaiTarget(),
+                                       createLanaiAsmBackend);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(getTheLanaiTarget(),
+                                        createLanaiMCInstPrinter);
+
+  // Register the ELF streamer.
+  TargetRegistry::RegisterELFStreamer(getTheLanaiTarget(), createMCStreamer);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(getTheLanaiTarget(),
+                                           createLanaiElfRelocation);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(getTheLanaiTarget(),
+                                          createLanaiInstrAnalysis);
+}
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
new file mode 100644
index 000000000000..8adaf4cea420
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -0,0 +1,61 @@
+//===-- LanaiMCTargetDesc.h - Lanai Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Lanai specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
+
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCInstrAnalysis;
+class MCObjectWriter;
+class MCRelocationInfo;
+class MCSubtargetInfo;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+
+Target &getTheLanaiTarget();
+
+MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createLanaiAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TheTriple, StringRef CPU,
+                                    const MCTargetOptions &Options);
+
+MCObjectWriter *createLanaiELFObjectWriter(raw_pwrite_stream &OS,
+                                           uint8_t OSABI);
+} // namespace llvm
+
+// Defines symbolic names for Lanai registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "LanaiGenRegisterInfo.inc"
+
+// Defines symbolic names for the Lanai instructions.
+#define GET_INSTRINFO_ENUM
+#include "LanaiGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "LanaiGenSubtargetInfo.inc"
+
+#endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
new file mode 100644
index 000000000000..e377db1d49da
--- /dev/null
+++ b/contrib/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -0,0 +1,25 @@
+//===-- LanaiTargetInfo.cpp - Lanai Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Lanai.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace llvm {
+Target &getTheLanaiTarget() {
+  static Target TheLanaiTarget;
+  return TheLanaiTarget;
+}
+} // namespace llvm
+
+extern "C" void LLVMInitializeLanaiTargetInfo() {
+  RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai");
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 000000000000..be6d1a84a377
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,116 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstPrinter.h"
+#include "MSP430.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "MSP430GenAsmWriter.inc"
+
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, const char *Modifier) {
+  assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '#';
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O,
+                                           const char *Modifier) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+
+  // Print displacement first
+
+  // If the global address expression is a part of displacement field with a
+  // register base, we should not emit any prefix symbol here, e.g.
+  //   mov.w &foo, r1
+  // vs
+  //   mov.w glb(r1), r2
+  // Otherwise (!) msp430-as will silently miscompile the output :(
+  if (!Base.getReg())
+    O << '&';
+
+  if (Disp.isExpr())
+    Disp.getExpr()->print(O, &MAI);
+  else {
+    assert(Disp.isImm() && "Expected immediate in displacement field");
+    O << Disp.getImm();
+  }
+
+  // Print register base field
+  if (Base.getReg())
+    O << '(' << getRegisterName(Base.getReg()) << ')';
+}
+
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned CC = MI->getOperand(OpNo).getImm();
+
+  switch (CC) {
+  default:
+   llvm_unreachable("Unsupported CC code");
+  case MSP430CC::COND_E:
+   O << "eq";
+   break;
+  case MSP430CC::COND_NE:
+   O << "ne";
+   break;
+  case MSP430CC::COND_HS:
+   O << "hs";
+   break;
+  case MSP430CC::COND_LO:
+   O << "lo";
+   break;
+  case MSP430CC::COND_GE:
+   O << "ge";
+   break;
+  case MSP430CC::COND_L:
+   O << 'l';
+   break;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
new file mode 100644
index 000000000000..72afec18becb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -0,0 +1,43 @@
+//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+#define LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MSP430InstPrinter : public MCInstPrinter {
+  public:
+    MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                   const MCSubtargetInfo &STI) override;
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                      const char *Modifier = nullptr);
+    void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+    void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                            const char *Modifier = nullptr);
+    void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
new file mode 100644
index 000000000000..c26b3081dbc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+using namespace llvm;
+
+void MSP430MCAsmInfo::anchor() { }
+
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) {
+  PointerSize = CalleeSaveStackSlotSize = 2;
+
+  CommentString = ";";
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
new file mode 100644
index 000000000000..183dee36a047
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class MSP430MCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit MSP430MCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
new file mode 100644
index 000000000000..8c715500f38b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -0,0 +1,79 @@
+//===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MSP430GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+static MCInstrInfo *createMSP430MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMSP430MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMSP430MCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMSP430MCRegisterInfo(X, MSP430::PC);
+  return X;
+}
+
+static MCSubtargetInfo *
+createMSP430MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createMSP430MCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new MSP430InstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+extern "C" void LLVMInitializeMSP430TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(getTheMSP430Target());
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(getTheMSP430Target(),
+                                      createMSP430MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(getTheMSP430Target(),
+                                    createMSP430MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheMSP430Target(),
+                                          createMSP430MCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(getTheMSP430Target(),
+                                        createMSP430MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
new file mode 100644
index 000000000000..b901c5f09794
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class Target;
+
+Target &getTheMSP430Target();
+
+} // End llvm namespace
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#define GET_INSTRINFO_ENUM
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MSP430GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.h b/contrib/llvm/lib/Target/MSP430/MSP430.h
new file mode 100644
index 000000000000..796f25233123
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,47 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430_H
+#define LLVM_LIB_TARGET_MSP430_MSP430_H
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace MSP430CC {
+  // MSP430 specific condition code.
+  enum CondCodes {
+    COND_E  = 0,  // aka COND_Z
+    COND_NE = 1,  // aka COND_NZ
+    COND_HS = 2,  // aka COND_C
+    COND_LO = 3,  // aka COND_NC
+    COND_GE = 4,
+    COND_L  = 5,
+
+    COND_INVALID = -1
+  };
+}
+
+namespace llvm {
+  class MSP430TargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+                                    CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createMSP430BranchSelectionPass();
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.td b/contrib/llvm/lib/Target/MSP430/MSP430.td
new file mode 100644
index 000000000000..dfea669f3ba1
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,60 @@
+//===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+                    "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+def MSP430InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MSP430 : Target {
+  let InstructionSet = MSP430InstrInfo;
+}
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
new file mode 100644
index 000000000000..abf062fe86ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -0,0 +1,159 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MCInstLower.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class MSP430AsmPrinter : public AsmPrinter {
+  public:
+    MSP430AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)) {}
+
+    StringRef getPassName() const override { return "MSP430 Assembly Printer"; }
+
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      raw_ostream &O, const char* Modifier = nullptr);
+    void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                            raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI,
+                               unsigned OpNo, unsigned AsmVariant,
+                               const char *ExtraCode, raw_ostream &O) override;
+    void EmitInstruction(const MachineInstr *MI) override;
+  };
+} // end of anonymous namespace
+
+
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                    raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Not implemented yet!");
+  case MachineOperand::MO_Register:
+    O << MSP430InstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << '#';
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    uint64_t Offset = MO.getOffset();
+
+    // If the global address expression is a part of displacement field with a
+    // register base, we should not emit any prefix symbol here, e.g.
+    //   mov.w &foo, r1
+    // vs
+    //   mov.w glb(r1), r2
+    // Otherwise (!) msp430-as will silently miscompile the output :(
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << (isMemOp ? '&' : '#');
+    if (Offset)
+      O << '(' << Offset << '+';
+
+    getSymbol(MO.getGlobal())->print(O, MAI);
+
+    if (Offset)
+      O << ')';
+
+    return;
+  }
+  }
+}
+
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                                          raw_ostream &O) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+  const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+  // Print displacement first
+
+  // Imm here is in fact global address - print extra modifier.
+  if (Disp.isImm() && !Base.getReg())
+    O << '&';
+  printOperand(MI, OpNum+1, O, "nohash");
+
+  // Print register base field
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum, O);
+    O << ')';
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo, unsigned AsmVariant,
+                                             const char *ExtraCode,
+                                             raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printSrcMemOperand(MI, OpNo, O);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MSP430MCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+  RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
new file mode 100644
index 000000000000..5fd6b6305f68
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -0,0 +1,257 @@
+//===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 10 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-branch-select"
+
+static cl::opt<bool>
+    BranchSelectEnabled("msp430-branch-select", cl::Hidden, cl::init(true),
+                        cl::desc("Expand out of range branches"));
+
+STATISTIC(NumSplit, "Number of machine basic blocks split");
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+class MSP430BSel : public MachineFunctionPass {
+
+  typedef SmallVector<int, 16> OffsetVector;
+
+  MachineFunction *MF;
+  const MSP430InstrInfo *TII;
+
+  unsigned measureFunction(OffsetVector &BlockOffsets,
+                           MachineBasicBlock *FromBB = nullptr);
+  bool expandBranches(OffsetVector &BlockOffsets);
+
+public:
+  static char ID;
+  MSP430BSel() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override { return "MSP430 Branch Selector"; }
+};
+char MSP430BSel::ID = 0;
+}
+
+static bool isInRage(int DistanceInBytes) {
+  // According to CC430 Family User's Guide, Section 4.5.1.3, branch
+  // instructions have the signed 10-bit word offset field, so first we need to
+  // convert the distance from bytes to words, then check if it fits in 10-bit
+  // signed integer.
+  const int WordSize = 2;
+
+  assert((DistanceInBytes % WordSize == 0) &&
+         "Branch offset should be word aligned!");
+
+  int Words = DistanceInBytes / WordSize;
+  return isInt<10>(Words);
+}
+
+/// Measure each basic block, fill the BlockOffsets, and return the size of
+/// the function, starting with BB
+unsigned MSP430BSel::measureFunction(OffsetVector &BlockOffsets,
+                                     MachineBasicBlock *FromBB) {
+  // Give the blocks of the function a dense, in-order, numbering.
+  MF->RenumberBlocks(FromBB);
+
+  MachineFunction::iterator Begin;
+  if (FromBB == nullptr) {
+    Begin = MF->begin();
+  } else {
+    Begin = FromBB->getIterator();
+  }
+
+  BlockOffsets.resize(MF->getNumBlockIDs());
+
+  unsigned TotalSize = BlockOffsets[Begin->getNumber()];
+  for (auto &MBB : make_range(Begin, MF->end())) {
+    BlockOffsets[MBB.getNumber()] = TotalSize;
+    for (MachineInstr &MI : MBB) {
+      TotalSize += TII->getInstSizeInBytes(MI);
+    }
+  }
+  return TotalSize;
+}
+
+/// Do expand branches and split the basic blocks if necessary.
+/// Returns true if made any change.
+bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+6
+  //     b MBB
+  //
+  bool MadeChange = false;
+  for (auto MBB = MF->begin(), E = MF->end(); MBB != E; ++MBB) {
+    unsigned MBBStartOffset = 0;
+    for (auto MI = MBB->begin(), EE = MBB->end(); MI != EE; ++MI) {
+      MBBStartOffset += TII->getInstSizeInBytes(*MI);
+
+      // If this instruction is not a short branch then skip it.
+      if (MI->getOpcode() != MSP430::JCC && MI->getOpcode() != MSP430::JMP) {
+        continue;
+      }
+
+      MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+      // Determine the distance from the current branch to the destination
+      // block. MBBStartOffset already includes the size of the current branch
+      // instruction.
+      int BlockDistance =
+          BlockOffsets[DestBB->getNumber()] - BlockOffsets[MBB->getNumber()];
+      int BranchDistance = BlockDistance - MBBStartOffset;
+
+      // If this branch is in range, ignore it.
+      if (isInRage(BranchDistance)) {
+        continue;
+      }
+
+      DEBUG(dbgs() << "  Found a branch that needs expanding, BB#"
+                   << DestBB->getNumber() << ", Distance " << BranchDistance
+                   << "\n");
+
+      // If JCC is not the last instruction we need to split the MBB.
+      if (MI->getOpcode() == MSP430::JCC && std::next(MI) != EE) {
+
+        DEBUG(dbgs() << "  Found a basic block that needs to be split, BB#"
+                     << MBB->getNumber() << "\n");
+
+        // Create a new basic block.
+        MachineBasicBlock *NewBB =
+            MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+        MF->insert(std::next(MBB), NewBB);
+
+        // Splice the instructions following MI over to the NewBB.
+        NewBB->splice(NewBB->end(), &*MBB, std::next(MI), MBB->end());
+
+        // Update the successor lists.
+        for (MachineBasicBlock *Succ : MBB->successors()) {
+          if (Succ == DestBB) {
+            continue;
+          }
+          MBB->replaceSuccessor(Succ, NewBB);
+          NewBB->addSuccessor(Succ);
+        }
+
+        // We introduced a new MBB so all following blocks should be numbered
+        // and measured again.
+        measureFunction(BlockOffsets, &*MBB);
+
+        ++NumSplit;
+
+        // It may be not necessary to start all over at this point, but it's
+        // safer do this anyway.
+        return true;
+      }
+
+      MachineInstr &OldBranch = *MI;
+      DebugLoc dl = OldBranch.getDebugLoc();
+      int InstrSizeDiff = -TII->getInstSizeInBytes(OldBranch);
+
+      if (MI->getOpcode() == MSP430::JCC) {
+        MachineBasicBlock *NextMBB = &*std::next(MBB);
+        assert(MBB->isSuccessor(NextMBB) &&
+               "This block must have a layout successor!");
+
+        // The BCC operands are:
+        // 0. Target MBB
+        // 1. MSP430 branch predicate
+        SmallVector<MachineOperand, 1> Cond;
+        Cond.push_back(MI->getOperand(1));
+
+        // Jump over the long branch on the opposite condition
+        TII->reverseBranchCondition(Cond);
+        MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::JCC))
+                             .addMBB(NextMBB)
+                             .addOperand(Cond[0]);
+        InstrSizeDiff += TII->getInstSizeInBytes(*MI);
+        ++MI;
+      }
+
+      // Unconditional branch to the real destination.
+      MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::Bi)).addMBB(DestBB);
+      InstrSizeDiff += TII->getInstSizeInBytes(*MI);
+
+      // Remove the old branch from the function.
+      OldBranch.eraseFromParent();
+
+      // The size of a new instruction is different from the old one, so we need
+      // to correct all block offsets.
+      for (int i = MBB->getNumber() + 1, e = BlockOffsets.size(); i < e; ++i) {
+        BlockOffsets[i] += InstrSizeDiff;
+      }
+      MBBStartOffset += InstrSizeDiff;
+
+      ++NumExpanded;
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool MSP430BSel::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  TII = static_cast<const MSP430InstrInfo *>(MF->getSubtarget().getInstrInfo());
+
+  // If the pass is disabled, just bail early.
+  if (!BranchSelectEnabled)
+    return false;
+
+  DEBUG(dbgs() << "\n********** " << getPassName() << " **********\n");
+
+  // BlockOffsets - Contains the distance from the beginning of the function to
+  // the beginning of each basic block.
+  OffsetVector BlockOffsets;
+
+  unsigned FunctionSize = measureFunction(BlockOffsets);
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to expand any branches in this
+  // function. This is a common case.
+  if (isInRage(FunctionSize)) {
+    return false;
+  }
+
+  // Iteratively expand branches until we reach a fixed point.
+  bool MadeChange = false;
+  while (expandBranches(BlockOffsets))
+    MadeChange = true;
+
+  return MadeChange;
+}
+
+/// Returns an instance of the Branch Selection Pass
+FunctionPass *llvm::createMSP430BranchSelectionPass() {
+  return new MSP430BSel();
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 000000000000..b38f5781c84a
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,37 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_MSP430 : CallingConv<[
+  // i8 are returned in registers R15B, R14B, R13B, R12B
+  CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+  // i16 are returned in registers R15, R14, R13, R12
+  CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_MSP430_AssignStack : CallingConv<[
+  // Pass by value if the byval attribute is given
+  CCIfByVal<CCPassByVal<2, 2>>,
+
+  // Promote i8 arguments to i16.
+  CCIfType<[i8], CCPromoteToType<i16>>,
+
+  // Integer values get stored in stack slots that are 2 bytes in
+  // size and 2-byte aligned.
+  CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
new file mode 100644
index 000000000000..f1cb0b6c031b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -0,0 +1,301 @@
+//===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          MF.getFrameInfo().hasVarSizedObjects() ||
+          MFI.isFrameAddressTaken());
+}
+
+bool MSP430FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+void MSP430FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI.getStackSize();
+
+  uint64_t NumBytes = 0;
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - MSP430FI->getCalleeSavedFrameSize();
+
+    // Get the offset of the stack slot for the EBP register... which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI.setOffsetAdjustment(-NumBytes);
+
+    // Save FP into the appropriate stack slot...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(MSP430::FP, RegState::Kill);
+
+    // Update FP with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FP)
+      .addReg(MSP430::SP);
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(MSP430::FP);
+
+  } else
+    NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() && (MBBI->getOpcode() == MSP430::PUSH16r))
+    ++MBBI;
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  if (NumBytes) { // adjust stack pointer: SP -= numbytes
+    // If there is an SUB16ri of SP immediately before this instruction, merge
+    // the two.
+    //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+    // If there is an ADD16ri or SUB16ri of SP immediately after this
+    // instruction, merge the two instructions.
+    // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
+
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  switch (RetOpcode) {
+  case MSP430::RET:
+  case MSP430::RETI: break;  // These are ok
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI.getStackSize();
+  unsigned CSSize = MSP430FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - CSSize;
+
+    // pop FP.
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FP);
+  } else
+    NumBytes = StackSize - CSSize;
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = std::prev(MBBI);
+    unsigned Opc = PI->getOpcode();
+    if (Opc != MSP430::POP16r && !PI->isTerminator())
+      break;
+    --MBBI;
+  }
+
+  DL = MBBI->getDebugLoc();
+
+  // If there is an ADD16ri or SUB16ri of SP immediately before this
+  // instruction, merge the two instructions.
+  //if (NumBytes || MFI.hasVarSizedObjects())
+  //  mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  if (MFI.hasVarSizedObjects()) {
+    BuildMI(MBB, MBBI, DL,
+            TII.get(MSP430::MOV16rr), MSP430::SP).addReg(MSP430::FP);
+    if (CSSize) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL,
+                TII.get(MSP430::SUB16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(CSSize);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  } else {
+    // adjust stack pointer back: SP += numbytes
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+// FIXME: Can we eleminate these in favour of generic code?
+bool
+MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
+  MFI->setCalleeSavedFrameSize(CSI.size() * 2);
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(Reg, RegState::Kill);
+  }
+  return true;
+}
+
+bool
+MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg());
+
+  return true;
+}
+
+MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const MSP430InstrInfo &TII =
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  unsigned StackAlign = getStackAlignment();
+
+  if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub SP, <amt>' and the
+    // adjcallstackdown instruction into 'add SP, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    MachineInstr &Old = *I;
+    uint64_t Amount = Old.getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+      MachineInstr *New = nullptr;
+      if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) {
+        New =
+            BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+                .addReg(MSP430::SP)
+                .addImm(Amount);
+      } else {
+        assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old.getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount)
+          New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
+                        MSP430::SP)
+                    .addReg(MSP430::SP)
+                    .addImm(Amount);
+      }
+
+      if (New) {
+        // The SRW implicit def is dead.
+        New->getOperand(3).setIsDead();
+
+        // Replace the pseudo instruction with a new instruction...
+        MBB.insert(I, New);
+      }
+    }
+  } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      MachineInstr &Old = *I;
+      MachineInstr *New =
+          BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
+              .addReg(MSP430::SP)
+              .addImm(CalleeAmt);
+      // The SRW implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      MBB.insert(I, New);
+    }
+  }
+
+  return MBB.erase(I);
+}
+
+void
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                         RegScavenger *) const {
+  // Create a frame entry for the FP register that must be saved.
+  if (hasFP(MF)) {
+    int FrameIdx = MF.getFrameInfo().CreateFixedObject(2, -4, true);
+    (void)FrameIdx;
+    assert(FrameIdx == MF.getFrameInfo().getObjectIndexBegin() &&
+           "Slot for FP register must be last in order to be found!");
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
new file mode 100644
index 000000000000..f77de18b4d16
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
@@ -0,0 +1,54 @@
+//==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
+
+#include "MSP430.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class MSP430FrameLowering : public TargetFrameLowering {
+protected:
+
+public:
+  explicit MSP430FrameLowering()
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2) {}
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..6e481b68e038
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,469 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-isel"
+
+namespace {
+  struct MSP430ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    int16_t Disp;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    MSP430ISelAddressMode()
+      : BaseType(RegBase), Disp(0), GV(nullptr), CP(nullptr),
+        BlockAddr(nullptr), ES(nullptr), JT(-1), Align(0) {
+    }
+
+    bool hasSymbolicDisplacement() const {
+      return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1;
+    }
+
+    void dump() {
+      errs() << "MSP430ISelAddressMode " << this << '\n';
+      if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
+        errs() << "Base.Reg ";
+        Base.Reg.getNode()->dump();
+      } else if (BaseType == FrameIndexBase) {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      errs() << " Disp " << Disp << '\n';
+      if (GV) {
+        errs() << "GV ";
+        GV->dump();
+      } else if (CP) {
+        errs() << " CP ";
+        CP->dump();
+        errs() << " Align" << Align << '\n';
+      } else if (ES) {
+        errs() << "ES ";
+        errs() << ES << '\n';
+      } else if (JT != -1)
+        errs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class MSP430DAGToDAGISel : public SelectionDAGISel {
+  public:
+    MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+        : SelectionDAGISel(TM, OptLevel) {}
+
+    StringRef getPassName() const override {
+      return "MSP430 DAG->DAG Pattern Instruction Selection";
+    }
+
+    bool MatchAddress(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
+
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                      std::vector<SDValue> &OutOps) override;
+
+    // Include the pieces autogenerated from the target description.
+  #include "MSP430GenDAGISel.inc"
+
+  private:
+    void Select(SDNode *N) override;
+    bool tryIndexedLoad(SDNode *Op);
+    bool tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8,
+                         unsigned Opc16);
+
+    bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Disp);
+  };
+}  // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+
+/// MatchWrapper - Try to match MSP430ISD::Wrapper node into an addressing mode.
+/// These wrap things that will resolve down into a symbol reference.  If no
+/// match is possible, this returns true, otherwise it returns false.
+bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+    AM.GV = G->getGlobal();
+    AM.Disp += G->getOffset();
+    //AM.SymbolFlags = G->getTargetFlags();
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+    AM.CP = CP->getConstVal();
+    AM.Align = CP->getAlignment();
+    AM.Disp += CP->getOffset();
+    //AM.SymbolFlags = CP->getTargetFlags();
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+    AM.ES = S->getSymbol();
+    //AM.SymbolFlags = S->getTargetFlags();
+  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+    AM.JT = J->getIndex();
+    //AM.SymbolFlags = J->getTargetFlags();
+  } else {
+    AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+    //AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+  }
+  return false;
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != MSP430ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+    // If so, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = MSP430ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
+  DEBUG(errs() << "MatchAddress: "; AM.dump());
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    AM.Disp += Val;
+    return false;
+  }
+
+  case MSP430ISD::Wrapper:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == MSP430ISelAddressMode::RegBase
+        && AM.Base.Reg.getNode() == nullptr) {
+      AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::ADD: {
+    MSP430ISelAddressMode Backup = AM;
+    if (!MatchAddress(N.getNode()->getOperand(0), AM) &&
+        !MatchAddress(N.getNode()->getOperand(1), AM))
+      return false;
+    AM = Backup;
+    if (!MatchAddress(N.getNode()->getOperand(1), AM) &&
+        !MatchAddress(N.getNode()->getOperand(0), AM))
+      return false;
+    AM = Backup;
+
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      MSP430ISelAddressMode Backup = AM;
+      uint64_t Offset = CN->getSExtValue();
+      // Start with the LHS as an addr mode.
+      if (!MatchAddress(N.getOperand(0), AM) &&
+          // Address could not have picked a GV address for the displacement.
+          AM.GV == nullptr &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp += Offset;
+        return false;
+      }
+      AM = Backup;
+    }
+    break;
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
+                                    SDValue &Base, SDValue &Disp) {
+  MSP430ISelAddressMode AM;
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase)
+             ? CurDAG->getTargetFrameIndex(
+                   AM.Base.FrameIndex,
+                   getTargetLowering()->getPointerTy(CurDAG->getDataLayout()))
+             : AM.Base.Reg;
+
+  if (AM.GV)
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
+                                          MVT::i16, AM.Disp,
+                                          0/*AM.SymbolFlags*/);
+  else if (AM.CP)
+    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
+                                         AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+  else if (AM.ES)
+    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.JT != -1)
+    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.BlockAddr)
+    Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, 0,
+                                         0/*AM.SymbolFlags*/);
+  else
+    Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(N), MVT::i16);
+
+  return true;
+}
+
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintID) {
+  default: return true;
+  case InlineAsm::Constraint_m: // memory
+    if (!SelectAddr(Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+static bool isValidIndexedLoad(const LoadSDNode *LD) {
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1)
+      return false;
+
+    break;
+  case MVT::i16:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2)
+      return false;
+
+    break;
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool MSP430DAGToDAGISel::tryIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (!isValidIndexedLoad(LD))
+    return false;
+
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+
+  unsigned Opcode = 0;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opcode = MSP430::MOV8rm_POST;
+    break;
+  case MVT::i16:
+    Opcode = MSP430::MOV16rm_POST;
+    break;
+  default:
+    return false;
+  }
+
+  ReplaceNode(N,
+              CurDAG->getMachineNode(Opcode, SDLoc(N), VT, MVT::i16, MVT::Other,
+                                     LD->getBasePtr(), LD->getChain()));
+  return true;
+}
+
+bool MSP430DAGToDAGISel::tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+                                         unsigned Opc8, unsigned Opc16) {
+  if (N1.getOpcode() == ISD::LOAD &&
+      N1.hasOneUse() &&
+      IsLegalToFold(N1, Op, Op, OptLevel)) {
+    LoadSDNode *LD = cast<LoadSDNode>(N1);
+    if (!isValidIndexedLoad(LD))
+      return false;
+
+    MVT VT = LD->getMemoryVT().getSimpleVT();
+    unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
+    SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
+    SDNode *ResNode =
+      CurDAG->SelectNodeTo(Op, Opc, VT, MVT::i16, MVT::Other, Ops0);
+    cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
+    // Transfer chain.
+    ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
+    // Transfer writeback.
+    ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
+    return true;
+  }
+
+  return false;
+}
+
+
+void MSP430DAGToDAGISel::Select(SDNode *Node) {
+  SDLoc dl(Node);
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== ";
+          Node->dump(CurDAG);
+          errs() << "\n");
+    Node->setNodeId(-1);
+    return;
+  }
+
+  // Few custom selection stuff.
+  switch (Node->getOpcode()) {
+  default: break;
+  case ISD::FrameIndex: {
+    assert(Node->getValueType(0) == MVT::i16);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    if (Node->hasOneUse()) {
+      CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
+                           CurDAG->getTargetConstant(0, dl, MVT::i16));
+      return;
+    }
+    ReplaceNode(Node, CurDAG->getMachineNode(
+                          MSP430::ADD16ri, dl, MVT::i16, TFI,
+                          CurDAG->getTargetConstant(0, dl, MVT::i16)));
+    return;
+  }
+  case ISD::LOAD:
+    if (tryIndexedLoad(Node))
+      return;
+    // Other cases are autogenerated.
+    break;
+  case ISD::ADD:
+    if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+                        MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return;
+    else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                             MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SUB:
+    if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+                        MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+      return;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::AND:
+    if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+                        MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return;
+    else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                             MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::OR:
+    if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+                        MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return;
+    else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                             MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::XOR:
+    if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
+                        MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return;
+    else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                             MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return;
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  // Select the default instruction
+  SelectCode(Node);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 000000000000..73346b9ce41d
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,1342 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation  ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-lower"
+
+typedef enum {
+  NoHWMult,
+  HWMultIntr,
+  HWMultNoIntr
+} HWMultUseMode;
+
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
+           cl::desc("Hardware multiplier use mode"),
+           cl::init(HWMultNoIntr),
+           cl::values(
+             clEnumValN(NoHWMult, "no",
+                "Do not use hardware multiplier"),
+             clEnumValN(HWMultIntr, "interrupts",
+                "Assume hardware multiplier can be used inside interrupts"),
+             clEnumValN(HWMultNoIntr, "use",
+                "Assume hardware multiplier cannot be used inside interrupts")));
+
+MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
+                                           const MSP430Subtarget &STI)
+    : TargetLowering(TM) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8,  &MSP430::GR8RegClass);
+  addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  // Provide all sorts of operation actions
+  setStackPointerRegisterToSaveRestore(MSP430::SP);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // We have post-incremented loads / stores.
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8,  Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
+  }
+
+  // We don't have any truncstores
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  setOperationAction(ISD::SRA,              MVT::i8,    Custom);
+  setOperationAction(ISD::SHL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRA,              MVT::i16,   Custom);
+  setOperationAction(ISD::SHL,              MVT::i16,   Custom);
+  setOperationAction(ISD::SRL,              MVT::i16,   Custom);
+  setOperationAction(ISD::ROTL,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTR,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTL,             MVT::i16,   Expand);
+  setOperationAction(ISD::ROTR,             MVT::i16,   Expand);
+  setOperationAction(ISD::GlobalAddress,    MVT::i16,   Custom);
+  setOperationAction(ISD::ExternalSymbol,   MVT::i16,   Custom);
+  setOperationAction(ISD::BlockAddress,     MVT::i16,   Custom);
+  setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,            MVT::i8,    Custom);
+  setOperationAction(ISD::BR_CC,            MVT::i16,   Custom);
+  setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
+  setOperationAction(ISD::SETCC,            MVT::i8,    Custom);
+  setOperationAction(ISD::SETCC,            MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT,           MVT::i8,    Expand);
+  setOperationAction(ISD::SELECT,           MVT::i16,   Expand);
+  setOperationAction(ISD::SELECT_CC,        MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::i16,   Custom);
+  setOperationAction(ISD::SIGN_EXTEND,      MVT::i16,   Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+  setOperationAction(ISD::CTTZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i8,    Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i16,   Expand);
+
+  setOperationAction(ISD::SHL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SHL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,   Expand);
+
+  // FIXME: Implement efficiently multiplication by a constant
+  setOperationAction(ISD::MUL,              MVT::i8,    Expand);
+  setOperationAction(ISD::MULHS,            MVT::i8,    Expand);
+  setOperationAction(ISD::MULHU,            MVT::i8,    Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::MUL,              MVT::i16,   Expand);
+  setOperationAction(ISD::MULHS,            MVT::i16,   Expand);
+  setOperationAction(ISD::MULHU,            MVT::i16,   Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i16,   Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::UDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::UREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::SREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::UREM,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::SREM,             MVT::i16,   Expand);
+
+  // varargs support
+  setOperationAction(ISD::VASTART,          MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,            MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,           MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable,        MVT::i16,   Custom);
+
+  // Libcalls names.
+  if (HWMultMode == HWMultIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+  } else if (HWMultMode == HWMultNoIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+  }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::SHL: // FALLTHROUGH
+  case ISD::SRL:
+  case ISD::SRA:              return LowerShifts(Op, DAG);
+  case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:     return LowerBlockAddress(Op, DAG);
+  case ISD::ExternalSymbol:   return LowerExternalSymbol(Op, DAG);
+  case ISD::SETCC:            return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:            return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:      return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
+  case ISD::VASTART:          return LowerVASTART(Op, DAG);
+  case ISD::JumpTable:        return LowerJumpTable(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                       MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+MSP430TargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+      if (VT == MVT::i8)
+        return std::make_pair(0U, &MSP430::GR8RegClass);
+
+      return std::make_pair(0U, &MSP430::GR16RegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+                              SmallVectorImpl<unsigned> &Out) {
+  unsigned CurrentArgIndex = ~0U;
+  for (unsigned i = 0, e = Args.size(); i != e; i++) {
+    if (CurrentArgIndex == Args[i].OrigArgIndex) {
+      Out.back()++;
+    } else {
+      Out.push_back(1);
+      CurrentArgIndex++;
+    }
+  }
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of splitted arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+                             SmallVectorImpl<CCValAssign> &ArgLocs,
+                             const SmallVectorImpl<ArgT> &Args) {
+  static const MCPhysReg RegList[] = {
+    MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
+  };
+  static const unsigned NbRegs = array_lengthof(RegList);
+
+  if (State.isVarArg()) {
+    AnalyzeVarArgs(State, Args);
+    return;
+  }
+
+  SmallVector<unsigned, 4> ArgsParts;
+  ParseFunctionArgs(Args, ArgsParts);
+
+  unsigned RegsLeft = NbRegs;
+  bool UseStack = false;
+  unsigned ValNo = 0;
+
+  for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+    MVT ArgVT = Args[ValNo].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+    MVT LocVT = ArgVT;
+    CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+    // Promote i8 to i16
+    if (LocVT == MVT::i8) {
+      LocVT = MVT::i16;
+      if (ArgFlags.isSExt())
+          LocInfo = CCValAssign::SExt;
+      else if (ArgFlags.isZExt())
+          LocInfo = CCValAssign::ZExt;
+      else
+          LocInfo = CCValAssign::AExt;
+    }
+
+    // Handle byval arguments
+    if (ArgFlags.isByVal()) {
+      State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+      continue;
+    }
+
+    unsigned Parts = ArgsParts[i];
+
+    if (!UseStack && Parts <= RegsLeft) {
+      unsigned FirstVal = ValNo;
+      for (unsigned j = 0; j < Parts; j++) {
+        unsigned Reg = State.AllocateReg(RegList);
+        State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+        RegsLeft--;
+      }
+
+      // Reverse the order of the pieces to agree with the "big endian" format
+      // required in the calling convention ABI.
+      SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+      std::reverse(B, B + Parts);
+    } else {
+      UseStack = true;
+      for (unsigned j = 0; j < Parts; j++)
+        CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+    }
+  }
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+template<typename ArgT>
+static void AnalyzeReturnValues(CCState &State,
+                                SmallVectorImpl<CCValAssign> &RVLocs,
+                                const SmallVectorImpl<ArgT> &Args) {
+  AnalyzeRetResult(State, Args);
+
+  // Reverse splitted return values to get the "big endian" format required
+  // to agree with the calling convention ABI.
+  std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
+SDValue MSP430TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    if (Ins.empty())
+      return Chain;
+    report_fatal_error("ISRs cannot have arguments");
+  }
+}
+
+SDValue
+MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // MSP430 target does not yet support tail call optimization.
+  isTailCall = false;
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                          Outs, OutVals, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    report_fatal_error("ISRs cannot be called directly");
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments places on the stack.
+// FIXME: struct return stuff
+SDValue MSP430TargetLowering::LowerCCCArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  AnalyzeArguments(CCInfo, ArgLocs, Ins);
+
+  // Create frame index for the start of the first vararg value
+  if (isVarArg) {
+    unsigned Offset = CCInfo.getNextStackOffset();
+    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, Offset, true));
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getEVTString() << "\n";
+#endif
+          llvm_unreachable(nullptr);
+        }
+      case MVT::i16:
+        unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+        // If this is an 8-bit value, it is really passed promoted to 16
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+
+      SDValue InVal;
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+      if (Flags.isByVal()) {
+        int FI = MFI.CreateFixedObject(Flags.getByValSize(),
+                                       VA.getLocMemOffset(), true);
+        InVal = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      } else {
+        // Load the argument to a virtual register
+        unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+        if (ObjSize > 2) {
+            errs() << "LowerFormalArguments Unhandled argument type: "
+                << EVT(VA.getLocVT()).getEVTString()
+                << "\n";
+        }
+        // Create the frame index object for this incoming parameter...
+        int FI = MFI.CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+        // Create the SelectionDAG nodes corresponding to a load
+        //from this parameter
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+        InVal = DAG.getLoad(
+            VA.getLocVT(), dl, Chain, FIN,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      }
+
+      InVals.push_back(InVal);
+    }
+  }
+
+  return Chain;
+}
+
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                  bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  const SDLoc &dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // ISRs cannot return any value.
+  if (CallConv == CallingConv::MSP430_INTR && !Outs.empty())
+    report_fatal_error("ISRs cannot return any value");
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analize return values.
+  AnalyzeReturnValues(CCInfo, RVLocs, Outs);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
+                  MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Opc, dl, MVT::Other, RetOps);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+// TODO: sret.
+SDValue MSP430TargetLowering::LowerCCCCallTo(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+    bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  AnalyzeArguments(CCInfo, ArgLocs, Outs);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, PtrVT);
+
+      SDValue PtrOff =
+          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                      DAG.getIntPtrConstant(VA.getLocMemOffset(), dl));
+
+      SDValue MemOp;
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+      if (Flags.isByVal()) {
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i16);
+        MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode,
+                              Flags.getByValAlign(),
+                              /*isVolatile*/false,
+                              /*AlwaysInline=*/true,
+                              /*isTailCall=*/false,
+                              MachinePointerInfo(),
+                              MachinePointerInfo());
+      } else {
+        MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+      }
+
+      MemOpChains.push_back(MemOp);
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i16);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+                             DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+                         DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue MSP430TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  AnalyzeReturnValues(CCInfo, RVLocs, Ins);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  SDNode* N = Op.getNode();
+  EVT VT = Op.getValueType();
+  SDLoc dl(N);
+
+  // Expand non-constant shifts to loops:
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    switch (Opc) {
+    default: llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(MSP430ISD::SHL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(MSP430ISD::SRA, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(MSP430ISD::SRL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    }
+
+  uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  // Expand the stuff into sequence of shifts.
+  // FIXME: for some shift amounts this might be done better!
+  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+  SDValue Victim = N->getOperand(0);
+
+  if (Opc == ISD::SRL && ShiftAmount) {
+    // Emit a special goodness here:
+    // srl A, 1 => clrc; rrc A
+    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    ShiftAmount -= 1;
+  }
+
+  while (ShiftAmount--)
+    Victim = DAG.getNode((Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA),
+                         dl, VT, Victim);
+
+  return Victim;
+}
+
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Create the TargetGlobalAddress node, folding in the constant offset.
+  SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), PtrVT, Offset);
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op), PtrVT, Result);
+}
+
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT);
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result);
+}
+
+SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, PtrVT, Result);
+}
+
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
+                       ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) {
+  // FIXME: Handle bittests someday
+  assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+  // FIXME: Handle jump negative someday
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
+  switch (CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+    TCC = MSP430CC::COND_E;     // aka COND_Z
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETNE:
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETULE:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETUGE:
+    // Turn lhs u>= rhs with lhs constant into rhs u< lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, dl, C->getValueType(0));
+      TCC = MSP430CC::COND_LO;
+      break;
+    }
+    TCC = MSP430CC::COND_HS;    // aka COND_C
+    break;
+  case ISD::SETUGT:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETULT:
+    // Turn lhs u< rhs with lhs constant into rhs u>= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, dl, C->getValueType(0));
+      TCC = MSP430CC::COND_HS;
+      break;
+    }
+    TCC = MSP430CC::COND_LO;    // aka COND_NC
+    break;
+  case ISD::SETLE:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETGE:
+    // Turn lhs >= rhs with lhs constant into rhs < lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, dl, C->getValueType(0));
+      TCC = MSP430CC::COND_L;
+      break;
+    }
+    TCC = MSP430CC::COND_GE;
+    break;
+  case ISD::SETGT:
+    std::swap(LHS, RHS);
+    LLVM_FALLTHROUGH;
+  case ISD::SETLT:
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, dl, C->getValueType(0));
+      TCC = MSP430CC::COND_GE;
+      break;
+    }
+    TCC = MSP430CC::COND_L;
+    break;
+  }
+
+  TargetCC = DAG.getConstant(TCC, dl, MVT::i8);
+  return DAG.getNode(MSP430ISD::CMP, dl, MVT::Glue, LHS, RHS);
+}
+
+
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS   = Op.getOperand(2);
+  SDValue RHS   = Op.getOperand(3);
+  SDValue Dest  = Op.getOperand(4);
+  SDLoc dl  (Op);
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+                     Chain, Dest, TargetCC, Flag);
+}
+
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS   = Op.getOperand(0);
+  SDValue RHS   = Op.getOperand(1);
+  SDLoc dl  (Op);
+
+  // If we are doing an AND and testing against zero, then the CMP
+  // will not be generated.  The AND (or BIT) will generate the condition codes,
+  // but they are different from CMP.
+  // FIXME: since we're doing a post-processing, use a pseudoinstr here, so
+  // lowering & isel wouldn't diverge.
+  bool andCC = false;
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    if (RHSC->isNullValue() && LHS.hasOneUse() &&
+        (LHS.getOpcode() == ISD::AND ||
+         (LHS.getOpcode() == ISD::TRUNCATE &&
+          LHS.getOperand(0).getOpcode() == ISD::AND))) {
+      andCC = true;
+    }
+  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  // Get the condition codes directly from the status register, if its easy.
+  // Otherwise a branch will be generated.  Note that the AND and BIT
+  // instructions generate different flags than CMP, the carry bit can be used
+  // for NE/EQ.
+  bool Invert = false;
+  bool Shift = false;
+  bool Convert = true;
+  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+   default:
+    Convert = false;
+    break;
+   case MSP430CC::COND_HS:
+     // Res = SR & 1, no processing is required
+     break;
+   case MSP430CC::COND_LO:
+     // Res = ~(SR & 1)
+     Invert = true;
+     break;
+   case MSP430CC::COND_NE:
+     if (andCC) {
+       // C = ~Z, thus Res = SR & 1, no processing is required
+     } else {
+       // Res = ~((SR >> 1) & 1)
+       Shift = true;
+       Invert = true;
+     }
+     break;
+   case MSP430CC::COND_E:
+     Shift = true;
+     // C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however,
+     // Res = (SR >> 1) & 1 is 1 word shorter.
+     break;
+  }
+  EVT VT = Op.getValueType();
+  SDValue One  = DAG.getConstant(1, dl, VT);
+  if (Convert) {
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SR,
+                                    MVT::i16, Flag);
+    if (Shift)
+      // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+      SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+    SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+    if (Invert)
+      SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+    return SR;
+  } else {
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+    SDValue Ops[] = {One, Zero, TargetCC, Flag};
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+  }
+}
+
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS    = Op.getOperand(0);
+  SDValue RHS    = Op.getOperand(1);
+  SDValue TrueV  = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc dl   (Op);
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {TrueV, FalseV, TargetCC, Flag};
+
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
+}
+
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Val = Op.getOperand(0);
+  EVT VT      = Op.getValueType();
+  SDLoc dl(Op);
+
+  assert(VT == MVT::i16 && "Only support i16 for now!");
+
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT,
+                     DAG.getNode(ISD::ANY_EXTEND, dl, VT, Val),
+                     DAG.getValueType(Val.getValueType()));
+}
+
+SDValue
+MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    uint64_t SlotSize = MF.getDataLayout().getPointerSize();
+    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -SlotSize,
+                                                           true);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, PtrVT);
+}
+
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+        DAG.getConstant(DAG.getDataLayout().getPointerSize(), dl, MVT::i16);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+                     MachinePointerInfo());
+}
+
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         MSP430::FP, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Frame index of first vararg argument
+  SDValue FrameIndex =
+      DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Create a store of the frame index to the location operand
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
+                                             SelectionDAG &DAG) const {
+    JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+    auto PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT), PtrVT, Result);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                      SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+  if (VT != MVT::i8 && VT != MVT::i16)
+    return false;
+
+  if (Op->getOpcode() != ISD::ADD)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    uint64_t RHSC = RHS->getZExtValue();
+    if ((VT == MVT::i16 && RHSC != 2) ||
+        (VT == MVT::i8 && RHSC != 1))
+      return false;
+
+    Base = Op->getOperand(0);
+    Offset = DAG.getConstant(RHSC, SDLoc(N), VT);
+    AM = ISD::POST_INC;
+    return true;
+  }
+
+  return false;
+}
+
+
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((MSP430ISD::NodeType)Opcode) {
+  case MSP430ISD::FIRST_NUMBER:       break;
+  case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
+  case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
+  case MSP430ISD::RRA:                return "MSP430ISD::RRA";
+  case MSP430ISD::RLA:                return "MSP430ISD::RLA";
+  case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::CALL:               return "MSP430ISD::CALL";
+  case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
+  case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
+  case MSP430ISD::CMP:                return "MSP430ISD::CMP";
+  case MSP430ISD::SETCC:              return "MSP430ISD::SETCC";
+  case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
+  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
+  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
+  case MSP430ISD::SRL:                return "MSP430ISD::SRL";
+  }
+  return nullptr;
+}
+
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+                                          Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  return (Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits());
+}
+
+bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+
+  return (VT1.getSizeInBits() > VT2.getSizeInBits());
+}
+
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
+}
+
+bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && VT1 == MVT::i8 && VT2 == MVT::i16;
+}
+
+bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const {
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
+
+  unsigned Opc;
+  const TargetRegisterClass * RC;
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Invalid shift opcode!");
+  case MSP430::Shl8:
+   Opc = MSP430::SHL8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Shl16:
+   Opc = MSP430::SHL16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Sra8:
+   Opc = MSP430::SAR8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Sra16:
+   Opc = MSP430::SAR16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Srl8:
+   Opc = MSP430::SAR8r1c;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Srl16:
+   Opc = MSP430::SAR16r1c;
+   RC = &MSP430::GR16RegClass;
+   break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = ++BB->getIterator();
+
+  // Create loop block
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB  = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+                BB->end());
+  RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned DstReg = MI.getOperand(0).getReg();
+
+  // BB:
+  // cmp 0, N
+  // je RemBB
+  BuildMI(BB, dl, TII.get(MSP430::CMP8ri))
+    .addReg(ShiftAmtSrcReg).addImm(0);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(RemBB)
+    .addImm(MSP430CC::COND_E);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+    .addReg(ShiftAmtSrcReg).addMBB(BB)
+    .addReg(ShiftAmtReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+    .addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+    .addReg(ShiftAmtReg).addImm(1);
+  BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+    .addMBB(LoopBB)
+    .addImm(MSP430CC::COND_NE);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(*RemBB, RemBB->begin(), dl, TII.get(MSP430::PHI), DstReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return RemBB;
+}
+
+MachineBasicBlock *
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                  MachineBasicBlock *BB) const {
+  unsigned Opc = MI.getOpcode();
+
+  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+    return EmitShiftInstr(MI, BB);
+
+  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   jCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(I, copy0MBB);
+  F->insert(I, copy1MBB);
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  copy1MBB->splice(copy1MBB->begin(), BB,
+                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(copy1MBB);
+
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+      .addMBB(copy1MBB)
+      .addImm(MI.getOperand(3).getImm());
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to copy1MBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(copy1MBB);
+
+  //  copy1MBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = copy1MBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(copy0MBB)
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(thisMBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 000000000000..8864807e999e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,174 @@
+//===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace MSP430ISD {
+    enum NodeType : unsigned {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Return with a flag operand. Operand 0 is the chain operand.
+      RET_FLAG,
+
+      /// Same as RET_FLAG, but used for returning from ISRs.
+      RETI_FLAG,
+
+      /// Y = R{R,L}A X, rotate right (left) arithmetically
+      RRA, RLA,
+
+      /// Y = RRC X, rotate right via carry
+      RRC,
+
+      /// CALL - These operations represent an abstract call
+      /// instruction, which includes a bunch of information.
+      CALL,
+
+      /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+      /// and TargetGlobalAddress.
+      Wrapper,
+
+      /// CMP - Compare instruction.
+      CMP,
+
+      /// SetCC - Operand 0 is condition code, and operand 1 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// instruction.
+      BR_CC,
+
+      /// SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
+      /// is condition code and operand 4 is flag operand.
+      SELECT_CC,
+
+      /// SHL, SRA, SRL - Non-constant shifts.
+      SHL, SRA, SRL
+    };
+  }
+
+  class MSP430Subtarget;
+  class MSP430TargetLowering : public TargetLowering {
+  public:
+    explicit MSP430TargetLowering(const TargetMachine &TM,
+                                  const MSP430Subtarget &STI);
+
+    MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+      return MVT::i8;
+    }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    TargetLowering::ConstraintType
+    getConstraintType(StringRef Constraint) const override;
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of type
+    /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
+    /// register R15W to i8 by referencing its sub-register R15B.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    /// isZExtFree - Return true if any actual instruction that defines a value
+    /// of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in unknown
+    /// ways, such as incoming arguments, or copies from unknown virtual
+    /// registers. Also, if isTruncateFree(Ty2, Ty1) is true, this does not
+    /// necessarily apply to truncate instructions. e.g. on msp430, all
+    /// instructions that define 8-bit values implicit zero-extend the result
+    /// out to 16 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *BB) const override;
+    MachineBasicBlock *EmitShiftInstr(MachineInstr &MI,
+                                      MachineBasicBlock *BB) const;
+
+  private:
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           const SDLoc &dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              const SDLoc &dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base,
+                                    SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
+  };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 000000000000..a9e87dad0cd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,211 @@
+//===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MSP430 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def SingleOpFrm : Format<1>;
+def DoubleOpFrm : Format<2>;
+def CondJumpFrm : Format<3>;
+
+class SourceMode<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def SrcReg      : SourceMode<0>;
+def SrcMem      : SourceMode<1>;
+def SrcIndReg   : SourceMode<2>;
+def SrcPostInc  : SourceMode<3>;
+def SrcImm      : SourceMode<3>;
+
+class DestMode<bit val> {
+  bit Value = val;
+}
+
+def DstReg      : DestMode<0>;
+def DstMem      : DestMode<1>;
+
+class SizeVal<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def SizeUnknown : SizeVal<0>; // Unknown / unset size
+def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
+def Size2Bytes  : SizeVal<2>;
+def Size4Bytes  : SizeVal<3>;
+def Size6Bytes  : SizeVal<4>;
+
+// Generic MSP430 Format
+class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
+                 string asmstr> : Instruction {
+  field bits<16> Inst;
+
+  let Namespace = "MSP430";
+
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  Format Form = f;
+  SizeVal Sz = sz;
+
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
+  let TSFlags{1-0} = Form.Value;
+  let TSFlags{4-2} = Sz.Value;
+
+  let AsmString   = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  let Pattern = pattern;
+
+  DestMode ad = dest;
+  SourceMode as = src;
+  
+  let Inst{12-15} = opcode;
+  let Inst{7}     = ad.Value;
+  let Inst{6}     = bw;
+  let Inst{4-5}   = as.Value;
+}
+
+// 8 bit IForm instructions
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class I8rr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I8ri<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8rm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mi<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I8mm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IForm instructions
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class I16rr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I16ri<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16rm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mi<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I16mm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Single Operand (Format II) Instructions
+class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  SourceMode as = src;
+
+  let Inst{7-15} = opcode;
+  let Inst{6}    = bw;
+  let Inst{4-5}  = as.Value;
+}
+
+// 8 bit IIForm instructions
+class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class II8r<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II8m<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II8i<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IIForm instructions
+class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+               dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class II16r<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II16m<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II16i<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Conditional Jumps Instructions
+class CJForm<bits<3> opcode, bits<3> cond,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  let Inst{13-15} = opcode;
+  let Inst{10-12} = cond;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  let Pattern = pattern;
+  let Inst{15-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 000000000000..6135ce080920
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,335 @@
+//===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstrInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MSP430GenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void MSP430InstrInfo::anchor() {}
+
+MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
+  : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+    RI() {}
+
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           const TargetRegisterInfo *TRI) const{
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+      .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+      .addImm(0).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+      .addReg(DestReg, getDefRegState(true)).addFrameIndex(FrameIdx)
+      .addImm(0).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  unsigned Opc;
+  if (MSP430::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV16rr;
+  else if (MSP430::GR8RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV8rr;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned MSP430InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                       int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != MSP430::JMP &&
+        I->getOpcode() != MSP430::JCC &&
+        I->getOpcode() != MSP430::Br &&
+        I->getOpcode() != MSP430::Bm)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool MSP430InstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+  switch (CC) {
+  default: llvm_unreachable("Invalid branch condition!");
+  case MSP430CC::COND_E:
+    CC = MSP430CC::COND_NE;
+    break;
+  case MSP430CC::COND_NE:
+    CC = MSP430CC::COND_E;
+    break;
+  case MSP430CC::COND_L:
+    CC = MSP430CC::COND_GE;
+    break;
+  case MSP430CC::COND_GE:
+    CC = MSP430CC::COND_L;
+    break;
+  case MSP430CC::COND_HS:
+    CC = MSP430CC::COND_LO;
+    break;
+  case MSP430CC::COND_LO:
+    CC = MSP430CC::COND_HS;
+    break;
+  }
+
+  Cond[0].setImm(CC);
+  return false;
+}
+
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+  if (!MI.isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  if (MI.isBranch() && !MI.isBarrier())
+    return true;
+  if (!MI.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(*I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Cannot handle indirect branches.
+    if (I->getOpcode() == MSP430::Br ||
+        I->getOpcode() == MSP430::Bm)
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == MSP430::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+    MSP430CC::CondCodes BranchCode =
+      static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+    if (BranchCode == MSP430CC::COND_INVALID)
+      return true;  // Can't handle weird stuff.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
+unsigned MSP430InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                       MachineBasicBlock *TBB,
+                                       MachineBasicBlock *FBB,
+                                       ArrayRef<MachineOperand> Cond,
+                                       const DebugLoc &DL,
+                                       int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "MSP430 branch conditions have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  BuildMI(&MBB, DL, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  switch (Desc.TSFlags & MSP430II::SizeMask) {
+  default:
+    switch (Desc.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case TargetOpcode::CFI_INSTRUCTION:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+    case TargetOpcode::DBG_VALUE:
+      return 0;
+    case TargetOpcode::INLINEASM: {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+      return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+                                    *MF->getTarget().getMCAsmInfo());
+    }
+    }
+  case MSP430II::SizeSpecial:
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case MSP430::SAR8r1c:
+    case MSP430::SAR16r1c:
+      return 4;
+    }
+  case MSP430II::Size2Bytes:
+    return 2;
+  case MSP430II::Size4Bytes:
+    return 4;
+  case MSP430II::Size6Bytes:
+    return 6;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 000000000000..e3259bd6a7bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,92 @@
+//===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
+
+#include "MSP430RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MSP430GenInstrInfo.inc"
+
+namespace llvm {
+
+class MSP430Subtarget;
+
+/// MSP430II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MSP430II {
+  enum {
+    SizeShift   = 2,
+    SizeMask    = 7 << SizeShift,
+
+    SizeUnknown = 0 << SizeShift,
+    SizeSpecial = 1 << SizeShift,
+    Size2Bytes  = 2 << SizeShift,
+    Size4Bytes  = 3 << SizeShift,
+    Size6Bytes  = 4 << SizeShift
+  };
+}
+
+class MSP430InstrInfo : public MSP430GenInstrInfo {
+  const MSP430RegisterInfo RI;
+  virtual void anchor();
+public:
+  explicit MSP430InstrInfo(MSP430Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill,
+                           int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  // Branch folding goodness
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 000000000000..c0c29b992238
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,1211 @@
+//===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_MSP430Call         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_MSP430Wrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                  SDTCisPtrTy<0>]>;
+def SDT_MSP430Cmp          : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                                  SDTCisVT<1, i8>]>;
+def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<1, 2>, 
+                                                  SDTCisVT<3, i8>]>;
+def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                                  SDTCisI8<2>]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def MSP430retflag  : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+
+def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+                     [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def MSP430callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_MSP430CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+def MSP430cmp     : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutGlue]>;
+def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
+                            [SDNPHasChain, SDNPInGlue]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
+                            [SDNPInGlue]>;
+def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
+def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+def memsrc : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+def memdst : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Short jump targets have OtherVT type and are printed as pcrel imm values.
+def jmptarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelImmOperand";
+}
+
+// Operand for printing out a condition code.
+def cc : Operand<i8> {
+  let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SR.
+let Defs = [SP, SR], Uses = [SP] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+                              "#ADJCALLSTACKDOWN",
+                              [(MSP430callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
+                        "# Select8 PSEUDO",
+                        [(set GR8:$dst,
+                          (MSP430selectcc GR8:$src, GR8:$src2, imm:$cc))]>;
+  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR16:$src2, i8imm:$cc),
+                        "# Select16 PSEUDO",
+                        [(set GR16:$dst,
+                          (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
+  let Defs = [SR] in {
+  def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Shl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+  def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Shl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+  def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Sra8 PSEUDO",
+                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+  def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Sra16 PSEUDO",
+                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+  def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Srl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+  def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Srl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+
+  }
+}
+
+let hasSideEffects = 0 in
+def NOP : Pseudo<(outs), (ins), "nop", []>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                     (outs), (ins), "ret",  [(MSP430retflag)]>;
+  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// FIXME: expand opcode & cond field for branches!
+
+// Direct branch
+let isBarrier = 1 in {
+  // Short branch
+  def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+                   "jmp\t$dst",
+                   [(br bb:$dst)]>;
+  let isIndirectBranch = 1 in {
+    // Long branches
+    def Bi  : I16ri<0, (outs), (ins i16imm:$brdst),
+                    "br\t$brdst",
+                    [(brind tblockaddress:$brdst)]>;
+    def Br  : I16rr<0, (outs), (ins GR16:$brdst),
+                    "br\t$brdst",
+                    [(brind GR16:$brdst)]>;
+    def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
+                    "br\t$brdst",
+                    [(brind (load addr:$brdst))]>;
+  }
+}
+
+// Conditional branches
+let Uses = [SR] in
+  def JCC : CJForm<0, 0,
+                   (outs), (ins jmptarget:$dst, cc:$cc),
+                   "j$cc\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. SPW is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [R12, R13, R14, R15, SR],
+      Uses = [SP] in {
+    def CALLi     : II16i<0x0,
+                          (outs), (ins i16imm:$dst),
+                          "call\t$dst", [(MSP430call imm:$dst)]>;
+    def CALLr     : II16r<0x0,
+                          (outs), (ins GR16:$dst),
+                          "call\t$dst", [(MSP430call GR16:$dst)]>;
+    def CALLm     : II16m<0x0,
+                          (outs), (ins memsrc:$dst),
+                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
+  }
+
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+let Defs = [SP], Uses = [SP], hasSideEffects=0 in {
+let mayLoad = 1 in
+def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH16r  : II16r<0x0,
+                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// FIXME: Provide proper encoding!
+let hasSideEffects = 0 in {
+def MOV8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   []>;
+def MOV16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    []>;
+}
+
+// FIXME: Provide proper encoding!
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, imm:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def MOV8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, (load addr:$src))]>;
+}
+
+def MOVZX16rr8 : I8rr<0x0,
+                      (outs GR16:$dst), (ins GR8:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : I8rm<0x0,
+                      (outs GR16:$dst), (ins memsrc:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
+def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
+                         "mov.b\t{@$base+, $dst}", []>;
+def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
+                           "mov.w\t{@$base+, $dst}", []>;
+}
+
+// Any instruction that defines a 8-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 8-bit operation will zero-extend
+// up to 16 bits.
+def def8 : PatLeaf<(i8 GR8:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i16 (zext def8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+def MOV8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 imm:$src), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store GR16:$src, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 (load addr:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let Constraints = "$src = $dst" in {
+
+let Defs = [SR] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+
+def ADD8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def ADD16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+}
+
+def ADD8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def ADD16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "add.b\t{@$base+, $dst}", []>;
+def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                          "add.w\t{@$base+, $dst}", []>;
+}
+
+
+def ADD8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def ADD16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+let Constraints = "" in {
+def ADD8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SR)]>;
+def ADD16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def ADD8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def ADD16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def ADD8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def ADD16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), 
+                                  (i16 (load addr:$src))), addr:$dst),
+                     (implicit SR)]>;
+}
+
+let Uses = [SR] in {
+
+let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
+def ADC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def ADC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+} // isCommutable
+
+def ADC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def ADC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+def ADC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def ADC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let Constraints = "" in {
+def ADC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SR)]>;
+def ADC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def ADC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def ADC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def ADC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def ADC16mm : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.w\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+}
+
+} // Uses = [SR]
+
+let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
+def AND8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def AND16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+}
+
+def AND8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def AND16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+def AND8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def AND16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "and.b\t{@$base+, $dst}", []>;
+def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "and.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def AND8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SR)]>;
+def AND16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def AND8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def AND16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def AND8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def AND16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SR)]>;
+}
+
+let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
+def OR8rr  : I8rr<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
+def OR16rr : I16rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
+}
+
+def OR8ri  : I8ri<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
+def OR16ri : I16ri<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
+
+def OR8rm  : I8rm<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
+def OR16rm : I16rm<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                        (outs GR8:$dst, GR16:$base_wb),
+                        (ins GR8:$src, GR16:$base),
+                        "bis.b\t{@$base+, $dst}", []>;
+def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "bis.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def OR8mr  : I8mr<0x0,
+                  (outs), (ins memdst:$dst, GR8:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+def OR16mr : I16mr<0x0,
+                   (outs), (ins memdst:$dst, GR16:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
+
+def OR8mi  : I8mi<0x0, 
+                  (outs), (ins memdst:$dst, i8imm:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def OR16mi : I16mi<0x0,
+                   (outs), (ins memdst:$dst, i16imm:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
+
+def OR8mm  : I8mm<0x0,
+                  (outs), (ins memdst:$dst, memsrc:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (i8 (load addr:$dst)),
+                              (i8 (load addr:$src))), addr:$dst)]>;
+def OR16mm : I16mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (i16 (load addr:$dst)),
+                               (i16 (load addr:$src))), addr:$dst)]>;
+}
+
+// bic does not modify condition codes
+def BIC8rr :  I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "bic.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
+def BIC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
+
+def BIC8rm :  I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "bic.b\t{$src2, $dst}",
+                    [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
+def BIC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
+
+let Constraints = "" in {
+def BIC8mr :  I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
+def BIC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
+
+def BIC8mm :  I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst),
+                                (not (i8 (load addr:$src)))), addr:$dst)]>;
+def BIC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst),
+                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
+}
+
+let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
+def XOR8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def XOR16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+}
+
+def XOR8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def XOR16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+def XOR8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def XOR16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "xor.b\t{@$base+, $dst}", []>;
+def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "xor.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def XOR8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SR)]>;
+def XOR16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def XOR8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def XOR16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def XOR8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def XOR16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+                     (implicit SR)]>;
+}
+
+
+def SUB8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def SUB16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+
+def SUB8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def SUB16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+def SUB8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def SUB16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "sub.b\t{@$base+, $dst}", []>;
+def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "sub.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def SUB8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SR)]>;
+def SUB16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def SUB8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def SUB16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def SUB8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def SUB16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SR)]>;
+}
+
+let Uses = [SR] in {
+def SBC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
+                    (implicit SR)]>;
+def SBC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
+                     (implicit SR)]>;
+
+def SBC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+                    (implicit SR)]>;
+def SBC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
+                     (implicit SR)]>;
+
+def SBC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+                    (implicit SR)]>;
+def SBC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
+                     (implicit SR)]>;
+
+let Constraints = "" in {
+def SBC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "subc.b\t{$src, $dst}",
+                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+                   (implicit SR)]>;
+def SBC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SR)]>;
+
+def SBC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SR)]>;
+def SBC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SR)]>;
+
+def SBC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst),
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SR)]>;
+def SBC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst),
+                            (i16 (load addr:$src))), addr:$dst),
+                     (implicit SR)]>;
+}
+
+} // Uses = [SR]
+
+// FIXME: memory variant!
+def SAR8r1  : II8r<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rra.b\t$dst",
+                   [(set GR8:$dst, (MSP430rra GR8:$src)),
+                    (implicit SR)]>;
+def SAR16r1 : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rra.w\t$dst",
+                    [(set GR16:$dst, (MSP430rra GR16:$src)),
+                     (implicit SR)]>;
+
+def SHL8r1  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rla.b\t$dst",
+                   [(set GR8:$dst, (MSP430rla GR8:$src)),
+                    (implicit SR)]>;
+def SHL16r1 : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rla.w\t$dst",
+                    [(set GR16:$dst, (MSP430rla GR16:$src)),
+                     (implicit SR)]>;
+
+def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+                      "clrc\n\t"
+                      "rrc.b\t$dst",
+                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
+                       (implicit SR)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+                      "clrc\n\t"
+                      "rrc.w\t$dst",
+                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
+                       (implicit SR)]>;
+
+// FIXME: Memory sext's ?
+def SEXT16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "sxt\t$dst",
+                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+                     (implicit SR)]>;
+
+} // Defs = [SR]
+
+def ZEXT16r : I8rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+
+// FIXME: Memory bitswaps?
+def SWPB16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "swpb\t$dst",
+                    [(set GR16:$dst, (bswap GR16:$src))]>;
+
+} // Constraints = "$src = $dst"
+
+// Integer comparisons
+let Defs = [SR] in {
+def CMP8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SR)]>;
+def CMP16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SR)]>;
+
+def CMP8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SR)]>;
+def CMP16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SR)]>;
+
+def CMP8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src),
+                               (i8 imm:$src2)), (implicit SR)]>;
+def CMP16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                     [(MSP430cmp (load addr:$src),
+                                 (i16 imm:$src2)), (implicit SR)]>;
+
+def CMP8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memsrc:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, (load addr:$src2)), 
+                    (implicit SR)]>;
+def CMP16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memsrc:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, (load addr:$src2)),
+                     (implicit SR)]>;
+
+def CMP8mr  : I8mr<0x0,
+                   (outs), (ins memsrc:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src), GR8:$src2),
+                    (implicit SR)]>;
+def CMP16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp (load addr:$src), GR16:$src2), 
+                     (implicit SR)]>;
+
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+let isCommutable = 1 in {
+def BIT8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+                    (implicit SR)]>;
+def BIT16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+                     (implicit SR)]>;
+}
+def BIT8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+                    (implicit SR)]>;
+def BIT16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+                     (implicit SR)]>;
+
+def BIT8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memdst:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
+                    (implicit SR)]>;
+def BIT16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memdst:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
+                     (implicit SR)]>;
+
+def BIT8mr  : I8mr<0x0,
+                  (outs), (ins memsrc:$src, GR8:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+                   (implicit SR)]>;
+def BIT16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+                     (implicit SR)]>;
+
+def BIT8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+                    (implicit SR)]>;
+def BIT16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+                     (implicit SR)]>;
+
+def BIT8mm  : I8mm<0x0,
+                   (outs), (ins memsrc:$src, memsrc:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (i8 (load addr:$src)),
+                                       (load addr:$src2)),
+                                 0),
+                      (implicit SR)]>;
+def BIT16mm : I16mm<0x0,
+                    (outs), (ins memsrc:$src, memsrc:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (i16 (load addr:$src)),
+                                        (load addr:$src2)),
+                                 0),
+                     (implicit SR)]>;
+} // Defs = [SR]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// truncs
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+def : Pat<(i16 (MSP430Wrapper tblockaddress:$dst)), (MOV16ri tblockaddress:$dst)>;
+
+def : Pat<(add GR16:$src, (MSP430Wrapper tglobaladdr :$src2)),
+          (ADD16ri GR16:$src, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper texternalsym:$src2)),
+          (ADD16ri GR16:$src, texternalsym:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tblockaddress:$src2)),
+          (ADD16ri GR16:$src, tblockaddress:$src2)>;
+
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+          (MOV16mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tblockaddress:$src)>;
+
+// calls
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+          (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+          (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry
+def : Pat<(addc GR16:$src, GR16:$src2),
+          (ADD16rr GR16:$src, GR16:$src2)>;
+def : Pat<(addc GR16:$src, (load addr:$src2)),
+          (ADD16rm GR16:$src, addr:$src2)>;
+def : Pat<(addc GR16:$src, imm:$src2),
+          (ADD16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+          (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (ADD16mm addr:$dst, addr:$src)>;
+
+def : Pat<(addc GR8:$src, GR8:$src2),
+          (ADD8rr GR8:$src, GR8:$src2)>;
+def : Pat<(addc GR8:$src, (load addr:$src2)),
+          (ADD8rm GR8:$src, addr:$src2)>;
+def : Pat<(addc GR8:$src, imm:$src2),
+          (ADD8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+          (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (ADD8mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR16:$src, GR16:$src2),
+          (SUB16rr GR16:$src, GR16:$src2)>;
+def : Pat<(subc GR16:$src, (load addr:$src2)),
+          (SUB16rm GR16:$src, addr:$src2)>;
+def : Pat<(subc GR16:$src, imm:$src2),
+          (SUB16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+          (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (SUB16mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR8:$src, GR8:$src2),
+          (SUB8rr GR8:$src, GR8:$src2)>;
+def : Pat<(subc GR8:$src, (load addr:$src2)),
+          (SUB8rm GR8:$src, addr:$src2)>;
+def : Pat<(subc GR8:$src, imm:$src2),
+          (SUB8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+          (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (SUB8mm addr:$dst, addr:$src)>;
+
+// peephole patterns
+def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src, GR16:$src2)), 0),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src, subreg_8bit),
+                  (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
new file mode 100644
index 000000000000..47b0e270c5b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -0,0 +1,157 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  const DataLayout &DL = Printer.getDataLayout();
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.getOrCreateSymbol(Name);
+}
+
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  const DataLayout &DL = Printer.getDataLayout();
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.getOrCreateSymbol(Name);
+}
+
+MCSymbol *MSP430MCInstLower::
+GetBlockAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::createExpr(Expr);
+}
+
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
+                         MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
new file mode 100644
index 000000000000..ebd639744bcc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
@@ -0,0 +1,47 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+
+  /// MSP430MCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+public:
+  MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..b442fc03b257
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void MSP430MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 000000000000..2d937318c7e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,54 @@
+//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private MSP430 target-specific information for each MachineFunction.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
+
+  explicit MSP430MachineFunctionInfo(MachineFunction &MF)
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex;}
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 000000000000..81cd9d1ad3f8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,161 @@
+//===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430RegisterInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+// FIXME: Provide proper call frame setup / destroy opcodes.
+MSP430RegisterInfo::MSP430RegisterInfo()
+  : MSP430GenRegisterInfo(MSP430::PC) {}
+
+const MCPhysReg*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const MSP430FrameLowering *TFI = getFrameLowering(*MF);
+  const Function* F = MF->getFunction();
+  static const MCPhysReg CalleeSavedRegs[] = {
+    MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsFP[] = {
+    MSP430::R5, MSP430::R6, MSP430::R7,
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsIntr[] = {
+    MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsIntrFP[] = {
+    MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
+    0
+  };
+
+  if (TFI->hasFP(*MF))
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntrFP : CalleeSavedRegsFP);
+  else
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntr : CalleeSavedRegs);
+
+}
+
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const MSP430FrameLowering *TFI = getFrameLowering(MF);
+
+  // Mark 4 special registers with subregisters as reserved.
+  Reserved.set(MSP430::PCB);
+  Reserved.set(MSP430::SPB);
+  Reserved.set(MSP430::SRB);
+  Reserved.set(MSP430::CGB);
+  Reserved.set(MSP430::PC);
+  Reserved.set(MSP430::SP);
+  Reserved.set(MSP430::SR);
+  Reserved.set(MSP430::CG);
+
+  // Mark frame pointer as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    Reserved.set(MSP430::FPB);
+    Reserved.set(MSP430::FP);
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &MSP430::GR16RegClass;
+}
+
+void
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const MSP430FrameLowering *TFI = getFrameLowering(MF);
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FP : MSP430::SP);
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+  // Skip the saved PC
+  Offset += 2;
+
+  if (!TFI->hasFP(MF))
+    Offset += MF.getFrameInfo().getStackSize();
+  else
+    Offset += 2; // Skip the saved FP
+
+  // Fold imm into offset
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This is actually "load effective address" of the stack slot
+    // instruction. We have only two-address instructions, thus we need to
+    // expand it into mov + add
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+    MI.setDesc(TII.get(MSP430::MOV16rr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+    if (Offset == 0)
+      return;
+
+    // We need to materialize the offset via add instruction.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if (Offset < 0)
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+        .addReg(DstReg).addImm(-Offset);
+    else
+      BuildMI(MBB, std::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+        .addReg(DstReg).addImm(Offset);
+
+    return;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const MSP430FrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 000000000000..0cfa4a42bfe4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,46 @@
+//===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MSP430GenRegisterInfo.inc"
+
+namespace llvm {
+
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
+public:
+  MSP430RegisterInfo();
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  const TargetRegisterClass*
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 000000000000..b5a6ed0f0a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,81 @@
+//===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+class MSP430Reg<bits<4> num, string n> : Register<n> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> 
+  : RegisterWithSubRegs<n, subregs> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+def PCB  : MSP430Reg<0,  "r0">;
+def SPB  : MSP430Reg<1,  "r1">;
+def SRB  : MSP430Reg<2,  "r2">;
+def CGB  : MSP430Reg<3,  "r3">;
+def FPB  : MSP430Reg<4,  "r4">;
+def R5B  : MSP430Reg<5,  "r5">;
+def R6B  : MSP430Reg<6,  "r6">;
+def R7B  : MSP430Reg<7,  "r7">;
+def R8B  : MSP430Reg<8,  "r8">;
+def R9B  : MSP430Reg<9,  "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
+
+let SubRegIndices = [subreg_8bit] in {
+def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
+def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
+def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
+def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
+def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def R5  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
+def R6  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
+def R7  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
+def R8  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
+def R9  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
+def R10 : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11 : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12 : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13 : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14 : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15 : MSP430RegWithSubregs<15, "r15", [R15B]>;
+}
+
+def GR8 : RegisterClass<"MSP430", [i8], 8,
+   // Volatile registers
+  (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+   // Frame pointer, sometimes allocable
+   FPB,
+   // Volatile, but not allocable
+   PCB, SPB, SRB, CGB)>;
+
+def GR16 : RegisterClass<"MSP430", [i16], 16,
+   // Volatile registers
+  (add R12, R13, R14, R15, R11, R10, R9, R8, R7, R6, R5,
+   // Frame pointer, sometimes allocable
+   FP,
+   // Volatile, but not allocable
+   PC, SP, SR, CG)>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 000000000000..6216348e4d71
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,37 @@
+//===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MSP430GenSubtargetInfo.inc"
+
+void MSP430Subtarget::anchor() { }
+
+MSP430Subtarget &
+MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  ParseSubtargetFeatures("generic", FS);
+  return *this;
+}
+
+MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU,
+                                 const std::string &FS, const TargetMachine &TM)
+    : MSP430GenSubtargetInfo(TT, CPU, FS), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 000000000000..1a00d85e01cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,69 @@
+//===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
+#define LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
+
+#include "MSP430FrameLowering.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430RegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MSP430GenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MSP430Subtarget : public MSP430GenSubtargetInfo {
+  virtual void anchor();
+  bool ExtendedInsts;
+  MSP430FrameLowering FrameLowering;
+  MSP430InstrInfo InstrInfo;
+  MSP430TargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  MSP430Subtarget(const Triple &TT, const std::string &CPU,
+                  const std::string &FS, const TargetMachine &TM);
+
+  MSP430Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const MSP430TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_MSP430_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 000000000000..bebe5fa35ad4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,80 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetMachine.h"
+#include "MSP430.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMSP430Target() {
+  // Register the target.
+  RegisterTargetMachine<MSP430TargetMachine> X(getTheMSP430Target());
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS,
+                        Options, getEffectiveRelocModel(RM), CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      // FIXME: Check DataLayout string.
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+MSP430TargetMachine::~MSP430TargetMachine() {}
+
+namespace {
+/// MSP430 Code Generator Pass Configuration Options.
+class MSP430PassConfig : public TargetPassConfig {
+public:
+  MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  MSP430TargetMachine &getMSP430TargetMachine() const {
+    return getTM<MSP430TargetMachine>();
+  }
+
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MSP430PassConfig(this, PM);
+}
+
+bool MSP430PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
+  return false;
+}
+
+void MSP430PassConfig::addPreEmitPass() {
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createMSP430BranchSelectionPass(), false);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 000000000000..de8f06e71dee
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,49 @@
+//===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
+#define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
+
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  MSP430Subtarget        Subtarget;
+
+public:
+  MSP430TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+  ~MSP430TargetMachine() override;
+
+  const MSP430Subtarget *getSubtargetImpl(const Function &F) const override {
+    return &Subtarget;
+  }
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 000000000000..62f52a193674
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheMSP430Target() {
+  static Target TheMSP430Target;
+  return TheMSP430Target;
+}
+
+extern "C" void LLVMInitializeMSP430TargetInfo() {
+  RegisterTarget<Triple::msp430> X(getTheMSP430Target(), "msp430",
+                                   "MSP430 [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
new file mode 100644
index 000000000000..97ca11ca501e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -0,0 +1,6766 @@
+//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetObjectFile.h"
+#include "MipsTargetStreamer.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-parser"
+
+namespace llvm {
+class MCInstrInfo;
+}
+
+namespace {
+class MipsAssemblerOptions {
+public:
+  MipsAssemblerOptions(const FeatureBitset &Features_) :
+    ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
+
+  MipsAssemblerOptions(const MipsAssemblerOptions *Opts) {
+    ATReg = Opts->getATRegIndex();
+    Reorder = Opts->isReorder();
+    Macro = Opts->isMacro();
+    Features = Opts->getFeatures();
+  }
+
+  unsigned getATRegIndex() const { return ATReg; }
+  bool setATRegIndex(unsigned Reg) {
+    if (Reg > 31)
+      return false;
+
+    ATReg = Reg;
+    return true;
+  }
+
+  bool isReorder() const { return Reorder; }
+  void setReorder() { Reorder = true; }
+  void setNoReorder() { Reorder = false; }
+
+  bool isMacro() const { return Macro; }
+  void setMacro() { Macro = true; }
+  void setNoMacro() { Macro = false; }
+
+  const FeatureBitset &getFeatures() const { return Features; }
+  void setFeatures(const FeatureBitset &Features_) { Features = Features_; }
+
+  // Set of features that are either architecture features or referenced
+  // by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6).
+  // The full table can be found in MipsGenSubtargetInfo.inc (MipsFeatureKV[]).
+  // The reason we need this mask is explained in the selectArch function.
+  // FIXME: Ideally we would like TableGen to generate this information.
+  static const FeatureBitset AllArchRelatedMask;
+
+private:
+  unsigned ATReg;
+  bool Reorder;
+  bool Macro;
+  FeatureBitset Features;
+};
+}
+
+const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
+    Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3,
+    Mips::FeatureMips3_32, Mips::FeatureMips3_32r2, Mips::FeatureMips4,
+    Mips::FeatureMips4_32, Mips::FeatureMips4_32r2, Mips::FeatureMips5,
+    Mips::FeatureMips5_32r2, Mips::FeatureMips32, Mips::FeatureMips32r2,
+    Mips::FeatureMips32r3, Mips::FeatureMips32r5, Mips::FeatureMips32r6,
+    Mips::FeatureMips64, Mips::FeatureMips64r2, Mips::FeatureMips64r3,
+    Mips::FeatureMips64r5, Mips::FeatureMips64r6, Mips::FeatureCnMips,
+    Mips::FeatureFP64Bit, Mips::FeatureGP64Bit, Mips::FeatureNaN2008
+};
+
+namespace {
+class MipsAsmParser : public MCTargetAsmParser {
+  MipsTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+    return static_cast<MipsTargetStreamer &>(TS);
+  }
+
+  MipsABIInfo ABI;
+  SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
+  MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
+                       // nullptr, which indicates that no function is currently
+                       // selected. This usually happens after an '.end func'
+                       // directive.
+  bool IsLittleEndian;
+  bool IsPicEnabled;
+  bool IsCpRestoreSet;
+  int CpRestoreOffset;
+  unsigned CpSaveLocation;
+  /// If true, then CpSaveLocation is a register, otherwise it's an offset.
+  bool     CpSaveLocationIsRegister;
+
+  // Print a warning along with its fix-it message at the given range.
+  void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+                             SMRange Range, bool ShowColors = true);
+
+#define GET_ASSEMBLER_HEADER
+#include "MipsGenAsmMatcher.inc"
+
+  unsigned
+  checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                 const OperandVector &Operands) override;
+  unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  /// Parse a register as used in CFI directives
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool parseParenSuffix(StringRef Name, OperandVector &Operands);
+
+  bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  OperandMatchResultTy parseMemOperand(OperandVector &Operands);
+  OperandMatchResultTy
+  matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+                                    StringRef Identifier, SMLoc S);
+  OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands,
+                                                     SMLoc S);
+  OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
+  OperandMatchResultTy parseImm(OperandVector &Operands);
+  OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
+  OperandMatchResultTy parseInvNum(OperandVector &Operands);
+  OperandMatchResultTy parseRegisterPair(OperandVector &Operands);
+  OperandMatchResultTy parseMovePRegPair(OperandVector &Operands);
+  OperandMatchResultTy parseRegisterList(OperandVector &Operands);
+
+  bool searchSymbolAlias(OperandVector &Operands);
+
+  bool parseOperand(OperandVector &, StringRef Mnemonic);
+
+  enum MacroExpanderResultTy {
+    MER_NotAMacro,
+    MER_Success,
+    MER_Fail,
+  };
+
+  // Expands assembly pseudo instructions.
+  MacroExpanderResultTy tryExpandInstruction(MCInst &Inst, SMLoc IDLoc,
+                                             MCStreamer &Out,
+                                             const MCSubtargetInfo *STI);
+
+  bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                         const MCSubtargetInfo *STI);
+
+  bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg,
+                     bool Is32BitImm, bool IsAddress, SMLoc IDLoc,
+                     MCStreamer &Out, const MCSubtargetInfo *STI);
+
+  bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg,
+                               unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
+                               MCStreamer &Out, const MCSubtargetInfo *STI);
+
+  bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
+                     MCStreamer &Out, const MCSubtargetInfo *STI);
+
+  bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+                         const MCOperand &Offset, bool Is32BitAddress,
+                         SMLoc IDLoc, MCStreamer &Out,
+                         const MCSubtargetInfo *STI);
+
+  bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                  const MCSubtargetInfo *STI);
+
+  void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                     const MCSubtargetInfo *STI, bool IsLoad, bool IsImmOpnd);
+
+  void expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                      const MCSubtargetInfo *STI, bool IsImmOpnd);
+
+  void expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                       const MCSubtargetInfo *STI, bool IsImmOpnd);
+
+  bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                               const MCSubtargetInfo *STI);
+
+  bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                            const MCSubtargetInfo *STI);
+
+  bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                       const MCSubtargetInfo *STI);
+
+  bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                          const MCSubtargetInfo *STI);
+
+  bool expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI, const bool IsMips64,
+                 const bool Signed);
+
+  bool expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU, SMLoc IDLoc,
+                   MCStreamer &Out, const MCSubtargetInfo *STI);
+
+  bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI);
+
+  bool expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI);
+
+  bool expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI);
+
+  bool expandRotation(MCInst &Inst, SMLoc IDLoc,
+                      MCStreamer &Out, const MCSubtargetInfo *STI);
+  bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                         const MCSubtargetInfo *STI);
+  bool expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                       const MCSubtargetInfo *STI);
+  bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                          const MCSubtargetInfo *STI);
+
+  bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI);
+
+  bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                             const MCSubtargetInfo *STI, bool IsLoad);
+
+  bool expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                 const MCSubtargetInfo *STI);
+
+  bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                  const MCSubtargetInfo *STI);
+
+  bool reportParseError(Twine ErrorMsg);
+  bool reportParseError(SMLoc Loc, Twine ErrorMsg);
+
+  bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
+
+  bool isEvaluated(const MCExpr *Expr);
+  bool parseSetMips0Directive();
+  bool parseSetArchDirective();
+  bool parseSetFeature(uint64_t Feature);
+  bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
+  bool parseDirectiveCpLoad(SMLoc Loc);
+  bool parseDirectiveCpRestore(SMLoc Loc);
+  bool parseDirectiveCPSetup();
+  bool parseDirectiveCPReturn();
+  bool parseDirectiveNaN();
+  bool parseDirectiveSet();
+  bool parseDirectiveOption();
+  bool parseInsnDirective();
+  bool parseSSectionDirective(StringRef Section, unsigned Type);
+
+  bool parseSetAtDirective();
+  bool parseSetNoAtDirective();
+  bool parseSetMacroDirective();
+  bool parseSetNoMacroDirective();
+  bool parseSetMsaDirective();
+  bool parseSetNoMsaDirective();
+  bool parseSetNoDspDirective();
+  bool parseSetReorderDirective();
+  bool parseSetNoReorderDirective();
+  bool parseSetMips16Directive();
+  bool parseSetNoMips16Directive();
+  bool parseSetFpDirective();
+  bool parseSetOddSPRegDirective();
+  bool parseSetNoOddSPRegDirective();
+  bool parseSetPopDirective();
+  bool parseSetPushDirective();
+  bool parseSetSoftFloatDirective();
+  bool parseSetHardFloatDirective();
+
+  bool parseSetAssignment();
+
+  bool parseDataDirective(unsigned Size, SMLoc L);
+  bool parseDirectiveGpWord();
+  bool parseDirectiveGpDWord();
+  bool parseDirectiveDtpRelWord();
+  bool parseDirectiveDtpRelDWord();
+  bool parseDirectiveTpRelWord();
+  bool parseDirectiveTpRelDWord();
+  bool parseDirectiveModule();
+  bool parseDirectiveModuleFP();
+  bool parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                       StringRef Directive);
+
+  bool parseInternalDirectiveReallowModule();
+
+  bool eatComma(StringRef ErrorStr);
+
+  int matchCPURegisterName(StringRef Symbol);
+
+  int matchHWRegsRegisterName(StringRef Symbol);
+
+  int matchFPURegisterName(StringRef Name);
+
+  int matchFCCRegisterName(StringRef Name);
+
+  int matchACRegisterName(StringRef Name);
+
+  int matchMSA128RegisterName(StringRef Name);
+
+  int matchMSA128CtrlRegisterName(StringRef Name);
+
+  unsigned getReg(int RC, int RegNo);
+
+  /// Returns the internal register number for the current AT. Also checks if
+  /// the current AT is unavailable (set to $0) and gives an error if it is.
+  /// This should be used in pseudo-instruction expansions which need AT.
+  unsigned getATReg(SMLoc Loc);
+
+  bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                          const MCSubtargetInfo *STI);
+
+  // Helper function that checks if the value of a vector index is within the
+  // boundaries of accepted values for each RegisterKind
+  // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+  bool validateMSAIndex(int Val, int RegKind);
+
+  // Selects a new architecture by updating the FeatureBits with the necessary
+  // info including implied dependencies.
+  // Internally, it clears all the feature bits related to *any* architecture
+  // and selects the new one using the ToggleFeature functionality of the
+  // MCSubtargetInfo object that handles implied dependencies. The reason we
+  // clear all the arch related bits manually is because ToggleFeature only
+  // clears the features that imply the feature being cleared and not the
+  // features implied by the feature being cleared. This is easier to see
+  // with an example:
+  //  --------------------------------------------------
+  // | Feature         | Implies                        |
+  // | -------------------------------------------------|
+  // | FeatureMips1    | None                           |
+  // | FeatureMips2    | FeatureMips1                   |
+  // | FeatureMips3    | FeatureMips2 | FeatureMipsGP64 |
+  // | FeatureMips4    | FeatureMips3                   |
+  // | ...             |                                |
+  //  --------------------------------------------------
+  //
+  // Setting Mips3 is equivalent to set: (FeatureMips3 | FeatureMips2 |
+  // FeatureMipsGP64 | FeatureMips1)
+  // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4).
+  void selectArch(StringRef ArchFeature) {
+    MCSubtargetInfo &STI = copySTI();
+    FeatureBitset FeatureBits = STI.getFeatureBits();
+    FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask;
+    STI.setFeatureBits(FeatureBits);
+    setAvailableFeatures(
+        ComputeAvailableFeatures(STI.ToggleFeature(ArchFeature)));
+    AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+  }
+
+  void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
+    if (!(getSTI().getFeatureBits()[Feature])) {
+      MCSubtargetInfo &STI = copySTI();
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+      AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+    }
+  }
+
+  void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
+    if (getSTI().getFeatureBits()[Feature]) {
+      MCSubtargetInfo &STI = copySTI();
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+      AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
+    }
+  }
+
+  void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
+    setFeatureBits(Feature, FeatureString);
+    AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
+  }
+
+  void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) {
+    clearFeatureBits(Feature, FeatureString);
+    AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits());
+  }
+
+public:
+  enum MipsMatchResultTy {
+    Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_RequiresDifferentOperands,
+    Match_RequiresNoZeroRegister,
+    Match_RequiresSameSrcAndDst,
+    Match_NonZeroOperandForSync,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "MipsGenAsmMatcher.inc"
+#undef GET_OPERAND_DIAGNOSTIC_TYPES
+  };
+
+  MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, sti),
+        ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()),
+                                          sti.getCPU(), Options)) {
+    MCAsmParserExtension::Initialize(parser);
+
+    parser.addAliasForDirective(".asciiz", ".asciz");
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+
+    // Remember the initial assembler options. The user can not modify these.
+    AssemblerOptions.push_back(
+        llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+
+    // Create an assembler options environment for the user to modify.
+    AssemblerOptions.push_back(
+        llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits()));
+
+    getTargetStreamer().updateABIInfo(*this);
+
+    if (!isABI_O32() && !useOddSPReg() != 0)
+      report_fatal_error("-mno-odd-spreg requires the O32 ABI");
+
+    CurrentFn = nullptr;
+
+    IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent();
+
+    IsCpRestoreSet = false;
+    CpRestoreOffset = -1;
+
+    const Triple &TheTriple = sti.getTargetTriple();
+    if ((TheTriple.getArch() == Triple::mips) ||
+        (TheTriple.getArch() == Triple::mips64))
+      IsLittleEndian = false;
+    else
+      IsLittleEndian = true;
+  }
+
+  /// True if all of $fcc0 - $fcc7 exist for the current ISA.
+  bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
+
+  bool isGP64bit() const {
+    return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
+  }
+  bool isFP64bit() const {
+    return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
+  }
+  const MipsABIInfo &getABI() const { return ABI; }
+  bool isABI_N32() const { return ABI.IsN32(); }
+  bool isABI_N64() const { return ABI.IsN64(); }
+  bool isABI_O32() const { return ABI.IsO32(); }
+  bool isABI_FPXX() const {
+    return getSTI().getFeatureBits()[Mips::FeatureFPXX];
+  }
+
+  bool useOddSPReg() const {
+    return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]);
+  }
+
+  bool inMicroMipsMode() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
+  }
+  bool hasMips1() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips1];
+  }
+  bool hasMips2() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips2];
+  }
+  bool hasMips3() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips3];
+  }
+  bool hasMips4() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips4];
+  }
+  bool hasMips5() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips5];
+  }
+  bool hasMips32() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips32];
+  }
+  bool hasMips64() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips64];
+  }
+  bool hasMips32r2() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
+  }
+  bool hasMips64r2() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
+  }
+  bool hasMips32r3() const {
+    return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
+  }
+  bool hasMips64r3() const {
+    return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
+  }
+  bool hasMips32r5() const {
+    return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
+  }
+  bool hasMips64r5() const {
+    return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
+  }
+  bool hasMips32r6() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
+  }
+  bool hasMips64r6() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
+  }
+
+  bool hasDSP() const {
+    return getSTI().getFeatureBits()[Mips::FeatureDSP];
+  }
+  bool hasDSPR2() const {
+    return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
+  }
+  bool hasDSPR3() const {
+    return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
+  }
+  bool hasMSA() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMSA];
+  }
+  bool hasCnMips() const {
+    return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
+  }
+
+  bool inPicMode() {
+    return IsPicEnabled;
+  }
+
+  bool inMips16Mode() const {
+    return getSTI().getFeatureBits()[Mips::FeatureMips16];
+  }
+
+  bool useTraps() const {
+    return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV];
+  }
+
+  bool useSoftFloat() const {
+    return getSTI().getFeatureBits()[Mips::FeatureSoftFloat];
+  }
+
+  /// Warn if RegIndex is the same as the current AT.
+  void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
+
+  void warnIfNoMacro(SMLoc Loc);
+
+  bool isLittle() const { return IsLittleEndian; }
+
+  const MCExpr *createTargetUnaryExpr(const MCExpr *E,
+                                      AsmToken::TokenKind OperatorToken,
+                                      MCContext &Ctx) override {
+    switch(OperatorToken) {
+    default:
+      llvm_unreachable("Unknown token");
+      return nullptr;
+    case AsmToken::PercentCall16:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, E, Ctx);
+    case AsmToken::PercentCall_Hi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16, E, Ctx);
+    case AsmToken::PercentCall_Lo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16, E, Ctx);
+    case AsmToken::PercentDtprel_Hi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_HI, E, Ctx);
+    case AsmToken::PercentDtprel_Lo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL_LO, E, Ctx);
+    case AsmToken::PercentGot:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT, E, Ctx);
+    case AsmToken::PercentGot_Disp:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, E, Ctx);
+    case AsmToken::PercentGot_Hi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, E, Ctx);
+    case AsmToken::PercentGot_Lo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_LO16, E, Ctx);
+    case AsmToken::PercentGot_Ofst:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_OFST, E, Ctx);
+    case AsmToken::PercentGot_Page:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOT_PAGE, E, Ctx);
+    case AsmToken::PercentGottprel:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GOTTPREL, E, Ctx);
+    case AsmToken::PercentGp_Rel:
+      return MipsMCExpr::create(MipsMCExpr::MEK_GPREL, E, Ctx);
+    case AsmToken::PercentHi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_HI, E, Ctx);
+    case AsmToken::PercentHigher:
+      return MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, E, Ctx);
+    case AsmToken::PercentHighest:
+      return MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, E, Ctx);
+    case AsmToken::PercentLo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_LO, E, Ctx);
+    case AsmToken::PercentNeg:
+      return MipsMCExpr::create(MipsMCExpr::MEK_NEG, E, Ctx);
+    case AsmToken::PercentPcrel_Hi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_HI16, E, Ctx);
+    case AsmToken::PercentPcrel_Lo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_PCREL_LO16, E, Ctx);
+    case AsmToken::PercentTlsgd:
+      return MipsMCExpr::create(MipsMCExpr::MEK_TLSGD, E, Ctx);
+    case AsmToken::PercentTlsldm:
+      return MipsMCExpr::create(MipsMCExpr::MEK_TLSLDM, E, Ctx);
+    case AsmToken::PercentTprel_Hi:
+      return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_HI, E, Ctx);
+    case AsmToken::PercentTprel_Lo:
+      return MipsMCExpr::create(MipsMCExpr::MEK_TPREL_LO, E, Ctx);
+    }
+  }
+};
+}
+
+namespace {
+
+/// MipsOperand - Instances of this class represent a parsed Mips machine
+/// instruction.
+class MipsOperand : public MCParsedAsmOperand {
+public:
+  /// Broad categories of register classes
+  /// The exact class is finalized by the render method.
+  enum RegKind {
+    RegKind_GPR = 1,      /// GPR32 and GPR64 (depending on isGP64bit())
+    RegKind_FGR = 2,      /// FGR32, FGR64, AFGR64 (depending on context and
+                          /// isFP64bit())
+    RegKind_FCC = 4,      /// FCC
+    RegKind_MSA128 = 8,   /// MSA128[BHWD] (makes no difference which)
+    RegKind_MSACtrl = 16, /// MSA control registers
+    RegKind_COP2 = 32,    /// COP2
+    RegKind_ACC = 64,     /// HI32DSP, LO32DSP, and ACC64DSP (depending on
+                          /// context).
+    RegKind_CCR = 128,    /// CCR
+    RegKind_HWRegs = 256, /// HWRegs
+    RegKind_COP3 = 512,   /// COP3
+    RegKind_COP0 = 1024,  /// COP0
+    /// Potentially any (e.g. $1)
+    RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
+                      RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
+                      RegKind_CCR | RegKind_HWRegs | RegKind_COP3 | RegKind_COP0
+  };
+
+private:
+  enum KindTy {
+    k_Immediate,     /// An immediate (possibly involving symbol references)
+    k_Memory,        /// Base + Offset Memory Address
+    k_RegisterIndex, /// A register index in one or more RegKind.
+    k_Token,         /// A simple token
+    k_RegList,       /// A physical register list
+    k_RegPair        /// A pair of physical register
+  } Kind;
+
+public:
+  MipsOperand(KindTy K, MipsAsmParser &Parser)
+      : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+
+private:
+  /// For diagnostics, and checking the assembler temporary
+  MipsAsmParser &AsmParser;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegIdxOp {
+    unsigned Index; /// Index into the register class
+    RegKind Kind;   /// Bitfield of the kinds it could possibly be
+    struct Token Tok; /// The input token this operand originated from.
+    const MCRegisterInfo *RegInfo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    MipsOperand *Base;
+    const MCExpr *Off;
+  };
+
+  struct RegListOp {
+    SmallVector<unsigned, 10> *List;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegIdxOp RegIdx;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+    struct RegListOp RegList;
+  };
+
+  SMLoc StartLoc, EndLoc;
+
+  /// Internal constructor for register kinds
+  static std::unique_ptr<MipsOperand> CreateReg(unsigned Index, StringRef Str,
+                                                RegKind RegKind,
+                                                const MCRegisterInfo *RegInfo,
+                                                SMLoc S, SMLoc E,
+                                                MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
+    Op->RegIdx.Index = Index;
+    Op->RegIdx.RegInfo = RegInfo;
+    Op->RegIdx.Kind = RegKind;
+    Op->RegIdx.Tok.Data = Str.data();
+    Op->RegIdx.Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+public:
+  /// Coerce the register to GPR32 and return the real register for the current
+  /// target.
+  unsigned getGPR32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    AsmParser.warnIfRegIndexIsAT(RegIdx.Index, StartLoc);
+    unsigned ClassID = Mips::GPR32RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to GPR32 and return the real register for the current
+  /// target.
+  unsigned getGPRMM16Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    unsigned ClassID = Mips::GPR32RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to GPR64 and return the real register for the current
+  /// target.
+  unsigned getGPR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    unsigned ClassID = Mips::GPR64RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+private:
+  /// Coerce the register to AFGR64 and return the real register for the current
+  /// target.
+  unsigned getAFGR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    if (RegIdx.Index % 2 != 0)
+      AsmParser.Warning(StartLoc, "Float register should be even.");
+    return RegIdx.RegInfo->getRegClass(Mips::AFGR64RegClassID)
+        .getRegister(RegIdx.Index / 2);
+  }
+
+  /// Coerce the register to FGR64 and return the real register for the current
+  /// target.
+  unsigned getFGR64Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGR64RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FGR32 and return the real register for the current
+  /// target.
+  unsigned getFGR32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGR32RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FGRH32 and return the real register for the current
+  /// target.
+  unsigned getFGRH32Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FGR) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FGRH32RegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to FCC and return the real register for the current
+  /// target.
+  unsigned getFCCReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_FCC) && "Invalid access!");
+    return RegIdx.RegInfo->getRegClass(Mips::FCCRegClassID)
+        .getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to MSA128 and return the real register for the current
+  /// target.
+  unsigned getMSA128Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_MSA128) && "Invalid access!");
+    // It doesn't matter which of the MSA128[BHWD] classes we use. They are all
+    // identical
+    unsigned ClassID = Mips::MSA128BRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to MSACtrl and return the real register for the
+  /// current target.
+  unsigned getMSACtrlReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_MSACtrl) && "Invalid access!");
+    unsigned ClassID = Mips::MSACtrlRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to COP0 and return the real register for the
+  /// current target.
+  unsigned getCOP0Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP0) && "Invalid access!");
+    unsigned ClassID = Mips::COP0RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to COP2 and return the real register for the
+  /// current target.
+  unsigned getCOP2Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP2) && "Invalid access!");
+    unsigned ClassID = Mips::COP2RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to COP3 and return the real register for the
+  /// current target.
+  unsigned getCOP3Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!");
+    unsigned ClassID = Mips::COP3RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to ACC64DSP and return the real register for the
+  /// current target.
+  unsigned getACC64DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::ACC64DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to HI32DSP and return the real register for the
+  /// current target.
+  unsigned getHI32DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::HI32DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to LO32DSP and return the real register for the
+  /// current target.
+  unsigned getLO32DSPReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_ACC) && "Invalid access!");
+    unsigned ClassID = Mips::LO32DSPRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to CCR and return the real register for the
+  /// current target.
+  unsigned getCCRReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_CCR) && "Invalid access!");
+    unsigned ClassID = Mips::CCRRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to HWRegs and return the real register for the
+  /// current target.
+  unsigned getHWRegsReg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_HWRegs) && "Invalid access!");
+    unsigned ClassID = Mips::HWRegsRegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+public:
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("Use a custom parser instead");
+  }
+
+  /// Render the operand to an MCInst as a GPR32
+  /// Asserts if the wrong number of operands are requested, or the operand
+  /// is not a k_RegisterIndex compatible with RegKind_GPR
+  void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+  }
+
+  void addGPRMM16AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
+  }
+
+  void addGPRMM16AsmRegZeroOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
+  }
+
+  void addGPRMM16AsmRegMovePOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPRMM16Reg()));
+  }
+
+  /// Render the operand to an MCInst as a GPR64
+  /// Asserts if the wrong number of operands are requested, or the operand
+  /// is not a k_RegisterIndex compatible with RegKind_GPR
+  void addGPR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPR64Reg()));
+  }
+
+  void addAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
+  }
+
+  void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
+  }
+
+  void addFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFGR32Reg()));
+    // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+    // FIXME: This should propagate failure up to parseStatement.
+    if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
+      AsmParser.getParser().printError(
+          StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+                    "registers");
+  }
+
+  void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFGRH32Reg()));
+  }
+
+  void addFCCAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFCCReg()));
+  }
+
+  void addMSA128AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMSA128Reg()));
+  }
+
+  void addMSACtrlAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMSACtrlReg()));
+  }
+
+  void addCOP0AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getCOP0Reg()));
+  }
+
+  void addCOP2AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getCOP2Reg()));
+  }
+
+  void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getCOP3Reg()));
+  }
+
+  void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getACC64DSPReg()));
+  }
+
+  void addHI32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getHI32DSPReg()));
+  }
+
+  void addLO32DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getLO32DSPReg()));
+  }
+
+  void addCCRAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getCCRReg()));
+  }
+
+  void addHWRegsAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getHWRegsReg()));
+  }
+
+  template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+  void addConstantUImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    uint64_t Imm = getConstantImm() - Offset;
+    Imm &= (1ULL << Bits) - 1;
+    Imm += Offset;
+    Imm += AdjustOffset;
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  template <unsigned Bits>
+  void addSImmOperands(MCInst &Inst, unsigned N) const {
+    if (isImm() && !isConstantImm()) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    addConstantSImmOperands<Bits, 0, 0>(Inst, N);
+  }
+
+  template <unsigned Bits>
+  void addUImmOperands(MCInst &Inst, unsigned N) const {
+    if (isImm() && !isConstantImm()) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    addConstantUImmOperands<Bits, 0, 0>(Inst, N);
+  }
+
+  template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
+  void addConstantSImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    int64_t Imm = getConstantImm() - Offset;
+    Imm = SignExtend64<Bits>(Imm);
+    Imm += Offset;
+    Imm += AdjustOffset;
+    Inst.addOperand(MCOperand::createImm(Imm));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit()
+                                             ? getMemBase()->getGPR64Reg()
+                                             : getMemBase()->getGPR32Reg()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
+  void addMicroMipsMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()->getGPRMM16Reg()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    for (auto RegNo : getRegList())
+      Inst.addOperand(MCOperand::createReg(RegNo));
+  }
+
+  void addRegPairOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    assert((RegIdx.Kind & RegKind_GPR) && "Invalid access!");
+    unsigned RegNo = getRegPair();
+    AsmParser.warnIfRegIndexIsAT(RegNo, StartLoc);
+    Inst.addOperand(MCOperand::createReg(
+      RegIdx.RegInfo->getRegClass(
+        AsmParser.getABI().AreGprs64bit()
+          ? Mips::GPR64RegClassID
+          : Mips::GPR32RegClassID).getRegister(RegNo++)));
+    Inst.addOperand(MCOperand::createReg(
+      RegIdx.RegInfo->getRegClass(
+        AsmParser.getABI().AreGprs64bit()
+          ? Mips::GPR64RegClassID
+          : Mips::GPR32RegClassID).getRegister(RegNo)));
+  }
+
+  void addMovePRegPairOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    for (auto RegNo : getRegList())
+      Inst.addOperand(MCOperand::createReg(RegNo));
+  }
+
+  bool isReg() const override {
+    // As a special case until we sort out the definition of div/divu, accept
+    // $0/$zero here so that MCK_ZERO works correctly.
+    return isGPRAsmReg() && RegIdx.Index == 0;
+  }
+  bool isRegIdx() const { return Kind == k_RegisterIndex; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isConstantImm() const {
+    int64_t Res;
+    return isImm() && getImm()->evaluateAsAbsolute(Res);
+  }
+  bool isConstantImmz() const {
+    return isConstantImm() && getConstantImm() == 0;
+  }
+  template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
+    return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
+  }
+  template <unsigned Bits> bool isSImm() const {
+    return isConstantImm() ? isInt<Bits>(getConstantImm()) : isImm();
+  }
+  template <unsigned Bits> bool isUImm() const {
+    return isConstantImm() ? isUInt<Bits>(getConstantImm()) : isImm();
+  }
+  template <unsigned Bits> bool isAnyImm() const {
+    return isConstantImm() ? (isInt<Bits>(getConstantImm()) ||
+                              isUInt<Bits>(getConstantImm()))
+                           : isImm();
+  }
+  template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
+    return isConstantImm() && isInt<Bits>(getConstantImm() - Offset);
+  }
+  template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
+    return isConstantImm() && getConstantImm() >= Bottom &&
+           getConstantImm() <= Top;
+  }
+  bool isToken() const override {
+    // Note: It's not possible to pretend that other operand kinds are tokens.
+    // The matcher emitter checks tokens first.
+    return Kind == k_Token;
+  }
+  bool isMem() const override { return Kind == k_Memory; }
+  bool isConstantMemOff() const {
+    return isMem() && isa<MCConstantExpr>(getMemOff());
+  }
+  // Allow relocation operators.
+  // FIXME: This predicate and others need to look through binary expressions
+  //        and determine whether a Value is a constant or not.
+  template <unsigned Bits, unsigned ShiftAmount = 0>
+  bool isMemWithSimmOffset() const {
+    if (!isMem())
+      return false;
+    if (!getMemBase()->isGPRAsmReg())
+      return false;
+    if (isa<MCTargetExpr>(getMemOff()) ||
+        (isConstantMemOff() &&
+         isShiftedInt<Bits, ShiftAmount>(getConstantMemOff())))
+      return true;
+    MCValue Res;
+    bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
+    return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
+  }
+  bool isMemWithGRPMM16Base() const {
+    return isMem() && getMemBase()->isMM16AsmReg();
+  }
+  template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
+    return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
+      && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP);
+  }
+  template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
+    return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
+      && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
+      && (getMemBase()->getGPR32Reg() == Mips::SP);
+  }
+  template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
+    return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
+      && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
+      && (getMemBase()->getGPR32Reg() == Mips::GP);
+  }
+  template <unsigned Bits, unsigned ShiftLeftAmount>
+  bool isScaledUImm() const {
+    return isConstantImm() &&
+           isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
+  }
+  template <unsigned Bits, unsigned ShiftLeftAmount>
+  bool isScaledSImm() const {
+    if (isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
+      return true;
+    // Operand can also be a symbol or symbol plus offset in case of relocations.
+    if (Kind != k_Immediate)
+      return false;
+    MCValue Res;
+    bool Success = getImm()->evaluateAsRelocatable(Res, nullptr, nullptr);
+    return Success && isShiftedInt<Bits, ShiftLeftAmount>(Res.getConstant());
+  }
+  bool isRegList16() const {
+    if (!isRegList())
+      return false;
+
+    int Size = RegList.List->size();
+    if (Size < 2 || Size > 5)
+      return false;
+
+    unsigned R0 = RegList.List->front();
+    unsigned R1 = RegList.List->back();
+    if (!((R0 == Mips::S0 && R1 == Mips::RA) ||
+          (R0 == Mips::S0_64 && R1 == Mips::RA_64)))
+      return false;
+
+    int PrevReg = *RegList.List->begin();
+    for (int i = 1; i < Size - 1; i++) {
+      int Reg = (*(RegList.List))[i];
+      if ( Reg != PrevReg + 1)
+        return false;
+      PrevReg = Reg;
+    }
+
+    return true;
+  }
+  bool isInvNum() const { return Kind == k_Immediate; }
+  bool isLSAImm() const {
+    if (!isConstantImm())
+      return false;
+    int64_t Val = getConstantImm();
+    return 1 <= Val && Val <= 4;
+  }
+  bool isRegList() const { return Kind == k_RegList; }
+  bool isMovePRegPair() const {
+    if (Kind != k_RegList || RegList.List->size() != 2)
+      return false;
+
+    unsigned R0 = RegList.List->front();
+    unsigned R1 = RegList.List->back();
+
+    if ((R0 == Mips::A1 && R1 == Mips::A2) ||
+        (R0 == Mips::A1 && R1 == Mips::A3) ||
+        (R0 == Mips::A2 && R1 == Mips::A3) ||
+        (R0 == Mips::A0 && R1 == Mips::S5) ||
+        (R0 == Mips::A0 && R1 == Mips::S6) ||
+        (R0 == Mips::A0 && R1 == Mips::A1) ||
+        (R0 == Mips::A0 && R1 == Mips::A2) ||
+        (R0 == Mips::A0 && R1 == Mips::A3) ||
+        (R0 == Mips::A1_64 && R1 == Mips::A2_64) ||
+        (R0 == Mips::A1_64 && R1 == Mips::A3_64) ||
+        (R0 == Mips::A2_64 && R1 == Mips::A3_64) ||
+        (R0 == Mips::A0_64 && R1 == Mips::S5_64) ||
+        (R0 == Mips::A0_64 && R1 == Mips::S6_64) ||
+        (R0 == Mips::A0_64 && R1 == Mips::A1_64) ||
+        (R0 == Mips::A0_64 && R1 == Mips::A2_64) ||
+        (R0 == Mips::A0_64 && R1 == Mips::A3_64))
+      return true;
+
+    return false;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  bool isRegPair() const {
+    return Kind == k_RegPair && RegIdx.Index <= 30;
+  }
+
+  unsigned getReg() const override {
+    // As a special case until we sort out the definition of div/divu, accept
+    // $0/$zero here so that MCK_ZERO works correctly.
+    if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
+        RegIdx.Kind & RegKind_GPR)
+      return getGPR32Reg(); // FIXME: GPR64 too
+
+    llvm_unreachable("Invalid access!");
+    return 0;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  int64_t getConstantImm() const {
+    const MCExpr *Val = getImm();
+    int64_t Value = 0;
+    (void)Val->evaluateAsAbsolute(Value);
+    return Value;
+  }
+
+  MipsOperand *getMemBase() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  int64_t getConstantMemOff() const {
+    return static_cast<const MCConstantExpr *>(getMemOff())->getValue();
+  }
+
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == k_RegList) && "Invalid access!");
+    return *(RegList.List);
+  }
+
+  unsigned getRegPair() const {
+    assert((Kind == k_RegPair) && "Invalid access!");
+    return RegIdx.Index;
+  }
+
+  static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
+                                                  MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Token, Parser);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  /// Create a numeric register (e.g. $1). The exact register remains
+  /// unresolved until an instruction successfully matches
+  static std::unique_ptr<MipsOperand>
+  createNumericReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                   SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
+    return CreateReg(Index, Str, RegKind_Numeric, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely a GPR.
+  /// This is typically only used for named registers such as $gp.
+  static std::unique_ptr<MipsOperand>
+  createGPRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+               SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_GPR, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely a FGR.
+  /// This is typically only used for named registers such as $f0.
+  static std::unique_ptr<MipsOperand>
+  createFGRReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+               SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_FGR, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely a HWReg.
+  /// This is typically only used for named registers such as $hwr_cpunum.
+  static std::unique_ptr<MipsOperand>
+  createHWRegsReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                  SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_HWRegs, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an FCC.
+  /// This is typically only used for named registers such as $fcc0.
+  static std::unique_ptr<MipsOperand>
+  createFCCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+               SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_FCC, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an ACC.
+  /// This is typically only used for named registers such as $ac0.
+  static std::unique_ptr<MipsOperand>
+  createACCReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+               SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_ACC, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an MSA128.
+  /// This is typically only used for named registers such as $w0.
+  static std::unique_ptr<MipsOperand>
+  createMSA128Reg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                  SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_MSA128, RegInfo, S, E, Parser);
+  }
+
+  /// Create a register that is definitely an MSACtrl.
+  /// This is typically only used for named registers such as $msaaccess.
+  static std::unique_ptr<MipsOperand>
+  createMSACtrlReg(unsigned Index, StringRef Str, const MCRegisterInfo *RegInfo,
+                   SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, Str, RegKind_MSACtrl, RegInfo, S, E, Parser);
+  }
+
+  static std::unique_ptr<MipsOperand>
+  CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Immediate, Parser);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<MipsOperand>
+  CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
+            SMLoc E, MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_Memory, Parser);
+    Op->Mem.Base = Base.release();
+    Op->Mem.Off = Off;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<MipsOperand>
+  CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
+                MipsAsmParser &Parser) {
+    assert (Regs.size() > 0 && "Empty list not allowed");
+
+    auto Op = make_unique<MipsOperand>(k_RegList, Parser);
+    Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
+    Op->StartLoc = StartLoc;
+    Op->EndLoc = EndLoc;
+    return Op;
+  }
+
+  static std::unique_ptr<MipsOperand> CreateRegPair(const MipsOperand &MOP,
+                                                    SMLoc S, SMLoc E,
+                                                    MipsAsmParser &Parser) {
+    auto Op = make_unique<MipsOperand>(k_RegPair, Parser);
+    Op->RegIdx.Index = MOP.RegIdx.Index;
+    Op->RegIdx.RegInfo = MOP.RegIdx.RegInfo;
+    Op->RegIdx.Kind = MOP.RegIdx.Kind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  bool isGPRAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
+  }
+  bool isMM16AsmReg() const {
+    if (!(isRegIdx() && RegIdx.Kind))
+      return false;
+    return ((RegIdx.Index >= 2 && RegIdx.Index <= 7)
+            || RegIdx.Index == 16 || RegIdx.Index == 17);
+  }
+  bool isMM16AsmRegZero() const {
+    if (!(isRegIdx() && RegIdx.Kind))
+      return false;
+    return (RegIdx.Index == 0 ||
+            (RegIdx.Index >= 2 && RegIdx.Index <= 7) ||
+            RegIdx.Index == 17);
+  }
+  bool isMM16AsmRegMoveP() const {
+    if (!(isRegIdx() && RegIdx.Kind))
+      return false;
+    return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) ||
+      (RegIdx.Index >= 16 && RegIdx.Index <= 20));
+  }
+  bool isFGRAsmReg() const {
+    // AFGR64 is $0-$15 but we handle this in getAFGR64()
+    return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
+  }
+  bool isHWRegsAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
+  }
+  bool isCCRAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
+  }
+  bool isFCCAsmReg() const {
+    if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
+      return false;
+    if (!AsmParser.hasEightFccRegisters())
+      return RegIdx.Index == 0;
+    return RegIdx.Index <= 7;
+  }
+  bool isACCAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
+  }
+  bool isCOP0AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP0 && RegIdx.Index <= 31;
+  }
+  bool isCOP2AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
+  }
+  bool isCOP3AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
+  }
+  bool isMSA128AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
+  }
+  bool isMSACtrlAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  virtual ~MipsOperand() {
+    switch (Kind) {
+    case k_Immediate:
+      break;
+    case k_Memory:
+      delete Mem.Base;
+      break;
+    case k_RegList:
+      delete RegList.List;
+    case k_RegisterIndex:
+    case k_Token:
+    case k_RegPair:
+      break;
+    }
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Immediate:
+      OS << "Imm<";
+      OS << *Imm.Val;
+      OS << ">";
+      break;
+    case k_Memory:
+      OS << "Mem<";
+      Mem.Base->print(OS);
+      OS << ", ";
+      OS << *Mem.Off;
+      OS << ">";
+      break;
+    case k_RegisterIndex:
+      OS << "RegIdx<" << RegIdx.Index << ":" << RegIdx.Kind << ", "
+         << StringRef(RegIdx.Tok.Data, RegIdx.Tok.Length) << ">";
+      break;
+    case k_Token:
+      OS << getToken();
+      break;
+    case k_RegList:
+      OS << "RegList< ";
+      for (auto Reg : (*RegList.List))
+        OS << Reg << " ";
+      OS <<  ">";
+      break;
+    case k_RegPair:
+      OS << "RegPair<" << RegIdx.Index << "," << RegIdx.Index + 1 << ">";
+      break;
+    }
+  }
+
+  bool isValidForTie(const MipsOperand &Other) const {
+    if (Kind != Other.Kind)
+      return false;
+
+    switch (Kind) {
+    default:
+      llvm_unreachable("Unexpected kind");
+      return false;
+    case k_RegisterIndex: {
+      StringRef Token(RegIdx.Tok.Data, RegIdx.Tok.Length);
+      StringRef OtherToken(Other.RegIdx.Tok.Data, Other.RegIdx.Tok.Length);
+      return Token == OtherToken;
+    }
+    }
+  }
+}; // class MipsOperand
+} // namespace
+
+namespace llvm {
+extern const MCInstrDesc MipsInsts[];
+}
+static const MCInstrDesc &getInstDesc(unsigned Opcode) {
+  return MipsInsts[Opcode];
+}
+
+static bool hasShortDelaySlot(unsigned Opcode) {
+  switch (Opcode) {
+    case Mips::JALS_MM:
+    case Mips::JALRS_MM:
+    case Mips::JALRS16_MM:
+    case Mips::BGEZALS_MM:
+    case Mips::BLTZALS_MM:
+      return true;
+    default:
+      return false;
+  }
+}
+
+static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) {
+  if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+    return &SRExpr->getSymbol();
+  }
+
+  if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) {
+    const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS());
+    const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS());
+
+    if (LHSSym)
+      return LHSSym;
+
+    if (RHSSym)
+      return RHSSym;
+
+    return nullptr;
+  }
+
+  if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
+    return getSingleMCSymbol(UExpr->getSubExpr());
+
+  return nullptr;
+}
+
+static unsigned countMCSymbolRefExpr(const MCExpr *Expr) {
+  if (isa<MCSymbolRefExpr>(Expr))
+    return 1;
+
+  if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr))
+    return countMCSymbolRefExpr(BExpr->getLHS()) +
+           countMCSymbolRefExpr(BExpr->getRHS());
+
+  if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr))
+    return countMCSymbolRefExpr(UExpr->getSubExpr());
+
+  return 0;
+}
+
+bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+                                       MCStreamer &Out,
+                                       const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  bool ExpandedJalSym = false;
+
+  Inst.setLoc(IDLoc);
+
+  if (MCID.isBranch() || MCID.isCall()) {
+    const unsigned Opcode = Inst.getOpcode();
+    MCOperand Offset;
+
+    switch (Opcode) {
+    default:
+      break;
+    case Mips::BBIT0:
+    case Mips::BBIT032:
+    case Mips::BBIT1:
+    case Mips::BBIT132:
+      assert(hasCnMips() && "instruction only valid for octeon cpus");
+      LLVM_FALLTHROUGH;
+
+    case Mips::BEQ:
+    case Mips::BNE:
+    case Mips::BEQ_MM:
+    case Mips::BNE_MM:
+      assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+      Offset = Inst.getOperand(2);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(),
+                            1LL << (inMicroMipsMode() ? 1 : 2)))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BGEZ:
+    case Mips::BGTZ:
+    case Mips::BLEZ:
+    case Mips::BLTZ:
+    case Mips::BGEZAL:
+    case Mips::BLTZAL:
+    case Mips::BC1F:
+    case Mips::BC1T:
+    case Mips::BGEZ_MM:
+    case Mips::BGTZ_MM:
+    case Mips::BLEZ_MM:
+    case Mips::BLTZ_MM:
+    case Mips::BGEZAL_MM:
+    case Mips::BLTZAL_MM:
+    case Mips::BC1F_MM:
+    case Mips::BC1T_MM:
+    case Mips::BC1EQZC_MMR6:
+    case Mips::BC1NEZC_MMR6:
+    case Mips::BC2EQZC_MMR6:
+    case Mips::BC2NEZC_MMR6:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(inMicroMipsMode() ? 17 : 18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(),
+                            1LL << (inMicroMipsMode() ? 1 : 2)))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BGEC:    case Mips::BGEC_MMR6:
+    case Mips::BLTC:    case Mips::BLTC_MMR6:
+    case Mips::BGEUC:   case Mips::BGEUC_MMR6:
+    case Mips::BLTUC:   case Mips::BLTUC_MMR6:
+    case Mips::BEQC:    case Mips::BEQC_MMR6:
+    case Mips::BNEC:    case Mips::BNEC_MMR6:
+      assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+      Offset = Inst.getOperand(2);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BLEZC:   case Mips::BLEZC_MMR6:
+    case Mips::BGEZC:   case Mips::BGEZC_MMR6:
+    case Mips::BGTZC:   case Mips::BGTZC_MMR6:
+    case Mips::BLTZC:   case Mips::BLTZC_MMR6:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(18, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BEQZC:   case Mips::BEQZC_MMR6:
+    case Mips::BNEZC:   case Mips::BNEZC_MMR6:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isIntN(23, Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 1LL << 2))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    case Mips::BEQZ16_MM:
+    case Mips::BEQZC16_MMR6:
+    case Mips::BNEZ16_MM:
+    case Mips::BNEZC16_MMR6:
+      assert(MCID.getNumOperands() == 2 && "unexpected number of operands");
+      Offset = Inst.getOperand(1);
+      if (!Offset.isImm())
+        break; // We'll deal with this situation later on when applying fixups.
+      if (!isInt<8>(Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 2LL))
+        return Error(IDLoc, "branch to misaligned address");
+      break;
+    }
+  }
+
+  // SSNOP is deprecated on MIPS32r6/MIPS64r6
+  // We still accept it but it is a normal nop.
+  if (hasMips32r6() && Inst.getOpcode() == Mips::SSNOP) {
+    std::string ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+    Warning(IDLoc, "ssnop is deprecated for " + ISA + " and is equivalent to a "
+                                                      "nop instruction");
+  }
+
+  if (hasCnMips()) {
+    const unsigned Opcode = Inst.getOpcode();
+    MCOperand Opnd;
+    int Imm;
+
+    switch (Opcode) {
+      default:
+        break;
+
+      case Mips::BBIT0:
+      case Mips::BBIT032:
+      case Mips::BBIT1:
+      case Mips::BBIT132:
+        assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+        // The offset is handled above
+        Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > (Opcode == Mips::BBIT0 ||
+                              Opcode == Mips::BBIT1 ? 63 : 31))
+          return Error(IDLoc, "immediate operand value out of range");
+        if (Imm > 31) {
+          Inst.setOpcode(Opcode == Mips::BBIT0 ? Mips::BBIT032
+                                               : Mips::BBIT132);
+          Inst.getOperand(1).setImm(Imm - 32);
+        }
+        break;
+
+      case Mips::SEQi:
+      case Mips::SNEi:
+        assert(MCID.getNumOperands() == 3 && "unexpected number of operands");
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!isInt<10>(Imm))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+    }
+  }
+
+  // For PIC code convert unconditional jump to unconditional branch.
+  if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
+      inPicMode()) {
+    MCInst BInst;
+    BInst.setOpcode(inMicroMipsMode() ? Mips::BEQ_MM : Mips::BEQ);
+    BInst.addOperand(MCOperand::createReg(Mips::ZERO));
+    BInst.addOperand(MCOperand::createReg(Mips::ZERO));
+    BInst.addOperand(Inst.getOperand(0));
+    Inst = BInst;
+  }
+
+  // This expansion is not in a function called by tryExpandInstruction()
+  // because the pseudo-instruction doesn't have a distinct opcode.
+  if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) &&
+      inPicMode()) {
+    warnIfNoMacro(IDLoc);
+
+    const MCExpr *JalExpr = Inst.getOperand(0).getExpr();
+
+    // We can do this expansion if there's only 1 symbol in the argument
+    // expression.
+    if (countMCSymbolRefExpr(JalExpr) > 1)
+      return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode");
+
+    // FIXME: This is checking the expression can be handled by the later stages
+    //        of the assembler. We ought to leave it to those later stages.
+    const MCSymbol *JalSym = getSingleMCSymbol(JalExpr);
+
+    // FIXME: Add support for label+offset operands (currently causes an error).
+    // FIXME: Add support for forward-declared local symbols.
+    // FIXME: Add expansion for when the LargeGOT option is enabled.
+    if (JalSym->isInSection() || JalSym->isTemporary() ||
+        (JalSym->isELF() && cast<MCSymbolELF>(JalSym)->getBinding() == ELF::STB_LOCAL)) {
+      if (isABI_O32()) {
+        // If it's a local symbol and the O32 ABI is being used, we expand to:
+        //  lw $25, 0($gp)
+        //    R_(MICRO)MIPS_GOT16  label
+        //  addiu $25, $25, 0
+        //    R_(MICRO)MIPS_LO16   label
+        //  jalr  $25
+        const MCExpr *Got16RelocExpr =
+            MipsMCExpr::create(MipsMCExpr::MEK_GOT, JalExpr, getContext());
+        const MCExpr *Lo16RelocExpr =
+            MipsMCExpr::create(MipsMCExpr::MEK_LO, JalExpr, getContext());
+
+        TOut.emitRRX(Mips::LW, Mips::T9, Mips::GP,
+                     MCOperand::createExpr(Got16RelocExpr), IDLoc, STI);
+        TOut.emitRRX(Mips::ADDiu, Mips::T9, Mips::T9,
+                     MCOperand::createExpr(Lo16RelocExpr), IDLoc, STI);
+      } else if (isABI_N32() || isABI_N64()) {
+        // If it's a local symbol and the N32/N64 ABIs are being used,
+        // we expand to:
+        //  lw/ld $25, 0($gp)
+        //    R_(MICRO)MIPS_GOT_DISP  label
+        //  jalr  $25
+        const MCExpr *GotDispRelocExpr =
+            MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, JalExpr, getContext());
+
+        TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9,
+                     Mips::GP, MCOperand::createExpr(GotDispRelocExpr), IDLoc,
+                     STI);
+      }
+    } else {
+      // If it's an external/weak symbol, we expand to:
+      //  lw/ld    $25, 0($gp)
+      //    R_(MICRO)MIPS_CALL16  label
+      //  jalr  $25
+      const MCExpr *Call16RelocExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, JalExpr, getContext());
+
+      TOut.emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP,
+                   MCOperand::createExpr(Call16RelocExpr), IDLoc, STI);
+    }
+
+    MCInst JalrInst;
+    if (IsCpRestoreSet && inMicroMipsMode())
+      JalrInst.setOpcode(Mips::JALRS_MM);
+    else
+      JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+    JalrInst.addOperand(MCOperand::createReg(Mips::RA));
+    JalrInst.addOperand(MCOperand::createReg(Mips::T9));
+
+    // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR.
+    // This relocation is supposed to be an optimization hint for the linker
+    // and is not necessary for correctness.
+
+    Inst = JalrInst;
+    ExpandedJalSym = true;
+  }
+
+  bool IsPCRelativeLoad = (MCID.TSFlags & MipsII::IsPCRelativeLoad) != 0;
+  if ((MCID.mayLoad() || MCID.mayStore()) && !IsPCRelativeLoad) {
+    // Check the offset of memory operand, if it is a symbol
+    // reference or immediate we may have to expand instructions.
+    for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+      const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+          (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+        MCOperand &Op = Inst.getOperand(i);
+        if (Op.isImm()) {
+          int MemOffset = Op.getImm();
+          if (MemOffset < -32768 || MemOffset > 32767) {
+            // Offset can't exceed 16bit value.
+            expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), true);
+            return getParser().hasPendingError();
+          }
+        } else if (Op.isExpr()) {
+          const MCExpr *Expr = Op.getExpr();
+          if (Expr->getKind() == MCExpr::SymbolRef) {
+            const MCSymbolRefExpr *SR =
+                static_cast<const MCSymbolRefExpr *>(Expr);
+            if (SR->getKind() == MCSymbolRefExpr::VK_None) {
+              // Expand symbol.
+              expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+              return getParser().hasPendingError();
+            }
+          } else if (!isEvaluated(Expr)) {
+            expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad(), false);
+            return getParser().hasPendingError();
+          }
+        }
+      }
+    } // for
+  }   // if load/store
+
+  if (inMicroMipsMode()) {
+    if (MCID.mayLoad()) {
+      // Try to create 16-bit GP relative load instruction.
+      for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
+        const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+        if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+            (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+          MCOperand &Op = Inst.getOperand(i);
+          if (Op.isImm()) {
+            int MemOffset = Op.getImm();
+            MCOperand &DstReg = Inst.getOperand(0);
+            MCOperand &BaseReg = Inst.getOperand(1);
+            if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) &&
+                getContext().getRegisterInfo()->getRegClass(
+                  Mips::GPRMM16RegClassID).contains(DstReg.getReg()) &&
+                (BaseReg.getReg() == Mips::GP ||
+                BaseReg.getReg() == Mips::GP_64)) {
+
+              TOut.emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset,
+                           IDLoc, STI);
+              return false;
+            }
+          }
+        }
+      } // for
+    }   // if load
+
+    // TODO: Handle this with the AsmOperandClass.PredicateMethod.
+
+    MCOperand Opnd;
+    int Imm;
+
+    switch (Inst.getOpcode()) {
+      default:
+        break;
+      case Mips::ADDIUSP_MM:
+        Opnd = Inst.getOperand(0);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1032 || Imm > 1028 || (Imm < 8 && Imm > -12) ||
+            Imm % 4 != 0)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::SLL16_MM:
+      case Mips::SRL16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 1 || Imm > 8)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LI16_MM:
+        Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1 || Imm > 126)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUR2_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!(Imm == 1 || Imm == -1 ||
+              ((Imm % 4 == 0) && Imm < 28 && Imm > 0)))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ANDI16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!(Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+              Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+              Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LBU16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1 || Imm > 14)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::SB16_MM:
+      case Mips::SB16_MMR6:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 15)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LHU16_MM:
+      case Mips::SH16_MM:
+      case Mips::SH16_MMR6:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 30 || (Imm % 2 != 0))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LW16_MM:
+      case Mips::SW16_MM:
+      case Mips::SW16_MMR6:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 0 || Imm > 60 || (Imm % 4 != 0))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUPC_MM:
+        MCOperand Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        int Imm = Opnd.getImm();
+        if ((Imm % 4 != 0) || !isInt<25>(Imm))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+    }
+  }
+
+  bool FillDelaySlot =
+      MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder();
+  if (FillDelaySlot)
+    TOut.emitDirectiveSetNoReorder();
+
+  MacroExpanderResultTy ExpandResult =
+      tryExpandInstruction(Inst, IDLoc, Out, STI);
+  switch (ExpandResult) {
+  case MER_NotAMacro:
+    Out.EmitInstruction(Inst, *STI);
+    break;
+  case MER_Success:
+    break;
+  case MER_Fail:
+    return true;
+  }
+
+  // We know we emitted an instruction on the MER_NotAMacro or MER_Success path.
+  // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS.
+  if (inMicroMipsMode())
+    TOut.setUsesMicroMips();
+
+  // If this instruction has a delay slot and .set reorder is active,
+  // emit a NOP after it.
+  if (FillDelaySlot) {
+    TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc, STI);
+    TOut.emitDirectiveSetReorder();
+  }
+
+  if ((Inst.getOpcode() == Mips::JalOneReg ||
+       Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) &&
+      isPicAndNotNxxAbi()) {
+    if (IsCpRestoreSet) {
+      // We need a NOP between the JALR and the LW:
+      // If .set reorder has been used, we've already emitted a NOP.
+      // If .set noreorder has been used, we need to emit a NOP at this point.
+      if (!AssemblerOptions.back()->isReorder())
+        TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst.getOpcode()), IDLoc,
+                                STI);
+
+      // Load the $gp from the stack.
+      TOut.emitGPRestore(CpRestoreOffset, IDLoc, STI);
+    } else
+      Warning(IDLoc, "no .cprestore used in PIC mode");
+  }
+
+  return false;
+}
+
+MipsAsmParser::MacroExpanderResultTy
+MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  switch (Inst.getOpcode()) {
+  default:
+    return MER_NotAMacro;
+  case Mips::LoadImm32:
+    return expandLoadImm(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::LoadImm64:
+    return expandLoadImm(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::LoadAddrImm32:
+  case Mips::LoadAddrImm64:
+    assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+    assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) &&
+           "expected immediate operand kind");
+
+    return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister,
+                             Inst.getOperand(1),
+                             Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc,
+                             Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::LoadAddrReg32:
+  case Mips::LoadAddrReg64:
+    assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+    assert(Inst.getOperand(1).isReg() && "expected register operand kind");
+    assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) &&
+           "expected immediate operand kind");
+
+    return expandLoadAddress(Inst.getOperand(0).getReg(),
+                             Inst.getOperand(1).getReg(), Inst.getOperand(2),
+                             Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc,
+                             Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::B_MM_Pseudo:
+  case Mips::B_MMR6_Pseudo:
+    return expandUncondBranchMMPseudo(Inst, IDLoc, Out, STI) ? MER_Fail
+                                                             : MER_Success;
+  case Mips::SWM_MM:
+  case Mips::LWM_MM:
+    return expandLoadStoreMultiple(Inst, IDLoc, Out, STI) ? MER_Fail
+                                                          : MER_Success;
+  case Mips::JalOneReg:
+  case Mips::JalTwoReg:
+    return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::BneImm:
+  case Mips::BeqImm:
+    return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::BLT:
+  case Mips::BLE:
+  case Mips::BGE:
+  case Mips::BGT:
+  case Mips::BLTU:
+  case Mips::BLEU:
+  case Mips::BGEU:
+  case Mips::BGTU:
+  case Mips::BLTL:
+  case Mips::BLEL:
+  case Mips::BGEL:
+  case Mips::BGTL:
+  case Mips::BLTUL:
+  case Mips::BLEUL:
+  case Mips::BGEUL:
+  case Mips::BGTUL:
+  case Mips::BLTImmMacro:
+  case Mips::BLEImmMacro:
+  case Mips::BGEImmMacro:
+  case Mips::BGTImmMacro:
+  case Mips::BLTUImmMacro:
+  case Mips::BLEUImmMacro:
+  case Mips::BGEUImmMacro:
+  case Mips::BGTUImmMacro:
+  case Mips::BLTLImmMacro:
+  case Mips::BLELImmMacro:
+  case Mips::BGELImmMacro:
+  case Mips::BGTLImmMacro:
+  case Mips::BLTULImmMacro:
+  case Mips::BLEULImmMacro:
+  case Mips::BGEULImmMacro:
+  case Mips::BGTULImmMacro:
+    return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::SDivMacro:
+    return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
+                                                         : MER_Success;
+  case Mips::DSDivMacro:
+    return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
+                                                        : MER_Success;
+  case Mips::UDivMacro:
+    return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
+                                                          : MER_Success;
+  case Mips::DUDivMacro:
+    return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
+                                                         : MER_Success;
+  case Mips::PseudoTRUNC_W_S:
+    return expandTrunc(Inst, false, false, IDLoc, Out, STI) ? MER_Fail
+                                                            : MER_Success;
+  case Mips::PseudoTRUNC_W_D32:
+    return expandTrunc(Inst, true, false, IDLoc, Out, STI) ? MER_Fail
+                                                           : MER_Success;
+  case Mips::PseudoTRUNC_W_D:
+    return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail
+                                                          : MER_Success;
+  case Mips::Ulh:
+    return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::Ulhu:
+    return expandUlh(Inst, false, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::Ush:
+    return expandUsh(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::Ulw:
+  case Mips::Usw:
+    return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::NORImm:
+    return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::ADDi:
+  case Mips::ADDiu:
+  case Mips::SLTi:
+  case Mips::SLTiu:
+    if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
+        Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
+      int64_t ImmValue = Inst.getOperand(2).getImm();
+      if (isInt<16>(ImmValue))
+        return MER_NotAMacro;
+      return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
+                                                         : MER_Success;
+    }
+    return MER_NotAMacro;
+  case Mips::ANDi:
+  case Mips::ORi:
+  case Mips::XORi:
+    if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
+        Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
+      int64_t ImmValue = Inst.getOperand(2).getImm();
+      if (isUInt<16>(ImmValue))
+        return MER_NotAMacro;
+      return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail
+                                                         : MER_Success;
+    }
+    return MER_NotAMacro;
+  case Mips::ROL:
+  case Mips::ROR:
+    return expandRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::ROLImm:
+  case Mips::RORImm:
+    return expandRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::DROL:
+  case Mips::DROR:
+    return expandDRotation(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::DROLImm:
+  case Mips::DRORImm:
+    return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::ABSMacro:
+    return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::LDMacro:
+  case Mips::SDMacro:
+    return expandLoadStoreDMacro(Inst, IDLoc, Out, STI,
+                                 Inst.getOpcode() == Mips::LDMacro)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::SEQMacro:
+    return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::SEQIMacro:
+    return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  }
+}
+
+bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  // Create a JALR instruction which is going to replace the pseudo-JAL.
+  MCInst JalrInst;
+  JalrInst.setLoc(IDLoc);
+  const MCOperand FirstRegOp = Inst.getOperand(0);
+  const unsigned Opcode = Inst.getOpcode();
+
+  if (Opcode == Mips::JalOneReg) {
+    // jal $rs => jalr $rs
+    if (IsCpRestoreSet && inMicroMipsMode()) {
+      JalrInst.setOpcode(Mips::JALRS16_MM);
+      JalrInst.addOperand(FirstRegOp);
+    } else if (inMicroMipsMode()) {
+      JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM);
+      JalrInst.addOperand(FirstRegOp);
+    } else {
+      JalrInst.setOpcode(Mips::JALR);
+      JalrInst.addOperand(MCOperand::createReg(Mips::RA));
+      JalrInst.addOperand(FirstRegOp);
+    }
+  } else if (Opcode == Mips::JalTwoReg) {
+    // jal $rd, $rs => jalr $rd, $rs
+    if (IsCpRestoreSet && inMicroMipsMode())
+      JalrInst.setOpcode(Mips::JALRS_MM);
+    else
+      JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR);
+    JalrInst.addOperand(FirstRegOp);
+    const MCOperand SecondRegOp = Inst.getOperand(1);
+    JalrInst.addOperand(SecondRegOp);
+  }
+  Out.EmitInstruction(JalrInst, *STI);
+
+  // If .set reorder is active and branch instruction has a delay slot,
+  // emit a NOP after it.
+  const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode());
+  if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
+    TOut.emitEmptyDelaySlot(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc,
+                            STI);
+
+  return false;
+}
+
+/// Can the value be represented by a unsigned N-bit value and a shift left?
+template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) {
+  unsigned BitNum = findFirstSet(x);
+
+  return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum);
+}
+
+/// Load (or add) an immediate into a register.
+///
+/// @param ImmValue     The immediate to load.
+/// @param DstReg       The register that will hold the immediate.
+/// @param SrcReg       A register to add to the immediate or Mips::NoRegister
+///                     for a simple initialization.
+/// @param Is32BitImm   Is ImmValue 32-bit or 64-bit?
+/// @param IsAddress    True if the immediate represents an address. False if it
+///                     is an integer.
+/// @param IDLoc        Location of the immediate in the source file.
+bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
+                                  unsigned SrcReg, bool Is32BitImm,
+                                  bool IsAddress, SMLoc IDLoc, MCStreamer &Out,
+                                  const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  if (!Is32BitImm && !isGP64bit()) {
+    Error(IDLoc, "instruction requires a 64-bit architecture");
+    return true;
+  }
+
+  if (Is32BitImm) {
+    if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+      // Sign extend up to 64-bit so that the predicates match the hardware
+      // behaviour. In particular, isInt<16>(0xffff8000) and similar should be
+      // true.
+      ImmValue = SignExtend64<32>(ImmValue);
+    } else {
+      Error(IDLoc, "instruction requires a 32-bit immediate");
+      return true;
+    }
+  }
+
+  unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg();
+  unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu;
+
+  bool UseSrcReg = false;
+  if (SrcReg != Mips::NoRegister)
+    UseSrcReg = true;
+
+  unsigned TmpReg = DstReg;
+  if (UseSrcReg &&
+      getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
+    // At this point we need AT to perform the expansions and we exit if it is
+    // not available.
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+    TmpReg = ATReg;
+  }
+
+  if (isInt<16>(ImmValue)) {
+    if (!UseSrcReg)
+      SrcReg = ZeroReg;
+
+    // This doesn't quite follow the usual ABI expectations for N32 but matches
+    // traditional assembler behaviour. N32 would normally use addiu for both
+    // integers and addresses.
+    if (IsAddress && !Is32BitImm) {
+      TOut.emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
+      return false;
+    }
+
+    TOut.emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, STI);
+    return false;
+  }
+
+  if (isUInt<16>(ImmValue)) {
+    unsigned TmpReg = DstReg;
+    if (SrcReg == DstReg) {
+      TmpReg = getATReg(IDLoc);
+      if (!TmpReg)
+        return true;
+    }
+
+    TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, STI);
+    return false;
+  }
+
+  if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) {
+    warnIfNoMacro(IDLoc);
+
+    uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
+    uint16_t Bits15To0 = ImmValue & 0xffff;
+
+    if (!Is32BitImm && !isInt<32>(ImmValue)) {
+      // Traditional behaviour seems to special case this particular value. It's
+      // not clear why other masks are handled differently.
+      if (ImmValue == 0xffffffff) {
+        TOut.emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, STI);
+        TOut.emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, STI);
+        if (UseSrcReg)
+          TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+        return false;
+      }
+
+      // Expand to an ORi instead of a LUi to avoid sign-extending into the
+      // upper 32 bits.
+      TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI);
+      if (Bits15To0)
+        TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
+      if (UseSrcReg)
+        TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+      return false;
+    }
+
+    TOut.emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, STI);
+    if (Bits15To0)
+      TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+    return false;
+  }
+
+  if (isShiftedUIntAtAnyPosition<16>(ImmValue)) {
+    if (Is32BitImm) {
+      Error(IDLoc, "instruction requires a 32-bit immediate");
+      return true;
+    }
+
+    // Traditionally, these immediates are shifted as little as possible and as
+    // such we align the most significant bit to bit 15 of our temporary.
+    unsigned FirstSet = findFirstSet((uint64_t)ImmValue);
+    unsigned LastSet = findLastSet((uint64_t)ImmValue);
+    unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet));
+    uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff;
+    TOut.emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, STI);
+    TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, STI);
+
+    if (UseSrcReg)
+      TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  warnIfNoMacro(IDLoc);
+
+  // The remaining case is packed with a sequence of dsll and ori with zeros
+  // being omitted and any neighbouring dsll's being coalesced.
+  // The highest 32-bit's are equivalent to a 32-bit immediate load.
+
+  // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register.
+  if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false,
+                    IDLoc, Out, STI))
+    return false;
+
+  // Shift and accumulate into the register. If a 16-bit chunk is zero, then
+  // skip it and defer the shift to the next chunk.
+  unsigned ShiftCarriedForwards = 16;
+  for (int BitNum = 16; BitNum >= 0; BitNum -= 16) {
+    uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff;
+
+    if (ImmChunk != 0) {
+      TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
+      TOut.emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, STI);
+      ShiftCarriedForwards = 0;
+    }
+
+    ShiftCarriedForwards += 16;
+  }
+  ShiftCarriedForwards -= 16;
+
+  // Finish any remaining shifts left by trailing zeros.
+  if (ShiftCarriedForwards)
+    TOut.emitDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, STI);
+
+  if (UseSrcReg)
+    TOut.emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+  return false;
+}
+
+bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
+                                  MCStreamer &Out, const MCSubtargetInfo *STI) {
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+
+  if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister,
+                    Is32BitImm, false, IDLoc, Out, STI))
+    return true;
+
+  return false;
+}
+
+bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg,
+                                      const MCOperand &Offset,
+                                      bool Is32BitAddress, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  // la can't produce a usable address when addresses are 64-bit.
+  if (Is32BitAddress && ABI.ArePtrs64bit()) {
+    // FIXME: Demote this to a warning and continue as if we had 'dla' instead.
+    //        We currently can't do this because we depend on the equality
+    //        operator and N64 can end up with a GPR32/GPR64 mismatch.
+    Error(IDLoc, "la used to load 64-bit address");
+    // Continue as if we had 'dla' instead.
+    Is32BitAddress = false;
+    return true;
+  }
+
+  // dla requires 64-bit addresses.
+  if (!Is32BitAddress && !hasMips3()) {
+    Error(IDLoc, "instruction requires a 64-bit architecture");
+    return true;
+  }
+
+  if (!Offset.isImm())
+    return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg,
+                                   Is32BitAddress, IDLoc, Out, STI);
+
+  if (!ABI.ArePtrs64bit()) {
+    // Continue as if we had 'la' whether we had 'la' or 'dla'.
+    Is32BitAddress = true;
+  }
+
+  return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true,
+                       IDLoc, Out, STI);
+}
+
+bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
+                                            unsigned DstReg, unsigned SrcReg,
+                                            bool Is32BitSym, SMLoc IDLoc,
+                                            MCStreamer &Out,
+                                            const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  bool UseSrcReg = SrcReg != Mips::NoRegister;
+  warnIfNoMacro(IDLoc);
+
+  if (inPicMode() && ABI.IsO32()) {
+    MCValue Res;
+    if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+      Error(IDLoc, "expected relocatable expression");
+      return true;
+    }
+    if (Res.getSymB() != nullptr) {
+      Error(IDLoc, "expected relocatable expression with only one symbol");
+      return true;
+    }
+
+    // The case where the result register is $25 is somewhat special. If the
+    // symbol in the final relocation is external and not modified with a
+    // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
+    if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
+        Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() &&
+        !Res.getSymA()->getSymbol().isTemporary()) {
+      const MCExpr *CallExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+      TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
+                   MCOperand::createExpr(CallExpr), IDLoc, STI);
+      return false;
+    }
+
+    // The remaining cases are:
+    //   External GOT: lw $tmp, %got(symbol+offset)($gp)
+    //                >addiu $tmp, $tmp, %lo(offset)
+    //                >addiu $rd, $tmp, $rs
+    //   Local GOT:    lw $tmp, %got(symbol+offset)($gp)
+    //                 addiu $tmp, $tmp, %lo(symbol+offset)($gp)
+    //                >addiu $rd, $tmp, $rs
+    // The addiu's marked with a '>' may be omitted if they are redundant. If
+    // this happens then the last instruction must use $rd as the result
+    // register.
+    const MipsMCExpr *GotExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
+    const MCExpr *LoExpr = nullptr;
+    if (Res.getSymA()->getSymbol().isInSection() ||
+        Res.getSymA()->getSymbol().isTemporary())
+      LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+    else if (Res.getConstant() != 0) {
+      // External symbols fully resolve the symbol with just the %got(symbol)
+      // but we must still account for any offset to the symbol for expressions
+      // like symbol+8.
+      LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
+    }
+
+    unsigned TmpReg = DstReg;
+    if (UseSrcReg &&
+        getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+                                                               SrcReg)) {
+      // If $rs is the same as $rd, we need to use AT.
+      // If it is not available we exit.
+      unsigned ATReg = getATReg(IDLoc);
+      if (!ATReg)
+        return true;
+      TmpReg = ATReg;
+    }
+
+    TOut.emitRRX(Mips::LW, TmpReg, ABI.GetGlobalPtr(),
+                 MCOperand::createExpr(GotExpr), IDLoc, STI);
+
+    if (LoExpr)
+      TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+                   IDLoc, STI);
+
+    if (UseSrcReg)
+      TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  const MipsMCExpr *HiExpr =
+      MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
+  const MipsMCExpr *LoExpr =
+      MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+
+  // This is the 64-bit symbol address expansion.
+  if (ABI.ArePtrs64bit() && isGP64bit()) {
+    // We always need AT for the 64-bit expansion.
+    // If it is not available we exit.
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+
+    const MipsMCExpr *HighestExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
+    const MipsMCExpr *HigherExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
+
+    if (UseSrcReg &&
+        getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+                                                               SrcReg)) {
+      // If $rs is the same as $rd:
+      // (d)la $rd, sym($rd) => lui    $at, %highest(sym)
+      //                        daddiu $at, $at, %higher(sym)
+      //                        dsll   $at, $at, 16
+      //                        daddiu $at, $at, %hi(sym)
+      //                        dsll   $at, $at, 16
+      //                        daddiu $at, $at, %lo(sym)
+      //                        daddu  $rd, $at, $rd
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                  STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+                   MCOperand::createExpr(HigherExpr), IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
+                   IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+                   IDLoc, STI);
+      TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
+
+      return false;
+    }
+
+    // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
+    // (d)la $rd, sym/sym($rs) => lui    $rd, %highest(sym)
+    //                            lui    $at, %hi(sym)
+    //                            daddiu $rd, $rd, %higher(sym)
+    //                            daddiu $at, $at, %lo(sym)
+    //                            dsll32 $rd, $rd, 0
+    //                            daddu  $rd, $rd, $at
+    //                            (daddu  $rd, $rd, $rs)
+    TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                STI);
+    TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+    TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                 MCOperand::createExpr(HigherExpr), IDLoc, STI);
+    TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+                 IDLoc, STI);
+    TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+    TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
+    if (UseSrcReg)
+      TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+
+    return false;
+  }
+
+  // And now, the 32-bit symbol address expansion:
+  // If $rs is the same as $rd:
+  // (d)la $rd, sym($rd)     => lui   $at, %hi(sym)
+  //                            ori   $at, $at, %lo(sym)
+  //                            addu  $rd, $at, $rd
+  // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
+  // (d)la $rd, sym/sym($rs) => lui   $rd, %hi(sym)
+  //                            ori   $rd, $rd, %lo(sym)
+  //                            (addu $rd, $rd, $rs)
+  unsigned TmpReg = DstReg;
+  if (UseSrcReg &&
+      getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg)) {
+    // If $rs is the same as $rd, we need to use AT.
+    // If it is not available we exit.
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+    TmpReg = ATReg;
+  }
+
+  TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+  TOut.emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+               IDLoc, STI);
+
+  if (UseSrcReg)
+    TOut.emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+  else
+    assert(
+        getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, TmpReg));
+
+  return false;
+}
+
+bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
+                                               MCStreamer &Out,
+                                               const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert(getInstDesc(Inst.getOpcode()).getNumOperands() == 1 &&
+         "unexpected number of operands");
+
+  MCOperand Offset = Inst.getOperand(0);
+  if (Offset.isExpr()) {
+    Inst.clear();
+    Inst.setOpcode(Mips::BEQ_MM);
+    Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+    Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+    Inst.addOperand(MCOperand::createExpr(Offset.getExpr()));
+  } else {
+    assert(Offset.isImm() && "expected immediate operand kind");
+    if (isInt<11>(Offset.getImm())) {
+      // If offset fits into 11 bits then this instruction becomes microMIPS
+      // 16-bit unconditional branch instruction.
+      if (inMicroMipsMode())
+        Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM);
+    } else {
+      if (!isInt<17>(Offset.getImm()))
+        return Error(IDLoc, "branch target out of range");
+      if (OffsetToAlignment(Offset.getImm(), 1LL << 1))
+        return Error(IDLoc, "branch to misaligned address");
+      Inst.clear();
+      Inst.setOpcode(Mips::BEQ_MM);
+      Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+      Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+      Inst.addOperand(MCOperand::createImm(Offset.getImm()));
+    }
+  }
+  Out.EmitInstruction(Inst, *STI);
+
+  // If .set reorder is active and branch instruction has a delay slot,
+  // emit a NOP after it.
+  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder())
+    TOut.emitEmptyDelaySlot(true, IDLoc, STI);
+
+  return false;
+}
+
+bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+
+  const MCOperand &MemOffsetOp = Inst.getOperand(2);
+  assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
+         "expected immediate or expression operand");
+
+  unsigned OpCode = 0;
+  switch(Inst.getOpcode()) {
+    case Mips::BneImm:
+      OpCode = Mips::BNE;
+      break;
+    case Mips::BeqImm:
+      OpCode = Mips::BEQ;
+      break;
+    default:
+      llvm_unreachable("Unknown immediate branch pseudo-instruction.");
+      break;
+  }
+
+  int64_t ImmValue = ImmOp.getImm();
+  if (ImmValue == 0)
+    TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+                 STI);
+  else {
+    warnIfNoMacro(IDLoc);
+
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+
+    if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true,
+                      IDLoc, Out, STI))
+      return true;
+
+    TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
+  }
+  return false;
+}
+
+void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                  const MCSubtargetInfo *STI, bool IsLoad,
+                                  bool IsImmOpnd) {
+  if (IsLoad) {
+    expandLoadInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+    return;
+  }
+  expandStoreInst(Inst, IDLoc, Out, STI, IsImmOpnd);
+}
+
+void MipsAsmParser::expandLoadInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                   const MCSubtargetInfo *STI, bool IsImmOpnd) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned BaseReg = Inst.getOperand(1).getReg();
+
+  const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode());
+  int16_t DstRegClass = Desc.OpInfo[0].RegClass;
+  unsigned DstRegClassID =
+      getContext().getRegisterInfo()->getRegClass(DstRegClass).getID();
+  bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) ||
+               (DstRegClassID == Mips::GPR64RegClassID);
+
+  if (IsImmOpnd) {
+    // Try to use DstReg as the temporary.
+    if (IsGPR && (BaseReg != DstReg)) {
+      TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
+                                 Inst.getOperand(2).getImm(), DstReg, IDLoc,
+                                 STI);
+      return;
+    }
+
+    // At this point we need AT to perform the expansions and we exit if it is
+    // not available.
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return;
+
+    TOut.emitLoadWithImmOffset(Inst.getOpcode(), DstReg, BaseReg,
+                               Inst.getOperand(2).getImm(), ATReg, IDLoc, STI);
+    return;
+  }
+
+  const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
+  MCOperand LoOperand = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+  MCOperand HiOperand = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+
+  // Try to use DstReg as the temporary.
+  if (IsGPR && (BaseReg != DstReg)) {
+    TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+                               LoOperand, DstReg, IDLoc, STI);
+    return;
+  }
+
+  // At this point we need AT to perform the expansions and we exit if it is
+  // not available.
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return;
+
+  TOut.emitLoadWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand,
+                             LoOperand, ATReg, IDLoc, STI);
+}
+
+void MipsAsmParser::expandStoreInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI,
+                                    bool IsImmOpnd) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  unsigned SrcReg = Inst.getOperand(0).getReg();
+  unsigned BaseReg = Inst.getOperand(1).getReg();
+
+  if (IsImmOpnd) {
+    TOut.emitStoreWithImmOffset(Inst.getOpcode(), SrcReg, BaseReg,
+                                Inst.getOperand(2).getImm(),
+                                [&]() { return getATReg(IDLoc); }, IDLoc, STI);
+    return;
+  }
+
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return;
+
+  const MCExpr *ExprOffset = Inst.getOperand(2).getExpr();
+  MCOperand LoOperand = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext()));
+  MCOperand HiOperand = MCOperand::createExpr(
+      MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext()));
+  TOut.emitStoreWithSymOffset(Inst.getOpcode(), SrcReg, BaseReg, HiOperand,
+                              LoOperand, ATReg, IDLoc, STI);
+}
+
+bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
+                                            MCStreamer &Out,
+                                            const MCSubtargetInfo *STI) {
+  unsigned OpNum = Inst.getNumOperands();
+  unsigned Opcode = Inst.getOpcode();
+  unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
+
+  assert (Inst.getOperand(OpNum - 1).isImm() &&
+          Inst.getOperand(OpNum - 2).isReg() &&
+          Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
+
+  if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
+      Inst.getOperand(OpNum - 1).getImm() >= 0 &&
+      (Inst.getOperand(OpNum - 2).getReg() == Mips::SP ||
+       Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) &&
+      (Inst.getOperand(OpNum - 3).getReg() == Mips::RA ||
+       Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) {
+    // It can be implemented as SWM16 or LWM16 instruction.
+    if (inMicroMipsMode() && hasMips32r6())
+      NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6;
+    else
+      NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM;
+  }
+
+  Inst.setOpcode(NewOpcode);
+  Out.EmitInstruction(Inst, *STI);
+  return false;
+}
+
+bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
+                                       MCStreamer &Out,
+                                       const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  bool EmittedNoMacroWarning = false;
+  unsigned PseudoOpcode = Inst.getOpcode();
+  unsigned SrcReg = Inst.getOperand(0).getReg();
+  const MCOperand &TrgOp = Inst.getOperand(1);
+  const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr();
+
+  unsigned ZeroSrcOpcode, ZeroTrgOpcode;
+  bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality;
+
+  unsigned TrgReg;
+  if (TrgOp.isReg())
+    TrgReg = TrgOp.getReg();
+  else if (TrgOp.isImm()) {
+    warnIfNoMacro(IDLoc);
+    EmittedNoMacroWarning = true;
+
+    TrgReg = getATReg(IDLoc);
+    if (!TrgReg)
+      return true;
+
+    switch(PseudoOpcode) {
+    default:
+      llvm_unreachable("unknown opcode for branch pseudo-instruction");
+    case Mips::BLTImmMacro:
+      PseudoOpcode = Mips::BLT;
+      break;
+    case Mips::BLEImmMacro:
+      PseudoOpcode = Mips::BLE;
+      break;
+    case Mips::BGEImmMacro:
+      PseudoOpcode = Mips::BGE;
+      break;
+    case Mips::BGTImmMacro:
+      PseudoOpcode = Mips::BGT;
+      break;
+    case Mips::BLTUImmMacro:
+      PseudoOpcode = Mips::BLTU;
+      break;
+    case Mips::BLEUImmMacro:
+      PseudoOpcode = Mips::BLEU;
+      break;
+    case Mips::BGEUImmMacro:
+      PseudoOpcode = Mips::BGEU;
+      break;
+    case Mips::BGTUImmMacro:
+      PseudoOpcode = Mips::BGTU;
+      break;
+    case Mips::BLTLImmMacro:
+      PseudoOpcode = Mips::BLTL;
+      break;
+    case Mips::BLELImmMacro:
+      PseudoOpcode = Mips::BLEL;
+      break;
+    case Mips::BGELImmMacro:
+      PseudoOpcode = Mips::BGEL;
+      break;
+    case Mips::BGTLImmMacro:
+      PseudoOpcode = Mips::BGTL;
+      break;
+    case Mips::BLTULImmMacro:
+      PseudoOpcode = Mips::BLTUL;
+      break;
+    case Mips::BLEULImmMacro:
+      PseudoOpcode = Mips::BLEUL;
+      break;
+    case Mips::BGEULImmMacro:
+      PseudoOpcode = Mips::BGEUL;
+      break;
+    case Mips::BGTULImmMacro:
+      PseudoOpcode = Mips::BGTUL;
+      break;
+    }
+
+    if (loadImmediate(TrgOp.getImm(), TrgReg, Mips::NoRegister, !isGP64bit(),
+                      false, IDLoc, Out, STI))
+      return true;
+  }
+
+  switch (PseudoOpcode) {
+  case Mips::BLT:
+  case Mips::BLTU:
+  case Mips::BLTL:
+  case Mips::BLTUL:
+    AcceptsEquality = false;
+    ReverseOrderSLT = false;
+    IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL));
+    IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL));
+    ZeroSrcOpcode = Mips::BGTZ;
+    ZeroTrgOpcode = Mips::BLTZ;
+    break;
+  case Mips::BLE:
+  case Mips::BLEU:
+  case Mips::BLEL:
+  case Mips::BLEUL:
+    AcceptsEquality = true;
+    ReverseOrderSLT = true;
+    IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL));
+    IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL));
+    ZeroSrcOpcode = Mips::BGEZ;
+    ZeroTrgOpcode = Mips::BLEZ;
+    break;
+  case Mips::BGE:
+  case Mips::BGEU:
+  case Mips::BGEL:
+  case Mips::BGEUL:
+    AcceptsEquality = true;
+    ReverseOrderSLT = false;
+    IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL));
+    IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL));
+    ZeroSrcOpcode = Mips::BLEZ;
+    ZeroTrgOpcode = Mips::BGEZ;
+    break;
+  case Mips::BGT:
+  case Mips::BGTU:
+  case Mips::BGTL:
+  case Mips::BGTUL:
+    AcceptsEquality = false;
+    ReverseOrderSLT = true;
+    IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL));
+    IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL));
+    ZeroSrcOpcode = Mips::BLTZ;
+    ZeroTrgOpcode = Mips::BGTZ;
+    break;
+  default:
+    llvm_unreachable("unknown opcode for branch pseudo-instruction");
+  }
+
+  bool IsTrgRegZero = (TrgReg == Mips::ZERO);
+  bool IsSrcRegZero = (SrcReg == Mips::ZERO);
+  if (IsSrcRegZero && IsTrgRegZero) {
+    // FIXME: All of these Opcode-specific if's are needed for compatibility
+    // with GAS' behaviour. However, they may not generate the most efficient
+    // code in some circumstances.
+    if (PseudoOpcode == Mips::BLT) {
+      TOut.emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+                  IDLoc, STI);
+      return false;
+    }
+    if (PseudoOpcode == Mips::BLE) {
+      TOut.emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+                  IDLoc, STI);
+      Warning(IDLoc, "branch is always taken");
+      return false;
+    }
+    if (PseudoOpcode == Mips::BGE) {
+      TOut.emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+                  IDLoc, STI);
+      Warning(IDLoc, "branch is always taken");
+      return false;
+    }
+    if (PseudoOpcode == Mips::BGT) {
+      TOut.emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr),
+                  IDLoc, STI);
+      return false;
+    }
+    if (PseudoOpcode == Mips::BGTU) {
+      TOut.emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO,
+                   MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+      return false;
+    }
+    if (AcceptsEquality) {
+      // If both registers are $0 and the pseudo-branch accepts equality, it
+      // will always be taken, so we emit an unconditional branch.
+      TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+                   MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+      Warning(IDLoc, "branch is always taken");
+      return false;
+    }
+    // If both registers are $0 and the pseudo-branch does not accept
+    // equality, it will never be taken, so we don't have to emit anything.
+    return false;
+  }
+  if (IsSrcRegZero || IsTrgRegZero) {
+    if ((IsSrcRegZero && PseudoOpcode == Mips::BGTU) ||
+        (IsTrgRegZero && PseudoOpcode == Mips::BLTU)) {
+      // If the $rs is $0 and the pseudo-branch is BGTU (0 > x) or
+      // if the $rt is $0 and the pseudo-branch is BLTU (x < 0),
+      // the pseudo-branch will never be taken, so we don't emit anything.
+      // This only applies to unsigned pseudo-branches.
+      return false;
+    }
+    if ((IsSrcRegZero && PseudoOpcode == Mips::BLEU) ||
+        (IsTrgRegZero && PseudoOpcode == Mips::BGEU)) {
+      // If the $rs is $0 and the pseudo-branch is BLEU (0 <= x) or
+      // if the $rt is $0 and the pseudo-branch is BGEU (x >= 0),
+      // the pseudo-branch will always be taken, so we emit an unconditional
+      // branch.
+      // This only applies to unsigned pseudo-branches.
+      TOut.emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO,
+                   MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+      Warning(IDLoc, "branch is always taken");
+      return false;
+    }
+    if (IsUnsigned) {
+      // If the $rs is $0 and the pseudo-branch is BLTU (0 < x) or
+      // if the $rt is $0 and the pseudo-branch is BGTU (x > 0),
+      // the pseudo-branch will be taken only when the non-zero register is
+      // different from 0, so we emit a BNEZ.
+      //
+      // If the $rs is $0 and the pseudo-branch is BGEU (0 >= x) or
+      // if the $rt is $0 and the pseudo-branch is BLEU (x <= 0),
+      // the pseudo-branch will be taken only when the non-zero register is
+      // equal to 0, so we emit a BEQZ.
+      //
+      // Because only BLEU and BGEU branch on equality, we can use the
+      // AcceptsEquality variable to decide when to emit the BEQZ.
+      TOut.emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE,
+                   IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO,
+                   MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+      return false;
+    }
+    // If we have a signed pseudo-branch and one of the registers is $0,
+    // we can use an appropriate compare-to-zero branch. We select which one
+    // to use in the switch statement above.
+    TOut.emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode,
+                IsSrcRegZero ? TrgReg : SrcReg,
+                MCOperand::createExpr(OffsetExpr), IDLoc, STI);
+    return false;
+  }
+
+  // If neither the SrcReg nor the TrgReg are $0, we need AT to perform the
+  // expansions. If it is not available, we return.
+  unsigned ATRegNum = getATReg(IDLoc);
+  if (!ATRegNum)
+    return true;
+
+  if (!EmittedNoMacroWarning)
+    warnIfNoMacro(IDLoc);
+
+  // SLT fits well with 2 of our 4 pseudo-branches:
+  //   BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and
+  //   BGT, where $rs > $rt, translates into "slt $at, $rt, $rs".
+  // If the result of the SLT is 1, we branch, and if it's 0, we don't.
+  // This is accomplished by using a BNEZ with the result of the SLT.
+  //
+  // The other 2 pseudo-branches are opposites of the above 2 (BGE with BLT
+  // and BLE with BGT), so we change the BNEZ into a a BEQZ.
+  // Because only BGE and BLE branch on equality, we can use the
+  // AcceptsEquality variable to decide when to emit the BEQZ.
+  // Note that the order of the SLT arguments doesn't change between
+  // opposites.
+  //
+  // The same applies to the unsigned variants, except that SLTu is used
+  // instead of SLT.
+  TOut.emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum,
+               ReverseOrderSLT ? TrgReg : SrcReg,
+               ReverseOrderSLT ? SrcReg : TrgReg, IDLoc, STI);
+
+  TOut.emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL)
+                        : (AcceptsEquality ? Mips::BEQ : Mips::BNE),
+               ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc,
+               STI);
+  return false;
+}
+
+bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI, const bool IsMips64,
+                              const bool Signed) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  warnIfNoMacro(IDLoc);
+
+  const MCOperand &RdRegOp = Inst.getOperand(0);
+  assert(RdRegOp.isReg() && "expected register operand kind");
+  unsigned RdReg = RdRegOp.getReg();
+
+  const MCOperand &RsRegOp = Inst.getOperand(1);
+  assert(RsRegOp.isReg() && "expected register operand kind");
+  unsigned RsReg = RsRegOp.getReg();
+
+  const MCOperand &RtRegOp = Inst.getOperand(2);
+  assert(RtRegOp.isReg() && "expected register operand kind");
+  unsigned RtReg = RtRegOp.getReg();
+  unsigned DivOp;
+  unsigned ZeroReg;
+
+  if (IsMips64) {
+    DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
+    ZeroReg = Mips::ZERO_64;
+  } else {
+    DivOp = Signed ? Mips::SDIV : Mips::UDIV;
+    ZeroReg = Mips::ZERO;
+  }
+
+  bool UseTraps = useTraps();
+
+  if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
+    if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
+      Warning(IDLoc, "dividing zero by zero");
+    if (IsMips64) {
+      if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
+        if (UseTraps) {
+          TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+          return false;
+        }
+
+        TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+        return false;
+      }
+    } else {
+      TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+      return false;
+    }
+  }
+
+  if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
+    Warning(IDLoc, "division by zero");
+    if (Signed) {
+      if (UseTraps) {
+        TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+        return false;
+      }
+
+      TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+      return false;
+    }
+  }
+
+  // FIXME: The values for these two BranchTarget variables may be different in
+  // micromips. These magic numbers need to be removed.
+  unsigned BranchTargetNoTraps;
+  unsigned BranchTarget;
+
+  if (UseTraps) {
+    BranchTarget = IsMips64 ? 12 : 8;
+    TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
+  } else {
+    BranchTarget = IsMips64 ? 20 : 16;
+    BranchTargetNoTraps = 8;
+    // Branch to the li instruction.
+    TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
+  }
+
+  TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+
+  if (!UseTraps)
+    TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+
+  if (!Signed) {
+    TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+    return false;
+  }
+
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
+  if (IsMips64) {
+    // Branch to the mflo instruction.
+    TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+    TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
+    TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
+  } else {
+    // Branch to the mflo instruction.
+    TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
+    TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
+  }
+
+  if (UseTraps)
+    TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
+  else {
+    // Branch to the mflo instruction.
+    TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+    TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
+    TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
+  }
+  TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+  return false;
+}
+
+bool MipsAsmParser::expandTrunc(MCInst &Inst, bool IsDouble, bool Is64FPU,
+                                SMLoc IDLoc, MCStreamer &Out,
+                                const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg() &&
+         Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+  unsigned FirstReg = Inst.getOperand(0).getReg();
+  unsigned SecondReg = Inst.getOperand(1).getReg();
+  unsigned ThirdReg = Inst.getOperand(2).getReg();
+
+  if (hasMips1() && !hasMips2()) {
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+    TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
+    TOut.emitRR(Mips::CFC1, ThirdReg, Mips::RA, IDLoc, STI);
+    TOut.emitNop(IDLoc, STI);
+    TOut.emitRRI(Mips::ORi, ATReg, ThirdReg, 0x3, IDLoc, STI);
+    TOut.emitRRI(Mips::XORi, ATReg, ATReg, 0x2, IDLoc, STI);
+    TOut.emitRR(Mips::CTC1, Mips::RA, ATReg, IDLoc, STI);
+    TOut.emitNop(IDLoc, STI);
+    TOut.emitRR(IsDouble ? (Is64FPU ? Mips::CVT_W_D64 : Mips::CVT_W_D32)
+                         : Mips::CVT_W_S,
+                FirstReg, SecondReg, IDLoc, STI);
+    TOut.emitRR(Mips::CTC1, Mips::RA, ThirdReg, IDLoc, STI);
+    TOut.emitNop(IDLoc, STI);
+    return false;
+  }
+
+  TOut.emitRR(IsDouble ? (Is64FPU ? Mips::TRUNC_W_D64 : Mips::TRUNC_W_D32)
+                       : Mips::TRUNC_W_S,
+              FirstReg, SecondReg, IDLoc, STI);
+
+  return false;
+}
+
+bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc,
+                              MCStreamer &Out, const MCSubtargetInfo *STI) {
+  if (hasMips32r6() || hasMips64r6()) {
+    return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+  }
+
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+  const MCOperand &SrcRegOp = Inst.getOperand(1);
+  assert(SrcRegOp.isReg() && "expected register operand kind");
+  const MCOperand &OffsetImmOp = Inst.getOperand(2);
+  assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DstReg = DstRegOp.getReg();
+  unsigned SrcReg = SrcRegOp.getReg();
+  int64_t OffsetValue = OffsetImmOp.getImm();
+
+  // NOTE: We always need AT for ULHU, as it is always used as the source
+  // register for one of the LBu's.
+  warnIfNoMacro(IDLoc);
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
+  if (IsLargeOffset) {
+    if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
+                      IDLoc, Out, STI))
+      return true;
+  }
+
+  int64_t FirstOffset = IsLargeOffset ? 0 : OffsetValue;
+  int64_t SecondOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
+  if (isLittle())
+    std::swap(FirstOffset, SecondOffset);
+
+  unsigned FirstLbuDstReg = IsLargeOffset ? DstReg : ATReg;
+  unsigned SecondLbuDstReg = IsLargeOffset ? ATReg : DstReg;
+
+  unsigned LbuSrcReg = IsLargeOffset ? ATReg : SrcReg;
+  unsigned SllReg = IsLargeOffset ? DstReg : ATReg;
+
+  TOut.emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg,
+               FirstOffset, IDLoc, STI);
+  TOut.emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondOffset, IDLoc, STI);
+  TOut.emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, STI);
+  TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
+
+  return false;
+}
+
+bool MipsAsmParser::expandUsh(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+  if (hasMips32r6() || hasMips64r6()) {
+    return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+  }
+
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+  const MCOperand &SrcRegOp = Inst.getOperand(1);
+  assert(SrcRegOp.isReg() && "expected register operand kind");
+  const MCOperand &OffsetImmOp = Inst.getOperand(2);
+  assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DstReg = DstRegOp.getReg();
+  unsigned SrcReg = SrcRegOp.getReg();
+  int64_t OffsetValue = OffsetImmOp.getImm();
+
+  warnIfNoMacro(IDLoc);
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  bool IsLargeOffset = !(isInt<16>(OffsetValue + 1) && isInt<16>(OffsetValue));
+  if (IsLargeOffset) {
+    if (loadImmediate(OffsetValue, ATReg, SrcReg, !ABI.ArePtrs64bit(), true,
+                      IDLoc, Out, STI))
+      return true;
+  }
+
+  int64_t FirstOffset = IsLargeOffset ? 1 : (OffsetValue + 1);
+  int64_t SecondOffset = IsLargeOffset ? 0 : OffsetValue;
+  if (isLittle())
+    std::swap(FirstOffset, SecondOffset);
+
+  if (IsLargeOffset) {
+    TOut.emitRRI(Mips::SB, DstReg, ATReg, FirstOffset, IDLoc, STI);
+    TOut.emitRRI(Mips::SRL, DstReg, DstReg, 8, IDLoc, STI);
+    TOut.emitRRI(Mips::SB, DstReg, ATReg, SecondOffset, IDLoc, STI);
+    TOut.emitRRI(Mips::LBu, ATReg, ATReg, 0, IDLoc, STI);
+    TOut.emitRRI(Mips::SLL, DstReg, DstReg, 8, IDLoc, STI);
+    TOut.emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, STI);
+  } else {
+    TOut.emitRRI(Mips::SB, DstReg, SrcReg, FirstOffset, IDLoc, STI);
+    TOut.emitRRI(Mips::SRL, ATReg, DstReg, 8, IDLoc, STI);
+    TOut.emitRRI(Mips::SB, ATReg, SrcReg, SecondOffset, IDLoc, STI);
+  }
+
+  return false;
+}
+
+bool MipsAsmParser::expandUxw(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+  if (hasMips32r6() || hasMips64r6()) {
+    return Error(IDLoc, "instruction not supported on mips32r6 or mips64r6");
+  }
+
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+  const MCOperand &SrcRegOp = Inst.getOperand(1);
+  assert(SrcRegOp.isReg() && "expected register operand kind");
+  const MCOperand &OffsetImmOp = Inst.getOperand(2);
+  assert(OffsetImmOp.isImm() && "expected immediate operand kind");
+
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DstReg = DstRegOp.getReg();
+  unsigned SrcReg = SrcRegOp.getReg();
+  int64_t OffsetValue = OffsetImmOp.getImm();
+
+  // Compute left/right load/store offsets.
+  bool IsLargeOffset = !(isInt<16>(OffsetValue + 3) && isInt<16>(OffsetValue));
+  int64_t LxlOffset = IsLargeOffset ? 0 : OffsetValue;
+  int64_t LxrOffset = IsLargeOffset ? 3 : (OffsetValue + 3);
+  if (isLittle())
+    std::swap(LxlOffset, LxrOffset);
+
+  bool IsLoadInst = (Inst.getOpcode() == Mips::Ulw);
+  bool DoMove = IsLoadInst && (SrcReg == DstReg) && !IsLargeOffset;
+  unsigned TmpReg = SrcReg;
+  if (IsLargeOffset || DoMove) {
+    warnIfNoMacro(IDLoc);
+    TmpReg = getATReg(IDLoc);
+    if (!TmpReg)
+      return true;
+  }
+
+  if (IsLargeOffset) {
+    if (loadImmediate(OffsetValue, TmpReg, SrcReg, !ABI.ArePtrs64bit(), true,
+                      IDLoc, Out, STI))
+      return true;
+  }
+
+  if (DoMove)
+    std::swap(DstReg, TmpReg);
+
+  unsigned XWL = IsLoadInst ? Mips::LWL : Mips::SWL;
+  unsigned XWR = IsLoadInst ? Mips::LWR : Mips::SWR;
+  TOut.emitRRI(XWL, DstReg, TmpReg, LxlOffset, IDLoc, STI);
+  TOut.emitRRI(XWR, DstReg, TmpReg, LxrOffset, IDLoc, STI);
+
+  if (DoMove)
+    TOut.emitRRR(Mips::OR, TmpReg, DstReg, Mips::ZERO, IDLoc, STI);
+
+  return false;
+}
+
+bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
+                                         MCStreamer &Out,
+                                         const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  assert (Inst.getNumOperands() == 3 && "Invalid operand count");
+  assert (Inst.getOperand(0).isReg() &&
+          Inst.getOperand(1).isReg() &&
+          Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+  unsigned ATReg = Mips::NoRegister;
+  unsigned FinalDstReg = Mips::NoRegister;
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm();
+
+  bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
+
+  unsigned FinalOpcode = Inst.getOpcode();
+
+  if (DstReg == SrcReg) {
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+    FinalDstReg = DstReg;
+    DstReg = ATReg;
+  }
+
+  if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Out, STI)) {
+    switch (FinalOpcode) {
+    default:
+      llvm_unreachable("unimplemented expansion");
+    case (Mips::ADDi):
+      FinalOpcode = Mips::ADD;
+      break;
+    case (Mips::ADDiu):
+      FinalOpcode = Mips::ADDu;
+      break;
+    case (Mips::ANDi):
+      FinalOpcode = Mips::AND;
+      break;
+    case (Mips::NORImm):
+      FinalOpcode = Mips::NOR;
+      break;
+    case (Mips::ORi):
+      FinalOpcode = Mips::OR;
+      break;
+    case (Mips::SLTi):
+      FinalOpcode = Mips::SLT;
+      break;
+    case (Mips::SLTiu):
+      FinalOpcode = Mips::SLTu;
+      break;
+    case (Mips::XORi):
+      FinalOpcode = Mips::XOR;
+      break;
+    }
+
+    if (FinalDstReg == Mips::NoRegister)
+      TOut.emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, STI);
+    else
+      TOut.emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, STI);
+    return false;
+  }
+  return true;
+}
+
+bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                   const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  unsigned TReg = Inst.getOperand(2).getReg();
+  unsigned TmpReg = DReg;
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  if (hasMips32r2()) {
+
+    if (DReg == SReg) {
+      TmpReg = getATReg(Inst.getLoc());
+      if (!TmpReg)
+        return true;
+    }
+
+    if (Inst.getOpcode() == Mips::ROL) {
+      TOut.emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+      TOut.emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    if (Inst.getOpcode() == Mips::ROR) {
+      TOut.emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    return true;
+  }
+
+  if (hasMips32()) {
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::ROL:
+      FirstShift = Mips::SRLV;
+      SecondShift = Mips::SLLV;
+      break;
+    case Mips::ROR:
+      FirstShift = Mips::SLLV;
+      SecondShift = Mips::SRLV;
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    TOut.emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+    TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm();
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  if (hasMips32r2()) {
+
+    if (Inst.getOpcode() == Mips::ROLImm) {
+      uint64_t MaxShift = 32;
+      uint64_t ShiftValue = ImmValue;
+      if (ImmValue != 0)
+        ShiftValue = MaxShift - ImmValue;
+      TOut.emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
+      return false;
+    }
+
+    if (Inst.getOpcode() == Mips::RORImm) {
+      TOut.emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), STI);
+      return false;
+    }
+
+    return true;
+  }
+
+  if (hasMips32()) {
+
+    if (ImmValue == 0) {
+      TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI);
+      return false;
+    }
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::ROLImm:
+      FirstShift = Mips::SLL;
+      SecondShift = Mips::SRL;
+      break;
+    case Mips::RORImm:
+      FirstShift = Mips::SRL;
+      SecondShift = Mips::SLL;
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), STI);
+    TOut.emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  unsigned TReg = Inst.getOperand(2).getReg();
+  unsigned TmpReg = DReg;
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  if (hasMips64r2()) {
+
+    if (TmpReg == SReg) {
+      TmpReg = getATReg(Inst.getLoc());
+      if (!TmpReg)
+        return true;
+    }
+
+    if (Inst.getOpcode() == Mips::DROL) {
+      TOut.emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+      TOut.emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    if (Inst.getOpcode() == Mips::DROR) {
+      TOut.emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), STI);
+      return false;
+    }
+
+    return true;
+  }
+
+  if (hasMips64()) {
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::DROL:
+      FirstShift = Mips::DSRLV;
+      SecondShift = Mips::DSLLV;
+      break;
+    case Mips::DROR:
+      FirstShift = Mips::DSLLV;
+      SecondShift = Mips::DSRLV;
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    TOut.emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), STI);
+    TOut.emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
+                                       MCStreamer &Out,
+                                       const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned ATReg = Mips::NoRegister;
+  unsigned DReg = Inst.getOperand(0).getReg();
+  unsigned SReg = Inst.getOperand(1).getReg();
+  int64_t ImmValue = Inst.getOperand(2).getImm() % 64;
+
+  unsigned FirstShift = Mips::NOP;
+  unsigned SecondShift = Mips::NOP;
+
+  MCInst TmpInst;
+
+  if (hasMips64r2()) {
+
+    unsigned FinalOpcode = Mips::NOP;
+    if (ImmValue == 0)
+      FinalOpcode = Mips::DROTR;
+    else if (ImmValue % 32 == 0)
+      FinalOpcode = Mips::DROTR32;
+    else if ((ImmValue >= 1) && (ImmValue <= 32)) {
+      if (Inst.getOpcode() == Mips::DROLImm)
+        FinalOpcode = Mips::DROTR32;
+      else
+        FinalOpcode = Mips::DROTR;
+    } else if (ImmValue >= 33) {
+      if (Inst.getOpcode() == Mips::DROLImm)
+        FinalOpcode = Mips::DROTR;
+      else
+        FinalOpcode = Mips::DROTR32;
+    }
+
+    uint64_t ShiftValue = ImmValue % 32;
+    if (Inst.getOpcode() == Mips::DROLImm)
+      ShiftValue = (32 - ImmValue % 32) % 32;
+
+    TOut.emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  if (hasMips64()) {
+
+    if (ImmValue == 0) {
+      TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
+      return false;
+    }
+
+    switch (Inst.getOpcode()) {
+    default:
+      llvm_unreachable("unexpected instruction opcode");
+    case Mips::DROLImm:
+      if ((ImmValue >= 1) && (ImmValue <= 31)) {
+        FirstShift = Mips::DSLL;
+        SecondShift = Mips::DSRL32;
+      }
+      if (ImmValue == 32) {
+        FirstShift = Mips::DSLL32;
+        SecondShift = Mips::DSRL32;
+      }
+      if ((ImmValue >= 33) && (ImmValue <= 63)) {
+        FirstShift = Mips::DSLL32;
+        SecondShift = Mips::DSRL;
+      }
+      break;
+    case Mips::DRORImm:
+      if ((ImmValue >= 1) && (ImmValue <= 31)) {
+        FirstShift = Mips::DSRL;
+        SecondShift = Mips::DSLL32;
+      }
+      if (ImmValue == 32) {
+        FirstShift = Mips::DSRL32;
+        SecondShift = Mips::DSLL32;
+      }
+      if ((ImmValue >= 33) && (ImmValue <= 63)) {
+        FirstShift = Mips::DSRL32;
+        SecondShift = Mips::DSLL;
+      }
+      break;
+    }
+
+    ATReg = getATReg(Inst.getLoc());
+    if (!ATReg)
+      return true;
+
+    TOut.emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), STI);
+    TOut.emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32,
+                 Inst.getLoc(), STI);
+    TOut.emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), STI);
+
+    return false;
+  }
+
+  return true;
+}
+
+bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned FirstRegOp = Inst.getOperand(0).getReg();
+  unsigned SecondRegOp = Inst.getOperand(1).getReg();
+
+  TOut.emitRI(Mips::BGEZ, SecondRegOp, 8, IDLoc, STI);
+  if (FirstRegOp != SecondRegOp)
+    TOut.emitRRR(Mips::ADDu, FirstRegOp, SecondRegOp, Mips::ZERO, IDLoc, STI);
+  else
+    TOut.emitEmptyDelaySlot(false, IDLoc, STI);
+  TOut.emitRRR(Mips::SUB, FirstRegOp, Mips::ZERO, SecondRegOp, IDLoc, STI);
+
+  return false;
+}
+
+static unsigned nextReg(unsigned Reg) {
+  switch (Reg) {
+  case Mips::ZERO: return Mips::AT;
+  case Mips::AT:   return Mips::V0;
+  case Mips::V0:   return Mips::V1;
+  case Mips::V1:   return Mips::A0;
+  case Mips::A0:   return Mips::A1;
+  case Mips::A1:   return Mips::A2;
+  case Mips::A2:   return Mips::A3;
+  case Mips::A3:   return Mips::T0;
+  case Mips::T0:   return Mips::T1;
+  case Mips::T1:   return Mips::T2;
+  case Mips::T2:   return Mips::T3;
+  case Mips::T3:   return Mips::T4;
+  case Mips::T4:   return Mips::T5;
+  case Mips::T5:   return Mips::T6;
+  case Mips::T6:   return Mips::T7;
+  case Mips::T7:   return Mips::S0;
+  case Mips::S0:   return Mips::S1;
+  case Mips::S1:   return Mips::S2;
+  case Mips::S2:   return Mips::S3;
+  case Mips::S3:   return Mips::S4;
+  case Mips::S4:   return Mips::S5;
+  case Mips::S5:   return Mips::S6;
+  case Mips::S6:   return Mips::S7;
+  case Mips::S7:   return Mips::T8;
+  case Mips::T8:   return Mips::T9;
+  case Mips::T9:   return Mips::K0;
+  case Mips::K0:   return Mips::K1;
+  case Mips::K1:   return Mips::GP;
+  case Mips::GP:   return Mips::SP;
+  case Mips::SP:   return Mips::FP;
+  case Mips::FP:   return Mips::RA;
+  case Mips::RA:   return Mips::ZERO;
+  default:         return 0;
+  }
+
+}
+
+// Expand 'ld $<reg> offset($reg2)' to 'lw $<reg>, offset($reg2);
+//                                      lw $<reg+1>>, offset+4($reg2)'
+// or expand 'sd $<reg> offset($reg2)' to 'sw $<reg>, offset($reg2);
+//                                         sw $<reg+1>>, offset+4($reg2)'
+// for O32.
+bool MipsAsmParser::expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc,
+                                          MCStreamer &Out,
+                                          const MCSubtargetInfo *STI,
+                                          bool IsLoad) {
+  if (!isABI_O32())
+    return true;
+
+  warnIfNoMacro(IDLoc);
+
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned Opcode = IsLoad ? Mips::LW : Mips::SW;
+  unsigned FirstReg = Inst.getOperand(0).getReg();
+  unsigned SecondReg = nextReg(FirstReg);
+  unsigned BaseReg = Inst.getOperand(1).getReg();
+  if (!SecondReg)
+    return true;
+
+  warnIfRegIndexIsAT(FirstReg, IDLoc);
+
+  assert(Inst.getOperand(2).isImm() &&
+         "Offset for load macro is not immediate!");
+
+  MCOperand &FirstOffset = Inst.getOperand(2);
+  signed NextOffset = FirstOffset.getImm() + 4;
+  MCOperand SecondOffset = MCOperand::createImm(NextOffset);
+
+  if (!isInt<16>(FirstOffset.getImm()) || !isInt<16>(NextOffset))
+    return true;
+
+  // For loads, clobber the base register with the second load instead of the
+  // first if the BaseReg == FirstReg.
+  if (FirstReg != BaseReg || !IsLoad) {
+    TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
+    TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
+  } else {
+    TOut.emitRRX(Opcode, SecondReg, BaseReg, SecondOffset, IDLoc, STI);
+    TOut.emitRRX(Opcode, FirstReg, BaseReg, FirstOffset, IDLoc, STI);
+  }
+
+  return false;
+}
+
+bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                              const MCSubtargetInfo *STI) {
+
+  warnIfNoMacro(IDLoc);
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  if (Inst.getOperand(1).getReg() != Mips::ZERO &&
+      Inst.getOperand(2).getReg() != Mips::ZERO) {
+    TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(),
+                 Inst.getOperand(1).getReg(), Inst.getOperand(2).getReg(),
+                 IDLoc, STI);
+    TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
+                 Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+    return false;
+  }
+
+  unsigned Reg = 0;
+  if (Inst.getOperand(1).getReg() == Mips::ZERO) {
+    Reg = Inst.getOperand(2).getReg();
+  } else {
+    Reg = Inst.getOperand(1).getReg();
+  }
+  TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(), Reg, 1, IDLoc, STI);
+  return false;
+}
+
+bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                               const MCSubtargetInfo *STI) {
+
+  warnIfNoMacro(IDLoc);
+  MipsTargetStreamer &TOut = getTargetStreamer();
+
+  unsigned Opc;
+  int64_t Imm = Inst.getOperand(2).getImm();
+  unsigned Reg = Inst.getOperand(1).getReg();
+
+  if (Imm == 0) {
+    TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
+                 Inst.getOperand(1).getReg(), 1, IDLoc, STI);
+    return false;
+  } else {
+
+    if (Reg == Mips::ZERO) {
+      Warning(IDLoc, "comparison is always false");
+      TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu,
+                   Inst.getOperand(0).getReg(), Reg, Reg, IDLoc, STI);
+      return false;
+    }
+
+    if (Imm > -0x8000 && Imm < 0) {
+      Imm = -Imm;
+      Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
+    } else {
+      Opc = Mips::XORi;
+    }
+  }
+  if (!isUInt<16>(Imm)) {
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
+
+    if (loadImmediate(Imm, ATReg, Mips::NoRegister, true, isGP64bit(), IDLoc,
+                      Out, STI))
+      return true;
+
+    TOut.emitRRR(Mips::XOR, Inst.getOperand(0).getReg(),
+                 Inst.getOperand(1).getReg(), ATReg, IDLoc, STI);
+    TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
+                 Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+    return false;
+  }
+
+  TOut.emitRRI(Opc, Inst.getOperand(0).getReg(), Inst.getOperand(1).getReg(),
+               Imm, IDLoc, STI);
+  TOut.emitRRI(Mips::SLTiu, Inst.getOperand(0).getReg(),
+               Inst.getOperand(0).getReg(), 1, IDLoc, STI);
+  return false;
+}
+
+unsigned
+MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                              const OperandVector &Operands) {
+  switch (Inst.getOpcode()) {
+  default:
+    return Match_Success;
+  case Mips::DATI:
+  case Mips::DAHI:
+  case Mips::DATI_MM64R6:
+  case Mips::DAHI_MM64R6:
+    if (static_cast<MipsOperand &>(*Operands[1])
+            .isValidForTie(static_cast<MipsOperand &>(*Operands[2])))
+      return Match_Success;
+    return Match_RequiresSameSrcAndDst;
+  }
+}
+unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  switch (Inst.getOpcode()) {
+  // As described by the MIPSR6 spec, daui must not use the zero operand for
+  // its source operand.
+  case Mips::DAUI:
+  case Mips::DAUI_MM64R6:
+    if (Inst.getOperand(1).getReg() == Mips::ZERO ||
+        Inst.getOperand(1).getReg() == Mips::ZERO_64)
+      return Match_RequiresNoZeroRegister;
+    return Match_Success;
+  // As described by the Mips32r2 spec, the registers Rd and Rs for
+  // jalr.hb must be different.
+  // It also applies for registers Rt and Rs of microMIPSr6 jalrc.hb instruction
+  // and registers Rd and Base for microMIPS lwp instruction
+  case Mips::JALR_HB:
+  case Mips::JALRC_HB_MMR6:
+  case Mips::JALRC_MMR6:
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+      return Match_RequiresDifferentSrcAndDst;
+    return Match_Success;
+  case Mips::LWP_MM:
+  case Mips::LWP_MMR6:
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg())
+      return Match_RequiresDifferentSrcAndDst;
+    return Match_Success;
+  case Mips::SYNC:
+    if (Inst.getOperand(0).getImm() != 0 && !hasMips32())
+      return Match_NonZeroOperandForSync;
+    return Match_Success;
+  // As described the MIPSR6 spec, the compact branches that compare registers
+  // must:
+  // a) Not use the zero register.
+  // b) Not use the same register twice.
+  // c) rs < rt for bnec, beqc.
+  //    NB: For this case, the encoding will swap the operands as their
+  //    ordering doesn't matter. GAS performs this transformation  too.
+  //    Hence, that constraint does not have to be enforced.
+  //
+  // The compact branches that branch iff the signed addition of two registers
+  // would overflow must have rs >= rt. That can be handled like beqc/bnec with
+  // operand swapping. They do not have restriction of using the zero register.
+  case Mips::BLEZC:   case Mips::BLEZC_MMR6:
+  case Mips::BGEZC:   case Mips::BGEZC_MMR6:
+  case Mips::BGTZC:   case Mips::BGTZC_MMR6:
+  case Mips::BLTZC:   case Mips::BLTZC_MMR6:
+  case Mips::BEQZC:   case Mips::BEQZC_MMR6:
+  case Mips::BNEZC:   case Mips::BNEZC_MMR6:
+  case Mips::BLEZC64:
+  case Mips::BGEZC64:
+  case Mips::BGTZC64:
+  case Mips::BLTZC64:
+  case Mips::BEQZC64:
+  case Mips::BNEZC64:
+    if (Inst.getOperand(0).getReg() == Mips::ZERO ||
+        Inst.getOperand(0).getReg() == Mips::ZERO_64)
+      return Match_RequiresNoZeroRegister;
+    return Match_Success;
+  case Mips::BGEC:    case Mips::BGEC_MMR6:
+  case Mips::BLTC:    case Mips::BLTC_MMR6:
+  case Mips::BGEUC:   case Mips::BGEUC_MMR6:
+  case Mips::BLTUC:   case Mips::BLTUC_MMR6:
+  case Mips::BEQC:    case Mips::BEQC_MMR6:
+  case Mips::BNEC:    case Mips::BNEC_MMR6:
+  case Mips::BGEC64:
+  case Mips::BLTC64:
+  case Mips::BGEUC64:
+  case Mips::BLTUC64:
+  case Mips::BEQC64:
+  case Mips::BNEC64:
+    if (Inst.getOperand(0).getReg() == Mips::ZERO ||
+        Inst.getOperand(0).getReg() == Mips::ZERO_64)
+      return Match_RequiresNoZeroRegister;
+    if (Inst.getOperand(1).getReg() == Mips::ZERO ||
+        Inst.getOperand(1).getReg() == Mips::ZERO_64)
+      return Match_RequiresNoZeroRegister;
+    if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg())
+      return Match_RequiresDifferentOperands;
+    return Match_Success;
+  default:
+    return Match_Success;
+  }
+}
+
+static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands,
+                            uint64_t ErrorInfo) {
+  if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) {
+    SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+    if (ErrorLoc == SMLoc())
+      return Loc;
+    return ErrorLoc;
+  }
+  return Loc;
+}
+
+bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                            OperandVector &Operands,
+                                            MCStreamer &Out,
+                                            uint64_t &ErrorInfo,
+                                            bool MatchingInlineAsm) {
+
+  MCInst Inst;
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  switch (MatchResult) {
+  case Match_Success: {
+    if (processInstruction(Inst, IDLoc, Out, STI))
+      return true;
+    return false;
+  }
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = Operands[ErrorInfo]->getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_NonZeroOperandForSync:
+    return Error(IDLoc, "s-type must be zero or unspecified for pre-MIPS32 ISAs");
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  case Match_RequiresDifferentSrcAndDst:
+    return Error(IDLoc, "source and destination must be different");
+  case Match_RequiresDifferentOperands:
+    return Error(IDLoc, "registers must be different");
+  case Match_RequiresNoZeroRegister:
+    return Error(IDLoc, "invalid operand ($zero) for instruction");
+  case Match_RequiresSameSrcAndDst:
+    return Error(IDLoc, "source and destination must match");
+  case Match_Immz:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'");
+  case Match_UImm1_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 1-bit unsigned immediate");
+  case Match_UImm2_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 2-bit unsigned immediate");
+  case Match_UImm2_1:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range 1 .. 4");
+  case Match_UImm3_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 3-bit unsigned immediate");
+  case Match_UImm4_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 4-bit unsigned immediate");
+  case Match_SImm4_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 4-bit signed immediate");
+  case Match_UImm5_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 5-bit unsigned immediate");
+  case Match_SImm5_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 5-bit signed immediate");
+  case Match_UImm5_1:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range 1 .. 32");
+  case Match_UImm5_32:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range 32 .. 63");
+  case Match_UImm5_33:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range 33 .. 64");
+  case Match_UImm5_0_Report_UImm6:
+    // This is used on UImm5 operands that have a corresponding UImm5_32
+    // operand to avoid confusing the user.
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 6-bit unsigned immediate");
+  case Match_UImm5_Lsl2:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected both 7-bit unsigned immediate and multiple of 4");
+  case Match_UImmRange2_64:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range 2 .. 64");
+  case Match_UImm6_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 6-bit unsigned immediate");
+  case Match_UImm6_Lsl2:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected both 8-bit unsigned immediate and multiple of 4");
+  case Match_SImm6_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 6-bit signed immediate");
+  case Match_UImm7_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 7-bit unsigned immediate");
+  case Match_UImm7_N1:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected immediate in range -1 .. 126");
+  case Match_SImm7_Lsl2:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected both 9-bit signed immediate and multiple of 4");
+  case Match_UImm8_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 8-bit unsigned immediate");
+  case Match_UImm10_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 10-bit unsigned immediate");
+  case Match_SImm10_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 10-bit signed immediate");
+  case Match_SImm11_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 11-bit signed immediate");
+  case Match_UImm16:
+  case Match_UImm16_Relaxed:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 16-bit unsigned immediate");
+  case Match_SImm16:
+  case Match_SImm16_Relaxed:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 16-bit signed immediate");
+  case Match_SImm19_Lsl2:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected both 19-bit signed immediate and multiple of 4");
+  case Match_UImm20_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 20-bit unsigned immediate");
+  case Match_UImm26_0:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 26-bit unsigned immediate");
+  case Match_SImm32:
+  case Match_SImm32_Relaxed:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 32-bit signed immediate");
+  case Match_UImm32_Coerced:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected 32-bit immediate");
+  case Match_MemSImm9:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 9-bit signed offset");
+  case Match_MemSImm10:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 10-bit signed offset");
+  case Match_MemSImm10Lsl1:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 11-bit signed offset and multiple of 2");
+  case Match_MemSImm10Lsl2:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 12-bit signed offset and multiple of 4");
+  case Match_MemSImm10Lsl3:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 13-bit signed offset and multiple of 8");
+  case Match_MemSImm11:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 11-bit signed offset");
+  case Match_MemSImm12:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 12-bit signed offset");
+  case Match_MemSImm16:
+    return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo),
+                 "expected memory with 16-bit signed offset");
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+void MipsAsmParser::warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc) {
+  if (RegIndex != 0 && AssemblerOptions.back()->getATRegIndex() == RegIndex)
+    Warning(Loc, "used $at (currently $" + Twine(RegIndex) +
+                     ") without \".set noat\"");
+}
+
+void MipsAsmParser::warnIfNoMacro(SMLoc Loc) {
+  if (!AssemblerOptions.back()->isMacro())
+    Warning(Loc, "macro instruction expanded into multiple instructions");
+}
+
+void
+MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+                                     SMRange Range, bool ShowColors) {
+  getSourceManager().PrintMessage(Range.Start, SourceMgr::DK_Warning, Msg,
+                                  Range, SMFixIt(Range, FixMsg),
+                                  ShowColors);
+}
+
+int MipsAsmParser::matchCPURegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+           .Case("zero", 0)
+           .Case("at", 1)
+           .Case("a0", 4)
+           .Case("a1", 5)
+           .Case("a2", 6)
+           .Case("a3", 7)
+           .Case("v0", 2)
+           .Case("v1", 3)
+           .Case("s0", 16)
+           .Case("s1", 17)
+           .Case("s2", 18)
+           .Case("s3", 19)
+           .Case("s4", 20)
+           .Case("s5", 21)
+           .Case("s6", 22)
+           .Case("s7", 23)
+           .Case("k0", 26)
+           .Case("k1", 27)
+           .Case("gp", 28)
+           .Case("sp", 29)
+           .Case("fp", 30)
+           .Case("s8", 30)
+           .Case("ra", 31)
+           .Case("t0", 8)
+           .Case("t1", 9)
+           .Case("t2", 10)
+           .Case("t3", 11)
+           .Case("t4", 12)
+           .Case("t5", 13)
+           .Case("t6", 14)
+           .Case("t7", 15)
+           .Case("t8", 24)
+           .Case("t9", 25)
+           .Default(-1);
+
+  if (!(isABI_N32() || isABI_N64()))
+    return CC;
+
+  if (12 <= CC && CC <= 15) {
+    // Name is one of t4-t7
+    AsmToken RegTok = getLexer().peekTok();
+    SMRange RegRange = RegTok.getLocRange();
+
+    StringRef FixedName = StringSwitch<StringRef>(Name)
+                              .Case("t4", "t0")
+                              .Case("t5", "t1")
+                              .Case("t6", "t2")
+                              .Case("t7", "t3")
+                              .Default("");
+    assert(FixedName != "" &&  "Register name is not one of t4-t7.");
+
+    printWarningWithFixIt("register names $t4-$t7 are only available in O32.",
+                          "Did you mean $" + FixedName + "?", RegRange);
+  }
+
+  // Although SGI documentation just cuts out t0-t3 for n32/n64,
+  // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+  // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+  if (8 <= CC && CC <= 11)
+    CC += 4;
+
+  if (CC == -1)
+    CC = StringSwitch<unsigned>(Name)
+             .Case("a4", 8)
+             .Case("a5", 9)
+             .Case("a6", 10)
+             .Case("a7", 11)
+             .Case("kt0", 26)
+             .Case("kt1", 27)
+             .Default(-1);
+
+  return CC;
+}
+
+int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+            .Case("hwr_cpunum", 0)
+            .Case("hwr_synci_step", 1)
+            .Case("hwr_cc", 2)
+            .Case("hwr_ccres", 3)
+            .Case("hwr_ulr", 29)
+            .Default(-1);
+
+  return CC;
+}
+
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
+
+  if (Name[0] == 'f') {
+    StringRef NumString = Name.substr(1);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;     // This is not an integer.
+    if (IntVal > 31) // Maximum index for fpu register.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
+
+  if (Name.startswith("fcc")) {
+    StringRef NumString = Name.substr(3);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 7) // There are only 8 fcc registers.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+
+  if (Name.startswith("ac")) {
+    StringRef NumString = Name.substr(2);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 3) // There are only 3 acc registers.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
+
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+  unsigned IntVal;
+
+  if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
+    return -1;
+
+  if (IntVal > 31)
+    return -1;
+
+  return IntVal;
+}
+
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+           .Case("msair", 0)
+           .Case("msacsr", 1)
+           .Case("msaaccess", 2)
+           .Case("msasave", 3)
+           .Case("msamodify", 4)
+           .Case("msarequest", 5)
+           .Case("msamap", 6)
+           .Case("msaunmap", 7)
+           .Default(-1);
+
+  return CC;
+}
+
+unsigned MipsAsmParser::getATReg(SMLoc Loc) {
+  unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
+  if (ATIndex == 0) {
+    reportParseError(Loc,
+                     "pseudo-instruction requires $at, which is not available");
+    return 0;
+  }
+  unsigned AT = getReg(
+      (isGP64bit()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, ATIndex);
+  return AT;
+}
+
+unsigned MipsAsmParser::getReg(int RC, int RegNo) {
+  return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
+}
+
+bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseOperand\n");
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  DEBUG(dbgs() << ".. Generic Parser\n");
+
+  switch (getLexer().getKind()) {
+  case AsmToken::Dollar: {
+    // Parse the register.
+    SMLoc S = Parser.getTok().getLoc();
+
+    // Almost all registers have been parsed by custom parsers. There is only
+    // one exception to this. $zero (and it's alias $0) will reach this point
+    // for div, divu, and similar instructions because it is not an operand
+    // to the instruction definition but an explicit register. Special case
+    // this situation for now.
+    if (parseAnyRegister(Operands) != MatchOperand_NoMatch)
+      return false;
+
+    // Maybe it is a symbol reference.
+    StringRef Identifier;
+    if (Parser.parseIdentifier(Identifier))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier);
+    // Otherwise create a symbol reference.
+    const MCExpr *Res =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+    Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
+    return false;
+  }
+  default: {
+    DEBUG(dbgs() << ".. generic integer expression\n");
+
+    const MCExpr *Expr;
+    SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
+    if (getParser().parseExpression(Expr))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    Operands.push_back(MipsOperand::CreateImm(Expr, S, E, *this));
+    return false;
+  }
+  } // switch(getLexer().getKind())
+  return true;
+}
+
+bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
+
+  switch (Expr->getKind()) {
+  case MCExpr::Constant:
+    return true;
+  case MCExpr::SymbolRef:
+    return (cast<MCSymbolRefExpr>(Expr)->getKind() != MCSymbolRefExpr::VK_None);
+  case MCExpr::Binary:
+    if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+      if (!isEvaluated(BE->getLHS()))
+        return false;
+      return isEvaluated(BE->getRHS());
+    }
+  case MCExpr::Unary:
+    return isEvaluated(cast<MCUnaryExpr>(Expr)->getSubExpr());
+  case MCExpr::Target:
+    return true;
+  }
+  return false;
+}
+
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                  SMLoc &EndLoc) {
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
+  OperandMatchResultTy ResTy = parseAnyRegister(Operands);
+  if (ResTy == MatchOperand_Success) {
+    assert(Operands.size() == 1);
+    MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
+    StartLoc = Operand.getStartLoc();
+    EndLoc = Operand.getEndLoc();
+
+    // AFAIK, we only support numeric registers and named GPR's in CFI
+    // directives.
+    // Don't worry about eating tokens before failing. Using an unrecognised
+    // register is a parse error.
+    if (Operand.isGPRAsmReg()) {
+      // Resolve to GPR32 or GPR64 appropriately.
+      RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
+    }
+
+    return (RegNo == (unsigned)-1);
+  }
+
+  assert(Operands.size() == 0);
+  return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
+  SMLoc S;
+
+  if (isParenExpr)
+    return getParser().parseParenExprOfDepth(0, Res, S);
+  return getParser().parseExpression(Res);
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseMemOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseMemOperand\n");
+  const MCExpr *IdVal = nullptr;
+  SMLoc S;
+  bool isParenExpr = false;
+  OperandMatchResultTy Res = MatchOperand_NoMatch;
+  // First operand is the offset.
+  S = Parser.getTok().getLoc();
+
+  if (getLexer().getKind() == AsmToken::LParen) {
+    Parser.Lex();
+    isParenExpr = true;
+  }
+
+  if (getLexer().getKind() != AsmToken::Dollar) {
+    if (parseMemOffset(IdVal, isParenExpr))
+      return MatchOperand_ParseFail;
+
+    const AsmToken &Tok = Parser.getTok(); // Get the next token.
+    if (Tok.isNot(AsmToken::LParen)) {
+      MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]);
+      if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") {
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+        Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this));
+        return MatchOperand_Success;
+      }
+      if (Tok.is(AsmToken::EndOfStatement)) {
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+        // Zero register assumed, add a memory operand with ZERO as its base.
+        // "Base" will be managed by k_Memory.
+        auto Base = MipsOperand::createGPRReg(
+            0, "0", getContext().getRegisterInfo(), S, E, *this);
+        Operands.push_back(
+            MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
+        return MatchOperand_Success;
+      }
+      MCBinaryExpr::Opcode Opcode;
+      // GAS and LLVM treat comparison operators different. GAS will generate -1
+      // or 0, while LLVM will generate 0 or 1. Since a comparsion operator is
+      // highly unlikely to be found in a memory offset expression, we don't
+      // handle them.
+      switch (Tok.getKind()) {
+      case AsmToken::Plus:
+        Opcode = MCBinaryExpr::Add;
+        Parser.Lex();
+        break;
+      case AsmToken::Minus:
+        Opcode = MCBinaryExpr::Sub;
+        Parser.Lex();
+        break;
+      case AsmToken::Star:
+        Opcode = MCBinaryExpr::Mul;
+        Parser.Lex();
+        break;
+      case AsmToken::Pipe:
+        Opcode = MCBinaryExpr::Or;
+        Parser.Lex();
+        break;
+      case AsmToken::Amp:
+        Opcode = MCBinaryExpr::And;
+        Parser.Lex();
+        break;
+      case AsmToken::LessLess:
+        Opcode = MCBinaryExpr::Shl;
+        Parser.Lex();
+        break;
+      case AsmToken::GreaterGreater:
+        Opcode = MCBinaryExpr::LShr;
+        Parser.Lex();
+        break;
+      case AsmToken::Caret:
+        Opcode = MCBinaryExpr::Xor;
+        Parser.Lex();
+        break;
+      case AsmToken::Slash:
+        Opcode = MCBinaryExpr::Div;
+        Parser.Lex();
+        break;
+      case AsmToken::Percent:
+        Opcode = MCBinaryExpr::Mod;
+        Parser.Lex();
+        break;
+      default:
+        Error(Parser.getTok().getLoc(), "'(' or expression expected");
+        return MatchOperand_ParseFail;
+      }
+      const MCExpr * NextExpr;
+      if (getParser().parseExpression(NextExpr))
+        return MatchOperand_ParseFail;
+      IdVal = MCBinaryExpr::create(Opcode, IdVal, NextExpr, getContext());
+    }
+
+    Parser.Lex(); // Eat the '(' token.
+  }
+
+  Res = parseAnyRegister(Operands);
+  if (Res != MatchOperand_Success)
+    return Res;
+
+  if (Parser.getTok().isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "')' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  Parser.Lex(); // Eat the ')' token.
+
+  if (!IdVal)
+    IdVal = MCConstantExpr::create(0, getContext());
+
+  // Replace the register operand with the memory operand.
+  std::unique_ptr<MipsOperand> op(
+      static_cast<MipsOperand *>(Operands.back().release()));
+  // Remove the register from the operands.
+  // "op" will be managed by k_Memory.
+  Operands.pop_back();
+  // Add the memory operand.
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
+    int64_t Imm;
+    if (IdVal->evaluateAsAbsolute(Imm))
+      IdVal = MCConstantExpr::create(Imm, getContext());
+    else if (BE->getLHS()->getKind() != MCExpr::SymbolRef)
+      IdVal = MCBinaryExpr::create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
+                                   getContext());
+  }
+
+  Operands.push_back(MipsOperand::CreateMem(std::move(op), IdVal, S, E, *this));
+  return MatchOperand_Success;
+}
+
+bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  MCSymbol *Sym = getContext().lookupSymbol(Parser.getTok().getIdentifier());
+  if (Sym) {
+    SMLoc S = Parser.getTok().getLoc();
+    const MCExpr *Expr;
+    if (Sym->isVariable())
+      Expr = Sym->getVariableValue();
+    else
+      return false;
+    if (Expr->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+      StringRef DefSymbol = Ref->getSymbol().getName();
+      if (DefSymbol.startswith("$")) {
+        OperandMatchResultTy ResTy =
+            matchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
+        if (ResTy == MatchOperand_Success) {
+          Parser.Lex();
+          return true;
+        } else if (ResTy == MatchOperand_ParseFail)
+          llvm_unreachable("Should never ParseFail");
+        return false;
+      }
+    }
+  }
+  return false;
+}
+
+OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+                                                 StringRef Identifier,
+                                                 SMLoc S) {
+  int Index = matchCPURegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createGPRReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchHWRegsRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createHWRegsReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchFPURegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createFGRReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchFCCRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createFCCReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchACRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createACCReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchMSA128RegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createMSA128Reg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchMSA128CtrlRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createMSACtrlReg(
+        Index, Identifier, getContext().getRegisterInfo(), S,
+        getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+  MCAsmParser &Parser = getParser();
+  auto Token = Parser.getLexer().peekTok(false);
+
+  if (Token.is(AsmToken::Identifier)) {
+    DEBUG(dbgs() << ".. identifier\n");
+    StringRef Identifier = Token.getIdentifier();
+    OperandMatchResultTy ResTy =
+        matchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
+    return ResTy;
+  } else if (Token.is(AsmToken::Integer)) {
+    DEBUG(dbgs() << ".. integer\n");
+    Operands.push_back(MipsOperand::createNumericReg(
+        Token.getIntVal(), Token.getString(), getContext().getRegisterInfo(), S,
+        Token.getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  DEBUG(dbgs() << Parser.getTok().getKind() << "\n");
+
+  return MatchOperand_NoMatch;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseAnyRegister\n");
+
+  auto Token = Parser.getTok();
+
+  SMLoc S = Token.getLoc();
+
+  if (Token.isNot(AsmToken::Dollar)) {
+    DEBUG(dbgs() << ".. !$ -> try sym aliasing\n");
+    if (Token.is(AsmToken::Identifier)) {
+      if (searchSymbolAlias(Operands))
+        return MatchOperand_Success;
+    }
+    DEBUG(dbgs() << ".. !symalias -> NoMatch\n");
+    return MatchOperand_NoMatch;
+  }
+  DEBUG(dbgs() << ".. $\n");
+
+  OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S);
+  if (ResTy == MatchOperand_Success) {
+    Parser.Lex(); // $
+    Parser.Lex(); // identifier
+  }
+  return ResTy;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseJumpTarget\n");
+
+  SMLoc S = getLexer().getLoc();
+
+  // Registers are a valid target and have priority over symbols.
+  OperandMatchResultTy ResTy = parseAnyRegister(Operands);
+  if (ResTy != MatchOperand_NoMatch)
+    return ResTy;
+
+  // Integers and expressions are acceptable
+  const MCExpr *Expr = nullptr;
+  if (Parser.parseExpression(Expr)) {
+    // We have no way of knowing if a symbol was consumed so we must ParseFail
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(
+      MipsOperand::CreateImm(Expr, S, getLexer().getLoc(), *this));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseInvNum(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *IdVal;
+  // If the first token is '$' we may have register operand.
+  if (Parser.getTok().is(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  if (getParser().parseExpression(IdVal))
+    return MatchOperand_ParseFail;
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+  assert(MCE && "Unexpected MCExpr type.");
+  int64_t Val = MCE->getValue();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(
+      MCConstantExpr::create(0 - Val, getContext()), S, E, *this));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<unsigned, 10> Regs;
+  unsigned RegNo;
+  unsigned PrevReg = Mips::NoRegister;
+  bool RegRange = false;
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+  while (parseAnyRegister(TmpOperands) == MatchOperand_Success) {
+    SMLoc E = getLexer().getLoc();
+    MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back());
+    RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg();
+    if (RegRange) {
+      // Remove last register operand because registers from register range
+      // should be inserted first.
+      if ((isGP64bit() && RegNo == Mips::RA_64) ||
+          (!isGP64bit() && RegNo == Mips::RA)) {
+        Regs.push_back(RegNo);
+      } else {
+        unsigned TmpReg = PrevReg + 1;
+        while (TmpReg <= RegNo) {
+          if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) ||
+              (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) &&
+               isGP64bit())) {
+            Error(E, "invalid register operand");
+            return MatchOperand_ParseFail;
+          }
+
+          PrevReg = TmpReg;
+          Regs.push_back(TmpReg++);
+        }
+      }
+
+      RegRange = false;
+    } else {
+      if ((PrevReg == Mips::NoRegister) &&
+          ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) ||
+          (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) {
+        Error(E, "$16 or $31 expected");
+        return MatchOperand_ParseFail;
+      } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA ||
+                    (RegNo >= Mips::S0 && RegNo <= Mips::S7)) &&
+                    !isGP64bit()) ||
+                   ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 ||
+                    (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) &&
+                    isGP64bit()))) {
+        Error(E, "invalid register operand");
+        return MatchOperand_ParseFail;
+      } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) &&
+                 ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) ||
+                  (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 &&
+                   isGP64bit()))) {
+        Error(E, "consecutive register numbers expected");
+        return MatchOperand_ParseFail;
+      }
+
+      Regs.push_back(RegNo);
+    }
+
+    if (Parser.getTok().is(AsmToken::Minus))
+      RegRange = true;
+
+    if (!Parser.getTok().isNot(AsmToken::Minus) &&
+        !Parser.getTok().isNot(AsmToken::Comma)) {
+      Error(E, "',' or '-' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Lex(); // Consume comma or minus
+    if (Parser.getTok().isNot(AsmToken::Dollar))
+      break;
+
+    PrevReg = RegNo;
+  }
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
+  parseMemOperand(Operands);
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseRegisterPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+
+  SMLoc S = Parser.getTok().getLoc();
+  if (parseAnyRegister(Operands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  SMLoc E = Parser.getTok().getLoc();
+  MipsOperand Op = static_cast<MipsOperand &>(*Operands.back());
+
+  Operands.pop_back();
+  Operands.push_back(MipsOperand::CreateRegPair(Op, S, E, *this));
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+MipsAsmParser::parseMovePRegPair(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
+  SmallVector<unsigned, 10> Regs;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (parseAnyRegister(TmpOperands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *Reg = &static_cast<MipsOperand &>(*TmpOperands.back());
+  unsigned RegNo = isGP64bit() ? Reg->getGPR64Reg() : Reg->getGPR32Reg();
+  Regs.push_back(RegNo);
+
+  SMLoc E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(E, "',' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  // Remove comma.
+  Parser.Lex();
+
+  if (parseAnyRegister(TmpOperands) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  Reg = &static_cast<MipsOperand &>(*TmpOperands.back());
+  RegNo = isGP64bit() ? Reg->getGPR64Reg() : Reg->getGPR32Reg();
+  Regs.push_back(RegNo);
+
+  Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
+
+  return MatchOperand_Success;
+}
+
+/// Sometimes (i.e. load/stores) the operand may be followed immediately by
+/// either this.
+/// ::= '(', register, ')'
+/// handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::parseParenSuffix(StringRef Name, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().is(AsmToken::LParen)) {
+    Operands.push_back(
+        MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
+    Parser.Lex();
+    if (parseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (Parser.getTok().isNot(AsmToken::RParen)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token, expected ')'");
+    }
+    Operands.push_back(
+        MipsOperand::CreateToken(")", getLexer().getLoc(), *this));
+    Parser.Lex();
+  }
+  return false;
+}
+
+/// Sometimes (i.e. in MSA) the operand may be followed immediately by
+/// either one of these.
+/// ::= '[', register, ']'
+/// ::= '[', integer, ']'
+/// handle it before we iterate so we don't get tripped up by the lack of
+/// a comma.
+bool MipsAsmParser::parseBracketSuffix(StringRef Name,
+                                       OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().is(AsmToken::LBrac)) {
+    Operands.push_back(
+        MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
+    Parser.Lex();
+    if (parseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token, expected ']'");
+    }
+    Operands.push_back(
+        MipsOperand::CreateToken("]", getLexer().getLoc(), *this));
+    Parser.Lex();
+  }
+  return false;
+}
+
+bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                     SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "ParseInstruction\n");
+
+  // We have reached first instruction, module directive are now forbidden.
+  getTargetStreamer().forbidModuleDirective();
+
+  // Check if we have valid mnemonic
+  if (!mnemonicIsValid(Name, 0)) {
+    return Error(NameLoc, "unknown instruction");
+  }
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token in argument list");
+    }
+    if (getLexer().is(AsmToken::LBrac) && parseBracketSuffix(Name, Operands))
+      return true;
+    // AFAIK, parenthesis suffixes are never on the first operand
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Name)) {
+        SMLoc Loc = getLexer().getLoc();
+        return Error(Loc, "unexpected token in argument list");
+      }
+      // Parse bracket and parenthesis suffixes before we iterate
+      if (getLexer().is(AsmToken::LBrac)) {
+        if (parseBracketSuffix(Name, Operands))
+          return true;
+      } else if (getLexer().is(AsmToken::LParen) &&
+                 parseParenSuffix(Name, Operands))
+        return true;
+    }
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    return Error(Loc, "unexpected token in argument list");
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+// FIXME: Given that these have the same name, these should both be
+// consistent on affecting the Parser.
+bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
+  SMLoc Loc = getLexer().getLoc();
+  return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::reportParseError(SMLoc Loc, Twine ErrorMsg) {
+  return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::parseSetNoAtDirective() {
+  MCAsmParser &Parser = getParser();
+  // Line should look like: ".set noat".
+
+  // Set the $at register to $0.
+  AssemblerOptions.back()->setATRegIndex(0);
+
+  Parser.Lex(); // Eat "noat".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveSetNoAt();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetAtDirective() {
+  // Line can be: ".set at", which sets $at to $1
+  //          or  ".set at=$reg", which sets $at to $reg.
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "at".
+
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    // No register was specified, so we set $at to $1.
+    AssemblerOptions.back()->setATRegIndex(1);
+
+    getTargetStreamer().emitDirectiveSetAt();
+    Parser.Lex(); // Consume the EndOfStatement.
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign");
+    return false;
+  }
+  Parser.Lex(); // Eat "=".
+
+  if (getLexer().isNot(AsmToken::Dollar)) {
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      reportParseError("no register specified");
+      return false;
+    } else {
+      reportParseError("unexpected token, expected dollar sign '$'");
+      return false;
+    }
+  }
+  Parser.Lex(); // Eat "$".
+
+  // Find out what "reg" is.
+  unsigned AtRegNo;
+  const AsmToken &Reg = Parser.getTok();
+  if (Reg.is(AsmToken::Identifier)) {
+    AtRegNo = matchCPURegisterName(Reg.getIdentifier());
+  } else if (Reg.is(AsmToken::Integer)) {
+    AtRegNo = Reg.getIntVal();
+  } else {
+    reportParseError("unexpected token, expected identifier or integer");
+    return false;
+  }
+
+  // Check if $reg is a valid register. If it is, set $at to $reg.
+  if (!AssemblerOptions.back()->setATRegIndex(AtRegNo)) {
+    reportParseError("invalid register");
+    return false;
+  }
+  Parser.Lex(); // Eat "reg".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveSetAtWithArg(AtRegNo);
+
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetReorderDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  AssemblerOptions.back()->setReorder();
+  getTargetStreamer().emitDirectiveSetReorder();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoReorderDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  AssemblerOptions.back()->setNoReorder();
+  getTargetStreamer().emitDirectiveSetNoReorder();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetMacroDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  AssemblerOptions.back()->setMacro();
+  getTargetStreamer().emitDirectiveSetMacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMacroDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  if (AssemblerOptions.back()->isReorder()) {
+    reportParseError("`noreorder' must be set before `nomacro'");
+    return false;
+  }
+  AssemblerOptions.back()->setNoMacro();
+  getTargetStreamer().emitDirectiveSetNoMacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  setFeatureBits(Mips::FeatureMSA, "msa");
+  getTargetStreamer().emitDirectiveSetMsa();
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  clearFeatureBits(Mips::FeatureMSA, "msa");
+  getTargetStreamer().emitDirectiveSetNoMsa();
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoDspDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nodsp".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  clearFeatureBits(Mips::FeatureDSP, "dsp");
+  getTargetStreamer().emitDirectiveSetNoDsp();
+  return false;
+}
+
+bool MipsAsmParser::parseSetMips16Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "mips16".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  setFeatureBits(Mips::FeatureMips16, "mips16");
+  getTargetStreamer().emitDirectiveSetMips16();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMips16Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nomips16".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  clearFeatureBits(Mips::FeatureMips16, "mips16");
+  getTargetStreamer().emitDirectiveSetNoMips16();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetFpDirective() {
+  MCAsmParser &Parser = getParser();
+  MipsABIFlagsSection::FpABIKind FpAbiVal;
+  // Line can be: .set fp=32
+  //              .set fp=xx
+  //              .set fp=64
+  Parser.Lex(); // Eat fp token
+  AsmToken Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign '='");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+  Tok = Parser.getTok();
+
+  if (!parseFpABIValue(FpAbiVal, ".set"))
+    return false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetOddSPRegDirective() {
+  MCAsmParser &Parser = getParser();
+
+  Parser.Lex(); // Eat "oddspreg".
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  getTargetStreamer().emitDirectiveSetOddSPReg();
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoOddSPRegDirective() {
+  MCAsmParser &Parser = getParser();
+
+  Parser.Lex(); // Eat "nooddspreg".
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+  getTargetStreamer().emitDirectiveSetNoOddSPReg();
+  return false;
+}
+
+bool MipsAsmParser::parseSetPopDirective() {
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = getLexer().getLoc();
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Always keep an element on the options "stack" to prevent the user
+  // from changing the initial options. This is how we remember them.
+  if (AssemblerOptions.size() == 2)
+    return reportParseError(Loc, ".set pop with no .set push");
+
+  MCSubtargetInfo &STI = copySTI();
+  AssemblerOptions.pop_back();
+  setAvailableFeatures(
+      ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures()));
+  STI.setFeatureBits(AssemblerOptions.back()->getFeatures());
+
+  getTargetStreamer().emitDirectiveSetPop();
+  return false;
+}
+
+bool MipsAsmParser::parseSetPushDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Create a copy of the current assembler options environment and push it.
+  AssemblerOptions.push_back(
+              make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+
+  getTargetStreamer().emitDirectiveSetPush();
+  return false;
+}
+
+bool MipsAsmParser::parseSetSoftFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  setFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+  getTargetStreamer().emitDirectiveSetSoftFloat();
+  return false;
+}
+
+bool MipsAsmParser::parseSetHardFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  clearFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+  getTargetStreamer().emitDirectiveSetHardFloat();
+  return false;
+}
+
+bool MipsAsmParser::parseSetAssignment() {
+  StringRef Name;
+  const MCExpr *Value;
+  MCAsmParser &Parser = getParser();
+
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier after .set");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return reportParseError("unexpected token, expected comma");
+  Lex(); // Eat comma
+
+  if (Parser.parseExpression(Value))
+    return reportParseError("expected valid expression after comma");
+
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+  Sym->setVariableValue(Value);
+
+  return false;
+}
+
+bool MipsAsmParser::parseSetMips0Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Reset assembler options to their initial values.
+  MCSubtargetInfo &STI = copySTI();
+  setAvailableFeatures(
+      ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures()));
+  STI.setFeatureBits(AssemblerOptions.front()->getFeatures());
+  AssemblerOptions.back()->setFeatures(AssemblerOptions.front()->getFeatures());
+
+  getTargetStreamer().emitDirectiveSetMips0();
+  return false;
+}
+
+bool MipsAsmParser::parseSetArchDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Equal))
+    return reportParseError("unexpected token, expected equals sign");
+
+  Parser.Lex();
+  StringRef Arch;
+  if (Parser.parseIdentifier(Arch))
+    return reportParseError("expected arch identifier");
+
+  StringRef ArchFeatureName =
+      StringSwitch<StringRef>(Arch)
+          .Case("mips1", "mips1")
+          .Case("mips2", "mips2")
+          .Case("mips3", "mips3")
+          .Case("mips4", "mips4")
+          .Case("mips5", "mips5")
+          .Case("mips32", "mips32")
+          .Case("mips32r2", "mips32r2")
+          .Case("mips32r3", "mips32r3")
+          .Case("mips32r5", "mips32r5")
+          .Case("mips32r6", "mips32r6")
+          .Case("mips64", "mips64")
+          .Case("mips64r2", "mips64r2")
+          .Case("mips64r3", "mips64r3")
+          .Case("mips64r5", "mips64r5")
+          .Case("mips64r6", "mips64r6")
+          .Case("octeon", "cnmips")
+          .Case("r4000", "mips3") // This is an implementation of Mips3.
+          .Default("");
+
+  if (ArchFeatureName.empty())
+    return reportParseError("unsupported architecture");
+
+  selectArch(ArchFeatureName);
+  getTargetStreamer().emitDirectiveSetArch(Arch);
+  return false;
+}
+
+bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  switch (Feature) {
+  default:
+    llvm_unreachable("Unimplemented feature");
+  case Mips::FeatureDSP:
+    setFeatureBits(Mips::FeatureDSP, "dsp");
+    getTargetStreamer().emitDirectiveSetDsp();
+    break;
+  case Mips::FeatureMicroMips:
+    setFeatureBits(Mips::FeatureMicroMips, "micromips");
+    getTargetStreamer().emitDirectiveSetMicroMips();
+    break;
+  case Mips::FeatureMips1:
+    selectArch("mips1");
+    getTargetStreamer().emitDirectiveSetMips1();
+    break;
+  case Mips::FeatureMips2:
+    selectArch("mips2");
+    getTargetStreamer().emitDirectiveSetMips2();
+    break;
+  case Mips::FeatureMips3:
+    selectArch("mips3");
+    getTargetStreamer().emitDirectiveSetMips3();
+    break;
+  case Mips::FeatureMips4:
+    selectArch("mips4");
+    getTargetStreamer().emitDirectiveSetMips4();
+    break;
+  case Mips::FeatureMips5:
+    selectArch("mips5");
+    getTargetStreamer().emitDirectiveSetMips5();
+    break;
+  case Mips::FeatureMips32:
+    selectArch("mips32");
+    getTargetStreamer().emitDirectiveSetMips32();
+    break;
+  case Mips::FeatureMips32r2:
+    selectArch("mips32r2");
+    getTargetStreamer().emitDirectiveSetMips32R2();
+    break;
+  case Mips::FeatureMips32r3:
+    selectArch("mips32r3");
+    getTargetStreamer().emitDirectiveSetMips32R3();
+    break;
+  case Mips::FeatureMips32r5:
+    selectArch("mips32r5");
+    getTargetStreamer().emitDirectiveSetMips32R5();
+    break;
+  case Mips::FeatureMips32r6:
+    selectArch("mips32r6");
+    getTargetStreamer().emitDirectiveSetMips32R6();
+    break;
+  case Mips::FeatureMips64:
+    selectArch("mips64");
+    getTargetStreamer().emitDirectiveSetMips64();
+    break;
+  case Mips::FeatureMips64r2:
+    selectArch("mips64r2");
+    getTargetStreamer().emitDirectiveSetMips64R2();
+    break;
+  case Mips::FeatureMips64r3:
+    selectArch("mips64r3");
+    getTargetStreamer().emitDirectiveSetMips64R3();
+    break;
+  case Mips::FeatureMips64r5:
+    selectArch("mips64r5");
+    getTargetStreamer().emitDirectiveSetMips64R5();
+    break;
+  case Mips::FeatureMips64r6:
+    selectArch("mips64r6");
+    getTargetStreamer().emitDirectiveSetMips64R6();
+    break;
+  }
+  return false;
+}
+
+bool MipsAsmParser::eatComma(StringRef ErrorStr) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::Comma)) {
+    SMLoc Loc = getLexer().getLoc();
+    return Error(Loc, ErrorStr);
+  }
+
+  Parser.Lex(); // Eat the comma.
+  return true;
+}
+
+// Used to determine if .cpload, .cprestore, and .cpsetup have any effect.
+// In this class, it is only used for .cprestore.
+// FIXME: Only keep track of IsPicEnabled in one place, instead of in both
+// MipsTargetELFStreamer and MipsAsmParser.
+bool MipsAsmParser::isPicAndNotNxxAbi() {
+  return inPicMode() && !(isABI_N32() || isABI_N64());
+}
+
+bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
+  if (AssemblerOptions.back()->isReorder())
+    Warning(Loc, ".cpload should be inside a noreorder section");
+
+  if (inMips16Mode()) {
+    reportParseError(".cpload is not supported in Mips16 mode");
+    return false;
+  }
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
+  OperandMatchResultTy ResTy = parseAnyRegister(Reg);
+  if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+  if (!RegOpnd.isGPRAsmReg()) {
+    reportParseError(RegOpnd.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveCpLoad(RegOpnd.getGPR32Reg());
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) {
+  MCAsmParser &Parser = getParser();
+
+  // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
+  // is used in non-PIC mode.
+
+  if (inMips16Mode()) {
+    reportParseError(".cprestore is not supported in Mips16 mode");
+    return false;
+  }
+
+  // Get the stack offset value.
+  const MCExpr *StackOffset;
+  int64_t StackOffsetVal;
+  if (Parser.parseExpression(StackOffset)) {
+    reportParseError("expected stack offset value");
+    return false;
+  }
+
+  if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) {
+    reportParseError("stack offset is not an absolute expression");
+    return false;
+  }
+
+  if (StackOffsetVal < 0) {
+    Warning(Loc, ".cprestore with negative stack offset has no effect");
+    IsCpRestoreSet = false;
+  } else {
+    IsCpRestoreSet = true;
+    CpRestoreOffset = StackOffsetVal;
+  }
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  if (!getTargetStreamer().emitDirectiveCpRestore(
+          CpRestoreOffset, [&]() { return getATReg(Loc); }, Loc, STI))
+    return true;
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveCPSetup() {
+  MCAsmParser &Parser = getParser();
+  unsigned FuncReg;
+  unsigned Save;
+  bool SaveIsReg = true;
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+  OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
+  if (ResTy == MatchOperand_NoMatch) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand &FuncRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+  if (!FuncRegOpnd.isGPRAsmReg()) {
+    reportParseError(FuncRegOpnd.getStartLoc(), "invalid register");
+    return false;
+  }
+
+  FuncReg = FuncRegOpnd.getGPR32Reg();
+  TmpReg.clear();
+
+  if (!eatComma("unexpected token, expected comma"))
+    return true;
+
+  ResTy = parseAnyRegister(TmpReg);
+  if (ResTy == MatchOperand_NoMatch) {
+    const MCExpr *OffsetExpr;
+    int64_t OffsetVal;
+    SMLoc ExprLoc = getLexer().getLoc();
+
+    if (Parser.parseExpression(OffsetExpr) ||
+        !OffsetExpr->evaluateAsAbsolute(OffsetVal)) {
+      reportParseError(ExprLoc, "expected save register or stack offset");
+      return false;
+    }
+
+    Save = OffsetVal;
+    SaveIsReg = false;
+  } else {
+    MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!SaveOpnd.isGPRAsmReg()) {
+      reportParseError(SaveOpnd.getStartLoc(), "invalid register");
+      return false;
+    }
+    Save = SaveOpnd.getGPR32Reg();
+  }
+
+  if (!eatComma("unexpected token, expected comma"))
+    return true;
+
+  const MCExpr *Expr;
+  if (Parser.parseExpression(Expr)) {
+    reportParseError("expected expression");
+    return false;
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef) {
+    reportParseError("expected symbol");
+    return false;
+  }
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+
+  CpSaveLocation = Save;
+  CpSaveLocationIsRegister = SaveIsReg;
+
+  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(),
+                                           SaveIsReg);
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveCPReturn() {
+  getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation,
+                                            CpSaveLocationIsRegister);
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveNaN() {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    const AsmToken &Tok = Parser.getTok();
+
+    if (Tok.getString() == "2008") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaN2008();
+      return false;
+    } else if (Tok.getString() == "legacy") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaNLegacy();
+      return false;
+    }
+  }
+  // If we don't recognize the option passed to the .nan
+  // directive (e.g. no option or unknown option), emit an error.
+  reportParseError("invalid option in .nan directive");
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveSet() {
+  MCAsmParser &Parser = getParser();
+  // Get the next token.
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.getString() == "noat") {
+    return parseSetNoAtDirective();
+  } else if (Tok.getString() == "at") {
+    return parseSetAtDirective();
+  } else if (Tok.getString() == "arch") {
+    return parseSetArchDirective();
+  } else if (Tok.getString() == "fp") {
+    return parseSetFpDirective();
+  } else if (Tok.getString() == "oddspreg") {
+    return parseSetOddSPRegDirective();
+  } else if (Tok.getString() == "nooddspreg") {
+    return parseSetNoOddSPRegDirective();
+  } else if (Tok.getString() == "pop") {
+    return parseSetPopDirective();
+  } else if (Tok.getString() == "push") {
+    return parseSetPushDirective();
+  } else if (Tok.getString() == "reorder") {
+    return parseSetReorderDirective();
+  } else if (Tok.getString() == "noreorder") {
+    return parseSetNoReorderDirective();
+  } else if (Tok.getString() == "macro") {
+    return parseSetMacroDirective();
+  } else if (Tok.getString() == "nomacro") {
+    return parseSetNoMacroDirective();
+  } else if (Tok.getString() == "mips16") {
+    return parseSetMips16Directive();
+  } else if (Tok.getString() == "nomips16") {
+    return parseSetNoMips16Directive();
+  } else if (Tok.getString() == "nomicromips") {
+    clearFeatureBits(Mips::FeatureMicroMips, "micromips");
+    getTargetStreamer().emitDirectiveSetNoMicroMips();
+    Parser.eatToEndOfStatement();
+    return false;
+  } else if (Tok.getString() == "micromips") {
+    return parseSetFeature(Mips::FeatureMicroMips);
+  } else if (Tok.getString() == "mips0") {
+    return parseSetMips0Directive();
+  } else if (Tok.getString() == "mips1") {
+    return parseSetFeature(Mips::FeatureMips1);
+  } else if (Tok.getString() == "mips2") {
+    return parseSetFeature(Mips::FeatureMips2);
+  } else if (Tok.getString() == "mips3") {
+    return parseSetFeature(Mips::FeatureMips3);
+  } else if (Tok.getString() == "mips4") {
+    return parseSetFeature(Mips::FeatureMips4);
+  } else if (Tok.getString() == "mips5") {
+    return parseSetFeature(Mips::FeatureMips5);
+  } else if (Tok.getString() == "mips32") {
+    return parseSetFeature(Mips::FeatureMips32);
+  } else if (Tok.getString() == "mips32r2") {
+    return parseSetFeature(Mips::FeatureMips32r2);
+  } else if (Tok.getString() == "mips32r3") {
+    return parseSetFeature(Mips::FeatureMips32r3);
+  } else if (Tok.getString() == "mips32r5") {
+    return parseSetFeature(Mips::FeatureMips32r5);
+  } else if (Tok.getString() == "mips32r6") {
+    return parseSetFeature(Mips::FeatureMips32r6);
+  } else if (Tok.getString() == "mips64") {
+    return parseSetFeature(Mips::FeatureMips64);
+  } else if (Tok.getString() == "mips64r2") {
+    return parseSetFeature(Mips::FeatureMips64r2);
+  } else if (Tok.getString() == "mips64r3") {
+    return parseSetFeature(Mips::FeatureMips64r3);
+  } else if (Tok.getString() == "mips64r5") {
+    return parseSetFeature(Mips::FeatureMips64r5);
+  } else if (Tok.getString() == "mips64r6") {
+    return parseSetFeature(Mips::FeatureMips64r6);
+  } else if (Tok.getString() == "dsp") {
+    return parseSetFeature(Mips::FeatureDSP);
+  } else if (Tok.getString() == "nodsp") {
+    return parseSetNoDspDirective();
+  } else if (Tok.getString() == "msa") {
+    return parseSetMsaDirective();
+  } else if (Tok.getString() == "nomsa") {
+    return parseSetNoMsaDirective();
+  } else if (Tok.getString() == "softfloat") {
+    return parseSetSoftFloatDirective();
+  } else if (Tok.getString() == "hardfloat") {
+    return parseSetHardFloatDirective();
+  } else {
+    // It is just an identifier, look for an assignment.
+    parseSetAssignment();
+    return false;
+  }
+
+  return true;
+}
+
+/// parseDataDirective
+///  ::= .word [ expression (, expression)* ]
+bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token, expected comma");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveGpWord
+///  ::= .gpword local_sym
+bool MipsAsmParser::parseDirectiveGpWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitGPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(), 
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveGpDWord
+///  ::= .gpdword local_sym
+bool MipsAsmParser::parseDirectiveGpDWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitGPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel64Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveDtpRelWord
+///  ::= .dtprelword tls_sym
+bool MipsAsmParser::parseDirectiveDtpRelWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitDTPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitDTPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveDtpRelDWord
+///  ::= .dtpreldword tls_sym
+bool MipsAsmParser::parseDirectiveDtpRelDWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitDTPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitDTPRel64Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveTpRelWord
+///  ::= .tprelword tls_sym
+bool MipsAsmParser::parseDirectiveTpRelWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitTPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitTPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveTpRelDWord
+///  ::= .tpreldword tls_sym
+bool MipsAsmParser::parseDirectiveTpRelDWord() {
+  MCAsmParser &Parser = getParser();
+  const MCExpr *Value;
+  // EmitTPRel64Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitTPRel64Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                "unexpected token, expected end of statement");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveOption() {
+  MCAsmParser &Parser = getParser();
+  // Get the option token.
+  AsmToken Tok = Parser.getTok();
+  // At the moment only identifiers are supported.
+  if (Tok.isNot(AsmToken::Identifier)) {
+    return Error(Parser.getTok().getLoc(),
+                 "unexpected token, expected identifier");
+  }
+
+  StringRef Option = Tok.getIdentifier();
+
+  if (Option == "pic0") {
+    // MipsAsmParser needs to know if the current PIC mode changes.
+    IsPicEnabled = false;
+
+    getTargetStreamer().emitDirectiveOptionPic0();
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+    }
+    return false;
+  }
+
+  if (Option == "pic2") {
+    // MipsAsmParser needs to know if the current PIC mode changes.
+    IsPicEnabled = true;
+
+    getTargetStreamer().emitDirectiveOptionPic2();
+    Parser.Lex();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      return Error(Parser.getTok().getLoc(),
+                   "unexpected token, expected end of statement");
+    }
+    return false;
+  }
+
+  // Unknown option.
+  Warning(Parser.getTok().getLoc(), 
+          "unknown option, expected 'pic0' or 'pic2'");
+  Parser.eatToEndOfStatement();
+  return false;
+}
+
+/// parseInsnDirective
+///  ::= .insn
+bool MipsAsmParser::parseInsnDirective() {
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  // The actual label marking happens in
+  // MipsELFStreamer::createPendingLabelRelocs().
+  getTargetStreamer().emitDirectiveInsn();
+
+  getParser().Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseSSectionDirective
+///  ::= .sbss
+///  ::= .sdata
+bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  MCSection *ELFSection = getContext().getELFSection(
+      Section, Type, ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
+  getParser().getStreamer().SwitchSection(ELFSection);
+
+  getParser().Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+/// parseDirectiveModule
+///  ::= .module oddspreg
+///  ::= .module nooddspreg
+///  ::= .module fp=value
+///  ::= .module softfloat
+///  ::= .module hardfloat
+bool MipsAsmParser::parseDirectiveModule() {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  SMLoc L = Lexer.getLoc();
+
+  if (!getTargetStreamer().isModuleDirectiveAllowed()) {
+    // TODO : get a better message.
+    reportParseError(".module directive must appear before any code");
+    return false;
+  }
+
+  StringRef Option;
+  if (Parser.parseIdentifier(Option)) {
+    reportParseError("expected .module option identifier");
+    return false;
+  }
+
+  if (Option == "oddspreg") {
+    clearModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+    // Synchronize the abiflags information with the FeatureBits information we
+    // changed above.
+    getTargetStreamer().updateABIInfo(*this);
+
+    // If printing assembly, use the recently updated abiflags information.
+    // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+    // emitted at the end).
+    getTargetStreamer().emitDirectiveModuleOddSPReg();
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "nooddspreg") {
+    if (!isABI_O32()) {
+      return Error(L, "'.module nooddspreg' requires the O32 ABI");
+    }
+
+    setModuleFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
+
+    // Synchronize the abiflags information with the FeatureBits information we
+    // changed above.
+    getTargetStreamer().updateABIInfo(*this);
+
+    // If printing assembly, use the recently updated abiflags information.
+    // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+    // emitted at the end).
+    getTargetStreamer().emitDirectiveModuleOddSPReg();
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "fp") {
+    return parseDirectiveModuleFP();
+  } else if (Option == "softfloat") {
+    setModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+
+    // Synchronize the ABI Flags information with the FeatureBits information we
+    // updated above.
+    getTargetStreamer().updateABIInfo(*this);
+
+    // If printing assembly, use the recently updated ABI Flags information.
+    // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+    // emitted later).
+    getTargetStreamer().emitDirectiveModuleSoftFloat();
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    return false; // parseDirectiveModule has finished successfully.
+  } else if (Option == "hardfloat") {
+    clearModuleFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+
+    // Synchronize the ABI Flags information with the FeatureBits information we
+    // updated above.
+    getTargetStreamer().updateABIInfo(*this);
+
+    // If printing assembly, use the recently updated ABI Flags information.
+    // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+    // emitted later).
+    getTargetStreamer().emitDirectiveModuleHardFloat();
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    return false; // parseDirectiveModule has finished successfully.
+  } else {
+    return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
+  }
+}
+
+/// parseDirectiveModuleFP
+///  ::= =32
+///  ::= =xx
+///  ::= =64
+bool MipsAsmParser::parseDirectiveModuleFP() {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+
+  if (Lexer.isNot(AsmToken::Equal)) {
+    reportParseError("unexpected token, expected equals sign '='");
+    return false;
+  }
+  Parser.Lex(); // Eat '=' token.
+
+  MipsABIFlagsSection::FpABIKind FpABI;
+  if (!parseFpABIValue(FpABI, ".module"))
+    return false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  // Synchronize the abiflags information with the FeatureBits information we
+  // changed above.
+  getTargetStreamer().updateABIInfo(*this);
+
+  // If printing assembly, use the recently updated abiflags information.
+  // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+  // emitted at the end).
+  getTargetStreamer().emitDirectiveModuleFP();
+
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
+                                    StringRef Directive) {
+  MCAsmParser &Parser = getParser();
+  MCAsmLexer &Lexer = getLexer();
+  bool ModuleLevelOptions = Directive == ".module";
+
+  if (Lexer.is(AsmToken::Identifier)) {
+    StringRef Value = Parser.getTok().getString();
+    Parser.Lex();
+
+    if (Value != "xx") {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (!isABI_O32()) {
+      reportParseError("'" + Directive + " fp=xx' requires the O32 ABI");
+      return false;
+    }
+
+    FpABI = MipsABIFlagsSection::FpABIKind::XX;
+    if (ModuleLevelOptions) {
+      setModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+      clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+    } else {
+      setFeatureBits(Mips::FeatureFPXX, "fpxx");
+      clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
+    }
+    return true;
+  }
+
+  if (Lexer.is(AsmToken::Integer)) {
+    unsigned Value = Parser.getTok().getIntVal();
+    Parser.Lex();
+
+    if (Value != 32 && Value != 64) {
+      reportParseError("unsupported value, expected 'xx', '32' or '64'");
+      return false;
+    }
+
+    if (Value == 32) {
+      if (!isABI_O32()) {
+        reportParseError("'" + Directive + " fp=32' requires the O32 ABI");
+        return false;
+      }
+
+      FpABI = MipsABIFlagsSection::FpABIKind::S32;
+      if (ModuleLevelOptions) {
+        clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+        clearModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      } else {
+        clearFeatureBits(Mips::FeatureFPXX, "fpxx");
+        clearFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      }
+    } else {
+      FpABI = MipsABIFlagsSection::FpABIKind::S64;
+      if (ModuleLevelOptions) {
+        clearModuleFeatureBits(Mips::FeatureFPXX, "fpxx");
+        setModuleFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      } else {
+        clearFeatureBits(Mips::FeatureFPXX, "fpxx");
+        setFeatureBits(Mips::FeatureFP64Bit, "fp64");
+      }
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+  // This returns false if this function recognizes the directive
+  // regardless of whether it is successfully handles or reports an
+  // error. Otherwise it returns true to give the generic parser a
+  // chance at recognizing it.
+
+  MCAsmParser &Parser = getParser();
+  StringRef IDVal = DirectiveID.getString();
+
+  if (IDVal == ".cpload") {
+    parseDirectiveCpLoad(DirectiveID.getLoc());
+    return false;
+  }
+  if (IDVal == ".cprestore") {
+    parseDirectiveCpRestore(DirectiveID.getLoc());
+    return false;
+  }
+  if (IDVal == ".dword") {
+    parseDataDirective(8, DirectiveID.getLoc());
+    return false;
+  }
+  if (IDVal == ".ent") {
+    StringRef SymbolName;
+
+    if (Parser.parseIdentifier(SymbolName)) {
+      reportParseError("expected identifier after .ent");
+      return false;
+    }
+
+    // There's an undocumented extension that allows an integer to
+    // follow the name of the procedure which AFAICS is ignored by GAS.
+    // Example: .ent foo,2
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (getLexer().isNot(AsmToken::Comma)) {
+        // Even though we accept this undocumented extension for compatibility
+        // reasons, the additional integer argument does not actually change
+        // the behaviour of the '.ent' directive, so we would like to discourage
+        // its use. We do this by not referring to the extended version in
+        // error messages which are not directly related to its use.
+        reportParseError("unexpected token, expected end of statement");
+        return false;
+      }
+      Parser.Lex(); // Eat the comma.
+      const MCExpr *DummyNumber;
+      int64_t DummyNumberVal;
+      // If the user was explicitly trying to use the extended version,
+      // we still give helpful extension-related error messages.
+      if (Parser.parseExpression(DummyNumber)) {
+        reportParseError("expected number after comma");
+        return false;
+      }
+      if (!DummyNumber->evaluateAsAbsolute(DummyNumberVal)) {
+        reportParseError("expected an absolute expression after comma");
+        return false;
+      }
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
+
+    getTargetStreamer().emitDirectiveEnt(*Sym);
+    CurrentFn = Sym;
+    IsCpRestoreSet = false;
+    return false;
+  }
+
+  if (IDVal == ".end") {
+    StringRef SymbolName;
+
+    if (Parser.parseIdentifier(SymbolName)) {
+      reportParseError("expected identifier after .end");
+      return false;
+    }
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    if (CurrentFn == nullptr) {
+      reportParseError(".end used without .ent");
+      return false;
+    }
+
+    if ((SymbolName != CurrentFn->getName())) {
+      reportParseError(".end symbol does not match .ent symbol");
+      return false;
+    }
+
+    getTargetStreamer().emitDirectiveEnd(SymbolName);
+    CurrentFn = nullptr;
+    IsCpRestoreSet = false;
+    return false;
+  }
+
+  if (IDVal == ".frame") {
+    // .frame $stack_reg, frame_size_in_bytes, $return_reg
+    SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+    OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
+    if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+      reportParseError("expected stack register");
+      return false;
+    }
+
+    MipsOperand &StackRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!StackRegOpnd.isGPRAsmReg()) {
+      reportParseError(StackRegOpnd.getStartLoc(),
+                       "expected general purpose register");
+      return false;
+    }
+    unsigned StackReg = StackRegOpnd.getGPR32Reg();
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the frame size.
+    const MCExpr *FrameSize;
+    int64_t FrameSizeVal;
+
+    if (Parser.parseExpression(FrameSize)) {
+      reportParseError("expected frame size value");
+      return false;
+    }
+
+    if (!FrameSize->evaluateAsAbsolute(FrameSizeVal)) {
+      reportParseError("frame size not an absolute expression");
+      return false;
+    }
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the return register.
+    TmpReg.clear();
+    ResTy = parseAnyRegister(TmpReg);
+    if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+      reportParseError("expected return register");
+      return false;
+    }
+
+    MipsOperand &ReturnRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!ReturnRegOpnd.isGPRAsmReg()) {
+      reportParseError(ReturnRegOpnd.getStartLoc(),
+                       "expected general purpose register");
+      return false;
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    getTargetStreamer().emitFrame(StackReg, FrameSizeVal,
+                                  ReturnRegOpnd.getGPR32Reg());
+    IsCpRestoreSet = false;
+    return false;
+  }
+
+  if (IDVal == ".set") {
+    parseDirectiveSet();
+    return false;
+  }
+
+  if (IDVal == ".mask" || IDVal == ".fmask") {
+    // .mask bitmask, frame_offset
+    // bitmask: One bit for each register used.
+    // frame_offset: Offset from Canonical Frame Address ($sp on entry) where
+    //               first register is expected to be saved.
+    // Examples:
+    //   .mask 0x80000000, -4
+    //   .fmask 0x80000000, -4
+    //
+
+    // Parse the bitmask
+    const MCExpr *BitMask;
+    int64_t BitMaskVal;
+
+    if (Parser.parseExpression(BitMask)) {
+      reportParseError("expected bitmask value");
+      return false;
+    }
+
+    if (!BitMask->evaluateAsAbsolute(BitMaskVal)) {
+      reportParseError("bitmask not an absolute expression");
+      return false;
+    }
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the frame_offset
+    const MCExpr *FrameOffset;
+    int64_t FrameOffsetVal;
+
+    if (Parser.parseExpression(FrameOffset)) {
+      reportParseError("expected frame offset value");
+      return false;
+    }
+
+    if (!FrameOffset->evaluateAsAbsolute(FrameOffsetVal)) {
+      reportParseError("frame offset not an absolute expression");
+      return false;
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    if (IDVal == ".mask")
+      getTargetStreamer().emitMask(BitMaskVal, FrameOffsetVal);
+    else
+      getTargetStreamer().emitFMask(BitMaskVal, FrameOffsetVal);
+    return false;
+  }
+
+  if (IDVal == ".nan")
+    return parseDirectiveNaN();
+
+  if (IDVal == ".gpword") {
+    parseDirectiveGpWord();
+    return false;
+  }
+
+  if (IDVal == ".gpdword") {
+    parseDirectiveGpDWord();
+    return false;
+  }
+
+  if (IDVal == ".dtprelword") {
+    parseDirectiveDtpRelWord();
+    return false;
+  }
+
+  if (IDVal == ".dtpreldword") {
+    parseDirectiveDtpRelDWord();
+    return false;
+  }
+
+  if (IDVal == ".tprelword") {
+    parseDirectiveTpRelWord();
+    return false;
+  }
+
+  if (IDVal == ".tpreldword") {
+    parseDirectiveTpRelDWord();
+    return false;
+  }
+
+  if (IDVal == ".word") {
+    parseDataDirective(4, DirectiveID.getLoc());
+    return false;
+  }
+
+  if (IDVal == ".hword") {
+    parseDataDirective(2, DirectiveID.getLoc());
+    return false;
+  }
+
+  if (IDVal == ".option") {
+    parseDirectiveOption();
+    return false;
+  }
+
+  if (IDVal == ".abicalls") {
+    getTargetStreamer().emitDirectiveAbiCalls();
+    if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+      Error(Parser.getTok().getLoc(), 
+            "unexpected token, expected end of statement");
+    }
+    return false;
+  }
+
+  if (IDVal == ".cpsetup") {
+    parseDirectiveCPSetup();
+    return false;
+  }
+  if (IDVal == ".cpreturn") {
+    parseDirectiveCPReturn();
+    return false;
+  }
+  if (IDVal == ".module") {
+    parseDirectiveModule();
+    return false;
+  }
+  if (IDVal == ".llvm_internal_mips_reallow_module_directive") {
+    parseInternalDirectiveReallowModule();
+    return false;
+  }
+  if (IDVal == ".insn") {
+    parseInsnDirective();
+    return false;
+  }
+  if (IDVal == ".sbss") {
+    parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
+    return false;
+  }
+  if (IDVal == ".sdata") {
+    parseSSectionDirective(IDVal, ELF::SHT_PROGBITS);
+    return false;
+  }
+
+  return true;
+}
+
+bool MipsAsmParser::parseInternalDirectiveReallowModule() {
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().reallowModuleDirective();
+
+  getParser().Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
+extern "C" void LLVMInitializeMipsAsmParser() {
+  RegisterMCAsmParser<MipsAsmParser> X(getTheMipsTarget());
+  RegisterMCAsmParser<MipsAsmParser> Y(getTheMipselTarget());
+  RegisterMCAsmParser<MipsAsmParser> A(getTheMips64Target());
+  RegisterMCAsmParser<MipsAsmParser> B(getTheMips64elTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
new file mode 100644
index 000000000000..f80efb18507b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -0,0 +1,2505 @@
+//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Mips Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+class MipsDisassembler : public MCDisassembler {
+  bool IsMicroMips;
+  bool IsBigEndian;
+public:
+  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
+      : MCDisassembler(STI, Ctx),
+        IsMicroMips(STI.getFeatureBits()[Mips::FeatureMicroMips]),
+        IsBigEndian(IsBigEndian) {}
+
+  bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
+  bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
+  bool hasMips32() const { return STI.getFeatureBits()[Mips::FeatureMips32]; }
+  bool hasMips32r6() const {
+    return STI.getFeatureBits()[Mips::FeatureMips32r6];
+  }
+  bool isFP64() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
+
+  bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
+
+  bool isPTR64() const { return STI.getFeatureBits()[Mips::FeaturePTR64Bit]; }
+
+  bool hasCnMips() const { return STI.getFeatureBits()[Mips::FeatureCnMips]; }
+
+  bool hasCOP3() const {
+    // Only present in MIPS-I and MIPS-II
+    return !hasMips32() && !hasMips3();
+  }
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder);
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst,
+                                              unsigned Offset,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeBranchTarget7MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst,
+                                          unsigned Offset,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+// DecodeBranchTarget10MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
+static DecodeStatus DecodeMemEVA(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
+
+static DecodeStatus DecodeLoadByte9(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeLoadByte15(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeCacheOp(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
+                                             unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+                                unsigned Insn,
+                                uint64_t Address,
+                                const void *Decoder);
+
+static DecodeStatus DecodeSynciR6(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address,
+                                   const void *Decoder);
+
+static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
+                                       unsigned Value,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
+                                  unsigned Value,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+                                              unsigned Value,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+template <unsigned Bits, int Offset>
+static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return DecodeUImmWithOffsetAndScale<Bits, Offset, 1>(Inst, Value, Address,
+                                                       Decoder);
+}
+
+template <unsigned Bits, int Offset = 0, int ScaleBy = 1>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
+/// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
+/// handle.
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                           const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                           const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                           const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                           const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                          const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                          const void *Decoder);
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+namespace llvm {
+Target &getTheMipselTarget();
+Target &getTheMipsTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+}
+
+static MCDisassembler *createMipsDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, true);
+}
+
+static MCDisassembler *createMipselDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, false);
+}
+
+extern "C" void LLVMInitializeMipsDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMipselTarget(),
+                                         createMipselDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64Target(),
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheMips64elTarget(),
+                                         createMipselDisassembler);
+}
+
+#include "MipsGenDisassemblerTables.inc"
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const MipsDisassembler *Dis = static_cast<const MipsDisassembler*>(D);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
+                                   const void *Decoder) {
+  typedef DecodeStatus (*DecodeFN)(MCInst &, unsigned, uint64_t, const void *);
+  // The size of the n field depends on the element size
+  // The register class also depends on this.
+  InsnType tmp = fieldFromInstruction(insn, 17, 5);
+  unsigned NSize = 0;
+  DecodeFN RegDecoder = nullptr;
+  if ((tmp & 0x18) == 0x00) { // INSVE_B
+    NSize = 4;
+    RegDecoder = DecodeMSA128BRegisterClass;
+  } else if ((tmp & 0x1c) == 0x10) { // INSVE_H
+    NSize = 3;
+    RegDecoder = DecodeMSA128HRegisterClass;
+  } else if ((tmp & 0x1e) == 0x18) { // INSVE_W
+    NSize = 2;
+    RegDecoder = DecodeMSA128WRegisterClass;
+  } else if ((tmp & 0x1f) == 0x1c) { // INSVE_D
+    NSize = 1;
+    RegDecoder = DecodeMSA128DRegisterClass;
+  } else
+    llvm_unreachable("Invalid encoding");
+
+  assert(NSize != 0 && RegDecoder != nullptr);
+
+  // $wd
+  tmp = fieldFromInstruction(insn, 6, 5);
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $wd_in
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $n
+  tmp = fieldFromInstruction(insn, 16, NSize);
+  MI.addOperand(MCOperand::createImm(tmp));
+  // $ws
+  tmp = fieldFromInstruction(insn, 11, 5);
+  if (RegDecoder(MI, tmp, Address, Decoder) == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+  // $n2
+  MI.addOperand(MCOperand::createImm(0));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATIMMR6(MCInst &MI, InsnType insn, uint64_t Address,
+                               const void *Decoder) {
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = fieldFromInstruction(insn, 0, 16);
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeDAHIDATI(MCInst &MI, InsnType insn, uint64_t Address,
+                               const void *Decoder) {
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Imm = fieldFromInstruction(insn, 0, 16);
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR64RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b001000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BOVC if rs >= rt
+  //      BEQZALC if rs == 0 && rt != 0
+  //      BEQC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BOVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BEQC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BEQZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodePOP35GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = 0;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BOVC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BEQC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  } else {
+    MI.setOpcode(Mips::BEQZALC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b011000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BNVC if rs >= rt
+  //      BNEZALC if rs == 0 && rt != 0
+  //      BNEC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t  Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BNVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BNEC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BNEZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodePOP37GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = 0;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BNVC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BNEC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  } else {
+    MI.setOpcode(Mips::BNEZALC_MMR6);
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodePOP65GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  // We have:
+  //    0b110101 ttttt sssss iiiiiiiiiiiiiiii
+  //      Invalid if rt == 0
+  //      BGTZC_MMR6   if rs == 0  && rt != 0
+  //      BLTZC_MMR6   if rs == rt && rt != 0
+  //      BLTC_MMR6    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BGTZC_MMR6);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BLTZC_MMR6);
+  else {
+    MI.setOpcode(Mips::BLTC_MMR6);
+    HasRs = true;
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                              Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodePOP75GroupBranchMMR6(MCInst &MI, InsnType insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  // We have:
+  //    0b111101 ttttt sssss iiiiiiiiiiiiiiii
+  //      Invalid if rt == 0
+  //      BLEZC_MMR6   if rs == 0  && rt != 0
+  //      BGEZC_MMR6   if rs == rt && rt != 0
+  //      BGEC_MMR6    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZC_MMR6);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZC_MMR6);
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEC_MMR6);
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BLEZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010110 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BLEZC   if rs == 0  && rt != 0
+  //      BGEZC   if rs == rt && rt != 0
+  //      BGEC    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZC);
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEC);
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010111 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BGTZC   if rs == 0  && rt != 0
+  //      BLTZC   if rs == rt && rt != 0
+  //      BLTC    if rs != rt && rs != 0  && rt != 0
+
+  bool HasRs = false;
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BGTZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BLTZC);
+  else {
+    MI.setOpcode(Mips::BLTC);
+    HasRs = true;
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                              Rs)));
+
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZ instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b000111 sssss ttttt iiiiiiiiiiiiiiii
+  //      BGTZ    if rt == 0
+  //      BGTZALC if rs == 0 && rt != 0
+  //      BLTZALC if rs != 0 && rs == rt
+  //      BLTUC   if rs != 0 && rs != rt
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+  bool HasRt = false;
+
+  if (Rt == 0) {
+    MI.setOpcode(Mips::BGTZ);
+    HasRs = true;
+  } else if (Rs == 0) {
+    MI.setOpcode(Mips::BGTZALC);
+    HasRt = true;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC);
+    HasRs = true;
+  } else {
+    MI.setOpcode(Mips::BLTUC);
+    HasRs = true;
+    HasRt = true;
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  if (HasRt)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BLEZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b000110 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid   if rs == 0
+  //      BLEZALC   if rs == 0  && rt != 0
+  //      BGEZALC   if rs == rt && rt != 0
+  //      BGEUC     if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  int64_t Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZALC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZALC);
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEUC);
+  }
+
+  if (HasRs)
+    MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+  MI.addOperand(MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+/// Read two bytes from the ArrayRef and return 16 bit halfword sorted
+/// according to the given endianness.
+static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian) {
+  // We want to read exactly 2 Bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  if (IsBigEndian) {
+    Insn = (Bytes[0] << 8) | Bytes[1];
+  } else {
+    Insn = (Bytes[1] << 8) | Bytes[0];
+  }
+
+  return MCDisassembler::Success;
+}
+
+/// Read four bytes from the ArrayRef and return 32 bit word sorted
+/// according to the given endianness.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian, bool IsMicroMips) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // High 16 bits of a 32-bit microMIPS instruction (where the opcode is)
+  // always precede the low 16 bits in the instruction stream (that is, they
+  // are placed at lower addresses in the instruction stream).
+  //
+  // microMIPS byte ordering:
+  //   Big-endian:    0 | 1 | 2 | 3
+  //   Little-endian: 1 | 0 | 3 | 2
+
+  if (IsBigEndian) {
+    // Encoded as a big-endian 32-bit word in the stream.
+    Insn =
+        (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
+  } else {
+    if (IsMicroMips) {
+      Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
+             (Bytes[1] << 24);
+    } else {
+      Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
+             (Bytes[3] << 24);
+    }
+  }
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &VStream,
+                                              raw_ostream &CStream) const {
+  uint32_t Insn;
+  DecodeStatus Result;
+
+  if (IsMicroMips) {
+    Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n");
+      // Calling the auto-generated decoder function for microMIPS32R6
+      // (and microMIPS64R6) 16-bit instructions.
+      Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 2;
+        return Result;
+      }
+    }
+
+    DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n");
+    // Calling the auto-generated decoder function for microMIPS 16-bit
+    // instructions.
+    Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 2;
+      return Result;
+    }
+
+    Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, true);
+    if (Result == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+
+    if (hasMips32r6()) {
+      DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n");
+      // Calling the auto-generated decoder function.
+      Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address,
+                                 this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n");
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+
+    if (hasMips32r6() && isFP64()) {
+      DEBUG(dbgs() << "Trying MicroMips32r6FP64 table (32-bit opcodes):\n");
+      Result = decodeInstruction(DecoderTableMicroMips32r6FP6432, Instr, Insn,
+                                 Address, this, STI);
+      if (Result != MCDisassembler::Fail) {
+        Size = 4;
+        return Result;
+      }
+    }
+
+    // This is an invalid instruction. Let the disassembler move forward by the
+    // minimum instruction size.
+    Size = 2;
+    return MCDisassembler::Fail;
+  }
+
+  Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
+  if (Result == MCDisassembler::Fail) {
+    Size = 4;
+    return MCDisassembler::Fail;
+  }
+
+  if (hasCOP3()) {
+    DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
+    Result =
+        decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips32r6() && isGP64()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips32r6() && isPTR64()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips32r6()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasMips2() && isPTR64()) {
+    DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (hasCnMips()) {
+    DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  if (isGP64()) {
+    DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
+  DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
+  // Calling the auto-generated decoder function.
+  Result =
+      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  Size = 4;
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+
+  return MCDisassembler::Fail;
+
+}
+
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPRMM16ZeroRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16ZeroRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPRMM16MovePRegisterClass(MCInst &Inst,
+                                                    unsigned RegNo,
+                                                    uint64_t Address,
+                                                    const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPRMM16MovePRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (static_cast<const MipsDisassembler *>(Decoder)->isGP64())
+    return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
+
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFGRCCRegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGRCCRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if (Inst.getOpcode() == Mips::SC ||
+      Inst.getOpcode() == Mips::SCD)
+    Inst.addOperand(MCOperand::createReg(Reg));
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemEVA(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn >> 7);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+   if (Inst.getOpcode() == Mips::SCE)
+     Inst.addOperand(MCOperand::createReg(Reg));
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadByte9(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn & 0x1ff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadByte15(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCacheOp(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Hint = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+  Inst.addOperand(MCOperand::createImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0xfff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Hint = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+  Inst.addOperand(MCOperand::createImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePrefeOpMM(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn & 0x1ff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Hint = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+  Inst.addOperand(MCOperand::createImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst,
+                                             unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn >> 7);
+  unsigned Hint = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+  Inst.addOperand(MCOperand::createImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn & 0x1ff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSyncI(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSynciR6(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  int Immediate = SignExtend32<16>(Insn & 0xffff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Immediate));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+  unsigned Reg = fieldFromInstruction(Insn, 6, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+
+  // The immediate field of an LD/ST instruction is scaled which means it must
+  // be multiplied (when decoding) by the size (in bytes) of the instructions'
+  // data format.
+  // .b - 1 byte
+  // .h - 2 bytes
+  // .w - 4 bytes
+  // .d - 8 bytes
+  switch(Inst.getOpcode())
+  {
+  default:
+    assert (0 && "Unexpected instruction");
+    return MCDisassembler::Fail;
+    break;
+  case Mips::LD_B:
+  case Mips::ST_B:
+    Inst.addOperand(MCOperand::createImm(Offset));
+    break;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    Inst.addOperand(MCOperand::createImm(Offset * 2));
+    break;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    Inst.addOperand(MCOperand::createImm(Offset * 4));
+    break;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    Inst.addOperand(MCOperand::createImm(Offset * 8));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm4(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  unsigned Offset = Insn & 0xf;
+  unsigned Reg = fieldFromInstruction(Insn, 7, 3);
+  unsigned Base = fieldFromInstruction(Insn, 4, 3);
+
+  switch (Inst.getOpcode()) {
+    case Mips::LBU16_MM:
+    case Mips::LHU16_MM:
+    case Mips::LW16_MM:
+      if (DecodeGPRMM16RegisterClass(Inst, Reg, Address, Decoder)
+            == MCDisassembler::Fail)
+        return MCDisassembler::Fail;
+      break;
+    case Mips::SB16_MM:
+    case Mips::SB16_MMR6:
+    case Mips::SH16_MM:
+    case Mips::SH16_MMR6:
+    case Mips::SW16_MM:
+    case Mips::SW16_MMR6:
+      if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder)
+            == MCDisassembler::Fail)
+        return MCDisassembler::Fail;
+      break;
+  }
+
+  if (DecodeGPRMM16RegisterClass(Inst, Base, Address, Decoder)
+        == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  switch (Inst.getOpcode()) {
+    case Mips::LBU16_MM:
+      if (Offset == 0xf)
+        Inst.addOperand(MCOperand::createImm(-1));
+      else
+        Inst.addOperand(MCOperand::createImm(Offset));
+      break;
+    case Mips::SB16_MM:
+    case Mips::SB16_MMR6:
+      Inst.addOperand(MCOperand::createImm(Offset));
+      break;
+    case Mips::LHU16_MM:
+    case Mips::SH16_MM:
+    case Mips::SH16_MMR6:
+      Inst.addOperand(MCOperand::createImm(Offset << 1));
+      break;
+    case Mips::LW16_MM:
+    case Mips::SW16_MM:
+    case Mips::SW16_MMR6:
+      Inst.addOperand(MCOperand::createImm(Offset << 2));
+      break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMSPImm5Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Offset = Insn & 0x1F;
+  unsigned Reg = fieldFromInstruction(Insn, 5, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Mips::SP));
+  Inst.addOperand(MCOperand::createImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMGPImm7Lsl2(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Offset = Insn & 0x7F;
+  unsigned Reg = fieldFromInstruction(Insn, 7, 3);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Mips::GP));
+  Inst.addOperand(MCOperand::createImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  int Offset;
+  switch (Inst.getOpcode()) {
+  case Mips::LWM16_MMR6:
+  case Mips::SWM16_MMR6:
+    Offset = fieldFromInstruction(Insn, 4, 4);
+    break;
+  default:
+    Offset = SignExtend32<4>(Insn & 0xf);
+    break;
+  }
+
+  if (DecodeRegListOperand16(Inst, Insn, Address, Decoder)
+      == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(Mips::SP));
+  Inst.addOperand(MCOperand::createImm(Offset << 2));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm9(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<9>(Insn & 0x1ff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if (Inst.getOpcode() == Mips::SCE_MM)
+    Inst.addOperand(MCOperand::createReg(Reg));
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0x0fff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  switch (Inst.getOpcode()) {
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    if (DecodeRegListOperand(Inst, Insn, Address, Decoder)
+        == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::createReg(Base));
+    Inst.addOperand(MCOperand::createImm(Offset));
+    break;
+  case Mips::SC_MM:
+    Inst.addOperand(MCOperand::createReg(Reg));
+    LLVM_FALLTHROUGH;
+  default:
+    Inst.addOperand(MCOperand::createReg(Reg));
+    if (Inst.getOpcode() == Mips::LWP_MM || Inst.getOpcode() == Mips::SWP_MM ||
+        Inst.getOpcode() == Mips::LWP_MMR6 || Inst.getOpcode() == Mips::SWP_MMR6)
+      Inst.addOperand(MCOperand::createReg(Reg+1));
+
+    Inst.addOperand(MCOperand::createReg(Base));
+    Inst.addOperand(MCOperand::createImm(Offset));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMem(MCInst &Inst,
+                               unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMemMMR2(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  // This function is the same as DecodeFMem but with the Reg and Base fields
+  // swapped according to microMIPS spec.
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMem2(MCInst &Inst,
+                               unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMem3(MCInst &Inst,
+                               unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::COP3RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMemCop2R6(MCInst &Inst,
+                                    unsigned Insn,
+                                    uint64_t Address,
+                                    const void *Decoder) {
+  int Offset = SignExtend32<11>(Insn & 0x07ff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMemCop2MMR6(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+  int Offset = SignExtend32<11>(Insn & 0x07ff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::createReg(Reg));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  int64_t Offset = SignExtend64<9>((Insn >> 7) & 0x1ff);
+  unsigned Rt = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Rt = getReg(Decoder, Mips::GPR32RegClassID, Rt);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  if(Inst.getOpcode() == Mips::SC_R6 || Inst.getOpcode() == Mips::SCD_R6){
+    Inst.addOperand(MCOperand::createReg(Rt));
+  }
+
+  Inst.addOperand(MCOperand::createReg(Rt));
+  Inst.addOperand(MCOperand::createReg(Base));
+  Inst.addOperand(MCOperand::createImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // Currently only hardware register 29 is supported.
+  if (RegNo != 29)
+    return  MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(Mips::HWR29));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 30 || RegNo %2)
+    return MCDisassembler::Fail;
+
+  ;
+  unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCOP0RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::COP0RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  int32_t BranchOffset = (SignExtend32<16>(Offset) * 4) + 4;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget1SImm16(MCInst &Inst,
+                                              unsigned Offset,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  int32_t BranchOffset = (SignExtend32<16>(Offset) * 2);
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
+  Inst.addOperand(MCOperand::createImm(JumpOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<21>(Offset) * 4 + 4;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget21MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<21>(Offset) * 4 + 4;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<26>(Offset) * 4 + 4;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget7MM(MCInst &Inst,
+                                          unsigned Offset,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<7>(Offset) << 1;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget10MM(MCInst &Inst,
+                                           unsigned Offset,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<10>(Offset) << 1;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<16>(Offset) * 2 + 4;
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst,
+  unsigned Offset,
+  uint64_t Address,
+  const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<26>(Offset) << 1;
+
+  Inst.addOperand(MCOperand::createImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1;
+  Inst.addOperand(MCOperand::createImm(JumpOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAddiur2Simm7(MCInst &Inst,
+                                       unsigned Value,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  if (Value == 0)
+    Inst.addOperand(MCOperand::createImm(1));
+  else if (Value == 0x7)
+    Inst.addOperand(MCOperand::createImm(-1));
+  else
+    Inst.addOperand(MCOperand::createImm(Value << 2));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLi16Imm(MCInst &Inst,
+                                  unsigned Value,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  if (Value == 0x7F)
+    Inst.addOperand(MCOperand::createImm(-1));
+  else
+    Inst.addOperand(MCOperand::createImm(Value));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst,
+                                              unsigned Value,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 8 : Value));
+  return MCDisassembler::Success;
+}
+
+template <unsigned Bits, int Offset, int Scale>
+static DecodeStatus DecodeUImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  Value &= ((1 << Bits) - 1);
+  Value *= Scale;
+  Inst.addOperand(MCOperand::createImm(Value + Offset));
+  return MCDisassembler::Success;
+}
+
+template <unsigned Bits, int Offset, int ScaleBy>
+static DecodeStatus DecodeSImmWithOffsetAndScale(MCInst &Inst, unsigned Value,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  int32_t Imm = SignExtend32<Bits>(Value) * ScaleBy;
+  Inst.addOperand(MCOperand::createImm(Imm + Offset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  // First we need to grab the pos(lsb) from MCInst.
+  int Pos = Inst.getOperand(2).getImm();
+  int Size = (int) Insn - Pos + 1;
+  Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(SignExtend32<18>(Insn) * 8));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm9SP(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  int32_t DecodedValue;
+  switch (Insn) {
+  case 0: DecodedValue = 256; break;
+  case 1: DecodedValue = 257; break;
+  case 510: DecodedValue = -258; break;
+  case 511: DecodedValue = -257; break;
+  default: DecodedValue = SignExtend32<9>(Insn); break;
+  }
+  Inst.addOperand(MCOperand::createImm(DecodedValue * 4));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeANDI16Imm(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  // Insn must be >= 0, since it is unsigned that condition is always true.
+  assert(Insn < 16);
+  int32_t DecodedValues[] = {128, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64,
+                             255, 32768, 65535};
+  Inst.addOperand(MCOperand::createImm(DecodedValues[Insn]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
+                     Mips::S6, Mips::S7, Mips::FP};
+  unsigned RegNum;
+
+  unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
+
+  // Empty register lists are not allowed.
+  if (RegLst == 0)
+    return MCDisassembler::Fail;
+
+  RegNum = RegLst & 0xf;
+
+  // RegLst values 10-15, and 26-31 are reserved.
+  if (RegNum > 9)
+    return MCDisassembler::Fail;
+
+  for (unsigned i = 0; i < RegNum; i++)
+    Inst.addOperand(MCOperand::createReg(Regs[i]));
+
+  if (RegLst & 0x10)
+    Inst.addOperand(MCOperand::createReg(Mips::RA));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3};
+  unsigned RegLst;
+  switch(Inst.getOpcode()) {
+  default:
+    RegLst = fieldFromInstruction(Insn, 4, 2);
+    break;
+  case Mips::LWM16_MMR6:
+  case Mips::SWM16_MMR6:
+    RegLst = fieldFromInstruction(Insn, 8, 2);
+    break;
+  }
+  unsigned RegNum = RegLst & 0x3;
+
+  for (unsigned i = 0; i <= RegNum; i++)
+    Inst.addOperand(MCOperand::createReg(Regs[i]));
+
+  Inst.addOperand(MCOperand::createReg(Mips::RA));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
+                                       uint64_t Address, const void *Decoder) {
+
+  unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
+
+  switch (RegPair) {
+  default:
+    return MCDisassembler::Fail;
+  case 0:
+    Inst.addOperand(MCOperand::createReg(Mips::A1));
+    Inst.addOperand(MCOperand::createReg(Mips::A2));
+    break;
+  case 1:
+    Inst.addOperand(MCOperand::createReg(Mips::A1));
+    Inst.addOperand(MCOperand::createReg(Mips::A3));
+    break;
+  case 2:
+    Inst.addOperand(MCOperand::createReg(Mips::A2));
+    Inst.addOperand(MCOperand::createReg(Mips::A3));
+    break;
+  case 3:
+    Inst.addOperand(MCOperand::createReg(Mips::A0));
+    Inst.addOperand(MCOperand::createReg(Mips::S5));
+    break;
+  case 4:
+    Inst.addOperand(MCOperand::createReg(Mips::A0));
+    Inst.addOperand(MCOperand::createReg(Mips::S6));
+    break;
+  case 5:
+    Inst.addOperand(MCOperand::createReg(Mips::A0));
+    Inst.addOperand(MCOperand::createReg(Mips::A1));
+    break;
+  case 6:
+    Inst.addOperand(MCOperand::createReg(Mips::A0));
+    Inst.addOperand(MCOperand::createReg(Mips::A2));
+    break;
+  case 7:
+    Inst.addOperand(MCOperand::createReg(Mips::A0));
+    Inst.addOperand(MCOperand::createReg(Mips::A3));
+    break;
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSimm23Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(SignExtend32<25>(Insn << 2)));
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranchMMR6(MCInst &MI, InsnType insn,
+  uint64_t Address,
+  const void *Decoder) {
+  // We have:
+  //    0b000111 ttttt sssss iiiiiiiiiiiiiiii
+  //      Invalid      if rt == 0
+  //      BGTZALC_MMR6 if rs == 0 && rt != 0
+  //      BLTZALC_MMR6 if rs != 0 && rs == rt
+  //      BLTUC_MMR6   if rs != 0 && rs != rt
+
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = 0;
+  bool HasRs = false;
+  bool HasRt = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0) {
+    MI.setOpcode(Mips::BGTZALC_MMR6);
+    HasRt = true;
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+  else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC_MMR6);
+    HasRs = true;
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+  else {
+    MI.setOpcode(Mips::BLTUC_MMR6);
+    HasRs = true;
+    HasRt = true;
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  }
+
+  if (HasRs)
+    MI.addOperand(
+    MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+
+  if (HasRt)
+    MI.addOperand(
+    MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezGroupBranchMMR6(MCInst &MI, InsnType insn,
+  uint64_t Address,
+  const void *Decoder) {
+  // We have:
+  //    0b000110 ttttt sssss iiiiiiiiiiiiiiii
+  //      Invalid        if rt == 0
+  //      BLEZALC_MMR6   if rs == 0  && rt != 0
+  //      BGEZALC_MMR6   if rs == rt && rt != 0
+  //      BGEUC_MMR6     if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rt = fieldFromInstruction(insn, 21, 5);
+  InsnType Rs = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = 0;
+  bool HasRs = false;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0) {
+    MI.setOpcode(Mips::BLEZALC_MMR6);
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+  else if (Rs == Rt) {
+    MI.setOpcode(Mips::BGEZALC_MMR6);
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 2 + 4;
+  }
+  else {
+    HasRs = true;
+    MI.setOpcode(Mips::BGEUC_MMR6);
+    Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4 + 4;
+  }
+
+  if (HasRs)
+    MI.addOperand(
+    MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rs)));
+  MI.addOperand(
+    MCOperand::createReg(getReg(Decoder, Mips::GPR32RegClassID, Rt)));
+
+  MI.addOperand(MCOperand::createImm(Imm));
+
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
new file mode 100644
index 000000000000..49c42fd1880c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -0,0 +1,294 @@
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MipsInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define PRINT_ALIAS_INSTR
+#include "MipsGenAsmWriter.inc"
+
+template<unsigned R>
+static bool isReg(const MCInst &MI, unsigned OpNo) {
+  assert(MI.getOperand(OpNo).isReg() && "Register operand expected.");
+  return MI.getOperand(OpNo).getReg() == R;
+}
+
+const char* Mips::MipsFCCToString(Mips::CondCode CC) {
+  switch (CC) {
+  case FCOND_F:
+  case FCOND_T:   return "f";
+  case FCOND_UN:
+  case FCOND_OR:  return "un";
+  case FCOND_OEQ:
+  case FCOND_UNE: return "eq";
+  case FCOND_UEQ:
+  case FCOND_ONE: return "ueq";
+  case FCOND_OLT:
+  case FCOND_UGE: return "olt";
+  case FCOND_ULT:
+  case FCOND_OGE: return "ult";
+  case FCOND_OLE:
+  case FCOND_UGT: return "ole";
+  case FCOND_ULE:
+  case FCOND_OGT: return "ule";
+  case FCOND_SF:
+  case FCOND_ST:  return "sf";
+  case FCOND_NGLE:
+  case FCOND_GLE: return "ngle";
+  case FCOND_SEQ:
+  case FCOND_SNE: return "seq";
+  case FCOND_NGL:
+  case FCOND_GL:  return "ngl";
+  case FCOND_LT:
+  case FCOND_NLT: return "lt";
+  case FCOND_NGE:
+  case FCOND_GE:  return "nge";
+  case FCOND_LE:
+  case FCOND_NLE: return "le";
+  case FCOND_NGT:
+  case FCOND_GT:  return "ngt";
+  }
+  llvm_unreachable("Impossible condition code!");
+}
+
+void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << '$' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot, const MCSubtargetInfo &STI) {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\t.set\tpush\n";
+    O << "\t.set\tmips32r2\n";
+    break;
+  case Mips::Save16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::SaveX16:
+    O << "\tsave\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  case Mips::Restore16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << " # 16 bit inst\n";
+    return;
+  case Mips::RestoreX16:
+    O << "\trestore\t";
+    printSaveRestore(MI, O);
+    O << "\n";
+    return;
+  }
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, O) && !printAlias(*MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\n\t.set\tpop";
+  }
+}
+
+void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << formatImm(Op.getImm());
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  Op.getExpr()->print(O, &MAI, true);
+}
+
+template <unsigned Bits, unsigned Offset>
+void MipsInstPrinter::printUImm(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm()) {
+    uint64_t Imm = MO.getImm();
+    Imm -= Offset;
+    Imm &= (1 << Bits) - 1;
+    Imm += Offset;
+    O << formatImm(Imm);
+    return;
+  }
+
+  printOperand(MI, opNum, O);
+}
+
+void MipsInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if instruction had reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+  case Mips::SWM16_MM:
+  case Mips::SWM16_MMR6:
+  case Mips::LWM16_MM:
+  case Mips::LWM16_MMR6:
+    opNum = MI->getNumOperands() - 2;
+    break;
+  }
+
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsInstPrinter::
+printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsInstPrinter::
+printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand& MO = MI->getOperand(opNum);
+  O << MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+void MipsInstPrinter::
+printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O) {
+  printRegName(O, MI->getOperand(opNum).getReg());
+}
+
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+  llvm_unreachable("TODO");
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo, raw_ostream &OS) {
+  OS << "\t" << Str << "\t";
+  printOperand(&MI, OpNo, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo0, unsigned OpNo1,
+                                 raw_ostream &OS) {
+  printAlias(Str, MI, OpNo0, OS);
+  OS << ", ";
+  printOperand(&MI, OpNo1, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
+  switch (MI.getOpcode()) {
+  case Mips::BEQ:
+  case Mips::BEQ_MM:
+    // beq $zero, $zero, $L2 => b $L2
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
+            printAlias("b", MI, 2, OS)) ||
+           (isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS));
+  case Mips::BEQ64:
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
+  case Mips::BNE:
+  case Mips::BNE_MM:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BNE64:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BGEZAL:
+    // bgezal $zero, $L1 => bal $L1
+    return isReg<Mips::ZERO>(MI, 0) && printAlias("bal", MI, 1, OS);
+  case Mips::BC1T:
+    // bc1t $fcc0, $L1 => bc1t $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1t", MI, 1, OS);
+  case Mips::BC1F:
+    // bc1f $fcc0, $L1 => bc1f $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1f", MI, 1, OS);
+  case Mips::JALR:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::JALR64:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::NOR:
+  case Mips::NOR_MM:
+  case Mips::NOR_MMR6:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::NOR64:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO_64>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::OR:
+    // or $r0, $r1, $zero => move $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("move", MI, 0, 1, OS);
+  default: return false;
+  }
+}
+
+void MipsInstPrinter::printSaveRestore(const MCInst *MI, raw_ostream &O) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != 0) O << ", ";
+    if (MI->getOperand(i).isReg())
+      printRegName(O, MI->getOperand(i).getReg());
+    else
+      printUImm<16>(MI, i, O);
+  }
+}
+
+void MipsInstPrinter::
+printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) {
+  // - 2 because register List is always first operand of instruction and it is
+  // always followed by memory operand (base + offset).
+  for (int i = opNum, e = MI->getNumOperands() - 2; i != e; ++i) {
+    if (i != opNum)
+      O << ", ";
+    printRegName(O, MI->getOperand(i).getReg());
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
new file mode 100644
index 000000000000..4a76b5acac79
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -0,0 +1,114 @@
+//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+// These enumeration declarations were originally in MipsInstrInfo.h but
+// had to be moved here to avoid circular dependencies between
+// LLVMMipsCodeGen and LLVMMipsAsmPrinter.
+namespace Mips {
+// Mips Branch Codes
+enum FPBranchCode {
+  BRANCH_F,
+  BRANCH_T,
+  BRANCH_FL,
+  BRANCH_TL,
+  BRANCH_INVALID
+};
+
+// Mips Condition Codes
+enum CondCode {
+  // To be used with float branch True
+  FCOND_F,
+  FCOND_UN,
+  FCOND_OEQ,
+  FCOND_UEQ,
+  FCOND_OLT,
+  FCOND_ULT,
+  FCOND_OLE,
+  FCOND_ULE,
+  FCOND_SF,
+  FCOND_NGLE,
+  FCOND_SEQ,
+  FCOND_NGL,
+  FCOND_LT,
+  FCOND_NGE,
+  FCOND_LE,
+  FCOND_NGT,
+
+  // To be used with float branch False
+  // This conditions have the same mnemonic as the
+  // above ones, but are used with a branch False;
+  FCOND_T,
+  FCOND_OR,
+  FCOND_UNE,
+  FCOND_ONE,
+  FCOND_UGE,
+  FCOND_OGE,
+  FCOND_UGT,
+  FCOND_OGT,
+  FCOND_ST,
+  FCOND_GLE,
+  FCOND_SNE,
+  FCOND_GL,
+  FCOND_NLT,
+  FCOND_GE,
+  FCOND_NLE,
+  FCOND_GT
+};
+
+const char *MipsFCCToString(Mips::CondCode CC);
+} // end namespace Mips
+
+class MipsInstPrinter : public MCInstPrinter {
+public:
+  MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  template <unsigned Bits, unsigned Offset = 0>
+  void printUImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printRegisterPair(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
+
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
+                  raw_ostream &OS);
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
+                  unsigned OpNo1, raw_ostream &OS);
+  bool printAlias(const MCInst &MI, raw_ostream &OS);
+  void printSaveRestore(const MCInst *MI, raw_ostream &O);
+  void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
new file mode 100644
index 000000000000..932d38a0b9fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -0,0 +1,69 @@
+//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIFlagsSection.h"
+
+using namespace llvm;
+
+uint8_t MipsABIFlagsSection::getFpABIValue() {
+  switch (FpABI) {
+  case FpABIKind::ANY:
+    return Mips::Val_GNU_MIPS_ABI_FP_ANY;
+  case FpABIKind::SOFT:
+    return Mips::Val_GNU_MIPS_ABI_FP_SOFT;
+  case FpABIKind::XX:
+    return Mips::Val_GNU_MIPS_ABI_FP_XX;
+  case FpABIKind::S32:
+    return Mips::Val_GNU_MIPS_ABI_FP_DOUBLE;
+  case FpABIKind::S64:
+    if (Is32BitABI)
+      return OddSPReg ? Mips::Val_GNU_MIPS_ABI_FP_64
+                      : Mips::Val_GNU_MIPS_ABI_FP_64A;
+    return Mips::Val_GNU_MIPS_ABI_FP_DOUBLE;
+  }
+
+  llvm_unreachable("unexpected fp abi value");
+}
+
+StringRef MipsABIFlagsSection::getFpABIString(FpABIKind Value) {
+  switch (Value) {
+  case FpABIKind::XX:
+    return "xx";
+  case FpABIKind::S32:
+    return "32";
+  case FpABIKind::S64:
+    return "64";
+  default:
+    llvm_unreachable("unsupported fp abi value");
+  }
+}
+
+uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
+  if (FpABI == FpABIKind::XX)
+    return (uint8_t)Mips::AFL_REG_32;
+  return (uint8_t)CPR1Size;
+}
+
+namespace llvm {
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
+  // Write out a Elf_Internal_ABIFlags_v0 struct
+  OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2);      // version
+  OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1);     // isa_level
+  OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1);  // isa_rev
+  OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1);      // gpr_size
+  OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1);     // cpr1_size
+  OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1);     // cpr2_size
+  OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1);        // fp_abi
+  OS.EmitIntValue(ABIFlagsSection.getISAExtensionValue(), 4); // isa_ext
+  OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4);       // ases
+  OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4);       // flags1
+  OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4);       // flags2
+  return OS;
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
new file mode 100644
index 000000000000..3966cae9fe33
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -0,0 +1,200 @@
+//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MipsABIFlags.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+struct MipsABIFlagsSection {
+  // Internal representation of the fp_abi related values used in .module.
+  enum class FpABIKind { ANY, XX, S32, S64, SOFT };
+
+  // Version of flags structure.
+  uint16_t Version;
+  // The level of the ISA: 1-5, 32, 64.
+  uint8_t ISALevel;
+  // The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
+  uint8_t ISARevision;
+  // The size of general purpose registers.
+  Mips::AFL_REG GPRSize;
+  // The size of co-processor 1 registers.
+  Mips::AFL_REG CPR1Size;
+  // The size of co-processor 2 registers.
+  Mips::AFL_REG CPR2Size;
+  // Processor-specific extension.
+  Mips::AFL_EXT ISAExtension;
+  // Mask of ASEs used.
+  uint32_t ASESet;
+
+  bool OddSPReg;
+
+  bool Is32BitABI;
+
+protected:
+  // The floating-point ABI.
+  FpABIKind FpABI;
+
+public:
+  MipsABIFlagsSection()
+      : Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
+        CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
+        ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
+        Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+
+  uint16_t getVersionValue() { return (uint16_t)Version; }
+  uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
+  uint8_t getISARevisionValue() { return (uint8_t)ISARevision; }
+  uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; }
+  uint8_t getCPR1SizeValue();
+  uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
+  uint8_t getFpABIValue();
+  uint32_t getISAExtensionValue() { return (uint32_t)ISAExtension; }
+  uint32_t getASESetValue() { return (uint32_t)ASESet; }
+
+  uint32_t getFlags1Value() {
+    uint32_t Value = 0;
+
+    if (OddSPReg)
+      Value |= (uint32_t)Mips::AFL_FLAGS1_ODDSPREG;
+
+    return Value;
+  }
+
+  uint32_t getFlags2Value() { return 0; }
+
+  FpABIKind getFpABI() { return FpABI; }
+  void setFpABI(FpABIKind Value, bool IsABI32Bit) {
+    FpABI = Value;
+    Is32BitABI = IsABI32Bit;
+  }
+  StringRef getFpABIString(FpABIKind Value);
+
+  template <class PredicateLibrary>
+  void setISALevelAndRevisionFromPredicates(const PredicateLibrary &P) {
+    if (P.hasMips64()) {
+      ISALevel = 64;
+      if (P.hasMips64r6())
+        ISARevision = 6;
+      else if (P.hasMips64r5())
+        ISARevision = 5;
+      else if (P.hasMips64r3())
+        ISARevision = 3;
+      else if (P.hasMips64r2())
+        ISARevision = 2;
+      else
+        ISARevision = 1;
+    } else if (P.hasMips32()) {
+      ISALevel = 32;
+      if (P.hasMips32r6())
+        ISARevision = 6;
+      else if (P.hasMips32r5())
+        ISARevision = 5;
+      else if (P.hasMips32r3())
+        ISARevision = 3;
+      else if (P.hasMips32r2())
+        ISARevision = 2;
+      else
+        ISARevision = 1;
+    } else {
+      ISARevision = 0;
+      if (P.hasMips5())
+        ISALevel = 5;
+      else if (P.hasMips4())
+        ISALevel = 4;
+      else if (P.hasMips3())
+        ISALevel = 3;
+      else if (P.hasMips2())
+        ISALevel = 2;
+      else if (P.hasMips1())
+        ISALevel = 1;
+      else
+        llvm_unreachable("Unknown ISA level!");
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setGPRSizeFromPredicates(const PredicateLibrary &P) {
+    GPRSize = P.isGP64bit() ? Mips::AFL_REG_64 : Mips::AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setCPR1SizeFromPredicates(const PredicateLibrary &P) {
+    if (P.useSoftFloat())
+      CPR1Size = Mips::AFL_REG_NONE;
+    else if (P.hasMSA())
+      CPR1Size = Mips::AFL_REG_128;
+    else
+      CPR1Size = P.isFP64bit() ? Mips::AFL_REG_64 : Mips::AFL_REG_32;
+  }
+
+  template <class PredicateLibrary>
+  void setISAExtensionFromPredicates(const PredicateLibrary &P) {
+    if (P.hasCnMips())
+      ISAExtension = Mips::AFL_EXT_OCTEON;
+    else
+      ISAExtension = Mips::AFL_EXT_NONE;
+  }
+
+  template <class PredicateLibrary>
+  void setASESetFromPredicates(const PredicateLibrary &P) {
+    ASESet = 0;
+    if (P.hasDSP())
+      ASESet |= Mips::AFL_ASE_DSP;
+    if (P.hasDSPR2())
+      ASESet |= Mips::AFL_ASE_DSPR2;
+    if (P.hasMSA())
+      ASESet |= Mips::AFL_ASE_MSA;
+    if (P.inMicroMipsMode())
+      ASESet |= Mips::AFL_ASE_MICROMIPS;
+    if (P.inMips16Mode())
+      ASESet |= Mips::AFL_ASE_MIPS16;
+  }
+
+  template <class PredicateLibrary>
+  void setFpAbiFromPredicates(const PredicateLibrary &P) {
+    Is32BitABI = P.isABI_O32();
+
+    FpABI = FpABIKind::ANY;
+    if (P.useSoftFloat())
+      FpABI = FpABIKind::SOFT;
+    else if (P.isABI_N32() || P.isABI_N64())
+      FpABI = FpABIKind::S64;
+    else if (P.isABI_O32()) {
+      if (P.isABI_FPXX())
+        FpABI = FpABIKind::XX;
+      else if (P.isFP64bit())
+        FpABI = FpABIKind::S64;
+      else
+        FpABI = FpABIKind::S32;
+    }
+  }
+
+  template <class PredicateLibrary>
+  void setAllFromPredicates(const PredicateLibrary &P) {
+    setISALevelAndRevisionFromPredicates(P);
+    setGPRSizeFromPredicates(P);
+    setCPR1SizeFromPredicates(P);
+    setISAExtensionFromPredicates(P);
+    setASESetFromPredicates(P);
+    setFpAbiFromPredicates(P);
+    OddSPReg = P.useOddSPReg();
+  }
+};
+
+MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
new file mode 100644
index 000000000000..498ea6fda4b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -0,0 +1,119 @@
+//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIInfo.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCTargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
+
+static const MCPhysReg Mips64IntRegs[8] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+    Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
+}
+
+ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
+  if (IsO32())
+    return CC != CallingConv::Fast ? 16 : 0;
+  if (IsN32() || IsN64())
+    return 0;
+  llvm_unreachable("Unhandled ABI");
+}
+
+MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  if (Options.getABIName().startswith("o32"))
+    return MipsABIInfo::O32();
+  if (Options.getABIName().startswith("n32"))
+    return MipsABIInfo::N32();
+  if (Options.getABIName().startswith("n64"))
+    return MipsABIInfo::N64();
+  assert(Options.getABIName().empty() && "Unknown ABI option for MIPS");
+
+  if (TT.getArch() == Triple::mips64 || TT.getArch() == Triple::mips64el)
+    return MipsABIInfo::N64();
+  return MipsABIInfo::O32();
+}
+
+unsigned MipsABIInfo::GetStackPtr() const {
+  return ArePtrs64bit() ? Mips::SP_64 : Mips::SP;
+}
+
+unsigned MipsABIInfo::GetFramePtr() const {
+  return ArePtrs64bit() ? Mips::FP_64 : Mips::FP;
+}
+
+unsigned MipsABIInfo::GetBasePtr() const {
+  return ArePtrs64bit() ? Mips::S7_64 : Mips::S7;
+}
+
+unsigned MipsABIInfo::GetGlobalPtr() const {
+  return ArePtrs64bit() ? Mips::GP_64 : Mips::GP;
+}
+
+unsigned MipsABIInfo::GetNullPtr() const {
+  return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO;
+}
+
+unsigned MipsABIInfo::GetZeroReg() const {
+  return AreGprs64bit() ? Mips::ZERO_64 : Mips::ZERO;
+}
+
+unsigned MipsABIInfo::GetPtrAdduOp() const {
+  return ArePtrs64bit() ? Mips::DADDu : Mips::ADDu;
+}
+
+unsigned MipsABIInfo::GetPtrAddiuOp() const {
+  return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu;
+}
+
+unsigned MipsABIInfo::GetPtrSubuOp() const {
+  return ArePtrs64bit() ? Mips::DSUBu : Mips::SUBu;
+}
+
+unsigned MipsABIInfo::GetPtrAndOp() const {
+  return ArePtrs64bit() ? Mips::AND64 : Mips::AND;
+}
+
+unsigned MipsABIInfo::GetGPRMoveOp() const {
+  return ArePtrs64bit() ? Mips::OR64 : Mips::OR;
+}
+
+unsigned MipsABIInfo::GetEhDataReg(unsigned I) const {
+  static const unsigned EhDataReg[] = {
+    Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  static const unsigned EhDataReg64[] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64
+  };
+
+  return IsN64() ? EhDataReg64[I] : EhDataReg[I];
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
new file mode 100644
index 000000000000..9372a3c2bb1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -0,0 +1,82 @@
+//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIINFO_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+template <typename T> class ArrayRef;
+class MCTargetOptions;
+class StringRef;
+class TargetRegisterClass;
+
+class MipsABIInfo {
+public:
+  enum class ABI { Unknown, O32, N32, N64 };
+
+protected:
+  ABI ThisABI;
+
+public:
+  MipsABIInfo(ABI ThisABI) : ThisABI(ThisABI) {}
+
+  static MipsABIInfo Unknown() { return MipsABIInfo(ABI::Unknown); }
+  static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
+  static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
+  static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
+  static MipsABIInfo computeTargetABI(const Triple &TT, StringRef CPU,
+                                      const MCTargetOptions &Options);
+
+  bool IsKnown() const { return ThisABI != ABI::Unknown; }
+  bool IsO32() const { return ThisABI == ABI::O32; }
+  bool IsN32() const { return ThisABI == ABI::N32; }
+  bool IsN64() const { return ThisABI == ABI::N64; }
+  ABI GetEnumValue() const { return ThisABI; }
+
+  /// The registers to use for byval arguments.
+  ArrayRef<MCPhysReg> GetByValArgRegs() const;
+
+  /// The registers to use for the variable argument list.
+  ArrayRef<MCPhysReg> GetVarArgRegs() const;
+
+  /// Obtain the size of the area allocated by the callee for arguments.
+  /// CallingConv::FastCall affects the value for O32.
+  unsigned GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const;
+
+  /// Ordering of ABI's
+  /// MipsGenSubtargetInfo.inc will use this to resolve conflicts when given
+  /// multiple ABI options.
+  bool operator<(const MipsABIInfo Other) const {
+    return ThisABI < Other.GetEnumValue();
+  }
+
+  unsigned GetStackPtr() const;
+  unsigned GetFramePtr() const;
+  unsigned GetBasePtr() const;
+  unsigned GetGlobalPtr() const;
+  unsigned GetNullPtr() const;
+  unsigned GetZeroReg() const;
+  unsigned GetPtrAdduOp() const;
+  unsigned GetPtrAddiuOp() const;
+  unsigned GetPtrSubuOp() const;
+  unsigned GetPtrAndOp() const;
+  unsigned GetGPRMoveOp() const;
+  inline bool ArePtrs64bit() const { return IsN64(); }
+  inline bool AreGprs64bit() const { return IsN32() || IsN64(); }
+
+  unsigned GetEhDataReg(unsigned I) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 000000000000..38b11f78e36d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,522 @@
+//===-- MipsAsmBackend.cpp - Mips Asm Backend  ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsAsmBackend.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space for it
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx = nullptr) {
+
+  unsigned Kind = Fixup.getKind();
+
+  // Add/subtract and shift
+  switch (Kind) {
+  default:
+    return 0;
+  case FK_Data_2:
+  case Mips::fixup_Mips_LO16:
+  case Mips::fixup_Mips_GPREL16:
+  case Mips::fixup_Mips_GPOFF_HI:
+  case Mips::fixup_Mips_GPOFF_LO:
+  case Mips::fixup_Mips_GOT_PAGE:
+  case Mips::fixup_Mips_GOT_OFST:
+  case Mips::fixup_Mips_GOT_DISP:
+  case Mips::fixup_Mips_GOT_LO16:
+  case Mips::fixup_Mips_CALL_LO16:
+  case Mips::fixup_MICROMIPS_LO16:
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+  case Mips::fixup_MIPS_PCLO16:
+    Value &= 0xffff;
+    break;
+  case FK_DTPRel_4:
+  case FK_DTPRel_8:
+  case FK_TPRel_4:
+  case FK_TPRel_8:
+  case FK_GPRel_4:
+  case FK_Data_4:
+  case FK_Data_8:
+  case Mips::fixup_Mips_SUB:
+  case Mips::fixup_MICROMIPS_SUB:
+    break;
+  case Mips::fixup_Mips_PC16:
+    // The displacement is then divided by 4 to give us an 18 bit
+    // address range. Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isInt<16>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC19_S2:
+  case Mips::fixup_MICROMIPS_PC19_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 4;
+    // We now check if Value can be encoded as a 19-bit signed immediate.
+    if (!isInt<19>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_Mips_26:
+    // So far we are only using this type for jumps.
+    // The displacement is then divided by 4 to give us an 28 bit
+    // address range.
+    Value >>= 2;
+    break;
+  case Mips::fixup_Mips_HI16:
+  case Mips::fixup_Mips_GOT:
+  case Mips::fixup_MICROMIPS_GOT16:
+  case Mips::fixup_Mips_GOT_HI16:
+  case Mips::fixup_Mips_CALL_HI16:
+  case Mips::fixup_MICROMIPS_HI16:
+  case Mips::fixup_MIPS_PCHI16:
+    // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
+    Value = ((Value + 0x8000) >> 16) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    // Get the 3rd 16-bits.
+    Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    // Get the 4th 16-bits.
+    Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
+    break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Value >>= 1;
+    break;
+  case Mips::fixup_MICROMIPS_PC7_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 7-bit signed immediate.
+    if (!isInt<7>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    Value -= 2;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 2;
+    // We now check if Value can be encoded as a 10-bit signed immediate.
+    if (!isInt<10>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 16-bit signed immediate.
+    if (!isInt<16>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC18_S3:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as a 18-bit signed immediate.
+    if (!isInt<18>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC18_S3:
+    // Check alignment.
+    if ((Value & 7) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+    }
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 8;
+    // We now check if Value can be encoded as a 18-bit signed immediate.
+    if (!isInt<18>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC21_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isInt<21>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isInt<26>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC26_S1:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isInt<26>(Value) && Ctx) {
+      Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+      return 0;
+    }
+    break;
+  case Mips::fixup_MICROMIPS_PC21_S1:
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t)Value / 2;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isInt<21>(Value) && Ctx) {
+      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+      return 0;
+    }
+    break;
+  }
+
+  return Value;
+}
+
+MCObjectWriter *
+MipsAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createMipsELFObjectWriter(OS,
+    MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
+}
+
+// Little-endian fixup data byte ordering:
+//   mips32r2:   a | b | x | x
+//   microMIPS:  x | x | a | b
+
+static bool needsMMLEByteOrder(unsigned Kind) {
+  return Kind != Mips::fixup_MICROMIPS_PC10_S1 &&
+         Kind >= Mips::fixup_MICROMIPS_26_S1 &&
+         Kind < Mips::LastTargetFixupKind;
+}
+
+// Calculate index for microMIPS specific little endian byte order
+static unsigned calculateMMLEIndex(unsigned i) {
+  assert(i <= 3 && "Index out of range!");
+
+  return (1 - i / 2) * 2 + i % 2;
+}
+
+/// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
+/// data fragment, at the offset specified by the fixup and following the
+/// fixup kind as appropriate.
+void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                unsigned DataSize, uint64_t Value,
+                                bool IsPCRel) const {
+  MCFixupKind Kind = Fixup.getKind();
+  Value = adjustFixupValue(Fixup, Value);
+
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  // Where do we start in the object
+  unsigned Offset = Fixup.getOffset();
+  // Number of bytes we need to fixup
+  unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+  // Used to point to big endian bytes
+  unsigned FullSize;
+
+  switch ((unsigned)Kind) {
+  case FK_Data_2:
+  case Mips::fixup_Mips_16:
+  case Mips::fixup_MICROMIPS_PC10_S1:
+    FullSize = 2;
+    break;
+  case FK_Data_8:
+  case Mips::fixup_Mips_64:
+    FullSize = 8;
+    break;
+  case FK_Data_4:
+  default:
+    FullSize = 4;
+    break;
+  }
+
+  // Grab current value, if any, from bits.
+  uint64_t CurVal = 0;
+
+  bool microMipsLEByteOrder = needsMMLEByteOrder((unsigned) Kind);
+
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+                                                    : i)
+                            : (FullSize - 1 - i);
+    CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
+  }
+
+  uint64_t Mask = ((uint64_t)(-1) >>
+                    (64 - getFixupKindInfo(Kind).TargetSize));
+  CurVal |= Value & Mask;
+
+  // Write out the fixed up bytes back to the code/data bits.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    unsigned Idx = IsLittle ? (microMipsLEByteOrder ? calculateMMLEIndex(i)
+                                                    : i)
+                            : (FullSize - 1 - i);
+    Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
+  }
+}
+
+Optional<MCFixupKind> MipsAsmBackend::getFixupKind(StringRef Name) const {
+  return StringSwitch<Optional<MCFixupKind>>(Name)
+      .Case("R_MIPS_NONE", (MCFixupKind)Mips::fixup_Mips_NONE)
+      .Case("R_MIPS_32", FK_Data_4)
+      .Default(MCAsmBackend::getFixupKind(Name));
+}
+
+const MCFixupKindInfo &MipsAsmBackend::
+getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
+    // This table *must* be in same the order of fixup_* kinds in
+    // MipsFixupKinds.h.
+    //
+    // name                    offset  bits  flags
+    { "fixup_Mips_NONE",         0,      0,   0 },
+    { "fixup_Mips_16",           0,     16,   0 },
+    { "fixup_Mips_32",           0,     32,   0 },
+    { "fixup_Mips_REL32",        0,     32,   0 },
+    { "fixup_Mips_26",           0,     26,   0 },
+    { "fixup_Mips_HI16",         0,     16,   0 },
+    { "fixup_Mips_LO16",         0,     16,   0 },
+    { "fixup_Mips_GPREL16",      0,     16,   0 },
+    { "fixup_Mips_LITERAL",      0,     16,   0 },
+    { "fixup_Mips_GOT",          0,     16,   0 },
+    { "fixup_Mips_PC16",         0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_CALL16",       0,     16,   0 },
+    { "fixup_Mips_GPREL32",      0,     32,   0 },
+    { "fixup_Mips_SHIFT5",       6,      5,   0 },
+    { "fixup_Mips_SHIFT6",       6,      5,   0 },
+    { "fixup_Mips_64",           0,     64,   0 },
+    { "fixup_Mips_TLSGD",        0,     16,   0 },
+    { "fixup_Mips_GOTTPREL",     0,     16,   0 },
+    { "fixup_Mips_TPREL_HI",     0,     16,   0 },
+    { "fixup_Mips_TPREL_LO",     0,     16,   0 },
+    { "fixup_Mips_TLSLDM",       0,     16,   0 },
+    { "fixup_Mips_DTPREL_HI",    0,     16,   0 },
+    { "fixup_Mips_DTPREL_LO",    0,     16,   0 },
+    { "fixup_Mips_Branch_PCRel", 0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_GPOFF_HI",     0,     16,   0 },
+    { "fixup_Mips_GPOFF_LO",     0,     16,   0 },
+    { "fixup_Mips_GOT_PAGE",     0,     16,   0 },
+    { "fixup_Mips_GOT_OFST",     0,     16,   0 },
+    { "fixup_Mips_GOT_DISP",     0,     16,   0 },
+    { "fixup_Mips_HIGHER",       0,     16,   0 },
+    { "fixup_Mips_HIGHEST",      0,     16,   0 },
+    { "fixup_Mips_GOT_HI16",     0,     16,   0 },
+    { "fixup_Mips_GOT_LO16",     0,     16,   0 },
+    { "fixup_Mips_CALL_HI16",    0,     16,   0 },
+    { "fixup_Mips_CALL_LO16",    0,     16,   0 },
+    { "fixup_Mips_PC18_S3",      0,     18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",      0,     19,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC21_S2",      0,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      0,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_26_S1",   0,     26,   0 },
+    { "fixup_MICROMIPS_HI16",    0,     16,   0 },
+    { "fixup_MICROMIPS_LO16",    0,     16,   0 },
+    { "fixup_MICROMIPS_GOT16",   0,     16,   0 },
+    { "fixup_MICROMIPS_PC7_S1",  0,      7,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC10_S1", 0,     10,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC16_S1", 0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC26_S1", 0,     26,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC19_S2", 0,     19,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC18_S3", 0,     18,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC21_S1", 0,     21,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_CALL16",  0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_DISP",        0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_PAGE",        0,     16,   0 },
+    { "fixup_MICROMIPS_GOT_OFST",        0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_GD",          0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_LDM",         0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_HI16",  0,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 },
+    { "fixup_Mips_SUB",                  0,     64,   0 },
+    { "fixup_MICROMIPS_SUB",             0,     64,   0 }
+  };
+
+  const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+    // This table *must* be in same the order of fixup_* kinds in
+    // MipsFixupKinds.h.
+    //
+    // name                    offset  bits  flags
+    { "fixup_Mips_NONE",         0,      0,   0 },
+    { "fixup_Mips_16",          16,     16,   0 },
+    { "fixup_Mips_32",           0,     32,   0 },
+    { "fixup_Mips_REL32",        0,     32,   0 },
+    { "fixup_Mips_26",           6,     26,   0 },
+    { "fixup_Mips_HI16",        16,     16,   0 },
+    { "fixup_Mips_LO16",        16,     16,   0 },
+    { "fixup_Mips_GPREL16",     16,     16,   0 },
+    { "fixup_Mips_LITERAL",     16,     16,   0 },
+    { "fixup_Mips_GOT",         16,     16,   0 },
+    { "fixup_Mips_PC16",        16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_CALL16",      16,     16,   0 },
+    { "fixup_Mips_GPREL32",      0,     32,   0 },
+    { "fixup_Mips_SHIFT5",      21,      5,   0 },
+    { "fixup_Mips_SHIFT6",      21,      5,   0 },
+    { "fixup_Mips_64",           0,     64,   0 },
+    { "fixup_Mips_TLSGD",       16,     16,   0 },
+    { "fixup_Mips_GOTTPREL",    16,     16,   0 },
+    { "fixup_Mips_TPREL_HI",    16,     16,   0 },
+    { "fixup_Mips_TPREL_LO",    16,     16,   0 },
+    { "fixup_Mips_TLSLDM",      16,     16,   0 },
+    { "fixup_Mips_DTPREL_HI",   16,     16,   0 },
+    { "fixup_Mips_DTPREL_LO",   16,     16,   0 },
+    { "fixup_Mips_Branch_PCRel",16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_GPOFF_HI",    16,     16,   0 },
+    { "fixup_Mips_GPOFF_LO",    16,     16,   0 },
+    { "fixup_Mips_GOT_PAGE",    16,     16,   0 },
+    { "fixup_Mips_GOT_OFST",    16,     16,   0 },
+    { "fixup_Mips_GOT_DISP",    16,     16,   0 },
+    { "fixup_Mips_HIGHER",      16,     16,   0 },
+    { "fixup_Mips_HIGHEST",     16,     16,   0 },
+    { "fixup_Mips_GOT_HI16",    16,     16,   0 },
+    { "fixup_Mips_GOT_LO16",    16,     16,   0 },
+    { "fixup_Mips_CALL_HI16",   16,     16,   0 },
+    { "fixup_Mips_CALL_LO16",   16,     16,   0 },
+    { "fixup_Mips_PC18_S3",     14,     18,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC19_S2",     13,     19,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC21_S2",     11,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      6,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_26_S1",   6,     26,   0 },
+    { "fixup_MICROMIPS_HI16",   16,     16,   0 },
+    { "fixup_MICROMIPS_LO16",   16,     16,   0 },
+    { "fixup_MICROMIPS_GOT16",  16,     16,   0 },
+    { "fixup_MICROMIPS_PC7_S1",  9,      7,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC10_S1", 6,     10,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC16_S1",16,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC26_S1", 6,     26,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC19_S2",13,     19,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC18_S3",14,     18,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_PC21_S1",11,     21,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_CALL16", 16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_DISP",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_PAGE",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_OFST",        16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_GD",          16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_LDM",         16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_HI16",  16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_LO16",  16,     16,   0 },
+    { "fixup_Mips_SUB",                   0,     64,   0 },
+    { "fixup_MICROMIPS_SUB",              0,     64,   0 }
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+          "Invalid kind!");
+
+  if (IsLittle)
+    return LittleEndianInfos[Kind - FirstTargetFixupKind];
+  return BigEndianInfos[Kind - FirstTargetFixupKind];
+}
+
+/// WriteNopData - Write an (optimal) nop sequence of Count bytes
+/// to the given output. If the target cannot generate such a sequence,
+/// it should return an error.
+///
+/// \return - True on success.
+bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // Check for a less than instruction size number of bytes
+  // FIXME: 16 bit instructions are not handled yet here.
+  // We shouldn't be using a hard coded number for instruction size.
+
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  OW->WriteZeros(Count);
+  return true;
+}
+
+/// processFixupValue - Target hook to process the literal value of a fixup
+/// if necessary.
+void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                       const MCFixup &Fixup,
+                                       const MCFragment *DF,
+                                       const MCValue &Target,
+                                       uint64_t &Value,
+                                       bool &IsResolved) {
+  // At this point we'll ignore the value returned by adjustFixupValue as
+  // we are only checking if the fixup can be applied correctly. We have
+  // access to MCContext from here which allows us to report a fatal error
+  // with *possibly* a source code location.
+  // The caller will also ignore any changes we make to Value
+  // (recordRelocation() overwrites it with it's own calculation).
+  (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+// MCAsmBackend
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ true,
+                            /*Is64Bit*/ false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ false,
+                            /*Is64Bit*/ false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ true, /*Is64Bit*/ true);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             const Triple &TT, StringRef CPU,
+                                             const MCTargetOptions &Options) {
+  return new MipsAsmBackend(T, TT.getOS(), /*IsLittle*/ false,
+                            /*Is64Bit*/ true);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
new file mode 100644
index 000000000000..f260cfa566c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -0,0 +1,94 @@
+//===-- MipsAsmBackend.h - Mips Asm Backend  ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsAsmBackend class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
+
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmBackend.h"
+
+namespace llvm {
+
+class MCAssembler;
+struct MCFixupKindInfo;
+class Target;
+class MCObjectWriter;
+
+class MipsAsmBackend : public MCAsmBackend {
+  Triple::OSType OSType;
+  bool IsLittle; // Big or little endian
+  bool Is64Bit;  // 32 or 64 bit words
+
+public:
+  MipsAsmBackend(const Target &T, Triple::OSType OSType, bool IsLittle,
+                 bool Is64Bit)
+      : MCAsmBackend(), OSType(OSType), IsLittle(IsLittle), Is64Bit(Is64Bit) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  unsigned getNumFixupKinds() const override {
+    return Mips::NumTargetFixupKinds;
+  }
+
+  /// @name Target Relaxation Interfaces
+  /// @{
+
+  /// MayNeedRelaxation - Check whether the given instruction may need
+  /// relaxation.
+  ///
+  /// \param Inst - The instruction to test.
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    return false;
+  }
+
+  /// fixupNeedsRelaxation - Target specific predicate for whether a given
+  /// fixup requires the associated instruction to be relaxed.
+   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                             const MCRelaxableFragment *DF,
+                             const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("RelaxInstruction() unimplemented");
+    return false;
+  }
+
+  /// RelaxInstruction - Relax the instruction in the given fragment
+  /// to the next wider instruction.
+  ///
+  /// \param Inst - The instruction to relax, which may be the same
+  /// as the output.
+  /// \param [out] Res On return, the relaxed instruction.
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  /// @}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+}; // class MipsAsmBackend
+
+} // namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 000000000000..35de7b27bf10
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,132 @@
+//===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
+
+#include "MipsFixupKinds.h"
+#include "MipsMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // Mips Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at
+    /// which the address of a call site relocation entry symbol resides
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used
+    /// for the relocatable object file being produced.
+    MO_GPREL,
+
+    /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+    /// address.
+    MO_ABS_HI,
+    MO_ABS_LO,
+
+    /// MO_TLSGD - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (General
+    // Dynamic TLS).
+    MO_TLSGD,
+
+    /// MO_TLSLDM - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (Local
+    // Dynamic TLS).
+    MO_TLSLDM,
+    MO_DTPREL_HI,
+    MO_DTPREL_LO,
+
+    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+    // Exec TLS).
+    MO_GOTTPREL,
+
+    /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+    // the thread pointer (Local Exec TLS).
+    MO_TPREL_HI,
+    MO_TPREL_LO,
+
+    // N32/64 Flags.
+    MO_GPOFF_HI,
+    MO_GPOFF_LO,
+    MO_GOT_DISP,
+    MO_GOT_PAGE,
+    MO_GOT_OFST,
+
+    /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a
+    /// 64-bit symbol address.
+    MO_HIGHER,
+    MO_HIGHEST,
+
+    /// MO_GOT_HI16/LO16, MO_CALL_HI16/LO16 - Relocations used for large GOTs.
+    MO_GOT_HI16,
+    MO_GOT_LO16,
+    MO_CALL_HI16,
+    MO_CALL_LO16
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for
+    // Mips instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo   = 0,
+
+    /// FrmR - This form is for instructions of the format R.
+    FrmR  = 1,
+    /// FrmI - This form is for instructions of the format I.
+    FrmI  = 2,
+    /// FrmJ - This form is for instructions of the format J.
+    FrmJ  = 3,
+    /// FrmFR - This form is for instructions of the format FR.
+    FrmFR = 4,
+    /// FrmFI - This form is for instructions of the format FI.
+    FrmFI = 5,
+    /// FrmOther - This form is for instructions that have no specific format.
+    FrmOther = 6,
+
+    FormMask = 15,
+    /// IsCTI - Instruction is a Control Transfer Instruction.
+    IsCTI = 1 << 4,
+    /// HasForbiddenSlot - Instruction has a forbidden slot.
+    HasForbiddenSlot = 1 << 5,
+    /// IsPCRelativeLoad - A Load instruction with implicit source register
+    ///                    ($pc) with explicit offset and destination register
+    IsPCRelativeLoad = 1 << 6
+
+  };
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
new file mode 100644
index 000000000000..b2efd726da53
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -0,0 +1,662 @@
+//===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <list>
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "mips-elf-object-writer"
+
+using namespace llvm;
+
+namespace {
+/// Holds additional information needed by the relocation ordering algorithm.
+struct MipsRelocationEntry {
+  const ELFRelocationEntry R; ///< The relocation.
+  bool Matched;               ///< Is this relocation part of a match.
+
+  MipsRelocationEntry(const ELFRelocationEntry &R) : R(R), Matched(false) {}
+
+  void print(raw_ostream &Out) const {
+    R.print(Out);
+    Out << ", Matched=" << Matched;
+  }
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const MipsRelocationEntry &RHS) {
+  RHS.print(OS);
+  return OS;
+}
+#endif
+
+class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64,
+                      bool IsLittleEndian);
+
+  ~MipsELFObjectWriter() override;
+
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+  bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                               unsigned Type) const override;
+  virtual void sortRelocs(const MCAssembler &Asm,
+                          std::vector<ELFRelocationEntry> &Relocs) override;
+};
+
+/// Copy elements in the range [First, Last) to d1 when the predicate is true or
+/// d2 when the predicate is false. This is essentially both std::copy_if and
+/// std::remove_copy_if combined into a single pass.
+template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
+std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
+                                             OutputIt1 d1, OutputIt2 d2,
+                                             UnaryPredicate Predicate) {
+  for (InputIt I = First; I != Last; ++I) {
+    if (Predicate(*I)) {
+      *d1 = *I;
+      d1++;
+    } else {
+      *d2 = *I;
+      d2++;
+    }
+  }
+
+  return std::make_pair(d1, d2);
+}
+
+/// The possible results of the Predicate function used by find_best.
+enum FindBestPredicateResult {
+  FindBest_NoMatch = 0,  ///< The current element is not a match.
+  FindBest_Match,        ///< The current element is a match but better ones are
+                         ///  possible.
+  FindBest_PerfectMatch, ///< The current element is an unbeatable match.
+};
+
+/// Find the best match in the range [First, Last).
+///
+/// An element matches when Predicate(X) returns FindBest_Match or
+/// FindBest_PerfectMatch. A value of FindBest_PerfectMatch also terminates
+/// the search. BetterThan(A, B) is a comparator that returns true when A is a
+/// better match than B. The return value is the position of the best match.
+///
+/// This is similar to std::find_if but finds the best of multiple possible
+/// matches.
+template <class InputIt, class UnaryPredicate, class Comparator>
+InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
+                  Comparator BetterThan) {
+  InputIt Best = Last;
+
+  for (InputIt I = First; I != Last; ++I) {
+    unsigned Matched = Predicate(*I);
+    if (Matched != FindBest_NoMatch) {
+      DEBUG(dbgs() << std::distance(First, I) << " is a match (";
+            I->print(dbgs()); dbgs() << ")\n");
+      if (Best == Last || BetterThan(*I, *Best)) {
+        DEBUG(dbgs() << ".. and it beats the last one\n");
+        Best = I;
+      }
+    }
+    if (Matched == FindBest_PerfectMatch) {
+      DEBUG(dbgs() << ".. and it is unbeatable\n");
+      break;
+    }
+  }
+
+  return Best;
+}
+
+/// Determine the low relocation that matches the given relocation.
+/// If the relocation does not need a low relocation then the return value
+/// is ELF::R_MIPS_NONE.
+///
+/// The relocations that need a matching low part are
+/// R_(MIPS|MICROMIPS|MIPS16)_HI16 for all symbols and
+/// R_(MIPS|MICROMIPS|MIPS16)_GOT16 for local symbols only.
+static unsigned getMatchingLoType(const ELFRelocationEntry &Reloc) {
+  unsigned Type = Reloc.Type;
+  if (Type == ELF::R_MIPS_HI16)
+    return ELF::R_MIPS_LO16;
+  if (Type == ELF::R_MICROMIPS_HI16)
+    return ELF::R_MICROMIPS_LO16;
+  if (Type == ELF::R_MIPS16_HI16)
+    return ELF::R_MIPS16_LO16;
+
+  if (Reloc.OriginalSymbol->getBinding() != ELF::STB_LOCAL)
+    return ELF::R_MIPS_NONE;
+
+  if (Type == ELF::R_MIPS_GOT16)
+    return ELF::R_MIPS_LO16;
+  if (Type == ELF::R_MICROMIPS_GOT16)
+    return ELF::R_MICROMIPS_LO16;
+  if (Type == ELF::R_MIPS16_GOT16)
+    return ELF::R_MIPS16_LO16;
+
+  return ELF::R_MIPS_NONE;
+}
+
+/// Determine whether a relocation (X) matches the one given in R.
+///
+/// A relocation matches if:
+/// - It's type matches that of a corresponding low part. This is provided in
+///   MatchingType for efficiency.
+/// - It's based on the same symbol.
+/// - It's offset of greater or equal to that of the one given in R.
+///   It should be noted that this rule assumes the programmer does not use
+///   offsets that exceed the alignment of the symbol. The carry-bit will be
+///   incorrect if this is not true.
+///
+/// A matching relocation is unbeatable if:
+/// - It is not already involved in a match.
+/// - It's offset is exactly that of the one given in R.
+static FindBestPredicateResult isMatchingReloc(const MipsRelocationEntry &X,
+                                               const ELFRelocationEntry &R,
+                                               unsigned MatchingType) {
+  if (X.R.Type == MatchingType && X.R.OriginalSymbol == R.OriginalSymbol) {
+    if (!X.Matched &&
+        X.R.OriginalAddend == R.OriginalAddend)
+      return FindBest_PerfectMatch;
+    else if (X.R.OriginalAddend >= R.OriginalAddend)
+      return FindBest_Match;
+  }
+  return FindBest_NoMatch;
+}
+
+/// Determine whether Candidate or PreviousBest is the better match.
+/// The return value is true if Candidate is the better match.
+///
+/// A matching relocation is a better match if:
+/// - It has a smaller addend.
+/// - It is not already involved in a match.
+static bool compareMatchingRelocs(const MipsRelocationEntry &Candidate,
+                                  const MipsRelocationEntry &PreviousBest) {
+  if (Candidate.R.OriginalAddend != PreviousBest.R.OriginalAddend)
+    return Candidate.R.OriginalAddend < PreviousBest.R.OriginalAddend;
+  return PreviousBest.Matched && !Candidate.Matched;
+}
+
+#ifndef NDEBUG
+/// Print all the relocations.
+template <class Container>
+static void dumpRelocs(const char *Prefix, const Container &Relocs) {
+  for (const auto &R : Relocs)
+    dbgs() << Prefix << R << "\n";
+}
+#endif
+
+} // end anonymous namespace
+
+MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                                         bool _isN64, bool IsLittleEndian)
+    : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
+                              /*HasRelocationAddend*/ _isN64,
+                              /*IsN64*/ _isN64) {}
+
+MipsELFObjectWriter::~MipsELFObjectWriter() {}
+
+unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
+                                           const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+  // Determine the type of the relocation.
+  unsigned Kind = (unsigned)Fixup.getKind();
+
+  switch (Kind) {
+  case Mips::fixup_Mips_NONE:
+    return ELF::R_MIPS_NONE;
+  case Mips::fixup_Mips_16:
+  case FK_Data_2:
+    return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16;
+  case Mips::fixup_Mips_32:
+  case FK_Data_4:
+    return IsPCRel ? ELF::R_MIPS_PC32 : ELF::R_MIPS_32;
+  }
+
+  if (IsPCRel) {
+    switch (Kind) {
+    case Mips::fixup_Mips_Branch_PCRel:
+    case Mips::fixup_Mips_PC16:
+      return ELF::R_MIPS_PC16;
+    case Mips::fixup_MICROMIPS_PC7_S1:
+      return ELF::R_MICROMIPS_PC7_S1;
+    case Mips::fixup_MICROMIPS_PC10_S1:
+      return ELF::R_MICROMIPS_PC10_S1;
+    case Mips::fixup_MICROMIPS_PC16_S1:
+      return ELF::R_MICROMIPS_PC16_S1;
+    case Mips::fixup_MICROMIPS_PC26_S1:
+      return ELF::R_MICROMIPS_PC26_S1;
+    case Mips::fixup_MICROMIPS_PC19_S2:
+      return ELF::R_MICROMIPS_PC19_S2;
+    case Mips::fixup_MICROMIPS_PC18_S3:
+      return ELF::R_MICROMIPS_PC18_S3;
+    case Mips::fixup_MICROMIPS_PC21_S1:
+      return ELF::R_MICROMIPS_PC21_S1;
+    case Mips::fixup_MIPS_PC19_S2:
+      return ELF::R_MIPS_PC19_S2;
+    case Mips::fixup_MIPS_PC18_S3:
+      return ELF::R_MIPS_PC18_S3;
+    case Mips::fixup_MIPS_PC21_S2:
+      return ELF::R_MIPS_PC21_S2;
+    case Mips::fixup_MIPS_PC26_S2:
+      return ELF::R_MIPS_PC26_S2;
+    case Mips::fixup_MIPS_PCHI16:
+      return ELF::R_MIPS_PCHI16;
+    case Mips::fixup_MIPS_PCLO16:
+      return ELF::R_MIPS_PCLO16;
+    }
+
+    llvm_unreachable("invalid PC-relative fixup kind!");
+  }
+
+  switch (Kind) {
+  case Mips::fixup_Mips_64:
+  case FK_Data_8:
+    return ELF::R_MIPS_64;
+  case FK_DTPRel_4:
+    return ELF::R_MIPS_TLS_DTPREL32;
+  case FK_DTPRel_8:
+    return ELF::R_MIPS_TLS_DTPREL64;
+  case FK_TPRel_4:
+    return ELF::R_MIPS_TLS_TPREL32;
+  case FK_TPRel_8:
+    return ELF::R_MIPS_TLS_TPREL64;
+  case FK_GPRel_4:
+    if (isN64()) {
+      unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+      Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
+      Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
+      Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
+      return Type;
+    }
+    return ELF::R_MIPS_GPREL32;
+  case Mips::fixup_Mips_GPREL16:
+    return ELF::R_MIPS_GPREL16;
+  case Mips::fixup_Mips_26:
+    return ELF::R_MIPS_26;
+  case Mips::fixup_Mips_CALL16:
+    return ELF::R_MIPS_CALL16;
+  case Mips::fixup_Mips_GOT:
+    return ELF::R_MIPS_GOT16;
+  case Mips::fixup_Mips_HI16:
+    return ELF::R_MIPS_HI16;
+  case Mips::fixup_Mips_LO16:
+    return ELF::R_MIPS_LO16;
+  case Mips::fixup_Mips_TLSGD:
+    return ELF::R_MIPS_TLS_GD;
+  case Mips::fixup_Mips_GOTTPREL:
+    return ELF::R_MIPS_TLS_GOTTPREL;
+  case Mips::fixup_Mips_TPREL_HI:
+    return ELF::R_MIPS_TLS_TPREL_HI16;
+  case Mips::fixup_Mips_TPREL_LO:
+    return ELF::R_MIPS_TLS_TPREL_LO16;
+  case Mips::fixup_Mips_TLSLDM:
+    return ELF::R_MIPS_TLS_LDM;
+  case Mips::fixup_Mips_DTPREL_HI:
+    return ELF::R_MIPS_TLS_DTPREL_HI16;
+  case Mips::fixup_Mips_DTPREL_LO:
+    return ELF::R_MIPS_TLS_DTPREL_LO16;
+  case Mips::fixup_Mips_GOT_PAGE:
+    return ELF::R_MIPS_GOT_PAGE;
+  case Mips::fixup_Mips_GOT_OFST:
+    return ELF::R_MIPS_GOT_OFST;
+  case Mips::fixup_Mips_GOT_DISP:
+    return ELF::R_MIPS_GOT_DISP;
+  case Mips::fixup_Mips_GPOFF_HI: {
+    unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
+    return Type;
+  }
+  case Mips::fixup_Mips_GPOFF_LO: {
+    unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
+    return Type;
+  }
+  case Mips::fixup_Mips_HIGHER:
+    return ELF::R_MIPS_HIGHER;
+  case Mips::fixup_Mips_HIGHEST:
+    return ELF::R_MIPS_HIGHEST;
+  case Mips::fixup_Mips_SUB:
+    return ELF::R_MIPS_SUB;
+  case Mips::fixup_Mips_GOT_HI16:
+    return ELF::R_MIPS_GOT_HI16;
+  case Mips::fixup_Mips_GOT_LO16:
+    return ELF::R_MIPS_GOT_LO16;
+  case Mips::fixup_Mips_CALL_HI16:
+    return ELF::R_MIPS_CALL_HI16;
+  case Mips::fixup_Mips_CALL_LO16:
+    return ELF::R_MIPS_CALL_LO16;
+  case Mips::fixup_MICROMIPS_26_S1:
+    return ELF::R_MICROMIPS_26_S1;
+  case Mips::fixup_MICROMIPS_HI16:
+    return ELF::R_MICROMIPS_HI16;
+  case Mips::fixup_MICROMIPS_LO16:
+    return ELF::R_MICROMIPS_LO16;
+  case Mips::fixup_MICROMIPS_GOT16:
+    return ELF::R_MICROMIPS_GOT16;
+  case Mips::fixup_MICROMIPS_CALL16:
+    return ELF::R_MICROMIPS_CALL16;
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+    return ELF::R_MICROMIPS_GOT_DISP;
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+    return ELF::R_MICROMIPS_GOT_PAGE;
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+    return ELF::R_MICROMIPS_GOT_OFST;
+  case Mips::fixup_MICROMIPS_TLS_GD:
+    return ELF::R_MICROMIPS_TLS_GD;
+  case Mips::fixup_MICROMIPS_TLS_LDM:
+    return ELF::R_MICROMIPS_TLS_LDM;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+    return ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+    return ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+    return ELF::R_MICROMIPS_TLS_TPREL_HI16;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+    return ELF::R_MICROMIPS_TLS_TPREL_LO16;
+  case Mips::fixup_MICROMIPS_SUB:
+    return ELF::R_MICROMIPS_SUB;
+  }
+
+  llvm_unreachable("invalid fixup kind!");
+}
+
+/// Sort relocation table entries by offset except where another order is
+/// required by the MIPS ABI.
+///
+/// MIPS has a few relocations that have an AHL component in the expression used
+/// to evaluate them. This AHL component is an addend with the same number of
+/// bits as a symbol value but not all of our ABI's are able to supply a
+/// sufficiently sized addend in a single relocation.
+///
+/// The O32 ABI for example, uses REL relocations which store the addend in the
+/// section data. All the relocations with AHL components affect 16-bit fields
+/// so the addend for a single relocation is limited to 16-bit. This ABI
+/// resolves the limitation by linking relocations (e.g. R_MIPS_HI16 and
+/// R_MIPS_LO16) and distributing the addend between the linked relocations. The
+/// ABI mandates that such relocations must be next to each other in a
+/// particular order (e.g. R_MIPS_HI16 must be immediately followed by a
+/// matching R_MIPS_LO16) but the rule is less strict in practice.
+///
+/// The de facto standard is lenient in the following ways:
+/// - 'Immediately following' does not refer to the next relocation entry but
+///   the next matching relocation.
+/// - There may be multiple high parts relocations for one low part relocation.
+/// - There may be multiple low part relocations for one high part relocation.
+/// - The AHL addend in each part does not have to be exactly equal as long as
+///   the difference does not affect the carry bit from bit 15 into 16. This is
+///   to allow, for example, the use of %lo(foo) and %lo(foo+4) when loading
+///   both halves of a long long.
+///
+/// See getMatchingLoType() for a description of which high part relocations
+/// match which low part relocations. One particular thing to note is that
+/// R_MIPS_GOT16 and similar only have AHL addends if they refer to local
+/// symbols.
+///
+/// It should also be noted that this function is not affected by whether
+/// the symbol was kept or rewritten into a section-relative equivalent. We
+/// always match using the expressions from the source.
+void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
+                                     std::vector<ELFRelocationEntry> &Relocs) {
+
+  // We do not need to sort the relocation table for RELA relocations which
+  // N32/N64 uses as the relocation addend contains the value we require,
+  // rather than it being split across a pair of relocations.
+  if (hasRelocationAddend())
+    return;
+
+  if (Relocs.size() < 2)
+    return;
+
+  // Sort relocations by the address they are applied to.
+  std::sort(Relocs.begin(), Relocs.end(),
+            [](const ELFRelocationEntry &A, const ELFRelocationEntry &B) {
+              return A.Offset < B.Offset;
+            });
+
+  std::list<MipsRelocationEntry> Sorted;
+  std::list<ELFRelocationEntry> Remainder;
+
+  DEBUG(dumpRelocs("R: ", Relocs));
+
+  // Separate the movable relocations (AHL relocations using the high bits) from
+  // the immobile relocations (everything else). This does not preserve high/low
+  // matches that already existed in the input.
+  copy_if_else(Relocs.begin(), Relocs.end(), std::back_inserter(Remainder),
+               std::back_inserter(Sorted), [](const ELFRelocationEntry &Reloc) {
+                 return getMatchingLoType(Reloc) != ELF::R_MIPS_NONE;
+               });
+
+  for (auto &R : Remainder) {
+    DEBUG(dbgs() << "Matching: " << R << "\n");
+
+    unsigned MatchingType = getMatchingLoType(R);
+    assert(MatchingType != ELF::R_MIPS_NONE &&
+           "Wrong list for reloc that doesn't need a match");
+
+    // Find the best matching relocation for the current high part.
+    // See isMatchingReloc for a description of a matching relocation and
+    // compareMatchingRelocs for a description of what 'best' means.
+    auto InsertionPoint =
+        find_best(Sorted.begin(), Sorted.end(),
+                  [&R, &MatchingType](const MipsRelocationEntry &X) {
+                    return isMatchingReloc(X, R, MatchingType);
+                  },
+                  compareMatchingRelocs);
+
+    // If we matched then insert the high part in front of the match and mark
+    // both relocations as being involved in a match. We only mark the high
+    // part for cosmetic reasons in the debug output.
+    //
+    // If we failed to find a match then the high part is orphaned. This is not
+    // permitted since the relocation cannot be evaluated without knowing the
+    // carry-in. We can sometimes handle this using a matching low part that is
+    // already used in a match but we already cover that case in
+    // isMatchingReloc and compareMatchingRelocs. For the remaining cases we
+    // should insert the high part at the end of the list. This will cause the
+    // linker to fail but the alternative is to cause the linker to bind the
+    // high part to a semi-matching low part and silently calculate the wrong
+    // value. Unfortunately we have no means to warn the user that we did this
+    // so leave it up to the linker to complain about it.
+    if (InsertionPoint != Sorted.end())
+      InsertionPoint->Matched = true;
+    Sorted.insert(InsertionPoint, R)->Matched = true;
+  }
+
+  DEBUG(dumpRelocs("S: ", Sorted));
+
+  assert(Relocs.size() == Sorted.size() && "Some relocs were not consumed");
+
+  // Overwrite the original vector with the sorted elements. The caller expects
+  // them in reverse order.
+  unsigned CopyTo = 0;
+  for (const auto &R : reverse(Sorted))
+    Relocs[CopyTo++] = R.R;
+}
+
+bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                  unsigned Type) const {
+  // If it's a compound relocation for N64 then we need the relocation if any
+  // sub-relocation needs it.
+  if (!isUInt<8>(Type))
+    return needsRelocateWithSymbol(Sym, Type & 0xff) ||
+           needsRelocateWithSymbol(Sym, (Type >> 8) & 0xff) ||
+           needsRelocateWithSymbol(Sym, (Type >> 16) & 0xff);
+
+  switch (Type) {
+  default:
+    errs() << Type << "\n";
+    llvm_unreachable("Unexpected relocation");
+    return true;
+
+  // This relocation doesn't affect the section data.
+  case ELF::R_MIPS_NONE:
+    return false;
+
+  // On REL ABI's (e.g. O32), these relocations form pairs. The pairing is done
+  // by the static linker by matching the symbol and offset.
+  // We only see one relocation at a time but it's still safe to relocate with
+  // the section so long as both relocations make the same decision.
+  //
+  // Some older linkers may require the symbol for particular cases. Such cases
+  // are not supported yet but can be added as required.
+  case ELF::R_MIPS_GOT16:
+  case ELF::R_MIPS16_GOT16:
+  case ELF::R_MICROMIPS_GOT16:
+  case ELF::R_MIPS_HI16:
+  case ELF::R_MIPS16_HI16:
+  case ELF::R_MICROMIPS_HI16:
+  case ELF::R_MIPS_LO16:
+  case ELF::R_MIPS16_LO16:
+  case ELF::R_MICROMIPS_LO16:
+    // FIXME: It should be safe to return false for the STO_MIPS_MICROMIPS but
+    //        we neglect to handle the adjustment to the LSB of the addend that
+    //        it causes in applyFixup() and similar.
+    if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
+      return true;
+    return false;
+
+  case ELF::R_MIPS_GOT_PAGE:
+  case ELF::R_MICROMIPS_GOT_PAGE:
+  case ELF::R_MIPS_GOT_OFST:
+  case ELF::R_MICROMIPS_GOT_OFST:
+  case ELF::R_MIPS_16:
+  case ELF::R_MIPS_32:
+  case ELF::R_MIPS_GPREL32:
+    if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
+      return true;
+    LLVM_FALLTHROUGH;
+  case ELF::R_MIPS_26:
+  case ELF::R_MIPS_64:
+  case ELF::R_MIPS_GPREL16:
+  case ELF::R_MIPS_PC16:
+  case ELF::R_MIPS_SUB:
+    return false;
+
+  // FIXME: Many of these relocations should probably return false but this
+  //        hasn't been confirmed to be safe yet.
+  case ELF::R_MIPS_REL32:
+  case ELF::R_MIPS_LITERAL:
+  case ELF::R_MIPS_CALL16:
+  case ELF::R_MIPS_SHIFT5:
+  case ELF::R_MIPS_SHIFT6:
+  case ELF::R_MIPS_GOT_DISP:
+  case ELF::R_MIPS_GOT_HI16:
+  case ELF::R_MIPS_GOT_LO16:
+  case ELF::R_MIPS_INSERT_A:
+  case ELF::R_MIPS_INSERT_B:
+  case ELF::R_MIPS_DELETE:
+  case ELF::R_MIPS_HIGHER:
+  case ELF::R_MIPS_HIGHEST:
+  case ELF::R_MIPS_CALL_HI16:
+  case ELF::R_MIPS_CALL_LO16:
+  case ELF::R_MIPS_SCN_DISP:
+  case ELF::R_MIPS_REL16:
+  case ELF::R_MIPS_ADD_IMMEDIATE:
+  case ELF::R_MIPS_PJUMP:
+  case ELF::R_MIPS_RELGOT:
+  case ELF::R_MIPS_JALR:
+  case ELF::R_MIPS_TLS_DTPMOD32:
+  case ELF::R_MIPS_TLS_DTPREL32:
+  case ELF::R_MIPS_TLS_DTPMOD64:
+  case ELF::R_MIPS_TLS_DTPREL64:
+  case ELF::R_MIPS_TLS_GD:
+  case ELF::R_MIPS_TLS_LDM:
+  case ELF::R_MIPS_TLS_DTPREL_HI16:
+  case ELF::R_MIPS_TLS_DTPREL_LO16:
+  case ELF::R_MIPS_TLS_GOTTPREL:
+  case ELF::R_MIPS_TLS_TPREL32:
+  case ELF::R_MIPS_TLS_TPREL64:
+  case ELF::R_MIPS_TLS_TPREL_HI16:
+  case ELF::R_MIPS_TLS_TPREL_LO16:
+  case ELF::R_MIPS_GLOB_DAT:
+  case ELF::R_MIPS_PC21_S2:
+  case ELF::R_MIPS_PC26_S2:
+  case ELF::R_MIPS_PC18_S3:
+  case ELF::R_MIPS_PC19_S2:
+  case ELF::R_MIPS_PCHI16:
+  case ELF::R_MIPS_PCLO16:
+  case ELF::R_MIPS_COPY:
+  case ELF::R_MIPS_JUMP_SLOT:
+  case ELF::R_MIPS_NUM:
+  case ELF::R_MIPS_PC32:
+  case ELF::R_MIPS_EH:
+  case ELF::R_MICROMIPS_26_S1:
+  case ELF::R_MICROMIPS_GPREL16:
+  case ELF::R_MICROMIPS_LITERAL:
+  case ELF::R_MICROMIPS_PC7_S1:
+  case ELF::R_MICROMIPS_PC10_S1:
+  case ELF::R_MICROMIPS_PC16_S1:
+  case ELF::R_MICROMIPS_CALL16:
+  case ELF::R_MICROMIPS_GOT_DISP:
+  case ELF::R_MICROMIPS_GOT_HI16:
+  case ELF::R_MICROMIPS_GOT_LO16:
+  case ELF::R_MICROMIPS_SUB:
+  case ELF::R_MICROMIPS_HIGHER:
+  case ELF::R_MICROMIPS_HIGHEST:
+  case ELF::R_MICROMIPS_CALL_HI16:
+  case ELF::R_MICROMIPS_CALL_LO16:
+  case ELF::R_MICROMIPS_SCN_DISP:
+  case ELF::R_MICROMIPS_JALR:
+  case ELF::R_MICROMIPS_HI0_LO16:
+  case ELF::R_MICROMIPS_TLS_GD:
+  case ELF::R_MICROMIPS_TLS_LDM:
+  case ELF::R_MICROMIPS_TLS_DTPREL_HI16:
+  case ELF::R_MICROMIPS_TLS_DTPREL_LO16:
+  case ELF::R_MICROMIPS_TLS_GOTTPREL:
+  case ELF::R_MICROMIPS_TLS_TPREL_HI16:
+  case ELF::R_MICROMIPS_TLS_TPREL_LO16:
+  case ELF::R_MICROMIPS_GPREL7_S2:
+  case ELF::R_MICROMIPS_PC23_S2:
+  case ELF::R_MICROMIPS_PC21_S1:
+  case ELF::R_MICROMIPS_PC26_S1:
+  case ELF::R_MICROMIPS_PC18_S3:
+  case ELF::R_MICROMIPS_PC19_S2:
+    return true;
+
+  // FIXME: Many of these should probably return false but MIPS16 isn't
+  //        supported by the integrated assembler.
+  case ELF::R_MIPS16_26:
+  case ELF::R_MIPS16_GPREL:
+  case ELF::R_MIPS16_CALL16:
+  case ELF::R_MIPS16_TLS_GD:
+  case ELF::R_MIPS16_TLS_LDM:
+  case ELF::R_MIPS16_TLS_DTPREL_HI16:
+  case ELF::R_MIPS16_TLS_DTPREL_LO16:
+  case ELF::R_MIPS16_TLS_GOTTPREL:
+  case ELF::R_MIPS16_TLS_TPREL_HI16:
+  case ELF::R_MIPS16_TLS_TPREL_LO16:
+    llvm_unreachable("Unsupported MIPS16 relocation");
+    return true;
+  }
+}
+
+MCObjectWriter *llvm::createMipsELFObjectWriter(raw_pwrite_stream &OS,
+                                                uint8_t OSABI,
+                                                bool IsLittleEndian,
+                                                bool Is64Bit) {
+  MCELFObjectTargetWriter *MOTW =
+      new MipsELFObjectWriter(Is64Bit, OSABI, Is64Bit, IsLittleEndian);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
new file mode 100644
index 000000000000..e7d687e89a8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -0,0 +1,82 @@
+//===-------- MipsELFStreamer.cpp - ELF Object Output ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+                                      const MCSubtargetInfo &STI) {
+  MCELFStreamer::EmitInstruction(Inst, STI);
+
+  MCContext &Context = getContext();
+  const MCRegisterInfo *MCRegInfo = Context.getRegisterInfo();
+
+  for (unsigned OpIndex = 0; OpIndex < Inst.getNumOperands(); ++OpIndex) {
+    const MCOperand &Op = Inst.getOperand(OpIndex);
+
+    if (!Op.isReg())
+      continue;
+
+    unsigned Reg = Op.getReg();
+    RegInfoRecord->SetPhysRegUsed(Reg, MCRegInfo);
+  }
+
+  createPendingLabelRelocs();
+}
+
+void MipsELFStreamer::createPendingLabelRelocs() {
+  MipsTargetELFStreamer *ELFTargetStreamer =
+      static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
+
+  // FIXME: Also mark labels when in MIPS16 mode.
+  if (ELFTargetStreamer->isMicroMipsEnabled()) {
+    for (auto *L : Labels) {
+      auto *Label = cast<MCSymbolELF>(L);
+      getAssembler().registerSymbol(*Label);
+      Label->setOther(ELF::STO_MIPS_MICROMIPS);
+    }
+  }
+
+  Labels.clear();
+}
+
+void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
+  MCELFStreamer::EmitLabel(Symbol);
+  Labels.push_back(Symbol);
+}
+
+void MipsELFStreamer::SwitchSection(MCSection *Section,
+                                    const MCExpr *Subsection) {
+  MCELFStreamer::SwitchSection(Section, Subsection);
+  Labels.clear();
+}
+
+void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                    SMLoc Loc) {
+  MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+  Labels.clear();
+}
+
+void MipsELFStreamer::EmitMipsOptionRecords() {
+  for (const auto &I : MipsOptionRecords)
+    I->EmitMipsOptionRecord();
+}
+
+MCELFStreamer *llvm::createMipsELFStreamer(MCContext &Context,
+                                           MCAsmBackend &MAB,
+                                           raw_pwrite_stream &OS,
+                                           MCCodeEmitter *Emitter,
+                                           bool RelaxAll) {
+  return new MipsELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
new file mode 100644
index 000000000000..a241cdebdcc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -0,0 +1,76 @@
+//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
+
+#include "MipsOptionRecord.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include <memory>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class MipsELFStreamer : public MCELFStreamer {
+  SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
+  MipsRegInfoRecord *RegInfoRecord;
+  SmallVector<MCSymbol*, 4> Labels;
+
+
+public:
+  MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                  MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, MAB, OS, Emitter) {
+
+    RegInfoRecord = new MipsRegInfoRecord(this, Context);
+    MipsOptionRecords.push_back(
+        std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
+  }
+
+  /// Overriding this function allows us to add arbitrary behaviour before the
+  /// \p Inst is actually emitted. For example, we can inspect the operands and
+  /// gather sufficient information that allows us to reason about the register
+  /// usage for the translation unit.
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+  /// Overriding this function allows us to record all labels that should be
+  /// marked as microMIPS. Based on this data marking is done in
+  /// EmitInstruction.
+  void EmitLabel(MCSymbol *Symbol) override;
+
+  /// Overriding this function allows us to dismiss all labels that are
+  /// candidates for marking as microMIPS when .section directive is processed.
+  void SwitchSection(MCSection *Section,
+                     const MCExpr *Subsection = nullptr) override;
+
+  /// Overriding this function allows us to dismiss all labels that are
+  /// candidates for marking as microMIPS when .word directive is emitted.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
+
+  /// Emits all the option records stored up until the point it's called.
+  void EmitMipsOptionRecords();
+
+  /// Mark labels as microMIPS, if necessary for the subtarget.
+  void createPendingLabelRelocs();
+};
+
+MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS,
+                                     MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 000000000000..149296212eca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,224 @@
+//===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+  // Although most of the current fixup types reflect a unique relocation
+  // one can have multiple fixup types for a given relocation and thus need
+  // to be uniquely named.
+  //
+  // This table *must* be in the same order of
+  // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds]
+  // in MipsAsmBackend.cpp.
+  //
+  enum Fixups {
+    // Branch fixups resulting in R_MIPS_NONE.
+    fixup_Mips_NONE = FirstTargetFixupKind,
+
+    // Branch fixups resulting in R_MIPS_16.
+    fixup_Mips_16,
+
+    // Pure 32 bit data fixup resulting in - R_MIPS_32.
+    fixup_Mips_32,
+
+    // Full 32 bit data relative data fixup resulting in - R_MIPS_REL32.
+    fixup_Mips_REL32,
+
+    // Jump 26 bit fixup resulting in - R_MIPS_26.
+    fixup_Mips_26,
+
+    // Pure upper 16 bit fixup resulting in - R_MIPS_HI16.
+    fixup_Mips_HI16,
+
+    // Pure lower 16 bit fixup resulting in - R_MIPS_LO16.
+    fixup_Mips_LO16,
+
+    // 16 bit fixup for GP offest resulting in - R_MIPS_GPREL16.
+    fixup_Mips_GPREL16,
+
+    // 16 bit literal fixup resulting in - R_MIPS_LITERAL.
+    fixup_Mips_LITERAL,
+
+    // Symbol fixup resulting in - R_MIPS_GOT16.
+    fixup_Mips_GOT,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16.
+    fixup_Mips_PC16,
+
+    // resulting in - R_MIPS_CALL16.
+    fixup_Mips_CALL16,
+
+    // resulting in - R_MIPS_GPREL32.
+    fixup_Mips_GPREL32,
+
+    // resulting in - R_MIPS_SHIFT5.
+    fixup_Mips_SHIFT5,
+
+    // resulting in - R_MIPS_SHIFT6.
+    fixup_Mips_SHIFT6,
+
+    // Pure 64 bit data fixup resulting in - R_MIPS_64.
+    fixup_Mips_64,
+
+    // resulting in - R_MIPS_TLS_GD.
+    fixup_Mips_TLSGD,
+
+    // resulting in - R_MIPS_TLS_GOTTPREL.
+    fixup_Mips_GOTTPREL,
+
+    // resulting in - R_MIPS_TLS_TPREL_HI16.
+    fixup_Mips_TPREL_HI,
+
+    // resulting in - R_MIPS_TLS_TPREL_LO16.
+    fixup_Mips_TPREL_LO,
+
+    // resulting in - R_MIPS_TLS_LDM.
+    fixup_Mips_TLSLDM,
+
+    // resulting in - R_MIPS_TLS_DTPREL_HI16.
+    fixup_Mips_DTPREL_HI,
+
+    // resulting in - R_MIPS_TLS_DTPREL_LO16.
+    fixup_Mips_DTPREL_LO,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16
+    fixup_Mips_Branch_PCRel,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+    fixup_Mips_GPOFF_HI,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+    fixup_Mips_GPOFF_LO,
+
+    // resulting in - R_MIPS_PAGE
+    fixup_Mips_GOT_PAGE,
+
+    // resulting in - R_MIPS_GOT_OFST
+    fixup_Mips_GOT_OFST,
+
+    // resulting in - R_MIPS_GOT_DISP
+    fixup_Mips_GOT_DISP,
+
+    // resulting in - R_MIPS_GOT_HIGHER
+    fixup_Mips_HIGHER,
+
+    // resulting in - R_MIPS_HIGHEST
+    fixup_Mips_HIGHEST,
+
+    // resulting in - R_MIPS_GOT_HI16
+    fixup_Mips_GOT_HI16,
+
+    // resulting in - R_MIPS_GOT_LO16
+    fixup_Mips_GOT_LO16,
+
+    // resulting in - R_MIPS_CALL_HI16
+    fixup_Mips_CALL_HI16,
+
+    // resulting in - R_MIPS_CALL_LO16
+    fixup_Mips_CALL_LO16,
+
+    // resulting in - R_MIPS_PC18_S3
+    fixup_MIPS_PC18_S3,
+
+    // resulting in - R_MIPS_PC19_S2
+    fixup_MIPS_PC19_S2,
+
+    // resulting in - R_MIPS_PC21_S2
+    fixup_MIPS_PC21_S2,
+
+    // resulting in - R_MIPS_PC26_S2
+    fixup_MIPS_PC26_S2,
+
+    // resulting in - R_MIPS_PCHI16
+    fixup_MIPS_PCHI16,
+
+    // resulting in - R_MIPS_PCLO16
+    fixup_MIPS_PCLO16,
+
+    // resulting in - R_MICROMIPS_26_S1
+    fixup_MICROMIPS_26_S1,
+
+    // resulting in - R_MICROMIPS_HI16
+    fixup_MICROMIPS_HI16,
+
+    // resulting in - R_MICROMIPS_LO16
+    fixup_MICROMIPS_LO16,
+
+    // resulting in - R_MICROMIPS_GOT16
+    fixup_MICROMIPS_GOT16,
+
+    // resulting in - R_MICROMIPS_PC7_S1
+    fixup_MICROMIPS_PC7_S1,
+
+    // resulting in - R_MICROMIPS_PC10_S1
+    fixup_MICROMIPS_PC10_S1,
+
+    // resulting in - R_MICROMIPS_PC16_S1
+    fixup_MICROMIPS_PC16_S1,
+
+    // resulting in - R_MICROMIPS_PC26_S1
+    fixup_MICROMIPS_PC26_S1,
+
+    // resulting in - R_MICROMIPS_PC19_S2
+    fixup_MICROMIPS_PC19_S2,
+
+    // resulting in - R_MICROMIPS_PC18_S3
+    fixup_MICROMIPS_PC18_S3,
+
+    // resulting in - R_MICROMIPS_PC21_S1
+    fixup_MICROMIPS_PC21_S1,
+
+    // resulting in - R_MICROMIPS_CALL16
+    fixup_MICROMIPS_CALL16,
+
+    // resulting in - R_MICROMIPS_GOT_DISP
+    fixup_MICROMIPS_GOT_DISP,
+
+    // resulting in - R_MICROMIPS_GOT_PAGE
+    fixup_MICROMIPS_GOT_PAGE,
+
+    // resulting in - R_MICROMIPS_GOT_OFST
+    fixup_MICROMIPS_GOT_OFST,
+
+    // resulting in - R_MICROMIPS_TLS_GD
+    fixup_MICROMIPS_TLS_GD,
+
+    // resulting in - R_MICROMIPS_TLS_LDM
+    fixup_MICROMIPS_TLS_LDM,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+    fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+    fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+    fixup_MICROMIPS_TLS_TPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+    fixup_MICROMIPS_TLS_TPREL_LO16,
+
+    // resulting in - R_MIPS_SUB/R_MICROMIPS_SUB
+    fixup_Mips_SUB,
+    fixup_MICROMIPS_SUB,
+
+    // Marker
+    LastTargetFixupKind,
+    NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+  };
+} // namespace Mips
+} // namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
new file mode 100644
index 000000000000..a44a35f49e5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -0,0 +1,66 @@
+//===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void MipsMCAsmInfo::anchor() { }
+
+MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
+  if ((TheTriple.getArch() == Triple::mips) ||
+      (TheTriple.getArch() == Triple::mips64))
+    IsLittleEndian = false;
+
+  if ((TheTriple.getArch() == Triple::mips64el) ||
+      (TheTriple.getArch() == Triple::mips64)) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  // FIXME: This condition isn't quite right but it's the best we can do until
+  //        this object can identify the ABI. It will misbehave when using O32
+  //        on a mips64*-* triple.
+  if ((TheTriple.getArch() == Triple::mipsel) ||
+      (TheTriple.getArch() == Triple::mips)) {
+    PrivateGlobalPrefix = "$";
+    PrivateLabelPrefix = "$";
+  }
+
+  AlignmentIsInBytes          = false;
+  Data16bitsDirective         = "\t.2byte\t";
+  Data32bitsDirective         = "\t.4byte\t";
+  Data64bitsDirective         = "\t.8byte\t";
+  CommentString               = "#";
+  ZeroDirective               = "\t.space\t";
+  GPRel32Directive            = "\t.gpword\t";
+  GPRel64Directive            = "\t.gpdword\t";
+  DTPRel32Directive           = "\t.dtprelword\t";
+  DTPRel64Directive           = "\t.dtpreldword\t";
+  TPRel32Directive            = "\t.tprelword\t";
+  TPRel64Directive            = "\t.tpreldword\t";
+  UseAssignmentForEHBegin = true;
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  DwarfRegNumForCFI = true;
+  HasMipsExpressions = true;
+
+  // Enable IAS by default for O32.
+  if (TheTriple.getArch() == Triple::mips ||
+      TheTriple.getArch() == Triple::mipsel)
+    UseIntegratedAssembler = true;
+
+  // Enable IAS by default for Debian mips64/mips64el.
+  if (TheTriple.getEnvironment() == Triple::GNUABI64)
+    UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
new file mode 100644
index 000000000000..d4ccf0349c16
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class MipsMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit MipsMCAsmInfo(const Triple &TheTriple);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 000000000000..0614316d5ac7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,1171 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MipsMCCodeEmitter.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+#define GET_INSTRMAP_INFO
+#include "MipsGenInstrInfo.inc"
+#undef GET_INSTRMAP_INFO
+
+namespace llvm {
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx) {
+  return new MipsMCCodeEmitter(MCII, Ctx, false);
+}
+
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx) {
+  return new MipsMCCodeEmitter(MCII, Ctx, true);
+}
+} // End of namespace llvm.
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+static void LowerLargeShift(MCInst& Inst) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+  assert(Inst.getOperand(2).isImm());
+
+  int64_t Shift = Inst.getOperand(2).getImm();
+  if (Shift <= 31)
+    return; // Do nothing
+  Shift -= 32;
+
+  // saminus32
+  Inst.getOperand(2).setImm(Shift);
+
+  switch (Inst.getOpcode()) {
+  default:
+    // Calling function is not synchronized
+    llvm_unreachable("Unexpected shift instruction");
+  case Mips::DSLL:
+    Inst.setOpcode(Mips::DSLL32);
+    return;
+  case Mips::DSRL:
+    Inst.setOpcode(Mips::DSRL32);
+    return;
+  case Mips::DSRA:
+    Inst.setOpcode(Mips::DSRA32);
+    return;
+  case Mips::DROTR:
+    Inst.setOpcode(Mips::DROTR32);
+    return;
+  case Mips::DSLL_MM64R6:
+    Inst.setOpcode(Mips::DSLL32_MM64R6);
+    return;
+  case Mips::DSRL_MM64R6:
+    Inst.setOpcode(Mips::DSRL32_MM64R6);
+    return;
+  case Mips::DSRA_MM64R6:
+    Inst.setOpcode(Mips::DSRA32_MM64R6);
+    return;
+  case Mips::DROTR_MM64R6:
+    Inst.setOpcode(Mips::DROTR32_MM64R6);
+    return;
+  }
+}
+
+// Pick a DINS instruction variant based on the pos and size operands
+static void LowerDins(MCInst& InstIn) {
+  assert(InstIn.getNumOperands() == 5 &&
+         "Invalid no. of machine operands for DINS!");
+
+  assert(InstIn.getOperand(2).isImm());
+  int64_t pos = InstIn.getOperand(2).getImm();
+  assert(InstIn.getOperand(3).isImm());
+  int64_t size = InstIn.getOperand(3).getImm();
+
+  if (size <= 32) {
+    if (pos < 32)  // DINS, do nothing
+      return;
+    // DINSU
+    InstIn.getOperand(2).setImm(pos - 32);
+    InstIn.setOpcode(Mips::DINSU);
+    return;
+  }
+  // DINSM
+  assert(pos < 32 && "DINS cannot have both size and pos > 32");
+  InstIn.getOperand(3).setImm(size - 32);
+  InstIn.setOpcode(Mips::DINSM);
+  return;
+}
+
+// Fix a bad compact branch encoding for beqc/bnec.
+void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
+
+  // Encoding may be illegal !(rs < rt), but this situation is
+  // easily fixed.
+  unsigned RegOp0 = Inst.getOperand(0).getReg();
+  unsigned RegOp1 = Inst.getOperand(1).getReg();
+
+  unsigned Reg0 =  Ctx.getRegisterInfo()->getEncodingValue(RegOp0);
+  unsigned Reg1 =  Ctx.getRegisterInfo()->getEncodingValue(RegOp1);
+
+  if (Inst.getOpcode() == Mips::BNEC || Inst.getOpcode() == Mips::BEQC ||
+      Inst.getOpcode() == Mips::BNEC64 || Inst.getOpcode() == Mips::BEQC64) {
+    assert(Reg0 != Reg1 && "Instruction has bad operands ($rs == $rt)!");
+    if (Reg0 < Reg1)
+      return;
+  } else if (Inst.getOpcode() == Mips::BNVC || Inst.getOpcode() == Mips::BOVC) {
+    if (Reg0 >= Reg1)
+      return;
+  } else if (Inst.getOpcode() == Mips::BNVC_MMR6 ||
+             Inst.getOpcode() == Mips::BOVC_MMR6) {
+    if (Reg1 >= Reg0)
+      return;
+  } else
+    llvm_unreachable("Cannot rewrite unknown branch!");
+
+  Inst.getOperand(0).setReg(RegOp1);
+  Inst.getOperand(1).setReg(RegOp0);
+
+}
+
+bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
+  return STI.getFeatureBits()[Mips::FeatureMicroMips];
+}
+
+bool MipsMCCodeEmitter::isMips32r6(const MCSubtargetInfo &STI) const {
+  return STI.getFeatureBits()[Mips::FeatureMips32r6];
+}
+
+void MipsMCCodeEmitter::EmitByte(unsigned char C, raw_ostream &OS) const {
+  OS << (char)C;
+}
+
+void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &OS) const {
+  // Output the instruction encoding in little endian byte order.
+  // Little-endian byte ordering:
+  //   mips32r2:   4 | 3 | 2 | 1
+  //   microMIPS:  2 | 1 | 4 | 3
+  if (IsLittleEndian && Size == 4 && isMicroMips(STI)) {
+    EmitInstruction(Val >> 16, 2, STI, OS);
+    EmitInstruction(Val, 2, STI, OS);
+  } else {
+    for (unsigned i = 0; i < Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+}
+
+/// encodeInstruction - Emit the instruction.
+/// Size the instruction with Desc.getSize().
+void MipsMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const
+{
+
+  // Non-pseudo instructions that get changed for direct object
+  // only based on operand values.
+  // If this list of instructions get much longer we will move
+  // the check to a function call. Until then, this is more efficient.
+  MCInst TmpInst = MI;
+  switch (MI.getOpcode()) {
+  // If shift amount is >= 32 it the inst needs to be lowered further
+  case Mips::DSLL:
+  case Mips::DSRL:
+  case Mips::DSRA:
+  case Mips::DROTR:
+  case Mips::DSLL_MM64R6:
+  case Mips::DSRL_MM64R6:
+  case Mips::DSRA_MM64R6:
+  case Mips::DROTR_MM64R6:
+    LowerLargeShift(TmpInst);
+    break;
+    // Double extract instruction is chosen by pos and size operands
+  case Mips::DINS:
+    LowerDins(TmpInst);
+    break;
+  // Compact branches, enforce encoding restrictions.
+  case Mips::BEQC:
+  case Mips::BNEC:
+  case Mips::BEQC64:
+  case Mips::BNEC64:
+  case Mips::BOVC:
+  case Mips::BOVC_MMR6:
+  case Mips::BNVC:
+  case Mips::BNVC_MMR6:
+    LowerCompactBranch(TmpInst);
+  }
+
+  unsigned long N = Fixups.size();
+  uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+
+  // Check for unimplemented opcodes.
+  // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
+  // so we have to special check for them.
+  unsigned Opcode = TmpInst.getOpcode();
+  if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
+      (Opcode != Mips::SLL_MM) && !Binary)
+    llvm_unreachable("unimplemented opcode in encodeInstruction()");
+
+  int NewOpcode = -1;
+  if (isMicroMips(STI)) {
+    if (isMips32r6(STI)) {
+      NewOpcode = Mips::MipsR62MicroMipsR6(Opcode, Mips::Arch_micromipsr6);
+      if (NewOpcode == -1)
+        NewOpcode = Mips::Std2MicroMipsR6(Opcode, Mips::Arch_micromipsr6);
+    }
+    else
+      NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips);
+
+    // Check whether it is Dsp instruction.
+    if (NewOpcode == -1)
+      NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp);
+
+    if (NewOpcode != -1) {
+      if (Fixups.size() > N)
+        Fixups.pop_back();
+
+      Opcode = NewOpcode;
+      TmpInst.setOpcode (NewOpcode);
+      Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
+    }
+  }
+
+  const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode());
+
+  // Get byte count of instruction
+  unsigned Size = Desc.getSize();
+  if (!Size)
+    llvm_unreachable("Desc.getSize() returns 0");
+
+  EmitInstruction(Binary, Size, STI, OS);
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValue expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValue expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm())
+    return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMMR6 expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-2, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getBranchTargetOpValueLsl2MMR6 - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm())
+    return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueLsl2MMR6 expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getBranchTarget7OpValueMM - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_PC7_S1)));
+  return 0;
+}
+
+/// getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+/// 10-bit branch target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValuePC10 expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::create(0, Expr,
+                   MCFixupKind(Mips::fixup_MICROMIPS_PC10_S1)));
+  return 0;
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::create(0, Expr,
+                   MCFixupKind(Mips::
+                               fixup_MICROMIPS_PC16_S1)));
+  return 0;
+}
+
+/// getBranchTarget21OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget21OpValue expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_MIPS_PC21_S2)));
+  return 0;
+}
+
+/// getBranchTarget21OpValueMM - Return binary encoding of the branch
+/// target operand for microMIPS. If the machine operand requires
+/// relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+    "getBranchTarget21OpValueMM expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_PC21_S1)));
+  return 0;
+}
+
+/// getBranchTarget26OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget26OpValue expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_MIPS_PC26_S2)));
+  return 0;
+}
+
+/// getBranchTarget26OpValueMM - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
+    const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm())
+    return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget26OpValueMM expects only expressions or immediates");
+
+  const MCExpr *FixupExpression = MCBinaryExpr::createAdd(
+      MO.getExpr(), MCConstantExpr::create(-4, Ctx), Ctx);
+  Fixups.push_back(MCFixup::create(0, FixupExpression,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_PC26_S1)));
+  return 0;
+}
+
+/// getJumpOffset16OpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isImm()) return MO.getImm();
+
+  assert(MO.isExpr() &&
+         "getJumpOffset16OpValue expects only expressions or an immediate");
+
+   // TODO: Push fixup.
+   return 0;
+}
+
+/// getJumpTargetOpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm()>>2;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValue expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::create(0, Expr,
+                                   MCFixupKind(Mips::fixup_Mips_26)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 2'.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 3) == 0);
+    return Res >> 2;
+  }
+
+  assert(MO.isExpr() &&
+         "getUImm5Lsl2Encoding expects only expressions or an immediate");
+
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    int Value = MO.getImm();
+    return Value >> 2;
+  }
+
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    unsigned Value = MO.getImm();
+    return Value >> 2;
+  }
+
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    unsigned Binary = (MO.getImm() >> 2) & 0x0000ffff;
+    return (((Binary & 0x8000) >> 7) | (Binary & 0x00ff));
+  }
+
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
+getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+               const MCSubtargetInfo &STI) const {
+  int64_t Res;
+
+  if (Expr->evaluateAsAbsolute(Res))
+    return Res;
+
+  MCExpr::ExprKind Kind = Expr->getKind();
+  if (Kind == MCExpr::Constant) {
+    return cast<MCConstantExpr>(Expr)->getValue();
+  }
+
+  if (Kind == MCExpr::Binary) {
+    unsigned Res = getExprOpValue(cast<MCBinaryExpr>(Expr)->getLHS(), Fixups, STI);
+    Res += getExprOpValue(cast<MCBinaryExpr>(Expr)->getRHS(), Fixups, STI);
+    return Res;
+  }
+
+  if (Kind == MCExpr::Target) {
+    const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+
+    Mips::Fixups FixupKind = Mips::Fixups(0);
+    switch (MipsExpr->getKind()) {
+    case MipsMCExpr::MEK_None:
+    case MipsMCExpr::MEK_Special:
+      llvm_unreachable("Unhandled fixup kind!");
+      break;
+    case MipsMCExpr::MEK_CALL_HI16:
+      FixupKind = Mips::fixup_Mips_CALL_HI16;
+      break;
+    case MipsMCExpr::MEK_CALL_LO16:
+      FixupKind = Mips::fixup_Mips_CALL_LO16;
+      break;
+    case MipsMCExpr::MEK_DTPREL_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+                                   : Mips::fixup_Mips_DTPREL_HI;
+      break;
+    case MipsMCExpr::MEK_DTPREL_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+                                   : Mips::fixup_Mips_DTPREL_LO;
+      break;
+    case MipsMCExpr::MEK_GOTTPREL:
+      FixupKind = Mips::fixup_Mips_GOTTPREL;
+      break;
+    case MipsMCExpr::MEK_GOT:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
+                                   : Mips::fixup_Mips_GOT;
+      break;
+    case MipsMCExpr::MEK_GOT_CALL:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
+                                   : Mips::fixup_Mips_CALL16;
+      break;
+    case MipsMCExpr::MEK_GOT_DISP:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
+                                   : Mips::fixup_Mips_GOT_DISP;
+      break;
+    case MipsMCExpr::MEK_GOT_HI16:
+      FixupKind = Mips::fixup_Mips_GOT_HI16;
+      break;
+    case MipsMCExpr::MEK_GOT_LO16:
+      FixupKind = Mips::fixup_Mips_GOT_LO16;
+      break;
+    case MipsMCExpr::MEK_GOT_PAGE:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
+                                   : Mips::fixup_Mips_GOT_PAGE;
+      break;
+    case MipsMCExpr::MEK_GOT_OFST:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
+                                   : Mips::fixup_Mips_GOT_OFST;
+      break;
+    case MipsMCExpr::MEK_GPREL:
+      FixupKind = Mips::fixup_Mips_GPREL16;
+      break;
+    case MipsMCExpr::MEK_LO: {
+      // Check for %lo(%neg(%gp_rel(X)))
+      if (MipsExpr->isGpOff()) {
+        FixupKind = Mips::fixup_Mips_GPOFF_LO;
+        break;
+      }
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
+                                   : Mips::fixup_Mips_LO16;
+      break;
+    }
+    case MipsMCExpr::MEK_HIGHEST:
+      FixupKind = Mips::fixup_Mips_HIGHEST;
+      break;
+    case MipsMCExpr::MEK_HIGHER:
+      FixupKind = Mips::fixup_Mips_HIGHER;
+      break;
+    case MipsMCExpr::MEK_HI:
+      // Check for %hi(%neg(%gp_rel(X)))
+      if (MipsExpr->isGpOff()) {
+        FixupKind = Mips::fixup_Mips_GPOFF_HI;
+        break;
+      }
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
+                                   : Mips::fixup_Mips_HI16;
+      break;
+    case MipsMCExpr::MEK_PCREL_HI16:
+      FixupKind = Mips::fixup_MIPS_PCHI16;
+      break;
+    case MipsMCExpr::MEK_PCREL_LO16:
+      FixupKind = Mips::fixup_MIPS_PCLO16;
+      break;
+    case MipsMCExpr::MEK_TLSGD:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
+                                   : Mips::fixup_Mips_TLSGD;
+      break;
+    case MipsMCExpr::MEK_TLSLDM:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
+                                   : Mips::fixup_Mips_TLSLDM;
+      break;
+    case MipsMCExpr::MEK_TPREL_HI:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+                                   : Mips::fixup_Mips_TPREL_HI;
+      break;
+    case MipsMCExpr::MEK_TPREL_LO:
+      FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+                                   : Mips::fixup_Mips_TPREL_LO;
+      break;
+    case MipsMCExpr::MEK_NEG:
+      FixupKind =
+          isMicroMips(STI) ? Mips::fixup_MICROMIPS_SUB : Mips::fixup_Mips_SUB;
+      break;
+    }
+    Fixups.push_back(MCFixup::create(0, MipsExpr, MCFixupKind(FixupKind)));
+    return 0;
+  }
+
+  if (Kind == MCExpr::SymbolRef) {
+    Mips::Fixups FixupKind = Mips::Fixups(0);
+
+    switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+    default: llvm_unreachable("Unknown fixup kind!");
+      break;
+    case MCSymbolRefExpr::VK_None:
+      FixupKind = Mips::fixup_Mips_32; // FIXME: This is ok for O32/N32 but not N64.
+      break;
+    } // switch
+
+    Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+    return 0;
+  }
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
+    return RegNo;
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+        .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+  // MO must be an Expr.
+  assert(MO.isExpr());
+  return getExprOpValue(MO.getExpr(),Fixups, STI);
+}
+
+/// Return binary encoding of memory related operand.
+/// If the offset operand requires relocation, record the relocation.
+template <unsigned ShiftAmount>
+unsigned MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups, STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  // Apply the scale factor if there is one.
+  OffBits >>= ShiftAmount;
+
+  return (OffBits & 0xFFFF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI);
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 1;
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 6-4, offset is encoded in bits 3-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),
+                                       Fixups, STI) << 4;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return (OffBits & 0xF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  // Register is encoded in bits 9-5, offset is encoded in bits 4-0.
+  assert(MI.getOperand(OpNo).isReg() &&
+         (MI.getOperand(OpNo).getReg() == Mips::SP ||
+         MI.getOperand(OpNo).getReg() == Mips::SP_64) &&
+         "Unexpected base register!");
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return OffBits & 0x1F;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  // Register is encoded in bits 9-7, offset is encoded in bits 6-0.
+  assert(MI.getOperand(OpNo).isReg() &&
+         MI.getOperand(OpNo).getReg() == Mips::GP &&
+         "Unexpected base register!");
+
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1),
+                                       Fixups, STI) >> 2;
+
+  return OffBits & 0x7F;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 8-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+                                       STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI);
+
+  return (OffBits & 0x1FF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 10-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+                                       STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return (OffBits & 0x07FF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  // opNum can be invalid if instruction had reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    OpNo = MI.getNumOperands() - 2;
+    break;
+  }
+
+  // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return (OffBits & 0x0FFF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups,
+                      const MCSubtargetInfo &STI) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups,
+                                       STI) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return (OffBits & 0xFFFF) | RegBits;
+}
+
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  // opNum can be invalid if instruction had reglist as operand
+  // MemOperand is always last operand of instruction (base + offset)
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Mips::SWM16_MM:
+  case Mips::SWM16_MMR6:
+  case Mips::LWM16_MM:
+  case Mips::LWM16_MMR6:
+    OpNo = MI.getNumOperands() - 2;
+    break;
+  }
+
+  // Offset is encoded in bits 4-0.
+  assert(MI.getOperand(OpNo).isReg());
+  // Base register is always SP - thus it is not encoded.
+  assert(MI.getOperand(OpNo+1).isImm());
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI);
+
+  return ((OffBits >> 2) & 0x0F);
+}
+
+// FIXME: should be called getMSBEncoding
+//
+unsigned
+MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo-1).isImm());
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups, STI);
+  unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+
+  return Position + Size - 1;
+}
+
+template <unsigned Bits, int Offset>
+unsigned
+MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned Value = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+  Value -= Offset;
+  return Value;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 2'.
+    unsigned Res = getMachineOpValue(MI, MO, Fixups, STI);
+    assert((Res & 3) == 0);
+    return Res >> 2;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm19Lsl2Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Mips::Fixups FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_PC19_S2
+                                            : Mips::fixup_MIPS_PC19_S2;
+  Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+  return 0;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm()) {
+    // The immediate is encoded as 'immediate << 3'.
+    unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+    assert((Res & 7) == 0);
+    return Res >> 3;
+  }
+
+  assert(MO.isExpr() &&
+         "getSimm18Lsl2Encoding expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Mips::Fixups FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_PC18_S3
+                                            : Mips::fixup_MIPS_PC18_S3;
+  Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
+  return 0;
+}
+
+unsigned
+MipsMCCodeEmitter::getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  const MCOperand &MO = MI.getOperand(OpNo);
+  return MO.getImm() % 8;
+}
+
+unsigned
+MipsMCCodeEmitter::getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  const MCOperand &MO = MI.getOperand(OpNo);
+  unsigned Value = MO.getImm();
+  switch (Value) {
+    case 128:   return 0x0;
+    case 1:     return 0x1;
+    case 2:     return 0x2;
+    case 3:     return 0x3;
+    case 4:     return 0x4;
+    case 7:     return 0x5;
+    case 8:     return 0x6;
+    case 15:    return 0x7;
+    case 16:    return 0x8;
+    case 31:    return 0x9;
+    case 32:    return 0xa;
+    case 63:    return 0xb;
+    case 64:    return 0xc;
+    case 255:   return 0xd;
+    case 32768: return 0xe;
+    case 65535: return 0xf;
+  }
+  llvm_unreachable("Unexpected value");
+}
+
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned res = 0;
+
+  // Register list operand is always first operand of instruction and it is
+  // placed before memory operand (register + imm).
+
+  for (unsigned I = OpNo, E = MI.getNumOperands() - 2; I < E; ++I) {
+    unsigned Reg = MI.getOperand(I).getReg();
+    unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
+    if (RegNo != 31)
+      res++;
+    else
+      res |= 0x10;
+  }
+  return res;
+}
+
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  return (MI.getNumOperands() - 4);
+}
+
+unsigned
+MipsMCCodeEmitter::getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+}
+
+unsigned
+MipsMCCodeEmitter::getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned res = 0;
+
+  if (MI.getOperand(0).getReg() == Mips::A1 &&
+      MI.getOperand(1).getReg() == Mips::A2)
+    res = 0;
+  else if (MI.getOperand(0).getReg() == Mips::A1 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 1;
+  else if (MI.getOperand(0).getReg() == Mips::A2 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 2;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::S5)
+    res = 3;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::S6)
+    res = 4;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A1)
+    res = 5;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A2)
+    res = 6;
+  else if (MI.getOperand(0).getReg() == Mips::A0 &&
+           MI.getOperand(1).getReg() == Mips::A3)
+    res = 7;
+
+  return res;
+}
+
+unsigned
+MipsMCCodeEmitter::getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm() && "getSimm23Lsl2Encoding expects only an immediate");
+  // The immediate is encoded as 'immediate >> 2'.
+  unsigned Res = static_cast<unsigned>(MO.getImm());
+  assert((Res & 3) == 0);
+  return Res >> 2;
+}
+
+#include "MipsGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
new file mode 100644
index 000000000000..2d041dcbf040
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -0,0 +1,278 @@
+//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/DataTypes.h"
+
+using namespace llvm;
+
+namespace llvm {
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCInstrInfo;
+class MCFixup;
+class MCOperand;
+class MCSubtargetInfo;
+class raw_ostream;
+
+class MipsMCCodeEmitter : public MCCodeEmitter {
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+  void operator=(const MipsMCCodeEmitter &) = delete;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+  bool IsLittleEndian;
+
+  bool isMicroMips(const MCSubtargetInfo &STI) const;
+  bool isMips32r6(const MCSubtargetInfo &STI) const;
+
+public:
+  MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
+      : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
+
+  ~MipsMCCodeEmitter() override {}
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const;
+
+  void EmitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+                       raw_ostream &OS) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // getJumpTargetOpValue - Return binary encoding of the jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getUImm5Lsl2Encoding - Return binary encoding of the microMIPS jump
+  // target operand.
+  unsigned getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  unsigned getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // getSImm9AddiuspValue - Return binary encoding of the microMIPS addiusp
+  // instruction immediate operand.
+  unsigned getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValue - Return binary encoding of the branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValue1SImm16 - Return binary encoding of the branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValueMMR6 - Return binary encoding of the branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValueLsl2MMR6 - Return binary encoding of the branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget7OpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValueMMPC10 - Return binary encoding of the microMIPS
+  // 10-bit branch target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const;
+
+  // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget21OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget21OpValueMM - Return binary encoding of the branch
+  // offset operand for microMIPS. If the machine operand requires
+  // relocation,record the relocation and return zero.
+  unsigned getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget26OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget26OpValueMM - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+
+  // getJumpOffset16OpValue - Return binary encoding of the jump
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  // getMachineOpValue - Return binary encoding of operand. If the machin
+  // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getMSAMemEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  template <unsigned ShiftAmount = 0>
+  unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4Lsl1(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4Lsl2(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm11(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// Subtract Offset then encode as a N-bit unsigned integer.
+  template <unsigned Bits, int Offset>
+  unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm18Lsl3Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterPairOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  unsigned getMovePRegPairOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  unsigned getSimm23Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
+                          const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
+  unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  private:
+  void LowerCompactBranch(MCInst& Inst) const;
+}; // class MipsMCCodeEmitter
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
new file mode 100644
index 000000000000..082bb87fcb8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -0,0 +1,287 @@
+//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mipsmcexpr"
+
+const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::MipsExprKind Kind,
+                                     const MCExpr *Expr, MCContext &Ctx) {
+  return new (Ctx) MipsMCExpr(Kind, Expr);
+}
+
+const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::MipsExprKind Kind,
+                                          const MCExpr *Expr, MCContext &Ctx) {
+  return create(Kind, create(MEK_NEG, create(MEK_GPREL, Expr, Ctx), Ctx), Ctx);
+}
+
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  int64_t AbsVal;
+
+  switch (Kind) {
+  case MEK_None:
+  case MEK_Special:
+    llvm_unreachable("MEK_None and MEK_Special are invalid");
+    break;
+  case MEK_CALL_HI16:
+    OS << "%call_hi";
+    break;
+  case MEK_CALL_LO16:
+    OS << "%call_lo";
+    break;
+  case MEK_DTPREL_HI:
+    OS << "%dtprel_hi";
+    break;
+  case MEK_DTPREL_LO:
+    OS << "%dtprel_lo";
+    break;
+  case MEK_GOT:
+    OS << "%got";
+    break;
+  case MEK_GOTTPREL:
+    OS << "%gottprel";
+    break;
+  case MEK_GOT_CALL:
+    OS << "%call16";
+    break;
+  case MEK_GOT_DISP:
+    OS << "%got_disp";
+    break;
+  case MEK_GOT_HI16:
+    OS << "%got_hi";
+    break;
+  case MEK_GOT_LO16:
+    OS << "%got_lo";
+    break;
+  case MEK_GOT_PAGE:
+    OS << "%got_page";
+    break;
+  case MEK_GOT_OFST:
+    OS << "%got_ofst";
+    break;
+  case MEK_GPREL:
+    OS << "%gp_rel";
+    break;
+  case MEK_HI:
+    OS << "%hi";
+    break;
+  case MEK_HIGHER:
+    OS << "%higher";
+    break;
+  case MEK_HIGHEST:
+    OS << "%highest";
+    break;
+  case MEK_LO:
+    OS << "%lo";
+    break;
+  case MEK_NEG:
+    OS << "%neg";
+    break;
+  case MEK_PCREL_HI16:
+    OS << "%pcrel_hi";
+    break;
+  case MEK_PCREL_LO16:
+    OS << "%pcrel_lo";
+    break;
+  case MEK_TLSGD:
+    OS << "%tlsgd";
+    break;
+  case MEK_TLSLDM:
+    OS << "%tlsldm";
+    break;
+  case MEK_TPREL_HI:
+    OS << "%tprel_hi";
+    break;
+  case MEK_TPREL_LO:
+    OS << "%tprel_lo";
+    break;
+  }
+
+  OS << '(';
+  if (Expr->evaluateAsAbsolute(AbsVal))
+    OS << AbsVal;
+  else
+    Expr->print(OS, MAI, true);
+  OS << ')';
+}
+
+bool
+MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                      const MCAsmLayout *Layout,
+                                      const MCFixup *Fixup) const {
+  // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X))) special cases.
+  if (isGpOff()) {
+    const MCExpr *SubExpr =
+        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+            ->getSubExpr();
+    if (!SubExpr->evaluateAsRelocatable(Res, Layout, Fixup))
+      return false;
+
+    Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(),
+                       MEK_Special);
+    return true;
+  }
+
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
+    return false;
+
+  if (Res.getRefKind() != MCSymbolRefExpr::VK_None)
+    return false;
+
+  // evaluateAsAbsolute() and evaluateAsValue() require that we evaluate the
+  // %hi/%lo/etc. here. Fixup is a null pointer when either of these is the
+  // caller.
+  if (Res.isAbsolute() && Fixup == nullptr) {
+    int64_t AbsVal = Res.getConstant();
+    switch (Kind) {
+    case MEK_None:
+    case MEK_Special:
+      llvm_unreachable("MEK_None and MEK_Special are invalid");
+    case MEK_DTPREL_HI:
+    case MEK_DTPREL_LO:
+    case MEK_GOT:
+    case MEK_GOTTPREL:
+    case MEK_GOT_CALL:
+    case MEK_GOT_DISP:
+    case MEK_GOT_HI16:
+    case MEK_GOT_LO16:
+    case MEK_GOT_OFST:
+    case MEK_GOT_PAGE:
+    case MEK_GPREL:
+    case MEK_PCREL_HI16:
+    case MEK_PCREL_LO16:
+    case MEK_TLSGD:
+    case MEK_TLSLDM:
+    case MEK_TPREL_HI:
+    case MEK_TPREL_LO:
+      return false;
+    case MEK_LO:
+    case MEK_CALL_LO16:
+      AbsVal = SignExtend64<16>(AbsVal);
+      break;
+    case MEK_CALL_HI16:
+    case MEK_HI:
+      AbsVal = SignExtend64<16>((AbsVal + 0x8000) >> 16);
+      break;
+    case MEK_HIGHER:
+      AbsVal = SignExtend64<16>((AbsVal + 0x80008000LL) >> 32);
+      break;
+    case MEK_HIGHEST:
+      AbsVal = SignExtend64<16>((AbsVal + 0x800080008000LL) >> 48);
+      break;
+    case MEK_NEG:
+      AbsVal = -AbsVal;
+      break;
+    }
+    Res = MCValue::get(AbsVal);
+    return true;
+  }
+
+  // We want to defer it for relocatable expressions since the constant is
+  // applied to the whole symbol value.
+  //
+  // The value of getKind() that is given to MCValue is only intended to aid
+  // debugging when inspecting MCValue objects. It shouldn't be relied upon
+  // for decision making.
+  Res = MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
+}
+
+void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    fixELFSymbolsInTLSFixupsImpl(cast<MipsMCExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  case MCExpr::Constant:
+    break;
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+    break;
+  }
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void MipsMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getKind()) {
+  case MEK_None:
+  case MEK_Special:
+    llvm_unreachable("MEK_None and MEK_Special are invalid");
+    break;
+  case MEK_CALL_HI16:
+  case MEK_CALL_LO16:
+  case MEK_DTPREL_HI:
+  case MEK_DTPREL_LO:
+  case MEK_GOT:
+  case MEK_GOT_CALL:
+  case MEK_GOT_DISP:
+  case MEK_GOT_HI16:
+  case MEK_GOT_LO16:
+  case MEK_GOT_OFST:
+  case MEK_GOT_PAGE:
+  case MEK_GPREL:
+  case MEK_HI:
+  case MEK_HIGHER:
+  case MEK_HIGHEST:
+  case MEK_LO:
+  case MEK_NEG:
+  case MEK_PCREL_HI16:
+  case MEK_PCREL_LO16:
+  case MEK_TLSLDM:
+    // If we do have nested target-specific expressions, they will be in
+    // a consecutive chain.
+    if (const MipsMCExpr *E = dyn_cast<const MipsMCExpr>(getSubExpr()))
+      E->fixELFSymbolsInTLSFixups(Asm);
+    break;
+  case MEK_GOTTPREL:
+  case MEK_TLSGD:
+  case MEK_TPREL_HI:
+  case MEK_TPREL_LO:
+    fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+    break;
+  }
+}
+
+bool MipsMCExpr::isGpOff(MipsExprKind &Kind) const {
+  if (getKind() == MEK_HI || getKind() == MEK_LO) {
+    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+        if (S1->getKind() == MEK_NEG && S2->getKind() == MEK_GPREL) {
+          Kind = getKind();
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
new file mode 100644
index 000000000000..d1a4334ec640
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -0,0 +1,91 @@
+//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class MipsMCExpr : public MCTargetExpr {
+public:
+  enum MipsExprKind {
+    MEK_None,
+    MEK_CALL_HI16,
+    MEK_CALL_LO16,
+    MEK_DTPREL_HI,
+    MEK_DTPREL_LO,
+    MEK_GOT,
+    MEK_GOTTPREL,
+    MEK_GOT_CALL,
+    MEK_GOT_DISP,
+    MEK_GOT_HI16,
+    MEK_GOT_LO16,
+    MEK_GOT_OFST,
+    MEK_GOT_PAGE,
+    MEK_GPREL,
+    MEK_HI,
+    MEK_HIGHER,
+    MEK_HIGHEST,
+    MEK_LO,
+    MEK_NEG,
+    MEK_PCREL_HI16,
+    MEK_PCREL_LO16,
+    MEK_TLSGD,
+    MEK_TLSLDM,
+    MEK_TPREL_HI,
+    MEK_TPREL_LO,
+    MEK_Special,
+  };
+
+private:
+  const MipsExprKind Kind;
+  const MCExpr *Expr;
+
+  explicit MipsMCExpr(MipsExprKind Kind, const MCExpr *Expr)
+      : Kind(Kind), Expr(Expr) {}
+
+public:
+  static const MipsMCExpr *create(MipsExprKind Kind, const MCExpr *Expr,
+                                  MCContext &Ctx);
+  static const MipsMCExpr *createGpOff(MipsExprKind Kind, const MCExpr *Expr,
+                                       MCContext &Ctx);
+
+  /// Get the kind of this expression.
+  MipsExprKind getKind() const { return Kind; }
+
+  /// Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  bool isGpOff(MipsExprKind &Kind) const;
+  bool isGpOff() const {
+    MipsExprKind Kind;
+    return isGpOff(Kind);
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
new file mode 100644
index 000000000000..687b800c2409
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -0,0 +1,30 @@
+//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+// Log2 of the NaCl MIPS sandbox's instruction bundle size.
+static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u;
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+                                  bool *IsStore = nullptr);
+bool baseRegNeedsLoadStoreMask(unsigned Reg);
+
+// This function creates an MCELFStreamer for Mips NaCl.
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                         raw_pwrite_stream &OS,
+                                         MCCodeEmitter *Emitter, bool RelaxAll);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
new file mode 100644
index 000000000000..56fe18572118
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -0,0 +1,201 @@
+//===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCTargetDesc.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCAsmInfo.h"
+#include "MipsMCNaCl.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MipsGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MipsGenRegisterInfo.inc"
+
+/// Select the Mips CPU for the given triple and cpu name.
+/// FIXME: Merge with the copy in MipsSubtarget.cpp
+StringRef MIPS_MC::selectMipsCPU(const Triple &TT, StringRef CPU) {
+  if (CPU.empty() || CPU == "generic") {
+    if (TT.getArch() == Triple::mips || TT.getArch() == Triple::mipsel)
+      CPU = "mips32";
+    else
+      CPU = "mips64";
+  }
+  return CPU;
+}
+
+static MCInstrInfo *createMipsMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMipsMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMipsMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMipsMCRegisterInfo(X, Mips::RA);
+  return X;
+}
+
+static MCSubtargetInfo *createMipsMCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  CPU = MIPS_MC::selectMipsCPU(TT, CPU);
+  return createMipsMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
+                                      const Triple &TT) {
+  MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
+
+  unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Triple &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI) {
+  return new MipsInstPrinter(MAI, MII, MRI);
+}
+
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter, bool RelaxAll) {
+  MCStreamer *S;
+  if (!T.isOSNaCl())
+    S = createMipsELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+  else
+    S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+  return S;
+}
+
+static MCTargetStreamer *createMipsAsmTargetStreamer(MCStreamer &S,
+                                                     formatted_raw_ostream &OS,
+                                                     MCInstPrinter *InstPrint,
+                                                     bool isVerboseAsm) {
+  return new MipsTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer *createMipsNullTargetStreamer(MCStreamer &S) {
+  return new MipsTargetStreamer(S);
+}
+
+static MCTargetStreamer *
+createMipsObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  return new MipsTargetELFStreamer(S, STI);
+}
+
+namespace {
+
+class MipsMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  MipsMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+                      uint64_t &Target) const override {
+    unsigned NumOps = Inst.getNumOperands();
+    if (NumOps == 0)
+      return false;
+    switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) {
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_IMMEDIATE:
+      // jal, bal ...
+      Target = Inst.getOperand(NumOps - 1).getImm();
+      return true;
+    case MCOI::OPERAND_PCREL:
+      // b, j, beq ...
+      Target = Addr + Inst.getOperand(NumOps - 1).getImm();
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+}
+
+static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new MipsMCInstrAnalysis(Info);
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() {
+  for (Target *T : {&getTheMipsTarget(), &getTheMipselTarget(),
+                    &getTheMips64Target(), &getTheMips64elTarget()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn X(*T, createMipsMCAsmInfo);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createMipsMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createMipsMCRegisterInfo);
+
+    // Register the elf streamer.
+    TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
+
+    // Register the asm target streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createMipsAsmTargetStreamer);
+
+    TargetRegistry::RegisterNullTargetStreamer(*T,
+                                               createMipsNullTargetStreamer);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createMipsMCSubtargetInfo);
+
+    // Register the MC instruction analyzer.
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createMipsMCInstrAnalysis);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createMipsMCInstPrinter);
+
+    TargetRegistry::RegisterObjectTargetStreamer(
+        *T, createMipsObjectTargetStreamer);
+  }
+
+  // Register the MC Code Emitter
+  for (Target *T : {&getTheMipsTarget(), &getTheMips64Target()})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEB);
+
+  for (Target *T : {&getTheMipselTarget(), &getTheMips64elTarget()})
+    TargetRegistry::RegisterMCCodeEmitter(*T, createMipsMCCodeEmitterEL);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(getTheMipsTarget(),
+                                       createMipsAsmBackendEB32);
+  TargetRegistry::RegisterMCAsmBackend(getTheMipselTarget(),
+                                       createMipsAsmBackendEL32);
+  TargetRegistry::RegisterMCAsmBackend(getTheMips64Target(),
+                                       createMipsAsmBackendEB64);
+  TargetRegistry::RegisterMCAsmBackend(getTheMips64elTarget(),
+                                       createMipsAsmBackendEL64);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
new file mode 100644
index 000000000000..b28681f42ebe
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -0,0 +1,84 @@
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheMipsTarget();
+Target &getTheMipselTarget();
+Target &getTheMips64Target();
+Target &getTheMips64elTarget();
+
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx);
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T,
+                                       const MCRegisterInfo &MRI,
+                                       const Triple &TT, StringRef CPU,
+                                       const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T,
+                                       const MCRegisterInfo &MRI,
+                                       const Triple &TT, StringRef CPU,
+                                       const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T,
+                                       const MCRegisterInfo &MRI,
+                                       const Triple &TT, StringRef CPU,
+                                       const MCTargetOptions &Options);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T,
+                                       const MCRegisterInfo &MRI,
+                                       const Triple &TT, StringRef CPU,
+                                       const MCTargetOptions &Options);
+
+MCObjectWriter *createMipsELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+                                          bool IsLittleEndian, bool Is64Bit);
+
+namespace MIPS_MC {
+StringRef selectMipsCPU(const Triple &TT, StringRef CPU);
+}
+
+} // End llvm namespace
+
+// Defines symbolic names for Mips registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "MipsGenRegisterInfo.inc"
+
+// Defines symbolic names for the Mips instructions.
+#define GET_INSTRINFO_ENUM
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MipsGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
new file mode 100644
index 000000000000..aef9bd3a8e2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -0,0 +1,268 @@
+//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCELFStreamer for Mips NaCl.  It emits .o object files
+// as required by NaCl's SFI sandbox.  It inserts address-masking instructions
+// before dangerous control-flow and memory access instructions.  It inserts
+// address-masking instructions after instructions that change the stack
+// pointer.  It ensures that the mask and the dangerous instruction are always
+// emitted in the same bundle.  It aligns call + branch delay to the bundle end,
+// so that return address is always aligned to the start of next bundle.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCNaCl.h"
+#include "llvm/MC/MCELFStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-mc-nacl"
+
+namespace {
+
+const unsigned IndirectBranchMaskReg = Mips::T6;
+const unsigned LoadStoreStackMaskReg = Mips::T7;
+
+/// Extend the generic MCELFStreamer class so that it can mask dangerous
+/// instructions.
+
+class MipsNaClELFStreamer : public MipsELFStreamer {
+public:
+  MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                      raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
+      : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {}
+
+  ~MipsNaClELFStreamer() override {}
+
+private:
+  // Whether we started the sandboxing sequence for calls.  Calls are bundled
+  // with branch delays and aligned to the bundle end.
+  bool PendingCall;
+
+  bool isIndirectJump(const MCInst &MI) {
+    if (MI.getOpcode() == Mips::JALR) {
+      // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead.
+      // JALR is an indirect branch if the link register is $0.
+      assert(MI.getOperand(0).isReg());
+      return MI.getOperand(0).getReg() == Mips::ZERO;
+    }
+    return MI.getOpcode() == Mips::JR;
+  }
+
+  bool isStackPointerFirstOperand(const MCInst &MI) {
+    return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg()
+            && MI.getOperand(0).getReg() == Mips::SP);
+  }
+
+  bool isCall(const MCInst &MI, bool *IsIndirectCall) {
+    unsigned Opcode = MI.getOpcode();
+
+    *IsIndirectCall = false;
+
+    switch (Opcode) {
+    default:
+      return false;
+
+    case Mips::JAL:
+    case Mips::BAL:
+    case Mips::BAL_BR:
+    case Mips::BLTZAL:
+    case Mips::BGEZAL:
+      return true;
+
+    case Mips::JALR:
+      // JALR is only a call if the link register is not $0. Otherwise it's an
+      // indirect branch.
+      assert(MI.getOperand(0).isReg());
+      if (MI.getOperand(0).getReg() == Mips::ZERO)
+        return false;
+
+      *IsIndirectCall = true;
+      return true;
+    }
+  }
+
+  void emitMask(unsigned AddrReg, unsigned MaskReg,
+                const MCSubtargetInfo &STI) {
+    MCInst MaskInst;
+    MaskInst.setOpcode(Mips::AND);
+    MaskInst.addOperand(MCOperand::createReg(AddrReg));
+    MaskInst.addOperand(MCOperand::createReg(AddrReg));
+    MaskInst.addOperand(MCOperand::createReg(MaskReg));
+    MipsELFStreamer::EmitInstruction(MaskInst, STI);
+  }
+
+  // Sandbox indirect branch or return instruction by inserting mask operation
+  // before it.
+  void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
+    unsigned AddrReg = MI.getOperand(0).getReg();
+
+    EmitBundleLock(false);
+    emitMask(AddrReg, IndirectBranchMaskReg, STI);
+    MipsELFStreamer::EmitInstruction(MI, STI);
+    EmitBundleUnlock();
+  }
+
+  // Sandbox memory access or SP change.  Insert mask operation before and/or
+  // after the instruction.
+  void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
+                                   const MCSubtargetInfo &STI, bool MaskBefore,
+                                   bool MaskAfter) {
+    EmitBundleLock(false);
+    if (MaskBefore) {
+      // Sandbox memory access.
+      unsigned BaseReg = MI.getOperand(AddrIdx).getReg();
+      emitMask(BaseReg, LoadStoreStackMaskReg, STI);
+    }
+    MipsELFStreamer::EmitInstruction(MI, STI);
+    if (MaskAfter) {
+      // Sandbox SP change.
+      unsigned SPReg = MI.getOperand(0).getReg();
+      assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
+      emitMask(SPReg, LoadStoreStackMaskReg, STI);
+    }
+    EmitBundleUnlock();
+  }
+
+public:
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer.  We override it to mask dangerous instructions.
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
+    // Sandbox indirect jumps.
+    if (isIndirectJump(Inst)) {
+      if (PendingCall)
+        report_fatal_error("Dangerous instruction in branch delay slot!");
+      sandboxIndirectJump(Inst, STI);
+      return;
+    }
+
+    // Sandbox loads, stores and SP changes.
+    unsigned AddrIdx;
+    bool IsStore;
+    bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx,
+                                                    &IsStore);
+    bool IsSPFirstOperand = isStackPointerFirstOperand(Inst);
+    if (IsMemAccess || IsSPFirstOperand) {
+      bool MaskBefore = (IsMemAccess
+                         && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx)
+                                                          .getReg()));
+      bool MaskAfter = IsSPFirstOperand && !IsStore;
+      if (MaskBefore || MaskAfter) {
+        if (PendingCall)
+          report_fatal_error("Dangerous instruction in branch delay slot!");
+        sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter);
+        return;
+      }
+      // fallthrough
+    }
+
+    // Sandbox calls by aligning call and branch delay to the bundle end.
+    // For indirect calls, emit the mask before the call.
+    bool IsIndirectCall;
+    if (isCall(Inst, &IsIndirectCall)) {
+      if (PendingCall)
+        report_fatal_error("Dangerous instruction in branch delay slot!");
+
+      // Start the sandboxing sequence by emitting call.
+      EmitBundleLock(true);
+      if (IsIndirectCall) {
+        unsigned TargetReg = Inst.getOperand(1).getReg();
+        emitMask(TargetReg, IndirectBranchMaskReg, STI);
+      }
+      MipsELFStreamer::EmitInstruction(Inst, STI);
+      PendingCall = true;
+      return;
+    }
+    if (PendingCall) {
+      // Finish the sandboxing sequence by emitting branch delay.
+      MipsELFStreamer::EmitInstruction(Inst, STI);
+      EmitBundleUnlock();
+      PendingCall = false;
+      return;
+    }
+
+    // None of the sandboxing applies, just emit the instruction.
+    MipsELFStreamer::EmitInstruction(Inst, STI);
+  }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
+                                  bool *IsStore) {
+  if (IsStore)
+    *IsStore = false;
+
+  switch (Opcode) {
+  default:
+    return false;
+
+  // Load instructions with base address register in position 1.
+  case Mips::LB:
+  case Mips::LBu:
+  case Mips::LH:
+  case Mips::LHu:
+  case Mips::LW:
+  case Mips::LWC1:
+  case Mips::LDC1:
+  case Mips::LL:
+  case Mips::LL_R6:
+  case Mips::LWL:
+  case Mips::LWR:
+    *AddrIdx = 1;
+    return true;
+
+  // Store instructions with base address register in position 1.
+  case Mips::SB:
+  case Mips::SH:
+  case Mips::SW:
+  case Mips::SWC1:
+  case Mips::SDC1:
+  case Mips::SWL:
+  case Mips::SWR:
+    *AddrIdx = 1;
+    if (IsStore)
+      *IsStore = true;
+    return true;
+
+  // Store instructions with base address register in position 2.
+  case Mips::SC:
+  case Mips::SC_R6:
+    *AddrIdx = 2;
+    if (IsStore)
+      *IsStore = true;
+    return true;
+  }
+}
+
+bool baseRegNeedsLoadStoreMask(unsigned Reg) {
+  // The contents of SP and thread pointer register do not require masking.
+  return Reg != Mips::SP && Reg != Mips::T8;
+}
+
+MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                         raw_pwrite_stream &OS,
+                                         MCCodeEmitter *Emitter,
+                                         bool RelaxAll) {
+  MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+
+  // Set bundle-alignment as required by the NaCl ABI for the target.
+  S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);
+
+  return S;
+}
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
new file mode 100644
index 000000000000..24b602810d6e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -0,0 +1,95 @@
+//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsOptionRecord.h"
+#include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void MipsRegInfoRecord::EmitMipsOptionRecord() {
+  MCAssembler &MCA = Streamer->getAssembler();
+  MipsTargetStreamer *MTS =
+      static_cast<MipsTargetStreamer *>(Streamer->getTargetStreamer());
+
+  Streamer->PushSection();
+
+  // We need to distinguish between N64 and the rest because at the moment
+  // we don't emit .Mips.options for other ELFs other than N64.
+  // Since .reginfo has the same information as .Mips.options (ODK_REGINFO),
+  // we can use the same abstraction (MipsRegInfoRecord class) to handle both.
+  if (MTS->getABI().IsN64()) {
+    // The EntrySize value of 1 seems strange since the records are neither
+    // 1-byte long nor fixed length but it matches the value GAS emits.
+    MCSectionELF *Sec =
+        Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
+                              ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, 1, "");
+    MCA.registerSection(*Sec);
+    Sec->setAlignment(8);
+    Streamer->SwitchSection(Sec);
+
+    Streamer->EmitIntValue(ELF::ODK_REGINFO, 1);  // kind
+    Streamer->EmitIntValue(40, 1); // size
+    Streamer->EmitIntValue(0, 2);  // section
+    Streamer->EmitIntValue(0, 4);  // info
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(0, 4); // pad
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    Streamer->EmitIntValue(ri_gp_value, 8);
+  } else {
+    MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
+                                              ELF::SHF_ALLOC, 24, "");
+    MCA.registerSection(*Sec);
+    Sec->setAlignment(MTS->getABI().IsN32() ? 8 : 4);
+    Streamer->SwitchSection(Sec);
+
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    assert((ri_gp_value & 0xffffffff) == ri_gp_value);
+    Streamer->EmitIntValue(ri_gp_value, 4);
+  }
+
+  Streamer->PopSection();
+}
+
+void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg,
+                                       const MCRegisterInfo *MCRegInfo) {
+  unsigned Value = 0;
+
+  for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid();
+       ++SubRegIt) {
+    unsigned CurrentSubReg = *SubRegIt;
+
+    unsigned EncVal = MCRegInfo->getEncodingValue(CurrentSubReg);
+    Value |= 1 << EncVal;
+
+    if (GPR32RegClass->contains(CurrentSubReg) ||
+        GPR64RegClass->contains(CurrentSubReg))
+      ri_gprmask |= Value;
+    else if (COP0RegClass->contains(CurrentSubReg))
+      ri_cprmask[0] |= Value;
+    // MIPS COP1 is the FPU.
+    else if (FGR32RegClass->contains(CurrentSubReg) ||
+             FGR64RegClass->contains(CurrentSubReg) ||
+             AFGR64RegClass->contains(CurrentSubReg) ||
+             MSA128BRegClass->contains(CurrentSubReg))
+      ri_cprmask[1] |= Value;
+    else if (COP2RegClass->contains(CurrentSubReg))
+      ri_cprmask[2] |= Value;
+    else if (COP3RegClass->contains(CurrentSubReg))
+      ri_cprmask[3] |= Value;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 000000000000..7f79eb400f59
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,1166 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetStreamer.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
+#include "MipsMCExpr.h"
+#include "MipsMCTargetDesc.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+namespace {
+static cl::opt<bool> RoundSectionSizes(
+    "mips-round-section-sizes", cl::init(false),
+    cl::desc("Round section sizes up to the section alignment"), cl::Hidden);
+} // end anonymous namespace
+
+MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S), ModuleDirectiveAllowed(true) {
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+}
+void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
+void MipsTargetStreamer::setUsesMicroMips() {}
+void MipsTargetStreamer::emitDirectiveSetMips16() {}
+void MipsTargetStreamer::emitDirectiveSetNoMips16() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetReorder() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoReorder() {}
+void MipsTargetStreamer::emitDirectiveSetMacro() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetNoAt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
+void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
+void MipsTargetStreamer::emitDirectiveAbiCalls() {}
+void MipsTargetStreamer::emitDirectiveNaN2008() {}
+void MipsTargetStreamer::emitDirectiveNaNLegacy() {}
+void MipsTargetStreamer::emitDirectiveOptionPic0() {}
+void MipsTargetStreamer::emitDirectiveOptionPic2() {}
+void MipsTargetStreamer::emitDirectiveInsn() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                   unsigned ReturnReg) {}
+void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
+void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
+}
+void MipsTargetStreamer::emitDirectiveSetArch(StringRef Arch) {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetMips0() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips1() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips4() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips5() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R5() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R6() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R5() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetPop() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetPush() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetSoftFloat() {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetHardFloat() {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
+bool MipsTargetStreamer::emitDirectiveCpRestore(
+    int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  forbidModuleDirective();
+  return true;
+}
+void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                                              const MCSymbol &Sym, bool IsReg) {
+}
+void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+                                               bool SaveLocationIsRegister) {}
+
+void MipsTargetStreamer::emitDirectiveModuleFP() {}
+
+void MipsTargetStreamer::emitDirectiveModuleOddSPReg() {
+  if (!ABIFlagsSection.OddSPReg && !ABIFlagsSection.Is32BitABI)
+    report_fatal_error("+nooddspreg is only valid for O32");
+}
+void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {}
+void MipsTargetStreamer::emitDirectiveModuleHardFloat() {}
+void MipsTargetStreamer::emitDirectiveSetFp(
+    MipsABIFlagsSection::FpABIKind Value) {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetOddSPReg() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoOddSPReg() {
+  forbidModuleDirective();
+}
+
+void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+                               const MCSubtargetInfo *STI) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  TmpInst.addOperand(MCOperand::createReg(Reg0));
+  TmpInst.setLoc(IDLoc);
+  getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
+                                SMLoc IDLoc, const MCSubtargetInfo *STI) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  TmpInst.addOperand(MCOperand::createReg(Reg0));
+  TmpInst.addOperand(Op1);
+  TmpInst.setLoc(IDLoc);
+  getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm,
+                                SMLoc IDLoc, const MCSubtargetInfo *STI) {
+  emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                SMLoc IDLoc, const MCSubtargetInfo *STI) {
+  emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2,
+                                SMLoc IDLoc, const MCSubtargetInfo *STI) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  TmpInst.addOperand(MCOperand::createImm(Imm1));
+  TmpInst.addOperand(MCOperand::createImm(Imm2));
+  TmpInst.setLoc(IDLoc);
+  getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                 MCOperand Op2, SMLoc IDLoc,
+                                 const MCSubtargetInfo *STI) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  TmpInst.addOperand(MCOperand::createReg(Reg0));
+  TmpInst.addOperand(MCOperand::createReg(Reg1));
+  TmpInst.addOperand(Op2);
+  TmpInst.setLoc(IDLoc);
+  getStreamer().EmitInstruction(TmpInst, *STI);
+}
+
+void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                 unsigned Reg2, SMLoc IDLoc,
+                                 const MCSubtargetInfo *STI) {
+  emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
+                                 int16_t Imm, SMLoc IDLoc,
+                                 const MCSubtargetInfo *STI) {
+  emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg,
+                                  unsigned TrgReg, bool Is64Bit,
+                                  const MCSubtargetInfo *STI) {
+  emitRRR(Is64Bit ? Mips::DADDu : Mips::ADDu, DstReg, SrcReg, TrgReg, SMLoc(),
+          STI);
+}
+
+void MipsTargetStreamer::emitDSLL(unsigned DstReg, unsigned SrcReg,
+                                  int16_t ShiftAmount, SMLoc IDLoc,
+                                  const MCSubtargetInfo *STI) {
+  if (ShiftAmount >= 32) {
+    emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc, STI);
+    return;
+  }
+
+  emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+                                            const MCSubtargetInfo *STI) {
+  if (hasShortDelaySlot)
+    emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, STI);
+  else
+    emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+}
+
+void MipsTargetStreamer::emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI) {
+  emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+}
+
+/// Emit the $gp restore operation for .cprestore.
+void MipsTargetStreamer::emitGPRestore(int Offset, SMLoc IDLoc,
+                                       const MCSubtargetInfo *STI) {
+  emitLoadWithImmOffset(Mips::LW, Mips::GP, Mips::SP, Offset, Mips::GP, IDLoc,
+                        STI);
+}
+
+/// Emit a store instruction with an immediate offset.
+void MipsTargetStreamer::emitStoreWithImmOffset(
+    unsigned Opcode, unsigned SrcReg, unsigned BaseReg, int64_t Offset,
+    function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  if (isInt<16>(Offset)) {
+    emitRRI(Opcode, SrcReg, BaseReg, Offset, IDLoc, STI);
+    return;
+  }
+
+  // sw $8, offset($8) => lui $at, %hi(offset)
+  //                      add $at, $at, $8
+  //                      sw $8, %lo(offset)($at)
+
+  unsigned ATReg = GetATReg();
+  if (!ATReg)
+    return;
+
+  unsigned LoOffset = Offset & 0x0000ffff;
+  unsigned HiOffset = (Offset & 0xffff0000) >> 16;
+
+  // If msb of LoOffset is 1(negative number) we must increment HiOffset
+  // to account for the sign-extension of the low part.
+  if (LoOffset & 0x8000)
+    HiOffset++;
+
+  // Generate the base address in ATReg.
+  emitRI(Mips::LUi, ATReg, HiOffset, IDLoc, STI);
+  if (BaseReg != Mips::ZERO)
+    emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+  // Emit the store with the adjusted base and offset.
+  emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI);
+}
+
+/// Emit a store instruction with an symbol offset. Symbols are assumed to be
+/// out of range for a simm16 will be expanded to appropriate instructions.
+void MipsTargetStreamer::emitStoreWithSymOffset(
+    unsigned Opcode, unsigned SrcReg, unsigned BaseReg, MCOperand &HiOperand,
+    MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  // sw $8, sym => lui $at, %hi(sym)
+  //               sw $8, %lo(sym)($at)
+
+  // Generate the base address in ATReg.
+  emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI);
+  if (BaseReg != Mips::ZERO)
+    emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI);
+  // Emit the store with the adjusted base and offset.
+  emitRRX(Opcode, SrcReg, ATReg, LoOperand, IDLoc, STI);
+}
+
+/// Emit a load instruction with an immediate offset. DstReg and TmpReg are
+/// permitted to be the same register iff DstReg is distinct from BaseReg and
+/// DstReg is a GPR. It is the callers responsibility to identify such cases
+/// and pass the appropriate register in TmpReg.
+void MipsTargetStreamer::emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg,
+                                               unsigned BaseReg, int64_t Offset,
+                                               unsigned TmpReg, SMLoc IDLoc,
+                                               const MCSubtargetInfo *STI) {
+  if (isInt<16>(Offset)) {
+    emitRRI(Opcode, DstReg, BaseReg, Offset, IDLoc, STI);
+    return;
+  }
+
+  // 1) lw $8, offset($9) => lui $8, %hi(offset)
+  //                         add $8, $8, $9
+  //                         lw $8, %lo(offset)($9)
+  // 2) lw $8, offset($8) => lui $at, %hi(offset)
+  //                         add $at, $at, $8
+  //                         lw $8, %lo(offset)($at)
+
+  unsigned LoOffset = Offset & 0x0000ffff;
+  unsigned HiOffset = (Offset & 0xffff0000) >> 16;
+
+  // If msb of LoOffset is 1(negative number) we must increment HiOffset
+  // to account for the sign-extension of the low part.
+  if (LoOffset & 0x8000)
+    HiOffset++;
+
+  // Generate the base address in TmpReg.
+  emitRI(Mips::LUi, TmpReg, HiOffset, IDLoc, STI);
+  if (BaseReg != Mips::ZERO)
+    emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+  // Emit the load with the adjusted base and offset.
+  emitRRI(Opcode, DstReg, TmpReg, LoOffset, IDLoc, STI);
+}
+
+/// Emit a load instruction with an symbol offset. Symbols are assumed to be
+/// out of range for a simm16 will be expanded to appropriate instructions.
+/// DstReg and TmpReg are permitted to be the same register iff DstReg is a
+/// GPR. It is the callers responsibility to identify such cases and pass the
+/// appropriate register in TmpReg.
+void MipsTargetStreamer::emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg,
+                                               unsigned BaseReg,
+                                               MCOperand &HiOperand,
+                                               MCOperand &LoOperand,
+                                               unsigned TmpReg, SMLoc IDLoc,
+                                               const MCSubtargetInfo *STI) {
+  // 1) lw $8, sym        => lui $8, %hi(sym)
+  //                         lw $8, %lo(sym)($8)
+  // 2) ldc1 $f0, sym     => lui $at, %hi(sym)
+  //                         ldc1 $f0, %lo(sym)($at)
+
+  // Generate the base address in TmpReg.
+  emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI);
+  if (BaseReg != Mips::ZERO)
+    emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI);
+  // Emit the load with the adjusted base and offset.
+  emitRRX(Opcode, DstReg, TmpReg, LoOperand, IDLoc, STI);
+}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
+                                             formatted_raw_ostream &OS)
+    : MipsTargetStreamer(S), OS(OS) {}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
+  OS << "\t.set\tmicromips\n";
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
+  OS << "\t.set\tnomicromips\n";
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
+  OS << "\t.set\tmips16\n";
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
+  OS << "\t.set\tnomips16\n";
+  MipsTargetStreamer::emitDirectiveSetNoMips16();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
+  OS << "\t.set\treorder\n";
+  MipsTargetStreamer::emitDirectiveSetReorder();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
+  OS << "\t.set\tnoreorder\n";
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
+  OS << "\t.set\tmacro\n";
+  MipsTargetStreamer::emitDirectiveSetMacro();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
+  OS << "\t.set\tnomacro\n";
+  MipsTargetStreamer::emitDirectiveSetNoMacro();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMsa() {
+  OS << "\t.set\tmsa\n";
+  MipsTargetStreamer::emitDirectiveSetMsa();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMsa() {
+  OS << "\t.set\tnomsa\n";
+  MipsTargetStreamer::emitDirectiveSetNoMsa();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetAt() {
+  OS << "\t.set\tat\n";
+  MipsTargetStreamer::emitDirectiveSetAt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
+  OS << "\t.set\tat=$" << Twine(RegNo) << "\n";
+  MipsTargetStreamer::emitDirectiveSetAtWithArg(RegNo);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
+  OS << "\t.set\tnoat\n";
+  MipsTargetStreamer::emitDirectiveSetNoAt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
+  OS << "\t.end\t" << Name << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+  OS << "\t.ent\t" << Symbol.getName() << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaN2008() { OS << "\t.nan\t2008\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaNLegacy() {
+  OS << "\t.nan\tlegacy\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveOptionPic0() {
+  OS << "\t.option\tpic0\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveOptionPic2() {
+  OS << "\t.option\tpic2\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveInsn() {
+  MipsTargetStreamer::emitDirectiveInsn();
+  OS << "\t.insn\n";
+}
+
+void MipsTargetAsmStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                      unsigned ReturnReg) {
+  OS << "\t.frame\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(StackReg)).lower() << ","
+     << StackSize << ",$"
+     << StringRef(MipsInstPrinter::getRegisterName(ReturnReg)).lower() << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetArch(StringRef Arch) {
+  OS << "\t.set arch=" << Arch << "\n";
+  MipsTargetStreamer::emitDirectiveSetArch(Arch);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips0() {
+  OS << "\t.set\tmips0\n";
+  MipsTargetStreamer::emitDirectiveSetMips0();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips1() {
+  OS << "\t.set\tmips1\n";
+  MipsTargetStreamer::emitDirectiveSetMips1();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips2() {
+  OS << "\t.set\tmips2\n";
+  MipsTargetStreamer::emitDirectiveSetMips2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips3() {
+  OS << "\t.set\tmips3\n";
+  MipsTargetStreamer::emitDirectiveSetMips3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips4() {
+  OS << "\t.set\tmips4\n";
+  MipsTargetStreamer::emitDirectiveSetMips4();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips5() {
+  OS << "\t.set\tmips5\n";
+  MipsTargetStreamer::emitDirectiveSetMips5();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32() {
+  OS << "\t.set\tmips32\n";
+  MipsTargetStreamer::emitDirectiveSetMips32();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
+  OS << "\t.set\tmips32r2\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R3() {
+  OS << "\t.set\tmips32r3\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R5() {
+  OS << "\t.set\tmips32r5\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R5();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R6() {
+  OS << "\t.set\tmips32r6\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R6();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
+  OS << "\t.set\tmips64\n";
+  MipsTargetStreamer::emitDirectiveSetMips64();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
+  OS << "\t.set\tmips64r2\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R3() {
+  OS << "\t.set\tmips64r3\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R5() {
+  OS << "\t.set\tmips64r5\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R5();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R6() {
+  OS << "\t.set\tmips64r6\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R6();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
+  OS << "\t.set\tdsp\n";
+  MipsTargetStreamer::emitDirectiveSetDsp();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoDsp() {
+  OS << "\t.set\tnodsp\n";
+  MipsTargetStreamer::emitDirectiveSetNoDsp();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetPop() {
+  OS << "\t.set\tpop\n";
+  MipsTargetStreamer::emitDirectiveSetPop();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetPush() {
+ OS << "\t.set\tpush\n";
+ MipsTargetStreamer::emitDirectiveSetPush();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetSoftFloat() {
+  OS << "\t.set\tsoftfloat\n";
+  MipsTargetStreamer::emitDirectiveSetSoftFloat();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetHardFloat() {
+  OS << "\t.set\thardfloat\n";
+  MipsTargetStreamer::emitDirectiveSetHardFloat();
+}
+
+// Print a 32 bit hex number with all numbers.
+static void printHex32(unsigned Value, raw_ostream &OS) {
+  OS << "0x";
+  for (int i = 7; i >= 0; i--)
+    OS.write_hex((Value & (0xF << (i * 4))) >> (i * 4));
+}
+
+void MipsTargetAsmStreamer::emitMask(unsigned CPUBitmask,
+                                     int CPUTopSavedRegOff) {
+  OS << "\t.mask \t";
+  printHex32(CPUBitmask, OS);
+  OS << ',' << CPUTopSavedRegOff << '\n';
+}
+
+void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
+                                      int FPUTopSavedRegOff) {
+  OS << "\t.fmask\t";
+  printHex32(FPUBitmask, OS);
+  OS << "," << FPUTopSavedRegOff << '\n';
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
+  OS << "\t.cpload\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+  forbidModuleDirective();
+}
+
+bool MipsTargetAsmStreamer::emitDirectiveCpRestore(
+    int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
+  OS << "\t.cprestore\t" << Offset << "\n";
+  return true;
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  OS << "\t.cpsetup\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", ";
+
+  if (IsReg)
+    OS << "$"
+       << StringRef(MipsInstPrinter::getRegisterName(RegOrOffset)).lower();
+  else
+    OS << RegOrOffset;
+
+  OS << ", ";
+
+  OS << Sym.getName();
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+                                                  bool SaveLocationIsRegister) {
+  OS << "\t.cpreturn";
+  forbidModuleDirective();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleFP() {
+  OS << "\t.module\tfp=";
+  OS << ABIFlagsSection.getFpABIString(ABIFlagsSection.getFpABI()) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetFp(
+    MipsABIFlagsSection::FpABIKind Value) {
+  MipsTargetStreamer::emitDirectiveSetFp(Value);
+
+  OS << "\t.set\tfp=";
+  OS << ABIFlagsSection.getFpABIString(Value) << "\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleOddSPReg() {
+  MipsTargetStreamer::emitDirectiveModuleOddSPReg();
+
+  OS << "\t.module\t" << (ABIFlagsSection.OddSPReg ? "" : "no") << "oddspreg\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetOddSPReg() {
+  MipsTargetStreamer::emitDirectiveSetOddSPReg();
+  OS << "\t.set\toddspreg\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoOddSPReg() {
+  MipsTargetStreamer::emitDirectiveSetNoOddSPReg();
+  OS << "\t.set\tnooddspreg\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleSoftFloat() {
+  OS << "\t.module\tsoftfloat\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveModuleHardFloat() {
+  OS << "\t.module\thardfloat\n";
+}
+
+// This part is for ELF object output.
+MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
+                                             const MCSubtargetInfo &STI)
+    : MipsTargetStreamer(S), MicroMipsEnabled(false), STI(STI) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+
+  // It's possible that MCObjectFileInfo isn't fully initialized at this point
+  // due to an initialization order problem where LLVMTargetMachine creates the
+  // target streamer before TargetLoweringObjectFile calls
+  // InitializeMCObjectFileInfo. There doesn't seem to be a single place that
+  // covers all cases so this statement covers most cases and direct object
+  // emission must call setPic() once MCObjectFileInfo has been initialized. The
+  // cases we don't handle here are covered by MipsAsmPrinter.
+  Pic = MCA.getContext().getObjectFileInfo()->isPositionIndependent();
+
+  const FeatureBitset &Features = STI.getFeatureBits();
+
+  // Set the header flags that we can in the constructor.
+  // FIXME: This is a fairly terrible hack. We set the rest
+  // of these in the destructor. The problem here is two-fold:
+  //
+  // a: Some of the eflags can be set/reset by directives.
+  // b: There aren't any usage paths that initialize the ABI
+  //    pointer until after we initialize either an assembler
+  //    or the target machine.
+  // We can fix this by making the target streamer construct
+  // the ABI, but this is fraught with wide ranging dependency
+  // issues as well.
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+
+  // Architecture
+  if (Features[Mips::FeatureMips64r6])
+    EFlags |= ELF::EF_MIPS_ARCH_64R6;
+  else if (Features[Mips::FeatureMips64r2] ||
+           Features[Mips::FeatureMips64r3] ||
+           Features[Mips::FeatureMips64r5])
+    EFlags |= ELF::EF_MIPS_ARCH_64R2;
+  else if (Features[Mips::FeatureMips64])
+    EFlags |= ELF::EF_MIPS_ARCH_64;
+  else if (Features[Mips::FeatureMips5])
+    EFlags |= ELF::EF_MIPS_ARCH_5;
+  else if (Features[Mips::FeatureMips4])
+    EFlags |= ELF::EF_MIPS_ARCH_4;
+  else if (Features[Mips::FeatureMips3])
+    EFlags |= ELF::EF_MIPS_ARCH_3;
+  else if (Features[Mips::FeatureMips32r6])
+    EFlags |= ELF::EF_MIPS_ARCH_32R6;
+  else if (Features[Mips::FeatureMips32r2] ||
+           Features[Mips::FeatureMips32r3] ||
+           Features[Mips::FeatureMips32r5])
+    EFlags |= ELF::EF_MIPS_ARCH_32R2;
+  else if (Features[Mips::FeatureMips32])
+    EFlags |= ELF::EF_MIPS_ARCH_32;
+  else if (Features[Mips::FeatureMips2])
+    EFlags |= ELF::EF_MIPS_ARCH_2;
+  else
+    EFlags |= ELF::EF_MIPS_ARCH_1;
+
+  // Machine
+  if (Features[Mips::FeatureCnMips])
+    EFlags |= ELF::EF_MIPS_MACH_OCTEON;
+
+  // Other options.
+  if (Features[Mips::FeatureNaN2008])
+    EFlags |= ELF::EF_MIPS_NAN2008;
+
+  // -mabicalls and -mplt are not implemented but we should act as if they were
+  // given.
+  EFlags |= ELF::EF_MIPS_CPIC;
+
+  MCA.setELFHeaderEFlags(EFlags);
+}
+
+void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
+  auto *Symbol = cast<MCSymbolELF>(S);
+  if (!isMicroMipsEnabled())
+    return;
+  getStreamer().getAssembler().registerSymbol(*Symbol);
+  uint8_t Type = Symbol->getType();
+  if (Type != ELF::STT_FUNC)
+    return;
+
+  Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
+}
+
+void MipsTargetELFStreamer::finish() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
+
+  // .bss, .text and .data are always at least 16-byte aligned.
+  MCSection &TextSection = *OFI.getTextSection();
+  MCA.registerSection(TextSection);
+  MCSection &DataSection = *OFI.getDataSection();
+  MCA.registerSection(DataSection);
+  MCSection &BSSSection = *OFI.getBSSSection();
+  MCA.registerSection(BSSSection);
+
+  TextSection.setAlignment(std::max(16u, TextSection.getAlignment()));
+  DataSection.setAlignment(std::max(16u, DataSection.getAlignment()));
+  BSSSection.setAlignment(std::max(16u, BSSSection.getAlignment()));
+
+  if (RoundSectionSizes) {
+    // Make sections sizes a multiple of the alignment. This is useful for
+    // verifying the output of IAS against the output of other assemblers but
+    // it's not necessary to produce a correct object and increases section
+    // size.
+    MCStreamer &OS = getStreamer();
+    for (MCSection &S : MCA) {
+      MCSectionELF &Section = static_cast<MCSectionELF &>(S);
+
+      unsigned Alignment = Section.getAlignment();
+      if (Alignment) {
+        OS.SwitchSection(&Section);
+        if (Section.UseCodeAlign())
+          OS.EmitCodeAlignment(Alignment, Alignment);
+        else
+          OS.EmitValueToAlignment(Alignment, 0, 1, Alignment);
+      }
+    }
+  }
+
+  const FeatureBitset &Features = STI.getFeatureBits();
+
+  // Update e_header flags. See the FIXME and comment above in
+  // the constructor for a full rundown on this.
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+
+  // ABI
+  // N64 does not require any ABI bits.
+  if (getABI().IsO32())
+    EFlags |= ELF::EF_MIPS_ABI_O32;
+  else if (getABI().IsN32())
+    EFlags |= ELF::EF_MIPS_ABI2;
+
+  if (Features[Mips::FeatureGP64Bit]) {
+    if (getABI().IsO32())
+      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+  } else if (Features[Mips::FeatureMips64r2] || Features[Mips::FeatureMips64])
+    EFlags |= ELF::EF_MIPS_32BITMODE;
+
+  // If we've set the cpic eflag and we're n64, go ahead and set the pic
+  // one as well.
+  if (EFlags & ELF::EF_MIPS_CPIC && getABI().IsN64())
+    EFlags |= ELF::EF_MIPS_PIC;
+
+  MCA.setELFHeaderEFlags(EFlags);
+
+  // Emit all the option records.
+  // At the moment we are only emitting .Mips.options (ODK_REGINFO) and
+  // .reginfo.
+  MipsELFStreamer &MEF = static_cast<MipsELFStreamer &>(Streamer);
+  MEF.EmitMipsOptionRecords();
+
+  emitMipsAbiFlags();
+}
+
+void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) {
+  auto *Symbol = cast<MCSymbolELF>(S);
+  // If on rhs is micromips symbol then mark Symbol as microMips.
+  if (Value->getKind() != MCExpr::SymbolRef)
+    return;
+  const auto &RhsSym = cast<MCSymbolELF>(
+      static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
+
+  if (!(RhsSym.getOther() & ELF::STO_MIPS_MICROMIPS))
+    return;
+
+  Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
+}
+
+MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMicroMips() {
+  MicroMipsEnabled = true;
+  forbidModuleDirective();
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
+  MicroMipsEnabled = false;
+  forbidModuleDirective();
+}
+
+void MipsTargetELFStreamer::setUsesMicroMips() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_MICROMIPS;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetMips16() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
+  MCA.setELFHeaderEFlags(Flags);
+  forbidModuleDirective();
+}
+
+void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_NOREORDER;
+  MCA.setELFHeaderEFlags(Flags);
+  forbidModuleDirective();
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCContext &Context = MCA.getContext();
+  MCStreamer &OS = getStreamer();
+
+  MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, 0);
+
+  MCSymbol *Sym = Context.getOrCreateSymbol(Name);
+  const MCSymbolRefExpr *ExprRef =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context);
+
+  MCA.registerSection(*Sec);
+  Sec->setAlignment(4);
+
+  OS.PushSection();
+
+  OS.SwitchSection(Sec);
+
+  OS.EmitValueImpl(ExprRef, 4);
+
+  OS.EmitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
+  OS.EmitIntValue(GPRInfoSet ? GPROffset : 0, 4);  // reg_offset
+
+  OS.EmitIntValue(FPRInfoSet ? FPRBitMask : 0, 4); // fpreg_mask
+  OS.EmitIntValue(FPRInfoSet ? FPROffset : 0, 4);  // fpreg_offset
+
+  OS.EmitIntValue(FrameInfoSet ? FrameOffset : 0, 4); // frame_offset
+  OS.EmitIntValue(FrameInfoSet ? FrameReg : 0, 4);    // frame_reg
+  OS.EmitIntValue(FrameInfoSet ? ReturnReg : 0, 4);   // return_reg
+
+  // The .end directive marks the end of a procedure. Invalidate
+  // the information gathered up until this point.
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+
+  OS.PopSection();
+
+  // .end also implicitly sets the size.
+  MCSymbol *CurPCSym = Context.createTempSymbol();
+  OS.EmitLabel(CurPCSym);
+  const MCExpr *Size = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
+      ExprRef, Context);
+  int64_t AbsSize;
+  if (!Size->evaluateAsAbsolute(AbsSize, MCA))
+    llvm_unreachable("Function size must be evaluatable as absolute");
+  Size = MCConstantExpr::create(AbsSize, Context);
+  static_cast<MCSymbolELF *>(Sym)->setSize(Size);
+}
+
+void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+
+  // .ent also acts like an implicit '.type symbol, STT_FUNC'
+  static_cast<const MCSymbolELF &>(Symbol).setType(ELF::STT_FUNC);
+}
+
+void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveNaN2008() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags &= ~ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  // This option overrides other PIC options like -KPIC.
+  Pic = false;
+  Flags &= ~ELF::EF_MIPS_PIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveOptionPic2() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Pic = true;
+  // NOTE: We are following the GAS behaviour here which means the directive
+  // 'pic2' also sets the CPIC bit in the ELF header. This is different from
+  // what is stated in the SYSV ABI which consider the bits EF_MIPS_PIC and
+  // EF_MIPS_CPIC to be mutually exclusive.
+  Flags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveInsn() {
+  MipsTargetStreamer::emitDirectiveInsn();
+  MipsELFStreamer &MEF = static_cast<MipsELFStreamer &>(Streamer);
+  MEF.createPendingLabelRelocs();
+}
+
+void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
+                                      unsigned ReturnReg_) {
+  MCContext &Context = getStreamer().getAssembler().getContext();
+  const MCRegisterInfo *RegInfo = Context.getRegisterInfo();
+
+  FrameInfoSet = true;
+  FrameReg = RegInfo->getEncodingValue(StackReg);
+  FrameOffset = StackSize;
+  ReturnReg = RegInfo->getEncodingValue(ReturnReg_);
+}
+
+void MipsTargetELFStreamer::emitMask(unsigned CPUBitmask,
+                                     int CPUTopSavedRegOff) {
+  GPRInfoSet = true;
+  GPRBitMask = CPUBitmask;
+  GPROffset = CPUTopSavedRegOff;
+}
+
+void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
+                                      int FPUTopSavedRegOff) {
+  FPRInfoSet = true;
+  FPRBitMask = FPUBitmask;
+  FPROffset = FPUTopSavedRegOff;
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
+  // .cpload $reg
+  // This directive expands to:
+  // lui   $gp, %hi(_gp_disp)
+  // addui $gp, $gp, %lo(_gp_disp)
+  // addu  $gp, $gp, $reg
+  // when support for position independent code is enabled.
+  if (!Pic || (getABI().IsN32() || getABI().IsN64()))
+    return;
+
+  // There's a GNU extension controlled by -mno-shared that allows
+  // locally-binding symbols to be accessed using absolute addresses.
+  // This is currently not supported. When supported -mno-shared makes
+  // .cpload expand to:
+  //   lui     $gp, %hi(__gnu_local_gp)
+  //   addiu   $gp, $gp, %lo(__gnu_local_gp)
+
+  StringRef SymName("_gp_disp");
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCSymbol *GP_Disp = MCA.getContext().getOrCreateSymbol(SymName);
+  MCA.registerSymbol(*GP_Disp);
+
+  MCInst TmpInst;
+  TmpInst.setOpcode(Mips::LUi);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  const MCExpr *HiSym = MipsMCExpr::create(
+      MipsMCExpr::MEK_HI,
+      MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
+                              MCA.getContext()),
+      MCA.getContext());
+  TmpInst.addOperand(MCOperand::createExpr(HiSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDiu);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  const MCExpr *LoSym = MipsMCExpr::create(
+      MipsMCExpr::MEK_LO,
+      MCSymbolRefExpr::create("_gp_disp", MCSymbolRefExpr::VK_None,
+                              MCA.getContext()),
+      MCA.getContext());
+  TmpInst.addOperand(MCOperand::createExpr(LoSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDu);
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::createReg(RegNo));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  forbidModuleDirective();
+}
+
+bool MipsTargetELFStreamer::emitDirectiveCpRestore(
+    int Offset, function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer::emitDirectiveCpRestore(Offset, GetATReg, IDLoc, STI);
+  // .cprestore offset
+  // When PIC mode is enabled and the O32 ABI is used, this directive expands
+  // to:
+  //    sw $gp, offset($sp)
+  // and adds a corresponding LW after every JAL.
+
+  // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it
+  // is used in non-PIC mode.
+  if (!Pic || (getABI().IsN32() || getABI().IsN64()))
+    return true;
+
+  // Store the $gp on the stack.
+  emitStoreWithImmOffset(Mips::SW, Mips::GP, Mips::SP, Offset, GetATReg, IDLoc,
+                         STI);
+  return true;
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  // Only N32 and N64 emit anything for .cpsetup iff PIC is set.
+  if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
+    return;
+
+  forbidModuleDirective();
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCInst Inst;
+
+  // Either store the old $gp in a register or on the stack
+  if (IsReg) {
+    // move $save, $gpreg
+    emitRRR(Mips::OR64, RegOrOffset, Mips::GP, Mips::ZERO, SMLoc(), &STI);
+  } else {
+    // sd $gpreg, offset($sp)
+    emitRRI(Mips::SD, Mips::GP, Mips::SP, RegOrOffset, SMLoc(), &STI);
+  }
+
+  if (getABI().IsN32()) {
+    MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
+    const MipsMCExpr *HiExpr = MipsMCExpr::create(
+        MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
+        MCA.getContext());
+    const MipsMCExpr *LoExpr = MipsMCExpr::create(
+        MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
+        MCA.getContext());
+
+    // lui $gp, %hi(__gnu_local_gp)
+    emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+
+    // addiu  $gp, $gp, %lo(__gnu_local_gp)
+    emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+            SMLoc(), &STI);
+
+    return;
+  }
+
+  const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
+      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      MCA.getContext());
+  const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
+      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      MCA.getContext());
+
+  // lui $gp, %hi(%neg(%gp_rel(funcSym)))
+  emitRX(Mips::LUi, Mips::GP, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
+
+  // addiu  $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
+  emitRRX(Mips::ADDiu, Mips::GP, Mips::GP, MCOperand::createExpr(LoExpr),
+          SMLoc(), &STI);
+
+  // daddu  $gp, $gp, $funcreg
+  emitRRR(Mips::DADDu, Mips::GP, Mips::GP, RegNo, SMLoc(), &STI);
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
+                                                  bool SaveLocationIsRegister) {
+  // Only N32 and N64 emit anything for .cpreturn iff PIC is set.
+  if (!Pic || !(getABI().IsN32() || getABI().IsN64()))
+    return;
+
+  MCInst Inst;
+  // Either restore the old $gp from a register or on the stack
+  if (SaveLocationIsRegister) {
+    Inst.setOpcode(Mips::OR);
+    Inst.addOperand(MCOperand::createReg(Mips::GP));
+    Inst.addOperand(MCOperand::createReg(SaveLocation));
+    Inst.addOperand(MCOperand::createReg(Mips::ZERO));
+  } else {
+    Inst.setOpcode(Mips::LD);
+    Inst.addOperand(MCOperand::createReg(Mips::GP));
+    Inst.addOperand(MCOperand::createReg(Mips::SP));
+    Inst.addOperand(MCOperand::createImm(SaveLocation));
+  }
+  getStreamer().EmitInstruction(Inst, STI);
+
+  forbidModuleDirective();
+}
+
+void MipsTargetELFStreamer::emitMipsAbiFlags() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCContext &Context = MCA.getContext();
+  MCStreamer &OS = getStreamer();
+  MCSectionELF *Sec = Context.getELFSection(
+      ".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS, ELF::SHF_ALLOC, 24, "");
+  MCA.registerSection(*Sec);
+  Sec->setAlignment(8);
+  OS.SwitchSection(Sec);
+
+  OS << ABIFlagsSection;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MSA.txt b/contrib/llvm/lib/Target/Mips/MSA.txt
new file mode 100644
index 000000000000..113375fa7f2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MSA.txt
@@ -0,0 +1,83 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+        It is not possible to emit bclri.b since andi.b covers exactly the
+        same cases. andi.b should use fractionally less power than bclri.b in
+        most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+        It is not possible to emit vshf.w when the shuffle description is
+        constant since shf.w covers exactly the same cases. shf.w is used
+        instead. It is also impossible for the shuffle description to be
+        unknown at compile-time due to the definition of shufflevector in
+        LLVM IR.
+
+vshf.[bhwd]
+        When the shuffle description describes a splat operation, splat.[bhwd]
+        instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*,
+        and pck* instructions, this is matched from MipsISD::VSHF instead of
+        a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+        It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+        same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+        It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+        same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]
+        The intrinsic will work as expected. However, unlike other intrinsics
+        it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+        It is not possible to emit splati.w since shf.w covers the same cases.
+        shf.w will be emitted instead.
+
+copy_s.w:
+        On MIPS32, the copy_u.d intrinsic will emit this instruction instead of
+        copy_u.w. This is semantically equivalent since the general-purpose
+        register file is 32-bits wide.
+
+binsri.[bhwd],  binsli.[bhwd]:
+        These two operations are equivalent to each other with the operands
+        swapped and condition inverted. The compiler may use either one as
+        appropriate.
+        Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+        not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+        These three operations differ only in the operand that is tied to the
+        result and the order of the operands.
+        It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+        the same operation and will be emitted instead.
+        In future, the compiler may choose between these three instructions
+        according to register allocation.
+        These three operations can be very confusing so here is a mapping
+        between the instructions and the vselect node in one place:
+                bmz.v  wd, ws, wt/i8 -> (vselect wt/i8, wd, ws)
+                bmnz.v wd, ws, wt/i8 -> (vselect wt/i8, ws, wd)
+                bsel.v wd, ws, wt/i8 -> (vselect wd, wt/i8, ws)
+
+bmnzi.b, bmzi.b:
+        Like their non-immediate counterparts, bmnzi.v and bmzi.v are the same
+        operation with the operands swapped. bmnzi.v will (currently) be emitted
+        for both cases.
+
+bseli.v:
+        Unlike the non-immediate versions, bseli.v is distinguishable from
+        bmnzi.b and bmzi.b and can be emitted.
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
new file mode 100644
index 000000000000..2f0933277e81
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td
@@ -0,0 +1,1094 @@
+//=- MicroMips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class MMR6Arch<string opstr> {
+  string Arch = "micromipsr6";
+  string BaseOpcode = opstr;
+  string DecoderNamespace = "MicroMipsR6";
+}
+
+// Class used for microMIPS32r6 and microMIPS64r6 instructions.
+class MicroMipsR6Inst16 : PredicateControl {
+  string DecoderNamespace = "MicroMipsR6";
+  let InsnPredicates = [HasMicroMips32r6];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+class MMDecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+  string DecoderNamespace = "MicroMipsR6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class BC16_FM_MM16R6 {
+  bits<10> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x33;
+  let Inst{9-0}   = offset;
+}
+
+class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 {
+  bits<3> rs;
+  bits<7> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7}   = rs;
+  let Inst{6-0}   = offset;
+}
+
+class POOL16C_JALRC_FM_MM16R6<bits<5> op> {
+  bits<5> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = rs;
+  let Inst{4-0}   = op;
+}
+
+class POP35_BOVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011101;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0} = offset;
+}
+
+class POP37_BNVC_FM_MMR6<string instr_asm> : MipsR6Inst, MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011111;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0} = offset;
+}
+
+class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> {
+  bits<5> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = imm;
+  let Inst{4-0}   = op;
+}
+
+class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> {
+  bits<2> rt;
+  bits<4> addr;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-8}   = rt;
+  let Inst{7-4}   = addr;
+  let Inst{3-0}   = funct;
+}
+
+class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rd;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-6} = funct;
+  let Inst{5-0} = 0b111100;
+}
+
+class CACHE_PREF_FM_MMR6<bits<6> opgroup, bits<4> funct> : MipsR6Inst {
+  bits<21> addr;
+  bits<5> hint;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = opgroup;
+  let Inst{25-21} = hint;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class ARITH_FM_MMR6<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = funct;
+}
+
+class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = hint;
+  let Inst{20-16} = base;
+  let Inst{15-12} = 0b1010;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = offset;
+}
+
+class LB32_FM_MMR6 : MipsR6Inst {
+  bits<21> addr;
+  bits<5> rt;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000111;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class LBU32_FM_MMR6 : MipsR6Inst {
+  bits<21> addr;
+  bits<5> rt;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000101;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst {
+  bits<21> addr;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = 0b0110;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = addr{8-0};
+}
+
+class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm> {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class PCREL19_FM_MMR6<bits<2> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<19> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011110;
+  let Inst{25-21} = rt;
+  let Inst{20-19} = funct;
+  let Inst{18-0}  = imm;
+}
+
+class PCREL16_FM_MMR6<bits<5> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = imm;
+}
+
+class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = funct;
+}
+
+class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = op;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = 0;
+}
+
+class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> {
+  bits<5> rt;
+  bits<5> rd;
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rd;
+  let Inst{15-6} = funct;
+  let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_RDHWR_FM_MMR6 {
+  bits<5> rt;
+  bits<5> rs;
+  bits<3> sel;
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0;
+  let Inst{13-11} = sel;
+  let Inst{10} = 0;
+  let Inst{9-0} = 0b0111000000;
+}
+
+class POOL32A_SYNC_FM_MMR6 {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = stype;
+  let Inst{15-6}  = 0b0110101101;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32I_SYNCI_FM_MMR6 {
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> immediate = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010000;
+  let Inst{25-21} = 0b01100;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = immediate;
+}
+
+class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class SPECIAL_2R_FM_MMR6<bits<6> funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = rt;
+  let Inst{10-6}  = 0b00001;
+  let Inst{5-0}   = funct;
+}
+
+class POOL32A_ALIGN_FM_MMR6<bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = bp;
+  let Inst{8-6}   = 0b000;
+  let Inst{5-0}   = funct;
+}
+
+class AUI_FM_MMR6 : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000100;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0} = imm;
+}
+
+class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> imm2;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = imm2;
+  let Inst{8-6}   = 0b000;
+  let Inst{5-0}   = funct;
+}
+
+class SB32_SH32_STORE_FM_MMR6<bits<6> op> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = 0b1010;
+  let Inst{11-9}  = funct;
+  let Inst{8-0}   = offset;
+}
+
+class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = 0b0110;
+  let Inst{11-9}  = funct;
+  let Inst{8-0}   = offset;
+}
+
+class LOAD_WORD_FM_MMR6 {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b111111;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class LOAD_UPPER_IMM_FM_MMR6 {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000100;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = 0;
+  let Inst{15-0}  = imm16;
+}
+
+class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-0}  = offset;
+}
+
+class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<string instr_asm, bits<6> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32A_JALRC_FM_MMR6<string instr_asm, bits<10> funct>
+    : MipsR6Inst, MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6} = funct;
+  let Inst{5-0} = 0b111100;
+}
+
+class POOL32A_EXT_INS_FM_MMR6<string instr_asm, bits<6> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> size;
+  bits<5> pos;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm> {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-16} = 0x00;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class ERETNC_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-17} = 0x00;
+  let Inst{16-16} = 0x01;
+  let Inst{15-6}  = 0x3cd;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BREAK_MMR6_ENC<string instr_asm> : MMR6Arch<instr_asm> {
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = 0x07;
+}
+
+class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = 0x0;
+  let Inst{20-16} = 0x0;
+  let Inst{15-11} = op;
+  let Inst{10-6}  = 0x0;
+  let Inst{5-0}   = 0x0;
+}
+
+class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm> {
+  bits<32> Inst;
+  bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<instr_asm> {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = shamt;
+  let Inst{10}    = rotate;
+  let Inst{9-0}   = funct;
+}
+
+class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-0}  = addr{15-0};
+}
+
+class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt,
+    bits<3> funct> : MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = fmt;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = offset;
+}
+
+class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10}    = 0;
+  let Inst{9-8}   = fmt;
+  let Inst{7-0}   = funct;
+}
+
+class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = funct;
+}
+
+class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14-13} = fmt;
+  let Inst{12-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-9} = fmt;
+  let Inst{8-0} = funct;
+}
+
+class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-6} = Cond.Value;
+  let Inst{5-0} = format;
+}
+
+class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15} = 0;
+  let Inst{14} = fmt;
+  let Inst{13-6} = funct;
+  let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15} = 0;
+  let Inst{14-13} = fmt;
+  let Inst{12-6} = funct;
+  let Inst{5-0} = 0b111011;
+}
+
+class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14-13} = fmt;
+  let Inst{12-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14}    = fmt;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 {
+  bits<3> rs;
+  bits<3> rt;
+  bits<3> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b000001;
+  let Inst{9-7}   = rs;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rd;
+  let Inst{0}     = 0;
+}
+
+class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 {
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b010001;
+  let Inst{9-7}   = rt;
+  let Inst{6-4}   = rs;
+  let Inst{3-0}   = 0b0001;
+}
+
+class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 {
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-7}   = rt;
+  let Inst{6-4}   = rs;
+  let Inst{3-0}   = 0b0000;
+}
+
+class POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> : MicroMipsR6Inst16 {
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b010001;
+  let Inst{9-7}   = rt;
+  let Inst{6-4}   = rs;
+  let Inst{3-0}   = op;
+}
+
+class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> {
+  bits<4> code_;
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b010001;
+  let Inst{9-6}   = code_;
+  let Inst{5-0}   = op;
+}
+
+class POOL16A_SUBU16_FM_MMR6 {
+  bits<3> rs;
+  bits<3> rt;
+  bits<3> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0b000001;
+  let Inst{9-7}   = rs;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rd;
+  let Inst{0}     = 0b1;
+}
+
+class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14}    = fmt;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = fs;
+  let Inst{20-16} = fd;
+  let Inst{15-11} = 0;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = 0b000100000;
+}
+
+class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = funct;
+}
+
+class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = fs;
+  let Inst{20-16} = fd;
+  let Inst{15-11} = 0b00000;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = funct;
+}
+
+class POOL32A_TLBINV_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = 0x0;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_MFTC0_FM_MMR6<string instr_asm, bits<5> funct, bits<6> opcode>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<3> sel;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0;
+  let Inst{13-11} = sel;
+  let Inst{10-6}  = funct;
+  let Inst{5-0}   = opcode;
+}
+
+class POOL32F_MFTC1_FM_MMR6<string instr_asm, bits<8> funct>
+    : MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010101;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = fs;
+  let Inst{15-14} = 0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111011;
+}
+
+class POOL32A_MFTC2_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> impl;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = impl;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class CMP_BRANCH_2R_OFF16_FM_MMR6<string opstr, bits<6> funct>
+    : MipsR6Inst, MMR6Arch<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32A_DVPEVP_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32B_LWP_SWP_FM_MMR6<bits<4> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<12> offset = addr{11-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x8;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = offset;
+}
+
+class CMP_BRANCH_OFF21_FM_MMR6<string opstr, bits<6> funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<21> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rs;
+  let Inst{20-0} = offset;
+}
+
+class POOL32I_BRANCH_COP_1_2_FM_MMR6<string instr_asm, bits<5> funct>
+    : MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010000;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = offset;
+}
+
+class LDWC1_SDWC1_FM_MMR6<string instr_asm, bits<6> funct>
+    : MMR6Arch<instr_asm> {
+  bits<5> ft;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32B_LDWC2_SDWC2_FM_MMR6<string instr_asm, bits<4> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<11> offset = addr{10-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b001000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11}    = 0;
+  let Inst{10-0}  = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
new file mode 100644
index 000000000000..fd04f80dd566
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -0,0 +1,1881 @@
+//=- MicroMips32r6InstrInfo.td - MicroMips r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPSr6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def brtarget21_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget21OpValueMM";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget21MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget26_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget26OpValueMM";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget26MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtargetr6 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMMR6";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTargetMM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget_lsl2_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueLsl2MMR6";
+  let OperandType = "OPERAND_PCREL";
+  // Instructions that use this operand have their decoder method
+  // set with DecodeDisambiguates
+  let DecoderMethod = "";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+class ADD_MMR6_ENC : ARITH_FM_MMR6<"add", 0x110>;
+class ADDIU_MMR6_ENC : ADDI_FM_MMR6<"addiu", 0xc>;
+class ADDU_MMR6_ENC : ARITH_FM_MMR6<"addu", 0x150>;
+class ADDIUPC_MMR6_ENC : PCREL19_FM_MMR6<0b00>;
+class ALUIPC_MMR6_ENC : PCREL16_FM_MMR6<0b11111>;
+class AND_MMR6_ENC : ARITH_FM_MMR6<"and", 0x250>;
+class ANDI_MMR6_ENC : ADDI_FM_MMR6<"andi", 0x34>;
+class AUIPC_MMR6_ENC  : PCREL16_FM_MMR6<0b11110>;
+class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>;
+class AUI_MMR6_ENC : AUI_FM_MMR6;
+class BALC_MMR6_ENC  : BRANCH_OFF26_FM<0b101101>;
+class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>;
+class BC16_MMR6_ENC : BC16_FM_MM16R6;
+class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>;
+class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>;
+class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>;
+class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">;
+class BEQZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"beqzc", 0b100000>;
+class BNEZC_MMR6_ENC : CMP_BRANCH_OFF21_FM_MMR6<"bnezc", 0b101000>;
+class BGEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgec", 0b111101>,
+                      DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGEUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bgeuc", 0b110000>,
+                       DecodeDisambiguates<"BlezGroupBranchMMR6">;
+class BLTC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltc", 0b110101>,
+                      DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BLTUC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bltuc", 0b111000>,
+                       DecodeDisambiguates<"BgtzGroupBranchMMR6">;
+class BEQC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"beqc", 0b011101>;
+class BNEC_MMR6_ENC : CMP_BRANCH_2R_OFF16_FM_MMR6<"bnec", 0b011111>;
+class BLTZC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bltzc", 0b110101>,
+                       DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BLEZC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"blezc", 0b111101>,
+                       DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGEZC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bgezc", 0b111101>,
+                       DecodeDisambiguates<"POP75GroupBranchMMR6">;
+class BGTZC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bgtzc", 0b110101>,
+                       DecodeDisambiguates<"POP65GroupBranchMMR6">;
+class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"beqzalc", 0b011101>,
+                         DecodeDisambiguates<"POP35GroupBranchMMR6">;
+class BNEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bnezalc", 0b011111>,
+                         DecodeDisambiguates<"POP37GroupBranchMMR6">;
+class BGTZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"bgtzalc", 0b111000>,
+                         MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BLTZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bltzalc", 0b111000>,
+                         MMDecodeDisambiguatedBy<"BgtzGroupBranchMMR6">;
+class BGEZALC_MMR6_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<"bgezalc", 0b110000>,
+                         MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
+class BLEZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<"blezalc", 0b110000>,
+                         MMDecodeDisambiguatedBy<"BlezGroupBranchMMR6">;
+class CACHE_MMR6_ENC : CACHE_PREF_FM_MMR6<0b001000, 0b0110>;
+class CLO_MMR6_ENC : POOL32A_2R_FM_MMR6<0b0100101100>;
+class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>;
+class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>;
+class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>;
+class EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>;
+class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>;
+class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>;
+class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>;
+class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0b1110001101>;
+class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">;
+class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>;
+class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>;
+class JIC_MMR6_ENC   : JMP_IDX_COMPACT_FM<0b101000>;
+class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>;
+class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>;
+class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>;
+class LWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x1>;
+class LWPC_MMR6_ENC  : PCREL19_FM_MMR6<0b01>;
+class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>;
+class MFC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfc0", 0b00011, 0b111100>;
+class MFC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfc1", 0b10000000>;
+class MFC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfc2", 0b0100110100>;
+class MFHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mfhc0", 0b00011, 0b110100>;
+class MFHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mfhc1", 0b11000000>;
+class MFHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mfhc2", 0b1000110100>;
+class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>;
+class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>;
+class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>;
+class MUH_MMR6_ENC : ARITH_FM_MMR6<"muh", 0x58>;
+class MULU_MMR6_ENC : ARITH_FM_MMR6<"mulu", 0x98>;
+class MUHU_MMR6_ENC : ARITH_FM_MMR6<"muhu", 0xd8>;
+class MTC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mtc0", 0b01011, 0b111100>;
+class MTC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mtc1", 0b10100000>;
+class MTC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mtc2", 0b0101110100>;
+class MTHC0_MMR6_ENC : POOL32A_MFTC0_FM_MMR6<"mthc0", 0b01011, 0b110100>;
+class MTHC1_MMR6_ENC : POOL32F_MFTC1_FM_MMR6<"mthc1", 0b11100000>;
+class MTHC2_MMR6_ENC : POOL32A_MFTC2_FM_MMR6<"mthc2", 0b1001110100>;
+class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>;
+class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>;
+class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>;
+class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>;
+class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>;
+class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>;
+class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>;
+class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>;
+class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>;
+class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>;
+class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>;
+class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>;
+class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>;
+class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>;
+class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>;
+class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>;
+class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>;
+class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>;
+class SWP_MMR6_ENC : POOL32B_LWP_SWP_FM_MMR6<0x9>;
+class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>;
+class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>;
+class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>;
+class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>;
+class LB_MMR6_ENC : LB32_FM_MMR6;
+class LBU_MMR6_ENC : LBU32_FM_MMR6;
+class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>;
+class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>;
+class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>;
+class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6;
+class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">;
+class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">;
+class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6;
+class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">;
+class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>;
+class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">;
+class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>;
+class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>;
+class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>;
+class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>;
+class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>;
+class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>;
+class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>;
+class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>;
+class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>;
+class CEIL_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>;
+class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>;
+class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>;
+class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>;
+class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>;
+class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>;
+class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>;
+class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>;
+class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>;
+class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>;
+class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>;
+class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>;
+class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>;
+class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>;
+class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>;
+class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>;
+class LW_MMR6_ENC : LOAD_WORD_FM_MMR6;
+class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6;
+class JALRC_HB_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc.hb", 0b0001111100>;
+class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>;
+class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>;
+class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0,
+                                                       0b11001100>;
+class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1,
+                                                       0b11001100>;
+class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0,
+                                                       0b11101100>;
+class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1,
+                                                       0b11101100>;
+class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>;
+class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>;
+class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>;
+class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>;
+class SELNEZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.s", 0, 0b001111000>;
+class SELNEZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selnez.d", 1, 0b001111000>;
+class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>;
+class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>;
+class EXT_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ext", 0b101100>;
+class INS_MMR6_ENC : POOL32A_EXT_INS_FM_MMR6<"ins", 0b001100>;
+class JALRC_MMR6_ENC : POOL32A_JALRC_FM_MMR6<"jalrc", 0b0000111100>;
+class BOVC_MMR6_ENC : POP35_BOVC_FM_MMR6<"bovc">;
+class BNVC_MMR6_ENC : POP37_BNVC_FM_MMR6<"bnvc">;
+class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6;
+class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6;
+class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16;
+class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6;
+class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>;
+class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16;
+class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16;
+class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>;
+class LI16_MMR6_ENC : LI_FM_MM16;
+class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>;
+class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>;
+class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6;
+class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>;
+class TLBINV_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinv", 0x10d>;
+class TLBINVF_MMR6_ENC : POOL32A_TLBINV_FM_MMR6<"tlbinvf", 0x14d>;
+class DVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"dvp", 0b0001100101>;
+class EVP_MMR6_ENC : POOL32A_DVPEVP_FM_MMR6<"evp", 0b0011100101>;
+class BC1EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1eqzc", 0b01000>;
+class BC1NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc1nezc", 0b01001>;
+class BC2EQZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2eqzc", 0b01010>;
+class BC2NEZC_MMR6_ENC : POOL32I_BRANCH_COP_1_2_FM_MMR6<"bc2nezc", 0b01011>;
+class LDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"ldc1", 0b101111>;
+class SDC1_MMR6_ENC : LDWC1_SDWC1_FM_MMR6<"sdc1", 0b101110>;
+class LDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"ldc2", 0b0010>;
+class SDC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"sdc2", 0b1010>;
+class LWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"lwc2", 0b0000>;
+class SWC2_MMR6_ENC : POOL32B_LDWC2_SDWC2_FM_MMR6<"swc2", 0b1000>;
+
+/// Floating Point Instructions
+class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>;
+class FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>;
+class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>;
+class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>;
+class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>;
+class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>;
+class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>;
+class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>;
+class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>;
+class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>;
+class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>;
+class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>;
+class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>;
+class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>;
+class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>;
+class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>;
+class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>;
+class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>;
+class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>;
+class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>;
+class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>;
+class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>;
+class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>;
+class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>;
+
+class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>;
+class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>;
+class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>;
+class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>;
+class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>;
+class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>;
+class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>;
+class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>;
+class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>;
+class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd,
+                                  RegisterOperand GPROpnd>
+    : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCZC;
+}
+
+class BEQZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"beqzalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgezalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGTZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgtzalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"blezalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLTZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bltzalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm,
+                                                      GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLTZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bltzc", brtarget_lsl2_mm,
+                                                    GPR32Opnd>;
+class BLEZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"blezc", brtarget_lsl2_mm,
+                                                    GPR32Opnd>;
+class BGEZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgezc", brtarget_lsl2_mm,
+                                                    GPR32Opnd>;
+class BGTZC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bgtzc", brtarget_lsl2_mm,
+                                                    GPR32Opnd>;
+
+class CMP_CBR_2R_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd,
+                                RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCC;
+}
+
+class BGEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgec", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+class BGEUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bgeuc", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+class BLTC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltc", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+class BLTUC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bltuc", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+class BEQC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"beqc", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+class BNEC_MMR6_DESC : CMP_CBR_2R_MMR6_DESC_BASE<"bnec", brtarget_lsl2_mm,
+                                                 GPR32Opnd>;
+
+class ADD_MMR6_DESC : ArithLogicR<"add", GPR32Opnd, 1, II_ADD>;
+class ADDIU_MMR6_DESC : ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16, add>;
+class ADDU_MMR6_DESC : ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU>;
+class MUL_MMR6_DESC : ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>;
+class MUH_MMR6_DESC : ArithLogicR<"muh", GPR32Opnd, 1, II_MUH, mulhs>;
+class MULU_MMR6_DESC : ArithLogicR<"mulu", GPR32Opnd, 1, II_MULU>;
+class MUHU_MMR6_DESC : ArithLogicR<"muhu", GPR32Opnd, 1, II_MUHU, mulhu>;
+
+class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, InstrItinClass Itin>
+    : BRANCH_DESC_BASE, MMR6Arch<instr_asm> {
+  dag InOperandList = (ins opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  bit isBarrier = 1;
+  InstrItinClass Itinerary = Itin;
+}
+
+class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm, II_BALC> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm, II_BC>;
+
+class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
+                                       !strconcat("bc16", "\t$offset"), [],
+                                       II_BC, FrmI>,
+                       MMR6Arch<"bc16">, MicroMipsR6Inst16 {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 0;
+  let AdditionalPredicates = [RelocPIC];
+  let Defs = [AT];
+}
+
+class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm>
+    : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 0;
+  let Defs = [AT];
+}
+class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">;
+class BNEZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">;
+
+class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>;
+class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd, 0,II_SUBU>;
+
+class BITSWAP_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+    : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_BITSWAP;
+}
+
+class BITSWAP_MMR6_DESC : BITSWAP_MMR6_DESC_BASE<"bitswap", GPR32Opnd>;
+
+class BRK_MMR6_DESC : BRK_FT<"break">;
+
+class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd,
+                           RegisterOperand GPROpnd, InstrItinClass Itin>
+      : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+  string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeCacheOpMM";
+  InstrItinClass Itinerary = Itin;
+}
+
+class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd,
+                                             II_CACHE>;
+class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd,
+                                             II_PREF>;
+
+class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+                                  RegisterOperand GPROpnd, InstrItinClass Itin>
+    : CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, GPROpnd, Itin> {
+  string DecoderMethod = "DecodePrefeOpMM";
+}
+
+class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9,
+                                                    GPR32Opnd, II_PREFE>;
+class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9,
+                                                     GPR32Opnd, II_CACHEE>;
+
+class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+                            RegisterOperand GPROpnd, InstrItinClass Itin>
+    : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  string DecoderMethod = "DecodeLoadByte15";
+  bit mayLoad = 1;
+  InstrItinClass Itinerary = Itin;
+}
+class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd, II_LB>;
+class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd,
+                                            II_LBU>;
+
+class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd,
+                              RegisterOperand GPROpnd, InstrItinClass Itin>
+    : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd, Itin> {
+  let DecoderMethod = "DecodeLoadByte9";
+}
+class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd,
+                                              II_LBE>;
+class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd,
+                                               II_LBUE>;
+
+class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                             InstrItinClass Itin> : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  InstrItinClass Itinerary = Itin;
+}
+
+class CLO_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clo", GPR32Opnd, II_CLO>;
+class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd, II_CLZ>;
+
+class EHB_MMR6_DESC : Barrier<"ehb", II_EHB>;
+class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd, II_EI>;
+class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd, II_DI>;
+
+class ERET_MMR6_DESC : ER_FT<"eret", II_ERET>;
+class DERET_MMR6_DESC : ER_FT<"deret", II_DERET>;
+class ERETNC_MMR6_DESC : ER_FT<"eretnc", II_ERETNC>;
+
+class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
+    : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+                      [(MipsJmpLink RO:$rs)], II_JALR, FrmR>,
+      MMR6Arch<opstr>, MicroMipsR6Inst16 {
+  let isCall = 1;
+  let hasDelaySlot = 0;
+  let Defs = [RA];
+}
+class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>;
+
+class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
+                                     RegisterOperand GPROpnd,
+                                     InstrItinClass Itin>
+    : MMR6Arch<opstr> {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  string AsmString = !strconcat(opstr, "\t$rt, $offset");
+  list<dag> Pattern = [];
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+  InstrItinClass Itinerary = Itin;
+}
+
+class JIALC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+                                                       GPR32Opnd, II_JIALC> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
+                                                     GPR32Opnd, II_JIC> {
+  bit isBarrier = 1;
+  list<Register> Defs = [AT];
+}
+
+class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO>
+    : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+                      [], II_JR, FrmR>,
+      MMR6Arch<opstr>, MicroMipsR6Inst16 {
+  let hasDelaySlot = 0;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>;
+
+class JRCADDIUSP_MMR6_DESC
+    : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm",
+                      [], II_JRADDIUSP, FrmR>,
+      MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 {
+  let hasDelaySlot = 0;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd, InstrItinClass Itin>
+    : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class ALIGN_MMR6_DESC : ALIGN_MMR6_DESC_BASE<"align", GPR32Opnd, uimm2,
+                                             II_ALIGN>;
+
+class AUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                         InstrItinClass Itin> : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class AUI_MMR6_DESC : AUI_MMR6_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
+
+class SEB_MMR6_DESC : SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>;
+class SEH_MMR6_DESC : SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>;
+class ALUIPC_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                            InstrItinClass Itin> : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class ALUIPC_MMR6_DESC : ALUIPC_MMR6_DESC_BASE<"aluipc", GPR32Opnd, II_ALUIPC>;
+class AUIPC_MMR6_DESC : ALUIPC_MMR6_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
+
+class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                         Operand ImmOpnd, InstrItinClass Itin>
+    : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $rd, $imm2");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1, II_LSA>;
+
+class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                           Operand ImmOpnd, InstrItinClass Itin>
+    : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins ImmOpnd:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class ADDIUPC_MMR6_DESC : PCREL_MMR6_DESC_BASE<"addiupc", GPR32Opnd,
+                                               simm19_lsl2, II_ADDIUPC>;
+class LWPC_MMR6_DESC: PCREL_MMR6_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2,
+                                           II_LWPC>;
+
+class LWP_MMR6_DESC : MMR6Arch<"lwp"> {
+  dag OutOperandList = (outs regpair:$rd);
+  dag InOperandList = (ins mem_simm12:$addr);
+  string AsmString = !strconcat("lwp", "\t$rd, $addr");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_LWP;
+  ComplexPattern Addr = addr;
+  Format f = FrmI;
+  string BaseOpcode = "lwp";
+  string DecoderMethod = "DecodeMemMMImm12";
+  bit mayLoad = 1;
+}
+
+class SWP_MMR6_DESC : MMR6Arch<"swp"> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins regpair:$rd, mem_simm12:$addr);
+  string AsmString = !strconcat("swp", "\t$rd, $addr");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SWP;
+  ComplexPattern Addr = addr;
+  Format f = FrmI;
+  string BaseOpcode = "swp";
+  string DecoderMethod = "DecodeMemMMImm12";
+  bit mayStore = 1;
+}
+
+class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                               InstrItinClass Itin> : MMR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd,
+                                                  II_SELCCZ>;
+class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd,
+                                                  II_SELCCZ>;
+class PAUSE_MMR6_DESC : Barrier<"pause", II_PAUSE>;
+class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel);
+  string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_RDHWR;
+  Format Form = FrmR;
+}
+
+class WAIT_MMR6_DESC : WaitMM<"wait">;
+// FIXME: ssnop should not be defined for R6. Per MD000582 microMIPS32 6.03:
+//        Assemblers targeting specifically Release 6 should reject the SSNOP
+//        instruction with an error.
+class SSNOP_MMR6_DESC : Barrier<"ssnop", II_SSNOP>;
+class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>;
+
+class DIVMOD_MMR6_DESC_BASE<string opstr, RegisterOperand GPROpnd,
+                            InstrItinClass Itin,
+                            SDPatternOperator OpNode=null_frag>
+    : MipsR6Inst {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set GPROpnd:$rd, (OpNode GPROpnd:$rs, GPROpnd:$rt))];
+  string BaseOpcode = opstr;
+  Format f = FrmR;
+  let isCommutable = 0;
+  let isReMaterializable = 1;
+  InstrItinClass Itinerary = Itin;
+
+  // This instruction doesn't trap division by zero itself. We must insert
+  // teq instructions as well.
+  bit usesCustomInserter = 1;
+}
+class DIV_MMR6_DESC  : DIVMOD_MMR6_DESC_BASE<"div", GPR32Opnd, II_DIV, sdiv>;
+class DIVU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"divu", GPR32Opnd, II_DIVU, udiv>;
+class MOD_MMR6_DESC  : DIVMOD_MMR6_DESC_BASE<"mod", GPR32Opnd, II_MOD, srem>;
+class MODU_MMR6_DESC : DIVMOD_MMR6_DESC_BASE<"modu", GPR32Opnd, II_MODU, urem>;
+class AND_MMR6_DESC : ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>;
+class ANDI_MMR6_DESC : ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>;
+class NOR_MMR6_DESC : LogicNOR<"nor", GPR32Opnd>;
+class OR_MMR6_DESC : ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>;
+class ORI_MMR6_DESC : ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+                                  or> {
+  int AddedComplexity = 1;
+}
+class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>;
+class XORI_MMR6_DESC : ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+                                   immZExt16, xor>;
+
+class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator OpNode = null_frag,
+                  ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+}
+class SW_MMR6_DESC : Store<"sw", GPR32Opnd> {
+  InstrItinClass Itinerary = II_SW;
+}
+class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9, II_SWE>;
+
+class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
+                                 InstrItinClass Itin> : MMR6Arch<instr_asm> {
+  dag InOperandList = (ins RO:$rs);
+  dag OutOperandList = (outs RO:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  list<dag> Pattern = [];
+  Format f = FrmR;
+  string BaseOpcode = instr_asm;
+  bit hasSideEffects = 0;
+  InstrItinClass Itinerary = Itin;
+}
+class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd,
+                                                    II_WRPGPR>;
+class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd, II_WSBH>;
+
+class MTC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+                         RegisterOperand SrcRC, InstrItinClass Itin> {
+  dag InOperandList = (ins SrcRC:$rt, uimm3:$sel);
+  dag OutOperandList = (outs DstRC:$rs);
+  string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+  list<dag> Pattern = [];
+  Format f = FrmFR;
+  string BaseOpcode = opstr;
+  InstrItinClass Itinerary = Itin;
+}
+class MTC1_MMR6_DESC_BASE<
+      string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+      InstrItinClass Itin = NoItinerary, SDPatternOperator OpNode = null_frag>
+      : MipsR6Inst {
+  dag InOperandList = (ins SrcRC:$rt);
+  dag OutOperandList = (outs DstRC:$fs);
+  string AsmString = !strconcat(opstr, "\t$rt, $fs");
+  list<dag> Pattern = [(set DstRC:$fs, (OpNode SrcRC:$rt))];
+  Format f = FrmFR;
+  InstrItinClass Itinerary = Itin;
+  string BaseOpcode = opstr;
+}
+class MTC1_64_MMR6_DESC_BASE<
+      string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+      InstrItinClass Itin = NoItinerary> : MipsR6Inst {
+  dag InOperandList = (ins DstRC:$fs_in, SrcRC:$rt);
+  dag OutOperandList = (outs DstRC:$fs);
+  string AsmString = !strconcat(opstr, "\t$rt, $fs");
+  list<dag> Pattern = [];
+  Format f = FrmFR;
+  InstrItinClass Itinerary = Itin;
+  string BaseOpcode = opstr;
+  // $fs_in is part of a white lie to work around a widespread bug in the FPU
+  // implementation. See expandBuildPairF64 for details.
+  let Constraints = "$fs = $fs_in";
+}
+class MTC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+                         RegisterOperand SrcRC, InstrItinClass Itin> {
+  dag InOperandList = (ins SrcRC:$rt);
+  dag OutOperandList = (outs DstRC:$impl);
+  string AsmString = !strconcat(opstr, "\t$rt, $impl");
+  list<dag> Pattern = [];
+  Format f = FrmFR;
+  string BaseOpcode = opstr;
+  InstrItinClass Itinerary = Itin;
+}
+
+class MTC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mtc0", COP0Opnd, GPR32Opnd,
+                                           II_MTC0>;
+class MTC1_MMR6_DESC : MTC1_MMR6_DESC_BASE<"mtc1", FGR32Opnd, GPR32Opnd,
+                                           II_MTC1, bitconvert>, HARDFLOAT;
+class MTC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mtc2", COP2Opnd, GPR32Opnd,
+                                           II_MTC2>;
+class MTHC0_MMR6_DESC : MTC0_MMR6_DESC_BASE<"mthc0", COP0Opnd, GPR32Opnd,
+                                            II_MTHC0>;
+class MTHC1_D32_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", AFGR64Opnd,
+                                                   GPR32Opnd, II_MTC1>,
+                            HARDFLOAT, FGR_32;
+class MTHC1_D64_MMR6_DESC : MTC1_64_MMR6_DESC_BASE<"mthc1", FGR64Opnd,
+                                                   GPR32Opnd, II_MTC1>,
+                            HARDFLOAT, FGR_64;
+class MTHC2_MMR6_DESC : MTC2_MMR6_DESC_BASE<"mthc2", COP2Opnd, GPR32Opnd,
+                                            II_MTC2>;
+
+class MFC0_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+                          RegisterOperand SrcRC, InstrItinClass Itin> {
+  dag InOperandList = (ins SrcRC:$rs, uimm3:$sel);
+  dag OutOperandList = (outs DstRC:$rt);
+  string AsmString = !strconcat(opstr, "\t$rt, $rs, $sel");
+  list<dag> Pattern = [];
+  Format f = FrmFR;
+  string BaseOpcode = opstr;
+  InstrItinClass Itinerary = Itin;
+}
+class MFC1_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+                          RegisterOperand SrcRC,
+                          InstrItinClass Itin = NoItinerary,
+                          SDPatternOperator OpNode = null_frag> : MipsR6Inst {
+  dag InOperandList = (ins SrcRC:$fs);
+  dag OutOperandList = (outs DstRC:$rt);
+  string AsmString = !strconcat(opstr, "\t$rt, $fs");
+  list<dag> Pattern = [(set DstRC:$rt, (OpNode SrcRC:$fs))];
+  Format f = FrmFR;
+  InstrItinClass Itinerary = Itin;
+  string BaseOpcode = opstr;
+}
+class MFC2_MMR6_DESC_BASE<string opstr, RegisterOperand DstRC,
+                          RegisterOperand SrcRC, InstrItinClass Itin> {
+  dag InOperandList = (ins SrcRC:$impl);
+  dag OutOperandList = (outs DstRC:$rt);
+  string AsmString = !strconcat(opstr, "\t$rt, $impl");
+  list<dag> Pattern = [];
+  Format f = FrmFR;
+  string BaseOpcode = opstr;
+  InstrItinClass Itinerary = Itin;
+}
+class MFC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfc0", GPR32Opnd, COP0Opnd,
+                                           II_MFC0>;
+class MFC1_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfc1", GPR32Opnd, FGR32Opnd,
+                                           II_MFC1, bitconvert>, HARDFLOAT;
+class MFC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfc2", GPR32Opnd, COP2Opnd,
+                                           II_MFC2>;
+class MFHC0_MMR6_DESC : MFC0_MMR6_DESC_BASE<"mfhc0", GPR32Opnd, COP0Opnd,
+                                            II_MFHC0>;
+class MFHC1_D32_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, AFGR64Opnd,
+                                                II_MFHC1>, HARDFLOAT, FGR_32;
+class MFHC1_D64_MMR6_DESC : MFC1_MMR6_DESC_BASE<"mfhc1", GPR32Opnd, FGR64Opnd,
+                                                II_MFHC1>, HARDFLOAT, FGR_64;
+class MFHC2_MMR6_DESC : MFC2_MMR6_DESC_BASE<"mfhc2", GPR32Opnd, COP2Opnd,
+                                            II_MFC2>;
+
+class LDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+  dag InOperandList = (ins mem_mm_16:$addr);
+  dag OutOperandList = (outs FGR64Opnd:$ft);
+  string AsmString = !strconcat("ldc1", "\t$ft, $addr");
+  list<dag> Pattern = [(set FGR64Opnd:$ft, (load addrimm16:$addr))];
+  Format f = FrmFI;
+  InstrItinClass Itinerary = II_LDC1;
+  string BaseOpcode = "ldc1";
+  bit mayLoad = 1;
+  let DecoderMethod = "DecodeFMemMMR2";
+}
+
+class SDC1_D64_MMR6_DESC : MipsR6Inst, HARDFLOAT, FGR_64 {
+  dag InOperandList = (ins FGR64Opnd:$ft, mem_mm_16:$addr);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat("sdc1", "\t$ft, $addr");
+  list<dag> Pattern = [(store FGR64Opnd:$ft, addrimm16:$addr)];
+  Format f = FrmFI;
+  InstrItinClass Itinerary = II_SDC1;
+  string BaseOpcode = "sdc1";
+  bit mayStore = 1;
+  let DecoderMethod = "DecodeFMemMMR2";
+}
+
+class LDC2_LWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+  dag OutOperandList = (outs COP2Opnd:$rt);
+  dag InOperandList = (ins mem_mm_11:$addr);
+  string AsmString = !strconcat(opstr, "\t$rt, $addr");
+  list<dag> Pattern = [(set COP2Opnd:$rt, (load addrimm11:$addr))];
+  Format f = FrmFI;
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = opstr;
+  bit mayLoad = 1;
+  string DecoderMethod = "DecodeFMemCop2MMR6";
+}
+class LDC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"ldc2", II_LDC2>;
+class LWC2_MMR6_DESC : LDC2_LWC2_MMR6_DESC_BASE<"lwc2", II_LWC2>;
+
+class SDC2_SWC2_MMR6_DESC_BASE<string opstr, InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins COP2Opnd:$rt, mem_mm_11:$addr);
+  string AsmString = !strconcat(opstr, "\t$rt, $addr");
+  list<dag> Pattern = [(store COP2Opnd:$rt, addrimm11:$addr)];
+  Format f = FrmFI;
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = opstr;
+  bit mayStore = 1;
+  string DecoderMethod = "DecodeFMemCop2MMR6";
+}
+class SDC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"sdc2", II_SDC2>;
+class SWC2_MMR6_DESC : SDC2_SWC2_MMR6_DESC_BASE<"swc2", II_SWC2>;
+
+/// Floating Point Instructions
+class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC,
+                            InstrItinClass Itin, bit isComm,
+                            SDPatternOperator OpNode = null_frag> : HARDFLOAT {
+  dag OutOperandList = (outs RC:$fd);
+  dag InOperandList = (ins RC:$ft, RC:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))];
+  InstrItinClass Itinerary = Itin;
+  bit isCommutable = isComm;
+}
+class FADD_S_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>;
+class FADD_D_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>;
+class FSUB_S_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>;
+class FSUB_D_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>;
+class FMUL_S_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>;
+class FMUL_D_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>;
+class FDIV_S_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>;
+class FDIV_D_MMR6_DESC
+  : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>;
+class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd,
+                                            II_MADDF_S>, HARDFLOAT;
+class MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd,
+                                            II_MADDF_D>, HARDFLOAT;
+class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd,
+                                            II_MSUBF_S>, HARDFLOAT;
+class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd,
+                                            II_MSUBF_D>, HARDFLOAT;
+
+class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+                               RegisterOperand SrcRC, InstrItinClass Itin,
+                               SDPatternOperator OpNode = null_frag>
+                               : HARDFLOAT, NeverHasSideEffects {
+  dag OutOperandList = (outs DstRC:$ft);
+  dag InOperandList = (ins SrcRC:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+  list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+  InstrItinClass Itinerary = Itin;
+  Format Form = FrmFR;
+}
+class FMOV_S_MMR6_DESC
+  : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>;
+class FMOV_D_MMR6_DESC
+  : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>;
+class FNEG_S_MMR6_DESC
+  : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>;
+class FNEG_D_MMR6_DESC
+  : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>;
+
+class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>,
+                        HARDFLOAT;
+class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd, II_MAX_D>,
+                        HARDFLOAT;
+class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd, II_MIN_S>,
+                        HARDFLOAT;
+class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd, II_MIN_D>,
+                        HARDFLOAT;
+
+class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd, II_MAXA_S>,
+                         HARDFLOAT;
+class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd, II_MAXA_D>,
+                         HARDFLOAT;
+class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd, II_MINA_S>,
+                         HARDFLOAT;
+class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd, II_MINA_D>,
+                         HARDFLOAT;
+
+class CVT_MMR6_DESC_BASE<
+    string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC,
+    InstrItinClass Itin, SDPatternOperator OpNode = null_frag>
+    : HARDFLOAT, NeverHasSideEffects {
+  dag OutOperandList = (outs DstRC:$ft);
+  dag InOperandList = (ins SrcRC:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$ft, $fs");
+  list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+  InstrItinClass Itinerary = Itin;
+  Format Form = FrmFR;
+}
+
+class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd,
+                                             II_CVT>;
+class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd,
+                                             II_CVT>;
+class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd,
+                                             II_CVT>;
+class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd,
+                                             II_CVT>;
+class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd,
+                                             II_CVT>;
+class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd,
+                                             II_CVT>;
+class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd,
+                                             II_CVT>, FGR_64;
+class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd,
+                                             II_CVT>;
+class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd,
+                                             II_CVT>;
+class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd,
+                                             II_CVT>, FGR_64;
+
+multiclass CMP_CC_MMR6<bits<6> format, string Typestr,
+                       RegisterOperand FGROpnd, InstrItinClass Itin> {
+  def CMP_AF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>,
+      CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_UN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>,
+      CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, Itin, setuo>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_EQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>,
+      CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, Itin, setoeq>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_UEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>,
+      CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, Itin, setueq>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_LT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>,
+      CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, Itin, setolt>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_ULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>,
+      CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, Itin, setult>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_LE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>,
+      CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, Itin, setole>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_ULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>,
+      CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, Itin, setule>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SAF_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>,
+      CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SUN_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>,
+      CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>,
+      CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SUEQ_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>,
+      CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SLT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>,
+      CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SULT_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>,
+      CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SLE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>,
+      CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+  def CMP_SULE_#NAME : R6MMR6Rel, POOL32F_CMP_FM<
+      !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>,
+      CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd, Itin>, HARDFLOAT,
+      ISA_MICROMIPS32R6;
+}
+
+class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC,
+                             RegisterOperand SrcRC, InstrItinClass Itin,
+                             SDPatternOperator OpNode = null_frag>
+    : HARDFLOAT, NeverHasSideEffects {
+  dag OutOperandList = (outs DstRC:$ft);
+  dag InOperandList  = (ins SrcRC:$fs);
+  string AsmString   = !strconcat(instr_asm, "\t$ft, $fs");
+  list<dag>  Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))];
+  InstrItinClass Itinerary = Itin;
+  Format Form = FrmFR;
+  list<Predicate> EncodingPredicates = [HasStdEnc];
+}
+
+class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd,
+                                                II_ABS, fabs>;
+class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd,
+                                                II_ABS, fabs>;
+class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd,
+                                                    FGR32Opnd, II_FLOOR>;
+class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", FGR64Opnd,
+                                                    FGR64Opnd, II_FLOOR>;
+class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd,
+                                                    FGR32Opnd, II_FLOOR>;
+class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd,
+                                                    AFGR64Opnd, II_FLOOR>;
+class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd,
+                                                   FGR32Opnd, II_CEIL>;
+class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd,
+                                                   FGR64Opnd, II_CEIL>;
+class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd,
+                                                   FGR32Opnd, II_CEIL>;
+class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd,
+                                                   AFGR64Opnd, II_CEIL>;
+class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd,
+                                                    FGR32Opnd, II_TRUNC>;
+class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd,
+                                                    FGR64Opnd, II_TRUNC>;
+class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd,
+                                                    FGR32Opnd, II_TRUNC>;
+class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd,
+                                                    AFGR64Opnd, II_TRUNC>;
+class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd,
+                                                 II_SQRT_S, fsqrt>;
+class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd,
+                                                 II_SQRT_D, fsqrt>;
+class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd,
+                                                   FGR32Opnd, II_ROUND>;
+class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd,
+                                                   FGR64Opnd, II_ROUND>;
+class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd,
+                                                   FGR32Opnd, II_ROUND>;
+class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd,
+                                                   FGR64Opnd, II_ROUND>;
+
+class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd, II_SEL_S>;
+class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd, II_SEL_D> {
+  // We must insert a SUBREG_TO_REG around $fd_in
+  bit usesCustomInserter = 1;
+}
+
+class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd,
+                                              II_SELCCZ_S>;
+class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd,
+                                              II_SELCCZ_D>;
+class SELNEZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd,
+                                              II_SELCCZ_S>;
+class SELNEZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd,
+                                              II_SELCCZ_D>;
+class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd,
+                                              II_RINT_S>;
+class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd,
+                                              II_RINT_S>;
+class CLASS_S_MMR6_DESC  : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd,
+                                              II_CLASS_S>;
+class CLASS_D_MMR6_DESC  : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd,
+                                              II_CLASS_S>;
+
+class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO,
+                           InstrItinClass Itin>
+    : Store<opstr, RO>, MMR6Arch<opstr> {
+  let DecoderMethod = "DecodeMemMMImm16";
+  InstrItinClass Itinerary = Itin;
+}
+class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd, II_SB>;
+
+class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
+                               InstrItinClass Itin>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins RO:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  string DecoderMethod = "DecodeStoreEvaOpMM";
+  bit mayStore = 1;
+  InstrItinClass Itinerary = Itin;
+}
+class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd, II_SBE>;
+class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
+class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd, II_SH>;
+class SHE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd, II_SHE>;
+class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO,
+                                   InstrItinClass Itin>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs RO:$rt);
+  dag InOperandList = (ins mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  string DecoderMethod = "DecodeMemMMImm9";
+  bit mayLoad = 1;
+  InstrItinClass Itinerary = Itin;
+}
+class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
+class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd, II_LWE>;
+class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+      MMR6Arch<"addu16"> {
+  int AddedComplexity = 1;
+}
+class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+      MMR6Arch<"and16"> {
+  int AddedComplexity = 1;
+}
+class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>,
+      MMR6Arch<"andi16">;
+class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16"> {
+  int AddedComplexity = 1;
+}
+class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
+      MMR6Arch<"or16"> {
+  int AddedComplexity = 1;
+}
+class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+      MMR6Arch<"sll16">;
+class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+      MMR6Arch<"srl16">;
+class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16", II_BREAK>, MMR6Arch<"break16">,
+      MicroMipsR6Inst16;
+class LI16_MMR6_DESC : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>,
+      MMR6Arch<"li16">, MicroMipsR6Inst16, IsAsCheapAsAMove;
+class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"move16">,
+      MicroMipsR6Inst16;
+class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, MMR6Arch<"sdbbp16">,
+      MicroMipsR6Inst16;
+class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+      MMR6Arch<"subu16">, MicroMipsR6Inst16 {
+  int AddedComplexity = 1;
+}
+class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+      MMR6Arch<"xor16"> {
+  int AddedComplexity = 1;
+}
+
+class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins mem:$addr);
+  string AsmString = "lw\t$rt, $addr";
+  let DecoderMethod = "DecodeMemMMImm16";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))];
+  InstrItinClass Itinerary = II_LW;
+}
+
+class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins uimm16:$imm16);
+  string AsmString = "lui\t$rt, $imm16";
+  list<dag> Pattern = [];
+  bit hasSideEffects = 0;
+  bit isReMaterializable = 1;
+  InstrItinClass Itinerary = II_LUI;
+  Format Form = FrmI;
+}
+
+class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm5:$stype);
+  string AsmString = !strconcat("sync", "\t$stype");
+  list<dag> Pattern = [(MipsSync immZExt5:$stype)];
+  InstrItinClass Itinerary = II_SYNC;
+  bit HasSideEffects = 1;
+}
+
+class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> {
+  let DecoderMethod = "DecodeSynciR6";
+}
+
+class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rd);
+  string AsmString = !strconcat("rdpgpr", "\t$rt, $rd");
+  InstrItinClass Itinerary = II_RDPGPR;
+}
+
+class SDBBP_MMR6_DESC : MipsR6Inst {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm20:$code_);
+  string AsmString = !strconcat("sdbbp", "\t$code_");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SDBBP;
+}
+
+class LWM16_MMR6_DESC
+    : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+                      !strconcat("lwm16", "\t$rt, $addr"), [],
+                      II_LWM, FrmI>,
+      MMR6Arch<"lwm16">, MicroMipsR6Inst16 {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayLoad = 1;
+  ComplexPattern Addr = addr;
+}
+
+class SWM16_MMR6_DESC
+    : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+                      !strconcat("swm16", "\t$rt, $addr"), [],
+                      II_SWM, FrmI>,
+      MMR6Arch<"swm16">, MicroMipsR6Inst16 {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayStore = 1;
+  ComplexPattern Addr = addr;
+}
+
+class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO,
+                          SDPatternOperator OpNode, InstrItinClass Itin,
+                          Operand MemOpnd>
+    : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+                      !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>,
+      MMR6Arch<opstr>, MicroMipsR6Inst16 {
+  let DecoderMethod = "DecodeMemMMImm4";
+  let mayStore = 1;
+}
+class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd,
+                                           truncstorei8, II_SB, mem_mm_4>;
+class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd,
+                                           truncstorei16, II_SH, mem_mm_4_lsl1>;
+class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd,
+                                           store, II_SW, mem_mm_4_lsl2>;
+
+class SWSP_MMR6_DESC
+    : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset),
+                      !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>,
+      MMR6Arch<"sw">, MicroMipsR6Inst16 {
+  let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+  let mayStore = 1;
+}
+
+class JALRC_HB_MMR6_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs);
+  string AsmString = !strconcat("jalrc.hb", "\t$rt, $rs");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_JALR_HB;
+  Format Form = FrmJ;
+  bit isIndirectBranch = 1;
+  bit hasDelaySlot = 0;
+}
+
+class TLBINV_MMR6_DESC_BASE<string opstr, InstrItinClass Itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins);
+  string AsmString = opstr;
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+
+class TLBINV_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinv", II_TLBINV>;
+class TLBINVF_MMR6_DESC : TLBINV_MMR6_DESC_BASE<"tlbinvf", II_TLBINVF>;
+
+class DVPEVP_MMR6_DESC_BASE<string opstr, InstrItinClass Itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rs);
+  dag InOperandList = (ins);
+  string AsmString = !strconcat(opstr, "\t$rs");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+  bit hasUnModeledSideEffects = 1;
+}
+
+class DVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"dvp", II_DVP>;
+class EVP_MMR6_DESC : DVPEVP_MMR6_DESC_BASE<"evp", II_EVP>;
+
+class BEQZC_MMR6_DESC
+    : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21_mm, GPR32Opnd>,
+      MMR6Arch<"beqzc">;
+class BNEZC_MMR6_DESC
+    : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21_mm, GPR32Opnd>,
+      MMR6Arch<"bnezc">;
+
+class BRANCH_COP1_MMR6_DESC_BASE<string opstr> :
+    InstSE<(outs), (ins FGR64Opnd:$rt, brtarget_mm:$offset),
+           !strconcat(opstr, "\t$rt, $offset"), [], II_BC1CCZ, FrmI>,
+    HARDFLOAT, BRANCH_DESC_BASE {
+  list<Register> Defs = [AT];
+}
+
+class BC1EQZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1eqzc">;
+class BC1NEZC_MMR6_DESC : BRANCH_COP1_MMR6_DESC_BASE<"bc1nezc">;
+
+class BRANCH_COP2_MMR6_DESC_BASE<string opstr, InstrItinClass Itin>
+    : BRANCH_DESC_BASE {
+  dag InOperandList = (ins COP2Opnd:$rt, brtarget_mm:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(opstr, "\t$rt, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = Itin;
+}
+
+class BC2EQZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2eqzc", II_BC2CCZ>;
+class BC2NEZC_MMR6_DESC : BRANCH_COP2_MMR6_DESC_BASE<"bc2nezc", II_BC2CCZ>;
+
+class EXT_MMR6_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_plus1:$size);
+  string AsmString = !strconcat("ext", "\t$rt, $rs, $pos, $size");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsExt GPR32Opnd:$rs, imm:$pos,
+                       imm:$size))];
+  InstrItinClass Itinerary = II_EXT;
+  Format Form = FrmR;
+  string BaseOpcode = "ext";
+}
+
+class INS_MMR6_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$pos, uimm5_inssize_plus1:$size,
+                       GPR32Opnd:$src);
+  string AsmString = !strconcat("ins", "\t$rt, $rs, $pos, $size");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (MipsIns GPR32Opnd:$rs, imm:$pos,
+                       imm:$size, GPR32Opnd:$src))];
+  InstrItinClass Itinerary = II_INS;
+  Format Form = FrmR;
+  string BaseOpcode = "ins";
+  string Constraints = "$src = $rt";
+}
+
+class JALRC_MMR6_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs);
+  string AsmString = !strconcat("jalrc", "\t$rt, $rs");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_JALRC;
+  bit isCall = 1;
+  bit hasDelaySlot = 0;
+  list<Register> Defs = [RA];
+}
+
+class BOVC_BNVC_MMR6_DESC_BASE<string instr_asm, Operand opnd,
+                               RegisterOperand GPROpnd>
+    : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rt, GPROpnd:$rs, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCC;
+}
+
+class BOVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bovc", brtargetr6, GPR32Opnd>;
+class BNVC_MMR6_DESC : BOVC_BNVC_MMR6_DESC_BASE<"bnvc", brtargetr6, GPR32Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "MicroMipsR6" in {
+def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDIUPC_MMR6 : R6MMR6Rel, ADDIUPC_MMR6_ENC, ADDIUPC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def ALUIPC_MMR6 : R6MMR6Rel, ALUIPC_MMR6_ENC, ALUIPC_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def AND_MMR6 : StdMMR6Rel, AND_MMR6_DESC, AND_MMR6_ENC, ISA_MICROMIPS32R6;
+def ANDI_MMR6 : StdMMR6Rel, ANDI_MMR6_DESC, ANDI_MMR6_ENC, ISA_MICROMIPS32R6;
+def AUIPC_MMR6 : R6MMR6Rel, AUIPC_MMR6_ENC, AUIPC_MMR6_DESC, ISA_MICROMIPS32R6;
+def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6;
+def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6;
+def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def BEQZC_MMR6 : R6MMR6Rel, BEQZC_MMR6_ENC, BEQZC_MMR6_DESC,
+                 ISA_MICROMIPS32R6;
+def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def BNEZC_MMR6 : R6MMR6Rel, BNEZC_MMR6_ENC, BNEZC_MMR6_DESC,
+                 ISA_MICROMIPS32R6;
+def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BNEZALC_MMR6 : R6MMR6Rel, BNEZALC_MMR6_ENC, BNEZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BREAK_MMR6 : StdMMR6Rel, BRK_MMR6_DESC, BRK_MMR6_ENC, ISA_MICROMIPS32R6;
+def CACHE_MMR6 : R6MMR6Rel, CACHE_MMR6_ENC, CACHE_MMR6_DESC, ISA_MICROMIPS32R6;
+def CLO_MMR6 : R6MMR6Rel, CLO_MMR6_ENC, CLO_MMR6_DESC, ISA_MICROMIPS32R6;
+def CLZ_MMR6 : R6MMR6Rel, CLZ_MMR6_ENC, CLZ_MMR6_DESC, ISA_MICROMIPS32R6;
+def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6;
+def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6;
+def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6;
+def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6;
+def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6;
+def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6;
+def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, JIALC_MMR6_DESC, ISA_MICROMIPS32R6;
+def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6;
+def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6;
+def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC,
+                      ISA_MICROMIPS32R6;
+def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWP_MMR6 : StdMMR6Rel, LWP_MMR6_ENC, LWP_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC0_MMR6 : StdMMR6Rel, MTC0_MMR6_ENC, MTC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTC1_MMR6 : StdMMR6Rel, MTC1_MMR6_DESC, MTC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MTC2_MMR6 : StdMMR6Rel, MTC2_MMR6_ENC, MTC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC0_MMR6 : R6MMR6Rel, MTHC0_MMR6_ENC, MTHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MTHC1_D32_MMR6 : StdMMR6Rel, MTHC1_D32_MMR6_DESC, MTHC1_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+  def MTHC1_D64_MMR6 : R6MMR6Rel, MTHC1_D64_MMR6_DESC, MTHC1_MMR6_ENC,
+                       ISA_MICROMIPS32R6;
+}
+def MTHC2_MMR6 : StdMMR6Rel, MTHC2_MMR6_ENC, MTHC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFC0_MMR6 : StdMMR6Rel, MFC0_MMR6_ENC, MFC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFC1_MMR6 : StdMMR6Rel, MFC1_MMR6_DESC, MFC1_MMR6_ENC, ISA_MICROMIPS32R6;
+def MFC2_MMR6 : StdMMR6Rel, MFC2_MMR6_ENC, MFC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC0_MMR6 : R6MMR6Rel, MFHC0_MMR6_ENC, MFHC0_MMR6_DESC, ISA_MICROMIPS32R6;
+def MFHC1_D32_MMR6 : StdMMR6Rel, MFHC1_D32_MMR6_DESC, MFHC1_MMR6_ENC,
+                     ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+  def MFHC1_D64_MMR6 : StdMMR6Rel, MFHC1_D64_MMR6_DESC, MFHC1_MMR6_ENC,
+                       ISA_MICROMIPS32R6;
+}
+def MFHC2_MMR6 : StdMMR6Rel, MFHC2_MMR6_ENC, MFHC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6;
+def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUH_MMR6 : R6MMR6Rel, MUH_MMR6_DESC, MUH_MMR6_ENC, ISA_MICROMIPS32R6;
+def MULU_MMR6 : R6MMR6Rel, MULU_MMR6_DESC, MULU_MMR6_ENC, ISA_MICROMIPS32R6;
+def MUHU_MMR6 : R6MMR6Rel, MUHU_MMR6_DESC, MUHU_MMR6_ENC, ISA_MICROMIPS32R6;
+def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6;
+def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6;
+def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6;
+def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6;
+def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6;
+def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6;
+def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6;
+def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SWP_MMR6 : StdMMR6Rel, SWP_MMR6_ENC, SWP_MMR6_DESC, ISA_MICROMIPS32R6;
+def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6;
+def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6;
+def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6;
+def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6;
+def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6;
+def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6;
+def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6;
+def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6;
+def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6;
+def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6;
+def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
+let DecoderMethod = "DecodeMemMMImm16" in {
+  def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+let DecoderMethod = "DecodeMemMMImm9" in {
+  def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6;
+}
+/// Floating Point Instructions
+def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd, II_CMP_CC_S>;
+defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd, II_CMP_CC_D>;
+def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def FLOOR_L_D_MMR6 : StdMMR6Rel, FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6;
+def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6;
+def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6;
+def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6;
+def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6;
+def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6;
+def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC,
+                ISA_MICROMIPS32R6;
+def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC,
+                  ISA_MICROMIPS32R6;
+def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC,
+                 ISA_MICROMIPS32R6;
+def JALRC_HB_MMR6 : R6MMR6Rel, JALRC_HB_MMR6_ENC, JALRC_HB_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def EXT_MMR6 : StdMMR6Rel, EXT_MMR6_ENC, EXT_MMR6_DESC, ISA_MICROMIPS32R6;
+def INS_MMR6 : StdMMR6Rel, INS_MMR6_ENC, INS_MMR6_DESC, ISA_MICROMIPS32R6;
+def JALRC_MMR6 : R6MMR6Rel, JALRC_MMR6_ENC, JALRC_MMR6_DESC, ISA_MICROMIPS32R6;
+def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, ROUND_L_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC,
+                     ISA_MICROMIPS32R6;
+def SEL_S_MMR6 : R6MMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6;
+def SEL_D_MMR6 : R6MMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6;
+def SELEQZ_S_MMR6 : R6MMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELEQZ_D_MMR6 : R6MMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELNEZ_S_MMR6 : R6MMR6Rel, SELNEZ_S_MMR6_ENC, SELNEZ_S_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def SELNEZ_D_MMR6 : R6MMR6Rel, SELNEZ_D_MMR6_ENC, SELNEZ_D_MMR6_DESC,
+                    ISA_MICROMIPS32R6;
+def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def TLBINV_MMR6 : StdMMR6Rel, TLBINV_MMR6_ENC, TLBINV_MMR6_DESC,
+                  ISA_MICROMIPS32R6;
+def TLBINVF_MMR6 : StdMMR6Rel, TLBINVF_MMR6_ENC, TLBINVF_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def DVP_MMR6 : R6MMR6Rel, DVP_MMR6_ENC, DVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def EVP_MMR6 : R6MMR6Rel, EVP_MMR6_ENC, EVP_MMR6_DESC, ISA_MICROMIPS32R6;
+def BC1EQZC_MMR6 : R6MMR6Rel, BC1EQZC_MMR6_DESC, BC1EQZC_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def BC1NEZC_MMR6 : R6MMR6Rel, BC1NEZC_MMR6_DESC, BC1NEZC_MMR6_ENC,
+                   ISA_MICROMIPS32R6;
+def BC2EQZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2EQZC_MMR6_ENC, BC2EQZC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BC2NEZC_MMR6 : R6MMR6Rel, MipsR6Inst, BC2NEZC_MMR6_ENC, BC2NEZC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+let DecoderNamespace = "MicroMips32r6FP64" in {
+  def LDC1_D64_MMR6 : StdMMR6Rel, LDC1_D64_MMR6_DESC, LDC1_MMR6_ENC,
+                      ISA_MICROMIPS32R6 {
+    let BaseOpcode = "LDC164";
+  }
+  def SDC1_D64_MMR6 : StdMMR6Rel, SDC1_D64_MMR6_DESC, SDC1_MMR6_ENC,
+                      ISA_MICROMIPS32R6;
+}
+def LDC2_MMR6 : StdMMR6Rel, LDC2_MMR6_ENC, LDC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def SDC2_MMR6 : StdMMR6Rel, SDC2_MMR6_ENC, SDC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def LWC2_MMR6 : StdMMR6Rel, LWC2_MMR6_ENC, LWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+def SWC2_MMR6 : StdMMR6Rel, SWC2_MMR6_ENC, SWC2_MMR6_DESC, ISA_MICROMIPS32R6;
+}
+
+def BOVC_MMR6 : R6MMR6Rel, BOVC_MMR6_ENC, BOVC_MMR6_DESC, ISA_MICROMIPS32R6,
+                MMDecodeDisambiguatedBy<"POP35GroupBranchMMR6">;
+def BNVC_MMR6 : R6MMR6Rel, BNVC_MMR6_ENC, BNVC_MMR6_DESC, ISA_MICROMIPS32R6,
+                MMDecodeDisambiguatedBy<"POP37GroupBranchMMR6">;
+def BGEC_MMR6 : R6MMR6Rel, BGEC_MMR6_ENC, BGEC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEUC_MMR6 : R6MMR6Rel, BGEUC_MMR6_ENC, BGEUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTC_MMR6 : R6MMR6Rel, BLTC_MMR6_ENC, BLTC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLTUC_MMR6 : R6MMR6Rel, BLTUC_MMR6_ENC, BLTUC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BEQC_MMR6 : R6MMR6Rel, BEQC_MMR6_ENC, BEQC_MMR6_DESC, ISA_MICROMIPS32R6,
+                DecodeDisambiguates<"POP35GroupBranchMMR6">;
+def BNEC_MMR6 : R6MMR6Rel, BNEC_MMR6_ENC, BNEC_MMR6_DESC, ISA_MICROMIPS32R6,
+                DecodeDisambiguates<"POP37GroupBranchMMR6">;
+def BLTZC_MMR6 : R6MMR6Rel, BLTZC_MMR6_ENC, BLTZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BLEZC_MMR6 : R6MMR6Rel, BLEZC_MMR6_ENC, BLEZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEZC_MMR6 : R6MMR6Rel, BGEZC_MMR6_ENC, BGEZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGTZC_MMR6 : R6MMR6Rel, BGTZC_MMR6_ENC, BGTZC_MMR6_DESC, ISA_MICROMIPS32R6;
+def BGEZALC_MMR6 : R6MMR6Rel, BGEZALC_MMR6_ENC, BGEZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BGTZALC_MMR6 : R6MMR6Rel, BGTZALC_MMR6_ENC, BGTZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BLEZALC_MMR6 : R6MMR6Rel, BLEZALC_MMR6_ENC, BLEZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+def BLTZALC_MMR6 : R6MMR6Rel, BLTZALC_MMR6_ENC, BLTZALC_MMR6_DESC,
+                   ISA_MICROMIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// MicroMips instruction aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6;
+def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+                                      !strconcat("b", "\t$offset")> {
+  string DecoderNamespace = "MicroMipsR6";
+}
+def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"rdhwr $rt, $rs",
+                    (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mtc0 $rt, $rs",
+                    (MTC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mthc0 $rt, $rs",
+                    (MTHC0_MMR6 COP0Opnd:$rs, GPR32Opnd:$rt, 0), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfc0 $rt, $rs",
+                    (MFC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"mfhc0 $rt, $rs",
+                    (MFHC0_MMR6 GPR32Opnd:$rt, COP0Opnd:$rs, 0), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc.hb $rs", (JALRC_HB_MMR6 RA, GPR32Opnd:$rs), 1>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"dvp", (DVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"evp", (EVP_MMR6 ZERO), 0>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"jalrc $rs", (JALRC_MMR6 RA, GPR32Opnd:$rs), 1>,
+      ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $rt, $imm",
+                    (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"and $rs, $imm",
+                    (ANDI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $rt, $imm",
+                    (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"or $rs, $imm",
+                    (ORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $rt, $imm",
+                    (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"xor $rs, $imm",
+                    (XORI_MMR6 GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"not $rt, $rs",
+                    (NOR_MMR6 GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"seh $rd", (SEH_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                    ISA_MICROMIPS32R6;
+def : MipsInstAlias<"seb $rd", (SEB_MMR6 GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                    ISA_MICROMIPS32R6;
+
+//===----------------------------------------------------------------------===//
+//
+// MicroMips arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+              (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+              (SUBU_MMR6 GPR32:$lhs, GPR32:$rhs)>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+              (OR_MM (SELNEZ_MMR6 i32:$t, i32:$cond),
+                     (SELEQZ_MMR6 i32:$f, i32:$cond))>,
+              ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+              (SELNEZ_MMR6 i32:$t, i32:$cond)>,
+              ISA_MICROMIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+              (SELEQZ_MMR6 i32:$f, i32:$cond)>,
+              ISA_MICROMIPS32R6;
+
+defm : SelectInt_Pats<i32, OR_MM, XORI_MMR6, SLTi_MM, SLTiu_MM, SELEQZ_MMR6,
+                      SELNEZ_MMR6, immZExt16, i32>, ISA_MICROMIPS32R6;
+
+defm S_MMR6 : Cmp_Pats<f32, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+defm D_MMR6 : Cmp_Pats<f64, NOR_MMR6, ZERO>, ISA_MICROMIPS32R6;
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+              (ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
+              ISA_MICROMIPS32R6;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+              (ANDI_MMR6 GPR32:$src, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(i32 immZExt16:$imm),
+              (XORI_MMR6 ZERO, immZExt16:$imm)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPRMM16:$in),
+              (NOT16_MMR6 GPRMM16:$in)>, ISA_MICROMIPS32R6;
+def : MipsPat<(not GPR32:$in),
+              (NOR_MMR6 GPR32Opnd:$in, ZERO)>, ISA_MICROMIPS32R6;
+// Patterns for load with a reg+imm operand.
+let AddedComplexity = 41 in {
+  def : LoadRegImmPat<LDC1_D64_MMR6, f64, load>, FGR_64, ISA_MICROMIPS32R6;
+  def : StoreRegImmPat<SDC1_D64_MMR6, f64>, FGR_64, ISA_MICROMIPS32R6;
+}
+
+def TAILCALL_MMR6 : TailCall<BC_MMR6, brtarget26_mm>, ISA_MICROMIPS32R6;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td
new file mode 100644
index 000000000000..26062bfb2b8e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td
@@ -0,0 +1,267 @@
+//=-   MicroMips64r6InstrFormats.td - Instruction Formats  -*- tablegen -*  -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes microMIPS64r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_FM_MMR6 {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b111100;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm;
+}
+
+class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010000;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0} = imm;
+}
+
+class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> size;
+  bits<5> pos;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class POOL32S_DALIGN_FM_MMR6 {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<3> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = bp;
+  let Inst{7-6}   = 0b00;
+  let Inst{5-0}   = 0b011100;
+}
+
+class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct>
+    : MMR6Arch<instr_asm> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b00;
+  let Inst{8-0}  = funct;
+}
+
+class POOL32S_DMFTC0_FM_MMR6<string instr_asm, bits<5> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<3> sel;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0;
+  let Inst{13-11} = sel;
+  let Inst{10-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32S_ARITH_FM_MMR6<string opstr, bits<9> funct>
+    : MMR6Arch<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b00;
+  let Inst{8-0}   = funct;
+}
+
+class DADDIU_FM_MMR6<string opstr> : MMR6Arch<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010111;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class PCREL18_FM_MMR6<bits<3> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<18> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011110;
+  let Inst{25-21} = rt;
+  let Inst{20-18} = funct;
+  let Inst{17-0} = imm;
+}
+
+class POOL32S_2R_FM_MMR6<string instr_asm, bits<10> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6} = funct;
+  let Inst{5-0} = 0b111100;
+}
+
+class POOL32S_2RSA5B0_FM_MMR6<string instr_asm, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = sa;
+  let Inst{10-9} = 0b00;
+  let Inst{8-0} = funct;
+}
+
+class LD_SD_32_2R_OFFSET16_FM_MMR6<string instr_asm, bits<6> op>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32C_2R_OFFSET12_FM_MMR6<string instr_asm, bits<4> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<12> offset = addr{11-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = offset;
+}
+
+class POOL32S_3R_FM_MMR6<string instr_asm, bits<9> funct>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b00;
+  let Inst{8-0}   = funct;
+}
+
+class POOL32S_DBITSWAP_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>,
+      MipsR6Inst {
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rd;
+  let Inst{15-12}  = 0b0000;
+  let Inst{11-6}  = 0b101100;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32S_3RSA_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>,
+      MipsR6Inst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+  bits<2> sa;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-9} = sa;
+  let Inst{8-6} = 0b100;
+  let Inst{5-0} = 0b000100;
+}
+
+class PCREL_1ROFFSET19_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm>,
+      MipsR6Inst {
+  bits<5> rt;
+  bits<19> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011110;
+  let Inst{25-21} = rt;
+  let Inst{20-19} = 0b10;
+  let Inst{18-0} = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
new file mode 100644
index 000000000000..05aad515da46
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -0,0 +1,568 @@
+//=-  MicroMips64r6InstrInfo.td - Instruction Information  -*- tablegen -*- -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes MicroMips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_ENC : DAUI_FM_MMR6;
+class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>;
+class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>;
+class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>;
+class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>;
+class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>;
+class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6;
+class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>;
+class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>;
+class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>;
+class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>;
+class DINSU_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b110100>;
+class DINSM_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b000100>;
+class DINS_MM64R6_ENC : POOL32S_EXTBITS_FM_MMR6<0b001100>;
+class DMTC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmtc0", 0b01011>;
+class DMTC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmtc1", 0b10110000>;
+class DMTC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmtc2", 0b0111110100>;
+class DMFC0_MM64R6_ENC : POOL32S_DMFTC0_FM_MMR6<"dmfc0", 0b00011>;
+class DMFC1_MM64R6_ENC : POOL32F_MFTC1_FM_MMR6<"dmfc1", 0b10010000>;
+class DMFC2_MM64R6_ENC : POOL32A_MFTC2_FM_MMR6<"dmfc2", 0b0110110100>;
+class DADD_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dadd", 0b100010000>;
+class DADDIU_MM64R6_ENC : DADDIU_FM_MMR6<"daddiu">;
+class DADDU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"daddu", 0b101010000>;
+class LDPC_MMR646_ENC : PCREL18_FM_MMR6<0b110>;
+class DSUB_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsub", 0b110010000>;
+class DSUBU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dsubu", 0b111010000>;
+class DMUL_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmul", 0b000011000>;
+class DMUH_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuh", 0b001011000>;
+class DMULU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmulu", 0b010011000>;
+class DMUHU_MM64R6_ENC : POOL32S_ARITH_FM_MMR6<"dmuhu", 0b011011000>;
+class DSBH_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dsbh", 0b0111101100>;
+class DSHD_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dshd", 0b1111101100>;
+class DSLL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll", 0b000000000>;
+class DSLL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsll32", 0b000001000>;
+class DSLLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsllv", 0b000010000>;
+class DSRAV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrav", 0b010010000>;
+class DSRA_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra", 0b010000000>;
+class DSRA32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsra32", 0b010000100>;
+class DCLO_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclo", 0b0100101100>;
+class DCLZ_MM64R6_ENC : POOL32S_2R_FM_MMR6<"dclz", 0b0101101100>;
+class DROTR_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr", 0b011000000>;
+class DROTR32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"drotr32", 0b011001000>;
+class DROTRV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"drotrv", 0b011010000>;
+class LD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"ld", 0b110111>;
+class LLD_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lld", 0b0111>;
+class LWU_MM64R6_ENC : POOL32C_2R_OFFSET12_FM_MMR6<"lwu", 0b1110>;
+class SD_MM64R6_ENC : LD_SD_32_2R_OFFSET16_FM_MMR6<"sd", 0b110110>;
+class DSRL_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl", 0b001000000>;
+class DSRL32_MM64R6_ENC : POOL32S_2RSA5B0_FM_MMR6<"dsrl32", 0b001001000>;
+class DSRLV_MM64R6_ENC : POOL32S_3R_FM_MMR6<"dsrlv", 0b001010000>;
+class DBITSWAP_MM64R6_ENC : POOL32S_DBITSWAP_FM_MMR6<"dbitswap">;
+class DLSA_MM64R6_ENC : POOL32S_3RSA_FM_MMR6<"dlsa">;
+class LWUPC_MM64R6_ENC : PCREL_1ROFFSET19_FM_MMR6<"lwupc">;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                          InstrItinClass Itin>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+}
+class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd, II_DAUI>;
+
+class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                          InstrItinClass Itin>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
+  string Constraints = "$rs = $rt";
+  InstrItinClass Itinerary = Itin;
+}
+class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd, II_DAHI>;
+class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd, II_DATI>;
+
+class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd,
+                        Operand SizeOpnd, SDPatternOperator Op = null_frag>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs RO:$rt);
+  dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size");
+  list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))];
+  InstrItinClass Itinerary = II_EXT;
+  Format Form = FrmR;
+  string BaseOpcode = instr_asm;
+}
+// TODO: Add 'pos + size' constraint check to dext* instructions
+//       DEXT: 0 < pos + size <= 63
+//       DEXTM, DEXTU: 32 < pos + size <= 64
+class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5_report_uimm6,
+                                         uimm5_plus1, MipsExt>;
+class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5,
+                                          uimm5_plus33, MipsExt>;
+class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32,
+                                          uimm5_plus1, MipsExt>;
+
+class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd, InstrItinClass itin>
+    : MMR6Arch<instr_asm>, MipsR6Inst {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3,
+                                          II_DALIGN>;
+
+class DDIV_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddiv", GPR64Opnd, II_DDIV,
+                                               sdiv>;
+class DMOD_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmod", GPR64Opnd, II_DMOD,
+                                               srem>;
+class DDIVU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"ddivu", GPR64Opnd, II_DDIVU,
+                                                udiv>;
+class DMODU_MM64R6_DESC : DIVMOD_MMR6_DESC_BASE<"dmodu", GPR64Opnd, II_DMODU,
+                                                urem>;
+
+class DCLO_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins GPR64Opnd:$rs);
+  string AsmString = !strconcat("dclo", "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz (not GPR64Opnd:$rs)))];
+  InstrItinClass Itinerary = II_DCLO;
+  Format Form = FrmR;
+  string BaseOpcode = "dclo";
+}
+
+class DCLZ_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins GPR64Opnd:$rs);
+  string AsmString = !strconcat("dclz", "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR64Opnd:$rt, (ctlz GPR64Opnd:$rs))];
+  InstrItinClass Itinerary = II_DCLZ;
+  Format Form = FrmR;
+  string BaseOpcode = "dclz";
+}
+
+class DINSU_MM64R6_DESC : InsBase<"dinsu", GPR64Opnd, uimm5_plus32,
+                                  uimm5_inssize_plus1, MipsIns>;
+class DINSM_MM64R6_DESC : InsBase<"dinsm", GPR64Opnd, uimm5, uimm_range_2_64>;
+class DINS_MM64R6_DESC : InsBase<"dins", GPR64Opnd, uimm5, uimm5_inssize_plus1,
+                                 MipsIns>;
+class DMTC0_MM64R6_DESC : MTC0_MMR6_DESC_BASE<"dmtc0", COP0Opnd, GPR64Opnd,
+                                              II_DMTC0>;
+class DMTC1_MM64R6_DESC : MTC1_MMR6_DESC_BASE<"dmtc1", FGR64Opnd, GPR64Opnd,
+                                              II_DMTC1, bitconvert>;
+class DMTC2_MM64R6_DESC : MTC2_MMR6_DESC_BASE<"dmtc2", COP2Opnd, GPR64Opnd,
+                                              II_DMTC2>;
+class DMFC0_MM64R6_DESC : MFC0_MMR6_DESC_BASE<"dmfc0", GPR64Opnd, COP0Opnd,
+                                              II_DMFC0>;
+class DMFC1_MM64R6_DESC : MFC1_MMR6_DESC_BASE<"dmfc1", GPR64Opnd, FGR64Opnd,
+                                              II_DMFC1, bitconvert>;
+class DMFC2_MM64R6_DESC : MFC2_MMR6_DESC_BASE<"dmfc2", GPR64Opnd, COP2Opnd,
+                                              II_DMFC2>;
+class DADD_MM64R6_DESC : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>;
+class DADDIU_MM64R6_DESC : ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+                                       II_DADDIU, immSExt16, add>,
+                           IsAsCheapAsAMove;
+class DADDU_MM64R6_DESC : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>;
+
+class DSUB_DESC_BASE<string instr_asm, RegisterOperand RO,
+                     InstrItinClass Itin = NoItinerary,
+                     SDPatternOperator OpNode = null_frag>
+                     : MipsR6Inst {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins RO:$rs, RO:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rs, RO:$rt))];
+  InstrItinClass Itinerary = Itin;
+  Format Form = FrmR;
+  string BaseOpcode = instr_asm;
+  let isCommutable = 0;
+  let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+class DSUB_MM64R6_DESC : DSUB_DESC_BASE<"dsub", GPR64Opnd, II_DSUB>;
+class DSUBU_MM64R6_DESC : DSUB_DESC_BASE<"dsubu", GPR64Opnd, II_DSUBU, sub>;
+
+class LDPC_MM64R6_DESC : PCREL_MMR6_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3,
+                                              II_LDPC>;
+
+class MUL_MM64R6_DESC_BASE<string opstr, RegisterOperand GPROpnd,
+                           InstrItinClass Itin = NoItinerary,
+                           SDPatternOperator Op = null_frag> : MipsR6Inst {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(opstr, "\t$rd, $rs, $rt");
+  InstrItinClass Itinerary = Itin;
+  list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+}
+
+class DMUL_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
+class DMUH_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH,
+                                              mulhs>;
+class DMULU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMULU>;
+class DMUHU_MM64R6_DESC : MUL_MM64R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU,
+                                               mulhu>;
+
+class DSBH_DSHD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                          InstrItinClass Itin> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  bit hasSideEffects = 0;
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+  Format Form = FrmR;
+  string BaseOpcode = instr_asm;
+}
+
+class DSBH_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dsbh", GPR64Opnd, II_DSBH>;
+class DSHD_MM64R6_DESC : DSBH_DSHD_DESC_BASE<"dshd", GPR64Opnd, II_DSHD>;
+
+class SHIFT_ROTATE_IMM_MM64R6<string instr_asm, Operand ImmOpnd,
+                              InstrItinClass itin,
+                              SDPatternOperator OpNode = null_frag,
+                              SDPatternOperator PO = null_frag> {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins GPR64Opnd:$rs, ImmOpnd:$sa);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode GPR64Opnd:$rs, PO:$sa))];
+  InstrItinClass Itinerary = itin;
+  Format Form = FrmR;
+  string TwoOperandAliasConstraint = "$rs = $rt";
+  string BaseOpcode = instr_asm;
+}
+
+class SHIFT_ROTATE_REG_MM64R6<string instr_asm, InstrItinClass itin,
+                              SDPatternOperator OpNode = null_frag> {
+  dag OutOperandList = (outs GPR64Opnd:$rd);
+  dag InOperandList = (ins GPR64Opnd:$rt, GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+  list<dag> Pattern = [(set GPR64Opnd:$rd,
+                       (OpNode GPR64Opnd:$rt, GPR32Opnd:$rs))];
+  InstrItinClass Itinerary = itin;
+  Format Form = FrmR;
+  string BaseOpcode = instr_asm;
+}
+
+class DSLL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll", uimm6, II_DSLL, shl,
+                                                 immZExt6>;
+class DSLL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsll32", uimm5, II_DSLL32>;
+class DSLLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsllv", II_DSLLV, shl>;
+class DSRAV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrav", II_DSRAV, sra>;
+class DSRA_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra", uimm6, II_DSRA, sra,
+                                                 immZExt6>;
+class DSRA32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsra32", uimm5, II_DSRA32>;
+class DROTR_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr", uimm6, II_DROTR,
+                                                  rotr, immZExt6>;
+class DROTR32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"drotr32", uimm5,
+                                                    II_DROTR32>;
+class DROTRV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"drotrv", II_DROTRV, rotr>;
+class DSRL_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl", uimm6, II_DSRL, srl,
+                                                 immZExt6>;
+class DSRL32_MM64R6_DESC : SHIFT_ROTATE_IMM_MM64R6<"dsrl32", uimm5, II_DSRL32>;
+class DSRLV_MM64R6_DESC : SHIFT_ROTATE_REG_MM64R6<"dsrlv", II_DSRLV, srl>;
+
+class Load_MM64R6<string instr_asm, Operand MemOpnd, InstrItinClass itin,
+                  SDPatternOperator OpNode = null_frag> {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [(set GPR64Opnd:$rt, (OpNode addr:$addr))];
+  InstrItinClass Itinerary = itin;
+  Format Form = FrmI;
+  bit mayLoad = 1;
+  bit canFoldAsLoad = 1;
+  string BaseOpcode = instr_asm;
+}
+
+class LD_MM64R6_DESC : Load_MM64R6<"ld", mem_simm16, II_LD, load> {
+  string DecoderMethod = "DecodeMemMMImm16";
+}
+class LWU_MM64R6_DESC : Load_MM64R6<"lwu", mem_simm12, II_LWU, zextloadi32>{
+  string DecoderMethod = "DecodeMemMMImm12";
+}
+
+class LLD_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins mem_simm12:$addr);
+  string AsmString = "lld\t$rt, $addr";
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+  InstrItinClass Itinerary = II_LLD;
+  string BaseOpcode = "lld";
+  string DecoderMethod = "DecodeMemMMImm12";
+}
+
+class SD_MM64R6_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPR64Opnd:$rt, mem_simm16:$addr);
+  string AsmString = "sd\t$rt, $addr";
+  list<dag> Pattern = [(store GPR64Opnd:$rt, addr:$addr)];
+  InstrItinClass Itinerary = II_SD;
+  Format Form = FrmI;
+  bit mayStore = 1;
+  string BaseOpcode = "sd";
+  string DecoderMethod = "DecodeMemMMImm16";
+}
+
+class DBITSWAP_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rd);
+  dag InOperandList = (ins GPR64Opnd:$rt);
+  string AsmString = !strconcat("dbitswap", "\t$rd, $rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_DBITSWAP;
+}
+
+class DLSA_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rd);
+  dag InOperandList = (ins GPR64Opnd:$rt, GPR64Opnd:$rs, uimm2_plus1:$sa);
+  string AsmString = "dlsa\t$rt, $rs, $rd, $sa";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_DLSA;
+}
+
+class LWUPC_MM64R6_DESC {
+  dag OutOperandList = (outs GPR64Opnd:$rt);
+  dag InOperandList = (ins simm19_lsl2:$offset);
+  string AsmString = "lwupc\t$rt, $offset";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_LWUPC;
+  bit mayLoad = 1;
+  bit IsPCRelativeLoad = 1;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let DecoderNamespace = "MicroMipsR6" in {
+  def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6;
+  let DecoderMethod = "DecodeDAHIDATIMMR6" in {
+    def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6;
+    def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6;
+  }
+  def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC,
+                      ISA_MICROMIPS64R6;
+  def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DINSU_MM64R6: R6MMR6Rel, DINSU_MM64R6_DESC, DINSU_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DINSM_MM64R6: R6MMR6Rel, DINSM_MM64R6_DESC, DINSM_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DINS_MM64R6: R6MMR6Rel, DINS_MM64R6_DESC, DINS_MM64R6_ENC,
+                   ISA_MICROMIPS64R6;
+  def DMTC0_MM64R6 : StdMMR6Rel, DMTC0_MM64R6_ENC, DMTC0_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DMTC1_MM64R6 : StdMMR6Rel, DMTC1_MM64R6_DESC, DMTC1_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DMTC2_MM64R6 : StdMMR6Rel, DMTC2_MM64R6_ENC, DMTC2_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DMFC0_MM64R6 : StdMMR6Rel, DMFC0_MM64R6_ENC, DMFC0_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DMFC1_MM64R6 : StdMMR6Rel, DMFC1_MM64R6_DESC, DMFC1_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DMFC2_MM64R6 : StdMMR6Rel, DMFC2_MM64R6_ENC, DMFC2_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DADD_MM64R6: StdMMR6Rel, DADD_MM64R6_DESC, DADD_MM64R6_ENC,
+                   ISA_MICROMIPS64R6;
+  def DADDIU_MM64R6: StdMMR6Rel, DADDIU_MM64R6_DESC, DADDIU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DADDU_MM64R6: StdMMR6Rel, DADDU_MM64R6_DESC, DADDU_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def LDPC_MM64R6 :  R6MMR6Rel, LDPC_MMR646_ENC, LDPC_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DSUB_MM64R6 : StdMMR6Rel, DSUB_MM64R6_DESC, DSUB_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DSUBU_MM64R6 : StdMMR6Rel, DSUBU_MM64R6_DESC, DSUBU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DMUL_MM64R6 : R6MMR6Rel, DMUL_MM64R6_DESC, DMUL_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DMUH_MM64R6 : R6MMR6Rel, DMUH_MM64R6_DESC, DMUH_MM64R6_ENC,
+                    ISA_MICROMIPS64R6;
+  def DMULU_MM64R6 : R6MMR6Rel, DMULU_MM64R6_DESC, DMULU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DMUHU_MM64R6 : R6MMR6Rel, DMUHU_MM64R6_DESC, DMUHU_MM64R6_ENC,
+                     ISA_MICROMIPS64R6;
+  def DSBH_MM64R6 : R6MMR6Rel, DSBH_MM64R6_ENC, DSBH_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSHD_MM64R6 : R6MMR6Rel, DSHD_MM64R6_ENC, DSHD_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSLL_MM64R6 : StdMMR6Rel, DSLL_MM64R6_ENC, DSLL_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSLL32_MM64R6 : StdMMR6Rel, DSLL32_MM64R6_ENC, DSLL32_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSLLV_MM64R6 : StdMMR6Rel, DSLLV_MM64R6_ENC, DSLLV_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DSRAV_MM64R6 : StdMMR6Rel, DSRAV_MM64R6_ENC, DSRAV_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSRA_MM64R6 : StdMMR6Rel, DSRA_MM64R6_ENC, DSRA_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSRA32_MM64R6 : StdMMR6Rel, DSRA32_MM64R6_ENC, DSRA32_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DCLO_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLO_MM64R6_ENC, DCLO_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DCLZ_MM64R6 : StdMMR6Rel, R6MMR6Rel, DCLZ_MM64R6_ENC, DCLZ_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DROTR_MM64R6 : StdMMR6Rel, DROTR_MM64R6_ENC, DROTR_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DROTR32_MM64R6 : StdMMR6Rel, DROTR32_MM64R6_ENC, DROTR32_MM64R6_DESC,
+                       ISA_MICROMIPS64R6;
+  def DROTRV_MM64R6 : StdMMR6Rel, DROTRV_MM64R6_ENC, DROTRV_MM64R6_DESC,
+                      ISA_MICROMIPS64R6;
+  def LD_MM64R6 : StdMMR6Rel, LD_MM64R6_ENC, LD_MM64R6_DESC,
+                  ISA_MICROMIPS64R6;
+  def LLD_MM64R6 : StdMMR6Rel, R6MMR6Rel, LLD_MM64R6_ENC, LLD_MM64R6_DESC,
+                   ISA_MICROMIPS64R6;
+  def LWU_MM64R6 : StdMMR6Rel, LWU_MM64R6_ENC, LWU_MM64R6_DESC,
+                   ISA_MICROMIPS64R6;
+  def SD_MM64R6 : StdMMR6Rel, SD_MM64R6_ENC, SD_MM64R6_DESC,
+                  ISA_MICROMIPS64R6;
+  def DSRL_MM64R6 : StdMMR6Rel, DSRL_MM64R6_ENC, DSRL_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def DSRL32_MM64R6 : StdMMR6Rel, DSRL32_MM64R6_ENC, DSRL32_MM64R6_DESC,
+                      ISA_MICROMIPS64R6;
+  def DSRLV_MM64R6 : StdMMR6Rel, DSRLV_MM64R6_ENC, DSRLV_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+  def DBITSWAP_MM64R6 : R6MMR6Rel, DBITSWAP_MM64R6_ENC, DBITSWAP_MM64R6_DESC,
+                        ISA_MICROMIPS64R6;
+  def DLSA_MM64R6 : R6MMR6Rel, DLSA_MM64R6_ENC, DLSA_MM64R6_DESC,
+                    ISA_MICROMIPS64R6;
+  def LWUPC_MM64R6 : R6MMR6Rel, LWUPC_MM64R6_ENC, LWUPC_MM64R6_DESC,
+                     ISA_MICROMIPS64R6;
+}
+
+let AdditionalPredicates = [InMicroMips] in
+defm : MaterializeImms<i64, ZERO_64, DADDIU_MM64R6, LUi64, ORi64>;
+
+//===----------------------------------------------------------------------===//
+//
+// Arbitrary patterns that map to one or more instructions
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(MipsLo tglobaladdr:$in),
+              (DADDIU_MM64R6 ZERO_64, tglobaladdr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tblockaddress:$in),
+              (DADDIU_MM64R6 ZERO_64, tblockaddress:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tjumptable:$in),
+              (DADDIU_MM64R6 ZERO_64, tjumptable:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tconstpool:$in),
+              (DADDIU_MM64R6 ZERO_64, tconstpool:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+              (DADDIU_MM64R6 ZERO_64, tglobaltlsaddr:$in)>, ISA_MICROMIPS64R6;
+def : MipsPat<(MipsLo texternalsym:$in),
+              (DADDIU_MM64R6 ZERO_64, texternalsym:$in)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+              (DADDIU_MM64R6 GPR64:$hi, tglobaladdr:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+              (DADDIU_MM64R6 GPR64:$hi, tblockaddress:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+              (DADDIU_MM64R6 GPR64:$hi, tjumptable:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+              (DADDIU_MM64R6 GPR64:$hi, tconstpool:$lo)>, ISA_MICROMIPS64R6;
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (DADDIU_MM64R6 GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
+              (DADDU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+              (DADDIU_MM64R6 GPR64:$lhs, imm:$imm)>, ISA_MICROMIPS64R6;
+
+
+def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+              (DROTRV_MM64R6 GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>,
+              ISA_MICROMIPS64R6;
+
+
+def : WrapperPat<tglobaladdr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tconstpool, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<texternalsym, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tblockaddress, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tjumptable, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+def : WrapperPat<tglobaltlsaddr, DADDIU_MM64R6, GPR64>, ISA_MICROMIPS64R6;
+
+// Carry pattern
+def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+              (DSUBU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
+
+def : MipsPat<(atomic_load_64 addr:$a), (LD_MM64R6 addr:$a)>, ISA_MICROMIPS64R6;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"dmtc0 $rt, $rd",
+                    (DMTC0_MM64R6 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+def : MipsInstAlias<"dmfc0 $rt, $rd",
+                    (DMFC0_MM64R6 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>,
+                    ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $rt, $imm",
+                    (DADDIU_MM64R6 GPR64Opnd:$rs,
+                                   GPR64Opnd:$rt,
+                                   simm16_64:$imm),
+                    0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"daddu $rs, $imm",
+                    (DADDIU_MM64R6 GPR64Opnd:$rs,
+                                   GPR64Opnd:$rs,
+                                   simm16_64:$imm),
+                    0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+                    (DADDIU_MM64R6 GPR64Opnd:$rt,
+                                   GPR64Opnd:$rs,
+                                   InvertedImOperand64:$imm),
+                    0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsubu $rs, $imm",
+                    (DADDIU_MM64R6 GPR64Opnd:$rs,
+                                   GPR64Opnd:$rs,
+                                   InvertedImOperand64:$imm),
+                    0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt, $rs",
+                    (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+                    ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dneg $rt",
+                    (DSUB_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+                    ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt, $rs",
+                    (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+                    ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dnegu $rt",
+                    (DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+                    ISA_MICROMIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
new file mode 100644
index 000000000000..af6473c468d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td
@@ -0,0 +1,302 @@
+//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MMDSPInst<string opstr = "">
+    : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+  let InsnPredicates = [HasDSP];
+  let AdditionalPredicates = [InMicroMips];
+  string BaseOpcode = opstr;
+  string Arch = "mmdsp";
+  let DecoderNamespace = "MicroMips";
+}
+
+class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+    : InstAlias<Asm, Result, Emit>, PredicateControl {
+  let InsnPredicates = [HasDSP];
+  let AdditionalPredicates = [InMicroMips];
+}
+
+class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10-0}  = op;
+}
+
+class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = ac;
+  let Inst{13-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0b0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<4> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = sa;
+  let Inst{11-0}  = op;
+}
+
+class POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<3> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-13} = sa;
+  let Inst{12-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = sa;
+  let Inst{10}    = 0b0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<4> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = sa;
+  let Inst{11}    = 0b0;
+  let Inst{10-0}  = op;
+}
+
+class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<4> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = sa;
+  let Inst{11-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> imm;
+  bits<2> ac;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = imm;
+  let Inst{15-14} = ac;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = sa;
+  let Inst{10-0}  = op;
+}
+
+class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> {
+  bits<5> index;
+  bits<5> base;
+  bits<5> rd;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0b0;
+  let Inst{9-0}   = funct;
+}
+
+class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> {
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = ac;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<7> mask;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-14} = mask;
+  let Inst{13-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rd;
+  bits<10> imm;
+
+  let Inst{31-26} = 0;
+  let Inst{25-16} = imm;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<8> imm;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-13} = imm;
+  let Inst{12}    = 0;
+  let Inst{11-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<6> shift;
+  bits<2> ac;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-22} = 0b0000;
+  let Inst{21-16} = shift;
+  let Inst{15-14} = ac;
+  let Inst{13-10} = 0b0000;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> {
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = ac;
+  let Inst{13-6}  = op;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32I_IMMB0_FMT<string opstr, bits<5> op> : MMDSPInst<opstr> {
+  bits<16> offset;
+
+  let Inst{31-26} = 0b010000;
+  let Inst{25-21} = op;
+  let Inst{20-16} = 0;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32A_2RBP_FMT<string opstr> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<2> bp;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = bp;
+  let Inst{13-6}  = 0b00100010;
+  let Inst{5-0}   = 0b111100;
+}
+
+class POOL32A_2RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-10} = 0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32S_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  let Inst{31-26} = 0b010110;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0b0;
+  let Inst{9-0}   = op;
+}
+
+class POOL32A_2R2B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = 0;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
new file mode 100644
index 000000000000..f82f82fc7e45
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td
@@ -0,0 +1,601 @@
+//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes MicroMips DSP instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Instruction encoding.
+class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>;
+class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>;
+class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>;
+class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>;
+class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>;
+class ADDQH_W_MMR2_ENC: POOL32A_3R_FMT<"addqh.w", 0b00010001101>;
+class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>;
+class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>;
+class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>;
+class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>;
+class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>;
+class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>;
+class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>;
+class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>;
+class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>;
+class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>;
+class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>;
+class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>;
+class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>;
+class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>;
+class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>;
+class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>;
+class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>;
+class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>;
+class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>;
+class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>;
+class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>;
+class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>;
+class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>;
+class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>;
+class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>;
+class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>;
+class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>;
+class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>;
+class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>;
+class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>;
+class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>;
+class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>;
+class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>;
+class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>;
+class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>;
+class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>;
+class SHRA_R_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>;
+class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>;
+class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>;
+class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>;
+class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>;
+class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>;
+class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>;
+class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>;
+class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>;
+class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>;
+class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>;
+class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>;
+class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>;
+class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>;
+class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>;
+class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>;
+class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>;
+class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>;
+class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>;
+class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>;
+class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>;
+class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>;
+class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>;
+class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>;
+class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>;
+class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>;
+class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>;
+class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>;
+class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>;
+class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>;
+class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>;
+class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>;
+class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>;
+class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>;
+class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>;
+class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>;
+class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>;
+class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>;
+class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>;
+class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>;
+class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>;
+class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>;
+class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>;
+class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>;
+class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>;
+class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>;
+class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>;
+class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>;
+class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>;
+class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>;
+class DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>;
+class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>;
+class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>;
+class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>;
+class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>;
+class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>;
+class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>;
+class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>;
+class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>;
+class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>;
+class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>;
+class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>;
+class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>;
+class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>;
+class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>;
+class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>;
+class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>;
+class PRECR_SRA_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>;
+class PRECR_SRA_R_PH_W_MMR2_ENC
+    : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>;
+class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>;
+class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>;
+class PRECRQU_S_QB_PH_MM_ENC
+    : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>;
+class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>;
+class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>;
+class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>;
+class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>;
+class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>;
+class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>;
+class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>;
+class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>;
+class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>;
+class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>;
+class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>;
+class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>;
+class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>;
+class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>;
+class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>;
+class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>;
+class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>;
+class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>;
+class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>;
+class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>;
+class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>;
+class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>;
+class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>;
+class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>;
+class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>;
+class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>;
+class APPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"append", 0b1000010101>;
+class MODSUB_MM_ENC : POOL32A_3RB0_FMT<"modsub", 0b1010010101>;
+class MULSA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"mulsa.w.ph", 0b10110010>;
+class MULSAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"mulsaq_s.w.ph", 0b11110010>;
+class BPOSGE32C_MMR3_ENC : POOL32I_IMMB0_FMT<"bposge32c", 0b11001>;
+class BITREV_MM_ENC : POOL32A_2R_FMT<"bitrev", 0b0011000100>;
+class BALIGN_MMR2_ENC : POOL32A_2RBP_FMT<"balign">;
+class BPOSGE32_MM_ENC : POOL32I_IMMB0_FMT<"bposge32", 0b11011>;
+class CMP_EQ_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.eq.ph", 0b0000000101>;
+class CMP_LE_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.le.ph", 0b0010000101>;
+class CMP_LT_PH_MM_ENC : POOL32A_2RB0_FMT<"cmp.lt.ph", 0b0001000101>;
+class CMPGDU_EQ_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.eq.qb", 0b0110000101>;
+class CMPGDU_LT_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.lt.qb", 0b0111000101>;
+class CMPGDU_LE_QB_MMR2_ENC : POOL32A_3RB0_FMT<"cmpgdu.le.qb", 0b1000000101>;
+class CMPGU_EQ_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.eq.qb", 0b0011000101>;
+class CMPGU_LT_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.lt.qb", 0b0100000101>;
+class CMPGU_LE_QB_MM_ENC : POOL32S_3RB0_FMT<"cmpgu.le.qb", 0b0101000101>;
+class CMPU_EQ_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.eq.qb", 0b1001000101>;
+class CMPU_LT_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.lt.qb", 0b1010000101>;
+class CMPU_LE_QB_MM_ENC : POOL32A_2R2B0_FMT<"cmpu.le.qb", 0b1011000101>;
+
+// Instruction desc.
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode,
+                                InstrItinClass itin, RegisterOperand ROD,
+                                RegisterOperand ROS = ROD> {
+  dag OutOperandList = (outs ROD:$rt);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(opstr, "\t$rt, $rs");
+  list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))];
+  InstrItinClass Itinerary = itin;
+}
+class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>;
+class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>;
+class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>;
+class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>;
+class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<
+  "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>;
+
+class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           SDPatternOperator ImmPat, InstrItinClass itin,
+                           RegisterOperand RO, Operand ImmOpnd> {
+  dag OutOperandList = (outs RO:$rt);
+  dag InOperandList = (ins RO:$rs, ImmOpnd:$sa);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))];
+  InstrItinClass Itinerary = itin;
+  bit hasSideEffects = 1;
+}
+class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+  Defs<[DSPOutFlag22]>;
+class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>,
+  Defs<[DSPOutFlag22]>;
+class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>,
+  Defs<[DSPOutFlag22]>;
+class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>,
+  Defs<[DSPOutFlag22]>;
+class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+  "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+  "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>;
+class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE<
+  "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>;
+class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE<
+  "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>;
+
+class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList =  (ins RO:$rt, GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))];
+  InstrItinClass Itinerary = itin;
+}
+class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>,
+  Defs<[DSPOutFlag22]>;
+class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>;
+class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>;
+class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>;
+class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>;
+class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>;
+class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>;
+class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>;
+class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE<
+  "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>;
+
+class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs");
+  InstrItinClass Itinerary = itin;
+}
+class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm");
+  InstrItinClass Itinerary = itin;
+}
+
+class EXTP_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+      Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTPDP_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+      Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPDPV_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>,
+      Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+class EXTPV_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+      Uses<[DSPPos]>, Defs<[DSPEFI]>;
+class EXTR_W_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTR_R_W_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTR_RS_W_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTR_S_H_MM_DESC
+    : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTRV_W_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTRV_R_W_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTRV_RS_W_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+class EXTRV_S_H_MM_DESC
+    : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>,
+      Defs<[DSPOutFlag23]>;
+
+class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                        InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rs);
+  dag InOperandList = (ins RO:$ac);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI,
+                                       NoItinerary>;
+class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO,
+                                       NoItinerary>;
+
+class RADDU_W_QB_MM_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins DSPROpnd:$rs);
+  string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))];
+  InstrItinClass Itinerary = NoItinerary;
+  string BaseOpcode = "raddu.w.qb";
+}
+
+class RDDSP_MM_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins uimm7:$mask);
+  string AsmString = !strconcat("rddsp", "\t$rt, $mask");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt7:$mask))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPL_QB_MM_DESC {
+  dag OutOperandList = (outs DSPROpnd:$rt);
+  dag InOperandList = (ins uimm8:$imm);
+  string AsmString = !strconcat("repl.qb", "\t$rt, $imm");
+  list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>;
+class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>;
+
+class WRDSP_MM_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask);
+  string AsmString = !strconcat("wrdsp", "\t$rt, $mask");
+  list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class BPOSGE32C_MMR3_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins brtarget1SImm16:$offset);
+  string AsmString = !strconcat("bposge32c", "\t$offset");
+  InstrItinClass Itinerary = NoItinerary;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+}
+
+class BALIGN_MMR2_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm2:$bp, GPR32Opnd:$src);
+  string AsmString = !strconcat("balign", "\t$rt, $rs, $bp");
+  list<dag> Pattern =  [(set GPR32Opnd:$rt, (int_mips_balign GPR32Opnd:$src,
+                                                             GPR32Opnd:$rs,
+                                                             immZExt2:$bp))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$src = $rt";
+}
+
+class BITREV_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+                                                 NoItinerary, GPR32Opnd>;
+
+class BPOSGE32_MM_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget_mm,
+                                            NoItinerary>;
+
+// Instruction defs.
+// microMIPS DSP Rev 1
+def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC;
+def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC;
+def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC;
+def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC;
+def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC;
+def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC;
+def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC;
+def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC;
+def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC;
+def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC;
+def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC;
+def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC;
+def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC;
+def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC;
+def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC;
+def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC;
+def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC;
+def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC;
+def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC;
+def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC;
+def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC;
+def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC;
+def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC;
+def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC;
+def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC;
+def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC;
+def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC;
+def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC;
+def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC;
+def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC;
+def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC;
+def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC;
+def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC;
+def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC;
+def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC;
+def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC;
+def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC,
+                         PRECEQU_PH_QBLA_MM_DESC;
+def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC;
+def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC,
+                         PRECEQU_PH_QBRA_MM_DESC;
+def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC;
+def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC;
+def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC;
+def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC;
+def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC;
+def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC;
+def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC;
+def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC;
+def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC;
+def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC;
+def EXTPDPV_MM : DspMMRel, EXTPDPV_MM_ENC, EXTPDPV_MM_DESC;
+def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC;
+def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC;
+def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC;
+def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC;
+def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC;
+def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC;
+def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC;
+def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC;
+def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC;
+def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC;
+def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC;
+def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC;
+def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC;
+def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC;
+def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC;
+def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC;
+def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC;
+def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC;
+def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC;
+def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC;
+def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC;
+def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC;
+def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC;
+def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC;
+def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC;
+def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC;
+def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC;
+def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC;
+def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC;
+def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC;
+def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC;
+def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC;
+def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC;
+def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC;
+def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC;
+def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC;
+def MODSUB_MM : DspMMRel, MODSUB_MM_ENC, MODSUB_DESC;
+def MULSAQ_S_W_PH_MM : DspMMRel, MULSAQ_S_W_PH_MM_ENC, MULSAQ_S_W_PH_DESC;
+def BITREV_MM : DspMMRel, BITREV_MM_ENC, BITREV_MM_DESC;
+def BPOSGE32_MM : DspMMRel, BPOSGE32_MM_ENC, BPOSGE32_MM_DESC,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def CMP_EQ_PH_MM : DspMMRel, CMP_EQ_PH_MM_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH_MM : DspMMRel, CMP_LT_PH_MM_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH_MM : DspMMRel, CMP_LE_PH_MM_ENC, CMP_LE_PH_DESC;
+def CMPGU_EQ_QB_MM : DspMMRel, CMPGU_EQ_QB_MM_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB_MM : DspMMRel, CMPGU_LT_QB_MM_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB_MM : DspMMRel, CMPGU_LE_QB_MM_ENC, CMPGU_LE_QB_DESC;
+def CMPU_EQ_QB_MM : DspMMRel, CMPU_EQ_QB_MM_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB_MM : DspMMRel, CMPU_LT_QB_MM_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB_MM : DspMMRel, CMPU_LE_QB_MM_ENC, CMPU_LE_QB_DESC;
+// microMIPS DSP Rev 2
+def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC,
+                     ISA_DSPR2;
+def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC,
+                        ISA_DSPR2;
+def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC,
+                         ISA_DSPR2;
+def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2;
+def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC,
+                     ISA_DSPR2;
+def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2;
+def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC,
+                      ISA_DSPR2;
+def BALIGN_MMR2 : DspMMRel, BALIGN_MMR2_ENC, BALIGN_MMR2_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB_MMR2 : DspMMRel, CMPGDU_EQ_QB_MMR2_ENC, CMPGDU_EQ_QB_DESC,
+                        ISA_DSPR2;
+def CMPGDU_LT_QB_MMR2 : DspMMRel, CMPGDU_LT_QB_MMR2_ENC, CMPGDU_LT_QB_DESC,
+                        ISA_DSPR2;
+def CMPGDU_LE_QB_MMR2 : DspMMRel, CMPGDU_LE_QB_MMR2_ENC, CMPGDU_LE_QB_DESC,
+                        ISA_DSPR2;
+def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2;
+def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2;
+def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC,
+                        ISA_DSPR2;
+def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC,
+                         ISA_DSPR2;
+def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC,
+                       ISA_DSPR2;
+def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC,
+                          PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC,
+                            PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2;
+
+// Instruction alias.
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>;
+def APPEND_MMR2 : DspMMRel, APPEND_MMR2_ENC, APPEND_DESC, ISA_DSPR2;
+def MULSA_W_PH_MMR2 : DspMMRel, MULSA_W_PH_MMR2_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+// microMIPS DSP Rev 3
+def BPOSGE32C_MMR3 : DspMMRel, BPOSGE32C_MMR3_ENC, BPOSGE32C_MMR3_DESC,
+                     ISA_DSPR3;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
new file mode 100644
index 000000000000..fc83761e409b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -0,0 +1,180 @@
+let isCodeGenOnly = 1, Predicates = [InMicroMips] in {
+def FADD_S_MM : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+                ADDS_FM_MM<0, 0x30>;
+def FDIV_S_MM : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+                ADDS_FM_MM<0, 0xf0>;
+def FMUL_S_MM : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+                ADDS_FM_MM<0, 0xb0>;
+def FSUB_S_MM : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+                ADDS_FM_MM<0, 0x70>;
+
+def FADD_MM  : MMRel, ADDS_FT<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>,
+               ADDS_FM_MM<1, 0x30>;
+def FDIV_MM  : MMRel, ADDS_FT<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>,
+               ADDS_FM_MM<1, 0xf0>;
+def FMUL_MM  : MMRel, ADDS_FT<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>,
+               ADDS_FM_MM<1, 0xb0>;
+def FSUB_MM  : MMRel, ADDS_FT<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>,
+               ADDS_FM_MM<1, 0x70>;
+
+def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
+               LWXC1_FM_MM<0x48>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
+               SWXC1_FM_MM<0x88>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
+               LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
+               SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2_NOT_32R6_64R6;
+
+def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
+                  CEQS_FM_MM<0>;
+def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>,
+                  CEQS_FM_MM<1>;
+
+def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>,
+              BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>,
+              BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6;
+def CVT_W_S_MM   : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+                   ROUND_W_FM_MM<0, 0x24>;
+def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+                   ROUND_W_FM_MM<0, 0xec>;
+
+def CEIL_W_MM  : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>,
+                 ROUND_W_FM_MM<1, 0x6c>;
+def CVT_W_MM   : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                 ROUND_W_FM_MM<1, 0x24>;
+def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>,
+                 ROUND_W_FM_MM<1, 0x2c>;
+def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>,
+                 ROUND_W_FM_MM<1, 0xec>;
+def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>,
+                 ROUND_W_FM_MM<1, 0xac>;
+
+def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
+                              fsqrt>, ROUND_W_FM_MM<1, 0x28>;
+
+def CVT_L_S_MM   : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+                   ROUND_W_FM_MM<0, 0x4>, INSN_MIPS3_32R2;
+def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+                   ROUND_W_FM_MM<1, 0x4>, INSN_MIPS3_32R2;
+
+def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+                ABS_FM_MM<0, 0xd>;
+def FMOV_S_MM : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+                ABS_FM_MM<0, 0x1>;
+def FNEG_S_MM : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+                ABS_FM_MM<0, 0x2d>;
+def CVT_D_S_MM : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                 ABS_FM_MM<0, 0x4d>;
+def CVT_D32_W_MM : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                   ABS_FM_MM<1, 0x4d>;
+def CVT_S_D32_MM : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                   ABS_FM_MM<0, 0x6d>;
+def CVT_S_W_MM : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+                 ABS_FM_MM<1, 0x6d>;
+
+def FABS_MM : MMRel, ABSS_FT<"abs.d", AFGR64Opnd, AFGR64Opnd, II_ABS, fabs>,
+              ABS_FM_MM<1, 0xd>;
+def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
+              ABS_FM_MM<1, 0x2d>;
+
+def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+                  ABS_FM_MM<1, 0x1>, FGR_32;
+
+def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
+                                     II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>;
+def MOVN_I_S_MM : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd,
+                                     II_MOVN_S>, CMov_I_F_FM_MM<0x38, 0>;
+def MOVZ_I_D32_MM : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+                                       II_MOVZ_D>, CMov_I_F_FM_MM<0x78, 1>;
+def MOVN_I_D32_MM : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+                                       II_MOVN_D>, CMov_I_F_FM_MM<0x38, 1>;
+
+def MOVT_S_MM : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S,
+                                   MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 0>;
+def MOVF_S_MM : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S,
+                                   MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 0>;
+def MOVT_D32_MM : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+                                     MipsCMovFP_T>, CMov_F_F_FM_MM<0x60, 1>;
+def MOVF_D32_MM : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+                                     MipsCMovFP_F>, CMov_F_F_FM_MM<0x20, 1>;
+def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
+                             II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
+def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
+                             II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
+
+def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+                MADDS_FM_MM<0x1>;
+def MSUB_S_MM : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+                MADDS_FM_MM<0x21>;
+def NMADD_S_MM : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+                 MADDS_FM_MM<0x2>;
+def NMSUB_S_MM : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+                 MADDS_FM_MM<0x22>;
+
+def MADD_D32_MM  : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+                   MADDS_FM_MM<0x9>;
+def MSUB_D32_MM  : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+                   MADDS_FM_MM<0x29>;
+def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+                   MADDS_FM_MM<0xa>;
+def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+                   MADDS_FM_MM<0x2a>;
+}
+
+let AdditionalPredicates = [InMicroMips] in {
+  def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd,
+    II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>;
+  def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd,
+    FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>;
+  def CEIL_W_S_MM  : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+    ROUND_W_FM_MM<0, 0x6c>;
+  def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S,
+    fsqrt>, ROUND_W_FM_MM<0, 0x28>;
+  def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+             MFC1_FM_MM<0xe0>, ISA_MIPS32R2, FGR_32;
+  def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+                 MFC1_FM_MM<0xc0>, ISA_MIPS32R2, FGR_32;
+  let DecoderNamespace = "MicroMips" in {
+    def CFC1_MM : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>,
+                  MFC1_FM_MM<0x40>;
+    def CTC1_MM : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>,
+                  MFC1_FM_MM<0x60>;
+    def RECIP_S_MM : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd,
+                                    II_RECIP_S>,
+                     ROUND_W_FM_MM<0b0, 0b01001000>;
+    def RECIP_D_MM : MMRel, ABSS_FT<"recip.d", AFGR64Opnd, AFGR64Opnd,
+                                 II_RECIP_D>, ROUND_W_FM_MM<0b1, 0b01001000>;
+    def RSQRT_S_MM : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd,
+                                    II_RECIP_S>,
+                     ROUND_W_FM_MM<0b0, 0b00001000>;
+    def RSQRT_D_MM : MMRel, ABSS_FT<"rsqrt.d", AFGR64Opnd, AFGR64Opnd,
+                                 II_RECIP_D>, ROUND_W_FM_MM<0b1, 0b00001000>;
+  }
+  let DecoderNamespace = "MicroMips",  DecoderMethod = "DecodeFMemMMR2" in {
+    def LDC1_MM : MMRel, LW_FT<"ldc1", AFGR64Opnd, mem_mm_16, II_LDC1, load>,
+                  LW_FM_MM<0x2f>, FGR_32 {
+      let BaseOpcode = "LDC132";
+    }
+    def SDC1_MM : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_mm_16, II_SDC1, store>,
+                  LW_FM_MM<0x2e>, FGR_32;
+    def LWC1_MM : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_mm_16, II_LWC1, load>,
+                  LW_FM_MM<0x27>;
+    def SWC1_MM : MMRel, SW_FT<"swc1", FGR32Opnd, mem_mm_16, II_SWC1, store>,
+                  LW_FM_MM<0x26>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+let AdditionalPredicates = [InMicroMips] in {
+  // Patterns for loads/stores with a reg+imm operand.
+  let AddedComplexity = 40 in {
+    def : LoadRegImmPat<LDC1_MM, f64, load>, FGR_32;
+    def : StoreRegImmPat<SDC1_MM, f64>, FGR_32;
+    def : LoadRegImmPat<LWC1_MM, f32, load>;
+    def : StoreRegImmPat<SWC1_MM, f32>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
new file mode 100644
index 000000000000..8b595f9e6c4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -0,0 +1,1049 @@
+//===----------------------------------------------------------------------===//
+// MicroMIPS Base Classes
+//===----------------------------------------------------------------------===//
+
+//
+// Base class for MicroMips instructions.
+// This class does not depend on the instruction size.
+//
+class MicroMipsInstBase<dag outs, dag ins, string asmstr, list<dag> pattern,
+                        InstrItinClass itin, Format f> : Instruction
+{
+  let Namespace = "Mips";
+  let DecoderNamespace = "MicroMips";
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  let Predicates = [InMicroMips];
+
+  Format Form = f;
+}
+
+//
+// Base class for MicroMIPS 16-bit instructions.
+//
+class MicroMipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+               InstrItinClass itin, Format f> :
+  MicroMipsInstBase<outs, ins, asmstr, pattern, itin, f>
+{
+  let Size = 2;
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+  bits<6> Opcode = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 16-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+class ARITH_FM_MM16<bit funct> {
+  bits<3> rd;
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x01;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rs;
+  let Inst{0}     = funct;
+}
+
+class ANDI_FM_MM16<bits<6> funct> {
+  bits<3> rd;
+  bits<3> rs;
+  bits<4> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = funct;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rs;
+  let Inst{3-0}   = imm;
+}
+
+class LOGIC_FM_MM16<bits<4> funct> {
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-6}   = funct;
+  let Inst{5-3}   = rt;
+  let Inst{2-0}   = rs;
+}
+
+class SHIFT_FM_MM16<bits<1> funct> {
+  bits<3> rd;
+  bits<3> rt;
+  bits<3> shamt;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x09;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = shamt;
+  let Inst{0}     = funct;
+}
+
+class ADDIUR2_FM_MM16 {
+  bits<3> rd;
+  bits<3> rs;
+  bits<3> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x1b;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rs;
+  let Inst{3-1}   = imm;
+  let Inst{0}     = 0;
+}
+
+class LOAD_STORE_FM_MM16<bits<6> op> {
+  bits<3> rt;
+  bits<7> addr;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7}   = rt;
+  let Inst{6-4}   = addr{6-4};
+  let Inst{3-0}   = addr{3-0};
+}
+
+class LOAD_STORE_SP_FM_MM16<bits<6> op> {
+  bits<5> rt;
+  bits<5> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-5}   = rt;
+  let Inst{4-0}   = offset;
+}
+
+class LOAD_GP_FM_MM16<bits<6> op> {
+  bits<3> rt;
+  bits<7> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7} = rt;
+  let Inst{6-0} = offset;
+}
+
+class ADDIUS5_FM_MM16 {
+  bits<5> rd;
+  bits<4> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x13;
+  let Inst{9-5}   = rd;
+  let Inst{4-1}   = imm;
+  let Inst{0}     = 0;
+}
+
+class ADDIUSP_FM_MM16 {
+  bits<9> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x13;
+  let Inst{9-1}   = imm;
+  let Inst{0}     = 1;
+}
+
+class MOVE_FM_MM16<bits<6> funct> {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = funct;
+  let Inst{9-5}   = rd;
+  let Inst{4-0}   = rs;
+}
+
+class LI_FM_MM16 {
+  bits<3> rd;
+  bits<7> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x3b;
+  let Inst{9-7}   = rd;
+  let Inst{6-0}   = imm;
+}
+
+class JALR_FM_MM16<bits<5> op> {
+  bits<5> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = op;
+  let Inst{4-0}   = rs;
+}
+
+class MFHILO_FM_MM16<bits<5> funct> {
+  bits<5> rd;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = funct;
+  let Inst{4-0}   = rd;
+}
+
+class JRADDIUSP_FM_MM16<bits<5> op> {
+  bits<5> rs;
+  bits<5> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = op;
+  let Inst{4-0}   = imm;
+}
+
+class ADDIUR1SP_FM_MM16 {
+  bits<3> rd;
+  bits<6> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x1b;
+  let Inst{9-7}   = rd;
+  let Inst{6-1}   = imm;
+  let Inst{0}     = 1;
+}
+
+class BRKSDBBP16_FM_MM<bits<6> op> {
+  bits<4> code_;
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-4}   = op;
+  let Inst{3-0}   = code_;
+}
+
+class BEQNEZ_FM_MM16<bits<6> op> {
+  bits<3> rs;
+  bits<7> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = op;
+  let Inst{9-7}   = rs;
+  let Inst{6-0}   = offset;
+}
+
+class B16_FM {
+  bits<10> offset;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x33;
+  let Inst{9-0}   = offset;
+}
+
+class MOVEP_FM_MM16 {
+  bits<3> dst_regs;
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x21;
+  let Inst{9-7}   = dst_regs;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rs;
+  let Inst{0}     = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMIPS 32-bit Instruction Formats
+//===----------------------------------------------------------------------===//
+
+class MMArch {
+  string Arch = "micromips";
+}
+
+class ADD_FM_MM<bits<6> op, bits<10> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = funct;
+}
+
+class ADDI_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class SLTI_FM_MM<bits<6> op> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class LUI_FM_MM : MMArch {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xd;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class MULT_FM_MM<bits<10> funct> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SRA_FM_MM<bits<10> funct, bit rotate> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = shamt;
+  let Inst{10}    = rotate;
+  let Inst{9-0}   = funct;
+}
+
+class SRLV_FM_MM<bits<10> funct, bit rotate> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = rd;
+  let Inst{10}    = rotate;
+  let Inst{9-0}   = funct;
+}
+
+class LW_FM_MM<bits<6> op> : MMArch {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-0}  = offset;
+}
+
+class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = fmt;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = offset;
+}
+
+class LWL_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = type;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = offset;
+}
+
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-13} = fcc;
+  let Inst{12-6}  = func;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class CLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SEB_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EXT_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class J_FM_MM<bits<6> op> : MMArch {
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+class JR_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class JALR_FM_MM<bits<10> funct> {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BEQ_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class SYNC_FM_MM : MMArch {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x0;
+  let Inst{20-16} = stype;
+  let Inst{15-6}  = 0x1ad;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SYNCI_FM_MM : MMArch {
+  bits<5> rs;
+  bits<16> offset;
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b010000;
+  let Inst{25-21} = 0b10000;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BRK_FM_MM : MMArch {
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = 0x07;
+}
+
+class SYS_FM_MM : MMArch {
+  bits<10> code_;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x22d;
+  let Inst{5-0}   = 0x3c;
+}
+
+class WAIT_FM_MM {
+  bits<10> code_;
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x24d;
+  let Inst{5-0}   = 0x3c;
+}
+
+class ER_FM_MM<bits<10> funct> : MMArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-16} = 0x00;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EI_FM_MM<bits<10> funct> : MMArch {
+  bits<32> Inst;
+  bits<5> rt;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<4> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = code_;
+  let Inst{11-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
+
+class LL_FM_MM<bits<4> funct> : MMArch {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class LLE_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11-9} = 0x6;
+  let Inst{8-0} = offset;
+}
+
+class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10}    = 0;
+  let Inst{9-8}   = fmt;
+  let Inst{7-0}   = funct;
+
+  list<dag> Pattern = [];
+}
+
+class LWXC1_FM_MM<bits<9> funct> : MMArch {
+  bits<5> fd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = fd;
+  let Inst{10-9}  = 0x0;
+  let Inst{8-0}   = funct;
+}
+
+class SWXC1_FM_MM<bits<9> funct> : MMArch {
+  bits<5> fs;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = fs;
+  let Inst{10-9}  = 0x0;
+  let Inst{8-0}   = funct;
+}
+
+class CEQS_FM_MM<bits<2> fmt> : MMArch {
+  bits<5> fs;
+  bits<5> ft;
+  bits<4> cond;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-13} = 0x0;  // cc
+  let Inst{12}    = 0;
+  let Inst{11-10} = fmt;
+  let Inst{9-6}   = cond;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BC1F_FM_MM<bits<5> tf> : MMArch {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = tf;
+  let Inst{20-18} = 0x0; // cc
+  let Inst{17-16} = 0x0;
+  let Inst{15-0}  = offset;
+}
+
+class ROUND_W_FM_MM<bits<1> fmt, bits<8> funct> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14}    = fmt;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class ABS_FM_MM<bits<2> fmt, bits<7> funct> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15}    = 0;
+  let Inst{14-13} = fmt;
+  let Inst{12-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class CMov_F_F_FM_MM<bits<9> func, bits<2> fmt> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = fd;
+  let Inst{20-16} = fs;
+  let Inst{15-13} = 0x0; //cc
+  let Inst{12-11} = 0x0;
+  let Inst{10-9}  = fmt;
+  let Inst{8-0}   = func;
+}
+
+class CMov_I_F_FM_MM<bits<8> funct, bits<2> fmt> : MMArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{9-8}   = fmt;
+  let Inst{7-0}   = funct;
+}
+
+class MFC1_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = fs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MADDS_FM_MM<bits<6> funct>: MMArch {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+  bits<5> fr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = ft;
+  let Inst{20-16} = fs;
+  let Inst{15-11} = fd;
+  let Inst{10-6}  = fr;
+  let Inst{5-0}   = funct;
+}
+
+class COMPACT_BRANCH_FM_MM<bits<5> funct> {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class COP0_TLB_FM_MM<bits<10> op> : MMArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = 0x0;
+  let Inst{15-6}  = op;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SDBBP_FM_MM : MMArch {
+  bits<10> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x36d;
+  let Inst{5-0}   = 0x3c;
+}
+
+class RDHWR_FM_MM : MMArch {
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = 0x1ac;
+  let Inst{5-0}   = 0x3c;
+}
+
+class LWXS_FM_MM<bits<10> funct> {
+  bits<5> rd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0;
+  let Inst{9-0}   = funct;
+}
+
+class LWM_FM_MM<bits<4> funct> : MMArch {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x8;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl {
+  bits<2> rt;
+  bits<4> addr;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-6}   = funct;
+  let Inst{5-4}   = rt;
+  let Inst{3-0}   = addr;
+}
+
+class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<12> offset = addr{11-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = hint;
+  let Inst{20-16} = base;
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = offset;
+}
+
+class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = hint;
+  let Inst{20-16} = base;
+  let Inst{15-12} = 0xA;
+  let Inst{11-9} = funct;
+  let Inst{8-0}  = offset;
+}
+
+class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch {
+  bits<5> index;
+  bits<5> base;
+  bits<5> hint;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = index;
+  let Inst{20-16} = base;
+  let Inst{15-11} = hint;
+  let Inst{10-9}  = 0x0;
+  let Inst{8-0}   = funct;
+}
+
+class BARRIER_FM_MM<bits<5> op> : MMArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = 0x0;
+  let Inst{20-16} = 0x0;
+  let Inst{15-11} = op;
+  let Inst{10-6}  = 0x0;
+  let Inst{5-0}   = 0x0;
+}
+
+class ADDIUPC_FM_MM {
+  bits<3> rs;
+  bits<23> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1e;
+  let Inst{25-23} = rs;
+  let Inst{22-0} = imm;
+}
+
+class POOL32A_CFTC2_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rt;
+  bits<5> impl;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000000;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = impl;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0b111100;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
new file mode 100644
index 000000000000..c0de9e7390a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -0,0 +1,1190 @@
+def addrimm11 : ComplexPattern<iPTR, 2, "selectIntAddr11MM", [frameindex]>;
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddr12MM", [frameindex]>;
+def addrimm16 : ComplexPattern<iPTR, 2, "selectIntAddr16MM", [frameindex]>;
+def addrimm4lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrLSL2MM", [frameindex]>;
+
+def simm9_addiusp : Operand<i32> {
+  let EncoderMethod = "getSImm9AddiuspValue";
+  let DecoderMethod = "DecodeSimm9SP";
+}
+
+def uimm3_shift : Operand<i32> {
+  let EncoderMethod = "getUImm3Mod8Encoding";
+  let DecoderMethod = "DecodePOOL16BEncodedField";
+}
+
+def simm3_lsa2 : Operand<i32> {
+  let EncoderMethod = "getSImm3Lsa2Value";
+  let DecoderMethod = "DecodeAddiur2Simm7";
+}
+
+def uimm4_andi : Operand<i32> {
+  let EncoderMethod = "getUImm4AndValue";
+  let DecoderMethod = "DecodeANDI16Imm";
+}
+
+def immSExtAddiur2 : ImmLeaf<i32, [{return Imm == 1 || Imm == -1 ||
+                                           ((Imm % 4 == 0) &&
+                                            Imm < 28 && Imm > 0);}]>;
+
+def immSExtAddius5 : ImmLeaf<i32, [{return Imm >= -8 && Imm <= 7;}]>;
+
+def immZExtAndi16 : ImmLeaf<i32,
+  [{return (Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+            Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+            Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535 );}]>;
+
+def immZExt2Shift : ImmLeaf<i32, [{return Imm >= 1 && Imm <= 8;}]>;
+
+def immLi16 : ImmLeaf<i32, [{return Imm >= -1 && Imm <= 126;}]>;
+
+def MicroMipsMemGPRMM16AsmOperand : AsmOperandClass {
+  let Name = "MicroMipsMem";
+  let RenderMethod = "addMicroMipsMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithGRPMM16Base";
+}
+
+// Define the classes of pointers used by microMIPS.
+// The numbers must match those in MipsRegisterInfo::MipsPtrClass.
+def ptr_gpr16mm_rc : PointerLikeRegClass<1>;
+def ptr_sp_rc : PointerLikeRegClass<2>;
+def ptr_gp_rc : PointerLikeRegClass<3>;
+
+class mem_mm_4_generic : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_gpr16mm_rc, simm4);
+  let OperandType = "OPERAND_MEMORY";
+  let ParserMatchClass = MicroMipsMemGPRMM16AsmOperand;
+}
+
+def mem_mm_4 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4";
+}
+
+def mem_mm_4_lsl1 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4Lsl1";
+}
+
+def mem_mm_4_lsl2 : mem_mm_4_generic {
+  let EncoderMethod = "getMemEncodingMMImm4Lsl2";
+}
+
+def MicroMipsMemSPAsmOperand : AsmOperandClass {
+  let Name = "MicroMipsMemSP";
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithUimmWordAlignedOffsetSP<7>";
+}
+
+def MicroMipsMemGPAsmOperand : AsmOperandClass {
+  let Name = "MicroMipsMemGP";
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmWordAlignedOffsetGP<9>";
+}
+
+def mem_mm_sp_imm5_lsl2 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_sp_rc:$base, simm5:$offset);
+  let OperandType = "OPERAND_MEMORY";
+  let ParserMatchClass = MicroMipsMemSPAsmOperand;
+  let EncoderMethod = "getMemEncodingMMSPImm5Lsl2";
+}
+
+def mem_mm_gp_simm7_lsl2 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_gp_rc:$base, simm7_lsl2:$offset);
+  let OperandType = "OPERAND_MEMORY";
+  let ParserMatchClass = MicroMipsMemGPAsmOperand;
+  let EncoderMethod = "getMemEncodingMMGPImm7Lsl2";
+}
+
+def mem_mm_9 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, simm9);
+  let EncoderMethod = "getMemEncodingMMImm9";
+  let ParserMatchClass = MipsMemSimm9AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_11 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32, simm11);
+  let EncoderMethod = "getMemEncodingMMImm11";
+  let ParserMatchClass = MipsMemSimm11AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_12 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, simm12);
+  let EncoderMethod = "getMemEncodingMMImm12";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_mm_16 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncodingMMImm16";
+  let ParserMatchClass = MipsMemSimm16AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def MipsMemUimm4AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetUimm4";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithUimmOffsetSP<6>";
+}
+
+def mem_mm_4sp : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_sp_rc, uimm8);
+  let EncoderMethod = "getMemEncodingMMImm4sp";
+  let ParserMatchClass = MipsMemUimm4AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget7_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget7OpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget7MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget10_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMMPC10";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget10MM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTargetMM";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def simm23_lsl2 : Operand<i32> {
+  let EncoderMethod = "getSimm23Lsl2Encoding";
+  let DecoderMethod = "DecodeSimm23Lsl2";
+}
+
+class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op,
+                      RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 0;
+  let Defs = [AT];
+}
+
+let canFoldAsLoad = 1 in
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                      Operand MemOpnd, InstrItinClass Itin> :
+  InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+         Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                       Operand MemOpnd, InstrItinClass Itin>:
+  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addrimm12:$addr)], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+}
+
+/// A register pair used by movep instruction.
+def MovePRegPairAsmOperand : AsmOperandClass {
+  let Name = "MovePRegPair";
+  let ParserMethod = "parseMovePRegPair";
+  let PredicateMethod = "isMovePRegPair";
+}
+
+def movep_regpair : Operand<i32> {
+  let EncoderMethod = "getMovePRegPairOpValue";
+  let ParserMatchClass = MovePRegPairAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeMovePRegPair";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+
+class MovePMM16<string opstr, RegisterOperand RO> :
+MicroMipsInst16<(outs movep_regpair:$dst_regs), (ins RO:$rs, RO:$rt),
+                 !strconcat(opstr, "\t$dst_regs, $rs, $rt"), [],
+                 NoItinerary, FrmR> {
+  let isReMaterializable = 1;
+}
+
+/// A register pair used by load/store pair instructions.
+def RegPairAsmOperand : AsmOperandClass {
+  let Name = "RegPair";
+  let ParserMethod = "parseRegisterPair";
+  let PredicateMethod = "isRegPair";
+}
+
+def regpair : Operand<i32> {
+  let EncoderMethod = "getRegisterPairOpValue";
+  let ParserMatchClass = RegPairAsmOperand;
+  let PrintMethod = "printRegisterPair";
+  let DecoderMethod = "DecodeRegPairOperand";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+
+class StorePairMM<string opstr, ComplexPattern Addr = addr>
+    :  InstSE<(outs), (ins regpair:$rt, mem_simm12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_SWP, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+}
+
+class LoadPairMM<string opstr, ComplexPattern Addr = addr>
+    : InstSE<(outs regpair:$rt), (ins mem_simm12:$addr),
+          !strconcat(opstr, "\t$rt, $addr"), [], II_LWP, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayLoad = 1;
+}
+
+class LLBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_LL, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayLoad = 1;
+}
+
+class LLEBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_simm9:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_LLE, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm9";
+  let mayLoad = 1;
+}
+
+class SCBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_SC, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class SCEBaseMM<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem_simm9:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_SCE, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm9";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+             InstrItinClass Itin = NoItinerary, DAGOperand MO = mem_mm_12> :
+  InstSE<(outs RO:$rt), (ins MO:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr))], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class ArithRMM16<string opstr, RegisterOperand RO, bit isComm = 0,
+                 InstrItinClass Itin = NoItinerary,
+                 SDPatternOperator OpNode = null_frag> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+                  !strconcat(opstr, "\t$rd, $rs, $rt"),
+                  [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+  let isCommutable = isComm;
+}
+
+class AndImmMM16<string opstr, RegisterOperand RO,
+                 InstrItinClass Itin = NoItinerary> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, uimm4_andi:$imm),
+                  !strconcat(opstr, "\t$rd, $rs, $imm"), [], Itin, FrmI>;
+
+class LogicRMM16<string opstr, RegisterOperand RO,
+                 InstrItinClass Itin = NoItinerary,
+                 SDPatternOperator OpNode = null_frag> :
+  MicroMipsInst16<(outs RO:$dst), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rt, $rs"),
+         [(set RO:$dst, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+  let isCommutable = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class NotMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs RO:$rt), (ins RO:$rs),
+         !strconcat(opstr, "\t$rt, $rs"),
+         [(set RO:$rt, (not RO:$rs))], II_NOT, FrmR>;
+
+class ShiftIMM16<string opstr, Operand ImmOpnd, RegisterOperand RO,
+                 InstrItinClass Itin = NoItinerary> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+                  !strconcat(opstr, "\t$rd, $rt, $shamt"), [], Itin, FrmR>;
+
+class LoadMM16<string opstr, DAGOperand RO, SDPatternOperator OpNode,
+               InstrItinClass Itin, Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm4";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class StoreMM16<string opstr, DAGOperand RTOpnd, DAGOperand RO,
+                SDPatternOperator OpNode, InstrItinClass Itin,
+                Operand MemOpnd> :
+  MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm4";
+  let mayStore = 1;
+}
+
+class LoadSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                 Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class StoreSPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                  Operand MemOpnd> :
+  MicroMipsInst16<(outs), (ins RO:$rt, MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMSPImm5Lsl2";
+  let mayStore = 1;
+}
+
+class LoadGPMM16<string opstr, DAGOperand RO, InstrItinClass Itin,
+                 Operand MemOpnd> :
+  MicroMipsInst16<(outs RO:$rt), (ins MemOpnd:$offset),
+                  !strconcat(opstr, "\t$rt, $offset"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMGPImm7Lsl2";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class AddImmUR2<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, simm3_lsa2:$imm),
+                  !strconcat(opstr, "\t$rd, $rs, $imm"),
+                  [], II_ADDIU, FrmR> {
+  let isCommutable = 1;
+}
+
+class AddImmUS5<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs RO:$dst), (ins RO:$rd, simm4:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], II_ADDIU, FrmR> {
+  let Constraints = "$rd = $dst";
+}
+
+class AddImmUR1SP<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs RO:$rd), (ins uimm6_lsl2:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], II_ADDIU, FrmR>;
+
+class AddImmUSP<string opstr> :
+  MicroMipsInst16<(outs), (ins simm9_addiusp:$imm),
+                  !strconcat(opstr, "\t$imm"), [], II_ADDIU, FrmI>;
+
+class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
+      MicroMipsInst16<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"),
+  [], II_MFHI_MFLO, FrmR> {
+  let Uses = [UseReg];
+  let hasSideEffects = 0;
+}
+
+class MoveMM16<string opstr, RegisterOperand RO>
+    :  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs),
+                       !strconcat(opstr, "\t$rd, $rs"), [], II_MOVE, FrmR> {
+  let isReMaterializable = 1;
+}
+
+class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> :
+  MicroMipsInst16<(outs RO:$rd), (ins Od:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], II_LI, FrmI> {
+  let isReMaterializable = 1;
+}
+
+// 16-bit Jump and Link (Call)
+class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl {
+  let isCall = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+}
+
+// 16-bit Jump Reg
+class JumpRegMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [], II_JR, FrmR> {
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Base class for JRADDIUSP instruction.
+class JumpRAddiuStackMM16 :
+  MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm",
+                  [], II_JRADDIUSP, FrmR> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// 16-bit Jump and Link (Call) - Short Delay Slot
+class JumpLinkRegSMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [], II_JALRS, FrmR> {
+  let isCall = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+}
+
+// 16-bit Jump Register Compact - No delay slot
+class JumpRegCMM16<string opstr, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+                  [], II_JRC, FrmR> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Break16 and Sdbbp16
+class BrkSdbbp16MM<string opstr, InstrItinClass Itin> :
+  MicroMipsInst16<(outs), (ins uimm4:$code_),
+                  !strconcat(opstr, "\t$code_"),
+                  [], Itin, FrmOther>;
+
+class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> :
+  MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset),
+                  !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+// MicroMIPS Jump and Link (Call) - Short Delay Slot
+let isCall = 1, hasDelaySlot = 1, Defs = [RA] in {
+  class JumpLinkMM<string opstr, DAGOperand opnd> :
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [], II_JALS, FrmJ, opstr> {
+    let DecoderMethod = "DecodeJumpTargetMM";
+  }
+
+  class JumpLinkRegMM<string opstr, RegisterOperand RO>:
+    InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+            [], II_JALRS, FrmR>;
+
+  class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd,
+                                  RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>;
+}
+
+class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO,
+                              SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$rd, ${index}(${base})"), [], II_LWXS, FrmFI>;
+
+class PrefetchIndexed<string opstr> :
+  InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint),
+         !strconcat(opstr, "\t$hint, ${index}(${base})"), [], II_PREF, FrmOther>;
+
+class AddImmUPC<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm),
+         !strconcat(opstr, "\t$rs, $imm"), [], II_ADDIU, FrmR>;
+
+/// A list of registers used by load/store multiple instructions.
+def RegListAsmOperand : AsmOperandClass {
+  let Name = "RegList";
+  let ParserMethod = "parseRegisterList";
+}
+
+def reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
+}
+
+def RegList16AsmOperand : AsmOperandClass {
+  let Name = "RegList16";
+  let ParserMethod = "parseRegisterList";
+  let PredicateMethod = "isRegList16";
+  let RenderMethod = "addRegListOperands";
+}
+
+def reglist16 : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue16";
+  let DecoderMethod = "DecodeRegListOperand16";
+  let PrintMethod = "printRegisterList";
+  let ParserMatchClass = RegList16AsmOperand;
+}
+
+class StoreMultMM<string opstr,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+}
+
+class LoadMultMM<string opstr,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs reglist:$rt), (ins mem_mm_12:$addr),
+          !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayLoad = 1;
+}
+
+class StoreMultMM16<string opstr,
+                    InstrItinClass Itin = NoItinerary,
+                    ComplexPattern Addr = addr> :
+  MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayStore = 1;
+}
+
+class LoadMultMM16<string opstr,
+                   InstrItinClass Itin = NoItinerary,
+                   ComplexPattern Addr = addr> :
+  MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
+                  !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI> {
+  let DecoderMethod = "DecodeMemMMReglistImm4Lsl2";
+  let mayLoad = 1;
+}
+
+class UncondBranchMM16<string opstr> :
+  MicroMipsInst16<(outs), (ins brtarget10_mm:$offset),
+                  !strconcat(opstr, "\t$offset"),
+                  [], II_B, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Predicates = [RelocPIC, InMicroMips];
+  let Defs = [AT];
+}
+
+def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+    ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
+def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+    LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6;
+def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>,
+    ISA_MICROMIPS_NOT_32R6_64R6;
+def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>,
+    ISA_MICROMIPS_NOT_32R6_64R6;
+def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>,
+    ISA_MICROMIPS_NOT_32R6_64R6;
+def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+    SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6;
+def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+    SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+
+def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+                ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6;
+def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+               LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6;
+def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU,
+                        mem_mm_4>, LOAD_STORE_FM_MM16<0x02>;
+def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU,
+                        mem_mm_4_lsl1>, LOAD_STORE_FM_MM16<0x0a>;
+def LW16_MM : LoadMM16<"lw16", GPRMM16Opnd, load, II_LW, mem_mm_4_lsl2>,
+                      LOAD_STORE_FM_MM16<0x1a>;
+def SB16_MM : StoreMM16<"sb16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei8,
+                        II_SB, mem_mm_4>, LOAD_STORE_FM_MM16<0x22>;
+def SH16_MM : StoreMM16<"sh16", GPRMM16OpndZero, GPRMM16Opnd, truncstorei16,
+                        II_SH, mem_mm_4_lsl1>,
+                        LOAD_STORE_FM_MM16<0x2a>;
+def SW16_MM : StoreMM16<"sw16", GPRMM16OpndZero, GPRMM16Opnd, store, II_SW,
+                        mem_mm_4_lsl2>, LOAD_STORE_FM_MM16<0x3a>;
+def LWGP_MM : LoadGPMM16<"lw", GPRMM16Opnd, II_LW, mem_mm_gp_simm7_lsl2>,
+                         LOAD_GP_FM_MM16<0x19>;
+def LWSP_MM : LoadSPMM16<"lw", GPR32Opnd, II_LW, mem_mm_sp_imm5_lsl2>,
+              LOAD_STORE_SP_FM_MM16<0x12>;
+def SWSP_MM : StoreSPMM16<"sw", GPR32Opnd, II_SW, mem_mm_sp_imm5_lsl2>,
+              LOAD_STORE_SP_FM_MM16<0x32>;
+def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16;
+def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
+def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
+def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
+def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
+def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
+def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16;
+def LI16_MM : LoadImmMM16<"li16", li16_imm, GPRMM16Opnd>, LI_FM_MM16,
+              IsAsCheapAsAMove;
+def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>,
+                ISA_MICROMIPS32_NOT_MIPS32R6;
+def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
+def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
+def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
+def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
+def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>,
+                BEQNEZ_FM_MM16<0x23>;
+def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>,
+                BEQNEZ_FM_MM16<0x2b>;
+def B16_MM : UncondBranchMM16<"b16">, B16_FM;
+def BREAK16_MM : BrkSdbbp16MM<"break16", II_BREAK>, BRKSDBBP16_FM_MM<0x28>,
+    ISA_MICROMIPS_NOT_32R6_64R6;
+def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16", II_SDBBP>, BRKSDBBP16_FM_MM<0x2C>,
+    ISA_MICROMIPS_NOT_32R6_64R6;
+
+let DecoderNamespace = "MicroMips" in {
+  /// Load and Store Instructions - multiple
+  def SWM16_MM : StoreMultMM16<"swm16", II_SWM>, LWM_FM_MM16<0x5>,
+                 ISA_MICROMIPS32_NOT_MIPS32R6;
+  def LWM16_MM : LoadMultMM16<"lwm16", II_LWM>, LWM_FM_MM16<0x4>,
+                 ISA_MICROMIPS32_NOT_MIPS32R6;
+  let AdditionalPredicates = [InMicroMips] in {
+    def CFC2_MM : InstSE<(outs GPR32Opnd:$rt), (ins COP2Opnd:$impl),
+                         "cfc2\t$rt, $impl", [], II_CFC2, FrmFR, "cfc2">,
+                  POOL32A_CFTC2_FM_MM<0b1100110100>;
+    def CTC2_MM : InstSE<(outs COP2Opnd:$impl), (ins GPR32Opnd:$rt),
+                         "ctc2\t$rt, $impl", [], II_CTC2, FrmFR, "ctc2">,
+                  POOL32A_CFTC2_FM_MM<0b1101110100>;
+  }
+}
+
+class WaitMM<string opstr> :
+  InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
+         II_WAIT, FrmOther, opstr>;
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips, NotMips32r6,
+                                                  NotMips64r6] in {
+  /// Compact Branch Instructions
+  def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
+                 COMPACT_BRANCH_FM_MM<0x7>;
+  def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
+                 COMPACT_BRANCH_FM_MM<0x5>;
+}
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+  /// Arithmetic Instructions (ALU Immediate)
+  def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU>,
+                 ADDI_FM_MM<0xc>;
+  def ADDi_MM  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd, II_ADDI>,
+                 ADDI_FM_MM<0x4>;
+  def SLTi_MM  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+                 SLTI_FM_MM<0x24>;
+  def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+                 SLTI_FM_MM<0x2c>;
+  def ANDi_MM  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI>,
+                 ADDI_FM_MM<0x34>;
+  def ORi_MM   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
+                                    or>, ADDI_FM_MM<0x14>;
+  def XORi_MM  : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI,
+                                    immZExt16, xor>, ADDI_FM_MM<0x1c>;
+  def LUi_MM   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM_MM;
+
+  def LEA_ADDiu_MM : MMRel, EffectiveAddress<"addiu", GPR32Opnd>,
+                     LW_FM_MM<0xc>;
+
+  /// Arithmetic Instructions (3-Operand, R-Type)
+  def ADDu_MM  : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
+                 ADD_FM_MM<0, 0x150>;
+  def SUBu_MM  : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+                 ADD_FM_MM<0, 0x1d0>;
+  def MUL_MM   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL>,
+                 ADD_FM_MM<0, 0x210>;
+  def ADD_MM   : MMRel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>,
+                 ADD_FM_MM<0, 0x110>;
+  def SUB_MM   : MMRel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>,
+                 ADD_FM_MM<0, 0x190>;
+  def SLT_MM   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+  def SLTu_MM  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
+                 ADD_FM_MM<0, 0x390>;
+  def AND_MM   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+                 ADD_FM_MM<0, 0x250>;
+  def OR_MM    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+                 ADD_FM_MM<0, 0x290>;
+  def XOR_MM   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+                 ADD_FM_MM<0, 0x310>;
+  def NOR_MM   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+  def MULT_MM  : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x22c>;
+  def MULTu_MM : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x26c>;
+  def SDIV_MM  : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ac>, ISA_MIPS1_NOT_32R6_64R6;
+  def UDIV_MM  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ec>, ISA_MIPS1_NOT_32R6_64R6;
+
+  /// Arithmetic Instructions with PC and Immediate
+  def ADDIUPC_MM : AddImmUPC<"addiupc", GPRMM16Opnd>, ADDIUPC_FM_MM;
+
+  /// Shift Instructions
+  def SLL_MM   : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>,
+                 SRA_FM_MM<0, 0>;
+  def SRL_MM   : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL>,
+                 SRA_FM_MM<0x40, 0>;
+  def SRA_MM   : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA>,
+                 SRA_FM_MM<0x80, 0>;
+  def SLLV_MM  : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV>,
+                 SRLV_FM_MM<0x10, 0>;
+  def SRLV_MM  : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV>,
+                 SRLV_FM_MM<0x50, 0>;
+  def SRAV_MM  : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV>,
+                 SRLV_FM_MM<0x90, 0>;
+  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR>,
+                 SRA_FM_MM<0xc0, 0> {
+    list<dag> Pattern = [(set GPR32Opnd:$rd,
+                          (rotr GPR32Opnd:$rt, immZExt5:$shamt))];
+  }
+  def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV>,
+                 SRLV_FM_MM<0xd0, 0> {
+    list<dag> Pattern = [(set GPR32Opnd:$rd,
+                          (rotr GPR32Opnd:$rt, GPR32Opnd:$rs))];
+  }
+
+  /// Load and Store Instructions - aligned
+  let DecoderMethod = "DecodeMemMMImm16" in {
+    def LB_MM  : LoadMemory<"lb", GPR32Opnd, mem_mm_16, null_frag, II_LB>,
+                 MMRel, LW_FM_MM<0x7>;
+    def LBu_MM : LoadMemory<"lbu", GPR32Opnd, mem_mm_16, null_frag, II_LBU>,
+                 MMRel, LW_FM_MM<0x5>;
+    def LH_MM  : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
+                            addrDefault>, MMRel, LW_FM_MM<0xf>;
+    def LHu_MM : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+                 MMRel, LW_FM_MM<0xd>;
+    def LW_MM  : Load<"lw", GPR32Opnd, null_frag, II_LW>, MMRel, LW_FM_MM<0x3f>;
+    def SB_MM  : Store<"sb", GPR32Opnd, null_frag, II_SB>, MMRel,
+                 LW_FM_MM<0x6>;
+    def SH_MM  : Store<"sh", GPR32Opnd, null_frag, II_SH>, MMRel,
+                 LW_FM_MM<0xe>;
+    def SW_MM  : Store<"sw", GPR32Opnd, null_frag, II_SW>, MMRel,
+                 LW_FM_MM<0x3e>;
+  }
+
+  let DecoderMethod = "DecodeMemMMImm9" in {
+    def LBE_MM  : Load<"lbe", GPR32Opnd, null_frag, II_LBE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>;
+    def LBuE_MM : Load<"lbue", GPR32Opnd, null_frag, II_LBUE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>;
+    def LHE_MM  : LoadMemory<"lhe", GPR32Opnd, mem_simm9, null_frag, II_LHE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>;
+    def LHuE_MM : LoadMemory<"lhue", GPR32Opnd, mem_simm9, null_frag, II_LHUE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>;
+    def LWE_MM  : LoadMemory<"lwe", GPR32Opnd, mem_simm9, null_frag, II_LWE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>;
+    def SBE_MM  : StoreMemory<"sbe", GPR32Opnd, mem_simm9, null_frag, II_SBE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>;
+    def SHE_MM  : StoreMemory<"she", GPR32Opnd, mem_simm9, null_frag, II_SHE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>;
+    def SWE_MM  : StoreMemory<"swe", GPR32Opnd, mem_simm9, null_frag, II_SWE>,
+                  POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>;
+  }
+
+  def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
+
+  /// Load and Store Instructions - unaligned
+  def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12, II_LWL>,
+               LWL_FM_MM<0x0>;
+  def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12, II_LWR>,
+               LWL_FM_MM<0x1>;
+  def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12, II_SWL>,
+               LWL_FM_MM<0x8>;
+  def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12, II_SWR>,
+               LWL_FM_MM<0x9>;
+  let DecoderMethod = "DecodeMemMMImm9" in {
+    def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_9,
+                                  II_LWLE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>;
+    def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_9,
+                                  II_LWRE>, POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>;
+    def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_9,
+                                   II_SWLE>,
+                  POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>;
+    def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_9,
+                                   II_SWRE>,
+                  POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6;
+  }
+
+  /// Load and Store Instructions - multiple
+  def SWM32_MM  : StoreMultMM<"swm32", II_SWM>, LWM_FM_MM<0xd>;
+  def LWM32_MM  : LoadMultMM<"lwm32", II_LWM>, LWM_FM_MM<0x5>;
+
+  /// Load and Store Pair Instructions
+  def SWP_MM  : StorePairMM<"swp">, LWM_FM_MM<0x9>;
+  def LWP_MM  : LoadPairMM<"lwp">, LWM_FM_MM<0x1>;
+
+  /// Load and Store multiple pseudo Instructions
+  class LoadWordMultMM<string instr_asm > :
+    MipsAsmPseudoInst<(outs reglist:$rt), (ins mem_mm_12:$addr),
+                      !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+  class StoreWordMultMM<string instr_asm > :
+    MipsAsmPseudoInst<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+                      !strconcat(instr_asm, "\t$rt, $addr")> ;
+
+
+  def SWM_MM  : StoreWordMultMM<"swm">;
+  def LWM_MM  : LoadWordMultMM<"lwm">;
+
+  /// Move Conditional
+  def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x58>;
+  def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x18>;
+  def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT>,
+                  CMov_F_I_FM_MM<0x25>;
+  def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF>,
+                  CMov_F_I_FM_MM<0x5>;
+
+  /// Move to/from HI/LO
+  def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
+                MTLO_FM_MM<0x0b5>;
+  def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+                MTLO_FM_MM<0x0f5>;
+  def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x035>;
+  def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x075>;
+
+  /// Multiply Add/Sub Instructions
+  def MADD_MM  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM_MM<0x32c>;
+  def MADDU_MM : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM_MM<0x36c>;
+  def MSUB_MM  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM_MM<0x3ac>;
+  def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
+
+  /// Count Leading
+  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM_MM<0x16c>,
+               ISA_MIPS32;
+  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM_MM<0x12c>,
+               ISA_MIPS32;
+
+  /// Sign Ext In Register Instructions.
+  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+               SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+               SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
+
+  /// Word Swap Bytes Within Halfwords
+  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>,
+                SEB_FM_MM<0x1ec>, ISA_MIPS32R2;
+  // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+  def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, immZExt5,
+                              immZExt5Plus1, MipsExt>, EXT_FM_MM<0x2c>;
+  def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, uimm5_inssize_plus1,
+                              MipsIns>, EXT_FM_MM<0x0c>;
+
+  /// Jump Instructions
+  let DecoderMethod = "DecodeJumpTargetMM" in {
+    def J_MM        : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+                      J_FM_MM<0x35>;
+    def JAL_MM      : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+    def JALX_MM     : MMRel, JumpLink<"jalx", calltarget>, J_FM_MM<0x3c>;
+  }
+  def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>,
+              ISA_MICROMIPS32_NOT_MIPS32R6;
+  def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+
+  /// Jump Instructions - Short Delay Slot
+  def JALS_MM   : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>;
+  def JALRS_MM  : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>;
+
+  /// Branch Instructions
+  def BEQ_MM  : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+                BEQ_FM_MM<0x25>;
+  def BNE_MM  : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+                BEQ_FM_MM<0x2d>;
+  def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+                BGEZ_FM_MM<0x2>;
+  def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+                BGEZ_FM_MM<0x6>;
+  def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+                BGEZ_FM_MM<0x4>;
+  def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+                BGEZ_FM_MM<0x0>;
+  def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x03>;
+  def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x01>;
+
+  /// Branch Instructions - Short Delay Slot
+  def BGEZALS_MM : BranchCompareToZeroLinkMM<"bgezals", brtarget_mm,
+                                             GPR32Opnd>, BGEZAL_FM_MM<0x13>;
+  def BLTZALS_MM : BranchCompareToZeroLinkMM<"bltzals", brtarget_mm,
+                                             GPR32Opnd>, BGEZAL_FM_MM<0x11>;
+
+  /// Control Instructions
+  def SYNC_MM    : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
+  def SYNCI_MM   : MMRel, SYNCI_FT<"synci">, SYNCI_FM_MM;
+  def BREAK_MM   : MMRel, BRK_FT<"break">, BRK_FM_MM;
+  def SYSCALL_MM : MMRel, SYS_FT<"syscall", uimm10, II_SYSCALL>, SYS_FM_MM;
+  def WAIT_MM    : WaitMM<"wait">, WAIT_FM_MM;
+  def ERET_MM    : MMRel, ER_FT<"eret", II_ERET>, ER_FM_MM<0x3cd>;
+  def DERET_MM   : MMRel, ER_FT<"deret", II_DERET>, ER_FM_MM<0x38d>;
+  def EI_MM      : MMRel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM_MM<0x15d>,
+                   ISA_MIPS32R2;
+  def DI_MM      : MMRel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM_MM<0x11d>,
+                   ISA_MIPS32R2;
+
+  /// Trap Instructions
+  def TEQ_MM  : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm4, II_TEQ>, TEQ_FM_MM<0x0>;
+  def TGE_MM  : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm4, II_TGE>, TEQ_FM_MM<0x08>;
+  def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm4, II_TGEU>,
+                TEQ_FM_MM<0x10>;
+  def TLT_MM  : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm4, II_TLT>, TEQ_FM_MM<0x20>;
+  def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm4, II_TLTU>,
+                TEQ_FM_MM<0x28>;
+  def TNE_MM  : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm4, II_TNE>, TEQ_FM_MM<0x30>;
+
+  def TEQI_MM  : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM_MM<0x0e>;
+  def TGEI_MM  : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM_MM<0x09>;
+  def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>,
+                 TEQI_FM_MM<0x0b>;
+  def TLTI_MM  : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM_MM<0x08>;
+  def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>,
+                 TEQI_FM_MM<0x0a>;
+  def TNEI_MM  : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM_MM<0x0c>;
+
+  /// Load-linked, Store-conditional
+  def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
+  def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+
+  def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>;
+  def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>;
+
+  let DecoderMethod = "DecodeCacheOpMM" in {
+  def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12, II_CACHE>,
+                 CACHE_PREF_FM_MM<0x08, 0x6>;
+  def PREF_MM  : MMRel, CacheOp<"pref", mem_mm_12, II_PREF>,
+                 CACHE_PREF_FM_MM<0x18, 0x2>;
+  }
+
+  let DecoderMethod = "DecodePrefeOpMM" in {
+    def PREFE_MM  : MMRel, CacheOp<"prefe", mem_mm_9, II_PREFE>,
+                    CACHE_PREFE_FM_MM<0x18, 0x2>;
+    def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9, II_CACHEE>,
+                    CACHE_PREFE_FM_MM<0x18, 0x3>;
+  }
+  def SSNOP_MM : MMRel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM_MM<0x1>;
+  def EHB_MM   : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM_MM<0x3>;
+  def PAUSE_MM : MMRel, Barrier<"pause", II_PAUSE>, BARRIER_FM_MM<0x5>;
+
+  def TLBP_MM : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM_MM<0x0d>;
+  def TLBR_MM : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM_MM<0x4d>;
+  def TLBWI_MM : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM_MM<0x8d>;
+  def TLBWR_MM : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM_MM<0xcd>;
+
+  def SDBBP_MM : MMRel, SYS_FT<"sdbbp", uimm10, II_SDBBP>, SDBBP_FM_MM;
+
+  def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>;
+}
+
+def TAILCALL_MM : TailCall<J_MM, jmptarget_mm>, ISA_MIPS1_NOT_32R6_64R6;
+
+let DecoderNamespace = "MicroMips" in {
+  def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>,
+                 RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6;
+  def LWU_MM : MMRel, LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU,
+                             mem_simm12>, LL_FM_MM<0xe>,
+               ISA_MICROMIPS32_NOT_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroMips arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(i32 immLi16:$imm),
+              (LI16_MM immLi16:$imm)>;
+
+let AdditionalPredicates = [InMicroMips] in
+defm :  MaterializeImms<i32, ZERO, ADDiu_MM, LUi_MM, ORi_MM>;
+
+let Predicates = [InMicroMips] in {
+  def : MipsPat<(i32 immLi16:$imm),
+                (LI16_MM immLi16:$imm)>;
+  def : MipsPat<(i32 immSExt16:$imm),
+                (ADDiu_MM ZERO, immSExt16:$imm)>;
+  def : MipsPat<(i32 immZExt16:$imm),
+                (ORi_MM ZERO, immZExt16:$imm)>;
+
+  def : MipsPat<(not GPRMM16:$in),
+                (NOT16_MM GPRMM16:$in)>;
+  def : MipsPat<(not GPR32:$in),
+                (NOR_MM GPR32Opnd:$in, ZERO)>;
+
+  def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm),
+                (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
+  def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm),
+                (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
+  def : MipsPat<(add GPR32:$src, immSExt16:$imm),
+                (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
+
+  def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
+                (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
+  def : MipsPat<(and GPR32:$src, immZExt16:$imm),
+                (ANDi_MM GPR32:$src, immZExt16:$imm)>;
+
+  def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm),
+                (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+  def : MipsPat<(shl GPR32:$src, immZExt5:$imm),
+                (SLL_MM GPR32:$src, immZExt5:$imm)>;
+  def : MipsPat<(shl GPR32:$lhs, GPR32:$rhs),
+                (SLLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+  def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm),
+                (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+  def : MipsPat<(srl GPR32:$src, immZExt5:$imm),
+                (SRL_MM GPR32:$src, immZExt5:$imm)>;
+  def : MipsPat<(srl GPR32:$lhs, GPR32:$rhs),
+                (SRLV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+  def : MipsPat<(sra GPR32:$src, immZExt5:$imm),
+                (SRA_MM GPR32:$src, immZExt5:$imm)>;
+  def : MipsPat<(sra GPR32:$lhs, GPR32:$rhs),
+                (SRAV_MM GPR32:$lhs, GPR32:$rhs)>;
+
+  def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr),
+                (SW16_MM GPRMM16:$src, addrimm4lsl2:$addr)>;
+  def : MipsPat<(store GPR32:$src, addr:$addr),
+                (SW_MM GPR32:$src, addr:$addr)>;
+
+  def : MipsPat<(load addrimm4lsl2:$addr),
+                (LW16_MM addrimm4lsl2:$addr)>;
+  def : MipsPat<(load addr:$addr),
+                (LW_MM addr:$addr)>;
+  def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+                (SUBu_MM GPR32:$lhs, GPR32:$rhs)>;
+
+  def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+                (TAILCALL_MM tglobaladdr:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
+  def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+                (TAILCALL_MM texternalsym:$dst)>, ISA_MIPS1_NOT_32R6_64R6;
+}
+
+let AddedComplexity = 40 in {
+  def : MipsPat<(i32 (sextloadi16 addrRegImm:$a)),
+                (LH_MM addrRegImm:$a)>;
+}
+def : MipsPat<(atomic_load_16 addr:$a),
+              (LH_MM addr:$a)>;
+def : MipsPat<(i32 (extloadi16 addr:$src)),
+              (LHu_MM addr:$src)>;
+
+defm : BrcondPats<GPR32, BEQ_MM, BEQ_MM, BNE_MM, SLT_MM, SLTu_MM, SLTi_MM,
+                  SLTiu_MM, ZERO>;
+
+defm : SeteqPats<GPR32, SLTiu_MM, XOR_MM, SLTu_MM, ZERO>;
+defm : SetlePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
+defm : SetgtPats<GPR32, SLT_MM, SLTu_MM>;
+defm : SetgePats<GPR32, XORi_MM, SLT_MM, SLTu_MM>;
+defm : SetgeImmPats<GPR32, XORi_MM, SLTi_MM, SLTiu_MM>;
+
+//===----------------------------------------------------------------------===//
+// MicroMips instruction aliases
+//===----------------------------------------------------------------------===//
+
+class UncondBranchMMPseudo<string opstr> :
+  MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
+                    !strconcat(opstr, "\t$offset")>;
+
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS;
+
+let Predicates = [InMicroMips] in {
+  def SDIV_MM_Pseudo : MultDivPseudo<SDIV_MM, ACC64, GPR32Opnd, MipsDivRem,
+                                     II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+  def UDIV_MM_Pseudo : MultDivPseudo<UDIV_MM, ACC64, GPR32Opnd, MipsDivRemU,
+                                     II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+
+  def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
+  def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>;
+  def : MipsInstAlias<"nop", (MOVE16_MM ZERO, ZERO), 1>;
+  def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2;
+  def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2;
+  def : MipsInstAlias<"teq $rs, $rt",
+                      (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<"tge $rs, $rt",
+                      (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<"tgeu $rs, $rt",
+                      (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<"tlt $rs, $rt",
+                      (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<"tltu $rs, $rt",
+                      (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<"tne $rs, $rt",
+                      (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+  def : MipsInstAlias<
+          "sgt $rd, $rs, $rt",
+          (SLT_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgt $rs, $rt",
+          (SLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgtu $rd, $rs, $rt",
+          (SLTu_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgtu $rs, $rt",
+          (SLTu_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"slt $rs, $rt, $imm",
+                      (SLTi_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
+                               simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<"sltu $rs, $rt, $imm",
+                      (SLTiu_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
+                                simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<"sll $rd, $rt, $rs",
+                      (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"sra $rd, $rt, $rs",
+                      (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"srl $rd, $rt, $rs",
+                      (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"sll $rd, $rt",
+                      (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"sra $rd, $rt",
+                      (SRAV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"srl $rd, $rt",
+                      (SRLV_MM GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"sll $rd, $shamt",
+                      (SLL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+  def : MipsInstAlias<"sra $rd, $shamt",
+                      (SRA_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+  def : MipsInstAlias<"srl $rd, $shamt",
+                      (SRL_MM GPR32Opnd:$rd, GPR32Opnd:$rd, uimm5:$shamt), 0>;
+  def : MipsInstAlias<"rotr $rt, $imm",
+                      (ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
+  def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
+  def : MipsInstAlias<"and $rs, $rt, $imm",
+                      (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+  def : MipsInstAlias<"and $rs, $imm",
+                      (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
+  def : MipsInstAlias<"or $rs, $rt, $imm",
+                      (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+  def : MipsInstAlias<"or $rs, $imm",
+                      (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+  def : MipsInstAlias<"xor $rs, $rt, $imm",
+                      (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+  def : MipsInstAlias<"xor $rs, $imm",
+                      (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+  def : MipsInstAlias<"not $rt, $rs",
+                      (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+  def : MipsInstAlias<"not $rt",
+                      (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+  def : MipsInstAlias<"bnez $rs,$offset",
+                      (BNE_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+  def : MipsInstAlias<"beqz $rs,$offset",
+                      (BEQ_MM GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+  def : MipsInstAlias<"seh $rd", (SEH_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                     ISA_MIPS32R2_NOT_32R6_64R6;
+  def : MipsInstAlias<"seb $rd", (SEB_MM GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                     ISA_MIPS32R2_NOT_32R6_64R6;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
new file mode 100644
index 000000000000..d9faf3325cac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -0,0 +1,37 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS_H
+#define LLVM_LIB_TARGET_MIPS_MIPS_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MipsTargetMachine;
+  class ModulePass;
+  class FunctionPass;
+
+  ModulePass *createMipsOs16Pass(MipsTargetMachine &TM);
+  ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM);
+
+  FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsHazardSchedule();
+  FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsConstantIslandPass();
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
new file mode 100644
index 000000000000..670272d47e95
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -0,0 +1,231 @@
+//===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+// The overall idea of the PredicateControl class is to chop the Predicates list
+// into subsets that are usually overridden independently. This allows
+// subclasses to partially override the predicates of their superclasses without
+// having to re-add all the existing predicates.
+class PredicateControl {
+  // Predicates for the encoding scheme in use such as HasStdEnc
+  list<Predicate> EncodingPredicates = [];
+  // Predicates for the GPR size such as IsGP64bit
+  list<Predicate> GPRPredicates = [];
+  // Predicates for the PTR size such as IsPTR64bit
+  list<Predicate> PTRPredicates = [];
+  // Predicates for the FGR size and layout such as IsFP64bit
+  list<Predicate> FGRPredicates = [];
+  // Predicates for the instruction group membership such as ISA's and ASE's
+  list<Predicate> InsnPredicates = [];
+  // Predicate for marking the instruction as usable in hard-float mode only.
+  list<Predicate> HardFloatPredicate = [];
+  // Predicates for anything else
+  list<Predicate> AdditionalPredicates = [];
+  list<Predicate> Predicates = !listconcat(EncodingPredicates,
+                                           GPRPredicates,
+                                           PTRPredicates,
+                                           FGRPredicates,
+                                           InsnPredicates,
+                                           HardFloatPredicate,
+                                           AdditionalPredicates);
+}
+
+// Like Requires<> but for the AdditionalPredicates list
+class AdditionalRequires<list<Predicate> preds> {
+  list<Predicate> AdditionalPredicates = preds;
+}
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+// Avoid forward declaration issues.
+include "MipsScheduleP5600.td"
+include "MipsScheduleGeneric.td"
+
+def MipsInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features                                                    //
+//===----------------------------------------------------------------------===//
+
+def FeatureNoABICalls  : SubtargetFeature<"noabicalls", "NoABICalls", "true",
+                                "Disable SVR4-style position-independent code">;
+def FeaturePTR64Bit    : SubtargetFeature<"ptr64", "IsPTR64bit", "true",
+                                "Pointers are 64-bit wide">;
+def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
+                                "General Purpose Registers are 64-bit wide">;
+def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
+                                "Support 64-bit FP registers">;
+def FeatureFPXX        : SubtargetFeature<"fpxx", "IsFPXX", "true",
+                                "Support for FPXX">;
+def FeatureNaN2008     : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
+                                "IEEE 754-2008 NaN encoding">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+                                "true", "Only supports single precision float">;
+def FeatureSoftFloat   : SubtargetFeature<"soft-float", "IsSoftFloat", "true",
+                                "Does not support floating point instructions">;
+def FeatureNoOddSPReg  : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
+                              "Disable odd numbered single-precision "
+                              "registers">;
+def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
+                                "true", "Enable vector FPU instructions">;
+def FeatureMips1       : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+                                "Mips I ISA Support [highly experimental]">;
+def FeatureMips2       : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+                                "Mips II ISA Support [highly experimental]",
+                                [FeatureMips1]>;
+def FeatureMips3_32    : SubtargetFeature<"mips3_32", "HasMips3_32", "true",
+                                "Subset of MIPS-III that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips3_32r2  : SubtargetFeature<"mips3_32r2", "HasMips3_32r2", "true",
+                                "Subset of MIPS-III that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips3       : SubtargetFeature<"mips3", "MipsArchVersion", "Mips3",
+                                "MIPS III ISA Support [highly experimental]",
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips3_32r2, FeatureGP64Bit,
+                                 FeatureFP64Bit]>;
+def FeatureMips4_32    : SubtargetFeature<"mips4_32", "HasMips4_32", "true",
+                                "Subset of MIPS-IV that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips4_32r2  : SubtargetFeature<"mips4_32r2", "HasMips4_32r2", "true",
+                                "Subset of MIPS-IV that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips4       : SubtargetFeature<"mips4", "MipsArchVersion",
+                                "Mips4", "MIPS IV ISA Support",
+                                [FeatureMips3, FeatureMips4_32,
+                                 FeatureMips4_32r2]>;
+def FeatureMips5_32r2  : SubtargetFeature<"mips5_32r2", "HasMips5_32r2", "true",
+                                "Subset of MIPS-V that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips5       : SubtargetFeature<"mips5", "MipsArchVersion", "Mips5",
+                                "MIPS V ISA Support [highly experimental]",
+                                [FeatureMips4, FeatureMips5_32r2]>;
+def FeatureMips32      : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
+                                "Mips32 ISA Support",
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips4_32]>;
+def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
+                                "Mips32r2", "Mips32r2 ISA Support",
+                                [FeatureMips3_32r2, FeatureMips4_32r2,
+                                 FeatureMips5_32r2, FeatureMips32]>;
+def FeatureMips32r3    : SubtargetFeature<"mips32r3", "MipsArchVersion",
+                                "Mips32r3", "Mips32r3 ISA Support",
+                                [FeatureMips32r2]>;
+def FeatureMips32r5    : SubtargetFeature<"mips32r5", "MipsArchVersion",
+                                "Mips32r5", "Mips32r5 ISA Support",
+                                [FeatureMips32r3]>;
+def FeatureMips32r6    : SubtargetFeature<"mips32r6", "MipsArchVersion",
+                                "Mips32r6",
+                                "Mips32r6 ISA Support [experimental]",
+                                [FeatureMips32r5, FeatureFP64Bit,
+                                 FeatureNaN2008]>;
+def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
+                                "Mips64", "Mips64 ISA Support",
+                                [FeatureMips5, FeatureMips32]>;
+def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
+                                "Mips64r2", "Mips64r2 ISA Support",
+                                [FeatureMips64, FeatureMips32r2]>;
+def FeatureMips64r3    : SubtargetFeature<"mips64r3", "MipsArchVersion",
+                                "Mips64r3", "Mips64r3 ISA Support",
+                                [FeatureMips64r2, FeatureMips32r3]>;
+def FeatureMips64r5    : SubtargetFeature<"mips64r5", "MipsArchVersion",
+                                "Mips64r5", "Mips64r5 ISA Support",
+                                [FeatureMips64r3, FeatureMips32r5]>;
+def FeatureMips64r6    : SubtargetFeature<"mips64r6", "MipsArchVersion",
+                                "Mips64r6",
+                                "Mips64r6 ISA Support [experimental]",
+                                [FeatureMips32r6, FeatureMips64r5,
+                                 FeatureNaN2008]>;
+
+def FeatureMips16  : SubtargetFeature<"mips16", "InMips16Mode", "true",
+                                      "Mips16 mode">;
+
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+                                    "Mips DSP-R2 ASE", [FeatureDSP]>;
+def FeatureDSPR3
+    : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE",
+                       [ FeatureDSP, FeatureDSPR2 ]>;
+
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
+def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
+
+def FeatureMicroMips  : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
+                                         "microMips mode">;
+
+def FeatureCnMips     : SubtargetFeature<"cnmips", "HasCnMips",
+                                "true", "Octeon cnMIPS Support",
+                                [FeatureMips64r2]>;
+
+def FeatureUseTCCInDIV : SubtargetFeature<
+                               "use-tcc-in-div",
+                               "UseTCCInDIV", "false",
+                               "Force the assembler to use trapping">;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl",
+                                 "MipsSubtarget::CPU::P5600",
+                                 "The P5600 Processor", [FeatureMips32r5]>;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, MipsGenericModel, Features>;
+
+def : Proc<"mips1", [FeatureMips1]>;
+def : Proc<"mips2", [FeatureMips2]>;
+def : Proc<"mips32", [FeatureMips32]>;
+def : Proc<"mips32r2", [FeatureMips32r2]>;
+def : Proc<"mips32r3", [FeatureMips32r3]>;
+def : Proc<"mips32r5", [FeatureMips32r5]>;
+def : Proc<"mips32r6", [FeatureMips32r6]>;
+
+def : Proc<"mips3", [FeatureMips3]>;
+def : Proc<"mips4", [FeatureMips4]>;
+def : Proc<"mips5", [FeatureMips5]>;
+def : Proc<"mips64", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
+def : Proc<"mips64r3", [FeatureMips64r3]>;
+def : Proc<"mips64r5", [FeatureMips64r5]>;
+def : Proc<"mips64r6", [FeatureMips64r6]>;
+def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>;
+def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>;
+
+def MipsAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+def MipsAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "$";
+}
+
+def Mips : Target {
+  let InstructionSet = MipsInstrInfo;
+  let AssemblyParsers = [MipsAsmParser];
+  let AssemblyParserVariants = [MipsAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
new file mode 100644
index 000000000000..e7ceca9612a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -0,0 +1,176 @@
+//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16FrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+Mips16FrameLowering::Mips16FrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
+void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+
+  uint64_t StackSize = MFI.getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI.adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  unsigned CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  if (CSI.size()) {
+    const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+      unsigned DReg = MRI->getDwarfRegNum(Reg, true);
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, DReg, Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
+      .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
+
+}
+
+void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  uint64_t StackSize = MFI.getStackSize();
+
+  if (!StackSize)
+    return;
+
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::Move32R16), Mips::SP)
+      .addReg(Mips::S0);
+
+  // Adjust stack.
+  // assumes stacksize multiple of 8
+  TII.restoreFrame(Mips::SP, StackSize, MBB, MBBI);
+}
+
+bool Mips16FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = &MF->front();
+
+  //
+  // Registers RA, S0,S1 are the callee saved registers and they
+  // will be saved with the "save" instruction
+  // during emitPrologue
+  //
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::lowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
+      && MF->getFrameInfo().isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+  }
+
+  return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                       const std::vector<CalleeSavedInfo> &CSI,
+                                       const TargetRegisterInfo *TRI) const {
+  //
+  // Registers RA,S0,S1 are the callee saved registers and they will be restored
+  // with the restore instruction during emitEpilogue.
+  // We need to override this virtual function, otherwise llvm will try and
+  // restore the registers on it's on from the stack.
+  //
+
+  return true;
+}
+
+bool
+Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  // Reserve call frame if the size of the maximum call frame fits into 15-bit
+  // immediate field and there are no variable sized objects on the stack.
+  return isInt<15>(MFI.getMaxCallFrameSize()) && !MFI.hasVarSizedObjects();
+}
+
+void Mips16FrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                               BitVector &SavedRegs,
+                                               RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RI = TII.getRegisterInfo();
+  const BitVector Reserved = RI.getReservedRegs(MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  if (SaveS2)
+    SavedRegs.set(Mips::S2);
+  if (hasFP(MF))
+    SavedRegs.set(Mips::S0);
+}
+
+const MipsFrameLowering *
+llvm::createMips16FrameLowering(const MipsSubtarget &ST) {
+  return new Mips16FrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
new file mode 100644
index 000000000000..b48ed4641ea7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
@@ -0,0 +1,47 @@
+//===-- Mips16FrameLowering.h - Mips16 frame lowering  ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+class Mips16FrameLowering : public MipsFrameLowering {
+public:
+  explicit Mips16FrameLowering(const MipsSubtarget &STI);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
new file mode 100644
index 000000000000..191006d6463c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -0,0 +1,547 @@
+//===---- Mips16HardFloat.cpp for Mips16 Hard Float               --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass needed for Mips16 Hard Float
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-hard-float"
+
+namespace {
+  class Mips16HardFloat : public ModulePass {
+  public:
+    static char ID;
+
+    Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {}
+
+    StringRef getPassName() const override { return "MIPS16 Hard Float Pass"; }
+
+    bool runOnModule(Module &M) override;
+
+  protected:
+    const MipsTargetMachine &TM;
+  };
+
+  static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
+    std::vector<llvm::Type *> AsmArgTypes;
+    std::vector<llvm::Value *> AsmArgs;
+
+    llvm::FunctionType *AsmFTy =
+        llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false);
+    llvm::InlineAsm *IA =
+        llvm::InlineAsm::get(AsmFTy, AsmText, "", true,
+                             /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT);
+    CallInst::Create(IA, AsmArgs, "", BB);
+  }
+
+  char Mips16HardFloat::ID = 0;
+}
+
+//
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant {
+  FRet, DRet, CFRet, CDRet, NoFPRet
+};
+
+//
+// Determine which FP return type this function has
+//
+static FPReturnVariant whichFPReturnVariant(Type *T) {
+  switch (T->getTypeID()) {
+  case Type::FloatTyID:
+    return FRet;
+  case Type::DoubleTyID:
+    return DRet;
+  case Type::StructTyID:
+    if (T->getStructNumElements() != 2)
+      break;
+    if ((T->getContainedType(0)->isFloatTy()) &&
+        (T->getContainedType(1)->isFloatTy()))
+      return CFRet;
+    if ((T->getContainedType(0)->isDoubleTy()) &&
+        (T->getContainedType(1)->isDoubleTy()))
+      return CDRet;
+    break;
+  default:
+    break;
+  }
+  return NoFPRet;
+}
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant {
+  FSig, FFSig, FDSig,
+  DSig, DDSig, DFSig, NoSig
+};
+
+// which floating point parameter signature variant we are dealing with
+//
+typedef Type::TypeID TypeID;
+const Type::TypeID FloatTyID = Type::FloatTyID;
+const Type::TypeID DoubleTyID = Type::DoubleTyID;
+
+static FPParamVariant whichFPParamVariantNeeded(Function &F) {
+  switch (F.arg_size()) {
+  case 0:
+    return NoSig;
+  case 1:{
+    TypeID ArgTypeID = F.getFunctionType()->getParamType(0)->getTypeID();
+    switch (ArgTypeID) {
+    case FloatTyID:
+      return FSig;
+    case DoubleTyID:
+      return DSig;
+    default:
+      return NoSig;
+    }
+  }
+  default: {
+    TypeID ArgTypeID0 = F.getFunctionType()->getParamType(0)->getTypeID();
+    TypeID ArgTypeID1 = F.getFunctionType()->getParamType(1)->getTypeID();
+    switch(ArgTypeID0) {
+    case FloatTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return FFSig;
+      case DoubleTyID:
+        return FDSig;
+      default:
+        return FSig;
+      }
+    }
+    case DoubleTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return DFSig;
+      case DoubleTyID:
+        return DDSig;
+      default:
+        return DSig;
+      }
+    }
+    default:
+      return NoSig;
+    }
+  }
+  }
+  llvm_unreachable("can't get here");
+}
+
+// Figure out if we need float point based on the function parameters.
+// We need to move variables in and/or out of floating point
+// registers because of the ABI
+//
+static bool needsFPStubFromParams(Function &F) {
+  if (F.arg_size() >=1) {
+    Type *ArgType = F.getFunctionType()->getParamType(0);
+    switch (ArgType->getTypeID()) {
+    case Type::FloatTyID:
+    case Type::DoubleTyID:
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+static bool needsFPReturnHelper(Function &F) {
+  Type* RetType = F.getReturnType();
+  return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPReturnHelper(FunctionType &FT) {
+  Type* RetType = FT.getReturnType();
+  return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPHelperFromSig(Function &F) {
+  return needsFPStubFromParams(F) || needsFPReturnHelper(F);
+}
+
+//
+// We swap between FP and Integer registers to allow Mips16 and Mips32 to
+// interoperate
+//
+static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE,
+                                   bool ToFP) {
+  std::string MI = ToFP ? "mtc1 ": "mfc1 ";
+  std::string AsmText;
+
+  switch (PV) {
+  case FSig:
+    AsmText += MI + "$$4, $$f12\n";
+    break;
+
+  case FFSig:
+    AsmText += MI + "$$4, $$f12\n";
+    AsmText += MI + "$$5, $$f14\n";
+    break;
+
+  case FDSig:
+    AsmText += MI + "$$4, $$f12\n";
+    if (LE) {
+      AsmText += MI + "$$6, $$f14\n";
+      AsmText += MI + "$$7, $$f15\n";
+    } else {
+      AsmText += MI + "$$7, $$f14\n";
+      AsmText += MI + "$$6, $$f15\n";
+    }
+    break;
+
+  case DSig:
+    if (LE) {
+      AsmText += MI + "$$4, $$f12\n";
+      AsmText += MI + "$$5, $$f13\n";
+    } else {
+      AsmText += MI + "$$5, $$f12\n";
+      AsmText += MI + "$$4, $$f13\n";
+    }
+    break;
+
+  case DDSig:
+    if (LE) {
+      AsmText += MI + "$$4, $$f12\n";
+      AsmText += MI + "$$5, $$f13\n";
+      AsmText += MI + "$$6, $$f14\n";
+      AsmText += MI + "$$7, $$f15\n";
+    } else {
+      AsmText += MI + "$$5, $$f12\n";
+      AsmText += MI + "$$4, $$f13\n";
+      AsmText += MI + "$$7, $$f14\n";
+      AsmText += MI + "$$6, $$f15\n";
+    }
+    break;
+
+  case DFSig:
+    if (LE) {
+      AsmText += MI + "$$4, $$f12\n";
+      AsmText += MI + "$$5, $$f13\n";
+    } else {
+      AsmText += MI + "$$5, $$f12\n";
+      AsmText += MI + "$$4, $$f13\n";
+    }
+    AsmText += MI + "$$6, $$f14\n";
+    break;
+
+  case NoSig:
+    break;
+  }
+
+  return AsmText;
+}
+
+//
+// Make sure that we know we already need a stub for this function.
+// Having called needsFPHelperFromSig
+//
+static void assureFPCallStub(Function &F, Module *M,
+                             const MipsTargetMachine &TM) {
+  // for now we only need them for static relocation
+  if (TM.isPositionIndependent())
+    return;
+  LLVMContext &Context = M->getContext();
+  bool LE = TM.isLittleEndian();
+  std::string Name = F.getName();
+  std::string SectionName = ".mips16.call.fp." + Name;
+  std::string StubName = "__call_stub_fp_" + Name;
+  //
+  // see if we already have the stub
+  //
+  Function *FStub = M->getFunction(StubName);
+  if (FStub && !FStub->isDeclaration()) return;
+  FStub = Function::Create(F.getFunctionType(),
+                           Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
+  FPParamVariant PV = whichFPParamVariantNeeded(F);
+
+  std::string AsmText;
+  AsmText += ".set reorder\n";
+  AsmText += swapFPIntParams(PV, M, LE, true);
+  if (RV != NoFPRet) {
+    AsmText += "move $$18, $$31\n";
+    AsmText += "jal " + Name + "\n";
+  } else {
+    AsmText += "lui  $$25, %hi(" + Name + ")\n";
+    AsmText += "addiu  $$25, $$25, %lo(" + Name + ")\n";
+  }
+
+  switch (RV) {
+  case FRet:
+    AsmText += "mfc1 $$2, $$f0\n";
+    break;
+
+  case DRet:
+    if (LE) {
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f1\n";
+    } else {
+      AsmText += "mfc1 $$3, $$f0\n";
+      AsmText += "mfc1 $$2, $$f1\n";
+    }
+    break;
+
+  case CFRet:
+    if (LE) {
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f2\n";
+    } else {
+      AsmText += "mfc1 $$3, $$f0\n";
+      AsmText += "mfc1 $$3, $$f2\n";
+    }
+    break;
+
+  case CDRet:
+    if (LE) {
+      AsmText += "mfc1 $$4, $$f2\n";
+      AsmText += "mfc1 $$5, $$f3\n";
+      AsmText += "mfc1 $$2, $$f0\n";
+      AsmText += "mfc1 $$3, $$f1\n";
+
+    } else {
+      AsmText += "mfc1 $$5, $$f2\n";
+      AsmText += "mfc1 $$4, $$f3\n";
+      AsmText += "mfc1 $$3, $$f0\n";
+      AsmText += "mfc1 $$2, $$f1\n";
+    }
+    break;
+
+  case NoFPRet:
+    break;
+  }
+
+  if (RV != NoFPRet)
+    AsmText += "jr $$18\n";
+  else
+    AsmText += "jr $$25\n";
+  EmitInlineAsm(Context, BB, AsmText);
+
+  new UnreachableInst(Context, BB);
+}
+
+//
+// Functions that are llvm intrinsics and don't need helpers.
+//
+static const char *const IntrinsicInline[] = {
+  "fabs", "fabsf",
+  "llvm.ceil.f32", "llvm.ceil.f64",
+  "llvm.copysign.f32", "llvm.copysign.f64",
+  "llvm.cos.f32", "llvm.cos.f64",
+  "llvm.exp.f32", "llvm.exp.f64",
+  "llvm.exp2.f32", "llvm.exp2.f64",
+  "llvm.fabs.f32", "llvm.fabs.f64",
+  "llvm.floor.f32", "llvm.floor.f64",
+  "llvm.fma.f32", "llvm.fma.f64",
+  "llvm.log.f32", "llvm.log.f64",
+  "llvm.log10.f32", "llvm.log10.f64",
+  "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+  "llvm.pow.f32", "llvm.pow.f64",
+  "llvm.powi.f32", "llvm.powi.f64",
+  "llvm.rint.f32", "llvm.rint.f64",
+  "llvm.round.f32", "llvm.round.f64",
+  "llvm.sin.f32", "llvm.sin.f64",
+  "llvm.sqrt.f32", "llvm.sqrt.f64",
+  "llvm.trunc.f32", "llvm.trunc.f64",
+};
+
+static bool isIntrinsicInline(Function *F) {
+  return std::binary_search(std::begin(IntrinsicInline),
+                            std::end(IntrinsicInline), F->getName());
+}
+//
+// Returns of float, double and complex need to be handled with a helper
+// function.
+//
+static bool fixupFPReturnAndCall(Function &F, Module *M,
+                                 const MipsTargetMachine &TM) {
+  bool Modified = false;
+  LLVMContext &C = M->getContext();
+  Type *MyVoid = Type::getVoidTy(C);
+  for (auto &BB: F)
+    for (auto &I: BB) {
+      if (const ReturnInst *RI = dyn_cast<ReturnInst>(&I)) {
+        Value *RVal = RI->getReturnValue();
+        if (!RVal) continue;
+        //
+        // If there is a return value and it needs a helper function,
+        // figure out which one and add a call before the actual
+        // return to this helper. The purpose of the helper is to move
+        // floating point values from their soft float return mapping to
+        // where they would have been mapped to in floating point registers.
+        //
+        Type *T = RVal->getType();
+        FPReturnVariant RV = whichFPReturnVariant(T);
+        if (RV == NoFPRet) continue;
+        static const char *const Helper[NoFPRet] = {
+          "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
+          "__mips16_ret_dc"
+        };
+        const char *Name = Helper[RV];
+        AttributeSet A;
+        Value *Params[] = {RVal};
+        Modified = true;
+        //
+        // These helper functions have a different calling ABI so
+        // this __Mips16RetHelper indicates that so that later
+        // during call setup, the proper call lowering to the helper
+        // functions will take place.
+        //
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           "__Mips16RetHelper");
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::ReadNone);
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::NoInline);
+        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
+        CallInst::Create(F, Params, "", &I);
+      } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+        FunctionType *FT = CI->getFunctionType();
+        Function *F_ =  CI->getCalledFunction();
+        if (needsFPReturnHelper(*FT) &&
+            !(F_ && isIntrinsicInline(F_))) {
+          Modified=true;
+          F.addFnAttr("saveS2");
+        }
+        if (F_ && !isIntrinsicInline(F_)) {
+          // pic mode calls are handled by already defined
+          // helper functions
+          if (needsFPReturnHelper(*F_)) {
+            Modified=true;
+            F.addFnAttr("saveS2");
+          }
+          if (!TM.isPositionIndependent()) {
+            if (needsFPHelperFromSig(*F_)) {
+              assureFPCallStub(*F_, M, TM);
+              Modified=true;
+            }
+          }
+        }
+      }
+    }
+  return Modified;
+}
+
+static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
+                           const MipsTargetMachine &TM) {
+  bool PicMode = TM.isPositionIndependent();
+  bool LE = TM.isLittleEndian();
+  LLVMContext &Context = M->getContext();
+  std::string Name = F->getName();
+  std::string SectionName = ".mips16.fn." + Name;
+  std::string StubName = "__fn_stub_" + Name;
+  std::string LocalName = "$$__fn_local_" + Name;
+  Function *FStub = Function::Create
+    (F->getFunctionType(),
+     Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+
+  std::string AsmText;
+  if (PicMode) {
+    AsmText += ".set noreorder\n";
+    AsmText += ".cpload $$25\n";
+    AsmText += ".set reorder\n";
+    AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n";
+    AsmText += "la $$25, " + LocalName + "\n";
+  } else
+    AsmText += "la $$25, " + Name + "\n";
+  AsmText += swapFPIntParams(PV, M, LE, false);
+  AsmText += "jr $$25\n";
+  AsmText += LocalName + " = " + Name + "\n";
+  EmitInlineAsm(Context, BB, AsmText);
+
+  new UnreachableInst(FStub->getContext(), BB);
+}
+
+//
+// remove the use-soft-float attribute
+//
+static void removeUseSoftFloat(Function &F) {
+  AttributeSet A;
+  DEBUG(errs() << "removing -use-soft-float\n");
+  A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+                     "use-soft-float", "false");
+  F.removeAttributes(AttributeSet::FunctionIndex, A);
+  if (F.hasFnAttribute("use-soft-float")) {
+    DEBUG(errs() << "still has -use-soft-float\n");
+  }
+  F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
+
+//
+// This pass only makes sense when the underlying chip has floating point but
+// we are compiling as mips16.
+// For all mips16 functions (that are not stubs we have already generated), or
+// declared via attributes as nomips16, we must:
+//    1) fixup all returns of float, double, single and double complex
+//       by calling a helper function before the actual return.
+//    2) generate helper functions (stubs) that can be called by mips32
+//       functions that will move parameters passed normally passed in
+//       floating point
+//       registers the soft float equivalents.
+//    3) in the case of static relocation, generate helper functions so that
+//       mips16 functions can call extern functions of unknown type (mips16 or
+//       mips32).
+//    4) TBD. For pic, calls to extern functions of unknown type are handled by
+//       predefined helper functions in libc but this work is currently done
+//       during call lowering but it should be moved here in the future.
+//
+bool Mips16HardFloat::runOnModule(Module &M) {
+  DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+  bool Modified = false;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->hasFnAttribute("nomips16") &&
+        F->hasFnAttribute("use-soft-float")) {
+      removeUseSoftFloat(*F);
+      continue;
+    }
+    if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
+        F->hasFnAttribute("nomips16")) continue;
+    Modified |= fixupFPReturnAndCall(*F, &M, TM);
+    FPParamVariant V = whichFPParamVariantNeeded(*F);
+    if (V != NoSig) {
+      Modified = true;
+      createFPFnStub(&*F, &M, V, TM);
+    }
+  }
+  return Modified;
+}
+
+
+ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) {
+  return new Mips16HardFloat(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
new file mode 100644
index 000000000000..2eb6e5ddd2d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.cpp for Mips16 Hard Float              -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of Mips16HardFloatInfo
+// namespace.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16HardFloatInfo.h"
+#include <string.h>
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+const FuncNameSignature PredefinedFuncs[] = {
+  { "__floatdidf", { NoSig, DRet } },
+  { "__floatdisf", { NoSig, FRet } },
+  { "__floatundidf", { NoSig, DRet } },
+  { "__fixsfdi", { FSig, NoFPRet } },
+  { "__fixunsdfsi", { DSig, NoFPRet } },
+  { "__fixunsdfdi", { DSig, NoFPRet } },
+  { "__fixdfdi", { DSig, NoFPRet } },
+  { "__fixunssfsi", { FSig, NoFPRet } },
+  { "__fixunssfdi", { FSig, NoFPRet } },
+  { "__floatundisf", { NoSig, FRet } },
+  { nullptr, { NoSig, NoFPRet } }
+};
+
+// just do a search for now. there are very few of these special cases.
+//
+extern FuncSignature const *findFuncSignature(const char *name) {
+  const char *name_;
+  int i = 0;
+  while (PredefinedFuncs[i].Name) {
+    name_ = PredefinedFuncs[i].Name;
+    if (strcmp(name, name_) == 0)
+      return &PredefinedFuncs[i].Signature;
+    i++;
+  }
+  return nullptr;
+}
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
new file mode 100644
index 000000000000..7295c287576d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloatInfo.h
@@ -0,0 +1,50 @@
+//===---- Mips16HardFloatInfo.h for Mips16 Hard Float              --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some data structures relevant to the implementation of
+// Mips16 hard float.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
+
+namespace llvm {
+
+namespace Mips16HardFloatInfo {
+
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant { FRet, DRet, CFRet, CDRet, NoFPRet };
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant { FSig, FFSig, FDSig, DSig, DDSig, DFSig, NoSig };
+
+struct FuncSignature {
+  FPParamVariant ParamSig;
+  FPReturnVariant RetSig;
+};
+
+struct FuncNameSignature {
+  const char *Name;
+  FuncSignature Signature;
+};
+
+extern const FuncNameSignature PredefinedFuncs[];
+
+extern FuncSignature const *findFuncSignature(const char *name);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..ce193b1734f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -0,0 +1,261 @@
+//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16ISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+  if (!Subtarget->inMips16Mode())
+    return false;
+  return MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+/// Select multiply instructions.
+std::pair<SDNode *, SDNode *>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, const SDLoc &DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = nullptr, *Hi = nullptr;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = Mips::Mflo16;
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = Mips::Mfhi16;
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL;
+  unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+  V2 = RegInfo.createVirtualRegister(RC);
+
+
+  BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
+      .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
+      .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
+  BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
+  BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
+      .addReg(V1)
+      .addReg(V2);
+}
+
+void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+}
+
+bool Mips16DAGToDAGISel::selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+                                    SDValue &Offset) {
+  SDLoc DL(Addr);
+  EVT ValTy = Addr.getValueType();
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (SPAllowed) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      Offset = CurDAG->getTargetConstant(0, DL, ValTy);
+      return true;
+    }
+  }
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+  if (!TM.isPositionIndependent()) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+         Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+      // If the first operand is a FI, get the TargetFI Node
+      if (SPAllowed) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+          Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
+          return true;
+        }
+      }
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), DL, ValTy);
+      return true;
+    }
+  }
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, DL, ValTy);
+  return true;
+}
+
+bool Mips16DAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+                                      SDValue &Offset) {
+  return selectAddr(false, Addr, Base, Offset);
+}
+
+bool Mips16DAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) {
+  return selectAddr(true, Addr, Base, Offset);
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+bool Mips16DAGToDAGISel::trySelect(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  SDLoc DL(Node);
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  unsigned MultOpc;
+
+  switch (Opcode) {
+  default:
+    break;
+
+  case ISD::SUBE:
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2), CmpLHS;
+    unsigned Opc = InFlag.getOpcode();
+    (void)Opc;
+    assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+            (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+           "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+    unsigned MOp;
+    if (Opcode == ISD::ADDE) {
+      CmpLHS = InFlag.getValue(0);
+      MOp = Mips::AdduRxRyRz16;
+    } else {
+      CmpLHS = InFlag.getOperand(0);
+      MOp = Mips::SubuRxRyRz16;
+    }
+
+    SDValue Ops[] = {CmpLHS, InFlag.getOperand(1)};
+
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+
+    EVT VT = LHS.getValueType();
+
+    unsigned Sltu_op = Mips::SltuRxRyRz16;
+    SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops);
+    unsigned Addu_op = Mips::AdduRxRyRz16;
+    SDNode *AddCarry =
+        CurDAG->getMachineNode(Addu_op, DL, VT, SDValue(Carry, 0), RHS);
+
+    CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+    return true;
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    std::pair<SDNode *, SDNode *> LoHi =
+        selectMULT(Node, MultOpc, DL, NodeTy, true, true);
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    CurDAG->RemoveDeadNode(Node);
+    return true;
+  }
+
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    auto LoHi = selectMULT(Node, MultOpc, DL, NodeTy, false, true);
+    ReplaceNode(Node, LoHi.second);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new Mips16DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
new file mode 100644
index 000000000000..bbf8cc36f241
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -0,0 +1,55 @@
+//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
+public:
+  explicit Mips16DAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+      : MipsDAGToDAGISel(TM, OL) {}
+
+private:
+  std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+                                           const SDLoc &DL, EVT Ty, bool HasLo,
+                                           bool HasHi);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
+                  SDValue &Offset);
+  bool selectAddr16(SDValue Addr, SDValue &Base,
+                    SDValue &Offset) override;
+  bool selectAddr16SP(SDValue Addr, SDValue &Base,
+                      SDValue &Offset) override;
+
+  bool trySelect(SDNode *Node) override;
+
+  void processFunctionAfterISel(MachineFunction &MF) override;
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+
+  void initMips16SPAliasReg(MachineFunction &MF);
+};
+
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM,
+                                  CodeGenOpt::Level OptLevel);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
new file mode 100644
index 000000000000..bdb9eec4cc5a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -0,0 +1,800 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16ISelLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16HardFloatInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+static cl::opt<bool> DontExpandCondPseudos16(
+  "mips16-dont-expand-cond-pseudo",
+  cl::init(false),
+  cl::desc("Don't expand conditional move related "
+           "pseudos for Mips 16"),
+  cl::Hidden);
+
+namespace {
+struct Mips16Libcall {
+  RTLIB::Libcall Libcall;
+  const char *Name;
+
+  bool operator<(const Mips16Libcall &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+};
+
+struct Mips16IntrinsicHelperType{
+  const char* Name;
+  const char* Helper;
+
+  bool operator<(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+  bool operator==(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) == 0;
+  }
+};
+}
+
+// Libcalls for which no helper is generated. Sorted by name for binary search.
+static const Mips16Libcall HardFloatLibCalls[] = {
+  { RTLIB::ADD_F64, "__mips16_adddf3" },
+  { RTLIB::ADD_F32, "__mips16_addsf3" },
+  { RTLIB::DIV_F64, "__mips16_divdf3" },
+  { RTLIB::DIV_F32, "__mips16_divsf3" },
+  { RTLIB::OEQ_F64, "__mips16_eqdf2" },
+  { RTLIB::OEQ_F32, "__mips16_eqsf2" },
+  { RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2" },
+  { RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi" },
+  { RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi" },
+  { RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf" },
+  { RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf" },
+  { RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf" },
+  { RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf" },
+  { RTLIB::OGE_F64, "__mips16_gedf2" },
+  { RTLIB::OGE_F32, "__mips16_gesf2" },
+  { RTLIB::OGT_F64, "__mips16_gtdf2" },
+  { RTLIB::OGT_F32, "__mips16_gtsf2" },
+  { RTLIB::OLE_F64, "__mips16_ledf2" },
+  { RTLIB::OLE_F32, "__mips16_lesf2" },
+  { RTLIB::OLT_F64, "__mips16_ltdf2" },
+  { RTLIB::OLT_F32, "__mips16_ltsf2" },
+  { RTLIB::MUL_F64, "__mips16_muldf3" },
+  { RTLIB::MUL_F32, "__mips16_mulsf3" },
+  { RTLIB::UNE_F64, "__mips16_nedf2" },
+  { RTLIB::UNE_F32, "__mips16_nesf2" },
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_dc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_df" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sf" }, // No associated libcall.
+  { RTLIB::SUB_F64, "__mips16_subdf3" },
+  { RTLIB::SUB_F32, "__mips16_subsf3" },
+  { RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2" },
+  { RTLIB::UO_F64, "__mips16_unorddf2" },
+  { RTLIB::UO_F32, "__mips16_unordsf2" }
+};
+
+static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
+  {"__fixunsdfsi", "__mips16_call_stub_2" },
+  {"ceil",  "__mips16_call_stub_df_2"},
+  {"ceilf", "__mips16_call_stub_sf_1"},
+  {"copysign",  "__mips16_call_stub_df_10"},
+  {"copysignf", "__mips16_call_stub_sf_5"},
+  {"cos",  "__mips16_call_stub_df_2"},
+  {"cosf", "__mips16_call_stub_sf_1"},
+  {"exp2",  "__mips16_call_stub_df_2"},
+  {"exp2f", "__mips16_call_stub_sf_1"},
+  {"floor",  "__mips16_call_stub_df_2"},
+  {"floorf", "__mips16_call_stub_sf_1"},
+  {"log2",  "__mips16_call_stub_df_2"},
+  {"log2f", "__mips16_call_stub_sf_1"},
+  {"nearbyint",  "__mips16_call_stub_df_2"},
+  {"nearbyintf", "__mips16_call_stub_sf_1"},
+  {"rint",  "__mips16_call_stub_df_2"},
+  {"rintf", "__mips16_call_stub_sf_1"},
+  {"sin",  "__mips16_call_stub_df_2"},
+  {"sinf", "__mips16_call_stub_sf_1"},
+  {"sqrt",  "__mips16_call_stub_df_2"},
+  {"sqrtf", "__mips16_call_stub_sf_1"},
+  {"trunc",  "__mips16_call_stub_df_2"},
+  {"truncf", "__mips16_call_stub_sf_1"},
+};
+
+Mips16TargetLowering::Mips16TargetLowering(const MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+
+  if (!Subtarget.useSoftFloat())
+    setMips16HardFloatLibCalls();
+
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_SWAP,        MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,     MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
+
+  setOperationAction(ISD::ROTR, MVT::i32,  Expand);
+  setOperationAction(ISD::ROTR, MVT::i64,  Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  computeRegisterProperties(STI.getRegisterInfo());
+}
+
+const MipsTargetLowering *
+llvm::createMips16TargetLowering(const MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new Mips16TargetLowering(TM, STI);
+}
+
+bool
+Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
+  return false;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::SelBeqZ:
+    return emitSel16(Mips::BeqzRxImm16, MI, BB);
+  case Mips::SelBneZ:
+    return emitSel16(Mips::BnezRxImm16, MI, BB);
+  case Mips::SelTBteqZCmpi:
+    return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSlti:
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSltiu:
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBtneZCmpi:
+    return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSlti:
+    return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSltiu:
+    return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBteqZCmp:
+    return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBteqZSlt:
+    return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBteqZSltu:
+    return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+  case Mips::SelTBtneZCmp:
+    return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBtneZSlt:
+    return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBtneZSltu:
+    return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BteqzT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
+  case Mips::BteqzT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BtnezT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BtnezT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
+  case Mips::BtnezT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+  case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+  case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+  case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+  case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+  case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+    break;
+  case Mips::SltCCRxRy16:
+    return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+    break;
+  case Mips::SltiCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SltiuCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SltuCCRxRy16:
+    return emitFEXT_CCRX16_ins
+      (Mips::SltuRxRy16, MI, BB);
+  }
+}
+
+bool Mips16TargetLowering::isEligibleForTailCallOptimization(
+    const CCState &CCInfo, unsigned NextStackOffset,
+    const MipsFunctionInfo &FI) const {
+  // No tail call optimization for mips16.
+  return false;
+}
+
+void Mips16TargetLowering::setMips16HardFloatLibCalls() {
+  for (unsigned I = 0; I != array_lengthof(HardFloatLibCalls); ++I) {
+    assert((I == 0 || HardFloatLibCalls[I - 1] < HardFloatLibCalls[I]) &&
+           "Array not sorted!");
+    if (HardFloatLibCalls[I].Libcall != RTLIB::UNKNOWN_LIBCALL)
+      setLibcallName(HardFloatLibCalls[I].Libcall, HardFloatLibCalls[I].Name);
+  }
+
+  setLibcallName(RTLIB::O_F64, "__mips16_unorddf2");
+  setLibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+}
+
+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this but it will have to wait until the traditional
+// gcc mechanism is completed.
+//
+// For Pic, in order for Mips16 code to call Mips32 code which according the abi
+// have either arguments or returned values placed in floating point registers,
+// we use a set of helper functions. (This includes functions which return type
+//  complex which on Mips are returned in a pair of floating point registers).
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df)
+// the helper function suffixs for these are:
+//                        0,  1,    5,        9,         2,   6,        10
+// this suffix can then be calculated as follows:
+// for a given argument Arg:
+//     Arg1x, Arg2x = 1 :  Arg is sf
+//                    2 :  Arg is df
+//                    0:   Arg is neither sf or df
+// So this stub is the string for number Arg1x + Arg2x*4.
+// However not all numbers between 0 and 10 are possible, we check anyway and
+// assert if the impossible exists.
+//
+
+unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber
+  (ArgListTy &Args) const {
+  unsigned int resultNum = 0;
+  if (Args.size() >= 1) {
+    Type *t = Args[0].Ty;
+    if (t->isFloatTy()) {
+      resultNum = 1;
+    }
+    else if (t->isDoubleTy()) {
+      resultNum = 2;
+    }
+  }
+  if (resultNum) {
+    if (Args.size() >=2) {
+      Type *t = Args[1].Ty;
+      if (t->isFloatTy()) {
+        resultNum += 4;
+      }
+      else if (t->isDoubleTy()) {
+        resultNum += 8;
+      }
+    }
+  }
+  return resultNum;
+}
+
+//
+// Prefixes are attached to stub numbers depending on the return type.
+// return type: float  sf_
+//              double df_
+//              single complex sc_
+//              double complext dc_
+//              others  NO PREFIX
+//
+//
+// The full name of a helper function is__mips16_call_stub +
+//    return type dependent prefix + stub number
+//
+// FIXME: This is something that probably should be in a different source file
+// and perhaps done differently but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There are a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. There there are 11 possible
+// entries. Ones with 0 are ones which should never be selected.
+//
+// All the arrays are similar except for ones which return neither
+// sf, df, sc, dc, in which we only care about ones which have sf or df as a
+// first parameter.
+//
+#define P_ "__mips16_call_stub_"
+#define MAX_STUB_NUMBER 10
+#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10"
+#define T P "0" , T1
+#define P P_
+static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
+  {nullptr, T1 };
+#undef P
+#define P P_ "sf_"
+static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "df_"
+static char const * dfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "sc_"
+static char const * scMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "dc_"
+static char const * dcMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#undef P_
+
+
+const char* Mips16TargetLowering::
+  getMips16HelperFunction
+    (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
+  const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
+#ifndef NDEBUG
+  const unsigned int maxStubNum = 10;
+  assert(stubNum <= maxStubNum);
+  const bool validStubNum[maxStubNum+1] =
+    {true, true, true, false, false, true, true, false, false, true, true};
+  assert(validStubNum[stubNum]);
+#endif
+  const char *result;
+  if (RetTy->isFloatTy()) {
+    result = sfMips16Helper[stubNum];
+  }
+  else if (RetTy ->isDoubleTy()) {
+    result = dfMips16Helper[stubNum];
+  }
+  else if (RetTy->isStructTy()) {
+    // check if it's complex
+    if (RetTy->getNumContainedTypes() == 2) {
+      if ((RetTy->getContainedType(0)->isFloatTy()) &&
+          (RetTy->getContainedType(1)->isFloatTy())) {
+        result = scMips16Helper[stubNum];
+      }
+      else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
+               (RetTy->getContainedType(1)->isDoubleTy())) {
+        result = dcMips16Helper[stubNum];
+      }
+      else {
+        llvm_unreachable("Uncovered condition");
+      }
+    }
+    else {
+      llvm_unreachable("Uncovered condition");
+    }
+  }
+  else {
+    if (stubNum == 0) {
+      needHelper = false;
+      return "";
+    }
+    result = vMips16Helper[stubNum];
+  }
+  needHelper = true;
+  return result;
+}
+
+void Mips16TargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
+  SelectionDAG &DAG = CLI.DAG;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+  const char* Mips16HelperFunction = nullptr;
+  bool NeedMips16Helper = false;
+
+  if (Subtarget.inMips16HardFloat()) {
+    //
+    // currently we don't have symbols tagged with the mips16 or mips32
+    // qualifier so we will assume that we don't know what kind it is.
+    // and generate the helper
+    //
+    bool LookupHelper = true;
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() };
+
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
+        LookupHelper = false;
+      else {
+        const char *Symbol = S->getSymbol();
+        Mips16IntrinsicHelperType IntrinsicFind = { Symbol, "" };
+        const Mips16HardFloatInfo::FuncSignature *Signature =
+            Mips16HardFloatInfo::findFuncSignature(Symbol);
+        if (!IsPICCall && (Signature && (FuncInfo->StubsNeeded.find(Symbol) ==
+                                         FuncInfo->StubsNeeded.end()))) {
+          FuncInfo->StubsNeeded[Symbol] = Signature;
+          //
+          // S2 is normally saved if the stub is for a function which
+          // returns a float or double value and is not otherwise. This is
+          // because more work is required after the function the stub
+          // is calling completes, and so the stub cannot directly return
+          // and the stub has no stack space to store the return address so
+          // S2 is used for that purpose.
+          // In order to take advantage of not saving S2, we need to also
+          // optimize the call in the stub and this requires some further
+          // functionality in MipsAsmPrinter which we don't have yet.
+          // So for now we always save S2. The optimization will be done
+          // in a follow-on patch.
+          //
+          if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet))
+            FuncInfo->setSaveS2();
+        }
+        // one more look at list of intrinsics
+        const Mips16IntrinsicHelperType *Helper =
+            std::lower_bound(std::begin(Mips16IntrinsicHelper),
+                             std::end(Mips16IntrinsicHelper), IntrinsicFind);
+        if (Helper != std::end(Mips16IntrinsicHelper) &&
+            *Helper == IntrinsicFind) {
+          Mips16HelperFunction = Helper->Helper;
+          NeedMips16Helper = true;
+          LookupHelper = false;
+        }
+
+      }
+    } else if (GlobalAddressSDNode *G =
+                   dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL,
+                             G->getGlobal()->getName().data() };
+
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
+        LookupHelper = false;
+    }
+    if (LookupHelper)
+      Mips16HelperFunction =
+        getMips16HelperFunction(CLI.RetTy, CLI.getArgs(), NeedMips16Helper);
+  }
+
+  SDValue JumpTarget = Callee;
+
+  // T9 should contain the address of the callee function if
+  // -relocation-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned V0Reg = Mips::V0;
+    if (NeedMips16Helper) {
+      RegsToPass.push_front(std::make_pair(V0Reg, Callee));
+      JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction,
+                                         getPointerTy(DAG.getDataLayout()));
+      ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+      JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
+                                 MipsII::MO_GOT, Chain,
+                                 FuncInfo->callPtrInfo(S->getSymbol()));
+    } else
+      RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
+  }
+
+  Ops.push_back(JumpTarget);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, IsCallReloc, CLI, Callee,
+                                  Chain);
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitSel16(unsigned Opc, MachineInstr &MI,
+                                MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc))
+      .addReg(MI.getOperand(3).getReg())
+      .addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(thisMBB)
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(copy0MBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr &MI,
+                                 MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2))
+      .addReg(MI.getOperand(3).getReg())
+      .addReg(MI.getOperand(4).getReg());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(thisMBB)
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(copy0MBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr &MI,
+                                  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2))
+      .addReg(MI.getOperand(3).getReg())
+      .addImm(MI.getOperand(4).getImm());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(thisMBB)
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(copy0MBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                          MachineInstr &MI,
+                                          MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  unsigned regX = MI.getOperand(0).getReg();
+  unsigned regY = MI.getOperand(1).getReg();
+  MachineBasicBlock *target = MI.getOperand(2).getMBB();
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc))
+      .addReg(regX)
+      .addReg(regY);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
+    unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
+    MachineInstr &MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  unsigned regX = MI.getOperand(0).getReg();
+  int64_t imm = MI.getOperand(1).getImm();
+  MachineBasicBlock *target = MI.getOperand(2).getMBB();
+  unsigned CmpOpc;
+  if (isUInt<8>(imm))
+    CmpOpc = CmpiOpc;
+  else if ((!ImmSigned && isUInt<16>(imm)) ||
+           (ImmSigned && isInt<16>(imm)))
+    CmpOpc = CmpiXOpc;
+  else
+    llvm_unreachable("immediate field not usable");
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+static unsigned Mips16WhichOp8uOr16simm
+  (unsigned shortOp, unsigned longOp, int64_t Imm) {
+  if (isUInt<8>(Imm))
+    return shortOp;
+  else if (isInt<16>(Imm))
+    return longOp;
+  else
+    llvm_unreachable("immediate field not usable");
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
+                                          MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  unsigned CC = MI.getOperand(0).getReg();
+  unsigned regX = MI.getOperand(1).getReg();
+  unsigned regY = MI.getOperand(2).getReg();
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc))
+      .addReg(regX)
+      .addReg(regY);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC)
+      .addReg(Mips::T8);
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+                                           MachineInstr &MI,
+                                           MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  unsigned CC = MI.getOperand(0).getReg();
+  unsigned regX = MI.getOperand(1).getReg();
+  int64_t Imm = MI.getOperand(2).getImm();
+  unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SltOpc)).addReg(regX).addImm(Imm);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Mips::MoveR3216), CC)
+      .addReg(Mips::T8);
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
new file mode 100644
index 000000000000..0ee0b816ef70
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
@@ -0,0 +1,82 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+  class Mips16TargetLowering : public MipsTargetLowering  {
+  public:
+    explicit Mips16TargetLowering(const MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
+
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+  private:
+    bool isEligibleForTailCallOptimization(
+        const CCState &CCInfo, unsigned NextStackOffset,
+        const MipsFunctionInfo &FI) const override;
+
+    void setMips16HardFloatLibCalls();
+
+    unsigned int
+      getMips16HelperFunctionStubNumber(ArgListTy &Args) const;
+
+    const char *getMips16HelperFunction
+      (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
+
+    void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
+
+    MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr &MI,
+                                 MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                   MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr &MI,
+                                  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                           MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I8I16_ins(unsigned BtOpc, unsigned CmpiOpc,
+                                            unsigned CmpiXOpc, bool ImmSigned,
+                                            MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRX16_ins(unsigned SltOpc, MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRXI16_ins(unsigned SltiOpc, unsigned SltiXOpc,
+                                            MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
new file mode 100644
index 000000000000..4ff68bef957e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
@@ -0,0 +1,640 @@
+//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  funct or f      Function field
+//
+//  immediate       4-,5-,8- or 11-bit immediate, branch displacement, or
+//  or imm          address displacement
+//
+//  op              5-bit major operation code
+//
+//  rx              3-bit source or destination register
+//
+//  ry              3-bit source or destination register
+//
+//  rz              3-bit source or destination register
+//
+//  sa              3- or 5-bit shift amount
+//
+//===----------------------------------------------------------------------===//
+
+
+// Base class for Mips 16 Format
+// This class does not depend on the instruction size
+//
+class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern,
+                      InstrItinClass itin>: Instruction
+{
+
+  let Namespace = "Mips";
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  let Predicates = [InMips16Mode];
+}
+
+//
+// Generic Mips 16 Format
+//
+class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+                 InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<16> Inst;
+  bits<5> Opcode = 0;
+
+  // Top 5 bits are the 'opcode' field
+  let Inst{15-11} = Opcode;
+
+  let Size=2;
+  field bits<16> SoftFail = 0;
+}
+
+//
+// For 32 bit extended instruction forms.
+//
+class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
+                    InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<32> Inst;
+
+  let Size=4;
+  field bits<32> SoftFail = 0;
+}
+
+class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern,
+                        InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  let Inst{31-27} = 0b11110;
+}
+
+
+
+// Mips Pseudo Instructions Format
+class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>:
+  MipsInst16<outs, ins, asmstr, pattern, IIPseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|imm11|>
+//===----------------------------------------------------------------------===//
+
+class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<11> imm11;
+
+  let Opcode = op;
+
+  let Inst{10-0}  = imm11;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RI instruction class in Mips : <|opcode|rx|imm8|>
+//===----------------------------------------------------------------------===//
+
+class FRI16<bits<5> op, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<8>   imm8;
+
+  let Opcode = op;
+
+  let Inst{10-8} = rx;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RR instruction class in Mips : <|opcode|rx|ry|funct|>
+//===----------------------------------------------------------------------===//
+
+class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = _funct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = funct;
+}
+
+class FRRBreak16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  Code;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = 0b00101;
+
+  let Inst{10-5} = Code;
+  let Inst{4-0}   = funct;
+}
+
+//
+// For conversion functions.
+//
+class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins,
+               string asmstr, list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  subfunct;
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+  let subfunct = _subfunct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = subfunct;
+  let Inst{4-0}   = funct;
+}
+
+//
+// just used for breakpoint (hardware and software) instructions.
+//
+class FC16<bits<5> _funct, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  _code;  // code is a keyword in tablegen
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+
+  let Inst{10-5} = _code;
+  let Inst{4-0}   = funct;
+}
+
+//
+// J(AL)R(C) subformat
+//
+class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<1>  nd;
+  bits<1>  l;
+  bits<1>  ra;
+
+  let nd = _nd;
+  let l = _l;
+  let ra = r_a;
+
+  let Opcode = 0b11101;
+
+  let Inst{10-8} = rx;
+  let Inst{7} = nd;
+  let Inst{6} = l;
+  let Inst{5} = ra;
+  let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|>
+//===----------------------------------------------------------------------===//
+
+class FRRI16<bits<5> op, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  imm5;
+
+  let Opcode = op;
+
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|>
+//===----------------------------------------------------------------------===//
+
+class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  rz;
+  bits<2>  f;
+
+  let Opcode = 0b11100;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = rz;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|>
+//===----------------------------------------------------------------------===//
+
+class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<1>  f;
+  bits<4>  imm4;
+
+  let Opcode = 0b01000;
+  let  f = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0}   = imm4;
+}
+
+//===----------------------------------------------------------------------===//
+// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|>
+//===----------------------------------------------------------------------===//
+
+class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  sa;
+  bits<2>  f;
+
+  let Opcode = 0b00110;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = sa;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8 instruction class in Mips : <|opcode|funct|imm8>
+//===----------------------------------------------------------------------===//
+
+class FI816<bits<3> _func, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  func;
+  bits<8>   imm8;
+
+  let Opcode = 0b01100;
+  let func  = _func;
+
+  let Inst{10-8} = func;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOVR3216<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<4> ry;
+  bits<4> r32;
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b111;
+  let Inst{7-4} = ry;
+  let Inst{3-0} = r32;
+
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOV32R16<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<3>  func;
+  bits<5> r32;
+  bits<3> rz;
+
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b101;
+  let Inst{7-5} = r32{2-0};
+  let Inst{4-3} = r32{4-3};
+  let Inst{2-0} = rz;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_SVRS instruction class in Mips :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+  bits<4> framesize = 0;
+
+  let s =_s;
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b100;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format JAL instruction class in Mips16 :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> X;
+  bits<26> imm26;
+
+
+  let X = _X;
+
+  let Inst{31-27} = 0b00011;
+  let Inst{26} = X;
+  let Inst{25-21} = imm26{20-16};
+  let Inst{20-16} = imm26{25-21};
+  let Inst{15-0}  = imm26{15-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> eop;
+
+  let eop = _eop;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = eop;
+  let Inst{10-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format ASMACRO instruction class in Mips16 :
+//    <EXTEND|select|p4|p3|RRR|p2|p1|p0>
+//===----------------------------------------------------------------------===//
+
+class FASMACRO16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> select;
+  bits<3> p4;
+  bits<5> p3;
+  bits<5> RRR = 0b11100;
+  bits<3> p2;
+  bits<3> p1;
+  bits<5> p0;
+
+
+  let Inst{26-24} = select;
+  let Inst{23-21} = p4;
+  let Inst{20-16} = p3;
+  let Inst{15-11} = RRR;
+  let Inst{10-8} = p2;
+  let Inst{7-5} = p1;
+  let Inst{4-0} = p0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RI instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> op;
+  bits<3> rx;
+
+  let op = _op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<5> op;
+  bits<16> imm16;
+  bits<3> rx;
+  bits<3> ry;
+
+  let op=_op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI-A instruction class in Mips16 :
+//    <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<15> imm15;
+  bits<3> rx;
+  bits<3> ry;
+  bits<1> f;
+
+  let f = _f;
+
+  let Inst{26-20} = imm15{10-4};
+  let Inst{19-16} = imm15{14-11};
+  let Inst{15-11} = 0b01000;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0} = imm15{3-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-SHIFT instruction class in Mips16 :
+//    <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f>
+//===----------------------------------------------------------------------===//
+
+class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<6> sa6;
+  bits<3> rx;
+  bits<3> ry;
+  bits<2> f;
+
+  let f = _f;
+
+  let Inst{26-22} = sa6{4-0};
+  let Inst{21} = sa6{5};
+  let Inst{20-16} = 0;
+  let Inst{15-11} = 0b00110;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = 0;
+  let Inst{1-0} = f;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8 instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> I8;
+  bits<3> funct;
+
+  let funct = _funct;
+  let I8 = 0b00110;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = I8;
+  let Inst{10-8} = funct;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8_SVRS instruction class in Mips16 :
+//    <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr,
+                     list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> xsregs =0;
+  bits<8> framesize =0;
+  bits<3> aregs =0;
+  bits<5> I8 = 0b01100;
+  bits<3> SVRS = 0b100;
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+
+  let s= s_;
+
+  let Inst{26-24} = xsregs;
+  let Inst{23-20} = framesize{7-4};
+  let Inst{19} = 0;
+  let Inst{18-16} = aregs;
+  let Inst{15-11} = I8;
+  let Inst{10-8} = SVRS;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize{3-0};
+
+
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
new file mode 100644
index 000000000000..35ef31749f40
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -0,0 +1,519 @@
+//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+#include "Mips16InstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-instrinfo"
+
+Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, Mips::Bimm16), RI() {}
+
+const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  return 0;
+}
+
+void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.contains(DestReg) &&
+      Mips::GPR32RegClass.contains(SrcReg))
+    Opc = Mips::MoveR3216;
+  else if (Mips::GPR32RegClass.contains(DestReg) &&
+           Mips::CPU16RegsRegClass.contains(SrcReg))
+    Opc = Mips::Move32R16;
+  else if ((SrcReg == Mips::HI0) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mfhi16, SrcReg = 0;
+
+  else if ((SrcReg == Mips::LO0) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mflo16, SrcReg = 0;
+
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned SrcReg, bool isKill, int FI,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI,
+                                      int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+  unsigned Opc = 0;
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::SwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)).
+      addFrameIndex(FI).addImm(Offset)
+      .addMemOperand(MMO);
+}
+
+void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI,
+                                       int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::LwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
+    .addMemOperand(MMO);
+}
+
+bool Mips16InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  switch (MI.getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA16:
+    ExpandRetRA16(MBB, MI, Mips::JrcRa16);
+    break;
+  }
+
+  MBB.erase(MI.getIterator());
+  return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
+  case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+  case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+  case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
+  case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
+  case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
+  case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+  case Mips::Btnez16: return Mips::Bteqz16;
+  case Mips::BtnezX16: return Mips::BteqzX16;
+  case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
+  case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
+  case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+  case Mips::Bteqz16: return Mips::Btnez16;
+  case Mips::BteqzX16: return Mips::BtnezX16;
+  case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
+  case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
+  case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16;
+  case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16;
+  case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
+  case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
+  }
+  llvm_unreachable("Illegal opcode!");
+}
+
+static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
+                               const std::vector<CalleeSavedInfo> &CSI,
+                               unsigned Flags = 0) {
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::lowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[e-i-1].getReg();
+    switch (Reg) {
+    case Mips::RA:
+    case Mips::S0:
+    case Mips::S1:
+      MIB.addReg(Reg, Flags);
+      break;
+    case Mips::S2:
+      break;
+    default:
+      llvm_unreachable("unexpected mips16 callee saved register");
+
+    }
+  }
+}
+// Adjust SP by FrameSize bytes. Save RA, S0, S1
+void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const {
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI    = MF.getFrameInfo();
+  const BitVector Reserved = RI.getReservedRegs(MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  MachineInstrBuilder MIB;
+  unsigned Opc = ((FrameSize <= 128) && !SaveS2)? Mips::Save16:Mips::SaveX16;
+  MIB = BuildMI(MBB, I, DL, get(Opc));
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  addSaveRestoreRegs(MIB, CSI);
+  if (SaveS2)
+    MIB.addReg(Mips::S2);
+  if (isUInt<11>(FrameSize))
+    MIB.addImm(FrameSize);
+  else {
+    int Base = 2040; // should create template function like isUInt that
+                     // returns largest possible n bit unsigned integer
+    int64_t Remainder = FrameSize - Base;
+    MIB.addImm(Base);
+    if (isInt<16>(-Remainder))
+      BuildAddiuSpImm(MBB, I, -Remainder);
+    else
+      adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
+  }
+}
+
+// Adjust SP by FrameSize bytes. Restore RA, S0, S1
+void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize,
+                                   MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  MachineFrameInfo &MFI    = MF->getFrameInfo();
+  const BitVector Reserved = RI.getReservedRegs(*MF);
+  bool SaveS2 = Reserved[Mips::S2];
+  MachineInstrBuilder MIB;
+  unsigned Opc = ((FrameSize <= 128) && !SaveS2)?
+    Mips::Restore16:Mips::RestoreX16;
+
+  if (!isUInt<11>(FrameSize)) {
+    unsigned Base = 2040;
+    int64_t Remainder = FrameSize - Base;
+    FrameSize = Base; // should create template function like isUInt that
+                     // returns largest possible n bit unsigned integer
+
+    if (isInt<16>(Remainder))
+      BuildAddiuSpImm(MBB, I, Remainder);
+    else
+      adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
+  }
+  MIB = BuildMI(MBB, I, DL, get(Opc));
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  addSaveRestoreRegs(MIB, CSI, RegState::Define);
+  if (SaveS2)
+    MIB.addReg(Mips::S2, RegState::Define);
+  MIB.addImm(FrameSize);
+}
+
+// Adjust SP by Amount bytes where bytes can be up to 32bit number.
+// This can only be called at times that we know that there is at least one free
+// register.
+// This is clearly safe at prologue and epilogue.
+//
+void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned Reg1, unsigned Reg2) const {
+  DebugLoc DL;
+  //
+  // li reg1, constant
+  // move reg2, sp
+  // add reg1, reg1, reg2
+  // move sp, reg1
+  //
+  //
+  MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwConstant32), Reg1);
+  MIB1.addImm(Amount).addImm(-1);
+  MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::MoveR3216), Reg2);
+  MIB2.addReg(Mips::SP, RegState::Kill);
+  MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1);
+  MIB3.addReg(Reg1);
+  MIB3.addReg(Reg2, RegState::Kill);
+  MachineInstrBuilder MIB4 = BuildMI(MBB, I, DL, get(Mips::Move32R16),
+                                                     Mips::SP);
+  MIB4.addReg(Reg1, RegState::Kill);
+}
+
+void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
+    unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+   llvm_unreachable("adjust stack pointer amount exceeded");
+}
+
+/// Adjust SP by Amount bytes.
+void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  if (Amount == 0)
+    return;
+
+  if (isInt<16>(Amount))  // need to change to addiu sp, ....and isInt<16>
+    BuildAddiuSpImm(MBB, I, Amount);
+  else
+    adjustStackPtrBigUnrestricted(SP, Amount, MBB, I);
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator II,
+                                        const DebugLoc &DL,
+                                        unsigned &NewImm) const {
+  //
+  // given original instruction is:
+  // Instr rx, T[offset] where offset is too big.
+  //
+  // lo = offset & 0xFFFF
+  // hi = ((offset >> 16) + (lo >> 15)) & 0xFFFF;
+  //
+  // let T = temporary register
+  // li T, hi
+  // shl T, 16
+  // add T, Rx, T
+  //
+  RegScavenger rs;
+  int32_t lo = Imm & 0xFFFF;
+  NewImm = lo;
+  int Reg =0;
+  int SpReg = 0;
+
+  rs.enterBasicBlock(MBB);
+  rs.forward(II);
+  //
+  // We need to know which registers can be used, in the case where there
+  // are not enough free registers. We exclude all registers that
+  // are used in the instruction that we are helping.
+  //  // Consider all allocatable registers in the register class initially
+  BitVector Candidates =
+      RI.getAllocatableSet
+      (*II->getParent()->getParent(), &Mips::CPU16RegsRegClass);
+  // Exclude all the registers being used by the instruction.
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      Candidates.reset(MO.getReg());
+  }
+
+  // If the same register was used and defined in an instruction, then
+  // it will not be in the list of candidates.
+  //
+  // we need to analyze the instruction that we are helping.
+  // we need to know if it defines register x but register x is not
+  // present as an operand of the instruction. this tells
+  // whether the register is live before the instruction. if it's not
+  // then we don't need to save it in case there are no free registers.
+  int DefReg = 0;
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      DefReg = MO.getReg();
+      break;
+    }
+  }
+
+  BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
+  Available &= Candidates;
+  //
+  // we use T0 for the first register, if we need to save something away.
+  // we use T1 for the second register, if we need to save something away.
+  //
+  unsigned FirstRegSaved =0, SecondRegSaved=0;
+  unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
+
+  Reg = Available.find_first();
+
+  if (Reg == -1) {
+    Reg = Candidates.find_first();
+    Candidates.reset(Reg);
+    if (DefReg != Reg) {
+      FirstRegSaved = Reg;
+      FirstRegSavedTo = Mips::T0;
+      copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+    }
+  }
+  else
+    Available.reset(Reg);
+  BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm).addImm(-1);
+  NewImm = 0;
+  if (FrameReg == Mips::SP) {
+    SpReg = Available.find_first();
+    if (SpReg == -1) {
+      SpReg = Candidates.find_first();
+      // Candidates.reset(SpReg); // not really needed
+      if (DefReg!= SpReg) {
+        SecondRegSaved = SpReg;
+        SecondRegSavedTo = Mips::T1;
+      }
+      if (SecondRegSaved)
+        copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
+    }
+   else
+     Available.reset(SpReg);
+    copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false);
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(SpReg, RegState::Kill)
+      .addReg(Reg);
+  }
+  else
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(FrameReg)
+      .addReg(Reg, RegState::Kill);
+  if (FirstRegSaved || SecondRegSaved) {
+    II = std::next(II);
+    if (FirstRegSaved)
+      copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true);
+    if (SecondRegSaved)
+      copyPhysReg(MBB, II, DL, SecondRegSaved, SecondRegSavedTo, true);
+  }
+  return Reg;
+}
+
+unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BeqzRxImmX16   || Opc == Mips::BimmX16  ||
+          Opc == Mips::Bimm16  ||
+          Opc == Mips::Bteqz16        || Opc == Mips::Btnez16 ||
+          Opc == Mips::BeqzRxImm16    || Opc == Mips::BnezRxImm16   ||
+          Opc == Mips::BnezRxImmX16   || Opc == Mips::BteqzX16 ||
+          Opc == Mips::BteqzT8CmpX16  || Opc == Mips::BteqzT8CmpiX16 ||
+          Opc == Mips::BteqzT8SltX16  || Opc == Mips::BteqzT8SltuX16  ||
+          Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 ||
+          Opc == Mips::BtnezX16       || Opc == Mips::BtnezT8CmpX16 ||
+          Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 ||
+          Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 ||
+          Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0;
+}
+
+void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned Opc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+}
+
+const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
+  if (validSpImm8(Imm))
+    return get(Mips::AddiuSpImm16);
+  else
+    return get(Mips::AddiuSpImmX16);
+}
+
+void Mips16InstrInfo::BuildAddiuSpImm
+  (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const {
+  DebugLoc DL;
+  BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
+}
+
+const MipsInstrInfo *llvm::createMips16InstrInfo(const MipsSubtarget &STI) {
+  return new Mips16InstrInfo(STI);
+}
+
+bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
+                                     int64_t Amount) {
+  switch (Opcode) {
+  case Mips::LbRxRyOffMemX16:
+  case Mips::LbuRxRyOffMemX16:
+  case Mips::LhRxRyOffMemX16:
+  case Mips::LhuRxRyOffMemX16:
+  case Mips::SbRxRyOffMemX16:
+  case Mips::ShRxRyOffMemX16:
+  case Mips::LwRxRyOffMemX16:
+  case Mips::SwRxRyOffMemX16:
+  case Mips::SwRxSpImmX16:
+  case Mips::LwRxSpImmX16:
+    return isInt<16>(Amount);
+  case Mips::AddiuRxRyOffMemX16:
+    if ((Reg == Mips::PC) || (Reg == Mips::SP))
+      return isInt<16>(Amount);
+    return isInt<15>(Amount);
+  }
+  llvm_unreachable("unexpected Opcode in validImmediate");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      if (strncmp(Str, ".space", 6)==0) {
+        char *EStr; int Sz;
+        Sz = strtol(Str+6, &EStr, 10);
+        while (isspace(*EStr)) ++EStr;
+        if (*EStr=='\0') {
+          DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+          return Sz;
+        }
+      }
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString().data(),
+                               MAI.getCommentString().size()) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
new file mode 100644
index 000000000000..ab559799f00b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -0,0 +1,126 @@
+//===-- Mips16InstrInfo.h - Mips16 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
+
+#include "Mips16RegisterInfo.h"
+#include "MipsInstrInfo.h"
+
+namespace llvm {
+class MipsSubtarget;
+class Mips16InstrInfo : public MipsInstrInfo {
+  const Mips16RegisterInfo RI;
+
+public:
+  explicit Mips16InstrInfo(const MipsSubtarget &STI);
+
+  const MipsRegisterInfo &getRegisterInfo() const override;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MBBI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
+
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+  // Adjust SP by FrameSize bytes. Save RA, S0, S1
+  void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                 MachineBasicBlock::iterator I) const;
+
+  // Adjust SP by FrameSize bytes. Restore RA, S0, S1
+  void restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const override;
+
+  /// Emit a series of instructions to load an immediate.
+  // This is to adjust some FrameReg. We return the new register to be used
+  // in place of FrameReg and the adjusted immediate field (&NewImm)
+  //
+  unsigned loadImmediate(unsigned FrameReg, int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, const DebugLoc &DL,
+                         unsigned &NewImm) const;
+
+  static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
+
+  static bool validSpImm8(int offset) {
+    return ((offset & 7) == 0) && isInt<11>(offset);
+  }
+
+  //
+  // build the proper one based on the Imm field
+  //
+
+  const MCInstrDesc& AddiuSpImm(int64_t Imm) const;
+
+  void BuildAddiuSpImm
+    (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
+
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const override;
+private:
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
+
+  void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   unsigned Opc) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         unsigned Reg1, unsigned Reg2) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
new file mode 100644
index 000000000000..021fb8678686
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -0,0 +1,1910 @@
+//===- Mips16InstrInfo.td - Target Description for Mips16  -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips16 instructions.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Mips Address
+//
+def addr16 : ComplexPattern<iPTR, 2, "selectAddr16", [frameindex]>;
+def addr16sp : ComplexPattern<iPTR, 2, "selectAddr16SP", [frameindex]>;
+
+//
+// Address operand
+def mem16 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPU16Regs, simm16);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def mem16sp : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  // This should be CPUSPReg but the MIPS16 subtarget isn't good enough at
+  // keeping the sp-relative load and the other varieties separate at the
+  // moment. This lie fixes the problem sufficiently well to fix the errors
+  // emitted by -verify-machineinstrs and the output ends up correct as long
+  // as we use an external assembler (which is already a requirement for MIPS16
+  // for several other reasons).
+  let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def mem16_ea : Operand<i32> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def pcrel16 : Operand<i32>;
+
+//
+// I-type instruction format
+//
+// this is only used by bimm. the actual assembly value is a 12 bit signed
+// number
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+  FI16<op, (outs), (ins brtarget:$imm16),
+            !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
+//
+// I8 instruction format
+//
+
+class FI816_ins_base<bits<3> _func, string asmstr,
+                     string asmstr2, InstrItinClass itin>:
+  FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+        [], itin>;
+
+class FI816_ins<bits<3> _func, string asmstr,
+                InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$imm  # 16 bit inst", itin>;
+ 
+class FI816_SP_ins<bits<3> _func, string asmstr,
+                   InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
+
+//
+// RI instruction format
+//
+
+
+class FRI16_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+            
+class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16R_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16R_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class F2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+        !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+class FRI16_B_ins<bits<5> _op, string asmstr,
+                  InstrItinClass itin>:
+  FRI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+        !strconcat(asmstr, "\t$rx, $imm  # 16 bit inst"), [], itin>;
+//
+// Compare a register and immediate and place result in CC
+// Implicit use of T8
+//
+// EXT-CCRR Instruction format
+//
+class FEXT_CCRXI16_ins<string asmstr>:
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+               !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+// JAL and JALX instruction format
+//
+class FJAL16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins uimm26:$imm),
+         !strconcat(asmstr, "\t$imm\n\tnop"),[],
+         itin>  {
+  let isCodeGenOnly=1;
+  let Size=6;
+}
+
+class FJALB16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins uimm26:$imm),
+         !strconcat(asmstr, "\t$imm\t# branch\n\tnop"),[],
+         itin>  {
+  let isCodeGenOnly=1;
+  let Size=6;
+}
+
+//
+// EXT-I instruction format
+//
+class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> :
+  FEXT_I16<eop, (outs), (ins brtarget:$imm16),
+           !strconcat(asmstr, "\t$imm16"),[], itin>;
+
+//
+// EXT-I8 instruction format
+//
+
+class FEXT_I816_ins_base<bits<3> _func, string asmstr,
+                         string asmstr2, InstrItinClass itin>:
+  FEXT_I816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+            [], itin>;
+
+class FEXT_I816_ins<bits<3> _func, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>;
+
+class FEXT_I816_SP_ins<bits<3> _func, string asmstr,
+                       InstrItinClass itin>:
+      FEXT_I816_ins_base<_func, asmstr, "\t$$sp, $imm", itin>;
+
+//
+// Assembler formats in alphabetical order.
+// Natural and pseudos are mixed together.
+//
+// Compare two registers and place result in CC
+// Implicit use of T8
+//
+// CC-RR Instruction format
+//
+class FCCRR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-RI instruction format
+//
+
+class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16R_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs ), (ins CPU16Regs:$rx, simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16R_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16R_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+
+class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
+                      InstrItinClass itin>:
+  FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+                        InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                         InstrItinClass itin>:
+  FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                          InstrItinClass itin>:
+  FEXT_RRI16<op, (outs ), (ins  CPU16Regs:$ry, MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+//
+// EXT-RRI-A instruction format
+//
+
+class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
+                           InstrItinClass itin>:
+  FEXT_RRI_A16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+               !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
+               !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// EXT-T8I8
+//
+class FEXT_T8I816_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm),
+               !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t",
+               !strconcat(asmstr, "\t$imm"))),[]> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-T8I8I
+//
+class FEXT_T8I8I16_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ),
+               !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t",
+               !strconcat(asmstr, "\t$targ"))), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+//
+
+
+//
+// I8_MOVR32 instruction format (used only by the MOVR32 instructio
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+       FI8_MOVR3216<(outs CPU16Regs:$rz), (ins GPR32:$r32),
+       !strconcat(asmstr,  "\t$rz, $r32"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+  FI8_MOV32R16<(outs GPR32:$r32), (ins CPU16Regs:$rz),
+               !strconcat(asmstr,  "\t$r32, $rz"), [], itin>;
+
+//
+// This are pseudo formats for multiply
+// This first one can be changed to non-pseudo now.
+//
+// MULT
+//
+class FMULT16_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry"), []>;
+
+//
+// MULT-LO
+//
+class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
+  let isCodeGenOnly=1;
+}
+
+//
+// RR-type instruction format
+//
+
+class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRBreakNull16_ins<string asmstr, InstrItinClass itin> :
+  FRRBreak16<(outs), (ins), asmstr, [], itin> {
+  let Code=0;
+}
+
+class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs), (ins  CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRTR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), []> ;
+
+//
+// maybe refactor but need a $zero as a dummy first parameter
+//
+class FRR16_div_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs ), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$$zero, $rx, $ry"), [], itin> ;
+
+class FUnaryRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> ;
+
+
+class FRR16_M_ins<bits<5> f, string asmstr,
+                  InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins),
+        !strconcat(asmstr, "\t$rx"), [], itin>;
+
+class FRxRxRy16_ins<bits<5> f, string asmstr,
+                    InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+            !strconcat(asmstr, "\t$rz, $ry"),
+            [], itin> {
+  let Constraints = "$rx = $rz";
+}
+
+let rx=0 in
+class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
+                              string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
+              [], itin> ;
+
+
+class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
+                      string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+              !strconcat(asmstr, "\t $rx"), [], itin> ;
+
+class FRR_SF16_ins
+  <bits<5> _funct, bits<3> _subfunc,
+    string asmstr, InstrItinClass itin>:
+  FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+           !strconcat(asmstr, "\t $rx"),
+           [], itin> {
+  let Constraints = "$rx_ = $rx";
+  }
+//
+// RRR-type instruction format
+//
+
+class FRRR16_ins<bits<2> _f, string asmstr,  InstrItinClass itin> :
+  FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+         !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+
+//
+// These Sel patterns support the generation of conditional move
+// pseudo instructions.
+//
+// The nomenclature uses the components making up the pseudo and may
+// be a bit counter intuitive when compared with the end result we seek.
+// For example using a bqez in the example directly below results in the
+// conditional move being done if the tested register is not zero.
+// I considered in easier to check by keeping the pseudo consistent with
+// it's components but it could have been done differently.
+//
+// The simplest case is when can test and operand directly and do the
+// conditional move based on a simple mips16 conditional
+//  branch instruction.
+// for example:
+// if $op == beqz or bnez:
+//
+// $op1 $rt, .+4
+// move $rd, $rs
+//
+// if $op == beqz, then if $rt != 0, then the conditional assignment
+// $rd = $rs is done.
+
+// if $op == bnez, then if $rt == 0, then the conditional assignment
+// $rd = $rs is done.
+//
+// So this pseudo class only has one operand, i.e. op
+//
+class Sel<string op>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+               CPU16Regs:$rt),
+               !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), []> {
+  //let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// The next two instruction classes allow for an operand which tests
+// two operands and returns a value in register T8 and
+//then does a conditional branch based on the value of T8
+//
+
+// op2 can be cmpi or slti/sltiu
+// op1 can bteqz or btnez
+// the operands for op2 are a register and a signed constant
+//
+// $op2 $t, $imm  ;test register t and branch conditionally
+// $op1 .+4       ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SeliT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                                       CPU16Regs:$rl, simm16:$imm),
+               !strconcat(op2,
+               !strconcat("\t$rl, $imm\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// op2 can be cmp or slt/sltu
+// op1 can be bteqz or btnez
+// the operands for op2 are two registers
+// op1 is a conditional branch
+//
+//
+// $op2 $rl, $rr  ;test registers rl,rr
+// $op1 .+4       ;op2 is a conditional branch
+// move $rd, $rs
+//
+//
+class SelT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_),
+               (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                CPU16Regs:$rl, CPU16Regs:$rr),
+               !strconcat(op2,
+               !strconcat("\t$rl, $rr\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// 32 bit constant
+//
+def Constant32:
+  MipsPseudo16<(outs), (ins simm32:$imm), "\t.word $imm", []>;
+
+def LwConstant32:
+  MipsPseudo16<(outs CPU16Regs:$rx), (ins simm32:$imm, simm32:$constid),
+    "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
+
+
+//
+// Some general instruction class info
+//
+//
+
+class ArithLogic16Defs<bit isCom=0> {
+  bits<5> shamt = 0;
+  bit isCommutable = isCom;
+  bit isReMaterializable = 1;
+  bit hasSideEffects = 0;
+}
+
+class branch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit isBarrier = 1;
+}
+
+class cbranch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+}
+
+class MayLoad {
+  bit mayLoad = 1;
+}
+
+class MayStore {
+  bit mayStore = 1;
+}
+//
+
+
+// Format: ADDIU rx, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
+// To add a constant to a 32-bit integer.
+//
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>;
+
+def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>,
+  ArithLogic16Defs<0> {
+  let AddedComplexity = 5;
+}
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>,
+  ArithLogic16Defs<0> {
+  let isCodeGenOnly = 1;
+}
+
+def AddiuRxRyOffMemX16:
+  FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>;
+
+//
+
+// Format: ADDIU rx, pc, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
+// To add a constant to the program counter.
+//
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>;
+
+//
+// Format: ADDIU sp, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, SP-Relative, Extended)
+// To add a constant to the stack pointer.
+//
+def AddiuSpImm16
+  : FI816_SP_ins<0b011, "addiu", IIM16Alu> {
+  let Defs = [SP];
+  let Uses = [SP];
+  let AddedComplexity = 5;
+}
+
+def AddiuSpImmX16
+  : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> {
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: ADDU rz, rx, ry MIPS16e
+// Purpose: Add Unsigned Word (3-Operand)
+// To add 32-bit integers.
+//
+
+def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>;
+
+//
+// Format: AND rx, ry MIPS16e
+// Purpose: AND
+// To do a bitwise logical AND.
+
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16;
+
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16;
+
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch
+// To do an unconditional PC-relative branch.
+//
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16;
+
+
+//
+//Format: BREAK immediate
+// Purpose: Breakpoint
+// To cause a Breakpoint exception.
+
+def Break16: FRRBreakNull16_ins<"break 0", IIM16Alu>;
+//
+// Format: BTEQZ offset MIPS16e
+// Purpose: Branch on T Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BteqzT8CmpX16: FEXT_T8I816_ins<"bteqz", "cmp">, cbranch16;
+
+def BteqzT8CmpiX16: FEXT_T8I8I16_ins<"bteqz", "cmpi">,
+  cbranch16;
+
+def BteqzT8SltX16: FEXT_T8I816_ins<"bteqz", "slt">, cbranch16;
+
+def BteqzT8SltuX16: FEXT_T8I816_ins<"bteqz", "sltu">, cbranch16;
+
+def BteqzT8SltiX16: FEXT_T8I8I16_ins<"bteqz", "slti">, cbranch16;
+
+def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
+  cbranch16;
+
+//
+// Format: BTNEZ offset MIPS16e
+// Purpose: Branch on T Not Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+
+def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 {
+  let Uses = [T8];
+}
+
+def BtnezT8CmpX16: FEXT_T8I816_ins<"btnez", "cmp">, cbranch16;
+
+def BtnezT8CmpiX16: FEXT_T8I8I16_ins<"btnez", "cmpi">, cbranch16;
+
+def BtnezT8SltX16: FEXT_T8I816_ins<"btnez", "slt">, cbranch16;
+
+def BtnezT8SltuX16: FEXT_T8I816_ins<"btnez", "sltu">, cbranch16;
+
+def BtnezT8SltiX16: FEXT_T8I8I16_ins<"btnez", "slti">, cbranch16;
+
+def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">,
+  cbranch16;
+
+//
+// Format: CMP rx, ry MIPS16e
+// Purpose: Compare
+// To compare the contents of two GPRs.
+//
+def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate (Extended)
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> {
+  let Defs = [T8];
+}
+
+
+//
+// Format: DIV rx, ry MIPS16e
+// Purpose: Divide Word
+// To divide 32-bit signed integers.
+//
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> {
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: DIVU rx, ry MIPS16e
+// Purpose: Divide Unsigned Word
+// To divide 32-bit unsigned integers.
+//
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> {
+  let Defs = [HI0, LO0];
+}
+//
+// Format: JAL target MIPS16e
+// Purpose: Jump and Link
+// To execute a procedure call within the current 256 MB-aligned
+// region and preserve the current ISA.
+//
+
+def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> {
+  let hasDelaySlot = 0;  // not true, but we add the nop for now
+  let isCall=1;
+  let Defs = [RA];
+}
+
+def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 {
+  let hasDelaySlot = 0;  // not true, but we add the nop for now
+  let isBranch=1;
+  let Defs = [RA];
+}
+
+//
+// Format: JR ra MIPS16e
+// Purpose: Jump Register Through Register ra
+// To execute a branch to the instruction address in the return
+// address register.
+//
+
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let hasDelaySlot = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+//
+// Format: LB ry, offset(rx) MIPS16e
+// Purpose: Load Byte (Extended)
+// To load a byte from memory as a signed value.
+//
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, II_LB>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LBU ry, offset(rx) MIPS16e
+// Purpose: Load Byte Unsigned (Extended)
+// To load a byte from memory as a unsigned value.
+//
+def LbuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, II_LBU>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LH ry, offset(rx) MIPS16e
+// Purpose: Load Halfword signed (Extended)
+// To load a halfword from memory as a signed value.
+//
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, II_LH>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LHU ry, offset(rx) MIPS16e
+// Purpose: Load Halfword unsigned (Extended)
+// To load a halfword from memory as an unsigned value.
+//
+def LhuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, II_LHU>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate
+// To load a constant into a GPR.
+//
+def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>;
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate (Extended)
+// To load a constant into a GPR.
+//
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>;
+
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LW ry, offset(rx) MIPS16e
+// Purpose: Load Word (Extended)
+// To load a word from memory as a signed value.
+//
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, II_LW>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+// Format: LW rx, offset(sp) MIPS16e
+// Purpose: Load Word (SP-Relative, Extended)
+// To load an SP-relative word from memory as a signed value.
+//
+def LwRxSpImmX16: FEXT_RRI16_mem_ins<0b10010, "lw", mem16sp, II_LW>, MayLoad;
+
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad;
+//
+// Format: MOVE r32, rz MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+//Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>;
+
+//
+// Format: MFHI rx MIPS16e
+// Purpose: Move From HI Register
+// To copy the special purpose HI register to a GPR.
+//
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> {
+  let Uses = [HI0];
+  let hasSideEffects = 0;
+}
+
+//
+// Format: MFLO rx MIPS16e
+// Purpose: Move From LO Register
+// To copy the special purpose LO register to a GPR.
+//
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> {
+  let Uses = [LO0];
+  let hasSideEffects = 0;
+}
+
+//
+// Pseudo Instruction for mult
+//
+def MultRxRy16:  FMULT16_ins<"mult",  IIM16Alu> {
+  let isCommutable = 1;
+  let hasSideEffects = 0;
+  let Defs = [HI0, LO0];
+}
+
+def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> {
+  let isCommutable = 1;
+  let hasSideEffects = 0;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULT rx, ry MIPS16e
+// Purpose: Multiply Word
+// To multiply 32-bit signed integers.
+//
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> {
+  let isCommutable = 1;
+  let hasSideEffects = 0;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: MULTU rx, ry MIPS16e
+// Purpose: Multiply Unsigned Word
+// To multiply 32-bit unsigned integers.
+//
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> {
+  let isCommutable = 1;
+  let hasSideEffects = 0;
+  let Defs = [HI0, LO0];
+}
+
+//
+// Format: NEG rx, ry MIPS16e
+// Purpose: Negate
+// To negate an integer value.
+//
+def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIM16Alu>;
+
+//
+// Format: NOT rx, ry MIPS16e
+// Purpose: Not
+// To complement an integer value
+//
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>;
+
+//
+// Format: OR rx, ry MIPS16e
+// Purpose: Or
+// To do a bitwise logical OR.
+//
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>;
+
+//
+// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
+// (All args are optional) MIPS16e
+// Purpose: Restore Registers and Deallocate Stack Frame
+// To deallocate a stack frame before exit from a subroutine,
+// restoring return address and static registers, and adjusting
+// stack
+//
+
+def Restore16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_RESTORE >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+
+def RestoreX16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_RESTORE >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
+// MIPS16e
+// Purpose: Save Registers and Set Up Stack Frame
+// To set up a stack frame on entry to a subroutine,
+// saving return address and static registers, and adjusting stack
+//
+def Save16: 
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_SAVE >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [SP];
+  let Defs = [SP];
+}
+
+def SaveX16:
+  FI8_SVRS16<0b1, (outs), (ins variable_ops),
+             "", [], II_SAVE >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [SP];
+  let Defs = [SP];
+}
+//
+// Format: SB ry, offset(rx) MIPS16e
+// Purpose: Store Byte (Extended)
+// To store a byte to memory.
+//
+def SbRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, II_SB>, MayStore;
+
+//
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+  : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant word in register rx.
+//
+def SehRx16
+  : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>;
+
+//
+// The Sel(T) instructions are pseudos
+// T means that they use T8 implicitly.
+//
+//
+// Format: SelBeqZ rd, rs, rt
+// Purpose: if rt==0, do nothing
+//          else rs = rt
+//
+def SelBeqZ: Sel<"beqz">;
+
+//
+// Format:  SelTBteqZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmp: SelT<"bteqz", "cmp">;
+
+//
+// Format:  SelTBteqZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmpi: SeliT<"bteqz", "cmpi">;
+
+//
+// Format:  SelTBteqZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlt: SelT<"bteqz", "slt">;
+
+//
+// Format:  SelTBteqZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlti: SeliT<"bteqz", "slti">;
+
+//
+// Format:  SelTBteqZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltu: SelT<"bteqz", "sltu">;
+
+//
+// Format:  SelTBteqZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltiu: SeliT<"bteqz", "sltiu">;
+
+//
+// Format: SelBnez rd, rs, rt
+// Purpose: if rt!=0, do nothing
+//          else rs = rt
+//
+def SelBneZ: Sel<"bnez">;
+
+//
+// Format:  SelTBtneZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b!=0 then do nothing.
+//          if b0=0 then rd = rs
+//
+def SelTBtneZCmp: SelT<"btnez", "cmp">;
+
+//
+// Format:  SelTBtnezCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZCmpi: SeliT<"btnez", "cmpi">;
+
+//
+// Format:  SelTBtneZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlt: SelT<"btnez", "slt">;
+
+//
+// Format:  SelTBtneZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlti: SeliT<"btnez", "slti">;
+
+//
+// Format:  SelTBtneZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltu: SelT<"btnez", "sltu">;
+
+//
+// Format:  SelTBtneZSltiu rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltiu: SeliT<"btnez", "sltiu">;
+//
+//
+// Format: SH ry, offset(rx) MIPS16e
+// Purpose: Store Halfword (Extended)
+// To store a halfword to memory.
+//
+def ShRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, II_SH>, MayStore;
+
+//
+// Format: SLL rx, ry, sa MIPS16e
+// Purpose: Shift Word Left Logical (Extended)
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
+//
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>;
+
+//
+// Format: SLLV ry, rx MIPS16e
+// Purpose: Shift Word Left Logical Variable
+// To execute a left-shift of a word by a variable number of bits.
+//
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>;
+
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> {
+  let Defs = [T8];
+}
+
+def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">;
+
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> {
+  let Defs = [T8];
+}
+//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">;
+
+//
+// Format: SLT rx, ry MIPS16e
+// Purpose: Set on Less Than
+// To record the result of a less-than comparison.
+//
+def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{
+  let Defs = [T8];
+}
+
+def SltCCRxRy16: FCCRR16_ins<"slt">;
+
+// Format: SLTU rx, ry MIPS16e
+// Purpose: Set on Less Than Unsigned
+// To record the result of an unsigned less-than comparison.
+//
+def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{
+  let Defs = [T8];
+}
+
+def SltuRxRyRz16: FRRTR16_ins<"sltu"> {
+  let isCodeGenOnly=1;
+  let Defs = [T8];
+}
+
+
+def SltuCCRxRy16: FCCRR16_ins<"sltu">;
+//
+// Format: SRAV ry, rx MIPS16e
+// Purpose: Shift Word Right Arithmetic Variable
+// To execute an arithmetic right-shift of a word by a variable
+// number of bits.
+//
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>;
+
+
+//
+// Format: SRA rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Arithmetic (Extended)
+// To execute an arithmetic right-shift of a word by a fixed
+// number of bits-1 to 8 bits.
+//
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>;
+
+
+//
+// Format: SRLV ry, rx MIPS16e
+// Purpose: Shift Word Right Logical Variable
+// To execute a logical right-shift of a word by a variable
+// number of bits.
+//
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>;
+
+
+//
+// Format: SRL rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Logical (Extended)
+// To execute a logical right-shift of a word by a fixed
+// number of bits-1 to 31 bits.
+//
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>;
+
+//
+// Format: SUBU rz, rx, ry MIPS16e
+// Purpose: Subtract Unsigned Word
+// To subtract 32-bit integers
+//
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>;
+
+//
+// Format: SW ry, offset(rx) MIPS16e
+// Purpose: Store Word (Extended)
+// To store a word to memory.
+//
+def SwRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, II_SW>, MayStore;
+
+//
+// Format: SW rx, offset(sp) MIPS16e
+// Purpose: Store Word rx (SP-Relative)
+// To store an SP-relative word to memory.
+//
+def SwRxSpImmX16: FEXT_RRI16_mem2_ins<0b11010, "sw", mem16sp, II_SW>, MayStore;
+
+//
+//
+// Format: XOR rx, ry MIPS16e
+// Purpose: Xor
+// To do a bitwise logical XOR.
+//
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>;
+
+class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
+  let Predicates = [InMips16Mode];
+}
+
+// Unary Arith/Logic
+//
+class ArithLogicU_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r),
+            (I CPU16Regs:$r)>;
+
+def: ArithLogicU_pat<not, NotRxRy16>;
+def: ArithLogicU_pat<ineg, NegRxRy16>;
+
+class ArithLogic16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$l, CPU16Regs:$r),
+            (I CPU16Regs:$l, CPU16Regs:$r)>;
+
+def: ArithLogic16_pat<add, AdduRxRyRz16>;
+def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<mul, MultRxRyRz16>;
+def: ArithLogic16_pat<or, OrRxRxRy16>;
+def: ArithLogic16_pat<sub, SubuRxRyRz16>;
+def: ArithLogic16_pat<xor, XorRxRxRy16>;
+
+// Arithmetic and logical instructions with 2 register operands.
+
+class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm),
+            (I CPU16Regs:$in, imm_type:$imm)>;
+
+def: ArithLogicI16_pat<add, immSExt8, AddiuRxRxImm16>;
+def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>;
+def: ArithLogicI16_pat<shl, immZExt5, SllX16>;
+def: ArithLogicI16_pat<srl, immZExt5, SrlX16>;
+def: ArithLogicI16_pat<sra, immZExt5, SraX16>;
+
+class shift_rotate_reg16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r, CPU16Regs:$ra),
+            (I CPU16Regs:$r, CPU16Regs:$ra)>;
+
+def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
+def: shift_rotate_reg16_pat<sra, SravRxRy16>;
+def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
+
+class LoadM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+  Mips16Pat<(OpNode Addr:$addr), (I Addr:$addr)>;
+
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16, addr16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16, addr16>;
+def: LoadM16_pat<load, LwRxSpImmX16, addr16sp>;
+
+class StoreM16_pat<PatFrag OpNode, Instruction I, ComplexPattern Addr> :
+  Mips16Pat<(OpNode CPU16Regs:$r, Addr:$addr), (I CPU16Regs:$r, Addr:$addr)>;
+
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16, addr16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16, addr16>;
+def: StoreM16_pat<store, SwRxSpImmX16, addr16sp>;
+
+// Unconditional branch
+class UncondBranch16_pat<SDNode OpNode, Instruction I>:
+  Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> {
+    let Predicates = [InMips16Mode];
+  }
+
+def : Mips16Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+                (Jal16 tglobaladdr:$dst)>;
+
+def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+                (Jal16 texternalsym:$dst)>;
+
+// Indirect branch
+def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> {
+  // Ensure that the addition of MIPS32r6/MIPS64r6 support does not change
+  // MIPS16's behaviour.
+  let AddedComplexity = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=0 in
+def JumpLinkReg16:
+  FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
+              "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> {
+  let Defs = [RA];
+}
+
+// Mips16 pseudos
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
+  hasExtraSrcRegAllocReq = 1 in
+def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+
+// setcc patterns
+
+class SetCC_R16<PatFrag cond_op, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, CPU16Regs:$ry),
+            (I CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, imm_type:$imm16),
+            (I CPU16Regs:$rx, imm_type:$imm16)>;
+
+
+def: Mips16Pat<(i32 addr16sp:$addr), (AddiuRxRyOffMemX16 addr16sp:$addr)>;
+
+
+// Large (>16 bit) immediate loads
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
+
+// Carry MipsPatterns
+def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc  CPU16Regs:$src, immSExt16:$imm),
+                (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
+
+//
+// Some branch conditional patterns are not generated by llvm at this time.
+// Some are for seemingly arbitrary reasons not used: i.e. with signed number
+// comparison they are used and for unsigned a different pattern is used.
+// I am pushing upstream from the full mips16 port and it seemed that I needed
+// these earlier and the mips32 port has these but now I cannot create test
+// cases that use these patterns. While I sort this all out I will leave these
+// extra patterns commented out and if I can be sure they are really not used,
+// I will delete the code. I don't want to check the code in uncommented without
+// a valid test case. In some cases, the compiler is generating patterns with
+// setcc instead and earlier I had implemented setcc first so may have masked
+// the problem. The setcc variants are suboptimal for mips16 so I may wantto
+// figure out how to enable the brcond patterns or else possibly new
+// combinations of of brcond and setcc.
+//
+//
+// bcond-seteq
+//
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
+   (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// bcond-setgt (do we need to have this pair of setlt, setgt??)
+//
+def: Mips16Pat
+  <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setge
+//
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+//
+// never called because compiler transforms a >= k to a > (k-1)
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setlt
+//
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setle
+//
+def: Mips16Pat
+  <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setne
+//
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// This needs to be there but I forget which code will generate it
+//
+def: Mips16Pat
+  <(brcond CPU16Regs:$rx, bb:$targ16),
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+
+//
+// bcond-setugt
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+//  >;
+
+//
+// bcond-setuge
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+
+//
+// bcond-setult
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+def: UncondBranch16_pat<br, Bimm16>;
+
+// Small immediates
+def: Mips16Pat<(i32 immSExt16:$in),
+               (AddiuRxRxImmX16 (MoveR3216 ZERO), immSExt16:$in)>;
+
+def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+
+//
+// MipsDivRem
+//
+def: Mips16Pat
+  <(MipsDivRem16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//
+// MipsDivRemU
+//
+def: Mips16Pat
+  <(MipsDivRemU16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivuRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//  signed a,b
+//  x = (a>=b)?x:y
+//
+//  if !(a < b) x = y
+//
+def : Mips16Pat<(select (i32 (setge CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  signed a,b
+//  x = (a>b)?x:y
+//
+//  if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setgt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// unsigned a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setuge CPU16Regs:$a, CPU16Regs:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  unsigned a,b
+//  x = (a>b)?x:y
+//
+//  if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setugt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// signed
+// x = (a >= k)?x:y
+// due to an llvm optimization, i don't think that this will ever
+// be used. This is transformed into x = (a > k-1)?x:y
+//
+//
+
+//def : Mips16Pat<
+//  (select (i32 (setge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSlti CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+//def : Mips16Pat<
+//  (select (i32 (setuge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSltiu CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+// signed
+// x = (a < k)?x:y
+//
+// if !(a < k) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setlt CPU16Regs:$a, immSExt16:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBtneZSlti CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, immSExt16:$b)>;
+
+
+//
+//
+// signed
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// unnsigned
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setule CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == b)? x : y
+//
+// if (a != b) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == 0)? x : y
+//
+// if (a != 0) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBeqZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a == k)? x : y
+//
+// if (a != k) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+
+//
+// signed/unsigned
+// x = (a != b)? x : y
+//
+// if (a == b) x = y
+//
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a != 0)? x : y
+//
+// if (a == 0) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+// signed/unsigned
+// x = (a)? x : y
+//
+// if (!a) x = y
+//
+def : Mips16Pat<(select  CPU16Regs:$a,
+                 CPU16Regs:$x, CPU16Regs:$y),
+      (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+       CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a != k)? x : y
+//
+// if (a == k) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+//
+// When writing C code to test setxx these patterns,
+// some will be transformed into
+// other things. So we test using C code but using -O3 and -O0
+//
+// seteq
+//
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltiuCCRxImmX16 (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs), 1)>;
+
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs, 0),
+   (SltiuCCRxImmX16 CPU16Regs:$lhs, 1)>;
+
+
+//
+// setge
+//
+
+def: Mips16Pat
+  <(setge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+//
+// For constants, llvm transforms this to:
+// x > (k - 1) and then reverses the operands to use setlt. So this pattern
+// is not used now by the compiler. (Presumably checking that k-1 does not
+// overflow). The compiler never uses this at the current time, due to
+// other optimizations.
+//
+//def: Mips16Pat
+//  <(setge CPU16Regs:$lhs, immSExt16:$rhs),
+//   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, immSExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+// This catches the x >= -32768 case by transforming it to  x > -32769
+//
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, -32769),
+   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, -32768),
+   (LiRxImmX16 1))>;
+
+//
+// setgt
+//
+//
+
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setle
+//
+def: Mips16Pat
+  <(setle CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImm16 1))>;
+
+//
+// setlt
+//
+def: SetCC_R16<setlt, SltCCRxRy16>;
+
+def: SetCC_I16<setlt, immSExt16, SltiCCRxImmX16>;
+
+//
+// setne
+//
+def : Mips16Pat
+  <(setne CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltuCCRxRy16 (LiRxImmX16 0),
+   (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs))>;
+
+
+//
+// setuge
+//
+def: Mips16Pat
+  <(setuge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+// this pattern will never be used because the compiler will transform
+// x >= k to x > (k - 1) and then use SLT
+//
+//def: Mips16Pat
+//  <(setuge CPU16Regs:$lhs, immZExt16:$rhs),
+//   (XorRxRxRy16 (SltiuCCRxImmX16 CPU16Regs:$lhs, immZExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+//
+// setugt
+//
+def: Mips16Pat
+  <(setugt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setule
+//
+def: Mips16Pat
+  <(setule CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setult
+//
+def: SetCC_R16<setult, SltuCCRxRy16>;
+
+def: SetCC_I16<setult, immSExt16, SltiuCCRxImmX16>;
+
+def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
+               (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
+
+// hi/lo relocs
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+                (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaladdr:$in),
+                (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
+def : Mips16Pat<(MipsHi tjumptable:$in),
+                (SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
+                (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
+// wrapper_pic
+class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+  Mips16Pat<(MipsWrapper RC:$gp, node:$in),
+            (ADDiuOp RC:$gp, node:$in)>;
+
+
+def : Wrapper16Pat<tglobaladdr, AddiuRxRxImmX16, CPU16Regs>;
+def : Wrapper16Pat<tglobaltlsaddr, AddiuRxRxImmX16, CPU16Regs>;
+
+def : Mips16Pat<(i32 (extloadi8   addr16:$src)),
+                (LbuRxRyOffMemX16  addr16:$src)>;
+def : Mips16Pat<(i32 (extloadi16  addr16:$src)),
+                (LhuRxRyOffMemX16  addr16:$src)>;
+
+def: Mips16Pat<(trap), (Break16)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+                (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+                (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:   
+  MipsPseudo16<
+    (outs CPU16Regs:$rh, CPU16Regs:$rl),
+    (ins simm16:$immHi, simm16:$immLo),
+    "li\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function.  The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let hasSideEffects = 0, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                      i32imm:$size), "foo", []>;
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
new file mode 100644
index 000000000000..44771cbe8be1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -0,0 +1,148 @@
+//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16RegisterInfo.h"
+#include "Mips.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips16-registerinfo"
+
+Mips16RegisterInfo::Mips16RegisterInfo() : MipsRegisterInfo() {}
+
+bool Mips16RegisterInfo::requiresRegisterScavenging
+  (const MachineFunction &MF) const {
+  return false;
+}
+bool Mips16RegisterInfo::requiresFrameIndexScavenging
+  (const MachineFunction &MF) const {
+  return false;
+}
+
+bool Mips16RegisterInfo::useFPForScavengingIndex
+  (const MachineFunction &MF) const {
+  return false;
+}
+
+bool Mips16RegisterInfo::saveScavengerRegister
+  (MachineBasicBlock &MBB,
+   MachineBasicBlock::iterator I,
+   MachineBasicBlock::iterator &UseMI,
+   const TargetRegisterClass *RC,
+   unsigned Reg) const {
+  DebugLoc DL;
+  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
+  TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
+  TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
+  return true;
+}
+
+const TargetRegisterClass *
+Mips16RegisterInfo::intRegClass(unsigned Size) const {
+  assert(Size == 4);
+  return &Mips::CPU16RegsRegClass;
+}
+
+void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  // The following stack frame objects are always
+  // referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
+    FrameReg = Mips::SP;
+  else {
+    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+    if (TFI->hasFP(MF)) {
+      FrameReg = Mips::S0;
+    }
+    else {
+      if ((MI.getNumOperands()> OpNo+2) && MI.getOperand(OpNo+2).isReg())
+        FrameReg = MI.getOperand(OpNo+2).getReg();
+      else
+        FrameReg = Mips::SP;
+    }
+  }
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object
+  //   is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following,
+  //   its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  int64_t Offset;
+  bool IsKill = false;
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  if (!MI.isDebugValue() &&
+      !Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    DebugLoc DL = II->getDebugLoc();
+    unsigned NewImm;
+    const Mips16InstrInfo &TII =
+        *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
+    FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
+    Offset = SignExtend64<16>(NewImm);
+    IsKill = true;
+  }
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
new file mode 100644
index 000000000000..d67a79b64033
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -0,0 +1,48 @@
+//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class Mips16InstrInfo;
+
+class Mips16RegisterInfo : public MipsRegisterInfo {
+public:
+  Mips16RegisterInfo();
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     MachineBasicBlock::iterator &UseMI,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Reg) const override;
+
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
+
+private:
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
new file mode 100644
index 000000000000..516caa34fbf2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -0,0 +1,578 @@
+//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class R6MMR6Rel;
+
+def MipsR62MicroMipsR6 : InstrMapping {
+  let FilterClass = "R6MMR6Rel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["Arch"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = ["mipsr6"];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["mipsr6"], ["micromipsr6"]];
+}
+
+class MipsR6Arch<string opstr> {
+  string Arch = "mipsr6";
+  string BaseOpcode = opstr;
+}
+
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+                   PredicateControl {
+  let DecoderNamespace = "Mips32r6_64r6";
+  let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+class OPGROUP<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPGROUP_COP0     : OPGROUP<0b010000>;
+def OPGROUP_COP1     : OPGROUP<0b010001>;
+def OPGROUP_COP2     : OPGROUP<0b010010>;
+def OPGROUP_ADDI     : OPGROUP<0b001000>;
+def OPGROUP_AUI      : OPGROUP<0b001111>;
+def OPGROUP_BLEZ     : OPGROUP<0b000110>;
+def OPGROUP_BGTZ     : OPGROUP<0b000111>;
+def OPGROUP_BLEZL    : OPGROUP<0b010110>;
+def OPGROUP_BGTZL    : OPGROUP<0b010111>;
+def OPGROUP_DADDI    : OPGROUP<0b011000>;
+def OPGROUP_DAUI     : OPGROUP<0b011101>;
+def OPGROUP_PCREL    : OPGROUP<0b111011>;
+def OPGROUP_REGIMM   : OPGROUP<0b000001>;
+def OPGROUP_SPECIAL  : OPGROUP<0b000000>;
+// The spec occasionally names this value LL, LLD, SC, or SCD.
+def OPGROUP_SPECIAL3 : OPGROUP<0b011111>;
+// The spec names this constant LWC2, LDC2, SWC2, and SDC2 in different places.
+def OPGROUP_COP2LDST : OPGROUP<0b010010>;
+
+class OPCODE2<bits<2> Val> {
+  bits<2> Value = Val;
+}
+def OPCODE2_ADDIUPC : OPCODE2<0b00>;
+def OPCODE2_LWPC    : OPCODE2<0b01>;
+def OPCODE2_LWUPC   : OPCODE2<0b10>;
+
+class OPCODE3<bits<3> Val> {
+  bits<3> Value = Val;
+}
+def OPCODE3_LDPC : OPCODE3<0b110>;
+
+class OPCODE5<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def OPCODE5_ALUIPC : OPCODE5<0b11111>;
+def OPCODE5_AUIPC  : OPCODE5<0b11110>;
+def OPCODE5_DAHI : OPCODE5<0b00110>;
+def OPCODE5_DATI : OPCODE5<0b11110>;
+def OPCODE5_BC1EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
+def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
+def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE5_LDC2   : OPCODE5<0b01110>;
+def OPCODE5_LWC2   : OPCODE5<0b01010>;
+def OPCODE5_SDC2   : OPCODE5<0b01111>;
+def OPCODE5_SWC2   : OPCODE5<0b01011>;
+
+class OPCODE6<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPCODE6_ALIGN    : OPCODE6<0b100000>;
+def OPCODE6_DALIGN   : OPCODE6<0b100100>;
+def OPCODE6_BITSWAP  : OPCODE6<0b100000>;
+def OPCODE6_DBITSWAP : OPCODE6<0b100100>;
+def OPCODE6_JALR     : OPCODE6<0b001001>;
+def OPCODE6_CACHE    : OPCODE6<0b100101>;
+def OPCODE6_PREF     : OPCODE6<0b110101>;
+// The next four constants are unnamed in the spec. These names are taken from
+// the OPGROUP names they are used with.
+def OPCODE6_LL       : OPCODE6<0b110110>;
+def OPCODE6_LLD      : OPCODE6<0b110111>;
+def OPCODE6_SC       : OPCODE6<0b100110>;
+def OPCODE6_SCD      : OPCODE6<0b100111>;
+def OPCODE6_CLO      : OPCODE6<0b010001>;
+def OPCODE6_CLZ      : OPCODE6<0b010000>;
+def OPCODE6_DCLO     : OPCODE6<0b010011>;
+def OPCODE6_DCLZ     : OPCODE6<0b010010>;
+def OPCODE6_LSA      : OPCODE6<0b000101>;
+def OPCODE6_DLSA     : OPCODE6<0b010101>;
+def OPCODE6_SDBBP    : OPCODE6<0b001110>;
+
+class FIELD_FMT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_FMT_S : FIELD_FMT<0b10000>;
+def FIELD_FMT_D : FIELD_FMT<0b10001>;
+
+class FIELD_CMP_COND<bits<5> Val> {
+  bits<5> Value = Val;
+}
+// Note: The CMP_COND_FMT names differ from the C_COND_FMT names.
+def FIELD_CMP_COND_AF   : FIELD_CMP_COND<0b00000>;
+def FIELD_CMP_COND_UN   : FIELD_CMP_COND<0b00001>;
+def FIELD_CMP_COND_EQ   : FIELD_CMP_COND<0b00010>;
+def FIELD_CMP_COND_UEQ  : FIELD_CMP_COND<0b00011>;
+def FIELD_CMP_COND_LT   : FIELD_CMP_COND<0b00100>;
+def FIELD_CMP_COND_ULT  : FIELD_CMP_COND<0b00101>;
+def FIELD_CMP_COND_LE   : FIELD_CMP_COND<0b00110>;
+def FIELD_CMP_COND_ULE  : FIELD_CMP_COND<0b00111>;
+def FIELD_CMP_COND_SAF  : FIELD_CMP_COND<0b01000>;
+def FIELD_CMP_COND_SUN  : FIELD_CMP_COND<0b01001>;
+def FIELD_CMP_COND_SEQ  : FIELD_CMP_COND<0b01010>;
+def FIELD_CMP_COND_SUEQ : FIELD_CMP_COND<0b01011>;
+def FIELD_CMP_COND_SLT  : FIELD_CMP_COND<0b01100>;
+def FIELD_CMP_COND_SULT : FIELD_CMP_COND<0b01101>;
+def FIELD_CMP_COND_SLE  : FIELD_CMP_COND<0b01110>;
+def FIELD_CMP_COND_SULE : FIELD_CMP_COND<0b01111>;
+
+class FIELD_CMP_FORMAT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>;
+def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+class DecodeDisambiguates<string Name> {
+  string DecoderMethod = !strconcat("Decode", Name);
+}
+
+class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+  string DecoderNamespace = "Mips32r6_64r6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class AUI_FM : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_AUI.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = imm;
+}
+
+class DAUI_FM : AUI_FM {
+  let Inst{31-26} = OPGROUP_DAUI.Value;
+}
+
+class BAL_FM : MipsR6Inst {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = OPCODE5_BGEZAL.Value;
+  let Inst{15-0} = offset;
+}
+
+class COP0_EVP_DVP_FM<bits<1> sc> : MipsR6Inst {
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP0.Value;
+  let Inst{25-21} = 0b01011;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0b00000;
+  let Inst{10-6}  = 0b00000;
+  let Inst{5}     = sc;
+  let Inst{4-3}   = 0b00;
+  let Inst{2-0}   = 0b100;
+}
+
+class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ft;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-0} = offset;
+}
+
+class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ct;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP2.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ct;
+  let Inst{15-0} = offset;
+}
+
+class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<19> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-19} = Operation.Value;
+  let Inst{18-0} = imm;
+}
+
+class PCREL18_FM<OPCODE3 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<18> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-18} = Operation.Value;
+  let Inst{17-0} = imm;
+}
+
+class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0b00000;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_MEM_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = base;
+  let Inst{20-16} = hint;
+  let Inst{15-7}  = offset;
+  let Inst{6}     = 0;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0b00001;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = mulop;
+  let Inst{5-0}   = funct;
+}
+
+class SPECIAL_SDBBP_FM : MipsR6Inst {
+  bits<20> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-6}  = code_;
+  let Inst{5-0}   = OPCODE6_SDBBP.Value;
+}
+
+// This class is ambiguous with other branches:
+//   BEQC/BNEC require that rs < rt && rs != 0
+class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLEZC/BGEZC/BEQZALC/BNEZALC/BGTZALC require that rs == 0 && rt != 0
+// The '1R_RT' in the name means 1 register in the rt field.
+class CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLTZC/BGTZC/BLTZALC/BGEZALC require that rs == rt && rt != 0
+// The '1R_BOTH' in the name means 1 register in both the rs and rt fields.
+class CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class CMP_BRANCH_OFF21_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rs; // rs != 0
+  bits<21> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rs;
+  let Inst{20-0} = offset;
+}
+
+class JMP_IDX_COMPACT_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class BRANCH_OFF26_FM<bits<6> funct> : MipsR6Inst {
+  bits<32> Inst;
+  bits<26> offset;
+
+  let Inst{31-26} = funct;
+  let Inst{25-0} = offset;
+}
+
+class SPECIAL3_ALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = 0b010;
+  let Inst{7-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<3> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b01;
+  let Inst{8-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_LL_SC_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = base;
+  let Inst{20-16} = rt;
+  let Inst{15-7} = offset;
+  let Inst{5-0} = Operation.Value;
+
+  string DecoderMethod = "DecodeSpecial3LlSc";
+}
+
+class SPECIAL_LSA_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> imm2;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = 0b000;
+  let Inst{7-6}   = imm2;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format,
+                        FIELD_CMP_COND Cond> : MipsR6Inst {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = Cond.Value;
+}
+
+class JR_HB_R6_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = 0;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = Operation.Value;
+}
+
+class COP2LDST_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rt;
+  bits<21> addr;
+  bits<5> base = addr{20-16};
+  bits<11> offset = addr{10-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP2LDST.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = base;
+  let Inst{10-0}  = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
new file mode 100644
index 000000000000..1b4d73b79895
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -0,0 +1,1004 @@
+//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+include "Mips32r6InstrFormats.td"
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: jr -> jalr
+// Reencoded: jr.hb -> jalr.hb
+
+def brtarget21 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget21OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget21";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget26 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget26OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget26";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def jmpoffset16 : Operand<OtherVT> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def calloffset16 : Operand<iPTR> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class ADDIUPC_ENC : PCREL19_FM<OPCODE2_ADDIUPC>;
+class ALIGN_ENC  : SPECIAL3_ALIGN_FM<OPCODE6_ALIGN>;
+class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>;
+class AUI_ENC    : AUI_FM;
+class AUIPC_ENC  : PCREL16_FM<OPCODE5_AUIPC>;
+
+class BAL_ENC   : BAL_FM;
+class BALC_ENC  : BRANCH_OFF26_FM<0b111010>;
+class BC_ENC    : BRANCH_OFF26_FM<0b110010>;
+class BEQC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                  DecodeDisambiguates<"AddiGroupBranch">;
+class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_ADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BNEC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                  DecodeDisambiguates<"DaddiGroupBranch">;
+class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+
+class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguates<"BgtzlGroupBranch">;
+class BGEC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BGEUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BLEZ>,
+                  DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguates<"BlezlGroupBranch">;
+class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLTC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZL>,
+                 DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+class BLTUC_ENC : CMP_BRANCH_2R_OFF16_FM<OPGROUP_BGTZ>,
+                  DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguates<"BgtzGroupBranch">;
+class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+
+class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>;
+class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>,
+                    DecodeDisambiguates<"BlezGroupBranch">;
+class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>;
+
+class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>;
+class BC1NEZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1NEZ>;
+class BC2EQZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2EQZ>;
+class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>;
+
+class DVP_ENC : COP0_EVP_DVP_FM<0b1>;
+class EVP_ENC : COP0_EVP_DVP_FM<0b0>;
+
+class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>;
+class JIC_ENC   : JMP_IDX_COMPACT_FM<0b110110>;
+class JR_HB_R6_ENC : JR_HB_R6_FM<OPCODE6_JALR>;
+class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>;
+class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>,
+                    DecodeDisambiguatedBy<"BlezGroupBranch">;
+class BNVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                   DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BOVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                   DecodeDisambiguatedBy<"AddiGroupBranch">;
+class DIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011010>;
+class DIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011011>;
+class MOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011010>;
+class MODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011011>;
+class MUH_ENC    : SPECIAL_3R_FM<0b00011, 0b011000>;
+class MUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b011001>;
+class MUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011000>;
+class MULU_ENC   : SPECIAL_3R_FM<0b00010, 0b011001>;
+
+class MADDF_S_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_S>;
+class MADDF_D_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_D>;
+class MSUBF_S_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_S>;
+class MSUBF_D_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_D>;
+
+class SEL_D_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_D>;
+class SEL_S_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_S>;
+
+class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>;
+class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
+
+class LWPC_ENC   : PCREL19_FM<OPCODE2_LWPC>;
+class LWUPC_ENC  : PCREL19_FM<OPCODE2_LWUPC>;
+
+class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
+class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
+
+class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
+class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+
+class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
+class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
+class SELNEZ_S_ENC : COP1_3R_FM<0b010111, FIELD_FMT_S>;
+class SELNEZ_D_ENC : COP1_3R_FM<0b010111, FIELD_FMT_D>;
+
+class RINT_S_ENC : COP1_2R_FM<0b011010, FIELD_FMT_S>;
+class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>;
+class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>;
+class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>;
+
+class CACHE_ENC : SPECIAL3_MEM_FM<OPCODE6_CACHE>;
+class PREF_ENC : SPECIAL3_MEM_FM<OPCODE6_PREF>;
+
+class LDC2_R6_ENC : COP2LDST_FM<OPCODE5_LDC2>;
+class LWC2_R6_ENC : COP2LDST_FM<OPCODE5_LWC2>;
+class SDC2_R6_ENC : COP2LDST_FM<OPCODE5_SDC2>;
+class SWC2_R6_ENC : COP2LDST_FM<OPCODE5_SWC2>;
+
+class LSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_LSA>;
+
+class LL_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LL>;
+class SC_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SC>;
+
+class CLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLO>;
+class CLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_CLZ>;
+
+class SDBBP_R6_ENC : SPECIAL_SDBBP_FM;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Multiclasses
+//
+//===----------------------------------------------------------------------===//
+
+class CMP_CONDN_DESC_BASE<string CondStr, string Typestr,
+                          RegisterOperand FGROpnd,
+                          InstrItinClass Itin,
+                          SDPatternOperator Op = null_frag> {
+  dag OutOperandList = (outs FGRCCOpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [(set FGRCCOpnd:$fd, (Op FGROpnd:$fs, FGROpnd:$ft))];
+  bit isCTI = 1;
+  InstrItinClass Itinerary = Itin;
+}
+
+multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
+                     RegisterOperand FGROpnd, InstrItinClass Itin>{
+  let AdditionalPredicates = [NotInMicroMips] in {
+    def CMP_F_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>,
+                      CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd, Itin>,
+                      MipsR6Arch<!strconcat("cmp.af.", Typestr)>,
+                      ISA_MIPS32R6, HARDFLOAT;
+    def CMP_UN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+                       CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, Itin, setuo>,
+                       MipsR6Arch<!strconcat("cmp.un.", Typestr)>,
+                       ISA_MIPS32R6, HARDFLOAT;
+    def CMP_EQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+                       CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, Itin,
+                                           setoeq>,
+                       MipsR6Arch<!strconcat("cmp.eq.", Typestr)>,
+                       ISA_MIPS32R6, HARDFLOAT;
+    def CMP_UEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_UEQ>,
+                        CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, Itin,
+                                            setueq>,
+                        MipsR6Arch<!strconcat("cmp.ueq.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_LT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+                       CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, Itin,
+                                           setolt>,
+                       MipsR6Arch<!strconcat("cmp.lt.", Typestr)>,
+                       ISA_MIPS32R6, HARDFLOAT;
+    def CMP_ULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_ULT>,
+                        CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, Itin,
+                                            setult>,
+                        MipsR6Arch<!strconcat("cmp.ult.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_LE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+                       CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, Itin,
+                                           setole>,
+                       MipsR6Arch<!strconcat("cmp.le.", Typestr)>,
+                       ISA_MIPS32R6, HARDFLOAT;
+    def CMP_ULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_ULE>,
+                        CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, Itin,
+                                            setule>,
+                        MipsR6Arch<!strconcat("cmp.ule.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SAF_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_SAF>,
+                        CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd, Itin>,
+                        MipsR6Arch<!strconcat("cmp.saf.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SUN_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_SUN>,
+                        CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd, Itin>,
+                        MipsR6Arch<!strconcat("cmp.sun.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_SEQ>,
+                        CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd, Itin>,
+                        MipsR6Arch<!strconcat("cmp.seq.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SUEQ_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                      FIELD_CMP_COND_SUEQ>,
+                         CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd, Itin>,
+                         MipsR6Arch<!strconcat("cmp.sueq.", Typestr)>,
+                         ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SLT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_SLT>,
+                        CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd, Itin>,
+                        MipsR6Arch<!strconcat("cmp.slt.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SULT_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                      FIELD_CMP_COND_SULT>,
+                         CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd, Itin>,
+                         MipsR6Arch<!strconcat("cmp.sult.", Typestr)>,
+                         ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SLE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                     FIELD_CMP_COND_SLE>,
+                        CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd, Itin>,
+                        MipsR6Arch<!strconcat("cmp.sle.", Typestr)>,
+                        ISA_MIPS32R6, HARDFLOAT;
+    def CMP_SULE_#NAME : R6MMR6Rel, COP1_CMP_CONDN_FM<Format,
+                                                      FIELD_CMP_COND_SULE>,
+                         CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd, Itin>,
+                         MipsR6Arch<!strconcat("cmp.sule.", Typestr)>,
+                         ISA_MIPS32R6, HARDFLOAT;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class PCREL_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd, InstrItinClass itin>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins ImmOpnd:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class ADDIUPC_DESC : PCREL_DESC_BASE<"addiupc", GPR32Opnd, simm19_lsl2,
+                                     II_ADDIUPC>;
+class LWPC_DESC: PCREL_DESC_BASE<"lwpc", GPR32Opnd, simm19_lsl2, II_LWPC>;
+class LWUPC_DESC: PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>;
+
+class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd, InstrItinClass itin>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2, II_ALIGN>;
+
+class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       InstrItinClass itin = NoItinerary>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd, II_ALUIPC>;
+class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
+
+class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                    InstrItinClass itin = NoItinerary>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd, II_AUI>;
+
+class BRANCH_DESC_BASE {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+  bit isCTI = 1;
+}
+
+class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE,
+    MipsR6Arch<instr_asm> {
+  dag InOperandList = (ins opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  bit isBarrier = 1;
+  InstrItinClass Itinerary = II_BC;
+  bit isCTI = 1;
+}
+
+class CMP_BC_DESC_BASE<string instr_asm, DAGOperand opnd,
+                       RegisterOperand GPROpnd> : BRANCH_DESC_BASE,
+                                                  MipsR6Arch<instr_asm> {
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCC;
+  bit hasForbiddenSlot = 1;
+  bit isCTI = 1;
+}
+
+class CMP_CBR_EQNE_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                               RegisterOperand GPROpnd>
+    : BRANCH_DESC_BASE, MipsR6Arch<instr_asm> {
+  dag InOperandList = (ins GPROpnd:$rs, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCZC;
+  bit hasForbiddenSlot = 1;
+  bit isCTI = 1;
+}
+
+class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                             RegisterOperand GPROpnd>
+    : BRANCH_DESC_BASE, MipsR6Arch<instr_asm> {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+  list<Register> Defs = [AT];
+  InstrItinClass Itinerary = II_BCCZC;
+  bit hasForbiddenSlot = 1;
+  bit isCTI = 1;
+}
+
+class BAL_DESC : BC_DESC_BASE<"bal", brtarget> {
+  bit isCall = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [RA];
+  bit isCTI = 1;
+}
+
+class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+  InstrItinClass Itinerary = II_BALC;
+  bit isCTI = 1;
+}
+
+class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
+class BGEC_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR32Opnd>;
+class BGEUC_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR32Opnd>;
+class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>;
+class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>;
+
+class BLTC_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR32Opnd>;
+class BLTUC_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR32Opnd>;
+
+class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>;
+class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>;
+
+class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>;
+class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>;
+
+class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>;
+class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>;
+
+class COP1_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+  InstrItinClass Itinerary = II_BC1CCZ;
+}
+
+class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">;
+class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">;
+
+class COP2_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+  bit isCTI = 1;
+  InstrItinClass Itinerary = II_BC2CCZ;
+}
+
+class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">;
+class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">;
+
+class BOVC_DESC   : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>;
+class BNVC_DESC   : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>;
+
+class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
+                                RegisterOperand GPROpnd,
+                                InstrItinClass itin = NoItinerary>
+    : MipsR6Arch<opstr> {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  string AsmString = !strconcat(opstr, "\t$rt, $offset");
+  list<dag> Pattern = [];
+  bit hasDelaySlot = 0;
+  InstrItinClass Itinerary = itin;
+  bit isCTI = 1;
+  bit isBranch = 1;
+  bit isIndirectBranch = 1;
+}
+
+class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+                                             GPR32Opnd, II_JIALC> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16,
+                                           GPR32Opnd, II_JIALC> {
+  bit isBarrier = 1;
+  bit isTerminator = 1;
+  list<Register> Defs = [AT];
+}
+
+class JR_HB_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+  bit isBranch = 1;
+  bit isIndirectBranch = 1;
+  bit hasDelaySlot = 1;
+  bit isTerminator=1;
+  bit isBarrier=1;
+  bit isCTI = 1;
+  InstrItinClass Itinerary = II_JR_HB;
+}
+
+class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                        InstrItinClass itin>
+    : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd, II_BITSWAP>;
+
+class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       InstrItinClass itin,
+                       SDPatternOperator Op=null_frag>
+    : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+  InstrItinClass Itinerary = itin;
+  // This instruction doesn't trap division by zero itself. We must insert
+  // teq instructions as well.
+  bit usesCustomInserter = 1;
+}
+
+class DVPEVP_DESC_BASE<string instr_asm, InstrItinClass Itin>
+    : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins);
+  string AsmString = !strconcat(instr_asm, "\t$rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = Itin;
+  bit hasUnModeledSideEffects = 1;
+}
+
+class DVP_DESC : DVPEVP_DESC_BASE<"dvp", II_DVP>;
+class EVP_DESC : DVPEVP_DESC_BASE<"evp", II_EVP>;
+
+class DIV_DESC  : DIVMOD_DESC_BASE<"div", GPR32Opnd, II_DIV, sdiv>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd, II_DIVU, udiv>;
+class MOD_DESC  : DIVMOD_DESC_BASE<"mod", GPR32Opnd, II_MOD, srem>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd, II_MODU, urem>;
+
+class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       InstrItinClass itin,
+                       SDPatternOperator Op=null_frag> : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set GPROpnd:$rd, (Op GPROpnd:$rs, GPROpnd:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MUH_DESC    : MUL_R6_DESC_BASE<"muh", GPR32Opnd, II_MUH, mulhs>;
+class MUHU_DESC   : MUL_R6_DESC_BASE<"muhu", GPR32Opnd, II_MUHU, mulhu>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd, II_MUL, mul>;
+class MULU_DESC   : MUL_R6_DESC_BASE<"mulu", GPR32Opnd, II_MULU>;
+
+class COP1_SEL_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+                         InstrItinClass itin> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGRCCOpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [(set FGROpnd:$fd, (select FGRCCOpnd:$fd_in,
+                                                 FGROpnd:$ft,
+                                                 FGROpnd:$fs))];
+  string Constraints = "$fd_in = $fd";
+  InstrItinClass Itinerary = itin;
+}
+
+class SEL_D_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd, II_SEL_D>,
+                   MipsR6Arch<"sel.d"> {
+  // We must insert a SUBREG_TO_REG around $fd_in
+  bit usesCustomInserter = 1;
+}
+class SEL_S_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd, II_SEL_S>,
+                   MipsR6Arch<"sel.s">;
+
+class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd>
+    : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SELCCZ;
+}
+
+class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
+class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
+
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+  string Constraints = "$fd_in = $fd";
+  InstrItinClass Itinerary = itin;
+}
+
+class MADDF_S_DESC  : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd, II_MADDF_S>;
+class MADDF_D_DESC  : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd, II_MADDF_D>;
+class MSUBF_S_DESC  : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd, II_MSUBF_S>;
+class MSUBF_D_DESC  : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd, II_MSUBF_D>;
+
+class MAX_MIN_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+                        InstrItinClass itin> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class MAX_S_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd, II_MAX_S>;
+class MAX_D_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd, II_MAX_D>;
+class MIN_S_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd, II_MIN_S>;
+class MIN_D_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd, II_MIN_D>;
+
+class MAXA_S_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd, II_MAX_S>;
+class MAXA_D_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd, II_MAX_D>;
+class MINA_S_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd, II_MIN_D>;
+class MINA_D_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd, II_MIN_S>;
+
+class SELEQNEZ_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+                         InstrItinClass itin> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd, II_SELCCZ_S>,
+                      MipsR6Arch<"seleqz.s">;
+class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd, II_SELCCZ_D>,
+                      MipsR6Arch<"seleqz.d">;
+class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd, II_SELCCZ_S>,
+                      MipsR6Arch<"selnez.s">;
+class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd, II_SELCCZ_D>,
+                      MipsR6Arch<"selnez.d">;
+
+class CLASS_RINT_DESC_BASE<string instr_asm, RegisterOperand FGROpnd,
+                           InstrItinClass itin> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class RINT_S_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd, II_RINT_S>;
+class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd, II_RINT_D>;
+class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd, II_CLASS_S>;
+class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd, II_CLASS_D>;
+
+class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
+                      RegisterOperand GPROpnd, InstrItinClass itin>
+                     : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
+  string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+  InstrItinClass Itinerary = itin;
+}
+
+class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd, II_CACHE>;
+class PREF_DESC : CACHE_HINT_DESC<"pref", mem_simm9, GPR32Opnd, II_PREF>;
+
+class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd,
+                       InstrItinClass itin> {
+  dag OutOperandList = (outs COPOpnd:$rt);
+  dag InOperandList = (ins mem_simm11:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+  string DecoderMethod = "DecodeFMemCop2R6";
+  InstrItinClass Itinerary = itin;
+}
+
+class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd, II_LDC2>;
+class LWC2_R6_DESC : COP2LD_DESC_BASE<"lwc2", COP2Opnd, II_LWC2>;
+
+class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd,
+                       InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins COPOpnd:$rt, mem_simm11:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayStore = 1;
+  string DecoderMethod = "DecodeFMemCop2R6";
+  InstrItinClass Itinerary = itin;
+}
+
+class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd, II_SDC2>;
+class SWC2_R6_DESC : COP2ST_DESC_BASE<"swc2", COP2Opnd, II_SWC2>;
+
+class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       Operand ImmOpnd, InstrItinClass itin>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$imm2);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $imm2");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1, II_LSA>;
+
+class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand MemOpnd, InstrItinClass itin>
+      : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+  InstrItinClass Itinerary = itin;
+}
+
+class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
+
+class SC_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs GPROpnd:$dst);
+  dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayStore = 1;
+  string Constraints = "$rt = $dst";
+  InstrItinClass Itinerary = itin;
+}
+
+class SC_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
+
+class CLO_CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                           InstrItinClass itin>
+    : MipsR6Arch<instr_asm> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  InstrItinClass Itinerary = itin;
+}
+
+class CLO_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       InstrItinClass itin> :
+    CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
+  list<dag> Pattern = [(set GPROpnd:$rd, (ctlz (not GPROpnd:$rs)))];
+}
+
+class CLZ_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                       InstrItinClass itin> :
+    CLO_CLZ_R6_DESC_BASE<instr_asm, GPROpnd, itin> {
+  list<dag> Pattern = [(set GPROpnd:$rd, (ctlz GPROpnd:$rs))];
+}
+
+class CLO_R6_DESC : CLO_R6_DESC_BASE<"clo", GPR32Opnd, II_CLO>;
+class CLZ_R6_DESC : CLZ_R6_DESC_BASE<"clz", GPR32Opnd, II_CLZ>;
+
+class SDBBP_R6_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm20:$code_);
+  string AsmString = "sdbbp\t$code_";
+  list<dag> Pattern = [];
+  bit isCTI = 1;
+  InstrItinClass Itinerary = II_SDBBP;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def ADDIUPC : R6MMR6Rel, ADDIUPC_ENC, ADDIUPC_DESC, ISA_MIPS32R6;
+def ALIGN : R6MMR6Rel, ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6;
+def ALUIPC : R6MMR6Rel, ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6;
+def AUI : R6MMR6Rel, AUI_ENC, AUI_DESC, ISA_MIPS32R6;
+def AUIPC : R6MMR6Rel, AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
+def BAL : BAL_ENC, BAL_DESC, ISA_MIPS32R6;
+def BALC : R6MMR6Rel, BALC_ENC, BALC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
+  def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+}
+def BC : R6MMR6Rel, BC_ENC, BC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def BEQC : R6MMR6Rel, BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
+  def BEQZALC : R6MMR6Rel, BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
+  def BEQZC : R6MMR6Rel, BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
+  def BGEC : R6MMR6Rel, BGEC_ENC, BGEC_DESC, ISA_MIPS32R6;
+  def BGEUC : R6MMR6Rel, BGEUC_ENC, BGEUC_DESC, ISA_MIPS32R6;
+  def BGEZALC : R6MMR6Rel, BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
+  def BGEZC : R6MMR6Rel, BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
+  def BGTZALC : R6MMR6Rel, BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
+  def BGTZC : R6MMR6Rel, BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
+}
+def BITSWAP : R6MMR6Rel, BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def BLEZALC : R6MMR6Rel, BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
+  def BLEZC : R6MMR6Rel, BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
+  def BLTC : R6MMR6Rel, BLTC_ENC, BLTC_DESC, ISA_MIPS32R6;
+  def BLTUC : R6MMR6Rel, BLTUC_ENC, BLTUC_DESC, ISA_MIPS32R6;
+  def BLTZALC : R6MMR6Rel, BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
+  def BLTZC : R6MMR6Rel, BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
+  def BNEC : R6MMR6Rel, BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
+  def BNEZALC : R6MMR6Rel, BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
+  def BNEZC : R6MMR6Rel, BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
+  def BNVC : R6MMR6Rel, BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
+  def BOVC : R6MMR6Rel, BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+}
+def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+}
+def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6;
+def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6;
+defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd, II_CMP_CC_S>;
+defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd, II_CMP_CC_D>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DIV : R6MMR6Rel, DIV_ENC, DIV_DESC, ISA_MIPS32R6;
+  def DIVU : R6MMR6Rel, DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+}
+
+def DVP : R6MMR6Rel, DVP_ENC, DVP_DESC, ISA_MIPS32R6;
+def EVP : R6MMR6Rel, EVP_ENC, EVP_DESC, ISA_MIPS32R6;
+
+def JIALC : R6MMR6Rel, JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
+def JIC : R6MMR6Rel, JIC_ENC, JIC_DESC, ISA_MIPS32R6;
+def JR_HB_R6 : JR_HB_R6_ENC, JR_HB_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LDC2_R6 : LDC2_R6_ENC, LDC2_R6_DESC, ISA_MIPS32R6;
+  def LL_R6 : LL_R6_ENC, LL_R6_DESC, PTR_32, ISA_MIPS32R6;
+}
+def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6;
+}
+def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LWUPC : R6MMR6Rel, LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
+  def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+
+  def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6;
+  def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6;
+
+  def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+
+  def MUH    : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6;
+  def MUHU   : R6MMR6Rel, MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
+  def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
+  def MULU   : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6;
+}
+def NAL; // BAL with rd=0
+def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SC_R6 : SC_R6_ENC, SC_R6_DESC, PTR_32, ISA_MIPS32R6;
+  def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6;
+  def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32;
+  def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32;
+  def SELEQZ_D : R6MMR6Rel, SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6,
+                 HARDFLOAT;
+  def SELEQZ_S : R6MMR6Rel, SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6,
+                 HARDFLOAT;
+  def SELNEZ_D : R6MMR6Rel, SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6,
+                 HARDFLOAT;
+  def SELNEZ_S : R6MMR6Rel, SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6,
+                 HARDFLOAT;
+  def SEL_D : R6MMR6Rel, SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SEL_S : R6MMR6Rel, SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
+  def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
+  def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"dvp", (DVP ZERO), 0>, ISA_MIPS32R6;
+def : MipsInstAlias<"evp", (EVP ZERO), 0>, ISA_MIPS32R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6, GPR_32;
+}
+
+def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
+}
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// comparisons supported via another comparison
+multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> {
+def : MipsPat<(setone VT:$lhs, VT:$rhs),
+      (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seto VT:$lhs, VT:$rhs),
+      (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(setune VT:$lhs, VT:$rhs),
+      (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+def : MipsPat<(seteq VT:$lhs, VT:$rhs),
+      (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setgt VT:$lhs, VT:$rhs),
+      (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setge VT:$lhs, VT:$rhs),
+      (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>;
+def : MipsPat<(setlt VT:$lhs, VT:$rhs),
+      (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setle VT:$lhs, VT:$rhs),
+      (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>;
+def : MipsPat<(setne VT:$lhs, VT:$rhs),
+      (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6;
+  defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6;
+}
+
+// i32 selects
+multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp,
+                          Instruction SLTiOp, Instruction SLTiuOp,
+                          Instruction SELEQZOp, Instruction SELNEZOp,
+                          SDPatternOperator imm_type, ValueType Opg> {
+// reg, immz
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f),
+              (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f),
+              (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>;
+
+// reg, immZExt16[_64]
+def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+              (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+                    (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f),
+              (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)),
+                    (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>;
+
+// reg, immSExt16Plus1
+def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+              (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))),
+                    (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>;
+def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f),
+              (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))),
+                    (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>;
+
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz),
+              (SELEQZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, immz),
+              (SELNEZOp RC:$t, RC:$cond)>;
+def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f),
+              (SELNEZOp RC:$f, RC:$cond)>;
+def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f),
+              (SELEQZOp RC:$f, RC:$cond)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ,
+                      immZExt16, i32>, ISA_MIPS32R6;
+
+def : MipsPat<(select i32:$cond, i32:$t, i32:$f),
+              (OR (SELNEZ i32:$t, i32:$cond),
+                  (SELEQZ i32:$f, i32:$cond))>,
+              ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, i32:$t, immz),
+              (SELNEZ i32:$t, i32:$cond)>,
+              ISA_MIPS32R6;
+def : MipsPat<(select i32:$cond, immz, i32:$f),
+              (SELEQZ i32:$f, i32:$cond)>,
+              ISA_MIPS32R6;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 000000000000..521e22fb7992
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,772 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// shamt must fit in 6 bits.
+def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
+
+// Node immediate fits as 10-bit sign extended on target immediate.
+// e.g. seqi, snei
+def immSExt10_64 : PatLeaf<(i64 imm),
+                           [{ return isInt<10>(N->getSExtValue()); }]>;
+
+def immZExt16_64 : PatLeaf<(i64 imm),
+                           [{ return isUInt<16>(N->getZExtValue()); }]>;
+
+def immZExt5_64 : ImmLeaf<i64, [{ return Imm == (Imm & 0x1f); }]>;
+
+// Transformation function: get log2 of low 32 bits of immediate
+def Log2LO : SDNodeXForm<imm, [{
+  return getImm(N, Log2_64((unsigned) N->getZExtValue()));
+}]>;
+
+// Transformation function: get log2 of high 32 bits of immediate
+def Log2HI : SDNodeXForm<imm, [{
+  return getImm(N, Log2_64((unsigned) (N->getZExtValue() >> 32)));
+}]>;
+
+// Predicate: True if immediate is a power of 2 and fits 32 bits
+def PowerOf2LO : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t Imm = N->getZExtValue();
+    return isPowerOf2_64(Imm) && (Imm & 0xffffffff) == Imm;
+  }
+  else
+    return false;
+}]>;
+
+// Predicate: True if immediate is a power of 2 and exceeds 32 bits
+def PowerOf2HI : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t Imm = N->getZExtValue();
+    return isPowerOf2_64(Imm) && (Imm & 0xffffffff00000000) == Imm;
+  }
+  else
+    return false;
+}]>;
+
+def assertzext_lt_i32 : PatFrag<(ops node:$src), (assertzext node:$src), [{
+  return cast<VTSDNode>(N->getOperand(1))->getVT().bitsLT(MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I64  : Atomic2Ops<atomic_load_add_64, GPR64>;
+  def ATOMIC_LOAD_SUB_I64  : Atomic2Ops<atomic_load_sub_64, GPR64>;
+  def ATOMIC_LOAD_AND_I64  : Atomic2Ops<atomic_load_and_64, GPR64>;
+  def ATOMIC_LOAD_OR_I64   : Atomic2Ops<atomic_load_or_64, GPR64>;
+  def ATOMIC_LOAD_XOR_I64  : Atomic2Ops<atomic_load_xor_64, GPR64>;
+  def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+  def ATOMIC_SWAP_I64      : Atomic2Ops<atomic_swap_64, GPR64>;
+  def ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+  def LOAD_ACC128  : Load<"", ACC128>;
+  def STORE_ACC128 : Store<"", ACC128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+let DecoderNamespace = "Mips64" in {
+/// Arithmetic Instructions (ALU Immediate)
+def DADDi   : ArithLogicI<"daddi", simm16_64, GPR64Opnd, II_DADDI>,
+              ADDI_FM<0x18>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DADDiu : StdMMR6Rel, ArithLogicI<"daddiu", simm16_64, GPR64Opnd,
+                                       II_DADDIU, immSExt16, add>,
+               ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
+}
+
+let isCodeGenOnly = 1 in {
+def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
+              SLTI_FM<0xa>;
+def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
+              SLTI_FM<0xb>;
+def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, II_AND, immZExt16, and>,
+             ADDI_FM<0xc>;
+def ORi64   : ArithLogicI<"ori", uimm16_64, GPR64Opnd, II_OR, immZExt16, or>,
+              ADDI_FM<0xd>;
+def XORi64  : ArithLogicI<"xori", uimm16_64, GPR64Opnd, II_XOR, immZExt16, xor>,
+              ADDI_FM<0xe>;
+def LUi64   : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
+}
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DADD   : StdMMR6Rel, ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>,
+               ADD_FM<0, 0x2c>, ISA_MIPS3;
+  def DADDu  : StdMMR6Rel, ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>,
+               ADD_FM<0, 0x2d>, ISA_MIPS3;
+  def DSUBu  : StdMMR6Rel, ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>,
+               ISA_MIPS3;
+  def DSUB   : StdMMR6Rel, ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>,
+               ISA_MIPS3;
+}
+
+let isCodeGenOnly = 1 in {
+def SLT64  : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
+def AND64  : ArithLogicR<"and", GPR64Opnd, 1, II_AND, and>, ADD_FM<0, 0x24>;
+def OR64   : ArithLogicR<"or", GPR64Opnd, 1, II_OR, or>, ADD_FM<0, 0x25>;
+def XOR64  : ArithLogicR<"xor", GPR64Opnd, 1, II_XOR, xor>, ADD_FM<0, 0x26>;
+def NOR64  : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+}
+
+/// Shift Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DSLL : StdMMR6Rel, shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL,
+                                          shl, immZExt6>,
+             SRA_FM<0x38, 0>, ISA_MIPS3;
+  def DSRL : StdMMR6Rel, shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL,
+                                          srl, immZExt6>,
+             SRA_FM<0x3a, 0>, ISA_MIPS3;
+  def DSRA : StdMMR6Rel, shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA,
+                                          sra, immZExt6>,
+             SRA_FM<0x3b, 0>, ISA_MIPS3;
+  def DSLLV  : StdMMR6Rel, shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
+               SRLV_FM<0x14, 0>, ISA_MIPS3;
+  def DSRAV  : StdMMR6Rel, shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
+               SRLV_FM<0x17, 0>, ISA_MIPS3;
+  def DSRLV  : StdMMR6Rel, shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
+               SRLV_FM<0x16, 0>, ISA_MIPS3;
+  def DSLL32 : StdMMR6Rel, shift_rotate_imm<"dsll32", uimm5, GPR64Opnd,
+                                            II_DSLL32>,
+               SRA_FM<0x3c, 0>, ISA_MIPS3;
+  def DSRL32 : StdMMR6Rel, shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd,
+                                            II_DSRL32>,
+               SRA_FM<0x3e, 0>, ISA_MIPS3;
+  def DSRA32 : StdMMR6Rel, shift_rotate_imm<"dsra32", uimm5, GPR64Opnd,
+               II_DSRA32>,
+               SRA_FM<0x3f, 0>, ISA_MIPS3;
+
+// Rotate Instructions
+  def DROTR  : StdMMR6Rel, shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR,
+                                            rotr, immZExt6>,
+               SRA_FM<0x3a, 1>, ISA_MIPS64R2;
+  def DROTRV : StdMMR6Rel, shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV,
+                                            rotr>,
+               SRLV_FM<0x16, 1>, ISA_MIPS64R2;
+  def DROTR32 : StdMMR6Rel, shift_rotate_imm<"drotr32", uimm5, GPR64Opnd,
+                                             II_DROTR32>,
+                SRA_FM<0x3e, 1>, ISA_MIPS64R2;
+}
+
+/// Load and Store Instructions
+///  aligned
+let isCodeGenOnly = 1 in {
+def LB64  : Load<"lb", GPR64Opnd, sextloadi8, II_LB>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, II_LBU>, LW_FM<0x24>;
+def LH64  : Load<"lh", GPR64Opnd, sextloadi16, II_LH>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, II_LHU>, LW_FM<0x25>;
+def LW64  : Load<"lw", GPR64Opnd, sextloadi32, II_LW>, LW_FM<0x23>;
+def SB64  : Store<"sb", GPR64Opnd, truncstorei8, II_SB>, LW_FM<0x28>;
+def SH64  : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
+def SW64  : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LWu : StdMMR6Rel, MMRel, Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>,
+            LW_FM<0x27>, ISA_MIPS3;
+  def LD  : StdMMR6Rel, LoadMemory<"ld", GPR64Opnd, mem_simm16, load, II_LD>,
+            LW_FM<0x37>, ISA_MIPS3;
+  def SD  : StdMMR6Rel, StoreMemory<"sd", GPR64Opnd, mem_simm16, store, II_SD>,
+            LW_FM<0x3f>, ISA_MIPS3;
+}
+
+
+
+/// load/store left/right
+let isCodeGenOnly = 1 in {
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, II_LWL>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, II_LWR>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
+}
+
+def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
+            ISA_MIPS3_NOT_32R6_64R6;
+
+/// Load-linked, Store-conditional
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LLD : StdMMR6Rel, LLBase<"lld", GPR64Opnd, mem_simm16>, LW_FM<0x34>,
+            ISA_MIPS3_NOT_32R6_64R6;
+}
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips],
+    DecoderNamespace = "Mips32_64_PTR64" in {
+def LL64 : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_64,
+           ISA_MIPS2_NOT_32R6_64R6;
+def SC64 : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_64,
+           ISA_MIPS2_NOT_32R6_64R6;
+def JR64   : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>, PTR_64;
+}
+
+def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+
+/// Jump and Branch Instructions
+let isCodeGenOnly = 1 in {
+  def BEQ64  : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+  def BNE64  : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+  def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+  def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+  def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+  def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+  def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+}
+
+def TAILCALLREG64 : TailCallReg<GPR64Opnd>;
+
+def PseudoReturn64 : PseudoReturnBase<GPR64Opnd>;
+def PseudoIndirectBranch64 : PseudoIndirectBranchBase<GPR64Opnd>;
+
+/// Multiply and Divide Instructions.
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DMULT  : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
+               MULT_FM<0, 0x1c>, ISA_MIPS3_NOT_32R6_64R6;
+  def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
+               MULT_FM<0, 0x1d>, ISA_MIPS3_NOT_32R6_64R6;
+}
+def PseudoDMULT  : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
+                                 II_DMULT>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
+                                 II_DMULTU>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
+              MULT_FM<0, 0x1e>, ISA_MIPS3_NOT_32R6_64R6;
+  def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
+              MULT_FM<0, 0x1f>, ISA_MIPS3_NOT_32R6_64R6;
+}
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
+                                II_DDIV, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
+                                II_DDIVU, 0, 1, 1>, ISA_MIPS3_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>,
+             ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>,
+                   ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>,
+                   ISA_MIPS3_NOT_32R6_64R6;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>, ISA_MIPS3_NOT_32R6_64R6;
+
+/// Sign Ext In Register Instructions.
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
+            ISA_MIPS32R2;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
+            ISA_MIPS32R2;
+}
+
+/// Count Leading
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DCLZ : StdMMR6Rel, CountLeading0<"dclz", GPR64Opnd, II_DCLZ>,
+             CLO_FM<0x24>, ISA_MIPS64_NOT_64R6;
+  def DCLO : StdMMR6Rel, CountLeading1<"dclo", GPR64Opnd, II_DCLO>,
+             CLO_FM<0x25>, ISA_MIPS64_NOT_64R6;
+
+/// Double Word Swap Bytes/HalfWords
+  def DSBH : SubwordSwap<"dsbh", GPR64Opnd, II_DSBH>, SEB_FM<2, 0x24>,
+             ISA_MIPS64R2;
+  def DSHD : SubwordSwap<"dshd", GPR64Opnd, II_DSHD>, SEB_FM<5, 0x24>,
+             ISA_MIPS64R2;
+}
+
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
+
+let isCodeGenOnly = 1 in
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  // The 'pos + size' constraints are enforced by the code that lowers into
+  // MipsISD::Ext.
+  def DEXT : ExtBase<"dext", GPR64Opnd, uimm5_report_uimm6, uimm5_plus1,
+                     immZExt5, immZExt5Plus1, MipsExt>, EXT_FM<3>,
+                     ISA_MIPS64R2;
+  def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, immZExt5,
+                      immZExt5Plus33, MipsExt>, EXT_FM<1>, ISA_MIPS64R2;
+  def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1,
+                      immZExt5Plus32, immZExt5Plus1, MipsExt>, EXT_FM<2>,
+                      ISA_MIPS64R2;
+  def DINS : InsBase<"dins", GPR64Opnd, uimm6, uimm5_inssize_plus1, MipsIns>,
+             EXT_FM<7>, ISA_MIPS64R2;
+  def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32, uimm5_inssize_plus1>,
+              EXT_FM<6>, ISA_MIPS64R2;
+  def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5, uimm5_inssize_plus1>,
+              EXT_FM<5>, ISA_MIPS64R2;
+}
+
+let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
+  def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
+                     "dsll\t$rd, $rt, 32", [], II_DSLL>;
+  def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+                    "sll\t$rd, $rt, 0", [], II_SLL>;
+  def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+                    "sll\t$rd, $rt, 0", [], II_SLL>;
+}
+
+// We need the following pseudo instruction to avoid offset calculation for
+// long branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
+// where %PART may be %hi or %lo, depending on the relocation kind
+// that $tgt is annotated with.
+def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Cavium Octeon cnMIPS instructions
+let DecoderNamespace = "CnMips",
+    // FIXME: The lack of HasStdEnc is probably a bug
+    EncodingPredicates = []<Predicate> in {
+
+class Count1s<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctpop RO:$rs))], II_POP, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class ExtsCins<string opstr, InstrItinClass itin,
+               SDPatternOperator Op = null_frag>:
+  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+         !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
+         [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+         itin, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class SetCC64_R<string opstr, PatFrag cond_op> :
+  InstSE<(outs GPR64Opnd:$rd), (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set GPR64Opnd:$rd, (zext (cond_op GPR64Opnd:$rs,
+                                             GPR64Opnd:$rt)))],
+         II_SEQ_SNE, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+class SetCC64_I<string opstr, PatFrag cond_op>:
+  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10),
+         !strconcat(opstr, "\t$rt, $rs, $imm10"),
+         [(set GPR64Opnd:$rt, (zext (cond_op GPR64Opnd:$rs,
+                                             immSExt10_64:$imm10)))],
+         II_SEQI_SNEI, FrmI, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
+class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op,
+                    RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> :
+  InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $p, $offset"),
+         [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)),
+                  bb:$offset)], II_BBIT, FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+class MFC2OP<string asmstr, RegisterOperand RO, InstrItinClass itin> :
+  InstSE<(outs RO:$rt, uimm16:$imm16), (ins),
+         !strconcat(asmstr, "\t$rt, $imm16"), [], itin, FrmFR>;
+
+// Unsigned Byte Add
+def BADDu  : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>,
+             ADD_FM<0x1c, 0x28>, ASE_CNMIPS {
+  let Pattern = [(set GPR64Opnd:$rd,
+                      (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))];
+}
+
+// Branch on Bit Clear /+32
+def BBIT0  : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd,
+                           uimm5_64_report_uimm6>, BBIT_FM<0x32>, ASE_CNMIPS;
+def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64,
+                           0x100000000>, BBIT_FM<0x36>, ASE_CNMIPS;
+
+// Branch on Bit Set /+32
+def BBIT1  : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd,
+                           uimm5_64_report_uimm6>, BBIT_FM<0x3a>, ASE_CNMIPS;
+def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64,
+                           0x100000000>, BBIT_FM<0x3e>, ASE_CNMIPS;
+
+// Multiply Doubleword to GPR
+def DMUL  : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
+            ADD_FM<0x1c, 0x03>, ASE_CNMIPS {
+  let Defs = [HI0, LO0, P0, P1, P2];
+}
+
+// Extract a signed bit field /+32
+def EXTS  : ExtsCins<"exts", II_EXT>, EXTS_FM<0x3a>, ASE_CNMIPS;
+def EXTS32: ExtsCins<"exts32", II_EXT>, EXTS_FM<0x3b>, ASE_CNMIPS;
+
+// Clear and insert a bit field /+32
+def CINS  : ExtsCins<"cins", II_INS>, EXTS_FM<0x32>, ASE_CNMIPS;
+def CINS32: ExtsCins<"cins32", II_INS>, EXTS_FM<0x33>, ASE_CNMIPS;
+
+// Move to multiplier/product register
+def MTM0   : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
+             ASE_CNMIPS;
+def MTM1   : MoveToLOHI<"mtm1", GPR64Opnd, [MPL1, P0, P1, P2]>, MTMR_FM<0x0c>,
+             ASE_CNMIPS;
+def MTM2   : MoveToLOHI<"mtm2", GPR64Opnd, [MPL2, P0, P1, P2]>, MTMR_FM<0x0d>,
+             ASE_CNMIPS;
+def MTP0   : MoveToLOHI<"mtp0", GPR64Opnd, [P0]>, MTMR_FM<0x09>, ASE_CNMIPS;
+def MTP1   : MoveToLOHI<"mtp1", GPR64Opnd, [P1]>, MTMR_FM<0x0a>, ASE_CNMIPS;
+def MTP2   : MoveToLOHI<"mtp2", GPR64Opnd, [P2]>, MTMR_FM<0x0b>, ASE_CNMIPS;
+
+// Count Ones in a Word/Doubleword
+def POP   : Count1s<"pop", GPR32Opnd>, POP_FM<0x2c>, ASE_CNMIPS;
+def DPOP  : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>, ASE_CNMIPS;
+
+// Set on equal/not equal
+def SEQ   : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>, ASE_CNMIPS;
+def SEQi  : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>, ASE_CNMIPS;
+def SNE   : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>, ASE_CNMIPS;
+def SNEi  : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>, ASE_CNMIPS;
+
+// 192-bit x 64-bit Unsigned Multiply and Add
+def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x11>,
+            ASE_CNMIPS {
+  let Defs = [P0, P1, P2];
+}
+
+// 64-bit Unsigned Multiply and Add Move
+def VMM0  : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x10>,
+            ASE_CNMIPS {
+  let Defs = [MPL0, P0, P1, P2];
+}
+
+// 64-bit Unsigned Multiply and Add
+def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>, ADD_FM<0x1c, 0x0f>,
+            ASE_CNMIPS {
+  let Defs = [MPL1, MPL2, P0, P1, P2];
+}
+
+// Move between CPU and coprocessor registers
+def DMFC2_OCTEON : MFC2OP<"dmfc2", GPR64Opnd, II_DMFC2>, MFC2OP_FM<0x12, 1>,
+                   ASE_CNMIPS;
+def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd, II_DMTC2>, MFC2OP_FM<0x12, 5>,
+                   ASE_CNMIPS;
+}
+
+}
+
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd, COP0Opnd, II_DMFC0>, MFC3OP_FM<0x10, 1>,
+            ISA_MIPS3;
+def DMTC0 : MTC3OP<"dmtc0", COP0Opnd, GPR64Opnd, II_DMTC0>, MFC3OP_FM<0x10, 5>,
+            ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd, COP2Opnd, II_DMFC2>, MFC3OP_FM<0x12, 1>,
+            ISA_MIPS3;
+def DMTC2 : MTC3OP<"dmtc2", COP2Opnd, GPR64Opnd, II_DMTC2>, MFC3OP_FM<0x12, 5>,
+            ISA_MIPS3;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Materialize i64 constants.
+defm : MaterializeImms<i64, ZERO_64, DADDiu, LUi64, ORi64>;
+
+def : MipsPat<(i64 immZExt32Low16Zero:$imm),
+              (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16)>;
+
+def : MipsPat<(i64 immZExt32:$imm),
+              (ORi64 (DSLL (ORi64 ZERO_64, (HI16 imm:$imm)), 16),
+                     (LO16 imm:$imm))>;
+
+// extended loads
+def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+  def : MipsPat<(MipsLo tblockaddress:$in),
+                (DADDiu ZERO_64, tblockaddress:$in)>;
+  def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+  def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+  def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+                (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+  def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
+
+  def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+                (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
+  def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+                (DADDiu GPR64:$hi, tblockaddress:$lo)>;
+  def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+                (DADDiu GPR64:$hi, tjumptable:$lo)>;
+  def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+                (DADDiu GPR64:$hi, tconstpool:$lo)>;
+  def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+                (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+  def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+  def : WrapperPat<tconstpool, DADDiu, GPR64>;
+  def : WrapperPat<texternalsym, DADDiu, GPR64>;
+  def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+  def : WrapperPat<tjumptable, DADDiu, GPR64>;
+  def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+}
+
+defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+                  ZERO_64>;
+def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
+              (BLEZ64 i64:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
+              (BGEZ64 i64:$lhs, bb:$dst)>;
+
+// setcc patterns
+let AdditionalPredicates = [NotInMicroMips] in {
+  defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
+  defm : SetlePats<GPR64, XORi, SLT64, SLTu64>;
+  defm : SetgtPats<GPR64, SLT64, SLTu64>;
+  defm : SetgePats<GPR64, XORi, SLT64, SLTu64>;
+  defm : SetgeImmPats<GPR64, XORi, SLTi64, SLTiu64>;
+}
+// truncate
+def : MipsPat<(trunc (assertsext GPR64:$src)),
+              (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+// The forward compatibility strategy employed by MIPS requires us to treat
+// values as being sign extended to an infinite number of bits. This allows
+// existing software to run without modification on any future MIPS
+// implementation (e.g. 128-bit, or 1024-bit). Being compatible with this
+// strategy requires that truncation acts as a sign-extension for values being
+// fed into instructions operating on 32-bit values. Such instructions have
+// undefined results if this is not true.
+// For our case, this means that we can't issue an extract_subreg for nodes
+// such as (trunc:i32 (assertzext:i64 X, i32)), because the sign-bit of the
+// lower subreg would not be replicated into the upper half.
+def : MipsPat<(trunc (assertzext_lt_i32 GPR64:$src)),
+              (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+def : MipsPat<(i32 (trunc GPR64:$src)),
+              (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
+
+// variable shift instructions patterns
+def : MipsPat<(shl GPR64:$rt, (i32 (trunc GPR64:$rs))),
+              (DSLLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+def : MipsPat<(srl GPR64:$rt, (i32 (trunc GPR64:$rs))),
+              (DSRLV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+def : MipsPat<(sra GPR64:$rt, (i32 (trunc GPR64:$rs))),
+              (DSRAV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(rotr GPR64:$rt, (i32 (trunc GPR64:$rs))),
+                (DROTRV GPR64:$rt, (EXTRACT_SUBREG GPR64:$rs, sub_32))>;
+}
+
+// 32-to-64-bit extension
+def : MipsPat<(i64 (anyext GPR32:$src)),
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+
+// Sign extend in register
+def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
+              (SLL64_64 GPR64:$src)>;
+
+// bswap MipsPattern
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
+
+// Carry pattern
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(subc GPR64:$lhs, GPR64:$rhs),
+                (DSUBu GPR64:$lhs, GPR64:$rhs)>;
+  def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
+                (DADDu GPR64:$lhs, GPR64:$rhs)>, ASE_NOT_DSP;
+  def : MipsPat<(addc GPR64:$lhs, immSExt16:$imm),
+                (DADDiu GPR64:$lhs, imm:$imm)>, ASE_NOT_DSP;
+}
+
+// Octeon bbit0/bbit1 MipsPattern
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+              (BBIT0 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (seteq (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+              (BBIT032 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2LO:$mask), 0)), bb:$dst),
+              (BBIT1 i64:$lhs, (Log2LO PowerOf2LO:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst),
+              (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>, ASE_MIPS64_CNMIPS;
+
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>;
+def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>;
+def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>;
+def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>;
+def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"move $dst, $src",
+                      (OR64 GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
+        GPR_64;
+  def : MipsInstAlias<"move $dst, $src",
+                      (DADDu GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
+        GPR_64;
+  def : MipsInstAlias<"dadd $rs, $rt, $imm",
+                      (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                      0>, ISA_MIPS3_NOT_32R6_64R6;
+  def : MipsInstAlias<"dadd $rs, $imm",
+                      (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                      0>, ISA_MIPS3_NOT_32R6_64R6;
+  def : MipsInstAlias<"daddu $rs, $rt, $imm",
+                      (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                      0>, ISA_MIPS3;
+  def : MipsInstAlias<"daddu $rs, $imm",
+                      (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                      0>, ISA_MIPS3;
+}
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+                    (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"dneg $rt, $rs",
+                      (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+                      ISA_MIPS3;
+  def : MipsInstAlias<"dneg $rt",
+                      (DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+                      ISA_MIPS3;
+  def : MipsInstAlias<"dnegu $rt, $rs",
+                      (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
+                      ISA_MIPS3;
+  def : MipsInstAlias<"dnegu $rt",
+                      (DSUBu GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
+                      ISA_MIPS3;
+}
+def : MipsInstAlias<"dsubi $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsubi $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+def : MipsInstAlias<"dsub $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+                           InvertedImOperand64:$imm),
+                    0>, ISA_MIPS3_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+                      (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
+                              InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+  def : MipsInstAlias<"dsubu $rs, $imm",
+                      (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
+                              InvertedImOperand64:$imm), 0>, ISA_MIPS3;
+}
+def : MipsInstAlias<"dsra $rd, $rt, $rs",
+                    (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+                      (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                      ISA_MIPS3;
+
+// Two operand (implicit 0 selector) versions:
+  def : MipsInstAlias<"dmtc0 $rt, $rd",
+                      (DMTC0 COP0Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+  def : MipsInstAlias<"dmfc0 $rt, $rd",
+                      (DMFC0 GPR64Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+}
+def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, COP2Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 COP2Opnd:$rd, GPR64Opnd:$rt, 0), 0>;
+
+def : MipsInstAlias<"synciobdma", (SYNC 0x2), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncs", (SYNC 0x6), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncw", (SYNC 0x4), 0>, ASE_MIPS64_CNMIPS;
+def : MipsInstAlias<"syncws", (SYNC 0x5), 0>, ASE_MIPS64_CNMIPS;
+
+// cnMIPS Aliases.
+
+// bbit* with $p 32-63 converted to bbit*32 with $p 0-31
+def : MipsInstAlias<"bbit0 $rs, $p, $offset",
+                    (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+                             brtarget:$offset), 0>,
+      ASE_CNMIPS;
+def : MipsInstAlias<"bbit1 $rs, $p, $offset",
+                    (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p,
+                             brtarget:$offset), 0>,
+      ASE_CNMIPS;
+
+// exts with $pos 32-63 in converted to exts32 with $pos 0-31
+def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
+                    (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+                            uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+      ASE_CNMIPS;
+def : MipsInstAlias<"exts $rt, $pos, $lenm1",
+                    (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+                            uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+      ASE_CNMIPS;
+
+// cins with $pos 32-63 in converted to cins32 with $pos 0-31
+def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
+                    (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
+                            uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+      ASE_CNMIPS;
+def : MipsInstAlias<"cins $rt, $pos, $lenm1",
+                    (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
+                            uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
+      ASE_CNMIPS;
+
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64),
+                     !strconcat(instr_asm, "\t$rt, $imm64")> ;
+def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>;
+
+def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
+                                       "dla\t$rt, $addr">;
+def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
+                                       "dla\t$rt, $imm64">;
diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
new file mode 100644
index 000000000000..dabf4e0a52e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -0,0 +1,279 @@
+//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: dclo, dclz
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DALIGN_ENC  : SPECIAL3_DALIGN_FM<OPCODE6_DALIGN>;
+class DAUI_ENC    : DAUI_FM;
+class DAHI_ENC    : REGIMM_FM<OPCODE5_DAHI>;
+class DATI_ENC    : REGIMM_FM<OPCODE5_DATI>;
+class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>;
+class DCLO_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLO>;
+class DCLZ_R6_ENC : SPECIAL_2R_FM<OPCODE6_DCLZ>;
+class DDIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011110>;
+class DDIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011111>;
+class DLSA_R6_ENC : SPECIAL_LSA_FM<OPCODE6_DLSA>;
+class DMOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011110>;
+class DMODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011111>;
+class DMUH_ENC    : SPECIAL_3R_FM<0b00011, 0b011100>;
+class DMUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b011101>;
+class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011100>;
+class DMULU_ENC   : SPECIAL_3R_FM<0b00010, 0b011101>;
+class LDPC_ENC    : PCREL18_FM<OPCODE3_LDPC>;
+class LLD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_LLD>;
+class SCD_R6_ENC : SPECIAL3_LL_SC_FM<OPCODE6_SCD>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, InstrItinClass itin> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, uimm16_altrelaxed:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+  string Constraints = "$rs = $rt";
+  InstrItinClass Itinerary = itin;
+}
+
+class DALIGN_DESC  : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3, II_DALIGN>;
+class DAHI_DESC    : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd, II_DAHI>;
+class DATI_DESC    : AHI_ATI_DESC_BASE<"dati", GPR64Opnd, II_DATI>;
+class DAUI_DESC    : AUI_DESC_BASE<"daui", GPR64Opnd, II_DAUI>;
+class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd, II_DBITSWAP>;
+class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd, II_DCLO>;
+class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd, II_DCLZ>;
+class DDIV_DESC    : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, II_DDIV, sdiv>;
+class DDIVU_DESC   : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, II_DDIVU, udiv>;
+class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1, II_DLSA>;
+class DMOD_DESC    : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, II_DMOD, srem>;
+class DMODU_DESC   : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, II_DMODU, urem>;
+class DMUH_DESC    : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, II_DMUH, mulhs>;
+class DMUHU_DESC   : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd, II_DMUHU, mulhu>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>;
+class DMULU_DESC   : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>;
+class LDPC_DESC    : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>;
+class LLD_R6_DESC   : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm16, II_LLD>;
+class SCD_R6_DESC   : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>;
+class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>;
+class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>;
+
+class BGEC64_DESC : CMP_BC_DESC_BASE<"bgec", brtarget, GPR64Opnd>;
+class BGEUC64_DESC : CMP_BC_DESC_BASE<"bgeuc", brtarget, GPR64Opnd>;
+class BEQC64_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR64Opnd>;
+class BNEC64_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR64Opnd>;
+class BLTC64_DESC : CMP_BC_DESC_BASE<"bltc", brtarget, GPR64Opnd>;
+class BLTUC64_DESC : CMP_BC_DESC_BASE<"bltuc", brtarget, GPR64Opnd>;
+class BLTZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR64Opnd>;
+class BGEZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR64Opnd>;
+class BLEZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR64Opnd>;
+class BGTZC64_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR64Opnd>;
+class BEQZC64_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR64Opnd>;
+class BNEZC64_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR64Opnd>;
+
+class JIALC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+                                               GPR64Opnd, II_JIALC> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd,
+                                             II_JIC> {
+  bit isBarrier = 1;
+  bit isTerminator = 1;
+  list<Register> Defs = [AT];
+}
+
+class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>;
+class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>;
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  let DecoderMethod = "DecodeDAHIDATI" in {
+    def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
+    def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
+  }
+  def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+  def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
+  def DBITSWAP : R6MMR6Rel, DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
+  def DCLO_R6 : R6MMR6Rel, DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6;
+  def DCLZ_R6 : R6MMR6Rel, DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6;
+  def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
+  def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+  def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
+  def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
+  def DLSA_R6 : R6MMR6Rel, DLSA_R6_ENC, DLSA_R6_DESC, ISA_MIPS64R6;
+  def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
+  def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
+  def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
+  def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
+  def LLD_R6 : R6MMR6Rel, LLD_R6_ENC, LLD_R6_DESC, ISA_MIPS64R6;
+}
+def LDPC: R6MMR6Rel, LDPC_ENC, LDPC_DESC, ISA_MIPS64R6;
+def SCD_R6 : SCD_R6_ENC, SCD_R6_DESC, ISA_MIPS32R6;
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+  def SELEQZ64 : SELEQZ_ENC, SELEQZ64_DESC, ISA_MIPS32R6, GPR_64;
+  def SELNEZ64 : SELNEZ_ENC, SELNEZ64_DESC, ISA_MIPS32R6, GPR_64;
+}
+let AdditionalPredicates = [NotInMicroMips],
+    DecoderNamespace = "Mips32r6_64r6_PTR64" in {
+  def LL64_R6 : LL_R6_ENC, LL64_R6_DESC, PTR_64, ISA_MIPS64R6;
+  def SC64_R6 : SC_R6_ENC, SC64_R6_DESC, PTR_64, ISA_MIPS64R6;
+}
+
+let DecoderNamespace = "Mips32r6_64r6_GP64" in {
+// Jump and Branch Instructions
+def JIALC64 : JIALC_ENC, JIALC64_DESC, ISA_MIPS64R6, GPR_64;
+def JIC64 : JIC_ENC, JIC64_DESC, ISA_MIPS64R6, GPR_64;
+
+def BEQC64 : BEQC_ENC, BEQC64_DESC, ISA_MIPS64R6, GPR_64;
+def BEQZC64 : BEQZC_ENC, BEQZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEC64 : BGEC_ENC, BGEC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEUC64 : BGEUC_ENC, BGEUC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGTZC64 : BGTZC_ENC, BGTZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLEZC64 : BLEZC_ENC, BLEZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLTC64 : BLTC_ENC, BLTC64_DESC, ISA_MIPS64R6, GPR_64;
+def BLTUC64 : BLTUC_ENC, BLTUC64_DESC, ISA_MIPS64R6, GPR_64;
+def BNEC64 : BNEC_ENC, BNEC64_DESC, ISA_MIPS64R6, GPR_64;
+def BNEZC64 : BNEZC_ENC, BNEZC64_DESC, ISA_MIPS64R6, GPR_64;
+}
+let DecoderNamespace = "Mips32r6_64r6_BranchZero" in {
+def BLTZC64 : BLTZC_ENC, BLTZC64_DESC, ISA_MIPS64R6, GPR_64;
+def BGEZC64 : BGEZC_ENC, BGEZC64_DESC, ISA_MIPS64R6, GPR_64;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Aliases
+//
+//===----------------------------------------------------------------------===//
+
+def : MipsInstAlias<"jr $rs", (JALR64 ZERO_64, GPR64Opnd:$rs), 1>, ISA_MIPS64R6;
+
+def : MipsInstAlias<"jrc $rs", (JIC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
+
+def : MipsInstAlias<"jalrc $rs", (JIALC64 GPR64Opnd:$rs, 0), 1>, ISA_MIPS64R6;
+//===----------------------------------------------------------------------===//
+//
+// Patterns and Pseudo Instructions
+//
+//===----------------------------------------------------------------------===//
+
+// i64 selects
+def : MipsPat<(select i64:$cond, i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, i64:$cond),
+                    (SELEQZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, i64:$cond),
+                    (SELNEZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, i64:$cond),
+                    (SELEQZ64 i64:$f, i64:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+                    (SELNEZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immZExt16_64:$imm)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (XORi64 i64:$cond, immZExt16_64:$imm)),
+                    (SELEQZ64 i64:$f, (XORi64 i64:$cond, immZExt16_64:$imm)))>,
+              ISA_MIPS64R6;
+def : MipsPat<
+  (select (i32 (setgt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+  (OR64 (SELEQZ64 i64:$t,
+                  (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)),
+        (SELNEZ64 i64:$f,
+                  (SUBREG_TO_REG (i64 0), (SLTi64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)))>,
+  ISA_MIPS64R6;
+def : MipsPat<
+  (select (i32 (setugt i64:$cond, immSExt16Plus1:$imm)), i64:$t, i64:$f),
+  (OR64 (SELEQZ64 i64:$t,
+                  (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)),
+        (SELNEZ64 i64:$f,
+                  (SUBREG_TO_REG (i64 0), (SLTiu64 i64:$cond, (Plus1 imm:$imm)),
+                                 sub_32)))>,
+  ISA_MIPS64R6;
+
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), i64:$t, immz),
+              (SELNEZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), i64:$t, immz),
+              (SELEQZ64 i64:$t, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i64:$cond, immz)), immz, i64:$f),
+              (SELEQZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i64:$cond, immz)), immz, i64:$f),
+              (SELNEZ64 i64:$f, i64:$cond)>, ISA_MIPS64R6;
+
+// i64 selects from an i32 comparison
+// One complicating factor here is that bits 32-63 of an i32 are undefined.
+// FIXME: Ideally, setcc would always produce an i64 on MIPS64 targets.
+//        This would allow us to remove the sign-extensions here.
+def : MipsPat<(select i32:$cond, i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELNEZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 i32:$cond)),
+                    (SELEQZ64 i64:$f, (SLL64_32 i32:$cond)))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))),
+                    (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+                                                      immZExt16:$imm))))>,
+              ISA_MIPS64R6;
+
+def : MipsPat<(select i32:$cond, i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), i64:$t, immz),
+              (SELNEZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i64:$t, immz),
+              (SELEQZ64 i64:$t, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select i32:$cond, immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i64:$f),
+              (SELEQZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
+def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i64:$f),
+              (SELNEZ64 i64:$f, (SLL64_32 i32:$cond))>,
+              ISA_MIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
new file mode 100644
index 000000000000..161345d2a845
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -0,0 +1,154 @@
+//===-- MipsAnalyzeImmediate.cpp - Analyze Immediates ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsAnalyzeImmediate.h"
+#include "Mips.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+MipsAnalyzeImmediate::Inst::Inst(unsigned O, unsigned I) : Opc(O), ImmOpnd(I) {}
+
+// Add I to the instruction sequences.
+void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) {
+  // Add an instruction seqeunce consisting of just I.
+  if (SeqLs.empty()) {
+    SeqLs.push_back(InstSeq(1, I));
+    return;
+  }
+
+  for (InstSeqLs::iterator Iter = SeqLs.begin(); Iter != SeqLs.end(); ++Iter)
+    Iter->push_back(I);
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize,
+                                             InstSeqLs &SeqLs) {
+  GetInstSeqLs((Imm + 0x8000ULL) & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ADDiu, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  GetInstSeqLs(Imm & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ORi, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  unsigned Shamt = countTrailingZeros(Imm);
+  GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs);
+  AddInstr(SeqLs, Inst(SLL, Shamt));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLs(uint64_t Imm, unsigned RemSize,
+                                        InstSeqLs &SeqLs) {
+  uint64_t MaskedImm = Imm & (0xffffffffffffffffULL >> (64 - Size));
+
+  // Do nothing if Imm is 0.
+  if (!MaskedImm)
+    return;
+
+  // A single ADDiu will do if RemSize <= 16.
+  if (RemSize <= 16) {
+    AddInstr(SeqLs, Inst(ADDiu, MaskedImm));
+    return;
+  }
+
+  // Shift if the lower 16-bit is cleared.
+  if (!(Imm & 0xffff)) {
+    GetInstSeqLsSLL(Imm, RemSize, SeqLs);
+    return;
+  }
+
+  GetInstSeqLsADDiu(Imm, RemSize, SeqLs);
+
+  // If bit 15 is cleared, it doesn't make a difference whether the last
+  // instruction is an ADDiu or ORi. In that case, do not call GetInstSeqLsORi.
+  if (Imm & 0x8000) {
+    InstSeqLs SeqLsORi;
+    GetInstSeqLsORi(Imm, RemSize, SeqLsORi);
+    SeqLs.append(std::make_move_iterator(SeqLsORi.begin()),
+                 std::make_move_iterator(SeqLsORi.end()));
+  }
+}
+
+// Replace a ADDiu & SLL pair with a LUi.
+// e.g. the following two instructions
+//  ADDiu 0x0111
+//  SLL 18
+// are replaced with
+//  LUi 0x444
+void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
+  // Check if the first two instructions are ADDiu and SLL and the shift amount
+  // is at least 16.
+  if ((Seq.size() < 2) || (Seq[0].Opc != ADDiu) ||
+      (Seq[1].Opc != SLL) || (Seq[1].ImmOpnd < 16))
+    return;
+
+  // Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
+  int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
+  int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
+
+  if (!isInt<16>(ShiftedImm))
+    return;
+
+  // Replace the first instruction and erase the second.
+  Seq[0].Opc = LUi;
+  Seq[0].ImmOpnd = (unsigned)(ShiftedImm & 0xffff);
+  Seq.erase(Seq.begin() + 1);
+}
+
+void MipsAnalyzeImmediate::GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts) {
+  InstSeqLs::iterator ShortestSeq = SeqLs.end();
+  // The length of an instruction sequence is at most 7.
+  unsigned ShortestLength = 8;
+
+  for (InstSeqLs::iterator S = SeqLs.begin(); S != SeqLs.end(); ++S) {
+    ReplaceADDiuSLLWithLUi(*S);
+    assert(S->size() <= 7);
+
+    if (S->size() < ShortestLength) {
+      ShortestSeq = S;
+      ShortestLength = S->size();
+    }
+  }
+
+  Insts.clear();
+  Insts.append(ShortestSeq->begin(), ShortestSeq->end());
+}
+
+const MipsAnalyzeImmediate::InstSeq
+&MipsAnalyzeImmediate::Analyze(uint64_t Imm, unsigned Size,
+                               bool LastInstrIsADDiu) {
+  this->Size = Size;
+
+  if (Size == 32) {
+    ADDiu = Mips::ADDiu;
+    ORi = Mips::ORi;
+    SLL = Mips::SLL;
+    LUi = Mips::LUi;
+  } else {
+    ADDiu = Mips::DADDiu;
+    ORi = Mips::ORi64;
+    SLL = Mips::DSLL;
+    LUi = Mips::LUi64;
+  }
+
+  InstSeqLs SeqLs;
+
+  // Get the list of instruction sequences.
+  if (LastInstrIsADDiu | !Imm)
+    GetInstSeqLsADDiu(Imm, Size, SeqLs);
+  else
+    GetInstSeqLs(Imm, Size, SeqLs);
+
+  // Set Insts to the shortest instruction sequence.
+  GetShortestSeq(SeqLs, Insts);
+
+  return Insts;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
new file mode 100644
index 000000000000..ae3c38ced80b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -0,0 +1,63 @@
+//===-- MipsAnalyzeImmediate.h - Analyze Immediates ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+  class MipsAnalyzeImmediate {
+  public:
+    struct Inst {
+      unsigned Opc, ImmOpnd;
+      Inst(unsigned Opc, unsigned ImmOpnd);
+    };
+    typedef SmallVector<Inst, 7 > InstSeq;
+
+    /// Analyze - Get an instruction sequence to load immediate Imm. The last
+    /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
+    /// true;
+    const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
+  private:
+    typedef SmallVector<InstSeq, 5> InstSeqLs;
+
+    /// AddInstr - Add I to all instruction sequences in SeqLs.
+    void AddInstr(InstSeqLs &SeqLs, const Inst &I);
+
+    /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
+    /// load immediate Imm
+    void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsORi - Get instrutcion sequences which end with an ORi to
+    /// load immediate Imm
+    void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
+    /// load immediate Imm
+    void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
+    void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
+    void ReplaceADDiuSLLWithLUi(InstSeq &Seq);
+
+    /// GetShortestSeq - Find the shortest instruction sequence in SeqLs and
+    /// return it in Insts.
+    void GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts);
+
+    unsigned Size;
+    unsigned ADDiu, ORi, SLL, LUi;
+    InstSeq Insts;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
new file mode 100644
index 000000000000..179695bc6988
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -0,0 +1,1073 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() const {
+  return static_cast<MipsTargetStreamer &>(*OutStreamer->getTargetStreamer());
+}
+
+bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<MipsSubtarget>();
+
+  MipsFI = MF.getInfo<MipsFunctionInfo>();
+  if (Subtarget->inMips16Mode())
+    for (std::map<
+             const char *,
+             const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+             it = MipsFI->StubsNeeded.begin();
+         it != MipsFI->StubsNeeded.end(); ++it) {
+      const char *Symbol = it->first;
+      const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+      if (StubsNeeded.find(Symbol) == StubsNeeded.end())
+        StubsNeeded[Symbol] = Signature;
+    }
+  MCP = MF.getConstantPool();
+
+  // In NaCl, all indirect jump targets must be aligned to bundle size.
+  if (Subtarget->isTargetNaCl())
+    NaClAlignIndirectJumpTargets(MF);
+
+  AsmPrinter::runOnMachineFunction(MF);
+  return true;
+}
+
+bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
+  MCOp = MCInstLowering.LowerOperand(MO);
+  return MCOp.isValid();
+}
+
+#include "MipsGenMCPseudoLowering.inc"
+
+// Lower PseudoReturn/PseudoIndirectBranch/PseudoIndirectBranch64 to JR, JR_MM,
+// JALR, or JALR64 as appropriate for the target
+void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                              const MachineInstr *MI) {
+  bool HasLinkReg = false;
+  bool InMicroMipsMode = Subtarget->inMicroMipsMode();
+  MCInst TmpInst0;
+
+  if (Subtarget->hasMips64r6()) {
+    // MIPS64r6 should use (JALR64 ZERO_64, $rs)
+    TmpInst0.setOpcode(Mips::JALR64);
+    HasLinkReg = true;
+  } else if (Subtarget->hasMips32r6()) {
+    // MIPS32r6 should use (JALR ZERO, $rs)
+    if (InMicroMipsMode)
+      TmpInst0.setOpcode(Mips::JRC16_MMR6);
+    else {
+      TmpInst0.setOpcode(Mips::JALR);
+      HasLinkReg = true;
+    }
+  } else if (Subtarget->inMicroMipsMode())
+    // microMIPS should use (JR_MM $rs)
+    TmpInst0.setOpcode(Mips::JR_MM);
+  else {
+    // Everything else should use (JR $rs)
+    TmpInst0.setOpcode(Mips::JR);
+  }
+
+  MCOperand MCOp;
+
+  if (HasLinkReg) {
+    unsigned ZeroReg = Subtarget->isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+    TmpInst0.addOperand(MCOperand::createReg(ZeroReg));
+  }
+
+  lowerOperand(MI->getOperand(0), MCOp);
+  TmpInst0.addOperand(MCOp);
+
+  EmitToStreamer(OutStreamer, TmpInst0);
+}
+
+void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+  TS.forbidModuleDirective();
+
+  if (MI->isDebugValue()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    PrintDebugValueComment(MI, OS);
+    return;
+  }
+
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+  if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+    // CONSTPOOL_ENTRY - This instruction represents a floating
+    //constant pool in the function.  The first operand is the ID#
+    // for this instruction, the second is the index into the
+    // MachineConstantPool that this is, the third is the size in
+    // bytes of this constant pool entry.
+    // The required alignment is specified on the basic block holding this MI.
+    //
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer->EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
+    return;
+  }
+
+
+  MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  do {
+    // Do any auto-generated pseudo lowerings.
+    if (emitPseudoExpansionLowering(*OutStreamer, &*I))
+      continue;
+
+    if (I->getOpcode() == Mips::PseudoReturn ||
+        I->getOpcode() == Mips::PseudoReturn64 ||
+        I->getOpcode() == Mips::PseudoIndirectBranch ||
+        I->getOpcode() == Mips::PseudoIndirectBranch64 ||
+        I->getOpcode() == Mips::TAILCALLREG ||
+        I->getOpcode() == Mips::TAILCALLREG64) {
+      emitPseudoIndirectBranch(*OutStreamer, &*I);
+      continue;
+    }
+
+    // The inMips16Mode() test is not permanent.
+    // Some instructions are marked as pseudo right now which
+    // would make the test fail for the wrong reason but
+    // that will be fixed soon. We need this here because we are
+    // removing another test for this situation downstream in the
+    // callchain.
+    //
+    if (I->isPseudo() && !Subtarget->inMips16Mode()
+        && !isLongBranchPseudo(I->getOpcode()))
+      llvm_unreachable("Pseudo opcode found in EmitInstruction()");
+
+    MCInst TmpInst0;
+    MCInstLowering.Lower(&*I, TmpInst0);
+    EmitToStreamer(*OutStreamer, TmpInst0);
+  } while ((++I != E) && I->isInsideBundle()); // Delay slot check
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  Mips Asm Directives
+//
+//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+//  Describe the stack frame.
+//
+//  -- Mask directives "(f)mask  bitmask, offset"
+//  Tells the assembler which registers are saved and where.
+//  bitmask - contain a little endian bitset indicating which registers are
+//            saved on function prologue (e.g. with a 0x80000000 mask, the
+//            assembler knows the register 31 (RA) is saved at prologue.
+//  offset  - the position before stack pointer subtraction indicating where
+//            the first saved register on prologue is located. (e.g. with a
+//
+//  Consider the following function prologue:
+//
+//    .frame  $fp,48,$ra
+//    .mask   0xc0000000,-8
+//       addiu $sp, $sp, -48
+//       sw $ra, 40($sp)
+//       sw $fp, 36($sp)
+//
+//    With a 0xc0000000 mask, the assembler knows the register 31 (RA) and
+//    30 (FP) are saved at prologue. As the save order on prologue is from
+//    left to right, RA is saved first. A -8 offset means that after the
+//    stack pointer subtration, the first register in the mask (RA) will be
+//    saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MipsAsmPrinter::printSavedRegsBitmask() {
+  // CPU and FPU Saved Registers Bitmasks
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+  int CPUTopSavedRegOff, FPUTopSavedRegOff;
+
+  // Set the CPU and FPU Bitmasks
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  // size of stack area to which FP callee-saved regs are saved.
+  unsigned CPURegSize = Mips::GPR32RegClass.getSize();
+  unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
+  unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
+  bool HasAFGR64Reg = false;
+  unsigned CSFPRegsSize = 0;
+
+  for (const auto &I : CSI) {
+    unsigned Reg = I.getReg();
+    unsigned RegNum = TRI->getEncodingValue(Reg);
+
+    // If it's a floating point register, set the FPU Bitmask.
+    // If it's a general purpose register, set the CPU Bitmask.
+    if (Mips::FGR32RegClass.contains(Reg)) {
+      FPUBitmask |= (1 << RegNum);
+      CSFPRegsSize += FGR32RegSize;
+    } else if (Mips::AFGR64RegClass.contains(Reg)) {
+      FPUBitmask |= (3 << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+    } else if (Mips::GPR32RegClass.contains(Reg))
+      CPUBitmask |= (1 << RegNum);
+  }
+
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  FPUTopSavedRegOff = FPUBitmask ?
+    (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0;
+
+  // CPU Regs are saved below FP Regs.
+  CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
+
+  MipsTargetStreamer &TS = getTargetStreamer();
+  // Print CPUBitmask
+  TS.emitMask(CPUBitmask, CPUTopSavedRegOff);
+
+  // Print FPUBitmask
+  TS.emitFMask(FPUBitmask, FPUTopSavedRegOff);
+}
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Frame Directive
+void MipsAsmPrinter::emitFrameDirective() {
+  const TargetRegisterInfo &RI = *MF->getSubtarget().getRegisterInfo();
+
+  unsigned stackReg  = RI.getFrameRegister(*MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF->getFrameInfo().getStackSize();
+
+  getTargetStreamer().emitFrame(stackReg, stackSize, returnReg);
+}
+
+/// Emit Set directives.
+const char *MipsAsmPrinter::getCurrentABIString() const {
+  switch (static_cast<MipsTargetMachine &>(TM).getABI().GetEnumValue()) {
+  case MipsABIInfo::ABI::O32:  return "abi32";
+  case MipsABIInfo::ABI::N32:  return "abiN32";
+  case MipsABIInfo::ABI::N64:  return "abi64";
+  default: llvm_unreachable("Unknown Mips ABI");
+  }
+}
+
+void MipsAsmPrinter::EmitFunctionEntryLabel() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // NaCl sandboxing requires that indirect call instructions are masked.
+  // This means that function entry points should be bundle-aligned.
+  if (Subtarget->isTargetNaCl())
+    EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
+
+  if (Subtarget->inMicroMipsMode()) {
+    TS.emitDirectiveSetMicroMips();
+    TS.setUsesMicroMips();
+  } else
+    TS.emitDirectiveSetNoMicroMips();
+
+  if (Subtarget->inMips16Mode())
+    TS.emitDirectiveSetMips16();
+  else
+    TS.emitDirectiveSetNoMips16();
+
+  TS.emitDirectiveEnt(*CurrentFnSym);
+  OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+/// EmitFunctionBodyStart - Targets can override this to emit stuff before
+/// the first basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyStart() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  MCInstLowering.Initialize(&MF->getContext());
+
+  bool IsNakedFunction = MF->getFunction()->hasFnAttribute(Attribute::Naked);
+  if (!IsNakedFunction)
+    emitFrameDirective();
+
+  if (!IsNakedFunction)
+    printSavedRegsBitmask();
+
+  if (!Subtarget->inMips16Mode()) {
+    TS.emitDirectiveSetNoReorder();
+    TS.emitDirectiveSetNoMacro();
+    TS.emitDirectiveSetNoAt();
+  }
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyEnd() {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // There are instruction for this macros, but they must
+  // always be at the function end, and we can't emit and
+  // break with BB logic.
+  if (!Subtarget->inMips16Mode()) {
+    TS.emitDirectiveSetAt();
+    TS.emitDirectiveSetMacro();
+    TS.emitDirectiveSetReorder();
+  }
+  TS.emitDirectiveEnd(CurrentFnSym->getName());
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void MipsAsmPrinter::EmitBasicBlockEnd(const MachineBasicBlock &MBB) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+  if (MBB.size() == 0)
+    TS.emitDirectiveInsn();
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                       MBB) const {
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *bb = Pred->getBasicBlock())
+    if (isa<SwitchInst>(bb->getTerminator()))
+      return false;
+
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isEHPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) ;
+
+  return !I->isBarrier();
+}
+
+// Print out an operand for an inline asm expression.
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                     unsigned AsmVariant, const char *ExtraCode,
+                                     raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNum);
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+    case 'X': // hex const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << Twine::utohexstr(MO.getImm());
+      return false;
+    case 'x': // hex const int (low 16 bits)
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << Twine::utohexstr(MO.getImm() & 0xffff);
+      return false;
+    case 'd': // decimal const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm();
+      return false;
+    case 'm': // decimal const int minus 1
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm() - 1;
+      return false;
+    case 'z': {
+      // $0 if zero, regular printing otherwise
+      if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) {
+        O << "$0";
+        return false;
+      }
+      // If not, call printOperand as normal.
+      break;
+    }
+    case 'D': // Second part of a double word register operand
+    case 'L': // Low order register of a double word register operand
+    case 'M': // High order register of a double word register operand
+    {
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      // Number of registers represented by this operand. We are looking
+      // for 2 for 32 bit mode and 1 for 64 bit mode.
+      if (NumVals != 2) {
+        if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
+          unsigned Reg = MO.getReg();
+          O << '$' << MipsInstPrinter::getRegisterName(Reg);
+          return false;
+        }
+        return true;
+      }
+
+      unsigned RegOp = OpNum;
+      if (!Subtarget->isGP64bit()){
+        // Endianness reverses which register holds the high or low value
+        // between M and L.
+        switch(ExtraCode[0]) {
+        case 'M':
+          RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum;
+          break;
+        case 'L':
+          RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1;
+          break;
+        case 'D': // Always the second part
+          RegOp = OpNum + 1;
+        }
+        if (RegOp >= MI->getNumOperands())
+          return true;
+        const MachineOperand &MO = MI->getOperand(RegOp);
+        if (!MO.isReg())
+          return true;
+        unsigned Reg = MO.getReg();
+        O << '$' << MipsInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+    }
+    case 'w':
+      // Print MSA registers for the 'f' constraint
+      // In LLVM, the 'w' modifier doesn't need to do anything.
+      // We can just call printOperand as normal.
+      break;
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                           unsigned OpNum, unsigned AsmVariant,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  assert(OpNum + 1 < MI->getNumOperands() && "Insufficient operands");
+  const MachineOperand &BaseMO = MI->getOperand(OpNum);
+  const MachineOperand &OffsetMO = MI->getOperand(OpNum + 1);
+  assert(BaseMO.isReg() && "Unexpected base pointer for inline asm memory operand.");
+  assert(OffsetMO.isImm() && "Unexpected offset for inline asm memory operand.");
+  int Offset = OffsetMO.getImm();
+
+  // Currently we are expecting either no ExtraCode or 'D'
+  if (ExtraCode) {
+    if (ExtraCode[0] == 'D')
+      Offset += 4;
+    else
+      return true; // Unknown modifier.
+    // FIXME: M = high order bits
+    // FIXME: L = low order bits
+  }
+
+  O << Offset << "($" << MipsInstPrinter::getRegisterName(BaseMO.getReg()) << ")";
+
+  return false;
+}
+
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                  raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  bool closeP = false;
+
+  if (MO.getTargetFlags())
+    closeP = true;
+
+  switch(MO.getTargetFlags()) {
+  case MipsII::MO_GPREL:    O << "%gp_rel("; break;
+  case MipsII::MO_GOT_CALL: O << "%call16("; break;
+  case MipsII::MO_GOT:      O << "%got(";    break;
+  case MipsII::MO_ABS_HI:   O << "%hi(";     break;
+  case MipsII::MO_ABS_LO:   O << "%lo(";     break;
+  case MipsII::MO_TLSGD:    O << "%tlsgd(";  break;
+  case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+  case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+  case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+  case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+  case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+  case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+  case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+  case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
+  }
+
+  switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      O << '$'
+        << StringRef(MipsInstPrinter::getRegisterName(MO.getReg())).lower();
+      break;
+
+    case MachineOperand::MO_Immediate:
+      O << MO.getImm();
+      break;
+
+    case MachineOperand::MO_MachineBasicBlock:
+      MO.getMBB()->getSymbol()->print(O, MAI);
+      return;
+
+    case MachineOperand::MO_GlobalAddress:
+      getSymbol(MO.getGlobal())->print(O, MAI);
+      break;
+
+    case MachineOperand::MO_BlockAddress: {
+      MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+      O << BA->getName();
+      break;
+    }
+
+    case MachineOperand::MO_ConstantPoolIndex:
+      O << getDataLayout().getPrivateGlobalPrefix() << "CPI"
+        << getFunctionNumber() << "_" << MO.getIndex();
+      if (MO.getOffset())
+        O << "+" << MO.getOffset();
+      break;
+
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  if (closeP) O << ")";
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if instruction has reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    opNum = MI->getNumOperands() - 2;
+    break;
+  }
+
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsAsmPrinter::
+printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsAsmPrinter::
+printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+void MipsAsmPrinter::
+printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  for (int i = opNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != opNum) O << ", ";
+    printOperand(MI, i, O);
+  }
+}
+
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // MipsTargetStreamer has an initialization order problem when emitting an
+  // object file directly (see MipsTargetELFStreamer for full details). Work
+  // around it by re-initializing the PIC state here.
+  TS.setPic(OutContext.getObjectFileInfo()->isPositionIndependent());
+
+  // Compute MIPS architecture attributes based on the default subtarget
+  // that we'd have constructed. Module level directives aren't LTO
+  // clean anyhow.
+  // FIXME: For ifunc related functions we could iterate over and look
+  // for a feature string that doesn't match the default one.
+  const Triple &TT = TM.getTargetTriple();
+  StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
+  StringRef FS = TM.getTargetFeatureString();
+  const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
+  const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM);
+
+  bool IsABICalls = STI.isABICalls();
+  const MipsABIInfo &ABI = MTM.getABI();
+  if (IsABICalls) {
+    TS.emitDirectiveAbiCalls();
+    // FIXME: This condition should be a lot more complicated that it is here.
+    //        Ideally it should test for properties of the ABI and not the ABI
+    //        itself.
+    //        For the moment, I'm only correcting enough to make MIPS-IV work.
+    if (!isPositionIndependent() && !ABI.IsN64())
+      TS.emitDirectiveOptionPic0();
+  }
+
+  // Tell the assembler which ABI we are using
+  std::string SectionName = std::string(".mdebug.") + getCurrentABIString();
+  OutStreamer->SwitchSection(
+      OutContext.getELFSection(SectionName, ELF::SHT_PROGBITS, 0));
+
+  // NaN: At the moment we only support:
+  // 1. .nan legacy (default)
+  // 2. .nan 2008
+  STI.isNaN2008() ? TS.emitDirectiveNaN2008()
+                  : TS.emitDirectiveNaNLegacy();
+
+  // TODO: handle O64 ABI
+
+  TS.updateABIInfo(STI);
+
+  // We should always emit a '.module fp=...' but binutils 2.24 does not accept
+  // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
+  // -mfp64) and omit it otherwise.
+  if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit()))
+    TS.emitDirectiveModuleFP();
+
+  // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
+  // accept it. We therefore emit it when it contradicts the default or an
+  // option has changed the default (i.e. FPXX) and omit it otherwise.
+  if (ABI.IsO32() && (!STI.useOddSPReg() || STI.isABI_FPXX()))
+    TS.emitDirectiveModuleOddSPReg();
+}
+
+void MipsAsmPrinter::emitInlineAsmStart() const {
+  MipsTargetStreamer &TS = getTargetStreamer();
+
+  // GCC's choice of assembler options for inline assembly code ('at', 'macro'
+  // and 'reorder') is different from LLVM's choice for generated code ('noat',
+  // 'nomacro' and 'noreorder').
+  // In order to maintain compatibility with inline assembly code which depends
+  // on GCC's assembler options being used, we have to switch to those options
+  // for the duration of the inline assembly block and then switch back.
+  TS.emitDirectiveSetPush();
+  TS.emitDirectiveSetAt();
+  TS.emitDirectiveSetMacro();
+  TS.emitDirectiveSetReorder();
+  OutStreamer->AddBlankLine();
+}
+
+void MipsAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                                      const MCSubtargetInfo *EndInfo) const {
+  OutStreamer->AddBlankLine();
+  getTargetStreamer().emitDirectiveSetPop();
+}
+
+void MipsAsmPrinter::EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol) {
+  MCInst I;
+  I.setOpcode(Mips::JAL);
+  I.addOperand(
+      MCOperand::createExpr(MCSymbolRefExpr::create(Symbol, OutContext)));
+  OutStreamer->EmitInstruction(I, STI);
+}
+
+void MipsAsmPrinter::EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                                  unsigned Reg) {
+  MCInst I;
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::createReg(Reg));
+  OutStreamer->EmitInstruction(I, STI);
+}
+
+void MipsAsmPrinter::EmitInstrRegReg(const MCSubtargetInfo &STI,
+                                     unsigned Opcode, unsigned Reg1,
+                                     unsigned Reg2) {
+  MCInst I;
+  //
+  // Because of the current td files for Mips32, the operands for MTC1
+  // appear backwards from their normal assembly order. It's not a trivial
+  // change to fix this in the td file so we adjust for it here.
+  //
+  if (Opcode == Mips::MTC1) {
+    unsigned Temp = Reg1;
+    Reg1 = Reg2;
+    Reg2 = Temp;
+  }
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::createReg(Reg1));
+  I.addOperand(MCOperand::createReg(Reg2));
+  OutStreamer->EmitInstruction(I, STI);
+}
+
+void MipsAsmPrinter::EmitInstrRegRegReg(const MCSubtargetInfo &STI,
+                                        unsigned Opcode, unsigned Reg1,
+                                        unsigned Reg2, unsigned Reg3) {
+  MCInst I;
+  I.setOpcode(Opcode);
+  I.addOperand(MCOperand::createReg(Reg1));
+  I.addOperand(MCOperand::createReg(Reg2));
+  I.addOperand(MCOperand::createReg(Reg3));
+  OutStreamer->EmitInstruction(I, STI);
+}
+
+void MipsAsmPrinter::EmitMovFPIntPair(const MCSubtargetInfo &STI,
+                                      unsigned MovOpc, unsigned Reg1,
+                                      unsigned Reg2, unsigned FPReg1,
+                                      unsigned FPReg2, bool LE) {
+  if (!LE) {
+    unsigned temp = Reg1;
+    Reg1 = Reg2;
+    Reg2 = temp;
+  }
+  EmitInstrRegReg(STI, MovOpc, Reg1, FPReg1);
+  EmitInstrRegReg(STI, MovOpc, Reg2, FPReg2);
+}
+
+void MipsAsmPrinter::EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+                                         Mips16HardFloatInfo::FPParamVariant PV,
+                                         bool LE, bool ToFP) {
+  using namespace Mips16HardFloatInfo;
+  unsigned MovOpc = ToFP ? Mips::MTC1 : Mips::MFC1;
+  switch (PV) {
+  case FSig:
+    EmitInstrRegReg(STI, MovOpc, Mips::A0, Mips::F12);
+    break;
+  case FFSig:
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F14, LE);
+    break;
+  case FDSig:
+    EmitInstrRegReg(STI, MovOpc, Mips::A0, Mips::F12);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  case DSig:
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    break;
+  case DDSig:
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A2, Mips::A3, Mips::F14, Mips::F15, LE);
+    break;
+  case DFSig:
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F12, Mips::F13, LE);
+    EmitInstrRegReg(STI, MovOpc, Mips::A2, Mips::F14);
+    break;
+  case NoSig:
+    return;
+  }
+}
+
+void MipsAsmPrinter::EmitSwapFPIntRetval(
+    const MCSubtargetInfo &STI, Mips16HardFloatInfo::FPReturnVariant RV,
+    bool LE) {
+  using namespace Mips16HardFloatInfo;
+  unsigned MovOpc = Mips::MFC1;
+  switch (RV) {
+  case FRet:
+    EmitInstrRegReg(STI, MovOpc, Mips::V0, Mips::F0);
+    break;
+  case DRet:
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    break;
+  case CFRet:
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    break;
+  case CDRet:
+    EmitMovFPIntPair(STI, MovOpc, Mips::V0, Mips::V1, Mips::F0, Mips::F1, LE);
+    EmitMovFPIntPair(STI, MovOpc, Mips::A0, Mips::A1, Mips::F2, Mips::F3, LE);
+    break;
+  case NoFPRet:
+    break;
+  }
+}
+
+void MipsAsmPrinter::EmitFPCallStub(
+    const char *Symbol, const Mips16HardFloatInfo::FuncSignature *Signature) {
+  MCSymbol *MSymbol = OutContext.getOrCreateSymbol(StringRef(Symbol));
+  using namespace Mips16HardFloatInfo;
+  bool LE = getDataLayout().isLittleEndian();
+  // Construct a local MCSubtargetInfo here.
+  // This is because the MachineFunction won't exist (but have not yet been
+  // freed) and since we're at the global level we can use the default
+  // constructed subtarget.
+  std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+      TM.getTargetTriple().str(), TM.getTargetCPU(),
+      TM.getTargetFeatureString()));
+
+  //
+  // .global xxxx
+  //
+  OutStreamer->EmitSymbolAttribute(MSymbol, MCSA_Global);
+  const char *RetType;
+  //
+  // make the comment field identifying the return and parameter
+  // types of the floating point stub
+  // # Stub function to call rettype xxxx (params)
+  //
+  switch (Signature->RetSig) {
+  case FRet:
+    RetType = "float";
+    break;
+  case DRet:
+    RetType = "double";
+    break;
+  case CFRet:
+    RetType = "complex";
+    break;
+  case CDRet:
+    RetType = "double complex";
+    break;
+  case NoFPRet:
+    RetType = "";
+    break;
+  }
+  const char *Parms;
+  switch (Signature->ParamSig) {
+  case FSig:
+    Parms = "float";
+    break;
+  case FFSig:
+    Parms = "float, float";
+    break;
+  case FDSig:
+    Parms = "float, double";
+    break;
+  case DSig:
+    Parms = "double";
+    break;
+  case DDSig:
+    Parms = "double, double";
+    break;
+  case DFSig:
+    Parms = "double, float";
+    break;
+  case NoSig:
+    Parms = "";
+    break;
+  }
+  OutStreamer->AddComment("\t# Stub function to call " + Twine(RetType) + " " +
+                          Twine(Symbol) + " (" + Twine(Parms) + ")");
+  //
+  // probably not necessary but we save and restore the current section state
+  //
+  OutStreamer->PushSection();
+  //
+  // .section mips16.call.fpxxxx,"ax",@progbits
+  //
+  MCSectionELF *M = OutContext.getELFSection(
+      ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_EXECINSTR);
+  OutStreamer->SwitchSection(M, nullptr);
+  //
+  // .align 2
+  //
+  OutStreamer->EmitValueToAlignment(4);
+  MipsTargetStreamer &TS = getTargetStreamer();
+  //
+  // .set nomips16
+  // .set nomicromips
+  //
+  TS.emitDirectiveSetNoMips16();
+  TS.emitDirectiveSetNoMicroMips();
+  //
+  // .ent __call_stub_fp_xxxx
+  // .type  __call_stub_fp_xxxx,@function
+  //  __call_stub_fp_xxxx:
+  //
+  std::string x = "__call_stub_fp_" + std::string(Symbol);
+  MCSymbolELF *Stub =
+      cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x)));
+  TS.emitDirectiveEnt(*Stub);
+  MCSymbol *MType =
+      OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
+  OutStreamer->EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
+  OutStreamer->EmitLabel(Stub);
+
+  // Only handle non-pic for now.
+  assert(!isPositionIndependent() &&
+         "should not be here if we are compiling pic");
+  TS.emitDirectiveSetReorder();
+  //
+  // We need to add a MipsMCExpr class to MCTargetDesc to fully implement
+  // stubs without raw text but this current patch is for compiler generated
+  // functions and they all return some value.
+  // The calling sequence for non pic is different in that case and we need
+  // to implement %lo and %hi in order to handle the case of no return value
+  // See the corresponding method in Mips16HardFloat for details.
+  //
+  // mov the return address to S2.
+  // we have no stack space to store it and we are about to make another call.
+  // We need to make sure that the enclosing function knows to save S2
+  // This should have already been handled.
+  //
+  // Mov $18, $31
+
+  EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO);
+
+  EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true);
+
+  // Jal xxxx
+  //
+  EmitJal(*STI, MSymbol);
+
+  // fix return values
+  EmitSwapFPIntRetval(*STI, Signature->RetSig, LE);
+  //
+  // do the return
+  // if (Signature->RetSig == NoFPRet)
+  //  llvm_unreachable("should not be any stubs here with no return value");
+  // else
+  EmitInstrReg(*STI, Mips::JR, Mips::S2);
+
+  MCSymbol *Tmp = OutContext.createTempSymbol();
+  OutStreamer->EmitLabel(Tmp);
+  const MCSymbolRefExpr *E = MCSymbolRefExpr::create(Stub, OutContext);
+  const MCSymbolRefExpr *T = MCSymbolRefExpr::create(Tmp, OutContext);
+  const MCExpr *T_min_E = MCBinaryExpr::createSub(T, E, OutContext);
+  OutStreamer->emitELFSize(Stub, T_min_E);
+  TS.emitDirectiveEnd(x);
+  OutStreamer->PopSection();
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // Emit needed stubs
+  //
+  for (std::map<
+           const char *,
+           const llvm::Mips16HardFloatInfo::FuncSignature *>::const_iterator
+           it = StubsNeeded.begin();
+       it != StubsNeeded.end(); ++it) {
+    const char *Symbol = it->first;
+    const llvm::Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+    EmitFPCallStub(Symbol, Signature);
+  }
+  // return to the text section
+  OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
+}
+
+void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &OS) {
+  // TODO: implement
+}
+
+// Align all targets of indirect branches on bundle size.  Used only if target
+// is NaCl.
+void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
+  // Align all blocks that are jumped to through jump table.
+  if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
+    const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables();
+    for (unsigned I = 0; I < JT.size(); ++I) {
+      const std::vector<MachineBasicBlock*> &MBBs = JT[I].MBBs;
+
+      for (unsigned J = 0; J < MBBs.size(); ++J)
+        MBBs[J]->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+    }
+  }
+
+  // If basic block address is taken, block can be target of indirect branch.
+  for (auto &MBB : MF) {
+    if (MBB.hasAddressTaken())
+      MBB.setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+  }
+}
+
+bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
+  return (Opcode == Mips::LONG_BRANCH_LUi
+          || Opcode == Mips::LONG_BRANCH_ADDiu
+          || Opcode == Mips::LONG_BRANCH_DADDiu);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMipsAsmPrinter() {
+  RegisterAsmPrinter<MipsAsmPrinter> X(getTheMipsTarget());
+  RegisterAsmPrinter<MipsAsmPrinter> Y(getTheMipselTarget());
+  RegisterAsmPrinter<MipsAsmPrinter> A(getTheMips64Target());
+  RegisterAsmPrinter<MipsAsmPrinter> B(getTheMips64elTarget());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
new file mode 100644
index 000000000000..259e557e1283
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -0,0 +1,147 @@
+//===-- MipsAsmPrinter.h - Mips LLVM Assembly Printer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Mips Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
+
+#include "Mips16HardFloatInfo.h"
+#include "MipsMCInstLower.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineInstr;
+class MachineBasicBlock;
+class MipsTargetStreamer;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+  MipsTargetStreamer &getTargetStreamer() const;
+
+  void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+
+private:
+  // tblgen'erated function.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  // Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
+  // and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
+  // for the target.
+  void emitPseudoIndirectBranch(MCStreamer &OutStreamer,
+                                const MachineInstr *MI);
+
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+
+  std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+  StubsNeeded;
+
+  void emitInlineAsmStart() const override;
+
+  void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+                        const MCSubtargetInfo *EndInfo) const override;
+
+  void EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol);
+
+  void EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode, unsigned Reg);
+
+  void EmitInstrRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                       unsigned Reg1, unsigned Reg2);
+
+  void EmitInstrRegRegReg(const MCSubtargetInfo &STI, unsigned Opcode,
+                          unsigned Reg1, unsigned Reg2, unsigned Reg3);
+
+  void EmitMovFPIntPair(const MCSubtargetInfo &STI, unsigned MovOpc,
+                        unsigned Reg1, unsigned Reg2, unsigned FPReg1,
+                        unsigned FPReg2, bool LE);
+
+  void EmitSwapFPIntParams(const MCSubtargetInfo &STI,
+                           Mips16HardFloatInfo::FPParamVariant, bool LE,
+                           bool ToFP);
+
+  void EmitSwapFPIntRetval(const MCSubtargetInfo &STI,
+                           Mips16HardFloatInfo::FPReturnVariant, bool LE);
+
+  void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *);
+
+  void NaClAlignIndirectJumpTargets(MachineFunction &MF);
+
+  bool isLongBranchPseudo(int Opcode) const;
+
+public:
+
+  const MipsSubtarget *Subtarget;
+  const MipsFunctionInfo *MipsFI;
+  MipsMCInstLower MCInstLowering;
+
+  explicit MipsAsmPrinter(TargetMachine &TM,
+                          std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MCP(nullptr),
+        InConstantPool(false), MCInstLowering(*this) {}
+
+  StringRef getPassName() const override { return "Mips Assembly Printer"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void EmitConstantPool() override {
+    bool UsingConstantPools =
+      (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
+    if (!UsingConstantPools)
+      AsmPrinter::EmitConstantPool();
+    // we emit constant pools customly!
+  }
+
+  void EmitInstruction(const MachineInstr *MI) override;
+  void printSavedRegsBitmask();
+  void emitFrameDirective();
+  const char *getCurrentABIString() const;
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override;
+  bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock* MBB) const override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void EmitStartOfAsmFile(Module &M) override;
+  void EmitEndOfAsmFile(Module &M) override;
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
new file mode 100644
index 000000000000..7af988c1f64d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -0,0 +1,136 @@
+//===---- MipsCCState.cpp - CCState with Mips specific extensions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsCCState.h"
+#include "MipsSubtarget.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+/// This function returns true if CallSym is a long double emulation routine.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const LibCalls[] = {
+      "__addtf3",      "__divtf3",     "__eqtf2",       "__extenddftf2",
+      "__extendsftf2", "__fixtfdi",    "__fixtfsi",     "__fixtfti",
+      "__fixunstfdi",  "__fixunstfsi", "__fixunstfti",  "__floatditf",
+      "__floatsitf",   "__floattitf",  "__floatunditf", "__floatunsitf",
+      "__floatuntitf", "__getf2",      "__gttf2",       "__letf2",
+      "__lttf2",       "__multf3",     "__netf2",       "__powitf2",
+      "__subtf3",      "__trunctfdf2", "__trunctfsf2",  "__unordtf2",
+      "ceill",         "copysignl",    "cosl",          "exp2l",
+      "expl",          "floorl",       "fmal",          "fmodl",
+      "log10l",        "log2l",        "logl",          "nearbyintl",
+      "powl",          "rintl",        "roundl",        "sinl",
+      "sqrtl",         "truncl"};
+
+  // Check that LibCalls is sorted alphabetically.
+  auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
+  assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp));
+  return std::binary_search(std::begin(LibCalls), std::end(LibCalls),
+                            CallSym, Comp);
+}
+
+/// This function returns true if Ty is fp128, {f128} or i128 which was
+/// originally a fp128.
+static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  if (Ty->isStructTy() && Ty->getStructNumElements() == 1 &&
+      Ty->getStructElementType(0)->isFP128Ty())
+    return true;
+
+  const ExternalSymbolSDNode *ES =
+      dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+
+  // If the Ty is i128 and the function being called is a long double emulation
+  // routine, then the original type is f128.
+  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+}
+
+MipsCCState::SpecialCallingConvType
+MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
+                                            const MipsSubtarget &Subtarget) {
+  MipsCCState::SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv;
+  if (Subtarget.inMips16HardFloat()) {
+    if (const GlobalAddressSDNode *G =
+            dyn_cast<const GlobalAddressSDNode>(Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+        SpecialCallingConv = Mips16RetHelperConv;
+      }
+    }
+  }
+  return SpecialCallingConv;
+}
+
+void MipsCCState::PreAnalyzeCallResultForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    const TargetLowering::CallLoweringInfo &CLI) {
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode()));
+    OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy());
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this for use by RetCC_MipsN.
+void MipsCCState::PreAnalyzeReturnForF128(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  const MachineFunction &MF = getMachineFunction();
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(MF.getFunction()->getReturnType(), nullptr));
+    OriginalArgWasFloat.push_back(
+        MF.getFunction()->getReturnType()->isFloatingPointTy());
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeCallOperands(
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+    const SDNode *CallNode) {
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode));
+    OriginalArgWasFloat.push_back(
+        FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy());
+    CallOperandIsFixed.push_back(Outs[i].IsFixed);
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeFormalArgumentsForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins) {
+  const MachineFunction &MF = getMachineFunction();
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+
+    // SRet arguments cannot originate from f128 or {f128} returns so we just
+    // push false. We have to handle this specially since SRet arguments
+    // aren't mapped to an original argument.
+    if (Ins[i].Flags.isSRet()) {
+      OriginalArgWasF128.push_back(false);
+      OriginalArgWasFloat.push_back(false);
+      continue;
+    }
+
+    assert(Ins[i].getOrigArgIndex() < MF.getFunction()->arg_size());
+    std::advance(FuncArg, Ins[i].getOrigArgIndex());
+
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(FuncArg->getType(), nullptr));
+    OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy());
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.h b/contrib/llvm/lib/Target/Mips/MipsCCState.h
new file mode 100644
index 000000000000..081c393a09be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.h
@@ -0,0 +1,136 @@
+//===---- MipsCCState.h - CCState with Mips specific extensions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSCCSTATE_H
+#define MIPSCCSTATE_H
+
+#include "MipsISelLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+class SDNode;
+class MipsSubtarget;
+
+class MipsCCState : public CCState {
+public:
+  enum SpecialCallingConvType { Mips16RetHelperConv, NoSpecialCallingConv };
+
+  /// Determine the SpecialCallingConvType for the given callee
+  static SpecialCallingConvType
+  getSpecialCallingConvForCallee(const SDNode *Callee,
+                                 const MipsSubtarget &Subtarget);
+
+private:
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this for use by RetCC_MipsN.
+  void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   const TargetLowering::CallLoweringInfo &CLI);
+
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this for use by RetCC_MipsN.
+  void PreAnalyzeReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs);
+
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this.
+  void
+  PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+                         const SDNode *CallNode);
+
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this.
+  void
+  PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+  /// Records whether the value has been lowered from an f128.
+  SmallVector<bool, 4> OriginalArgWasF128;
+
+  /// Records whether the value has been lowered from float.
+  SmallVector<bool, 4> OriginalArgWasFloat;
+
+  /// Records whether the value was a fixed argument.
+  /// See ISD::OutputArg::IsFixed,
+  SmallVector<bool, 4> CallOperandIsFixed;
+
+  // Used to handle MIPS16-specific calling convention tweaks.
+  // FIXME: This should probably be a fully fledged calling convention.
+  SpecialCallingConvType SpecialCallingConv;
+
+public:
+  MipsCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+              SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+              SpecialCallingConvType SpecialCC = NoSpecialCallingConv)
+      : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {}
+
+  void
+  AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      CCAssignFn Fn,
+                      std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+                      const SDNode *CallNode) {
+    PreAnalyzeCallOperands(Outs, FuncArgs, CallNode);
+    CCState::AnalyzeCallOperands(Outs, Fn);
+    OriginalArgWasF128.clear();
+    OriginalArgWasFloat.clear();
+    CallOperandIsFixed.clear();
+  }
+
+  // The AnalyzeCallOperands in the base class is not usable since we must
+  // provide a means of accessing ArgListEntry::IsFixed. Delete them from this
+  // class. This doesn't stop them being used via the base class though.
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           CCAssignFn Fn) = delete;
+  void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           CCAssignFn Fn) = delete;
+
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              CCAssignFn Fn) {
+    PreAnalyzeFormalArgumentsForF128(Ins);
+    CCState::AnalyzeFormalArguments(Ins, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                         CCAssignFn Fn,
+                         const TargetLowering::CallLoweringInfo &CLI) {
+    PreAnalyzeCallResultForF128(Ins, CLI);
+    CCState::AnalyzeCallResult(Ins, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     CCAssignFn Fn) {
+    PreAnalyzeReturnForF128(Outs);
+    CCState::AnalyzeReturn(Outs, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+                   CCAssignFn Fn) {
+    PreAnalyzeReturnForF128(ArgsFlags);
+    bool Return = CCState::CheckReturn(ArgsFlags, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+    return Return;
+  }
+
+  bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; }
+  bool WasOriginalArgFloat(unsigned ValNo) {
+      return OriginalArgWasFloat[ValNo];
+  }
+  bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
+  SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 000000000000..a57cb7badc17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,406 @@
+//===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A, string Invert = "">
+    : CCIf<!strconcat(Invert,
+                      "static_cast<const MipsSubtarget&>"
+			"(State.getMachineFunction().getSubtarget()).",
+                      F),
+           A>;
+
+// The inverse of CCIfSubtarget
+class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
+
+/// Match if the original argument (before lowering) was a float.
+/// For example, this is true for i32's that were lowered from soft-float.
+class CCIfOrigArgWasNotFloat<CCAction A>
+    : CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
+           A>;
+
+/// Match if the original argument (before lowering) was a 128-bit float (i.e.
+/// long double).
+class CCIfOrigArgWasF128<CCAction A>
+    : CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)", A>;
+
+/// Match if this specific argument is a vararg.
+/// This is slightly different fro CCIfIsVarArg which matches if any argument is
+/// a vararg.
+class CCIfArgIsVarArg<CCAction A>
+    : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
+
+
+/// Match if the special calling conv is the specified value.
+class CCIfSpecialCallingConv<string CC, CCAction A>
+    : CCIf<"static_cast<MipsCCState *>(&State)->getSpecialCallingConv() == "
+               "MipsCCState::" # CC, A>;
+
+// For soft-float, f128 values are returned in A0_64 rather than V1_64.
+def RetCC_F128SoftFloat : CallingConv<[
+  CCAssignToReg<[V0_64, A0_64]>
+]>;
+
+// For hard-float, f128 values are returned as a pair of f64's rather than a
+// pair of i64's.
+def RetCC_F128HardFloat : CallingConv<[
+  CCBitConvertToType<f64>,
+
+  // Contrary to the ABI documentation, a struct containing a long double is
+  // returned in $f0, and $f1 instead of the usual $f0, and $f2. This is to
+  // match the de facto ABI as implemented by GCC.
+  CCIfInReg<CCAssignToReg<[D0_64, D1_64]>>,
+
+  CCAssignToReg<[D0_64, D2_64]>
+]>;
+
+// Handle F128 specially since we can't identify the original type during the
+// tablegen-erated code.
+def RetCC_F128 : CallingConv<[
+  CCIfSubtarget<"useSoftFloat()",
+      CCIfType<[i64], CCDelegateTo<RetCC_F128SoftFloat>>>,
+  CCIfSubtargetNot<"useSoftFloat()",
+      CCIfType<[i64], CCDelegateTo<RetCC_F128HardFloat>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsO32 : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Integer values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+// Only the return rules are defined here for O32. The rules for argument
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+  // Promote i1/i8/i16 return values to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // i32 are returned in registers V0, V1, A0, A1
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 arguments are returned in D0_64 and D2_64 in FP64bit mode or
+  // in D0 and D1 in FP32bit mode.
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D2_64]>>>,
+  CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+def CC_MipsO32_FP32 : CustomCallingConv;
+def CC_MipsO32_FP64 : CustomCallingConv;
+
+def CC_MipsO32_FP : CallingConv<[
+  CCIfSubtargetNot<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP32>>,
+  CCIfSubtarget<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP64>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN_SoftFloat : CallingConv<[
+  CCAssignToRegWithShadow<[A0, A1, A2, A3,
+                           T0, T1, T2, T3],
+                          [D12_64, D13_64, D14_64, D15_64,
+                           D16_64, D17_64, D18_64, D19_64]>,
+  CCAssignToStack<4, 8>
+]>;
+
+def CC_MipsN : CallingConv<[
+  CCIfType<[i8, i16, i32, i64],
+      CCIfSubtargetNot<"isLittle()",
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+  // All integers (except soft-float integers) are promoted to 64-bit.
+  CCIfType<[i8, i16, i32], CCIfOrigArgWasNotFloat<CCPromoteToType<i64>>>,
+
+  // The only i32's we have left are soft-float arguments.
+  CCIfSubtarget<"useSoftFloat()", CCIfType<[i32], CCDelegateTo<CC_MipsN_SoftFloat>>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64],
+                                          [D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64]>>,
+
+  // f32 arguments are passed in single precision FP registers.
+  CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+                                           F16, F17, F18, F19],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // f64 arguments are passed in double precision FP registers.
+  CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// N32/64 variable arguments.
+// All arguments are passed in integer registers.
+def CC_MipsN_VarArg : CallingConv<[
+  CCIfType<[i8, i16, i32, i64],
+      CCIfSubtargetNot<"isLittle()",
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+  // All integers are promoted to 64-bit.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  CCIfType<[f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  CCIfType<[i64, f64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64,
+                                      T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+  // f128 needs to be handled similarly to f32 and f64. However, f128 is not
+  // legal and is lowered to i128 which is further lowered to a pair of i64's.
+  // This presents us with a problem for the calling convention since hard-float
+  // still needs to pass them in FPU registers, and soft-float needs to use $v0,
+  // and $a0 instead of the usual $v0, and $v1. We therefore resort to a
+  // pre-analyze (see PreAnalyzeReturnForF128()) step to pass information on
+  // whether the result was originally an f128 into the tablegen-erated code.
+  //
+  // f128 should only occur for the N64 ABI where long double is 128-bit. On
+  // N32, long double is equivalent to double.
+  CCIfType<[i64], CCIfOrigArgWasF128<CCDelegateTo<RetCC_F128>>>,
+
+  // Aggregate returns are positioned at the lowest address in the slot for
+  // both little and big-endian targets. When passing in registers, this
+  // requires that big-endian targets shift the value into the upper bits.
+  CCIfSubtarget<"isLittle()",
+      CCIfType<[i8, i16, i32, i64], CCIfInReg<CCPromoteToType<i64>>>>,
+  CCIfSubtargetNot<"isLittle()",
+      CCIfType<[i8, i16, i32, i64],
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+  // i64 are returned in registers V0_64, V1_64
+  CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in registers D0, D2
+  CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips FastCC Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_MipsO32_FastCC : CallingConv<[
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()",
+                                   CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6,
+                                                  D7, D8, D9]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"useOddSPReg()",
+                                CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+                                               D4_64, D5_64, D6_64, D7_64,
+                                               D8_64, D9_64, D10_64, D11_64,
+                                               D12_64, D13_64, D14_64, D15_64,
+                                               D16_64, D17_64, D18_64,
+                                               D19_64]>>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"noOddSPReg()",
+                                CCAssignToReg<[D0_64, D2_64, D4_64, D6_64,
+                                               D8_64, D10_64, D12_64, D14_64,
+                                               D16_64, D18_64]>>>>,
+
+  // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_MipsN_FastCC : CallingConv<[
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64,
+                                 T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+                                 T8_64, V1_64]>>,
+
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64,
+                                 D6_64, D7_64, D8_64, D9_64, D10_64, D11_64,
+                                 D12_64, D13_64, D14_64, D15_64, D16_64, D17_64,
+                                 D18_64, D19_64]>>,
+
+  // Stack parameter slots for i64 and f64 are 64-bit doublewords and
+  // 8-byte aligned.
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_Mips_FastCC : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers. All scratch registers,
+  // except for AT, V0 and T9, are available to be used as argument registers.
+  CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()",
+      CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
+
+  // In NaCl, T6, T7 and T8 are reserved and not available as argument
+  // registers for fastcc.  T6 contains the mask for sandboxing control flow
+  // (indirect jumps and calls).  T7 contains the mask for sandboxing memory
+  // accesses (loads and stores).  T8 contains the thread pointer.
+  CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()",
+      CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
+
+  // f32 arguments are passed in single-precision floating pointer registers.
+  CCIfType<[f32], CCIfSubtarget<"useOddSPReg()",
+      CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+                     F14, F15, F16, F17, F18, F19]>>>,
+
+  // Don't use odd numbered single-precision registers for -mno-odd-spreg.
+  CCIfType<[f32], CCIfSubtarget<"noOddSPReg()",
+      CCAssignToReg<[F0, F2, F4, F6, F8, F10, F12, F14, F16, F18]>>>,
+
+  // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
+  CCDelegateTo<CC_MipsN_FastCC>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def RetCC_Mips : CallingConv<[
+  CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+  CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
+  CCDelegateTo<RetCC_MipsO32>
+]>;
+
+def CC_Mips_ByVal : CallingConv<[
+  CCIfSubtarget<"isABI_O32()", CCIfByVal<CCPassByVal<4, 4>>>,
+  CCIfByVal<CCPassByVal<8, 8>>
+]>;
+
+def CC_Mips16RetHelper : CallingConv<[
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
+def CC_Mips_FixedArg : CallingConv<[
+  // Mips16 needs special handling on some functions.
+  CCIf<"State.getCallingConv() != CallingConv::Fast",
+      CCIfSpecialCallingConv<"Mips16RetHelperConv",
+           CCDelegateTo<CC_Mips16RetHelper>>>,
+
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  // f128 needs to be handled similarly to f32 and f64 on hard-float. However,
+  // f128 is not legal and is lowered to i128 which is further lowered to a pair
+  // of i64's.
+  // This presents us with a problem for the calling convention since hard-float
+  // still needs to pass them in FPU registers. We therefore resort to a
+  // pre-analyze (see PreAnalyzeFormalArgsForF128()) step to pass information on
+  // whether the argument was originally an f128 into the tablegen-erated code.
+  //
+  // f128 should only occur for the N64 ABI where long double is 128-bit. On
+  // N32, long double is equivalent to double.
+  CCIfType<[i64],
+      CCIfSubtargetNot<"useSoftFloat()",
+          CCIfOrigArgWasF128<CCBitConvertToType<f64>>>>,
+
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_Mips_FastCC>>,
+
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+  CCDelegateTo<CC_MipsN>
+]>;
+
+def CC_Mips_VarArg : CallingConv<[
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+  CCDelegateTo<CC_MipsN_VarArg>
+]>;
+
+def CC_Mips : CallingConv<[
+  CCIfVarArg<CCIfArgIsVarArg<CCDelegateTo<CC_Mips_VarArg>>>,
+  CCDelegateTo<CC_Mips_FixedArg>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
+                                               (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+                                        (sequence "S%u", 7, 0))> {
+  let OtherPreserved = (add (decimate (sequence "F%u", 30, 20), 2));
+}
+
+def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+                                   (sequence "S%u", 7, 0))>;
+
+def CSR_O32_FP64 :
+  CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP,
+                       (sequence "S%u", 7, 0))>;
+
+def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64,
+                                   D30_64, RA_64, FP_64, GP_64,
+                                   (sequence "S%u_64", 7, 0))>;
+
+def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
+                                   GP_64, (sequence "S%u_64", 7, 0))>;
+
+def CSR_Mips16RetHelper :
+  CalleeSavedRegs<(add V0, V1, FP,
+                   (sequence "A%u", 3, 0), (sequence "S%u", 7, 0),
+                   (sequence "D%u", 15, 10))>;
+
+def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+                                              (sequence "S%u", 7, 0),
+                                              (sequence "V%u", 1, 0),
+                                              (sequence "T%u", 9, 0),
+                                              RA, FP, GP, AT)>;
+
+def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0),
+                                            (sequence "S%u", 7, 0),
+                                            (sequence "V%u", 1, 0),
+                                            (sequence "T%u", 9, 0),
+                                            RA, FP, GP, AT, LO0, HI0)>;
+
+def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+                                              (sequence "V%u_64", 1, 0),
+                                              (sequence "S%u_64", 7, 0),
+                                              (sequence "T%u_64", 9, 0),
+                                              RA_64, FP_64, GP_64, AT_64)>;
+
+def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0),
+                                            (sequence "S%u_64", 7, 0),
+                                            (sequence "T%u_64", 9, 0),
+                                            (sequence "V%u_64", 1, 0),
+                                            RA_64, FP_64, GP_64, AT_64,
+                                            LO0_64, HI0_64)>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsCondMov.td b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
new file mode 100644
index 000000000000..fd4517f25335
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
@@ -0,0 +1,299 @@
+//===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the Conditional Moves implementation.
+//
+//===----------------------------------------------------------------------===//
+
+// Conditional moves:
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
+// cond:int, data:int
+class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
+  let Constraints = "$F = $rd";
+}
+
+// cond:int, data:float
+class CMov_I_F_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR, opstr>,
+  HARDFLOAT {
+  let Constraints = "$F = $fd";
+}
+
+// cond:float, data:int
+class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $fcc"),
+         [(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR, opstr>, HARDFLOAT {
+  let Constraints = "$F = $rd";
+}
+
+// cond:float, data:float
+class CMov_F_F_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $fcc"),
+         [(set RC:$fd, (OpNode RC:$fs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR, opstr>, HARDFLOAT {
+  let Constraints = "$F = $fd";
+}
+
+// select patterns
+multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction SLTOp,
+                     Instruction SLTuOp, Instruction SLTiOp,
+                     Instruction SLTiuOp> {
+  def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+  def : MipsPat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setgt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, (Plus1 imm:$rhs)), DRC:$F)>;
+  def : MipsPat<(select (i32 (setugt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lhs, (Plus1 imm:$rhs)),
+                          DRC:$F)>;
+}
+
+multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XOROp> {
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XORiOp> {
+  def : MipsPat<
+            (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F),
+            (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>;
+}
+
+multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
+                    Instruction XOROp> {
+  def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
+  def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+// Instantiation of instructions.
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
+               ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+  def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, II_MOVZ>,
+                     ADD_FM<0, 0xa>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVN_I_I       : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in {
+  def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+  def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, II_MOVN>,
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32_NOT_32R6_64R6;
+}
+
+def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
+               CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
+                 CMov_I_F_FM<18, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
+               CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
+                 CMov_I_F_FM<19, 16>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+                 INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVN_D>, CMov_I_F_FM<19, 17>,
+                 INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
+                   CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
+                   CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  let isCodeGenOnly = 1 in {
+    def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, II_MOVZ_D>,
+                       CMov_I_F_FM<18, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+    def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, II_MOVN_D>,
+                       CMov_I_F_FM<19, 17>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  }
+}
+
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
+             CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
+               CMov_F_I_FM<1>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
+             CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+let isCodeGenOnly = 1 in
+def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
+               CMov_F_I_FM<0>, INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
+             CMov_F_F_FM<16, 1>, INSN_MIPS4_32_NOT_32R6_64R6;
+def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
+             CMov_F_F_FM<16, 0>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+                                  MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
+               INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+                                  MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
+               INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
+                 CMov_F_F_FM<17, 1>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
+                 CMov_F_F_FM<17, 0>, INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+}
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+
+defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>, INSN_MIPS4_32_NOT_32R6_64R6;
+
+defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, GPR_64;
+defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       GPR_64;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_32;
+
+defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
+defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>,
+       INSN_MIPS4_32_NOT_32R6_64R6, FGR_64;
+defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
+defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, INSN_MIPS4_32_NOT_32R6_64R6,
+       FGR_64;
+
+// For targets that don't have conditional-move instructions
+// we have to match SELECT nodes with pseudo instructions.
+let usesCustomInserter = 1 in {
+  class Select_Pseudo<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+            [(set RC:$dst, (select GPR32Opnd:$cond, RC:$T, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+
+  class SelectFP_Pseudo_T<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+             [(set RC:$dst, (MipsCMovFP_T RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+
+  class SelectFP_Pseudo_F<RegisterOperand RC> :
+    PseudoSE<(outs RC:$dst), (ins GPR32Opnd:$cond, RC:$T, RC:$F),
+             [(set RC:$dst, (MipsCMovFP_F RC:$T, GPR32Opnd:$cond, RC:$F))]>,
+    ISA_MIPS1_NOT_4_32;
+}
+
+def PseudoSELECT_I : Select_Pseudo<GPR32Opnd>;
+def PseudoSELECT_I64 : Select_Pseudo<GPR64Opnd>;
+def PseudoSELECT_S : Select_Pseudo<FGR32Opnd>;
+def PseudoSELECT_D32 : Select_Pseudo<AFGR64Opnd>, FGR_32;
+def PseudoSELECT_D64 : Select_Pseudo<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_T_I : SelectFP_Pseudo_T<GPR32Opnd>;
+def PseudoSELECTFP_T_I64 : SelectFP_Pseudo_T<GPR64Opnd>;
+def PseudoSELECTFP_T_S : SelectFP_Pseudo_T<FGR32Opnd>;
+def PseudoSELECTFP_T_D32 : SelectFP_Pseudo_T<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_T_D64 : SelectFP_Pseudo_T<FGR64Opnd>, FGR_64;
+
+def PseudoSELECTFP_F_I : SelectFP_Pseudo_F<GPR32Opnd>;
+def PseudoSELECTFP_F_I64 : SelectFP_Pseudo_F<GPR64Opnd>;
+def PseudoSELECTFP_F_S : SelectFP_Pseudo_F<FGR32Opnd>;
+def PseudoSELECTFP_F_D32 : SelectFP_Pseudo_F<AFGR64Opnd>, FGR_32;
+def PseudoSELECTFP_F_D64 : SelectFP_Pseudo_F<FGR64Opnd>, FGR_64;
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
new file mode 100644
index 000000000000..08b8ed31ccbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -0,0 +1,1694 @@
+//===-- MipsConstantIslandPass.cpp - Emit Pc Relative loads----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This pass is used to make Pc relative loads of constants.
+// For now, only Mips16 will use this. 
+//
+// Loading constants inline is expensive on Mips16 and it's in general better
+// to place the constant nearby in code space and then it can be loaded with a
+// simple 16 bit load instruction.
+//
+// The constants can be not just numbers but addresses of functions and labels.
+// This can be particularly helpful in static relocation mode for embedded
+// non-linux targets.
+//
+//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-constant-islands"
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+
+// Rather than do make check tests with huge amounts of code, we force
+// the test to use this amount.
+//
+static cl::opt<int> ConstantIslandsSmallOffset(
+  "mips-constant-islands-small-offset",
+  cl::init(0),
+  cl::desc("Make small offsets be this amount for testing purposes"),
+  cl::Hidden);
+
+//
+// For testing purposes we tell it to not use relaxed load forms so that it
+// will split blocks.
+//
+static cl::opt<bool> NoLoadRelaxation(
+  "mips-constant-islands-no-load-relaxation",
+  cl::init(false),
+  cl::desc("Don't relax loads to long loads - for testing purposes"),
+  cl::Hidden);
+
+static unsigned int branchTargetOperand(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+  case Mips::JalB16:
+    return 0;
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    return 1;
+  }
+  llvm_unreachable("Unknown branch type");
+}
+
+static unsigned int longformBranchOpcode(unsigned int Opcode) {
+  switch (Opcode) {
+  case Mips::Bimm16:
+  case Mips::BimmX16:
+    return Mips::BimmX16;
+  case Mips::Bteqz16:
+  case Mips::BteqzX16:
+    return Mips::BteqzX16;
+  case Mips::Btnez16:
+  case Mips::BtnezX16:
+    return Mips::BtnezX16;
+  case Mips::JalB16:
+    return Mips::JalB16;
+  case Mips::BeqzRxImm16:
+  case Mips::BeqzRxImmX16:
+    return Mips::BeqzRxImmX16;
+  case Mips::BnezRxImm16:
+  case Mips::BnezRxImmX16:
+    return Mips::BnezRxImmX16;
+  }
+  llvm_unreachable("Unknown branch type");
+}
+
+//
+// FIXME: need to go through this whole constant islands port and check the math
+// for branch ranges and clean this up and make some functions to calculate things
+// that are done many times identically.
+// Need to refactor some of the code to call this routine.
+//
+static unsigned int branchMaxOffsets(unsigned int Opcode) {
+  unsigned Bits, Scale;
+  switch (Opcode) {
+    case Mips::Bimm16:
+      Bits = 11;
+      Scale = 2;
+      break;
+    case Mips::BimmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::BeqzRxImm16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BeqzRxImmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::BnezRxImm16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BnezRxImmX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::Bteqz16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BteqzX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    case Mips::Btnez16:
+      Bits = 8;
+      Scale = 2;
+      break;
+    case Mips::BtnezX16:
+      Bits = 16;
+      Scale = 2;
+      break;
+    default:
+      llvm_unreachable("Unknown branch type");
+  }
+  unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+  return MaxOffs;
+}
+
+namespace {
+
+
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+
+  class MipsConstantIslands : public MachineFunctionPass {
+
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      // FIXME: ignore LogAlign for this patch
+      //
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        return PO;
+      }
+
+      BasicBlockInfo() : Offset(0), Size(0) {}
+
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+      unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+                                // with different displacements
+      unsigned LongFormOpcode;
+    public:
+      bool NegOk;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg,
+             unsigned longformmaxdisp, unsigned longformopcode)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+          LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+          NegOk(neg){
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      unsigned getMaxDisp() const {
+        unsigned xMaxDisp = ConstantIslandsSmallOffset?
+                            ConstantIslandsSmallOffset: MaxDisp;
+        return xMaxDisp;
+      }
+      void setMaxDisp(unsigned val) {
+        MaxDisp = val;
+      }
+      unsigned getLongFormMaxDisp() const {
+        return LongFormMaxDisp;
+      }
+      unsigned getLongFormOpcode() const {
+          return LongFormOpcode;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+  /// CPEntry - One per constant pool entry, keeping the machine instruction
+  /// pointer, the constpool index, and the number of CPUser's which
+  /// reference this entry.
+  struct CPEntry {
+    MachineInstr *CPEMI;
+    unsigned CPI;
+    unsigned RefCount;
+    CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+      : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+  };
+
+  /// CPEntries - Keep track of all of the constant pool entry machine
+  /// instructions. For each original constpool index (i.e. those that
+  /// existed upon entry to this pass), it keeps a vector of entries.
+  /// Original elements are cloned as we go along; the clones are
+  /// put in the vector of the original element, but have distinct CPIs.
+  std::vector<std::vector<CPEntry> > CPEntries;
+
+  /// ImmBranch - One per immediate branch, keeping the machine instruction
+  /// pointer, conditional or unconditional, the max displacement,
+  /// and (if isCond is true) the corresponding unconditional branch
+  /// opcode.
+  struct ImmBranch {
+    MachineInstr *MI;
+    unsigned MaxDisp : 31;
+    bool isCond : 1;
+    int UncondBr;
+    ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+      : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+  };
+
+  /// ImmBranches - Keep track of all the immediate branch instructions.
+  ///
+  std::vector<ImmBranch> ImmBranches;
+
+  /// HasFarJump - True if any far jump instruction has been emitted during
+  /// the branch fix up pass.
+  bool HasFarJump;
+
+  const MipsSubtarget *STI;
+  const Mips16InstrInfo *TII;
+  MipsFunctionInfo *MFI;
+  MachineFunction *MF;
+  MachineConstantPool *MCP;
+
+  unsigned PICLabelUId;
+  bool PrescannedForConstants;
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  public:
+    static char ID;
+    MipsConstantIslands()
+        : MachineFunctionPass(ID), STI(nullptr), MF(nullptr), MCP(nullptr),
+          PrescannedForConstants(false) {}
+
+    StringRef getPassName() const override { return "Mips Constant Islands"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr &CPEMI);
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+
+    void prescanForConstants();
+
+  private:
+
+  };
+
+  char MipsConstantIslands::ID = 0;
+} // end of anonymous namespace
+
+bool MipsConstantIslands::isOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getMaxDisp(), U.NegOk);
+}
+/// print block size and offset information - debugging
+void MipsConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+/// Returns a pass that converts branches to long branches.
+FunctionPass *llvm::createMipsConstantIslandPass() {
+  return new MipsConstantIslands();
+}
+
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  // The intention is for this to be a mips16 only pass for now
+  // FIXME:
+  MF = &mf;
+  MCP = mf.getConstantPool();
+  STI = &static_cast<const MipsSubtarget &>(mf.getSubtarget());
+  DEBUG(dbgs() << "constant island machine function " << "\n");
+  if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
+    return false;
+  }
+  TII = (const Mips16InstrInfo *)STI->getInstrInfo();
+  MFI = MF->getInfo<MipsFunctionInfo>();
+  DEBUG(dbgs() << "constant island processing " << "\n");
+  //
+  // will need to make predermination if there is any constants we need to
+  // put in constant islands. TBD.
+  //
+  if (!PrescannedForConstants) prescanForConstants();
+
+  HasFarJump = false;
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  bool MadeChange = false;
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  (void)NoBRIters;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = MF->getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB->getIterator();
+  // Can't fall off end of function.
+  if (std::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = &*std::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return nullptr;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr &CPEMI) {
+  assert(CPEMI.getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI.getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(&*I);
+
+
+  // Compute block offsets.
+  adjustBBOffsetsAfter(&MF->front());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineBasicBlock &MBB : *MF) {
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
+
+      int Opc = MI.getOpcode();
+      if (MI.isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other branches for now
+        case Mips::Bimm16:
+          Bits = 11;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BimmX16:
+          Bits = 16;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BeqzRxImm16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BeqzRxImmX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BnezRxImm16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BnezRxImmX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::Bteqz16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BteqzX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::Btnez16:
+          UOpc=Mips::Bimm16;
+          Bits = 8;
+          Scale = 2;
+          isCond = true;
+          break;
+        case Mips::BtnezX16:
+          UOpc=Mips::Bimm16;
+          Bits = 16;
+          Scale = 2;
+          isCond = true;
+          break;
+        }
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(&MI, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == Mips::CONSTPOOL_ENTRY)
+        continue;
+
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op)
+        if (MI.getOperand(op).isCPI()) {
+
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          unsigned LongFormBits = 0;
+          unsigned LongFormScale = 0;
+          unsigned LongFormOpcode = 0;
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+          case Mips::LwRxPcTcp16:
+            Bits = 8;
+            Scale = 4;
+            LongFormOpcode = Mips::LwRxPcTcpX16;
+            LongFormBits = 14;
+            LongFormScale = 1;
+            break;
+          case Mips::LwRxPcTcpX16:
+            Bits = 14;
+            Scale = 1;
+            NegOk = true;
+            break;
+          }
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = MI.getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
+          CPUsers.push_back(CPUser(&MI, CPEMI, MaxOffs, NegOk, LongFormMaxOffs,
+                                   LongFormOpcode));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+
+        }
+
+    }
+  }
+
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+
+  for (const MachineInstr &MI : *MBB)
+    BBI.Size += TII->getInstSizeInBytes(MI);
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->getInstSizeInBytes(*I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void MipsConstantIslands::updateForInsertedWaterBlock
+  (MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+  return getOffsetOf(U.MI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *
+MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
+  MachineBasicBlock *OrigBB = MI.getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = ++OrigBB->getIterator();
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(std::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(*U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = ++Water->getIterator();
+  if (NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1ULL << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth;
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+  (MachineInstr *MI, unsigned UserOffset,
+   MachineInstr *CPEMI, unsigned MaxDisp,
+   bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == Mips::Bimm16)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+    BBInfo[i].Offset = Offset;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = nullptr;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// This version checks if the longer form of the instruction can be used to
+/// to satisfy things.
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+  (CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+                       U.getLongFormMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+    U.setMaxDisp(U.getLongFormMaxDisp());
+    return 2;  // instruction is longer length now
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == nullptr)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+                         U.getLongFormMaxDisp(), U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case Mips::Bimm16:
+    return ((1<<10)-1)*2;
+  case Mips::BimmX16:
+    return ((1<<16)-1)*2;
+  default:
+    break;
+  }
+  return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(*CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = 2;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = &*++UserMBB->getIterator();
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = Mips::Bimm16;
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  //MachineInstr *LastIT = 0;
+  for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+  }
+
+  NewMBB = splitBlockBeforeInstr(*--MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = &*++WaterBB->getIterator();
+  } else {
+    // No water found.
+    // we first see if a longer form of the instrucion could have reached
+    // the constant. in that case we won't bother to split
+    if (!NoLoadRelaxation) {
+      result = findLongFormInRangeCPEntry(U, UserOffset);
+      if (result != 0) return true;
+    }
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
+    IP = find(WaterList, WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB->getIterator(), NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = createPICLabelUId();
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(*U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(*CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = nullptr;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+  (MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
+
+unsigned PCAdj = 4;
+
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  unsigned TargetOperand = branchTargetOperand(MI);
+  MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+  // Use BL to implement far jump.
+  unsigned BimmX16MaxDisp = ((1 << 16)-1) * 2;
+  if (isBBInRange(MI, DestBB, BimmX16MaxDisp)) {
+    Br.MaxDisp = BimmX16MaxDisp;
+    MI->setDesc(TII->get(Mips::BimmX16));
+  }
+  else {
+    // need to give the math a more careful look here
+    // this is really a segment address and not
+    // a PC relative address. FIXME. But I think that
+    // just reducing the bits by 1 as I've done is correct.
+    // The basic block we are branching too much be longword aligned.
+    // we know that RA is saved because we always save it right now.
+    // this requirement will be relaxed later but we also have an alternate
+    // way to implement this that I will implement that does not need jal.
+    // We should have a way to back out this alignment restriction if we "can" later.
+    // but it is not harmful.
+    //
+    DestBB->setAlignment(2);
+    Br.MaxDisp = ((1<<24)-1) * 2;
+    MI->setDesc(TII->get(Mips::JalB16));
+  }
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  unsigned TargetOperand = branchTargetOperand(MI);
+  MachineBasicBlock *DestBB = MI->getOperand(TargetOperand).getMBB();
+  unsigned Opcode = MI->getOpcode();
+  unsigned LongFormOpcode = longformBranchOpcode(Opcode);
+  unsigned LongFormMaxOff = branchMaxOffsets(LongFormOpcode);
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, LongFormMaxOff)) {
+    Br.MaxDisp = LongFormMaxOff;
+    MI->setDesc(TII->get(LongFormOpcode));
+    return true;
+  }
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // bteqz L1
+  // =>
+  // bnez L2
+  // b   L1
+  // L2:
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+  unsigned OppositeBranchOpcode = TII->getOppositeBranchOpc(Opcode);
+ 
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+        BMI->isUnconditionalBranch()) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beqz L1
+      // b   L2
+      // =>
+      // bnez L2
+      // b   L1
+      unsigned BMITargetOperand = branchTargetOperand(BMI);
+      MachineBasicBlock *NewDest = 
+        BMI->getOperand(BMITargetOperand).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        MI->setDesc(TII->get(OppositeBranchOpcode));
+        BMI->getOperand(BMITargetOperand).setMBB(DestBB);
+        MI->getOperand(TargetOperand).setMBB(NewDest);
+        return true;
+      }
+    }
+  }
+
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(*MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->getInstSizeInBytes(MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = &*++MBB->getIterator();
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  if (MI->getNumExplicitOperands() == 2) {
+    BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+           .addReg(MI->getOperand(0).getReg())
+           .addMBB(NextBB);
+  } else {
+    BuildMI(MBB, DebugLoc(), TII->get(OppositeBranchOpcode))
+           .addMBB(NextBB);
+  }
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+  unsigned J = 0;
+  (void)J;
+  for (MachineFunction::iterator B =
+         MF->begin(), E = MF->end(); B != E; ++B) {
+    for (MachineBasicBlock::instr_iterator I =
+        B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+      switch(I->getDesc().getOpcode()) {
+        case Mips::LwConstant32: {
+          PrescannedForConstants = true;
+          DEBUG(dbgs() << "constant island constant " << *I << "\n");
+          J = I->getNumOperands();
+          DEBUG(dbgs() << "num operands " << J  << "\n");
+          MachineOperand& Literal = I->getOperand(1);
+          if (Literal.isImm()) {
+            int64_t V = Literal.getImm();
+            DEBUG(dbgs() << "literal " << V  << "\n");
+            Type *Int32Ty =
+              Type::getInt32Ty(MF->getFunction()->getContext());
+            const Constant *C = ConstantInt::get(Int32Ty, V);
+            unsigned index = MCP->getConstantPoolIndex(C, 4);
+            I->getOperand(2).ChangeToImmediate(index);
+            DEBUG(dbgs() << "constant island constant " << *I << "\n");
+            I->setDesc(TII->get(Mips::LwRxPcTcp16));
+            I->RemoveOperand(1);
+            I->RemoveOperand(1);
+            I->addOperand(MachineOperand::CreateCPI(index, 0));
+            I->addOperand(MachineOperand::CreateImm(4));
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 000000000000..0ceb1858fb09
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,369 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class DspMMRel;
+
+def Dsp2MicroMips : InstrMapping {
+  let FilterClass = "DspMMRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["Arch"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = ["dsp"];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["dsp"], ["mmdsp"]];
+}
+
+def HasDSP : Predicate<"Subtarget->hasDSP()">,
+             AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">,
+               AssemblerPredicate<"FeatureDSPR2">;
+def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">,
+               AssemblerPredicate<"FeatureDSPR3">;
+
+class ISA_DSPR2 {
+  list<Predicate> InsnPredicates = [HasDSPR2];
+}
+
+class ISA_DSPR3 {
+  list<Predicate> InsnPredicates = [HasDSPR3];
+}
+
+// Fields.
+class Field6<bits<6> val> {
+  bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>;
+def REGIMM_OPCODE : Field6<0b000001>;
+
+class DSPInst<string opstr = "">
+    : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl {
+  let InsnPredicates = [HasDSP];
+  string BaseOpcode = opstr;
+  string Arch = "dsp";
+}
+
+class PseudoDSP<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>
+    : MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+  let InsnPredicates = [HasDSP];
+}
+
+class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+    : InstAlias<Asm, Result, Emit>, PredicateControl {
+  let InsnPredicates = [HasDSP];
+}
+
+// ADDU.QB sub-class format.
+class ADDU_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+class RADDU_W_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+// CMPU.EQ.QB sub-class format.
+class CMP_EQ_QB_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class CMP_EQ_QB_R3_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class PRECR_SRA_PH_W_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+// ABSQ_S.PH sub-class format.
+class ABSQ_S_PH_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+
+class REPL_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> imm;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = imm;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+// SHLL.QB sub-class format.
+class SHLL_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs_sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs_sa;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010011;
+}
+
+// LX sub-class format.
+class LX_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> base;
+  bits<5> index;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b001010;
+}
+
+// ADDUH.QB sub-class format.
+class ADDUH_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b011000;
+}
+
+// APPEND sub-class format.
+class APPEND_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b110001;
+}
+
+// DPA.W.PH sub-class format.
+class DPA_W_PH_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b110000;
+}
+
+// MULT sub-class format.
+class MULT_FMT<bits<6> opcode, bits<6> funct> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = opcode;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = 0;
+  let Inst{5-0} = funct;
+}
+
+// MFHI sub-class format.
+class MFHI_FMT<bits<6> funct> : DSPInst {
+  bits<5> rd;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-23} = 0;
+  let Inst{22-21} = ac;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = funct;
+}
+
+// MTHI sub-class format.
+class MTHI_FMT<bits<6> funct> : DSPInst {
+  bits<5> rs;
+  bits<2> ac;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = funct;
+}
+
+// EXTR.W sub-class format (type 1).
+class EXTR_W_TY1_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<2> ac;
+  bits<5> shift_rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = shift_rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+// SHILO sub-class format.
+class SHILO_R1_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<6> shift;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-20} = shift;
+  let Inst{19-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class SHILO_R2_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class RDDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = mask;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class WRDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-11} = mask;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class BPOSGE32_FMT<bits<5> op> : DSPInst {
+  bits<16> offset;
+
+  let Opcode = REGIMM_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = op;
+  let Inst{15-0} = offset;
+}
+
+// INSV sub-class format.
+class INSV_FMT<bits<6> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6} = 0;
+  let Inst{5-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 000000000000..ac9a81b1bb2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,1456 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf
+def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+
+// Mips-specific dsp nodes
+def SDT_MipsExtr : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+                                        SDTCisVT<2, untyped>]>;
+def SDT_MipsShilo : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<1, i32>]>;
+def SDT_MipsDPA : SDTypeProfile<1, 3, [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                       SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsSHIFT_DSP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+
+class MipsDSPBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof>;
+
+class MipsDSPSideEffectBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof, [SDNPHasChain, SDNPSideEffect]>;
+
+def MipsEXTP : MipsDSPSideEffectBase<"EXTP", SDT_MipsExtr>;
+def MipsEXTPDP : MipsDSPSideEffectBase<"EXTPDP", SDT_MipsExtr>;
+def MipsEXTR_S_H : MipsDSPSideEffectBase<"EXTR_S_H", SDT_MipsExtr>;
+def MipsEXTR_W : MipsDSPSideEffectBase<"EXTR_W", SDT_MipsExtr>;
+def MipsEXTR_R_W : MipsDSPSideEffectBase<"EXTR_R_W", SDT_MipsExtr>;
+def MipsEXTR_RS_W : MipsDSPSideEffectBase<"EXTR_RS_W", SDT_MipsExtr>;
+
+def MipsSHILO : MipsDSPBase<"SHILO", SDT_MipsShilo>;
+def MipsMTHLIP : MipsDSPSideEffectBase<"MTHLIP", SDT_MipsShilo>;
+
+def MipsMULSAQ_S_W_PH : MipsDSPSideEffectBase<"MULSAQ_S_W_PH", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHL : MipsDSPSideEffectBase<"MAQ_S_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHR : MipsDSPSideEffectBase<"MAQ_S_W_PHR", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHL : MipsDSPSideEffectBase<"MAQ_SA_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHR : MipsDSPSideEffectBase<"MAQ_SA_W_PHR", SDT_MipsDPA>;
+
+def MipsDPAU_H_QBL : MipsDSPBase<"DPAU_H_QBL", SDT_MipsDPA>;
+def MipsDPAU_H_QBR : MipsDSPBase<"DPAU_H_QBR", SDT_MipsDPA>;
+def MipsDPSU_H_QBL : MipsDSPBase<"DPSU_H_QBL", SDT_MipsDPA>;
+def MipsDPSU_H_QBR : MipsDSPBase<"DPSU_H_QBR", SDT_MipsDPA>;
+def MipsDPAQ_S_W_PH : MipsDSPSideEffectBase<"DPAQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQ_S_W_PH : MipsDSPSideEffectBase<"DPSQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQ_SA_L_W : MipsDSPSideEffectBase<"DPAQ_SA_L_W", SDT_MipsDPA>;
+def MipsDPSQ_SA_L_W : MipsDSPSideEffectBase<"DPSQ_SA_L_W", SDT_MipsDPA>;
+
+def MipsDPA_W_PH : MipsDSPBase<"DPA_W_PH", SDT_MipsDPA>;
+def MipsDPS_W_PH : MipsDSPBase<"DPS_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_S_W_PH : MipsDSPSideEffectBase<"DPAQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_SA_W_PH : MipsDSPSideEffectBase<"DPAQX_SA_W_PH", SDT_MipsDPA>;
+def MipsDPAX_W_PH : MipsDSPBase<"DPAX_W_PH", SDT_MipsDPA>;
+def MipsDPSX_W_PH : MipsDSPBase<"DPSX_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_S_W_PH : MipsDSPSideEffectBase<"DPSQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_SA_W_PH : MipsDSPSideEffectBase<"DPSQX_SA_W_PH", SDT_MipsDPA>;
+def MipsMULSA_W_PH : MipsDSPBase<"MULSA_W_PH", SDT_MipsDPA>;
+
+def MipsMULT : MipsDSPBase<"MULT", SDT_MipsDPA>;
+def MipsMULTU : MipsDSPBase<"MULTU", SDT_MipsDPA>;
+def MipsMADD_DSP : MipsDSPBase<"MADD_DSP", SDT_MipsDPA>;
+def MipsMADDU_DSP : MipsDSPBase<"MADDU_DSP", SDT_MipsDPA>;
+def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>;
+def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>;
+def MipsSHLL_DSP : MipsDSPBase<"SHLL_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSHRA_DSP : MipsDSPBase<"SHRA_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSHRL_DSP : MipsDSPBase<"SHRL_DSP", SDT_MipsSHIFT_DSP>;
+def MipsSETCC_DSP : MipsDSPBase<"SETCC_DSP", SDTSetCC>;
+def MipsSELECT_CC_DSP : MipsDSPBase<"SELECT_CC_DSP", SDTSelectCC>;
+
+// Flags.
+class Uses<list<Register> Regs> {
+  list<Register> Uses = Regs;
+}
+
+class Defs<list<Register> Regs> {
+  list<Register> Defs = Regs;
+}
+
+// Instruction encoding.
+class ADDU_QB_ENC : ADDU_QB_FMT<0b00000>;
+class ADDU_S_QB_ENC : ADDU_QB_FMT<0b00100>;
+class SUBU_QB_ENC : ADDU_QB_FMT<0b00001>;
+class SUBU_S_QB_ENC : ADDU_QB_FMT<0b00101>;
+class ADDQ_PH_ENC : ADDU_QB_FMT<0b01010>;
+class ADDQ_S_PH_ENC : ADDU_QB_FMT<0b01110>;
+class SUBQ_PH_ENC : ADDU_QB_FMT<0b01011>;
+class SUBQ_S_PH_ENC : ADDU_QB_FMT<0b01111>;
+class ADDQ_S_W_ENC : ADDU_QB_FMT<0b10110>;
+class SUBQ_S_W_ENC : ADDU_QB_FMT<0b10111>;
+class ADDSC_ENC : ADDU_QB_FMT<0b10000>;
+class ADDWC_ENC : ADDU_QB_FMT<0b10001>;
+class MODSUB_ENC : ADDU_QB_FMT<0b10010>;
+class RADDU_W_QB_ENC : RADDU_W_QB_FMT<0b10100>;
+class ABSQ_S_PH_ENC : ABSQ_S_PH_R2_FMT<0b01001>;
+class ABSQ_S_W_ENC : ABSQ_S_PH_R2_FMT<0b10001>;
+class PRECRQ_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01100>;
+class PRECRQ_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10100>;
+class PRECRQ_RS_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10101>;
+class PRECRQU_S_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01111>;
+class PRECEQ_W_PHL_ENC : ABSQ_S_PH_R2_FMT<0b01100>;
+class PRECEQ_W_PHR_ENC : ABSQ_S_PH_R2_FMT<0b01101>;
+class PRECEQU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b00100>;
+class PRECEQU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b00101>;
+class PRECEQU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b00110>;
+class PRECEQU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b00111>;
+class PRECEU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b11100>;
+class PRECEU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b11101>;
+class PRECEU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b11110>;
+class PRECEU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b11111>;
+class SHLL_QB_ENC : SHLL_QB_FMT<0b00000>;
+class SHLLV_QB_ENC : SHLL_QB_FMT<0b00010>;
+class SHRL_QB_ENC : SHLL_QB_FMT<0b00001>;
+class SHRLV_QB_ENC : SHLL_QB_FMT<0b00011>;
+class SHLL_PH_ENC : SHLL_QB_FMT<0b01000>;
+class SHLLV_PH_ENC : SHLL_QB_FMT<0b01010>;
+class SHLL_S_PH_ENC : SHLL_QB_FMT<0b01100>;
+class SHLLV_S_PH_ENC : SHLL_QB_FMT<0b01110>;
+class SHRA_PH_ENC : SHLL_QB_FMT<0b01001>;
+class SHRAV_PH_ENC : SHLL_QB_FMT<0b01011>;
+class SHRA_R_PH_ENC : SHLL_QB_FMT<0b01101>;
+class SHRAV_R_PH_ENC : SHLL_QB_FMT<0b01111>;
+class SHLL_S_W_ENC : SHLL_QB_FMT<0b10100>;
+class SHLLV_S_W_ENC : SHLL_QB_FMT<0b10110>;
+class SHRA_R_W_ENC : SHLL_QB_FMT<0b10101>;
+class SHRAV_R_W_ENC : SHLL_QB_FMT<0b10111>;
+class MULEU_S_PH_QBL_ENC : ADDU_QB_FMT<0b00110>;
+class MULEU_S_PH_QBR_ENC : ADDU_QB_FMT<0b00111>;
+class MULEQ_S_W_PHL_ENC : ADDU_QB_FMT<0b11100>;
+class MULEQ_S_W_PHR_ENC : ADDU_QB_FMT<0b11101>;
+class MULQ_RS_PH_ENC : ADDU_QB_FMT<0b11111>;
+class MULSAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00110>;
+class MAQ_S_W_PHL_ENC : DPA_W_PH_FMT<0b10100>;
+class MAQ_S_W_PHR_ENC : DPA_W_PH_FMT<0b10110>;
+class MAQ_SA_W_PHL_ENC : DPA_W_PH_FMT<0b10000>;
+class MAQ_SA_W_PHR_ENC : DPA_W_PH_FMT<0b10010>;
+class MFHI_ENC : MFHI_FMT<0b010000>;
+class MFLO_ENC : MFHI_FMT<0b010010>;
+class MTHI_ENC : MTHI_FMT<0b010001>;
+class MTLO_ENC : MTHI_FMT<0b010011>;
+class DPAU_H_QBL_ENC : DPA_W_PH_FMT<0b00011>;
+class DPAU_H_QBR_ENC : DPA_W_PH_FMT<0b00111>;
+class DPSU_H_QBL_ENC : DPA_W_PH_FMT<0b01011>;
+class DPSU_H_QBR_ENC : DPA_W_PH_FMT<0b01111>;
+class DPAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00100>;
+class DPSQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00101>;
+class DPAQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01100>;
+class DPSQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01101>;
+class MULT_DSP_ENC : MULT_FMT<0b000000, 0b011000>;
+class MULTU_DSP_ENC : MULT_FMT<0b000000, 0b011001>;
+class MADD_DSP_ENC : MULT_FMT<0b011100, 0b000000>;
+class MADDU_DSP_ENC : MULT_FMT<0b011100, 0b000001>;
+class MSUB_DSP_ENC : MULT_FMT<0b011100, 0b000100>;
+class MSUBU_DSP_ENC : MULT_FMT<0b011100, 0b000101>;
+class CMPU_EQ_QB_ENC : CMP_EQ_QB_R2_FMT<0b00000>;
+class CMPU_LT_QB_ENC : CMP_EQ_QB_R2_FMT<0b00001>;
+class CMPU_LE_QB_ENC : CMP_EQ_QB_R2_FMT<0b00010>;
+class CMPGU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b00100>;
+class CMPGU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b00101>;
+class CMPGU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b00110>;
+class CMP_EQ_PH_ENC : CMP_EQ_QB_R2_FMT<0b01000>;
+class CMP_LT_PH_ENC : CMP_EQ_QB_R2_FMT<0b01001>;
+class CMP_LE_PH_ENC : CMP_EQ_QB_R2_FMT<0b01010>;
+class BITREV_ENC : ABSQ_S_PH_R2_FMT<0b11011>;
+class PACKRL_PH_ENC : CMP_EQ_QB_R3_FMT<0b01110>;
+class REPL_QB_ENC : REPL_FMT<0b00010>;
+class REPL_PH_ENC : REPL_FMT<0b01010>;
+class REPLV_QB_ENC : ABSQ_S_PH_R2_FMT<0b00011>;
+class REPLV_PH_ENC : ABSQ_S_PH_R2_FMT<0b01011>;
+class PICK_QB_ENC : CMP_EQ_QB_R3_FMT<0b00011>;
+class PICK_PH_ENC : CMP_EQ_QB_R3_FMT<0b01011>;
+class LWX_ENC : LX_FMT<0b00000>;
+class LHX_ENC : LX_FMT<0b00100>;
+class LBUX_ENC : LX_FMT<0b00110>;
+class BPOSGE32_ENC : BPOSGE32_FMT<0b11100>;
+class INSV_ENC : INSV_FMT<0b001100>;
+
+class EXTP_ENC : EXTR_W_TY1_FMT<0b00010>;
+class EXTPV_ENC : EXTR_W_TY1_FMT<0b00011>;
+class EXTPDP_ENC : EXTR_W_TY1_FMT<0b01010>;
+class EXTPDPV_ENC : EXTR_W_TY1_FMT<0b01011>;
+class EXTR_W_ENC : EXTR_W_TY1_FMT<0b00000>;
+class EXTRV_W_ENC : EXTR_W_TY1_FMT<0b00001>;
+class EXTR_R_W_ENC : EXTR_W_TY1_FMT<0b00100>;
+class EXTRV_R_W_ENC : EXTR_W_TY1_FMT<0b00101>;
+class EXTR_RS_W_ENC : EXTR_W_TY1_FMT<0b00110>;
+class EXTRV_RS_W_ENC : EXTR_W_TY1_FMT<0b00111>;
+class EXTR_S_H_ENC : EXTR_W_TY1_FMT<0b01110>;
+class EXTRV_S_H_ENC : EXTR_W_TY1_FMT<0b01111>;
+class SHILO_ENC : SHILO_R1_FMT<0b11010>;
+class SHILOV_ENC : SHILO_R2_FMT<0b11011>;
+class MTHLIP_ENC : SHILO_R2_FMT<0b11111>;
+
+class RDDSP_ENC : RDDSP_FMT<0b10010>;
+class WRDSP_ENC : WRDSP_FMT<0b10011>;
+class ADDU_PH_ENC : ADDU_QB_FMT<0b01000>;
+class ADDU_S_PH_ENC : ADDU_QB_FMT<0b01100>;
+class SUBU_PH_ENC : ADDU_QB_FMT<0b01001>;
+class SUBU_S_PH_ENC : ADDU_QB_FMT<0b01101>;
+class CMPGDU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b11000>;
+class CMPGDU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b11001>;
+class CMPGDU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b11010>;
+class ABSQ_S_QB_ENC : ABSQ_S_PH_R2_FMT<0b00001>;
+class ADDUH_QB_ENC : ADDUH_QB_FMT<0b00000>;
+class ADDUH_R_QB_ENC : ADDUH_QB_FMT<0b00010>;
+class SUBUH_QB_ENC : ADDUH_QB_FMT<0b00001>;
+class SUBUH_R_QB_ENC : ADDUH_QB_FMT<0b00011>;
+class ADDQH_PH_ENC : ADDUH_QB_FMT<0b01000>;
+class ADDQH_R_PH_ENC : ADDUH_QB_FMT<0b01010>;
+class SUBQH_PH_ENC : ADDUH_QB_FMT<0b01001>;
+class SUBQH_R_PH_ENC : ADDUH_QB_FMT<0b01011>;
+class ADDQH_W_ENC : ADDUH_QB_FMT<0b10000>;
+class ADDQH_R_W_ENC : ADDUH_QB_FMT<0b10010>;
+class SUBQH_W_ENC : ADDUH_QB_FMT<0b10001>;
+class SUBQH_R_W_ENC : ADDUH_QB_FMT<0b10011>;
+class MUL_PH_ENC : ADDUH_QB_FMT<0b01100>;
+class MUL_S_PH_ENC : ADDUH_QB_FMT<0b01110>;
+class MULQ_S_W_ENC : ADDUH_QB_FMT<0b10110>;
+class MULQ_RS_W_ENC : ADDUH_QB_FMT<0b10111>;
+class MULQ_S_PH_ENC : ADDU_QB_FMT<0b11110>;
+class DPA_W_PH_ENC : DPA_W_PH_FMT<0b00000>;
+class DPS_W_PH_ENC : DPA_W_PH_FMT<0b00001>;
+class DPAQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11000>;
+class DPAQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11010>;
+class DPAX_W_PH_ENC : DPA_W_PH_FMT<0b01000>;
+class DPSX_W_PH_ENC : DPA_W_PH_FMT<0b01001>;
+class DPSQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11001>;
+class DPSQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11011>;
+class MULSA_W_PH_ENC : DPA_W_PH_FMT<0b00010>;
+class PRECR_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01101>;
+class PRECR_SRA_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11110>;
+class PRECR_SRA_R_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11111>;
+class SHRA_QB_ENC : SHLL_QB_FMT<0b00100>;
+class SHRAV_QB_ENC : SHLL_QB_FMT<0b00110>;
+class SHRA_R_QB_ENC : SHLL_QB_FMT<0b00101>;
+class SHRAV_R_QB_ENC : SHLL_QB_FMT<0b00111>;
+class SHRL_PH_ENC : SHLL_QB_FMT<0b11001>;
+class SHRLV_PH_ENC : SHLL_QB_FMT<0b11011>;
+class APPEND_ENC : APPEND_FMT<0b00000>;
+class BALIGN_ENC : APPEND_FMT<0b10000>;
+class PREPEND_ENC : APPEND_FMT<0b00001>;
+
+// Instruction desc.
+class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        InstrItinClass itin, RegisterOperand ROD,
+                        RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterOperand ROD,
+                           RegisterOperand ROS = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROS,
+                             RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
+  list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                               InstrItinClass itin, RegisterOperand ROT,
+                               RegisterOperand ROS = ROT> {
+  dag OutOperandList = (outs ROT:$rt);
+  dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+  string BaseOpcode = instr_asm;
+}
+
+class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     Operand ImmOp, ImmLeaf immPat, InstrItinClass itin,
+                     RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins ImmOp:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
+  list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList =  (ins RO:$rt, GPR32Opnd:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           SDPatternOperator ImmPat, InstrItinClass itin,
+                           RegisterOperand RO, Operand ImmOpnd> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+  bit hasSideEffects = 1;
+  string BaseOpcode = instr_asm;
+}
+
+class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
+  string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
+  InstrItinClass Itinerary = itin;
+  bit mayLoad = 1;
+  string BaseOpcode = instr_asm;
+}
+
+class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         InstrItinClass itin, RegisterOperand ROD,
+                         RegisterOperand ROS = ROD,  RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern =  [(set GPR32Opnd:$rt,
+                        (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+  string BaseOpcode = instr_asm;
+}
+
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
+}
+
+class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
+}
+
+class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
+}
+
+class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins uimm10:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
+  list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+  string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
+}
+
+class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+  bit isCommutable = 1;
+  string BaseOpcode = instr_asm;
+}
+
+class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$acin = $ac";
+  string BaseOpcode = instr_asm;
+}
+
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins RO:$ac);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
+  dag OutOperandList = (outs RO:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  InstrItinClass Itinerary = itin;
+  string BaseOpcode = instr_asm;
+}
+
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
+  MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
+  bit usesCustomInserter = 1;
+}
+
+class BPOSGE32_DESC_BASE<string instr_asm, DAGOperand opnd,
+                         InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins opnd:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  InstrItinClass Itinerary = itin;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+  string BaseOpcode = instr_asm;
+}
+
+class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$src = $rt";
+  string BaseOpcode = instr_asm;
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
+                      IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
+                      Defs<[DSPOutFlag20]>;
+
+class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
+                                     GPR32Opnd, GPR32Opnd>, IsCommutable,
+                   Defs<[DSPCarry]>;
+
+class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
+                                     GPR32Opnd, GPR32Opnd>,
+                   IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
+
+class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
+                                      GPR32Opnd, GPR32Opnd>;
+
+class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>;
+
+// Absolute value
+class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
+                                             NoItinerary, GPR32Opnd>,
+                      Defs<[DSPOutFlag20]>;
+
+// Precision reduce/expand
+class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
+                                                 int_mips_precrq_qb_ph,
+                                                 NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
+                                                int_mips_precrq_ph_w,
+                                                NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
+                                                   int_mips_precrq_rs_ph_w,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>,
+                            Defs<[DSPOutFlag22]>;
+
+class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
+                                                    int_mips_precrqu_s_qb_ph,
+                                                    NoItinerary, DSPROpnd,
+                                                    DSPROpnd>,
+                             Defs<[DSPOutFlag22]>;
+
+class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
+                                                 int_mips_preceq_w_phl,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
+                                                 int_mips_preceq_w_phr,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
+                                                   int_mips_precequ_ph_qbl,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
+                                                   int_mips_precequ_ph_qbr,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
+                                                    int_mips_precequ_ph_qbla,
+                                                    NoItinerary, DSPROpnd>;
+
+class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
+                                                    int_mips_precequ_ph_qbra,
+                                                    NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
+                                                  int_mips_preceu_ph_qbl,
+                                                  NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
+                                                  int_mips_preceu_ph_qbr,
+                                                  NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
+                                                   int_mips_preceu_ph_qbla,
+                                                   NoItinerary, DSPROpnd>;
+
+class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
+                                                   int_mips_preceu_ph_qbra,
+                                                   NoItinerary, DSPROpnd>;
+
+// Shift
+class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd, uimm3>,
+                     Defs<[DSPOutFlag22]>;
+
+class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
+                                           NoItinerary, DSPROpnd>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd, uimm3>;
+
+class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd, uimm4>,
+                     Defs<[DSPOutFlag22]>;
+
+class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
+                                           NoItinerary, DSPROpnd>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
+                                            immZExt4, NoItinerary, DSPROpnd,
+                                            uimm4>,
+                       Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
+                                             NoItinerary, DSPROpnd>,
+                        Defs<[DSPOutFlag22]>;
+
+class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd, uimm4>;
+
+class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
+                                           NoItinerary, DSPROpnd>;
+
+class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
+                                            immZExt4, NoItinerary, DSPROpnd,
+                                            uimm4>;
+
+class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
+                                             NoItinerary, DSPROpnd>;
+
+class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
+                                           immZExt5, NoItinerary, GPR32Opnd,
+                                           uimm5>,
+                      Defs<[DSPOutFlag22]>;
+
+class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
+                                            NoItinerary, GPR32Opnd>,
+                       Defs<[DSPOutFlag22]>;
+
+class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
+                                           immZExt5, NoItinerary, GPR32Opnd,
+                                           uimm5>;
+
+class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
+                                            NoItinerary, GPR32Opnd>;
+
+// Multiplication
+class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
+                                              int_mips_muleu_s_ph_qbl,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
+                            Defs<[DSPOutFlag21]>;
+
+class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
+                                              int_mips_muleu_s_ph_qbr,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
+                            Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
+                                             int_mips_muleq_s_w_phl,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
+                           IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
+                                             int_mips_muleq_s_w_phr,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
+                           IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
+                                          NoItinerary, DSPROpnd, DSPROpnd>,
+                        IsCommutable, Defs<[DSPOutFlag21]>;
+
+class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
+                                              MipsMULSAQ_S_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
+                          Defs<[DSPOutFlag16_19]>;
+
+// Move from/to hi/lo.
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
+
+// Dot product with accumulate/subtract
+class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
+
+class DPAU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbr", MipsDPAU_H_QBR>;
+
+class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl", MipsDPSU_H_QBL>;
+
+class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr", MipsDPSU_H_QBR>;
+
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>,
+                         Defs<[DSPOutFlag16_19]>;
+
+class MULT_DSP_DESC  : MULT_DESC_BASE<"mult", MipsMult, NoItinerary>;
+class MULTU_DSP_DESC : MULT_DESC_BASE<"multu", MipsMultu, NoItinerary>;
+class MADD_DSP_DESC  : MADD_DESC_BASE<"madd", MipsMAdd, NoItinerary>;
+class MADDU_DSP_DESC : MADD_DESC_BASE<"maddu", MipsMAddu, NoItinerary>;
+class MSUB_DSP_DESC  : MADD_DESC_BASE<"msub", MipsMSub, NoItinerary>;
+class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
+
+// Comparison
+class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
+                                               int_mips_cmpu_eq_qb, NoItinerary,
+                                               DSPROpnd>,
+                        IsCommutable, Defs<[DSPCCond]>;
+
+class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
+                                               int_mips_cmpu_lt_qb, NoItinerary,
+                                               DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
+                                               int_mips_cmpu_le_qb, NoItinerary,
+                                               DSPROpnd>, Defs<[DSPCCond]>;
+
+class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
+                                                int_mips_cmpgu_eq_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>,
+                         IsCommutable;
+
+class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
+                                                int_mips_cmpgu_lt_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
+                                                int_mips_cmpgu_le_qb,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
+
+class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
+                                              NoItinerary, DSPROpnd>,
+                       IsCommutable, Defs<[DSPCCond]>;
+
+class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPCCond]>;
+
+class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPCCond]>;
+
+// Misc
+class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+                                           NoItinerary, GPR32Opnd>;
+
+class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
+                                              NoItinerary, DSPROpnd, DSPROpnd>;
+
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, uimm8,
+                                    immZExt8, NoItinerary, DSPROpnd>;
+
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, uimm10,
+                                    immZExt10, NoItinerary, DSPROpnd>;
+
+class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
+
+class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
+                     Uses<[DSPCCond]>;
+
+class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
+                     Uses<[DSPCCond]>;
+
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
+
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>;
+
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>;
+
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", brtarget, NoItinerary>;
+
+// Extr
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>,
+                  Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>,
+                   Uses<[DSPPos]>, Defs<[DSPEFI]>;
+
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>,
+                    Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
+                                             NoItinerary>,
+                     Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>;
+
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>,
+                    Defs<[DSPOutFlag23]>;
+
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
+                                             NoItinerary>, Defs<[DSPOutFlag23]>;
+
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
+                                              NoItinerary>,
+                      Defs<[DSPOutFlag23]>;
+
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
+                                                NoItinerary>,
+                        Defs<[DSPOutFlag23]>;
+
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
+                                              NoItinerary>,
+                      Defs<[DSPOutFlag23]>;
+
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
+                                               NoItinerary>,
+                       Defs<[DSPOutFlag23]>;
+
+class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
+
+class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov", MipsSHILO>;
+
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>, Defs<[DSPPos]>;
+
+class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
+
+class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
+
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
+                  Uses<[DSPPos, DSPSCount]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+// Addition/subtraction
+class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
+                     Defs<[DSPOutFlag20]>;
+
+class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag20]>;
+
+class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
+                                       DSPROpnd, DSPROpnd>,
+                     Defs<[DSPOutFlag20]>;
+
+class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
+                                         NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
+                                           NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
+                                         NoItinerary, DSPROpnd>;
+
+class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
+                                         NoItinerary, DSPROpnd>, IsCommutable;
+
+class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
+                                           NoItinerary, DSPROpnd>, IsCommutable;
+
+class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
+                                         NoItinerary, DSPROpnd>;
+
+class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
+                                           NoItinerary, DSPROpnd>;
+
+class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
+                                        NoItinerary, GPR32Opnd>, IsCommutable;
+
+class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
+                                          NoItinerary, GPR32Opnd>, IsCommutable;
+
+class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
+                                        NoItinerary, GPR32Opnd>;
+
+class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
+                                          NoItinerary, GPR32Opnd>;
+
+// Comparison
+class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
+                                                 int_mips_cmpgdu_eq_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          IsCommutable, Defs<[DSPCCond]>;
+
+class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
+                                                 int_mips_cmpgdu_lt_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          Defs<[DSPCCond]>;
+
+class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
+                                                 int_mips_cmpgdu_le_qb,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
+                          Defs<[DSPCCond]>;
+
+// Absolute
+class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
+                                              NoItinerary, DSPROpnd>,
+                       Defs<[DSPOutFlag20]>;
+
+// Multiplication
+class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
+                                       DSPROpnd>, IsCommutable,
+                    Defs<[DSPOutFlag21]>;
+
+class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
+                                         NoItinerary, DSPROpnd>, IsCommutable,
+                      Defs<[DSPOutFlag21]>;
+
+class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
+                                         NoItinerary, GPR32Opnd>, IsCommutable,
+                      Defs<[DSPOutFlag21]>;
+
+class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
+                                          NoItinerary, GPR32Opnd>, IsCommutable,
+                       Defs<[DSPOutFlag21]>;
+
+class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
+                       IsCommutable, Defs<[DSPOutFlag21]>;
+
+// Dot product with accumulate/subtract
+class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph", MipsDPA_W_PH>;
+
+class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph", MipsDPS_W_PH>;
+
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph",
+                                              MipsDPAQX_SA_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph", MipsDPAX_W_PH>;
+
+class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph", MipsDPSX_W_PH>;
+
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>,
+                          Defs<[DSPOutFlag16_19]>;
+
+class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph",
+                                              MipsDPSQX_SA_W_PH>,
+                           Defs<[DSPOutFlag16_19]>;
+
+class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
+
+// Precision reduce/expand
+class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
+                                                int_mips_precr_qb_ph,
+                                                NoItinerary, DSPROpnd, DSPROpnd>;
+
+class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
+                                                     int_mips_precr_sra_ph_w,
+                                                     NoItinerary, DSPROpnd,
+                                                     GPR32Opnd>;
+
+class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
+                                                      int_mips_precr_sra_r_ph_w,
+                                                       NoItinerary, DSPROpnd,
+                                                       GPR32Opnd>;
+
+// Shift
+class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
+                                          NoItinerary, DSPROpnd, uimm3>;
+
+class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
+                                           NoItinerary, DSPROpnd>;
+
+class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
+                                            immZExt3, NoItinerary, DSPROpnd,
+                                            uimm3>;
+
+class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
+                                             NoItinerary, DSPROpnd>;
+
+class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
+                                          NoItinerary, DSPROpnd, uimm4>;
+
+class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
+                                           NoItinerary, DSPROpnd>;
+
+// Misc
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5,
+                                     NoItinerary>;
+
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2,
+                                     NoItinerary>;
+
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5,
+                                      immZExt5, NoItinerary>;
+
+// Pseudos.
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32,
+                                                NoItinerary>, Uses<[DSPPos]>;
+
+// Instruction defs.
+// MIPS DSP Rev 1
+def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC;
+def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC;
+def MODSUB : DspMMRel, MODSUB_ENC, MODSUB_DESC;
+def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def MULSAQ_S_W_PH : DspMMRel, MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def MFHI_DSP : DspMMRel, MFHI_ENC, MFHI_DESC;
+def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC;
+def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC;
+def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC;
+def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def CMPU_EQ_QB : DspMMRel, CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : DspMMRel, CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : DspMMRel, CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : DspMMRel, CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : DspMMRel, CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : DspMMRel, CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : DspMMRel, CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : DspMMRel, CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : DspMMRel, CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : DspMMRel, BITREV_ENC, BITREV_DESC;
+def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC;
+def LWX : DspMMRel, LWX_ENC, LWX_DESC;
+def LHX : DspMMRel, LHX_ENC, LHX_DESC;
+def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def BPOSGE32 : DspMMRel, BPOSGE32_ENC, BPOSGE32_DESC;
+}
+def INSV : DspMMRel, INSV_ENC, INSV_DESC;
+def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC;
+def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : DspMMRel, EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC;
+def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def WRDSP : WRDSP_ENC, WRDSP_DESC;
+}
+
+// MIPS DSP Rev 2
+def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2;
+def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2;
+def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2;
+def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2;
+def CMPGDU_EQ_QB : DspMMRel, CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2;
+def CMPGDU_LT_QB : DspMMRel, CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2;
+def CMPGDU_LE_QB : DspMMRel, CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2;
+def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2;
+def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2;
+def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2;
+def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2;
+def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2;
+def ADDQH_PH : DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2;
+def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2;
+def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2;
+def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2;
+def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2;
+def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2;
+def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2;
+def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2;
+def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2;
+def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2;
+def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2;
+def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2;
+def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2;
+def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2;
+def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2;
+def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2;
+def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2;
+def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2;
+def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2;
+def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2;
+def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2;
+def MULSA_W_PH : DspMMRel, MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2;
+def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2;
+def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2;
+def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2;
+def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2;
+def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2;
+def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2;
+def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2;
+def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2;
+def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2;
+def APPEND : DspMMRel, APPEND_ENC, APPEND_DESC, ISA_DSPR2;
+def BALIGN : DspMMRel, BALIGN_ENC, BALIGN_DESC, ISA_DSPR2;
+def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2;
+
+// Pseudos.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+  // Pseudo instructions for loading and storing accumulator registers.
+  def LOAD_ACC64DSP  : Load<"", ACC64DSPOpnd>;
+  def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
+
+  // Pseudos for loading and storing ccond field of DSP control register.
+  def LOAD_CCOND_DSP  : Load<"load_ccond_dsp", DSPCC>;
+  def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
+}
+
+// Pseudo CMP and PICK instructions.
+class PseudoCMP<Instruction RealInst> :
+  PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
+
+class PseudoPICK<Instruction RealInst> :
+  PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
+  NeverHasSideEffects;
+
+def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
+def PseudoCMP_LT_PH : PseudoCMP<CMP_LT_PH>;
+def PseudoCMP_LE_PH : PseudoCMP<CMP_LE_PH>;
+def PseudoCMPU_EQ_QB : PseudoCMP<CMPU_EQ_QB>;
+def PseudoCMPU_LT_QB : PseudoCMP<CMPU_LT_QB>;
+def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
+
+def PseudoPICK_PH : PseudoPICK<PICK_PH>;
+def PseudoPICK_QB : PseudoPICK<PICK_QB>;
+
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
+// Patterns.
+class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
+  Pat<pattern, result>, Requires<[pred]>;
+
+class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
+                    RegisterClass SrcRC> :
+   DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+          (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
+
+def : DSPPat<(v2i16 (load addr:$a)),
+             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(v4i8 (load addr:$a)),
+             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+
+// Binary operations.
+class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+                Predicate Pred = HasDSP> :
+  DSPPat<(Node ValTy:$a, ValTy:$b), (Inst ValTy:$a, ValTy:$b), Pred>;
+
+def : DSPBinPat<ADDQ_PH, v2i16, int_mips_addq_ph>;
+def : DSPBinPat<ADDQ_PH, v2i16, add>;
+def : DSPBinPat<SUBQ_PH, v2i16, int_mips_subq_ph>;
+def : DSPBinPat<SUBQ_PH, v2i16, sub>;
+def : DSPBinPat<MUL_PH, v2i16, int_mips_mul_ph, HasDSPR2>;
+def : DSPBinPat<MUL_PH, v2i16, mul, HasDSPR2>;
+def : DSPBinPat<ADDU_QB, v4i8, int_mips_addu_qb>;
+def : DSPBinPat<ADDU_QB, v4i8, add>;
+def : DSPBinPat<SUBU_QB, v4i8, int_mips_subu_qb>;
+def : DSPBinPat<SUBU_QB, v4i8, sub>;
+def : DSPBinPat<ADDSC, i32, int_mips_addsc>;
+def : DSPBinPat<ADDSC, i32, addc>;
+def : DSPBinPat<ADDWC, i32, int_mips_addwc>;
+def : DSPBinPat<ADDWC, i32, adde>;
+
+// Shift immediate patterns.
+class DSPShiftPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
+                  SDPatternOperator Imm, Predicate Pred = HasDSP> :
+  DSPPat<(Node ValTy:$a, Imm:$shamt), (Inst ValTy:$a, Imm:$shamt), Pred>;
+
+def : DSPShiftPat<SHLL_PH, v2i16, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_PH, v2i16, MipsSHRA_DSP, imm>;
+def : DSPShiftPat<SHRL_PH, v2i16, MipsSHRL_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHLL_PH, v2i16, int_mips_shll_ph, immZExt4>;
+def : DSPShiftPat<SHRA_PH, v2i16, int_mips_shra_ph, immZExt4>;
+def : DSPShiftPat<SHRL_PH, v2i16, int_mips_shrl_ph, immZExt4, HasDSPR2>;
+def : DSPShiftPat<SHLL_QB, v4i8, MipsSHLL_DSP, imm>;
+def : DSPShiftPat<SHRA_QB, v4i8, MipsSHRA_DSP, imm, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, MipsSHRL_DSP, imm>;
+def : DSPShiftPat<SHLL_QB, v4i8, int_mips_shll_qb, immZExt3>;
+def : DSPShiftPat<SHRA_QB, v4i8, int_mips_shra_qb, immZExt3, HasDSPR2>;
+def : DSPShiftPat<SHRL_QB, v4i8, int_mips_shrl_qb, immZExt3>;
+
+// SETCC/SELECT_CC patterns.
+class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                  CondCode CC> :
+  DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
+                      (ValTy ZERO)))>;
+
+class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                     CondCode CC> :
+  DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
+                      (ValTy ZERO),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
+
+class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                     CondCode CC> :
+  DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $c, $d))>;
+
+class DSPSelectCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
+                        CondCode CC> :
+  DSPPat<(ValTy (MipsSELECT_CC_DSP ValTy:$a, ValTy:$b, ValTy:$c, ValTy:$d, CC)),
+         (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)), $d, $c))>;
+
+def : DSPSetCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSetCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSetCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSetCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSetCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSetCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSetCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSetCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSetCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSetCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSetCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSetCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+def : DSPSelectCCPat<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETEQ>;
+def : DSPSelectCCPat<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETLT>;
+def : DSPSelectCCPat<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETLE>;
+def : DSPSelectCCPatInv<PseudoCMP_EQ_PH, PseudoPICK_PH, v2i16, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMP_LT_PH, PseudoPICK_PH, v2i16, SETGE>;
+def : DSPSelectCCPatInv<PseudoCMP_LE_PH, PseudoPICK_PH, v2i16, SETGT>;
+def : DSPSelectCCPat<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETEQ>;
+def : DSPSelectCCPat<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETULT>;
+def : DSPSelectCCPat<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETULE>;
+def : DSPSelectCCPatInv<PseudoCMPU_EQ_QB, PseudoPICK_QB, v4i8, SETNE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LT_QB, PseudoPICK_QB, v4i8, SETUGE>;
+def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
+
+// Extr patterns.
+class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, GPR32:$rs)>;
+
+class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, immZExt5:$shift)>;
+
+def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTPDP, EXTPDP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTPDP, EXTPDPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_W, EXTR_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_W, EXTRV_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_R_W, EXTR_R_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_R_W, EXTRV_R_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_RS_W, EXTR_RS_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
+
+// Indexed load patterns.
+class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
+  DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
+         (Instr i32:$base, i32:$index)>;
+
+let AddedComplexity = 20 in {
+  def : IndexedLoadPat<zextloadi8, LBUX>;
+  def : IndexedLoadPat<sextloadi16, LHX>;
+  def : IndexedLoadPat<load, LWX>;
+}
+
+// Instruction alias.
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 000000000000..c821084f68cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,891 @@
+//===-- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+                       " are not NOP.");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-mips-delay-filler",
+  cl::init(false),
+  cl::desc("Fill all delay slots with NOPs."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableForwardSearch(
+  "disable-mips-df-forward-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search forward."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableSuccBBSearch(
+  "disable-mips-df-succbb-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search successor basic blocks."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableBackwardSearch(
+  "disable-mips-df-backward-search",
+  cl::init(false),
+  cl::desc("Disallow MIPS delay filler to search backward."),
+  cl::Hidden);
+
+enum CompactBranchPolicy {
+  CB_Never,   ///< The policy 'never' may in some circumstances or for some
+              ///< ISAs not be absolutely adhered to.
+  CB_Optimal, ///< Optimal is the default and will produce compact branches
+              ///< when delay slots cannot be filled.
+  CB_Always   ///< 'always' may in some circumstances may not be
+              ///< absolutely adhered to there may not be a corresponding
+              ///< compact form of a branch.
+};
+
+static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
+  "mips-compact-branches",cl::Optional,
+  cl::init(CB_Optimal),
+  cl::desc("MIPS Specific: Compact branch policy."),
+  cl::values(
+    clEnumValN(CB_Never, "never", "Do not use compact branches if possible."),
+    clEnumValN(CB_Optimal, "optimal", "Use compact branches where appropiate (default)."),
+    clEnumValN(CB_Always, "always", "Always use compact branches if possible.")
+  )
+);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+  typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
+
+  class RegDefsUses {
+  public:
+    RegDefsUses(const TargetRegisterInfo &TRI);
+    void init(const MachineInstr &MI);
+
+    /// This function sets all caller-saved registers in Defs.
+    void setCallerSaved(const MachineInstr &MI);
+
+    /// This function sets all unallocatable registers in Defs.
+    void setUnallocatableRegs(const MachineFunction &MF);
+
+    /// Set bits in Uses corresponding to MBB's live-out registers except for
+    /// the registers that are live-in to SuccBB.
+    void addLiveOut(const MachineBasicBlock &MBB,
+                    const MachineBasicBlock &SuccBB);
+
+    bool update(const MachineInstr &MI, unsigned Begin, unsigned End);
+
+  private:
+    bool checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, unsigned Reg,
+                          bool IsDef) const;
+
+    /// Returns true if Reg or its alias is in RegSet.
+    bool isRegInSet(const BitVector &RegSet, unsigned Reg) const;
+
+    const TargetRegisterInfo &TRI;
+    BitVector Defs, Uses;
+  };
+
+  /// Base class for inspecting loads and stores.
+  class InspectMemInstr {
+  public:
+    InspectMemInstr(bool ForbidMemInstr_)
+      : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
+        SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+
+    /// Return true if MI cannot be moved to delay slot.
+    bool hasHazard(const MachineInstr &MI);
+
+    virtual ~InspectMemInstr() {}
+
+  protected:
+    /// Flags indicating whether loads or stores have been seen.
+    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+
+    /// Memory instructions are not allowed to move to delay slot if this flag
+    /// is true.
+    bool ForbidMemInstr;
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) = 0;
+  };
+
+  /// This subclass rejects any memory instructions.
+  class NoMemInstr : public InspectMemInstr {
+  public:
+    NoMemInstr() : InspectMemInstr(true) {}
+  private:
+    bool hasHazard_(const MachineInstr &MI) override { return true; }
+  };
+
+  /// This subclass accepts loads from stacks and constant loads.
+  class LoadFromStackOrConst : public InspectMemInstr {
+  public:
+    LoadFromStackOrConst() : InspectMemInstr(false) {}
+  private:
+    bool hasHazard_(const MachineInstr &MI) override;
+  };
+
+  /// This subclass uses memory dependence information to determine whether a
+  /// memory instruction can be moved to a delay slot.
+  class MemDefsUses : public InspectMemInstr {
+  public:
+    MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI);
+
+  private:
+    typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+    bool hasHazard_(const MachineInstr &MI) override;
+
+    /// Update Defs and Uses. Return true if there exist dependences that
+    /// disqualify the delay slot candidate between V and values in Uses and
+    /// Defs.
+    bool updateDefsUses(ValueType V, bool MayStore);
+
+    /// Get the list of underlying objects of MI's memory operand.
+    bool getUnderlyingObjects(const MachineInstr &MI,
+                              SmallVectorImpl<ValueType> &Objects) const;
+
+    const MachineFrameInfo *MFI;
+    SmallPtrSet<ValueType, 4> Uses, Defs;
+    const DataLayout &DL;
+
+    /// Flags indicating whether loads or stores with no underlying objects have
+    /// been seen.
+    bool SeenNoObjLoad, SeenNoObjStore;
+  };
+
+  class Filler : public MachineFunctionPass {
+  public:
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm) { }
+
+    StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+
+      // This pass invalidates liveness information when it reorders
+      // instructions to fill delay slot. Without this, -verify-machineinstrs
+      // will fail.
+      if (Changed)
+        F.getRegInfo().invalidateLiveness();
+
+      return Changed;
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    Iter replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+                                  const DebugLoc &DL);
+
+    /// This function checks if it is valid to move Candidate to the delay slot
+    /// and returns true if it isn't. It also updates memory and register
+    /// dependence information.
+    bool delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                        InspectMemInstr &IM) const;
+
+    /// This function searches range [Begin, End) for an instruction that can be
+    /// moved to the delay slot. Returns true on success.
+    template<typename IterTy>
+    bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                     RegDefsUses &RegDU, InspectMemInstr &IM, Iter Slot,
+                     IterTy &Filler) const;
+
+    /// This function searches in the backward direction for an instruction that
+    /// can be moved to the delay slot. Returns true on success.
+    bool searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const;
+
+    /// This function searches MBB in the forward direction for an instruction
+    /// that can be moved to the delay slot. Returns true on success.
+    bool searchForward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches one of MBB's successor blocks for an instruction
+    /// that can be moved to the delay slot and inserts clones of the
+    /// instruction into the successor's predecessor blocks.
+    bool searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Pick a successor block of MBB. Return NULL if MBB doesn't have a
+    /// successor block that is not a landing pad.
+    MachineBasicBlock *selectSuccBB(MachineBasicBlock &B) const;
+
+    /// This function analyzes MBB and returns an instruction with an unoccupied
+    /// slot that branches to Dst.
+    std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+    getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const;
+
+    /// Examine Pred and see if it is possible to insert an instruction into
+    /// one of its branches delay slot or its end.
+    bool examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                     RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                     BB2BrMap &BrMap) const;
+
+    bool terminateSearch(const MachineInstr &Candidate) const;
+
+    TargetMachine &TM;
+
+    static char ID;
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+static bool hasUnoccupiedSlot(const MachineInstr *MI) {
+  return MI->hasDelaySlot() && !MI->isBundledWithSucc();
+}
+
+/// This function inserts clones of Filler into predecessor blocks.
+static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
+  MachineFunction *MF = Filler->getParent()->getParent();
+
+  for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) {
+    if (I->second) {
+      MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler));
+      ++UsefulSlots;
+    } else {
+      I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler));
+    }
+  }
+}
+
+/// This function adds registers Filler defines to MBB's live-in register list.
+static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
+  for (unsigned I = 0, E = Filler->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = Filler->getOperand(I);
+    unsigned R;
+
+    if (!MO.isReg() || !MO.isDef() || !(R = MO.getReg()))
+      continue;
+
+#ifndef NDEBUG
+    const MachineFunction &MF = *MBB.getParent();
+    assert(MF.getSubtarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+           "Shouldn't move an instruction with unallocatable registers across "
+           "basic block boundaries.");
+#endif
+
+    if (!MBB.isLiveIn(R))
+      MBB.addLiveIn(R);
+  }
+}
+
+RegDefsUses::RegDefsUses(const TargetRegisterInfo &TRI)
+    : TRI(TRI), Defs(TRI.getNumRegs(), false), Uses(TRI.getNumRegs(), false) {}
+
+void RegDefsUses::init(const MachineInstr &MI) {
+  // Add all register operands which are explicit and non-variadic.
+  update(MI, 0, MI.getDesc().getNumOperands());
+
+  // If MI is a call, add RA to Defs to prevent users of RA from going into
+  // delay slot.
+  if (MI.isCall())
+    Defs.set(Mips::RA);
+
+  // Add all implicit register operands of branch instructions except
+  // register AT.
+  if (MI.isBranch()) {
+    update(MI, MI.getDesc().getNumOperands(), MI.getNumOperands());
+    Defs.reset(Mips::AT);
+  }
+}
+
+void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
+  assert(MI.isCall());
+
+  // Add RA/RA_64 to Defs to prevent users of RA/RA_64 from going into
+  // the delay slot. The reason is that RA/RA_64 must not be changed
+  // in the delay slot so that the callee can return to the caller.
+  if (MI.definesRegister(Mips::RA) || MI.definesRegister(Mips::RA_64)) {
+    Defs.set(Mips::RA);
+    Defs.set(Mips::RA_64);
+  }
+
+  // If MI is a call, add all caller-saved registers to Defs.
+  BitVector CallerSavedRegs(TRI.getNumRegs(), true);
+
+  CallerSavedRegs.reset(Mips::ZERO);
+  CallerSavedRegs.reset(Mips::ZERO_64);
+
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(MI.getParent()->getParent());
+       *R; ++R)
+    for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI)
+      CallerSavedRegs.reset(*AI);
+
+  Defs |= CallerSavedRegs;
+}
+
+void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
+  BitVector AllocSet = TRI.getAllocatableSet(MF);
+
+  for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+    for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
+      AllocSet.set(*AI);
+
+  AllocSet.set(Mips::ZERO);
+  AllocSet.set(Mips::ZERO_64);
+
+  Defs |= AllocSet.flip();
+}
+
+void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
+                             const MachineBasicBlock &SuccBB) {
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+       SE = MBB.succ_end(); SI != SE; ++SI)
+    if (*SI != &SuccBB)
+      for (const auto &LI : (*SI)->liveins())
+        Uses.set(LI.PhysReg);
+}
+
+bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
+  BitVector NewDefs(TRI.getNumRegs()), NewUses(TRI.getNumRegs());
+  bool HasHazard = false;
+
+  for (unsigned I = Begin; I != End; ++I) {
+    const MachineOperand &MO = MI.getOperand(I);
+
+    if (MO.isReg() && MO.getReg())
+      HasHazard |= checkRegDefsUses(NewDefs, NewUses, MO.getReg(), MO.isDef());
+  }
+
+  Defs |= NewDefs;
+  Uses |= NewUses;
+
+  return HasHazard;
+}
+
+bool RegDefsUses::checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses,
+                                   unsigned Reg, bool IsDef) const {
+  if (IsDef) {
+    NewDefs.set(Reg);
+    // check whether Reg has already been defined or used.
+    return (isRegInSet(Defs, Reg) || isRegInSet(Uses, Reg));
+  }
+
+  NewUses.set(Reg);
+  // check whether Reg has already been defined.
+  return isRegInSet(Defs, Reg);
+}
+
+bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const {
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    if (RegSet.test(*AI))
+      return true;
+  return false;
+}
+
+bool InspectMemInstr::hasHazard(const MachineInstr &MI) {
+  if (!MI.mayStore() && !MI.mayLoad())
+    return false;
+
+  if (ForbidMemInstr)
+    return true;
+
+  OrigSeenLoad = SeenLoad;
+  OrigSeenStore = SeenStore;
+  SeenLoad |= MI.mayLoad();
+  SeenStore |= MI.mayStore();
+
+  // If MI is an ordered or volatile memory reference, disallow moving
+  // subsequent loads and stores to delay slot.
+  if (MI.hasOrderedMemoryRef() && (OrigSeenLoad || OrigSeenStore)) {
+    ForbidMemInstr = true;
+    return true;
+  }
+
+  return hasHazard_(MI);
+}
+
+bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
+  if (MI.mayStore())
+    return true;
+
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getPseudoValue())
+    return true;
+
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (isa<FixedStackPseudoSourceValue>(PSV))
+      return false;
+    return !PSV->isConstant(nullptr) && !PSV->isStack();
+  }
+
+  return true;
+}
+
+MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
+    : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false),
+      SeenNoObjStore(false) {}
+
+bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
+  bool HasHazard = false;
+  SmallVector<ValueType, 4> Objs;
+
+  // Check underlying object list.
+  if (getUnderlyingObjects(MI, Objs)) {
+    for (SmallVectorImpl<ValueType>::const_iterator I = Objs.begin();
+         I != Objs.end(); ++I)
+      HasHazard |= updateDefsUses(*I, MI.mayStore());
+
+    return HasHazard;
+  }
+
+  // No underlying objects found.
+  HasHazard = MI.mayStore() && (OrigSeenLoad || OrigSeenStore);
+  HasHazard |= MI.mayLoad() || OrigSeenStore;
+
+  SeenNoObjLoad |= MI.mayLoad();
+  SeenNoObjStore |= MI.mayStore();
+
+  return HasHazard;
+}
+
+bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
+  if (MayStore)
+    return !Defs.insert(V).second || Uses.count(V) || SeenNoObjStore ||
+           SeenNoObjLoad;
+
+  Uses.insert(V);
+  return Defs.count(V) || SeenNoObjStore;
+}
+
+bool MemDefsUses::
+getUnderlyingObjects(const MachineInstr &MI,
+                     SmallVectorImpl<ValueType> &Objects) const {
+  if (!MI.hasOneMemOperand() ||
+      (!(*MI.memoperands_begin())->getValue() &&
+       !(*MI.memoperands_begin())->getPseudoValue()))
+    return false;
+
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (!PSV->isAliased(MFI))
+      return false;
+    Objects.push_back(PSV);
+    return true;
+  }
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  SmallVector<Value *, 4> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
+
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
+       I != E; ++I) {
+    if (!isIdentifiedObject(V))
+      return false;
+
+    Objects.push_back(*I);
+  }
+
+  return true;
+}
+
+// Replace Branch with the compact branch instruction.
+Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
+                                      const DebugLoc &DL) {
+  const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+  const MipsInstrInfo *TII = STI.getInstrInfo();
+
+  unsigned NewOpcode = TII->getEquivalentCompactForm(Branch);
+  Branch = TII->genInstrWithNewOpc(NewOpcode, Branch);
+
+  std::next(Branch)->eraseFromParent();
+  return Branch;
+}
+
+// For given opcode returns opcode of corresponding instruction with short
+// delay slot.
+// For the pseudo TAILCALL*_MM instrunctions return the short delay slot
+// form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range
+// that is too short to make use of for tail calls.
+static int getEquivalentCallShort(int Opcode) {
+  switch (Opcode) {
+  case Mips::BGEZAL:
+    return Mips::BGEZALS_MM;
+  case Mips::BLTZAL:
+    return Mips::BLTZALS_MM;
+  case Mips::JAL:
+    return Mips::JALS_MM;
+  case Mips::JALR:
+    return Mips::JALRS_MM;
+  case Mips::JALR16_MM:
+    return Mips::JALRS16_MM;
+  case Mips::TAILCALL_MM:
+    llvm_unreachable("Attempting to shorten the TAILCALL_MM pseudo!");
+  case Mips::TAILCALLREG:
+    return Mips::JR16_MM;
+  default:
+    llvm_unreachable("Unexpected call instruction for microMIPS.");
+  }
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+  bool InMicroMipsMode = STI.inMicroMipsMode();
+  const MipsInstrInfo *TII = STI.getInstrInfo();
+
+  if (InMicroMipsMode && STI.hasMips32r6()) {
+    // This is microMIPS32r6 or microMIPS64r6 processor. Delay slot for
+    // branching instructions is not needed.
+    return Changed;
+  }
+
+  for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
+    if (!hasUnoccupiedSlot(&*I))
+      continue;
+
+    ++FilledSlots;
+    Changed = true;
+
+    // Delay slot filling is disabled at -O0.
+    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+      bool Filled = false;
+
+      if (MipsCompactBranchPolicy.getValue() != CB_Always ||
+           !TII->getEquivalentCompactForm(I)) {
+        if (searchBackward(MBB, *I)) {
+          Filled = true;
+        } else if (I->isTerminator()) {
+          if (searchSuccBBs(MBB, I)) {
+            Filled = true;
+          }
+        } else if (searchForward(MBB, I)) {
+          Filled = true;
+        }
+      }
+
+      if (Filled) {
+        // Get instruction with delay slot.
+        MachineBasicBlock::instr_iterator DSI = I.getInstrIterator();
+
+        if (InMicroMipsMode && TII->getInstSizeInBytes(*std::next(DSI)) == 2 &&
+            DSI->isCall()) {
+          // If instruction in delay slot is 16b change opcode to
+          // corresponding instruction with short delay slot.
+
+          // TODO: Implement an instruction mapping table of 16bit opcodes to
+          // 32bit opcodes so that an instruction can be expanded. This would
+          // save 16 bits as a TAILCALL_MM pseudo requires a fullsized nop.
+          // TODO: Permit b16 when branching backwards to the the same function
+          // if it is in range.
+          DSI->setDesc(TII->get(getEquivalentCallShort(DSI->getOpcode())));
+        }
+        continue;
+      }
+    }
+
+    // For microMIPS if instruction is BEQ or BNE with one ZERO register, then
+    // instead of adding NOP replace this instruction with the corresponding
+    // compact branch instruction, i.e. BEQZC or BNEZC. Additionally
+    // PseudoReturn and PseudoIndirectBranch are expanded to JR_MM, so they can
+    // be replaced with JRC16_MM.
+
+    // For MIPSR6 attempt to produce the corresponding compact (no delay slot)
+    // form of the CTI. For indirect jumps this will not require inserting a
+    // NOP and for branches will hopefully avoid requiring a NOP.
+    if ((InMicroMipsMode ||
+         (STI.hasMips32r6() && MipsCompactBranchPolicy != CB_Never)) &&
+        TII->getEquivalentCompactForm(I)) {
+      I = replaceWithCompactBranch(MBB, I, I->getDebugLoc());
+      continue;
+    }
+
+    // Bundle the NOP to the instruction with the delay slot.
+    BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+    MIBundleBuilder(MBB, I, std::next(I, 2));
+  }
+
+  return Changed;
+}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+  return new Filler(tm);
+}
+
+template<typename IterTy>
+bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                         RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
+                         IterTy &Filler) const {
+  for (IterTy I = Begin; I != End;) {
+    IterTy CurrI = I;
+    ++I;
+
+    // skip debug value
+    if (CurrI->isDebugValue())
+      continue;
+
+    if (terminateSearch(*CurrI))
+      break;
+
+    assert((!CurrI->isCall() && !CurrI->isReturn() && !CurrI->isBranch()) &&
+           "Cannot put calls, returns or branches in delay slot.");
+
+    if (CurrI->isKill()) {
+      CurrI->eraseFromParent();
+      continue;
+    }
+
+    if (delayHasHazard(*CurrI, RegDU, IM))
+      continue;
+
+    const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>();
+    if (STI.isTargetNaCl()) {
+      // In NaCl, instructions that must be masked are forbidden in delay slots.
+      // We only check for loads, stores and SP changes.  Calls, returns and
+      // branches are not checked because non-NaCl targets never put them in
+      // delay slots.
+      unsigned AddrIdx;
+      if ((isBasePlusOffsetMemoryAccess(CurrI->getOpcode(), &AddrIdx) &&
+           baseRegNeedsLoadStoreMask(CurrI->getOperand(AddrIdx).getReg())) ||
+          CurrI->modifiesRegister(Mips::SP, STI.getRegisterInfo()))
+        continue;
+    }
+
+    bool InMicroMipsMode = STI.inMicroMipsMode();
+    const MipsInstrInfo *TII = STI.getInstrInfo();
+    unsigned Opcode = (*Slot).getOpcode();
+    // This is complicated by the tail call optimization. For non-PIC code
+    // there is only a 32bit sized unconditional branch which can be assumed
+    // to be able to reach the target. b16 only has a range of +/- 1 KB.
+    // It's entirely possible that the target function is reachable with b16
+    // but we don't have enough information to make that decision.
+     if (InMicroMipsMode && TII->getInstSizeInBytes(*CurrI) == 2 &&
+        (Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
+         Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
+      continue;
+
+    Filler = CurrI;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchBackward(MachineBasicBlock &MBB, MachineInstr &Slot) const {
+  if (DisableBackwardSearch)
+    return false;
+
+  auto *Fn = MBB.getParent();
+  RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo());
+  MemDefsUses MemDU(Fn->getDataLayout(), &Fn->getFrameInfo());
+  ReverseIter Filler;
+
+  RegDU.init(Slot);
+
+  MachineBasicBlock::iterator SlotI = Slot;
+  if (!searchRange(MBB, ++SlotI.getReverse(), MBB.rend(), RegDU, MemDU, Slot,
+                   Filler))
+    return false;
+
+  MBB.splice(std::next(SlotI), &MBB, Filler.getReverse());
+  MIBundleBuilder(MBB, SlotI, std::next(SlotI, 2));
+  ++UsefulSlots;
+  return true;
+}
+
+bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+  // Can handle only calls.
+  if (DisableForwardSearch || !Slot->isCall())
+    return false;
+
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+  NoMemInstr NM;
+  Iter Filler;
+
+  RegDU.setCallerSaved(*Slot);
+
+  if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Slot, Filler))
+    return false;
+
+  MBB.splice(std::next(Slot), &MBB, Filler);
+  MIBundleBuilder(MBB, Slot, std::next(Slot, 2));
+  ++UsefulSlots;
+  return true;
+}
+
+bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableSuccBBSearch)
+    return false;
+
+  MachineBasicBlock *SuccBB = selectSuccBB(MBB);
+
+  if (!SuccBB)
+    return false;
+
+  RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo());
+  bool HasMultipleSuccs = false;
+  BB2BrMap BrMap;
+  std::unique_ptr<InspectMemInstr> IM;
+  Iter Filler;
+  auto *Fn = MBB.getParent();
+
+  // Iterate over SuccBB's predecessor list.
+  for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
+       PE = SuccBB->pred_end(); PI != PE; ++PI)
+    if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+      return false;
+
+  // Do not allow moving instructions which have unallocatable register operands
+  // across basic block boundaries.
+  RegDU.setUnallocatableRegs(*Fn);
+
+  // Only allow moving loads from stack or constants if any of the SuccBB's
+  // predecessors have multiple successors.
+  if (HasMultipleSuccs) {
+    IM.reset(new LoadFromStackOrConst());
+  } else {
+    const MachineFrameInfo &MFI = Fn->getFrameInfo();
+    IM.reset(new MemDefsUses(Fn->getDataLayout(), &MFI));
+  }
+
+  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot,
+                   Filler))
+    return false;
+
+  insertDelayFiller(Filler, BrMap);
+  addLiveInRegs(Filler, *SuccBB);
+  Filler->eraseFromParent();
+
+  return true;
+}
+
+MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+  if (B.succ_empty())
+    return nullptr;
+
+  // Select the successor with the larget edge weight.
+  auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
+  MachineBasicBlock *S = *std::max_element(
+      B.succ_begin(), B.succ_end(),
+      [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) {
+        return Prob.getEdgeProbability(&B, Dst0) <
+               Prob.getEdgeProbability(&B, Dst1);
+      });
+  return S->isEHPad() ? nullptr : S;
+}
+
+std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+  const MipsInstrInfo *TII =
+      MBB.getParent()->getSubtarget<MipsSubtarget>().getInstrInfo();
+  MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  SmallVector<MachineOperand, 2> Cond;
+
+  MipsInstrInfo::BranchType R =
+      TII->analyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+
+  if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
+    return std::make_pair(R, nullptr);
+
+  if (R != MipsInstrInfo::BT_CondUncond) {
+    if (!hasUnoccupiedSlot(BranchInstrs[0]))
+      return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+
+    assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
+
+    return std::make_pair(R, BranchInstrs[0]);
+  }
+
+  assert((TrueBB == &Dst) || (FalseBB == &Dst));
+
+  // Examine the conditional branch. See if its slot is occupied.
+  if (hasUnoccupiedSlot(BranchInstrs[0]))
+    return std::make_pair(MipsInstrInfo::BT_Cond, BranchInstrs[0]);
+
+  // If that fails, try the unconditional branch.
+  if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
+    return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
+
+  return std::make_pair(MipsInstrInfo::BT_None, nullptr);
+}
+
+bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                         RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                         BB2BrMap &BrMap) const {
+  std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
+    getBranch(Pred, Succ);
+
+  // Return if either getBranch wasn't able to analyze the branches or there
+  // were no branches with unoccupied slots.
+  if (P.first == MipsInstrInfo::BT_None)
+    return false;
+
+  if ((P.first != MipsInstrInfo::BT_Uncond) &&
+      (P.first != MipsInstrInfo::BT_NoBranch)) {
+    HasMultipleSuccs = true;
+    RegDU.addLiveOut(Pred, Succ);
+  }
+
+  BrMap[&Pred] = P.second;
+  return true;
+}
+
+bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                            InspectMemInstr &IM) const {
+  assert(!Candidate.isKill() &&
+         "KILL instructions should have been eliminated at this point.");
+
+  bool HasHazard = Candidate.isImplicitDef();
+
+  HasHazard |= IM.hasHazard(Candidate);
+  HasHazard |= RegDU.update(Candidate, 0, Candidate.getNumOperands());
+
+  return HasHazard;
+}
+
+bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+  return (Candidate.isTerminator() || Candidate.isCall() ||
+          Candidate.isPosition() || Candidate.isInlineAsm() ||
+          Candidate.hasUnmodeledSideEffects());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
new file mode 100644
index 000000000000..8c3024810d27
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td
@@ -0,0 +1,84 @@
+//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+                    PredicateControl, StdArch {
+  let DecoderNamespace = "Mips";
+  let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA
+def OPCODE6_LBE        : OPCODE6<0b101100>;
+def OPCODE6_LBuE       : OPCODE6<0b101000>;
+def OPCODE6_LHE        : OPCODE6<0b101101>;
+def OPCODE6_LHuE       : OPCODE6<0b101001>;
+def OPCODE6_LWE        : OPCODE6<0b101111>;
+
+def OPCODE6_SBE        : OPCODE6<0b011100>;
+def OPCODE6_SHE        : OPCODE6<0b011101>;
+def OPCODE6_SWE        : OPCODE6<0b011111>;
+
+// load/store left/right EVA
+def OPCODE6_LWLE       : OPCODE6<0b011001>;
+def OPCODE6_LWRE       : OPCODE6<0b011010>;
+def OPCODE6_SWLE       : OPCODE6<0b100001>;
+def OPCODE6_SWRE       : OPCODE6<0b100010>;
+
+// Load-linked EVA, Store-conditional EVA
+def OPCODE6_LLE        : OPCODE6<0b101110>;
+def OPCODE6_SCE        : OPCODE6<0b011110>;
+
+def OPCODE6_TLBINV     : OPCODE6<0b000011>;
+def OPCODE6_TLBINVF    : OPCODE6<0b000100>;
+
+def OPCODE6_CACHEE     : OPCODE6<0b011011>;
+def OPCODE6_PREFE      : OPCODE6<0b100011>;
+
+def OPGROUP_COP0_TLB   : OPGROUP<0b010000>;
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<9> offset = addr{8-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = base;
+  let Inst{20-16} = hint;
+  let Inst{15-7}  = offset;
+  let Inst{6}     = 0;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class TLB_FM<OPCODE6 Operation> : MipsEVAInst {
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP0_TLB.Value;
+  let Inst{25} = 1;       // CO
+  let Inst{24-6} = 0;
+  let Inst{5-0} = Operation.Value;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
new file mode 100644
index 000000000000..26df263d228b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td
@@ -0,0 +1,209 @@
+//===- MipsEVAInstrInfo.td - EVA ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips EVA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction encodings
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA encodings
+class LBE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>;
+class LBuE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>;
+class LHE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>;
+class LHuE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>;
+class LWE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>;
+
+class SBE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>;
+class SHE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>;
+class SWE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>;
+
+// load/store left/right EVA encodings
+class LWLE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>;
+class LWRE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>;
+class SWLE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>;
+class SWRE_ENC    : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>;
+
+// Load-linked EVA, Store-conditional EVA encodings
+class LLE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>;
+class SCE_ENC     : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>;
+
+class TLBINV_ENC  : TLB_FM<OPCODE6_TLBINV>;
+class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>;
+
+class CACHEE_ENC  : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>;
+class PREFE_ENC   : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction descriptions
+//
+//===----------------------------------------------------------------------===//
+
+// Memory Load/Store EVA descriptions
+class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeMemEVA";
+  bit canFoldAsLoad = 1;
+  bit mayLoad = 1;
+  InstrItinClass Itinerary = itin;
+}
+
+class LBE_DESC  : LOAD_EVA_DESC_BASE<"lbe",  GPR32Opnd, II_LBE>;
+class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd, II_LBUE>;
+class LHE_DESC  : LOAD_EVA_DESC_BASE<"lhe",  GPR32Opnd, II_LHE>;
+class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd, II_LHUE>;
+class LWE_DESC  : LOAD_EVA_DESC_BASE<"lwe",  GPR32Opnd, II_LWE>;
+
+class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                          SDPatternOperator OpNode = null_frag,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeMemEVA";
+  bit mayStore = 1;
+  InstrItinClass Itinerary = itin;
+}
+
+class SBE_DESC  : STORE_EVA_DESC_BASE<"sbe",  GPR32Opnd, null_frag, II_SBE>;
+class SHE_DESC  : STORE_EVA_DESC_BASE<"she",  GPR32Opnd, null_frag, II_SHE>;
+class SWE_DESC  : STORE_EVA_DESC_BASE<"swe",  GPR32Opnd, null_frag, II_SWE>;
+
+// Load/Store Left/Right EVA descriptions
+class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                                    InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeMemEVA";
+  string Constraints = "$src = $rt";
+  bit canFoldAsLoad = 1;
+  InstrItinClass Itinerary = itin;
+}
+
+class LWLE_DESC  : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle",  GPR32Opnd, II_LWLE>;
+class LWRE_DESC  : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre",  GPR32Opnd, II_LWRE>;
+
+class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                                     InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeMemEVA";
+  InstrItinClass Itinerary = itin;
+}
+
+class SWLE_DESC  : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle",  GPR32Opnd, II_SWLE>;
+class SWRE_DESC  : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre",  GPR32Opnd, II_SWRE>;
+
+// Load-linked EVA, Store-conditional EVA descriptions
+class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                    InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs GPROpnd:$rt);
+  dag InOperandList = (ins mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayLoad = 1;
+  string DecoderMethod = "DecodeMemEVA";
+  InstrItinClass Itinerary = itin;
+}
+
+class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd, II_LLE>;
+
+class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                    InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs GPROpnd:$dst);
+  dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
+  list<dag> Pattern = [];
+  bit mayStore = 1;
+  string Constraints = "$rt = $dst";
+  string DecoderMethod = "DecodeMemEVA";
+  InstrItinClass Itinerary = itin;
+}
+
+class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd, II_SCE>;
+
+class TLB_DESC_BASE<string instr_asm, InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins);
+  string AsmString = instr_asm;
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = itin;
+}
+
+class TLBINV_DESC  : TLB_DESC_BASE<"tlbinv", II_TLBINV>;
+class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf", II_TLBINVF>;
+
+class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins  MemOpnd:$addr, uimm5:$hint);
+  string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
+  list<dag> Pattern = [];
+  string DecoderMethod = "DecodeCacheeOp_CacheOpR6";
+  InstrItinClass Itinerary = itin;
+}
+
+class CACHEE_DESC  : CACHEE_DESC_BASE<"cachee", mem_simm9, II_CACHEE>;
+class PREFE_DESC   : CACHEE_DESC_BASE<"prefe", mem_simm9, II_PREFE>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction definitions
+//
+//===----------------------------------------------------------------------===//
+
+/// Load and Store EVA Instructions
+def LBE     : LBE_ENC, LBE_DESC, INSN_EVA;
+def LBuE    : LBuE_ENC, LBuE_DESC, INSN_EVA;
+def LHE     : LHE_ENC, LHE_DESC, INSN_EVA;
+def LHuE    : LHuE_ENC, LHuE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWE     : LWE_ENC, LWE_DESC, INSN_EVA;
+}
+def SBE     : SBE_ENC, SBE_DESC, INSN_EVA;
+def SHE     : SHE_ENC, SHE_DESC, INSN_EVA;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SWE     : SWE_ENC, SWE_DESC, INSN_EVA;
+}
+
+/// load/store left/right EVA
+let AdditionalPredicates = [NotInMicroMips] in {
+def LWLE    : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def LWRE    : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWLE    : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6;
+def SWRE    : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6;
+}
+
+/// Load-linked EVA, Store-conditional EVA
+let AdditionalPredicates = [NotInMicroMips] in {
+def LLE     : LLE_ENC, LLE_DESC, INSN_EVA;
+def SCE     : SCE_ENC, SCE_DESC, INSN_EVA;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def TLBINV  : TLBINV_ENC, TLBINV_DESC, INSN_EVA;
+  def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA;
+}
+
+def CACHEE  : CACHEE_ENC, CACHEE_DESC, INSN_EVA;
+def PREFE   : PREFE_ENC, PREFE_DESC, INSN_EVA;
diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
new file mode 100644
index 000000000000..29f3e2c07e04
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -0,0 +1,2081 @@
+//===-- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the MIPS-specific support for the FastISel class.
+/// Some of the target-specific code is generated by tablegen in the file
+/// MipsGenFastISel.inc, which is #included here.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MipsCCState.h"
+#include "MipsInstrInfo.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "mips-fastisel"
+
+using namespace llvm;
+
+namespace {
+
+class MipsFastISel final : public FastISel {
+
+  // All possible address modes.
+  class Address {
+  public:
+    typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+  private:
+    BaseKind Kind;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int64_t Offset;
+
+    const GlobalValue *GV;
+
+  public:
+    // Innocuous defaults for our address.
+    Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
+    void setOffset(int64_t Offset_) { Offset = Offset_; }
+    int64_t getOffset() const { return Offset; }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() { return GV; }
+  };
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const TargetMachine &TM;
+  const MipsSubtarget *Subtarget;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  MipsFunctionInfo *MFI;
+
+  // Convenience variables to avoid some queries.
+  LLVMContext *Context;
+
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+  bool UnsupportedFPMode; // To allow fast-isel to proceed and just not handle
+  // floating point but not reject doing fast-isel in other
+  // situations
+
+private:
+  // Selection routines.
+  bool selectLogicalOp(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool IsSigned);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
+  bool selectShift(const Instruction *I);
+  bool selectDivRem(const Instruction *I, unsigned ISDOpcode);
+
+  // Utility helper routines.
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isTypeSupported(Type *Ty, MVT &VT);
+  bool isLoadTypeLegal(Type *Ty, MVT &VT);
+  bool computeAddress(const Value *Obj, Address &Addr);
+  bool computeCallAddress(const Value *V, Address &Addr);
+  void simplifyAddress(Address &Addr);
+
+  // Emit helper routines.
+  bool emitCmp(unsigned DestReg, const CmpInst *CI);
+  bool emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                unsigned Alignment = 0);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  bool emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg,
+
+                  bool IsZExt);
+  bool emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+
+  bool emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+  bool emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+  bool emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+
+  unsigned getRegEnsuringSimpleIntegerWidening(const Value *, bool IsUnsigned);
+
+  unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+                         const Value *RHS);
+
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV, MVT VT);
+  unsigned materializeInt(const Constant *C, MVT VT);
+  unsigned materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+  unsigned materializeExternalCallSym(MCSymbol *Syn);
+
+  MachineInstrBuilder emitInst(unsigned Opc) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  }
+  MachineInstrBuilder emitInst(unsigned Opc, unsigned DstReg) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                   DstReg);
+  }
+  MachineInstrBuilder emitInstStore(unsigned Opc, unsigned SrcReg,
+                                    unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
+  }
+  MachineInstrBuilder emitInstLoad(unsigned Opc, unsigned DstReg,
+                                   unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
+  }
+
+  unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC,
+                           unsigned Op0, bool Op0IsKill,
+                           unsigned Op1, bool Op1IsKill);
+
+  // for some reason, this default is not generated by tablegen
+  // so we explicitly generate it here.
+  //
+  unsigned fastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill, uint64_t imm1,
+                             uint64_t imm2, unsigned Op3, bool Op3IsKill) {
+    return 0;
+  }
+
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
+                       unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+  const MipsABIInfo &getABI() const {
+    return static_cast<const MipsTargetMachine &>(TM).getABI();
+  }
+
+public:
+  // Backend specific FastISel code.
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo), TM(funcInfo.MF->getTarget()),
+        Subtarget(&funcInfo.MF->getSubtarget<MipsSubtarget>()),
+        TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) {
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    UnsupportedFPMode = Subtarget->isFP64bit() || Subtarget->useSoftFloat();
+  }
+
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "MipsGenFastISel.inc"
+};
+} // end anonymous namespace.
+
+static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                    CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  llvm_unreachable("should not be called");
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  llvm_unreachable("should not be called");
+}
+
+#include "MipsGenCallingConv.inc"
+
+CCAssignFn *MipsFastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+  return CC_MipsO32;
+}
+
+unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+                                     const Value *LHS, const Value *RHS) {
+  // Canonicalize immediates to the RHS first.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+    std::swap(LHS, RHS);
+
+  unsigned Opc;
+  switch (ISDOpc) {
+  case ISD::AND:
+    Opc = Mips::AND;
+    break;
+  case ISD::OR:
+    Opc = Mips::OR;
+    break;
+  case ISD::XOR:
+    Opc = Mips::XOR;
+    break;
+  default:
+    llvm_unreachable("unexpected opcode");
+  }
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return 0;
+
+  unsigned RHSReg;
+  if (const auto *C = dyn_cast<ConstantInt>(RHS))
+    RHSReg = materializeInt(C, MVT::i32);
+  else
+    RHSReg = getRegForValue(RHS);
+  if (!RHSReg)
+    return 0;
+
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!ResultReg)
+    return 0;
+
+  emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg);
+  return ResultReg;
+}
+
+unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i32 &&
+         "Alloca should always return a pointer.");
+
+  DenseMap<const AllocaInst *, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LEA_ADDiu),
+            ResultReg)
+        .addFrameIndex(SI->second)
+        .addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  return materialize32BitInt(CI->getZExtValue(), RC);
+}
+
+unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  unsigned ResultReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    unsigned Opc = Mips::ADDiu;
+    emitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  } else if (isUInt<16>(Imm)) {
+    emitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  }
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+  if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    emitInst(Mips::LUi, TmpReg).addImm(Hi);
+    emitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
+  } else {
+    emitInst(Mips::LUi, ResultReg).addImm(Hi);
+  }
+  return ResultReg;
+}
+
+unsigned MipsFastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  if (UnsupportedFPMode)
+    return 0;
+  int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+  if (VT == MVT::f32) {
+    const TargetRegisterClass *RC = &Mips::FGR32RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg = materialize32BitInt(Imm, &Mips::GPR32RegClass);
+    emitInst(Mips::MTC1, DestReg).addReg(TempReg);
+    return DestReg;
+  } else if (VT == MVT::f64) {
+    const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg1 = materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
+    unsigned TempReg2 =
+        materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
+    emitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1);
+    return DestReg;
+  }
+  return 0;
+}
+
+unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  // TLS not supported at this time.
+  if (IsThreadLocal)
+    return 0;
+  emitInst(Mips::LW, DestReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+  if ((GV->hasInternalLinkage() ||
+       (GV->hasLocalLinkage() && !isa<Function>(GV)))) {
+    unsigned TempReg = createResultReg(RC);
+    emitInst(Mips::ADDiu, TempReg)
+        .addReg(DestReg)
+        .addGlobalAddress(GV, 0, MipsII::MO_ABS_LO);
+    DestReg = TempReg;
+  }
+  return DestReg;
+}
+
+unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  emitInst(Mips::LW, DestReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addSym(Sym, MipsII::MO_GOT);
+  return DestReg;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return (UnsupportedFPMode) ? 0 : materializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return materializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return materializeInt(C, VT);
+
+  return 0;
+}
+
+bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast: {
+    // Look through bitcasts.
+    return computeAddress(U->getOperand(0), Addr);
+  }
+  case Instruction::GetElementPtr: {
+    Address SavedAddr = Addr;
+    int64_t TmpOffset = Addr.getOffset();
+    // Iterate through the GEP folding the constants into offsets where
+    // we can.
+    gep_type_iterator GTI = gep_type_begin(U);
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
+         ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = GTI.getStructTypeOrNull()) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        TmpOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+        for (;;) {
+          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+            // Constant-offset addressing.
+            TmpOffset += CI->getSExtValue() * S;
+            break;
+          }
+          if (canFoldAddIntoGEP(U, Op)) {
+            // A compatible add with a constant operand. Fold the constant.
+            ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+            TmpOffset += CI->getSExtValue() * S;
+            // Iterate on the other operand.
+            Op = cast<AddOperator>(Op)->getOperand(0);
+            continue;
+          }
+          // Unsupported
+          goto unsupported_gep;
+        }
+      }
+    }
+    // Try to grab the base operand now.
+    Addr.setOffset(TmpOffset);
+    if (computeAddress(U->getOperand(0), Addr))
+      return true;
+    // We failed, restore everything and try the other options.
+    Addr = SavedAddr;
+  unsupported_gep:
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst *AI = cast<AllocaInst>(Obj);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      Addr.setKind(Address::FrameIndexBase);
+      Addr.setFI(SI->second);
+      return true;
+    }
+    break;
+  }
+  }
+  Addr.setReg(getRegForValue(Obj));
+  return Addr.getReg() != 0;
+}
+
+bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+
+  if (const auto *I = dyn_cast<Instruction>(V)) {
+    // Check if the value is defined in the same basic block. This information
+    // is crucial to know whether or not folding an operand is valid.
+    if (I->getParent() == FuncInfo.MBB->getBasicBlock()) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB.
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+        TLI.getPointerTy(DL))
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  }
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    Addr.setGlobalValue(GV);
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!Addr.getGlobalValue()) {
+    Addr.setReg(getRegForValue(V));
+    return Addr.getReg() != 0;
+  }
+
+  return false;
+}
+
+bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(DL, Ty, true);
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool MipsFastISel::isTypeSupported(Type *Ty, MVT &VT) {
+  if (Ty->isVectorTy())
+    return false;
+
+  if (isTypeLegal(Ty, VT))
+    return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+  // We will extend this in a later patch:
+  //   If this is a type than can be sign or zero-extended to a basic operation
+  //   go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16)
+    return true;
+  return false;
+}
+// Because of how EmitCmp is called with fast-isel, you can
+// end up with redundant "andi" instructions after the sequences emitted below.
+// We should try and solve this issue in the future.
+//
+bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
+  const Value *Left = CI->getOperand(0), *Right = CI->getOperand(1);
+  bool IsUnsigned = CI->isUnsigned();
+  unsigned LeftReg = getRegEnsuringSimpleIntegerWidening(Left, IsUnsigned);
+  if (LeftReg == 0)
+    return false;
+  unsigned RightReg = getRegEnsuringSimpleIntegerWidening(Right, IsUnsigned);
+  if (RightReg == 0)
+    return false;
+  CmpInst::Predicate P = CI->getPredicate();
+
+  switch (P) {
+  default:
+    return false;
+  case CmpInst::ICMP_EQ: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::SLTiu, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_NE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
+    break;
+  }
+  case CmpInst::ICMP_UGT: {
+    emitInst(Mips::SLTu, ResultReg).addReg(RightReg).addReg(LeftReg);
+    break;
+  }
+  case CmpInst::ICMP_ULT: {
+    emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
+    break;
+  }
+  case CmpInst::ICMP_UGE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_ULE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLTu, TempReg).addReg(RightReg).addReg(LeftReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_SGT: {
+    emitInst(Mips::SLT, ResultReg).addReg(RightReg).addReg(LeftReg);
+    break;
+  }
+  case CmpInst::ICMP_SLT: {
+    emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
+    break;
+  }
+  case CmpInst::ICMP_SGE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_SLE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLT, TempReg).addReg(RightReg).addReg(LeftReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UNE:
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_OGE: {
+    if (UnsupportedFPMode)
+      return false;
+    bool IsFloat = Left->getType()->isFloatTy();
+    bool IsDouble = Left->getType()->isDoubleTy();
+    if (!IsFloat && !IsDouble)
+      return false;
+    unsigned Opc, CondMovOpc;
+    switch (P) {
+    case CmpInst::FCMP_OEQ:
+      Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_UNE:
+      Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    case CmpInst::FCMP_OLT:
+      Opc = IsFloat ? Mips::C_OLT_S : Mips::C_OLT_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_OLE:
+      Opc = IsFloat ? Mips::C_OLE_S : Mips::C_OLE_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_OGT:
+      Opc = IsFloat ? Mips::C_ULE_S : Mips::C_ULE_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    case CmpInst::FCMP_OGE:
+      Opc = IsFloat ? Mips::C_ULT_S : Mips::C_ULT_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    default:
+      llvm_unreachable("Only switching of a subset of CCs.");
+    }
+    unsigned RegWithZero = createResultReg(&Mips::GPR32RegClass);
+    unsigned RegWithOne = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::ADDiu, RegWithZero).addReg(Mips::ZERO).addImm(0);
+    emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1);
+    emitInst(Opc).addReg(LeftReg).addReg(RightReg).addReg(
+        Mips::FCC0, RegState::ImplicitDefine);
+    emitInst(CondMovOpc, ResultReg)
+        .addReg(RegWithOne)
+        .addReg(Mips::FCC0)
+        .addReg(RegWithZero);
+    break;
+  }
+  }
+  return true;
+}
+bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                            unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  unsigned Opc;
+  switch (VT.SimpleTy) {
+  case MVT::i32: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LW;
+    break;
+  }
+  case MVT::i16: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LHu;
+    break;
+  }
+  case MVT::i8: {
+    ResultReg = createResultReg(&Mips::GPR32RegClass);
+    Opc = Mips::LBu;
+    break;
+  }
+  case MVT::f32: {
+    if (UnsupportedFPMode)
+      return false;
+    ResultReg = createResultReg(&Mips::FGR32RegClass);
+    Opc = Mips::LWC1;
+    break;
+  }
+  case MVT::f64: {
+    if (UnsupportedFPMode)
+      return false;
+    ResultReg = createResultReg(&Mips::AFGR64RegClass);
+    Opc = Mips::LDC1;
+    break;
+  }
+  default:
+    return false;
+  }
+  if (Addr.isRegBase()) {
+    simplifyAddress(Addr);
+    emitInstLoad(Opc, ResultReg, Addr.getReg(), Addr.getOffset());
+    return true;
+  }
+  if (Addr.isFIBase()) {
+    unsigned FI = Addr.getFI();
+    unsigned Align = 4;
+    int64_t Offset = Addr.getOffset();
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    MachineMemOperand *MMO = MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+        MFI.getObjectSize(FI), Align);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+    return true;
+  }
+  return false;
+}
+
+bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                             unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  unsigned Opc;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opc = Mips::SB;
+    break;
+  case MVT::i16:
+    Opc = Mips::SH;
+    break;
+  case MVT::i32:
+    Opc = Mips::SW;
+    break;
+  case MVT::f32:
+    if (UnsupportedFPMode)
+      return false;
+    Opc = Mips::SWC1;
+    break;
+  case MVT::f64:
+    if (UnsupportedFPMode)
+      return false;
+    Opc = Mips::SDC1;
+    break;
+  default:
+    return false;
+  }
+  if (Addr.isRegBase()) {
+    simplifyAddress(Addr);
+    emitInstStore(Opc, SrcReg, Addr.getReg(), Addr.getOffset());
+    return true;
+  }
+  if (Addr.isFIBase()) {
+    unsigned FI = Addr.getFI();
+    unsigned Align = 4;
+    int64_t Offset = Addr.getOffset();
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    MachineMemOperand *MMO = MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+        MFI.getObjectSize(FI), Align);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+    return true;
+  }
+  return false;
+}
+
+bool MipsFastISel::selectLogicalOp(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT))
+    return false;
+
+  unsigned ResultReg;
+  switch (I->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  case Instruction::And:
+    ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Or:
+    ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Xor:
+    ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  }
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool MipsFastISel::selectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!computeAddress(I->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg;
+  if (!emitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool MipsFastISel::selectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!computeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!emitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+//
+// This can cause a redundant sltiu to be generated.
+// FIXME: try and eliminate this in a future patch.
+//
+bool MipsFastISel::selectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  //
+  // TBB is the basic block for the case where the comparison is true.
+  // FBB is the basic block for the case where the comparison is false.
+  // if (cond) goto TBB
+  // goto FBB
+  // TBB:
+  //
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+  BI->getCondition();
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    unsigned CondReg = createResultReg(&Mips::GPR32RegClass);
+    if (!emitCmp(CondReg, CI))
+      return false;
+    BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+        .addReg(CondReg)
+        .addMBB(TBB);
+    finishCondBranch(BI->getParent(), TBB, FBB);
+    return true;
+  }
+  return false;
+}
+
+bool MipsFastISel::selectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!emitCmp(ResultReg, CI))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool MipsFastISel::selectFPExt(const Instruction *I) {
+  if (UnsupportedFPMode)
+    return false;
+  Value *Src = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg =
+      getRegForValue(Src); // this must be a 32bit floating point register class
+                           // maybe we should handle this differently
+  if (!SrcReg)
+    return false;
+
+  unsigned DestReg = createResultReg(&Mips::AFGR64RegClass);
+  emitInst(Mips::CVT_D32_S, DestReg).addReg(SrcReg);
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+bool MipsFastISel::selectSelect(const Instruction *I) {
+  assert(isa<SelectInst>(I) && "Expected a select instruction.");
+
+  DEBUG(dbgs() << "selectSelect\n");
+
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT) || UnsupportedFPMode) {
+    DEBUG(dbgs() << ".. .. gave up (!isTypeSupported || UnsupportedFPMode)\n");
+    return false;
+  }
+
+  unsigned CondMovOpc;
+  const TargetRegisterClass *RC;
+
+  if (VT.isInteger() && !VT.isVector() && VT.getSizeInBits() <= 32) {
+    CondMovOpc = Mips::MOVN_I_I;
+    RC = &Mips::GPR32RegClass;
+  } else if (VT == MVT::f32) {
+    CondMovOpc = Mips::MOVN_I_S;
+    RC = &Mips::FGR32RegClass;
+  } else if (VT == MVT::f64) {
+    CondMovOpc = Mips::MOVN_I_D32;
+    RC = &Mips::AFGR64RegClass;
+  } else
+    return false;
+
+  const SelectInst *SI = cast<SelectInst>(I);
+  const Value *Cond = SI->getCondition();
+  unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+  unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+  unsigned CondReg = getRegForValue(Cond);
+
+  if (!Src1Reg || !Src2Reg || !CondReg)
+    return false;
+
+  unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+  if (!ZExtCondReg)
+    return false;
+
+  if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true))
+    return false;
+
+  unsigned ResultReg = createResultReg(RC);
+  unsigned TempReg = createResultReg(RC);
+
+  if (!ResultReg || !TempReg)
+    return false;
+
+  emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg);
+  emitInst(CondMovOpc, ResultReg)
+    .addReg(Src1Reg).addReg(ZExtCondReg).addReg(TempReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool MipsFastISel::selectFPTrunc(const Instruction *I) {
+  if (UnsupportedFPMode)
+    return false;
+  Value *Src = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  unsigned DestReg = createResultReg(&Mips::FGR32RegClass);
+  if (!DestReg)
+    return false;
+
+  emitInst(Mips::CVT_S_D32, DestReg).addReg(SrcReg);
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) {
+  if (UnsupportedFPMode)
+    return false;
+  MVT DstVT, SrcVT;
+  if (!IsSigned)
+    return false; // We don't handle this case yet. There is no native
+                  // instruction for this but it can be synthesized.
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+  unsigned TempReg = createResultReg(&Mips::FGR32RegClass);
+  unsigned Opc = (SrcVT == MVT::f32) ? Mips::TRUNC_W_S : Mips::TRUNC_W_D32;
+
+  // Generate the convert.
+  emitInst(Opc, TempReg).addReg(SrcReg);
+  emitInst(Mips::MFC1, DestReg).addReg(TempReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
+                                   SmallVectorImpl<MVT> &OutVTs,
+                                   unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+  // This is the minimum argument area used for A0-A3.
+  if (NumBytes < 16)
+    NumBytes = 16;
+
+  emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+  // Process the args.
+  MVT firstMVT;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (i == 0) {
+      firstMVT = ArgVT;
+      if (ArgVT == MVT::f32) {
+        VA.convertToReg(Mips::F12);
+      } else if (ArgVT == MVT::f64) {
+        VA.convertToReg(Mips::D6);
+      }
+    } else if (i == 1) {
+      if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) {
+        if (ArgVT == MVT::f32) {
+          VA.convertToReg(Mips::F14);
+        } else if (ArgVT == MVT::f64) {
+          VA.convertToReg(Mips::D7);
+        }
+      }
+    }
+    if (((ArgVT == MVT::i32) || (ArgVT == MVT::f32) || (ArgVT == MVT::i16) ||
+         (ArgVT == MVT::i8)) &&
+        VA.isMemLoc()) {
+      switch (VA.getLocMemOffset()) {
+      case 0:
+        VA.convertToReg(Mips::A0);
+        break;
+      case 4:
+        VA.convertToReg(Mips::A1);
+        break;
+      case 8:
+        VA.convertToReg(Mips::A2);
+        break;
+      case 12:
+        VA.convertToReg(Mips::A3);
+        break;
+      default:
+        break;
+      }
+    }
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
+
+    // Handle arg promotion: SExt, ZExt, AExt.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::AExt:
+    case CCValAssign::SExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    case CCValAssign::ZExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      CLI.OutRegs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      llvm_unreachable("Mips does not use custom args.");
+      return false;
+    } else {
+      //
+      // FIXME: This path will currently return false. It was copied
+      // from the AArch64 port and should be essentially fine for Mips too.
+      // The work to finish up this path will be done in a follow-on patch.
+      //
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
+      // Need to store on the stack.
+      // FIXME: This alignment is incorrect but this path is disabled
+      // for now (will return false). We need to determine the right alignment
+      // based on the normal alignment for the underlying machine type.
+      //
+      unsigned ArgSize = alignTo(ArgVT.getSizeInBits(), 4);
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittle())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(Mips::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+          MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
+          MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+      (void)(MMO);
+      // if (!emitStore(ArgVT, ArgReg, Addr, MMO))
+      return false; // can't store on the stack yet.
+    }
+  }
+
+  return true;
+}
+
+bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                              unsigned NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+  emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0);
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips);
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    // Special handling for extended integers.
+    if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+      CopyVT = MVT::i32;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    if (!ResultReg)
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    CLI.InRegs.push_back(RVLocs[0].getLocReg());
+
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
+  }
+  return true;
+}
+
+bool MipsFastISel::fastLowerArguments() {
+  DEBUG(dbgs() << "fastLowerArguments\n");
+
+  if (!FuncInfo.CanLowerReturn) {
+    DEBUG(dbgs() << ".. gave up (!CanLowerReturn)\n");
+    return false;
+  }
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg()) {
+    DEBUG(dbgs() << ".. gave up (varargs)\n");
+    return false;
+  }
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C) {
+    DEBUG(dbgs() << ".. gave up (calling convention is not C)\n");
+    return false;
+  }
+
+  const ArrayRef<MCPhysReg> GPR32ArgRegs = {Mips::A0, Mips::A1, Mips::A2,
+                                            Mips::A3};
+  const ArrayRef<MCPhysReg> FGR32ArgRegs = {Mips::F12, Mips::F14};
+  const ArrayRef<MCPhysReg> AFGR64ArgRegs = {Mips::D6, Mips::D7};
+  ArrayRef<MCPhysReg>::iterator NextGPR32 = GPR32ArgRegs.begin();
+  ArrayRef<MCPhysReg>::iterator NextFGR32 = FGR32ArgRegs.begin();
+  ArrayRef<MCPhysReg>::iterator NextAFGR64 = AFGR64ArgRegs.begin();
+
+  struct AllocatedReg {
+    const TargetRegisterClass *RC;
+    unsigned Reg;
+    AllocatedReg(const TargetRegisterClass *RC, unsigned Reg)
+        : RC(RC), Reg(Reg) {}
+  };
+
+  // Only handle simple cases. i.e. All arguments are directly mapped to
+  // registers of the appropriate type.
+  SmallVector<AllocatedReg, 4> Allocation;
+  unsigned Idx = 1;
+  for (const auto &FormalArg : F->args()) {
+    if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) {
+      DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
+      return false;
+    }
+
+    Type *ArgTy = FormalArg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) {
+      DEBUG(dbgs() << ".. gave up (struct, array, or vector)\n");
+      return false;
+    }
+
+    EVT ArgVT = TLI.getValueType(DL, ArgTy);
+    DEBUG(dbgs() << ".. " << (Idx - 1) << ": " << ArgVT.getEVTString() << "\n");
+    if (!ArgVT.isSimple()) {
+      DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
+      return false;
+    }
+
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      if (!F->getAttributes().hasAttribute(Idx, Attribute::SExt) &&
+          !F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+        // It must be any extend, this shouldn't happen for clang-generated IR
+        // so just fall back on SelectionDAG.
+        DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
+        return false;
+      }
+
+      if (NextGPR32 == GPR32ArgRegs.end()) {
+        DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+        return false;
+      }
+
+      DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+      Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
+
+      // Allocating any GPR32 prohibits further use of floating point arguments.
+      NextFGR32 = FGR32ArgRegs.end();
+      NextAFGR64 = AFGR64ArgRegs.end();
+      break;
+
+    case MVT::i32:
+      if (F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+        // The O32 ABI does not permit a zero-extended i32.
+        DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
+        return false;
+      }
+
+      if (NextGPR32 == GPR32ArgRegs.end()) {
+        DEBUG(dbgs() << ".. .. gave up (ran out of GPR32 arguments)\n");
+        return false;
+      }
+
+      DEBUG(dbgs() << ".. .. GPR32(" << *NextGPR32 << ")\n");
+      Allocation.emplace_back(&Mips::GPR32RegClass, *NextGPR32++);
+
+      // Allocating any GPR32 prohibits further use of floating point arguments.
+      NextFGR32 = FGR32ArgRegs.end();
+      NextAFGR64 = AFGR64ArgRegs.end();
+      break;
+
+    case MVT::f32:
+      if (UnsupportedFPMode) {
+        DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+        return false;
+      }
+      if (NextFGR32 == FGR32ArgRegs.end()) {
+        DEBUG(dbgs() << ".. .. gave up (ran out of FGR32 arguments)\n");
+        return false;
+      }
+      DEBUG(dbgs() << ".. .. FGR32(" << *NextFGR32 << ")\n");
+      Allocation.emplace_back(&Mips::FGR32RegClass, *NextFGR32++);
+      // Allocating an FGR32 also allocates the super-register AFGR64, and
+      // ABI rules require us to skip the corresponding GPR32.
+      if (NextGPR32 != GPR32ArgRegs.end())
+        NextGPR32++;
+      if (NextAFGR64 != AFGR64ArgRegs.end())
+        NextAFGR64++;
+      break;
+
+    case MVT::f64:
+      if (UnsupportedFPMode) {
+        DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode)\n");
+        return false;
+      }
+      if (NextAFGR64 == AFGR64ArgRegs.end()) {
+        DEBUG(dbgs() << ".. .. gave up (ran out of AFGR64 arguments)\n");
+        return false;
+      }
+      DEBUG(dbgs() << ".. .. AFGR64(" << *NextAFGR64 << ")\n");
+      Allocation.emplace_back(&Mips::AFGR64RegClass, *NextAFGR64++);
+      // Allocating an FGR32 also allocates the super-register AFGR64, and
+      // ABI rules require us to skip the corresponding GPR32 pair.
+      if (NextGPR32 != GPR32ArgRegs.end())
+        NextGPR32++;
+      if (NextGPR32 != GPR32ArgRegs.end())
+        NextGPR32++;
+      if (NextFGR32 != FGR32ArgRegs.end())
+        NextFGR32++;
+      break;
+
+    default:
+      DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
+      return false;
+    }
+
+    ++Idx;
+  }
+
+  Idx = 0;
+  for (const auto &FormalArg : F->args()) {
+    unsigned SrcReg = Allocation[Idx].Reg;
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[Idx].RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(Allocation[Idx].RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(DstReg, getKillRegState(true));
+    updateValueMap(&FormalArg, ResultReg);
+    ++Idx;
+  }
+
+  // Calculate the size of the incoming arguments area.
+  // We currently reject all the cases where this would be non-zero.
+  unsigned IncomingArgSizeInBytes = 0;
+
+  // Account for the reserved argument area on ABI's that have one (O32).
+  // It seems strange to do this on the caller side but it's necessary in
+  // SelectionDAG's implementation.
+  IncomingArgSizeInBytes = std::min(getABI().GetCalleeAllocdArgSizeInBytes(CC),
+                                    IncomingArgSizeInBytes);
+
+  MF->getInfo<MipsFunctionInfo>()->setFormalArgInfo(IncomingArgSizeInBytes,
+                                                    false);
+
+  return true;
+}
+
+bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC = CLI.CallConv;
+  bool IsTailCall = CLI.IsTailCall;
+  bool IsVarArg = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  MCSymbol *Symbol = CLI.Symbol;
+
+  // Do not handle FastCC.
+  if (CC == CallingConv::Fast)
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (IsVarArg)
+    return false;
+
+  // FIXME: Only handle *simple* calls for now.
+  MVT RetVT;
+  if (CLI.RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeSupported(CLI.RetTy, RetVT))
+    return false;
+
+  for (auto Flag : CLI.OutFlags)
+    if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+      return false;
+
+  // Set up the argument vectors.
+  SmallVector<MVT, 16> OutVTs;
+  OutVTs.reserve(CLI.OutVals.size());
+
+  for (auto *Val : CLI.OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT) &&
+        !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
+      return false;
+
+    // We don't handle vector parameters yet.
+    if (VT.isVector() || VT.getSizeInBits() > 64)
+      return false;
+
+    OutVTs.push_back(VT);
+  }
+
+  Address Addr;
+  if (!computeCallAddress(Callee, Addr))
+    return false;
+
+  // Handle the arguments now that we've gotten them.
+  unsigned NumBytes;
+  if (!processCallArgs(CLI, OutVTs, NumBytes))
+    return false;
+
+  if (!Addr.getGlobalValue())
+    return false;
+
+  // Issue the call.
+  unsigned DestAddress;
+  if (Symbol)
+    DestAddress = materializeExternalCallSym(Symbol);
+  else
+    DestAddress = materializeGV(Addr.getGlobalValue(), MVT::i32);
+  emitInst(TargetOpcode::COPY, Mips::T9).addReg(DestAddress);
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::JALR),
+              Mips::RA).addReg(Mips::T9);
+
+  // Add implicit physical register uses to the call.
+  for (auto Reg : CLI.OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(CLI, RetVT, NumBytes);
+}
+
+bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::bswap: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeSupported(RetTy, VT))
+      return false;
+
+    unsigned SrcReg = getRegForValue(II->getOperand(0));
+    if (SrcReg == 0)
+      return false;
+    unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+    if (DestReg == 0)
+      return false;
+    if (VT == MVT::i16) {
+      if (Subtarget->hasMips32r2()) {
+        emitInst(Mips::WSBH, DestReg).addReg(SrcReg);
+        updateValueMap(II, DestReg);
+        return true;
+      } else {
+        unsigned TempReg[3];
+        for (int i = 0; i < 3; i++) {
+          TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+          if (TempReg[i] == 0)
+            return false;
+        }
+        emitInst(Mips::SLL, TempReg[0]).addReg(SrcReg).addImm(8);
+        emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(8);
+        emitInst(Mips::OR, TempReg[2]).addReg(TempReg[0]).addReg(TempReg[1]);
+        emitInst(Mips::ANDi, DestReg).addReg(TempReg[2]).addImm(0xFFFF);
+        updateValueMap(II, DestReg);
+        return true;
+      }
+    } else if (VT == MVT::i32) {
+      if (Subtarget->hasMips32r2()) {
+        unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+        emitInst(Mips::WSBH, TempReg).addReg(SrcReg);
+        emitInst(Mips::ROTR, DestReg).addReg(TempReg).addImm(16);
+        updateValueMap(II, DestReg);
+        return true;
+      } else {
+        unsigned TempReg[8];
+        for (int i = 0; i < 8; i++) {
+          TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+          if (TempReg[i] == 0)
+            return false;
+        }
+
+        emitInst(Mips::SRL, TempReg[0]).addReg(SrcReg).addImm(8);
+        emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(24);
+        emitInst(Mips::ANDi, TempReg[2]).addReg(TempReg[0]).addImm(0xFF00);
+        emitInst(Mips::OR, TempReg[3]).addReg(TempReg[1]).addReg(TempReg[2]);
+
+        emitInst(Mips::ANDi, TempReg[4]).addReg(SrcReg).addImm(0xFF00);
+        emitInst(Mips::SLL, TempReg[5]).addReg(TempReg[4]).addImm(8);
+
+        emitInst(Mips::SLL, TempReg[6]).addReg(SrcReg).addImm(24);
+        emitInst(Mips::OR, TempReg[7]).addReg(TempReg[3]).addReg(TempReg[5]);
+        emitInst(Mips::OR, DestReg).addReg(TempReg[6]).addReg(TempReg[7]);
+        updateValueMap(II, DestReg);
+        return true;
+      }
+    }
+    return false;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const auto *MTI = cast<MemTransferInst>(II);
+    // Don't handle volatile.
+    if (MTI->isVolatile())
+      return false;
+    if (!MTI->getLength()->getType()->isIntegerTy(32))
+      return false;
+    const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+    return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+    // Don't handle volatile.
+    if (MSI->isVolatile())
+      return false;
+    if (!MSI->getLength()->getType()->isIntegerTy(32))
+      return false;
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  }
+  return false;
+}
+
+bool MipsFastISel::selectRet(const Instruction *I) {
+  const Function &F = *I->getParent()->getParent();
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  DEBUG(dbgs() << "selectRet\n");
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    CallingConv::ID CC = F.getCallingConv();
+
+    // Do not handle FastCC.
+    if (CC == CallingConv::Fast)
+      return false;
+
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    MipsCCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs,
+                       I->getContext());
+    CCAssignFn *RetCC = RetCC_Mips;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if ((VA.getLocInfo() != CCValAssign::Full) &&
+        (VA.getLocInfo() != CCValAssign::BCvt))
+      return false;
+
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(DL, RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    if (RVEVT.isVector())
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+
+    // Do not handle FGR64 returns for now.
+    if (RVVT == MVT::f64 && UnsupportedFPMode) {
+      DEBUG(dbgs() << ".. .. gave up (UnsupportedFPMode\n");
+      return false;
+    }
+
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+        bool IsZExt = Outs[0].Flags.isZExt();
+        SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
+        if (SrcReg == 0)
+          return false;
+      }
+    }
+
+    // Make the copy.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+  MachineInstrBuilder MIB = emitInst(Mips::RetRA);
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+bool MipsFastISel::selectTrunc(const Instruction *I) {
+  // The high bits for a type smaller than the register size are assumed to be
+  // undefined.
+  Value *Op = I->getOperand(0);
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(DL, Op->getType(), true);
+  DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg)
+    return false;
+
+  // Because the high bits are undefined, a truncate doesn't generate
+  // any code.
+  updateValueMap(I, SrcReg);
+  return true;
+}
+bool MipsFastISel::selectIntExt(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+
+  if (!emitIntExt(SrcVT, SrcReg, DestVT, ResultReg, isZExt))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                   unsigned DestReg) {
+  unsigned ShiftAmt;
+  switch (SrcVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+    ShiftAmt = 24;
+    break;
+  case MVT::i16:
+    ShiftAmt = 16;
+    break;
+  }
+  unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+  emitInst(Mips::SLL, TempReg).addReg(SrcReg).addImm(ShiftAmt);
+  emitInst(Mips::SRA, DestReg).addReg(TempReg).addImm(ShiftAmt);
+  return true;
+}
+
+bool MipsFastISel::emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                   unsigned DestReg) {
+  switch (SrcVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+    emitInst(Mips::SEB, DestReg).addReg(SrcReg);
+    break;
+  case MVT::i16:
+    emitInst(Mips::SEH, DestReg).addReg(SrcReg);
+    break;
+  }
+  return true;
+}
+
+bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  if ((DestVT != MVT::i32) && (DestVT != MVT::i16))
+    return false;
+  if (Subtarget->hasMips32r2())
+    return emitIntSExt32r2(SrcVT, SrcReg, DestVT, DestReg);
+  return emitIntSExt32r1(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  int64_t Imm;
+
+  switch (SrcVT.SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+    Imm = 1;
+    break;
+  case MVT::i8:
+    Imm = 0xff;
+    break;
+  case MVT::i16:
+    Imm = 0xffff;
+    break;
+  }
+
+  emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Imm);
+  return true;
+}
+
+bool MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                              unsigned DestReg, bool IsZExt) {
+  // FastISel does not have plumbing to deal with extensions where the SrcVT or
+  // DestVT are odd things, so test to make sure that they are both types we can
+  // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
+  // bail out to SelectionDAG.
+  if (((DestVT != MVT::i8) && (DestVT != MVT::i16) && (DestVT != MVT::i32)) ||
+      ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) && (SrcVT != MVT::i16)))
+    return false;
+  if (IsZExt)
+    return emitIntZExt(SrcVT, SrcReg, DestVT, DestReg);
+  return emitIntSExt(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                  bool isZExt) {
+  unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+  bool Success = emitIntExt(SrcVT, SrcReg, DestVT, DestReg, isZExt);
+  return Success ? DestReg : 0;
+}
+
+bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT DestVT = DestEVT.getSimpleVT();
+  if (DestVT != MVT::i32)
+    return false;
+
+  unsigned DivOpc;
+  switch (ISDOpcode) {
+  default:
+    return false;
+  case ISD::SDIV:
+  case ISD::SREM:
+    DivOpc = Mips::SDIV;
+    break;
+  case ISD::UDIV:
+  case ISD::UREM:
+    DivOpc = Mips::UDIV;
+    break;
+  }
+
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src0Reg || !Src1Reg)
+    return false;
+
+  emitInst(DivOpc).addReg(Src0Reg).addReg(Src1Reg);
+  emitInst(Mips::TEQ).addReg(Src1Reg).addReg(Mips::ZERO).addImm(7);
+
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!ResultReg)
+    return false;
+
+  unsigned MFOpc = (ISDOpcode == ISD::SREM || ISDOpcode == ISD::UREM)
+                       ? Mips::MFHI
+                       : Mips::MFLO;
+  emitInst(MFOpc, ResultReg);
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool MipsFastISel::selectShift(const Instruction *I) {
+  MVT RetVT;
+
+  if (!isTypeSupported(I->getType(), RetVT))
+    return false;
+
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!ResultReg)
+    return false;
+
+  unsigned Opcode = I->getOpcode();
+  const Value *Op0 = I->getOperand(0);
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (!Op0Reg)
+    return false;
+
+  // If AShr or LShr, then we need to make sure the operand0 is sign extended.
+  if (Opcode == Instruction::AShr || Opcode == Instruction::LShr) {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    if (!TempReg)
+      return false;
+
+    MVT Op0MVT = TLI.getValueType(DL, Op0->getType(), true).getSimpleVT();
+    bool IsZExt = Opcode == Instruction::LShr;
+    if (!emitIntExt(Op0MVT, Op0Reg, MVT::i32, TempReg, IsZExt))
+      return false;
+
+    Op0Reg = TempReg;
+  }
+
+  if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    uint64_t ShiftVal = C->getZExtValue();
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("Unexpected instruction.");
+    case Instruction::Shl:
+      Opcode = Mips::SLL;
+      break;
+    case Instruction::AShr:
+      Opcode = Mips::SRA;
+      break;
+    case Instruction::LShr:
+      Opcode = Mips::SRL;
+      break;
+    }
+
+    emitInst(Opcode, ResultReg).addReg(Op0Reg).addImm(ShiftVal);
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (!Op1Reg)
+    return false;
+
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  case Instruction::Shl:
+    Opcode = Mips::SLLV;
+    break;
+  case Instruction::AShr:
+    Opcode = Mips::SRAV;
+    break;
+  case Instruction::LShr:
+    Opcode = Mips::SRLV;
+    break;
+  }
+
+  emitInst(Opcode, ResultReg).addReg(Op0Reg).addReg(Op1Reg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+  case Instruction::SDiv:
+    if (!selectBinaryOp(I, ISD::SDIV))
+      return selectDivRem(I, ISD::SDIV);
+    return true;
+  case Instruction::UDiv:
+    if (!selectBinaryOp(I, ISD::UDIV))
+      return selectDivRem(I, ISD::UDIV);
+    return true;
+  case Instruction::SRem:
+    if (!selectBinaryOp(I, ISD::SREM))
+      return selectDivRem(I, ISD::SREM);
+    return true;
+  case Instruction::URem:
+    if (!selectBinaryOp(I, ISD::UREM))
+      return selectDivRem(I, ISD::UREM);
+    return true;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return selectShift(I);
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return selectLogicalOp(I);
+  case Instruction::Br:
+    return selectBranch(I);
+  case Instruction::Ret:
+    return selectRet(I);
+  case Instruction::Trunc:
+    return selectTrunc(I);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return selectIntExt(I);
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPToSI:
+    return selectFPToInt(I, /*isSigned*/ true);
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*isSigned*/ false);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
+  }
+  return false;
+}
+
+unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
+                                                           bool IsUnsigned) {
+  unsigned VReg = getRegForValue(V);
+  if (VReg == 0)
+    return 0;
+  MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
+  if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
+      return 0;
+    VReg = TempReg;
+  }
+  return VReg;
+}
+
+void MipsFastISel::simplifyAddress(Address &Addr) {
+  if (!isInt<16>(Addr.getOffset())) {
+    unsigned TempReg =
+        materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass);
+    unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::ADDu, DestReg).addReg(TempReg).addReg(Addr.getReg());
+    Addr.setReg(DestReg);
+    Addr.setOffset(0);
+  }
+}
+
+unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill) {
+  // We treat the MUL instruction in a special way because it clobbers
+  // the HI0 & LO0 registers. The TableGen definition of this instruction can
+  // mark these registers only as implicitly defined. As a result, the
+  // register allocator runs out of registers when this instruction is
+  // followed by another instruction that defines the same registers too.
+  // We can fix this by explicitly marking those registers as dead.
+  if (MachineInstOpcode == Mips::MUL) {
+    unsigned ResultReg = createResultReg(RC);
+    const MCInstrDesc &II = TII.get(MachineInstOpcode);
+    Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+    Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill))
+      .addReg(Op1, getKillRegState(Op1IsKill))
+      .addReg(Mips::HI0, RegState::ImplicitDefine | RegState::Dead)
+      .addReg(Mips::LO0, RegState::ImplicitDefine | RegState::Dead);
+    return ResultReg;
+  }
+
+  return FastISel::fastEmitInst_rr(MachineInstOpcode, RC, Op0, Op0IsKill, Op1,
+                                   Op1IsKill);
+}
+
+namespace llvm {
+FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
+                               const TargetLibraryInfo *libInfo) {
+  return new MipsFastISel(funcInfo, libInfo);
+}
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
new file mode 100644
index 000000000000..b2cf03976f81
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -0,0 +1,159 @@
+//===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are done thought a positive offset
+// from the stack/frame pointer, so the stack is considering
+// to grow up! Otherwise terrible hacks would have to be made
+// to get this stack ABI compliant :)
+//
+//  The stack frame required by the ABI (after call):
+//  Offset
+//
+//  0                 ----------
+//  4                 Args to pass
+//  .                 saved $GP  (used in PIC)
+//  .                 Alloca allocations
+//  .                 Local Area
+//  .                 CPU "Callee Saved" Registers
+//  .                 saved FP
+//  .                 saved RA
+//  .                 FPU "Callee Saved" Registers
+//  StackSize         -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer subtracted/added from the stack size
+// at the Prologue/Epilogue
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceeds the stack size: (stacksize+(4*(num_arg-1))
+//
+// Examples:
+// - reference to the actual stack frame
+//   for any local area var there is smt like : FI >= 0, StackOffset: 4
+//     sw REGX, 4(SP)
+//
+// - reference to previous stack frame
+//   suppose there's a load to the 5th arguments : FI < 0, StackOffset: 16.
+//   The emitted instruction will be something like:
+//     lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown on LowerFormalArguments, all
+// stack references (ObjectOffset) created to reference the function
+// arguments, are negative numbers. This way, on eliminateFrameIndex it's
+// possible to detect those references and the offsets are adjusted to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) {
+  if (ST.inMips16Mode())
+    return llvm::createMips16FrameLowering(ST);
+
+  return llvm::createMipsSEFrameLowering(ST);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas,
+// if it needs dynamic stack realignment, if frame pointer elimination is
+// disabled, or if the frame address is taken.
+bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+      MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
+      TRI->needsStackRealignment(MF);
+}
+
+bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+  return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+}
+
+uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+
+  int64_t Offset = 0;
+
+  // Iterate over fixed sized objects.
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+
+  // Conservatively assume all callee-saved registers will be saved.
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+    unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+    Offset = alignTo(Offset + Size, Size);
+  }
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // Check that MaxAlign is not zero if there is a stack object that is not a
+  // callee-saved spill.
+  assert(!MFI.getObjectIndexEnd() || MaxAlign);
+
+  // Iterate over other objects.
+  for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
+    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
+
+  // Call frame.
+  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
+    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
+                     std::max(MaxAlign, getStackAlignment()));
+
+  return alignTo(Offset, getStackAlignment());
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
+MachineBasicBlock::iterator MipsFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  unsigned SP = STI.getABI().IsN64() ? Mips::SP_64 : Mips::SP;
+
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+    if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+      Amount = -Amount;
+
+    STI.getInstrInfo()->adjustStackPtr(SP, Amount, MBB, I);
+  }
+
+  return MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
new file mode 100644
index 000000000000..8c4214c4c21d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -0,0 +1,54 @@
+//===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class MipsSubtarget;
+
+class MipsFrameLowering : public TargetFrameLowering {
+protected:
+  const MipsSubtarget &STI;
+
+public:
+  explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
+    : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
+
+  static const MipsFrameLowering *create(const MipsSubtarget &ST);
+
+  bool hasFP(const MachineFunction &MF) const override;
+
+  bool hasBP(const MachineFunction &MF) const;
+
+  bool isFPCloseToIncomingSP() const override { return false; }
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+protected:
+  uint64_t estimateStackSize(const MachineFunction &MF) const;
+};
+
+/// Create MipsFrameLowering objects.
+const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
+const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
new file mode 100644
index 000000000000..31b86124bc8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -0,0 +1,160 @@
+//===-- MipsHazardSchedule.cpp - Workaround pipeline hazards --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass is used to workaround certain pipeline hazards. For now, this
+/// covers compact branch hazards. In future this pass can be extended to other
+/// pipeline hazards, such as various MIPS1 hazards, processor errata that
+/// require instruction reorganization, etc.
+///
+/// This pass has to run after the delay slot filler as that pass can introduce
+/// pipeline hazards, hence the existing hazard recognizer is not suitable.
+///
+/// Hazards handled: forbidden slots for MIPSR6.
+///
+/// A forbidden slot hazard occurs when a compact branch instruction is executed
+/// and the adjacent instruction in memory is a control transfer instruction
+/// such as a branch or jump, ERET, ERETNC, DERET, WAIT and PAUSE.
+///
+/// For example:
+///
+/// 0x8004      bnec    a1,v0,<P+0x18>
+/// 0x8008      beqc    a1,a2,<P+0x54>
+///
+/// In such cases, the processor is required to signal a Reserved Instruction
+/// exception.
+///
+/// Here, if the instruction at 0x8004 is executed, the processor will raise an
+/// exception as there is a control transfer instruction at 0x8008.
+///
+/// There are two sources of forbidden slot hazards:
+///
+/// A) A previous pass has created a compact branch directly.
+/// B) Transforming a delay slot branch into compact branch. This case can be
+///    difficult to process as lookahead for hazards is insufficent, as
+///    backwards delay slot fillling can also produce hazards in previously
+///    processed instuctions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-hazard-schedule"
+
+STATISTIC(NumInsertedNops, "Number of nops inserted");
+
+namespace {
+
+typedef MachineBasicBlock::iterator Iter;
+typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+class MipsHazardSchedule : public MachineFunctionPass {
+
+public:
+  MipsHazardSchedule() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Mips Hazard Schedule"; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  static char ID;
+};
+
+char MipsHazardSchedule::ID = 0;
+} // end of anonymous namespace
+
+/// Returns a pass that clears pipeline hazards.
+FunctionPass *llvm::createMipsHazardSchedule() {
+  return new MipsHazardSchedule();
+}
+
+// Find the next real instruction from the current position in current basic
+// block.
+static Iter getNextMachineInstrInBB(Iter Position) {
+  Iter I = Position, E = Position->getParent()->end();
+  I = std::find_if_not(I, E,
+                       [](const Iter &Insn) { return Insn->isTransient(); });
+
+  return I;
+}
+
+// Find the next real instruction from the current position, looking through
+// basic block boundaries.
+static Iter getNextMachineInstr(Iter Position, MachineBasicBlock *Parent) {
+  if (Position == Parent->end()) {
+    MachineBasicBlock *Succ = Parent->getNextNode();
+    if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+      Position = Succ->begin();
+      Parent = Succ;
+    } else {
+      llvm_unreachable(
+          "Should have identified the end of the function earlier!");
+    }
+  }
+
+  Iter Instr = getNextMachineInstrInBB(Position);
+  if (Instr == Parent->end()) {
+    return getNextMachineInstr(Instr, Parent);
+  }
+  return Instr;
+}
+
+bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
+
+  const MipsSubtarget *STI =
+      &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+  // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6.
+  if (!STI->hasMips32r6() || STI->inMicroMipsMode())
+    return false;
+
+  bool Changed = false;
+  const MipsInstrInfo *TII = STI->getInstrInfo();
+
+  for (MachineFunction::iterator FI = MF.begin(); FI != MF.end(); ++FI) {
+    for (Iter I = FI->begin(); I != FI->end(); ++I) {
+
+      // Forbidden slot hazard handling. Use lookahead over state.
+      if (!TII->HasForbiddenSlot(*I))
+        continue;
+
+      Iter Inst;
+      bool LastInstInFunction =
+          std::next(I) == FI->end() && std::next(FI) == MF.end();
+      if (!LastInstInFunction) {
+        Inst = getNextMachineInstr(std::next(I), &*FI);
+      }
+
+      if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
+        Changed = true;
+        MIBundleBuilder(&*I)
+            .append(BuildMI(MF, I->getDebugLoc(), TII->get(Mips::NOP)));
+        NumInsertedNops++;
+      }
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 000000000000..0e1173f1c617
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,270 @@
+//===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+
+bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+  bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
+
+  processFunctionAfterISel(MF);
+
+  return Ret;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+  return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy(
+                                                CurDAG->getDataLayout()))
+      .getNode();
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                     SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+                                               SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+                                               SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+                                               SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16(SDValue Addr, SDValue &Base,
+                                    SDValue &Offset) {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16SP(SDValue Addr, SDValue &Base,
+                                      SDValue &Offset) {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+                                    unsigned MinSizeInBits) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+void MipsDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return;
+  }
+
+  // See if subclasses can handle this node.
+  if (trySelect(Node))
+    return;
+
+  switch(Opcode) {
+  default: break;
+
+  // Get target GOT address.
+  case ISD::GLOBAL_OFFSET_TABLE:
+    ReplaceNode(Node, getGlobalBaseReg());
+    return;
+
+#ifndef NDEBUG
+  case ISD::LOAD:
+  case ISD::STORE:
+    assert((Subtarget->systemSupportsUnalignedAccess() ||
+            cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+            cast<MemSDNode>(Node)->getAlignment()) &&
+           "Unexpected unaligned loads/stores.");
+    break;
+#endif
+  }
+
+  // Select the default instruction
+  SelectCode(Node);
+}
+
+bool MipsDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  // All memory constraints can at least accept raw pointers.
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_R:
+  case InlineAsm::Constraint_ZC:
+    OutOps.push_back(Op);
+    return false;
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
new file mode 100644
index 000000000000..20bdd4aa8f5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -0,0 +1,144 @@
+//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit MipsDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+      : SelectionDAGISel(TM, OL), Subtarget(nullptr) {}
+
+  // Pass Name
+  StringRef getPassName() const override {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+protected:
+  SDNode *getGlobalBaseReg();
+
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const MipsSubtarget *Subtarget;
+
+private:
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  // Complex Pattern.
+  /// (reg + imm).
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  /// Fall back on this function if all else fails.
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  /// Match integer address pattern.
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  virtual bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
+  virtual bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  virtual bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                   SDValue &Offset) const;
+
+  /// Match addr+simm10 and addr
+  virtual bool selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+                                   SDValue &Offset) const;
+
+  virtual bool selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const;
+
+  virtual bool selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const;
+
+  virtual bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const;
+
+  virtual bool selectAddr16(SDValue Addr, SDValue &Base, SDValue &Offset);
+  virtual bool selectAddr16SP(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm,
+                            unsigned MinSizeInBits) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
+  void Select(SDNode *N) override;
+
+  virtual bool trySelect(SDNode *Node) = 0;
+
+  // getImm - Return a target constant with the specified value.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+    return CurDAG->getTargetConstant(Imm, SDLoc(Node), Node->getValueType(0));
+  }
+
+  virtual void processFunctionAfterISel(MachineFunction &MF) = 0;
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 000000000000..9c511bd77822
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,4030 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsISelLowering.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsCCState.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+LargeGOT("mxgot", cl::Hidden,
+         cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
+
+static cl::opt<bool>
+NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
+               cl::desc("MIPS: Don't trap on integer division by zero."),
+               cl::init(false));
+
+static const MCPhysReg Mips64DPRegs[8] = {
+  Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
+  Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
+};
+
+// If I is a shifted mask, set the size (Size) and the first bit of the
+// mask (Pos), and return true.
+// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
+static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+  if (!isShiftedMask_64(I))
+    return false;
+
+  Size = countPopulation(I);
+  Pos = countTrailingZeros(I);
+  return true;
+}
+
+SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
+  MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
+  return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
+}
+
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+}
+
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                   N->getOffset(), Flag);
+}
+
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((MipsISD::NodeType)Opcode) {
+  case MipsISD::FIRST_NUMBER:      break;
+  case MipsISD::JmpLink:           return "MipsISD::JmpLink";
+  case MipsISD::TailCall:          return "MipsISD::TailCall";
+  case MipsISD::Hi:                return "MipsISD::Hi";
+  case MipsISD::Lo:                return "MipsISD::Lo";
+  case MipsISD::GPRel:             return "MipsISD::GPRel";
+  case MipsISD::ThreadPointer:     return "MipsISD::ThreadPointer";
+  case MipsISD::Ret:               return "MipsISD::Ret";
+  case MipsISD::ERet:              return "MipsISD::ERet";
+  case MipsISD::EH_RETURN:         return "MipsISD::EH_RETURN";
+  case MipsISD::FPBrcond:          return "MipsISD::FPBrcond";
+  case MipsISD::FPCmp:             return "MipsISD::FPCmp";
+  case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
+  case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
+  case MipsISD::TruncIntFP:        return "MipsISD::TruncIntFP";
+  case MipsISD::MFHI:              return "MipsISD::MFHI";
+  case MipsISD::MFLO:              return "MipsISD::MFLO";
+  case MipsISD::MTLOHI:            return "MipsISD::MTLOHI";
+  case MipsISD::Mult:              return "MipsISD::Mult";
+  case MipsISD::Multu:             return "MipsISD::Multu";
+  case MipsISD::MAdd:              return "MipsISD::MAdd";
+  case MipsISD::MAddu:             return "MipsISD::MAddu";
+  case MipsISD::MSub:              return "MipsISD::MSub";
+  case MipsISD::MSubu:             return "MipsISD::MSubu";
+  case MipsISD::DivRem:            return "MipsISD::DivRem";
+  case MipsISD::DivRemU:           return "MipsISD::DivRemU";
+  case MipsISD::DivRem16:          return "MipsISD::DivRem16";
+  case MipsISD::DivRemU16:         return "MipsISD::DivRemU16";
+  case MipsISD::BuildPairF64:      return "MipsISD::BuildPairF64";
+  case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+  case MipsISD::Wrapper:           return "MipsISD::Wrapper";
+  case MipsISD::DynAlloc:          return "MipsISD::DynAlloc";
+  case MipsISD::Sync:              return "MipsISD::Sync";
+  case MipsISD::Ext:               return "MipsISD::Ext";
+  case MipsISD::Ins:               return "MipsISD::Ins";
+  case MipsISD::LWL:               return "MipsISD::LWL";
+  case MipsISD::LWR:               return "MipsISD::LWR";
+  case MipsISD::SWL:               return "MipsISD::SWL";
+  case MipsISD::SWR:               return "MipsISD::SWR";
+  case MipsISD::LDL:               return "MipsISD::LDL";
+  case MipsISD::LDR:               return "MipsISD::LDR";
+  case MipsISD::SDL:               return "MipsISD::SDL";
+  case MipsISD::SDR:               return "MipsISD::SDR";
+  case MipsISD::EXTP:              return "MipsISD::EXTP";
+  case MipsISD::EXTPDP:            return "MipsISD::EXTPDP";
+  case MipsISD::EXTR_S_H:          return "MipsISD::EXTR_S_H";
+  case MipsISD::EXTR_W:            return "MipsISD::EXTR_W";
+  case MipsISD::EXTR_R_W:          return "MipsISD::EXTR_R_W";
+  case MipsISD::EXTR_RS_W:         return "MipsISD::EXTR_RS_W";
+  case MipsISD::SHILO:             return "MipsISD::SHILO";
+  case MipsISD::MTHLIP:            return "MipsISD::MTHLIP";
+  case MipsISD::MULSAQ_S_W_PH:     return "MipsISD::MULSAQ_S_W_PH";
+  case MipsISD::MAQ_S_W_PHL:       return "MipsISD::MAQ_S_W_PHL";
+  case MipsISD::MAQ_S_W_PHR:       return "MipsISD::MAQ_S_W_PHR";
+  case MipsISD::MAQ_SA_W_PHL:      return "MipsISD::MAQ_SA_W_PHL";
+  case MipsISD::MAQ_SA_W_PHR:      return "MipsISD::MAQ_SA_W_PHR";
+  case MipsISD::DPAU_H_QBL:        return "MipsISD::DPAU_H_QBL";
+  case MipsISD::DPAU_H_QBR:        return "MipsISD::DPAU_H_QBR";
+  case MipsISD::DPSU_H_QBL:        return "MipsISD::DPSU_H_QBL";
+  case MipsISD::DPSU_H_QBR:        return "MipsISD::DPSU_H_QBR";
+  case MipsISD::DPAQ_S_W_PH:       return "MipsISD::DPAQ_S_W_PH";
+  case MipsISD::DPSQ_S_W_PH:       return "MipsISD::DPSQ_S_W_PH";
+  case MipsISD::DPAQ_SA_L_W:       return "MipsISD::DPAQ_SA_L_W";
+  case MipsISD::DPSQ_SA_L_W:       return "MipsISD::DPSQ_SA_L_W";
+  case MipsISD::DPA_W_PH:          return "MipsISD::DPA_W_PH";
+  case MipsISD::DPS_W_PH:          return "MipsISD::DPS_W_PH";
+  case MipsISD::DPAQX_S_W_PH:      return "MipsISD::DPAQX_S_W_PH";
+  case MipsISD::DPAQX_SA_W_PH:     return "MipsISD::DPAQX_SA_W_PH";
+  case MipsISD::DPAX_W_PH:         return "MipsISD::DPAX_W_PH";
+  case MipsISD::DPSX_W_PH:         return "MipsISD::DPSX_W_PH";
+  case MipsISD::DPSQX_S_W_PH:      return "MipsISD::DPSQX_S_W_PH";
+  case MipsISD::DPSQX_SA_W_PH:     return "MipsISD::DPSQX_SA_W_PH";
+  case MipsISD::MULSA_W_PH:        return "MipsISD::MULSA_W_PH";
+  case MipsISD::MULT:              return "MipsISD::MULT";
+  case MipsISD::MULTU:             return "MipsISD::MULTU";
+  case MipsISD::MADD_DSP:          return "MipsISD::MADD_DSP";
+  case MipsISD::MADDU_DSP:         return "MipsISD::MADDU_DSP";
+  case MipsISD::MSUB_DSP:          return "MipsISD::MSUB_DSP";
+  case MipsISD::MSUBU_DSP:         return "MipsISD::MSUBU_DSP";
+  case MipsISD::SHLL_DSP:          return "MipsISD::SHLL_DSP";
+  case MipsISD::SHRA_DSP:          return "MipsISD::SHRA_DSP";
+  case MipsISD::SHRL_DSP:          return "MipsISD::SHRL_DSP";
+  case MipsISD::SETCC_DSP:         return "MipsISD::SETCC_DSP";
+  case MipsISD::SELECT_CC_DSP:     return "MipsISD::SELECT_CC_DSP";
+  case MipsISD::VALL_ZERO:         return "MipsISD::VALL_ZERO";
+  case MipsISD::VANY_ZERO:         return "MipsISD::VANY_ZERO";
+  case MipsISD::VALL_NONZERO:      return "MipsISD::VALL_NONZERO";
+  case MipsISD::VANY_NONZERO:      return "MipsISD::VANY_NONZERO";
+  case MipsISD::VCEQ:              return "MipsISD::VCEQ";
+  case MipsISD::VCLE_S:            return "MipsISD::VCLE_S";
+  case MipsISD::VCLE_U:            return "MipsISD::VCLE_U";
+  case MipsISD::VCLT_S:            return "MipsISD::VCLT_S";
+  case MipsISD::VCLT_U:            return "MipsISD::VCLT_U";
+  case MipsISD::VSMAX:             return "MipsISD::VSMAX";
+  case MipsISD::VSMIN:             return "MipsISD::VSMIN";
+  case MipsISD::VUMAX:             return "MipsISD::VUMAX";
+  case MipsISD::VUMIN:             return "MipsISD::VUMIN";
+  case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+  case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+  case MipsISD::VNOR:              return "MipsISD::VNOR";
+  case MipsISD::VSHF:              return "MipsISD::VSHF";
+  case MipsISD::SHF:               return "MipsISD::SHF";
+  case MipsISD::ILVEV:             return "MipsISD::ILVEV";
+  case MipsISD::ILVOD:             return "MipsISD::ILVOD";
+  case MipsISD::ILVL:              return "MipsISD::ILVL";
+  case MipsISD::ILVR:              return "MipsISD::ILVR";
+  case MipsISD::PCKEV:             return "MipsISD::PCKEV";
+  case MipsISD::PCKOD:             return "MipsISD::PCKOD";
+  case MipsISD::INSVE:             return "MipsISD::INSVE";
+  }
+  return nullptr;
+}
+
+MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
+                                       const MipsSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI), ABI(TM.getABI()) {
+  // Mips does not have i1 type, so use i32 for
+  // setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+  // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
+  // does. Integer booleans still use 0 and 1.
+  if (Subtarget.hasMips32r6())
+    setBooleanContents(ZeroOrOneBooleanContent,
+                       ZeroOrNegativeOneBooleanContent);
+
+  // Load extented operations for i1 types must be promoted
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1,  Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1,  Promote);
+  }
+
+  // MIPS doesn't have extending float->double load/store.  Set LoadExtAction
+  // for f32, f16
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+  }
+
+  // Set LoadExtAction for f16 vectors to Expand
+  for (MVT VT : MVT::fp_vector_valuetypes()) {
+    MVT F16VT = MVT::getVectorVT(MVT::f16, VT.getVectorNumElements());
+    if (F16VT.isValid())
+      setLoadExtAction(ISD::EXTLOAD, VT, F16VT, Expand);
+  }
+
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Used by legalize types to correctly generate the setcc result.
+  // Without this, every float setcc comes with a AND/OR with the result,
+  // we don't want this, since the fpcmp result goes to a flag register,
+  // which is used implicitly by brcond and select operations.
+  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+  // Mips Custom Operations
+  setOperationAction(ISD::BR_JT,              MVT::Other, Expand);
+  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress,       MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
+  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f64,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f32,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f64,   Custom);
+  setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
+  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
+
+  if (Subtarget.isGP64bit()) {
+    setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
+    setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
+    setOperationAction(ISD::GlobalTLSAddress,   MVT::i64,   Custom);
+    setOperationAction(ISD::JumpTable,          MVT::i64,   Custom);
+    setOperationAction(ISD::ConstantPool,       MVT::i64,   Custom);
+    setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
+    setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
+    setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
+    setOperationAction(ISD::SHL_PARTS,          MVT::i64,   Custom);
+    setOperationAction(ISD::SRA_PARTS,          MVT::i64,   Custom);
+    setOperationAction(ISD::SRL_PARTS,          MVT::i64,   Custom);
+  }
+
+  if (!Subtarget.isGP64bit()) {
+    setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Custom);
+  }
+
+  setOperationAction(ISD::EH_DWARF_CFA,         MVT::i32,   Custom);
+  if (Subtarget.isGP64bit())
+    setOperationAction(ISD::EH_DWARF_CFA,       MVT::i64,   Custom);
+
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIV, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Operations not directly supported by Mips.
+  setOperationAction(ISD::BR_CC,             MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::f32,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::f64,   Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i32,   Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i64,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i32,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i64,   Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,    Expand);
+  if (Subtarget.hasCnMips()) {
+    setOperationAction(ISD::CTPOP,           MVT::i32,   Legal);
+    setOperationAction(ISD::CTPOP,           MVT::i64,   Legal);
+  } else {
+    setOperationAction(ISD::CTPOP,           MVT::i32,   Expand);
+    setOperationAction(ISD::CTPOP,           MVT::i64,   Expand);
+  }
+  setOperationAction(ISD::CTTZ,              MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ,              MVT::i64,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i32,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i64,   Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,  Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64,  Expand);
+
+  if (!Subtarget.hasMips32r2())
+    setOperationAction(ISD::ROTR, MVT::i32,   Expand);
+
+  if (!Subtarget.hasMips64r2())
+    setOperationAction(ISD::ROTR, MVT::i64,   Expand);
+
+  setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
+  setOperationAction(ISD::FSIN,              MVT::f64,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f32,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f64,   Expand);
+  setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
+  setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
+  setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f64,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f32,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f64,   Expand);
+
+  // Lower f16 conversion operations into library calls
+  setOperationAction(ISD::FP16_TO_FP,        MVT::f32,   Expand);
+  setOperationAction(ISD::FP_TO_FP16,        MVT::f32,   Expand);
+  setOperationAction(ISD::FP16_TO_FP,        MVT::f64,   Expand);
+  setOperationAction(ISD::FP_TO_FP16,        MVT::f64,   Expand);
+
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+  setOperationAction(ISD::VASTART,           MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,             MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,             MVT::Other, Expand);
+
+  // Use the default for now
+  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
+
+  if (!Subtarget.isGP64bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD,     MVT::i64,   Expand);
+    setOperationAction(ISD::ATOMIC_STORE,    MVT::i64,   Expand);
+  }
+
+
+  if (!Subtarget.hasMips32r2()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  }
+
+  // MIPS16 lacks MIPS32's clz and clo instructions.
+  if (!Subtarget.hasMips32() || Subtarget.inMips16Mode())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  if (!Subtarget.hasMips64())
+    setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+
+  if (!Subtarget.hasMips32r2())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  if (!Subtarget.hasMips64r2())
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  if (Subtarget.isGP64bit()) {
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
+    setTruncStoreAction(MVT::i64, MVT::i32, Custom);
+  }
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  setTargetDAGCombine(ISD::SDIVREM);
+  setTargetDAGCombine(ISD::UDIVREM);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::AssertZext);
+
+  if (ABI.IsO32()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);
+
+  // The arguments on the stack are defined in terms of 4-byte slots on O32
+  // and 8-byte slots on N32/N64.
+  setMinStackArgumentAlignment((ABI.IsN32() || ABI.IsN64()) ? 8 : 4);
+
+  setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP);
+
+  MaxStoresPerMemcpy = 16;
+
+  isMicroMips = Subtarget.inMicroMipsMode();
+}
+
+const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM,
+                                                     const MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16TargetLowering(TM, STI);
+
+  return llvm::createMipsSETargetLowering(TM, STI);
+}
+
+// Create a fast isel object.
+FastISel *
+MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  const MipsTargetMachine &TM =
+      static_cast<const MipsTargetMachine &>(funcInfo.MF->getTarget());
+
+  // We support only the standard encoding [MIPS32,MIPS32R5] ISAs.
+  bool UseFastISel = TM.Options.EnableFastISel && Subtarget.hasMips32() &&
+                     !Subtarget.hasMips32r6() && !Subtarget.inMips16Mode() &&
+                     !Subtarget.inMicroMipsMode();
+
+  // Disable if we don't generate PIC or the ABI isn't O32.
+  if (!TM.isPositionIndependent() || !TM.getABI().IsO32())
+    UseFastISel = false;
+
+  return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr;
+}
+
+EVT MipsTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                           EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64;
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
+  unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
+                                                  MipsISD::DivRemU16;
+  SDLoc DL(N);
+
+  SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
+                               N->getOperand(0), N->getOperand(1));
+  SDValue InChain = DAG.getEntryNode();
+  SDValue InGlue = DivRem;
+
+  // insert MFLO
+  if (N->hasAnyUseOfValue(0)) {
+    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, DL, LO, Ty,
+                                            InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
+    InChain = CopyFromLo.getValue(1);
+    InGlue = CopyFromLo.getValue(2);
+  }
+
+  // insert MFHI
+  if (N->hasAnyUseOfValue(1)) {
+    SDValue CopyFromHi = DAG.getCopyFromReg(InChain, DL,
+                                            HI, Ty, InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
+  }
+
+  return SDValue();
+}
+
+static Mips::CondCode condCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return Mips::FCOND_OEQ;
+  case ISD::SETUNE: return Mips::FCOND_UNE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return Mips::FCOND_OLT;
+  case ISD::SETGT:
+  case ISD::SETOGT: return Mips::FCOND_OGT;
+  case ISD::SETLE:
+  case ISD::SETOLE: return Mips::FCOND_OLE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return Mips::FCOND_OGE;
+  case ISD::SETULT: return Mips::FCOND_ULT;
+  case ISD::SETULE: return Mips::FCOND_ULE;
+  case ISD::SETUGT: return Mips::FCOND_UGT;
+  case ISD::SETUGE: return Mips::FCOND_UGE;
+  case ISD::SETUO:  return Mips::FCOND_UN;
+  case ISD::SETO:   return Mips::FCOND_OR;
+  case ISD::SETNE:
+  case ISD::SETONE: return Mips::FCOND_ONE;
+  case ISD::SETUEQ: return Mips::FCOND_UEQ;
+  }
+}
+
+
+/// This function returns true if the floating point conditional branches and
+/// conditional moves which use condition code CC should be inverted.
+static bool invertFPCondCodeUser(Mips::CondCode CC) {
+  if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
+    return false;
+
+  assert((CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) &&
+         "Illegal Condition Code");
+
+  return true;
+}
+
+// Creates and returns an FPCmp node from a setcc node.
+// Returns Op if setcc is not a floating point comparison.
+static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
+  // must be a SETCC node
+  if (Op.getOpcode() != ISD::SETCC)
+    return Op;
+
+  SDValue LHS = Op.getOperand(0);
+
+  if (!LHS.getValueType().isFloatingPoint())
+    return Op;
+
+  SDValue RHS = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
+  // node if necessary.
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, LHS, RHS,
+                     DAG.getConstant(condCodeToFCC(CC), DL, MVT::i32));
+}
+
+// Creates and returns a CMovFPT/F node.
+static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
+                            SDValue False, const SDLoc &DL) {
+  ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
+  bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+
+  return DAG.getNode((invert ? MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL,
+                     True.getValueType(), True, FCC0, False, Cond);
+}
+
+static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue SetCC = N->getOperand(0);
+
+  if ((SetCC.getOpcode() != ISD::SETCC) ||
+      !SetCC.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue False = N->getOperand(2);
+  EVT FalseTy = False.getValueType();
+
+  if (!FalseTy.isInteger())
+    return SDValue();
+
+  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(False);
+
+  // If the RHS (False) is 0, we swap the order of the operands
+  // of ISD::SELECT (obviously also inverting the condition) so that we can
+  // take advantage of conditional moves using the $0 register.
+  // Example:
+  //   return (a != 0) ? x : 0;
+  //     load $reg, x
+  //     movz $reg, $0, a
+  if (!FalseC)
+    return SDValue();
+
+  const SDLoc DL(N);
+
+  if (!FalseC->getZExtValue()) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+    SDValue True = N->getOperand(1);
+
+    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+
+    return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
+  }
+
+  // If both operands are integer constants there's a possibility that we
+  // can do some interesting optimizations.
+  SDValue True = N->getOperand(1);
+  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(True);
+
+  if (!TrueC || !True.getValueType().isInteger())
+    return SDValue();
+
+  // We'll also ignore MVT::i64 operands as this optimizations proves
+  // to be ineffective because of the required sign extensions as the result
+  // of a SETCC operator is always MVT::i32 for non-vector types.
+  if (True.getValueType() == MVT::i64)
+    return SDValue();
+
+  int64_t Diff = TrueC->getSExtValue() - FalseC->getSExtValue();
+
+  // 1)  (a < x) ? y : y-1
+  //  slti $reg1, a, x
+  //  addiu $reg2, $reg1, y-1
+  if (Diff == 1)
+    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, False);
+
+  // 2)  (a < x) ? y-1 : y
+  //  slti $reg1, a, x
+  //  xor $reg1, $reg1, 1
+  //  addiu $reg2, $reg1, y-1
+  if (Diff == -1) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+    SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+                         SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+    return DAG.getNode(ISD::ADD, DL, SetCC.getValueType(), SetCC, True);
+  }
+
+  // Couldn't optimize.
+  return SDValue();
+}
+
+static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2);
+
+  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse);
+  if (!FalseC || FalseC->getZExtValue())
+    return SDValue();
+
+  // Since RHS (False) is 0, we swap the order of the True/False operands
+  // (obviously also inverting the condition) so that we can
+  // take advantage of conditional moves using the $0 register.
+  // Example:
+  //   return (a != 0) ? x : 0;
+  //     load $reg, x
+  //     movz $reg, $0, a
+  unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? MipsISD::CMovFP_F :
+                                                         MipsISD::CMovFP_T;
+
+  SDValue FCC = N->getOperand(1), Glue = N->getOperand(3);
+  return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(),
+                     ValueIfFalse, FCC, ValueIfTrue, Glue);
+}
+
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  // Pattern match EXT.
+  //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
+  //  => ext $dst, $src, size, pos
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
+  unsigned ShiftRightOpc = ShiftRight.getOpcode();
+
+  // Op's first operand must be a shift right.
+  if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
+    return SDValue();
+
+  // The second operand of the shift must be an immediate.
+  ConstantSDNode *CN;
+  if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
+    return SDValue();
+
+  uint64_t Pos = CN->getZExtValue();
+  uint64_t SMPos, SMSize;
+
+  // Op's second operand must be a shifted mask.
+  if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+    return SDValue();
+
+  // Return if the shifted mask does not start at bit 0 or the sum of its size
+  // and Pos exceeds the word's size.
+  EVT ValTy = N->getValueType(0);
+  if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(MipsISD::Ext, DL, ValTy,
+                     ShiftRight.getOperand(0),
+                     DAG.getConstant(Pos, DL, MVT::i32),
+                     DAG.getConstant(SMSize, DL, MVT::i32));
+}
+
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  // Pattern match INS.
+  //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
+  //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
+  //  => ins $dst, $src, size, pos, $src1
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
+    return SDValue();
+
+  SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
+  uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
+  ConstantSDNode *CN;
+
+  // See if Op's first operand matches (and $src1 , mask0).
+  if (And0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
+      !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+    return SDValue();
+
+  // See if Op's second operand matches (and (shl $src, pos), mask1).
+  if (And1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+    return SDValue();
+
+  // The shift masks must have the same position and size.
+  if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+    return SDValue();
+
+  SDValue Shl = And1.getOperand(0);
+  if (Shl.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+    return SDValue();
+
+  unsigned Shamt = CN->getZExtValue();
+
+  // Return if the shift amount and the first bit position of mask are not the
+  // same.
+  EVT ValTy = N->getValueType(0);
+  if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
+                     DAG.getConstant(SMPos0, DL, MVT::i32),
+                     DAG.getConstant(SMSize0, DL, MVT::i32),
+                     And0.getOperand(0));
+}
+
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue Add = N->getOperand(1);
+
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue Lo = Add.getOperand(1);
+
+  if ((Lo.getOpcode() != MipsISD::Lo) ||
+      (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable))
+    return SDValue();
+
+  EVT ValTy = N->getValueType(0);
+  SDLoc DL(N);
+
+  SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
+                             Add.getOperand(0));
+  return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo);
+}
+
+static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const MipsSubtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT NarrowerVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+  if (N0.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  if (N0.getOperand(0).getOpcode() != ISD::AssertZext)
+    return SDValue();
+
+  // fold (AssertZext (trunc (AssertZext x))) -> (trunc (AssertZext x))
+  // if the type of the extension of the innermost AssertZext node is
+  // smaller from that of the outermost node, eg:
+  // (AssertZext:i32 (trunc:i32 (AssertZext:i64 X, i32)), i8)
+  //   -> (trunc:i32 (AssertZext X, i8))
+  SDValue WiderAssertZext = N0.getOperand(0);
+  EVT WiderVT = cast<VTSDNode>(WiderAssertZext->getOperand(1))->getVT();
+
+  if (NarrowerVT.bitsLT(WiderVT)) {
+    SDValue NewAssertZext = DAG.getNode(
+        ISD::AssertZext, SDLoc(N), WiderAssertZext.getValueType(),
+        WiderAssertZext.getOperand(0), DAG.getValueType(NarrowerVT));
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0),
+                       NewAssertZext);
+  }
+
+  return SDValue();
+}
+
+SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
+  const {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Opc = N->getOpcode();
+
+  switch (Opc) {
+  default: break;
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    return performDivRemCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+    return performSELECTCombine(N, DAG, DCI, Subtarget);
+  case MipsISD::CMovFP_F:
+  case MipsISD::CMovFP_T:
+    return performCMovFPCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    return performANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:
+    return performADDCombine(N, DAG, DCI, Subtarget);
+  case ISD::AssertZext:
+    return performAssertZextCombine(N, DAG, DCI, Subtarget);
+  }
+
+  return SDValue();
+}
+
+bool MipsTargetLowering::isCheapToSpeculateCttz() const {
+  return Subtarget.hasMips32();
+}
+
+bool MipsTargetLowering::isCheapToSpeculateCtlz() const {
+  return Subtarget.hasMips32();
+}
+
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+void
+MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG) const {
+  return LowerOperationWrapper(N, Results, DAG);
+}
+
+SDValue MipsTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const
+{
+  switch (Op.getOpcode())
+  {
+  case ISD::BRCOND:             return lowerBRCOND(Op, DAG);
+  case ISD::ConstantPool:       return lowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return lowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return lowerBlockAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return lowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return lowerJumpTable(Op, DAG);
+  case ISD::SELECT:             return lowerSELECT(Op, DAG);
+  case ISD::SETCC:              return lowerSETCC(Op, DAG);
+  case ISD::VASTART:            return lowerVASTART(Op, DAG);
+  case ISD::VAARG:              return lowerVAARG(Op, DAG);
+  case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
+  case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
+  case ISD::EH_RETURN:          return lowerEH_RETURN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return lowerATOMIC_FENCE(Op, DAG);
+  case ISD::SHL_PARTS:          return lowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:          return lowerShiftRightParts(Op, DAG, true);
+  case ISD::SRL_PARTS:          return lowerShiftRightParts(Op, DAG, false);
+  case ISD::LOAD:               return lowerLOAD(Op, DAG);
+  case ISD::STORE:              return lowerSTORE(Op, DAG);
+  case ISD::EH_DWARF_CFA:       return lowerEH_DWARF_CFA(Op, DAG);
+  case ISD::FP_TO_SINT:         return lowerFP_TO_SINT(Op, DAG);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// addLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value.  It also creates a corresponding
+// virtual register for it.
+static unsigned
+addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
+{
+  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+  MF.getRegInfo().addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI,
+                                              MachineBasicBlock &MBB,
+                                              const TargetInstrInfo &TII,
+                                              bool Is64Bit, bool IsMicroMips) {
+  if (NoZeroDivCheck)
+    return &MBB;
+
+  // Insert instruction "teq $divisor_reg, $zero, 7".
+  MachineBasicBlock::iterator I(MI);
+  MachineInstrBuilder MIB;
+  MachineOperand &Divisor = MI.getOperand(2);
+  MIB = BuildMI(MBB, std::next(I), MI.getDebugLoc(),
+                TII.get(IsMicroMips ? Mips::TEQ_MM : Mips::TEQ))
+            .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+            .addReg(Mips::ZERO)
+            .addImm(7);
+
+  // Use the 32-bit sub-register if this is a 64-bit division.
+  if (Is64Bit)
+    MIB->getOperand(0).setSubReg(Mips::sub_32);
+
+  // Clear Divisor's kill flag.
+  Divisor.setIsKill(false);
+
+  // We would normally delete the original instruction here but in this case
+  // we only needed to inject an additional instruction rather than replace it.
+
+  return &MBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
+  case Mips::ATOMIC_LOAD_ADD_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+
+  case Mips::ATOMIC_LOAD_AND_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+
+  case Mips::ATOMIC_LOAD_OR_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+
+  case Mips::ATOMIC_LOAD_XOR_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+
+  case Mips::ATOMIC_LOAD_NAND_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I32:
+    return emitAtomicBinary(MI, BB, 4, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I64:
+    return emitAtomicBinary(MI, BB, 8, 0, true);
+
+  case Mips::ATOMIC_LOAD_SUB_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I32:
+    return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I64:
+    return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+
+  case Mips::ATOMIC_SWAP_I8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0);
+  case Mips::ATOMIC_SWAP_I16:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0);
+  case Mips::ATOMIC_SWAP_I32:
+    return emitAtomicBinary(MI, BB, 4, 0);
+  case Mips::ATOMIC_SWAP_I64:
+    return emitAtomicBinary(MI, BB, 8, 0);
+
+  case Mips::ATOMIC_CMP_SWAP_I8:
+    return emitAtomicCmpSwapPartword(MI, BB, 1);
+  case Mips::ATOMIC_CMP_SWAP_I16:
+    return emitAtomicCmpSwapPartword(MI, BB, 2);
+  case Mips::ATOMIC_CMP_SWAP_I32:
+    return emitAtomicCmpSwap(MI, BB, 4);
+  case Mips::ATOMIC_CMP_SWAP_I64:
+    return emitAtomicCmpSwap(MI, BB, 8);
+  case Mips::PseudoSDIV:
+  case Mips::PseudoUDIV:
+  case Mips::DIV:
+  case Mips::DIVU:
+  case Mips::MOD:
+  case Mips::MODU:
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false,
+                               false);
+  case Mips::SDIV_MM_Pseudo:
+  case Mips::UDIV_MM_Pseudo:
+  case Mips::SDIV_MM:
+  case Mips::UDIV_MM:
+  case Mips::DIV_MMR6:
+  case Mips::DIVU_MMR6:
+  case Mips::MOD_MMR6:
+  case Mips::MODU_MMR6:
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), false, true);
+  case Mips::PseudoDSDIV:
+  case Mips::PseudoDUDIV:
+  case Mips::DDIV:
+  case Mips::DDIVU:
+  case Mips::DMOD:
+  case Mips::DMODU:
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, false);
+  case Mips::DDIV_MM64R6:
+  case Mips::DDIVU_MM64R6:
+  case Mips::DMOD_MM64R6:
+  case Mips::DMODU_MM64R6:
+    return insertDivByZeroTrap(MI, *BB, *Subtarget.getInstrInfo(), true, true);
+  case Mips::SEL_D:
+  case Mips::SEL_D_MMR6:
+    return emitSEL_D(MI, BB);
+
+  case Mips::PseudoSELECT_I:
+  case Mips::PseudoSELECT_I64:
+  case Mips::PseudoSELECT_S:
+  case Mips::PseudoSELECT_D32:
+  case Mips::PseudoSELECT_D64:
+    return emitPseudoSELECT(MI, BB, false, Mips::BNE);
+  case Mips::PseudoSELECTFP_F_I:
+  case Mips::PseudoSELECTFP_F_I64:
+  case Mips::PseudoSELECTFP_F_S:
+  case Mips::PseudoSELECTFP_F_D32:
+  case Mips::PseudoSELECTFP_F_D64:
+    return emitPseudoSELECT(MI, BB, true, Mips::BC1F);
+  case Mips::PseudoSELECTFP_T_I:
+  case Mips::PseudoSELECTFP_T_I64:
+  case Mips::PseudoSELECTFP_T_S:
+  case Mips::PseudoSELECTFP_T_D32:
+  case Mips::PseudoSELECTFP_T_D64:
+    return emitPseudoSELECT(MI, BB, true, Mips::BC1T);
+  }
+}
+
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinary(MachineInstr &MI,
+                                                        MachineBasicBlock *BB,
+                                                        unsigned Size,
+                                                        unsigned BinOpcode,
+                                                        bool Nand) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const bool ArePtrs64bit = ABI.ArePtrs64bit();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned LL, SC, AND, NOR, ZERO, BEQ;
+
+  if (Size == 4) {
+    if (isMicroMips) {
+      LL = Mips::LL_MM;
+      SC = Mips::SC_MM;
+    } else {
+      LL = Subtarget.hasMips32r6()
+               ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+               : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+      SC = Subtarget.hasMips32r6()
+               ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+               : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+    }
+
+    AND = Mips::AND;
+    NOR = Mips::NOR;
+    ZERO = Mips::ZERO;
+    BEQ = Mips::BEQ;
+  } else {
+    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+    AND = Mips::AND64;
+    NOR = Mips::NOR64;
+    ZERO = Mips::ZERO_64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned OldVal = MI.getOperand(0).getReg();
+  unsigned Ptr = MI.getOperand(1).getReg();
+  unsigned Incr = MI.getOperand(2).getReg();
+
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = ++BB->getIterator();
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(exitMBB);
+
+  //  loopMBB:
+  //    ll oldval, 0(ptr)
+  //    <binop> storeval, oldval, incr
+  //    sc success, storeval, 0(ptr)
+  //    beq success, $0, loopMBB
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr
+    //  nor storeval, $0, andres
+    BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
+  } else if (BinOpcode) {
+    //  <binop> storeval, oldval, incr
+    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
+  } else {
+    StoreVal = Incr;
+  }
+  BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+
+  MI.eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
+    unsigned SrcReg) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  if (Subtarget.hasMips32r2() && Size == 1) {
+    BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  if (Subtarget.hasMips32r2() && Size == 2) {
+    BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  unsigned ScrReg = RegInfo.createVirtualRegister(RC);
+
+  assert(Size < 32);
+  int64_t ShiftImm = 32 - (Size * 8);
+
+  BuildMI(BB, DL, TII->get(Mips::SLL), ScrReg).addReg(SrcReg).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), DstReg).addReg(ScrReg).addImm(ShiftImm);
+
+  return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+    bool Nand) const {
+  assert((Size == 1 || Size == 2) &&
+         "Unsupported size for EmitAtomicBinaryPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const bool ArePtrs64bit = ABI.ArePtrs64bit();
+  const TargetRegisterClass *RCp =
+    getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Ptr = MI.getOperand(1).getReg();
+  unsigned Incr = MI.getOperand(2).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned NewVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  unsigned LL, SC;
+  if (isMicroMips) {
+    LL = Mips::LL_MM;
+    SC = Mips::SC_MM;
+  } else {
+    LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+                                 : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+    SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+                                 : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+  }
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = ++BB->getIterator();
+  MF->insert(It, loopMBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    sll     incr2,incr,shiftamt
+
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(ABI.GetPtrAddiuOp()), MaskLSB2)
+    .addReg(ABI.GetNullPtr()).addImm(-4);
+  BuildMI(BB, DL, TII->get(ABI.GetPtrAndOp()), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+      .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
+  if (Subtarget.isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(MaskUpper).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
+
+  // atomic.load.binop
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   binop   binopres,oldval,incr2
+  //   and     newval,binopres,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  // atomic.swap
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   and     newval,incr2,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr2
+    //  nor binopres, $0, andres
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
+      .addReg(Mips::ZERO).addReg(AndRes);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else if (BinOpcode) {
+    //  <binop> binopres, oldval, incr2
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else { // atomic.swap
+    //  and newval, incr2, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
+  }
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal0).addReg(NewVal);
+  BuildMI(BB, DL, TII->get(SC), Success)
+    .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+  //  sinkMBB:
+  //    and     maskedoldval1,oldval,mask
+  //    srl     srlres,maskedoldval1,shiftamt
+  //    sign_extend dest,srlres
+  BB = sinkMBB;
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(MaskedOldVal1).addReg(ShiftAmt);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+  MI.eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwap(MachineInstr &MI,
+                                                         MachineBasicBlock *BB,
+                                                         unsigned Size) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const bool ArePtrs64bit = ABI.ArePtrs64bit();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned LL, SC, ZERO, BNE, BEQ;
+
+  if (Size == 4) {
+    if (isMicroMips) {
+      LL = Mips::LL_MM;
+      SC = Mips::SC_MM;
+    } else {
+      LL = Subtarget.hasMips32r6()
+               ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+               : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+      SC = Subtarget.hasMips32r6()
+               ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+               : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+    }
+
+    ZERO = Mips::ZERO;
+    BNE = Mips::BNE;
+    BEQ = Mips::BEQ;
+  } else {
+    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
+    ZERO = Mips::ZERO_64;
+    BNE = Mips::BNE64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Ptr = MI.getOperand(1).getReg();
+  unsigned OldVal = MI.getOperand(2).getReg();
+  unsigned NewVal = MI.getOperand(3).getReg();
+
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = ++BB->getIterator();
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(exitMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(exitMBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BNE))
+    .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
+
+  // loop2MBB:
+  //   sc success, newval, 0(ptr)
+  //   beq success, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(SC), Success)
+    .addReg(NewVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ))
+    .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+
+  MI.eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const bool ArePtrs64bit = ABI.ArePtrs64bit();
+  const TargetRegisterClass *RCp =
+    getRegClassFor(ArePtrs64bit ? MVT::i64 : MVT::i32);
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Ptr = MI.getOperand(1).getReg();
+  unsigned CmpVal = MI.getOperand(2).getReg();
+  unsigned NewVal = MI.getOperand(3).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RCp);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RCp);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+  unsigned LL, SC;
+
+  if (isMicroMips) {
+    LL = Mips::LL_MM;
+    SC = Mips::SC_MM;
+  } else {
+    LL = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::LL64_R6 : Mips::LL_R6)
+                                 : (ArePtrs64bit ? Mips::LL64 : Mips::LL);
+    SC = Subtarget.hasMips32r6() ? (ArePtrs64bit ? Mips::SC64_R6 : Mips::SC_R6)
+                                 : (ArePtrs64bit ? Mips::SC64 : Mips::SC);
+  }
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = ++BB->getIterator();
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(sinkMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  // FIXME: computation of newval2 can be moved to loop2MBB.
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    xori    ptrlsb2,ptrlsb2,3              # Only for BE
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    andi    maskedcmpval,cmpval,255
+  //    sll     shiftedcmpval,maskedcmpval,shiftamt
+  //    andi    maskednewval,newval,255
+  //    sll     shiftednewval,maskednewval,shiftamt
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::DADDiu : Mips::ADDiu), MaskLSB2)
+    .addReg(ABI.GetNullPtr()).addImm(-4);
+  BuildMI(BB, DL, TII->get(ArePtrs64bit ? Mips::AND64 : Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2)
+      .addReg(Ptr, 0, ArePtrs64bit ? Mips::sub_32 : 0).addImm(3);
+  if (Subtarget.isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(MaskUpper).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
+    .addReg(CmpVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
+    .addReg(MaskedCmpVal).addReg(ShiftAmt);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
+    .addReg(NewVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
+    .addReg(MaskedNewVal).addReg(ShiftAmt);
+
+  //  loop1MBB:
+  //    ll      oldval,0(alginedaddr)
+  //    and     maskedoldval0,oldval,mask
+  //    bne     maskedoldval0,shiftedcmpval,sinkMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::BNE))
+    .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
+
+  //  loop2MBB:
+  //    and     maskedoldval1,oldval,mask2
+  //    or      storeval,maskedoldval1,shiftednewval
+  //    sc      success,storeval,0(alignedaddr)
+  //    beq     success,$0,loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+  BuildMI(BB, DL, TII->get(SC), Success)
+      .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+      .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
+
+  //  sinkMBB:
+  //    srl     srlres,maskedoldval0,shiftamt
+  //    sign_extend dest,srlres
+  BB = sinkMBB;
+
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(MaskedOldVal0).addReg(ShiftAmt);
+  BB = emitSignExtendToI32InReg(MI, BB, Size, Dest, SrlRes);
+
+  MI.eraseFromParent(); // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator II(MI);
+
+  unsigned Fc = MI.getOperand(1).getReg();
+  const auto &FGR64RegClass = TRI->getRegClass(Mips::FGR64RegClassID);
+
+  unsigned Fc2 = RegInfo.createVirtualRegister(FGR64RegClass);
+
+  BuildMI(*BB, II, DL, TII->get(Mips::SUBREG_TO_REG), Fc2)
+      .addImm(0)
+      .addReg(Fc)
+      .addImm(Mips::sub_lo);
+
+  // We don't erase the original instruction, we just replace the condition
+  // register with the 64-bit super-register.
+  MI.getOperand(1).setReg(Fc2);
+
+  return BB;
+}
+
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  // The first operand is the chain, the second is the condition, the third is
+  // the block to branch to if the condition is true.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(2);
+  SDLoc DL(Op);
+
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
+
+  // Return if flag is not set by a floating point comparison.
+  if (CondRes.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  SDValue CCNode  = CondRes.getOperand(2);
+  Mips::CondCode CC =
+    (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+  unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
+  SDValue BrCode = DAG.getConstant(Opc, DL, MVT::i32);
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
+  return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
+                     FCC0, Dest, CondRes);
+}
+
+SDValue MipsTargetLowering::
+lowerSELECT(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
+
+  // Return if flag is not set by a floating point comparison.
+  if (Cond.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
+                      SDLoc(Op));
+}
+
+SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
+  SDValue Cond = createFPCmp(DAG, Op);
+
+  assert(Cond.getOpcode() == MipsISD::FPCmp &&
+         "Floating point operand expected.");
+
+  SDLoc DL(Op);
+  SDValue True  = DAG.getConstant(1, DL, MVT::i32);
+  SDValue False = DAG.getConstant(0, DL, MVT::i32);
+
+  return createCMovFP(DAG, Cond, True, False, DL);
+}
+
+SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT Ty = Op.getValueType();
+  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = N->getGlobal();
+
+  if (!isPositionIndependent() && !ABI.IsN64()) {
+    const MipsTargetObjectFile *TLOF =
+        static_cast<const MipsTargetObjectFile *>(
+            getTargetMachine().getObjFileLowering());
+    const GlobalObject *GO = GV->getBaseObject();
+    if (GO && TLOF->IsGlobalInSmallSection(GO, getTargetMachine()))
+      // %gp_rel relocation
+      return getAddrGPRel(N, SDLoc(N), Ty, DAG);
+
+    // %hi/%lo relocation
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+  }
+
+  // Every other architecture would use shouldAssumeDSOLocal in here, but
+  // mips is special.
+  // * In PIC code mips requires got loads even for local statics!
+  // * To save on got entries, for local statics the got entry contains the
+  //   page and an additional add instruction takes care of the low bits.
+  // * It is legal to access a hidden symbol with a non hidden undefined,
+  //   so one cannot guarantee that all access to a hidden symbol will know
+  //   it is hidden.
+  // * Mips linkers don't support creating a page and a full got entry for
+  //   the same symbol.
+  // * Given all that, we have to use a full got entry for hidden symbols :-(
+  if (GV->hasLocalLinkage())
+    return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+
+  if (LargeGOT)
+    return getAddrGlobalLargeGOT(
+        N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16,
+        DAG.getEntryNode(),
+        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+
+  return getAddrGlobal(
+      N, SDLoc(N), Ty, DAG,
+      (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT,
+      DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+}
+
+SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (!isPositionIndependent() && !ABI.IsN64())
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+}
+
+SDValue MipsTargetLowering::
+lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  // If the relocation model is PIC, use the General Dynamic TLS Model or
+  // Local Dynamic TLS model, otherwise use the Initial Exec or
+  // Local Exec TLS Model.
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    // General Dynamic and Local Dynamic TLS Model.
+    unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
+                                                      : MipsII::MO_TLSGD;
+
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
+    SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
+                                   getGlobalReg(DAG, PtrVT), TGA);
+    unsigned PtrSize = PtrVT.getSizeInBits();
+    IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);
+
+    SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);
+
+    ArgListTy Args;
+    ArgListEntry Entry;
+    Entry.Node = Argument;
+    Entry.Ty = PtrTy;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
+      .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
+    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+    SDValue Ret = CallResult.first;
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_HI);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_LO);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
+    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
+  }
+
+  SDValue Offset;
+  if (model == TLSModel::InitialExec) {
+    // Initial Exec TLS Model
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                             MipsII::MO_GOTTPREL);
+    TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
+                      TGA);
+    Offset =
+        DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), TGA, MachinePointerInfo());
+  } else {
+    // Local Exec TLS Model
+    assert(model == TLSModel::LocalExec);
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_HI);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_LO);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+  }
+
+  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue MipsTargetLowering::
+lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+{
+  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (!isPositionIndependent() && !ABI.IsN64())
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+}
+
+SDValue MipsTargetLowering::
+lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
+  if (!isPositionIndependent() && !ABI.IsN64()) {
+    const MipsTargetObjectFile *TLOF =
+        static_cast<const MipsTargetObjectFile *>(
+            getTargetMachine().getObjFileLowering());
+
+    if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(),
+                                       getTargetMachine()))
+      // %gp_rel relocation
+      return getAddrGPRel(N, SDLoc(N), Ty, DAG);
+
+    return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+  }
+
+  return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+}
+
+SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 getPointerTy(MF.getDataLayout()));
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue Chain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  unsigned Align = Node->getConstantOperandVal(3);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc DL(Node);
+  unsigned ArgSlotSizeInBytes = (ABI.IsN32() || ABI.IsN64()) ? 8 : 4;
+
+  SDValue VAListLoad = DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL, Chain,
+                                   VAListPtr, MachinePointerInfo(SV));
+  SDValue VAList = VAListLoad;
+
+  // Re-align the pointer if necessary.
+  // It should only ever be necessary for 64-bit types on O32 since the minimum
+  // argument alignment is the same as the maximum type alignment for N32/N64.
+  //
+  // FIXME: We currently align too often. The code generator doesn't notice
+  //        when the pointer is still aligned from the last va_arg (or pair of
+  //        va_args for the i64 on O32 case).
+  if (Align > getMinStackArgumentAlignment()) {
+    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
+    VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+                         DAG.getConstant(Align - 1, DL, VAList.getValueType()));
+
+    VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
+                         DAG.getConstant(-(int64_t)Align, DL,
+                                         VAList.getValueType()));
+  }
+
+  // Increment the pointer, VAList, to the next vaarg.
+  auto &TD = DAG.getDataLayout();
+  unsigned ArgSizeInBytes =
+      TD.getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
+  SDValue Tmp3 =
+      DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+                  DAG.getConstant(alignTo(ArgSizeInBytes, ArgSlotSizeInBytes),
+                                  DL, VAList.getValueType()));
+  // Store the incremented VAList to the legalized pointer
+  Chain = DAG.getStore(VAListLoad.getValue(1), DL, Tmp3, VAListPtr,
+                       MachinePointerInfo(SV));
+
+  // In big-endian mode we must adjust the pointer when the load size is smaller
+  // than the argument slot size. We must also reduce the known alignment to
+  // match. For example in the N64 ABI, we must add 4 bytes to the offset to get
+  // the correct half of the slot, and reduce the alignment from 8 (slot
+  // alignment) down to 4 (type alignment).
+  if (!Subtarget.isLittle() && ArgSizeInBytes < ArgSlotSizeInBytes) {
+    unsigned Adjustment = ArgSlotSizeInBytes - ArgSizeInBytes;
+    VAList = DAG.getNode(ISD::ADD, DL, VAListPtr.getValueType(), VAList,
+                         DAG.getIntPtrConstant(Adjustment, DL));
+  }
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo());
+}
+
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
+  EVT TyX = Op.getOperand(0).getValueType();
+  EVT TyY = Op.getOperand(1).getValueType();
+  SDLoc DL(Op);
+  SDValue Const1 = DAG.getConstant(1, DL, MVT::i32);
+  SDValue Const31 = DAG.getConstant(31, DL, MVT::i32);
+  SDValue Res;
+
+  // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+  // to i32.
+  SDValue X = (TyX == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+                Const1);
+  SDValue Y = (TyY == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(1)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
+                Const1);
+
+  if (HasExtractInsert) {
+    // ext  E, Y, 31, 1  ; extract bit31 of Y
+    // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
+    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, E, Const31, Const1, X);
+  } else {
+    // sll SllX, X, 1
+    // srl SrlX, SllX, 1
+    // srl SrlY, Y, 31
+    // sll SllY, SrlX, 31
+    // or  Or, SrlX, SllY
+    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+    SDValue SrlX = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+    SDValue SrlY = DAG.getNode(ISD::SRL, DL, MVT::i32, Y, Const31);
+    SDValue SllY = DAG.getNode(ISD::SHL, DL, MVT::i32, SrlY, Const31);
+    Res = DAG.getNode(ISD::OR, DL, MVT::i32, SrlX, SllY);
+  }
+
+  if (TyX == MVT::f32)
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Res);
+
+  SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                             Op.getOperand(0),
+                             DAG.getConstant(0, DL, MVT::i32));
+  return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
+  unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
+  unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
+  EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
+  SDLoc DL(Op);
+  SDValue Const1 = DAG.getConstant(1, DL, MVT::i32);
+
+  // Bitcast to integer nodes.
+  SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
+  SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
+
+  if (HasExtractInsert) {
+    // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
+    // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
+                            DAG.getConstant(WidthY - 1, DL, MVT::i32), Const1);
+
+    if (WidthX > WidthY)
+      E = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, E);
+    else if (WidthY > WidthX)
+      E = DAG.getNode(ISD::TRUNCATE, DL, TyX, E);
+
+    SDValue I = DAG.getNode(MipsISD::Ins, DL, TyX, E,
+                            DAG.getConstant(WidthX - 1, DL, MVT::i32), Const1,
+                            X);
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), I);
+  }
+
+  // (d)sll SllX, X, 1
+  // (d)srl SrlX, SllX, 1
+  // (d)srl SrlY, Y, width(Y)-1
+  // (d)sll SllY, SrlX, width(Y)-1
+  // or     Or, SrlX, SllY
+  SDValue SllX = DAG.getNode(ISD::SHL, DL, TyX, X, Const1);
+  SDValue SrlX = DAG.getNode(ISD::SRL, DL, TyX, SllX, Const1);
+  SDValue SrlY = DAG.getNode(ISD::SRL, DL, TyY, Y,
+                             DAG.getConstant(WidthY - 1, DL, MVT::i32));
+
+  if (WidthX > WidthY)
+    SrlY = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, SrlY);
+  else if (WidthY > WidthX)
+    SrlY = DAG.getNode(ISD::TRUNCATE, DL, TyX, SrlY);
+
+  SDValue SllY = DAG.getNode(ISD::SHL, DL, TyX, SrlY,
+                             DAG.getConstant(WidthX - 1, DL, MVT::i32));
+  SDValue Or = DAG.getNode(ISD::OR, DL, TyX, SrlX, SllY);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Or);
+}
+
+SDValue
+MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget.isGP64bit())
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget.hasExtractInsert());
+
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert());
+}
+
+SDValue MipsTargetLowering::
+lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Frame address can only be determined for current frame.");
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue FrameAddr = DAG.getCopyFromReg(
+      DAG.getEntryNode(), DL, ABI.IsN64() ? Mips::FP_64 : Mips::FP, VT);
+  return FrameAddr;
+}
+
+SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Return address can be determined only for current frame.");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MVT VT = Op.getSimpleValueType();
+  unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA;
+  MFI.setReturnAddressIsTaken(true);
+
+  // Return RA, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
+}
+
+// An EH_RETURN is the result of lowering llvm.eh.return which in turn is
+// generated from __builtin_eh_return (offset, handler)
+// The effect of this is to adjust the stack pointer by "offset"
+// and then branch to "handler".
+SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+                                                                     const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setCallsEhReturn();
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc DL(Op);
+  EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
+
+  // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
+  // EH_RETURN nodes, so that instructions are emitted back-to-back.
+  unsigned OffsetReg = ABI.IsN64() ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
+  Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
+  Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
+  return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
+                     DAG.getRegister(OffsetReg, Ty),
+                     DAG.getRegister(AddrReg, getPointerTy(MF.getDataLayout())),
+                     Chain.getValue(1));
+}
+
+SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // FIXME: Need pseudo-fence for 'singlethread' fences
+  // FIXME: Set SType for weaker fences where supported/appropriate.
+  unsigned SType = 0;
+  SDLoc DL(Op);
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, DL, MVT::i32));
+}
+
+SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
+
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+  // if shamt < (VT.bits):
+  //  lo = (shl lo, shamt)
+  //  hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+  // else:
+  //  lo = 0
+  //  hi = (shl lo, shamt[4:0])
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, DL, MVT::i32));
+  SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo,
+                                      DAG.getConstant(1, DL, VT));
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, Not);
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(VT.getSizeInBits(), DL, MVT::i32));
+  Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                   DAG.getConstant(0, DL, VT), ShiftLeftLo);
+  Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftLeftLo, Or);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+                                                 bool IsSRA) const {
+  SDLoc DL(Op);
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+  MVT VT = Subtarget.isGP64bit() ? MVT::i64 : MVT::i32;
+
+  // if shamt < (VT.bits):
+  //  lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+  //  if isSRA:
+  //    hi = (sra hi, shamt)
+  //  else:
+  //    hi = (srl hi, shamt)
+  // else:
+  //  if isSRA:
+  //   lo = (sra hi, shamt[4:0])
+  //   hi = (sra hi, 31)
+  //  else:
+  //   lo = (srl hi, shamt[4:0])
+  //   hi = 0
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, DL, MVT::i32));
+  SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
+                                     DAG.getConstant(1, DL, VT));
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, ShiftLeft1Hi, Not);
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL,
+                                     DL, VT, Hi, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(VT.getSizeInBits(), DL, MVT::i32));
+  SDValue Ext = DAG.getNode(ISD::SRA, DL, VT, Hi,
+                            DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
+  Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftRightHi, Or);
+  Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                   IsSRA ? Ext : DAG.getConstant(0, DL, VT), ShiftRightHi);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
+                            SDValue Chain, SDValue Src, unsigned Offset) {
+  SDValue Ptr = LD->getBasePtr();
+  EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
+  EVT BasePtrVT = Ptr.getValueType();
+  SDLoc DL(LD);
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, DL, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Ptr, Src };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+                                 LD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer load node.
+SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  EVT MemVT = LD->getMemoryVT();
+
+  if (Subtarget.systemSupportsUnalignedAccess())
+    return Op;
+
+  // Return if load is aligned or if MemVT is neither i32 nor i64.
+  if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+    return SDValue();
+
+  bool IsLittle = Subtarget.isLittle();
+  EVT VT = Op.getValueType();
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
+
+  assert((VT == MVT::i32) || (VT == MVT::i64));
+
+  // Expand
+  //  (set dst, (i64 (load baseptr)))
+  // to
+  //  (set tmp, (ldl (add baseptr, 7), undef))
+  //  (set dst, (ldr baseptr, tmp))
+  if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
+    SDValue LDL = createLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
+                               IsLittle ? 7 : 0);
+    return createLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
+                        IsLittle ? 0 : 7);
+  }
+
+  SDValue LWL = createLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
+                             IsLittle ? 3 : 0);
+  SDValue LWR = createLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
+                             IsLittle ? 0 : 3);
+
+  // Expand
+  //  (set dst, (i32 (load baseptr))) or
+  //  (set dst, (i64 (sextload baseptr))) or
+  //  (set dst, (i64 (extload baseptr)))
+  // to
+  //  (set tmp, (lwl (add baseptr, 3), undef))
+  //  (set dst, (lwr baseptr, tmp))
+  if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
+      (ExtType == ISD::EXTLOAD))
+    return LWR;
+
+  assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));
+
+  // Expand
+  //  (set dst, (i64 (zextload baseptr)))
+  // to
+  //  (set tmp0, (lwl (add baseptr, 3), undef))
+  //  (set tmp1, (lwr baseptr, tmp0))
+  //  (set tmp2, (shl tmp1, 32))
+  //  (set dst, (srl tmp2, 32))
+  SDLoc DL(LD);
+  SDValue Const32 = DAG.getConstant(32, DL, MVT::i32);
+  SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
+  SDValue Ops[] = { SRL, LWR.getValue(1) };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
+                             SDValue Chain, unsigned Offset) {
+  SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
+  EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
+  SDLoc DL(SD);
+  SDVTList VTList = DAG.getVTList(MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, DL, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Value, Ptr };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
+                                 SD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer store node.
+static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
+                                      bool IsLittle) {
+  SDValue Value = SD->getValue(), Chain = SD->getChain();
+  EVT VT = Value.getValueType();
+
+  // Expand
+  //  (store val, baseptr) or
+  //  (truncstore val, baseptr)
+  // to
+  //  (swl val, (add baseptr, 3))
+  //  (swr val, baseptr)
+  if ((VT == MVT::i32) || SD->isTruncatingStore()) {
+    SDValue SWL = createStoreLR(MipsISD::SWL, DAG, SD, Chain,
+                                IsLittle ? 3 : 0);
+    return createStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
+  }
+
+  assert(VT == MVT::i64);
+
+  // Expand
+  //  (store val, baseptr)
+  // to
+  //  (sdl val, (add baseptr, 7))
+  //  (sdr val, baseptr)
+  SDValue SDL = createStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
+  return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
+}
+
+// Lower (store (fp_to_sint $fp) $ptr) to (store (TruncIntFP $fp), $ptr).
+static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
+  SDValue Val = SD->getValue();
+
+  if (Val.getOpcode() != ISD::FP_TO_SINT)
+    return SDValue();
+
+  EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits());
+  SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
+                           Val.getOperand(0));
+  return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
+                      SD->getPointerInfo(), SD->getAlignment(),
+                      SD->getMemOperand()->getFlags());
+}
+
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *SD = cast<StoreSDNode>(Op);
+  EVT MemVT = SD->getMemoryVT();
+
+  // Lower unaligned integer stores.
+  if (!Subtarget.systemSupportsUnalignedAccess() &&
+      (SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
+      ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
+    return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle());
+
+  return lowerFP_TO_SINT_STORE(SD, DAG);
+}
+
+SDValue MipsTargetLowering::lowerEH_DWARF_CFA(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  // Return a fixed StackObject with offset 0 which points to the old stack
+  // pointer.
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ValTy = Op->getValueType(0);
+  int FI = MFI.CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+  return DAG.getFrameIndex(FI, ValTy);
+}
+
+SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+  SDValue Trunc = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), FPTy,
+                              Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement a generic logic using tblgen that can support this.
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold
+//       an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used
+//       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
+//       not used, it must be shadowed. If only A3 is available, shadow it and
+//       go to stack.
+//
+//  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
+//===----------------------------------------------------------------------===//
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State, ArrayRef<MCPhysReg> F64Regs) {
+  const MipsSubtarget &Subtarget = static_cast<const MipsSubtarget &>(
+      State.getMachineFunction().getSubtarget());
+
+  static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+  static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
+
+  // Do not process byval args here.
+  if (ArgFlags.isByVal())
+    return true;
+
+  // Promote i8 and i16
+  if (ArgFlags.isInReg() && !Subtarget.isLittle()) {
+    if (LocVT == MVT::i8 || LocVT == MVT::i16 || LocVT == MVT::i32) {
+      LocVT = MVT::i32;
+      if (ArgFlags.isSExt())
+        LocInfo = CCValAssign::SExtUpper;
+      else if (ArgFlags.isZExt())
+        LocInfo = CCValAssign::ZExtUpper;
+      else
+        LocInfo = CCValAssign::AExtUpper;
+    }
+  }
+
+  // Promote i8 and i16
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  unsigned Reg;
+
+  // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+  // is true: function is vararg, argument is 3rd or higher, there is previous
+  // argument which is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1 ||
+                                State.getFirstUnallocated(F32Regs) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+    Reg = State.AllocateReg(IntRegs);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs);
+    LocVT = MVT::i32;
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
+    Reg = State.AllocateReg(IntRegs);
+    if (Reg == Mips::A1 || Reg == Mips::A3)
+      Reg = State.AllocateReg(IntRegs);
+    State.AllocateReg(IntRegs);
+    LocVT = MVT::i32;
+  } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+    // we are guaranteed to find an available float register
+    if (ValVT == MVT::f32) {
+      Reg = State.AllocateReg(F32Regs);
+      // Shadow int register
+      State.AllocateReg(IntRegs);
+    } else {
+      Reg = State.AllocateReg(F64Regs);
+      // Shadow int registers
+      unsigned Reg2 = State.AllocateReg(IntRegs);
+      if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+        State.AllocateReg(IntRegs);
+      State.AllocateReg(IntRegs);
+    }
+  } else
+    llvm_unreachable("Cannot handle this ValVT.");
+
+  if (!Reg) {
+    unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() >> 3,
+                                          OrigAlign);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+  return false;
+}
+
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+// Return next O32 integer argument register.
+static unsigned getNextIntArgReg(unsigned Reg) {
+  assert((Reg == Mips::A0) || (Reg == Mips::A2));
+  return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
+SDValue MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+                                           SDValue Chain, SDValue Arg,
+                                           const SDLoc &DL, bool IsTailCall,
+                                           SelectionDAG &DAG) const {
+  if (!IsTailCall) {
+    SDValue PtrOff =
+        DAG.getNode(ISD::ADD, DL, getPointerTy(DAG.getDataLayout()), StackPtr,
+                    DAG.getIntPtrConstant(Offset, DL));
+    return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo());
+  }
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  int FI = MFI.CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
+  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+  return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
+                      /* Alignment = */ 0, MachineMemOperand::MOVolatile);
+}
+
+void MipsTargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
+  // Insert node "GP copy globalreg" before call to function.
+  //
+  // R_MIPS_CALL* operators (emitted when non-internal functions are called
+  // in PIC mode) allow symbols to be resolved via lazy binding.
+  // The lazy binding stub requires GP to point to the GOT.
+  // Note that we don't need GP to point to the GOT for indirect calls
+  // (when R_MIPS_CALL* is not used for the call) because Mips linker generates
+  // lazy binding stub for a function only when R_MIPS_CALL* are the only relocs
+  // used for the function (that is, Mips linker doesn't generate lazy binding
+  // stub for a function whose address is taken in the program).
+  if (IsPICCall && !InternalLinkage && IsCallReloc) {
+    unsigned GPReg = ABI.IsN64() ? Mips::GP_64 : Mips::GP;
+    EVT Ty = ABI.IsN64() ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
+                                 RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
+                                      RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(CLI.DAG.getMachineFunction(), CLI.CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  if (Subtarget.inMips16HardFloat()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+        Mask = MipsRegisterInfo::getMips16RetHelperMask();
+      }
+    }
+  }
+  Ops.push_back(CLI.DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+}
+
+/// LowerCall - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                              SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc DL                              = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &IsTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool IsVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+  bool IsPIC = isPositionIndependent();
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  MipsCCState CCInfo(
+      CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
+      MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));
+
+  // Allocate the reserved argument area. It seems strange to do this from the
+  // caller side but removing it breaks the frame size calculation.
+  CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode());
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+  // Check if it's really possible to do a tail call. Restrict it to functions
+  // that are part of this compilation unit.
+  bool InternalLinkage = false;
+  if (IsTailCall) {
+    IsTailCall = isEligibleForTailCallOptimization(
+        CCInfo, NextStackOffset, *MF.getInfo<MipsFunctionInfo>());
+     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      InternalLinkage = G->getGlobal()->hasInternalLinkage();
+      IsTailCall &= (InternalLinkage || G->getGlobal()->hasLocalLinkage() ||
+                     G->getGlobal()->hasPrivateLinkage() ||
+                     G->getGlobal()->hasHiddenVisibility() ||
+                     G->getGlobal()->hasProtectedVisibility());
+     }
+  }
+  if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
+  if (IsTailCall)
+    ++NumTailCalls;
+
+  // Chain is the output chain of the last Load/Store or CopyToReg node.
+  // ByValChain is the output chain of the last Memcpy node created for copying
+  // byval arguments to the stack.
+  unsigned StackAlignment = TFL->getStackAlignment();
+  NextStackOffset = alignTo(NextStackOffset, StackAlignment);
+  SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
+
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+
+  SDValue StackPtr =
+      DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
+                         getPointerTy(DAG.getDataLayout()));
+
+  std::deque< std::pair<unsigned, SDValue> > RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  CCInfo.rewindByValRegsInfo();
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue Arg = OutVals[i];
+    CCValAssign &VA = ArgLocs[i];
+    MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    bool UseUpperBits = false;
+
+    // ByVal Arg.
+    if (Flags.isByVal()) {
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
+      assert(!IsTailCall &&
+             "Do not tail-call optimize if there is a byval argument.");
+      passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+                   FirstByValReg, LastByValReg, Flags, Subtarget.isLittle(),
+                   VA);
+      CCInfo.nextInRegsParam();
+      continue;
+    }
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (VA.isRegLoc()) {
+        if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
+            (ValVT == MVT::f64 && LocVT == MVT::i64) ||
+            (ValVT == MVT::i64 && LocVT == MVT::f64))
+          Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+        else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
+          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(0, DL, MVT::i32));
+          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(1, DL, MVT::i32));
+          if (!Subtarget.isLittle())
+            std::swap(Lo, Hi);
+          unsigned LocRegLo = VA.getLocReg();
+          unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+          RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+          RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
+          continue;
+        }
+      }
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+      break;
+    case CCValAssign::SExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::ZExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::AExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
+      break;
+    }
+
+    if (UseUpperBits) {
+      unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      Arg = DAG.getNode(
+          ISD::SHL, DL, VA.getLocVT(), Arg,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    // Register can't get to this point...
+    assert(VA.isMemLoc());
+
+    // emit ISD::STORE whichs stores the
+    // parameter value to a stack Location
+    MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
+                                         Chain, Arg, DL, IsTailCall, DAG));
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool IsPICCall = (ABI.IsN64() || IsPIC); // true if calls are translated to
+                                           // jalr $25
+  SDValue CalleeLo;
+  EVT Ty = Callee.getValueType();
+  bool GlobalOrExternal = false, IsCallReloc = false;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (IsPICCall) {
+      const GlobalValue *Val = G->getGlobal();
+      InternalLinkage = Val->hasInternalLinkage();
+
+      if (InternalLinkage)
+        Callee = getAddrLocal(G, DL, Ty, DAG, ABI.IsN32() || ABI.IsN64());
+      else if (LargeGOT) {
+        Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
+                                       MipsII::MO_CALL_LO16, Chain,
+                                       FuncInfo->callPtrInfo(Val));
+        IsCallReloc = true;
+      } else {
+        Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                               FuncInfo->callPtrInfo(Val));
+        IsCallReloc = true;
+      }
+    } else
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
+                                          getPointerTy(DAG.getDataLayout()), 0,
+                                          MipsII::MO_NO_FLAG);
+    GlobalOrExternal = true;
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+
+    if (!ABI.IsN64() && !IsPIC) // !N64 && static
+      Callee = DAG.getTargetExternalSymbol(
+          Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
+    else if (LargeGOT) {
+      Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
+                                     MipsII::MO_CALL_LO16, Chain,
+                                     FuncInfo->callPtrInfo(Sym));
+      IsCallReloc = true;
+    } else { // N64 || PIC
+      Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                             FuncInfo->callPtrInfo(Sym));
+      IsCallReloc = true;
+    }
+
+    GlobalOrExternal = true;
+  }
+
+  SmallVector<SDValue, 8> Ops(1, Chain);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+              IsCallReloc, CLI, Callee, Chain);
+
+  if (IsTailCall) {
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
+  }
+
+  Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
+  SDValue InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+                             DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals, CLI);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue MipsTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    TargetLowering::CallLoweringInfo &CLI) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                     *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
+                                     RVLocs[i].getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    if (VA.isUpperBitsInLoc()) {
+      unsigned ValSizeInBits = Ins[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      unsigned Shift =
+          VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+      Val = DAG.getNode(
+          Shift, DL, VA.getLocVT(), Val,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+    }
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::AExt:
+    case CCValAssign::AExtUpper:
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+    case CCValAssign::ZExtUpper:
+      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+    case CCValAssign::SExtUpper:
+      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
+                                      EVT ArgVT, const SDLoc &DL,
+                                      SelectionDAG &DAG) {
+  MVT LocVT = VA.getLocVT();
+  EVT ValVT = VA.getValVT();
+
+  // Shift into the upper bits if necessary.
+  switch (VA.getLocInfo()) {
+  default:
+    break;
+  case CCValAssign::AExtUpper:
+  case CCValAssign::SExtUpper:
+  case CCValAssign::ZExtUpper: {
+    unsigned ValSizeInBits = ArgVT.getSizeInBits();
+    unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+    unsigned Opcode =
+        VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+    Val = DAG.getNode(
+        Opcode, DL, VA.getLocVT(), Val,
+        DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+    break;
+  }
+  }
+
+  // If this is an value smaller than the argument slot size (32-bit for O32,
+  // 64-bit for N32/N64), it has been promoted in some way to the argument slot
+  // size. Extract the value and insert any appropriate assertions regarding
+  // sign/zero extension.
+  switch (VA.getLocInfo()) {
+  default:
+    llvm_unreachable("Unknown loc info!");
+  case CCValAssign::Full:
+    break;
+  case CCValAssign::AExtUpper:
+  case CCValAssign::AExt:
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::SExtUpper:
+  case CCValAssign::SExt:
+    Val = DAG.getNode(ISD::AssertSext, DL, LocVT, Val, DAG.getValueType(ValVT));
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::ZExtUpper:
+  case CCValAssign::ZExt:
+    Val = DAG.getNode(ISD::AssertZext, DL, LocVT, Val, DAG.getValueType(ValVT));
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::BCvt:
+    Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+    break;
+  }
+
+  return Val;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+/// LowerFormalArguments - transform physical registers into virtual registers
+/// and generate load operations for arguments places on the stack.
+SDValue MipsTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setVarArgsFrameIndex(0);
+
+  // Used with vargs to acumulate store chains.
+  std::vector<SDValue> OutChains;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                     *DAG.getContext());
+  CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+  const Function *Func = DAG.getMachineFunction().getFunction();
+  Function::const_arg_iterator FuncArg = Func->arg_begin();
+
+  if (Func->hasFnAttribute("interrupt") && !Func->arg_empty())
+    report_fatal_error(
+        "Functions with the interrupt attribute cannot have arguments!");
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg);
+  MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
+                           CCInfo.getInRegsParamsCount() > 0);
+
+  unsigned CurArgIdx = 0;
+  CCInfo.rewindByValRegsInfo();
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (Ins[i].isOrigArg()) {
+      std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[i].getOrigArgIndex();
+    }
+    EVT ValVT = VA.getValVT();
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    bool IsRegLoc = VA.isRegLoc();
+
+    if (Flags.isByVal()) {
+      assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
+      copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
+                    FirstByValReg, LastByValReg, VA, CCInfo);
+      CCInfo.nextInRegsParam();
+      continue;
+    }
+
+    // Arguments stored on registers
+    if (IsRegLoc) {
+      MVT RegVT = VA.getLocVT();
+      unsigned ArgReg = VA.getLocReg();
+      const TargetRegisterClass *RC = getRegClassFor(RegVT);
+
+      // Transform the arguments stored on
+      // physical registers into virtual ones
+      unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+
+      // Handle floating point arguments passed in integer registers and
+      // long double arguments passed in floating point registers.
+      if ((RegVT == MVT::i32 && ValVT == MVT::f32) ||
+          (RegVT == MVT::i64 && ValVT == MVT::f64) ||
+          (RegVT == MVT::f64 && ValVT == MVT::i64))
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
+      else if (ABI.IsO32() && RegVT == MVT::i32 &&
+               ValVT == MVT::f64) {
+        unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
+                                  getNextIntArgReg(ArgReg), RC);
+        SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
+        if (!Subtarget.isLittle())
+          std::swap(ArgValue, ArgValue2);
+        ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
+                               ArgValue, ArgValue2);
+      }
+
+      InVals.push_back(ArgValue);
+    } else { // VA.isRegLoc()
+      MVT LocVT = VA.getLocVT();
+
+      if (ABI.IsO32()) {
+        // We ought to be able to use LocVT directly but O32 sets it to i32
+        // when allocating floating point values to integer registers.
+        // This shouldn't influence how we load the value into registers unless
+        // we are targeting softfloat.
+        if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat())
+          LocVT = VA.getValVT();
+      }
+
+      // sanity check
+      assert(VA.isMemLoc());
+
+      // The stack pointer offset is relative to the caller stack frame.
+      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+                                     VA.getLocMemOffset(), true);
+
+      // Create load nodes to retrieve arguments from the stack
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      SDValue ArgValue = DAG.getLoad(
+          LocVT, DL, Chain, FIN,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      OutChains.push_back(ArgValue.getValue(1));
+
+      ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+
+      InVals.push_back(ArgValue);
+    }
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // The mips ABIs for returning structs by value requires that we copy
+    // the sret argument into $v0 for the return. Save the argument into
+    // a virtual register so that we can access it from the return points.
+    if (Ins[i].Flags.isSRet()) {
+      unsigned Reg = MipsFI->getSRetReturnReg();
+      if (!Reg) {
+        Reg = MF.getRegInfo().createVirtualRegister(
+            getRegClassFor(ABI.IsN64() ? MVT::i64 : MVT::i32));
+        MipsFI->setSRetReturnReg(Reg);
+      }
+      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+      break;
+    }
+  }
+
+  if (IsVarArg)
+    writeVarArgRegs(OutChains, Chain, DL, DAG, CCInfo);
+
+  // All stores are grouped in one node to allow the matching between
+  // the size of Ins and InVals. This only happens when on varg functions
+  if (!OutChains.empty()) {
+    OutChains.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool
+MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                   MachineFunction &MF, bool IsVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_Mips);
+}
+
+bool
+MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
+  if (Subtarget.hasMips3() && Subtarget.useSoftFloat()) {
+    if (Type == MVT::i32)
+      return true;
+  }
+  return IsSigned;
+}
+
+SDValue
+MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                         const SDLoc &DL,
+                                         SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setISR();
+
+  return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps);
+}
+
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                bool IsVarArg,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                const SmallVectorImpl<SDValue> &OutVals,
+                                const SDLoc &DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCState - Info about the registers and stack slot.
+  MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = OutVals[i];
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    bool UseUpperBits = false;
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::AExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::ZExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::SExtUpper:
+      UseUpperBits = true;
+      LLVM_FALLTHROUGH;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    }
+
+    if (UseUpperBits) {
+      unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      Val = DAG.getNode(
+          ISD::SHL, DL, VA.getLocVT(), Val,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, DL, VA.getLocVT()));
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into $v0.
+  if (MF.getFunction()->hasStructRetAttr()) {
+    MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    unsigned Reg = MipsFI->getSRetReturnReg();
+
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(DAG.getDataLayout()));
+    unsigned V0 = ABI.IsN64() ? Mips::V0_64 : Mips::V0;
+
+    Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(V0, getPointerTy(DAG.getDataLayout())));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // ISRs must use "eret".
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt"))
+    return LowerInterruptReturn(RetOps, DL, DAG);
+
+  // Standard return on Mips is a "jr $ra"
+  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType
+MipsTargetLowering::getConstraintType(StringRef Constraint) const {
+  // Mips specific constraints
+  // GCC config/mips/constraints.md
+  //
+  // 'd' : An address register. Equivalent to r
+  //       unless generating MIPS16 code.
+  // 'y' : Equivalent to r; retained for
+  //       backwards compatibility.
+  // 'c' : A register suitable for use in an indirect
+  //       jump. This will always be $25 for -mabicalls.
+  // 'l' : The lo register. 1 word storage.
+  // 'x' : The hilo register pair. Double word storage.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      default : break;
+      case 'd':
+      case 'y':
+      case 'f':
+      case 'c':
+      case 'l':
+      case 'x':
+        return C_RegisterClass;
+      case 'R':
+        return C_Memory;
+    }
+  }
+
+  if (Constraint == "ZC")
+    return C_Memory;
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+MipsTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'd':
+  case 'y':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f': // FPU or MSA register
+    if (Subtarget.hasMSA() && type->isVectorTy() &&
+        cast<VectorType>(type)->getBitWidth() == 128)
+      weight = CW_Register;
+    else if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+    if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 65535 to 1 (inclusive)
+    if (isa<ConstantInt>(CallOperandVal))
+      weight = CW_Constant;
+    break;
+  case 'R':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+/// This is a helper function to parse a physical register string and split it
+/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
+/// that is returned indicates whether parsing was successful. The second flag
+/// is true if the numeric part exists.
+static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix,
+                                              unsigned long long &Reg) {
+  if (C.front() != '{' || C.back() != '}')
+    return std::make_pair(false, false);
+
+  // Search for the first numeric character.
+  StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
+  I = std::find_if(B, E, isdigit);
+
+  Prefix = StringRef(B, I - B);
+
+  // The second flag is set to false if no numeric characters were found.
+  if (I == E)
+    return std::make_pair(true, false);
+
+  // Parse the numeric characters.
+  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+                        true);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
+  const TargetRegisterInfo *TRI =
+      Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC;
+  StringRef Prefix;
+  unsigned long long Reg;
+
+  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+  if (!R.first)
+    return std::make_pair(0U, nullptr);
+
+  if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+    // No numeric characters follow "hi" or "lo".
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Prefix == "hi" ?
+                          Mips::HI32RegClassID : Mips::LO32RegClassID);
+    return std::make_pair(*(RC->begin()), RC);
+  } else if (Prefix.startswith("$msa")) {
+    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+    // No numeric characters follow the name.
+    if (R.second)
+      return std::make_pair(0U, nullptr);
+
+    Reg = StringSwitch<unsigned long long>(Prefix)
+              .Case("$msair", Mips::MSAIR)
+              .Case("$msacsr", Mips::MSACSR)
+              .Case("$msaaccess", Mips::MSAAccess)
+              .Case("$msasave", Mips::MSASave)
+              .Case("$msamodify", Mips::MSAModify)
+              .Case("$msarequest", Mips::MSARequest)
+              .Case("$msamap", Mips::MSAMap)
+              .Case("$msaunmap", Mips::MSAUnmap)
+              .Default(0);
+
+    if (!Reg)
+      return std::make_pair(0U, nullptr);
+
+    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+    return std::make_pair(Reg, RC);
+  }
+
+  if (!R.second)
+    return std::make_pair(0U, nullptr);
+
+  if (Prefix == "$f") { // Parse $f0-$f31.
+    // If the size of FP registers is 64-bit or Reg is an even number, select
+    // the 64-bit register class. Otherwise, select the 32-bit register class.
+    if (VT == MVT::Other)
+      VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+    RC = getRegClassFor(VT);
+
+    if (RC == &Mips::AFGR64RegClass) {
+      assert(Reg % 2 == 0);
+      Reg >>= 1;
+    }
+  } else if (Prefix == "$fcc") // Parse $fcc0-$fcc7.
+    RC = TRI->getRegClass(Mips::FCCRegClassID);
+  else if (Prefix == "$w") { // Parse $w0-$w31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+  } else { // Parse $0-$31.
+    assert(Prefix == "$");
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+  }
+
+  assert(Reg < RC->getNumRegs());
+  return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
+/// Given a register class constraint, like 'r', if this corresponds directly
+/// to an LLVM register class, return a register of 0 and the register class
+/// pointer.
+std::pair<unsigned, const TargetRegisterClass *>
+MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                 StringRef Constraint,
+                                                 MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
+    case 'y': // Same as 'r'. Exists for compatibility.
+    case 'r':
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+        if (Subtarget.inMips16Mode())
+          return std::make_pair(0U, &Mips::CPU16RegsRegClass);
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      }
+      if (VT == MVT::i64 && !Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR32RegClass);
+      if (VT == MVT::i64 && Subtarget.isGP64bit())
+        return std::make_pair(0U, &Mips::GPR64RegClass);
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    case 'f': // FPU or MSA register
+      if (VT == MVT::v16i8)
+        return std::make_pair(0U, &Mips::MSA128BRegClass);
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+        return std::make_pair(0U, &Mips::MSA128HRegClass);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return std::make_pair(0U, &Mips::MSA128WRegClass);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return std::make_pair(0U, &Mips::MSA128DRegClass);
+      else if (VT == MVT::f32)
+        return std::make_pair(0U, &Mips::FGR32RegClass);
+      else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) {
+        if (Subtarget.isFP64bit())
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
+      }
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
+      assert(VT == MVT::i64 && "Unexpected type.");
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
+    case 'l': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
+    case 'x': // register suitable for indirect jump
+      // Fixme: Not triggering the use of both hi and low
+      // This will generate an error message
+      return std::make_pair(0U, nullptr);
+    }
+  }
+
+  std::pair<unsigned, const TargetRegisterClass *> R;
+  R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+  if (R.second)
+    return R;
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Result;
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break; // This will fall through to the generic implementation
+  case 'I': // Signed 16 bit constant
+    // If this fails, the parent routine will give an error
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if (isInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'J': // integer zero
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getZExtValue();
+      if (Val == 0) {
+        Result = DAG.getTargetConstant(0, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'K': // unsigned 16 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      uint64_t Val = (uint64_t)C->getZExtValue();
+      if (isUInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)){
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val >= -65535) && (Val <= -1)) {
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'O': // signed 15 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<15>(Val))) {
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val <= 65535) && (Val >= 1)) {
+        Result = DAG.getTargetConstant(Val, DL, Type);
+        break;
+      }
+    }
+    return;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+bool MipsTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                               const AddrMode &AM, Type *Ty,
+                                               unsigned AS) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0: // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (!AM.HasBaseReg) // allow "r+i".
+      break;
+    return false; // disallow "r+r" or "r+r+i".
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool
+MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Mips target isn't yet aware of offsets.
+  return false;
+}
+
+EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                            unsigned SrcAlign,
+                                            bool IsMemset, bool ZeroMemset,
+                                            bool MemcpyStrSrc,
+                                            MachineFunction &MF) const {
+  if (Subtarget.hasMips64())
+    return MVT::i64;
+
+  return MVT::i32;
+}
+
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return false;
+  if (Imm.isNegZero())
+    return false;
+  return Imm.isZero();
+}
+
+unsigned MipsTargetLowering::getJumpTableEncoding() const {
+  if (ABI.IsN64())
+    return MachineJumpTableInfo::EK_GPRel64BlockAddress;
+
+  return TargetLowering::getJumpTableEncoding();
+}
+
+bool MipsTargetLowering::useSoftFloat() const {
+  return Subtarget.useSoftFloat();
+}
+
+void MipsTargetLowering::copyByValRegs(
+    SDValue Chain, const SDLoc &DL, std::vector<SDValue> &OutChains,
+    SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+    SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+    unsigned FirstReg, unsigned LastReg, const CCValAssign &VA,
+    MipsCCState &State) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned GPRSizeInBytes = Subtarget.getGPRSizeInBytes();
+  unsigned NumRegs = LastReg - FirstReg;
+  unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
+  unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+  int FrameObjOffset;
+  ArrayRef<MCPhysReg> ByValArgRegs = ABI.GetByValArgRegs();
+
+  if (RegAreaSize)
+    FrameObjOffset =
+        (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+        (int)((ByValArgRegs.size() - FirstReg) * GPRSizeInBytes);
+  else
+    FrameObjOffset = VA.getLocMemOffset();
+
+  // Create frame object.
+  EVT PtrTy = getPointerTy(DAG.getDataLayout());
+  int FI = MFI.CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+  SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+  InVals.push_back(FIN);
+
+  if (!NumRegs)
+    return;
+
+  // Copy arg registers.
+  MVT RegTy = MVT::getIntegerVT(GPRSizeInBytes * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+  for (unsigned I = 0; I < NumRegs; ++I) {
+    unsigned ArgReg = ByValArgRegs[FirstReg + I];
+    unsigned VReg = addLiveIn(MF, ArgReg, RC);
+    unsigned Offset = I * GPRSizeInBytes;
+    SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+                                   DAG.getConstant(Offset, DL, PtrTy));
+    SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+                                 StorePtr, MachinePointerInfo(FuncArg, Offset));
+    OutChains.push_back(Store);
+  }
+}
+
+// Copy byVal arg to registers and stack.
+void MipsTargetLowering::passByValArg(
+    SDValue Chain, const SDLoc &DL,
+    std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+    MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
+    unsigned LastReg, const ISD::ArgFlagsTy &Flags, bool isLittle,
+    const CCValAssign &VA) const {
+  unsigned ByValSizeInBytes = Flags.getByValSize();
+  unsigned OffsetInBytes = 0; // From beginning of struct
+  unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+  unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+  EVT PtrTy = getPointerTy(DAG.getDataLayout()),
+      RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+  unsigned NumRegs = LastReg - FirstReg;
+
+  if (NumRegs) {
+    ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs();
+    bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
+    unsigned I = 0;
+
+    // Copy words to registers.
+    for (; I < NumRegs - LeftoverBytes; ++I, OffsetInBytes += RegSizeInBytes) {
+      SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                    DAG.getConstant(OffsetInBytes, DL, PtrTy));
+      SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+                                    MachinePointerInfo(), Alignment);
+      MemOpChains.push_back(LoadVal.getValue(1));
+      unsigned ArgReg = ArgRegs[FirstReg + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+    }
+
+    // Return if the struct has been fully copied.
+    if (ByValSizeInBytes == OffsetInBytes)
+      return;
+
+    // Copy the remainder of the byval argument with sub-word loads and shifts.
+    if (LeftoverBytes) {
+      SDValue Val;
+
+      for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
+           OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
+        unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
+
+        if (RemainingSizeInBytes < LoadSizeInBytes)
+          continue;
+
+        // Load subword.
+        SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                      DAG.getConstant(OffsetInBytes, DL,
+                                                      PtrTy));
+        SDValue LoadVal = DAG.getExtLoad(
+            ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
+            MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
+        MemOpChains.push_back(LoadVal.getValue(1));
+
+        // Shift the loaded value.
+        unsigned Shamt;
+
+        if (isLittle)
+          Shamt = TotalBytesLoaded * 8;
+        else
+          Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
+
+        SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+                                    DAG.getConstant(Shamt, DL, MVT::i32));
+
+        if (Val.getNode())
+          Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+        else
+          Val = Shift;
+
+        OffsetInBytes += LoadSizeInBytes;
+        TotalBytesLoaded += LoadSizeInBytes;
+        Alignment = std::min(Alignment, LoadSizeInBytes);
+      }
+
+      unsigned ArgReg = ArgRegs[FirstReg + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, Val));
+      return;
+    }
+  }
+
+  // Copy remainder of byval arg to it with memcpy.
+  unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
+  SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                            DAG.getConstant(OffsetInBytes, DL, PtrTy));
+  SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+                            DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
+                        DAG.getConstant(MemCpySize, DL, PtrTy),
+                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
+                        /*isTailCall=*/false,
+                        MachinePointerInfo(), MachinePointerInfo());
+  MemOpChains.push_back(Chain);
+}
+
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+                                         SDValue Chain, const SDLoc &DL,
+                                         SelectionDAG &DAG,
+                                         CCState &State) const {
+  ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs();
+  unsigned Idx = State.getFirstUnallocated(ArgRegs);
+  unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+  MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  // Offset of the first variable argument from stack pointer.
+  int VaArgOffset;
+
+  if (ArgRegs.size() == Idx)
+    VaArgOffset = alignTo(State.getNextStackOffset(), RegSizeInBytes);
+  else {
+    VaArgOffset =
+        (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+        (int)(RegSizeInBytes * (ArgRegs.size() - Idx));
+  }
+
+  // Record the frame index of the first variable argument
+  // which is a value necessary to VASTART.
+  int FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
+  MipsFI->setVarArgsFrameIndex(FI);
+
+  // Copy the integer registers that have not been used for argument passing
+  // to the argument register save area. For O32, the save area is allocated
+  // in the caller's stack frame, while for N32/64, it is allocated in the
+  // callee's stack frame.
+  for (unsigned I = Idx; I < ArgRegs.size();
+       ++I, VaArgOffset += RegSizeInBytes) {
+    unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
+    SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
+    FI = MFI.CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
+    SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    SDValue Store =
+        DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo());
+    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
+        (Value *)nullptr);
+    OutChains.push_back(Store);
+  }
+}
+
+void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                     unsigned Align) const {
+  const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
+
+  assert(Size && "Byval argument's size shouldn't be 0.");
+
+  Align = std::min(Align, TFL->getStackAlignment());
+
+  unsigned FirstReg = 0;
+  unsigned NumRegs = 0;
+
+  if (State->getCallingConv() != CallingConv::Fast) {
+    unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+    ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs();
+    // FIXME: The O32 case actually describes no shadow registers.
+    const MCPhysReg *ShadowRegs =
+        ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs;
+
+    // We used to check the size as well but we can't do that anymore since
+    // CCState::HandleByVal() rounds up the size after calling this function.
+    assert(!(Align % RegSizeInBytes) &&
+           "Byval argument's alignment should be a multiple of"
+           "RegSizeInBytes.");
+
+    FirstReg = State->getFirstUnallocated(IntArgRegs);
+
+    // If Align > RegSizeInBytes, the first arg register must be even.
+    // FIXME: This condition happens to do the right thing but it's not the
+    //        right way to test it. We want to check that the stack frame offset
+    //        of the register is aligned.
+    if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
+      State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
+      ++FirstReg;
+    }
+
+    // Mark the registers allocated.
+    Size = alignTo(Size, RegSizeInBytes);
+    for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
+         Size -= RegSizeInBytes, ++I, ++NumRegs)
+      State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+  }
+
+  State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
+}
+
+MachineBasicBlock *MipsTargetLowering::emitPseudoSELECT(MachineInstr &MI,
+                                                        MachineBasicBlock *BB,
+                                                        bool isFPCmp,
+                                                        unsigned Opc) const {
+  assert(!(Subtarget.hasMips4() || Subtarget.hasMips32()) &&
+         "Subtarget already supports SELECT nodes with the use of"
+         "conditional-move instructions.");
+
+  const TargetInstrInfo *TII =
+      Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // To "insert" a SELECT instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  if (isFPCmp) {
+    // bc1[tf] cc, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+        .addReg(MI.getOperand(1).getReg())
+        .addMBB(sinkMBB);
+  } else {
+    // bne rs, $0, sinkMBB
+    BuildMI(BB, DL, TII->get(Opc))
+        .addReg(MI.getOperand(1).getReg())
+        .addReg(Mips::ZERO)
+        .addMBB(sinkMBB);
+  }
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL, TII->get(Mips::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(thisMBB)
+      .addReg(MI.getOperand(3).getReg())
+      .addMBB(copy0MBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  return BB;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned MipsTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                               SelectionDAG &DAG) const {
+  // Named registers is expected to be fairly rare. For now, just support $28
+  // since the linux kernel uses it.
+  if (Subtarget.isGP64bit()) {
+    unsigned Reg = StringSwitch<unsigned>(RegName)
+                         .Case("$28", Mips::GP_64)
+                         .Default(0);
+    if (Reg)
+      return Reg;
+  } else {
+    unsigned Reg = StringSwitch<unsigned>(RegName)
+                         .Case("$28", Mips::GP)
+                         .Default(0);
+    if (Reg)
+      return Reg;
+  }
+  report_fatal_error("Invalid register name global variable");
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 000000000000..cddf0903ca6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,614 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetLowering.h"
+#include <deque>
+#include <string>
+
+namespace llvm {
+  namespace MipsISD {
+    enum NodeType : unsigned {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Tail call
+      TailCall,
+
+      // Get the Higher 16 bits from a 32-bit immediate
+      // No relation with Mips Hi register
+      Hi,
+
+      // Get the Lower 16 bits from a 32-bit immediate
+      // No relation with Mips Lo register
+      Lo,
+
+      // Handle gp_rel (small data/bss sections) relocation.
+      GPRel,
+
+      // Thread Pointer
+      ThreadPointer,
+
+      // Floating Point Branch Conditional
+      FPBrcond,
+
+      // Floating Point Compare
+      FPCmp,
+
+      // Floating Point Conditional Moves
+      CMovFP_T,
+      CMovFP_F,
+
+      // FP-to-int truncation node.
+      TruncIntFP,
+
+      // Return
+      Ret,
+
+      // Interrupt, exception, error trap Return
+      ERet,
+
+      // Software Exception Return.
+      EH_RETURN,
+
+      // Node used to extract integer from accumulator.
+      MFHI,
+      MFLO,
+
+      // Node used to insert integers to accumulator.
+      MTLOHI,
+
+      // Mult nodes.
+      Mult,
+      Multu,
+
+      // MAdd/Sub nodes
+      MAdd,
+      MAddu,
+      MSub,
+      MSubu,
+
+      // DivRem(u)
+      DivRem,
+      DivRemU,
+      DivRem16,
+      DivRemU16,
+
+      BuildPairF64,
+      ExtractElementF64,
+
+      Wrapper,
+
+      DynAlloc,
+
+      Sync,
+
+      Ext,
+      Ins,
+
+      // EXTR.W instrinsic nodes.
+      EXTP,
+      EXTPDP,
+      EXTR_S_H,
+      EXTR_W,
+      EXTR_R_W,
+      EXTR_RS_W,
+      SHILO,
+      MTHLIP,
+
+      // DPA.W intrinsic nodes.
+      MULSAQ_S_W_PH,
+      MAQ_S_W_PHL,
+      MAQ_S_W_PHR,
+      MAQ_SA_W_PHL,
+      MAQ_SA_W_PHR,
+      DPAU_H_QBL,
+      DPAU_H_QBR,
+      DPSU_H_QBL,
+      DPSU_H_QBR,
+      DPAQ_S_W_PH,
+      DPSQ_S_W_PH,
+      DPAQ_SA_L_W,
+      DPSQ_SA_L_W,
+      DPA_W_PH,
+      DPS_W_PH,
+      DPAQX_S_W_PH,
+      DPAQX_SA_W_PH,
+      DPAX_W_PH,
+      DPSX_W_PH,
+      DPSQX_S_W_PH,
+      DPSQX_SA_W_PH,
+      MULSA_W_PH,
+
+      MULT,
+      MULTU,
+      MADD_DSP,
+      MADDU_DSP,
+      MSUB_DSP,
+      MSUBU_DSP,
+
+      // DSP shift nodes.
+      SHLL_DSP,
+      SHRA_DSP,
+      SHRL_DSP,
+
+      // DSP setcc and select_cc nodes.
+      SETCC_DSP,
+      SELECT_CC_DSP,
+
+      // Vector comparisons.
+      // These take a vector and return a boolean.
+      VALL_ZERO,
+      VANY_ZERO,
+      VALL_NONZERO,
+      VANY_NONZERO,
+
+      // These take a vector and return a vector bitmask.
+      VCEQ,
+      VCLE_S,
+      VCLE_U,
+      VCLT_S,
+      VCLT_U,
+
+      // Element-wise vector max/min.
+      VSMAX,
+      VSMIN,
+      VUMAX,
+      VUMIN,
+
+      // Vector Shuffle with mask as an operand
+      VSHF,  // Generic shuffle
+      SHF,   // 4-element set shuffle.
+      ILVEV, // Interleave even elements
+      ILVOD, // Interleave odd elements
+      ILVL,  // Interleave left elements
+      ILVR,  // Interleave right elements
+      PCKEV, // Pack even elements
+      PCKOD, // Pack odd elements
+
+      // Vector Lane Copy
+      INSVE, // Copy element from one vector to another
+
+      // Combined (XOR (OR $a, $b), -1)
+      VNOR,
+
+      // Extended vector element extraction
+      VEXTRACT_SEXT_ELT,
+      VEXTRACT_ZEXT_ELT,
+
+      // Load/Store Left/Right nodes.
+      LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      LWR,
+      SWL,
+      SWR,
+      LDL,
+      LDR,
+      SDL,
+      SDR
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class MipsFunctionInfo;
+  class MipsSubtarget;
+  class MipsCCState;
+
+  class MipsTargetLowering : public TargetLowering  {
+    bool isMicroMips;
+  public:
+    explicit MipsTargetLowering(const MipsTargetMachine &TM,
+                                const MipsSubtarget &STI);
+
+    static const MipsTargetLowering *create(const MipsTargetMachine &TM,
+                                            const MipsSubtarget &STI);
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+      return MVT::i32;
+    }
+
+    bool isCheapToSpeculateCttz() const override;
+    bool isCheapToSpeculateCtlz() const override;
+
+    ISD::NodeType getExtendForAtomicOps() const override {
+      return ISD::SIGN_EXTEND;
+    }
+
+    void LowerOperationWrapper(SDNode *N,
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) const override;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    /// getSetCCResultType - get the ISD::SETCC result ValueType
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+      return ABI.IsN64() ? Mips::A0_64 : Mips::A0;
+    }
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+      return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
+    }
+
+    /// Returns true if a cast between SrcAS and DestAS is a noop.
+    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+      // Mips doesn't have any special address spaces so we just reserve
+      // the first 256 for software use (e.g. OpenCL) and treat casts
+      // between them as noops.
+      return SrcAS < 256 && DestAS < 256;
+    }
+
+    bool isJumpTableRelative() const override {
+      return getTargetMachine().isPositionIndependent() || ABI.IsN64();
+    }
+
+  protected:
+    SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
+
+    // This method creates the following nodes, which are necessary for
+    // computing a local symbol's address:
+    //
+    // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+    template <class NodeTy>
+    SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
+                         bool IsN32OrN64) const {
+      unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+      SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, GOTFlag));
+      SDValue Load =
+          DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+      unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+      SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+                               getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address:
+    //
+    // (load (wrapper $gp, %got(sym)))
+    template <class NodeTy>
+    SDValue getAddrGlobal(NodeTy *N, const SDLoc &DL, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag, SDValue Chain,
+                          const MachinePointerInfo &PtrInfo) const {
+      SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, Flag));
+      return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address in large-GOT mode:
+    //
+    // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+    template <class NodeTy>
+    SDValue getAddrGlobalLargeGOT(NodeTy *N, const SDLoc &DL, EVT Ty,
+                                  SelectionDAG &DAG, unsigned HiFlag,
+                                  unsigned LoFlag, SDValue Chain,
+                                  const MachinePointerInfo &PtrInfo) const {
+      SDValue Hi =
+          DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
+      Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+      SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+                                    getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address in non-PIC mode:
+    //
+    // (add %hi(sym), %lo(sym))
+    template <class NodeTy>
+    SDValue getAddrNonPIC(NodeTy *N, const SDLoc &DL, EVT Ty,
+                          SelectionDAG &DAG) const {
+      SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+      SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+                         DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address using gp-relative addressing:
+    //
+    // (add $gp, %gp_rel(sym))
+    template <class NodeTy>
+    SDValue getAddrGPRel(NodeTy *N, const SDLoc &DL, EVT Ty,
+                         SelectionDAG &DAG) const {
+      assert(Ty == MVT::i32);
+      SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getRegister(Mips::GP, Ty),
+                         DAG.getNode(MipsISD::GPRel, DL, DAG.getVTList(Ty),
+                                     GPRel));
+    }
+
+    /// This function fills Ops, which is the list of operands that will later
+    /// be used when a function call node is created. It also generates
+    /// copyToReg nodes to set up argument registers.
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const;
+
+  protected:
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+    // Subtarget Info
+    const MipsSubtarget &Subtarget;
+    // Cache the ABI from the TargetMachine, we use it everywhere.
+    const MipsABIInfo &ABI;
+
+  private:
+    // Create a TargetGlobalAddress node.
+    SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetExternalSymbol node.
+    SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetBlockAddress node.
+    SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetJumpTable node.
+    SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetConstantPool node.
+    SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Lower Operand helpers
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            TargetLowering::CallLoweringInfo &CLI) const;
+
+    // Lower Operand specifics
+    SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+                                 bool IsSRA) const;
+    SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+    /// isEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization.
+    virtual bool
+    isEligibleForTailCallOptimization(const CCState &CCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo &FI) const = 0;
+
+    /// copyByValArg - Copy argument registers which were used to pass a byval
+    /// argument to the stack. Create a stack frame object for the byval
+    /// argument.
+    void copyByValRegs(SDValue Chain, const SDLoc &DL,
+                       std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                       const ISD::ArgFlagsTy &Flags,
+                       SmallVectorImpl<SDValue> &InVals,
+                       const Argument *FuncArg, unsigned FirstReg,
+                       unsigned LastReg, const CCValAssign &VA,
+                       MipsCCState &State) const;
+
+    /// passByValArg - Pass a byval argument in registers or on stack.
+    void passByValArg(SDValue Chain, const SDLoc &DL,
+                      std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
+                      SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+                      MachineFrameInfo &MFI, SelectionDAG &DAG, SDValue Arg,
+                      unsigned FirstReg, unsigned LastReg,
+                      const ISD::ArgFlagsTy &Flags, bool isLittle,
+                      const CCValAssign &VA) const;
+
+    /// writeVarArgRegs - Write variable function arguments passed in registers
+    /// to the stack. Also create a stack frame object for the first variable
+    /// argument.
+    void writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain,
+                         const SDLoc &DL, SelectionDAG &DAG,
+                         CCState &State) const;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
+                           SDValue Arg, const SDLoc &DL, bool IsTailCall,
+                           SelectionDAG &DAG) const;
+
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
+
+    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                 const SDLoc &DL, SelectionDAG &DAG) const;
+
+    bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
+
+    // Inline asm support
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    /// This function parses registers that appear in inline-asm constraints.
+    /// It returns pair (0, 0) on failure.
+    std::pair<unsigned, const TargetRegisterClass *>
+    parseRegForInlineAsmConstraint(StringRef C, MVT VT) const;
+
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "R")
+        return InlineAsm::Constraint_R;
+      else if (ConstraintCode == "ZC")
+        return InlineAsm::Constraint_ZC;
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                            unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset,
+                            bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    unsigned getJumpTableEncoding() const override;
+    bool useSoftFloat() const override;
+
+    bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+      return true;
+    }
+
+    /// Emit a sign-extension using sll/sra, seb, or seh appropriately.
+    MachineBasicBlock *emitSignExtendToI32InReg(MachineInstr &MI,
+                                                MachineBasicBlock *BB,
+                                                unsigned Size, unsigned DstReg,
+                                                unsigned SrcRec) const;
+
+    MachineBasicBlock *emitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
+                                        unsigned Size, unsigned BinOpcode,
+                                        bool Nand = false) const;
+    MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr &MI,
+                                                MachineBasicBlock *BB,
+                                                unsigned Size,
+                                                unsigned BinOpcode,
+                                                bool Nand = false) const;
+    MachineBasicBlock *emitAtomicCmpSwap(MachineInstr &MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned Size) const;
+    MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr &MI,
+                                                 MachineBasicBlock *BB,
+                                                 unsigned Size) const;
+    MachineBasicBlock *emitSEL_D(MachineInstr &MI, MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitPseudoSELECT(MachineInstr &MI, MachineBasicBlock *BB,
+                                        bool isFPCmp, unsigned Opc) const;
+  };
+
+  /// Create MipsTargetLowering objects.
+  const MipsTargetLowering *
+  createMips16TargetLowering(const MipsTargetMachine &TM,
+                             const MipsSubtarget &STI);
+  const MipsTargetLowering *
+  createMipsSETargetLowering(const MipsTargetMachine &TM,
+                             const MipsSubtarget &STI);
+
+  namespace Mips {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 000000000000..ab7aa9dcdcae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,687 @@
+//===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+//    - 32 64-bit registers (default mode)
+//    - 16 even 32-bit registers (32-bit compatible mode) for
+//      single and double access.
+// * 32bit fp:
+//    - 16 even 32-bit registers - single and double (aliased)
+//    - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisInt<0>,
+                                            SDTCisVT<1, i32>,
+                                            SDTCisVT<2, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDT_MipsCMovFP : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>,
+                                          SDTCisSameAs<1, 3>]>;
+def SDT_MipsTruncIntFP : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+                                                SDTCisVT<1, i32>,
+                                                SDTCisSameAs<1, 2>]>;
+def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                     SDTCisVT<1, f64>,
+                                                     SDTCisVT<2, i32>]>;
+
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
+def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
+                          [SDNPHasChain, SDNPOptInGlue]>;
+def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
+def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
+                                   SDT_MipsExtractElementF64>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
+  def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+def IsFP64bit        : Predicate<"Subtarget->isFP64bit()">,
+                       AssemblerPredicate<"FeatureFP64Bit">;
+def NotFP64bit       : Predicate<"!Subtarget->isFP64bit()">,
+                       AssemblerPredicate<"!FeatureFP64Bit">;
+def IsSingleFloat    : Predicate<"Subtarget->isSingleFloat()">,
+                       AssemblerPredicate<"FeatureSingleFloat">;
+def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
+                       AssemblerPredicate<"!FeatureSingleFloat">;
+def IsNotSoftFloat   : Predicate<"!Subtarget->useSoftFloat()">,
+                       AssemblerPredicate<"!FeatureSoftFloat">;
+
+//===----------------------------------------------------------------------===//
+// Mips FGR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class FGR_32 { list<Predicate> FGRPredicates = [NotFP64bit]; }
+class FGR_64 { list<Predicate> FGRPredicates = [IsFP64bit]; }
+class HARDFLOAT { list<Predicate> HardFloatPredicate = [IsNotSoftFloat]; }
+
+//===----------------------------------------------------------------------===//
+
+// FP immediate patterns.
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimm0neg : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage.
+//
+// S32 - single precision in 16 32bit even fp registers
+//       single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// Only S32 and D32 are supported right now.
+//===----------------------------------------------------------------------===//
+class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
+              SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fs, $ft"),
+         [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR, opstr>,
+  HARDFLOAT {
+  let isCommutable = IsComm;
+}
+
+multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
+                  SDPatternOperator OpNode = null_frag> {
+  def _D32 : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>, FGR_32;
+  def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>, FGR_64 {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
+         [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>,
+  HARDFLOAT,
+  NeverHasSideEffects;
+
+multiclass ABSS_M<string opstr, InstrItinClass Itin,
+                  SDPatternOperator OpNode= null_frag> {
+  def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
+             FGR_32;
+  def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>, FGR_64 {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+multiclass ROUND_M<string opstr, InstrItinClass Itin> {
+  def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32;
+  def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 {
+    let DecoderNamespace = "Mips64";
+  }
+}
+
+class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR, opstr>, HARDFLOAT;
+
+class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR, opstr>, HARDFLOAT;
+
+class MTC1_64_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
+                 InstrItinClass Itin> :
+  InstSE<(outs DstRC:$fs), (ins DstRC:$fs_in, SrcRC:$rt),
+         !strconcat(opstr, "\t$rt, $fs"), [], Itin, FrmFR, opstr>, HARDFLOAT {
+  // $fs_in is part of a white lie to work around a widespread bug in the FPU
+  // implementation. See expandBuildPairF64 for details.
+  let Constraints = "$fs = $fs_in";
+}
+
+class LW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+            InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr>,
+  HARDFLOAT {
+  let DecoderMethod = "DecodeFMem";
+  let mayLoad = 1;
+}
+
+class SW_FT<string opstr, RegisterOperand RC, DAGOperand MO,
+            InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs), (ins RC:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr>, HARDFLOAT {
+  let DecoderMethod = "DecodeFMem";
+  let mayStore = 1;
+}
+
+class MADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+               SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin,
+         FrmFR, opstr>, HARDFLOAT;
+
+class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+                SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
+         Itin, FrmFR, opstr>, HARDFLOAT;
+
+class LWXC1_FT<string opstr, RegisterOperand DRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$fd, ${index}(${base})"),
+         [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin,
+         FrmFI, opstr>, HARDFLOAT {
+  let AddedComplexity = 20;
+}
+
+class SWXC1_FT<string opstr, RegisterOperand DRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$fs, ${index}(${base})"),
+         [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin,
+         FrmFI, opstr>, HARDFLOAT {
+  let AddedComplexity = 20;
+}
+
+class BC1F_FT<string opstr, DAGOperand opnd, InstrItinClass Itin,
+              SDPatternOperator Op = null_frag, bit DelaySlot = 1> :
+  InstSE<(outs), (ins FCCRegsOpnd:$fcc, opnd:$offset),
+         !strconcat(opstr, "\t$fcc, $offset"),
+         [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin,
+         FrmFI, opstr>, HARDFLOAT {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = DelaySlot;
+  let Defs = [AT];
+}
+
+class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
+              SDPatternOperator OpNode = null_frag>  :
+  InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
+         !strconcat("c.$cond.", typestr, "\t$fs, $ft"),
+         [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR,
+         !strconcat("c.$cond.", typestr)>, HARDFLOAT {
+  let Defs = [FCC0];
+  let isCodeGenOnly = 1;
+}
+
+class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC,
+                InstrItinClass itin>  :
+   InstSE<(outs), (ins RC:$fs, RC:$ft),
+          !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], itin,
+          FrmFR>, HARDFLOAT;
+
+multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
+                    InstrItinClass itin> {
+  def C_F_#NAME : C_COND_FT<"f", TypeStr, RC, itin>, C_COND_FM<fmt, 0>;
+  def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC, itin>, C_COND_FM<fmt, 1>;
+  def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC, itin>, C_COND_FM<fmt, 2>;
+  def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC, itin>, C_COND_FM<fmt, 3>;
+  def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC, itin>, C_COND_FM<fmt, 4>;
+  def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC, itin>, C_COND_FM<fmt, 5>;
+  def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC, itin>, C_COND_FM<fmt, 6>;
+  def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC, itin>, C_COND_FM<fmt, 7>;
+  def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC, itin>, C_COND_FM<fmt, 8>;
+  def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC, itin>, C_COND_FM<fmt, 9>;
+  def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC, itin>, C_COND_FM<fmt, 10>;
+  def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC, itin>, C_COND_FM<fmt, 11>;
+  def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC, itin>, C_COND_FM<fmt, 12>;
+  def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC, itin>, C_COND_FM<fmt, 13>;
+  def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC, itin>, C_COND_FM<fmt, 14>;
+  def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC, itin>, C_COND_FM<fmt, 15>;
+}
+
+defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>, ISA_MIPS1_NOT_32R6_64R6;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+           FGR_32;
+let DecoderNamespace = "Mips64" in
+defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6,
+           FGR_64;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+def ROUND_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
+                 ABSS_FM<0xc, 16>, ISA_MIPS2;
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+def TRUNC_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
+                 ABSS_FM<0xd, 16>, ISA_MIPS2;
+def CEIL_W_S   : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
+                 ABSS_FM<0xe, 16>, ISA_MIPS2;
+def FLOOR_W_S  : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
+                 ABSS_FM<0xf, 16>, ISA_MIPS2;
+def CVT_W_S    : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
+                 ABSS_FM<0x24, 16>;
+
+defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+defm CEIL_W  : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
+defm CVT_W   : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def RECIP_S : MMRel, ABSS_FT<"recip.s", FGR32Opnd, FGR32Opnd, II_RECIP_S>,
+                ABSS_FM<0b010101, 0x10>, INSN_MIPS4_32R2;
+  def RECIP_D : MMRel, ABSS_FT<"recip.d", FGR64Opnd, FGR64Opnd, II_RECIP_D>,
+                ABSS_FM<0b010101, 0x11>, INSN_MIPS4_32R2;
+  def RSQRT_S : MMRel, ABSS_FT<"rsqrt.s", FGR32Opnd, FGR32Opnd, II_RSQRT_S>,
+                ABSS_FM<0b010110, 0x10>, INSN_MIPS4_32R2;
+  def RSQRT_D : MMRel, ABSS_FT<"rsqrt.d", FGR64Opnd, FGR64Opnd, II_RSQRT_D>,
+                ABSS_FM<0b010110, 0x11>, INSN_MIPS4_32R2;
+}
+let DecoderNamespace = "Mips64" in {
+  let AdditionalPredicates = [NotInMicroMips] in {
+  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
+                  ABSS_FM<0x8, 16>, FGR_64;
+  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
+                    ABSS_FM<0x8, 17>, FGR_64;
+  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
+                  ABSS_FM<0x9, 16>, FGR_64;
+  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
+                    ABSS_FM<0x9, 17>, FGR_64;
+  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
+                  ABSS_FM<0xa, 16>, FGR_64;
+  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
+                   ABSS_FM<0xa, 17>, FGR_64;
+  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
+                  ABSS_FM<0xb, 16>, FGR_64;
+  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
+                    ABSS_FM<0xb, 17>, FGR_64;
+  }
+}
+
+def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
+              ABSS_FM<0x20, 20>;
+let AdditionalPredicates = [NotInMicroMips] in{
+  def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
+  def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
+                 ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+}
+
+def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                ABSS_FM<0x20, 17>, FGR_32;
+def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 20>, FGR_32;
+def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 16>, FGR_32;
+
+let DecoderNamespace = "Mips64" in {
+  def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
+                  ABSS_FM<0x20, 17>, FGR_64;
+  let AdditionalPredicates = [NotInMicroMips] in{
+    def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
+                    ABSS_FM<0x20, 21>, FGR_64;
+  }
+  def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
+                  ABSS_FM<0x21, 20>, FGR_64;
+  def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
+                  ABSS_FM<0x21, 16>, FGR_64;
+  def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
+                  ABSS_FM<0x21, 21>, FGR_64;
+}
+
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+  def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, II_CVT>;
+  def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
+}
+
+def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+             ABSS_FM<0x5, 16>;
+def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+             ABSS_FM<0x7, 16>;
+defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
+
+def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd,
+              II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2;
+defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers,
+// regardless of register aliasing.
+
+/// Move Control Registers From/To CPU Registers
+let AdditionalPredicates = [NotInMicroMips] in {
+  def CFC1 : MMRel, MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, II_CFC1>, MFC1_FM<2>;
+  def CTC1 : MMRel, MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, II_CTC1>, MFC1_FM<6>;
+}
+def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
+                          bitconvert>, MFC1_FM<0>;
+def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
+                          bitconvert>, MFC1_FM<4>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+                  MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
+  def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+                  MFC1_FM<3>, ISA_MIPS32R2, FGR_64 {
+    let DecoderNamespace = "Mips64";
+  }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+  def MTHC1_D32 : MMRel, StdMMR6Rel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+                  MFC1_FM<7>, ISA_MIPS32R2, FGR_32;
+  def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+                  MFC1_FM<7>, ISA_MIPS32R2, FGR_64 {
+    let DecoderNamespace = "Mips64";
+  }
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
+              bitconvert>, MFC1_FM<5>, ISA_MIPS3;
+  def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
+                      bitconvert>, MFC1_FM<1>, ISA_MIPS3;
+}
+
+def FMOV_S   : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
+               ABSS_FM<0x6, 16>;
+def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
+               ABSS_FM<0x6, 17>, FGR_32;
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
+               ABSS_FM<0x6, 17>, FGR_64 {
+                 let DecoderNamespace = "Mips64";
+}
+
+/// Floating Point Memory Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, mem_simm16, II_LWC1, load>,
+             LW_FM<0x31>;
+  def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, mem_simm16, II_SWC1, store>,
+             LW_FM<0x39>;
+}
+
+let DecoderNamespace = "Mips64", AdditionalPredicates = [NotInMicroMips] in {
+  def LDC164 : StdMMR6Rel, LW_FT<"ldc1", FGR64Opnd, mem_simm16, II_LDC1, load>,
+               LW_FM<0x35>, ISA_MIPS2, FGR_64 {
+    let BaseOpcode = "LDC164";
+  }
+  def SDC164 : StdMMR6Rel, SW_FT<"sdc1", FGR64Opnd, mem_simm16, II_SDC1, store>,
+               LW_FM<0x3d>, ISA_MIPS2, FGR_64;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LDC1 : MMRel, StdMMR6Rel, LW_FT<"ldc1", AFGR64Opnd, mem_simm16, II_LDC1,
+                                      load>, LW_FM<0x35>, ISA_MIPS2, FGR_32 {
+    let BaseOpcode = "LDC132";
+  }
+  def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, mem_simm16, II_SDC1, store>,
+             LW_FM<0x3d>, ISA_MIPS2, FGR_32;
+}
+
+// Indexed loads and stores.
+// Base register + offset register addressing mode (indicated by "x" in the
+// instruction mnemonic) is disallowed under NaCl.
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6;
+  def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+              INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+                INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+                INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+// Load/store doubleword indexed unaligned.
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+              INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+  def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+              INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+                INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+                INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+/// Floating-point Aritmetic
+def FADD_S : MMRel, ADDS_FT<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>,
+             ADDS_FM<0x00, 16>;
+defm FADD :  ADDS_M<"add.d", II_ADD_D, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : MMRel, ADDS_FT<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>,
+             ADDS_FM<0x03, 16>;
+defm FDIV :  ADDS_M<"div.d", II_DIV_D, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : MMRel, ADDS_FT<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>,
+             ADDS_FM<0x02, 16>;
+defm FMUL :  ADDS_M<"mul.d", II_MUL_D, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
+             ADDS_FM<0x01, 16>;
+defm FSUB :  ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
+
+def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+             MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+             MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+  def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
+                MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+  def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
+                MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+}
+
+def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+               MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+               MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+
+let AdditionalPredicates = [NoNaNsFPMath] in {
+  def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+  def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+}
+
+let DecoderNamespace = "Mips64" in {
+  def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
+                 MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+  def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
+                 MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+let AdditionalPredicates = [NoNaNsFPMath],
+    DecoderNamespace = "Mips64" in {
+  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
+                  MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
+                  MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h.
+// They must be kept in synch.
+def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
+
+def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>,
+           BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>,
+            BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>,
+           BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>,
+            BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6;
+
+/// Floating Point Compare
+let AdditionalPredicates = [NotInMicroMips] in {
+  def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>,
+                 ISA_MIPS1_NOT_32R6_64R6;
+  def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+                 ISA_MIPS1_NOT_32R6_64R6, FGR_32;
+}
+let DecoderNamespace = "Mips64" in
+def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
+               ISA_MIPS1_NOT_32R6_64R6, FGR_64;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+
+// This pseudo instr gets expanded into 2 mtc1 instrs after register
+// allocation.
+class BuildPairF64Base<RegisterOperand RO> :
+  PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+           [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))],
+           II_MTC1>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
+
+// This pseudo instr gets expanded into 2 mfc1 instrs after register
+// allocation.
+// if n is 0, lower part of src is extracted.
+// if n is 1, higher part of src is extracted.
+// This node has associated scheduling information as the pre RA scheduler
+// asserts otherwise.
+class ExtractElementF64Base<RegisterOperand RO> :
+  PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+           [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))],
+           II_MFC1>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>, FGR_32, HARDFLOAT;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>, FGR_64, HARDFLOAT;
+
+def PseudoTRUNC_W_S : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+                                        (ins FGR32Opnd:$fs, GPR32Opnd:$rs),
+                                        "trunc.w.s\t$fd, $fs, $rs">;
+
+def PseudoTRUNC_W_D32 : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+                                          (ins AFGR64Opnd:$fs, GPR32Opnd:$rs),
+                                          "trunc.w.d\t$fd, $fs, $rs">,
+                        FGR_32, HARDFLOAT;
+
+def PseudoTRUNC_W_D : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
+                                        (ins FGR64Opnd:$fs, GPR32Opnd:$rs),
+                                        "trunc.w.d\t$fd, $fs, $rs">,
+                      FGR_64, HARDFLOAT;
+
+//===----------------------------------------------------------------------===//
+// InstAliases.
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>,
+      ISA_MIPS1_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1tl $offset", (BC1TL FCC0, brtarget:$offset)>,
+      ISA_MIPS2_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>,
+      ISA_MIPS1_NOT_32R6_64R6, HARDFLOAT;
+def : MipsInstAlias<"bc1fl $offset", (BC1FL FCC0, brtarget:$offset)>,
+      ISA_MIPS2_NOT_32R6_64R6, HARDFLOAT;
+
+def : MipsInstAlias
+        <"s.s $fd, $addr", (SWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
+      ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+        <"s.d $fd, $addr", (SDC1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+      FGR_32, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+        <"s.d $fd, $addr", (SDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
+      FGR_64, ISA_MIPS2, HARDFLOAT;
+
+def : MipsInstAlias
+        <"l.s $fd, $addr", (LWC1 FGR32Opnd:$fd, mem_simm16:$addr), 0>,
+      ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+        <"l.d $fd, $addr", (LDC1 AFGR64Opnd:$fd, mem_simm16:$addr), 0>,
+      FGR_32, ISA_MIPS2, HARDFLOAT;
+def : MipsInstAlias
+        <"l.d $fd, $addr", (LDC164 FGR64Opnd:$fd, mem_simm16:$addr), 0>,
+      FGR_64, ISA_MIPS2, HARDFLOAT;
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+
+def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_S_W GPR32Opnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S FGR32Opnd:$src)>;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+              (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f32 (fpround AFGR64Opnd:$src)),
+              (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+              (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
+def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
+              (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>, FGR_64;
+def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
+              (PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(f32 (fpround FGR64Opnd:$src)),
+              (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
+              (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
+
+// Patterns for loads/stores with a reg+imm operand.
+let AdditionalPredicates = [NotInMicroMips] in {
+  let AddedComplexity = 40 in {
+    def : LoadRegImmPat<LWC1, f32, load>;
+    def : StoreRegImmPat<SWC1, f32>;
+
+    def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
+    def : StoreRegImmPat<SDC164, f64>, FGR_64;
+
+    def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
+    def : StoreRegImmPat<SDC1, f64>, FGR_32;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 000000000000..1437fb75434a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,968 @@
+//===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  rs      - src reg.
+//  rt      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  rd      - dst reg, only used on 3 regs instr.
+//  shamt   - only used on shift instructions, contains the shift amount.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<4> val> {
+  bits<4> Value = val;
+}
+
+def Pseudo    : Format<0>;
+def FrmR      : Format<1>;
+def FrmI      : Format<2>;
+def FrmJ      : Format<3>;
+def FrmFR     : Format<4>;
+def FrmFI     : Format<5>;
+def FrmOther  : Format<6>; // Instruction w/ a custom format
+
+class MMRel;
+
+def Std2MicroMips : InstrMapping {
+  let FilterClass = "MMRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["Arch"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = ["se"];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["se"], ["micromips"]];
+}
+
+class StdMMR6Rel;
+
+def Std2MicroMipsR6 : InstrMapping {
+  let FilterClass = "StdMMR6Rel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["Arch"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = ["se"];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["se"], ["micromipsr6"]];
+}
+
+class StdArch {
+  string Arch = "se";
+}
+
+// Generic Mips Format
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+               InstrItinClass itin, Format f>: Instruction
+{
+  field bits<32> Inst;
+  Format Form = f;
+
+  let Namespace = "Mips";
+
+  let Size = 4;
+
+  bits<6> Opcode = 0;
+
+  // Top 6 bits are the 'opcode' field
+  let Inst{31-26} = Opcode;
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  //
+  // Attributes specific to Mips instructions...
+  //
+  bits<4> FormBits     = Form.Value;
+  bit isCTI            = 0; // Any form of Control Transfer Instruction.
+                            // Required for MIPSR6
+  bit hasForbiddenSlot = 0; // Instruction has a forbidden slot.
+  bit IsPCRelativeLoad = 0; // Load instruction with implicit source register
+                            // ($pc) and with explicit offset and destination
+                            // register
+
+  // TSFlags layout should be kept in sync with MipsInstrInfo.h.
+  let TSFlags{3-0}   = FormBits;
+  let TSFlags{4}     = isCTI;
+  let TSFlags{5}     = hasForbiddenSlot;
+  let TSFlags{6}     = IsPCRelativeLoad;
+
+  let DecoderNamespace = "Mips";
+
+  field bits<32> SoftFail = 0;
+}
+
+// Mips32/64 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
+             InstrItinClass itin, Format f, string opstr = ""> :
+  MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+  string BaseOpcode = opstr;
+  string Arch;
+}
+
+// Mips Pseudo Instructions Format
+class MipsPseudo<dag outs, dag ins, list<dag> pattern,
+                 InstrItinClass itin = IIPseudo> :
+  MipsInst<outs, ins, "", pattern, itin, Pseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// Mips32/64 Pseudo Instruction Format
+class PseudoSE<dag outs, dag ins, list<dag> pattern,
+               InstrItinClass itin = IIPseudo> :
+  MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
+  MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl {
+  let isPseudo = 1;
+  let Pattern = [];
+}
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+         list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmR>
+{
+  bits<5>  rd;
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<5>  shamt;
+  bits<6>  funct;
+
+  let Opcode = op;
+  let funct  = _funct;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op> : StdArch
+{
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+//===----------------------------------------------------------------------===//
+// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+//===----------------------------------------------------------------------===//
+class MFC3OP_FM<bits<6> op, bits<5> mfmt>
+{
+  bits<5> rt;
+  bits<5> rd;
+  bits<3> sel;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = mfmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-3}  = 0;
+  let Inst{2-0}   = sel;
+}
+
+class MFC2OP_FM<bits<6> op, bits<5> mfmt> : StdArch {
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = mfmt;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class ADD_FM<bits<6> op, bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class ADDI_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class SRA_FM<bits<6> funct, bit rotate> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-22} = 0;
+  let Inst{21}    = rotate;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-7}  = 0;
+  let Inst{6}     = rotate;
+  let Inst{5-0}   = funct;
+}
+
+class BEQ_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class BBIT_FM<bits<6> op> : StdArch {
+  bits<5>  rs;
+  bits<5>  p;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = p;
+  let Inst{15-0}  = offset;
+}
+
+class SLTI_FM<bits<6> op> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class MFLO_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class MTLO_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = funct;
+  let Inst{5-0}   = funct2;
+}
+
+class CLO_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+  let rt = rd;
+}
+
+class LUI_FM : StdArch {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0xf;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class JALR_FM {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 9;
+}
+
+class BGEZAL_FM<bits<5> funct> : StdArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class SYNC_FM : StdArch {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{10-6}  = stype;
+  let Inst{5-0}   = 0xf;
+}
+
+class SYNCI_FM : StdArch {
+  // Produced by the mem_simm16 address as reg << 16 | imm (see getMemEncoding).
+  bits<21> addr;
+  bits<5> rs = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b000001;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0b11111;
+  let Inst{15-0}  = offset;
+}
+
+class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
+  bits<5>  rs;
+  bits<5>  rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class EXT_FM<bits<6> funct> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class RDHWR_FM : StdArch {
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 0x3b;
+}
+
+class TEQ_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = code_;
+  let Inst{5-0}   = funct;
+}
+
+class TEQI_FM<bits<5> funct> : StdArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16}   = funct;
+  let Inst{15-0}  = imm16;
+}
+
+class WAIT_FM : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = 0x20;
+}
+
+class EXTS_FM<bits<6> funct> : StdArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> lenm1;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = lenm1;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class MTMR_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class POP_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEQ_FM<bits<6> funct> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEQI_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> imm10;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = imm10;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  System calls format <op|code_|funct>
+//===----------------------------------------------------------------------===//
+
+class SYS_FM<bits<6> funct> : StdArch
+{
+  bits<20> code_;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-6} = code_;
+  let Inst{5-0}  = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Break instruction format <op|code_1|funct>
+//===----------------------------------------------------------------------===//
+
+class BRK_FM<bits<6> funct> : StdArch
+{
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Exception return format <Cop0|1|0|funct>
+//===----------------------------------------------------------------------===//
+
+class ER_FM<bits<6> funct, bit LLBit> : StdArch
+{
+  bits<32> Inst;
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-7}  = 0;
+  let Inst{6} = LLBit;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc> : StdArch
+{
+  bits<32> Inst;
+  bits<5> rt;
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xb;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0xc;
+  let Inst{10-6}  = 0;
+  let Inst{5}     = sc;
+  let Inst{4-0}   = 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  FLOATING POINT INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  fs      - src reg.
+//  ft      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  fd      - dst reg, only used on 3 regs instr.
+//  fmt     - double or single precision.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
+  InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
+{
+  bits<5>  ft;
+  bits<5>  base;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = ft;
+  let Inst{15-0}  = imm16;
+}
+
+class ADDS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class ABSS_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class MFC1_FM<bits<5> funct> : StdArch {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-0}  = 0;
+}
+
+class LW_FM<bits<6> op> : StdArch {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = addr{20-16};
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = addr{15-0};
+}
+
+class MADDS_FM<bits<3> funct, bits<3> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fr;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = fr;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-3}   = funct;
+  let Inst{2-0}   = fmt;
+}
+
+class LWXC1_FM<bits<6> funct> : StdArch {
+  bits<5> fd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class SWXC1_FM<bits<6> funct> : StdArch {
+  bits<5> fs;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class BC1F_FM<bit nd, bit tf> : StdArch {
+  bits<3>  fcc;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = 0x8;
+  let Inst{20-18} = fcc;
+  let Inst{17} = nd;
+  let Inst{16} = tf;
+  let Inst{15-0} = offset;
+}
+
+class CEQS_FM<bits<5> fmt> : StdArch {
+  bits<5> fs;
+  bits<5> ft;
+  bits<4> cond;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-8} = 0; // cc
+  let Inst{7-4} = 0x3;
+  let Inst{3-0} = cond;
+}
+
+class C_COND_FM<bits<5> fmt, bits<4> c> : CEQS_FM<fmt> {
+  let cond = c;
+}
+
+class CMov_I_F_FM<bits<6> funct, bits<5> fmt> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class CMov_F_I_FM<bit tf> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-18} = fcc;
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = 1;
+}
+
+class CMov_F_F_FM<bits<5> fmt, bit tf> : StdArch {
+  bits<5> fd;
+  bits<5> fs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-18} = fcc;
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = 0x11;
+}
+
+class BARRIER_FM<bits<5> op> : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0; // rt = 0
+  let Inst{15-11} = 0; // rd = 0
+  let Inst{10-6} = op; // Operation
+  let Inst{5-0} = 0;   // SLL
+}
+
+class SDBBP_FM : StdArch {
+  bits<20> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0b011100; // SPECIAL2
+  let Inst{25-6} = code_;
+  let Inst{5-0} = 0b111111;   // SDBBP
+}
+
+class JR_HB_FM<bits<6> op> : StdArch{
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = rs;
+  let Inst{20-11} = 0;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = op;
+}
+
+class JALR_HB_FM<bits<6> op> : StdArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0; // SPECIAL
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10} = 1;
+  let Inst{9-6} = 0;
+  let Inst{5-0} = op;
+}
+
+class COP0_TLB_FM<bits<6> op> : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10; // COP0
+  let Inst{25} = 1;       // CO
+  let Inst{24-6} = 0;
+  let Inst{5-0} = op;     // Operation
+}
+
+class CACHEOP_FM<bits<6> op> : StdArch {
+  bits<21> addr;
+  bits<5> hint;
+  bits<5> base = addr{20-16};
+  bits<16> offset = addr{15-0};
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = base;
+  let Inst{20-16} = hint;
+  let Inst{15-0}  = offset;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 000000000000..19af1914c819
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,503 @@
+//===-- MipsInstrInfo.cpp - Mips Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "MipsGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void MipsInstrInfo::anchor() {}
+
+MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
+    : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+      Subtarget(STI), UncondBrOpc(UncondBr) {}
+
+const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16InstrInfo(STI);
+
+  return llvm::createMipsSEInstrInfo(STI);
+}
+
+bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// insertNoop - If data hazard condition is found insert the target nop
+/// instruction.
+// FIXME: This appears to be dead code.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
+{
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+MachineMemOperand *
+MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+                             MachineMemOperand::Flags Flags) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+                                 Flags, MFI.getObjectSize(FI), Align);
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                                  MachineBasicBlock *&BB,
+                                  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(getAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+  int NumOp = Inst->getNumExplicitOperands();
+
+  // for both int and fp branches, the last explicit operand is the
+  // MBB.
+  BB = Inst->getOperand(NumOp-1).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Opc));
+
+  for (int i=0; i<NumOp-1; i++)
+    Cond.push_back(Inst->getOperand(i));
+}
+
+bool MipsInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  BranchType BT = analyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
+
+  return (BT == BT_None) || (BT == BT_Indirect);
+}
+
+void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                const DebugLoc &DL,
+                                ArrayRef<MachineOperand> Cond) const {
+  unsigned Opc = Cond[0].getImm();
+  const MCInstrDesc &MCID = get(Opc);
+  MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
+
+  for (unsigned i = 1; i < Cond.size(); ++i) {
+    if (Cond[i].isReg())
+      MIB.addReg(Cond[i].getReg());
+    else if (Cond[i].isImm())
+      MIB.addImm(Cond[i].getImm());
+    else
+       assert(false && "Cannot copy operand");
+  }
+  MIB.addMBB(TBB);
+}
+
+unsigned MipsInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *TBB,
+                                     MachineBasicBlock *FBB,
+                                     ArrayRef<MachineOperand> Cond,
+                                     const DebugLoc &DL,
+                                     int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert(!BytesAdded && "code size not handled");
+
+  // # of condition operands:
+  //  Unconditional branches: 0
+  //  Floating point branches: 1 (opc)
+  //  Int BranchZero: 2 (opc, reg)
+  //  Int Branch: 3 (opc, reg0, reg1)
+  assert((Cond.size() <= 3) &&
+         "# of Mips branch conditions must be <= 3!");
+
+  // Two-way Conditional branch.
+  if (FBB) {
+    BuildCondBr(MBB, TBB, DL, Cond);
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(FBB);
+    return 2;
+  }
+
+  // One way branch.
+  // Unconditional branch.
+  if (Cond.empty())
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(TBB);
+  else // Conditional branch.
+    BuildCondBr(MBB, TBB, DL, Cond);
+  return 1;
+}
+
+unsigned MipsInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                     int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+  unsigned removed;
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  if (I == REnd)
+    return 0;
+
+  MachineBasicBlock::iterator FirstBr = ++I.getReverse();
+
+  // Up to 2 branches are removed.
+  // Note that indirect branches are not removed.
+  for (removed = 0; I != REnd && removed < 2; ++I, ++removed)
+    if (!getAnalyzableBrOpc(I->getOpcode()))
+      break;
+
+  MBB.erase((--I).getReverse(), FirstBr);
+
+  return removed;
+}
+
+/// reverseBranchCondition - Return the inverse opcode of the
+/// specified Branch instruction.
+bool MipsInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert( (Cond.size() && Cond.size() <= 3) &&
+          "Invalid Mips branch condition!");
+  Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
+  return false;
+}
+
+MipsInstrInfo::BranchType MipsInstrInfo::analyzeBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond, bool AllowModify,
+    SmallVectorImpl<MachineInstr *> &BranchInstrs) const {
+
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  if (I == REnd || !isUnpredicatedTerminator(*I)) {
+    // This block ends with no branches (it just falls through to its succ).
+    // Leave TBB/FBB null.
+    TBB = FBB = nullptr;
+    return BT_NoBranch;
+  }
+
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+  BranchInstrs.push_back(LastInst);
+
+  // Not an analyzable branch (e.g., indirect jump).
+  if (!getAnalyzableBrOpc(LastOpc))
+    return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
+
+  // Get the second to last instruction in the block.
+  unsigned SecondLastOpc = 0;
+  MachineInstr *SecondLastInst = nullptr;
+
+  if (++I != REnd) {
+    SecondLastInst = &*I;
+    SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
+
+    // Not an analyzable branch (must be an indirect jump).
+    if (isUnpredicatedTerminator(*SecondLastInst) && !SecondLastOpc)
+      return BT_None;
+  }
+
+  // If there is only one terminator instruction, process it.
+  if (!SecondLastOpc) {
+    // Unconditional branch.
+    if (LastInst->isUnconditionalBranch()) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return BT_Uncond;
+    }
+
+    // Conditional branch
+    AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
+    return BT_Cond;
+  }
+
+  // If we reached here, there are two branches.
+  // If there are three terminators, we don't know what sort of block this is.
+  if (++I != REnd && isUnpredicatedTerminator(*I))
+    return BT_None;
+
+  BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst);
+
+  // If second to last instruction is an unconditional branch,
+  // analyze it and remove the last instruction.
+  if (SecondLastInst->isUnconditionalBranch()) {
+    // Return if the last instruction cannot be removed.
+    if (!AllowModify)
+      return BT_None;
+
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    LastInst->eraseFromParent();
+    BranchInstrs.pop_back();
+    return BT_Uncond;
+  }
+
+  // Conditional branch followed by an unconditional branch.
+  // The last one must be unconditional.
+  if (!LastInst->isUnconditionalBranch())
+    return BT_None;
+
+  AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
+  FBB = LastInst->getOperand(0).getMBB();
+
+  return BT_CondUncond;
+}
+
+/// Return the corresponding compact (no delay slot) form of a branch.
+unsigned MipsInstrInfo::getEquivalentCompactForm(
+    const MachineBasicBlock::iterator I) const {
+  unsigned Opcode = I->getOpcode();
+  bool canUseShortMicroMipsCTI = false;
+
+  if (Subtarget.inMicroMipsMode()) {
+    switch (Opcode) {
+    case Mips::BNE:
+    case Mips::BNE_MM:
+    case Mips::BEQ:
+    case Mips::BEQ_MM:
+    // microMIPS has NE,EQ branches that do not have delay slots provided one
+    // of the operands is zero.
+      if (I->getOperand(1).getReg() == Subtarget.getABI().GetZeroReg())
+        canUseShortMicroMipsCTI = true;
+      break;
+    // For microMIPS the PseudoReturn and PseudoIndirectBranch are always
+    // expanded to JR_MM, so they can be replaced with JRC16_MM.
+    case Mips::JR:
+    case Mips::PseudoReturn:
+    case Mips::PseudoIndirectBranch:
+    case Mips::TAILCALLREG:
+      canUseShortMicroMipsCTI = true;
+      break;
+    }
+  }
+
+  // MIPSR6 forbids both operands being the zero register.
+  if (Subtarget.hasMips32r6() && (I->getNumOperands() > 1) &&
+      (I->getOperand(0).isReg() &&
+       (I->getOperand(0).getReg() == Mips::ZERO ||
+        I->getOperand(0).getReg() == Mips::ZERO_64)) &&
+      (I->getOperand(1).isReg() &&
+       (I->getOperand(1).getReg() == Mips::ZERO ||
+        I->getOperand(1).getReg() == Mips::ZERO_64)))
+    return 0;
+
+  if (Subtarget.hasMips32r6() || canUseShortMicroMipsCTI) {
+    switch (Opcode) {
+    case Mips::B:
+      return Mips::BC;
+    case Mips::BAL:
+      return Mips::BALC;
+    case Mips::BEQ:
+    case Mips::BEQ_MM:
+      if (canUseShortMicroMipsCTI)
+        return Mips::BEQZC_MM;
+      else if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BEQC;
+    case Mips::BNE:
+    case Mips::BNE_MM:
+      if (canUseShortMicroMipsCTI)
+        return Mips::BNEZC_MM;
+      else if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BNEC;
+    case Mips::BGE:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BGEC;
+    case Mips::BGEU:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BGEUC;
+    case Mips::BGEZ:
+      return Mips::BGEZC;
+    case Mips::BGTZ:
+      return Mips::BGTZC;
+    case Mips::BLEZ:
+      return Mips::BLEZC;
+    case Mips::BLT:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BLTC;
+    case Mips::BLTU:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BLTUC;
+    case Mips::BLTZ:
+      return Mips::BLTZC;
+    case Mips::BEQ64:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BEQC64;
+    case Mips::BNE64:
+      if (I->getOperand(0).getReg() == I->getOperand(1).getReg())
+        return 0;
+      return Mips::BNEC64;
+    case Mips::BGTZ64:
+      return Mips::BGTZC64;
+    case Mips::BGEZ64:
+      return Mips::BGEZC64;
+    case Mips::BLTZ64:
+      return Mips::BLTZC64;
+    case Mips::BLEZ64:
+      return Mips::BLEZC64;
+    // For MIPSR6, the instruction 'jic' can be used for these cases. Some
+    // tools will accept 'jrc reg' as an alias for 'jic 0, $reg'.
+    case Mips::JR:
+    case Mips::PseudoReturn:
+    case Mips::PseudoIndirectBranch:
+    case Mips::TAILCALLREG:
+      if (canUseShortMicroMipsCTI)
+        return Mips::JRC16_MM;
+      return Mips::JIC;
+    case Mips::JALRPseudo:
+      return Mips::JIALC;
+    case Mips::JR64:
+    case Mips::PseudoReturn64:
+    case Mips::PseudoIndirectBranch64:
+    case Mips::TAILCALLREG64:
+      return Mips::JIC64;
+    case Mips::JALR64Pseudo:
+      return Mips::JIALC64;
+    default:
+      return 0;
+    }
+  }
+
+  return 0;
+}
+
+/// Predicate for distingushing between control transfer instructions and all
+/// other instructions for handling forbidden slots. Consider inline assembly
+/// as unsafe as well.
+bool MipsInstrInfo::SafeInForbiddenSlot(const MachineInstr &MI) const {
+  if (MI.isInlineAsm())
+    return false;
+
+  return (MI.getDesc().TSFlags & MipsII::IsCTI) == 0;
+
+}
+
+/// Predicate for distingushing instructions that have forbidden slots.
+bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const {
+  return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0;
+}
+
+/// Return the number of bytes of code the specified instruction may be.
+unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return MI.getDesc().getSize();
+  case  TargetOpcode::INLINEASM: {       // Inline Asm: Variable size.
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const char *AsmStr = MI.getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  case Mips::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI.getOperand(2).getImm();
+  }
+}
+
+MachineInstrBuilder
+MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
+                                  MachineBasicBlock::iterator I) const {
+  MachineInstrBuilder MIB;
+
+  // Certain branches have two forms: e.g beq $1, $zero, dest vs beqz $1, dest
+  // Pick the zero form of the branch for readable assembly and for greater
+  // branch distance in non-microMIPS mode.
+  // Additional MIPSR6 does not permit the use of register $zero for compact
+  // branches.
+  // FIXME: Certain atomic sequences on mips64 generate 32bit references to
+  // Mips::ZERO, which is incorrect. This test should be updated to use
+  // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
+  // are fixed.
+  int ZeroOperandPosition = -1;
+  bool BranchWithZeroOperand = false;
+  if (I->isBranch() && !I->isPseudo()) {
+    auto TRI = I->getParent()->getParent()->getSubtarget().getRegisterInfo();
+    ZeroOperandPosition = I->findRegisterUseOperandIdx(Mips::ZERO, false, TRI);
+    BranchWithZeroOperand = ZeroOperandPosition != -1;
+  }
+
+  if (BranchWithZeroOperand) {
+    switch (NewOpc) {
+    case Mips::BEQC:
+      NewOpc = Mips::BEQZC;
+      break;
+    case Mips::BNEC:
+      NewOpc = Mips::BNEZC;
+      break;
+    case Mips::BGEC:
+      NewOpc = Mips::BGEZC;
+      break;
+    case Mips::BLTC:
+      NewOpc = Mips::BLTZC;
+      break;
+    case Mips::BEQC64:
+      NewOpc = Mips::BEQZC64;
+      break;
+    case Mips::BNEC64:
+      NewOpc = Mips::BNEZC64;
+      break;
+    }
+  }
+
+  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));
+
+  // For MIPSR6 JI*C requires an immediate 0 as an operand, JIALC(64) an
+  // immediate 0 as an operand and requires the removal of it's %RA<imp-def>
+  // implicit operand as copying the implicit operations of the instructio we're
+  // looking at will give us the correct flags.
+  if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
+      NewOpc == Mips::JIALC64) {
+
+    if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
+      MIB->RemoveOperand(0);
+
+    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+      MIB.addOperand(I->getOperand(J));
+    }
+
+    MIB.addImm(0);
+
+  } else {
+    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
+      if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
+        continue;
+
+      MIB.addOperand(I->getOperand(J));
+    }
+  }
+
+  MIB.copyImplicitOps(*I);
+
+  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
+  return MIB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 000000000000..347b9187d08c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,161 @@
+//===-- MipsInstrInfo.h - Mips Instruction Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+// FIXME: We need to override TargetInstrInfo::getInlineAsmLength method in
+// order for MipsLongBranch pass to work correctly when the code has inline
+// assembly.  The returned value doesn't have to be the asm instruction's exact
+// size in bytes; MipsLongBranch only expects it to be the correct upper bound.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MipsGenInstrInfo.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class MipsInstrInfo : public MipsGenInstrInfo {
+  virtual void anchor();
+protected:
+  const MipsSubtarget &Subtarget;
+  unsigned UncondBrOpc;
+
+public:
+  enum BranchType {
+    BT_None,       // Couldn't analyze branch.
+    BT_NoBranch,   // No branches found.
+    BT_Uncond,     // One unconditional branch.
+    BT_Cond,       // One conditional branch.
+    BT_CondUncond, // A conditional branch followed by an unconditional branch.
+    BT_Indirect    // One indirct branch.
+  };
+
+  explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc);
+
+  static const MipsInstrInfo *create(MipsSubtarget &STI);
+
+  /// Branch Analysis
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  BranchType analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                           MachineBasicBlock *&FBB,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           bool AllowModify,
+                           SmallVectorImpl<MachineInstr *> &BranchInstrs) const;
+
+  /// Determine the opcode of a non-delay slot form for a branch if one exists.
+  unsigned getEquivalentCompactForm(const MachineBasicBlock::iterator I) const;
+
+  /// Predicate to determine if an instruction can go in a forbidden slot.
+  bool SafeInForbiddenSlot(const MachineInstr &MI) const;
+
+  /// Predicate to determine if an instruction has a forbidden slot.
+  bool HasForbiddenSlot(const MachineInstr &MI) const;
+
+  /// Insert nop instruction when hazard condition is found
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+
+  virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
+
+  /// Return the number of bytes of code the specified instruction may be.
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override {
+    storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0);
+  }
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override {
+    loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0);
+  }
+
+  virtual void storeRegToStack(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               unsigned SrcReg, bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               int64_t Offset) const = 0;
+
+  virtual void loadRegFromStack(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                unsigned DestReg, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI,
+                                int64_t Offset) const = 0;
+
+  virtual void adjustStackPtr(unsigned SP, int64_t Amount,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const = 0;
+
+  /// Create an instruction which has the same operands and memory operands
+  /// as MI but has a new opcode.
+  MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
+                                         MachineBasicBlock::iterator I) const;
+
+protected:
+  bool isZeroImm(const MachineOperand &op) const;
+
+  MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
+                                   MachineMemOperand::Flags Flags) const;
+
+private:
+  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0;
+
+  void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                     MachineBasicBlock *&BB,
+                     SmallVectorImpl<MachineOperand> &Cond) const;
+
+  void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                   const DebugLoc &DL, ArrayRef<MachineOperand> Cond) const;
+};
+
+/// Create MipsInstrInfo objects.
+const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI);
+const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 000000000000..5bc48336121a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,2868 @@
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsJmpLink      : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<1, 2>,
+                                                SDTCisSameAs<3, 4>,
+                                                SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                      SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
+                                     [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                      SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsDivRem16 : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_Sync             : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+                                   SDTCisSameAs<0, 4>]>;
+
+def SDTMipsLoadLR  : SDTypeProfile<1, 2,
+                                   [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                    SDTCisSameAs<0, 2>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
+                         [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                          SDNPVariadic]>;
+
+// Tail call
+def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used on
+// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
+// static model. (nothing to do with Mips Registers Hi and Lo)
+def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi    : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo    : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MipsERet : SDNode<"MipsISD::ERet", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+                           [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+                           [SDNPHasChain, SDNPSideEffect,
+                            SDNPOptInGlue, SDNPOutGlue]>;
+
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
+
+// Node used to insert 32-bit integers to LOHI register pair.
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
+
+// Mult nodes.
+def MipsMult  : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
+def MipsMultu : SDNode<"MipsISD::Multu", SDT_MipsMultDiv>;
+
+// MAdd*/MSub* nodes
+def MipsMAdd  : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub>;
+def MipsMAddu : SDNode<"MipsISD::MAddu", SDT_MipsMAddMSub>;
+def MipsMSub  : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub>;
+def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
+
+// DivRem(u) nodes
+def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>;
+def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
+def MipsDivRem16  : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
+def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
+
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled:
+//  movn  %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapper    : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
+
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
+
+def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
+
+def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSWL : SDNode<"MipsISD::SWL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSWR : SDNode<"MipsISD::SWR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSDL : SDNode<"MipsISD::SDL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasMips2     :    Predicate<"Subtarget->hasMips2()">,
+                      AssemblerPredicate<"FeatureMips2">;
+def HasMips3_32  :    Predicate<"Subtarget->hasMips3_32()">,
+                      AssemblerPredicate<"FeatureMips3_32">;
+def HasMips3_32r2 :   Predicate<"Subtarget->hasMips3_32r2()">,
+                      AssemblerPredicate<"FeatureMips3_32r2">;
+def HasMips3     :    Predicate<"Subtarget->hasMips3()">,
+                      AssemblerPredicate<"FeatureMips3">;
+def NotMips3     :    Predicate<"!Subtarget->hasMips3()">,
+                      AssemblerPredicate<"!FeatureMips3">;
+def HasMips4_32  :    Predicate<"Subtarget->hasMips4_32()">,
+                      AssemblerPredicate<"FeatureMips4_32">;
+def NotMips4_32  :    Predicate<"!Subtarget->hasMips4_32()">,
+                      AssemblerPredicate<"!FeatureMips4_32">;
+def HasMips4_32r2 :   Predicate<"Subtarget->hasMips4_32r2()">,
+                      AssemblerPredicate<"FeatureMips4_32r2">;
+def HasMips5_32r2 :   Predicate<"Subtarget->hasMips5_32r2()">,
+                      AssemblerPredicate<"FeatureMips5_32r2">;
+def HasMips32    :    Predicate<"Subtarget->hasMips32()">,
+                      AssemblerPredicate<"FeatureMips32">;
+def HasMips32r2  :    Predicate<"Subtarget->hasMips32r2()">,
+                      AssemblerPredicate<"FeatureMips32r2">;
+def HasMips32r5  :    Predicate<"Subtarget->hasMips32r5()">,
+                      AssemblerPredicate<"FeatureMips32r5">;
+def HasMips32r6  :    Predicate<"Subtarget->hasMips32r6()">,
+                      AssemblerPredicate<"FeatureMips32r6">;
+def NotMips32r6  :    Predicate<"!Subtarget->hasMips32r6()">,
+                      AssemblerPredicate<"!FeatureMips32r6">;
+def IsGP64bit    :    Predicate<"Subtarget->isGP64bit()">,
+                      AssemblerPredicate<"FeatureGP64Bit">;
+def IsGP32bit    :    Predicate<"!Subtarget->isGP64bit()">,
+                      AssemblerPredicate<"!FeatureGP64Bit">;
+def IsPTR64bit    :   Predicate<"Subtarget->isABI_N64()">,
+                      AssemblerPredicate<"FeaturePTR64Bit">;
+def IsPTR32bit    :   Predicate<"!Subtarget->isABI_N64()">,
+                      AssemblerPredicate<"!FeaturePTR64Bit">;
+def HasMips64    :    Predicate<"Subtarget->hasMips64()">,
+                      AssemblerPredicate<"FeatureMips64">;
+def NotMips64    :    Predicate<"!Subtarget->hasMips64()">,
+                      AssemblerPredicate<"!FeatureMips64">;
+def HasMips64r2  :    Predicate<"Subtarget->hasMips64r2()">,
+                      AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r6  :    Predicate<"Subtarget->hasMips64r6()">,
+                      AssemblerPredicate<"FeatureMips64r6">;
+def NotMips64r6  :    Predicate<"!Subtarget->hasMips64r6()">,
+                      AssemblerPredicate<"!FeatureMips64r6">;
+def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">,
+                       AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">;
+def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">,
+                       AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">;
+def InMips16Mode :    Predicate<"Subtarget->inMips16Mode()">,
+                      AssemblerPredicate<"FeatureMips16">;
+def HasCnMips    :    Predicate<"Subtarget->hasCnMips()">,
+                      AssemblerPredicate<"FeatureCnMips">;
+def NotCnMips    :    Predicate<"!Subtarget->hasCnMips()">,
+                      AssemblerPredicate<"!FeatureCnMips">;
+def RelocNotPIC :     Predicate<"!TM.isPositionIndependent()">;
+def RelocPIC    :     Predicate<"TM.isPositionIndependent()">;
+def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">;
+def HasStdEnc :       Predicate<"Subtarget->hasStandardEncoding()">,
+                      AssemblerPredicate<"!FeatureMips16">;
+def NotDSP :          Predicate<"!Subtarget->hasDSP()">;
+def InMicroMips    :  Predicate<"Subtarget->inMicroMipsMode()">,
+                      AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips :  Predicate<"!Subtarget->inMicroMipsMode()">,
+                      AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE           :  Predicate<"Subtarget->isLittle()">;
+def IsBE           :  Predicate<"!Subtarget->isLittle()">;
+def IsNotNaCl    :    Predicate<"!Subtarget->isTargetNaCl()">;
+def UseTCCInDIV    :  AssemblerPredicate<"FeatureUseTCCInDIV">;
+def HasEVA       :    Predicate<"Subtarget->hasEVA()">,
+                      AssemblerPredicate<"FeatureEVA,FeatureMips32r2">;
+def HasMSA : Predicate<"Subtarget->hasMSA()">,
+             AssemblerPredicate<"FeatureMSA">;
+
+
+//===----------------------------------------------------------------------===//
+// Mips GPR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class GPR_32 { list<Predicate> GPRPredicates = [IsGP32bit]; }
+class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; }
+
+class PTR_32 { list<Predicate> PTRPredicates = [IsPTR32bit]; }
+class PTR_64 { list<Predicate> PTRPredicates = [IsPTR64bit]; }
+
+//===----------------------------------------------------------------------===//
+// Mips ISA/ASE membership and instruction group membership adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+// FIXME: I'd prefer to use additive predicates to build the instruction sets
+//        but we are short on assembler feature bits at the moment. Using a
+//        subtractive predicate will hopefully keep us under the 32 predicate
+//        limit long enough to develop an alternative way to handle P1||P2
+//        predicates.
+class ISA_MIPS1_NOT_MIPS3 {
+  list<Predicate> InsnPredicates = [NotMips3];
+}
+class ISA_MIPS1_NOT_4_32 {
+  list<Predicate> InsnPredicates = [NotMips4_32];
+}
+class ISA_MIPS1_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS2    { list<Predicate> InsnPredicates = [HasMips2]; }
+class ISA_MIPS2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS3    { list<Predicate> InsnPredicates = [HasMips3]; }
+class ISA_MIPS3_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32   { list<Predicate> InsnPredicates = [HasMips32]; }
+class ISA_MIPS32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips32, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
+class ISA_MIPS32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; }
+class ISA_MIPS64   { list<Predicate> InsnPredicates = [HasMips64]; }
+class ISA_MIPS64_NOT_64R6 {
+  list<Predicate> InsnPredicates = [HasMips64, NotMips64r6];
+}
+class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
+class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
+class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
+class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; }
+class ISA_MICROMIPS32R6 {
+  list<Predicate> InsnPredicates = [HasMicroMips32r6];
+}
+class ISA_MICROMIPS64R6 {
+  list<Predicate> InsnPredicates = [HasMicroMips64r6];
+}
+class ISA_MICROMIPS32_NOT_MIPS32R6 {
+  list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6];
+}
+
+class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; }
+class INSN_EVA_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA];
+}
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+
+// The portions of MIPS-III that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS3_32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips3_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+
+// The portions of MIPS-IV that were also added to MIPS32.
+class INSN_MIPS4_32 { list <Predicate> InsnPredicates = [HasMips4_32]; }
+
+// The portions of MIPS-IV that were also added to MIPS32 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips4_32, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-IV that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS4_32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips4_32r2, NotMips32r6, NotMips64r6];
+}
+
+// The portions of MIPS-IV that were also added to MIPS32r2.
+class INSN_MIPS4_32R2 {
+  list<Predicate> InsnPredicates = [HasMips4_32r2];
+}
+
+// The portions of MIPS-V that were also added to MIPS32r2 but were removed in
+// MIPS32r6 and MIPS64r6.
+class INSN_MIPS5_32R2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6];
+}
+
+class ASE_CNMIPS {
+  list<Predicate> InsnPredicates = [HasCnMips];
+}
+
+class NOT_ASE_CNMIPS {
+  list<Predicate> InsnPredicates = [NotCnMips];
+}
+
+class ASE_MIPS64_CNMIPS {
+  list<Predicate> InsnPredicates = [HasMips64, HasCnMips];
+}
+
+class ASE_MSA {
+  list<Predicate> InsnPredicates = [HasMSA];
+}
+
+class ASE_MSA_NOT_MSA64 {
+  list<Predicate> InsnPredicates = [HasMSA, NotMips64];
+}
+
+class ASE_MSA64 {
+  list<Predicate> InsnPredicates = [HasMSA, HasMips64];
+}
+
+// Class used for separating microMIPSr6 and microMIPS (r3) instruction.
+// It can be used only on instructions that doesn't inherit PredicateControl.
+class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl {
+  let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6];
+}
+
+class ASE_NOT_DSP {
+  list<Predicate> InsnPredicates = [NotDSP];
+}
+
+//===----------------------------------------------------------------------===//
+
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+}
+
+class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
+  InstAlias<Asm, Result, Emit>, PredicateControl;
+
+class IsCommutable {
+  bit isCommutable = 1;
+}
+
+class IsBranch {
+  bit isBranch = 1;
+  bit isCTI = 1;
+}
+
+class IsReturn {
+  bit isReturn = 1;
+  bit isCTI = 1;
+}
+
+class IsCall {
+  bit isCall = 1;
+  bit isCTI = 1;
+}
+
+class IsTailCall {
+  bit isCall = 1;
+  bit isTerminator = 1;
+  bit isReturn = 1;
+  bit isBarrier = 1;
+  bit hasExtraSrcRegAllocReq = 1;
+  bit isCodeGenOnly = 1;
+  bit isCTI = 1;
+}
+
+class IsAsCheapAsAMove {
+  bit isAsCheapAsAMove = 1;
+}
+
+class NeverHasSideEffects {
+  bit hasSideEffects = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+                                  int Offset = 0> : AsmOperandClass {
+  let Name = "ConstantSImm" # Bits # "_" # Offset;
+  let RenderMethod = "addConstantSImmOperands<" # Bits # ", " # Offset # ">";
+  let PredicateMethod = "isConstantSImm<" # Bits # ", " # Offset # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "SImm" # Bits # "_" # Offset;
+}
+
+class SimmLslAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+                                  int Shift = 0> : AsmOperandClass {
+  let Name = "Simm" # Bits # "_Lsl" # Shift;
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<" # Bits # ", " # Shift # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "SImm" # Bits # "_Lsl" # Shift;
+}
+
+class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [],
+                                  int Offset = 0> : AsmOperandClass {
+  let Name = "ConstantUImm" # Bits # "_" # Offset;
+  let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">";
+  let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "UImm" # Bits # "_" # Offset;
+}
+
+class ConstantUImmRangeAsmOperandClass<int Bottom, int Top,
+                                       list<AsmOperandClass> Supers = []>
+    : AsmOperandClass {
+  let Name = "ConstantUImmRange" # Bottom # "_" # Top;
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isConstantUImmRange<" # Bottom # ", " # Top # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "UImmRange" # Bottom # "_" # Top;
+}
+
+class SImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+    : AsmOperandClass {
+  let Name = "SImm" # Bits;
+  let RenderMethod = "addSImmOperands<" # Bits # ">";
+  let PredicateMethod = "isSImm<" # Bits # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "SImm" # Bits;
+}
+
+class UImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+    : AsmOperandClass {
+  let Name = "UImm" # Bits;
+  let RenderMethod = "addUImmOperands<" # Bits # ">";
+  let PredicateMethod = "isUImm<" # Bits # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "UImm" # Bits;
+}
+
+// Generic case - only to support certain assembly pseudo instructions.
+class UImmAnyAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []>
+    : AsmOperandClass {
+  let Name = "ImmAny";
+  let RenderMethod = "addConstantUImmOperands<32>";
+  let PredicateMethod = "isSImm<" # Bits # ">";
+  let SuperClasses = Supers;
+  let DiagnosticType = "ImmAny";
+}
+
+// AsmOperandClasses require a strict ordering which is difficult to manage
+// as a hierarchy. Instead, we use a linear ordering and impose an order that
+// is in some places arbitrary.
+//
+// Here the rules that are in use:
+// * Wider immediates are a superset of narrower immediates:
+//     uimm4 < uimm5 < uimm6
+// * For the same bit-width, unsigned immediates are a superset of signed
+//   immediates::
+//     simm4 < uimm4 < simm5 < uimm5
+// * For the same upper-bound, signed immediates are a superset of unsigned
+//   immediates:
+//     uimm3 < simm4 < uimm4 < simm4
+// * Modified immediates are a superset of ordinary immediates:
+//     uimm5 < uimm5_plus1 (1..32) < uimm5_plus32 (32..63) < uimm6
+//   The term 'superset' starts to break down here since the uimm5_plus* classes
+//   are not true supersets of uimm5 (but they are still subsets of uimm6).
+// * 'Relaxed' immediates are supersets of the corresponding unsigned immediate.
+//     uimm16 < uimm16_relaxed
+// * The codeGen pattern type is arbitrarily ordered.
+//     uimm5 < uimm5_64, and uimm5 < vsplat_uimm5
+//   This is entirely arbitrary. We need an ordering and what we pick is
+//   unimportant since only one is possible for a given mnemonic.
+
+def UImm32CoercedAsmOperandClass : UImmAnyAsmOperandClass<33, []> {
+  let Name = "UImm32_Coerced";
+  let DiagnosticType = "UImm32_Coerced";
+}
+def SImm32RelaxedAsmOperandClass
+    : SImmAsmOperandClass<32, [UImm32CoercedAsmOperandClass]> {
+  let Name = "SImm32_Relaxed";
+  let PredicateMethod = "isAnyImm<32>";
+  let DiagnosticType = "SImm32_Relaxed";
+}
+def SImm32AsmOperandClass
+    : SImmAsmOperandClass<32, [SImm32RelaxedAsmOperandClass]>;
+def ConstantUImm26AsmOperandClass
+    : ConstantUImmAsmOperandClass<26, [SImm32AsmOperandClass]>;
+def ConstantUImm20AsmOperandClass
+    : ConstantUImmAsmOperandClass<20, [ConstantUImm26AsmOperandClass]>;
+def ConstantSImm19Lsl2AsmOperandClass : AsmOperandClass {
+  let Name = "SImm19Lsl2";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<19, 2>";
+  let SuperClasses = [ConstantUImm20AsmOperandClass];
+  let DiagnosticType = "SImm19_Lsl2";
+}
+def UImm16RelaxedAsmOperandClass
+    : UImmAsmOperandClass<16, [ConstantUImm20AsmOperandClass]> {
+  let Name = "UImm16_Relaxed";
+  let PredicateMethod = "isAnyImm<16>";
+  let DiagnosticType = "UImm16_Relaxed";
+}
+// Similar to the relaxed classes which take an SImm and render it as
+// an UImm, this takes a UImm and renders it as an SImm.
+def UImm16AltRelaxedAsmOperandClass
+    : SImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]> {
+  let Name = "UImm16_AltRelaxed";
+  let PredicateMethod = "isUImm<16>";
+  let DiagnosticType = "UImm16_AltRelaxed";
+}
+// FIXME: One of these should probably have UImm16AsmOperandClass as the
+//        superclass instead of UImm16RelaxedasmOPerandClass.
+def UImm16AsmOperandClass
+    : UImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]>;
+def SImm16RelaxedAsmOperandClass
+    : SImmAsmOperandClass<16, [UImm16RelaxedAsmOperandClass]> {
+  let Name = "SImm16_Relaxed";
+  let PredicateMethod = "isAnyImm<16>";
+  let DiagnosticType = "SImm16_Relaxed";
+}
+def SImm16AsmOperandClass
+    : SImmAsmOperandClass<16, [SImm16RelaxedAsmOperandClass]>;
+def ConstantSImm10Lsl3AsmOperandClass : AsmOperandClass {
+  let Name = "SImm10Lsl3";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<10, 3>";
+  let SuperClasses = [SImm16AsmOperandClass];
+  let DiagnosticType = "SImm10_Lsl3";
+}
+def ConstantSImm10Lsl2AsmOperandClass : AsmOperandClass {
+  let Name = "SImm10Lsl2";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<10, 2>";
+  let SuperClasses = [ConstantSImm10Lsl3AsmOperandClass];
+  let DiagnosticType = "SImm10_Lsl2";
+}
+def ConstantSImm11AsmOperandClass
+    : ConstantSImmAsmOperandClass<11, [ConstantSImm10Lsl2AsmOperandClass]>;
+def ConstantSImm10Lsl1AsmOperandClass : AsmOperandClass {
+  let Name = "SImm10Lsl1";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<10, 1>";
+  let SuperClasses = [ConstantSImm11AsmOperandClass];
+  let DiagnosticType = "SImm10_Lsl1";
+}
+def ConstantUImm10AsmOperandClass
+    : ConstantUImmAsmOperandClass<10, [ConstantSImm10Lsl1AsmOperandClass]>;
+def ConstantSImm10AsmOperandClass
+    : ConstantSImmAsmOperandClass<10, [ConstantUImm10AsmOperandClass]>;
+def ConstantSImm9AsmOperandClass
+    : ConstantSImmAsmOperandClass<9, [ConstantSImm10AsmOperandClass]>;
+def ConstantSImm7Lsl2AsmOperandClass : AsmOperandClass {
+  let Name = "SImm7Lsl2";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledSImm<7, 2>";
+  let SuperClasses = [ConstantSImm9AsmOperandClass];
+  let DiagnosticType = "SImm7_Lsl2";
+}
+def ConstantUImm8AsmOperandClass
+    : ConstantUImmAsmOperandClass<8, [ConstantSImm7Lsl2AsmOperandClass]>;
+def ConstantUImm7Sub1AsmOperandClass
+    : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass], -1> {
+  // Specify the names since the -1 offset causes invalid identifiers otherwise.
+  let Name = "UImm7_N1";
+  let DiagnosticType = "UImm7_N1";
+}
+def ConstantUImm7AsmOperandClass
+    : ConstantUImmAsmOperandClass<7, [ConstantUImm7Sub1AsmOperandClass]>;
+def ConstantUImm6Lsl2AsmOperandClass : AsmOperandClass {
+  let Name = "UImm6Lsl2";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledUImm<6, 2>";
+  let SuperClasses = [ConstantUImm7AsmOperandClass];
+  let DiagnosticType = "UImm6_Lsl2";
+}
+def ConstantUImm6AsmOperandClass
+    : ConstantUImmAsmOperandClass<6, [ConstantUImm6Lsl2AsmOperandClass]>;
+def ConstantSImm6AsmOperandClass
+    : ConstantSImmAsmOperandClass<6, [ConstantUImm6AsmOperandClass]>;
+def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass {
+  let Name = "UImm5Lsl2";
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isScaledUImm<5, 2>";
+  let SuperClasses = [ConstantSImm6AsmOperandClass];
+  let DiagnosticType = "UImm5_Lsl2";
+}
+def ConstantUImm5_Range2_64AsmOperandClass
+    : ConstantUImmRangeAsmOperandClass<2, 64, [ConstantUImm5Lsl2AsmOperandClass]>;
+def ConstantUImm5Plus33AsmOperandClass
+    : ConstantUImmAsmOperandClass<5, [ConstantUImm5_Range2_64AsmOperandClass],
+                                  33>;
+def ConstantUImm5ReportUImm6AsmOperandClass
+    : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus33AsmOperandClass]> {
+  let Name = "ConstantUImm5_0_Report_UImm6";
+  let DiagnosticType = "UImm5_0_Report_UImm6";
+}
+def ConstantUImm5Plus32AsmOperandClass
+    : ConstantUImmAsmOperandClass<
+          5, [ConstantUImm5ReportUImm6AsmOperandClass], 32>;
+def ConstantUImm5Plus32NormalizeAsmOperandClass
+    : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus32AsmOperandClass], 32> {
+  let Name = "ConstantUImm5_32_Norm";
+  // We must also subtract 32 when we render the operand.
+  let RenderMethod = "addConstantUImmOperands<5, 32, -32>";
+}
+def ConstantUImm5Plus1AsmOperandClass
+    : ConstantUImmAsmOperandClass<
+          5, [ConstantUImm5Plus32NormalizeAsmOperandClass], 1>;
+def ConstantUImm5AsmOperandClass
+    : ConstantUImmAsmOperandClass<5, [ConstantUImm5Plus1AsmOperandClass]>;
+def ConstantSImm5AsmOperandClass
+    : ConstantSImmAsmOperandClass<5, [ConstantUImm5AsmOperandClass]>;
+def ConstantUImm4AsmOperandClass
+    : ConstantUImmAsmOperandClass<4, [ConstantSImm5AsmOperandClass]>;
+def ConstantSImm4AsmOperandClass
+    : ConstantSImmAsmOperandClass<4, [ConstantUImm4AsmOperandClass]>;
+def ConstantUImm3AsmOperandClass
+    : ConstantUImmAsmOperandClass<3, [ConstantSImm4AsmOperandClass]>;
+def ConstantUImm2Plus1AsmOperandClass
+    : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>;
+def ConstantUImm2AsmOperandClass
+    : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>;
+def ConstantUImm1AsmOperandClass
+    : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>;
+def ConstantImmzAsmOperandClass : AsmOperandClass {
+  let Name = "ConstantImmz";
+  let RenderMethod = "addConstantUImmOperands<1>";
+  let PredicateMethod = "isConstantImmz";
+  let SuperClasses = [ConstantUImm1AsmOperandClass];
+  let DiagnosticType = "Immz";
+}
+
+def Simm19Lsl2AsmOperand
+    : SimmLslAsmOperandClass<19, [], 2>;
+
+def MipsJumpTargetAsmOperand : AsmOperandClass {
+  let Name = "JumpTarget";
+  let ParserMethod = "parseJumpTarget";
+  let PredicateMethod = "isImm";
+  let RenderMethod = "addImmOperands";
+}
+
+// Instruction operand types
+def jmptarget   : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def brtarget    : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def brtarget1SImm16 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue1SImm16";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget1SImm16";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+def calltarget  : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def imm64: Operand<i64>;
+
+def simm19_lsl2 : Operand<i32> {
+  let EncoderMethod = "getSimm19Lsl2Encoding";
+  let DecoderMethod = "DecodeSimm19Lsl2";
+  let ParserMatchClass = Simm19Lsl2AsmOperand;
+}
+
+def simm18_lsl3 : Operand<i32> {
+  let EncoderMethod = "getSimm18Lsl3Encoding";
+  let DecoderMethod = "DecodeSimm18Lsl3";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+// Zero
+def uimmz       : Operand<i32> {
+  let PrintMethod = "printUImm<0>";
+  let ParserMatchClass = ConstantImmzAsmOperandClass;
+}
+
+// size operand of ins instruction
+def uimm_range_2_64 : Operand<i32> {
+  let PrintMethod = "printUImm<6, 2>";
+  let EncoderMethod = "getSizeInsEncoding";
+  let DecoderMethod = "DecodeInsSize";
+  let ParserMatchClass = ConstantUImm5_Range2_64AsmOperandClass;
+}
+
+// Unsigned Operands
+foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 26} in
+  def uimm # I : Operand<i32> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+  }
+
+def uimm2_plus1 : Operand<i32> {
+  let PrintMethod = "printUImm<2, 1>";
+  let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>";
+  let DecoderMethod = "DecodeUImmWithOffset<2, 1>";
+  let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass;
+}
+
+def uimm5_plus1 : Operand<i32> {
+  let PrintMethod = "printUImm<5, 1>";
+  let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+  let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+  let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
+}
+
+def uimm5_plus32 : Operand<i32> {
+  let PrintMethod = "printUImm<5, 32>";
+  let ParserMatchClass = ConstantUImm5Plus32AsmOperandClass;
+}
+
+def uimm5_plus33 : Operand<i32> {
+  let PrintMethod = "printUImm<5, 33>";
+  let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>";
+  let DecoderMethod = "DecodeUImmWithOffset<5, 1>";
+  let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass;
+}
+
+def uimm5_inssize_plus1 : Operand<i32> {
+  let PrintMethod = "printUImm<6>";
+  let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass;
+  let EncoderMethod = "getSizeInsEncoding";
+  let DecoderMethod = "DecodeInsSize";
+}
+
+def uimm5_plus32_normalize : Operand<i32> {
+  let PrintMethod = "printUImm<5>";
+  let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+def uimm5_lsl2 : Operand<OtherVT> {
+  let EncoderMethod = "getUImm5Lsl2Encoding";
+  let DecoderMethod = "DecodeUImmWithOffsetAndScale<5, 0, 4>";
+  let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass;
+}
+
+def uimm5_plus32_normalize_64 : Operand<i64> {
+  let PrintMethod = "printUImm<5>";
+  let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass;
+}
+
+def uimm6_lsl2 : Operand<OtherVT> {
+  let EncoderMethod = "getUImm6Lsl2Encoding";
+  let DecoderMethod = "DecodeUImmWithOffsetAndScale<6, 0, 4>";
+  let ParserMatchClass = ConstantUImm6Lsl2AsmOperandClass;
+}
+
+foreach I = {16} in
+  def uimm # I : Operand<i32> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+  }
+
+// Like uimm16_64 but coerces simm16 to uimm16.
+def uimm16_relaxed : Operand<i32> {
+  let PrintMethod = "printUImm<16>";
+  let ParserMatchClass =
+      !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
+foreach I = {5} in
+  def uimm # I # _64 : Operand<i64> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+  }
+
+foreach I = {16} in
+  def uimm # I # _64 : Operand<i64> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("UImm" # I # "AsmOperandClass");
+  }
+
+// Like uimm16_64 but coerces simm16 to uimm16.
+def uimm16_64_relaxed : Operand<i64> {
+  let PrintMethod = "printUImm<16>";
+  let ParserMatchClass =
+      !cast<AsmOperandClass>("UImm16RelaxedAsmOperandClass");
+}
+
+def uimm16_altrelaxed : Operand<i32> {
+  let PrintMethod = "printUImm<16>";
+  let ParserMatchClass =
+      !cast<AsmOperandClass>("UImm16AltRelaxedAsmOperandClass");
+}
+// Like uimm5 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_report_uimm6 : Operand<i32> {
+  let PrintMethod = "printUImm<5>";
+  let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
+}
+
+// Like uimm5_64 but reports a less confusing error for 32-63 when
+// an instruction alias permits that.
+def uimm5_64_report_uimm6 : Operand<i64> {
+  let PrintMethod = "printUImm<5>";
+  let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass;
+}
+
+foreach I = {1, 2, 3, 4} in
+  def uimm # I # _ptr : Operand<iPTR> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+  }
+
+foreach I = {1, 2, 3, 4, 5, 6, 8} in
+  def vsplat_uimm # I : Operand<vAny> {
+    let PrintMethod = "printUImm<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass");
+  }
+
+// Signed operands
+foreach I = {4, 5, 6, 9, 10, 11} in
+  def simm # I : Operand<i32> {
+    let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+  }
+
+foreach I = {1, 2, 3} in
+  def simm10_lsl # I : Operand<i32> {
+    let DecoderMethod = "DecodeSImmWithOffsetAndScale<10, " # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantSImm10Lsl" # I # "AsmOperandClass");
+  }
+
+foreach I = {10} in
+  def simm # I # _64 : Operand<i64> {
+    let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+  }
+
+foreach I = {5, 10} in
+  def vsplat_simm # I : Operand<vAny> {
+    let ParserMatchClass =
+        !cast<AsmOperandClass>("ConstantSImm" # I # "AsmOperandClass");
+  }
+
+def simm7_lsl2 : Operand<OtherVT> {
+  let EncoderMethod = "getSImm7Lsl2Encoding";
+  let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ", 0, 4>";
+  let ParserMatchClass = ConstantSImm7Lsl2AsmOperandClass;
+}
+
+foreach I = {16, 32} in
+  def simm # I : Operand<i32> {
+    let DecoderMethod = "DecodeSImmWithOffsetAndScale<" # I # ">";
+    let ParserMatchClass = !cast<AsmOperandClass>("SImm" # I # "AsmOperandClass");
+  }
+
+// Like simm16 but coerces uimm16 to simm16.
+def simm16_relaxed : Operand<i32> {
+  let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+  let ParserMatchClass = !cast<AsmOperandClass>("SImm16RelaxedAsmOperandClass");
+}
+
+def simm16_64 : Operand<i64> {
+  let DecoderMethod = "DecodeSImmWithOffsetAndScale<16>";
+  let ParserMatchClass = !cast<AsmOperandClass>("SImm16AsmOperandClass");
+}
+
+// like simm32 but coerces simm32 to uimm32.
+def uimm32_coerced : Operand<i32> {
+  let ParserMatchClass = !cast<AsmOperandClass>("UImm32CoercedAsmOperandClass");
+}
+// Like simm32 but coerces uimm32 to simm32.
+def simm32_relaxed : Operand<i32> {
+  let DecoderMethod = "DecodeSImmWithOffsetAndScale<32>";
+  let ParserMatchClass = !cast<AsmOperandClass>("SImm32RelaxedAsmOperandClass");
+}
+
+// This is almost the same as a uimm7 but 0x7f is interpreted as -1.
+def li16_imm : Operand<i32> {
+  let DecoderMethod = "DecodeLi16Imm";
+  let ParserMatchClass = ConstantUImm7Sub1AsmOperandClass;
+}
+
+def MipsMemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+  let ParserMethod = "parseMemOperand";
+}
+
+def MipsMemSimm9AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm9";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<9>";
+  let DiagnosticType = "MemSImm9";
+}
+
+def MipsMemSimm10AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm10";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<10>";
+  let DiagnosticType = "MemSImm10";
+}
+
+def MipsMemSimm12AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm12";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<12>";
+  let DiagnosticType = "MemSImm12";
+}
+
+foreach I = {1, 2, 3} in
+  def MipsMemSimm10Lsl # I # AsmOperand : AsmOperandClass {
+    let Name = "MemOffsetSimm10_" # I;
+    let SuperClasses = [MipsMemAsmOperand];
+    let RenderMethod = "addMemOperands";
+    let ParserMethod = "parseMemOperand";
+    let PredicateMethod = "isMemWithSimmOffset<10, " # I # ">";
+    let DiagnosticType = "MemSImm10Lsl" # I;
+  }
+
+def MipsMemSimm11AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm11";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<11>";
+  let DiagnosticType = "MemSImm11";
+}
+
+def MipsMemSimm16AsmOperand : AsmOperandClass {
+  let Name = "MemOffsetSimm16";
+  let SuperClasses = [MipsMemAsmOperand];
+  let RenderMethod = "addMemOperands";
+  let ParserMethod = "parseMemOperand";
+  let PredicateMethod = "isMemWithSimmOffset<16>";
+  let DiagnosticType = "MemSImm16";
+}
+
+def MipsInvertedImmoperand : AsmOperandClass {
+  let Name = "InvNum";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseInvNum";
+}
+
+def InvertedImOperand : Operand<i32> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+def InvertedImOperand64 : Operand<i64> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+class mem_generic : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Address operand
+def mem : mem_generic;
+
+// MSA specific address operand
+def mem_msa : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm10);
+  let EncoderMethod = "getMSAMemEncoding";
+}
+
+def simm12 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_simm9 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm9);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm9AsmOperand;
+}
+
+def mem_simm10 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm10);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm10AsmOperand;
+}
+
+foreach I = {1, 2, 3} in
+  def mem_simm10_lsl # I : mem_generic {
+    let MIOperandInfo = (ops ptr_rc, !cast<Operand>("simm10_lsl" # I));
+    let EncoderMethod = "getMemEncoding<" # I  # ">";
+    let ParserMatchClass =
+            !cast<AsmOperandClass>("MipsMemSimm10Lsl" # I # "AsmOperand");
+  }
+
+def mem_simm11 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm11);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm11AsmOperand;
+}
+
+def mem_simm12 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm12);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm12AsmOperand;
+}
+
+def mem_simm16 : mem_generic {
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemSimm16AsmOperand;
+}
+
+def mem_ea : Operand<iPTR> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops ptr_rc, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def PtrRC : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc);
+  let DecoderMethod = "DecodePtrRegisterClass";
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+// size operand of ins instruction
+def size_ins : Operand<i32> {
+  let EncoderMethod = "getSizeInsEncoding";
+  let DecoderMethod = "DecodeInsSize";
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+  return getImm(N, N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+  return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF);
+}]>;
+
+// Plus 1.
+def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+
+// Node immediate is zero (e.g. insve.d)
+def immz : PatLeaf<(imm), [{ return N->getSExtValue() == 0; }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt8  : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16  : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 7-bit zero extended on target immediate.
+def immZExt7 : PatLeaf<(imm), [{ return isUInt<7>(N->getZExtValue()); }]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+  else
+    return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// Immediate can be loaded with LUi (32-bit int with lower 16-bit cleared).
+def immSExt32Low16Zero : PatLeaf<(imm), [{
+  int64_t Val = N->getSExtValue();
+  return isInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// Zero-extended 32-bit unsigned int with lower 16-bit cleared.
+def immZExt32Low16Zero : PatLeaf<(imm), [{
+  uint64_t Val = N->getZExtValue();
+  return isUInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// Note immediate fits as a 32 bit signed extended on target immediate.
+def immSExt32  : PatLeaf<(imm), [{ return isInt<32>(N->getSExtValue()); }]>;
+
+// Note immediate fits as a 32 bit zero extended on target immediate.
+def immZExt32  : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+
+def immZExt5Plus1 : PatLeaf<(imm), [{
+  return isUInt<5>(N->getZExtValue() - 1);
+}]>;
+def immZExt5Plus32 : PatLeaf<(imm), [{
+  return isUInt<5>(N->getZExtValue() - 32);
+}]>;
+def immZExt5Plus33 : PatLeaf<(imm), [{
+  return isUInt<5>(N->getZExtValue() - 33);
+}]>;
+
+// True if (N + 1) fits in 16-bit field.
+def immSExt16Plus1 : PatLeaf<(imm), [{
+  return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
+}]>;
+
+// Mips Address Mode! SDNode frameindex could possibily be a match
+// since load and store instructions from stack used it.
+def addr :
+  ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>;
+
+def addrRegImm :
+  ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
+
+def addrDefault :
+  ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
+
+def addrimm10 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10", [frameindex]>;
+def addrimm10lsl1 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl1",
+                                   [frameindex]>;
+def addrimm10lsl2 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl2",
+                                   [frameindex]>;
+def addrimm10lsl3 : ComplexPattern<iPTR, 2, "selectIntAddrSImm10Lsl3",
+                                   [frameindex]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 3 register operands.
+class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR, opstr> {
+  let isCommutable = isComm;
+  let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rd = $rs";
+}
+
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator imm_type = null_frag,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))],
+         Itin, FrmI, opstr> {
+  let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rs = $rt";
+}
+
+// Arithmetic Multiply ADD/SUB
+class MArithR<string opstr, InstrItinClass itin, bit isComm = 0> :
+  InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+         !strconcat(opstr, "\t$rs, $rt"), [], itin, FrmR, opstr> {
+  let Defs = [HI0, LO0];
+  let Uses = [HI0, LO0];
+  let isCommutable = isComm;
+}
+
+//  Logical
+class LogicNOR<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], II_NOR, FrmR, opstr> {
+  let isCommutable = 1;
+}
+
+// Shifts
+class shift_rotate_imm<string opstr, Operand ImmOpnd,
+                       RegisterOperand RO, InstrItinClass itin,
+                       SDPatternOperator OpNode = null_frag,
+                       SDPatternOperator PF = null_frag> :
+  InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+         !strconcat(opstr, "\t$rd, $rt, $shamt"),
+         [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rd";
+}
+
+class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
+                       SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RO:$rd), (ins RO:$rt, GPR32Opnd:$rs),
+         !strconcat(opstr, "\t$rd, $rt, $rs"),
+         [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR,
+         opstr>;
+
+// Load Upper Immediate
+class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
+  InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
+         [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove {
+  let hasSideEffects = 0;
+  let isReMaterializable = 1;
+}
+
+// Memory Load/Store
+class LoadMemory<string opstr, DAGOperand RO, DAGOperand MO,
+                 SDPatternOperator OpNode = null_frag,
+                 InstrItinClass Itin = NoItinerary,
+                 ComplexPattern Addr = addr> :
+  InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+           InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  LoadMemory<opstr, RO, mem, OpNode, Itin, Addr>;
+
+class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO,
+            SDPatternOperator OpNode = null_frag,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+}
+
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr,
+            DAGOperand MO = mem> :
+  StoreMemory<opstr, RO, MO, OpNode, Itin, Addr>;
+
+// Load/Store Left/Right
+let canFoldAsLoad = 1 in
+class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                    InstrItinClass Itin> :
+  InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                     InstrItinClass Itin> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
+  let DecoderMethod = "DecodeMem";
+}
+
+// COP2 Load/Store
+class LW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem_simm16:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem2";
+  let mayLoad = 1;
+}
+
+class SW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem_simm16:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem2";
+  let mayStore = 1;
+}
+
+// COP3 Load/Store
+class LW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem3";
+  let mayLoad = 1;
+}
+
+class SW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem3";
+  let mayStore = 1;
+}
+
+// Conditional Branch
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+              RegisterOperand RO, bit DelaySlot = 1> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $rt, $offset"),
+         [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], II_BCC,
+         FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = DelaySlot;
+  let Defs = [AT];
+  bit isCTI = 1;
+}
+
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+                  RegisterOperand RO, bit DelaySlot = 1> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $offset"),
+         [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], II_BCCZ,
+         FrmI, opstr> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = DelaySlot;
+  let Defs = [AT];
+  bit isCTI = 1;
+}
+
+// SetCC
+class SetCC_R<string opstr, PatFrag cond_op, RegisterOperand RO> :
+  InstSE<(outs GPR32Opnd:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set GPR32Opnd:$rd, (cond_op RO:$rs, RO:$rt))],
+         II_SLT_SLTU, FrmR, opstr>;
+
+class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
+              RegisterOperand RO>:
+  InstSE<(outs GPR32Opnd:$rt), (ins RO:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set GPR32Opnd:$rt, (cond_op RO:$rs, imm_type:$imm16))],
+         II_SLTI_SLTIU, FrmI, opstr>;
+
+// Jump
+class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
+             SDPatternOperator targetoperator, string bopstr> :
+  InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+         [(operator targetoperator:$target)], II_J, FrmJ, bopstr> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let DecoderMethod = "DecodeJumpTarget";
+  let Defs = [AT];
+  bit isCTI = 1;
+}
+
+// Unconditional branch
+class UncondBranch<Instruction BEQInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>,
+  PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let AdditionalPredicates = [RelocPIC];
+  let Defs = [AT];
+  bit isCTI = 1;
+}
+
+// Base class for indirect branch and return instruction classes.
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1, isCTI = 1 in
+class JumpFR<string opstr, RegisterOperand RO,
+             SDPatternOperator operator = null_frag>:
+  InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR,
+         FrmR, opstr>;
+
+// Indirect branch
+class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1, isCTI=1, Defs = [RA] in {
+  class JumpLink<string opstr, DAGOperand opnd> :
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> {
+    let DecoderMethod = "DecodeJumpTarget";
+  }
+
+  class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
+                          Register RetReg, RegisterOperand ResRO = RO>:
+    PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>,
+    PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
+
+  class JumpLinkReg<string opstr, RegisterOperand RO>:
+    InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+           [], II_JALR, FrmR, opstr>;
+
+  class BGEZAL_FT<string opstr, DAGOperand opnd,
+                  RegisterOperand RO, bit DelaySlot = 1> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> {
+    let hasDelaySlot = DelaySlot;
+  }
+
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
+    hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT] in {
+  class TailCall<Instruction JumpInst, DAGOperand Opnd> :
+    PseudoSE<(outs), (ins calltarget:$target), [], II_J>,
+    PseudoInstExpansion<(JumpInst Opnd:$target)>;
+
+  class TailCallReg<RegisterOperand RO> :
+    MipsPseudo<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>;
+}
+
+class BAL_BR_Pseudo<Instruction RealInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>,
+  PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+  bit isCTI = 1;
+}
+
+let isCTI = 1 in {
+// Syscall
+class SYS_FT<string opstr, Operand ImmOp, InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins ImmOp:$code_),
+         !strconcat(opstr, "\t$code_"), [], itin, FrmI, opstr>;
+// Break
+class BRK_FT<string opstr> :
+  InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
+         !strconcat(opstr, "\t$code_1, $code_2"), [], II_BREAK,
+         FrmOther, opstr>;
+
+// (D)Eret
+class ER_FT<string opstr, InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins),
+         opstr, [], itin, FrmOther, opstr>;
+
+// Wait
+class WAIT_FT<string opstr> :
+  InstSE<(outs), (ins), opstr, [], II_WAIT, FrmOther, opstr>;
+}
+
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO,
+             InstrItinClass itin = NoItinerary> :
+  InstSE<(outs RO:$rt), (ins),
+         !strconcat(opstr, "\t$rt"), [], itin, FrmOther, opstr>;
+
+// Sync
+let hasSideEffects = 1 in
+class SYNC_FT<string opstr> :
+  InstSE<(outs), (ins uimm5:$stype), "sync $stype",
+         [(MipsSync immZExt5:$stype)], II_SYNC, FrmOther, opstr>;
+
+class SYNCI_FT<string opstr> :
+  InstSE<(outs), (ins mem_simm16:$addr), !strconcat(opstr, "\t$addr"), [],
+         II_SYNCI, FrmOther, opstr> {
+  let hasSideEffects = 1;
+  let DecoderMethod = "DecodeSyncI";
+}
+
+let hasSideEffects = 1, isCTI = 1 in {
+class TEQ_FT<string opstr, RegisterOperand RO, Operand ImmOp,
+             InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, ImmOp:$code_),
+         !strconcat(opstr, "\t$rs, $rt, $code_"), [], itin, FrmI, opstr>;
+
+class TEQI_FT<string opstr, RegisterOperand RO,
+              InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins RO:$rs, simm16:$imm16),
+         !strconcat(opstr, "\t$rs, $imm16"), [], itin, FrmOther, opstr>;
+}
+
+// Mul, Div
+class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
+           list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rs, $rt"), [],
+         itin, FrmR, opstr> {
+  let isCommutable = 1;
+  let Defs = DefRegs;
+  let hasSideEffects = 0;
+}
+
+// Pseudo multiply/divide instruction with explicit accumulator register
+// operands.
+class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
+                    SDPatternOperator OpNode, InstrItinClass Itin,
+                    bit IsComm = 1, bit HasSideEffects = 0,
+                    bit UsesCustomInserter = 0> :
+  PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
+           [(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
+  PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
+  let isCommutable = IsComm;
+  let hasSideEffects = HasSideEffects;
+  let usesCustomInserter = UsesCustomInserter;
+}
+
+// Pseudo multiply add/sub instruction with explicit accumulator register
+// operands.
+class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode,
+                    InstrItinClass itin>
+  : PseudoSE<(outs ACC64:$ac),
+             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+             [(set ACC64:$ac,
+              (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
+             itin>,
+    PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
+  string Constraints = "$acin = $ac";
+}
+
+class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
+          list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
+         [], itin, FrmR, opstr> {
+  let Defs = DefRegs;
+}
+
+// Move from Hi/Lo
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+  : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+             [(set DstRC:$rd, (OpNode SrcRC:$hilo))], II_MFHI_MFLO>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+  InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], II_MFHI_MFLO,
+         FrmR, opstr> {
+  let Uses = [UseReg];
+  let hasSideEffects = 0;
+}
+
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+  : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+             [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))],
+             II_MTHI_MTLO>;
+
+class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
+  InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], II_MTHI_MTLO,
+  FrmR, opstr> {
+  let Defs = DefRegs;
+  let hasSideEffects = 0;
+}
+
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, addr:$addr)], II_ADDIU, FrmI,
+         !strconcat(opstr, "_lea")> {
+  let isCodeGenOnly = 1;
+  let hasNoSchedulingInfo = 1;
+  let DecoderMethod = "DecodeMem";
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading0<string opstr, RegisterOperand RO,
+                  InstrItinClass itin = NoItinerary>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz RO:$rs))], itin, FrmR, opstr>;
+
+class CountLeading1<string opstr, RegisterOperand RO,
+                  InstrItinClass itin = NoItinerary>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz (not RO:$rs)))], itin, FrmR, opstr>;
+
+// Sign Extend in Register.
+class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
+                   InstrItinClass itin> :
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
+         [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>;
+
+// Subword Swap
+class SubwordSwap<string opstr, RegisterOperand RO,
+                  InstrItinClass itin = NoItinerary>:
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin,
+         FrmR, opstr> {
+  let hasSideEffects = 0;
+}
+
+// Read Hardware
+class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
+  InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
+         II_RDHWR, FrmR, "rdhwr">;
+
+// Ext and Ins
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              Operand SizeOpnd, PatFrag PosImm, PatFrag SizeImm,
+              SDPatternOperator Op = null_frag> :
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (Op RO:$rs, PosImm:$pos, SizeImm:$size))], II_EXT,
+         FrmR, opstr>, ISA_MIPS32R2;
+
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              Operand SizeOpnd, SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size, RO:$src),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+         II_INS, FrmR, opstr>, ISA_MIPS32R2 {
+  let Constraints = "$src = $rt";
+}
+
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
+
+class LLBase<string opstr, RegisterOperand RO, DAGOperand MO = mem> :
+  InstSE<(outs RO:$rt), (ins MO:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [], II_LL, FrmI, opstr> {
+  let DecoderMethod = "DecodeMem";
+  let mayLoad = 1;
+}
+
+class SCBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], II_SC, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class MFC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+             InstrItinClass itin> :
+  InstSE<(outs RO:$rt), (ins RD:$rd, uimm3:$sel),
+         !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+
+class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
+             InstrItinClass itin> :
+  InstSE<(outs RO:$rd), (ins RD:$rt, uimm3:$sel),
+         !strconcat(asmstr, "\t$rt, $rd, $sel"), [], itin, FrmFR>;
+
+class TrapBase<Instruction RealInst>
+  : PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
+    PseudoInstExpansion<(RealInst 0, 0)> {
+  let isBarrier = 1;
+  let isTerminator = 1;
+  let isCodeGenOnly = 1;
+  let isCTI = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// Return RA.
+let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
+  let hasDelaySlot=1 in
+  def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+
+  let hasSideEffects=1 in
+  def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>;
+}
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
+                                  [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, GPR32>;
+  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, GPR32>;
+  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, GPR32>;
+  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, GPR32>;
+  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, GPR32>;
+  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, GPR32>;
+  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, GPR32>;
+  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, GPR32>;
+  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, GPR32>;
+  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, GPR32>;
+  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, GPR32>;
+  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, GPR32>;
+  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, GPR32>;
+  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, GPR32>;
+  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, GPR32>;
+  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, GPR32>;
+  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, GPR32>;
+  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, GPR32>;
+  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, GPR32>;
+
+  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
+}
+
+/// Pseudo instructions for loading and storing accumulator registers.
+let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
+  def LOAD_ACC64  : Load<"", ACC64>;
+  def STORE_ACC64 : Store<"", ACC64>;
+}
+
+// We need these two pseudo instructions to avoid offset calculation for long
+// branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: lui $dst, %hi($tgt - $baltgt)
+def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+let AdditionalPredicates = [NotInMicroMips] in {
+  def ADDiu : MMRel, StdMMR6Rel, ArithLogicI<"addiu", simm16_relaxed, GPR32Opnd,
+                                             II_ADDIU, immSExt16, add>,
+              ADDI_FM<0x9>, IsAsCheapAsAMove;
+
+  def ANDi : MMRel, StdMMR6Rel,
+             ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16, and>,
+             ADDI_FM<0xc>;
+  def ORi  : MMRel, StdMMR6Rel,
+             ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16, or>,
+             ADDI_FM<0xd>;
+  def XORi : MMRel, StdMMR6Rel,
+             ArithLogicI<"xori", uimm16, GPR32Opnd, II_XORI, immZExt16, xor>,
+             ADDI_FM<0xe>;
+}
+def ADDi  : MMRel, ArithLogicI<"addi", simm16_relaxed, GPR32Opnd, II_ADDI>, ADDI_FM<0x8>,
+            ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
+              SLTI_FM<0xa>;
+  def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
+              SLTI_FM<0xb>;
+}
+def LUi   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
+let AdditionalPredicates = [NotInMicroMips] in {
+  /// Arithmetic Instructions (3-Operand, R-Type)
+  def ADDu  : MMRel, StdMMR6Rel, ArithLogicR<"addu", GPR32Opnd, 1, II_ADDU, add>,
+              ADD_FM<0, 0x21>;
+  def SUBu  : MMRel, StdMMR6Rel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
+              ADD_FM<0, 0x23>;
+}
+let Defs = [HI0, LO0] in
+def MUL   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
+            ADD_FM<0x1c, 2>, ISA_MIPS32_NOT_32R6_64R6;
+def ADD   : MMRel, StdMMR6Rel, ArithLogicR<"add", GPR32Opnd, 1, II_ADD>, ADD_FM<0, 0x20>;
+def SUB   : MMRel, StdMMR6Rel, ArithLogicR<"sub", GPR32Opnd, 0, II_SUB>, ADD_FM<0, 0x22>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def SLT   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
+  def SLTu  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+  def AND   : MMRel, StdMMR6Rel, ArithLogicR<"and", GPR32Opnd, 1, II_AND, and>,
+              ADD_FM<0, 0x24>;
+  def OR    : MMRel, StdMMR6Rel, ArithLogicR<"or", GPR32Opnd, 1, II_OR, or>,
+              ADD_FM<0, 0x25>;
+  def XOR   : MMRel, StdMMR6Rel, ArithLogicR<"xor", GPR32Opnd, 1, II_XOR, xor>,
+              ADD_FM<0, 0x26>;
+  def NOR   : MMRel, StdMMR6Rel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
+}
+
+/// Shift Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+def SLL  : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
+                                   immZExt5>, SRA_FM<0, 0>;
+def SRL  : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
+                                   immZExt5>, SRA_FM<2, 0>;
+def SRA  : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
+                                   immZExt5>, SRA_FM<3, 0>;
+def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
+           SRLV_FM<4, 0>;
+def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, II_SRLV, srl>,
+           SRLV_FM<6, 0>;
+def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
+           SRLV_FM<7, 0>;
+}
+
+// Rotate Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
+  def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+                                      immZExt5>,
+              SRA_FM<2, 1>, ISA_MIPS32R2;
+  def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
+              SRLV_FM<6, 1>, ISA_MIPS32R2;
+}
+
+/// Load and Store Instructions
+///  aligned
+def LB  : LoadMemory<"lb", GPR32Opnd, mem_simm16, sextloadi8, II_LB>, MMRel,
+          LW_FM<0x20>;
+def LBu : LoadMemory<"lbu", GPR32Opnd, mem_simm16, zextloadi8, II_LBU,
+                     addrDefault>, MMRel, LW_FM<0x24>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def LH  : LoadMemory<"lh", GPR32Opnd, mem_simm16, sextloadi16, II_LH,
+                       addrDefault>, MMRel, LW_FM<0x21>;
+  def LHu : LoadMemory<"lhu", GPR32Opnd, mem_simm16, zextloadi16, II_LHU>,
+            MMRel, LW_FM<0x25>;
+  def LW  : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel,
+            LW_FM<0x23>;
+}
+def SB  : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel,
+          LW_FM<0x28>;
+def SH  : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SW  : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
+}
+
+/// load/store left/right
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+          ISA_MIPS1_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+// COP2 Memory Instructions
+def LWC2 : StdMMR6Rel, LW_FT2<"lwc2", COP2Opnd, II_LWC2, load>, LW_FM<0x32>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def SWC2 : StdMMR6Rel, SW_FT2<"swc2", COP2Opnd, II_SWC2, store>,
+           LW_FM<0x3a>, ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : StdMMR6Rel, LW_FT2<"ldc2", COP2Opnd, II_LDC2, load>, LW_FM<0x36>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : StdMMR6Rel, SW_FT2<"sdc2", COP2Opnd, II_SDC2, store>,
+           LW_FM<0x3e>, ISA_MIPS2_NOT_32R6_64R6;
+
+// COP3 Memory Instructions
+let DecoderNamespace = "COP3_" in {
+  def LWC3 : LW_FT3<"lwc3", COP3Opnd, II_LWC3, load>, LW_FM<0x33>;
+  def SWC3 : SW_FT3<"swc3", COP3Opnd, II_SWC3, store>, LW_FM<0x3b>;
+  def LDC3 : LW_FT3<"ldc3", COP3Opnd, II_LDC3, load>, LW_FM<0x37>,
+             ISA_MIPS2;
+  def SDC3 : SW_FT3<"sdc3", COP3Opnd, II_SDC3, store>, LW_FM<0x3f>,
+             ISA_MIPS2;
+}
+
+  def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS2;
+  def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd, uimm10, II_TEQ>, TEQ_FM<0x34>, ISA_MIPS2;
+  def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd, uimm10, II_TGE>, TEQ_FM<0x30>, ISA_MIPS2;
+  def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd, uimm10, II_TGEU>, TEQ_FM<0x31>, ISA_MIPS2;
+  def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd, uimm10, II_TLT>, TEQ_FM<0x32>, ISA_MIPS2;
+  def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd, uimm10, II_TLTU>, TEQ_FM<0x33>, ISA_MIPS2;
+  def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd, uimm10, II_TNE>, TEQ_FM<0x36>, ISA_MIPS2;
+}
+
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd, II_TEQI>, TEQI_FM<0xc>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd, II_TGEI>, TEQI_FM<0x8>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd, II_TGEIU>, TEQI_FM<0x9>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd, II_TLTI>, TEQI_FM<0xa>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd, II_TTLTIU>, TEQI_FM<0xb>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd, II_TNEI>, TEQI_FM<0xe>,
+           ISA_MIPS2_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+def BREAK : MMRel, StdMMR6Rel, BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : MMRel, SYS_FT<"syscall", uimm20, II_SYSCALL>, SYS_FM<0xc>;
+}
+def TRAP : TrapBase<BREAK>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def SDBBP : MMRel, SYS_FT<"sdbbp", uimm20, II_SDBBP>, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def ERET : MMRel, ER_FT<"eret", II_ERET>, ER_FM<0x18, 0x0>, INSN_MIPS3_32;
+  def ERETNC : MMRel, ER_FT<"eretnc", II_ERETNC>, ER_FM<0x18, 0x1>, ISA_MIPS32R5;
+  def DERET : MMRel, ER_FT<"deret", II_DERET>, ER_FM<0x1f, 0x0>, ISA_MIPS32;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd, II_EI>, EI_FM<1>, ISA_MIPS32R2;
+  def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd, II_DI>, EI_FM<0>, ISA_MIPS32R2;
+}
+
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def WAIT : WAIT_FT<"wait">, WAIT_FM;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, PTR_32, ISA_MIPS2_NOT_32R6_64R6;
+}
+
+/// Jump and Branch Instructions
+def J       : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
+              AdditionalRequires<[RelocNotPIC]>, IsBranch;
+def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>, ISA_MIPS1_NOT_32R6_64R6; 
+def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BEQL    : MMRel, CBranch<"beql", brtarget, seteq, GPR32Opnd, 0>,
+              BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
+def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BNEL    : MMRel, CBranch<"bnel", brtarget, setne, GPR32Opnd, 0>,
+              BEQ_FM<21>, ISA_MIPS2_NOT_32R6_64R6;
+def BGEZ    : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+              BGEZ_FM<1, 1>;
+def BGEZL   : MMRel, CBranchZero<"bgezl", brtarget, setge, GPR32Opnd, 0>,
+              BGEZ_FM<1, 3>, ISA_MIPS2_NOT_32R6_64R6;
+def BGTZ    : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+              BGEZ_FM<7, 0>;
+def BGTZL   : MMRel, CBranchZero<"bgtzl", brtarget, setgt, GPR32Opnd, 0>,
+              BGEZ_FM<23, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BLEZ    : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+              BGEZ_FM<6, 0>;
+def BLEZL   : MMRel, CBranchZero<"blezl", brtarget, setle, GPR32Opnd, 0>,
+              BGEZ_FM<22, 0>, ISA_MIPS2_NOT_32R6_64R6;
+def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+              BGEZ_FM<1, 0>;
+def BLTZL   : MMRel, CBranchZero<"bltzl", brtarget, setlt, GPR32Opnd, 0>,
+              BGEZ_FM<1, 2>, ISA_MIPS2_NOT_32R6_64R6;
+def B       : UncondBranch<BEQ>;
+
+def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+  def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+}
+
+def JALX : MMRel, JumpLink<"jalx", calltarget>, FJ<0x1D>,
+           ISA_MIPS32_NOT_32R6_64R6;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
+             ISA_MIPS1_NOT_32R6_64R6;
+def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd, 0>,
+              BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
+             ISA_MIPS1_NOT_32R6_64R6;
+def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd, 0>,
+              BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
+def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+
+let Predicates = [NotInMicroMips] in { 
+  def TAILCALL : TailCall<J, jmptarget>;
+}
+
+def TAILCALLREG : TailCallReg<GPR32Opnd>;
+
+// Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64
+// then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA.
+class PseudoIndirectBranchBase<RegisterOperand RO> :
+    MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)],
+               II_IndirectBranchPseudo> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  bit isCTI = 1;
+}
+
+def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>;
+
+// Return instructions are matched as a RetRA instruction, then are expanded
+// into PseudoReturn/PseudoReturn64 after register allocation. Finally,
+// MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the
+// ISA.
+class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs),
+                                                        [], II_ReturnPseudo> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let isReturn = 1;
+  let isCodeGenOnly = 1;
+  let hasCtrlDep = 1;
+  let hasExtraSrcRegAllocReq = 1;
+  bit isCTI = 1;
+}
+
+def PseudoReturn : PseudoReturnBase<GPR32Opnd>;
+
+// Exception handling related node and instructions.
+// The conversion sequence is:
+// ISD::EH_RETURN -> MipsISD::EH_RETURN ->
+// MIPSeh_return -> (stack change + indirect branch)
+//
+// MIPSeh_return takes the place of regular return instruction
+// but takes two arguments (V1, V0) which are used for storing
+// the offset and return address respectively.
+def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+
+def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1, isCTI = 1 in {
+  def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
+                                [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+  def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
+                                                GPR64:$dst),
+                                [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
+}
+
+/// Multiply and Divide Instructions.
+def MULT  : MMRel, Mult<"mult", II_MULT, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x18>, ISA_MIPS1_NOT_32R6_64R6;
+def MULTu : MMRel, Mult<"multu", II_MULTU, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x19>, ISA_MIPS1_NOT_32R6_64R6;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def SDIV  : MMRel, Div<"div", II_DIV, GPR32Opnd, [HI0, LO0]>,
+              MULT_FM<0, 0x1a>, ISA_MIPS1_NOT_32R6_64R6;
+  def UDIV  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
+              MULT_FM<0, 0x1b>, ISA_MIPS1_NOT_32R6_64R6;
+}
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>,
+           ISA_MIPS1_NOT_32R6_64R6;
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>,
+           ISA_MIPS1_NOT_32R6_64R6;
+}
+
+/// Sign Ext In Register Instructions.
+def SEB : MMRel, StdMMR6Rel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+          SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+def SEH : MMRel, StdMMR6Rel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+          SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
+
+/// Count Leading
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd, II_CLZ>, CLO_FM<0x20>,
+          ISA_MIPS32_NOT_32R6_64R6;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd, II_CLO>, CLO_FM<0x21>,
+          ISA_MIPS32_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  /// Word Swap Bytes Within Halfwords
+  def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>,
+             ISA_MIPS32R2;
+}
+
+/// No operation.
+def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+
+// FrameIndexes are legalized when they are operands from load/store
+// instructions. The same not happens for stack address copies, so an
+// add op with mem ComplexPattern is used and the stack address copy
+// can be matched. It's similar to Sparc LEA_ADDRi
+def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
+
+// MADD*/MSUB*
+def MADD  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MSUB  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>,
+            ISA_MIPS32_NOT_32R6_64R6;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>,
+            ISA_MIPS32_NOT_32R6_64R6;
+
+let AdditionalPredicates = [NotDSP] in {
+def PseudoMULT  : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>, ISA_MIPS1_NOT_32R6_64R6;
+def PseudoMADD  : MAddSubPseudo<MADD, MipsMAdd, II_MADD>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu, II_MADDU>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUB  : MAddSubPseudo<MSUB, MipsMSub, II_MSUB>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu, II_MSUBU>,
+                  ISA_MIPS32_NOT_32R6_64R6;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV,
+                                 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+  def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
+                                 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
+  def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+  // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction
+  def EXT : MMRel, StdMMR6Rel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1,
+                                       immZExt5, immZExt5Plus1, MipsExt>,
+            EXT_FM<0>;
+  def INS : MMRel, StdMMR6Rel, InsBase<"ins", GPR32Opnd, uimm5,
+                                       uimm5_inssize_plus1, MipsIns>,
+            EXT_FM<4>;
+}
+/// Move Control Registers From/To CPU Registers
+let AdditionalPredicates = [NotInMicroMips] in {
+  def MTC0 : MTC3OP<"mtc0", COP0Opnd, GPR32Opnd, II_MTC0>, MFC3OP_FM<0x10, 4>,
+             ISA_MIPS32;
+  def MFC0 : MFC3OP<"mfc0", GPR32Opnd, COP0Opnd, II_MFC0>, MFC3OP_FM<0x10, 0>,
+             ISA_MIPS32;
+}
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd, COP2Opnd, II_MFC2>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd, II_MTC2>, MFC3OP_FM<0x12, 4>;
+
+class Barrier<string asmstr, InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+
+def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop", II_SSNOP>, BARRIER_FM<1>;
+def EHB : MMRel, Barrier<"ehb", II_EHB>, BARRIER_FM<3>;
+
+let isCTI = 1 in
+def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause", II_PAUSE>, BARRIER_FM<5>,
+            ISA_MIPS32R2;
+
+// JR_HB and JALR_HB are defined here using the new style naming
+// scheme because some of this code is shared with Mips32r6InstrInfo.td
+// and because of that it doesn't follow the naming convention of the
+// rest of the file. To avoid a mixture of old vs new style, the new
+// style was chosen.
+class JR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rs");
+  list<dag> Pattern = [];
+}
+
+class JALR_HB_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  list<dag> Pattern = [];
+}
+
+class JR_HB_DESC : InstSE<(outs), (ins), "", [], II_JR_HB, FrmJ>,
+                   JR_HB_DESC_BASE<"jr.hb", GPR32Opnd> {
+  let isBranch=1;
+  let isIndirectBranch=1;
+  let hasDelaySlot=1;
+  let isTerminator=1;
+  let isBarrier=1;
+  bit isCTI = 1;
+}
+
+class JALR_HB_DESC : InstSE<(outs), (ins), "", [], II_JALR_HB, FrmJ>,
+                     JALR_HB_DESC_BASE<"jalr.hb", GPR32Opnd> {
+  let isIndirectBranch=1;
+  let hasDelaySlot=1;
+  bit isCTI = 1;
+}
+
+class JR_HB_ENC : JR_HB_FM<8>;
+class JALR_HB_ENC : JALR_HB_FM<9>;
+
+def JR_HB : JR_HB_DESC, JR_HB_ENC, ISA_MIPS32_NOT_32R6_64R6;
+def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
+
+class TLB<string asmstr, InstrItinClass itin = NoItinerary> :
+  InstSE<(outs), (ins), asmstr, [], itin, FrmOther, asmstr>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def TLBP : MMRel, TLB<"tlbp", II_TLBP>, COP0_TLB_FM<0x08>;
+def TLBR : MMRel, TLB<"tlbr", II_TLBR>, COP0_TLB_FM<0x01>;
+def TLBWI : MMRel, TLB<"tlbwi", II_TLBWI>, COP0_TLB_FM<0x02>;
+def TLBWR : MMRel, TLB<"tlbwr", II_TLBWR>, COP0_TLB_FM<0x06>;
+}
+class CacheOp<string instr_asm, Operand MemOpnd,
+              InstrItinClass itin = NoItinerary> :
+    InstSE<(outs), (ins  MemOpnd:$addr, uimm5:$hint),
+           !strconcat(instr_asm, "\t$hint, $addr"), [], itin, FrmOther,
+           instr_asm> {
+  let DecoderMethod = "DecodeCacheOp";
+}
+
+def CACHE : MMRel, CacheOp<"cache", mem, II_CACHE>, CACHEOP_FM<0b101111>,
+            INSN_MIPS3_32_NOT_32R6_64R6;
+def PREF :  MMRel, CacheOp<"pref", mem, II_PREF>, CACHEOP_FM<0b110011>,
+            INSN_MIPS3_32_NOT_32R6_64R6;
+
+def ROL : MipsAsmPseudoInst<(outs),
+                            (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+                            "rol\t$rs, $rt, $rd">;
+def ROLImm : MipsAsmPseudoInst<(outs),
+                               (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+                               "rol\t$rs, $rt, $imm">;
+def : MipsInstAlias<"rol $rd, $rs",
+                    (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"rol $rd, $imm",
+                    (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def ROR : MipsAsmPseudoInst<(outs),
+                            (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+                            "ror\t$rs, $rt, $rd">;
+def RORImm : MipsAsmPseudoInst<(outs),
+                               (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+                               "ror\t$rs, $rt, $imm">;
+def : MipsInstAlias<"ror $rd, $rs",
+                    (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"ror $rd, $imm",
+                    (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>;
+
+def DROL : MipsAsmPseudoInst<(outs),
+                             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+                             "drol\t$rs, $rt, $rd">, ISA_MIPS64;
+def DROLImm : MipsAsmPseudoInst<(outs),
+                                (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+                                "drol\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $rs",
+                    (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"drol $rd, $imm",
+                    (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
+def DROR : MipsAsmPseudoInst<(outs),
+                             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd),
+                             "dror\t$rs, $rt, $rd">, ISA_MIPS64;
+def DRORImm : MipsAsmPseudoInst<(outs),
+                                (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm),
+                                "dror\t$rs, $rt, $imm">, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $rs",
+                    (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64;
+def : MipsInstAlias<"dror $rd, $imm",
+                    (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64;
+
+def ABSMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+                                 "abs\t$rd, $rs">;
+
+def SEQMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                 (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                                 "seq $rd, $rs, $rt">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"seq $rd, $rs",
+                    (SEQMacro GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>,
+                    NOT_ASE_CNMIPS;
+
+def SEQIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                  (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+                                  "seq $rd, $rs, $imm">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"seq $rd, $imm",
+                    (SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
+                    NOT_ASE_CNMIPS;
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : MipsInstAlias<"move $dst, $src",
+                    (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+      GPR_32 {
+  let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"move $dst, $src",
+                    (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
+      GPR_32 {
+  let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+          "addu $rs, $rt, $imm",
+          (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+          "addu $rs, $imm",
+          (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+          "add $rs, $rt, $imm",
+          (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+          "add $rs, $imm",
+          (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<
+          "and $rs, $rt, $imm",
+          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<
+          "and $rs, $imm",
+          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+let Predicates = [NotInMicroMips] in {
+def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+}
+def : MipsInstAlias<"jalr.hb $rs", (JALR_HB RA, GPR32Opnd:$rs), 1>, ISA_MIPS32;
+def : MipsInstAlias<"neg $rt, $rs",
+                    (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"neg $rt",
+                    (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
+def : MipsInstAlias<"negu $rt, $rs",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"negu $rt",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<
+          "sgt $rd, $rs, $rt",
+          (SLT GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgt $rs, $rt",
+          (SLT GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgtu $rd, $rs, $rt",
+          (SLTu GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "sgtu $$rs, $rt",
+          (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<
+          "slt $rs, $rt, $imm",
+          (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "sltu $rt, $rs, $imm",
+          (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "and $rs, $rt, $imm",
+          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "and $rs, $imm",
+          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "xor $rs, $rt, $imm",
+          (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "xor $rs, $imm",
+          (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "or $rs, $rt, $imm",
+          (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "or $rs, $imm",
+          (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+  def : MipsInstAlias<
+          "not $rt, $rs",
+          (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+  def : MipsInstAlias<
+          "not $rt",
+          (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
+  def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+}
+def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
+def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, COP2Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 COP2Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+}
+def : MipsInstAlias<"bnez $rs,$offset",
+                    (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"bnezl $rs,$offset",
+                    (BNEL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqz $rs,$offset",
+                    (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqzl $rs,$offset",
+                    (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+}
+
+def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
+def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
+  def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"teq $rs, $rt",
+                      (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+  def : MipsInstAlias<"tge $rs, $rt",
+                      (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+  def : MipsInstAlias<"tgeu $rs, $rt",
+                      (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+  def : MipsInstAlias<"tlt $rs, $rt",
+                      (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+  def : MipsInstAlias<"tltu $rs, $rt",
+                      (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+  def : MipsInstAlias<"tne $rs, $rt",
+                      (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+}
+def : MipsInstAlias<"sub, $rd, $rs, $imm",
+                    (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
+                          InvertedImOperand:$imm), 0>, ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"sub $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm),
+                    0>, ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"subu, $rd, $rs, $imm",
+                    (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs,
+                           InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                             InvertedImOperand:$imm), 0>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsInstAlias<"sll $rd, $rt, $rs",
+                      (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"sra $rd, $rt, $rs",
+                      (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"srl $rd, $rt, $rs",
+                      (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+  def : MipsInstAlias<"sll $rd, $rt",
+                      (SLLV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"sra $rd, $rt",
+                      (SRAV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"srl $rd, $rt",
+                      (SRLV GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rt), 0>;
+  def : MipsInstAlias<"seh $rd", (SEH GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                     ISA_MIPS32R2;
+  def : MipsInstAlias<"seb $rd", (SEB GPR32Opnd:$rd, GPR32Opnd:$rd), 0>,
+                     ISA_MIPS32R2;
+}
+def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
+def : MipsInstAlias<"sync",
+                    (SYNC 0), 1>, ISA_MIPS2;
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// We use uimm32_coerced to accept a 33 bit signed number that is rendered into
+// a 32 bit number.
+class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadImm32 : LoadImmediate32<"li", uimm32_coerced, GPR32Opnd>;
+
+class LoadAddressFromReg32<string instr_asm, Operand MemOpnd,
+                           RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
+                     !strconcat(instr_asm, "\t$rt, $addr")> ;
+def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>;
+
+class LoadAddressFromImm32<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadAddrImm32 : LoadAddressFromImm32<"la", i32imm, GPR32Opnd>;
+
+def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
+                      "jal\t$rd, $rs"> ;
+def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
+                      "jal\t$rs"> ;
+
+def NORImm : MipsAsmPseudoInst<
+                 (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
+                 "nor\t$rs, $rt, $imm"> ;
+
+let hasDelaySlot = 1, isCTI = 1 in {
+def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+                               (ins imm64:$imm64, brtarget:$offset),
+                               "bne\t$rt, $imm64, $offset">;
+def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+                               (ins imm64:$imm64, brtarget:$offset),
+                               "beq\t$rt, $imm64, $offset">;
+
+class CondBranchPseudo<string instr_asm> :
+  MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt,
+                                 brtarget:$offset),
+                    !strconcat(instr_asm, "\t$rs, $rt, $offset")>;
+}
+
+def BLT : CondBranchPseudo<"blt">;
+def BLE : CondBranchPseudo<"ble">;
+def BGE : CondBranchPseudo<"bge">;
+def BGT : CondBranchPseudo<"bgt">;
+def BLTU : CondBranchPseudo<"bltu">;
+def BLEU : CondBranchPseudo<"bleu">;
+def BGEU : CondBranchPseudo<"bgeu">;
+def BGTU : CondBranchPseudo<"bgtu">;
+def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+let isCTI = 1 in
+class CondBranchImmPseudo<string instr_asm> :
+  MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
+                    !strconcat(instr_asm, "\t$rs, $imm, $offset")>;
+
+def BLTImmMacro  : CondBranchImmPseudo<"blt">;
+def BLEImmMacro  : CondBranchImmPseudo<"ble">;
+def BGEImmMacro  : CondBranchImmPseudo<"bge">;
+def BGTImmMacro  : CondBranchImmPseudo<"bgt">;
+def BLTUImmMacro : CondBranchImmPseudo<"bltu">;
+def BLEUImmMacro : CondBranchImmPseudo<"bleu">;
+def BGEUImmMacro : CondBranchImmPseudo<"bgeu">;
+def BGTUImmMacro : CondBranchImmPseudo<"bgtu">;
+def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6;
+def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6;
+def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6;
+def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
+
+// FIXME: Predicates are removed because instructions are matched regardless of
+// predicates, because PredicateControl was not in the hierarchy. This was
+// done to emit more precise error message from expansion function.
+// Once the tablegen-erated errors are made better, this needs to be fixed and
+// predicates needs to be restored.
+
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                  (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                                  "div\t$rd, $rs, $rt">,
+                ISA_MIPS1_NOT_32R6_64R6;
+def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                  (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                                  "divu\t$rd, $rs, $rt">,
+                ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+                                               GPR32Opnd:$rs), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+                                                GPR32Opnd:$rs), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                                   "ddiv\t$rd, $rs, $rt">,
+                 ISA_MIPS64_NOT_64R6;
+def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+                                   "ddivu\t$rd, $rs, $rt">,
+                 ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+                                                 GPR32Opnd:$rs), 0>,
+      ISA_MIPS64_NOT_64R6;
+def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+                                                  GPR32Opnd:$rs), 0>,
+      ISA_MIPS64_NOT_64R6;
+
+def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+                            "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+                             "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+                            "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Ush : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+                            "ush\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def Usw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
+                            "usw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
+
+def LDMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+                                (ins mem_simm16:$addr), "ld $rt, $addr">,
+                                ISA_MIPS1_NOT_MIPS3;
+def SDMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
+                                (ins mem_simm16:$addr), "sd $rt, $addr">,
+                                ISA_MIPS1_NOT_MIPS3;
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Load/store pattern templates.
+class LoadRegImmPat<Instruction LoadInst, ValueType ValTy, PatFrag Node> :
+  MipsPat<(ValTy (Node addrRegImm:$a)), (LoadInst addrRegImm:$a)>;
+
+class StoreRegImmPat<Instruction StoreInst, ValueType ValTy> :
+  MipsPat<(store ValTy:$v, addrRegImm:$a), (StoreInst ValTy:$v, addrRegImm:$a)>;
+
+// Materialize constants.
+multiclass MaterializeImms<ValueType VT, Register ZEROReg,
+                           Instruction ADDiuOp, Instruction LUiOp,
+                           Instruction ORiOp> {
+
+// Small immediates
+def : MipsPat<(VT immSExt16:$imm), (ADDiuOp ZEROReg, imm:$imm)>;
+def : MipsPat<(VT immZExt16:$imm), (ORiOp ZEROReg, imm:$imm)>;
+
+// Bits 32-16 set, sign/zero extended.
+def : MipsPat<(VT immSExt32Low16Zero:$imm), (LUiOp (HI16 imm:$imm))>;
+
+// Arbitrary immediates
+def : MipsPat<(VT immSExt32:$imm), (ORiOp (LUiOp (HI16 imm:$imm)), (LO16 imm:$imm))>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in
+  defm : MaterializeImms<i32, ZERO, ADDiu, LUi, ORi>;
+
+// Carry MipsPatterns
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+                (SUBu GPR32:$lhs, GPR32:$rhs)>;
+}
+def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
+              (ADDu GPR32:$lhs, GPR32:$rhs)>, ASE_NOT_DSP;
+def : MipsPat<(addc  GPR32:$src, immSExt16:$imm),
+              (ADDiu GPR32:$src, imm:$imm)>, ASE_NOT_DSP;
+
+// Support multiplication for pre-Mips32 targets that don't have
+// the MUL instruction.
+def : MipsPat<(mul GPR32:$lhs, GPR32:$rhs),
+              (PseudoMFLO (PseudoMULT GPR32:$lhs, GPR32:$rhs))>,
+      ISA_MIPS1_NOT_32R6_64R6;
+
+// SYNC
+def : MipsPat<(MipsSync (i32 immz)),
+              (SYNC 0)>, ISA_MIPS2;
+
+// Call
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+              (JAL texternalsym:$dst)>;
+//def : MipsPat<(MipsJmpLink GPR32:$dst),
+//              (JALR GPR32:$dst)>;
+
+// Tail call
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+              (TAILCALL tglobaladdr:$dst)>;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+              (TAILCALL texternalsym:$dst)>;
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
+
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
+              (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
+              (ADDiu GPR32:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
+              (ADDiu GPR32:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
+              (ADDiu GPR32:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
+
+// gp_rel relocs
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+              (ADDiu GPR32:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+              (ADDiu GPR32:$gp, tconstpool:$in)>;
+
+// wrapper_pic
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+      MipsPat<(MipsWrapper RC:$gp, node:$in),
+              (ADDiuOp RC:$gp, node:$in)>;
+
+def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
+def : WrapperPat<tconstpool, ADDiu, GPR32>;
+def : WrapperPat<texternalsym, ADDiu, GPR32>;
+def : WrapperPat<tblockaddress, ADDiu, GPR32>;
+def : WrapperPat<tjumptable, ADDiu, GPR32>;
+def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
+
+let AdditionalPredicates = [NotInMicroMips] in {
+// Mips does not have "not", so we expand our way
+def : MipsPat<(not GPR32:$in),
+              (NOR GPR32Opnd:$in, ZERO)>;
+}
+
+// extended loads
+def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+}
+
+// peepholes
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+
+// brcond patterns
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BEQOp1,
+                      Instruction BNEOp, Instruction SLTOp, Instruction SLTuOp,
+                      Instruction SLTiOp, Instruction SLTiuOp,
+                      Register ZEROReg> {
+def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+              (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+              (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQOp1 (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQOp1 (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQOp1 (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQOp1 (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQOp1 (SLTiOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setugt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQOp1 (SLTiuOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQOp1 (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQOp1 (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond RC:$cond, bb:$dst),
+              (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+let AdditionalPredicates = [NotInMicroMips] in {
+  defm : BrcondPats<GPR32, BEQ, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+}
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+              (BLEZ i32:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+              (BGEZ i32:$lhs, bb:$dst)>;
+
+// setcc patterns
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+                     Instruction SLTuOp, Register ZEROReg> {
+  def : MipsPat<(seteq RC:$lhs, 0),
+                (SLTiuOp RC:$lhs, 1)>;
+  def : MipsPat<(setne RC:$lhs, 0),
+                (SLTuOp ZEROReg, RC:$lhs)>;
+  def : MipsPat<(seteq RC:$lhs, RC:$rhs),
+                (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setne RC:$lhs, RC:$rhs),
+                (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+multiclass SetlePats<RegisterClass RC, Instruction XORiOp, Instruction SLTOp,
+                     Instruction SLTuOp> {
+  def : MipsPat<(setle RC:$lhs, RC:$rhs),
+                (XORiOp (SLTOp RC:$rhs, RC:$lhs), 1)>;
+  def : MipsPat<(setule RC:$lhs, RC:$rhs),
+                (XORiOp (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setgt RC:$lhs, RC:$rhs),
+                (SLTOp RC:$rhs, RC:$lhs)>;
+  def : MipsPat<(setugt RC:$lhs, RC:$rhs),
+                (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+multiclass SetgePats<RegisterClass RC, Instruction XORiOp, Instruction SLTOp,
+                     Instruction SLTuOp> {
+  def : MipsPat<(setge RC:$lhs, RC:$rhs),
+                (XORiOp (SLTOp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, RC:$rhs),
+                (XORiOp (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+multiclass SetgeImmPats<RegisterClass RC, Instruction XORiOp,
+                        Instruction SLTiOp, Instruction SLTiuOp> {
+  def : MipsPat<(setge RC:$lhs, immSExt16:$rhs),
+                (XORiOp (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs),
+                (XORiOp (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
+  defm : SetlePats<GPR32, XORi, SLT, SLTu>;
+  defm : SetgtPats<GPR32, SLT, SLTu>;
+  defm : SetgePats<GPR32, XORi, SLT, SLTu>;
+  defm : SetgeImmPats<GPR32, XORi, SLTi, SLTiu>;
+}
+
+// bswap pattern
+def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
+
+// Load halfword/word patterns.
+let AddedComplexity = 40 in {
+  def : LoadRegImmPat<LBu, i32, zextloadi8>;
+  let AdditionalPredicates = [NotInMicroMips] in {
+    def : LoadRegImmPat<LH, i32, sextloadi16>;
+    def : LoadRegImmPat<LW, i32, load>;
+  }
+}
+
+// Atomic load patterns.
+def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>;
+}
+def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>;
+
+// Atomic store patterns.
+def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>;
+def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
+include "MipsCondMov.td"
+
+include "Mips32r6InstrInfo.td"
+include "Mips64r6InstrInfo.td"
+
+//
+// Mips16
+
+include "Mips16InstrFormats.td"
+include "Mips16InstrInfo.td"
+
+// DSP
+include "MipsDSPInstrFormats.td"
+include "MipsDSPInstrInfo.td"
+
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
+// EVA
+include "MipsEVAInstrFormats.td"
+include "MipsEVAInstrInfo.td"
+
+// Micromips
+include "MicroMipsInstrFormats.td"
+include "MicroMipsInstrInfo.td"
+include "MicroMipsInstrFPU.td"
+
+// Micromips r6
+include "MicroMips32r6InstrFormats.td"
+include "MicroMips32r6InstrInfo.td"
+
+// Micromips64 r6
+include "MicroMips64r6InstrFormats.td"
+include "MicroMips64r6InstrInfo.td"
+
+// Micromips DSP
+include "MicroMipsDSPInstrFormats.td"
+include "MicroMipsDSPInstrInfo.td"
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
new file mode 100644
index 000000000000..1087d0e0140e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
@@ -0,0 +1,532 @@
+//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands a branch or jump instruction into a long branch if its
+// offset is too large to fit into its immediate field.
+//
+// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsMCNaCl.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+static cl::opt<bool> SkipLongBranch(
+  "skip-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Skip long branch pass."),
+  cl::Hidden);
+
+static cl::opt<bool> ForceLongBranch(
+  "force-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Expand all branches to long format."),
+  cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  struct MBBInfo {
+    uint64_t Size, Address;
+    bool HasLongBranch;
+    MachineInstr *Br;
+
+    MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
+  };
+
+  class MipsLongBranch : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsLongBranch(TargetMachine &tm)
+        : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
+          ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
+
+    StringRef getPassName() const override { return "Mips Long Branch"; }
+
+    bool runOnMachineFunction(MachineFunction &F) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+  private:
+    void splitMBB(MachineBasicBlock *MBB);
+    void initMBBInfo();
+    int64_t computeOffset(const MachineInstr *Br);
+    void replaceBranch(MachineBasicBlock &MBB, Iter Br, const DebugLoc &DL,
+                       MachineBasicBlock *MBBOpnd);
+    void expandToLongBranch(MBBInfo &Info);
+
+    const TargetMachine &TM;
+    MachineFunction *MF;
+    SmallVector<MBBInfo, 16> MBBInfos;
+    bool IsPIC;
+    MipsABIInfo ABI;
+    unsigned LongBranchSeqSize;
+  };
+
+  char MipsLongBranch::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+  return new MipsLongBranch(tm);
+}
+
+/// Iterate over list of Br's operands and search for a MachineBasicBlock
+/// operand.
+static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
+  for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) {
+    const MachineOperand &MO = Br.getOperand(I);
+
+    if (MO.isMBB())
+      return MO.getMBB();
+  }
+
+  llvm_unreachable("This instruction does not have an MBB operand.");
+}
+
+// Traverse the list of instructions backwards until a non-debug instruction is
+// found or it reaches E.
+static ReverseIter getNonDebugInstr(ReverseIter B, const ReverseIter &E) {
+  for (; B != E; ++B)
+    if (!B->isDebugValue())
+      return B;
+
+  return E;
+}
+
+// Split MBB if it has two direct jumps/branches.
+void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+  ReverseIter End = MBB->rend();
+  ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
+
+  // Return if MBB has no branch instructions.
+  if ((LastBr == End) ||
+      (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
+    return;
+
+  ReverseIter FirstBr = getNonDebugInstr(std::next(LastBr), End);
+
+  // MBB has only one branch instruction if FirstBr is not a branch
+  // instruction.
+  if ((FirstBr == End) ||
+      (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch()))
+    return;
+
+  assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found.");
+
+  // Create a new MBB. Move instructions in MBB to the newly created MBB.
+  MachineBasicBlock *NewMBB =
+    MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+
+  // Insert NewMBB and fix control flow.
+  MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
+  NewMBB->transferSuccessors(MBB);
+  NewMBB->removeSuccessor(Tgt, true);
+  MBB->addSuccessor(NewMBB);
+  MBB->addSuccessor(Tgt);
+  MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+
+  NewMBB->splice(NewMBB->end(), MBB, LastBr.getReverse(), MBB->end());
+}
+
+// Fill MBBInfos.
+void MipsLongBranch::initMBBInfo() {
+  // Split the MBBs if they have two branches. Each basic block should have at
+  // most one branch after this loop is executed.
+  for (auto &MBB : *MF)
+    splitMBB(&MBB);
+
+  MF->RenumberBlocks();
+  MBBInfos.clear();
+  MBBInfos.resize(MF->size());
+
+  const MipsInstrInfo *TII =
+      static_cast<const MipsInstrInfo *>(MF->getSubtarget().getInstrInfo());
+  for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+
+    // Compute size of MBB.
+    for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
+         MI != MBB->instr_end(); ++MI)
+      MBBInfos[I].Size += TII->getInstSizeInBytes(*MI);
+
+    // Search for MBB's branch instruction.
+    ReverseIter End = MBB->rend();
+    ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
+
+    if ((Br != End) && !Br->isIndirectBranch() &&
+        (Br->isConditionalBranch() || (Br->isUnconditionalBranch() && IsPIC)))
+      MBBInfos[I].Br = &*Br;
+  }
+}
+
+// Compute offset of branch in number of bytes.
+int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+  int64_t Offset = 0;
+  int ThisMBB = Br->getParent()->getNumber();
+  int TargetMBB = getTargetMBB(*Br)->getNumber();
+
+  // Compute offset of a forward branch.
+  if (ThisMBB < TargetMBB) {
+    for (int N = ThisMBB + 1; N < TargetMBB; ++N)
+      Offset += MBBInfos[N].Size;
+
+    return Offset + 4;
+  }
+
+  // Compute offset of a backward branch.
+  for (int N = ThisMBB; N >= TargetMBB; --N)
+    Offset += MBBInfos[N].Size;
+
+  return -Offset + 4;
+}
+
+// Replace Br with a branch which has the opposite condition code and a
+// MachineBasicBlock operand MBBOpnd.
+void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+                                   const DebugLoc &DL,
+                                   MachineBasicBlock *MBBOpnd) {
+  const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
+      MBB.getParent()->getSubtarget().getInstrInfo());
+  unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
+  const MCInstrDesc &NewDesc = TII->get(NewOpc);
+
+  MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
+
+  for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) {
+    MachineOperand &MO = Br->getOperand(I);
+
+    if (!MO.isReg()) {
+      assert(MO.isMBB() && "MBB operand expected.");
+      break;
+    }
+
+    MIB.addReg(MO.getReg());
+  }
+
+  MIB.addMBB(MBBOpnd);
+
+  if (Br->hasDelaySlot()) {
+    // Bundle the instruction in the delay slot to the newly created branch
+    // and erase the original branch.
+    assert(Br->isBundledWithSucc());
+    MachineBasicBlock::instr_iterator II = Br.getInstrIterator();
+    MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
+  }
+  Br->eraseFromParent();
+}
+
+// Expand branch instructions to long branches.
+// TODO: This function has to be fixed for beqz16 and bnez16, because it
+// currently assumes that all branches have 16-bit offsets, and will produce
+// wrong code if branches whose allowed offsets are [-128, -126, ..., 126]
+// are present.
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+  MachineBasicBlock::iterator Pos;
+  MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
+  DebugLoc DL = I.Br->getDebugLoc();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
+  MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+  const MipsSubtarget &Subtarget =
+      static_cast<const MipsSubtarget &>(MF->getSubtarget());
+  const MipsInstrInfo *TII =
+      static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo());
+
+  MF->insert(FallThroughMBB, LongBrMBB);
+  MBB->replaceSuccessor(TgtMBB, LongBrMBB);
+
+  if (IsPIC) {
+    MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
+    MF->insert(FallThroughMBB, BalTgtMBB);
+    LongBrMBB->addSuccessor(BalTgtMBB);
+    BalTgtMBB->addSuccessor(TgtMBB);
+
+    // We must select between the MIPS32r6/MIPS64r6 BAL (which is a normal
+    // instruction) and the pre-MIPS32r6/MIPS64r6 definition (which is an
+    // pseudo-instruction wrapping BGEZAL).
+    unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
+
+    if (!ABI.IsN64()) {
+      // $longbr:
+      //  addiu $sp, $sp, -8
+      //  sw $ra, 0($sp)
+      //  lui $at, %hi($tgt - $baltgt)
+      //  bal $baltgt
+      //  addiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
+      //  addu $at, $ra, $at
+      //  lw $ra, 0($sp)
+      //  jr $at
+      //  addiu $sp, $sp, 8
+      // $fallthrough:
+      //
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(-8);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      // LUi and ADDiu instructions create 32-bit offset of the target basic
+      // block from the target of BAL instruction.  We cannot use immediate
+      // value for this offset because it cannot be determined accurately when
+      // the program has inline assembly statements.  We therefore use the
+      // relocation expressions %hi($tgt-$baltgt) and %lo($tgt-$baltgt) which
+      // are resolved during the fixup, so the values will always be correct.
+      //
+      // Since we cannot create %hi($tgt-$baltgt) and %lo($tgt-$baltgt)
+      // expressions at this point (it is possible only at the MC layer),
+      // we replace LUi and ADDiu with pseudo instructions
+      // LONG_BRANCH_LUi and LONG_BRANCH_ADDiu, and add both basic
+      // blocks as operands to these instructions.  When lowering these pseudo
+      // instructions to LUi and ADDiu in the MC layer, we will create
+      // %hi($tgt-$baltgt) and %lo($tgt-$baltgt) expressions and add them as
+      // operands to lowered instructions.
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
+        .addMBB(TgtMBB).addMBB(BalTgtMBB);
+      MIBundleBuilder(*LongBrMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+          .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+                      .addReg(Mips::AT)
+                      .addMBB(TgtMBB)
+                      .addMBB(BalTgtMBB));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+        .addReg(Mips::RA).addReg(Mips::AT);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      // In NaCl, modifying the sp is not allowed in branch delay slot.
+      if (Subtarget.isTargetNaCl())
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+          .addReg(Mips::SP).addImm(8);
+
+      if (Subtarget.hasMips32r6())
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR))
+          .addReg(Mips::ZERO).addReg(Mips::AT);
+      else
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+
+      if (Subtarget.isTargetNaCl()) {
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::NOP));
+        // Bundle-align the target of indirect branch JR.
+        TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+      } else
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+          .addReg(Mips::SP).addImm(8);
+
+      BalTgtMBB->rbegin()->bundleWithPred();
+    } else {
+      // $longbr:
+      //  daddiu $sp, $sp, -16
+      //  sd $ra, 0($sp)
+      //  daddiu $at, $zero, %hi($tgt - $baltgt)
+      //  dsll $at, $at, 16
+      //  bal $baltgt
+      //  daddiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
+      //  daddu $at, $ra, $at
+      //  ld $ra, 0($sp)
+      //  jr64 $at
+      //  daddiu $sp, $sp, 16
+      // $fallthrough:
+      //
+
+      // We assume the branch is within-function, and that offset is within
+      // +/- 2GB.  High 32 bits will therefore always be zero.
+
+      // Note that this will work even if the offset is negative, because
+      // of the +1 modification that's added in that case.  For example, if the
+      // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is
+      //
+      // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000
+      //
+      // and the bits [47:32] are zero.  For %highest
+      //
+      // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000
+      //
+      // and the bits [63:48] are zero.
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(-16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+              Mips::AT_64).addReg(Mips::ZERO_64)
+                          .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(16);
+
+      MIBundleBuilder(*LongBrMBB, Pos)
+          .append(BuildMI(*MF, DL, TII->get(BalOp)).addMBB(BalTgtMBB))
+          .append(
+              BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), Mips::AT_64)
+                  .addReg(Mips::AT_64)
+                  .addMBB(TgtMBB, MipsII::MO_ABS_LO)
+                  .addMBB(BalTgtMBB));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+        .addReg(Mips::RA_64).addReg(Mips::AT_64);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+
+      if (Subtarget.hasMips64r6())
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JALR64))
+          .addReg(Mips::ZERO_64).addReg(Mips::AT_64);
+      else
+        BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(16);
+      BalTgtMBB->rbegin()->bundleWithPred();
+    }
+
+    assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize);
+  } else {
+    // $longbr:
+    //  j $tgt
+    //  nop
+    // $fallthrough:
+    //
+    Pos = LongBrMBB->begin();
+    LongBrMBB->addSuccessor(TgtMBB);
+    MIBundleBuilder(*LongBrMBB, Pos)
+      .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+      .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+    assert(LongBrMBB->size() == LongBranchSeqSize);
+  }
+
+  if (I.Br->isUnconditionalBranch()) {
+    // Change branch destination.
+    assert(I.Br->getDesc().getNumOperands() == 1);
+    I.Br->RemoveOperand(0);
+    I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
+  } else
+    // Change branch destination and reverse condition.
+    replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB);
+}
+
+static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
+  MachineBasicBlock &MBB = F.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL = MBB.findDebugLoc(MBB.begin());
+  BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
+    .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  MBB.removeLiveIn(Mips::V0);
+}
+
+bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+  const MipsSubtarget &STI =
+      static_cast<const MipsSubtarget &>(F.getSubtarget());
+  const MipsInstrInfo *TII =
+      static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+  LongBranchSeqSize =
+      !IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10));
+
+  if (STI.inMips16Mode() || !STI.enableLongBranchPass())
+    return false;
+  if (IsPIC && static_cast<const MipsTargetMachine &>(TM).getABI().IsO32() &&
+      F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+    emitGPDisp(F, TII);
+
+  if (SkipLongBranch)
+    return true;
+
+  MF = &F;
+  initMBBInfo();
+
+  SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
+  bool EverMadeChange = false, MadeChange = true;
+
+  while (MadeChange) {
+    MadeChange = false;
+
+    for (I = MBBInfos.begin(); I != E; ++I) {
+      // Skip if this MBB doesn't have a branch or the branch has already been
+      // converted to a long branch.
+      if (!I->Br || I->HasLongBranch)
+        continue;
+
+      int ShVal = STI.inMicroMipsMode() ? 2 : 4;
+      int64_t Offset = computeOffset(I->Br) / ShVal;
+
+      if (STI.isTargetNaCl()) {
+        // The offset calculation does not include sandboxing instructions
+        // that will be added later in the MC layer.  Since at this point we
+        // don't know the exact amount of code that "sandboxing" will add, we
+        // conservatively estimate that code will not grow more than 100%.
+        Offset *= 2;
+      }
+
+      // Check if offset fits into 16-bit immediate field of branches.
+      if (!ForceLongBranch && isInt<16>(Offset))
+        continue;
+
+      I->HasLongBranch = true;
+      I->Size += LongBranchSeqSize * 4;
+      ++LongBranches;
+      EverMadeChange = MadeChange = true;
+    }
+  }
+
+  if (!EverMadeChange)
+    return true;
+
+  // Compute basic block addresses.
+  if (IsPIC) {
+    uint64_t Address = 0;
+
+    for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+      I->Address = Address;
+  }
+
+  // Do the expansion.
+  for (I = MBBInfos.begin(); I != E; ++I)
+    if (I->HasLongBranch)
+      expandToLongBranch(*I);
+
+  MF->RenumberBlocks();
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
new file mode 100644
index 000000000000..d5bc4e537c37
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -0,0 +1,281 @@
+//===-- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsMCInstLower.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
+  : AsmPrinter(asmprinter) {}
+
+void MipsMCInstLower::Initialize(MCContext *C) {
+  Ctx = C;
+}
+
+MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                              MachineOperandType MOTy,
+                                              unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+  MipsMCExpr::MipsExprKind TargetKind = MipsMCExpr::MEK_None;
+  bool IsGpOff = false;
+  const MCSymbol *Symbol;
+
+  switch(MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Invalid target flag!");
+  case MipsII::MO_NO_FLAG:
+    break;
+  case MipsII::MO_GPREL:
+    TargetKind = MipsMCExpr::MEK_GPREL;
+    break;
+  case MipsII::MO_GOT_CALL:
+    TargetKind = MipsMCExpr::MEK_GOT_CALL;
+    break;
+  case MipsII::MO_GOT:
+    TargetKind = MipsMCExpr::MEK_GOT;
+    break;
+  case MipsII::MO_ABS_HI:
+    TargetKind = MipsMCExpr::MEK_HI;
+    break;
+  case MipsII::MO_ABS_LO:
+    TargetKind = MipsMCExpr::MEK_LO;
+    break;
+  case MipsII::MO_TLSGD:
+    TargetKind = MipsMCExpr::MEK_TLSGD;
+    break;
+  case MipsII::MO_TLSLDM:
+    TargetKind = MipsMCExpr::MEK_TLSLDM;
+    break;
+  case MipsII::MO_DTPREL_HI:
+    TargetKind = MipsMCExpr::MEK_DTPREL_HI;
+    break;
+  case MipsII::MO_DTPREL_LO:
+    TargetKind = MipsMCExpr::MEK_DTPREL_LO;
+    break;
+  case MipsII::MO_GOTTPREL:
+    TargetKind = MipsMCExpr::MEK_GOTTPREL;
+    break;
+  case MipsII::MO_TPREL_HI:
+    TargetKind = MipsMCExpr::MEK_TPREL_HI;
+    break;
+  case MipsII::MO_TPREL_LO:
+    TargetKind = MipsMCExpr::MEK_TPREL_LO;
+    break;
+  case MipsII::MO_GPOFF_HI:
+    TargetKind = MipsMCExpr::MEK_HI;
+    IsGpOff = true;
+    break;
+  case MipsII::MO_GPOFF_LO:
+    TargetKind = MipsMCExpr::MEK_LO;
+    IsGpOff = true;
+    break;
+  case MipsII::MO_GOT_DISP:
+    TargetKind = MipsMCExpr::MEK_GOT_DISP;
+    break;
+  case MipsII::MO_GOT_HI16:
+    TargetKind = MipsMCExpr::MEK_GOT_HI16;
+    break;
+  case MipsII::MO_GOT_LO16:
+    TargetKind = MipsMCExpr::MEK_GOT_LO16;
+    break;
+  case MipsII::MO_GOT_PAGE:
+    TargetKind = MipsMCExpr::MEK_GOT_PAGE;
+    break;
+  case MipsII::MO_GOT_OFST:
+    TargetKind = MipsMCExpr::MEK_GOT_OFST;
+    break;
+  case MipsII::MO_HIGHER:
+    TargetKind = MipsMCExpr::MEK_HIGHER;
+    break;
+  case MipsII::MO_HIGHEST:
+    TargetKind = MipsMCExpr::MEK_HIGHEST;
+    break;
+  case MipsII::MO_CALL_HI16:
+    TargetKind = MipsMCExpr::MEK_CALL_HI16;
+    break;
+  case MipsII::MO_CALL_LO16:
+    TargetKind = MipsMCExpr::MEK_CALL_LO16;
+    break;
+  }
+
+  switch (MOTy) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_MCSymbol:
+    Symbol = MO.getMCSymbol();
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    Offset += MO.getOffset();
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
+
+  if (Offset) {
+    // Assume offset is never negative.
+    assert(Offset > 0);
+
+    Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, *Ctx),
+                                   *Ctx);
+  }
+
+  if (IsGpOff)
+    Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
+  else if (TargetKind != MipsMCExpr::MEK_None)
+    Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
+
+  return MCOperand::createExpr(Expr);
+}
+
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
+                                        unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit()) break;
+    return MCOperand::createReg(MO.getReg());
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm() + offset);
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_MCSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(MO, MOTy, offset);
+  case MachineOperand::MO_RegisterMask:
+    break;
+ }
+
+  return MCOperand();
+}
+
+MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
+                                     MachineBasicBlock *BB2,
+                                     MipsMCExpr::MipsExprKind Kind) const {
+  const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
+  const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
+  const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
+
+  return MCOperand::createExpr(MipsMCExpr::create(Kind, Sub, *Ctx));
+}
+
+void MipsMCInstLower::
+lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(Mips::LUi);
+
+  // Lower register operand.
+  OutMI.addOperand(LowerOperand(MI->getOperand(0)));
+
+  // Create %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
+                             MI->getOperand(2).getMBB(),
+                             MipsMCExpr::MEK_HI));
+}
+
+void MipsMCInstLower::lowerLongBranchADDiu(
+    const MachineInstr *MI, MCInst &OutMI, int Opcode,
+    MipsMCExpr::MipsExprKind Kind) const {
+  OutMI.setOpcode(Opcode);
+
+  // Lower two register operands.
+  for (unsigned I = 0, E = 2; I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    OutMI.addOperand(LowerOperand(MO));
+  }
+
+  // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
+                             MI->getOperand(3).getMBB(), Kind));
+}
+
+bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
+                                      MCInst &OutMI) const {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case Mips::LONG_BRANCH_LUi:
+    lowerLongBranchLUi(MI, OutMI);
+    return true;
+  case Mips::LONG_BRANCH_ADDiu:
+    lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, MipsMCExpr::MEK_LO);
+    return true;
+  case Mips::LONG_BRANCH_DADDiu:
+    unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
+    if (TargetFlags == MipsII::MO_ABS_HI)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_HI);
+    else if (TargetFlags == MipsII::MO_ABS_LO)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, MipsMCExpr::MEK_LO);
+    else
+      report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+    return true;
+  }
+}
+
+void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  if (lowerLongBranch(MI, OutMI))
+    return;
+
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
new file mode 100644
index 000000000000..c25f90005480
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -0,0 +1,48 @@
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
+#include "MCTargetDesc/MipsMCExpr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class MipsAsmPrinter;
+
+/// MipsMCInstLower - This class is used to lower an MachineInstr into an
+//                    MCInst.
+class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  MipsAsmPrinter &AsmPrinter;
+public:
+  MipsMCInstLower(MipsAsmPrinter &asmprinter);
+  void Initialize(MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+  MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                      MipsMCExpr::MipsExprKind Kind) const;
+  void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
+  void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
+                            MipsMCExpr::MipsExprKind Kind) const;
+  bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 000000000000..7d25ea56e3d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,455 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+                PredicateControl, ASE_MSA {
+  let EncodingPredicates = [HasStdEnc];
+  let Inst{31-26} = 0b011110;
+}
+
+class MSACBranch : MSAInst {
+  let Inst{31-26} = 0b010001;
+}
+
+class MSASpecial : MSAInst {
+  let Inst{31-26} = 0b000000;
+}
+
+class MSAPseudo<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<3> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-19} = 0b1110;
+  let Inst{18-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<4> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-20} = 0b110;
+  let Inst{19-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<5> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = 0b10;
+  let Inst{20-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<6> m;
+
+  let Inst{25-23} = major;
+  let Inst{22} = 0b0;
+  let Inst{21-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-17} = major;
+  let Inst{16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rd;
+  bits<5> cs;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = cs;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> cd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = cd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> imm;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = imm;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst {
+  bits<8> u8;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-24} = major;
+  let Inst{23-16} = u8;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<10> s10;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-11} = s10;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst {
+  bits<21> addr;
+  bits<5> wd;
+
+  let Inst{25-16} = addr{9-0};
+  let Inst{15-11} = addr{20-16};
+  let Inst{10-6} = wd;
+  let Inst{5-2} = minor;
+  let Inst{1-0} = df;
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
+
+class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 000000000000..8b04fcb76920
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3946 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                      SDTCisInt<1>,
+                                      SDTCisSameAs<1, 2>,
+                                      SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                       SDTCisFP<1>,
+                                       SDTCisSameAs<1, 2>,
+                                       SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
+                                    SDTCisInt<1>, SDTCisVec<1>,
+                                    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_INSVE : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                     SDTCisVT<2, i32>, SDTCisSameAs<0, 3>,
+                                     SDTCisVT<4, i32>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+                      [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>;
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL  : SDNode<"MipsISD::ILVL",  SDT_ILV>;
+def MipsILVR  : SDNode<"MipsISD::ILVR",  SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+def MipsINSVE : SDNode<"MipsISD::INSVE", SDT_INSVE>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>;
+def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>;
+def immZExt3Ptr : ImmLeaf<iPTR, [{return isUInt<3>(Imm);}]>;
+def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>;
+
+// Operands
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
+
+// Pattern fragments
+def vextract_sext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+def vextract_sext_i64 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i64)>;
+
+def vextract_zext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+def vextract_zext_i64 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i64)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v2i64 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v2i64 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+def insve_v16i8 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v16i8 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v8i16 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v8i16 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v4i32 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v4i32 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+def insve_v2i64 : PatFrag<(ops node:$v1, node:$i1, node:$v2, node:$i2),
+    (v2i64 (MipsINSVE node:$v1, node:$i1, node:$v2, node:$i2))>;
+
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>;
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>;
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>;
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32  : vfsetcc_type<v4i32, v4f32, SETUO>;
+def vfsetun_v2f64  : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vsetcc node:$lhs, node:$rhs, CC))>;
+
+def vseteq_v16i8  : vsetcc_type<v16i8, SETEQ>;
+def vseteq_v8i16  : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32  : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64  : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8  : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16  : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32  : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64  : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8  : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16  : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32  : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64  : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
+def vsplati8  : PatFrag<(ops node:$e0),
+                        (v16i8 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0),
+                        (v8i16 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0),
+                        (v4i32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0),
+                        (v2i64 (build_vector node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0),
+                        (v4f32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0),
+                        (v2f64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati8_elt  : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
+                   SDNodeXForm xform = NOOP_SDNodeXForm>
+  : PatLeaf<frag, pred, xform> {
+  Operand OpClass = opclass;
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn,
+                          list<SDNode> roots = [],
+                          list<SDNodeProperty> props = []> :
+  ComplexPattern<ty, numops, fn, roots, props> {
+  Operand OpClass = opclass;
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1,
+                                         "selectVSplatUimm3",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+                                         "selectVSplatUimm4",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+                                         "selectVSplatUimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1,
+                                         "selectVSplatUimm8",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+                                         "selectVSplatSimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1,
+                                          "selectVSplatUimm3",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+                                          "selectVSplatUimm4",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1,
+                                          "selectVSplatUimm2",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1,
+                                          "selectVSplatUimm1",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+                                          "selectVSplatUimm6",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2",
+                                      [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits_uimm3
+    : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskL",
+                          [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm4
+    : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskL",
+                          [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm5
+    : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskL",
+                          [build_vector, bitconvert]>;
+def vsplat_maskl_bits_uimm6
+    : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskL",
+                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits_uimm3
+    : SplatComplexPattern<vsplat_uimm3, vAny, 1, "selectVSplatMaskR",
+                          [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm4
+    : SplatComplexPattern<vsplat_uimm4, vAny, 1, "selectVSplatMaskR",
+                          [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm5
+    : SplatComplexPattern<vsplat_uimm5, vAny, 1, "selectVSplatMaskR",
+                          [build_vector, bitconvert]>;
+def vsplat_maskr_bits_uimm6
+    : SplatComplexPattern<vsplat_uimm6, vAny, 1, "selectVSplatMaskR",
+                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be a ComplexPattern but we can't use them because the
+//        ISel generator requires the uses to have a name, but providing a name
+//        causes other errors ("used in pattern but not operand list")
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat(N, Imm, EltTy.getSizeInBits()) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+  APInt Imm;
+  SDNode *BV = N->getOperand(0).getNode();
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+                                               node:$wt),
+                                          (bitconvert (v4i32 immAllOnesV))))>;
+
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                          node:$wt))>;
+
+def vbset_b : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                         node:$wt))>;
+
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                  (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt),
+                        (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>;
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>;
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>;
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>;
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>;
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>;
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>;
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>;
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>;
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>;
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>;
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>;
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>;
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>;
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>;
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>;
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>;
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>;
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>;
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>;
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>;
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>;
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>;
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>;
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>;
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>;
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>;
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>;
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>;
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>;
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>;
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>;
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>;
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>;
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>;
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>;
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+class FILL_D_ENC : MSA_2R_FILL_D_FMT<0b11000000, 0b11, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+class INSERT_D_ENC : MSA_ELM_INSERT_D_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+class LD_B_ENC   : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC   : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC   : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC   : MSA_MI10_FMT<0b11, 0b1000>;
+
+class LDI_B_ENC  : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC  : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC  : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC  : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+class DLSA_ENC : SPECIAL_DLSA_FMT<0b010101>;
+
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC  : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+class SHF_B_ENC  : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC  : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC  : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+class ST_B_ENC   : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC   : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC   : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC   : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ImmOp:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+                               SplatComplexPattern Mask, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, Mask.OpClass:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  // Note that binsxi and vselect treat the condition operand the opposite
+  // way to each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$ws),
+                                               ROWS:$wd_in))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+                               SplatComplexPattern ImmOp, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
+
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+                               SplatComplexPattern ImmOp, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, ImmOp, ROWD, ROWS, itin>;
+
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         ValueType VecTy, Operand ImmOp, ImmLeaf Imm,
+                         RegisterOperand ROD, RegisterOperand ROWS,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROWS:$ws, ImmOp:$n);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), Imm:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS,
+                            Operand ImmOp, ImmLeaf Imm,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              Imm:$n))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+                           Operand ImmOp, ImmLeaf Imm, RegisterClass RCD,
+                           RegisterClass RCWS> :
+      MSAPseudo<(outs RCD:$wd), (ins RCWS:$ws, ImmOp:$n),
+                [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), Imm:$n))]> {
+  bit usesCustomInserter = 1;
+}
+
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                           RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins vsplat_simm10:$s10);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+  // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+  list<dag> Pattern = [];
+  bit hasSideEffects = 0;
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+                            SDPatternOperator OpNode, RegisterOperand ROWD,
+                            RegisterOperand ROS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+                              RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+      MSAPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+                [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+  let usesCustomInserter = 1;
+}
+
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       RegisterOperand ROWT = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             RegisterOperand ROWT = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+                                                ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, GPR32Opnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              GPR32Opnd:$rt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> :
+  MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> :
+  MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = NoItinerary;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [AT];
+}
+
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+                           RegisterOperand ROS,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, ImmOp:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROS:$rs, Imm:$n))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                             Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+                             RegisterOperand ROFS> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, ImmOp:$n, ROFS:$fs),
+                [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs, Imm:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                                  RegisterOperand ROWD, RegisterOperand ROFS,
+                                  RegisterOperand ROIdx> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, ROIdx:$n, ROFS:$fs),
+                [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        ROIdx:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              Imm:$n,
+                                              ROWS:$ws,
+                                              immz:$n2))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_VEC_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD,
+                              RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws,
+                                                ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+                [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+                     IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+                     IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+                     IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+                                       MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+                                       MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+                                       MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+                                       MSA128DOpnd>;
+
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+                                       MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+                                       MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+                                       MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+                                       MSA128DOpnd>;
+
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+                                         MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+                                         MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+                                         MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+                                         MSA128DOpnd>;
+
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+                                            MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+                                            MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+                                            MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+                                            MSA128DOpnd>;
+
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, vsplat_maskl_bits_uimm3, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, vsplat_maskl_bits_uimm4, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, vsplat_maskl_bits_uimm5, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, vsplat_maskl_bits_uimm6, MSA128DOpnd>;
+
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+                                            MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+                                            MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+                                            MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+                                            MSA128DOpnd>;
+
+class BINSRI_B_DESC
+    : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, vsplat_maskr_bits_uimm3,
+                               MSA128BOpnd>;
+class BINSRI_H_DESC
+    : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, vsplat_maskr_bits_uimm4,
+                               MSA128HOpnd>;
+class BINSRI_W_DESC
+    : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, vsplat_maskr_bits_uimm5,
+                               MSA128WOpnd>;
+class BINSRI_D_DESC
+    : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, vsplat_maskr_bits_uimm6,
+                               MSA128DOpnd>;
+
+class BMNZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmnz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMNZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+class BSEL_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bsel.v\t$wd, $ws, $wt";
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set MSA128BOpnd:$wd,
+                        (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$wt,
+                                                     MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSELI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bseli.b\t$wd, $ws, $u8";
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+                                                      vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+                   IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+                   IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+                   IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+                   IsCommutable;
+
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+                                     MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+                                     MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+                                     MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+                                     MSA128DOpnd>;
+
+class CFCMSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins MSA128CROpnd:$cs);
+  string AsmString = "cfcmsa\t$rd, $cs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+                                       vsplati8_simm5,  MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+                                       vsplati8_uimm5,  MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+                                       vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+                                       vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8,  v16i8,
+                                         uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+                                         MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+                                         uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+                                         MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+                                         uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+                                         MSA128WOpnd>;
+class COPY_S_D_DESC : MSA_COPY_DESC_BASE<"copy_s.d", vextract_sext_i64, v2i64,
+                                         uimm1_ptr, immZExt1Ptr, GPR64Opnd,
+                                         MSA128DOpnd>;
+
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8,  v16i8,
+                                         uimm4_ptr, immZExt4Ptr, GPR32Opnd,
+                                         MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+                                         uimm3_ptr, immZExt3Ptr, GPR32Opnd,
+                                         MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+                                         uimm2_ptr, immZExt2Ptr, GPR32Opnd,
+                                         MSA128WOpnd>;
+
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32,
+                                                 uimm2_ptr, immZExt2Ptr, FGR32,
+                                                 MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64,
+                                                 uimm1_ptr, immZExt1Ptr, FGR64,
+                                                 MSA128D>;
+
+class CTCMSA_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+  string AsmString = "ctcmsa\t$cd, $rs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+                    IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+                    IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+                                        MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+                                        MSA128DOpnd>;
+
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+                                       MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+                                       MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies the first operand by 2 to the power of
+// the second operand. We therefore need a pseudo-insn in order to invent the
+// 1.0 when we only need to match ISD::FEXP2.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+  class FEXP2_W_1_PSEUDO_DESC :
+      MSAPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+                [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+  class FEXP2_D_1_PSEUDO_DESC :
+      MSAPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+                [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
+
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+                                          MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+                                          MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+                                          MSA128WOpnd, GPR32Opnd>;
+class FILL_D_DESC : MSA_2R_FILL_DESC_BASE<"fill.d", v2i64, vsplati64,
+                                          MSA128DOpnd, GPR64Opnd>;
+
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+                                                    FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+                                                    FGR64>;
+
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+                                        MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+                                        MSA128DOpnd>;
+
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+                                        MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+                                        MSA128DOpnd>;
+
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+                                        MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+                                        MSA128DOpnd>;
+
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+                                       MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+                                       MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+                                       MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+                                       MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+                                       MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+                                       MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+                                      MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+                                      MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+                                       MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+                                       MSA128DOpnd>;
+
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+                                         MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+                                         MSA128DOpnd>;
+
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+                                         MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+                                         MSA128DOpnd>;
+
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+                                     MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+                                     MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+                                          MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+                                          MSA128DOpnd>;
+
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+                                          MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+                                          MSA128DOpnd>;
+
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8, uimm4,
+                                           immZExt4Ptr, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16, uimm3,
+                                           immZExt3Ptr, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32, uimm2,
+                                           immZExt2Ptr, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64, uimm1,
+                                           immZExt1Ptr, MSA128DOpnd, GPR64Opnd>;
+
+class INSERT_B_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_H_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_W_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd, GPR32Opnd>;
+class INSERT_D_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd, GPR32Opnd>;
+
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+                                                     uimm2, immZExt2Ptr,
+                                                     MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+                                                     uimm1, immZExt1Ptr,
+                                                     MSA128DOpnd, FGR64Opnd>;
+
+class INSERT_FW_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd, GPR32Opnd>;
+class INSERT_FD_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR32Opnd>;
+
+class INSERT_B_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_H_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_W_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd, GPR64Opnd>;
+class INSERT_D_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd, GPR64Opnd>;
+
+class INSERT_FW_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd, GPR64Opnd>;
+class INSERT_FD_VIDX64_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>;
+
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4,
+                                         MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3,
+                                         MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2,
+                                         MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1,
+                                         MSA128DOpnd>;
+
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd, ComplexPattern Addr = addrimm10,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd, mem_simm10>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd,
+                               mem_simm10_lsl1, addrimm10lsl1>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd,
+                               mem_simm10_lsl2, addrimm10lsl2>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd,
+                               mem_simm10_lsl3, addrimm10lsl3>;
+
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
+                    InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs RORD:$rd);
+  dag InOperandList = (ins RORD:$rs, RORD:$rt, uimm2_plus1:$sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa");
+  list<dag> Pattern = [(set RORD:$rd, (add RORD:$rt,
+                                                (shl RORD:$rs,
+                                                     immZExt2Lsa:$sa)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class LSA_DESC : LSA_DESC_BASE<"lsa", GPR32Opnd, II_LSA>;
+class DLSA_DESC : LSA_DESC_BASE<"dlsa", GPR64Opnd, II_DLSA>;
+
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+                                            MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+                                            MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+                                             MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+                                             MSA128WOpnd>;
+
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+class MOVE_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$ws);
+  string AsmString = "move.v\t$wd, $ws";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+                                            MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+                                            MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+                                             MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+                                             MSA128WOpnd>;
+
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+                                       MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+                                       MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+                                        MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+                                        MSA128WOpnd>;
+
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3,
+                                         immZExt3, MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4,
+                                         immZExt4, MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5,
+                                         immZExt5, MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6,
+                                         immZExt6, MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3,
+                                         immZExt3, MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4,
+                                         immZExt4, MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5,
+                                         immZExt5, MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6,
+                                         immZExt6, MSA128DOpnd>;
+
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b,
+                                          MSA128BOpnd, MSA128BOpnd, uimm4,
+                                          immZExt4>;
+class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h,
+                                          MSA128HOpnd, MSA128HOpnd, uimm3,
+                                          immZExt3>;
+class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w,
+                                          MSA128WOpnd, MSA128WOpnd, uimm2,
+                                          immZExt2>;
+class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d,
+                                          MSA128DOpnd, MSA128DOpnd, uimm1,
+                                          immZExt1>;
+
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+                                            MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+                                            MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+                                            MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+                                            MSA128DOpnd>;
+
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+                                              MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+                                              MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+                                              MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+                                              MSA128DOpnd>;
+
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3,
+                                         immZExt3, MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4,
+                                         immZExt4, MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5,
+                                         immZExt5, MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6,
+                                         immZExt6, MSA128DOpnd>;
+
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3,
+                                         immZExt3, MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4,
+                                         immZExt4, MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5,
+                                         immZExt5, MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6,
+                                         immZExt6, MSA128DOpnd>;
+
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd, ComplexPattern Addr = addrimm10,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd, mem_simm10>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd,
+                               mem_simm10_lsl1, addrimm10lsl1>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd,
+                               mem_simm10_lsl2, addrimm10lsl2>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd,
+                               mem_simm10_lsl3, addrimm10lsl3>;
+
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+                                       MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+                                       MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+                                       MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+                                       MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+                                       MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+                                       MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+                                       MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+                                       MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+                                         MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+                                         MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+                                         MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+                                         MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+                                         MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+                                         MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+                                         MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+                                         MSA128DOpnd>;
+
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+// Instruction defs.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+  MSAPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+            [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$wt, RO:$ws)))]>,
+  // Note that vselect and BSEL_V treat the condition operand the opposite way
+  // from each other.
+  //   (vselect cond, if_set, if_clear)
+  //   (BSEL_V cond, if_clear, if_set)
+  PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+                              MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+  let Constraints = "$wd_in = $wd";
+}
+
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+// There is a fourth operand that is not present in the encoding. Use a
+// custom decoder to get a chance to add it.
+let DecoderMethod = "DecodeINSVE_DF" in {
+  def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+  def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+  def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+  def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+}
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC;
+def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC;
+def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC;
+def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC;
+def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC;
+def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC;
+
+def INSERT_B_VIDX64_PSEUDO : INSERT_B_VIDX64_PSEUDO_DESC;
+def INSERT_H_VIDX64_PSEUDO : INSERT_H_VIDX64_PSEUDO_DESC;
+def INSERT_W_VIDX64_PSEUDO : INSERT_W_VIDX64_PSEUDO_DESC;
+def INSERT_D_VIDX64_PSEUDO : INSERT_D_VIDX64_PSEUDO_DESC;
+def INSERT_FW_VIDX64_PSEUDO : INSERT_FW_VIDX64_PSEUDO_DESC;
+def INSERT_FD_VIDX64_PSEUDO : INSERT_FD_VIDX64_PSEUDO_DESC;
+
+def LD_B: LD_B_ENC, LD_B_DESC;
+def LD_H: LD_H_ENC, LD_H_DESC;
+def LD_W: LD_W_ENC, LD_W_DESC;
+def LD_D: LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B: ST_B_ENC, ST_B_DESC;
+def ST_H: ST_H_ENC, ST_H_DESC;
+def ST_W: ST_W_ENC, ST_W_DESC;
+def ST_D: ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+  Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+def : MSAPat<(v8f16 (load addrimm10lsl1:$addr)), (LD_H addrimm10lsl1:$addr)>;
+def : MSAPat<(v4f32 (load addrimm10lsl2:$addr)), (LD_W addrimm10lsl2:$addr)>;
+def : MSAPat<(v2f64 (load addrimm10lsl3:$addr)), (LD_D addrimm10lsl3:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrimm10lsl1:$addr),
+                   (ST_H MSA128H:$ws, addrimm10lsl1:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrimm10lsl2:$addr),
+                   (ST_W MSA128W:$ws, addrimm10lsl2:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrimm10lsl3:$addr),
+                   (ST_D MSA128D:$ws, addrimm10lsl3:$addr)>;
+
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+                                RegisterOperand ROWS = ROWD,
+                                InstrItinClass itin = NoItinerary> :
+  MSAPseudo<(outs ROWD:$wd),
+            (ins ROWS:$ws),
+            [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+  InstrItinClass Itinerary = itin;
+}
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+             PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+                                           MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+             PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+                                           MSA128DOpnd:$ws)>;
+
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+                       RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+   MSAPat<(DstVT (bitconvert SrcVT:$src)),
+          (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independent because the element size doesnt change
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// elements).
+
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+                                      RegisterClass DstRC, MSAInst Insn,
+                                      RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+                                    RegisterClass DstRC, MSAInst Insn,
+                                    RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS
+           (SHF_W
+             (COPY_TO_REGCLASS
+               (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+               MSA128W), 177),
+           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df, and BZ.df
+
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+                                   RegisterClass RCWS,
+                                   InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs GPR32:$dst),
+             (ins RCWS:$ws),
+             [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+  bit usesCustomInserter = 1;
+}
+
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+                                                MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+                                                MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+                                                MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+                                               MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+                                               MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+                                               MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+                                               MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+                                               MSA128B, NoItinerary>;
+
+// Pseudoes used to implement transparent fp16 support.
+
+let Predicates = [HasMSA] in {
+ def ST_F16 : MipsPseudo<(outs), (ins MSA128F16:$ws, mem_simm10:$addr),
+                          [(store (f16 MSA128F16:$ws), (addrimm10:$addr))]> {
+   let usesCustomInserter = 1;
+ }
+
+ def LD_F16 : MipsPseudo<(outs MSA128F16:$ws), (ins mem_simm10:$addr),
+                         [(set MSA128F16:$ws, (f16 (load addrimm10:$addr)))]> {
+   let usesCustomInserter = 1;
+ }
+
+ def MSA_FP_EXTEND_W_PSEUDO : MipsPseudo<(outs FGR32Opnd:$fd),
+                                         (ins MSA128F16:$ws),
+                              [(set FGR32Opnd:$fd,
+                                    (f32 (fpextend MSA128F16:$ws)))]> {
+  let usesCustomInserter = 1;
+ }
+
+ def MSA_FP_ROUND_W_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
+                                        (ins FGR32Opnd:$fs),
+                              [(set MSA128F16:$wd,
+                                    (f16 (fpround FGR32Opnd:$fs)))]> {
+  let usesCustomInserter = 1;
+ }
+
+ def MSA_FP_EXTEND_D_PSEUDO : MipsPseudo<(outs FGR64Opnd:$fd),
+                                         (ins MSA128F16:$ws),
+                              [(set FGR64Opnd:$fd,
+                                    (f64 (fpextend MSA128F16:$ws)))]> {
+  let usesCustomInserter = 1;
+ }
+
+ def MSA_FP_ROUND_D_PSEUDO : MipsPseudo<(outs MSA128F16:$wd),
+                                        (ins FGR64Opnd:$fs),
+                              [(set MSA128F16:$wd,
+                                    (f16 (fpround FGR64Opnd:$fs)))]> {
+  let usesCustomInserter = 1;
+ }
+
+ def : MipsPat<(MipsTruncIntFP MSA128F16:$ws),
+               (TRUNC_W_D64 (MSA_FP_EXTEND_D_PSEUDO MSA128F16:$ws))>;
+
+ def : MipsPat<(MipsFPCmp MSA128F16:$ws, MSA128F16:$wt, imm:$cond),
+               (FCMP_S32 (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$ws),
+                         (MSA_FP_EXTEND_W_PSEUDO MSA128F16:$wt), imm:$cond)>,
+       ISA_MIPS1_NOT_32R6_64R6;
+}
+
+// Vector extraction with fixed index.
+//
+// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than
+// COPY_U_W, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a MIPS64 can execute MIPS32 code without getting
+// different register values.
+def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64;
+
+// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than
+// COPY_U_D, even for the zero-extended case. This is because our forward
+// compatibility strategy is to consider registers to be infinitely
+// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64
+// code without getting different register values.
+def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx),
+             (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx),
+             (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64;
+
+// Vector extraction with variable index
+def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_sext_i16 v8i16:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_sext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_sext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(i32 (vextract_zext_i8 v16i8:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_zext_i16 v8i16:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_zext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_zext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(f32 (vector_extract v4f32:$ws, i32:$idx)),
+             (f32 (EXTRACT_SUBREG (SPLAT_W v4f32:$ws,
+                                           i32:$idx),
+                                  sub_lo))>;
+def : MSAPat<(f64 (vector_extract v2f64:$ws, i32:$idx)),
+             (f64 (EXTRACT_SUBREG (SPLAT_D v2f64:$ws,
+                                           i32:$idx),
+                                  sub_64))>;
+
+// Vector extraction with variable index (N64 ABI)
+def : MSAPat<
+  (i32 (vextract_sext_i8 v16i8:$ws, i64:$idx)),
+  (SRA (COPY_TO_REGCLASS
+         (i32 (EXTRACT_SUBREG
+                (SPLAT_B v16i8:$ws,
+                  (COPY_TO_REGCLASS
+                    (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+                sub_lo)),
+         GPR32),
+       (i32 24))>;
+def : MSAPat<
+  (i32 (vextract_sext_i16 v8i16:$ws, i64:$idx)),
+  (SRA (COPY_TO_REGCLASS
+         (i32 (EXTRACT_SUBREG
+                (SPLAT_H v8i16:$ws,
+                  (COPY_TO_REGCLASS
+                    (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+                sub_lo)),
+         GPR32),
+       (i32 16))>;
+def : MSAPat<
+  (i32 (vextract_sext_i32 v4i32:$ws, i64:$idx)),
+  (COPY_TO_REGCLASS
+    (i32 (EXTRACT_SUBREG
+           (SPLAT_W v4i32:$ws,
+             (COPY_TO_REGCLASS
+               (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+           sub_lo)),
+    GPR32)>;
+def : MSAPat<
+  (i64 (vextract_sext_i64 v2i64:$ws, i64:$idx)),
+  (COPY_TO_REGCLASS
+    (i64 (EXTRACT_SUBREG
+           (SPLAT_D v2i64:$ws,
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+           sub_64)),
+    GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<
+  (i32 (vextract_zext_i8 v16i8:$ws, i64:$idx)),
+  (SRL (COPY_TO_REGCLASS
+         (i32 (EXTRACT_SUBREG
+                 (SPLAT_B v16i8:$ws,
+                   (COPY_TO_REGCLASS
+                     (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+                 sub_lo)),
+         GPR32),
+       (i32 24))>;
+def : MSAPat<
+  (i32 (vextract_zext_i16 v8i16:$ws, i64:$idx)),
+  (SRL (COPY_TO_REGCLASS
+         (i32 (EXTRACT_SUBREG
+                (SPLAT_H v8i16:$ws,
+                  (COPY_TO_REGCLASS
+                    (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+                sub_lo)),
+         GPR32),
+       (i32 16))>;
+def : MSAPat<
+  (i32 (vextract_zext_i32 v4i32:$ws, i64:$idx)),
+  (COPY_TO_REGCLASS
+    (i32 (EXTRACT_SUBREG
+           (SPLAT_W v4i32:$ws,
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+           sub_lo)),
+    GPR32)>;
+def : MSAPat<
+  (i64 (vextract_zext_i64 v2i64:$ws, i64:$idx)),
+  (COPY_TO_REGCLASS
+    (i64 (EXTRACT_SUBREG
+           (SPLAT_D v2i64:$ws,
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+           sub_64)),
+    GPR64),
+  [HasMSA, IsGP64bit]>;
+
+def : MSAPat<
+  (f32 (vector_extract v4f32:$ws, i64:$idx)),
+  (f32 (EXTRACT_SUBREG
+         (SPLAT_W v4f32:$ws,
+           (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+         sub_lo))>;
+def : MSAPat<
+  (f64 (vector_extract v2f64:$ws, i64:$idx)),
+  (f64 (EXTRACT_SUBREG
+         (SPLAT_D v2f64:$ws,
+           (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG i64:$idx, sub_32)), GPR32)),
+         sub_64))>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
new file mode 100644
index 000000000000..d0609b15341d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -0,0 +1,104 @@
+//===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
+                 cl::desc("Always use $gp as the global base register."));
+
+MipsFunctionInfo::~MipsFunctionInfo() {}
+
+bool MipsFunctionInfo::globalBaseRegSet() const {
+  return GlobalBaseReg;
+}
+
+unsigned MipsFunctionInfo::getGlobalBaseReg() {
+  // Return if it has already been initialized.
+  if (GlobalBaseReg)
+    return GlobalBaseReg;
+
+  MipsSubtarget const &STI =
+      static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+  const TargetRegisterClass *RC =
+      STI.inMips16Mode()
+          ? &Mips::CPU16RegsRegClass
+          : STI.inMicroMipsMode()
+                ? STI.hasMips64()
+                      ? &Mips::GPRMM16_64RegClass
+                      : &Mips::GPRMM16RegClass
+                : static_cast<const MipsTargetMachine &>(MF.getTarget())
+                          .getABI()
+                          .IsN64()
+                      ? &Mips::GPR64RegClass
+                      : &Mips::GPR32RegClass;
+  return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
+void MipsFunctionInfo::createEhDataRegsFI() {
+  for (int I = 0; I < 4; ++I) {
+    const TargetRegisterClass *RC =
+        static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64()
+            ? &Mips::GPR64RegClass
+            : &Mips::GPR32RegClass;
+
+    EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(RC->getSize(),
+        RC->getAlignment(), false);
+  }
+}
+
+void MipsFunctionInfo::createISRRegFI() {
+  // ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers.
+  // The current implementation only supports Mips32r2+ not Mips64rX. Status
+  // is always 32 bits, ErrorPC is 32 or 64 bits dependent on architecture,
+  // however Mips32r2+ is the supported architecture.
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+
+  for (int I = 0; I < 2; ++I)
+    ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
+        RC->getSize(), RC->getAlignment(), false);
+}
+
+bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
+  return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1]
+                        || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
+}
+
+bool MipsFunctionInfo::isISRRegFI(int FI) const {
+  return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]);
+}
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) {
+  return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES));
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) {
+  return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV));
+}
+
+int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+  if (MoveF64ViaSpillFI == -1) {
+    MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject(
+        RC->getSize(), RC->getAlignment(), false);
+  }
+  return MoveF64ViaSpillFI;
+}
+
+void MipsFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 000000000000..c9e5fddc1932
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,132 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
+
+#include "Mips16HardFloatInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunction private
+/// Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+public:
+  MipsFunctionInfo(MachineFunction &MF)
+      : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0),
+        CallsEhReturn(false), IsISR(false), SaveS2(false),
+        MoveF64ViaSpillFI(-1) {}
+
+  ~MipsFunctionInfo();
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  bool globalBaseRegSet() const;
+  unsigned getGlobalBaseReg();
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasByvalArg() const { return HasByvalArg; }
+  void setFormalArgInfo(unsigned Size, bool HasByval) {
+    IncomingArgSize = Size;
+    HasByvalArg = HasByval;
+  }
+
+  unsigned getIncomingArgSize() const { return IncomingArgSize; }
+
+  bool callsEhReturn() const { return CallsEhReturn; }
+  void setCallsEhReturn() { CallsEhReturn = true; }
+
+  void createEhDataRegsFI();
+  int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
+  bool isEhDataRegFI(int FI) const;
+
+  /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue
+  /// object representing a GOT entry for an external function.
+  MachinePointerInfo callPtrInfo(const char *ES);
+
+  // Functions with the "interrupt" attribute require special prologues,
+  // epilogues and additional spill slots.
+  bool isISR() const { return IsISR; }
+  void setISR() { IsISR = true; }
+  void createISRRegFI();
+  int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; }
+  bool isISRRegFI(int FI) const;
+
+  /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object
+  /// representing a GOT entry for a global function.
+  MachinePointerInfo callPtrInfo(const GlobalValue *GV);
+
+  void setSaveS2() { SaveS2 = true; }
+  bool hasSaveS2() const { return SaveS2; }
+
+  int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
+
+  std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+  StubsNeeded;
+
+private:
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers.
+  int EhDataRegFI[4];
+
+  /// ISR - Whether the function is an Interrupt Service Routine.
+  bool IsISR;
+
+  /// Frame objects for spilling C0_STATUS, C0_EPC
+  int ISRDataRegFI[2];
+
+  // saveS2
+  bool SaveS2;
+
+  /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
+  /// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
+  int MoveF64ViaSpillFI;
+};
+
+} // end of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
new file mode 100644
index 000000000000..cf85eb3f2416
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+// Instruction Selector Subtarget Control
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// This file defines a pass used to change the subtarget for the
+// Mips Instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+namespace {
+  class MipsModuleDAGToDAGISel : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_)
+      : MachineFunctionPass(ID), TM(TM_) {}
+
+    // Pass Name
+    StringRef getPassName() const override {
+      return "MIPS DAG->DAG Pattern Instruction Selection";
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  protected:
+    MipsTargetMachine &TM;
+  };
+
+  char MipsModuleDAGToDAGISel::ID = 0;
+}
+
+bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+  TM.resetSubtarget(&MF);
+  return false;
+}
+
+llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) {
+  return new MipsModuleDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
new file mode 100644
index 000000000000..f33857fe628f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -0,0 +1,301 @@
+//===--------- MipsOptimizePICCall.cpp - Optimize PIC Calls ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates unnecessary instructions that set up $gp and replace
+// instructions that load target function addresses with copy instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "optimize-mips-pic-call"
+
+static cl::opt<bool> LoadTargetFromGOT("mips-load-target-from-got",
+                                       cl::init(true),
+                                       cl::desc("Load target address from GOT"),
+                                       cl::Hidden);
+
+static cl::opt<bool> EraseGPOpnd("mips-erase-gp-opnd",
+                                 cl::init(true), cl::desc("Erase GP Operand"),
+                                 cl::Hidden);
+
+namespace {
+typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+typedef std::pair<unsigned, unsigned> CntRegP;
+typedef RecyclingAllocator<BumpPtrAllocator,
+                           ScopedHashTableVal<ValueType, CntRegP> >
+AllocatorTy;
+typedef ScopedHashTable<ValueType, CntRegP, DenseMapInfo<ValueType>,
+                        AllocatorTy> ScopedHTType;
+
+class MBBInfo {
+public:
+  MBBInfo(MachineDomTreeNode *N);
+  const MachineDomTreeNode *getNode() const;
+  bool isVisited() const;
+  void preVisit(ScopedHTType &ScopedHT);
+  void postVisit();
+
+private:
+  MachineDomTreeNode *Node;
+  ScopedHTType::ScopeTy *HTScope;
+};
+
+class OptimizePICCall : public MachineFunctionPass {
+public:
+  OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Mips OptimizePICCall"; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  /// \brief Visit MBB.
+  bool visitNode(MBBInfo &MBBI);
+
+  /// \brief Test if MI jumps to a function via a register.
+  ///
+  /// Also, return the virtual register containing the target function's address
+  /// and the underlying object in Reg and Val respectively, if the function's
+  /// address can be resolved lazily.
+  bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                         ValueType &Val) const;
+
+  /// \brief Return the number of instructions that dominate the current
+  /// instruction and load the function address from object Entry.
+  unsigned getCount(ValueType Entry);
+
+  /// \brief Return the destination virtual register of the last instruction
+  /// that loads from object Entry.
+  unsigned getReg(ValueType Entry);
+
+  /// \brief Update ScopedHT.
+  void incCntAndSetReg(ValueType Entry, unsigned Reg);
+
+  ScopedHTType ScopedHT;
+  static char ID;
+};
+
+char OptimizePICCall::ID = 0;
+} // end of anonymous namespace
+
+/// Return the first MachineOperand of MI if it is a used virtual register.
+static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
+  if (MI.getNumOperands() == 0)
+    return nullptr;
+
+  MachineOperand &MO = MI.getOperand(0);
+
+  if (!MO.isReg() || !MO.isUse() ||
+      !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    return nullptr;
+
+  return &MO;
+}
+
+/// Return type of register Reg.
+static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) {
+  const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+  assert(RC->vt_end() - RC->vt_begin() == 1);
+  return *RC->vt_begin();
+}
+
+/// Do the following transformation:
+///
+/// jalr $vreg
+/// =>
+/// copy $t9, $vreg
+/// jalr $t9
+static void setCallTargetReg(MachineBasicBlock *MBB,
+                             MachineBasicBlock::iterator I) {
+  MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  unsigned SrcReg = I->getOperand(0).getReg();
+  unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64;
+  BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)
+      .addReg(SrcReg);
+  I->getOperand(0).setReg(DstReg);
+}
+
+/// Search MI's operands for register GP and erase it.
+static void eraseGPOpnd(MachineInstr &MI) {
+  if (!EraseGPOpnd)
+    return;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MVT::SimpleValueType Ty = getRegTy(MI.getOperand(0).getReg(), MF);
+  unsigned Reg = Ty == MVT::i32 ? Mips::GP : Mips::GP_64;
+
+  for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+    MachineOperand &MO = MI.getOperand(I);
+    if (MO.isReg() && MO.getReg() == Reg) {
+      MI.RemoveOperand(I);
+      return;
+    }
+  }
+
+  llvm_unreachable(nullptr);
+}
+
+MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(nullptr) {}
+
+const MachineDomTreeNode *MBBInfo::getNode() const { return Node; }
+
+bool MBBInfo::isVisited() const { return HTScope; }
+
+void MBBInfo::preVisit(ScopedHTType &ScopedHT) {
+  HTScope = new ScopedHTType::ScopeTy(ScopedHT);
+}
+
+void MBBInfo::postVisit() {
+  delete HTScope;
+}
+
+// OptimizePICCall methods.
+bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
+  if (static_cast<const MipsSubtarget &>(F.getSubtarget()).inMips16Mode())
+    return false;
+
+  // Do a pre-order traversal of the dominator tree.
+  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  bool Changed = false;
+
+  SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode()));
+
+  while (!WorkList.empty()) {
+    MBBInfo &MBBI = WorkList.back();
+
+    // If this MBB has already been visited, destroy the scope for the MBB and
+    // pop it from the work list.
+    if (MBBI.isVisited()) {
+      MBBI.postVisit();
+      WorkList.pop_back();
+      continue;
+    }
+
+    // Visit the MBB and add its children to the work list.
+    MBBI.preVisit(ScopedHT);
+    Changed |= visitNode(MBBI);
+    const MachineDomTreeNode *Node = MBBI.getNode();
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    WorkList.append(Children.begin(), Children.end());
+  }
+
+  return Changed;
+}
+
+bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
+  bool Changed = false;
+  MachineBasicBlock *MBB = MBBI.getNode()->getBlock();
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    unsigned Reg;
+    ValueType Entry;
+
+    // Skip instructions that are not call instructions via registers.
+    if (!isCallViaRegister(*I, Reg, Entry))
+      continue;
+
+    Changed = true;
+    unsigned N = getCount(Entry);
+
+    if (N != 0) {
+      // If a function has been called more than twice, we do not have to emit a
+      // load instruction to get the function address from the GOT, but can
+      // instead reuse the address that has been loaded before.
+      if (N >= 2 && !LoadTargetFromGOT)
+        getCallTargetRegOpnd(*I)->setReg(getReg(Entry));
+
+      // Erase the $gp operand if this isn't the first time a function has
+      // been called. $gp needs to be set up only if the function call can go
+      // through a lazy binding stub.
+      eraseGPOpnd(*I);
+    }
+
+    if (Entry)
+      incCntAndSetReg(Entry, Reg);
+
+    setCallTargetReg(MBB, I);
+  }
+
+  return Changed;
+}
+
+bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
+                                        ValueType &Val) const {
+  if (!MI.isCall())
+    return false;
+
+  MachineOperand *MO = getCallTargetRegOpnd(MI);
+
+  // Return if MI is not a function call via a register.
+  if (!MO)
+    return false;
+
+  // Get the instruction that loads the function address from the GOT.
+  Reg = MO->getReg();
+  Val = (Value*)nullptr;
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = MRI.getVRegDef(Reg);
+
+  assert(DefMI);
+
+  // See if DefMI is an instruction that loads from a GOT entry that holds the
+  // address of a lazy binding stub.
+  if (!DefMI->mayLoad() || DefMI->getNumOperands() < 3)
+    return true;
+
+  unsigned Flags = DefMI->getOperand(2).getTargetFlags();
+
+  if (Flags != MipsII::MO_GOT_CALL && Flags != MipsII::MO_CALL_LO16)
+    return true;
+
+  // Return the underlying object for the GOT entry in Val.
+  assert(DefMI->hasOneMemOperand());
+  Val = (*DefMI->memoperands_begin())->getValue();
+  if (!Val)
+    Val = (*DefMI->memoperands_begin())->getPseudoValue();
+  return true;
+}
+
+unsigned OptimizePICCall::getCount(ValueType Entry) {
+  return ScopedHT.lookup(Entry).first;
+}
+
+unsigned OptimizePICCall::getReg(ValueType Entry) {
+  unsigned Reg = ScopedHT.lookup(Entry).second;
+  assert(Reg);
+  return Reg;
+}
+
+void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
+  CntRegP P = ScopedHT.lookup(Entry);
+  ScopedHT.insert(Entry, std::make_pair(P.first + 1, Reg));
+}
+
+/// Return an OptimizeCall object.
+FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
+  return new OptimizePICCall(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
new file mode 100644
index 000000000000..23f0b7070d62
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
@@ -0,0 +1,78 @@
+//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MipsOptionRecord - Abstraction for storing arbitrary information in
+// ELF files. Arbitrary information (e.g. register usage) can be stored in Mips
+// specific ELF sections like .Mips.options. Specific records should subclass
+// MipsOptionRecord and provide an implementation to EmitMipsOptionRecord which
+// basically just dumps the information into an ELF section. More information
+// about .Mips.option can be found in the SysV ABI and the 64-bit ELF Object
+// specification.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+#define LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+class MipsELFStreamer;
+class MCSubtargetInfo;
+
+class MipsOptionRecord {
+public:
+  virtual ~MipsOptionRecord(){};
+  virtual void EmitMipsOptionRecord() = 0;
+};
+
+class MipsRegInfoRecord : public MipsOptionRecord {
+public:
+  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context)
+      : Streamer(S), Context(Context) {
+    ri_gprmask = 0;
+    ri_cprmask[0] = ri_cprmask[1] = ri_cprmask[2] = ri_cprmask[3] = 0;
+    ri_gp_value = 0;
+
+    const MCRegisterInfo *TRI = Context.getRegisterInfo();
+    GPR32RegClass = &(TRI->getRegClass(Mips::GPR32RegClassID));
+    GPR64RegClass = &(TRI->getRegClass(Mips::GPR64RegClassID));
+    FGR32RegClass = &(TRI->getRegClass(Mips::FGR32RegClassID));
+    FGR64RegClass = &(TRI->getRegClass(Mips::FGR64RegClassID));
+    AFGR64RegClass = &(TRI->getRegClass(Mips::AFGR64RegClassID));
+    MSA128BRegClass = &(TRI->getRegClass(Mips::MSA128BRegClassID));
+    COP0RegClass = &(TRI->getRegClass(Mips::COP0RegClassID));
+    COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
+    COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
+  }
+  ~MipsRegInfoRecord() override {}
+
+  void EmitMipsOptionRecord() override;
+  void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
+
+private:
+  MipsELFStreamer *Streamer;
+  MCContext &Context;
+  const MCRegisterClass *GPR32RegClass;
+  const MCRegisterClass *GPR64RegClass;
+  const MCRegisterClass *FGR32RegClass;
+  const MCRegisterClass *FGR64RegClass;
+  const MCRegisterClass *AFGR64RegClass;
+  const MCRegisterClass *MSA128BRegClass;
+  const MCRegisterClass *COP0RegClass;
+  const MCRegisterClass *COP2RegClass;
+  const MCRegisterClass *COP3RegClass;
+  uint32_t ri_gprmask;
+  uint32_t ri_cprmask[4];
+  int64_t ri_gp_value;
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
new file mode 100644
index 000000000000..51ac5620f585
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
@@ -0,0 +1,160 @@
+//===---- MipsOs16.cpp for Mips Option -Os16                       --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an optimization phase for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Instructions.h"
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-os16"
+
+static cl::opt<std::string> Mips32FunctionMask(
+  "mips32-function-mask",
+  cl::init(""),
+  cl::desc("Force function to be mips32"),
+  cl::Hidden);
+
+namespace {
+  class MipsOs16 : public ModulePass {
+  public:
+    static char ID;
+
+    MipsOs16() : ModulePass(ID) {}
+
+    StringRef getPassName() const override { return "MIPS Os16 Optimization"; }
+
+    bool runOnModule(Module &M) override;
+  };
+
+  char MipsOs16::ID = 0;
+}
+
+// Figure out if we need float point based on the function signature.
+// We need to move variables in and/or out of floating point
+// registers because of the ABI
+//
+static  bool needsFPFromSig(Function &F) {
+  Type* RetType = F.getReturnType();
+  switch (RetType->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    return true;
+  default:
+    ;
+  }
+  if (F.arg_size() >=1) {
+    Argument &Arg = F.getArgumentList().front();
+    switch (Arg.getType()->getTypeID()) {
+    case Type::FloatTyID:
+    case Type::DoubleTyID:
+      return true;
+    default:
+      ;
+    }
+  }
+  return false;
+}
+
+// Figure out if the function will need floating point operations
+//
+static bool needsFP(Function &F) {
+  if (needsFPFromSig(F))
+    return true;
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      const Instruction &Inst = *I;
+      switch (Inst.getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FRem:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FCmp:
+        return true;
+      default:
+        ;
+      }
+      if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+        DEBUG(dbgs() << "Working on call" << "\n");
+        Function &F_ =  *CI->getCalledFunction();
+        if (needsFPFromSig(F_))
+          return true;
+      }
+    }
+  return false;
+}
+
+
+bool MipsOs16::runOnModule(Module &M) {
+  bool usingMask = Mips32FunctionMask.length() > 0;
+  bool doneUsingMask = false; // this will make it stop repeating
+
+  DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+  if (usingMask)
+    DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+
+  unsigned int functionIndex = 0;
+  bool modified = false;
+
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+
+    DEBUG(dbgs() << "Working on " << F.getName() << "\n");
+    if (usingMask) {
+      if (!doneUsingMask) {
+        if (functionIndex == Mips32FunctionMask.length())
+          functionIndex = 0;
+        switch (Mips32FunctionMask[functionIndex]) {
+        case '1':
+          DEBUG(dbgs() << "mask forced mips32: " << F.getName() << "\n");
+          F.addFnAttr("nomips16");
+          break;
+        case '.':
+          doneUsingMask = true;
+          break;
+        default:
+          break;
+        }
+        functionIndex++;
+      }
+    }
+    else {
+      if (needsFP(F)) {
+        DEBUG(dbgs() << "os16 forced mips32: " << F.getName() << "\n");
+        F.addFnAttr("nomips16");
+      }
+      else {
+        DEBUG(dbgs() << "os16 forced mips16: " << F.getName() << "\n");
+        F.addFnAttr("mips16");
+      }
+    }
+  }
+
+  return modified;
+}
+
+ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) {
+  return new MipsOs16;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 000000000000..65be350f259d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,340 @@
+//===-- MipsRegisterInfo.cpp - MIPS Register Information -== --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsRegisterInfo.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MipsGenRegisterInfo.inc"
+
+MipsRegisterInfo::MipsRegisterInfo() : MipsGenRegisterInfo(Mips::RA) {}
+
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                     unsigned Kind) const {
+  MipsABIInfo ABI = MF.getSubtarget<MipsSubtarget>().getABI();
+  MipsPtrClass PtrClassKind = static_cast<MipsPtrClass>(Kind);
+
+  switch (PtrClassKind) {
+  case MipsPtrClass::Default:
+    return ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  case MipsPtrClass::GPR16MM:
+    return ABI.ArePtrs64bit() ? &Mips::GPRMM16_64RegClass
+                              : &Mips::GPRMM16RegClass;
+  case MipsPtrClass::StackPointer:
+    return ABI.ArePtrs64bit() ? &Mips::SP64RegClass : &Mips::SP32RegClass;
+  case MipsPtrClass::GlobalPointer:                              
+    return ABI.ArePtrs64bit() ? &Mips::GP64RegClass : &Mips::GP32RegClass;
+  }
+
+  llvm_unreachable("Unknown pointer kind");
+}
+
+unsigned
+MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                      MachineFunction &MF) const {
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case Mips::GPR32RegClassID:
+  case Mips::GPR64RegClassID:
+  case Mips::DSPRRegClassID: {
+    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+    return 28 - TFI->hasFP(MF);
+  }
+  case Mips::FGR32RegClassID:
+    return 32;
+  case Mips::AFGR64RegClassID:
+    return 16;
+  case Mips::FGR64RegClassID:
+    return 32;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// Mips Callee Saved Registers
+const MCPhysReg *
+MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>();
+  const Function *F = MF->getFunction();
+  if (F->hasFnAttribute("interrupt")) {
+    if (Subtarget.hasMips64())
+      return Subtarget.hasMips64r6() ? CSR_Interrupt_64R6_SaveList
+                                     : CSR_Interrupt_64_SaveList;
+    else
+      return Subtarget.hasMips32r6() ? CSR_Interrupt_32R6_SaveList
+                                     : CSR_Interrupt_32_SaveList;
+  }
+
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_SaveList;
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_SaveList;
+
+  if (Subtarget.isABI_N32())
+    return CSR_N32_SaveList;
+
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_SaveList;
+
+  if (Subtarget.isFPXX())
+    return CSR_O32_FPXX_SaveList;
+
+  return CSR_O32_SaveList;
+}
+
+const uint32_t *
+MipsRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const {
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_RegMask;
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_RegMask;
+
+  if (Subtarget.isABI_N32())
+    return CSR_N32_RegMask;
+
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_RegMask;
+
+  if (Subtarget.isFPXX())
+    return CSR_O32_FPXX_RegMask;
+
+  return CSR_O32_RegMask;
+}
+
+const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
+  return CSR_Mips16RetHelper_RegMask;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  static const MCPhysReg ReservedGPR32[] = {
+    Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
+  };
+
+  static const MCPhysReg ReservedGPR64[] = {
+    Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
+  };
+
+  BitVector Reserved(getNumRegs());
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  typedef TargetRegisterClass::const_iterator RegIter;
+
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
+    Reserved.set(ReservedGPR32[I]);
+
+  // Reserve registers for the NaCl sandbox.
+  if (Subtarget.isTargetNaCl()) {
+    Reserved.set(Mips::T6);   // Reserved for control flow mask.
+    Reserved.set(Mips::T7);   // Reserved for memory access mask.
+    Reserved.set(Mips::T8);   // Reserved for thread pointer.
+  }
+
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
+    Reserved.set(ReservedGPR64[I]);
+
+  // For mno-abicalls, GP is a program invariant!
+  if (!Subtarget.isABICalls()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
+  if (Subtarget.isFP64bit()) {
+    // Reserve all registers in AFGR64.
+    for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+         EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  } else {
+    // Reserve all registers in FGR64.
+    for (RegIter Reg = Mips::FGR64RegClass.begin(),
+         EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  }
+  // Reserve FP if this function should have a dedicated frame pointer register.
+  if (Subtarget.getFrameLowering()->hasFP(MF)) {
+    if (Subtarget.inMips16Mode())
+      Reserved.set(Mips::S0);
+    else {
+      Reserved.set(Mips::FP);
+      Reserved.set(Mips::FP_64);
+
+      // Reserve the base register if we need to both realign the stack and
+      // allocate variable-sized objects at runtime. This should test the
+      // same conditions as MipsFrameLowering::hasBP().
+      if (needsStackRealignment(MF) &&
+          MF.getFrameInfo().hasVarSizedObjects()) {
+        Reserved.set(Mips::S7);
+        Reserved.set(Mips::S7_64);
+      }
+    }
+  }
+
+  // Reserve hardware registers.
+  Reserved.set(Mips::HWR29);
+
+  // Reserve DSP control register.
+  Reserved.set(Mips::DSPPos);
+  Reserved.set(Mips::DSPSCount);
+  Reserved.set(Mips::DSPCarry);
+  Reserved.set(Mips::DSPEFI);
+  Reserved.set(Mips::DSPOutFlag);
+
+  // Reserve MSA control registers.
+  Reserved.set(Mips::MSAIR);
+  Reserved.set(Mips::MSACSR);
+  Reserved.set(Mips::MSAAccess);
+  Reserved.set(Mips::MSASave);
+  Reserved.set(Mips::MSAModify);
+  Reserved.set(Mips::MSARequest);
+  Reserved.set(Mips::MSAMap);
+  Reserved.set(Mips::MSAUnmap);
+
+  // Reserve RA if in mips16 mode.
+  if (Subtarget.inMips16Mode()) {
+    const MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    Reserved.set(Mips::RA);
+    Reserved.set(Mips::RA_64);
+    Reserved.set(Mips::T0);
+    Reserved.set(Mips::T1);
+    if (MF.getFunction()->hasFnAttribute("saveS2") || MipsFI->hasSaveS2())
+      Reserved.set(Mips::S2);
+  }
+
+  // Reserve GP if small section is used.
+  if (Subtarget.useSmallSection()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
+  if (Subtarget.isABI_O32() && !Subtarget.useOddSPReg()) {
+    for (const auto &Reg : Mips::OddSPRegClass)
+      Reserved.set(Reg);
+  }
+
+  return Reserved;
+}
+
+bool
+MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+// FrameIndex represent objects inside a abstract stack.
+// We must replace FrameIndex with an stack/frame pointer
+// direct reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    unsigned FIOperandNum, RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+        errs() << "<--------->\n" << MI);
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  uint64_t stackSize = MF.getFrameInfo().getStackSize();
+  int64_t spOffset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+
+  DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+               << "spOffset   : " << spOffset << "\n"
+               << "stackSize  : " << stackSize << "\n");
+
+  eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(const MachineFunction &MF) const {
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  bool IsN64 =
+      static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64();
+
+  if (Subtarget.inMips16Mode())
+    return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
+  else
+    return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
+                            (IsN64 ? Mips::SP_64 : Mips::SP);
+}
+
+bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  // Avoid realigning functions that explicitly do not want to be realigned.
+  // Normally, we should report an error when a function should be dynamically
+  // realigned but also has the attribute no-realign-stack. Unfortunately,
+  // with this attribute, MachineFrameInfo clamps each new object's alignment
+  // to that of the stack's alignment as specified by the ABI. As a result,
+  // the information of whether we have objects with larger alignment
+  // requirement than the stack's alignment is already lost at this point.
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64;
+  unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64;
+
+  // Support dynamic stack realignment only for targets with standard encoding.
+  if (!Subtarget.hasStandardEncoding())
+    return false;
+
+  // We can't perform dynamic stack realignment if we can't reserve the
+  // frame pointer register.
+  if (!MF.getRegInfo().canReserveReg(FP))
+    return false;
+
+  // We can realign the stack if we know the maximum call frame size and we
+  // don't have variable sized objects.
+  if (Subtarget.getFrameLowering()->hasReservedCallFrame(MF))
+    return true;
+
+  // We have to reserve the base pointer register in the presence of variable
+  // sized objects.
+  return MF.getRegInfo().canReserveReg(BP);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 000000000000..32f835e83108
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,82 @@
+//===-- MipsRegisterInfo.h - Mips Register Information Impl -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MipsGenRegisterInfo.inc"
+
+namespace llvm {
+class MipsRegisterInfo : public MipsGenRegisterInfo {
+public:
+  enum class MipsPtrClass {
+    /// The default register class for integer values.
+    Default = 0,
+    /// The subset of registers permitted in certain microMIPS instructions
+    /// such as lw16.
+    GPR16MM = 1,
+    /// The stack pointer only.
+    StackPointer = 2,
+    /// The global pointer only.
+    GlobalPointer = 3,
+  };
+
+  MipsRegisterInfo();
+
+  /// Get PIC indirect call register
+  static unsigned getPICCallReg();
+
+  /// Code Generation virtual methods...
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+  static const uint32_t *getMips16RetHelperMask();
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Stack realignment queries.
+  bool canRealignStack(const MachineFunction &MF) const override;
+
+  /// Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  /// \brief Return GPR register class.
+  virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
+
+private:
+  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                           int FrameIndex, uint64_t StackSize,
+                           int64_t SPOffset) const = 0;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 000000000000..8c82239ebbd3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,673 @@
+//===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_32     : SubRegIndex<32>;
+def sub_64     : SubRegIndex<64>;
+def sub_lo     : SubRegIndex<32>;
+def sub_hi     : SubRegIndex<32, 32>;
+def sub_dsp16_19 : SubRegIndex<4, 16>;
+def sub_dsp20    : SubRegIndex<1, 20>;
+def sub_dsp21    : SubRegIndex<1, 21>;
+def sub_dsp22    : SubRegIndex<1, 22>;
+def sub_dsp23    : SubRegIndex<1, 23>;
+}
+
+class Unallocatable {
+  bit isAllocatable = 0;
+}
+
+// We have banks of 32 registers each.
+class MipsReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_32];
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+class AFPR64<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_64];
+}
+
+// Accumulator Registers
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips Hardware Registers
+class HWR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+  // General Purpose Registers
+  def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
+  def AT   : MipsGPRReg< 1, "1">,    DwarfRegNum<[1]>;
+  def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
+  def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>;
+  def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
+  def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
+  def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
+  def T0   : MipsGPRReg< 8, "8">,    DwarfRegNum<[8]>;
+  def T1   : MipsGPRReg< 9, "9">,    DwarfRegNum<[9]>;
+  def T2   : MipsGPRReg< 10, "10">,  DwarfRegNum<[10]>;
+  def T3   : MipsGPRReg< 11, "11">,  DwarfRegNum<[11]>;
+  def T4   : MipsGPRReg< 12, "12">,  DwarfRegNum<[12]>;
+  def T5   : MipsGPRReg< 13, "13">,  DwarfRegNum<[13]>;
+  def T6   : MipsGPRReg< 14, "14">,  DwarfRegNum<[14]>;
+  def T7   : MipsGPRReg< 15, "15">,  DwarfRegNum<[15]>;
+  def S0   : MipsGPRReg< 16, "16">,  DwarfRegNum<[16]>;
+  def S1   : MipsGPRReg< 17, "17">,  DwarfRegNum<[17]>;
+  def S2   : MipsGPRReg< 18, "18">,  DwarfRegNum<[18]>;
+  def S3   : MipsGPRReg< 19, "19">,  DwarfRegNum<[19]>;
+  def S4   : MipsGPRReg< 20, "20">,  DwarfRegNum<[20]>;
+  def S5   : MipsGPRReg< 21, "21">,  DwarfRegNum<[21]>;
+  def S6   : MipsGPRReg< 22, "22">,  DwarfRegNum<[22]>;
+  def S7   : MipsGPRReg< 23, "23">,  DwarfRegNum<[23]>;
+  def T8   : MipsGPRReg< 24, "24">,  DwarfRegNum<[24]>;
+  def T9   : MipsGPRReg< 25, "25">,  DwarfRegNum<[25]>;
+  def K0   : MipsGPRReg< 26, "26">,  DwarfRegNum<[26]>;
+  def K1   : MipsGPRReg< 27, "27">,  DwarfRegNum<[27]>;
+  def GP   : MipsGPRReg< 28, "gp">,  DwarfRegNum<[28]>;
+  def SP   : MipsGPRReg< 29, "sp">,  DwarfRegNum<[29]>;
+  def FP   : MipsGPRReg< 30, "fp">,  DwarfRegNum<[30]>;
+  def RA   : MipsGPRReg< 31, "ra">,  DwarfRegNum<[31]>;
+
+  // General Purpose 64-bit Registers
+  def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+  def AT_64   : Mips64GPRReg< 1, "1",    [AT]>, DwarfRegNum<[1]>;
+  def V0_64   : Mips64GPRReg< 2, "2",    [V0]>, DwarfRegNum<[2]>;
+  def V1_64   : Mips64GPRReg< 3, "3",    [V1]>, DwarfRegNum<[3]>;
+  def A0_64   : Mips64GPRReg< 4, "4",    [A0]>, DwarfRegNum<[4]>;
+  def A1_64   : Mips64GPRReg< 5, "5",    [A1]>, DwarfRegNum<[5]>;
+  def A2_64   : Mips64GPRReg< 6, "6",    [A2]>, DwarfRegNum<[6]>;
+  def A3_64   : Mips64GPRReg< 7, "7",    [A3]>, DwarfRegNum<[7]>;
+  def T0_64   : Mips64GPRReg< 8, "8",    [T0]>, DwarfRegNum<[8]>;
+  def T1_64   : Mips64GPRReg< 9, "9",    [T1]>, DwarfRegNum<[9]>;
+  def T2_64   : Mips64GPRReg< 10, "10",  [T2]>, DwarfRegNum<[10]>;
+  def T3_64   : Mips64GPRReg< 11, "11",  [T3]>, DwarfRegNum<[11]>;
+  def T4_64   : Mips64GPRReg< 12, "12",  [T4]>, DwarfRegNum<[12]>;
+  def T5_64   : Mips64GPRReg< 13, "13",  [T5]>, DwarfRegNum<[13]>;
+  def T6_64   : Mips64GPRReg< 14, "14",  [T6]>, DwarfRegNum<[14]>;
+  def T7_64   : Mips64GPRReg< 15, "15",  [T7]>, DwarfRegNum<[15]>;
+  def S0_64   : Mips64GPRReg< 16, "16",  [S0]>, DwarfRegNum<[16]>;
+  def S1_64   : Mips64GPRReg< 17, "17",  [S1]>, DwarfRegNum<[17]>;
+  def S2_64   : Mips64GPRReg< 18, "18",  [S2]>, DwarfRegNum<[18]>;
+  def S3_64   : Mips64GPRReg< 19, "19",  [S3]>, DwarfRegNum<[19]>;
+  def S4_64   : Mips64GPRReg< 20, "20",  [S4]>, DwarfRegNum<[20]>;
+  def S5_64   : Mips64GPRReg< 21, "21",  [S5]>, DwarfRegNum<[21]>;
+  def S6_64   : Mips64GPRReg< 22, "22",  [S6]>, DwarfRegNum<[22]>;
+  def S7_64   : Mips64GPRReg< 23, "23",  [S7]>, DwarfRegNum<[23]>;
+  def T8_64   : Mips64GPRReg< 24, "24",  [T8]>, DwarfRegNum<[24]>;
+  def T9_64   : Mips64GPRReg< 25, "25",  [T9]>, DwarfRegNum<[25]>;
+  def K0_64   : Mips64GPRReg< 26, "26",  [K0]>, DwarfRegNum<[26]>;
+  def K1_64   : Mips64GPRReg< 27, "27",  [K1]>, DwarfRegNum<[27]>;
+  def GP_64   : Mips64GPRReg< 28, "gp",  [GP]>, DwarfRegNum<[28]>;
+  def SP_64   : Mips64GPRReg< 29, "sp",  [SP]>, DwarfRegNum<[29]>;
+  def FP_64   : Mips64GPRReg< 30, "fp",  [FP]>, DwarfRegNum<[30]>;
+  def RA_64   : Mips64GPRReg< 31, "ra",  [RA]>, DwarfRegNum<[31]>;
+
+  /// Mips Single point precision FPU Registers
+  foreach I = 0-31 in
+  def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+  // Higher half of 64-bit FP registers.
+  foreach I = 0-31 in
+  def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+  /// Mips Double point precision FPU Registers (aliased
+  /// with the single precision to hold 64 bit values)
+  foreach I = 0-15 in
+  def D#I : AFPR<!shl(I, 1), "f"#!shl(I, 1),
+                 [!cast<FPR>("F"#!shl(I, 1)),
+                  !cast<FPR>("F"#!add(!shl(I, 1), 1))]>;
+
+  /// Mips Double point precision FPU Registers in MFP64 mode.
+  foreach I = 0-31 in
+  def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
+                DwarfRegNum<[!add(I, 32)]>;
+
+  /// Mips MSA registers
+  /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+  foreach I = 0-31 in
+  def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+            DwarfRegNum<[!add(I, 32)]>;
+
+  // Hi/Lo registers
+  def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+  def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+  def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+  def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+  def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+  def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+  def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+  def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
+
+  let SubRegIndices = [sub_32] in {
+  def HI0_64  : RegisterWithSubRegs<"hi", [HI0]>;
+  def LO0_64  : RegisterWithSubRegs<"lo", [LO0]>;
+  }
+
+  // FP control registers.
+  foreach I = 0-31 in
+  def FCR#I : MipsReg<#I, ""#I>;
+
+  // FP condition code registers.
+  foreach I = 0-7 in
+  def FCC#I : MipsReg<#I, "fcc"#I>;
+
+  // COP0 registers.
+  foreach I = 0-31 in
+  def COP0#I : MipsReg<#I, ""#I>;
+
+  // COP2 registers.
+  foreach I = 0-31 in
+  def COP2#I : MipsReg<#I, ""#I>;
+
+  // COP3 registers.
+  foreach I = 0-31 in
+  def COP3#I : MipsReg<#I, ""#I>;
+
+  // PC register
+  def PC : Register<"pc">;
+
+  // Hardware registers
+  def HWR0 : MipsReg<0, "hwr_cpunum">;
+  def HWR1 : MipsReg<1, "hwr_synci_step">;
+  def HWR2 : MipsReg<2, "hwr_cc">;
+  def HWR3 : MipsReg<3, "hwr_ccres">;
+
+  foreach I = 4-31 in
+  def HWR#I : MipsReg<#I, ""#I>;
+
+  // Accum registers
+  foreach I = 0-3 in
+  def AC#I : ACCReg<#I, "ac"#I,
+                    [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
+
+  def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
+
+  // DSP-ASE control register fields.
+  def DSPPos : Register<"">;
+  def DSPSCount : Register<"">;
+  def DSPCarry : Register<"">;
+  def DSPEFI : Register<"">;
+  def DSPOutFlag16_19 : Register<"">;
+  def DSPOutFlag20 : Register<"">;
+  def DSPOutFlag21 : Register<"">;
+  def DSPOutFlag22 : Register<"">;
+  def DSPOutFlag23 : Register<"">;
+  def DSPCCond : Register<"">;
+
+  let SubRegIndices = [sub_dsp16_19, sub_dsp20, sub_dsp21, sub_dsp22,
+                       sub_dsp23] in
+  def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
+                                            DSPOutFlag21, DSPOutFlag22,
+                                            DSPOutFlag23]>;
+
+  // MSA-ASE control registers.
+  def MSAIR      : MipsReg<0, "0">;
+  def MSACSR     : MipsReg<1, "1">;
+  def MSAAccess  : MipsReg<2, "2">;
+  def MSASave    : MipsReg<3, "3">;
+  def MSAModify  : MipsReg<4, "4">;
+  def MSARequest : MipsReg<5, "5">;
+  def MSAMap     : MipsReg<6, "6">;
+  def MSAUnmap   : MipsReg<7, "7">;
+
+  // Octeon multiplier and product registers
+  def MPL0 : MipsReg<0, "mpl0">;
+  def MPL1 : MipsReg<1, "mpl1">;
+  def MPL2 : MipsReg<2, "mpl2">;
+  def P0 : MipsReg<0, "p0">;
+  def P1 : MipsReg<1, "p1">;
+  def P2 : MipsReg<2, "p2">;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+class GPR32Class<list<ValueType> regTypes> :
+  RegisterClass<"Mips", regTypes, 32, (add
+  // Reserved
+  ZERO, AT,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7,
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Not preserved across procedure calls
+  T8, T9,
+  // Reserved
+  K0, K1, GP, SP, FP, RA)>;
+
+def GPR32 : GPR32Class<[i32]>;
+def DSPR  : GPR32Class<[v4i8, v2i16]>;
+
+def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
+  // Callee save
+  S0, S1,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16Zero : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  ZERO,
+  // Callee save
+  S1,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3)>;
+
+def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  ZERO,
+  // Callee save
+  S1,
+  // Return Values and Arguments
+  V0, V1,
+  // Callee save
+  S0, S2, S3, S4)>;
+
+def GPR64 : RegisterClass<"Mips", [i64], 64, (add
+// Reserved
+  ZERO_64, AT_64,
+  // Return Values and Arguments
+  V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+  // Not preserved across procedure calls
+  T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+  // Callee save
+  S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+  // Not preserved across procedure calls
+  T8_64, T9_64,
+  // Reserved
+  K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+
+def GPRMM16_64 : RegisterClass<"Mips", [i64], 64, (add
+  // Callee save
+  S0_64, S1_64,
+  // Return Values and Arguments
+  V0_64, V1_64, A0_64, A1_64, A2_64, A3_64)>;
+
+def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1)>;
+
+def CPU16RegsPlusSP : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1,
+  SP)>;
+
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
+
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+
+// 64bit fp:
+// * FGR64  - 32 64-bit registers
+// * AFGR64 - 16 32-bit even registers (32-bit FP Mode)
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+             Unallocatable;
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
+  // Return Values and Arguments
+  D0, D1,
+  // Not preserved across procedure calls
+  D2, D3, D4, D5,
+  // Return Values and Arguments
+  D6, D7,
+  // Not preserved across procedure calls
+  D8, D9,
+  // Callee save
+  D10, D11, D12, D13, D14, D15)>;
+
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
+
+// Used to reserve odd registers when given -mattr=+nooddspreg
+// FIXME: Remove double precision registers from this set.
+def OddSP : RegisterClass<"Mips", [f32], 32,
+                          (add (decimate (sequence "F%u", 1, 31), 2),
+                               (decimate (sequence "F_HI%u", 1, 31), 2),
+                               (decimate (sequence "D%u", 1, 15), 2),
+                               (decimate (sequence "D%u_64", 1, 31), 2))>,
+            Unallocatable;
+
+// FP control registers.
+def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
+          Unallocatable;
+
+// FP condition code registers.
+def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
+          Unallocatable;
+
+// MIPS32r6/MIPS64r6 store FPU condition codes in normal FGR registers.
+// This class allows us to represent this in codegen patterns.
+def FGRCC : RegisterClass<"Mips", [i32], 32, (sequence "F%u", 0, 31)>;
+
+def MSA128F16 : RegisterClass<"Mips", [f16], 128, (sequence "W%u", 0, 31)>;
+
+def MSA128B: RegisterClass<"Mips", [v16i8], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                                (decimate (sequence "W%u", 0, 31), 2)>;
+
+def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
+  MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+
+// Hi/Lo Registers
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
+
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, (sequence "HWR%u", 0, 31)>,
+             Unallocatable;
+
+// Accumulator Registers
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+  let Size = 64;
+}
+
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+  let Size = 128;
+}
+
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+  let Size = 64;
+}
+
+def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
+
+// Coprocessor 0 registers.
+def COP0 : RegisterClass<"Mips", [i32], 32, (sequence "COP0%u", 0, 31)>,
+           Unallocatable;
+
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+           Unallocatable;
+
+// Coprocessor 3 registers.
+def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>,
+           Unallocatable;
+
+// Stack pointer and global pointer classes for instructions that are limited
+// to a single register such as lwgp/lwsp in microMIPS.
+def SP32 : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+def SP64 : RegisterClass<"Mips", [i64], 64, (add SP_64)>, Unallocatable;
+def GP32 : RegisterClass<"Mips", [i32], 32, (add GP)>, Unallocatable;
+def GP64 : RegisterClass<"Mips", [i64], 64, (add GP_64)>, Unallocatable;
+
+// Octeon multiplier and product registers
+def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
+                 Unallocatable;
+def OCTEON_P : RegisterClass<"Mips", [i64], 64, (add P0, P1, P2)>,
+               Unallocatable;
+
+// Register Operands.
+
+class MipsAsmRegOperand : AsmOperandClass {
+  let ParserMethod = "parseAnyRegister";
+}
+
+def GPR64AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR64AsmReg";
+  let PredicateMethod = "isGPRAsmReg";
+}
+
+def GPR32AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR32AsmReg";
+  let PredicateMethod = "isGPRAsmReg";
+}
+
+def GPRMM16AsmOperand : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmReg";
+  let PredicateMethod = "isMM16AsmReg";
+}
+
+def GPRMM16AsmOperandZero : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmRegZero";
+  let PredicateMethod = "isMM16AsmRegZero";
+}
+
+def GPRMM16AsmOperandMoveP : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmRegMoveP";
+  let PredicateMethod = "isMM16AsmRegMoveP";
+}
+
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "ACC64DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "HI32DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "LO32DSPAsmReg";
+  let PredicateMethod = "isACCAsmReg";
+}
+
+def CCRAsmOperand : MipsAsmRegOperand {
+  let Name = "CCRAsmReg";
+}
+
+def AFGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "AFGR64AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR64AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGR32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR32AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FGRH32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGRH32AsmReg";
+  let PredicateMethod = "isFGRAsmReg";
+}
+
+def FCCRegsAsmOperand : MipsAsmRegOperand {
+  let Name = "FCCAsmReg";
+}
+
+def MSA128AsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128AsmReg";
+}
+
+def MSACtrlAsmOperand : MipsAsmRegOperand {
+  let Name = "MSACtrlAsmReg";
+}
+
+def GPR32Opnd : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def GPRMM16Opnd : RegisterOperand<GPRMM16> {
+  let ParserMatchClass = GPRMM16AsmOperand;
+}
+
+def GPRMM16OpndZero : RegisterOperand<GPRMM16Zero> {
+  let ParserMatchClass = GPRMM16AsmOperandZero;
+}
+
+def GPRMM16OpndMoveP : RegisterOperand<GPRMM16MoveP> {
+  let ParserMatchClass = GPRMM16AsmOperandMoveP;
+}
+
+def GPR64Opnd : RegisterOperand<GPR64> {
+  let ParserMatchClass = GPR64AsmOperand;
+}
+
+def DSPROpnd : RegisterOperand<DSPR> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def CCROpnd : RegisterOperand<CCR> {
+  let ParserMatchClass = CCRAsmOperand;
+}
+
+def HWRegsAsmOperand : MipsAsmRegOperand {
+  let Name = "HWRegsAsmReg";
+}
+
+def COP0AsmOperand : MipsAsmRegOperand {
+  let Name = "COP0AsmReg";
+}
+
+def COP2AsmOperand : MipsAsmRegOperand {
+  let Name = "COP2AsmReg";
+}
+
+def COP3AsmOperand : MipsAsmRegOperand {
+  let Name = "COP3AsmReg";
+}
+
+def HWRegsOpnd : RegisterOperand<HWRegs> {
+  let ParserMatchClass = HWRegsAsmOperand;
+}
+
+def AFGR64Opnd : RegisterOperand<AFGR64> {
+  let ParserMatchClass = AFGR64AsmOperand;
+}
+
+def FGR64Opnd : RegisterOperand<FGR64> {
+  let ParserMatchClass = FGR64AsmOperand;
+}
+
+def FGR32Opnd : RegisterOperand<FGR32> {
+  let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRCCOpnd : RegisterOperand<FGRCC> {
+  // The assembler doesn't use register classes so we can re-use
+  // FGR32AsmOperand.
+  let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+  let ParserMatchClass = FGRH32AsmOperand;
+}
+
+def FCCRegsOpnd : RegisterOperand<FCC> {
+  let ParserMatchClass = FCCRegsAsmOperand;
+}
+
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+  let ParserMatchClass = LO32DSPAsmOperand;
+}
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+  let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+  let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP0Opnd : RegisterOperand<COP0> {
+  let ParserMatchClass = COP0AsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+  let ParserMatchClass = COP2AsmOperand;
+}
+
+def COP3Opnd : RegisterOperand<COP3> {
+  let ParserMatchClass = COP3AsmOperand;
+}
+
+def MSA128F16Opnd : RegisterOperand<MSA128F16> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+  let ParserMatchClass = MSA128AsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+  let ParserMatchClass = MSACtrlAsmOperand;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
new file mode 100644
index 000000000000..4996d070eb29
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -0,0 +1,893 @@
+//===-- MipsSEFrameLowering.cpp - Mips32/64 Frame Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+typedef MachineBasicBlock::iterator Iter;
+
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+  if (Mips::ACC64RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI,
+                          (unsigned)Mips::PseudoMFLO);
+
+  if (Mips::ACC64DSPRegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::MFHI_DSP, (unsigned)Mips::MFLO_DSP);
+
+  if (Mips::ACC128RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI64,
+                          (unsigned)Mips::PseudoMFLO64);
+
+  return std::make_pair(0, 0);
+}
+
+/// Helper class to expand pseudos.
+class ExpandPseudo {
+public:
+  ExpandPseudo(MachineFunction &MF);
+  bool expand();
+
+private:
+  bool expandInstr(MachineBasicBlock &MBB, Iter I);
+  void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
+  void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
+  void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                      unsigned MFLoOpc, unsigned RegSize);
+  bool expandCopy(MachineBasicBlock &MBB, Iter I);
+  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                     unsigned MFLoOpc);
+  bool expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  bool expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
+
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  const MipsSubtarget &Subtarget;
+  const MipsSEInstrInfo &TII;
+  const MipsRegisterInfo &RegInfo;
+};
+}
+
+ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
+    : MF(MF_), MRI(MF.getRegInfo()),
+      Subtarget(static_cast<const MipsSubtarget &>(MF.getSubtarget())),
+      TII(*static_cast<const MipsSEInstrInfo *>(Subtarget.getInstrInfo())),
+      RegInfo(*Subtarget.getRegisterInfo()) {}
+
+bool ExpandPseudo::expand() {
+  bool Expanded = false;
+
+  for (auto &MBB : MF) {
+    for (Iter I = MBB.begin(), End = MBB.end(); I != End;)
+      Expanded |= expandInstr(MBB, I++);
+  }
+
+  return Expanded;
+}
+
+bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
+  switch(I->getOpcode()) {
+  case Mips::LOAD_CCOND_DSP:
+    expandLoadCCond(MBB, I);
+    break;
+  case Mips::STORE_CCOND_DSP:
+    expandStoreCCond(MBB, I);
+    break;
+  case Mips::LOAD_ACC64:
+  case Mips::LOAD_ACC64DSP:
+    expandLoadACC(MBB, I, 4);
+    break;
+  case Mips::LOAD_ACC128:
+    expandLoadACC(MBB, I, 8);
+    break;
+  case Mips::STORE_ACC64:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+    break;
+  case Mips::STORE_ACC64DSP:
+    expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
+    break;
+  case Mips::STORE_ACC128:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
+    break;
+  case Mips::BuildPairF64:
+    if (expandBuildPairF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::BuildPairF64_64:
+    if (expandBuildPairF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64:
+    if (expandExtractElementF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64_64:
+    if (expandExtractElementF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
+  case TargetOpcode::COPY:
+    if (!expandCopy(MBB, I))
+      return false;
+    break;
+  default:
+    return false;
+  }
+
+  MBB.erase(I);
+  return true;
+}
+
+void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
+  //  load $vr, FI
+  //  copy ccond, $vr
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+  unsigned VR = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+  TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst)
+    .addReg(VR, RegState::Kill);
+}
+
+void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
+  //  copy $vr, ccond
+  //  store $vr, FI
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(4);
+  unsigned VR = MRI.createVirtualRegister(RC);
+  unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+
+  BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR)
+    .addReg(Src, getKillRegState(I->getOperand(0).isKill()));
+  TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0);
+}
+
+void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned RegSize) {
+  //  load $vr0, FI
+  //  copy lo, $vr0
+  //  load $vr1, FI + 4
+  //  copy hi, $vr1
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+
+  TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill);
+  TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize);
+  BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill);
+}
+
+void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+                                  unsigned MFHiOpc, unsigned MFLoOpc,
+                                  unsigned RegSize) {
+  //  mflo $vr0, src
+  //  store $vr0, FI
+  //  mfhi $vr1, src
+  //  store $vr1, FI + 4
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
+  TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
+  TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
+}
+
+bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
+  unsigned Src = I->getOperand(1).getReg();
+  std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
+
+  if (!Opcodes.first)
+    return false;
+
+  return expandCopyACC(MBB, I, Opcodes.first, Opcodes.second);
+}
+
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned MFHiOpc, unsigned MFLoOpc) {
+  //  mflo $vr0, src
+  //  copy dst_lo, $vr0
+  //  mfhi $vr1, src
+  //  copy dst_hi, $vr1
+
+  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+  const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
+  unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
+    .addReg(VR0, RegState::Kill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
+    .addReg(VR1, RegState::Kill);
+  return true;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandBuildPairF64 does, for the case when ABI is fpxx and mthc1 is not
+/// available and the case where the ABI is FP64A. It is implemented here
+/// because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandBuildPairF64 is called.
+bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      bool FP64) const {
+  // For fpxx and when mthc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32-bits is
+  // transferred with mtc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values in regardless of being an odd/even register.
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned LoReg = I->getOperand(1).getReg();
+    unsigned HiReg = I->getOperand(2).getReg();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mthc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC2 =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+    if (!Subtarget.isLittle())
+      std::swap(LoReg, HiReg);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC,
+                        &RegInfo, 0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC,
+                        &RegInfo, 4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0);
+    return true;
+  }
+
+  return false;
+}
+
+/// This method expands the same instruction that MipsSEInstrInfo::
+/// expandExtractElementF64 does, for the case when ABI is fpxx and mfhc1 is not
+/// available and the case where the ABI is FP64A. It is implemented here
+/// because frame indexes are eliminated before MipsSEInstrInfo::
+/// expandExtractElementF64 is called.
+bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           bool FP64) const {
+  const MachineOperand &Op1 = I->getOperand(1);
+  const MachineOperand &Op2 = I->getOperand(2);
+
+  if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg);
+    return true;
+  }
+
+  // For fpxx and when mfhc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmfc1 is available doesn't need to be handled here
+  // because it never creates a ExtractElementF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32-bits is
+  // transferred with mfc1 which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation so for now we use a spill/reload sequence for all
+  // double-precision values in regardless of being an odd/even register.
+
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned SrcReg = Op1.getReg();
+    unsigned N = Op2.getImm();
+    int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N));
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mfhc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+    const TargetRegisterClass *RC2 = &Mips::GPR32RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
+    TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset);
+    return true;
+  }
+
+  return false;
+}
+
+MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
+
+void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI    = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const MipsSEInstrInfo &TII =
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+      *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl;
+  MipsABIInfo ABI = STI.getABI();
+  unsigned SP = ABI.GetStackPtr();
+  unsigned FP = ABI.GetFramePtr();
+  unsigned ZERO = ABI.GetNullPtr();
+  unsigned MOVE = ABI.GetGPRMoveOp();
+  unsigned ADDiu = ABI.GetPtrAddiuOp();
+  unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND;
+
+  const TargetRegisterClass *RC = ABI.ArePtrs64bit() ?
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+  // First, compute final stack size.
+  uint64_t StackSize = MFI.getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI.adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  unsigned CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  if (MF.getFunction()->hasFnAttribute("interrupt"))
+    emitInterruptPrologueStub(MF, MBB);
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  if (CSI.size()) {
+    // Find the instruction past the last instruction that saves a callee-saved
+    // register to the stack.
+    for (unsigned i = 0; i < CSI.size(); ++i)
+      ++MBBI;
+
+    // Iterate over list of callee-saved registers and emit .cfi_offset
+    // directives.
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+
+      // If Reg is a double precision register, emit two cfa_offsets,
+      // one for each of the paired single precision registers.
+      if (Mips::AFGR64RegClass.contains(Reg)) {
+        unsigned Reg0 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
+        unsigned Reg1 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      } else if (Mips::FGR64RegClass.contains(Reg)) {
+        unsigned Reg0 = MRI->getDwarfRegNum(Reg, true);
+        unsigned Reg1 = MRI->getDwarfRegNum(Reg, true) + 1;
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      } else {
+        // Reg is either in GPR32 or FGR32.
+        unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    }
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    // Insert instructions that spill eh data registers.
+    for (int I = 0; I < 4; ++I) {
+      if (!MBB.isLiveIn(ABI.GetEhDataReg(I)))
+        MBB.addLiveIn(ABI.GetEhDataReg(I));
+      TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false,
+                              MipsFI->getEhDataRegFI(I), RC, &RegInfo);
+    }
+
+    // Emit .cfi_offset directives for eh data registers.
+    for (int I = 0; I < 4; ++I) {
+      int64_t Offset = MFI.getObjectOffset(MipsFI->getEhDataRegFI(I));
+      unsigned Reg = MRI->getDwarfRegNum(ABI.GetEhDataReg(I), true);
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  // if framepointer enabled, set it to point to the stack pointer.
+  if (hasFP(MF)) {
+    // Insert instruction "move $fp, $sp" at this location.
+    BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // emit ".cfi_def_cfa_register $fp"
+    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+        nullptr, MRI->getDwarfRegNum(FP, true)));
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+
+    if (RegInfo.needsStackRealignment(MF)) {
+      // addiu $Reg, $zero, -MaxAlignment
+      // andi $sp, $sp, $Reg
+      unsigned VR = MF.getRegInfo().createVirtualRegister(RC);
+      assert(isInt<16>(MFI.getMaxAlignment()) &&
+             "Function's alignment size requirement is not supported.");
+      int MaxAlign = -(int)MFI.getMaxAlignment();
+
+      BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO) .addImm(MaxAlign);
+      BuildMI(MBB, MBBI, dl, TII.get(AND), SP).addReg(SP).addReg(VR);
+
+      if (hasBP(MF)) {
+        // move $s7, $sp
+        unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
+        BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP)
+          .addReg(SP)
+          .addReg(ZERO);
+      }
+    }
+  }
+}
+
+void MipsSEFrameLowering::emitInterruptPrologueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Report an error the target doesn't support Mips32r2 or later.
+  // The epilogue relies on the use of the "ehb" to clear execution
+  // hazards. Pre R2 Mips relies on an implementation defined number
+  // of "ssnop"s to clear the execution hazard. Support for ssnop hazard
+  // clearing is not provided so reject that configuration.
+  if (!STI.hasMips32r2())
+    report_fatal_error(
+        "\"interrupt\" attribute is not supported on pre-MIPS32R2 or "
+        "MIPS16 targets.");
+
+  // The GP register contains the "user" value, so we cannot perform
+  // any gp relative loads until we restore the "kernel" or "system" gp
+  // value. Until support is written we shall only accept the static
+  // relocation model.
+  if ((STI.getRelocationModel() != Reloc::Static))
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "static relocation model on MIPS at the present time.");
+
+  if (!STI.isABI_O32() || STI.hasMips64())
+    report_fatal_error("\"interrupt\" attribute is only supported for the "
+                       "O32 ABI on MIPS32R2+ at the present time.");
+
+  // Perform ISR handling like GCC
+  StringRef IntKind =
+      MF.getFunction()->getFnAttribute("interrupt").getValueAsString();
+  const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+  // EIC interrupt handling needs to read the Cause register to disable
+  // interrupts.
+  if (IntKind == "eic") {
+    // Coprocessor registers are always live per se.
+    MBB.addLiveIn(Mips::COP013);
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0)
+        .addReg(Mips::COP013)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0)
+        .addReg(Mips::K0)
+        .addImm(10)
+        .addImm(6)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Fetch and spill EPC
+  MBB.addLiveIn(Mips::COP014);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP014)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(0), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Fetch and Spill Status
+  MBB.addLiveIn(Mips::COP012);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1)
+      .addReg(Mips::COP012)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false,
+                                      MipsFI->getISRRegFI(1), PtrRC,
+                                      STI.getRegisterInfo(), 0);
+
+  // Build the configuration for disabling lower priority interrupts. Non EIC
+  // interrupts need to be masked off with zero, EIC from the Cause register.
+  unsigned InsPosition = 8;
+  unsigned InsSize = 0;
+  unsigned SrcReg = Mips::ZERO;
+
+  // If the interrupt we're tied to is the EIC, switch the source for the
+  // masking off interrupts to the cause register.
+  if (IntKind == "eic") {
+    SrcReg = Mips::K0;
+    InsPosition = 10;
+    InsSize = 6;
+  } else
+    InsSize = StringSwitch<unsigned>(IntKind)
+                  .Case("sw0", 1)
+                  .Case("sw1", 2)
+                  .Case("hw0", 3)
+                  .Case("hw1", 4)
+                  .Case("hw2", 5)
+                  .Case("hw3", 6)
+                  .Case("hw4", 7)
+                  .Case("hw5", 8)
+                  .Default(0);
+  assert(InsSize != 0 && "Unknown interrupt type!");
+
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+      .addReg(SrcReg)
+      .addImm(InsPosition)
+      .addImm(InsSize)
+      .addReg(Mips::K1)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Mask off KSU, ERL, EXL
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+      .addReg(Mips::ZERO)
+      .addImm(1)
+      .addImm(4)
+      .addReg(Mips::K1)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+  // Disable the FPU as we are not spilling those register sets.
+  if (!STI.useSoftFloat())
+    BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1)
+        .addReg(Mips::ZERO)
+        .addImm(29)
+        .addImm(1)
+        .addReg(Mips::K1)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  // Set the new status
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
+      .addReg(Mips::K1)
+      .addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+}
+
+void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo &MFI            = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const MipsSEInstrInfo &TII =
+      *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+      *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo());
+
+  DebugLoc DL = MBBI->getDebugLoc();
+  MipsABIInfo ABI = STI.getABI();
+  unsigned SP = ABI.GetStackPtr();
+  unsigned FP = ABI.GetFramePtr();
+  unsigned ZERO = ABI.GetNullPtr();
+  unsigned MOVE = ABI.GetGPRMoveOp();
+
+  // if framepointer enabled, restore the stack pointer.
+  if (hasFP(MF)) {
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+
+    for (unsigned i = 0; i < MFI.getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO);
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC =
+        ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+    // Find first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    for (unsigned i = 0; i < MFI.getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instructions that restore eh data registers.
+    for (int J = 0; J < 4; ++J) {
+      TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J),
+                               MipsFI->getEhDataRegFI(J), RC, &RegInfo);
+    }
+  }
+
+  if (MF.getFunction()->hasFnAttribute("interrupt"))
+    emitInterruptEpilogueStub(MF, MBB);
+
+  // Get the number of bytes from FrameInfo
+  uint64_t StackSize = MFI.getStackSize();
+
+  if (!StackSize)
+    return;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
+}
+
+void MipsSEFrameLowering::emitInterruptEpilogueStub(
+    MachineFunction &MF, MachineBasicBlock &MBB) const {
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Perform ISR handling like GCC
+  const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass;
+
+  // Disable Interrupts.
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::DI), Mips::ZERO);
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB));
+
+  // Restore EPC
+  STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
+                                           MipsFI->getISRRegFI(0), PtrRC,
+                                           STI.getRegisterInfo());
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014)
+      .addReg(Mips::K1)
+      .addImm(0);
+
+  // Restore Status
+  STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1,
+                                           MipsFI->getISRRegFI(1), PtrRC,
+                                           STI.getRegisterInfo());
+  BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012)
+      .addReg(Mips::K1)
+      .addImm(0);
+}
+
+int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MipsABIInfo ABI = STI.getABI();
+
+  if (MFI.isFixedObjectIndex(FI))
+    FrameReg = hasFP(MF) ? ABI.GetFramePtr() : ABI.GetStackPtr();
+  else
+    FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr();
+
+  return MFI.getObjectOffset(FI) + MFI.getStackSize() -
+         getOffsetOfLocalArea() + MFI.getOffsetAdjustment();
+}
+
+bool MipsSEFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = &MF->front();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::lowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
+        && MF->getFrameInfo().isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+
+    // ISRs require HI/LO to be spilled into kernel registers to be then
+    // spilled to the stack frame.
+    bool IsLOHI = (Reg == Mips::LO0 || Reg == Mips::LO0_64 ||
+                   Reg == Mips::HI0 || Reg == Mips::HI0_64);
+    const Function *Func = MBB.getParent()->getFunction();
+    if (IsLOHI && Func->hasFnAttribute("interrupt")) {
+      DebugLoc DL = MI->getDebugLoc();
+
+      unsigned Op = 0;
+      if (!STI.getABI().ArePtrs64bit()) {
+        Op = (Reg == Mips::HI0) ? Mips::MFHI : Mips::MFLO;
+        Reg = Mips::K0;
+      } else {
+        Op = (Reg == Mips::HI0) ? Mips::MFHI64 : Mips::MFLO64;
+        Reg = Mips::K0_64;
+      }
+      BuildMI(MBB, MI, DL, TII.get(Op), Mips::K0)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    // Insert the spill to the stack frame.
+    bool IsKill = !IsRAAndRetAddrIsTaken;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+                            CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  return true;
+}
+
+bool
+MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Reserve call frame if the size of the maximum call frame fits into 16-bit
+  // immediate field and there are no variable sized objects on the stack.
+  // Make sure the second register scavenger spill slot can be accessed with one
+  // instruction.
+  return isInt<16>(MFI.getMaxCallFrameSize() + getStackAlignment()) &&
+    !MFI.hasVarSizedObjects();
+}
+
+/// Mark \p Reg and all registers aliasing it in the bitset.
+static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs,
+                         unsigned Reg) {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    SavedRegs.set(*AI);
+}
+
+void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                               BitVector &SavedRegs,
+                                               RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MipsABIInfo ABI = STI.getABI();
+  unsigned FP = ABI.GetFramePtr();
+  unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7;
+
+  // Mark $fp as used if function has dedicated frame pointer.
+  if (hasFP(MF))
+    setAliasRegs(MF, SavedRegs, FP);
+  // Mark $s7 as used if function has dedicated base pointer.
+  if (hasBP(MF))
+    setAliasRegs(MF, SavedRegs, BP);
+
+  // Create spill slots for eh data registers if function calls eh_return.
+  if (MipsFI->callsEhReturn())
+    MipsFI->createEhDataRegsFI();
+
+  // Create spill slots for Coprocessor 0 registers if function is an ISR.
+  if (MipsFI->isISR())
+    MipsFI->createISRRegFI();
+
+  // Expand pseudo instructions which load, store or copy accumulators.
+  // Add an emergency spill slot if a pseudo was expanded.
+  if (ExpandPseudo(MF).expand()) {
+    // The spill slot should be half the size of the accumulator. If target is
+    // mips64, it should be 64-bit, otherwise it should be 32-bt.
+    const TargetRegisterClass *RC = STI.hasMips64() ?
+      &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+    int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(),
+                                                  RC->getAlignment(), false);
+    RS->addScavengingFrameIndex(FI);
+  }
+
+  // Set scavenging frame index if necessary.
+  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
+    estimateStackSize(MF);
+
+  if (isInt<16>(MaxSPOffset))
+    return;
+
+  const TargetRegisterClass *RC =
+      ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(),
+                                                RC->getAlignment(), false);
+  RS->addScavengingFrameIndex(FI);
+}
+
+const MipsFrameLowering *
+llvm::createMipsSEFrameLowering(const MipsSubtarget &ST) {
+  return new MipsSEFrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
new file mode 100644
index 000000000000..63cd3cebc56a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -0,0 +1,52 @@
+//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+
+class MipsSEFrameLowering : public MipsFrameLowering {
+public:
+  explicit MipsSEFrameLowering(const MipsSubtarget &STI);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+  unsigned ehDataReg(unsigned I) const;
+
+private:
+  void emitInterruptEpilogueStub(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const;
+  void emitInterruptPrologueStub(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
new file mode 100644
index 000000000000..6f0fdddd7d55
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -0,0 +1,1067 @@
+//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEISelDAGToDAG.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+  if (Subtarget->inMips16Mode())
+    return false;
+  return MipsDAGToDAGISel::runOnMachineFunction(MF);
+}
+
+void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+                                               MachineFunction &MF) {
+  MachineInstrBuilder MIB(MF, &MI);
+  unsigned Mask = MI.getOperand(1).getImm();
+  unsigned Flag =
+      IsDef ? RegState::ImplicitDefine : RegState::Implicit | RegState::Undef;
+
+  if (Mask & 1)
+    MIB.addReg(Mips::DSPPos, Flag);
+
+  if (Mask & 2)
+    MIB.addReg(Mips::DSPSCount, Flag);
+
+  if (Mask & 4)
+    MIB.addReg(Mips::DSPCarry, Flag);
+
+  if (Mask & 8)
+    MIB.addReg(Mips::DSPOutFlag, Flag);
+
+  if (Mask & 16)
+    MIB.addReg(Mips::DSPCCond, Flag);
+
+  if (Mask & 32)
+    MIB.addReg(Mips::DSPEFI, Flag);
+}
+
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+  switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+  default:
+    llvm_unreachable("Could not map int to register");
+  case 0: return Mips::MSAIR;
+  case 1: return Mips::MSACSR;
+  case 2: return Mips::MSAAccess;
+  case 3: return Mips::MSASave;
+  case 4: return Mips::MSAModify;
+  case 5: return Mips::MSARequest;
+  case 6: return Mips::MSAMap;
+  case 7: return Mips::MSAUnmap;
+  }
+}
+
+bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
+                                                const MachineInstr& MI) {
+  unsigned DstReg = 0, ZeroReg = 0;
+
+  // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
+  if ((MI.getOpcode() == Mips::ADDiu) &&
+      (MI.getOperand(1).getReg() == Mips::ZERO) &&
+      (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO;
+  } else if ((MI.getOpcode() == Mips::DADDiu) &&
+             (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+             (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO_64;
+  }
+
+  if (!DstReg)
+    return false;
+
+  // Replace uses with ZeroReg.
+  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+       E = MRI->use_end(); U != E;) {
+    MachineOperand &MO = *U;
+    unsigned OpNo = U.getOperandNo();
+    MachineInstr *MI = MO.getParent();
+    ++U;
+
+    // Do not replace if it is a phi's operand or is tied to def operand.
+    if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
+      continue;
+
+    // Also, we have to check that the register class of the operand
+    // contains the zero register.
+    if (!MRI->getRegClass(MO.getReg())->contains(ZeroReg))
+      continue;
+
+    MO.setReg(ZeroReg);
+  }
+
+  return true;
+}
+
+void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL;
+  unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC;
+  const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+  RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+
+  if (ABI.IsN64()) {
+    MF.getRegInfo().addLiveIn(Mips::T9_64);
+    MBB.addLiveIn(Mips::T9_64);
+
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // daddu $v1, $v0, $t9
+    // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+      .addReg(Mips::T9_64);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  if (!MF.getTarget().isPositionIndependent()) {
+    // Set global register to __gnu_local_gp.
+    //
+    // lui   $v0, %hi(__gnu_local_gp)
+    // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+    return;
+  }
+
+  MF.getRegInfo().addLiveIn(Mips::T9);
+  MBB.addLiveIn(Mips::T9);
+
+  if (ABI.IsN32()) {
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // addu $v1, $v0, $t9
+    // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  assert(ABI.IsO32());
+
+  // For O32 ABI, the following instruction sequence is emitted to initialize
+  // the global base register:
+  //
+  //  0. lui   $2, %hi(_gp_disp)
+  //  1. addiu $2, $2, %lo(_gp_disp)
+  //  2. addu  $globalbasereg, $2, $t9
+  //
+  // We emit only the last instruction here.
+  //
+  // GNU linker requires that the first two instructions appear at the beginning
+  // of a function and no instructions be inserted before or between them.
+  // The two instructions are emitted during lowering to MC layer in order to
+  // avoid any reordering.
+  //
+  // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+  // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
+  // reads it.
+  MF.getRegInfo().addLiveIn(Mips::V0);
+  MBB.addLiveIn(Mips::V0);
+  BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+    .addReg(Mips::V0).addReg(Mips::T9);
+}
+
+void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  for (auto &MBB: MF) {
+    for (auto &MI: MBB) {
+      switch (MI.getOpcode()) {
+      case Mips::RDDSP:
+        addDSPCtrlRegOperands(false, MI, MF);
+        break;
+      case Mips::WRDSP:
+        addDSPCtrlRegOperands(true, MI, MF);
+        break;
+      default:
+        replaceUsesWithZeroReg(MRI, MI);
+      }
+    }
+  }
+}
+
+void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+                                        SDValue CmpLHS, const SDLoc &DL,
+                                        SDNode *Node) const {
+  unsigned Opc = InFlag.getOpcode(); (void)Opc;
+
+  assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+          (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+         "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+  unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
+  if (Subtarget->isGP64bit()) {
+    SLTuOp = Mips::SLTu64;
+    ADDuOp = Mips::DADDu;
+  }
+
+  SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+  SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops);
+
+  if (Subtarget->isGP64bit()) {
+    // On 64-bit targets, sltu produces an i64 but our backend currently says
+    // that SLTu64 produces an i32. We need to fix this in the long run but for
+    // now, just make the DAG type-correct by asserting the upper bits are zero.
+    Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
+                                   CurDAG->getTargetConstant(0, DL, VT),
+                                   SDValue(Carry, 0),
+                                   CurDAG->getTargetConstant(Mips::sub_32, DL,
+                                                             VT));
+  }
+
+  // Generate a second addition only if we know that RHS is not a
+  // constant-zero node.
+  SDNode *AddCarry = Carry;
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue())
+    AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+
+  CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+}
+
+/// Match frameindex
+bool MipsSEDAGToDAGISel::selectAddrFrameIndex(SDValue Addr, SDValue &Base,
+                                              SDValue &Offset) const {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    EVT ValTy = Addr.getValueType();
+
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), ValTy);
+    return true;
+  }
+  return false;
+}
+
+/// Match frameindex+offset and frameindex|offset
+bool MipsSEDAGToDAGISel::selectAddrFrameIndexOffset(
+    SDValue Addr, SDValue &Base, SDValue &Offset, unsigned OffsetBits,
+    unsigned ShiftAmount = 0) const {
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isIntN(OffsetBits + ShiftAmount, CN->getSExtValue())) {
+      EVT ValTy = Addr.getValueType();
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else {
+        Base = Addr.getOperand(0);
+        // If base is a FI, additional offset calculation is done in
+        // eliminateFrameIndex, otherwise we need to check the alignment
+        if (OffsetToAlignment(CN->getZExtValue(), 1ull << ShiftAmount) != 0)
+          return false;
+      }
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr),
+                                         ValTy);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  // if Address is FI, get the TargetFrameIndex.
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  if (!TM.isPositionIndependent()) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+
+  // Addresses of the form FI+const or FI|const
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16))
+    return true;
+
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), Addr.getValueType());
+  return true;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  return selectAddrRegImm(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectAddrRegImm9(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 9))
+    return true;
+
+  return false;
+}
+
+/// Used on microMIPS LWC2, LDC2, SWC2 and SDC2 instructions (11-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm11(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 11))
+    return true;
+
+  return false;
+}
+
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 12))
+    return true;
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectAddrRegImm16(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 16))
+    return true;
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr11MM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm11(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr12MM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm12(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr16MM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm16(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                                             SDValue &Offset) const {
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 7)) {
+    if (isa<FrameIndexSDNode>(Base))
+      return false;
+
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Offset)) {
+      unsigned CnstOff = CN->getZExtValue();
+      return (CnstOff == (CnstOff & 0x3c));
+    }
+
+    return false;
+  }
+
+  // For all other cases where "lw" would be selected, don't select "lw16"
+  // because it would result in additional instructions to prepare operands.
+  if (selectAddrRegImm(Addr, Base, Offset))
+    return false;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+                                             SDValue &Offset) const {
+
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10))
+    return true;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+                                                 SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 1))
+    return true;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+                                                 SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 2))
+    return true;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+                                                 SDValue &Offset) const {
+  if (selectAddrFrameIndex(Addr, Base, Offset))
+    return true;
+
+  if (selectAddrFrameIndexOffset(Addr, Base, Offset, 10, 3))
+    return true;
+
+  return selectAddrDefault(Addr, Base, Offset);
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm if:
+// * MSA is enabled
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+                                      unsigned MinSizeInBits) const {
+  if (!Subtarget->hasMSA())
+    return false;
+
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!Node)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             MinSizeInBits, !Subtarget->isLittle()))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+//   width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::selectNode.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                   unsigned ImmBitSize) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+
+    if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+        (!Signed && ImmValue.isIntN(ImmBitSize))) {
+      Imm = CurDAG->getTargetConstant(ImmValue, SDLoc(N), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 1);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 2);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = ImmValue.exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero from the bitwise
+    // inverse of ImmValue, and test that the inverse of this is the same
+    // as the original value.
+    if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+                                      EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero, and test that the
+    // result is the same as the original value
+    if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+                                      EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+                                                 SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = (~ImmValue).exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  SDLoc DL(Node);
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE: {
+    SDValue InFlag = Node->getOperand(2);
+    unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
+    selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
+    return true;
+  }
+
+  case ISD::ADDE: {
+    if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+      break;
+    SDValue InFlag = Node->getOperand(2);
+    unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
+    selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
+    return true;
+  }
+
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
+    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
+      if (Subtarget->isGP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO_64, MVT::i64);
+        ReplaceNode(Node,
+                    CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero));
+      } else if (Subtarget->isFP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64_64, DL,
+                                                 MVT::f64, Zero, Zero));
+      } else {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        ReplaceNode(Node, CurDAG->getMachineNode(Mips::BuildPairF64, DL,
+                                                 MVT::f64, Zero, Zero));
+      }
+      return true;
+    }
+    break;
+  }
+
+  case ISD::Constant: {
+    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
+    int64_t Imm = CN->getSExtValue();
+    unsigned Size = CN->getValueSizeInBits(0);
+
+    if (isInt<32>(Imm))
+      break;
+
+    MipsAnalyzeImmediate AnalyzeImm;
+
+    const MipsAnalyzeImmediate::InstSeq &Seq =
+      AnalyzeImm.Analyze(Imm, Size, false);
+
+    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+    SDLoc DL(CN);
+    SDNode *RegOpnd;
+    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                                DL, MVT::i64);
+
+    // The first instruction can be a LUi which is different from other
+    // instructions (ADDiu, ORI and SLL) in that it does not have a register
+    // operand.
+    if (Inst->Opc == Mips::LUi64)
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
+    else
+      RegOpnd =
+        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
+                               ImmOpnd);
+
+    // The remaining instructions in the sequence are handled here.
+    for (++Inst; Inst != Seq.end(); ++Inst) {
+      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd), DL,
+                                          MVT::i64);
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                                       SDValue(RegOpnd, 0), ImmOpnd);
+    }
+
+    ReplaceNode(Node, RegOpnd);
+    return true;
+  }
+
+  case ISD::INTRINSIC_W_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_cfcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx = Node->getOperand(2);
+      SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+                                           getMSACtrlReg(RegIdx), MVT::i32);
+      ReplaceNode(Node, Reg.getNode());
+      return true;
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_move_v:
+      // Like an assignment but will always produce a move.v even if
+      // unnecessary.
+      ReplaceNode(Node, CurDAG->getMachineNode(Mips::MOVE_V, DL,
+                                               Node->getValueType(0),
+                                               Node->getOperand(1)));
+      return true;
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_VOID: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_ctcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx  = Node->getOperand(2);
+      SDValue Value   = Node->getOperand(3);
+      SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+                                              getMSACtrlReg(RegIdx), Value);
+      ReplaceNode(Node, ChainOut.getNode());
+      return true;
+    }
+    }
+    break;
+  }
+
+  case MipsISD::ThreadPointer: {
+    EVT PtrVT = getTargetLowering()->getPointerTy(CurDAG->getDataLayout());
+    unsigned RdhwrOpc, DestReg;
+
+    if (PtrVT == MVT::i32) {
+      RdhwrOpc = Mips::RDHWR;
+      DestReg = Mips::V1;
+    } else {
+      RdhwrOpc = Mips::RDHWR64;
+      DestReg = Mips::V1_64;
+    }
+
+    SDNode *Rdhwr =
+      CurDAG->getMachineNode(RdhwrOpc, DL,
+                             Node->getValueType(0),
+                             CurDAG->getRegister(Mips::HWR29, MVT::i32));
+    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
+                                         SDValue(Rdhwr, 0));
+    SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+    ReplaceNode(Node, ResNode.getNode());
+    return true;
+  }
+
+  case ISD::BUILD_VECTOR: {
+    // Select appropriate ldi.[bhwd] instructions for constant splats of
+    // 128-bit when MSA is enabled. Fixup any register class mismatches that
+    // occur as a result.
+    //
+    // This allows the compiler to use a wider range of immediates than would
+    // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+    // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+    // 0x01010101 } without using a constant pool. This would be sub-optimal
+    // when // 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+    // same set/ of registers. Similarly, ldi.h isn't capable of producing {
+    // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned LdiOp;
+    EVT ResVecTy = BVN->getValueType(0);
+    EVT ViaVecTy;
+
+    if (!Subtarget->hasMSA() || !BVN->getValueType(0).is128BitVector())
+      return false;
+
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, 8,
+                              !Subtarget->isLittle()))
+      return false;
+
+    switch (SplatBitSize) {
+    default:
+      return false;
+    case 8:
+      LdiOp = Mips::LDI_B;
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      LdiOp = Mips::LDI_H;
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      LdiOp = Mips::LDI_W;
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      LdiOp = Mips::LDI_D;
+      ViaVecTy = MVT::v2i64;
+      break;
+    }
+
+    if (!SplatValue.isSignedIntN(10))
+      return false;
+
+    SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
+                                            ViaVecTy.getVectorElementType());
+
+    SDNode *Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+
+    if (ResVecTy != ViaVecTy) {
+      // If LdiOp is writing to a different register class to ResVecTy, then
+      // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+      // since the source and destination register sets contain the same
+      // registers.
+      const TargetLowering *TLI = getTargetLowering();
+      MVT ResVecTySimple = ResVecTy.getSimpleVT();
+      const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+      Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, DL,
+                                   ResVecTy, SDValue(Res, 0),
+                                   CurDAG->getTargetConstant(RC->getID(), DL,
+                                                             MVT::i32));
+    }
+
+    ReplaceNode(Node, Res);
+    return true;
+  }
+
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Base, Offset;
+
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  // All memory constraints can at least accept raw pointers.
+  case InlineAsm::Constraint_i:
+    OutOps.push_back(Op);
+    OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+    return false;
+  case InlineAsm::Constraint_m:
+    if (selectAddrRegImm16(Op, Base, Offset)) {
+      OutOps.push_back(Base);
+      OutOps.push_back(Offset);
+      return false;
+    }
+    OutOps.push_back(Op);
+    OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+    return false;
+  case InlineAsm::Constraint_R:
+    // The 'R' constraint is supposed to be much more complicated than this.
+    // However, it's becoming less useful due to architectural changes and
+    // ought to be replaced by other constraints such as 'ZC'.
+    // For now, support 9-bit signed offsets which is supportable by all
+    // subtargets for all instructions.
+    if (selectAddrRegImm9(Op, Base, Offset)) {
+      OutOps.push_back(Base);
+      OutOps.push_back(Offset);
+      return false;
+    }
+    OutOps.push_back(Op);
+    OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+    return false;
+  case InlineAsm::Constraint_ZC:
+    // ZC matches whatever the pref, ll, and sc instructions can handle for the
+    // given subtarget.
+    if (Subtarget->inMicroMipsMode()) {
+      // On microMIPS, they can handle 12-bit offsets.
+      if (selectAddrRegImm12(Op, Base, Offset)) {
+        OutOps.push_back(Base);
+        OutOps.push_back(Offset);
+        return false;
+      }
+    } else if (Subtarget->hasMips32r6()) {
+      // On MIPS32r6/MIPS64r6, they can only handle 9-bit offsets.
+      if (selectAddrRegImm9(Op, Base, Offset)) {
+        OutOps.push_back(Base);
+        OutOps.push_back(Offset);
+        return false;
+      }
+    } else if (selectAddrRegImm16(Op, Base, Offset)) {
+      // Prior to MIPS32r6/MIPS64r6, they can handle 16-bit offsets.
+      OutOps.push_back(Base);
+      OutOps.push_back(Offset);
+      return false;
+    }
+    // In all cases, 0-bit offsets are acceptable.
+    OutOps.push_back(Op);
+    OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+    return false;
+  }
+  return true;
+}
+
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new MipsSEDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
new file mode 100644
index 000000000000..2a8e5877e848
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -0,0 +1,146 @@
+//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
+
+public:
+  explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM, CodeGenOpt::Level OL)
+      : MipsDAGToDAGISel(TM, OL) {}
+
+private:
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
+                             MachineFunction &MF);
+
+  unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
+  bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
+
+  std::pair<SDNode *, SDNode *> selectMULT(SDNode *N, unsigned Opc,
+                                           const SDLoc &dl, EVT Ty, bool HasLo,
+                                           bool HasHi);
+
+  void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+                      const SDLoc &DL, SDNode *Node) const;
+
+  bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
+  bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
+                                  unsigned OffsetBits,
+                                  unsigned ShiftAmount) const;
+
+  bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
+
+  bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
+
+  bool selectIntAddr(SDValue Addr, SDValue &Base,
+                     SDValue &Offset) const override;
+
+  bool selectAddrRegImm9(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const;
+
+  bool selectAddrRegImm11(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
+
+  bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
+
+  bool selectAddrRegImm16(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
+
+  bool selectIntAddr11MM(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
+
+  bool selectIntAddr12MM(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
+
+  bool selectIntAddr16MM(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
+
+  bool selectIntAddrLSL2MM(SDValue Addr, SDValue &Base,
+                           SDValue &Offset) const override;
+
+  bool selectIntAddrSImm10(SDValue Addr, SDValue &Base,
+                           SDValue &Offset) const override;
+
+  bool selectIntAddrSImm10Lsl1(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const override;
+
+  bool selectIntAddrSImm10Lsl2(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const override;
+
+  bool selectIntAddrSImm10Lsl3(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const override;
+
+  /// \brief Select constant vector splats.
+  bool selectVSplat(SDNode *N, APInt &Imm,
+                    unsigned MinSizeInBits) const override;
+  /// \brief Select constant vector splats whose value fits in a given integer.
+  bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                                  unsigned ImmBitSize) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
+
+  bool trySelect(SDNode *Node) override;
+
+  void processFunctionAfterISel(MachineFunction &MF) override;
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+};
+
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM,
+                                  CodeGenOpt::Level OptLevel);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
new file mode 100644
index 000000000000..26e0f9a94368
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -0,0 +1,3779 @@
+//===-- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsSEISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-isel"
+
+static cl::opt<bool>
+UseMipsTailCalls("mips-tail-calls", cl::Hidden,
+                    cl::desc("MIPS: permit tail calls."), cl::init(false));
+
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+                                   cl::desc("Expand double precision loads and "
+                                            "stores to their single precision "
+                                            "counterparts"));
+
+MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
+
+  if (Subtarget.isGP64bit())
+    addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
+
+  if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
+    // Expand all truncating stores and extending loads.
+    for (MVT VT0 : MVT::vector_valuetypes()) {
+      for (MVT VT1 : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT0, VT1, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT0, VT1, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT0, VT1, Expand);
+      }
+    }
+  }
+
+  if (Subtarget.hasDSP()) {
+    MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+    for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+      addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
+
+      // Expand all builtin opcodes.
+      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+        setOperationAction(Opc, VecTys[i], Expand);
+
+      setOperationAction(ISD::ADD, VecTys[i], Legal);
+      setOperationAction(ISD::SUB, VecTys[i], Legal);
+      setOperationAction(ISD::LOAD, VecTys[i], Legal);
+      setOperationAction(ISD::STORE, VecTys[i], Legal);
+      setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+    }
+
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SETCC);
+    setTargetDAGCombine(ISD::VSELECT);
+  }
+
+  if (Subtarget.hasDSPR2())
+    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+
+  if (Subtarget.hasMSA()) {
+    addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+    addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+    addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+    addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+    addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+    addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+    addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+    // f16 is a storage-only type, always promote it to f32.
+    addRegisterClass(MVT::f16, &Mips::MSA128HRegClass);
+    setOperationAction(ISD::SETCC, MVT::f16, Promote);
+    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT, MVT::f16, Promote);
+    setOperationAction(ISD::FADD, MVT::f16, Promote);
+    setOperationAction(ISD::FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::FMA, MVT::f16, Promote);
+    setOperationAction(ISD::FNEG, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
+    setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FP_EXTEND, MVT::f16, Promote);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::FPOW, MVT::f16, Promote);
+    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+    setOperationAction(ISD::FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::FSIN, MVT::f16, Promote);
+    setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+
+    setTargetDAGCombine(ISD::AND);
+    setTargetDAGCombine(ISD::OR);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::VSELECT);
+    setTargetDAGCombine(ISD::XOR);
+  }
+
+  if (!Subtarget.useSoftFloat()) {
+    addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+
+    // When dealing with single precision only, use libcalls
+    if (!Subtarget.isSingleFloat()) {
+      if (Subtarget.isFP64bit())
+        addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
+      else
+        addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
+    }
+  }
+
+  setOperationAction(ISD::SMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::MULHS,              MVT::i32, Custom);
+  setOperationAction(ISD::MULHU,              MVT::i32, Custom);
+
+  if (Subtarget.hasCnMips())
+    setOperationAction(ISD::MUL,              MVT::i64, Legal);
+  else if (Subtarget.isGP64bit())
+    setOperationAction(ISD::MUL,              MVT::i64, Custom);
+
+  if (Subtarget.isGP64bit()) {
+    setOperationAction(ISD::SMUL_LOHI,        MVT::i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,        MVT::i64, Custom);
+    setOperationAction(ISD::MULHS,            MVT::i64, Custom);
+    setOperationAction(ISD::MULHU,            MVT::i64, Custom);
+    setOperationAction(ISD::SDIVREM,          MVT::i64, Custom);
+    setOperationAction(ISD::UDIVREM,          MVT::i64, Custom);
+  }
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN,  MVT::i64, Custom);
+
+  setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
+  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+
+  setTargetDAGCombine(ISD::ADDE);
+  setTargetDAGCombine(ISD::SUBE);
+  setTargetDAGCombine(ISD::MUL);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  if (NoDPLoadStore) {
+    setOperationAction(ISD::LOAD, MVT::f64, Custom);
+    setOperationAction(ISD::STORE, MVT::f64, Custom);
+  }
+
+  if (Subtarget.hasMips32r6()) {
+    // MIPS32r6 replaces the accumulator-based multiplies with a three register
+    // instruction
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::MUL, MVT::i32, Legal);
+    setOperationAction(ISD::MULHS, MVT::i32, Legal);
+    setOperationAction(ISD::MULHU, MVT::i32, Legal);
+
+    // MIPS32r6 replaces the accumulator-based division/remainder with separate
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::SDIV, MVT::i32, Legal);
+    setOperationAction(ISD::UDIV, MVT::i32, Legal);
+    setOperationAction(ISD::SREM, MVT::i32, Legal);
+    setOperationAction(ISD::UREM, MVT::i32, Legal);
+
+    // MIPS32r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::i32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+
+    setOperationAction(ISD::SETCC, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT, MVT::f32, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+
+    assert(Subtarget.isFP64bit() && "FR=1 is required for MIPS32r6");
+    setOperationAction(ISD::SETCC, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT, MVT::f64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+    setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+
+    // Floating point > and >= are supported via < and <=
+    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+
+    setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  }
+
+  if (Subtarget.hasMips64r6()) {
+    // MIPS64r6 replaces the accumulator-based multiplies with a three register
+    // instruction
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::MUL, MVT::i64, Legal);
+    setOperationAction(ISD::MULHS, MVT::i64, Legal);
+    setOperationAction(ISD::MULHU, MVT::i64, Legal);
+
+    // MIPS32r6 replaces the accumulator-based division/remainder with separate
+    // three register division and remainder instructions.
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIV, MVT::i64, Legal);
+    setOperationAction(ISD::UDIV, MVT::i64, Legal);
+    setOperationAction(ISD::SREM, MVT::i64, Legal);
+    setOperationAction(ISD::UREM, MVT::i64, Legal);
+
+    // MIPS64r6 replaces conditional moves with an equivalent that removes the
+    // need for three GPR read ports.
+    setOperationAction(ISD::SETCC, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT, MVT::i64, Legal);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  }
+
+  computeRegisterProperties(Subtarget.getRegisterInfo());
+}
+
+const MipsTargetLowering *
+llvm::createMipsSETargetLowering(const MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new MipsSETargetLowering(TM, STI);
+}
+
+const TargetRegisterClass *
+MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
+  if (VT == MVT::Untyped)
+    return Subtarget.hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
+
+  return TargetLowering::getRepRegClassFor(VT);
+}
+
+// Enable MSA support for the given integer type and Register class.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  setOperationAction(ISD::ADD, Ty, Legal);
+  setOperationAction(ISD::AND, Ty, Legal);
+  setOperationAction(ISD::CTLZ, Ty, Legal);
+  setOperationAction(ISD::CTPOP, Ty, Legal);
+  setOperationAction(ISD::MUL, Ty, Legal);
+  setOperationAction(ISD::OR, Ty, Legal);
+  setOperationAction(ISD::SDIV, Ty, Legal);
+  setOperationAction(ISD::SREM, Ty, Legal);
+  setOperationAction(ISD::SHL, Ty, Legal);
+  setOperationAction(ISD::SRA, Ty, Legal);
+  setOperationAction(ISD::SRL, Ty, Legal);
+  setOperationAction(ISD::SUB, Ty, Legal);
+  setOperationAction(ISD::UDIV, Ty, Legal);
+  setOperationAction(ISD::UREM, Ty, Legal);
+  setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
+  setOperationAction(ISD::VSELECT, Ty, Legal);
+  setOperationAction(ISD::XOR, Ty, Legal);
+
+  if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+    setOperationAction(ISD::FP_TO_SINT, Ty, Legal);
+    setOperationAction(ISD::FP_TO_UINT, Ty, Legal);
+    setOperationAction(ISD::SINT_TO_FP, Ty, Legal);
+    setOperationAction(ISD::UINT_TO_FP, Ty, Legal);
+  }
+
+  setOperationAction(ISD::SETCC, Ty, Legal);
+  setCondCodeAction(ISD::SETNE, Ty, Expand);
+  setCondCodeAction(ISD::SETGE, Ty, Expand);
+  setCondCodeAction(ISD::SETGT, Ty, Expand);
+  setCondCodeAction(ISD::SETUGE, Ty, Expand);
+  setCondCodeAction(ISD::SETUGT, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  if (Ty != MVT::v8f16) {
+    setOperationAction(ISD::FABS,  Ty, Legal);
+    setOperationAction(ISD::FADD,  Ty, Legal);
+    setOperationAction(ISD::FDIV,  Ty, Legal);
+    setOperationAction(ISD::FEXP2, Ty, Legal);
+    setOperationAction(ISD::FLOG2, Ty, Legal);
+    setOperationAction(ISD::FMA,   Ty, Legal);
+    setOperationAction(ISD::FMUL,  Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FSQRT, Ty, Legal);
+    setOperationAction(ISD::FSUB,  Ty, Legal);
+    setOperationAction(ISD::VSELECT, Ty, Legal);
+
+    setOperationAction(ISD::SETCC, Ty, Legal);
+    setCondCodeAction(ISD::SETOGE, Ty, Expand);
+    setCondCodeAction(ISD::SETOGT, Ty, Expand);
+    setCondCodeAction(ISD::SETUGE, Ty, Expand);
+    setCondCodeAction(ISD::SETUGT, Ty, Expand);
+    setCondCodeAction(ISD::SETGE,  Ty, Expand);
+    setCondCodeAction(ISD::SETGT,  Ty, Expand);
+  }
+}
+
+bool
+MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
+  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+
+  if (Subtarget.systemSupportsUnalignedAccess()) {
+    // MIPS32r6/MIPS64r6 is required to support unaligned access. It's
+    // implementation defined whether this is handled by hardware, software, or
+    // a hybrid of the two but it's expected that most implementations will
+    // handle the majority of cases in hardware.
+    if (Fast)
+      *Fast = true;
+    return true;
+  }
+
+  switch (SVT) {
+  case MVT::i64:
+  case MVT::i32:
+    if (Fast)
+      *Fast = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
+SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch(Op.getOpcode()) {
+  case ISD::LOAD:  return lowerLOAD(Op, DAG);
+  case ISD::STORE: return lowerSTORE(Op, DAG);
+  case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
+  case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
+  case ISD::MULHS:     return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
+  case ISD::MULHU:     return lowerMulDiv(Op, MipsISD::Multu, false, true, DAG);
+  case ISD::MUL:       return lowerMulDiv(Op, MipsISD::Mult, true, false, DAG);
+  case ISD::SDIVREM:   return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
+  case ISD::UDIVREM:   return lowerMulDiv(Op, MipsISD::DivRemU, true, true,
+                                          DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:       return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, DAG);
+  }
+
+  return MipsTargetLowering::LowerOperation(Op, DAG);
+}
+
+// selectMADD -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc multLo, Lo0), (adde multHi, Hi0),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
+  // ADDENode's second operand must be a flag output of an ADDC node in order
+  // for the matching to be successful.
+  SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
+
+  if (ADDCNode->getOpcode() != ISD::ADDC)
+    return false;
+
+  SDValue MultHi = ADDENode->getOperand(0);
+  SDValue MultLo = ADDCNode->getOperand(0);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MADD only if ADDENode and ADDCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than ADDENode or ADDCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MADD instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  SDLoc DL(ADDENode);
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  ADDCNode->getOperand(1),
+                                  ADDENode->getOperand(1));
+
+  // create MipsMAdd(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
+
+  SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of adde and addc here
+  if (!SDValue(ADDCNode, 0).use_empty()) {
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
+  }
+  if (!SDValue(ADDENode, 0).use_empty()) {
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+// selectMSUB -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc Lo0, multLo), (sube Hi0, multHi),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
+  // SUBENode's second operand must be a flag output of an SUBC node in order
+  // for the matching to be successful.
+  SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
+
+  if (SUBCNode->getOpcode() != ISD::SUBC)
+    return false;
+
+  SDValue MultHi = SUBENode->getOperand(1);
+  SDValue MultLo = SUBCNode->getOperand(1);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than SUBENode or SUBCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MSUB instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  SDLoc DL(SUBENode);
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+                                  SUBCNode->getOperand(0),
+                                  SUBENode->getOperand(0));
+
+  // create MipsSub(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
+
+  SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of sube and subc here
+  if (!SDValue(SUBCNode, 0).use_empty()) {
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
+  }
+  if (!SDValue(SUBENode, 0).use_empty()) {
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+      N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned Op0Opcode = Op0->getOpcode();
+
+  // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+  // where $d + 1 == 2^n and n == 32
+  // or    $d + 1 == 2^n and n <= 32 and ZExt
+  // -> (MipsVExtractZExt $a, $b, $c)
+  if (Op0Opcode == MipsISD::VEXTRACT_SEXT_ELT ||
+      Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT) {
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Op1);
+
+    if (!Mask)
+      return SDValue();
+
+    int32_t Log2IfPositive = (Mask->getAPIntValue() + 1).exactLogBase2();
+
+    if (Log2IfPositive <= 0)
+      return SDValue(); // Mask+1 is not a power of 2
+
+    SDValue Op0Op2 = Op0->getOperand(2);
+    EVT ExtendTy = cast<VTSDNode>(Op0Op2)->getVT();
+    unsigned ExtendTySize = ExtendTy.getSizeInBits();
+    unsigned Log2 = Log2IfPositive;
+
+    if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
+        Log2 == ExtendTySize) {
+      SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
+      return DAG.getNode(MipsISD::VEXTRACT_ZEXT_ELT, SDLoc(Op0),
+                         Op0->getVTList(),
+                         makeArrayRef(Ops, Op0->getNumOperands()));
+    }
+  }
+
+  return SDValue();
+}
+
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
+
+  if (!Node)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, !IsLittleEndian))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Test whether the given node is an all-ones build_vector.
+static bool isVectorAllOnes(SDValue N) {
+  // Look through bitcasts. Endianness doesn't matter because we are looking
+  // for an all-ones value.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!BVN)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  // Endianness doesn't matter in this context because we are looking for
+  // an all-ones value.
+  if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
+    return SplatValue.isAllOnesValue();
+
+  return false;
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+  if (N->getOpcode() != ISD::XOR)
+    return false;
+
+  if (isVectorAllOnes(N->getOperand(0)))
+    return N->getOperand(1) == OfNode;
+
+  if (isVectorAllOnes(N->getOperand(1)))
+    return N->getOperand(0) == OfNode;
+
+  return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformations:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+//   where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+//   vector type.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+
+  if (!Ty.is128BitVector())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (Op0->getOpcode() == ISD::AND && Op1->getOpcode() == ISD::AND) {
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+    SDValue Op1Op0 = Op1->getOperand(0);
+    SDValue Op1Op1 = Op1->getOperand(1);
+    bool IsLittleEndian = !Subtarget.isLittle();
+
+    SDValue IfSet, IfClr, Cond;
+    bool IsConstantMask = false;
+    APInt Mask, InvMask;
+
+    // If Op0Op0 is an appropriate mask, try to find it's inverse in either
+    // Op1Op0, or Op1Op1. Keep track of the Cond, IfSet, and IfClr nodes, while
+    // looking.
+    // IfClr will be set if we find a valid match.
+    if (isVSplat(Op0Op0, Mask, IsLittleEndian)) {
+      Cond = Op0Op0;
+      IfSet = Op0Op1;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, and Op0Op1 is an appropriate mask, try the same
+    // thing again using this mask.
+    // IfClr will be set if we find a valid match.
+    if (!IfClr.getNode() && isVSplat(Op0Op1, Mask, IsLittleEndian)) {
+      Cond = Op0Op1;
+      IfSet = Op0Op0;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, try looking for a non-constant match.
+    // IfClr will be set if we find a valid match amongst the eight
+    // possibilities.
+    if (!IfClr.getNode()) {
+      if (isBitwiseInverse(Op0Op0, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op0Op0, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op0;
+      }
+    }
+
+    // At this point, IfClr will be set if we have a valid match.
+    if (!IfClr.getNode())
+      return SDValue();
+
+    assert(Cond.getNode() && IfSet.getNode());
+
+    // Fold degenerate cases.
+    if (IsConstantMask) {
+      if (Mask.isAllOnesValue())
+        return IfSet;
+      else if (Mask == 0)
+        return IfClr;
+    }
+
+    // Transform the DAG into an equivalent VSELECT.
+    return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfSet, IfClr);
+  }
+
+  return SDValue();
+}
+
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
+      selectMSUB(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
+                            EVT ShiftTy, SelectionDAG &DAG) {
+  // Clear the upper (64 - VT.sizeInBits) bits.
+  C &= ((uint64_t)-1) >> (64 - VT.getSizeInBits());
+
+  // Return 0.
+  if (C == 0)
+    return DAG.getConstant(0, DL, VT);
+
+  // Return x.
+  if (C == 1)
+    return X;
+
+  // If c is power of 2, return (shl x, log2(c)).
+  if (isPowerOf2_64(C))
+    return DAG.getNode(ISD::SHL, DL, VT, X,
+                       DAG.getConstant(Log2_64(C), DL, ShiftTy));
+
+  unsigned Log2Ceil = Log2_64_Ceil(C);
+  uint64_t Floor = 1LL << Log2_64(C);
+  uint64_t Ceil = Log2Ceil == 64 ? 0LL : 1LL << Log2Ceil;
+
+  // If |c - floor_c| <= |c - ceil_c|,
+  // where floor_c = pow(2, floor(log2(c))) and ceil_c = pow(2, ceil(log2(c))),
+  // return (add constMult(x, floor_c), constMult(x, c - floor_c)).
+  if (C - Floor <= Ceil - C) {
+    SDValue Op0 = genConstMult(X, Floor, DL, VT, ShiftTy, DAG);
+    SDValue Op1 = genConstMult(X, C - Floor, DL, VT, ShiftTy, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+  }
+
+  // If |c - floor_c| > |c - ceil_c|,
+  // return (sub constMult(x, ceil_c), constMult(x, ceil_c - c)).
+  SDValue Op0 = genConstMult(X, Ceil, DL, VT, ShiftTy, DAG);
+  SDValue Op1 = genConstMult(X, Ceil - C, DL, VT, ShiftTy, DAG);
+  return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+}
+
+static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
+                                 const TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSETargetLowering *TL) {
+  EVT VT = N->getValueType(0);
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (!VT.isVector())
+      return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N), VT,
+                          TL->getScalarShiftAmountTy(DAG.getDataLayout(), VT),
+                          DAG);
+
+  return SDValue(N, 0);
+}
+
+static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
+                                      SelectionDAG &DAG,
+                                      const MipsSubtarget &Subtarget) {
+  // See if this is a vector splat immediate node.
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  unsigned EltSize = Ty.getScalarSizeInBits();
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+
+  if (!Subtarget.hasDSP())
+    return SDValue();
+
+  if (!BV ||
+      !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                           EltSize, !Subtarget.isLittle()) ||
+      (SplatBitSize != EltSize) ||
+      (SplatValue.getZExtValue() >= EltSize))
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(Opc, DL, Ty, N->getOperand(0),
+                     DAG.getConstant(SplatValue.getZExtValue(), DL, MVT::i32));
+}
+
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
+}
+
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+//   sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
+static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (Subtarget.hasMSA()) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+
+    // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+    // where $d + sizeof($c) == 32
+    // or    $d + sizeof($c) <= 32 and SExt
+    // -> (MipsVExtractSExt $a, $b, $c)
+    if (Op0->getOpcode() == ISD::SHL && Op1 == Op0->getOperand(1)) {
+      SDValue Op0Op0 = Op0->getOperand(0);
+      ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(Op1);
+
+      if (!ShAmount)
+        return SDValue();
+
+      if (Op0Op0->getOpcode() != MipsISD::VEXTRACT_SEXT_ELT &&
+          Op0Op0->getOpcode() != MipsISD::VEXTRACT_ZEXT_ELT)
+        return SDValue();
+
+      EVT ExtendTy = cast<VTSDNode>(Op0Op0->getOperand(2))->getVT();
+      unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+      if (TotalBits == 32 ||
+          (Op0Op0->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT &&
+           TotalBits <= 32)) {
+        SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
+                          Op0Op0->getOperand(2) };
+        return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, SDLoc(Op0Op0),
+                           Op0Op0->getVTList(),
+                           makeArrayRef(Ops, Op0Op0->getNumOperands()));
+      }
+    }
+  }
+
+  if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget.hasDSPR2()))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRA_DSP, N, Ty, DAG, Subtarget);
+}
+
+
+static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (((Ty != MVT::v2i16) || !Subtarget.hasDSPR2()) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  return performDSPShiftCombine(MipsISD::SHRL_DSP, N, Ty, DAG, Subtarget);
+}
+
+static bool isLegalDSPCondCode(EVT Ty, ISD::CondCode CC) {
+  bool IsV216 = (Ty == MVT::v2i16);
+
+  switch (CC) {
+  case ISD::SETEQ:
+  case ISD::SETNE:  return true;
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETGT:
+  case ISD::SETGE:  return IsV216;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: return !IsV216;
+  default:          return false;
+  }
+}
+
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+
+  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
+    return SDValue();
+
+  if (!isLegalDSPCondCode(Ty, cast<CondCodeSDNode>(N->getOperand(2))->get()))
+    return SDValue();
+
+  return DAG.getNode(MipsISD::SETCC_DSP, SDLoc(N), Ty, N->getOperand(0),
+                     N->getOperand(1), N->getOperand(2));
+}
+
+static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT Ty = N->getValueType(0);
+
+  if (Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
+    // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+    // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+    // legalizer.
+    SDValue Op0 = N->getOperand(0);
+
+    if (Op0->getOpcode() != ISD::SETCC)
+      return SDValue();
+
+    ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
+    bool Signed;
+
+    if (CondCode == ISD::SETLT  || CondCode == ISD::SETLE)
+      Signed = true;
+    else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
+      Signed = false;
+    else
+      return SDValue();
+
+    SDValue Op1 = N->getOperand(1);
+    SDValue Op2 = N->getOperand(2);
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+
+    if (Op1 == Op0Op0 && Op2 == Op0Op1)
+      return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+                         Ty, Op1, Op2);
+    else if (Op1 == Op0Op1 && Op2 == Op0Op0)
+      return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+                         Ty, Op1, Op2);
+  } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+    SDValue SetCC = N->getOperand(0);
+
+    if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+      return SDValue();
+
+    return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+                       SetCC.getOperand(0), SetCC.getOperand(1),
+                       N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+  }
+
+  return SDValue();
+}
+
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const MipsSubtarget &Subtarget) {
+  EVT Ty = N->getValueType(0);
+
+  if (Subtarget.hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (xor (or $a, $b), (build_vector allones))
+    //   (xor (or $a, $b), (bitcast (build_vector allones)))
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    SDValue NotOp;
+
+    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+      NotOp = Op1;
+    else if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+      NotOp = Op0;
+    else
+      return SDValue();
+
+    if (NotOp->getOpcode() == ISD::OR)
+      return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, NotOp->getOperand(0),
+                         NotOp->getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue
+MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Val;
+
+  switch (N->getOpcode()) {
+  case ISD::ADDE:
+    return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    Val = performANDCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::OR:
+    Val = performORCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::SUBE:
+    return performSUBECombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMULCombine(N, DAG, DCI, this);
+  case ISD::SHL:
+    return performSHLCombine(N, DAG, DCI, Subtarget);
+  case ISD::SRA:
+    return performSRACombine(N, DAG, DCI, Subtarget);
+  case ISD::SRL:
+    return performSRLCombine(N, DAG, DCI, Subtarget);
+  case ISD::VSELECT:
+    return performVSELECTCombine(N, DAG);
+  case ISD::XOR:
+    Val = performXORCombine(N, DAG, Subtarget);
+    break;
+  case ISD::SETCC:
+    Val = performSETCCCombine(N, DAG);
+    break;
+  }
+
+  if (Val.getNode()) {
+    DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+          N->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n=> \n";
+          Val.getNode()->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n");
+    return Val;
+  }
+
+  return MipsTargetLowering::PerformDAGCombine(N, DCI);
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::BPOSGE32_PSEUDO:
+    return emitBPOSGE32(MI, BB);
+  case Mips::SNZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+  case Mips::SNZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+  case Mips::SNZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+  case Mips::SNZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+  case Mips::SNZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+  case Mips::SZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+  case Mips::SZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+  case Mips::SZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+  case Mips::SZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+  case Mips::SZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
+  case Mips::INSERT_FW_PSEUDO:
+    return emitINSERT_FW(MI, BB);
+  case Mips::INSERT_FD_PSEUDO:
+    return emitINSERT_FD(MI, BB);
+  case Mips::INSERT_B_VIDX_PSEUDO:
+  case Mips::INSERT_B_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 1, false);
+  case Mips::INSERT_H_VIDX_PSEUDO:
+  case Mips::INSERT_H_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 2, false);
+  case Mips::INSERT_W_VIDX_PSEUDO:
+  case Mips::INSERT_W_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, false);
+  case Mips::INSERT_D_VIDX_PSEUDO:
+  case Mips::INSERT_D_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, false);
+  case Mips::INSERT_FW_VIDX_PSEUDO:
+  case Mips::INSERT_FW_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, true);
+  case Mips::INSERT_FD_VIDX_PSEUDO:
+  case Mips::INSERT_FD_VIDX64_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, true);
+  case Mips::FILL_FW_PSEUDO:
+    return emitFILL_FW(MI, BB);
+  case Mips::FILL_FD_PSEUDO:
+    return emitFILL_FD(MI, BB);
+  case Mips::FEXP2_W_1_PSEUDO:
+    return emitFEXP2_W_1(MI, BB);
+  case Mips::FEXP2_D_1_PSEUDO:
+    return emitFEXP2_D_1(MI, BB);
+  case Mips::ST_F16:
+    return emitST_F16_PSEUDO(MI, BB);
+  case Mips::LD_F16:
+    return emitLD_F16_PSEUDO(MI, BB);
+  case Mips::MSA_FP_EXTEND_W_PSEUDO:
+    return emitFPEXTEND_PSEUDO(MI, BB, false);
+  case Mips::MSA_FP_ROUND_W_PSEUDO:
+    return emitFPROUND_PSEUDO(MI, BB, false);
+  case Mips::MSA_FP_EXTEND_D_PSEUDO:
+    return emitFPEXTEND_PSEUDO(MI, BB, true);
+  case Mips::MSA_FP_ROUND_D_PSEUDO:
+    return emitFPROUND_PSEUDO(MI, BB, true);
+  }
+}
+
+bool MipsSETargetLowering::isEligibleForTailCallOptimization(
+    const CCState &CCInfo, unsigned NextStackOffset,
+    const MipsFunctionInfo &FI) const {
+  if (!UseMipsTailCalls)
+    return false;
+
+  // Exception has to be cleared with eret.
+  if (FI.isISR())
+    return false;
+
+  // Return false if either the callee or caller has a byval argument.
+  if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg())
+    return false;
+
+  // Return true if the callee's argument area is no larger than the
+  // caller's.
+  return NextStackOffset <= FI.getIncomingArgSize();
+}
+
+void MipsSETargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
+  Ops.push_back(Callee);
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, IsCallReloc, CLI, Callee,
+                                  Chain);
+}
+
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+  // Replace a double precision load with two i32 loads and a buildpair64.
+  SDLoc DL(Op);
+  SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+
+  // i32 load from lower address.
+  SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo(),
+                           Nd.getAlignment(), Nd.getMemOperand()->getFlags());
+
+  // i32 load from higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT));
+  SDValue Hi = DAG.getLoad(
+      MVT::i32, DL, Lo.getValue(1), Ptr, MachinePointerInfo(),
+      std::min(Nd.getAlignment(), 4U), Nd.getMemOperand()->getFlags());
+
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+  SDValue Ops[2] = {BP, Hi.getValue(1)};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+  // Replace a double precision store with two extractelement64s and i32 stores.
+  SDLoc DL(Op);
+  SDValue Val = Nd.getValue(), Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+  SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(1, DL, MVT::i32));
+
+  if (!Subtarget.isLittle())
+    std::swap(Lo, Hi);
+
+  // i32 store to lower address.
+  Chain =
+      DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(), Nd.getAlignment(),
+                   Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
+
+  // i32 store to higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, DL, PtrVT));
+  return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
+                      std::min(Nd.getAlignment(), 4U),
+                      Nd.getMemOperand()->getFlags(), Nd.getAAInfo());
+}
+
+SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
+                                          bool HasLo, bool HasHi,
+                                          SelectionDAG &DAG) const {
+  // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
+  assert(!Subtarget.hasMips32r6());
+
+  EVT Ty = Op.getOperand(0).getValueType();
+  SDLoc DL(Op);
+  SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
+                             Op.getOperand(0), Op.getOperand(1));
+  SDValue Lo, Hi;
+
+  if (HasLo)
+    Lo = DAG.getNode(MipsISD::MFLO, DL, Ty, Mult);
+  if (HasHi)
+    Hi = DAG.getNode(MipsISD::MFHI, DL, Ty, Mult);
+
+  if (!HasLo || !HasHi)
+    return HasLo ? Lo : Hi;
+
+  SDValue Vals[] = { Lo, Hi };
+  return DAG.getMergeValues(Vals, DL);
+}
+
+static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
+  SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(0, DL, MVT::i32));
+  SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(1, DL, MVT::i32));
+  return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
+}
+
+static SDValue extractLOHI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) {
+  SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+  SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+// This function expands mips intrinsic nodes which have 64-bit input operands
+// or output values.
+//
+// out64 = intrinsic-node in64
+// =>
+// lo = copy (extract-element (in64, 0))
+// hi = copy (extract-element (in64, 1))
+// mips-specific-node
+// v0 = copy lo
+// v1 = copy hi
+// out64 = merge-values (v0, v1)
+//
+static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
+  SmallVector<SDValue, 3> Ops;
+  unsigned OpNo = 0;
+
+  // See if Op has a chain input.
+  if (HasChainIn)
+    Ops.push_back(Op->getOperand(OpNo++));
+
+  // The next operand is the intrinsic opcode.
+  assert(Op->getOperand(OpNo).getOpcode() == ISD::TargetConstant);
+
+  // See if the next operand has type i64.
+  SDValue Opnd = Op->getOperand(++OpNo), In64;
+
+  if (Opnd.getValueType() == MVT::i64)
+    In64 = initAccumulator(Opnd, DL, DAG);
+  else
+    Ops.push_back(Opnd);
+
+  // Push the remaining operands.
+  for (++OpNo ; OpNo < Op->getNumOperands(); ++OpNo)
+    Ops.push_back(Op->getOperand(OpNo));
+
+  // Add In64 to the end of the list.
+  if (In64.getNode())
+    Ops.push_back(In64);
+
+  // Scan output.
+  SmallVector<EVT, 2> ResTys;
+
+  for (SDNode::value_iterator I = Op->value_begin(), E = Op->value_end();
+       I != E; ++I)
+    ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I);
+
+  // Create node.
+  SDValue Val = DAG.getNode(Opc, DL, ResTys, Ops);
+  SDValue Out = (ResTys[0] == MVT::Untyped) ? extractLOHI(Val, DL, DAG) : Val;
+
+  if (!HasChainIn)
+    return Out;
+
+  assert(Val->getValueType(1) == MVT::Other);
+  SDValue Vals[] = { Out, SDValue(Val.getNode(), 1) };
+  return DAG.getMergeValues(Vals, DL);
+}
+
+// Lower an MSA copy intrinsic into the specified SelectionDAG node
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  SDValue Vec = Op->getOperand(1);
+  SDValue Idx = Op->getOperand(2);
+  EVT ResTy = Op->getValueType(0);
+  EVT EltTy = Vec->getValueType(0).getVectorElementType();
+
+  SDValue Result = DAG.getNode(Opc, DL, ResTy, Vec, Idx,
+                               DAG.getValueType(EltTy));
+
+  return Result;
+}
+
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(2);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, DL, MVT::i32);
+    ViaVecTy = MVT::v4i32;
+  } else
+    LaneA = LaneB;
+
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+  SDValue Result = DAG.getBuildVector(
+      ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op),
+                         Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  EVT ViaVecTy = VecTy;
+  SDValue SplatValueA = SplatValue;
+  SDValue SplatValueB = SplatValue;
+  SDLoc DL(SplatValue);
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                              DAG.getConstant(32, DL, MVT::i32));
+    SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(SplatValueA, SplatValueB);
+
+  SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+  SDValue Result = DAG.getBuildVector(
+      ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
+
+  if (VecTy != ViaVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+                                        unsigned Opc, SDValue Imm,
+                                        bool BigEndian) {
+  EVT VecTy = Op->getValueType(0);
+  SDValue Exp2Imm;
+  SDLoc DL(Op);
+
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
+  if (VecTy == MVT::v2i64) {
+    if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
+      APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+      SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), DL,
+                                           MVT::i32);
+      SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), DL, MVT::i32);
+
+      if (BigEndian)
+        std::swap(BitImmLoOp, BitImmHiOp);
+
+      Exp2Imm = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v2i64,
+          DAG.getBuildVector(MVT::v4i32, DL,
+                             {BitImmLoOp, BitImmHiOp, BitImmLoOp, BitImmHiOp}));
+    }
+  }
+
+  if (!Exp2Imm.getNode()) {
+    // We couldnt constant fold, do a vector shift instead
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+    Exp2Imm = DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, DL, VecTy),
+                          Exp2Imm);
+  }
+
+  return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
+}
+
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  SDValue One = DAG.getConstant(1, DL, ResTy);
+  SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
+                     DAG.getNOT(DL, Bit, ResTy));
+}
+
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  APInt BitImm = APInt(ResTy.getScalarSizeInBits(), 1)
+                 << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+  SDValue BitMask = DAG.getConstant(~BitImm, DL, ResTy);
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_shilo:
+    return lowerDSPIntr(Op, DAG, MipsISD::SHILO);
+  case Intrinsic::mips_dpau_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL);
+  case Intrinsic::mips_dpau_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR);
+  case Intrinsic::mips_dpsu_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL);
+  case Intrinsic::mips_dpsu_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR);
+  case Intrinsic::mips_dpa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH);
+  case Intrinsic::mips_dps_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH);
+  case Intrinsic::mips_dpax_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH);
+  case Intrinsic::mips_dpsx_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH);
+  case Intrinsic::mips_mulsa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH);
+  case Intrinsic::mips_mult:
+    return lowerDSPIntr(Op, DAG, MipsISD::Mult);
+  case Intrinsic::mips_multu:
+    return lowerDSPIntr(Op, DAG, MipsISD::Multu);
+  case Intrinsic::mips_madd:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAdd);
+  case Intrinsic::mips_maddu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAddu);
+  case Intrinsic::mips_msub:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSub);
+  case Intrinsic::mips_msubu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+  case Intrinsic::mips_addv_b:
+  case Intrinsic::mips_addv_h:
+  case Intrinsic::mips_addv_w:
+  case Intrinsic::mips_addv_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_addvi_b:
+  case Intrinsic::mips_addvi_h:
+  case Intrinsic::mips_addvi_w:
+  case Intrinsic::mips_addvi_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_and_v:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_andi_b:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_bclr_b:
+  case Intrinsic::mips_bclr_h:
+  case Intrinsic::mips_bclr_w:
+  case Intrinsic::mips_bclr_d:
+    return lowerMSABitClear(Op, DAG);
+  case Intrinsic::mips_bclri_b:
+  case Intrinsic::mips_bclri_h:
+  case Intrinsic::mips_bclri_w:
+  case Intrinsic::mips_bclri_d:
+    return lowerMSABitClearImm(Op, DAG);
+  case Intrinsic::mips_binsli_b:
+  case Intrinsic::mips_binsli_h:
+  case Intrinsic::mips_binsli_w:
+  case Intrinsic::mips_binsli_d: {
+    // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear)
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+                                       Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, DL, VecTy, true),
+                       Op->getOperand(2), Op->getOperand(1));
+  }
+  case Intrinsic::mips_binsri_b:
+  case Intrinsic::mips_binsri_h:
+  case Intrinsic::mips_binsri_w:
+  case Intrinsic::mips_binsri_d: {
+    // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear)
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+                                      Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, DL, VecTy, true),
+                       Op->getOperand(2), Op->getOperand(1));
+  }
+  case Intrinsic::mips_bmnz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_bmnzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bmz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_bmzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bneg_b:
+  case Intrinsic::mips_bneg_h:
+  case Intrinsic::mips_bneg_w:
+  case Intrinsic::mips_bneg_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, DL, VecTy);
+
+    return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bnegi_b:
+  case Intrinsic::mips_bnegi_h:
+  case Intrinsic::mips_bnegi_w:
+  case Intrinsic::mips_bnegi_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+                                    !Subtarget.isLittle());
+  case Intrinsic::mips_bnz_b:
+  case Intrinsic::mips_bnz_h:
+  case Intrinsic::mips_bnz_w:
+  case Intrinsic::mips_bnz_d:
+    return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bnz_v:
+    return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bsel_v:
+    // bsel_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bseli_b:
+    // bseli_v(Mask, IfClear, IfSet) -> (vselect Mask, IfSet, IfClear)
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 3, DAG),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bset_b:
+  case Intrinsic::mips_bset_h:
+  case Intrinsic::mips_bset_w:
+  case Intrinsic::mips_bset_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, DL, VecTy);
+
+    return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bseti_b:
+  case Intrinsic::mips_bseti_h:
+  case Intrinsic::mips_bseti_w:
+  case Intrinsic::mips_bseti_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+                                    !Subtarget.isLittle());
+  case Intrinsic::mips_bz_b:
+  case Intrinsic::mips_bz_h:
+  case Intrinsic::mips_bz_w:
+  case Intrinsic::mips_bz_d:
+    return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bz_v:
+    return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ceq_b:
+  case Intrinsic::mips_ceq_h:
+  case Intrinsic::mips_ceq_w:
+  case Intrinsic::mips_ceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETEQ);
+  case Intrinsic::mips_ceqi_b:
+  case Intrinsic::mips_ceqi_h:
+  case Intrinsic::mips_ceqi_w:
+  case Intrinsic::mips_ceqi_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+  case Intrinsic::mips_cle_s_b:
+  case Intrinsic::mips_cle_s_h:
+  case Intrinsic::mips_cle_s_w:
+  case Intrinsic::mips_cle_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLE);
+  case Intrinsic::mips_clei_s_b:
+  case Intrinsic::mips_clei_s_h:
+  case Intrinsic::mips_clei_s_w:
+  case Intrinsic::mips_clei_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+  case Intrinsic::mips_cle_u_b:
+  case Intrinsic::mips_cle_u_h:
+  case Intrinsic::mips_cle_u_w:
+  case Intrinsic::mips_cle_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_clei_u_b:
+  case Intrinsic::mips_clei_u_h:
+  case Intrinsic::mips_clei_u_w:
+  case Intrinsic::mips_clei_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+  case Intrinsic::mips_clt_s_b:
+  case Intrinsic::mips_clt_s_h:
+  case Intrinsic::mips_clt_s_w:
+  case Intrinsic::mips_clt_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLT);
+  case Intrinsic::mips_clti_s_b:
+  case Intrinsic::mips_clti_s_h:
+  case Intrinsic::mips_clti_s_w:
+  case Intrinsic::mips_clti_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+  case Intrinsic::mips_clt_u_b:
+  case Intrinsic::mips_clt_u_h:
+  case Intrinsic::mips_clt_u_w:
+  case Intrinsic::mips_clt_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_clti_u_b:
+  case Intrinsic::mips_clti_u_h:
+  case Intrinsic::mips_clti_u_w:
+  case Intrinsic::mips_clti_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+  case Intrinsic::mips_copy_s_b:
+  case Intrinsic::mips_copy_s_h:
+  case Intrinsic::mips_copy_s_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+  case Intrinsic::mips_copy_s_d:
+    if (Subtarget.hasMips64())
+      // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
+      return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+    else {
+      // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+      // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                         Op->getValueType(0), Op->getOperand(1),
+                         Op->getOperand(2));
+    }
+  case Intrinsic::mips_copy_u_b:
+  case Intrinsic::mips_copy_u_h:
+  case Intrinsic::mips_copy_u_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+  case Intrinsic::mips_copy_u_d:
+    if (Subtarget.hasMips64())
+      // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
+      return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+    else {
+      // Lower into the generic EXTRACT_VECTOR_ELT node and let the type
+      // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+      // Note: When i64 is illegal, this results in copy_s.w instructions
+      // instead of copy_u.w instructions. This makes no difference to the
+      // behaviour since i64 is only illegal when the register file is 32-bit.
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                         Op->getValueType(0), Op->getOperand(1),
+                         Op->getOperand(2));
+    }
+  case Intrinsic::mips_div_s_b:
+  case Intrinsic::mips_div_s_h:
+  case Intrinsic::mips_div_s_w:
+  case Intrinsic::mips_div_s_d:
+    return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_div_u_b:
+  case Intrinsic::mips_div_u_h:
+  case Intrinsic::mips_div_u_w:
+  case Intrinsic::mips_div_u_d:
+    return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fadd_w:
+  case Intrinsic::mips_fadd_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+  case Intrinsic::mips_fceq_w:
+  case Intrinsic::mips_fceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOEQ);
+  case Intrinsic::mips_fcle_w:
+  case Intrinsic::mips_fcle_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLE);
+  case Intrinsic::mips_fclt_w:
+  case Intrinsic::mips_fclt_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLT);
+  case Intrinsic::mips_fcne_w:
+  case Intrinsic::mips_fcne_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETONE);
+  case Intrinsic::mips_fcor_w:
+  case Intrinsic::mips_fcor_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETO);
+  case Intrinsic::mips_fcueq_w:
+  case Intrinsic::mips_fcueq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUEQ);
+  case Intrinsic::mips_fcule_w:
+  case Intrinsic::mips_fcule_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_fcult_w:
+  case Intrinsic::mips_fcult_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_fcun_w:
+  case Intrinsic::mips_fcun_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUO);
+  case Intrinsic::mips_fcune_w:
+  case Intrinsic::mips_fcune_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUNE);
+  case Intrinsic::mips_fdiv_w:
+  case Intrinsic::mips_fdiv_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_ffint_u_w:
+  case Intrinsic::mips_ffint_u_d:
+    return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ffint_s_w:
+  case Intrinsic::mips_ffint_s_d:
+    return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_fill_b:
+  case Intrinsic::mips_fill_h:
+  case Intrinsic::mips_fill_w:
+  case Intrinsic::mips_fill_d: {
+    EVT ResTy = Op->getValueType(0);
+    SmallVector<SDValue, 16> Ops(ResTy.getVectorNumElements(),
+                                 Op->getOperand(1));
+
+    // If ResTy is v2i64 then the type legalizer will break this node down into
+    // an equivalent v4i32.
+    return DAG.getBuildVector(ResTy, DL, Ops);
+  }
+  case Intrinsic::mips_fexp2_w:
+  case Intrinsic::mips_fexp2_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(
+        ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+        DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+  }
+  case Intrinsic::mips_flog2_w:
+  case Intrinsic::mips_flog2_d:
+    return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fmadd_w:
+  case Intrinsic::mips_fmadd_d:
+    return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_fmul_w:
+  case Intrinsic::mips_fmul_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_fmsub_w:
+  case Intrinsic::mips_fmsub_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_frint_w:
+  case Intrinsic::mips_frint_d:
+    return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsqrt_w:
+  case Intrinsic::mips_fsqrt_d:
+    return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsub_w:
+  case Intrinsic::mips_fsub_d: {
+    // TODO: If intrinsics have fast-math-flags, propagate them.
+    return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_ftrunc_u_w:
+  case Intrinsic::mips_ftrunc_u_d:
+    return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ftrunc_s_w:
+  case Intrinsic::mips_ftrunc_s_d:
+    return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ilvev_b:
+  case Intrinsic::mips_ilvev_h:
+  case Intrinsic::mips_ilvev_w:
+  case Intrinsic::mips_ilvev_d:
+    return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvl_b:
+  case Intrinsic::mips_ilvl_h:
+  case Intrinsic::mips_ilvl_w:
+  case Intrinsic::mips_ilvl_d:
+    return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvod_b:
+  case Intrinsic::mips_ilvod_h:
+  case Intrinsic::mips_ilvod_w:
+  case Intrinsic::mips_ilvod_d:
+    return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvr_b:
+  case Intrinsic::mips_ilvr_h:
+  case Intrinsic::mips_ilvr_w:
+  case Intrinsic::mips_ilvr_d:
+    return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_insert_b:
+  case Intrinsic::mips_insert_h:
+  case Intrinsic::mips_insert_w:
+  case Intrinsic::mips_insert_d:
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+  case Intrinsic::mips_insve_b:
+  case Intrinsic::mips_insve_h:
+  case Intrinsic::mips_insve_w:
+  case Intrinsic::mips_insve_d:
+    return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3),
+                       DAG.getConstant(0, DL, MVT::i32));
+  case Intrinsic::mips_ldi_b:
+  case Intrinsic::mips_ldi_h:
+  case Intrinsic::mips_ldi_w:
+  case Intrinsic::mips_ldi_d:
+    return lowerMSASplatImm(Op, 1, DAG);
+  case Intrinsic::mips_lsa:
+  case Intrinsic::mips_dlsa: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_maddv_b:
+  case Intrinsic::mips_maddv_h:
+  case Intrinsic::mips_maddv_w:
+  case Intrinsic::mips_maddv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_max_s_b:
+  case Intrinsic::mips_max_s_h:
+  case Intrinsic::mips_max_s_w:
+  case Intrinsic::mips_max_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_max_u_b:
+  case Intrinsic::mips_max_u_h:
+  case Intrinsic::mips_max_u_w:
+  case Intrinsic::mips_max_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_maxi_s_b:
+  case Intrinsic::mips_maxi_s_h:
+  case Intrinsic::mips_maxi_s_w:
+  case Intrinsic::mips_maxi_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_maxi_u_b:
+  case Intrinsic::mips_maxi_u_h:
+  case Intrinsic::mips_maxi_u_w:
+  case Intrinsic::mips_maxi_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_min_s_b:
+  case Intrinsic::mips_min_s_h:
+  case Intrinsic::mips_min_s_w:
+  case Intrinsic::mips_min_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_min_u_b:
+  case Intrinsic::mips_min_u_h:
+  case Intrinsic::mips_min_u_w:
+  case Intrinsic::mips_min_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_mini_s_b:
+  case Intrinsic::mips_mini_s_h:
+  case Intrinsic::mips_mini_s_w:
+  case Intrinsic::mips_mini_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mini_u_b:
+  case Intrinsic::mips_mini_u_h:
+  case Intrinsic::mips_mini_u_w:
+  case Intrinsic::mips_mini_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mod_s_b:
+  case Intrinsic::mips_mod_s_h:
+  case Intrinsic::mips_mod_s_w:
+  case Intrinsic::mips_mod_s_d:
+    return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mod_u_b:
+  case Intrinsic::mips_mod_u_h:
+  case Intrinsic::mips_mod_u_w:
+  case Intrinsic::mips_mod_u_d:
+    return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mulv_b:
+  case Intrinsic::mips_mulv_h:
+  case Intrinsic::mips_mulv_w:
+  case Intrinsic::mips_mulv_d:
+    return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_msubv_b:
+  case Intrinsic::mips_msubv_h:
+  case Intrinsic::mips_msubv_w:
+  case Intrinsic::mips_msubv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_nlzc_b:
+  case Intrinsic::mips_nlzc_h:
+  case Intrinsic::mips_nlzc_w:
+  case Intrinsic::mips_nlzc_d:
+    return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_nor_v: {
+    SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                              Op->getOperand(1), Op->getOperand(2));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_nori_b: {
+    SDValue Res =  DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                               Op->getOperand(1),
+                               lowerMSASplatImm(Op, 2, DAG));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_or_v:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ori_b:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_pckev_b:
+  case Intrinsic::mips_pckev_h:
+  case Intrinsic::mips_pckev_w:
+  case Intrinsic::mips_pckev_d:
+    return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pckod_b:
+  case Intrinsic::mips_pckod_h:
+  case Intrinsic::mips_pckod_w:
+  case Intrinsic::mips_pckod_d:
+    return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pcnt_b:
+  case Intrinsic::mips_pcnt_h:
+  case Intrinsic::mips_pcnt_w:
+  case Intrinsic::mips_pcnt_d:
+    return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_shf_b:
+  case Intrinsic::mips_shf_h:
+  case Intrinsic::mips_shf_w:
+    return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_sll_b:
+  case Intrinsic::mips_sll_h:
+  case Intrinsic::mips_sll_w:
+  case Intrinsic::mips_sll_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_slli_b:
+  case Intrinsic::mips_slli_h:
+  case Intrinsic::mips_slli_w:
+  case Intrinsic::mips_slli_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_splat_b:
+  case Intrinsic::mips_splat_h:
+  case Intrinsic::mips_splat_w:
+  case Intrinsic::mips_splat_d:
+    // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+    // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+    // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
+    // Instead we lower to MipsISD::VSHF and match from there.
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_splati_b:
+  case Intrinsic::mips_splati_h:
+  case Intrinsic::mips_splati_w:
+  case Intrinsic::mips_splati_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_sra_b:
+  case Intrinsic::mips_sra_h:
+  case Intrinsic::mips_sra_w:
+  case Intrinsic::mips_sra_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srai_b:
+  case Intrinsic::mips_srai_h:
+  case Intrinsic::mips_srai_w:
+  case Intrinsic::mips_srai_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_srl_b:
+  case Intrinsic::mips_srl_h:
+  case Intrinsic::mips_srl_w:
+  case Intrinsic::mips_srl_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srli_b:
+  case Intrinsic::mips_srli_h:
+  case Intrinsic::mips_srli_w:
+  case Intrinsic::mips_srli_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_subv_b:
+  case Intrinsic::mips_subv_h:
+  case Intrinsic::mips_subv_w:
+  case Intrinsic::mips_subv_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_subvi_b:
+  case Intrinsic::mips_subvi_h:
+  case Intrinsic::mips_subvi_w:
+  case Intrinsic::mips_subvi_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_vshf_b:
+  case Intrinsic::mips_vshf_h:
+  case Intrinsic::mips_vshf_w:
+  case Intrinsic::mips_vshf_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_xor_v:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_xori_b:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::thread_pointer: {
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    return DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+  }
+  }
+}
+
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Address = Op->getOperand(2);
+  SDValue Offset  = Op->getOperand(3);
+  EVT ResTy = Op->getValueType(0);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+  return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(),
+                     /* Alignment = */ 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_extp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTP);
+  case Intrinsic::mips_extpdp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTPDP);
+  case Intrinsic::mips_extr_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_W);
+  case Intrinsic::mips_extr_r_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W);
+  case Intrinsic::mips_extr_rs_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W);
+  case Intrinsic::mips_extr_s_h:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H);
+  case Intrinsic::mips_mthlip:
+    return lowerDSPIntr(Op, DAG, MipsISD::MTHLIP);
+  case Intrinsic::mips_mulsaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH);
+  case Intrinsic::mips_maq_s_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL);
+  case Intrinsic::mips_maq_s_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR);
+  case Intrinsic::mips_maq_sa_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL);
+  case Intrinsic::mips_maq_sa_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR);
+  case Intrinsic::mips_dpaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH);
+  case Intrinsic::mips_dpsq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH);
+  case Intrinsic::mips_dpaq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W);
+  case Intrinsic::mips_dpsq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W);
+  case Intrinsic::mips_dpaqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH);
+  case Intrinsic::mips_dpaqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH);
+  case Intrinsic::mips_dpsqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
+  case Intrinsic::mips_dpsqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+  case Intrinsic::mips_ld_b:
+  case Intrinsic::mips_ld_h:
+  case Intrinsic::mips_ld_w:
+  case Intrinsic::mips_ld_d:
+   return lowerMSALoadIntr(Op, DAG, Intr);
+  }
+}
+
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Value   = Op->getOperand(2);
+  SDValue Address = Op->getOperand(3);
+  SDValue Offset  = Op->getOperand(4);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(),
+                      /* Alignment = */ 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_st_b:
+  case Intrinsic::mips_st_h:
+  case Intrinsic::mips_st_w:
+  case Intrinsic::mips_st_d:
+    return lowerMSAStoreIntr(Op, DAG, Intr);
+  }
+}
+
+/// \brief Check if the given BuildVectorSDNode is a splat.
+/// This method currently relies on DAG nodes being reused when equivalent,
+/// so it's possible for this to return false even when isConstantSplat returns
+/// true.
+static bool isSplatVector(const BuildVectorSDNode *N) {
+  unsigned int nOps = N->getNumOperands();
+  assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+  SDValue Operand0 = N->getOperand(0);
+
+  for (unsigned int i = 1; i < nOps; ++i) {
+    if (N->getOperand(i) != Operand0)
+      return false;
+  }
+
+  return true;
+}
+
+// Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+//
+// The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
+// choose to sign-extend but we could have equally chosen zero-extend. The
+// DAGCombiner will fold any sign/zero extension of the ISD::EXTRACT_VECTOR_ELT
+// result into this node later (possibly changing it to a zero-extend in the
+// process).
+SDValue MipsSETargetLowering::
+lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VecTy = Op0->getValueType(0);
+
+  if (!VecTy.is128BitVector())
+    return SDValue();
+
+  if (ResTy.isInteger()) {
+    SDValue Op1 = Op->getOperand(1);
+    EVT EltTy = VecTy.getVectorElementType();
+    return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+                       DAG.getValueType(EltTy));
+  }
+
+  return Op;
+}
+
+static bool isConstantOrUndef(const SDValue Op) {
+  if (Op->isUndef())
+    return true;
+  if (isa<ConstantSDNode>(Op))
+    return true;
+  if (isa<ConstantFPSDNode>(Op))
+    return true;
+  return false;
+}
+
+static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+  for (unsigned i = 0; i < Op->getNumOperands(); ++i)
+    if (isConstantOrUndef(Op->getOperand(i)))
+      return true;
+  return false;
+}
+
+// Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+// backend.
+//
+// Lowers according to the following rules:
+// - Constant splats are legal as-is as long as the SplatBitSize is a power of
+//   2 less than or equal to 64 and the value fits into a signed 10-bit
+//   immediate
+// - Constant splats are lowered to bitconverted BUILD_VECTORs if SplatBitSize
+//   is a power of 2 less than or equal to 64 and the value does not fit into a
+//   signed 10-bit immediate
+// - Non-constant splats are legal as-is.
+// - Non-constant non-splats are lowered to sequences of INSERT_VECTOR_ELT.
+// - All others are illegal and must be expanded.
+SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Subtarget.hasMSA() || !ResTy.is128BitVector())
+    return SDValue();
+
+  if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                            HasAnyUndefs, 8,
+                            !Subtarget.isLittle()) && SplatBitSize <= 64) {
+    // We can only cope with 8, 16, 32, or 64-bit elements
+    if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
+        SplatBitSize != 64)
+      return SDValue();
+
+    // If the value fits into a simm10 then we can use ldi.[bhwd]
+    // However, if it isn't an integer type we will have to bitcast from an
+    // integer type first. Also, if there are any undefs, we must lower them
+    // to defined values first.
+    if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+      return Op;
+
+    EVT ViaVecTy;
+
+    switch (SplatBitSize) {
+    default:
+      return SDValue();
+    case 8:
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      // There's no fill.d to fall back on for 64-bit values
+      return SDValue();
+    }
+
+    // SelectionDAG::getConstant will promote SplatValue appropriately.
+    SDValue Result = DAG.getConstant(SplatValue, DL, ViaVecTy);
+
+    // Bitcast to the type we originally wanted
+    if (ViaVecTy != ResTy)
+      Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
+
+    return Result;
+  } else if (isSplatVector(Node))
+    return Op;
+  else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+    // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+    // The resulting code is the same length as the expansion, but it doesn't
+    // use memory operations
+    EVT ResTy = Node->getValueType(0);
+
+    assert(ResTy.isVector());
+
+    unsigned NumElts = ResTy.getVectorNumElements();
+    SDValue Vector = DAG.getUNDEF(ResTy);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
+                           Node->getOperand(i),
+                           DAG.getConstant(i, DL, MVT::i32));
+    }
+    return Vector;
+  }
+
+  return SDValue();
+}
+
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+//   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undef's appear they are treated as if they were whatever value is
+// necessary in order to fit the above forms.
+//
+// For example:
+//   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+//                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+//                                 i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+//   (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+//   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                       SmallVector<int, 16> Indices,
+                                       SelectionDAG &DAG) {
+  int SHFIndices[4] = { -1, -1, -1, -1 };
+
+  if (Indices.size() < 4)
+    return SDValue();
+
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Indices.size(); j += 4) {
+      int Idx = Indices[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SHFIndices[i] == -1)
+        SHFIndices[i] = Idx;
+
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      if (!(Idx == -1 || Idx == SHFIndices[i]))
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(32, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SHFIndices[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  SDLoc DL(Op);
+  return DAG.getNode(MipsISD::SHF, DL, ResTy,
+                     DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0));
+}
+
+/// Determine whether a range fits a regular pattern of values.
+/// This function accounts for the possibility of jumping over the End iterator.
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  auto &I = Begin;
+
+  while (I != End) {
+    if (*I != -1 && *I != ExpectedIndex)
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Incrementing past End is undefined behaviour so we must increment one
+    // step at a time and check for End at each step.
+    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
+      ; // Empty loop body.
+  }
+  return true;
+}
+
+// Determine whether VECTOR_SHUFFLE is a SPLATI.
+//
+// It is a SPLATI when the mask is:
+//   <x, x, x, ...>
+// where x is any valid index.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static bool isVECTOR_SHUFFLE_SPLATI(SDValue Op, EVT ResTy,
+                                    SmallVector<int, 16> Indices,
+                                    SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  int SplatIndex = -1;
+  for (const auto &V : Indices) {
+    if (V != -1) {
+      SplatIndex = V;
+      break;
+    }
+  }
+
+  return fitsRegularPattern<int>(Indices.begin(), 1, Indices.end(), SplatIndex,
+                                 0);
+}
+
+// Lower VECTOR_SHUFFLE into ILVEV (if possible).
+//
+// ILVEV interleaves the even elements from each vector.
+//
+// It is possible to lower into ILVEV when the mask consists of two of the
+// following forms interleaved:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 2, 2, 4, 4, ...>
+//   <0, n, 2, n+2, 4, n+4, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower VECTOR_SHUFFLE into ILVOD (if possible).
+//
+// ILVOD interleaves the odd elements from each vector.
+//
+// It is possible to lower into ILVOD when the mask consists of two of the
+// following forms interleaved:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+// For example:
+//   <1, 1, 3, 3, 5, 5, ...>
+//   <1, n+1, 3, n+3, 5, n+5, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws);
+}
+
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
+//
+// ILVR interleaves consecutive elements from the right (lowest-indexed) half of
+// each vector.
+//
+// It is possible to lower into ILVR when the mask consists of two of the
+// following forms interleaved:
+//   <0, 1, 2, ...>
+//   <n, n+1, n+2, ...>
+// where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 1, 1, 2, 2, ...>
+//   <0, n, 1, n+1, 2, n+2, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
+//
+// ILVL interleaves consecutive elements from the left (highest-indexed) half
+// of each vector.
+//
+// It is possible to lower into ILVL when the mask consists of two of the
+// following forms interleaved:
+//   <x, x+1, x+2, ...>
+//   <n+x, n+x+1, n+x+2, ...>
+// where n is the number of elements in the vector and x is half n.
+// For example:
+//   <x, x, x+1, x+1, x+2, x+2, ...>
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  unsigned HalfSize = Indices.size() / 2;
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + HalfSize, 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + HalfSize,
+                                   1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower VECTOR_SHUFFLE into PCKEV (if possible).
+//
+// PCKEV copies the even elements of each vector into the result vector.
+//
+// It is possible to lower into PCKEV when the mask consists of two of the
+// following forms concatenated:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+// For example:
+//   <0, 2, 4, ..., 0, 2, 4, ...>
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower VECTOR_SHUFFLE into PCKOD (if possible).
+//
+// PCKOD copies the odd elements of each vector into the result vector.
+//
+// It is possible to lower into PCKOD when the mask consists of two of the
+// following forms concatenated:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+// For example:
+//   <1, 3, 5, ..., 1, 3, 5, ...>
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Ws, Wt);
+}
+
+// Lower VECTOR_SHUFFLE into VSHF.
+//
+// This mostly consists of converting the shuffle indices in Indices into a
+// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is
+// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example,
+// if the type is v8i16 and all the indices are less than 8 then the second
+// operand is unused and can be replaced with anything. We choose to replace it
+// with the used operand since this reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  SmallVector<SDValue, 16> Ops;
+  SDValue Op0;
+  SDValue Op1;
+  EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+  EVT MaskEltTy = MaskVecTy.getVectorElementType();
+  bool Using1stVec = false;
+  bool Using2ndVec = false;
+  SDLoc DL(Op);
+  int ResTyNumElts = ResTy.getVectorNumElements();
+
+  for (int i = 0; i < ResTyNumElts; ++i) {
+    // Idx == -1 means UNDEF
+    int Idx = Indices[i];
+
+    if (0 <= Idx && Idx < ResTyNumElts)
+      Using1stVec = true;
+    if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
+      Using2ndVec = true;
+  }
+
+  for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
+       ++I)
+    Ops.push_back(DAG.getTargetConstant(*I, DL, MaskEltTy));
+
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
+
+  if (Using1stVec && Using2ndVec) {
+    Op0 = Op->getOperand(0);
+    Op1 = Op->getOperand(1);
+  } else if (Using1stVec)
+    Op0 = Op1 = Op->getOperand(0);
+  else if (Using2ndVec)
+    Op0 = Op1 = Op->getOperand(1);
+  else
+    llvm_unreachable("shuffle vector mask references neither vector operand?");
+
+  // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion.
+  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // VSHF concatenates the vectors in a bitwise fashion:
+  // <0b00, 0b01> + <0b10, 0b11> ->
+  // 0b0100       + 0b1110       -> 0b01001110
+  //                                <0b10, 0b11, 0b00, 0b01>
+  // We must therefore swap the operands to get the correct result.
+  return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op1, Op0);
+}
+
+// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
+// indices in the shuffle.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+
+  if (!ResTy.is128BitVector())
+    return SDValue();
+
+  int ResTyNumElts = ResTy.getVectorNumElements();
+  SmallVector<int, 16> Indices;
+
+  for (int i = 0; i < ResTyNumElts; ++i)
+    Indices.push_back(Node->getMaskElt(i));
+
+  // splati.[bhwd] is preferable to the others but is matched from
+  // MipsISD::VSHF.
+  if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
+    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+  SDValue Result;
+  if ((Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG)))
+    return Result;
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::emitBPOSGE32(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const {
+  // $bb:
+  //  bposge32_pseudo $vr0
+  //  =>
+  // $bb:
+  //  bposge32 $tbb
+  // $fbb:
+  //  li $vr2, 0
+  //  b $sink
+  // $tbb:
+  //  li $vr1, 1
+  // $sink:
+  //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI.getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bposge32 instruction to $BB.
+  BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+  // Insert the real bposge32c instruction to $BB.
+  BuildMI(BB, DL, TII->get(Mips::BPOSGE32C_MMR3)).addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned VR2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned VR1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI.getOperand(0).getReg())
+      .addReg(VR2)
+      .addMBB(FBB)
+      .addReg(VR1)
+      .addMBB(TBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return Sink;
+}
+
+MachineBasicBlock *MipsSETargetLowering::emitMSACBranchPseudo(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned BranchOp) const {
+  // $bb:
+  //  vany_nonzero $rd, $ws
+  //  =>
+  // $bb:
+  //  bnz.b $ws, $tbb
+  //  b $fbb
+  // $fbb:
+  //  li $rd1, 0
+  //  b $sink
+  // $tbb:
+  //  li $rd2, 1
+  // $sink:
+  //  $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI.getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = std::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bnz.b instruction to $BB.
+  BuildMI(BB, DL, TII->get(BranchOp))
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned RD1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned RD2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI.getOperand(0).getReg())
+      .addReg(RD1)
+      .addMBB(FBB)
+      .addReg(RD2)
+      .addMBB(TBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return Sink;
+}
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// copy_u_w $rt, $ws, $n
+// mtc1     $rt, $fd
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FW(MachineInstr &MI,
+                                  MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Fd = MI.getOperand(0).getReg();
+  unsigned Ws = MI.getOperand(1).getReg();
+  unsigned Lane = MI.getOperand(2).getImm();
+
+  if (Lane == 0) {
+    unsigned Wt = Ws;
+    if (!Subtarget.useOddSPReg()) {
+      // We must copy to an even-numbered MSA register so that the
+      // single-precision sub-register is also guaranteed to be even-numbered.
+      Wt = RegInfo.createVirtualRegister(&Mips::MSA128WEvensRegClass);
+
+      BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Wt).addReg(Ws);
+    }
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  } else {
+    unsigned Wt = RegInfo.createVirtualRegister(
+        Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                  &Mips::MSA128WEvensRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode which is the only supported mode in MSA.
+MachineBasicBlock *
+MipsSETargetLowering::emitCOPY_FD(MachineInstr &MI,
+                                  MachineBasicBlock *BB) const {
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Fd = MI.getOperand(0).getReg();
+  unsigned Ws = MI.getOperand(1).getReg();
+  unsigned Lane = MI.getOperand(2).getImm() * 2;
+  DebugLoc DL = MI.getDebugLoc();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned Wd_in = MI.getOperand(1).getReg();
+  unsigned Lane = MI.getOperand(2).getImm();
+  unsigned Fs = MI.getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(
+      Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
+                                &Mips::MSA128WEvensRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt)
+      .addImm(0);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $fs, n
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned Wd_in = MI.getOperand(1).getReg();
+  unsigned Lane = MI.getOperand(2).getImm();
+  unsigned Fs = MI.getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt)
+      .addImm(0);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction.
+//
+// For integer:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs)
+// =>
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSERT_[BHWD], $wdtmp2, $wdtmp1, 0, $rs)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+//
+// For floating point:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $fs)
+// =>
+// (SUBREG_TO_REG $wt, $fs, <subreg>)
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSVE_[WD], $wdtmp2, 0, $wdtmp1, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+MachineBasicBlock *MipsSETargetLowering::emitINSERT_DF_VIDX(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned EltSizeInBytes,
+    bool IsFP) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned SrcVecReg = MI.getOperand(1).getReg();
+  unsigned LaneReg = MI.getOperand(2).getReg();
+  unsigned SrcValReg = MI.getOperand(3).getReg();
+
+  const TargetRegisterClass *VecRC = nullptr;
+  // FIXME: This should be true for N32 too.
+  const TargetRegisterClass *GPRRC =
+      Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  unsigned SubRegIdx = Subtarget.isABI_N64() ? Mips::sub_32 : 0;
+  unsigned ShiftOp = Subtarget.isABI_N64() ? Mips::DSLL : Mips::SLL;
+  unsigned EltLog2Size;
+  unsigned InsertOp = 0;
+  unsigned InsveOp = 0;
+  switch (EltSizeInBytes) {
+  default:
+    llvm_unreachable("Unexpected size");
+  case 1:
+    EltLog2Size = 0;
+    InsertOp = Mips::INSERT_B;
+    InsveOp = Mips::INSVE_B;
+    VecRC = &Mips::MSA128BRegClass;
+    break;
+  case 2:
+    EltLog2Size = 1;
+    InsertOp = Mips::INSERT_H;
+    InsveOp = Mips::INSVE_H;
+    VecRC = &Mips::MSA128HRegClass;
+    break;
+  case 4:
+    EltLog2Size = 2;
+    InsertOp = Mips::INSERT_W;
+    InsveOp = Mips::INSVE_W;
+    VecRC = &Mips::MSA128WRegClass;
+    break;
+  case 8:
+    EltLog2Size = 3;
+    InsertOp = Mips::INSERT_D;
+    InsveOp = Mips::INSVE_D;
+    VecRC = &Mips::MSA128DRegClass;
+    break;
+  }
+
+  if (IsFP) {
+    unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo);
+    SrcValReg = Wt;
+  }
+
+  // Convert the lane index into a byte index
+  if (EltSizeInBytes != 1) {
+    unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(ShiftOp), LaneTmp1)
+        .addReg(LaneReg)
+        .addImm(EltLog2Size);
+    LaneReg = LaneTmp1;
+  }
+
+  // Rotate bytes around so that the desired lane is element zero
+  unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
+      .addReg(SrcVecReg)
+      .addReg(SrcVecReg)
+      .addReg(LaneReg, 0, SubRegIdx);
+
+  unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+  if (IsFP) {
+    // Use insve.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(0);
+  } else {
+    // Use insert.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addReg(SrcValReg)
+        .addImm(0);
+  }
+
+  // Rotate elements the rest of the way for a full rotation.
+  // sld.df inteprets $rt modulo the number of columns so we only need to negate
+  // the lane index to do this.
+  unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+  BuildMI(*BB, MI, DL, TII->get(Subtarget.isABI_N64() ? Mips::DSUB : Mips::SUB),
+          LaneTmp2)
+      .addReg(Subtarget.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO)
+      .addReg(LaneReg);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
+      .addReg(WdTmp2)
+      .addReg(WdTmp2)
+      .addReg(LaneTmp2, 0, SubRegIdx);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr &MI,
+                                  MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned Fs = MI.getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr &MI,
+                                  MachineBasicBlock *BB) const {
+  assert(Subtarget.isFP64bit());
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned Fs = MI.getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the ST_F16_PSEDUO instruction to store a f16 value from an MSA
+// register.
+//
+// STF16 MSA128F16:$wd, mem_simm10:$addr
+// =>
+//  copy_u.h $rtemp,$wd[0]
+//  sh $rtemp, $addr
+//
+// Safety: We can't use st.h & co as they would over write the memory after
+// the destination. It would require half floats be allocated 16 bytes(!) of
+// space.
+MachineBasicBlock *
+MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Ws = MI.getOperand(0).getReg();
+  unsigned Rt = MI.getOperand(1).getReg();
+  const MachineMemOperand &MMO = **MI.memoperands_begin();
+  unsigned Imm = MMO.getOffset();
+
+  // Caution: A load via the GOT can expand to a GPR32 operand, a load via
+  //          spill and reload can expand as a GPR64 operand. Examine the
+  //          operand in detail and default to ABI.
+  const TargetRegisterClass *RC =
+      MI.getOperand(1).isReg() ? RegInfo.getRegClass(MI.getOperand(1).getReg())
+                               : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
+                                                        : &Mips::GPR64RegClass);
+  const bool UsingMips32 = RC == &Mips::GPR32RegClass;
+  unsigned Rs = RegInfo.createVirtualRegister(RC);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0);
+  BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::SH : Mips::SH64))
+      .addReg(Rs)
+      .addReg(Rt)
+      .addImm(Imm)
+      .addMemOperand(BB->getParent()->getMachineMemOperand(
+          &MMO, MMO.getOffset(), MMO.getSize()));
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the LD_F16_PSEDUO instruction to load a f16 value into an MSA register.
+//
+// LD_F16 MSA128F16:$wd, mem_simm10:$addr
+// =>
+//  lh $rtemp, $addr
+//  fill.h $wd, $rtemp
+//
+// Safety: We can't use ld.h & co as they over-read from the source.
+// Additionally, if the address is not modulo 16, 2 cases can occur:
+//  a) Segmentation fault as the load instruction reads from a memory page
+//     memory it's not supposed to.
+//  b) The load crosses an implementation specific boundary, requiring OS
+//     intervention.
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+
+  // Caution: A load via the GOT can expand to a GPR32 operand, a load via
+  //          spill and reload can expand as a GPR64 operand. Examine the
+  //          operand in detail and default to ABI.
+  const TargetRegisterClass *RC =
+      MI.getOperand(1).isReg() ? RegInfo.getRegClass(MI.getOperand(1).getReg())
+                               : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
+                                                        : &Mips::GPR64RegClass);
+
+  const bool UsingMips32 = RC == &Mips::GPR32RegClass;
+  unsigned Rt = RegInfo.createVirtualRegister(RC);
+
+  MachineInstrBuilder MIB =
+      BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt);
+  for (unsigned i = 1; i < MI.getNumOperands(); i++)
+    MIB.addOperand(MI.getOperand(i));
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FPROUND_PSEUDO instruction.
+//
+// Round an FGR64Opnd, FGR32Opnd to an f16.
+//
+// Safety: Cycle the operand through the GPRs so the result always ends up
+//         the correct MSA register.
+//
+// FIXME: This copying is strictly unnecessary. If we could tie FGR32Opnd:$Fs
+//        / FGR64Opnd:$Fs and MSA128F16:$Wd to the same physical register
+//        (which they can be, as the MSA registers are defined to alias the
+//        FPU's 64 bit and 32 bit registers) the result can be accessed using
+//        the correct register class. That requires operands be tie-able across
+//        register classes which have a sub/super register class relationship.
+//
+// For FPG32Opnd:
+//
+// FPROUND MSA128F16:$wd, FGR32Opnd:$fs
+// =>
+//  mfc1 $rtemp, $fs
+//  fill.w $rtemp, $wtemp
+//  fexdo.w $wd, $wtemp, $wtemp
+//
+// For FPG64Opnd on mips32r2+:
+//
+// FPROUND MSA128F16:$wd, FGR64Opnd:$fs
+// =>
+//  mfc1 $rtemp, $fs
+//  fill.w $rtemp, $wtemp
+//  mfhc1 $rtemp2, $fs
+//  insert.w $wtemp[1], $rtemp2
+//  insert.w $wtemp[3], $rtemp2
+//  fexdo.w $wtemp2, $wtemp, $wtemp
+//  fexdo.h $wd, $temp2, $temp2
+//
+// For FGR64Opnd on mips64r2+:
+//
+// FPROUND MSA128F16:$wd, FGR64Opnd:$fs
+// =>
+//  dmfc1 $rtemp, $fs
+//  fill.d $rtemp, $wtemp
+//  fexdo.w $wtemp2, $wtemp, $wtemp
+//  fexdo.h $wd, $wtemp2, $wtemp2
+//
+// Safety note: As $wtemp is UNDEF, we may provoke a spurious exception if the
+//              undef bits are "just right" and the exception enable bits are
+//              set. By using fill.w to replicate $fs into all elements over
+//              insert.w for one element, we avoid that potiential case. If
+//              fexdo.[hw] causes an exception in, the exception is valid and it
+//              occurs for all elements.
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
+                                         MachineBasicBlock *BB,
+                                         bool IsFGR64) const {
+
+  // Strictly speaking, we need MIPS32R5 to support MSA. We'll be generous
+  // here. It's technically doable to support MIPS32 here, but the ISA forbids
+  // it.
+  assert(Subtarget.hasMSA() && Subtarget.hasMips32r2());
+
+  bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Wd = MI.getOperand(0).getReg();
+  unsigned Fs = MI.getOperand(1).getReg();
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  const TargetRegisterClass *GPRRC =
+      IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1;
+  unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W;
+
+  // Perform the register class copy as mentioned above.
+  unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC);
+  BuildMI(*BB, MI, DL, TII->get(MFC1Opc), Rtemp).addReg(Fs);
+  BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp);
+  unsigned WPHI = Wtemp;
+
+  if (!Subtarget.hasMips64() && IsFGR64) {
+    unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs);
+    unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+    unsigned Wtemp3 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), Wtemp2)
+        .addReg(Wtemp)
+        .addReg(Rtemp2)
+        .addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_W), Wtemp3)
+        .addReg(Wtemp2)
+        .addReg(Rtemp2)
+        .addImm(3);
+    WPHI = Wtemp3;
+  }
+
+  if (IsFGR64) {
+    unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_W), Wtemp2)
+        .addReg(WPHI)
+        .addReg(WPHI);
+    WPHI = Wtemp2;
+  }
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXDO_H), Wd).addReg(WPHI).addReg(WPHI);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FPEXTEND_PSEUDO instruction.
+//
+// Expand an f16 to either a FGR32Opnd or FGR64Opnd.
+//
+// Safety: Cycle the result through the GPRs so the result always ends up
+//         the correct floating point register.
+//
+// FIXME: This copying is strictly unnecessary. If we could tie FGR32Opnd:$Fd
+//        / FGR64Opnd:$Fd and MSA128F16:$Ws to the same physical register
+//        (which they can be, as the MSA registers are defined to alias the
+//        FPU's 64 bit and 32 bit registers) the result can be accessed using
+//        the correct register class. That requires operands be tie-able across
+//        register classes which have a sub/super register class relationship. I
+//        haven't checked.
+//
+// For FGR32Opnd:
+//
+// FPEXTEND FGR32Opnd:$fd, MSA128F16:$ws
+// =>
+//  fexupr.w $wtemp, $ws
+//  copy_s.w $rtemp, $ws[0]
+//  mtc1 $rtemp, $fd
+//
+// For FGR64Opnd on Mips64:
+//
+// FPEXTEND FGR64Opnd:$fd, MSA128F16:$ws
+// =>
+//  fexupr.w $wtemp, $ws
+//  fexupr.d $wtemp2, $wtemp
+//  copy_s.d $rtemp, $wtemp2s[0]
+//  dmtc1 $rtemp, $fd
+//
+// For FGR64Opnd on Mips32:
+//
+// FPEXTEND FGR64Opnd:$fd, MSA128F16:$ws
+// =>
+//  fexupr.w $wtemp, $ws
+//  fexupr.d $wtemp2, $wtemp
+//  copy_s.w $rtemp, $wtemp2[0]
+//  mtc1 $rtemp, $ftemp
+//  copy_s.w $rtemp2, $wtemp2[1]
+//  $fd = mthc1 $rtemp2, $ftemp
+//
+MachineBasicBlock *
+MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
+                                          MachineBasicBlock *BB,
+                                          bool IsFGR64) const {
+
+  // Strictly speaking, we need MIPS32R5 to support MSA. We'll be generous
+  // here. It's technically doable to support MIPS32 here, but the ISA forbids
+  // it.
+  assert(Subtarget.hasMSA() && Subtarget.hasMips32r2());
+
+  bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64;
+  bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Fd = MI.getOperand(0).getReg();
+  unsigned Ws = MI.getOperand(1).getReg();
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *GPRRC =
+      IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1;
+  unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W;
+
+  unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned WPHI = Wtemp;
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_W), Wtemp).addReg(Ws);
+  if (IsFGR64) {
+    WPHI = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+    BuildMI(*BB, MI, DL, TII->get(Mips::FEXUPR_D), WPHI).addReg(Wtemp);
+  }
+
+  // Perform the safety regclass copy mentioned above.
+  unsigned Rtemp = RegInfo.createVirtualRegister(GPRRC);
+  unsigned FPRPHI = IsFGR64onMips32
+                        ? RegInfo.createVirtualRegister(&Mips::FGR64RegClass)
+                        : Fd;
+  BuildMI(*BB, MI, DL, TII->get(COPYOpc), Rtemp).addReg(WPHI).addImm(0);
+  BuildMI(*BB, MI, DL, TII->get(MTC1Opc), FPRPHI).addReg(Rtemp);
+
+  if (IsFGR64onMips32) {
+    unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY_S_W), Rtemp2)
+        .addReg(WPHI)
+        .addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::MTHC1_D64), Fd)
+        .addReg(FPRPHI)
+        .addReg(Rtemp2);
+  }
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws, 1
+// fexp2.w $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI.getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI.getOperand(1).getReg());
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws, 1
+// fexp2.d $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI.getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI.getOperand(1).getReg());
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
new file mode 100644
index 000000000000..0abb9b318bda
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -0,0 +1,131 @@
+//===-- MipsSEISelLowering.h - MipsSE DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
+
+#include "MipsISelLowering.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+  class MipsSETargetLowering : public MipsTargetLowering  {
+  public:
+    explicit MipsSETargetLowering(const MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
+
+    /// \brief Enable MSA support for the given integer type and Register
+    /// class.
+    void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+    /// \brief Enable MSA support for the given floating-point type and
+    /// Register class.
+    void addMSAFloatType(MVT::SimpleValueType Ty,
+                         const TargetRegisterClass *RC);
+
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;
+
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override {
+      return false;
+    }
+
+    const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
+
+  private:
+    bool isEligibleForTailCallOptimization(
+        const CCState &CCInfo, unsigned NextStackOffset,
+        const MipsFunctionInfo &FI) const override;
+
+    void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
+
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
+                        SelectionDAG &DAG) const;
+
+    SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+    /// depending on the indices in the shuffle.
+    SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+
+    MachineBasicBlock *emitBPOSGE32(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitMSACBranchPseudo(MachineInstr &MI,
+                                            MachineBasicBlock *BB,
+                                            unsigned BranchOp) const;
+    /// \brief Emit the COPY_FW pseudo instruction
+    MachineBasicBlock *emitCOPY_FW(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the COPY_FD pseudo instruction
+    MachineBasicBlock *emitCOPY_FD(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FW pseudo instruction
+    MachineBasicBlock *emitINSERT_FW(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FD pseudo instruction
+    MachineBasicBlock *emitINSERT_FD(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+    MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr &MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned EltSizeInBytes,
+                                          bool IsFP) const;
+    /// \brief Emit the FILL_FW pseudo instruction
+    MachineBasicBlock *emitFILL_FW(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FD pseudo instruction
+    MachineBasicBlock *emitFILL_FD(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_W_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_W_1(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_D_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_D_1(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FW pseudo instruction
+    MachineBasicBlock *emitLD_F16_PSEUDO(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FD pseudo instruction
+    MachineBasicBlock *emitST_F16_PSEUDO(MachineInstr &MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_W_1 pseudo instructions.
+    MachineBasicBlock *emitFPEXTEND_PSEUDO(MachineInstr &MI,
+                                           MachineBasicBlock *BB,
+                                           bool IsFGR64) const;
+    /// \brief Emit the FEXP2_D_1 pseudo instructions.
+    MachineBasicBlock *emitFPROUND_PSEUDO(MachineInstr &MI,
+                                          MachineBasicBlock *BBi,
+                                          bool IsFGR64) const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
new file mode 100644
index 000000000000..ea703d0edd96
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -0,0 +1,754 @@
+//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, STI.isPositionIndependent() ? Mips::B : Mips::J),
+      RI() {}
+
+const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsSEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  unsigned Opc = MI.getOpcode();
+
+  if ((Opc == Mips::LW)   || (Opc == Mips::LD)   ||
+      (Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
+    if ((MI.getOperand(1).isFI()) &&  // is a stack slot
+        (MI.getOperand(2).isImm()) && // the imm is zero
+        (isZeroImm(MI.getOperand(2)))) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsSEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  unsigned Opc = MI.getOpcode();
+
+  if ((Opc == Mips::SW)   || (Opc == Mips::SD)   ||
+      (Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
+    if ((MI.getOperand(1).isFI()) &&  // is a stack slot
+        (MI.getOperand(2).isImm()) && // the imm is zero
+        (isZeroImm(MI.getOperand(2)))) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  const DebugLoc &DL, unsigned DestReg,
+                                  unsigned SrcReg, bool KillSrc) const {
+  unsigned Opc = 0, ZeroReg = 0;
+  bool isMicroMips = Subtarget.inMicroMipsMode();
+
+  if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
+    if (Mips::GPR32RegClass.contains(SrcReg)) {
+      if (isMicroMips)
+        Opc = Mips::MOVE16_MM;
+      else
+        Opc = Mips::OR, ZeroReg = Mips::ZERO;
+    } else if (Mips::CCRRegClass.contains(SrcReg))
+      Opc = Mips::CFC1;
+    else if (Mips::FGR32RegClass.contains(SrcReg))
+      Opc = Mips::MFC1;
+    else if (Mips::HI32RegClass.contains(SrcReg)) {
+      Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+      SrcReg = 0;
+    } else if (Mips::LO32RegClass.contains(SrcReg)) {
+      Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+      SrcReg = 0;
+    } else if (Mips::HI32DSPRegClass.contains(SrcReg))
+      Opc = Mips::MFHI_DSP;
+    else if (Mips::LO32DSPRegClass.contains(SrcReg))
+      Opc = Mips::MFLO_DSP;
+    else if (Mips::DSPCCRegClass.contains(SrcReg)) {
+      BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
+        .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      return;
+    }
+    else if (Mips::MSACtrlRegClass.contains(SrcReg))
+      Opc = Mips::CFCMSA;
+  }
+  else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
+    if (Mips::CCRRegClass.contains(DestReg))
+      Opc = Mips::CTC1;
+    else if (Mips::FGR32RegClass.contains(DestReg))
+      Opc = Mips::MTC1;
+    else if (Mips::HI32RegClass.contains(DestReg))
+      Opc = Mips::MTHI, DestReg = 0;
+    else if (Mips::LO32RegClass.contains(DestReg))
+      Opc = Mips::MTLO, DestReg = 0;
+    else if (Mips::HI32DSPRegClass.contains(DestReg))
+      Opc = Mips::MTHI_DSP;
+    else if (Mips::LO32DSPRegClass.contains(DestReg))
+      Opc = Mips::MTLO_DSP;
+    else if (Mips::DSPCCRegClass.contains(DestReg)) {
+      BuildMI(MBB, I, DL, get(Mips::WRDSP))
+        .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1 << 4)
+        .addReg(DestReg, RegState::ImplicitDefine);
+      return;
+    } else if (Mips::MSACtrlRegClass.contains(DestReg)) {
+      BuildMI(MBB, I, DL, get(Mips::CTCMSA))
+          .addReg(DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      return;
+    }
+  }
+  else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_S;
+  else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D32;
+  else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D64;
+  else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+    if (Mips::GPR64RegClass.contains(SrcReg))
+      Opc = Mips::OR64, ZeroReg = Mips::ZERO_64;
+    else if (Mips::HI64RegClass.contains(SrcReg))
+      Opc = Mips::MFHI64, SrcReg = 0;
+    else if (Mips::LO64RegClass.contains(SrcReg))
+      Opc = Mips::MFLO64, SrcReg = 0;
+    else if (Mips::FGR64RegClass.contains(SrcReg))
+      Opc = Mips::DMFC1;
+  }
+  else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+    if (Mips::HI64RegClass.contains(DestReg))
+      Opc = Mips::MTHI64, DestReg = 0;
+    else if (Mips::LO64RegClass.contains(DestReg))
+      Opc = Mips::MTLO64, DestReg = 0;
+    else if (Mips::FGR64RegClass.contains(DestReg))
+      Opc = Mips::DMTC1;
+  }
+  else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+    if (Mips::MSA128BRegClass.contains(SrcReg))
+      Opc = Mips::MOVE_V;
+  }
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+  if (ZeroReg)
+    MIB.addReg(ZeroReg);
+}
+
+void MipsSEInstrInfo::
+storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                unsigned SrcReg, bool isKill, int FI,
+                const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
+                int64_t Offset) const {
+  DebugLoc DL;
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+
+  unsigned Opc = 0;
+
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC128;
+  else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_CCOND_DSP;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::ST_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::ST_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::ST_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::ST_D;
+  else if (Mips::LO32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SW;
+  else if (Mips::LO64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SD;
+  else if (Mips::HI32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SW;
+  else if (Mips::HI64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SD;
+
+  // Hi, Lo are normally caller save but they are callee save
+  // for interrupt handling.
+  const Function *Func = MBB.getParent()->getFunction();
+  if (Func->hasFnAttribute("interrupt")) {
+    if (Mips::HI32RegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0);
+      SrcReg = Mips::K0;
+    } else if (Mips::HI64RegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64);
+      SrcReg = Mips::K0_64;
+    } else if (Mips::LO32RegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0);
+      SrcReg = Mips::K0;
+    } else if (Mips::LO64RegClass.hasSubClassEq(RC)) {
+      BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64);
+      SrcReg = Mips::K0_64;
+    }
+  }
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+}
+
+void MipsSEInstrInfo::
+loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                 unsigned DestReg, int FI, const TargetRegisterClass *RC,
+                 const TargetRegisterInfo *TRI, int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  const Function *Func = MBB.getParent()->getFunction();
+  bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") &&
+                         (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 ||
+                          DestReg == Mips::HI0 || DestReg == Mips::HI0_64);
+
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC128;
+  else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_CCOND_DSP;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::LD_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::LD_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::LD_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::LD_D;
+  else if (Mips::HI32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LW;
+  else if (Mips::HI64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LD;
+  else if (Mips::LO32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LW;
+  else if (Mips::LO64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LD;
+
+  assert(Opc && "Register class not handled!");
+
+  if (!ReqIndirectLoad)
+    BuildMI(MBB, I, DL, get(Opc), DestReg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+  else {
+    // Load HI/LO through K0. Notably the DestReg is encoded into the
+    // instruction itself.
+    unsigned Reg = Mips::K0;
+    unsigned LdOp = Mips::MTLO;
+    if (DestReg == Mips::HI0)
+      LdOp = Mips::MTHI;
+
+    if (Subtarget.getABI().ArePtrs64bit()) {
+      Reg = Mips::K0_64;
+      if (DestReg == Mips::HI0_64)
+        LdOp = Mips::MTHI64;
+      else
+        LdOp = Mips::MTLO64;
+    }
+
+    BuildMI(MBB, I, DL, get(Opc), Reg)
+        .addFrameIndex(FI)
+        .addImm(Offset)
+        .addMemOperand(MMO);
+    BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg);
+  }
+}
+
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  MachineBasicBlock &MBB = *MI.getParent();
+  bool isMicroMips = Subtarget.inMicroMipsMode();
+  unsigned Opc;
+
+  switch (MI.getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA:
+    expandRetRA(MBB, MI);
+    break;
+  case Mips::ERet:
+    expandERet(MBB, MI);
+    break;
+  case Mips::PseudoMFHI:
+    Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI;
+    expandPseudoMFHiLo(MBB, MI, Opc);
+    break;
+  case Mips::PseudoMFLO:
+    Opc = isMicroMips ? Mips::MFLO16_MM : Mips::MFLO;
+    expandPseudoMFHiLo(MBB, MI, Opc);
+    break;
+  case Mips::PseudoMFHI64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+    break;
+  case Mips::PseudoMFLO64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+    break;
+  case Mips::PseudoMTLOHI:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+    break;
+  case Mips::PseudoMTLOHI64:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+    break;
+  case Mips::PseudoMTLOHI_DSP:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+    break;
+  case Mips::PseudoCVT_S_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_D32_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_S_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
+    break;
+  case Mips::BuildPairF64:
+    expandBuildPairF64(MBB, MI, false);
+    break;
+  case Mips::BuildPairF64_64:
+    expandBuildPairF64(MBB, MI, true);
+    break;
+  case Mips::ExtractElementF64:
+    expandExtractElementF64(MBB, MI, false);
+    break;
+  case Mips::ExtractElementF64_64:
+    expandExtractElementF64(MBB, MI, true);
+    break;
+  case Mips::MIPSeh_return32:
+  case Mips::MIPSeh_return64:
+    expandEhReturn(MBB, MI);
+    break;
+  }
+
+  MBB.erase(MI);
+  return true;
+}
+
+/// getOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:           llvm_unreachable("Illegal opcode!");
+  case Mips::BEQ:    return Mips::BNE;
+  case Mips::BEQ_MM: return Mips::BNE_MM;
+  case Mips::BNE:    return Mips::BEQ;
+  case Mips::BNE_MM: return Mips::BEQ_MM;
+  case Mips::BGTZ:   return Mips::BLEZ;
+  case Mips::BGEZ:   return Mips::BLTZ;
+  case Mips::BLTZ:   return Mips::BGEZ;
+  case Mips::BLEZ:   return Mips::BGTZ;
+  case Mips::BEQ64:  return Mips::BNE64;
+  case Mips::BNE64:  return Mips::BEQ64;
+  case Mips::BGTZ64: return Mips::BLEZ64;
+  case Mips::BGEZ64: return Mips::BLTZ64;
+  case Mips::BLTZ64: return Mips::BGEZ64;
+  case Mips::BLEZ64: return Mips::BGTZ64;
+  case Mips::BC1T:   return Mips::BC1F;
+  case Mips::BC1F:   return Mips::BC1T;
+  case Mips::BEQZC_MM: return Mips::BNEZC_MM;
+  case Mips::BNEZC_MM: return Mips::BEQZC_MM;
+  case Mips::BEQZC:  return Mips::BNEZC;
+  case Mips::BNEZC:  return Mips::BEQZC;
+  case Mips::BEQC:   return Mips::BNEC;
+  case Mips::BNEC:   return Mips::BEQC;
+  case Mips::BGTZC:  return Mips::BLEZC;
+  case Mips::BGEZC:  return Mips::BLTZC;
+  case Mips::BLTZC:  return Mips::BGEZC;
+  case Mips::BLEZC:  return Mips::BGTZC;
+  case Mips::BEQZC64:  return Mips::BNEZC64;
+  case Mips::BNEZC64:  return Mips::BEQZC64;
+  case Mips::BEQC64:   return Mips::BNEC64;
+  case Mips::BNEC64:   return Mips::BEQC64;
+  case Mips::BGEC64:   return Mips::BLTC64;
+  case Mips::BGEUC64:  return Mips::BLTUC64;
+  case Mips::BLTC64:   return Mips::BGEC64;
+  case Mips::BLTUC64:  return Mips::BGEUC64;
+  case Mips::BGTZC64:  return Mips::BLEZC64;
+  case Mips::BGEZC64:  return Mips::BLTZC64;
+  case Mips::BLTZC64:  return Mips::BGEZC64;
+  case Mips::BLEZC64:  return Mips::BGTZC64;
+  }
+}
+
+/// Adjust SP by Amount bytes.
+void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  MipsABIInfo ABI = Subtarget.getABI();
+  DebugLoc DL;
+  unsigned ADDiu = ABI.GetPtrAddiuOp();
+
+  if (Amount == 0)
+    return;
+
+  if (isInt<16>(Amount)) {
+    // addi sp, sp, amount
+    BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
+  } else {
+    // For numbers which are not 16bit integers we synthesize Amount inline
+    // then add or subtract it from sp.
+    unsigned Opc = ABI.GetPtrAdduOp();
+    if (Amount < 0) {
+      Opc = ABI.GetPtrSubuOp();
+      Amount = -Amount;
+    }
+    unsigned Reg = loadImmediate(Amount, MBB, I, DL, nullptr);
+    BuildMI(MBB, I, DL, get(Opc), SP).addReg(SP).addReg(Reg, RegState::Kill);
+  }
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator II,
+                                        const DebugLoc &DL,
+                                        unsigned *NewImm) const {
+  MipsAnalyzeImmediate AnalyzeImm;
+  const MipsSubtarget &STI = Subtarget;
+  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+  unsigned Size = STI.isABI_N64() ? 64 : 32;
+  unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+  unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  const TargetRegisterClass *RC = STI.isABI_N64() ?
+    &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+  bool LastInstrIsADDiu = NewImm;
+
+  const MipsAnalyzeImmediate::InstSeq &Seq =
+    AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+  MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+  assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+  // The first instruction can be a LUi, which is different from other
+  // instructions (ADDiu, ORI and SLL) in that it does not have a register
+  // operand.
+  unsigned Reg = RegInfo.createVirtualRegister(RC);
+
+  if (Inst->Opc == LUi)
+    BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
+  else
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(ZEROReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  // Build the remaining instructions in Seq.
+  for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(Reg, RegState::Kill)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  if (LastInstrIsADDiu)
+    *NewImm = Inst->ImmOpnd;
+
+  return Reg;
+}
+
+unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BEQ    || Opc == Mips::BEQ_MM || Opc == Mips::BNE    ||
+          Opc == Mips::BNE_MM || Opc == Mips::BGTZ   || Opc == Mips::BGEZ   ||
+          Opc == Mips::BLTZ   || Opc == Mips::BLEZ   || Opc == Mips::BEQ64  ||
+          Opc == Mips::BNE64  || Opc == Mips::BGTZ64 || Opc == Mips::BGEZ64 ||
+          Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 || Opc == Mips::BC1T   ||
+          Opc == Mips::BC1F   || Opc == Mips::B      || Opc == Mips::J      ||
+          Opc == Mips::BEQZC_MM || Opc == Mips::BNEZC_MM || Opc == Mips::BEQC ||
+          Opc == Mips::BNEC   || Opc == Mips::BLTC   || Opc == Mips::BGEC   ||
+          Opc == Mips::BLTUC  || Opc == Mips::BGEUC  || Opc == Mips::BGTZC  ||
+          Opc == Mips::BLEZC  || Opc == Mips::BGEZC  || Opc == Mips::BLTZC  ||
+          Opc == Mips::BEQZC  || Opc == Mips::BNEZC  || Opc == Mips::BEQZC64 ||
+          Opc == Mips::BNEZC64 || Opc == Mips::BEQC64 || Opc == Mips::BNEC64 ||
+          Opc == Mips::BGEC64 || Opc == Mips::BGEUC64 || Opc == Mips::BLTC64 ||
+          Opc == Mips::BLTUC64 || Opc == Mips::BGTZC64 ||
+          Opc == Mips::BGEZC64 || Opc == Mips::BLTZC64 ||
+          Opc == Mips::BLEZC64 || Opc == Mips::BC) ? Opc : 0;
+}
+
+void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const {
+  if (Subtarget.isGP64bit())
+    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+        .addReg(Mips::RA_64);
+  else
+    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
+}
+
+void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(Mips::ERET));
+}
+
+std::pair<bool, bool>
+MipsSEInstrInfo::compareOpndSize(unsigned Opc,
+                                 const MachineFunction &MF) const {
+  const MCInstrDesc &Desc = get(Opc);
+  assert(Desc.NumOperands == 2 && "Unary instruction expected.");
+  const MipsRegisterInfo *RI = &getRegisterInfo();
+  unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize();
+  unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize();
+
+  return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
+}
+
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned NewOpc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(NewOpc), I->getOperand(0).getReg());
+}
+
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned LoOpc,
+                                         unsigned HiOpc,
+                                         bool HasExplicitDef) const {
+  // Expand
+  //  lo_hi pseudomtlohi $gpr0, $gpr1
+  // to these two instructions:
+  //  mtlo $gpr0
+  //  mthi $gpr1
+
+  DebugLoc DL = I->getDebugLoc();
+  const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2);
+  MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc));
+  MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc));
+
+  // Add lo/hi registers if the mtlo/hi instructions created have explicit
+  // def registers.
+  if (HasExplicitDef) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+    unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+    LoInst.addReg(DstLo, RegState::Define);
+    HiInst.addReg(DstHi, RegState::Define);
+  }
+
+  LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill()));
+  HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill()));
+}
+
+void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned CvtOpc, unsigned MovOpc,
+                                     bool IsI64) const {
+  const MCInstrDesc &CvtDesc = get(CvtOpc), &MovDesc = get(MovOpc);
+  const MachineOperand &Dst = I->getOperand(0), &Src = I->getOperand(1);
+  unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
+  unsigned KillSrc =  getKillRegState(Src.isKill());
+  DebugLoc DL = I->getDebugLoc();
+  bool DstIsLarger, SrcIsLarger;
+
+  std::tie(DstIsLarger, SrcIsLarger) =
+      compareOpndSize(CvtOpc, *MBB.getParent());
+
+  if (DstIsLarger)
+    TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  if (SrcIsLarger)
+    DstReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
+  BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
+}
+
+void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              bool FP64) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  DebugLoc dl = I->getDebugLoc();
+
+  assert(N < 2 && "Invalid immediate");
+  unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
+  unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  if (SubIdx == Mips::sub_hi && Subtarget.hasMTHC1()) {
+    // FIXME: Strictly speaking MFHC1 only reads the top 32-bits however, we
+    //        claim to read the whole 64-bits as part of a white lie used to
+    //        temporarily work around a widespread bug in the -mfp64 support.
+    //        The problem is that none of the 32-bit fpu ops mention the fact
+    //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+    //        requires a major overhaul of the FPU implementation which can't
+    //        be done right now due to time constraints.
+    //        MFHC1 is one of two instructions that are affected since they are
+    //        the only instructions that don't read the lower 32-bits.
+    //        We therefore pretend that it reads the bottom 32-bits to
+    //        artificially create a dependency and prevent the scheduler
+    //        changing the behaviour of the code.
+    BuildMI(MBB, I, dl, get(FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg)
+        .addReg(SrcReg);
+  } else
+    BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
+}
+
+void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         bool FP64) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  // When mthc1 is available, use:
+  //   mtc1 Lo, $fp
+  //   mthc1 Hi, $fp
+  //
+  // Otherwise, for O32 FPXX ABI:
+  //   spill + reload via ldc1
+  // This case is handled by the frame lowering code.
+  //
+  // Otherwise, for FP32:
+  //   mtc1 Lo, $fp
+  //   mtc1 Hi, $fp + 1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
+    .addReg(LoReg);
+
+  if (Subtarget.hasMTHC1()) {
+    // FIXME: The .addReg(DstReg) is a white lie used to temporarily work
+    //        around a widespread bug in the -mfp64 support.
+    //        The problem is that none of the 32-bit fpu ops mention the fact
+    //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
+    //        requires a major overhaul of the FPU implementation which can't
+    //        be done right now due to time constraints.
+    //        MTHC1 is one of two instructions that are affected since they are
+    //        the only instructions that don't read the lower 32-bits.
+    //        We therefore pretend that it reads the bottom 32-bits to
+    //        artificially create a dependency and prevent the scheduler
+    //        changing the behaviour of the code.
+    BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
+        .addReg(DstReg)
+        .addReg(HiReg);
+  } else if (Subtarget.isABI_FPXX())
+    llvm_unreachable("BuildPairF64 not expanded in frame lowering code!");
+  else
+    BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
+}
+
+void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  // This pseudo instruction is generated as part of the lowering of
+  // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
+  // indirect jump to TargetReg
+  MipsABIInfo ABI = Subtarget.getABI();
+  unsigned ADDU = ABI.GetPtrAdduOp();
+  unsigned SP = Subtarget.isGP64bit() ? Mips::SP_64 : Mips::SP;
+  unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA;
+  unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9;
+  unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned OffsetReg = I->getOperand(0).getReg();
+  unsigned TargetReg = I->getOperand(1).getReg();
+
+  // addu $ra, $v0, $zero
+  // addu $sp, $sp, $v1
+  // jr   $ra (via RetRA)
+  const TargetMachine &TM = MBB.getParent()->getTarget();
+  if (TM.isPositionIndependent())
+    BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), T9)
+        .addReg(TargetReg)
+        .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), RA)
+      .addReg(TargetReg)
+      .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), get(ADDU), SP).addReg(SP).addReg(OffsetReg);
+  expandRetRA(MBB, I);
+}
+
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(const MipsSubtarget &STI) {
+  return new MipsSEInstrInfo(STI);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
new file mode 100644
index 000000000000..b356909bf1cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -0,0 +1,119 @@
+//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
+
+#include "MipsInstrInfo.h"
+#include "MipsSERegisterInfo.h"
+
+namespace llvm {
+
+class MipsSEInstrInfo : public MipsInstrInfo {
+  const MipsSERegisterInfo RI;
+
+public:
+  explicit MipsSEInstrInfo(const MipsSubtarget &STI);
+
+  const MipsRegisterInfo &getRegisterInfo() const override;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
+
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const override;
+
+  /// Emit a series of instructions to load an immediate. If NewImm is a
+  /// non-NULL parameter, the last instruction is not emitted, but instead
+  /// its immediate operand is returned in NewImm.
+  unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, const DebugLoc &DL,
+                         unsigned *NewImm) const;
+
+private:
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
+
+  void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+
+  void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
+
+  std::pair<bool, bool> compareOpndSize(unsigned Opc,
+                                        const MachineFunction &MF) const;
+
+  void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned NewOpc) const;
+
+  void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned LoOpc, unsigned HiOpc,
+                          bool HasExplicitDef) const;
+
+  /// Expand pseudo Int-to-FP conversion instructions.
+  ///
+  /// For example, the following pseudo instruction
+  ///  PseudoCVT_D32_W D2, A5
+  /// gets expanded into these two instructions:
+  ///  MTC1 F4, A5
+  ///  CVT_D32_W D2, F4
+  ///
+  /// We do this expansion post-RA to avoid inserting a floating point copy
+  /// instruction between MTC1 and CVT_D32_W.
+  void expandCvtFPInt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                      unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
+
+  void expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
+  void expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  void expandEhReturn(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
new file mode 100644
index 000000000000..86bd24166bb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -0,0 +1,260 @@
+//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -== -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSERegisterInfo.h"
+#include "Mips.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
+MipsSERegisterInfo::MipsSERegisterInfo() : MipsRegisterInfo() {}
+
+bool MipsSERegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool MipsSERegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+const TargetRegisterClass *
+MipsSERegisterInfo::intRegClass(unsigned Size) const {
+  if (Size == 4)
+    return &Mips::GPR32RegClass;
+
+  assert(Size == 8);
+  return &Mips::GPR64RegClass;
+}
+
+/// Get the size of the offset supported by the given load/store/inline asm.
+/// The result includes the effects of any scale factors applied to the
+/// instruction immediate.
+static inline unsigned getLoadStoreOffsetSizeInBits(const unsigned Opcode,
+                                                    MachineOperand MO) {
+  switch (Opcode) {
+  case Mips::LD_B:
+  case Mips::ST_B:
+    return 10;
+  case Mips::LD_H:
+  case Mips::ST_H:
+    return 10 + 1 /* scale factor */;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    return 10 + 2 /* scale factor */;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    return 10 + 3 /* scale factor */;
+  case Mips::LL:
+  case Mips::LL64:
+  case Mips::LLD:
+  case Mips::LLE:
+  case Mips::SC:
+  case Mips::SC64:
+  case Mips::SCD:
+  case Mips::SCE:
+    return 16;
+  case Mips::LLE_MM:
+  case Mips::LLE_MMR6:
+  case Mips::LL_MM:
+  case Mips::SCE_MM:
+  case Mips::SCE_MMR6:
+  case Mips::SC_MM:
+    return 12;
+  case Mips::LL64_R6:
+  case Mips::LL_R6:
+  case Mips::LLD_R6:
+  case Mips::SC64_R6:
+  case Mips::SCD_R6:
+  case Mips::SC_R6:
+    return 9;
+  case Mips::INLINEASM: {
+    unsigned ConstraintID = InlineAsm::getMemoryConstraintID(MO.getImm());
+    switch (ConstraintID) {
+    case InlineAsm::Constraint_ZC: {
+      const MipsSubtarget &Subtarget = MO.getParent()
+                                           ->getParent()
+                                           ->getParent()
+                                           ->getSubtarget<MipsSubtarget>();
+      if (Subtarget.inMicroMipsMode())
+        return 12;
+
+      if (Subtarget.hasMips32r6())
+        return 9;
+
+      return 16;
+    }
+    default:
+      return 16;
+    }
+  }
+  default:
+    return 16;
+  }
+}
+
+/// Get the scale factor applied to the immediate in the given load/store.
+static inline unsigned getLoadStoreOffsetAlign(const unsigned Opcode) {
+  switch (Opcode) {
+  case Mips::LD_H:
+  case Mips::ST_H:
+    return 2;
+  case Mips::LD_W:
+  case Mips::ST_W:
+    return 4;
+  case Mips::LD_D:
+  case Mips::ST_D:
+    return 8;
+  default:
+    return 1;
+  }
+}
+
+void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsABIInfo ABI =
+      static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
+  const MipsRegisterInfo *RegInfo =
+    static_cast<const MipsRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex);
+  bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex);
+  // The following stack frame objects are always referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  //  4. Locations for eh data registers.
+  //  5. Locations for ISR saved Coprocessor 0 registers 12 & 14.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI ||
+      IsISRRegFI)
+    FrameReg = ABI.GetStackPtr();
+  else if (RegInfo->needsStackRealignment(MF)) {
+    if (MFI.hasVarSizedObjects() && !MFI.isFixedObjectIndex(FrameIndex))
+      FrameReg = ABI.GetBasePtr();
+    else if (MFI.isFixedObjectIndex(FrameIndex))
+      FrameReg = getFrameRegister(MF);
+    else
+      FrameReg = ABI.GetStackPtr();
+  } else
+    FrameReg = getFrameRegister(MF);
+
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following, its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  bool IsKill = false;
+  int64_t Offset;
+
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  if (!MI.isDebugValue()) {
+    // Make sure Offset fits within the field available.
+    // For MSA instructions, this is a 10-bit signed immediate (scaled by
+    // element size), otherwise it is a 16-bit signed immediate.
+    unsigned OffsetBitSize =
+        getLoadStoreOffsetSizeInBits(MI.getOpcode(), MI.getOperand(OpNo - 1));
+    unsigned OffsetAlign = getLoadStoreOffsetAlign(MI.getOpcode());
+
+    if (OffsetBitSize < 16 && isInt<16>(Offset) &&
+        (!isIntN(OffsetBitSize, Offset) ||
+         OffsetToAlignment(Offset, OffsetAlign) != 0)) {
+      // If we have an offset that needs to fit into a signed n-bit immediate
+      // (where n < 16) and doesn't, but does fit into 16-bits then use an ADDiu
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      const TargetRegisterClass *PtrRC =
+          ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+      MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+      unsigned Reg = RegInfo.createVirtualRegister(PtrRC);
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+              MBB.getParent()->getSubtarget().getInstrInfo());
+      BuildMI(MBB, II, DL, TII.get(ABI.GetPtrAddiuOp()), Reg)
+          .addReg(FrameReg)
+          .addImm(Offset);
+
+      FrameReg = Reg;
+      Offset = 0;
+      IsKill = true;
+    } else if (!isInt<16>(Offset)) {
+      // Otherwise split the offset into 16-bit pieces and add it in multiple
+      // instructions.
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned NewImm = 0;
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+              MBB.getParent()->getSubtarget().getInstrInfo());
+      unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+                                       OffsetBitSize == 16 ? &NewImm : nullptr);
+      BuildMI(MBB, II, DL, TII.get(ABI.GetPtrAdduOp()), Reg).addReg(FrameReg)
+        .addReg(Reg, RegState::Kill);
+
+      FrameReg = Reg;
+      Offset = SignExtend64<16>(NewImm);
+      IsKill = true;
+    }
+  }
+
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
new file mode 100644
index 000000000000..ebae1909d233
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
@@ -0,0 +1,41 @@
+//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class MipsSEInstrInfo;
+
+class MipsSERegisterInfo : public MipsRegisterInfo {
+public:
+  MipsSERegisterInfo();
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
+
+private:
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 000000000000..c0de59ba15f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,674 @@
+//===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chips sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU     : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips
+//===----------------------------------------------------------------------===//
+// IIM16Alu is a placeholder class for most MIPS16 instructions.
+def IIM16Alu           : InstrItinClass;
+def IIPseudo           : InstrItinClass;
+
+def II_ABS              : InstrItinClass;
+def II_ADDI             : InstrItinClass;
+def II_ADDIU            : InstrItinClass;
+def II_ADDIUPC          : InstrItinClass;
+def II_ADD              : InstrItinClass;
+def II_ADDU             : InstrItinClass;
+def II_ADD_D            : InstrItinClass;
+def II_ADD_S            : InstrItinClass;
+def II_ALIGN            : InstrItinClass;
+def II_AND              : InstrItinClass;
+def II_ANDI             : InstrItinClass;
+def II_ALUIPC           : InstrItinClass;
+def II_AUI              : InstrItinClass;
+def II_AUIPC            : InstrItinClass;
+def II_B                : InstrItinClass;
+def II_BADDU            : InstrItinClass;
+def II_BBIT             : InstrItinClass; // bbit[01], bbit[01]32
+def II_BALC             : InstrItinClass;
+def II_BC               : InstrItinClass;
+def II_BC1F             : InstrItinClass;
+def II_BC1FL            : InstrItinClass;
+def II_BC1T             : InstrItinClass;
+def II_BC1TL            : InstrItinClass;
+def II_BC1CCZ           : InstrItinClass;
+def II_BC2CCZ           : InstrItinClass;
+def II_BCC              : InstrItinClass; // beq and bne
+def II_BCCZ             : InstrItinClass; // b[gl][et]z
+def II_BCCC             : InstrItinClass; // b<cc>c
+def II_BCCZAL           : InstrItinClass; // bgezal and bltzal
+def II_BCCZALS          : InstrItinClass; // bgezals and bltzals
+def II_BCCZC            : InstrItinClass; // beqzc, bnezc
+def II_BITSWAP          : InstrItinClass;
+def II_CEIL             : InstrItinClass;
+def II_CFC1             : InstrItinClass;
+def II_CFC2             : InstrItinClass;
+def II_CLO              : InstrItinClass;
+def II_CLZ              : InstrItinClass;
+def II_CTC1             : InstrItinClass;
+def II_CTC2             : InstrItinClass;
+def II_CVT              : InstrItinClass;
+def II_C_CC_D           : InstrItinClass; // Any c.<cc>.d instruction
+def II_C_CC_S           : InstrItinClass; // Any c.<cc>.s instruction
+def II_CMP_CC_D         : InstrItinClass; // Any cmp.<cc>.d instruction
+def II_CMP_CC_S         : InstrItinClass; // Any cmp.<cc>.s instruction
+def II_CLASS_D          : InstrItinClass;
+def II_CLASS_S          : InstrItinClass;
+def II_DADDIU           : InstrItinClass;
+def II_DADDU            : InstrItinClass;
+def II_DADDI            : InstrItinClass;
+def II_DADD             : InstrItinClass;
+def II_DAHI             : InstrItinClass;
+def II_DATI             : InstrItinClass;
+def II_DAUI             : InstrItinClass;
+def II_DALIGN           : InstrItinClass;
+def II_DBITSWAP         : InstrItinClass;
+def II_DCLO             : InstrItinClass;
+def II_DCLZ             : InstrItinClass;
+def II_DDIV             : InstrItinClass;
+def II_DDIVU            : InstrItinClass;
+def II_DIV              : InstrItinClass;
+def II_DIVU             : InstrItinClass;
+def II_DIV_D            : InstrItinClass;
+def II_DIV_S            : InstrItinClass;
+def II_DMFC0            : InstrItinClass;
+def II_DMTC0            : InstrItinClass;
+def II_DMFC1            : InstrItinClass;
+def II_DMTC1            : InstrItinClass;
+def II_DMOD             : InstrItinClass;
+def II_DMODU            : InstrItinClass;
+def II_DMUH             : InstrItinClass;
+def II_DMUHU            : InstrItinClass;
+def II_DMFC2            : InstrItinClass;
+def II_DMTC2            : InstrItinClass;
+def II_DMUL             : InstrItinClass;
+def II_DMULU            : InstrItinClass;
+def II_DMULT            : InstrItinClass;
+def II_DMULTU           : InstrItinClass;
+def II_DROTR            : InstrItinClass;
+def II_DROTR32          : InstrItinClass;
+def II_DROTRV           : InstrItinClass;
+def II_DSLL             : InstrItinClass;
+def II_DSLL32           : InstrItinClass;
+def II_DSLLV            : InstrItinClass;
+def II_DSRA             : InstrItinClass;
+def II_DSRA32           : InstrItinClass;
+def II_DSRAV            : InstrItinClass;
+def II_DSRL             : InstrItinClass;
+def II_DSRL32           : InstrItinClass;
+def II_DSRLV            : InstrItinClass;
+def II_DSBH             : InstrItinClass;
+def II_DSHD             : InstrItinClass;
+def II_DSUBU            : InstrItinClass;
+def II_DSUB             : InstrItinClass;
+def II_EXT              : InstrItinClass; // Any EXT instruction
+def II_FLOOR            : InstrItinClass;
+def II_INS              : InstrItinClass; // Any INS instruction
+def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
+def II_J                : InstrItinClass;
+def II_JAL              : InstrItinClass;
+def II_JALR             : InstrItinClass;
+def II_JALR_HB          : InstrItinClass;
+def II_JALRC            : InstrItinClass;
+def II_JALRS            : InstrItinClass;
+def II_JALS             : InstrItinClass;
+def II_JIC              : InstrItinClass;
+def II_JIALC            : InstrItinClass;
+def II_JR               : InstrItinClass;
+def II_JR_HB            : InstrItinClass;
+def II_JRADDIUSP        : InstrItinClass;
+def II_JRC              : InstrItinClass;
+def II_ReturnPseudo     : InstrItinClass; // Return pseudo.
+def II_ERET             : InstrItinClass;
+def II_DERET            : InstrItinClass;
+def II_ERETNC           : InstrItinClass;
+def II_EHB              : InstrItinClass;
+def II_SDBBP            : InstrItinClass;
+def II_SSNOP            : InstrItinClass;
+def II_SYSCALL          : InstrItinClass;
+def II_PAUSE            : InstrItinClass;
+def II_WAIT             : InstrItinClass;
+def II_EI               : InstrItinClass;
+def II_DI               : InstrItinClass;
+def II_TEQ              : InstrItinClass;
+def II_TEQI             : InstrItinClass;
+def II_TGE              : InstrItinClass;
+def II_TGEI             : InstrItinClass;
+def II_TGEIU            : InstrItinClass;
+def II_TGEU             : InstrItinClass;
+def II_TNE              : InstrItinClass;
+def II_TNEI             : InstrItinClass;
+def II_TLT              : InstrItinClass;
+def II_TLTI             : InstrItinClass;
+def II_TLTU             : InstrItinClass;
+def II_TTLTIU           : InstrItinClass;
+def II_TLBP             : InstrItinClass;
+def II_TLBR             : InstrItinClass;
+def II_TLBWI            : InstrItinClass;
+def II_TLBWR            : InstrItinClass;
+def II_TRAP             : InstrItinClass;
+def II_BREAK            : InstrItinClass;
+def II_SYNC             : InstrItinClass;
+def II_SYNCI            : InstrItinClass;
+def II_LB               : InstrItinClass;
+def II_LBE              : InstrItinClass;
+def II_LBU              : InstrItinClass;
+def II_LBUE             : InstrItinClass;
+def II_LD               : InstrItinClass;
+def II_LDC1             : InstrItinClass;
+def II_LDC2             : InstrItinClass;
+def II_LDC3             : InstrItinClass;
+def II_LDL              : InstrItinClass;
+def II_LDR              : InstrItinClass;
+def II_LDPC             : InstrItinClass;
+def II_LDXC1            : InstrItinClass;
+def II_LH               : InstrItinClass;
+def II_LHE              : InstrItinClass;
+def II_LHU              : InstrItinClass;
+def II_LHUE             : InstrItinClass;
+def II_LL               : InstrItinClass;
+def II_LI               : InstrItinClass;
+def II_LLD              : InstrItinClass;
+def II_LUI              : InstrItinClass;
+def II_LUXC1            : InstrItinClass;
+def II_LW               : InstrItinClass;
+def II_LWE              : InstrItinClass;
+def II_LWC1             : InstrItinClass;
+def II_LWC2             : InstrItinClass;
+def II_LWC3             : InstrItinClass;
+def II_LWM              : InstrItinClass;
+def II_LWL              : InstrItinClass;
+def II_LWLE             : InstrItinClass;
+def II_LWPC             : InstrItinClass;
+def II_LWP              : InstrItinClass;
+def II_LWR              : InstrItinClass;
+def II_LWRE             : InstrItinClass;
+def II_LWU              : InstrItinClass;
+def II_LWUPC            : InstrItinClass;
+def II_LWXC1            : InstrItinClass;
+def II_LWXS             : InstrItinClass;
+def II_LSA              : InstrItinClass;
+def II_DLSA             : InstrItinClass;
+def II_MADD             : InstrItinClass;
+def II_MADDU            : InstrItinClass;
+def II_MADD_D           : InstrItinClass;
+def II_MADD_S           : InstrItinClass;
+def II_MADDF_D          : InstrItinClass;
+def II_MADDF_S          : InstrItinClass;
+def II_MAX_D            : InstrItinClass;
+def II_MAX_S            : InstrItinClass;
+def II_MAXA_D           : InstrItinClass;
+def II_MAXA_S           : InstrItinClass;
+def II_MIN_D            : InstrItinClass;
+def II_MIN_S            : InstrItinClass;
+def II_MINA_D           : InstrItinClass;
+def II_MINA_S           : InstrItinClass;
+def II_MFC0             : InstrItinClass;
+def II_MFHC0            : InstrItinClass;
+def II_MFC1             : InstrItinClass;
+def II_MFHC1            : InstrItinClass;
+def II_MFC2             : InstrItinClass;
+def II_MFHI_MFLO        : InstrItinClass; // mfhi and mflo
+def II_MOD              : InstrItinClass;
+def II_MODU             : InstrItinClass;
+def II_MOVE             : InstrItinClass;
+def II_MOVF             : InstrItinClass;
+def II_MOVF_D           : InstrItinClass;
+def II_MOVF_S           : InstrItinClass;
+def II_MOVN             : InstrItinClass;
+def II_MOVN_D           : InstrItinClass;
+def II_MOVN_S           : InstrItinClass;
+def II_MOVT             : InstrItinClass;
+def II_MOVT_D           : InstrItinClass;
+def II_MOVT_S           : InstrItinClass;
+def II_MOVZ             : InstrItinClass;
+def II_MOVZ_D           : InstrItinClass;
+def II_MOVZ_S           : InstrItinClass;
+def II_MOV_D            : InstrItinClass;
+def II_MOV_S            : InstrItinClass;
+def II_MSUB             : InstrItinClass;
+def II_MSUBU            : InstrItinClass;
+def II_MSUB_D           : InstrItinClass;
+def II_MSUB_S           : InstrItinClass;
+def II_MSUBF_D          : InstrItinClass;
+def II_MSUBF_S          : InstrItinClass;
+def II_MTC0             : InstrItinClass;
+def II_MTHC0            : InstrItinClass;
+def II_MTC1             : InstrItinClass;
+def II_MTHC1            : InstrItinClass;
+def II_MTC2             : InstrItinClass;
+def II_MTHI_MTLO        : InstrItinClass; // mthi and mtlo
+def II_MUL              : InstrItinClass;
+def II_MUH              : InstrItinClass;
+def II_MUHU             : InstrItinClass;
+def II_MULU             : InstrItinClass;
+def II_MULT             : InstrItinClass;
+def II_MULTU            : InstrItinClass;
+def II_MUL_D            : InstrItinClass;
+def II_MUL_S            : InstrItinClass;
+def II_NEG              : InstrItinClass;
+def II_NMADD_D          : InstrItinClass;
+def II_NMADD_S          : InstrItinClass;
+def II_NMSUB_D          : InstrItinClass;
+def II_NMSUB_S          : InstrItinClass;
+def II_NOR              : InstrItinClass;
+def II_NOT              : InstrItinClass;
+def II_OR               : InstrItinClass;
+def II_ORI              : InstrItinClass;
+def II_POP              : InstrItinClass;
+def II_RDHWR            : InstrItinClass;
+def II_RESTORE          : InstrItinClass;
+def II_RECIP_S          : InstrItinClass;
+def II_RECIP_D          : InstrItinClass;
+def II_RINT_S           : InstrItinClass;
+def II_RINT_D           : InstrItinClass;
+def II_ROTR             : InstrItinClass;
+def II_ROTRV            : InstrItinClass;
+def II_ROUND            : InstrItinClass;
+def II_RSQRT_S          : InstrItinClass;
+def II_RSQRT_D          : InstrItinClass;
+def II_SAVE             : InstrItinClass;
+def II_SC               : InstrItinClass;
+def II_SCD              : InstrItinClass;
+def II_SB               : InstrItinClass;
+def II_SBE              : InstrItinClass;
+def II_SD               : InstrItinClass;
+def II_SDC1             : InstrItinClass;
+def II_SDC2             : InstrItinClass;
+def II_SDC3             : InstrItinClass;
+def II_SDL              : InstrItinClass;
+def II_SDR              : InstrItinClass;
+def II_SDXC1            : InstrItinClass;
+def II_SEB              : InstrItinClass;
+def II_SEH              : InstrItinClass;
+def II_SELCCZ           : InstrItinClass;
+def II_SELCCZ_D         : InstrItinClass;
+def II_SELCCZ_S         : InstrItinClass;
+def II_SEQ_SNE          : InstrItinClass; // seq and sne
+def II_SEQI_SNEI        : InstrItinClass; // seqi and snei
+def II_SH               : InstrItinClass;
+def II_SHE              : InstrItinClass;
+def II_SLL              : InstrItinClass;
+def II_SLLV             : InstrItinClass;
+def II_SLTI_SLTIU       : InstrItinClass; // slti and sltiu
+def II_SLT_SLTU         : InstrItinClass; // slt and sltu
+def II_SQRT_D           : InstrItinClass;
+def II_SQRT_S           : InstrItinClass;
+def II_SEL_D            : InstrItinClass;
+def II_SEL_S            : InstrItinClass;
+def II_SRA              : InstrItinClass;
+def II_SRAV             : InstrItinClass;
+def II_SRL              : InstrItinClass;
+def II_SRLV             : InstrItinClass;
+def II_SUB              : InstrItinClass;
+def II_SUBU             : InstrItinClass;
+def II_SUB_D            : InstrItinClass;
+def II_SUB_S            : InstrItinClass;
+def II_SUXC1            : InstrItinClass;
+def II_SW               : InstrItinClass;
+def II_SWE              : InstrItinClass;
+def II_SWC1             : InstrItinClass;
+def II_SWC2             : InstrItinClass;
+def II_SWC3             : InstrItinClass;
+def II_SWL              : InstrItinClass;
+def II_SWLE             : InstrItinClass;
+def II_SWM              : InstrItinClass;
+def II_SWP              : InstrItinClass;
+def II_SWR              : InstrItinClass;
+def II_SWRE             : InstrItinClass;
+def II_SWXC1            : InstrItinClass;
+def II_TRUNC            : InstrItinClass;
+def II_WSBH             : InstrItinClass;
+def II_XOR              : InstrItinClass;
+def II_XORI             : InstrItinClass;
+def II_CACHE            : InstrItinClass;
+def II_PREF             : InstrItinClass;
+def II_CACHEE           : InstrItinClass;
+def II_PREFE            : InstrItinClass;
+def II_LLE              : InstrItinClass;
+def II_SCE              : InstrItinClass;
+def II_TLBINV           : InstrItinClass;
+def II_TLBINVF          : InstrItinClass;
+def II_WRPGPR           : InstrItinClass;
+def II_RDPGPR           : InstrItinClass;
+def II_DVP              : InstrItinClass;
+def II_EVP              : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
+def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
+  InstrItinData<IIM16Alu           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDIU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDIUPC         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADD             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ADDU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_AUI             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_AND             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ALUIPC          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_AUIPC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ALIGN           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BADDU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BITSWAP         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRA             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROTR            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLLV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRAV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SRLV            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROTRV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLO             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLZ             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADDIU          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADDU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADDI           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DADD            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DALIGN          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DAHI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DATI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DAUI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DBITSWAP        , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DCLO            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DCLZ            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMOD            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMODU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DSLL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSLL32          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRL32          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRA            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRA32          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSLLV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRLV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSRAV           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSUBU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSUB            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DROTR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DROTR32         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DROTRV          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSBH            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DSHD            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DCLO            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DCLZ            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_EXT             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_INS             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LUI             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVE            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVF            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN_S          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVN_D          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVT            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOVZ            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_NOR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_NOT             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_OR              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_POP             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_RDHWR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SUB             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SUBU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_XOR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ANDI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ORI             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_XORI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LB              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LBE             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LBU             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LBUE            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LH              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LHU             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LHUE            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LW              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWM             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWP             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWPC            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWL             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWLE            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWR             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWRE            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWUPC           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LD              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDL             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDR             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDPC            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LI              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LL              , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LLD             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_RESTORE         , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_SB              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SH              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SHE             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SW              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWM             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWP             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDR             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SD              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SC              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SCD             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SAVE            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SELCCZ_S        , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SELCCZ_D        , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SEQ_SNE         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SEQI_SNEI       , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLTI_SLTIU      , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SLT_SLTU        , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_B               , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BALC            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BBIT            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC1F            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC1FL           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC1T            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC1TL           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC1CCZ          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BC2CCZ          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCC             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCCC            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCCZ            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCCZAL          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCCZALS         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BCCZC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLASS_D         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CLASS_S         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_IndirectBranchPseudo, [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_J               , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JAL             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JALR            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JALR_HB         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JALRC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JALRS           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JALS            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JIC             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JIALC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JR              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JR_HB           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JRADDIUSP       , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_JRC             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ReturnPseudo    , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIPseudo           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMUH            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMUHU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_ERET            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DERET           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ERETNC          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_EHB             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDBBP           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SSNOP           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SYSCALL         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_PAUSE           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_WAIT            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_EI              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DI              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TEQ             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TEQI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TGE             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TGEI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TGEIU           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TGEU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TNE             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TNEI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLT             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLTI            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLTU            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TTLTIU          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBP            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBR            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBWI           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBWR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TRAP            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_BREAK           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SYNC            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SYNCI           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMUL            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMULT           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMULTU          , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DMULU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MADD            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MADDU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MFHI_MFLO       , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<II_MAX_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MAX_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MAXA_D          , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MAXA_S          , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MIN_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MIN_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MINA_S          , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MINA_D          , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MOD             , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_MODU            , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_MSUB            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUBU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MTHI_MTLO       , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<II_MUH             , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MUHU            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MUL             , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MULT            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MULTU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MULU            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUB            , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_MSUBU           , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<II_DIV             , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DIVU            , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DDIV            , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_DDIVU           , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<II_CEIL            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CVT             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ABS             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_FLOOR           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_NEG             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_ROUND           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TRUNC           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_MOV_D           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOV_S           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CFC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CTC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CFC2            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CTC2            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVF_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVF_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVT_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVT_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVZ_D          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MOVZ_S          , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_C_CC_S          , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_C_CC_D          , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_CMP_CC_S        , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_CMP_CC_D        , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_ADD_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_ADD_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_SUB_D           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_SUB_S           , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<II_MUL_S           , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MADD_S          , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MADDF_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MSUB_S          , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MSUBF_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_NMADD_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_NMSUB_S         , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<II_MUL_D           , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MADD_D          , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MADDF_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MSUB_D          , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_MSUBF_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_NMADD_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_NMSUB_D         , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<II_DIV_S           , [InstrStage<23, [ALU]>]>,
+  InstrItinData<II_DIV_D           , [InstrStage<36, [ALU]>]>,
+  InstrItinData<II_RECIP_D         , [InstrStage<25, [ALU]>]>,
+  InstrItinData<II_RECIP_S         , [InstrStage<13, [ALU]>]>,
+  InstrItinData<II_RSQRT_D         , [InstrStage<29, [ALU]>]>,
+  InstrItinData<II_RSQRT_S         , [InstrStage<14, [ALU]>]>,
+  InstrItinData<II_RINT_D          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_RINT_S          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SQRT_S          , [InstrStage<54, [ALU]>]>,
+  InstrItinData<II_SQRT_D          , [InstrStage<12, [ALU]>]>,
+  InstrItinData<II_SEL_D           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SEL_S           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_WSBH            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LSA             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DLSA            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LDC1            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDC2            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDC3            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWC1            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWC2            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWC3            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LDXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LUXC1           , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_LWXS            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_SDC1            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDC2            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDC3            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWC1            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWC2            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWC3            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SDXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SWXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SUXC1           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DMFC0           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMFC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMFC2           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMTC0           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMTC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_DMTC2           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFC0            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFHC0           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFC2            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTC0            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTHC0           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTC1            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTC2            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MFHC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_MTHC1           , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<II_CACHE           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_PREF            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_CACHEE          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_PREFE           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBINV          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_TLBINVF         , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_LLE             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<II_SCE             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_WRPGPR          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_RDPGPR          , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_DVP             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_EVP             , [InstrStage<1,  [ALU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
new file mode 100644
index 000000000000..15a0401b781e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -0,0 +1,1048 @@
+//=- MipsScheduleGeneric.td - Generic Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the interAptiv processor in a manner of speaking. It
+// describes a hypothetical version of the in-order MIPS32R2 interAptiv with all
+// branches of the MIPS ISAs, ASEs and ISA variants. The itinerary lists are
+// broken down into per ISA lists, so that this file can be used to rapidly
+// develop new schedule models.
+//
+//===----------------------------------------------------------------------===//
+def MipsGenericModel : SchedMachineModel {
+  int IssueWidth = 1;
+  int MicroOpBufferSize = 0;
+
+  // These figures assume an L1 hit.
+  int LoadLatency = 2;
+  int MispredictPenalty = 4;
+
+  int HighLatency = 37;
+  list<Predicate> UnsupportedFeatures = [];
+
+  let CompleteModel = 1;
+  let PostRAScheduler = 1;
+}
+
+let SchedModel = MipsGenericModel in {
+
+// ALU Pipeline
+// ============
+
+def GenericALU : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueALU : ProcResource<1> { let Super = GenericALU; }
+
+def GenericWriteALU : SchedWriteRes<[GenericIssueALU]>;
+
+// and, lui, nor, or, slti, sltiu, sub, subu, xor
+// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
+// xori
+def : ItinRW<[GenericWriteALU], [II_ADD, II_ADDU, II_ADDI, II_ADDIU, II_ANDI,
+                                 II_AND, II_ANDI, II_CLO, II_CLZ, II_EXT,
+                                 II_INS, II_LUI, II_MULT, II_MULTU, II_NOR,
+                                 II_ORI, II_OR, II_ROTR, II_ROTRV, II_SEB,
+                                 II_SEH, II_SLTI_SLTIU, II_SLT_SLTU, II_SLL,
+                                 II_SRA, II_SRL, II_SLLV, II_SRAV, II_SRLV,
+                                 II_SSNOP, II_SUB, II_SUBU, II_WSBH, II_XOR,
+                                 II_XORI]>;
+
+def : InstRW<[GenericWriteALU], (instrs COPY)>;
+
+def GenericMDU : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueMDU : ProcResource<1> { let Super = GenericALU; }
+def GenericIssueDIV : ProcResource<1> { let Super = GenericMDU; }
+def GenericWriteHILO : SchedWriteRes<[GenericIssueMDU]>;
+def GenericWriteALULong : SchedWriteRes<[GenericIssueALU]> { let Latency = 5; }
+def GenericWriteMove : SchedWriteRes<[GenericIssueALU]> { let Latency = 2; }
+
+def : ItinRW<[GenericWriteHILO], [II_MADD, II_MADDU, II_MSUB, II_MSUBU]>;
+
+def GenericWriteMDUtoGPR : SchedWriteRes<[GenericIssueMDU]> {
+  let Latency = 5;
+}
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_MUL]>;
+
+def GenericWriteDIV : SchedWriteRes<[GenericIssueDIV]> {
+  // Estimated worst case
+  let Latency = 33;
+  let ResourceCycles = [1, 33];
+}
+def GenericWriteDIVU : SchedWriteRes<[GenericIssueDIV]> {
+  // Estimated worst case
+  let Latency = 31;
+  let ResourceCycles = [1, 31];
+}
+
+def : ItinRW<[GenericWriteDIV], [II_DIV]>;
+
+def : ItinRW<[GenericWriteDIVU], [II_DIVU]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericWriteALU], [II_DADDIU, II_DADDU, II_DADDI, II_DADD,
+                                 II_DCLO, II_DCLZ, II_DROTR, II_DROTR32,
+                                 II_DROTRV, II_DSBH, II_DSHD, II_DSLL,
+                                 II_DSLL32, II_DSLLV, II_DSRA, II_DSRA32,
+                                 II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV,
+                                 II_DSUBU, II_DSUB]>;
+
+def : ItinRW<[GenericWriteDIV], [II_DDIV]>;
+
+def : ItinRW<[GenericWriteDIVU], [II_DDIVU]>;
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUL]>;
+
+def : ItinRW<[GenericWriteHILO], [II_DMULU, II_DMULT, II_DMULTU]>;
+
+// MIPS16e
+// =======
+
+def : ItinRW<[GenericWriteALU], [IIM16Alu, IIPseudo]>;
+
+// microMIPS
+// =========
+
+def : ItinRW<[GenericWriteALU], [II_MOVE, II_LI, II_NOT]>;
+
+// MIPSR6
+// ======
+
+def GenericWriteMul : SchedWriteRes<[GenericIssueMDU]> { let Latency = 4; }
+def : ItinRW<[GenericWriteMul], [II_MUH, II_MUHU, II_MULU]>;
+
+def : ItinRW<[GenericWriteDIV], [II_MOD, II_MODU]>;
+
+def : ItinRW<[GenericWriteALU], [II_ADDIUPC, II_ALIGN, II_ALUIPC, II_AUI,
+                                 II_AUIPC, II_BITSWAP, II_LSA, II_SELCCZ]>;
+
+// MIPS64R6
+// ========
+
+def : ItinRW<[GenericWriteALU], [II_DALIGN, II_DAHI, II_DATI, II_DAUI,
+                               II_DBITSWAP, II_DLSA]>;
+
+def : ItinRW<[GenericWriteMDUtoGPR], [II_DMUH, II_DMUHU]>;
+def : ItinRW<[GenericWriteDIV], [II_DMOD, II_DMODU]>;
+
+// clo, clz, di, mfhi, mflo
+def : ItinRW<[GenericWriteALULong], [II_MFHI_MFLO]>;
+def : ItinRW<[GenericWriteALU], [II_MOVN, II_MOVZ]>;
+def : ItinRW<[GenericWriteMove], [II_MTHI_MTLO, II_RDHWR]>;
+
+
+// CTISTD Pipeline
+// ---------------
+
+def GenericIssueCTISTD : ProcResource<1> { let Super = GenericALU; }
+
+def GenericLDST : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueLDST : ProcResource<1> { let Super = GenericLDST; }
+
+def GenericWriteJump : SchedWriteRes<[GenericIssueCTISTD]>;
+def GenericWriteJumpAndLink : SchedWriteRes<[GenericIssueCTISTD]> {
+  let Latency = 2;
+}
+
+// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx,
+// jalr, jr.hb, jr, jalr.hb, jarlc, jialc
+def : ItinRW<[GenericWriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J,
+                                  II_JR, II_JR_HB, II_ERET, II_ERETNC,
+                                  II_DERET]>;
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB,
+                                         II_BC2CCZ]>;
+
+def : ItinRW<[GenericWriteJump], [II_JRC, II_JRADDIUSP]>;
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_BCCZALS, II_JALS, II_JALRS]>;
+
+// MIPSR6
+// ======
+
+def : ItinRW<[GenericWriteJumpAndLink], [II_BALC, II_JALRC, II_JIALC]>;
+
+def : ItinRW<[GenericWriteJump], [II_JIC, II_BC, II_BCCC, II_BCCZC]>;
+
+
+def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
+
+def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI,
+                                  II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE,
+                                  II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU,
+                                  II_TRAP, II_SDBBP]>;
+
+// COP0 Pipeline
+// =============
+
+def GenericCOP0 : ProcResource<1> { let BufferSize = 1; }
+
+def GenericIssueCOP0 : ProcResource<1> { let Super = GenericCOP0; }
+def GenericWriteCOP0TLB : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 4; }
+def GenericWriteCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 3; }
+def GenericReadCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 2; }
+def GnereicReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>;
+
+def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>;
+def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>;
+
+def : ItinRW<[GenericReadCOP0], [II_MFC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_MTC0]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_EVP, II_DVP]>;
+
+// MIPSR5
+// ======
+def : ItinRW<[GenericReadCOP0], [II_MFHC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_MTHC0]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericReadCOP0], [II_DMFC0]>;
+def : ItinRW<[GenericWriteCOP0], [II_DMTC0]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_RDPGPR, II_WRPGPR]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_DI, II_EI]>;
+
+def : ItinRW<[GenericWriteCOP0], [II_EHB, II_PAUSE, II_WAIT]>;
+
+def GenericCOP2 : ProcResource<1> { let BufferSize = 1; }
+def GenericWriteCOPOther : SchedWriteRes<[GenericCOP2]>;
+
+def : ItinRW<[GenericWriteCOPOther], [II_MFC2, II_MTC2, II_DMFC2, II_DMTC2]>;
+
+// LDST Pipeline
+// -------------
+
+def GenericWriteLoad : SchedWriteRes<[GenericIssueLDST]> {
+  let Latency = 2;
+}
+
+def GenericWritePref : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteSync : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteCache : SchedWriteRes<[GenericIssueLDST]> { let Latency = 5; }
+
+def GenericWriteStore : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteStoreSC : SchedWriteRes<[GenericIssueLDST]> { let Latency = 2; }
+
+def GenericWriteGPRFromBypass : SchedWriteRes<[GenericIssueLDST]> {
+  let Latency = 2;
+}
+
+def GenericWriteStoreFromOtherUnits : SchedWriteRes<[GenericIssueLDST]>;
+def GenericWriteLoadToOtherUnits : SchedWriteRes<[GenericIssueLDST]> {
+  let Latency = 0;
+}
+
+// l[bhw], l[bh]u, ll
+def : ItinRW<[GenericWriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LL,
+                                  II_LWC2, II_LWC3, II_LDC2, II_LDC3]>;
+
+// lw[lr]
+def : ItinRW<[GenericWriteLoad], [II_LWL, II_LWR]>;
+
+// MIPS64 loads
+def : ItinRW<[GenericWriteLoad], [II_LD, II_LLD, II_LWU]>;
+
+// ld[lr]
+def : ItinRW<[GenericWriteLoad], [II_LDL, II_LDR]>;
+
+// MIPS32 EVA
+def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE,
+                                  II_LLE]>;
+
+def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>;
+
+// MIPS32R6 and MIPS16e
+// ====================
+
+def : ItinRW<[GenericWriteLoad], [II_LWPC]>;
+
+// MIPS64R6
+// ====================
+
+def : ItinRW<[GenericWriteLoad], [II_LWUPC, II_LDPC]>;
+
+
+// s[bhw], sc, s[dw]c[23]
+def : ItinRW<[GenericWriteStore], [II_SB, II_SH, II_SW, II_SWC2, II_SWC3,
+                                   II_SDC2, II_SDC3]>;
+
+def : ItinRW<[GenericWriteStoreSC], [II_SC]>;
+
+// PreMIPSR6 sw[lr]
+def : ItinRW<[GenericWriteStore], [II_SWL, II_SWR]>;
+
+// EVA ASE stores
+def : ItinRW<[GenericWriteStore], [II_SBE, II_SHE, II_SWE, II_SCE]>;
+
+def : ItinRW<[GenericWriteStore], [II_SWLE, II_SWRE]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericWriteStore], [II_SD, II_SCD]>;
+
+// PreMIPSR6 stores
+// ================
+
+def : ItinRW<[GenericWriteStore], [II_SDL, II_SDR]>;
+
+// MIPS16e
+// =======
+
+def : ItinRW<[GenericWriteLoad], [II_RESTORE]>;
+
+def : ItinRW<[GenericWriteStore], [II_SAVE]>;
+
+// microMIPS
+// =========
+
+def : ItinRW<[GenericWriteLoad], [II_LWM, II_LWP, II_LWXS]>;
+
+def : ItinRW<[GenericWriteStore], [II_SWM, II_SWP]>;
+
+// pref
+def : ItinRW<[GenericWritePref], [II_PREF]>;
+
+def : ItinRW<[GenericWritePref], [II_PREFE]>;
+
+// cache
+def : ItinRW<[GenericWriteCache], [II_CACHE]>;
+
+def : ItinRW<[GenericWriteCache], [II_CACHEE]>;
+
+// sync
+def : ItinRW<[GenericWriteSync], [II_SYNC]>;
+
+def : ItinRW<[GenericWriteSync], [II_SYNCI]>;
+
+// FPU Pipelines
+// =============
+
+def GenericFPQ : ProcResource<1> { let BufferSize = 1; }
+def GenericIssueFPUS : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUL : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPULoad : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUStore : ProcResource<1> { let Super = GenericFPQ; }
+def GenericIssueFPUMove : ProcResource<1> { let Super = GenericFPQ; }
+def GenericFPUDivSqrt : ProcResource<1> { let Super = GenericFPQ; }
+
+// The floating point compare of the 24k series including interAptiv has a
+// listed latency of 1-2. Using the higher latency here.
+
+def GenericWriteFPUCmp : SchedWriteRes<[GenericIssueFPUS]> { let Latency = 2; }
+def GenericWriteFPUS : SchedWriteRes<[GenericIssueFPUS]> { let Latency = 4; }
+def GenericWriteFPUL : SchedWriteRes<[GenericIssueFPUL]> { let Latency = 5; }
+def GenericWriteFPUStore : SchedWriteRes<[GenericIssueFPUStore]> { let
+  Latency = 1;
+}
+def GenericWriteFPULoad : SchedWriteRes<[GenericIssueFPULoad]> {
+  let Latency = 2;
+}
+def GenericWriteFPUMoveFP : SchedWriteRes<[GenericIssueFPUMove]> {
+  let Latency = 4;
+}
+def GenericWriteFPUMoveGPRFPU : SchedWriteRes<[GenericIssueFPUMove]> {
+  let Latency = 2;
+}
+def GenericWriteFPUDivS : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 17;
+  let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPUDivD : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 32;
+  let ResourceCycles = [ 29 ];
+}
+def GenericWriteFPURcpS : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 13;
+  let ResourceCycles = [ 10 ];
+}
+def GenericWriteFPURcpD : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 25;
+  let ResourceCycles = [ 21 ];
+}
+def GenericWriteFPURsqrtS : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 17;
+  let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPURsqrtD : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 32;
+  let ResourceCycles = [ 29 ];
+}
+def GenericWriteFPUSqrtS : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 17;
+  let ResourceCycles = [ 14 ];
+}
+def GenericWriteFPUSqrtD : SchedWriteRes<[GenericFPUDivSqrt]> {
+  let Latency = 29;
+  let ResourceCycles = [ 29 ];
+}
+
+// Floating point compare and branch
+// ---------------------------------
+//
+// c.<cc>.[ds], bc1[tf], bc1[tf]l
+def : ItinRW<[GenericWriteFPUCmp], [II_C_CC_D, II_C_CC_S, II_BC1F, II_BC1T,
+                                    II_BC1FL, II_BC1TL]>;
+
+def : ItinRW<[GenericWriteFPUCmp], [II_CMP_CC_D, II_CMP_CC_S]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, add.[ds], neg.[ds], neg.ps, madd.s, msub.s, nmadd,s
+// nmsub.s, sub.[ds], mul.s
+
+def : ItinRW<[GenericWriteFPUS], [II_ABS, II_ADD_D, II_ADD_S, II_MADD_S,
+                                  II_MSUB_S, II_MUL_S, II_NEG, II_NMADD_S,
+                                  II_NMSUB_S, II_SUB_S, II_SUB_D]>;
+// mov[tf].[ds]
+
+def : ItinRW<[GenericWriteFPUS], [II_MOVF_S, II_MOVF_D, II_MOVT_S, II_MOVT_D]>;
+
+// MIPSR6
+// ------
+//
+// sel(eq|ne).[ds], max.[ds], maxa.[ds], min.[ds], mina.[ds], class.[ds]
+def : ItinRW<[GenericWriteFPUS], [II_SELCCZ_S, II_SELCCZ_D, II_MAX_S,
+                                  II_MAX_D, II_MAXA_S, II_MAXA_D, II_MIN_S,
+                                  II_MIN_D, II_MINA_S, II_MINA_D, II_CLASS_S,
+                                  II_CLASS_D]>;
+
+// Long Pipe
+// ----------
+//
+// nmadd.d, nmsub.d, mul.[ds], mul.ps, ceil.[wl].[sd], cvt.d.[sw], cvt.s.[dw],
+// cvt.w.[sd], cvt.[sw].ps, trunc.w.[ds], trunc.w.ps, floor.[ds],
+// round.[lw].[ds], floor.[lw].ds
+
+// madd.d, msub.dm mul.d, mul.ps, nmadd.d, nmsub.d, ceil.[wl].[sd], cvt.d.[sw],
+// cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, round.[lw].[ds], floor.[lw].ds,
+// trunc.w.[ds], trunc.w.ps,
+def : ItinRW<[GenericWriteFPUL], [II_MADD_D, II_MSUB_D, II_MUL_D, II_NMADD_D,
+                                  II_NMSUB_D, II_CEIL, II_CVT,
+                                  II_FLOOR, II_ROUND, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[GenericWriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[GenericWriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[GenericWriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[GenericWriteFPUSqrtD], [II_SQRT_D]>;
+
+// rsqrt.[ds], recip.[ds]
+def : ItinRW<[GenericWriteFPURcpS], [II_RECIP_S, II_RSQRT_S]>;
+def : ItinRW<[GenericWriteFPURcpD], [II_RECIP_D, II_RSQRT_D]>;
+
+// MIPSR6
+// ======
+//
+// rint.[ds]
+def : ItinRW<[GenericWriteFPUL], [II_RINT_S, II_RINT_D]>;
+
+// Load Pipe
+// ---------
+
+// ctc1, mtc1, mthc1, cfc1, mfc1, mfhc1
+def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_CFC1, II_CTC1, II_MFC1, II_MFHC1,
+                                           II_MTC1, II_MTHC1]>;
+
+// swc1, swxc1
+def : ItinRW<[GenericWriteFPUStore], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
+                                      II_SWXC1]>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[GenericWriteFPUMoveFP], [II_MOV_D, II_MOV_S, II_MOVF, II_MOVT,
+                                       II_MOVN_D, II_MOVN_S, II_MOVZ_D,
+                                       II_MOVZ_S]>;
+
+// l[dw]x?c1
+def : ItinRW<[GenericWriteFPULoad], [II_LDC1, II_LDXC1, II_LUXC1, II_LWC1,
+                                     II_LWXC1]>;
+
+// MIPS64
+// ======
+
+def : ItinRW<[GenericWriteFPUMoveGPRFPU], [II_DMFC1, II_DMTC1]>;
+
+// MIPSR6
+// ======
+
+def : ItinRW<[GenericWriteFPUS], [II_MADDF_S, II_MSUBF_S]>;
+
+def : ItinRW<[GenericWriteFPUS], [II_MADDF_D, II_MSUBF_D]>;
+
+def : ItinRW<[GenericWriteFPUCmp], [II_BC1CCZ, II_SEL_D, II_SEL_S]>;
+
+// Cavium Networks MIPS (cnMIPS) - Octeon, HasCnMips
+// =================================================
+
+def : ItinRW<[GenericWriteALU], [II_SEQ_SNE, II_SEQI_SNEI, II_POP, II_BADDU,
+                                 II_BBIT]>;
+
+// MIPS DSP ASE, HasDSP
+// ====================
+
+def GenericDSP : ProcResource<1> { let BufferSize = 1; }
+def GenericDSPShort : SchedWriteRes<[GenericDSP]> { let Latency = 2; }
+def GenericDSPLong : SchedWriteRes<[GenericDSP]> { let Latency = 6; }
+def GenericDSPBypass : SchedWriteRes<[GenericDSP]> { let Latency = 1; }
+def GenericDSPMTHILO : SchedWriteRes<[GenericDSP]> { let Latency = 5; }
+def GenericDSPLoad : SchedWriteRes<[GenericDSP]> { let Latency = 4; }
+def GenericDSPMTHLIP : SchedWriteRes<[GenericDSP]> { let Latency = 5; }
+
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_RS_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_R_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_S_H$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTRV_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_RS_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_R_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_S_H$")>;
+def : InstRW<[GenericDSPLong], (instregex "^EXTR_W$")>;
+def : InstRW<[GenericDSPLong], (instregex "^INSV$")>;
+
+def : InstRW<[GenericDSPMTHLIP], (instregex "^MTHLIP$")>;
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTHI_DSP$")>;
+def : InstRW<[GenericDSPMTHILO], (instregex "^MTLO_DSP$")>;
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDSC$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDWC$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BITREV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_EQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LE_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LT_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_SA_L_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_SA_L_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDPV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LBUX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LHX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LWX$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADDU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADD_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MODSUB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUB_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSAQ_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULTU_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULT_DSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PACKRL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBLA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBRA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBLA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBL$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBRA$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBR$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQU_S_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_RS_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RADDU_W_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RDDSP$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILOV$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILO$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^WRDSP$")>;
+
+// MIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips
+// ===========================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^APPEND$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BALIGN$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_EQ_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LE_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LT_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_SA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAX_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPS_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_S_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_SA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSX_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSA_W_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_QB_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_R_PH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PREPEND$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_W$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_PH$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_QB$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB$")>;
+
+// microMIPS DSP R1 - HasDSP, InMicroMips
+// ======================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDSC_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDWC_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BITREV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_EQ_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LE_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGU_LT_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_EQ_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LE_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPU_LT_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_EQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LE_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMP_LT_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_SA_L_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAU_H_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_SA_L_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSU_H_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDPV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPDP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTPV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTRV_RS_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTRV_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTRV_S_H_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTRV_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTR_RS_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTR_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTR_S_H_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^EXTR_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^INSV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LBUX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LHX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^LWX_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADDU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MADD_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_SA_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MAQ_S_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFHI_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MFLO_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MODSUB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVEP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVN_I_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MOVZ_I_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUBU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MSUB_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MTHI_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MTHLIP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MTLO_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEQ_S_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULEU_S_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSAQ_S_W_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULTU_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULT_DSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PACKRL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PICK_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBLA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBRA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQU_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEQ_W_PHR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBLA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBL_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBRA_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECEU_PH_QBR_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQU_S_QB_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_PH_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_QB_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECRQ_RS_PH_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RADDU_W_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^RDDSP_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^REPL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILOV_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHILO_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLLV_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHLL_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_PH_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQ_S_W_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_QB_MM$")>;
+def : InstRW<[GenericDSPShort], (instregex "^WRDSP_MM$")>;
+
+
+// microMIPS DSP R2 - hasDSP, HasDSPR2, InMicroMips
+// ================================================
+
+def : InstRW<[GenericDSPShort], (instregex "^ABSQ_S_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_R_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDQH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDUH_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^ADDU_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^APPEND_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^BALIGN_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_EQ_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LE_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^CMPGDU_LT_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_SA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAQX_S_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPAX_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPS_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_S_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSQX_SA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^DPSX_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MUL_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_RS_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULQ_S_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^MULSA_W_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_QB_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_PH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PRECR_SRA_R_PH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^PREPEND_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRA_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRAV_R_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRL_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SHRLV_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBQH_R_W_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBU_S_PH_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_QB_MMR2$")>;
+def : InstRW<[GenericDSPShort], (instregex "^SUBUH_R_QB_MMR2$")>;
+
+// microMIPS DSP R3 - hasDSP, hasDSPR2, hasDSPR3, InMicroMips
+// ==========================================================
+
+def : InstRW<[GenericDSPShort], (instregex "^BPOSGE32C_MMR3$")>;
+
+// MIPS MSA ASE - hasMSA
+// =====================
+
+def GenericWriteMSAShortLogic : SchedWriteRes<[GenericIssueFPUS]>;
+def GenericWriteMSAShortInt : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 2;
+}
+def GenericWriteMoveOtherUnitsToFPU : SchedWriteRes<[GenericIssueFPUS]>;
+def GenericWriteMSAOther3 : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 3;
+}
+def GenericWriteMSALongInt : SchedWriteRes<[GenericIssueFPUS]> {
+let Latency = 5;
+}
+def GenericWriteFPUDivI : SchedWriteRes<[GenericFPQ]> {
+  let Latency = 33;
+  let ResourceCycles = [ 33 ];
+}
+
+// FPUS is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def GenericWriteMoveFPUSToOtherUnits : SchedWriteRes<[GenericIssueFPUS]> {
+  let Latency = 0;
+}
+
+// FPUL is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def GenericWriteMoveFPULToOtherUnits : SchedWriteRes<[GenericIssueFPUL]>;
+
+
+// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd]
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+// add.[bhwd], addvi.[bhwd], asub_[us].[bhwd], ave.[bhwd], aver.[bhwd]
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd], xor.v, nor.v, xori.b, nori.b
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
+// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
+def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// bnz.[bhwdv], cfcmsa, ctcmsa
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(BNZ|BZ)_[BHWDV]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^C(F|T)CMSA$")>;
+
+// shf.[bhw], fill[bhwd], splat?.[bhwd]
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SHF_[BHW]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^FILL_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
+def : InstRW<[GenericWriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// fexp2_w, fexp2_d
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXP2_(W|D)$")>;
+
+// compare, converts, round to int, floating point truncate.
+def : InstRW<[GenericWriteFPUS], (instregex "^(CLT|CLTI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^(CLE|CLEI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^(CEQ|CEQI)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_UN_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_UEQ_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_EQ_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LT_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULT_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_LE_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^CMP_ULE_(S|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSULE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSULT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FSUN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCAF_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCOR_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUEQ_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCULE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCULT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUNE_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FCUN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FABS_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFINT_(U|S)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFQL_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FFQR_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTINT_(U|S)_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FRINT_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTQ_(H|W)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FTRUNC_(U|S)_(W|D)$")>;
+
+// fexdo.[hw], fexupl.[wd], fexupr.[wd]
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXDO_(H|W)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXUPL_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FEXUPR_(W|D)$")>;
+
+// fclass.[wd], fmax.[wd], fmax_a.[wd], fmin.[wd], fmin_a.[wd], flog2.[wd]
+def : InstRW<[GenericWriteFPUS], (instregex "^FCLASS_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMAX_A_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMAX_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMIN_A_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FMIN_(W|D)$")>;
+def : InstRW<[GenericWriteFPUS], (instregex "^FLOG2_(W|D)$")>;
+
+// interleave right/left, interleave even/odd, insert
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd],
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBSUS_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBSUU_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBVI_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortInt], (instregex "^SUBV_[BHWD]$")>;
+
+// mod_[su].[bhwd], div_[su].[bhwd]
+def : InstRW<[GenericWriteFPUDivI], (instregex "^MOD_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteFPUDivI], (instregex "^DIV_(S|U)_[BHWD]$")>;
+
+// hadd_[su].[bhwd], hsub_[su].[bhwd], max_[sua].[bhwd], min_[sua].[bhwd],
+// maxi_[su].[bhwd], mini_[su].[bhwd], sra?.[bhwd], srar?.[bhwd], srlr.[bhwd],
+// sll?.[bhwd], pckev.[bhwd], pckod.[bhwd], nloc.[bhwd], nlzc.[bhwd],
+// insve.[bhwd]
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^HADD_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^HSUB_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_S_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_U_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(MAX|MIN)_A_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic],
+             (instregex "^(MAXI|MINI)_(S|U)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRA|SRAI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRL|SRLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRAR|SRARI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
+def : InstRW<[GenericWriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// dpadd_?.[bhwd], dpsub_?.[bhwd], dotp_?.[bhwd], msubv.[bhwd], maddv.[bhwd]
+// mulv.[bhwd].
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DPADD_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DPSUB_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^DOTP_(S|U)_[HWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUBV_[BHWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADDV_[BHWD]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MULV_[BHWD]$")>;
+
+// madd?.q.[hw], msub?.q.[hw], mul?.q.[hw]
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADDR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MADD_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUBR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MSUB_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MULR_Q_[HW]$")>;
+def : InstRW<[GenericWriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
+
+// fadd.[dw], fmadd.[dw], fmul.[dw], frcp.[dw], frsqrt.[dw], fsqrt.[dw]
+// fsub.[dw], fdiv.[dw]
+def : InstRW<[GenericWriteFPUL], (instregex "^FADD_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMADD_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMSUB_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FMUL_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FRCP_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FRSQRT_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FSQRT_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FSUB_[DW]$")>;
+def : InstRW<[GenericWriteFPUL], (instregex "^FDIV_[DW]$")>;
+
+// copy.[su]_[bhwd]
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_U_[BHW]$")>;
+def : InstRW<[GenericWriteFPUMoveGPRFPU], (instregex "^COPY_S_[BHWD]$")>;
+
+def : InstRW<[GenericWriteFPUStore], (instregex "^ST_[BHWD]$")>;
+def : InstRW<[GenericWriteFPULoad], (instregex "^LD_[BHWD]$")>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
new file mode 100644
index 000000000000..882a241d1426
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -0,0 +1,586 @@
+//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def MipsP5600Model : SchedMachineModel {
+  int IssueWidth = 2; // 2x dispatched per cycle
+  int MicroOpBufferSize = 48; // min(48, 48, 64)
+  int LoadLatency = 4;
+  int MispredictPenalty = 8; // TODO: Estimated
+
+  let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = [HasMips32r6, HasMips64r6,
+                                         HasMips64, HasMips64r2, HasCnMips,
+                                         InMicroMips, InMips16Mode,
+                                         HasMicroMips32r6, HasMicroMips64r6,
+                                         HasDSP, HasDSPR2];
+
+}
+
+let SchedModel = MipsP5600Model in {
+
+// ALQ Pipelines
+// =============
+
+def P5600ALQ : ProcResource<1> { let BufferSize = 16; }
+def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; }
+
+// ALU Pipeline
+// ------------
+
+def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>;
+
+// and, lui, nor, or, slti, sltiu, sub, subu, xor
+def : ItinRW<[P5600WriteALU],
+             [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUB, II_SUBU,
+              II_XOR]>;
+
+// AGQ Pipelines
+// =============
+
+def P5600AGQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; }
+def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; }
+
+def P5600AL2Div : ProcResource<1>;
+// Pseudo-resource used to block CTISTD when handling multi-pipeline splits.
+def P5600CTISTD : ProcResource<1>;
+
+// CTISTD Pipeline
+// ---------------
+
+def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> {
+  let Latency = 2;
+}
+
+// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal,
+// jalr, jr.hb, jr
+def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR,
+                                II_JR_HB, II_DERET, II_ERET, II_ERETNC, 
+                                II_SYSCALL, II_BREAK, II_SDBBP, II_SSNOP,
+                                II_TEQ, II_TEQI, II_TGE, II_TGEI, II_TGEIU,
+                                II_TGEU, II_TLT, II_TLTI, II_TLTU, II_TNE,
+                                II_TNEI, II_TRAP, II_TTLTIU, II_WAIT,
+                                II_PAUSE]>;
+
+def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR, II_JALR_HB]>;
+
+def P5600COP0 : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>;
+
+def : ItinRW<[P5600COP0], [II_TLBINV, II_TLBINVF, II_TLBP, II_TLBR, II_TLBWI,
+                           II_TLBWR, II_MFC0, II_MTC0]>;
+// LDST Pipeline
+// -------------
+
+def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 4;
+}
+
+def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+  let Latency = 4;
+}
+
+def P5600WriteCache : SchedWriteRes<[P5600IssueLDST]>;
+
+def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> {
+  // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2
+  //        not during 0, 1, and 2.
+  let ResourceCycles = [ 1, 3 ];
+}
+
+def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 2;
+}
+
+def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>;
+def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 0;
+}
+
+// l[bhw], l[bh]u, ll
+def : ItinRW<[P5600WriteLoad], [II_LB, II_LBE, II_LBU, II_LBUE, II_LH, II_LHE,
+                                II_LHU, II_LHUE, II_LW, II_LWE, II_LL, II_LLE,
+                                II_LWPC]>;
+
+// lw[lr]
+def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWLE, II_LWR, II_LWRE]>;
+
+// s[bhw], sw[lr]
+def : ItinRW<[P5600WriteStore], [II_SB, II_SBE, II_SH, II_SHE, II_SW, II_SWE,
+                                 II_SWL, II_SWLE, II_SWR, II_SWRE, II_SC,
+                                 II_SCE]>;
+
+// pref, cache, sync, synci
+def : ItinRW<[P5600WriteCache], [II_PREF, II_PREFE, II_CACHE, II_CACHEE,
+                                 II_SYNC, II_SYNCI]>;
+
+// LDST is also used in moves from general purpose registers to floating point
+// and MSA.
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> {
+  let Latency = 0;
+}
+
+// AL2 Pipeline
+// ------------
+
+def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>;
+def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; }
+def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 2;
+}
+def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> {
+  // Estimated worst case
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; }
+def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; }
+def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> {
+  let Latency = 5;
+}
+
+// clo, clz, di, ei, mfhi, mflo
+def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_DI, II_EI, II_MFHI_MFLO]>;
+
+// ehb, rdhwr, rdpgpr, wrpgpr, wsbh
+def : ItinRW<[P5600WriteAL2ShadowMov], [II_EHB, II_RDHWR, II_WSBH]>;
+
+// mov[nz]
+def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>;
+
+// divu?
+def : ItinRW<[P5600WriteAL2Div], [II_DIV]>;
+def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>;
+
+// mul
+def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>;
+// multu?, multu?
+def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>;
+// maddu?, msubu?, mthi, mtlo
+def : ItinRW<[P5600WriteAL2MAdd],
+             [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>;
+
+// ext, ins
+def : ItinRW<[P5600WriteAL2BitExt], [II_EXT, II_INS]>;
+
+// Either ALU or AL2 Pipelines
+// ---------------------------
+//
+// Some instructions can choose between ALU and AL2, but once dispatched to
+// ALQ or AGQ respectively they are committed to that path.
+// The decision is based on the outcome of the most recent selection when the
+// choice was last available. For now, we assume ALU is always chosen.
+
+def P5600WriteEitherALU : SchedWriteVariant<
+  // FIXME: Implement selection predicate
+  [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>,
+   SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]>
+  ]>;
+
+// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu,
+// xori
+def : ItinRW<[P5600WriteEitherALU],
+             [II_ADD, II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH,
+              II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV,
+              II_SRAV, II_SRLV, II_LSA]>;
+def : InstRW<[], (instrs COPY)>;
+
+// FPU Pipelines
+// =============
+
+def P5600FPQ : ProcResource<3> { let BufferSize = 16; }
+def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; }
+def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; }
+
+def P5600FPUDivSqrt : ProcResource<2>;
+
+def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; }
+def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; }
+def P5600WriteFPUDivI : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 23 / 27
+  let Latency = 23; // Using common case
+  let ResourceCycles = [ 1, 23 ];
+}
+def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 23 / 27
+  let Latency = 23; // Using common case
+  let ResourceCycles = [ 1, 23 ];
+}
+def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 31 / 35
+  let Latency = 31; // Using common case
+  let ResourceCycles = [ 1, 31 ];
+}
+def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 19 / 23
+  let Latency = 19; // Using common case
+  let ResourceCycles = [ 1, 19 ];
+}
+def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 27 / 31
+  let Latency = 27; // Using common case
+  let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 27 / 27
+  let Latency = 27; // Using common case
+  let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 27 / 31
+  let Latency = 27; // Using common case
+  let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 27 / 31
+  let Latency = 27; // Using common case
+  let ResourceCycles = [ 1, 27 ];
+}
+def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> {
+  // Best/Common/Worst case = 7 / 35 / 39
+  let Latency = 35; // Using common case
+  let ResourceCycles = [ 1, 35 ];
+}
+def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; }
+def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>;
+def P5600WriteMSAOther3 : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 3; }
+def P5600WriteMSALongInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 5; }
+
+// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
+// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
+def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// bnz.[bhwdv], cfcmsa, ctcmsa
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(BNZ|BZ)_[BHWDV]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^C(F|T)CMSA$")>;
+
+// FPUS is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> {
+  let Latency = 0;
+}
+
+// FPUL is also used in moves from floating point and MSA registers to general
+// purpose registers.
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>;
+
+// Short Pipe
+// ----------
+//
+// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1,
+// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b,
+// sdxc1, sdc1, st.[bhwd], swc1, swxc1
+def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D,
+                                II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>;
+
+// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd],
+// aver?_[us].[bhwd], shf.[bhw], fill[bhwd], splat?.[bhwd]
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>;
+// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it.
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SHF_[BHW]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^FILL_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SPLAT|SPLATI)_[BHWD]$")>;
+
+// and.v, andi.b, move.v, ldi.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>;
+
+// vshf.[bhwd], binsl.[bhwd], binsr.[bhwd], insert.[bhwd], sld?.[bhwd],
+// bset.[bhwd], bclr.[bhwd], bneg.[bhwd], bsel_v, bseli_b
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^VSHF_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSL|BINSLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BINSR|BINSRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^INSERT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(SLD|SLDI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSET|BSETI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BCLR|BCLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BNEG|BNEGI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^(BSEL_V|BSELI_B)$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^BMN*Z.*$")>;
+
+// pcnt.[bhwd], sat_s.[bhwd], sat_u.bhwd]
+def : InstRW<[P5600WriteMSAOther3], (instregex "^PCNT_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAOther3], (instregex "^SAT_(S|U)_[BHWD]$")>;
+
+// fexp2_w, fexp2_d
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXP2_(W|D)$")>;
+
+// compare, converts, round to int, floating point truncate.
+def : InstRW<[P5600WriteFPUS], (instregex "^(CLT|CLTI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^(CLE|CLEI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^(CEQ|CEQI)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_UN_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_UEQ_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_EQ_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_LT_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_ULT_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_LE_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^CMP_ULE_(S|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FS(AF|EQ|LT|LE|NE|OR)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSULE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSULT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FSUN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCAF_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCOR_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUEQ_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCULE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCULT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUNE_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FCUN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FABS_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFINT_(U|S)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFQL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FFQR_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTINT_(U|S)_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FRINT_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTQ_(H|W)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FTRUNC_(U|S)_(W|D)$")>;
+
+// fexdo.[hw], fexupl.[wd], fexupr.[wd]
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXDO_(H|W)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXUPL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FEXUPR_(W|D)$")>;
+
+// fclass.[wd], fmax.[wd], fmax_a.[wd], fmin.[wd], fmin_a.[wd], flog2.[wd]
+def : InstRW<[P5600WriteFPUS], (instregex "^FCLASS_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMAX_A_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMAX_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMIN_A_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FMIN_(W|D)$")>;
+def : InstRW<[P5600WriteFPUS], (instregex "^FLOG2_(W|D)$")>;
+
+// interleave right/left, interleave even/odd, insert
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(ILVR|ILVL)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(ILVEV|ILVOD)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// subs_?.[bhwd], subsus_?.[bhwd], subsuu_?.[bhwd], subvi.[bhwd], subv.[bhwd],
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBS_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBSUS_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBSUU_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBVI_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortInt], (instregex "^SUBV_[BHWD]$")>;
+
+// mod_[su].[bhwd], div_[su].[bhwd]
+def : InstRW<[P5600WriteFPUDivI], (instregex "^MOD_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteFPUDivI], (instregex "^DIV_(S|U)_[BHWD]$")>;
+
+// hadd_[su].[bhwd], hsub_[su].[bhwd], max_[sua].[bhwd], min_[sua].[bhwd],
+// maxi_[su].[bhwd], mini_[su].[bhwd], sra?.[bhwd], srar?.[bhwd], srlr.[bhwd],
+// sll?.[bhwd], pckev.[bhwd], pckod.[bhwd], nloc.[bhwd], nlzc.[bhwd],
+// insve.[bhwd]
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^HADD_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^HSUB_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_S_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_U_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAX|MIN)_A_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(MAXI|MINI)_(S|U)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRA|SRAI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRL|SRLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRAR|SRARI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SRLR|SRLRI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(SLL|SLLI)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(PCKEV|PCKOD)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(NLOC|NLZC)_[BHWD]$")>;
+def : InstRW<[P5600WriteMSAShortLogic], (instregex "^INSVE_[BHWD]$")>;
+
+// Long Pipe
+// ----------
+//
+// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps,
+// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps,
+// trunc.w.[ds], trunc.w.ps
+def : ItinRW<[P5600WriteFPUL],
+             [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D,
+              II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>;
+
+// div.[ds], div.ps
+def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>;
+def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>;
+
+// sqrt.[ds], sqrt.ps
+def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>;
+def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>;
+
+// frcp.[wd], frsqrt.[wd]
+def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRCP_(W|D)$")>;
+def : InstRW<[P5600WriteFPURsqrtD], (instregex "^FRSQRT_(W|D)$")>;
+
+def : ItinRW<[P5600WriteFPURsqrtD], [II_RECIP_D, II_RSQRT_D]>;
+def : ItinRW<[P5600WriteFPURsqrtS], [II_RECIP_S, II_RSQRT_S]>;
+
+// fmadd.[wd], fmsubb.[wd], fdiv.[wd], fsqrt.[wd], fmul.[wd], fadd.[wd],
+// fsub.[wd]
+def : InstRW<[P5600WriteFPUL_MADDSUB], (instregex "^FMADD_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL_MADDSUB], (instregex "^FMSUB_(W|D)$")>;
+def : InstRW<[P5600WriteFPUDivS], (instregex "^FDIV_W$")>;
+def : InstRW<[P5600WriteFPUDivD], (instregex "^FDIV_D$")>;
+def : InstRW<[P5600WriteFPUSqrtS], (instregex "^FSQRT_W$")>;
+def : InstRW<[P5600WriteFPUSqrtD], (instregex "^FSQRT_D$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FMUL_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FADD_(W|D)$")>;
+def : InstRW<[P5600WriteFPUL], (instregex "^FSUB_(W|D)$")>;
+
+// dpadd_?.[bhwd], dpsub_?.[bhwd], dotp_?.[bhwd], msubv.[bhwd], maddv.[bhwd]
+// mulv.[bhwd].
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DPADD_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DPSUB_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^DOTP_(S|U)_[HWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUBV_[BHWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADDV_[BHWD]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MULV_[BHWD]$")>;
+
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADDR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MADD_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUBR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MSUB_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MULR_Q_[HW]$")>;
+def : InstRW<[P5600WriteMSALongInt], (instregex "^MUL_Q_[HW]$")>;
+
+// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds],
+// Operand 0 is read on cycle 5. All other operands are read on operand 0.
+def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB],
+             [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D,
+              II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>;
+
+// madd.ps, msub.ps, nmadd.ps, nmsub.ps
+// Operand 0 and 1 are read on cycle 5. All others are read on operand 0.
+// (none of these instructions exist in the backend yet)
+
+// Load Pipe
+// ---------
+//
+// This is typically used in conjunction with the load pipeline under the AGQ
+// All the instructions are in the 'Tricky Instructions' section.
+
+def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> {
+  let Latency = 4;
+}
+
+// Tricky Instructions
+// ===================
+//
+// These instructions are split across multiple uops (in different pipelines)
+// that must cooperate to complete the operation
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+//        current aggregates the resources and ignores the exact cycle they are
+//        used.
+def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits,
+                                            P5600WriteMoveOtherUnitsToFPU]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+//        current aggregates the resources and ignores the exact cycle they are
+//        used.
+def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                            P5600WriteGPRFromBypass]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+//        current aggregates the resources and ignores the exact cycle they are
+//        used.
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+//        current aggregates the resources and ignores the exact cycle they are
+//        used.
+def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits,
+                                         P5600WriteStoreFromOtherUnits]>;
+
+// FIXME: This isn't quite right since the implementation of WriteSequence
+//        current aggregates the resources and ignores the exact cycle they are
+//        used.
+def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits,
+                                       P5600WriteLoadOtherUnitsToFPU]>;
+
+// ctc1, mtc1, mthc1
+def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>;
+
+// copy.[su]_[bhwd]
+def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_U_[BHW]$")>;
+def : InstRW<[P5600WriteMoveFPUToGPR], (instregex "^COPY_S_[BHWD]$")>;
+
+// bc1[ft], cfc1, mfc1, mfhc1, movf, movt
+def : ItinRW<[P5600WriteMoveFPUToGPR],
+             [II_BC1F, II_BC1FL, II_BC1T, II_BC1TL, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>;
+
+// swc1, swxc1, st.[bhwd]
+def : ItinRW<[P5600WriteStoreFPUS], [II_SDC1, II_SDXC1, II_SUXC1, II_SWC1,
+                                     II_SWXC1]>;
+def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>;
+
+// movn.[ds], movz.[ds]
+def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>;
+
+// l[dw]x?c1, ld.[bhwd]
+def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1, II_LUXC1]>;
+def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>;
+
+// Unsupported Instructions
+// ========================
+//
+// The following instruction classes are never valid on P5600.
+//   II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR,
+//   II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA,
+//   II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU,
+//   II_JALRC, II_LD, II_LD[LR], II_RESTORE, II_SAVE, II_SD, II_SDC1, II_SD[LR]
+//
+// The following instructions are never valid on P5600.
+//   addq.ph, repl.ph, repl.qb, subq.ph, subu_s.qb
+//
+// Guesswork
+// =========
+//
+// This section is largely temporary guesswork.
+
+// ceil.[lw].[ds], floor.[lw].[ds]
+// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL
+def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>;
+
+// rotrv
+// Reason behind guess: rotr is in the same category and the two register forms
+//                      generally follow the immediate forms in this category
+def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 000000000000..3e7570ff46ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,174 @@
+//===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMachineFunction.h"
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MipsGenSubtargetInfo.inc"
+
+// FIXME: Maybe this should be on by default when Mips16 is specified
+//
+static cl::opt<bool>
+    Mixed16_32("mips-mixed-16-32", cl::init(false),
+               cl::desc("Allow for a mixture of Mips16 "
+                        "and Mips32 code in a single output file"),
+               cl::Hidden);
+
+static cl::opt<bool> Mips_Os16("mips-os16", cl::init(false),
+                               cl::desc("Compile all functions that don't use "
+                                        "floating point as Mips 16"),
+                               cl::Hidden);
+
+static cl::opt<bool> Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                                     cl::desc("Enable mips16 hard float."),
+                                     cl::init(false));
+
+static cl::opt<bool>
+    Mips16ConstantIslands("mips16-constant-islands", cl::NotHidden,
+                          cl::desc("Enable mips16 constant islands."),
+                          cl::init(true));
+
+static cl::opt<bool>
+    GPOpt("mgpopt", cl::Hidden,
+          cl::desc("Enable gp-relative addressing of mips small data items"));
+
+void MipsSubtarget::anchor() { }
+
+MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
+                             const std::string &FS, bool little,
+                             const MipsTargetMachine &TM)
+    : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
+      IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
+      NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
+      IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
+      HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
+      HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
+      InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
+      HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
+      Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM),
+      TargetTriple(TT), TSInfo(),
+      InstrInfo(
+          MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
+      FrameLowering(MipsFrameLowering::create(*this)),
+      TLInfo(MipsTargetLowering::create(TM, *this)) {
+
+  PreviousInMips16Mode = InMips16Mode;
+
+  if (MipsArchVersion == MipsDefault)
+    MipsArchVersion = Mips32;
+
+  // Don't even attempt to generate code for MIPS-I and MIPS-V. They have not
+  // been tested and currently exist for the integrated assembler only.
+  if (MipsArchVersion == Mips1)
+    report_fatal_error("Code generation for MIPS-I is not implemented", false);
+  if (MipsArchVersion == Mips5)
+    report_fatal_error("Code generation for MIPS-V is not implemented", false);
+
+  // Check if Architecture and ABI are compatible.
+  assert(((!isGP64bit() && isABI_O32()) ||
+          (isGP64bit() && (isABI_N32() || isABI_N64()))) &&
+         "Invalid  Arch & ABI pair.");
+
+  if (hasMSA() && !isFP64bit())
+    report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+                       "See -mattr=+fp64.",
+                       false);
+
+  if (!isABI_O32() && !useOddSPReg())
+    report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false);
+
+  if (IsFPXX && (isABI_N32() || isABI_N64()))
+    report_fatal_error("FPXX is not permitted for the N32/N64 ABI's.", false);
+
+  if (hasMips32r6()) {
+    StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+
+    assert(isFP64bit());
+    assert(isNaN2008());
+    if (hasDSP())
+      report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
+  }
+
+  if (NoABICalls && TM.isPositionIndependent())
+    report_fatal_error("position-independent code requires '-mabicalls'");
+
+  // Set UseSmallSection.
+  UseSmallSection = GPOpt;
+  if (!NoABICalls && GPOpt) {
+    errs() << "warning: cannot use small-data accesses for '-mabicalls'"
+           << "\n";
+    UseSmallSection = false;
+  }
+}
+
+bool MipsSubtarget::isPositionIndependent() const {
+  return TM.isPositionIndependent();
+}
+
+/// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool MipsSubtarget::enablePostRAScheduler() const { return true; }
+
+void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(isGP64bit() ?
+                            &Mips::GPR64RegClass : &Mips::GPR32RegClass);
+}
+
+CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
+  return CodeGenOpt::Aggressive;
+}
+
+MipsSubtarget &
+MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                               const TargetMachine &TM) {
+  std::string CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  if (InMips16Mode && !IsSoftFloat)
+    InMips16HardFloat = true;
+
+  return *this;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+  DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+  return Mips16ConstantIslands;
+}
+
+Reloc::Model MipsSubtarget::getRelocationModel() const {
+  return TM.getRelocationModel();
+}
+
+bool MipsSubtarget::isABI_N64() const { return getABI().IsN64(); }
+bool MipsSubtarget::isABI_N32() const { return getABI().IsN32(); }
+bool MipsSubtarget::isABI_O32() const { return getABI().IsO32(); }
+const MipsABIInfo &MipsSubtarget::getABI() const { return TM.getABI(); }
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 000000000000..38d3cee70477
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,316 @@
+//===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MipsFrameLowering.h"
+#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MipsGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MipsTargetMachine;
+
+class MipsSubtarget : public MipsGenSubtargetInfo {
+  virtual void anchor();
+
+  enum MipsArchEnum {
+    MipsDefault,
+    Mips1, Mips2, Mips32, Mips32r2, Mips32r3, Mips32r5, Mips32r6, Mips32Max,
+    Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6
+  };
+
+  enum class CPU { P5600 };
+
+  // Mips architecture version
+  MipsArchEnum MipsArchVersion;
+
+  // Processor implementation (unused but required to exist by
+  // tablegen-erated code).
+  CPU ProcImpl;
+
+  // IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  // IsSoftFloat - The target does not support any floating point instructions.
+  bool IsSoftFloat;
+
+  // IsSingleFloat - The target only supports single precision float
+  // point operations. This enable the target to use all 32 32-bit
+  // floating point registers instead of only using even ones.
+  bool IsSingleFloat;
+
+  // IsFPXX - MIPS O32 modeless ABI.
+  bool IsFPXX;
+
+  // NoABICalls - Disable SVR4-style position-independent code.
+  bool NoABICalls;
+
+  // IsFP64bit - The target processor has 64-bit floating point registers.
+  bool IsFP64bit;
+
+  /// Are odd single-precision registers permitted?
+  /// This corresponds to -modd-spreg and -mno-odd-spreg
+  bool UseOddSPReg;
+
+  // IsNan2008 - IEEE 754-2008 NaN encoding.
+  bool IsNaN2008bit;
+
+  // IsFP64bit - General-purpose registers are 64 bits wide
+  bool IsGP64bit;
+
+  // IsPTR64bit - Pointers are 64 bit wide
+  bool IsPTR64bit;
+
+  // HasVFPU - Processor has a vector floating point unit.
+  bool HasVFPU;
+
+  // CPU supports cnMIPS (Cavium Networks Octeon CPU).
+  bool HasCnMips;
+
+  // isLinux - Target system is Linux. Is false we consider ELFOS for now.
+  bool IsLinux;
+
+  // UseSmallSection - Small section is used.
+  bool UseSmallSection;
+
+  /// Features related to the presence of specific instructions.
+
+  // HasMips3_32 - The subset of MIPS-III instructions added to MIPS32
+  bool HasMips3_32;
+
+  // HasMips3_32r2 - The subset of MIPS-III instructions added to MIPS32r2
+  bool HasMips3_32r2;
+
+  // HasMips4_32 - Has the subset of MIPS-IV present in MIPS32
+  bool HasMips4_32;
+
+  // HasMips4_32r2 - Has the subset of MIPS-IV present in MIPS32r2
+  bool HasMips4_32r2;
+
+  // HasMips5_32r2 - Has the subset of MIPS-V present in MIPS32r2
+  bool HasMips5_32r2;
+
+  // InMips16 -- can process Mips16 instructions
+  bool InMips16Mode;
+
+  // Mips16 hard float
+  bool InMips16HardFloat;
+
+  // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
+  bool PreviousInMips16Mode;
+
+  // InMicroMips -- can process MicroMips instructions
+  bool InMicroMipsMode;
+
+  // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE.
+  bool HasDSP, HasDSPR2, HasDSPR3;
+
+  // Allow mixed Mips16 and Mips32 in one source file
+  bool AllowMixed16_32;
+
+  // Optimize for space by compiling all functions as Mips 16 unless
+  // it needs floating point. Functions needing floating point are
+  // compiled as Mips32
+  bool Os16;
+
+  // HasMSA -- supports MSA ASE.
+  bool HasMSA;
+
+  // UseTCCInDIV -- Enables the use of trapping in the assembler.
+  bool UseTCCInDIV;
+
+  // HasEVA -- supports EVA ASE.
+  bool HasEVA;
+
+  InstrItineraryData InstrItins;
+
+  // We can override the determination of whether we are in mips16 mode
+  // as from the command line
+  enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode;
+
+  const MipsTargetMachine &TM;
+
+  Triple TargetTriple;
+
+  const SelectionDAGTargetInfo TSInfo;
+  std::unique_ptr<const MipsInstrInfo> InstrInfo;
+  std::unique_ptr<const MipsFrameLowering> FrameLowering;
+  std::unique_ptr<const MipsTargetLowering> TLInfo;
+
+public:
+  bool isPositionIndependent() const;
+  /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostRAScheduler() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+  CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
+
+  bool isABI_N64() const;
+  bool isABI_N32() const;
+  bool isABI_O32() const;
+  const MipsABIInfo &getABI() const;
+  bool isABI_FPXX() const { return isABI_O32() && IsFPXX; }
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+                bool little, const MipsTargetMachine &TM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool hasMips1() const { return MipsArchVersion >= Mips1; }
+  bool hasMips2() const { return MipsArchVersion >= Mips2; }
+  bool hasMips3() const { return MipsArchVersion >= Mips3; }
+  bool hasMips4() const { return MipsArchVersion >= Mips4; }
+  bool hasMips5() const { return MipsArchVersion >= Mips5; }
+  bool hasMips4_32() const { return HasMips4_32; }
+  bool hasMips4_32r2() const { return HasMips4_32r2; }
+  bool hasMips32() const {
+    return (MipsArchVersion >= Mips32 && MipsArchVersion < Mips32Max) ||
+           hasMips64();
+  }
+  bool hasMips32r2() const {
+    return (MipsArchVersion >= Mips32r2 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
+  }
+  bool hasMips32r3() const {
+    return (MipsArchVersion >= Mips32r3 && MipsArchVersion < Mips32Max) ||
+           hasMips64r2();
+  }
+  bool hasMips32r5() const {
+    return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) ||
+           hasMips64r5();
+  }
+  bool hasMips32r6() const {
+    return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) ||
+           hasMips64r6();
+  }
+  bool hasMips64() const { return MipsArchVersion >= Mips64; }
+  bool hasMips64r2() const { return MipsArchVersion >= Mips64r2; }
+  bool hasMips64r3() const { return MipsArchVersion >= Mips64r3; }
+  bool hasMips64r5() const { return MipsArchVersion >= Mips64r5; }
+  bool hasMips64r6() const { return MipsArchVersion >= Mips64r6; }
+
+  bool hasCnMips() const { return HasCnMips; }
+
+  bool isLittle() const { return IsLittle; }
+  bool isABICalls() const { return !NoABICalls; }
+  bool isFPXX() const { return IsFPXX; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool useOddSPReg() const { return UseOddSPReg; }
+  bool noOddSPReg() const { return !UseOddSPReg; }
+  bool isNaN2008() const { return IsNaN2008bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
+  bool isPTR64bit() const { return IsPTR64bit; }
+  bool isPTR32bit() const { return !IsPTR64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool inMips16Mode() const { return InMips16Mode; }
+  bool inMips16ModeDefault() const {
+    return InMips16Mode;
+  }
+  // Hard float for mips16 means essentially to compile as soft float
+  // but to use a runtime library for soft float that is written with
+  // native mips32 floating point instructions (those runtime routines
+  // run in mips32 hard float mode).
+  bool inMips16HardFloat() const {
+    return inMips16Mode() && InMips16HardFloat;
+  }
+  bool inMicroMipsMode() const { return InMicroMipsMode; }
+  bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); }
+  bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); }
+  bool hasDSP() const { return HasDSP; }
+  bool hasDSPR2() const { return HasDSPR2; }
+  bool hasDSPR3() const { return HasDSPR3; }
+  bool hasMSA() const { return HasMSA; }
+  bool hasEVA() const { return HasEVA; }
+  bool useSmallSection() const { return UseSmallSection; }
+
+  bool hasStandardEncoding() const { return !inMips16Mode(); }
+
+  bool useSoftFloat() const { return IsSoftFloat; }
+
+  bool enableLongBranchPass() const {
+    return hasStandardEncoding() || allowMixed16_32();
+  }
+
+  /// Features related to the presence of specific instructions.
+  bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
+  bool hasMTHC1() const { return hasMips32r2(); }
+
+  bool allowMixed16_32() const { return inMips16ModeDefault() |
+                                        AllowMixed16_32; }
+
+  bool os16() const { return Os16; }
+
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+
+  // for now constant islands are on for the whole compilation unit but we only
+  // really use them if in addition we are in mips16 mode
+  static bool useConstantIslands();
+
+  unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
+  // Grab relocation model
+  Reloc::Model getRelocationModel() const;
+
+  MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                                 const TargetMachine &TM);
+
+  /// Does the system support unaligned memory access.
+  ///
+  /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
+  /// specify which component of the system provides it. Hardware, software, and
+  /// hybrid implementations are all valid.
+  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+
+  // Set helper classes
+  void setHelperClassesMips16();
+  void setHelperClassesMipsSE();
+
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return FrameLowering.get();
+  }
+  const MipsRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo->getRegisterInfo();
+  }
+  const MipsTargetLowering *getTargetLowering() const override {
+    return TLInfo.get();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 000000000000..bb48188e3b87
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,270 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "Mips.h"
+#include "Mips16FrameLowering.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips16ISelLowering.h"
+#include "Mips16InstrInfo.h"
+#include "MipsFrameLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsSEFrameLowering.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "MipsSEISelLowering.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mips"
+
+extern "C" void LLVMInitializeMipsTarget() {
+  // Register the target.
+  RegisterTargetMachine<MipsebTargetMachine> X(getTheMipsTarget());
+  RegisterTargetMachine<MipselTargetMachine> Y(getTheMipselTarget());
+  RegisterTargetMachine<MipsebTargetMachine> A(getTheMips64Target());
+  RegisterTargetMachine<MipselTargetMachine> B(getTheMips64elTarget());
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+                                     const TargetOptions &Options,
+                                     bool isLittle) {
+  std::string Ret = "";
+  MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions);
+
+  // There are both little and big endian mips.
+  if (isLittle)
+    Ret += "e";
+  else
+    Ret += "E";
+
+  if (ABI.IsO32())
+    Ret += "-m:m";
+  else
+    Ret += "-m:e";
+
+  // Pointers are 32 bit on some ABIs.
+  if (!ABI.IsN64())
+    Ret += "-p:32:32";
+
+  // 8 and 16 bit integers only need to have natural alignment, but try to
+  // align them to 32 bits. 64 bit integers have natural alignment.
+  Ret += "-i8:8:32-i16:16:32-i64:64";
+
+  // 32 bit registers are always available and the stack is at least 64 bit
+  // aligned. On N64 64 bit registers are also available and the stack is
+  // 128 bit aligned.
+  if (ABI.IsN64() || ABI.IsN32())
+    Ret += "-n32:64-S128";
+  else
+    Ret += "-n32-S64";
+
+  return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM,
+                                           Optional<Reloc::Model> RM) {
+  if (!RM.hasValue() || CM == CodeModel::JITDefault)
+    return Reloc::Static;
+  return *RM;
+}
+
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables
+// an easier handling.
+// Using CodeModel::Large enables different CALL behavior.
+MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
+                                     StringRef CPU, StringRef FS,
+                                     const TargetOptions &Options,
+                                     Optional<Reloc::Model> RM,
+                                     CodeModel::Model CM, CodeGenOpt::Level OL,
+                                     bool isLittle)
+    : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
+                        CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
+                        OL),
+      isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+      ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
+      Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
+      NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
+                        isLittle, *this),
+      Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
+                      isLittle, *this) {
+  Subtarget = &DefaultSubtarget;
+  initAsmInfo();
+}
+
+MipsTargetMachine::~MipsTargetMachine() {}
+
+void MipsebTargetMachine::anchor() { }
+
+MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void MipselTargetMachine::anchor() { }
+
+MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Optional<Reloc::Model> RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+    : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+const MipsSubtarget *
+MipsTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+  bool hasMips16Attr =
+      !F.getFnAttribute("mips16").hasAttribute(Attribute::None);
+  bool hasNoMips16Attr =
+      !F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function, so we can enable it as a subtarget feature.
+  bool softFloat =
+      F.hasFnAttribute("use-soft-float") &&
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+
+  if (hasMips16Attr)
+    FS += FS.empty() ? "+mips16" : ",+mips16";
+  else if (hasNoMips16Attr)
+    FS += FS.empty() ? "-mips16" : ",-mips16";
+  if (softFloat)
+    FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle,
+                                         *this);
+  }
+  return I.get();
+}
+
+void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
+  DEBUG(dbgs() << "resetSubtarget\n");
+
+  Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(*MF->getFunction()));
+  MF->setSubtarget(Subtarget);
+  return;
+}
+
+namespace {
+/// Mips Code Generator Pass Configuration Options.
+class MipsPassConfig : public TargetPassConfig {
+public:
+  MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    // The current implementation of long branch pass requires a scratch
+    // register ($at) to be available before branch instructions. Tail merging
+    // can break this requirement, so disable it when long branch pass is
+    // enabled.
+    EnableTailMerge = !getMipsSubtarget().enableLongBranchPass();
+  }
+
+  MipsTargetMachine &getMipsTargetMachine() const {
+    return getTM<MipsTargetMachine>();
+  }
+
+  const MipsSubtarget &getMipsSubtarget() const {
+    return *getMipsTargetMachine().getSubtargetImpl();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+
+  void addPreRegAlloc() override;
+
+};
+} // namespace
+
+TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MipsPassConfig(this, PM);
+}
+
+void MipsPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  addPass(createAtomicExpandPass(&getMipsTargetMachine()));
+  if (getMipsSubtarget().os16())
+    addPass(createMipsOs16Pass(getMipsTargetMachine()));
+  if (getMipsSubtarget().inMips16HardFloat())
+    addPass(createMips16HardFloatPass(getMipsTargetMachine()));
+}
+// Install an instruction selector pass using
+// the ISelDag to gen Mips code.
+bool MipsPassConfig::addInstSelector() {
+  addPass(createMipsModuleISelDagPass(getMipsTargetMachine()));
+  addPass(createMips16ISelDag(getMipsTargetMachine(), getOptLevel()));
+  addPass(createMipsSEISelDag(getMipsTargetMachine(), getOptLevel()));
+  return false;
+}
+
+void MipsPassConfig::addPreRegAlloc() {
+  addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+}
+
+TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    if (Subtarget->allowMixed16_32()) {
+      DEBUG(errs() << "No Target Transform Info Pass Added\n");
+      // FIXME: This is no longer necessary as the TTI returned is per-function.
+      return TargetTransformInfo(F.getParent()->getDataLayout());
+    }
+
+    DEBUG(errs() << "Target Transform Info Pass Added\n");
+    return TargetTransformInfo(BasicTTIImpl(this, F));
+  });
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted. return true if -print-machineinstrs should
+// print out the code after the passes.
+void MipsPassConfig::addPreEmitPass() {
+  MipsTargetMachine &TM = getMipsTargetMachine();
+
+  // The delay slot filler pass can potientially create forbidden slot (FS)
+  // hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any
+  // (new) pass that creates compact branches after the HSP must handle FS
+  // hazards itself or be pipelined before the HSP.
+  addPass(createMipsDelaySlotFillerPass(TM));
+  addPass(createMipsHazardSchedule());
+  addPass(createMipsLongBranchPass(TM));
+  addPass(createMipsConstantIslandPass());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 000000000000..e4cf17e2abd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,96 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
+
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MipsRegisterInfo;
+
+class MipsTargetMachine : public LLVMTargetMachine {
+  bool isLittle;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  // Selected ABI
+  MipsABIInfo ABI;
+  MipsSubtarget *Subtarget;
+  MipsSubtarget DefaultSubtarget;
+  MipsSubtarget NoMips16Subtarget;
+  MipsSubtarget Mips16Subtarget;
+
+  mutable StringMap<std::unique_ptr<MipsSubtarget>> SubtargetMap;
+
+public:
+  MipsTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    Optional<Reloc::Model> RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL, bool isLittle);
+  ~MipsTargetMachine() override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  const MipsSubtarget *getSubtargetImpl() const {
+    if (Subtarget)
+      return Subtarget;
+    return &DefaultSubtarget;
+  }
+
+  const MipsSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  /// \brief Reset the subtarget for the Mips target.
+  void resetSubtarget(MachineFunction *MF);
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  bool isLittleEndian() const { return isLittle; }
+  const MipsABIInfo &getABI() const { return ABI; }
+};
+
+/// Mips32/64 big endian target machine.
+///
+class MipsebTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+/// Mips32/64 little endian target machine.
+///
+class MipselTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 000000000000..fadab7806120
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,150 @@
+//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+            cl::desc("Small data and bss section threshold size (default=8)"),
+            cl::init(8));
+
+static cl::opt<bool>
+LocalSData("mlocal-sdata", cl::Hidden,
+           cl::desc("MIPS: Use gp_rel for object-local data."),
+           cl::init(true));
+
+static cl::opt<bool>
+ExternSData("mextern-sdata", cl::Hidden,
+            cl::desc("MIPS: Use gp_rel for data that is not defined by the "
+                     "current object."),
+            cl::init(true));
+
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection = getContext().getELFSection(
+      ".sdata", ELF::SHT_PROGBITS,
+      ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_MIPS_GPREL);
+
+  SmallBSSSection = getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                                               ELF::SHF_WRITE | ELF::SHF_ALLOC |
+                                                   ELF::SHF_MIPS_GPREL);
+  this->TM = &static_cast<const MipsTargetMachine &>(TM);
+}
+
+// A address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  // gcc has traditionally not treated zero-sized objects as small data, so this
+  // is effectively part of the ABI.
+  return Size > 0 && Size <= SSThreshold;
+}
+
+/// Return true if this global address should be placed into small data/bss
+/// section.
+bool MipsTargetObjectFile::IsGlobalInSmallSection(
+    const GlobalObject *GO, const TargetMachine &TM) const {
+  // We first check the case where global is a declaration, because finding
+  // section kind using getKindForGlobal() is only allowed for global
+  // definitions.
+  if (GO->isDeclaration() || GO->hasAvailableExternallyLinkage())
+    return IsGlobalInSmallSectionImpl(GO, TM);
+
+  return IsGlobalInSmallSection(GO, TM, getKindForGlobal(GO, TM));
+}
+
+/// Return true if this global address should be placed into small data/bss
+/// section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+                       SectionKind Kind) const {
+  return (IsGlobalInSmallSectionImpl(GO, TM) &&
+          (Kind.isData() || Kind.isBSS() || Kind.isCommon()));
+}
+
+/// Return true if this global address should be placed into small data/bss
+/// section. This method does all the work, except for checking the section
+/// kind.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSectionImpl(const GlobalObject *GO,
+                           const TargetMachine &TM) const {
+  const MipsSubtarget &Subtarget =
+      *static_cast<const MipsTargetMachine &>(TM).getSubtargetImpl();
+
+  // Return if small section is not available.
+  if (!Subtarget.useSmallSection())
+    return false;
+
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GO);
+  if (!GVA)
+    return false;
+
+  // Enforce -mlocal-sdata.
+  if (!LocalSData && GVA->hasLocalLinkage())
+    return false;
+
+  // Enforce -mextern-sdata.
+  if (!ExternSData && ((GVA->hasExternalLinkage() && GVA->isDeclaration()) ||
+                       GVA->hasCommonLinkage()))
+    return false;
+
+  Type *Ty = GVA->getValueType();
+  return IsInSmallSection(
+      GVA->getParent()->getDataLayout().getTypeAllocSize(Ty));
+}
+
+MCSection *MipsTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GO, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isData() && IsGlobalInSmallSection(GO, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+/// Return true if this constant should be placed into small data section.
+bool MipsTargetObjectFile::IsConstantInSmallSection(
+    const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const {
+  return (static_cast<const MipsTargetMachine &>(TM)
+              .getSubtargetImpl()
+              ->useSmallSection() &&
+          LocalSData && IsInSmallSection(DL.getTypeAllocSize(CN->getType())));
+}
+
+/// Return true if this constant should be placed into small data section.
+MCSection *MipsTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+                                                       SectionKind Kind,
+                                                       const Constant *C,
+                                                       unsigned &Align) const {
+  if (IsConstantInSmallSection(DL, C, *TM))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 000000000000..e5423f9578a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,48 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+class MipsTargetMachine;
+  class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+    MCSection *SmallDataSection;
+    MCSection *SmallBSSSection;
+    const MipsTargetMachine *TM;
+
+    bool IsGlobalInSmallSection(const GlobalObject *GO, const TargetMachine &TM,
+                                SectionKind Kind) const;
+    bool IsGlobalInSmallSectionImpl(const GlobalObject *GO,
+                                    const TargetMachine &TM) const;
+  public:
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    /// Return true if this global address should be placed into small data/bss
+    /// section.
+    bool IsGlobalInSmallSection(const GlobalObject *GO,
+                                const TargetMachine &TM) const;
+
+    MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+    /// Return true if this constant should be placed into small data section.
+    bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN,
+                                  const TargetMachine &TM) const;
+
+    MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                     const Constant *C,
+                                     unsigned &Align) const override;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 000000000000..41ebe411b98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,324 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
+
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+struct MipsABIFlagsSection;
+
+class MipsTargetStreamer : public MCTargetStreamer {
+public:
+  MipsTargetStreamer(MCStreamer &S);
+
+  virtual void setPic(bool Value) {}
+
+  virtual void emitDirectiveSetMicroMips();
+  virtual void emitDirectiveSetNoMicroMips();
+  virtual void setUsesMicroMips();
+  virtual void emitDirectiveSetMips16();
+  virtual void emitDirectiveSetNoMips16();
+
+  virtual void emitDirectiveSetReorder();
+  virtual void emitDirectiveSetNoReorder();
+  virtual void emitDirectiveSetMacro();
+  virtual void emitDirectiveSetNoMacro();
+  virtual void emitDirectiveSetMsa();
+  virtual void emitDirectiveSetNoMsa();
+  virtual void emitDirectiveSetAt();
+  virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
+  virtual void emitDirectiveSetNoAt();
+  virtual void emitDirectiveEnd(StringRef Name);
+
+  virtual void emitDirectiveEnt(const MCSymbol &Symbol);
+  virtual void emitDirectiveAbiCalls();
+  virtual void emitDirectiveNaN2008();
+  virtual void emitDirectiveNaNLegacy();
+  virtual void emitDirectiveOptionPic0();
+  virtual void emitDirectiveOptionPic2();
+  virtual void emitDirectiveInsn();
+  virtual void emitFrame(unsigned StackReg, unsigned StackSize,
+                         unsigned ReturnReg);
+  virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
+  virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
+
+  virtual void emitDirectiveSetArch(StringRef Arch);
+  virtual void emitDirectiveSetMips0();
+  virtual void emitDirectiveSetMips1();
+  virtual void emitDirectiveSetMips2();
+  virtual void emitDirectiveSetMips3();
+  virtual void emitDirectiveSetMips4();
+  virtual void emitDirectiveSetMips5();
+  virtual void emitDirectiveSetMips32();
+  virtual void emitDirectiveSetMips32R2();
+  virtual void emitDirectiveSetMips32R3();
+  virtual void emitDirectiveSetMips32R5();
+  virtual void emitDirectiveSetMips32R6();
+  virtual void emitDirectiveSetMips64();
+  virtual void emitDirectiveSetMips64R2();
+  virtual void emitDirectiveSetMips64R3();
+  virtual void emitDirectiveSetMips64R5();
+  virtual void emitDirectiveSetMips64R6();
+  virtual void emitDirectiveSetDsp();
+  virtual void emitDirectiveSetNoDsp();
+  virtual void emitDirectiveSetPop();
+  virtual void emitDirectiveSetPush();
+  virtual void emitDirectiveSetSoftFloat();
+  virtual void emitDirectiveSetHardFloat();
+
+  // PIC support
+  virtual void emitDirectiveCpLoad(unsigned RegNo);
+  virtual bool emitDirectiveCpRestore(int Offset,
+                                      function_ref<unsigned()> GetATReg,
+                                      SMLoc IDLoc, const MCSubtargetInfo *STI);
+  virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                                    const MCSymbol &Sym, bool IsReg);
+  virtual void emitDirectiveCpreturn(unsigned SaveLocation,
+                                     bool SaveLocationIsRegister);
+
+  // FP abiflags directives
+  virtual void emitDirectiveModuleFP();
+  virtual void emitDirectiveModuleOddSPReg();
+  virtual void emitDirectiveModuleSoftFloat();
+  virtual void emitDirectiveModuleHardFloat();
+  virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value);
+  virtual void emitDirectiveSetOddSPReg();
+  virtual void emitDirectiveSetNoOddSPReg();
+
+  void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
+             const MCSubtargetInfo *STI);
+  void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc,
+              const MCSubtargetInfo *STI);
+  void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc,
+              const MCSubtargetInfo *STI);
+  void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc,
+              const MCSubtargetInfo *STI);
+  void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc,
+              const MCSubtargetInfo *STI);
+  void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2,
+               SMLoc IDLoc, const MCSubtargetInfo *STI);
+  void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2,
+               SMLoc IDLoc, const MCSubtargetInfo *STI);
+  void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm,
+               SMLoc IDLoc, const MCSubtargetInfo *STI);
+  void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit,
+                const MCSubtargetInfo *STI);
+  void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount,
+                SMLoc IDLoc, const MCSubtargetInfo *STI);
+  void emitEmptyDelaySlot(bool hasShortDelaySlot, SMLoc IDLoc,
+                          const MCSubtargetInfo *STI);
+  void emitNop(SMLoc IDLoc, const MCSubtargetInfo *STI);
+
+  /// Emit a store instruction with an offset. If the offset is out of range
+  /// then it will be synthesized using the assembler temporary.
+  ///
+  /// GetATReg() is a callback that can be used to obtain the current assembler
+  /// temporary and is only called when the assembler temporary is required. It
+  /// must handle the case where no assembler temporary is available (typically
+  /// by reporting an error).
+  void emitStoreWithImmOffset(unsigned Opcode, unsigned SrcReg,
+                              unsigned BaseReg, int64_t Offset,
+                              function_ref<unsigned()> GetATReg, SMLoc IDLoc,
+                              const MCSubtargetInfo *STI);
+  void emitStoreWithSymOffset(unsigned Opcode, unsigned SrcReg,
+                              unsigned BaseReg, MCOperand &HiOperand,
+                              MCOperand &LoOperand, unsigned ATReg, SMLoc IDLoc,
+                              const MCSubtargetInfo *STI);
+  void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+                             int64_t Offset, unsigned TmpReg, SMLoc IDLoc,
+                             const MCSubtargetInfo *STI);
+  void emitLoadWithSymOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg,
+                             MCOperand &HiOperand, MCOperand &LoOperand,
+                             unsigned ATReg, SMLoc IDLoc,
+                             const MCSubtargetInfo *STI);
+  void emitGPRestore(int Offset, SMLoc IDLoc, const MCSubtargetInfo *STI);
+
+  void forbidModuleDirective() { ModuleDirectiveAllowed = false; }
+  void reallowModuleDirective() { ModuleDirectiveAllowed = true; }
+  bool isModuleDirectiveAllowed() { return ModuleDirectiveAllowed; }
+
+  // This method enables template classes to set internal abi flags
+  // structure values.
+  template <class PredicateLibrary>
+  void updateABIInfo(const PredicateLibrary &P) {
+    ABI = P.getABI();
+    ABIFlagsSection.setAllFromPredicates(P);
+  }
+
+  MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; }
+  const MipsABIInfo &getABI() const {
+    assert(ABI.hasValue() && "ABI hasn't been set!");
+    return *ABI;
+  }
+
+protected:
+  llvm::Optional<MipsABIInfo> ABI;
+  MipsABIFlagsSection ABIFlagsSection;
+
+  bool GPRInfoSet;
+  unsigned GPRBitMask;
+  int GPROffset;
+
+  bool FPRInfoSet;
+  unsigned FPRBitMask;
+  int FPROffset;
+
+  bool FrameInfoSet;
+  int FrameOffset;
+  unsigned FrameReg;
+  unsigned ReturnReg;
+
+private:
+  bool ModuleDirectiveAllowed;
+};
+
+// This part is for ascii assembly output
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void emitDirectiveSetMips16() override;
+  void emitDirectiveSetNoMips16() override;
+
+  void emitDirectiveSetReorder() override;
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveSetMacro() override;
+  void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetMsa() override;
+  void emitDirectiveSetNoMsa() override;
+  void emitDirectiveSetAt() override;
+  void emitDirectiveSetAtWithArg(unsigned RegNo) override;
+  void emitDirectiveSetNoAt() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitDirectiveInsn() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  void emitDirectiveSetArch(StringRef Arch) override;
+  void emitDirectiveSetMips0() override;
+  void emitDirectiveSetMips1() override;
+  void emitDirectiveSetMips2() override;
+  void emitDirectiveSetMips3() override;
+  void emitDirectiveSetMips4() override;
+  void emitDirectiveSetMips5() override;
+  void emitDirectiveSetMips32() override;
+  void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips32R3() override;
+  void emitDirectiveSetMips32R5() override;
+  void emitDirectiveSetMips32R6() override;
+  void emitDirectiveSetMips64() override;
+  void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetMips64R3() override;
+  void emitDirectiveSetMips64R5() override;
+  void emitDirectiveSetMips64R6() override;
+  void emitDirectiveSetDsp() override;
+  void emitDirectiveSetNoDsp() override;
+  void emitDirectiveSetPop() override;
+  void emitDirectiveSetPush() override;
+  void emitDirectiveSetSoftFloat() override;
+  void emitDirectiveSetHardFloat() override;
+
+  // PIC support
+  void emitDirectiveCpLoad(unsigned RegNo) override;
+
+  /// Emit a .cprestore directive.  If the offset is out of range then it will
+  /// be synthesized using the assembler temporary.
+  ///
+  /// GetATReg() is a callback that can be used to obtain the current assembler
+  /// temporary and is only called when the assembler temporary is required. It
+  /// must handle the case where no assembler temporary is available (typically
+  /// by reporting an error).
+  bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+                              SMLoc IDLoc, const MCSubtargetInfo *STI) override;
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
+  void emitDirectiveCpreturn(unsigned SaveLocation,
+                             bool SaveLocationIsRegister) override;
+
+  // FP abiflags directives
+  void emitDirectiveModuleFP() override;
+  void emitDirectiveModuleOddSPReg() override;
+  void emitDirectiveModuleSoftFloat() override;
+  void emitDirectiveModuleHardFloat() override;
+  void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
+  void emitDirectiveSetOddSPReg() override;
+  void emitDirectiveSetNoOddSPReg() override;
+};
+
+// This part is for ELF object output
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+  bool MicroMipsEnabled;
+  const MCSubtargetInfo &STI;
+  bool Pic;
+
+public:
+  bool isMicroMipsEnabled() const { return MicroMipsEnabled; }
+  MCELFStreamer &getStreamer();
+  MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+  void setPic(bool Value) override { Pic = Value; }
+
+  void emitLabel(MCSymbol *Symbol) override;
+  void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+  void finish() override;
+
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void setUsesMicroMips() override;
+  void emitDirectiveSetMips16() override;
+
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitDirectiveInsn() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  // PIC support
+  void emitDirectiveCpLoad(unsigned RegNo) override;
+  bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
+                              SMLoc IDLoc, const MCSubtargetInfo *STI) override;
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
+  void emitDirectiveCpreturn(unsigned SaveLocation,
+                             bool SaveLocationIsRegister) override;
+
+  void emitMipsAbiFlags();
+};
+}
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 000000000000..4c1edfaaaeca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,48 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheMipsTarget() {
+  static Target TheMipsTarget;
+  return TheMipsTarget;
+}
+Target &llvm::getTheMipselTarget() {
+  static Target TheMipselTarget;
+  return TheMipselTarget;
+}
+Target &llvm::getTheMips64Target() {
+  static Target TheMips64Target;
+  return TheMips64Target;
+}
+Target &llvm::getTheMips64elTarget() {
+  static Target TheMips64elTarget;
+  return TheMips64elTarget;
+}
+
+extern "C" void LLVMInitializeMipsTargetInfo() {
+  RegisterTarget<Triple::mips,
+                 /*HasJIT=*/true>
+      X(getTheMipsTarget(), "mips", "Mips");
+
+  RegisterTarget<Triple::mipsel,
+                 /*HasJIT=*/true>
+      Y(getTheMipselTarget(), "mipsel", "Mipsel");
+
+  RegisterTarget<Triple::mips64,
+                 /*HasJIT=*/true>
+      A(getTheMips64Target(), "mips64", "Mips64 [experimental]");
+
+  RegisterTarget<Triple::mips64el,
+                 /*HasJIT=*/true>
+      B(getTheMips64elTarget(), "mips64el", "Mips64el [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 000000000000..4594c22b8701
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1,286 @@
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "NVPTXGenAsmWriter.inc"
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                                   const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // Decode the virtual register
+  // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
+  unsigned RCId = (RegNo >> 28);
+  switch (RCId) {
+  default: report_fatal_error("Bad virtual register encoding");
+  case 0:
+    // This is actually a physical register, so defer to the autogenerated
+    // register printer
+    OS << getRegisterName(RegNo);
+    return;
+  case 1:
+    OS << "%p";
+    break;
+  case 2:
+    OS << "%rs";
+    break;
+  case 3:
+    OS << "%r";
+    break;
+  case 4:
+    OS << "%rd";
+    break;
+  case 5:
+    OS << "%f";
+    break;
+  case 6:
+    OS << "%fd";
+    break;
+  }
+
+  unsigned VReg = RegNo & 0x0FFFFFFF;
+  OS << VReg;
+}
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "sat") == 0) {
+    // SAT flag
+    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
+      O << ".sat";
+  } else if (strcmp(Modifier, "base") == 0) {
+    // Default operand
+    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCvtMode::NONE:
+      break;
+    case NVPTX::PTXCvtMode::RNI:
+      O << ".rni";
+      break;
+    case NVPTX::PTXCvtMode::RZI:
+      O << ".rzi";
+      break;
+    case NVPTX::PTXCvtMode::RMI:
+      O << ".rmi";
+      break;
+    case NVPTX::PTXCvtMode::RPI:
+      O << ".rpi";
+      break;
+    case NVPTX::PTXCvtMode::RN:
+      O << ".rn";
+      break;
+    case NVPTX::PTXCvtMode::RZ:
+      O << ".rz";
+      break;
+    case NVPTX::PTXCvtMode::RM:
+      O << ".rm";
+      break;
+    case NVPTX::PTXCvtMode::RP:
+      O << ".rp";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid conversion modifier");
+  }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "base") == 0) {
+    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCmpMode::EQ:
+      O << ".eq";
+      break;
+    case NVPTX::PTXCmpMode::NE:
+      O << ".ne";
+      break;
+    case NVPTX::PTXCmpMode::LT:
+      O << ".lt";
+      break;
+    case NVPTX::PTXCmpMode::LE:
+      O << ".le";
+      break;
+    case NVPTX::PTXCmpMode::GT:
+      O << ".gt";
+      break;
+    case NVPTX::PTXCmpMode::GE:
+      O << ".ge";
+      break;
+    case NVPTX::PTXCmpMode::LO:
+      O << ".lo";
+      break;
+    case NVPTX::PTXCmpMode::LS:
+      O << ".ls";
+      break;
+    case NVPTX::PTXCmpMode::HI:
+      O << ".hi";
+      break;
+    case NVPTX::PTXCmpMode::HS:
+      O << ".hs";
+      break;
+    case NVPTX::PTXCmpMode::EQU:
+      O << ".equ";
+      break;
+    case NVPTX::PTXCmpMode::NEU:
+      O << ".neu";
+      break;
+    case NVPTX::PTXCmpMode::LTU:
+      O << ".ltu";
+      break;
+    case NVPTX::PTXCmpMode::LEU:
+      O << ".leu";
+      break;
+    case NVPTX::PTXCmpMode::GTU:
+      O << ".gtu";
+      break;
+    case NVPTX::PTXCmpMode::GEU:
+      O << ".geu";
+      break;
+    case NVPTX::PTXCmpMode::NUM:
+      O << ".num";
+      break;
+    case NVPTX::PTXCmpMode::NotANumber:
+      O << ".nan";
+      break;
+    }
+  } else {
+    llvm_unreachable("Empty Modifier");
+  }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+                                     raw_ostream &O, const char *Modifier) {
+  if (Modifier) {
+    const MCOperand &MO = MI->getOperand(OpNum);
+    int Imm = (int) MO.getImm();
+    if (!strcmp(Modifier, "volatile")) {
+      if (Imm)
+        O << ".volatile";
+    } else if (!strcmp(Modifier, "addsp")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::GLOBAL:
+        O << ".global";
+        break;
+      case NVPTX::PTXLdStInstCode::SHARED:
+        O << ".shared";
+        break;
+      case NVPTX::PTXLdStInstCode::LOCAL:
+        O << ".local";
+        break;
+      case NVPTX::PTXLdStInstCode::PARAM:
+        O << ".param";
+        break;
+      case NVPTX::PTXLdStInstCode::CONSTANT:
+        O << ".const";
+        break;
+      case NVPTX::PTXLdStInstCode::GENERIC:
+        break;
+      default:
+        llvm_unreachable("Wrong Address Space");
+      }
+    } else if (!strcmp(Modifier, "sign")) {
+      if (Imm == NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    } else if (!strcmp(Modifier, "vec")) {
+      if (Imm == NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm == NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    } else
+      llvm_unreachable("Unknown Modifier");
+  } else
+    llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, OpNum + 1, O);
+  } else {
+    if (MI->getOperand(OpNum + 1).isImm() &&
+        MI->getOperand(OpNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, OpNum + 1, O);
+  }
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+  const MCExpr *Expr = Op.getExpr();
+  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+  O << Sym.getName();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
new file mode 100644
index 000000000000..f0f223aa057b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -0,0 +1,52 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+  NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  // End
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  void printLdStCode(const MCInst *MI, int OpNum,
+                     raw_ostream &O, const char *Modifier = nullptr);
+  void printMemOperand(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+  void printProtoIdent(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = nullptr);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 000000000000..1cb92005979d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
+
+namespace llvm {
+
+enum AddressSpace {
+  ADDRESS_SPACE_GENERIC = 0,
+  ADDRESS_SPACE_GLOBAL = 1,
+  ADDRESS_SPACE_SHARED = 3,
+  ADDRESS_SPACE_CONST = 4,
+  ADDRESS_SPACE_LOCAL = 5,
+
+  // NVVM Internal
+  ADDRESS_SPACE_PARAM = 101
+};
+
+namespace NVPTXII {
+enum {
+  // These must be kept in sync with TSFlags in NVPTXInstrFormats.td
+  IsTexFlag = 0x80,
+  IsSuldMask = 0x300,
+  IsSuldShift = 8,
+  IsSustFlag = 0x400,
+  IsSurfTexQueryFlag = 0x800,
+  IsTexModeUnifiedFlag = 0x1000
+};
+} // namespace NVPTXII
+
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 000000000000..78bdf4e698d8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,59 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// -debug-compile - Command line option to inform opt and llc passes to
+// compile for debugging
+static cl::opt<bool> CompileForDebugging("debug-compile",
+                                         cl::desc("Compile for debugging"),
+                                         cl::Hidden, cl::init(false));
+
+void NVPTXMCAsmInfo::anchor() {}
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
+  if (TheTriple.getArch() == Triple::nvptx64) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  CommentString = "//";
+
+  HasSingleParameterDotFile = false;
+
+  InlineAsmStart = " begin inline asm";
+  InlineAsmEnd = " end inline asm";
+
+  SupportsDebugInformation = CompileForDebugging;
+  // PTX does not allow .align on functions.
+  HasFunctionAlignment = false;
+  HasDotTypeDotSizeDirective = false;
+  // PTX does not allow .hidden or .protected
+  HiddenDeclarationVisibilityAttr = HiddenVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  Data8bitsDirective = " .b8 ";
+  Data16bitsDirective = " .b16 ";
+  Data32bitsDirective = " .b32 ";
+  Data64bitsDirective = " .b64 ";
+  ZeroDirective = " .b8";
+  AsciiDirective = " .b8";
+  AscizDirective = " .b8";
+
+  // @TODO: Can we just disable this?
+  WeakDirective = "\t// .weak\t";
+  GlobalDirective = "\t// .globl\t";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
new file mode 100644
index 000000000000..9ac3c8850f75
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVPTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+class Triple;
+
+class NVPTXMCAsmInfo : public MCAsmInfo {
+  virtual void anchor();
+
+public:
+  explicit NVPTXMCAsmInfo(const Triple &TheTriple);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 000000000000..12f992749366
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,79 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitNVPTXMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  // PTX does not have a return address register.
+  InitNVPTXMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *
+createNVPTXMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createNVPTXMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCInstPrinter *createNVPTXMCInstPrinter(const Triple &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new NVPTXInstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+  for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createNVPTXMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createNVPTXMCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createNVPTXMCSubtargetInfo);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createNVPTXMCInstPrinter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
new file mode 100644
index 000000000000..0c9ad977e7ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
+
+#include <stdint.h>
+
+namespace llvm {
+class Target;
+
+Target &getTheNVPTXTarget32();
+Target &getTheNVPTXTarget64();
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "NVPTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 000000000000..a2d670f8d39d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,48 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
+#define LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+class ManagedStringPool {
+  SmallVector<std::string *, 8> Pool;
+
+public:
+  ManagedStringPool() {}
+  ~ManagedStringPool() {
+    SmallVectorImpl<std::string *>::iterator Current = Pool.begin();
+    while (Current != Pool.end()) {
+      delete *Current;
+      Current++;
+    }
+  }
+
+  std::string *getManagedString(const char *S) {
+    std::string *Str = new std::string(S);
+    Pool.push_back(Str);
+    return Str;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 000000000000..c455a437d8d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,178 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+class formatted_raw_ostream;
+
+namespace NVPTXCC {
+enum CondCodes {
+  EQ,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE
+};
+}
+
+FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                 llvm::CodeGenOpt::Level OptLevel);
+ModulePass *createNVPTXAssignValidGlobalNamesPass();
+ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXInferAddressSpacesPass();
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMReflectPass();
+FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+FunctionPass *createNVPTXImageOptimizerPass();
+FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
+BasicBlockPass *createNVPTXLowerAllocaPass();
+MachineFunctionPass *createNVPTXPeephole();
+
+Target &getTheNVPTXTarget32();
+Target &getTheNVPTXTarget64();
+
+namespace NVPTX {
+enum DrvInterface {
+  NVCL,
+  CUDA
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows :
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
+enum VecInstType {
+  VecInstTypeShift = 0,
+  VecInstTypeMask = 0xF,
+
+  VecNOP = 0,
+  VecLoad = 1,
+  VecStore = 2,
+  VecBuild = 3,
+  VecShuffle = 4,
+  VecExtract = 5,
+  VecInsert = 6,
+  VecDest = 7,
+  VecOther = 15
+};
+
+enum SimpleMove {
+  SimpleMoveMask = 0x10,
+  SimpleMoveShift = 4
+};
+enum LoadStore {
+  isLoadMask = 0x20,
+  isLoadShift = 5,
+  isStoreMask = 0x40,
+  isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+enum AddressSpace {
+  GENERIC = 0,
+  GLOBAL = 1,
+  CONSTANT = 2,
+  SHARED = 3,
+  PARAM = 4,
+  LOCAL = 5
+};
+enum FromType {
+  Unsigned = 0,
+  Signed,
+  Float
+};
+enum VecType {
+  Scalar = 1,
+  V2 = 2,
+  V4 = 4
+};
+}
+
+/// PTXCvtMode - Conversion code enumeration
+namespace PTXCvtMode {
+enum CvtMode {
+  NONE = 0,
+  RNI,
+  RZI,
+  RMI,
+  RPI,
+  RN,
+  RZ,
+  RM,
+  RP,
+
+  BASE_MASK = 0x0F,
+  FTZ_FLAG = 0x10,
+  SAT_FLAG = 0x20
+};
+}
+
+/// PTXCmpMode - Comparison mode enumeration
+namespace PTXCmpMode {
+enum CmpMode {
+  EQ = 0,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE,
+  LO,
+  LS,
+  HI,
+  HS,
+  EQU,
+  NEU,
+  LTU,
+  LEU,
+  GTU,
+  GEU,
+  NUM,
+  // NAN is a MACRO
+  NotANumber,
+
+  BASE_MASK = 0xFF,
+  FTZ_FLAG = 0x100
+};
+}
+}
+} // end namespace llvm;
+
+// Defines symbolic names for NVPTX registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 000000000000..c77ddbc99789
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,96 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of explicit feature table.
+// - Need at least one feature to avoid generating zero sized array by
+//   TableGen in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+
+// SM Versions
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+                            "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+                            "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+                            "Target SM 3.0">;
+def SM32 : SubtargetFeature<"sm_32", "SmVersion", "32",
+                            "Target SM 3.2">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+                            "Target SM 3.5">;
+def SM37 : SubtargetFeature<"sm_37", "SmVersion", "37",
+                            "Target SM 3.7">;
+def SM50 : SubtargetFeature<"sm_50", "SmVersion", "50",
+                            "Target SM 5.0">;
+def SM52 : SubtargetFeature<"sm_52", "SmVersion", "52",
+                            "Target SM 5.2">;
+def SM53 : SubtargetFeature<"sm_53", "SmVersion", "53",
+                            "Target SM 5.3">;
+def SM60 : SubtargetFeature<"sm_60", "SmVersion", "60",
+                             "Target SM 6.0">;
+def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61",
+                             "Target SM 6.1">;
+def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62",
+                             "Target SM 6.2">;
+
+def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true",
+                             "Atomic operations with scope">;
+
+// PTX Versions
+def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
+                             "Use PTX version 3.2">;
+def PTX40 : SubtargetFeature<"ptx40", "PTXVersion", "40",
+                             "Use PTX version 4.0">;
+def PTX41 : SubtargetFeature<"ptx41", "PTXVersion", "41",
+                             "Use PTX version 4.1">;
+def PTX42 : SubtargetFeature<"ptx42", "PTXVersion", "42",
+                             "Use PTX version 4.2">;
+def PTX43 : SubtargetFeature<"ptx43", "PTXVersion", "43",
+                             "Use PTX version 4.3">;
+def PTX50 : SubtargetFeature<"ptx50", "PTXVersion", "50",
+                             "Use PTX version 5.0">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_32", [SM32, PTX40]>;
+def : Proc<"sm_35", [SM35]>;
+def : Proc<"sm_37", [SM37, PTX41]>;
+def : Proc<"sm_50", [SM50, PTX40]>;
+def : Proc<"sm_52", [SM52, PTX41]>;
+def : Proc<"sm_53", [SM53, PTX42]>;
+def : Proc<"sm_60", [SM60, PTX50, SATOM]>;
+def : Proc<"sm_61", [SM61, PTX50, SATOM]>;
+def : Proc<"sm_62", [SM62, PTX50, SATOM]>;
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+  let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 000000000000..bed52293197d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,70 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+using namespace llvm;
+
+namespace {
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+  static char ID; // Pass ID
+  NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<StackProtector>();
+  }
+
+  StringRef getPassName() const override {
+    return "NVPTX specific alloca hoisting";
+  }
+
+  bool runOnFunction(Function &function) override;
+};
+} // namespace
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  bool functionModified = false;
+  Function::iterator I = function.begin();
+  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+  for (Function::iterator E = function.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+        allocaInst->moveBefore(firstTerminatorInst);
+        functionModified = true;
+      }
+    }
+  }
+
+  return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(
+    NVPTXAllocaHoisting, "alloca-hoisting",
+    "Hoisting alloca instructions in non-entry blocks to the entry block",
+    false, false)
+
+FunctionPass *llvm::createAllocaHoisting() { return new NVPTXAllocaHoisting; }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 000000000000..7a6fc7d9b14d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,23 @@
+//===-- AllocaHoisting.h - Hosist allocas to the entry block ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
+
+namespace llvm {
+class FunctionPass;
+
+extern FunctionPass *createAllocaHoisting();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 000000000000..04c8d5c0443e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2387 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXMCExpr.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "cl_common_defines.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <sstream>
+using namespace llvm;
+
+#define DEPOTNAME "__local_depot"
+
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
+                cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+                cl::init(true));
+
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
+              cl::desc("NVPTX Specific: Emit source line in ptx file"),
+              cl::init(false));
+
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(const Value *V,
+                              DenseSet<const GlobalVariable *> &Globals) {
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    Globals.insert(GV);
+  else {
+    if (const User *U = dyn_cast<User>(V)) {
+      for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+        DiscoverDependentGlobals(U->getOperand(i), Globals);
+      }
+    }
+  }
+}
+
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(
+    const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+    DenseSet<const GlobalVariable *> &Visited,
+    DenseSet<const GlobalVariable *> &Visiting) {
+  // Have we already visited this one?
+  if (Visited.count(GV))
+    return;
+
+  // Do we have a circular dependency?
+  if (!Visiting.insert(GV).second)
+    report_fatal_error("Circular dependency found in global variable set");
+
+  // Make sure we visit all dependents first
+  DenseSet<const GlobalVariable *> Others;
+  for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+    DiscoverDependentGlobals(GV->getOperand(i), Others);
+
+  for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(),
+                                                  E = Others.end();
+       I != E; ++I)
+    VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+  // Now we can visit ourself
+  Order.push_back(GV);
+  Visited.insert(GV);
+  Visiting.erase(GV);
+}
+}
+
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
+  if (!EmitLineNumbers)
+    return;
+  if (ignoreLoc(MI))
+    return;
+
+  const DebugLoc &curLoc = MI.getDebugLoc();
+
+  if (!prevDebugLoc && !curLoc)
+    return;
+
+  if (prevDebugLoc == curLoc)
+    return;
+
+  prevDebugLoc = curLoc;
+
+  if (!curLoc)
+    return;
+
+  auto *Scope = cast_or_null<DIScope>(curLoc.getScope());
+  if (!Scope)
+     return;
+
+  StringRef fileName(Scope->getFilename());
+  StringRef dirName(Scope->getDirectory());
+  SmallString<128> FullPathName = dirName;
+  if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
+    sys::path::append(FullPathName, fileName);
+    fileName = FullPathName;
+  }
+
+  if (filenameMap.find(fileName) == filenameMap.end())
+    return;
+
+  // Emit the line from the source file.
+  if (InterleaveSrc)
+    this->emitSrcInText(fileName, curLoc.getLine());
+
+  std::stringstream temp;
+  temp << "\t.loc " << filenameMap[fileName] << " " << curLoc.getLine()
+       << " " << curLoc.getCol();
+  OutStreamer->EmitRawText(temp.str());
+}
+
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream OS(Str);
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA)
+    emitLineNumberAsDotLoc(*MI);
+
+  MCInst Inst;
+  lowerToMCInst(MI, Inst);
+  EmitToStreamer(*OutStreamer, Inst);
+}
+
+// Handle symbol backtracking for targets that do not support image handles
+bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
+                                           unsigned OpNo, MCOperand &MCOp) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    if (OpNo == 4 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+    if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    if (OpNo == VecSize && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref
+    if (OpNo == 0 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref
+    if (OpNo == 1 && MO.isImm()) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+
+  return false;
+}
+
+void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
+  // Ewwww
+  TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+  NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
+  const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+  const char *Sym = MFI->getImageHandleSymbol(Index);
+  std::string *SymNamePtr =
+    nvTM.getManagedStrPool()->getManagedString(Sym);
+  MCOp = GetSymbolRef(OutContext.getOrCreateSymbol(StringRef(*SymNamePtr)));
+}
+
+void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+  OutMI.setOpcode(MI->getOpcode());
+  // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+  if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+    const MachineOperand &MO = MI->getOperand(0);
+    OutMI.addOperand(GetSymbolRef(
+      OutContext.getOrCreateSymbol(Twine(MO.getSymbolName()))));
+    return;
+  }
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (!nvptxSubtarget->hasImageHandles()) {
+      if (lowerImageHandleOperand(MI, i, MCOp)) {
+        OutMI.addOperand(MCOp);
+        continue;
+      }
+    }
+
+    if (lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
+
+bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                   MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::createReg(encodeVirtualRegister(MO.getReg()));
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::createImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    const ConstantFP *Cnt = MO.getFPImm();
+    const APFloat &Val = Cnt->getValueAPF();
+
+    switch (Cnt->getType()->getTypeID()) {
+    default: report_fatal_error("Unsupported FP type"); break;
+    case Type::FloatTyID:
+      MCOp = MCOperand::createExpr(
+        NVPTXFloatMCExpr::createConstantFPSingle(Val, OutContext));
+      break;
+    case Type::DoubleTyID:
+      MCOp = MCOperand::createExpr(
+        NVPTXFloatMCExpr::createConstantFPDouble(Val, OutContext));
+      break;
+    }
+    break;
+  }
+  }
+  return true;
+}
+
+unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+    DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
+    unsigned RegNum = RegMap[Reg];
+
+    // Encode the register class in the upper 4 bits
+    // Must be kept in sync with NVPTXInstPrinter::printRegName
+    unsigned Ret = 0;
+    if (RC == &NVPTX::Int1RegsRegClass) {
+      Ret = (1 << 28);
+    } else if (RC == &NVPTX::Int16RegsRegClass) {
+      Ret = (2 << 28);
+    } else if (RC == &NVPTX::Int32RegsRegClass) {
+      Ret = (3 << 28);
+    } else if (RC == &NVPTX::Int64RegsRegClass) {
+      Ret = (4 << 28);
+    } else if (RC == &NVPTX::Float32RegsRegClass) {
+      Ret = (5 << 28);
+    } else if (RC == &NVPTX::Float64RegsRegClass) {
+      Ret = (6 << 28);
+    } else {
+      report_fatal_error("Bad register class");
+    }
+
+    // Insert the vreg number
+    Ret |= (RegNum & 0x0FFFFFFF);
+    return Ret;
+  } else {
+    // Some special-use registers are actually physical registers.
+    // Encode this as the register class ID of 0 and the real register ID.
+    return Reg & 0x0FFFFFFF;
+  }
+}
+
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
+                                 OutContext);
+  return MCOperand::createExpr(Expr);
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+
+  Type *Ty = F->getReturnType();
+
+  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return;
+
+  O << " (";
+
+  if (isABI) {
+    if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) {
+      unsigned size = 0;
+      if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(Ty->isFloatingPointTy() && "Floating point type expected here");
+        size = Ty->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " func_retval0";
+    } else if (isa<PointerType>(Ty)) {
+      O << ".param .b" << TLI->getPointerTy(DL).getSizeInBits()
+        << " func_retval0";
+    } else if (Ty->isAggregateType() || Ty->isVectorTy()) {
+      unsigned totalsz = DL.getTypeAllocSize(Ty);
+      unsigned retAlignment = 0;
+      if (!llvm::getAlign(*F, 0, retAlignment))
+        retAlignment = DL.getABITypeAlignment(Ty);
+      O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+        << "]";
+    } else
+      llvm_unreachable("Unknown return type");
+  } else {
+    SmallVector<EVT, 16> vtparts;
+    ComputeValueVTs(*TLI, DL, Ty, vtparts);
+    unsigned idx = 0;
+    for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+      unsigned elems = 1;
+      EVT elemtype = vtparts[i];
+      if (vtparts[i].isVector()) {
+        elems = vtparts[i].getVectorNumElements();
+        elemtype = vtparts[i].getVectorElementType();
+      }
+
+      for (unsigned j = 0, je = elems; j != je; ++j) {
+        unsigned sz = elemtype.getSizeInBits();
+        if (elemtype.isInteger() && (sz < 32))
+          sz = 32;
+        O << ".reg .b" << sz << " func_retval" << idx;
+        if (j < je - 1)
+          O << ", ";
+        ++idx;
+      }
+      if (i < e - 1)
+        O << ", ";
+    }
+  }
+  O << ") ";
+  return;
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+                                        raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  printReturnValStr(F, O);
+}
+
+// Return true if MBB is the header of a loop marked with
+// llvm.loop.unroll.disable.
+// TODO: consider "#pragma unroll 1" which is equivalent to "#pragma nounroll".
+bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
+    const MachineBasicBlock &MBB) const {
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+  // We insert .pragma "nounroll" only to the loop header.
+  if (!LI.isLoopHeader(&MBB))
+    return false;
+
+  // llvm.loop.unroll.disable is marked on the back edges of a loop. Therefore,
+  // we iterate through each back edge of the loop with header MBB, and check
+  // whether its metadata contains llvm.loop.unroll.disable.
+  for (auto I = MBB.pred_begin(); I != MBB.pred_end(); ++I) {
+    const MachineBasicBlock *PMBB = *I;
+    if (LI.getLoopFor(PMBB) != LI.getLoopFor(&MBB)) {
+      // Edges from other loops to MBB are not back edges.
+      continue;
+    }
+    if (const BasicBlock *PBB = PMBB->getBasicBlock()) {
+      if (MDNode *LoopID =
+              PBB->getTerminator()->getMetadata(LLVMContext::MD_loop)) {
+        if (GetUnrollMetadata(LoopID, "llvm.loop.unroll.disable"))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+  AsmPrinter::EmitBasicBlockStart(MBB);
+  if (isLoopHeaderOfNoUnroll(MBB))
+    OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
+}
+
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  if (!GlobalsEmitted) {
+    emitGlobals(*MF->getFunction()->getParent());
+    GlobalsEmitted = true;
+  }
+  
+  // Set up
+  MRI = &MF->getRegInfo();
+  F = MF->getFunction();
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else {
+    O << ".func ";
+    printReturnValStr(*MF, O);
+  }
+
+  CurrentFnSym->print(O, MAI);
+
+  emitFunctionParamList(*MF, O);
+
+  if (llvm::isKernelFunction(*F))
+    emitKernelFunctionDirectives(*F, O);
+
+  OutStreamer->EmitRawText(O.str());
+
+  prevDebugLoc = DebugLoc();
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+  VRegMapping.clear();
+  OutStreamer->EmitRawText(StringRef("{\n"));
+  setAndEmitFunctionVirtualRegisters(*MF);
+
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+  emitDemotedVars(MF->getFunction(), O);
+  OutStreamer->EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+  OutStreamer->EmitRawText(StringRef("}\n"));
+  VRegMapping.clear();
+}
+
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+  unsigned RegNo = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isVirtualRegister(RegNo)) {
+    OutStreamer->AddComment(Twine("implicit-def: ") +
+                            getVirtualRegisterName(RegNo));
+  } else {
+    OutStreamer->AddComment(Twine("implicit-def: ") +
+                            nvptxSubtarget->getRegisterInfo()->getName(RegNo));
+  }
+  OutStreamer->AddBlankLine();
+}
+
+void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
+                                                   raw_ostream &O) const {
+  // If the NVVM IR has some of reqntid* specified, then output
+  // the reqntid directive, and set the unspecified ones to 1.
+  // If none of reqntid* is specified, don't output reqntid directive.
+  unsigned reqntidx, reqntidy, reqntidz;
+  bool specified = false;
+  if (!llvm::getReqNTIDx(F, reqntidx))
+    reqntidx = 1;
+  else
+    specified = true;
+  if (!llvm::getReqNTIDy(F, reqntidy))
+    reqntidy = 1;
+  else
+    specified = true;
+  if (!llvm::getReqNTIDz(F, reqntidz))
+    reqntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
+      << "\n";
+
+  // If the NVVM IR has some of maxntid* specified, then output
+  // the maxntid directive, and set the unspecified ones to 1.
+  // If none of maxntid* is specified, don't output maxntid directive.
+  unsigned maxntidx, maxntidy, maxntidz;
+  specified = false;
+  if (!llvm::getMaxNTIDx(F, maxntidx))
+    maxntidx = 1;
+  else
+    specified = true;
+  if (!llvm::getMaxNTIDy(F, maxntidy))
+    maxntidy = 1;
+  else
+    specified = true;
+  if (!llvm::getMaxNTIDz(F, maxntidz))
+    maxntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+      << "\n";
+
+  unsigned mincta;
+  if (llvm::getMinCTASm(F, mincta))
+    O << ".minnctapersm " << mincta << "\n";
+
+  unsigned maxnreg;
+  if (llvm::getMaxNReg(F, maxnreg))
+    O << ".maxnreg " << maxnreg << "\n";
+}
+
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+  std::string Name;
+  raw_string_ostream NameStr(Name);
+
+  VRegRCMap::const_iterator I = VRegMapping.find(RC);
+  assert(I != VRegMapping.end() && "Bad register class");
+  const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+  VRegMap::const_iterator VI = RegMap.find(Reg);
+  assert(VI != RegMap.end() && "Bad virtual register");
+  unsigned MappedVR = VI->second;
+
+  NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+  NameStr.flush();
+  return Name;
+}
+
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
+                                          raw_ostream &O) {
+  O << getVirtualRegisterName(vr);
+}
+
+void NVPTXAsmPrinter::printVecModifiedImmediate(
+    const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
+  static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
+  int Imm = (int) MO.getImm();
+  if (0 == strcmp(Modifier, "vecelem"))
+    O << "_" << vecelem[Imm];
+  else if (0 == strcmp(Modifier, "vecv4comm1")) {
+    if ((Imm < 0) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4comm2")) {
+    if ((Imm < 4) || (Imm > 7))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 4];
+  } else if (0 == strcmp(Modifier, "vecv2comm1")) {
+    if ((Imm < 0) || (Imm > 1))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2comm2")) {
+    if ((Imm < 2) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 2];
+  } else
+    llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+
+
+void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else
+    O << ".func ";
+  printReturnValStr(F, O);
+  getSymbol(F)->print(O, MAI);
+  O << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C) {
+  if (!C)
+    return false;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    return GV->getName() != "llvm.used";
+  }
+
+  for (const User *U : C->users())
+    if (const Constant *C = dyn_cast<Constant>(U))
+      if (usedInGlobalVarDef(C))
+        return true;
+
+  return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
+  if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+    if (othergv->getName() == "llvm.used")
+      return true;
+  }
+
+  if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+    if (instr->getParent() && instr->getParent()->getParent()) {
+      const Function *curFunc = instr->getParent()->getParent();
+      if (oneFunc && (curFunc != oneFunc))
+        return false;
+      oneFunc = curFunc;
+      return true;
+    } else
+      return false;
+  }
+
+  for (const User *UU : U->users())
+    if (!usedInOneFunc(UU, oneFunc))
+      return false;
+
+  return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are :
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
+ */
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+  if (!gv->hasInternalLinkage())
+    return false;
+  PointerType *Pty = gv->getType();
+  if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+    return false;
+
+  const Function *oneFunc = nullptr;
+
+  bool flag = usedInOneFunc(gv, oneFunc);
+  if (!flag)
+    return false;
+  if (!oneFunc)
+    return false;
+  f = oneFunc;
+  return true;
+}
+
+static bool useFuncSeen(const Constant *C,
+                        llvm::DenseMap<const Function *, bool> &seenMap) {
+  for (const User *U : C->users()) {
+    if (const Constant *cu = dyn_cast<Constant>(U)) {
+      if (useFuncSeen(cu, seenMap))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      const BasicBlock *bb = I->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+      if (seenMap.find(caller) != seenMap.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
+  llvm::DenseMap<const Function *, bool> seenMap;
+  for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+    const Function *F = &*FI;
+
+    if (F->isDeclaration()) {
+      if (F->use_empty())
+        continue;
+      if (F->getIntrinsicID())
+        continue;
+      emitDeclaration(F, O);
+      continue;
+    }
+    for (const User *U : F->users()) {
+      if (const Constant *C = dyn_cast<Constant>(U)) {
+        if (usedInGlobalVarDef(C)) {
+          // The use is in the initialization of a global variable
+          // that is a function pointer, so print a declaration
+          // for the original function
+          emitDeclaration(F, O);
+          break;
+        }
+        // Emit a declaration of this function if the function that
+        // uses this constant expr has already been seen.
+        if (useFuncSeen(C, seenMap)) {
+          emitDeclaration(F, O);
+          break;
+        }
+      }
+
+      if (!isa<Instruction>(U))
+        continue;
+      const Instruction *instr = cast<Instruction>(U);
+      const BasicBlock *bb = instr->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+
+      // If a caller has already been seen, then the caller is
+      // appearing in the module before the callee. so print out
+      // a declaration for the callee.
+      if (seenMap.find(caller) != seenMap.end()) {
+        emitDeclaration(F, O);
+        break;
+      }
+    }
+    seenMap[F] = true;
+  }
+}
+
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+
+  unsigned i = 1;
+  for (const DICompileUnit *DIUnit : DbgFinder.compile_units()) {
+    StringRef Filename = DIUnit->getFilename();
+    StringRef Dirname = DIUnit->getDirectory();
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName;
+    }
+    if (filenameMap.find(Filename) != filenameMap.end())
+      continue;
+    filenameMap[Filename] = i;
+    OutStreamer->EmitDwarfFileDirective(i, "", Filename);
+    ++i;
+  }
+
+  for (DISubprogram *SP : DbgFinder.subprograms()) {
+    StringRef Filename = SP->getFilename();
+    StringRef Dirname = SP->getDirectory();
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName;
+    }
+    if (filenameMap.find(Filename) != filenameMap.end())
+      continue;
+    filenameMap[Filename] = i;
+    OutStreamer->EmitDwarfFileDirective(i, "", Filename);
+    ++i;
+  }
+}
+
+static bool isEmptyXXStructor(GlobalVariable *GV) {
+  if (!GV) return true;
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!InitList) return true;  // Not an array; we don't know how to parse.
+  return InitList->getNumOperands() == 0;
+}
+
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
+  // Construct a default subtarget off of the TargetMachine defaults. The
+  // rest of NVPTX isn't friendly to change subtargets per function and
+  // so the default TargetMachine will have all of the options.
+  const Triple &TT = TM.getTargetTriple();
+  StringRef CPU = TM.getTargetCPU();
+  StringRef FS = TM.getTargetFeatureString();
+  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  const NVPTXSubtarget STI(TT, CPU, FS, NTM);
+
+  if (M.alias_size()) {
+    report_fatal_error("Module has aliases, which NVPTX does not support.");
+    return true; // error
+  }
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
+    report_fatal_error(
+        "Module has a nontrivial global ctor, which NVPTX does not support.");
+    return true;  // error
+  }
+  if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
+    report_fatal_error(
+        "Module has a nontrivial global dtor, which NVPTX does not support.");
+    return true;  // error
+  }
+
+  SmallString<128> Str1;
+  raw_svector_ostream OS1(Str1);
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+  // We need to call the parent's one explicitly.
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  // Initialize TargetLoweringObjectFile since we didn't do in
+  // AsmPrinter::doInitialization either right above or where it's commented out
+  // below.
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+      .Initialize(OutContext, TM);
+
+  // Emit header before any dwarf directives are emitted below.
+  emitHeader(M, OS1, STI);
+  OutStreamer->EmitRawText(OS1.str());
+
+  // Already commented out
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  // Emit module-level inline asm if it exists.
+  if (!M.getModuleInlineAsm().empty()) {
+    OutStreamer->AddComment("Start of file scope inline assembly");
+    OutStreamer->AddBlankLine();
+    OutStreamer->EmitRawText(StringRef(M.getModuleInlineAsm()));
+    OutStreamer->AddBlankLine();
+    OutStreamer->AddComment("End of file scope inline assembly");
+    OutStreamer->AddBlankLine();
+  }
+
+  // If we're not NVCL we're CUDA, go ahead and emit filenames.
+  if (TM.getTargetTriple().getOS() != Triple::NVCL)
+    recordAndEmitFilenames(M);
+
+  GlobalsEmitted = false;
+    
+  return false; // success
+}
+
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
+  SmallString<128> Str2;
+  raw_svector_ostream OS2(Str2);
+
+  emitDeclarations(M, OS2);
+
+  // As ptxas does not support forward references of globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* its dependent
+  // globals. We use a little extra memory maintaining both a set and a list to
+  // have fast searches while maintaining a strict ordering.
+  SmallVector<const GlobalVariable *, 8> Globals;
+  DenseSet<const GlobalVariable *> GVVisited;
+  DenseSet<const GlobalVariable *> GVVisiting;
+
+  // Visit each global variable, in order
+  for (const GlobalVariable &I : M.globals())
+    VisitGlobalVariableForEmission(&I, Globals, GVVisited, GVVisiting);
+
+  assert(GVVisited.size() == M.getGlobalList().size() &&
+         "Missed a global variable");
+  assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+  // Print out module-level global variables in proper order
+  for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+    printModuleLevelGV(Globals[i], OS2);
+
+  OS2 << '\n';
+
+  OutStreamer->EmitRawText(OS2.str());
+}
+
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
+                                 const NVPTXSubtarget &STI) {
+  O << "//\n";
+  O << "// Generated by LLVM NVPTX Back-End\n";
+  O << "//\n";
+  O << "\n";
+
+  unsigned PTXVersion = STI.getPTXVersion();
+  O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
+
+  O << ".target ";
+  O << STI.getTargetName();
+
+  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+  if (NTM.getDrvInterface() == NVPTX::NVCL)
+    O << ", texmode_independent";
+  else {
+    if (!STI.hasDouble())
+      O << ", map_f64_to_f32";
+  }
+
+  if (MAI->doesSupportDebugInformation())
+    O << ", debug";
+
+  O << "\n";
+
+  O << ".address_size ";
+  if (NTM.is64Bit())
+    O << "64";
+  else
+    O << "32";
+  O << "\n";
+
+  O << "\n";
+}
+
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+  // If we did not emit any functions, then the global declarations have not
+  // yet been emitted.
+  if (!GlobalsEmitted) {
+    emitGlobals(M);
+    GlobalsEmitted = true;
+  }
+
+  // XXX Temproarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at beginning).
+
+  Module::GlobalListType &global_list = M.getGlobalList();
+  int i, n = global_list.size();
+  GlobalVariable **gv_array = new GlobalVariable *[n];
+
+  // first, back-up GlobalVariable in gv_array
+  i = 0;
+  for (Module::global_iterator I = global_list.begin(), E = global_list.end();
+       I != E; ++I)
+    gv_array[i++] = &*I;
+
+  // second, empty global_list
+  while (!global_list.empty())
+    global_list.remove(global_list.begin());
+
+  // call doFinalization
+  bool ret = AsmPrinter::doFinalization(M);
+
+  // now we restore global variables
+  for (i = 0; i < n; i++)
+    global_list.insert(global_list.end(), gv_array[i]);
+
+  clearAnnotationCache(&M);
+
+  delete[] gv_array;
+  return ret;
+
+  //bool Result = AsmPrinter::doFinalization(M);
+  // Instead of calling the parents doFinalization, we may
+  // clone parents doFinalization and customize here.
+  // Currently, we if NVISA out the EmitGlobals() in
+  // parent's doFinalization, which is too intrusive.
+  //
+  // Same for the doInitialization.
+  //return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables.
+//
+// extern function declaration            -> .extern
+// extern function definition             -> .visible
+// external global variable with init     -> .visible
+// external without init                  -> .extern
+// appending                              -> not allowed, assert.
+// for any linkage other than
+// internal, private, linker_private,
+// linker_private_weak, linker_private_weak_def_auto,
+// we emit                                -> .weak.
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
+                                           raw_ostream &O) {
+  if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == NVPTX::CUDA) {
+    if (V->hasExternalLinkage()) {
+      if (isa<GlobalVariable>(V)) {
+        const GlobalVariable *GVar = cast<GlobalVariable>(V);
+        if (GVar) {
+          if (GVar->hasInitializer())
+            O << ".visible ";
+          else
+            O << ".extern ";
+        }
+      } else if (V->isDeclaration())
+        O << ".extern ";
+      else
+        O << ".visible ";
+    } else if (V->hasAppendingLinkage()) {
+      std::string msg;
+      msg.append("Error: ");
+      msg.append("Symbol ");
+      if (V->hasName())
+        msg.append(V->getName());
+      msg.append("has unsupported appending linkage type");
+      llvm_unreachable(msg.c_str());
+    } else if (!V->hasInternalLinkage() &&
+               !V->hasPrivateLinkage()) {
+      O << ".weak ";
+    }
+  }
+}
+
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+                                         raw_ostream &O,
+                                         bool processDemoted) {
+
+  // Skip meta data
+  if (GVar->hasSection()) {
+    if (GVar->getSection() == "llvm.metadata")
+      return;
+  }
+
+  // Skip LLVM intrinsic global variables
+  if (GVar->getName().startswith("llvm.") ||
+      GVar->getName().startswith("nvvm."))
+    return;
+
+  const DataLayout &DL = getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  PointerType *PTy = GVar->getType();
+  Type *ETy = GVar->getValueType();
+
+  if (GVar->hasExternalLinkage()) {
+    if (GVar->hasInitializer())
+      O << ".visible ";
+    else
+      O << ".extern ";
+  } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() ||
+             GVar->hasAvailableExternallyLinkage() ||
+             GVar->hasCommonLinkage()) {
+    O << ".weak ";
+  }
+
+  if (llvm::isTexture(*GVar)) {
+    O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+    return;
+  }
+
+  if (llvm::isSurface(*GVar)) {
+    O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+    return;
+  }
+
+  if (GVar->isDeclaration()) {
+    // (extern) declarations, no definition or initializer
+    // Currently the only known declaration is for an automatic __local
+    // (.shared) promoted to global.
+    emitPTXGlobalVariable(GVar, O);
+    O << ";\n";
+    return;
+  }
+
+  if (llvm::isSampler(*GVar)) {
+    O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+    const Constant *Initializer = nullptr;
+    if (GVar->hasInitializer())
+      Initializer = GVar->getInitializer();
+    const ConstantInt *CI = nullptr;
+    if (Initializer)
+      CI = dyn_cast<ConstantInt>(Initializer);
+    if (CI) {
+      unsigned sample = CI->getZExtValue();
+
+      O << " = { ";
+
+      for (int i = 0,
+               addr = ((sample & __CLK_ADDRESS_MASK) >> __CLK_ADDRESS_BASE);
+           i < 3; i++) {
+        O << "addr_mode_" << i << " = ";
+        switch (addr) {
+        case 0:
+          O << "wrap";
+          break;
+        case 1:
+          O << "clamp_to_border";
+          break;
+        case 2:
+          O << "clamp_to_edge";
+          break;
+        case 3:
+          O << "wrap";
+          break;
+        case 4:
+          O << "mirror";
+          break;
+        }
+        O << ", ";
+      }
+      O << "filter_mode = ";
+      switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+      case 0:
+        O << "nearest";
+        break;
+      case 1:
+        O << "linear";
+        break;
+      case 2:
+        llvm_unreachable("Anisotropic filtering is not supported");
+      default:
+        O << "nearest";
+        break;
+      }
+      if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+        O << ", force_unnormalized_coords = 1";
+      }
+      O << " }";
+    }
+
+    O << ";\n";
+    return;
+  }
+
+  if (GVar->hasPrivateLinkage()) {
+
+    if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+      return;
+
+    // FIXME - need better way (e.g. Metadata) to avoid generating this global
+    if (!strncmp(GVar->getName().data(), "filename", 8))
+      return;
+    if (GVar->use_empty())
+      return;
+  }
+
+  const Function *demotedFunc = nullptr;
+  if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+    O << "// " << GVar->getName() << " has been demoted\n";
+    if (localDecls.find(demotedFunc) != localDecls.end())
+      localDecls[demotedFunc].push_back(GVar);
+    else {
+      std::vector<const GlobalVariable *> temp;
+      temp.push_back(GVar);
+      localDecls[demotedFunc] = temp;
+    }
+    return;
+  }
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+
+  if (isManaged(*GVar)) {
+    O << " .attribute(.managed)";
+  }
+
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+    O << " .";
+    // Special case: ABI requires that we use .u8 for predicates
+    if (ETy->isIntegerTy(1))
+      O << "u8";
+    else
+      O << getPTXFundamentalTypeStr(ETy, false);
+    O << " ";
+    getSymbol(GVar)->print(O, MAI);
+
+    // Ptx allows variable initilization only for constant and global state
+    // spaces.
+    if (GVar->hasInitializer()) {
+      if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+          (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) {
+        const Constant *Initializer = GVar->getInitializer();
+        // 'undef' is treated as there is no value specified.
+        if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) {
+          O << " = ";
+          printScalarConstant(Initializer, O);
+        }
+      } else {
+        // The frontend adds zero-initializer to device and constant variables
+        // that don't have an initial value, and UndefValue to shared
+        // variables, so skip warning for this case.
+        if (!GVar->getInitializer()->isNullValue() &&
+            !isa<UndefValue>(GVar->getInitializer())) {
+          report_fatal_error("initial value of '" + GVar->getName() +
+                             "' is not allowed in addrspace(" +
+                             Twine(PTy->getAddressSpace()) + ")");
+        }
+      }
+    }
+  } else {
+    unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct type and array type and
+    // LLVM IR is very similar to PTX, the LLVM CodeGen does not support for
+    // targets that support these high level field accesses. Structs, arrays
+    // and vectors are lowered into arrays of bytes.
+    switch (ETy->getTypeID()) {
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::VectorTyID:
+      ElementSize = DL.getTypeStoreSize(ETy);
+      // Ptx allows variable initilization only for constant and
+      // global state spaces.
+      if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+           (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+          GVar->hasInitializer()) {
+        const Constant *Initializer = GVar->getInitializer();
+        if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
+          AggBuffer aggBuffer(ElementSize, O, *this);
+          bufferAggregateConstant(Initializer, &aggBuffer);
+          if (aggBuffer.numSymbols) {
+            if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
+              O << " .u64 ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[";
+              O << ElementSize / 8;
+            } else {
+              O << " .u32 ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[";
+              O << ElementSize / 4;
+            }
+            O << "]";
+          } else {
+            O << " .b8 ";
+            getSymbol(GVar)->print(O, MAI);
+            O << "[";
+            O << ElementSize;
+            O << "]";
+          }
+          O << " = {";
+          aggBuffer.print();
+          O << "}";
+        } else {
+          O << " .b8 ";
+          getSymbol(GVar)->print(O, MAI);
+          if (ElementSize) {
+            O << "[";
+            O << ElementSize;
+            O << "]";
+          }
+        }
+      } else {
+        O << " .b8 ";
+        getSymbol(GVar)->print(O, MAI);
+        if (ElementSize) {
+          O << "[";
+          O << ElementSize;
+          O << "]";
+        }
+      }
+      break;
+    default:
+      llvm_unreachable("type not supported yet");
+    }
+
+  }
+  O << ";\n";
+}
+
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+  if (localDecls.find(f) == localDecls.end())
+    return;
+
+  std::vector<const GlobalVariable *> &gvars = localDecls[f];
+
+  for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
+    O << "\t// demoted variable\n\t";
+    printModuleLevelGV(gvars[i], O, true);
+  }
+}
+
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+                                          raw_ostream &O) const {
+  switch (AddressSpace) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    O << "local";
+    break;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    O << "global";
+    break;
+  case llvm::ADDRESS_SPACE_CONST:
+    O << "const";
+    break;
+  case llvm::ADDRESS_SPACE_SHARED:
+    O << "shared";
+    break;
+  default:
+    report_fatal_error("Bad address space found while emitting PTX");
+    break;
+  }
+}
+
+std::string
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unexpected type");
+    break;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return "pred";
+    else if (NumBits <= 64) {
+      std::string name = "u";
+      return name + utostr(NumBits);
+    } else {
+      llvm_unreachable("Integer too large");
+      break;
+    }
+    break;
+  }
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID:
+    if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit())
+      if (useB4PTR)
+        return "b64";
+      else
+        return "u64";
+    else if (useB4PTR)
+      return "b32";
+    else
+      return "u32";
+  }
+  llvm_unreachable("unexpected type");
+  return nullptr;
+}
+
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
+                                            raw_ostream &O) {
+
+  const DataLayout &DL = getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  Type *ETy = GVar->getValueType();
+
+  O << ".";
+  emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int)DL.getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy);
+    O << " ";
+    getSymbol(GVar)->print(O, MAI);
+    return;
+  }
+
+  int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct type and array type and LLVM IR
+  // is very similar to PTX, the LLVM CodeGen does not support for targets that
+  // support these high level field accesses. Structs and arrays are lowered
+  // into arrays of bytes.
+  switch (ETy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    ElementSize = DL.getTypeStoreSize(ETy);
+    O << " .b8 ";
+    getSymbol(GVar)->print(O, MAI);
+    O << "[";
+    if (ElementSize) {
+      O << ElementSize;
+    }
+    O << "]";
+    break;
+  default:
+    llvm_unreachable("type not supported yet");
+  }
+  return;
+}
+
+static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return DL.getPrefTypeAlignment(Ty);
+
+  auto *ATy = dyn_cast<ArrayType>(Ty);
+  if (ATy)
+    return getOpenCLAlignment(DL, ATy->getElementType());
+
+  auto *STy = dyn_cast<StructType>(Ty);
+  if (STy) {
+    unsigned int alignStruct = 1;
+    // Go through each element of the struct and find the
+    // largest alignment.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) {
+      Type *ETy = STy->getElementType(i);
+      unsigned int align = getOpenCLAlignment(DL, ETy);
+      if (align > alignStruct)
+        alignStruct = align;
+    }
+    return alignStruct;
+  }
+
+  auto *FTy = dyn_cast<FunctionType>(Ty);
+  if (FTy)
+    return DL.getPointerPrefAlignment();
+  return DL.getPrefTypeAlignment(Ty);
+}
+
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+                                     int paramIndex, raw_ostream &O) {
+  getSymbol(I->getParent())->print(O, MAI);
+  O << "_param_" << paramIndex;
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+  const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
+  Function::const_arg_iterator I, E;
+  unsigned paramIndex = 0;
+  bool first = true;
+  bool isKernelFunc = llvm::isKernelFunction(*F);
+  bool isABI = (nvptxSubtarget->getSmVersion() >= 20);
+  MVT thePointerTy = TLI->getPointerTy(DL);
+
+  if (F->arg_empty()) {
+    O << "()\n";
+    return;
+  }
+
+  O << "(\n";
+
+  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+    Type *Ty = I->getType();
+
+    if (!first)
+      O << ",\n";
+
+    first = false;
+
+    // Handle image/sampler parameters
+    if (isKernelFunction(*F)) {
+      if (isSampler(*I) || isImage(*I)) {
+        if (isImage(*I)) {
+          std::string sname = I->getName();
+          if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
+            if (nvptxSubtarget->hasImageHandles())
+              O << "\t.param .u64 .ptr .surfref ";
+            else
+              O << "\t.param .surfref ";
+            CurrentFnSym->print(O, MAI);
+            O << "_param_" << paramIndex;
+          }
+          else { // Default image is read_only
+            if (nvptxSubtarget->hasImageHandles())
+              O << "\t.param .u64 .ptr .texref ";
+            else
+              O << "\t.param .texref ";
+            CurrentFnSym->print(O, MAI);
+            O << "_param_" << paramIndex;
+          }
+        } else {
+          if (nvptxSubtarget->hasImageHandles())
+            O << "\t.param .u64 .ptr .samplerref ";
+          else
+            O << "\t.param .samplerref ";
+          CurrentFnSym->print(O, MAI);
+          O << "_param_" << paramIndex;
+        }
+        continue;
+      }
+    }
+
+    if (!PAL.hasAttribute(paramIndex + 1, Attribute::ByVal)) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        // Just print .param .align <a> .b8 .param[size];
+        // <a> = PAL.getparamalignment
+        // size = typeallocsize of element type
+        unsigned align = PAL.getParamAlignment(paramIndex + 1);
+        if (align == 0)
+          align = DL.getABITypeAlignment(Ty);
+
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        O << "\t.param .align " << align << " .b8 ";
+        printParamName(I, paramIndex, O);
+        O << "[" << sz << "]";
+
+        continue;
+      }
+      // Just a scalar
+      auto *PTy = dyn_cast<PointerType>(Ty);
+      if (isKernelFunc) {
+        if (PTy) {
+          // Special handling for pointer arguments to kernel
+          O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+          if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() !=
+              NVPTX::CUDA) {
+            Type *ETy = PTy->getElementType();
+            int addrSpace = PTy->getAddressSpace();
+            switch (addrSpace) {
+            default:
+              O << ".ptr ";
+              break;
+            case llvm::ADDRESS_SPACE_CONST:
+              O << ".ptr .const ";
+              break;
+            case llvm::ADDRESS_SPACE_SHARED:
+              O << ".ptr .shared ";
+              break;
+            case llvm::ADDRESS_SPACE_GLOBAL:
+              O << ".ptr .global ";
+              break;
+            }
+            O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " ";
+          }
+          printParamName(I, paramIndex, O);
+          continue;
+        }
+
+        // non-pointer scalar to kernel func
+        O << "\t.param .";
+        // Special case: predicate operands become .u8 types
+        if (Ty->isIntegerTy(1))
+          O << "u8";
+        else
+          O << getPTXFundamentalTypeStr(Ty);
+        O << " ";
+        printParamName(I, paramIndex, O);
+        continue;
+      }
+      // Non-kernel function, just print .param .b<size> for ABI
+      // and .reg .b<size> for non-ABI
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = thePointerTy.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      if (isABI)
+        O << "\t.param .b" << sz << " ";
+      else
+        O << "\t.reg .b" << sz << " ";
+      printParamName(I, paramIndex, O);
+      continue;
+    }
+
+    // param has byVal attribute. So should be a pointer
+    auto *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    if (isABI || isKernelFunc) {
+      // Just print .param .align <a> .b8 .param[size];
+      // <a> = PAL.getparamalignment
+      // size = typeallocsize of element type
+      unsigned align = PAL.getParamAlignment(paramIndex + 1);
+      if (align == 0)
+        align = DL.getABITypeAlignment(ETy);
+      // Work around a bug in ptxas. When PTX code takes address of
+      // byval parameter with alignment < 4, ptxas generates code to
+      // spill argument into memory. Alas on sm_50+ ptxas generates
+      // SASS code that fails with misaligned access. To work around
+      // the problem, make sure that we align byval parameters by at
+      // least 4. Matching change must be made in LowerCall() where we
+      // prepare parameters for the call.
+      //
+      // TODO: this will need to be undone when we get to support multi-TU
+      // device-side compilation as it breaks ABI compatibility with nvcc.
+      // Hopefully ptxas bug is fixed by then.
+      if (!isKernelFunc && align < 4)
+        align = 4;
+      unsigned sz = DL.getTypeAllocSize(ETy);
+      O << "\t.param .align " << align << " .b8 ";
+      printParamName(I, paramIndex, O);
+      O << "[" << sz << "]";
+      continue;
+    } else {
+      // Split the ETy into constituent parts and
+      // print .param .b<size> <name> for each part.
+      // Further, if a part is vector, print the above for
+      // each vector element.
+      SmallVector<EVT, 16> vtparts;
+      ComputeValueVTs(*TLI, DL, ETy, vtparts);
+      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[i];
+        if (vtparts[i].isVector()) {
+          elems = vtparts[i].getVectorNumElements();
+          elemtype = vtparts[i].getVectorElementType();
+        }
+
+        for (unsigned j = 0, je = elems; j != je; ++j) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+          O << "\t.reg .b" << sz << " ";
+          printParamName(I, paramIndex, O);
+          if (j < je - 1)
+            O << ",\n";
+          ++paramIndex;
+        }
+        if (i < e - 1)
+          O << ",\n";
+      }
+      --paramIndex;
+      continue;
+    }
+  }
+
+  O << "\n)\n";
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+                                            raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  emitFunctionParamList(F, O);
+}
+
+void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
+    const MachineFunction &MF) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  // Map the global virtual register number to a register class specific
+  // virtual register number starting from 1 with that class.
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  //unsigned numRegClasses = TRI->getNumRegClasses();
+
+  // Emit the Fake Stack Object
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  int NumBytes = (int) MFI.getStackSize();
+  if (NumBytes) {
+    O << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
+      << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
+      O << "\t.reg .b64 \t%SP;\n";
+      O << "\t.reg .b64 \t%SPL;\n";
+    } else {
+      O << "\t.reg .b32 \t%SP;\n";
+      O << "\t.reg .b32 \t%SPL;\n";
+    }
+  }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual
+  // register number and the per class virtual register number.
+  // We use the per class virtual register number in the ptx output.
+  unsigned int numVRs = MRI->getNumVirtRegs();
+  for (unsigned i = 0; i < numVRs; i++) {
+    unsigned int vr = TRI->index2VirtReg(i);
+    const TargetRegisterClass *RC = MRI->getRegClass(vr);
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+    int n = regmap.size();
+    regmap.insert(std::make_pair(vr, n + 1));
+  }
+
+  // Emit register declarations
+  // @TODO: Extract out the real register usage
+  // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n";
+
+  // Emit declaration of the virtual registers or 'physical' registers for
+  // each register class
+  for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+    const TargetRegisterClass *RC = TRI->getRegClass(i);
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+    std::string rcname = getNVPTXRegClassName(RC);
+    std::string rcStr = getNVPTXRegClassStr(RC);
+    int n = regmap.size();
+
+    // Only declare those registers that may be used.
+    if (n) {
+       O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+         << ">;\n";
+    }
+  }
+
+  OutStreamer->EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+  APFloat APF = APFloat(Fp->getValueAPF()); // make a copy
+  bool ignored;
+  unsigned int numHex;
+  const char *lead;
+
+  if (Fp->getType()->getTypeID() == Type::FloatTyID) {
+    numHex = 8;
+    lead = "0f";
+    APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &ignored);
+  } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) {
+    numHex = 16;
+    lead = "0d";
+    APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &ignored);
+  } else
+    llvm_unreachable("unsupported fp type");
+
+  APInt API = APF.bitcastToAPInt();
+  std::string hexstr(utohexstr(API.getZExtValue()));
+  O << lead;
+  if (hexstr.length() < numHex)
+    O << std::string(numHex - hexstr.length(), '0');
+  O << utohexstr(API.getZExtValue());
+}
+
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+    O << CI->getValue();
+    return;
+  }
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+    printFPConstant(CFP, O);
+    return;
+  }
+  if (isa<ConstantPointerNull>(CPV)) {
+    O << "0";
+    return;
+  }
+  if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+    bool IsNonGenericPointer = false;
+    if (GVar->getType()->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
+    if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
+      O << "generic(";
+      getSymbol(GVar)->print(O, MAI);
+      O << ")";
+    } else {
+      getSymbol(GVar)->print(O, MAI);
+    }
+    return;
+  }
+  if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+    const Value *v = Cexpr->stripPointerCasts();
+    PointerType *PTy = dyn_cast<PointerType>(Cexpr->getType());
+    bool IsNonGenericPointer = false;
+    if (PTy && PTy->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
+    if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+      if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+        O << "generic(";
+        getSymbol(GVar)->print(O, MAI);
+        O << ")";
+      } else {
+        getSymbol(GVar)->print(O, MAI);
+      }
+      return;
+    } else {
+      lowerConstant(CPV)->print(O, MAI);
+      return;
+    }
+  }
+  llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+// These utility functions assure we get the right sequence of bytes for a given
+// type even for big-endian machines
+template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
+  int64_t vp = (int64_t)val;
+  for (unsigned i = 0; i < sizeof(T); ++i) {
+    p[i] = (unsigned char)vp;
+    vp >>= 8;
+  }
+}
+static void ConvertFloatToBytes(unsigned char *p, float val) {
+  int32_t *vp = (int32_t *)&val;
+  for (unsigned i = 0; i < sizeof(int32_t); ++i) {
+    p[i] = (unsigned char)*vp;
+    *vp >>= 8;
+  }
+}
+static void ConvertDoubleToBytes(unsigned char *p, double val) {
+  int64_t *vp = (int64_t *)&val;
+  for (unsigned i = 0; i < sizeof(int64_t); ++i) {
+    p[i] = (unsigned char)*vp;
+    *vp >>= 8;
+  }
+}
+
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
+                                   AggBuffer *aggBuffer) {
+
+  const DataLayout &DL = getDataLayout();
+
+  if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+    int s = DL.getTypeAllocSize(CPV->getType());
+    if (s < Bytes)
+      s = Bytes;
+    aggBuffer->addZeros(s);
+    return;
+  }
+
+  unsigned char ptr[8];
+  switch (CPV->getType()->getTypeID()) {
+
+  case Type::IntegerTyID: {
+    Type *ETy = CPV->getType();
+    if (ETy == Type::getInt8Ty(CPV->getContext())) {
+      unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
+      ConvertIntToBytes<>(ptr, c);
+      aggBuffer->addBytes(ptr, 1, Bytes);
+    } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
+      short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
+      ConvertIntToBytes<>(ptr, int16);
+      aggBuffer->addBytes(ptr, 2, Bytes);
+    } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        int int32 = (int)(constInt->getZExtValue());
+        ConvertIntToBytes<>(ptr, int32);
+        aggBuffer->addBytes(ptr, 4, Bytes);
+        break;
+      } else if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+                ConstantFoldConstant(Cexpr, DL))) {
+          int int32 = (int)(constInt->getZExtValue());
+          ConvertIntToBytes<>(ptr, int32);
+          aggBuffer->addBytes(ptr, 4, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
+          aggBuffer->addZeros(4);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
+      if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        long long int64 = (long long)(constInt->getZExtValue());
+        ConvertIntToBytes<>(ptr, int64);
+        aggBuffer->addBytes(ptr, 8, Bytes);
+        break;
+      } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+                ConstantFoldConstant(Cexpr, DL))) {
+          long long int64 = (long long)(constInt->getZExtValue());
+          ConvertIntToBytes<>(ptr, int64);
+          aggBuffer->addBytes(ptr, 8, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v, Cexpr->getOperand(0));
+          aggBuffer->addZeros(8);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else
+      llvm_unreachable("unsupported integer const type");
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+    Type *Ty = CFP->getType();
+    if (Ty == Type::getFloatTy(CPV->getContext())) {
+      float float32 = (float) CFP->getValueAPF().convertToFloat();
+      ConvertFloatToBytes(ptr, float32);
+      aggBuffer->addBytes(ptr, 4, Bytes);
+    } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+      double float64 = CFP->getValueAPF().convertToDouble();
+      ConvertDoubleToBytes(ptr, float64);
+      aggBuffer->addBytes(ptr, 8, Bytes);
+    } else {
+      llvm_unreachable("unsupported fp const type");
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+      aggBuffer->addSymbol(GVar, GVar);
+    } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+      const Value *v = Cexpr->stripPointerCasts();
+      aggBuffer->addSymbol(v, Cexpr);
+    }
+    unsigned int s = DL.getTypeAllocSize(CPV->getType());
+    aggBuffer->addZeros(s);
+    break;
+  }
+
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID: {
+    if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
+      int ElementSize = DL.getTypeAllocSize(CPV->getType());
+      bufferAggregateConstant(CPV, aggBuffer);
+      if (Bytes > ElementSize)
+        aggBuffer->addZeros(Bytes - ElementSize);
+    } else if (isa<ConstantAggregateZero>(CPV))
+      aggBuffer->addZeros(Bytes);
+    else
+      llvm_unreachable("Unexpected Constant type");
+    break;
+  }
+
+  default:
+    llvm_unreachable("unsupported type");
+  }
+}
+
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
+                                              AggBuffer *aggBuffer) {
+  const DataLayout &DL = getDataLayout();
+  int Bytes;
+
+  // Old constants
+  if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+    if (CPV->getNumOperands())
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS =
+          dyn_cast<ConstantDataSequential>(CPV)) {
+    if (CDS->getNumElements())
+      for (unsigned i = 0; i < CDS->getNumElements(); ++i)
+        bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+                     aggBuffer);
+    return;
+  }
+
+  if (isa<ConstantStruct>(CPV)) {
+    if (CPV->getNumOperands()) {
+      StructType *ST = cast<StructType>(CPV->getType());
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+        if (i == (e - 1))
+          Bytes = DL.getStructLayout(ST)->getElementOffset(0) +
+                  DL.getTypeAllocSize(ST) -
+                  DL.getStructLayout(ST)->getElementOffset(i);
+        else
+          Bytes = DL.getStructLayout(ST)->getElementOffset(i + 1) -
+                  DL.getStructLayout(ST)->getElementOffset(i);
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
+      }
+    }
+    return;
+  }
+  llvm_unreachable("unsupported constant type in printAggregateConstant()");
+}
+
+// buildTypeNameMap - Run through symbol table looking for type names.
+//
+
+
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgEndInst0:
+  case NVPTX::CallArgEndInst1:
+  case NVPTX::CallArgF32:
+  case NVPTX::CallArgF64:
+  case NVPTX::CallArgI16:
+  case NVPTX::CallArgI32:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgI64:
+  case NVPTX::CallArgParam:
+  case NVPTX::CallVoidInst:
+  case NVPTX::CallVoidInstReg:
+  case NVPTX::Callseq_End:
+  case NVPTX::CallVoidInstReg64:
+  case NVPTX::DeclareParamInst:
+  case NVPTX::DeclareRetMemInst:
+  case NVPTX::DeclareRetRegInst:
+  case NVPTX::DeclareRetScalarInst:
+  case NVPTX::DeclareScalarParamInst:
+  case NVPTX::DeclareScalarRegInst:
+  case NVPTX::StoreParamF32:
+  case NVPTX::StoreParamF64:
+  case NVPTX::StoreParamI16:
+  case NVPTX::StoreParamI32:
+  case NVPTX::StoreParamI64:
+  case NVPTX::StoreParamI8:
+  case NVPTX::StoreRetvalF32:
+  case NVPTX::StoreRetvalF64:
+  case NVPTX::StoreRetvalI16:
+  case NVPTX::StoreRetvalI32:
+  case NVPTX::StoreRetvalI64:
+  case NVPTX::StoreRetvalI8:
+  case NVPTX::LastCallArgF32:
+  case NVPTX::LastCallArgF64:
+  case NVPTX::LastCallArgI16:
+  case NVPTX::LastCallArgI32:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgI64:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::LoadParamMemF32:
+  case NVPTX::LoadParamMemF64:
+  case NVPTX::LoadParamMemI16:
+  case NVPTX::LoadParamMemI32:
+  case NVPTX::LoadParamMemI64:
+  case NVPTX::LoadParamMemI8:
+  case NVPTX::PrototypeInst:
+  case NVPTX::DBG_VALUE:
+    return true;
+  }
+  return false;
+}
+
+/// lowerConstantForGV - Return an MCExpr for the given Constant.  This is mostly
+/// a copy from AsmPrinter::lowerConstant, except customized to only handle
+/// expressions that are representable in PTX and create
+/// NVPTXGenericMCSymbolRefExpr nodes for addrspacecast instructions.
+const MCExpr *
+NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) {
+  MCContext &Ctx = OutContext;
+
+  if (CV->isNullValue() || isa<UndefValue>(CV))
+    return MCConstantExpr::create(0, Ctx);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+    return MCConstantExpr::create(CI->getZExtValue(), Ctx);
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+    const MCSymbolRefExpr *Expr =
+      MCSymbolRefExpr::create(getSymbol(GV), Ctx);
+    if (ProcessingGeneric) {
+      return NVPTXGenericMCSymbolRefExpr::create(Expr, Ctx);
+    } else {
+      return Expr;
+    }
+  }
+
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+  if (!CE) {
+    llvm_unreachable("Unknown constant value to lower!");
+  }
+
+  switch (CE->getOpcode()) {
+  default:
+    // If the code isn't optimized, there may be outstanding folding
+    // opportunities. Attempt to fold the expression using DataLayout as a
+    // last resort before giving up.
+    if (Constant *C = ConstantFoldConstant(CE, getDataLayout()))
+      if (C && C != CE)
+        return lowerConstantForGV(C, ProcessingGeneric);
+
+    // Otherwise report the problem to the user.
+    {
+      std::string S;
+      raw_string_ostream OS(S);
+      OS << "Unsupported expression in static initializer: ";
+      CE->printAsOperand(OS, /*PrintType=*/false,
+                     !MF ? nullptr : MF->getFunction()->getParent());
+      report_fatal_error(OS.str());
+    }
+
+  case Instruction::AddrSpaceCast: {
+    // Strip the addrspacecast and pass along the operand
+    PointerType *DstTy = cast<PointerType>(CE->getType());
+    if (DstTy->getAddressSpace() == 0) {
+      return lowerConstantForGV(cast<const Constant>(CE->getOperand(0)), true);
+    }
+    std::string S;
+    raw_string_ostream OS(S);
+    OS << "Unsupported expression in static initializer: ";
+    CE->printAsOperand(OS, /*PrintType=*/ false,
+                       !MF ? 0 : MF->getFunction()->getParent());
+    report_fatal_error(OS.str());
+  }
+
+  case Instruction::GetElementPtr: {
+    const DataLayout &DL = getDataLayout();
+
+    // Generate a symbolic expression for the byte address
+    APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
+
+    const MCExpr *Base = lowerConstantForGV(CE->getOperand(0),
+                                            ProcessingGeneric);
+    if (!OffsetAI)
+      return Base;
+
+    int64_t Offset = OffsetAI.getSExtValue();
+    return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
+                                   Ctx);
+  }
+
+  case Instruction::Trunc:
+    // We emit the value and depend on the assembler to truncate the generated
+    // expression properly.  This is important for differences between
+    // blockaddress labels.  Since the two labels are in the same function, it
+    // is reasonable to treat their delta as a 32-bit value.
+    LLVM_FALLTHROUGH;
+  case Instruction::BitCast:
+    return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric);
+
+  case Instruction::IntToPtr: {
+    const DataLayout &DL = getDataLayout();
+
+    // Handle casts to pointers by changing them into casts to the appropriate
+    // integer type.  This promotes constant folding and simplifies this code.
+    Constant *Op = CE->getOperand(0);
+    Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
+                                      false/*ZExt*/);
+    return lowerConstantForGV(Op, ProcessingGeneric);
+  }
+
+  case Instruction::PtrToInt: {
+    const DataLayout &DL = getDataLayout();
+
+    // Support only foldable casts to/from pointers that can be eliminated by
+    // changing the pointer to the appropriately sized integer type.
+    Constant *Op = CE->getOperand(0);
+    Type *Ty = CE->getType();
+
+    const MCExpr *OpExpr = lowerConstantForGV(Op, ProcessingGeneric);
+
+    // We can emit the pointer value into this slot if the slot is an
+    // integer slot equal to the size of the pointer.
+    if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
+      return OpExpr;
+
+    // Otherwise the pointer is smaller than the resultant integer, mask off
+    // the high bits so we are sure to get a proper truncation if the input is
+    // a constant expr.
+    unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
+    const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
+    return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
+  }
+
+  // The MC library also has a right-shift operator, but it isn't consistently
+  // signed or unsigned between different targets.
+  case Instruction::Add: {
+    const MCExpr *LHS = lowerConstantForGV(CE->getOperand(0), ProcessingGeneric);
+    const MCExpr *RHS = lowerConstantForGV(CE->getOperand(1), ProcessingGeneric);
+    switch (CE->getOpcode()) {
+    default: llvm_unreachable("Unknown binary operator constant cast expr");
+    case Instruction::Add: return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
+    }
+  }
+  }
+}
+
+// Copy of MCExpr::print customized for NVPTX
+void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
+  switch (Expr.getKind()) {
+  case MCExpr::Target:
+    return cast<MCTargetExpr>(&Expr)->printImpl(OS, MAI);
+  case MCExpr::Constant:
+    OS << cast<MCConstantExpr>(Expr).getValue();
+    return;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(Expr);
+    const MCSymbol &Sym = SRE.getSymbol();
+    Sym.print(OS, MAI);
+    return;
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr &UE = cast<MCUnaryExpr>(Expr);
+    switch (UE.getOpcode()) {
+    case MCUnaryExpr::LNot:  OS << '!'; break;
+    case MCUnaryExpr::Minus: OS << '-'; break;
+    case MCUnaryExpr::Not:   OS << '~'; break;
+    case MCUnaryExpr::Plus:  OS << '+'; break;
+    }
+    printMCExpr(*UE.getSubExpr(), OS);
+    return;
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr &BE = cast<MCBinaryExpr>(Expr);
+
+    // Only print parens around the LHS if it is non-trivial.
+    if (isa<MCConstantExpr>(BE.getLHS()) || isa<MCSymbolRefExpr>(BE.getLHS()) ||
+        isa<NVPTXGenericMCSymbolRefExpr>(BE.getLHS())) {
+      printMCExpr(*BE.getLHS(), OS);
+    } else {
+      OS << '(';
+      printMCExpr(*BE.getLHS(), OS);
+      OS<< ')';
+    }
+
+    switch (BE.getOpcode()) {
+    case MCBinaryExpr::Add:
+      // Print "X-42" instead of "X+-42".
+      if (const MCConstantExpr *RHSC = dyn_cast<MCConstantExpr>(BE.getRHS())) {
+        if (RHSC->getValue() < 0) {
+          OS << RHSC->getValue();
+          return;
+        }
+      }
+
+      OS <<  '+';
+      break;
+    default: llvm_unreachable("Unhandled binary operator");
+    }
+
+    // Only print parens around the LHS if it is non-trivial.
+    if (isa<MCConstantExpr>(BE.getRHS()) || isa<MCSymbolRefExpr>(BE.getRHS())) {
+      printMCExpr(*BE.getRHS(), OS);
+    } else {
+      OS << '(';
+      printMCExpr(*BE.getRHS(), OS);
+      OS << ')';
+    }
+    return;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MO.getReg() == NVPTX::VRDepot)
+        O << DEPOTNAME << getFunctionNumber();
+      else
+        O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+    } else {
+      emitVirtualRegister(MO.getReg(), O);
+    }
+    return;
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier)
+      O << MO.getImm();
+    else if (strstr(Modifier, "vec") == Modifier)
+      printVecModifiedImmediate(MO, Modifier, O);
+    else
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(MO.getFPImm(), O);
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    getSymbol(MO.getGlobal())->print(O, MAI);
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+  } else {
+    if (MI->getOperand(opNum + 1).isImm() &&
+        MI->getOperand(opNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, opNum + 1, O);
+  }
+}
+
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+  std::stringstream temp;
+  LineReader *reader = this->getReader(filename);
+  temp << "\n//";
+  temp << filename.str();
+  temp << ":";
+  temp << line;
+  temp << " ";
+  temp << reader->readLine(line);
+  temp << "\n";
+  this->OutStreamer->EmitRawText(temp.str());
+}
+
+LineReader *NVPTXAsmPrinter::getReader(const std::string &filename) {
+  if (!reader) {
+    reader = new LineReader(filename);
+  }
+
+  if (reader->fileName() != filename) {
+    delete reader;
+    reader = new LineReader(filename);
+  }
+
+  return reader;
+}
+
+std::string LineReader::readLine(unsigned lineNum) {
+  if (lineNum < theCurLine) {
+    theCurLine = 0;
+    fstr.seekg(0, std::ios::beg);
+  }
+  while (theCurLine < lineNum) {
+    fstr.getline(buff, 500);
+    theCurLine++;
+  }
+  return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+  RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
+  RegisterAsmPrinter<NVPTXAsmPrinter> Y(getTheNVPTXTarget64());
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 000000000000..3dcc0e358a14
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,343 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <fstream>
+
+// The ptx syntax and format is very different from that usually seem in a .s
+// file,
+// therefore we are not able to use the MCAsmStreamer interface here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+namespace llvm {
+  class MCOperand;
+
+class LineReader {
+private:
+  unsigned theCurLine;
+  std::ifstream fstr;
+  char buff[512];
+  std::string theFileName;
+  SmallVector<unsigned, 32> lineOffset;
+public:
+  LineReader(std::string filename) {
+    theCurLine = 0;
+    fstr.open(filename.c_str());
+    theFileName = filename;
+  }
+  std::string fileName() { return theFileName; }
+  ~LineReader() { fstr.close(); }
+  std::string readLine(unsigned line);
+};
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+  class AggBuffer {
+    // Used to buffer the emitted string for initializing global
+    // aggregates.
+    //
+    // Normally an aggregate (array, vector or structure) is emitted
+    // as a u8[]. However, if one element/field of the aggregate
+    // is a non-NULL address, then the aggregate is emitted as u32[]
+    // or u64[].
+    //
+    // We first layout the aggregate in 'buffer' in bytes, except for
+    // those symbol addresses. For the i-th symbol address in the
+    //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+    // are filled with 0s. symbolPosInBuffer[i-1] records its position
+    // in 'buffer', and Symbols[i-1] records the Value*.
+    //
+    // Once we have this AggBuffer setup, we can choose how to print
+    // it out.
+  public:
+    unsigned numSymbols;   // number of symbol addresses
+
+  private:
+    const unsigned size;   // size of the buffer in bytes
+    std::vector<unsigned char> buffer; // the buffer
+    SmallVector<unsigned, 4> symbolPosInBuffer;
+    SmallVector<const Value *, 4> Symbols;
+    // SymbolsBeforeStripping[i] is the original form of Symbols[i] before
+    // stripping pointer casts, i.e.,
+    // Symbols[i] == SymbolsBeforeStripping[i]->stripPointerCasts().
+    //
+    // We need to keep these values because AggBuffer::print decides whether to
+    // emit a "generic()" cast for Symbols[i] depending on the address space of
+    // SymbolsBeforeStripping[i].
+    SmallVector<const Value *, 4> SymbolsBeforeStripping;
+    unsigned curpos;
+    raw_ostream &O;
+    NVPTXAsmPrinter &AP;
+    bool EmitGeneric;
+
+  public:
+    AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP)
+        : size(size), buffer(size), O(O), AP(AP) {
+      curpos = 0;
+      numSymbols = 0;
+      EmitGeneric = AP.EmitGeneric;
+    }
+    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+      assert((curpos + Num) <= size);
+      assert((curpos + Bytes) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = Ptr[i];
+        curpos++;
+      }
+      for (int i = Num; i < Bytes; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    unsigned addZeros(int Num) {
+      assert((curpos + Num) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) {
+      symbolPosInBuffer.push_back(curpos);
+      Symbols.push_back(GVar);
+      SymbolsBeforeStripping.push_back(GVarBeforeStripping);
+      numSymbols++;
+    }
+    void print() {
+      if (numSymbols == 0) {
+        // print out in bytes
+        for (unsigned i = 0; i < size; i++) {
+          if (i)
+            O << ", ";
+          O << (unsigned int) buffer[i];
+        }
+      } else {
+        // print out in 4-bytes or 8-bytes
+        unsigned int pos = 0;
+        unsigned int nSym = 0;
+        unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+        unsigned int nBytes = 4;
+        if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit())
+          nBytes = 8;
+        for (pos = 0; pos < size; pos += nBytes) {
+          if (pos)
+            O << ", ";
+          if (pos == nextSymbolPos) {
+            const Value *v = Symbols[nSym];
+            const Value *v0 = SymbolsBeforeStripping[nSym];
+            if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+              MCSymbol *Name = AP.getSymbol(GVar);
+              PointerType *PTy = dyn_cast<PointerType>(v0->getType());
+              bool IsNonGenericPointer = false; // Is v0 a non-generic pointer?
+              if (PTy && PTy->getAddressSpace() != 0) {
+                IsNonGenericPointer = true;
+              }
+              if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+                O << "generic(";
+                Name->print(O, AP.MAI);
+                O << ")";
+              } else {
+                Name->print(O, AP.MAI);
+              }
+            } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
+              const MCExpr *Expr =
+                AP.lowerConstantForGV(cast<Constant>(CExpr), false);
+              AP.printMCExpr(*Expr, O);
+            } else
+              llvm_unreachable("symbol type unknown");
+            nSym++;
+            if (nSym >= numSymbols)
+              nextSymbolPos = size + 1;
+            else
+              nextSymbolPos = symbolPosInBuffer[nSym];
+          } else if (nBytes == 4)
+            O << *(unsigned int *)(&buffer[pos]);
+          else
+            O << *(unsigned long long *)(&buffer[pos]);
+        }
+      }
+    }
+  };
+
+  friend class AggBuffer;
+
+  void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+  StringRef getPassName() const override { return "NVPTX Assembly Printer"; }
+
+  const Function *F;
+  std::string CurrentFnName;
+
+  void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void emitImplicitDef(const MachineInstr *MI) const override;
+
+  void EmitInstruction(const MachineInstr *) override;
+  void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+  MCOperand GetSymbolRef(const MCSymbol *Symbol);
+  unsigned encodeVirtualRegister(unsigned Reg);
+
+  void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
+                                 raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = nullptr);
+  void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+                          bool = false);
+  void printParamName(Function::const_arg_iterator I, int paramIndex,
+                      raw_ostream &O);
+  void emitGlobals(const Module &M);
+  void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI);
+  void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
+  void emitVirtualRegister(unsigned int vr, raw_ostream &);
+  void emitFunctionParamList(const Function *, raw_ostream &O);
+  void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+  void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+  void printReturnValStr(const Function *, raw_ostream &O);
+  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &) override;
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                    const char *Modifier = nullptr);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &) override;
+
+  const MCExpr *lowerConstantForGV(const Constant *CV, bool ProcessingGeneric);
+  void printMCExpr(const MCExpr &Expr, raw_ostream &OS);
+
+protected:
+  bool doInitialization(Module &M) override;
+  bool doFinalization(Module &M) override;
+
+private:
+  std::string CurrentBankselLabelInBasicBlock;
+
+  bool GlobalsEmitted;
+  
+  // This is specific per MachineFunction.
+  const MachineRegisterInfo *MRI;
+  // The contents are specific for each
+  // MachineFunction. But the size of the
+  // array is not.
+  typedef DenseMap<unsigned, unsigned> VRegMap;
+  typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+  VRegRCMap VRegMapping;
+
+  // Cache the subtarget here.
+  const NVPTXSubtarget *nvptxSubtarget;
+
+  // Build the map between type name and ID based on module's type
+  // symbol table.
+  std::map<Type *, std::string> TypeNameMap;
+
+  // List of variables demoted to a function scope.
+  std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
+
+  // To record filename to ID mapping
+  std::map<std::string, unsigned> filenameMap;
+  void recordAndEmitFilenames(Module &);
+
+  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+  void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const;
+  void printScalarConstant(const Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
+
+  void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
+  void emitDeclarations(const Module &, raw_ostream &O);
+  void emitDeclaration(const Function *, raw_ostream &O);
+  void emitDemotedVars(const Function *, raw_ostream &);
+
+  bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
+                               MCOperand &MCOp);
+  void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
+
+  bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const;
+
+  LineReader *reader;
+  LineReader *getReader(const std::string &);
+
+  // Used to control the need to emit .generic() in the initializer of
+  // module scope variables.
+  // Although ptx supports the hybrid mode like the following,
+  //    .global .u32 a;
+  //    .global .u32 b;
+  //    .global .u32 addr[] = {a, generic(b)}
+  // we have difficulty representing the difference in the NVVM IR.
+  //
+  // Since the address value should always be generic in CUDA C and always
+  // be specific in OpenCL, we use this simple control here.
+  //
+  bool EmitGeneric;
+
+public:
+  NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)),
+        EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() ==
+                    NVPTX::CUDA) {
+    CurrentBankselLabelInBasicBlock = "";
+    reader = nullptr;
+  }
+
+  ~NVPTXAsmPrinter() {
+    if (!reader)
+      delete reader;
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>();
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AsmPrinter::getAnalysisUsage(AU);
+  }
+
+  bool ignoreLoc(const MachineInstr &);
+
+  std::string getVirtualRegisterName(unsigned) const;
+
+  DebugLoc prevDebugLoc;
+  void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
new file mode 100644
index 000000000000..7d4be8e809cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -0,0 +1,84 @@
+//===-- NVPTXAssignValidGlobalNames.cpp - Assign valid names to globals ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Clean up the names of global variables in the module to not contain symbols
+// that are invalid in PTX.
+//
+// Currently NVPTX, like other backends, relies on generic symbol name
+// sanitizing done by MC. However, the ptxas assembler is more stringent and
+// disallows some additional characters in symbol names. This pass makes sure
+// such names do not reach MC at all.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+namespace {
+/// \brief NVPTXAssignValidGlobalNames
+class NVPTXAssignValidGlobalNames : public ModulePass {
+public:
+  static char ID;
+  NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  /// \brief Clean up the name to remove symbols invalid in PTX.
+  std::string cleanUpName(StringRef Name);
+};
+}
+
+char NVPTXAssignValidGlobalNames::ID = 0;
+
+namespace llvm {
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names",
+                "Assign valid PTX names to globals", false, false)
+
+bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
+  for (GlobalVariable &GV : M.globals()) {
+    // We are only allowed to rename local symbols.
+    if (GV.hasLocalLinkage()) {
+      // setName doesn't do extra work if the name does not change.
+      // Note: this does not create collisions - if setName is asked to set the
+      // name to something that already exists, it adds a proper postfix to
+      // avoid collisions.
+      GV.setName(cleanUpName(GV.getName()));
+    }
+  }
+
+  return true;
+}
+
+std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
+  std::string ValidName;
+  raw_string_ostream ValidNameStream(ValidName);
+  for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+    char C = Name[I];
+    if (C == '.' || C == '@') {
+      ValidNameStream << "_$_";
+    } else {
+      ValidNameStream << C;
+    }
+  }
+
+  return ValidNameStream.str();
+}
+
+ModulePass *llvm::createNVPTXAssignValidGlobalNamesPass() {
+  return new NVPTXAssignValidGlobalNames();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 000000000000..6ced2f6967cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,78 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+NVPTXFrameLowering::NVPTXFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {}
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  if (MF.getFrameInfo().hasStackObjects()) {
+    assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+    MachineInstr *MI = &MBB.front();
+    MachineRegisterInfo &MR = MF.getRegInfo();
+
+    // This instruction really occurs before first instruction
+    // in the BB, so giving it no debug location.
+    DebugLoc dl = DebugLoc();
+
+    // Emits
+    //   mov %SPL, %depot;
+    //   cvta.local %SP, %SPL;
+    // for local address accesses in MF.
+    bool Is64Bit =
+        static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit();
+    unsigned CvtaLocalOpcode =
+        (Is64Bit ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes);
+    unsigned MovDepotOpcode =
+        (Is64Bit ? NVPTX::MOV_DEPOT_ADDR_64 : NVPTX::MOV_DEPOT_ADDR);
+    if (!MR.use_empty(NVPTX::VRFrame)) {
+      // If %SP is not used, do not bother emitting "cvta.local %SP, %SPL".
+      MI = BuildMI(MBB, MI, dl,
+                   MF.getSubtarget().getInstrInfo()->get(CvtaLocalOpcode),
+                   NVPTX::VRFrame)
+               .addReg(NVPTX::VRFrameLocal);
+    }
+    BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
+            NVPTX::VRFrameLocal)
+        .addImm(MF.getFunctionNumber());
+  }
+}
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  return MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 000000000000..320ca9a2f095
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,36 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class NVPTXSubtarget;
+class NVPTXFrameLowering : public TargetFrameLowering {
+public:
+  explicit NVPTXFrameLowering();
+
+  bool hasFP(const MachineFunction &MF) const override;
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 000000000000..390776212ce7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,354 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+  static char ID;
+
+  GenericToNVVM() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+private:
+  Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+                         IRBuilder<> &Builder);
+  Value *remapConstant(Module *M, Function *F, Constant *C,
+                       IRBuilder<> &Builder);
+  Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+                                                Constant *C,
+                                                IRBuilder<> &Builder);
+  Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                           IRBuilder<> &Builder);
+
+  typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+  typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+  GVMapTy GVMap;
+  ConstantToValueMapTy ConstantToValueMap;
+};
+} // end namespace
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+    GenericToNVVM, "generic-to-nvvm",
+    "Ensure that the global variables are in the global address space", false,
+    false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+  // Create a clone of each global variable that has the default address space.
+  // The clone is created with the global address space  specifier, and the pair
+  // of original global variable and its clone is placed in the GVMap for later
+  // use.
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E;) {
+    GlobalVariable *GV = &*I++;
+    if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+        !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+        !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) {
+      GlobalVariable *NewGV = new GlobalVariable(
+          M, GV->getValueType(), GV->isConstant(),
+          GV->getLinkage(),
+          GV->hasInitializer() ? GV->getInitializer() : nullptr,
+          "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+      NewGV->copyAttributesFrom(GV);
+      GVMap[GV] = NewGV;
+    }
+  }
+
+  // Return immediately, if every global variable has a specific address space
+  // specifier.
+  if (GVMap.empty()) {
+    return false;
+  }
+
+  // Walk through the instructions in function defitinions, and replace any use
+  // of original global variables in GVMap with a use of the corresponding
+  // copies in GVMap.  If necessary, promote constants to instructions.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->isDeclaration()) {
+      continue;
+    }
+    IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+    for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+         ++BBI) {
+      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+           ++II) {
+        for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+          Value *Operand = II->getOperand(i);
+          if (isa<Constant>(Operand)) {
+            II->setOperand(
+                i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder));
+          }
+        }
+      }
+    }
+    ConstantToValueMap.clear();
+  }
+
+  // Copy GVMap over to a standard value map.
+  ValueToValueMapTy VM;
+  for (auto I = GVMap.begin(), E = GVMap.end(); I != E; ++I)
+    VM[I->first] = I->second;
+
+  // Walk through the global variable  initializers, and replace any use of
+  // original global variables in GVMap with a use of the corresponding copies
+  // in GVMap.  The copies need to be bitcast to the original global variable
+  // types, as we cannot use cvta in global variable initializers.
+  for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+    GlobalVariable *GV = I->first;
+    GlobalVariable *NewGV = I->second;
+
+    // Remove GV from the map so that it can be RAUWed.  Note that
+    // DenseMap::erase() won't invalidate any iterators but this one.
+    auto Next = std::next(I);
+    GVMap.erase(I);
+    I = Next;
+
+    Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
+    // At this point, the remaining uses of GV should be found only in global
+    // variable initializers, as other uses have been already been removed
+    // while walking through the instructions in function definitions.
+    GV->replaceAllUsesWith(BitCastNewGV);
+    std::string Name = GV->getName();
+    GV->eraseFromParent();
+    NewGV->setName(Name);
+  }
+  assert(GVMap.empty() && "Expected it to be empty by now");
+
+  return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+                                      GlobalVariable *GV,
+                                      IRBuilder<> &Builder) {
+  PointerType *GVType = GV->getType();
+  Value *CVTA = nullptr;
+
+  // See if the address space conversion requires the operand to be bitcast
+  // to i8 addrspace(n)* first.
+  EVT ExtendedGVType = EVT::getEVT(GV->getValueType(), true);
+  if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+    // A bitcast to i8 addrspace(n)* on the operand is needed.
+    LLVMContext &Context = M->getContext();
+    unsigned int AddrSpace = GVType->getAddressSpace();
+    Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+    CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+    // Insert the address space conversion.
+    Type *ResultType =
+        PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, {ResultType, DestTy});
+    CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+    // Another bitcast from i8 * to <the element type of GVType> * is
+    // required.
+    DestTy =
+        PointerType::get(GV->getValueType(), llvm::ADDRESS_SPACE_GENERIC);
+    CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+  } else {
+    // A simple CVTA is enough.
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(PointerType::get(GV->getValueType(),
+                                          llvm::ADDRESS_SPACE_GENERIC));
+    ParamTypes.push_back(GVType);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+  }
+
+  return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+                                    IRBuilder<> &Builder) {
+  // If the constant C has been converted already in the given function  F, just
+  // return the converted value.
+  ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+  if (CTII != ConstantToValueMap.end()) {
+    return CTII->second;
+  }
+
+  Value *NewValue = C;
+  if (isa<GlobalVariable>(C)) {
+    // If the constant C is a global variable and is found in  GVMap, generate a
+    // set set of instructions that convert the clone of C with the global
+    // address space specifier to a generic pointer.
+    // The constant C cannot be used here, as it will be erased from the
+    // module eventually.  And the clone of C with the global address space
+    // specifier cannot be used here either, as it will affect the types of
+    // other instructions in the function.  Hence, this address space conversion
+    // is required.
+    GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+    if (I != GVMap.end()) {
+      NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+    }
+  } else if (isa<ConstantAggregate>(C)) {
+    // If any element in the constant vector or aggregate C is or uses a global
+    // variable in GVMap, the constant C needs to be reconstructed, using a set
+    // of instructions.
+    NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+  } else if (isa<ConstantExpr>(C)) {
+    // If any operand in the constant expression C is or uses a global variable
+    // in GVMap, the constant expression C needs to be reconstructed, using a
+    // set of instructions.
+    NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+  }
+
+  ConstantToValueMap[C] = NewValue;
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+    Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any element is or uses a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the elements has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the elements has been  modified, construct the equivalent
+  // vector or aggregate value with a set instructions and the converted
+  // elements.
+  Value *NewValue = UndefValue::get(C->getType());
+  if (isa<ConstantVector>(C)) {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+      NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+    }
+  } else {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      NewValue =
+          Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+    }
+  }
+
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                                        IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in  GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the operands has been modified, construct the instruction with
+  // the converted operands.
+  unsigned Opcode = C->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+    // CompareConstantExpr (icmp)
+    return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+                              NewOperands[0], NewOperands[1]);
+  case Instruction::FCmp:
+    // CompareConstantExpr (fcmp)
+    llvm_unreachable("Address space conversion should have no effect "
+                     "on float point CompareConstantExpr (fcmp)!");
+  case Instruction::ExtractElement:
+    // ExtractElementConstantExpr
+    return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+  case Instruction::InsertElement:
+    // InsertElementConstantExpr
+    return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ShuffleVector:
+    // ShuffleVector
+    return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ExtractValue:
+    // ExtractValueConstantExpr
+    return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+  case Instruction::InsertValue:
+    // InsertValueConstantExpr
+    return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+                                     C->getIndices());
+  case Instruction::GetElementPtr:
+    // GetElementPtrConstantExpr
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateGEP(
+                     cast<GEPOperator>(C)->getSourceElementType(),
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateInBoundsGEP(
+                     cast<GEPOperator>(C)->getSourceElementType(),
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+  case Instruction::Select:
+    // SelectConstantExpr
+    return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+  default:
+    // BinaryConstantExpr
+    if (Instruction::isBinaryOp(Opcode)) {
+      return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+                                 NewOperands[0], NewOperands[1]);
+    }
+    // UnaryConstantExpr
+    if (Instruction::isCast(Opcode)) {
+      return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+                                NewOperands[0], C->getType());
+    }
+    llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr");
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 000000000000..43c478f4212f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,5259 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelDAGToDAG.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-isel"
+
+static cl::opt<int> UsePrecDivF32(
+    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+    cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+             " IEEE Compliant F32 div.rnd if available."),
+    cl::init(2));
+
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
+          cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+          cl::init(true));
+
+static cl::opt<bool>
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+           cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+           cl::init(false));
+
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                       llvm::CodeGenOpt::Level OptLevel) {
+  return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                                     CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel), TM(tm) {
+  doMulWide = (OptLevel > 0);
+}
+
+bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+    Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
+    return SelectionDAGISel::runOnMachineFunction(MF);
+}
+
+int NVPTXDAGToDAGISel::getDivF32Level() const {
+  if (UsePrecDivF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-div32=N is used on the command-line, always honor it
+    return UsePrecDivF32;
+  } else {
+    // Otherwise, use div.approx if fast math is enabled
+    if (TM.Options.UnsafeFPMath)
+      return 0;
+    else
+      return 2;
+  }
+}
+
+bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
+  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+    return UsePrecSqrtF32;
+  } else {
+    // Otherwise, use sqrt.approx if fast math is enabled
+    return !TM.Options.UnsafeFPMath;
+  }
+}
+
+bool NVPTXDAGToDAGISel::useF32FTZ() const {
+  if (FtzEnabled.getNumOccurrences() > 0) {
+    // If nvptx-f32ftz is used on the command-line, always honor it
+    return FtzEnabled;
+  } else {
+    const Function *F = MF->getFunction();
+    // Otherwise, check for an nvptx-f32ftz attribute on the function
+    if (F->hasFnAttribute("nvptx-f32ftz"))
+      return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+    else
+      return false;
+  }
+}
+
+bool NVPTXDAGToDAGISel::allowFMA() const {
+  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
+  return TL->allowFMA(*MF, OptLevel);
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+void NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return; // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  case ISD::LOAD:
+    if (tryLoad(N))
+      return;
+    break;
+  case ISD::STORE:
+    if (tryStore(N))
+      return;
+    break;
+  case NVPTXISD::LoadV2:
+  case NVPTXISD::LoadV4:
+    if (tryLoadVector(N))
+      return;
+    break;
+  case NVPTXISD::LDGV2:
+  case NVPTXISD::LDGV4:
+  case NVPTXISD::LDUV2:
+  case NVPTXISD::LDUV4:
+    if (tryLDGLDU(N))
+      return;
+    break;
+  case NVPTXISD::StoreV2:
+  case NVPTXISD::StoreV4:
+    if (tryStoreVector(N))
+      return;
+    break;
+  case NVPTXISD::LoadParam:
+  case NVPTXISD::LoadParamV2:
+  case NVPTXISD::LoadParamV4:
+    if (tryLoadParam(N))
+      return;
+    break;
+  case NVPTXISD::StoreRetval:
+  case NVPTXISD::StoreRetvalV2:
+  case NVPTXISD::StoreRetvalV4:
+    if (tryStoreRetval(N))
+      return;
+    break;
+  case NVPTXISD::StoreParam:
+  case NVPTXISD::StoreParamV2:
+  case NVPTXISD::StoreParamV4:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParamU32:
+    if (tryStoreParam(N))
+      return;
+    break;
+  case ISD::INTRINSIC_WO_CHAIN:
+    if (tryIntrinsicNoChain(N))
+      return;
+    break;
+  case ISD::INTRINSIC_W_CHAIN:
+    if (tryIntrinsicChain(N))
+      return;
+    break;
+  case NVPTXISD::Tex1DFloatS32:
+  case NVPTXISD::Tex1DFloatFloat:
+  case NVPTXISD::Tex1DFloatFloatLevel:
+  case NVPTXISD::Tex1DFloatFloatGrad:
+  case NVPTXISD::Tex1DS32S32:
+  case NVPTXISD::Tex1DS32Float:
+  case NVPTXISD::Tex1DS32FloatLevel:
+  case NVPTXISD::Tex1DS32FloatGrad:
+  case NVPTXISD::Tex1DU32S32:
+  case NVPTXISD::Tex1DU32Float:
+  case NVPTXISD::Tex1DU32FloatLevel:
+  case NVPTXISD::Tex1DU32FloatGrad:
+  case NVPTXISD::Tex1DArrayFloatS32:
+  case NVPTXISD::Tex1DArrayFloatFloat:
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+  case NVPTXISD::Tex1DArrayS32S32:
+  case NVPTXISD::Tex1DArrayS32Float:
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+  case NVPTXISD::Tex1DArrayU32S32:
+  case NVPTXISD::Tex1DArrayU32Float:
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+  case NVPTXISD::Tex2DFloatS32:
+  case NVPTXISD::Tex2DFloatFloat:
+  case NVPTXISD::Tex2DFloatFloatLevel:
+  case NVPTXISD::Tex2DFloatFloatGrad:
+  case NVPTXISD::Tex2DS32S32:
+  case NVPTXISD::Tex2DS32Float:
+  case NVPTXISD::Tex2DS32FloatLevel:
+  case NVPTXISD::Tex2DS32FloatGrad:
+  case NVPTXISD::Tex2DU32S32:
+  case NVPTXISD::Tex2DU32Float:
+  case NVPTXISD::Tex2DU32FloatLevel:
+  case NVPTXISD::Tex2DU32FloatGrad:
+  case NVPTXISD::Tex2DArrayFloatS32:
+  case NVPTXISD::Tex2DArrayFloatFloat:
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+  case NVPTXISD::Tex2DArrayS32S32:
+  case NVPTXISD::Tex2DArrayS32Float:
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+  case NVPTXISD::Tex2DArrayU32S32:
+  case NVPTXISD::Tex2DArrayU32Float:
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+  case NVPTXISD::Tex3DFloatS32:
+  case NVPTXISD::Tex3DFloatFloat:
+  case NVPTXISD::Tex3DFloatFloatLevel:
+  case NVPTXISD::Tex3DFloatFloatGrad:
+  case NVPTXISD::Tex3DS32S32:
+  case NVPTXISD::Tex3DS32Float:
+  case NVPTXISD::Tex3DS32FloatLevel:
+  case NVPTXISD::Tex3DS32FloatGrad:
+  case NVPTXISD::Tex3DU32S32:
+  case NVPTXISD::Tex3DU32Float:
+  case NVPTXISD::Tex3DU32FloatLevel:
+  case NVPTXISD::Tex3DU32FloatGrad:
+  case NVPTXISD::TexCubeFloatFloat:
+  case NVPTXISD::TexCubeFloatFloatLevel:
+  case NVPTXISD::TexCubeS32Float:
+  case NVPTXISD::TexCubeS32FloatLevel:
+  case NVPTXISD::TexCubeU32Float:
+  case NVPTXISD::TexCubeU32FloatLevel:
+  case NVPTXISD::TexCubeArrayFloatFloat:
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexCubeArrayS32Float:
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+  case NVPTXISD::TexCubeArrayU32Float:
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4R2DFloatFloat:
+  case NVPTXISD::Tld4G2DFloatFloat:
+  case NVPTXISD::Tld4B2DFloatFloat:
+  case NVPTXISD::Tld4A2DFloatFloat:
+  case NVPTXISD::Tld4R2DS64Float:
+  case NVPTXISD::Tld4G2DS64Float:
+  case NVPTXISD::Tld4B2DS64Float:
+  case NVPTXISD::Tld4A2DS64Float:
+  case NVPTXISD::Tld4R2DU64Float:
+  case NVPTXISD::Tld4G2DU64Float:
+  case NVPTXISD::Tld4B2DU64Float:
+  case NVPTXISD::Tld4A2DU64Float:
+  case NVPTXISD::TexUnified1DFloatS32:
+  case NVPTXISD::TexUnified1DFloatFloat:
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+  case NVPTXISD::TexUnified1DS32S32:
+  case NVPTXISD::TexUnified1DS32Float:
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+  case NVPTXISD::TexUnified1DU32S32:
+  case NVPTXISD::TexUnified1DU32Float:
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified1DArrayS32S32:
+  case NVPTXISD::TexUnified1DArrayS32Float:
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayU32S32:
+  case NVPTXISD::TexUnified1DArrayU32Float:
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified2DFloatS32:
+  case NVPTXISD::TexUnified2DFloatFloat:
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+  case NVPTXISD::TexUnified2DS32S32:
+  case NVPTXISD::TexUnified2DS32Float:
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+  case NVPTXISD::TexUnified2DU32S32:
+  case NVPTXISD::TexUnified2DU32Float:
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified2DArrayS32S32:
+  case NVPTXISD::TexUnified2DArrayS32Float:
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayU32S32:
+  case NVPTXISD::TexUnified2DArrayU32Float:
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified3DFloatS32:
+  case NVPTXISD::TexUnified3DFloatFloat:
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+  case NVPTXISD::TexUnified3DS32S32:
+  case NVPTXISD::TexUnified3DS32Float:
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+  case NVPTXISD::TexUnified3DU32S32:
+  case NVPTXISD::TexUnified3DU32Float:
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeS32Float:
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeU32Float:
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    if (tryTextureIntrinsic(N))
+      return;
+    break;
+  case NVPTXISD::Suld1DI8Clamp:
+  case NVPTXISD::Suld1DI16Clamp:
+  case NVPTXISD::Suld1DI32Clamp:
+  case NVPTXISD::Suld1DI64Clamp:
+  case NVPTXISD::Suld1DV2I8Clamp:
+  case NVPTXISD::Suld1DV2I16Clamp:
+  case NVPTXISD::Suld1DV2I32Clamp:
+  case NVPTXISD::Suld1DV2I64Clamp:
+  case NVPTXISD::Suld1DV4I8Clamp:
+  case NVPTXISD::Suld1DV4I16Clamp:
+  case NVPTXISD::Suld1DV4I32Clamp:
+  case NVPTXISD::Suld1DArrayI8Clamp:
+  case NVPTXISD::Suld1DArrayI16Clamp:
+  case NVPTXISD::Suld1DArrayI32Clamp:
+  case NVPTXISD::Suld1DArrayI64Clamp:
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+  case NVPTXISD::Suld2DI8Clamp:
+  case NVPTXISD::Suld2DI16Clamp:
+  case NVPTXISD::Suld2DI32Clamp:
+  case NVPTXISD::Suld2DI64Clamp:
+  case NVPTXISD::Suld2DV2I8Clamp:
+  case NVPTXISD::Suld2DV2I16Clamp:
+  case NVPTXISD::Suld2DV2I32Clamp:
+  case NVPTXISD::Suld2DV2I64Clamp:
+  case NVPTXISD::Suld2DV4I8Clamp:
+  case NVPTXISD::Suld2DV4I16Clamp:
+  case NVPTXISD::Suld2DV4I32Clamp:
+  case NVPTXISD::Suld2DArrayI8Clamp:
+  case NVPTXISD::Suld2DArrayI16Clamp:
+  case NVPTXISD::Suld2DArrayI32Clamp:
+  case NVPTXISD::Suld2DArrayI64Clamp:
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+  case NVPTXISD::Suld3DI8Clamp:
+  case NVPTXISD::Suld3DI16Clamp:
+  case NVPTXISD::Suld3DI32Clamp:
+  case NVPTXISD::Suld3DI64Clamp:
+  case NVPTXISD::Suld3DV2I8Clamp:
+  case NVPTXISD::Suld3DV2I16Clamp:
+  case NVPTXISD::Suld3DV2I32Clamp:
+  case NVPTXISD::Suld3DV2I64Clamp:
+  case NVPTXISD::Suld3DV4I8Clamp:
+  case NVPTXISD::Suld3DV4I16Clamp:
+  case NVPTXISD::Suld3DV4I32Clamp:
+  case NVPTXISD::Suld1DI8Trap:
+  case NVPTXISD::Suld1DI16Trap:
+  case NVPTXISD::Suld1DI32Trap:
+  case NVPTXISD::Suld1DI64Trap:
+  case NVPTXISD::Suld1DV2I8Trap:
+  case NVPTXISD::Suld1DV2I16Trap:
+  case NVPTXISD::Suld1DV2I32Trap:
+  case NVPTXISD::Suld1DV2I64Trap:
+  case NVPTXISD::Suld1DV4I8Trap:
+  case NVPTXISD::Suld1DV4I16Trap:
+  case NVPTXISD::Suld1DV4I32Trap:
+  case NVPTXISD::Suld1DArrayI8Trap:
+  case NVPTXISD::Suld1DArrayI16Trap:
+  case NVPTXISD::Suld1DArrayI32Trap:
+  case NVPTXISD::Suld1DArrayI64Trap:
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+  case NVPTXISD::Suld1DArrayV2I64Trap:
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+  case NVPTXISD::Suld2DI8Trap:
+  case NVPTXISD::Suld2DI16Trap:
+  case NVPTXISD::Suld2DI32Trap:
+  case NVPTXISD::Suld2DI64Trap:
+  case NVPTXISD::Suld2DV2I8Trap:
+  case NVPTXISD::Suld2DV2I16Trap:
+  case NVPTXISD::Suld2DV2I32Trap:
+  case NVPTXISD::Suld2DV2I64Trap:
+  case NVPTXISD::Suld2DV4I8Trap:
+  case NVPTXISD::Suld2DV4I16Trap:
+  case NVPTXISD::Suld2DV4I32Trap:
+  case NVPTXISD::Suld2DArrayI8Trap:
+  case NVPTXISD::Suld2DArrayI16Trap:
+  case NVPTXISD::Suld2DArrayI32Trap:
+  case NVPTXISD::Suld2DArrayI64Trap:
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+  case NVPTXISD::Suld2DArrayV2I64Trap:
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+  case NVPTXISD::Suld3DI8Trap:
+  case NVPTXISD::Suld3DI16Trap:
+  case NVPTXISD::Suld3DI32Trap:
+  case NVPTXISD::Suld3DI64Trap:
+  case NVPTXISD::Suld3DV2I8Trap:
+  case NVPTXISD::Suld3DV2I16Trap:
+  case NVPTXISD::Suld3DV2I32Trap:
+  case NVPTXISD::Suld3DV2I64Trap:
+  case NVPTXISD::Suld3DV4I8Trap:
+  case NVPTXISD::Suld3DV4I16Trap:
+  case NVPTXISD::Suld3DV4I32Trap:
+  case NVPTXISD::Suld1DI8Zero:
+  case NVPTXISD::Suld1DI16Zero:
+  case NVPTXISD::Suld1DI32Zero:
+  case NVPTXISD::Suld1DI64Zero:
+  case NVPTXISD::Suld1DV2I8Zero:
+  case NVPTXISD::Suld1DV2I16Zero:
+  case NVPTXISD::Suld1DV2I32Zero:
+  case NVPTXISD::Suld1DV2I64Zero:
+  case NVPTXISD::Suld1DV4I8Zero:
+  case NVPTXISD::Suld1DV4I16Zero:
+  case NVPTXISD::Suld1DV4I32Zero:
+  case NVPTXISD::Suld1DArrayI8Zero:
+  case NVPTXISD::Suld1DArrayI16Zero:
+  case NVPTXISD::Suld1DArrayI32Zero:
+  case NVPTXISD::Suld1DArrayI64Zero:
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+  case NVPTXISD::Suld2DI8Zero:
+  case NVPTXISD::Suld2DI16Zero:
+  case NVPTXISD::Suld2DI32Zero:
+  case NVPTXISD::Suld2DI64Zero:
+  case NVPTXISD::Suld2DV2I8Zero:
+  case NVPTXISD::Suld2DV2I16Zero:
+  case NVPTXISD::Suld2DV2I32Zero:
+  case NVPTXISD::Suld2DV2I64Zero:
+  case NVPTXISD::Suld2DV4I8Zero:
+  case NVPTXISD::Suld2DV4I16Zero:
+  case NVPTXISD::Suld2DV4I32Zero:
+  case NVPTXISD::Suld2DArrayI8Zero:
+  case NVPTXISD::Suld2DArrayI16Zero:
+  case NVPTXISD::Suld2DArrayI32Zero:
+  case NVPTXISD::Suld2DArrayI64Zero:
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+  case NVPTXISD::Suld3DI8Zero:
+  case NVPTXISD::Suld3DI16Zero:
+  case NVPTXISD::Suld3DI32Zero:
+  case NVPTXISD::Suld3DI64Zero:
+  case NVPTXISD::Suld3DV2I8Zero:
+  case NVPTXISD::Suld3DV2I16Zero:
+  case NVPTXISD::Suld3DV2I32Zero:
+  case NVPTXISD::Suld3DV2I64Zero:
+  case NVPTXISD::Suld3DV4I8Zero:
+  case NVPTXISD::Suld3DV4I16Zero:
+  case NVPTXISD::Suld3DV4I32Zero:
+    if (trySurfaceIntrinsic(N))
+      return;
+    break;
+  case ISD::AND:
+  case ISD::SRA:
+  case ISD::SRL:
+    // Try to select BFE
+    if (tryBFE(N))
+      return;
+    break;
+  case ISD::ADDRSPACECAST:
+    SelectAddrSpaceCast(N);
+    return;
+  default:
+    break;
+  }
+  SelectCode(N);
+}
+
+bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IID) {
+  default:
+    return false;
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_p:
+    return tryLDGLDU(N);
+  }
+}
+
+static unsigned int getCodeAddrSpace(MemSDNode *N) {
+  const Value *Src = N->getMemOperand()->getValue();
+
+  if (!Src)
+    return NVPTX::PTXLdStInstCode::GENERIC;
+
+  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
+    switch (PT->getAddressSpace()) {
+    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+    default: break;
+    }
+  }
+  return NVPTX::PTXLdStInstCode::GENERIC;
+}
+
+static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
+                          unsigned CodeAddrSpace, MachineFunction *F) {
+  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
+  // space.
+  //
+  // We have two ways of identifying invariant loads: Loads may be explicitly
+  // marked as invariant, or we may infer them to be invariant.
+  //
+  // We currently infer invariance only for kernel function pointer params that
+  // are noalias (i.e. __restrict) and never written to.
+  //
+  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
+  // not during the SelectionDAG phase).
+  //
+  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
+  // explicitly invariant loads because these are how clang tells us to use ldg
+  // when the user uses a builtin.
+  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
+    return false;
+
+  if (N->isInvariant())
+    return true;
+
+  // Load wasn't explicitly invariant.  Attempt to infer invariance.
+  if (!isKernelFunction(*F->getFunction()))
+    return false;
+
+  // We use GetUnderlyingObjects() here instead of
+  // GetUnderlyingObject() mainly because the former looks through phi
+  // nodes while the latter does not. We need to look through phi
+  // nodes to handle pointer induction variables.
+  SmallVector<Value *, 8> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
+                       Objs, F->getDataLayout());
+  for (Value *Obj : Objs) {
+    auto *A = dyn_cast<const Argument>(Obj);
+    if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false;
+  }
+
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IID) {
+  default:
+    return false;
+  case Intrinsic::nvvm_texsurf_handle_internal:
+    SelectTexSurfHandle(N);
+    return true;
+  }
+}
+
+void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+  // Op 0 is the intrinsic ID
+  SDValue Wrapper = N->getOperand(1);
+  SDValue GlobalVal = Wrapper.getOperand(0);
+  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
+                                        MVT::i64, GlobalVal));
+}
+
+void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+  SDValue Src = N->getOperand(0);
+  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
+  unsigned DstAddrSpace = CastN->getDestAddressSpace();
+
+  assert(SrcAddrSpace != DstAddrSpace &&
+         "addrspacecast must be between different address spaces");
+
+  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
+    // Specific to generic
+    unsigned Opc;
+    switch (SrcAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
+      break;
+    }
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+                                          Src));
+    return;
+  } else {
+    // Generic to specific
+    if (SrcAddrSpace != 0)
+      report_fatal_error("Cannot cast between two non-generic address spaces");
+    unsigned Opc;
+    switch (DstAddrSpace) {
+    default: report_fatal_error("Bad address space in addrspacecast");
+    case ADDRESS_SPACE_GLOBAL:
+      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
+                         : NVPTX::cvta_to_global_yes;
+      break;
+    case ADDRESS_SPACE_SHARED:
+      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64
+                         : NVPTX::cvta_to_shared_yes;
+      break;
+    case ADDRESS_SPACE_CONST:
+      Opc =
+          TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
+      break;
+    case ADDRESS_SPACE_LOCAL:
+      Opc =
+          TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
+      break;
+    case ADDRESS_SPACE_PARAM:
+      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
+                         : NVPTX::nvvm_ptr_gen_to_param;
+      break;
+    }
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
+                                          Src));
+    return;
+  }
+}
+
+bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
+  SDLoc dl(N);
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+  SDNode *NVPTXLD = nullptr;
+
+  // do not support pre/post inc/dec
+  if (LD->isIndexed())
+    return false;
+
+  if (!LoadedVT.isSimple())
+    return false;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(LD);
+
+  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
+    return tryLDGLDU(N);
+  }
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = LD->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return false;
+  }
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int fromType;
+  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+    fromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    fromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
+
+  if (SelectDirectAddr(N1, Addr)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_avar;
+      break;
+    default:
+      return false;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+                          : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_asi;
+      break;
+    default:
+      return false;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+                          : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+    if (TM.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari_64;
+        break;
+      default:
+        return false;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari;
+        break;
+      default:
+        return false;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  } else {
+    if (TM.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg_64;
+        break;
+      default:
+        return false;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg;
+        break;
+      default:
+        return false;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
+                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+                      getI32Imm(fromTypeWidth, dl), N1, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+  }
+
+  if (!NVPTXLD)
+    return false;
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, NVPTXLD);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *LD;
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT LoadedVT = MemSD->getMemoryVT();
+
+  if (!LoadedVT.isSimple())
+    return false;
+
+  // Address Space Setting
+  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
+
+  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
+    return tryLDGLDU(N);
+  }
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
+  unsigned int FromType;
+  // The last operand holds the original LoadSDNode::getExtensionType() value
+  unsigned ExtensionType = cast<ConstantSDNode>(
+      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
+  if (ExtensionType == ISD::SEXTLOAD)
+    FromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    FromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    FromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::LoadV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    break;
+  case NVPTXISD::LoadV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    break;
+  default:
+    return false;
+  }
+
+  EVT EltVT = N->getValueType(0);
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return false;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return false;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  } else {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+  }
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, LD);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1;
+  MemSDNode *Mem;
+  bool IsLDG = true;
+
+  // If this is an LDG intrinsic, the address is the third operand. If its an
+  // LDG/LDU SD node (from custom vector handling), then its the second operand
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    Op1 = N->getOperand(2);
+    Mem = cast<MemIntrinsicSDNode>(N);
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IID) {
+    default:
+      return false;
+    case Intrinsic::nvvm_ldg_global_f:
+    case Intrinsic::nvvm_ldg_global_i:
+    case Intrinsic::nvvm_ldg_global_p:
+      IsLDG = true;
+      break;
+    case Intrinsic::nvvm_ldu_global_f:
+    case Intrinsic::nvvm_ldu_global_i:
+    case Intrinsic::nvvm_ldu_global_p:
+      IsLDG = false;
+      break;
+    }
+  } else {
+    Op1 = N->getOperand(1);
+    Mem = cast<MemSDNode>(N);
+  }
+
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *LD;
+  SDValue Base, Offset, Addr;
+
+  EVT EltVT = Mem->getMemoryVT();
+  unsigned NumElts = 1;
+  if (EltVT.isVector()) {
+    NumElts = EltVT.getVectorNumElements();
+    EltVT = EltVT.getVectorElementType();
+  }
+
+  // Build the "promoted" result VTList for the load. If we are really loading
+  // i8s, then the return type will be promoted to i16 since we do not expose
+  // 8-bit registers in NVPTX.
+  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
+  SmallVector<EVT, 5> InstVTs;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    InstVTs.push_back(NodeVT);
+  }
+  InstVTs.push_back(MVT::Other);
+  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return false;
+    case ISD::INTRINSIC_W_CHAIN:
+      if (IsLDG) {
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
+          break;
+        }
+      } else {
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
+          break;
+        }
+      }
+      break;
+    case NVPTXISD::LDGV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDGV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case ISD::LOAD:
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LoadV2:
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case ISD::LOAD:
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LoadV2:
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+  } else {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case ISD::LOAD:
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LoadV2:
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case ISD::LOAD:
+      case ISD::INTRINSIC_W_CHAIN:
+        if (IsLDG) {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
+            break;
+          }
+        } else {
+          switch (EltVT.getSimpleVT().SimpleTy) {
+          default:
+            return false;
+          case MVT::i8:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
+            break;
+          case MVT::i16:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
+            break;
+          case MVT::i32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
+            break;
+          case MVT::i64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
+            break;
+          case MVT::f32:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
+            break;
+          case MVT::f64:
+            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
+            break;
+          }
+        }
+        break;
+      case NVPTXISD::LoadV2:
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
+          break;
+        }
+        break;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+  }
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = Mem->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  // For automatic generation of LDG (through SelectLoad[Vector], not the
+  // intrinsics), we may have an extending load like:
+  //
+  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
+  //
+  // In this case, the matching logic above will select a load for the original
+  // memory type (in this case, i8) and our types will not match (the node needs
+  // to return an i32 in this case). Our LDG/LDU nodes do not support the
+  // concept of sign-/zero-extension, so emulate it here by adding an explicit
+  // CVT instruction. Ptxas should clean up any redundancies here.
+
+  EVT OrigType = N->getValueType(0);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
+
+  if (OrigType != EltVT && LdNode) {
+    // We have an extending-load. The instruction we selected operates on the
+    // smaller type, but the SDNode we are replacing has the larger type. We
+    // need to emit a CVT to make the types match.
+    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+                                       EltVT.getSimpleVT(), IsSigned);
+
+    // For each output value, apply the manual sign/zero-extension and make sure
+    // all users of the load go through that CVT.
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue Res(LD, i);
+      SDValue OrigVal(N, i);
+
+      SDNode *CvtNode =
+        CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
+                               CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                                         DL, MVT::i32));
+      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
+    }
+  }
+
+  ReplaceNode(N, LD);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
+  SDLoc dl(N);
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  EVT StoreVT = ST->getMemoryVT();
+  SDNode *NVPTXST = nullptr;
+
+  // do not support pre/post inc/dec
+  if (ST->isIndexed())
+    return false;
+
+  if (!StoreVT.isSimple())
+    return false;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(ST);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = ST->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = StoreVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return false;
+  }
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  //
+  MVT ScalarVT = SimpleVT.getScalarType();
+  unsigned toTypeWidth = ScalarVT.getSizeInBits();
+  unsigned int toType;
+  if (ScalarVT.isFloatingPoint())
+    toType = NVPTX::PTXLdStInstCode::Float;
+  else
+    toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_avar;
+      break;
+    default:
+      return false;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+                      getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
+                      Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_asi;
+      break;
+    default:
+      return false;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+                      getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
+                      Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (TM.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari_64;
+        break;
+      default:
+        return false;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari;
+        break;
+      default:
+        return false;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+                      getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
+                      Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  } else {
+    if (TM.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg_64;
+        break;
+      default:
+        return false;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg;
+        break;
+      default:
+        return false;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
+                      getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
+                      getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
+                      Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+  }
+
+  if (!NVPTXST)
+    return false;
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+  ReplaceNode(N, NVPTXST);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  SDLoc DL(N);
+  SDNode *ST;
+  EVT EltVT = Op1.getValueType();
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT StoreVT = MemSD->getMemoryVT();
+
+  // Address Space Setting
+  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+    report_fatal_error("Cannot store to pointer that points to constant "
+                       "memory space");
+  }
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  assert(StoreVT.isSimple() && "Store value is not simple");
+  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
+  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
+  unsigned ToType;
+  if (ScalarVT.isFloatingPoint())
+    ToType = NVPTX::PTXLdStInstCode::Float;
+  else
+    ToType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  SmallVector<SDValue, 12> StOps;
+  SDValue N2;
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::StoreV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    N2 = N->getOperand(3);
+    break;
+  case NVPTXISD::StoreV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    StOps.push_back(N->getOperand(3));
+    StOps.push_back(N->getOperand(4));
+    N2 = N->getOperand(5);
+    break;
+  default:
+    return false;
+  }
+
+  StOps.push_back(getI32Imm(IsVolatile, DL));
+  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
+  StOps.push_back(getI32Imm(VecType, DL));
+  StOps.push_back(getI32Imm(ToType, DL));
+  StOps.push_back(getI32Imm(ToTypeWidth, DL));
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return false;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Addr);
+  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return false;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                          : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else {
+    if (TM.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return false;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return false;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(N2);
+  }
+
+  StOps.push_back(Chain);
+
+  ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, ST);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue Offset = Node->getOperand(2);
+  SDValue Flag = Node->getOperand(3);
+  SDLoc DL(Node);
+  MemSDNode *Mem = cast<MemSDNode>(Node);
+
+  unsigned VecSize;
+  switch (Node->getOpcode()) {
+  default:
+    return false;
+  case NVPTXISD::LoadParam:
+    VecSize = 1;
+    break;
+  case NVPTXISD::LoadParamV2:
+    VecSize = 2;
+    break;
+  case NVPTXISD::LoadParamV4:
+    VecSize = 4;
+    break;
+  }
+
+  EVT EltVT = Node->getValueType(0);
+  EVT MemVT = Mem->getMemoryVT();
+
+  unsigned Opc = 0;
+
+  switch (VecSize) {
+  default:
+    return false;
+  case 1:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemI16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemI32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemI64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemF32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV2I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV2I32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemV2I64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV2F32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV4I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV4I32;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDVTList VTs;
+  if (VecSize == 1) {
+    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
+  } else if (VecSize == 2) {
+    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
+  } else {
+    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
+    VTs = CurDAG->getVTList(EVTs);
+  }
+
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Offset = N->getOperand(1);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case NVPTXISD::StoreRetval:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreRetvalV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreRetvalV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 6> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 2));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+  Ops.push_back(Chain);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (NumElts) {
+  default:
+    return false;
+  case 1:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalI16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalI32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalI64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalF32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV2I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV2I32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalV2I64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV2F32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return false;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV4I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV4I32;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, Ret);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Param = N->getOperand(1);
+  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
+  SDValue Offset = N->getOperand(2);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return false;
+  case NVPTXISD::StoreParamU32:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParam:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreParamV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreParamV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 3));
+  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (N->getOpcode()) {
+  default:
+    switch (NumElts) {
+    default:
+      return false;
+    case 1:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamI16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamI64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamF32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamF64;
+        break;
+      }
+      break;
+    case 2:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV2I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV2I32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamV2I64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV2F32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamV2F64;
+        break;
+      }
+      break;
+    case 4:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV4I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV4I32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV4F32;
+        break;
+      }
+      break;
+    }
+    break;
+  // Special case: if we have a sign-extend/zero-extend node, insert the
+  // conversion instruction first, and use that as the value operand to
+  // the selected StoreParam node.
+  case NVPTXISD::StoreParamU32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  case NVPTXISD::StoreParamS32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  }
+
+  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  ReplaceNode(N, Ret);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+
+  switch (N->getOpcode()) {
+  default: return false;
+  case NVPTXISD::Tex1DFloatS32:
+    Opc = NVPTX::TEX_1D_F32_S32;
+    break;
+  case NVPTXISD::Tex1DFloatFloat:
+    Opc = NVPTX::TEX_1D_F32_F32;
+    break;
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DS32S32:
+    Opc = NVPTX::TEX_1D_S32_S32;
+    break;
+  case NVPTXISD::Tex1DS32Float:
+    Opc = NVPTX::TEX_1D_S32_F32;
+    break;
+  case NVPTXISD::Tex1DS32FloatLevel:
+    Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DS32FloatGrad:
+    Opc = NVPTX::TEX_1D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DU32S32:
+    Opc = NVPTX::TEX_1D_U32_S32;
+    break;
+  case NVPTXISD::Tex1DU32Float:
+    Opc = NVPTX::TEX_1D_U32_F32;
+    break;
+  case NVPTXISD::Tex1DU32FloatLevel:
+    Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DU32FloatGrad:
+    Opc = NVPTX::TEX_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayFloatS32:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloat:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayS32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayS32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayU32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayU32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DFloatS32:
+    Opc = NVPTX::TEX_2D_F32_S32;
+    break;
+  case NVPTXISD::Tex2DFloatFloat:
+    Opc = NVPTX::TEX_2D_F32_F32;
+    break;
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DS32S32:
+    Opc = NVPTX::TEX_2D_S32_S32;
+    break;
+  case NVPTXISD::Tex2DS32Float:
+    Opc = NVPTX::TEX_2D_S32_F32;
+    break;
+  case NVPTXISD::Tex2DS32FloatLevel:
+    Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DS32FloatGrad:
+    Opc = NVPTX::TEX_2D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DU32S32:
+    Opc = NVPTX::TEX_2D_U32_S32;
+    break;
+  case NVPTXISD::Tex2DU32Float:
+    Opc = NVPTX::TEX_2D_U32_F32;
+    break;
+  case NVPTXISD::Tex2DU32FloatLevel:
+    Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DU32FloatGrad:
+    Opc = NVPTX::TEX_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayFloatS32:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloat:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayS32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayS32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayU32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayU32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DFloatS32:
+    Opc = NVPTX::TEX_3D_F32_S32;
+    break;
+  case NVPTXISD::Tex3DFloatFloat:
+    Opc = NVPTX::TEX_3D_F32_F32;
+    break;
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    Opc = NVPTX::TEX_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DS32S32:
+    Opc = NVPTX::TEX_3D_S32_S32;
+    break;
+  case NVPTXISD::Tex3DS32Float:
+    Opc = NVPTX::TEX_3D_S32_F32;
+    break;
+  case NVPTXISD::Tex3DS32FloatLevel:
+    Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DS32FloatGrad:
+    Opc = NVPTX::TEX_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DU32S32:
+    Opc = NVPTX::TEX_3D_U32_S32;
+    break;
+  case NVPTXISD::Tex3DU32Float:
+    Opc = NVPTX::TEX_3D_U32_F32;
+    break;
+  case NVPTXISD::Tex3DU32FloatLevel:
+    Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DU32FloatGrad:
+    Opc = NVPTX::TEX_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexCubeFloatFloat:
+    Opc = NVPTX::TEX_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeS32Float:
+    Opc = NVPTX::TEX_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexCubeS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeU32Float:
+    Opc = NVPTX::TEX_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexCubeU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayS32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayU32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4R2DFloatFloat:
+    Opc = NVPTX::TLD4_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4G2DFloatFloat:
+    Opc = NVPTX::TLD4_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4B2DFloatFloat:
+    Opc = NVPTX::TLD4_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4A2DFloatFloat:
+    Opc = NVPTX::TLD4_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4R2DS64Float:
+    Opc = NVPTX::TLD4_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4G2DS64Float:
+    Opc = NVPTX::TLD4_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4B2DS64Float:
+    Opc = NVPTX::TLD4_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4A2DS64Float:
+    Opc = NVPTX::TLD4_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4R2DU64Float:
+    Opc = NVPTX::TLD4_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4G2DU64Float:
+    Opc = NVPTX::TLD4_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4B2DU64Float:
+    Opc = NVPTX::TLD4_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4A2DU64Float:
+    Opc = NVPTX::TLD4_A_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified3DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified3DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
+    break;
+  }
+
+  // Copy over operands
+  for (unsigned i = 1; i < N->getNumOperands(); ++i) {
+    Ops.push_back(N->getOperand(i));
+  }
+
+  Ops.push_back(Chain);
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue TexHandle = N->getOperand(1);
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+  switch (N->getOpcode()) {
+  default: return false;
+  case NVPTXISD::Suld1DI8Clamp:
+    Opc = NVPTX::SULD_1D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Clamp:
+    Opc = NVPTX::SULD_1D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Clamp:
+    Opc = NVPTX::SULD_1D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Clamp:
+    Opc = NVPTX::SULD_1D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Clamp:
+    Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Clamp:
+    Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Clamp:
+    Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Clamp:
+    Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Clamp:
+    Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Clamp:
+    Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Clamp:
+    Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Clamp:
+    Opc = NVPTX::SULD_2D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Clamp:
+    Opc = NVPTX::SULD_2D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Clamp:
+    Opc = NVPTX::SULD_2D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Clamp:
+    Opc = NVPTX::SULD_2D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Clamp:
+    Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Clamp:
+    Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Clamp:
+    Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Clamp:
+    Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Clamp:
+    Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Clamp:
+    Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Clamp:
+    Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Clamp:
+    Opc = NVPTX::SULD_3D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Clamp:
+    Opc = NVPTX::SULD_3D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Clamp:
+    Opc = NVPTX::SULD_3D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Clamp:
+    Opc = NVPTX::SULD_3D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Clamp:
+    Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Clamp:
+    Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Clamp:
+    Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Clamp:
+    Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Clamp:
+    Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Clamp:
+    Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Clamp:
+    Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI8Trap:
+    Opc = NVPTX::SULD_1D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Trap:
+    Opc = NVPTX::SULD_1D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Trap:
+    Opc = NVPTX::SULD_1D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Trap:
+    Opc = NVPTX::SULD_1D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Trap:
+    Opc = NVPTX::SULD_1D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Trap:
+    Opc = NVPTX::SULD_1D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Trap:
+    Opc = NVPTX::SULD_1D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Trap:
+    Opc = NVPTX::SULD_1D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Trap:
+    Opc = NVPTX::SULD_1D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Trap:
+    Opc = NVPTX::SULD_1D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Trap:
+    Opc = NVPTX::SULD_1D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Trap:
+    Opc = NVPTX::SULD_2D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Trap:
+    Opc = NVPTX::SULD_2D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Trap:
+    Opc = NVPTX::SULD_2D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Trap:
+    Opc = NVPTX::SULD_2D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Trap:
+    Opc = NVPTX::SULD_2D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Trap:
+    Opc = NVPTX::SULD_2D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Trap:
+    Opc = NVPTX::SULD_2D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Trap:
+    Opc = NVPTX::SULD_2D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Trap:
+    Opc = NVPTX::SULD_2D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Trap:
+    Opc = NVPTX::SULD_2D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Trap:
+    Opc = NVPTX::SULD_2D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Trap:
+    Opc = NVPTX::SULD_3D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Trap:
+    Opc = NVPTX::SULD_3D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Trap:
+    Opc = NVPTX::SULD_3D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Trap:
+    Opc = NVPTX::SULD_3D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Trap:
+    Opc = NVPTX::SULD_3D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Trap:
+    Opc = NVPTX::SULD_3D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Trap:
+    Opc = NVPTX::SULD_3D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Trap:
+    Opc = NVPTX::SULD_3D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Trap:
+    Opc = NVPTX::SULD_3D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Trap:
+    Opc = NVPTX::SULD_3D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Trap:
+    Opc = NVPTX::SULD_3D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI8Zero:
+    Opc = NVPTX::SULD_1D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Zero:
+    Opc = NVPTX::SULD_1D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Zero:
+    Opc = NVPTX::SULD_1D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Zero:
+    Opc = NVPTX::SULD_1D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Zero:
+    Opc = NVPTX::SULD_1D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Zero:
+    Opc = NVPTX::SULD_1D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Zero:
+    Opc = NVPTX::SULD_1D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Zero:
+    Opc = NVPTX::SULD_1D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Zero:
+    Opc = NVPTX::SULD_1D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Zero:
+    Opc = NVPTX::SULD_1D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Zero:
+    Opc = NVPTX::SULD_1D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Zero:
+    Opc = NVPTX::SULD_2D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Zero:
+    Opc = NVPTX::SULD_2D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Zero:
+    Opc = NVPTX::SULD_2D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Zero:
+    Opc = NVPTX::SULD_2D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Zero:
+    Opc = NVPTX::SULD_2D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Zero:
+    Opc = NVPTX::SULD_2D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Zero:
+    Opc = NVPTX::SULD_2D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Zero:
+    Opc = NVPTX::SULD_2D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Zero:
+    Opc = NVPTX::SULD_2D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Zero:
+    Opc = NVPTX::SULD_2D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Zero:
+    Opc = NVPTX::SULD_2D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Zero:
+    Opc = NVPTX::SULD_3D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Zero:
+    Opc = NVPTX::SULD_3D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Zero:
+    Opc = NVPTX::SULD_3D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Zero:
+    Opc = NVPTX::SULD_3D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Zero:
+    Opc = NVPTX::SULD_3D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Zero:
+    Opc = NVPTX::SULD_3D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Zero:
+    Opc = NVPTX::SULD_3D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Zero:
+    Opc = NVPTX::SULD_3D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Zero:
+    Opc = NVPTX::SULD_3D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Zero:
+    Opc = NVPTX::SULD_3D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Zero:
+    Opc = NVPTX::SULD_3D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  }
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
+  return true;
+}
+
+
+/// SelectBFE - Look for instruction sequences that can be made more efficient
+/// by using the 'bfe' (bit-field extract) PTX instruction
+bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Len;
+  SDValue Start;
+  SDValue Val;
+  bool IsSigned = false;
+
+  if (N->getOpcode() == ISD::AND) {
+    // Canonicalize the operands
+    // We want 'and %val, %mask'
+    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
+      std::swap(LHS, RHS);
+    }
+
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
+    if (!Mask) {
+      // We need a constant mask on the RHS of the AND
+      return false;
+    }
+
+    // Extract the mask bits
+    uint64_t MaskVal = Mask->getZExtValue();
+    if (!isMask_64(MaskVal)) {
+      // We *could* handle shifted masks here, but doing so would require an
+      // 'and' operation to fix up the low-order bits so we would trade
+      // shr+and for bfe+and, which has the same throughput
+      return false;
+    }
+
+    // How many bits are in our mask?
+    uint64_t NumBits = countTrailingOnes(MaskVal);
+    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
+
+    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
+      // We have a 'srl/and' pair, extract the effective start bit and length
+      Val = LHS.getNode()->getOperand(0);
+      Start = LHS.getNode()->getOperand(1);
+      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
+      if (StartConst) {
+        uint64_t StartVal = StartConst->getZExtValue();
+        // How many "good" bits do we have left?  "good" is defined here as bits
+        // that exist in the original value, not shifted in.
+        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
+        if (NumBits > GoodBits) {
+          // Do not handle the case where bits have been shifted in. In theory
+          // we could handle this, but the cost is likely higher than just
+          // emitting the srl/and pair.
+          return false;
+        }
+        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
+      } else {
+        // Do not handle the case where the shift amount (can be zero if no srl
+        // was found) is not constant. We could handle this case, but it would
+        // require run-time logic that would be more expensive than just
+        // emitting the srl/and pair.
+        return false;
+      }
+    } else {
+      // Do not handle the case where the LHS of the and is not a shift. While
+      // it would be trivial to handle this case, it would just transform
+      // 'and' -> 'bfe', but 'and' has higher-throughput.
+      return false;
+    }
+  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
+    if (LHS->getOpcode() == ISD::AND) {
+      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
+      if (!ShiftCnst) {
+        // Shift amount must be constant
+        return false;
+      }
+
+      uint64_t ShiftAmt = ShiftCnst->getZExtValue();
+
+      SDValue AndLHS = LHS->getOperand(0);
+      SDValue AndRHS = LHS->getOperand(1);
+
+      // Canonicalize the AND to have the mask on the RHS
+      if (isa<ConstantSDNode>(AndLHS)) {
+        std::swap(AndLHS, AndRHS);
+      }
+
+      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
+      if (!MaskCnst) {
+        // Mask must be constant
+        return false;
+      }
+
+      uint64_t MaskVal = MaskCnst->getZExtValue();
+      uint64_t NumZeros;
+      uint64_t NumBits;
+      if (isMask_64(MaskVal)) {
+        NumZeros = 0;
+        // The number of bits in the result bitfield will be the number of
+        // trailing ones (the AND) minus the number of bits we shift off
+        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
+      } else if (isShiftedMask_64(MaskVal)) {
+        NumZeros = countTrailingZeros(MaskVal);
+        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
+        // The number of bits in the result bitfield will be the number of
+        // trailing zeros plus the number of set bits in the mask minus the
+        // number of bits we shift off
+        NumBits = NumZeros + NumOnes - ShiftAmt;
+      } else {
+        // This is not a mask we can handle
+        return false;
+      }
+
+      if (ShiftAmt < NumZeros) {
+        // Handling this case would require extra logic that would make this
+        // transformation non-profitable
+        return false;
+      }
+
+      Val = AndLHS;
+      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
+      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
+    } else if (LHS->getOpcode() == ISD::SHL) {
+      // Here, we have a pattern like:
+      //
+      // (sra (shl val, NN), MM)
+      // or
+      // (srl (shl val, NN), MM)
+      //
+      // If MM >= NN, we can efficiently optimize this with bfe
+      Val = LHS->getOperand(0);
+
+      SDValue ShlRHS = LHS->getOperand(1);
+      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
+      if (!ShlCnst) {
+        // Shift amount must be constant
+        return false;
+      }
+      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
+
+      SDValue ShrRHS = RHS;
+      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
+      if (!ShrCnst) {
+        // Shift amount must be constant
+        return false;
+      }
+      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
+
+      // To avoid extra codegen and be profitable, we need Outer >= Inner
+      if (OuterShiftAmt < InnerShiftAmt) {
+        return false;
+      }
+
+      // If the outer shift is more than the type size, we have no bitfield to
+      // extract (since we also check that the inner shift is <= the outer shift
+      // then this also implies that the inner shift is < the type size)
+      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
+        return false;
+      }
+
+      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
+                                        MVT::i32);
+      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
+                                      DL, MVT::i32);
+
+      if (N->getOpcode() == ISD::SRA) {
+        // If we have a arithmetic right shift, we need to use the signed bfe
+        // variant
+        IsSigned = true;
+      }
+    } else {
+      // No can do...
+      return false;
+    }
+  } else {
+    // No can do...
+    return false;
+  }
+
+
+  unsigned Opc;
+  // For the BFE operations we form here from "and" and "srl", always use the
+  // unsigned variants.
+  if (Val.getValueType() == MVT::i32) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S32rii;
+    } else {
+      Opc = NVPTX::BFE_U32rii;
+    }
+  } else if (Val.getValueType() == MVT::i64) {
+    if (IsSigned) {
+      Opc = NVPTX::BFE_S64rii;
+    } else {
+      Opc = NVPTX::BFE_U64rii;
+    }
+  } else {
+    // We cannot handle this type
+    return false;
+  }
+
+  SDValue Ops[] = {
+    Val, Start, Len
+  };
+
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
+  return true;
+}
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+  // Return true if TGA or ES.
+  if (N.getOpcode() == ISD::TargetGlobalAddress ||
+      N.getOpcode() == ISD::TargetExternalSymbol) {
+    Address = N;
+    return true;
+  }
+  if (N.getOpcode() == NVPTXISD::Wrapper) {
+    Address = N.getOperand(0);
+    return true;
+  }
+  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
+  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
+    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
+        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
+        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
+      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      SDValue base = Addr.getOperand(0);
+      if (SelectDirectAddr(base, Base)) {
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
+                                           mvt);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false; // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
+      return false;
+    }
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        // Constant offset from frame ref.
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+      else
+        Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
+                                         mvt);
+      return true;
+    }
+  }
+  return false;
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+                                                 unsigned int spN) const {
+  const Value *Src = nullptr;
+  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
+  }
+  if (!Src)
+    return false;
+  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
+    return (PT->getAddressSpace() == spN);
+  return false;
+}
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintID) {
+  default:
+    return true;
+  case InlineAsm::Constraint_m: // memory
+    if (SelectDirectAddr(Op, Op0)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
+      return false;
+    }
+    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    break;
+  }
+  return true;
+}
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+                                             bool IsSigned) {
+  switch (SrcTy.SimpleTy) {
+  default:
+    llvm_unreachable("Unhandled source type");
+  case MVT::i8:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+    }
+  case MVT::i16:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+    }
+  case MVT::i32:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+    }
+  case MVT::i64:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 000000000000..0591035a6aa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,100 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+  const NVPTXTargetMachine &TM;
+
+  // If true, generate mul.wide from sext and mul
+  bool doMulWide;
+
+  int getDivF32Level() const;
+  bool usePrecSqrtF32() const;
+  bool useF32FTZ() const;
+  bool allowFMA() const;
+
+public:
+  explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                             CodeGenOpt::Level   OptLevel);
+
+  // Pass Name
+  StringRef getPassName() const override {
+    return "NVPTX DAG->DAG Pattern Instruction Selection";
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const NVPTXSubtarget *Subtarget;
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+private:
+// Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+
+  void Select(SDNode *N) override;
+  bool tryIntrinsicNoChain(SDNode *N);
+  bool tryIntrinsicChain(SDNode *N);
+  void SelectTexSurfHandle(SDNode *N);
+  bool tryLoad(SDNode *N);
+  bool tryLoadVector(SDNode *N);
+  bool tryLDGLDU(SDNode *N);
+  bool tryStore(SDNode *N);
+  bool tryStoreVector(SDNode *N);
+  bool tryLoadParam(SDNode *N);
+  bool tryStoreRetval(SDNode *N);
+  bool tryStoreParam(SDNode *N);
+  void SelectAddrSpaceCast(SDNode *N);
+  bool tryTextureIntrinsic(SDNode *N);
+  bool trySurfaceIntrinsic(SDNode *N);
+  bool tryBFE(SDNode *N);
+
+  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+  }
+
+  // Match direct address complex pattern.
+  bool SelectDirectAddr(SDValue N, SDValue &Address);
+
+  bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+
+  static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 000000000000..2e4764feff11
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,4639 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+    "nvptx-sched4reg",
+    cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                             " 1: do it  2: do it aggressively"),
+                    cl::init(2));
+
+static bool IsPTXVectorType(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::v2i1:
+  case MVT::v4i1:
+  case MVT::v2i8:
+  case MVT::v4i8:
+  case MVT::v2i16:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    return true;
+  }
+}
+
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
+                               uint64_t StartingOffset = 0) {
+  SmallVector<EVT, 16> TempVTs;
+  SmallVector<uint64_t, 16> TempOffsets;
+
+  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
+  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+    EVT VT = TempVTs[i];
+    uint64_t Off = TempOffsets[i];
+    if (VT.isVector())
+      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
+        ValueVTs.push_back(VT.getVectorElementType());
+        if (Offsets)
+          Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+      }
+    else {
+      ValueVTs.push_back(VT);
+      if (Offsets)
+        Offsets->push_back(Off);
+    }
+  }
+}
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                                         const NVPTXSubtarget &STI)
+    : TargetLowering(TM), nvTM(&TM), STI(STI) {
+
+  // always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather
+  // then generating calls to memset, mempcy or memmove.
+  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // Wide divides are _very_ slow. Try to reduce the width of the divide if
+  // possible.
+  addBypassSlowDiv(64, 32);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+  // Operations not directly supported by NVPTX.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+  // For others we will expand to a SHL/SRA pair.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
+
+  if (STI.hasROT64()) {
+    setOperationAction(ISD::ROTL, MVT::i64, Legal);
+    setOperationAction(ISD::ROTR, MVT::i64, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i64, Expand);
+    setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  }
+  if (STI.hasROT32()) {
+    setOperationAction(ISD::ROTL, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  }
+
+  setOperationAction(ISD::ROTL, MVT::i16, Expand);
+  setOperationAction(ISD::ROTR, MVT::i16, Expand);
+  setOperationAction(ISD::ROTL, MVT::i8, Expand);
+  setOperationAction(ISD::ROTR, MVT::i8, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Indirect branch is not supported.
+  // This also disables Jump Table creation.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // We want to legalize constant related memmove and memcopy
+  // intrinsics.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  // Turn FP extload into load/fpextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+  // Turn FP truncstore into trunc + store.
+  // FIXME: vector types should also be expanded
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PTX does not support load / store predicate registers
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setTruncStoreAction(VT, MVT::i1, Expand);
+  }
+
+  // This is legal in NVPTX
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+  // TRAP can be lowered to PTX trap
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+  // Register custom handling for vector loads/stores
+  for (MVT VT : MVT::vector_valuetypes()) {
+    if (IsPTXVectorType(VT)) {
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+    }
+  }
+
+  // Custom handling for i8 intrinsics
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+  // PTX does not directly support SELP of i1, so promote to i32 first
+  setOperationAction(ISD::SELECT, MVT::i1, Custom);
+
+  // PTX cannot multiply two i64s in a single instruction.
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+  // We have some custom DAG combine patterns for these nodes
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
+
+  // Library functions.  These default to Expand, but we have instructions
+  // for them.
+  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+  setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f64, Legal);
+  setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  setOperationAction(ISD::FROUND, MVT::f64, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
+  // No FPOW or FREM in PTX.
+
+  // Now deduce the information based on the above mentioned
+  // actions
+  computeRegisterProperties(STI.getRegisterInfo());
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((NVPTXISD::NodeType)Opcode) {
+  case NVPTXISD::FIRST_NUMBER:
+    break;
+  case NVPTXISD::CALL:
+    return "NVPTXISD::CALL";
+  case NVPTXISD::RET_FLAG:
+    return "NVPTXISD::RET_FLAG";
+  case NVPTXISD::LOAD_PARAM:
+    return "NVPTXISD::LOAD_PARAM";
+  case NVPTXISD::Wrapper:
+    return "NVPTXISD::Wrapper";
+  case NVPTXISD::DeclareParam:
+    return "NVPTXISD::DeclareParam";
+  case NVPTXISD::DeclareScalarParam:
+    return "NVPTXISD::DeclareScalarParam";
+  case NVPTXISD::DeclareRet:
+    return "NVPTXISD::DeclareRet";
+  case NVPTXISD::DeclareScalarRet:
+    return "NVPTXISD::DeclareScalarRet";
+  case NVPTXISD::DeclareRetParam:
+    return "NVPTXISD::DeclareRetParam";
+  case NVPTXISD::PrintCall:
+    return "NVPTXISD::PrintCall";
+  case NVPTXISD::PrintConvergentCall:
+    return "NVPTXISD::PrintConvergentCall";
+  case NVPTXISD::PrintCallUni:
+    return "NVPTXISD::PrintCallUni";
+  case NVPTXISD::PrintConvergentCallUni:
+    return "NVPTXISD::PrintConvergentCallUni";
+  case NVPTXISD::LoadParam:
+    return "NVPTXISD::LoadParam";
+  case NVPTXISD::LoadParamV2:
+    return "NVPTXISD::LoadParamV2";
+  case NVPTXISD::LoadParamV4:
+    return "NVPTXISD::LoadParamV4";
+  case NVPTXISD::StoreParam:
+    return "NVPTXISD::StoreParam";
+  case NVPTXISD::StoreParamV2:
+    return "NVPTXISD::StoreParamV2";
+  case NVPTXISD::StoreParamV4:
+    return "NVPTXISD::StoreParamV4";
+  case NVPTXISD::StoreParamS32:
+    return "NVPTXISD::StoreParamS32";
+  case NVPTXISD::StoreParamU32:
+    return "NVPTXISD::StoreParamU32";
+  case NVPTXISD::CallArgBegin:
+    return "NVPTXISD::CallArgBegin";
+  case NVPTXISD::CallArg:
+    return "NVPTXISD::CallArg";
+  case NVPTXISD::LastCallArg:
+    return "NVPTXISD::LastCallArg";
+  case NVPTXISD::CallArgEnd:
+    return "NVPTXISD::CallArgEnd";
+  case NVPTXISD::CallVoid:
+    return "NVPTXISD::CallVoid";
+  case NVPTXISD::CallVal:
+    return "NVPTXISD::CallVal";
+  case NVPTXISD::CallSymbol:
+    return "NVPTXISD::CallSymbol";
+  case NVPTXISD::Prototype:
+    return "NVPTXISD::Prototype";
+  case NVPTXISD::MoveParam:
+    return "NVPTXISD::MoveParam";
+  case NVPTXISD::StoreRetval:
+    return "NVPTXISD::StoreRetval";
+  case NVPTXISD::StoreRetvalV2:
+    return "NVPTXISD::StoreRetvalV2";
+  case NVPTXISD::StoreRetvalV4:
+    return "NVPTXISD::StoreRetvalV4";
+  case NVPTXISD::PseudoUseParam:
+    return "NVPTXISD::PseudoUseParam";
+  case NVPTXISD::RETURN:
+    return "NVPTXISD::RETURN";
+  case NVPTXISD::CallSeqBegin:
+    return "NVPTXISD::CallSeqBegin";
+  case NVPTXISD::CallSeqEnd:
+    return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::CallPrototype:
+    return "NVPTXISD::CallPrototype";
+  case NVPTXISD::LoadV2:
+    return "NVPTXISD::LoadV2";
+  case NVPTXISD::LoadV4:
+    return "NVPTXISD::LoadV4";
+  case NVPTXISD::LDGV2:
+    return "NVPTXISD::LDGV2";
+  case NVPTXISD::LDGV4:
+    return "NVPTXISD::LDGV4";
+  case NVPTXISD::LDUV2:
+    return "NVPTXISD::LDUV2";
+  case NVPTXISD::LDUV4:
+    return "NVPTXISD::LDUV4";
+  case NVPTXISD::StoreV2:
+    return "NVPTXISD::StoreV2";
+  case NVPTXISD::StoreV4:
+    return "NVPTXISD::StoreV4";
+  case NVPTXISD::FUN_SHFL_CLAMP:
+    return "NVPTXISD::FUN_SHFL_CLAMP";
+  case NVPTXISD::FUN_SHFR_CLAMP:
+    return "NVPTXISD::FUN_SHFR_CLAMP";
+  case NVPTXISD::IMAD:
+    return "NVPTXISD::IMAD";
+  case NVPTXISD::Dummy:
+    return "NVPTXISD::Dummy";
+  case NVPTXISD::MUL_WIDE_SIGNED:
+    return "NVPTXISD::MUL_WIDE_SIGNED";
+  case NVPTXISD::MUL_WIDE_UNSIGNED:
+    return "NVPTXISD::MUL_WIDE_UNSIGNED";
+  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
+  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return "NVPTXISD::Tex1DFloatFloatLevel";
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return "NVPTXISD::Tex1DFloatFloatGrad";
+  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
+  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
+  case NVPTXISD::Tex1DS32FloatLevel:
+    return "NVPTXISD::Tex1DS32FloatLevel";
+  case NVPTXISD::Tex1DS32FloatGrad:
+    return "NVPTXISD::Tex1DS32FloatGrad";
+  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
+  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
+  case NVPTXISD::Tex1DU32FloatLevel:
+    return "NVPTXISD::Tex1DU32FloatLevel";
+  case NVPTXISD::Tex1DU32FloatGrad:
+    return "NVPTXISD::Tex1DU32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
+  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    return "NVPTXISD::Tex1DArrayS32FloatLevel";
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    return "NVPTXISD::Tex1DArrayS32FloatGrad";
+  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
+  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    return "NVPTXISD::Tex1DArrayU32FloatLevel";
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    return "NVPTXISD::Tex1DArrayU32FloatGrad";
+  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
+  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return "NVPTXISD::Tex2DFloatFloatLevel";
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return "NVPTXISD::Tex2DFloatFloatGrad";
+  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
+  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
+  case NVPTXISD::Tex2DS32FloatLevel:
+    return "NVPTXISD::Tex2DS32FloatLevel";
+  case NVPTXISD::Tex2DS32FloatGrad:
+    return "NVPTXISD::Tex2DS32FloatGrad";
+  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
+  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
+  case NVPTXISD::Tex2DU32FloatLevel:
+    return "NVPTXISD::Tex2DU32FloatLevel";
+  case NVPTXISD::Tex2DU32FloatGrad:
+    return "NVPTXISD::Tex2DU32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
+  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
+  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    return "NVPTXISD::Tex2DArrayS32FloatLevel";
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    return "NVPTXISD::Tex2DArrayS32FloatGrad";
+  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
+  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    return "NVPTXISD::Tex2DArrayU32FloatLevel";
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    return "NVPTXISD::Tex2DArrayU32FloatGrad";
+  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
+  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return "NVPTXISD::Tex3DFloatFloatLevel";
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return "NVPTXISD::Tex3DFloatFloatGrad";
+  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
+  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
+  case NVPTXISD::Tex3DS32FloatLevel:
+    return "NVPTXISD::Tex3DS32FloatLevel";
+  case NVPTXISD::Tex3DS32FloatGrad:
+    return "NVPTXISD::Tex3DS32FloatGrad";
+  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
+  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
+  case NVPTXISD::Tex3DU32FloatLevel:
+    return "NVPTXISD::Tex3DU32FloatLevel";
+  case NVPTXISD::Tex3DU32FloatGrad:
+    return "NVPTXISD::Tex3DU32FloatGrad";
+  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    return "NVPTXISD::TexCubeFloatFloatLevel";
+  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
+  case NVPTXISD::TexCubeS32FloatLevel:
+    return "NVPTXISD::TexCubeS32FloatLevel";
+  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
+  case NVPTXISD::TexCubeU32FloatLevel:
+    return "NVPTXISD::TexCubeU32FloatLevel";
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    return "NVPTXISD::TexCubeArrayFloatFloat";
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexCubeArrayS32Float:
+    return "NVPTXISD::TexCubeArrayS32Float";
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexCubeArrayS32FloatLevel";
+  case NVPTXISD::TexCubeArrayU32Float:
+    return "NVPTXISD::TexCubeArrayU32Float";
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4R2DFloatFloat:
+    return "NVPTXISD::Tld4R2DFloatFloat";
+  case NVPTXISD::Tld4G2DFloatFloat:
+    return "NVPTXISD::Tld4G2DFloatFloat";
+  case NVPTXISD::Tld4B2DFloatFloat:
+    return "NVPTXISD::Tld4B2DFloatFloat";
+  case NVPTXISD::Tld4A2DFloatFloat:
+    return "NVPTXISD::Tld4A2DFloatFloat";
+  case NVPTXISD::Tld4R2DS64Float:
+    return "NVPTXISD::Tld4R2DS64Float";
+  case NVPTXISD::Tld4G2DS64Float:
+    return "NVPTXISD::Tld4G2DS64Float";
+  case NVPTXISD::Tld4B2DS64Float:
+    return "NVPTXISD::Tld4B2DS64Float";
+  case NVPTXISD::Tld4A2DS64Float:
+    return "NVPTXISD::Tld4A2DS64Float";
+  case NVPTXISD::Tld4R2DU64Float:
+    return "NVPTXISD::Tld4R2DU64Float";
+  case NVPTXISD::Tld4G2DU64Float:
+    return "NVPTXISD::Tld4G2DU64Float";
+  case NVPTXISD::Tld4B2DU64Float:
+    return "NVPTXISD::Tld4B2DU64Float";
+  case NVPTXISD::Tld4A2DU64Float:
+    return "NVPTXISD::Tld4A2DU64Float";
+
+  case NVPTXISD::TexUnified1DFloatS32:
+    return "NVPTXISD::TexUnified1DFloatS32";
+  case NVPTXISD::TexUnified1DFloatFloat:
+    return "NVPTXISD::TexUnified1DFloatFloat";
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DFloatFloatLevel";
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DFloatFloatGrad";
+  case NVPTXISD::TexUnified1DS32S32:
+    return "NVPTXISD::TexUnified1DS32S32";
+  case NVPTXISD::TexUnified1DS32Float:
+    return "NVPTXISD::TexUnified1DS32Float";
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    return "NVPTXISD::TexUnified1DS32FloatLevel";
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    return "NVPTXISD::TexUnified1DS32FloatGrad";
+  case NVPTXISD::TexUnified1DU32S32:
+    return "NVPTXISD::TexUnified1DU32S32";
+  case NVPTXISD::TexUnified1DU32Float:
+    return "NVPTXISD::TexUnified1DU32Float";
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    return "NVPTXISD::TexUnified1DU32FloatLevel";
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    return "NVPTXISD::TexUnified1DU32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    return "NVPTXISD::TexUnified1DArrayFloatS32";
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    return "NVPTXISD::TexUnified1DArrayFloatFloat";
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    return "NVPTXISD::TexUnified1DArrayS32S32";
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    return "NVPTXISD::TexUnified1DArrayS32Float";
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    return "NVPTXISD::TexUnified1DArrayU32S32";
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    return "NVPTXISD::TexUnified1DArrayU32Float";
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified2DFloatS32:
+    return "NVPTXISD::TexUnified2DFloatS32";
+  case NVPTXISD::TexUnified2DFloatFloat:
+    return "NVPTXISD::TexUnified2DFloatFloat";
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DFloatFloatLevel";
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DFloatFloatGrad";
+  case NVPTXISD::TexUnified2DS32S32:
+    return "NVPTXISD::TexUnified2DS32S32";
+  case NVPTXISD::TexUnified2DS32Float:
+    return "NVPTXISD::TexUnified2DS32Float";
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    return "NVPTXISD::TexUnified2DS32FloatLevel";
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    return "NVPTXISD::TexUnified2DS32FloatGrad";
+  case NVPTXISD::TexUnified2DU32S32:
+    return "NVPTXISD::TexUnified2DU32S32";
+  case NVPTXISD::TexUnified2DU32Float:
+    return "NVPTXISD::TexUnified2DU32Float";
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    return "NVPTXISD::TexUnified2DU32FloatLevel";
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    return "NVPTXISD::TexUnified2DU32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    return "NVPTXISD::TexUnified2DArrayFloatS32";
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    return "NVPTXISD::TexUnified2DArrayFloatFloat";
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    return "NVPTXISD::TexUnified2DArrayS32S32";
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    return "NVPTXISD::TexUnified2DArrayS32Float";
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    return "NVPTXISD::TexUnified2DArrayU32S32";
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    return "NVPTXISD::TexUnified2DArrayU32Float";
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified3DFloatS32:
+    return "NVPTXISD::TexUnified3DFloatS32";
+  case NVPTXISD::TexUnified3DFloatFloat:
+    return "NVPTXISD::TexUnified3DFloatFloat";
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    return "NVPTXISD::TexUnified3DFloatFloatLevel";
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    return "NVPTXISD::TexUnified3DFloatFloatGrad";
+  case NVPTXISD::TexUnified3DS32S32:
+    return "NVPTXISD::TexUnified3DS32S32";
+  case NVPTXISD::TexUnified3DS32Float:
+    return "NVPTXISD::TexUnified3DS32Float";
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    return "NVPTXISD::TexUnified3DS32FloatLevel";
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    return "NVPTXISD::TexUnified3DS32FloatGrad";
+  case NVPTXISD::TexUnified3DU32S32:
+    return "NVPTXISD::TexUnified3DU32S32";
+  case NVPTXISD::TexUnified3DU32Float:
+    return "NVPTXISD::TexUnified3DU32Float";
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    return "NVPTXISD::TexUnified3DU32FloatLevel";
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    return "NVPTXISD::TexUnified3DU32FloatGrad";
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeFloatFloat";
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    return "NVPTXISD::TexUnifiedCubeS32Float";
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    return "NVPTXISD::TexUnifiedCubeU32Float";
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    return "NVPTXISD::Tld4UnifiedR2DS64Float";
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    return "NVPTXISD::Tld4UnifiedG2DS64Float";
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    return "NVPTXISD::Tld4UnifiedB2DS64Float";
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    return "NVPTXISD::Tld4UnifiedA2DS64Float";
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    return "NVPTXISD::Tld4UnifiedR2DU64Float";
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    return "NVPTXISD::Tld4UnifiedG2DU64Float";
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    return "NVPTXISD::Tld4UnifiedB2DU64Float";
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
+  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
+  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
+  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
+  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
+  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
+  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
+  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
+  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
+  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
+  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
+
+  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
+  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
+  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
+  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
+  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
+  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
+  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
+  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
+  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
+  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
+  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
+  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
+  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
+  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
+  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
+
+  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
+  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
+  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
+  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
+  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
+  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
+  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
+  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
+  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
+  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
+  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
+  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
+  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
+  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
+  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
+
+  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
+  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
+  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
+  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
+  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
+  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
+  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
+  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
+  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
+  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
+  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
+
+  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
+  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
+  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
+  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
+  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
+  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
+  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
+  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
+  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
+  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
+  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
+  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
+  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
+  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
+  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
+  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
+  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
+  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
+  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
+  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
+  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
+
+  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
+  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
+  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
+  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
+  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
+  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
+  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
+  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
+  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
+  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
+  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
+  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
+  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
+  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
+  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
+  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
+  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
+  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
+  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
+  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
+  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
+
+  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
+  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
+  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
+  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
+  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
+  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
+  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
+  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
+  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
+  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
+  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
+
+  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
+  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
+  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
+  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
+  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
+  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
+  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
+  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
+  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
+  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
+  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
+  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
+  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
+  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
+  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
+  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
+  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
+  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
+  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
+  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
+  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
+
+  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
+  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
+  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
+  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
+  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
+  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
+  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
+  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
+  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
+  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
+  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
+  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
+  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
+  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
+  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
+  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
+  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
+  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
+  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
+  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
+  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
+  }
+  return nullptr;
+}
+
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+    return TypeSplitVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
+}
+
+std::string NVPTXTargetLowering::getPrototype(
+    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
+    const ImmutableCallSite *CS) const {
+  auto PtrVT = getPointerTy(DL);
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return "";
+
+  std::stringstream O;
+  O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+  if (retTy->getTypeID() == Type::VoidTyID) {
+    O << "()";
+  } else {
+    O << "(";
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+      unsigned size = 0;
+      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(retTy->isFloatingPointTy() &&
+               "Floating point type expected here");
+        size = retTy->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " _";
+    } else if (isa<PointerType>(retTy)) {
+      O << ".param .b" << PtrVT.getSizeInBits() << " _";
+    } else if ((retTy->getTypeID() == Type::StructTyID) ||
+               isa<VectorType>(retTy)) {
+      auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
+      O << ".param .align " << retAlignment << " .b8 _["
+        << DL.getTypeAllocSize(retTy) << "]";
+    } else {
+      llvm_unreachable("Unknown return type");
+    }
+    O << ") ";
+  }
+  O << "_ (";
+
+  bool first = true;
+
+  unsigned OIdx = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    Type *Ty = Args[i].Ty;
+    if (!first) {
+      O << ", ";
+    }
+    first = false;
+
+    if (!Outs[OIdx].Flags.isByVal()) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        unsigned align = 0;
+        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+        // +1 because index 0 is reserved for return type alignment
+        if (!llvm::getAlign(*CallI, i + 1, align))
+          align = DL.getABITypeAlignment(Ty);
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        O << ".param .align " << align << " .b8 ";
+        O << "_";
+        O << "[" << sz << "]";
+        // update the index for Outs
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, DL, Ty, vtparts);
+        if (unsigned len = vtparts.size())
+          OIdx += len - 1;
+        continue;
+      }
+       // i8 types in IR will be i16 types in SDAG
+      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
+              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+             "type mismatch between callee prototype and arguments");
+      // scalar type
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = PtrVT.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      O << ".param .b" << sz << " ";
+      O << "_";
+      continue;
+    }
+    auto *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    unsigned align = Outs[OIdx].Flags.getByValAlign();
+    unsigned sz = DL.getTypeAllocSize(ETy);
+    O << ".param .align " << align << " .b8 ";
+    O << "_";
+    O << "[" << sz << "]";
+  }
+  O << ");";
+  return O.str();
+}
+
+unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+                                                   const ImmutableCallSite *CS,
+                                                   Type *Ty, unsigned Idx,
+                                                   const DataLayout &DL) const {
+  if (!CS) {
+    // CallSite is zero, fallback to ABI type alignment
+    return DL.getABITypeAlignment(Ty);
+  }
+
+  unsigned Align = 0;
+  const Value *DirectCallee = CS->getCalledFunction();
+
+  if (!DirectCallee) {
+    // We don't have a direct function symbol, but that may be because of
+    // constant cast instructions in the call.
+    const Instruction *CalleeI = CS->getInstruction();
+    assert(CalleeI && "Call target is not a function or derived value?");
+
+    // With bitcast'd call targets, the instruction will be the call
+    if (isa<CallInst>(CalleeI)) {
+      // Check if we have call alignment metadata
+      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+        return Align;
+
+      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+      // Ignore any bitcast instructions
+      while (isa<ConstantExpr>(CalleeV)) {
+        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+        if (!CE->isCast())
+          break;
+        // Look through the bitcast
+        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+      }
+
+      // We have now looked past all of the bitcasts.  Do we finally have a
+      // Function?
+      if (isa<Function>(CalleeV))
+        DirectCallee = CalleeV;
+    }
+  }
+
+  // Check for function alignment information if we found that the
+  // ultimate target is a Function
+  if (DirectCallee)
+    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+      return Align;
+
+  // Call is indirect or alignment information is not available, fall back to
+  // the ABI type alignment
+  return DL.getABITypeAlignment(Ty);
+}
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc dl = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  ArgListTy &Args = CLI.getArgs();
+  Type *retTy = CLI.RetTy;
+  ImmutableCallSite *CS = CLI.CS;
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+  auto &DL = MF.getDataLayout();
+
+  SDValue tempChain = Chain;
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+                               dl);
+  SDValue InFlag = Chain.getValue(1);
+
+  unsigned paramCount = 0;
+  // Args.size() and Outs.size() need not match.
+  // Outs.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Outs)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Outs.
+  // So a different index should be used for indexing into Outs/OutVals.
+  // See similar issue in LowerFormalArguments.
+  unsigned OIdx = 0;
+  // Declare the .params or .reg need to pass values
+  // to the function
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    EVT VT = Outs[OIdx].VT;
+    Type *Ty = Args[i].Ty;
+
+    if (!Outs[OIdx].Flags.isByVal()) {
+      if (Ty->isAggregateType()) {
+        // aggregate
+        SmallVector<EVT, 16> vtparts;
+        SmallVector<uint64_t, 16> Offsets;
+        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
+                           0);
+
+        unsigned align =
+            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
+                                                             MVT::i32),
+                                      DAG.getConstant(paramCount, dl, MVT::i32),
+                                      DAG.getConstant(sz, dl, MVT::i32),
+                                      InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps);
+        InFlag = Chain.getValue(1);
+        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+          EVT elemtype = vtparts[j];
+          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
+          if (elemtype.isInteger() && (sz < 8))
+            sz = 8;
+          SDValue StVal = OutVals[OIdx];
+          if (elemtype.getSizeInBits() < 16) {
+            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+          }
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, dl, MVT::i32),
+                                     DAG.getConstant(Offsets[j], dl, MVT::i32),
+                                     StVal, InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          elemtype, MachinePointerInfo(),
+                                          ArgAlign);
+          InFlag = Chain.getValue(1);
+          ++OIdx;
+        }
+        if (vtparts.size() > 0)
+          --OIdx;
+        ++paramCount;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(DL, Ty);
+        unsigned align =
+            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = DL.getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain,
+                                      DAG.getConstant(align, dl, MVT::i32),
+                                      DAG.getConstant(paramCount, dl, MVT::i32),
+                                      DAG.getConstant(sz, dl, MVT::i32),
+                                      InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps);
+        InFlag = Chain.getValue(1);
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        EVT EltVT = ObjectVT.getVectorElementType();
+        EVT MemVT = EltVT;
+        bool NeedExtend = false;
+        if (EltVT.getSizeInBits() < 16) {
+          NeedExtend = true;
+          EltVT = MVT::i16;
+        }
+
+        // V1 store
+        if (NumElts == 1) {
+          SDValue Elt = OutVals[OIdx++];
+          if (NeedExtend)
+            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, dl, MVT::i32),
+                                     DAG.getConstant(0, dl, MVT::i32), Elt,
+                                     InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else if (NumElts == 2) {
+          SDValue Elt0 = OutVals[OIdx++];
+          SDValue Elt1 = OutVals[OIdx++];
+          if (NeedExtend) {
+            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
+            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+          }
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, dl, MVT::i32),
+                                     DAG.getConstant(0, dl, MVT::i32), Elt0,
+                                     Elt1, InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
+                                          CopyParamVTs, CopyParamOps,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else {
+          unsigned curOffset = 0;
+          // V4 stores
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the
+          // vector will be expanded to a power of 2 elements, so we know we can
+          // always round up to the next multiple of 4 when creating the vector
+          // stores.
+          // e.g.  4 elem => 1 st.v4
+          //       6 elem => 2 st.v4
+          //       8 elem => 2 st.v4
+          //      11 elem => 3 st.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64)
+            VecSize = 2;
+
+          // This is potentially only part of a vector, so assume all elements
+          // are packed together.
+          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
+
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            // Get values
+            SDValue StoreVal;
+            SmallVector<SDValue, 8> Ops;
+            Ops.push_back(Chain);
+            Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+            Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
+
+            unsigned Opc = NVPTXISD::StoreParamV2;
+
+            StoreVal = OutVals[OIdx++];
+            if (NeedExtend)
+              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            Ops.push_back(StoreVal);
+
+            if (i + 1 < NumElts) {
+              StoreVal = OutVals[OIdx++];
+              if (NeedExtend)
+                StoreVal =
+                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            } else {
+              StoreVal = DAG.getUNDEF(EltVT);
+            }
+            Ops.push_back(StoreVal);
+
+            if (VecSize == 4) {
+              Opc = NVPTXISD::StoreParamV4;
+              if (i + 2 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+
+              if (i + 3 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+            }
+
+            Ops.push_back(InFlag);
+
+            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
+                                            MemVT, MachinePointerInfo());
+            InFlag = Chain.getValue(1);
+            curOffset += PerStoreOffset;
+          }
+        }
+        ++paramCount;
+        --OIdx;
+        continue;
+      }
+      // Plain scalar
+      // for ABI,    declare .param .b<size> .param<n>;
+      unsigned sz = VT.getSizeInBits();
+      bool needExtend = false;
+      if (VT.isInteger()) {
+        if (sz < 16)
+          needExtend = true;
+        if (sz < 32)
+          sz = 32;
+      }
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareParamOps[] = { Chain,
+                                    DAG.getConstant(paramCount, dl, MVT::i32),
+                                    DAG.getConstant(sz, dl, MVT::i32),
+                                    DAG.getConstant(0, dl, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                          DeclareParamOps);
+      InFlag = Chain.getValue(1);
+      SDValue OutV = OutVals[OIdx];
+      if (needExtend) {
+        // zext/sext i1 to i16
+        unsigned opc = ISD::ZERO_EXTEND;
+        if (Outs[OIdx].Flags.isSExt())
+          opc = ISD::SIGN_EXTEND;
+        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain,
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(0, dl, MVT::i32), OutV,
+                                 InFlag };
+
+      unsigned opcode = NVPTXISD::StoreParam;
+      if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
+        opcode = NVPTXISD::StoreParamU32;
+      else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
+        opcode = NVPTXISD::StoreParamS32;
+      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
+                                      VT, MachinePointerInfo());
+
+      InFlag = Chain.getValue(1);
+      ++paramCount;
+      continue;
+    }
+    // struct or vector
+    SmallVector<EVT, 16> vtparts;
+    SmallVector<uint64_t, 16> Offsets;
+    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+    assert(PTy && "Type of a byval parameter should be pointer");
+    ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
+                       vtparts, &Offsets, 0);
+
+    // declare .param .align <align> .b8 .param<n>[<size>];
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+    // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
+    // so we don't need to worry about natural alignment or not.
+    // See TargetLowering::LowerCallTo().
+
+    // Enforce minumum alignment of 4 to work around ptxas miscompile
+    // for sm_50+. See corresponding alignment adjustment in
+    // emitFunctionParamList() for details.
+    if (ArgAlign < 4)
+      ArgAlign = 4;
+    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
+    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                        DeclareParamOps);
+    InFlag = Chain.getValue(1);
+    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+      EVT elemtype = vtparts[j];
+      int curOffset = Offsets[j];
+      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
+      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+                                    DAG.getConstant(curOffset, dl, PtrVT));
+      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                   MachinePointerInfo(), PartAlign);
+      if (elemtype.getSizeInBits() < 16) {
+        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+      }
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain,
+                                 DAG.getConstant(paramCount, dl, MVT::i32),
+                                 DAG.getConstant(curOffset, dl, MVT::i32),
+                                 theVal, InFlag };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                      CopyParamOps, elemtype,
+                                      MachinePointerInfo());
+
+      InFlag = Chain.getValue(1);
+    }
+    ++paramCount;
+  }
+
+  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned retAlignment = 0;
+
+  // Handle Result
+  if (Ins.size() > 0) {
+    SmallVector<EVT, 16> resvtparts;
+    ComputeValueVTs(*this, DL, retTy, resvtparts);
+
+    // Declare
+    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+    //  .param .b<size-in-bits> retval0
+    unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
+    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+    // these three types to match the logic in
+    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+    // Plus, this behavior is consistent with nvcc's.
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
+        retTy->isPointerTy()) {
+      // Scalar needs to be at least 32bit wide
+      if (resultsz < 32)
+        resultsz = 32;
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                                  DAG.getConstant(resultsz, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    } else {
+      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain,
+                                  DAG.getConstant(retAlignment, dl, MVT::i32),
+                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
+                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                          DeclareRetOps);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  if (!Func) {
+    // This is indirect function call case : PTX requires a prototype of the
+    // form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to used as the last arg of call
+    // instruction.
+    // The prototype is embedded in a string and put as the operand for a
+    // CallPrototype SDNode which will print out to the value of the string.
+    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string Proto =
+        getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
+    const char *ProtoStr =
+      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+    SDValue ProtoOps[] = {
+      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+    };
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+    InFlag = Chain.getValue(1);
+  }
+  // Op to just print "call"
+  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue PrintCallOps[] = {
+    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
+  };
+  // We model convergent calls as separate opcodes.
+  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+  if (CLI.IsConvergent)
+    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+                                              : NVPTXISD::PrintConvergentCall;
+  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the function name
+  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the param list
+  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgBeginOps[] = { Chain, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+                      CallArgBeginOps);
+  InFlag = Chain.getValue(1);
+
+  for (unsigned i = 0, e = paramCount; i != e; ++i) {
+    unsigned opcode;
+    if (i == (e - 1))
+      opcode = NVPTXISD::LastCallArg;
+    else
+      opcode = NVPTXISD::CallArg;
+    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+                             DAG.getConstant(i, dl, MVT::i32), InFlag };
+    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+    InFlag = Chain.getValue(1);
+  }
+  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgEndOps[] = { Chain,
+                              DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
+                              InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+  InFlag = Chain.getValue(1);
+
+  if (!Func) {
+    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue PrototypeOps[] = { Chain,
+                               DAG.getConstant(uniqueCallSite, dl, MVT::i32),
+                               InFlag };
+    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Generate loads from param memory/moves from registers for result
+  if (Ins.size() > 0) {
+    if (retTy && retTy->isVectorTy()) {
+      EVT ObjectVT = getValueType(DL, retTy);
+      unsigned NumElts = ObjectVT.getVectorNumElements();
+      EVT EltVT = ObjectVT.getVectorElementType();
+      assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
+                                                      ObjectVT) == NumElts &&
+             "Vector was not scalarized");
+      unsigned sz = EltVT.getSizeInBits();
+      bool needTruncate = sz < 8;
+
+      if (NumElts == 1) {
+        // Just a simple load
+        SmallVector<EVT, 4> LoadRetVTs;
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(EltVT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                                DAG.getConstant(0, dl, MVT::i32), InFlag};
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        SDValue Ret0 = retval;
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
+        InVals.push_back(Ret0);
+      } else if (NumElts == 2) {
+        // LoadV2
+        SmallVector<EVT, 4> LoadRetVTs;
+        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+          // If loading i1/i8 result, generate
+          //   load.b8 i16
+          //   if i1
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+          LoadRetVTs.push_back(MVT::i16);
+        } else {
+          LoadRetVTs.push_back(EltVT);
+          LoadRetVTs.push_back(EltVT);
+        }
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                                DAG.getConstant(0, dl, MVT::i32), InFlag};
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParamV2, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
+        Chain = retval.getValue(2);
+        InFlag = retval.getValue(3);
+        SDValue Ret0 = retval.getValue(0);
+        SDValue Ret1 = retval.getValue(1);
+        if (needTruncate) {
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
+          InVals.push_back(Ret0);
+          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
+          InVals.push_back(Ret1);
+        } else {
+          InVals.push_back(Ret0);
+          InVals.push_back(Ret1);
+        }
+      } else {
+        // Split into N LoadV4
+        unsigned Ofst = 0;
+        unsigned VecSize = 4;
+        unsigned Opc = NVPTXISD::LoadParamV4;
+        if (EltVT.getSizeInBits() == 64) {
+          VecSize = 2;
+          Opc = NVPTXISD::LoadParamV2;
+        }
+        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+        for (unsigned i = 0; i < NumElts; i += VecSize) {
+          SmallVector<EVT, 8> LoadRetVTs;
+          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
+            // If loading i1/i8 result, generate
+            //   load.b8 i16
+            //   if i1
+            //   trunc i16 to i1
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(MVT::i16);
+          } else {
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(EltVT);
+          }
+          LoadRetVTs.push_back(MVT::Other);
+          LoadRetVTs.push_back(MVT::Glue);
+          SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                                  DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
+          SDValue retval = DAG.getMemIntrinsicNode(
+              Opc, dl, DAG.getVTList(LoadRetVTs),
+              LoadRetOps, EltVT, MachinePointerInfo());
+          if (VecSize == 2) {
+            Chain = retval.getValue(2);
+            InFlag = retval.getValue(3);
+          } else {
+            Chain = retval.getValue(4);
+            InFlag = retval.getValue(5);
+          }
+
+          for (unsigned j = 0; j < VecSize; ++j) {
+            if (i + j >= NumElts)
+              break;
+            SDValue Elt = retval.getValue(j);
+            if (needTruncate)
+              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+            InVals.push_back(Elt);
+          }
+          Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+        }
+      }
+    } else {
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      auto &DL = DAG.getDataLayout();
+      ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
+      assert(VTs.size() == Ins.size() && "Bad value decomposition");
+      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        unsigned sz = VTs[i].getSizeInBits();
+        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+        bool needTruncate = false;
+        if (VTs[i].isInteger() && sz < 8) {
+          sz = 8;
+          needTruncate = true;
+        }
+
+        SmallVector<EVT, 4> LoadRetVTs;
+        EVT TheLoadType = VTs[i];
+        if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
+          // This is for integer types only, and specifically not for
+          // aggregates.
+          LoadRetVTs.push_back(MVT::i32);
+          TheLoadType = MVT::i32;
+          needTruncate = true;
+        } else if (sz < 16) {
+          // If loading i1/i8 result, generate
+          //   load i8 (-> i16)
+          //   trunc i16 to i1/i8
+
+          // FIXME: Do we need to set needTruncate to true here, too?  We could
+          // not figure out what this branch is for in D17872, so we left it
+          // alone.  The comment above about loading i1/i8 may be wrong, as the
+          // branch above seems to cover integers of size < 32.
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(Ins[i].VT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+
+        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
+                                DAG.getConstant(Offsets[i], dl, MVT::i32),
+                                InFlag};
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(LoadRetVTs), LoadRetOps,
+            TheLoadType, MachinePointerInfo(), AlignI);
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        SDValue Ret0 = retval.getValue(0);
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
+        InVals.push_back(Ret0);
+      }
+    }
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+                             DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
+                                                   true),
+                             InFlag, dl);
+  uniqueCallSite++;
+
+  // set isTailCall to false for now, until we figure out how to express
+  // tail call optimization in PTX
+  isTailCall = false;
+  return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory.
+// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    SDValue SubOp = Node->getOperand(i);
+    EVT VVT = SubOp.getNode()->getValueType(0);
+    EVT EltVT = VVT.getVectorElementType();
+    unsigned NumSubElem = VVT.getVectorNumElements();
+    for (unsigned j = 0; j < NumSubElem; ++j) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+                                DAG.getIntPtrConstant(j, dl)));
+    }
+  }
+  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
+}
+
+/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+///    amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+///    amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  if (VTBits == 32 && STI.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    //   dHi = aHi >> Amt
+    //   dLo = shf.r.clamp aLo, aHi, Amt
+
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} >> Amt
+    // - if (Amt>=size) then
+    //      dLo = aHi >> (Amt-size)
+    //      dHi = aHi >> Amt (this is either all 0 or all 1)
+    //   else
+    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+    //      dHi = aHi >> Amt
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, dl, MVT::i32),
+                                   ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, dl, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, dl, MVT::i32),
+                               ISD::SETGE);
+    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which
+/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
+///    amount, or
+/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
+///    amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+
+  if (VTBits == 32 && STI.getSmVersion() >= 35) {
+
+    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    //   dHi = shf.l.clamp aLo, aHi, Amt
+    //   dLo = aLo << Amt
+
+    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+                             ShAmt);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+  else {
+
+    // {dHi, dLo} = {aHi, aLo} << Amt
+    // - if (Amt>=size) then
+    //      dLo = aLo << Amt (all 0)
+    //      dLo = aLo << (Amt-size)
+    //   else
+    //      dLo = aLo << Amt
+    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant(VTBits, dl, MVT::i32),
+                                   ShAmt);
+    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                     DAG.getConstant(VTBits, dl, MVT::i32));
+    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+                               DAG.getConstant(VTBits, dl, MVT::i32),
+                               ISD::SETGE);
+    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, dl);
+  }
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::RETURNADDR:
+    return SDValue();
+  case ISD::FRAMEADDR:
+    return SDValue();
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return Op;
+  case ISD::BUILD_VECTOR:
+  case ISD::EXTRACT_SUBVECTOR:
+    return Op;
+  case ISD::CONCAT_VECTORS:
+    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::SELECT:
+    return LowerSelect(Op, DAG);
+  default:
+    llvm_unreachable("Custom lowering not defined for operation");
+  }
+}
+
+SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Op0 = Op->getOperand(0);
+  SDValue Op1 = Op->getOperand(1);
+  SDValue Op2 = Op->getOperand(2);
+  SDLoc DL(Op.getNode());
+
+  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
+  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+
+  return Trunc;
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1)
+    return LowerLOADi1(Op, DAG);
+  else
+    return SDValue();
+}
+
+// v = ld i1* addr
+//   =>
+// v1 = ld i8* addr (-> i16)
+// v = trunc i16 to i1
+SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(Node);
+  SDLoc dl(Node);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+  assert(Node->getValueType(0) == MVT::i1 &&
+         "Custom lowering for i1 load only");
+  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+                              LD->getPointerInfo(), LD->getAlignment(),
+                              LD->getMemOperand()->getFlags());
+  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+  // The legalizer (the caller) is expecting two values from the legalized
+  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+  // in LegalizeDAG.cpp which also uses MergeValues.
+  SDValue Ops[] = { result, LD->getChain() };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValVT = Op.getOperand(1).getValueType();
+  if (ValVT == MVT::i1)
+    return LowerSTOREi1(Op, DAG);
+  else if (ValVT.isVector())
+    return LowerSTOREVector(Op, DAG);
+  else
+    return SDValue();
+}
+
+SDValue
+NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *N = Op.getNode();
+  SDValue Val = N->getOperand(1);
+  SDLoc DL(N);
+  EVT ValVT = Val.getValueType();
+
+  if (ValVT.isVector()) {
+    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+    // legal.  We can (and should) split that into 2 stores of <2 x double> here
+    // but I'm leaving that as a TODO for now.
+    if (!ValVT.isSimple())
+      return SDValue();
+    switch (ValVT.getSimpleVT().SimpleTy) {
+    default:
+      return SDValue();
+    case MVT::v2i8:
+    case MVT::v2i16:
+    case MVT::v2i32:
+    case MVT::v2i64:
+    case MVT::v2f32:
+    case MVT::v2f64:
+    case MVT::v4i8:
+    case MVT::v4i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      // This is a "native" vector type
+      break;
+    }
+
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+    const DataLayout &TD = DAG.getDataLayout();
+
+    unsigned Align = MemSD->getAlignment();
+    unsigned PrefAlign =
+        TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+    if (Align < PrefAlign) {
+      // This store is not sufficiently aligned, so bail out and let this vector
+      // store be scalarized.  Note that we may still be able to emit smaller
+      // vector stores.  For example, if we are storing a <4 x float> with an
+      // alignment of 8, this check will fail but the legalizer will try again
+      // with 2 x <2 x float>, which will succeed with an alignment of 8.
+      return SDValue();
+    }
+
+    unsigned Opcode = 0;
+    EVT EltVT = ValVT.getVectorElementType();
+    unsigned NumElts = ValVT.getVectorNumElements();
+
+    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+    // stored type to i16 and propagate the "real" type as the memory type.
+    bool NeedExt = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExt = true;
+
+    switch (NumElts) {
+    default:
+      return SDValue();
+    case 2:
+      Opcode = NVPTXISD::StoreV2;
+      break;
+    case 4: {
+      Opcode = NVPTXISD::StoreV4;
+      break;
+    }
+    }
+
+    SmallVector<SDValue, 8> Ops;
+
+    // First is the chain
+    Ops.push_back(N->getOperand(0));
+
+    // Then the split values
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                   DAG.getIntPtrConstant(i, DL));
+      if (NeedExt)
+        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+      Ops.push_back(ExtVal);
+    }
+
+    // Then any remaining arguments
+    Ops.append(N->op_begin() + 2, N->op_end());
+
+    SDValue NewSt = DAG.getMemIntrinsicNode(
+        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+        MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+    //return DCI.CombineTo(N, NewSt, true);
+    return NewSt;
+  }
+
+  return SDValue();
+}
+
+// st i1 v, addr
+//    =>
+// v1 = zxt v to i16
+// st.u8 i16, addr
+SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  StoreSDNode *ST = cast<StoreSDNode>(Node);
+  SDValue Tmp1 = ST->getChain();
+  SDValue Tmp2 = ST->getBasePtr();
+  SDValue Tmp3 = ST->getValue();
+  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+  SDValue Result =
+      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
+                        ST->getAlignment(), ST->getMemOperand()->getFlags());
+  return Result;
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+  std::string ParamSym;
+  raw_string_ostream ParamStr(ParamSym);
+
+  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+  ParamStr.flush();
+
+  std::string *SavedStr =
+    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
+}
+
+// Check to see if the kernel argument is image*_t or sampler_t
+
+static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
+  static const char *const specialTypes[] = { "struct._image2d_t",
+                                              "struct._image3d_t",
+                                              "struct._sampler_t" };
+
+  Type *Ty = arg->getType();
+  auto *PTy = dyn_cast<PointerType>(Ty);
+
+  if (!PTy)
+    return false;
+
+  if (!context)
+    return false;
+
+  auto *STy = dyn_cast<StructType>(PTy->getElementType());
+  if (!STy || STy->isLiteral())
+    return false;
+
+  return std::find(std::begin(specialTypes), std::end(specialTypes),
+                   STy->getName()) != std::end(specialTypes);
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const DataLayout &DL = DAG.getDataLayout();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  const Function *F = MF.getFunction();
+  const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = STI.getTargetLowering();
+
+  SDValue Root = DAG.getRoot();
+  std::vector<SDValue> OutChains;
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  std::vector<Type *> argTypes;
+  std::vector<const Argument *> theArgs;
+  for (const Argument &I : F->args()) {
+    theArgs.push_back(&I);
+    argTypes.push_back(I.getType());
+  }
+  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+  // Ins.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Ins)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Ins.
+  // So a different index should be used for indexing into Ins.
+  // See similar issue in LowerCall.
+  unsigned InsIdx = 0;
+
+  int idx = 0;
+  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+    Type *Ty = argTypes[i];
+
+    // If the kernel argument is image*_t or sampler_t, convert it to
+    // a i32 constant holding the parameter position. This can later
+    // matched in the AsmPrinter to output the correct mangled name.
+    if (isImageOrSamplerVal(
+            theArgs[i],
+            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+                                     : nullptr))) {
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
+      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
+      continue;
+    }
+
+    if (theArgs[i]->use_empty()) {
+      // argument is dead
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+
+        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(DL, Ty);
+        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+        for (unsigned parti = 0; parti < NumRegs; ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (NumRegs > 0)
+          --InsIdx;
+        continue;
+      }
+      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+      continue;
+    }
+
+    // In the following cases, assign a node order of "idx+1"
+    // to newly created nodes. The SDNodes for params have to
+    // appear in the same order as their order of appearance
+    // in the original function. "idx+1" holds that order.
+    if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+        SmallVector<uint64_t, 16> offsets;
+
+        // NOTE: Here, we lose the ability to issue vector loads for vectors
+        // that are a part of a struct.  This should be investigated in the
+        // future.
+        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
+                           0);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        bool aggregateIsPacked = false;
+        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+          aggregateIsPacked = STy->isPacked();
+
+        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          EVT partVT = vtparts[parti];
+          Value *srcValue = Constant::getNullValue(
+              PointerType::get(partVT.getTypeForEVT(F->getContext()),
+                               llvm::ADDRESS_SPACE_PARAM));
+          SDValue srcAddr =
+              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+                          DAG.getConstant(offsets[parti], dl, PtrVT));
+          unsigned partAlign = aggregateIsPacked
+                                   ? 1
+                                   : DL.getABITypeAlignment(
+                                         partVT.getTypeForEVT(F->getContext()));
+          SDValue p;
+          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
+            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
+            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
+                               MachinePointerInfo(srcValue), partVT, partAlign);
+          } else {
+            p = DAG.getLoad(partVT, dl, Root, srcAddr,
+                            MachinePointerInfo(srcValue), partAlign);
+          }
+          if (p.getNode())
+            p.getNode()->setIROrder(idx + 1);
+          InVals.push_back(p);
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(DL, Ty);
+        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
+               "Vector was not scalarized");
+        EVT EltVT = ObjectVT.getVectorElementType();
+
+        // V1 load
+        // f32 = load ...
+        if (NumElts == 1) {
+          // We only have one element, so just directly load it
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue P = DAG.getLoad(
+              EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+              DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
+              MachineMemOperand::MODereferenceable |
+                  MachineMemOperand::MOInvariant);
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
+          InVals.push_back(P);
+          ++InsIdx;
+        } else if (NumElts == 2) {
+          // V2 load
+          // f32,f32 = load ...
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue P = DAG.getLoad(
+              VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
+              DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+              MachineMemOperand::MODereferenceable |
+                  MachineMemOperand::MOInvariant);
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(0, dl));
+          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(1, dl));
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
+            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
+            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
+          }
+
+          InVals.push_back(Elt0);
+          InVals.push_back(Elt1);
+          InsIdx += 2;
+        } else {
+          // V4 loads
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the vector will be expanded to a power of 2 elements, so we know we
+          // can always round up to the next multiple of 4 when creating the
+          // vector loads.
+          // e.g.  4 elem => 1 ld.v4
+          //       6 elem => 2 ld.v4
+          //       8 elem => 2 ld.v4
+          //      11 elem => 3 ld.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64) {
+            VecSize = 2;
+          }
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+          unsigned Ofst = 0;
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            Value *SrcValue = Constant::getNullValue(
+                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
+                                 llvm::ADDRESS_SPACE_PARAM));
+            SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+                                          DAG.getConstant(Ofst, dl, PtrVT));
+            SDValue P = DAG.getLoad(
+                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
+                DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
+                MachineMemOperand::MODereferenceable |
+                    MachineMemOperand::MOInvariant);
+            if (P.getNode())
+              P.getNode()->setIROrder(idx + 1);
+
+            for (unsigned j = 0; j < VecSize; ++j) {
+              if (i + j >= NumElts)
+                break;
+              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                        DAG.getIntPtrConstant(j, dl));
+              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
+              InVals.push_back(Elt);
+            }
+            Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+          }
+          InsIdx += NumElts;
+        }
+
+        if (NumElts > 0)
+          --InsIdx;
+        continue;
+      }
+      // A plain scalar.
+      EVT ObjectVT = getValueType(DL, Ty);
+      // If ABI, load from the param symbol
+      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+      Value *srcValue = Constant::getNullValue(PointerType::get(
+          ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+      SDValue p;
+       if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
+        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                       ISD::SEXTLOAD : ISD::ZEXTLOAD;
+        p = DAG.getExtLoad(
+            ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
+            ObjectVT,
+            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+      } else {
+        p = DAG.getLoad(
+            Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
+            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+      }
+      if (p.getNode())
+        p.getNode()->setIROrder(idx + 1);
+      InVals.push_back(p);
+      continue;
+    }
+
+    // Param has ByVal attribute
+    // Return MoveParam(param symbol).
+    // Ideally, the param symbol can be returned directly,
+    // but when SDNode builder decides to use it in a CopyToReg(),
+    // machine instruction fails because TargetExternalSymbol
+    // (not lowered) is target dependent, and CopyToReg assumes
+    // the source is lowered.
+    EVT ObjectVT = getValueType(DL, Ty);
+    assert(ObjectVT == Ins[InsIdx].VT &&
+           "Ins type did not match function type");
+    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+    if (p.getNode())
+      p.getNode()->setIROrder(idx + 1);
+    InVals.push_back(p);
+  }
+
+  // Clang will check explicit VarArg and issue error if any. However, Clang
+  // will let code with
+  // implicit var arg like f() pass. See bug 617733.
+  // We treat this case as if the arg list is empty.
+  // if (F.isVarArg()) {
+  // assert(0 && "VarArg not supported yet!");
+  //}
+
+  if (!OutChains.empty())
+    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+
+  return Chain;
+}
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SDLoc &dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+  Type *RetTy = F->getReturnType();
+  const DataLayout &TD = DAG.getDataLayout();
+
+  bool isABI = (STI.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
+    // If we have a vector type, the OutVals array will be the scalarized
+    // components and we have combine them into 1 or more vector stores.
+    unsigned NumElts = VTy->getNumElements();
+    assert(NumElts == Outs.size() && "Bad scalarization of return value");
+
+    // const_cast can be removed in later LLVM versions
+    EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
+    bool NeedExtend = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExtend = true;
+
+    // V1 store
+    if (NumElts == 1) {
+      SDValue StoreVal = OutVals[0];
+      // We only have one element, so just directly store it
+      if (NeedExtend)
+        StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      EltVT, MachinePointerInfo());
+
+    } else if (NumElts == 2) {
+      // V2 store
+      SDValue StoreVal0 = OutVals[0];
+      SDValue StoreVal1 = OutVals[1];
+
+      if (NeedExtend) {
+        StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
+        StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
+      }
+
+      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
+                        StoreVal1 };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      EltVT, MachinePointerInfo());
+    } else {
+      // V4 stores
+      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+      // vector will be expanded to a power of 2 elements, so we know we can
+      // always round up to the next multiple of 4 when creating the vector
+      // stores.
+      // e.g.  4 elem => 1 st.v4
+      //       6 elem => 2 st.v4
+      //       8 elem => 2 st.v4
+      //      11 elem => 3 st.v4
+
+      unsigned VecSize = 4;
+      if (OutVals[0].getValueSizeInBits() == 64)
+        VecSize = 2;
+
+      unsigned Offset = 0;
+
+      EVT VecVT =
+          EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+      unsigned PerStoreOffset =
+          TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+
+      for (unsigned i = 0; i < NumElts; i += VecSize) {
+        // Get values
+        SDValue StoreVal;
+        SmallVector<SDValue, 8> Ops;
+        Ops.push_back(Chain);
+        Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
+        unsigned Opc = NVPTXISD::StoreRetvalV2;
+        EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
+
+        StoreVal = OutVals[i];
+        if (NeedExtend)
+          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        Ops.push_back(StoreVal);
+
+        if (i + 1 < NumElts) {
+          StoreVal = OutVals[i + 1];
+          if (NeedExtend)
+            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        } else {
+          StoreVal = DAG.getUNDEF(ExtendedVT);
+        }
+        Ops.push_back(StoreVal);
+
+        if (VecSize == 4) {
+          Opc = NVPTXISD::StoreRetvalV4;
+          if (i + 2 < NumElts) {
+            StoreVal = OutVals[i + 2];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+
+          if (i + 3 < NumElts) {
+            StoreVal = OutVals[i + 3];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+        }
+
+        // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
+        Chain =
+            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
+                                    EltVT, MachinePointerInfo());
+        Offset += PerStoreOffset;
+      }
+    }
+  } else {
+    SmallVector<EVT, 16> ValVTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
+    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
+
+    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+      SDValue theVal = OutVals[i];
+      EVT TheValType = theVal.getValueType();
+      unsigned numElems = 1;
+      if (TheValType.isVector())
+        numElems = TheValType.getVectorNumElements();
+      for (unsigned j = 0, je = numElems; j != je; ++j) {
+        SDValue TmpVal = theVal;
+        if (TheValType.isVector())
+          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                               TheValType.getVectorElementType(), TmpVal,
+                               DAG.getIntPtrConstant(j, dl));
+        EVT TheStoreType = ValVTs[i];
+        if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
+          // The following zero-extension is for integer types only, and
+          // specifically not for aggregates.
+          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+          TheStoreType = MVT::i32;
+        }
+        else if (TmpVal.getValueSizeInBits() < 16)
+          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+        SDValue Ops[] = {
+          Chain,
+          DAG.getConstant(Offsets[i], dl, MVT::i32),
+          TmpVal };
+        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                        DAG.getVTList(MVT::Other), Ops,
+                                        TheStoreType,
+                                        MachinePointerInfo());
+      }
+    }
+  }
+
+  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  if (Constraint.length() > 1)
+    return;
+  else
+    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+    return NVPTXISD::Tex1DFloatS32;
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloat;
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+    return NVPTXISD::Tex1DS32S32;
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+    return NVPTXISD::Tex1DS32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+    return NVPTXISD::Tex1DU32S32;
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+    return NVPTXISD::Tex1DU32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+    return NVPTXISD::Tex1DArrayFloatS32;
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+    return NVPTXISD::Tex1DArrayS32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+    return NVPTXISD::Tex1DArrayU32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
+    return NVPTXISD::Tex2DFloatS32;
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloat;
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+    return NVPTXISD::Tex2DS32S32;
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+    return NVPTXISD::Tex2DS32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+    return NVPTXISD::Tex2DU32S32;
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+    return NVPTXISD::Tex2DU32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+    return NVPTXISD::Tex2DArrayFloatS32;
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+    return NVPTXISD::Tex2DArrayS32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+    return NVPTXISD::Tex2DArrayU32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
+    return NVPTXISD::Tex3DFloatS32;
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloat;
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+    return NVPTXISD::Tex3DS32S32;
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+    return NVPTXISD::Tex3DS32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+    return NVPTXISD::Tex3DU32S32;
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+    return NVPTXISD::Tex3DU32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloat;
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+    return NVPTXISD::TexCubeS32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+    return NVPTXISD::TexCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+    return NVPTXISD::TexCubeU32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+    return NVPTXISD::TexCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4R2DFloatFloat;
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4G2DFloatFloat;
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4B2DFloatFloat;
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4A2DFloatFloat;
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4R2DS64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4G2DS64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4B2DS64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4A2DS64Float;
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4R2DU64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4G2DU64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4B2DU64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4A2DU64Float;
+
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+    return NVPTXISD::TexUnified1DFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+    return NVPTXISD::TexUnified1DS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+    return NVPTXISD::TexUnified1DU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+    return NVPTXISD::TexUnified1DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+    return NVPTXISD::TexUnified1DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+    return NVPTXISD::TexUnified1DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+    return NVPTXISD::TexUnified2DFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+    return NVPTXISD::TexUnified2DS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+    return NVPTXISD::TexUnified2DU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+    return NVPTXISD::TexUnified2DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+    return NVPTXISD::TexUnified2DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+    return NVPTXISD::TexUnified2DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+    return NVPTXISD::TexUnified3DFloatS32;
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+    return NVPTXISD::TexUnified3DS32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+    return NVPTXISD::TexUnified3DU32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedR2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedG2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedB2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedA2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedR2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedG2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedB2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedA2DU64Float;
+  }
+}
+
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+    return NVPTXISD::Suld1DI8Clamp;
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+    return NVPTXISD::Suld1DI16Clamp;
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+    return NVPTXISD::Suld1DI32Clamp;
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+    return NVPTXISD::Suld1DI64Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+    return NVPTXISD::Suld1DV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+    return NVPTXISD::Suld1DV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+    return NVPTXISD::Suld1DV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+    return NVPTXISD::Suld1DV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+    return NVPTXISD::Suld1DV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+    return NVPTXISD::Suld1DV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+    return NVPTXISD::Suld1DV4I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+    return NVPTXISD::Suld1DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+    return NVPTXISD::Suld1DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+    return NVPTXISD::Suld1DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+    return NVPTXISD::Suld1DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+    return NVPTXISD::Suld1DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+    return NVPTXISD::Suld1DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+    return NVPTXISD::Suld1DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+    return NVPTXISD::Suld1DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+    return NVPTXISD::Suld1DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+    return NVPTXISD::Suld1DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+    return NVPTXISD::Suld1DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+    return NVPTXISD::Suld2DI8Clamp;
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+    return NVPTXISD::Suld2DI16Clamp;
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+    return NVPTXISD::Suld2DI32Clamp;
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+    return NVPTXISD::Suld2DI64Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+    return NVPTXISD::Suld2DV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+    return NVPTXISD::Suld2DV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+    return NVPTXISD::Suld2DV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+    return NVPTXISD::Suld2DV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+    return NVPTXISD::Suld2DV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+    return NVPTXISD::Suld2DV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+    return NVPTXISD::Suld2DV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+    return NVPTXISD::Suld2DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+    return NVPTXISD::Suld2DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+    return NVPTXISD::Suld2DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+    return NVPTXISD::Suld2DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+    return NVPTXISD::Suld2DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+    return NVPTXISD::Suld2DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+    return NVPTXISD::Suld2DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+    return NVPTXISD::Suld2DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+    return NVPTXISD::Suld2DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+    return NVPTXISD::Suld2DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+    return NVPTXISD::Suld2DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+    return NVPTXISD::Suld3DI8Clamp;
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+    return NVPTXISD::Suld3DI16Clamp;
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+    return NVPTXISD::Suld3DI32Clamp;
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+    return NVPTXISD::Suld3DI64Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+    return NVPTXISD::Suld3DV2I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+    return NVPTXISD::Suld3DV2I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+    return NVPTXISD::Suld3DV2I32Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+    return NVPTXISD::Suld3DV2I64Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+    return NVPTXISD::Suld3DV4I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+    return NVPTXISD::Suld3DV4I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+    return NVPTXISD::Suld3DV4I32Clamp;
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+    return NVPTXISD::Suld1DI8Trap;
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+    return NVPTXISD::Suld1DI16Trap;
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+    return NVPTXISD::Suld1DI32Trap;
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+    return NVPTXISD::Suld1DI64Trap;
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+    return NVPTXISD::Suld1DV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+    return NVPTXISD::Suld1DV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+    return NVPTXISD::Suld1DV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+    return NVPTXISD::Suld1DV2I64Trap;
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+    return NVPTXISD::Suld1DV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+    return NVPTXISD::Suld1DV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+    return NVPTXISD::Suld1DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+    return NVPTXISD::Suld1DArrayI8Trap;
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+    return NVPTXISD::Suld1DArrayI16Trap;
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+    return NVPTXISD::Suld1DArrayI32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+    return NVPTXISD::Suld1DArrayI64Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+    return NVPTXISD::Suld1DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+    return NVPTXISD::Suld1DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+    return NVPTXISD::Suld1DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+    return NVPTXISD::Suld1DArrayV2I64Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+    return NVPTXISD::Suld1DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+    return NVPTXISD::Suld1DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+    return NVPTXISD::Suld1DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+    return NVPTXISD::Suld2DI8Trap;
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+    return NVPTXISD::Suld2DI16Trap;
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+    return NVPTXISD::Suld2DI32Trap;
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+    return NVPTXISD::Suld2DI64Trap;
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+    return NVPTXISD::Suld2DV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+    return NVPTXISD::Suld2DV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+    return NVPTXISD::Suld2DV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+    return NVPTXISD::Suld2DV2I64Trap;
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+    return NVPTXISD::Suld2DV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+    return NVPTXISD::Suld2DV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+    return NVPTXISD::Suld2DV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+    return NVPTXISD::Suld2DArrayI8Trap;
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+    return NVPTXISD::Suld2DArrayI16Trap;
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+    return NVPTXISD::Suld2DArrayI32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+    return NVPTXISD::Suld2DArrayI64Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+    return NVPTXISD::Suld2DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+    return NVPTXISD::Suld2DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+    return NVPTXISD::Suld2DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+    return NVPTXISD::Suld2DArrayV2I64Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+    return NVPTXISD::Suld2DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+    return NVPTXISD::Suld2DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+    return NVPTXISD::Suld2DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+    return NVPTXISD::Suld3DI8Trap;
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+    return NVPTXISD::Suld3DI16Trap;
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+    return NVPTXISD::Suld3DI32Trap;
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+    return NVPTXISD::Suld3DI64Trap;
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+    return NVPTXISD::Suld3DV2I8Trap;
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+    return NVPTXISD::Suld3DV2I16Trap;
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+    return NVPTXISD::Suld3DV2I32Trap;
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+    return NVPTXISD::Suld3DV2I64Trap;
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+    return NVPTXISD::Suld3DV4I8Trap;
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+    return NVPTXISD::Suld3DV4I16Trap;
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+    return NVPTXISD::Suld3DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+    return NVPTXISD::Suld1DI8Zero;
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+    return NVPTXISD::Suld1DI16Zero;
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+    return NVPTXISD::Suld1DI32Zero;
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+    return NVPTXISD::Suld1DI64Zero;
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+    return NVPTXISD::Suld1DV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+    return NVPTXISD::Suld1DV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+    return NVPTXISD::Suld1DV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+    return NVPTXISD::Suld1DV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+    return NVPTXISD::Suld1DV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+    return NVPTXISD::Suld1DV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+    return NVPTXISD::Suld1DV4I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+    return NVPTXISD::Suld1DArrayI8Zero;
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+    return NVPTXISD::Suld1DArrayI16Zero;
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+    return NVPTXISD::Suld1DArrayI32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+    return NVPTXISD::Suld1DArrayI64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+    return NVPTXISD::Suld1DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+    return NVPTXISD::Suld1DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+    return NVPTXISD::Suld1DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+    return NVPTXISD::Suld1DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+    return NVPTXISD::Suld1DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+    return NVPTXISD::Suld1DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+    return NVPTXISD::Suld1DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+    return NVPTXISD::Suld2DI8Zero;
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+    return NVPTXISD::Suld2DI16Zero;
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+    return NVPTXISD::Suld2DI32Zero;
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+    return NVPTXISD::Suld2DI64Zero;
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+    return NVPTXISD::Suld2DV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+    return NVPTXISD::Suld2DV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+    return NVPTXISD::Suld2DV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+    return NVPTXISD::Suld2DV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+    return NVPTXISD::Suld2DV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+    return NVPTXISD::Suld2DV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+    return NVPTXISD::Suld2DV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+    return NVPTXISD::Suld2DArrayI8Zero;
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+    return NVPTXISD::Suld2DArrayI16Zero;
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+    return NVPTXISD::Suld2DArrayI32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+    return NVPTXISD::Suld2DArrayI64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+    return NVPTXISD::Suld2DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+    return NVPTXISD::Suld2DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+    return NVPTXISD::Suld2DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+    return NVPTXISD::Suld2DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+    return NVPTXISD::Suld2DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+    return NVPTXISD::Suld2DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+    return NVPTXISD::Suld2DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+    return NVPTXISD::Suld3DI8Zero;
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+    return NVPTXISD::Suld3DI16Zero;
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+    return NVPTXISD::Suld3DI32Zero;
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+    return NVPTXISD::Suld3DI64Zero;
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+    return NVPTXISD::Suld3DV2I8Zero;
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+    return NVPTXISD::Suld3DV2I16Zero;
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+    return NVPTXISD::Suld3DV2I32Zero;
+  case Intrinsic::nvvm_suld_3d_v2i64_zero:
+    return NVPTXISD::Suld3DV2I64Zero;
+  case Intrinsic::nvvm_suld_3d_v4i8_zero:
+    return NVPTXISD::Suld3DV4I8Zero;
+  case Intrinsic::nvvm_suld_3d_v4i16_zero:
+    return NVPTXISD::Suld3DV4I16Zero;
+  case Intrinsic::nvvm_suld_3d_v4i32_zero:
+    return NVPTXISD::Suld3DV4I32Zero;
+  }
+}
+
+// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+// TgtMemIntrinsic
+// because we need the information that is only available in the "Value" type
+// of destination
+// pointer. In particular, the address space information.
+bool NVPTXTargetLowering::getTgtMemIntrinsic(
+    IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  default:
+    return false;
+
+  case Intrinsic::nvvm_atomic_load_add_f32:
+  case Intrinsic::nvvm_atomic_load_inc_32:
+  case Intrinsic::nvvm_atomic_load_dec_32:
+
+  case Intrinsic::nvvm_atomic_add_gen_f_cta:
+  case Intrinsic::nvvm_atomic_add_gen_f_sys:
+  case Intrinsic::nvvm_atomic_add_gen_i_cta:
+  case Intrinsic::nvvm_atomic_add_gen_i_sys:
+  case Intrinsic::nvvm_atomic_and_gen_i_cta:
+  case Intrinsic::nvvm_atomic_and_gen_i_sys:
+  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+  case Intrinsic::nvvm_atomic_max_gen_i_cta:
+  case Intrinsic::nvvm_atomic_max_gen_i_sys:
+  case Intrinsic::nvvm_atomic_min_gen_i_cta:
+  case Intrinsic::nvvm_atomic_min_gen_i_sys:
+  case Intrinsic::nvvm_atomic_or_gen_i_cta:
+  case Intrinsic::nvvm_atomic_or_gen_i_sys:
+  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
+    auto &DL = I.getModule()->getDataLayout();
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = getValueType(DL, I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = true;
+    Info.align = 0;
+    return true;
+  }
+
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+    auto &DL = I.getModule()->getDataLayout();
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+      Info.memVT = getValueType(DL, I.getType());
+    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
+      Info.memVT = getPointerTy(DL);
+    else
+      Info.memVT = getValueType(DL, I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+    return true;
+  }
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p: {
+    auto &DL = I.getModule()->getDataLayout();
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
+      Info.memVT = getValueType(DL, I.getType());
+    else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
+      Info.memVT = getPointerTy(DL);
+    else
+      Info.memVT = getValueType(DL, I.getType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+    return true;
+  }
+
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::v4f32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::v4i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+  case Intrinsic::nvvm_suld_3d_v4i8_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i8;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+  case Intrinsic::nvvm_suld_3d_v4i16_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i16;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+  case Intrinsic::nvvm_suld_3d_v4i32_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+  case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i64;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  }
+  return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and memory optimization for address mode
+/// (CodeGenPrepare.cpp)
+bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
+
+  // AddrMode - This represents an addressing mode of:
+  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+  //
+  // The legal address modes are
+  // - [avar]
+  // - [areg]
+  // - [areg+immoff]
+  // - [immAddr]
+
+  if (AM.BaseGV) {
+    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
+  }
+
+  switch (AM.Scale) {
+  case 0: // "r", "r+i" or "i" is allowed
+    break;
+  case 1:
+    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+      return false;
+    // Otherwise we have r+i.
+    break;
+  default:
+    // No scale > 1 is allowed
+    return false;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                         NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'b':
+    case 'r':
+    case 'h':
+    case 'c':
+    case 'l':
+    case 'f':
+    case 'd':
+    case '0':
+    case 'N':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  StringRef Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'b':
+      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
+    case 'c':
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+    case 'h':
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+    case 'r':
+      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+    case 'l':
+    case 'N':
+      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+    case 'f':
+      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+    case 'd':
+      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                         NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                   CodeGenOpt::Level OptLevel) const {
+  const Function *F = MF.getFunction();
+  const TargetOptions &TO = MF.getTarget().Options;
+
+  // Always honor command-line argument
+  if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+    return FMAContractLevelOpt > 0;
+  } else if (OptLevel == 0) {
+    // Do not contract if we're not optimizing the code
+    return false;
+  } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
+    // Honor TargetOptions flags that explicitly say fusion is okay
+    return true;
+  } else if (F->hasFnAttribute("unsafe-fp-math")) {
+    // Check for unsafe-fp-math=true coming from Clang
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    StringRef Val = Attr.getValueAsString();
+    if (Val == "true")
+      return true;
+  }
+
+  // We did not have a clear indication that fusion is allowed, so assume not
+  return false;
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                             const NVPTXSubtarget &Subtarget,
+                                             CodeGenOpt::Level OptLevel) {
+  SelectionDAG  &DAG = DCI.DAG;
+  // Skip non-integer, non-scalar case
+  EVT VT=N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  // fold (add (mul a, b), c) -> (mad a, b, c)
+  //
+  if (N0.getOpcode() == ISD::MUL) {
+    assert (VT.isInteger());
+    // For integer:
+    // Since integer multiply-add costs the same as integer multiply
+    // but is more costly than integer add, do the fusion only when
+    // the mul is only used in the add.
+    if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
+        !N0.getNode()->hasOneUse())
+      return SDValue();
+
+    // Do the folding
+    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  }
+  else if (N0.getOpcode() == ISD::FMUL) {
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+          &DAG.getTargetLoweringInfo());
+      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+        return SDValue();
+
+      // For floating point:
+      // Do the fusion only when the mul has less than 5 uses and all
+      // are add.
+      // The heuristic is that if a use is not an add, then that use
+      // cannot be fused into fma, therefore mul is still needed anyway.
+      // If there are more than 4 uses, even if they are all add, fusing
+      // them will increase register pressue.
+      //
+      int numUses = 0;
+      int nonAddCount = 0;
+      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+           UE = N0.getNode()->use_end();
+           UI != UE; ++UI) {
+        numUses++;
+        SDNode *User = *UI;
+        if (User->getOpcode() != ISD::FADD)
+          ++nonAddCount;
+      }
+      if (numUses >= 5)
+        return SDValue();
+      if (nonAddCount) {
+        int orderNo = N->getIROrder();
+        int orderNo2 = N0.getNode()->getIROrder();
+        // simple heuristics here for considering potential register
+        // pressure, the logics here is that the differnce are used
+        // to measure the distance between def and use, the longer distance
+        // more likely cause register pressure.
+        if (orderNo - orderNo2 < 500)
+          return SDValue();
+
+        // Now, check if at least one of the FMUL's operands is live beyond the node N,
+        // which guarantees that the FMA will not increase register pressure at node N.
+        bool opIsLive = false;
+        const SDNode *left = N0.getOperand(0).getNode();
+        const SDNode *right = N0.getOperand(1).getNode();
+
+        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
+          opIsLive = true;
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
+            SDNode *User = *UI;
+            int orderNo3 = User->getIROrder();
+            if (orderNo3 > orderNo) {
+              opIsLive = true;
+              break;
+            }
+          }
+
+        if (!opIsLive)
+          return SDValue();
+      }
+
+      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1), N1);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const NVPTXSubtarget &Subtarget,
+                                 CodeGenOpt::Level OptLevel) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  if (SDValue Result =
+          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // The type legalizer turns a vector load of i8 values into a zextload to i16
+  // registers, optionally ANY_EXTENDs it (if target type is integer),
+  // and ANDs off the high 8 bits. Since we turn this load into a
+  // target-specific DAG node, the DAG combiner fails to eliminate these AND
+  // nodes. Do that here.
+  SDValue Val = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+
+  if (isa<ConstantSDNode>(Val)) {
+    std::swap(Val, Mask);
+  }
+
+  SDValue AExt;
+  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+  if (Val.getOpcode() == ISD::ANY_EXTEND) {
+    AExt = Val;
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+    Val = Val->getOperand(0);
+  }
+
+  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+      Val->getOpcode() == NVPTXISD::LoadV4) {
+    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+    if (!MaskCnst) {
+      // Not an AND with a constant
+      return SDValue();
+    }
+
+    uint64_t MaskVal = MaskCnst->getZExtValue();
+    if (MaskVal != 0xff) {
+      // Not an AND that chops off top 8 bits
+      return SDValue();
+    }
+
+    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+    if (!Mem) {
+      // Not a MemSDNode?!?
+      return SDValue();
+    }
+
+    EVT MemVT = Mem->getMemoryVT();
+    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+      // We only handle the i8 case
+      return SDValue();
+    }
+
+    unsigned ExtType =
+      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+        getZExtValue();
+    if (ExtType == ISD::SEXTLOAD) {
+      // If for some reason the load is a sextload, the and is needed to zero
+      // out the high 8 bits
+      return SDValue();
+    }
+
+    bool AddTo = false;
+    if (AExt.getNode() != 0) {
+      // Re-insert the ext as a zext.
+      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+                            AExt.getValueType(), Val);
+      AddTo = true;
+    }
+
+    // If we get here, the AND is unnecessary.  Just replace it with the load
+    DCI.CombineTo(N, Val, AddTo);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformSELECTCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  // Currently this detects patterns for integer min and max and
+  // lowers them to PTX-specific intrinsics that enable hardware
+  // support.
+
+  const SDValue Cond = N->getOperand(0);
+  if (Cond.getOpcode() != ISD::SETCC) return SDValue();
+
+  const SDValue LHS = Cond.getOperand(0);
+  const SDValue RHS = Cond.getOperand(1);
+  const SDValue True = N->getOperand(1);
+  const SDValue False = N->getOperand(2);
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+    return SDValue();
+
+  const EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
+
+  const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+  SDValue Larger;  // The larger of LHS and RHS when condition is true.
+  switch (CC) {
+    case ISD::SETULT:
+    case ISD::SETULE:
+    case ISD::SETLT:
+    case ISD::SETLE:
+      Larger = RHS;
+      break;
+
+    case ISD::SETGT:
+    case ISD::SETGE:
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      Larger = LHS;
+      break;
+
+    default:
+      return SDValue();
+  }
+  const bool IsMax = (Larger == True);
+  const bool IsSigned = ISD::isSignedIntSetCC(CC);
+
+  unsigned IntrinsicId;
+  if (VT == MVT::i32) {
+    if (IsSigned)
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
+    else
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
+  } else {
+    assert(VT == MVT::i64);
+    if (IsSigned)
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
+    else
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
+  }
+
+  SDLoc DL(N);
+  return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                         DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
+}
+
+static SDValue PerformREMCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
+
+  // Don't do anything at less than -O2.
+  if (OptLevel < CodeGenOpt::Default)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  bool IsSigned = N->getOpcode() == ISD::SREM;
+  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
+
+  const SDValue &Num = N->getOperand(0);
+  const SDValue &Den = N->getOperand(1);
+
+  for (const SDNode *U : Num->uses()) {
+    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
+        U->getOperand(1) == Den) {
+      // Num % Den -> Num - (Num / Den) * Den
+      return DAG.getNode(ISD::SUB, DL, VT, Num,
+                         DAG.getNode(ISD::MUL, DL, VT,
+                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
+                                     Den));
+    }
+  }
+  return SDValue();
+}
+
+enum OperandSignedness {
+  Signed = 0,
+  Unsigned,
+  Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+                                      unsigned OptSize,
+                                      OperandSignedness &S) {
+  S = Unknown;
+
+  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() <= OptSize) {
+      S = Signed;
+      return true;
+    }
+  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    EVT OrigVT = Op.getOperand(0).getValueType();
+    if (OrigVT.getSizeInBits() <= OptSize) {
+      S = Unsigned;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+                                        unsigned OptSize,
+                                        bool &IsSigned) {
+
+  OperandSignedness LHSSign;
+
+  // The LHS operand must be a demotable op
+  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+    return false;
+
+  // We should have been able to determine the signedness from the LHS
+  if (LHSSign == Unknown)
+    return false;
+
+  IsSigned = (LHSSign == Signed);
+
+  // The RHS can be a demotable op or a constant
+  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+    const APInt &Val = CI->getAPIntValue();
+    if (LHSSign == Unsigned) {
+      return Val.isIntN(OptSize);
+    } else {
+      return Val.isSignedIntN(OptSize);
+    }
+  } else {
+    OperandSignedness RHSSign;
+    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+      return false;
+
+    return LHSSign == RHSSign;
+  }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT MulType = N->getValueType(0);
+  if (MulType != MVT::i32 && MulType != MVT::i64) {
+    return SDValue();
+  }
+
+  SDLoc DL(N);
+  unsigned OptSize = MulType.getSizeInBits() >> 1;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Canonicalize the multiply so the constant (if any) is on the right
+  if (N->getOpcode() == ISD::MUL) {
+    if (isa<ConstantSDNode>(LHS)) {
+      std::swap(LHS, RHS);
+    }
+  }
+
+  // If we have a SHL, determine the actual multiply amount
+  if (N->getOpcode() == ISD::SHL) {
+    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+    if (!ShlRHS) {
+      return SDValue();
+    }
+
+    APInt ShiftAmt = ShlRHS->getAPIntValue();
+    unsigned BitWidth = MulType.getSizeInBits();
+    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
+    } else {
+      return SDValue();
+    }
+  }
+
+  bool Signed;
+  // Verify that our operands are demotable
+  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+    return SDValue();
+  }
+
+  EVT DemotedVT;
+  if (MulType == MVT::i32) {
+    DemotedVT = MVT::i16;
+  } else {
+    DemotedVT = MVT::i32;
+  }
+
+  // Truncate the operands to the correct size. Note that these are just for
+  // type consistency and will (likely) be eliminated in later phases.
+  SDValue TruncLHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
+  SDValue TruncRHS =
+    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
+
+  unsigned Opc;
+  if (Signed) {
+    Opc = NVPTXISD::MUL_WIDE_SIGNED;
+  } else {
+    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+  }
+
+  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
+}
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    if (SDValue Ret = TryMULWIDECombine(N, DCI))
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 CodeGenOpt::Level OptLevel) {
+  if (OptLevel > 0) {
+    // Try mul.wide combining at OptLevel > 0
+    if (SDValue Ret = TryMULWIDECombine(N, DCI))
+      return Ret;
+  }
+
+  return SDValue();
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+  switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::FADD:
+      return PerformADDCombine(N, DCI, STI, OptLevel);
+    case ISD::MUL:
+      return PerformMULCombine(N, DCI, OptLevel);
+    case ISD::SHL:
+      return PerformSHLCombine(N, DCI, OptLevel);
+    case ISD::AND:
+      return PerformANDCombine(N, DCI);
+    case ISD::SELECT:
+      return PerformSELECTCombine(N, DCI);
+    case ISD::UREM:
+    case ISD::SREM:
+      return PerformREMCombine(N, DCI, OptLevel);
+  }
+  return SDValue();
+}
+
+/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &Results) {
+  EVT ResVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  assert(ResVT.isVector() && "Vector load must have vector type");
+
+  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+  // legal.  We can (and should) split that into 2 loads of <2 x double> here
+  // but I'm leaving that as a TODO for now.
+  assert(ResVT.isSimple() && "Can only handle simple types");
+  switch (ResVT.getSimpleVT().SimpleTy) {
+  default:
+    return;
+  case MVT::v2i8:
+  case MVT::v2i16:
+  case MVT::v2i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v2f64:
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    // This is a "native" vector type
+    break;
+  }
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  auto &TD = DAG.getDataLayout();
+  unsigned PrefAlign =
+      TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
+  EVT EltVT = ResVT.getVectorElementType();
+  unsigned NumElts = ResVT.getVectorNumElements();
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propagate the "real" type as the memory type.
+  bool NeedTrunc = false;
+  if (EltVT.getSizeInBits() < 16) {
+    EltVT = MVT::i16;
+    NeedTrunc = true;
+  }
+
+  unsigned Opcode = 0;
+  SDVTList LdResVTs;
+
+  switch (NumElts) {
+  default:
+    return;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+    break;
+  case 4: {
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
+  }
+
+  // Copy regular operands
+  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension information
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                          LD->getMemoryVT(),
+                                          LD->getMemOperand());
+
+  SmallVector<SDValue, 4> ScalarRes;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Res = NewLD.getValue(i);
+    if (NeedTrunc)
+      Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+    ScalarRes.push_back(Res);
+  }
+
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+  Results.push_back(BuildVec);
+  Results.push_back(LoadChain);
+}
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &Results) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Intrin = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Get the intrinsic ID
+  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+  switch (IntrinNo) {
+  default:
+    return;
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+    EVT ResVT = N->getValueType(0);
+
+    if (ResVT.isVector()) {
+      // Vector LDG/LDU
+
+      unsigned NumElts = ResVT.getVectorNumElements();
+      EVT EltVT = ResVT.getVectorElementType();
+
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type
+      // legalization.
+      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+      // loaded type to i16 and propagate the "real" type as the memory type.
+      bool NeedTrunc = false;
+      if (EltVT.getSizeInBits() < 16) {
+        EltVT = MVT::i16;
+        NeedTrunc = true;
+      }
+
+      unsigned Opcode = 0;
+      SDVTList LdResVTs;
+
+      switch (NumElts) {
+      default:
+        return;
+      case 2:
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV2;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV2;
+          break;
+        }
+        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+        break;
+      case 4: {
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV4;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV4;
+          break;
+        }
+        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+        LdResVTs = DAG.getVTList(ListVTs);
+        break;
+      }
+      }
+
+      SmallVector<SDValue, 8> OtherOps;
+
+      // Copy regular operands
+
+      OtherOps.push_back(Chain); // Chain
+                                 // Skip operand 1 (intrinsic ID)
+      // Others
+      OtherOps.append(N->op_begin() + 2, N->op_end());
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                              MemSD->getMemoryVT(),
+                                              MemSD->getMemOperand());
+
+      SmallVector<SDValue, 4> ScalarRes;
+
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Res = NewLD.getValue(i);
+        if (NeedTrunc)
+          Res =
+              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+        ScalarRes.push_back(Res);
+      }
+
+      SDValue LoadChain = NewLD.getValue(NumElts);
+
+      SDValue BuildVec =
+          DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+      Results.push_back(BuildVec);
+      Results.push_back(LoadChain);
+    } else {
+      // i8 LDG/LDU
+      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+             "Custom handling of non-i8 ldu/ldg?");
+
+      // Just copy all operands as-is
+      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+
+      // Force output to i16
+      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      // We make sure the memory type is i8, which will be used during isel
+      // to select the proper instruction.
+      SDValue NewLD =
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                  MVT::i8, MemSD->getMemOperand());
+
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                    NewLD.getValue(0)));
+      Results.push_back(NewLD.getValue(1));
+    }
+  }
+  }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    report_fatal_error("Unhandled custom legalization");
+  case ISD::LOAD:
+    ReplaceLoadVector(N, DAG, Results);
+    return;
+  case ISD::INTRINSIC_W_CHAIN:
+    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+    return;
+  }
+}
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+  delete static_cast<NVPTXSection *>(TextSection);
+  delete static_cast<NVPTXSection *>(DataSection);
+  delete static_cast<NVPTXSection *>(BSSSection);
+  delete static_cast<NVPTXSection *>(ReadOnlySection);
+
+  delete static_cast<NVPTXSection *>(StaticCtorSection);
+  delete static_cast<NVPTXSection *>(StaticDtorSection);
+  delete static_cast<NVPTXSection *>(LSDASection);
+  delete static_cast<NVPTXSection *>(EHFrameSection);
+  delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
+  delete static_cast<NVPTXSection *>(DwarfInfoSection);
+  delete static_cast<NVPTXSection *>(DwarfLineSection);
+  delete static_cast<NVPTXSection *>(DwarfFrameSection);
+  delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
+  delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
+  delete static_cast<NVPTXSection *>(DwarfStrSection);
+  delete static_cast<NVPTXSection *>(DwarfLocSection);
+  delete static_cast<NVPTXSection *>(DwarfARangesSection);
+  delete static_cast<NVPTXSection *>(DwarfRangesSection);
+  delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
+}
+
+MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  return getDataSection();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 000000000000..e433aed7781b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,547 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace NVPTXISD {
+enum NodeType : unsigned {
+  // Start the numbering from where ISD NodeType finishes.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  Wrapper,
+  CALL,
+  RET_FLAG,
+  LOAD_PARAM,
+  DeclareParam,
+  DeclareScalarParam,
+  DeclareRetParam,
+  DeclareRet,
+  DeclareScalarRet,
+  PrintCall,
+  PrintConvergentCall,
+  PrintCallUni,
+  PrintConvergentCallUni,
+  CallArgBegin,
+  CallArg,
+  LastCallArg,
+  CallArgEnd,
+  CallVoid,
+  CallVal,
+  CallSymbol,
+  Prototype,
+  MoveParam,
+  PseudoUseParam,
+  RETURN,
+  CallSeqBegin,
+  CallSeqEnd,
+  CallPrototype,
+  FUN_SHFL_CLAMP,
+  FUN_SHFR_CLAMP,
+  MUL_WIDE_SIGNED,
+  MUL_WIDE_UNSIGNED,
+  IMAD,
+  Dummy,
+
+  LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LoadV4,
+  LDGV2, // LDG.v2
+  LDGV4, // LDG.v4
+  LDUV2, // LDU.v2
+  LDUV4, // LDU.v4
+  StoreV2,
+  StoreV4,
+  LoadParam,
+  LoadParamV2,
+  LoadParamV4,
+  StoreParam,
+  StoreParamV2,
+  StoreParamV4,
+  StoreParamS32, // to sext and store a <32bit value, not used currently
+  StoreParamU32, // to zext and store a <32bit value, not used currently 
+  StoreRetval,
+  StoreRetvalV2,
+  StoreRetvalV4,
+
+  // Texture intrinsics
+  Tex1DFloatS32,
+  Tex1DFloatFloat,
+  Tex1DFloatFloatLevel,
+  Tex1DFloatFloatGrad,
+  Tex1DS32S32,
+  Tex1DS32Float,
+  Tex1DS32FloatLevel,
+  Tex1DS32FloatGrad,
+  Tex1DU32S32,
+  Tex1DU32Float,
+  Tex1DU32FloatLevel,
+  Tex1DU32FloatGrad,
+  Tex1DArrayFloatS32,
+  Tex1DArrayFloatFloat,
+  Tex1DArrayFloatFloatLevel,
+  Tex1DArrayFloatFloatGrad,
+  Tex1DArrayS32S32,
+  Tex1DArrayS32Float,
+  Tex1DArrayS32FloatLevel,
+  Tex1DArrayS32FloatGrad,
+  Tex1DArrayU32S32,
+  Tex1DArrayU32Float,
+  Tex1DArrayU32FloatLevel,
+  Tex1DArrayU32FloatGrad,
+  Tex2DFloatS32,
+  Tex2DFloatFloat,
+  Tex2DFloatFloatLevel,
+  Tex2DFloatFloatGrad,
+  Tex2DS32S32,
+  Tex2DS32Float,
+  Tex2DS32FloatLevel,
+  Tex2DS32FloatGrad,
+  Tex2DU32S32,
+  Tex2DU32Float,
+  Tex2DU32FloatLevel,
+  Tex2DU32FloatGrad,
+  Tex2DArrayFloatS32,
+  Tex2DArrayFloatFloat,
+  Tex2DArrayFloatFloatLevel,
+  Tex2DArrayFloatFloatGrad,
+  Tex2DArrayS32S32,
+  Tex2DArrayS32Float,
+  Tex2DArrayS32FloatLevel,
+  Tex2DArrayS32FloatGrad,
+  Tex2DArrayU32S32,
+  Tex2DArrayU32Float,
+  Tex2DArrayU32FloatLevel,
+  Tex2DArrayU32FloatGrad,
+  Tex3DFloatS32,
+  Tex3DFloatFloat,
+  Tex3DFloatFloatLevel,
+  Tex3DFloatFloatGrad,
+  Tex3DS32S32,
+  Tex3DS32Float,
+  Tex3DS32FloatLevel,
+  Tex3DS32FloatGrad,
+  Tex3DU32S32,
+  Tex3DU32Float,
+  Tex3DU32FloatLevel,
+  Tex3DU32FloatGrad,
+  TexCubeFloatFloat,
+  TexCubeFloatFloatLevel,
+  TexCubeS32Float,
+  TexCubeS32FloatLevel,
+  TexCubeU32Float,
+  TexCubeU32FloatLevel,
+  TexCubeArrayFloatFloat,
+  TexCubeArrayFloatFloatLevel,
+  TexCubeArrayS32Float,
+  TexCubeArrayS32FloatLevel,
+  TexCubeArrayU32Float,
+  TexCubeArrayU32FloatLevel,
+  Tld4R2DFloatFloat,
+  Tld4G2DFloatFloat,
+  Tld4B2DFloatFloat,
+  Tld4A2DFloatFloat,
+  Tld4R2DS64Float,
+  Tld4G2DS64Float,
+  Tld4B2DS64Float,
+  Tld4A2DS64Float,
+  Tld4R2DU64Float,
+  Tld4G2DU64Float,
+  Tld4B2DU64Float,
+  Tld4A2DU64Float,
+  TexUnified1DFloatS32,
+  TexUnified1DFloatFloat,
+  TexUnified1DFloatFloatLevel,
+  TexUnified1DFloatFloatGrad,
+  TexUnified1DS32S32,
+  TexUnified1DS32Float,
+  TexUnified1DS32FloatLevel,
+  TexUnified1DS32FloatGrad,
+  TexUnified1DU32S32,
+  TexUnified1DU32Float,
+  TexUnified1DU32FloatLevel,
+  TexUnified1DU32FloatGrad,
+  TexUnified1DArrayFloatS32,
+  TexUnified1DArrayFloatFloat,
+  TexUnified1DArrayFloatFloatLevel,
+  TexUnified1DArrayFloatFloatGrad,
+  TexUnified1DArrayS32S32,
+  TexUnified1DArrayS32Float,
+  TexUnified1DArrayS32FloatLevel,
+  TexUnified1DArrayS32FloatGrad,
+  TexUnified1DArrayU32S32,
+  TexUnified1DArrayU32Float,
+  TexUnified1DArrayU32FloatLevel,
+  TexUnified1DArrayU32FloatGrad,
+  TexUnified2DFloatS32,
+  TexUnified2DFloatFloat,
+  TexUnified2DFloatFloatLevel,
+  TexUnified2DFloatFloatGrad,
+  TexUnified2DS32S32,
+  TexUnified2DS32Float,
+  TexUnified2DS32FloatLevel,
+  TexUnified2DS32FloatGrad,
+  TexUnified2DU32S32,
+  TexUnified2DU32Float,
+  TexUnified2DU32FloatLevel,
+  TexUnified2DU32FloatGrad,
+  TexUnified2DArrayFloatS32,
+  TexUnified2DArrayFloatFloat,
+  TexUnified2DArrayFloatFloatLevel,
+  TexUnified2DArrayFloatFloatGrad,
+  TexUnified2DArrayS32S32,
+  TexUnified2DArrayS32Float,
+  TexUnified2DArrayS32FloatLevel,
+  TexUnified2DArrayS32FloatGrad,
+  TexUnified2DArrayU32S32,
+  TexUnified2DArrayU32Float,
+  TexUnified2DArrayU32FloatLevel,
+  TexUnified2DArrayU32FloatGrad,
+  TexUnified3DFloatS32,
+  TexUnified3DFloatFloat,
+  TexUnified3DFloatFloatLevel,
+  TexUnified3DFloatFloatGrad,
+  TexUnified3DS32S32,
+  TexUnified3DS32Float,
+  TexUnified3DS32FloatLevel,
+  TexUnified3DS32FloatGrad,
+  TexUnified3DU32S32,
+  TexUnified3DU32Float,
+  TexUnified3DU32FloatLevel,
+  TexUnified3DU32FloatGrad,
+  TexUnifiedCubeFloatFloat,
+  TexUnifiedCubeFloatFloatLevel,
+  TexUnifiedCubeS32Float,
+  TexUnifiedCubeS32FloatLevel,
+  TexUnifiedCubeU32Float,
+  TexUnifiedCubeU32FloatLevel,
+  TexUnifiedCubeArrayFloatFloat,
+  TexUnifiedCubeArrayFloatFloatLevel,
+  TexUnifiedCubeArrayS32Float,
+  TexUnifiedCubeArrayS32FloatLevel,
+  TexUnifiedCubeArrayU32Float,
+  TexUnifiedCubeArrayU32FloatLevel,
+  Tld4UnifiedR2DFloatFloat,
+  Tld4UnifiedG2DFloatFloat,
+  Tld4UnifiedB2DFloatFloat,
+  Tld4UnifiedA2DFloatFloat,
+  Tld4UnifiedR2DS64Float,
+  Tld4UnifiedG2DS64Float,
+  Tld4UnifiedB2DS64Float,
+  Tld4UnifiedA2DS64Float,
+  Tld4UnifiedR2DU64Float,
+  Tld4UnifiedG2DU64Float,
+  Tld4UnifiedB2DU64Float,
+  Tld4UnifiedA2DU64Float,
+
+  // Surface intrinsics
+  Suld1DI8Clamp,
+  Suld1DI16Clamp,
+  Suld1DI32Clamp,
+  Suld1DI64Clamp,
+  Suld1DV2I8Clamp,
+  Suld1DV2I16Clamp,
+  Suld1DV2I32Clamp,
+  Suld1DV2I64Clamp,
+  Suld1DV4I8Clamp,
+  Suld1DV4I16Clamp,
+  Suld1DV4I32Clamp,
+
+  Suld1DArrayI8Clamp,
+  Suld1DArrayI16Clamp,
+  Suld1DArrayI32Clamp,
+  Suld1DArrayI64Clamp,
+  Suld1DArrayV2I8Clamp,
+  Suld1DArrayV2I16Clamp,
+  Suld1DArrayV2I32Clamp,
+  Suld1DArrayV2I64Clamp,
+  Suld1DArrayV4I8Clamp,
+  Suld1DArrayV4I16Clamp,
+  Suld1DArrayV4I32Clamp,
+
+  Suld2DI8Clamp,
+  Suld2DI16Clamp,
+  Suld2DI32Clamp,
+  Suld2DI64Clamp,
+  Suld2DV2I8Clamp,
+  Suld2DV2I16Clamp,
+  Suld2DV2I32Clamp,
+  Suld2DV2I64Clamp,
+  Suld2DV4I8Clamp,
+  Suld2DV4I16Clamp,
+  Suld2DV4I32Clamp,
+
+  Suld2DArrayI8Clamp,
+  Suld2DArrayI16Clamp,
+  Suld2DArrayI32Clamp,
+  Suld2DArrayI64Clamp,
+  Suld2DArrayV2I8Clamp,
+  Suld2DArrayV2I16Clamp,
+  Suld2DArrayV2I32Clamp,
+  Suld2DArrayV2I64Clamp,
+  Suld2DArrayV4I8Clamp,
+  Suld2DArrayV4I16Clamp,
+  Suld2DArrayV4I32Clamp,
+
+  Suld3DI8Clamp,
+  Suld3DI16Clamp,
+  Suld3DI32Clamp,
+  Suld3DI64Clamp,
+  Suld3DV2I8Clamp,
+  Suld3DV2I16Clamp,
+  Suld3DV2I32Clamp,
+  Suld3DV2I64Clamp,
+  Suld3DV4I8Clamp,
+  Suld3DV4I16Clamp,
+  Suld3DV4I32Clamp,
+
+  Suld1DI8Trap,
+  Suld1DI16Trap,
+  Suld1DI32Trap,
+  Suld1DI64Trap,
+  Suld1DV2I8Trap,
+  Suld1DV2I16Trap,
+  Suld1DV2I32Trap,
+  Suld1DV2I64Trap,
+  Suld1DV4I8Trap,
+  Suld1DV4I16Trap,
+  Suld1DV4I32Trap,
+
+  Suld1DArrayI8Trap,
+  Suld1DArrayI16Trap,
+  Suld1DArrayI32Trap,
+  Suld1DArrayI64Trap,
+  Suld1DArrayV2I8Trap,
+  Suld1DArrayV2I16Trap,
+  Suld1DArrayV2I32Trap,
+  Suld1DArrayV2I64Trap,
+  Suld1DArrayV4I8Trap,
+  Suld1DArrayV4I16Trap,
+  Suld1DArrayV4I32Trap,
+
+  Suld2DI8Trap,
+  Suld2DI16Trap,
+  Suld2DI32Trap,
+  Suld2DI64Trap,
+  Suld2DV2I8Trap,
+  Suld2DV2I16Trap,
+  Suld2DV2I32Trap,
+  Suld2DV2I64Trap,
+  Suld2DV4I8Trap,
+  Suld2DV4I16Trap,
+  Suld2DV4I32Trap,
+
+  Suld2DArrayI8Trap,
+  Suld2DArrayI16Trap,
+  Suld2DArrayI32Trap,
+  Suld2DArrayI64Trap,
+  Suld2DArrayV2I8Trap,
+  Suld2DArrayV2I16Trap,
+  Suld2DArrayV2I32Trap,
+  Suld2DArrayV2I64Trap,
+  Suld2DArrayV4I8Trap,
+  Suld2DArrayV4I16Trap,
+  Suld2DArrayV4I32Trap,
+
+  Suld3DI8Trap,
+  Suld3DI16Trap,
+  Suld3DI32Trap,
+  Suld3DI64Trap,
+  Suld3DV2I8Trap,
+  Suld3DV2I16Trap,
+  Suld3DV2I32Trap,
+  Suld3DV2I64Trap,
+  Suld3DV4I8Trap,
+  Suld3DV4I16Trap,
+  Suld3DV4I32Trap,
+
+  Suld1DI8Zero,
+  Suld1DI16Zero,
+  Suld1DI32Zero,
+  Suld1DI64Zero,
+  Suld1DV2I8Zero,
+  Suld1DV2I16Zero,
+  Suld1DV2I32Zero,
+  Suld1DV2I64Zero,
+  Suld1DV4I8Zero,
+  Suld1DV4I16Zero,
+  Suld1DV4I32Zero,
+
+  Suld1DArrayI8Zero,
+  Suld1DArrayI16Zero,
+  Suld1DArrayI32Zero,
+  Suld1DArrayI64Zero,
+  Suld1DArrayV2I8Zero,
+  Suld1DArrayV2I16Zero,
+  Suld1DArrayV2I32Zero,
+  Suld1DArrayV2I64Zero,
+  Suld1DArrayV4I8Zero,
+  Suld1DArrayV4I16Zero,
+  Suld1DArrayV4I32Zero,
+
+  Suld2DI8Zero,
+  Suld2DI16Zero,
+  Suld2DI32Zero,
+  Suld2DI64Zero,
+  Suld2DV2I8Zero,
+  Suld2DV2I16Zero,
+  Suld2DV2I32Zero,
+  Suld2DV2I64Zero,
+  Suld2DV4I8Zero,
+  Suld2DV4I16Zero,
+  Suld2DV4I32Zero,
+
+  Suld2DArrayI8Zero,
+  Suld2DArrayI16Zero,
+  Suld2DArrayI32Zero,
+  Suld2DArrayI64Zero,
+  Suld2DArrayV2I8Zero,
+  Suld2DArrayV2I16Zero,
+  Suld2DArrayV2I32Zero,
+  Suld2DArrayV2I64Zero,
+  Suld2DArrayV4I8Zero,
+  Suld2DArrayV4I16Zero,
+  Suld2DArrayV4I32Zero,
+
+  Suld3DI8Zero,
+  Suld3DI16Zero,
+  Suld3DI32Zero,
+  Suld3DI64Zero,
+  Suld3DV2I8Zero,
+  Suld3DV2I16Zero,
+  Suld3DV2I32Zero,
+  Suld3DV2I64Zero,
+  Suld3DV4I8Zero,
+  Suld3DV4I16Zero,
+  Suld3DV4I32Zero
+};
+}
+
+class NVPTXSubtarget;
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+  explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+                               const NVPTXSubtarget &STI);
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const override;
+
+  /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type
+  /// Used to guide target specific optimizations, like loop strength
+  /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+  /// address mode (CodeGenPrepare.cpp)
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+  bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
+    // Truncating 64-bit to 32-bit is free in SASS.
+    if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+      return false;
+    return SrcTy->getPrimitiveSizeInBits() == 64 &&
+           DstTy->getPrimitiveSizeInBits() == 32;
+  }
+
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                         EVT VT) const override {
+    if (VT.isVector())
+      return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
+    return MVT::i1;
+  }
+
+  ConstraintType getConstraintType(StringRef Constraint) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
+                           const SmallVectorImpl<ISD::OutputArg> &,
+                           unsigned retAlignment,
+                           const ImmutableCallSite *CS) const;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                      SelectionDAG &DAG) const override;
+
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  const NVPTXTargetMachine *nvTM;
+
+  // PTX always uses 32-bit shift amounts
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+    return MVT::i32;
+  }
+
+  TargetLoweringBase::LegalizeTypeAction
+  getPreferredVectorAction(EVT VT) const override;
+
+  bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+  bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
+
+  bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+
+private:
+  const NVPTXSubtarget &STI; // cache the subtarget here
+  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+                                Type *Ty, unsigned Idx,
+                                const DataLayout &DL) const;
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
new file mode 100644
index 000000000000..8d00bbb5e9c2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,181 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass {
+private:
+  static char ID;
+  SmallVector<Instruction*, 4> InstrToDelete;
+
+public:
+  NVPTXImageOptimizer();
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool replaceIsTypePSampler(Instruction &I);
+  bool replaceIsTypePSurface(Instruction &I);
+  bool replaceIsTypePTexture(Instruction &I);
+  Value *cleanupValue(Value *V);
+  void replaceWith(Instruction *From, ConstantInt *To);
+};
+}
+
+char NVPTXImageOptimizer::ID = 0;
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+  : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  bool Changed = false;
+  InstrToDelete.clear();
+
+  // Look for call instructions in the function
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+       ++BI) {
+    for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      Instruction &Instr = *I;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        Function *CalledF = CI->getCalledFunction();
+        if (CalledF && CalledF->isIntrinsic()) {
+          // This is an intrinsic function call, check if its an istypep
+          switch (CalledF->getIntrinsicID()) {
+          default: break;
+          case Intrinsic::nvvm_istypep_sampler:
+            Changed |= replaceIsTypePSampler(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_surface:
+            Changed |= replaceIsTypePSurface(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_texture:
+            Changed |= replaceIsTypePTexture(Instr);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Delete any istypep instances we replaced in the IR
+  for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+    InstrToDelete[i]->eraseFromParent();
+
+  return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isSampler(*TexHandle)) {
+    // This is an OpenCL sampler, so it must be a samplerref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL image, so it cannot be a samplerref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadWrite(*TexHandle) ||
+      isImageWriteOnly(*TexHandle)) {
+    // This is an OpenCL read-only/read-write image, so it must be a surfref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageReadOnly(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-only/ imageor sampler, so it cannot be
+    // a surfref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL read-only image, so it must be a texref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image or a sampler, so it
+    // cannot be a texref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+  // We implement "poor man's DCE" here to make sure any code that is no longer
+  // live is actually unreachable and can be trivially eliminated by the
+  // unreachable block elimination pass.
+  for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE; ++UI) {
+    if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+      if (BI->isUnconditional()) continue;
+      BasicBlock *Dest;
+      if (To->isZero())
+        // Get false block
+        Dest = BI->getSuccessor(1);
+      else
+        // Get true block
+        Dest = BI->getSuccessor(0);
+      BranchInst::Create(Dest, BI);
+      InstrToDelete.push_back(BI);
+    }
+  }
+  From->replaceAllUsesWith(To);
+  InstrToDelete.push_back(From);
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) {
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+    return cleanupValue(EVI->getAggregateOperand());
+  }
+  return V;
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() {
+  return new NVPTXImageOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
new file mode 100644
index 000000000000..f4940c937a2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
@@ -0,0 +1,583 @@
+//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (so-called the generic address space) for other instructions to use.
+//
+// For example, the Clang translates the following CUDA code
+//   __shared__ float a[10];
+//   float v = a[i];
+// to
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+//   %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+//   %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to its users. For example, it
+// optimizes the above IR to
+//   %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+//   CUDA: Compiling and optimizing for a GPU platform
+//   Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+//   ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+//     %generic.input = addrspacecast float addrspace(3)* %input to float*
+//   loop:
+//     %y = phi [ %generic.input, %y2 ]
+//     %y2 = getelementptr %y, 1
+//     %v = load %y2
+//     br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+//   uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+//   %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+//   %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+//   %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-infer-addrspace"
+
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+using namespace llvm;
+
+namespace {
+const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief NVPTXInferAddressSpaces
+class NVPTXInferAddressSpaces: public FunctionPass {
+public:
+  static char ID;
+
+  NVPTXInferAddressSpaces() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Returns the new address space of V if updated; otherwise, returns None.
+  Optional<unsigned>
+  updateAddressSpace(const Value &V,
+                     const ValueToAddrSpaceMapTy &InferredAddrSpace);
+
+  // Tries to infer the specific address space of each address expression in
+  // Postorder.
+  void inferAddressSpaces(const std::vector<Value *> &Postorder,
+                          ValueToAddrSpaceMapTy *InferredAddrSpace);
+
+  // Changes the generic address expressions in function F to point to specific
+  // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+  // all generic address expressions in the use-def graph of function F.
+  bool
+  rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
+                              const ValueToAddrSpaceMapTy &InferredAddrSpace,
+                              Function *F);
+};
+} // end anonymous namespace
+
+char NVPTXInferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
+                "Infer address spaces",
+                false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+  if (!isa<Operator>(V))
+    return false;
+
+  switch (cast<Operator>(V).getOpcode()) {
+  case Instruction::PHI:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+  case Instruction::GetElementPtr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+  assert(isAddressExpression(V));
+  const Operator& Op = cast<Operator>(V);
+  switch (Op.getOpcode()) {
+  case Instruction::PHI: {
+    auto IncomingValues = cast<PHINode>(Op).incoming_values();
+    return SmallVector<Value *, 2>(IncomingValues.begin(),
+                                   IncomingValues.end());
+  }
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+  case Instruction::GetElementPtr:
+    return {Op.getOperand(0)};
+  default:
+    llvm_unreachable("Unexpected instruction type.");
+  }
+}
+
+// If V is an unvisited generic address expression, appends V to PostorderStack
+// and marks it as visited.
+static void appendsGenericAddressExpressionToPostorderStack(
+    Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+    DenseSet<Value *> *Visited) {
+  assert(V->getType()->isPointerTy());
+  if (isAddressExpression(*V) &&
+      V->getType()->getPointerAddressSpace() ==
+          AddressSpace::ADDRESS_SPACE_GENERIC) {
+    if (Visited->insert(V).second)
+      PostorderStack->push_back(std::make_pair(V, false));
+  }
+}
+
+// Returns all generic address expressions in function F. The elements are
+// ordered in postorder.
+static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
+  // This function implements a non-recursive postorder traversal of a partial
+  // use-def graph of function F.
+  std::vector<std::pair<Value*, bool>> PostorderStack;
+  // The set of visited expressions.
+  DenseSet<Value*> Visited;
+  // We only explore address expressions that are reachable from loads and
+  // stores for now because we aim at generating faster loads and stores.
+  for (Instruction &I : instructions(F)) {
+    if (isa<LoadInst>(I)) {
+      appendsGenericAddressExpressionToPostorderStack(
+          I.getOperand(0), &PostorderStack, &Visited);
+    } else if (isa<StoreInst>(I)) {
+      appendsGenericAddressExpressionToPostorderStack(
+          I.getOperand(1), &PostorderStack, &Visited);
+    }
+  }
+
+  std::vector<Value *> Postorder; // The resultant postorder.
+  while (!PostorderStack.empty()) {
+    // If the operands of the expression on the top are already explored,
+    // adds that expression to the resultant postorder.
+    if (PostorderStack.back().second) {
+      Postorder.push_back(PostorderStack.back().first);
+      PostorderStack.pop_back();
+      continue;
+    }
+    // Otherwise, adds its operands to the stack and explores them.
+    PostorderStack.back().second = true;
+    for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
+      appendsGenericAddressExpressionToPostorderStack(
+          PtrOperand, &PostorderStack, &Visited);
+    }
+  }
+  return Postorder;
+}
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+    const Use &OperandUse, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
+  Value *Operand = OperandUse.get();
+  if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+    return NewOperand;
+
+  UndefUsesToFix->push_back(&OperandUse);
+  return UndefValue::get(
+      Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+    Instruction *I, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
+  Type *NewPtrType =
+      I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+  if (I->getOpcode() == Instruction::AddrSpaceCast) {
+    Value *Src = I->getOperand(0);
+    // Because `I` is generic, the source address space must be specific.
+    // Therefore, the inferred address space must be the source space, according
+    // to our algorithm.
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    if (Src->getType() != NewPtrType)
+      return new BitCastInst(Src, NewPtrType);
+    return Src;
+  }
+
+  // Computes the converted pointer operands.
+  SmallVector<Value *, 4> NewPointerOperands;
+  for (const Use &OperandUse : I->operands()) {
+    if (!OperandUse.get()->getType()->isPointerTy())
+      NewPointerOperands.push_back(nullptr);
+    else
+      NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+          OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::BitCast:
+    return new BitCastInst(NewPointerOperands[0], NewPtrType);
+  case Instruction::PHI: {
+    assert(I->getType()->isPointerTy());
+    PHINode *PHI = cast<PHINode>(I);
+    PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+    for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+      unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+      NewPHI->addIncoming(NewPointerOperands[OperandNo],
+                          PHI->getIncomingBlock(Index));
+    }
+    return NewPHI;
+  }
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        GEP->getSourceElementType(), NewPointerOperands[0],
+        SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+    NewGEP->setIsInBounds(GEP->isInBounds());
+    return NewGEP;
+  }
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+    ConstantExpr *CE, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace) {
+  Type *TargetType =
+      CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+    // Because CE is generic, the source address space must be specific.
+    // Therefore, the inferred address space must be the source space according
+    // to our algorithm.
+    assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+           NewAddrSpace);
+    return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+  }
+
+  // Computes the operands of the new constant expression.
+  SmallVector<Constant *, 4> NewOperands;
+  for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+    Constant *Operand = CE->getOperand(Index);
+    // If the address space of `Operand` needs to be modified, the new operand
+    // with the new address space should already be in ValueWithNewAddrSpace
+    // because (1) the constant expressions we consider (i.e. addrspacecast,
+    // bitcast, and getelementptr) do not incur cycles in the data flow graph
+    // and (2) this function is called on constant expressions in postorder.
+    if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+      NewOperands.push_back(cast<Constant>(NewOperand));
+    } else {
+      // Otherwise, reuses the old operand.
+      NewOperands.push_back(Operand);
+    }
+  }
+
+  if (CE->getOpcode() == Instruction::GetElementPtr) {
+    // Needs to specify the source type while constructing a getelementptr
+    // constant expression.
+    return CE->getWithOperands(
+        NewOperands, TargetType, /*OnlyIfReduced=*/false,
+        NewOperands[0]->getType()->getPointerElementType());
+  }
+
+  return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every generic address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+static Value *
+cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
+                              const ValueToValueMapTy &ValueWithNewAddrSpace,
+                              SmallVectorImpl<const Use *> *UndefUsesToFix) {
+  // All values in Postorder are generic address expressions.
+  assert(isAddressExpression(*V) &&
+         V->getType()->getPointerAddressSpace() ==
+             AddressSpace::ADDRESS_SPACE_GENERIC);
+
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    Value *NewV = cloneInstructionWithNewAddressSpace(
+        I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+    if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+      if (NewI->getParent() == nullptr) {
+        NewI->insertBefore(I);
+        NewI->takeName(I);
+      }
+    }
+    return NewV;
+  }
+
+  return cloneConstantExprWithNewAddressSpace(
+      cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
+  if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
+      AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
+    return AddressSpace::ADDRESS_SPACE_GENERIC;
+
+  if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
+    return AS2;
+  if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
+    return AS1;
+
+  // The join of two different specific address spaces is generic.
+  return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
+}
+
+bool NVPTXInferAddressSpaces::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  // Collects all generic address expressions in postorder.
+  std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
+
+  // Runs a data-flow analysis to refine the address spaces of every expression
+  // in Postorder.
+  ValueToAddrSpaceMapTy InferredAddrSpace;
+  inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+  // Changes the address spaces of the generic address expressions who are
+  // inferred to point to a specific address space.
+  return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+void NVPTXInferAddressSpaces::inferAddressSpaces(
+    const std::vector<Value *> &Postorder,
+    ValueToAddrSpaceMapTy *InferredAddrSpace) {
+  SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+  // Initially, all expressions are in the uninitialized address space.
+  for (Value *V : Postorder)
+    (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
+
+  while (!Worklist.empty()) {
+    Value* V = Worklist.pop_back_val();
+
+    // Tries to update the address space of the stack top according to the
+    // address spaces of its operands.
+    DEBUG(dbgs() << "Updating the address space of\n"
+                 << "  " << *V << "\n");
+    Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+    if (!NewAS.hasValue())
+      continue;
+    // If any updates are made, grabs its users to the worklist because
+    // their address spaces can also be possibly updated.
+    DEBUG(dbgs() << "  to " << NewAS.getValue() << "\n");
+    (*InferredAddrSpace)[V] = NewAS.getValue();
+
+    for (Value *User : V->users()) {
+      // Skip if User is already in the worklist.
+      if (Worklist.count(User))
+        continue;
+
+      auto Pos = InferredAddrSpace->find(User);
+      // Our algorithm only updates the address spaces of generic address
+      // expressions, which are those in InferredAddrSpace.
+      if (Pos == InferredAddrSpace->end())
+        continue;
+
+      // Function updateAddressSpace moves the address space down a lattice
+      // path. Therefore, nothing to do if User is already inferred as
+      // generic (the bottom element in the lattice).
+      if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
+        continue;
+
+      Worklist.insert(User);
+    }
+  }
+}
+
+Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
+    const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
+  assert(InferredAddrSpace.count(&V));
+
+  // The new inferred address space equals the join of the address spaces
+  // of all its pointer operands.
+  unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
+  for (Value *PtrOperand : getPointerOperands(V)) {
+    unsigned OperandAS;
+    if (InferredAddrSpace.count(PtrOperand))
+      OperandAS = InferredAddrSpace.lookup(PtrOperand);
+    else
+      OperandAS = PtrOperand->getType()->getPointerAddressSpace();
+    NewAS = joinAddressSpaces(NewAS, OperandAS);
+    // join(generic, *) = generic. So we can break if NewAS is already generic.
+    if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
+      break;
+  }
+
+  unsigned OldAS = InferredAddrSpace.lookup(&V);
+  assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC);
+  if (OldAS == NewAS)
+    return None;
+  return NewAS;
+}
+
+bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
+    const std::vector<Value *> &Postorder,
+    const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
+  // For each address expression to be modified, creates a clone of it with its
+  // pointer operands converted to the new address space. Since the pointer
+  // operands are converted, the clone is naturally in the new address space by
+  // construction.
+  ValueToValueMapTy ValueWithNewAddrSpace;
+  SmallVector<const Use *, 32> UndefUsesToFix;
+  for (Value* V : Postorder) {
+    unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+    if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+      ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+          V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+    }
+  }
+
+  if (ValueWithNewAddrSpace.empty())
+    return false;
+
+  // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+  for (const Use* UndefUse : UndefUsesToFix) {
+    User *V = UndefUse->getUser();
+    User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+    unsigned OperandNo = UndefUse->getOperandNo();
+    assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+    NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+  }
+
+  // Replaces the uses of the old address expressions with the new ones.
+  for (Value *V : Postorder) {
+    Value *NewV = ValueWithNewAddrSpace.lookup(V);
+    if (NewV == nullptr)
+      continue;
+
+    SmallVector<Use *, 4> Uses;
+    for (Use &U : V->uses())
+      Uses.push_back(&U);
+    DEBUG(dbgs() << "Replacing the uses of " << *V << "\n  to\n  " << *NewV
+                 << "\n");
+    for (Use *U : Uses) {
+      if (isa<LoadInst>(U->getUser()) ||
+          (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
+        // If V is used as the pointer operand of a load/store, sets the pointer
+        // operand to NewV. This replacement does not change the element type,
+        // so the resultant load/store is still valid.
+        U->set(NewV);
+      } else if (isa<Instruction>(U->getUser())) {
+        // Otherwise, replaces the use with generic(NewV).
+        // TODO: Some optimization opportunities are missed. For example, in
+        //   %0 = icmp eq float* %p, %q
+        // if both p and q are inferred to be shared, we can rewrite %0 as
+        //   %0 = icmp eq float addrspace(3)* %new_p, %new_q
+        // instead of currently
+        //   %generic_p = addrspacecast float addrspace(3)* %new_p to float*
+        //   %generic_q = addrspacecast float addrspace(3)* %new_q to float*
+        //   %0 = icmp eq float* %generic_p, %generic_q
+        if (Instruction *I = dyn_cast<Instruction>(V)) {
+          BasicBlock::iterator InsertPos = std::next(I->getIterator());
+          while (isa<PHINode>(InsertPos))
+            ++InsertPos;
+          U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+        } else {
+          U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                                V->getType()));
+        }
+      }
+    }
+    if (V->use_empty())
+      RecursivelyDeleteTriviallyDeadInstructions(V);
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createNVPTXInferAddressSpacesPass() {
+  return new NVPTXInferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 000000000000..ffcb5d5273a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,59 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe NVPTX instructions format
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum
+class VecInstTypeEnum<bits<4> val> {
+  bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>;
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<14> Inst;
+
+  let Namespace = "NVPTX";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+
+  // TSFlagFields
+  bits<4> VecInstType = VecNOP.Value;
+  bit IsSimpleMove = 0;
+  bit IsLoad = 0;
+  bit IsStore = 0;
+
+  bit IsTex = 0;
+  bit IsSust = 0;
+  bit IsSurfTexQuery = 0;
+  bit IsTexModeUnified = 0;
+
+  // The following field is encoded as log2 of the vector size minus one,
+  // with 0 meaning the operation is not a surface instruction.  For example,
+  // if IsSuld == 2, then the instruction is a suld instruction with vector size
+  // 2**(2-1) = 2.
+  bits<2> IsSuld = 0;
+
+  let TSFlags{3-0}   = VecInstType;
+  let TSFlags{4-4}   = IsSimpleMove;
+  let TSFlags{5-5}   = IsLoad;
+  let TSFlags{6-6}   = IsStore;
+  let TSFlags{7}     = IsTex;
+  let TSFlags{9-8}   = IsSuld;
+  let TSFlags{10}    = IsSust;
+  let TSFlags{11}    = IsSurfTexQuery;
+  let TSFlags{12}    = IsTexModeUnified;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 000000000000..7f89742a3215
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,248 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
+NVPTXInstrInfo::NVPTXInstrInfo() : NVPTXGenInstrInfo(), RegInfo() {}
+
+void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 const DebugLoc &DL, unsigned DestReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+  if (DestRC->getSize() != SrcRC->getSize())
+    report_fatal_error("Copy one register into another with a different width");
+
+  unsigned Op;
+  if (DestRC == &NVPTX::Int1RegsRegClass) {
+    Op = NVPTX::IMOV1rr;
+  } else if (DestRC == &NVPTX::Int16RegsRegClass) {
+    Op = NVPTX::IMOV16rr;
+  } else if (DestRC == &NVPTX::Int32RegsRegClass) {
+    Op = (SrcRC == &NVPTX::Int32RegsRegClass ? NVPTX::IMOV32rr
+                                             : NVPTX::BITCONVERT_32_F2I);
+  } else if (DestRC == &NVPTX::Int64RegsRegClass) {
+    Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
+                                             : NVPTX::BITCONVERT_64_F2I);
+  } else if (DestRC == &NVPTX::Float32RegsRegClass) {
+    Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
+                                               : NVPTX::BITCONVERT_32_I2F);
+  } else if (DestRC == &NVPTX::Float64RegsRegClass) {
+    Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr
+                                               : NVPTX::BITCONVERT_64_I2F);
+  } else {
+    llvm_unreachable("Bad register copy");
+  }
+  BuildMI(MBB, I, DL, get(Op), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                                 unsigned &DestReg) const {
+  // Look for the appropriate part of TSFlags
+  bool isMove = false;
+
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
+  isMove = (TSFlags == 1);
+
+  if (isMove) {
+    MachineOperand dest = MI.getOperand(0);
+    MachineOperand src = MI.getOperand(1);
+    assert(dest.isReg() && "dest of a movrr is not a reg");
+    assert(src.isReg() && "src of a movrr is not a reg");
+
+    SrcReg = src.getReg();
+    DestReg = dest.getReg();
+    return true;
+  }
+
+  return false;
+}
+
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+                                 unsigned &AddrSpace) const {
+  bool isLoad = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
+  isLoad = (TSFlags == 1);
+  if (isLoad)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isLoad;
+}
+
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+                                  unsigned &AddrSpace) const {
+  bool isStore = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
+  isStore = (TSFlags == 1);
+  if (isStore)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isStore;
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that removeBranch and insertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr &LastInst = *I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (LastInst.getOpcode() == NVPTX::GOTO) {
+      TBB = LastInst.getOperand(0).getMBB();
+      return false;
+    } else if (LastInst.getOpcode() == NVPTX::CBranch) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst.getOperand(1).getMBB();
+      Cond.push_back(LastInst.getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr &SecondLastInst = *I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+
+  // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it.
+  if (SecondLastInst.getOpcode() == NVPTX::CBranch &&
+      LastInst.getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst.getOperand(1).getMBB();
+    Cond.push_back(SecondLastInst.getOperand(0));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two NVPTX:GOTOs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst.getOpcode() == NVPTX::GOTO &&
+      LastInst.getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst.getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned NVPTXInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return 0;
+  --I;
+  if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin())
+    return 1;
+  --I;
+  if (I->getOpcode() != NVPTX::CBranch)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned NVPTXInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                                      ArrayRef<MachineOperand> Cond,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "NVPTX branch conditions have two components!");
+
+  // One-way branch.
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch
+      BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+    else // Conditional branch
+      BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg())
+          .addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+  return 2;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 000000000000..d284282e28c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,79 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the niversity of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo {
+  const NVPTXRegisterInfo RegInfo;
+  virtual void anchor();
+public:
+  explicit NVPTXInstrInfo();
+
+  const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+  /* The following virtual functions are used in register allocation.
+   * They are not implemented because the existing interface and the logic
+   * at the caller side do not work for the elementized vector load and store.
+   *
+   * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+   *                                  int &FrameIndex) const;
+   * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+   *                                 int &FrameIndex) const;
+   * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+   *                              MachineBasicBlock::iterator MBBI,
+   *                             unsigned SrcReg, bool isKill, int FrameIndex,
+   *                              const TargetRegisterClass *RC) const;
+   * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+   *                               MachineBasicBlock::iterator MBBI,
+   *                               unsigned DestReg, int FrameIndex,
+   *                               const TargetRegisterClass *RC) const;
+   */
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                           unsigned &DestReg) const;
+  bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+  bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+
+  // Branch analysis.
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+    return MI.getOperand(2).getImm();
+  }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 000000000000..92a88c7f2506
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2807 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+let hasSideEffects = 0 in {
+  def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
+
+// List of vector specific properties
+def isVecLD      : VecInstTypeEnum<1>;
+def isVecST      : VecInstTypeEnum<2>;
+def isVecBuild   : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert  : VecInstTypeEnum<6>;
+def isVecDest    : VecInstTypeEnum<7>;
+def isVecOther   : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget    : Operand<OtherVT>;
+
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI  : PatLeaf<(i32 0x1)>;
+def CvtRZI  : PatLeaf<(i32 0x2)>;
+def CvtRMI  : PatLeaf<(i32 0x3)>;
+def CvtRPI  : PatLeaf<(i32 0x4)>;
+def CvtRN   : PatLeaf<(i32 0x5)>;
+def CvtRZ   : PatLeaf<(i32 0x6)>;
+def CvtRM   : PatLeaf<(i32 0x7)>;
+def CvtRP   : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
+
+def CvtSAT      : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+  let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ   : PatLeaf<(i32 0)>;
+def CmpNE   : PatLeaf<(i32 1)>;
+def CmpLT   : PatLeaf<(i32 2)>;
+def CmpLE   : PatLeaf<(i32 3)>;
+def CmpGT   : PatLeaf<(i32 4)>;
+def CmpGE   : PatLeaf<(i32 5)>;
+def CmpEQU  : PatLeaf<(i32 10)>;
+def CmpNEU  : PatLeaf<(i32 11)>;
+def CmpLTU  : PatLeaf<(i32 12)>;
+def CmpLEU  : PatLeaf<(i32 13)>;
+def CmpGTU  : PatLeaf<(i32 14)>;
+def CmpGEU  : PatLeaf<(i32 15)>;
+def CmpNUM  : PatLeaf<(i32 16)>;
+def CmpNAN  : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
+def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+  let PrintMethod = "printCmpMode";
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasVote : Predicate<"Subtarget->hasVote()">;
+def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
+def hasLDG : Predicate<"Subtarget->hasLDG()">;
+def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide      : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Template for instructions which take three int64, int32, or int16 args.
+// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+  def i64ri :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def i32ri :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+  def i16ri :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+// Template for instructions which take 3 int32 args.  The instructions are
+// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+   def i32rr :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+   def i32ri :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args.  The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that cannot be folded into FMAs.
+// For nodes that can be folded into FMAs (i.e. adds and muls), use
+// F3_fma_component.
+multiclass F3<string OpcStr, SDNode OpNode> {
+   def f64rr :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, Float64Regs:$b),
+               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+   def f64ri :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, f64imm:$b),
+               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+   def f32rr_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[doF32FTZ]>;
+   def f32ri_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+               Requires<[doF32FTZ]>;
+   def f32rr :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+   def f32ri :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args.  The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that can be folded to make fma ops.
+// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+// just like the non ".rn" op, but prevents ptxas from creating FMAs.
+multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+   def f64rr :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, Float64Regs:$b),
+               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+               Requires<[allowFMA]>;
+   def f64ri :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, f64imm:$b),
+               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+               Requires<[allowFMA]>;
+   def f32rr_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[allowFMA, doF32FTZ]>;
+   def f32ri_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+               Requires<[allowFMA, doF32FTZ]>;
+   def f32rr :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[allowFMA]>;
+   def f32ri :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+               Requires<[allowFMA]>;
+
+   // These have strange names so we don't perturb existing mir tests.
+   def _rnf64rr :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, Float64Regs:$b),
+               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+               Requires<[noFMA]>;
+   def _rnf64ri :
+     NVPTXInst<(outs Float64Regs:$dst),
+               (ins Float64Regs:$a, f64imm:$b),
+               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+               [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+               Requires<[noFMA]>;
+   def _rnf32rr_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[noFMA, doF32FTZ]>;
+   def _rnf32ri_ftz :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+               Requires<[noFMA, doF32FTZ]>;
+   def _rnf32rr :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, Float32Regs:$b),
+               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+               Requires<[noFMA]>;
+   def _rnf32ri :
+     NVPTXInst<(outs Float32Regs:$dst),
+               (ins Float32Regs:$a, f32imm:$b),
+               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+               [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+               Requires<[noFMA]>;
+}
+
+// Template for operations which take two f32 or f64 operands.  Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero).
+multiclass F2<string OpcStr, SDNode OpNode> {
+   def f64 :     NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+                           !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+                           [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+   def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                           !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+                           [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+                           Requires<[doF32FTZ]>;
+   def f32 :     NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                           !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+                           [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Type Conversion
+//-----------------------------------
+
+let hasSideEffects = 0 in {
+  // Generate a cvt to the given type from all possible types.  Each instance
+  // takes a CvtMode immediate that defines the conversion mode to use.  It can
+  // be CvtNONE to omit a conversion mode.
+  multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+    def _s8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".s8\t$dst, $src;"), []>;
+    def _u8 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".u8\t$dst, $src;"), []>;
+    def _s16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".s16\t$dst, $src;"), []>;
+    def _u16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".u16\t$dst, $src;"), []>;
+    def _f16 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int16Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".f16\t$dst, $src;"), []>;
+    def _s32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".s32\t$dst, $src;"), []>;
+    def _u32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".u32\t$dst, $src;"), []>;
+    def _s64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".s64\t$dst, $src;"), []>;
+    def _u64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Int64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".u64\t$dst, $src;"), []>;
+    def _f32 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Float32Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".f32\t$dst, $src;"), []>;
+    def _f64 :
+      NVPTXInst<(outs RC:$dst),
+                (ins Float64Regs:$src, CvtMode:$mode),
+                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                FromName, ".f64\t$dst, $src;"), []>;
+  }
+
+  // Generate cvts from all types to all types.
+  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
+  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
+  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+  // These cvts are different from those above: The source and dest registers
+  // are of the same type.
+  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                                    "cvt.s16.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                    "cvt.s32.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                                    "cvt.s32.s16 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                    "cvt.s64.s8 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                    "cvt.s64.s16 \t$dst, $src;", []>;
+  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                                    "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+// Template for xor masquerading as int1 arithmetic.
+multiclass ADD_SUB_i1<SDNode OpNode> {
+   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+                      "xor.pred \t$dst, $a, $b;",
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+                      "xor.pred \t$dst, $a, $b;",
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+// int1 addition and subtraction are both just xor.
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+// int16, int32, and int64 signed addition.  Since nvptx is 2's compliment, we
+// also use these for unsigned arithmetic.
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+// int32 addition and subtraction with carry-in and carry-out.
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
+// will lower it.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+
+//
+// Wide multiplication
+//
+def MULWIDES64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+            "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+            "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+            "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+            "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+          (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+          (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+      Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
+def SInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(32);
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(32);
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isSignedIntN(16);
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  return v.isIntN(16);
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(32);
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+  // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
+  const APInt &v = N->getAPIntValue();
+  return v.sge(0) && v.slt(16);
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(32, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(16, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
+}]>;
+
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+      Requires<[doMulWide]>;
+
+// Convert "sign/zero-extend then multiply" to mul.wide.
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+          (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+          (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+          (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+      Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+      Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+          (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+      Requires<[doMulWide]>;
+
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+                       SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+  NVPTXInst<(outs Int16Regs:$dst),
+            (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+            "mad.lo.s16 \t$dst, $a, $b, $c;",
+            [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+            "mad.lo.s32 \t$dst, $a, $b, $c;",
+            [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+            "mad.lo.s64 \t$dst, $a, $b, $c;",
+            [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
+
+def INEG16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+            "neg.s16 \t$dst, $src;",
+            [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+            "neg.s32 \t$dst, $src;",
+            [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+            "neg.s64 \t$dst, $src;",
+            [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
+         N->getValueAPF().convertToFloat() == 1.0f;
+}]>;
+// Constant 1.0 (double)
+def DoubleConst1 : PatLeaf<(fpimm), [{
+  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
+         N->getValueAPF().convertToDouble() == 1.0;
+}]>;
+
+defm FADD : F3_fma_component<"add", fadd>;
+defm FSUB : F3_fma_component<"sub", fsub>;
+defm FMUL : F3_fma_component<"mul", fmul>;
+
+defm FMIN : F3<"min", fminnum>;
+defm FMAX : F3<"max", fmaxnum>;
+
+defm FABS  : F2<"abs", fabs>;
+defm FNEG  : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins f64imm:$a, Float64Regs:$b),
+            "rcp.rn.f64 \t$dst, $b;",
+            [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins Float64Regs:$a, Float64Regs:$b),
+            "div.rn.f64 \t$dst, $a, $b;",
+            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+  NVPTXInst<(outs Float64Regs:$dst),
+            (ins Float64Regs:$a, f64imm:$b),
+            "div.rn.f64 \t$dst, $a, $b;",
+            [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.ftz.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.approx.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.approx.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.approx.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.approx.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.ftz.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.approx.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.full.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.full.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.full.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[do_DIVF32_FULL]>;
+def FDIV32ri :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.full.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.rn.ftz.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins f32imm:$a, Float32Regs:$b),
+            "rcp.rn.f32 \t$dst, $b;",
+            [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+            Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.rn.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.rn.ftz.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, Float32Regs:$b),
+            "div.rn.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+            Requires<[reqPTX20]>;
+def FDIV32ri_prec :
+  NVPTXInst<(outs Float32Regs:$dst),
+            (ins Float32Regs:$a, f32imm:$b),
+            "div.rn.f32 \t$dst, $a, $b;",
+            [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+            Requires<[reqPTX20]>;
+
+//
+// F32 rsqrt
+//
+
+def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
+                       "rsqrt.approx.f32 \t$dst, $b;", []>;
+
+// Convert 1.0f/sqrt(x) to rsqrt.approx.f32.  (There is an rsqrt.approx.f64, but
+// it's emulated in software.)
+def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
+         (RSQRTF32approx1r Float32Regs:$b)>,
+         Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
+
+multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                       Requires<[Pred]>;
+   def rri : NVPTXInst<(outs RC:$dst),
+                       (ins RC:$a, RC:$b, ImmCls:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+                       Requires<[Pred]>;
+   def rir : NVPTXInst<(outs RC:$dst),
+                       (ins RC:$a, ImmCls:$b, RC:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+                       Requires<[Pred]>;
+   def rii : NVPTXInst<(outs RC:$dst),
+                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
+                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+                       Requires<[Pred]>;
+}
+
+defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+
+// sin/cos
+def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "sin.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "cos.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// i.e. "poor man's fmod()"
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+             Float32Regs:$y))>,
+          Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+             fpimm:$y))>,
+          Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+             Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+             Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
+//-----------------------------------
+// Bitwise operations
+//-----------------------------------
+
+// Template for three-arg bitwise operations.  Takes three args, Creates .b16,
+// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
+multiclass BITWISE<string OpcStr, SDNode OpNode> {
+  def b1rr :
+    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+  def b1ri :
+    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+              [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+  def b16rr :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+  def b16ri :
+    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+              [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+  def b32rr :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def b32ri :
+    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+              [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def b64rr :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+  def b64ri :
+    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+              [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR  : BITWISE<"or", or>;
+defm AND : BITWISE<"and", and>;
+defm XOR : BITWISE<"xor", xor>;
+
+def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+                      "not.pred \t$dst, $src;",
+                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                      "not.b16 \t$dst, $src;",
+                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                      "not.b32 \t$dst, $src;",
+                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                       "not.b64 \t$dst, $src;",
+                       [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// Template for left/right shifts.  Takes three operands,
+//   [dest (reg), src (reg), shift (reg or imm)].
+// dest and src may be int64, int32, or int16, but shift is always int32.
+//
+// This template also defines a 32-bit shift (imm, imm) instruction.
+multiclass SHIFT<string OpcStr, SDNode OpNode> {
+   def i64rr :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+   def i64ri :
+     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+   def i32rr :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+   def i32ri :
+     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+   def i32ii :
+     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+   def i16rr :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+   def i16ri :
+     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
+}
+
+defm SHL : SHIFT<"shl.b", shl>;
+defm SRA : SHIFT<"shr.s", sra>;
+defm SRL : SHIFT<"shr.u", srl>;
+
+//
+// Rotate: Use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+//    =>
+//        r2 = shf.l r1, r1, n
+def ROTL32imm_hw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+            [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+           Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+           Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+//    =>
+//        r2 = shf.r r1, r1, n
+def ROTR32imm_hw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+            [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+           Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+           Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate.  $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            "shl.b32 \t%lhs, $src, $amt1;\n\t"
+            "shr.b32 \t%rhs, $src, $amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shl.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shr.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+           Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b32 %lhs;\n\t"
+            ".reg .b32 %rhs;\n\t"
+            ".reg .b32 %amt2;\n\t"
+            "shr.b32 \t%lhs, $src, $amt;\n\t"
+            "sub.s32 \t%amt2, 32, $amt;\n\t"
+            "shl.b32 \t%rhs, $src, %amt2;\n\t"
+            "add.u32 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+           Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate.  $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+  NVPTXInst<(outs Int64Regs:$dst),
+            (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            "shl.b64 \t%lhs, $src, $amt1;\n\t"
+            "shr.b64 \t%rhs, $src, $amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shl.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shr.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw :
+  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+            "{{\n\t"
+            ".reg .b64 %lhs;\n\t"
+            ".reg .b64 %rhs;\n\t"
+            ".reg .u32 %amt2;\n\t"
+            "shr.b64 \t%lhs, $src, $amt;\n\t"
+            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "shl.b64 \t%rhs, $src, %amt2;\n\t"
+            "add.u64 \t$dst, %lhs, %rhs;\n\t"
+            "}}",
+            [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def SDTIntShiftDOp :
+  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                       SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+            "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+            [(set Int32Regs:$dst,
+              (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP :
+  NVPTXInst<(outs Int32Regs:$dst),
+            (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+            "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+            [(set Int32Regs:$dst,
+             (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+
+//
+// BFE - bit-field extract
+//
+
+// Template for BFE instructions.  Takes four args,
+//   [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
+// Start may be an imm only if end is also an imm.  FIXME: Is this a
+// restriction in PTX?
+//
+// dest and src may be int32 or int64, but start and end are always int32.
+multiclass BFE<string TyStr, RegisterClass RC> {
+  def rrr
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+  def rri
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, Int32Regs:$b, i32imm:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+  def rii
+    : NVPTXInst<(outs RC:$d),
+                (ins RC:$a, i32imm:$b, i32imm:$c),
+                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+let hasSideEffects = 0 in {
+  defm BFE_S32 : BFE<"s32", Int32Regs>;
+  defm BFE_U32 : BFE<"u32", Int32Regs>;
+  defm BFE_S64 : BFE<"s64", Int64Regs>;
+  defm BFE_U64 : BFE<"u64", Int64Regs>;
+}
+
+//-----------------------------------
+// Comparison instructions (setp, set)
+//-----------------------------------
+
+// FIXME: This doesn't cover versions of set and setp that combine with a
+// boolean predicate, e.g. setp.eq.and.b16.
+
+let hasSideEffects = 0 in {
+  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+    def rr :
+      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                           "\t$dst, $a, $b;"), []>;
+    def ri :
+      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                           "\t$dst, $a, $b;"), []>;
+    def ir :
+      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+                           "\t$dst, $a, $b;"), []>;
+  }
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+
+// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
+// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
+
+let hasSideEffects = 0 in {
+  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+    def rr : NVPTXInst<(outs Int32Regs:$dst),
+                       (ins RC:$a, RC:$b, CmpMode:$cmp),
+                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+    def ri : NVPTXInst<(outs Int32Regs:$dst),
+                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+    def ir : NVPTXInst<(outs Int32Regs:$dst),
+                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                       !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+  }
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// Selection instructions (selp)
+//-----------------------------------
+
+// FIXME: Missing slct
+
+// selp instructions that don't have any pattern matches; we explicitly use
+// them within this file.
+let hasSideEffects = 0 in {
+  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+    def rr : NVPTXInst<(outs RC:$dst),
+                       (ins RC:$a, RC:$b, Int1Regs:$p),
+                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+    def ri : NVPTXInst<(outs RC:$dst),
+                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+    def ir : NVPTXInst<(outs RC:$dst),
+                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+    def ii : NVPTXInst<(outs RC:$dst),
+                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                       !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  }
+
+  multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+                          SDNode ImmNode> {
+    def rr :
+      NVPTXInst<(outs RC:$dst),
+                (ins RC:$a, RC:$b, Int1Regs:$p),
+                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+    def ri :
+      NVPTXInst<(outs RC:$dst),
+                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+    def ir :
+      NVPTXInst<(outs RC:$dst),
+                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+    def ii :
+      NVPTXInst<(outs RC:$dst),
+                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+  }
+}
+
+// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+// good.
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+                            [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+                              [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+  let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+  let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+  let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+// Load a memory address into a u32 or u64 register.
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+                         "mov.u32 \t$dst, $a;",
+                         [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+                           "mov.u64 \t$dst, $a;",
+                           [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// Get pointer to local stack.
+let hasSideEffects = 0 in {
+  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+                                     "mov.u32 \t$d, __local_depot$num;", []>;
+  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+                                    "mov.u64 \t$d, __local_depot$num;", []>;
+}
+
+
+// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1, hasSideEffects=0 in {
+  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+                           "mov.pred \t$dst, $sss;", []>;
+  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+                           "mov.u16 \t$dst, $sss;", []>;
+  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                           "mov.u32 \t$dst, $sss;", []>;
+  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                           "mov.u64 \t$dst, $sss;", []>;
+
+  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                           "mov.f32 \t$dst, $src;", []>;
+  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                           "mov.f64 \t$dst, $src;", []>;
+}
+
+def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+                        "mov.pred \t$dst, $src;",
+                        [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                         "mov.u16 \t$dst, $src;",
+                         [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                         "mov.u32 \t$dst, $src;",
+                         [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                        "mov.u64 \t$dst, $src;",
+                        [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                         "mov.f32 \t$dst, $src;",
+                         [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                         "mov.f64 \t$dst, $src;",
+                         [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                            "add.u32 \t$dst, ${addr:add};",
+                            [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                            "add.u64 \t$dst, ${addr:add};",
+                            [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+                       Instruction setp_16rr,
+                       Instruction setp_16ri,
+                       Instruction setp_16ir,
+                       Instruction setp_32rr,
+                       Instruction setp_32ri,
+                       Instruction setp_32ir,
+                       Instruction setp_64rr,
+                       Instruction setp_64ri,
+                       Instruction setp_64ir,
+                       Instruction set_16rr,
+                       Instruction set_16ri,
+                       Instruction set_16ir,
+                       Instruction set_32rr,
+                       Instruction set_32ri,
+                       Instruction set_32ir,
+                       Instruction set_64rr,
+                       Instruction set_64ri,
+                       Instruction set_64ir> {
+  // i16 -> pred
+  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> pred
+  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> pred
+  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+  // i16 -> i32
+  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> i32
+  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> i32
+  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+}
+
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_s16rr, SETP_s16ri, SETP_s16ir,
+                SETP_s32rr, SETP_s32ri, SETP_s32ir,
+                SETP_s64rr, SETP_s64ri, SETP_s64ir,
+                SET_s16rr, SET_s16ri, SET_s16ir,
+                SET_s32rr, SET_s32ri, SET_s32ir,
+                SET_s64rr, SET_s64ri, SET_s64ir> {
+  // TableGen doesn't like empty multiclasses.
+  def : PatLeaf<(i32 0)>;
+}
+
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_u16rr, SETP_u16ri, SETP_u16ir,
+                SETP_u32rr, SETP_u32ri, SETP_u32ir,
+                SETP_u64rr, SETP_u64ri, SETP_u64ir,
+                SET_u16rr, SET_u16ri, SET_u16ir,
+                SET_u32rr, SET_u32ri, SET_u32ir,
+                SET_u64rr, SET_u64ri, SET_u64ir> {
+  // TableGen doesn't like empty multiclasses.
+  def : PatLeaf<(i32 0)>;
+}
+
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+  // f32 -> pred
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> pred
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f32 -> i32
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> i32
+  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+// FIXME: What is this doing here?  Can it be deleted?
+// def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+//                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile :
+  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam :
+  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam :
+  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam :
+  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet :
+  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam :
+  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 :
+  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 :
+  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall :
+  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCall :
+  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni :
+  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCallUni :
+  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam :
+  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 :
+  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 :
+  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 :
+  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 :
+  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin :
+  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg :
+  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg :
+  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd :
+  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid :
+  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype :
+  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal :
+  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam :
+  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+def StoreRetval :
+  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 :
+  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 :
+  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+         [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam :
+  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode :
+  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+         [SDNPHasChain, SDNPSideEffect]>;
+
+let mayLoad = 1 in {
+  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                  !strconcat(!strconcat("ld.param", opstr),
+                             "\t$dst, [retval0+$b];"),
+                  []>;
+
+  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+                  !strconcat("ld.param.v2", opstr,
+                             "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+                        regclass:$dst4),
+                  (ins i32imm:$b),
+                  !strconcat("ld.param.v4", opstr,
+                             "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+                  []>;
+}
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                !strconcat("mov", opstr, "\t$dst, retval$b;"),
+                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+let mayStore = 1 in {
+  class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                  !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
+                  []>;
+
+  class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+                               i32imm:$a, i32imm:$b),
+                  !strconcat("st.param.v2", opstr,
+                             "\t[param$a+$b], {{$val, $val2}};"),
+                  []>;
+
+  class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+                               regclass:$val4, i32imm:$a,
+                               i32imm:$b),
+                  !strconcat("st.param.v4", opstr,
+                             "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+                  []>;
+
+  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+                  !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
+                  []>;
+
+  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+                  !strconcat("st.param.v2", opstr,
+                             "\t[func_retval0+$a], {{$val, $val2}};"),
+                  []>;
+
+  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+        NVPTXInst<(outs),
+                  (ins regclass:$val, regclass:$val2, regclass:$val3,
+                       regclass:$val4, i32imm:$a),
+                  !strconcat("st.param.v4", opstr,
+                             "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+                  []>;
+}
+
+let isCall=1 in {
+  multiclass CALL<string OpcStr, SDNode OpNode> {
+     def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
+     def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
+     def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
+     def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
+     def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+       [(OpNode (i32 4))]>;
+     def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
+       [(OpNode (i32 5))]>;
+     def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                            "retval5), "),
+       [(OpNode (i32 6))]>;
+     def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                            "retval5, retval6), "),
+       [(OpNode (i32 7))]>;
+     def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+                            "retval5, retval6, retval7), "),
+       [(OpNode (i32 8))]>;
+  }
+}
+
+defm Call : CALL<"call", PrintCall>;
+defm CallUni : CALL<"call.uni", PrintCallUni>;
+
+// Convergent call instructions.  These are identical to regular calls, except
+// they have the isConvergent bit set.
+let isConvergent=1 in {
+  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
+  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
+}
+
+def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
+def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
+
+def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
+def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
+
+def StoreParamV4I32  : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
+def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
+
+def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
+def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+
+def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
+
+def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
+def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+            [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$a), "$a",
+            [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64     : CallArgInst<Int64Regs>;
+def CallArgI32     : CallArgInst<Int32Regs>;
+def CallArgI16     : CallArgInst<Int16Regs>;
+def CallArgF64     : CallArgInst<Float64Regs>;
+def CallArgF32     : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+                              [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+                             [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
+                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
+                                  [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
+                                  [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
+                                  [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst :
+  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
+            ".param .align $align .b8 retval$num[$size];",
+            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst :
+  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+            ".param .b$size retval$num;",
+            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst :
+  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+            ".reg .b$size retval$num;",
+            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst :
+  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
+            ".param .align $align .b8 param$a[$size];",
+            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst :
+  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+            ".param .b$size param$a;",
+            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst :
+  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+            ".reg .b$size param$a;",
+            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+            !strconcat("mov", asmstr, "\t$dst, $src;"),
+            [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 :
+  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+            "cvt.u16.u32\t$dst, $src;",
+            [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+  NVPTXInst<(outs), (ins regclass:$src),
+            "// Pseudo use of $src",
+            [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr];", []>;
+  def _areg : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr];", []>;
+  def _areg_64 : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr];", []>;
+  def _ari : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr+$offset];", []>;
+  def _ari_64 : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr+$offset];", []>;
+  def _asi : NVPTXInst<
+    (outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t$dst, [$addr+$offset];", []>;
+}
+
+let mayLoad=1, hasSideEffects=0 in {
+  defm LD_i8  : LD<Int16Regs>;
+  defm LD_i16 : LD<Int16Regs>;
+  defm LD_i32 : LD<Int32Regs>;
+  defm LD_i64 : LD<Int64Regs>;
+  defm LD_f32 : LD<Float32Regs>;
+  defm LD_f64 : LD<Float64Regs>;
+}
+
+multiclass ST<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr], $src;", []>;
+  def _areg : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr], $src;", []>;
+  def _areg_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr], $src;", []>;
+  def _ari : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr+$offset], $src;", []>;
+  def _ari_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr+$offset], $src;", []>;
+  def _asi : NVPTXInst<
+    (outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+         LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+    " \t[$addr+$offset], $src;", []>;
+}
+
+let mayStore=1, hasSideEffects=0 in {
+  defm ST_i8  : ST<Int16Regs>;
+  defm ST_i16 : ST<Int16Regs>;
+  defm ST_i32 : ST<Int32Regs>;
+  defm ST_i64 : ST<Int64Regs>;
+  defm ST_f32 : ST<Float32Regs>;
+  defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following is used only in and after vector elementizations.  Vector
+// elementization happens at the machine instruction level, so the following
+// instructions never appear in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr];", []>;
+  def _v2_areg : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr];", []>;
+  def _v2_areg_64 : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr];", []>;
+  def _v2_ari : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+  def _v2_ari_64 : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+  def _v2_asi : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+  def _v4_avar : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+  def _v4_areg : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+  def _v4_areg_64 : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+  def _v4_ari : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+  def _v4_ari_64 : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+  def _v4_asi : NVPTXInst<
+    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+}
+let mayLoad=1, hasSideEffects=0 in {
+  defm LDV_i8  : LD_VEC<Int16Regs>;
+  defm LDV_i16 : LD_VEC<Int16Regs>;
+  defm LDV_i32 : LD_VEC<Int32Regs>;
+  defm LDV_i64 : LD_VEC<Int64Regs>;
+  defm LDV_f32 : LD_VEC<Float32Regs>;
+  defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2}};", []>;
+  def _v2_areg : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2}};", []>;
+  def _v2_areg_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2}};", []>;
+  def _v2_ari : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+         i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2}};", []>;
+  def _v2_ari_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+         i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2}};", []>;
+  def _v2_asi : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+         LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+         i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2}};", []>;
+  def _v4_avar : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+  def _v4_areg : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+  def _v4_areg_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+  def _v4_ari : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+  def _v4_ari_64 : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+    "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+  def _v4_asi : NVPTXInst<
+    (outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+         LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+         i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+    "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+}
+
+let mayStore=1, hasSideEffects=0 in {
+  defm STV_i8  : ST_VEC<Int16Regs>;
+  defm STV_i16 : ST_VEC<Int16Regs>;
+  defm STV_i32 : ST_VEC<Int32Regs>;
+  defm STV_i64 : ST_VEC<Int64Regs>;
+  defm STV_f32 : ST_VEC<Float32Regs>;
+  defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+  NVPTXRegClass regclassOut> :
+           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+           !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+     [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
+// we cannot specify floating-point literals in isel patterns.  Therefore, we
+// use an integer selp to select either 1 or 0 and then cvt to floating-point.
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+          (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+          (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+          (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+          (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+          (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+          (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+          (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+          (SELP_u64ii 1, 0, Int1Regs:$a)>;
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+          (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+          (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+          (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+
+
+let hasSideEffects = 0 in {
+  // pack a set of smaller int registers to a larger int register
+  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+                             (ins Int16Regs:$s1, Int16Regs:$s2,
+                                  Int16Regs:$s3, Int16Regs:$s4),
+                             "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
+  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+                             (ins Int16Regs:$s1, Int16Regs:$s2),
+                             "mov.b32\t$d, {{$s1, $s2}};", []>;
+  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+                             (ins Int32Regs:$s1, Int32Regs:$s2),
+                             "mov.b64\t$d, {{$s1, $s2}};", []>;
+  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+                             (ins Float32Regs:$s1, Float32Regs:$s2),
+                             "mov.b64\t$d, {{$s1, $s2}};", []>;
+
+  // unpack a larger int register to a set of smaller int registers
+  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                   Int16Regs:$d3, Int16Regs:$d4),
+                             (ins Int64Regs:$s),
+                             "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                             (ins Int32Regs:$s),
+                             "mov.b32\t{{$d1, $d2}}, $s;", []>;
+  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                             (ins Int64Regs:$s),
+                             "mov.b64\t{{$d1, $d2}}, $s;", []>;
+  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                             (ins Float64Regs:$s),
+                             "mov.b64\t{{$d1, $d2}}, $s;", []>;
+}
+
+// Count leading zeros
+let hasSideEffects = 0 in {
+  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                         "clz.b32\t$d, $a;", []>;
+  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                         "clz.b64\t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
+// than 16 bits to store). We also need to subtract 16 because the
+// high-order 16 zeros were counted.
+def : Pat<(ctlz Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+
+// Population count
+let hasSideEffects = 0 in {
+  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                          "popc.b32\t$d, $a;", []>;
+  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                          "popc.b64\t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
+// than 16 bits to store)
+def : Pat<(ctpop Int16Regs:$a),
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+
+// fpround f64 -> f32
+def : Pat<(f32 (fpround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fpextend f32 -> f64
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue]>;
+
+// fceil, ffloor, fround, ftrunc.
+
+def : Pat<(fceil Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fceil Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(ffloor Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ffloor Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(fround Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float32Regs:$a)),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(f64 (fround Float64Regs:$a)),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(ftrunc Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ftrunc Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+// nearbyint and rint are implemented as rounding to nearest even.  This isn't
+// strictly correct, because it causes us to ignore the rounding mode.  But it
+// matches what CUDA's "libm" does.
+
+def : Pat<(fnearbyint Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fnearbyint Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(frint Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(frint Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+   let isReturn=1, isBarrier=1 in
+      def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+   let isBranch=1 in
+      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                              "@$a bra \t$target;",
+                              [(brcond Int1Regs:$a, bb:$target)]>;
+   let isBranch=1 in
+      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                                   "@!$a bra \t$target;", []>;
+
+   let isBranch=1, isBarrier=1 in
+      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+                           "bra.uni \t$target;", [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+
+// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
+// conditional branch if the target block is the next block so that the code
+// can fall through to the target block.  The invertion is done by 'xor
+// condition, 1', which will be translated to (setne condition, -1).  Since ptx
+// supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+          (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                            SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : NVPTXInst<outs, ins, asmstr, pattern>;
+
+def Callseq_Start :
+  NVPTXInst<(outs), (ins i32imm:$amt),
+            "\\{ // callseq $amt\n"
+            "\t.reg .b32 temp_param_reg;",
+           [(callseq_start timm:$amt)]>;
+def Callseq_End :
+  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+            "\\} // callseq $amt1",
+            [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
+
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype :
+  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+  let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE :
+  NVPTXInst<(outs), (ins ProtoIdent:$ident),
+            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following is a more efficient
+// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
+// - for sm_20, use pmpt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit register, but not native 16-bit
+// register.
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 000000000000..b0408f12f5b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,7260 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==0.0f);
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==1.0f);
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==0.0);
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==1.0);
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization and shuffle functions
+//-----------------------------------
+let isConvergent = 1 in {
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+      [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+        !strconcat("}}", ""))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+                             [(int_nvvm_bar_sync imm:$i)]>;
+
+// shfl.{up,down,bfly,idx}.b32
+multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
+  // The last two parameters to shfl can be regs or imms.  ptxas is smart
+  // enough to inline constant registers, so strictly speaking we don't need to
+  // handle immediates here.  But it's easy enough, and it makes our ptx more
+  // readable.
+  def reg : NVPTXInst<
+      (outs regclass:$dst),
+      (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
+      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+      [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
+
+  def imm1 : NVPTXInst<
+      (outs regclass:$dst),
+      (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
+      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+      [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
+
+  def imm2 : NVPTXInst<
+      (outs regclass:$dst),
+      (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
+      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+      [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
+
+  def imm3 : NVPTXInst<
+      (outs regclass:$dst),
+      (ins regclass:$src, i32imm:$offset, i32imm:$mask),
+      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
+      [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+}
+
+defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
+defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
+defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
+defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
+defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
+defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
+defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
+defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
+
+} // isConvergent = 1
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string StrOp, Intrinsic IntOP> :
+              NVPTXInst<(outs), (ins),
+            StrOp, [(IntOP)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x)
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x is
+// NaN
+// max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
+// Same story for fmax, fmin.
+
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+
+// We need a full string for OpcStr here because we need to deal with case like
+// INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+  NVPTXRegClass src_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+            OpcStr,
+        [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with the case
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1),
+            OpcStr,
+        [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+  NVPTXRegClass s2_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+            OpcStr,
+        [(set t_regclass:$dst,
+          (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+  : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+    Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F     : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F     : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F     : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F     : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor  Ceil
+//
+
+def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+//
+// Trunc
+//
+
+def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+//
+// Saturate
+//
+
+def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+
+//
+// Exp2  Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin  Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+  : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+  : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+  : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+  : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rp_d>;
+
+// nvvm_sqrt intrinsic
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+          (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+  : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+    int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+  Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+             !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
+               "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+                         !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
+                           "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+
+def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+
+
+// FIXME: Ideally, we could use these patterns instead of the scope-creating
+// patterns, but ptxas does not like these since .s16 is not compatible with
+// .f16.  The solution is to use .bXX for all integer register types, but we
+// are not there yet.
+//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+//
+//def : Pat<(int_nvvm_h2f Int16Regs:$a),
+//          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+
+def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+                                   Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+           Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
+                            !strconcat(".reg .b16 %temp;\n\t",
+          !strconcat("mov.b16 \t%temp, $src0;\n\t",
+          !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
+            "}}")))),
+          Float32Regs, Int16Regs, int_nvvm_h2f>;
+
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
+//
+// Bitcast
+//
+
+def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_bitcast_i2f>;
+
+def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+class ATOMIC_SHARED_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+class ATOMIC_GENERIC_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
+
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, SDNode IMM, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+  def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+  defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+  defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+}
+
+// has 2 operands, neg the second one
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+    !strconcat("{{ \n\t",
+         !strconcat(".reg \t.s",
+         !strconcat(TypeStr,
+         !strconcat(" temp; \n\t",
+         !strconcat("neg.s",
+         !strconcat(TypeStr,
+         !strconcat(" \ttemp, $b; \n\t",
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(".u",
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], temp; \n\t",
+           !strconcat("}}", "")))))))))))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+  string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+  Predicate Pred> {
+ defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+ defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+}
+
+// has 3 operands
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst,
+           (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+         Requires<[Pred]>;
+  def imm1 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+  Requires<[Pred]>;
+  def imm2 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+  Requires<[Pred]>;
+  def imm3 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+  defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+  defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+}
+
+// atom_add
+
+def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
+  atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
+  atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
+  atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
+  atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
+  atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
+  atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
+  atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
+  atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
+  atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
+  atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
+  atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
+  atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
+  ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
+  atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
+  atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
+  atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
+  ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
+  atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
+  atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
+  atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
+  atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
+  atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
+  atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_max_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_64 node:$a, node:$b)>;
+def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+def atomic_load_umax_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
+  atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+  ".max", atomic_load_max_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+  ".max", atomic_load_max_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".max",
+  atomic_load_max_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".s64", ".max", atomic_load_max_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
+  atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".max", atomic_load_umax_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+  ".max", atomic_load_umax_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".max",
+  atomic_load_umax_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".u64", ".max", atomic_load_umax_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_min
+
+def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_min_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_64 node:$a, node:$b)>;
+def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+def atomic_load_umin_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
+  atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".s64",
+  ".min", atomic_load_min_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".s64",
+  ".min", atomic_load_min_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".s64", ".min",
+  atomic_load_min_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".s64", ".min", atomic_load_min_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
+  atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".min", atomic_load_umin_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64",
+  ".min", atomic_load_umin_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".min",
+  atomic_load_umin_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global",
+  ".u64", ".min", atomic_load_umin_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_inc  atom_dec
+
+def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
+  atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
+  atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
+  atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
+  atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
+  atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
+  atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+def atomic_load_and_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
+  atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
+  atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
+  atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".and",
+  atomic_load_and_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".and",
+  atomic_load_and_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".and",
+  atomic_load_and_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".and", atomic_load_and_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_or
+
+def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+def atomic_load_or_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
+  atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
+  atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
+  atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".or",
+  atomic_load_or_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".or",
+  atomic_load_or_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".or", atomic_load_or_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".or",
+  atomic_load_or_64_s, i64imm, imm, hasAtomRedS64>;
+
+// atom_xor
+
+def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+def atomic_load_xor_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
+  atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
+  atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
+  atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".xor",
+  atomic_load_xor_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".xor",
+  atomic_load_xor_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".xor",
+  atomic_load_xor_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".xor", atomic_load_xor_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_cas
+
+def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
+  atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
+  atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
+  atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
+  ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
+  atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
+  atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
+  atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
+  ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// Support for scoped atomic operations.  Matches
+// int_nvvm_atomic_{op}_{space}_{type}_{scope}
+// and converts it into the appropriate instruction.
+// NOTE: not all possible combinations are implemented
+//  'space' is limited to generic as it's the only one needed to support CUDA.
+//  'scope' = 'gpu' is default and is handled by regular atomic instructions.
+class ATOM23_impl<string AsmStr, NVPTXRegClass regclass, list<Predicate> Preds,
+                  dag ins, dag Operands>
+      : NVPTXInst<(outs regclass:$result), ins,
+                  AsmStr,
+                  [(set regclass:$result, Operands)]>,
+        Requires<Preds>;
+
+// Define instruction variants for all addressing modes.
+multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
+                       NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy,
+                       list<Predicate> Preds> {
+  let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b),
+                      (Intr Int32Regs:$src, regclass:$b)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b),
+                      (Intr Int64Regs:$src, regclass:$b)>;
+  }
+  // tablegen can't infer argument types from Intrinsic (though it can
+  // from Instruction) so we have to enforce specific type on
+  // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int32Regs:$src, ImmType:$b),
+                    (Intr Int32Regs:$src, (ImmTy Imm:$b))>;
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int64Regs:$src, ImmType:$b),
+                    (Intr Int64Regs:$src, (ImmTy Imm:$b))>;
+}
+
+multiclass ATOM3P_impl<string AsmStr,  Intrinsic Intr,
+                       NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy,
+                       list<Predicate> Preds> {
+  // Variants for register/immediate permutations of $b and $c
+  let AddedComplexity = 2 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b, regclass:$c),
+                      (Intr Int32Regs:$src, regclass:$b, regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b, regclass:$c),
+                      (Intr Int64Regs:$src, regclass:$b, regclass:$c)>;
+  }
+  let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, ImmType:$b, regclass:$c),
+                      (Intr Int32Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, ImmType:$b, regclass:$c),
+                      (Intr Int64Regs:$src, (ImmTy Imm:$b), regclass:$c)>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int32Regs:$src, regclass:$b, ImmType:$c),
+                      (Intr Int32Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+    def : ATOM23_impl<AsmStr, regclass, Preds,
+                      (ins Int64Regs:$src, regclass:$b, ImmType:$c),
+                      (Intr Int64Regs:$src, regclass:$b, (ImmTy Imm:$c))>;
+  }
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int32Regs:$src, ImmType:$b, ImmType:$c),
+                    (Intr Int32Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+  def : ATOM23_impl<AsmStr, regclass, Preds,
+                    (ins Int64Regs:$src, ImmType:$b, ImmType:$c),
+                    (Intr Int64Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>;
+}
+
+// Constructs instrinsic name and instruction asm strings.
+multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, string SpaceStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+                            # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+                            # "." # OpStr # "." # TypeStr
+                            # " \t$result, [$src], $b;",
+                     !cast<Intrinsic>(
+                            "int_nvvm_atomic_" # OpStr
+                            # "_" # SpaceStr # "_" # IntTypeStr
+                            # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+                     regclass, ImmType, Imm, ImmTy, Preds>;
+}
+multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, string SpaceStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+  defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr)
+                            # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr)
+                            # "." # OpStr # "." # TypeStr
+                            # " \t$result, [$src], $b, $c;",
+                     !cast<Intrinsic>(
+                            "int_nvvm_atomic_" # OpStr
+                            # "_" # SpaceStr # "_" # IntTypeStr
+                            # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)),
+                     regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different address spaces.
+// For now we only need variants for generic space pointers.
+multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+   defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+                            regclass, ImmType, Imm, ImmTy, Preds>;
+}
+multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       string ScopeStr, NVPTXRegClass regclass, Operand ImmType,
+                       SDNode Imm, ValueType ImmTy, list<Predicate> Preds> {
+   defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen",
+                            regclass, ImmType, Imm, ImmTy, Preds>;
+}
+
+// Constructs variants for different scopes of atomic op.
+multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr,
+                       NVPTXRegClass regclass, Operand ImmType, SDNode Imm,
+                       ValueType ImmTy, list<Predicate> Preds> {
+   // .gpu scope is default and is currently covered by existing
+   // atomics w/o explicitly specified scope.
+   defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+                           regclass, ImmType, Imm, ImmTy,
+                           !listconcat(Preds,[hasAtomScope])>;
+   defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+                           regclass, ImmType, Imm, ImmTy,
+                           !listconcat(Preds,[hasAtomScope])>;
+}
+multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
+           NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy,
+           list<Predicate> Preds> {
+   // No need to define ".gpu"-scoped atomics.  They do the same thing
+   // as the regular, non-scoped atomics defined elsewhere.
+   defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta",
+                           regclass, ImmType, Imm, ImmTy,
+                           !listconcat(Preds,[hasAtomScope])>;
+   defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys",
+                           regclass, ImmType, Imm, ImmTy,
+                           !listconcat(Preds,[hasAtomScope])>;
+}
+
+// atom.add
+multiclass ATOM2_add_impl<string OpStr> {
+   defm _s32  : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+   defm _u32  : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+   defm _u64  : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>;
+   defm _f32  : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32,
+                            [hasAtomAddF32]>;
+   defm _f64  : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64,
+                            [hasAtomAddF64]>;
+}
+
+// atom.{and,or,xor}
+multiclass ATOM2_bitwise_impl<string OpStr> {
+   defm _b32  : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+   defm _b64  : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64,
+                            [hasAtomBitwise64]>;
+}
+
+// atom.exch
+multiclass ATOM2_exch_impl<string OpStr> {
+   defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+   defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+// atom.{min,max}
+multiclass ATOM2_minmax_impl<string OpStr> {
+   defm _s32  : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>;
+   defm _u32  : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+   defm _s64  : ATOM2S_impl<OpStr, "i", "s64", Int64Regs, i64imm, imm, i64,
+                            [hasAtomMinMax64]>;
+   defm _u64  : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64,
+                            [hasAtomMinMax64]>;
+}
+
+// atom.{inc,dec}
+multiclass ATOM2_incdec_impl<string OpStr> {
+   defm _u32  : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>;
+}
+
+// atom.cas
+multiclass ATOM3_cas_impl<string OpStr> {
+   defm _b32  : ATOM3S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>;
+   defm _b64  : ATOM3S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>;
+}
+
+defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">;
+defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">;
+defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">;
+defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">;
+defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">;
+defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">;
+defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">;
+defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">;
+defm INT_PTX_SATOM_OR  : ATOM2_bitwise_impl<"or">;
+defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Don't annotate ldu instructions as mayLoad, as they load from memory that is
+// read-only in a kernel.
+
+// Scalar
+
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+                        []>, Requires<[hasLDU]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ldu.global.", TyStr),
+                      []>, Requires<[hasLDU]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ldu.global.", TyStr),
+                        []>, Requires<[hasLDU]>;
+}
+
+defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldu
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+}
+
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins imemAny:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE
+  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+  : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+  : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+  : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+  : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+  : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+  : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+  : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+  : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Float32Regs>;
+
+
+//-----------------------------------
+// Support for ldg on sm_35 or later 
+//-----------------------------------
+
+// Don't annotate ld.global.nc as mayLoad, because these loads go through the
+// non-coherent texture cache, and therefore the values read must be read-only
+// during the lifetime of the kernel.
+
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                        []>, Requires<[hasLDG]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                      []>, Requires<[hasLDG]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ld.global.nc.", TyStr),
+                        []>, Requires<[hasLDG]>;
+}
+
+defm INT_PTX_LDG_GLOBAL_i8
+  : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16
+  : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32
+  : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64
+  : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_p32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_p64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+
+// vector
+
+// Elementized vector ldg 
+multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                               regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                             regclass:$dst4), (ins imemAny:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
+defm INT_PTX_LDG_G_v2i8_ELE
+  : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE
+  : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE
+  : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE
+  : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE
+  : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE
+  : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDG_G_v4i8_ELE
+  : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE
+  : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE
+  : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE
+  : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+
+
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;
+   def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;*/
+
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+   def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+defm cvta_local  : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const  : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
+
+defm cvta_to_local   : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
+defm cvta_to_const  : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
+
+
+// nvvm.ptr.gen.to.param
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+  (ins Int32Regs:$src),
+                        "mov.u32 \t$result, $src;",
+                              [(set Int32Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+  (ins Int64Regs:$src),
+                        "mov.u64 \t$result, $src;",
+                              [(set Int64Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsicc
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+                             "mov.b16 \t$r, $s;",
+                             [(set Int16Regs:$r,
+                               (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.b32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.b64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_i64 Int64Regs:$s))]>;
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+                             "mov.f32 \t$r, $s;",
+                             [(set Float32Regs:$r,
+                               (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+                             "mov.f64 \t$r, $s;",
+                             [(set Float64Regs:$r,
+                               (int_nvvm_move_double Float64Regs:$s))]>;
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam        %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this.  There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr64  texternalsym:$src)>;
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr32  texternalsym:$src)>;
+
+def texsurf_handles
+  : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+              "mov.u64 \t$result, $src;", []>;
+
+//-----------------------------------
+// Compiler Error Warn
+// - Just ignore them in codegen
+//-----------------------------------
+
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+// isspacep
+
+def ISSPACEP_CONST_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.const \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_const Int32Regs:$a))]>,
+    Requires<[hasPTX31]>;
+def ISSPACEP_CONST_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.const \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_const Int64Regs:$a))]>,
+    Requires<[hasPTX31]>;
+def ISSPACEP_GLOBAL_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.global \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_global Int32Regs:$a))]>;
+def ISSPACEP_GLOBAL_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.global \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_global Int64Regs:$a))]>;
+def ISSPACEP_LOCAL_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.local \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_local Int32Regs:$a))]>;
+def ISSPACEP_LOCAL_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.local \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_local Int64Regs:$a))]>;
+def ISSPACEP_SHARED_32
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+              "isspacep.shared \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int32Regs:$a))]>;
+def ISSPACEP_SHARED_64
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "isspacep.shared \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_isspacep_shared Int64Regs:$a))]>;
+
+
+// Special register reads
+def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+                            (ins SpecialRegs:$r),
+                            "mov.b32\t$d, $r;", []>;
+
+def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg2), (MOV_SPECIAL ENVREG2)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg3), (MOV_SPECIAL ENVREG3)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg4), (MOV_SPECIAL ENVREG4)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg5), (MOV_SPECIAL ENVREG5)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg6), (MOV_SPECIAL ENVREG6)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg7), (MOV_SPECIAL ENVREG7)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg8), (MOV_SPECIAL ENVREG8)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg9), (MOV_SPECIAL ENVREG9)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg10), (MOV_SPECIAL ENVREG10)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg11), (MOV_SPECIAL ENVREG11)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg12), (MOV_SPECIAL ENVREG12)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg13), (MOV_SPECIAL ENVREG13)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg14), (MOV_SPECIAL ENVREG14)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg15), (MOV_SPECIAL ENVREG15)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg16), (MOV_SPECIAL ENVREG16)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg17), (MOV_SPECIAL ENVREG17)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg18), (MOV_SPECIAL ENVREG18)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg19), (MOV_SPECIAL ENVREG19)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg20), (MOV_SPECIAL ENVREG20)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg21), (MOV_SPECIAL ENVREG21)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg22), (MOV_SPECIAL ENVREG22)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg23), (MOV_SPECIAL ENVREG23)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg24), (MOV_SPECIAL ENVREG24)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg25), (MOV_SPECIAL ENVREG25)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg26), (MOV_SPECIAL ENVREG26)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg27), (MOV_SPECIAL ENVREG27)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg28), (MOV_SPECIAL ENVREG28)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg29), (MOV_SPECIAL ENVREG29)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
+def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+
+
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, i32imm:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+              Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+  : NVPTXInst<(outs Int32Regs:$dst),
+              (ins  Int32Regs:$src, Int32Regs:$amt),
+              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+              [(set Int32Regs:$dst,
+                 (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+              Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+          (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]> ;
+
+let hasSideEffects = 0 in {
+  def GET_LO_INT64
+    : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+                !strconcat("{{\n\t",
+                !strconcat(".reg .b32 %dummy;\n\t",
+                !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
+          !strconcat("}}", "")))),
+          []> ;
+
+  def GET_HI_INT64
+    : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+                !strconcat("{{\n\t",
+                !strconcat(".reg .b32 %dummy;\n\t",
+                !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
+          !strconcat("}}", "")))),
+          []> ;
+}
+
+let hasSideEffects = 0 in {
+  def PACK_TWO_INT32
+    : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+                "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+}
+
+def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
+          (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+                          (GET_LO_INT64 Int64Regs:$src))> ;
+
+// Funnel shift, requires >= sm_32.  Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = 0 in {
+  def SHF_L_WRAP_B32_IMM
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+                "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_L_WRAP_B32_REG
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_R_WRAP_B32_IMM
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+                "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+
+  def SHF_R_WRAP_B32_REG
+    : NVPTXInst<(outs Int32Regs:$dst),
+                (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+                "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+      Requires<[hasHWROT32]>;
+}
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                               (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+            (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+                                (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+      Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (PACK_TWO_INT32
+            (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+                                (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+            (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+                               (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+      Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+      Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+          (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+      Requires<[noHWROT32]>;
+
+
+//-----------------------------------
+// Texture Intrinsics
+//-----------------------------------
+
+// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be
+// also defined in NVPTXReplaceImageHandles.cpp
+
+// texmode_independent
+let IsTex = 1, IsTexModeUnified = 0 in {
+// Texture fetch instructions using handles
+def TEX_1D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_1D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_2D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_2D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_3D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_3D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_3D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+
+def TEX_CUBE_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_CUBE_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+
+def TEX_CUBE_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_CUBE_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_CUBE_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+
+def TLD4_R_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_R_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_R_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_G_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_B_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TLD4_A_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+}
+
+
+// texmode_unified
+let IsTex = 1, IsTexModeUnified = 1 in {
+// Texture fetch instructions using handles
+def TEX_UNIFIED_1D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;
+def TEX_UNIFIED_1D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_UNIFIED_1D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_UNIFIED_2D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_UNIFIED_2D_ARRAY_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_UNIFIED_3D_F32_S32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_UNIFIED_3D_S32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_S32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_UNIFIED_3D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_3D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_3D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+
+def TEX_UNIFIED_CUBE_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;
+
+def TLD4_UNIFIED_R_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_R_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_S32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_R_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_G_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_B_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+def TLD4_UNIFIED_A_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;
+}
+
+
+
+//=== Surface load instructions
+// .clamp variant
+let IsSuld = 1 in {
+def SULD_1D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_CLAMP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_CLAMP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_CLAMP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+
+// .trap variant
+let IsSuld = 1 in {
+def SULD_1D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+// .zero variant
+let IsSuld = 1 in {
+def SULD_1D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+//-----------------------------------
+// Texture Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def TXQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_order.b32 \t$d, [$a];",
+              []>;
+def TXQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def TXQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.width.b32 \t$d, [$a];",
+              []>;
+def TXQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.height.b32 \t$d, [$a];",
+              []>;
+def TXQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.depth.b32 \t$d, [$a];",
+              []>;
+def TXQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.array_size.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_SAMPLES
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_samples.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_MIPMAP_LEVELS
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_mipmap_levels.b32 \t$d, [$a];",
+              []>;
+}
+
+def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
+          (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
+          (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_width Int64Regs:$a),
+          (TXQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_height Int64Regs:$a),
+          (TXQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
+          (TXQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
+          (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
+          (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
+          (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+
+
+//-----------------------------------
+// Surface Query Intrinsics
+//-----------------------------------
+
+let IsSurfTexQuery = 1 in {
+def SUQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_order.b32 \t$d, [$a];",
+              []>;
+def SUQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def SUQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.width.b32 \t$d, [$a];",
+              []>;
+def SUQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.height.b32 \t$d, [$a];",
+              []>;
+def SUQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.depth.b32 \t$d, [$a];",
+              []>;
+def SUQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.array_size.b32 \t$d, [$a];",
+              []>;
+}
+
+def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
+          (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
+          (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_width Int64Regs:$a),
+          (SUQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_height Int64Regs:$a),
+          (SUQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
+          (SUQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
+          (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+
+
+//===- Handle Query -------------------------------------------------------===//
+
+// TODO: These intrinsics are not yet finalized, pending PTX ISA design work
+def ISTYPEP_SAMPLER
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.samplerref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
+def ISTYPEP_SURFACE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.surfref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
+def ISTYPEP_TEXTURE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.texref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
+
+//===- Surface Stores -----------------------------------------------------===//
+
+let IsSust = 1 in {
+// Unformatted
+// .clamp variant
+def SUST_B_1D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// .trap variant
+def SUST_B_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// .zero variant
+def SUST_B_1D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+
+// Formatted
+
+def SUST_P_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+}
+
+// Surface store instruction patterns
+// I'm not sure why we can't just include these in the instruction definitions,
+// but TableGen complains of type errors :(
+
+// .clamp variant
+def : Pat<(int_nvvm_sust_b_1d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .trap variant
+def : Pat<(int_nvvm_sust_b_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .zero variant
+def : Pat<(int_nvvm_sust_b_1d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_P_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+
+class PTX_READ_SREG_R64<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int64Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+              [(set Int64Regs:$d, (intop))]>;
+
+class PTX_READ_SREG_R32<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int32Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+              [(set Int32Regs:$d, (intop))]>;
+
+// TODO Add read vector-version of special registers
+
+def INT_PTX_SREG_TID_X :
+    PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y :
+    PTX_READ_SREG_R32<"tid.y", int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z :
+    PTX_READ_SREG_R32<"tid.z", int_nvvm_read_ptx_sreg_tid_z>;
+def INT_PTX_SREG_TID_W :
+    PTX_READ_SREG_R32<"tid.w", int_nvvm_read_ptx_sreg_tid_w>;
+
+def INT_PTX_SREG_NTID_X :
+    PTX_READ_SREG_R32<"ntid.x", int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y :
+    PTX_READ_SREG_R32<"ntid.y", int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z :
+    PTX_READ_SREG_R32<"ntid.z", int_nvvm_read_ptx_sreg_ntid_z>;
+def INT_PTX_SREG_NTID_W :
+    PTX_READ_SREG_R32<"ntid.w", int_nvvm_read_ptx_sreg_ntid_w>;
+
+def INT_PTX_SREG_LANEID :
+    PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
+def INT_PTX_SREG_WARPID :
+    PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
+def INT_PTX_SREG_NWARPID :
+    PTX_READ_SREG_R32<"nwarpid", int_nvvm_read_ptx_sreg_nwarpid>;
+
+def INT_PTX_SREG_CTAID_X :
+    PTX_READ_SREG_R32<"ctaid.x", int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y :
+    PTX_READ_SREG_R32<"ctaid.y", int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z :
+    PTX_READ_SREG_R32<"ctaid.z", int_nvvm_read_ptx_sreg_ctaid_z>;
+def INT_PTX_SREG_CTAID_W :
+    PTX_READ_SREG_R32<"ctaid.w", int_nvvm_read_ptx_sreg_ctaid_w>;
+
+def INT_PTX_SREG_NCTAID_X :
+    PTX_READ_SREG_R32<"nctaid.x", int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y :
+    PTX_READ_SREG_R32<"nctaid.y", int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z :
+    PTX_READ_SREG_R32<"nctaid.z", int_nvvm_read_ptx_sreg_nctaid_z>;
+def INT_PTX_SREG_NCTAID_W :
+    PTX_READ_SREG_R32<"nctaid.w", int_nvvm_read_ptx_sreg_nctaid_w>;
+
+def INT_PTX_SREG_SMID :
+    PTX_READ_SREG_R32<"smid", int_nvvm_read_ptx_sreg_smid>;
+def INT_PTX_SREG_NSMID :
+    PTX_READ_SREG_R32<"nsmid", int_nvvm_read_ptx_sreg_nsmid>;
+def INT_PTX_SREG_GRIDID :
+    PTX_READ_SREG_R32<"gridid", int_nvvm_read_ptx_sreg_gridid>;
+
+def INT_PTX_SREG_LANEMASK_EQ :
+    PTX_READ_SREG_R32<"lanemask_eq", int_nvvm_read_ptx_sreg_lanemask_eq>;
+def INT_PTX_SREG_LANEMASK_LE :
+    PTX_READ_SREG_R32<"lanemask_le", int_nvvm_read_ptx_sreg_lanemask_le>;
+def INT_PTX_SREG_LANEMASK_LT :
+    PTX_READ_SREG_R32<"lanemask_lt", int_nvvm_read_ptx_sreg_lanemask_lt>;
+def INT_PTX_SREG_LANEMASK_GE :
+    PTX_READ_SREG_R32<"lanemask_ge", int_nvvm_read_ptx_sreg_lanemask_ge>;
+def INT_PTX_SREG_LANEMASK_GT :
+    PTX_READ_SREG_R32<"lanemask_gt", int_nvvm_read_ptx_sreg_lanemask_gt>;
+
+def INT_PTX_SREG_CLOCK :
+    PTX_READ_SREG_R32<"clock", int_nvvm_read_ptx_sreg_clock>;
+def INT_PTX_SREG_CLOCK64 :
+    PTX_READ_SREG_R64<"clock64", int_nvvm_read_ptx_sreg_clock64>;
+
+def INT_PTX_SREG_PM0 : PTX_READ_SREG_R32<"pm0", int_nvvm_read_ptx_sreg_pm0>;
+def INT_PTX_SREG_PM1 : PTX_READ_SREG_R32<"pm1", int_nvvm_read_ptx_sreg_pm1>;
+def INT_PTX_SREG_PM2 : PTX_READ_SREG_R32<"pm2", int_nvvm_read_ptx_sreg_pm2>;
+def INT_PTX_SREG_PM3 : PTX_READ_SREG_R32<"pm3", int_nvvm_read_ptx_sreg_pm3>;
+
+// TODO: It would be nice to use PTX_READ_SREG here, but it doesn't
+// handle the constant.
+def INT_PTX_SREG_WARPSIZE :
+    NVPTXInst<(outs Int32Regs:$dst), (ins), "mov.u32 \t$dst, WARP_SZ;",
+              [(set Int32Regs:$dst, (int_nvvm_read_ptx_sreg_warpsize))]>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 000000000000..b925b632ee4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,349 @@
+//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when
+// the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "nvptx"
+
+using namespace llvm;
+
+namespace {
+
+// actual analysis class, which is a functionpass
+struct NVPTXLowerAggrCopies : public FunctionPass {
+  static char ID;
+
+  NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<StackProtector>();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  static const unsigned MaxAggrCopySize = 128;
+
+  StringRef getPassName() const override {
+    return "Lower aggregate copies/intrinsics into loops";
+  }
+};
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower memcpy to loop.
+void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+                         Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+                         bool DstIsVolatile, LLVMContext &Context,
+                         Function &F) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+  BasicBlock *NewBB =
+      ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
+  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
+
+  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // SrcAddr and DstAddr are expected to be pointer types,
+  // so no check is made here.
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+  // load from SrcAddr+LoopIndex
+  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+  // word-sized loads and stores.
+  Value *Element =
+      LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+                                 LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+                             SrcIsVolatile);
+  // store at DstAddr+LoopIndex
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+                                                        DstAddr, LoopIndex),
+                          DstIsVolatile);
+
+  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+//   unsigned char* d = dst;
+//   const unsigned char* s = src;
+//   if (s < d) {
+//     // copy backwards
+//     while (n--) {
+//       d[n] = s[n];
+//     }
+//   } else {
+//     // copy forward
+//     for (size_t i = 0; i < n; ++i) {
+//       d[i] = s[i];
+//     }
+//   }
+//   return dst;
+// }
+void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
+                          Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
+                          bool DstIsVolatile, LLVMContext &Context,
+                          Function &F) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+
+  // Create the a comparison of src and dst, based on which we jump to either
+  // the forward-copy part of the function (if src >= dst) or the backwards-copy
+  // part (if src < dst).
+  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+  // structure. Its block terminators (unconditional branches) are replaced by
+  // the appropriate conditional branches when the loop is built.
+  ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
+                                      SrcAddr, DstAddr, "compare_src_dst");
+  TerminatorInst *ThenTerm, *ElseTerm;
+  SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
+                                &ElseTerm);
+
+  // Each part of the function consists of two blocks:
+  //   copy_backwards:        used to skip the loop when n == 0
+  //   copy_backwards_loop:   the actual backwards loop BB
+  //   copy_forward:          used to skip the loop when n == 0
+  //   copy_forward_loop:     the actual forward loop BB
+  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+  CopyBackwardsBB->setName("copy_backwards");
+  BasicBlock *CopyForwardBB = ElseTerm->getParent();
+  CopyForwardBB->setName("copy_forward");
+  BasicBlock *ExitBB = ConvertedInst->getParent();
+  ExitBB->setName("memmove_done");
+
+  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+  // between both backwards and forward copy clauses.
+  ICmpInst *CompareN =
+      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+  // Copying backwards.
+  BasicBlock *LoopBB =
+      BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  Value *IndexPtr = LoopBuilder.CreateSub(
+      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+  Value *Element = LoopBuilder.CreateLoad(
+      LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+      ExitBB, LoopBB);
+  LoopPhi->addIncoming(IndexPtr, LoopBB);
+  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+  ThenTerm->eraseFromParent();
+
+  // Copying forward.
+  BasicBlock *FwdLoopBB =
+      BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
+  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+  Value *FwdElement = FwdLoopBuilder.CreateLoad(
+      FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+  FwdLoopBuilder.CreateStore(
+      FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+                              ExitBB, FwdLoopBB);
+  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+  ElseTerm->eraseFromParent();
+}
+
+// Lower memset to loop.
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
+                         Value *CopyLen, Value *SetValue, LLVMContext &Context,
+                         Function &F) {
+  BasicBlock *OrigBB = ConvertedInst->getParent();
+  BasicBlock *NewBB =
+      ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
+  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
+
+  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // Cast pointer to the type of value getting stored
+  unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+  DstAddr = Builder.CreateBitCast(DstAddr,
+                                  PointerType::get(SetValue->getType(), dstAS));
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
+  LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
+
+  LoopBuilder.CreateStore(
+      SetValue,
+      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+      false);
+
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+  SmallVector<LoadInst *, 4> AggrLoads;
+  SmallVector<MemIntrinsic *, 4> MemCalls;
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  LLVMContext &Context = F.getParent()->getContext();
+
+  // Collect all aggregate loads and mem* calls.
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(II)) {
+        if (!LI->hasOneUse())
+          continue;
+
+        if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize)
+          continue;
+
+        if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) {
+          if (SI->getOperand(0) != LI)
+            continue;
+          AggrLoads.push_back(LI);
+        }
+      } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) {
+        // Convert intrinsic calls with variable size or with constant size
+        // larger than the MaxAggrCopySize threshold.
+        if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) {
+          if (LenCI->getZExtValue() >= MaxAggrCopySize) {
+            MemCalls.push_back(IntrCall);
+          }
+        } else {
+          MemCalls.push_back(IntrCall);
+        }
+      }
+    }
+  }
+
+  if (AggrLoads.size() == 0 && MemCalls.size() == 0) {
+    return false;
+  }
+
+  //
+  // Do the transformation of an aggr load/copy/set to a loop
+  //
+  for (LoadInst *LI : AggrLoads) {
+    StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin());
+    Value *SrcAddr = LI->getOperand(0);
+    Value *DstAddr = SI->getOperand(1);
+    unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
+    Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
+
+    convertMemCpyToLoop(/* ConvertedInst */ SI,
+                        /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                        /* CopyLen */ CopyLen,
+                        /* SrcIsVolatile */ LI->isVolatile(),
+                        /* DstIsVolatile */ SI->isVolatile(),
+                        /* Context */ Context,
+                        /* Function F */ F);
+
+    SI->eraseFromParent();
+    LI->eraseFromParent();
+  }
+
+  // Transform mem* intrinsic calls.
+  for (MemIntrinsic *MemCall : MemCalls) {
+    if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
+      convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
+                          /* SrcAddr */ Memcpy->getRawSource(),
+                          /* DstAddr */ Memcpy->getRawDest(),
+                          /* CopyLen */ Memcpy->getLength(),
+                          /* SrcIsVolatile */ Memcpy->isVolatile(),
+                          /* DstIsVolatile */ Memcpy->isVolatile(),
+                          /* Context */ Context,
+                          /* Function F */ F);
+    } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
+      convertMemMoveToLoop(/* ConvertedInst */ Memmove,
+                           /* SrcAddr */ Memmove->getRawSource(),
+                           /* DstAddr */ Memmove->getRawDest(),
+                           /* CopyLen */ Memmove->getLength(),
+                           /* SrcIsVolatile */ Memmove->isVolatile(),
+                           /* DstIsVolatile */ Memmove->isVolatile(),
+                           /* Context */ Context,
+                           /* Function F */ F);
+
+    } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
+      convertMemSetToLoop(/* ConvertedInst */ Memset,
+                          /* DstAddr */ Memset->getRawDest(),
+                          /* CopyLen */ Memset->getLength(),
+                          /* SetValue */ Memset->getValue(),
+                          /* Context */ Context,
+                          /* Function F */ F);
+    }
+    MemCall->eraseFromParent();
+  }
+
+  return true;
+}
+
+} // namespace
+
+namespace llvm {
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies",
+                "Lower aggregate copies, and llvm.mem* intrinsics into loops",
+                false, false)
+
+FunctionPass *llvm::createLowerAggrCopies() {
+  return new NVPTXLowerAggrCopies();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 000000000000..3c39f53eb30a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,24 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA specific lowering of
+// aggregate copies
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
+
+namespace llvm {
+class FunctionPass;
+
+FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
new file mode 100644
index 000000000000..e94c1914029d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -0,0 +1,118 @@
+//===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For all alloca instructions, and add a pair of cast to local address for
+// each of them. For example,
+//
+//   %A = alloca i32
+//   store i32 0, i32* %A ; emits st.u32
+//
+// will be transformed to
+//
+//   %A = alloca i32
+//   %Local = addrspacecast i32* %A to i32 addrspace(5)*
+//   %Generic = addrspacecast i32 addrspace(5)* %A to i32*
+//   store i32 0, i32 addrspace(5)* %Generic ; emits st.local.u32
+//
+// And we will rely on NVPTXInferAddressSpaces to combine the last two
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXLowerAlloca : public BasicBlockPass {
+  bool runOnBasicBlock(BasicBlock &BB) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerAlloca() : BasicBlockPass(ID) {}
+  StringRef getPassName() const override {
+    return "convert address space of alloca'ed memory to local";
+  }
+};
+} // namespace
+
+char NVPTXLowerAlloca::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
+                "Lower Alloca", false, false)
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
+  if (skipBasicBlock(BB))
+    return false;
+
+  bool Changed = false;
+  for (auto &I : BB) {
+    if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
+      Changed = true;
+      auto PTy = dyn_cast<PointerType>(allocaInst->getType());
+      auto ETy = PTy->getElementType();
+      auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+      auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
+      auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+      auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
+                                                    GenericAddrTy, "");
+      NewASCToLocal->insertAfter(allocaInst);
+      NewASCToGeneric->insertAfter(NewASCToLocal);
+      for (Value::use_iterator UI = allocaInst->use_begin(),
+                                UE = allocaInst->use_end();
+            UI != UE; ) {
+        // Check Load, Store, GEP, and BitCast Uses on alloca and make them
+        // use the converted generic address, in order to expose non-generic
+        // addrspacecast to NVPTXInferAddressSpaces. For other types
+        // of instructions this is unnecessary and may introduce redundant
+        // address cast.
+        const auto &AllocaUse = *UI++;
+        auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
+        if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) {
+          LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
+        if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
+          SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
+        if (GI && GI->getPointerOperand() == allocaInst) {
+          GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
+        if (BI && BI->getOperand(0) == allocaInst) {
+          BI->setOperand(0, NewASCToGeneric);
+          continue;
+        }
+      }
+    }
+  }
+  return Changed;
+}
+
+BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
+  return new NVPTXLowerAlloca();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
new file mode 100644
index 000000000000..3f0c7be7863d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -0,0 +1,253 @@
+//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Arguments to kernel and device functions are passed via param space,
+// which imposes certain restrictions:
+// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
+//
+// Kernel parameters are read-only and accessible only via ld.param
+// instruction, directly or via a pointer. Pointers to kernel
+// arguments can't be converted to generic address space.
+//
+// Device function parameters are directly accessible via
+// ld.param/st.param, but taking the address of one returns a pointer
+// to a copy created in local space which *can't* be used with
+// ld.param/st.param.
+//
+// Copying a byval struct into local memory in IR allows us to enforce
+// the param space restrictions, gives the rest of IR a pointer w/o
+// param space restrictions, and gives us an opportunity to eliminate
+// the copy.
+//
+// Pointer arguments to kernel functions need more work to be lowered:
+//
+// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+//    global address space. This allows later optimizations to emit
+//    ld.global.*/st.global.* for accessing these pointer arguments. For
+//    example,
+//
+//    define void @foo(float* %input) {
+//      %v = load float, float* %input, align 4
+//      ...
+//    }
+//
+//    becomes
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
+//      %v = load float, float* %input3, align 4
+//      ...
+//    }
+//
+//    Later, NVPTXInferAddressSpaces will optimize it to
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %v = load float, float addrspace(1)* %input2, align 4
+//      ...
+//    }
+//
+// 2. Convert pointers in a byval kernel parameter to pointers in the global
+//    address space. As #2, it allows NVPTX to emit more ld/st.global. E.g.,
+//
+//    struct S {
+//      int *x;
+//      int *y;
+//    };
+//    __global__ void foo(S s) {
+//      int *b = s.y;
+//      // use b
+//    }
+//
+//    "b" points to the global address space. In the IR level,
+//
+//    define void @foo({i32*, i32*}* byval %input) {
+//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+//      %b = load i32*, i32** %b_ptr
+//      ; use %b
+//    }
+//
+//    becomes
+//
+//    define void @foo({i32*, i32*}* byval %input) {
+//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
+//      %b = load i32*, i32** %b_ptr
+//      %b_global = addrspacecast i32* %b to i32 addrspace(1)*
+//      %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
+//      ; use %b_generic
+//    }
+//
+// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
+// cancel the addrspacecast pair this pass emits.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerArgsPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXLowerArgs : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+  // handle byval parameters
+  void handleByValParam(Argument *Arg);
+  // Knowing Ptr must point to the global address space, this function
+  // addrspacecasts Ptr to global and then back to generic. This allows
+  // NVPTXInferAddressSpaces to fold the global-to-generic cast into
+  // loads/stores that appear later.
+  void markPointerAsGlobal(Value *Ptr);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+  StringRef getPassName() const override {
+    return "Lower pointer arguments of CUDA kernels";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXLowerArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
+                "Lower arguments (NVPTX)", false, false)
+
+// =============================================================================
+// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
+// then add the following instructions to the first basic block:
+//
+// %temp = alloca %struct.x, align 8
+// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
+// %tv = load %struct.x addrspace(101)* %tempd
+// store %struct.x %tv, %struct.x* %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurrences of %d by %temp.
+// =============================================================================
+void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
+  Function *Func = Arg->getParent();
+  Instruction *FirstInst = &(Func->getEntryBlock().front());
+  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
+
+  assert(PType && "Expecting pointer type in handleByValParam");
+
+  Type *StructType = PType->getElementType();
+  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+  // Set the alignment to alignment of the byval parameter. This is because,
+  // later load/stores assume that alignment, and we are going to replace
+  // the use of the byval parameter with this alloca instruction.
+  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+  Arg->replaceAllUsesWith(AllocA);
+
+  Value *ArgInParam = new AddrSpaceCastInst(
+      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+      FirstInst);
+  LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
+  new StoreInst(LI, AllocA, FirstInst);
+}
+
+void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
+  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
+    return;
+
+  // Deciding where to emit the addrspacecast pair.
+  BasicBlock::iterator InsertPt;
+  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+    // Insert at the functon entry if Ptr is an argument.
+    InsertPt = Arg->getParent()->getEntryBlock().begin();
+  } else {
+    // Insert right after Ptr if Ptr is an instruction.
+    InsertPt = ++cast<Instruction>(Ptr)->getIterator();
+    assert(InsertPt != InsertPt->getParent()->end() &&
+           "We don't call this function with Ptr being a terminator.");
+  }
+
+  Instruction *PtrInGlobal = new AddrSpaceCastInst(
+      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_GLOBAL),
+      Ptr->getName(), &*InsertPt);
+  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
+                                              Ptr->getName(), &*InsertPt);
+  // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
+  Ptr->replaceAllUsesWith(PtrInGeneric);
+  PtrInGlobal->setOperand(0, Ptr);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
+  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
+    // Mark pointers in byval structs as global.
+    for (auto &B : F) {
+      for (auto &I : B) {
+        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+          if (LI->getType()->isPointerTy()) {
+            Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
+                                            F.getParent()->getDataLayout());
+            if (Argument *Arg = dyn_cast<Argument>(UO)) {
+              if (Arg->hasByValAttr()) {
+                // LI is a load from a pointer within a byval kernel parameter.
+                markPointerAsGlobal(LI);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (Argument &Arg : F.args()) {
+    if (Arg.getType()->isPointerTy()) {
+      if (Arg.hasByValAttr())
+        handleByValParam(&Arg);
+      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+        markPointerAsGlobal(&Arg);
+    }
+  }
+  return true;
+}
+
+// Device functions only need to copy byval args into local memory.
+bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
+FunctionPass *
+llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXLowerArgs(TM);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
new file mode 100644
index 000000000000..eab5ee80561e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -0,0 +1,60 @@
+//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-mcexpr"
+
+const NVPTXFloatMCExpr *
+NVPTXFloatMCExpr::create(VariantKind Kind, const APFloat &Flt, MCContext &Ctx) {
+  return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
+}
+
+void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool Ignored;
+  unsigned NumHex;
+  APFloat APF = getAPFloat();
+
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_NVPTX_SINGLE_PREC_FLOAT:
+    OS << "0f";
+    NumHex = 8;
+    APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  case VK_NVPTX_DOUBLE_PREC_FLOAT:
+    OS << "0d";
+    NumHex = 16;
+    APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  }
+
+  APInt API = APF.bitcastToAPInt();
+  std::string HexStr(utohexstr(API.getZExtValue()));
+  if (HexStr.length() < NumHex)
+    OS << std::string(NumHex - HexStr.length(), '0');
+  OS << utohexstr(API.getZExtValue());
+}
+
+const NVPTXGenericMCSymbolRefExpr*
+NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
+                                    MCContext &Ctx) {
+  return new (Ctx) NVPTXGenericMCSymbolRefExpr(SymExpr);
+}
+
+void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
+                                            const MCAsmInfo *MAI) const {
+  OS << "generic(";
+  SymExpr->print(OS, MAI);
+  OS << ")";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
new file mode 100644
index 000000000000..7f833c42fa8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -0,0 +1,125 @@
+//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Modeled after ARMMCExpr
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/MC/MCExpr.h"
+#include <utility>
+
+namespace llvm {
+
+class NVPTXFloatMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NVPTX_None,
+    VK_NVPTX_SINGLE_PREC_FLOAT,   // FP constant in single-precision
+    VK_NVPTX_DOUBLE_PREC_FLOAT    // FP constant in double-precision
+  };
+
+private:
+  const VariantKind Kind;
+  const APFloat Flt;
+
+  explicit NVPTXFloatMCExpr(VariantKind Kind, APFloat Flt)
+      : Kind(Kind), Flt(std::move(Flt)) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
+                                        MCContext &Ctx);
+
+  static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
+                                                        MCContext &Ctx) {
+    return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  static const NVPTXFloatMCExpr *createConstantFPDouble(const APFloat &Flt,
+                                                        MCContext &Ctx) {
+    return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  APFloat getAPFloat() const { return Flt; }
+
+/// @}
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override {
+    return false;
+  }
+  void visitUsedExpr(MCStreamer &Streamer) const override {};
+  MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+  // There are no TLS NVPTXMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+
+/// A wrapper for MCSymbolRefExpr that tells the assembly printer that the
+/// symbol should be enclosed by generic().
+class NVPTXGenericMCSymbolRefExpr : public MCTargetExpr {
+private:
+  const MCSymbolRefExpr *SymExpr;
+
+  explicit NVPTXGenericMCSymbolRefExpr(const MCSymbolRefExpr *_SymExpr)
+      : SymExpr(_SymExpr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const NVPTXGenericMCSymbolRefExpr
+  *create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  const MCSymbolRefExpr *getSymbolExpr() const { return SymExpr; }
+
+  /// @}
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override {
+    return false;
+  }
+  void visitUsedExpr(MCStreamer &Streamer) const override {};
+  MCFragment *findAssociatedFragment() const override { return nullptr; }
+
+  // There are no TLS NVPTXMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 000000000000..10f1135ad841
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,51 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info  --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// Stores a mapping from index to symbol name for removing image handles
+  /// on Fermi.
+  SmallVector<std::string, 8> ImageHandleList;
+
+public:
+  NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+  /// Returns the index for the symbol \p Symbol. If the symbol was previously,
+  /// added, the same index is returned. Otherwise, the symbol is added and the
+  /// new index is returned.
+  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+    // Is the symbol already present?
+    for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+      if (ImageHandleList[i] == std::string(Symbol))
+        return i;
+    // Nope, insert it
+    ImageHandleList.push_back(Symbol);
+    return ImageHandleList.size()-1;
+  }
+
+  /// Returns the symbol name at the given index.
+  const char *getImageHandleSymbol(unsigned Idx) const {
+    assert(ImageHandleList.size() > Idx && "Bad index");
+    return ImageHandleList[Idx].c_str();
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
new file mode 100644
index 000000000000..49e639793efc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -0,0 +1,157 @@
+//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimiztions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// In NVPTX, NVPTXFrameLowering will emit following instruction at the beginning
+// of a MachineFunction.
+//
+//   mov %SPL, %depot
+//   cvta.local %SP, %SPL
+//
+// Because Frame Index is a generic address and alloca can only return generic
+// pointer, without this pass the instructions producing alloca'ed address will
+// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
+// this address with their .local versions, but this may introduce a lot of
+// cvta.to.local instructions. Performance can be improved if we avoid casting
+// address back and forth and directly calculate local address based on %SPL.
+// This peephole pass optimizes these cases, for example
+//
+// It will transform the following pattern
+//    %vreg0<def> = LEA_ADDRi64 %VRFrame, 4
+//    %vreg1<def> = cvta_to_local_yes_64 %vreg0
+//
+// into
+//    %vreg1<def> = LEA_ADDRi64 %VRFrameLocal, 4
+//
+// %VRFrameLocal is the virtual register name of %SPL
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-peephole"
+
+namespace llvm {
+void initializeNVPTXPeepholePass(PassRegistry &);
+}
+
+namespace {
+struct NVPTXPeephole : public MachineFunctionPass {
+ public:
+  static char ID;
+  NVPTXPeephole() : MachineFunctionPass(ID) {
+    initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "NVPTX optimize redundant cvta.to.local instruction";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char NVPTXPeephole::ID = 0;
+
+INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
+
+static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  // Check current instruction is cvta.to.local
+  if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 &&
+      Root.getOpcode() != NVPTX::cvta_to_local_yes)
+    return false;
+
+  auto &Op = Root.getOperand(1);
+  const auto &MRI = MF.getRegInfo();
+  MachineInstr *GenericAddrDef = nullptr;
+  if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
+  }
+
+  // Check the register operand is uniquely defined by LEA_ADDRi instruction
+  if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
+      (GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
+       GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
+    return false;
+  }
+
+  // Check the LEA_ADDRi operand is Frame index
+  auto &BaseAddrOp = GenericAddrDef->getOperand(1);
+  if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NVPTX::VRFrame) {
+    return true;
+  }
+
+  return false;
+}
+
+static void CombineCVTAToLocal(MachineInstr &Root) {
+  auto &MBB = *Root.getParent();
+  auto &MF = *MBB.getParent();
+  const auto &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
+              Root.getOperand(0).getReg())
+          .addReg(NVPTX::VRFrameLocal)
+          .addOperand(Prev.getOperand(2));
+
+  MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
+
+  // Check if MRI has only one non dbg use, which is Root
+  if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
+    Prev.eraseFromParentAndMarkDBGValuesForRemoval();
+  }
+  Root.eraseFromParentAndMarkDBGValuesForRemoval();
+}
+
+bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  bool Changed = false;
+  // Loop over all of the basic blocks.
+  for (auto &MBB : MF) {
+    // Traverse the basic block.
+    auto BlockIter = MBB.begin();
+
+    while (BlockIter != MBB.end()) {
+      auto &MI = *BlockIter++;
+      if (isCVTAToLocalCombinationCandidate(MI)) {
+        CombineCVTAToLocal(MI);
+        Changed = true;
+      }
+    }  // Instruction
+  }    // Basic Block
+
+  // Remove unnecessary %VRFrame = cvta.local %VRFrameLocal
+  const auto &MRI = MF.getRegInfo();
+  if (MRI.use_empty(NVPTX::VRFrame)) {
+    if (auto MI = MRI.getUniqueVRegDef(NVPTX::VRFrame)) {
+      MI->eraseFromParentAndMarkDBGValuesForRemoval();
+    }
+  }
+
+  return Changed;
+}
+
+MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 000000000000..88288abe64f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,227 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-prolog-epilog"
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+  return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetFrameLowering &TFI = *STI.getFrameLowering();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  bool Modified = false;
+
+  calculateFrameObjectOffsets(MF);
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+        TRI.eliminateFrameIndex(MI, 0, i, nullptr);
+        Modified = true;
+      }
+    }
+  }
+
+  // Add function prolog/epilog
+  TFI.emitPrologue(MF, MF.front());
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (I->isReturnBlock())
+      TFI.emitEpilogue(MF, *I);
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, Offset);
+    Offset += MFI.getObjectSize(FrameIdx);
+  }
+}
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo &MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0
+         && "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI.getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // No scavenger
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI.getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI.getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+            FIOffset << "]\n");
+      MFI.setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block
+    Offset += MFI.getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (MFI.isObjectPreAllocated(i) &&
+        MFI.getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI.isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI.getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment.  If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative to
+    // SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI.setStackSize(StackSize);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 000000000000..6cbf0604d7ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,128 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXRegisterInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+namespace llvm {
+std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return ".f32";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return ".f64";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    // We use untyped (.b) integer registers here as NVCC does.
+    // Correctness of generated code does not depend on register type,
+    // but using .s/.u registers runs into ptxas bug that prevents
+    // assembly of otherwise valid PTX into SASS. Despite PTX ISA
+    // specifying only argument size for fp16 instructions, ptxas does
+    // not allow using .s16 or .u16 arguments for .fp16
+    // instructions. At the same time it allows using .s32/.u32
+    // arguments for .fp16v2 instructions:
+    //
+    //   .reg .b16 rb16
+    //   .reg .s16 rs16
+    //   add.f16 rb16,rb16,rb16; // OK
+    //   add.f16 rs16,rs16,rs16; // Arguments mismatch for instruction 'add'
+    // but:
+    //   .reg .b32 rb32
+    //   .reg .s32 rs32
+    //   add.f16v2 rb32,rb32,rb32; // OK
+    //   add.f16v2 rs32,rs32,rs32; // OK
+    return ".b64";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return ".b32";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return ".b16";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return ".pred";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+
+std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return "%f";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return "%fd";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    return "%rd";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return "%r";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return "%rs";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return "%p";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+}
+
+NVPTXRegisterInfo::NVPTXRegisterInfo() : NVPTXGenRegisterInfo(0) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+const MCPhysReg *
+NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+  static const MCPhysReg CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+               MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Using I0 as the frame pointer
+  MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRFrame;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 000000000000..c310a9c1ad0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,65 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+
+namespace llvm {
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+  // Hold Strings that can be free'd all together with NVPTXRegisterInfo
+  ManagedStringPool ManagedStrPool;
+
+public:
+  NVPTXRegisterInfo();
+
+  //------------------------------------------------------
+  // Pure virtual functions from TargetRegisterInfo
+  //------------------------------------------------------
+
+  // NVPTX callee saved registers
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  ManagedStringPool *getStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  const char *getName(unsigned RegNo) const {
+    std::stringstream O;
+    O << "reg" << RegNo;
+    return getStrPool()->getManagedString(O.str().c_str())->c_str();
+  }
+
+};
+
+std::string getNVPTXRegClassName(const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 000000000000..ff6ccc457db7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,69 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+  let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+     : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame         : NVPTXReg<"%SP">;
+def VRFrameLocal    : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot  : NVPTXReg<"%Depot">;
+
+// We use virtual registers, but define a few physical registers here to keep
+// SDAG and the MachineInstr layers happy.
+foreach i = 0-4 in {
+  def P#i  : NVPTXReg<"%p"#i>;  // Predicate
+  def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+  def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
+  def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+  def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
+  def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
+
+  // Arguments
+  def ia#i : NVPTXReg<"%ia"#i>;
+  def la#i : NVPTXReg<"%la"#i>;
+  def fa#i : NVPTXReg<"%fa"#i>;
+  def da#i : NVPTXReg<"%da"#i>;
+}
+
+foreach i = 0-31 in {
+  def ENVREG#i : NVPTXReg<"%envreg"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register classes
+//===----------------------------------------------------------------------===//
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+                                            (sequence "ENVREG%u", 0, 31))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 000000000000..2022caca76ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,191 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, inlining of all function call must be performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+  static char ID;
+  DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+  NVPTXReplaceImageHandles();
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "NVPTX Replace Image Handles";
+  }
+private:
+  bool processInstr(MachineInstr &MI);
+  void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+  bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
+                          unsigned &Idx);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+  : MachineFunctionPass(ID) {}
+
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  InstrsToRemove.clear();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      Changed |= processInstr(MI);
+    }
+  }
+
+  // Now clean up any handle-access instructions
+  // This is needed in debug mode when code cleanup passes are not executed,
+  // but we need the handle access to be eliminated because they are not
+  // valid instructions when image handles are disabled.
+  for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(),
+       E = InstrsToRemove.end(); I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+  return Changed;
+}
+
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    MachineOperand &TexHandle = MI.getOperand(4);
+    replaceImageHandle(TexHandle, MF);
+
+    if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+      MachineOperand &SampHandle = MI.getOperand(5);
+      replaceImageHandle(SampHandle, MF);
+    }
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    MachineOperand &SurfHandle = MI.getOperand(VecSize);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
+    // This is a surface store, so operand 0 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(0);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
+    // This is a query, so operand 1 is a surfref/texref
+    MachineOperand &Handle = MI.getOperand(1);
+
+    replaceImageHandle(Handle, MF);
+
+    return true;
+  }
+
+  return false;
+}
+
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  unsigned Idx;
+  if (findIndexForHandle(Op, MF, Idx)) {
+    Op.ChangeToImmediate(Idx);
+  }
+}
+
+bool NVPTXReplaceImageHandles::
+findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+
+  assert(Op.isReg() && "Handle is not in a reg?");
+
+  // Which instruction defines the handle?
+  MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());
+
+  switch (TexHandleDef.getOpcode()) {
+  case NVPTX::LD_i64_avar: {
+    // The handle is a parameter value being loaded, replace with the
+    // parameter symbol
+    const NVPTXTargetMachine &TM =
+        static_cast<const NVPTXTargetMachine &>(MF.getTarget());
+    if (TM.getDrvInterface() == NVPTX::CUDA) {
+      // For CUDA, we preserve the param loads coming from function arguments
+      return false;
+    }
+
+    assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+    StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+    std::string ParamBaseName = MF.getName();
+    ParamBaseName += "_param_";
+    assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+    unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+    std::string NewSym;
+    raw_string_ostream NewSymStr(NewSym);
+    NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+
+    InstrsToRemove.insert(&TexHandleDef);
+    Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+    return true;
+  }
+  case NVPTX::texsurf_handles: {
+    // The handle is a global variable, replace with the global variable name
+    assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+    const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+    assert(GV->hasName() && "Global sampler must be named!");
+    InstrsToRemove.insert(&TexHandleDef);
+    Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+    return true;
+  }
+  case NVPTX::nvvm_move_i64:
+  case TargetOpcode::COPY: {
+    bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx);
+    if (Res) {
+      InstrsToRemove.insert(&TexHandleDef);
+    }
+    return Res;
+  }
+  default:
+    llvm_unreachable("Unknown instruction operating on handle");
+  }
+}
+
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+  return new NVPTXReplaceImageHandles();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 000000000000..cad4f5668fdf
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,43 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSection.h"
+
+namespace llvm {
+/// Represents a section in PTX PTX does not have sections. We create this class
+/// in order to use the ASMPrint interface.
+///
+class NVPTXSection final : public MCSection {
+  virtual void anchor();
+public:
+  NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {}
+  ~NVPTXSection() {}
+
+  /// Override this as NVPTX has its own way of printing switching
+  /// to a section.
+  void PrintSwitchToSection(const MCAsmInfo &MAI,
+                            raw_ostream &OS,
+                            const MCExpr *Subsection) const override {}
+
+  /// Base address of PTX sections is zero.
+  bool UseCodeAlign() const override { return false; }
+  bool isVirtualSection() const override { return false; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 000000000000..6e1f427ed021
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,59 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
+NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+    // Provide the default CPU if we don't have one.
+  TargetName = CPU.empty() ? "sm_20" : CPU;
+
+  ParseSubtargetFeatures(TargetName, FS);
+
+  // Set default to PTX 3.2 (CUDA 5.5)
+  if (PTXVersion == 0) {
+    PTXVersion = 32;
+  }
+
+  return *this;
+}
+
+NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
+                               const std::string &FS,
+                               const NVPTXTargetMachine &TM)
+    : NVPTXGenSubtargetInfo(TT, CPU, FS), PTXVersion(0), SmVersion(20), TM(TM),
+      InstrInfo(), TLInfo(TM, initializeSubtargetDependencies(CPU, FS)),
+      FrameLowering() {}
+
+bool NVPTXSubtarget::hasImageHandles() const {
+  // Enable handles for Kepler+, where CUDA supports indirect surfaces and
+  // textures
+  if (TM.getDrvInterface() == NVPTX::CUDA)
+    return (SmVersion >= 30);
+
+  // Disabled, otherwise
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 000000000000..da020a94bcdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,116 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
+
+#include "NVPTX.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+  virtual void anchor();
+  std::string TargetName;
+
+  // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned PTXVersion;
+
+  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned int SmVersion;
+
+  const NVPTXTargetMachine &TM;
+  NVPTXInstrInfo InstrInfo;
+  NVPTXTargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+  // NVPTX does not have any call stack frame, but need a NVPTX specific
+  // FrameLowering class because TargetFrameLowering is abstract.
+  NVPTXFrameLowering FrameLowering;
+
+protected:
+  // Processor supports scoped atomic operations.
+  bool HasAtomScope;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  NVPTXSubtarget(const Triple &TT, const std::string &CPU,
+                 const std::string &FS, const NVPTXTargetMachine &TM);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const NVPTXRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const NVPTXTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  bool hasBrkPt() const { return SmVersion >= 11; }
+  bool hasAtomRedG32() const { return SmVersion >= 11; }
+  bool hasAtomRedS32() const { return SmVersion >= 12; }
+  bool hasAtomRedG64() const { return SmVersion >= 12; }
+  bool hasAtomRedS64() const { return SmVersion >= 20; }
+  bool hasAtomRedGen32() const { return SmVersion >= 20; }
+  bool hasAtomRedGen64() const { return SmVersion >= 20; }
+  bool hasAtomAddF32() const { return SmVersion >= 20; }
+  bool hasAtomAddF64() const { return SmVersion >= 60; }
+  bool hasAtomScope() const { return HasAtomScope; }
+  bool hasAtomBitwise64() const { return SmVersion >= 32; }
+  bool hasAtomMinMax64() const { return SmVersion >= 32; }
+  bool hasVote() const { return SmVersion >= 12; }
+  bool hasDouble() const { return SmVersion >= 13; }
+  bool reqPTX20() const { return SmVersion >= 20; }
+  bool hasF32FTZ() const { return SmVersion >= 20; }
+  bool hasFMAF32() const { return SmVersion >= 20; }
+  bool hasFMAF64() const { return SmVersion >= 13; }
+  bool hasLDG() const { return SmVersion >= 32; }
+  bool hasLDU() const { return ((SmVersion >= 20) && (SmVersion < 30)); }
+  bool hasGenericLdSt() const { return SmVersion >= 20; }
+  inline bool hasHWROT32() const { return SmVersion >= 32; }
+  inline bool hasSWROT32() const {
+    return ((SmVersion >= 20) && (SmVersion < 32));
+  }
+  inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
+  inline bool hasROT64() const { return SmVersion >= 20; }
+  bool hasImageHandles() const;
+
+  unsigned int getSmVersion() const { return SmVersion; }
+  std::string getTargetName() const { return TargetName; }
+
+  unsigned getPTXVersion() const { return PTXVersion; }
+
+  NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 000000000000..6c68a2c9370d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,371 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXAllocaHoisting.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXTargetTransformInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+// LSV is still relatively new; this switch lets us turn it off in case we
+// encounter (or suspect) a bug.
+static cl::opt<bool>
+    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
+                               cl::desc("Disable load/store vectorizer"),
+                               cl::init(false), cl::Hidden);
+
+namespace llvm {
+void initializeNVVMIntrRangePass(PassRegistry&);
+void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
+void initializeNVPTXAllocaHoistingPass(PassRegistry &);
+void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
+void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
+void initializeNVPTXLowerArgsPass(PassRegistry &);
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeNVPTXTarget() {
+  // Register the target.
+  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
+  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
+
+  // FIXME: This pass is really intended to be invoked during IR optimization,
+  // but it's very NVPTX-specific.
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeNVVMReflectPass(PR);
+  initializeNVVMIntrRangePass(PR);
+  initializeGenericToNVVMPass(PR);
+  initializeNVPTXAllocaHoistingPass(PR);
+  initializeNVPTXAssignValidGlobalNamesPass(PR);
+  initializeNVPTXInferAddressSpacesPass(PR);
+  initializeNVPTXLowerArgsPass(PR);
+  initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXLowerAggrCopiesPass(PR);
+}
+
+static std::string computeDataLayout(bool is64Bit) {
+  std::string Ret = "e";
+
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  Ret += "-i64:64-v16:16-v32:32-n16:32:64";
+
+  return Ret;
+}
+
+NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL, bool is64bit)
+    // The pic relocation model is used regardless of what the client has
+    // specified, as it is the only relocation model currently supported.
+    : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
+                        Reloc::PIC_, CM, OL),
+      is64bit(is64bit),
+      TLOF(make_unique<NVPTXTargetObjectFile>()),
+      Subtarget(TT, CPU, FS, *this) {
+  if (TT.getOS() == Triple::NVCL)
+    drvInterface = NVPTX::NVCL;
+  else
+    drvInterface = NVPTX::CUDA;
+  initAsmInfo();
+}
+
+NVPTXTargetMachine::~NVPTXTargetMachine() {}
+
+void NVPTXTargetMachine32::anchor() {}
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void NVPTXTargetMachine64::anchor() {}
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace {
+class NVPTXPassConfig : public TargetPassConfig {
+public:
+  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  NVPTXTargetMachine &getNVPTXTargetMachine() const {
+    return getTM<NVPTXTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPostRegAlloc() override;
+  void addMachineSSAOptimization() override;
+
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
+  // function is only called in opt mode.
+  void addEarlyCSEOrGVNPass();
+
+  // Add passes that propagate special memory spaces.
+  void addAddressSpaceInferencePasses();
+
+  // Add passes that perform straight-line scalar optimizations.
+  void addStraightLineScalarOptimizationPasses();
+};
+} // end anonymous namespace
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new NVPTXPassConfig(this, PM);
+}
+
+void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
+  PM.add(createNVVMReflectPass());
+  PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+}
+
+TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(NVPTXTTIImpl(this, F));
+  });
+}
+
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addAddressSpaceInferencePasses() {
+  // NVPTXLowerArgs emits alloca for byval parameters which can often
+  // be eliminated by SROA.
+  addPass(createSROAPass());
+  addPass(createNVPTXLowerAllocaPass());
+  addPass(createNVPTXInferAddressSpacesPass());
+}
+
+void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
+  addPass(createSeparateConstOffsetFromGEPPass());
+  addPass(createSpeculativeExecutionPass());
+  // ReassociateGEPs exposes more opportunites for SLSR. See
+  // the example in reassociate-geps-and-slsr.ll.
+  addPass(createStraightLineStrengthReducePass());
+  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
+  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
+  // for some of our benchmarks.
+  addEarlyCSEOrGVNPass();
+  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+  addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates redundant common expressions, so run
+  // EarlyCSE after it.
+  addPass(createEarlyCSEPass());
+}
+
+void NVPTXPassConfig::addIRPasses() {
+  // The following passes are known to not play well with virtual regs hanging
+  // around after register allocation (which in our case, is *all* registers).
+  // We explicitly disable them here.  We do, however, need some functionality
+  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+  disablePass(&PrologEpilogCodeInserterID);
+  disablePass(&MachineCopyPropagationID);
+  disablePass(&TailDuplicateID);
+  disablePass(&StackMapLivenessID);
+  disablePass(&LiveDebugValuesID);
+  disablePass(&PostRASchedulerID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&PatchableFunctionID);
+
+  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
+  // it here does nothing.  But since we need it for correctness when lowering
+  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
+  // call addEarlyAsPossiblePasses.
+  addPass(createNVVMReflectPass());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createNVPTXImageOptimizerPass());
+  addPass(createNVPTXAssignValidGlobalNamesPass());
+  addPass(createGenericToNVVMPass());
+
+  // NVPTXLowerArgs is required for correctness and should be run right
+  // before the address space inference passes.
+  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
+  if (getOptLevel() != CodeGenOpt::None) {
+    addAddressSpaceInferencePasses();
+    if (!DisableLoadStoreVectorizer)
+      addPass(createLoadStoreVectorizerPass());
+    addStraightLineScalarOptimizationPasses();
+  }
+
+  // === LSR and other generic IR passes ===
+  TargetPassConfig::addIRPasses();
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  if (getOptLevel() != CodeGenOpt::None)
+    addEarlyCSEOrGVNPass();
+}
+
+bool NVPTXPassConfig::addInstSelector() {
+  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
+
+  addPass(createLowerAggrCopies());
+  addPass(createAllocaHoisting());
+  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+
+  if (!ST.hasImageHandles())
+    addPass(createNVPTXReplaceImageHandlesPass());
+
+  return false;
+}
+
+void NVPTXPassConfig::addPostRegAlloc() {
+  addPass(createNVPTXPrologEpilogPass(), false);
+  if (getOptLevel() != CodeGenOpt::None) {
+    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
+    // index with VRFrame register. NVPTXPeephole need to be run after that and
+    // will replace VRFrame with VRFrameLocal when possible.
+    addPass(createNVPTXPeephole());
+  }
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+  return nullptr; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+  addPass(&PHIEliminationID);
+  addPass(&TwoAddressInstructionPassID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+
+  addPass(&ProcessImplicitDefsID);
+  addPass(&LiveVariablesID);
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
+
+  addPass(&TwoAddressInstructionPassID);
+  addPass(&RegisterCoalescerID);
+
+  // PreRA instruction scheduling.
+  if (addPass(&MachineSchedulerID))
+    printAndVerify("After Machine Scheduling");
+
+
+  addPass(&StackSlotColoringID);
+
+  // FIXME: Needs physical registers
+  //addPass(&PostRAMachineLICMID);
+
+  printAndVerify("After StackSlotColoring");
+}
+
+void NVPTXPassConfig::addMachineSSAOptimization() {
+  // Pre-ra tail duplication.
+  if (addPass(&EarlyTailDuplicateID))
+    printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+  // Optimize PHIs before DCE: removing dead PHI cycles may make more
+  // instructions dead.
+  addPass(&OptimizePHIsID);
+
+  // This pass merges large allocas. StackSlotColoring is a different pass
+  // which merges spill slots.
+  addPass(&StackColoringID);
+
+  // If the target requests it, assign local variables to stack slots relative
+  // to one another and simplify frame index references where possible.
+  addPass(&LocalStackSlotAllocationID);
+
+  // With optimization, dead code should already be eliminated. However
+  // there is one known exception: lowered code for arguments that are only
+  // used by tail calls, where the tail calls reuse the incoming stack
+  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+  addPass(&DeadMachineInstructionElimID);
+  printAndVerify("After codegen DCE pass");
+
+  // Allow targets to insert passes that improve instruction level parallelism,
+  // like if-conversion. Such passes will typically need dominator trees and
+  // loop info, just like LICM and CSE below.
+  if (addILPOpts())
+    printAndVerify("After ILP optimizations");
+
+  addPass(&MachineLICMID);
+  addPass(&MachineCSEID);
+
+  addPass(&MachineSinkingID);
+  printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+  addPass(&PeepholeOptimizerID);
+  printAndVerify("After codegen peephole optimization pass");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 000000000000..78a053831772
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,89 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+
+#include "ManagedStringPool.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine
+///
+class NVPTXTargetMachine : public LLVMTargetMachine {
+  bool is64bit;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  NVPTX::DrvInterface drvInterface;
+  NVPTXSubtarget Subtarget;
+
+  // Hold Strings that can be free'd all together with NVPTXTargetMachine
+  ManagedStringPool ManagedStrPool;
+
+public:
+  NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OP, bool is64bit);
+
+  ~NVPTXTargetMachine() override;
+  const NVPTXSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+  const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  bool is64Bit() const { return is64bit; }
+  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+  ManagedStringPool *getManagedStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  // Emission of machine code through MCJIT is not supported.
+  bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_pwrite_stream &,
+                         bool = true) override {
+    return true;
+  }
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 000000000000..dc367a90594a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,105 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+  NVPTXTargetObjectFile() {
+    TextSection = nullptr;
+    DataSection = nullptr;
+    BSSSection = nullptr;
+    ReadOnlySection = nullptr;
+
+    StaticCtorSection = nullptr;
+    StaticDtorSection = nullptr;
+    LSDASection = nullptr;
+    EHFrameSection = nullptr;
+    DwarfAbbrevSection = nullptr;
+    DwarfInfoSection = nullptr;
+    DwarfLineSection = nullptr;
+    DwarfFrameSection = nullptr;
+    DwarfPubTypesSection = nullptr;
+    DwarfDebugInlineSection = nullptr;
+    DwarfStrSection = nullptr;
+    DwarfLocSection = nullptr;
+    DwarfARangesSection = nullptr;
+    DwarfRangesSection = nullptr;
+    DwarfMacinfoSection = nullptr;
+  }
+
+  virtual ~NVPTXTargetObjectFile();
+
+  void Initialize(MCContext &ctx, const TargetMachine &TM) override {
+    TargetLoweringObjectFile::Initialize(ctx, TM);
+    TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
+    DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData());
+    BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
+    ReadOnlySection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
+
+    StaticCtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    StaticDtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    LSDASection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    EHFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfAbbrevSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfInfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfPubTypesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfDebugInlineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfStrSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLocSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfARangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfRangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfMacinfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+  }
+
+  MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                   const Constant *C,
+                                   unsigned &Align) const override {
+    return ReadOnlySection;
+  }
+
+  MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override {
+    return DataSection;
+  }
+
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
new file mode 100644
index 000000000000..48928ee2d540
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -0,0 +1,154 @@
+//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetTransformInfo.h"
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "NVPTXtti"
+
+// Whether the given intrinsic reads threadIdx.x/y/z.
+static bool readsThreadIndex(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+    default: return false;
+    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+      return true;
+  }
+}
+
+static bool readsLaneId(const IntrinsicInst *II) {
+  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
+}
+
+// Whether the given intrinsic is an atomic instruction in PTX.
+static bool isNVVMAtomic(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+    default: return false;
+    case Intrinsic::nvvm_atomic_load_add_f32:
+    case Intrinsic::nvvm_atomic_load_inc_32:
+    case Intrinsic::nvvm_atomic_load_dec_32:
+
+    case Intrinsic::nvvm_atomic_add_gen_f_cta:
+    case Intrinsic::nvvm_atomic_add_gen_f_sys:
+    case Intrinsic::nvvm_atomic_add_gen_i_cta:
+    case Intrinsic::nvvm_atomic_add_gen_i_sys:
+    case Intrinsic::nvvm_atomic_and_gen_i_cta:
+    case Intrinsic::nvvm_atomic_and_gen_i_sys:
+    case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+    case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+    case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+    case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+    case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+    case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+    case Intrinsic::nvvm_atomic_max_gen_i_cta:
+    case Intrinsic::nvvm_atomic_max_gen_i_sys:
+    case Intrinsic::nvvm_atomic_min_gen_i_cta:
+    case Intrinsic::nvvm_atomic_min_gen_i_sys:
+    case Intrinsic::nvvm_atomic_or_gen_i_cta:
+    case Intrinsic::nvvm_atomic_or_gen_i_sys:
+    case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+    case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+    case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+    case Intrinsic::nvvm_atomic_xor_gen_i_sys:
+      return true;
+  }
+}
+
+bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
+  // Without inter-procedural analysis, we conservatively assume that arguments
+  // to __device__ functions are divergent.
+  if (const Argument *Arg = dyn_cast<Argument>(V))
+    return !isKernelFunction(*Arg->getParent());
+
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Without pointer analysis, we conservatively assume values loaded from
+    // generic or local address space are divergent.
+    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      unsigned AS = LI->getPointerAddressSpace();
+      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
+    }
+    // Atomic instructions may cause divergence. Atomic instructions are
+    // executed sequentially across all threads in a warp. Therefore, an earlier
+    // executed thread may see different memory inputs than a later executed
+    // thread. For example, suppose *a = 0 initially.
+    //
+    //   atom.global.add.s32 d, [a], 1
+    //
+    // returns 0 for the first thread that enters the critical region, and 1 for
+    // the second thread.
+    if (I->isAtomic())
+      return true;
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      // Instructions that read threadIdx are obviously divergent.
+      if (readsThreadIndex(II) || readsLaneId(II))
+        return true;
+      // Handle the NVPTX atomic instrinsics that cannot be represented as an
+      // atomic IR instruction.
+      if (isNVVMAtomic(II))
+        return true;
+    }
+    // Conservatively consider the return value of function calls as divergent.
+    // We could analyze callees with bodies more precisely using
+    // inter-procedural analysis.
+    if (isa<CallInst>(I))
+      return true;
+  }
+
+  return false;
+}
+
+int NVPTXTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  switch (ISD) {
+  default:
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
+    // estimate that arithmetic operations on i64 are twice as expensive as
+    // those on types that can fit into one machine register.
+    if (LT.second.SimpleTy == MVT::i64)
+      return 2 * LT.first;
+    // Delegate other cases to the basic TTI.
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
+  }
+}
+
+void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
+                                           TTI::UnrollingPreferences &UP) {
+  BaseT::getUnrollingPreferences(L, UP);
+
+  // Enable partial unrolling and runtime unrolling, but reduce the
+  // threshold.  This partially unrolls small loops which are often
+  // unrolled by the PTX to SASS compiler and unrolling earlier can be
+  // beneficial.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.Threshold / 4;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
new file mode 100644
index 000000000000..d953aa8a7199
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -0,0 +1,64 @@
+//===-- NVPTXTargetTransformInfo.h - NVPTX specific TTI ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// NVPTX target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETTRANSFORMINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
+  typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const NVPTXSubtarget *ST;
+  const NVPTXTargetLowering *TLI;
+
+  const NVPTXSubtarget *getST() const { return ST; };
+  const NVPTXTargetLowering *getTLI() const { return TLI; };
+
+public:
+  explicit NVPTXTTIImpl(const NVPTXTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+        TLI(ST->getTargetLowering()) {}
+
+  bool hasBranchDivergence() { return true; }
+
+  bool isSourceOfDivergence(const Value *V);
+
+  // Increase the inlining cost threshold by a factor of 5, reflecting that
+  // calls are particularly expensive in NVPTX.
+  unsigned getInliningThresholdMultiplier() { return 5; }
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 000000000000..e464f474b1d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,317 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MutexGuard.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+namespace {
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+} // anonymous namespace
+
+static ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock;
+
+void clearAnnotationCache(const Module *Mod) {
+  MutexGuard Guard(Lock);
+  annotationCache->erase(Mod);
+}
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  MutexGuard Guard(Lock);
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to skip the value for each property-value pairs
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1));
+    assert(Val && "Value operand not a constant int");
+
+    std::string keyname = prop->getString().str();
+    if (retval.find(keyname) != retval.end())
+      retval[keyname].push_back(Val->getZExtValue());
+    else {
+      std::vector<unsigned> tmp;
+      tmp.push_back(Val->getZExtValue());
+      retval[keyname] = tmp;
+    }
+  }
+}
+
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  MutexGuard Guard(Lock);
+  NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    GlobalValue *entity =
+        mdconst::dyn_extract_or_null<GlobalValue>(elem->getOperand(0));
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty()) // no annotations for this gv
+    return;
+
+  if ((*annotationCache).find(m) != (*annotationCache).end())
+    (*annotationCache)[m][gv] = std::move(tmp);
+  else {
+    global_val_annot_t tmp1;
+    tmp1[gv] = std::move(tmp);
+    (*annotationCache)[m] = std::move(tmp1);
+  }
+}
+
+bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
+                           unsigned &retval) {
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop][0];
+  return true;
+}
+
+bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
+                           std::vector<unsigned> &retval) {
+  MutexGuard Guard(Lock);
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop];
+  return true;
+}
+
+bool isTexture(const Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (findOneNVVMAnnotation(gv, "texture", annot)) {
+      assert((annot == 1) && "Unexpected annotation on a texture symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool isSurface(const Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (findOneNVVMAnnotation(gv, "surface", annot)) {
+      assert((annot == 1) && "Unexpected annotation on a surface symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool isSampler(const Value &val) {
+  const char *AnnotationName = "sampler";
+
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (findOneNVVMAnnotation(gv, AnnotationName, annot)) {
+      assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+      return true;
+    }
+  }
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (findAllNVVMAnnotation(func, AnnotationName, annot)) {
+      if (is_contained(annot, arg->getArgNo()))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isImageReadOnly(const Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (findAllNVVMAnnotation(func, "rdoimage", annot)) {
+      if (is_contained(annot, arg->getArgNo()))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isImageWriteOnly(const Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (findAllNVVMAnnotation(func, "wroimage", annot)) {
+      if (is_contained(annot, arg->getArgNo()))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isImageReadWrite(const Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (findAllNVVMAnnotation(func, "rdwrimage", annot)) {
+      if (is_contained(annot, arg->getArgNo()))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool isImage(const Value &val) {
+  return isImageReadOnly(val) || isImageWriteOnly(val) || isImageReadWrite(val);
+}
+
+bool isManaged(const Value &val) {
+  if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (findOneNVVMAnnotation(gv, "managed", annot)) {
+      assert((annot == 1) && "Unexpected annotation on a managed symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+std::string getTextureName(const Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName();
+}
+
+std::string getSurfaceName(const Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName();
+}
+
+std::string getSamplerName(const Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName();
+}
+
+bool getMaxNTIDx(const Function &F, unsigned &x) {
+  return findOneNVVMAnnotation(&F, "maxntidx", x);
+}
+
+bool getMaxNTIDy(const Function &F, unsigned &y) {
+  return findOneNVVMAnnotation(&F, "maxntidy", y);
+}
+
+bool getMaxNTIDz(const Function &F, unsigned &z) {
+  return findOneNVVMAnnotation(&F, "maxntidz", z);
+}
+
+bool getReqNTIDx(const Function &F, unsigned &x) {
+  return findOneNVVMAnnotation(&F, "reqntidx", x);
+}
+
+bool getReqNTIDy(const Function &F, unsigned &y) {
+  return findOneNVVMAnnotation(&F, "reqntidy", y);
+}
+
+bool getReqNTIDz(const Function &F, unsigned &z) {
+  return findOneNVVMAnnotation(&F, "reqntidz", z);
+}
+
+bool getMinCTASm(const Function &F, unsigned &x) {
+  return findOneNVVMAnnotation(&F, "minctasm", x);
+}
+
+bool getMaxNReg(const Function &F, unsigned &x) {
+  return findOneNVVMAnnotation(&F, "maxnreg", x);
+}
+
+bool isKernelFunction(const Function &F) {
+  unsigned x = 0;
+  bool retval = findOneNVVMAnnotation(&F, "kernel", x);
+  if (!retval) {
+    // There is no NVVM metadata, check the calling convention
+    return F.getCallingConv() == CallingConv::PTX_Kernel;
+  }
+  return (x == 1);
+}
+
+bool getAlign(const Function &F, unsigned index, unsigned &align) {
+  std::vector<unsigned> Vs;
+  bool retval = findAllNVVMAnnotation(&F, "align", Vs);
+  if (!retval)
+    return false;
+  for (int i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  if (MDNode *alignNode = I.getMetadata("callalign")) {
+    for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+      if (const ConstantInt *CI =
+              mdconst::dyn_extract<ConstantInt>(alignNode->getOperand(i))) {
+        unsigned v = CI->getZExtValue();
+        if ((v >> 16) == index) {
+          align = v & 0xFFFF;
+          return true;
+        }
+        if ((v >> 16) > index) {
+          return false;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 000000000000..a0cc4e78ac21
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,65 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+void clearAnnotationCache(const Module *);
+
+bool findOneNVVMAnnotation(const GlobalValue *, const std::string &,
+                           unsigned &);
+bool findAllNVVMAnnotation(const GlobalValue *, const std::string &,
+                           std::vector<unsigned> &);
+
+bool isTexture(const Value &);
+bool isSurface(const Value &);
+bool isSampler(const Value &);
+bool isImage(const Value &);
+bool isImageReadOnly(const Value &);
+bool isImageWriteOnly(const Value &);
+bool isImageReadWrite(const Value &);
+bool isManaged(const Value &);
+
+std::string getTextureName(const Value &);
+std::string getSurfaceName(const Value &);
+std::string getSamplerName(const Value &);
+
+bool getMaxNTIDx(const Function &, unsigned &);
+bool getMaxNTIDy(const Function &, unsigned &);
+bool getMaxNTIDz(const Function &, unsigned &);
+
+bool getReqNTIDx(const Function &, unsigned &);
+bool getReqNTIDy(const Function &, unsigned &);
+bool getReqNTIDz(const Function &, unsigned &);
+
+bool getMinCTASm(const Function &, unsigned &);
+bool getMaxNReg(const Function &, unsigned &);
+bool isKernelFunction(const Function &);
+
+bool getAlign(const Function &, unsigned index, unsigned &);
+bool getAlign(const CallInst &, unsigned index, unsigned &);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 000000000000..e69bbba9f193
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1479 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+  NVPTXInst sInst=NOP>
+  : NVPTXInst<outs, ins, asmstr, pattern> {
+  NVPTXInst scalarInst=sInst;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V2I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (extractelt
+                           (v2i16 V2I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V4I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (extractelt
+                           (v4i16 V4I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V2I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (extractelt
+                           (v2i8 V2I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V4I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (extractelt
+                           (v4i8 V4I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V2I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (extractelt
+                           (v2i32 V2I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V2F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (extractelt
+                           (v2f32 V2F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+  (ins V2I64Regs:$src, i8imm:$c),
+                         "mov.u64 \t$dst, $src${c:vecelem};",
+                         [(set Int64Regs:$dst, (extractelt
+                           (v2i64 V2I64Regs:$src), imm:$c))],
+                         IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+  (ins V2F64Regs:$src, i8imm:$c),
+                         "mov.f64 \t$dst, $src${c:vecelem};",
+                         [(set Float64Regs:$dst, (extractelt
+                           (v2f64 V2F64Regs:$src), imm:$c))],
+                         FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V4I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (extractelt
+                           (v4i32 V4I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V4F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (extractelt
+                           (v4f32 V4F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+  (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+        "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+        "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I8Regs:$dst,
+         (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+  (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I8Regs:$dst,
+         (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+  (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I16Regs:$dst,
+         (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+                    IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+  (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I16Regs:$dst,
+         (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+                    IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+  (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V2I32Regs:$dst,
+         (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+                    IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+  (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V2F32Regs:$dst,
+         (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+                    FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+  (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+                       "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+       [(set V2I64Regs:$dst,
+         (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+                    IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+  (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+                       "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+       [(set V2F64Regs:$dst,
+         (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+                    FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+  (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V4I32Regs:$dst,
+         (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+                    IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+  (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V4F32Regs:$dst,
+         (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+                    FMOV32rr>;
+}
+
+class BinOpAsmString<string c> {
+  string s = c;
+}
+
+class V4AsmStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+class V2AsmStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>;
+
+class V4MADStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+class V2MADStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+class V4UnaryStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3;")>;
+
+class V2UnaryStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+                 sInst>;
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+                 NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+                 asmstr.s,
+                 [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+                 sInst>;
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+                     NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+                     i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+    i64op>;
+  def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+    i32op>;
+  def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+    i32op>;
+  def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+    i16op>;
+  def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+    i16op>;
+  def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+    i8op>;
+  def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+    i8op>;
+}
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+                       NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+                       NVPTXInst f32_ftz=NOP> {
+  def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+    V2F64Regs, f64>;
+  def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V4F32Regs, f32>;
+  def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+                       NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+                       NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+    V2I64Regs, i64op>;
+  def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V4I32Regs, i32op>;
+  def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V2I32Regs, i32op>;
+  def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I16Regs, i16op>;
+  def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I16Regs, i16op>;
+  def V4I8  : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I8Regs,   i8op>;
+  def V2I8  : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I8Regs,   i8op>;
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+  ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+  ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+  SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+  SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+  ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+  ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+  SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+  SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+  SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+  SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+  SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+  SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+  SHLi16rr>;
+def ShiftLV2I8  : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs,  V2I32Regs,
+  SHLi8rr>;
+def ShiftLV4I8  : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs,  V4I32Regs,
+  SHLi8rr>;
+}
+
+// cvt to v*i32, helpers for shift
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+class VecCVTStrHelper<string op, string dest, string src> {
+  string s=!strconcat(op, !strconcat("\t",
+           !strconcat(dest, !strconcat(", ", !strconcat(src, ";")))));
+}
+
+class Vec2CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s));
+}
+
+class Vec4CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s))))));
+}
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+  SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+  SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+  SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+  SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+  SRAi16rr>;
+def ShiftRAV2I8  : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs,  V2I32Regs,
+  SRAi8rr>;
+def ShiftRAV4I8  : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs,  V4I32Regs,
+  SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+  SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+  SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+  SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+  SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+  SRLi16rr>;
+def ShiftRLV2I8  : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs,  V2I32Regs,
+  SRLi8rr>;
+def ShiftRLV4I8  : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs,  V4I32Regs,
+  SRLi8rr>;
+
+defm VMult   : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+  MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+  MULTHSi16rr,
+  MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+  MULTHUi16rr,
+  MULTHUi8rr>;
+defm VSDiv   : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+  SDIVi8rr>;
+defm VUDiv   : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+  UDIVi8rr>;
+defm VSRem   : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+  SREMi8rr>;
+defm VURem   : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+  UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+  NVPTXRegClass regclassv2,
+                SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+                Predicate Pred> {
+  def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+    (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+                      V4MADStr<asmstr>.s,
+                      [(set regclassv4:$dst,
+                        (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+                      sop>,
+           Requires<[Pred]>;
+  def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+    (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclassv2:$dst,
+                        (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+                      sop>,
+           Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (add
+                        (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (fadd
+                        (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD  : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+  true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+  true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+  doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+  doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr  : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+          (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c,  V2F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+          (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF32ext_ftz  : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext  : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+          (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c,  V4F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+          (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V4FMAF32ext_ftz  : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext  : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+          (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+          (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+  string t1 = !strconcat("${c", elem);
+  string t2 = !strconcat(t1, ":vecv");
+  string t3 = !strconcat(t2, vecsize);
+  string t4 = !strconcat(t3, extra);
+  string t5 = !strconcat(t4, l);
+  string s =  !strconcat(t5, "}");
+}
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+  string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+  string t2 = !strconcat(t1, "mov.");
+  string t3 = !strconcat(t2, type);
+  string t4 = !strconcat(t3, " \t${dst}_");
+  string t5 = !strconcat(t4, elem);
+  string t6 = !strconcat(t5, ", $src1");
+  string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+  string t8 = !strconcat(t7, ";\n\t");
+  string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+  string t10 = !strconcat(t9, "mov.");
+  string t11 = !strconcat(t10, type);
+  string t12 = !strconcat(t11, " \t${dst}_");
+  string t13 = !strconcat(t12, elem);
+  string t14 = !strconcat(t13, ", $src2");
+  string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+  string s =   !strconcat(t15, ";");
+}
+class ShuffleAsmStr2<string type>
+{
+  string t1 = ShuffleOneLine<"2", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+class ShuffleAsmStr4<string type>
+{
+  string t1 = ShuffleOneLine<"4", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+let hasSideEffects=0, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+                       (ins  V4F32Regs:$src1, V4F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+                       (ins  V4I32Regs:$src1, V4I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+                       (ins  V4I16Regs:$src1, V4I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+                       (ins  V4I8Regs:$src1, V4I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+                       (ins  V2F32Regs:$src1, V2F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+                       (ins  V2I32Regs:$src1, V2I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+                       (ins  V2I8Regs:$src1, V2I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+                       (ins  V2I16Regs:$src1, V2I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+                       (ins  V2F64Regs:$src1, V2F64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f64">.s),
+                       [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+                       (ins  V2I64Regs:$src1, V2I64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u64">.s),
+                       [], IMOV64rr>;
+}
+
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(0), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(1), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(2), SDLoc(N), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(3), SDLoc(N), MVT::i32);
+}]>;
+
+// The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs),
+                       [{ N->getGluedNode(); return true; }]>;
+
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+          (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+          (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+          (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+          (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+          (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+          (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+          (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+          (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+          (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+          (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2),
+                   !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+                   [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+                   si>;
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+               !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+                   [(set vclass:$dst,
+                     (build_vector sclass:$a1, sclass:$a2,
+                       sclass:$a3, sclass:$a4))], si>;
+
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+  FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+  FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+  IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector2_i8  : Build_Vector2<"mov.v2.u16",  V2I8Regs,  Int8Regs,
+  IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+  FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector4_i8  : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+  IMOV8rr>;
+}
+
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+                 : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+                   !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+                   [], sop>;
+
+let isAsCheapAsAMove=1, hasSideEffects=0, IsSimpleMove=1,
+  VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+                        SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+                                    (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+                                    (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+                                    (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+                                    (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+                                    (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+                                    (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+                                    (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+                                    (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+class Select_OneLine<string type, string pos> {
+  string t1 = !strconcat("selp.", type);
+  string t2 = !strconcat(t1, " \t${dst}_");
+  string t3 = !strconcat(t2, pos);
+  string t4 = !strconcat(t3, ", ${src1}_");
+  string t5 = !strconcat(t4, pos);
+  string t6 = !strconcat(t5, ", ${src2}_");
+  string t7 = !strconcat(t6, pos);
+  string s  = !strconcat(t7, ", $p;");
+}
+
+class Select_Str2<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
+
+class Select_Str4<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, Select_OneLine<type, "3">.s);
+
+}
+
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+      : NVPTXVecInst<(outs vclass:$dst),
+                     (ins  vclass:$src1, vclass:$src2, Int1Regs:$p),
+                     asmstr,
+                     [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+                       vclass:$src2))],
+                     sop>;
+
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select  : Vec_Select<V4I8Regs,  Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select  : Vec_Select<V2I8Regs,  Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto   : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETNE)>;
+
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+  NVPTXInst sop>
+    : NVPTXVecInst<(outs outrclass:$dst),
+                   (ins  inrclass:$a, inrclass:$b),
+                   "Unsupported",
+                   [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+                   sop>;
+
+multiclass Vec_Compare_All<PatFrag op,
+                           NVPTXInst inst8,
+                           NVPTXInst inst16,
+                           NVPTXInst inst32,
+                           NVPTXInst inst64>
+{
+  def  V2I8 : Vec_Compare<op, V2I8Regs,  V2I8Regs,  inst8>;
+  def  V4I8 : Vec_Compare<op, V4I8Regs,  V4I8Regs,  inst8>;
+  def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+  def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+  def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+  def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+  def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm VecSGT : Vec_Compare_All<vsetgt,  ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+    ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+  defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+    ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+  defm VecSLT : Vec_Compare_All<vsetlt,  ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+    ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+  defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+    ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+  defm VecSGE : Vec_Compare_All<vsetge,  ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+    ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+  defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+    ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+  defm VecSLE : Vec_Compare_All<vsetle,  ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+    ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+  defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+    ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+  defm VecSEQ : Vec_Compare_All<vseteq,  ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+    ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+  defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+    ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+  defm VecSNE : Vec_Compare_All<vsetne,  ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+    ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+  defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+    ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+multiclass FVec_Compare_All<PatFrag op,
+                            NVPTXInst instf32,
+                            NVPTXInst instf64>
+{
+  def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+  def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+  def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm FVecGT :  FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+    FSetGTf64rr_toi64>;
+  defm FVecLT :  FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+    FSetLTf64rr_toi64>;
+  defm FVecGE :  FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+    FSetGEf64rr_toi64>;
+  defm FVecLE :  FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+    FSetLEf64rr_toi64>;
+  defm FVecEQ :  FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+    FSetEQf64rr_toi64>;
+  defm FVecNE :  FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+    FSetNEf64rr_toi64>;
+
+  defm FVecUGT :  FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+    FSetUGTf64rr_toi64>;
+  defm FVecULT :  FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+    FSetULTf64rr_toi64>;
+  defm FVecUGE :  FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+    FSetUGEf64rr_toi64>;
+  defm FVecULE :  FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+    FSetULEf64rr_toi64>;
+  defm FVecUEQ :  FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+    FSetUEQf64rr_toi64>;
+  defm FVecUNE :  FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+    FSetUNEf64rr_toi64>;
+
+  defm FVecNUM :  FVec_Compare_All<vseto,  FSetNUMf32rr_toi32,
+    FSetNUMf64rr_toi64>;
+  defm FVecNAN :  FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+    FSetNANf64rr_toi64>;
+}
+
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8  : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">;
+def LoadParamScalar2I8  : LoadParamScalar2Inst<Int32Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8  : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8  : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8  : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8  : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+      NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+                "loadparam : $dst <- [$a, $b]",
+                [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+                sop>;
+
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                "storeparam : [$a, $b] <- $val",
+                [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+  NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+                "storeretval : retval[$a] <- $val",
+                [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
+
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32  : LoadParamVecInst<V4I32Regs, ".v4.b32",
+  LoadParamScalar4I32>;
+def LoadParamV4I16  : LoadParamVecInst<V4I16Regs, ".v4.b16",
+  LoadParamScalar4I16>;
+def LoadParamV4I8   : LoadParamVecInst<V4I8Regs, ".v4.b8",
+  LoadParamScalar4I8>;
+
+def LoadParamV2I64  : LoadParamVecInst<V2I64Regs, ".v2.b64",
+  LoadParamScalar2I64>;
+def LoadParamV2I32  : LoadParamVecInst<V2I32Regs, ".v2.b32",
+  LoadParamScalar2I32>;
+def LoadParamV2I16  : LoadParamVecInst<V2I16Regs, ".v2.b16",
+  LoadParamScalar2I16>;
+def LoadParamV2I8   : LoadParamVecInst<V2I8Regs, ".v2.b8",
+  LoadParamScalar2I8>;
+
+def LoadParamV4F32  : LoadParamVecInst<V4F32Regs, ".v4.f32",
+  LoadParamScalar4F32>;
+def LoadParamV2F32  : LoadParamVecInst<V2F32Regs, ".v2.f32",
+  LoadParamScalar2F32>;
+def LoadParamV2F64  : LoadParamVecInst<V2F64Regs, ".v2.f64",
+  LoadParamScalar2F64>;
+}
+
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32  : StoreParamVecInst<V4I32Regs, ".v4.b32",
+  StoreParamScalar4I32>;
+def StoreParamV4I16  : StoreParamVecInst<V4I16Regs, ".v4.b16",
+  StoreParamScalar4I16>;
+def StoreParamV4I8   : StoreParamVecInst<V4I8Regs, ".v4.b8",
+  StoreParamScalar4I8>;
+
+def StoreParamV2I64  : StoreParamVecInst<V2I64Regs, ".v2.b64",
+  StoreParamScalar2I64>;
+def StoreParamV2I32  : StoreParamVecInst<V2I32Regs, ".v2.b32",
+  StoreParamScalar2I32>;
+def StoreParamV2I16  : StoreParamVecInst<V2I16Regs, ".v2.b16",
+  StoreParamScalar2I16>;
+def StoreParamV2I8   : StoreParamVecInst<V2I8Regs, ".v2.b8",
+  StoreParamScalar2I8>;
+
+def StoreParamV4F32  : StoreParamVecInst<V4F32Regs, ".v4.f32",
+  StoreParamScalar4F32>;
+def StoreParamV2F32  : StoreParamVecInst<V2F32Regs, ".v2.f32",
+  StoreParamScalar2F32>;
+def StoreParamV2F64  : StoreParamVecInst<V2F64Regs, ".v2.f64",
+  StoreParamScalar2F64>;
+
+def StoreRetvalV4I32  : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+  StoreRetvalScalar4I32>;
+def StoreRetvalV4I16  : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+  StoreRetvalScalar4I16>;
+def StoreRetvalV4I8   : StoreRetvalVecInst<V4I8Regs,  ".v4.b8",
+  StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64  : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+  StoreRetvalScalar2I64>;
+def StoreRetvalV2I32  : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+  StoreRetvalScalar2I32>;
+def StoreRetvalV2I16  : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+  StoreRetvalScalar2I16>;
+def StoreRetvalV2I8   : StoreRetvalVecInst<V2I8Regs,  ".v2.b8",
+  StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32  : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+  StoreRetvalScalar4F32>;
+def StoreRetvalV2F32  : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+  StoreRetvalScalar2F32>;
+def StoreRetvalV2F64  : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+  StoreRetvalScalar2F64>;
+
+}
+
+
+// Int vector to int scalar bit convert
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                     (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                     (V4i16Extract V4I16Regs:$s,2),
+                     (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+          (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+          (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+            (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+          (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+            (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+                                "Error!",
+                                [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+                                I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+                                 "Error!",
+                                [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+                                 I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+                                "Error!",
+                               [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+                                I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+                                 "Error!",
+                                [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+                                 I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+                                  "Error!",
+                                [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+                                  I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+          (VecI32toV2I16
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+          (VecI64toV2I32
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+          (VecI32toV4I8
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+          (VecI64toV4I16
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_i32
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_i64
+      (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+    (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in {
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+                                  "Error!",
+                              [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+                                  F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert
+// v2f32 -> f64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+     (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+          (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+          (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+          (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+          (BITCONVERT_32_I2F
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+          (BITCONVERT_64_I2F
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+          (BITCONVERT_32_I2F
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+          (BITCONVERT_64_I2F
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert
+// i64 -> v2f32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert
+// v2f32 -> i64
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+          (BITCONVERT_64_F2I
+    (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert
+// v2i64 -> v4f32
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+    (Build_Vector2_f64
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+    (Build_Vector2_f32
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_f64
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+             (V4i32Extract V4I32Regs:$s,1))),
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+             (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+    (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                      (V4i16Extract V4I16Regs:$s,2),
+                      (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert
+// v2i64 <- v4f32
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+          (Build_Vector2_i64
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+             (V4f32Extract V4F32Regs:$s,1))),
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+             (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+    (Build_Vector2_i64
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+    (Build_Vector2_i32
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I
+          (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+            (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
new file mode 100644
index 000000000000..9c71a2ee165b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -0,0 +1,152 @@
+//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass adds appropriate !range metadata for calls to NVVM
+// intrinsics that return a limited range of values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvvm-intr-range"
+
+namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); }
+
+// Add !range metadata based on limits of given SM variant.
+static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20),
+                                         cl::Hidden, cl::desc("SM variant"));
+
+namespace {
+class NVVMIntrRange : public FunctionPass {
+ private:
+   struct {
+     unsigned x, y, z;
+   } MaxBlockSize, MaxGridSize;
+
+ public:
+   static char ID;
+   NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {}
+   NVVMIntrRange(unsigned int SmVersion) : FunctionPass(ID) {
+     MaxBlockSize.x = 1024;
+     MaxBlockSize.y = 1024;
+     MaxBlockSize.z = 64;
+
+     MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff;
+     MaxGridSize.y = 0xffff;
+     MaxGridSize.z = 0xffff;
+
+     initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry());
+   }
+
+   bool runOnFunction(Function &) override;
+};
+}
+
+FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) {
+  return new NVVMIntrRange(SmVersion);
+}
+
+char NVVMIntrRange::ID = 0;
+INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range",
+                "Add !range metadata to NVVM intrinsics.", false, false)
+
+// Adds the passed-in [Low,High) range information as metadata to the
+// passed-in call instruction.
+static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) {
+  // This call already has range metadata, nothing to do.
+  if (C->getMetadata(LLVMContext::MD_range))
+    return false;
+
+  LLVMContext &Context = C->getParent()->getContext();
+  IntegerType *Int32Ty = Type::getInt32Ty(Context);
+  Metadata *LowAndHigh[] = {
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)),
+      ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))};
+  C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh));
+  return true;
+}
+
+bool NVVMIntrRange::runOnFunction(Function &F) {
+  // Go through the calls in this function.
+  bool Changed = false;
+  for (Instruction &I : instructions(F)) {
+    CallInst *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+
+    if (Function *Callee = Call->getCalledFunction()) {
+      switch (Callee->getIntrinsicID()) {
+      // Index within block
+      case Intrinsic::nvvm_read_ptx_sreg_tid_x:
+        Changed |= addRangeMetadata(0, MaxBlockSize.x, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_tid_y:
+        Changed |= addRangeMetadata(0, MaxBlockSize.y, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_tid_z:
+        Changed |= addRangeMetadata(0, MaxBlockSize.z, Call);
+        break;
+
+      // Block size
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
+        Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
+        Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
+        Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call);
+        break;
+
+      // Index within grid
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
+        Changed |= addRangeMetadata(0, MaxGridSize.x, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
+        Changed |= addRangeMetadata(0, MaxGridSize.y, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
+        Changed |= addRangeMetadata(0, MaxGridSize.z, Call);
+        break;
+
+      // Grid size
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
+        Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
+        Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call);
+        break;
+      case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
+        Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call);
+        break;
+
+      // warp size is constant 32.
+      case Intrinsic::nvvm_read_ptx_sreg_warpsize:
+        Changed |= addRangeMetadata(32, 32+1, Call);
+        break;
+
+      // Lane ID is [0..warpsize)
+      case Intrinsic::nvvm_read_ptx_sreg_laneid:
+        Changed |= addRangeMetadata(0, 32, Call);
+        break;
+
+      default:
+        break;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
new file mode 100644
index 000000000000..c639c4dc0683
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -0,0 +1,219 @@
+//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
+// with an integer.
+//
+// We choose the value we use by looking, in this order, at:
+//
+//  * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
+//  * the StringMap passed to the pass's constructor, and
+//  * metadata in the module itself.
+//
+// If we see an unknown string, we replace its call with 0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <sstream>
+#include <string>
+#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-reflect"
+
+namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
+
+namespace {
+class NVVMReflect : public FunctionPass {
+private:
+  StringMap<int> VarMap;
+
+public:
+  static char ID;
+  NVVMReflect() : NVVMReflect(StringMap<int>()) {}
+
+  NVVMReflect(const StringMap<int> &Mapping)
+      : FunctionPass(ID), VarMap(Mapping) {
+    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+    setVarMap();
+  }
+
+  bool runOnFunction(Function &) override;
+
+private:
+  void setVarMap();
+};
+}
+
+FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
+FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
+  return new NVVMReflect(Mapping);
+}
+
+static cl::opt<bool>
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
+                   cl::desc("NVVM reflection, enabled by default"));
+
+char NVVMReflect::ID = 0;
+INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
+                "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
+                false)
+
+static cl::list<std::string>
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
+            cl::desc("A list of string=num assignments"),
+            cl::ValueRequired);
+
+/// The command line can look as follows :
+/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
+/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+/// ReflectList vector. First, each of ReflectList[i] is 'split'
+/// using "," as the delimiter. Then each of this part is split
+/// using "=" as the delimiter.
+void NVVMReflect::setVarMap() {
+  for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Option : "  << ReflectList[i] << "\n");
+    SmallVector<StringRef, 4> NameValList;
+    StringRef(ReflectList[i]).split(NameValList, ',');
+    for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
+      SmallVector<StringRef, 2> NameValPair;
+      NameValList[j].split(NameValPair, '=');
+      assert(NameValPair.size() == 2 && "name=val expected");
+      std::stringstream ValStream(NameValPair[1]);
+      int Val;
+      ValStream >> Val;
+      assert((!(ValStream.fail())) && "integer value expected");
+      VarMap[NameValPair[0]] = Val;
+    }
+  }
+}
+
+bool NVVMReflect::runOnFunction(Function &F) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  if (F.getName() == NVVM_REFLECT_FUNCTION) {
+    assert(F.isDeclaration() && "_reflect function should not have a body");
+    assert(F.getReturnType()->isIntegerTy() &&
+           "_reflect's return type should be integer");
+    return false;
+  }
+
+  SmallVector<Instruction *, 4> ToRemove;
+
+  // Go through the calls in this function.  Each call to __nvvm_reflect or
+  // llvm.nvvm.reflect should be a CallInst with a ConstantArray argument.
+  // First validate that. If the c-string corresponding to the ConstantArray can
+  // be found successfully, see if it can be found in VarMap. If so, replace the
+  // uses of CallInst with the value found in VarMap. If not, replace the use
+  // with value 0.
+
+  // The IR for __nvvm_reflect calls differs between CUDA versions.
+  //
+  // CUDA 6.5 and earlier uses this sequence:
+  //    %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8
+  //        (i8 addrspace(4)* getelementptr inbounds
+  //           ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
+  //    %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
+  //
+  // The value returned by Sym->getOperand(0) is a Constant with a
+  // ConstantDataSequential operand which can be converted to string and used
+  // for lookup.
+  //
+  // CUDA 7.0 does it slightly differently:
+  //   %reflect = call i32 @__nvvm_reflect(i8* addrspacecast
+  //        (i8 addrspace(1)* getelementptr inbounds
+  //           ([8 x i8], [8 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
+  //
+  // In this case, we get a Constant with a GlobalVariable operand and we need
+  // to dig deeper to find its initializer with the string we'll use for lookup.
+  for (Instruction &I : instructions(F)) {
+    CallInst *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+    Function *Callee = Call->getCalledFunction();
+    if (!Callee || (Callee->getName() != NVVM_REFLECT_FUNCTION &&
+                    Callee->getIntrinsicID() != Intrinsic::nvvm_reflect))
+      continue;
+
+    // FIXME: Improve error handling here and elsewhere in this pass.
+    assert(Call->getNumOperands() == 2 &&
+           "Wrong number of operands to __nvvm_reflect function");
+
+    // In cuda 6.5 and earlier, we will have an extra constant-to-generic
+    // conversion of the string.
+    const Value *Str = Call->getArgOperand(0);
+    if (const CallInst *ConvCall = dyn_cast<CallInst>(Str)) {
+      // FIXME: Add assertions about ConvCall.
+      Str = ConvCall->getArgOperand(0);
+    }
+    assert(isa<ConstantExpr>(Str) &&
+           "Format of __nvvm__reflect function not recognized");
+    const ConstantExpr *GEP = cast<ConstantExpr>(Str);
+
+    const Value *Sym = GEP->getOperand(0);
+    assert(isa<Constant>(Sym) &&
+           "Format of __nvvm_reflect function not recognized");
+
+    const Value *Operand = cast<Constant>(Sym)->getOperand(0);
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Operand)) {
+      // For CUDA-7.0 style __nvvm_reflect calls, we need to find the operand's
+      // initializer.
+      assert(GV->hasInitializer() &&
+             "Format of _reflect function not recognized");
+      const Constant *Initializer = GV->getInitializer();
+      Operand = Initializer;
+    }
+
+    assert(isa<ConstantDataSequential>(Operand) &&
+           "Format of _reflect function not recognized");
+    assert(cast<ConstantDataSequential>(Operand)->isCString() &&
+           "Format of _reflect function not recognized");
+
+    StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
+    ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
+    DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+
+    int ReflectVal = 0; // The default value is 0
+    auto Iter = VarMap.find(ReflectArg);
+    if (Iter != VarMap.end())
+      ReflectVal = Iter->second;
+    else if (ReflectArg == "__CUDA_FTZ") {
+      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+              F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
+        ReflectVal = Flag->getSExtValue();
+    }
+    Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
+    ToRemove.push_back(Call);
+  }
+
+  for (Instruction *I : ToRemove)
+    I->eraseFromParent();
+
+  return ToRemove.size() > 0;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 000000000000..d44876abf729
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,29 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheNVPTXTarget32() {
+  static Target TheNVPTXTarget32;
+  return TheNVPTXTarget32;
+}
+Target &llvm::getTheNVPTXTarget64() {
+  static Target TheNVPTXTarget64;
+  return TheNVPTXTarget64;
+}
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> X(getTheNVPTXTarget32(), "nvptx",
+                                  "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Y(getTheNVPTXTarget64(), "nvptx64",
+                                    "NVIDIA PTX 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 000000000000..02c5a94c3d03
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,122 @@
+#ifndef CL_COMMON_DEFINES_H
+#define CL_COMMON_DEFINES_H
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order
+enum {
+  CLK_R = 0x10B0,
+  CLK_A = 0x10B1,
+  CLK_RG = 0x10B2,
+  CLK_RA = 0x10B3,
+  CLK_RGB = 0x10B4,
+  CLK_RGBA = 0x10B5,
+  CLK_BGRA = 0x10B6,
+  CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+  CLK_xRGB = 0x10B7,
+#endif
+
+  CLK_INTENSITY = 0x10B8,
+  CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+      ,
+  CLK_Rx = 0x10BA,
+  CLK_RGx = 0x10BB,
+  CLK_RGBx = 0x10BC
+#endif
+};
+
+typedef enum clk_channel_type {
+  // valid formats for float return types
+  CLK_SNORM_INT8 = 0x10D0,  // four channel RGBA unorm8
+  CLK_SNORM_INT16 = 0x10D1, // four channel RGBA unorm16
+  CLK_UNORM_INT8 = 0x10D2,  // four channel RGBA unorm8
+  CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+  CLK_HALF_FLOAT = 0x10DD,  // four channel RGBA half
+  CLK_FLOAT = 0x10DE,       // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_UNORM_SHORT_565 = 0x10D4,
+  CLK_UNORM_SHORT_555 = 0x10D5,
+  CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+  // valid only for integer return types
+  CLK_SIGNED_INT8 = 0x10D7,
+  CLK_SIGNED_INT16 = 0x10D8,
+  CLK_SIGNED_INT32 = 0x10D9,
+  CLK_UNSIGNED_INT8 = 0x10DA,
+  CLK_UNSIGNED_INT16 = 0x10DB,
+  CLK_UNSIGNED_INT32 = 0x10DC,
+
+  // CI SPI for CPU
+  __CLK_UNORM_INT8888,  // four channel ARGB unorm8
+  __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+  __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+                                        // represent any image type
+  __CLK_VALID_IMAGE_TYPE_MASK = (1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS) - 1
+} clk_channel_type;
+
+typedef enum clk_sampler_type {
+  __CLK_ADDRESS_BASE = 0,
+  CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+  __CLK_ADDRESS_MASK =
+      CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE |
+      CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+  __CLK_ADDRESS_BITS = 3, // number of bits required to
+                          // represent address info
+
+  __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+  CLK_NORMALIZED_COORDS_FALSE = 0,
+  CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+  __CLK_NORMALIZED_MASK =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_NORMALIZED_COORDS_TRUE,
+  __CLK_NORMALIZED_BITS = 1, // number of bits required to
+                             // represent normalization
+
+  __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS,
+  CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+  CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+  CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+  __CLK_FILTER_MASK =
+      CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | CLK_FILTER_ANISOTROPIC,
+  __CLK_FILTER_BITS = 2, // number of bits required to
+                         // represent address info
+
+  __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+  CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+  CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+  CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+  __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | CLK_MIP_ANISOTROPIC,
+  __CLK_MIP_BITS = 2,
+
+  __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
+  __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+                       __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+  __CLK_ANISOTROPIC_RATIO_BITS = 5,
+  __CLK_ANISOTROPIC_RATIO_MASK =
+      (int) 0x80000000 >> (__CLK_ANISOTROPIC_RATIO_BITS - 1)
+} clk_sampler_type;
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // CL_COMMON_DEFINES_H
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
new file mode 100644
index 000000000000..52432a5820fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -0,0 +1,1966 @@
+//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static const MCPhysReg RRegs[32] = {
+  PPC::R0,  PPC::R1,  PPC::R2,  PPC::R3,
+  PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
+  PPC::R8,  PPC::R9,  PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg RRegsNoR0[32] = {
+  PPC::ZERO,
+            PPC::R1,  PPC::R2,  PPC::R3,
+  PPC::R4,  PPC::R5,  PPC::R6,  PPC::R7,
+  PPC::R8,  PPC::R9,  PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+static const MCPhysReg XRegs[32] = {
+  PPC::X0,  PPC::X1,  PPC::X2,  PPC::X3,
+  PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
+  PPC::X8,  PPC::X9,  PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg XRegsNoX0[32] = {
+  PPC::ZERO8,
+            PPC::X1,  PPC::X2,  PPC::X3,
+  PPC::X4,  PPC::X5,  PPC::X6,  PPC::X7,
+  PPC::X8,  PPC::X9,  PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+static const MCPhysReg FRegs[32] = {
+  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
+  PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
+  PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+static const MCPhysReg VFRegs[32] = {
+  PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
+  PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
+  PPC::VF8,  PPC::VF9,  PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static const MCPhysReg VRegs[32] = {
+  PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
+  PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
+  PPC::V8,  PPC::V9,  PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static const MCPhysReg VSRegs[64] = {
+  PPC::VSL0,  PPC::VSL1,  PPC::VSL2,  PPC::VSL3,
+  PPC::VSL4,  PPC::VSL5,  PPC::VSL6,  PPC::VSL7,
+  PPC::VSL8,  PPC::VSL9,  PPC::VSL10, PPC::VSL11,
+  PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+  PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+  PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+  PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
+  PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
+  PPC::V8,  PPC::V9,  PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+static const MCPhysReg VSFRegs[64] = {
+  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
+  PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
+  PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
+  PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
+  PPC::VF8,  PPC::VF9,  PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static const MCPhysReg VSSRegs[64] = {
+  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
+  PPC::F4,  PPC::F5,  PPC::F6,  PPC::F7,
+  PPC::F8,  PPC::F9,  PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
+  PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
+  PPC::VF8,  PPC::VF9,  PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+static unsigned QFRegs[32] = {
+  PPC::QF0,  PPC::QF1,  PPC::QF2,  PPC::QF3,
+  PPC::QF4,  PPC::QF5,  PPC::QF6,  PPC::QF7,
+  PPC::QF8,  PPC::QF9,  PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+static const MCPhysReg CRBITRegs[32] = {
+  PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+  PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+  PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+  PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+  PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+  PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+  PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+  PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+static const MCPhysReg CRRegs[8] = {
+  PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+  PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+// Evaluate an expression containing condition register
+// or condition register field symbols.  Returns positive
+// value on success, or -1 on error.
+static int64_t
+EvaluateCRExpr(const MCExpr *E) {
+  switch (E->getKind()) {
+  case MCExpr::Target:
+    return -1;
+
+  case MCExpr::Constant: {
+    int64_t Res = cast<MCConstantExpr>(E)->getValue();
+    return Res < 0 ? -1 : Res;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    StringRef Name = SRE->getSymbol().getName();
+
+    if (Name == "lt") return 0;
+    if (Name == "gt") return 1;
+    if (Name == "eq") return 2;
+    if (Name == "so") return 3;
+    if (Name == "un") return 3;
+
+    if (Name == "cr0") return 0;
+    if (Name == "cr1") return 1;
+    if (Name == "cr2") return 2;
+    if (Name == "cr3") return 3;
+    if (Name == "cr4") return 4;
+    if (Name == "cr5") return 5;
+    if (Name == "cr6") return 6;
+    if (Name == "cr7") return 7;
+
+    return -1;
+  }
+
+  case MCExpr::Unary:
+    return -1;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    int64_t LHSVal = EvaluateCRExpr(BE->getLHS());
+    int64_t RHSVal = EvaluateCRExpr(BE->getRHS());
+    int64_t Res;
+
+    if (LHSVal < 0 || RHSVal < 0)
+      return -1;
+
+    switch (BE->getOpcode()) {
+    default: return -1;
+    case MCBinaryExpr::Add: Res = LHSVal + RHSVal; break;
+    case MCBinaryExpr::Mul: Res = LHSVal * RHSVal; break;
+    }
+
+    return Res < 0 ? -1 : Res;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+namespace {
+
+struct PPCOperand;
+
+class PPCAsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  bool IsPPC64;
+  bool IsDarwin;
+
+  void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+
+  bool isPPC64() const { return IsPPC64; }
+  bool isDarwin() const { return IsDarwin; }
+
+  bool MatchRegisterName(unsigned &RegNo, int64_t &IntVal);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
+                                        PPCMCExpr::VariantKind &Variant);
+  const MCExpr *FixupVariantKind(const MCExpr *E);
+  bool ParseExpression(const MCExpr *&EVal);
+  bool ParseDarwinExpression(const MCExpr *&EVal);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  bool ParseDirectiveWord(unsigned Size, AsmToken ID);
+  bool ParseDirectiveTC(unsigned Size, AsmToken ID);
+  bool ParseDirectiveMachine(SMLoc L);
+  bool ParseDarwinDirectiveMachine(SMLoc L);
+  bool ParseDirectiveAbiVersion(SMLoc L);
+  bool ParseDirectiveLocalEntry(SMLoc L);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "PPCGenAsmMatcher.inc"
+
+  /// }
+
+
+public:
+  PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, STI), MII(MII) {
+    // Check for 64-bit vs. 32-bit pointer mode.
+    const Triple &TheTriple = STI.getTargetTriple();
+    IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+               TheTriple.getArch() == Triple::ppc64le);
+    IsDarwin = TheTriple.isMacOSX();
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
+};
+
+/// PPCOperand - Instances of this class represent a parsed PowerPC machine
+/// instruction.
+struct PPCOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate,
+    ContextImmediate,
+    Expression,
+    TLSRegister
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  bool IsPPC64;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+  };
+
+  struct ExprOp {
+    const MCExpr *Val;
+    int64_t CRVal;     // Cached result of EvaluateCRExpr(Val)
+  };
+
+  struct TLSRegOp {
+    const MCSymbolRefExpr *Sym;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct ImmOp Imm;
+    struct ExprOp Expr;
+    struct TLSRegOp TLSReg;
+  };
+
+  PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+  PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    IsPPC64 = o.IsPPC64;
+    switch (Kind) {
+    case Token:
+      Tok = o.Tok;
+      break;
+    case Immediate:
+    case ContextImmediate:
+      Imm = o.Imm;
+      break;
+    case Expression:
+      Expr = o.Expr;
+      break;
+    case TLSRegister:
+      TLSReg = o.TLSReg;
+      break;
+    }
+  }
+
+  // Disable use of sized deallocation due to overallocation of PPCOperand
+  // objects in CreateTokenWithStringCopy.
+  void operator delete(void *p) { ::operator delete(p); }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+
+  /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
+  bool isPPC64() const { return IsPPC64; }
+
+  int64_t getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+  int64_t getImmS16Context() const {
+    assert((Kind == Immediate || Kind == ContextImmediate) &&
+           "Invalid access!");
+    if (Kind == Immediate)
+      return Imm.Val;
+    return static_cast<int16_t>(Imm.Val);
+  }
+  int64_t getImmU16Context() const {
+    assert((Kind == Immediate || Kind == ContextImmediate) &&
+           "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getExpr() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.Val;
+  }
+
+  int64_t getExprCRVal() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.CRVal;
+  }
+
+  const MCExpr *getTLSReg() const {
+    assert(Kind == TLSRegister && "Invalid access!");
+    return TLSReg.Sym;
+  }
+
+  unsigned getReg() const override {
+    assert(isRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getVSReg() const {
+    assert(isVSRegNumber() && "Invalid access!");
+    return (unsigned) Imm.Val;
+  }
+
+  unsigned getCCReg() const {
+    assert(isCCRegNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBit() const {
+    assert(isCRBitNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBitMask() const {
+    assert(isCRBitMask() && "Invalid access!");
+    return 7 - countTrailingZeros<uint64_t>(Imm.Val);
+  }
+
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override {
+    return Kind == Immediate || Kind == Expression;
+  }
+  bool isU1Imm() const { return Kind == Immediate && isUInt<1>(getImm()); }
+  bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
+  bool isU3Imm() const { return Kind == Immediate && isUInt<3>(getImm()); }
+  bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); }
+  bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
+  bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
+  bool isU6ImmX2() const { return Kind == Immediate &&
+                                  isUInt<6>(getImm()) &&
+                                  (getImm() & 1) == 0; }
+  bool isU7Imm() const { return Kind == Immediate && isUInt<7>(getImm()); }
+  bool isU7ImmX4() const { return Kind == Immediate &&
+                                  isUInt<7>(getImm()) &&
+                                  (getImm() & 3) == 0; }
+  bool isU8Imm() const { return Kind == Immediate && isUInt<8>(getImm()); }
+  bool isU8ImmX8() const { return Kind == Immediate &&
+                                  isUInt<8>(getImm()) &&
+                                  (getImm() & 7) == 0; }
+
+  bool isU10Imm() const { return Kind == Immediate && isUInt<10>(getImm()); }
+  bool isU12Imm() const { return Kind == Immediate && isUInt<12>(getImm()); }
+  bool isU16Imm() const {
+    switch (Kind) {
+      case Expression:
+        return true;
+      case Immediate:
+      case ContextImmediate:
+        return isUInt<16>(getImmU16Context());
+      default:
+        return false;
+    }
+  }
+  bool isS16Imm() const {
+    switch (Kind) {
+      case Expression:
+        return true;
+      case Immediate:
+      case ContextImmediate:
+        return isInt<16>(getImmS16Context());
+      default:
+        return false;
+    }
+  }
+  bool isS16ImmX4() const { return Kind == Expression ||
+                                   (Kind == Immediate && isInt<16>(getImm()) &&
+                                    (getImm() & 3) == 0); }
+  bool isS16ImmX16() const { return Kind == Expression ||
+                                    (Kind == Immediate && isInt<16>(getImm()) &&
+                                     (getImm() & 15) == 0); }
+  bool isS17Imm() const {
+    switch (Kind) {
+      case Expression:
+        return true;
+      case Immediate:
+      case ContextImmediate:
+        return isInt<17>(getImmS16Context());
+      default:
+        return false;
+    }
+  }
+  bool isTLSReg() const { return Kind == TLSRegister; }
+  bool isDirectBr() const {
+    if (Kind == Expression)
+      return true;
+    if (Kind != Immediate)
+      return false;
+    // Operand must be 64-bit aligned, signed 27-bit immediate.
+    if ((getImm() & 3) != 0)
+      return false;
+    if (isInt<26>(getImm()))
+      return true;
+    if (!IsPPC64) {
+      // In 32-bit mode, large 32-bit quantities wrap around.
+      if (isUInt<32>(getImm()) && isInt<26>(static_cast<int32_t>(getImm())))
+        return true;
+    }
+    return false;
+  }
+  bool isCondBr() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<16>(getImm()) &&
+                                  (getImm() & 3) == 0); }
+  bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
+  bool isVSRegNumber() const {
+    return Kind == Immediate && isUInt<6>(getImm());
+  }
+  bool isCCRegNumber() const { return (Kind == Expression
+                                       && isUInt<3>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<3>(getImm())); }
+  bool isCRBitNumber() const { return (Kind == Expression
+                                       && isUInt<5>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<5>(getImm())); }
+  bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
+                                    isPowerOf2_32(getImm()); }
+  bool isATBitsAsHint() const { return false; }
+  bool isMem() const override { return false; }
+  bool isReg() const override { return false; }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("addRegOperands");
+  }
+
+  void addRegGPRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(RRegs[getReg()]));
+  }
+
+  void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(RRegsNoR0[getReg()]));
+  }
+
+  void addRegG8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(XRegs[getReg()]));
+  }
+
+  void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(XRegsNoX0[getReg()]));
+  }
+
+  void addRegGxRCOperands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCOperands(Inst, N);
+    else
+      addRegGPRCOperands(Inst, N);
+  }
+
+  void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
+    if (isPPC64())
+      addRegG8RCNoX0Operands(Inst, N);
+    else
+      addRegGPRCNoR0Operands(Inst, N);
+  }
+
+  void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+  }
+
+  void addRegF8RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
+  }
+
+  void addRegVFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VFRegs[getReg()]));
+  }
+
+  void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VRegs[getReg()]));
+  }
+
+  void addRegVSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSRegs[getVSReg()]));
+  }
+
+  void addRegVSFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSFRegs[getVSReg()]));
+  }
+
+  void addRegVSSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()]));
+  }
+
+  void addRegQFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegQSRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegQBRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
+  }
+
+  void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));
+  }
+
+  void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRRegs[getCCReg()]));
+  }
+
+  void addCRBitMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(CRRegs[getCRBitMask()]));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::createImm(getImm()));
+    else
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+  }
+
+  void addS16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    switch (Kind) {
+      case Immediate:
+        Inst.addOperand(MCOperand::createImm(getImm()));
+        break;
+      case ContextImmediate:
+        Inst.addOperand(MCOperand::createImm(getImmS16Context()));
+        break;
+      default:
+        Inst.addOperand(MCOperand::createExpr(getExpr()));
+        break;
+    }
+  }
+
+  void addU16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    switch (Kind) {
+      case Immediate:
+        Inst.addOperand(MCOperand::createImm(getImm()));
+        break;
+      case ContextImmediate:
+        Inst.addOperand(MCOperand::createImm(getImmU16Context()));
+        break;
+      default:
+        Inst.addOperand(MCOperand::createExpr(getExpr()));
+        break;
+    }
+  }
+
+  void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate)
+      Inst.addOperand(MCOperand::createImm(getImm() / 4));
+    else
+      Inst.addOperand(MCOperand::createExpr(getExpr()));
+  }
+
+  void addTLSRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createExpr(getTLSReg()));
+  }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  void print(raw_ostream &OS) const override;
+
+  static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S,
+                                                 bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) {
+    // Allocate extra memory for the string and copy it.
+    // FIXME: This is incorrect, Operands are owned by unique_ptr with a default
+    // deleter which will destroy them by simply using "delete", not correctly
+    // calling operator delete on this extra memory after calling the dtor
+    // explicitly.
+    void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
+    std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
+    Op->Tok.Data = reinterpret_cast<const char *>(Op.get() + 1);
+    Op->Tok.Length = Str.size();
+    std::memcpy(const_cast<char *>(Op->Tok.Data), Str.data(), Str.size());
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E,
+                                               bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S,
+                                                SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(Expression);
+    Op->Expr.Val = Val;
+    Op->Expr.CRVal = EvaluateCRExpr(Val);
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(TLSRegister);
+    Op->TLSReg.Sym = Sym;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(ContextImmediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
+  CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
+      return CreateImm(CE->getValue(), S, E, IsPPC64);
+
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+        return CreateTLSReg(SRE, S, E, IsPPC64);
+
+    if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+      int64_t Res;
+      if (TE->evaluateAsConstant(Res))
+        return CreateContextImm(Res, S, E, IsPPC64);
+    }
+
+    return CreateExpr(Val, S, E, IsPPC64);
+  }
+};
+
+} // end anonymous namespace.
+
+void PPCOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case Immediate:
+  case ContextImmediate:
+    OS << getImm();
+    break;
+  case Expression:
+    OS << *getExpr();
+    break;
+  case TLSRegister:
+    OS << *getTLSReg();
+    break;
+  }
+}
+
+static void
+addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) {
+  if (Op.isImm()) {
+    Inst.addOperand(MCOperand::createImm(-Op.getImm()));
+    return;
+  }
+  const MCExpr *Expr = Op.getExpr();
+  if (const MCUnaryExpr *UnExpr = dyn_cast<MCUnaryExpr>(Expr)) {
+    if (UnExpr->getOpcode() == MCUnaryExpr::Minus) {
+      Inst.addOperand(MCOperand::createExpr(UnExpr->getSubExpr()));
+      return;
+    }
+  } else if (const MCBinaryExpr *BinExpr = dyn_cast<MCBinaryExpr>(Expr)) {
+    if (BinExpr->getOpcode() == MCBinaryExpr::Sub) {
+      const MCExpr *NE = MCBinaryExpr::createSub(BinExpr->getRHS(),
+                                                 BinExpr->getLHS(), Ctx);
+      Inst.addOperand(MCOperand::createExpr(NE));
+      return;
+    }
+  }
+  Inst.addOperand(MCOperand::createExpr(MCUnaryExpr::createMinus(Expr, Ctx)));
+}
+
+void PPCAsmParser::ProcessInstruction(MCInst &Inst,
+                                      const OperandVector &Operands) {
+  int Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case PPC::DCBTx:
+  case PPC::DCBTT:
+  case PPC::DCBTSTx:
+  case PPC::DCBTSTT: {
+    MCInst TmpInst;
+    TmpInst.setOpcode((Opcode == PPC::DCBTx || Opcode == PPC::DCBTT) ?
+                      PPC::DCBT : PPC::DCBTST);
+    TmpInst.addOperand(MCOperand::createImm(
+      (Opcode == PPC::DCBTx || Opcode == PPC::DCBTSTx) ? 0 : 16));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::DCBTCT:
+  case PPC::DCBTDS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::DCBT);
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::DCBTSTCT:
+  case PPC::DCBTSTDS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::DCBTST);
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::DCBFx:
+  case PPC::DCBFL:
+  case PPC::DCBFLP: {
+    int L = 0;
+    if (Opcode == PPC::DCBFL)
+      L = 1;
+    else if (Opcode == PPC::DCBFLP)
+      L = 3;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::DCBF);
+    TmpInst.addOperand(MCOperand::createImm(L));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::LAx: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::LA);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBI: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDI);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIS: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDIS);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIC: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDIC);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBICo: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::ADDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLWI:
+  case PPC::EXTLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRWI:
+  case PPC::EXTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B + N));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSLWI:
+  case PPC::INSLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - B));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRWI:
+  case PPC::INSRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - (B + N)));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRWI:
+  case PPC::ROTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLWI:
+  case PPC::SLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRWI:
+  case PPC::SRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(32 - N));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRWI:
+  case PPC::CLRRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLWI:
+  case PPC::CLRLSLWIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(B - N));
+    TmpInst.addOperand(MCOperand::createImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLDI:
+  case PPC::EXTLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    TmpInst.addOperand(MCOperand::createImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRDI:
+  case PPC::EXTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(B + N));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRDI:
+  case PPC::INSRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - (B + N)));
+    TmpInst.addOperand(MCOperand::createImm(B));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRDI:
+  case PPC::ROTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLDI:
+  case PPC::SLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SRDI:
+  case PPC::SRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(64 - N));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRRDI:
+  case PPC::CLRRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(0));
+    TmpInst.addOperand(MCOperand::createImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLDI:
+  case PPC::CLRLSLDIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(N));
+    TmpInst.addOperand(MCOperand::createImm(B - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWINMbm:
+  case PPC::RLWINMobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWINMbm ? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWIMIbm:
+  case PPC::RLWIMIobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWIMIbm ? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0)); // The tied operand.
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::RLWNMbm:
+  case PPC::RLWNMobm: {
+    unsigned MB, ME;
+    int64_t BM = Inst.getOperand(3).getImm();
+    if (!isRunOfOnes(BM, MB, ME))
+      break;
+
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::RLWNMbm ? PPC::RLWNM : PPC::RLWNMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(MCOperand::createImm(MB));
+    TmpInst.addOperand(MCOperand::createImm(ME));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::MFTB: {
+    if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) {
+      assert(Inst.getNumOperands() == 2 && "Expecting two operands");
+      Inst.setOpcode(PPC::MFSPR);
+    }
+    break;
+  }
+  case PPC::CP_COPYx:
+  case PPC::CP_COPY_FIRST: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::CP_COPY);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_COPYx ? 0 : 1));
+
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CP_PASTEx :
+  case PPC::CP_PASTE_LAST: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opcode == PPC::CP_PASTEx ?
+                      PPC::CP_PASTE : PPC::CP_PASTEo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_PASTEx ? 0 : 1));
+
+    Inst = TmpInst;
+    break;
+  }
+  }
+}
+
+bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+  case Match_Success:
+    // Post-process instructions (typically extended mnemonics)
+    ProcessInstruction(Inst, Operands);
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+  case Match_MnemonicFail:
+    return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) {
+  if (getParser().getTok().is(AsmToken::Identifier)) {
+    StringRef Name = getParser().getTok().getString();
+    if (Name.equals_lower("lr")) {
+      RegNo = isPPC64()? PPC::LR8 : PPC::LR;
+      IntVal = 8;
+    } else if (Name.equals_lower("ctr")) {
+      RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
+      IntVal = 9;
+    } else if (Name.equals_lower("vrsave")) {
+      RegNo = PPC::VRSAVE;
+      IntVal = 256;
+    } else if (Name.startswith_lower("r") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
+    } else if (Name.startswith_lower("f") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = FRegs[IntVal];
+    } else if (Name.startswith_lower("vs") &&
+               !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) {
+      RegNo = VSRegs[IntVal];
+    } else if (Name.startswith_lower("v") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = VRegs[IntVal];
+    } else if (Name.startswith_lower("q") &&
+               !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
+      RegNo = QFRegs[IntVal];
+    } else if (Name.startswith_lower("cr") &&
+               !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
+      RegNo = CRRegs[IntVal];
+    } else
+      return true;
+    getParser().Lex();
+    return false;
+  }
+  return true;
+}
+
+bool PPCAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  const AsmToken &Tok = getParser().getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  int64_t IntVal;
+  if (MatchRegisterName(RegNo, IntVal))
+    return TokError("invalid register name");
+  return false;
+}
+
+/// Extract \code @l/@ha \endcode modifier from expression.  Recursively scan
+/// the expression and check for VK_PPC_LO/HI/HA
+/// symbol variants.  If all symbols with modifier use the same
+/// variant, return the corresponding PPCMCExpr::VariantKind,
+/// and a modified expression using the default symbol variant.
+/// Otherwise, return NULL.
+const MCExpr *PPCAsmParser::
+ExtractModifierFromExpr(const MCExpr *E,
+                        PPCMCExpr::VariantKind &Variant) {
+  MCContext &Context = getParser().getContext();
+  Variant = PPCMCExpr::VK_PPC_None;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return nullptr;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_PPC_LO:
+      Variant = PPCMCExpr::VK_PPC_LO;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HI:
+      Variant = PPCMCExpr::VK_PPC_HI;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HA:
+      Variant = PPCMCExpr::VK_PPC_HA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHER:
+      Variant = PPCMCExpr::VK_PPC_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHERA:
+      Variant = PPCMCExpr::VK_PPC_HIGHERA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHEST:
+      Variant = PPCMCExpr::VK_PPC_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+      Variant = PPCMCExpr::VK_PPC_HIGHESTA;
+      break;
+    default:
+      return nullptr;
+    }
+
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
+    if (!Sub)
+      return nullptr;
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    PPCMCExpr::VariantKind LHSVariant, RHSVariant;
+    const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant);
+    const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
+
+    if (!LHS && !RHS)
+      return nullptr;
+
+    if (!LHS) LHS = BE->getLHS();
+    if (!RHS) RHS = BE->getRHS();
+
+    if (LHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = RHSVariant;
+    else if (RHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = LHSVariant;
+    else if (LHSVariant == RHSVariant)
+      Variant = LHSVariant;
+    else
+      return nullptr;
+
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Find all VK_TLSGD/VK_TLSLD symbol references in expression and replace
+/// them by VK_PPC_TLSGD/VK_PPC_TLSLD.  This is necessary to avoid having
+/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT.
+/// FIXME: This is a hack.
+const MCExpr *PPCAsmParser::
+FixupVariantKind(const MCExpr *E) {
+  MCContext &Context = getParser().getContext();
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return E;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_TLSGD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSGD;
+      break;
+    case MCSymbolRefExpr::VK_TLSLD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSLD;
+      break;
+    default:
+      return E;
+    }
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Variant, Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = FixupVariantKind(UE->getSubExpr());
+    if (Sub == UE->getSubExpr())
+      return E;
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = FixupVariantKind(BE->getLHS());
+    const MCExpr *RHS = FixupVariantKind(BE->getRHS());
+    if (LHS == BE->getLHS() && RHS == BE->getRHS())
+      return E;
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// ParseExpression.  This differs from the default "parseExpression" in that
+/// it handles modifiers.
+bool PPCAsmParser::
+ParseExpression(const MCExpr *&EVal) {
+
+  if (isDarwin())
+    return ParseDarwinExpression(EVal);
+
+  // (ELF Platforms)
+  // Handle \code @l/@ha \endcode
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  EVal = FixupVariantKind(EVal);
+
+  PPCMCExpr::VariantKind Variant;
+  const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
+  if (E)
+    EVal = PPCMCExpr::create(Variant, E, false, getParser().getContext());
+
+  return false;
+}
+
+/// ParseDarwinExpression.  (MachO Platforms)
+/// This differs from the default "parseExpression" in that it handles detection
+/// of the \code hi16(), ha16() and lo16() \endcode modifiers.  At present,
+/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO
+/// syntax form so it is done here.  TODO: Determine if there is merit in
+/// arranging for this to be done at a higher level.
+bool PPCAsmParser::
+ParseDarwinExpression(const MCExpr *&EVal) {
+  MCAsmParser &Parser = getParser();
+  PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
+  switch (getLexer().getKind()) {
+  default:
+    break;
+  case AsmToken::Identifier:
+    // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus
+    // something starting with any other char should be part of the
+    // asm syntax.  If handwritten asm includes an identifier like lo16,
+    // then all bets are off - but no-one would do that, right?
+    StringRef poss = Parser.getTok().getString();
+    if (poss.equals_lower("lo16")) {
+      Variant = PPCMCExpr::VK_PPC_LO;
+    } else if (poss.equals_lower("hi16")) {
+      Variant = PPCMCExpr::VK_PPC_HI;
+    } else if (poss.equals_lower("ha16")) {
+      Variant = PPCMCExpr::VK_PPC_HA;
+    }
+    if (Variant != PPCMCExpr::VK_PPC_None) {
+      Parser.Lex(); // Eat the xx16
+      if (getLexer().isNot(AsmToken::LParen))
+        return Error(Parser.getTok().getLoc(), "expected '('");
+      Parser.Lex(); // Eat the '('
+    }
+    break;
+  }
+
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  if (Variant != PPCMCExpr::VK_PPC_None) {
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+    Parser.Lex(); // Eat the ')'
+    EVal = PPCMCExpr::create(Variant, EVal, false, getParser().getContext());
+  }
+  return false;
+}
+
+/// ParseOperand
+/// This handles registers in the form 'NN', '%rNN' for ELF platforms and
+/// rNN for MachO.
+bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  // Attempt to parse the next token as an immediate
+  switch (getLexer().getKind()) {
+  // Special handling for register names.  These are interpreted
+  // as immediates corresponding to the register number.
+  case AsmToken::Percent:
+    Parser.Lex(); // Eat the '%'.
+    unsigned RegNo;
+    int64_t IntVal;
+    if (MatchRegisterName(RegNo, IntVal))
+      return Error(S, "invalid register name");
+
+    Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+    return false;
+
+  case AsmToken::Identifier:
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Dot:
+  case AsmToken::Dollar:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
+    // Note that non-register-name identifiers from the compiler will begin
+    // with '_', 'L'/'l' or '"'.  Of course, handwritten asm could include
+    // identifiers like r31foo - so we fall through in the event that parsing
+    // a register name fails.
+    if (isDarwin()) {
+      unsigned RegNo;
+      int64_t IntVal;
+      if (!MatchRegisterName(RegNo, IntVal)) {
+        Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+        return false;
+      }
+    }
+    // All other expressions
+
+    if (!ParseExpression(EVal))
+      break;
+    // Fall-through
+    LLVM_FALLTHROUGH;
+  default:
+    return Error(S, "unknown operand");
+  }
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64()));
+
+  // Check whether this is a TLS call expression
+  bool TLSCall = false;
+  if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal))
+    TLSCall = Ref->getSymbol().getName() == "__tls_get_addr";
+
+  if (TLSCall && getLexer().is(AsmToken::LParen)) {
+    const MCExpr *TLSSym;
+
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+    if (ParseExpression(TLSSym))
+      return Error(S, "invalid TLS call expression");
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64()));
+  }
+
+  // Otherwise, check for D-form memory operands
+  if (!TLSCall && getLexer().is(AsmToken::LParen)) {
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+
+    int64_t IntVal;
+    switch (getLexer().getKind()) {
+    case AsmToken::Percent:
+      Parser.Lex(); // Eat the '%'.
+      unsigned RegNo;
+      if (MatchRegisterName(RegNo, IntVal))
+        return Error(S, "invalid register name");
+      break;
+
+    case AsmToken::Integer:
+      if (isDarwin())
+        return Error(S, "unexpected integer value");
+      else if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 ||
+               IntVal > 31)
+        return Error(S, "invalid register number");
+      break;
+   case AsmToken::Identifier:
+    if (isDarwin()) {
+      unsigned RegNo;
+      if (!MatchRegisterName(RegNo, IntVal)) {
+        break;
+      }
+    }
+    LLVM_FALLTHROUGH;
+
+    default:
+      return Error(S, "invalid memory operand");
+    }
+
+    E = Parser.getTok().getLoc();
+    if (parseToken(AsmToken::RParen, "missing ')'"))
+      return true;
+    Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64()));
+  }
+
+  return false;
+}
+
+/// Parse an instruction mnemonic followed by its operands.
+bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  // The first operand is the token for the instruction name.
+  // If the next character is a '+' or '-', we need to add it to the
+  // instruction name, to match what TableGen is doing.
+  std::string NewOpcode;
+  if (parseOptionalToken(AsmToken::Plus)) {
+    NewOpcode = Name;
+    NewOpcode += '+';
+    Name = NewOpcode;
+  }
+  if (parseOptionalToken(AsmToken::Minus)) {
+    NewOpcode = Name;
+    NewOpcode += '-';
+    Name = NewOpcode;
+  }
+  // If the instruction ends in a '.', we need to create a separate
+  // token for it, to match what TableGen is doing.
+  size_t Dot = Name.find('.');
+  StringRef Mnemonic = Name.slice(0, Dot);
+  if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+    Operands.push_back(
+        PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64()));
+  else
+    Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+  if (Dot != StringRef::npos) {
+    SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
+    StringRef DotStr = Name.slice(Dot, StringRef::npos);
+    if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+      Operands.push_back(
+          PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64()));
+    else
+      Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+  }
+
+  // If there are no more operands then finish
+  if (parseOptionalToken(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse the first operand
+  if (ParseOperand(Operands))
+    return true;
+
+  while (!parseOptionalToken(AsmToken::EndOfStatement)) {
+    if (parseToken(AsmToken::Comma) || ParseOperand(Operands))
+      return true;
+  }
+
+  // We'll now deal with an unfortunate special case: the syntax for the dcbt
+  // and dcbtst instructions differs for server vs. embedded cores.
+  //  The syntax for dcbt is:
+  //    dcbt ra, rb, th [server]
+  //    dcbt th, ra, rb [embedded]
+  //  where th can be omitted when it is 0. dcbtst is the same. We take the
+  //  server form to be the default, so swap the operands if we're parsing for
+  //  an embedded core (they'll be swapped again upon printing).
+  if (getSTI().getFeatureBits()[PPC::FeatureBookE] &&
+      Operands.size() == 4 &&
+      (Name == "dcbt" || Name == "dcbtst")) {
+    std::swap(Operands[1], Operands[3]);
+    std::swap(Operands[2], Operands[1]);
+  }
+
+  return false;
+}
+
+/// ParseDirective parses the PPC specific directives
+bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (isDarwin()) {
+    if (IDVal == ".machine")
+      ParseDarwinDirectiveMachine(DirectiveID.getLoc());
+    else
+      return true;
+  } else if (IDVal == ".word")
+    ParseDirectiveWord(2, DirectiveID);
+  else if (IDVal == ".llong")
+    ParseDirectiveWord(8, DirectiveID);
+  else if (IDVal == ".tc")
+    ParseDirectiveTC(isPPC64() ? 8 : 4, DirectiveID);
+  else if (IDVal == ".machine")
+    ParseDirectiveMachine(DirectiveID.getLoc());
+  else if (IDVal == ".abiversion")
+    ParseDirectiveAbiVersion(DirectiveID.getLoc());
+  else if (IDVal == ".localentry")
+    ParseDirectiveLocalEntry(DirectiveID.getLoc());
+  else
+    return true;
+  return false;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool PPCAsmParser::ParseDirectiveWord(unsigned Size, AsmToken ID) {
+  auto parseOp = [&]() -> bool {
+    const MCExpr *Value;
+    SMLoc ExprLoc = getParser().getTok().getLoc();
+    if (getParser().parseExpression(Value))
+      return true;
+    if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+      assert(Size <= 8 && "Invalid size");
+      uint64_t IntValue = MCE->getValue();
+      if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+        return Error(ExprLoc, "literal value out of range for '" +
+                                  ID.getIdentifier() + "' directive");
+      getStreamer().EmitIntValue(IntValue, Size);
+    } else
+      getStreamer().EmitValue(Value, Size, ExprLoc);
+    return false;
+  };
+
+  if (parseMany(parseOp))
+    return addErrorSuffix(" in '" + ID.getIdentifier() + "' directive");
+  return false;
+}
+
+/// ParseDirectiveTC
+///  ::= .tc [ symbol (, expression)* ]
+bool PPCAsmParser::ParseDirectiveTC(unsigned Size, AsmToken ID) {
+  MCAsmParser &Parser = getParser();
+  // Skip TC symbol, which is only used with XCOFF.
+  while (getLexer().isNot(AsmToken::EndOfStatement)
+         && getLexer().isNot(AsmToken::Comma))
+    Parser.Lex();
+  if (parseToken(AsmToken::Comma))
+    return addErrorSuffix(" in '.tc' directive");
+
+  // Align to word size.
+  getParser().getStreamer().EmitValueToAlignment(Size);
+
+  // Emit expressions.
+  return ParseDirectiveWord(Size, ID);
+}
+
+/// ParseDirectiveMachine (ELF platforms)
+///  ::= .machine [ cpu | "push" | "pop" ]
+bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier) &&
+      Parser.getTok().isNot(AsmToken::String))
+    return Error(L, "unexpected token in '.machine' directive");
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+
+  // FIXME: Right now, the parser always allows any available
+  // instruction, so the .machine directive is not useful.
+  // Implement ".machine any" (by doing nothing) for the benefit
+  // of existing assembler code.  Likewise, we can then implement
+  // ".machine push" and ".machine pop" as no-op.
+  if (CPU != "any" && CPU != "push" && CPU != "pop")
+    return TokError("unrecognized machine type");
+
+  Parser.Lex();
+
+  if (parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix(" in '.machine' directive");
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitMachine(CPU);
+
+  return false;
+}
+
+/// ParseDarwinDirectiveMachine (Mach-o platforms)
+///  ::= .machine cpu-identifier
+bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (Parser.getTok().isNot(AsmToken::Identifier) &&
+      Parser.getTok().isNot(AsmToken::String))
+    return Error(L, "unexpected token in directive");
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: this is only the 'default' set of cpu variants.
+  // However we don't act on this information at present, this is simply
+  // allowing parsing to proceed with minimal sanity checking.
+  if (check(CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64", L,
+            "unrecognized cpu type") ||
+      check(isPPC64() && (CPU == "ppc7400" || CPU == "ppc"), L,
+            "wrong cpu type specified for 64bit") ||
+      check(!isPPC64() && CPU == "ppc64", L,
+            "wrong cpu type specified for 32bit") ||
+      parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix(" in '.machine' directive");
+  return false;
+}
+
+/// ParseDirectiveAbiVersion
+///  ::= .abiversion constant-expression
+bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
+  int64_t AbiVersion;
+  if (check(getParser().parseAbsoluteExpression(AbiVersion), L,
+            "expected constant expression") ||
+      parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix(" in '.abiversion' directive");
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitAbiVersion(AbiVersion);
+
+  return false;
+}
+
+/// ParseDirectiveLocalEntry
+///  ::= .localentry symbol, expression
+bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected identifier in '.localentry' directive");
+
+  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
+  const MCExpr *Expr;
+
+  if (parseToken(AsmToken::Comma) ||
+      check(getParser().parseExpression(Expr), L, "expected expression") ||
+      parseToken(AsmToken::EndOfStatement))
+    return addErrorSuffix(" in '.localentry' directive");
+
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(
+           getParser().getStreamer().getTargetStreamer());
+  TStreamer.emitLocalEntry(Sym, Expr);
+
+  return false;
+}
+
+
+
+/// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmParser() {
+  RegisterMCAsmParser<PPCAsmParser> A(getThePPC32Target());
+  RegisterMCAsmParser<PPCAsmParser> B(getThePPC64Target());
+  RegisterMCAsmParser<PPCAsmParser> C(getThePPC64LETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "PPCGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                  unsigned Kind) {
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ImmVal;
+  switch (Kind) {
+    case MCK_0: ImmVal = 0; break;
+    case MCK_1: ImmVal = 1; break;
+    case MCK_2: ImmVal = 2; break;
+    case MCK_3: ImmVal = 3; break;
+    case MCK_4: ImmVal = 4; break;
+    case MCK_5: ImmVal = 5; break;
+    case MCK_6: ImmVal = 6; break;
+    case MCK_7: ImmVal = 7; break;
+    default: return Match_InvalidOperand;
+  }
+
+  PPCOperand &Op = static_cast<PPCOperand &>(AsmOp);
+  if (Op.isImm() && Op.getImm() == ImmVal)
+    return Match_Success;
+
+  return Match_InvalidOperand;
+}
+
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant,
+                                  MCContext &Ctx) {
+  switch (Variant) {
+  case MCSymbolRefExpr::VK_PPC_LO:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HI:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HA:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHER:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHERA:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHEST:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+  default:
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
new file mode 100644
index 000000000000..12ffbfdeacc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -0,0 +1,440 @@
+//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class PPCDisassembler : public MCDisassembler {
+  bool IsLittleEndian;
+
+public:
+  PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                  bool IsLittleEndian)
+      : MCDisassembler(STI, Ctx), IsLittleEndian(IsLittleEndian) {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createPPCDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/false);
+}
+
+static MCDisassembler *createPPCLEDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
+}
+
+extern "C" void LLVMInitializePowerPCDisassembler() {
+  // Register the disassembler for each target.
+  TargetRegistry::RegisterMCDisassembler(getThePPC32Target(),
+                                         createPPCDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getThePPC64Target(),
+                                         createPPCDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getThePPC64LETarget(),
+                                         createPPCLEDisassembler);
+}
+
+// FIXME: These can be generated by TableGen from the existing register
+// encoding values!
+
+static const unsigned CRRegs[] = {
+  PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3,
+  PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
+};
+
+static const unsigned CRBITRegs[] = {
+  PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN,
+  PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN,
+  PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+  PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+  PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+  PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN,
+  PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN,
+  PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN
+};
+
+static const unsigned FRegs[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31
+};
+
+static const unsigned VFRegs[] = {
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+static const unsigned VRegs[] = {
+  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+  PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static const unsigned VSRegs[] = {
+  PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3,
+  PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7,
+  PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11,
+  PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15,
+  PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19,
+  PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23,
+  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
+  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,
+
+  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+  PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static const unsigned VSFRegs[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+static const unsigned VSSRegs[] = {
+  PPC::F0, PPC::F1, PPC::F2, PPC::F3,
+  PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+  PPC::F8, PPC::F9, PPC::F10, PPC::F11,
+  PPC::F12, PPC::F13, PPC::F14, PPC::F15,
+  PPC::F16, PPC::F17, PPC::F18, PPC::F19,
+  PPC::F20, PPC::F21, PPC::F22, PPC::F23,
+  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
+  PPC::F28, PPC::F29, PPC::F30, PPC::F31,
+
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
+static const unsigned GPRegs[] = {
+  PPC::R0, PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned GP0Regs[] = {
+  PPC::ZERO, PPC::R1, PPC::R2, PPC::R3,
+  PPC::R4, PPC::R5, PPC::R6, PPC::R7,
+  PPC::R8, PPC::R9, PPC::R10, PPC::R11,
+  PPC::R12, PPC::R13, PPC::R14, PPC::R15,
+  PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+  PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+  PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+  PPC::R28, PPC::R29, PPC::R30, PPC::R31
+};
+
+static const unsigned G8Regs[] = {
+  PPC::X0, PPC::X1, PPC::X2, PPC::X3,
+  PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+  PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+  PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+  PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+  PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+  PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+  PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
+static const unsigned QFRegs[] = {
+  PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
+  PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
+  PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11,
+  PPC::QF12, PPC::QF13, PPC::QF14, PPC::QF15,
+  PPC::QF16, PPC::QF17, PPC::QF18, PPC::QF19,
+  PPC::QF20, PPC::QF21, PPC::QF22, PPC::QF23,
+  PPC::QF24, PPC::QF25, PPC::QF26, PPC::QF27,
+  PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
+};
+
+template <std::size_t N>
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned (&Regs)[N]) {
+  assert(RegNo < N && "Invalid register number");
+  Inst.addOperand(MCOperand::createReg(Regs[RegNo]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+static DecodeStatus DecodeCRRC0RegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, CRRegs);
+}
+
+static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, CRBITRegs);
+}
+
+static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, FRegs);
+}
+
+static DecodeStatus DecodeVFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VFRegs);
+}
+
+static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VRegs);
+}
+
+static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VSRegs);
+}
+
+static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VSFRegs);
+}
+
+static DecodeStatus DecodeVSSRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VSSRegs);
+}
+
+static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, GPRegs);
+}
+
+static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, GP0Regs);
+}
+
+static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, G8Regs);
+}
+
+#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
+#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
+
+static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, QFRegs);
+}
+
+#define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
+#define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
+                                      int64_t Address, const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  // Decode the memri field (imm, reg), which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 16;
+  uint64_t Disp = Imm & 0xFFFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  switch (Inst.getOpcode()) {
+  default: break;
+  case PPC::LBZU:
+  case PPC::LHAU:
+  case PPC::LHZU:
+  case PPC::LWZU:
+  case PPC::LFSU:
+  case PPC::LFDU:
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+    break;
+  case PPC::STBU:
+  case PPC::STHU:
+  case PPC::STWU:
+  case PPC::STFSU:
+  case PPC::STFDU:
+    Inst.insert(Inst.begin(), MCOperand::createReg(GP0Regs[Base]));
+    break;
+  }
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp)));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  // Decode the memrix field (imm, reg), which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 14;
+  uint64_t Disp = Imm & 0x3FFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  if (Inst.getOpcode() == PPC::LDU)
+    // Add the tied output operand.
+    Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  else if (Inst.getOpcode() == PPC::STDU)
+    Inst.insert(Inst.begin(), MCOperand::createReg(GP0Regs[Base]));
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp << 2)));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
+                                         int64_t Address, const void *Decoder) {
+  // Decode the memrix16 field (imm, reg), which has the low 12-bits as the
+  // displacement with 16-byte aligned, and the next 5 bits as the register #.
+
+  uint64_t Base = Imm >> 12;
+  uint64_t Disp = Imm & 0xFFF;
+
+  assert(Base < 32 && "Invalid base register");
+
+  Inst.addOperand(MCOperand::createImm(SignExtend64<16>(Disp << 4)));
+  Inst.addOperand(MCOperand::createReg(GP0Regs[Base]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm,
+                                        int64_t Address, const void *Decoder) {
+  // The cr bit encoding is 0x80 >> cr_reg_num.
+
+  unsigned Zeros = countTrailingZeros(Imm);
+  assert(Zeros < 8 && "Invalid CR bit value");
+
+  Inst.addOperand(MCOperand::createReg(CRRegs[7 - Zeros]));
+  return MCDisassembler::Success;
+}
+
+#include "PPCGenDisassemblerTables.inc"
+
+DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  // Get the four bytes of the instruction.
+  Size = 4;
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Read the instruction in the proper endianness.
+  uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data())
+                                 : support::endian::read32be(Bytes.data());
+
+  if (STI.getFeatureBits()[PPC::FeatureQPX]) {
+    DecodeStatus result =
+      decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI);
+    if (result != MCDisassembler::Fail)
+      return result;
+  }
+
+  return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
new file mode 100644
index 000000000000..609d959c6d08
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -0,0 +1,506 @@
+//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstPrinter.h"
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+             cl::desc("Use full register names when printing assembly"));
+
+// Useful for testing purposes. Prints vs{31-63} as v{0-31} respectively.
+static cl::opt<bool>
+ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false),
+             cl::desc("Prints full register names with vs{31-63} as v{0-31}"));
+
+#define PRINT_ALIAS_INSTR
+#include "PPCGenAsmWriter.inc"
+
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  const char *RegName = getRegisterName(RegNo);
+  if (RegName[0] == 'q' /* QPX */) {
+    // The system toolchain on the BG/Q does not understand QPX register names
+    // in .cfi_* directives, so print the name of the floating-point
+    // subregister instead.
+    std::string RN(RegName);
+
+    RN[0] = 'f';
+    OS << RN;
+
+    return;
+  }
+
+  OS << RegName;
+}
+
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot, const MCSubtargetInfo &STI) {
+  // Check for slwi/srwi mnemonics.
+  if (MI->getOpcode() == PPC::RLWINM) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char MB = MI->getOperand(3).getImm();
+    unsigned char ME = MI->getOperand(4).getImm();
+    bool useSubstituteMnemonic = false;
+    if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+      O << "\tslwi "; useSubstituteMnemonic = true;
+    }
+    if (SH <= 31 && MB == (32-SH) && ME == 31) {
+      O << "\tsrwi "; useSubstituteMnemonic = true;
+      SH = 32-SH;
+    }
+    if (useSubstituteMnemonic) {
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
+      MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+    O << "\tmr ";
+    printOperand(MI, 0, O);
+    O << ", ";
+    printOperand(MI, 1, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  
+  if (MI->getOpcode() == PPC::RLDICR) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char ME = MI->getOperand(3).getImm();
+    // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+    if (63-SH == ME) {
+      O << "\tsldi ";
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+
+  // dcbt[st] is printed manually here because:
+  //  1. The assembly syntax is different between embedded and server targets
+  //  2. We must print the short mnemonics for TH == 0 because the
+  //     embedded/server syntax default will not be stable across assemblers
+  //  The syntax for dcbt is:
+  //    dcbt ra, rb, th [server]
+  //    dcbt th, ra, rb [embedded]
+  //  where th can be omitted when it is 0. dcbtst is the same.
+  if (MI->getOpcode() == PPC::DCBT || MI->getOpcode() == PPC::DCBTST) {
+    unsigned char TH = MI->getOperand(0).getImm();
+    O << "\tdcbt";
+    if (MI->getOpcode() == PPC::DCBTST)
+      O << "st";
+    if (TH == 16)
+      O << "t";
+    O << " ";
+
+    bool IsBookE = STI.getFeatureBits()[PPC::FeatureBookE];
+    if (IsBookE && TH != 0 && TH != 16)
+      O << (unsigned int) TH << ", ";
+
+    printOperand(MI, 1, O);
+    O << ", ";
+    printOperand(MI, 2, O);
+
+    if (!IsBookE && TH != 0 && TH != 16)
+      O << ", " << (unsigned int) TH;
+
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (MI->getOpcode() == PPC::DCBF) {
+    unsigned char L = MI->getOperand(0).getImm();
+    if (!L || L == 1 || L == 3) {
+      O << "\tdcbf";
+      if (L == 1 || L == 3)
+        O << "l";
+      if (L == 3)
+        O << "p";
+      O << " ";
+
+      printOperand(MI, 1, O);
+      O << ", ";
+      printOperand(MI, 2, O);
+
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+
+void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O, 
+                                           const char *Modifier) {
+  unsigned Code = MI->getOperand(OpNo).getImm();
+
+  if (StringRef(Modifier) == "cc") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LT:
+      O << "lt";
+      return;
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_LE:
+      O << "le";
+      return;
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_EQ:
+      O << "eq";
+      return;
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GE:
+      O << "ge";
+      return;
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_GT:
+      O << "gt";
+      return;
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_NE:
+      O << "ne";
+      return;
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_UN:
+      O << "un";
+      return;
+    case PPC::PRED_NU_MINUS:
+    case PPC::PRED_NU_PLUS:
+    case PPC::PRED_NU:
+      O << "nu";
+      return;
+    case PPC::PRED_BIT_SET:
+    case PPC::PRED_BIT_UNSET:
+      llvm_unreachable("Invalid use of bit predicate code");
+    }
+    llvm_unreachable("Invalid predicate code");
+  }
+
+  if (StringRef(Modifier) == "pm") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT:
+    case PPC::PRED_LE:
+    case PPC::PRED_EQ:
+    case PPC::PRED_GE:
+    case PPC::PRED_GT:
+    case PPC::PRED_NE:
+    case PPC::PRED_UN:
+    case PPC::PRED_NU:
+      return;
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_NU_MINUS:
+      O << "-";
+      return;
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_NU_PLUS:
+      O << "+";
+      return;
+    case PPC::PRED_BIT_SET:
+    case PPC::PRED_BIT_UNSET:
+      llvm_unreachable("Invalid use of bit predicate code");
+    }
+    llvm_unreachable("Invalid predicate code");
+  }
+  
+  assert(StringRef(Modifier) == "reg" &&
+         "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned Code = MI->getOperand(OpNo).getImm();
+  if (Code == 2)
+    O << "-";
+  else if (Code == 3)
+    O << "+";
+}
+
+void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 1 && "Invalid u1imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 3 && "Invalid u2imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 8 && "Invalid u3imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 15 && "Invalid u4imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  int Value = MI->getOperand(OpNo).getImm();
+  Value = SignExtend32<5>(Value);
+  O << (int)Value;
+}
+
+void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 31 && "Invalid u5imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 63 && "Invalid u6imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 127 && "Invalid u7imm argument!");
+  O << (unsigned int)Value;
+}
+
+// Operands of BUILD_VECTOR are signed and we use this to print operands
+// of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and
+// print as unsigned.
+void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned char Value = MI->getOperand(OpNo).getImm();
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  unsigned short Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 1023 && "Invalid u10imm argument!");
+  O << (unsigned short)Value;
+}
+
+void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  unsigned short Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 4095 && "Invalid u12imm argument!");
+  O << (unsigned short)Value;
+}
+
+void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    O << (short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    O << (unsigned short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print .+8, an eight byte displacement from the PC.
+  O << ".+";
+  printAbsBranchOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
+}
+
+
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  unsigned CCReg = MI->getOperand(OpNo).getReg();
+  unsigned RegNo;
+  switch (CCReg) {
+  default: llvm_unreachable("Unknown CR register");
+  case PPC::CR0: RegNo = 0; break;
+  case PPC::CR1: RegNo = 1; break;
+  case PPC::CR2: RegNo = 2; break;
+  case PPC::CR3: RegNo = 3; break;
+  case PPC::CR4: RegNo = 4; break;
+  case PPC::CR5: RegNo = 5; break;
+  case PPC::CR6: RegNo = 6; break;
+  case PPC::CR7: RegNo = 7; break;
+  }
+  O << (0x80 >> RegNo);
+}
+
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printS16ImmOperand(MI, OpNo, O);
+  O << '(';
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register.  For this reason, the darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  if (MI->getOperand(OpNo).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
+  O << refExp.getSymbol().getName();
+  O << '(';
+  printOperand(MI, OpNo+1, O);
+  O << ')';
+  if (refExp.getKind() != MCSymbolRefExpr::VK_None)
+    O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
+}
+
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames || ShowVSRNumsAsVR)
+    return RegName;
+
+  switch (RegName[0]) {
+  case 'r':
+  case 'f':
+  case 'q': // for QPX
+  case 'v':
+    if (RegName[1] == 's')
+      return RegName + 2;
+    return RegName + 1;
+  case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+  
+  return RegName;
+}
+
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+
+    // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+    // as well as those that use VMX register numbering (v0 - v31 which
+    // correspond to vs32 - vs63). If we have an instruction that uses VSX
+    // numbering, we need to convert the VMX registers to VSX registers.
+    // Namely, we print 32-63 when the instruction operates on one of the
+    // VMX registers.
+    // (Please synchronize with PPCAsmPrinter::printOperand)
+    if ((MII.get(MI->getOpcode()).TSFlags & PPCII::UseVSXReg) &&
+        !ShowVSRNumsAsVR) {
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::V0);
+      else if (PPCInstrInfo::isVFRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::VF0);
+    }
+
+    const char *RegName = getRegisterName(Reg);
+    // The linux and AIX assembler does not take register prefixes.
+    if (!isDarwinSyntax())
+      RegName = stripRegisterPrefix(RegName);
+    
+    O << RegName;
+    return;
+  }
+  
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+  
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  Op.getExpr()->print(O, &MAI);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
new file mode 100644
index 000000000000..9c79ffb1176c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -0,0 +1,74 @@
+//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class PPCInstPrinter : public MCInstPrinter {
+  bool IsDarwin;
+public:
+  PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, bool isDarwin)
+    : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
+  
+  bool isDarwinSyntax() const {
+    return IsDarwin;
+  }
+  
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx,
+                               raw_ostream &OS);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                             raw_ostream &O, const char *Modifier = nullptr);
+  void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
new file mode 100644
index 000000000000..5847b3a52bfc
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -0,0 +1,241 @@
+//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case PPC::fixup_ppc_nofixup:
+    return Value;
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+    return Value & 0xfffc;
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return Value & 0x3fffffc;
+  case PPC::fixup_ppc_half16:
+    return Value & 0xffff;
+  case PPC::fixup_ppc_half16ds:
+    return Value & 0xfffc;
+  }
+}
+
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_half16ds:
+    return 2;
+  case FK_Data_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return 4;
+  case FK_Data_8:
+    return 8;
+  case PPC::fixup_ppc_nofixup:
+    return 0;
+  }
+}
+
+namespace {
+
+class PPCAsmBackend : public MCAsmBackend {
+  const Target &TheTarget;
+  bool IsLittleEndian;
+public:
+  PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
+    IsLittleEndian(isLittle) {}
+
+  unsigned getNumFixupKinds() const override {
+    return PPC::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     6,      24,   0 },
+      { "fixup_ppc_brcond14abs", 16,     14,   0 },
+      { "fixup_ppc_half16",       0,     16,   0 },
+      { "fixup_ppc_half16ds",     0,     14,   0 },
+      { "fixup_ppc_nofixup",      0,      0,   0 }
+    };
+    const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        2,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    2,      14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_br24abs",     2,      24,   0 },
+      { "fixup_ppc_brcond14abs", 2,      14,   0 },
+      { "fixup_ppc_half16",      0,      16,   0 },
+      { "fixup_ppc_half16ds",    2,      14,   0 },
+      { "fixup_ppc_nofixup",     0,       0,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value) return;           // Doesn't change encoding.
+
+    unsigned Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value. The Value has been "split up" into the appropriate
+    // bitfields above.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+    }
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    switch ((PPC::Fixups)Fixup.getKind()) {
+    default: break;
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      // If the target symbol has a local entry point we must not attempt
+      // to resolve the fixup directly.  Emit a relocation and leave
+      // resolution of the final target address to the linker.
+      if (const MCSymbolRefExpr *A = Target.getSymA()) {
+        if (const auto *S = dyn_cast<MCSymbolELF>(&A->getSymbol())) {
+          // The "other" values are stored in the last 6 bits of the second
+          // byte. The traditional defines for STO values assume the full byte
+          // and thus the shift to pack it.
+          unsigned Other = S->getOther() << 2;
+          if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+            IsResolved = false;
+        }
+      }
+      break;
+    }
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    // FIXME.
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->write32(0x60000000);
+
+    OW->WriteZeros(Count % 4);
+
+    return true;
+  }
+
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64" || Name == "ppc64le") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+
+  bool isLittleEndian() const {
+    return IsLittleEndian;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+
+    MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+      bool is64 = getPointerSize() == 8;
+      return createPPCMachObjectWriter(
+          OS,
+          /*Is64Bit=*/is64,
+          (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+          MachO::CPU_SUBTYPE_POWERPC_ALL);
+    }
+  };
+
+  class ELFPPCAsmBackend : public PPCAsmBackend {
+    uint8_t OSABI;
+  public:
+    ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) :
+      PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
+
+    MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+      bool is64 = getPointerSize() == 8;
+      return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
+    }
+  };
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions &Options) {
+  if (TT.isOSDarwin())
+    return new DarwinPPCAsmBackend(T);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  bool IsLittleEndian = TT.getArch() == Triple::ppc64le;
+  return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
new file mode 100644
index 000000000000..fd279c60f3f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -0,0 +1,425 @@
+//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class PPCELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+  protected:
+    unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                          const MCFixup &Fixup, bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                                 unsigned Type) const override;
+  };
+}
+
+PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+  : MCELFObjectTargetWriter(Is64Bit, OSABI,
+                            Is64Bit ?  ELF::EM_PPC64 : ELF::EM_PPC,
+                            /*HasRelocationAddend*/ true) {}
+
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
+                                                     const MCFixup &Fixup) {
+  const MCExpr *Expr = Fixup.getValue();
+
+  if (Expr->getKind() != MCExpr::Target)
+    return Target.getAccessVariant();
+
+  switch (cast<PPCMCExpr>(Expr)->getKind()) {
+  case PPCMCExpr::VK_PPC_None:
+    return MCSymbolRefExpr::VK_None;
+  case PPCMCExpr::VK_PPC_LO:
+    return MCSymbolRefExpr::VK_PPC_LO;
+  case PPCMCExpr::VK_PPC_HI:
+    return MCSymbolRefExpr::VK_PPC_HI;
+  case PPCMCExpr::VK_PPC_HA:
+    return MCSymbolRefExpr::VK_PPC_HA;
+  case PPCMCExpr::VK_PPC_HIGHERA:
+    return MCSymbolRefExpr::VK_PPC_HIGHERA;
+  case PPCMCExpr::VK_PPC_HIGHER:
+    return MCSymbolRefExpr::VK_PPC_HIGHER;
+  case PPCMCExpr::VK_PPC_HIGHEST:
+    return MCSymbolRefExpr::VK_PPC_HIGHEST;
+  case PPCMCExpr::VK_PPC_HIGHESTA:
+    return MCSymbolRefExpr::VK_PPC_HIGHESTA;
+  }
+  llvm_unreachable("unknown PPCMCExpr kind");
+}
+
+unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
+
+  // determine the type of the relocation
+  unsigned Type;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented");
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL24;
+        break;
+      case MCSymbolRefExpr::VK_PLT:
+        Type = ELF::R_PPC_PLTREL24;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LOCAL:
+        Type = ELF::R_PPC_LOCAL24PC;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_brcond14:
+    case PPC::fixup_ppc_brcond14abs:
+      Type = ELF::R_PPC_REL14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_REL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_REL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = ELF::R_PPC_REL16_HA;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_half16ds:
+      Target.print(errs());
+      errs() << '\n';
+      report_fatal_error("Invalid PC-relative half16ds relocation");
+    case FK_Data_4:
+    case FK_PCRel_4:
+      Type = ELF::R_PPC_REL32;
+      break;
+    case FK_Data_8:
+    case FK_PCRel_8:
+      Type = ELF::R_PPC64_REL64;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+    case PPC::fixup_ppc_br24abs:
+      Type = ELF::R_PPC_ADDR24;
+      break;
+    case PPC::fixup_ppc_brcond14abs:
+      Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_ADDR16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_ADDR16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_ADDR16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = ELF::R_PPC_ADDR16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHER:
+        Type = ELF::R_PPC64_ADDR16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHERA:
+        Type = ELF::R_PPC64_ADDR16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHEST:
+        Type = ELF::R_PPC64_ADDR16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+        Type = ELF::R_PPC64_ADDR16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC_GOT16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC_GOT16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_HI:
+        Type = ELF::R_PPC_GOT16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_HA:
+        Type = ELF::R_PPC_GOT16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
+        Type = ELF::R_PPC64_TOC16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
+        Type = ELF::R_PPC64_TOC16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_HI:
+        Type = ELF::R_PPC64_TOC16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_HA:
+        Type = ELF::R_PPC64_TOC16_HA;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        Type = ELF::R_PPC_TPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC_TPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HI:
+        Type = ELF::R_PPC_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+        Type = ELF::R_PPC_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
+        Type = ELF::R_PPC64_TPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
+        Type = ELF::R_PPC64_TPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
+        Type = ELF::R_PPC64_TPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
+        Type = ELF::R_PPC64_TPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
+        Type = ELF::R_PPC64_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+        Type = ELF::R_PPC64_DTPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
+        Type = ELF::R_PPC64_DTPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
+        Type = ELF::R_PPC64_DTPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSGD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSGD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
+        Type = ELF::R_PPC64_GOT_TLSGD16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSLD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSLD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
+        Type = ELF::R_PPC64_GOT_TLSLD16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+        /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI:
+        Type = ELF::R_PPC64_GOT_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA:
+        Type = ELF::R_PPC64_GOT_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HA;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_half16ds:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC64_ADDR16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC64_GOT16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC64_GOT16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
+        Type = ELF::R_PPC64_TOC16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
+        Type = ELF::R_PPC64_TOC16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        Type = ELF::R_PPC64_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC64_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_nofixup:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TLSGD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSGD;
+        else
+          Type = ELF::R_PPC_TLSGD;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TLSLD:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSLD;
+        else
+          Type = ELF::R_PPC_TLSLD;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TLS:
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLS;
+        else
+          Type = ELF::R_PPC_TLS;
+        break;
+      }
+      break;
+    case FK_Data_8:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TOCBASE:
+        Type = ELF::R_PPC64_TOC;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR64;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPMOD:
+        Type = ELF::R_PPC64_DTPMOD64;
+        break;
+      case MCSymbolRefExpr::VK_TPREL:
+        Type = ELF::R_PPC64_TPREL64;
+        break;
+      case MCSymbolRefExpr::VK_DTPREL:
+        Type = ELF::R_PPC64_DTPREL64;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      Type = ELF::R_PPC_ADDR32;
+      break;
+    case FK_Data_2:
+      Type = ELF::R_PPC_ADDR16;
+      break;
+    }
+  }
+  return Type;
+}
+
+bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                 unsigned Type) const {
+  switch (Type) {
+    default:
+      return false;
+
+    case ELF::R_PPC_REL24:
+      // If the target symbol has a local entry point, we must keep the
+      // target symbol to preserve that information for the linker.
+      // The "other" values are stored in the last 6 bits of the second byte.
+      // The traditional defines for STO values assume the full byte and thus
+      // the shift to pack it.
+      unsigned Other = cast<MCSymbolELF>(Sym).getOther() << 2;
+      return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0;
+  }
+}
+
+MCObjectWriter *llvm::createPPCELFObjectWriter(raw_pwrite_stream &OS,
+                                               bool Is64Bit,
+                                               bool IsLittleEndian,
+                                               uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
new file mode 100644
index 000000000000..ae43e59d3cb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -0,0 +1,56 @@
+//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+enum Fixups {
+  // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
+  // and 'bl'.
+  fixup_ppc_br24 = FirstTargetFixupKind,
+  
+  /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14,
+  
+  /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
+  /// like 'ba' and 'bla'.
+  fixup_ppc_br24abs,
+
+  /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14abs,
+
+  /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
+  /// or ha16(_foo) for instrs like 'li' or 'addis'.
+  fixup_ppc_half16,
+  
+  /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
+  /// implied 2 zero bits for instrs like 'std'.
+  fixup_ppc_half16ds,
+
+  /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
+  /// to __tls_get_addr for the TLS general and local dynamic models,
+  /// or inserts the thread-pointer register number.
+  fixup_ppc_nofixup,
+  
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
new file mode 100644
index 000000000000..d8fab5b7c01a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -0,0 +1,83 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MCAsmInfoDarwin properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void PPCMCAsmInfoDarwin::anchor() { }
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = false;
+
+  CommentString = ";";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  if (!is64Bit)
+    Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
+
+  AssemblerDialect = 1;           // New-Style mnemonics.
+  SupportsDebugInformation= true; // Debug information.
+
+  // The installed assembler for OSX < 10.6 lacks some directives.
+  // FIXME: this should really be a check on the assembler characteristics
+  // rather than OS version
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+    HasWeakDefCanBeHiddenDirective = false;
+
+  UseIntegratedAssembler = true;
+}
+
+void PPCELFMCAsmInfo::anchor() { }
+
+PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
+  // FIXME: This is not always needed. For example, it is not needed in the
+  // v2 abi.
+  NeedsLocalForSize = true;
+
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = T.getArch() == Triple::ppc64le;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "#";
+
+  // Uses '.section' before '.bss' directive
+  UsesELFSectionDirectiveForBSS = true;  
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  DollarIsPC = true;
+
+  // Set up DWARF directives
+  MinInstAlignment = 4;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+    
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
+  AssemblerDialect = 1;           // New-Style mnemonics.
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+
+  UseIntegratedAssembler = true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
new file mode 100644
index 000000000000..e252ac944d40
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -0,0 +1,39 @@
+//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MCAsmInfoDarwin class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+  virtual void anchor();
+
+public:
+  explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &);
+};
+
+class PPCELFMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
new file mode 100644
index 000000000000..017d21af08a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class PPCMCCodeEmitter : public MCCodeEmitter {
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
+  void operator=(const PPCMCCodeEmitter &) = delete;
+
+  const MCInstrInfo &MCII;
+  const MCContext &CTX;
+  bool IsLittleEndian;
+
+public:
+  PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : MCII(mcii), CTX(ctx),
+        IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
+
+  ~PPCMCCodeEmitter() override {}
+
+  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+  unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override {
+    verifyInstructionPredicates(MI,
+                                computeAvailableFeatures(STI.getFeatureBits()));
+
+    unsigned Opcode = MI.getOpcode();
+    const MCInstrDesc &Desc = MCII.get(Opcode);
+
+    uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+    // Output the constant in big/little endian byte order.
+    unsigned Size = Desc.getSize();
+    switch (Size) {
+    case 0:
+      break;
+    case 4:
+      if (IsLittleEndian) {
+        support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
+      } else {
+        support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
+      }
+      break;
+    case 8:
+      // If we emit a pair of instructions, the first one is
+      // always in the top 32 bits, even on little-endian.
+      if (IsLittleEndian) {
+        uint64_t Swapped = (Bits << 32) | (Bits >> 32);
+        support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped);
+      } else {
+        support::endian::Writer<support::big>(OS).write<uint64_t>(Bits);
+      }
+      break;
+    default:
+      llvm_unreachable ("Invalid instruction size");
+    }
+    
+    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+  }
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+  
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            MCContext &Ctx) {
+  return new PPCMCCodeEmitter(MCII, Ctx);
+}
+
+unsigned PPCMCCodeEmitter::
+getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::
+getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24abs));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::
+getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14abs));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the immediate field.
+  Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::create(IsLittleEndian? 0 : 2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16ds));
+  return RegBits;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memrix16, which has the low 12-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm());
+
+  return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits;
+}
+
+unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI)
+                                              const {
+  // Encode (imm, reg) as a spe8dis, which has the low 5-bits of (imm / 8)
+  // as the displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm());
+  uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 3;
+  return reverseBits(Imm | RegBits) >> 22;
+}
+
+
+unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI)
+                                              const {
+  // Encode (imm, reg) as a spe4dis, which has the low 5-bits of (imm / 4)
+  // as the displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm());
+  uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 2;
+  return reverseBits(Imm | RegBits) >> 22;
+}
+
+
+unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI)
+                                              const {
+  // Encode (imm, reg) as a spe2dis, which has the low 5-bits of (imm / 2)
+  // as the displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 5;
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert(MO.isImm());
+  uint32_t Imm = getMachineOpValue(MI, MO, Fixups, STI) >> 1;
+  return reverseBits(Imm | RegBits) >> 22;
+}
+
+
+unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI);
+  
+  // Add a fixup for the TLS register, which simply provides a relocation
+  // hint to the linker that this statement is part of a relocation sequence.
+  // Return the thread-pointer register's encoding.
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  const Triple &TT = STI.getTargetTriple();
+  bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le;
+  return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2);
+}
+
+unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const {
+  // For special TLS calls, we need two fixups; one for the branch target
+  // (__tls_get_addr), which we create via getDirectBrEncoding as usual,
+  // and one for the TLSGD or TLSLD symbol, which is emitted here.
+  const MCOperand &MO = MI.getOperand(OpNo+1);
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  return getDirectBrEncoding(MI, OpNo, Fixups, STI);
+}
+
+unsigned PPCMCCodeEmitter::
+get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+unsigned PPCMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg()) {
+    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    unsigned Reg = MO.getReg();
+    unsigned Encode = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+    if ((MCII.get(MI.getOpcode()).TSFlags & PPCII::UseVSXReg))
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Encode += 32;
+
+    return Encode;
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "PPCGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
new file mode 100644
index 000000000000..6b97d4c1456b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -0,0 +1,150 @@
+//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFixupKinds.h"
+#include "PPCMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcmcexpr"
+
+const PPCMCExpr*
+PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                  bool isDarwin, MCContext &Ctx) {
+  return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin);
+}
+
+void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (isDarwinSyntax()) {
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "lo16"; break;
+    case VK_PPC_HI: OS << "hi16"; break;
+    case VK_PPC_HA: OS << "ha16"; break;
+    }
+
+    OS << '(';
+    getSubExpr()->print(OS, MAI);
+    OS << ')';
+  } else {
+    getSubExpr()->print(OS, MAI);
+
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "@l"; break;
+    case VK_PPC_HI: OS << "@h"; break;
+    case VK_PPC_HA: OS << "@ha"; break;
+    case VK_PPC_HIGHER: OS << "@higher"; break;
+    case VK_PPC_HIGHERA: OS << "@highera"; break;
+    case VK_PPC_HIGHEST: OS << "@highest"; break;
+    case VK_PPC_HIGHESTA: OS << "@highesta"; break;
+    }
+  }
+}
+
+bool
+PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
+  MCValue Value;
+
+  if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
+    return false;
+
+  if (!Value.isAbsolute())
+    return false;
+
+  Res = evaluateAsInt64(Value.getConstant());
+  return true;
+}
+
+int64_t
+PPCMCExpr::evaluateAsInt64(int64_t Value) const {
+  switch (Kind) {
+    case VK_PPC_LO:
+      return Value & 0xffff;
+    case VK_PPC_HI:
+      return (Value >> 16) & 0xffff;
+    case VK_PPC_HA:
+      return ((Value + 0x8000) >> 16) & 0xffff;
+    case VK_PPC_HIGHER:
+      return (Value >> 32) & 0xffff;
+    case VK_PPC_HIGHERA:
+      return ((Value + 0x8000) >> 32) & 0xffff;
+    case VK_PPC_HIGHEST:
+      return (Value >> 48) & 0xffff;
+    case VK_PPC_HIGHESTA:
+      return ((Value + 0x8000) >> 48) & 0xffff;
+    case VK_PPC_None:
+      break;
+  }
+  llvm_unreachable("Invalid kind!");
+}
+
+bool
+PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout,
+                                     const MCFixup *Fixup) const {
+  MCValue Value;
+
+  if (!getSubExpr()->evaluateAsRelocatable(Value, Layout, Fixup))
+    return false;
+
+  if (Value.isAbsolute()) {
+    int64_t Result = evaluateAsInt64(Value.getConstant());
+    if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) &&
+        (Result >= 0x8000))
+      return false;
+    Res = MCValue::get(Result);
+  } else {
+    if (!Layout)
+      return false;
+
+    MCContext &Context = Layout->getAssembler().getContext();
+    const MCSymbolRefExpr *Sym = Value.getSymA();
+    MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+    if (Modifier != MCSymbolRefExpr::VK_None)
+      return false;
+    switch (Kind) {
+      default:
+        llvm_unreachable("Invalid kind!");
+      case VK_PPC_LO:
+        Modifier = MCSymbolRefExpr::VK_PPC_LO;
+        break;
+      case VK_PPC_HI:
+        Modifier = MCSymbolRefExpr::VK_PPC_HI;
+        break;
+      case VK_PPC_HA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HA;
+        break;
+      case VK_PPC_HIGHERA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
+        break;
+      case VK_PPC_HIGHER:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHER;
+        break;
+      case VK_PPC_HIGHEST:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST;
+        break;
+      case VK_PPC_HIGHESTA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA;
+        break;
+    }
+    Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context);
+    Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+  }
+
+  return true;
+}
+
+void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
new file mode 100644
index 000000000000..d42a111cc43e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -0,0 +1,100 @@
+//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
+
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+namespace llvm {
+
+class PPCMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_PPC_None,
+    VK_PPC_LO,
+    VK_PPC_HI,
+    VK_PPC_HA,
+    VK_PPC_HIGHER,
+    VK_PPC_HIGHERA,
+    VK_PPC_HIGHEST,
+    VK_PPC_HIGHESTA
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+  bool IsDarwin;
+
+  int64_t evaluateAsInt64(int64_t Value) const;
+
+  explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin)
+      : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                 bool isDarwin, MCContext &Ctx);
+
+  static const PPCMCExpr *createLo(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return create(VK_PPC_LO, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *createHi(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return create(VK_PPC_HI, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *createHa(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return create(VK_PPC_HA, Expr, isDarwin, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// isDarwinSyntax - True if expression is to be printed using Darwin syntax.
+  bool isDarwinSyntax() const { return IsDarwin; }
+
+
+  /// @}
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  // There are no TLS PPCMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
+
+  bool evaluateAsConstant(int64_t &Res) const;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
new file mode 100644
index 000000000000..bbd10e5b260f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -0,0 +1,264 @@
+//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCTargetDesc.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "PPCGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "PPCGenRegisterInfo.inc"
+
+// Pin the vtable to this file.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+static MCInstrInfo *createPPCMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitPPCMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) {
+  bool isPPC64 =
+      (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
+  unsigned Flavour = isPPC64 ? 0 : 1;
+  unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+  return X;
+}
+
+static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT,
+                                                 StringRef CPU, StringRef FS) {
+  return createPPCMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+                  TheTriple.getArch() == Triple::ppc64le);
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
+  else
+    MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple);
+
+  // Initial state of the frame pointer is R1.
+  unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  if (CM == CodeModel::Default) {
+    if (!TT.isOSDarwin() &&
+        (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le))
+      CM = CodeModel::Medium;
+  }
+}
+
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
+      : PPCTargetStreamer(S), OS(OS) {}
+  void emitTCEntry(const MCSymbol &S) override {
+    OS << "\t.tc ";
+    OS << S.getName();
+    OS << "[TC],";
+    OS << S.getName();
+    OS << '\n';
+  }
+  void emitMachine(StringRef CPU) override {
+    OS << "\t.machine " << CPU << '\n';
+  }
+  void emitAbiVersion(int AbiVersion) override {
+    OS << "\t.abiversion " << AbiVersion << '\n';
+  }
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+    const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+
+    OS << "\t.localentry\t";
+    S->print(OS, MAI);
+    OS << ", ";
+    LocalOffset->print(OS, MAI);
+    OS << '\n';
+  }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer {
+public:
+  PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
+  void emitTCEntry(const MCSymbol &S) override {
+    // Creates a R_PPC64_TOC relocation
+    Streamer.EmitValueToAlignment(8);
+    Streamer.EmitSymbolValue(&S, 8);
+  }
+  void emitMachine(StringRef CPU) override {
+    // FIXME: Is there anything to do in here or does this directive only
+    // limit the parser?
+  }
+  void emitAbiVersion(int AbiVersion) override {
+    MCAssembler &MCA = getStreamer().getAssembler();
+    unsigned Flags = MCA.getELFHeaderEFlags();
+    Flags &= ~ELF::EF_PPC64_ABI;
+    Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
+    MCA.setELFHeaderEFlags(Flags);
+  }
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+    MCAssembler &MCA = getStreamer().getAssembler();
+
+    int64_t Res;
+    if (!LocalOffset->evaluateAsAbsolute(Res, MCA))
+      report_fatal_error(".localentry expression must be absolute.");
+
+    unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res);
+    if (Res != ELF::decodePPC64LocalEntryOffset(Encoded))
+      report_fatal_error(".localentry expression cannot be encoded.");
+
+    unsigned Other = S->getOther();
+    Other &= ~ELF::STO_PPC64_LOCAL_MASK;
+    Other |= Encoded;
+    S->setOther(Other);
+
+    // For GAS compatibility, unless we already saw a .abiversion directive,
+    // set e_flags to indicate ELFv2 ABI.
+    unsigned Flags = MCA.getELFHeaderEFlags();
+    if ((Flags & ELF::EF_PPC64_ABI) == 0)
+      MCA.setELFHeaderEFlags(Flags | 2);
+  }
+  void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
+    auto *Symbol = cast<MCSymbolELF>(S);
+    // When encoding an assignment to set symbol A to symbol B, also copy
+    // the st_other bits encoding the local entry point offset.
+    if (Value->getKind() != MCExpr::SymbolRef)
+      return;
+    const auto &RhsSym = cast<MCSymbolELF>(
+        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
+    unsigned Other = Symbol->getOther();
+    Other &= ~ELF::STO_PPC64_LOCAL_MASK;
+    Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
+    Symbol->setOther(Other);
+  }
+};
+
+class PPCTargetMachOStreamer : public PPCTargetStreamer {
+public:
+  PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+  void emitTCEntry(const MCSymbol &S) override {
+    llvm_unreachable("Unknown pseudo-op: .tc");
+  }
+  void emitMachine(StringRef CPU) override {
+    // FIXME: We should update the CPUType, CPUSubType in the Object file if
+    // the new values are different from the defaults.
+  }
+  void emitAbiVersion(int AbiVersion) override {
+    llvm_unreachable("Unknown pseudo-op: .abiversion");
+  }
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+    llvm_unreachable("Unknown pseudo-op: .localentry");
+  }
+};
+}
+
+static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new PPCTargetAsmStreamer(S, OS);
+}
+
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new PPCTargetELFStreamer(S);
+  return new PPCTargetMachOStreamer(S);
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  return new PPCInstPrinter(MAI, MII, MRI, T.isOSDarwin());
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() {
+  for (Target *T :
+       {&getThePPC32Target(), &getThePPC64Target(), &getThePPC64LETarget()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo);
+
+    // Register the MC codegen info.
+    TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createPPCMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createPPCMCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo);
+
+    // Register the MC Code Emitter
+    TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter);
+
+    // Register the asm backend.
+    TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend);
+
+    // Register the object target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(*T,
+                                                 createObjectTargetStreamer);
+
+    // Register the asm target streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createPPCMCInstPrinter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
new file mode 100644
index 000000000000..0989e0c8e268
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -0,0 +1,106 @@
+//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getThePPC32Target();
+Target &getThePPC64Target();
+Target &getThePPC64LETarget();
+
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  const Triple &TT, StringRef CPU,
+                                  const MCTargetOptions &Options);
+
+/// Construct an PPC ELF object writer.
+MCObjectWriter *createPPCELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                         bool IsLittleEndian, uint8_t OSABI);
+/// Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// Returns true iff Val consists of one contiguous run of 1s with any number of
+/// 0s on either side.  The 1s are allowed to wrap from LSB to MSB, so
+/// 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is not,
+/// since all 1s are not contiguous.
+static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (!Val)
+    return false;
+
+  if (isShiftedMask_32(Val)) {
+    // look for the first non-zero bit
+    MB = countLeadingZeros(Val);
+    // look for the first zero bit after the run of ones
+    ME = countLeadingZeros((Val - 1) ^ Val);
+    return true;
+  } else {
+    Val = ~Val; // invert mask
+    if (isShiftedMask_32(Val)) {
+      // effectively look for the first zero bit
+      ME = countLeadingZeros(Val) - 1;
+      // effectively look for the first one bit after the run of zeros
+      MB = countLeadingZeros((Val - 1) ^ Val) + 1;
+      return true;
+    }
+  }
+  // no run present
+  return false;
+}
+
+} // End llvm namespace
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+// Defines symbolic names for PowerPC registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "PPCGenRegisterInfo.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "PPCGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 000000000000..1f38a8c947e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,383 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+  bool recordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup, MCValue Target,
+                                 unsigned Log2Size, uint64_t &FixedValue);
+
+  void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                           const MCAsmLayout &Layout,
+                           const MCFragment *Fragment, const MCFixup &Fixup,
+                           MCValue Target, uint64_t &FixedValue);
+
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {
+    if (Writer->is64Bit()) {
+      report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
+    } else
+      RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_br24:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_Data_8:
+    return 3;
+  }
+  return 0;
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocType().
+static unsigned getRelocType(const MCValue &Target,
+                             const MCFixupKind FixupKind, // from
+                                                          // Fixup.getKind()
+                             const bool IsPCRel) {
+  const MCSymbolRefExpr::VariantKind Modifier =
+      Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+                          : Target.getSymA()->getKind();
+  // determine the type of the relocation
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+  if (IsPCRel) { // relative to PC
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (relative)");
+    case PPC::fixup_ppc_br24:
+      Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+      break;
+    case PPC::fixup_ppc_brcond14:
+      Type = MachO::PPC_RELOC_BR14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (absolute)!");
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      break;
+    case FK_Data_2:
+      break;
+    }
+  }
+  return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE,
+                               const uint32_t FixupOffset, const uint32_t Index,
+                               const unsigned IsPCRel, const unsigned Log2Size,
+                               const unsigned IsExtern, const unsigned Type) {
+  MRE.r_word0 = FixupOffset;
+  // The bitfield offsets that work (as determined by trial-and-error)
+  // are different than what is documented in the mach-o manuals.
+  // This appears to be an endianness issue; reversing the order of the
+  // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+  // breaks x86/ARM assembly).
+  MRE.r_word1 = ((Index << 8) |    // was << 0
+                 (IsPCRel << 7) |  // was << 24
+                 (Log2Size << 5) | // was << 25
+                 (IsExtern << 4) | // was << 27
+                 (Type << 0));     // was << 28
+}
+
+static void
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+                            const uint32_t Addr, const unsigned Type,
+                            const unsigned Log2Size, const unsigned IsPCRel,
+                            const uint32_t Value2) {
+  // For notes on bitfield positions and endianness, see:
+  // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+  MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
+                 (IsPCRel << 30) | MachO::R_SCATTERED);
+  MRE.r_word1 = Value2;
+}
+
+/// Compute fixup offset (address).
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+                               const MCFragment *Fragment,
+                               const MCFixup &Fixup) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  // On Mach-O, ppc_fixup_half16 relocations must refer to the
+  // start of the instruction, not the second halfword, as ELF does
+  if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+    FixupOffset &= ~uint32_t(3);
+  return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::recordScatteredRelocation
+/// and ARMMachObjectWriter::recordScatteredRelocation
+bool PPCMachObjectWriter::recordScatteredRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    unsigned Log2Size, uint64_t &FixedValue) {
+  // caller already computes these, can we just pass and reuse?
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  const MCFixupKind FK = Fixup.getKind();
+  const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+  // Is this a local or SECTDIFF relocation entry?
+  // SECTDIFF relocation entries have symbol subtractions,
+  // and require two entries, the first for the add-symbol value,
+  // the second for the subtract-symbol value.
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+  if (!A->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbol *SB = &B->getSymbol();
+
+    if (!SB->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // FIXME: is Type correct? see include/llvm/Support/MachO.h
+    Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+  }
+  // FIXME: does FixedValue get used??
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::PPC_RELOC_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+    // X86 had this piece, but ARM does not
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                  Twine("Section too large, can't encode "
+                                        "r_address (") +
+                                      Buffer + ") into 24 bits of scattered "
+                                               "relocation entry.");
+      return false;
+    }
+
+    // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+    // see PPCMCExpr::evaluateAsRelocatableImpl()
+    uint32_t other_half = 0;
+    switch (Type) {
+    case MachO::PPC_RELOC_LO16_SECTDIFF:
+      other_half = (FixedValue >> 16) & 0xffff;
+      // applyFixupOffset longer extracts the high part because it now assumes
+      // this was already done.
+      // It looks like this is not true for the FixedValue needed with Mach-O
+      // relocs.
+      // So we need to adjust FixedValue again here.
+      FixedValue &= 0xffff;
+      break;
+    case MachO::PPC_RELOC_HA16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue =
+          ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
+      break;
+    case MachO::PPC_RELOC_HI16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue = (FixedValue >> 16) & 0xffff;
+      break;
+    default:
+      llvm_unreachable("Invalid PPC scattered relocation type.");
+      break;
+    }
+
+    MachO::any_relocation_info MRE;
+    makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+                                Log2Size, IsPCRel, Value2);
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
+  }
+  MachO::any_relocation_info MRE;
+  makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  const MCFixupKind FK = Fixup.getKind(); // unsigned
+  const unsigned Log2Size = getFixupKindLog2Size(FK);
+  const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB() &&
+      // Q: are branch targets ever scattered?
+      RelocType != MachO::PPC_RELOC_BR24 &&
+      RelocType != MachO::PPC_RELOC_BR14) {
+    recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              Log2Size, FixedValue);
+    return;
+  }
+
+  // this doesn't seem right for RIT_PPC_BR24
+  // Get the symbol data, if any.
+  const MCSymbol *A = nullptr;
+  if (Target.getSymA())
+    A = &Target.getSymA()->getSymbol();
+
+  // See <reloc.h>.
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  unsigned Index = 0;
+  unsigned Type = RelocType;
+
+  const MCSymbol *RelSymbol = nullptr;
+  if (Target.isAbsolute()) { // constant
+                             // SymbolNum of 0 indicates the absolute section.
+                             //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+    // the above line stolen from ARM, not sure
+  } else {
+    // Resolve constant variables.
+    if (A->isVariable()) {
+      int64_t Res;
+      if (A->getVariableValue()->evaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(*A)) {
+      RelSymbol = A;
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!A->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(*A);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSection &Sec = A->getSection();
+      Index = Sec.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&Sec);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_pwrite_stream &OS,
+                                                bool Is64Bit, uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
new file mode 100644
index 000000000000..c2987b641c04
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -0,0 +1,86 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_UN_PLUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS;
+
+  // Simple predicates for single condition-register bits.
+  case PPC::PRED_BIT_SET:   return PPC::PRED_BIT_UNSET;
+  case PPC::PRED_BIT_UNSET: return PPC::PRED_BIT_SET;
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
+PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  case PPC::PRED_EQ: return PPC::PRED_EQ;
+  case PPC::PRED_NE: return PPC::PRED_NE;
+  case PPC::PRED_LT: return PPC::PRED_GT;
+  case PPC::PRED_GE: return PPC::PRED_LE;
+  case PPC::PRED_GT: return PPC::PRED_LT;
+  case PPC::PRED_LE: return PPC::PRED_GE;
+  case PPC::PRED_NU: return PPC::PRED_NU;
+  case PPC::PRED_UN: return PPC::PRED_UN;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_NU_MINUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS;
+
+  case PPC::PRED_BIT_SET:
+  case PPC::PRED_BIT_UNSET:
+    llvm_unreachable("Invalid use of bit predicate code");
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
new file mode 100644
index 000000000000..acea600fbb0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -0,0 +1,76 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_LT       = (0 << 5) | 12,
+    PRED_LE       = (1 << 5) |  4,
+    PRED_EQ       = (2 << 5) | 12,
+    PRED_GE       = (0 << 5) |  4,
+    PRED_GT       = (1 << 5) | 12,
+    PRED_NE       = (2 << 5) |  4,
+    PRED_UN       = (3 << 5) | 12,
+    PRED_NU       = (3 << 5) |  4,
+    PRED_LT_MINUS = (0 << 5) | 14,
+    PRED_LE_MINUS = (1 << 5) |  6,
+    PRED_EQ_MINUS = (2 << 5) | 14,
+    PRED_GE_MINUS = (0 << 5) |  6,
+    PRED_GT_MINUS = (1 << 5) | 14,
+    PRED_NE_MINUS = (2 << 5) |  6,
+    PRED_UN_MINUS = (3 << 5) | 14,
+    PRED_NU_MINUS = (3 << 5) |  6,
+    PRED_LT_PLUS  = (0 << 5) | 15,
+    PRED_LE_PLUS  = (1 << 5) |  7,
+    PRED_EQ_PLUS  = (2 << 5) | 15,
+    PRED_GE_PLUS  = (0 << 5) |  7,
+    PRED_GT_PLUS  = (1 << 5) | 15,
+    PRED_NE_PLUS  = (2 << 5) |  7,
+    PRED_UN_PLUS  = (3 << 5) | 15,
+    PRED_NU_PLUS  = (3 << 5) |  7,
+
+    // When dealing with individual condition-register bits, we have simple set
+    // and unset predicates.
+    PRED_BIT_SET =   1024,
+    PRED_BIT_UNSET = 1025
+  };
+  
+  // Bit for branch taken (plus) or not-taken (minus) hint
+  enum BranchHintBit {
+    BR_NO_HINT       = 0x0,
+    BR_NONTAKEN_HINT = 0x2,
+    BR_TAKEN_HINT    = 0x3,
+    BR_HINT_MASK     = 0X3
+  };
+
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+
+  /// Assume the condition register is set by MI(a,b), return the predicate if
+  /// we modify the instructions such that condition register is set by MI(b,a).
+  Predicate getSwappedPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
new file mode 100644
index 000000000000..aea022f88766
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -0,0 +1,808 @@
+//===- P9InstrResources.td - P9 Instruction Resource Defs  -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines resources required by some of P9 instruction. This is part
+// P9 processor model used for instruction scheduling. Not every instruction
+// is listed here. Instructions in this file belong to itinerary classes that
+// have instructions with different resource requirements.
+//
+//===----------------------------------------------------------------------===//
+
+
+def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C,
+              DISP_1C, DISP_1C],
+      (instrs
+    VADDCUW,
+    VADDUBM,
+    VADDUDM,
+    VADDUHM,
+    VADDUWM,
+    VAND,
+    VANDC,
+    VCMPEQUB,
+    VCMPEQUBo,
+    VCMPEQUD,
+    VCMPEQUDo,
+    VCMPEQUH,
+    VCMPEQUHo,
+    VCMPEQUW,
+    VCMPEQUWo,
+    VCMPGTSB,
+    VCMPGTSBo,
+    VCMPGTSD,
+    VCMPGTSDo,
+    VCMPGTSH,
+    VCMPGTSHo,
+    VCMPGTSW,
+    VCMPGTSWo,
+    VCMPGTUB,
+    VCMPGTUBo,
+    VCMPGTUD,
+    VCMPGTUDo,
+    VCMPGTUH,
+    VCMPGTUHo,
+    VCMPGTUW,
+    VCMPGTUWo,
+    VCMPNEB,
+    VCMPNEBo,
+    VCMPNEH,
+    VCMPNEHo,
+    VCMPNEW,
+    VCMPNEWo,
+    VCMPNEZB,
+    VCMPNEZBo,
+    VCMPNEZH,
+    VCMPNEZHo,
+    VCMPNEZW,
+    VCMPNEZWo,
+    VEQV,
+    VEXTSB2D,
+    VEXTSB2W,
+    VEXTSH2D,
+    VEXTSH2W,
+    VEXTSW2D,
+    VMRGEW,
+    VMRGOW,
+    VNAND,
+    VNEGD,
+    VNEGW,
+    VNOR,
+    VOR,
+    VORC,
+    VPOPCNTB,
+    VPOPCNTH,
+    VPOPCNTW,
+    VSEL,
+    VSUBCUW,
+    VSUBUBM,
+    VSUBUDM,
+    VSUBUHM,
+    VSUBUWM,
+    VXOR,
+    V_SET0B,
+    V_SET0H,
+    V_SET0,
+    XVABSDP,
+    XVABSSP,
+    XVCPSGNDP,
+    XVCPSGNSP,
+    XVIEXPDP,
+    XVNABSDP,
+    XVNABSSP,
+    XVNEGDP,
+    XVNEGSP,
+    XVXEXPDP,
+    XXLAND,
+    XXLANDC,
+    XXLEQV,
+    XXLNAND,
+    XXLNOR,
+    XXLOR,
+    XXLORf,
+    XXLORC,
+    XXLXOR,
+    XXSEL
+)>;
+
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSABSQP,
+    XSCPSGNQP,
+    XSIEXPQP,
+    XSNABSQP,
+    XSNEGQP,
+    XSXEXPQP,
+    XSABSDP,
+    XSCPSGNDP,
+    XSIEXPDP,
+    XSNABSDP,
+    XSNEGDP,
+    XSXEXPDP
+)>;
+
+def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+
+    VMINSB,
+    VMINSD,
+    VMINSH,
+    VMINSW,
+    VMINUB,
+    VMINUD,
+    VMINUH,
+    VMINUW,
+    VPOPCNTD,
+    VPRTYBD,
+    VPRTYBW,
+    VRLB,
+    VRLD,
+    VRLDMI,
+    VRLDNM,
+    VRLH,
+    VRLW,
+    VRLWMI,
+    VRLWNM,
+    VSHASIGMAD,
+    VSHASIGMAW,
+    VSLB,
+    VSLD,
+    VSLH,
+    VSLW,
+    VSRAB,
+    VSRAD,
+    VSRAH,
+    VSRAW,
+    VSRB,
+    VSRD,
+    VSRH,
+    VSRW,
+    VSUBSBS,
+    VSUBSHS,
+    VSUBSWS,
+    VSUBUBS,
+    VSUBUHS,
+    VSUBUWS,
+    XSCMPEQDP,
+    XSCMPEXPDP,
+    XSCMPGEDP,
+    XSCMPGTDP,
+    XSCMPODP,
+    XSCMPUDP,
+    XSCVSPDPN,
+    XSMAXCDP,
+    XSMAXDP,
+    XSMAXJDP,
+    XSMINCDP,
+    XSMINDP,
+    XSMINJDP,
+    XSTDIVDP,
+    XSTSQRTDP,
+    XSTSTDCDP,
+    XSTSTDCSP,
+    XSXSIGDP,
+    XVCMPEQDP,
+    XVCMPEQDPo,
+    XVCMPEQSP,
+    XVCMPEQSPo,
+    XVCMPGEDP,
+    XVCMPGEDPo,
+    XVCMPGESP,
+    XVCMPGESPo,
+    XVCMPGTDP,
+    XVCMPGTDPo,
+    XVCMPGTSP,
+    XVCMPGTSPo,
+    XVIEXPSP,
+    XVMAXDP,
+    XVMAXSP,
+    XVMINDP,
+    XVMINSP,
+    XVTDIVDP,
+    XVTDIVSP,
+    XVTSQRTDP,
+    XVTSQRTSP,
+    XVTSTDCDP,
+    XVTSTDCSP,
+    XVXEXPSP,
+    XVXSIGDP,
+    XVXSIGSP
+)>;
+
+def : InstRW<[P9_ALUE_4C, P9_ALUO_4C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    VABSDUB,
+    VABSDUH,
+    VABSDUW,
+    VADDSBS,
+    VADDSHS,
+    VADDSWS,
+    VADDUBS,
+    VADDUHS,
+    VADDUWS,
+    VAVGSB,
+    VAVGSH,
+    VAVGSW,
+    VAVGUB,
+    VAVGUH,
+    VAVGUW,
+    VBPERMD,
+    VCLZB,
+    VCLZD,
+    VCLZH,
+    VCLZW,
+    VCMPBFP,
+    VCMPBFPo,
+    VCMPGTFP,
+    VCMPGTFPo,
+    VCTZB,
+    VCTZD,
+    VCTZH,
+    VCTZW,
+    VMAXFP,
+    VMAXSB,
+    VMAXSD,
+    VMAXSH,
+    VMAXSW,
+    VMAXUB,
+    VMAXUD,
+    VMAXUH,
+    VMAXUW,
+    VMINFP,
+    VCMPEQFP,
+    VCMPEQFPo,
+    VCMPGEFP,
+    VCMPGEFPo
+)>;
+
+def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    VADDFP,
+    VCTSXS,
+    VCTSXS_0,
+    VCTUXS,
+    VCTUXS_0,
+    VEXPTEFP,
+    VLOGEFP,
+    VMADDFP,
+    VMHADDSHS,
+    VNMSUBFP,
+    VREFP,
+    VRFIM,
+    VRFIN,
+    VRFIP,
+    VRFIZ,
+    VRSQRTEFP,
+    VSUBFP,
+    XVADDDP,
+    XVADDSP,
+    XVCVDPSP,
+    XVCVDPSXDS,
+    XVCVDPSXWS,
+    XVCVDPUXDS,
+    XVCVDPUXWS,
+    XVCVHPSP,
+    XVCVSPDP,
+    XVCVSPHP,
+    XVCVSPSXDS,
+    XVCVSPSXWS,
+    XVCVSPUXDS,
+    XVCVSPUXWS,
+    XVCVSXDDP,
+    XVCVSXDSP,
+    XVCVSXWDP,
+    XVCVSXWSP,
+    XVCVUXDDP,
+    XVCVUXDSP,
+    XVCVUXWDP,
+    XVCVUXWSP,
+    XVMADDADP,
+    XVMADDASP,
+    XVMADDMDP,
+    XVMADDMSP,
+    XVMSUBADP,
+    XVMSUBASP,
+    XVMSUBMDP,
+    XVMSUBMSP,
+    XVMULDP,
+    XVMULSP,
+    XVNMADDADP,
+    XVNMADDASP,
+    XVNMADDMDP,
+    XVNMADDMSP,
+    XVNMSUBADP,
+    XVNMSUBASP,
+    XVNMSUBMDP,
+    XVNMSUBMSP,
+    XVRDPI,
+    XVRDPIC,
+    XVRDPIM,
+    XVRDPIP,
+    XVRDPIZ,
+    XVREDP,
+    XVRESP,
+    XVRSPI,
+    XVRSPIC,
+    XVRSPIM,
+    XVRSPIP,
+    XVRSPIZ,
+    XVRSQRTEDP,
+    XVRSQRTESP,
+    XVSUBDP,
+    XVSUBSP,
+    VCFSX,
+    VCFSX_0,
+    VCFUX,
+    VCFUX_0,
+    VMHRADDSHS,
+    VMLADDUHM,
+    VMSUMMBM,
+    VMSUMSHM,
+    VMSUMSHS,
+    VMSUMUBM,
+    VMSUMUHM,
+    VMSUMUHS,
+    VMULESB,
+    VMULESH,
+    VMULESW,
+    VMULEUB,
+    VMULEUH,
+    VMULEUW,
+    VMULOSB,
+    VMULOSH,
+    VMULOSW,
+    VMULOUB,
+    VMULOUH,
+    VMULOUW,
+    VMULUWM,
+    VSUM2SWS,
+    VSUM4SBS,
+    VSUM4SHS,
+    VSUM4UBS,
+    VSUMSWS
+)>;
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSMADDADP,
+    XSMADDASP,
+    XSMADDMDP,
+    XSMADDMSP,
+    XSMSUBADP,
+    XSMSUBASP,
+    XSMSUBMDP,
+    XSMSUBMSP,
+    XSMULDP,
+    XSMULSP,
+    XSNMADDADP,
+    XSNMADDASP,
+    XSNMADDMDP,
+    XSNMADDMSP,
+    XSNMSUBADP,
+    XSNMSUBASP,
+    XSNMSUBMDP,
+    XSNMSUBMSP
+)>;
+
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSADDDP,
+    XSADDSP,
+    XSCVDPHP,
+    XSCVDPSP,
+    XSCVDPSXDS,
+    XSCVDPSXWS,
+    XSCVDPUXDS,
+    XSCVDPUXWS,
+    XSCVHPDP,
+    XSCVSPDP,
+    XSCVSXDDP,
+    XSCVSXDSP,
+    XSCVUXDDP,
+    XSCVUXDSP,
+    XSRDPI,
+    XSRDPIC,
+    XSRDPIM,
+    XSRDPIP,
+    XSRDPIZ,
+    XSREDP,
+    XSRESP,
+    //XSRSP,
+    XSRSQRTEDP,
+    XSRSQRTESP,
+    XSSUBDP,
+    XSSUBSP,
+    XSCVDPSPN
+)>;
+
+def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C],
+      (instrs
+    VBPERMQ,
+    VCLZLSBB,
+    VCTZLSBB,
+    VEXTRACTD,
+    VEXTRACTUB,
+    VEXTRACTUH,
+    VEXTRACTUW,
+    VEXTUBLX,
+    VEXTUBRX,
+    VEXTUHLX,
+    VEXTUHRX,
+    VEXTUWLX,
+    VEXTUWRX,
+    VGBBD,
+    VINSERTB,
+    VINSERTD,
+    VINSERTH,
+    VINSERTW,
+    VMRGHB,
+    VMRGHH,
+    VMRGHW,
+    VMRGLB,
+    VMRGLH,
+    VMRGLW,
+    VPERM,
+    VPERMR,
+    VPERMXOR,
+    VPKPX,
+    VPKSDSS,
+    VPKSDUS,
+    VPKSHSS,
+    VPKSHUS,
+    VPKSWSS,
+    VPKSWUS,
+    VPKUDUM,
+    VPKUDUS,
+    VPKUHUM,
+    VPKUHUS,
+    VPKUWUM,
+    VPKUWUS,
+    VPRTYBQ,
+    VSL,
+    VSLDOI,
+    VSLO,
+    VSLV,
+    VSPLTB,
+    VSPLTH,
+    VSPLTISB,
+    VSPLTISH,
+    VSPLTISW,
+    VSPLTW,
+    VSR,
+    VSRO,
+    VSRV,
+    VUPKHPX,
+    VUPKHSB,
+    VUPKHSH,
+    VUPKHSW,
+    VUPKLPX,
+    VUPKLSB,
+    VUPKLSH,
+    VUPKLSW,
+    XXBRD,
+    XXBRH,
+    XXBRQ,
+    XXBRW,
+    XXEXTRACTUW,
+    XXINSERTW,
+    XXMRGHW,
+    XXMRGLW,
+    XXPERM,
+    XXPERMR,
+    XXSLDWI,
+    XXSPLTIB,
+    XXSPLTW,
+    VADDCUQ,
+    VADDECUQ,
+    VADDEUQM,
+    VADDUQM,
+    VMUL10CUQ,
+    VMUL10ECUQ,
+    VMUL10EUQ,
+    VMUL10UQ,
+    VSUBCUQ,
+    VSUBECUQ,
+    VSUBEUQM,
+    VSUBUQM,
+    XSCMPEXPQP,
+    XSCMPOQP,
+    XSCMPUQP,
+    XSTSTDCQP,
+    XSXSIGQP
+)>;
+
+def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSADDQP,
+    XSADDQPO,
+    XSCVDPQP,
+    XSCVQPDP,
+    XSCVQPDPO,
+    XSCVQPSDZ,
+    XSCVQPSWZ,
+    XSCVQPUDZ,
+    XSCVQPUWZ,
+    XSCVSDQP,
+    XSCVUDQP,
+    XSRQPI,
+    XSRQPXP,
+    XSSUBQP,
+    XSSUBQPO
+)>;
+
+def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSMADDQP,
+    XSMADDQPO,
+    XSMSUBQP,
+    XSMSUBQPO,
+    XSMULQP,
+    XSMULQPO,
+    XSNMADDQP,
+    XSNMADDQPO,
+    XSNMSUBQP,
+    XSNMSUBQPO
+)>;
+
+def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSDIVQP,
+    XSDIVQPO
+)>;
+
+def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XSSQRTQP,
+    XSSQRTQPO
+)>;
+
+// Load Operation in IIC_LdStLFD
+
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C],
+      (instrs
+    LXSDX,
+    LXVD2X,
+    LXSIWZX,
+    LXV,
+    LXSD
+)>;
+
+def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    LFIWZX,
+    LFDX,
+    LFD
+)>;
+
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    LXSSPX,
+    LXSIWAX,
+    LXSSP
+)>;
+
+def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    LFIWAX,
+    LFSX,
+    LFS
+)>;
+
+def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C],
+      (instrs
+    LXVDSX,
+    LXVW4X
+)>;
+
+// Store Operations in IIC_LdStSTFD.
+
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    STFS,
+    STFD,
+    STFIWX,
+    STFSX,
+    STFDX,
+    STXSDX,
+    STXSSPX,
+    STXSIWX
+)>;
+
+def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C],
+      (instrs
+    STXVD2X,
+    STXVW4X
+)>;
+
+
+// Divide Operations in IIC_IntDivW, IIC_IntDivD.
+
+def : InstRW<[P9_DIV_16C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+      (instrs
+    DIVW,
+    DIVWU
+)>;
+
+def : InstRW<[P9_DIV_24C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+      (instrs
+    DIVWE,
+    DIVD,
+    DIVWEU,
+    DIVDU
+)>;
+
+def : InstRW<[P9_DIV_40C_8, IP_EXECE_1C, DISP_1C, DISP_1C],
+      (instrs
+    DIVDE,
+    DIVDEU
+)>;
+
+def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    DIVWEo,
+    DIVWEUo
+)>;
+
+def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    DIVDEo,
+    DIVDEUo
+)>;
+
+// Rotate Operations in IIC_IntRotateD, IIC_IntRotateDI
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+      (instrs
+    SLD,
+    SRD,
+    SRAD,
+    SRADI,
+    RLDIC
+)>;
+
+def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    RLDCL,
+    RLDCR,
+    RLDIMI,
+    RLDICL,
+    RLDICR,
+    RLDICL_32_64
+)>;
+
+// CR access instructions in _BrMCR, IIC_BrMCRX.
+
+def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    MTOCRF,
+    MTOCRF8,
+    MTCRF,
+    MTCRF8
+)>;
+
+def : InstRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
+      (instrs
+    MCRF,
+    MCRXRX
+)>;
+
+def : InstRW<[P9_ALU_5C, P9_ALU_5C, IP_EXEC_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    MCRFS
+)>;
+
+// FP Div instructions in IIC_FPDivD and IIC_FPDivS.
+
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    FDIV,
+    XSDIVDP
+)>;
+
+def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    FDIVS,
+    XSDIVSP
+)>;
+
+def : InstRW<[P9_DP_24C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XVDIVSP
+)>;
+
+def : InstRW<[P9_DP_33C_8, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C],
+      (instrs
+    XVDIVDP
+)>;
+
+// FP Instructions in IIC_FPGeneral, IIC_FPFused
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    FRSP,
+    FRIND,
+    FRINS,
+    FRIPD,
+    FRIPS,
+    FRIZD,
+    FRIZS,
+    FRIMD,
+    FRIMS,
+    FRE,
+    FRES,
+    FRSQRTE,
+    FRSQRTES,
+    FMADDS,
+    FMADD,
+    FMSUBS,
+    FMSUB,
+    FNMADDS,
+    FNMADD,
+    FNMSUBS,
+    FNMSUB,
+    FSELD,
+    FSELS,
+    FADDS,
+    FMULS,
+    FMUL,
+    FSUBS,
+    FCFID,
+    FCTID,
+    FCTIDZ,
+    FCFIDU,
+    FCFIDS,
+    FCFIDUS,
+    FCTIDUZ,
+    FCTIWUZ,
+    FCTIW,
+    FCTIWZ
+)>;
+
+def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    FMR,
+    FABSD,
+    FABSS,
+    FNABSD,
+    FNABSS,
+    FNEGD,
+    FNEGS,
+    FCPSGND,
+    FCPSGNS
+)>;
+
+def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    FCMPUS,
+    FCMPUD
+)>;
+
+// Load instructions in IIC_LdStLFDU and IIC_LdStLFDUX.
+
+def : InstRW<[P9_LoadAndALUOp_7C, P9_ALU_2C,
+              IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    LFSU,
+    LFSUX
+)>;
+
+def : InstRW<[P9_LS_5C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C,
+              DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+      (instrs
+    LFDU,
+    LFDUX
+)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h
new file mode 100644
index 000000000000..e01f49dce81e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,104 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
+#define LLVM_LIB_TARGET_POWERPC_PPC_H
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+  class PPCTargetMachine;
+  class PassRegistry;
+  class FunctionPass;
+  class ImmutablePass;
+  class MachineInstr;
+  class AsmPrinter;
+  class MCInst;
+
+  FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+#ifndef NDEBUG
+  FunctionPass *createPPCCTRLoopsVerify();
+#endif
+  FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);
+  FunctionPass *createPPCTOCRegDepsPass();
+  FunctionPass *createPPCEarlyReturnPass();
+  FunctionPass *createPPCVSXCopyPass();
+  FunctionPass *createPPCVSXFMAMutatePass();
+  FunctionPass *createPPCVSXSwapRemovalPass();
+  FunctionPass *createPPCMIPeepholePass();
+  FunctionPass *createPPCBranchSelectionPass();
+  FunctionPass *createPPCQPXLoadSplatPass();
+  FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+  FunctionPass *createPPCTLSDynamicCallPass();
+  FunctionPass *createPPCBoolRetToIntPass();
+  void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    AsmPrinter &AP, bool isDarwin);
+
+  void initializePPCVSXFMAMutatePass(PassRegistry&);
+  void initializePPCBoolRetToIntPass(PassRegistry&);
+  extern char &PPCVSXFMAMutateID;
+
+  namespace PPCII {
+    
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // PPC Specific MachineOperand flags.
+    MO_NO_FLAG,
+
+    /// On a symbol operand "FOO", this indicates that the reference is actually
+    /// to "FOO@plt".  This is used for calls and jumps to external functions on
+    /// for PIC calls on Linux and ELF systems.
+    MO_PLT = 1,
+
+    /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
+    /// the function's picbase, e.g. lo16(symbol-picbase).
+    MO_PIC_FLAG = 2,
+
+    /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
+    /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
+    MO_NLP_FLAG = 4,
+
+    /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
+    /// symbol with hidden visibility.  This causes a different kind of
+    /// non-lazy-pointer to be generated.
+    MO_NLP_HIDDEN_FLAG = 8,
+
+    /// The next are not flags but distinct values.
+    MO_ACCESS_MASK = 0xf0,
+
+    /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
+    MO_LO = 1 << 4,
+    MO_HA = 2 << 4,
+
+    MO_TPREL_LO = 4 << 4,
+    MO_TPREL_HA = 3 << 4,
+
+    /// These values identify relocations on immediates folded
+    /// into memory operations.
+    MO_DTPREL_LO = 5 << 4,
+    MO_TLSLD_LO = 6 << 4,
+    MO_TOC_LO = 7 << 4,
+
+    // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
+    MO_TLS = 8 << 4
+  };
+  } // end namespace PPCII
+  
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td
new file mode 100644
index 000000000000..46502208b175
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,468 @@
+//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+
+//===----------------------------------------------------------------------===//
+// CPU Directives                                                             //
+//===----------------------------------------------------------------------===//
+
+def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">;
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+def DirectiveA2  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+                                       "PPC::DIR_E500mc", "">;
+def DirectiveE5500  : SubtargetFeature<"", "DarwinDirective",
+                                       "PPC::DIR_E5500", "">;
+def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
+def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
+def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
+def DirectivePwr5x
+    : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
+def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr6x
+    : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
+def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
+def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">;
+def DirectivePwr9: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR9", "">;
+
+def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
+                                        "Enable 64-bit instructions">;
+def FeatureHardFloat : SubtargetFeature<"hard-float", "HasHardFloat", "true",
+                              "Enable floating-point instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+                              "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureCRBits    : SubtargetFeature<"crbits", "UseCRBits", "true",
+                              "Use condition-register bits individually">;
+def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
+                                        "Enable Altivec instructions",
+                                        [FeatureHardFloat]>;
+def FeatureSPE       : SubtargetFeature<"spe","HasSPE", "true",
+                                        "Enable SPE instructions",
+                                        [FeatureHardFloat]>;
+def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
+                                        "Enable the MFOCRF instruction">;
+def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+                                        "Enable the fsqrt instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFCPSGN    : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+                                        "Enable the fcpsgn instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
+                                        "Enable the fre instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
+                                        "Enable the fres instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+                                        "Enable the frsqrte instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+                                        "Enable the frsqrtes instruction",
+                                        [FeatureHardFloat]>;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+                              "Assume higher precision reciprocal estimates">;
+def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+                                        "Enable the stfiwx instruction",
+                                        [FeatureHardFloat]>;
+def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
+                                        "Enable the lfiwax instruction",
+                                        [FeatureHardFloat]>;
+def FeatureFPRND     : SubtargetFeature<"fprnd", "HasFPRND", "true",
+                                        "Enable the fri[mnpz] instructions",
+                                        [FeatureHardFloat]>;
+def FeatureFPCVT     : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
+  "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
+                                        [FeatureHardFloat]>;
+def FeatureISEL      : SubtargetFeature<"isel","HasISEL", "true",
+                                        "Enable the isel instruction">;
+def FeatureBPERMD    : SubtargetFeature<"bpermd", "HasBPERMD", "true",
+                                        "Enable the bpermd instruction">;
+def FeatureExtDiv    : SubtargetFeature<"extdiv", "HasExtDiv", "true",
+                                        "Enable extended divide instructions">;
+def FeatureLDBRX     : SubtargetFeature<"ldbrx","HasLDBRX", "true",
+                                        "Enable the ldbrx instruction">;
+def FeatureCMPB      : SubtargetFeature<"cmpb", "HasCMPB", "true",
+                                        "Enable the cmpb instruction">;
+def FeatureICBT      : SubtargetFeature<"icbt","HasICBT", "true",
+                                        "Enable icbt instruction">;
+def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
+                                        "Enable Book E instructions",
+                                        [FeatureICBT]>;
+def FeatureMSYNC     : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
+                              "Has only the msync instruction instead of sync",
+                              [FeatureBookE]>;
+def FeatureE500      : SubtargetFeature<"e500", "IsE500", "true",
+                                        "Enable E500/E500mc instructions">;
+def FeaturePPC4xx    : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
+                                        "Enable PPC 4xx instructions">;
+def FeaturePPC6xx    : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
+                                        "Enable PPC 6xx instructions">;
+def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
+                                        "Enable QPX instructions",
+                                        [FeatureHardFloat]>;
+def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
+                                        "Enable VSX instructions",
+                                        [FeatureAltivec]>;
+def FeatureP8Altivec : SubtargetFeature<"power8-altivec", "HasP8Altivec", "true",
+                                        "Enable POWER8 Altivec instructions",
+                                        [FeatureAltivec]>;
+def FeatureP8Crypto : SubtargetFeature<"crypto", "HasP8Crypto", "true",
+                                       "Enable POWER8 Crypto instructions",
+                                       [FeatureP8Altivec]>;
+def FeatureP8Vector  : SubtargetFeature<"power8-vector", "HasP8Vector", "true",
+                                        "Enable POWER8 vector instructions",
+                                        [FeatureVSX, FeatureP8Altivec]>;
+def FeatureDirectMove :
+  SubtargetFeature<"direct-move", "HasDirectMove", "true",
+                   "Enable Power8 direct move instructions",
+                   [FeatureVSX]>;
+def FeaturePartwordAtomic : SubtargetFeature<"partword-atomics",
+                                             "HasPartwordAtomics", "true",
+                                             "Enable l[bh]arx and st[bh]cx.">;
+def FeatureInvariantFunctionDescriptors :
+  SubtargetFeature<"invariant-function-descriptors",
+                   "HasInvariantFunctionDescriptors", "true",
+                   "Assume function descriptors are invariant">;
+def FeatureLongCall : SubtargetFeature<"longcall", "UseLongCalls", "true",
+                                       "Always use indirect calls">;
+def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
+                                  "Enable Hardware Transactional Memory instructions">;
+def FeatureMFTB   : SubtargetFeature<"", "FeatureMFTB", "true",
+                                        "Implement mftb using the mfspr instruction">;
+def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
+                                     "Target supports add/load integer fusion.">;
+def FeatureFloat128 :
+  SubtargetFeature<"float128", "HasFloat128", "true",
+                   "Enable the __float128 data type for IEEE-754R Binary128.",
+                   [FeatureVSX]>;
+def FeaturePOPCNTD   : SubtargetFeature<"popcntd","HasPOPCNTD",
+                                        "POPCNTD_Fast",
+                                        "Enable the popcnt[dw] instructions">;
+// Note that for the a2/a2q processor models we should not use popcnt[dw] by
+// default. These processors do support the instructions, but they're
+// microcoded, and the software emulation is about twice as fast.
+def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD",
+                                          "POPCNTD_Slow",
+                                          "Has slow popcnt[dw] instructions">;
+
+def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
+  "Treat vector data stream cache control instructions as deprecated">;
+
+def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
+                                     "true",
+                                     "Enable instructions added in ISA 3.0.">;
+def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
+                                        "Enable POWER9 Altivec instructions",
+                                        [FeatureISA3_0, FeatureP8Altivec]>;
+def FeatureP9Vector  : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
+                                        "Enable POWER9 vector instructions",
+                                        [FeatureISA3_0, FeatureP8Vector,
+                                         FeatureP9Altivec]>;
+
+// Since new processors generally contain a superset of features of those that
+// came before them, the idea is to make implementations of new processors
+// less error prone and easier to read.
+// Namely:
+//     list<SubtargetFeature> Power8FeatureList = ...
+//     list<SubtargetFeature> FutureProcessorSpecificFeatureList =
+//         [ features that Power8 does not support ]
+//     list<SubtargetFeature> FutureProcessorFeatureList =
+//         !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList)
+
+// Makes it explicit and obvious what is new in FutureProcesor vs. Power8 as
+// well as providing a single point of definition if the feature set will be
+// used elsewhere.
+def ProcessorFeatures {
+  list<SubtargetFeature> Power7FeatureList =
+      [DirectivePwr7, FeatureAltivec, FeatureVSX,
+       FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+       FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+       FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+       FeatureFPRND, FeatureFPCVT, FeatureISEL,
+       FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX,
+       Feature64Bit /*, Feature64BitRegs */,
+       FeatureBPERMD, FeatureExtDiv,
+       FeatureMFTB, DeprecatedDST];
+  list<SubtargetFeature> Power8SpecificFeatures =
+      [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto,
+       FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic,
+       FeatureFusion];
+  list<SubtargetFeature> Power8FeatureList =
+      !listconcat(Power7FeatureList, Power8SpecificFeatures);
+  list<SubtargetFeature> Power9SpecificFeatures =
+      [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
+  list<SubtargetFeature> Power9FeatureList =
+      !listconcat(Power8FeatureList, Power9SpecificFeatures);
+}
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// DFP          p6, p6x, p7        decimal floating-point instructions
+// POPCNTB      p5 through p7      popcntb and related instructions
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// RecFormRel - Filter class used to relate non-record-form instructions with
+// their record-form variants.
+class RecFormRel;
+
+// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX
+// FMA instruction forms with their corresponding factor-killing forms.
+class AltVSXFMARel {
+  bit IsVSXFMAAlt = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Relation Map Definitions.
+//===----------------------------------------------------------------------===//
+
+def getRecordFormOpcode : InstrMapping {
+  let FilterClass = "RecFormRel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName", "Interpretation64Bit"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["RC"];
+  // The key column are the non-record-form instructions.
+  let KeyCol = ["0"];
+  // Value columns RC=1
+  let ValueCols = [["1"]];
+}
+
+def getNonRecordFormOpcode : InstrMapping {
+  let FilterClass = "RecFormRel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName", "Interpretation64Bit"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["RC"];
+  // The key column are the record-form instructions.
+  let KeyCol = ["1"];
+  // Value columns are RC=0
+  let ValueCols = [["0"]];
+}
+
+def getAltVSXFMAOpcode : InstrMapping {
+  let FilterClass = "AltVSXFMARel";
+  // Instructions with the same BaseName and Interpretation64Bit values
+  // form a row.
+  let RowFields = ["BaseName"];
+  // Instructions with the same RC value form a column.
+  let ColFields = ["IsVSXFMAAlt"];
+  // The key column are the (default) addend-killing instructions.
+  let KeyCol = ["0"];
+  // Value columns IsVSXFMAAlt=1
+  let ValueCols = [["1"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32, FeatureHardFloat,
+                                           FeatureMFTB]>;
+def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
+                                          FeatureFRES, FeatureFRSQRTE,
+                                          FeatureICBT, FeatureBookE, 
+                                          FeatureMSYNC, FeatureMFTB]>;
+def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
+                                          FeatureFRES, FeatureFRSQRTE,
+                                          FeatureICBT, FeatureBookE, 
+                                          FeatureMSYNC, FeatureMFTB]>;
+def : Processor<"601", G3Itineraries, [Directive601, FeatureHardFloat]>;
+def : Processor<"602", G3Itineraries, [Directive602, FeatureHardFloat,
+                                       FeatureMFTB]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+                                       FeatureFRES, FeatureFRSQRTE,
+                                       FeatureMFTB]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+                                        FeatureFRES, FeatureFRSQRTE,
+                                        FeatureMFTB]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+                                         FeatureFRES, FeatureFRSQRTE,
+                                         FeatureMFTB]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+                                       FeatureFRES, FeatureFRSQRTE,
+                                       FeatureMFTB]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+                                        FeatureFRES, FeatureFRSQRTE,
+                                        FeatureMFTB]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+                                       FeatureFRES, FeatureFRSQRTE,
+                                       FeatureMFTB]>;
+def : Processor<"750", G4Itineraries, [Directive750,
+                                       FeatureFRES, FeatureFRSQRTE,
+                                       FeatureMFTB]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+                                      FeatureFRES, FeatureFRSQRTE,
+                                      FeatureMFTB]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+                                        FeatureFRES, FeatureFRSQRTE,
+                                        FeatureMFTB]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+                                      FeatureFRES, FeatureFRSQRTE,
+                                      FeatureMFTB]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                            FeatureFRES, FeatureFRSQRTE,
+                                            FeatureMFTB]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                           FeatureFRES, FeatureFRSQRTE, 
+                                           FeatureMFTB]>;
+
+def : ProcessorModel<"970", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt,
+                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   FeatureMFTB]>;
+def : ProcessorModel<"g5", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureFRES, FeatureFRSQRTE,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+                  [DirectiveE500mc,
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureISEL, FeatureMFTB]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+                  [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+                   FeatureSTFIWX, FeatureICBT, FeatureBookE, 
+                   FeatureISEL, FeatureMFTB]>;
+def : ProcessorModel<"a2", PPCA2Model,
+                  [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+                   Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>;
+def : ProcessorModel<"a2q", PPCA2Model,
+                  [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX,
+                   Feature64Bit /*, Feature64BitRegs */, FeatureQPX,
+                   FeatureMFTB]>;
+def : ProcessorModel<"pwr3", G5Model,
+                  [DirectivePwr3, FeatureAltivec,
+                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr4", G5Model,
+                  [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+                   FeatureSTFIWX, Feature64Bit, FeatureMFTB]>;
+def : ProcessorModel<"pwr5", G5Model,
+                  [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, Feature64Bit,
+                   FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr5x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+                   FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6", G5Model,
+                  [DirectivePwr6, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+                   FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr6x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB,
+                   FeatureFPRND, Feature64Bit,
+                   FeatureMFTB, DeprecatedDST]>;
+def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>;
+def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>;
+def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; 
+def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat,
+                                       FeatureMFTB]>;
+def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat,
+                                         FeatureMFTB]>;
+def : ProcessorModel<"ppc64", G5Model,
+                  [Directive64, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+                   FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */,
+                   FeatureMFTB]>;
+def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+  let isLittleEndianEncoding = 1;
+
+  // FIXME: Unset this when no longer needed!
+  let decodePositionallyEncodedOperands = 1;
+
+  let noNamedPositionallyEncodedOperands = 1;
+}
+
+def PPCAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+def PPCAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // We do not use hard coded registers in asm strings.  However, some
+  // InstAlias definitions use immediate literals.  Set RegisterPrefix
+  // so that those are not misinterpreted as registers.
+  string RegisterPrefix = "%";
+  string BreakCharacters = ".";
+}
+
+def PPC : Target {
+  // Information about the instructions.
+  let InstructionSet = PPCInstrInfo;
+
+  let AssemblyParsers = [PPCAsmParser];
+  let AssemblyParserVariants = [PPCAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
new file mode 100644
index 000000000000..f0e0ebc4946c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -0,0 +1,1471 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asmprinter"
+
+namespace {
+
+class PPCAsmPrinter : public AsmPrinter {
+protected:
+  MapVector<MCSymbol *, MCSymbol *> TOC;
+  const PPCSubtarget *Subtarget;
+  StackMaps SM;
+
+public:
+  explicit PPCAsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
+
+  StringRef getPassName() const override { return "PowerPC Assembly Printer"; }
+
+  MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+
+  bool doInitialization(Module &M) override {
+    if (!TOC.empty())
+      TOC.clear();
+    return AsmPrinter::doInitialization(M);
+  }
+
+    void EmitInstruction(const MachineInstr *MI) override;
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+
+    void EmitEndOfAsmFile(Module &M) override;
+
+    void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
+    void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
+    void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      Subtarget = &MF.getSubtarget<PPCSubtarget>();
+      return AsmPrinter::runOnMachineFunction(MF);
+    }
+  };
+
+  /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+  class PPCLinuxAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCLinuxAsmPrinter(TargetMachine &TM,
+                                std::unique_ptr<MCStreamer> Streamer)
+        : PPCAsmPrinter(TM, std::move(Streamer)) {}
+
+    StringRef getPassName() const override {
+      return "Linux PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
+
+    void EmitFunctionEntryLabel() override;
+
+    void EmitFunctionBodyStart() override;
+    void EmitFunctionBodyEnd() override;
+  };
+
+  /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+  /// OS X
+  class PPCDarwinAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCDarwinAsmPrinter(TargetMachine &TM,
+                                 std::unique_ptr<MCStreamer> Streamer)
+        : PPCAsmPrinter(TM, std::move(Streamer)) {}
+
+    StringRef getPassName() const override {
+      return "Darwin PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
+  };
+
+} // end anonymous namespace
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  switch (RegName[0]) {
+    case 'r':
+    case 'f':
+    case 'q': // for QPX
+    case 'v':
+      if (RegName[1] == 's')
+        return RegName + 2;
+      return RegName + 1;
+    case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+
+  return RegName;
+}
+
+void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+
+    // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+    // as well as those that use VMX register numbering (v0 - v31 which
+    // correspond to vs32 - vs63). If we have an instruction that uses VSX
+    // numbering, we need to convert the VMX registers to VSX registers.
+    // Namely, we print 32-63 when the instruction operates on one of the
+    // VMX registers.
+    // (Please synchronize with PPCInstPrinter::printOperand)
+    if (MI->getDesc().TSFlags & PPCII::UseVSXReg) {
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::V0);
+      else if (PPCInstrInfo::isVFRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::VF0);
+    }
+    const char *RegName = PPCInstPrinter::getRegisterName(Reg);
+
+    // Linux assembler (Others?) does not take register mnemonics.
+    // FIXME - What about special registers used in mfspr/mtspr?
+    if (!Subtarget->isDarwin())
+      RegName = stripRegisterPrefix(RegName);
+    O << RegName;
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+    return;
+  case MachineOperand::MO_BlockAddress:
+    GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    // Computing the address of a global symbol, not calling it.
+    const GlobalValue *GV = MO.getGlobal();
+    MCSymbol *SymToPrint;
+
+    // External or weakly linked global variables need non-lazily-resolved stubs
+    if (Subtarget->hasLazyResolverStub(GV)) {
+      SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+          MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
+              SymToPrint);
+      if (!StubSym.getPointer())
+        StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
+                                                     !GV->hasInternalLinkage());
+    } else {
+      SymToPrint = getSymbol(GV);
+    }
+
+    SymToPrint->print(O, MAI);
+
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+
+  default:
+    O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
+    return;
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'c': // Don't print "$" before a global var name or constant.
+      break; // PPC never has a prefix.
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        O << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                          unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'y': // A memory reference for an X-form instruction
+      {
+        const char *RegName = "r0";
+        if (!Subtarget->isDarwin())
+          RegName = stripRegisterPrefix(RegName);
+        O << RegName << ", ";
+        printOperand(MI, OpNo, O);
+        return false;
+      }
+    case 'U': // Print 'u' for update form.
+    case 'X': // Print 'x' for indexed form.
+    {
+      // FIXME: Currently for PowerPC memory operands are always loaded
+      // into a register, so we never get an update or indexed form.
+      // This is bad even for offset forms, since even if we know we
+      // have a value in -16(r1), we will generate a load into r<n>
+      // and then load from 0(r<n>).  Until that issue is fixed,
+      // tolerate 'U' and 'X' but don't output anything.
+      assert(MI->getOperand(OpNo).isReg());
+      return false;
+    }
+    }
+  }
+
+  assert(MI->getOperand(OpNo).isReg());
+  O << "0(";
+  printOperand(MI, OpNo, O);
+  O << ")";
+  return false;
+}
+
+/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
+/// exists for it.  If not, create one.  Then return a symbol that references
+/// the TOC entry.
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+  MCSymbol *&TOCEntry = TOC[Sym];
+  if (!TOCEntry)
+    TOCEntry = createTempSymbol("C");
+  return TOCEntry;
+}
+
+void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  SM.serializeToStackMapSection();
+}
+
+void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+  SM.recordStackMap(MI);
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+  // Scan ahead to trim the shadow.
+  const MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::const_iterator MII(MI);
+  ++MII;
+  while (NumNOPBytes > 0) {
+    if (MII == MBB.end() || MII->isCall() ||
+        MII->getOpcode() == PPC::DBG_VALUE ||
+        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+        MII->getOpcode() == TargetOpcode::STACKMAP)
+      break;
+    ++MII;
+    NumNOPBytes -= 4;
+  }
+
+  // Emit nops.
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+  PatchPointOpers Opers(&MI);
+
+  unsigned EncodedBytes = 0;
+  const MachineOperand &CalleeMO = Opers.getCallTarget();
+
+  if (CalleeMO.isImm()) {
+    int64_t CallTarget = CalleeMO.getImm();
+    if (CallTarget) {
+      assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+             "High 16 bits of call target should be zero.");
+      unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+      EncodedBytes = 0;
+      // Materialize the jump address:
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8)
+                                      .addReg(ScratchReg)
+                                      .addImm((CallTarget >> 32) & 0xFFFF));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC)
+                                      .addReg(ScratchReg)
+                                      .addReg(ScratchReg)
+                                      .addImm(32).addImm(16));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8)
+                                      .addReg(ScratchReg)
+                                      .addReg(ScratchReg)
+                                      .addImm((CallTarget >> 16) & 0xFFFF));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8)
+                                      .addReg(ScratchReg)
+                                      .addReg(ScratchReg)
+                                      .addImm(CallTarget & 0xFFFF));
+
+      // Save the current TOC pointer before the remote call.
+      int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
+                                      .addReg(PPC::X2)
+                                      .addImm(TOCSaveOffset)
+                                      .addReg(PPC::X1));
+      ++EncodedBytes;
+
+      // If we're on ELFv1, then we need to load the actual function pointer
+      // from the function descriptor.
+      if (!Subtarget->isELFv2ABI()) {
+        // Load the new TOC pointer and the function address, but not r11
+        // (needing this is rare, and loading it here would prevent passing it
+        // via a 'nest' parameter.
+        EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                        .addReg(PPC::X2)
+                                        .addImm(8)
+                                        .addReg(ScratchReg));
+        ++EncodedBytes;
+        EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                        .addReg(ScratchReg)
+                                        .addImm(0)
+                                        .addReg(ScratchReg));
+        ++EncodedBytes;
+      }
+
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8)
+                                      .addReg(ScratchReg));
+      ++EncodedBytes;
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8));
+      ++EncodedBytes;
+
+      // Restore the TOC pointer after the call.
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                      .addReg(PPC::X2)
+                                      .addImm(TOCSaveOffset)
+                                      .addReg(PPC::X1));
+      ++EncodedBytes;
+    }
+  } else if (CalleeMO.isGlobal()) {
+    const GlobalValue *GValue = CalleeMO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext);
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP)
+                                    .addExpr(SymVar));
+    EncodedBytes += 2;
+  }
+
+  // Each instruction is 4 bytes.
+  EncodedBytes *= 4;
+
+  // Emit padding.
+  unsigned NumBytes = Opers.getNumPatchBytes();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+}
+
+/// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a
+/// call to __tls_get_addr to the current output stream.
+void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
+                                MCSymbolRefExpr::VariantKind VK) {
+  StringRef Name = "__tls_get_addr";
+  MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name);
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+
+  assert(MI->getOperand(0).isReg() &&
+         ((Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::X3) ||
+          (!Subtarget->isPPC64() && MI->getOperand(0).getReg() == PPC::R3)) &&
+         "GETtls[ld]ADDR[32] must define GPR3");
+  assert(MI->getOperand(1).isReg() &&
+         ((Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::X3) ||
+          (!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) &&
+         "GETtls[ld]ADDR[32] must read GPR3");
+
+  if (!Subtarget->isPPC64() && !Subtarget->isDarwin() &&
+      isPositionIndependent())
+    Kind = MCSymbolRefExpr::VK_PLT;
+  const MCSymbolRefExpr *TlsRef =
+    MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
+  const MachineOperand &MO = MI->getOperand(2);
+  const GlobalValue *GValue = MO.getGlobal();
+  MCSymbol *MOSymbol = getSymbol(GValue);
+  const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
+  EmitToStreamer(*OutStreamer,
+                 MCInstBuilder(Subtarget->isPPC64() ?
+                               PPC::BL8_NOP_TLS : PPC::BL_TLS)
+                 .addExpr(TlsRef)
+                 .addExpr(SymVar));
+}
+
+/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
+/// the current output stream.
+///
+void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst TmpInst;
+  bool isPPC64 = Subtarget->isPPC64();
+  bool isDarwin = TM.getTargetTriple().isOSDarwin();
+  const Module *M = MF->getFunction()->getParent();
+  PICLevel::Level PL = M->getPICLevel();
+
+  // Lower multi-instruction pseudo operations.
+  switch (MI->getOpcode()) {
+  default: break;
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(SM, *MI);
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(SM, *MI);
+
+  case PPC::MoveGOTtoLR: {
+    // Transform %LR = MoveGOTtoLR
+    // Into this: bl _GLOBAL_OFFSET_TABLE_@local-4
+    // _GLOBAL_OFFSET_TABLE_@local-4 (instruction preceding
+    // _GLOBAL_OFFSET_TABLE_) has exactly one instruction:
+    //      blrl
+    // This will return the pointer to _GLOBAL_OFFSET_TABLE_@local
+    MCSymbol *GOTSymbol =
+      OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    const MCExpr *OffsExpr =
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol,
+                                                      MCSymbolRefExpr::VK_PPC_LOCAL,
+                                                      OutContext),
+                              MCConstantExpr::create(4, OutContext),
+                              OutContext);
+
+    // Emit the 'bl'.
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL).addExpr(OffsExpr));
+    return;
+  }
+  case PPC::MovePCtoLR:
+  case PPC::MovePCtoLR8: {
+    // Transform %LR = MovePCtoLR
+    // Into this, where the label is the PIC base:
+    //     bl L1$pb
+    // L1$pb:
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+
+    // Emit the 'bl'.
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(PPC::BL)
+                       // FIXME: We would like an efficient form for this, so we
+                       // don't have to do a lot of extra uniquing.
+                       .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+    // Emit the label.
+    OutStreamer->EmitLabel(PICBase);
+    return;
+  }
+  case PPC::UpdateGBR: {
+    // Transform %Rd = UpdateGBR(%Rt, %Ri)
+    // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri)
+    //       add %Rd, %Rt, %Ri
+    // Get the offset from the GOT Base Register to the GOT
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+    MCSymbol *PICOffset =
+      MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+    TmpInst.setOpcode(PPC::LWZ);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+    const MCExpr *PB =
+      MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
+                              MCSymbolRefExpr::VK_None,
+                              OutContext);
+    const MCOperand TR = TmpInst.getOperand(1);
+    const MCOperand PICR = TmpInst.getOperand(0);
+
+    // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
+    TmpInst.getOperand(1) =
+        MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
+    TmpInst.getOperand(0) = TR;
+    TmpInst.getOperand(2) = PICR;
+    EmitToStreamer(*OutStreamer, TmpInst);
+
+    TmpInst.setOpcode(PPC::ADD4);
+    TmpInst.getOperand(0) = PICR;
+    TmpInst.getOperand(1) = TR;
+    TmpInst.getOperand(2) = PICR;
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LWZtoc: {
+    // Transform %R3 = LWZtoc <ga:@min1>, %R2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LWZ, and the global address operand to be a
+    // reference to the GOT entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LWZ);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+    else if (MO.isBlockAddress())
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+
+    if (PL == PICLevel::SmallPIC) {
+      const MCExpr *Exp =
+        MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
+                                OutContext);
+      TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+    } else {
+      MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+      const MCExpr *Exp =
+        MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None,
+                                OutContext);
+      const MCExpr *PB =
+        MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
+                                                             OutContext);
+      Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
+      TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+    }
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LDtocJTI:
+  case PPC::LDtocCPT:
+  case PPC::LDtocBA:
+  case PPC::LDtoc: {
+    // Transform %X3 = LDtoc <ga:@min1>, %X2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LD, and the global address operand to be a
+    // reference to the TOC entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+    else if (MO.isBlockAddress())
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+
+    MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+
+  case PPC::ADDIStocHA: {
+    // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to ADDIS8.  If the global address is external, has
+    // common linkage, is a non-local function address, or is a jump table
+    // address, then generate a TOC entry and reference that.  Otherwise
+    // reference the symbol directly.
+    TmpInst.setOpcode(PPC::ADDIS8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+            MO.isBlockAddress()) &&
+           "Invalid operand for ADDIStocHA!");
+    MCSymbol *MOSymbol = nullptr;
+    bool GlobalToc = false;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+      GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG);
+    } else if (MO.isCPI()) {
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    } else if (MO.isJTI()) {
+      MOSymbol = GetJTISymbol(MO.getIndex());
+    } else if (MO.isBlockAddress()) {
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+    }
+
+    if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
+        TM.getCodeModel() == CodeModel::Large)
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
+                              OutContext);
+
+    if (!MO.isJTI() && MO.getOffset())
+      Exp = MCBinaryExpr::createAdd(Exp,
+                                    MCConstantExpr::create(MO.getOffset(),
+                                                           OutContext),
+                                    OutContext);
+
+    TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LDtocL: {
+    // Transform %Xd = LDtocL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LD.  If the global address is external, has
+    // common linkage, or is a jump table address, then reference the
+    // associated TOC entry.  Otherwise reference the symbol directly.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+            MO.isBlockAddress()) &&
+           "Invalid operand for LDtocL!");
+    MCSymbol *MOSymbol = nullptr;
+
+    if (MO.isJTI())
+      MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
+    else if (MO.isBlockAddress()) {
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+    else if (MO.isCPI()) {
+      MOSymbol = GetCPISymbol(MO.getIndex());
+      if (TM.getCodeModel() == CodeModel::Large)
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+    else if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      DEBUG(
+        unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+        assert((GVFlags & PPCII::MO_NLP_FLAG) &&
+               "LDtocL used on symbol that could be accessed directly is "
+               "invalid. Must match ADDIStocHA."));
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::ADDItocL: {
+    // Transform %Xd = ADDItocL %Xs, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to ADDI8.  If the global address is external, then
+    // generate a TOC entry and reference that.  Otherwise reference the
+    // symbol directly.
+    TmpInst.setOpcode(PPC::ADDI8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
+    MCSymbol *MOSymbol = nullptr;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      DEBUG(
+        unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+        assert (
+            !(GVFlags & PPCII::MO_NLP_FLAG) &&
+            "Interposable definitions must use indirect access."));
+      MOSymbol = getSymbol(GV);
+    } else if (MO.isCPI()) {
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    }
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+                              OutContext);
+    TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::ADDISgotTprelHA: {
+    // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTprel =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
+                              OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addReg(MI->getOperand(1).getReg())
+                                 .addExpr(SymGotTprel));
+    return;
+  }
+  case PPC::LDgotTprelL:
+  case PPC::LDgotTprelL32: {
+    // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LD.
+    TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
+    const MachineOperand &MO = MI->getOperand(1);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    return;
+  }
+
+  case PPC::PPC32PICGOT: {
+    MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    MCSymbol *GOTRef = OutContext.createTempSymbol();
+    MCSymbol *NextInstr = OutContext.createTempSymbol();
+
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
+      // FIXME: We would like an efficient form for this, so we don't have to do
+      // a lot of extra uniquing.
+      .addExpr(MCSymbolRefExpr::create(NextInstr, OutContext)));
+    const MCExpr *OffsExpr =
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext),
+                                MCSymbolRefExpr::create(GOTRef, OutContext),
+        OutContext);
+    OutStreamer->EmitLabel(GOTRef);
+    OutStreamer->EmitValue(OffsExpr, 4);
+    OutStreamer->EmitLabel(NextInstr);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR)
+                                 .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LWZ)
+                                 .addReg(MI->getOperand(1).getReg())
+                                 .addImm(0)
+                                 .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD4)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addReg(MI->getOperand(1).getReg())
+                                 .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+  case PPC::PPC32GOT: {
+    MCSymbol *GOTSymbol =
+        OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    const MCExpr *SymGotTlsL = MCSymbolRefExpr::create(
+        GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext);
+    const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create(
+        GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addExpr(SymGotTlsL));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addExpr(SymGotTlsHA));
+    return;
+  }
+  case PPC::ADDIStlsgdHA: {
+    // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsGD =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
+                              OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addReg(MI->getOperand(1).getReg())
+                                 .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::ADDItlsgdL:
+    // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsgd@l
+  case PPC::ADDItlsgdL32: {
+    // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsgd
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create(
+        MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
+                                       : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
+        OutContext);
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::GETtlsADDR:
+    // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+    // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd)
+  case PPC::GETtlsADDR32: {
+    // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym>
+    // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
+    EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSGD);
+    return;
+  }
+  case PPC::ADDIStlsldHA: {
+    // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsld@ha
+    assert(Subtarget->isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsLD =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
+                              OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
+                                 .addReg(MI->getOperand(0).getReg())
+                                 .addReg(MI->getOperand(1).getReg())
+                                 .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::ADDItlsldL:
+    // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsld@l
+  case PPC::ADDItlsldL32: {
+    // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsld
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create(
+        MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
+                                       : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
+        OutContext);
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                       .addReg(MI->getOperand(0).getReg())
+                       .addReg(MI->getOperand(1).getReg())
+                       .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::GETtlsldADDR:
+    // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+    // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld)
+  case PPC::GETtlsldADDR32: {
+    // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym>
+    // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
+    EmitTlsCall(MI, MCSymbolRefExpr::VK_PPC_TLSLD);
+    return;
+  }
+  case PPC::ADDISdtprelHA:
+    // Transform: %Xd = ADDISdtprelHA %Xs, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %Xs, sym@dtprel@ha
+  case PPC::ADDISdtprelHA32: {
+    // Transform: %Rd = ADDISdtprelHA32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDIS %Rs, sym@dtprel@ha
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
+                              OutContext);
+    EmitToStreamer(
+        *OutStreamer,
+        MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+            .addReg(MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::ADDIdtprelL:
+    // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@dtprel@l
+  case PPC::ADDIdtprelL32: {
+    // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@dtprel@l
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
+                              OutContext);
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                       .addReg(MI->getOperand(0).getReg())
+                       .addReg(MI->getOperand(1).getReg())
+                       .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::MFOCRF:
+  case PPC::MFOCRF8:
+    if (!Subtarget->hasMFOCRF()) {
+      // Transform: %R3 = MFOCRF %CR7
+      // Into:      %R3 = MFCR   ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
+      OutStreamer->AddComment(PPCInstPrinter::
+                              getRegisterName(MI->getOperand(1).getReg()));
+      EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
+                                  .addReg(MI->getOperand(0).getReg()));
+      return;
+    }
+    break;
+  case PPC::MTOCRF:
+  case PPC::MTOCRF8:
+    if (!Subtarget->hasMFOCRF()) {
+      // Transform: %CR7 = MTOCRF %R3
+      // Into:      MTCRF mask, %R3 ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
+      unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
+                              ->getEncodingValue(MI->getOperand(0).getReg());
+      OutStreamer->AddComment(PPCInstPrinter::
+                              getRegisterName(MI->getOperand(0).getReg()));
+      EmitToStreamer(*OutStreamer, MCInstBuilder(NewOpcode)
+                                     .addImm(Mask)
+                                     .addReg(MI->getOperand(1).getReg()));
+      return;
+    }
+    break;
+  case PPC::LD:
+  case PPC::STD:
+  case PPC::LWA_32:
+  case PPC::LWA: {
+    // Verify alignment is legal, so we don't create relocations
+    // that can't be supported.
+    // FIXME:  This test is currently disabled for Darwin.  The test
+    // suite shows a handful of test cases that fail this check for
+    // Darwin.  Those need to be investigated before this sanity test
+    // can be enabled for those subtargets.
+    if (!Subtarget->isDarwin()) {
+      unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+        llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+    }
+    // Now process the instruction normally.
+    break;
+  }
+  }
+
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+    if (TS)
+      TS->emitAbiVersion(2);
+  }
+
+  if (static_cast<const PPCTargetMachine &>(TM).isPPC64() ||
+      !isPositionIndependent())
+    return AsmPrinter::EmitStartOfAsmFile(M);
+
+  if (M.getPICLevel() == PICLevel::SmallPIC)
+    return AsmPrinter::EmitStartOfAsmFile(M);
+
+  OutStreamer->SwitchSection(OutContext.getELFSection(
+      ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC));
+
+  MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC"));
+  MCSymbol *CurrentPos = OutContext.createTempSymbol();
+
+  OutStreamer->EmitLabel(CurrentPos);
+
+  // The GOT pointer points to the middle of the GOT, in order to reference the
+  // entire 64kB range.  0x8000 is the midpoint.
+  const MCExpr *tocExpr =
+    MCBinaryExpr::createAdd(MCSymbolRefExpr::create(CurrentPos, OutContext),
+                            MCConstantExpr::create(0x8000, OutContext),
+                            OutContext);
+
+  OutStreamer->EmitAssignment(TOCSym, tocExpr);
+
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+}
+
+void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
+  // linux/ppc32 - Normal entry label.
+  if (!Subtarget->isPPC64() &&
+      (!isPositionIndependent() ||
+       MF->getFunction()->getParent()->getPICLevel() == PICLevel::SmallPIC))
+    return AsmPrinter::EmitFunctionEntryLabel();
+
+  if (!Subtarget->isPPC64()) {
+    const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+    if (PPCFI->usesPICBase()) {
+      MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
+      MCSymbol *PICBase = MF->getPICBaseSymbol();
+      OutStreamer->EmitLabel(RelocSymbol);
+
+      const MCExpr *OffsExpr =
+        MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
+                                                               OutContext),
+                                  MCSymbolRefExpr::create(PICBase, OutContext),
+          OutContext);
+      OutStreamer->EmitValue(OffsExpr, 4);
+      OutStreamer->EmitLabel(CurrentFnSym);
+      return;
+    } else
+      return AsmPrinter::EmitFunctionEntryLabel();
+  }
+
+  // ELFv2 ABI - Normal entry label.
+  if (Subtarget->isELFv2ABI()) {
+    // In the Large code model, we allow arbitrary displacements between
+    // the text section and its associated TOC section.  We place the
+    // full 8-byte offset to the TOC in memory immediatedly preceding
+    // the function global entry point.
+    if (TM.getCodeModel() == CodeModel::Large
+        && !MF->getRegInfo().use_empty(PPC::X2)) {
+      const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+
+      MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+      MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol();
+      const MCExpr *TOCDeltaExpr =
+        MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
+                                MCSymbolRefExpr::create(GlobalEPSymbol,
+                                                        OutContext),
+                                OutContext);
+
+      OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol());
+      OutStreamer->EmitValue(TOCDeltaExpr, 8);
+    }
+    return AsmPrinter::EmitFunctionEntryLabel();
+  }
+
+  // Emit an official procedure descriptor.
+  MCSectionSubPair Current = OutStreamer->getCurrentSection();
+  MCSectionELF *Section = OutStreamer->getContext().getELFSection(
+      ".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  OutStreamer->SwitchSection(Section);
+  OutStreamer->EmitLabel(CurrentFnSym);
+  OutStreamer->EmitValueToAlignment(8);
+  MCSymbol *Symbol1 = CurrentFnSymForSize;
+  // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
+  // entry point.
+  OutStreamer->EmitValue(MCSymbolRefExpr::create(Symbol1, OutContext),
+                         8 /*size*/);
+  MCSymbol *Symbol2 = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+  // Generates a R_PPC64_TOC relocation for TOC base insertion.
+  OutStreamer->EmitValue(
+    MCSymbolRefExpr::create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
+    8/*size*/);
+  // Emit a null environment pointer.
+  OutStreamer->EmitIntValue(0, 8 /* size */);
+  OutStreamer->SwitchSection(Current.first, Current.second);
+}
+
+bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+  const DataLayout &DL = getDataLayout();
+
+  bool isPPC64 = DL.getPointerSizeInBits() == 64;
+
+  PPCTargetStreamer &TS =
+      static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+  if (!TOC.empty()) {
+    MCSectionELF *Section;
+
+    if (isPPC64)
+      Section = OutStreamer->getContext().getELFSection(
+          ".toc", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+        else
+          Section = OutStreamer->getContext().getELFSection(
+              ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+    OutStreamer->SwitchSection(Section);
+
+    for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
+         E = TOC.end(); I != E; ++I) {
+      OutStreamer->EmitLabel(I->second);
+      MCSymbol *S = I->first;
+      if (isPPC64) {
+        TS.emitTCEntry(*S);
+      } else {
+        OutStreamer->EmitValueToAlignment(4);
+        OutStreamer->EmitSymbolValue(S, 4);
+      }
+    }
+  }
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
+void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
+  // In the ELFv2 ABI, in functions that use the TOC register, we need to
+  // provide two entry points.  The ABI guarantees that when calling the
+  // local entry point, r2 is set up by the caller to contain the TOC base
+  // for this function, and when calling the global entry point, r12 is set
+  // up by the caller to hold the address of the global entry point.  We
+  // thus emit a prefix sequence along the following lines:
+  //
+  // func:
+  // .Lfunc_gepNN:
+  //         # global entry point
+  //         addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha
+  //         addi  r2,r2,(.TOC.-.Lfunc_gepNN)@l
+  // .Lfunc_lepNN:
+  //         .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
+  //         # local entry point, followed by function body
+  //
+  // For the Large code model, we create
+  //
+  // .Lfunc_tocNN:
+  //         .quad .TOC.-.Lfunc_gepNN      # done by EmitFunctionEntryLabel
+  // func:
+  // .Lfunc_gepNN:
+  //         # global entry point
+  //         ld    r2,.Lfunc_tocNN-.Lfunc_gepNN(r12)
+  //         add   r2,r2,r12
+  // .Lfunc_lepNN:
+  //         .localentry func, .Lfunc_lepNN-.Lfunc_gepNN
+  //         # local entry point, followed by function body
+  //
+  // This ensures we have r2 set up correctly while executing the function
+  // body, no matter which entry point is called.
+  if (Subtarget->isELFv2ABI()
+      // Only do all that if the function uses r2 in the first place.
+      && !MF->getRegInfo().use_empty(PPC::X2)) {
+    // Note: The logic here must be synchronized with the code in the
+    // branch-selection pass which sets the offset of the first block in the
+    // function. This matters because it affects the alignment.
+    const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+
+    MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol();
+    OutStreamer->EmitLabel(GlobalEntryLabel);
+    const MCSymbolRefExpr *GlobalEntryLabelExp =
+      MCSymbolRefExpr::create(GlobalEntryLabel, OutContext);
+
+    if (TM.getCodeModel() != CodeModel::Large) {
+      MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
+      const MCExpr *TOCDeltaExpr =
+        MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
+                                GlobalEntryLabelExp, OutContext);
+
+      const MCExpr *TOCDeltaHi =
+        PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext);
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                   .addReg(PPC::X2)
+                                   .addReg(PPC::X12)
+                                   .addExpr(TOCDeltaHi));
+
+      const MCExpr *TOCDeltaLo =
+        PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext);
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
+                                   .addReg(PPC::X2)
+                                   .addReg(PPC::X2)
+                                   .addExpr(TOCDeltaLo));
+    } else {
+      MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol();
+      const MCExpr *TOCOffsetDeltaExpr =
+        MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext),
+                                GlobalEntryLabelExp, OutContext);
+
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD)
+                                   .addReg(PPC::X2)
+                                   .addExpr(TOCOffsetDeltaExpr)
+                                   .addReg(PPC::X12));
+      EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8)
+                                   .addReg(PPC::X2)
+                                   .addReg(PPC::X2)
+                                   .addReg(PPC::X12));
+    }
+
+    MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol();
+    OutStreamer->EmitLabel(LocalEntryLabel);
+    const MCSymbolRefExpr *LocalEntryLabelExp =
+       MCSymbolRefExpr::create(LocalEntryLabel, OutContext);
+    const MCExpr *LocalOffsetExp =
+      MCBinaryExpr::createSub(LocalEntryLabelExp,
+                              GlobalEntryLabelExp, OutContext);
+
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+    if (TS)
+      TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp);
+  }
+}
+
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+  // Only the 64-bit target requires a traceback table.  For now,
+  // we only emit the word of zeroes that GDB requires to find
+  // the end of the function, and zeroes for the eight-byte
+  // mandatory fields.
+  // FIXME: We should fill in the eight-byte mandatory fields as described in
+  // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+  // currently make use of these fields).
+  if (Subtarget->isPPC64()) {
+    OutStreamer->EmitIntValue(0, 4/*size*/);
+    OutStreamer->EmitIntValue(0, 8/*size*/);
+  }
+}
+
+void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  static const char *const CPUDirectives[] = {
+    "",
+    "ppc",
+    "ppc440",
+    "ppc601",
+    "ppc602",
+    "ppc603",
+    "ppc7400",
+    "ppc750",
+    "ppc970",
+    "ppcA2",
+    "ppce500mc",
+    "ppce5500",
+    "power3",
+    "power4",
+    "power5",
+    "power5x",
+    "power6",
+    "power6x",
+    "power7",
+    // FIXME: why is power8 missing here?
+    "ppc64",
+    "ppc64le",
+    "power9"
+  };
+
+  // Get the numerically largest directive.
+  // FIXME: How should we merge darwin directives?
+  unsigned Directive = PPC::DIR_NONE;
+  for (const Function &F : M) {
+    const PPCSubtarget &STI = TM.getSubtarget<PPCSubtarget>(F);
+    unsigned FDir = STI.getDarwinDirective();
+    Directive = Directive > FDir ? FDir : STI.getDarwinDirective();
+    if (STI.hasMFOCRF() && Directive < PPC::DIR_970)
+      Directive = PPC::DIR_970;
+    if (STI.hasAltivec() && Directive < PPC::DIR_7400)
+      Directive = PPC::DIR_7400;
+    if (STI.isPPC64() && Directive < PPC::DIR_64)
+      Directive = PPC::DIR_64;
+  }
+
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+
+  assert(Directive < array_lengthof(CPUDirectives) &&
+         "CPUDirectives[] might not be up-to-date!");
+  PPCTargetStreamer &TStreamer =
+      *static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+  TStreamer.emitMachine(CPUDirectives[Directive]);
+
+  // Prime text sections so they are adjacent.  This reduces the likelihood a
+  // large data or debug section causes a branch to exceed 16M limit.
+  const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection());
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    OutStreamer->SwitchSection(
+           OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                                      MachO::S_SYMBOL_STUBS |
+                                      MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      32, SectionKind::getText()));
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    OutStreamer->SwitchSection(
+           OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                                      MachO::S_SYMBOL_STUBS |
+                                      MachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      16, SectionKind::getText()));
+  }
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+}
+
+bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
+  bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64;
+
+  // Darwin/PPC always uses mach-o.
+  const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  if (MMI) {
+    MachineModuleInfoMachO &MMIMacho =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    if (MAI->doesSupportExceptionHandling()) {
+      // Add the (possibly multiple) personalities to the set of global values.
+      // Only referenced functions get into the Personalities list.
+      for (const Function *Personality : MMI->getPersonalities()) {
+        if (Personality) {
+          MCSymbol *NLPSym =
+              getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr");
+          MachineModuleInfoImpl::StubValueTy &StubSym =
+              MMIMacho.getGVStubEntry(NLPSym);
+          StubSym =
+              MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true);
+        }
+      }
+    }
+
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+
+    // Output macho stubs for external and common global variables.
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(isPPC64 ? 3 : 2);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$stub:
+        OutStreamer->EmitLabel(Stubs[i].first);
+        //   .indirect_symbol _foo
+        MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+        OutStreamer->EmitSymbolAttribute(MCSym.getPointer(),
+                                         MCSA_IndirectSymbol);
+
+        if (MCSym.getInt())
+          // External to current translation unit.
+          OutStreamer->EmitIntValue(0, isPPC64 ? 8 : 4 /*size*/);
+        else
+          // Internal to current translation unit.
+          //
+          // When we place the LSDA into the TEXT section, the type info
+          // pointers
+          // need to be indirect and pc-rel. We accomplish this by using NLPs.
+          // However, sometimes the types are local to the file. So we need to
+          // fill in the value for the NLP in those cases.
+          OutStreamer->EmitValue(
+              MCSymbolRefExpr::create(MCSym.getPointer(), OutContext),
+              isPPC64 ? 8 : 4 /*size*/);
+      }
+
+      Stubs.clear();
+      OutStreamer->AddBlankLine();
+    }
+  }
+
+  // Funny Darwin hack: This flag tells the linker that no global symbols
+  // contain code that falls through to other global symbols (e.g. the obvious
+  // implementation of multiple entry points).  If this doesn't occur, the
+  // linker can safely perform dead code stripping.  Since LLVM never generates
+  // code that does this, it is always safe to set.
+  OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+static AsmPrinter *
+createPPCAsmPrinterPass(TargetMachine &tm,
+                        std::unique_ptr<MCStreamer> &&Streamer) {
+  if (tm.getTargetTriple().isMacOSX())
+    return new PPCDarwinAsmPrinter(tm, std::move(Streamer));
+  return new PPCLinuxAsmPrinter(tm, std::move(Streamer));
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmPrinter() {
+  TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
+                                     createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(getThePPC64Target(),
+                                     createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(getThePPC64LETarget(),
+                                     createPPCAsmPrinterPass);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
new file mode 100644
index 000000000000..93c201d03869
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -0,0 +1,274 @@
+//===- PPCBoolRetToInt.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements converting i1 values to i32 if they could be more
+// profitably allocated as GPRs rather than CRs. This pass will become totally
+// unnecessary if Register Bank Allocation and Global Instruction Selection ever
+// go upstream.
+//
+// Presently, the pass converts i1 Constants, and Arguments to i32 if the
+// transitive closure of their uses includes only PHINodes, CallInsts, and
+// ReturnInsts. The rational is that arguments are generally passed and returned
+// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will
+// actually save casts at the Machine Instruction level.
+//
+// It might be useful to expand this pass to add bit-wise operations to the list
+// of safe transitive closure types. Also, we miss some opportunities when LLVM
+// represents logical AND and OR operations with control flow rather than data
+// flow. For example by lowering the expression: return (A && B && C)
+//
+// as: return A ? true : B && C.
+//
+// There's code in SimplifyCFG that code be used to turn control flow in data
+// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so
+// this probably isn't good in general, but for the special case of i1, the
+// Selects could be further lowered to bit operations that are fast everywhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "bool-ret-to-int"
+
+STATISTIC(NumBoolRetPromotion,
+          "Number of times a bool feeding a RetInst was promoted to an int");
+STATISTIC(NumBoolCallPromotion,
+          "Number of times a bool feeding a CallInst was promoted to an int");
+STATISTIC(NumBoolToIntPromotion,
+          "Total number of times a bool was promoted to an int");
+
+class PPCBoolRetToInt : public FunctionPass {
+  static SmallPtrSet<Value *, 8> findAllDefs(Value *V) {
+    SmallPtrSet<Value *, 8> Defs;
+    SmallVector<Value *, 8> WorkList;
+    WorkList.push_back(V);
+    Defs.insert(V);
+    while (!WorkList.empty()) {
+      Value *Curr = WorkList.back();
+      WorkList.pop_back();
+      auto *CurrUser = dyn_cast<User>(Curr);
+      // Operands of CallInst are skipped because they may not be Bool type,
+      // and their positions are defined by ABI.
+      if (CurrUser && !isa<CallInst>(Curr))
+        for (auto &Op : CurrUser->operands())
+          if (Defs.insert(Op).second)
+            WorkList.push_back(Op);
+    }
+    return Defs;
+  }
+
+  // Translate a i1 value to an equivalent i32 value:
+  static Value *translate(Value *V) {
+    Type *Int32Ty = Type::getInt32Ty(V->getContext());
+    if (auto *C = dyn_cast<Constant>(V))
+      return ConstantExpr::getZExt(C, Int32Ty);
+    if (auto *P = dyn_cast<PHINode>(V)) {
+      // Temporarily set the operands to 0. We'll fix this later in
+      // runOnUse.
+      Value *Zero = Constant::getNullValue(Int32Ty);
+      PHINode *Q =
+        PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P);
+      for (unsigned i = 0; i < P->getNumOperands(); ++i)
+        Q->addIncoming(Zero, P->getIncomingBlock(i));
+      return Q;
+    }
+
+    auto *A = dyn_cast<Argument>(V);
+    auto *I = dyn_cast<Instruction>(V);
+    assert((A || I) && "Unknown value type");
+
+    auto InstPt =
+      A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode();
+    return new ZExtInst(V, Int32Ty, "", InstPt);
+  }
+
+  typedef SmallPtrSet<const PHINode *, 8> PHINodeSet;
+
+  // A PHINode is Promotable if:
+  // 1. Its type is i1 AND
+  // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic
+  // AND
+  // 3. All of its operands are Constant or Argument or
+  //    CallInst or PHINode AND
+  // 4. All of its PHINode uses are Promotable AND
+  // 5. All of its PHINode operands are Promotable
+  static PHINodeSet getPromotablePHINodes(const Function &F) {
+    PHINodeSet Promotable;
+    // Condition 1
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (const auto *P = dyn_cast<PHINode>(&I))
+          if (P->getType()->isIntegerTy(1))
+            Promotable.insert(P);
+
+    SmallVector<const PHINode *, 8> ToRemove;
+    for (const PHINode *P : Promotable) {
+      // Condition 2 and 3
+      auto IsValidUser = [] (const Value *V) -> bool {
+        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
+        isa<DbgInfoIntrinsic>(V);
+      };
+      auto IsValidOperand = [] (const Value *V) -> bool {
+        return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) ||
+        isa<PHINode>(V);
+      };
+      const auto &Users = P->users();
+      const auto &Operands = P->operands();
+      if (!llvm::all_of(Users, IsValidUser) ||
+          !llvm::all_of(Operands, IsValidOperand))
+        ToRemove.push_back(P);
+    }
+
+    // Iterate to convergence
+    auto IsPromotable = [&Promotable] (const Value *V) -> bool {
+      const auto *Phi = dyn_cast<PHINode>(V);
+      return !Phi || Promotable.count(Phi);
+    };
+    while (!ToRemove.empty()) {
+      for (auto &User : ToRemove)
+        Promotable.erase(User);
+      ToRemove.clear();
+
+      for (const PHINode *P : Promotable) {
+        // Condition 4 and 5
+        const auto &Users = P->users();
+        const auto &Operands = P->operands();
+        if (!llvm::all_of(Users, IsPromotable) ||
+            !llvm::all_of(Operands, IsPromotable))
+          ToRemove.push_back(P);
+      }
+    }
+
+    return Promotable;
+  }
+
+  typedef DenseMap<Value *, Value *> B2IMap;
+
+ public:
+  static char ID;
+
+  PPCBoolRetToInt() : FunctionPass(ID) {
+    initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
+    B2IMap Bool2IntMap;
+    bool Changed = false;
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        if (auto *R = dyn_cast<ReturnInst>(&I))
+          if (F.getReturnType()->isIntegerTy(1))
+            Changed |=
+              runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap);
+
+        if (auto *CI = dyn_cast<CallInst>(&I))
+          for (auto &U : CI->operands())
+            if (U->getType()->isIntegerTy(1))
+              Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap);
+      }
+    }
+
+    return Changed;
+  }
+
+  static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
+                       B2IMap &BoolToIntMap) {
+    auto Defs = findAllDefs(U);
+
+    // If the values are all Constants or Arguments, don't bother
+    if (llvm::none_of(Defs, isa<Instruction, Value *>))
+      return false;
+
+    // Presently, we only know how to handle PHINode, Constant, Arguments and
+    // CallInst. Potentially, bitwise operations (AND, OR, XOR, NOT) and sign
+    // extension could also be handled in the future.
+    for (Value *V : Defs)
+      if (!isa<PHINode>(V) && !isa<Constant>(V) &&
+          !isa<Argument>(V) && !isa<CallInst>(V))
+        return false;
+
+    for (Value *V : Defs)
+      if (const auto *P = dyn_cast<PHINode>(V))
+        if (!PromotablePHINodes.count(P))
+          return false;
+
+    if (isa<ReturnInst>(U.getUser()))
+      ++NumBoolRetPromotion;
+    if (isa<CallInst>(U.getUser()))
+      ++NumBoolCallPromotion;
+    ++NumBoolToIntPromotion;
+
+    for (Value *V : Defs)
+      if (!BoolToIntMap.count(V))
+        BoolToIntMap[V] = translate(V);
+
+    // Replace the operands of the translated instructions. They were set to
+    // zero in the translate function.
+    for (auto &Pair : BoolToIntMap) {
+      auto *First = dyn_cast<User>(Pair.first);
+      auto *Second = dyn_cast<User>(Pair.second);
+      assert((!First || Second) && "translated from user to non-user!?");
+      // Operands of CallInst are skipped because they may not be Bool type,
+      // and their positions are defined by ABI.
+      if (First && !isa<CallInst>(First))
+        for (unsigned i = 0; i < First->getNumOperands(); ++i)
+          Second->setOperand(i, BoolToIntMap[First->getOperand(i)]);
+    }
+
+    Value *IntRetVal = BoolToIntMap[U];
+    Type *Int1Ty = Type::getInt1Ty(U->getContext());
+    auto *I = cast<Instruction>(U.getUser());
+    Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I);
+    U.set(BackToBool);
+
+    return true;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char PPCBoolRetToInt::ID = 0;
+INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
+                "Convert i1 constants to i32 if they are returned",
+                false, false)
+
+FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 000000000000..ae76386fdfb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,283 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-branch-select"
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace llvm {
+  void initializePPCBSelPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCBSel : public MachineFunctionPass {
+    static char ID;
+    PPCBSel() : MachineFunctionPass(ID) {
+      initializePPCBSelPass(*PassRegistry::getPassRegistry());
+    }
+
+    // The sizes of the basic blocks in the function (the first
+    // element of the pair); the second element of the pair is the amount of the
+    // size that is due to potential padding.
+    std::vector<std::pair<unsigned, unsigned>> BlockSizes;
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override { return "PowerPC Branch Selector"; }
+  };
+  char PPCBSel::ID = 0;
+}
+
+INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
+                false, false)
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+  return new PPCBSel();
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const PPCInstrInfo *TII =
+      static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  auto GetAlignmentAdjustment =
+    [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
+    unsigned Align = MBB.getAlignment();
+    if (!Align)
+      return 0;
+
+    unsigned AlignAmt = 1 << Align;
+    unsigned ParentAlign = MBB.getParent()->getAlignment();
+
+    if (Align <= ParentAlign)
+      return OffsetToAlignment(Offset, AlignAmt);
+
+    // The alignment of this MBB is larger than the function's alignment, so we
+    // can't tell whether or not it will insert nops. Assume that it will.
+    return AlignAmt + OffsetToAlignment(Offset, AlignAmt);
+  };
+
+  // We need to be careful about the offset of the first block in the function
+  // because it might not have the function's alignment. This happens because,
+  // under the ELFv2 ABI, for functions which require a TOC pointer, we add a
+  // two-instruction sequence to the start of the function.
+  // Note: This needs to be synchronized with the check in
+  // PPCLinuxAsmPrinter::EmitFunctionBodyStart.
+  unsigned InitialOffset = 0;
+  if (Fn.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+      !Fn.getRegInfo().use_empty(PPC::X2))
+    InitialOffset = 8;
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = InitialOffset;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+
+    // The end of the previous block may have extra nops if this block has an
+    // alignment requirement.
+    if (MBB->getNumber() > 0) {
+      unsigned AlignExtra = GetAlignmentAdjustment(*MBB, FuncSize);
+
+      auto &BS = BlockSizes[MBB->getNumber()-1];
+      BS.first += AlignExtra;
+      BS.second = AlignExtra;
+
+      FuncSize += AlignExtra;
+    }
+
+    unsigned BlockSize = 0;
+    for (MachineInstr &MI : *MBB)
+      BlockSize += TII->getInstSizeInBytes(MI);
+
+    BlockSizes[MBB->getNumber()].first = BlockSize;
+    FuncSize += BlockSize;
+  }
+  
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+  
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+  
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        MachineBasicBlock *Dest = nullptr;
+        if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
+          Dest = I->getOperand(2).getMBB();
+        else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
+                 !I->getOperand(1).isImm())
+          Dest = I->getOperand(1).getMBB();
+        else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
+                  I->getOpcode() == PPC::BDZ8  || I->getOpcode() == PPC::BDZ) &&
+                 !I->getOperand(0).isImm())
+          Dest = I->getOperand(0).getMBB();
+
+        if (!Dest) {
+          MBBStartOffset += TII->getInstSizeInBytes(*I);
+          continue;
+        }
+        
+        // Determine the offset from the current branch to the destination
+        // block.
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+          
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i].first;
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i].first;
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<16>(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        MachineInstr &OldBranch = *I;
+        DebugLoc dl = OldBranch.getDebugLoc();
+
+        if (I->getOpcode() == PPC::BCC) {
+          // The BCC operands are:
+          // 0. PPC branch predicate
+          // 1. CR register
+          // 2. Target MBB
+          PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+          unsigned CRReg = I->getOperand(1).getReg();
+       
+          // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+          BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+            .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        } else if (I->getOpcode() == PPC::BC) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BCn) {
+          unsigned CRBit = I->getOperand(0).getReg();
+          BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+        } else {
+           llvm_unreachable("Unhandled branch type!");
+        }
+        
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch.eraseFromParent();
+
+        // Remember that this instruction is 8-bytes, increase the size of the
+        // block by 4, remember to iterate.
+        BlockSizes[MBB.getNumber()].first += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+
+    if (MadeChange) {
+      // If we're going to iterate again, make sure we've updated our
+      // padding-based contributions to the block sizes.
+      unsigned Offset = InitialOffset;
+      for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+           ++MFI) {
+        MachineBasicBlock *MBB = &*MFI;
+
+        if (MBB->getNumber() > 0) {
+          auto &BS = BlockSizes[MBB->getNumber()-1];
+          BS.first -= BS.second;
+          Offset -= BS.second;
+
+          unsigned AlignExtra = GetAlignmentAdjustment(*MBB, Offset);
+
+          BS.first += AlignExtra;
+          BS.second = AlignExtra;
+
+          Offset += AlignExtra;
+        }
+
+        Offset += BlockSizes[MBB->getNumber()].first;
+      }
+    }
+
+    EverMadeChange |= MadeChange;
+  }
+  
+  BlockSizes.clear();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp
new file mode 100644
index 000000000000..5510a95430f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCCState.cpp
@@ -0,0 +1,36 @@
+//===---- PPCCCState.cpp - CCState with PowerPC specific extensions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCCCState.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/Module.h"
+using namespace llvm;
+
+// Identify lowered values that originated from ppcf128 arguments and record
+// this.
+void PPCCCState::PreAnalyzeCallOperands(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (const auto &I : Outs) {
+    if (I.ArgVT == llvm::MVT::ppcf128)
+      OriginalArgWasPPCF128.push_back(true);
+    else
+      OriginalArgWasPPCF128.push_back(false);
+  }
+}
+
+void PPCCCState::PreAnalyzeFormalArguments(
+    const SmallVectorImpl<ISD::InputArg> &Ins) {
+  for (const auto &I : Ins) {
+    if (I.ArgVT == llvm::MVT::ppcf128) {
+      OriginalArgWasPPCF128.push_back(true);
+    } else {
+      OriginalArgWasPPCF128.push_back(false);
+    }
+  }
+}
+\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCCState.h b/contrib/llvm/lib/Target/PowerPC/PPCCCState.h
new file mode 100644
index 000000000000..9be9f11dbea3
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCCState.h
@@ -0,0 +1,42 @@
+//===---- PPCCCState.h - CCState with PowerPC specific extensions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCCCSTATE_H
+#define PPCCCSTATE_H
+
+#include "PPCISelLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+
+class PPCCCState : public CCState {
+public:
+
+  void
+  PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs);
+  void
+  PreAnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+private:
+
+  // Records whether the value has been lowered from an ppcf128.
+  SmallVector<bool, 4> OriginalArgWasPPCF128;
+
+public:
+  PPCCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+             SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+        : CCState(CC, isVarArg, MF, locs, C) {}
+
+  bool WasOriginalArgPPCF128(unsigned ValNo) { return OriginalArgWasPPCF128[ValNo]; }
+  void clearWasPPCF128() { OriginalArgWasPPCF128.clear(); }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 000000000000..2c62a0f1d909
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,728 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+//
+// Criteria for CTR loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Try inner-most loops first
+//  - No nested CTR loops.
+//  - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#ifndef NDEBUG
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ctrloops"
+
+#ifndef NDEBUG
+static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
+#endif
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace llvm {
+  void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+  void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
+}
+
+namespace {
+  struct PPCCTRLoops : public FunctionPass {
+
+#ifndef NDEBUG
+    static int Counter;
+#endif
+
+  public:
+    static char ID;
+
+    PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+    PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+    }
+
+  private:
+    bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+    bool convertToCTRLoop(Loop *L);
+
+  private:
+    PPCTargetMachine *TM;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+    DominatorTree *DT;
+    const TargetLibraryInfo *LibInfo;
+    bool PreserveLCSSA;
+  };
+
+  char PPCCTRLoops::ID = 0;
+#ifndef NDEBUG
+  int PPCCTRLoops::Counter = 0;
+#endif
+
+#ifndef NDEBUG
+  struct PPCCTRLoopsVerify : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
+      initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+  private:
+    MachineDominatorTree *MDT;
+  };
+
+  char PPCCTRLoopsVerify::ID = 0;
+#endif // NDEBUG
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                    false, false)
+
+FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
+  return new PPCCTRLoops(TM);
+}
+
+#ifndef NDEBUG
+INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                      "PowerPC CTR Loops Verify", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                    "PowerPC CTR Loops Verify", false, false)
+
+FunctionPass *llvm::createPPCCTRLoopsVerify() {
+  return new PPCCTRLoopsVerify();
+}
+#endif // NDEBUG
+
+bool PPCCTRLoops::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DL = &F.getParent()->getDataLayout();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool MadeChange = false;
+
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
+       I != E; ++I) {
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      MadeChange |= convertToCTRLoop(L);
+  }
+
+  return MadeChange;
+}
+
+static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+    return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
+
+  return false;
+}
+
+// Determining the address of a TLS variable results in a function call in
+// certain TLS models.
+static bool memAddrUsesCTR(const PPCTargetMachine *TM,
+                           const Value *MemAddr) {
+  const auto *GV = dyn_cast<GlobalValue>(MemAddr);
+  if (!GV) {
+    // Recurse to check for constants that refer to TLS global variables.
+    if (const auto *CV = dyn_cast<Constant>(MemAddr))
+      for (const auto &CO : CV->operands())
+        if (memAddrUsesCTR(TM, CO))
+          return true;
+
+    return false;
+  }
+
+  if (!GV->isThreadLocal())
+    return false;
+  if (!TM)
+    return true;
+  TLSModel::Model Model = TM->getTLSModel(GV);
+  return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
+}
+
+bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        // Inline ASM is okay, unless it clobbers the ctr register.
+        InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+        for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+          InlineAsm::ConstraintInfo &C = CIV[i];
+          if (C.Type != InlineAsm::isInput)
+            for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+              if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+                return true;
+        }
+
+        continue;
+      }
+
+      if (!TM)
+        return true;
+      const TargetLowering *TLI =
+          TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode = 0;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+          // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+          // we're definitely using CTR.
+          case Intrinsic::ppc_is_decremented_ctr_nonzero:
+          case Intrinsic::ppc_mtctr:
+            return true;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
+          }
+        }
+
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc::Func Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc::copysign:
+          case LibFunc::copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc::copysignl:
+            return true;
+          case LibFunc::fabs:
+          case LibFunc::fabsf:
+          case LibFunc::fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc::sqrt:
+          case LibFunc::sqrtf:
+          case LibFunc::sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc::floor:
+          case LibFunc::floorf:
+          case LibFunc::floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc::nearbyint:
+          case LibFunc::nearbyintf:
+          case LibFunc::nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc::ceil:
+          case LibFunc::ceilf:
+          case LibFunc::ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc::rint:
+          case LibFunc::rintf:
+          case LibFunc::rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc::round:
+          case LibFunc::roundf:
+          case LibFunc::roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc::trunc:
+          case LibFunc::truncf:
+          case LibFunc::truncl:
+            Opcode = ISD::FTRUNC; break;
+          case LibFunc::fmin:
+          case LibFunc::fminf:
+          case LibFunc::fminl:
+            Opcode = ISD::FMINNUM; break;
+          case LibFunc::fmax:
+          case LibFunc::fmaxf:
+          case LibFunc::fmaxl:
+            Opcode = ISD::FMAXNUM; break;
+          }
+        }
+
+        if (Opcode) {
+          auto &DL = CI->getModule()->getDataLayout();
+          MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
+                                            true);
+          if (VTy == MVT::Other)
+            return true;
+
+          if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+            continue;
+          else if (VTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+            continue;
+
+          return true;
+        }
+      }
+
+      return true;
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
+          isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
+        return true;
+    } else if (isLargeIntegerTy(TT.isArch32Bit(),
+                                J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (TT.isArch32Bit() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (!TM)
+        return true;
+      const TargetLowering *TLI =
+          TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
+
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
+        return true;
+    }
+
+    if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) {
+      switch(J->getOpcode()) {
+      case Instruction::FAdd:
+      case Instruction::FSub:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FRem:
+      case Instruction::FPTrunc:
+      case Instruction::FPExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::UIToFP:
+      case Instruction::SIToFP:
+      case Instruction::FCmp:
+        return true;
+      }
+    }
+
+    for (Value *Operand : J->operands())
+      if (memAddrUsesCTR(TM, Operand))
+        return true;
+  }
+
+  return false;
+}
+
+bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
+  bool MadeChange = false;
+
+  const Triple TT =
+      Triple(L->getHeader()->getParent()->getParent()->getTargetTriple());
+  if (!TT.isArch32Bit() && !TT.isArch64Bit())
+    return MadeChange; // Unknown arch. type.
+
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    MadeChange |= convertToCTRLoop(*I);
+    DEBUG(dbgs() << "Nested loop converted\n");
+  }
+
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (MadeChange)
+    return MadeChange;
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = CTRLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= CTRLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(TT, *I))
+      return MadeChange;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  BasicBlock *CountedExitBlock = nullptr;
+  const SCEV *ExitCount = nullptr;
+  BranchInst *CountedExitBranch = nullptr;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+                    (*I)->getName() << ": " << *EC << "\n");
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // exisiting block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
+
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
+
+    if (NotAlways)
+      continue;
+
+    // Make sure this blocks ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
+        continue;
+
+      CountedExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    CountedExitBlock = *I;
+    ExitCount = EC;
+    break;
+  }
+
+  if (!CountedExitBlock)
+    return MadeChange;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one. If we already have a
+  // preheader, then we can use it (except if the preheader contains a use of
+  // the CTR register because some such uses might be reordered by the
+  // selection DAG after the mtctr instruction).
+  if (!Preheader || mightUseCTR(TT, Preheader))
+    Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+  if (!Preheader)
+    return MadeChange;
+
+  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+  // Insert the count into the preheader and replace the condition used by the
+  // selected branch.
+  MadeChange = true;
+
+  SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
+  LLVMContext &C = SE->getContext();
+  Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
+                                       Type::getInt32Ty(C);
+  if (!ExitCount->getType()->isPointerTy() &&
+      ExitCount->getType() != CountType)
+    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
+  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
+  Value *ECValue =
+      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
+
+  IRBuilder<> CountBuilder(Preheader->getTerminator());
+  Module *M = Preheader->getParent()->getParent();
+  Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
+                                               CountType);
+  CountBuilder.CreateCall(MTCTRFunc, ECValue);
+
+  IRBuilder<> CondBuilder(CountedExitBranch);
+  Value *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
+  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
+  Value *OldCond = CountedExitBranch->getCondition();
+  CountedExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(CountedExitBranch->getSuccessor(0)))
+    CountedExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+  DeleteDeadPHIs(CountedExitBlock);
+
+  ++NumCTRLoops;
+  return MadeChange;
+}
+
+#ifndef NDEBUG
+static bool clobbersCTR(const MachineInstr &MI) {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (MO.isReg()) {
+      if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
+        return true;
+    } else if (MO.isRegMask()) {
+      if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+static bool verifyCTRBranch(MachineBasicBlock *MBB,
+                            MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator BI = I;
+  SmallSet<MachineBasicBlock *, 16>   Visited;
+  SmallVector<MachineBasicBlock *, 8> Preds;
+  bool CheckPreds;
+
+  if (I == MBB->begin()) {
+    Visited.insert(MBB);
+    goto queue_preds;
+  } else
+    --I;
+
+check_block:
+  Visited.insert(MBB);
+  if (I == MBB->end())
+    goto queue_preds;
+
+  CheckPreds = true;
+  for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
+    unsigned Opc = I->getOpcode();
+    if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
+      CheckPreds = false;
+      break;
+    }
+
+    if (I != BI && clobbersCTR(*I)) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
+                      MBB->getFullName() << ") instruction " << *I <<
+                      " clobbers CTR, invalidating " << "BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
+      return false;
+    }
+
+    if (I == IE)
+      break;
+  }
+
+  if (!CheckPreds && Preds.empty())
+    return true;
+
+  if (CheckPreds) {
+queue_preds:
+    if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
+      DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
+      return false;
+    }
+
+    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PIE = MBB->pred_end(); PI != PIE; ++PI)
+      Preds.push_back(*PI);
+  }
+
+  do {
+    MBB = Preds.pop_back_val();
+    if (!Visited.count(MBB)) {
+      I = MBB->getLastNonDebugInstr();
+      goto check_block;
+    }
+  } while (!Preds.empty());
+
+  return true;
+}
+
+bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
+  MDT = &getAnalysis<MachineDominatorTree>();
+
+  // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
+  // any other instructions that might clobber the ctr register.
+  for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
+       I != IE; ++I) {
+    MachineBasicBlock *MBB = &*I;
+    if (!MDT->isReachableFromEntry(MBB))
+      continue;
+
+    for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
+      MIIE = MBB->end(); MII != MIIE; ++MII) {
+      unsigned Opc = MII->getOpcode();
+      if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
+          Opc == PPC::BDZ8  || Opc == PPC::BDZ)
+        if (!verifyCTRBranch(MBB, MII))
+          llvm_unreachable("Invalid PPC CTR loop!");
+    }
+  }
+
+  return false;
+}
+#endif // NDEBUG
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h
new file mode 100644
index 000000000000..eb904a858592
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.h
@@ -0,0 +1,35 @@
+//=== PPCCallingConv.h - PPC Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the PPC Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+#define LLVM_LIB_TARGET_PPC_PPCCALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_PPC_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to PPC C calling convention on Release builds.
+  return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 000000000000..a4f4c8688cc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,284 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+    : CCIf<!strconcat("static_cast<const PPCSubtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).",
+                     F),
+          A>;
+class CCIfNotSubtarget<string F, CCAction A>
+    : CCIf<!strconcat("!static_cast<const PPCSubtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).",
+                     F),
+          A>;
+class CCIfOrigArgWasNotPPCF128<CCAction A>
+    : CCIf<"!static_cast<PPCCCState *>(&State)->WasOriginalArgPPCF128(ValNo)",
+           A>;
+class CCIfOrigArgWasPPCF128<CCAction A>
+    : CCIf<"static_cast<PPCCCState *>(&State)->WasOriginalArgPPCF128(ValNo)",
+           A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// PPC64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the PPC C calling convention.
+def RetCC_PPC64_AnyReg : CallingConv<[
+  CCCustom<"CC_PPC_AnyReg_Error">
+]>;
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
+  // On PPC64, integer return values are always promoted to i64
+  CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+  CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+  // Floating point types returned as "direct" go into F1 .. F8; note that
+  // only the ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // QPX vectors are returned in QF1 and QF2. 
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+ 
+  // Vector types returned as "direct" go into V2 .. V9; note that only the
+  // ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+           CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the PPC C calling convention.
+def CC_PPC64_AnyReg : CallingConv<[
+  CCCustom<"CC_PPC_AnyReg_Error">
+]>;
+
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic.  FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_PPC64_AnyReg>>,
+
+  CCIfType<[i1],  CCPromoteToType<i64>>,
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64.  Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
+
+  CCIfType<[i1],   CCPromoteToType<i64>>,
+  CCIfType<[i8],   CCPromoteToType<i64>>,
+  CCIfType<[i16],  CCPromoteToType<i64>>,
+  CCIfType<[i32],  CCPromoteToType<i64>>,
+  CCIfType<[i64],  CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[f32],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+           CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 32-bit ABI
+//===----------------------------------------------------------------------===//
+
+def CC_PPC32_SVR4_Common : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+
+  // The ABI requires i64 to be passed in two adjacent registers with the first
+  // register having an odd register number.
+  CCIfType<[i32],
+  CCIfSplit<CCIfSubtarget<"useSoftFloat()", 
+            CCIfOrigArgWasNotPPCF128<
+            CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>>,
+  
+  CCIfType<[i32],
+  CCIfSplit<CCIfNotSubtarget<"useSoftFloat()", 
+                            CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>>,
+  CCIfSplit<CCIfSubtarget<"useSoftFloat()",
+                          CCIfOrigArgWasPPCF128<CCCustom<
+                          "CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128">>>>,
+
+  // The 'nest' parameter, if any, is passed in R11.
+  CCIfNest<CCAssignToReg<[R11]>>,
+
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+
+  // Make sure the i64 words from a long double are either both passed in
+  // registers or both passed on the stack.
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
+  
+  // FP values are passed in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // Split arguments have an alignment of 8 bytes on the stack.
+  CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+  
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  
+  // Floats are stored in double precision format, thus they have the same
+  // alignment and size as doubles.
+  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
+
+  // QPX vectors that are stored in double precision need 32-byte alignment.
+  CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention puts vector arguments always on the stack. It is used
+// to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
+  // QPX vectors mirror the scalar FP convention.
+  CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()",
+    CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,
+
+  // The first 12 Vector arguments are passed in AltiVec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+           CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
+                          V8, V9, V10, V11, V12, V13]>>>,
+           
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;  
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the callers stack frame.
+//
+// Still, the address of the aggregate copy in the callers stack frame is passed
+// in a GPR (or in the parameter list area if all GPRs are allocated) from the
+// caller to the callee. The location for the address argument is assigned by
+// the CC_PPC32_SVR4 calling convention.
+//
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
+// not passed by value.
+ 
+def CC_PPC32_SVR4_ByVal : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  
+  CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
+]>;
+
+def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
+                                       V28, V29, V30, V31)>;
+
+def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
+
+def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
+
+def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
+
+def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>;
+
+def CSR_SVR464_ViaCopy : CalleeSavedRegs<(add CSR_SVR464)>;
+
+def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
+
+def CSR_SVR464_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_Altivec)>;
+
+def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
+
+def CSR_SVR464_R2_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2)>;
+
+def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+
+def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>;
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
+                                             (sequence "X%u", 14, 31),
+                                             (sequence "F%u", 0, 31),
+                                             (sequence "CR%u", 0, 7))>;
+
+def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
+                                             (sequence "V%u", 0, 31))>;
+
+def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
+                                         (sequence "VSL%u", 0, 31))>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
new file mode 100644
index 000000000000..6bd229625fc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -0,0 +1,213 @@
+//===------------- PPCEarlyReturn.cpp - Form Early Returns ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that form early (predicated) returns. If-conversion handles some of
+// this, but this pass picks up some remaining cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-early-ret"
+STATISTIC(NumBCLR, "Number of early conditional returns");
+STATISTIC(NumBLR,  "Number of early returns");
+
+namespace llvm {
+  void initializePPCEarlyReturnPass(PassRegistry&);
+}
+
+namespace {
+  // PPCEarlyReturn pass - For simple functions without epilogue code, move
+  // returns up, and create conditional returns, to avoid unnecessary
+  // branch-to-blr sequences.
+  struct PPCEarlyReturn : public MachineFunctionPass {
+    static char ID;
+    PPCEarlyReturn() : MachineFunctionPass(ID) {
+      initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry());
+    }
+
+    const TargetInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &ReturnMBB) {
+      bool Changed = false;
+
+      MachineBasicBlock::iterator I = ReturnMBB.begin();
+      I = ReturnMBB.SkipPHIsLabelsAndDebug(I);
+
+      // The block must be essentially empty except for the blr.
+      if (I == ReturnMBB.end() ||
+          (I->getOpcode() != PPC::BLR && I->getOpcode() != PPC::BLR8) ||
+          I != ReturnMBB.getLastNonDebugInstr())
+        return Changed;
+
+      SmallVector<MachineBasicBlock*, 8> PredToRemove;
+      for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(),
+           PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) {
+        bool OtherReference = false, BlockChanged = false;
+
+        if ((*PI)->empty())
+          continue;
+        
+        for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) {
+          if (J == (*PI)->end())
+            break;
+
+          if (J->getOpcode() == PPC::B) {
+            if (J->getOperand(0).getMBB() == &ReturnMBB) {
+              // This is an unconditional branch to the return. Replace the
+              // branch with a blr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode()))
+                  .copyImplicitOps(*I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BCC) {
+            if (J->getOperand(2).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
+                  .addImm(J->getOperand(0).getImm())
+                  .addReg(J->getOperand(1).getReg())
+                  .copyImplicitOps(*I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) {
+            if (J->getOperand(1).getMBB() == &ReturnMBB) {
+              // This is a conditional branch to the return. Replace the branch
+              // with a bclr.
+              BuildMI(
+                  **PI, J, J->getDebugLoc(),
+                  TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
+                  .addReg(J->getOperand(0).getReg())
+                  .copyImplicitOps(*I);
+              MachineBasicBlock::iterator K = J--;
+              K->eraseFromParent();
+              BlockChanged = true;
+              ++NumBCLR;
+              continue;
+            }
+          } else if (J->isBranch()) {
+            if (J->isIndirectBranch()) {
+              if (ReturnMBB.hasAddressTaken())
+                OtherReference = true;
+            } else
+              for (unsigned i = 0; i < J->getNumOperands(); ++i)
+                if (J->getOperand(i).isMBB() &&
+                    J->getOperand(i).getMBB() == &ReturnMBB)
+                  OtherReference = true;
+          } else if (!J->isTerminator() && !J->isDebugValue())
+            break;
+
+          if (J == (*PI)->begin())
+            break;
+
+          --J;
+        }
+
+        if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB))
+          OtherReference = true;
+
+        // Predecessors are stored in a vector and can't be removed here.
+        if (!OtherReference && BlockChanged) {
+          PredToRemove.push_back(*PI);
+        }
+
+        if (BlockChanged)
+          Changed = true;
+      }
+
+      for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i)
+        PredToRemove[i]->removeSuccessor(&ReturnMBB, true);
+
+      if (Changed && !ReturnMBB.hasAddressTaken()) {
+        // We now might be able to merge this blr-only block into its
+        // by-layout predecessor.
+        if (ReturnMBB.pred_size() == 1) {
+          MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin();
+          if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) {
+            // Move the blr into the preceding block.
+            PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I);
+            PrevMBB.removeSuccessor(&ReturnMBB, true);
+          }
+        }
+
+        if (ReturnMBB.pred_empty())
+          ReturnMBB.eraseFromParent();
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      if (skipFunction(*MF.getFunction()))
+        return false;
+
+      TII = MF.getSubtarget().getInstrInfo();
+
+      bool Changed = false;
+
+      // If the function does not have at least two blocks, then there is
+      // nothing to do.
+      if (MF.size() < 2)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE,
+                "PowerPC Early-Return Creation", false, false)
+
+char PPCEarlyReturn::ID = 0;
+FunctionPass*
+llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
new file mode 100644
index 000000000000..9b91b9ab8f82
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -0,0 +1,2374 @@
+//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// PPCGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
+#include "PPCCCState.h"
+#include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+//   fastLowerArguments: Handle simple cases.
+//   PPCMaterializeGV: Handle TLS.
+//   SelectCall: Handle function pointers.
+//   SelectCall: Handle multi-register return values.
+//   SelectCall: Optimize away nops for local calls.
+//   processCallArgs: Handle bit-converted arguments.
+//   finishCall: Handle multi-register return values.
+//   PPCComputeAddress: Handle parameter references as FrameIndex's.
+//   PPCEmitCmp: Handle immediate as operand 1.
+//   SelectCall: Handle small byval arguments.
+//   SelectIntrinsicCall: Implement.
+//   SelectSelect: Implement.
+//   Consider factoring isTypeLegal into the base class.
+//   Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
+using namespace llvm;
+
+#define DEBUG_TYPE "ppcfastisel"
+
+namespace {
+
+typedef struct Address {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  long Offset;
+
+  // Innocuous defaults for our address.
+  Address()
+   : BaseType(RegBase), Offset(0) {
+     Base.Reg = 0;
+   }
+} Address;
+
+class PPCFastISel final : public FastISel {
+
+  const TargetMachine &TM;
+  const PPCSubtarget *PPCSubTarget;
+  PPCFunctionInfo *PPCFuncInfo;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  LLVMContext *Context;
+
+  public:
+    explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
+                         const TargetLibraryInfo *LibInfo)
+        : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
+          PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
+          PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
+          TII(*PPCSubTarget->getInstrInfo()),
+          TLI(*PPCSubTarget->getTargetLowering()),
+          Context(&FuncInfo.Fn->getContext()) {}
+
+  // Backend specific FastISel code.
+  private:
+    bool fastSelectInstruction(const Instruction *I) override;
+    unsigned fastMaterializeConstant(const Constant *C) override;
+    unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+    bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI) override;
+    bool fastLowerArguments() override;
+    unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
+    unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned fastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
+
+    bool fastLowerCall(CallLoweringInfo &CLI) override;
+
+  // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectIToFP(const Instruction *I, bool IsSigned);
+    bool SelectFPToI(const Instruction *I, bool IsSigned);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+
+  // Utility routines.
+  private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool isValueAvailable(const Value *V) const;
+    bool isVSFRCRegClass(const TargetRegisterClass *RC) const {
+      return RC->getID() == PPC::VSFRCRegClassID;
+    }
+    bool isVSSRCRegClass(const TargetRegisterClass *RC) const {
+      return RC->getID() == PPC::VSSRCRegClassID;
+    }
+    bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt, unsigned DestReg);
+    bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     const TargetRegisterClass *RC, bool IsZExt = true,
+                     unsigned FP64LoadOpc = PPC::LFD);
+    bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+    bool PPCComputeAddress(const Value *Obj, Address &Addr);
+    void PPCSimplifyAddress(Address &Addr, bool &UseOffset,
+                            unsigned &IndexReg);
+    bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                           unsigned DestReg, bool IsZExt);
+    unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+                               bool UseSExt = true);
+    unsigned PPCMaterialize32BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMaterialize64BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+                             unsigned SrcReg, bool IsSigned);
+    unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+  // Call handling routines.
+  private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    bool finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes);
+    LLVM_ATTRIBUTE_UNUSED CCAssignFn *usePPC32CCs(unsigned Flag);
+
+  private:
+  #include "PPCGenFastISel.inc"
+
+};
+
+} // end anonymous namespace
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  if (Flag == 1)
+    return CC_PPC32_SVR4;
+  else if (Flag == 2)
+    return CC_PPC32_SVR4_ByVal;
+  else if (Flag == 3)
+    return CC_PPC32_SVR4_VarArg;
+  else
+    return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // These are not representable with any single compare.
+    case CmpInst::FCMP_FALSE:
+    case CmpInst::FCMP_TRUE:
+    // Major concern about the following 6 cases is NaN result. The comparison
+    // result consists of 4 bits, indicating lt, eq, gt and un (unordered),
+    // only one of which will be set. The result is generated by fcmpu
+    // instruction. However, bc instruction only inspects one of the first 3
+    // bits, so when un is set, bc instruction may jump to to an undesired
+    // place.
+    //
+    // More specifically, if we expect an unordered comparison and un is set, we
+    // expect to always go to true branch; in such case UEQ, UGT and ULT still
+    // give false, which are undesired; but UNE, UGE, ULE happen to give true,
+    // since they are tested by inspecting !eq, !lt, !gt, respectively.
+    //
+    // Similarly, for ordered comparison, when un is set, we always expect the
+    // result to be false. In such case OGT, OLT and OEQ is good, since they are
+    // actually testing GT, LT, and EQ respectively, which are false. OGE, OLE
+    // and ONE are tested through !lt, !gt and !eq, and these are true.
+    case CmpInst::FCMP_UEQ:
+    case CmpInst::FCMP_UGT:
+    case CmpInst::FCMP_ULT:
+    case CmpInst::FCMP_OGE:
+    case CmpInst::FCMP_OLE:
+    case CmpInst::FCMP_ONE:
+    default:
+      return Optional<PPC::Predicate>();
+
+    case CmpInst::FCMP_OEQ:
+    case CmpInst::ICMP_EQ:
+      return PPC::PRED_EQ;
+
+    case CmpInst::FCMP_OGT:
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      return PPC::PRED_GT;
+
+    case CmpInst::FCMP_UGE:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      return PPC::PRED_GE;
+
+    case CmpInst::FCMP_OLT:
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_SLT:
+      return PPC::PRED_LT;
+
+    case CmpInst::FCMP_ULE:
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      return PPC::PRED_LE;
+
+    case CmpInst::FCMP_UNE:
+    case CmpInst::ICMP_NE:
+      return PPC::PRED_NE;
+
+    case CmpInst::FCMP_ORD:
+      return PPC::PRED_NU;
+
+    case CmpInst::FCMP_UNO:
+      return PPC::PRED_UN;
+  }
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT Evt = TLI.getValueType(DL, Ty, true);
+
+  // Only handle simple types.
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCFastISel::isValueAvailable(const Value *V) const {
+  if (!isa<Instruction>(V))
+    return true;
+
+  const auto *I = cast<Instruction>(V);
+  return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address.  Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+          TLI.getPointerTy(DL))
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      long TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = GTI.getStructTypeOrNull()) {
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // FIXME: References to parameters fall through to the behavior
+  // below.  They should be able to reference a frame index since
+  // they are stored to the stack, so we can get "ld rx, offset(r1)"
+  // instead of "addi ry, r1, offset / ld rx, 0(ry)".  Obj will
+  // just contain the parameter.  Try to handle this with a FI.
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0)
+    Addr.Base.Reg = getRegForValue(Obj);
+
+  // Prevent assignment of base register to X0, which is inappropriate
+  // for loads and stores alike.
+  if (Addr.Base.Reg != 0)
+    MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  return Addr.Base.Reg != 0;
+}
+
+// Fix up some addresses that can't be used directly.  For example, if
+// an offset won't fit in an instruction field, we may need to move it
+// into an index register.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset,
+                                     unsigned &IndexReg) {
+
+  // Check whether the offset fits in the instruction field.
+  if (!isInt<16>(Addr.Offset))
+    UseOffset = false;
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  if (!UseOffset) {
+    IntegerType *OffsetTy = Type::getInt64Ty(*Context);
+    const ConstantInt *Offset =
+      ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
+    IndexReg = PPCMaterializeInt(Offset, MVT::i64);
+    assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+  }
+}
+
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false.  See commentary below for how the register class of
+// the load is determined.
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              const TargetRegisterClass *RC,
+                              bool IsZExt, unsigned FP64LoadOpc) {
+  unsigned Opc;
+  bool UseOffset = true;
+
+  // If ResultReg is given, it determines the register class of the load.
+  // Otherwise, RC is the register class to use.  If the result of the
+  // load isn't anticipated in this block, both may be zero, in which
+  // case we must make a conservative guess.  In particular, don't assign
+  // R0 or X0 to the result register, as the result may be used in a load,
+  // store, add-immediate, or isel that won't permit this.  (Though
+  // perhaps the spill and reload of live-exit values would handle this?)
+  const TargetRegisterClass *UseRC =
+    (ResultReg ? MRI.getRegClass(ResultReg) :
+     (RC ? RC :
+      (VT == MVT::f64 ? &PPC::F8RCRegClass :
+       (VT == MVT::f32 ? &PPC::F4RCRegClass :
+        (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+         &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+  bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+      break;
+    case MVT::i16:
+      Opc = (IsZExt ? (Is32BitInt ? PPC::LHZ : PPC::LHZ8)
+                    : (Is32BitInt ? PPC::LHA : PPC::LHA8));
+      break;
+    case MVT::i32:
+      Opc = (IsZExt ? (Is32BitInt ? PPC::LWZ : PPC::LWZ8)
+                    : (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+      if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+        UseOffset = false;
+      break;
+    case MVT::i64:
+      Opc = PPC::LD;
+      assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
+             "64-bit load with 32-bit target??");
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::LFS;
+      break;
+    case MVT::f64:
+      Opc = FP64LoadOpc;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, UseOffset, IndexReg);
+
+  // If this is a potential VSX load with an offset of 0, a VSX indexed load can
+  // be used.
+  bool IsVSSRC = isVSSRCRegClass(UseRC);
+  bool IsVSFRC = isVSFRCRegClass(UseRC);
+  bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS;
+  bool Is64VSXLoad = IsVSFRC && Opc == PPC::LFD;
+  if ((Is32VSXLoad || Is64VSXLoad) &&
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+      (Addr.Offset == 0)) {
+    UseOffset = false;
+  }
+
+  if (ResultReg == 0)
+    ResultReg = createResultReg(UseRC);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    // VSX only provides an indexed load.
+    if (Is32VSXLoad || Is64VSXLoad) return false;
+
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+                                          Addr.Offset),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+    // VSX only provides an indexed load.
+    if (Is32VSXLoad || Is64VSXLoad) return false;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::LBZ:    Opc = PPC::LBZX;    break;
+      case PPC::LBZ8:   Opc = PPC::LBZX8;   break;
+      case PPC::LHZ:    Opc = PPC::LHZX;    break;
+      case PPC::LHZ8:   Opc = PPC::LHZX8;   break;
+      case PPC::LHA:    Opc = PPC::LHAX;    break;
+      case PPC::LHA8:   Opc = PPC::LHAX8;   break;
+      case PPC::LWZ:    Opc = PPC::LWZX;    break;
+      case PPC::LWZ8:   Opc = PPC::LWZX8;   break;
+      case PPC::LWA:    Opc = PPC::LWAX;    break;
+      case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+      case PPC::LD:     Opc = PPC::LDX;     break;
+      case PPC::LFS:    Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
+      case PPC::LFD:    Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
+    }
+
+    auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                       ResultReg);
+
+    // If we have an index register defined we use it in the store inst,
+    // otherwise we use X0 as base as it makes the vector instructions to
+    // use zero in the computation of the effective address regardless the
+    // content of the register.
+    if (IndexReg)
+      MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+    else
+      MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+  assert(SrcReg && "Nothing to store!");
+  unsigned Opc;
+  bool UseOffset = true;
+
+  const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+  bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+      break;
+    case MVT::i16:
+      Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+      break;
+    case MVT::i32:
+      assert(Is32BitInt && "Not GPRC for i32??");
+      Opc = PPC::STW;
+      break;
+    case MVT::i64:
+      Opc = PPC::STD;
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::STFS;
+      break;
+    case MVT::f64:
+      Opc = PPC::STFD;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, UseOffset, IndexReg);
+
+  // If this is a potential VSX store with an offset of 0, a VSX indexed store
+  // can be used.
+  bool IsVSSRC = isVSSRCRegClass(RC);
+  bool IsVSFRC = isVSFRCRegClass(RC);
+  bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS;
+  bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD;
+  if ((Is32VSXStore || Is64VSXStore) &&
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
+      (Addr.Offset == 0)) {
+    UseOffset = false;
+  }
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    // VSX only provides an indexed store.
+    if (Is32VSXStore || Is64VSXStore) return false;
+
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
+                                          Addr.Offset),
+        MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg)
+        .addImm(Addr.Offset)
+        .addFrameIndex(Addr.Base.FI)
+        .addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+    // VSX only provides an indexed store.
+    if (Is32VSXStore || Is64VSXStore)
+      return false;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::STB:  Opc = PPC::STBX;  break;
+      case PPC::STH : Opc = PPC::STHX;  break;
+      case PPC::STW : Opc = PPC::STWX;  break;
+      case PPC::STB8: Opc = PPC::STBX8; break;
+      case PPC::STH8: Opc = PPC::STHX8; break;
+      case PPC::STW8: Opc = PPC::STWX8; break;
+      case PPC::STD:  Opc = PPC::STDX;  break;
+      case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
+      case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
+    }
+
+    auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg);
+
+    // If we have an index register defined we use it in the store inst,
+    // otherwise we use X0 as base as it makes the vector instructions to
+    // use zero in the computation of the effective address regardless the
+    // content of the register.
+    if (IndexReg)
+      MIB.addReg(Addr.Base.Reg).addReg(IndexReg);
+    else
+      MIB.addReg(PPC::ZERO8).addReg(Addr.Base.Reg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomics loads are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!PPCEmitStore(VT, SrcReg, Addr))
+    return false;
+
+  return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (isValueAvailable(CI)) {
+      Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+      if (!OptPPCPred)
+        return false;
+
+      PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+      // Take advantage of fall-through opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        PPCPred = PPC::InvertPredicate(PPCPred);
+      }
+
+      unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+      if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                      CondReg))
+        return false;
+
+      BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
+        .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+      finishCondBranch(BI->getParent(), TBB, FBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    fastEmitBranch(Target, DbgLoc);
+    return true;
+  }
+
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch.  If this happens,
+  // there is a vreg available containing the result of the compare.  I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  return false;
+}
+
+// Attempt to emit a compare of the two source values.  Signed and unsigned
+// comparisons are supported.  Return false if we can't handle it.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+                             bool IsZExt, unsigned DestReg) {
+  Type *Ty = SrcValue1->getType();
+  EVT SrcEVT = TLI.getValueType(DL, Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
+    return false;
+
+  // See if operand 2 is an immediate encodeable in the compare.
+  // FIXME: Operands are not in canonical order at -O0, so an immediate
+  // operand in position 1 is a lost opportunity for now.  We are
+  // similar to ARM in this regard.
+  long Imm = 0;
+  bool UseImm = false;
+
+  // Only 16-bit integer constants can be represented in compares for
+  // PowerPC.  Others will be materialized into a register.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
+      if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+        UseImm = true;
+    }
+  }
+
+  unsigned CmpOpc;
+  bool NeedsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      CmpOpc = PPC::FCMPUS;
+      break;
+    case MVT::f64:
+      CmpOpc = PPC::FCMPUD;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      NeedsExt = true;
+      // Intentional fall-through.
+    case MVT::i32:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+      break;
+    case MVT::i64:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(SrcValue1);
+  if (SrcReg1 == 0)
+    return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(SrcValue2);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  if (NeedsExt) {
+    unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
+      return false;
+    SrcReg1 = ExtReg;
+
+    if (!UseImm) {
+      unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+      if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
+        return false;
+      SrcReg2 = ExtReg;
+    }
+  }
+
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addReg(SrcReg2);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addImm(Imm);
+
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // No code is generated for a FP extend.
+  updateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // Round the result to single precision.
+  unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
+    .addReg(SrcReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case.  Can use a 4-byte
+// stack slot and 4-byte store/load sequence.  Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+                                     bool IsSigned) {
+
+  // If necessary, extend 32-bit int to 64-bit.
+  if (SrcVT == MVT::i32) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return 0;
+    SrcReg = TmpReg;
+  }
+
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the GPR.
+  if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+    return 0;
+
+  // Load the integer value into an FPR.  The kind of load used depends
+  // on a number of conditions.
+  unsigned LoadOpc = PPC::LFD;
+
+  if (SrcVT == MVT::i32) {
+    if (!IsSigned) {
+      LoadOpc = PPC::LFIWZX;
+      Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+    } else if (PPCSubTarget->hasLFIWAX()) {
+      LoadOpc = PPC::LFIWAX;
+      Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+    }
+  }
+
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select an integer-to-floating-point conversion.
+// FIXME: Once fast-isel has better support for VSX, conversions using
+//        direct moves should be implemented.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+  MVT DstVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::f32 && DstVT != MVT::f64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(DL, Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i8  && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i32 && SrcVT != MVT::i64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // We can only lower an unsigned convert if we have the newer
+  // floating-point conversion operations.
+  if (!IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  // FIXME: For now we require the newer floating-point conversion operations
+  // (which are present only on P7 and A2 server models) when converting
+  // to single-precision float.  Otherwise we have to generate a lot of
+  // fiddly code to avoid double rounding.  If necessary, the fiddly code
+  // can be found in PPCTargetLowering::LowerINT_TO_FP().
+  if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  // Extend the input if necessary.
+  if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return false;
+    SrcVT = MVT::i64;
+    SrcReg = TmpReg;
+  }
+
+  // Move the integer value to an FPR.
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (FPReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion.
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  unsigned Opc;
+
+  if (DstVT == MVT::f32)
+    Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+  else
+    Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+    .addReg(FPReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the FPR.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+    return 0;
+
+  // Reload it into a GPR.  If we want an i32 on big endian, modify the
+  // address to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+// FIXME: Once fast-isel has better support for VSX, conversions using
+//        direct moves should be implemented.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32 && DstVT != MVT::i64)
+    return false;
+
+  // If we don't have FCTIDUZ and we need it, punt to SelectionDAG.
+  if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT())
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Convert f32 to f64 if necessary.  This is just a meaningless copy
+  // to get the register class right.
+  const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+  if (InRC == &PPC::F4RCRegClass) {
+    unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), TmpReg)
+      .addReg(SrcReg);
+    SrcReg = TmpReg;
+  }
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+  unsigned Opc;
+
+  if (DstVT == MVT::i32)
+    if (IsSigned)
+      Opc = PPC::FCTIWZ;
+    else
+      Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+  else
+    Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+    .addReg(SrcReg);
+
+  // Now move the integer value from a float register to an integer register.
+  unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+  if (IntReg == 0)
+    return false;
+
+  updateValueMap(I, IntReg);
+  return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  If there is no register,
+  // make a conservative choice (don't assign R0).
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     &PPC::GPRC_and_GPRC_NOR0RegClass);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
+      break;
+    case ISD::OR:
+      Opc = IsGPRC ? PPC::OR : PPC::OR8;
+      break;
+    case ISD::SUB:
+      Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
+      break;
+  }
+
+  unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // Handle case of small immediate operand.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    const APInt &CIVal = ConstInt->getValue();
+    int Imm = (int)CIVal.getSExtValue();
+    bool UseImm = true;
+    if (isInt<16>(Imm)) {
+      switch (Opc) {
+        default:
+          llvm_unreachable("Missing case!");
+        case PPC::ADD4:
+          Opc = PPC::ADDI;
+          MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+          break;
+        case PPC::ADD8:
+          Opc = PPC::ADDI8;
+          MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+          break;
+        case PPC::OR:
+          Opc = PPC::ORI;
+          break;
+        case PPC::OR8:
+          Opc = PPC::ORI8;
+          break;
+        case PPC::SUBF:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI;
+            MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+            Imm = -Imm;
+          }
+          break;
+        case PPC::SUBF8:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI8;
+            MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+            Imm = -Imm;
+          }
+          break;
+      }
+
+      if (UseImm) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                ResultReg)
+            .addReg(SrcReg1)
+            .addImm(Imm);
+        updateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  // Reg-reg case.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  // Reverse operands for subtract-from.
+  if (ISDOpcode == ISD::SUB)
+    std::swap(SrcReg1, SrcReg2);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+    .addReg(SrcReg1).addReg(SrcReg2);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
+  CCInfo.AllocateStack(LinkageSize, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 64);
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register (unless we're using the fast calling convention).
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      if (CC != CallingConv::Fast)
+        ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+bool PPCFastISel::finishCall(MVT RetVT, CallLoweringInfo &CLI, unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+
+  // Issue CallSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    CCValAssign &VA = RVLocs[0];
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg = 0;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    assert(ResultReg && "ResultReg unset!");
+    CLI.InRegs.push_back(SourcePhysReg);
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
+  }
+
+  return true;
+}
+
+bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC  = CLI.CallConv;
+  bool IsTailCall     = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const MCSymbol *Symbol = CLI.Symbol;
+
+  if (!Callee && !Symbol)
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = CLI.RetTy;
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+  else if (RetVT == MVT::i1 && PPCSubTarget->useCRBits())
+    // We can't handle boolean returns when CR bits are in use.
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CLI.OutVals.size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (unsigned i = 0, ie = NumArgs; i != ie; ++i) {
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    ISD::ArgFlagsTy Flags = CLI.OutFlags[i];
+    if (Flags.isInReg() || Flags.isSRet() || Flags.isNest() || Flags.isByVal())
+      return false;
+
+    Value *ArgValue = CLI.OutVals[i];
+    Type *ArgTy = ArgValue->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(ArgValue);
+    if (Arg == 0)
+      return false;
+
+    Args.push_back(ArgValue);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  MachineInstrBuilder MIB;
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV) {
+    // patchpoints are a special case; they always dispatch to a pointer value.
+    // However, we don't actually want to generate the indirect call sequence
+    // here (that will be generated, as necessary, during asm printing), and
+    // the call we generate here will be erased by FastISel::selectPatchpoint,
+    // so don't try very hard...
+    if (CLI.IsPatchPoint)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::NOP));
+    else
+      return false;
+  } else {
+    // Build direct call with NOP for TOC restore.
+    // FIXME: We can and should optimize away the NOP for local calls.
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(PPC::BL8_NOP));
+    // Add callee.
+    MIB.addGlobalAddress(GV);
+  }
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Direct calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+  // into the call.
+  PPCFuncInfo->setUsesTOCBasePtr();
+  MIB.addReg(PPC::X2, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(RetVT, CLI, NumBytes);
+}
+
+// Attempt to fast-select a return instruction.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context);
+    CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+    const Value *RV = Ret->getOperand(0);
+
+    // FIXME: Only one output register for now.
+    if (ValLocs.size() > 1)
+      return false;
+
+    // Special case for returning a constant integer of any size - materialize
+    // the constant as an i64 and copy it to the return register.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) {
+      CCValAssign &VA = ValLocs[0];
+
+      unsigned RetReg = VA.getLocReg();
+      // We still need to worry about properly extending the sign. For example,
+      // we could have only a single bit or a constant that needs zero
+      // extension rather than sign extension. Make sure we pass the return
+      // value extension property to integer materialization.
+      unsigned SrcReg =
+          PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
+
+      RetRegs.push_back(RetReg);
+
+    } else {
+      unsigned Reg = getRegForValue(RV);
+
+      if (Reg == 0)
+        return false;
+
+      // Copy the result values into the output registers.
+      for (unsigned i = 0; i < ValLocs.size(); ++i) {
+
+        CCValAssign &VA = ValLocs[i];
+        assert(VA.isRegLoc() && "Can only return in registers!");
+        RetRegs.push_back(VA.getLocReg());
+        unsigned SrcReg = Reg + VA.getValNo();
+
+        EVT RVEVT = TLI.getValueType(DL, RV->getType());
+        if (!RVEVT.isSimple())
+          return false;
+        MVT RVVT = RVEVT.getSimpleVT();
+        MVT DestVT = VA.getLocVT();
+
+        if (RVVT != DestVT && RVVT != MVT::i8 &&
+            RVVT != MVT::i16 && RVVT != MVT::i32)
+          return false;
+
+        if (RVVT != DestVT) {
+          switch (VA.getLocInfo()) {
+            default:
+              llvm_unreachable("Unknown loc info!");
+            case CCValAssign::Full:
+              llvm_unreachable("Full value assign but types don't match?");
+            case CCValAssign::AExt:
+            case CCValAssign::ZExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+            case CCValAssign::SExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+          }
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::COPY), RetRegs[i])
+          .addReg(SrcReg);
+      }
+    }
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                    TII.get(PPC::BLR8));
+
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+
+  return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg.  Both
+// signed and zero extensions are supported.  Return false if we
+// can't handle it.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                unsigned DestReg, bool IsZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i64)
+    return false;
+  if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    return false;
+
+  // Signed extensions use EXTSB, EXTSH, EXTSW.
+  if (!IsZExt) {
+    unsigned Opc;
+    if (SrcVT == MVT::i8)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
+    else if (SrcVT == MVT::i16)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
+    else {
+      assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+      Opc = PPC::EXTSW_32_64;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+      .addReg(SrcReg);
+
+  // Unsigned 32-bit extensions use RLWINM.
+  } else if (DestVT == MVT::i32) {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 24;
+    else {
+      assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+      MB = 16;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM),
+            DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
+
+  // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+  } else {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 56;
+    else if (SrcVT == MVT::i16)
+      MB = 48;
+    else
+      MB = 32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(PPC::RLDICL_32_64), DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select an indirect branch instruction.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8))
+    .addReg(AddrReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (const BasicBlock *SuccBB : IB->successors())
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]);
+
+  return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(DL, Src->getType(), true);
+  EVT DestVT = TLI.getValueType(DL, I->getType(), true);
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
+    return false;
+
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // The only interesting case is when we need to switch register classes.
+  if (SrcVT == MVT::i64) {
+    unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(SrcReg, 0, PPC::sub_32);
+    SrcReg = ResultReg;
+  }
+
+  updateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool IsZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(DL, SrcTy, true);
+  DestEVT = TLI.getValueType(DL, DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  // If we know the register class needed for the result of this
+  // instruction, use it.  Otherwise pick the register class of the
+  // correct size that does not contain X0/R0, since we don't know
+  // whether downstream uses permit that assignment.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+      &PPC::GPRC_and_GPRC_NOR0RegClass));
+  unsigned ResultReg = createResultReg(RC);
+
+  if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select an instruction that wasn't handled by
+// the table-generated machinery.
+bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*IsSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*IsSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*IsSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*IsSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      return selectCall(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    // Here add other flavors of Instruction::XXX that automated
+    // cases don't catch.  For example, switches are terminators
+    // that aren't yet handled.
+    default:
+      break;
+  }
+  return false;
+}
+
+// Materialize a floating-point constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  // No plans to handle long double here.
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  // All FP constants are loaded from the constant pool.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  assert(Align > 0 && "Unexpectedly missing alignment information!");
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  const TargetRegisterClass *RC =
+    (VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+      MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+      MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align);
+
+  unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+  unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  PPCFuncInfo->setUsesTOCBasePtr();
+  // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT),
+            TmpReg)
+      .addConstantPoolIndex(Idx).addReg(PPC::X2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+      .addImm(0).addReg(TmpReg).addMemOperand(MMO);
+  } else {
+    // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+            TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
+    // But for large code model, we must generate a LDtocL followed
+    // by the LF[SD].
+    if (CModel == CodeModel::Large) {
+      unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+              TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+          .addImm(0)
+          .addReg(TmpReg2);
+    } else
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
+        .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
+        .addReg(TmpReg)
+        .addMemOperand(MMO);
+  }
+
+  return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+  assert(VT == MVT::i64 && "Non-address!");
+  const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // Global values may be plain old object addresses, TLS object
+  // addresses, constant pool entries, or jump tables.  How we generate
+  // code for these may depend on small, medium, or large code model.
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  // FIXME: Jump tables are not yet required because fast-isel doesn't
+  // handle switches; if that changes, we need them as well.  For now,
+  // what follows assumes everything's a generic (or TLS) global address.
+
+  // FIXME: We don't yet handle the complexity of TLS.
+  if (GV->isThreadLocal())
+    return 0;
+
+  PPCFuncInfo->setUsesTOCBasePtr();
+  // For small code model, generate a simple TOC load.
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc),
+            DestReg)
+        .addGlobalAddress(GV)
+        .addReg(PPC::X2);
+  else {
+    // If the address is an externally defined symbol, a symbol with common
+    // or externally available linkage, a non-local function address, or a
+    // jump table address (not yet needed), or if we are generating code
+    // for large code model, we generate:
+    //       LDtocL(GV, ADDIStocHA(%X2, GV))
+    // Otherwise we generate:
+    //       ADDItocL(ADDIStocHA(%X2, GV), GV)
+    // Either way, start with the ADDIStocHA:
+    unsigned HighPartReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA),
+            HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+    unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+    if (GVFlags & PPCII::MO_NLP_FLAG) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
+              DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+    } else {
+      // Otherwise generate the ADDItocL.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL),
+              DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+    }
+  }
+
+  return DestReg;
+}
+
+// Materialize a 32-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  unsigned ResultReg = createResultReg(RC);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  if (isInt<16>(Imm))
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
+      .addImm(Imm);
+  else if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
+      .addImm(Hi);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
+      .addReg(TmpReg).addImm(Lo);
+  } else
+    // Just Hi bits.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
+        .addImm(Hi);
+
+  return ResultReg;
+}
+
+// Materialize a 64-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Remainder = 0;
+  unsigned Shift = 0;
+
+  // If the value doesn't fit in 32 bits, see if we can shift it
+  // so that it fits in 32 bits.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    if (isInt<32>(ImmSh))
+      Imm = ImmSh;
+    else {
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Handle the high-order 32 bits (if shifted) or the whole 32 bits
+  // (if not shifted).
+  unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC);
+  if (!Shift)
+    return TmpReg1;
+
+  // If upper 32 bits were not zero, we've built them and need to shift
+  // them into place.
+  unsigned TmpReg2;
+  if (Imm) {
+    TmpReg2 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR),
+            TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
+  } else
+    TmpReg2 = TmpReg1;
+
+  unsigned TmpReg3, Hi, Lo;
+  if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+    TmpReg3 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8),
+            TmpReg3).addReg(TmpReg2).addImm(Hi);
+  } else
+    TmpReg3 = TmpReg2;
+
+  if ((Lo = Remainder & 0xFFFF)) {
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
+            ResultReg).addReg(TmpReg3).addImm(Lo);
+    return ResultReg;
+  }
+
+  return TmpReg3;
+}
+
+// Materialize an integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
+                                        bool UseSExt) {
+  // If we're using CR bit registers for i1 values, handle that as a special
+  // case first.
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+    unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+    return ImmReg;
+  }
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 &&
+      VT != MVT::i1)
+    return 0;
+
+  const TargetRegisterClass *RC =
+      ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass);
+  int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue();
+
+  // If the constant is in range, use a load-immediate.
+  // Since LI will sign extend the constant we need to make sure that for
+  // our zeroext constants that the sign extended constant fits into 16-bits -
+  // a range of 0..0x7fff.
+  if (isInt<16>(Imm)) {
+    unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
+    unsigned ImmReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
+        .addImm(Imm);
+    return ImmReg;
+  }
+
+  // Construct the constant piecewise.
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else if (VT == MVT::i32)
+    return PPCMaterialize32BitInt(Imm, RC);
+
+  return 0;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return PPCMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return PPCMaterializeGV(GV, VT);
+  else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+    // Note that the code in FunctionLoweringInfo::ComputePHILiveOutRegInfo
+    // assumes that constant PHI operands will be zero extended, and failure to
+    // match that assumption will cause problems if we sign extend here but
+    // some user of a PHI is in a block for which we fall back to full SDAG
+    // instruction selection.
+    return PPCMaterializeInt(CI, VT, false);
+
+  return 0;
+}
+
+// Materialize the address created by an alloca into a register, and
+// return the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(SI->second).addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load.  The folding only picks up one.  Extend this
+// to check subsequent instructions for the same pattern and remove
+// them.  Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent.  Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
+bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  bool IsZExt = false;
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+
+    case PPC::RLDICL:
+    case PPC::RLDICL_32_64: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 56) ||
+          (VT == MVT::i16 && MB <= 48) ||
+          (VT == MVT::i32 && MB <= 32))
+        break;
+      return false;
+    }
+
+    case PPC::RLWINM:
+    case PPC::RLWINM8: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 24) ||
+          (VT == MVT::i16 && MB <= 16))
+        break;
+      return false;
+    }
+
+    case PPC::EXTSB:
+    case PPC::EXTSB8:
+    case PPC::EXTSB8_32_64:
+      /* There is no sign-extending load-byte instruction. */
+      return false;
+
+    case PPC::EXTSH:
+    case PPC::EXTSH8:
+    case PPC::EXTSH8_32_64: {
+      if (VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+
+    case PPC::EXTSW:
+    case PPC::EXTSW_32_64: {
+      if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(LI->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+
+  if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
+    return false;
+
+  MI->eraseFromParent();
+  return true;
+}
+
+// Attempt to lower call arguments in a faster way than done by
+// the selection DAG code.
+bool PPCFastISel::fastLowerArguments() {
+  // Defer to normal argument lowering for now.  It's reasonably
+  // efficient.  Consider doing something like ARM to handle the
+  // case where all args fit in registers, no varargs, no float
+  // or vector args.
+  return false;
+}
+
+// Handle materializing integer constants into a register.  This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+
+  if (Opc != ISD::Constant)
+    return 0;
+
+  // If we're using CR bit registers for i1 values, handle that as a special
+  // case first.
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+    unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
+    return ImmReg;
+  }
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 &&
+      VT != MVT::i1)
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else
+    return PPCMaterialize32BitInt(Imm, RC);
+}
+
+// Override for ADDI and ADDI8 to set the correct register class
+// on RHS operand 0.  The automatic infrastructure naively assumes
+// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
+// for these cases.  At the moment, none of the other automatically
+// generated RI instructions require special treatment.  However, once
+// SelectSelect is implemented, "isel" requires similar handling.
+//
+// Also be conservative about the output register class.  Avoid
+// assigning R0 or X0 to the output register for GPRC and G8RC
+// register classes, as any such result could be used in ADDI, etc.,
+// where those regs have another meaning.
+unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  if (MachineInstOpcode == PPC::ADDI)
+    MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
+  else if (MachineInstOpcode == PPC::ADDI8)
+    MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC,
+                                   Op0, Op0IsKill, Imm);
+}
+
+// Override for instructions with one register operand to avoid use of
+// R0/X0.  The automatic infrastructure isn't aware of the context so
+// we must be conservative.
+unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass* RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+}
+
+// Override for instructions with two register operands to avoid use
+// of R0/X0.  The automatic infrastructure isn't aware of the context
+// so we must be conservative.
+unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass* RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+}
+
+namespace llvm {
+  // Create the fast instruction selector for PowerPC64 ELF.
+  FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                const TargetLibraryInfo *LibInfo) {
+    // Only available on 64-bit ELF for now.
+    const PPCSubtarget &Subtarget = FuncInfo.MF->getSubtarget<PPCSubtarget>();
+    if (Subtarget.isPPC64() && Subtarget.isSVR4ABI())
+      return new PPCFastISel(FuncInfo, LibInfo);
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
new file mode 100644
index 000000000000..e786ef9aee0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -0,0 +1,2164 @@
+//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PPC implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+/// VRRegNo - Map from a numbered VR register to its enum value.
+///
+static const MCPhysReg VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? 16 : 8;
+  // SVR4 ABI:
+  return STI.isPPC64() ? 16 : 4;
+}
+
+static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) {
+  return STI.isELFv2ABI() ? 24 : 40;
+}
+
+static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) {
+  // For the Darwin ABI:
+  // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+  // for saving the frame pointer (if needed.)  While the published ABI has
+  // not used this slot since at least MacOSX 10.2, there is older code
+  // around that does use it, and that needs to continue to work.
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? -8U : -4U;
+
+  // SVR4 ABI: First slot in the general register save area.
+  return STI.isPPC64() ? -8U : -4U;
+}
+
+static unsigned computeLinkageSize(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI() || STI.isPPC64())
+    return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4);
+
+  // SVR4 ABI:
+  return 8;
+}
+
+static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
+  if (STI.isDarwinABI())
+    return STI.isPPC64() ? -16U : -8U;
+
+  // SVR4 ABI: First slot in the general register save area.
+  return STI.isPPC64()
+             ? -16U
+             : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U;
+}
+
+PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+                          STI.getPlatformStackAlignment(), 0),
+      Subtarget(STI), ReturnSaveOffset(computeReturnSaveOffset(Subtarget)),
+      TOCSaveOffset(computeTOCSaveOffset(Subtarget)),
+      FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)),
+      LinkageSize(computeLinkageSize(Subtarget)),
+      BasePointerSaveOffset(computeBasePointerSaveOffset(STI)) {}
+
+// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
+const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
+    unsigned &NumEntries) const {
+  if (Subtarget.isDarwinABI()) {
+    NumEntries = 1;
+    if (Subtarget.isPPC64()) {
+      static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+      return &darwin64Offsets;
+    } else {
+      static const SpillSlot darwinOffsets = {PPC::R31, -4};
+      return &darwinOffsets;
+    }
+  }
+
+  // Early exit if not using the SVR4 ABI.
+  if (!Subtarget.isSVR4ABI()) {
+    NumEntries = 0;
+    return nullptr;
+  }
+
+  // Note that the offsets here overlap, but this is fixed up in
+  // processFunctionBeforeFrameFinalized.
+
+  static const SpillSlot Offsets[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::R31, -4},
+      {PPC::R30, -8},
+      {PPC::R29, -12},
+      {PPC::R28, -16},
+      {PPC::R27, -20},
+      {PPC::R26, -24},
+      {PPC::R25, -28},
+      {PPC::R24, -32},
+      {PPC::R23, -36},
+      {PPC::R22, -40},
+      {PPC::R21, -44},
+      {PPC::R20, -48},
+      {PPC::R19, -52},
+      {PPC::R18, -56},
+      {PPC::R17, -60},
+      {PPC::R16, -64},
+      {PPC::R15, -68},
+      {PPC::R14, -72},
+
+      // CR save area offset.  We map each of the nonvolatile CR fields
+      // to the slot for CR2, which is the first of the nonvolatile CR
+      // fields to be assigned, so that we only allocate one save slot.
+      // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+      {PPC::CR2, -4},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}};
+
+  static const SpillSlot Offsets64[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::X31, -8},
+      {PPC::X30, -16},
+      {PPC::X29, -24},
+      {PPC::X28, -32},
+      {PPC::X27, -40},
+      {PPC::X26, -48},
+      {PPC::X25, -56},
+      {PPC::X24, -64},
+      {PPC::X23, -72},
+      {PPC::X22, -80},
+      {PPC::X21, -88},
+      {PPC::X20, -96},
+      {PPC::X19, -104},
+      {PPC::X18, -112},
+      {PPC::X17, -120},
+      {PPC::X16, -128},
+      {PPC::X15, -136},
+      {PPC::X14, -144},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}};
+
+  if (Subtarget.isPPC64()) {
+    NumEntries = array_lengthof(Offsets64);
+
+    return Offsets64;
+  } else {
+    NumEntries = array_lengthof(Offsets);
+
+    return Offsets;
+  }
+}
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function.  Remove all of the VRSAVE related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing original VRSAVE
+/// contents is spilled and reloaded around the call.  Without the prolog code,
+/// the spill instruction refers to an undefined register.  This code needs
+/// to account for all uses of that GPR.
+static void RemoveVRSaveCode(MachineInstr &MI) {
+  MachineBasicBlock *Entry = MI.getParent();
+  MachineFunction *MF = Entry->getParent();
+
+  // We know that the MTVRSAVE instruction immediately follows MI.  Remove it.
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+  MBBI->eraseFromParent();
+
+  bool RemovedAllMTVRSAVEs = true;
+  // See if we can find and remove the MTVRSAVE instruction from all of the
+  // epilog blocks.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (I->isReturnBlock()) {
+      bool FoundIt = false;
+      for (MBBI = I->end(); MBBI != I->begin(); ) {
+        --MBBI;
+        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+          MBBI->eraseFromParent();  // remove it.
+          FoundIt = true;
+          break;
+        }
+      }
+      RemovedAllMTVRSAVEs &= FoundIt;
+    }
+  }
+
+  // If we found and removed all MTVRSAVE instructions, remove the read of
+  // VRSAVE as well.
+  if (RemovedAllMTVRSAVEs) {
+    MBBI = MI;
+    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+    --MBBI;
+    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+    MBBI->eraseFromParent();
+  }
+
+  // Finally, nuke the UPDATE_VRSAVE.
+  MI.eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector.  Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI.getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned UsedRegMask = 0;
+  for (unsigned i = 0; i != 32; ++i)
+    if (MRI.isPhysRegModified(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = TRI->getEncodingValue(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+
+  // Live out registers appear as use operands on return instructions.
+  for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
+       UsedRegMask != 0 && BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+    if (!MBB.isReturnBlock())
+      continue;
+    const MachineInstr &Ret = MBB.back();
+    for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
+      const MachineOperand &MO = Ret.getOperand(I);
+      if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
+        continue;
+      unsigned RegNo = TRI->getEncodingValue(MO.getReg());
+      UsedRegMask &= ~(1 << (31-RegNo));
+    }
+  }
+
+  // If no registers are used, turn this into a copy.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  }
+
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  unsigned DstReg = MI.getOperand(0).getReg();
+
+  if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+          .addReg(SrcReg)
+          .addImm(UsedRegMask);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+          .addReg(SrcReg, RegState::Kill)
+          .addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+          .addReg(SrcReg)
+          .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+          .addReg(SrcReg, RegState::Kill)
+          .addImm(UsedRegMask >> 16);
+  } else {
+    if (DstReg != SrcReg)
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+          .addReg(SrcReg)
+          .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+          .addReg(SrcReg, RegState::Kill)
+          .addImm(UsedRegMask >> 16);
+
+    BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(DstReg, RegState::Kill)
+        .addImm(UsedRegMask & 0xFFFF);
+  }
+
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI.eraseFromParent();
+}
+
+static bool spillsCR(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isCRSpilled();
+}
+
+static bool spillsVRSAVE(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isVRSAVESpilled();
+}
+
+static bool hasSpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasSpills();
+}
+
+static bool hasNonRISpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasNonRISpills();
+}
+
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
+
+  // We need a save/restore of LR if there is any def of LR (which is
+  // defined by calls, including the PIC setup sequence), or if there is
+  // some use of the LR stack slot (e.g. for builtin_return_address).
+  // (LR comes in 32 and 64 bit versions.)
+  MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
+  return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+                                                bool UpdateMF,
+                                                bool UseEstimate) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize =
+    UseEstimate ? MFI.estimateStackSize(MF) : MFI.getStackSize();
+
+  // Get stack alignments. The frame must be aligned to the greatest of these:
+  unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+  unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame
+  unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+  // If we are a leaf function, and use up to 224 bytes of stack space,
+  // don't have a frame pointer, calls, or dynamic alloca then we do not need
+  // to adjust the stack pointer (we fit in the Red Zone).
+  // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+  // stackless code if all local vars are reg-allocated.
+  bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+  unsigned LR = RegInfo->getRARegister();
+  if (!DisableRedZone &&
+      (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
+       !Subtarget.isSVR4ABI() ||                   //   allocated locals.
+        FrameSize == 0) &&
+      FrameSize <= 224 &&                          // Fits in red zone.
+      !MFI.hasVarSizedObjects() &&                 // No dynamic alloca.
+      !MFI.adjustsStack() &&                       // No calls.
+      !MustSaveLR(MF, LR) &&
+      !RegInfo->hasBasePointer(MF)) { // No special alignment.
+    // No need for frame
+    if (UpdateMF)
+      MFI.setStackSize(0);
+    return 0;
+  }
+
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+
+  // Maximum call frame needs to be at least big enough for linkage area.
+  unsigned minCallFrameSize = getLinkageSize();
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI.hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+  // Update maximum call frame size.
+  if (UpdateMF)
+    MFI.setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  if (UpdateMF)
+    MFI.setStackSize(FrameSize);
+
+  return FrameSize;
+}
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.
+bool PPCFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  // FIXME: This is pretty much broken by design: hasFP() might be called really
+  // early, before the stack layout was calculated and thus hasFP() might return
+  // true or false here depending on the time of call.
+  return (MFI.getStackSize()) && needsFP(MF);
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Naked functions have no stack frame pushed, so we don't have a frame
+  // pointer.
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    return false;
+
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint() ||
+    (MF.getTarget().Options.GuaranteedTailCallOpt &&
+     MF.getInfo<PPCFunctionInfo>()->hasFastCall());
+}
+
+void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
+  bool is31 = needsFP(MF);
+  unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
+  unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
+
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
+  unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI)
+    for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) {
+      --MBBI;
+      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = MBBI->getOperand(I);
+        if (!MO.isReg())
+          continue;
+
+        switch (MO.getReg()) {
+        case PPC::FP:
+          MO.setReg(FPReg);
+          break;
+        case PPC::FP8:
+          MO.setReg(FP8Reg);
+          break;
+        case PPC::BP:
+          MO.setReg(BPReg);
+          break;
+        case PPC::BP8:
+          MO.setReg(BP8Reg);
+          break;
+
+        }
+      }
+    }
+}
+
+/*  This function will do the following:
+    - If MBB is an entry or exit block, set SR1 and SR2 to R0 and R12
+      respectively (defaults recommended by the ABI) and return true
+    - If MBB is not an entry block, initialize the register scavenger and look
+      for available registers.
+    - If the defaults (R0/R12) are available, return true
+    - If TwoUniqueRegsRequired is set to true, it looks for two unique
+      registers. Otherwise, look for a single available register.
+      - If the required registers are found, set SR1 and SR2 and return true.
+      - If the required registers are not found, set SR2 or both SR1 and SR2 to
+        PPC::NoRegister and return false.
+
+    Note that if both SR1 and SR2 are valid parameters and TwoUniqueRegsRequired
+    is not set, this function will attempt to find two different registers, but
+    still return true if only one register is available (and set SR1 == SR2).
+*/
+bool
+PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
+                                      bool UseAtEnd,
+                                      bool TwoUniqueRegsRequired,
+                                      unsigned *SR1,
+                                      unsigned *SR2) const {
+  RegScavenger RS;
+  unsigned R0 =  Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+  unsigned R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12;
+
+  // Set the defaults for the two scratch registers.
+  if (SR1)
+    *SR1 = R0;
+
+  if (SR2) {
+    assert (SR1 && "Asking for the second scratch register but not the first?");
+    *SR2 = R12;
+  }
+
+  // If MBB is an entry or exit block, use R0 and R12 as the scratch registers.
+  if ((UseAtEnd && MBB->isReturnBlock()) ||
+      (!UseAtEnd && (&MBB->getParent()->front() == MBB)))
+    return true;
+
+  RS.enterBasicBlock(*MBB);
+
+  if (UseAtEnd && !MBB->empty()) {
+    // The scratch register will be used at the end of the block, so must
+    // consider all registers used within the block
+
+    MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator();
+    // If no terminator, back iterator up to previous instruction.
+    if (MBBI == MBB->end())
+      MBBI = std::prev(MBBI);
+
+    if (MBBI != MBB->begin())
+      RS.forward(MBBI);
+  }
+
+  // If the two registers are available, we're all good.
+  // Note that we only return here if both R0 and R12 are available because
+  // although the function may not require two unique registers, it may benefit
+  // from having two so we should try to provide them.
+  if (!RS.isRegUsed(R0) && !RS.isRegUsed(R12))
+    return true;
+
+  // Get the list of callee-saved registers for the target.
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
+
+  // Get all the available registers in the block.
+  BitVector BV = RS.getRegsAvailable(Subtarget.isPPC64() ? &PPC::G8RCRegClass :
+                                     &PPC::GPRCRegClass);
+
+  // We shouldn't use callee-saved registers as scratch registers as they may be
+  // available when looking for a candidate block for shrink wrapping but not
+  // available when the actual prologue/epilogue is being emitted because they
+  // were added as live-in to the prologue block by PrologueEpilogueInserter.
+  for (int i = 0; CSRegs[i]; ++i)
+    BV.reset(CSRegs[i]);
+
+  // Set the first scratch register to the first available one.
+  if (SR1) {
+    int FirstScratchReg = BV.find_first();
+    *SR1 = FirstScratchReg == -1 ? (unsigned)PPC::NoRegister : FirstScratchReg;
+  }
+
+  // If there is another one available, set the second scratch register to that.
+  // Otherwise, set it to either PPC::NoRegister if this function requires two
+  // or to whatever SR1 is set to if this function doesn't require two.
+  if (SR2) {
+    int SecondScratchReg = BV.find_next(*SR1);
+    if (SecondScratchReg != -1)
+      *SR2 = SecondScratchReg;
+    else
+      *SR2 = TwoUniqueRegsRequired ? (unsigned)PPC::NoRegister : *SR1;
+  }
+
+  // Now that we've done our best to provide both registers, double check
+  // whether we were unable to provide enough.
+  if (BV.count() < (TwoUniqueRegsRequired ? 2U : 1U))
+    return false;
+
+  return true;
+}
+
+// We need a scratch register for spilling LR and for spilling CR. By default,
+// we use two scratch registers to hide latency. However, if only one scratch
+// register is available, we can adjust for that by not overlapping the spill
+// code. However, if we need to realign the stack (i.e. have a base pointer)
+// and the stack frame is large, we need two scratch registers.
+bool
+PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  MachineFunction &MF = *(MBB->getParent());
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  unsigned FrameSize = determineFrameLayout(MF, false);
+  int NegFrameSize = -FrameSize;
+  bool IsLargeFrame = !isInt<16>(NegFrameSize);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned MaxAlign = MFI.getMaxAlignment();
+  bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+
+  return (IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1;
+}
+
+bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+
+  return findScratchRegister(TmpMBB, false,
+                             twoUniqueScratchRegsRequired(TmpMBB));
+}
+
+bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+
+  return findScratchRegister(TmpMBB, true);
+}
+
+void PPCFrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  DebugLoc dl;
+  bool needsCFI = MMI.hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  assert((Subtarget.isDarwinABI() || isSVR4ABI) &&
+         "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
+  // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
+  // process it.
+  if (!isSVR4ABI)
+    for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+      if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+        HandleVRSaveUpdate(*MBBI, TII);
+        break;
+      }
+    }
+
+  // Move MBBI back to the beginning of the prologue block.
+  MBBI = MBB.begin();
+
+  // Work out frame sizes.
+  unsigned FrameSize = determineFrameLayout(MF);
+  int NegFrameSize = -FrameSize;
+  if (!isInt<32>(NegFrameSize))
+    llvm_unreachable("Unhandled stack size!");
+
+  if (MFI.isFrameAddressTaken())
+    replaceFPWithRealFP(MF);
+
+  // Check if the link register (LR) must be saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  bool MustSaveCR = !MustSaveCRs.empty();
+  // Do we have a frame pointer and/or base pointer for this function?
+  bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  bool HasRedZone = isPPC64 || !isSVR4ABI;
+
+  unsigned SPReg       = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg       = RegInfo->getBaseRegister(MF);
+  unsigned FPReg       = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned LRReg       = isPPC64 ? PPC::LR8 : PPC::LR;
+  unsigned ScratchReg  = 0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  //  ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+  const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+                                                : PPC::MFLR );
+  const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+                                                 : PPC::STW );
+  const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+                                                     : PPC::STWU );
+  const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+                                                        : PPC::STWUX);
+  const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+                                                          : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+                                                 : PPC::ORI );
+  const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+                                              : PPC::OR );
+  const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+                                                            : PPC::SUBFC);
+  const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+                                                               : PPC::SUBFIC);
+
+  // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+  // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+  // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+  // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+  assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+         "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+
+  // Using the same bool variable as below to suppress compiler warnings.
+  bool SingleScratchReg =
+    findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB),
+                        &ScratchReg, &TempReg);
+  assert(SingleScratchReg &&
+         "Required number of registers not available in this block");
+
+  SingleScratchReg = ScratchReg == TempReg;
+
+  int LROffset = getReturnSaveOffset();
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = MFI.getObjectOffset(FPIndex);
+    } else {
+      FPOffset = getFramePointerSaveOffset();
+    }
+  }
+
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = MFI.getObjectOffset(BPIndex);
+    } else {
+      BPOffset = getBasePointerSaveOffset();
+    }
+  }
+
+  int PBPOffset = 0;
+  if (FI->usesPICBase()) {
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    int PBPIndex = FI->getPICBasePointerSaveIndex();
+    assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+    PBPOffset = MFI.getObjectOffset(PBPIndex);
+  }
+
+  // Get stack alignments.
+  unsigned MaxAlign = MFI.getMaxAlignment();
+  if (HasBP && MaxAlign > 1)
+    assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+           "Invalid alignment!");
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+  bool isLargeFrame = !isInt<16>(NegFrameSize);
+
+  assert((isPPC64 || !MustSaveCR) &&
+         "Prologue CR saving supported only in 64-bit mode");
+
+  // If we need to spill the CR and the LR but we don't have two separate
+  // registers available, we must spill them one at a time
+  if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+    // In the ELFv2 ABI, we are not required to save all CR fields.
+    // If only one or two CR fields are clobbered, it is more efficient to use
+    // mfocrf to selectively save just those fields, because mfocrf has short
+    // latency compares to mfcr.
+    unsigned MfcrOpcode = PPC::MFCR8;
+    unsigned CrState = RegState::ImplicitKill;
+    if (isELFv2ABI && MustSaveCRs.size() == 1) {
+      MfcrOpcode = PPC::MFOCRF8;
+      CrState = RegState::Kill;
+    }
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      MIB.addReg(MustSaveCRs[i], CrState);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+      .addReg(TempReg, getKillRegState(true))
+      .addImm(8)
+      .addReg(SPReg);
+  }
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
+
+  if (MustSaveCR &&
+      !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
+    // In the ELFv2 ABI, we are not required to save all CR fields.
+    // If only one or two CR fields are clobbered, it is more efficient to use
+    // mfocrf to selectively save just those fields, because mfocrf has short
+    // latency compares to mfcr.
+    unsigned MfcrOpcode = PPC::MFCR8;
+    unsigned CrState = RegState::ImplicitKill;
+    if (isELFv2ABI && MustSaveCRs.size() == 1) {
+      MfcrOpcode = PPC::MFOCRF8;
+      CrState = RegState::Kill;
+    }
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      MIB.addReg(MustSaveCRs[i], CrState);
+  }
+
+  if (HasRedZone) {
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, StoreInst)
+        .addReg(FPReg)
+        .addImm(FPOffset)
+        .addReg(SPReg);
+    if (FI->usesPICBase())
+      BuildMI(MBB, MBBI, dl, StoreInst)
+        .addReg(PPC::R30)
+        .addImm(PBPOffset)
+        .addReg(SPReg);
+    if (HasBP)
+      BuildMI(MBB, MBBI, dl, StoreInst)
+        .addReg(BPReg)
+        .addImm(BPOffset)
+        .addReg(SPReg);
+  }
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(ScratchReg, getKillRegState(true))
+      .addImm(LROffset)
+      .addReg(SPReg);
+
+  if (MustSaveCR &&
+      !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
+    assert(HasRedZone && "A red zone is always available on PPC64");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+      .addReg(TempReg, getKillRegState(true))
+      .addImm(8)
+      .addReg(SPReg);
+  }
+
+  // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
+  if (!FrameSize)
+    return;
+
+  // Adjust stack pointer: r1 += NegFrameSize.
+  // If there is a preferred stack alignment, align R1 now
+
+  if (HasBP && HasRedZone) {
+    // Save a copy of r1 as the base pointer.
+    BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+  }
+
+  // Have we generated a STUX instruction to claim stack frame? If so,
+  // the negated frame size will be placed in ScratchReg.
+  bool HasSTUX = false;
+
+  // This condition must be kept in sync with canUseAsPrologue.
+  if (HasBP && MaxAlign > 1) {
+    if (isPPC64)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(64 - Log2_32(MaxAlign));
+    else // PPC32...
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(32 - Log2_32(MaxAlign))
+        .addImm(31);
+    if (!isLargeFrame) {
+      BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(NegFrameSize);
+    } else {
+      assert(!SingleScratchReg && "Only a single scratch reg available");
+      BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
+        .addImm(NegFrameSize >> 16);
+      BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+        .addReg(TempReg, RegState::Kill)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addReg(TempReg, RegState::Kill);
+    }
+
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
+    HasSTUX = true;
+
+  } else if (!isLargeFrame) {
+    BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+      .addReg(SPReg)
+      .addImm(NegFrameSize)
+      .addReg(SPReg);
+
+  } else {
+    BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+      .addImm(NegFrameSize >> 16);
+    BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(NegFrameSize & 0xFFFF);
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
+    HasSTUX = true;
+  }
+
+  if (!HasRedZone) {
+    assert(!isPPC64 && "A red zone is always available on PPC64");
+    if (HasSTUX) {
+      // The negated frame size is in ScratchReg, and the SPReg has been
+      // decremented by the frame size: SPReg = old SPReg + ScratchReg.
+      // Since FPOffset, PBPOffset, etc. are relative to the beginning of
+      // the stack frame (i.e. the old SP), ideally, we would put the old
+      // SP into a register and use it as the base for the stores. The
+      // problem is that the only available register may be ScratchReg,
+      // which could be R0, and R0 cannot be used as a base address.
+
+      // First, set ScratchReg to the old SP. This may need to be modified
+      // later.
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBF), ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addReg(SPReg);
+
+      if (ScratchReg == PPC::R0) {
+        // R0 cannot be used as a base register, but it can be used as an
+        // index in a store-indexed.
+        int LastOffset = 0;
+        if (HasFP)  {
+          // R0 += (FPOffset-LastOffset).
+          // Need addic, since addi treats R0 as 0.
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+            .addReg(ScratchReg)
+            .addImm(FPOffset-LastOffset);
+          LastOffset = FPOffset;
+          // Store FP into *R0.
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+            .addReg(FPReg, RegState::Kill)  // Save FP.
+            .addReg(PPC::ZERO)
+            .addReg(ScratchReg);  // This will be the index (R0 is ok here).
+        }
+        if (FI->usesPICBase()) {
+          // R0 += (PBPOffset-LastOffset).
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+            .addReg(ScratchReg)
+            .addImm(PBPOffset-LastOffset);
+          LastOffset = PBPOffset;
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+            .addReg(PPC::R30, RegState::Kill)  // Save PIC base pointer.
+            .addReg(PPC::ZERO)
+            .addReg(ScratchReg);  // This will be the index (R0 is ok here).
+        }
+        if (HasBP) {
+          // R0 += (BPOffset-LastOffset).
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), ScratchReg)
+            .addReg(ScratchReg)
+            .addImm(BPOffset-LastOffset);
+          LastOffset = BPOffset;
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::STWX))
+            .addReg(BPReg, RegState::Kill)  // Save BP.
+            .addReg(PPC::ZERO)
+            .addReg(ScratchReg);  // This will be the index (R0 is ok here).
+          // BP = R0-LastOffset
+          BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDIC), BPReg)
+            .addReg(ScratchReg, RegState::Kill)
+            .addImm(-LastOffset);
+        }
+      } else {
+        // ScratchReg is not R0, so use it as the base register. It is
+        // already set to the old SP, so we can use the offsets directly.
+
+        // Now that the stack frame has been allocated, save all the necessary
+        // registers using ScratchReg as the base address.
+        if (HasFP)
+          BuildMI(MBB, MBBI, dl, StoreInst)
+            .addReg(FPReg)
+            .addImm(FPOffset)
+            .addReg(ScratchReg);
+        if (FI->usesPICBase())
+          BuildMI(MBB, MBBI, dl, StoreInst)
+            .addReg(PPC::R30)
+            .addImm(PBPOffset)
+            .addReg(ScratchReg);
+        if (HasBP) {
+          BuildMI(MBB, MBBI, dl, StoreInst)
+            .addReg(BPReg)
+            .addImm(BPOffset)
+            .addReg(ScratchReg);
+          BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+            .addReg(ScratchReg, RegState::Kill)
+            .addReg(ScratchReg);
+        }
+      }
+    } else {
+      // The frame size is a known 16-bit constant (fitting in the immediate
+      // field of STWU). To be here we have to be compiling for PPC32.
+      // Since the SPReg has been decreased by FrameSize, add it back to each
+      // offset.
+      if (HasFP)
+        BuildMI(MBB, MBBI, dl, StoreInst)
+          .addReg(FPReg)
+          .addImm(FrameSize + FPOffset)
+          .addReg(SPReg);
+      if (FI->usesPICBase())
+        BuildMI(MBB, MBBI, dl, StoreInst)
+          .addReg(PPC::R30)
+          .addImm(FrameSize + PBPOffset)
+          .addReg(SPReg);
+      if (HasBP) {
+        BuildMI(MBB, MBBI, dl, StoreInst)
+          .addReg(BPReg)
+          .addImm(FrameSize + BPOffset)
+          .addReg(SPReg);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), BPReg)
+          .addReg(SPReg)
+          .addImm(FrameSize);
+      }
+    }
+  }
+
+  // Add Call Frame Information for the instructions we generated above.
+  if (needsCFI) {
+    unsigned CFIIndex;
+
+    if (HasBP) {
+      // Define CFA in terms of BP. Do this in preference to using FP/SP,
+      // because if the stack needed aligning then CFA won't be at a fixed
+      // offset from FP/SP.
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    } else {
+      // Adjust the definition of CFA to account for the change in SP.
+      assert(NegFrameSize);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+    }
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+
+    if (HasFP) {
+      // Describe where FP was saved, at a fixed offset from CFA.
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (FI->usesPICBase()) {
+      // Describe where FP was saved, at a fixed offset from CFA.
+      unsigned Reg = MRI->getDwarfRegNum(PPC::R30, true);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, PBPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (HasBP) {
+      // Describe where BP was saved, at a fixed offset from CFA.
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+
+    if (MustSaveLR) {
+      // Describe where LR was saved, at a fixed offset from CFA.
+      unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
+      CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  // If there is a frame pointer, copy R1 into R31
+  if (HasFP) {
+    BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+
+    if (!HasBP && needsCFI) {
+      // Change the definition of CFA from SP+offset to FP+offset, because SP
+      // will change at every alloca.
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  if (needsCFI) {
+    // Describe where callee saved registers were saved, at fixed offsets from
+    // CFA.
+    const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+      // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+      // subregisters of CR2. We just need to emit a move of CR2.
+      if (PPC::CRBITRCRegClass.contains(Reg))
+        continue;
+
+      // For SVR4, don't emit a move for the CR spill slot if we haven't
+      // spilled CRs.
+      if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+          && !MustSaveCR)
+        continue;
+
+      // For 64-bit SVR4 when we have spilled CRs, the spill location
+      // is SP+8, not a frame-relative slot.
+      if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for
+        // the whole CR word.  In the ELFv2 ABI, every CR that was
+        // actually saved gets its own CFI record.
+        unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
+        unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(CRReg, true), 8));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+        continue;
+      }
+
+      int Offset = MFI.getObjectOffset(CSI[I].getFrameIdx());
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+}
+
+void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc dl;
+
+  if (MBBI != MBB.end())
+    dl = MBBI->getDebugLoc();
+  
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+  // Get alignment info so we know how to restore the SP.
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Get the number of bytes allocated from the FrameInfo.
+  int FrameSize = MFI.getStackSize();
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+
+  // Check if the link register (LR) has been saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  bool MustSaveCR = !MustSaveCRs.empty();
+  // Do we have a frame pointer and/or base pointer for this function?
+  bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
+
+  unsigned SPReg      = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg      = RegInfo->getBaseRegister(MF);
+  unsigned FPReg      = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned ScratchReg = 0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+                                                 : PPC::MTLR );
+  const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+                                                 : PPC::LWZ );
+  const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+                                                           : PPC::LIS );
+  const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+                                              : PPC::OR );
+  const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+                                                  : PPC::ORI );
+  const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+                                                   : PPC::ADDI );
+  const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+                                                : PPC::ADD4 );
+
+  int LROffset = getReturnSaveOffset();
+
+  int FPOffset = 0;
+
+  // Using the same bool variable as below to suppress compiler warnings.
+  bool SingleScratchReg = findScratchRegister(&MBB, true, false, &ScratchReg,
+                                              &TempReg);
+  assert(SingleScratchReg &&
+         "Could not find an available scratch register");
+
+  SingleScratchReg = ScratchReg == TempReg;
+
+  if (HasFP) {
+    if (isSVR4ABI) {
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = MFI.getObjectOffset(FPIndex);
+    } else {
+      FPOffset = getFramePointerSaveOffset();
+    }
+  }
+
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = MFI.getObjectOffset(BPIndex);
+    } else {
+      BPOffset = getBasePointerSaveOffset();
+    }
+  }
+
+  int PBPOffset = 0;
+  if (FI->usesPICBase()) {
+    int PBPIndex = FI->getPICBasePointerSaveIndex();
+    assert(PBPIndex && "No PIC Base Pointer Save Slot!");
+    PBPOffset = MFI.getObjectOffset(PBPIndex);
+  }
+
+  bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn());
+  
+  if (IsReturnBlock) {
+    unsigned RetOpcode = MBBI->getOpcode();
+    bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
+                      RetOpcode == PPC::TCRETURNdi ||
+                      RetOpcode == PPC::TCRETURNai ||
+                      RetOpcode == PPC::TCRETURNri8 ||
+                      RetOpcode == PPC::TCRETURNdi8 ||
+                      RetOpcode == PPC::TCRETURNai8;
+
+    if (UsesTCRet) {
+      int MaxTCRetDelta = FI->getTailCallSPDelta();
+      MachineOperand &StackAdjust = MBBI->getOperand(1);
+      assert(StackAdjust.isImm() && "Expecting immediate value.");
+      // Adjust stack pointer.
+      int StackAdj = StackAdjust.getImm();
+      int Delta = StackAdj - MaxTCRetDelta;
+      assert((Delta >= 0) && "Delta must be positive");
+      if (MaxTCRetDelta>0)
+        FrameSize += (StackAdj +Delta);
+      else
+        FrameSize += StackAdj;
+    }
+  }
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple LD/LWZ immediate offset operand.
+  bool isLargeFrame = !isInt<16>(FrameSize);
+
+  // On targets without red zone, the SP needs to be restored last, so that
+  // all live contents of the stack frame are upwards of the SP. This means
+  // that we cannot restore SP just now, since there may be more registers
+  // to restore from the stack frame (e.g. R31). If the frame size is not
+  // a simple immediate value, we will need a spare register to hold the
+  // restored SP. If the frame size is known and small, we can simply adjust
+  // the offsets of the registers to be restored, and still use SP to restore
+  // them. In such case, the final update of SP will be to add the frame
+  // size to it.
+  // To simplify the code, set RBReg to the base register used to restore
+  // values from the stack, and set SPAdd to the value that needs to be added
+  // to the SP at the end. The default values are as if red zone was present.
+  unsigned RBReg = SPReg;
+  unsigned SPAdd = 0;
+
+  if (FrameSize) {
+    // In the prologue, the loaded (or persistent) stack pointer value is
+    // offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red
+    // zone add this offset back now.
+
+    // If this function contained a fastcc call and GuaranteedTailCallOpt is
+    // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+    // call which invalidates the stack pointer value in SP(0). So we use the
+    // value of R31 in this case.
+    if (FI->hasFastCall()) {
+      assert(HasFP && "Expecting a valid frame pointer.");
+      if (!HasRedZone)
+        RBReg = FPReg;
+      if (!isLargeFrame) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, RBReg)
+          .addReg(FPReg).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(RBReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
+      }
+    } else if (!isLargeFrame && !HasBP && !MFI.hasVarSizedObjects()) {
+      if (HasRedZone) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+          .addReg(SPReg)
+          .addImm(FrameSize);
+      } else {
+        // Make sure that adding FrameSize will not overflow the max offset
+        // size.
+        assert(FPOffset <= 0 && BPOffset <= 0 && PBPOffset <= 0 &&
+               "Local offsets should be negative");
+        SPAdd = FrameSize;
+        FPOffset += FrameSize;
+        BPOffset += FrameSize;
+        PBPOffset += FrameSize;
+      }
+    } else {
+      // We don't want to use ScratchReg as a base register, because it
+      // could happen to be R0. Use FP instead, but make sure to preserve it.
+      if (!HasRedZone) {
+        // If FP is not saved, copy it to ScratchReg.
+        if (!HasFP)
+          BuildMI(MBB, MBBI, dl, OrInst, ScratchReg)
+            .addReg(FPReg)
+            .addReg(FPReg);
+        RBReg = FPReg;
+      }
+      BuildMI(MBB, MBBI, dl, LoadInst, RBReg)
+        .addImm(0)
+        .addReg(SPReg);
+    }
+  }
+  assert(RBReg != ScratchReg && "Should have avoided ScratchReg");
+  // If there is no red zone, ScratchReg may be needed for holding a useful
+  // value (although not the base register). Make sure it is not overwritten
+  // too early.
+
+  assert((isPPC64 || !MustSaveCR) &&
+         "Epilogue CR restoring supported only in 64-bit mode");
+
+  // If we need to restore both the LR and the CR and we only have one
+  // available scratch register, we must do them one at a time.
+  if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+    // Here TempReg == ScratchReg, and in the absence of red zone ScratchReg
+    // is live here.
+    assert(HasRedZone && "Expecting red zone");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+      .addImm(8)
+      .addReg(SPReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+        .addReg(TempReg, getKillRegState(i == e-1));
+  }
+
+  // Delay restoring of the LR if ScratchReg is needed. This is ok, since
+  // LR is stored in the caller's stack frame. ScratchReg will be needed
+  // if RBReg is anything other than SP. We shouldn't use ScratchReg as
+  // a base register anyway, because it may happen to be R0.
+  bool LoadedLR = false;
+  if (MustSaveLR && RBReg == SPReg && isInt<16>(LROffset+SPAdd)) {
+    BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+      .addImm(LROffset+SPAdd)
+      .addReg(RBReg);
+    LoadedLR = true;
+  }
+
+  if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) {
+    // This will only occur for PPC64.
+    assert(isPPC64 && "Expecting 64-bit mode");
+    assert(RBReg == SPReg && "Should be using SP as a base register");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+      .addImm(8)
+      .addReg(RBReg);
+  }
+
+  if (HasFP) {
+    // If there is red zone, restore FP directly, since SP has already been
+    // restored. Otherwise, restore the value of FP into ScratchReg.
+    if (HasRedZone || RBReg == SPReg)
+      BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+        .addImm(FPOffset)
+        .addReg(SPReg);
+    else
+      BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+        .addImm(FPOffset)
+        .addReg(RBReg);
+  }
+
+  if (FI->usesPICBase())
+    BuildMI(MBB, MBBI, dl, LoadInst)
+      .addReg(PPC::R30)
+      .addImm(PBPOffset)
+      .addReg(RBReg);
+
+  if (HasBP)
+    BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+      .addImm(BPOffset)
+      .addReg(RBReg);
+
+  // There is nothing more to be loaded from the stack, so now we can
+  // restore SP: SP = RBReg + SPAdd.
+  if (RBReg != SPReg || SPAdd != 0) {
+    assert(!HasRedZone && "This should not happen with red zone");
+    // If SPAdd is 0, generate a copy.
+    if (SPAdd == 0)
+      BuildMI(MBB, MBBI, dl, OrInst, SPReg)
+        .addReg(RBReg)
+        .addReg(RBReg);
+    else
+      BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+        .addReg(RBReg)
+        .addImm(SPAdd);
+
+    assert(RBReg != ScratchReg && "Should be using FP or SP as base register");
+    if (RBReg == FPReg)
+      BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+        .addReg(ScratchReg)
+        .addReg(ScratchReg);
+
+    // Now load the LR from the caller's stack frame.
+    if (MustSaveLR && !LoadedLR)
+      BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+        .addImm(LROffset)
+        .addReg(SPReg);
+  }
+
+  if (MustSaveCR &&
+      !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+        .addReg(TempReg, getKillRegState(i == e-1));
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
+
+  // Callee pop calling convention. Pop parameter/linkage area. Used for tail
+  // call optimization
+  if (IsReturnBlock) {
+    unsigned RetOpcode = MBBI->getOpcode();
+    if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+        (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) &&
+        MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+      PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+      unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+
+      if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+          .addReg(SPReg).addImm(CallerAllocatedAmt);
+      } else {
+        BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+          .addImm(CallerAllocatedAmt >> 16);
+        BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(CallerAllocatedAmt & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
+      }
+    } else {
+      createTailCallBranchInstr(MBB);
+    }
+  }
+}
+
+void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  DebugLoc dl;
+
+  if (MBBI != MBB.end())
+    dl = MBBI->getDebugLoc();
+
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+
+  // Create branch instruction for pseudo tail call return instruction
+  unsigned RetOpcode = MBBI->getOpcode();
+  if (RetOpcode == PPC::TCRETURNdi) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+  } else if (RetOpcode == PPC::TCRETURNai) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+  } else if (RetOpcode == PPC::TCRETURNdi8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+  } else if (RetOpcode == PPC::TCRETURNai8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+  }
+}
+
+void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+
+  //  Save and clear the LR state.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  unsigned LR = RegInfo->getRARegister();
+  FI->setMustSaveLR(MustSaveLR(MF, LR));
+  SavedRegs.reset(LR);
+
+  //  Save R31 if necessary
+  int FPSI = FI->getFramePointerSaveIndex();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI  = Subtarget.isDarwinABI();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI && needsFP(MF)) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = getFramePointerSaveOffset();
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MFI.CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+
+  int BPSI = FI->getBasePointerSaveIndex();
+  if (!BPSI && RegInfo->hasBasePointer(MF)) {
+    int BPOffset = getBasePointerSaveOffset();
+    // Allocate the frame index for the base pointer save area.
+    BPSI = MFI.CreateFixedObject(isPPC64? 8 : 4, BPOffset, true);
+    // Save the result.
+    FI->setBasePointerSaveIndex(BPSI);
+  }
+
+  // Reserve stack space for the PIC Base register (R30).
+  // Only used in SVR4 32-bit.
+  if (FI->usesPICBase()) {
+    int PBPSI = MFI.CreateFixedObject(4, -8, true);
+    FI->setPICBasePointerSaveIndex(PBPSI);
+  }
+
+  // Make sure we don't explicitly spill r31, because, for example, we have
+  // some inline asm which explicity clobbers it, when we otherwise have a
+  // frame pointer and are using r31's spill slot for the prologue/epilogue
+  // code. Same goes for the base pointer and the PIC base register.
+  if (needsFP(MF))
+    SavedRegs.reset(isPPC64 ? PPC::X31 : PPC::R31);
+  if (RegInfo->hasBasePointer(MF))
+    SavedRegs.reset(RegInfo->getBaseRegister(MF));
+  if (FI->usesPICBase())
+    SavedRegs.reset(PPC::R30);
+
+  // Reserve stack space to move the linkage area to in case of a tail call.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = FI->getTailCallSPDelta()) < 0) {
+    MFI.CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
+  }
+
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
+  // function uses CR 2, 3, or 4.
+  if (!isPPC64 && !isDarwinABI &&
+      (SavedRegs.test(PPC::CR2) ||
+       SavedRegs.test(PPC::CR3) ||
+       SavedRegs.test(PPC::CR4))) {
+    int FrameIdx = MFI.CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+    FI->setCRSpillFrameIndex(FrameIdx);
+  }
+}
+
+void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // Early exit if not using the SVR4 ABI.
+  if (!Subtarget.isSVR4ABI()) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  // Get callee saved register information.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+  // If the function is shrink-wrapped, and if the function has a tail call, the
+  // tail call might not be in the new RestoreBlock, so real branch instruction
+  // won't be generated by emitEpilogue(), because shrink-wrap has chosen new
+  // RestoreBlock. So we handle this case here.
+  if (MFI.getSavePoint() && MFI.hasTailCall()) {
+    MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+    for (MachineBasicBlock &MBB : MF) {
+      if (MBB.isReturnBlock() && (&MBB) != RestoreBlock)
+        createTailCallBranchInstr(MBB);
+    }
+  }
+
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty() && !needsFP(MF)) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  unsigned MinGPR = PPC::R31;
+  unsigned MinG8R = PPC::X31;
+  unsigned MinFPR = PPC::F31;
+  unsigned MinVR = PPC::V31;
+
+  bool HasGPSaveArea = false;
+  bool HasG8SaveArea = false;
+  bool HasFPSaveArea = false;
+  bool HasVRSAVESaveArea = false;
+  bool HasVRSaveArea = false;
+
+  SmallVector<CalleeSavedInfo, 18> GPRegs;
+  SmallVector<CalleeSavedInfo, 18> G8Regs;
+  SmallVector<CalleeSavedInfo, 18> FPRegs;
+  SmallVector<CalleeSavedInfo, 18> VRegs;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (PPC::GPRCRegClass.contains(Reg)) {
+      HasGPSaveArea = true;
+
+      GPRegs.push_back(CSI[i]);
+
+      if (Reg < MinGPR) {
+        MinGPR = Reg;
+      }
+    } else if (PPC::G8RCRegClass.contains(Reg)) {
+      HasG8SaveArea = true;
+
+      G8Regs.push_back(CSI[i]);
+
+      if (Reg < MinG8R) {
+        MinG8R = Reg;
+      }
+    } else if (PPC::F8RCRegClass.contains(Reg)) {
+      HasFPSaveArea = true;
+
+      FPRegs.push_back(CSI[i]);
+
+      if (Reg < MinFPR) {
+        MinFPR = Reg;
+      }
+    } else if (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)) {
+      ; // do nothing, as we already know whether CRs are spilled
+    } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+      HasVRSAVESaveArea = true;
+    } else if (PPC::VRRCRegClass.contains(Reg)) {
+      HasVRSaveArea = true;
+
+      VRegs.push_back(CSI[i]);
+
+      if (Reg < MinVR) {
+        MinVR = Reg;
+      }
+    } else {
+      llvm_unreachable("Unknown RegisterClass!");
+    }
+  }
+
+  PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  int64_t LowerBound = 0;
+
+  // Take into account stack space reserved for tail calls.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = PFI->getTailCallSPDelta()) < 0) {
+    LowerBound = TCSPDelta;
+  }
+
+  // The Floating-point register save area is right below the back chain word
+  // of the previous stack frame.
+  if (HasFPSaveArea) {
+    for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) {
+      int FI = FPRegs[i].getFrameIdx();
+
+      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+    }
+
+    LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8;
+  }
+
+  // Check whether the frame pointer register is allocated. If so, make sure it
+  // is spilled to the correct offset.
+  if (needsFP(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getFramePointerSaveIndex();
+    assert(FI && "No Frame Pointer Save Slot!");
+
+    MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+  }
+
+  if (PFI->usesPICBase()) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getPICBasePointerSaveIndex();
+    assert(FI && "No PIC Base Pointer Save Slot!");
+
+    MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+  }
+
+  const PPCRegisterInfo *RegInfo =
+      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  if (RegInfo->hasBasePointer(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getBasePointerSaveIndex();
+    assert(FI && "No Base Pointer Save Slot!");
+
+    MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+  }
+
+  // General register save area starts right below the Floating-point
+  // register save area.
+  if (HasGPSaveArea || HasG8SaveArea) {
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
+      int FI = GPRegs[i].getFrameIdx();
+
+      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+    }
+
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
+      int FI = G8Regs[i].getFrameIdx();
+
+      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+    }
+
+    unsigned MinReg =
+      std::min<unsigned>(TRI->getEncodingValue(MinGPR),
+                         TRI->getEncodingValue(MinG8R));
+
+    if (Subtarget.isPPC64()) {
+      LowerBound -= (31 - MinReg + 1) * 8;
+    } else {
+      LowerBound -= (31 - MinReg + 1) * 4;
+    }
+  }
+
+  // For 32-bit only, the CR save area is below the general register
+  // save area.  For 64-bit SVR4, the CR save area is addressed relative
+  // to the stack pointer and hence does not need an adjustment here.
+  // Only CR2 (the first nonvolatile spilled) has an associated frame
+  // index so that we have a single uniform save area.
+  if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
+    // Adjust the frame index of the CR spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+          // Leave Darwin logic as-is.
+          || (!Subtarget.isSVR4ABI() &&
+              (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)))) {
+        int FI = CSI[i].getFrameIdx();
+
+        MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The CR save area is always 4 bytes long.
+  }
+
+  if (HasVRSAVESaveArea) {
+    // FIXME SVR4: Is it actually possible to have multiple elements in CSI
+    //             which have the VRSAVE register class?
+    // Adjust the frame index of the VRSAVE spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if (PPC::VRSAVERCRegClass.contains(Reg)) {
+        int FI = CSI[i].getFrameIdx();
+
+        MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
+  }
+
+  if (HasVRSaveArea) {
+    // Insert alignment padding, we need 16-byte alignment.
+    LowerBound = (LowerBound - 15) & ~(15);
+
+    for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
+      int FI = VRegs[i].getFrameIdx();
+
+      MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+    }
+  }
+
+  addScavengingSpillSlot(MF, RS);
+}
+
+void
+PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
+                                         RegScavenger *RS) const {
+  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+  // a large stack, which will require scavenging a register to materialize a
+  // large offset.
+
+  // We need to have a scavenger spill slot for spills if the frame size is
+  // large. In case there is no free register for large-offset addressing,
+  // this slot is used for the necessary emergency spill. Also, we need the
+  // slot for dynamic stack allocations.
+
+  // The scavenger might be invoked if the frame offset does not fit into
+  // the 16-bit immediate. We don't know the complete frame size here
+  // because we've not yet computed callee-saved register spills or the
+  // needed alignment padding.
+  unsigned StackSize = determineFrameLayout(MF, false, true);
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
+      hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
+    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+    const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+    RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+                                                      RC->getAlignment(),
+                                                      false));
+
+    // Might we have over-aligned allocas?
+    bool HasAlVars = MFI.hasVarSizedObjects() &&
+                     MFI.getMaxAlignment() > getStackAlignment();
+
+    // These kinds of spills might need two registers.
+    if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
+      RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+                                                        RC->getAlignment(),
+                                                        false));
+
+  }
+}
+
+bool
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  DebugLoc DL;
+  bool CRSpilled = false;
+  MachineInstrBuilder CRMIB;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
+    // CR2 through CR4 are the nonvolatile CR fields.
+    bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+    // Add the callee-saved register as live-in; it's killed at the spill.
+    MBB.addLiveIn(Reg);
+
+    if (CRSpilled && IsCRField) {
+      CRMIB.addReg(Reg, RegState::ImplicitKill);
+      continue;
+    }
+
+    // Insert the spill to the stack frame.
+    if (IsCRField) {
+      PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+      if (Subtarget.isPPC64()) {
+        // The actual spill will happen at the start of the prologue.
+        FuncInfo->addMustSaveCR(Reg);
+      } else {
+        CRSpilled = true;
+        FuncInfo->setSpillsCR();
+
+        // 32-bit:  FP-relative.  Note that we made sure CR2-CR4 all have
+        // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+        CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12)
+                  .addReg(Reg, RegState::ImplicitKill);
+
+        MBB.insert(MI, CRMIB);
+        MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+                                         .addReg(PPC::R12,
+                                                 getKillRegState(true)),
+                                         CSI[i].getFrameIdx()));
+      }
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true,
+                              CSI[i].getFrameIdx(), RC, TRI);
+    }
+  }
+  return true;
+}
+
+static void
+restoreCRs(bool isPPC64, bool is31,
+           bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+           MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+           const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII = *MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  DebugLoc DL;
+  unsigned RestoreOp, MoveReg;
+
+  if (isPPC64)
+    // This is handled during epilogue generation.
+    return;
+  else {
+    // 32-bit:  FP-relative
+    MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
+                                             PPC::R12),
+                                     CSI[CSIIndex].getFrameIdx()));
+    RestoreOp = PPC::MTOCRF;
+    MoveReg = PPC::R12;
+  }
+
+  if (CR2Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
+               .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled)));
+
+  if (CR3Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3)
+               .addReg(MoveReg, getKillRegState(!CR4Spilled)));
+
+  if (CR4Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4)
+               .addReg(MoveReg, getKillRegState(true)));
+}
+
+MachineBasicBlock::iterator PPCFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      I->getOpcode() == PPC::ADJCALLSTACKUP) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    if (int CalleeAmt =  I->getOperand(1).getImm()) {
+      bool is64Bit = Subtarget.isPPC64();
+      CalleeAmt *= -1;
+      unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+      unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+      unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+      unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+      unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+      unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+      const DebugLoc &dl = I->getDebugLoc();
+
+      if (isInt<16>(CalleeAmt)) {
+        BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addImm(CalleeAmt);
+      } else {
+        MachineBasicBlock::iterator MBBI = I;
+        BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CalleeAmt >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CalleeAmt & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addReg(TmpReg);
+      }
+    }
+  }
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  return MBB.erase(I);
+}
+
+bool
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  bool CR2Spilled = false;
+  bool CR3Spilled = false;
+  bool CR4Spilled = false;
+  unsigned CSIIndex = 0;
+
+  // Initialize insertion-point logic; we will be restoring in reverse
+  // order of spill.
+  MachineBasicBlock::iterator I = MI, BeforeI = I;
+  bool AtStart = I == MBB.begin();
+
+  if (!AtStart)
+    --BeforeI;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
+    if (Reg == PPC::CR2) {
+      CR2Spilled = true;
+      // The spill slot is associated only with CR2, which is the
+      // first nonvolatile spilled.  Save it here.
+      CSIIndex = i;
+      continue;
+    } else if (Reg == PPC::CR3) {
+      CR3Spilled = true;
+      continue;
+    } else if (Reg == PPC::CR4) {
+      CR4Spilled = true;
+      continue;
+    } else {
+      // When we first encounter a non-CR register after seeing at
+      // least one CR register, restore all spilled CRs together.
+      if ((CR2Spilled || CR3Spilled || CR4Spilled)
+          && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        bool is31 = needsFP(*MF);
+        restoreCRs(Subtarget.isPPC64(), is31,
+                   CR2Spilled, CR3Spilled, CR4Spilled,
+                   MBB, I, CSI, CSIIndex);
+        CR2Spilled = CR3Spilled = CR4Spilled = false;
+      }
+
+      // Default behavior for non-CR saves.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB.begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+      }
+
+    // Insert in reverse order.
+    if (AtStart)
+      I = MBB.begin();
+    else {
+      I = BeforeI;
+      ++I;
+    }
+  }
+
+  // If we haven't yet spilled the CRs, do so now.
+  if (CR2Spilled || CR3Spilled || CR4Spilled) {
+    bool is31 = needsFP(*MF);
+    restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
+               MBB, I, CSI, CSIIndex);
+  }
+
+  return true;
+}
+
+bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() &&
+          MF.getSubtarget<PPCSubtarget>().isPPC64());
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 000000000000..28b0c57f0ffb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,149 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+
+#include "PPC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering {
+  const PPCSubtarget &Subtarget;
+  const unsigned ReturnSaveOffset;
+  const unsigned TOCSaveOffset;
+  const unsigned FramePointerSaveOffset;
+  const unsigned LinkageSize;
+  const unsigned BasePointerSaveOffset;
+
+  /**
+   * \brief Find register[s] that can be used in function prologue and epilogue
+   *
+   * Find register[s] that can be use as scratch register[s] in function
+   * prologue and epilogue to save various registers (Link Register, Base
+   * Pointer, etc.). Prefer R0/R12, if available. Otherwise choose whatever
+   * register[s] are available.
+   *
+   * This method will return true if it is able to find enough unique scratch
+   * registers (1 or 2 depending on the requirement). If it is unable to find
+   * enough available registers in the block, it will return false and set
+   * any passed output parameter that corresponds to a required unique register
+   * to PPC::NoRegister.
+   *
+   * \param[in] MBB The machine basic block to find an available register for
+   * \param[in] UseAtEnd Specify whether the scratch register will be used at
+   *                     the end of the basic block (i.e., will the scratch
+   *                     register kill a register defined in the basic block)
+   * \param[in] TwoUniqueRegsRequired Specify whether this basic block will
+   *                                  require two unique scratch registers.
+   * \param[out] SR1 The scratch register to use
+   * \param[out] SR2 The second scratch register. If this pointer is not null
+   *                 the function will attempt to set it to an available
+   *                 register regardless of whether there is a hard requirement
+   *                 for two unique scratch registers.
+   * \return true if the required number of registers was found.
+   *         false if the required number of scratch register weren't available.
+   *         If either output parameter refers to a required scratch register
+   *         that isn't available, it will be set to an invalid value.
+   */
+  bool findScratchRegister(MachineBasicBlock *MBB,
+                           bool UseAtEnd,
+                           bool TwoUniqueRegsRequired = false,
+                           unsigned *SR1 = nullptr,
+                           unsigned *SR2 = nullptr) const;
+  bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
+
+  /**
+   * \brief Create branch instruction for PPC::TCRETURN* (tail call return)
+   *
+   * \param[in] MBB that is terminated by PPC::TCRETURN*
+   */
+  void createTailCallBranchInstr(MachineBasicBlock &MBB) const;
+
+public:
+  PPCFrameLowering(const PPCSubtarget &STI);
+
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool needsFP(const MachineFunction &MF) const;
+  void replaceFPWithRealFP(MachineFunction &MF) const;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+  void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const override { return true; }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  unsigned getReturnSaveOffset() const { return ReturnSaveOffset; }
+
+  /// getTOCSaveOffset - Return the previous frame offset to save the
+  /// TOC register -- 64-bit SVR4 ABI only.
+  unsigned getTOCSaveOffset() const { return TOCSaveOffset; }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  unsigned getFramePointerSaveOffset() const { return FramePointerSaveOffset; }
+
+  /// getBasePointerSaveOffset - Return the previous frame offset to save the
+  /// base pointer.
+  unsigned getBasePointerSaveOffset() const { return BasePointerSaveOffset; }
+
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  unsigned getLinkageSize() const { return LinkageSize; }
+
+  const SpillSlot *
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
+  bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+  /// Methods used by shrink wrapping to determine if MBB can be used for the
+  /// function prologue/epilogue.
+  bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+  bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 000000000000..f327396370f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,436 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
+  // FIXME: Move this.
+  if (isBCTRAfterSet(SU))
+    return true;
+
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (!MCID)
+    return false;
+
+  if (!MCID->mayLoad())
+    return false;
+
+  // SU is a load; for any predecessors in this dispatch group, that are stores,
+  // and with which we have an ordering dependency, return true.
+  for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+    const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+    if (!PredMCID || !PredMCID->mayStore())
+      continue;
+
+    if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier())
+      continue;
+
+    for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+      if (SU->Preds[i].getSUnit() == CurGroup[j])
+        return true;
+  }
+
+  return false; 
+}
+
+bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (!MCID)
+    return false;
+
+  if (!MCID->isBranch())
+    return false;
+
+  // SU is a branch; for any predecessors in this dispatch group, with which we
+  // have a data dependence and set the counter register, return true.
+  for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) {
+    const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit());
+    if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR)
+      continue;
+
+    if (SU->Preds[i].isCtrl())
+      continue;
+
+    for (unsigned j = 0, je = CurGroup.size(); j != je; ++j)
+      if (SU->Preds[i].getSUnit() == CurGroup[j])
+        return true;
+  }
+
+  return false; 
+}
+
+// FIXME: Remove this when we don't need this:
+namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } }
+
+// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific.
+
+bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID,
+                                                       unsigned &NSlots) {
+  // FIXME: Indirectly, this information is contained in the itinerary, and
+  // we should derive it from there instead of separately specifying it
+  // here.
+  unsigned IIC = MCID->getSchedClass();
+  switch (IIC) {
+  default:
+    NSlots = 1;
+    break;
+  case PPC::Sched::IIC_IntDivW:
+  case PPC::Sched::IIC_IntDivD:
+  case PPC::Sched::IIC_LdStLoadUpd:
+  case PPC::Sched::IIC_LdStLDU:
+  case PPC::Sched::IIC_LdStLFDU:
+  case PPC::Sched::IIC_LdStLFDUX:
+  case PPC::Sched::IIC_LdStLHA:
+  case PPC::Sched::IIC_LdStLHAU:
+  case PPC::Sched::IIC_LdStLWA:
+  case PPC::Sched::IIC_LdStSTDU:
+  case PPC::Sched::IIC_LdStSTFDU:
+    NSlots = 2;
+    break;
+  case PPC::Sched::IIC_LdStLoadUpdX:
+  case PPC::Sched::IIC_LdStLDUX:
+  case PPC::Sched::IIC_LdStLHAUX:
+  case PPC::Sched::IIC_LdStLWARX:
+  case PPC::Sched::IIC_LdStLDARX:
+  case PPC::Sched::IIC_LdStSTDUX:
+  case PPC::Sched::IIC_LdStSTDCX:
+  case PPC::Sched::IIC_LdStSTWCX:
+  case PPC::Sched::IIC_BrMCRX: // mtcr
+  // FIXME: Add sync/isync (here and in the itinerary).
+    NSlots = 4;
+    break;
+  }
+
+  // FIXME: record-form instructions need a different itinerary class.
+  if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1)
+    NSlots = 2;
+
+  switch (IIC) {
+  default:
+    // All multi-slot instructions must come first.
+    return NSlots > 1;
+  case PPC::Sched::IIC_BrCR: // cr logicals
+  case PPC::Sched::IIC_SprMFCR:
+  case PPC::Sched::IIC_SprMFCRF:
+  case PPC::Sched::IIC_SprMTSPR:
+    return true;
+  }
+}
+
+ScheduleHazardRecognizer::HazardType
+PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  if (Stalls == 0 && isLoadAfterStore(SU))
+    return NoopHazard;
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  unsigned NSlots;
+  if (MCID && mustComeFirst(MCID, NSlots) && CurSlots)
+    return true;
+
+  return ScoreboardHazardRecognizer::ShouldPreferAnother(SU);
+}
+
+unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  // We only need to fill out a maximum of 5 slots here: The 6th slot could
+  // only be a second branch, and otherwise the next instruction will start a
+  // new group.
+  if (isLoadAfterStore(SU) && CurSlots < 6) {
+    unsigned Directive =
+        DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+    // If we're using a special group-terminating nop, then we need only one.
+    // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+    if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+        Directive == PPC::DIR_PWR8 || Directive == PPC::DIR_PWR9)
+      return 1;
+
+    return 5 - CurSlots;
+  }
+
+  return ScoreboardHazardRecognizer::PreEmitNoops(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (MCID) {
+    if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) {
+      CurGroup.clear();
+      CurSlots = CurBranches = 0;
+    } else {
+      DEBUG(dbgs() << "**** Adding to dispatch group: SU(" <<
+                      SU->NodeNum << "): ");
+      DEBUG(DAG->dumpNode(SU));
+
+      unsigned NSlots;
+      bool MustBeFirst = mustComeFirst(MCID, NSlots);
+
+      // If this instruction must come first, but does not, then it starts a
+      // new group.
+      if (MustBeFirst && CurSlots) {
+        CurSlots = CurBranches = 0;
+        CurGroup.clear();
+      }
+
+      CurSlots += NSlots;
+      CurGroup.push_back(SU);
+
+      if (MCID->isBranch())
+        ++CurBranches;
+    }
+  }
+
+  return ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
+  return ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("Bottom-up scheduling not supported");
+}
+
+void PPCDispatchGroupSBHazardRecognizer::Reset() {
+  CurGroup.clear();
+  CurSlots = CurBranches = 0;
+  return ScoreboardHazardRecognizer::Reset();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
+  unsigned Directive =
+      DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+  // If the group has now filled all of its slots, or if we're using a special
+  // group-terminating nop, the group is complete.
+  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+  if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8 || Directive == PPC::DIR_PWR9 ||
+      CurSlots == 6) {
+    CurGroup.clear();
+    CurSlots = CurBranches = 0;
+  } else {
+    CurGroup.push_back(nullptr);
+    ++CurSlots;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor.  Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of 4 non-branch and one
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, & some
+// instructions fill an entire dispatch group.  Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970.  These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group.  To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
+    : DAG(DAG) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DEBUG(errs() << "=== Start of dispatch group\n");
+  NumIssued = 0;
+
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  const MCInstrDesc &MCID = DAG.TII->get(Opcode);
+
+  isLoad  = MCID.mayLoad();
+  isStore = MCID.mayStore();
+
+  uint64_t TSFlags = MCID.TSFlags;
+
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+  const Value *LoadValue) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i])
+      return true;
+
+    // Okay, we don't have an exact match, if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion for example).
+    if (StoreValue[i] == LoadValue) {
+      // Okay the base pointers match, so we have [c1+r] vs [c2+r].  Check
+      // to see if the load and store actually overlap.
+      if (StoreOffset[i] < LoadOffset) {
+        if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true;
+      } else {
+        if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We turn NoopHazard for any
+/// instructions that wouldn't terminate the dispatch group that would cause a
+/// pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return NoHazard;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // We can only issue a branch as the last instruction in a group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && Opcode == PPC::BCTRL)
+    return NoopHazard;
+
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    if (isLoadOfStoredAddress(MO->getSize(),
+                              MO->getOffset(), MO->getValue()))
+      return NoopHazard;
+  }
+
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
+
+  // Track the address stored to.
+  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    StoreSize[NumStores] = MO->getSize();
+    StoreOffset[NumStores] = MO->getOffset();
+    StoreValue[NumStores] = MO->getValue();
+    ++NumStores;
+  }
+
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::Reset() {
+  EndDispatchGroup();
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 000000000000..4b502147ca63
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,102 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+/// PPCDispatchGroupSBHazardRecognizer - This class implements a scoreboard-based
+/// hazard recognizer for PPC ooo processors with dispatch-group hazards.
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ScheduleDAG *DAG;
+  SmallVector<SUnit *, 7> CurGroup;
+  unsigned CurSlots, CurBranches;
+
+  bool isLoadAfterStore(SUnit *SU);
+  bool isBCTRAfterSet(SUnit *SU);
+  bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);
+public:
+  PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
+                         const ScheduleDAG *DAG_) :
+    ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
+    CurSlots(0), CurBranches(0) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  bool ShouldPreferAnother(SUnit* SU) override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+  void Reset() override;
+  void EmitNoop() override;
+};
+
+/// PPCHazardRecognizer970 - This class defines a finite state automata that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor.  This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group),
+/// or storing then loading from the same address within a dispatch group.
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const ScheduleDAG &DAG;
+
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+
+  // Various things that can cause a structural hazard.
+
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+
+  // StoredPtr - Keep track of the address of any store.  If we see a load from
+  // the same address (or one that aliases it), disallow the store.  We can have
+  // up to four stores in one dispatch group, hence we track up to 4.
+  //
+  // This is null if we haven't seen a store yet.  We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  const Value *StoreValue[4];
+  int64_t StoreOffset[4];
+  uint64_t StoreSize[4];
+  unsigned NumStores;
+
+public:
+  PPCHazardRecognizer970(const ScheduleDAG &DAG);
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void Reset() override;
+
+private:
+  /// EndDispatchGroup - Called when we are finishing a new dispatch group.
+  ///
+  void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+
+  bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                             const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 000000000000..1e51c1f651c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,4529 @@
+//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-codegen"
+
+// FIXME: Remove this once the bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
+cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+
+static cl::opt<bool>
+    UseBitPermRewriter("ppc-use-bit-perm-rewriter", cl::init(true),
+                       cl::desc("use aggressive ppc isel for bit permutations"),
+                       cl::Hidden);
+static cl::opt<bool> BPermRewriterNoMasking(
+    "ppc-bit-perm-rewriter-stress-rotates",
+    cl::desc("stress rotate selection in aggressive ppc isel for "
+             "bit permutations"),
+    cl::Hidden);
+
+static cl::opt<bool> EnableBranchHint(
+  "ppc-use-branch-hint", cl::init(true),
+    cl::desc("Enable static hinting of branches on ppc"),
+    cl::Hidden);
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class PPCDAGToDAGISel : public SelectionDAGISel {
+    const PPCTargetMachine &TM;
+    const PPCSubtarget *PPCSubTarget;
+    const PPCTargetLowering *PPCLowering;
+    unsigned GlobalBaseReg;
+  public:
+    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+        : SelectionDAGISel(tm), TM(tm) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
+      PPCLowering = PPCSubTarget->getTargetLowering();
+      SelectionDAGISel::runOnMachineFunction(MF);
+
+      if (!PPCSubTarget->isSVR4ABI())
+        InsertVRSaveCode(MF);
+
+      return true;
+    }
+
+    void PreprocessISelDAG() override;
+    void PostprocessISelDAG() override;
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &dl) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i64);
+    }
+
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm, const SDLoc &dl) {
+      return CurDAG->getTargetConstant(
+          Imm, dl, PPCLowering->getPointerTy(CurDAG->getDataLayout()));
+    }
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// rotate and mask opcode and mask operation.
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+    void selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset = 0);
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    void Select(SDNode *N) override;
+
+    bool tryBitfieldInsert(SDNode *N);
+    bool tryBitPermutation(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                     const SDLoc &dl);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDValue N, SDValue &Disp,
+                       SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+    }
+
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field.  Note that the operand at this point is already the
+    /// result of a prior SelectAddressRegImm call.
+    bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+      if (N.getOpcode() == ISD::TargetConstant ||
+          N.getOpcode() == ISD::TargetGlobalAddress) {
+        Out = N;
+        return true;
+      }
+
+      return false;
+    }
+
+    /// SelectAddrIdx - Given the specified addressed, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which are preferred.
+    bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrIdxOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmX4 - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement that is a multiple of 4.
+    /// Suitable for use by STD and friends.
+    bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+    }
+
+    // Select an address into a single register.
+    bool SelectAddr(SDValue N, SDValue &Base) {
+      Base = N;
+      return true;
+    }
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.  It is always correct to compute the value into
+    /// a register.  The case of adding a (possibly relocatable) constant to a
+    /// register can be improved, but it is wrong to substitute Reg+Reg for
+    /// Reg in an asm, because the load or store opcode would have to change.
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      unsigned ConstraintID,
+                                      std::vector<SDValue> &OutOps) override {
+
+      switch(ConstraintID) {
+      default:
+        errs() << "ConstraintID: " << ConstraintID << "\n";
+        llvm_unreachable("Unexpected asm memory constraint");
+      case InlineAsm::Constraint_es:
+      case InlineAsm::Constraint_i:
+      case InlineAsm::Constraint_m:
+      case InlineAsm::Constraint_o:
+      case InlineAsm::Constraint_Q:
+      case InlineAsm::Constraint_Z:
+      case InlineAsm::Constraint_Zy:
+        // We need to make sure that this one operand does not end up in r0
+        // (because we might end up lowering this as 0(%op)).
+        const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
+        const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
+        SDLoc dl(Op);
+        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+        SDValue NewOp =
+          SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                         dl, Op.getValueType(),
+                                         Op, RC), 0);
+
+        OutOps.push_back(NewOp);
+        return false;
+      }
+      return true;
+    }
+
+    void InsertVRSaveCode(MachineFunction &MF);
+
+    StringRef getPassName() const override {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+    bool trySETCC(SDNode *N);
+
+    void PeepholePPC64();
+    void PeepholePPC64ZExt();
+    void PeepholeCROps();
+
+    SDValue combineToCMPB(SDNode *N);
+    void foldBoolExts(SDValue &Res, SDNode *&N);
+
+    bool AllUsersSelectZero(SDNode *N);
+    void SwapAllSelectUsers(SDNode *N);
+
+    void transferMemOperands(SDNode *N, SDNode *Result);
+  };
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE.  If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+  // Check to see if this function uses vector registers, which means we have to
+  // save and restore the VRSAVE register and update it with the regs we use.
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler.  Detect them now.
+  bool HasVectorVReg = false;
+  for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  }
+  if (!HasVectorVReg) return;  // nothing to do.
+
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register.  We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+  unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+  unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+
+  const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+  MachineBasicBlock &EntryBB = *Fn.begin();
+  DebugLoc dl;
+  // Emit the following code into the entry block:
+  // InVRSAVE = MFVRSAVE
+  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+  // MTVRSAVE UpdatedVRSAVE
+  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
+          UpdatedVRSAVE).addReg(InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+
+  // Find all return blocks, outputting a restore in each epilog.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (BB->isReturnBlock()) {
+      IP = BB->end(); --IP;
+
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = IP;
+      while (I2 != BB->begin() && (--I2)->isTerminator())
+        IP = I2;
+
+      // Emit: MTVRSAVE InVRSave
+      BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+    }
+  }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+  if (!GlobalBaseReg) {
+    const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = MF->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    const Module *M = MF->getFunction()->getParent();
+    DebugLoc dl;
+
+    if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
+      if (PPCSubTarget->isTargetELF()) {
+        GlobalBaseReg = PPC::R30;
+        if (M->getPICLevel() == PICLevel::SmallPIC) {
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
+        } else {
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+          unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+          BuildMI(FirstMBB, MBBI, dl,
+                  TII.get(PPC::UpdateGBR), GlobalBaseReg)
+                  .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
+          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
+        }
+      } else {
+        GlobalBaseReg =
+          RegInfo->createVirtualRegister(&PPC::GPRC_and_GPRC_NOR0RegClass);
+        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+      }
+    } else {
+      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+    }
+  }
+  return CurDAG->getRegister(GlobalBaseReg,
+                             PPCLowering->getPointerTy(CurDAG->getDataLayout()))
+      .getNode();
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand.  If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
+                              const SDValue &DestMBB) {
+  assert(isa<BasicBlockSDNode>(DestMBB));
+
+  if (!FuncInfo->BPI) return PPC::BR_NO_HINT;
+
+  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
+  const TerminatorInst *BBTerm = BB->getTerminator();
+
+  if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;
+
+  const BasicBlock *TBB = BBTerm->getSuccessor(0);
+  const BasicBlock *FBB = BBTerm->getSuccessor(1);
+
+  auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB);
+  auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB);
+
+  // We only want to handle cases which are easy to predict at static time, e.g.
+  // C++ throw statement, that is very likely not taken, or calling never
+  // returned function, e.g. stdlib exit(). So we set Threshold to filter
+  // unwanted cases.
+  //
+  // Below is LLVM branch weight table, we only want to handle case 1, 2
+  //
+  // Case                  Taken:Nontaken  Example
+  // 1. Unreachable        1048575:1       C++ throw, stdlib exit(),
+  // 2. Invoke-terminating 1:1048575
+  // 3. Coldblock          4:64            __builtin_expect
+  // 4. Loop Branch        124:4           For loop
+  // 5. PH/ZH/FPH          20:12
+  const uint32_t Threshold = 10000;
+
+  if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb))
+    return PPC::BR_NO_HINT;
+
+  DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::"
+               << BB->getName() << "'\n"
+               << " -> " << TBB->getName() << ": " << TProb << "\n"
+               << " -> " << FBB->getName() << ": " << FProb << "\n");
+
+  const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB);
+
+  // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities,
+  // because we want 'TProb' stands for 'branch probability' to Dest BasicBlock
+  if (BBDN->getBasicBlock()->getBasicBlock() != TBB)
+    std::swap(TProb, FProb);
+
+  return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT;
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc
+         && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+void PPCDAGToDAGISel::selectFrameIndex(SDNode *SN, SDNode *N, unsigned Offset) {
+  SDLoc dl(SN);
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+  unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+  if (SN->hasOneUse())
+    CurDAG->SelectNodeTo(SN, Opc, N->getValueType(0), TFI,
+                         getSmallIPtrImm(Offset, dl));
+  else
+    ReplaceNode(SN, CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                           getSmallIPtrImm(Offset, dl)));
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+                                      bool isShiftMask, unsigned &SH,
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift  = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
+  unsigned Opcode = N->getOpcode();
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+    return false;
+
+  if (Opcode == ISD::SHL) {
+    // apply shift left to mask if it comes first
+    if (isShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+  } else if (Opcode == ISD::SRL) {
+    // apply shift right to mask if it comes first
+    if (isShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // adjust for the left rotate
+    Shift = 32 - Shift;
+  } else if (Opcode == ISD::ROTL) {
+    Indeterminant = 0;
+  } else {
+    return false;
+  }
+
+  // if the mask doesn't intersect any Indeterminant bits
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// Turn an or of two masked values into the rotate left word immediate then
+/// mask insert (rlwimi) instruction.
+bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  APInt LKZ, LKO, RKZ, RKO;
+  CurDAG->computeKnownBits(Op0, LKZ, LKO);
+  CurDAG->computeKnownBits(Op1, RKZ, RKO);
+
+  unsigned TargetMask = LKZ.getZExtValue();
+  unsigned InsertMask = RKZ.getZExtValue();
+
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+
+    unsigned MB, ME;
+    if (isRunOfOnes(InsertMask, MB, ME)) {
+      SDValue Tmp1, Tmp2;
+
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+       // The AND mask might not be a constant, and we need to make sure that
+       // if we're going to fold the masking with the insert, all bits not
+       // know to be zero in the mask are known to be one.
+        APInt MKZ, MKO;
+        CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
+        bool CanFoldMask = InsertMask == MKO.getZExtValue();
+
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          // Note that Value must be in range here (less than 32) because
+          // otherwise there would not be any bits set in InsertMask.
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        }
+      }
+
+      SH &= 31;
+      SDValue Ops[] = { Op0, Op1, getI32Imm(SH, dl), getI32Imm(MB, dl),
+                          getI32Imm(ME, dl) };
+      ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+      return true;
+    }
+  }
+  return false;
+}
+
+// Predict the number of instructions that would be generated by calling
+// getInt64(N).
+static unsigned getInt64CountDirect(int64_t Imm) {
+  // Assume no remaining bits.
+  unsigned Remainder = 0;
+  // Assume no shift required.
+  unsigned Shift = 0;
+
+  // If it can't be represented as a 32 bit value.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    // If the shifted value fits 32 bits.
+    if (isInt<32>(ImmSh)) {
+      // Go with the shifted value.
+      Imm = ImmSh;
+    } else {
+      // Still stuck with a 64 bit value.
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Intermediate operand.
+  unsigned Result = 0;
+
+  // Handle first 32 bits.
+  unsigned Lo = Imm & 0xFFFF;
+
+  // Simple value.
+  if (isInt<16>(Imm)) {
+    // Just the Lo bits.
+    ++Result;
+  } else if (Lo) {
+    // Handle the Hi bits and Lo bits.
+    Result += 2;
+  } else {
+    // Just the Hi bits.
+    ++Result;
+  }
+
+  // If no shift, we're done.
+  if (!Shift) return Result;
+
+  // If Hi word == Lo word,
+  // we can use rldimi to insert the Lo word into Hi word.
+  if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
+    ++Result;
+    return Result;
+  }
+
+  // Shift for next step if the upper 32-bits were not zero.
+  if (Imm)
+    ++Result;
+
+  // Add in the last bits as required.
+  if ((Remainder >> 16) & 0xFFFF)
+    ++Result;
+  if (Remainder & 0xFFFF)
+    ++Result;
+
+  return Result;
+}
+
+static uint64_t Rot64(uint64_t Imm, unsigned R) {
+  return (Imm << R) | (Imm >> (64 - R));
+}
+
+static unsigned getInt64Count(int64_t Imm) {
+  unsigned Count = getInt64CountDirect(Imm);
+  if (Count == 1)
+    return Count;
+
+  for (unsigned r = 1; r < 63; ++r) {
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = getInt64CountDirect(RImm) + 1;
+    Count = std::min(Count, RCount);
+
+    // See comments in getInt64 for an explanation of the logic below.
+    unsigned LS = findLastSet(RImm);
+    if (LS != r-1)
+      continue;
+
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+    uint64_t RImmWithOnes = RImm | OnesMask;
+
+    RCount = getInt64CountDirect(RImmWithOnes) + 1;
+    Count = std::min(Count, RCount);
+  }
+
+  return Count;
+}
+
+// Select a 64-bit constant. For cost-modeling purposes, getInt64Count
+// (above) needs to be kept in sync with this function.
+static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
+                              int64_t Imm) {
+  // Assume no remaining bits.
+  unsigned Remainder = 0;
+  // Assume no shift required.
+  unsigned Shift = 0;
+
+  // If it can't be represented as a 32 bit value.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    // If the shifted value fits 32 bits.
+    if (isInt<32>(ImmSh)) {
+      // Go with the shifted value.
+      Imm = ImmSh;
+    } else {
+      // Still stuck with a 64 bit value.
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Intermediate operand.
+  SDNode *Result;
+
+  // Handle first 32 bits.
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+  };
+
+  // Simple value.
+  if (isInt<16>(Imm)) {
+    // Just the Lo bits.
+    Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+  } else if (Lo) {
+    // Handle the Hi bits.
+    unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+    Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+    // And Lo bits.
+    Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Lo));
+  } else {
+    // Just the Hi bits.
+    Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+  }
+
+  // If no shift, we're done.
+  if (!Shift) return Result;
+
+  // If Hi word == Lo word,
+  // we can use rldimi to insert the Lo word into Hi word.
+  if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) {
+    SDValue Ops[] =
+      { SDValue(Result, 0), SDValue(Result, 0), getI32Imm(Shift), getI32Imm(0)};
+    return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops);
+  }
+
+  // Shift for next step if the upper 32-bits were not zero.
+  if (Imm) {
+    Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                    SDValue(Result, 0),
+                                    getI32Imm(Shift),
+                                    getI32Imm(63 - Shift));
+  }
+
+  // Add in the last bits as required.
+  if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+    Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Hi));
+  }
+  if ((Lo = Remainder & 0xFFFF)) {
+    Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                    SDValue(Result, 0), getI32Imm(Lo));
+  }
+
+  return Result;
+}
+
+static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
+  unsigned Count = getInt64CountDirect(Imm);
+  if (Count == 1)
+    return getInt64Direct(CurDAG, dl, Imm);
+
+  unsigned RMin = 0;
+
+  int64_t MatImm;
+  unsigned MaskEnd;
+
+  for (unsigned r = 1; r < 63; ++r) {
+    uint64_t RImm = Rot64(Imm, r);
+    unsigned RCount = getInt64CountDirect(RImm) + 1;
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImm;
+      MaskEnd = 63;
+    }
+
+    // If the immediate to generate has many trailing zeros, it might be
+    // worthwhile to generate a rotated value with too many leading ones
+    // (because that's free with li/lis's sign-extension semantics), and then
+    // mask them off after rotation.
+
+    unsigned LS = findLastSet(RImm);
+    // We're adding (63-LS) higher-order ones, and we expect to mask them off
+    // after performing the inverse rotation by (64-r). So we need that:
+    //   63-LS == 64-r => LS == r-1
+    if (LS != r-1)
+      continue;
+
+    uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1));
+    uint64_t RImmWithOnes = RImm | OnesMask;
+
+    RCount = getInt64CountDirect(RImmWithOnes) + 1;
+    if (RCount < Count) {
+      Count = RCount;
+      RMin = r;
+      MatImm = RImmWithOnes;
+      MaskEnd = LS;
+    }
+  }
+
+  if (!RMin)
+    return getInt64Direct(CurDAG, dl, Imm);
+
+  auto getI32Imm = [CurDAG, dl](unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+  };
+
+  SDValue Val = SDValue(getInt64Direct(CurDAG, dl, MatImm), 0);
+  return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val,
+                                getI32Imm(64 - RMin), getI32Imm(MaskEnd));
+}
+
+// Select a 64-bit constant.
+static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
+  SDLoc dl(N);
+
+  // Get 64 bit value.
+  int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return getInt64(CurDAG, dl, Imm);
+}
+
+namespace {
+class BitPermutationSelector {
+  struct ValueBit {
+    SDValue V;
+
+    // The bit number in the value, using a convention where bit 0 is the
+    // lowest-order bit.
+    unsigned Idx;
+
+    enum Kind {
+      ConstZero,
+      Variable
+    } K;
+
+    ValueBit(SDValue V, unsigned I, Kind K = Variable)
+      : V(V), Idx(I), K(K) {}
+    ValueBit(Kind K = Variable)
+      : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
+
+    bool isZero() const {
+      return K == ConstZero;
+    }
+
+    bool hasValue() const {
+      return K == Variable;
+    }
+
+    SDValue getValue() const {
+      assert(hasValue() && "Cannot get the value of a constant bit");
+      return V;
+    }
+
+    unsigned getValueBitIndex() const {
+      assert(hasValue() && "Cannot get the value bit index of a constant bit");
+      return Idx;
+    }
+  };
+
+  // A bit group has the same underlying value and the same rotate factor.
+  struct BitGroup {
+    SDValue V;
+    unsigned RLAmt;
+    unsigned StartIdx, EndIdx;
+
+    // This rotation amount assumes that the lower 32 bits of the quantity are
+    // replicated in the high 32 bits by the rotation operator (which is done
+    // by rlwinm and friends in 64-bit mode).
+    bool Repl32;
+    // Did converting to Repl32 == true change the rotation factor? If it did,
+    // it decreased it by 32.
+    bool Repl32CR;
+    // Was this group coalesced after setting Repl32 to true?
+    bool Repl32Coalesced;
+
+    BitGroup(SDValue V, unsigned R, unsigned S, unsigned E)
+      : V(V), RLAmt(R), StartIdx(S), EndIdx(E), Repl32(false), Repl32CR(false),
+        Repl32Coalesced(false) {
+      DEBUG(dbgs() << "\tbit group for " << V.getNode() << " RLAmt = " << R <<
+                      " [" << S << ", " << E << "]\n");
+    }
+  };
+
+  // Information on each (Value, RLAmt) pair (like the number of groups
+  // associated with each) used to choose the lowering method.
+  struct ValueRotInfo {
+    SDValue V;
+    unsigned RLAmt;
+    unsigned NumGroups;
+    unsigned FirstGroupStartIdx;
+    bool Repl32;
+
+    ValueRotInfo()
+      : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
+        Repl32(false) {}
+
+    // For sorting (in reverse order) by NumGroups, and then by
+    // FirstGroupStartIdx.
+    bool operator < (const ValueRotInfo &Other) const {
+      // We need to sort so that the non-Repl32 come first because, when we're
+      // doing masking, the Repl32 bit groups might be subsumed into the 64-bit
+      // masking operation.
+      if (Repl32 < Other.Repl32)
+        return true;
+      else if (Repl32 > Other.Repl32)
+        return false;
+      else if (NumGroups > Other.NumGroups)
+        return true;
+      else if (NumGroups < Other.NumGroups)
+        return false;
+      else if (FirstGroupStartIdx < Other.FirstGroupStartIdx)
+        return true;
+      return false;
+    }
+  };
+
+  using ValueBitsMemoizedValue = std::pair<bool, SmallVector<ValueBit, 64>>;
+  using ValueBitsMemoizer =
+      DenseMap<SDValue, std::unique_ptr<ValueBitsMemoizedValue>>;
+  ValueBitsMemoizer Memoizer;
+
+  // Return a pair of bool and a SmallVector pointer to a memoization entry.
+  // The bool is true if something interesting was deduced, otherwise if we're
+  // providing only a generic representation of V (or something else likewise
+  // uninteresting for instruction selection) through the SmallVector.
+  std::pair<bool, SmallVector<ValueBit, 64> *> getValueBits(SDValue V,
+                                                            unsigned NumBits) {
+    auto &ValueEntry = Memoizer[V];
+    if (ValueEntry)
+      return std::make_pair(ValueEntry->first, &ValueEntry->second);
+    ValueEntry.reset(new ValueBitsMemoizedValue());
+    bool &Interesting = ValueEntry->first;
+    SmallVector<ValueBit, 64> &Bits = ValueEntry->second;
+    Bits.resize(NumBits);
+
+    switch (V.getOpcode()) {
+    default: break;
+    case ISD::ROTL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned RotAmt = V.getConstantOperandVal(1);
+
+        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+        for (unsigned i = 0; i < NumBits; ++i)
+          Bits[i] = LHSBits[i < RotAmt ? i + (NumBits - RotAmt) : i - RotAmt];
+
+        return std::make_pair(Interesting = true, &Bits);
+      }
+      break;
+    case ISD::SHL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+        for (unsigned i = ShiftAmt; i < NumBits; ++i)
+          Bits[i] = LHSBits[i - ShiftAmt];
+
+        for (unsigned i = 0; i < ShiftAmt; ++i)
+          Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        return std::make_pair(Interesting = true, &Bits);
+      }
+      break;
+    case ISD::SRL:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        unsigned ShiftAmt = V.getConstantOperandVal(1);
+
+        const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+
+        for (unsigned i = 0; i < NumBits - ShiftAmt; ++i)
+          Bits[i] = LHSBits[i + ShiftAmt];
+
+        for (unsigned i = NumBits - ShiftAmt; i < NumBits; ++i)
+          Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        return std::make_pair(Interesting = true, &Bits);
+      }
+      break;
+    case ISD::AND:
+      if (isa<ConstantSDNode>(V.getOperand(1))) {
+        uint64_t Mask = V.getConstantOperandVal(1);
+
+        const SmallVector<ValueBit, 64> *LHSBits;
+        // Mark this as interesting, only if the LHS was also interesting. This
+        // prevents the overall procedure from matching a single immediate 'and'
+        // (which is non-optimal because such an and might be folded with other
+        // things if we don't select it here).
+        std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), NumBits);
+
+        for (unsigned i = 0; i < NumBits; ++i)
+          if (((Mask >> i) & 1) == 1)
+            Bits[i] = (*LHSBits)[i];
+          else
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+
+        return std::make_pair(Interesting, &Bits);
+      }
+      break;
+    case ISD::OR: {
+      const auto &LHSBits = *getValueBits(V.getOperand(0), NumBits).second;
+      const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
+
+      bool AllDisjoint = true;
+      for (unsigned i = 0; i < NumBits; ++i)
+        if (LHSBits[i].isZero())
+          Bits[i] = RHSBits[i];
+        else if (RHSBits[i].isZero())
+          Bits[i] = LHSBits[i];
+        else {
+          AllDisjoint = false;
+          break;
+        }
+
+      if (!AllDisjoint)
+        break;
+
+      return std::make_pair(Interesting = true, &Bits);
+    }
+    }
+
+    for (unsigned i = 0; i < NumBits; ++i)
+      Bits[i] = ValueBit(V, i);
+
+    return std::make_pair(Interesting = false, &Bits);
+  }
+
+  // For each value (except the constant ones), compute the left-rotate amount
+  // to get it from its original to final position.
+  void computeRotationAmounts() {
+    HasZeros = false;
+    RLAmt.resize(Bits.size());
+    for (unsigned i = 0; i < Bits.size(); ++i)
+      if (Bits[i].hasValue()) {
+        unsigned VBI = Bits[i].getValueBitIndex();
+        if (i >= VBI)
+          RLAmt[i] = i - VBI;
+        else
+          RLAmt[i] = Bits.size() - (VBI - i);
+      } else if (Bits[i].isZero()) {
+        HasZeros = true;
+        RLAmt[i] = UINT32_MAX;
+      } else {
+        llvm_unreachable("Unknown value bit type");
+      }
+  }
+
+  // Collect groups of consecutive bits with the same underlying value and
+  // rotation factor. If we're doing late masking, we ignore zeros, otherwise
+  // they break up groups.
+  void collectBitGroups(bool LateMask) {
+    BitGroups.clear();
+
+    unsigned LastRLAmt = RLAmt[0];
+    SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
+    unsigned LastGroupStartIdx = 0;
+    for (unsigned i = 1; i < Bits.size(); ++i) {
+      unsigned ThisRLAmt = RLAmt[i];
+      SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
+      if (LateMask && !ThisValue) {
+        ThisValue = LastValue;
+        ThisRLAmt = LastRLAmt;
+        // If we're doing late masking, then the first bit group always starts
+        // at zero (even if the first bits were zero).
+        if (BitGroups.empty())
+          LastGroupStartIdx = 0;
+      }
+
+      // If this bit has the same underlying value and the same rotate factor as
+      // the last one, then they're part of the same group.
+      if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
+        continue;
+
+      if (LastValue.getNode())
+        BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+                                     i-1));
+      LastRLAmt = ThisRLAmt;
+      LastValue = ThisValue;
+      LastGroupStartIdx = i;
+    }
+    if (LastValue.getNode())
+      BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
+                                   Bits.size()-1));
+
+    if (BitGroups.empty())
+      return;
+
+    // We might be able to combine the first and last groups.
+    if (BitGroups.size() > 1) {
+      // If the first and last groups are the same, then remove the first group
+      // in favor of the last group, making the ending index of the last group
+      // equal to the ending index of the to-be-removed first group.
+      if (BitGroups[0].StartIdx == 0 &&
+          BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 &&
+          BitGroups[0].V == BitGroups[BitGroups.size()-1].V &&
+          BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) {
+        DEBUG(dbgs() << "\tcombining final bit group with initial one\n");
+        BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx;
+        BitGroups.erase(BitGroups.begin());
+      }
+    }
+  }
+
+  // Take all (SDValue, RLAmt) pairs and sort them by the number of groups
+  // associated with each. If there is a degeneracy, pick the one that occurs
+  // first (in the final value).
+  void collectValueRotInfo() {
+    ValueRots.clear();
+
+    for (auto &BG : BitGroups) {
+      unsigned RLAmtKey = BG.RLAmt + (BG.Repl32 ? 64 : 0);
+      ValueRotInfo &VRI = ValueRots[std::make_pair(BG.V, RLAmtKey)];
+      VRI.V = BG.V;
+      VRI.RLAmt = BG.RLAmt;
+      VRI.Repl32 = BG.Repl32;
+      VRI.NumGroups += 1;
+      VRI.FirstGroupStartIdx = std::min(VRI.FirstGroupStartIdx, BG.StartIdx);
+    }
+
+    // Now that we've collected the various ValueRotInfo instances, we need to
+    // sort them.
+    ValueRotsVec.clear();
+    for (auto &I : ValueRots) {
+      ValueRotsVec.push_back(I.second);
+    }
+    std::sort(ValueRotsVec.begin(), ValueRotsVec.end());
+  }
+
+  // In 64-bit mode, rlwinm and friends have a rotation operator that
+  // replicates the low-order 32 bits into the high-order 32-bits. The mask
+  // indices of these instructions can only be in the lower 32 bits, so they
+  // can only represent some 64-bit bit groups. However, when they can be used,
+  // the 32-bit replication can be used to represent, as a single bit group,
+  // otherwise separate bit groups. We'll convert to replicated-32-bit bit
+  // groups when possible. Returns true if any of the bit groups were
+  // converted.
+  void assignRepl32BitGroups() {
+    // If we have bits like this:
+    //
+    // Indices:    15 14 13 12 11 10 9 8  7  6  5  4  3  2  1  0
+    // V bits: ... 7  6  5  4  3  2  1 0 31 30 29 28 27 26 25 24
+    // Groups:    |      RLAmt = 8      |      RLAmt = 40       |
+    //
+    // But, making use of a 32-bit operation that replicates the low-order 32
+    // bits into the high-order 32 bits, this can be one bit group with a RLAmt
+    // of 8.
+
+    auto IsAllLow32 = [this](BitGroup & BG) {
+      if (BG.StartIdx <= BG.EndIdx) {
+        for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+      } else {
+        for (unsigned i = BG.StartIdx; i < Bits.size(); ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+        for (unsigned i = 0; i <= BG.EndIdx; ++i) {
+          if (!Bits[i].hasValue())
+            continue;
+          if (Bits[i].getValueBitIndex() >= 32)
+            return false;
+        }
+      }
+
+      return true;
+    };
+
+    for (auto &BG : BitGroups) {
+      if (BG.StartIdx < 32 && BG.EndIdx < 32) {
+        if (IsAllLow32(BG)) {
+          if (BG.RLAmt >= 32) {
+            BG.RLAmt -= 32;
+            BG.Repl32CR = true;
+          }
+
+          BG.Repl32 = true;
+
+          DEBUG(dbgs() << "\t32-bit replicated bit group for " <<
+                          BG.V.getNode() << " RLAmt = " << BG.RLAmt <<
+                          " [" << BG.StartIdx << ", " << BG.EndIdx << "]\n");
+        }
+      }
+    }
+
+    // Now walk through the bit groups, consolidating where possible.
+    for (auto I = BitGroups.begin(); I != BitGroups.end();) {
+      // We might want to remove this bit group by merging it with the previous
+      // group (which might be the ending group).
+      auto IP = (I == BitGroups.begin()) ?
+                std::prev(BitGroups.end()) : std::prev(I);
+      if (I->Repl32 && IP->Repl32 && I->V == IP->V && I->RLAmt == IP->RLAmt &&
+          I->StartIdx == (IP->EndIdx + 1) % 64 && I != IP) {
+
+        DEBUG(dbgs() << "\tcombining 32-bit replicated bit group for " <<
+                        I->V.getNode() << " RLAmt = " << I->RLAmt <<
+                        " [" << I->StartIdx << ", " << I->EndIdx <<
+                        "] with group with range [" <<
+                        IP->StartIdx << ", " << IP->EndIdx << "]\n");
+
+        IP->EndIdx = I->EndIdx;
+        IP->Repl32CR = IP->Repl32CR || I->Repl32CR;
+        IP->Repl32Coalesced = true;
+        I = BitGroups.erase(I);
+        continue;
+      } else {
+        // There is a special case worth handling: If there is a single group
+        // covering the entire upper 32 bits, and it can be merged with both
+        // the next and previous groups (which might be the same group), then
+        // do so. If it is the same group (so there will be only one group in
+        // total), then we need to reverse the order of the range so that it
+        // covers the entire 64 bits.
+        if (I->StartIdx == 32 && I->EndIdx == 63) {
+          assert(std::next(I) == BitGroups.end() &&
+                 "bit group ends at index 63 but there is another?");
+          auto IN = BitGroups.begin();
+
+          if (IP->Repl32 && IN->Repl32 && I->V == IP->V && I->V == IN->V &&
+              (I->RLAmt % 32) == IP->RLAmt && (I->RLAmt % 32) == IN->RLAmt &&
+              IP->EndIdx == 31 && IN->StartIdx == 0 && I != IP &&
+              IsAllLow32(*I)) {
+
+            DEBUG(dbgs() << "\tcombining bit group for " <<
+                            I->V.getNode() << " RLAmt = " << I->RLAmt <<
+                            " [" << I->StartIdx << ", " << I->EndIdx <<
+                            "] with 32-bit replicated groups with ranges [" <<
+                            IP->StartIdx << ", " << IP->EndIdx << "] and [" <<
+                            IN->StartIdx << ", " << IN->EndIdx << "]\n");
+
+            if (IP == IN) {
+              // There is only one other group; change it to cover the whole
+              // range (backward, so that it can still be Repl32 but cover the
+              // whole 64-bit range).
+              IP->StartIdx = 31;
+              IP->EndIdx = 30;
+              IP->Repl32CR = IP->Repl32CR || I->RLAmt >= 32;
+              IP->Repl32Coalesced = true;
+              I = BitGroups.erase(I);
+            } else {
+              // There are two separate groups, one before this group and one
+              // after us (at the beginning). We're going to remove this group,
+              // but also the group at the very beginning.
+              IP->EndIdx = IN->EndIdx;
+              IP->Repl32CR = IP->Repl32CR || IN->Repl32CR || I->RLAmt >= 32;
+              IP->Repl32Coalesced = true;
+              I = BitGroups.erase(I);
+              BitGroups.erase(BitGroups.begin());
+            }
+
+            // This must be the last group in the vector (and we might have
+            // just invalidated the iterator above), so break here.
+            break;
+          }
+        }
+      }
+
+      ++I;
+    }
+  }
+
+  SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+    return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+  }
+
+  uint64_t getZerosMask() {
+    uint64_t Mask = 0;
+    for (unsigned i = 0; i < Bits.size(); ++i) {
+      if (Bits[i].hasValue())
+        continue;
+      Mask |= (UINT64_C(1) << i);
+    }
+
+    return ~Mask;
+  }
+
+  // Depending on the number of groups for a particular value, it might be
+  // better to rotate, mask explicitly (using andi/andis), and then or the
+  // result. Select this part of the result first.
+  void SelectAndParts32(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
+    if (BPermRewriterNoMasking)
+      return;
+
+    for (ValueRotInfo &VRI : ValueRotsVec) {
+      unsigned Mask = 0;
+      for (unsigned i = 0; i < Bits.size(); ++i) {
+        if (!Bits[i].hasValue() || Bits[i].getValue() != VRI.V)
+          continue;
+        if (RLAmt[i] != VRI.RLAmt)
+          continue;
+        Mask |= (1u << i);
+      }
+
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+      assert((ANDIMask != 0 || ANDISMask != 0) &&
+             "No set bits in mask for value bit groups");
+      bool NeedsRotate = VRI.RLAmt != 0;
+
+      // We're trying to minimize the number of instructions. If we have one
+      // group, using one of andi/andis can break even.  If we have three
+      // groups, we can use both andi and andis and break even (to use both
+      // andi and andis we also need to or the results together). We need four
+      // groups if we also need to rotate. To use andi/andis we need to do more
+      // than break even because rotate-and-mask instructions tend to be easier
+      // to schedule.
+
+      // FIXME: We've biased here against using andi/andis, which is right for
+      // POWER cores, but not optimal everywhere. For example, on the A2,
+      // andi/andis have single-cycle latency whereas the rotate-and-mask
+      // instructions take two cycles, and it would be better to bias toward
+      // andi/andis in break-even cases.
+
+      unsigned NumAndInsts = (unsigned) NeedsRotate +
+                             (unsigned) (ANDIMask != 0) +
+                             (unsigned) (ANDISMask != 0) +
+                             (unsigned) (ANDIMask != 0 && ANDISMask != 0) +
+                             (unsigned) (bool) Res;
+
+      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+                      " RL: " << VRI.RLAmt << ":" <<
+                      "\n\t\t\tisel using masking: " << NumAndInsts <<
+                      " using rotates: " << VRI.NumGroups << "\n");
+
+      if (NumAndInsts >= VRI.NumGroups)
+        continue;
+
+      DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+      if (InstCnt) *InstCnt += NumAndInsts;
+
+      SDValue VRot;
+      if (VRI.RLAmt) {
+        SDValue Ops[] =
+          { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl),
+            getI32Imm(31, dl) };
+        VRot = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                              Ops), 0);
+      } else {
+        VRot = VRI.V;
+      }
+
+      SDValue ANDIVal, ANDISVal;
+      if (ANDIMask != 0)
+        ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+                            VRot, getI32Imm(ANDIMask, dl)), 0);
+      if (ANDISMask != 0)
+        ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+                             VRot, getI32Imm(ANDISMask, dl)), 0);
+
+      SDValue TotalVal;
+      if (!ANDIVal)
+        TotalVal = ANDISVal;
+      else if (!ANDISVal)
+        TotalVal = ANDIVal;
+      else
+        TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                             ANDIVal, ANDISVal), 0);
+
+      if (!Res)
+        Res = TotalVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                        Res, TotalVal), 0);
+
+      // Now, remove all groups with this underlying value and rotation
+      // factor.
+      eraseMatchingBitGroups([VRI](const BitGroup &BG) {
+        return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt;
+      });
+    }
+  }
+
+  // Instruction selection for the 32-bit case.
+  SDNode *Select32(SDNode *N, bool LateMask, unsigned *InstCnt) {
+    SDLoc dl(N);
+    SDValue Res;
+
+    if (InstCnt) *InstCnt = 0;
+
+    // Take care of cases that should use andi/andis first.
+    SelectAndParts32(dl, Res, InstCnt);
+
+    // If we've not yet selected a 'starting' instruction, and we have no zeros
+    // to fill in, select the (Value, RLAmt) with the highest priority (largest
+    // number of groups), and start with this rotated value.
+    if ((!HasZeros || LateMask) && !Res) {
+      ValueRotInfo &VRI = ValueRotsVec[0];
+      if (VRI.RLAmt) {
+        if (InstCnt) *InstCnt += 1;
+        SDValue Ops[] =
+          { VRI.V, getI32Imm(VRI.RLAmt, dl), getI32Imm(0, dl),
+            getI32Imm(31, dl) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops),
+                      0);
+      } else {
+        Res = VRI.V;
+      }
+
+      // Now, remove all groups with this underlying value and rotation factor.
+      eraseMatchingBitGroups([VRI](const BitGroup &BG) {
+        return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt;
+      });
+    }
+
+    if (InstCnt) *InstCnt += BitGroups.size();
+
+    // Insert the other groups (one at a time).
+    for (auto &BG : BitGroups) {
+      if (!Res) {
+        SDValue Ops[] =
+          { BG.V, getI32Imm(BG.RLAmt, dl),
+            getI32Imm(Bits.size() - BG.EndIdx - 1, dl),
+            getI32Imm(Bits.size() - BG.StartIdx - 1, dl) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+      } else {
+        SDValue Ops[] =
+          { Res, BG.V, getI32Imm(BG.RLAmt, dl),
+              getI32Imm(Bits.size() - BG.EndIdx - 1, dl),
+            getI32Imm(Bits.size() - BG.StartIdx - 1, dl) };
+        Res = SDValue(CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops), 0);
+      }
+    }
+
+    if (LateMask) {
+      unsigned Mask = (unsigned) getZerosMask();
+
+      unsigned ANDIMask = (Mask & UINT16_MAX), ANDISMask = Mask >> 16;
+      assert((ANDIMask != 0 || ANDISMask != 0) &&
+             "No set bits in zeros mask?");
+
+      if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+                               (unsigned) (ANDISMask != 0) +
+                               (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+      SDValue ANDIVal, ANDISVal;
+      if (ANDIMask != 0)
+        ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32,
+                            Res, getI32Imm(ANDIMask, dl)), 0);
+      if (ANDISMask != 0)
+        ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32,
+                             Res, getI32Imm(ANDISMask, dl)), 0);
+
+      if (!ANDIVal)
+        Res = ANDISVal;
+      else if (!ANDISVal)
+        Res = ANDIVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR, dl, MVT::i32,
+                        ANDIVal, ANDISVal), 0);
+    }
+
+    return Res.getNode();
+  }
+
+  unsigned SelectRotMask64Count(unsigned RLAmt, bool Repl32,
+                                unsigned MaskStart, unsigned MaskEnd,
+                                bool IsIns) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (Repl32)
+      return 1;
+
+    if ((!IsIns && (InstMaskEnd == 63 || InstMaskStart == 0)) ||
+        InstMaskEnd == 63 - RLAmt)
+      return 1;
+
+    return 2;
+  }
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce one if it is available.
+  SDValue SelectRotMask64(SDValue V, const SDLoc &dl, unsigned RLAmt,
+                          bool Repl32, unsigned MaskStart, unsigned MaskEnd,
+                          unsigned *InstCnt = nullptr) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (InstCnt) *InstCnt += 1;
+
+    if (Repl32) {
+      // This rotation amount assumes that the lower 32 bits of the quantity
+      // are replicated in the high 32 bits by the rotation operator (which is
+      // done by rlwinm and friends).
+      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+      assert(InstMaskEnd   >= 32 && "Mask cannot end out of range");
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
+          getI32Imm(InstMaskEnd - 32, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLWINM8, dl, MVT::i64,
+                                            Ops), 0);
+    }
+
+    if (InstMaskEnd == 63) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Ops), 0);
+    }
+
+    if (InstMaskStart == 0) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskEnd, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Ops), 0);
+    }
+
+    if (InstMaskEnd == 63 - RLAmt) {
+      SDValue Ops[] =
+        { V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, Ops), 0);
+    }
+
+    // We cannot do this with a single instruction, so we'll use two. The
+    // problem is that we're not free to choose both a rotation amount and mask
+    // start and end independently. We can choose an arbitrary mask start and
+    // end, but then the rotation amount is fixed. Rotation, however, can be
+    // inverted, and so by applying an "inverse" rotation first, we can get the
+    // desired result.
+    if (InstCnt) *InstCnt += 1;
+
+    // The rotation mask for the second instruction must be MaskStart.
+    unsigned RLAmt2 = MaskStart;
+    // The first instruction must rotate V so that the overall rotation amount
+    // is RLAmt.
+    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+    if (RLAmt1)
+      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+    return SelectRotMask64(V, dl, RLAmt2, false, MaskStart, MaskEnd);
+  }
+
+  // For 64-bit values, not all combinations of rotates and masks are
+  // available. Produce a rotate-mask-and-insert if one is available.
+  SDValue SelectRotMaskIns64(SDValue Base, SDValue V, const SDLoc &dl,
+                             unsigned RLAmt, bool Repl32, unsigned MaskStart,
+                             unsigned MaskEnd, unsigned *InstCnt = nullptr) {
+    // In the notation used by the instructions, 'start' and 'end' are reversed
+    // because bits are counted from high to low order.
+    unsigned InstMaskStart = 64 - MaskEnd - 1,
+             InstMaskEnd   = 64 - MaskStart - 1;
+
+    if (InstCnt) *InstCnt += 1;
+
+    if (Repl32) {
+      // This rotation amount assumes that the lower 32 bits of the quantity
+      // are replicated in the high 32 bits by the rotation operator (which is
+      // done by rlwinm and friends).
+      assert(InstMaskStart >= 32 && "Mask cannot start out of range");
+      assert(InstMaskEnd   >= 32 && "Mask cannot end out of range");
+      SDValue Ops[] =
+        { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart - 32, dl),
+          getI32Imm(InstMaskEnd - 32, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLWIMI8, dl, MVT::i64,
+                                            Ops), 0);
+    }
+
+    if (InstMaskEnd == 63 - RLAmt) {
+      SDValue Ops[] =
+        { Base, V, getI32Imm(RLAmt, dl), getI32Imm(InstMaskStart, dl) };
+      return SDValue(CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops), 0);
+    }
+
+    // We cannot do this with a single instruction, so we'll use two. The
+    // problem is that we're not free to choose both a rotation amount and mask
+    // start and end independently. We can choose an arbitrary mask start and
+    // end, but then the rotation amount is fixed. Rotation, however, can be
+    // inverted, and so by applying an "inverse" rotation first, we can get the
+    // desired result.
+    if (InstCnt) *InstCnt += 1;
+
+    // The rotation mask for the second instruction must be MaskStart.
+    unsigned RLAmt2 = MaskStart;
+    // The first instruction must rotate V so that the overall rotation amount
+    // is RLAmt.
+    unsigned RLAmt1 = (64 + RLAmt - RLAmt2) % 64;
+    if (RLAmt1)
+      V = SelectRotMask64(V, dl, RLAmt1, false, 0, 63);
+    return SelectRotMaskIns64(Base, V, dl, RLAmt2, false, MaskStart, MaskEnd);
+  }
+
+  void SelectAndParts64(const SDLoc &dl, SDValue &Res, unsigned *InstCnt) {
+    if (BPermRewriterNoMasking)
+      return;
+
+    // The idea here is the same as in the 32-bit version, but with additional
+    // complications from the fact that Repl32 might be true. Because we
+    // aggressively convert bit groups to Repl32 form (which, for small
+    // rotation factors, involves no other change), and then coalesce, it might
+    // be the case that a single 64-bit masking operation could handle both
+    // some Repl32 groups and some non-Repl32 groups. If converting to Repl32
+    // form allowed coalescing, then we must use a 32-bit rotaton in order to
+    // completely capture the new combined bit group.
+
+    for (ValueRotInfo &VRI : ValueRotsVec) {
+      uint64_t Mask = 0;
+
+      // We need to add to the mask all bits from the associated bit groups.
+      // If Repl32 is false, we need to add bits from bit groups that have
+      // Repl32 true, but are trivially convertable to Repl32 false. Such a
+      // group is trivially convertable if it overlaps only with the lower 32
+      // bits, and the group has not been coalesced.
+      auto MatchingBG = [VRI](const BitGroup &BG) {
+        if (VRI.V != BG.V)
+          return false;
+
+        unsigned EffRLAmt = BG.RLAmt;
+        if (!VRI.Repl32 && BG.Repl32) {
+          if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx <= BG.EndIdx &&
+              !BG.Repl32Coalesced) {
+            if (BG.Repl32CR)
+              EffRLAmt += 32;
+          } else {
+            return false;
+          }
+        } else if (VRI.Repl32 != BG.Repl32) {
+          return false;
+        }
+
+        return VRI.RLAmt == EffRLAmt;
+      };
+
+      for (auto &BG : BitGroups) {
+        if (!MatchingBG(BG))
+          continue;
+
+        if (BG.StartIdx <= BG.EndIdx) {
+          for (unsigned i = BG.StartIdx; i <= BG.EndIdx; ++i)
+            Mask |= (UINT64_C(1) << i);
+        } else {
+          for (unsigned i = BG.StartIdx; i < Bits.size(); ++i)
+            Mask |= (UINT64_C(1) << i);
+          for (unsigned i = 0; i <= BG.EndIdx; ++i)
+            Mask |= (UINT64_C(1) << i);
+        }
+      }
+
+      // We can use the 32-bit andi/andis technique if the mask does not
+      // require any higher-order bits. This can save an instruction compared
+      // to always using the general 64-bit technique.
+      bool Use32BitInsts = isUInt<32>(Mask);
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX),
+               ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+      bool NeedsRotate = VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask));
+
+      unsigned NumAndInsts = (unsigned) NeedsRotate +
+                             (unsigned) (bool) Res;
+      if (Use32BitInsts)
+        NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) +
+                       (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+      else
+        NumAndInsts += getInt64Count(Mask) + /* and */ 1;
+
+      unsigned NumRLInsts = 0;
+      bool FirstBG = true;
+      bool MoreBG = false;
+      for (auto &BG : BitGroups) {
+        if (!MatchingBG(BG)) {
+          MoreBG = true;
+          continue;
+        }
+        NumRLInsts +=
+          SelectRotMask64Count(BG.RLAmt, BG.Repl32, BG.StartIdx, BG.EndIdx,
+                               !FirstBG);
+        FirstBG = false;
+      }
+
+      DEBUG(dbgs() << "\t\trotation groups for " << VRI.V.getNode() <<
+                      " RL: " << VRI.RLAmt << (VRI.Repl32 ? " (32):" : ":") <<
+                      "\n\t\t\tisel using masking: " << NumAndInsts <<
+                      " using rotates: " << NumRLInsts << "\n");
+
+      // When we'd use andi/andis, we bias toward using the rotates (andi only
+      // has a record form, and is cracked on POWER cores). However, when using
+      // general 64-bit constant formation, bias toward the constant form,
+      // because that exposes more opportunities for CSE.
+      if (NumAndInsts > NumRLInsts)
+        continue;
+      // When merging multiple bit groups, instruction or is used.
+      // But when rotate is used, rldimi can inert the rotated value into any
+      // register, so instruction or can be avoided.
+      if ((Use32BitInsts || MoreBG) && NumAndInsts == NumRLInsts)
+        continue;
+
+      DEBUG(dbgs() << "\t\t\t\tusing masking\n");
+
+      if (InstCnt) *InstCnt += NumAndInsts;
+
+      SDValue VRot;
+      // We actually need to generate a rotation if we have a non-zero rotation
+      // factor or, in the Repl32 case, if we care about any of the
+      // higher-order replicated bits. In the latter case, we generate a mask
+      // backward so that it actually includes the entire 64 bits.
+      if (VRI.RLAmt || (VRI.Repl32 && !isUInt<32>(Mask)))
+        VRot = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+                               VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63);
+      else
+        VRot = VRI.V;
+
+      SDValue TotalVal;
+      if (Use32BitInsts) {
+        assert((ANDIMask != 0 || ANDISMask != 0) &&
+               "No set bits in mask when using 32-bit ands for 64-bit value");
+
+        SDValue ANDIVal, ANDISVal;
+        if (ANDIMask != 0)
+          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+                              VRot, getI32Imm(ANDIMask, dl)), 0);
+        if (ANDISMask != 0)
+          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+                               VRot, getI32Imm(ANDISMask, dl)), 0);
+
+        if (!ANDIVal)
+          TotalVal = ANDISVal;
+        else if (!ANDISVal)
+          TotalVal = ANDIVal;
+        else
+          TotalVal = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                               ANDIVal, ANDISVal), 0);
+      } else {
+        TotalVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+        TotalVal =
+          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                         VRot, TotalVal), 0);
+     }
+
+      if (!Res)
+        Res = TotalVal;
+      else
+        Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                                             Res, TotalVal), 0);
+
+      // Now, remove all groups with this underlying value and rotation
+      // factor.
+      eraseMatchingBitGroups(MatchingBG);
+    }
+  }
+
+  // Instruction selection for the 64-bit case.
+  SDNode *Select64(SDNode *N, bool LateMask, unsigned *InstCnt) {
+    SDLoc dl(N);
+    SDValue Res;
+
+    if (InstCnt) *InstCnt = 0;
+
+    // Take care of cases that should use andi/andis first.
+    SelectAndParts64(dl, Res, InstCnt);
+
+    // If we've not yet selected a 'starting' instruction, and we have no zeros
+    // to fill in, select the (Value, RLAmt) with the highest priority (largest
+    // number of groups), and start with this rotated value.
+    if ((!HasZeros || LateMask) && !Res) {
+      // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
+      // groups will come first, and so the VRI representing the largest number
+      // of groups might not be first (it might be the first Repl32 groups).
+      unsigned MaxGroupsIdx = 0;
+      if (!ValueRotsVec[0].Repl32) {
+        for (unsigned i = 0, ie = ValueRotsVec.size(); i < ie; ++i)
+          if (ValueRotsVec[i].Repl32) {
+            if (ValueRotsVec[i].NumGroups > ValueRotsVec[0].NumGroups)
+              MaxGroupsIdx = i;
+            break;
+          }
+      }
+
+      ValueRotInfo &VRI = ValueRotsVec[MaxGroupsIdx];
+      bool NeedsRotate = false;
+      if (VRI.RLAmt) {
+        NeedsRotate = true;
+      } else if (VRI.Repl32) {
+        for (auto &BG : BitGroups) {
+          if (BG.V != VRI.V || BG.RLAmt != VRI.RLAmt ||
+              BG.Repl32 != VRI.Repl32)
+            continue;
+
+          // We don't need a rotate if the bit group is confined to the lower
+          // 32 bits.
+          if (BG.StartIdx < 32 && BG.EndIdx < 32 && BG.StartIdx < BG.EndIdx)
+            continue;
+
+          NeedsRotate = true;
+          break;
+        }
+      }
+
+      if (NeedsRotate)
+        Res = SelectRotMask64(VRI.V, dl, VRI.RLAmt, VRI.Repl32,
+                              VRI.Repl32 ? 31 : 0, VRI.Repl32 ? 30 : 63,
+                              InstCnt);
+      else
+        Res = VRI.V;
+
+      // Now, remove all groups with this underlying value and rotation factor.
+      if (Res)
+        eraseMatchingBitGroups([VRI](const BitGroup &BG) {
+          return BG.V == VRI.V && BG.RLAmt == VRI.RLAmt &&
+                 BG.Repl32 == VRI.Repl32;
+        });
+    }
+
+    // Because 64-bit rotates are more flexible than inserts, we might have a
+    // preference regarding which one we do first (to save one instruction).
+    if (!Res)
+      for (auto I = BitGroups.begin(), IE = BitGroups.end(); I != IE; ++I) {
+        if (SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+                                false) <
+            SelectRotMask64Count(I->RLAmt, I->Repl32, I->StartIdx, I->EndIdx,
+                                true)) {
+          if (I != BitGroups.begin()) {
+            BitGroup BG = *I;
+            BitGroups.erase(I);
+            BitGroups.insert(BitGroups.begin(), BG);
+          }
+
+          break;
+        }
+      }
+
+    // Insert the other groups (one at a time).
+    for (auto &BG : BitGroups) {
+      if (!Res)
+        Res = SelectRotMask64(BG.V, dl, BG.RLAmt, BG.Repl32, BG.StartIdx,
+                              BG.EndIdx, InstCnt);
+      else
+        Res = SelectRotMaskIns64(Res, BG.V, dl, BG.RLAmt, BG.Repl32,
+                                 BG.StartIdx, BG.EndIdx, InstCnt);
+    }
+
+    if (LateMask) {
+      uint64_t Mask = getZerosMask();
+
+      // We can use the 32-bit andi/andis technique if the mask does not
+      // require any higher-order bits. This can save an instruction compared
+      // to always using the general 64-bit technique.
+      bool Use32BitInsts = isUInt<32>(Mask);
+      // Compute the masks for andi/andis that would be necessary.
+      unsigned ANDIMask = (Mask & UINT16_MAX),
+               ANDISMask = (Mask >> 16) & UINT16_MAX;
+
+      if (Use32BitInsts) {
+        assert((ANDIMask != 0 || ANDISMask != 0) &&
+               "No set bits in mask when using 32-bit ands for 64-bit value");
+
+        if (InstCnt) *InstCnt += (unsigned) (ANDIMask != 0) +
+                                 (unsigned) (ANDISMask != 0) +
+                                 (unsigned) (ANDIMask != 0 && ANDISMask != 0);
+
+        SDValue ANDIVal, ANDISVal;
+        if (ANDIMask != 0)
+          ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64,
+                              Res, getI32Imm(ANDIMask, dl)), 0);
+        if (ANDISMask != 0)
+          ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64,
+                               Res, getI32Imm(ANDISMask, dl)), 0);
+
+        if (!ANDIVal)
+          Res = ANDISVal;
+        else if (!ANDISVal)
+          Res = ANDIVal;
+        else
+          Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64,
+                          ANDIVal, ANDISVal), 0);
+      } else {
+        if (InstCnt) *InstCnt += getInt64Count(Mask) + /* and */ 1;
+
+        SDValue MaskVal = SDValue(getInt64(CurDAG, dl, Mask), 0);
+        Res =
+          SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64,
+                                         Res, MaskVal), 0);
+      }
+    }
+
+    return Res.getNode();
+  }
+
+  SDNode *Select(SDNode *N, bool LateMask, unsigned *InstCnt = nullptr) {
+    // Fill in BitGroups.
+    collectBitGroups(LateMask);
+    if (BitGroups.empty())
+      return nullptr;
+
+    // For 64-bit values, figure out when we can use 32-bit instructions.
+    if (Bits.size() == 64)
+      assignRepl32BitGroups();
+
+    // Fill in ValueRotsVec.
+    collectValueRotInfo();
+
+    if (Bits.size() == 32) {
+      return Select32(N, LateMask, InstCnt);
+    } else {
+      assert(Bits.size() == 64 && "Not 64 bits here?");
+      return Select64(N, LateMask, InstCnt);
+    }
+
+    return nullptr;
+  }
+
+  void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) {
+    BitGroups.erase(remove_if(BitGroups, F), BitGroups.end());
+  }
+
+  SmallVector<ValueBit, 64> Bits;
+
+  bool HasZeros;
+  SmallVector<unsigned, 64> RLAmt;
+
+  SmallVector<BitGroup, 16> BitGroups;
+
+  DenseMap<std::pair<SDValue, unsigned>, ValueRotInfo> ValueRots;
+  SmallVector<ValueRotInfo, 16> ValueRotsVec;
+
+  SelectionDAG *CurDAG;
+
+public:
+  BitPermutationSelector(SelectionDAG *DAG)
+    : CurDAG(DAG) {}
+
+  // Here we try to match complex bit permutations into a set of
+  // rotate-and-shift/shift/and/or instructions, using a set of heuristics
+  // known to produce optimial code for common cases (like i32 byte swapping).
+  SDNode *Select(SDNode *N) {
+    Memoizer.clear();
+    auto Result =
+        getValueBits(SDValue(N, 0), N->getValueType(0).getSizeInBits());
+    if (!Result.first)
+      return nullptr;
+    Bits = std::move(*Result.second);
+
+    DEBUG(dbgs() << "Considering bit-permutation-based instruction"
+                    " selection for:    ");
+    DEBUG(N->dump(CurDAG));
+
+    // Fill it RLAmt and set HasZeros.
+    computeRotationAmounts();
+
+    if (!HasZeros)
+      return Select(N, false);
+
+    // We currently have two techniques for handling results with zeros: early
+    // masking (the default) and late masking. Late masking is sometimes more
+    // efficient, but because the structure of the bit groups is different, it
+    // is hard to tell without generating both and comparing the results. With
+    // late masking, we ignore zeros in the resulting value when inserting each
+    // set of bit groups, and then mask in the zeros at the end. With early
+    // masking, we only insert the non-zero parts of the result at every step.
+
+    unsigned InstCnt, InstCntLateMask;
+    DEBUG(dbgs() << "\tEarly masking:\n");
+    SDNode *RN = Select(N, false, &InstCnt);
+    DEBUG(dbgs() << "\t\tisel would use " << InstCnt << " instructions\n");
+
+    DEBUG(dbgs() << "\tLate masking:\n");
+    SDNode *RNLM = Select(N, true, &InstCntLateMask);
+    DEBUG(dbgs() << "\t\tisel would use " << InstCntLateMask <<
+                    " instructions\n");
+
+    if (InstCnt <= InstCntLateMask) {
+      DEBUG(dbgs() << "\tUsing early-masking for isel\n");
+      return RN;
+    }
+
+    DEBUG(dbgs() << "\tUsing late-masking for isel\n");
+    return RNLM;
+  }
+};
+} // anonymous namespace
+
+bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return false;
+
+  if (!UseBitPermRewriter)
+    return false;
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::OR: {
+    BitPermutationSelector BPS(CurDAG);
+    if (SDNode *New = BPS.Select(N)) {
+      ReplaceNode(N, New);
+      return true;
+    }
+    return false;
+  }
+  }
+
+  return false;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                  const SDLoc &dl) {
+  // Always select the LHS.
+  unsigned Opc;
+
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF, dl)),
+                         0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>((int)Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF, dl)),
+                         0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpw cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmplwi cr0,r0,0x5678
+        //   beq cr0,L6
+        SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+                                           getI32Imm(Imm >> 16, dl)), 0);
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+                                              getI32Imm(Imm & 0xFFFF, dl)), 0);
+      }
+      Opc = PPC::CMPLW;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                              getI32Imm(Imm & 0xFFFF, dl)), 0);
+      Opc = PPC::CMPLW;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                              getI32Imm((int)SImm & 0xFFFF,
+                                                        dl)),
+                         0);
+      Opc = PPC::CMPW;
+    }
+  } else if (LHS.getValueType() == MVT::i64) {
+    uint64_t Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt64Immediate(RHS.getNode(), Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF, dl)),
+                         0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF, dl)),
+                         0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpd cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmpldi cr0,r0,0x5678
+        //   beq cr0,L6
+        if (isUInt<32>(Imm)) {
+          SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+                                             getI64Imm(Imm >> 16, dl)), 0);
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+                                                getI64Imm(Imm & 0xFFFF, dl)),
+                         0);
+        }
+      }
+      Opc = PPC::CMPLD;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                              getI64Imm(Imm & 0xFFFF, dl)), 0);
+      Opc = PPC::CMPLD;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                              getI64Imm(SImm & 0xFFFF, dl)),
+                         0);
+      Opc = PPC::CMPD;
+    }
+  } else if (LHS.getValueType() == MVT::f32) {
+    Opc = PPC::FCMPUS;
+  } else {
+    assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+    Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+  }
+  return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+  case ISD::SETOLE:
+  case ISD::SETOGE:
+    llvm_unreachable("Should be lowered by legalize!");
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return PPC::PRED_EQ;
+  case ISD::SETUNE:
+  case ISD::SETNE:  return PPC::PRED_NE;
+  case ISD::SETOLT:
+  case ISD::SETLT:  return PPC::PRED_LT;
+  case ISD::SETULE:
+  case ISD::SETLE:  return PPC::PRED_LE;
+  case ISD::SETOGT:
+  case ISD::SETGT:  return PPC::PRED_GT;
+  case ISD::SETUGE:
+  case ISD::SETGE:  return PPC::PRED_GE;
+  case ISD::SETO:   return PPC::PRED_NU;
+  case ISD::SETUO:  return PPC::PRED_UN;
+    // These two are invalid for floating point.  Assume we have int.
+  case ISD::SETULT: return PPC::PRED_LT;
+  case ISD::SETUGT: return PPC::PRED_GT;
+  }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted.  That is, lt = 0; ge = 0 inverted.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOLT:
+  case ISD::SETLT:  return 0;                  // Bit #0 = SETOLT
+  case ISD::SETOGT:
+  case ISD::SETGT:  return 1;                  // Bit #1 = SETOGT
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return 2;                  // Bit #2 = SETOEQ
+  case ISD::SETUO:  return 3;                  // Bit #3 = SETUO
+  case ISD::SETUGE:
+  case ISD::SETGE:  Invert = true; return 0;   // !Bit #0 = SETUGE
+  case ISD::SETULE:
+  case ISD::SETLE:  Invert = true; return 1;   // !Bit #1 = SETULE
+  case ISD::SETUNE:
+  case ISD::SETNE:  Invert = true; return 2;   // !Bit #2 = SETUNE
+  case ISD::SETO:   Invert = true; return 3;   // !Bit #3 = SETO
+  case ISD::SETUEQ:
+  case ISD::SETOGE:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    llvm_unreachable("Invalid branch code: should be expanded by legalize");
+  // These are invalid for floating point.  Assume integer.
+  case ISD::SETULT: return 0;
+  case ISD::SETUGT: return 1;
+  }
+}
+
+// getVCmpInst: return the vector compare instruction for the specified
+// vector type and condition code. Since this is for altivec specific code,
+// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32).
+static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
+                                bool HasVSX, bool &Swap, bool &Negate) {
+  Swap = false;
+  Negate = false;
+
+  if (VecVT.isFloatingPoint()) {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETLE: CC = ISD::SETGE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break;
+      case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break;
+      case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETOEQ:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPEQDP;
+        break;
+      case ISD::SETGT:
+      case ISD::SETOGT:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGTDP;
+        break;
+      case ISD::SETGE:
+      case ISD::SETOGE:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGEDP;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid floating-point vector compare condition");
+  } else {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETGE: CC = ISD::SETLE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break;
+      case ISD::SETLE: CC = ISD::SETGT; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETUEQ:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPEQUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPEQUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPEQUW;
+        else if (VecVT == MVT::v2i64)
+          return PPC::VCMPEQUD;
+        break;
+      case ISD::SETGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTSB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTSH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTSW;
+        else if (VecVT == MVT::v2i64)
+          return PPC::VCMPGTSD;
+        break;
+      case ISD::SETUGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTUW;
+        else if (VecVT == MVT::v2i64)
+          return PPC::VCMPGTUD;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid integer vector compare condition");
+  }
+}
+
+bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
+  SDLoc dl(N);
+  unsigned Imm;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  EVT PtrVT =
+      CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
+  bool isPPC64 = (PtrVT == MVT::i64);
+
+  if (!PPCSubTarget->useCRBits() &&
+      isInt32Immediate(N->getOperand(1), Imm)) {
+    // We can codegen setcc op, imm very efficiently compared to a brcond.
+    // Check for those cases here.
+    // setcc op, 0
+    if (Imm == 0) {
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ: {
+        Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+        SDValue Ops[] = { Op, getI32Imm(27, dl), getI32Imm(5, dl),
+                          getI32Imm(31, dl) };
+        CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+        return true;
+      }
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        SDValue AD =
+          SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                         Op, getI32Imm(~0U, dl)), 0);
+        CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, AD.getValue(1));
+        return true;
+      }
+      case ISD::SETLT: {
+        SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
+                          getI32Imm(31, dl) };
+        CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+        return true;
+      }
+      case ISD::SETGT: {
+        SDValue T =
+          SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+        T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+        SDValue Ops[] = { T, getI32Imm(1, dl), getI32Imm(31, dl),
+                          getI32Imm(31, dl) };
+        CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+        return true;
+      }
+      }
+    } else if (Imm == ~0U) {        // setcc op, -1
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ:
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(1, dl)), 0);
+        CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                             SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+                                                            MVT::i32,
+                                                            getI32Imm(0, dl)),
+                                     0), Op.getValue(1));
+        return true;
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+        SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(~0U, dl));
+        CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), Op,
+                             SDValue(AD, 1));
+        return true;
+      }
+      case ISD::SETLT: {
+        SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+                                                    getI32Imm(1, dl)), 0);
+        SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+                                                    Op), 0);
+        SDValue Ops[] = { AN, getI32Imm(1, dl), getI32Imm(31, dl),
+                          getI32Imm(31, dl) };
+        CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+        return true;
+      }
+      case ISD::SETGT: {
+        SDValue Ops[] = { Op, getI32Imm(1, dl), getI32Imm(31, dl),
+                          getI32Imm(31, dl) };
+        Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+        CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, getI32Imm(1, dl));
+        return true;
+      }
+      }
+    }
+  }
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Altivec Vector compare instructions do not set any CR register by default and
+  // vector compare operations return the same type as the operands.
+  if (LHS.getValueType().isVector()) {
+    if (PPCSubTarget->hasQPX())
+      return false;
+
+    EVT VecVT = LHS.getValueType();
+    bool Swap, Negate;
+    unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
+                                        PPCSubTarget->hasVSX(), Swap, Negate);
+    if (Swap)
+      std::swap(LHS, RHS);
+
+    EVT ResVT = VecVT.changeVectorElementTypeToInteger();
+    if (Negate) {
+      SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
+      CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
+                           ResVT, VCmp, VCmp);
+      return true;
+    }
+
+    CurDAG->SelectNodeTo(N, VCmpInst, ResVT, LHS, RHS);
+    return true;
+  }
+
+  if (PPCSubTarget->useCRBits())
+    return false;
+
+  bool Inv;
+  unsigned Idx = getCRIdxForSetCC(CC, Inv);
+  SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
+  SDValue IntCR;
+
+  // Force the ccreg into CR7.
+  SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+
+  SDValue InFlag(nullptr, 0);  // Null incoming flag value.
+  CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
+                               InFlag).getValue(1);
+
+  IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+                                         CCReg), 0);
+
+  SDValue Ops[] = { IntCR, getI32Imm((32 - (3 - Idx)) & 31, dl),
+                      getI32Imm(31, dl), getI32Imm(31, dl) };
+  if (!Inv) {
+    CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+    return true;
+  }
+
+  // Get the specified bit.
+  SDValue Tmp =
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
+  CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1, dl));
+  return true;
+}
+
+void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+void PPCDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  // In case any misguided DAG-level optimizations form an ADD with a
+  // TargetConstant operand, crash here instead of miscompiling (by selecting
+  // an r+r add instead of some kind of r+i add).
+  if (N->getOpcode() == ISD::ADD &&
+      N->getOperand(1).getOpcode() == ISD::TargetConstant)
+    llvm_unreachable("Invalid ADD with TargetConstant operand");
+
+  // Try matching complex bit permutations before doing anything else.
+  if (tryBitPermutation(N))
+    return;
+
+  switch (N->getOpcode()) {
+  default: break;
+
+  case ISD::Constant: {
+    if (N->getValueType(0) == MVT::i64) {
+      ReplaceNode(N, getInt64(CurDAG, N));
+      return;
+    }
+    break;
+  }
+
+  case ISD::SETCC: {
+    if (trySETCC(N))
+      return;
+    break;
+  }
+  case PPCISD::GlobalBaseReg:
+    ReplaceNode(N, getGlobalBaseReg());
+    return;
+
+  case ISD::FrameIndex:
+    selectFrameIndex(N, N);
+    return;
+
+  case PPCISD::MFOCRF: {
+    SDValue InFlag = N->getOperand(1);
+    ReplaceNode(N, CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+                                          N->getOperand(0), InFlag));
+    return;
+  }
+
+  case PPCISD::READ_TIME_BASE: {
+    ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
+                                          MVT::Other, N->getOperand(0)));
+    return;
+  }
+
+  case PPCISD::SRA_ADDZE: {
+    SDValue N0 = N->getOperand(0);
+    SDValue ShiftAmt =
+      CurDAG->getTargetConstant(*cast<ConstantSDNode>(N->getOperand(1))->
+                                  getConstantIntValue(), dl,
+                                  N->getValueType(0));
+    if (N->getValueType(0) == MVT::i64) {
+      SDNode *Op =
+        CurDAG->getMachineNode(PPC::SRADI, dl, MVT::i64, MVT::Glue,
+                               N0, ShiftAmt);
+      CurDAG->SelectNodeTo(N, PPC::ADDZE8, MVT::i64, SDValue(Op, 0),
+                           SDValue(Op, 1));
+      return;
+    } else {
+      assert(N->getValueType(0) == MVT::i32 &&
+             "Expecting i64 or i32 in PPCISD::SRA_ADDZE");
+      SDNode *Op =
+        CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                               N0, ShiftAmt);
+      CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, SDValue(Op, 0),
+                           SDValue(Op, 1));
+      return;
+    }
+  }
+
+  case ISD::LOAD: {
+    // Handle preincrement loads.
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT LoadedVT = LD->getMemoryVT();
+
+    // Normal loads are handled by code generated from the .td file.
+    if (LD->getAddressingMode() != ISD::PRE_INC)
+      break;
+
+    SDValue Offset = LD->getOffset();
+    if (Offset.getOpcode() == ISD::TargetConstant ||
+        Offset.getOpcode() == ISD::TargetGlobalAddress) {
+
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDU; break;
+          case MVT::f32: Opcode = PPC::LFSU; break;
+          case MVT::i32: Opcode = PPC::LWZU; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDU; break;
+          case MVT::i32: Opcode = PPC::LWZU8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Offset, Base, Chain };
+      SDNode *MN = CurDAG->getMachineNode(
+          Opcode, dl, LD->getValueType(0),
+          PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+      transferMemOperands(N, MN);
+      ReplaceNode(N, MN);
+      return;
+    } else {
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX
+          case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX
+          case MVT::f64: Opcode = PPC::LFDUX; break;
+          case MVT::f32: Opcode = PPC::LFSUX; break;
+          case MVT::i32: Opcode = PPC::LWZUX; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
+               "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDUX; break;
+          case MVT::i32: Opcode = isSExt ? PPC::LWAUX  : PPC::LWZUX8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Base, Offset, Chain };
+      SDNode *MN = CurDAG->getMachineNode(
+          Opcode, dl, LD->getValueType(0),
+          PPCLowering->getPointerTy(CurDAG->getDataLayout()), MVT::Other, Ops);
+      transferMemOperands(N, MN);
+      ReplaceNode(N, MN);
+      return;
+    }
+  }
+
+  case ISD::AND: {
+    unsigned Imm, Imm2, SH, MB, ME;
+    uint64_t Imm64;
+
+    // If this is an and of a value rotated between 0 and 31 bits and then and'd
+    // with a mask, emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+      SDValue Val = N->getOperand(0).getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
+                        getI32Imm(ME, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      return;
+    }
+    // If this is just a masked value where the input is not handled above, and
+    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRunOfOnes(Imm, MB, ME) &&
+        N->getOperand(0).getOpcode() != ISD::ROTL) {
+      SDValue Val = N->getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
+                        getI32Imm(ME, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      return;
+    }
+    // If this is a 64-bit zero-extension mask, emit rldicl.
+    if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+        isMask_64(Imm64)) {
+      SDValue Val = N->getOperand(0);
+      MB = 64 - countTrailingOnes(Imm64);
+      SH = 0;
+
+      if (Val.getOpcode() == ISD::ANY_EXTEND) {
+        auto Op0 = Val.getOperand(0);
+        if ( Op0.getOpcode() == ISD::SRL &&
+           isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
+
+           auto ResultType = Val.getNode()->getValueType(0);
+           auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl,
+                                               ResultType);
+           SDValue IDVal (ImDef, 0);
+
+           Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl,
+                         ResultType, IDVal, Op0.getOperand(0),
+                         getI32Imm(1, dl)), 0);
+           SH = 64 - Imm;
+        }
+      }
+
+      // If the operand is a logical right shift, we can fold it into this
+      // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+      // for n <= mb. The right shift is really a left rotate followed by a
+      // mask, and this mask is a more-restrictive sub-mask of the mask implied
+      // by the shift.
+      if (Val.getOpcode() == ISD::SRL &&
+          isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+        assert(Imm < 64 && "Illegal shift amount");
+        Val = Val.getOperand(0);
+        SH = 64 - Imm;
+      }
+
+      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+      return;
+    }
+    // AND X, 0 -> 0, not "rlwinm 32".
+    if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+      ReplaceUses(SDValue(N, 0), N->getOperand(1));
+      return;
+    }
+    // ISD::OR doesn't get all the bitfield insertion fun.
+    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
+    // bitfield insert.
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        N->getOperand(0).getOpcode() == ISD::OR &&
+        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+      // The idea here is to check whether this is equivalent to:
+      //   (c1 & m) | (x & ~m)
+      // where m is a run-of-ones mask. The logic here is that, for each bit in
+      // c1 and c2:
+      //  - if both are 1, then the output will be 1.
+      //  - if both are 0, then the output will be 0.
+      //  - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
+      //    come from x.
+      //  - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
+      //    be 0.
+      //  If that last condition is never the case, then we can form m from the
+      //  bits that are the same between c1 and c2.
+      unsigned MB, ME;
+      if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) {
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                            N->getOperand(0).getOperand(1),
+                            getI32Imm(0, dl), getI32Imm(MB, dl),
+                            getI32Imm(ME, dl) };
+        ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+        return;
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::OR: {
+    if (N->getValueType(0) == MVT::i32)
+      if (tryBitfieldInsert(N))
+        return;
+
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm)) {
+      APInt LHSKnownZero, LHSKnownOne;
+      CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      // If this is equivalent to an add, then we can fold it with the
+      // FrameIndex calculation.
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
+        selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+        return;
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::ADD: {
+    short Imm;
+    if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
+        isIntS16Immediate(N->getOperand(1), Imm)) {
+      selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
+      return;
+    }
+
+    break;
+  }
+  case ISD::SHL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH, dl), getI32Imm(MB, dl),
+                          getI32Imm(ME, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      return;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SRL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH, dl), getI32Imm(MB, dl),
+                          getI32Imm(ME, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+      return;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  // FIXME: Remove this once the ANDI glue bug is fixed:
+  case PPCISD::ANDIo_1_EQ_BIT:
+  case PPCISD::ANDIo_1_GT_BIT: {
+    if (!ANDIGlueBug)
+      break;
+
+    EVT InVT = N->getOperand(0).getValueType();
+    assert((InVT == MVT::i64 || InVT == MVT::i32) &&
+           "Invalid input type for ANDIo_1_EQ_BIT");
+
+    unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo;
+    SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue,
+                                        N->getOperand(0),
+                                        CurDAG->getTargetConstant(1, dl, InVT)),
+                 0);
+    SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+    SDValue SRIdxVal =
+      CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ?
+                                PPC::sub_eq : PPC::sub_gt, dl, MVT::i32);
+
+    CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg,
+                         SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */);
+    return;
+  }
+  case ISD::SELECT_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+    EVT PtrVT =
+        CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
+    bool isPPC64 = (PtrVT == MVT::i64);
+
+    // If this is a select of i1 operands, we'll pattern match it.
+    if (PPCSubTarget->useCRBits() &&
+        N->getOperand(0).getValueType() == MVT::i1)
+      break;
+
+    // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
+    if (!isPPC64)
+      if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+        if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+          if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+            if (N1C->isNullValue() && N3C->isNullValue() &&
+                N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+                // FIXME: Implement this optzn for PPC64.
+                N->getValueType(0) == MVT::i32) {
+              SDNode *Tmp =
+                CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                       N->getOperand(0), getI32Imm(~0U, dl));
+              CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(Tmp, 0),
+                                   N->getOperand(0), SDValue(Tmp, 1));
+              return;
+            }
+
+    SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+
+    if (N->getValueType(0) == MVT::i1) {
+      // An i1 select is: (c & t) | (!c & f).
+      bool Inv;
+      unsigned Idx = getCRIdxForSetCC(CC, Inv);
+
+      unsigned SRI;
+      switch (Idx) {
+      default: llvm_unreachable("Invalid CC index");
+      case 0: SRI = PPC::sub_lt; break;
+      case 1: SRI = PPC::sub_gt; break;
+      case 2: SRI = PPC::sub_eq; break;
+      case 3: SRI = PPC::sub_un; break;
+      }
+
+      SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg);
+
+      SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1,
+                                              CCBit, CCBit), 0);
+      SDValue C =    Inv ? NotCCBit : CCBit,
+              NotC = Inv ? CCBit    : NotCCBit;
+
+      SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+                                           C, N->getOperand(2)), 0);
+      SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1,
+                                              NotC, N->getOperand(3)), 0);
+
+      CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF);
+      return;
+    }
+
+    unsigned BROpc = getPredicateForSetCC(CC);
+
+    unsigned SelectCCOp;
+    if (N->getValueType(0) == MVT::i32)
+      SelectCCOp = PPC::SELECT_CC_I4;
+    else if (N->getValueType(0) == MVT::i64)
+      SelectCCOp = PPC::SELECT_CC_I8;
+    else if (N->getValueType(0) == MVT::f32)
+      if (PPCSubTarget->hasP8Vector())
+        SelectCCOp = PPC::SELECT_CC_VSSRC;
+      else
+        SelectCCOp = PPC::SELECT_CC_F4;
+    else if (N->getValueType(0) == MVT::f64)
+      if (PPCSubTarget->hasVSX())
+        SelectCCOp = PPC::SELECT_CC_VSFRC;
+      else
+        SelectCCOp = PPC::SELECT_CC_F8;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
+      SelectCCOp = PPC::SELECT_CC_QFRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
+      SelectCCOp = PPC::SELECT_CC_QSRC;
+    else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
+      SelectCCOp = PPC::SELECT_CC_QBRC;
+    else if (N->getValueType(0) == MVT::v2f64 ||
+             N->getValueType(0) == MVT::v2i64)
+      SelectCCOp = PPC::SELECT_CC_VSRC;
+    else
+      SelectCCOp = PPC::SELECT_CC_VRRC;
+
+    SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+                        getI32Imm(BROpc, dl) };
+    CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
+    return;
+  }
+  case ISD::VSELECT:
+    if (PPCSubTarget->hasVSX()) {
+      SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
+      CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
+      return;
+    }
+
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+                                  N->getValueType(0) == MVT::v2i64)) {
+      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+      SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
+              Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1);
+      unsigned DM[2];
+
+      for (int i = 0; i < 2; ++i)
+        if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2)
+          DM[i] = 0;
+        else
+          DM[i] = 1;
+
+      if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 &&
+          Op1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          isa<LoadSDNode>(Op1.getOperand(0))) {
+        LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0));
+        SDValue Base, Offset;
+
+        if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() &&
+            (LD->getMemoryVT() == MVT::f64 ||
+             LD->getMemoryVT() == MVT::i64) &&
+            SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
+          SDValue Chain = LD->getChain();
+          SDValue Ops[] = { Base, Offset, Chain };
+          CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
+          return;
+        }
+      }
+
+      // For little endian, we must swap the input operands and adjust
+      // the mask elements (reverse and invert them).
+      if (PPCSubTarget->isLittleEndian()) {
+        std::swap(Op1, Op2);
+        unsigned tmp = DM[0];
+        DM[0] = 1 - DM[1];
+        DM[1] = 1 - tmp;
+      }
+
+      SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), dl,
+                                              MVT::i32);
+      SDValue Ops[] = { Op1, Op2, DMV };
+      CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
+      return;
+    }
+
+    break;
+  case PPCISD::BDNZ:
+  case PPCISD::BDZ: {
+    bool IsPPC64 = PPCSubTarget->isPPC64();
+    SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
+    CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ
+                                ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+                                : (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+                         MVT::Other, Ops);
+    return;
+  }
+  case PPCISD::COND_BRANCH: {
+    // Op #0 is the Chain.
+    // Op #1 is the PPC::PRED_* number.
+    // Op #2 is the CR#
+    // Op #3 is the Dest MBB
+    // Op #4 is the Flag.
+    // Prevent PPC::PRED_* from being selected into LI.
+    unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    if (EnableBranchHint)
+      PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3));
+
+    SDValue Pred = getI32Imm(PCC, dl);
+    SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+      N->getOperand(0), N->getOperand(4) };
+    CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+    return;
+  }
+  case ISD::BR_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    unsigned PCC = getPredicateForSetCC(CC);
+
+    if (N->getOperand(2).getValueType() == MVT::i1) {
+      unsigned Opc;
+      bool Swap;
+      switch (PCC) {
+      default: llvm_unreachable("Unexpected Boolean-operand predicate");
+      case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true;  break;
+      case PPC::PRED_LE: Opc = PPC::CRORC;  Swap = true;  break;
+      case PPC::PRED_EQ: Opc = PPC::CREQV;  Swap = false; break;
+      case PPC::PRED_GE: Opc = PPC::CRORC;  Swap = false; break;
+      case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break;
+      case PPC::PRED_NE: Opc = PPC::CRXOR;  Swap = false; break;
+      }
+
+      SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1,
+                                             N->getOperand(Swap ? 3 : 2),
+                                             N->getOperand(Swap ? 2 : 3)), 0);
+      CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, BitComp, N->getOperand(4),
+                           N->getOperand(0));
+      return;
+    }
+
+    if (EnableBranchHint)
+      PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4));
+
+    SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+    SDValue Ops[] = { getI32Imm(PCC, dl), CondCode,
+                        N->getOperand(4), N->getOperand(0) };
+    CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
+    return;
+  }
+  case ISD::BRIND: {
+    // FIXME: Should custom lower this.
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
+    Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
+                                           Chain), 0);
+    CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+    return;
+  }
+  case PPCISD::TOC_ENTRY: {
+    assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
+            "Only supported for 64-bit ABI and 32-bit SVR4");
+    if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
+      SDValue GA = N->getOperand(0);
+      SDNode *MN = CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+                                          N->getOperand(1));
+      transferMemOperands(N, MN);
+      ReplaceNode(N, MN);
+      return;
+    }
+
+    // For medium and large code model, we generate two instructions as
+    // described below.  Otherwise we allow SelectCodeCommon to handle this,
+    // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA.
+    CodeModel::Model CModel = TM.getCodeModel();
+    if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
+      break;
+
+    // The first source operand is a TargetGlobalAddress or a TargetJumpTable.
+    // If it must be toc-referenced according to PPCSubTarget, we generate:
+    //   LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
+    // Otherwise we generate:
+    //   ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
+    SDValue GA = N->getOperand(0);
+    SDValue TOCbase = N->getOperand(1);
+    SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
+                                         TOCbase, GA);
+
+    if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
+        CModel == CodeModel::Large) {
+      SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                          SDValue(Tmp, 0));
+      transferMemOperands(N, MN);
+      ReplaceNode(N, MN);
+      return;
+    }
+
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
+      const GlobalValue *GV = G->getGlobal();
+      unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV);
+      if (GVFlags & PPCII::MO_NLP_FLAG) {
+        SDNode *MN = CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                            SDValue(Tmp, 0));
+        transferMemOperands(N, MN);
+        ReplaceNode(N, MN);
+        return;
+      }
+    }
+
+    ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+                                          SDValue(Tmp, 0), GA));
+    return;
+  }
+  case PPCISD::PPC32_PICGOT: {
+    // Generate a PIC-safe GOT reference.
+    assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
+      "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
+    CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
+                         PPCLowering->getPointerTy(CurDAG->getDataLayout()),
+                         MVT::i32);
+    return;
+  }
+  case PPCISD::VADD_SPLAT: {
+    // This expands into one of three sequences, depending on whether
+    // the first operand is odd or even, positive or negative.
+    assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+           isa<ConstantSDNode>(N->getOperand(1)) &&
+           "Invalid operand on VADD_SPLAT!");
+
+    int Elt     = N->getConstantOperandVal(0);
+    int EltSize = N->getConstantOperandVal(1);
+    unsigned Opc1, Opc2, Opc3;
+    EVT VT;
+
+    if (EltSize == 1) {
+      Opc1 = PPC::VSPLTISB;
+      Opc2 = PPC::VADDUBM;
+      Opc3 = PPC::VSUBUBM;
+      VT = MVT::v16i8;
+    } else if (EltSize == 2) {
+      Opc1 = PPC::VSPLTISH;
+      Opc2 = PPC::VADDUHM;
+      Opc3 = PPC::VSUBUHM;
+      VT = MVT::v8i16;
+    } else {
+      assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+      Opc1 = PPC::VSPLTISW;
+      Opc2 = PPC::VADDUWM;
+      Opc3 = PPC::VSUBUWM;
+      VT = MVT::v4i32;
+    }
+
+    if ((Elt & 1) == 0) {
+      // Elt is even, in the range [-32,-18] + [16,30].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp = VSPLTIS[BHW] elt
+      //          VADDU[BHW]M tmp, tmp
+      // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+      SDValue EltVal = getI32Imm(Elt >> 1, dl);
+      SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      SDValue TmpVal = SDValue(Tmp, 0);
+      ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
+      return;
+
+    } else if (Elt > 0) {
+      // Elt is odd and positive, in the range [17,31].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt-16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VSUBU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt - 16, dl);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16, dl);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+                                            SDValue(Tmp2, 0)));
+      return;
+
+    } else {
+      // Elt is odd and negative, in the range [-31,-17].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt+16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VADDU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt + 16, dl);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16, dl);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+                                            SDValue(Tmp2, 0)));
+      return;
+    }
+  }
+  }
+
+  SelectCode(N);
+}
+
+// If the target supports the cmpb instruction, do the idiom recognition here.
+// We don't do this as a DAG combine because we don't want to do it as nodes
+// are being combined (because we might miss part of the eventual idiom). We
+// don't want to do it during instruction selection because we want to reuse
+// the logic for lowering the masking operations already part of the
+// instruction selector.
+SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
+  SDLoc dl(N);
+
+  assert(N->getOpcode() == ISD::OR &&
+         "Only OR nodes are supported for CMPB");
+
+  SDValue Res;
+  if (!PPCSubTarget->hasCMPB())
+    return Res;
+
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return Res;
+
+  EVT VT = N->getValueType(0);
+
+  SDValue RHS, LHS;
+  bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+  uint64_t Mask = 0, Alt = 0;
+
+  auto IsByteSelectCC = [this](SDValue O, unsigned &b,
+                               uint64_t &Mask, uint64_t &Alt,
+                               SDValue &LHS, SDValue &RHS) {
+    if (O.getOpcode() != ISD::SELECT_CC)
+      return false;
+    ISD::CondCode CC = cast<CondCodeSDNode>(O.getOperand(4))->get();
+
+    if (!isa<ConstantSDNode>(O.getOperand(2)) ||
+        !isa<ConstantSDNode>(O.getOperand(3)))
+      return false;
+
+    uint64_t PM = O.getConstantOperandVal(2);
+    uint64_t PAlt = O.getConstantOperandVal(3);
+    for (b = 0; b < 8; ++b) {
+      uint64_t Mask = UINT64_C(0xFF) << (8*b);
+      if (PM && (PM & Mask) == PM && (PAlt & Mask) == PAlt)
+        break;
+    }
+
+    if (b == 8)
+      return false;
+    Mask |= PM;
+    Alt  |= PAlt;
+
+    if (!isa<ConstantSDNode>(O.getOperand(1)) ||
+        O.getConstantOperandVal(1) != 0) {
+      SDValue Op0 = O.getOperand(0), Op1 = O.getOperand(1);
+      if (Op0.getOpcode() == ISD::TRUNCATE)
+        Op0 = Op0.getOperand(0);
+      if (Op1.getOpcode() == ISD::TRUNCATE)
+        Op1 = Op1.getOperand(0);
+
+      if (Op0.getOpcode() == ISD::SRL && Op1.getOpcode() == ISD::SRL &&
+          Op0.getOperand(1) == Op1.getOperand(1) && CC == ISD::SETEQ &&
+          isa<ConstantSDNode>(Op0.getOperand(1))) {
+
+        unsigned Bits = Op0.getValueSizeInBits();
+        if (b != Bits/8-1)
+          return false;
+        if (Op0.getConstantOperandVal(1) != Bits-8)
+          return false;
+
+        LHS = Op0.getOperand(0);
+        RHS = Op1.getOperand(0);
+        return true;
+      }
+
+      // When we have small integers (i16 to be specific), the form present
+      // post-legalization uses SETULT in the SELECT_CC for the
+      // higher-order byte, depending on the fact that the
+      // even-higher-order bytes are known to all be zero, for example:
+      //   select_cc (xor $lhs, $rhs), 256, 65280, 0, setult
+      // (so when the second byte is the same, because all higher-order
+      // bits from bytes 3 and 4 are known to be zero, the result of the
+      // xor can be at most 255)
+      if (Op0.getOpcode() == ISD::XOR && CC == ISD::SETULT &&
+          isa<ConstantSDNode>(O.getOperand(1))) {
+
+        uint64_t ULim = O.getConstantOperandVal(1);
+        if (ULim != (UINT64_C(1) << b*8))
+          return false;
+
+        // Now we need to make sure that the upper bytes are known to be
+        // zero.
+        unsigned Bits = Op0.getValueSizeInBits();
+        if (!CurDAG->MaskedValueIsZero(
+                Op0, APInt::getHighBitsSet(Bits, Bits - (b + 1) * 8)))
+          return false;
+
+        LHS = Op0.getOperand(0);
+        RHS = Op0.getOperand(1);
+        return true;
+      }
+
+      return false;
+    }
+
+    if (CC != ISD::SETEQ)
+      return false;
+
+    SDValue Op = O.getOperand(0);
+    if (Op.getOpcode() == ISD::AND) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      if (Op.getConstantOperandVal(1) != (UINT64_C(0xFF) << (8*b)))
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    } else if (Op.getOpcode() == ISD::SRL) {
+      if (!isa<ConstantSDNode>(Op.getOperand(1)))
+        return false;
+      unsigned Bits = Op.getValueSizeInBits();
+      if (b != Bits/8-1)
+        return false;
+      if (Op.getConstantOperandVal(1) != Bits-8)
+        return false;
+
+      SDValue XOR = Op.getOperand(0);
+      if (XOR.getOpcode() == ISD::TRUNCATE)
+        XOR = XOR.getOperand(0);
+      if (XOR.getOpcode() != ISD::XOR)
+        return false;
+
+      LHS = XOR.getOperand(0);
+      RHS = XOR.getOperand(1);
+      return true;
+    }
+
+    return false;
+  };
+
+  SmallVector<SDValue, 8> Queue(1, SDValue(N, 0));
+  while (!Queue.empty()) {
+    SDValue V = Queue.pop_back_val();
+
+    for (const SDValue &O : V.getNode()->ops()) {
+      unsigned b;
+      uint64_t M = 0, A = 0;
+      SDValue OLHS, ORHS;
+      if (O.getOpcode() == ISD::OR) {
+        Queue.push_back(O);
+      } else if (IsByteSelectCC(O, b, M, A, OLHS, ORHS)) {
+        if (!LHS) {
+          LHS = OLHS;
+          RHS = ORHS;
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt  |= A;
+        } else if ((LHS == ORHS && RHS == OLHS) ||
+                   (RHS == ORHS && LHS == OLHS)) {
+          BytesFound[b] = true;
+          Mask |= M;
+          Alt  |= A;
+        } else {
+          return Res;
+        }
+      } else {
+        return Res;
+      }
+    }
+  }
+
+  unsigned LastB = 0, BCnt = 0;
+  for (unsigned i = 0; i < 8; ++i)
+    if (BytesFound[LastB]) {
+      ++BCnt;
+      LastB = i;
+    }
+
+  if (!LastB || BCnt < 2)
+    return Res;
+
+  // Because we'll be zero-extending the output anyway if don't have a specific
+  // value for each input byte (via the Mask), we can 'anyext' the inputs.
+  if (LHS.getValueType() != VT) {
+    LHS = CurDAG->getAnyExtOrTrunc(LHS, dl, VT);
+    RHS = CurDAG->getAnyExtOrTrunc(RHS, dl, VT);
+  }
+
+  Res = CurDAG->getNode(PPCISD::CMPB, dl, VT, LHS, RHS);
+
+  bool NonTrivialMask = ((int64_t) Mask) != INT64_C(-1);
+  if (NonTrivialMask && !Alt) {
+    // Res = Mask & CMPB
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                          CurDAG->getConstant(Mask, dl, VT));
+  } else if (Alt) {
+    // Res = (CMPB & Mask) | (~CMPB & Alt)
+    // Which, as suggested here:
+    //   https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
+    // can be written as:
+    // Res = Alt ^ ((Alt ^ Mask) & CMPB)
+    // useful because the (Alt ^ Mask) can be pre-computed.
+    Res = CurDAG->getNode(ISD::AND, dl, VT, Res,
+                          CurDAG->getConstant(Mask ^ Alt, dl, VT));
+    Res = CurDAG->getNode(ISD::XOR, dl, VT, Res,
+                          CurDAG->getConstant(Alt, dl, VT));
+  }
+
+  return Res;
+}
+
+// When CR bit registers are enabled, an extension of an i1 variable to a i32
+// or i64 value is lowered in terms of a SELECT_I[48] operation, and thus
+// involves constant materialization of a 0 or a 1 or both. If the result of
+// the extension is then operated upon by some operator that can be constant
+// folded with a constant 0 or 1, and that constant can be materialized using
+// only one instruction (like a zero or one), then we should fold in those
+// operations with the select.
+void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
+  if (!PPCSubTarget->useCRBits())
+    return;
+
+  if (N->getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOpcode() != ISD::ANY_EXTEND)
+    return;
+
+  if (N->getOperand(0).getValueType() != MVT::i1)
+    return;
+
+  if (!N->hasOneUse())
+    return;
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Cond = N->getOperand(0);
+  SDValue ConstTrue =
+    CurDAG->getConstant(N->getOpcode() == ISD::SIGN_EXTEND ? -1 : 1, dl, VT);
+  SDValue ConstFalse = CurDAG->getConstant(0, dl, VT);
+
+  do {
+    SDNode *User = *N->use_begin();
+    if (User->getNumOperands() != 2)
+      break;
+
+    auto TryFold = [this, N, User, dl](SDValue Val) {
+      SDValue UserO0 = User->getOperand(0), UserO1 = User->getOperand(1);
+      SDValue O0 = UserO0.getNode() == N ? Val : UserO0;
+      SDValue O1 = UserO1.getNode() == N ? Val : UserO1;
+
+      return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl,
+                                            User->getValueType(0),
+                                            O0.getNode(), O1.getNode());
+    };
+
+    SDValue TrueRes = TryFold(ConstTrue);
+    if (!TrueRes)
+      break;
+    SDValue FalseRes = TryFold(ConstFalse);
+    if (!FalseRes)
+      break;
+
+    // For us to materialize these using one instruction, we must be able to
+    // represent them as signed 16-bit integers.
+    uint64_t True  = cast<ConstantSDNode>(TrueRes)->getZExtValue(),
+             False = cast<ConstantSDNode>(FalseRes)->getZExtValue();
+    if (!isInt<16>(True) || !isInt<16>(False))
+      break;
+
+    // We can replace User with a new SELECT node, and try again to see if we
+    // can fold the select with its user.
+    Res = CurDAG->getSelect(dl, User->getValueType(0), Cond, TrueRes, FalseRes);
+    N = User;
+    ConstTrue = TrueRes;
+    ConstFalse = FalseRes;
+  } while (N->hasOneUse());
+}
+
+void PPCDAGToDAGISel::PreprocessISelDAG() {
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty())
+      continue;
+
+    SDValue Res;
+    switch (N->getOpcode()) {
+    default: break;
+    case ISD::OR:
+      Res = combineToCMPB(N);
+      break;
+    }
+
+    if (!Res)
+      foldBoolExts(Res, N);
+
+    if (Res) {
+      DEBUG(dbgs() << "PPC DAG preprocessing replacing:\nOld:    ");
+      DEBUG(N->dump(CurDAG));
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(Res.getNode()->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+      MadeChange = true;
+    }
+  }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
+}
+
+/// PostprocessISelDAG - Perform some late peephole optimizations
+/// on the DAG representation.
+void PPCDAGToDAGISel::PostprocessISelDAG() {
+
+  // Skip peepholes at -O0.
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return;
+
+  PeepholePPC64();
+  PeepholeCROps();
+  PeepholePPC64ZExt();
+}
+
+// Check if all users of this node will become isel where the second operand
+// is the constant zero. If this is so, and if we can negate the condition,
+// then we can flip the true and false operands. This will allow the zero to
+// be folded with the isel so that we don't need to materialize a register
+// containing zero.
+bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
+  // If we're not using isel, then this does not matter.
+  if (!PPCSubTarget->hasISEL())
+    return false;
+
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (!User->isMachineOpcode())
+      return false;
+    if (User->getMachineOpcode() != PPC::SELECT_I4 &&
+        User->getMachineOpcode() != PPC::SELECT_I8)
+      return false;
+
+    SDNode *Op2 = User->getOperand(2).getNode();
+    if (!Op2->isMachineOpcode())
+      return false;
+
+    if (Op2->getMachineOpcode() != PPC::LI &&
+        Op2->getMachineOpcode() != PPC::LI8)
+      return false;
+
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0));
+    if (!C)
+      return false;
+
+    if (!C->isNullValue())
+      return false;
+  }
+
+  return true;
+}
+
+void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
+  SmallVector<SDNode *, 4> ToReplace;
+  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    assert((User->getMachineOpcode() == PPC::SELECT_I4 ||
+            User->getMachineOpcode() == PPC::SELECT_I8) &&
+           "Must have all select users");
+    ToReplace.push_back(User);
+  }
+
+  for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(),
+       UE = ToReplace.end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    SDNode *ResNode =
+      CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User),
+                             User->getValueType(0), User->getOperand(0),
+                             User->getOperand(2),
+                             User->getOperand(1));
+
+      DEBUG(dbgs() << "CR Peephole replacing:\nOld:    ");
+      DEBUG(User->dump(CurDAG));
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(ResNode->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+
+      ReplaceUses(User, ResNode);
+  }
+}
+
+void PPCDAGToDAGISel::PeepholeCROps() {
+  bool IsModified;
+  do {
+    IsModified = false;
+    for (SDNode &Node : CurDAG->allnodes()) {
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+      if (!MachineNode || MachineNode->use_empty())
+        continue;
+      SDNode *ResNode = MachineNode;
+
+      bool Op1Set   = false, Op1Unset = false,
+           Op1Not   = false,
+           Op2Set   = false, Op2Unset = false,
+           Op2Not   = false;
+
+      unsigned Opcode = MachineNode->getMachineOpcode();
+      switch (Opcode) {
+      default: break;
+      case PPC::CRAND:
+      case PPC::CRNAND:
+      case PPC::CROR:
+      case PPC::CRXOR:
+      case PPC::CRNOR:
+      case PPC::CREQV:
+      case PPC::CRANDC:
+      case PPC::CRORC: {
+        SDValue Op = MachineNode->getOperand(1);
+        if (Op.isMachineOpcode()) {
+          if (Op.getMachineOpcode() == PPC::CRSET)
+            Op2Set = true;
+          else if (Op.getMachineOpcode() == PPC::CRUNSET)
+            Op2Unset = true;
+          else if (Op.getMachineOpcode() == PPC::CRNOR &&
+                   Op.getOperand(0) == Op.getOperand(1))
+            Op2Not = true;
+        }
+        LLVM_FALLTHROUGH;
+      }
+      case PPC::BC:
+      case PPC::BCn:
+      case PPC::SELECT_I4:
+      case PPC::SELECT_I8:
+      case PPC::SELECT_F4:
+      case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
+      case PPC::SELECT_VRRC:
+      case PPC::SELECT_VSFRC:
+      case PPC::SELECT_VSSRC:
+      case PPC::SELECT_VSRC: {
+        SDValue Op = MachineNode->getOperand(0);
+        if (Op.isMachineOpcode()) {
+          if (Op.getMachineOpcode() == PPC::CRSET)
+            Op1Set = true;
+          else if (Op.getMachineOpcode() == PPC::CRUNSET)
+            Op1Unset = true;
+          else if (Op.getMachineOpcode() == PPC::CRNOR &&
+                   Op.getOperand(0) == Op.getOperand(1))
+            Op1Not = true;
+        }
+        }
+        break;
+      }
+
+      bool SelectSwap = false;
+      switch (Opcode) {
+      default: break;
+      case PPC::CRAND:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // x & x = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Set)
+          // 1 & y = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Set)
+          // x & 1 = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset || Op2Unset)
+          // x & 0 = 0 & y = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Not)
+          // ~x & y = andc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0));
+        else if (Op2Not)
+          // x & ~y = andc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CRNAND:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // nand(x, x) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Set)
+          // nand(1, y) -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Set)
+          // nand(x, 1) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Unset || Op2Unset)
+          // nand(x, 0) = nand(0, y) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Not)
+          // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // nand(x, ~y) = ~x | y = orc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CROR:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // x | x = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Set || Op2Set)
+          // x | 1 = 1 | y = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Unset)
+          // 0 | y = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Unset)
+          // x | 0 = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // ~x | y = orc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0));
+        else if (Op2Not)
+          // x | ~y = orc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CRXOR:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // xor(x, x) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // xor(1, y) -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Set)
+          // xor(x, 1) -> nor(x, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Unset)
+          // xor(0, y) = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Unset)
+          // xor(x, 0) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // xor(~x, y) = eqv(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // xor(x, ~y) = eqv(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CRNOR:
+        if (Op1Set || Op2Set)
+          // nor(1, y) -> 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Unset)
+          // nor(0, y) = ~y -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Unset)
+          // nor(x, 0) = ~x
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Not)
+          // nor(~x, y) = andc(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // nor(x, ~y) = andc(y, x)
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CREQV:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // eqv(x, x) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // eqv(1, y) = y
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op2Set)
+          // eqv(x, 1) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset)
+          // eqv(0, y) = ~y -> nor(y, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op2Unset)
+          // eqv(x, 0) = ~x
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(0));
+        else if (Op1Not)
+          // eqv(~x, y) = xor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // eqv(x, ~y) = xor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CRANDC:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // andc(x, x) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set)
+          // andc(1, y) = ~y
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op1Unset || Op2Set)
+          // andc(0, y) = andc(x, 1) = 0
+          ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op2Unset)
+          // andc(x, 0) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Not)
+          // andc(~x, y) = ~(x | y) = nor(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // andc(x, ~y) = x & y
+          ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::CRORC:
+        if (MachineNode->getOperand(0) == MachineNode->getOperand(1))
+          // orc(x, x) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op1Set || Op2Unset)
+          // orc(1, y) = orc(x, 0) = 1
+          ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode),
+                                           MVT::i1);
+        else if (Op2Set)
+          // orc(x, 1) = x
+          ResNode = MachineNode->getOperand(0).getNode();
+        else if (Op1Unset)
+          // orc(0, y) = ~y
+          ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(1));
+        else if (Op1Not)
+          // orc(~x, y) = ~(x & y) = nand(x, y)
+          ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0).
+                                                      getOperand(0),
+                                           MachineNode->getOperand(1));
+        else if (Op2Not)
+          // orc(x, ~y) = x | y
+          ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(0),
+                                           MachineNode->getOperand(1).
+                                             getOperand(0));
+        else if (AllUsersSelectZero(MachineNode)) {
+          ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode),
+                                           MVT::i1, MachineNode->getOperand(1),
+                                           MachineNode->getOperand(0));
+          SelectSwap = true;
+        }
+        break;
+      case PPC::SELECT_I4:
+      case PPC::SELECT_I8:
+      case PPC::SELECT_F4:
+      case PPC::SELECT_F8:
+      case PPC::SELECT_QFRC:
+      case PPC::SELECT_QSRC:
+      case PPC::SELECT_QBRC:
+      case PPC::SELECT_VRRC:
+      case PPC::SELECT_VSFRC:
+      case PPC::SELECT_VSSRC:
+      case PPC::SELECT_VSRC:
+        if (Op1Set)
+          ResNode = MachineNode->getOperand(1).getNode();
+        else if (Op1Unset)
+          ResNode = MachineNode->getOperand(2).getNode();
+        else if (Op1Not)
+          ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(),
+                                           SDLoc(MachineNode),
+                                           MachineNode->getValueType(0),
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(2),
+                                           MachineNode->getOperand(1));
+        break;
+      case PPC::BC:
+      case PPC::BCn:
+        if (Op1Not)
+          ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn :
+                                                               PPC::BC,
+                                           SDLoc(MachineNode),
+                                           MVT::Other,
+                                           MachineNode->getOperand(0).
+                                             getOperand(0),
+                                           MachineNode->getOperand(1),
+                                           MachineNode->getOperand(2));
+        // FIXME: Handle Op1Set, Op1Unset here too.
+        break;
+      }
+
+      // If we're inverting this node because it is used only by selects that
+      // we'd like to swap, then swap the selects before the node replacement.
+      if (SelectSwap)
+        SwapAllSelectUsers(MachineNode);
+
+      if (ResNode != MachineNode) {
+        DEBUG(dbgs() << "CR Peephole replacing:\nOld:    ");
+        DEBUG(MachineNode->dump(CurDAG));
+        DEBUG(dbgs() << "\nNew: ");
+        DEBUG(ResNode->dump(CurDAG));
+        DEBUG(dbgs() << "\n");
+
+        ReplaceUses(MachineNode, ResNode);
+        IsModified = true;
+      }
+    }
+    if (IsModified)
+      CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
+
+// Gather the set of 32-bit operations that are known to have their
+// higher-order 32 bits zero, where ToPromote contains all such operations.
+static bool PeepholePPC64ZExtGather(SDValue Op32,
+                                    SmallPtrSetImpl<SDNode *> &ToPromote) {
+  if (!Op32.isMachineOpcode())
+    return false;
+
+  // First, check for the "frontier" instructions (those that will clear the
+  // higher-order 32 bits.
+
+  // For RLWINM and RLWNM, we need to make sure that the mask does not wrap
+  // around. If it does not, then these instructions will clear the
+  // higher-order bits.
+  if ((Op32.getMachineOpcode() == PPC::RLWINM ||
+       Op32.getMachineOpcode() == PPC::RLWNM) &&
+      Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // SLW and SRW always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::SLW ||
+      Op32.getMachineOpcode() == PPC::SRW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // For LI and LIS, we need the immediate to be positive (so that it is not
+  // sign extended).
+  if (Op32.getMachineOpcode() == PPC::LI ||
+      Op32.getMachineOpcode() == PPC::LIS) {
+    if (!isUInt<15>(Op32.getConstantOperandVal(0)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // LHBRX and LWBRX always clear the higher-order bits.
+  if (Op32.getMachineOpcode() == PPC::LHBRX ||
+      Op32.getMachineOpcode() == PPC::LWBRX) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // CNT[LT]ZW always produce a 64-bit value in [0,32], and so is zero extended.
+  if (Op32.getMachineOpcode() == PPC::CNTLZW ||
+      Op32.getMachineOpcode() == PPC::CNTTZW) {
+    ToPromote.insert(Op32.getNode());
+    return true;
+  }
+
+  // Next, check for those instructions we can look through.
+
+  // Assuming the mask does not wrap around, then the higher-order bits are
+  // taken directly from the first operand.
+  if (Op32.getMachineOpcode() == PPC::RLWIMI &&
+      Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For OR, the higher-order bits are zero if that is true for both operands.
+  // For SELECT_I4, the same is true (but the relevant operand numbers are
+  // shifted by 1).
+  if (Op32.getMachineOpcode() == PPC::OR ||
+      Op32.getMachineOpcode() == PPC::SELECT_I4) {
+    unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
+      return false;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // For ORI and ORIS, we need the higher-order bits of the first operand to be
+  // zero, and also for the constant to be positive (so that it is not sign
+  // extended).
+  if (Op32.getMachineOpcode() == PPC::ORI ||
+      Op32.getMachineOpcode() == PPC::ORIS) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
+      return false;
+    if (!isUInt<15>(Op32.getConstantOperandVal(1)))
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+    ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+    return true;
+  }
+
+  // The higher-order bits of AND are zero if that is true for at least one of
+  // the operands.
+  if (Op32.getMachineOpcode() == PPC::AND) {
+    SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
+    bool Op0OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+    bool Op1OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
+    if (!Op0OK && !Op1OK)
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+
+    if (Op0OK)
+      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+    if (Op1OK)
+      ToPromote.insert(ToPromote2.begin(), ToPromote2.end());
+
+    return true;
+  }
+
+  // For ANDI and ANDIS, the higher-order bits are zero if either that is true
+  // of the first operand, or if the second operand is positive (so that it is
+  // not sign extended).
+  if (Op32.getMachineOpcode() == PPC::ANDIo ||
+      Op32.getMachineOpcode() == PPC::ANDISo) {
+    SmallPtrSet<SDNode *, 16> ToPromote1;
+    bool Op0OK =
+      PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
+    bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
+    if (!Op0OK && !Op1OK)
+      return false;
+
+    ToPromote.insert(Op32.getNode());
+
+    if (Op0OK)
+      ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
+
+    return true;
+  }
+
+  return false;
+}
+
+void PPCDAGToDAGISel::PeepholePPC64ZExt() {
+  if (!PPCSubTarget->isPPC64())
+    return;
+
+  // When we zero-extend from i32 to i64, we use a pattern like this:
+  // def : Pat<(i64 (zext i32:$in)),
+  //           (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+  //                   0, 32)>;
+  // There are several 32-bit shift/rotate instructions, however, that will
+  // clear the higher-order bits of their output, rendering the RLDICL
+  // unnecessary. When that happens, we remove it here, and redefine the
+  // relevant 32-bit operation to be a 64-bit operation.
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    if (N->getMachineOpcode() != PPC::RLDICL)
+      continue;
+
+    if (N->getConstantOperandVal(1) != 0 ||
+        N->getConstantOperandVal(2) != 32)
+      continue;
+
+    SDValue ISR = N->getOperand(0);
+    if (!ISR.isMachineOpcode() ||
+        ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
+      continue;
+
+    if (!ISR.hasOneUse())
+      continue;
+
+    if (ISR.getConstantOperandVal(2) != PPC::sub_32)
+      continue;
+
+    SDValue IDef = ISR.getOperand(0);
+    if (!IDef.isMachineOpcode() ||
+        IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
+      continue;
+
+    // We now know that we're looking at a canonical i32 -> i64 zext. See if we
+    // can get rid of it.
+
+    SDValue Op32 = ISR->getOperand(1);
+    if (!Op32.isMachineOpcode())
+      continue;
+
+    // There are some 32-bit instructions that always clear the high-order 32
+    // bits, there are also some instructions (like AND) that we can look
+    // through.
+    SmallPtrSet<SDNode *, 16> ToPromote;
+    if (!PeepholePPC64ZExtGather(Op32, ToPromote))
+      continue;
+
+    // If the ToPromote set contains nodes that have uses outside of the set
+    // (except for the original INSERT_SUBREG), then abort the transformation.
+    bool OutsideUse = false;
+    for (SDNode *PN : ToPromote) {
+      for (SDNode *UN : PN->uses()) {
+        if (!ToPromote.count(UN) && UN != ISR.getNode()) {
+          OutsideUse = true;
+          break;
+        }
+      }
+
+      if (OutsideUse)
+        break;
+    }
+    if (OutsideUse)
+      continue;
+
+    MadeChange = true;
+
+    // We now know that this zero extension can be removed by promoting to
+    // nodes in ToPromote to 64-bit operations, where for operations in the
+    // frontier of the set, we need to insert INSERT_SUBREGs for their
+    // operands.
+    for (SDNode *PN : ToPromote) {
+      unsigned NewOpcode;
+      switch (PN->getMachineOpcode()) {
+      default:
+        llvm_unreachable("Don't know the 64-bit variant of this instruction");
+      case PPC::RLWINM:    NewOpcode = PPC::RLWINM8; break;
+      case PPC::RLWNM:     NewOpcode = PPC::RLWNM8; break;
+      case PPC::SLW:       NewOpcode = PPC::SLW8; break;
+      case PPC::SRW:       NewOpcode = PPC::SRW8; break;
+      case PPC::LI:        NewOpcode = PPC::LI8; break;
+      case PPC::LIS:       NewOpcode = PPC::LIS8; break;
+      case PPC::LHBRX:     NewOpcode = PPC::LHBRX8; break;
+      case PPC::LWBRX:     NewOpcode = PPC::LWBRX8; break;
+      case PPC::CNTLZW:    NewOpcode = PPC::CNTLZW8; break;
+      case PPC::CNTTZW:    NewOpcode = PPC::CNTTZW8; break;
+      case PPC::RLWIMI:    NewOpcode = PPC::RLWIMI8; break;
+      case PPC::OR:        NewOpcode = PPC::OR8; break;
+      case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
+      case PPC::ORI:       NewOpcode = PPC::ORI8; break;
+      case PPC::ORIS:      NewOpcode = PPC::ORIS8; break;
+      case PPC::AND:       NewOpcode = PPC::AND8; break;
+      case PPC::ANDIo:     NewOpcode = PPC::ANDIo8; break;
+      case PPC::ANDISo:    NewOpcode = PPC::ANDISo8; break;
+      }
+
+      // Note: During the replacement process, the nodes will be in an
+      // inconsistent state (some instructions will have operands with values
+      // of the wrong type). Once done, however, everything should be right
+      // again.
+
+      SmallVector<SDValue, 4> Ops;
+      for (const SDValue &V : PN->ops()) {
+        if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
+            !isa<ConstantSDNode>(V)) {
+          SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
+          SDNode *ReplOp =
+            CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
+                                   ISR.getNode()->getVTList(), ReplOpOps);
+          Ops.push_back(SDValue(ReplOp, 0));
+        } else {
+          Ops.push_back(V);
+        }
+      }
+
+      // Because all to-be-promoted nodes only have users that are other
+      // promoted nodes (or the original INSERT_SUBREG), we can safely replace
+      // the i32 result value type with i64.
+
+      SmallVector<EVT, 2> NewVTs;
+      SDVTList VTs = PN->getVTList();
+      for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
+        if (VTs.VTs[i] == MVT::i32)
+          NewVTs.push_back(MVT::i64);
+        else
+          NewVTs.push_back(VTs.VTs[i]);
+
+      DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld:    ");
+      DEBUG(PN->dump(CurDAG));
+
+      CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
+
+      DEBUG(dbgs() << "\nNew: ");
+      DEBUG(PN->dump(CurDAG));
+      DEBUG(dbgs() << "\n");
+    }
+
+    // Now we replace the original zero extend and its associated INSERT_SUBREG
+    // with the value feeding the INSERT_SUBREG (which has now been promoted to
+    // return an i64).
+
+    DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld:    ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\nNew: ");
+    DEBUG(Op32.getNode()->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    ReplaceUses(N, Op32.getNode());
+  }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
+}
+
+void PPCDAGToDAGISel::PeepholePPC64() {
+  // These optimizations are currently supported only for 64-bit SVR4.
+  if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
+    return;
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    unsigned FirstOp;
+    unsigned StorageOpcode = N->getMachineOpcode();
+
+    switch (StorageOpcode) {
+    default: continue;
+
+    case PPC::LBZ:
+    case PPC::LBZ8:
+    case PPC::LD:
+    case PPC::LFD:
+    case PPC::LFS:
+    case PPC::LHA:
+    case PPC::LHA8:
+    case PPC::LHZ:
+    case PPC::LHZ8:
+    case PPC::LWA:
+    case PPC::LWZ:
+    case PPC::LWZ8:
+      FirstOp = 0;
+      break;
+
+    case PPC::STB:
+    case PPC::STB8:
+    case PPC::STD:
+    case PPC::STFD:
+    case PPC::STFS:
+    case PPC::STH:
+    case PPC::STH8:
+    case PPC::STW:
+    case PPC::STW8:
+      FirstOp = 1;
+      break;
+    }
+
+    // If this is a load or store with a zero offset, or within the alignment,
+    // we may be able to fold an add-immediate into the memory operation.
+    // The check against alignment is below, as it can't occur until we check
+    // the arguments to N
+    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
+      continue;
+
+    SDValue Base = N->getOperand(FirstOp + 1);
+    if (!Base.isMachineOpcode())
+      continue;
+
+    unsigned Flags = 0;
+    bool ReplaceFlags = true;
+
+    // When the feeding operation is an add-immediate of some sort,
+    // determine whether we need to add relocation information to the
+    // target flags on the immediate operand when we fold it into the
+    // load instruction.
+    //
+    // For something like ADDItocL, the relocation information is
+    // inferred from the opcode; when we process it in the AsmPrinter,
+    // we add the necessary relocation there.  A load, though, can receive
+    // relocation from various flavors of ADDIxxx, so we need to carry
+    // the relocation information in the target flags.
+    switch (Base.getMachineOpcode()) {
+    default: continue;
+
+    case PPC::ADDI8:
+    case PPC::ADDI:
+      // In some cases (such as TLS) the relocation information
+      // is already in place on the operand, so copying the operand
+      // is sufficient.
+      ReplaceFlags = false;
+      // For these cases, the immediate may not be divisible by 4, in
+      // which case the fold is illegal for DS-form instructions.  (The
+      // other cases provide aligned addresses and are always safe.)
+      if ((StorageOpcode == PPC::LWA ||
+           StorageOpcode == PPC::LD  ||
+           StorageOpcode == PPC::STD) &&
+          (!isa<ConstantSDNode>(Base.getOperand(1)) ||
+           Base.getConstantOperandVal(1) % 4 != 0))
+        continue;
+      break;
+    case PPC::ADDIdtprelL:
+      Flags = PPCII::MO_DTPREL_LO;
+      break;
+    case PPC::ADDItlsldL:
+      Flags = PPCII::MO_TLSLD_LO;
+      break;
+    case PPC::ADDItocL:
+      Flags = PPCII::MO_TOC_LO;
+      break;
+    }
+
+    SDValue ImmOpnd = Base.getOperand(1);
+
+    // On PPC64, the TOC base pointer is guaranteed by the ABI only to have
+    // 8-byte alignment, and so we can only use offsets less than 8 (otherwise,
+    // we might have needed different @ha relocation values for the offset
+    // pointers).
+    int MaxDisplacement = 7;
+    if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+      const GlobalValue *GV = GA->getGlobal();
+      MaxDisplacement = std::min((int) GV->getAlignment() - 1, MaxDisplacement);
+    }
+
+    bool UpdateHBase = false;
+    SDValue HBase = Base.getOperand(0);
+
+    int Offset = N->getConstantOperandVal(FirstOp);
+    if (ReplaceFlags) {
+      if (Offset < 0 || Offset > MaxDisplacement) {
+        // If we have a addi(toc@l)/addis(toc@ha) pair, and the addis has only
+        // one use, then we can do this for any offset, we just need to also
+        // update the offset (i.e. the symbol addend) on the addis also.
+        if (Base.getMachineOpcode() != PPC::ADDItocL)
+          continue;
+
+        if (!HBase.isMachineOpcode() ||
+            HBase.getMachineOpcode() != PPC::ADDIStocHA)
+          continue;
+
+        if (!Base.hasOneUse() || !HBase.hasOneUse())
+          continue;
+
+        SDValue HImmOpnd = HBase.getOperand(1);
+        if (HImmOpnd != ImmOpnd)
+          continue;
+
+        UpdateHBase = true;
+      }
+    } else {
+      // If we're directly folding the addend from an addi instruction, then:
+      //  1. In general, the offset on the memory access must be zero.
+      //  2. If the addend is a constant, then it can be combined with a
+      //     non-zero offset, but only if the result meets the encoding
+      //     requirements.
+      if (auto *C = dyn_cast<ConstantSDNode>(ImmOpnd)) {
+        Offset += C->getSExtValue();
+
+        if ((StorageOpcode == PPC::LWA || StorageOpcode == PPC::LD ||
+             StorageOpcode == PPC::STD) && (Offset % 4) != 0)
+          continue;
+
+        if (!isInt<16>(Offset))
+          continue;
+
+        ImmOpnd = CurDAG->getTargetConstant(Offset, SDLoc(ImmOpnd),
+                                            ImmOpnd.getValueType());
+      } else if (Offset != 0) {
+        continue;
+      }
+    }
+
+    // We found an opportunity.  Reverse the operands from the add
+    // immediate and substitute them into the load or store.  If
+    // needed, update the target flags for the immediate operand to
+    // reflect the necessary relocation information.
+    DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase:    ");
+    DEBUG(Base->dump(CurDAG));
+    DEBUG(dbgs() << "\nN: ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    // If the relocation information isn't already present on the
+    // immediate operand, add it now.
+    if (ReplaceFlags) {
+      if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+        SDLoc dl(GA);
+        const GlobalValue *GV = GA->getGlobal();
+        // We can't perform this optimization for data whose alignment
+        // is insufficient for the instruction encoding.
+        if (GV->getAlignment() < 4 &&
+            (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+             StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
+          DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+          continue;
+        }
+        ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
+      } else if (ConstantPoolSDNode *CP =
+                 dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
+        const Constant *C = CP->getConstVal();
+        ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
+                                                CP->getAlignment(),
+                                                Offset, Flags);
+      }
+    }
+
+    if (FirstOp == 1) // Store
+      (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
+                                       Base.getOperand(0), N->getOperand(3));
+    else // Load
+      (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
+                                       N->getOperand(2));
+
+    if (UpdateHBase)
+      (void)CurDAG->UpdateNodeOperands(HBase.getNode(), HBase.getOperand(0),
+                                       ImmOpnd);
+
+    // The add-immediate may now be dead, in which case remove it.
+    if (Base.getNode()->use_empty())
+      CurDAG->RemoveDeadNode(Base.getNode());
+  }
+}
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+  return new PPCDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 000000000000..aa3ffde24b99
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,12771 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCCallingConv.h"
+#include "PPCCCState.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <list>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-lowering"
+
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableSCO("disable-ppc-sco",
+cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumSiblingCalls, "Number of sibling calls");
+
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
+                                     const PPCSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+  // arguments are at least 4/8 bytes aligned.
+  bool isPPC64 = Subtarget.isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8:4);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  if (!useSoftFloat()) {
+    addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+    addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+  }
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+  }
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc load and store's.
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
+
+  if (Subtarget.useCRBits()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+    if (isPPC64 || Subtarget.hasFPCVT()) {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+      AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
+                         isPPC64 ? MVT::i64 : MVT::i32);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
+                        isPPC64 ? MVT::i64 : MVT::i32);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
+    }
+
+    // PowerPC does not support direct load / store of condition registers
+    setOperationAction(ISD::LOAD, MVT::i1, Custom);
+    setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+    // FIXME: Remove this once the ANDI glue bug is fixed:
+    if (ANDIGlueBug)
+      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
+
+    for (MVT VT : MVT::integer_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+      setTruncStoreAction(VT, MVT::i1, Expand);
+    }
+
+    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
+  }
+
+  // This is used in the ppcf128->int sequence.  Note it has different semantics
+  // from FP_ROUND:  that rounds to nearest, this rounds to zero.
+  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+  // We do not currently implement these libm ops for PowerPC.
+  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FRINT,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
+
+  // PowerPC has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/fmod/pow
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Legal);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Legal);
+
+  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+  // If we're enabling GP optimizations, use hardware square root
+  if (!Subtarget.hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
+        Subtarget.hasFRE()))
+    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+  if (!Subtarget.hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
+        Subtarget.hasFRES()))
+    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  if (Subtarget.hasFCPSGN()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+  } else {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  }
+
+  if (Subtarget.hasFPRND()) {
+    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::f64, Legal);
+
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+  }
+
+  // PowerPC does not have BSWAP
+  // CTPOP or CTTZ were introduced in P8/P9 respectivelly
+  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  if (Subtarget.isISA3_0()) {
+    setOperationAction(ISD::CTTZ , MVT::i32  , Legal);
+    setOperationAction(ISD::CTTZ , MVT::i64  , Legal);
+  } else {
+    setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+    setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+  }
+
+  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Legal);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
+  }
+
+  // PowerPC does not have ROTR
+  setOperationAction(ISD::ROTR, MVT::i32   , Expand);
+  setOperationAction(ISD::ROTR, MVT::i64   , Expand);
+
+  if (!Subtarget.useCRBits()) {
+    // PowerPC does not have Select
+    setOperationAction(ISD::SELECT, MVT::i32, Expand);
+    setOperationAction(ISD::SELECT, MVT::i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  }
+
+  // PowerPC wants to turn select_cc of FP into fsel when possible.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+  // PowerPC does not have BRCOND which requires SetCC
+  if (!Subtarget.useCRBits())
+    setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  if (Subtarget.hasDirectMove() && isPPC64) {
+    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
+    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
+    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
+    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
+  } else {
+    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+  }
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);
+
+  // TRAP is legal.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+  if (Subtarget.isSVR4ABI()) {
+    if (isPPC64) {
+      // VAARG always uses double-word chunks, so promote anything smaller.
+      setOperationAction(ISD::VAARG, MVT::i1, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i8, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i16, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i32, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::Other, Expand);
+    } else {
+      // VAARG is custom lowered with the 32-bit SVR4 ABI.
+      setOperationAction(ISD::VAARG, MVT::Other, Custom);
+      setOperationAction(ISD::VAARG, MVT::i64, Custom);
+    }
+  } else
+    setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+  if (Subtarget.isSVR4ABI() && !isPPC64)
+    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+    setOperationAction(ISD::VACOPY            , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
+  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
+  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // To handle counter-based loop conditions.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+
+  // Comparisons that require checking two conditions.
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+  if (Subtarget.has64BitSupport()) {
+    // They also have instructions for converting between i64 and fp.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    // This is just the low 32 bits of a (signed) fp->i64 conversion.
+    // We cannot do this with Promote because i64 is not a legal type.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  } else {
+    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  }
+
+  // With the instructions enabled under FPCVT, we can do everything.
+  if (Subtarget.hasFPCVT()) {
+    if (Subtarget.has64BitSupport()) {
+      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+    }
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  }
+
+  if (Subtarget.use64BitRegs()) {
+    // 64-bit PowerPC implementations can support i64 types directly
+    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
+    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+    // 64-bit PowerPC wants to expand i128 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  } else {
+    // 32-bit PowerPC wants to expand i64 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  }
+
+  if (Subtarget.hasAltivec()) {
+    // First set operation action for all vector types to expand. Then we
+    // will selectively turn on ones that can be effectively codegen'd.
+    for (MVT VT : MVT::vector_valuetypes()) {
+      // add/sub are legal for all supported vector VT's.
+      setOperationAction(ISD::ADD, VT, Legal);
+      setOperationAction(ISD::SUB, VT, Legal);
+
+      // Vector instructions introduced in P8
+      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
+        setOperationAction(ISD::CTPOP, VT, Legal);
+        setOperationAction(ISD::CTLZ, VT, Legal);
+      }
+      else {
+        setOperationAction(ISD::CTPOP, VT, Expand);
+        setOperationAction(ISD::CTLZ, VT, Expand);
+      }
+
+      // Vector instructions introduced in P9
+      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
+        setOperationAction(ISD::CTTZ, VT, Legal);
+      else
+        setOperationAction(ISD::CTTZ, VT, Expand);
+
+      // We promote all shuffles to v16i8.
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+      // We promote all non-typed operations to v4i32.
+      setOperationAction(ISD::AND   , VT, Promote);
+      AddPromotedToType (ISD::AND   , VT, MVT::v4i32);
+      setOperationAction(ISD::OR    , VT, Promote);
+      AddPromotedToType (ISD::OR    , VT, MVT::v4i32);
+      setOperationAction(ISD::XOR   , VT, Promote);
+      AddPromotedToType (ISD::XOR   , VT, MVT::v4i32);
+      setOperationAction(ISD::LOAD  , VT, Promote);
+      AddPromotedToType (ISD::LOAD  , VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT_CC, VT, Promote);
+      AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
+      setOperationAction(ISD::STORE, VT, Promote);
+      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+      // No other operations are legal.
+      setOperationAction(ISD::MUL , VT, Expand);
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FNEG, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FLOG, VT, Expand);
+      setOperationAction(ISD::FLOG10, VT, Expand);
+      setOperationAction(ISD::FLOG2, VT, Expand);
+      setOperationAction(ISD::FEXP, VT, Expand);
+      setOperationAction(ISD::FEXP2, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FABS, VT, Expand);
+      setOperationAction(ISD::FPOWI, VT, Expand);
+      setOperationAction(ISD::FFLOOR, VT, Expand);
+      setOperationAction(ISD::FCEIL,  VT, Expand);
+      setOperationAction(ISD::FTRUNC, VT, Expand);
+      setOperationAction(ISD::FRINT,  VT, Expand);
+      setOperationAction(ISD::FNEARBYINT, VT, Expand);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::UDIVREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Expand);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::BSWAP, VT, Expand);
+      setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+      setOperationAction(ISD::ROTL, VT, Expand);
+      setOperationAction(ISD::ROTR, VT, Expand);
+
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
+
+    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
+    // with merges, splats, etc.
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::AND   , MVT::v4i32, Legal);
+    setOperationAction(ISD::OR    , MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
+    setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::v4i32,
+                       Subtarget.useCRBits() ? Legal : Expand);
+    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
+
+    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    }
+
+    if (Subtarget.hasP8Altivec())
+      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+    else
+      setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not contain unordered floating-point compare instructions
+    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETO,   MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
+
+    if (Subtarget.hasVSX()) {
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+      if (Subtarget.hasP8Vector()) {
+        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
+      }
+      if (Subtarget.hasDirectMove() && isPPC64) {
+        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
+        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
+        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
+        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+      }
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
+      setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+      setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+
+      // Share the Altivec comparison restrictions.
+      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETO,   MVT::v2f64, Expand);
+      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
+
+      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+      setOperationAction(ISD::STORE, MVT::v2f64, Legal);
+
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+
+      if (Subtarget.hasP8Vector())
+        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
+
+      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
+
+      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
+      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
+      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
+
+      if (Subtarget.hasP8Altivec()) {
+        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
+        setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+
+        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
+      }
+      else {
+        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
+        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
+        setOperationAction(ISD::SRL, MVT::v2i64, Expand);
+
+        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+
+        // VSX v2i64 only supports non-arithmetic operations.
+        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+      }
+
+      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+      AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
+      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+      AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
+
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+
+      // Vector operation legalization checks the result type of
+      // SIGN_EXTEND_INREG, overall legalization checks the inner type.
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+
+      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
+      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
+      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
+      setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+
+      if (Subtarget.hasDirectMove())
+        setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+
+      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
+    }
+
+    if (Subtarget.hasP8Altivec()) {
+      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
+      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
+    }
+
+    if (Subtarget.hasP9Vector()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+    }
+  }
+
+  if (Subtarget.hasQPX()) {
+    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+    setOperationAction(ISD::FREM, MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
+    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);
+
+    setOperationAction(ISD::LOAD  , MVT::v4f64, Custom);
+    setOperationAction(ISD::STORE , MVT::v4f64, Custom);
+
+    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
+    setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
+    setOperationAction(ISD::FP_ROUND_INREG , MVT::v4f32, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
+    setOperationAction(ISD::FABS , MVT::v4f64, Legal);
+    setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
+    setOperationAction(ISD::FPOWI , MVT::v4f64, Expand);
+    setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
+    setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
+    setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
+    setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);
+
+    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
+    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);
+
+    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);
+
+    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FREM, MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);
+
+    setOperationAction(ISD::LOAD  , MVT::v4f32, Custom);
+    setOperationAction(ISD::STORE , MVT::v4f32, Custom);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
+    setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
+    setOperationAction(ISD::FABS , MVT::v4f32, Legal);
+    setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOWI , MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+
+    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
+    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);
+
+    setOperationAction(ISD::AND , MVT::v4i1, Legal);
+    setOperationAction(ISD::OR , MVT::v4i1, Legal);
+    setOperationAction(ISD::XOR , MVT::v4i1, Legal);
+
+    if (!Subtarget.useCRBits())
+      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
+    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);
+
+    setOperationAction(ISD::LOAD  , MVT::v4i1, Custom);
+    setOperationAction(ISD::STORE , MVT::v4i1, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+
+    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);
+
+    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::v4f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);
+
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+
+    // These need to set FE_INEXACT, and so cannot be vectorized here.
+    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    } else {
+      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
+      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);
+
+      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+    }
+  }
+
+  if (Subtarget.has64BitSupport())
+    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
+
+  if (!isPPC64) {
+    setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  }
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  if (Subtarget.hasAltivec()) {
+    // Altivec instructions set fields to all zeros or all ones.
+    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+  }
+
+  if (!isPPC64) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
+  if (Subtarget.hasFPCVT())
+    setTargetDAGCombine(ISD::UINT_TO_FP);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  if (Subtarget.useCRBits())
+    setTargetDAGCombine(ISD::BRCOND);
+  setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+
+  if (Subtarget.useCRBits()) {
+    setTargetDAGCombine(ISD::TRUNCATE);
+    setTargetDAGCombine(ISD::SETCC);
+    setTargetDAGCombine(ISD::SELECT_CC);
+  }
+
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
+  // Darwin long double math library functions have $LDBL128 appended.
+  if (Subtarget.isDarwin()) {
+    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+  }
+
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  if (Subtarget.useCRBits()) {
+    setHasMultipleConditionRegisters();
+    setJumpIsExpensive();
+  }
+
+  setMinFunctionAlignment(2);
+  if (Subtarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  case PPC::DIR_PWR9:
+    setPrefFunctionAlignment(4);
+    setPrefLoopAlignment(4);
+    break;
+  }
+
+  if (Subtarget.enableMachineScheduler())
+    setSchedulingPreference(Sched::Source);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  computeRegisterProperties(STI.getRegisterInfo());
+
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
+    MaxStoresPerMemset = 32;
+    MaxStoresPerMemsetOptSize = 16;
+    MaxStoresPerMemcpy = 32;
+    MaxStoresPerMemcpyOptSize = 8;
+    MaxStoresPerMemmove = 32;
+    MaxStoresPerMemmoveOptSize = 8;
+  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 also benefits from (very) aggressive inlining of memcpy and
+    // friends. The overhead of a the function call, even when warm, can be
+    // over one hundred cycles.
+    MaxStoresPerMemset = 128;
+    MaxStoresPerMemcpy = 128;
+    MaxStoresPerMemmove = 128;
+  }
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  if (MaxAlign == MaxMaxAlign)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+      MaxAlign = 32;
+    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (auto *EltTy : STy->elements()) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == MaxMaxAlign)
+        break;
+    }
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
+                                                  const DataLayout &DL) const {
+  // Darwin passes everything on 4 byte boundary.
+  if (Subtarget.isDarwin())
+    return 4;
+
+  // 16byte and wider vectors are passed on 16byte boundary.
+  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
+  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
+    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
+  return Align;
+}
+
+bool PPCTargetLowering::useSoftFloat() const {
+  return Subtarget.useSoftFloat();
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((PPCISD::NodeType)Opcode) {
+  case PPCISD::FIRST_NUMBER:    break;
+  case PPCISD::FSEL:            return "PPCISD::FSEL";
+  case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCFIDU:          return "PPCISD::FCFIDU";
+  case PPCISD::FCFIDS:          return "PPCISD::FCFIDS";
+  case PPCISD::FCFIDUS:         return "PPCISD::FCFIDUS";
+  case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FCTIDUZ:         return "PPCISD::FCTIDUZ";
+  case PPCISD::FCTIWUZ:         return "PPCISD::FCTIWUZ";
+  case PPCISD::FRE:             return "PPCISD::FRE";
+  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
+  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
+  case PPCISD::XXINSERT:        return "PPCISD::XXINSERT";
+  case PPCISD::VECSHL:          return "PPCISD::VECSHL";
+  case PPCISD::CMPB:            return "PPCISD::CMPB";
+  case PPCISD::Hi:              return "PPCISD::Hi";
+  case PPCISD::Lo:              return "PPCISD::Lo";
+  case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
+  case PPCISD::DYNAREAOFFSET:   return "PPCISD::DYNAREAOFFSET";
+  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL:             return "PPCISD::SRL";
+  case PPCISD::SRA:             return "PPCISD::SRA";
+  case PPCISD::SHL:             return "PPCISD::SHL";
+  case PPCISD::SRA_ADDZE:       return "PPCISD::SRA_ADDZE";
+  case PPCISD::CALL:            return "PPCISD::CALL";
+  case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::MTCTR:           return "PPCISD::MTCTR";
+  case PPCISD::BCTRL:           return "PPCISD::BCTRL";
+  case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
+  case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::READ_TIME_BASE:  return "PPCISD::READ_TIME_BASE";
+  case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
+  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
+  case PPCISD::MFOCRF:          return "PPCISD::MFOCRF";
+  case PPCISD::MFVSR:           return "PPCISD::MFVSR";
+  case PPCISD::MTVSRA:          return "PPCISD::MTVSRA";
+  case PPCISD::MTVSRZ:          return "PPCISD::MTVSRZ";
+  case PPCISD::SINT_VEC_TO_FP:  return "PPCISD::SINT_VEC_TO_FP";
+  case PPCISD::UINT_VEC_TO_FP:  return "PPCISD::UINT_VEC_TO_FP";
+  case PPCISD::ANDIo_1_EQ_BIT:  return "PPCISD::ANDIo_1_EQ_BIT";
+  case PPCISD::ANDIo_1_GT_BIT:  return "PPCISD::ANDIo_1_GT_BIT";
+  case PPCISD::VCMP:            return "PPCISD::VCMP";
+  case PPCISD::VCMPo:           return "PPCISD::VCMPo";
+  case PPCISD::LBRX:            return "PPCISD::LBRX";
+  case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LFIWAX:          return "PPCISD::LFIWAX";
+  case PPCISD::LFIWZX:          return "PPCISD::LFIWZX";
+  case PPCISD::LXSIZX:          return "PPCISD::LXSIZX";
+  case PPCISD::STXSIX:          return "PPCISD::STXSIX";
+  case PPCISD::VEXTS:           return "PPCISD::VEXTS";
+  case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
+  case PPCISD::STXVD2X:         return "PPCISD::STXVD2X";
+  case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
+  case PPCISD::BDNZ:            return "PPCISD::BDNZ";
+  case PPCISD::BDZ:             return "PPCISD::BDZ";
+  case PPCISD::MFFS:            return "PPCISD::MFFS";
+  case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
+  case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
+  case PPCISD::PPC32_GOT:       return "PPCISD::PPC32_GOT";
+  case PPCISD::PPC32_PICGOT:    return "PPCISD::PPC32_PICGOT";
+  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
+  case PPCISD::LD_GOT_TPREL_L:  return "PPCISD::LD_GOT_TPREL_L";
+  case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
+  case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
+  case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
+  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
+  case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
+  case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
+  case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
+  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
+  case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
+  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
+  case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
+  case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
+  case PPCISD::SC:              return "PPCISD::SC";
+  case PPCISD::CLRBHRB:         return "PPCISD::CLRBHRB";
+  case PPCISD::MFBHRBE:         return "PPCISD::MFBHRBE";
+  case PPCISD::RFEBB:           return "PPCISD::RFEBB";
+  case PPCISD::XXSWAPD:         return "PPCISD::XXSWAPD";
+  case PPCISD::SWAP_NO_CHAIN:   return "PPCISD::SWAP_NO_CHAIN";
+  case PPCISD::QVFPERM:         return "PPCISD::QVFPERM";
+  case PPCISD::QVGPCI:          return "PPCISD::QVGPCI";
+  case PPCISD::QVALIGNI:        return "PPCISD::QVALIGNI";
+  case PPCISD::QVESPLATI:       return "PPCISD::QVESPLATI";
+  case PPCISD::QBFLT:           return "PPCISD::QBFLT";
+  case PPCISD::QVLFSb:          return "PPCISD::QVLFSb";
+  }
+  return nullptr;
+}
+
+EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
+                                          EVT VT) const {
+  if (!VT.isVector())
+    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
+
+  if (Subtarget.hasQPX())
+    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
+
+  return VT.changeVectorElementTypeToInteger();
+}
+
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+        return CFP->getValueAPF().isZero();
+  }
+  return false;
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+  return Op < 0 || Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
+  if (ShuffleKind == 0) {
+    if (IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = IsLE ? 0 : 1;
+    for (unsigned i = 0; i != 8; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i),    i*2+j) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
+  if (ShuffleKind == 0) {
+    if (IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = IsLE ? 0 : 2;
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
+/// current subtarget.
+///
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  const PPCSubtarget& Subtarget =
+    static_cast<const PPCSubtarget&>(DAG.getSubtarget());
+  if (!Subtarget.hasP8Vector())
+    return false;
+
+  bool IsLE = DAG.getDataLayout().isLittleEndian();
+  if (ShuffleKind == 0) {
+    if (IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; i += 4)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+4) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+5) ||
+          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+6) ||
+          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+7))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; i += 4)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+3))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = IsLE ? 0 : 4;
+    for (unsigned i = 0; i != 8; i += 4)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+2),  i*2+j+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+3),  i*2+j+3) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getDataLayout().isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, UnitSize, 0, 16);
+    else
+      return false;
+  } else {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 0) // normal
+      return isVMerge(N, UnitSize, 8, 24);
+    else
+      return false;
+  }
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getDataLayout().isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, UnitSize, 8, 24);
+    else
+      return false;
+  } else {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 0) // normal
+      return isVMerge(N, UnitSize, 0, 16);
+    else
+      return false;
+  }
+}
+
+/**
+ * \brief Common function used to match vmrgew and vmrgow shuffles
+ *
+ * The indexOffset determines whether to look for even or odd words in
+ * the shuffle mask. This is based on the of the endianness of the target
+ * machine.
+ *   - Little Endian:
+ *     - Use offset of 0 to check for odd elements
+ *     - Use offset of 4 to check for even elements
+ *   - Big Endian:
+ *     - Use offset of 0 to check for even elements
+ *     - Use offset of 4 to check for odd elements
+ * A detailed description of the vector element ordering for little endian and
+ * big endian can be found at
+ * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
+ * Targeting your applications - what little endian and big endian IBM XL C/C++
+ * compiler differences mean to you
+ *
+ * The mask to the shuffle vector instruction specifies the indices of the
+ * elements from the two input vectors to place in the result. The elements are
+ * numbered in array-access order, starting with the first vector. These vectors
+ * are always of type v16i8, thus each vector will contain 16 elements of size
+ * 8. More info on the shuffle vector can be found in the
+ * http://llvm.org/docs/LangRef.html#shufflevector-instruction
+ * Language Reference.
+ *
+ * The RHSStartValue indicates whether the same input vectors are used (unary)
+ * or two different input vectors are used, based on the following:
+ *   - If the instruction uses the same vector for both inputs, the range of the
+ *     indices will be 0 to 15. In this case, the RHSStart value passed should
+ *     be 0.
+ *   - If the instruction has two different vectors then the range of the
+ *     indices will be 0 to 31. In this case, the RHSStart value passed should
+ *     be 16 (indices 0-15 specify elements in the first vector while indices 16
+ *     to 31 specify elements in the second vector).
+ *
+ * \param[in] N The shuffle vector SD Node to analyze
+ * \param[in] IndexOffset Specifies whether to look for even or odd elements
+ * \param[in] RHSStartValue Specifies the starting index for the righthand input
+ * vector to the shuffle_vector instruction
+ * \return true iff this shuffle vector represents an even or odd word merge
+ */
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
+                     unsigned RHSStartValue) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+
+  for (unsigned i = 0; i < 2; ++i)
+    for (unsigned j = 0; j < 4; ++j)
+      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
+                             i*RHSStartValue+j+IndexOffset) ||
+          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
+                             i*RHSStartValue+j+IndexOffset+8))
+        return false;
+  return true;
+}
+
+/**
+ * \brief Determine if the specified shuffle mask is suitable for the vmrgew or
+ * vmrgow instructions.
+ *
+ * \param[in] N The shuffle vector SD Node to analyze
+ * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
+ * \param[in] ShuffleKind Identify the type of merge:
+ *   - 0 = big-endian merge with two different inputs;
+ *   - 1 = either-endian merge with two identical inputs;
+ *   - 2 = little-endian merge with two different inputs (inputs are swapped for
+ *     little-endian merges).
+ * \param[in] DAG The current SelectionDAG
+ * \return true iff this shuffle mask
+ */
+bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
+                              unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getDataLayout().isLittleEndian()) {
+    unsigned indexOffset = CheckEven ? 4 : 0;
+    if (ShuffleKind == 1) // Unary
+      return isVMerge(N, indexOffset, 0);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, indexOffset, 16);
+    else
+      return false;
+  }
+  else {
+    unsigned indexOffset = CheckEven ? 0 : 4;
+    if (ShuffleKind == 1) // Unary
+      return isVMerge(N, indexOffset, 0);
+    else if (ShuffleKind == 0) // Normal
+      return isVMerge(N, indexOffset, 16);
+    else
+      return false;
+  }
+  return false;
+}
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
+/// The ShuffleKind distinguishes between big-endian operations with two
+/// different inputs (0), either-endian operations with two identical inputs
+/// (1), and little-endian operations with two different inputs (2).  For the
+/// latter, the input operands are swapped (see PPCInstrAltivec.td).
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                             SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return -1;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 16) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  if (ShiftAmt < i) return -1;
+
+  ShiftAmt -= i;
+  bool isLE = DAG.getDataLayout().isLittleEndian();
+
+  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else if (ShuffleKind == 1) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  } else
+    return -1;
+
+  if (isLE)
+    ShiftAmt = 16 - ShiftAmt;
+
+  return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+  // The consecutive indices need to specify an element, not part of two
+  // different elements.  So abandon ship early if this isn't the case.
+  if (N->getMaskElt(0) % EltSize != 0)
+    return false;
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned ElementBase = N->getMaskElt(0);
+
+  // FIXME: Handle UNDEF elements too!
+  if (ElementBase >= 16)
+    return false;
+
+  // Check that the indices are consecutive, in the case of a multi-byte element
+  // splatted with a v16i8 mask.
+  for (unsigned i = 1; i != EltSize; ++i)
+    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+      return false;
+
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getMaskElt(i) < 0) continue;
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getMaskElt(i+j) != N->getMaskElt(j))
+        return false;
+  }
+  return true;
+}
+
+bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+
+  // Check that the mask is shuffling words
+  for (unsigned i = 0; i < 4; ++i) {
+    unsigned B0 = N->getMaskElt(i*4);
+    unsigned B1 = N->getMaskElt(i*4+1);
+    unsigned B2 = N->getMaskElt(i*4+2);
+    unsigned B3 = N->getMaskElt(i*4+3);
+    if (B0 % 4)
+      return false;
+    if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1)
+      return false;
+  }
+
+  // Now we look at mask elements 0,4,8,12
+  unsigned M0 = N->getMaskElt(0) / 4;
+  unsigned M1 = N->getMaskElt(4) / 4;
+  unsigned M2 = N->getMaskElt(8) / 4;
+  unsigned M3 = N->getMaskElt(12) / 4;
+  unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
+  unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
+
+  // Below, let H and L be arbitrary elements of the shuffle mask
+  // where H is in the range [4,7] and L is in the range [0,3].
+  // H, 1, 2, 3 or L, 5, 6, 7
+  if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
+      (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
+    ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
+    InsertAtByte = IsLE ? 12 : 0;
+    Swap = M0 < 4;
+    return true;
+  }
+  // 0, H, 2, 3 or 4, L, 6, 7
+  if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
+      (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
+    ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
+    InsertAtByte = IsLE ? 8 : 4;
+    Swap = M1 < 4;
+    return true;
+  }
+  // 0, 1, H, 3 or 4, 5, L, 7
+  if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
+      (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
+    ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
+    InsertAtByte = IsLE ? 4 : 8;
+    Swap = M2 < 4;
+    return true;
+  }
+  // 0, 1, 2, H or 4, 5, 6, L
+  if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
+      (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
+    ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
+    InsertAtByte = IsLE ? 0 : 12;
+    Swap = M3 < 4;
+    return true;
+  }
+
+  // If both vector operands for the shuffle are the same vector, the mask will
+  // contain only elements from the first one and the second one will be undef.
+  if (N->getOperand(1).isUndef()) {
+    ShiftElts = 0;
+    Swap = true;
+    unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
+    if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
+      InsertAtByte = IsLE ? 12 : 0;
+      return true;
+    }
+    if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
+      InsertAtByte = IsLE ? 8 : 4;
+      return true;
+    }
+    if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
+      InsertAtByte = IsLE ? 4 : 8;
+      return true;
+    }
+    if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
+      InsertAtByte = IsLE ? 0 : 12;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
+                                SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(SVOp, EltSize));
+  if (DAG.getDataLayout().isLittleEndian())
+    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
+  else
+    return SVOp->getMaskElt(0) / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDValue OpVal(nullptr, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+  // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDValue UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+    // See if all of the elements in the buildvector agree across.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).isUndef()) continue;
+      // If the element isn't a constant, bail fully out.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+      if (!UniquedVals[i&(Multiple-1)].getNode())
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDValue();  // no match.
+    }
+
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.
+
+      LeadingZero &= isNullConstant(UniquedVals[i]);
+      LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
+    }
+    // Finally, check the least significant entry.
+    if (LeadingZero) {
+      if (!UniquedVals[Multiple-1].getNode())
+        return DAG.getTargetConstant(0, SDLoc(N), MVT::i32);  // 0,0,0,undef
+      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+      if (Val < 16)                                   // 0,0,0,4 -> vspltisw(4)
+        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
+    }
+    if (LeadingOnes) {
+      if (!UniquedVals[Multiple-1].getNode())
+        return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
+      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
+        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
+    }
+
+    return SDValue();
+  }
+
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).isUndef()) continue;
+    if (!OpVal.getNode())
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return SDValue();
+  }
+
+  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.
+
+  unsigned ValSizeInBytes = EltSize;
+  uint64_t Value = 0;
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+    Value = CN->getZExtValue();
+  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+    Value = FloatToBits(CN->getValueAPF().convertToFloat());
+  }
+
+  // If the splat value is larger than the element value, then we can never do
+  // this splat.  The only case that we could fit the replicated bits into our
+  // immediate field for would be zero, and we prefer to use vxor for it.
+  if (ValSizeInBytes < ByteSize) return SDValue();
+
+  // If the element value is larger than the splat value, check if it consists
+  // of a repeated bit pattern of size ByteSize.
+  if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
+    return SDValue();
+
+  // Properly sign extend the value.
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
+
+  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+  if (MaskVal == 0) return SDValue();
+
+  // Finally, if this value fits in a 5 bit sext field, return it
+  if (SignExtend32<5>(MaskVal) == MaskVal)
+    return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
+  return SDValue();
+}
+
+/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
+/// amount, otherwise return -1.
+int PPC::isQVALIGNIShuffleMask(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
+    return -1;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 4) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  if (ShiftAmt < i) return -1;
+  ShiftAmt -= i;
+
+  // Check the rest of the elements to see if they are consecutive.
+  for (++i; i != 4; ++i)
+    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+      return -1;
+
+  return ShiftAmt;
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+/// SelectAddressRegReg - Given the specified addressed, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+                                            SDValue &Index,
+                                            SelectionDAG &DAG) const {
+  short imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+      return false;    // r+i
+
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i can fold it if we can.
+
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are provably
+    // disjoint.
+    APInt LHSKnownZero, LHSKnownOne;
+    APInt RHSKnownZero, RHSKnownOne;
+    DAG.computeKnownBits(N.getOperand(0),
+                         LHSKnownZero, LHSKnownOne);
+
+    if (LHSKnownZero.getBoolValue()) {
+      DAG.computeKnownBits(N.getOperand(1),
+                           RHSKnownZero, RHSKnownOne);
+      // If all of the bits are known zero on the LHS or RHS, the add won't
+      // carry.
+      if (~(LHSKnownZero | RHSKnownZero) == 0) {
+        Base = N.getOperand(0);
+        Index = N.getOperand(1);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4). The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot. As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+  // FIXME: This does not handle the LWA case.
+  if (VT != MVT::i64)
+    return;
+
+  // NOTE: We'll exclude negative FIs here, which come from argument
+  // lowering, because there are no known test cases triggering this problem
+  // using packed structures (or similar). We can remove this exclusion if
+  // we find such a test case. The reason why this is so test-case driven is
+  // because this entire 'fixup' is only to prevent crashes (from the
+  // register scavenger) on not-really-valid inputs. For example, if we have:
+  //   %a = alloca i1
+  //   %b = bitcast i1* %a to i64*
+  //   store i64* a, i64 b
+  // then the store should really be marked as 'align 1', but is not. If it
+  // were marked as 'align 1' then the indexed form would have been
+  // instruction-selected initially, and the problem this 'fixup' is preventing
+  // won't happen regardless.
+  if (FrameIdx < 0)
+    return;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+  if (Align >= 4)
+    return;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasNonRISpills();
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 16-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg.  If Aligned is true, only accept displacements
+/// suitable for STD and friends, i.e. multiples of 4.
+bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
+                                            SDValue &Base,
+                                            SelectionDAG &DAG,
+                                            bool Aligned) const {
+  // FIXME dl should come from parent load or store, not from address
+  SDLoc dl(N);
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
+      Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      APInt LHSKnownZero, LHSKnownOne;
+      DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        if (FrameIndexSDNode *FI =
+              dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+        } else {
+          Base = N.getOperand(0);
+        }
+        Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.
+
+    // If this address fits entirely in a 16-bit sext immediate field, codegen
+    // this as "d, 0"
+    short Imm;
+    if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
+      Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
+      Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                             CN->getValueType(0));
+      return true;
+    }
+
+    // Handle 32-bit sext immediates with LIS + addr mode.
+    if ((CN->getValueType(0) == MVT::i32 ||
+         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+        (!Aligned || (CN->getZExtValue() & 3) == 0)) {
+      int Addr = (int)CN->getZExtValue();
+
+      // Otherwise, break this down into an LIS + disp.
+      Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
+
+      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
+                                   MVT::i32);
+      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
+      return true;
+    }
+  }
+
+  Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+  } else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+                                                SDValue &Index,
+                                                SelectionDAG &DAG) const {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                         N.getValueType());
+  Index = N;
+  return true;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  if (DisablePPCPreinc) return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  unsigned Alignment;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getMemoryVT();
+    Alignment = LD->getAlignment();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+    Alignment = ST->getAlignment();
+    isLoad = false;
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors (except
+  // for QPX, which does have preinc r+r forms).
+  if (VT.isVector()) {
+    if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
+      return false;
+    } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
+      AM = ISD::PRE_INC;
+      return true;
+    }
+  }
+
+  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
+
+    // Common code will reject creating a pre-inc form if the base pointer
+    // is a frame index, or if N is a store and the base pointer is either
+    // the same as or a predecessor of the value being stored.  Check for
+    // those situations here, and try with swapped Base/Offset instead.
+    bool Swap = false;
+
+    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
+      Swap = true;
+    else if (!isLoad) {
+      SDValue Val = cast<StoreSDNode>(N)->getValue();
+      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
+        Swap = true;
+    }
+
+    if (Swap)
+      std::swap(Base, Offset);
+
+    AM = ISD::PRE_INC;
+    return true;
+  }
+
+  // LDU/STU can only handle immediates that are a multiple of 4.
+  if (VT != MVT::i64) {
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
+      return false;
+  } else {
+    // LDU/STU need an address with at least 4-byte alignment.
+    if (Alignment < 4)
+      return false;
+
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
+      return false;
+  }
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
+    // sext i32 to i64 when addr mode is r+i.
+    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+        LD->getExtensionType() == ISD::SEXTLOAD &&
+        isa<ConstantSDNode>(Offset))
+      return false;
+  }
+
+  AM = ISD::PRE_INC;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+/// Return true if we should reference labels using a PICBase, set the HiOpFlags
+/// and LoOpFlags to the target MO flags.
+static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
+                               unsigned &HiOpFlags, unsigned &LoOpFlags,
+                               const GlobalValue *GV = nullptr) {
+  HiOpFlags = PPCII::MO_HA;
+  LoOpFlags = PPCII::MO_LO;
+
+  // Don't use the pic base if not in PIC relocation model.
+  if (IsPIC) {
+    HiOpFlags |= PPCII::MO_PIC_FLAG;
+    LoOpFlags |= PPCII::MO_PIC_FLAG;
+  }
+
+  // If this is a reference to a global value that requires a non-lazy-ptr, make
+  // sure that instruction lowering adds it.
+  if (GV && Subtarget.hasLazyResolverStub(GV)) {
+    HiOpFlags |= PPCII::MO_NLP_FLAG;
+    LoOpFlags |= PPCII::MO_NLP_FLAG;
+
+    if (GV->hasHiddenVisibility()) {
+      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+    }
+  }
+}
+
+static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
+                             SelectionDAG &DAG) {
+  SDLoc DL(HiPart);
+  EVT PtrVT = HiPart.getValueType();
+  SDValue Zero = DAG.getConstant(0, DL, PtrVT);
+
+  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
+
+  // With PIC, the first instruction is actually "GR+hi(&G)".
+  if (isPIC)
+    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
+
+  // Generate non-pic code that has direct accesses to the constant pool.
+  // The address of the global is just (hi(&g)+lo(&g)).
+  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+}
+
+static void setUsesTOCBasePtr(MachineFunction &MF) {
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setUsesTOCBasePtr();
+}
+
+static void setUsesTOCBasePtr(SelectionDAG &DAG) {
+  setUsesTOCBasePtr(DAG.getMachineFunction());
+}
+
+static SDValue getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, bool Is64Bit,
+                           SDValue GA) {
+  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) :
+                DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
+
+  SDValue Ops[] = { GA, Reg };
+  return DAG.getMemIntrinsicNode(
+      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
+      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true,
+      false, 0);
+}
+
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  const Constant *C = CP->getConstVal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return getTOCEntry(DAG, SDLoc(CP), true, GA);
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool IsPIC = isPositionIndependent();
+  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+
+  if (IsPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+                                           PPCII::MO_PIC_FLAG);
+    return getTOCEntry(DAG, SDLoc(CP), false, GA);
+  }
+
+  SDValue CPIHi =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+  SDValue CPILo =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+  return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
+}
+
+// For 64-bit PowerPC, prefer the more compact relative encodings.
+// This trades 32 bits per jump table entry for one or two instructions
+// on the jump site.
+unsigned PPCTargetLowering::getJumpTableEncoding() const {
+  if (isJumpTableRelative())
+    return MachineJumpTableInfo::EK_LabelDifference32;
+
+  return TargetLowering::getJumpTableEncoding();
+}
+
+bool PPCTargetLowering::isJumpTableRelative() const {
+  if (Subtarget.isPPC64())
+    return true;
+  return TargetLowering::isJumpTableRelative();
+}
+
+SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget.isPPC64())
+    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
+
+  switch (getTargetMachine().getCodeModel()) {
+  case CodeModel::Default:
+  case CodeModel::Small:
+  case CodeModel::Medium:
+    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
+  default:
+    return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
+                       getPointerTy(DAG.getDataLayout()));
+  }
+}
+
+const MCExpr *
+PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                                unsigned JTI,
+                                                MCContext &Ctx) const {
+  if (!Subtarget.isPPC64())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  switch (getTargetMachine().getCodeModel()) {
+  case CodeModel::Default:
+  case CodeModel::Small:
+  case CodeModel::Medium:
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+  default:
+    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+  }
+}
+
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return getTOCEntry(DAG, SDLoc(JT), true, GA);
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool IsPIC = isPositionIndependent();
+  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+
+  if (IsPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                        PPCII::MO_PIC_FLAG);
+    return getTOCEntry(DAG, SDLoc(GA), false, GA);
+  }
+
+  SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
+  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
+  return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *BA = BASDN->getBlockAddress();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual BlockAddress is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
+    SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+    return getTOCEntry(DAG, SDLoc(BASDN), true, GA);
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool IsPIC = isPositionIndependent();
+  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
+  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
+  return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  // FIXME: TLS addresses currently use medium model code sequences,
+  // which is the most useful form.  Eventually support for small and
+  // large models could be added if users need it, at the cost of
+  // additional complexity.
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  SDLoc dl(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  bool is64bit = Subtarget.isPPC64();
+  const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+  PICLevel::Level picLevel = M->getPICLevel();
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL_HA);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL_LO);
+    SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                                     is64bit ? MVT::i64 : MVT::i32);
+    SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+    return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+  }
+
+  if (Model == TLSModel::InitialExec) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                                PPCII::MO_TLS);
+    SDValue GOTPtr;
+    if (is64bit) {
+      setUsesTOCBasePtr(DAG);
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
+                           PtrVT, GOTReg, TGA);
+    } else
+      GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
+    SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
+                                   PtrVT, TGA, GOTPtr);
+    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
+  }
+
+  if (Model == TLSModel::GeneralDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTPtr;
+    if (is64bit) {
+      setUsesTOCBasePtr(DAG);
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+                                   GOTReg, TGA);
+    } else {
+      if (picLevel == PICLevel::SmallPIC)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
+    return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
+                       GOTPtr, TGA, TGA);
+  }
+
+  if (Model == TLSModel::LocalDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTPtr;
+    if (is64bit) {
+      setUsesTOCBasePtr(DAG);
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+                           GOTReg, TGA);
+    } else {
+      if (picLevel == PICLevel::SmallPIC)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
+    SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
+                                  PtrVT, GOTPtr, TGA, TGA);
+    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
+                                      PtrVT, TLSAddr, TGA);
+    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
+  }
+
+  llvm_unreachable("Unknown TLS model!");
+}
+
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSDN);
+  const GlobalValue *GV = GSDN->getGlobal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    setUsesTOCBasePtr(DAG);
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+    return getTOCEntry(DAG, DL, true, GA);
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool IsPIC = isPositionIndependent();
+  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
+
+  if (IsPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                            GSDN->getOffset(),
+                                            PPCII::MO_PIC_FLAG);
+    return getTOCEntry(DAG, DL, false, GA);
+  }
+
+  SDValue GAHi =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
+  SDValue GALo =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
+
+  SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
+
+  // If the global reference is actually to a non-lazy-pointer, we have to do an
+  // extra load to get the address of the global.
+  if (MOHiFlag & PPCII::MO_NLP_FLAG)
+    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+  return Ptr;
+}
+
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc dl(Op);
+
+  if (Op.getValueType() == MVT::v2i64) {
+    // When the operands themselves are v2i64 values, we need to do something
+    // special because VSX has no underlying comparison operations for these.
+    if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+      // Equality can be handled by casting to the legal type for Altivec
+      // comparisons, everything else needs to be expanded.
+      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+        return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                 DAG.getSetCC(dl, MVT::v4i32,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
+                   CC));
+      }
+
+      return SDValue();
+    }
+
+    // We handle most of these in the usual way.
+    return Op;
+  }
+
+  // If we're comparing for equality to zero, expose the fact that this is
+  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
+  // fold the new nodes.
+  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
+    return V;
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    // Leave comparisons against 0 and -1 alone for now, since they're usually
+    // optimized.  FIXME: revisit this when we can custom lower all setcc
+    // optimizations.
+    if (C->isAllOnesValue() || C->isNullValue())
+      return SDValue();
+  }
+
+  // If we have an integer seteq/setne, turn it into a compare against zero
+  // by xor'ing the rhs with the lhs, which is faster than setting a
+  // condition register, reading it back out, and masking the correct bit.  The
+  // normal approach here uses sub to do this instead of xor.  Using xor exposes
+  // the result to other bit-twiddling opportunities.
+  EVT LHSVT = Op.getOperand(0).getValueType();
+  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    EVT VT = Op.getValueType();
+    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
+                                Op.getOperand(1));
+    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+
+  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
+
+  // gpr_index
+  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
+  InChain = GprIndex.getValue(1);
+
+  if (VT == MVT::i64) {
+    // Check if GprIndex is even
+    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
+                                 DAG.getConstant(1, dl, MVT::i32));
+    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
+                                DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
+    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
+                                          DAG.getConstant(1, dl, MVT::i32));
+    // Align GprIndex to be even if it isn't
+    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
+                           GprIndex);
+  }
+
+  // fpr index is 1 byte after gpr
+  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                               DAG.getConstant(1, dl, MVT::i32));
+
+  // fpr
+  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    FprPtr, MachinePointerInfo(SV), MVT::i8);
+  InChain = FprIndex.getValue(1);
+
+  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                       DAG.getConstant(8, dl, MVT::i32));
+
+  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                        DAG.getConstant(4, dl, MVT::i32));
+
+  // areas
+  SDValue OverflowArea =
+      DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
+  InChain = OverflowArea.getValue(1);
+
+  SDValue RegSaveArea =
+      DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
+  InChain = RegSaveArea.getValue(1);
+
+  // select overflow_area if index > 8
+  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
+                            DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);
+
+  // adjustment constant gpr_index * 4/8
+  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
+                                    VT.isInteger() ? GprIndex : FprIndex,
+                                    DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
+                                                    MVT::i32));
+
+  // OurReg = RegSaveArea + RegConstant
+  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
+                               RegConstant);
+
+  // Floating types are 32 bytes into RegSaveArea
+  if (VT.isFloatingPoint())
+    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
+                         DAG.getConstant(32, dl, MVT::i32));
+
+  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
+  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   VT.isInteger() ? GprIndex : FprIndex,
+                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
+                                                   MVT::i32));
+
+  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
+                              VT.isInteger() ? VAListPtr : FprPtr,
+                              MachinePointerInfo(SV), MVT::i8);
+
+  // determine if we should load from reg_save_area or overflow_area
+  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
+
+  // increase overflow_area by 4/8 if gpr/fpr > 8
+  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
+                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                          dl, MVT::i32));
+
+  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
+                             OverflowAreaPlusN);
+
+  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
+                              MachinePointerInfo(), MVT::i32);
+
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
+
+  // We have to copy the entire va_list struct:
+  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
+  return DAG.getMemcpy(Op.getOperand(0), Op,
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
+                       false, MachinePointerInfo(), MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl(Op);
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  bool isPPC64 = (PtrVT == MVT::i64);
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp; Args.push_back(Entry);
+
+  // TrampSize == (isPPC64 ? 48 : 40);
+  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
+                               isPPC64 ? MVT::i64 : MVT::i32);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr; Args.push_back(Entry);
+  Entry.Node = Nest; Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+               std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  SDLoc dl(Op);
+
+  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV));
+  }
+
+  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
+  // We suppose the given va_list is already allocated.
+  //
+  // typedef struct {
+  //  char gpr;     /* index into the array of 8 GPRs
+  //                 * stored in the register save area
+  //                 * gpr=0 corresponds to r3,
+  //                 * gpr=1 to r4, etc.
+  //                 */
+  //  char fpr;     /* index into the array of 8 FPRs
+  //                 * stored in the register save area
+  //                 * fpr=0 corresponds to f1,
+  //                 * fpr=1 to f2, etc.
+  //                 */
+  //  char *overflow_arg_area;
+  //                /* location on stack that holds
+  //                 * the next overflow argument
+  //                 */
+  //  char *reg_save_area;
+  //               /* where r3:r10 and f1:f8 (if saved)
+  //                * are stored
+  //                */
+  // } va_list[1];
+
+  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
+  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
+  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
+                                            PtrVT);
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 PtrVT);
+
+  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
+  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);
+
+  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
+  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);
+
+  uint64_t FPROffset = 1;
+  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Store first byte : number of int regs
+  SDValue firstStore =
+      DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
+                        MachinePointerInfo(SV), MVT::i8);
+  uint64_t nextOffset = FPROffset;
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
+                                  ConstFPROffset);
+
+  // Store second byte : number of float regs
+  SDValue secondStore =
+      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
+                        MachinePointerInfo(SV, nextOffset), MVT::i8);
+  nextOffset += StackOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
+
+  // Store second word : arguments given on stack
+  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
+                                    MachinePointerInfo(SV, nextOffset));
+  nextOffset += FrameOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+  // Store third word : arguments given in registers
+  return DAG.getStore(thirdStore, dl, FR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset));
+}
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  return true;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                             MVT &LocVT,
+                                             CCValAssign::LocInfo &LocInfo,
+                                             ISD::ArgFlagsTy &ArgFlags,
+                                             CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+
+  // Skip one register if the first unallocated register has an even register
+  // number and there are still argument registers available which have not been
+  // allocated yet. RegNum is actually an index into ArgRegs, which means we
+  // need to skip a register if RegNum is odd.
+  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the first
+  // unallocated register has an odd register number and does not actually
+  // allocate a register for the current argument.
+  return false;
+}
+
+bool 
+llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
+                                                  MVT &LocVT,
+                                                  CCValAssign::LocInfo &LocInfo,
+                                                  ISD::ArgFlagsTy &ArgFlags,
+                                                  CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+  int RegsLeft = NumArgRegs - RegNum;
+
+  // Skip if there is not enough registers left for long double type (4 gpr regs 
+  // in soft float mode) and put long double argument on the stack.
+  if (RegNum != NumArgRegs && RegsLeft < 4) {
+    for (int i = 0; i < RegsLeft; i++) {
+      State.AllocateReg(ArgRegs[RegNum + i]);
+    }
+  }
+
+  return false;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                               MVT &LocVT,
+                                               CCValAssign::LocInfo &LocInfo,
+                                               ISD::ArgFlagsTy &ArgFlags,
+                                               CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs);
+
+  // If there is only one Floating-point register left we need to put both f64
+  // values of a split ppc_fp128 value on the stack.
+  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the two f64
+  // values a ppc_fp128 value is split into are both passed in registers or both
+  // passed on the stack and does not actually allocate a register for the
+  // current argument.
+  return false;
+}
+
+/// FPR - The set of FP registers that should be allocated for arguments,
+/// on Darwin.
+static const MCPhysReg FPR[] = {PPC::F1,  PPC::F2,  PPC::F3, PPC::F4, PPC::F5,
+                                PPC::F6,  PPC::F7,  PPC::F8, PPC::F9, PPC::F10,
+                                PPC::F11, PPC::F12, PPC::F13};
+
+/// QFPR - The set of QPX registers that should be allocated for arguments.
+static const MCPhysReg QFPR[] = {
+    PPC::QF1, PPC::QF2, PPC::QF3,  PPC::QF4,  PPC::QF5,  PPC::QF6, PPC::QF7,
+    PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  unsigned ArgSize = ArgVT.getStoreSize();
+  if (Flags.isByVal())
+    ArgSize = Flags.getByValSize();
+
+  // Round up to multiples of the pointer size, except for array members,
+  // which are always packed.
+  if (!Flags.isInConsecutiveRegs())
+    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  return ArgSize;
+}
+
+/// CalculateStackSlotAlignment - Calculates the alignment of this argument
+/// on the stack.
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                            ISD::ArgFlagsTy Flags,
+                                            unsigned PtrByteSize) {
+  unsigned Align = PtrByteSize;
+
+  // Altivec parameters are padded to a 16 byte boundary.
+  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+      ArgVT == MVT::v1i128)
+    Align = 16;
+  // QPX vector types stored in double-precision are padded to a 32 byte
+  // boundary.
+  else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
+    Align = 32;
+
+  // ByVal parameters are aligned as requested.
+  if (Flags.isByVal()) {
+    unsigned BVAlign = Flags.getByValAlign();
+    if (BVAlign > PtrByteSize) {
+      if (BVAlign % PtrByteSize != 0)
+          llvm_unreachable(
+            "ByVal alignment is not a multiple of the pointer size");
+
+      Align = BVAlign;
+    }
+  }
+
+  // Array members are always packed to their original alignment.
+  if (Flags.isInConsecutiveRegs()) {
+    // If the array member was split into multiple registers, the first
+    // needs to be aligned to the size of the full type.  (Except for
+    // ppcf128, which is only aligned as its f64 components.)
+    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+      Align = OrigVT.getStoreSize();
+    else
+      Align = ArgVT.getStoreSize();
+  }
+
+  return Align;
+}
+
+/// CalculateStackSlotUsed - Return whether this argument will use its
+/// stack slot (instead of being passed in registers).  ArgOffset,
+/// AvailableFPRs, and AvailableVRs must hold the current argument
+/// position, and will be updated to account for this argument.
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+                                   ISD::ArgFlagsTy Flags,
+                                   unsigned PtrByteSize,
+                                   unsigned LinkageSize,
+                                   unsigned ParamAreaSize,
+                                   unsigned &ArgOffset,
+                                   unsigned &AvailableFPRs,
+                                   unsigned &AvailableVRs, bool HasQPX) {
+  bool UseMemory = false;
+
+  // Respect alignment of argument on the stack.
+  unsigned Align =
+    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+  // If there's no space left in the argument save area, we must
+  // use memory (this check also catches zero-sized arguments).
+  if (ArgOffset >= LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // Allocate argument on the stack.
+  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  if (Flags.isInConsecutiveRegsLast())
+    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  // If we overran the argument save area, we must use memory
+  // (this check catches arguments passed partially in memory)
+  if (ArgOffset > LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // However, if the argument is actually passed in an FPR or a VR,
+  // we don't use memory after all.
+  if (!Flags.isByVal()) {
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
+        // QPX registers overlap with the scalar FP registers.
+        (HasQPX && (ArgVT == MVT::v4f32 ||
+                    ArgVT == MVT::v4f64 ||
+                    ArgVT == MVT::v4i1)))
+      if (AvailableFPRs > 0) {
+        --AvailableFPRs;
+        return false;
+      }
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
+        ArgVT == MVT::v1i128)
+      if (AvailableVRs > 0) {
+        --AvailableVRs;
+        return false;
+      }
+  }
+
+  return UseMemory;
+}
+
+/// EnsureStackAlignment - Round stack frame size up from NumBytes to
+/// ensure minimum alignment required for target.
+static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
+                                     unsigned NumBytes) {
+  unsigned TargetAlign = Lowering->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;
+  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
+  return NumBytes;
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget.isSVR4ABI()) {
+    if (Subtarget.isPPC64())
+      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+    else
+      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+  } else {
+    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+  }
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  // 32-bit SVR4 ABI Stack Frame Layout:
+  //              +-----------------------------------+
+  //        +-->  |            Back chain             |
+  //        |     +-----------------------------------+
+  //        |     | Floating-point register save area |
+  //        |     +-----------------------------------+
+  //        |     |    General register save area     |
+  //        |     +-----------------------------------+
+  //        |     |          CR save word             |
+  //        |     +-----------------------------------+
+  //        |     |         VRSAVE save word          |
+  //        |     +-----------------------------------+
+  //        |     |         Alignment padding         |
+  //        |     +-----------------------------------+
+  //        |     |     Vector register save area     |
+  //        |     +-----------------------------------+
+  //        |     |       Local variable space        |
+  //        |     +-----------------------------------+
+  //        |     |        Parameter list area        |
+  //        |     +-----------------------------------+
+  //        |     |           LR save word            |
+  //        |     +-----------------------------------+
+  // SP-->  +---  |            Back chain             |
+  //              +-----------------------------------+
+  //
+  // Specifications:
+  //   System V Application Binary Interface PowerPC Processor Supplement
+  //   AltiVec Technology Programming Interface Manual
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 4;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+  if (useSoftFloat())
+    CCInfo.PreAnalyzeFormalArguments(Ins);
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
+  CCInfo.clearWasPPCF128();
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      const TargetRegisterClass *RC;
+      EVT ValVT = VA.getValVT();
+
+      switch (ValVT.getSimpleVT().SimpleTy) {
+        default:
+          llvm_unreachable("ValVT not supported by formal arguments Lowering");
+        case MVT::i1:
+        case MVT::i32:
+          RC = &PPC::GPRCRegClass;
+          break;
+        case MVT::f32:
+          if (Subtarget.hasP8Vector())
+            RC = &PPC::VSSRCRegClass;
+          else
+            RC = &PPC::F4RCRegClass;
+          break;
+        case MVT::f64:
+          if (Subtarget.hasVSX())
+            RC = &PPC::VSFRCRegClass;
+          else
+            RC = &PPC::F8RCRegClass;
+          break;
+        case MVT::v16i8:
+        case MVT::v8i16:
+        case MVT::v4i32:
+          RC = &PPC::VRRCRegClass;
+          break;
+        case MVT::v4f32:
+          RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
+          break;
+        case MVT::v2f64:
+        case MVT::v2i64:
+          RC = &PPC::VRRCRegClass;
+          break;
+        case MVT::v4f64:
+          RC = &PPC::QFRCRegClass;
+          break;
+        case MVT::v4i1:
+          RC = &PPC::QBRCRegClass;
+          break;
+      }
+
+      // Transform the arguments stored in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
+                                            ValVT == MVT::i1 ? MVT::i32 : ValVT);
+
+      if (ValVT == MVT::i1)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
+
+      InVals.push_back(ArgValue);
+    } else {
+      // Argument stored in memory.
+      assert(VA.isMemLoc());
+
+      unsigned ArgSize = VA.getLocVT().getStoreSize();
+      int FI = MFI.CreateFixedObject(ArgSize, VA.getLocMemOffset(),
+                                     isImmutable);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(
+          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
+    }
+  }
+
+  // Assign locations to all of the incoming aggregate by value arguments.
+  // Aggregates passed by value are stored in the local variable space of the
+  // caller's stack frame, right above the parameter list area.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+  MinReservedArea = std::max(MinReservedArea, LinkageSize);
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized function's reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  SmallVector<SDValue, 8> MemOps;
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    static const MCPhysReg GPArgRegs[] = {
+      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+    };
+    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+
+    static const MCPhysReg FPArgRegs[] = {
+      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+      PPC::F8
+    };
+    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+
+    if (useSoftFloat())
+       NumFPArgRegs = 0;
+
+    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
+    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));
+
+    // Make room for NumGPArgRegs and NumFPArgRegs.
+    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
+                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
+
+    FuncInfo->setVarArgsStackOffset(
+      MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
+                            CCInfo.getNextStackOffset(), true));
+
+    FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // The fixed integer arguments of a variadic function are stored to the
+    // VarArgsFrameIndex on the stack so that they may be loaded by
+    // dereferencing the result of va_next.
+    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+
+    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+    // is set.
+    // The double arguments are stored to the VarArgsFrameIndex
+    // on the stack.
+    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+      MemOps.push_back(Store);
+      // Increment the address by eight for the next argument to store
+      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
+                                         PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
+SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
+                                             EVT ObjectVT, SelectionDAG &DAG,
+                                             SDValue ArgVal,
+                                             const SDLoc &dl) const {
+  if (Flags.isSExt())
+    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  else if (Flags.isZExt())
+    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+
+  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+         "fastcc not supported on varargs functions");
+
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 8;
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR);
+  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
+  const unsigned Num_VR_Regs  = array_lengthof(VR);
+  const unsigned Num_QFPR_Regs = Num_FPR_Regs;
+
+  // Do a first pass over the arguments to determine whether the ABI
+  // guarantees that our caller has allocated the parameter save area
+  // on its stack frame.  In the ELFv1 ABI, this is always the case;
+  // in the ELFv2 ABI, it is true if this is a vararg function or if
+  // any parameter is located in a stack slot.
+
+  bool HasParameterArea = !isELFv2ABI || isVarArg;
+  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = Num_FPR_Regs;
+  unsigned AvailableVRs = Num_VR_Regs;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (Ins[i].Flags.isNest())
+      continue;
+
+    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs,
+                               Subtarget.hasQPX()))
+      HasParameterArea = true;
+  }
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned &QFPR_idx = FPR_idx;
+  SmallVector<SDValue, 8> MemOps;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    EVT OrigVT = Ins[ArgNo].ArgVT;
+    unsigned ObjSize = ObjectVT.getStoreSize();
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
+    // We re-align the argument offset for each argument, except when using the
+    // fast calling convention, when we need to make sure we do that only when
+    // we'll actually use a stack slot.
+    unsigned CurArgOffset, Align;
+    auto ComputeArgOffset = [&]() {
+      /* Respect alignment of argument on the stack.  */
+      Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+      CurArgOffset = ArgOffset;
+    };
+
+    if (CallConv != CallingConv::Fast) {
+      ComputeArgOffset();
+
+      /* Compute GPR index associated with argument offset.  */
+      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
+    }
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
+      if (CallConv == CallingConv::Fast)
+        ComputeArgOffset();
+
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Empty aggregate parameters do not take up registers.  Examples:
+      //   struct { } a;
+      //   union  { } b;
+      //   int c[0];
+      // etc.  However, we have to provide a place-holder in InVals, so
+      // pretend we have an 8-byte item at the current address for that
+      // purpose.
+      if (!ObjSize) {
+        int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+        InVals.push_back(FIN);
+        continue;
+      }
+
+      // Create a stack object covering all stack doublewords occupied
+      // by the argument.  If the argument is (fully or partially) on
+      // the stack, or if the argument is fully in registers but the
+      // caller has allocated the parameter save anyway, we can refer
+      // directly to the caller's stack frame.  Otherwise, create a
+      // local copy in our own frame.
+      int FI;
+      if (HasParameterArea ||
+          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
+        FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
+      else
+        FI = MFI.CreateStackObject(ArgSize, Align, false);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+
+      // Handle aggregates smaller than 8 bytes.
+      if (ObjSize < PtrByteSize) {
+        // The value of the object is its address, which differs from the
+        // address of the enclosing doubleword on big-endian systems.
+        SDValue Arg = FIN;
+        if (!isLittleEndian) {
+          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
+          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
+        }
+        InVals.push_back(Arg);
+
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store;
+
+          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
+                                      MachinePointerInfo(&*FuncArg), ObjType);
+          } else {
+            // For sizes that don't fit a truncating store (3, 5, 6, 7),
+            // store the whole register as-is to the parameter save area
+            // slot.
+            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(&*FuncArg));
+          }
+
+          MemOps.push_back(Store);
+        }
+        // Whether we copied from a register or not, advance the offset
+        // into the parameter save area by a full doubleword.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // The value of the object is its address, which is the address of
+      // its first stack doubleword.
+      InVals.push_back(FIN);
+
+      // Store whatever pieces of the object are in registers to memory.
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        if (GPR_idx == Num_GPR_Regs)
+          break;
+
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+        SDValue Addr = FIN;
+        if (j) {
+          SDValue Off = DAG.getConstant(j, dl, PtrVT);
+          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
+        }
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+                                     MachinePointerInfo(&*FuncArg, j));
+        MemOps.push_back(Store);
+        ++GPR_idx;
+      }
+      ArgOffset += ArgSize;
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      if (Flags.isNest()) {
+        // The 'nest' parameter, if any, is passed in R11.
+        unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        break;
+      }
+
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // float aggregates.
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx],
+                              Subtarget.hasP8Vector()
+                                  ? &PPC::VSSRCRegClass
+                                  : &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
+                                                ? &PPC::VSFRCRegClass
+                                                : &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
+        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+        // once we support fp <-> gpr moves.
+
+        // This can only ever happen in the presence of f32 array types,
+        // since otherwise we never run out of FPRs before running out
+        // of GPRs.
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::f32) {
+          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, dl, MVT::i32));
+          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+        }
+
+        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
+        needsLoad = true;
+      }
+
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      if (CallConv != CallingConv::Fast || needsLoad) {
+        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+        ArgOffset += ArgSize;
+        if (Flags.isInConsecutiveRegsLast())
+          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      }
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+    case MVT::v2f64:
+    case MVT::v2i64:
+    case MVT::v1i128:
+      if (!Subtarget.hasQPX()) {
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // vector aggregates.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++VR_idx;
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+
+        needsLoad = true;
+      }
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += 16;
+      break;
+      } // not QPX
+
+      assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
+             "Invalid QPX parameter type");
+      /* fall through */
+
+    case MVT::v4f64:
+    case MVT::v4i1:
+      // QPX vectors are treated like their scalar floating-point subregisters
+      // (except that they're larger).
+      unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
+      if (QFPR_idx != Num_QFPR_Regs) {
+        const TargetRegisterClass *RC;
+        switch (ObjectVT.getSimpleVT().SimpleTy) {
+        case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
+        case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
+        default:         RC = &PPC::QBRCRegClass; break;
+        }
+
+        unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++QFPR_idx;
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputeArgOffset();
+        needsLoad = true;
+      }
+      if (CallConv != CallingConv::Fast || needsLoad)
+        ArgOffset += Sz;
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      if (ObjSize < ArgSize && !isLittleEndian)
+        CurArgOffset += ArgSize - ObjSize;
+      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea;
+  if (HasParameterArea)
+    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+  else
+    MinReservedArea = LinkageSize;
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI.CreateFixedObject(PtrByteSize, Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  bool isPPC64 = PtrVT == MVT::i64;
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned ArgOffset = LinkageSize;
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
+  const unsigned Num_VR_Regs  = array_lengthof( VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  // In 32-bit non-varargs functions, the stack space for vectors is after the
+  // stack space for non-vectors.  We do not use this space unless we have
+  // too many vectors to fit in registers, something that only occurs in
+  // constructed examples:), but we have to walk the arglist to figure
+  // that out...for the pathological case, compute VecArgOffset as the
+  // start of the vector parameter area.  Computing VecArgOffset is the
+  // entire point of the following loop.
+  unsigned VecArgOffset = ArgOffset;
+  if (!isVarArg && !isPPC64) {
+    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
+         ++ArgNo) {
+      EVT ObjectVT = Ins[ArgNo].VT;
+      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+      if (Flags.isByVal()) {
+        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+        unsigned ObjSize = Flags.getByValSize();
+        unsigned ArgSize =
+                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+        VecArgOffset += ArgSize;
+        continue;
+      }
+
+      switch(ObjectVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unhandled argument type!");
+      case MVT::i1:
+      case MVT::i32:
+      case MVT::f32:
+        VecArgOffset += 4;
+        break;
+      case MVT::i64:  // PPC64
+      case MVT::f64:
+        // FIXME: We are guaranteed to be !isPPC64 at this point.
+        // Does MVT::i64 apply?
+        VecArgOffset += 8;
+        break;
+      case MVT::v4f32:
+      case MVT::v4i32:
+      case MVT::v8i16:
+      case MVT::v16i8:
+        // Nothing to do, we're only looking at Nonvector args here.
+        break;
+      }
+    }
+  }
+  // We've found where the vector parameter area in memory is.  Skip the
+  // first 12 parameters; these don't use that memory.
+  VecArgOffset = ((VecArgOffset+15)/16)*16;
+  VecArgOffset += 12*16;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    if (Ins[ArgNo].isOrigArg()) {
+      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
+      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
+    }
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg || isPPC64) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else  nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
+
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Objects of size 1 and 2 are right justified, everything else is
+      // left justified.  This means the memory address is adjusted forwards.
+      if (ObjSize==1 || ObjSize==2) {
+        CurArgOffset = CurArgOffset + (4 - ObjSize);
+      }
+      // The value of the object is its address.
+      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+      if (ObjSize==1 || ObjSize==2) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
+          SDValue Store =
+              DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                MachinePointerInfo(&*FuncArg), ObjType);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+
+        ArgOffset += PtrByteSize;
+
+        continue;
+      }
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(&*FuncArg, j));
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i32:
+      if (!isPPC64) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+
+          if (ObjectVT == MVT::i1)
+            ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
+
+          ++GPR_idx;
+        } else {
+          needsLoad = true;
+          ArgSize = PtrByteSize;
+        }
+        // All int arguments reserve stack space in the Darwin ABI.
+        ArgOffset += PtrByteSize;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case MVT::i64:  // PPC64
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      // All int arguments reserve stack space in the Darwin ABI.
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+
+      // All FP arguments reserve stack space in the Darwin ABI.
+      ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        if (!isVarArg && !isPPC64) {
+          // Vectors go after all the nonvectors.
+          CurArgOffset = VecArgOffset;
+          VecArgOffset += 16;
+        } else {
+          // Vectors are aligned.
+          ArgOffset = ((ArgOffset+15)/16)*16;
+          CurArgOffset = ArgOffset;
+          ArgOffset += 16;
+        }
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined above
+    // that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI.CreateFixedObject(ObjSize,
+                                     CurArgOffset + (ArgSize - ObjSize),
+                                     isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+
+  // Area that is at least reserved in the caller of this function.
+  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  MinReservedArea =
+      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+  FuncInfo->setMinReservedArea(MinReservedArea);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
+                            Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+
+      if (isPPC64)
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      else
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+
+  return Chain;
+}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+/// adjusted to accommodate the arguments for the tailcall.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustement is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+static bool isFunctionGlobalAddress(SDValue Callee);
+
+static bool
+resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+  // If !G, Callee can be an external symbol.
+  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!G) return false;
+
+  const GlobalValue *GV = G->getGlobal();
+
+  if (GV->isDeclaration()) return false;
+
+  switch(GV->getLinkage()) {
+  default: llvm_unreachable("unknow linkage type");
+  case GlobalValue::AvailableExternallyLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    return false;
+
+  // Callee with weak linkage is allowed if it has hidden or protected
+  // visibility
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:     // e.g. c++ template instantiation
+    if (GV->hasDefaultVisibility())
+      return false;
+
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  }
+
+  // With '-fPIC', calling default visiblity function need insert 'nop' after
+  // function call, no matter that function resides in same module or not, so
+  // we treat it as in different module.
+  if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
+    return false;
+
+  return true;
+}
+
+static bool
+needStackSlotPassParameters(const PPCSubtarget &Subtarget,
+                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+
+  const unsigned PtrByteSize = 8;
+  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs = array_lengthof(VR);
+  const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = NumFPRs;
+  unsigned AvailableVRs = NumVRs;
+
+  for (const ISD::OutputArg& Param : Outs) {
+    if (Param.Flags.isNest()) continue;
+
+    if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs,
+                               Subtarget.hasQPX()))
+      return true;
+  }
+  return false;
+}
+
+static bool
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
+  if (CS->arg_size() != CallerFn->getArgumentList().size())
+    return false;
+
+  ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
+  ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
+
+  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
+    const Value* CalleeArg = *CalleeArgIter;
+    const Value* CallerArg = &(*CallerArgIter);
+    if (CalleeArg == CallerArg)
+      continue;
+
+    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
+    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
+    //      }
+    // 1st argument of callee is undef and has the same type as caller.
+    if (CalleeArg->getType() == CallerArg->getType() &&
+        isa<UndefValue>(CalleeArg))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+                                    SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    ImmutableCallSite *CS,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const {
+  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+
+  if (DisableSCO && !TailCallOpt) return false;
+
+  // Variadic argument functions are not supported.
+  if (isVarArg) return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+
+  // Tail or Sibling call optimization (TCO/SCO) needs callee and caller has
+  // the same calling convention
+  if (CallerCC != CalleeCC) return false;
+
+  // SCO support C calling convention
+  if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+    return false;
+
+  // Caller contains any byval parameter is not supported.
+  if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
+    return false;
+
+  // Callee contains any byval parameter is not supported, too.
+  // Note: This is a quick work around, because in some cases, e.g.
+  // caller's stack size > callee's stack size, we are still able to apply
+  // sibling call optimization. See: https://reviews.llvm.org/D23441#513574
+  if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
+    return false;
+
+  // No TCO/SCO on indirect call because Caller have to restore its TOC
+  if (!isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee))
+    return false;
+
+  // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
+  // (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
+  // module.
+  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+  if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+    return false;
+
+  // TCO allows altering callee ABI, so we don't have to check further.
+  if (CalleeCC == CallingConv::Fast && TailCallOpt)
+    return true;
+
+  if (DisableSCO) return false;
+
+  // If callee use the same argument list that caller is using, then we can
+  // apply SCO on this case. If it is not, then we need to check if callee needs
+  // stack for passing arguments.
+  if (!hasSameArgumentList(MF.getFunction(), CS) &&
+      needStackSlotPassParameters(Subtarget, Outs)) {
+    return false;
+  }
+
+  return true;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing by val parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+       ISD::ArgFlagsTy Flags = Ins[i].Flags;
+       if (Flags.isByVal()) return false;
+    }
+
+    // Non-PIC/GOT tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isCallCompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return nullptr;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      SignExtend32<26>(Addr) != Addr)
+    return nullptr;  // Top 6 bits have to be sext of immediate.
+
+  return DAG
+      .getConstant(
+          (int)C->getZExtValue() >> 2, SDLoc(Op),
+          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
+      .getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+  SDValue Arg;
+  SDValue FrameIdxOp;
+  int       FrameIdx;
+
+  TailCallArgumentInfo() : FrameIdx(0) {}
+};
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void StoreTailCallArgumentsToStackSlot(
+    SelectionDAG &DAG, SDValue Chain,
+    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to framepointer.
+    MemOpChains.push_back(DAG.getStore(
+        Chain, dl, Arg, FIN,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
+                                             SDValue OldRetAddr, SDValue OldFP,
+                                             int SPDiff, const SDLoc &dl) {
+  if (SPDiff) {
+    // Calculate the new stack slot for the return address.
+    MachineFunction &MF = DAG.getMachineFunction();
+    const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+    const PPCFrameLowering *FL = Subtarget.getFrameLowering();
+    bool isPPC64 = Subtarget.isPPC64();
+    int SlotSize = isPPC64 ? 8 : 4;
+    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
+    int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
+                                                         NewRetAddrLoc, true);
+    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
+    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+                         MachinePointerInfo::getFixedStack(MF, NewRetAddr));
+
+    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+    // slot as the FP is never overwritten.
+    if (Subtarget.isDarwinABI()) {
+      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
+      int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
+                                                         true);
+      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
+      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
+                           MachinePointerInfo::getFixedStack(
+                               DAG.getMachineFunction(), NewFPIdx));
+    }
+  }
+  return Chain;
+}
+
+/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
+/// the position of the argument.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
+  int Offset = ArgOffset + SPDiff;
+  uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
+  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+  SDValue FIN = DAG.getFrameIndex(FI, VT);
+  TailCallArgumentInfo Info;
+  Info.Arg = Arg;
+  Info.FrameIdxOp = FIN;
+  Info.FrameIdx = FI;
+  TailCallArguments.push_back(Info);
+}
+
+/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
+/// stack slot. Returns the chain as result and the loaded frame pointers in
+/// LROpOut/FPOpout. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
+    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
+    SDValue &FPOpOut, const SDLoc &dl) const {
+  if (SPDiff) {
+    // Load the LR and FP stack slot for later adjusting.
+    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
+    LROpOut = getReturnAddrFrameIndex(DAG);
+    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
+    Chain = SDValue(LROpOut.getNode(), 1);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+    // slot as the FP is never overwritten.
+    if (Subtarget.isDarwinABI()) {
+      FPOpOut = getFramePointerFrameIndex(DAG);
+      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
+      Chain = SDValue(FPOpOut.getNode(), 1);
+    }
+  }
+  return Chain;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       false, false, false, MachinePointerInfo(),
+                       MachinePointerInfo());
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
+/// tail calls.
+static void LowerMemOpCallTo(
+    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
+    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
+    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
+    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  if (!isTailCall) {
+    if (isVector) {
+      SDValue StackPtr;
+      if (isPPC64)
+        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+      else
+        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                           DAG.getConstant(ArgOffset, dl, PtrVT));
+    }
+    MemOpChains.push_back(
+        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+    // Calculate and remember argument location.
+  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+                                  TailCallArguments);
+}
+
+static void
+PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
+                SDValue FPOp,
+                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
+  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
+  // might overwrite each other in case of tail call optimization.
+  SmallVector<SDValue, 8> MemOpChains2;
+  // Do not flag preceding copytoreg stuff together with the following stuff.
+  InFlag = SDValue();
+  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
+                                    MemOpChains2, dl);
+  if (!MemOpChains2.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+  // Store the return address to the appropriate stack slot.
+  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
+
+  // Emit callseq_end just before tailcall node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+}
+
+// Is this global address that of a function that can be called by name? (as
+// opposed to something that must hold a descriptor for an indirect call).
+static bool isFunctionGlobalAddress(SDValue Callee) {
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
+        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
+      return false;
+
+    return G->getGlobal()->getValueType()->isFunctionTy();
+  }
+
+  return false;
+}
+
+static unsigned
+PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
+            SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall,
+            bool isPatchPoint, bool hasNest,
+            SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+            SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
+            ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
+
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.
+
+  unsigned CallOpc = PPCISD::CALL;
+
+  bool needIndirectCall = true;
+  if (!isSVR4ABI || !isPPC64)
+    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
+      // If this is an absolute destination address, use the munged value.
+      Callee = SDValue(Dest, 0);
+      needIndirectCall = false;
+    }
+
+  // PC-relative references to external symbols should go through $stub, unless
+  // we're building with the leopard linker or later, which automatically
+  // synthesizes these stubs.
+  const TargetMachine &TM = DAG.getTarget();
+  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+  const GlobalValue *GV = nullptr;
+  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    GV = G->getGlobal();
+  bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+  bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64;
+
+  if (isFunctionGlobalAddress(Callee)) {
+    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
+    // A call to a TLS address is actually an indirect call to a
+    // thread-specific pointer.
+    unsigned OpFlags = 0;
+    if (UsePlt)
+      OpFlags = PPCII::MO_PLT;
+
+    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+    // every direct call is) turn it into a TargetGlobalAddress /
+    // TargetExternalSymbol node so that legalize doesn't hack it.
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                        Callee.getValueType(), 0, OpFlags);
+    needIndirectCall = false;
+  }
+
+  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    unsigned char OpFlags = 0;
+
+    if (UsePlt)
+      OpFlags = PPCII::MO_PLT;
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
+                                         OpFlags);
+    needIndirectCall = false;
+  }
+
+  if (isPatchPoint) {
+    // We'll form an invalid direct call when lowering a patchpoint; the full
+    // sequence for an indirect call is complicated, and many of the
+    // instructions introduced might have side effects (and, thus, can't be
+    // removed later). The call itself will be removed as soon as the
+    // argument/return lowering is complete, so the fact that it has the wrong
+    // kind of operands should not really matter.
+    needIndirectCall = false;
+  }
+
+  if (needIndirectCall) {
+    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
+    // to do the call, we can't use PPCISD::CALL.
+    SDValue MTCTROps[] = {Chain, Callee, InFlag};
+
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
+      // Function pointers in the 64-bit SVR4 ABI do not point to the function
+      // entry point, but to the function descriptor (the function entry point
+      // address is part of the function descriptor though).
+      // The function descriptor is a three doubleword structure with the
+      // following fields: function entry point, TOC base address and
+      // environment pointer.
+      // Thus for a call through a function pointer, the following actions need
+      // to be performed:
+      //   1. Save the TOC of the caller in the TOC save area of its stack
+      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
+      //   2. Load the address of the function entry point from the function
+      //      descriptor.
+      //   3. Load the TOC of the callee from the function descriptor into r2.
+      //   4. Load the environment pointer from the function descriptor into
+      //      r11.
+      //   5. Branch to the function entry point address.
+      //   6. On return of the callee, the TOC of the caller needs to be
+      //      restored (this is done in FinishCall()).
+      //
+      // The loads are scheduled at the beginning of the call sequence, and the
+      // register copies are flagged together to ensure that no other
+      // operations can be scheduled in between. E.g. without flagging the
+      // copies together, a TOC access in the caller could be scheduled between
+      // the assignment of the callee TOC and the branch to the callee, which
+      // results in the TOC access going through the TOC of the callee instead
+      // of going through the TOC of the caller, which leads to incorrect code.
+
+      // Load the address of the function entry point from the function
+      // descriptor.
+      SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1);
+      if (LDChain.getValueType() == MVT::Glue)
+        LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2);
+
+      auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
+                          ? (MachineMemOperand::MODereferenceable |
+                             MachineMemOperand::MOInvariant)
+                          : MachineMemOperand::MONone;
+
+      MachinePointerInfo MPI(CS ? CS->getCalledValue() : nullptr);
+      SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI,
+                                        /* Alignment = */ 8, MMOFlags);
+
+      // Load environment pointer into r11.
+      SDValue PtrOff = DAG.getIntPtrConstant(16, dl);
+      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
+      SDValue LoadEnvPtr =
+          DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16),
+                      /* Alignment = */ 8, MMOFlags);
+
+      SDValue TOCOff = DAG.getIntPtrConstant(8, dl);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
+      SDValue TOCPtr =
+          DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8),
+                      /* Alignment = */ 8, MMOFlags);
+
+      setUsesTOCBasePtr(DAG);
+      SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr,
+                                        InFlag);
+      Chain = TOCVal.getValue(0);
+      InFlag = TOCVal.getValue(1);
+
+      // If the function call has an explicit 'nest' parameter, it takes the
+      // place of the environment pointer.
+      if (!hasNest) {
+        SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+                                          InFlag);
+
+        Chain = EnvVal.getValue(0);
+        InFlag = EnvVal.getValue(1);
+      }
+
+      MTCTROps[0] = Chain;
+      MTCTROps[1] = LoadFuncPtr;
+      MTCTROps[2] = InFlag;
+    }
+
+    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
+                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
+    InFlag = Chain.getValue(1);
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Other);
+    NodeTys.push_back(MVT::Glue);
+    Ops.push_back(Chain);
+    CallOpc = PPCISD::BCTRL;
+    Callee.setNode(nullptr);
+    // Add use of X11 (holding environment pointer)
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI && !hasNest)
+      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
+    // Add CTR register as callee so a bctr can be emitted later.
+    if (isTailCall)
+      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
+  }
+
+  // If this is a direct call, pass the chain and the callee.
+  if (Callee.getNode()) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+  // If this is a tail call add stack pointer delta.
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // All calls, in both the ELF V1 and V2 ABIs, need the TOC register live
+  // into the call.
+  if (isSVR4ABI && isPPC64 && !isPatchPoint) {
+    setUsesTOCBasePtr(DAG);
+    Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+  }
+
+  return CallOpc;
+}
+
+static
+bool isLocalCall(const SDValue &Callee)
+{
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    return G->getGlobal()->isStrongDefinitionForLinker();
+  return false;
+}
+
+SDValue PPCTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+SDValue PPCTargetLowering::FinishCall(
+    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
+    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
+    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
+    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
+    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
+
+  std::vector<EVT> NodeTys;
+  SmallVector<SDValue, 8> Ops;
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
+                                 SPDiff, isTailCall, isPatchPoint, hasNest,
+                                 RegsToPass, Ops, NodeTys, CS, Subtarget);
+
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
+  // When performing tail call optimization the callee pops its arguments off
+  // the stack. Account for this here so these bytes can be pushed back on in
+  // PPCFrameLowering::eliminateCallFramePseudoInstr.
+  int BytesCalleePops =
+    (CallConv == CallingConv::Fast &&
+     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  // Emit tail call.
+  if (isTailCall) {
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+    "Expecting an global address, external symbol, absolute value or register");
+
+    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
+  }
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. If caller and callee belong to the same module (and have the
+  // same TOC), the NOP will remain unchanged.
+
+  if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
+      !isPatchPoint) {
+    if (CallOpc == PPCISD::BCTRL) {
+      // This is a call through a function pointer.
+      // Restore the caller TOC from the save area into R2.
+      // See PrepareCall() for more information about calls through function
+      // pointers in the 64-bit SVR4 ABI.
+      // We are using a target-specific load with r2 hard coded, because the
+      // result of a target-independent load would never go directly into r2,
+      // since r2 is a reserved register (which prevents the register allocator
+      // from allocating it), resulting in an additional register being
+      // allocated and an unnecessary move instruction being generated.
+      CallOpc = PPCISD::BCTRL_LOAD_TOC;
+
+      EVT PtrVT = getPointerTy(DAG.getDataLayout());
+      SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
+      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
+
+      // The address needs to go after the chain input but before the flag (or
+      // any other variadic arguments).
+      Ops.insert(std::next(Ops.begin()), AddTOC);
+    } else if ((CallOpc == PPCISD::CALL) &&
+               (!isLocalCall(Callee) ||
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_))
+      // Otherwise insert NOP for non-local calls.
+      CallOpc = PPCISD::CALL_NOP;
+  }
+
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                             DAG.getIntPtrConstant(BytesCalleePops, dl, true),
+                             InFlag, dl);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+  bool isPatchPoint                     = CLI.IsPatchPoint;
+  ImmutableCallSite *CS                 = CLI.CS;
+
+  if (isTailCall) {
+    if (Subtarget.useLongCalls() && !(CS && CS->isMustTailCall()))
+      isTailCall = false;
+    else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+      isTailCall =
+        IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
+                                                 isVarArg, Outs, Ins, DAG);
+    else
+      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                     Ins, DAG);
+    if (isTailCall) {
+      ++NumTailCalls;
+      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+        ++NumSiblingCalls;
+
+      assert(isa<GlobalAddressSDNode>(Callee) &&
+             "Callee should be an llvm::Function object.");
+      DEBUG(
+        const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+        const unsigned Width = 80 - strlen("TCO caller: ")
+                                  - strlen(", callee linkage: 0, 0");
+        dbgs() << "TCO caller: "
+               << left_justify(DAG.getMachineFunction().getName(), Width)
+               << ", callee linkage: "
+               << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
+      );
+    }
+  }
+
+  if (!isTailCall && CS && CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
+  // When long calls (i.e. indirect calls) are always used, calls are always
+  // made via function pointer. If we have a function name, first translate it
+  // into a pointer.
+  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
+      !isTailCall)
+    Callee = LowerGlobalAddress(Callee, DAG);
+
+  if (Subtarget.isSVR4ABI()) {
+    if (Subtarget.isPPC64())
+      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, isPatchPoint, Outs, OutVals, Ins,
+                              dl, DAG, InVals, CS);
+    else
+      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, isPatchPoint, Outs, OutVals, Ins,
+                              dl, DAG, InVals, CS);
+  }
+
+  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
+                          isTailCall, isPatchPoint, Outs, OutVals, Ins,
+                          dl, DAG, InVals, CS);
+}
+
+SDValue PPCTargetLowering::LowerCall_32SVR4(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+    bool isTailCall, bool isPatchPoint,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    ImmutableCallSite *CS) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
+  // of the 32-bit SVR4 ABI stack frame layout.
+
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+  unsigned PtrByteSize = 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, parameter list area and the part of the local variable space which
+  // contains copies of aggregates which are passed by value.
+
+  // Assign locations to all of the outgoing arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
+                       PtrByteSize);
+  if (useSoftFloat())
+    CCInfo.PreAnalyzeCallOperands(Outs);
+
+  if (isVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Fixed vector arguments go into registers as long as registers are
+    // available. Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      bool Result;
+
+      if (Outs[i].IsFixed) {
+        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+                               CCInfo);
+      } else {
+        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+                                      ArgFlags, CCInfo);
+      }
+
+      if (Result) {
+#ifndef NDEBUG
+        errs() << "Call operand #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << "\n";
+#endif
+        llvm_unreachable(nullptr);
+      }
+    }
+  } else {
+    // All arguments are treated the same.
+    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
+  }
+  CCInfo.clearWasPPCF128();
+
+  // Assign locations to all of the outgoing aggregate by value arguments.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
+
+  // Size of the linkage area, parameter list area and the part of the local
+  // space variable where copies of aggregates which are passed by value are
+  // stored.
+  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be moved somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  bool seenFloatArg = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, j = 0, e = ArgLocs.size();
+       i != e;
+       ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the callers stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+                           StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1),
+                           SDLoc(MemcpyCall));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      if (Arg.getValueType() == MVT::i1)
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);
+
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
+                             StackPtr, PtrOff);
+
+        MemOpChains.push_back(
+            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+                    TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+                    /* unused except on PPC64 ELFv1 */ false, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
+    SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+    SelectionDAG &DAG, const SDLoc &dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1),
+                             SDLoc(MemcpyCall));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue PPCTargetLowering::LowerCall_64SVR4(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+    bool isTailCall, bool isPatchPoint,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    ImmutableCallSite *CS) const {
+
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  unsigned NumOps = Outs.size();
+  bool hasNest = false;
+  bool IsSibCall = false;
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+    IsSibCall = true;
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+         "fastcc not supported on varargs functions");
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  On ELFv1, the linkage area is 48 bytes
+  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned NumBytes = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  unsigned &QFPR_idx = FPR_idx;
+
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+  const unsigned NumQFPRs = NumFPRs;
+
+  // When using the fast calling convention, we don't provide backing for
+  // arguments that will be in registers.
+  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
+
+  // Add up all the space actually used.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    if (Flags.isNest())
+      continue;
+
+    if (CallConv == CallingConv::Fast) {
+      if (Flags.isByVal())
+        NumGPRsUsed += (Flags.getByValSize()+7)/8;
+      else
+        switch (ArgVT.getSimpleVT().SimpleTy) {
+        default: llvm_unreachable("Unexpected ValueType for argument!");
+        case MVT::i1:
+        case MVT::i32:
+        case MVT::i64:
+          if (++NumGPRsUsed <= NumGPRs)
+            continue;
+          break;
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v16i8:
+        case MVT::v2f64:
+        case MVT::v2i64:
+        case MVT::v1i128:
+          if (++NumVRsUsed <= NumVRs)
+            continue;
+          break;
+        case MVT::v4f32:
+          // When using QPX, this is handled like a FP register, otherwise, it
+          // is an Altivec register.
+          if (Subtarget.hasQPX()) {
+            if (++NumFPRsUsed <= NumFPRs)
+              continue;
+          } else {
+            if (++NumVRsUsed <= NumVRs)
+              continue;
+          }
+          break;
+        case MVT::f32:
+        case MVT::f64:
+        case MVT::v4f64: // QPX
+        case MVT::v4i1:  // QPX
+          if (++NumFPRsUsed <= NumFPRs)
+            continue;
+          break;
+        }
+    }
+
+    /* Respect alignment of argument on the stack.  */
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+    NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+    if (Flags.isInConsecutiveRegsLast())
+      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  }
+
+  unsigned NumBytesActuallyUsed = NumBytes;
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+  int SPDiff = 0;
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  if (!IsSibCall)
+    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    // We re-align the argument offset for each argument, except when using the
+    // fast calling convention, when we need to make sure we do that only when
+    // we'll actually use a stack slot.
+    auto ComputePtrOff = [&]() {
+      /* Respect alignment of argument on the stack.  */
+      unsigned Align =
+        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
+
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    };
+
+    if (CallConv != CallingConv::Fast) {
+      ComputePtrOff();
+
+      /* Compute GPR index associated with argument offset.  */
+      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+      GPR_idx = std::min(GPR_idx, NumGPRs);
+    }
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
+
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
+
+      if (CallConv == CallingConv::Fast)
+        ComputePtrOff();
+
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (Size==1 || Size==2 || Size==4) {
+        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+          continue;
+        }
+      }
+
+      if (GPR_idx == NumGPRs && Size < 8) {
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
+                                          PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+
+      // FIXME: The above statement is likely due to a misunderstanding of the
+      // documents.  All arguments must be copied into the parameter area BY
+      // THE CALLEE in the event that the callee takes the address of any
+      // formal argument.  That has not yet been implemented.  However, it is
+      // reasonable to use the stack area as a staging area for the register
+      // load.
+
+      // Skip this for small aggregates, as we will use the same slot for a
+      // right-justified copy, below.
+      if (Size >= 8)
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+      // When a register is available, pass a small aggregate right-justified.
+      if (Size < 8 && GPR_idx != NumGPRs) {
+        // The easiest way to get this right-justified in a register
+        // is to copy the structure into the rightmost portion of a
+        // local variable slot, then load the whole slot into the
+        // register.
+        // FIXME: The memcpy seems to produce pretty awful code for
+        // small aggregates, particularly for packed ones.
+        // FIXME: It would be preferable to use the slot in the
+        // parameter save area instead of a new local variable.
+        SDValue AddPtr = PtrOff;
+        if (!isLittleEndian) {
+          SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
+          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        }
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+        // Load the slot into the register.
+        SDValue Load =
+            DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
+        MemOpChains.push_back(Load.getValue(1));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+        // Done with this argument.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // For aggregates larger than PtrByteSize, copy the pieces of the
+      // object that fit into registers from the parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load =
+              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getSimpleValueType().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      if (Flags.isNest()) {
+        // The 'nest' parameter, if any, is passed in R11.
+        RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
+        hasNest = true;
+        break;
+      }
+
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += PtrByteSize;
+      }
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64: {
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // float aggregates.
+
+      // Named arguments go into FPRs first, and once they overflow, the
+      // remaining arguments go into GPRs and then the parameter save area.
+      // Unnamed arguments for vararg functions always go to GPRs and
+      // then the parameter save area.  For now, put all arguments to vararg
+      // routines always in both locations (FPR *and* GPR or stack slot).
+      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+      bool NeededLoad = false;
+
+      // First load the argument into the next available FPR.
+      if (FPR_idx != NumFPRs)
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+      // Next, load the argument into GPR or stack slot if needed.
+      if (!NeedGPROrStack)
+        ;
+      else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
+        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
+        // once we support fp <-> gpr moves.
+
+        // In the non-vararg case, this can only ever happen in the
+        // presence of f32 array types, since otherwise we never run
+        // out of FPRs before running out of GPRs.
+        SDValue ArgVal;
+
+        // Double values are always passed in a single GPR.
+        if (Arg.getValueType() != MVT::f32) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+        // Non-array float values are extended and passed in a GPR.
+        } else if (!Flags.isInConsecutiveRegs()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+        // If we have an array of floats, we collect every odd element
+        // together with its predecessor into one GPR.
+        } else if (ArgOffset % PtrByteSize != 0) {
+          SDValue Lo, Hi;
+          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          if (!isLittleEndian)
+            std::swap(Lo, Hi);
+          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+        // The final element, if even, goes into the first half of a GPR.
+        } else if (Flags.isInConsecutiveRegsLast()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+          if (!isLittleEndian)
+            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, dl, MVT::i32));
+
+        // Non-final even elements are skipped; they will be handled
+        // together the with subsequent argument on the next go-around.
+        } else
+          ArgVal = SDValue();
+
+        if (ArgVal.getNode())
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32 &&
+            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
+          SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+
+        NeededLoad = true;
+      }
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      if (CallConv != CallingConv::Fast || NeededLoad) {
+        ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                      Flags.isInConsecutiveRegs()) ? 4 : 8;
+        if (Flags.isInConsecutiveRegsLast())
+          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      }
+      break;
+    }
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+    case MVT::v2f64:
+    case MVT::v2i64:
+    case MVT::v1i128:
+      if (!Subtarget.hasQPX()) {
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogenous
+      // vector aggregates.
+
+      // For a varargs call, named arguments go into VRs or on the stack as
+      // usual; unnamed arguments always go to the stack or the corresponding
+      // GPRs when within range.  For now, we always put the value in both
+      // locations (or even all three).
+      if (isVarArg) {
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        SDValue Store =
+            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load =
+              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                   DAG.getConstant(i, dl, PtrVT));
+          SDValue Load =
+              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params go into VRs or on the stack.
+      if (VR_idx != NumVRs) {
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += 16;
+      }
+
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += 16;
+      break;
+      } // not QPX
+
+      assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
+             "Invalid QPX parameter type");
+
+      /* fall through */
+    case MVT::v4f64:
+    case MVT::v4i1: {
+      bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
+      if (isVarArg) {
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        SDValue Store =
+            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+        MemOpChains.push_back(Store);
+        if (QFPR_idx != NumQFPRs) {
+          SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store,
+                                     PtrOff, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
+        }
+        ArgOffset += (IsF32 ? 16 : 32);
+        for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                   DAG.getConstant(i, dl, PtrVT));
+          SDValue Load =
+              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs QPX params go into registers or on the stack.
+      if (QFPR_idx != NumQFPRs) {
+        RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
+      } else {
+        if (CallConv == CallingConv::Fast)
+          ComputePtrOff();
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        if (CallConv == CallingConv::Fast)
+          ArgOffset += (IsF32 ? 16 : 32);
+      }
+
+      if (CallConv != CallingConv::Fast)
+        ArgOffset += (IsF32 ? 16 : 32);
+      break;
+      }
+    }
+  }
+
+  assert(NumBytesActuallyUsed == ArgOffset);
+  (void)NumBytesActuallyUsed;
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall && !isPatchPoint &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    setUsesTOCBasePtr(DAG);
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(
+        Val.getValue(1), dl, Val, AddPtr,
+        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
+    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+    // This does not mean the MTCTR instruction must use R12; it's easier
+    // to model this as an extra parameter, so do that.
+    if (isELFv2ABI && !isPatchPoint)
+      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall && !IsSibCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+                    TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
+                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
+                    SPDiff, NumBytes, Ins, InVals, CS);
+}
+
+SDValue PPCTargetLowering::LowerCall_Darwin(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+    bool isTailCall, bool isPatchPoint,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    ImmutableCallSite *CS) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  unsigned NumBytes = LinkageSize;
+
+  // Add up all the space actually used.
+  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+  // they all go in registers, but we must reserve stack space for them for
+  // possible use by the caller.  In varargs or 64-bit calls, parameters are
+  // assigned stack space in order, with padding so Altivec parameters are
+  // 16-byte aligned.
+  unsigned nAltivecParamsAtEnd = 0;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    // Varargs Altivec parameters are padded to a 16 byte boundary.
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+      if (!isVarArg && !isPPC64) {
+        // Non-varargs Altivec parameters go after all the non-Altivec
+        // parameters; handle those later so we know how much padding we need.
+        nAltivecParamsAtEnd++;
+        continue;
+      }
+      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+      NumBytes = ((NumBytes+15)/16)*16;
+    }
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  }
+
+  // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    NumBytes = ((NumBytes+15)/16)*16;
+    NumBytes += 16*nAltivecParamsAtEnd;
+  }
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+  // Tail call needs the stack to be aligned.
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
+                               dl);
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
+      if (Size==1 || Size==2) {
+        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+        } else {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
+                                          PtrOff.getValueType());
+          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                            CallSeqStart,
+                                                            Flags, DAG, dl);
+          ArgOffset += PtrByteSize;
+        }
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                        CallSeqStart,
+                                                        Flags, DAG, dl);
+
+      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+      // copy the pieces of the object that fit into registers from the
+      // parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load =
+              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getSimpleValueType().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i1:
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        if (Arg.getValueType() == MVT::i1)
+          Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);
+
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          SDValue Store =
+              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load =
+                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+            SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
+            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+            SDValue Load =
+                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else {
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+          // GPRs.
+          if (GPR_idx != NumGPRs)
+            ++GPR_idx;
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+              !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
+            ++GPR_idx;
+        }
+      } else
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      if (isPPC64)
+        ArgOffset += 8;
+      else
+        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                             DAG.getConstant(ArgOffset, dl, PtrVT));
+        SDValue Store =
+            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load =
+              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                   DAG.getConstant(i, dl, PtrVT));
+          SDValue Load =
+              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else if (nAltivecParamsAtEnd==0) {
+        // We are emitting Altivec params in order.
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
+    }
+  }
+  // If all Altivec parameters fit in registers, as they usually do,
+  // they get stack space following the non-Altivec parameters.  We
+  // don't track this here because nobody below needs it.
+  // If there are more Altivec parameters than fit in registers emit
+  // the stores here.
+  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+    unsigned j = 0;
+    // Offset is aligned; skip 1st 12 params which go in V registers.
+    ArgOffset = ((ArgOffset+15)/16)*16;
+    ArgOffset += 12*16;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      SDValue Arg = OutVals[i];
+      EVT ArgType = Outs[i].VT;
+      if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+          ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+        if (++j > NumVRs) {
+          SDValue PtrOff;
+          // We are emitting Altivec params in order.
+          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                           isPPC64, isTailCall, true, MemOpChains,
+                           TailCallArguments, dl);
+          ArgOffset += 16;
+        }
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // On Darwin, R12 must contain the address of an indirect callee.  This does
+  // not mean the MTCTR instruction must use R12; it's easier to model this as
+  // an extra parameter, so do that.
+  if (!isTailCall &&
+      !isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG))
+    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
+                                                   PPC::R12), Callee));
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+                    TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
+                    /* unused except on PPC64 ELFv1 */ false, DAG,
+                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
+                    NumBytes, Ins, InVals, CS);
+}
+
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &dl, SelectionDAG &DAG) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *I =
+    TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+
+      if (PPC::G8RCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else if (PPC::F8RCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+      else if (PPC::CRRCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i1));
+      else if (PPC::VRRCRegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::Other));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+SDValue
+PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  // Get the corect type for integers.
+  EVT IntVT = Op.getValueType();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNAREAOFFSET node.
+  SDValue Ops[2] = {Chain, FPSIdx};
+  SDVTList VTs = DAG.getVTList(IntVT);
+  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  SDLoc dl(Op);
+
+  // Get the corect type for pointers.
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Construct the stack pointer operand.
+  bool isPPC64 = Subtarget.isPPC64();
+  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+  SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDValue Chain = Op.getOperand(0);
+  SDValue SaveSP = Op.getOperand(1);
+
+  // Load the old link SP.
+  SDValue LoadLinkSP =
+      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());
+
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+  // Store the old link SP.
+  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int RASI = FI->getReturnAddrSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!RASI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
+    // Allocate the frame index for frame pointer save area.
+    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
+    // Save the result.
+    FI->setReturnAddrSaveIndex(RASI);
+  }
+  return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = Subtarget.isPPC64();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+  return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  // Get the corect type for pointers.
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  // Negate the size.
+  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+                                DAG.getConstant(0, dl, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNALLOC node.
+  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
+}
+
+SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  bool isPPC64 = Subtarget.isPPC64();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
+  return DAG.getFrameIndex(FI, PtrVT);
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorLoad(Op, DAG);
+
+  assert(Op.getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 loads");
+
+  // First, load 8 bits into 32 bits, then truncate to 1 bit.
+
+  SDLoc dl(Op);
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  MachineMemOperand *MMO = LD->getMemOperand();
+
+  SDValue NewLD =
+      DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
+                     BasePtr, MVT::i8, MMO);
+  SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
+
+  SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(1).getValueType().isVector())
+    return LowerVectorStore(Op, DAG);
+
+  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 stores");
+
+  // First, zero extend to 32 bits, then use a truncating store to 8 bits.
+
+  SDLoc dl(Op);
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  MachineMemOperand *MMO = ST->getMemOperand();
+
+  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
+                      Value);
+  return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
+}
+
+// FIXME: Remove this once the ANDI glue bug is fixed:
+SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 &&
+         "Custom lowering only for i1 results");
+
+  SDLoc DL(Op);
+  return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1,
+                     Op.getOperand(0));
+}
+
+/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
+/// possible.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  // Not FP? Not a fsel.
+  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+      !Op.getOperand(2).getValueType().isFloatingPoint())
+    return Op;
+
+  // We might be able to do better than this under some circumstances, but in
+  // general, fsel-based lowering of select is a finite-math-only optimization.
+  // For more information, see section F.3 of the 2.06 ISA specification.
+  if (!DAG.getTarget().Options.NoInfsFPMath ||
+      !DAG.getTarget().Options.NoNaNsFPMath)
+    return Op;
+  // TODO: Propagate flags from the select rather than global settings.
+  SDNodeFlags Flags;
+  Flags.setNoInfs(true);
+  Flags.setNoNaNs(true);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  EVT ResVT = Op.getValueType();
+  EVT CmpVT = Op.getOperand(0).getValueType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+  SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  SDLoc dl(Op);
+
+  // If the RHS of the comparison is a 0.0, we don't need to do the
+  // subtraction at all.
+  SDValue Sel1;
+  if (isFloatingPointZero(RHS))
+    switch (CC) {
+    default: break;       // SETUO etc aren't handled by fsel.
+    case ISD::SETNE:
+      std::swap(TV, FV);
+    case ISD::SETEQ:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+      if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
+    case ISD::SETULT:
+    case ISD::SETLT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+    case ISD::SETUGT:
+    case ISD::SETGT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+    }
+
+  SDValue Cmp;
+  switch (CC) {
+  default: break;       // SETUO etc aren't handled by fsel.
+  case ISD::SETNE:
+    std::swap(TV, FV);
+  case ISD::SETEQ:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+    if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                       DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
+  case ISD::SETULT:
+  case ISD::SETLT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  }
+  return Op;
+}
+
+void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                               SelectionDAG &DAG,
+                                               const SDLoc &dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+  SDValue Tmp;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    Tmp = DAG.getNode(
+        Op.getOpcode() == ISD::FP_TO_SINT
+            ? PPCISD::FCTIWZ
+            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
+        dl, MVT::f64, Src);
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
+    break;
+  }
+
+  // Convert the FP value to an int value through memory.
+  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
+    (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
+  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+  // Emit a store to the stack slot.
+  SDValue Chain;
+  if (i32Stack) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
+  } else
+    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);
+
+  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
+  // add in a bias on big endian.
+  if (Op.getValueType() == MVT::i32 && !i32Stack) {
+    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
+                        DAG.getConstant(4, dl, FIPtr.getValueType()));
+    MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
+  }
+
+  RLI.Chain = Chain;
+  RLI.Ptr = FIPtr;
+  RLI.MPI = MPI;
+}
+
+/// \brief Custom lowers floating point to integer conversions to use
+/// the direct move instructions available in ISA 2.07 to avoid the
+/// need for load/store combinations.
+SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
+                                                    SelectionDAG &DAG,
+                                                    const SDLoc &dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Src = Op.getOperand(0);
+
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+  SDValue Tmp;
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    Tmp = DAG.getNode(
+        Op.getOpcode() == ISD::FP_TO_SINT
+            ? PPCISD::FCTIWZ
+            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
+        dl, MVT::f64, Src);
+    Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
+    Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
+    break;
+  }
+  return Tmp;
+}
+
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                          const SDLoc &dl) const {
+  if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
+    return LowerFP_TO_INTDirectMove(Op, DAG, dl);
+
+  ReuseLoadInfo RLI;
+  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+
+  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
+                     RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
+}
+
+// We're trying to insert a regular store, S, and then a load, L. If the
+// incoming value, O, is a load, we might just be able to have our load use the
+// address used by O. However, we don't know if anything else will store to
+// that address before we can load from it. To prevent this situation, we need
+// to insert our load, L, into the chain as a peer of O. To do this, we give L
+// the same chain operand as O, we create a token factor from the chain results
+// of O and L, and we replace all uses of O's chain result with that token
+// factor (see spliceIntoChain below for this last part).
+bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
+                                            ReuseLoadInfo &RLI,
+                                            SelectionDAG &DAG,
+                                            ISD::LoadExtType ET) const {
+  SDLoc dl(Op);
+  if (ET == ISD::NON_EXTLOAD &&
+      (Op.getOpcode() == ISD::FP_TO_UINT ||
+       Op.getOpcode() == ISD::FP_TO_SINT) &&
+      isOperationLegalOrCustom(Op.getOpcode(),
+                               Op.getOperand(0).getValueType())) {
+
+    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
+    return true;
+  }
+
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
+  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
+      LD->isNonTemporal())
+    return false;
+  if (LD->getMemoryVT() != MemVT)
+    return false;
+
+  RLI.Ptr = LD->getBasePtr();
+  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
+    assert(LD->getAddressingMode() == ISD::PRE_INC &&
+           "Non-pre-inc AM on PPC?");
+    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
+                          LD->getOffset());
+  }
+
+  RLI.Chain = LD->getChain();
+  RLI.MPI = LD->getPointerInfo();
+  RLI.IsDereferenceable = LD->isDereferenceable();
+  RLI.IsInvariant = LD->isInvariant();
+  RLI.Alignment = LD->getAlignment();
+  RLI.AAInfo = LD->getAAInfo();
+  RLI.Ranges = LD->getRanges();
+
+  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
+  return true;
+}
+
+// Given the head of the old chain, ResChain, insert a token factor containing
+// it and NewResChain, and make users of ResChain now be users of that token
+// factor.
+void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
+                                        SDValue NewResChain,
+                                        SelectionDAG &DAG) const {
+  if (!ResChain)
+    return;
+
+  SDLoc dl(NewResChain);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                           NewResChain, DAG.getUNDEF(MVT::Other));
+  assert(TF.getNode() != NewResChain.getNode() &&
+         "A new TF really is required here");
+
+  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
+  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
+}
+
+/// \brief Analyze profitability of direct move
+/// prefer float load to int load plus direct move
+/// when there is no integer use of int load
+bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
+  SDNode *Origin = Op.getOperand(0).getNode();
+  if (Origin->getOpcode() != ISD::LOAD)
+    return true;
+
+  // If there is no LXSIBZX/LXSIHZX, like Power8,
+  // prefer direct move if the memory size is 1 or 2 bytes.
+  MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
+  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
+    return true;
+
+  for (SDNode::use_iterator UI = Origin->use_begin(),
+                            UE = Origin->use_end();
+       UI != UE; ++UI) {
+
+    // Only look at the users of the loaded value.
+    if (UI.getUse().get().getResNo() != 0)
+      continue;
+
+    if (UI->getOpcode() != ISD::SINT_TO_FP &&
+        UI->getOpcode() != ISD::UINT_TO_FP)
+      return true;
+  }
+
+  return false;
+}
+
+/// \brief Custom lowers integer to floating point conversions to use
+/// the direct move instructions available in ISA 2.07 to avoid the
+/// need for load/store combinations.
+SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
+                                                    SelectionDAG &DAG,
+                                                    const SDLoc &dl) const {
+  assert((Op.getValueType() == MVT::f32 ||
+          Op.getValueType() == MVT::f64) &&
+         "Invalid floating point type as target of conversion");
+  assert(Subtarget.hasFPCVT() &&
+         "Int to FP conversions with direct moves require FPCVT");
+  SDValue FP;
+  SDValue Src = Op.getOperand(0);
+  bool SinglePrec = Op.getValueType() == MVT::f32;
+  bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
+  bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
+  unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
+                             (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);
+
+  if (WordInt) {
+    FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ,
+                     dl, MVT::f64, Src);
+    FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
+  }
+  else {
+    FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
+    FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
+  }
+
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
+    if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
+      return SDValue();
+
+    SDValue Value = Op.getOperand(0);
+    // The values are now known to be -1 (false) or 1 (true). To convert this
+    // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+    // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+    Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+    SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+    Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+    if (Op.getValueType() != MVT::v4f64)
+      Value = DAG.getNode(ISD::FP_ROUND, dl,
+                          Op.getValueType(), Value,
+                          DAG.getIntPtrConstant(1, dl));
+    return Value;
+  }
+
+  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
+                       DAG.getConstantFP(1.0, dl, Op.getValueType()),
+                       DAG.getConstantFP(0.0, dl, Op.getValueType()));
+
+  // If we have direct moves, we can do all the conversion, skip the store/load
+  // however, without FPCVT we can't do most conversions.
+  if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
+      Subtarget.isPPC64() && Subtarget.hasFPCVT())
+    return LowerINT_TO_FPDirectMove(Op, DAG, dl);
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ? MVT::f32
+                  : MVT::f64;
+
+  if (Op.getOperand(0).getValueType() == MVT::i64) {
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !Subtarget.hasFPCVT() &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53 bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
+      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                  SINT, DAG.getConstant(2047, dl, MVT::i64));
+      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                          Round, DAG.getConstant(2047, dl, MVT::i64));
+      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                          Round, DAG.getConstant(-2048, dl, MVT::i64));
+
+      // However, we cannot use that value unconditionally: if the magnitude
+      // of the input value is small, the bit-twiddling we did above might
+      // end up visibly changing the output.  Fortunately, in that case, we
+      // don't need to twiddle bits since the original input will convert
+      // exactly to double-precision floating-point already.  Therefore,
+      // construct a conditional to use the original value if the top 11
+      // bits are all sign-bit copies, and use the rounded value computed
+      // above otherwise.
+      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+                                 SINT, DAG.getConstant(53, dl, MVT::i32));
+      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                         Cond, DAG.getConstant(1, dl, MVT::i64));
+      Cond = DAG.getSetCC(dl, MVT::i32,
+                          Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
+
+      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+    }
+
+    ReuseLoadInfo RLI;
+    SDValue Bits;
+
+    MachineFunction &MF = DAG.getMachineFunction();
+    if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
+      Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
+                         RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (Subtarget.hasLFIWAX() &&
+               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+                                     DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (Subtarget.hasFPCVT() &&
+               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
+                                     DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+    } else if (((Subtarget.hasLFIWAX() &&
+                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
+                (Subtarget.hasFPCVT() &&
+                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
+               SINT.getOperand(0).getValueType() == MVT::i32) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+      int FrameIdx = MFI.CreateStackObject(4, 4, false);
+      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+      SDValue Store =
+          DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(), FrameIdx));
+
+      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+             "Expected an i32 store");
+
+      RLI.Ptr = FIdx;
+      RLI.Chain = Store;
+      RLI.MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+      RLI.Alignment = 4;
+
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+      Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
+                                     PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                     dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                     Ops, MVT::i32, MMO);
+    } else
+      Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
+
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
+    return FP;
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+         "Unhandled INT_TO_FP type in custom expander!");
+  // Since we only generate this in 64-bit mode, we can take advantage of
+  // 64-bit registers.  In particular, sign extend the input value into the
+  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+  // then lfd it and fcfid it.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  SDValue Ld;
+  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
+    ReuseLoadInfo RLI;
+    bool ReusingLoad;
+    if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
+                                            DAG))) {
+      int FrameIdx = MFI.CreateStackObject(4, 4, false);
+      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+      SDValue Store =
+          DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(), FrameIdx));
+
+      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+             "Expected an i32 store");
+
+      RLI.Ptr = FIdx;
+      RLI.Chain = Store;
+      RLI.MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+      RLI.Alignment = 4;
+    }
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+                              RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
+    Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
+                                   PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                 dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                 Ops, MVT::i32, MMO);
+    if (ReusingLoad)
+      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
+  } else {
+    assert(Subtarget.isPPC64() &&
+           "i32->FP without LFIWAX supported only on PPC64");
+
+    int FrameIdx = MFI.CreateStackObject(8, 8, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
+                                Op.getOperand(0));
+
+    // STD the extended value into the stack slot.
+    SDValue Store = DAG.getStore(
+        DAG.getEntryNode(), dl, Ext64, FIdx,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+
+    // Load the value as a double.
+    Ld = DAG.getLoad(
+        MVT::f64, dl, Store, FIdx,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
+  }
+
+  // FCFID it and return it.
+  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
+  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
+    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
+                     DAG.getIntPtrConstant(0, dl));
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  /*
+   The rounding mode is in bits 30:31 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Save FP Control Word to register
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
+                               MachinePointerInfo());
+
+  // Load FP Control Word from low 32 bits of stack slot.
+  SDValue Four = DAG.getConstant(4, dl, PtrVT);
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, dl, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
+                            DAG.getConstant(3, dl, MVT::i32)),
+                DAG.getConstant(1, dl, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  SDLoc dl(Op);
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
+  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
+  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRA!");
+
+  // Expand into a bunch of logical ops, followed by a select_cc.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, dl, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
+  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
+                                  Tmp4, Tmp6, ISD::SETLE);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize.  Cast the result to VT.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+                           SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  static const MVT VTys[] = { // canonical VT to use for each size.
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  if (Val == -1)
+    SplatSize = 1;
+
+  EVT CanonicalVT = VTys[SplatSize-1];
+
+  // Build a canonical splat for this value.
+  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
+}
+
+/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
+                                const SDLoc &dl, EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, dl, MVT::i32), Op);
+}
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+                                SelectionDAG &DAG, const SDLoc &dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+                                SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
+}
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount.  The result has the specified value type.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
+                           SelectionDAG &DAG, const SDLoc &dl) {
+  // Force LHS/RHS to be the right type.
+  LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
+
+  int Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = i + Amt;
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+/// Do we have an efficient pattern in a .td file for this node?
+///
+/// \param V - pointer to the BuildVectorSDNode being matched
+/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
+///
+/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
+/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
+/// the opposite is true (expansion is beneficial) are:
+/// - The node builds a vector out of integers that are not 32 or 64-bits
+/// - The node builds a vector out of constants
+/// - The node is a "load-and-splat"
+/// In all other cases, we will choose to keep the BUILD_VECTOR.
+static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
+                                            bool HasDirectMove) {
+  EVT VecVT = V->getValueType(0);
+  bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 ||
+    (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
+  if (!RightType)
+    return false;
+
+  bool IsSplat = true;
+  bool IsLoad = false;
+  SDValue Op0 = V->getOperand(0);
+
+  // This function is called in a block that confirms the node is not a constant
+  // splat. So a constant BUILD_VECTOR here means the vector is built out of
+  // different constants.
+  if (V->isConstant())
+    return false;
+  for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
+    if (V->getOperand(i).isUndef())
+      return false;
+    // We want to expand nodes that represent load-and-splat even if the
+    // loaded value is a floating point truncation or conversion to int.
+    if (V->getOperand(i).getOpcode() == ISD::LOAD ||
+        (V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
+         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
+        (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
+         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
+        (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
+         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
+      IsLoad = true;
+    // If the operands are different or the input is not a load and has more
+    // uses than just this BV node, then it isn't a splat.
+    if (V->getOperand(i) != Op0 ||
+        (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
+      IsSplat = false;
+  }
+  return !(IsSplat && IsLoad);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
+    // We first build an i32 vector, load it into a QPX register,
+    // then convert it to a floating-point vector and compare it
+    // to a zero vector to get the boolean result.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    int FrameIdx = MFI.CreateStackObject(16, 16, false);
+    MachinePointerInfo PtrInfo =
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    assert(BVN->getNumOperands() == 4 &&
+      "BUILD_VECTOR for v4i1 does not have 4 operands");
+
+    bool IsConst = true;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).isUndef()) continue;
+      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
+        IsConst = false;
+        break;
+      }
+    }
+
+    if (IsConst) {
+      Constant *One =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
+      Constant *NegOne =
+        ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);
+
+      Constant *CV[4];
+      for (unsigned i = 0; i < 4; ++i) {
+        if (BVN->getOperand(i).isUndef())
+          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
+        else if (isNullConstant(BVN->getOperand(i)))
+          CV[i] = NegOne;
+        else
+          CV[i] = One;
+      }
+
+      Constant *CP = ConstantVector::get(CV);
+      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
+                                          16 /* alignment */);
+
+      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
+      return DAG.getMemIntrinsicNode(
+          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    }
+
+    SmallVector<SDValue, 4> Stores;
+    for (unsigned i = 0; i < 4; ++i) {
+      if (BVN->getOperand(i).isUndef()) continue;
+
+      unsigned Offset = 4*i;
+      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
+      if (StoreSize > 4) {
+        Stores.push_back(
+            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
+                              PtrInfo.getWithOffset(Offset), MVT::i32));
+      } else {
+        SDValue StoreValue = BVN->getOperand(i);
+        if (StoreSize < 4)
+          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);
+
+        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
+                                      PtrInfo.getWithOffset(Offset)));
+      }
+    }
+
+    SDValue StoreChain;
+    if (!Stores.empty())
+      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+    else
+      StoreChain = DAG.getEntryNode();
+
+    // Now load from v4i32 into the QPX register; this will extend it to
+    // v4i64 but not yet convert it to a floating point. Nevertheless, this
+    // is typed as v4f64 because the QPX register integer states are not
+    // explicitly represented.
+
+    SDValue Ops[] = {StoreChain,
+                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
+                     FIdx};
+    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});
+
+    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
+      dl, VTs, Ops, MVT::v4i32, PtrInfo);
+    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+      DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
+      LoadedVect);
+
+    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);
+
+    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
+  }
+
+  // All other QPX vectors are handled by generic code.
+  if (Subtarget.hasQPX())
+    return SDValue();
+
+  // Check if this is a splat of a constant value.
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+      SplatBitSize > 32) {
+    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
+    // lowered to VSX instructions under certain conditions.
+    // Without VSX, there is no pattern more efficient than expanding the node.
+    if (Subtarget.hasVSX() &&
+        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove()))
+      return Op;
+    return SDValue();
+  }
+
+  unsigned SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;
+
+  // First, handle single instruction cases.
+
+  // All zeros?
+  if (SplatBits == 0) {
+    // Canonicalize all zero vectors to be v4i32.
+    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
+      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
+    }
+    return Op;
+  }
+
+  // We have XXSPLTIB for constant splats one byte wide
+  if (Subtarget.hasP9Vector() && SplatSize == 1) {
+    // This is a splat of 1-byte elements with some elements potentially undef.
+    // Rather than trying to match undef in the SDAG patterns, ensure that all
+    // elements are the same constant.
+    if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) {
+      SmallVector<SDValue, 16> Ops(16, DAG.getConstant(SplatBits,
+                                                       dl, MVT::i32));
+      SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops);
+      if (Op.getValueType() != MVT::v16i8)
+        return DAG.getBitcast(Op.getValueType(), NewBV);
+      return NewBV;
+    }
+    return Op;
+  }
+
+  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
+                    (32-SplatBitSize));
+  if (SextVal >= -16 && SextVal <= 15)
+    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+  // Two instruction sequences.
+
+  // If this value is in the range [-32,30] and is even, use:
+  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
+  // If this value is in the range [17,31] and is odd, use:
+  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
+  // If this value is in the range [-31,-17] and is odd, use:
+  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
+  // Note the last two are three-instruction sequences.
+  if (SextVal >= -32 && SextVal <= 31) {
+    // To avoid having these optimizations undone by constant folding,
+    // we convert to a pseudo that will be expanded later into one of
+    // the above forms.
+    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
+    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
+              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
+    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
+    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+    if (VT == Op.getValueType())
+      return RetVal;
+    else
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
+  }
+
+  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
+  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
+  // for fneg/fabs.
+  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+    // Make -1 and vspltisw -1:
+    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+    // Make the VSLW intrinsic, computing 0x8000_0000.
+    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+                                   OnesV, DAG, dl);
+
+    // xor by OnesV to invert it.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+  }
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000).  'vsplti -1'
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + sra self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+        Intrinsic::ppc_altivec_vsraw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
+      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
+    }
+  }
+
+  return SDValue();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      const SDLoc &dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+  int ShufIdxs[16];
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown i32 permute!");
+  case OP_VMRGHW:
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:
+    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI8:
+    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI12:
+    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
+  }
+  EVT VT = OpLHS.getValueType();
+  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
+  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
+/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
+/// return the code it can be lowered into.  Worst case, it can always be
+/// lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  EVT VT = Op.getValueType();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+
+  unsigned ShiftElts, InsertAtByte;
+  bool Swap;
+  if (Subtarget.hasP9Vector() &&
+      PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
+                           isLittleEndian)) {
+    if (Swap)
+      std::swap(V1, V2);
+    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
+    if (ShiftElts) {
+      SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
+                                DAG.getConstant(ShiftElts, dl, MVT::i32));
+      SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+                                DAG.getConstant(InsertAtByte, dl, MVT::i32));
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+    }
+    SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+  }
+
+  if (Subtarget.hasVSX()) {
+    if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
+      int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+
+      // If the source for the shuffle is a scalar_to_vector that came from a
+      // 32-bit load, it will have used LXVWSX so we don't need to splat again.
+      if (Subtarget.hasP9Vector() &&
+          ((isLittleEndian && SplatIdx == 3) ||
+           (!isLittleEndian && SplatIdx == 0))) {
+        SDValue Src = V1.getOperand(0);
+        if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+            Src.getOperand(0).getOpcode() == ISD::LOAD &&
+            Src.getOperand(0).hasOneUse())
+          return V1;
+      }
+      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+      SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
+                                  DAG.getConstant(SplatIdx, dl, MVT::i32));
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
+    }
+
+    // Left shifts of 8 bytes are actually swaps. Convert accordingly.
+    if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
+      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+      SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
+    }
+
+  }
+
+  if (Subtarget.hasQPX()) {
+    if (VT.getVectorNumElements() != 4)
+      return SDValue();
+
+    if (V2.isUndef()) V2 = V1;
+
+    int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
+    if (AlignIdx != -1) {
+      return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
+                         DAG.getConstant(AlignIdx, dl, MVT::i32));
+    } else if (SVOp->isSplat()) {
+      int SplatIdx = SVOp->getSplatIndex();
+      if (SplatIdx >= 4) {
+        std::swap(V1, V2);
+        SplatIdx -= 4;
+      }
+
+      return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
+                         DAG.getConstant(SplatIdx, dl, MVT::i32));
+    }
+
+    // Lower this into a qvgpci/qvfperm pair.
+
+    // Compute the qvgpci literal
+    unsigned idx = 0;
+    for (unsigned i = 0; i < 4; ++i) {
+      int m = SVOp->getMaskElt(i);
+      unsigned mm = m >= 0 ? (unsigned) m : i;
+      idx |= mm << (3-i)*3;
+    }
+
+    SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
+                             DAG.getConstant(idx, dl, MVT::i32));
+    return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
+  }
+
+  // Cases that are handled by instructions that take permute immediates
+  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+  // selected by the instruction selector.
+  if (V2.isUndef()) {
+    if (PPC::isSplatShuffleMask(SVOp, 1) ||
+        PPC::isSplatShuffleMask(SVOp, 2) ||
+        PPC::isSplatShuffleMask(SVOp, 4) ||
+        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
+        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
+        (Subtarget.hasP8Altivec() && (
+         PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
+         PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
+         PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
+      return Op;
+    }
+  }
+
+  // Altivec has a variety of "shuffle immediates" that take two vector inputs
+  // and produce a fixed permutation.  If any of these match, do not lower to
+  // VPERM.
+  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
+  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
+      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+      (Subtarget.hasP8Altivec() && (
+       PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+       PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
+       PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
+    return Op;
+
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  ArrayRef<int> PermMask = SVOp->getMask();
+
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask[i*4+j] < 0)
+        continue;   // Undef, ignore it.
+
+      unsigned ByteSource = PermMask[i*4+j];
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
+
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  // For now, we skip this for little endian until such time as we have a
+  // little-endian perfect shuffle table.
+  if (isFourElementShuffle && !isLittleEndian) {
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);
+
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+  // vector that will get spilled to the constant pool.
+  if (V2.isUndef()) V2 = V1;
+
+  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
+  // that it is in input element units, not in bytes.  Convert now.
+
+  // For little endian, the order of the input vectors is reversed, and
+  // the permutation mask is complemented with respect to 31.  This is
+  // necessary to produce proper semantics with the big-endian-biased vperm
+  // instruction.
+  EVT EltVT = V1.getValueType().getVectorElementType();
+  unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+  SmallVector<SDValue, 16> ResultMask;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
+
+    for (unsigned j = 0; j != BytesPerElement; ++j)
+      if (isLittleEndian)
+        ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
+                                             dl, MVT::i32));
+      else
+        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
+                                             MVT::i32));
+  }
+
+  SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
+  if (isLittleEndian)
+    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+                       V2, V1, VPermMask);
+  else
+    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
+                       V1, V2, VPermMask);
+}
+
+/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
+/// vector comparison.  If it is, return true and fill in Opc/isDot with
+/// information about the intrinsic.
+static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
+                                 bool &isDot, const PPCSubtarget &Subtarget) {
+  unsigned IntrinsicID =
+    cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+  CompareOpc = -1;
+  isDot = false;
+  switch (IntrinsicID) {
+  default: return false;
+    // Comparison predicates.
+  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequd_p:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 199;
+      isDot = 1;
+    } else
+      return false;
+
+    break;
+  case Intrinsic::ppc_altivec_vcmpneb_p:
+  case Intrinsic::ppc_altivec_vcmpneh_p:
+  case Intrinsic::ppc_altivec_vcmpnew_p:
+  case Intrinsic::ppc_altivec_vcmpnezb_p:
+  case Intrinsic::ppc_altivec_vcmpnezh_p:
+  case Intrinsic::ppc_altivec_vcmpnezw_p:
+    if (Subtarget.hasP9Altivec()) {
+      switch(IntrinsicID) {
+      default: llvm_unreachable("Unknown comparison intrinsic.");
+      case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break;
+      case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break;
+      case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break;
+      case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break;
+      case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break;
+      case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break;
+      }
+      isDot = 1;
+    } else
+      return false;
+
+    break;
+  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsd_p:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 967;
+      isDot = 1;
+    } else
+      return false;
+
+    break;
+  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtud_p:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 711;
+      isDot = 1;
+    } else
+      return false;
+
+    break;
+    // VSX predicate comparisons use the same infrastructure
+  case Intrinsic::ppc_vsx_xvcmpeqdp_p:
+  case Intrinsic::ppc_vsx_xvcmpgedp_p:
+  case Intrinsic::ppc_vsx_xvcmpgtdp_p:
+  case Intrinsic::ppc_vsx_xvcmpeqsp_p:
+  case Intrinsic::ppc_vsx_xvcmpgesp_p:
+  case Intrinsic::ppc_vsx_xvcmpgtsp_p:
+    if (Subtarget.hasVSX()) {
+      switch (IntrinsicID) {
+      case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break;
+      case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break;
+      case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break;
+      case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break;
+      case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break;
+      case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break;
+      }
+      isDot = 1;
+    }
+    else
+      return false;
+
+    break;
+
+    // Normal Comparisons.
+  case Intrinsic::ppc_altivec_vcmpbfp:    CompareOpc = 966; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp:   CompareOpc = 198; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequb:   CompareOpc =   6; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequh:   CompareOpc =  70; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequw:   CompareOpc = 134; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequd:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 199;
+      isDot = 0;
+    } else
+      return false;
+
+    break;
+  case Intrinsic::ppc_altivec_vcmpneb:
+  case Intrinsic::ppc_altivec_vcmpneh:
+  case Intrinsic::ppc_altivec_vcmpnew:
+  case Intrinsic::ppc_altivec_vcmpnezb:
+  case Intrinsic::ppc_altivec_vcmpnezh:
+  case Intrinsic::ppc_altivec_vcmpnezw:
+    if (Subtarget.hasP9Altivec()) {
+      switch (IntrinsicID) {
+      default: llvm_unreachable("Unknown comparison intrinsic.");
+      case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break;
+      case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break;
+      case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break;
+      case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break;
+      case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break;
+      case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break;
+      }
+      isDot = 0;
+    } else
+      return false;
+    break;
+  case Intrinsic::ppc_altivec_vcmpgefp:   CompareOpc = 454; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp:   CompareOpc = 710; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb:   CompareOpc = 774; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh:   CompareOpc = 838; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw:   CompareOpc = 902; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsd:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 967;
+      isDot = 0;
+    } else
+      return false;
+
+    break;
+  case Intrinsic::ppc_altivec_vcmpgtub:   CompareOpc = 518; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh:   CompareOpc = 582; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw:   CompareOpc = 646; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtud:
+    if (Subtarget.hasP8Altivec()) {
+      CompareOpc = 711;
+      isDot = 0;
+    } else
+      return false;
+
+    break;
+  }
+  return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  unsigned IntrinsicID =
+    cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (IntrinsicID == Intrinsic::thread_pointer) {
+    // Reads the thread pointer register, used for __builtin_thread_pointer.
+    bool is64bit = Subtarget.isPPC64();
+    return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                           is64bit ? MVT::i64 : MVT::i32);
+  }
+
+  // If this is a lowered altivec predicate compare, CompareOpc is set to the
+  // opcode number of the comparison.
+  SDLoc dl(Op);
+  int CompareOpc;
+  bool isDot;
+  if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
+    return SDValue();    // Don't custom lower most intrinsics.
+
+  // If this is a non-dot comparison, make the VCMP node and we are done.
+  if (!isDot) {
+    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+                              Op.getOperand(1), Op.getOperand(2),
+                              DAG.getConstant(CompareOpc, dl, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
+  }
+
+  // Create the PPCISD altivec 'dot' comparison node.
+  SDValue Ops[] = {
+    Op.getOperand(2),  // LHS
+    Op.getOperand(3),  // RHS
+    DAG.getConstant(CompareOpc, dl, MVT::i32)
+  };
+  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
+  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+  // Now that we have the comparison, emit a copy from the CR to a GPR.
+  // This is flagged to the above dot comparison.
+  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
+                                DAG.getRegister(PPC::CR6, MVT::i32),
+                                CompNode.getValue(1));
+
+  // Unpack the result based on how the target uses it.
+  unsigned BitNo;   // Bit # of CR6.
+  bool InvertBit;   // Invert result?
+  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+  default:  // Can't happen, don't crash on invalid number though.
+  case 0:   // Return the value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = false;
+    break;
+  case 1:   // Return the inverted value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = true;
+    break;
+  case 2:   // Return the value of the LT bit of CR6.
+    BitNo = 2; InvertBit = false;
+    break;
+  case 3:   // Return the inverted value of the LT bit of CR6.
+    BitNo = 2; InvertBit = true;
+    break;
+  }
+
+  // Shift the bit into the low position.
+  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+                      DAG.getConstant(1, dl, MVT::i32));
+
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, dl, MVT::i32));
+  return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // For v2i64 (VSX), we can pattern patch the v2i32 case (using fp <-> int
+  // instructions), but for smaller types, we need to first extend up to v2i32
+  // before doing going farther.
+  if (Op.getValueType() == MVT::v2i64) {
+    EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    if (ExtVT != MVT::v2i32) {
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+                       DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+                                        ExtVT.getVectorElementType(), 4)));
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+                       DAG.getValueType(MVT::v2i32));
+    }
+
+    return Op;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = MFI.CreateStackObject(16, 16, false);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  // Store the input value into Value#0 of the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                               MachinePointerInfo());
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+         "Should only be called for ISD::INSERT_VECTOR_ELT");
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  // We have legal lowering for constant indices but not for variable ones.
+  if (C)
+    return Op;
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDNode *N = Op.getNode();
+
+  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
+         "Unknown extract_vector_elt type");
+
+  SDValue Value = N->getOperand(0);
+
+  // The first part of this is like the store lowering except that we don't
+  // need to track the chain.
+
+  // The values are now known to be -1 (false) or 1 (true). To convert this
+  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+  // understand how to form the extending load.
+  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+  // Now convert to an integer and store.
+  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
+    Value);
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = MFI.CreateStackObject(16, 16, false);
+  MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  SDValue StoreChain = DAG.getEntryNode();
+  SDValue Ops[] = {StoreChain,
+                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+                   Value, FIdx};
+  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
+
+  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+    dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+  // Extract the value requested.
+  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+  SDValue IntVal =
+      DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));
+
+  if (!Subtarget.useCRBits())
+    return IntVal;
+
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
+}
+
+/// Lowering for QPX v4i1 loads
+SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
+  SDValue LoadChain = LN->getChain();
+  SDValue BasePtr = LN->getBasePtr();
+
+  if (Op.getValueType() == MVT::v4f64 ||
+      Op.getValueType() == MVT::v4f32) {
+    EVT MemVT = LN->getMemoryVT();
+    unsigned Alignment = LN->getAlignment();
+
+    // If this load is properly aligned, then it is legal.
+    if (Alignment >= MemVT.getStoreSize())
+      return Op;
+
+    EVT ScalarVT = Op.getValueType().getScalarType(),
+        ScalarMemVT = MemVT.getScalarType();
+    unsigned Stride = ScalarMemVT.getStoreSize();
+
+    SDValue Vals[4], LoadChains[4];
+    for (unsigned Idx = 0; Idx < 4; ++Idx) {
+      SDValue Load;
+      if (ScalarVT != ScalarMemVT)
+        Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
+                              BasePtr,
+                              LN->getPointerInfo().getWithOffset(Idx * Stride),
+                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+                              LN->getMemOperand()->getFlags(), LN->getAAInfo());
+      else
+        Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
+                           LN->getPointerInfo().getWithOffset(Idx * Stride),
+                           MinAlign(Alignment, Idx * Stride),
+                           LN->getMemOperand()->getFlags(), LN->getAAInfo());
+
+      if (Idx == 0 && LN->isIndexed()) {
+        assert(LN->getAddressingMode() == ISD::PRE_INC &&
+               "Unknown addressing mode on vector load");
+        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
+                                  LN->getAddressingMode());
+      }
+
+      Vals[Idx] = Load;
+      LoadChains[Idx] = Load.getValue(1);
+
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(Stride, dl,
+                                            BasePtr.getValueType()));
+    }
+
+    SDValue TF =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+    SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);
+
+    if (LN->isIndexed()) {
+      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
+      return DAG.getMergeValues(RetOps, dl);
+    }
+
+    SDValue RetOps[] = { Value, TF };
+    return DAG.getMergeValues(RetOps, dl);
+  }
+
+  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
+  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");
+
+  // To lower v4i1 from a byte array, we load the byte elements of the
+  // vector and then reuse the BUILD_VECTOR logic.
+
+  SDValue VectElmts[4], VectElmtChains[4];
+  for (unsigned i = 0; i < 4; ++i) {
+    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
+
+    VectElmts[i] = DAG.getExtLoad(
+        ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
+        LN->getPointerInfo().getWithOffset(i), MVT::i8,
+        /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
+    VectElmtChains[i] = VectElmts[i].getValue(1);
+  }
+
+  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
+  SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);
+
+  SDValue RVals[] = { Value, LoadChain };
+  return DAG.getMergeValues(RVals, dl);
+}
+
+/// Lowering for QPX v4i1 stores
+SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
+  SDValue StoreChain = SN->getChain();
+  SDValue BasePtr = SN->getBasePtr();
+  SDValue Value = SN->getValue();
+
+  if (Value.getValueType() == MVT::v4f64 ||
+      Value.getValueType() == MVT::v4f32) {
+    EVT MemVT = SN->getMemoryVT();
+    unsigned Alignment = SN->getAlignment();
+
+    // If this store is properly aligned, then it is legal.
+    if (Alignment >= MemVT.getStoreSize())
+      return Op;
+
+    EVT ScalarVT = Value.getValueType().getScalarType(),
+        ScalarMemVT = MemVT.getScalarType();
+    unsigned Stride = ScalarMemVT.getStoreSize();
+
+    SDValue Stores[4];
+    for (unsigned Idx = 0; Idx < 4; ++Idx) {
+      SDValue Ex = DAG.getNode(
+          ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+          DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
+      SDValue Store;
+      if (ScalarVT != ScalarMemVT)
+        Store =
+            DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
+                              SN->getPointerInfo().getWithOffset(Idx * Stride),
+                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
+                              SN->getMemOperand()->getFlags(), SN->getAAInfo());
+      else
+        Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
+                             SN->getPointerInfo().getWithOffset(Idx * Stride),
+                             MinAlign(Alignment, Idx * Stride),
+                             SN->getMemOperand()->getFlags(), SN->getAAInfo());
+
+      if (Idx == 0 && SN->isIndexed()) {
+        assert(SN->getAddressingMode() == ISD::PRE_INC &&
+               "Unknown addressing mode on vector store");
+        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
+                                    SN->getAddressingMode());
+      }
+
+      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                            DAG.getConstant(Stride, dl,
+                                            BasePtr.getValueType()));
+      Stores[Idx] = Store;
+    }
+
+    SDValue TF =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+    if (SN->isIndexed()) {
+      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
+      return DAG.getMergeValues(RetOps, dl);
+    }
+
+    return TF;
+  }
+
+  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
+  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");
+
+  // The values are now known to be -1 (false) or 1 (true). To convert this
+  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
+  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
+  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);
+
+  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
+  // understand how to form the extending load.
+  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);
+
+  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);
+
+  // Now convert to an integer and store.
+  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
+    DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
+    Value);
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = MFI.CreateStackObject(16, 16, false);
+  MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  SDValue Ops[] = {StoreChain,
+                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
+                   Value, FIdx};
+  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);
+
+  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
+    dl, VTs, Ops, MVT::v4i32, PtrInfo);
+
+  // Move data into the byte array.
+  SDValue Loads[4], LoadChains[4];
+  for (unsigned i = 0; i < 4; ++i) {
+    unsigned Offset = 4*i;
+    SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);
+
+    Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
+                           PtrInfo.getWithOffset(Offset));
+    LoadChains[i] = Loads[i].getValue(1);
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+
+  SDValue Stores[4];
+  for (unsigned i = 0; i < 4; ++i) {
+    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);
+
+    Stores[i] = DAG.getTruncStore(
+        StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
+        MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
+        SN->getAAInfo());
+  }
+
+  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+  return StoreChain;
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, dl, MVT::v4i32);
+
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    bool isLittleEndian = Subtarget.isLittleEndian();
+
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                           LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                          LHS, RHS, DAG, dl, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
+
+    // Merge the results together.  Because vmuleub and vmuloub are
+    // instructions with a big-endian bias, we must reverse the
+    // element numbering and reverse the meaning of "odd" and "even"
+    // when generating little endian code.
+    int Ops[16];
+    for (unsigned i = 0; i != 8; ++i) {
+      if (isLittleEndian) {
+        Ops[i*2  ] = 2*i;
+        Ops[i*2+1] = 2*i+16;
+      } else {
+        Ops[i*2  ] = 2*i+1;
+        Ops[i*2+1] = 2*i+1+16;
+      }
+    }
+    if (isLittleEndian)
+      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
+    else
+      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+  } else {
+    llvm_unreachable("Unknown mul to lower!");
+  }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG);
+
+  case ISD::STACKRESTORE:
+    return LowerSTACKRESTORE(Op, DAG);
+
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
+
+  case ISD::GET_DYNAMIC_AREA_OFFSET:
+    return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
+
+  case ISD::EH_DWARF_CFA:
+    return LowerEH_DWARF_CFA(Op, DAG);
+
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
+                                                      SDLoc(Op));
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+
+  // Lower 64-bit shifts.
+  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
+  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
+
+  // For counter-based loop handling.
+  case ISD::INTRINSIC_W_CHAIN:  return SDValue();
+
+  // Frame & Return address.
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  }
+}
+
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::READCYCLECOUNTER: {
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
+
+    Results.push_back(RTB);
+    Results.push_back(RTB.getValue(1));
+    Results.push_back(RTB.getValue(2));
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
+        Intrinsic::ppc_is_decremented_ctr_nonzero)
+      break;
+
+    assert(N->getValueType(0) == MVT::i1 &&
+           "Unexpected result type for CTR decrement intrinsic");
+    EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+                                 N->getValueType(0));
+    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
+    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
+                                 N->getOperand(1));
+
+    Results.push_back(NewInt);
+    Results.push_back(NewInt.getValue(1));
+    break;
+  }
+  case ISD::VAARG: {
+    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
+      return;
+
+    EVT VT = N->getValueType(0);
+
+    if (VT == MVT::i64) {
+      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
+
+      Results.push_back(NewNode);
+      Results.push_back(NewNode.getValue(1));
+    }
+    return;
+  }
+  case ISD::FP_ROUND_INREG: {
+    assert(N->getValueType(0) == MVT::ppcf128);
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(0, dl));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(1, dl));
+
+    // Add the two halves of the long double in round-to-zero mode.
+    SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+    // We know the low half is about to be thrown away, so just use something
+    // convenient.
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+                                FPreg, FPreg));
+    return;
+  }
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // LowerFP_TO_INT() can only handle f32 and f64.
+    if (N->getOperand(0).getValueType() == MVT::ppcf128)
+      return;
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+    return;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *Func = Intrinsic::getDeclaration(M, Id);
+  return Builder.CreateCall(Func, {});
+}
+
+// The mappings for emitLeading/TrailingFence is taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                         AtomicOrdering Ord, bool IsStore,
+                                         bool IsLoad) const {
+  if (Ord == AtomicOrdering::SequentiallyConsistent)
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  if (isReleaseOrStronger(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  return nullptr;
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                          AtomicOrdering Ord, bool IsStore,
+                                          bool IsLoad) const {
+  if (IsLoad && isAcquireOrStronger(Ord))
+    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+  // FIXME: this is too conservative, a dependent branch + isync is enough.
+  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  return nullptr;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
+                                    unsigned AtomicSize,
+                                    unsigned BinOpcode,
+                                    unsigned CmpOpcode,
+                                    unsigned CmpPred) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  auto LoadMnemonic = PPC::LDARX;
+  auto StoreMnemonic = PPC::STDCX;
+  switch (AtomicSize) {
+  default:
+    llvm_unreachable("Unexpected size of atomic entity");
+  case 1:
+    LoadMnemonic = PPC::LBARX;
+    StoreMnemonic = PPC::STBCX;
+    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+    break;
+  case 2:
+    LoadMnemonic = PPC::LHARX;
+    StoreMnemonic = PPC::STHCX;
+    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
+    break;
+  case 4:
+    LoadMnemonic = PPC::LWARX;
+    StoreMnemonic = PPC::STWCX;
+    break;
+  case 8:
+    LoadMnemonic = PPC::LDARX;
+    StoreMnemonic = PPC::STDCX;
+    break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  unsigned dest = MI.getOperand(0).getReg();
+  unsigned ptrA = MI.getOperand(1).getReg();
+  unsigned ptrB = MI.getOperand(2).getReg();
+  unsigned incr = MI.getOperand(3).getReg();
+  DebugLoc dl = MI.getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB =
+    CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  if (CmpOpcode)
+    F->insert(It, loop2MBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned TmpReg = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
+                                           : &PPC::GPRCRegClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   l[wd]arx dest, ptr
+  //   add r0, dest, incr
+  //   st[wd]cx. r0, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+
+  // For max/min...
+  //  loopMBB:
+  //   l[wd]arx dest, ptr
+  //   cmpl?[wd] incr, dest
+  //   bgt exitMBB
+  //  loop2MBB:
+  //   st[wd]cx. dest, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
+    .addReg(ptrA).addReg(ptrB);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+  if (CmpOpcode) {
+    // Signed comparisons of byte or halfword values must be sign-extended.
+    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
+      unsigned ExtReg =  RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+      BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
+              ExtReg).addReg(dest);
+      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+        .addReg(incr).addReg(ExtReg);
+    } else
+      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+        .addReg(incr).addReg(dest);
+
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(exitMBB);
+    BB = loop2MBB;
+  }
+  BuildMI(BB, dl, TII->get(StoreMnemonic))
+    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr &MI,
+                                            MachineBasicBlock *BB,
+                                            bool is8bit, // operation
+                                            unsigned BinOpcode,
+                                            unsigned CmpOpcode,
+                                            unsigned CmpPred) const {
+  // If we support part-word atomic mnemonics, just use them
+  if (Subtarget.hasPartwordAtomics())
+    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode,
+                            CmpOpcode, CmpPred);
+
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  // In 64 bit mode we have to use 64 bits for addresses, even though the
+  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
+  // registers without caring whether they're 32 or 64, but here we're
+  // doing actual arithmetic on the addresses.
+  bool is64bit = Subtarget.isPPC64();
+  bool isLittleEndian = Subtarget.isLittleEndian();
+  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  unsigned dest = MI.getOperand(0).getReg();
+  unsigned ptrA = MI.getOperand(1).getReg();
+  unsigned ptrB = MI.getOperand(2).getReg();
+  unsigned incr = MI.getOperand(3).getReg();
+  DebugLoc dl = MI.getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB =
+    CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  if (CmpOpcode)
+    F->insert(It, loop2MBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                          : &PPC::GPRCRegClass;
+  unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+  unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftReg =
+    isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
+  unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+  unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+  unsigned Ptr1Reg;
+  unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  // The 4-byte load must be aligned, while a char or short may be
+  // anywhere in the word.  Hence all this nasty bookkeeping code.
+  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+  //   xori shift, shift1, 24 [16]
+  //   rlwinm ptr, ptr1, 0, 0, 29
+  //   slw incr2, incr, shift
+  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+  //   slw mask, mask2, shift
+  //  loopMBB:
+  //   lwarx tmpDest, ptr
+  //   add tmp, tmpDest, incr2
+  //   andc tmp2, tmpDest, mask
+  //   and tmp3, tmp, mask
+  //   or tmp4, tmp3, tmp2
+  //   stwcx. tmp4, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  //   srw dest, tmpDest, shift
+  if (ptrA != ZeroReg) {
+    Ptr1Reg = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+      .addReg(ptrA).addReg(ptrB);
+  } else {
+    Ptr1Reg = ptrB;
+  }
+  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+      .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+  if (!isLittleEndian)
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+        .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+  if (is64bit)
+    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(61);
+  else
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
+      .addReg(incr).addReg(ShiftReg);
+  if (is8bit)
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+  else {
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+    BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
+  }
+  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+      .addReg(Mask2Reg).addReg(ShiftReg);
+
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+    .addReg(ZeroReg).addReg(PtrReg);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
+      .addReg(Incr2Reg).addReg(TmpDestReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
+    .addReg(TmpDestReg).addReg(MaskReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
+    .addReg(TmpReg).addReg(MaskReg);
+  if (CmpOpcode) {
+    // For unsigned comparisons, we can directly compare the shifted values.
+    // For signed comparisons we shift and sign extend.
+    unsigned SReg = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), SReg)
+      .addReg(TmpDestReg).addReg(MaskReg);
+    unsigned ValueReg = SReg;
+    unsigned CmpReg = Incr2Reg;
+    if (CmpOpcode == PPC::CMPW) {
+      ValueReg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
+        .addReg(SReg).addReg(ShiftReg);
+      unsigned ValueSReg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
+        .addReg(ValueReg);
+      ValueReg = ValueSReg;
+      CmpReg = incr;
+    }
+    BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
+      .addReg(CmpReg).addReg(ValueReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(exitMBB);
+    BB = loop2MBB;
+  }
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
+    .addReg(Tmp3Reg).addReg(Tmp2Reg);
+  BuildMI(BB, dl, TII->get(PPC::STWCX))
+    .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
+    .addReg(ShiftReg);
+  return BB;
+}
+
+llvm::MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = ++MBB->getIterator();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  SjLjSetup mainMBB
+  //  bl mainMBB
+  //  v_restore = 1
+  //  b sinkMBB
+  //
+  // mainMBB:
+  //  buf[LabelOffset] = LR
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Note that the structure of the jmp_buf used here is not compatible
+  // with that used by libc, and is not designed to be. Specifically, it
+  // stores only those 'reserved' registers that LLVM does not otherwise
+  // understand how to spill. Also, by convention, by the time this
+  // intrinsic is called, Clang has already stored the frame address in the
+  // first slot of the buffer and stack address in the third. Following the
+  // X86 target code, we'll store the jump address in the second slot. We also
+  // need to save the TOC pointer (R2) to handle jumps between shared
+  // libraries, and that will be stored in the fourth slot. The thread
+  // identifier (R13) is not affected.
+
+  // thisMBB:
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
+
+  // Prepare IP either in reg.
+  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
+  unsigned BufReg = MI.getOperand(1).getReg();
+
+  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MBB->getParent());
+    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
+            .addReg(PPC::X2)
+            .addImm(TOCOffset)
+            .addReg(BufReg);
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Naked functions never have a base pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned BaseReg;
+  if (MF->getFunction()->hasFnAttribute(Attribute::Naked))
+    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
+  else
+    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;
+
+  MIB = BuildMI(*thisMBB, MI, DL,
+                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
+            .addReg(BaseReg)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  MIB.addRegMask(TRI->getNoPreservedMask());
+
+  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
+
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
+          .addMBB(mainMBB);
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
+
+  thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
+  thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());
+
+  // mainMBB:
+  //  mainDstReg = 0
+  MIB =
+      BuildMI(mainMBB, DL,
+              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+
+  // Store IP
+  if (Subtarget.isPPC64()) {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
+            .addReg(LabelReg)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
+            .addReg(LabelReg)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(PPC::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(thisMBB);
+
+  MI.eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
+  unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
+  unsigned BP =
+      (PVT == MVT::i64)
+          ? PPC::X30
+          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
+                                                              : PPC::R30);
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset    = 2 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
+
+  unsigned BufReg = MI.getOperand(0).getReg();
+
+  // Reload FP (the jumped-to function may not have had a
+  // frame pointer, and if so, then its r31 will be restored
+  // as necessary).
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload IP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload SP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
+            .addImm(SPOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
+            .addImm(SPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload BP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload TOC
+  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
+    setUsesTOCBasePtr(*MBB->getParent());
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
+            .addImm(TOCOffset)
+            .addReg(BufReg);
+
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Jump
+  BuildMI(*MBB, MI, DL,
+          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
+  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const {
+  if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+    if (Subtarget.isPPC64() && Subtarget.isSVR4ABI() &&
+        MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+      // Call lowering should have added an r2 operand to indicate a dependence
+      // on the TOC base pointer value. It can't however, because there is no
+      // way to mark the dependence as implicit there, and so the stackmap code
+      // will confuse it with a regular operand. Instead, add the dependence
+      // here.
+      setUsesTOCBasePtr(*BB->getParent());
+      MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
+    }
+
+    return emitPatchPoint(MI, BB);
+  }
+
+  if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
+      MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
+    return emitEHSjLjSetJmp(MI, BB);
+  } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
+             MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
+    return emitEHSjLjLongJmp(MI, BB);
+  }
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  // To "insert" these instructions we actually have to insert their
+  // control-flow patterns.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  MachineFunction *F = BB->getParent();
+
+  if (Subtarget.hasISEL() &&
+      (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+       MI.getOpcode() == PPC::SELECT_CC_I8 ||
+       MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
+    SmallVector<MachineOperand, 2> Cond;
+    if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+        MI.getOpcode() == PPC::SELECT_CC_I8)
+      Cond.push_back(MI.getOperand(4));
+    else
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(MI.getOperand(1));
+
+    DebugLoc dl = MI.getDebugLoc();
+    TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
+                      MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
+  } else if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+             MI.getOpcode() == PPC::SELECT_CC_I8 ||
+             MI.getOpcode() == PPC::SELECT_CC_F4 ||
+             MI.getOpcode() == PPC::SELECT_CC_F8 ||
+             MI.getOpcode() == PPC::SELECT_CC_QFRC ||
+             MI.getOpcode() == PPC::SELECT_CC_QSRC ||
+             MI.getOpcode() == PPC::SELECT_CC_QBRC ||
+             MI.getOpcode() == PPC::SELECT_CC_VRRC ||
+             MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
+             MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
+             MI.getOpcode() == PPC::SELECT_CC_VSRC ||
+             MI.getOpcode() == PPC::SELECT_I4 ||
+             MI.getOpcode() == PPC::SELECT_I8 ||
+             MI.getOpcode() == PPC::SELECT_F4 ||
+             MI.getOpcode() == PPC::SELECT_F8 ||
+             MI.getOpcode() == PPC::SELECT_QFRC ||
+             MI.getOpcode() == PPC::SELECT_QSRC ||
+             MI.getOpcode() == PPC::SELECT_QBRC ||
+             MI.getOpcode() == PPC::SELECT_VRRC ||
+             MI.getOpcode() == PPC::SELECT_VSFRC ||
+             MI.getOpcode() == PPC::SELECT_VSSRC ||
+             MI.getOpcode() == PPC::SELECT_VSRC) {
+    // The incoming instruction knows the destination vreg to set, the
+    // condition code register to branch on, the true/false values to
+    // select between, and a branch opcode to use.
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    DebugLoc dl = MI.getDebugLoc();
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    // Next, add the true and fallthrough blocks as its successors.
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
+        MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
+        MI.getOpcode() == PPC::SELECT_QFRC ||
+        MI.getOpcode() == PPC::SELECT_QSRC ||
+        MI.getOpcode() == PPC::SELECT_QBRC ||
+        MI.getOpcode() == PPC::SELECT_VRRC ||
+        MI.getOpcode() == PPC::SELECT_VSFRC ||
+        MI.getOpcode() == PPC::SELECT_VSSRC ||
+        MI.getOpcode() == PPC::SELECT_VSRC) {
+      BuildMI(BB, dl, TII->get(PPC::BC))
+          .addReg(MI.getOperand(1).getReg())
+          .addMBB(sinkMBB);
+    } else {
+      unsigned SelectPred = MI.getOperand(4).getImm();
+      BuildMI(BB, dl, TII->get(PPC::BCC))
+          .addImm(SelectPred)
+          .addReg(MI.getOperand(1).getReg())
+          .addMBB(sinkMBB);
+    }
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
+        .addReg(MI.getOperand(3).getReg())
+        .addMBB(copy0MBB)
+        .addReg(MI.getOperand(2).getReg())
+        .addMBB(thisMBB);
+  } else if (MI.getOpcode() == PPC::ReadTB) {
+    // To read the 64-bit time-base register on a 32-bit target, we read the
+    // two halves. Should the counter have wrapped while it was being read, we
+    // need to try again.
+    // ...
+    // readLoop:
+    // mfspr Rx,TBU # load from TBU
+    // mfspr Ry,TB  # load from TB
+    // mfspr Rz,TBU # load from TBU
+    // cmpw crX,Rx,Rz # check if 'old'='new'
+    // bne readLoop   # branch if they're not equal
+    // ...
+
+    MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    DebugLoc dl = MI.getDebugLoc();
+    F->insert(It, readMBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(readMBB);
+    BB = readMBB;
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
+    unsigned LoReg = MI.getOperand(0).getReg();
+    unsigned HiReg = MI.getOperand(1).getReg();
+
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
+    BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
+
+    unsigned CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+    BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
+      .addReg(HiReg).addReg(ReadAgainReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(CmpReg).addMBB(readMBB);
+
+    BB->addSuccessor(readMBB);
+    BB->addSuccessor(sinkMBB);
+  } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
+  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
+    BB = EmitAtomicBinary(MI, BB, 4, 0);
+  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
+    BB = EmitAtomicBinary(MI, BB, 8, 0);
+
+  else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+           MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
+           (Subtarget.hasPartwordAtomics() &&
+            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
+    bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+    auto LoadMnemonic = PPC::LDARX;
+    auto StoreMnemonic = PPC::STDCX;
+    switch (MI.getOpcode()) {
+    default:
+      llvm_unreachable("Compare and swap of unknown size");
+    case PPC::ATOMIC_CMP_SWAP_I8:
+      LoadMnemonic = PPC::LBARX;
+      StoreMnemonic = PPC::STBCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I16:
+      LoadMnemonic = PPC::LHARX;
+      StoreMnemonic = PPC::STHCX;
+      assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I32:
+      LoadMnemonic = PPC::LWARX;
+      StoreMnemonic = PPC::STWCX;
+      break;
+    case PPC::ATOMIC_CMP_SWAP_I64:
+      LoadMnemonic = PPC::LDARX;
+      StoreMnemonic = PPC::STDCX;
+      break;
+    }
+    unsigned dest = MI.getOperand(0).getReg();
+    unsigned ptrA = MI.getOperand(1).getReg();
+    unsigned ptrB = MI.getOperand(2).getReg();
+    unsigned oldval = MI.getOperand(3).getReg();
+    unsigned newval = MI.getOperand(4).getReg();
+    DebugLoc dl = MI.getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // loop1MBB:
+    //   l[bhwd]arx dest, ptr
+    //   cmp[wd] dest, oldval
+    //   bne- midMBB
+    // loop2MBB:
+    //   st[bhwd]cx. newval, ptr
+    //   bne- loopMBB
+    //   b exitBB
+    // midMBB:
+    //   st[bhwd]cx. dest, ptr
+    // exitBB:
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
+      .addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+      .addReg(oldval).addReg(dest);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(StoreMnemonic))
+      .addReg(newval).addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(StoreMnemonic))
+      .addReg(dest).addReg(ptrA).addReg(ptrB);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+  } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+             MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+    // We must use 64-bit registers for addresses when targeting 64-bit,
+    // since we're actually doing arithmetic on them.  Other registers
+    // can be 32-bit.
+    bool is64bit = Subtarget.isPPC64();
+    bool isLittleEndian = Subtarget.isLittleEndian();
+    bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+    unsigned dest = MI.getOperand(0).getReg();
+    unsigned ptrA = MI.getOperand(1).getReg();
+    unsigned ptrB = MI.getOperand(2).getReg();
+    unsigned oldval = MI.getOperand(3).getReg();
+    unsigned newval = MI.getOperand(4).getReg();
+    DebugLoc dl = MI.getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    const TargetRegisterClass *RC = is64bit ? &PPC::G8RCRegClass
+                                            : &PPC::GPRCRegClass;
+    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+    unsigned ShiftReg =
+      isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(RC);
+    unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+    unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+    unsigned Ptr1Reg;
+    unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+    unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // The 4-byte load must be aligned, while a char or short may be
+    // anywhere in the word.  Hence all this nasty bookkeeping code.
+    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+    //   xori shift, shift1, 24 [16]
+    //   rlwinm ptr, ptr1, 0, 0, 29
+    //   slw newval2, newval, shift
+    //   slw oldval2, oldval,shift
+    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+    //   slw mask, mask2, shift
+    //   and newval3, newval2, mask
+    //   and oldval3, oldval2, mask
+    // loop1MBB:
+    //   lwarx tmpDest, ptr
+    //   and tmp, tmpDest, mask
+    //   cmpw tmp, oldval3
+    //   bne- midMBB
+    // loop2MBB:
+    //   andc tmp2, tmpDest, mask
+    //   or tmp4, tmp2, newval3
+    //   stwcx. tmp4, ptr
+    //   bne- loop1MBB
+    //   b exitBB
+    // midMBB:
+    //   stwcx. tmpDest, ptr
+    // exitBB:
+    //   srw dest, tmpDest, shift
+    if (ptrA != ZeroReg) {
+      Ptr1Reg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+        .addReg(ptrA).addReg(ptrB);
+    } else {
+      Ptr1Reg = ptrB;
+    }
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+        .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+    if (!isLittleEndian)
+      BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+          .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+    if (is64bit)
+      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(61);
+    else
+      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+        .addReg(newval).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+        .addReg(oldval).addReg(ShiftReg);
+    if (is8bit)
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+    else {
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+        .addReg(Mask3Reg).addImm(65535);
+    }
+    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+        .addReg(Mask2Reg).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+        .addReg(NewVal2Reg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+        .addReg(OldVal2Reg).addReg(MaskReg);
+
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+        .addReg(TmpReg).addReg(OldVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+        .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
+        .addReg(Tmp2Reg).addReg(NewVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+      .addReg(ZeroReg).addReg(PtrReg);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
+      .addReg(ShiftReg);
+  } else if (MI.getOpcode() == PPC::FADDrtz) {
+    // This pseudo performs an FADD with rounding mode temporarily forced
+    // to round-to-zero.  We emit this via custom inserter since the FPSCR
+    // is not modeled at the SelectionDAG level.
+    unsigned Dest = MI.getOperand(0).getReg();
+    unsigned Src1 = MI.getOperand(1).getReg();
+    unsigned Src2 = MI.getOperand(2).getReg();
+    DebugLoc dl = MI.getDebugLoc();
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+
+    // Save FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
+
+    // Set rounding mode to round-to-zero.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
+
+    // Perform addition.
+    BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
+
+    // Restore FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
+  } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+             MI.getOpcode() == PPC::ANDIo_1_GT_BIT ||
+             MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+             MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) {
+    unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
+                       MI.getOpcode() == PPC::ANDIo_1_GT_BIT8)
+                          ? PPC::ANDIo8
+                          : PPC::ANDIo;
+    bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT ||
+                 MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
+                                                  &PPC::GPRCRegClass :
+                                                  &PPC::G8RCRegClass);
+
+    DebugLoc dl = MI.getDebugLoc();
+    BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
+        .addReg(MI.getOperand(1).getReg())
+        .addImm(1);
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
+            MI.getOperand(0).getReg())
+        .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
+  } else if (MI.getOpcode() == PPC::TCHECK_RET) {
+    DebugLoc Dl = MI.getDebugLoc();
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+    BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
+    return BB;
+  } else {
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
+  // For the estimates, convergence is quadratic, so we essentially double the
+  // number of digits correct after every iteration. For both FRE and FRSQRTE,
+  // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
+  // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
+  int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
+  if (VT.getScalarType() == MVT::f64)
+    RefinementSteps++;
+  return RefinementSteps;
+}
+
+SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+                                           int Enabled, int &RefinementSteps,
+                                           bool &UseOneConstNR,
+                                           bool Reciprocal) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+    if (RefinementSteps == ReciprocalEstimate::Unspecified)
+      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+
+    UseOneConstNR = true;
+    return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
+                                            int Enabled,
+                                            int &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
+      (VT == MVT::f64 && Subtarget.hasFRE()) ||
+      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
+      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
+      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
+    if (RefinementSteps == ReciprocalEstimate::Unspecified)
+      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+    return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+  }
+  return SDValue();
+}
+
+unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
+  // Note: This functionality is used only when unsafe-fp-math is enabled, and
+  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+  // enabled for division), this functionality is redundant with the default
+  // combiner logic (once the division -> reciprocal/multiply transformation
+  // has taken place). As a result, this matters more for older cores than for
+  // newer ones.
+
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are two or more FDIVs (for embedded cores with only
+  // one FP pipeline) for three or more FDIVs (for generic OOO cores).
+  switch (Subtarget.getDarwinDirective()) {
+  default:
+    return 3;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return 2;
+  }
+}
+
+// isConsecutiveLSLoc needs to work even if all adds have not yet been
+// collapsed, and so we need to look through chains of them.
+static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
+                                     int64_t& Offset, SelectionDAG &DAG) {
+  if (DAG.isBaseWithConstantOffset(Loc)) {
+    Base = Loc.getOperand(0);
+    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+
+    // The base might itself be a base plus an offset, and if so, accumulate
+    // that as well.
+    getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
+  }
+}
+
+static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue BaseLoc = Base->getBasePtr();
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI.getObjectSize(FI);
+    int BFS = MFI.getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
+  }
+
+  SDValue Base1 = Loc, Base2 = BaseLoc;
+  int64_t Offset1 = 0, Offset2 = 0;
+  getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
+  getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
+  if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
+    return true;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const GlobalValue *GV1 = nullptr;
+  const GlobalValue *GV2 = nullptr;
+  Offset1 = 0;
+  Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
+    EVT VT = LS->getMemoryVT();
+    SDValue Loc = LS->getBasePtr();
+    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_qpx_qvlfd:
+    case Intrinsic::ppc_qpx_qvlfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfs:
+    case Intrinsic::ppc_qpx_qvlfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcd:
+    case Intrinsic::ppc_qpx_qvlfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcs:
+    case Intrinsic::ppc_qpx_qvlfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfiwa:
+    case Intrinsic::ppc_qpx_qvlfiwz:
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
+    case Intrinsic::ppc_vsx_lxvw4x_be:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+    case Intrinsic::ppc_vsx_lxvd2x_be:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
+  }
+
+  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_qpx_qvstfd:
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfs:
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcd:
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcs:
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfiw:
+    case Intrinsic::ppc_qpx_qvstfiwa:
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_vsx_stxvw4x_be:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x_be:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    }
+
+    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+  }
+
+  return false;
+}
+
+// Return true is there is a nearyby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking though
+// token factors and other loads (but nothing else). As a result, a true result
+// indicates that it is safe to create a new consecutive load adjacent to the
+// load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDValue Chain = LD->getChain();
+  EVT VT = LD->getMemoryVT();
+
+  SmallSet<SDNode *, 16> LoadRoots;
+  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+  SmallSet<SDNode *, 16> Visited;
+
+  // First, search up the chain, branching to follow all token-factor operands.
+  // If we find a consecutive load, then we're done, otherwise, record all
+  // nodes just above the top-level loads and token factors.
+  while (!Queue.empty()) {
+    SDNode *ChainNext = Queue.pop_back_val();
+    if (!Visited.insert(ChainNext).second)
+      continue;
+
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
+      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+        return true;
+
+      if (!Visited.count(ChainLD->getChain().getNode()))
+        Queue.push_back(ChainLD->getChain().getNode());
+    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+      for (const SDUse &O : ChainNext->ops())
+        if (!Visited.count(O.getNode()))
+          Queue.push_back(O.getNode());
+    } else
+      LoadRoots.insert(ChainNext);
+  }
+
+  // Second, search down the chain, starting from the top-level nodes recorded
+  // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look though
+  // all loads (just the chain uses) and token factors to find a consecutive
+  // load.
+  Visited.clear();
+  Queue.clear();
+
+  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+       IE = LoadRoots.end(); I != IE; ++I) {
+    Queue.push_back(*I);
+
+    while (!Queue.empty()) {
+      SDNode *LoadRoot = Queue.pop_back_val();
+      if (!Visited.insert(LoadRoot).second)
+        continue;
+
+      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
+        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+          return true;
+
+      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+           UE = LoadRoot->use_end(); UI != UE; ++UI)
+        if (((isa<MemSDNode>(*UI) &&
+            cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+            UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+          Queue.push_back(*UI);
+    }
+  }
+
+  return false;
+}
+
+
+/// This function is called when we have proved that a SETCC node can be replaced
+/// by subtraction (and other supporting instructions) so that the result of
+/// comparison is kept in a GPR instead of CR. This function is purely for
+/// codegen purposes and has some flags to guide the codegen process.
+static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
+                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  // Zero extend the operands to the largest legal integer. Originally, they
+  // must be of a strictly smaller size.
+  auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
+                         DAG.getConstant(Size, DL, MVT::i32));
+  auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
+                         DAG.getConstant(Size, DL, MVT::i32));
+
+  // Swap if needed. Depends on the condition code.
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Subtract extended integers.
+  auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
+
+  // Move the sign bit to the least significant position and zero out the rest.
+  // Now the least significant bit carries the result of original comparison.
+  auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
+                             DAG.getConstant(Size - 1, DL, MVT::i32));
+  auto Final = Shifted;
+
+  // Complement the result if needed. Based on the condition code.
+  if (Complement)
+    Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
+                        DAG.getConstant(1, DL, MVT::i64));
+
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
+}
+
+SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Size of integers being compared has a critical role in the following
+  // analysis, so we prefer to do this when all types are legal.
+  if (!DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  // If all users of SETCC extend its value to a legal integer type
+  // then we replace SETCC with a subtraction
+  for (SDNode::use_iterator UI = N->use_begin(),
+       UE = N->use_end(); UI != UE; ++UI) {
+    if (UI->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+  }
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  auto OpSize = N->getOperand(0).getValueSizeInBits();
+
+  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
+
+  if (OpSize < Size) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+      return generateEquivalentSub(N, Size, false, false, DL, DAG);
+    case ISD::SETULE:
+      return generateEquivalentSub(N, Size, true, true, DL, DAG);
+    case ISD::SETUGT:
+      return generateEquivalentSub(N, Size, false, true, DL, DAG);
+    case ISD::SETUGE:
+      return generateEquivalentSub(N, Size, true, false, DL, DAG);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   trunc(binary-ops(zext(x), zext(y)))
+  // or
+  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
+  // such that we're unnecessarily moving things into GPRs when it would be
+  // better to keep them in CR bits.
+
+  // Note that trunc here can be an actual i1 trunc, or can be the effective
+  // truncation that comes from a setcc or select_cc.
+  if (N->getOpcode() == ISD::TRUNCATE &&
+      N->getValueType(0) != MVT::i1)
+    return SDValue();
+
+  if (N->getOperand(0).getValueType() != MVT::i32 &&
+      N->getOperand(0).getValueType() != MVT::i64)
+    return SDValue();
+
+  if (N->getOpcode() == ISD::SETCC ||
+      N->getOpcode() == ISD::SELECT_CC) {
+    // If we're looking at a comparison, then we need to make sure that the
+    // high bits (all except for the first) don't matter the result.
+    ISD::CondCode CC =
+      cast<CondCodeSDNode>(N->getOperand(
+        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+    unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+    if (ISD::isSignedIntSetCC(CC)) {
+      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+        return SDValue();
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (!DAG.MaskedValueIsZero(N->getOperand(0),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+          !DAG.MaskedValueIsZero(N->getOperand(1),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
+        return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
+                                             : SDValue());
+    } else {
+      // This is neither a signed nor an unsigned comparison, just make sure
+      // that the high bits are equal.
+      APInt Op1Zero, Op1One;
+      APInt Op2Zero, Op2One;
+      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+
+      // We don't really care about what is known about the first bit (if
+      // anything), so clear it in all masks prior to comparing them.
+      Op1Zero.clearBit(0); Op1One.clearBit(0);
+      Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+      if (Op1Zero != Op2Zero || Op1One != Op2One)
+        return SDValue();
+    }
+  }
+
+  // We now know that the higher-order bits are irrelevant, we just need to
+  // make sure that all of the intermediate operations are bit operations, and
+  // all inputs are extensions.
+  if (N->getOperand(0).getOpcode() != ISD::AND &&
+      N->getOperand(0).getOpcode() != ISD::OR  &&
+      N->getOperand(0).getOpcode() != ISD::XOR &&
+      N->getOperand(0).getOpcode() != ISD::SELECT &&
+      N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
+      N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
+      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
+    return SDValue();
+
+  if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
+      N->getOperand(1).getOpcode() != ISD::AND &&
+      N->getOperand(1).getOpcode() != ISD::OR  &&
+      N->getOperand(1).getOpcode() != ISD::XOR &&
+      N->getOperand(1).getOpcode() != ISD::SELECT &&
+      N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
+      N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
+      N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
+      N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+      N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
+    return SDValue();
+
+  SmallVector<SDValue, 4> Inputs;
+  SmallVector<SDValue, 8> BinOps, PromOps;
+  SmallPtrSet<SDNode *, 16> Visited;
+
+  for (unsigned i = 0; i < 2; ++i) {
+    if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+          N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+          N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+        isa<ConstantSDNode>(N->getOperand(i)))
+      Inputs.push_back(N->getOperand(i));
+    else
+      BinOps.push_back(N->getOperand(i));
+
+    if (N->getOpcode() == ISD::TRUNCATE)
+      break;
+  }
+
+  // Visit all inputs, collect all binary operations (and, or, xor and
+  // select) that are all fed by extensions.
+  while (!BinOps.empty()) {
+    SDValue BinOp = BinOps.back();
+    BinOps.pop_back();
+
+    if (!Visited.insert(BinOp.getNode()).second)
+      continue;
+
+    PromOps.push_back(BinOp);
+
+    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+      // The condition of the select is not promoted.
+      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+        continue;
+      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+        continue;
+
+      if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+            BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+            BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
+           BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
+          isa<ConstantSDNode>(BinOp.getOperand(i))) {
+        Inputs.push_back(BinOp.getOperand(i));
+      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::OR  ||
+                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
+                 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
+        BinOps.push_back(BinOp.getOperand(i));
+      } else {
+        // We have an input that is not an extension or another binary
+        // operation; we'll abort this transformation.
+        return SDValue();
+      }
+    }
+  }
+
+  // Make sure that this is a self-contained cluster of operations (which
+  // is not quite the same thing as saying that everything has only one
+  // use).
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+                              UE = Inputs[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == Inputs[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == Inputs[i] ||
+            User->getOperand(1) == Inputs[i])
+          return SDValue();
+      }
+    }
+  }
+
+  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+                              UE = PromOps[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // Make sure that we're not going to promote the non-output-value
+      // operand(s) or SELECT or SELECT_CC.
+      // FIXME: Although we could sometimes handle this, and it does occur in
+      // practice that one of the condition inputs to the select is also one of
+      // the outputs, we currently can't deal with this.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == PromOps[i])
+          return SDValue();
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == PromOps[i] ||
+            User->getOperand(1) == PromOps[i])
+          return SDValue();
+      }
+    }
+  }
+
+  // Replace all inputs with the extension operand.
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    // Constants may have users outside the cluster of to-be-promoted nodes,
+    // and so we need to replace those as we do the promotions.
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+    else
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
+  }
+
+  std::list<HandleSDNode> PromOpHandles;
+  for (auto &PromOp : PromOps)
+    PromOpHandles.emplace_back(PromOp);
+
+  // Replace all operations (these are all the same, but have a different
+  // (i1) return type). DAG.getNode will validate that the types of
+  // a binary operator match, so go through the list in reverse so that
+  // we've likely promoted both operands first. Any intermediate truncations or
+  // extensions disappear.
+  while (!PromOpHandles.empty()) {
+    SDValue PromOp = PromOpHandles.back().getValue();
+    PromOpHandles.pop_back();
+
+    if (PromOp.getOpcode() == ISD::TRUNCATE ||
+        PromOp.getOpcode() == ISD::SIGN_EXTEND ||
+        PromOp.getOpcode() == ISD::ZERO_EXTEND ||
+        PromOp.getOpcode() == ISD::ANY_EXTEND) {
+      if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
+          PromOp.getOperand(0).getValueType() != MVT::i1) {
+        // The operand is not yet ready (see comment below).
+        PromOpHandles.emplace_front(PromOp);
+        continue;
+      }
+
+      SDValue RepValue = PromOp.getOperand(0);
+      if (isa<ConstantSDNode>(RepValue))
+        RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
+
+      DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
+      continue;
+    }
+
+    unsigned C;
+    switch (PromOp.getOpcode()) {
+    default:             C = 0; break;
+    case ISD::SELECT:    C = 1; break;
+    case ISD::SELECT_CC: C = 2; break;
+    }
+
+    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+         PromOp.getOperand(C).getValueType() != MVT::i1) ||
+        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+         PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
+      // The to-be-promoted operands of this node have not yet been
+      // promoted (this should be rare because we're going through the
+      // list backward, but if one of the operands has several users in
+      // this cluster of to-be-promoted nodes, it is possible).
+      PromOpHandles.emplace_front(PromOp);
+      continue;
+    }
+
+    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+                                PromOp.getNode()->op_end());
+
+    // If there are any constant inputs, make sure they're replaced now.
+    for (unsigned i = 0; i < 2; ++i)
+      if (isa<ConstantSDNode>(Ops[C+i]))
+        Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
+
+    DAG.ReplaceAllUsesOfValueWith(PromOp,
+      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
+  }
+
+  // Now we're left with the initial truncation itself.
+  if (N->getOpcode() == ISD::TRUNCATE)
+    return N->getOperand(0);
+
+  // Otherwise, this is a comparison. The operands to be compared have just
+  // changed type (to i1), but everything else is the same.
+  return SDValue(N, 0);
+}
+
+SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   zext(binary-ops(trunc(x), trunc(y)))
+  // or
+  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
+  // such that we're unnecessarily moving things into CR bits that can more
+  // efficiently stay in GPRs. Note that if we're not certain that the high
+  // bits are set as required by the final extension, we still may need to do
+  // some masking to get the proper behavior.
+
+  // This same functionality is important on PPC64 when dealing with
+  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
+  // the return values of functions. Because it is so similar, it is handled
+  // here as well.
+
+  if (N->getValueType(0) != MVT::i32 &&
+      N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
+        (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::AND &&
+      N->getOperand(0).getOpcode() != ISD::OR  &&
+      N->getOperand(0).getOpcode() != ISD::XOR &&
+      N->getOperand(0).getOpcode() != ISD::SELECT &&
+      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
+    return SDValue();
+
+  SmallVector<SDValue, 4> Inputs;
+  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
+  SmallPtrSet<SDNode *, 16> Visited;
+
+  // Visit all inputs, collect all binary operations (and, or, xor and
+  // select) that are all fed by truncations.
+  while (!BinOps.empty()) {
+    SDValue BinOp = BinOps.back();
+    BinOps.pop_back();
+
+    if (!Visited.insert(BinOp.getNode()).second)
+      continue;
+
+    PromOps.push_back(BinOp);
+
+    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
+      // The condition of the select is not promoted.
+      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
+        continue;
+      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
+        continue;
+
+      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
+          isa<ConstantSDNode>(BinOp.getOperand(i))) {
+        Inputs.push_back(BinOp.getOperand(i));
+      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
+                 BinOp.getOperand(i).getOpcode() == ISD::OR  ||
+                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
+                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
+        BinOps.push_back(BinOp.getOperand(i));
+      } else {
+        // We have an input that is not a truncation or another binary
+        // operation; we'll abort this transformation.
+        return SDValue();
+      }
+    }
+  }
+
+  // The operands of a select that must be truncated when the select is
+  // promoted because the operand is actually part of the to-be-promoted set.
+  DenseMap<SDNode *, EVT> SelectTruncOp[2];
+
+  // Make sure that this is a self-contained cluster of operations (which
+  // is not quite the same thing as saying that everything has only one
+  // use).
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
+                              UE = Inputs[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == Inputs[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == Inputs[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == Inputs[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                    User->getOperand(1).getValueType()));
+      }
+    }
+  }
+
+  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
+    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
+                              UE = PromOps[i].getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = *UI;
+      if (User != N && !Visited.count(User))
+        return SDValue();
+
+      // If we're going to promote the non-output-value operand(s) or SELECT or
+      // SELECT_CC, record them for truncation.
+      if (User->getOpcode() == ISD::SELECT) {
+        if (User->getOperand(0) == PromOps[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+      } else if (User->getOpcode() == ISD::SELECT_CC) {
+        if (User->getOperand(0) == PromOps[i])
+          SelectTruncOp[0].insert(std::make_pair(User,
+                                    User->getOperand(0).getValueType()));
+        if (User->getOperand(1) == PromOps[i])
+          SelectTruncOp[1].insert(std::make_pair(User,
+                                    User->getOperand(1).getValueType()));
+      }
+    }
+  }
+
+  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
+  bool ReallyNeedsExt = false;
+  if (N->getOpcode() != ISD::ANY_EXTEND) {
+    // If all of the inputs are not already sign/zero extended, then
+    // we'll still need to do that at the end.
+    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+      if (isa<ConstantSDNode>(Inputs[i]))
+        continue;
+
+      unsigned OpBits =
+        Inputs[i].getOperand(0).getValueSizeInBits();
+      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
+
+      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
+           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
+                                  APInt::getHighBitsSet(OpBits,
+                                                        OpBits-PromBits))) ||
+          (N->getOpcode() == ISD::SIGN_EXTEND &&
+           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
+             (OpBits-(PromBits-1)))) {
+        ReallyNeedsExt = true;
+        break;
+      }
+    }
+  }
+
+  // Replace all inputs, either with the truncation operand, or a
+  // truncation or extension to the final output type.
+  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
+    // Constant inputs need to be replaced with the to-be-promoted nodes that
+    // use them because they might have users outside of the cluster of
+    // promoted nodes.
+    if (isa<ConstantSDNode>(Inputs[i]))
+      continue;
+
+    SDValue InSrc = Inputs[i].getOperand(0);
+    if (Inputs[i].getValueType() == N->getValueType(0))
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
+    else if (N->getOpcode() == ISD::SIGN_EXTEND)
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
+    else if (N->getOpcode() == ISD::ZERO_EXTEND)
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
+    else
+      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
+        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
+  }
+
+  std::list<HandleSDNode> PromOpHandles;
+  for (auto &PromOp : PromOps)
+    PromOpHandles.emplace_back(PromOp);
+
+  // Replace all operations (these are all the same, but have a different
+  // (promoted) return type). DAG.getNode will validate that the types of
+  // a binary operator match, so go through the list in reverse so that
+  // we've likely promoted both operands first.
+  while (!PromOpHandles.empty()) {
+    SDValue PromOp = PromOpHandles.back().getValue();
+    PromOpHandles.pop_back();
+
+    unsigned C;
+    switch (PromOp.getOpcode()) {
+    default:             C = 0; break;
+    case ISD::SELECT:    C = 1; break;
+    case ISD::SELECT_CC: C = 2; break;
+    }
+
+    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
+         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
+        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
+         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
+      // The to-be-promoted operands of this node have not yet been
+      // promoted (this should be rare because we're going through the
+      // list backward, but if one of the operands has several users in
+      // this cluster of to-be-promoted nodes, it is possible).
+      PromOpHandles.emplace_front(PromOp);
+      continue;
+    }
+
+    // For SELECT and SELECT_CC nodes, we do a similar check for any
+    // to-be-promoted comparison inputs.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
+           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
+          (SelectTruncOp[1].count(PromOp.getNode()) &&
+           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
+        PromOpHandles.emplace_front(PromOp);
+        continue;
+      }
+    }
+
+    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
+                                PromOp.getNode()->op_end());
+
+    // If this node has constant inputs, then they'll need to be promoted here.
+    for (unsigned i = 0; i < 2; ++i) {
+      if (!isa<ConstantSDNode>(Ops[C+i]))
+        continue;
+      if (Ops[C+i].getValueType() == N->getValueType(0))
+        continue;
+
+      if (N->getOpcode() == ISD::SIGN_EXTEND)
+        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else if (N->getOpcode() == ISD::ZERO_EXTEND)
+        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+      else
+        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
+    }
+
+    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
+    // truncate them again to the original value type.
+    if (PromOp.getOpcode() == ISD::SELECT ||
+        PromOp.getOpcode() == ISD::SELECT_CC) {
+      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
+      if (SI0 != SelectTruncOp[0].end())
+        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
+      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
+      if (SI1 != SelectTruncOp[1].end())
+        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
+    }
+
+    DAG.ReplaceAllUsesOfValueWith(PromOp,
+      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
+  }
+
+  // Now we're left with the initial extension itself.
+  if (!ReallyNeedsExt)
+    return N->getOperand(0);
+
+  // To zero extend, just mask off everything except for the first bit (in the
+  // i1 case).
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
+                       DAG.getConstant(APInt::getLowBitsSet(
+                                         N->getValueSizeInBits(0), PromBits),
+                                       dl, N->getValueType(0)));
+
+  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+         "Invalid extension type");
+  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
+  SDValue ShiftCst =
+      DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
+  return DAG.getNode(
+      ISD::SRA, dl, N->getValueType(0),
+      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
+      ShiftCst);
+}
+
+/// \brief Reduces the number of fp-to-int conversion when building a vector.
+///
+/// If this vector is built out of floating to integer conversions,
+/// transform it to a vector built out of floating point values followed by a
+/// single floating to integer conversion of the vector.
+/// Namely  (build_vector (fptosi $A), (fptosi $B), ...)
+/// becomes (fptosi (build_vector ($A, $B, ...)))
+SDValue PPCTargetLowering::
+combineElementTruncationToVectorTruncation(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  SDValue FirstInput = N->getOperand(0);
+  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
+         "The input operand must be an fp-to-int conversion.");
+
+  // This combine happens after legalization so the fp_to_[su]i nodes are
+  // already converted to PPCSISD nodes.
+  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
+  if (FirstConversion == PPCISD::FCTIDZ ||
+      FirstConversion == PPCISD::FCTIDUZ ||
+      FirstConversion == PPCISD::FCTIWZ ||
+      FirstConversion == PPCISD::FCTIWUZ) {
+    bool IsSplat = true;
+    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
+      FirstConversion == PPCISD::FCTIWUZ;
+    EVT SrcVT = FirstInput.getOperand(0).getValueType();
+    SmallVector<SDValue, 4> Ops;
+    EVT TargetVT = N->getValueType(0);
+    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
+      if (N->getOperand(i).getOpcode() != PPCISD::MFVSR)
+        return SDValue();
+      unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode();
+      if (NextConversion != FirstConversion)
+        return SDValue();
+      if (N->getOperand(i) != FirstInput)
+        IsSplat = false;
+    }
+
+    // If this is a splat, we leave it as-is since there will be only a single
+    // fp-to-int conversion followed by a splat of the integer. This is better
+    // for 32-bit and smaller ints and neutral for 64-bit ints.
+    if (IsSplat)
+      return SDValue();
+
+    // Now that we know we have the right type of node, get its operands
+    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
+      SDValue In = N->getOperand(i).getOperand(0);
+      // For 32-bit values, we need to add an FP_ROUND node.
+      if (Is32Bit) {
+        if (In.isUndef())
+          Ops.push_back(DAG.getUNDEF(SrcVT));
+        else {
+          SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
+                                      MVT::f32, In.getOperand(0),
+                                      DAG.getIntPtrConstant(1, dl));
+          Ops.push_back(Trunc);
+        }
+      } else
+        Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
+    }
+
+    unsigned Opcode;
+    if (FirstConversion == PPCISD::FCTIDZ ||
+        FirstConversion == PPCISD::FCTIWZ)
+      Opcode = ISD::FP_TO_SINT;
+    else
+      Opcode = ISD::FP_TO_UINT;
+
+    EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
+    SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
+    return DAG.getNode(Opcode, dl, TargetVT, BV);
+  }
+  return SDValue();
+}
+
+/// \brief Reduce the number of loads when building a vector.
+///
+/// Building a vector out of multiple loads can be converted to a load
+/// of the vector type if the loads are consecutive. If the loads are
+/// consecutive but in descending order, a shuffle is added at the end
+/// to reorder the vector.
+static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SDLoc dl(N);
+  bool InputsAreConsecutiveLoads = true;
+  bool InputsAreReverseConsecutive = true;
+  unsigned ElemSize = N->getValueType(0).getScalarSizeInBits() / 8;
+  SDValue FirstInput = N->getOperand(0);
+  bool IsRoundOfExtLoad = false;
+
+  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
+      FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
+    LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
+    IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
+  }
+  // Not a build vector of (possibly fp_rounded) loads.
+  if (!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
+    // If any inputs are fp_round(extload), they all must be.
+    if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
+      return SDValue();
+
+    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
+      N->getOperand(i);
+    if (NextInput.getOpcode() != ISD::LOAD)
+      return SDValue();
+
+    SDValue PreviousInput =
+      IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
+    LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
+    LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);
+
+    // If any inputs are fp_round(extload), they all must be.
+    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
+      return SDValue();
+
+    if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
+      InputsAreConsecutiveLoads = false;
+    if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
+      InputsAreReverseConsecutive = false;
+
+    // Exit early if the loads are neither consecutive nor reverse consecutive.
+    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
+      return SDValue();
+  }
+
+  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
+         "The loads cannot be both consecutive and reverse consecutive.");
+
+  SDValue FirstLoadOp =
+    IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
+  SDValue LastLoadOp =
+    IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
+                       N->getOperand(N->getNumOperands()-1);
+
+  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
+  LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
+  if (InputsAreConsecutiveLoads) {
+    assert(LD1 && "Input needs to be a LoadSDNode.");
+    return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
+                       LD1->getBasePtr(), LD1->getPointerInfo(),
+                       LD1->getAlignment());
+  }
+  if (InputsAreReverseConsecutive) {
+    assert(LDL && "Input needs to be a LoadSDNode.");
+    SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
+                               LDL->getBasePtr(), LDL->getPointerInfo(),
+                               LDL->getAlignment());
+    SmallVector<int, 16> Ops;
+    for (int i = N->getNumOperands() - 1; i >= 0; i--)
+      Ops.push_back(i);
+
+    return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
+                                DAG.getUNDEF(N->getValueType(0)), Ops);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         "Should be called with a BUILD_VECTOR node");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  if (!Subtarget.hasVSX())
+    return SDValue();
+
+  // The target independent DAG combiner will leave a build_vector of
+  // float-to-int conversions intact. We can generate MUCH better code for
+  // a float-to-int conversion of a vector of floats.
+  SDValue FirstInput = N->getOperand(0);
+  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
+    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
+    if (Reduced)
+      return Reduced;
+  }
+
+  // If we're building a vector out of consecutive loads, just load that
+  // vector type.
+  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
+  if (Reduced)
+    return Reduced;
+
+  if (N->getValueType(0) != MVT::v2f64)
+    return SDValue();
+
+  // Looking for:
+  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
+  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
+      FirstInput.getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
+      N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
+    return SDValue();
+  if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
+    return SDValue();
+
+  SDValue Ext1 = FirstInput.getOperand(0);
+  SDValue Ext2 = N->getOperand(1).getOperand(0);
+  if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+     Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
+  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
+  if (!Ext1Op || !Ext2Op)
+    return SDValue();
+  if (Ext1.getValueType() != MVT::i32 ||
+      Ext2.getValueType() != MVT::i32)
+  if (Ext1.getOperand(0) != Ext2.getOperand(0))
+    return SDValue();
+
+  int FirstElem = Ext1Op->getZExtValue();
+  int SecondElem = Ext2Op->getZExtValue();
+  int SubvecIdx;
+  if (FirstElem == 0 && SecondElem == 1)
+    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
+  else if (FirstElem == 2 && SecondElem == 3)
+    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
+  else
+    return SDValue();
+
+  SDValue SrcVec = Ext1.getOperand(0);
+  auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
+    PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
+  return DAG.getNode(NodeType, dl, MVT::v2f64,
+                     SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
+}
+
+SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::SINT_TO_FP ||
+          N->getOpcode() == ISD::UINT_TO_FP) &&
+         "Need an int -> FP conversion node here");
+
+  if (useSoftFloat() || !Subtarget.has64BitSupport())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Op(N, 0);
+
+  SDValue FirstOperand(Op.getOperand(0));
+  bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
+    (FirstOperand.getValueType() == MVT::i8 ||
+     FirstOperand.getValueType() == MVT::i16);
+  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
+    bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+    bool DstDouble = Op.getValueType() == MVT::f64;
+    unsigned ConvOp = Signed ?
+      (DstDouble ? PPCISD::FCFID  : PPCISD::FCFIDS) :
+      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
+    SDValue WidthConst =
+      DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
+                            dl, false);
+    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
+    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
+    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
+                                         DAG.getVTList(MVT::f64, MVT::Other),
+                                         Ops, MVT::i8, LDN->getMemOperand());
+
+    // For signed conversion, we need to sign-extend the value in the VSR
+    if (Signed) {
+      SDValue ExtOps[] = { Ld, WidthConst };
+      SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
+    } else
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
+  }
+
+  // Don't handle ppc_fp128 here or i1 conversions.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
+  // For i32 intermediate values, unfortunately, the conversion functions
+  // leave the upper 32 bits of the value are undefined. Within the set of
+  // scalar instructions, we have no method for zero- or sign-extending the
+  // value. Thus, we cannot handle i32 intermediate values here.
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
+                                                            : PPCISD::FCFIDS)
+                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
+                                                            : PPCISD::FCFID);
+  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
+                  ? MVT::f32
+                  : MVT::f64;
+
+  // If we're converting from a float, to an int, and back to a float again,
+  // then we don't need the store/load pair at all.
+  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
+       Subtarget.hasFPCVT()) ||
+      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
+    SDValue Src = Op.getOperand(0).getOperand(0);
+    if (Src.getValueType() == MVT::f32) {
+      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+      DCI.AddToWorklist(Src.getNode());
+    } else if (Src.getValueType() != MVT::f64) {
+      // Make sure that we don't pick up a ppc_fp128 source value.
+      return SDValue();
+    }
+
+    unsigned FCTOp =
+      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ;
+
+    SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
+
+    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
+      DCI.AddToWorklist(FP.getNode());
+    }
+
+    return FP;
+  }
+
+  return SDValue();
+}
+
+// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
+// builtins) into loads with swaps.
+SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX load");
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    Chain = LD->getChain();
+    Base = LD->getBasePtr();
+    MMO = LD->getMemOperand();
+    // If the MMO suggests this isn't a load of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Similarly to the store case below, Intrin->getBasePtr() doesn't get
+    // us what we want. Get operand 2 instead.
+    Base = Intrin->getOperand(2);
+    MMO = Intrin->getMemOperand();
+    break;
+  }
+  }
+
+  MVT VecTy = N->getValueType(0).getSimpleVT();
+  SDValue LoadOps[] = { Chain, Base };
+  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
+                                         DAG.getVTList(MVT::v2f64, MVT::Other),
+                                         LoadOps, MVT::v2f64, MMO);
+
+  DCI.AddToWorklist(Load.getNode());
+  Chain = Load.getValue(1);
+  SDValue Swap = DAG.getNode(
+      PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
+  DCI.AddToWorklist(Swap.getNode());
+
+  // Add a bitcast if the resulting load type doesn't match v2f64.
+  if (VecTy != MVT::v2f64) {
+    SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
+    DCI.AddToWorklist(N.getNode());
+    // Package {bitcast value, swap's chain} to match Load's shape.
+    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
+                       N, Swap.getValue(1));
+  }
+
+  return Swap;
+}
+
+// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
+// builtins) into stores with swaps.
+SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Chain;
+  SDValue Base;
+  unsigned SrcOpnd;
+  MachineMemOperand *MMO;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode for little endian VSX store");
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    Chain = ST->getChain();
+    Base = ST->getBasePtr();
+    MMO = ST->getMemOperand();
+    SrcOpnd = 1;
+    // If the MMO suggests this isn't a store of a full vector, leave
+    // things alone.  For a built-in, we have to make the change for
+    // correctness, so if there is a size problem that will be a bug.
+    if (MMO->getSize() < 16)
+      return SDValue();
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
+    Chain = Intrin->getChain();
+    // Intrin->getBasePtr() oddly does not get what we want.
+    Base = Intrin->getOperand(3);
+    MMO = Intrin->getMemOperand();
+    SrcOpnd = 2;
+    break;
+  }
+  }
+
+  SDValue Src = N->getOperand(SrcOpnd);
+  MVT VecTy = Src.getValueType().getSimpleVT();
+
+  // All stores are done as v2f64 and possible bit cast.
+  if (VecTy != MVT::v2f64) {
+    Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
+    DCI.AddToWorklist(Src.getNode());
+  }
+
+  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
+                             DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
+  DCI.AddToWorklist(Swap.getNode());
+  Chain = Swap.getValue(1);
+  SDValue StoreOps[] = { Chain, Swap, Base };
+  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
+                                          DAG.getVTList(MVT::Other),
+                                          StoreOps, VecTy, MMO);
+  DCI.AddToWorklist(Store.getNode());
+  return Store;
+}
+
+SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case PPCISD::SHL:
+    if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
+        return N->getOperand(0);
+    break;
+  case PPCISD::SRL:
+    if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
+        return N->getOperand(0);
+    break;
+  case PPCISD::SRA:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue() ||   //  0 >>s V -> 0.
+          C->isAllOnesValue())    // -1 >>s V -> -1.
+        return N->getOperand(0);
+    }
+    break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    return DAGCombineExtBoolTrunc(N, DCI);
+  case ISD::TRUNCATE:
+  case ISD::SETCC:
+  case ISD::SELECT_CC:
+    return DAGCombineTruncBoolExt(N, DCI);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return combineFPToIntToFP(N, DCI);
+  case ISD::STORE: {
+    EVT Op1VT = N->getOperand(1).getValueType();
+    bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) ||
+      (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16));
+
+    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+    if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
+        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+        ValidTypeForStoreFltAsInt &&
+        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
+      SDValue Val = N->getOperand(1).getOperand(0);
+      if (Val.getValueType() == MVT::f32) {
+        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+        DCI.AddToWorklist(Val.getNode());
+      }
+      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
+      DCI.AddToWorklist(Val.getNode());
+
+      if (Op1VT == MVT::i32) {
+        SDValue Ops[] = {
+          N->getOperand(0), Val, N->getOperand(2),
+          DAG.getValueType(N->getOperand(1).getValueType())
+        };
+
+        Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+                DAG.getVTList(MVT::Other), Ops,
+                cast<StoreSDNode>(N)->getMemoryVT(),
+                cast<StoreSDNode>(N)->getMemOperand());
+      } else {
+        unsigned WidthInBytes =
+          N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
+        SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);
+
+        SDValue Ops[] = {
+          N->getOperand(0), Val, N->getOperand(2), WidthConst,
+          DAG.getValueType(N->getOperand(1).getValueType())
+        };
+        Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      cast<StoreSDNode>(N)->getMemoryVT(),
+                                      cast<StoreSDNode>(N)->getMemOperand());
+      }
+
+      DCI.AddToWorklist(Val.getNode());
+      return Val;
+    }
+
+    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+    if (cast<StoreSDNode>(N)->isUnindexed() &&
+        N->getOperand(1).getOpcode() == ISD::BSWAP &&
+        N->getOperand(1).getNode()->hasOneUse() &&
+        (N->getOperand(1).getValueType() == MVT::i32 ||
+         N->getOperand(1).getValueType() == MVT::i16 ||
+         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
+          N->getOperand(1).getValueType() == MVT::i64))) {
+      SDValue BSwapOp = N->getOperand(1).getOperand(0);
+      // Do an any-extend to 32-bits if this is a half-word input.
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+      SDValue Ops[] = {
+        N->getOperand(0), BSwapOp, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+      return
+        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
+                                cast<StoreSDNode>(N)->getMemOperand());
+    }
+
+    // For little endian, VSX stores require generating xxswapd/lxvd2x.
+    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
+    EVT VT = N->getOperand(1).getValueType();
+    if (VT.isSimple()) {
+      MVT StoreVT = VT.getSimpleVT();
+      if (Subtarget.needsSwapsForVSXMemOps() &&
+          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
+           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
+        return expandVSXStoreForLE(N, DCI);
+    }
+    break;
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT VT = LD->getValueType(0);
+
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+    if (VT.isSimple()) {
+      MVT LoadVT = VT.getSimpleVT();
+      if (Subtarget.needsSwapsForVSXMemOps() &&
+          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
+           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
+        return expandVSXLoadForLE(N, DCI);
+    }
+
+    // We sometimes end up with a 64-bit integer load, from which we extract
+    // two single-precision floating-point numbers. This happens with
+    // std::complex<float>, and other similar structures, because of the way we
+    // canonicalize structure copies. However, if we lack direct moves,
+    // then the final bitcasts from the extracted integer values to the
+    // floating-point numbers turn into store/load pairs. Even with direct moves,
+    // just loading the two floating-point numbers is likely better.
+    auto ReplaceTwoFloatLoad = [&]() {
+      if (VT != MVT::i64)
+        return false;
+
+      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+          LD->isVolatile())
+        return false;
+
+      //  We're looking for a sequence like this:
+      //  t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+      //      t16: i64 = srl t13, Constant:i32<32>
+      //    t17: i32 = truncate t16
+      //  t18: f32 = bitcast t17
+      //    t19: i32 = truncate t13
+      //  t20: f32 = bitcast t19
+
+      if (!LD->hasNUsesOfValue(2, 0))
+        return false;
+
+      auto UI = LD->use_begin();
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *Trunc = *UI++;
+      while (UI.getUse().getResNo() != 0) ++UI;
+      SDNode *RightShift = *UI;
+      if (Trunc->getOpcode() != ISD::TRUNCATE)
+        std::swap(Trunc, RightShift);
+
+      if (Trunc->getOpcode() != ISD::TRUNCATE ||
+          Trunc->getValueType(0) != MVT::i32 ||
+          !Trunc->hasOneUse())
+        return false;
+      if (RightShift->getOpcode() != ISD::SRL ||
+          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+          RightShift->getConstantOperandVal(1) != 32 ||
+          !RightShift->hasOneUse())
+        return false;
+
+      SDNode *Trunc2 = *RightShift->use_begin();
+      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+          Trunc2->getValueType(0) != MVT::i32 ||
+          !Trunc2->hasOneUse())
+        return false;
+
+      SDNode *Bitcast = *Trunc->use_begin();
+      SDNode *Bitcast2 = *Trunc2->use_begin();
+
+      if (Bitcast->getOpcode() != ISD::BITCAST ||
+          Bitcast->getValueType(0) != MVT::f32)
+        return false;
+      if (Bitcast2->getOpcode() != ISD::BITCAST ||
+          Bitcast2->getValueType(0) != MVT::f32)
+        return false;
+
+      if (Subtarget.isLittleEndian())
+        std::swap(Bitcast, Bitcast2);
+
+      // Bitcast has the second float (in memory-layout order) and Bitcast2
+      // has the first one.
+
+      SDValue BasePtr = LD->getBasePtr();
+      if (LD->isIndexed()) {
+        assert(LD->getAddressingMode() == ISD::PRE_INC &&
+               "Non-pre-inc AM on PPC?");
+        BasePtr =
+          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                      LD->getOffset());
+      }
+
+      auto MMOFlags =
+          LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
+      SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+                                      LD->getPointerInfo(), LD->getAlignment(),
+                                      MMOFlags, LD->getAAInfo());
+      SDValue AddPtr =
+        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                    BasePtr, DAG.getIntPtrConstant(4, dl));
+      SDValue FloatLoad2 = DAG.getLoad(
+          MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+          LD->getPointerInfo().getWithOffset(4),
+          MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());
+
+      if (LD->isIndexed()) {
+        // Note that DAGCombine should re-form any pre-increment load(s) from
+        // what is produced here if that makes sense.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+      }
+
+      DCI.CombineTo(Bitcast2, FloatLoad);
+      DCI.CombineTo(Bitcast, FloatLoad2);
+
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+                                    SDValue(FloatLoad2.getNode(), 1));
+      return true;
+    };
+
+    if (ReplaceTwoFloatLoad())
+      return SDValue(N, 0);
+
+    EVT MemVT = LD->getMemoryVT();
+    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
+    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
+    unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
+    if (LD->isUnindexed() && VT.isVector() &&
+        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
+          // P8 and later hardware should just use LOAD.
+          !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+                                       VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
+          LD->getAlignment() >= ScalarABIAlignment)) &&
+        LD->getAlignment() < ABIAlignment) {
+      // This is a type-legal unaligned Altivec or QPX load.
+      SDValue Chain = LD->getChain();
+      SDValue Ptr = LD->getBasePtr();
+      bool isLittleEndian = Subtarget.isLittleEndian();
+
+      // This implements the loading of unaligned vectors as described in
+      // the venerable Apple Velocity Engine overview. Specifically:
+      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+      //
+      // The general idea is to expand a sequence of one or more unaligned
+      // loads into an alignment-based permutation-control instruction (lvsl
+      // or lvsr), a series of regular vector loads (which always truncate
+      // their input address to an aligned address), and a series of
+      // permutations.  The results of these permutations are the requested
+      // loaded values.  The trick is that the last "extra" load is not taken
+      // from the address you might suspect (sizeof(vector) bytes after the
+      // last requested load), but rather sizeof(vector) - 1 bytes after the
+      // last requested vector. The point of this is to avoid a page fault if
+      // the base address happened to be aligned. This works because if the
+      // base address is aligned, then adding less than a full vector length
+      // will cause the last vector in the sequence to be (re)loaded.
+      // Otherwise, the next vector will be fetched as you might suspect was
+      // necessary.
+
+      // We might be able to reuse the permutation generation from
+      // a different base address offset from this one by an aligned amount.
+      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
+      // optimization later.
+      Intrinsic::ID Intr, IntrLD, IntrPerm;
+      MVT PermCntlTy, PermTy, LDTy;
+      if (Subtarget.hasAltivec()) {
+        Intr = isLittleEndian ?  Intrinsic::ppc_altivec_lvsr :
+                                 Intrinsic::ppc_altivec_lvsl;
+        IntrLD = Intrinsic::ppc_altivec_lvx;
+        IntrPerm = Intrinsic::ppc_altivec_vperm;
+        PermCntlTy = MVT::v16i8;
+        PermTy = MVT::v4i32;
+        LDTy = MVT::v4i32;
+      } else {
+        Intr =   MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
+                                       Intrinsic::ppc_qpx_qvlpcls;
+        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
+                                       Intrinsic::ppc_qpx_qvlfs;
+        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
+        PermCntlTy = MVT::v4f64;
+        PermTy = MVT::v4f64;
+        LDTy = MemVT.getSimpleVT();
+      }
+
+      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
+
+      // Create the new MMO for the new base load. It is like the original MMO,
+      // but represents an area in memory almost twice the vector size centered
+      // on the original address. If the address is unaligned, we might start
+      // reading up to (sizeof(vector)-1) bytes below the address of the
+      // original unaligned load.
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineMemOperand *BaseMMO =
+        MF.getMachineMemOperand(LD->getMemOperand(),
+                                -(long)MemVT.getStoreSize()+1,
+                                2*MemVT.getStoreSize()-1);
+
+      // Create the new base load.
+      SDValue LDXIntID =
+          DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
+      SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+      SDValue BaseLoad =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                                DAG.getVTList(PermTy, MVT::Other),
+                                BaseLoadOps, LDTy, BaseMMO);
+
+      // Note that the value of IncOffset (which is provided to the next
+      // load's pointer info offset value, and thus used to calculate the
+      // alignment), and the value of IncValue (which is actually used to
+      // increment the pointer value) are different! This is because we
+      // require the next load to appear to be aligned, even though it
+      // is actually offset from the base pointer by a lesser amount.
+      int IncOffset = VT.getSizeInBits() / 8;
+      int IncValue = IncOffset;
+
+      // Walk (both up and down) the chain looking for another load at the real
+      // (aligned) offset (the alignment of the other load does not matter in
+      // this case). If found, then do not use the offset reduction trick, as
+      // that will prevent the loads from being later combined (as they would
+      // otherwise be duplicates).
+      if (!findConsecutiveLoad(LD, DAG))
+        --IncValue;
+
+      SDValue Increment =
+          DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+      MachineMemOperand *ExtraMMO =
+        MF.getMachineMemOperand(LD->getMemOperand(),
+                                1, 2*MemVT.getStoreSize()-1);
+      SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
+      SDValue ExtraLoad =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                                DAG.getVTList(PermTy, MVT::Other),
+                                ExtraLoadOps, LDTy, ExtraMMO);
+
+      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+        BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+      // Because vperm has a big-endian bias, we must reverse the order
+      // of the input vectors and complement the permute control vector
+      // when generating little endian code.  We have already handled the
+      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
+      // and ExtraLoad here.
+      SDValue Perm;
+      if (isLittleEndian)
+        Perm = BuildIntrinsicOp(IntrPerm,
+                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
+      else
+        Perm = BuildIntrinsicOp(IntrPerm,
+                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+      if (VT != PermTy)
+        Perm = Subtarget.hasAltivec() ?
+                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
+                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
+                               DAG.getTargetConstant(1, dl, MVT::i64));
+                               // second argument is 1 because this rounding
+                               // is always exact.
+
+      // The output of the permutation is our loaded result, the TokenFactor is
+      // our new chain.
+      DCI.CombineTo(N, Perm, TF);
+      return SDValue(N, 0);
+    }
+    }
+    break;
+    case ISD::INTRINSIC_WO_CHAIN: {
+      bool isLittleEndian = Subtarget.isLittleEndian();
+      unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+      Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
+                                           : Intrinsic::ppc_altivec_lvsl);
+      if ((IID == Intr ||
+           IID == Intrinsic::ppc_qpx_qvlpcld  ||
+           IID == Intrinsic::ppc_qpx_qvlpcls) &&
+        N->getOperand(1)->getOpcode() == ISD::ADD) {
+        SDValue Add = N->getOperand(1);
+
+        int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
+                   5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;
+
+        if (DAG.MaskedValueIsZero(Add->getOperand(1),
+                                  APInt::getAllOnesValue(Bits /* alignment */)
+                                      .zext(Add.getScalarValueSizeInBits()))) {
+          SDNode *BasePtr = Add->getOperand(0).getNode();
+          for (SDNode::use_iterator UI = BasePtr->use_begin(),
+                                    UE = BasePtr->use_end();
+               UI != UE; ++UI) {
+            if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+                cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
+              // We've found another LVSL/LVSR, and this address is an aligned
+              // multiple of that one. The results will be the same, so use the
+              // one we've just found instead.
+
+              return SDValue(*UI, 0);
+            }
+          }
+        }
+
+        if (isa<ConstantSDNode>(Add->getOperand(1))) {
+          SDNode *BasePtr = Add->getOperand(0).getNode();
+          for (SDNode::use_iterator UI = BasePtr->use_begin(),
+               UE = BasePtr->use_end(); UI != UE; ++UI) {
+            if (UI->getOpcode() == ISD::ADD &&
+                isa<ConstantSDNode>(UI->getOperand(1)) &&
+                (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
+                 cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
+                (1ULL << Bits) == 0) {
+              SDNode *OtherAdd = *UI;
+              for (SDNode::use_iterator VI = OtherAdd->use_begin(),
+                   VE = OtherAdd->use_end(); VI != VE; ++VI) {
+                if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+                    cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
+                  return SDValue(*VI, 0);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    // For little endian, VSX loads require generating lxvd2x/xxswapd.
+    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+    if (Subtarget.needsSwapsForVSXMemOps()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_lxvw4x:
+      case Intrinsic::ppc_vsx_lxvd2x:
+        return expandVSXLoadForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    // For little endian, VSX stores require generating xxswapd/stxvd2x.
+    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
+    if (Subtarget.needsSwapsForVSXMemOps()) {
+      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+      default:
+        break;
+      case Intrinsic::ppc_vsx_stxvw4x:
+      case Intrinsic::ppc_vsx_stxvd2x:
+        return expandVSXStoreForLE(N, DCI);
+      }
+    }
+    break;
+  }
+  case ISD::BSWAP:
+    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+        N->getOperand(0).hasOneUse() &&
+        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
+         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
+          N->getValueType(0) == MVT::i64))) {
+      SDValue Load = N->getOperand(0);
+      LoadSDNode *LD = cast<LoadSDNode>(Load);
+      // Create the byte-swapping load.
+      SDValue Ops[] = {
+        LD->getChain(),    // Chain
+        LD->getBasePtr(),  // Ptr
+        DAG.getValueType(N->getValueType(0)) // VT
+      };
+      SDValue BSLoad =
+        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
+                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                              MVT::i64 : MVT::i32, MVT::Other),
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+      // If this is an i16 load, insert the truncate.
+      SDValue ResVal = BSLoad;
+      if (N->getValueType(0) == MVT::i16)
+        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
+
+      // First, combine the bswap away.  This makes the value produced by the
+      // load dead.
+      DCI.CombineTo(N, ResVal);
+
+      // Next, combine the load away, we give it a bogus result value but a real
+      // chain result.  The result value is dead because the bswap is dead.
+      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+      // Return N so it doesn't get rechecked!
+      return SDValue(N, 0);
+    }
+
+    break;
+  case PPCISD::VCMP: {
+    // If a VCMPo node already exists with exactly the same operands as this
+    // node, use its result instead of this node (VCMPo computes both a CR6 and
+    // a normal output).
+    //
+    if (!N->getOperand(0).hasOneUse() &&
+        !N->getOperand(1).hasOneUse() &&
+        !N->getOperand(2).hasOneUse()) {
+
+      // Scan all of the users of the LHS, looking for VCMPo's that match.
+      SDNode *VCMPoNode = nullptr;
+
+      SDNode *LHSN = N->getOperand(0).getNode();
+      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+           UI != E; ++UI)
+        if (UI->getOpcode() == PPCISD::VCMPo &&
+            UI->getOperand(1) == N->getOperand(1) &&
+            UI->getOperand(2) == N->getOperand(2) &&
+            UI->getOperand(0) == N->getOperand(0)) {
+          VCMPoNode = *UI;
+          break;
+        }
+
+      // If there is no VCMPo node, or if the flag value has a single use, don't
+      // transform this.
+      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+        break;
+
+      // Look at the (necessarily single) use of the flag value.  If it has a
+      // chain, this transformation is more complex.  Note that multiple things
+      // could use the value result, which we should ignore.
+      SDNode *FlagUser = nullptr;
+      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+           FlagUser == nullptr; ++UI) {
+        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+        SDNode *User = *UI;
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+            FlagUser = User;
+            break;
+          }
+        }
+      }
+
+      // If the user is a MFOCRF instruction, we know this is safe.
+      // Otherwise we give up for right now.
+      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
+        return SDValue(VCMPoNode, 0);
+    }
+    break;
+  }
+  case ISD::BRCOND: {
+    SDValue Cond = N->getOperand(1);
+    SDValue Target = N->getOperand(2);
+
+    if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero) {
+
+      // We now need to make the intrinsic dead (it cannot be instruction
+      // selected).
+      DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
+      assert(Cond.getNode()->hasOneUse() &&
+             "Counter decrement has more than one use");
+
+      return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
+                         N->getOperand(0), Target);
+    }
+  }
+  break;
+  case ISD::BR_CC: {
+    // If this is a branch on an altivec predicate comparison, lower this so
+    // that we don't have to do a MFOCRF: instead, branch directly on CR6.  This
+    // lowering is done pre-legalize, because the legalizer lowers the predicate
+    // compare down to code that is difficult to reassemble.
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+
+    // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
+    // value. If so, pass-through the AND to get to the intrinsic.
+    if (LHS.getOpcode() == ISD::AND &&
+        LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(LHS.getOperand(1)) &&
+        !isNullConstant(LHS.getOperand(1)))
+      LHS = LHS.getOperand(0);
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(RHS)) {
+      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+             "Counter decrement comparison is not EQ or NE");
+
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      bool isBDNZ = (CC == ISD::SETEQ && Val) ||
+                    (CC == ISD::SETNE && !Val);
+
+      // We now need to make the intrinsic dead (it cannot be instruction
+      // selected).
+      DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
+      assert(LHS.getNode()->hasOneUse() &&
+             "Counter decrement has more than one use");
+
+      return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
+                         N->getOperand(0), N->getOperand(4));
+    }
+
+    int CompareOpc;
+    bool isDot;
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+        getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
+      assert(isDot && "Can't compare against a vector result!");
+
+      // If this is a comparison against something other than 0/1, then we know
+      // that the condition is never/always true.
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      if (Val != 0 && Val != 1) {
+        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
+          return N->getOperand(0);
+        // Always !=, turn it into an unconditional branch.
+        return DAG.getNode(ISD::BR, dl, MVT::Other,
+                           N->getOperand(0), N->getOperand(4));
+      }
+
+      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+
+      // Create the PPCISD altivec 'dot' comparison node.
+      SDValue Ops[] = {
+        LHS.getOperand(2),  // LHS of compare
+        LHS.getOperand(3),  // RHS of compare
+        DAG.getConstant(CompareOpc, dl, MVT::i32)
+      };
+      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
+      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
+
+      // Unpack the result based on how the target uses it.
+      PPC::Predicate CompOpc;
+      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+      default:  // Can't happen, don't crash on invalid number though.
+      case 0:   // Branch on the value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+        break;
+      case 1:   // Branch on the inverted value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+        break;
+      case 2:   // Branch on the value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+        break;
+      case 3:   // Branch on the inverted value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+        break;
+      }
+
+      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
+                         DAG.getConstant(CompOpc, dl, MVT::i32),
+                         DAG.getRegister(PPC::CR6, MVT::i32),
+                         N->getOperand(4), CompNode.getValue(1));
+    }
+    break;
+  }
+  case ISD::BUILD_VECTOR:
+    return DAGCombineBuildVector(N, DCI);
+  }
+
+  return SDValue();
+}
+
+SDValue
+PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                  SelectionDAG &DAG,
+                                  std::vector<SDNode *> *Created) const {
+  // fold (sdiv X, pow2)
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i64 && !Subtarget.isPPC64())
+    return SDValue();
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+
+  bool IsNegPow2 = (-Divisor).isPowerOf2();
+  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
+  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);
+
+  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
+  if (Created)
+    Created->push_back(Op.getNode());
+
+  if (IsNegPow2) {
+    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+    if (Created)
+      Created->push_back(Op.getNode());
+  }
+
+  return Op;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case PPCISD::LBRX: {
+    // lhbrx is known to have the top bits cleared out.
+    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
+      KnownZero = 0xFFFF0000;
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    default: break;
+    case Intrinsic::ppc_altivec_vcmpbfp_p:
+    case Intrinsic::ppc_altivec_vcmpeqfp_p:
+    case Intrinsic::ppc_altivec_vcmpequb_p:
+    case Intrinsic::ppc_altivec_vcmpequh_p:
+    case Intrinsic::ppc_altivec_vcmpequw_p:
+    case Intrinsic::ppc_altivec_vcmpequd_p:
+    case Intrinsic::ppc_altivec_vcmpgefp_p:
+    case Intrinsic::ppc_altivec_vcmpgtfp_p:
+    case Intrinsic::ppc_altivec_vcmpgtsb_p:
+    case Intrinsic::ppc_altivec_vcmpgtsh_p:
+    case Intrinsic::ppc_altivec_vcmpgtsw_p:
+    case Intrinsic::ppc_altivec_vcmpgtsd_p:
+    case Intrinsic::ppc_altivec_vcmpgtub_p:
+    case Intrinsic::ppc_altivec_vcmpgtuh_p:
+    case Intrinsic::ppc_altivec_vcmpgtuw_p:
+    case Intrinsic::ppc_altivec_vcmpgtud_p:
+      KnownZero = ~1U;  // All bits but the low one are known to be zero.
+      break;
+    }
+  }
+  }
+}
+
+unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+  switch (Subtarget.getDarwinDirective()) {
+  default: break;
+  case PPC::DIR_970:
+  case PPC::DIR_PWR4:
+  case PPC::DIR_PWR5:
+  case PPC::DIR_PWR5X:
+  case PPC::DIR_PWR6:
+  case PPC::DIR_PWR6X:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  case PPC::DIR_PWR9: {
+    if (!ML)
+      break;
+
+    const PPCInstrInfo *TII = Subtarget.getInstrInfo();
+
+    // For small loops (between 5 and 8 instructions), align to a 32-byte
+    // boundary so that the entire loop fits in one instruction-cache line.
+    uint64_t LoopSize = 0;
+    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
+      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
+        LoopSize += TII->getInstSizeInBytes(*J);
+        if (LoopSize > 32)
+          break;
+      }
+
+    if (LoopSize > 16 && LoopSize <= 32)
+      return 5;
+
+    break;
+  }
+  }
+
+  return TargetLowering::getPrefLoopAlignment(ML);
+}
+
+/// getConstraintType - Given a constraint, return the type of
+/// constraint it is for this target.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'd':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    case 'Z':
+      // FIXME: While Z does indicate a memory constraint, it specifically
+      // indicates an r+r address (used in conjunction with the 'y' modifier
+      // in the replacement string). Currently, we're forcing the base
+      // register to be r0 in the asm printer (which is interpreted as zero)
+      // and forming the complete address in the second register. This is
+      // suboptimal.
+      return C_Memory;
+    }
+  } else if (Constraint == "wc") { // individual CR bits.
+    return C_RegisterClass;
+  } else if (Constraint == "wa" || Constraint == "wd" ||
+             Constraint == "wf" || Constraint == "ws") {
+    return C_RegisterClass; // VSX registers.
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+PPCTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+
+  // Look at the constraint type.
+  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
+    return CW_Register; // an individual CR bit.
+  else if ((StringRef(constraint) == "wa" ||
+            StringRef(constraint) == "wd" ||
+            StringRef(constraint) == "wf") &&
+           type->isVectorTy())
+    return CW_Register;
+  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
+    return CW_Register;
+
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'b':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f':
+    if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'd':
+    if (type->isDoubleTy())
+      weight = CW_Register;
+    break;
+  case 'v':
+    if (type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'y':
+    weight = CW_Register;
+    break;
+  case 'Z':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                StringRef Constraint,
+                                                MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
+      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
+    case 'r':   // R0-R31
+      if (VT == MVT::i64 && Subtarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
+    // 'd' and 'f' constraints are both defined to be "the floating point
+    // registers", where one is for 32-bit and the other for 64-bit. We don't
+    // really care overly much here so just give them all the same reg classes.
+    case 'd':
+    case 'f':
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
+      break;
+    case 'v':
+      if (VT == MVT::v4f64 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QFRCRegClass);
+      if (VT == MVT::v4f32 && Subtarget.hasQPX())
+        return std::make_pair(0U, &PPC::QSRCRegClass);
+      if (Subtarget.hasAltivec())
+        return std::make_pair(0U, &PPC::VRRCRegClass);
+    case 'y':   // crrc
+      return std::make_pair(0U, &PPC::CRRCRegClass);
+    }
+  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
+    // An individual CR bit.
+    return std::make_pair(0U, &PPC::CRBITRCRegClass);
+  } else if ((Constraint == "wa" || Constraint == "wd" ||
+             Constraint == "wf") && Subtarget.hasVSX()) {
+    return std::make_pair(0U, &PPC::VSRCRegClass);
+  } else if (Constraint == "ws" && Subtarget.hasVSX()) {
+    if (VT == MVT::f32 && Subtarget.hasP8Vector())
+      return std::make_pair(0U, &PPC::VSSRCRegClass);
+    else
+      return std::make_pair(0U, &PPC::VSFRCRegClass);
+  }
+
+  std::pair<unsigned, const TargetRegisterClass *> R =
+      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
+  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
+  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
+  // register.
+  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
+  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
+  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
+      PPC::GPRCRegClass.contains(R.first))
+    return std::make_pair(TRI->getMatchingSuperReg(R.first,
+                            PPC::sub_32, &PPC::G8RCRegClass),
+                          &PPC::G8RCRegClass);
+
+  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
+  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
+    R.first = PPC::CR0;
+    R.second = &PPC::CRRCRegClass;
+  }
+
+  return R;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    SDLoc dl(Op);
+    int64_t Value = CST->getSExtValue();
+    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
+                         // numbers are printed as such.
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
+      if (isInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+      if (isShiftedUInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      if (isShiftedInt<16, 16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      if (isUInt<16>(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      if (Value > 0 && isPowerOf2_64(Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'O':  // "O" is the constant zero.
+      if (Value == 0)
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      if (isInt<16>(-Value))
+        Result = DAG.getTargetConstant(Value, dl, TCVT);
+      break;
+    }
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  // PPC does not allow r+i addressing modes for vectors!
+  if (Ty->isVectorTy() && AM.BaseOffs != 0)
+    return false;
+
+  // PPC allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // PPC only support r+r,
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
+    return false;
+  }
+
+  return true;
+}
+
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // Make sure the function does not optimize away the store of the RA to
+  // the stack.
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setLRStoreRequired();
+  bool isPPC64 = Subtarget.isPPC64();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
+                        isPPC64 ? MVT::i64 : MVT::i32);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Just load the return address off the stack.
+  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+                     MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+  bool isPPC64 = PtrVT == MVT::i64;
+
+  // Naked functions never have a frame pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned FrameReg;
+  if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
+  else
+    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
+
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
+                                         PtrVT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+                            FrameAddr, MachinePointerInfo());
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI = Subtarget.isDarwinABI();
+
+  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
+      (!isPPC64 && VT != MVT::i32))
+    report_fatal_error("Invalid register global variable type");
+
+  bool is64Bit = isPPC64 && VT == MVT::i64;
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+                   .Case("r2", (isDarwinABI || isPPC64) ? 0 : PPC::R2)
+                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+                                  (is64Bit ? PPC::X13 : PPC::R13))
+                   .Default(0);
+
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The PowerPC target isn't yet aware of offsets.
+  return false;
+}
+
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  switch (Intrinsic) {
+  case Intrinsic::ppc_qpx_qvlfd:
+  case Intrinsic::ppc_qpx_qvlfs:
+  case Intrinsic::ppc_qpx_qvlfcd:
+  case Intrinsic::ppc_qpx_qvlfcs:
+  case Intrinsic::ppc_qpx_qvlfiwa:
+  case Intrinsic::ppc_qpx_qvlfiwz:
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::ppc_altivec_lvebx:
+  case Intrinsic::ppc_altivec_lvehx:
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfd:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfs:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcd:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcs:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_qpx_qvlfda:
+  case Intrinsic::ppc_qpx_qvlfsa:
+  case Intrinsic::ppc_qpx_qvlfcda:
+  case Intrinsic::ppc_qpx_qvlfcsa:
+  case Intrinsic::ppc_qpx_qvlfiwaa:
+  case Intrinsic::ppc_qpx_qvlfiwza: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_qpx_qvlfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvlfcsa:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.size = VT.getStoreSize();
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_qpx_qvstfd:
+  case Intrinsic::ppc_qpx_qvstfs:
+  case Intrinsic::ppc_qpx_qvstfcd:
+  case Intrinsic::ppc_qpx_qvstfcs:
+  case Intrinsic::ppc_qpx_qvstfiw:
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+  case Intrinsic::ppc_altivec_stvebx:
+  case Intrinsic::ppc_altivec_stvehx:
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfd:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfs:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcd:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcs:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::ppc_qpx_qvstfda:
+  case Intrinsic::ppc_qpx_qvstfsa:
+  case Intrinsic::ppc_qpx_qvstfcda:
+  case Intrinsic::ppc_qpx_qvstfcsa:
+  case Intrinsic::ppc_qpx_qvstfiwa: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_qpx_qvstfda:
+      VT = MVT::v4f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfsa:
+      VT = MVT::v4f32;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcda:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_qpx_qvstfcsa:
+      VT = MVT::v2f32;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.size = VT.getStoreSize();
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
+    const Function *F = MF.getFunction();
+    // When expanding a memset, require at least two QPX instructions to cover
+    // the cost of loading the value to be stored from the constant pool.
+    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
+       (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+        !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+      return MVT::v4f64;
+    }
+
+    // We should use Altivec/VSX loads and stores when available. For unaligned
+    // addresses, unaligned VSX loads are only fast starting with the P8.
+    if (Subtarget.hasAltivec() && Size >= 16 &&
+        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
+         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+      return MVT::v4i32;
+  }
+
+  if (Subtarget.isPPC64()) {
+    return MVT::i64;
+  }
+
+  return MVT::i32;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  return !(BitSize == 0 || BitSize > 64);
+}
+
+bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  // Generally speaking, zexts are not free, but they are free when they can be
+  // folded with other operations.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
+    EVT MemVT = LD->getMemoryVT();
+    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
+         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
+        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
+         LD->getExtensionType() == ISD::ZEXTLOAD))
+      return true;
+  }
+
+  // FIXME: Add other cases...
+  //  - 32-bit shifts with a zext to i64
+  //  - zext after ctlz, bswap, etc.
+  //  - zext after and by a constant mask
+
+  return TargetLowering::isZExtFree(Val, VT2);
+}
+
+bool PPCTargetLowering::isFPExtFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return true;
+}
+
+bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
+bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
+  if (DisablePPCUnaligned)
+    return false;
+
+  // PowerPC supports unaligned memory access for simple non-vector types.
+  // Although accessing unaligned addresses is not as efficient as accessing
+  // aligned addresses, it is generally more efficient than manual expansion,
+  // and generally only traps for software emulation when crossing page
+  // boundaries.
+
+  if (!VT.isSimple())
+    return false;
+
+  if (VT.getSimpleVT().isVector()) {
+    if (Subtarget.hasVSX()) {
+      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+          VT != MVT::v4f32 && VT != MVT::v4i32)
+        return false;
+    } else {
+      return false;
+    }
+  }
+
+  if (VT == MVT::ppcf128)
+    return false;
+
+  if (Fast)
+    *Fast = true;
+
+  return true;
+}
+
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+const MCPhysReg *
+PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
+  // to CTR, which is used by any indirect call.
+  static const MCPhysReg ScratchRegs[] = {
+    PPC::X12, PPC::LR8, PPC::CTR8, 0
+  };
+
+  return ScratchRegs;
+}
+
+unsigned PPCTargetLowering::getExceptionPointerRegister(
+    const Constant *PersonalityFn) const {
+  return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
+}
+
+unsigned PPCTargetLowering::getExceptionSelectorRegister(
+    const Constant *PersonalityFn) const {
+  return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
+}
+
+bool
+PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
+                     EVT VT , unsigned DefinedValues) const {
+  if (VT == MVT::v2i64)
+    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
+
+  if (Subtarget.hasVSX() || Subtarget.hasQPX())
+    return true;
+
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+  if (DisableILPPref || Subtarget.enableMachineScheduler())
+    return TargetLowering::getSchedulingPreference(N);
+
+  return Sched::ILP;
+}
+
+// Create a fast isel object.
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                  const TargetLibraryInfo *LibInfo) const {
+  return PPC::createFastISel(FuncInfo, LibInfo);
+}
+
+void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  if (Subtarget.isDarwinABI()) return;
+  if (!Subtarget.isPPC64()) return;
+
+  // Update IsSplitCSR in PPCFunctionInfo
+  PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
+  PFI->setIsSplitCSR(true);
+}
+
+void PPCTargetLowering::insertCopiesSplitCSR(
+  MachineBasicBlock *Entry,
+  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (PPC::G8RCRegClass.contains(*I))
+      RC = &PPC::G8RCRegClass;
+    else if (PPC::F8RCRegClass.contains(*I))
+      RC = &PPC::F8RCRegClass;
+    else if (PPC::CRRCRegClass.contains(*I))
+      RC = &PPC::CRRCRegClass;
+    else if (PPC::VRRCRegClass.contains(*I))
+      RC = &PPC::VRRCRegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+             Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+      .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+        .addReg(NewVR);
+  }
+}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux.
+bool PPCTargetLowering::useLoadStackGuardNode() const {
+  if (!Subtarget.isTargetLinux())
+    return TargetLowering::useLoadStackGuardNode();
+  return true;
+}
+
+// Override to disable global variable loading on Linux.
+void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
+  if (!Subtarget.isTargetLinux())
+    return TargetLowering::insertSSPDeclarations(M);
+}
+
+bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+
+  if (!VT.isSimple() || !Subtarget.hasVSX())
+    return false;
+
+  switch(VT.getSimpleVT().SimpleTy) {
+  default:
+    // For FP types that are currently not supported by PPC backend, return
+    // false. Examples: f16, f80.
+    return false;
+  case MVT::f32:
+  case MVT::f64:
+  case MVT::ppcf128:
+    return Imm.isPosZero();
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 000000000000..d3c88482f092
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,1031 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace PPCISD {
+    enum NodeType : unsigned {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// and f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+
+      /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+      /// unsigned integers and single-precision outputs.
+      FCFIDU, FCFIDS, FCFIDUS,
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ, FCTIWZ,
+
+      /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
+      /// unsigned integers.
+      FCTIDUZ, FCTIWUZ,
+
+      /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
+      /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
+      VEXTS,
+
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE, FRSQRTE,
+
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP, VNMSUBFP,
+
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+
+      /// XXSPLT - The PPC VSX splat instructions
+      ///
+      XXSPLT,
+
+      /// XXINSERT - The PPC VSX insert instruction
+      ///
+      XXINSERT,
+
+      /// VECSHL - The PPC VSX shift left instruction
+      ///
+      VECSHL,
+
+      /// The CMPB instruction (takes two operands of i32 or i64).
+      CMPB,
+
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi, Lo,
+
+      /// The following two target-specific nodes are used for calls through
+      /// function pointers in the 64-bit SVR4 ABI.
+
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an offset from native SP to the address  of the most recent
+      /// dynamic alloca.
+      DYNAREAOFFSET,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+      /// shift amounts.  These nodes are generated by the multi-precision shift
+      /// code.
+      SRL, SRA, SHL,
+
+      /// The combination of sra[wd]i and addze used to implemented signed
+      /// integer division by a power of 2. The first operand is the dividend,
+      /// and the second is the constant shift amount (representing the
+      /// divisor).
+      SRA_ADDZE,
+
+      /// CALL - A direct function call.
+      /// CALL_NOP is a call with the special NOP which follows 64-bit
+      /// SVR4 calls.
+      CALL, CALL_NOP,
+
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
+      /// instruction and the TOC reload required on SVR4 PPC64.
+      BCTRL_LOAD_TOC,
+
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+
+      /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFOCRF,
+
+      /// Direct move from a VSX register to a GPR
+      MFVSR,
+
+      /// Direct move from a GPR to a VSX register (algebraic)
+      MTVSRA,
+
+      /// Direct move from a GPR to a VSX register (zero)
+      MTVSRZ,
+
+      /// Extract a subvector from signed integer vector and convert to FP.
+      /// It is primarily used to convert a (widened) illegal integer vector
+      /// type to a legal floating point vector type.
+      /// For example v2i32 -> widened to v4i32 -> v2f64
+      SINT_VEC_TO_FP,
+
+      /// Extract a subvector from unsigned integer vector and convert to FP.
+      /// As with SINT_VEC_TO_FP, used for converting illegal types.
+      UINT_VEC_TO_FP,
+
+      // FIXME: Remove these once the ANDI glue bug is fixed:
+      /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+      /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+      /// implement truncation of i32 or i64 to i1.
+      ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
+
+      // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
+      // target (returns (Lo, Hi)). It takes a chain operand.
+      READ_TIME_BASE,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+
+      /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+      /// loops.
+      BDNZ, BDZ,
+
+      /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+      /// towards zero.  Used only as part of the long double-to-int
+      /// conversion sequence.
+      FADDRTZ,
+
+      /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+      MFFS,
+
+      /// TC_RETURN - A tail call return.
+      ///   operand #0 chain
+      ///   operand #1 callee (register or absolute)
+      ///   operand #2 stack adjustment
+      ///   operand #3 optional in flag
+      TC_RETURN,
+
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+      /// on PPC32.
+      PPC32_GOT,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+      /// local dynamic TLS on PPC32.
+      PPC32_PICGOT,
+
+      /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+      /// TLS model, produces an ADDIS8 instruction that adds the GOT
+      /// base to sym\@got\@tprel\@ha.
+      ADDIS_GOT_TPREL_HA,
+
+      /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
+      /// TLS model, produces a LD instruction with base register G8RReg
+      /// and offset sym\@got\@tprel\@l.  This completes the addition that
+      /// finds the offset of "sym" relative to the thread pointer.
+      LD_GOT_TPREL_L,
+
+      /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
+      /// model, produces an ADD instruction that adds the contents of
+      /// G8RReg to the thread pointer.  Symbol contains a relocation
+      /// sym\@tls which is to be replaced by the thread pointer and
+      /// identifies to the linker that the instruction is part of a
+      /// TLS sequence.
+      ADD_TLS,
+
+      /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsgd\@ha.
+      ADDIS_TLSGD_HA,
+
+      /// %X3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsgd\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
+      ADDI_TLSGD_L,
+
+      /// %X3 = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsgd).  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
+      GET_TLS_ADDR,
+
+      /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
+      /// register assignment.
+      ADDI_TLSGD_L_ADDR,
+
+      /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsld\@ha.
+      ADDIS_TLSLD_HA,
+
+      /// %X3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsld\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
+      ADDI_TLSLD_L,
+
+      /// %X3 = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsld).  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
+      GET_TLSLD_ADDR,
+
+      /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
+      /// following register assignment.
+      ADDI_TLSLD_L_ADDR,
+
+      /// G8RC = ADDIS_DTPREL_HA %X3, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds X3 to
+      /// sym\@dtprel\@ha.
+      ADDIS_DTPREL_HA,
+
+      /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@dtprel\@l.
+      ADDI_DTPREL_L,
+
+      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+      /// during instruction selection to optimize a BUILD_VECTOR into
+      /// operations on splats.  This is necessary to avoid losing these
+      /// optimizations due to constant folding.
+      VADD_SPLAT,
+
+      /// CHAIN = SC CHAIN, Imm128 - System call.  The 7-bit unsigned
+      /// operand identifies the operating system entry point.
+      SC,
+
+      /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
+      CLRBHRB,
+
+      /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
+      /// history rolling buffer entry.
+      MFBHRBE,
+
+      /// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
+      RFEBB,
+
+      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
+      /// or stxvd2x instruction.  The chain is necessary because the
+      /// sequence replaces a load and needs to provide the same number
+      /// of outputs.
+      XXSWAPD,
+
+      /// An SDNode for swaps that are not associated with any loads/stores
+      /// and thereby have no chain.
+      SWAP_NO_CHAIN,
+
+      /// QVFPERM = This corresponds to the QPX qvfperm instruction.
+      QVFPERM,
+
+      /// QVGPCI = This corresponds to the QPX qvgpci instruction.
+      QVGPCI,
+
+      /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
+      QVALIGNI,
+
+      /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
+      QVESPLATI,
+
+      /// QBFLT = Access the underlying QPX floating-point boolean
+      /// representation.
+      QBFLT,
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX,
+
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to.
+      STFIWX,
+
+      /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
+      /// load which sign-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWAX,
+
+      /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+      /// load which zero-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWZX,
+
+      /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
+      /// integer smaller than 64 bits into a VSR. The integer is zero-extended.
+      /// This can be used for converting loaded integers to floating point.
+      LXSIZX,
+
+      /// STXSIX - The STXSI[bh]X instruction. The first operand is an input
+      /// chain, then an f64 value to store, then an address to store it to,
+      /// followed by a byte-width for the store.
+      STXSIX,
+
+      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to an lxvd2x instruction that will be followed by
+      /// an xxswapd.
+      LXVD2X,
+
+      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to an stxvd2x instruction that will be preceded by
+      /// an xxswapd.
+      STXVD2X,
+
+      /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
+      /// The 4xf32 load used for v4i1 constants.
+      QVLFSb,
+
+      /// GPRC = TOC_ENTRY GA, TOC
+      /// Loads the entry for GA from the TOC, where the TOC base is given by
+      /// the last operand.
+      TOC_ENTRY
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace PPC {
+    /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUHUM instruction.
+    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                              SelectionDAG &DAG);
+
+    /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUWUM instruction.
+    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                              SelectionDAG &DAG);
+
+    /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUDUM instruction.
+    bool isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                              SelectionDAG &DAG);
+
+    /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            unsigned ShuffleKind, SelectionDAG &DAG);
+
+    /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            unsigned ShuffleKind, SelectionDAG &DAG);
+
+    /// isVMRGEOShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VMRGEW or VMRGOW instruction
+    bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
+                             unsigned ShuffleKind, SelectionDAG &DAG);
+  
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
+    /// shift amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                            SelectionDAG &DAG);
+
+    /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of a single element that is suitable for input to
+    /// VSPLTB/VSPLTH/VSPLTW.
+    bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+    /// isXXINSERTWMask - Return true if this VECTOR_SHUFFLE can be handled by
+    /// the XXINSERTW instruction introduced in ISA 3.0. This is essentially any
+    /// shuffle of v4f32/v4i32 vectors that just inserts one element from one
+    /// vector into the other. This function will also set a couple of
+    /// output parameters for how much the source vector needs to be shifted and
+    /// what byte number needs to be specified for the instruction to put the
+    /// element in the desired location of the target vector.
+    bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                         unsigned &InsertAtByte, bool &Swap, bool IsLE);
+
+    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+
+    /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+    /// formed by using a vspltis[bhw] instruction of the specified element
+    /// size, return the constant being splatted.  The ByteSize field indicates
+    /// the number of bytes of each element [124] -> [bhw].
+    SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+    /// If this is a qvaligni shuffle mask, return the shift
+    /// amount, otherwise return -1.
+    int isQVALIGNIShuffleMask(SDNode *N);
+  }
+
+  class PPCTargetLowering : public TargetLowering {
+    const PPCSubtarget &Subtarget;
+
+  public:
+    explicit PPCTargetLowering(const PPCTargetMachine &TM,
+                               const PPCSubtarget &STI);
+
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    /// getPreferredVectorAction - The code we generate when vector types are
+    /// legalized by promoting the integer element type is often much worse
+    /// than code we generate if we widen the type for applicable vector types.
+    /// The issue with promoting is that the vector is scalaraized, individual
+    /// elements promoted and then the vector is rebuilt. So say we load a pair
+    /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
+    /// loads, moves back into VSR's (or memory ops if we don't have moves) and
+    /// then the VPERM for the shuffle. All in all a very slow sequence.
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+      const override {
+      if (VT.getScalarSizeInBits() % 8 == 0)
+        return TypeWidenVector;
+      return TargetLoweringBase::getPreferredVectorAction(VT);
+    }
+    bool useSoftFloat() const override;
+
+    MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+      return MVT::i32;
+    }
+
+    bool isCheapToSpeculateCttz() const override {
+      return true;
+    }
+
+    bool isCheapToSpeculateCtlz() const override {
+      return true;
+    }
+
+    bool isCtlzFast() const override {
+      return true;
+    }
+
+    bool hasAndNotCompare(SDValue) const override {
+      return true;
+    }
+
+    bool supportSplitCSR(MachineFunction *MF) const override {
+      return
+        MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+        MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+    }
+
+    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+
+    void insertCopiesSplitCSR(
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;
+
+    /// Return true if target always beneficiates from combining into FMA for a
+    /// given value type. This must typically return false on targets where FMA
+    /// takes more cycles to execute than FADD.
+    bool enableAggressiveFMAFusion(EVT VT) const override;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                   SDValue &Offset,
+                                   ISD::MemIndexedMode &AM,
+                                   SelectionDAG &DAG) const override;
+
+    /// SelectAddressRegReg - Given the specified addressed, check to see if it
+    /// can be represented as an indexed [r+r] operation.  Returns false if it
+    /// can be more efficiently represented with [r+imm].
+    bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImm - Returns true if the address N can be represented
+    /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+    /// is not better represented as reg+reg.  If Aligned is true, only accept
+    /// displacements suitable for STD and friends, i.e. multiples of 4.
+    bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+                             SelectionDAG &DAG, bool Aligned) const;
+
+    /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+                                 SelectionDAG &DAG) const;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                          std::vector<SDNode *> *Created) const override;
+
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+
+    bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+      return true;
+    }
+
+    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                  bool IsStore, bool IsLoad) const override;
+    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                   bool IsStore, bool IsLoad) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr &MI,
+                                        MachineBasicBlock *MBB,
+                                        unsigned AtomicSize,
+                                        unsigned BinOpcode,
+                                        unsigned CmpOpcode = 0,
+                                        unsigned CmpPred = 0) const;
+    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr &MI,
+                                                MachineBasicBlock *MBB,
+                                                bool is8bit,
+                                                unsigned Opcode,
+                                                unsigned CmpOpcode = 0,
+                                                unsigned CmpPred = 0) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+                                         MachineBasicBlock *MBB) const;
+
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const override;
+
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area.  This is the actual
+    /// alignment, not its logarithm.
+    unsigned getByValTypeAlignment(Type *Ty,
+                                   const DataLayout &DL) const override;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "es")
+        return InlineAsm::Constraint_es;
+      else if (ConstraintCode == "o")
+        return InlineAsm::Constraint_o;
+      else if (ConstraintCode == "Q")
+        return InlineAsm::Constraint_Q;
+      else if (ConstraintCode == "Z")
+        return InlineAsm::Constraint_Z;
+      else if (ConstraintCode == "Zy")
+        return InlineAsm::Constraint_Zy;
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in
+    /// register X1 to i32 by referencing its sub-register R1.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    bool isFPExtFree(EVT VT) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                            const CallInst &I,
+                            unsigned Intrinsic) const override;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const override;
+
+    /// Is unaligned memory access allowed for the given type, and is it fast
+    /// relative to software emulation.
+    bool allowsMisalignedMemoryAccesses(EVT VT,
+                                        unsigned AddrSpace,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+    // Should we expand the build vector with shuffles?
+    bool
+    shouldExpandBuildVectorWithShuffles(EVT VT,
+                                        unsigned DefinedValues) const override;
+
+    /// createFastISel - This method returns a target-specific FastISel object,
+    /// or null if the target does not support "fast" instruction selection.
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo) const override;
+
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+      // We support any array type as "consecutive" block in the parameter
+      // save area.  The element type defines the alignment requirement and
+      // whether the argument should go in GPRs, FPRs, or VRs if available.
+      //
+      // Note that clang uses this capability both to implement the ELFv2
+      // homogeneous float/vector aggregate ABI, and to avoid having to use
+      // "byval" when passing aggregates that might fully fit in registers.
+      return Ty->isArrayTy();
+    }
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+    /// Override to support customized stack guard loading.
+    bool useLoadStackGuardNode() const override;
+    void insertSSPDeclarations(Module &M) const override;
+
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    unsigned getJumpTableEncoding() const override;
+    bool isJumpTableRelative() const override;
+    SDValue getPICJumpTableRelocBase(SDValue Table,
+                                     SelectionDAG &DAG) const override;
+    const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                               unsigned JTI,
+                                               MCContext &Ctx) const override;
+
+  private:
+    struct ReuseLoadInfo {
+      SDValue Ptr;
+      SDValue Chain;
+      SDValue ResChain;
+      MachinePointerInfo MPI;
+      bool IsDereferenceable;
+      bool IsInvariant;
+      unsigned Alignment;
+      AAMDNodes AAInfo;
+      const MDNode *Ranges;
+
+      ReuseLoadInfo()
+          : IsDereferenceable(false), IsInvariant(false), Alignment(0),
+            Ranges(nullptr) {}
+
+      MachineMemOperand::Flags MMOFlags() const {
+        MachineMemOperand::Flags F = MachineMemOperand::MONone;
+        if (IsDereferenceable)
+          F |= MachineMemOperand::MODereferenceable;
+        if (IsInvariant)
+          F |= MachineMemOperand::MOInvariant;
+        return F;
+      }
+    };
+
+    bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI,
+                             SelectionDAG &DAG,
+                             ISD::LoadExtType ET = ISD::NON_EXTLOAD) const;
+    void spliceIntoChain(SDValue ResChain, SDValue NewResChain,
+                         SelectionDAG &DAG) const;
+
+    void LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
+                                SelectionDAG &DAG, const SDLoc &dl) const;
+    SDValue LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG,
+                                     const SDLoc &dl) const;
+
+    bool directMoveIsProfitable(const SDValue &Op) const;
+    SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
+                                     const SDLoc &dl) const;
+
+    SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+    SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    bool
+    IsEligibleForTailCallOptimization_64SVR4(
+                                    SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    ImmutableCallSite *CS,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const;
+
+    SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
+                                         SDValue Chain, SDValue &LROpOut,
+                                         SDValue &FPOpOut,
+                                         const SDLoc &dl) const;
+
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                           const SDLoc &dl) const;
+    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
+                       bool isTailCall, bool isVarArg, bool isPatchPoint,
+                       bool hasNest, SelectionDAG &DAG,
+                       SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
+                       SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
+                       SDValue &Callee, int SPDiff, unsigned NumBytes,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SmallVectorImpl<SDValue> &InVals,
+                       ImmutableCallSite *CS) const;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+                              SelectionDAG &DAG, SDValue ArgVal,
+                              const SDLoc &dl) const;
+
+    SDValue LowerFormalArguments_Darwin(
+        SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+        const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+        SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_64SVR4(
+        SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+        const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+        SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_32SVR4(
+        SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+        const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+        SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                       SDValue CallSeqStart,
+                                       ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                                       const SDLoc &dl) const;
+
+    SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool isTailCall, bool isPatchPoint,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals,
+                             ImmutableCallSite *CS) const;
+    SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool isTailCall, bool isPatchPoint,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals,
+                             ImmutableCallSite *CS) const;
+    SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool isTailCall, bool isPatchPoint,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals,
+                             ImmutableCallSite *CS) const;
+
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
+    /// SETCC with integer subtraction when (1) there is a legal way of doing it
+    /// (2) keeping the result of comparison in GPR has performance benefit.
+    SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                            int &RefinementSteps, bool &UseOneConstNR,
+                            bool Reciprocal) const override;
+    SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                             int &RefinementSteps) const override;
+    unsigned combineRepeatedFPDivisors() const override;
+
+    CCAssignFn *useFastISelCCs(unsigned Flag) const;
+
+    SDValue
+      combineElementTruncationToVectorTruncation(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const;
+  };
+
+  namespace PPC {
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo);
+  }
+
+  bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                  CCValAssign::LocInfo &LocInfo,
+                                  ISD::ArgFlagsTy &ArgFlags,
+                                  CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State);
+
+  bool 
+  CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
+                                                 MVT &LocVT,
+                                                 CCValAssign::LocInfo &LocInfo,
+                                                 ISD::ArgFlagsTy &ArgFlags,
+                                                 CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                           MVT &LocVT,
+                                           CCValAssign::LocInfo &LocInfo,
+                                           ISD::ArgFlagsTy &ArgFlags,
+                                           CCState &State);
+}
+
+#endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 000000000000..03b2257a88a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,1301 @@
+//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions.  These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+def s17imm64 : Operand<i64> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i64imm:$imm);
+}
+def tlsreg : Operand<i64> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd : Operand<i64> {}
+def tlscall : Operand<i64> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue(), SDLoc(N))
+                           : getI32Imm(0, SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isReturn = 1, Uses = [LR8, RM] in
+    def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+                            [(retflag)]>, Requires<[In64BitMode]>;
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
+    def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                             []>,
+        Requires<[In64BitMode]>;
+    def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+                              "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+                              []>,
+        Requires<[In64BitMode]>;
+
+    def BCCTR8  : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+                               "bcctr 12, $bi, 0", IIC_BrB, []>,
+        Requires<[In64BitMode]>;
+    def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+                               "bcctr 4, $bi, 0", IIC_BrB, []>,
+        Requires<[In64BitMode]>;
+  }
+}
+
+let Defs = [LR8] in
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
+                    PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let Defs = [CTR8], Uses = [CTR8] in {
+    def BDZ8  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdz $dst">;
+    def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdnz $dst">;
+  }
+
+  let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in {
+    def BDZLR8  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+                              "bdzlr", IIC_BrB, []>;
+    def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+                              "bdnzlr", IIC_BrB, []>;
+  }
+}
+
+
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL8  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                     "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+
+    def BL8_TLS  : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                         "bl $func", IIC_BrB, []>;
+
+    def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                     "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>;
+  }
+  let Uses = [RM], isCodeGenOnly = 1 in {
+    def BL8_NOP  : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                             (outs), (ins calltarget:$func),
+                             "bl $func\n\tnop", IIC_BrB, []>;
+
+    def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                  (outs), (ins tlscall:$func),
+                                  "bl $func\n\tnop", IIC_BrB, []>;
+
+    def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
+                             (outs), (ins abscalltarget:$func),
+                             "bla $func\n\tnop", IIC_BrB,
+                             [(PPCcall_nop (i64 imm:$func))]>;
+  }
+  let Uses = [CTR8, RM] in {
+    def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                              "bctrl", IIC_BrB, [(PPCbctrl)]>,
+                 Requires<[In64BitMode]>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+                                 "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+                                 []>,
+          Requires<[In64BitMode]>;
+
+      def BCCTRL8  : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+                                  "bcctrl 12, $bi, 0", IIC_BrB, []>,
+          Requires<[In64BitMode]>;
+      def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+                                  "bcctrl 4, $bi, 0", IIC_BrB, []>,
+          Requires<[In64BitMode]>;
+    }
+  }
+}
+
+let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
+    Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
+  def BCTRL8_LDinto_toc :
+    XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
+                              (ins memrix:$src),
+                              "bctrl\n\tld 2, $src", IIC_BrB,
+                              [(PPCbctrl_load_toc ixaddr:$src)]>,
+    Requires<[In64BitMode]>;
+}
+
+} // Interpretation64Bit
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let Interpretation64Bit = 1, isAsmParserOnly = 1 in
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in
+def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                     "bl $func", IIC_BrB, []>;
+
+// Calls
+def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
+          (BL8 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)),
+          (BL8_NOP tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall (i64 texternalsym:$dst)),
+          (BL8 texternalsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
+          (BL8_NOP texternalsym:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64",
+      [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_SUB_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64",
+      [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_OR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64",
+      [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_XOR_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64",
+      [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_AND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64",
+      [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_NAND_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64",
+      [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_MIN_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MIN_I64",
+      [(set i64:$dst, (atomic_load_min_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_MAX_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_MAX_I64",
+      [(set i64:$dst, (atomic_load_max_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_UMIN_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMIN_I64",
+      [(set i64:$dst, (atomic_load_umin_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_UMAX_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_UMAX_I64",
+      [(set i64:$dst, (atomic_load_umax_64 xoaddr:$ptr, i64:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64",
+      [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+    def ATOMIC_SWAP_I64 : Pseudo<
+      (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64",
+      [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDARX : XForm_1<31,  84, (outs g8rc:$rD), (ins memrr:$ptr),
+                    "ldarx $rD, $ptr", IIC_LdStLDARX, []>;
+
+// Instruction to support lock versions of atomics
+// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
+def LDARXL : XForm_1<31,  84, (outs g8rc:$rD), (ins memrr:$ptr),
+                     "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT;
+
+let hasExtraDefRegAllocReq = 1 in
+def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
+                         "ldat $rD, $rA, $FC", IIC_LdStLoad>, isPPC64,
+           Requires<[IsISA3_0]>;
+}
+
+let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
+def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
+                    "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+
+let mayStore = 1, hasSideEffects = 0 in
+def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
+                          "stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
+            Requires<[IsISA3_0]>;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd8 $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+                 "#TC_RETURNa8 $func $offset",
+                 [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
+                 "#TC_RETURNr8 $dst $offset",
+                 []>;
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                             []>,
+    Requires<[In64BitMode]>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", IIC_BrB,
+                  []>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
+                  "ba $dst", IIC_BrB,
+                  []>;
+} // Interpretation64Bit
+
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+          (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+
+
+// 64-bit CR instructions
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 0 in {
+// mtocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraDefRegAllocReq = 1 in {
+def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
+                        "mtocrf $FXM, $ST", IIC_BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
+// is dependent on the cr fields being set.
+def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
+                      "mtcrf $FXM, $rS", IIC_BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraDefRegAllocReq = 1
+
+// mfocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraSrcRegAllocReq = 1 in {
+def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
+                        "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
+// is dependent on the cr fields being copied.
+def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
+                     "mfcr $rT", IIC_SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraSrcRegAllocReq = 1
+} // hasSideEffects = 0
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR8] in
+  def EH_SjLj_SetJmp64  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+}
+
+def MFSPR8 : XFXForm_1<31, 339, (outs g8rc:$RT), (ins i32imm:$SPR),
+                       "mfspr $RT, $SPR", IIC_SprMFSPR>;
+def MTSPR8 : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, g8rc:$RT),
+                       "mtspr $SPR, $RT", IIC_SprMTSPR>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins),
+                           "mfctr $rT", IIC_SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+                           "mtctr $rS", IIC_SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, Defs = [CTR8] in {
+let Pattern = [(int_ppc_mtctr i64:$rS)] in
+def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+                               "mtctr $rS", IIC_SprMTSPR>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Pattern = [(set i64:$rT, readcyclecounter)] in
+def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
+                          "mfspr $rT, 268", IIC_SprMFTB>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+// Note that encoding mftb using mfspr is now the preferred form,
+// and has been since at least ISA v2.03. The mftb instruction has
+// now been phased out. Using mfspr, however, is known not to work on
+// the POWER3.
+
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8",
+                       [(set i64:$result,
+                             (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
+                       [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
+
+let Defs = [LR8] in {
+def MTLR8  : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
+                           "mtlr $rS", IIC_SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
+                           "mflr $rT", IIC_SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+} // Interpretation64Bit
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let Interpretation64Bit = 1 in {
+let hasSideEffects = 0 in {
+let isCodeGenOnly = 1 in {
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+def LI8  : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
+                      "li $rD, $imm", IIC_IntSimple,
+                      [(set i64:$rD, imm64SExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
+                      "lis $rD, $imm", IIC_IntSimple,
+                      [(set i64:$rD, imm16ShiftedSExt:$imm)]>;
+}
+
+// Logical ops.
+let isCommutable = 1 in {
+defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "nand", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
+defm AND8 : XForm_6r<31,  28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "and", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (and i64:$rS, i64:$rB))]>;
+} // isCommutable
+defm ANDC8: XForm_6r<31,  60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "andc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm OR8  : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "or", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (or i64:$rS, i64:$rB))]>;
+defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "nor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
+} // isCommutable
+defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "orc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
+defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "xor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
+} // let isCommutable = 1
+
+// Logical ops with immediate.
+let Defs = [CR0] in {
+def ANDIo8  : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "andi. $dst, $src1, $src2", IIC_IntGeneral,
+                      [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
+                      isDOT;
+def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                     "andis. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
+                     isDOT;
+}
+def ORI8    : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "ori $dst, $src1, $src2", IIC_IntSimple,
+                      [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
+def ORIS8   : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "oris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8   : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "xori $dst, $src1, $src2", IIC_IntSimple,
+                      [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
+def XORIS8  : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                      "xoris $dst, $src1, $src2", IIC_IntSimple,
+                   [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
+
+let isCommutable = 1 in
+defm ADD8  : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "add", "$rT, $rA, $rB", IIC_IntSimple,
+                       [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
+// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
+// initial-exec thread-local storage model.  We need to forbid r0 here -
+// while it works for add just fine, the linker can relax this to local-exec
+// addi, which won't work for r0.
+def ADD8TLS  : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc_nox0:$rA, tlsreg:$rB),
+                        "add $rT, $rA, $rB", IIC_IntSimple,
+                        [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
+                     
+let isCommutable = 1 in
+defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
+                        PPC970_DGroup_Cracked;
+
+let Defs = [CARRY] in
+def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                     "addic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
+def ADDI8  : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
+                     "addi $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
+                     "addis $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
+
+let Defs = [CARRY] in {
+def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                     "subfic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
+}
+defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
+                        PPC970_DGroup_Cracked;
+defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+                       [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
+defm NEG8    : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                        "neg", "$rT, $rA", IIC_IntSimple,
+                        [(set i64:$rT, (ineg i64:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE8   : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
+defm ADDME8  : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "addme", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, -1))]>;
+defm ADDZE8  : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "addze", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (adde i64:$rA, 0))]>;
+defm SUBFE8  : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+                          [(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
+defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "subfme", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (sube -1, i64:$rA))]>;
+defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA),
+                          "subfze", "$rT, $rA", IIC_IntGeneral,
+                          [(set i64:$rT, (sube 0, i64:$rA))]>;
+}
+} // isCodeGenOnly
+
+// FIXME: Duplicating this for the asm parser should be unnecessary, but the
+// previous definition must be marked as CodeGen only to prevent decoding
+// conflicts.
+let isAsmParserOnly = 1 in
+def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
+                        "add $rT, $rA, $rB", IIC_IntSimple, []>;
+
+let isCommutable = 1 in {
+defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulhd", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
+defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU,
+                       [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
+} // isCommutable
+}
+} // Interpretation64Bit
+
+let isCompare = 1, hasSideEffects = 0 in {
+  def CMPD   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+                            "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+  def CMPLD  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB),
+                            "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64;
+  def CMPDI  : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm),
+                           "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64;
+  def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                           "cmpldi $dst, $src1, $src2",
+                           IIC_IntCompare>, isPPC64;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  def CMPRB8 : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+                                (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+                                "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+               Requires<[IsISA3_0]>;
+  def CMPEQB : X_BF3_RS5_RS5<31, 224, (outs crbitrc:$BF),
+                             (ins g8rc:$rA, g8rc:$rB), "cmpeqb $BF, $rA, $rB",
+                             IIC_IntCompare, []>, Requires<[IsISA3_0]>;
+}
+
+let hasSideEffects = 0 in {
+defm SLD  : XForm_6r<31,  27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                     "sld", "$rA, $rS, $rB", IIC_IntRotateD,
+                     [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRD  : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                     "srd", "$rA, $rS, $rB", IIC_IntRotateD,
+                     [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
+defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB),
+                      "srad", "$rA, $rS, $rB", IIC_IntRotateD,
+                      [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm CNTLZW8 : XForm_11r<31,  26, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cntlzw", "$rA, $rS", IIC_IntGeneral, []>;
+defm CNTTZW8 : XForm_11r<31, 538, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cnttzw", "$rA, $rS", IIC_IntGeneral, []>,
+               Requires<[IsISA3_0]>;
+
+defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsb", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
+defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsh", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+
+defm SLW8  : XForm_6r<31,  24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                      "slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+defm SRW8  : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                      "srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
+} // Interpretation64Bit
+
+// For fast-isel:
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
+defm EXTSW  : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "extsw", "$rA, $rS", IIC_IntSimple,
+                        [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
+                             "extsw", "$rA, $rS", IIC_IntSimple,
+                             [(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+
+defm SRADI  : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
+                         "sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
+                         [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+defm CNTLZD : XForm_11r<31,  58, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cntlzd", "$rA, $rS", IIC_IntGeneral,
+                        [(set i64:$rA, (ctlz i64:$rS))]>;
+defm CNTTZD : XForm_11r<31, 570, (outs g8rc:$rA), (ins g8rc:$rS),
+                        "cnttzd", "$rA, $rS", IIC_IntGeneral,
+                        [(set i64:$rA, (cttz i64:$rS))]>, Requires<[IsISA3_0]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+                       "popcntd $rA, $rS", IIC_IntGeneral,
+                       [(set i64:$rA, (ctpop i64:$rS))]>;
+def BPERMD : XForm_6<31, 252, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                     "bpermd $rA, $rS, $rB", IIC_IntGeneral,
+                     [(set i64:$rA, (int_ppc_bpermd g8rc:$rS, g8rc:$rB))]>,
+                     isPPC64, Requires<[HasBPERMD]>;
+
+let isCodeGenOnly = 1, isCommutable = 1 in
+def CMPB8 : XForm_6<31, 508, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+                    "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+                    [(set i64:$rA, (PPCcmpb i64:$rS, i64:$rB))]>;
+
+// popcntw also does a population count on the high 32 bits (storing the
+// results in the high 32-bits of the output). We'll ignore that here (which is
+// safe because we never separately use the high part of the 64-bit registers).
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+                       "popcntw $rA, $rS", IIC_IntGeneral,
+                       [(set i32:$rA, (ctpop i32:$rS))]>;
+
+defm DIVD  : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "divd", "$rT, $rA, $rB", IIC_IntDivD,
+                          [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64;
+defm DIVDU : XOForm_1rcr<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "divdu", "$rT, $rA, $rB", IIC_IntDivD,
+                          [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64;
+def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                     "divde $rT, $rA, $rB", IIC_IntDivD,
+                     [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
+                     isPPC64, Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                      "divde. $rT, $rA, $rB", IIC_IntDivD,
+                      []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+                      isPPC64, Requires<[HasExtDiv]>;
+def DIVDEU : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                      "divdeu $rT, $rA, $rB", IIC_IntDivD,
+                      [(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>,
+                      isPPC64, Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVDEUo : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "divdeu. $rT, $rA, $rB", IIC_IntDivD,
+                       []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+                        isPPC64, Requires<[HasExtDiv]>;
+let isCommutable = 1 in
+defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
+                       [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                       "mulli $rD, $rA, $imm", IIC_IntMulLI,
+                       [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
+}
+
+let hasSideEffects = 0 in {
+defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA),
+                        (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64, RegConstraint<"$rSi = $rA">,
+                        NoEncode<"$rSi">;
+
+// Rotate instructions.
+defm RLDCL  : MDSForm_1r<30, 8,
+                        (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+                        "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+                        []>, isPPC64;
+defm RLDCR  : MDSForm_1r<30, 9,
+                        (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+                        "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD,
+                        []>, isPPC64;
+defm RLDICL : MDForm_1r<30, 0,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+                           (outs g8rc:$rA),
+                           (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+                           "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                           []>, isPPC64;
+// End fast-isel.
+defm RLDICR : MDForm_1r<30, 1,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+defm RLDIC  : MDForm_1r<30, 2,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+                        []>, isPPC64;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
+                        (ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                        "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                        []>;
+
+defm RLWNM8  : MForm_2r<23, (outs g8rc:$rA),
+                        (ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
+                        "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+                        []>;
+
+// RLWIMI can be commuted if the rotate amount is zero.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
+                        (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB,
+                        u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+                        IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+                        RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+
+let isSelect = 1 in
+def ISEL8   : AForm_4<31, 15,
+                     (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond),
+                     "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
+                     []>;
+}  // Interpretation64Bit
+}  // hasSideEffects = 0
+}  // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src),
+                  "lha $rD, $src", IIC_LdStLHA,
+                  [(set i64:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LWA  : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src),
+                    "lwa $rD, $src", IIC_LdStLWA,
+                    [(set i64:$rD,
+                          (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", IIC_LdStLHA,
+                   [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwax $rD, $src", IIC_LdStLHA,
+                   [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32  : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+                      "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
+                      PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+                     "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64,
+                     PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
+
+// Update forms.
+let mayLoad = 1, hasSideEffects = 0 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memri:$addr),
+                    "lhau $rD, $addr", IIC_LdStLHAU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+// NO LWAU!
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lhaux $rD, $addr", IIC_LdStLHAUX,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lwaux $rD, $addr", IIC_LdStLHAUX,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Zero extending loads.
+let PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src),
+                  "lbz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src),
+                  "lhz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src),
+                  "lwz $rD, $src", IIC_LdStLoad,
+                  [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31,  87, (outs g8rc:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31,  23, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+                   
+                   
+// Update forms.
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lbzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lhzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lwzu $rD, $addr", IIC_LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+
+def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+} // Interpretation64Bit
+
+
+// Full 8-byte loads.
+let PPC970_Unit = 2 in {
+def LD   : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
+                    "ld $rD, $src", IIC_LdStLD,
+                    [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
+// The following four definitions are selected for small code model only.
+// Otherwise, we need to create two instructions to form a 32-bit offset,
+// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
+def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtoc",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocJTI",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocCPT",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocCPT",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
+
+def LDX  : XForm_1<31,  21, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldx $rD, $src", IIC_LdStLD,
+                   [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1<31,  532, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldbrx $rD, $src", IIC_LdStLoad,
+                   [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+
+let mayLoad = 1, hasSideEffects = 0, isCodeGenOnly = 1 in {
+def LHBRX8 : XForm_1<31, 790, (outs g8rc:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", IIC_LdStLoad, []>;
+def LWBRX8 : XForm_1<31,  534, (outs g8rc:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", IIC_LdStLoad, []>;
+}
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDU  : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+                    "ldu $rD, $addr", IIC_LdStLDU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "ldux $rD, $addr", IIC_LdStLDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">, isPPC64;
+
+def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
+                   "ldmx $rD, $src", IIC_LdStLD, []>, isPPC64,
+           Requires<[IsISA3_0]>;
+}
+}
+
+// Support for medium and large code model.
+let hasSideEffects = 0 in {
+def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                       "#ADDIStocHA", []>, isPPC64;
+let mayLoad = 1 in
+def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
+                   "#LDtocL", []>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                     "#ADDItocL", []>, isPPC64;
+}
+
+// Support for thread-local storage.
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDISgotTprelHA",
+                         [(set i64:$rD,
+                           (PPCaddisGotTprelHA i64:$reg,
+                                               tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+                        "#LDgotTprelL",
+                        [(set i64:$rD,
+                          (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
+                 isPPC64;
+def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
+          (ADD8TLS $in, tglobaltlsaddr:$g)>;
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsgdHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                       "#ADDItlsgdL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                        "#GETtlsADDR",
+                        [(set i64:$rD,
+                          (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                 isPPC64;
+// Combined op for ADDItlsgdL and GETtlsADDR, late expanded.  X3 and LR8
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsgdLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsgdLADDR",
+                            [(set i64:$rD,
+                              (PPCaddiTlsgdLAddr i64:$reg,
+                                                 tglobaltlsaddr:$disp,
+                                                 tglobaltlsaddr:$sym))]>,
+                     isPPC64;
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIStlsldHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                       "#ADDItlsldL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+// LR8 is a true define, while the rest of the Defs are clobbers.  X3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
+                          "#GETtlsldADDR",
+                          [(set i64:$rD,
+                            (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                   isPPC64;
+// Combined op for ADDItlsldL and GETtlsADDR, late expanded.  X3 and LR8
+// are true defines, while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [X0,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7]
+    in
+def ADDItlsldLADDR : Pseudo<(outs g8rc:$rD),
+                            (ins g8rc_nox0:$reg, s16imm64:$disp, tlsgd:$sym),
+                            "#ADDItlsldLADDR",
+                            [(set i64:$rD,
+                              (PPCaddiTlsldLAddr i64:$reg,
+                                                 tglobaltlsaddr:$disp,
+                                                 tglobaltlsaddr:$sym))]>,
+                     isPPC64;
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                          "#ADDISdtprelHA",
+                          [(set i64:$rD,
+                            (PPCaddisDtprelHA i64:$reg,
+                                              tglobaltlsaddr:$disp))]>,
+                   isPPC64;
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
+                         "#ADDIdtprelL",
+                         [(set i64:$rD,
+                           (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+
+let PPC970_Unit = 2 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+// Truncating stores.                       
+def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src),
+                   "stb $rS, $src", IIC_LdStStore,
+                   [(truncstorei8 i64:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src),
+                   "sth $rS, $src", IIC_LdStStore,
+                   [(truncstorei16 i64:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src),
+                   "stw $rS, $src", IIC_LdStStore,
+                   [(truncstorei32 i64:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stbx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "sthx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stwx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst),
+                    "std $rS, $dst", IIC_LdStSTD,
+                    [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX  : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stdx $rS, $dst", IIC_LdStSTD,
+                   [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
+                   "stdbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+}
+
+// Stores with Update (pre-inc).
+let PPC970_Unit = 2, mayStore = 1 in {
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
+                   "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+
+def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+} // Interpretation64Bit
+
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst),
+                   "stdu $rS, $dst", IIC_LdStSTDU, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
+                   isPPC64;
+
+def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst),
+                    "stdux $rS, $dst", IIC_LdStSTDUX, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked, isPPC64;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STDUX $rS, $ptrreg, $ptroff)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3, hasSideEffects = 0,
+    Uses = [RM] in {  // FPU Operations.
+defm FCFID  : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fcfid", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID  : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctid", "$frD, $frB", IIC_FPGeneral,
+                        []>, isPPC64;
+defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctidz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
+
+defm FCFIDU  : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fcfidu", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
+defm FCFIDS  : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB),
+                        "fcfids", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
+defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB),
+                        "fcfidus", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
+defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctiduz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
+defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctiwuz", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext i32:$in)),
+          (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+                  0, 32)>;
+def : Pat<(i64 (anyext i32:$in)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>;
+def : Pat<(i32 (trunc i64:$in)),
+          (EXTRACT_SUBREG $in, sub_32)>;
+
+// Implement the 'not' operation with the NOR instruction.
+// (we could use the default xori pattern, but nor has lower latency on some
+// cores (such as the A2)).
+def i64not : OutPatFrag<(ops node:$in),
+                        (NOR8 $in, $in)>;
+def        : Pat<(not i64:$in),
+                 (i64not $in)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+          (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+          (LWZX8 xaddr:$src)>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra i64:$rS, i32:$rB),
+          (SRAD $rS, $rB)>;
+def : Pat<(srl i64:$rS, i32:$rB),
+          (SRD $rS, $rB)>;
+def : Pat<(shl i64:$rS, i32:$rB),
+          (SLD $rS, $rB)>;
+
+// SHL/SRL
+def : Pat<(shl i64:$in, (i32 imm:$imm)),
+          (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+def : Pat<(rotl i64:$in, i32:$sh),
+          (RLDCL $in, $sh, 0)>;
+def : Pat<(rotl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, imm:$imm, 0)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8  tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8  tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8  tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI8  tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in),
+          (ADDIS8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in),
+          (ADDI8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS8 $in, tglobaladdr:$g)>;
+def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS8 $in, tconstpool:$g)>;
+def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS8 $in, tjumptable:$g)>;
+def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS8 $in, tblockaddress:$g)>;
+
+// Patterns to match r+r indexed loads and stores for
+// addresses without at least 4-byte alignment.
+def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+          (LWAX xoaddr:$src)>;
+def : Pat<(i64 (unaligned4load xoaddr:$src)),
+          (LDX xoaddr:$src)>;
+def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
+          (STDX $rS, xoaddr:$dst)>;
+
+// 64-bits atomic loads and stores
+def : Pat<(atomic_load_64 ixaddr:$src), (LD  memrix:$src)>;
+def : Pat<(atomic_load_64 xaddr:$src),  (LDX memrr:$src)>;
+
+def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD  g8rc:$val, memrix:$ptr)>;
+def : Pat<(atomic_store_64 xaddr:$ptr,  i64:$val), (STDX g8rc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
+                   InstrItinClass itin, list<dag> pattern>
+  : X_L1_RS5_RS5<opcode, xo, (outs), (ins ty:$rA, ty:$rB, u1imm:$L),
+                 !strconcat(opc, " $rA, $rB, $L"), itin, pattern>;
+
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def CP_COPY8   : X_L1_RA5_RB5<31, 774, "copy"  , g8rc, IIC_LdStCOPY, []>;
+def CP_PASTE8  : X_L1_RA5_RB5<31, 902, "paste" , g8rc, IIC_LdStPASTE, []>;
+def CP_PASTE8o : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isDOT;
+}
+
+// SLB Invalidate Entry Global
+def SLBIEG : XForm_26<31, 466, (outs), (ins gprc:$RS, gprc:$RB),
+                      "slbieg $RS, $RB", IIC_SprSLBIEG, []>;
+// SLB Synchronize
+def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
+
+} // IsISA3_0
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 000000000000..5c022749ad64
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,1451 @@
+//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// *********************************** NOTE ***********************************
+// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing  **
+// ** which VMX and VSX instructions are lane-sensitive and which are not.   **
+// ** A lane-sensitive instruction relies, implicitly or explicitly, on      **
+// ** whether lanes are numbered from left to right.  An instruction like    **
+// ** VADDFP is not lane-sensitive, because each lane of the result vector   **
+// ** relies only on the corresponding lane of the source vectors.  However, **
+// ** an instruction like VMULESB is lane-sensitive, because "even" and      **
+// ** "odd" lanes are different for big-endian and little-endian numbering.  **
+// **                                                                        **
+// ** When adding new VMX and VSX instructions, please consider whether they **
+// ** are lane-sensitive.  If so, they must be added to a switch statement   **
+// ** in PPCVSXSwapRemoval::gatherVectorInstructions().                      **
+// ****************************************************************************
+
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be
+// of that type.
+def vnot_ppc : PatFrag<(ops node:$in),
+                       (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkudum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+def vpkudum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
+}]>;
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkudum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUDUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+
+
+def vmrgew_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 0, *CurDAG);
+}]>;
+def vmrgow_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 0, *CurDAG);
+}]>;
+def vmrgew_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 1, *CurDAG);
+}]>;
+def vmrgow_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 1, *CurDAG);
+}]>;
+def vmrgew_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), true, 2, *CurDAG);
+}]>;
+def vmrgow_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGEOShuffleMask(cast<ShuffleVectorSDNode>(N), false, 2, *CurDAG);
+}]>;
+
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+/// VSLDOI_swapped* - These fragments are provided for little-endian, where
+/// the inputs must be swapped for correct semantics.
+def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG), SDLoc(N));
+}]>;
+def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1;
+}], VSLDOI_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != nullptr;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != nullptr;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != nullptr;
+}], VSPLTISW_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type.
+class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
+
+// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType InTy>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
+
+// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
+// input types and an output type.
+class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType In1Ty, ValueType In2Ty>
+  : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP,
+                       [(set OutTy:$vD,
+                         (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
+
+// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type.
+class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
+
+// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
+
+// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
+// input types and an output type.
+class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType In1Ty, ValueType In2Ty>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
+
+// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
+class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+             !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+             [(set v4f32:$vD, (IntID v4f32:$vB))]>;
+
+// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB),
+             !strconcat(opc, " $vD, $vB"), IIC_VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vB))]>;
+
+class VXBX_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VXForm_BX<xo, (outs vrrc:$vD), (ins vrrc:$vA),
+             !strconcat(opc, " $vD, $vA"), IIC_VecFP,
+             [(set Ty:$vD, (IntID Ty:$vA))]>;
+
+class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VXForm_CR<xo, (outs vrrc:$vD), (ins vrrc:$vA, u1imm:$ST, u4imm:$SIX),
+              !strconcat(opc, " $vD, $vA, $ST, $SIX"), IIC_VecFP,
+              [(set Ty:$vD, (IntID Ty:$vA, imm:$ST, imm:$SIX))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
+let Predicates = [HasAltivec] in {
+
+def DSS      : DSS_Form<0, 822, (outs), (ins u5imm:$STRM),
+                        "dss $STRM", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dss imm:$STRM)]>,
+                        Deprecated<DeprecatedDST> {
+  let A = 0;
+  let B = 0;
+}
+
+def DSSALL   : DSS_Form<1, 822, (outs), (ins),
+                        "dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>,
+                        Deprecated<DeprecatedDST> {
+  let STRM = 0;
+  let A = 0;
+  let B = 0;
+}
+
+def DST      : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM)]>,
+                        Deprecated<DeprecatedDST>;
+
+def DSTT     : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM)]>,
+                        Deprecated<DeprecatedDST>;
+
+def DSTST    : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM)]>,
+                        Deprecated<DeprecatedDST>;
+
+def DSTSTT   : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM)]>,
+                        Deprecated<DeprecatedDST>;
+
+let isCodeGenOnly = 1 in {
+  // The very same instructions as above, but formally matching 64bit registers.
+  def DST64    : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTT64   : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTST64  : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dstst i64:$rA, i32:$rB,
+                                                  imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTSTT64 : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dststt i64:$rA, i32:$rB,
+                                                   imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+}
+
+def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
+                      "mfvscr $vD", IIC_LdStStore,
+                      [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; 
+def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
+                      "mtvscr $vB", IIC_LdStLoad,
+                      [(int_ppc_altivec_mtvscr v4i32:$vB)]>; 
+
+let PPC970_Unit = 2 in {  // Loads.
+def LVEBX: XForm_1<31,   7, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvebx $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31,  39, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvehx $vD, $src", IIC_LdStLoad,
+                   [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31,  71, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvewx $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX  : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvx $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvxl $vD, $src", IIC_LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31,   6, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvsl $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+def LVSR : XForm_1<31,  38, (outs vrrc:$vD), (ins memrr:$src),
+                   "lvsr $vD, $src", IIC_LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in {   // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvebx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvehx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvewx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
+def STVX  : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvx $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst),
+                   "stvxl $rS, $dst", IIC_LdStStore,
+                   [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in {  // VALU Operations.
+// VA-Form instructions.  3-input AltiVec ops.
+let isCommutable = 1 in {
+def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+                       "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP,
+                       [(set v4f32:$vD,
+                        (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
+
+// FIXME: The fma+fneg pattern won't match because fneg is not legal.
+def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB),
+                       "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP,
+                       [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
+                                                  (fneg v4f32:$vB))))]>;
+
+def VMHADDSHS  : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
+def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
+                             v8i16>;
+def VMLADDUHM  : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
+} // isCommutable
+
+def VPERM      : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
+                              v4i32, v4i32, v16i8>;
+def VSEL       : VA1a_Int_Ty<42, "vsel",  int_ppc_altivec_vsel, v4i32>;
+
+// Shuffles.
+def VSLDOI  : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+                       "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
+                       [(set v16i8:$vD, 
+                         (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+
+// VX-Form instructions.  AltiVec arithmetic ops.
+let isCommutable = 1 in {
+def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vaddfp $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
+                      
+def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vaddubm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vadduhm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vadduwm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
+                      
+def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
+def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
+def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
+def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
+def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
+def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
+def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
+} // isCommutable
+
+let isCommutable = 1 in
+def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vand $vD, $vA, $vB", IIC_VecFP,
+                    [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
+def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "vandc $vD, $vA, $vB", IIC_VecFP,
+                     [(set v4i32:$vD, (and v4i32:$vA,
+                                           (vnot_ppc v4i32:$vB)))]>;
+
+def VCFSX  : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vcfsx $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
+def VCFUX  : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vcfux $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vctsxs $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vctuxs $vD, $vB, $UIMM", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
+
+// Defines with the UIM field set to 0 for floating-point
+// to integer (fp_to_sint/fp_to_uint) conversions and integer
+// to floating-point (sint_to_fp/uint_to_fp) conversions.
+let isCodeGenOnly = 1, VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
+                       "vcfsx $vD, $vB, 0", IIC_VecFP,
+                       [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vctuxs $vD, $vB, 0", IIC_VecFP,
+                        [(set v4i32:$vD,
+                               (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB),
+                       "vcfux $vD, $vB, 0", IIC_VecFP,
+                       [(set v4f32:$vD,
+                               (int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB),
+                      "vctsxs $vD, $vB, 0", IIC_VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
+}
+def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP  : VX2_Int_SP<458, "vlogefp",  int_ppc_altivec_vlogefp>;
+
+let isCommutable = 1 in {
+def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
+def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
+def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
+def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>;
+def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>;
+def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>;
+
+def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>;
+def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>;
+def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>;
+def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>;
+def VMAXUB : VX1_Int_Ty<   2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>;
+def VMAXUH : VX1_Int_Ty<  66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>;
+def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>;
+def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>;
+def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>;
+def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>;
+def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
+def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
+def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
+def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
+} // isCommutable
+
+def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghb $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghh $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrghw $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglb $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglh $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrglw $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
+                            v4i32, v8i16, v4i32>;
+
+let isCommutable = 1 in {
+def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
+                          v8i16, v16i8>;
+def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
+                          v4i32, v8i16>;
+def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub,
+                          v8i16, v16i8>;
+def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh,
+                          v4i32, v8i16>;
+def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb,
+                          v8i16, v16i8>;
+def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh,
+                          v4i32, v8i16>;
+def VMULOUB : VX1_Int_Ty2<  8, "vmuloub", int_ppc_altivec_vmuloub,
+                          v8i16, v16i8>;
+def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
+                          v4i32, v8i16>;
+} // isCommutable
+                       
+def VREFP     : VX2_Int_SP<266, "vrefp",     int_ppc_altivec_vrefp>;
+def VRFIM     : VX2_Int_SP<714, "vrfim",     int_ppc_altivec_vrfim>;
+def VRFIN     : VX2_Int_SP<522, "vrfin",     int_ppc_altivec_vrfin>;
+def VRFIP     : VX2_Int_SP<650, "vrfip",     int_ppc_altivec_vrfip>;
+def VRFIZ     : VX2_Int_SP<586, "vrfiz",     int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
+
+def VSUBFP  : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubfp $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsububm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubuhm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
+                      
+def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
+def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
+def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
+def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
+def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
+def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
+
+def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
+def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
+
+def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs,
+                          v4i32, v16i8, v4i32>;
+def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
+                          v4i32, v8i16, v4i32>;
+def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
+                          v4i32, v16i8, v4i32>;
+
+def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vnor $vD, $vA, $vB", IIC_VecFP,
+                    [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
+                                                   v4i32:$vB)))]>;
+let isCommutable = 1 in {
+def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vor $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
+def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vxor $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
+} // isCommutable
+
+def VRLB   : VX1_Int_Ty<   4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
+def VRLH   : VX1_Int_Ty<  68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
+def VRLW   : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>;
+
+def VSL    : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl,  v4i32 >;
+def VSLO   : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>;
+
+def VSLB   : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>;
+def VSLH   : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
+def VSLW   : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
+
+def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vspltb $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD,
+                        (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vsplth $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD,
+                        (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
+                      "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
+                      [(set v16i8:$vD, 
+                        (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+let isCodeGenOnly = 1 in {
+  def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+                         "vspltb $vD, $vB, $UIMM", IIC_VecPerm, []>;
+  def VSPLTHs : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+                         "vsplth $vD, $vB, $UIMM", IIC_VecPerm, []>;
+}
+
+def VSR    : VX1_Int_Ty< 708, "vsr"  , int_ppc_altivec_vsr,  v4i32>;
+def VSRO   : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
+
+def VSRAB  : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>;
+def VSRAH  : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>;
+def VSRAW  : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>;
+def VSRB   : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>;
+def VSRH   : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>;
+def VSRW   : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
+
+
+def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltisb $vD, $SIMM", IIC_VecPerm,
+                       [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltish $vD, $SIMM", IIC_VecPerm,
+                       [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM),
+                       "vspltisw $vD, $SIMM", IIC_VecPerm,
+                       [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX   : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
+                          v8i16, v4i32>;
+def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
+                          v16i8, v8i16>;
+def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
+                          v16i8, v8i16>;
+def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
+                          v8i16, v4i32>;
+def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
+                          v8i16, v4i32>;
+def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vpkuhum $vD, $vA, $vB", IIC_VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
+                          v16i8, v8i16>;
+def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vpkuwum $vD, $vA, $vB", IIC_VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
+                          v8i16, v4i32>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
+                          v4i32, v8i16>;
+def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
+                          v8i16, v16i8>;
+def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
+                          v4i32, v8i16>;
+def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
+                          v4i32, v8i16>;
+def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
+                          v8i16, v16i8>;
+def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
+                          v4i32, v8i16>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
+
+// f32 element comparisons.0
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+
+let isCodeGenOnly = 1 in {
+def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllZerosV))]>;
+def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllZerosV))]>;
+def V_SET0  : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", IIC_VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllZerosV))]>;
+
+let IMM=-1 in {
+def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllOnesV))]>;
+def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllOnesV))]>;
+def V_SETALLONES  : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", IIC_VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllOnesV))]>;
+}
+}
+} // VALU Operations.
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store v4i32:$rS, xoaddr:$dst),
+          (STVX $rS, xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v1i128 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v1i128 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v1i128 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v1i128 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VRRC:$src))), (v2i64 VRRC:$src)>;
+def : Pat<(v2i64 (bitconvert (v1i128 VRRC:$src))), (v2i64 VRRC:$src)>;
+
+def : Pat<(v1i128 (bitconvert (v16i8 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v8i16 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>;
+def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef),
+        (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>;
+def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
+        (VPKUWUM $vA, $vA)>;
+def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
+        (VPKUHUM $vA, $vA)>;
+
+// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
+// These fragments are matched for little-endian, where the inputs must
+// be swapped for correct semantics.
+def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB),
+        (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>;
+def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUWUM $vB, $vA)>;
+def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUHUM $vB, $vA)>;
+
+// Match vmrg*(x,x)
+def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
+        (VMRGLB $vA, $vA)>;
+def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef),
+        (VMRGLH $vA, $vA)>;
+def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef),
+        (VMRGLW $vA, $vA)>;
+def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef),
+        (VMRGHB $vA, $vA)>;
+def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef),
+        (VMRGHH $vA, $vA)>;
+def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
+        (VMRGHW $vA, $vA)>;
+
+// Match vmrg*(y,x), i.e., swapped operands.  These fragments
+// are matched for little-endian, where the inputs must be
+// swapped for correct semantics.
+def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLB $vB, $vA)>;
+def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLH $vB, $vA)>;
+def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLW $vB, $vA)>;
+def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHB $vB, $vA)>;
+def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHH $vB, $vA)>;
+def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHW $vB, $vA)>;
+
+// Logical Operations
+def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
+
+def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
+          (VNOR $A, $B)>;
+def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
+          (VANDC $A, $B)>;
+
+def : Pat<(fmul v4f32:$vA, v4f32:$vB),
+          (VMADDFP $vA, $vB,
+             (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>; 
+
+// Fused multiply add and multiply sub for packed float.  These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Rhapson (used by divide, sqrt)
+def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
+          (VPERM $vA, $vB, $vC)>;
+
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
+// Vector shifts
+def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSLW $vA, $vB))>;
+
+def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRW $vA, $vB))>;
+
+def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRAW $vA, $vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
+           (VCTSXS_0 $vA)>;
+def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
+           (VCTUXS_0 $vA)>;
+def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
+           (VCFSX_0 $vA)>;
+def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
+           (VCFUX_0 $vA)>;
+
+// Floating-point rounding
+def : Pat<(v4f32 (ffloor v4f32:$vA)),
+          (VRFIM $vA)>;
+def : Pat<(v4f32 (fceil v4f32:$vA)),
+          (VRFIP $vA)>;
+def : Pat<(v4f32 (ftrunc v4f32:$vA)),
+          (VRFIZ $vA)>;
+def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
+          (VRFIN $vA)>;
+
+} // end HasAltivec
+
+def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
+def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">;
+let Predicates = [HasP8Altivec] in {
+
+let isCommutable = 1 in {
+def VMULESW : VX1_Int_Ty2<904, "vmulesw", int_ppc_altivec_vmulesw,
+                          v2i64, v4i32>;
+def VMULEUW : VX1_Int_Ty2<648, "vmuleuw", int_ppc_altivec_vmuleuw,
+                          v2i64, v4i32>;
+def VMULOSW : VX1_Int_Ty2<392, "vmulosw", int_ppc_altivec_vmulosw,
+                          v2i64, v4i32>;
+def VMULOUW : VX1_Int_Ty2<136, "vmulouw", int_ppc_altivec_vmulouw,
+                          v2i64, v4i32>;
+def VMULUWM : VXForm_1<137, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vmuluwm $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v4i32:$vD, (mul v4i32:$vA, v4i32:$vB))]>;
+def VMAXSD : VX1_Int_Ty<450, "vmaxsd", int_ppc_altivec_vmaxsd, v2i64>;
+def VMAXUD : VX1_Int_Ty<194, "vmaxud", int_ppc_altivec_vmaxud, v2i64>;
+def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>;
+def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
+} // isCommutable
+
+// Vector merge 
+def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrgew $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrgew_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGOW : VXForm_1<1676, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vmrgow $vD, $vA, $vB", IIC_VecFP,
+                      [(set v16i8:$vD, (vmrgow_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+// Match vmrgew(x,x) and vmrgow(x,x)
+def:Pat<(vmrgew_unary_shuffle v16i8:$vA, undef),
+        (VMRGEW $vA, $vA)>;
+def:Pat<(vmrgow_unary_shuffle v16i8:$vA, undef),
+        (VMRGOW $vA, $vA)>;
+
+// Match vmrgew(y,x) and vmrgow(y,x), i.e., swapped operands.  These fragments
+// are matched for little-endian, where the inputs must be swapped for correct
+// semantics.w
+def:Pat<(vmrgew_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGEW $vB, $vA)>;
+def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGOW $vB, $vA)>;
+
+
+// Vector shifts
+def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
+def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vsld $vD, $vA, $vB", IIC_VecGeneral,
+                    [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                   "vsrd $vD, $vA, $vB", IIC_VecGeneral,
+                   [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                    "vsrad $vD, $vA, $vB", IIC_VecGeneral,
+                    [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+
+// Vector Integer Arithmetic Instructions
+let isCommutable = 1 in {
+def VADDUDM : VXForm_1<192, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vaddudm $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v2i64:$vD, (add v2i64:$vA, v2i64:$vB))]>;
+def VADDUQM : VXForm_1<256, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vadduqm $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v1i128:$vD, (add v1i128:$vA, v1i128:$vB))]>;
+} // isCommutable
+
+// Vector Quadword Add
+def VADDEUQM : VA1a_Int_Ty<60, "vaddeuqm", int_ppc_altivec_vaddeuqm, v1i128>;
+def VADDCUQ  : VX1_Int_Ty<320, "vaddcuq", int_ppc_altivec_vaddcuq, v1i128>;
+def VADDECUQ : VA1a_Int_Ty<61, "vaddecuq", int_ppc_altivec_vaddecuq, v1i128>;
+
+// Vector Doubleword Subtract
+def VSUBUDM : VXForm_1<1216, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vsubudm $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v2i64:$vD, (sub v2i64:$vA, v2i64:$vB))]>;
+
+// Vector Quadword Subtract
+def VSUBUQM : VXForm_1<1280, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vsubuqm $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v1i128:$vD, (sub v1i128:$vA, v1i128:$vB))]>;
+def VSUBEUQM : VA1a_Int_Ty<62, "vsubeuqm", int_ppc_altivec_vsubeuqm, v1i128>;
+def VSUBCUQ  : VX1_Int_Ty<1344, "vsubcuq", int_ppc_altivec_vsubcuq, v1i128>;
+def VSUBECUQ : VA1a_Int_Ty<63, "vsubecuq", int_ppc_altivec_vsubecuq, v1i128>;
+
+// Count Leading Zeros
+def VCLZB : VXForm_2<1794, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzb $vD, $vB", IIC_VecGeneral,
+                     [(set v16i8:$vD, (ctlz v16i8:$vB))]>;
+def VCLZH : VXForm_2<1858, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzh $vD, $vB", IIC_VecGeneral,
+                     [(set v8i16:$vD, (ctlz v8i16:$vB))]>;
+def VCLZW : VXForm_2<1922, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzw $vD, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (ctlz v4i32:$vB))]>;
+def VCLZD : VXForm_2<1986, (outs vrrc:$vD), (ins vrrc:$vB),
+                     "vclzd $vD, $vB", IIC_VecGeneral,
+                     [(set v2i64:$vD, (ctlz v2i64:$vB))]>;
+
+// Population Count
+def VPOPCNTB : VXForm_2<1795, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntb $vD, $vB", IIC_VecGeneral,
+                        [(set v16i8:$vD, (ctpop v16i8:$vB))]>;
+def VPOPCNTH : VXForm_2<1859, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcnth $vD, $vB", IIC_VecGeneral,
+                        [(set v8i16:$vD, (ctpop v8i16:$vB))]>;
+def VPOPCNTW : VXForm_2<1923, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntw $vD, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (ctpop v4i32:$vB))]>;
+def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB),
+                        "vpopcntd $vD, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (ctpop v2i64:$vB))]>;
+
+let isCommutable = 1 in {
+// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the 
+//        VSX equivalents. We need to fix this up at some point. Two possible
+//        solutions for this problem:
+//        1. Disable Altivec patterns that compete with VSX patterns using the
+//           !HasVSX predicate. This essentially favours VSX over Altivec, in 
+//           hopes of reducing register pressure (larger register set using VSX 
+//           instructions than VMX instructions)
+//        2. Employ a more disciplined use of AddedComplexity, which would provide
+//           more fine-grained control than option 1. This would be beneficial
+//           if we find situations where Altivec is really preferred over VSX. 
+def VEQV  : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "veqv $vD, $vA, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
+def VNAND : VXForm_1<1412, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                     "vnand $vD, $vA, $vB", IIC_VecGeneral,
+                     [(set v4i32:$vD, (vnot_ppc (and v4i32:$vA, v4i32:$vB)))]>;
+} // isCommutable
+
+def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                      "vorc $vD, $vA, $vB", IIC_VecGeneral,
+                      [(set v4i32:$vD, (or v4i32:$vA,
+                                           (vnot_ppc v4i32:$vB)))]>;
+
+// i64 element comparisons.
+def VCMPEQUD  : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
+def VCMPEQUDo : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
+def VCMPGTSD  : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>;
+def VCMPGTSDo : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
+def VCMPGTUD  : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>;
+def VCMPGTUDo : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
+
+// The cryptography instructions that do not require Category:Vector.Crypto
+def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb",
+                         int_ppc_altivec_crypto_vpmsumb, v16i8>;
+def VPMSUMH : VX1_Int_Ty<1096, "vpmsumh",
+                         int_ppc_altivec_crypto_vpmsumh, v8i16>;
+def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw",
+                         int_ppc_altivec_crypto_vpmsumw, v4i32>;
+def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd",
+                         int_ppc_altivec_crypto_vpmsumd, v2i64>;
+def VPERMXOR : VA1a_Int_Ty<45, "vpermxor",
+                         int_ppc_altivec_crypto_vpermxor, v16i8>;
+
+// Vector doubleword integer pack and unpack.
+def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss,
+                          v4i32, v2i64>;
+def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus,
+                          v4i32, v2i64>;
+def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vpkudum $vD, $vA, $vB", IIC_VecFP,
+                       [(set v16i8:$vD,
+                         (vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus,
+                          v4i32, v2i64>;
+def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw,
+                          v2i64, v4i32>;
+def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw,
+                          v2i64, v4i32>;
+
+// Shuffle patterns for unary and swapped (LE) vector pack modulo.
+def:Pat<(vpkudum_unary_shuffle v16i8:$vA, undef),
+        (VPKUDUM $vA, $vA)>;
+def:Pat<(vpkudum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUDUM $vB, $vA)>;
+
+def VGBBD : VX2_Int_Ty2<1292, "vgbbd", int_ppc_altivec_vgbbd, v16i8, v16i8>;
+def VBPERMQ : VX1_Int_Ty2<1356, "vbpermq", int_ppc_altivec_vbpermq,
+                          v2i64, v16i8>;
+} // end HasP8Altivec
+
+// Crypto instructions (from builtins)
+let Predicates = [HasP8Crypto] in {
+def VSHASIGMAW : VXCR_Int_Ty<1666, "vshasigmaw",
+                              int_ppc_altivec_crypto_vshasigmaw, v4i32>;
+def VSHASIGMAD : VXCR_Int_Ty<1730, "vshasigmad",
+                              int_ppc_altivec_crypto_vshasigmad, v2i64>;
+def VCIPHER : VX1_Int_Ty<1288, "vcipher", int_ppc_altivec_crypto_vcipher,
+                         v2i64>;
+def VCIPHERLAST : VX1_Int_Ty<1289, "vcipherlast",
+                              int_ppc_altivec_crypto_vcipherlast, v2i64>;
+def VNCIPHER : VX1_Int_Ty<1352, "vncipher",
+                          int_ppc_altivec_crypto_vncipher, v2i64>;
+def VNCIPHERLAST : VX1_Int_Ty<1353, "vncipherlast",
+                              int_ppc_altivec_crypto_vncipherlast, v2i64>;
+def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;
+} // HasP8Crypto
+
+// The following altivec instructions were introduced in Power ISA 3.0
+def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;
+let Predicates = [HasP9Altivec] in {
+
+// i8 element comparisons.
+def VCMPNEB   : VCMP   <  7, "vcmpneb $vD, $vA, $vB"  , v16i8>;
+def VCMPNEBo  : VCMPo  <  7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
+def VCMPNEZB  : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
+def VCMPNEZBo : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPNEH   : VCMP < 71, "vcmpneh $vD, $vA, $vB"  , v8i16>;
+def VCMPNEHo  : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
+def VCMPNEZH  : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
+def VCMPNEZHo : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPNEW   : VCMP <135, "vcmpnew $vD, $vA, $vB"  , v4i32>;
+def VCMPNEWo  : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
+def VCMPNEZW  : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
+def VCMPNEZWo : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
+
+// VX-Form: [PO VRT / UIM VRB XO].
+// We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent
+// "/ UIM" (1 + 4 bit)
+class VX1_VT5_UIM5_VB5<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins u4imm:$UIMM, vrrc:$vB),
+             !strconcat(opc, " $vD, $vB, $UIMM"), IIC_VecGeneral, pattern>;
+
+class VX1_RT5_RA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs g8rc:$rD), (ins g8rc:$rA, vrrc:$vB),
+             !strconcat(opc, " $rD, $rA, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Extract Unsigned
+def VEXTRACTUB : VX1_VT5_UIM5_VB5<525, "vextractub", []>;
+def VEXTRACTUH : VX1_VT5_UIM5_VB5<589, "vextractuh", []>;
+def VEXTRACTUW : VX1_VT5_UIM5_VB5<653, "vextractuw", []>;
+def VEXTRACTD  : VX1_VT5_UIM5_VB5<717, "vextractd" , []>;
+
+// Vector Extract Unsigned Byte/Halfword/Word Left/Right-Indexed
+def VEXTUBLX : VX1_RT5_RA5_VB5<1549, "vextublx", []>;
+def VEXTUBRX : VX1_RT5_RA5_VB5<1805, "vextubrx", []>;
+def VEXTUHLX : VX1_RT5_RA5_VB5<1613, "vextuhlx", []>;
+def VEXTUHRX : VX1_RT5_RA5_VB5<1869, "vextuhrx", []>;
+def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
+def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
+
+// Vector Insert Element Instructions
+def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
+def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
+
+class VX_VT5_EO5_VB5<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+  : VXForm_RD5_XO5_RS5<xo, eo, (outs vrrc:$vD), (ins vrrc:$vB),
+                       !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+class VX_VT5_EO5_VB5s<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+  : VXForm_RD5_XO5_RS5<xo, eo, (outs vfrc:$vD), (ins vfrc:$vB),
+                       !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+
+// Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]
+def VCLZLSBB : VXForm_RD5_XO5_RS5<1538, 0, (outs gprc:$rD), (ins vrrc:$vB),
+                                  "vclzlsbb $rD, $vB", IIC_VecGeneral,
+                                  [(set i32:$rD, (int_ppc_altivec_vclzlsbb
+                                     v16i8:$vB))]>;
+def VCTZLSBB : VXForm_RD5_XO5_RS5<1538, 1, (outs gprc:$rD), (ins vrrc:$vB),
+                                  "vctzlsbb $rD, $vB", IIC_VecGeneral,
+                                  [(set i32:$rD, (int_ppc_altivec_vctzlsbb
+                                     v16i8:$vB))]>;
+// Vector Count Trailing Zeros
+def VCTZB : VX_VT5_EO5_VB5<1538, 28, "vctzb",
+                           [(set v16i8:$vD, (cttz v16i8:$vB))]>;
+def VCTZH : VX_VT5_EO5_VB5<1538, 29, "vctzh",
+                           [(set v8i16:$vD, (cttz v8i16:$vB))]>;
+def VCTZW : VX_VT5_EO5_VB5<1538, 30, "vctzw",
+                           [(set v4i32:$vD, (cttz v4i32:$vB))]>;
+def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd",
+                           [(set v2i64:$vD, (cttz v2i64:$vB))]>;
+
+// Vector Extend Sign
+def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>;
+def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
+def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
+def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
+def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
+let isCodeGenOnly = 1 in {
+  def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>;
+  def VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>;
+  def VEXTSB2Ds : VX_VT5_EO5_VB5s<1538, 24, "vextsb2d", []>;
+  def VEXTSH2Ds : VX_VT5_EO5_VB5s<1538, 25, "vextsh2d", []>;
+  def VEXTSW2Ds : VX_VT5_EO5_VB5s<1538, 26, "vextsw2d", []>;
+}
+
+// Vector Integer Negate
+def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw",
+                           [(set v4i32:$vD,
+                            (sub (v4i32 immAllZerosV), v4i32:$vB))]>;
+
+def VNEGD : VX_VT5_EO5_VB5<1538, 7, "vnegd",
+                           [(set v2i64:$vD,
+                            (sub (v2i64 (bitconvert (v4i32 immAllZerosV))),
+                                  v2i64:$vB))]>;
+
+// Vector Parity Byte
+def VPRTYBW : VX_VT5_EO5_VB5<1538, 8, "vprtybw", [(set v4i32:$vD,
+                            (int_ppc_altivec_vprtybw v4i32:$vB))]>;
+def VPRTYBD : VX_VT5_EO5_VB5<1538,  9, "vprtybd", [(set v2i64:$vD,
+                            (int_ppc_altivec_vprtybd v2i64:$vB))]>;
+def VPRTYBQ : VX_VT5_EO5_VB5<1538, 10, "vprtybq", [(set v1i128:$vD,
+                            (int_ppc_altivec_vprtybq v1i128:$vB))]>;
+
+// Vector (Bit) Permute (Right-indexed)
+def VBPERMD : VXForm_1<1484, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vbpermd $vD, $vA, $vB", IIC_VecFP, []>;
+def VPERMR : VAForm_1a<59, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+                       "vpermr $vD, $vA, $vB, $vC", IIC_VecFP, []>;
+
+class VX1_VT5_VA5_VB5<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern>;
+
+// Vector Rotate Left Mask/Mask-Insert
+def VRLWNM : VX1_VT5_VA5_VB5<389, "vrlwnm",
+                             [(set v4i32:$vD,
+                                 (int_ppc_altivec_vrlwnm v4i32:$vA,
+                                                         v4i32:$vB))]>;
+def VRLWMI : VXForm_1<133, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+                      "vrlwmi $vD, $vA, $vB", IIC_VecFP,
+                      [(set v4i32:$vD,
+                         (int_ppc_altivec_vrlwmi v4i32:$vA, v4i32:$vB,
+                                                 v4i32:$vDi))]>,
+                      RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+def VRLDNM : VX1_VT5_VA5_VB5<453, "vrldnm",
+                             [(set v2i64:$vD,
+                                 (int_ppc_altivec_vrldnm v2i64:$vA,
+                                                         v2i64:$vB))]>;
+def VRLDMI : VXForm_1<197, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
+                      "vrldmi $vD, $vA, $vB", IIC_VecFP,
+                      [(set v2i64:$vD,
+                         (int_ppc_altivec_vrldmi v2i64:$vA, v2i64:$vB,
+                                                 v2i64:$vDi))]>,
+                      RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
+// Vector Shift Left/Right
+def VSLV : VX1_VT5_VA5_VB5<1860, "vslv",
+                           [(set v16i8 : $vD, (int_ppc_altivec_vslv v16i8 : $vA, v16i8 : $vB))]>;
+def VSRV : VX1_VT5_VA5_VB5<1796, "vsrv",
+                           [(set v16i8 : $vD, (int_ppc_altivec_vsrv v16i8 : $vA, v16i8 : $vB))]>;
+
+// Vector Multiply-by-10 (& Write Carry) Unsigned Quadword
+def VMUL10UQ   : VXForm_BX<513, (outs vrrc:$vD), (ins vrrc:$vA),
+                           "vmul10uq $vD, $vA", IIC_VecFP, []>;
+def VMUL10CUQ  : VXForm_BX<  1, (outs vrrc:$vD), (ins vrrc:$vA),
+                           "vmul10cuq $vD, $vA", IIC_VecFP, []>;
+
+// Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword
+def VMUL10EUQ  : VX1_VT5_VA5_VB5<577, "vmul10euq" , []>;
+def VMUL10ECUQ : VX1_VT5_VA5_VB5< 65, "vmul10ecuq", []>;
+
+// Decimal Integer Format Conversion Instructions
+
+// [PO VRT EO VRB 1 PS XO], "_o" means CR6 is set.
+class VX_VT5_EO5_VB5_PS1_XO9_o<bits<5> eo, bits<9> xo, string opc,
+                               list<dag> pattern>
+  : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB, u1imm:$PS),
+                        !strconcat(opc, " $vD, $vB, $PS"), IIC_VecFP, pattern> {
+  let Defs = [CR6];
+}
+
+// [PO VRT EO VRB 1 / XO]
+class VX_VT5_EO5_VB5_XO9_o<bits<5> eo, bits<9> xo, string opc,
+                           list<dag> pattern>
+  : VX_RD5_EO5_RS5_PS1_XO9<eo, xo, (outs vrrc:$vD), (ins vrrc:$vB),
+                           !strconcat(opc, " $vD, $vB"), IIC_VecFP, pattern> {
+  let Defs = [CR6];
+  let PS = 0;
+}
+
+// Decimal Convert From/to National/Zoned/Signed-QWord
+def BCDCFNo  : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
+def BCDCFZo  : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
+def BCDCTNo  : VX_VT5_EO5_VB5_XO9_o    <5, 385, "bcdctn." , []>;
+def BCDCTZo  : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
+def BCDCFSQo : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
+def BCDCTSQo : VX_VT5_EO5_VB5_XO9_o    <0, 385, "bcdctsq.", []>;
+
+// Decimal Copy-Sign/Set-Sign
+let Defs = [CR6] in
+def BCDCPSGNo : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
+
+def BCDSETSGNo : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
+
+// [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
+class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+  : VX_RD5_RSp5_PS1_XO9<xo,
+                   (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u1imm:$PS),
+                   !strconcat(opc, " $vD, $vA, $vB, $PS"), IIC_VecFP, pattern> {
+  let Defs = [CR6];
+}
+
+// [PO VRT VRA VRB 1 / XO]
+class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
+  : VX_RD5_RSp5_PS1_XO9<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                        !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, pattern> {
+  let Defs = [CR6];
+  let PS = 0;
+}
+
+// Decimal Shift/Unsigned-Shift/Shift-and-Round
+def BCDSo :  VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
+def BCDUSo : VX_VT5_VA5_VB5_XO9_o    <129, "bcdus.", []>;
+def BCDSRo : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
+
+// Decimal (Unsigned) Truncate
+def BCDTRUNCo :  VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
+def BCDUTRUNCo : VX_VT5_VA5_VB5_XO9_o    <321, "bcdutrunc.", []>;
+
+// Absolute Difference
+def VABSDUB : VXForm_1<1027, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vabsdub $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v16i8:$vD, (int_ppc_altivec_vabsdub v16i8:$vA, v16i8:$vB))]>;
+def VABSDUH : VXForm_1<1091, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vabsduh $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v8i16:$vD, (int_ppc_altivec_vabsduh v8i16:$vA, v8i16:$vB))]>;
+def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+                       "vabsduw $vD, $vA, $vB", IIC_VecGeneral,
+                       [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
+} // end HasP9Altivec
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 000000000000..cf71b1c59869
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 000000000000..99689f656c2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,1992 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+        : Instruction {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+  let Size = 4;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+
+  /// Indicate that the VSX instruction is to use VSX numbering/encoding.
+  /// Since ISA 3.0, there are scalar instructions that use the upper
+  /// half of the VSX register set only. Rather than adding further complexity
+  /// to the register class set, the VSX registers just include the Altivec
+  /// registers and this flag decides the numbering to be used for them.
+  bits<1> UseVSXReg = 0;
+  let TSFlags{6}   = UseVSXReg;
+
+  // Fields used for relation models.
+  string BaseName = "";
+
+  // For cases where multiple instruction definitions really represent the
+  // same underlying instruction but with one definition for 64-bit arguments
+  // and one for 32-bit arguments, this bit breaks the degeneracy between
+  // the two forms and allows TableGen to generate mapping tables.
+  bit Interpretation64Bit = 0;
+}
+
+class PPC970_DGroup_First   { bits<1> PPC970_First = 1;  }
+class PPC970_DGroup_Single  { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo   { bits<3> PPC970_Unit = 0;   }
+class PPC970_Unit_FXU      { bits<3> PPC970_Unit = 1;   }
+class PPC970_Unit_LSU      { bits<3> PPC970_Unit = 2;   }
+class PPC970_Unit_FPU      { bits<3> PPC970_Unit = 3;   }
+class PPC970_Unit_CRU      { bits<3> PPC970_Unit = 4;   }
+class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
+class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
+class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }
+
+class UseVSXReg { bits<1> UseVSXReg = 1; }
+
+// Two joined instructions; used to emit two adjacent instructions as one.
+// The itinerary from the first instruction is used for scheduling and
+// classification.
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
+         InstrItinClass itin>
+        : Instruction {
+  field bits<64> Inst;
+  field bits<64> SoftFail = 0;
+  let Size = 8;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode1;
+  let Inst{32-37} = opcode2;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+
+  // Fields used for relation models.
+  string BaseName = "";
+  bit Interpretation64Bit = 0;
+}
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  bits<14> BD;
+
+  bits<5> BI;
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+
+  let Inst{6-10}  = BIBO{4-0};
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+             string asmstr>
+  : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+  let BIBO{4-0} = bo;
+  let BIBO{6-5} = 0;
+  let CR = 0;
+}
+
+class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<14> BD;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = bi;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_3<bits<6> opcode, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_3_at<bits<6> opcode, bit aa, bit lk,
+                 dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<5> BO;
+  bits<2> at;
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-8}   = BO{4-2};
+  let Inst{9-10}  = at;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, IIC_BrB> {
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.3 SC-Form
+class SCForm<bits<6> opcode, bits<1> xo,
+                     dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+                     list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<7>  LEV;
+
+  let Pattern = pattern;
+
+  let Inst{20-26} = LEV;
+  let Inst{30}    = xo;
+}
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<5>  B;
+  bits<16> C;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = Addr{20-16}; // Base Reg
+  let Inst{16-31} = Addr{15-0};  // Displacement
+}
+
+class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> C;
+  bits<5>  B;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> {
+
+  // Even though ADDICo does not really have an RC bit, provide
+  // the declaration of one here so that isDOT has something to set.
+  bit RC = 0;
+}
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> B;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = 0;
+  let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  B;
+  bits<5>  A;
+  bits<16> C;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+              
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL,
+                            string asmstr, InstrItinClass itin,
+                            list<dag> pattern>
+  : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = R;
+  let B = R;
+  let C = 0; 
+}
+
+class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+
+  let Inst{38-42}  = A;
+  let Inst{43-47} = Addr{20-16}; // Base Reg
+  let Inst{48-63} = Addr{15-0};  // Displacement
+}
+
+// This is used to emit BL8+NOP.
+class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         :  IForm_and_DForm_1<opcode1, aa, lk, opcode2,
+                              OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3>  BF;
+  bits<1>  L;
+  bits<5>  RA;
+  bits<16> I;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> 
+  : DForm_5<opcode, OOL, IOL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RST;
+  bits<19> DS_RA;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = DS_RA{18-14};  // Register #
+  let Inst{16-29} = DS_RA{13-0};   // Displacement.
+  let Inst{30-31} = xo;
+}
+
+// DQ-Form: [PO T RA DQ TX XO] or [PO S RA DQ SX XO]
+class DQ_RD6_RS5_DQ12<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+                      string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6>  XT;
+  bits<17> DS_RA;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = DS_RA{16-12};  // Register #
+  let Inst{16-27} = DS_RA{11-0};   // Displacement.
+  let Inst{28}    = XT{5};
+  let Inst{29-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> {
+  let RST = 0;
+}
+
+class XForm_attn<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Inst{21-30} = xo;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+        <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+        InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RST;
+  bits<5> B;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+}
+
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<1> WS;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{20}    = WS;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+    let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let B = 0;
+  let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L; 
+  bits<5> RA;
+  bits<5> RB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<4> CT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Inst{6} = 0;
+  let Inst{7-10} = CT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31} = 0;
+}
+
+class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<4> SR;
+
+  let Inst{6-10} = RS;
+  let Inst{12-15} = SR;
+  let Inst{21-30} = xo;
+}
+
+class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> MO;
+
+  let Inst{6-10} = MO;
+  let Inst{21-30} = xo;
+}
+
+class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<5> RB;
+
+  let Inst{6-10} = RS;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+}
+
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<1> L;
+
+  let Inst{6-10} = RS;
+  let Inst{15} = L;
+  let Inst{21-30} = xo;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin>
+  : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> FRA;
+  bits<5> FRB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// Used for QPX
+class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_19<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_18<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRA = 0;
+}
+
+class XForm_20<bits<6> opcode, bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+  bits<4> tttt;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-24} = tttt;
+  let Inst{25-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 31;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<2> L;
+
+  let Pattern = pattern;
+  let Inst{6-8}   = 0;
+  let Inst{9-10}  = L;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let L = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// This is used for MFFS, MTFSB0, MTFSB1.  42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+  bits<5> FM;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FM;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+}
+
+class XForm_htm0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                 string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bit R;
+
+  bit RC = 1;
+
+  let Inst{6-9}   = 0;
+  let Inst{10}    = R;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_htm1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                 string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bit A;
+
+  bit RC = 1;
+
+  let Inst{6}     = A;
+  let Inst{7-20}  = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bit L;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{7-9}   = 0;
+  let Inst{10}    = L;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XForm_htm3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+
+  bit RC = 0;
+
+  let Inst{6-8}   = BF;
+  let Inst{9-20}  = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// [PO RT RA RB XO /]
+class X_BF3_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                       string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// Same as XForm_17 but with GPR's and new naming convention
+class X_BF3_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmstr, InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// e.g. [PO VRT XO VRB XO /] or [PO VRT XO VRB XO RO]
+class X_RD5_XO5_RS5<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL,
+                    string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = xo2;
+}
+
+class X_BF3_DCMX7_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                      string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<7> DCMX;
+  bits<5> VB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}  = BF;
+  let Inst{9-15} = DCMX;
+  let Inst{16-20} = VB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class X_RD6_IMM8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                 string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<8> IMM8;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-12} = 0;
+  let Inst{13-20} = IMM8;
+  let Inst{21-30} = xo;
+  let Inst{31}    = XT{5};
+}
+
+// XForm_base_r3xo for instructions such as P9 atomics where we don't want
+// to specify an SDAG pattern for matching.
+class X_RD5_RS5_IM5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmstr, InstrItinClass itin>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, []> {
+}
+
+class X_BF3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin>
+  : XForm_17<opcode, xo, OOL, IOL, asmstr, itin> {
+  let FRA = 0;
+  let FRB = 0;
+}
+
+// [PO /// L RA RB XO /]
+class X_L1_RS5_RS5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                   string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+  let BF = 0;
+  let Pattern = pattern;
+
+  bit RC = 0;
+  let Inst{31} = RC;
+}
+
+// XX*-Form (VSX)
+class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = XT{5};
+}
+
+class XX1_RS6_RD5_XO<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XX1Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let B = 0;
+}
+
+class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = 0;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> CR;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = CR;
+  let Inst{9-15}  = 0;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+  bits<2> D;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-13} = 0;
+  let Inst{14-15} = D;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX2_RD6_UIM5_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+                       string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+  bits<5> UIM5;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = UIM5;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+// [PO T XO B XO BX /]
+class XX2_RD5_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+                       string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = xo2;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+// [PO T XO B XO BX TX]
+class XX2_RD6_XO5_RS6<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, dag IOL,
+                      string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = xo2;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX2_BF3_DCMX7_RS6<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+                      string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<7> DCMX;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}  = BF;
+  let Inst{9-15} = DCMX;
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+class XX2_RD6_DCMX7_RS6<bits<6> opcode, bits<4> xo1, bits<3> xo2,
+                        dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+                        list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<7> DCMX;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = DCMX{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-24} = xo1;
+  let Inst{25}    = DCMX{5};
+  let Inst{26-28} = xo2;
+  let Inst{29}    = DCMX{6};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let XA = XT;
+  let XB = XT;
+}
+
+class XX3Form_SetZero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let XB = XT;
+  let XA = XT;
+}
+
+class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> CR;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  let Inst{6-8}   = CR;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = 0;
+}
+
+class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+                InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<2> D;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21}    = 0;
+  let Inst{22-23} = D;
+  let Inst{24-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21}    = RC;
+  let Inst{22-28} = xo;
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<6> XT;
+  bits<6> XA;
+  bits<6> XB;
+  bits<6> XC;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = XT{4-0};
+  let Inst{11-15} = XA{4-0};
+  let Inst{16-20} = XB{4-0};
+  let Inst{21-25} = XC{4-0};
+  let Inst{26-27} = xo;
+  let Inst{28}    = XC{5};
+  let Inst{29}    = XA{5};
+  let Inst{30}    = XB{5};
+  let Inst{31}    = XT{5};
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = immfield;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class DCB_Form_hint<bits<10> xo, dag OOL, dag IOL, string asmstr,
+                    InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<5> TH;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = TH;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions.
+class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<2> STRM;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6}     = T;
+  let Inst{7-8}   = 0;
+  let Inst{9-10}  = STRM;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  bits<5> CRA;
+  bits<5> CRB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRA;
+  let Inst{16-20} = CRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_1_np<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin, list<dag> pattern>
+  : XLForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let CRD = 0;
+  let CRA = 0;
+  let CRB = 0;
+}
+
+class XLForm_1_gen<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : XLForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  bits<5> RT;
+  bits<5> RB;
+
+  let CRD = RT;
+  let CRA = 0;
+  let CRB = RB;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRD;
+  let Inst{16-20} = CRD;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, 
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo;
+  let Inst{31}    = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  
+  let BO = BIBO{4-0};
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+  let BH = 0;
+}
+
+class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk,
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BH = 0;
+}
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo,  bits<5> bi, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<3> BFA;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-13} = BFA;
+  let Inst{14-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_4<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bit W;
+  bits<4> U;
+  
+  bit RC = 0;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-14} = 0;
+  let Inst{15}    = W;
+  let Inst{16-19} = U;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XLForm_S<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<1> S;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-19}  = 0;
+  let Inst{20}    = S;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2_and_DSForm_1<bits<6> opcode1, bits<10> xo1, bit lk,
+                            bits<6> opcode2, bits<2> xo2,
+                            dag OOL, dag IOL, string asmstr,
+                            InstrItinClass itin, list<dag> pattern>
+        : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+
+  bits<5>  RST;
+  bits<19> DS_RA;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo1;
+  let Inst{31}    = lk;
+
+  let Inst{38-42} = RST;
+  let Inst{43-47} = DS_RA{18-14};  // Register #
+  let Inst{48-61} = DS_RA{13-0};   // Displacement.
+  let Inst{62-63} = xo2;
+}
+
+class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
+                                bits<5> bo, bits<5> bi, bit lk,
+                                bits<6> opcode2, bits<2> xo2,
+                                dag OOL, dag IOL, string asmstr,
+                                InstrItinClass itin, list<dag> pattern>
+  : XLForm_2_and_DSForm_1<opcode1, xo1, lk, opcode2, xo2,
+                          OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> SPR;
+
+  let Inst{6-10}  = RT;
+  let Inst{11}    = SPR{4};
+  let Inst{12}    = SPR{3};
+  let Inst{13}    = SPR{2};
+  let Inst{14}    = SPR{1};
+  let Inst{15}    = SPR{0};
+  let Inst{16}    = SPR{9};
+  let Inst{17}    = SPR{8};
+  let Inst{18}    = SPR{7};
+  let Inst{19}    = SPR{6};
+  let Inst{20}    = SPR{5};
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+   
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_3p<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> Entry;
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = Entry;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8>  FXM;
+  bits<5>  rS;
+   
+  let Inst{6-10}  = rS;
+  let Inst{11}    = 0;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  ST;
+  bits<8>  FXM;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 1;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                    dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably 1.7.9, but I don't have the reference that uses this
+// numbering scheme...
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8> FM;
+  bits<5> rT;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6} = 0;
+  let Inst{7-14}  = FM;
+  let Inst{15} = 0;
+  let Inst{16-20} = rT;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bit L;
+  bits<8> FLM;
+  bit W;
+  bits<5> FRB;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6}     = L;
+  let Inst{7-14}  = FLM;
+  let Inst{15}    = W;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RS;
+  bits<6> SH;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = A;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21}    = oe;
+  let Inst{22-30} = xo;
+  let Inst{31}    = RC;  
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, 
+               dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> {
+  let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRC;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-25} = FRC;
+  let Inst{26-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+  bits<5> COND;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = COND;
+  let Inst{26-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// Used for QPX
+class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRA = 0;
+  let FRC = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<5> MB;
+  bits<5> ME;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = MB;
+  let Inst{26-30} = ME;
+  let Inst{31}    = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<6> SH;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VC;
+  bits<5> VB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<5> VC;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<4> SH;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 0;
+  let Inst{22-25} = SH;
+  let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+  let VA = VD;
+  let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> IMM;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// e.g. [PO VRT EO VRB XO]
+class VXForm_RD5_XO5_RS5<bits<11> xo, bits<5> eo, dag OOL, dag IOL,
+                         string asmstr, InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RD;
+  bits<5> VB;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RD;
+  let Inst{11-15} = eo;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_CR - VX crypto instructions with "VRT, VRA, ST, SIX"
+class VXForm_CR<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<1> ST;
+  bits<4> SIX;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16} =  ST;
+  let Inst{17-20} = SIX;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_BX - VX crypto instructions with "VRT, VRA, 0 - like vsbox"
+class VXForm_BX<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit RC = 0;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = RC;
+  let Inst{22-31} = xo;
+}
+
+// VX-Form: [PO VRT EO VRB 1 PS XO]
+class VX_RD5_EO5_RS5_PS1_XO9<bits<5> eo, bits<9> xo,
+                             dag OOL, dag IOL, string asmstr,
+                             InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  bit PS;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = eo;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 1;
+  let Inst{22}    = PS;
+  let Inst{23-31} = xo;
+}
+
+// VX-Form: [PO VRT VRA VRB 1 PS XO] or [PO VRT VRA VRB 1 / XO]
+class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr,
+                          InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit PS;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 1;
+  let Inst{22}    = PS;
+  let Inst{23-31} = xo;
+}
+
+// Z23-Form (used by QPX)
+class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRB;
+  bits<2> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : Z23Form_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<12> idx;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-22} = idx;
+  let Inst{23-30} = xo;
+  let Inst{31}    = RC;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+    : I<0, OOL, IOL, asmstr, NoItinerary> {
+  let isCodeGenOnly = 1;
+  let PPC64 = 0;
+  let Pattern = pattern;
+  let Inst{31-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td
new file mode 100644
index 000000000000..6c4e2129087c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -0,0 +1,172 @@
+//===-- PPCInstrHTM.td - The PowerPC Hardware Transactional Memory  -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hardware Transactional Memory extension to the
+// PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+def HasHTM : Predicate<"PPCSubTarget->hasHTM()">;
+
+def HTM_get_imm : SDNodeXForm<imm, [{
+  return getI32Imm (N->getZExtValue(), SDLoc(N));
+}]>;
+
+let hasSideEffects = 1, usesCustomInserter = 1  in {
+def TCHECK_RET : Pseudo<(outs crrc:$out), (ins), "#TCHECK_RET", []>;
+}
+
+
+let Predicates = [HasHTM] in {
+
+def TBEGIN : XForm_htm0 <31, 654,
+                         (outs crrc0:$ret), (ins u1imm:$R), "tbegin. $R", IIC_SprMTSPR, []>;
+
+def TEND : XForm_htm1 <31, 686,
+                       (outs crrc0:$ret), (ins u1imm:$A), "tend. $A", IIC_SprMTSPR, []>;
+
+def TABORT : XForm_base_r3xo <31, 910,
+                              (outs crrc0:$ret), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
+                              []>, isDOT {
+  let RST = 0;
+  let B = 0;
+}
+
+def TABORTWC : XForm_base_r3xo <31, 782,
+                                (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+                                "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>,
+                                isDOT;
+
+def TABORTWCI : XForm_base_r3xo <31, 846,
+                                 (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+                                 "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>,
+                                 isDOT;
+
+def TABORTDC : XForm_base_r3xo <31, 814,
+                                (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, gprc:$B),
+                                "tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>,
+                                isDOT;
+
+def TABORTDCI : XForm_base_r3xo <31, 878,
+                                 (outs crrc0:$ret), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
+                                 "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>,
+                                 isDOT;
+
+def TSR : XForm_htm2 <31, 750,
+                      (outs crrc0:$ret), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
+                      isDOT;
+
+def TCHECK : XForm_htm3 <31, 718,
+                        (outs), (ins crrc:$BF), "tcheck $BF", IIC_SprMTSPR, []>;
+
+
+def TRECLAIM : XForm_base_r3xo <31, 942,
+                                (outs crrc:$ret), (ins gprc:$A), "treclaim. $A",
+                                IIC_SprMTSPR, []>,
+                                isDOT {
+  let RST = 0;
+  let B = 0;
+}
+
+def TRECHKPT : XForm_base_r3xo <31, 1006,
+                                (outs crrc:$ret), (ins), "trechkpt.", IIC_SprMTSPR, []>,
+                                isDOT {
+  let RST = 0;
+  let A = 0;
+  let B = 0;
+}
+
+// Builtins
+
+// All HTM instructions, with the exception of tcheck, set CR0 with the
+// value of the MSR Transaction State (TS) bits that exist before the
+// instruction is executed.  For tbegin., the EQ bit in CR0 can be used
+// to determine whether the transaction was successfully started (0) or
+// failed (1).  We use an XORI pattern to 'flip' the bit to match the
+// tbegin builtin API which defines a return value of 1 as success.
+
+def : Pat<(int_ppc_tbegin i32:$R),
+           (XORI
+             (EXTRACT_SUBREG (
+               TBEGIN (HTM_get_imm imm:$R)), sub_eq),
+            1)>;
+
+def : Pat<(int_ppc_tend i32:$R),
+          (TEND (HTM_get_imm imm:$R))>;
+
+
+def : Pat<(int_ppc_tabort i32:$R),
+          (TABORT $R)>;
+
+def : Pat<(int_ppc_tabortwc i32:$TO, i32:$RA, i32:$RB),
+          (TABORTWC (HTM_get_imm imm:$TO), $RA, $RB)>;
+
+def : Pat<(int_ppc_tabortwci i32:$TO, i32:$RA, i32:$SI),
+          (TABORTWCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>;
+
+def : Pat<(int_ppc_tabortdc i32:$TO, i32:$RA, i32:$RB),
+          (TABORTDC (HTM_get_imm imm:$TO), $RA, $RB)>;
+
+def : Pat<(int_ppc_tabortdci i32:$TO, i32:$RA, i32:$SI),
+          (TABORTDCI (HTM_get_imm imm:$TO), $RA, (HTM_get_imm imm:$SI))>;
+
+def : Pat<(int_ppc_tcheck),
+          (TCHECK_RET)>;
+
+def : Pat<(int_ppc_treclaim i32:$RA),
+          (TRECLAIM $RA)>;
+
+def : Pat<(int_ppc_trechkpt),
+          (TRECHKPT)>;
+
+def : Pat<(int_ppc_tsr i32:$L),
+          (TSR (HTM_get_imm imm:$L))>;
+
+def : Pat<(int_ppc_get_texasr),
+          (MFSPR8 130)>;
+
+def : Pat<(int_ppc_get_texasru),
+          (MFSPR8 131)>;
+
+def : Pat<(int_ppc_get_tfhar),
+          (MFSPR8 128)>;
+
+def : Pat<(int_ppc_get_tfiar),
+          (MFSPR8 129)>;
+
+
+def : Pat<(int_ppc_set_texasr i64:$V),
+          (MTSPR8 130, $V)>;
+
+def : Pat<(int_ppc_set_texasru i64:$V),
+          (MTSPR8 131, $V)>;
+
+def : Pat<(int_ppc_set_tfhar i64:$V),
+          (MTSPR8 128, $V)>;
+
+def : Pat<(int_ppc_set_tfiar i64:$V),
+          (MTSPR8 129, $V)>;
+
+
+// Extended mnemonics
+def : Pat<(int_ppc_tendall),
+          (TEND 1)>;
+
+def : Pat<(int_ppc_tresume),
+          (TSR 1)>;
+
+def : Pat<(int_ppc_tsuspend),
+          (TSR 0)>;
+
+def : Pat<(i64 (int_ppc_ttest)),
+          (RLDICL (i64 (COPY (TABORTWCI 0, ZERO, 0))), 36, 28)>;
+
+} // [HasHTM]
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 000000000000..2e0b9355f82b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,1933 @@
+//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-instr-info"
+
+#define GET_INSTRMAP_INFO
+#define GET_INSTRINFO_CTOR_DTOR
+#include "PPCGenInstrInfo.inc"
+
+static cl::
+opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
+            cl::desc("Disable analysis for CTR loops"));
+
+static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
+cl::desc("Disable compare instruction optimization"), cl::Hidden);
+
+static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy",
+cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
+cl::Hidden);
+
+static cl::opt<bool>
+UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
+  cl::desc("Use the old (incorrect) instruction latency calculation"));
+
+// Pin the vtable to this file.
+void PPCInstrInfo::anchor() {}
+
+PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
+    : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+      Subtarget(STI), RI(STI.getTargetMachine()) {}
+
+/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+/// this target when scheduling the DAG.
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                                           const ScheduleDAG *DAG) const {
+  unsigned Directive =
+      static_cast<const PPCSubtarget *>(STI)->getDarwinDirective();
+  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
+    const InstrItineraryData *II =
+        static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG);
+  }
+
+  return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
+}
+
+/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
+/// to use for this target when scheduling the DAG.
+ScheduleHazardRecognizer *
+PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                                 const ScheduleDAG *DAG) const {
+  unsigned Directive =
+      DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+
+  // FIXME: Leaving this as-is until we have POWER9 scheduling info
+  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
+    return new PPCDispatchGroupSBHazardRecognizer(II, DAG);
+
+  // Most subtargets use a PPC970 recognizer.
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
+    assert(DAG->TII && "No InstrInfo?");
+
+    return new PPCHazardRecognizer970(*DAG);
+  }
+
+  return new ScoreboardHazardRecognizer(II, DAG);
+}
+
+unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                       const MachineInstr &MI,
+                                       unsigned *PredCost) const {
+  if (!ItinData || UseOldLatencyCalc)
+    return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
+
+  // The default implementation of getInstrLatency calls getStageLatency, but
+  // getStageLatency does not do the right thing for us. While we have
+  // itinerary, most cores are fully pipelined, and so the itineraries only
+  // express the first part of the pipeline, not every stage. Instead, we need
+  // to use the listed output operand cycle number (using operand 0 here, which
+  // is an output).
+
+  unsigned Latency = 1;
+  unsigned DefClass = MI.getDesc().getSchedClass();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+      continue;
+
+    int Cycle = ItinData->getOperandCycle(DefClass, i);
+    if (Cycle < 0)
+      continue;
+
+    Latency = std::max(Latency, (unsigned) Cycle);
+  }
+
+  return Latency;
+}
+
+int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr &DefMI, unsigned DefIdx,
+                                    const MachineInstr &UseMI,
+                                    unsigned UseIdx) const {
+  int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+                                                   UseMI, UseIdx);
+
+  if (!DefMI.getParent())
+    return Latency;
+
+  const MachineOperand &DefMO = DefMI.getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+
+  bool IsRegCR;
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const MachineRegisterInfo *MRI =
+        &DefMI.getParent()->getParent()->getRegInfo();
+    IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) ||
+              MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass);
+  } else {
+    IsRegCR = PPC::CRRCRegClass.contains(Reg) ||
+              PPC::CRBITRCRegClass.contains(Reg);
+  }
+
+  if (UseMI.isBranch() && IsRegCR) {
+    if (Latency < 0)
+      Latency = getInstrLatency(ItinData, DefMI);
+
+    // On some cores, there is an additional delay between writing to a condition
+    // register, and using it from a branch.
+    unsigned Directive = Subtarget.getDarwinDirective();
+    switch (Directive) {
+    default: break;
+    case PPC::DIR_7400:
+    case PPC::DIR_750:
+    case PPC::DIR_970:
+    case PPC::DIR_E5500:
+    case PPC::DIR_PWR4:
+    case PPC::DIR_PWR5:
+    case PPC::DIR_PWR5X:
+    case PPC::DIR_PWR6:
+    case PPC::DIR_PWR6X:
+    case PPC::DIR_PWR7:
+    case PPC::DIR_PWR8:
+    // FIXME: Is this needed for POWER9?
+      Latency += 2;
+      break;
+    }
+  }
+
+  return Latency;
+}
+
+// This function does not list all associative and commutative operations, but
+// only those worth feeding through the machine combiner in an attempt to
+// reduce the critical path. Mostly, this means floating-point operations,
+// because they have high latencies (compared to other operations, such and
+// and/or, which are also associative and commutative, but have low latencies).
+bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  // FP Add:
+  case PPC::FADD:
+  case PPC::FADDS:
+  // FP Multiply:
+  case PPC::FMUL:
+  case PPC::FMULS:
+  // Altivec Add:
+  case PPC::VADDFP:
+  // VSX Add:
+  case PPC::XSADDDP:
+  case PPC::XVADDDP:
+  case PPC::XVADDSP:
+  case PPC::XSADDSP:
+  // VSX Multiply:
+  case PPC::XSMULDP:
+  case PPC::XVMULDP:
+  case PPC::XVMULSP:
+  case PPC::XSMULSP:
+  // QPX Add:
+  case PPC::QVFADD:
+  case PPC::QVFADDS:
+  case PPC::QVFADDSs:
+  // QPX Multiply:
+  case PPC::QVFMUL:
+  case PPC::QVFMULS:
+  case PPC::QVFMULSs:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool PPCInstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // Using the machine combiner in this way is potentially expensive, so
+  // restrict to when aggressive optimizations are desired.
+  if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
+    return false;
+
+  // FP reassociation is only legal when we don't need strict IEEE semantics.
+  if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
+    return false;
+
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
+// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                         unsigned &SrcReg, unsigned &DstReg,
+                                         unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: return false;
+  case PPC::EXTSW:
+  case PPC::EXTSW_32_64:
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = PPC::sub_32;
+    return true;
+  }
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex) const {
+  // Note: This list must be kept consistent with LoadRegFromStackSlot.
+  switch (MI.getOpcode()) {
+  default: break;
+  case PPC::LD:
+  case PPC::LWZ:
+  case PPC::LFS:
+  case PPC::LFD:
+  case PPC::RESTORE_CR:
+  case PPC::RESTORE_CRBIT:
+  case PPC::LVX:
+  case PPC::LXVD2X:
+  case PPC::LXVX:
+  case PPC::QVLFDX:
+  case PPC::QVLFSXs:
+  case PPC::QVLFDXb:
+  case PPC::RESTORE_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+        MI.getOperand(2).isFI()) {
+      FrameIndex = MI.getOperand(2).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+  // Note: This list must be kept consistent with StoreRegToStackSlot.
+  switch (MI.getOpcode()) {
+  default: break;
+  case PPC::STD:
+  case PPC::STW:
+  case PPC::STFS:
+  case PPC::STFD:
+  case PPC::SPILL_CR:
+  case PPC::SPILL_CRBIT:
+  case PPC::STVX:
+  case PPC::STXVD2X:
+  case PPC::STXVX:
+  case PPC::QVSTFDX:
+  case PPC::QVSTFSXs:
+  case PPC::QVSTFDXb:
+  case PPC::SPILL_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI.getOperand(1).isImm() && !MI.getOperand(1).getImm() &&
+        MI.getOperand(2).isFI()) {
+      FrameIndex = MI.getOperand(2).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                                   unsigned OpIdx1,
+                                                   unsigned OpIdx2) const {
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  // Normal instructions can be commuted the obvious way.
+  if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo)
+    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
+  // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
+  // changing the relative order of the mask operands might change what happens
+  // to the high-bits of the mask (and, thus, the result).
+
+  // Cannot commute if it has a non-zero rotate count.
+  if (MI.getOperand(3).getImm() != 0)
+    return nullptr;
+
+  // If we have a zero rotate count, we have:
+  //   M = mask(MB,ME)
+  //   Op0 = (Op1 & ~M) | (Op2 & M)
+  // Change this to:
+  //   M = mask((ME+1)&31, (MB-1)&31)
+  //   Op0 = (Op2 & ~M) | (Op1 & M)
+
+  // Swap op1/op2
+  assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
+         "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
+  unsigned Reg0 = MI.getOperand(0).getReg();
+  unsigned Reg1 = MI.getOperand(1).getReg();
+  unsigned Reg2 = MI.getOperand(2).getReg();
+  unsigned SubReg1 = MI.getOperand(1).getSubReg();
+  unsigned SubReg2 = MI.getOperand(2).getSubReg();
+  bool Reg1IsKill = MI.getOperand(1).isKill();
+  bool Reg2IsKill = MI.getOperand(2).isKill();
+  bool ChangeReg0 = false;
+  // If machine instrs are no longer in two-address forms, update
+  // destination register as well.
+  if (Reg0 == Reg1) {
+    // Must be two address instruction!
+    assert(MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+           "Expecting a two-address instruction!");
+    assert(MI.getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch");
+    Reg2IsKill = false;
+    ChangeReg0 = true;
+  }
+
+  // Masks.
+  unsigned MB = MI.getOperand(4).getImm();
+  unsigned ME = MI.getOperand(5).getImm();
+
+  // We can't commute a trivial mask (there is no way to represent an all-zero
+  // mask).
+  if (MB == 0 && ME == 31)
+    return nullptr;
+
+  if (NewMI) {
+    // Create a new instruction.
+    unsigned Reg0 = ChangeReg0 ? Reg2 : MI.getOperand(0).getReg();
+    bool Reg0IsDead = MI.getOperand(0).isDead();
+    return BuildMI(MF, MI.getDebugLoc(), MI.getDesc())
+        .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+        .addReg(Reg2, getKillRegState(Reg2IsKill))
+        .addReg(Reg1, getKillRegState(Reg1IsKill))
+        .addImm((ME + 1) & 31)
+        .addImm((MB - 1) & 31);
+  }
+
+  if (ChangeReg0) {
+    MI.getOperand(0).setReg(Reg2);
+    MI.getOperand(0).setSubReg(SubReg2);
+  }
+  MI.getOperand(2).setReg(Reg1);
+  MI.getOperand(1).setReg(Reg2);
+  MI.getOperand(2).setSubReg(SubReg1);
+  MI.getOperand(1).setSubReg(SubReg2);
+  MI.getOperand(2).setIsKill(Reg1IsKill);
+  MI.getOperand(1).setIsKill(Reg2IsKill);
+
+  // Swap the mask around.
+  MI.getOperand(4).setImm((ME + 1) & 31);
+  MI.getOperand(5).setImm((MB - 1) & 31);
+  return &MI;
+}
+
+bool PPCInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                                         unsigned &SrcOpIdx2) const {
+  // For VSX A-Type FMA instructions, it is the first two operands that can be
+  // commuted, however, because the non-encoded tied input operand is listed
+  // first, the operands to swap are actually the second and third.
+
+  int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
+  if (AltOpc == -1)
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+  // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
+  // and SrcOpIdx2.
+  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  // This function is used for scheduling, and the nop wanted here is the type
+  // that terminates dispatch groups on the POWER cores.
+  unsigned Directive = Subtarget.getDarwinDirective();
+  unsigned Opcode;
+  switch (Directive) {
+  default:            Opcode = PPC::NOP; break;
+  case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break;
+  case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break;
+  case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */
+  // FIXME: Update when POWER9 scheduling model is ready.
+  case PPC::DIR_PWR9: Opcode = PPC::NOP_GT_PWR7; break;
+  }
+
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Opcode));
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(PPC::NOP);
+}
+
+// Branch analysis.
+// Note: If the condition register is set to CTR or CTR8 then this is a
+// BDNZ (imm == 1) or BDZ (imm == 0) branch.
+bool PPCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  bool isPPC64 = Subtarget.isPPC64();
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false;
+
+  if (!isUnpredicatedTerminator(*I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr &LastInst = *I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (LastInst.getOpcode() == PPC::B) {
+      if (!LastInst.getOperand(0).isMBB())
+        return true;
+      TBB = LastInst.getOperand(0).getMBB();
+      return false;
+    } else if (LastInst.getOpcode() == PPC::BCC) {
+      if (!LastInst.getOperand(2).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst.getOperand(2).getMBB();
+      Cond.push_back(LastInst.getOperand(0));
+      Cond.push_back(LastInst.getOperand(1));
+      return false;
+    } else if (LastInst.getOpcode() == PPC::BC) {
+      if (!LastInst.getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst.getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+      Cond.push_back(LastInst.getOperand(0));
+      return false;
+    } else if (LastInst.getOpcode() == PPC::BCn) {
+      if (!LastInst.getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst.getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+      Cond.push_back(LastInst.getOperand(0));
+      return false;
+    } else if (LastInst.getOpcode() == PPC::BDNZ8 ||
+               LastInst.getOpcode() == PPC::BDNZ) {
+      if (!LastInst.getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst.getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(1));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    } else if (LastInst.getOpcode() == PPC::BDZ8 ||
+               LastInst.getOpcode() == PPC::BDZ) {
+      if (!LastInst.getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst.getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(0));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    }
+
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr &SecondLastInst = *I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+
+  // If the block ends with PPC::B and PPC:BCC, handle it.
+  if (SecondLastInst.getOpcode() == PPC::BCC &&
+      LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(2).isMBB() ||
+        !LastInst.getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst.getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst.getOperand(0));
+    Cond.push_back(SecondLastInst.getOperand(1));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst.getOpcode() == PPC::BC &&
+             LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(1).isMBB() ||
+        !LastInst.getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst.getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(SecondLastInst.getOperand(0));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst.getOpcode() == PPC::BCn &&
+             LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(1).isMBB() ||
+        !LastInst.getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst.getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+    Cond.push_back(SecondLastInst.getOperand(0));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst.getOpcode() == PPC::BDNZ8 ||
+              SecondLastInst.getOpcode() == PPC::BDNZ) &&
+             LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(0).isMBB() ||
+        !LastInst.getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst.getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(1));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst.getOpcode() == PPC::BDZ8 ||
+              SecondLastInst.getOpcode() == PPC::BDZ) &&
+             LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(0).isMBB() ||
+        !LastInst.getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst.getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst.getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two PPC:Bs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst.getOpcode() == PPC::B && LastInst.getOpcode() == PPC::B) {
+    if (!SecondLastInst.getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst.getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned PPCInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return 0;
+
+  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "PPC branch conditions have two components!");
+  assert(!BytesAdded && "code size not handled");
+
+  bool isPPC64 = Subtarget.isPPC64();
+
+  // One-way branch.
+  if (!FBB) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
+    else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+      BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                              (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                              (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+    else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+      BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+    else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+      BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+    else                // Conditional branch
+      BuildMI(&MBB, DL, get(PPC::BCC))
+        .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+    BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                            (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                            (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+  else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
+    BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+  else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
+    BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+  else
+    BuildMI(&MBB, DL, get(PPC::BCC))
+      .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+  BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
+  return 2;
+}
+
+// Select analysis.
+bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                ArrayRef<MachineOperand> Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  if (!Subtarget.hasISEL())
+    return false;
+
+  if (Cond.size() != 2)
+    return false;
+
+  // If this is really a bdnz-like condition, then it cannot be turned into a
+  // select.
+  if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // isel is for regular integer GPRs only.
+  if (!PPC::GPRCRegClass.hasSubClassEq(RC) &&
+      !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) &&
+      !PPC::G8RCRegClass.hasSubClassEq(RC) &&
+      !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC))
+    return false;
+
+  // FIXME: These numbers are for the A2, how well they work for other cores is
+  // an open question. On the A2, the isel instruction has a 2-cycle latency
+  // but single-cycle throughput. These numbers are used in combination with
+  // the MispredictPenalty setting from the active SchedMachineModel.
+  CondCycles = 1;
+  TrueCycles = 1;
+  FalseCycles = 1;
+
+  return true;
+}
+
+void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                const DebugLoc &dl, unsigned DestReg,
+                                ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                                unsigned FalseReg) const {
+  assert(Cond.size() == 2 &&
+         "PPC branch conditions have two components!");
+
+  assert(Subtarget.hasISEL() &&
+         "Cannot insert select on target without ISEL support");
+
+  // Get the register classes.
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  assert(RC && "TrueReg and FalseReg must have overlapping register classes");
+
+  bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) ||
+                 PPC::G8RC_NOX0RegClass.hasSubClassEq(RC);
+  assert((Is64Bit ||
+          PPC::GPRCRegClass.hasSubClassEq(RC) ||
+          PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) &&
+         "isel is for regular integer GPRs only");
+
+  unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
+  auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm());
+
+  unsigned SubIdx = 0;
+  bool SwapOps = false;
+  switch (SelectPred) {
+  case PPC::PRED_EQ:
+  case PPC::PRED_EQ_MINUS:
+  case PPC::PRED_EQ_PLUS:
+      SubIdx = PPC::sub_eq; SwapOps = false; break;
+  case PPC::PRED_NE:
+  case PPC::PRED_NE_MINUS:
+  case PPC::PRED_NE_PLUS:
+      SubIdx = PPC::sub_eq; SwapOps = true; break;
+  case PPC::PRED_LT:
+  case PPC::PRED_LT_MINUS:
+  case PPC::PRED_LT_PLUS:
+      SubIdx = PPC::sub_lt; SwapOps = false; break;
+  case PPC::PRED_GE:
+  case PPC::PRED_GE_MINUS:
+  case PPC::PRED_GE_PLUS:
+      SubIdx = PPC::sub_lt; SwapOps = true; break;
+  case PPC::PRED_GT:
+  case PPC::PRED_GT_MINUS:
+  case PPC::PRED_GT_PLUS:
+      SubIdx = PPC::sub_gt; SwapOps = false; break;
+  case PPC::PRED_LE:
+  case PPC::PRED_LE_MINUS:
+  case PPC::PRED_LE_PLUS:
+      SubIdx = PPC::sub_gt; SwapOps = true; break;
+  case PPC::PRED_UN:
+  case PPC::PRED_UN_MINUS:
+  case PPC::PRED_UN_PLUS:
+      SubIdx = PPC::sub_un; SwapOps = false; break;
+  case PPC::PRED_NU:
+  case PPC::PRED_NU_MINUS:
+  case PPC::PRED_NU_PLUS:
+      SubIdx = PPC::sub_un; SwapOps = true; break;
+  case PPC::PRED_BIT_SET:   SubIdx = 0; SwapOps = false; break;
+  case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
+  }
+
+  unsigned FirstReg =  SwapOps ? FalseReg : TrueReg,
+           SecondReg = SwapOps ? TrueReg  : FalseReg;
+
+  // The first input register of isel cannot be r0. If it is a member
+  // of a register class that can be r0, then copy it first (the
+  // register allocator should eliminate the copy).
+  if (MRI.getRegClass(FirstReg)->contains(PPC::R0) ||
+      MRI.getRegClass(FirstReg)->contains(PPC::X0)) {
+    const TargetRegisterClass *FirstRC =
+      MRI.getRegClass(FirstReg)->contains(PPC::X0) ?
+        &PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
+    unsigned OldFirstReg = FirstReg;
+    FirstReg = MRI.createVirtualRegister(FirstRC);
+    BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
+      .addReg(OldFirstReg);
+  }
+
+  BuildMI(MBB, MI, dl, get(OpCode), DestReg)
+    .addReg(FirstReg).addReg(SecondReg)
+    .addReg(Cond[1].getReg(), 0, SubIdx);
+}
+
+static unsigned getCRBitValue(unsigned CRBit) {
+  unsigned Ret = 4;
+  if (CRBit == PPC::CR0LT || CRBit == PPC::CR1LT ||
+      CRBit == PPC::CR2LT || CRBit == PPC::CR3LT ||
+      CRBit == PPC::CR4LT || CRBit == PPC::CR5LT ||
+      CRBit == PPC::CR6LT || CRBit == PPC::CR7LT)
+    Ret = 3;
+  if (CRBit == PPC::CR0GT || CRBit == PPC::CR1GT ||
+      CRBit == PPC::CR2GT || CRBit == PPC::CR3GT ||
+      CRBit == PPC::CR4GT || CRBit == PPC::CR5GT ||
+      CRBit == PPC::CR6GT || CRBit == PPC::CR7GT)
+    Ret = 2;
+  if (CRBit == PPC::CR0EQ || CRBit == PPC::CR1EQ ||
+      CRBit == PPC::CR2EQ || CRBit == PPC::CR3EQ ||
+      CRBit == PPC::CR4EQ || CRBit == PPC::CR5EQ ||
+      CRBit == PPC::CR6EQ || CRBit == PPC::CR7EQ)
+    Ret = 1;
+  if (CRBit == PPC::CR0UN || CRBit == PPC::CR1UN ||
+      CRBit == PPC::CR2UN || CRBit == PPC::CR3UN ||
+      CRBit == PPC::CR4UN || CRBit == PPC::CR5UN ||
+      CRBit == PPC::CR6UN || CRBit == PPC::CR7UN)
+    Ret = 0;
+
+  assert(Ret != 4 && "Invalid CR bit register");
+  return Ret;
+}
+
+void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  // We can end up with self copies and similar things as a result of VSX copy
+  // legalization. Promote them here.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (PPC::F8RCRegClass.contains(DestReg) &&
+      PPC::VSRCRegClass.contains(SrcReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && SrcReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    DestReg = SuperReg;
+  } else if (PPC::F8RCRegClass.contains(SrcReg) &&
+             PPC::VSRCRegClass.contains(DestReg)) {
+    unsigned SuperReg =
+      TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+
+    if (VSXSelfCopyCrash && DestReg == SuperReg)
+      llvm_unreachable("nop VSX copy");
+
+    SrcReg = SuperReg;
+  }
+
+  // Different class register copy
+  if (PPC::CRBITRCRegClass.contains(SrcReg) &&
+      PPC::GPRCRegClass.contains(DestReg)) {
+    unsigned CRReg = getCRFromCRBit(SrcReg);
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(CRReg);
+    getKillRegState(KillSrc);
+    // Rotate the CR bit in the CR fields to be the least significant bit and
+    // then mask with 0x1 (MB = ME = 31).
+    BuildMI(MBB, I, DL, get(PPC::RLWINM), DestReg)
+       .addReg(DestReg, RegState::Kill)
+       .addImm(TRI->getEncodingValue(CRReg) * 4 + (4 - getCRBitValue(SrcReg)))
+       .addImm(31)
+       .addImm(31);
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+      PPC::G8RCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg);
+    getKillRegState(KillSrc);
+    return;
+  } else if (PPC::CRRCRegClass.contains(SrcReg) &&
+      PPC::GPRCRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg);
+    getKillRegState(KillSrc);
+    return;
+   }
+
+  unsigned Opc;
+  if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR;
+  else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR8;
+  else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::FMR;
+  else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::MCRF;
+  else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::VOR;
+  else if (PPC::VSRCRegClass.contains(DestReg, SrcReg))
+    // There are two different ways this can be done:
+    //   1. xxlor : This has lower latency (on the P7), 2 cycles, but can only
+    //      issue in VSU pipeline 0.
+    //   2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but
+    //      can go to either pipeline.
+    // We'll always use xxlor here, because in practically all cases where
+    // copies are generated, they are close enough to some use that the
+    // lower-latency form is preferable.
+    Opc = PPC::XXLOR;
+  else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) ||
+           PPC::VSSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::XXLORf;
+  else if (PPC::QFRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMR;
+  else if (PPC::QSRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRs;
+  else if (PPC::QBRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::QVFMRb;
+  else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::CROR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  const MCInstrDesc &MCID = get(Opc);
+  if (MCID.getNumOperands() == 3)
+    BuildMI(MBB, I, DL, MCID, DestReg)
+      .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// This function returns true if a CR spill is necessary and false otherwise.
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+                                  unsigned SrcReg, bool isKill,
+                                  int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional store instructions are added here,
+  // update isStoreToStackSlot.
+
+  DebugLoc DL;
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    unsigned Op = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf64 : PPC::STXSDX;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFSTOREf32 : PPC::STXSSPX;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFSXs))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVSTFDXb))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned SrcReg, bool isKill, int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+
+  bool NonRI = false, SpillsVRS = false;
+  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
+                          NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+bool PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        SmallVectorImpl<MachineInstr *> &NewMIs,
+                                        bool &NonRI, bool &SpillsVRS) const {
+  // Note: If additional load instructions are added here,
+  // update isLoadFromStackSlot.
+
+  if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+      PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+                                               DestReg), FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+             PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CR), DestReg),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CRBIT), DestReg),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+    unsigned Op = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf64 : PPC::LXSDX;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
+                                               DestReg), FrameIdx));
+    NonRI = true;
+  } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+    unsigned Opc = Subtarget.hasP9Vector() ? PPC::DFLOADf32 : PPC::LXSSPX;
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opc),
+                                               DestReg), FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(Subtarget.isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_VRSAVE),
+                                               DestReg),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFSXs), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::QVLFDXb), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
+    RC = &PPC::VSRCRegClass;
+
+  bool NonRI = false, SpillsVRS = false;
+  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
+                           NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FrameIdx),
+      MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+      MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+bool PPCInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+  if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR)
+    Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0);
+  else
+    // Leave the CR# the same, but invert the condition.
+    Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+  return false;
+}
+
+bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                 unsigned Reg, MachineRegisterInfo *MRI) const {
+  // For some instructions, it is legal to fold ZERO into the RA register field.
+  // A zero immediate should always be loaded with a single li.
+  unsigned DefOpc = DefMI.getOpcode();
+  if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
+    return false;
+  if (!DefMI.getOperand(1).isImm())
+    return false;
+  if (DefMI.getOperand(1).getImm() != 0)
+    return false;
+
+  // Note that we cannot here invert the arguments of an isel in order to fold
+  // a ZERO into what is presented as the second argument. All we have here
+  // is the condition bit, and that might come from a CR-logical bit operation.
+
+  const MCInstrDesc &UseMCID = UseMI.getDesc();
+
+  // Only fold into real machine instructions.
+  if (UseMCID.isPseudo())
+    return false;
+
+  unsigned UseIdx;
+  for (UseIdx = 0; UseIdx < UseMI.getNumOperands(); ++UseIdx)
+    if (UseMI.getOperand(UseIdx).isReg() &&
+        UseMI.getOperand(UseIdx).getReg() == Reg)
+      break;
+
+  assert(UseIdx < UseMI.getNumOperands() && "Cannot find Reg in UseMI");
+  assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
+
+  const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx];
+
+  // We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0
+  // register (which might also be specified as a pointer class kind).
+  if (UseInfo->isLookupPtrRegClass()) {
+    if (UseInfo->RegClass /* Kind */ != 1)
+      return false;
+  } else {
+    if (UseInfo->RegClass != PPC::GPRC_NOR0RegClassID &&
+        UseInfo->RegClass != PPC::G8RC_NOX0RegClassID)
+      return false;
+  }
+
+  // Make sure this is not tied to an output register (or otherwise
+  // constrained). This is true for ST?UX registers, for example, which
+  // are tied to their output registers.
+  if (UseInfo->Constraints != 0)
+    return false;
+
+  unsigned ZeroReg;
+  if (UseInfo->isLookupPtrRegClass()) {
+    bool isPPC64 = Subtarget.isPPC64();
+    ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
+  } else {
+    ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ?
+              PPC::ZERO8 : PPC::ZERO;
+  }
+
+  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI.getOperand(UseIdx).setReg(ZeroReg);
+
+  if (DeleteDef)
+    DefMI.eraseFromParent();
+
+  return true;
+}
+
+static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+       I != IE; ++I)
+    if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8))
+      return true;
+  return false;
+}
+
+// We should make sure that, if we're going to predicate both sides of a
+// condition (a diamond), that both sides don't define the counter register. We
+// can predicate counter-decrement-based branches, but while that predicates
+// the branching, it does not predicate the counter decrement. If we tried to
+// merge the triangle into one predicated block, we'd decrement the counter
+// twice.
+bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                     unsigned NumT, unsigned ExtraT,
+                     MachineBasicBlock &FMBB,
+                     unsigned NumF, unsigned ExtraF,
+                     BranchProbability Probability) const {
+  return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
+}
+
+
+bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
+  // The predicated branches are identified by their type, not really by the
+  // explicit presence of a predicate. Furthermore, some of them can be
+  // predicated more than once. Because if conversion won't try to predicate
+  // any instruction which already claims to be predicated (by returning true
+  // here), always return false. In doing so, we let isPredicable() be the
+  // final word on whether not the instruction can be (further) predicated.
+
+  return false;
+}
+
+bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+  if (!MI.isTerminator())
+    return false;
+
+  // Conditional branch is a special case.
+  if (MI.isBranch() && !MI.isBarrier())
+    return true;
+
+  return !isPredicated(MI);
+}
+
+bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
+                                        ArrayRef<MachineOperand> Pred) const {
+  unsigned OpC = MI.getOpcode();
+  if (OpC == PPC::BLR || OpC == PPC::BLR8) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR)
+                                      : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI.setDesc(get(PPC::BCLR));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg());
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI.setDesc(get(PPC::BCLRn));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg());
+    } else {
+      MI.setDesc(get(PPC::BCCLR));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addImm(Pred[0].getImm())
+          .addReg(Pred[1].getReg());
+    }
+
+    return true;
+  } else if (OpC == PPC::B) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) {
+      bool isPPC64 = Subtarget.isPPC64();
+      MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
+                                      : (isPPC64 ? PPC::BDZ8 : PPC::BDZ)));
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+      MI.RemoveOperand(0);
+
+      MI.setDesc(get(PPC::BC));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg())
+          .addMBB(MBB);
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+      MI.RemoveOperand(0);
+
+      MI.setDesc(get(PPC::BCn));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg())
+          .addMBB(MBB);
+    } else {
+      MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+      MI.RemoveOperand(0);
+
+      MI.setDesc(get(PPC::BCC));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addImm(Pred[0].getImm())
+          .addReg(Pred[1].getReg())
+          .addMBB(MBB);
+    }
+
+    return true;
+  } else if (OpC == PPC::BCTR  || OpC == PPC::BCTR8 ||
+             OpC == PPC::BCTRL || OpC == PPC::BCTRL8) {
+    if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
+      llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
+
+    bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
+    bool isPPC64 = Subtarget.isPPC64();
+
+    if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
+      MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8)
+                             : (setLR ? PPC::BCCTRL : PPC::BCCTR)));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg());
+      return true;
+    } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) {
+      MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n)
+                             : (setLR ? PPC::BCCTRLn : PPC::BCCTRn)));
+      MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+          .addReg(Pred[1].getReg());
+      return true;
+    }
+
+    MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8)
+                           : (setLR ? PPC::BCCCTRL : PPC::BCCCTR)));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+        .addImm(Pred[0].getImm())
+        .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                                     ArrayRef<MachineOperand> Pred2) const {
+  assert(Pred1.size() == 2 && "Invalid PPC first predicate");
+  assert(Pred2.size() == 2 && "Invalid PPC second predicate");
+
+  if (Pred1[1].getReg() == PPC::CTR8 || Pred1[1].getReg() == PPC::CTR)
+    return false;
+  if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
+    return false;
+
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
+  PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
+  PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
+
+  if (P1 == P2)
+    return true;
+
+  // Does P1 subsume P2, e.g. GE subsumes GT.
+  if (P1 == PPC::PRED_LE &&
+      (P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ))
+    return true;
+  if (P1 == PPC::PRED_GE &&
+      (P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ))
+    return true;
+
+  return false;
+}
+
+bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // Note: At the present time, the contents of Pred from this function is
+  // unused by IfConversion. This implementation follows ARM by pushing the
+  // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
+  // predicate, instructions defining CTR or CTR8 are also included as
+  // predicate-defining instructions.
+
+  const TargetRegisterClass *RCs[] =
+    { &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
+      &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
+      const TargetRegisterClass *RC = RCs[c];
+      if (MO.isReg()) {
+        if (MO.isDef() && RC->contains(MO.getReg())) {
+          Pred.push_back(MO);
+          Found = true;
+        }
+      } else if (MO.isRegMask()) {
+        for (TargetRegisterClass::iterator I = RC->begin(),
+             IE = RC->end(); I != IE; ++I)
+          if (MO.clobbersPhysReg(*I)) {
+            Pred.push_back(MO);
+            Found = true;
+          }
+      }
+    }
+  }
+
+  return Found;
+}
+
+bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+  unsigned OpC = MI.getOpcode();
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::B:
+  case PPC::BLR:
+  case PPC::BLR8:
+  case PPC::BCTR:
+  case PPC::BCTR8:
+  case PPC::BCTRL:
+  case PPC::BCTRL8:
+    return true;
+  }
+}
+
+bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                  unsigned &SrcReg2, int &Mask,
+                                  int &Value) const {
+  unsigned Opc = MI.getOpcode();
+
+  switch (Opc) {
+  default: return false;
+  case PPC::CMPWI:
+  case PPC::CMPLWI:
+  case PPC::CMPDI:
+  case PPC::CMPLDI:
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = 0;
+    Value = MI.getOperand(2).getImm();
+    Mask = 0xFFFF;
+    return true;
+  case PPC::CMPW:
+  case PPC::CMPLW:
+  case PPC::CMPD:
+  case PPC::CMPLD:
+  case PPC::FCMPUS:
+  case PPC::FCMPUD:
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = MI.getOperand(2).getReg();
+    return true;
+  }
+}
+
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                                        unsigned SrcReg2, int Mask, int Value,
+                                        const MachineRegisterInfo *MRI) const {
+  if (DisableCmpOpt)
+    return false;
+
+  int OpC = CmpInstr.getOpcode();
+  unsigned CRReg = CmpInstr.getOperand(0).getReg();
+
+  // FP record forms set CR1 based on the execption status bits, not a
+  // comparison with zero.
+  if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD)
+    return false;
+
+  // The record forms set the condition register based on a signed comparison
+  // with zero (so says the ISA manual). This is not as straightforward as it
+  // seems, however, because this is always a 64-bit comparison on PPC64, even
+  // for instructions that are 32-bit in nature (like slw for example).
+  // So, on PPC32, for unsigned comparisons, we can use the record forms only
+  // for equality checks (as those don't depend on the sign). On PPC64,
+  // we are restricted to equality for unsigned 64-bit comparisons and for
+  // signed 32-bit comparisons the applicability is more restricted.
+  bool isPPC64 = Subtarget.isPPC64();
+  bool is32BitSignedCompare   = OpC ==  PPC::CMPWI || OpC == PPC::CMPW;
+  bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW;
+  bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+  int MIOpC = MI->getOpcode();
+
+  bool equalityOnly = false;
+  bool noSub = false;
+  if (isPPC64) {
+    if (is32BitSignedCompare) {
+      // We can perform this optimization only if MI is sign-extending.
+      if (MIOpC == PPC::SRAW  || MIOpC == PPC::SRAWo ||
+          MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo ||
+          MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo ||
+          MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo ||
+          MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) {
+        noSub = true;
+      } else
+        return false;
+    } else if (is32BitUnsignedCompare) {
+      // 32-bit rotate and mask instructions are zero extending only if MB <= ME
+      bool isZeroExtendingRotate  =
+          (MIOpC == PPC::RLWINM || MIOpC == PPC::RLWINMo ||
+           MIOpC == PPC::RLWNM || MIOpC == PPC::RLWNMo)
+          && MI->getOperand(3).getImm() <= MI->getOperand(4).getImm();
+
+      // We can perform this optimization, equality only, if MI is
+      // zero-extending.
+      if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
+          MIOpC == PPC::SLW    || MIOpC == PPC::SLWo ||
+          MIOpC == PPC::SRW    || MIOpC == PPC::SRWo ||
+          isZeroExtendingRotate) {
+        noSub = true;
+        equalityOnly = true;
+      } else
+        return false;
+    } else
+      equalityOnly = is64BitUnsignedCompare;
+  } else
+    equalityOnly = is32BitUnsignedCompare;
+
+  if (equalityOnly) {
+    // We need to check the uses of the condition register in order to reject
+    // non-equality comparisons.
+    for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
+         IE = MRI->use_instr_end(); I != IE; ++I) {
+      MachineInstr *UseMI = &*I;
+      if (UseMI->getOpcode() == PPC::BCC) {
+        unsigned Pred = UseMI->getOperand(0).getImm();
+        if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+          return false;
+      } else if (UseMI->getOpcode() == PPC::ISEL ||
+                 UseMI->getOpcode() == PPC::ISEL8) {
+        unsigned SubIdx = UseMI->getOperand(3).getSubReg();
+        if (SubIdx != PPC::sub_eq)
+          return false;
+      } else
+        return false;
+    }
+  }
+
+  MachineBasicBlock::iterator I = CmpInstr;
+
+  // Scan forward to find the first use of the compare.
+  for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL;
+       ++I) {
+    bool FoundUse = false;
+    for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
+         JE = MRI->use_instr_end(); J != JE; ++J)
+      if (&*J == &*I) {
+        FoundUse = true;
+        break;
+      }
+
+    if (FoundUse)
+      break;
+  }
+
+  // There are two possible candidates which can be changed to set CR[01].
+  // One is MI, the other is a SUB instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  MachineInstr *Sub = nullptr;
+  if (SrcReg2 != 0)
+    // MI is not a candidate for CMPrr.
+    MI = nullptr;
+  // FIXME: Conservatively refuse to convert an instruction which isn't in the
+  // same BB as the comparison. This is to allow the check below to avoid calls
+  // (and other explicit clobbers); instead we should really check for these
+  // more explicitly (in at least a few predecessors).
+  else if (MI->getParent() != CmpInstr.getParent() || Value != 0) {
+    // PPC does not have a record-form SUBri.
+    return false;
+  }
+
+  // Search for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  --I;
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator E = MI, B = CmpInstr.getParent()->begin();
+
+  for (; I != E && !noSub; --I) {
+    const MachineInstr &Instr = *I;
+    unsigned IOpC = Instr.getOpcode();
+
+    if (&*I != &CmpInstr && (Instr.modifiesRegister(PPC::CR0, TRI) ||
+                             Instr.readsRegister(PPC::CR0, TRI)))
+      // This instruction modifies or uses the record condition register after
+      // the one we want to change. While we could do this transformation, it
+      // would likely not be profitable. This transformation removes one
+      // instruction, and so even forcing RA to generate one move probably
+      // makes it unprofitable.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if ((OpC == PPC::CMPW || OpC == PPC::CMPLW ||
+         OpC == PPC::CMPD || OpC == PPC::CMPLD) &&
+        (IOpC == PPC::SUBF || IOpC == PPC::SUBF8) &&
+        ((Instr.getOperand(1).getReg() == SrcReg &&
+          Instr.getOperand(2).getReg() == SrcReg2) ||
+        (Instr.getOperand(1).getReg() == SrcReg2 &&
+         Instr.getOperand(2).getReg() == SrcReg))) {
+      Sub = &*I;
+      break;
+    }
+
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI) MI = Sub;
+
+  int NewOpC = -1;
+  MIOpC = MI->getOpcode();
+  if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8)
+    NewOpC = MIOpC;
+  else {
+    NewOpC = PPC::getRecordFormOpcode(MIOpC);
+    if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1)
+      NewOpC = MIOpC;
+  }
+
+  // FIXME: On the non-embedded POWER architectures, only some of the record
+  // forms are fast, and we should use only the fast ones.
+
+  // The defining instruction has a record form (or is already a record
+  // form). It is possible, however, that we'll need to reverse the condition
+  // code of the users.
+  if (NewOpC == -1)
+    return false;
+
+  SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
+  SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
+
+  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
+  // needs to be updated to be based on SUB.  Push the condition code
+  // operands to OperandsToUpdate.  If it is safe to remove CmpInstr, the
+  // condition code of these operands will be modified.
+  bool ShouldSwap = false;
+  if (Sub) {
+    ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+      Sub->getOperand(2).getReg() == SrcReg;
+
+    // The operands to subf are the opposite of sub, so only in the fixed-point
+    // case, invert the order.
+    ShouldSwap = !ShouldSwap;
+  }
+
+  if (ShouldSwap)
+    for (MachineRegisterInfo::use_instr_iterator
+         I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+         I != IE; ++I) {
+      MachineInstr *UseMI = &*I;
+      if (UseMI->getOpcode() == PPC::BCC) {
+        PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm();
+        assert((!equalityOnly ||
+                Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) &&
+               "Invalid predicate for equality-only optimization");
+        PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+                                PPC::getSwappedPredicate(Pred)));
+      } else if (UseMI->getOpcode() == PPC::ISEL ||
+                 UseMI->getOpcode() == PPC::ISEL8) {
+        unsigned NewSubReg = UseMI->getOperand(3).getSubReg();
+        assert((!equalityOnly || NewSubReg == PPC::sub_eq) &&
+               "Invalid CR bit for equality-only optimization");
+
+        if (NewSubReg == PPC::sub_lt)
+          NewSubReg = PPC::sub_gt;
+        else if (NewSubReg == PPC::sub_gt)
+          NewSubReg = PPC::sub_lt;
+
+        SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)),
+                                                 NewSubReg));
+      } else // We need to abort on a user we don't understand.
+        return false;
+    }
+
+  // Create a new virtual register to hold the value of the CR set by the
+  // record-form instruction. If the instruction was not previously in
+  // record form, then set the kill flag on the CR.
+  CmpInstr.eraseFromParent();
+
+  MachineBasicBlock::iterator MII = MI;
+  BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(),
+          get(TargetOpcode::COPY), CRReg)
+    .addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0);
+
+  // Even if CR0 register were dead before, it is alive now since the
+  // instruction we just built uses it.
+  MI->clearRegisterDeads(PPC::CR0);
+
+  if (MIOpC != NewOpC) {
+    // We need to be careful here: we're replacing one instruction with
+    // another, and we need to make sure that we get all of the right
+    // implicit uses and defs. On the other hand, the caller may be holding
+    // an iterator to this instruction, and so we can't delete it (this is
+    // specifically the case if this is the instruction directly after the
+    // compare).
+
+    const MCInstrDesc &NewDesc = get(NewOpC);
+    MI->setDesc(NewDesc);
+
+    if (NewDesc.ImplicitDefs)
+      for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs();
+           *ImpDefs; ++ImpDefs)
+        if (!MI->definesRegister(*ImpDefs))
+          MI->addOperand(*MI->getParent()->getParent(),
+                         MachineOperand::CreateReg(*ImpDefs, true, true));
+    if (NewDesc.ImplicitUses)
+      for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses();
+           *ImpUses; ++ImpUses)
+        if (!MI->readsRegister(*ImpUses))
+          MI->addOperand(*MI->getParent()->getParent(),
+                         MachineOperand::CreateReg(*ImpUses, false, true));
+  }
+  assert(MI->definesRegister(PPC::CR0) &&
+         "Record-form instruction does not define cr0?");
+
+  // Modify the condition code of operands in OperandsToUpdate.
+  // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+  // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+  for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++)
+    PredsToUpdate[i].first->setImm(PredsToUpdate[i].second);
+
+  for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++)
+    SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second);
+
+  return true;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+
+  if (Opcode == PPC::INLINEASM) {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const char *AsmStr = MI.getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  } else if (Opcode == TargetOpcode::STACKMAP) {
+    StackMapOpers Opers(&MI);
+    return Opers.getNumPatchBytes();
+  } else if (Opcode == TargetOpcode::PATCHPOINT) {
+    PatchPointOpers Opers(&MI);
+    return Opers.getNumPatchBytes();
+  } else {
+    const MCInstrDesc &Desc = get(Opcode);
+    return Desc.getSize();
+  }
+}
+
+std::pair<unsigned, unsigned>
+PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  const unsigned Mask = PPCII::MO_ACCESS_MASK;
+  return std::make_pair(TF & Mask, TF & ~Mask);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  using namespace PPCII;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_LO, "ppc-lo"},
+      {MO_HA, "ppc-ha"},
+      {MO_TPREL_LO, "ppc-tprel-lo"},
+      {MO_TPREL_HA, "ppc-tprel-ha"},
+      {MO_DTPREL_LO, "ppc-dtprel-lo"},
+      {MO_TLSLD_LO, "ppc-tlsld-lo"},
+      {MO_TOC_LO, "ppc-toc-lo"},
+      {MO_TLS, "ppc-tls"}};
+  return makeArrayRef(TargetFlags);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+  using namespace PPCII;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_PLT, "ppc-plt"},
+      {MO_PIC_FLAG, "ppc-pic"},
+      {MO_NLP_FLAG, "ppc-nlp"},
+      {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
+  return makeArrayRef(TargetFlags);
+}
+
+bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case TargetOpcode::LOAD_STACK_GUARD: {
+    assert(Subtarget.isTargetLinux() &&
+           "Only Linux target is expected to contain LOAD_STACK_GUARD");
+    const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008;
+    const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2;
+    MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+        .addImm(Offset)
+        .addReg(Reg);
+    return true;
+  }
+  case PPC::DFLOADf32:
+  case PPC::DFLOADf64:
+  case PPC::DFSTOREf32:
+  case PPC::DFSTOREf64: {
+    assert(Subtarget.hasP9Vector() &&
+           "Invalid D-Form Pseudo-ops on non-P9 target.");
+    unsigned UpperOpcode, LowerOpcode;
+    switch (MI.getOpcode()) {
+    case PPC::DFLOADf32:
+      UpperOpcode = PPC::LXSSP;
+      LowerOpcode = PPC::LFS;
+      break;
+    case PPC::DFLOADf64:
+      UpperOpcode = PPC::LXSD;
+      LowerOpcode = PPC::LFD;
+      break;
+    case PPC::DFSTOREf32:
+      UpperOpcode = PPC::STXSSP;
+      LowerOpcode = PPC::STFS;
+      break;
+    case PPC::DFSTOREf64:
+      UpperOpcode = PPC::STXSD;
+      LowerOpcode = PPC::STFD;
+      break;
+    }
+    unsigned TargetReg = MI.getOperand(0).getReg();
+    unsigned Opcode;
+    if ((TargetReg >= PPC::F0 && TargetReg <= PPC::F31) ||
+        (TargetReg >= PPC::VSL0 && TargetReg <= PPC::VSL31))
+      Opcode = LowerOpcode;
+    else
+      Opcode = UpperOpcode;
+    MI.setDesc(get(Opcode));
+    return true;
+  }
+  }
+  return false;
+}
+
+const TargetRegisterClass *
+PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
+  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
+    return &PPC::VSRCRegClass;
+  return RC;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 000000000000..32b2f009a3f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,297 @@
+//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
+
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "PPCGenInstrInfo.inc"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags.  These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+  // PPC970 Instruction Flags.  These flags describe the characteristics of the
+  // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+  // raw machine instructions.
+
+  /// PPC970_First - This instruction starts a new dispatch group, so it will
+  /// always be the first one in the group.
+  PPC970_First = 0x1,
+
+  /// PPC970_Single - This instruction starts a new dispatch group and
+  /// terminates it, so it will be the sole instruction in the group.
+  PPC970_Single = 0x2,
+
+  /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+  /// two dispatch pipes to be available to issue.
+  PPC970_Cracked = 0x4,
+
+  /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
+  /// an instruction is issued to.
+  PPC970_Shift = 3,
+  PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+  /// These are the various PPC970 execution unit pipelines.  Each instruction
+  /// is one of these.
+  PPC970_Pseudo = 0 << PPC970_Shift,   // Pseudo instruction
+  PPC970_FXU    = 1 << PPC970_Shift,   // Fixed Point (aka Integer/ALU) Unit
+  PPC970_LSU    = 2 << PPC970_Shift,   // Load Store Unit
+  PPC970_FPU    = 3 << PPC970_Shift,   // Floating Point Unit
+  PPC970_CRU    = 4 << PPC970_Shift,   // Control Register Unit
+  PPC970_VALU   = 5 << PPC970_Shift,   // Vector ALU
+  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
+  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
+};
+
+enum {
+  /// Shift count to bypass PPC970 flags
+  NewDef_Shift = 6,
+
+  /// The VSX instruction that uses VSX register (vs0-vs63), instead of VMX
+  /// register (v0-v31).
+  UseVSXReg = 0x1 << NewDef_Shift
+};
+} // end namespace PPCII
+
+class PPCSubtarget;
+class PPCInstrInfo : public PPCGenInstrInfo {
+  PPCSubtarget &Subtarget;
+  const PPCRegisterInfo RI;
+
+  bool StoreRegToStackSlot(MachineFunction &MF,
+                           unsigned SrcReg, bool isKill, int FrameIdx,
+                           const TargetRegisterClass *RC,
+                           SmallVectorImpl<MachineInstr*> &NewMIs,
+                           bool &NonRI, bool &SpillsVRS) const;
+  bool LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            SmallVectorImpl<MachineInstr *> &NewMIs,
+                            bool &NonRI, bool &SpillsVRS) const;
+  virtual void anchor();
+
+protected:
+  /// Commutes the operands in the given instruction.
+  /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+  ///
+  /// Do not call this method for a non-commutable instruction or for
+  /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  ///
+  /// For example, we can commute rlwimi instructions, but only if the
+  /// rotate amt is zero.  We also have to munge the immediates a bit.
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned OpIdx1,
+                                       unsigned OpIdx2) const override;
+
+public:
+  explicit PPCInstrInfo(PPCSubtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+                               const ScheduleDAG *DAG) const override;
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr &MI,
+                           unsigned *PredCost = nullptr) const override;
+
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr &DefMI, unsigned DefIdx,
+                        const MachineInstr &UseMI,
+                        unsigned UseIdx) const override;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const override {
+    return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
+                                              UseNode, UseIdx);
+  }
+
+  bool hasLowDefLatency(const TargetSchedModel &SchedModel,
+                        const MachineInstr &DefMI,
+                        unsigned DefIdx) const override {
+    // Machine LICM should hoist all instructions in low-register-pressure
+    // situations; none are sufficiently free to justify leaving in a loop
+    // body.
+    return false;
+  }
+
+  bool useMachineCombiner() const override {
+    return true;
+  }
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// output in the <Pattern> array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &P) const override;
+
+  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SubIdx) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
+
+
+  // Branch analysis.
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  // Select analysis.
+  bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+                       unsigned, unsigned, int &, int &, int &) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const DebugLoc &DL, unsigned DstReg,
+                    ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                    unsigned FalseReg) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+                     MachineRegisterInfo *MRI) const override;
+
+  // If conversion by predication (only supported by some branch instructions).
+  // All of the profitability checks always return true; it is always
+  // profitable to use the predicated branches.
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                          unsigned NumCycles, unsigned ExtraPredCycles,
+                          BranchProbability Probability) const override {
+    return true;
+  }
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumT, unsigned ExtraT,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumF, unsigned ExtraF,
+                           BranchProbability Probability) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                 BranchProbability Probability) const override {
+    return true;
+  }
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override {
+    return false;
+  }
+
+  // Predication support.
+  bool isPredicated(const MachineInstr &MI) const override;
+
+  bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Pred) const override;
+
+  bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+                         ArrayRef<MachineOperand> Pred2) const override;
+
+  bool DefinesPredicate(MachineInstr &MI,
+                        std::vector<MachineOperand> &Pred) const override;
+
+  bool isPredicable(MachineInstr &MI) const override;
+
+  // Comparison optimization.
+
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const override;
+
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const override;
+
+  /// GetInstSize - Return the number of bytes of code the specified
+  /// instruction may be.  This returns the maximum number of bytes.
+  ///
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableBitmaskMachineOperandTargetFlags() const override;
+
+  // Lower pseudo instructions after register allocation.
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  static bool isVFRegister(unsigned Reg) {
+    return Reg >= PPC::VF0 && Reg <= PPC::VF31;
+  }
+  static bool isVRRegister(unsigned Reg) {
+    return Reg >= PPC::V0 && Reg <= PPC::V31;
+  }
+  const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 000000000000..a7231bd2e2c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,4403 @@
+//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCstxsix : SDTypeProfile<0, 3, [
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCVexts  : SDTypeProfile<1, 2, [
+  SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
+]>;
+
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm   : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
+  SDTCisVec<1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+  SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
+def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
+  SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+  SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+def tocentry32 : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
+
+def SDT_PPCqvfperm   : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVec<3>
+]>;
+def SDT_PPCqvgpci   : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisInt<1>
+]>;
+def SDT_PPCqvaligni   : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>
+]>;
+def SDT_PPCqvesplati   : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def SDT_PPCqbflt : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisVec<1>
+]>;
+
+def SDT_PPCqvlfsb : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
+def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
+def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
+def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
+def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
+def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+                       [SDNPHasChain, SDNPMayStore]>;
+def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
+                       [SDNPHasChain, SDNPMayStore]>;
+def PPCVexts  : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
+
+// Extract FPSCR (not modeled at the DAG level).
+def PPCmffs   : SDNode<"PPCISD::MFFS",
+                       SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;
+
+// Perform FADD in round-to-zero mode.
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
+
+
+def PPCfsel   : SDNode<"PPCISD::FSEL",  
+   // Type constraint for fsel.
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, 
+                        SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp,
+                         [SDNPMayLoad, SDNPMemOperand]>;
+def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
+
+def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
+def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
+                            [SDNPMayLoad]>;
+def PPCaddTls     : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
+def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
+def PPCaddiTlsgdL   : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
+def PPCgetTlsAddr   : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
+def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR",
+                               SDTypeProfile<1, 3, [
+                                 SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                 SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
+def PPCaddiTlsldL   : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
+def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
+def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR",
+                               SDTypeProfile<1, 3, [
+                                 SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                 SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>;
+def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>;
+def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
+
+def PPCvperm     : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+def PPCxxsplt    : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
+def PPCxxinsert  : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCvecshl    : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
+
+def PPCqvfperm   : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
+def PPCqvgpci    : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>;
+def PPCqvaligni  : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>;
+def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>;
+
+def PPCqbflt     : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>;
+
+def PPCqvlfsb    : SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb,
+                          [SDNPHasChain, SDNPMayLoad]>;
+
+def PPCcmpb     : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts.  These nodes are generated by the multi-precision shift code.
+def PPCsrl        : SDNode<"PPCISD::SRL"       , SDTIntShiftOp>;
+def PPCsra        : SDNode<"PPCISD::SRA"       , SDTIntShiftOp>;
+def PPCshl        : SDNode<"PPCISD::SHL"       , SDTIntShiftOp>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_PPCCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall  : SDNode<"PPCISD::CALL", SDT_PPCCall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+def PPCcall_nop  : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPVariadic]>;
+def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
+                               SDTypeProfile<0, 1, []>,
+                               [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                SDNPVariadic]>;
+
+def retflag       : SDNode<"PPCISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCeh_sjlj_setjmp  : SDNode<"PPCISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
+def SDT_PPCsc     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def PPCsc         : SDNode<"PPCISD::SC", SDT_PPCsc,
+                           [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCclrbhrb    : SDNode<"PPCISD::CLRBHRB", SDTNone,
+                           [SDNPHasChain, SDNPSideEffect]>;
+def PPCmfbhrbe    : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>;
+def PPCrfebb      : SDNode<"PPCISD::RFEBB", SDT_PPCsc,
+                           [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+                           [SDNPHasChain, SDNPOptInGlue]>;
+
+def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+                           [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Instructions to support dynamic alloca.
+def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def SDTDynAreaOp  : SDTypeProfile<1, 1, []>;
+def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+def PPCdynareaoffset   : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+  // Transformation function: 31 - imm
+  return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+  // Transformation function: 32 - imm
+  return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue(), SDLoc(N))
+                           : getI32Imm(0, SDLoc(N));
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  // Transformation function: get the low 16 bits.
+  return getI32Imm((unsigned short)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getZExtValue() >> 16, SDLoc(N));
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  int Val = N->getZExtValue();
+  return getI32Imm((Val - (signed short)Val) >> 16, SDLoc(N));
+}]>;
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask
+  unsigned mb = 0, me;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(mb, SDLoc(N));
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask
+  unsigned mb, me = 0;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(me, SDLoc(N));
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskImm predicate - True if immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  else
+    return false;
+}]>;
+
+def imm32SExt16  : Operand<i32>, ImmLeaf<i32, [{
+  // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int64_t)Imm == (short)Imm;
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+def immAnyExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm) || isUInt<8>(Imm); }]>;
+def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to 
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getZExtValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sext right.
+  return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+def imm64ZExt32  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit
+  // zero extended field.
+  return isUInt<32>(Imm);
+}]>;
+
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (4-aligned) constants are alignment sensitive. If these
+// offsets are hidden behind TOC entries than the values of the lower-order
+// bits cannot be checked directly. As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
+
+def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+                            (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4pre_store : PatFrag<
+                          (ops node:$val, node:$base, node:$offset),
+                          (pre_store node:$val, node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+class isPPC64 { bit PPC64 = 1; }
+class isDOT   { bit RC = 1; }
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+// In the default PowerPC assembler syntax, registers are specified simply
+// by number, so they cannot be distinguished from immediate values (without
+// looking at the opcode).  This means that the default operand matching logic
+// for the asm parser does not work, and we need to specify custom matchers.
+// Since those can only be specified with RegisterOperand classes and not
+// directly on the RegisterClass, all instructions patterns used by the asm
+// parser need to use a RegisterOperand (instead of a RegisterClass) for
+// all their register operands.
+// For this purpose, we define one RegisterOperand for each RegisterClass,
+// using the same name as the class, just in lower case.
+
+def PPCRegGPRCAsmOperand : AsmOperandClass {
+  let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
+}
+def gprc : RegisterOperand<GPRC> {
+  let ParserMatchClass = PPCRegGPRCAsmOperand;
+}
+def PPCRegG8RCAsmOperand : AsmOperandClass {
+  let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
+}
+def g8rc : RegisterOperand<G8RC> {
+  let ParserMatchClass = PPCRegG8RCAsmOperand;
+}
+def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
+  let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
+  let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
+}
+def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
+  let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
+}
+def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
+  let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
+}
+def PPCRegF8RCAsmOperand : AsmOperandClass {
+  let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
+}
+def f8rc : RegisterOperand<F8RC> {
+  let ParserMatchClass = PPCRegF8RCAsmOperand;
+}
+def PPCRegF4RCAsmOperand : AsmOperandClass {
+  let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
+}
+def f4rc : RegisterOperand<F4RC> {
+  let ParserMatchClass = PPCRegF4RCAsmOperand;
+}
+def PPCRegVRRCAsmOperand : AsmOperandClass {
+  let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
+}
+def vrrc : RegisterOperand<VRRC> {
+  let ParserMatchClass = PPCRegVRRCAsmOperand;
+}
+def PPCRegVFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVFRC"; let PredicateMethod = "isRegNumber";
+}
+def vfrc : RegisterOperand<VFRC> {
+  let ParserMatchClass = PPCRegVFRCAsmOperand;
+}
+def PPCRegCRBITRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
+}
+def crbitrc : RegisterOperand<CRBITRC> {
+  let ParserMatchClass = PPCRegCRBITRCAsmOperand;
+}
+def PPCRegCRRCAsmOperand : AsmOperandClass {
+  let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
+}
+def crrc : RegisterOperand<CRRC> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+def crrc0 : RegisterOperand<CRRC0> {
+  let ParserMatchClass = PPCRegCRRCAsmOperand;
+}
+
+def PPCU1ImmAsmOperand : AsmOperandClass {
+  let Name = "U1Imm"; let PredicateMethod = "isU1Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u1imm   : Operand<i32> {
+  let PrintMethod = "printU1ImmOperand";
+  let ParserMatchClass = PPCU1ImmAsmOperand;
+}
+
+def PPCU2ImmAsmOperand : AsmOperandClass {
+  let Name = "U2Imm"; let PredicateMethod = "isU2Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u2imm   : Operand<i32> {
+  let PrintMethod = "printU2ImmOperand";
+  let ParserMatchClass = PPCU2ImmAsmOperand;
+}
+
+def PPCATBitsAsHintAsmOperand : AsmOperandClass {
+  let Name = "ATBitsAsHint"; let PredicateMethod = "isATBitsAsHint";
+  let RenderMethod = "addImmOperands"; // Irrelevant, predicate always fails.
+}
+def atimm   : Operand<i32> {
+  let PrintMethod = "printATBitsAsHint";
+  let ParserMatchClass = PPCATBitsAsHintAsmOperand;
+}
+
+def PPCU3ImmAsmOperand : AsmOperandClass {
+  let Name = "U3Imm"; let PredicateMethod = "isU3Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u3imm   : Operand<i32> {
+  let PrintMethod = "printU3ImmOperand";
+  let ParserMatchClass = PPCU3ImmAsmOperand;
+}
+
+def PPCU4ImmAsmOperand : AsmOperandClass {
+  let Name = "U4Imm"; let PredicateMethod = "isU4Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u4imm   : Operand<i32> {
+  let PrintMethod = "printU4ImmOperand";
+  let ParserMatchClass = PPCU4ImmAsmOperand;
+}
+def PPCS5ImmAsmOperand : AsmOperandClass {
+  let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s5imm   : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+  let ParserMatchClass = PPCS5ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<5>";
+}
+def PPCU5ImmAsmOperand : AsmOperandClass {
+  let Name = "U5Imm"; let PredicateMethod = "isU5Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u5imm   : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+  let ParserMatchClass = PPCU5ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<5>";
+}
+def PPCU6ImmAsmOperand : AsmOperandClass {
+  let Name = "U6Imm"; let PredicateMethod = "isU6Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u6imm   : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+  let ParserMatchClass = PPCU6ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<6>";
+}
+def PPCU7ImmAsmOperand : AsmOperandClass {
+  let Name = "U7Imm"; let PredicateMethod = "isU7Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u7imm   : Operand<i32> {
+  let PrintMethod = "printU7ImmOperand";
+  let ParserMatchClass = PPCU7ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<7>";
+}
+def PPCU8ImmAsmOperand : AsmOperandClass {
+  let Name = "U8Imm"; let PredicateMethod = "isU8Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u8imm   : Operand<i32> {
+  let PrintMethod = "printU8ImmOperand";
+  let ParserMatchClass = PPCU8ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<8>";
+}
+def PPCU10ImmAsmOperand : AsmOperandClass {
+  let Name = "U10Imm"; let PredicateMethod = "isU10Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u10imm  : Operand<i32> {
+  let PrintMethod = "printU10ImmOperand";
+  let ParserMatchClass = PPCU10ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<10>";
+}
+def PPCU12ImmAsmOperand : AsmOperandClass {
+  let Name = "U12Imm"; let PredicateMethod = "isU12Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u12imm  : Operand<i32> {
+  let PrintMethod = "printU12ImmOperand";
+  let ParserMatchClass = PPCU12ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<12>";
+}
+def PPCS16ImmAsmOperand : AsmOperandClass {
+  let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def s16imm  : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS16ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+def PPCU16ImmAsmOperand : AsmOperandClass {
+  let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
+  let RenderMethod = "addU16ImmOperands";
+}
+def u16imm  : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCU16ImmAsmOperand;
+  let DecoderMethod = "decodeUImmOperand<16>";
+}
+def PPCS17ImmAsmOperand : AsmOperandClass {
+  let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
+  let RenderMethod = "addS16ImmOperands";
+}
+def s17imm  : Operand<i32> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+  let DecoderMethod = "decodeSImmOperand<16>";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+def PPCDirectBrAsmOperand : AsmOperandClass {
+  let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def directbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def absdirectbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCondBrAsmOperand : AsmOperandClass {
+  let Name = "CondBr"; let PredicateMethod = "isCondBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
+def condbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def abscondbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def calltarget : Operand<iPTR> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def abscalltarget : Operand<iPTR> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCRBitMaskOperand : AsmOperandClass {
+ let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+  let EncoderMethod = "get_crbitm_encoding";
+  let DecoderMethod = "decodeCRBitMOperand";
+  let ParserMatchClass = PPCCRBitMaskOperand;
+}
+// Address operands
+// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
+def PPCRegGxRCNoR0Operand : AsmOperandClass {
+  let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
+  let ParserMatchClass = PPCRegGxRCNoR0Operand;
+}
+// A version of ptr_rc usable with the asm parser.
+def PPCRegGxRCOperand : AsmOperandClass {
+  let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
+}
+def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
+  let ParserMatchClass = PPCRegGxRCOperand;
+}
+
+def PPCDispRIOperand : AsmOperandClass {
+ let Name = "DispRI"; let PredicateMethod = "isS16Imm";
+ let RenderMethod = "addS16ImmOperands";
+}
+def dispRI : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIOperand;
+}
+def PPCDispRIXOperand : AsmOperandClass {
+ let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIX : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIXOperand;
+}
+def PPCDispRIX16Operand : AsmOperandClass {
+ let Name = "DispRIX16"; let PredicateMethod = "isS16ImmX16";
+ let RenderMethod = "addImmOperands";
+}
+def dispRIX16 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispRIX16Operand;
+}
+def PPCDispSPE8Operand : AsmOperandClass {
+ let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE8 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE8Operand;
+}
+def PPCDispSPE4Operand : AsmOperandClass {
+ let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE4 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE4Operand;
+}
+def PPCDispSPE2Operand : AsmOperandClass {
+ let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE2 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE2Operand;
+}
+
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIEncoding";
+  let DecoderMethod = "decodeMemRIOperands";
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
+}
+def memrix : Operand<iPTR> {   // memri where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIXEncoding";
+  let DecoderMethod = "decodeMemRIXOperands";
+}
+def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27}
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIX16Encoding";
+  let DecoderMethod = "decodeMemRIX16Operands";
+}
+def spe8dis : Operand<iPTR> {   // SPE displacement where the imm is 8-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE8DisEncoding";
+}
+def spe4dis : Operand<iPTR> {   // SPE displacement where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE4DisEncoding";
+}
+def spe2dis : Operand<iPTR> {   // SPE displacement where the imm is 2-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE2DisEncoding";
+}
+
+// A single-register address. This is used with the SjLj
+// pseudo-instructions.
+def memr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc:$ptrreg);
+}
+def PPCTLSRegOperand : AsmOperandClass {
+  let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
+  let RenderMethod = "addTLSRegOperands";
+}
+def tlsreg32 : Operand<i32> {
+  let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
+}
+def tlsgd32 : Operand<i32> {}
+def tlscall32 : Operand<i32> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
+
+// PowerPC Predicate operand.
+def pred : Operand<OtherVT> {
+  let PrintMethod = "printPredicateOperand";
+  let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
+}
+
+// Define PowerPC specific addressing mode.
+def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
+def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4",  [], []>; // "std"
+
+// The address in a single register. This is used with the SjLj
+// pseudo-instructions.
+def addr   : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def In32BitMode  : Predicate<"!PPCSubTarget->isPPC64()">;
+def In64BitMode  : Predicate<"PPCSubTarget->isPPC64()">;
+def IsBookE  : Predicate<"PPCSubTarget->isBookE()">;
+def IsNotBookE  : Predicate<"!PPCSubTarget->isBookE()">;
+def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
+def HasSYNC   : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
+def IsPPC4xx  : Predicate<"PPCSubTarget->isPPC4xx()">;
+def IsPPC6xx  : Predicate<"PPCSubTarget->isPPC6xx()">;
+def IsE500  : Predicate<"PPCSubTarget->isE500()">;
+def HasSPE  : Predicate<"PPCSubTarget->HasSPE()">;
+def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
+def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
+def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
+def NaNsFPMath   : Predicate<"!TM.Options.NoNaNsFPMath">;
+def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
+def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
+def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Multiclass Definitions.
+
+multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XForm_6<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XForm_10<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XForm_10<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_11<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XForm_11<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+// Multiclass for instructions for which the non record form is not cracked
+// and the record form is cracked (i.e. divw, mullw, etc.)
+multiclass XOForm_1rcr<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel, PPC970_DGroup_First,
+                       PPC970_DGroup_Cracked;
+  }
+}
+
+multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XOForm_1<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XOForm_3<opcode, xo, oe, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MForm_2<opcode, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MForm_2<opcode, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MDForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+                     string asmbase, string asmstr, InstrItinClass itin,
+                     list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : MDSForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                        pattern>, RecFormRel;
+    let Defs = [CR0] in
+    def o    : MDSForm_1<opcode, xo, OOL, IOL,
+                        !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                        []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL,
+                      string asmbase, string asmstr, InstrItinClass itin,
+                      list<dag> pattern> {
+  let BaseName = asmbase in {
+    let Defs = [CARRY] in
+    def NAME : XSForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CARRY, CR0] in
+    def o    : XSForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_26<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_26<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_1<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_2<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_2<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : AForm_3<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : AForm_3<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
+                              [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+                              [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def UPDATE_VRSAVE    : Pseudo<(outs gprc:$rD), (ins gprc:$rS),
+                              "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC",
+                       [(set i32:$result,
+                             (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
+                       [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
+                         
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+  // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
+  // because either operand might become the first operand in an isel, and
+  // that operand cannot be r0.
+  def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond,
+                              gprc_nor0:$T, gprc_nor0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I4",
+                              []>;
+  def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond,
+                              g8rc_nox0:$T, g8rc_nox0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I8",
+                              []>;
+  def SELECT_CC_F4  : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F4",
+                              []>;
+  def SELECT_CC_F8  : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F8",
+                              []>;
+  def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F,
+                              i32imm:$BROPC), "#SELECT_CC_VRRC",
+                              []>;
+
+  // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+  // register bit directly.
+  def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond,
+                          gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4",
+                          [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>;
+  def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond,
+                          g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8",
+                          [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>;
+  def SELECT_F4  : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond,
+                          f4rc:$T, f4rc:$F), "#SELECT_F4",
+                          [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>;
+  def SELECT_F8  : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond,
+                          f8rc:$T, f8rc:$F), "#SELECT_F8",
+                          [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>;
+  def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond,
+                          vrrc:$T, vrrc:$F), "#SELECT_VRRC",
+                          [(set v4i32:$dst,
+                                (select i1:$cond, v4i32:$T, v4i32:$F))]>;
+}
+
+// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
+// scavenge a register for it.
+let mayStore = 1 in {
+def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F),
+                     "#SPILL_CR", []>;
+def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F),
+                         "#SPILL_CRBIT", []>;
+}
+
+// RESTORE_CR - Indicate that we're restoring the CR register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in {
+def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F),
+                     "#RESTORE_CR", []>;
+def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F),
+                           "#RESTORE_CRBIT", []>;
+}
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isReturn = 1, Uses = [LR, RM] in
+    def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB,
+                           [(retflag)]>, Requires<[In32BitMode]>;
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in {
+    def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                            []>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
+                               "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
+                               []>;
+
+      def BCCTR :  XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi),
+                                "bcctr 12, $bi, 0", IIC_BrB, []>;
+      def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi),
+                                "bcctr 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+}
+
+let Defs = [LR] in
+  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
+                   PPC970_Unit_BRU;
+let Defs = [LR] in
+  def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>,
+                    PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let isBarrier = 1 in {
+  def B   : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
+                  "b $dst", IIC_BrB,
+                  [(br bb:$dst)]>;
+  def BA  : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
+                  "ba $dst", IIC_BrB, []>;
+  }
+
+  // BCC represents an arbitrary conditional branch on a predicate.
+  // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  let isCodeGenOnly = 1 in {
+    def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+                    "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
+                    /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+    def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                     "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
+
+    let isReturn = 1, Uses = [LR, RM] in
+    def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
+                           "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>;
+  }
+
+  let isCodeGenOnly = 1 in {
+    let Pattern = [(brcond i1:$bi, bb:$dst)] in
+    def BC  : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+             "bc 12, $bi, $dst">;
+
+    let Pattern = [(brcond (not i1:$bi), bb:$dst)] in
+    def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst),
+             "bc 4, $bi, $dst">;
+
+    let isReturn = 1, Uses = [LR, RM] in
+    def BCLR  : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi),
+                             "bclr 12, $bi, 0", IIC_BrB, []>;
+    def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi),
+                             "bclr 4, $bi, 0", IIC_BrB, []>;
+  }
+
+  let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
+   def BDZLR  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+                             "bdzlr", IIC_BrB, []>;
+   def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+                             "bdnzlr", IIC_BrB, []>;
+   def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
+                             "bdzlr+", IIC_BrB, []>;
+   def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
+                             "bdnzlr+", IIC_BrB, []>;
+   def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
+                             "bdzlr-", IIC_BrB, []>;
+   def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
+                             "bdnzlr-", IIC_BrB, []>;
+  }
+
+  let Defs = [CTR], Uses = [CTR] in {
+    def BDZ  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz $dst">;
+    def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz $dst">;
+    def BDZA  : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza $dst">;
+    def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza $dst">;
+    def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz+ $dst">;
+    def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz+ $dst">;
+    def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza+ $dst">;
+    def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza+ $dst">;
+    def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz- $dst">;
+    def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz- $dst">;
+    def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza- $dst">;
+    def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza- $dst">;
+  }
+}
+
+// The unconditional BCL used by the SjLj setjmp code.
+let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
+  let Defs = [LR], Uses = [RM] in {
+    def BCLalways  : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
+                            "bcl 20, 31, $dst">;
+  }
+}
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                    "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+    def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                    "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
+
+    let isCodeGenOnly = 1 in {
+      def BL_TLS  : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
+                          "bl $func", IIC_BrB, []>;
+      def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
+                       "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
+      def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                        "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
+
+      def BCL  : BForm_4<16, 12, 0, 1, (outs),
+                         (ins crbitrc:$bi, condbrtarget:$dst),
+                         "bcl 12, $bi, $dst">;
+      def BCLn : BForm_4<16, 4, 0, 1, (outs),
+                         (ins crbitrc:$bi, condbrtarget:$dst),
+                         "bcl 4, $bi, $dst">;
+    }
+  }
+  let Uses = [CTR, RM] in {
+    def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                             "bctrl", IIC_BrB, [(PPCbctrl)]>,
+                Requires<[In32BitMode]>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
+                                "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB,
+                                []>;
+
+      def BCCTRL  : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi),
+                                 "bcctrl 12, $bi, 0", IIC_BrB, []>;
+      def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi),
+                                 "bcctrl 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+  let Uses = [LR, RM] in {
+    def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
+                            "blrl", IIC_BrB, []>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
+                              "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB,
+                              []>;
+
+      def BCLRL  : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi),
+                                "bclrl 12, $bi, 0", IIC_BrB, []>;
+      def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi),
+                                "bclrl 4, $bi, 0", IIC_BrB, []>;
+    }
+  }
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def BDZL  : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl $dst">;
+    def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl $dst">;
+    def BDZLA  : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla $dst">;
+    def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla $dst">;
+    def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl+ $dst">;
+    def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl+ $dst">;
+    def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla+ $dst">;
+    def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla+ $dst">;
+    def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl- $dst">;
+    def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl- $dst">;
+    def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla- $dst">;
+    def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla- $dst">;
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in {
+    def BDZLRL  : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
+                               "bdzlrl", IIC_BrB, []>;
+    def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
+                               "bdnzlrl", IIC_BrB, []>;
+    def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
+                               "bdzlrl+", IIC_BrB, []>;
+    def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
+                               "bdnzlrl+", IIC_BrB, []>;
+    def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
+                               "bdzlrl-", IIC_BrB, []>;
+    def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
+                               "bdnzlrl-", IIC_BrB, []>;
+  }
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd $dst $offset",
+                 []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
+                 "#TC_RETURNa $func $offset",
+                 [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
+                 "#TC_RETURNr $dst $offset",
+                 []>;
+
+
+let isCodeGenOnly = 1 in {
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM]  in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+                            []>, Requires<[In32BitMode]>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", IIC_BrB,
+                  []>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
+                  "ba $dst", IIC_BrB,
+                  []>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR] in
+  def EH_SjLj_SetJmp32  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In32BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In32BitMode]>;
+}
+
+// This pseudo is never removed from the function, as it serves as
+// a terminator.  Size is set to 0 to prevent the builtin assembler
+// from emitting it.
+let isBranch = 1, isTerminator = 1, Size = 0 in {
+  def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+// System call.
+let PPC970_Unit = 7 in {
+  def SC     : SCForm<17, 1, (outs), (ins i32imm:$lev),
+                      "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>;
+}
+
+// Branch history rolling buffer.
+def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB,
+                      [(PPCclrbhrb)]>,
+                      PPC970_DGroup_Single;
+// The $dmy argument used for MFBHRBE is not needed; however, including
+// it avoids automatic generation of PPCFastISel::fastEmit_i(), which
+// interferes with necessary special handling (see PPCFastISel.cpp).
+def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$rD),
+                         (ins u10imm:$imm, u10imm:$dmy),
+                         "mfbhrbe $rD, $imm", IIC_BrB,
+                         [(set i32:$rD,
+                               (PPCmfbhrbe imm:$imm, imm:$dmy))]>,
+                         PPC970_DGroup_First;
+
+def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm",
+                     IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>,
+                     PPC970_DGroup_Single;
+
+// DCB* instructions.
+def DCBA   : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBI   : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBST  : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZ   : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZL  : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
+                      IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+
+def DCBF   : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst),
+                      "dcbf $dst, $TH", IIC_LdStDCBF, []>,
+                      PPC970_DGroup_Single;
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in {
+def DCBT   : DCB_Form_hint<278, (outs), (ins u5imm:$TH, memrr:$dst),
+                      "dcbt $dst, $TH", IIC_LdStDCBF, []>,
+                      PPC970_DGroup_Single;
+def DCBTST : DCB_Form_hint<246, (outs), (ins u5imm:$TH, memrr:$dst),
+                      "dcbtst $dst, $TH", IIC_LdStDCBF, []>,
+                      PPC970_DGroup_Single;
+} // hasSideEffects = 0
+
+def ICBT  : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
+                       "icbt $CT, $src", IIC_LdStLoad>, Requires<[HasICBT]>;
+
+def : Pat<(int_ppc_dcbt xoaddr:$dst),
+          (DCBT 0, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbtst xoaddr:$dst),
+          (DCBTST 0, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbf xoaddr:$dst),
+          (DCBF 0, xoaddr:$dst)>;
+
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
+          (DCBT 0, xoaddr:$dst)>;   // data prefetch for loads
+def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
+          (DCBTST 0, xoaddr:$dst)>; // data prefetch for stores
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
+          (ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
+      [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
+      [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
+      [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
+      [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "ATOMIC_LOAD_XOR_I8",
+      [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
+      [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MIN_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I8",
+      [(set i32:$dst, (atomic_load_min_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MAX_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I8",
+      [(set i32:$dst, (atomic_load_max_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMIN_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I8",
+      [(set i32:$dst, (atomic_load_umin_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMAX_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I8",
+      [(set i32:$dst, (atomic_load_umax_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
+      [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
+      [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
+      [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
+      [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
+      [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
+      [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MIN_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I16",
+      [(set i32:$dst, (atomic_load_min_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MAX_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I16",
+      [(set i32:$dst, (atomic_load_max_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMIN_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I16",
+      [(set i32:$dst, (atomic_load_umin_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMAX_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I16",
+      [(set i32:$dst, (atomic_load_umax_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
+      [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
+      [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
+      [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
+      [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
+      [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
+      [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MIN_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MIN_I32",
+      [(set i32:$dst, (atomic_load_min_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_MAX_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_MAX_I32",
+      [(set i32:$dst, (atomic_load_max_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMIN_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMIN_I32",
+      [(set i32:$dst, (atomic_load_umin_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_UMAX_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_UMAX_I32",
+      [(set i32:$dst, (atomic_load_umax_32 xoaddr:$ptr, i32:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
+      [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+    def ATOMIC_SWAP_I8 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_i8",
+      [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I16 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
+      [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I32 : Pseudo<
+      (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
+      [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBARX : XForm_1<31,  52, (outs gprc:$rD), (ins memrr:$src),
+                    "lbarx $rD, $src", IIC_LdStLWARX, []>,
+                    Requires<[HasPartwordAtomics]>;
+
+def LHARX : XForm_1<31,  116, (outs gprc:$rD), (ins memrr:$src),
+                    "lharx $rD, $src", IIC_LdStLWARX, []>,
+                    Requires<[HasPartwordAtomics]>;
+
+def LWARX : XForm_1<31,  20, (outs gprc:$rD), (ins memrr:$src),
+                    "lwarx $rD, $src", IIC_LdStLWARX, []>;
+
+// Instructions to support lock versions of atomics
+// (EH=1 - see Power ISA 2.07 Book II 4.4.2)
+def LBARXL : XForm_1<31,  52, (outs gprc:$rD), (ins memrr:$src),
+                     "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
+                     Requires<[HasPartwordAtomics]>;
+
+def LHARXL : XForm_1<31,  116, (outs gprc:$rD), (ins memrr:$src),
+                     "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT,
+                     Requires<[HasPartwordAtomics]>;
+
+def LWARXL : XForm_1<31,  20, (outs gprc:$rD), (ins memrr:$src),
+                     "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT;
+
+// The atomic instructions use the destination register as well as the next one
+// or two registers in order (modulo 31).
+let hasExtraSrcRegAllocReq = 1 in
+def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
+                         "lwat $rD, $rA, $FC", IIC_LdStLoad>,
+           Requires<[IsISA3_0]>;
+}
+
+let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in {
+def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
+                    "stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
+                    isDOT, Requires<[HasPartwordAtomics]>;
+
+def STHCX : XForm_1<31, 726, (outs), (ins gprc:$rS, memrr:$dst),
+                    "sthcx. $rS, $dst", IIC_LdStSTWCX, []>,
+                    isDOT, Requires<[HasPartwordAtomics]>;
+
+def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+                    "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
+}
+
+let mayStore = 1, hasSideEffects = 0 in
+def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
+                          "stwat $rS, $rA, $FC", IIC_LdStStore>,
+            Requires<[IsISA3_0]>;
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+def TRAP  : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
+
+def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
+                     "twi $to, $rA, $imm", IIC_IntTrapW, []>;
+def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
+                 "tw $to, $rA, $rB", IIC_IntTrapW, []>;
+def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
+                     "tdi $to, $rA, $imm", IIC_IntTrapD, []>;
+def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
+                 "td $to, $rA, $rB", IIC_IntTrapD, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads. 
+let PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
+                  "lbz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src),
+                  "lha $rD, $src", IIC_LdStLHA,
+                  [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src),
+                  "lhz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src),
+                  "lwz $rD, $src", IIC_LdStLoad,
+                  [(set i32:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src),
+                  "lfs $rD, $src", IIC_LdStLFD,
+                  [(set f32:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
+                  "lfd $rD, $src", IIC_LdStLFD,
+                  [(set f64:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+let mayLoad = 1, hasSideEffects = 0 in {
+def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lbzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhau $rD, $addr", IIC_LdStLHAU,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lwzu $rD, $addr", IIC_LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfsu $rD, $addr", IIC_LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfdu $rD, $addr", IIC_LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+
+// Indexed (r+r) Loads with Update (preinc).
+def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhaux $rD, $addr", IIC_LdStLHAUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", IIC_LdStLoadUpdX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfsux $rD, $addr", IIC_LdStLFDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfdux $rD, $addr", IIC_LdStLFDUX,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let PPC970_Unit = 2 in {
+def LBZX : XForm_1<31,  87, (outs gprc:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", IIC_LdStLHA,
+                   [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31,  23, (outs gprc:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (load xaddr:$src))]>;
+                   
+                   
+def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
+def LWBRX : XForm_1<31,  534, (outs gprc:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", IIC_LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
+
+def LFSX   : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src),
+                      "lfsx $frD, $src", IIC_LdStLFD,
+                      [(set f32:$frD, (load xaddr:$src))]>;
+def LFDX   : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfdx $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (load xaddr:$src))]>;
+
+def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfiwax $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
+def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
+                      "lfiwzx $frD, $src", IIC_LdStLFD,
+                      [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
+}
+
+// Load Multiple
+def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
+                  "lmw $rD, $src", IIC_LdStLMW, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
+                   "stb $rS, $src", IIC_LdStStore,
+                   [(truncstorei8 i32:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (outs), (ins gprc:$rS, memri:$src),
+                   "sth $rS, $src", IIC_LdStStore,
+                   [(truncstorei16 i32:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (outs), (ins gprc:$rS, memri:$src),
+                   "stw $rS, $src", IIC_LdStStore,
+                   [(store i32:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst),
+                   "stfs $rS, $dst", IIC_LdStSTFD,
+                   [(store f32:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
+                   "stfd $rS, $dst", IIC_LdStSTFD,
+                   [(store f64:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBU  : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "stbu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "sthu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
+                    "stwu $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst),
+                    "stfsu $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst),
+                    "stfdu $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+// Indexed (r+r) Stores.
+let PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stbx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei8 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst),
+                   "sthx $rS, $dst", IIC_LdStStore,
+                   [(truncstorei16 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stwx $rS, $dst", IIC_LdStStore,
+                   [(store i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+ 
+def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", IIC_LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst),
+                     "stfiwx $frS, $dst", IIC_LdStSTFD,
+                     [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
+                     
+def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst),
+                     "stfsx $frS, $dst", IIC_LdStSTFD,
+                     [(store f32:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
+                     "stfdx $frS, $dst", IIC_LdStSTFD,
+                     [(store f64:$frS, xaddr:$dst)]>;
+}
+
+// Indexed (r+r) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "stbux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "sthux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
+                    "stwux $rS, $dst", IIC_LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst),
+                    "stfsux $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst),
+                    "stfdux $rS, $dst", IIC_LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFDUX $rS, $ptrreg, $ptroff)>;
+
+// Store Multiple
+def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
+                   "stmw $rS, $dst", IIC_LdStLMW, []>;
+
+def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
+                        "sync $L", IIC_LdStSync, []>;
+
+let isCodeGenOnly = 1 in {
+  def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
+                           "msync", IIC_LdStSync, []> {
+    let L = 0;
+  }
+}
+
+def : Pat<(int_ppc_sync),   (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_sync),   (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI   : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
+                     "addi $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
+let BaseName = "addic" in {
+let Defs = [CARRY] in
+def ADDIC  : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "addic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
+                     RecFormRel, PPC970_DGroup_Cracked;
+let Defs = [CARRY, CR0] in
+def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "addic. $rD, $rA, $imm", IIC_IntGeneral,
+                     []>, isDOT, RecFormRel;
+}
+def ADDIS  : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
+                     "addis $rD, $rA, $imm", IIC_IntSimple,
+                     [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
+let isCodeGenOnly = 1 in
+def LA     : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
+                     "la $rD, $sym($rA)", IIC_IntGeneral,
+                     [(set i32:$rD, (add i32:$rA,
+                                          (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI  : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "mulli $rD, $rA, $imm", IIC_IntMulLI,
+                     [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
+let Defs = [CARRY] in
+def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
+                     "subfic $rD, $rA, $imm", IIC_IntGeneral,
+                     [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+  def LI  : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
+                       "li $rD, $imm", IIC_IntSimple,
+                       [(set i32:$rD, imm32SExt16:$imm)]>;
+  def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
+                       "lis $rD, $imm", IIC_IntSimple,
+                       [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let Defs = [CR0] in {
+def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "andi. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
+                    isDOT;
+def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "andis. $dst, $src1, $src2", IIC_IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
+                    isDOT;
+}
+def ORI   : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "ori $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
+def ORIS  : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "oris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI  : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "xori $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
+                    "xoris $dst, $src1, $src2", IIC_IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
+
+def NOP   : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple,
+                         []>;
+let isCodeGenOnly = 1 in {
+// The POWER6 and POWER7 have special group-terminating nops.
+def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins),
+                                        "ori 1, 1, 0", IIC_IntSimple, []>;
+def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins),
+                                        "ori 2, 2, 0", IIC_IntSimple, []>;
+}
+
+let isCompare = 1, hasSideEffects = 0 in {
+  def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm),
+                          "cmpwi $crD, $rA, $imm", IIC_IntCompare>;
+  def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2),
+                           "cmplwi $dst, $src1, $src2", IIC_IntCompare>;
+  def CMPRB  : X_BF3_L1_RS5_RS5<31, 192, (outs crbitrc:$BF),
+                                (ins u1imm:$L, g8rc:$rA, g8rc:$rB),
+                                "cmprb $BF, $L, $rA, $rB", IIC_IntCompare, []>,
+               Requires<[IsISA3_0]>;
+}
+}
+
+let PPC970_Unit = 1, hasSideEffects = 0 in {  // FXU Operations.
+let isCommutable = 1 in {
+defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "nand", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
+defm AND  : XForm_6r<31,  28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "and", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (and i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm ANDC : XForm_6r<31,  60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "andc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm OR   : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "or", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (or i32:$rS, i32:$rB))]>;
+defm NOR  : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "nor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
+} // isCommutable
+defm ORC  : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "orc", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
+let isCommutable = 1 in {
+defm EQV  : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "eqv", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
+defm XOR  : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "xor", "$rA, $rS, $rB", IIC_IntSimple,
+                     [(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
+} // isCommutable
+defm SLW  : XForm_6r<31,  24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "slw", "$rA, $rS, $rB", IIC_IntGeneral,
+                     [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
+defm SRW  : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                     "srw", "$rA, $rS, $rB", IIC_IntGeneral,
+                     [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
+defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                      "sraw", "$rA, $rS, $rB", IIC_IntShift,
+                      [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let hasSideEffects = 0 in {
+defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
+                        "srawi", "$rA, $rS, $SH", IIC_IntShift,
+                        [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
+defm CNTLZW : XForm_11r<31,  26, (outs gprc:$rA), (ins gprc:$rS),
+                        "cntlzw", "$rA, $rS", IIC_IntGeneral,
+                        [(set i32:$rA, (ctlz i32:$rS))]>;
+defm CNTTZW : XForm_11r<31, 538, (outs gprc:$rA), (ins gprc:$rS),
+                        "cnttzw", "$rA, $rS", IIC_IntGeneral,
+                        [(set i32:$rA, (cttz i32:$rS))]>, Requires<[IsISA3_0]>;
+defm EXTSB  : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS),
+                        "extsb", "$rA, $rS", IIC_IntSimple,
+                        [(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
+defm EXTSH  : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS),
+                        "extsh", "$rA, $rS", IIC_IntSimple,
+                        [(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
+
+let isCommutable = 1 in
+def CMPB : XForm_6<31, 508, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
+                   "cmpb $rA, $rS, $rB", IIC_IntGeneral,
+                   [(set i32:$rA, (PPCcmpb i32:$rS, i32:$rB))]>;
+}
+let isCompare = 1, hasSideEffects = 0 in {
+  def CMPW   : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+                            "cmpw $crD, $rA, $rB", IIC_IntCompare>;
+  def CMPLW  : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB),
+                            "cmplw $crD, $rA, $rB", IIC_IntCompare>;
+}
+}
+let PPC970_Unit = 3 in {  // FPU Operations.
+//def FCMPO  : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+//                      "fcmpo $crD, $fA, $fB", IIC_FPCompare>;
+let isCompare = 1, hasSideEffects = 0 in {
+  def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB),
+                        "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
+                        "fcmpu $crD, $fA, $fB", IIC_FPCompare>;
+}
+
+let Uses = [RM] in {
+  let hasSideEffects = 0 in {
+  defm FCTIW  : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiw", "$frD, $frB", IIC_FPGeneral,
+                          []>;
+  defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiwz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
+
+  defm FRSP   : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
+                          "frsp", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (fpround f64:$frB))]>;
+
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIND  : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frin", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (fround f64:$frB))]>;
+  defm FRINS  : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frin", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (fround f32:$frB))]>;
+  }
+
+  let hasSideEffects = 0 in {
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIPD  : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frip", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (fceil f64:$frB))]>;
+  defm FRIPS  : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frip", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (fceil f32:$frB))]>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIZD  : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "friz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (ftrunc f64:$frB))]>;
+  defm FRIZS  : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "friz", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (ftrunc f32:$frB))]>;
+  let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+  defm FRIMD  : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frim", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (ffloor f64:$frB))]>;
+  defm FRIMS  : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frim", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (ffloor f32:$frB))]>;
+
+  defm FSQRT  : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fsqrt", "$frD, $frB", IIC_FPSqrtD,
+                          [(set f64:$frD, (fsqrt f64:$frB))]>;
+  defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fsqrts", "$frD, $frB", IIC_FPSqrtS,
+                          [(set f32:$frD, (fsqrt f32:$frB))]>;
+  }
+  }
+}
+
+/// Note that FMR is defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+let hasSideEffects = 0 in
+defm FMR   : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
+                       "fmr", "$frD, $frB", IIC_FPGeneral,
+                       []>,  // (set f32:$frD, f32:$frB)
+                       PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3, hasSideEffects = 0 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+defm FABSS  : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fabs f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FABSD  : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fabs f64:$frB))]>;
+defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fnabs", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+defm FNEGS  : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fneg f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNEGD  : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fneg", "$frD, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fneg f64:$frB))]>;
+
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                        [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+                        [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
+// Reciprocal estimates.
+defm FRE      : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fre", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfre f64:$frB))]>;
+defm FRES     : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "fres", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfre f32:$frB))]>;
+defm FRSQRTE  : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "frsqrte", "$frD, $frB", IIC_FPGeneral,
+                          [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
+                          "frsqrtes", "$frD, $frB", IIC_FPGeneral,
+                          [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions.  condition register logical ops.
+//
+let hasSideEffects = 0 in
+def MCRF   : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
+                      "mcrf $BF, $BFA", IIC_BrMCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// FIXME: According to the ISA (section 2.5.1 of version 2.06), the
+// condition-register logical instructions have preferred forms. Specifically,
+// it is preferred that the bit specified by the BT field be in the same
+// condition register as that specified by the bit BB. We might want to account
+// for this via hinting the register allocator and anti-dep breakers, or we
+// could constrain the register class to force this constraint and then loosen
+// it during register allocation via convertToThreeAddress or some similar
+// mechanism.
+
+let isCommutable = 1 in {
+def CRAND  : XLForm_1<19, 257, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crand $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>;
+
+def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnand $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>;
+
+def CROR   : XLForm_1<19, 449, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "cror $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>;
+
+def CRXOR  : XLForm_1<19, 193, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crxor $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>;
+
+def CRNOR  : XLForm_1<19, 33, (outs crbitrc:$CRD),
+                              (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnor $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>;
+
+def CREQV  : XLForm_1<19, 289, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "creqv $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>;
+} // isCommutable
+
+def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crandc $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>;
+
+def CRORC  : XLForm_1<19, 417, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crorc $CRD, $CRA, $CRB", IIC_BrCR,
+                      [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>;
+
+let isCodeGenOnly = 1 in {
+def CRSET  : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
+              "creqv $dst, $dst, $dst", IIC_BrCR,
+              [(set i1:$dst, 1)]>;
+
+def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins),
+              "crxor $dst, $dst, $dst", IIC_BrCR,
+              [(set i1:$dst, 0)]>;
+
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET  : XLForm_1_ext<19, 289, (outs), (ins),
+              "creqv 6, 6, 6", IIC_BrCR,
+              [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+              "crxor 6, 6, 6", IIC_BrCR,
+              [(PPCcr6unset)]>;
+}
+}
+
+// XFX-Form instructions.  Instructions that deal with SPRs.
+//
+
+def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
+                      "mfspr $RT, $SPR", IIC_SprMFSPR>;
+def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
+                      "mtspr $SPR, $RT", IIC_SprMTSPR>;
+
+def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
+                     "mftb $RT, $SPR", IIC_SprMFTB>;
+
+// A pseudo-instruction used to implement the read of the 64-bit cycle counter
+// on a 32-bit target.
+let hasSideEffects = 1, usesCustomInserter = 1 in
+def ReadTB : Pseudo<(outs gprc:$lo, gprc:$hi), (ins),
+                    "#ReadTB", []>;
+
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
+                          "mfctr $rT", IIC_SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+                          "mtctr $rS", IIC_SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
+let Pattern = [(int_ppc_mtctr i32:$rS)] in
+def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+                              "mtctr $rS", IIC_SprMTSPR>,
+                PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR  : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
+                          "mtlr $rS", IIC_SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR  : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
+                          "mflr $rT", IIC_SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let isCodeGenOnly = 1 in {
+  // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
+  // like a GPR on the PPC970.  As such, copies in and out have the same
+  // performance characteristics as an OR instruction.
+  def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
+                               "mtspr 256, $rS", IIC_IntGeneral>,
+                 PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
+                               "mfspr $rT, 256", IIC_IntGeneral>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+
+  def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
+                                (outs VRSAVERC:$reg), (ins gprc:$rS),
+                                "mtspr 256, $rS", IIC_IntGeneral>,
+                  PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT),
+                                (ins VRSAVERC:$reg),
+                                "mfspr $rT, 256", IIC_IntGeneral>,
+                  PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Aliases for mtvrsave/mfvrsave to mfspr/mtspr.
+def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>;
+def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>;
+
+// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
+// so we'll need to scavenge a register for it.
+let mayStore = 1 in
+def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
+                     "#SPILL_VRSAVE", []>;
+
+// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in
+def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
+                     "#RESTORE_VRSAVE", []>;
+
+let hasSideEffects = 0 in {
+// mtocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraDefRegAllocReq = 1 in {
+def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
+                       "mtocrf $FXM, $ST", IIC_BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that
+// is dependent on the cr fields being set.
+def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
+                      "mtcrf $FXM, $rS", IIC_BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraDefRegAllocReq = 1
+
+// mfocrf's input needs to be prepared by shifting by an amount dependent
+// on the cr register selected. Thus, post-ra anti-dep breaking must not
+// later change that register assignment.
+let hasExtraSrcRegAllocReq = 1 in {
+def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
+                       "mfocrf $rT, $FXM", IIC_SprMFCRF>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that
+// is dependent on the cr fields being copied.
+def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
+                     "mfcr $rT", IIC_SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+} // hasExtraSrcRegAllocReq = 1
+
+def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
+                   "mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
+} // hasSideEffects = 0
+
+// Pseudo instruction to perform FADD in round-to-zero mode.
+let usesCustomInserter = 1, Uses = [RM] in {
+  def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "",
+                      [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
+}
+
+// The above pseudo gets expanded to make use of the following instructions
+// to manipulate FPSCR.  Note that FPSCR is not modeled at the DAG level.
+let Uses = [RM], Defs = [RM] in { 
+  def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+                        "mtfsb0 $FM", IIC_IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+                        "mtfsb1 $FM", IIC_IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  let isCodeGenOnly = 1 in
+  def MTFSFb  : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT),
+                        "mtfsf $FM, $rT", IIC_IntMTFSB0, []>,
+                PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+  def MFFS   : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+                         "mffs $rT", IIC_IntMFFS,
+                         [(set f64:$rT, (PPCmffs))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+
+  let Defs = [CR1] in
+  def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins),
+                      "mffs. $rT", IIC_IntMFFS, []>, isDOT;
+}
+
+
+let PPC970_Unit = 1, hasSideEffects = 0 in {  // FXU Operations.
+// XO-Form instructions.  Arithmetic instructions that can set overflow bit
+let isCommutable = 1 in
+defm ADD4  : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "add", "$rT, $rA, $rB", IIC_IntSimple,
+                       [(set i32:$rT, (add i32:$rA, i32:$rB))]>;
+let isCodeGenOnly = 1 in
+def ADD4TLS  : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB),
+                       "add $rT, $rA, $rB", IIC_IntSimple,
+                       [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>;
+let isCommutable = 1 in
+defm ADDC  : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "addc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
+                        PPC970_DGroup_Cracked;
+
+defm DIVW  : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                          "divw", "$rT, $rA, $rB", IIC_IntDivW,
+                          [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>;
+defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                          "divwu", "$rT, $rA, $rB", IIC_IntDivW,
+                          [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>;
+def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                     "divwe $rT, $rA, $rB", IIC_IntDivW,
+                     [(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>,
+                     Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                      "divwe. $rT, $rA, $rB", IIC_IntDivW,
+                      []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+                      Requires<[HasExtDiv]>;
+def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                      "divweu $rT, $rA, $rB", IIC_IntDivW,
+                      [(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>,
+                      Requires<[HasExtDiv]>;
+let Defs = [CR0] in
+def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "divweu. $rT, $rA, $rB", IIC_IntDivW,
+                       []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
+                       Requires<[HasExtDiv]>;
+let isCommutable = 1 in {
+defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mulhw", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
+defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU,
+                       [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
+defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "mullw", "$rT, $rA, $rB", IIC_IntMulHW,
+                       [(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
+} // isCommutable
+defm SUBF  : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                       "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+                       [(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
+defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
+                        PPC970_DGroup_Cracked;
+defm NEG    : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA),
+                        "neg", "$rT, $rA", IIC_IntSimple,
+                        [(set i32:$rT, (ineg i32:$rA))]>;
+let Uses = [CARRY] in {
+let isCommutable = 1 in
+defm ADDE  : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "adde", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
+defm ADDME  : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "addme", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (adde i32:$rA, -1))]>;
+defm ADDZE  : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "addze", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (adde i32:$rA, 0))]>;
+defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+                        "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
+defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "subfme", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (sube -1, i32:$rA))]>;
+defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
+                         "subfze", "$rT, $rA", IIC_IntGeneral,
+                         [(set i32:$rT, (sube 0, i32:$rA))]>;
+}
+}
+
+// A-Form instructions.  Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3, hasSideEffects = 0 in {  // FPU Operations.
+let Uses = [RM] in {
+let isCommutable = 1 in {
+  defm FMADD : AForm_1r<63, 29, 
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+  defm FMADDS : AForm_1r<59, 29,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+  defm FMSUB : AForm_1r<63, 28,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                            (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+  defm FMSUBS : AForm_1r<59, 28,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                            (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+  defm FNMADD : AForm_1r<63, 31,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT,
+                            (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+  defm FNMADDS : AForm_1r<59, 31,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT,
+                            (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+  defm FNMSUB : AForm_1r<63, 30,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+                                                 (fneg f64:$FRB))))]>;
+  defm FNMSUBS : AForm_1r<59, 30,
+                      (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+                                                 (fneg f32:$FRB))))]>;
+} // isCommutable
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result.  To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float)
+// and 4/8 byte forms for the result and operand type..
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FSELD : AForm_1r<63, 23,
+                      (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+                      "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
+defm FSELS : AForm_1r<63, 23,
+                      (outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+                      "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+                      [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
+let Uses = [RM] in {
+  let isCommutable = 1 in {
+  defm FADD  : AForm_2r<63, 21,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+                        [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
+  defm FADDS : AForm_2r<59, 21,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+  } // isCommutable
+  defm FDIV  : AForm_2r<63, 18,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
+                        [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
+  defm FDIVS : AForm_2r<59, 18,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
+                        [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+  let isCommutable = 1 in {
+  defm FMUL  : AForm_3r<63, 25,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
+                        "fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
+                        [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
+  defm FMULS : AForm_3r<59, 25,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
+                        "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
+                        [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+  } // isCommutable
+  defm FSUB  : AForm_2r<63, 20,
+                        (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
+                        "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
+                        [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
+  defm FSUBS : AForm_2r<59, 20,
+                        (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
+                        "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
+  }
+}
+
+let hasSideEffects = 0 in {
+let PPC970_Unit = 1 in {  // FXU Operations.
+  let isSelect = 1 in
+  def ISEL  : AForm_4<31, 15,
+                     (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond),
+                     "isel $rT, $rA, $rB, $cond", IIC_IntISEL,
+                     []>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+// M-Form instructions.  rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+defm RLWIMI : MForm_2r<20, (outs gprc:$rA),
+                       (ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB,
+                       u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
+                       IIC_IntRotate, []>, PPC970_DGroup_Cracked,
+                       RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
+}
+let BaseName = "rlwinm" in {
+def RLWINM : MForm_2<21,
+                     (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                     []>, RecFormRel;
+let Defs = [CR0] in
+def RLWINMo : MForm_2<21,
+                      (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                      "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
+                      []>, isDOT, RecFormRel, PPC970_DGroup_Cracked;
+}
+defm RLWNM  : MForm_2r<23, (outs gprc:$rA),
+                       (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME),
+                       "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
+                       []>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support.  Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+          (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Implement the 'not' operation with the NOR instruction.
+def i32not : OutPatFrag<(ops node:$in),
+                        (NOR $in, $in)>;
+def        : Pat<(not i32:$in),
+                 (i32not $in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add i32:$in, imm:$imm),
+          (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
+// OR an arbitrary immediate.
+def : Pat<(or i32:$in, imm:$imm),
+          (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor i32:$in, imm:$imm),
+          (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub imm32SExt16:$imm, i32:$in),
+          (SUBFIC $in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;
+
+// ROTL
+def : Pat<(rotl i32:$in, i32:$sh),
+          (RLWNM $in, $sh, 0, 31)>;
+def : Pat<(rotl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
+          (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls
+def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
+          (BL tglobaladdr:$dst)>;
+def : Pat<(PPCcall (i32 texternalsym:$dst)),
+          (BL texternalsym:$dst)>;
+
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+          (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in),
+          (ADDIS $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in),
+          (ADDI $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS $in, tglobaladdr:$g)>;
+def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS $in, tconstpool:$g)>;
+def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS $in, tjumptable:$g)>;
+def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS $in, tblockaddress:$g)>;
+
+// Support for thread-local storage.
+def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", 
+                [(set i32:$rD, (PPCppc32GOT))]>;
+
+// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
+// This uses two output registers, the first as the real output, the second as a
+// temporary register, used internally in code generation.
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", 
+                []>, NoEncode<"$rT">;
+
+def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+                           "#LDgotTprelL32",
+                           [(set i32:$rD,
+                             (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
+          (ADD4TLS $in, tglobaltlsaddr:$g)>;
+
+def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                         "#ADDItlsgdL32",
+                         [(set i32:$rD,
+                           (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                          "GETtlsADDR32",
+                          [(set i32:$rD,
+                            (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsgdL32 and GETtlsADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsgdLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsgdLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsgdLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
+def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                          "#ADDItlsldL32",
+                          [(set i32:$rD,
+                            (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+// LR is a true define, while the rest of the Defs are clobbers.  R3 is
+// explicitly defined when this op is created, so not mentioned here.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym),
+                            "GETtlsldADDR32",
+                            [(set i32:$rD,
+                              (PPCgetTlsldAddr i32:$reg,
+                                               tglobaltlsaddr:$sym))]>;
+// Combined op for ADDItlsldL32 and GETtlsADDR32, late expanded.  R3 and LR
+// are true defines while the rest of the Defs are clobbers.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+    Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,LR,CTR,CR0,CR1,CR5,CR6,CR7] in
+def ADDItlsldLADDR32 : Pseudo<(outs gprc:$rD),
+                              (ins gprc_nor0:$reg, s16imm:$disp, tlsgd32:$sym),
+                              "#ADDItlsldLADDR32",
+                              [(set i32:$rD,
+                                (PPCaddiTlsldLAddr i32:$reg,
+                                                   tglobaltlsaddr:$disp,
+                                                   tglobaltlsaddr:$sym))]>;
+def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                           "#ADDIdtprelL32",
+                           [(set i32:$rD,
+                             (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                            "#ADDISdtprelHA32",
+                            [(set i32:$rD,
+                              (PPCaddisDtprelHA i32:$reg,
+                                                tglobaltlsaddr:$disp))]>;
+
+// Support for Position-independent code
+def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+                   "#LWZtoc",
+                   [(set i32:$rD,
+                      (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+// Get Global (GOT) Base Register offset, from the word immediately preceding
+// the function label.
+def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra i32:$rS, i32:$rB),
+          (SRAW $rS, $rB)>;
+def : Pat<(srl i32:$rS, i32:$rB),
+          (SRW $rS, $rB)>;
+def : Pat<(shl i32:$rS, i32:$rB),
+          (SLW $rS, $rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX xaddr:$src)>;
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+          (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+          (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+
+def : Pat<(f64 (fpextend f32:$src)),
+          (COPY_TO_REGCLASS $src, F8RC)>;
+
+// Only seq_cst fences require the heavyweight sync (SYNC 0).
+// All others can use the lightweight sync (SYNC 1).
+// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+// The rule for seq_cst is duplicated to work with both 64 bits and 32 bits
+// versions of Power.
+def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm),   (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+          (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+          (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstrSPE.td"
+include "PPCInstr64Bit.td"
+include "PPCInstrVSX.td"
+include "PPCInstrQPX.td"
+include "PPCInstrHTM.td"
+
+def crnot : OutPatFrag<(ops node:$in),
+                       (CRNOR $in, $in)>;
+def       : Pat<(not i1:$in),
+                (crnot $in)>;
+
+// Patterns for arithmetic i1 operations.
+def : Pat<(add i1:$a, i1:$b),
+          (CRXOR $a, $b)>;
+def : Pat<(sub i1:$a, i1:$b),
+          (CRXOR $a, $b)>;
+def : Pat<(mul i1:$a, i1:$b),
+          (CRAND $a, $b)>;
+
+// We're sometimes asked to materialize i1 -1, which is just 1 in this case
+// (-1 is used to mean all bits set).
+def : Pat<(i1 -1), (CRSET)>;
+
+// i1 extensions, implemented in terms of isel.
+def : Pat<(i32 (zext i1:$in)),
+          (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i32 (sext i1:$in)),
+          (SELECT_I4 $in, (LI -1), (LI 0))>;
+
+def : Pat<(i64 (zext i1:$in)),
+          (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+def : Pat<(i64 (sext i1:$in)),
+          (SELECT_I8 $in, (LI8 -1), (LI8 0))>;
+
+// FIXME: We should choose either a zext or a sext based on other constants
+// already around.
+def : Pat<(i32 (anyext i1:$in)),
+          (SELECT_I4 $in, (LI 1), (LI 0))>;
+def : Pat<(i64 (anyext i1:$in)),
+          (SELECT_I8 $in, (LI8 1), (LI8 0))>;
+
+// match setcc on i1 variables.
+// CRANDC is:
+//   1 1 : F
+//   1 0 : T
+//   0 1 : F
+//   0 0 : F
+//
+// LT is:
+//  -1 -1  : F
+//  -1  0  : T
+//   0 -1  : F
+//   0  0  : F
+//
+// ULT is:
+//   1 1 : F
+//   1 0 : F
+//   0 1 : T
+//   0 0 : F
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)),
+          (CRANDC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)),
+          (CRANDC $s2, $s1)>;
+// CRORC is:
+//   1 1 : T
+//   1 0 : T
+//   0 1 : F
+//   0 0 : T
+//
+// LE is:
+//  -1 -1 : T
+//  -1  0 : T
+//   0 -1 : F
+//   0  0 : T
+//
+// ULE is:
+//   1 1 : T
+//   1 0 : F
+//   0 1 : T
+//   0 0 : T
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)),
+          (CRORC $s1, $s2)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)),
+          (CRORC $s2, $s1)>;
+
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)),
+          (CREQV $s1, $s2)>;
+
+// GE is:
+//  -1 -1 : T
+//  -1  0 : F
+//   0 -1 : T
+//   0  0 : T
+//
+// UGE is:
+//   1 1 : T
+//   1 0 : T
+//   0 1 : F
+//   0 0 : T
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)),
+          (CRORC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)),
+          (CRORC $s1, $s2)>;
+
+// GT is:
+//  -1 -1 : F
+//  -1  0 : F
+//   0 -1 : T
+//   0  0 : F
+//
+// UGT is:
+//  1 1 : F
+//  1 0 : T
+//  0 1 : F
+//  0 0 : F
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)),
+          (CRANDC $s2, $s1)>;
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)),
+          (CRANDC $s1, $s2)>;
+
+def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)),
+          (CRXOR $s1, $s2)>;
+
+// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE,
+// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for
+// floating-point types.
+
+multiclass CRNotPat<dag pattern, dag result> {
+  def : Pat<pattern, (crnot result)>;
+  def : Pat<(not pattern), result>;
+
+  // We can also fold the crnot into an extension:
+  def : Pat<(i32 (zext pattern)),
+            (SELECT_I4 result, (LI 0), (LI 1))>;
+  def : Pat<(i32 (sext pattern)),
+            (SELECT_I4 result, (LI 0), (LI -1))>;
+
+  // We can also fold the crnot into an extension:
+  def : Pat<(i64 (zext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 1))>;
+  def : Pat<(i64 (sext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 -1))>;
+
+  // FIXME: We should choose either a zext or a sext based on other constants
+  // already around.
+  def : Pat<(i32 (anyext pattern)),
+            (SELECT_I4 result, (LI 0), (LI 1))>;
+
+  def : Pat<(i64 (anyext pattern)),
+            (SELECT_I8 result, (LI8 0), (LI8 1))>;
+}
+
+// FIXME: Because of what seems like a bug in TableGen's type-inference code,
+// we need to write imm:$imm in the output patterns below, not just $imm, or
+// else the resulting matcher will not correctly add the immediate operand
+// (making it a register operand instead).
+
+// extended SETCC.
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag,
+                       OutPatFrag rfrag, OutPatFrag rfrag8> {
+  def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))),
+            (rfrag $s1)>;
+  def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))),
+            (rfrag8 $s1)>;
+  def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+  def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+
+  def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))),
+            (rfrag $s1)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))),
+            (rfrag8 $s1)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>;
+  def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>;
+}
+
+// Note that we do all inversions below with i(32|64)not, instead of using
+// (xori x, 1) because on the A2 nor has single-cycle latency while xori
+// has 2-cycle latency.
+
+defm : ExtSetCCPat<SETEQ,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (CNTLZW $in), 27, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (CNTLZD $in), 58, 63)> >;
+ 
+defm : ExtSetCCPat<SETNE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
+                 
+defm : ExtSetCCPat<SETLT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM $in, 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL $in, 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, 0, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >;
+
+defm : ExtSetCCPat<SETGT,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM (i32not $in), 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL (i64not $in), 1, 63)> >;
+
+defm : ExtSetCCPat<SETLE,
+                   PatFrag<(ops node:$in, node:$cc),
+                           (setcc $in, -1, $cc)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLWINM $in, 1, 31, 31)>,
+                   OutPatFrag<(ops node:$in),
+                              (RLDICL $in, 1, 63)> >;
+
+// An extended SETCC with shift amount.
+multiclass ExtSetCCShiftPat<CondCode cc, PatFrag pfrag,
+                            OutPatFrag rfrag, OutPatFrag rfrag8> {
+  def : Pat<(i32 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+            (rfrag $s1, $sa)>;
+  def : Pat<(i64 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+            (rfrag8 $s1, $sa)>;
+  def : Pat<(i64 (zext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
+  def : Pat<(i32 (zext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
+
+  def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+            (rfrag $s1, $sa)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+            (rfrag8 $s1, $sa)>;
+  def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, i32:$sa, cc)))),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1, $sa), sub_32)>;
+  def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, i32:$sa, cc)))),
+            (EXTRACT_SUBREG (rfrag8 $s1, $sa), sub_32)>;
+}
+
+defm : ExtSetCCShiftPat<SETNE,
+                        PatFrag<(ops node:$in, node:$sa, node:$cc),
+                                (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
+                        OutPatFrag<(ops node:$in, node:$sa),
+                                   (RLWNM $in, (SUBFIC $sa, 32), 31, 31)>,
+                        OutPatFrag<(ops node:$in, node:$sa),
+                                   (RLDCL $in, (SUBFIC $sa, 64), 63)> >;
+
+defm : ExtSetCCShiftPat<SETEQ,
+                        PatFrag<(ops node:$in, node:$sa, node:$cc),
+                                (setcc (and $in, (shl 1, $sa)), 0, $cc)>,
+                        OutPatFrag<(ops node:$in, node:$sa),
+                                   (RLWNM (i32not $in),
+                                          (SUBFIC $sa, 32), 31, 31)>,
+                        OutPatFrag<(ops node:$in, node:$sa),
+                                   (RLDCL (i64not $in),
+                                          (SUBFIC $sa, 64), 63)> >;
+
+// SETCC for i32.
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpw cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmplwi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// For non-equality comparisons, the default code would materialize the
+// constant, then compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpd cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmpldi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)),
+          (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)),
+                (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>;
+
+// SETCC for f64.
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)),
+          (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>;
+defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)),
+                (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>;
+
+// match select on i1 variables:
+def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)),
+          (CROR (CRAND        $cond , $tval),
+                (CRAND (crnot $cond), $fval))>;
+
+// match selectcc on i1 variables:
+//   select (lhs == rhs), tval, fval is:
+//   ((lhs == rhs) & tval) | (!(lhs == rhs) & fval)
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)),
+           (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+                 (CRAND (CRORC  $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULT)),
+           (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+                 (CRAND (CRORC  $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)),
+           (CROR (CRAND (CRORC  $lhs, $rhs), $tval),
+                 (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETULE)),
+           (CROR (CRAND (CRORC  $rhs, $lhs), $tval),
+                 (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)),
+           (CROR (CRAND (CREQV $lhs, $rhs), $tval),
+                 (CRAND (CRXOR $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)),
+           (CROR (CRAND (CRORC  $rhs, $lhs), $tval),
+                 (CRAND (CRANDC $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGE)),
+           (CROR (CRAND (CRORC  $lhs, $rhs), $tval),
+                 (CRAND (CRANDC $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)),
+           (CROR (CRAND (CRANDC $rhs, $lhs), $tval),
+                 (CRAND (CRORC  $lhs, $rhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETUGT)),
+           (CROR (CRAND (CRANDC $lhs, $rhs), $tval),
+                 (CRAND (CRORC  $rhs, $lhs), $fval))>;
+def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)),
+           (CROR (CRAND (CREQV $lhs, $rhs), $fval),
+                 (CRAND (CRXOR $lhs, $rhs), $tval))>;
+
+// match selectcc on i1 variables with non-i1 output.
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)),
+          (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULT)),
+          (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)),
+          (SELECT_I4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETULE)),
+          (SELECT_I4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)),
+          (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)),
+          (SELECT_I4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGE)),
+          (SELECT_I4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)),
+          (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETUGT)),
+          (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)),
+          (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)),
+          (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULT)),
+          (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)),
+          (SELECT_I8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETULE)),
+          (SELECT_I8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)),
+          (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)),
+          (SELECT_I8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGE)),
+          (SELECT_I8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)),
+          (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETUGT)),
+          (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)),
+          (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+          (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+          (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+          (SELECT_F4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+          (SELECT_F4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+          (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+          (SELECT_F4 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+          (SELECT_F4 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+          (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+          (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+          (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+          (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+          (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+          (SELECT_F8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+          (SELECT_F8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+          (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+          (SELECT_F8 (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+          (SELECT_F8 (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+          (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+          (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+          (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)),
+          (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULT)),
+          (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)),
+          (SELECT_VRRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETULE)),
+          (SELECT_VRRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)),
+          (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)),
+          (SELECT_VRRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGE)),
+          (SELECT_VRRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)),
+          (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)),
+          (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)),
+          (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+let usesCustomInserter = 1 in {
+def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+                             "#ANDIo_1_EQ_BIT",
+                             [(set i1:$dst, (trunc (not i32:$in)))]>;
+def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in),
+                             "#ANDIo_1_GT_BIT",
+                             [(set i1:$dst, (trunc i32:$in))]>;
+
+def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+                              "#ANDIo_1_EQ_BIT8",
+                              [(set i1:$dst, (trunc (not i64:$in)))]>;
+def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in),
+                              "#ANDIo_1_GT_BIT8",
+                              [(set i1:$dst, (trunc i64:$in))]>;
+}
+
+def : Pat<(i1 (not (trunc i32:$in))),
+           (ANDIo_1_EQ_BIT $in)>;
+def : Pat<(i1 (not (trunc i64:$in))),
+           (ANDIo_1_EQ_BIT8 $in)>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instructions used for assembler/disassembler only
+//
+
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def LSWI  : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
+                            "lswi $RT, $A, $B", IIC_LdStLoad, []>;
+
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
+                            "stswi $RT, $A, $B", IIC_LdStLoad, []>;
+
+def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
+                         "isync", IIC_SprISYNC, []>;
+
+def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
+                    "icbi $src", IIC_LdStICBI, []>;
+
+// We used to have EIEIO as value but E[0-9A-Z] is a reserved name
+def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+                           "eieio", IIC_LdStLoad, []>;
+
+def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+                         "wait $L", IIC_LdStLoad, []>;
+
+def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
+                         "mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR),
+            "mtsr $SR, $RS", IIC_SprMTSR>;
+
+def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR),
+            "mfsr $RS, $SR", IIC_SprMFSR>;
+
+def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
+            "mtsrin $RS, $RB", IIC_SprMTSR>;
+
+def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
+            "mfsrin $RS, $RB", IIC_SprMFSR>;
+
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsr $RS, $L", IIC_SprMTMSR>;
+
+def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
+                    "wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> {
+  let L = 0;
+}
+
+def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
+              Requires<[IsBookE]> {
+  bits<1> E;
+
+  let Inst{16} = E;
+  let Inst{21-30} = 163;
+}
+
+def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B),
+               "dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B),
+               "iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+                  "mfmsr $RT", IIC_SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsrd $RS, $L", IIC_SprMTMSRD>;
+
+def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA),
+                     "mcrfs $BF, $BFA", IIC_BrMCR>;
+
+def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+                      "mtfsfi $BF, $U, $W", IIC_IntMFFS>;
+
+def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W),
+                       "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT;
+
+def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>;
+def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>;
+
+def MTFSF : XFLForm_1<63, 711, (outs),
+                      (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+                      "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>;
+def MTFSFo : XFLForm_1<63, 711, (outs),
+                       (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W),
+                       "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT;
+
+def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+                        "slbie $RB", IIC_SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+                    "slbmte $RS, $RB", IIC_SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+                       "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>;
+
+def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB),
+                       "slbmfev $RT, $RB", IIC_SprSLBMFEV, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
+
+def TLBIA : XForm_0<31, 370, (outs), (ins),
+                        "tlbia", IIC_SprTLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+                        "tlbsync", IIC_SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+                          "tlbiel $RB", IIC_SprTLBIEL, []>;
+
+def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB),
+                          "tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
+                          "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+                          "tlbie $RB,$RS", IIC_SprTLBIE, []>;
+
+def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B",
+                IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B",
+                IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBRE : XForm_24_eieio<31, 946, (outs), (ins),
+                           "tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBWE : XForm_24_eieio<31, 978, (outs), (ins),
+                           "tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS),
+               "tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS),
+               "tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "tlbsx $RST, $A, $B", IIC_LdStLoad, []>,
+                             Requires<[IsPPC4xx]>;
+def TLBSX2D : XForm_base_r3xo<31, 914, (outs),
+                              (ins gprc:$RST, gprc:$A, gprc:$B),
+                              "tlbsx. $RST, $A, $B", IIC_LdStLoad, []>,
+                              Requires<[IsPPC4xx]>, isDOT;
+
+def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>;
+
+def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>,
+                  Requires<[IsBookE]>;
+def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>,
+                   Requires<[IsBookE]>;
+
+def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>,
+                   Requires<[IsE500]>;
+def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>,
+                    Requires<[IsE500]>;
+
+def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
+                      "mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>;
+def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
+                      "mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;
+
+def HRFID : XLForm_1_np<19, 274, (outs), (ins), "hrfid", IIC_BrB, []>;
+def NAP   : XLForm_1_np<19, 434, (outs), (ins), "nap", IIC_BrB, []>;
+
+def ATTN : XForm_attn<0, 256, (outs), (ins), "attn", IIC_BrB>;
+
+def LBZCIX : XForm_base_r3xo<31, 853, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lbzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LHZCIX : XForm_base_r3xo<31, 821, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lhzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LWZCIX : XForm_base_r3xo<31, 789, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "lwzcix $RST, $A, $B", IIC_LdStLoad, []>;
+def LDCIX :  XForm_base_r3xo<31, 885, (outs gprc:$RST), (ins gprc:$A, gprc:$B),
+                             "ldcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+def STBCIX : XForm_base_r3xo<31, 981, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stbcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STHCIX : XForm_base_r3xo<31, 949, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "sthcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STWCIX : XForm_base_r3xo<31, 917, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stwcix $RST, $A, $B", IIC_LdStLoad, []>;
+def STDCIX : XForm_base_r3xo<31, 1013, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "stdcix $RST, $A, $B", IIC_LdStLoad, []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Assembler Instruction Aliases
+//
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class PPCAsmPseudo<string asm, dag iops>
+  : Instruction {
+  let Namespace = "PPC";
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let OutOperandList = (outs);
+  let InOperandList = iops;
+  let Pattern = [];
+  let AsmString = asm;
+  let isAsmParserOnly = 1;
+  let isPseudo = 1;
+}
+
+def : InstAlias<"sc", (SC 0)>;
+
+def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>;
+def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>;
+def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>;
+def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>;
+
+def : InstAlias<"wait", (WAIT 0)>;
+def : InstAlias<"waitrsv", (WAIT 1)>;
+def : InstAlias<"waitimpl", (WAIT 2)>;
+
+def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>;
+
+def DCBTx   : PPCAsmPseudo<"dcbt $dst", (ins memrr:$dst)>;
+def DCBTSTx : PPCAsmPseudo<"dcbtst $dst", (ins memrr:$dst)>;
+
+def DCBTCT : PPCAsmPseudo<"dcbtct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTDS : PPCAsmPseudo<"dcbtds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTT  : PPCAsmPseudo<"dcbtt $dst", (ins memrr:$dst)>;
+
+def DCBTSTCT : PPCAsmPseudo<"dcbtstct $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTSTDS : PPCAsmPseudo<"dcbtstds $dst, $TH", (ins memrr:$dst, u5imm:$TH)>;
+def DCBTSTT  : PPCAsmPseudo<"dcbtstt $dst", (ins memrr:$dst)>;
+
+def DCBFx  : PPCAsmPseudo<"dcbf $dst", (ins memrr:$dst)>;
+def DCBFL  : PPCAsmPseudo<"dcbfl $dst", (ins memrr:$dst)>;
+def DCBFLP : PPCAsmPseudo<"dcbflp $dst", (ins memrr:$dst)>;
+
+def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+
+def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+
+def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
+def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
+
+def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;
+def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;
+
+def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>;
+def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>;
+
+def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>;
+def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>;
+
+def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>;
+def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>;
+
+def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>;
+def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>;
+
+def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>;
+def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;
+
+def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;
+def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;
+
+def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
+def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;
+
+def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>;
+def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
+
+def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
+def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
+
+def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
+
+def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
+def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;
+
+def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+
+foreach BATR = 0-3 in {
+    def : InstAlias<"mtdbatu "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 536)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfdbatu $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 536)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtdbatl "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 537)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfdbatl $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 537)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtibatu "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 528)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfibatu $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 528)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtibatl "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 529)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfibatl $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 529)))>,
+                    Requires<[IsPPC6xx]>;
+}
+
+foreach BR = 0-7 in {
+    def : InstAlias<"mfbr"#BR#" $Rx",
+                    (MFDCR gprc:$Rx, !add(BR, 0x80))>,
+                    Requires<[IsPPC4xx]>;
+    def : InstAlias<"mtbr"#BR#" $Rx",
+                    (MTDCR gprc:$Rx, !add(BR, 0x80))>,
+                    Requires<[IsPPC4xx]>;
+}
+
+def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
+def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;
+
+def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>;
+
+def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
+
+def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
+                        (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
+                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
+def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
+
+foreach SPRG = 0-3 in {
+  def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
+  def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+}
+foreach SPRG = 4-7 in {
+  def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+                  Requires<[IsBookE]>;
+}
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
+def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbrelo $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 1)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>,
+                Requires<[IsPPC4xx]>;
+
+def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
+                        (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
+                        (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
+                            (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
+                             (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+
+def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+
+def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>;
+def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>;
+// The POWER variant
+def : MnemonicAlias<"cntlz",  "cntlzw">;
+def : MnemonicAlias<"cntlz.", "cntlzw.">;
+
+def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
+                        (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
+                        (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
+                            (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
+                             (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+
+def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+
+def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
+                            (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b",
+                            (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWIMIbm : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWIMIobm : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b",
+                            (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWNMbm : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>;
+
+// These generic branch instruction forms are used for the assembler parser only.
+// Defs and Uses are conservative, since we don't know the BO value.
+let PPC970_Unit = 7 in {
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def gBC : BForm_3<16, 0, 0, (outs),
+                      (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                      "bc $bo, $bi, $dst">;
+    def gBCA : BForm_3<16, 1, 0, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                       "bca $bo, $bi, $dst">;
+    let isAsmParserOnly = 1 in {
+      def gBCat : BForm_3_at<16, 0, 0, (outs),
+                             (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+                                  condbrtarget:$dst),
+                                  "bc$at $bo, $bi, $dst">;
+      def gBCAat : BForm_3_at<16, 1, 0, (outs),
+                              (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+                                   abscondbrtarget:$dst),
+                                   "bca$at $bo, $bi, $dst">;
+    } // isAsmParserOnly = 1
+  }
+  let Defs = [LR, CTR], Uses = [CTR, RM] in {
+    def gBCL : BForm_3<16, 0, 1, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                       "bcl $bo, $bi, $dst">;
+    def gBCLA : BForm_3<16, 1, 1, (outs),
+                        (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                        "bcla $bo, $bi, $dst">;
+    let isAsmParserOnly = 1 in {
+      def gBCLat : BForm_3_at<16, 0, 1, (outs),
+                         (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+                              condbrtarget:$dst),
+                              "bcl$at $bo, $bi, $dst">;
+      def gBCLAat : BForm_3_at<16, 1, 1, (outs),
+                          (ins u5imm:$bo, atimm:$at, crbitrc:$bi,
+                               abscondbrtarget:$dst),
+                               "bcla$at $bo, $bi, $dst">;
+    } // // isAsmParserOnly = 1
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCLR : XLForm_2<19, 16, 0, (outs),
+                         (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                         "bclr $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCLRL : XLForm_2<19, 16, 1, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bclrl $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCCTR : XLForm_2<19, 528, 0, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bcctr $bo, $bi, $bh", IIC_BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCCTRL : XLForm_2<19, 528, 1, (outs),
+                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                           "bcctrl $bo, $bi, $bh", IIC_BrB, []>;
+}
+
+multiclass BranchSimpleMnemonicAT<string pm, int at> {
+  def : InstAlias<"bc"#pm#" $bo, $bi, $dst", (gBCat u5imm:$bo, at, crbitrc:$bi,
+                                                    condbrtarget:$dst)>;
+  def : InstAlias<"bca"#pm#" $bo, $bi, $dst", (gBCAat u5imm:$bo, at, crbitrc:$bi,
+                                                      condbrtarget:$dst)>;
+  def : InstAlias<"bcl"#pm#" $bo, $bi, $dst", (gBCLat u5imm:$bo, at, crbitrc:$bi,
+                                                      condbrtarget:$dst)>;
+  def : InstAlias<"bcla"#pm#" $bo, $bi, $dst", (gBCLAat u5imm:$bo, at, crbitrc:$bi,
+                                                        condbrtarget:$dst)>;
+}
+defm : BranchSimpleMnemonicAT<"+", 3>;
+defm : BranchSimpleMnemonicAT<"-", 2>;
+
+def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;
+
+multiclass BranchSimpleMnemonic1<string name, string pm, int bo> {
+  def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>;
+}
+multiclass BranchSimpleMnemonic2<string name, string pm, int bo>
+  : BranchSimpleMnemonic1<name, pm, bo> {
+  def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>;
+}
+defm : BranchSimpleMnemonic2<"t", "", 12>;
+defm : BranchSimpleMnemonic2<"f", "", 4>;
+defm : BranchSimpleMnemonic2<"t", "-", 14>;
+defm : BranchSimpleMnemonic2<"f", "-", 6>;
+defm : BranchSimpleMnemonic2<"t", "+", 15>;
+defm : BranchSimpleMnemonic2<"f", "+", 7>;
+defm : BranchSimpleMnemonic1<"dnzt", "", 8>;
+defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
+defm : BranchSimpleMnemonic1<"dzt", "", 10>;
+defm : BranchSimpleMnemonic1<"dzf", "", 2>;
+
+multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
+  def : InstAlias<"b"#name#pm#" $cc, $dst",
+                  (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#pm#" $dst",
+                  (BCC bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
+                  (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $dst",
+                  (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lr"#pm#" $cc",
+                  (BCCLR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lr"#pm,
+                  (BCCLR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctr"#pm#" $cc",
+                  (BCCCTR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctr"#pm,
+                  (BCCCTR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
+                  (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"l"#pm#" $dst",
+                  (BCCL bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
+                  (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $dst",
+                  (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lrl"#pm#" $cc",
+                  (BCCLRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lrl"#pm,
+                  (BCCLRL bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
+                  (BCCCTRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctrl"#pm,
+                  (BCCCTRL bibo, CR0)>;
+}
+multiclass BranchExtendedMnemonic<string name, int bibo> {
+  defm : BranchExtendedMnemonicPM<name, "", bibo>;
+  defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>;
+  defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>;
+}
+defm : BranchExtendedMnemonic<"lt", 12>;
+defm : BranchExtendedMnemonic<"gt", 44>;
+defm : BranchExtendedMnemonic<"eq", 76>;
+defm : BranchExtendedMnemonic<"un", 108>;
+defm : BranchExtendedMnemonic<"so", 108>;
+defm : BranchExtendedMnemonic<"ge", 4>;
+defm : BranchExtendedMnemonic<"nl", 4>;
+defm : BranchExtendedMnemonic<"le", 36>;
+defm : BranchExtendedMnemonic<"ng", 36>;
+defm : BranchExtendedMnemonic<"ne", 68>;
+defm : BranchExtendedMnemonic<"nu", 100>;
+defm : BranchExtendedMnemonic<"ns", 100>;
+
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
+def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>;
+def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
+def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+
+multiclass TrapExtendedMnemonic<string name, int to> {
+  def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
+  def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
+  def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>;
+  def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
+}
+defm : TrapExtendedMnemonic<"lt", 16>;
+defm : TrapExtendedMnemonic<"le", 20>;
+defm : TrapExtendedMnemonic<"eq", 4>;
+defm : TrapExtendedMnemonic<"ge", 12>;
+defm : TrapExtendedMnemonic<"gt", 8>;
+defm : TrapExtendedMnemonic<"nl", 12>;
+defm : TrapExtendedMnemonic<"ne", 24>;
+defm : TrapExtendedMnemonic<"ng", 20>;
+defm : TrapExtendedMnemonic<"llt", 2>;
+defm : TrapExtendedMnemonic<"lle", 6>;
+defm : TrapExtendedMnemonic<"lge", 5>;
+defm : TrapExtendedMnemonic<"lgt", 1>;
+defm : TrapExtendedMnemonic<"lnl", 5>;
+defm : TrapExtendedMnemonic<"lng", 6>;
+defm : TrapExtendedMnemonic<"u", 31>;
+
+// Atomic loads
+def : Pat<(atomic_load_8  iaddr:$src), (LBZ  memri:$src)>;
+def : Pat<(atomic_load_16 iaddr:$src), (LHZ  memri:$src)>;
+def : Pat<(atomic_load_32 iaddr:$src), (LWZ  memri:$src)>;
+def : Pat<(atomic_load_8  xaddr:$src), (LBZX memrr:$src)>;
+def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
+def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;
+
+// Atomic stores
+def : Pat<(atomic_store_8  iaddr:$ptr, i32:$val), (STB  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_8  xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;
+
+let Predicates = [IsISA3_0] in {
+
+// Copy-Paste Facility
+// We prefix 'CP' to COPY due to name conflict in Target.td. We also prefix to
+// PASTE for naming consistency.
+let mayLoad = 1 in
+def CP_COPY   : X_L1_RA5_RB5<31, 774, "copy"  , gprc, IIC_LdStCOPY, []>;
+
+let mayStore = 1 in
+def CP_PASTE  : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>;
+
+let mayStore = 1, Defs = [CR0] in
+def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT;
+
+def CP_COPYx  : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>;
+def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>;
+def CP_COPY_FIRST : PPCAsmPseudo<"copy_first $rA, $rB",
+                                  (ins gprc:$rA, gprc:$rB)>;
+def CP_PASTE_LAST : PPCAsmPseudo<"paste_last $rA, $rB",
+                                  (ins gprc:$rA, gprc:$rB)>;
+def CP_ABORT : XForm_0<31, 838, (outs), (ins), "cp_abort", IIC_SprABORT, []>;
+
+// Message Synchronize
+def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
+
+// Power-Saving Mode Instruction:
+def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
+
+} // IsISA3_0
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
new file mode 100644
index 000000000000..4940c77c7ae5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td
@@ -0,0 +1,1216 @@
+//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the QPX extension to the PowerPC instruction set.
+// Reference:
+// Book Q: QPX Architecture Definition. IBM (as updated in) 2011.
+//
+//===----------------------------------------------------------------------===//
+
+def PPCRegQFRCAsmOperand : AsmOperandClass {
+  let Name = "RegQFRC"; let PredicateMethod = "isRegNumber";
+}
+def qfrc : RegisterOperand<QFRC> {
+  let ParserMatchClass = PPCRegQFRCAsmOperand;
+}
+def PPCRegQSRCAsmOperand : AsmOperandClass {
+  let Name = "RegQSRC"; let PredicateMethod = "isRegNumber";
+}
+def qsrc : RegisterOperand<QSRC> {
+  let ParserMatchClass = PPCRegQSRCAsmOperand;
+}
+def PPCRegQBRCAsmOperand : AsmOperandClass {
+  let Name = "RegQBRC"; let PredicateMethod = "isRegNumber";
+}
+def qbrc : RegisterOperand<QBRC> {
+  let ParserMatchClass = PPCRegQBRCAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// QPXA1_Int - A AForm_1 intrinsic definition.
+class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions).
+class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+// QPXA2_Int - A AForm_2 intrinsic definition.
+class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXA3_Int - A AForm_3 intrinsic definition.
+class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+              !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>;
+// QPXA4_Int - A AForm_4a intrinsic definition.
+class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID>
+  : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+// QPXX18_Int - A XForm_18 intrinsic definition.
+class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>;
+// QPXX19_Int - A XForm_19 intrinsic definition.
+class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID>
+  : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB),
+              !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral,
+                       [(set v4f64:$FRT, (IntID v4f64:$FRB))]>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Frags.
+
+def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                               (pre_truncst node:$val,
+                                            node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0;
+}]>;
+
+def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1;
+}]>;
+
+let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
+  def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasQPX : Predicate<"PPCSubTarget->hasQPX()">;
+let Predicates = [HasQPX] in {
+let DecoderNamespace = "QPX" in {
+let hasSideEffects = 0 in { // QPX instructions don't have side effects.
+let Uses = [RM] in {
+  // Add Instructions
+  let isCommutable = 1 in {
+    def QVFADD : AForm_2<4, 21,
+                        (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                        "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>;
+    let isCodeGenOnly = 1 in
+      def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>;
+    def QVFADDSs : AForm_2<0, 21,
+                          (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                          "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral,
+                          [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>;
+  }
+  def QVFSUB : AForm_2<4, 20,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                      "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral,
+                      [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>;
+  def QVFSUBSs : AForm_2<0, 20,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                        "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral,
+                        [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>;
+
+  // Estimate Instructions
+  def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                       "qvfre $FRT, $FRB", IIC_FPGeneral,
+                       [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>;
+  def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>;
+  let isCodeGenOnly = 1 in
+  def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfres $FRT, $FRB", IIC_FPGeneral,
+                         [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>;
+
+  def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                           "qvfrsqrte $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>;
+  def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>;
+  let isCodeGenOnly = 1 in
+  def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                             "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral,
+                             [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>;
+
+  // Multiply Instructions
+  let isCommutable = 1 in {
+    def QVFMUL : AForm_3<4, 25,
+                        (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC),
+                        "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>;
+    let isCodeGenOnly = 1 in
+      def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>;
+    def QVFMULSs : AForm_3<0, 25,
+                          (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC),
+                          "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral,
+                          [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>;
+  }
+  def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>;
+  def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>;
+
+  // Multiply-add instructions
+  def QVFMADD : AForm_1<4, 29,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>;
+  def QVFMADDSs : AForm_1<0, 29,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                        [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>;
+  def QVFNMADD : AForm_1<4, 31,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+                                                   v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>;
+  def QVFNMADDSs : AForm_1<0, 31,
+                        (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                        [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+                                                     v4f32:$FRB)))]>;
+  def QVFMSUB : AForm_1<4, 28,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC,
+                                             (fneg v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>;
+  def QVFMSUBSs : AForm_1<0, 28,
+                      (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                      "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC,
+                                             (fneg v4f32:$FRB)))]>;
+  def QVFNMSUB : AForm_1<4, 30,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
+                                              (fneg v4f64:$FRB))))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>;
+  def QVFNMSUBSs : AForm_1<0, 30,
+                      (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                      "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+                      [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
+                                              (fneg v4f32:$FRB))))]>;
+  def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>;
+  def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>;
+  def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>;
+  def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>;
+  def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>;
+  def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>;
+  def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>;
+  def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>;
+
+  // Select Instruction
+  let isCodeGenOnly = 1 in
+    def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>;
+  def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT),
+                        (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4f64:$FRT, (vselect v4i1:$FRA,
+                                                   v4f64:$FRC, v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+  def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT),
+                        (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4f32:$FRT, (vselect v4i1:$FRA,
+                                                   v4f32:$FRC, v4f32:$FRB))]>;
+  let isCodeGenOnly = 1 in
+  def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT),
+                        (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC),
+                        "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm,
+                        [(set v4i1:$FRT, (vselect v4i1:$FRA,
+                                                  v4i1:$FRC, v4i1:$FRB))]>;
+
+  // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+  // instruction selection into a branch sequence.
+  let usesCustomInserter = 1 in {
+    def SELECT_CC_QFRC: Pseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QFRC",
+                                []>;
+    def SELECT_CC_QSRC: Pseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QSRC",
+                                []>;
+    def SELECT_CC_QBRC: Pseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F,
+                                i32imm:$BROPC), "#SELECT_CC_QBRC",
+                                []>;
+
+    // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition
+    // register bit directly.
+    def SELECT_QFRC: Pseudo<(outs qfrc:$dst), (ins crbitrc:$cond,
+                            qfrc:$T, qfrc:$F), "#SELECT_QFRC",
+                            [(set v4f64:$dst,
+                                  (select i1:$cond, v4f64:$T, v4f64:$F))]>;
+    def SELECT_QSRC: Pseudo<(outs qsrc:$dst), (ins crbitrc:$cond,
+                            qsrc:$T, qsrc:$F), "#SELECT_QSRC",
+                            [(set v4f32:$dst,
+                                  (select i1:$cond, v4f32:$T, v4f32:$F))]>;
+    def SELECT_QBRC: Pseudo<(outs qbrc:$dst), (ins crbitrc:$cond,
+                            qbrc:$T, qbrc:$F), "#SELECT_QBRC",
+                            [(set v4i1:$dst,
+                                  (select i1:$cond, v4i1:$T, v4i1:$F))]>;
+  }
+
+  // Convert and Round Instructions
+  def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>;
+  let isCodeGenOnly = 1 in
+    def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB),
+                            "qvfctid $FRT, $FRB", IIC_FPGeneral, []>;
+
+  def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>;
+  def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>;
+  def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>;
+  def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>;
+  def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>;
+  def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>;
+  def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>;
+  def QVFCFID : QPXX19_Int<4, 846, "qvfcfid", int_ppc_qpx_qvfcfid>;
+  let isCodeGenOnly = 1 in
+    def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB),
+                            "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>;
+
+  def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>;
+  def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>;
+  def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>;
+
+  let isCodeGenOnly = 1 in
+    def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>;
+  def QVFRSPs : XForm_19<4, 12,
+                      (outs qsrc:$FRT), (ins qfrc:$FRB),
+                      "qvfrsp $FRT, $FRB", IIC_FPGeneral,
+                      [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>;
+
+  def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfriz $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfriz $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>;
+
+  def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrin $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fround v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrin $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (fround v4f32:$FRB))]>;
+
+  def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrip $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (fceil v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrip $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (fceil v4f32:$FRB))]>;
+
+  def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB),
+                        "qvfrim $FRT, $FRB", IIC_FPGeneral,
+                        [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB),
+                           "qvfrim $FRT, $FRB", IIC_FPGeneral,
+                           [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>;
+
+  // Move Instructions
+  def QVFMR : XForm_19<4, 72,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfmr $FRT, $FRB", IIC_VecPerm,
+                      [/* (set v4f64:$FRT, v4f64:$FRB) */]>;
+  let isCodeGenOnly = 1 in {
+    def QVFMRs : XForm_19<4, 72,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfmr $FRT, $FRB", IIC_VecPerm,
+                         [/* (set v4f32:$FRT, v4f32:$FRB) */]>;
+    def QVFMRb : XForm_19<4, 72,
+                         (outs qbrc:$FRT), (ins qbrc:$FRB),
+                         "qvfmr $FRT, $FRB", IIC_VecPerm,
+                         [/* (set v4i1:$FRT, v4i1:$FRB) */]>;
+  }
+  def QVFNEG : XForm_19<4, 40,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfneg $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fneg v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNEGs : XForm_19<4, 40,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfneg $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fneg v4f32:$FRB))]>;
+  def QVFABS : XForm_19<4, 264,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfabs $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fabs v4f64:$FRB))]>;
+  let isCodeGenOnly = 1 in
+    def QVFABSs : XForm_19<4, 264,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfabs $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fabs v4f32:$FRB))]>;
+  def QVFNABS : XForm_19<4, 136,
+                      (outs qfrc:$FRT), (ins qfrc:$FRB),
+                      "qvfnabs $FRT, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>;
+  let isCodeGenOnly = 1 in
+    def QVFNABSs : XForm_19<4, 136,
+                         (outs qsrc:$FRT), (ins qsrc:$FRB),
+                         "qvfnabs $FRT, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>;
+  def QVFCPSGN : XForm_18<4, 8,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                      "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+                      [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCPSGNs : XForm_18<4, 8,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                         "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm,
+                         [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>;
+
+  def QVALIGNI : Z23Form_1<4, 5,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx),
+                      "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvaligni v4f64:$FRA, v4f64:$FRB,
+                                         (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVALIGNIs : Z23Form_1<4, 5,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx),
+                         "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvaligni v4f32:$FRA, v4f32:$FRB,
+                                            (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVALIGNIb : Z23Form_1<4, 5,
+                         (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx),
+                         "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm,
+                         [(set v4i1:$FRT,
+                               (PPCqvaligni v4i1:$FRA, v4i1:$FRB,
+                                            (i32 imm:$idx)))]>;
+
+  def QVESPLATI : Z23Form_2<4, 37,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx),
+                      "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVESPLATIs : Z23Form_2<4, 37,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx),
+                         "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>;
+  let isCodeGenOnly = 1 in
+     def QVESPLATIb : Z23Form_2<4, 37,
+                         (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx),
+                         "qvesplati $FRT, $FRA, $idx", IIC_VecPerm,
+                         [(set v4i1:$FRT,
+                               (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>;
+
+  def QVFPERM : AForm_1<4, 6,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+                      "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+                      [(set v4f64:$FRT,
+                            (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>;
+  let isCodeGenOnly = 1 in
+     def QVFPERMs : AForm_1<4, 6,
+                         (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC),
+                         "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm,
+                         [(set v4f32:$FRT,
+                               (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>;
+
+  let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+  def QVGPCI : Z23Form_3<4, 133,
+                      (outs qfrc:$FRT), (ins u12imm:$idx),
+                      "qvgpci $FRT, $idx", IIC_VecPerm,
+                      [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>;
+
+  // Compare Instruction
+  let isCodeGenOnly = 1 in
+    def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>;
+  def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>;
+  let isCodeGenOnly = 1 in
+  def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>;
+  def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>;
+  def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>;
+  let isCodeGenOnly = 1 in
+    def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>;
+  def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB),
+                           "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+                           [(set v4i1:$FRT,
+                                 (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>;
+  let isCodeGenOnly = 1 in
+  def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB),
+                            "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare,
+                            [(set v4i1:$FRT,
+                                  (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>;
+
+  let isCodeGenOnly = 1 in
+  def QVFLOGICAL : XForm_20<4, 4,
+                      (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+  def QVFLOGICALb : XForm_20<4, 4,
+                      (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+  let isCodeGenOnly = 1 in
+  def QVFLOGICALs : XForm_20<4, 4,
+                      (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt),
+                      "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>;
+
+  // Load indexed instructions
+  let mayLoad = 1 in {
+    def QVLFDX : XForm_1<31, 583,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfdx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f64:$FRT, (load xoaddr:$src))]>;
+    let isCodeGenOnly = 1 in
+    def QVLFDXb : XForm_1<31, 583,
+                        (outs qbrc:$FRT), (ins memrr:$src),
+                        "qvlfdx $FRT, $src", IIC_LdStLFD, []>;
+
+    let RC = 1 in
+    def QVLFDXA : XForm_1<31, 583,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfdxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFDUX : XForm_1<31, 615,
+                        (outs qfrc:$FRT, ptr_rc_nor0:$ea_result),
+                        (ins memrr:$src),
+                        "qvlfdux $FRT, $src", IIC_LdStLFDU, []>,
+                        RegConstraint<"$src.ptrreg = $ea_result">,
+                        NoEncode<"$ea_result">;
+    let RC = 1 in
+    def QVLFDUXA : XForm_1<31, 615,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfduxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFSX : XForm_1<31, 519,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>;
+
+    let isCodeGenOnly = 1 in
+    def QVLFSXb : XForm_1<31, 519,
+                        (outs qbrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>;
+    let isCodeGenOnly = 1 in
+    def QVLFSXs : XForm_1<31, 519,
+                        (outs qsrc:$FRT), (ins memrr:$src),
+                        "qvlfsx $FRT, $src", IIC_LdStLFD,
+                        [(set v4f32:$FRT, (load xoaddr:$src))]>;
+
+    let RC = 1 in
+    def QVLFSXA : XForm_1<31, 519,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFSUX : XForm_1<31, 551,
+                        (outs qsrc:$FRT, ptr_rc_nor0:$ea_result),
+                        (ins memrr:$src),
+                        "qvlfsux $FRT, $src", IIC_LdStLFDU, []>,
+                        RegConstraint<"$src.ptrreg = $ea_result">,
+                        NoEncode<"$ea_result">;
+
+    let RC = 1 in
+    def QVLFSUXA : XForm_1<31, 551,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCDX : XForm_1<31, 71,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdx $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCDXA : XForm_1<31, 71,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCDUX : XForm_1<31, 103,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcdux $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCDUXA : XForm_1<31, 103,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCSX : XForm_1<31, 7,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+    let isCodeGenOnly = 1 in
+    def QVLFCSXs : XForm_1<31, 7,
+                         (outs qsrc:$FRT), (ins memrr:$src),
+                         "qvlfcsx $FRT, $src", IIC_LdStLFD, []>;
+
+    let RC = 1 in
+    def QVLFCSXA : XForm_1<31, 7,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFCSUX : XForm_1<31, 39,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsux $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFCSUXA : XForm_1<31, 39,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFIWAX : XForm_1<31, 871,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwax $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFIWAXA : XForm_1<31, 871,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>;
+
+    def QVLFIWZX : XForm_1<31, 839,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>;
+    let RC = 1 in
+    def QVLFIWZXA : XForm_1<31, 839,
+                        (outs qfrc:$FRT), (ins memrr:$src),
+                        "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>;
+  }
+
+
+  def QVLPCLDX : XForm_1<31, 582,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcldx $FRT, $src", IIC_LdStLFD, []>;
+  def QVLPCLSX : XForm_1<31, 518,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpclsx $FRT, $src", IIC_LdStLFD, []>;
+  let isCodeGenOnly = 1 in
+    def QVLPCLSXint : XForm_11<31, 518,
+                              (outs qfrc:$FRT), (ins G8RC:$src),
+                              "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>;
+  def QVLPCRDX : XForm_1<31, 70,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>;
+  def QVLPCRSX : XForm_1<31, 6,
+                      (outs qfrc:$FRT), (ins memrr:$src),
+                      "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>;
+
+  // Store indexed instructions
+  let mayStore = 1 in {
+    def QVSTFDX : XForm_8<31, 711,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdx $FRT, $dst", IIC_LdStSTFD,
+                        [(store qfrc:$FRT, xoaddr:$dst)]>;
+    let isCodeGenOnly = 1 in
+    def QVSTFDXb : XForm_8<31, 711,
+                        (outs), (ins qbrc:$FRT, memrr:$dst),
+                        "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>;
+
+    let RC = 1 in
+    def QVSTFDXA : XForm_8<31, 711,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res),
+                           (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+
+    let RC = 1 in
+    def QVSTFDUXA : XForm_8<31, 743,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDXI : XForm_8<31, 709,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFDXIA : XForm_8<31, 709,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFDUXI : XForm_8<31, 741,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFDUXIA : XForm_8<31, 741,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSX : XForm_8<31, 647,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+                        [(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>;
+    let isCodeGenOnly = 1 in
+    def QVSTFSXs : XForm_8<31, 647,
+                         (outs), (ins qsrc:$FRT, memrr:$dst),
+                         "qvstfsx $FRT, $dst", IIC_LdStSTFD,
+                         [(store qsrc:$FRT, xoaddr:$dst)]>;
+
+    let RC = 1 in
+    def QVSTFSXA : XForm_8<31, 647,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+                           (ins qsrc:$FRT, memrr:$dst),
+                           "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+    let isCodeGenOnly = 1 in
+    def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res),
+                           (ins qfrc:$FRT, memrr:$dst),
+                           "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>,
+                           RegConstraint<"$dst.ptrreg = $ea_res">,
+                           NoEncode<"$ea_res">;
+
+    let RC = 1 in
+    def QVSTFSUXA : XForm_8<31, 679,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSXI : XForm_8<31, 645,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFSXIA : XForm_8<31, 645,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFSUXI : XForm_8<31, 677,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFSUXIA : XForm_8<31, 677,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDX : XForm_8<31, 199,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDXA : XForm_8<31, 199,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSX : XForm_8<31, 135,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+    let isCodeGenOnly = 1 in
+    def QVSTFCSXs : XForm_8<31, 135,
+                         (outs), (ins qsrc:$FRT, memrr:$dst),
+                         "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>;
+
+    let RC = 1 in
+    def QVSTFCSXA : XForm_8<31, 135,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDUX : XForm_8<31, 231,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDUXA : XForm_8<31, 231,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSUX : XForm_8<31, 167,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSUXA : XForm_8<31, 167,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDXI : XForm_8<31, 197,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDXIA : XForm_8<31, 197,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSXI : XForm_8<31, 133,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSXIA : XForm_8<31, 133,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCDUXI : XForm_8<31, 229,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCDUXIA : XForm_8<31, 229,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFCSUXI : XForm_8<31, 165,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFCSUXIA : XForm_8<31, 165,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>;
+
+    def QVSTFIWX : XForm_8<31, 967,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>;
+    let RC = 1 in
+    def QVSTFIWXA : XForm_8<31, 967,
+                        (outs), (ins qfrc:$FRT, memrr:$dst),
+                        "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>;
+  }
+}
+
+} // neverHasSideEffects
+}
+
+def : InstAlias<"qvfclr $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>;
+def : InstAlias<"qvfand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>;
+def : InstAlias<"qvfandc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>;
+def : InstAlias<"qvfctfb $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>;
+def : InstAlias<"qvfxor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>;
+def : InstAlias<"qvfor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>;
+def : InstAlias<"qvfnor $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>;
+def : InstAlias<"qvfequ $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>;
+def : InstAlias<"qvfnot $FRT, $FRA",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>;
+def : InstAlias<"qvforc $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>;
+def : InstAlias<"qvfnand $FRT, $FRA, $FRB",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>;
+def : InstAlias<"qvfset $FRT",
+                (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>;
+
+//===----------------------------------------------------------------------===//
+// Additional QPX Patterns
+//
+
+def : Pat<(v4f64 (scalar_to_vector f64:$A)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>;
+def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 0)),
+          (EXTRACT_SUBREG $S, sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>;
+def : Pat<(f64 (extractelt v4f64:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>;
+
+def : Pat<(f32 (extractelt v4f32:$S, 1)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 2)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, 3)),
+          (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>;
+
+def : Pat<(f64 (extractelt v4f64:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERM $S, $S,
+                                   (QVLPCLSXint (RLDICR $F, 2,
+                                                        /* 63-2 = */ 61))),
+                          sub_64)>;
+def : Pat<(f32 (extractelt v4f32:$S, i64:$F)),
+          (EXTRACT_SUBREG (QVFPERMs $S, $S,
+                                    (QVLPCLSXint (RLDICR $F, 2,
+                                                         /* 63-2 = */ 61))),
+                          sub_64)>;
+
+def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFPERM $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B),
+          (QVFCPSGN $A, $B)>;
+
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign v4f64:$frB, v4f32:$frA),
+          (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>;
+def : Pat<(fcopysign QSRC:$frB, QFRC:$frA),
+          (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>;
+
+def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>;
+def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>;
+def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>;
+
+def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>;
+def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>;
+def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>;
+def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>;
+
+def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>;
+def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>;
+
+def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B),
+          (QVFADD $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B),
+          (QVFSUB $A, $B)>;
+def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
+          (QVFMUL $A, $B)>;
+
+// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
+          (QVFNMSUB $A, $B, $C)>;
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+          (QVFNMSUBSs $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMADD $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFMSUB $A, $B, $C)>;
+def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C),
+          (QVFNMSUB $A, $B, $C)>;
+
+def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src),
+          (QVLFDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+          (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src),
+          (QVLFSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+          (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src),
+          (QVLFCDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src),
+          (QVLFCDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src),
+          (QVLFCSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src),
+          (QVLFCSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src),
+          (QVLFDXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src),
+          (QVLFIWAXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src),
+          (QVLFIWAX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src),
+          (QVLFIWZXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src),
+          (QVLFIWZX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src),
+          (QVLFSXA xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src),
+          (QVLPCLDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src),
+          (QVLPCLSX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src),
+          (QVLPCRDX xoaddr:$src)>;
+def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src),
+          (QVLPCRSX xoaddr:$src)>;
+
+def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst),
+          (QVSTFDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst),
+          (QVSTFSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst),
+          (QVSTFCDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst),
+          (QVSTFCDX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst),
+          (QVSTFCSXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst),
+          (QVSTFCSX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst),
+          (QVSTFDXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst),
+          (QVSTFIWXA $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst),
+          (QVSTFIWX $T, xoaddr:$dst)>;
+def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst),
+          (QVSTFSXA $T, xoaddr:$dst)>;
+
+def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFDUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (QVSTFSUXs $rS, $ptrreg, $ptroff)>;
+
+def : Pat<(int_ppc_qpx_qvflogical  v4f64:$A, v4f64:$B, (i32 imm:$idx)),
+          (QVFLOGICAL $A, $B, imm:$idx)>;
+def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)),
+          (QVGPCI imm:$idx)>;
+
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPLTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFTSTNANb $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPGTb $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE),
+          (QVFLOGICALb (QVFTSTNANb $FRA, $FRB),
+                       (QVFCMPEQb $FRA, $FRB), (i32 13))>;
+
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ),
+          (QVFCMPEQb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT),
+          (QVFCMPGTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE),
+          (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                       (QVFCMPLTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT),
+          (QVFCMPLTb $FRA, $FRB)>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE),
+          (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                       (QVFCMPGTb $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE),
+          (QVFLOGICALb (QVFCMPEQb $FRA, $FRB),
+                       (QVFCMPEQb $FRA, $FRB), (i32 10))>;
+
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 8))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPLTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFTSTNANbs $FRA, $FRB), (i32 7))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPGTbs $FRA, $FRB), (i32 13))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE),
+          (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB),
+                       (QVFCMPEQbs $FRA, $FRB), (i32 13))>;
+
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ),
+          (QVFCMPEQbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT),
+          (QVFCMPGTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE),
+          (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                       (QVFCMPLTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT),
+          (QVFCMPLTbs $FRA, $FRB)>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE),
+          (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                       (QVFCMPGTbs $FRA, $FRB), (i32 10))>;
+def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE),
+          (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB),
+                       (QVFCMPEQbs $FRA, $FRB), (i32 10))>;
+
+def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 4))>;
+def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 8))>;
+def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 9))>;
+def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 13))>;
+def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)),
+          (QVFLOGICALb $FRA, $FRB, (i32 14))>;
+
+def : Pat<(and v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 1))>;
+def : Pat<(or v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 7))>;
+def : Pat<(xor v4i1:$FRA, v4i1:$FRB),
+          (QVFLOGICALb $FRA, $FRB, (i32 6))>;
+def : Pat<(not v4i1:$FRA),
+          (QVFLOGICALb $FRA, $FRA, (i32 10))>;
+
+def : Pat<(v4f64 (fpextend v4f32:$src)),
+          (COPY_TO_REGCLASS $src, QFRC)>;
+
+def : Pat<(v4f32 (fround_exact v4f64:$src)),
+          (COPY_TO_REGCLASS $src, QSRC)>;
+
+// Extract the underlying floating-point values from the
+// QPX (-1.0, 1.0) boolean representation.
+def : Pat<(v4f64 (PPCqbflt v4i1:$src)),
+          (COPY_TO_REGCLASS $src, QFRC)>;
+
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)),
+          (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)),
+          (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)),
+          (SELECT_QFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)),
+          (SELECT_QFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)),
+          (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)),
+          (SELECT_QFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)),
+          (SELECT_QFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)),
+          (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)),
+          (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)),
+          (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)),
+          (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)),
+          (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)),
+          (SELECT_QSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)),
+          (SELECT_QSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETEQ)),
+          (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)),
+          (SELECT_QSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)),
+          (SELECT_QSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)),
+          (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)),
+          (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)),
+          (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)),
+          (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)),
+          (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)),
+          (SELECT_QBRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)),
+          (SELECT_QBRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)),
+          (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)),
+          (SELECT_QBRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)),
+          (SELECT_QBRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)),
+          (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)),
+          (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)),
+          (SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+} // end HasQPX
+
+let Predicates = [HasQPX, NoNaNsFPMath] in {
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>;
+}
+
+let Predicates = [HasQPX, NaNsFPMath] in {
+// When either of these operands is NaN, we should return the other operand.
+// QVFCMPLT/QVFCMPGT return false is either operand is NaN, which means we need
+// to explicitly or with a NaN test on the second operand.
+def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB),
+          (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB),
+                                (QVFTSTNANb $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+
+def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
+          (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB),
+                                 (QVFTSTNANbs $FRB, $FRB), (i32 7)),
+                   $FRB, $FRA)>;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
new file mode 100644
index 000000000000..cc3a4d20a9b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -0,0 +1,447 @@
+//=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Signal Processing Engine extension to
+// the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+  
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+  let RB = 0;
+}
+
+class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<3> crD;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+  
+  let Inst{6-8}  = crD;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<21> D;
+
+  let Pattern = [];
+
+  let Inst{6-10}  = RT;
+  let Inst{20} = D{0};
+  let Inst{19} = D{1};
+  let Inst{18} = D{2};
+  let Inst{17} = D{3};
+  let Inst{16} = D{4};
+  let Inst{15} = D{5};
+  let Inst{14} = D{6};
+  let Inst{13} = D{7};
+  let Inst{12} = D{8};
+  let Inst{11} = D{9};
+  let Inst{11-20} = D{0-9};
+  let Inst{21-31} = xo;
+}
+
+let Predicates = [HasSPE], isAsmParserOnly = 1 in {
+
+def EVLDD          : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldd $RT, $dst", IIC_VecFP>;
+def EVLDW          : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldw $RT, $dst", IIC_VecFP>;
+def EVLDH          : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldh $RT, $dst", IIC_VecFP>;
+def EVLHHESPLAT    : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhesplat $RT, $dst", IIC_VecFP>;
+def EVLHHOUSPLAT   : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhousplat $RT, $dst", IIC_VecFP>;
+def EVLHHOSSPLAT   : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhossplat $RT, $dst", IIC_VecFP>;
+def EVLWHE         : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhe $RT, $dst", IIC_VecFP>;
+def EVLWHOU        : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhou $RT, $dst", IIC_VecFP>;
+def EVLWHOS        : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhos $RT, $dst", IIC_VecFP>;
+def EVLWWSPLAT     : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwwsplat $RT, $dst", IIC_VecFP>;
+def EVLWHSPLAT     : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhsplat $RT, $dst", IIC_VecFP>;
+
+def EVSTDD         : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdd $RT, $dst", IIC_VecFP>;
+def EVSTDH         : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdh $RT, $dst", IIC_VecFP>;
+def EVSTDW         : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdw $RT, $dst", IIC_VecFP>;
+def EVSTWHE        : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwhe $RT, $dst", IIC_VecFP>;
+def EVSTWHO        : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwho $RT, $dst", IIC_VecFP>;
+def EVSTWWE        : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwwe $RT, $dst", IIC_VecFP>;
+def EVSTWWO        : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwwo $RT, $dst", IIC_VecFP>;
+
+def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
+                      "evmra $RT, $RA", IIC_VecFP> {
+  let RB = 0;
+}
+
+def BRINC          : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "brinc $RT, $RA, $RB", IIC_VecFP>;
+def EVABS          : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
+                               "evabs $RT, $RA", IIC_VecFP>;
+
+def EVADDIW        : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evaddiw $RT, $RB, $RA", IIC_VecFP>;
+def EVADDSMIAAW    : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddsmiaaw $RT, $RA", IIC_VecFP>;
+def EVADDSSIAAW    : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddssiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUSIAAW    : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddusiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUMIAAW    : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddumiaaw $RT, $RA", IIC_VecFP>;
+def EVADDW         : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evaddw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVAND          : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evand $RT, $RA, $RB", IIC_VecFP>;
+def EVANDC         : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evandc $RT, $RA, $RB", IIC_VecFP>;
+
+def EVCMPEQ        : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTS       : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTU       : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTS       : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmplts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTU       : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
+
+def EVCNTLSW       : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
+                               "evcntlsw $RT, $RA", IIC_VecFP>;
+def EVCNTLZW       : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
+                               "evcntlzw $RT, $RA", IIC_VecFP>;
+
+def EVDIVWS        : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evdivws $RT, $RA, $RB", IIC_VecFP>;
+def EVDIVWU        : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evdivwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEQV          : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "eveqv $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEXTSB        : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
+                               "evextsb $RT, $RA", IIC_VecFP>;
+def EVEXTSH        : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
+                               "evextsh $RT, $RA", IIC_VecFP>;
+
+def EVLDDX         : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlddx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDWX         : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evldwx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDHX         : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evldhx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHESPLATX   : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOUSPLATX  : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOSSPLATX  : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHEX        : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOUX       : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOSX       : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWWSPLATX    : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHSPLATX    : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMERGEHI      : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergehi $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELO      : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergelo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGEHILO    : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergehilo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELOHI    : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergelohi $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMHEGSMFAA    : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMFAN    : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAA    : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAN    : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAA    : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAN    : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegumian $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMHESMF       : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFA      : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFAAW    : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFANW    : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMI       : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIA      : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIAAW    : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIANW    : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSF       : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFA      : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFAAW    : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFANW    : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIAAW    : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIANW    : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMI       : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIA      : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIAAW    : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIANW    : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIAAW    : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIANW    : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAA    : EVXForm_1<1327, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAN    : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAA    : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAN    : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAA    : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAN    : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogumian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMF       : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFA      : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFAAW    : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFANW    : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMI       : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIA      : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIAAW    : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIANW    : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSF       : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFA      : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFAAW    : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFANW    : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIAAW    : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIANW    : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMI       : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIA      : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIAAW    : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIANW    : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIAAW    : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIANW    : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhousianw $RT, $RA, $RB", IIC_VecFP>;
+
+
+def EVMWHSMF       : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMFA      : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMI       : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMIA      : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSF       : EVXForm_1<1095, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSFA      : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMI       : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMIA      : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIAAW    : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIANW    : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIAAW    : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIANW    : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlssianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMI       : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIA      : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIAAW    : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIANW    : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIAAW    : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIANW    : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMF        : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFA       : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAA      : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAN      : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMI        : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIA       : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAA      : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAN      : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSF        : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFA       : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAA      : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAN      : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMI        : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIA       : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAA      : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAN      : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumian $RT, $RA, $RB", IIC_VecFP>;
+
+
+def EVNAND         : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evnand $RT, $RA, $RB", IIC_VecFP>;
+
+def EVNEG          : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA),
+                               "evneg $RT, $RA", IIC_VecFP>;
+
+def EVNOR          : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evnor $RT, $RA, $RB", IIC_VecFP>;
+def EVOR           : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evor $RT, $RA, $RB", IIC_VecFP>;
+def EVORC          : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evorc $RT, $RA, $RB", IIC_VecFP>;
+
+def EVRLWI         : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evrlwi $RT, $RA, $RB", IIC_VecFP>;
+def EVRLW          : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evrlw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVRNDW         : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA),
+                               "evrndw $RT, $RA", IIC_VecFP>;
+
+def EVSLWI         : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evslwi $RT, $RA, $RB", IIC_VecFP>;
+def EVSLW          : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evslw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSPLATFI      : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA),
+                               "evsplatfi $RT, $RA", IIC_VecFP>;
+def EVSPLATI       : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA),
+                               "evsplati $RT, $RA", IIC_VecFP>;
+
+def EVSRWIS        : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evsrwis $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWIU        : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evsrwiu $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWS         : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsrws $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWU         : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsrwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSTDDX        : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstddx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTDHX        : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstdhx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTDWX        : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstdwx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHEX       : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHOX       : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwhox $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWEX       : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwwex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWOX       : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwwox $RT, $RA, $RB", IIC_VecFP>;
+
+def EVSUBFSSIAAW   : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfssiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFSMIAAW   : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfsmiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUMIAAW   : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfumiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUSIAAW   : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfusiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFW        : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsubfw $RT, $RA, $RB", IIC_VecFP>;
+def EVSUBIFW       : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB),
+                               "evsubifw $RT, $RA, $RB", IIC_VecFP>;
+def EVXOR          : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evxor $RT, $RA, $RB", IIC_VecFP>;
+
+} // HasSPE
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
new file mode 100644
index 000000000000..0d9e3459f47e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -0,0 +1,2924 @@
+//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the VSX extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// *********************************** NOTE ***********************************
+// ** For POWER8 Little Endian, the VSX swap optimization relies on knowing  **
+// ** which VMX and VSX instructions are lane-sensitive and which are not.   **
+// ** A lane-sensitive instruction relies, implicitly or explicitly, on      **
+// ** whether lanes are numbered from left to right.  An instruction like    **
+// ** VADDFP is not lane-sensitive, because each lane of the result vector   **
+// ** relies only on the corresponding lane of the source vectors.  However, **
+// ** an instruction like VMULESB is lane-sensitive, because "even" and      **
+// ** "odd" lanes are different for big-endian and little-endian numbering.  **
+// **                                                                        **
+// ** When adding new VMX and VSX instructions, please consider whether they **
+// ** are lane-sensitive.  If so, they must be added to a switch statement   **
+// ** in PPCVSXSwapRemoval::gatherVectorInstructions().                      **
+// ****************************************************************************
+
+def PPCRegVSRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsrc : RegisterOperand<VSRC> {
+  let ParserMatchClass = PPCRegVSRCAsmOperand;
+}
+
+def PPCRegVSFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vsfrc : RegisterOperand<VSFRC> {
+  let ParserMatchClass = PPCRegVSFRCAsmOperand;
+}
+
+def PPCRegVSSRCAsmOperand : AsmOperandClass {
+  let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber";
+}
+def vssrc : RegisterOperand<VSSRC> {
+  let ParserMatchClass = PPCRegVSSRCAsmOperand;
+}
+
+// Little-endian-specific nodes.
+def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstxvd2x : SDTypeProfile<0, 2, [
+  SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCxxswapd : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+def SDTVecConv : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
+]>;
+
+def PPClxvd2x  : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
+                        [SDNPHasChain, SDNPMayStore]>;
+def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
+def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>;
+def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>;
+def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>;
+def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
+def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
+def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
+
+multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
+                    string asmstr, InstrItinClass itin, Intrinsic Int,
+                    ValueType OutTy, ValueType InTy> {
+  let BaseName = asmbase in {
+    def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>;
+    let Defs = [CR6] in
+    def o    : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       [(set InTy:$XT,
+                                (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>,
+                       isDOT;
+  }
+}
+
+// Instruction form with a single input register for instructions such as
+// XXPERMDI. The reason for defining this is that specifying multiple chained
+// operands (such as loads) to an instruction will perform both chained
+// operations rather than coalescing them into a single register - even though
+// the source memory location is the same. This simply forces the instruction
+// to use the same register for both inputs.
+// For example, an output DAG such as this:
+//   (XXPERMDI (LXSIBZX xoaddr:$src), (LXSIBZX xoaddr:$src ), 0))
+// would result in two load instructions emitted and used as separate inputs
+// to the XXPERMDI instruction.
+class XX3Form_2s<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : XX3Form_2<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+    let XB = XA;
+}
+
+def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
+def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
+def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
+def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;
+
+let Predicates = [HasVSX] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let UseVSXReg = 1 in {
+let hasSideEffects = 0 in { // VSX instructions don't have side effects.
+let Uses = [RM] in {
+
+  // Load indexed instructions
+  let mayLoad = 1 in {
+    let CodeSize = 3 in
+    def LXSDX : XX1Form<31, 588,
+                        (outs vsfrc:$XT), (ins memrr:$src),
+                        "lxsdx $XT, $src", IIC_LdStLFD,
+                        [(set f64:$XT, (load xoaddr:$src))]>;
+
+    let Predicates = [HasVSX, HasOnlySwappingMemOps] in
+    def LXVD2X : XX1Form<31, 844,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvd2x $XT, $src", IIC_LdStLFD,
+                         [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
+
+    def LXVDSX : XX1Form<31, 332,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvdsx $XT, $src", IIC_LdStLFD, []>;
+
+    let Predicates = [HasVSX, HasOnlySwappingMemOps] in
+    def LXVW4X : XX1Form<31, 780,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvw4x $XT, $src", IIC_LdStLFD,
+                         [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
+  } // mayLoad
+
+  // Store indexed instructions
+  let mayStore = 1 in {
+    let CodeSize = 3 in
+    def STXSDX : XX1Form<31, 716,
+                        (outs), (ins vsfrc:$XT, memrr:$dst),
+                        "stxsdx $XT, $dst", IIC_LdStSTFD,
+                        [(store f64:$XT, xoaddr:$dst)]>;
+
+    let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
+    // The behaviour of this instruction is endianness-specific so we provide no
+    // pattern to match it without considering endianness.
+    def STXVD2X : XX1Form<31, 972,
+                         (outs), (ins vsrc:$XT, memrr:$dst),
+                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
+                         []>;
+
+    def STXVW4X : XX1Form<31, 908,
+                         (outs), (ins vsrc:$XT, memrr:$dst),
+                         "stxvw4x $XT, $dst", IIC_LdStSTFD,
+                         [(store v4i32:$XT, xoaddr:$dst)]>;
+    }
+  } // mayStore
+
+  // Add/Mul Instructions
+  let isCommutable = 1 in {
+    def XSADDDP : XX3Form<60, 32,
+                          (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                          "xsadddp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>;
+    def XSMULDP : XX3Form<60, 48,
+                          (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                          "xsmuldp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>;
+
+    def XVADDDP : XX3Form<60, 96,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvadddp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>;
+
+    def XVADDSP : XX3Form<60, 64,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvaddsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>;
+
+    def XVMULDP : XX3Form<60, 112,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvmuldp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>;
+
+    def XVMULSP : XX3Form<60, 80,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xvmulsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>;
+  }
+
+  // Subtract Instructions
+  def XSSUBDP : XX3Form<60, 40,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xssubdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>;
+
+  def XVSUBDP : XX3Form<60, 104,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvsubdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>;
+  def XVSUBSP : XX3Form<60, 72,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvsubsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>;
+
+  // FMA Instructions
+  let BaseName = "XSMADDADP" in {
+  let isCommutable = 1 in
+  def XSMADDADP : XX3Form<60, 33,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMADDMDP : XX3Form<60, 41,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSMSUBADP" in {
+  let isCommutable = 1 in
+  def XSMSUBADP : XX3Form<60, 49,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMSUBMDP : XX3Form<60, 57,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMADDADP" in {
+  let isCommutable = 1 in
+  def XSNMADDADP : XX3Form<60, 161,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMADDMDP : XX3Form<60, 169,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMSUBADP" in {
+  let isCommutable = 1 in
+  def XSNMSUBADP : XX3Form<60, 177,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMSUBMDP : XX3Form<60, 185,
+                          (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
+                          "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMADDADP" in {
+  let isCommutable = 1 in
+  def XVMADDADP : XX3Form<60, 97,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMADDMDP : XX3Form<60, 105,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMADDASP" in {
+  let isCommutable = 1 in
+  def XVMADDASP : XX3Form<60, 65,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMADDMSP : XX3Form<60, 73,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMSUBADP" in {
+  let isCommutable = 1 in
+  def XVMSUBADP : XX3Form<60, 113,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMSUBMDP : XX3Form<60, 121,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVMSUBASP" in {
+  let isCommutable = 1 in
+  def XVMSUBASP : XX3Form<60, 81,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVMSUBMSP : XX3Form<60, 89,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMADDADP" in {
+  let isCommutable = 1 in
+  def XVNMADDADP : XX3Form<60, 225,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMADDMDP : XX3Form<60, 233,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMADDASP" in {
+  let isCommutable = 1 in
+  def XVNMADDASP : XX3Form<60, 193,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMADDMSP : XX3Form<60, 201,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMSUBADP" in {
+  let isCommutable = 1 in
+  def XVNMSUBADP : XX3Form<60, 241,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMSUBMDP : XX3Form<60, 249,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XVNMSUBASP" in {
+  let isCommutable = 1 in
+  def XVNMSUBASP : XX3Form<60, 209,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XVNMSUBMSP : XX3Form<60, 217,
+                          (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
+                          "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  // Division Instructions
+  def XSDIVDP : XX3Form<60, 56,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsdivdp $XT, $XA, $XB", IIC_FPDivD,
+                        [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>;
+  def XSSQRTDP : XX2Form<60, 75,
+                        (outs vsfrc:$XT), (ins vsfrc:$XB),
+                        "xssqrtdp $XT, $XB", IIC_FPSqrtD,
+                        [(set f64:$XT, (fsqrt f64:$XB))]>;
+
+  def XSREDP : XX2Form<60, 90,
+                        (outs vsfrc:$XT), (ins vsfrc:$XB),
+                        "xsredp $XT, $XB", IIC_VecFP,
+                        [(set f64:$XT, (PPCfre f64:$XB))]>;
+  def XSRSQRTEDP : XX2Form<60, 74,
+                           (outs vsfrc:$XT), (ins vsfrc:$XB),
+                           "xsrsqrtedp $XT, $XB", IIC_VecFP,
+                           [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+  def XSTDIVDP : XX3Form_1<60, 61,
+                         (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                         "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XSTSQRTDP : XX2Form_1<60, 106,
+                          (outs crrc:$crD), (ins vsfrc:$XB),
+                          "xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
+
+  def XVDIVDP : XX3Form<60, 120,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvdivdp $XT, $XA, $XB", IIC_FPDivD,
+                        [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>;
+  def XVDIVSP : XX3Form<60, 88,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvdivsp $XT, $XA, $XB", IIC_FPDivS,
+                        [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>;
+
+  def XVSQRTDP : XX2Form<60, 203,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvsqrtdp $XT, $XB", IIC_FPSqrtD,
+                        [(set v2f64:$XT, (fsqrt v2f64:$XB))]>;
+  def XVSQRTSP : XX2Form<60, 139,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvsqrtsp $XT, $XB", IIC_FPSqrtS,
+                        [(set v4f32:$XT, (fsqrt v4f32:$XB))]>;
+
+  def XVTDIVDP : XX3Form_1<60, 125,
+                         (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+                         "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XVTDIVSP : XX3Form_1<60, 93,
+                         (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
+                         "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+  def XVTSQRTDP : XX2Form_1<60, 234,
+                          (outs crrc:$crD), (ins vsrc:$XB),
+                          "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>;
+  def XVTSQRTSP : XX2Form_1<60, 170,
+                          (outs crrc:$crD), (ins vsrc:$XB),
+                          "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>;
+
+  def XVREDP : XX2Form<60, 218,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvredp $XT, $XB", IIC_VecFP,
+                        [(set v2f64:$XT, (PPCfre v2f64:$XB))]>;
+  def XVRESP : XX2Form<60, 154,
+                        (outs vsrc:$XT), (ins vsrc:$XB),
+                        "xvresp $XT, $XB", IIC_VecFP,
+                        [(set v4f32:$XT, (PPCfre v4f32:$XB))]>;
+
+  def XVRSQRTEDP : XX2Form<60, 202,
+                           (outs vsrc:$XT), (ins vsrc:$XB),
+                           "xvrsqrtedp $XT, $XB", IIC_VecFP,
+                           [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>;
+  def XVRSQRTESP : XX2Form<60, 138,
+                           (outs vsrc:$XT), (ins vsrc:$XB),
+                           "xvrsqrtesp $XT, $XB", IIC_VecFP,
+                           [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>;
+
+  // Compare Instructions
+  def XSCMPODP : XX3Form_1<60, 43,
+                           (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                           "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>;
+  def XSCMPUDP : XX3Form_1<60, 35,
+                           (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                           "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+  defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+                             "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
+  defm XVCMPEQSP : XX3Form_Rcr<60, 67,
+                             "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>;
+  defm XVCMPGEDP : XX3Form_Rcr<60, 115,
+                             "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpgedp, v2i64, v2f64>;
+  defm XVCMPGESP : XX3Form_Rcr<60, 83,
+                             "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpgesp, v4i32, v4f32>;
+  defm XVCMPGTDP : XX3Form_Rcr<60, 107,
+                             "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>;
+  defm XVCMPGTSP : XX3Form_Rcr<60, 75,
+                             "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                             int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>;
+
+  // Move Instructions
+  def XSABSDP : XX2Form<60, 345,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsabsdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fabs f64:$XB))]>;
+  def XSNABSDP : XX2Form<60, 361,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsnabsdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fneg (fabs f64:$XB)))]>;
+  def XSNEGDP : XX2Form<60, 377,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsnegdp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fneg f64:$XB))]>;
+  def XSCPSGNDP : XX3Form<60, 176,
+                      (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                      "xscpsgndp $XT, $XA, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>;
+
+  def XVABSDP : XX2Form<60, 473,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvabsdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fabs v2f64:$XB))]>;
+
+  def XVABSSP : XX2Form<60, 409,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvabssp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fabs v4f32:$XB))]>;
+
+  def XVCPSGNDP : XX3Form<60, 240,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xvcpsgndp $XT, $XA, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fcopysign v2f64:$XB, v2f64:$XA))]>;
+  def XVCPSGNSP : XX3Form<60, 208,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>;
+
+  def XVNABSDP : XX2Form<60, 489,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnabsdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>;
+  def XVNABSSP : XX2Form<60, 425,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnabssp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>;
+
+  def XVNEGDP : XX2Form<60, 505,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnegdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fneg v2f64:$XB))]>;
+  def XVNEGSP : XX2Form<60, 441,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvnegsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fneg v4f32:$XB))]>;
+
+  // Conversion Instructions
+  def XSCVDPSP : XX2Form<60, 265,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+  def XSCVDPSXDS : XX2Form<60, 344,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsxds $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctidz f64:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XSCVDPSXDSs : XX2Form<60, 344,
+                      (outs vssrc:$XT), (ins vssrc:$XB),
+                      "xscvdpsxds $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfctidz f32:$XB))]>;
+  def XSCVDPSXWS : XX2Form<60, 88,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpsxws $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiwz f64:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XSCVDPSXWSs : XX2Form<60, 88,
+                      (outs vssrc:$XT), (ins vssrc:$XB),
+                      "xscvdpsxws $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfctiwz f32:$XB))]>;
+  def XSCVDPUXDS : XX2Form<60, 328,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpuxds $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiduz f64:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XSCVDPUXDSs : XX2Form<60, 328,
+                      (outs vssrc:$XT), (ins vssrc:$XB),
+                      "xscvdpuxds $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfctiduz f32:$XB))]>;
+  def XSCVDPUXWS : XX2Form<60, 72,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvdpuxws $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfctiwuz f64:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XSCVDPUXWSs : XX2Form<60, 72,
+                      (outs vssrc:$XT), (ins vssrc:$XB),
+                      "xscvdpuxws $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfctiwuz f32:$XB))]>;
+  def XSCVSPDP : XX2Form<60, 329,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvspdp $XT, $XB", IIC_VecFP, []>;
+  def XSCVSXDDP : XX2Form<60, 376,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvsxddp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfcfid f64:$XB))]>;
+  def XSCVUXDDP : XX2Form<60, 360,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xscvuxddp $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (PPCfcfidu f64:$XB))]>;
+
+  def XVCVDPSP : XX2Form<60, 393,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (int_ppc_vsx_xvcvdpsp v2f64:$XB))]>;
+  def XVCVDPSXDS : XX2Form<60, 472,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsxds $XT, $XB", IIC_VecFP,
+                      [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>;
+  def XVCVDPSXWS : XX2Form<60, 216,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpsxws $XT, $XB", IIC_VecFP,
+                      [(set v4i32:$XT, (int_ppc_vsx_xvcvdpsxws v2f64:$XB))]>;
+  def XVCVDPUXDS : XX2Form<60, 456,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpuxds $XT, $XB", IIC_VecFP,
+                      [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>;
+  def XVCVDPUXWS : XX2Form<60, 200,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvdpuxws $XT, $XB", IIC_VecFP,
+                      [(set v4i32:$XT, (int_ppc_vsx_xvcvdpuxws v2f64:$XB))]>;
+
+  def XVCVSPDP : XX2Form<60, 457,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (int_ppc_vsx_xvcvspdp v4f32:$XB))]>;
+  def XVCVSPSXDS : XX2Form<60, 408,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspsxds $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPSXWS : XX2Form<60, 152,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspsxws $XT, $XB", IIC_VecFP,
+                      [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>;
+  def XVCVSPUXDS : XX2Form<60, 392,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspuxds $XT, $XB", IIC_VecFP, []>;
+  def XVCVSPUXWS : XX2Form<60, 136,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvspuxws $XT, $XB", IIC_VecFP,
+                      [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>;
+  def XVCVSXDDP : XX2Form<60, 504,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxddp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>;
+  def XVCVSXDSP : XX2Form<60, 440,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxdsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>;
+  def XVCVSXWDP : XX2Form<60, 248,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxwdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>;
+  def XVCVSXWSP : XX2Form<60, 184,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvsxwsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>;
+  def XVCVUXDDP : XX2Form<60, 488,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxddp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>;
+  def XVCVUXDSP : XX2Form<60, 424,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxdsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>;
+  def XVCVUXWDP : XX2Form<60, 232,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxwdp $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>;
+  def XVCVUXWSP : XX2Form<60, 168,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvcvuxwsp $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>;
+
+  // Rounding Instructions
+  def XSRDPI : XX2Form<60, 73,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpi $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fround f64:$XB))]>;
+  def XSRDPIC : XX2Form<60, 107,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpic $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fnearbyint f64:$XB))]>;
+  def XSRDPIM : XX2Form<60, 121,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpim $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (ffloor f64:$XB))]>;
+  def XSRDPIP : XX2Form<60, 105,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpip $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (fceil f64:$XB))]>;
+  def XSRDPIZ : XX2Form<60, 89,
+                      (outs vsfrc:$XT), (ins vsfrc:$XB),
+                      "xsrdpiz $XT, $XB", IIC_VecFP,
+                      [(set f64:$XT, (ftrunc f64:$XB))]>;
+
+  def XVRDPI : XX2Form<60, 201,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpi $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fround v2f64:$XB))]>;
+  def XVRDPIC : XX2Form<60, 235,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpic $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+  def XVRDPIM : XX2Form<60, 249,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpim $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (ffloor v2f64:$XB))]>;
+  def XVRDPIP : XX2Form<60, 233,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpip $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (fceil v2f64:$XB))]>;
+  def XVRDPIZ : XX2Form<60, 217,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrdpiz $XT, $XB", IIC_VecFP,
+                      [(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
+
+  def XVRSPI : XX2Form<60, 137,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspi $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fround v4f32:$XB))]>;
+  def XVRSPIC : XX2Form<60, 171,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspic $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+  def XVRSPIM : XX2Form<60, 185,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspim $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (ffloor v4f32:$XB))]>;
+  def XVRSPIP : XX2Form<60, 169,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspip $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (fceil v4f32:$XB))]>;
+  def XVRSPIZ : XX2Form<60, 153,
+                      (outs vsrc:$XT), (ins vsrc:$XB),
+                      "xvrspiz $XT, $XB", IIC_VecFP,
+                      [(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
+
+  // Max/Min Instructions
+  let isCommutable = 1 in {
+  def XSMAXDP : XX3Form<60, 160,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsmaxdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsfrc:$XT,
+                              (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
+  def XSMINDP : XX3Form<60, 168,
+                        (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                        "xsmindp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsfrc:$XT,
+                              (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
+
+  def XVMAXDP : XX3Form<60, 224,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmaxdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
+  def XVMINDP : XX3Form<60, 232,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmindp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
+
+  def XVMAXSP : XX3Form<60, 192,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvmaxsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
+  def XVMINSP : XX3Form<60, 200,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xvminsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
+  } // isCommutable
+} // Uses = [RM]
+
+  // Logical Instructions
+  let isCommutable = 1 in
+  def XXLAND : XX3Form<60, 130,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxland $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>;
+  def XXLANDC : XX3Form<60, 138,
+                        (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                        "xxlandc $XT, $XA, $XB", IIC_VecGeneral,
+                        [(set v4i32:$XT, (and v4i32:$XA,
+                                              (vnot_ppc v4i32:$XB)))]>;
+  let isCommutable = 1 in {
+  def XXLNOR : XX3Form<60, 162,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxlnor $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA,
+                                                   v4i32:$XB)))]>;
+  def XXLOR : XX3Form<60, 146,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                      "xxlor $XT, $XA, $XB", IIC_VecGeneral,
+                      [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>;
+  let isCodeGenOnly = 1 in
+  def XXLORf: XX3Form<60, 146,
+                      (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
+                      "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>;
+  def XXLXOR : XX3Form<60, 154,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxlxor $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
+  } // isCommutable
+  let isCodeGenOnly = 1 in
+  def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+                       "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+                       [(set v4i32:$XT, (v4i32 immAllZerosV))]>;
+
+  let isCodeGenOnly = 1 in {
+    def XXLXORdpz : XX3Form_SetZero<60, 154,
+                         (outs vsfrc:$XT), (ins),
+                         "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+                         [(set f64:$XT, (fpimm0))]>;
+    def XXLXORspz : XX3Form_SetZero<60, 154,
+                         (outs vssrc:$XT), (ins),
+                         "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+                         [(set f32:$XT, (fpimm0))]>;
+  }
+
+  // Permutation Instructions
+  def XXMRGHW : XX3Form<60, 18,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>;
+  def XXMRGLW : XX3Form<60, 50,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>;
+
+  def XXPERMDI : XX3Form_2<60, 10,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
+                       "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+  let isCodeGenOnly = 1 in
+  def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
+                             "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
+  def XXSEL : XX4Form<60, 3,
+                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
+                      "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
+
+  def XXSLDWI : XX3Form_2<60, 2,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW),
+                       "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm,
+                       [(set v4i32:$XT, (PPCvecshl v4i32:$XA, v4i32:$XB,
+                                                  imm32SExt16:$SHW))]>;
+  def XXSPLTW : XX2Form_2<60, 164,
+                       (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+                       "xxspltw $XT, $XB, $UIM", IIC_VecPerm,
+                       [(set v4i32:$XT,
+                             (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
+  let isCodeGenOnly = 1 in
+  def XXSPLTWs : XX2Form_2<60, 164,
+                       (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
+                       "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+} // hasSideEffects
+} // UseVSXReg = 1
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+
+  def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst),
+                             (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
+                             "#SELECT_CC_VSRC",
+                             []>;
+  def SELECT_VSRC: Pseudo<(outs vsrc:$dst),
+                          (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
+                          "#SELECT_VSRC",
+                          [(set v2f64:$dst,
+                                (select i1:$cond, v2f64:$T, v2f64:$F))]>;
+  def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst),
+                              (ins crrc:$cond, f8rc:$T, f8rc:$F,
+                               i32imm:$BROPC), "#SELECT_CC_VSFRC",
+                              []>;
+  def SELECT_VSFRC: Pseudo<(outs f8rc:$dst),
+                           (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
+                           "#SELECT_VSFRC",
+                           [(set f64:$dst,
+                                 (select i1:$cond, f64:$T, f64:$F))]>;
+  def SELECT_CC_VSSRC: Pseudo<(outs f4rc:$dst),
+                              (ins crrc:$cond, f4rc:$T, f4rc:$F,
+                               i32imm:$BROPC), "#SELECT_CC_VSSRC",
+                              []>;
+  def SELECT_VSSRC: Pseudo<(outs f4rc:$dst),
+                           (ins crbitrc:$cond, f4rc:$T, f4rc:$F),
+                           "#SELECT_VSSRC",
+                           [(set f32:$dst,
+                                 (select i1:$cond, f32:$T, f32:$F))]>;
+} // usesCustomInserter
+} // AddedComplexity
+
+def : InstAlias<"xvmovdp $XT, $XB",
+                (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+def : InstAlias<"xvmovsp $XT, $XB",
+                (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+
+def : InstAlias<"xxspltd $XT, $XB, 0",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+def : InstAlias<"xxmrghd $XT, $XA, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
+def : InstAlias<"xxmrgld $XT, $XA, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+def : InstAlias<"xxspltd $XT, $XB, 0",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+                (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
+
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+
+def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
+          (v4i32 (XXLNOR $A, $A))>;
+let Predicates = [IsBigEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+          (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+          (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+          (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+}
+
+let Predicates = [IsLittleEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+          (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
+                           (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+          (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+          (f64 (EXTRACT_SUBREG $S, sub_64))>;
+}
+
+// Additional fnmsub patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (XSNMSUBADP $B, $C, $A)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (XSNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B),
+          (XVNMSUBADP $B, $C, $A)>;
+def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B),
+          (XVNMSUBADP $B, $C, $A)>;
+
+def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
+          (XVNMSUBASP $B, $C, $A)>;
+def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
+          (XVNMSUBASP $B, $C, $A)>;
+
+def : Pat<(v2f64 (bitconvert v4f32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v4i32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v8i16:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v16i8:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2i64 (bitconvert v4f32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v4i32:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v8i16:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v16i8:$A)),
+          (COPY_TO_REGCLASS $A, VSRC)>;
+
+def : Pat<(v4f32 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2f64 (bitconvert v2i64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v2i64 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+def : Pat<(v2f64 (bitconvert v1i128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v1i128 (bitconvert v2f64:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
+// sign extension patterns
+// To extend "in place" from v2i32 to v2i64, we have input data like:
+// | undef | i32 | undef | i32 |
+// but xvcvsxwdp expects the input in big-Endian format:
+// | i32 | undef | i32 | undef |
+// so we need to shift everything to the left by one i32 (word) before
+// the conversion.
+def : Pat<(sext_inreg v2i64:$C, v2i32),
+          (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>;
+def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
+          (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
+
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
+          (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
+          (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
+          (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
+          (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
+
+// Loads.
+let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
+  def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+  // Stores.
+  def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+            (STXVD2X $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+            (STXVW4X $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
+            (STXVD2X $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
+            (STXVW4X $rS, xoaddr:$dst)>;
+  def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+}
+let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
+  def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+  def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+  def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+  def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+  def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+}
+
+// Permutes.
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+
+// Selects.
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
+          (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)),
+          (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
+          (SELECT_VSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)),
+          (SELECT_VSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
+          (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
+          (SELECT_VSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)),
+          (SELECT_VSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
+          (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)),
+          (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
+          (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+          (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+          (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+          (SELECT_VSFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+          (SELECT_VSFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+          (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+          (SELECT_VSFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+          (SELECT_VSFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+          (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+          (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+          (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// Divides.
+def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
+          (XVDIVSP $A, $B)>;
+def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
+          (XVDIVDP $A, $B)>;
+
+// Reciprocal estimate
+def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
+          (XVRESP $A)>;
+def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
+          (XVREDP $A)>;
+
+// Recip. square root estimate
+def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
+          (XVRSQRTESP $A)>;
+def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
+          (XVRSQRTEDP $A)>;
+
+let Predicates = [IsLittleEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+          (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+          (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+          (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+          (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+} // IsLittleEndian
+
+let Predicates = [IsBigEndian] in {
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+          (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+          (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+          (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+          (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+} // IsBigEndian
+
+} // AddedComplexity
+} // HasVSX
+
+def ScalarLoads {
+  dag Li8 =       (i32 (extloadi8 xoaddr:$src));
+  dag ZELi8 =     (i32 (zextloadi8 xoaddr:$src));
+  dag ZELi8i64 =  (i64 (zextloadi8 xoaddr:$src));
+  dag SELi8 =     (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
+  dag SELi8i64 =  (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
+
+  dag Li16 =      (i32 (extloadi16 xoaddr:$src));
+  dag ZELi16 =    (i32 (zextloadi16 xoaddr:$src));
+  dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
+  dag SELi16 =    (i32 (sextloadi16 xoaddr:$src));
+  dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
+
+  dag Li32 = (i32 (load xoaddr:$src));
+}
+
+// The following VSX instructions were introduced in Power ISA 2.07
+/* FIXME: if the operands are v2i64, these patterns will not match.
+   we should define new patterns or otherwise match the same patterns
+   when the elements are larger than i32.
+*/
+def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
+def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
+let Predicates = [HasP8Vector] in {
+let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+  let isCommutable = 1, UseVSXReg = 1 in {
+    def XXLEQV : XX3Form<60, 186,
+                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                         "xxleqv $XT, $XA, $XB", IIC_VecGeneral,
+                         [(set v4i32:$XT, (vnot_ppc (xor v4i32:$XA, v4i32:$XB)))]>;
+    def XXLNAND : XX3Form<60, 178,
+                          (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                          "xxlnand $XT, $XA, $XB", IIC_VecGeneral,
+                          [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
+                                                    v4i32:$XB)))]>;
+  } // isCommutable, UseVSXReg
+
+  def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
+            (XXLEQV $A, $B)>;
+
+  let UseVSXReg = 1 in {
+  def XXLORC : XX3Form<60, 170,
+                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                       "xxlorc $XT, $XA, $XB", IIC_VecGeneral,
+                       [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
+
+  // VSX scalar loads introduced in ISA 2.07
+  let mayLoad = 1 in {
+    let CodeSize = 3 in
+    def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+                         "lxsspx $XT, $src", IIC_LdStLFD,
+                         [(set f32:$XT, (load xoaddr:$src))]>;
+    def LXSIWAX : XX1Form<31, 76, (outs vsfrc:$XT), (ins memrr:$src),
+                          "lxsiwax $XT, $src", IIC_LdStLFD,
+                          [(set f64:$XT, (PPClfiwax xoaddr:$src))]>;
+    def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+                          "lxsiwzx $XT, $src", IIC_LdStLFD,
+                          [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+  } // mayLoad
+
+  // VSX scalar stores introduced in ISA 2.07
+  let mayStore = 1 in {
+    let CodeSize = 3 in
+    def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
+                          "stxsspx $XT, $dst", IIC_LdStSTFD,
+                          [(store f32:$XT, xoaddr:$dst)]>;
+    def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+                          "stxsiwx $XT, $dst", IIC_LdStSTFD,
+                          [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+  } // mayStore
+  } // UseVSXReg = 1
+
+  def : Pat<(f64 (extloadf32 xoaddr:$src)),
+            (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>;
+  def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))),
+            (f32 (LXSSPX xoaddr:$src))>;
+  def : Pat<(f64 (fpextend f32:$src)),
+            (COPY_TO_REGCLASS $src, VSFRC)>;
+
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+            (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+            (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+            (SELECT_VSSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+            (SELECT_VSSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+            (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+            (SELECT_VSSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+            (SELECT_VSSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+            (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+            (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+            (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+  let UseVSXReg = 1 in {
+  // VSX Elementary Scalar FP arithmetic (SP)
+  let isCommutable = 1 in {
+    def XSADDSP : XX3Form<60, 0,
+                          (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+                          "xsaddsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fadd f32:$XA, f32:$XB))]>;
+    def XSMULSP : XX3Form<60, 16,
+                          (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+                          "xsmulsp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>;
+  } // isCommutable
+
+  def XSDIVSP : XX3Form<60, 24,
+                        (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+                        "xsdivsp $XT, $XA, $XB", IIC_FPDivS,
+                        [(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>;
+  def XSRESP : XX2Form<60, 26,
+                        (outs vssrc:$XT), (ins vssrc:$XB),
+                        "xsresp $XT, $XB", IIC_VecFP,
+                        [(set f32:$XT, (PPCfre f32:$XB))]>;
+  def XSSQRTSP : XX2Form<60, 11,
+                        (outs vssrc:$XT), (ins vssrc:$XB),
+                        "xssqrtsp $XT, $XB", IIC_FPSqrtS,
+                        [(set f32:$XT, (fsqrt f32:$XB))]>;
+  def XSRSQRTESP : XX2Form<60, 10,
+                           (outs vssrc:$XT), (ins vssrc:$XB),
+                           "xsrsqrtesp $XT, $XB", IIC_VecFP,
+                           [(set f32:$XT, (PPCfrsqrte f32:$XB))]>;
+  def XSSUBSP : XX3Form<60, 8,
+                        (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
+                        "xssubsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
+
+  // FMA Instructions
+  let BaseName = "XSMADDASP" in {
+  let isCommutable = 1 in
+  def XSMADDASP : XX3Form<60, 1,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMADDMSP : XX3Form<60, 9,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSMSUBASP" in {
+  let isCommutable = 1 in
+  def XSMSUBASP : XX3Form<60, 17,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fma f32:$XA, f32:$XB,
+                                              (fneg f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMSUBMSP : XX3Form<60, 25,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMADDASP" in {
+  let isCommutable = 1 in
+  def XSNMADDASP : XX3Form<60, 129,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+                                                    f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMADDMSP : XX3Form<60, 137,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMSUBASP" in {
+  let isCommutable = 1 in
+  def XSNMSUBASP : XX3Form<60, 145,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+                                                    (fneg f32:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMSUBMSP : XX3Form<60, 153,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  // Single Precision Conversions (FP <-> INT)
+  def XSCVSXDSP : XX2Form<60, 312,
+                      (outs vssrc:$XT), (ins vsfrc:$XB),
+                      "xscvsxdsp $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfcfids f64:$XB))]>;
+  def XSCVUXDSP : XX2Form<60, 296,
+                      (outs vssrc:$XT), (ins vsfrc:$XB),
+                      "xscvuxdsp $XT, $XB", IIC_VecFP,
+                      [(set f32:$XT, (PPCfcfidus f64:$XB))]>;
+
+  // Conversions between vector and scalar single precision
+  def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB),
+                          "xscvdpspn $XT, $XB", IIC_VecFP, []>;
+  def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
+                          "xscvspdpn $XT, $XB", IIC_VecFP, []>;
+  } // UseVSXReg = 1
+
+  let Predicates = [IsLittleEndian] in {
+  def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+            (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+  def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+            (f32 (XSCVSXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+            (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+            (f32 (XSCVUXDSP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+  }
+
+  let Predicates = [IsBigEndian] in {
+  def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+            (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+  def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+            (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+            (f32 (XSCVUXDSP (COPY_TO_REGCLASS $S, VSFRC)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+            (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+  }
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
+            (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>;
+} // AddedComplexity = 400
+} // HasP8Vector
+
+let UseVSXReg = 1, AddedComplexity = 400 in {
+let Predicates = [HasDirectMove] in {
+  // VSX direct move instructions
+  def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
+                              "mfvsrd $rA, $XT", IIC_VecGeneral,
+                              [(set i64:$rA, (PPCmfvsr f64:$XT))]>,
+      Requires<[In64BitMode]>;
+  def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
+                               "mfvsrwz $rA, $XT", IIC_VecGeneral,
+                               [(set i32:$rA, (PPCmfvsr f64:$XT))]>;
+  def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$rA),
+                              "mtvsrd $XT, $rA", IIC_VecGeneral,
+                              [(set f64:$XT, (PPCmtvsra i64:$rA))]>,
+      Requires<[In64BitMode]>;
+  def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA),
+                               "mtvsrwa $XT, $rA", IIC_VecGeneral,
+                               [(set f64:$XT, (PPCmtvsra i32:$rA))]>;
+  def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
+                               "mtvsrwz $XT, $rA", IIC_VecGeneral,
+                               [(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
+} // HasDirectMove
+
+let Predicates = [IsISA3_0, HasDirectMove] in {
+  def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
+                              "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
+
+  def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+                       "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
+                       []>, Requires<[In64BitMode]>;
+
+  def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
+                              "mfvsrld $rA, $XT", IIC_VecGeneral,
+                              []>, Requires<[In64BitMode]>;
+
+} // IsISA3_0, HasDirectMove
+} // UseVSXReg = 1
+
+/*  Direct moves of various widths from GPR's into VSR's. Each move lines
+    the value up into element 0 (both BE and LE). Namely, entities smaller than
+    a doubleword are shifted left and moved for BE. For LE, they're moved, then
+    swapped to go into the least significant element of the VSR.
+*/
+def MovesToVSR {
+  dag BE_BYTE_0 =
+    (MTVSRD
+      (RLDICR
+        (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7));
+  dag BE_HALF_0 =
+    (MTVSRD
+      (RLDICR
+        (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15));
+  dag BE_WORD_0 =
+    (MTVSRD
+      (RLDICR
+        (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31));
+  dag BE_DWORD_0 = (MTVSRD $A);
+
+  dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32));
+  dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                        LE_MTVSRW, sub_64));
+  dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2);
+  dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                         BE_DWORD_0, sub_64));
+  dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2);
+}
+
+/*  Patterns for extracting elements out of vectors. Integer elements are
+    extracted using direct move operations. Patterns for extracting elements
+    whose indices are not available at compile time are also provided with
+    various _VARIABLE_ patterns.
+    The numbering for the DAG's is for LE, but when used on BE, the correct
+    LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13).
+*/
+def VectorExtractions {
+  // Doubleword extraction
+  dag LE_DWORD_0 =
+    (MFVSRD
+      (EXTRACT_SUBREG
+        (XXPERMDI (COPY_TO_REGCLASS $S, VSRC),
+                  (COPY_TO_REGCLASS $S, VSRC), 2), sub_64));
+  dag LE_DWORD_1 = (MFVSRD
+                     (EXTRACT_SUBREG
+                       (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+
+  // Word extraction
+  dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64));
+  dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64));
+  dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG
+                             (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64));
+  dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64));
+
+  // Halfword extraction
+  dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32));
+  dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32));
+  dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32));
+  dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32));
+  dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32));
+  dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32));
+  dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32));
+  dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32));
+
+  // Byte extraction
+  dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32));
+  dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32));
+  dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32));
+  dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32));
+  dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32));
+  dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32));
+  dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32));
+  dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32));
+  dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32));
+  dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32));
+  dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32));
+  dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32));
+  dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32));
+  dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32));
+  dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32));
+  dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32));
+
+  /* Variable element number (BE and LE patterns must be specified separately)
+     This is a rather involved process.
+
+     Conceptually, this is how the move is accomplished:
+     1. Identify which doubleword contains the element
+     2. Shift in the VMX register so that the correct doubleword is correctly
+        lined up for the MFVSRD
+     3. Perform the move so that the element (along with some extra stuff)
+        is in the GPR
+     4. Right shift within the GPR so that the element is right-justified
+
+     Of course, the index is an element number which has a different meaning
+     on LE/BE so the patterns have to be specified separately.
+
+     Note: The final result will be the element right-justified with high
+           order bits being arbitrarily defined (namely, whatever was in the
+           vector register to the left of the value originally).
+  */
+
+  /*  LE variable byte
+      Number 1. above:
+      - For elements 0-7, we shift left by 8 bytes since they're on the right
+      - For elements 8-15, we need not shift (shift left by zero bytes)
+      This is accomplished by inverting the bits of the index and AND-ing
+      with 0x8 (i.e. clearing all bits of the index and inverting bit 60).
+  */
+  dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx));
+
+  //  Number 2. above:
+  //  - Now that we set up the shift amount, we shift in the VMX register
+  dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC);
+
+  //  Number 3. above:
+  //  - The doubleword containing our element is moved to a GPR
+  dag LE_MV_VBYTE = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)),
+                        sub_64));
+
+  /*  Number 4. above:
+      - Truncate the element number to the range 0-7 (8-15 are symmetrical
+        and out of range values are truncated accordingly)
+      - Multiply by 8 as we need to shift right by the number of bits, not bytes
+      - Shift right in the GPR by the calculated value
+  */
+  dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60),
+                                       sub_32);
+  dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT),
+                                         sub_32);
+
+  /*  LE variable halfword
+      Number 1. above:
+      - For elements 0-3, we shift left by 8 since they're on the right
+      - For elements 4-7, we need not shift (shift left by zero bytes)
+      Similarly to the byte pattern, we invert the bits of the index, but we
+      AND with 0x4 (i.e. clear all bits of the index and invert bit 61).
+      Of course, the shift is still by 8 bytes, so we must multiply by 2.
+  */
+  dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62));
+
+  //  Number 2. above:
+  //  - Now that we set up the shift amount, we shift in the VMX register
+  dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC);
+
+  //  Number 3. above:
+  //  - The doubleword containing our element is moved to a GPR
+  dag LE_MV_VHALF = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)),
+                        sub_64));
+
+  /*  Number 4. above:
+      - Truncate the element number to the range 0-3 (4-7 are symmetrical
+        and out of range values are truncated accordingly)
+      - Multiply by 16 as we need to shift right by the number of bits
+      - Shift right in the GPR by the calculated value
+  */
+  dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59),
+                                       sub_32);
+  dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT),
+                                         sub_32);
+
+  /*  LE variable word
+      Number 1. above:
+      - For elements 0-1, we shift left by 8 since they're on the right
+      - For elements 2-3, we need not shift
+  */
+  dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61));
+
+  //  Number 2. above:
+  //  - Now that we set up the shift amount, we shift in the VMX register
+  dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC);
+
+  //  Number 3. above:
+  //  - The doubleword containing our element is moved to a GPR
+  dag LE_MV_VWORD = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)),
+                        sub_64));
+
+  /*  Number 4. above:
+      - Truncate the element number to the range 0-1 (2-3 are symmetrical
+        and out of range values are truncated accordingly)
+      - Multiply by 32 as we need to shift right by the number of bits
+      - Shift right in the GPR by the calculated value
+  */
+  dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58),
+                                       sub_32);
+  dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT),
+                                         sub_32);
+
+  /*  LE variable doubleword
+      Number 1. above:
+      - For element 0, we shift left by 8 since it's on the right
+      - For element 1, we need not shift
+  */
+  dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60));
+
+  //  Number 2. above:
+  //  - Now that we set up the shift amount, we shift in the VMX register
+  dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC);
+
+  // Number 3. above:
+  //  - The doubleword containing our element is moved to a GPR
+  //  - Number 4. is not needed for the doubleword as the value is 64-bits
+  dag LE_VARIABLE_DWORD =
+        (MFVSRD (EXTRACT_SUBREG
+                  (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)),
+                  sub_64));
+
+  /*  LE variable float
+      - Shift the vector to line up the desired element to BE Word 0
+      - Convert 32-bit float to a 64-bit single precision float
+  */
+  dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61));
+  dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC);
+  dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE);
+
+  /*  LE variable double
+      Same as the LE doubleword except there is no move.
+  */
+  dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+                                  (COPY_TO_REGCLASS $S, VRRC),
+                                  LE_VDWORD_PERM_VEC);
+  dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC);
+
+  /*  BE variable byte
+      The algorithm here is the same as the LE variable byte except:
+      - The shift in the VMX register is by 0/8 for opposite element numbers so
+        we simply AND the element number with 0x8
+      - The order of elements after the move to GPR is reversed, so we invert
+        the bits of the index prior to truncating to the range 0-7
+  */
+  dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8));
+  dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC);
+  dag BE_MV_VBYTE = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)),
+                        sub_64));
+  dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60),
+                                       sub_32);
+  dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT),
+                                         sub_32);
+
+  /*  BE variable halfword
+      The algorithm here is the same as the LE variable halfword except:
+      - The shift in the VMX register is by 0/8 for opposite element numbers so
+        we simply AND the element number with 0x4 and multiply by 2
+      - The order of elements after the move to GPR is reversed, so we invert
+        the bits of the index prior to truncating to the range 0-3
+  */
+  dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62));
+  dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC);
+  dag BE_MV_VHALF = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)),
+                        sub_64));
+  dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59),
+                                       sub_32);
+  dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT),
+                                         sub_32);
+
+  /*  BE variable word
+      The algorithm is the same as the LE variable word except:
+      - The shift in the VMX register happens for opposite element numbers
+      - The order of elements after the move to GPR is reversed, so we invert
+        the bits of the index prior to truncating to the range 0-1
+  */
+  dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61));
+  dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC);
+  dag BE_MV_VWORD = (MFVSRD
+                      (EXTRACT_SUBREG
+                        (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)),
+                        sub_64));
+  dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58),
+                                       sub_32);
+  dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT),
+                                         sub_32);
+
+  /*  BE variable doubleword
+      Same as the LE doubleword except we shift in the VMX register for opposite
+      element indices.
+  */
+  dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60));
+  dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC);
+  dag BE_VARIABLE_DWORD =
+        (MFVSRD (EXTRACT_SUBREG
+                  (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)),
+                  sub_64));
+
+  /*  BE variable float
+      - Shift the vector to line up the desired element to BE Word 0
+      - Convert 32-bit float to a 64-bit single precision float
+  */
+  dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61));
+  dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC);
+  dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE);
+
+  /* BE variable double
+      Same as the BE doubleword except there is no move.
+  */
+  dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC),
+                                  (COPY_TO_REGCLASS $S, VRRC),
+                                  BE_VDWORD_PERM_VEC);
+  dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
+}
+
+let AddedComplexity = 400 in {
+// v4f32 scalar <-> vector conversions (BE)
+let Predicates = [IsBigEndian, HasP8Vector] in {
+  def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+            (v4f32 (XSCVDPSPN $A))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+            (f32 (XSCVSPDPN $S))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+            (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+            (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+            (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+            (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
+} // IsBigEndian, HasP8Vector
+
+// Variable index vector_extract for v2f64 does not require P8Vector
+let Predicates = [IsBigEndian, HasVSX] in
+  def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+            (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
+
+let Predicates = [IsBigEndian, HasDirectMove] in {
+  // v16i8 scalar <-> vector conversions (BE)
+  def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+            (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
+  def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+            (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
+  def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+            (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+  def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+            (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+            (i32 VectorExtractions.LE_BYTE_15)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+            (i32 VectorExtractions.LE_BYTE_14)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+            (i32 VectorExtractions.LE_BYTE_13)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+            (i32 VectorExtractions.LE_BYTE_12)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+            (i32 VectorExtractions.LE_BYTE_11)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+            (i32 VectorExtractions.LE_BYTE_10)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+            (i32 VectorExtractions.LE_BYTE_9)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+            (i32 VectorExtractions.LE_BYTE_8)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+            (i32 VectorExtractions.LE_BYTE_7)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+            (i32 VectorExtractions.LE_BYTE_6)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+            (i32 VectorExtractions.LE_BYTE_5)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+            (i32 VectorExtractions.LE_BYTE_4)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+            (i32 VectorExtractions.LE_BYTE_3)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+            (i32 VectorExtractions.LE_BYTE_2)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+            (i32 VectorExtractions.LE_BYTE_1)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+            (i32 VectorExtractions.LE_BYTE_0)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+            (i32 VectorExtractions.BE_VARIABLE_BYTE)>;
+
+  // v8i16 scalar <-> vector conversions (BE)
+  def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+            (i32 VectorExtractions.LE_HALF_7)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+            (i32 VectorExtractions.LE_HALF_6)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+            (i32 VectorExtractions.LE_HALF_5)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+            (i32 VectorExtractions.LE_HALF_4)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+            (i32 VectorExtractions.LE_HALF_3)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+            (i32 VectorExtractions.LE_HALF_2)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+            (i32 VectorExtractions.LE_HALF_1)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+            (i32 VectorExtractions.LE_HALF_0)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+            (i32 VectorExtractions.BE_VARIABLE_HALF)>;
+
+  // v4i32 scalar <-> vector conversions (BE)
+  def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+            (i32 VectorExtractions.LE_WORD_3)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+            (i32 VectorExtractions.LE_WORD_2)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+            (i32 VectorExtractions.LE_WORD_1)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+            (i32 VectorExtractions.LE_WORD_0)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+            (i32 VectorExtractions.BE_VARIABLE_WORD)>;
+
+  // v2i64 scalar <-> vector conversions (BE)
+  def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+            (i64 VectorExtractions.LE_DWORD_1)>;
+  def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+            (i64 VectorExtractions.LE_DWORD_0)>;
+  def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+            (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
+} // IsBigEndian, HasDirectMove
+
+// v4f32 scalar <-> vector conversions (LE)
+let Predicates = [IsLittleEndian, HasP8Vector] in {
+  def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+            (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+            (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+            (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+            (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+            (f32 (XSCVSPDPN $S))>;
+  def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+            (f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
+} // IsLittleEndian, HasP8Vector
+
+// Variable index vector_extract for v2f64 does not require P8Vector
+let Predicates = [IsLittleEndian, HasVSX] in
+  def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+            (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
+
+  def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+  def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+let Predicates = [IsLittleEndian, HasDirectMove] in {
+  // v16i8 scalar <-> vector conversions (LE)
+  def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+            (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+  def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+            (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
+  def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+            (v4i32 MovesToVSR.LE_WORD_0)>;
+  def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+            (v2i64 MovesToVSR.LE_DWORD_0)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+            (i32 VectorExtractions.LE_BYTE_0)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+            (i32 VectorExtractions.LE_BYTE_1)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+            (i32 VectorExtractions.LE_BYTE_2)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+            (i32 VectorExtractions.LE_BYTE_3)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+            (i32 VectorExtractions.LE_BYTE_4)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+            (i32 VectorExtractions.LE_BYTE_5)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+            (i32 VectorExtractions.LE_BYTE_6)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+            (i32 VectorExtractions.LE_BYTE_7)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+            (i32 VectorExtractions.LE_BYTE_8)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+            (i32 VectorExtractions.LE_BYTE_9)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+            (i32 VectorExtractions.LE_BYTE_10)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+            (i32 VectorExtractions.LE_BYTE_11)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+            (i32 VectorExtractions.LE_BYTE_12)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+            (i32 VectorExtractions.LE_BYTE_13)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+            (i32 VectorExtractions.LE_BYTE_14)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+            (i32 VectorExtractions.LE_BYTE_15)>;
+  def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+            (i32 VectorExtractions.LE_VARIABLE_BYTE)>;
+
+  // v8i16 scalar <-> vector conversions (LE)
+  def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+            (i32 VectorExtractions.LE_HALF_0)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+            (i32 VectorExtractions.LE_HALF_1)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+            (i32 VectorExtractions.LE_HALF_2)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+            (i32 VectorExtractions.LE_HALF_3)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+            (i32 VectorExtractions.LE_HALF_4)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+            (i32 VectorExtractions.LE_HALF_5)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+            (i32 VectorExtractions.LE_HALF_6)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+            (i32 VectorExtractions.LE_HALF_7)>;
+  def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+            (i32 VectorExtractions.LE_VARIABLE_HALF)>;
+
+  // v4i32 scalar <-> vector conversions (LE)
+  def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+            (i32 VectorExtractions.LE_WORD_0)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+            (i32 VectorExtractions.LE_WORD_1)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+            (i32 VectorExtractions.LE_WORD_2)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+            (i32 VectorExtractions.LE_WORD_3)>;
+  def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+            (i32 VectorExtractions.LE_VARIABLE_WORD)>;
+
+  // v2i64 scalar <-> vector conversions (LE)
+  def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+            (i64 VectorExtractions.LE_DWORD_0)>;
+  def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+            (i64 VectorExtractions.LE_DWORD_1)>;
+  def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+            (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
+} // IsLittleEndian, HasDirectMove
+
+let Predicates = [HasDirectMove, HasVSX] in {
+// bitconvert f32 -> i32
+// (convert to 32-bit fp single, shift right 1 word, move to GPR)
+def : Pat<(i32 (bitconvert f32:$S)),
+          (i32 (MFVSRWZ (EXTRACT_SUBREG
+                          (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3),
+                          sub_64)))>;
+// bitconvert i32 -> f32
+// (move to FPR, shift left 1 word, convert to 64-bit fp single)
+def : Pat<(f32 (bitconvert i32:$A)),
+          (f32 (XSCVSPDPN
+                 (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
+
+// bitconvert f64 -> i64
+// (move to GPR, nothing else needed)
+def : Pat<(i64 (bitconvert f64:$S)),
+          (i64 (MFVSRD $S))>;
+
+// bitconvert i64 -> f64
+// (move to FPR, nothing else needed)
+def : Pat<(f64 (bitconvert i64:$S)),
+          (f64 (MTVSRD $S))>;
+}
+
+// Materialize a zero-vector of long long
+def : Pat<(v2i64 immAllZerosV),
+          (v2i64 (XXLXORz))>;
+}
+
+def AlignValues {
+  dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3));
+  dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
+}
+
+// The following VSX instructions were introduced in Power ISA 3.0
+def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
+let AddedComplexity = 400, Predicates = [HasP9Vector] in {
+
+  // [PO VRT XO VRB XO /]
+  class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+                      list<dag> pattern>
+    : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
+                    !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+  // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+  class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+                         list<dag> pattern>
+    : X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isDOT;
+
+  // [PO VRT XO VRB XO /], but the VRB is only used the left 64 bits (or less),
+  // So we use different operand class for VRB
+  class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+                           RegisterOperand vbtype, list<dag> pattern>
+    : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
+                    !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+  let UseVSXReg = 1 in {
+  // [PO T XO B XO BX /]
+  class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+                        list<dag> pattern>
+    : XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
+                      !strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
+
+  // [PO T XO B XO BX TX]
+  class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+                        RegisterOperand vtype, list<dag> pattern>
+    : XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
+                      !strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
+
+  // [PO T A B XO AX BX TX], src and dest register use different operand class
+  class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
+                  RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
+                  InstrItinClass itin, list<dag> pattern>
+    : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
+              !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
+  } // UseVSXReg = 1
+
+  // [PO VRT VRA VRB XO /]
+  class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+                      list<dag> pattern>
+    : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
+              !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
+
+  // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+  class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
+                         list<dag> pattern>
+    : X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isDOT;
+
+  //===--------------------------------------------------------------------===//
+  // Quad-Precision Scalar Move Instructions:
+
+  // Copy Sign
+  def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp", []>;
+
+  // Absolute/Negative-Absolute/Negate
+  def XSABSQP   : X_VT5_XO5_VB5<63,  0, 804, "xsabsqp" , []>;
+  def XSNABSQP  : X_VT5_XO5_VB5<63,  8, 804, "xsnabsqp", []>;
+  def XSNEGQP   : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp" , []>;
+
+  //===--------------------------------------------------------------------===//
+  // Quad-Precision Scalar Floating-Point Arithmetic Instructions:
+
+  // Add/Divide/Multiply/Subtract
+  def XSADDQP   : X_VT5_VA5_VB5   <63,   4, "xsaddqp" , []>;
+  def XSADDQPO  : X_VT5_VA5_VB5_Ro<63,   4, "xsaddqpo", []>;
+  def XSDIVQP   : X_VT5_VA5_VB5   <63, 548, "xsdivqp" , []>;
+  def XSDIVQPO  : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo", []>;
+  def XSMULQP   : X_VT5_VA5_VB5   <63,  36, "xsmulqp" , []>;
+  def XSMULQPO  : X_VT5_VA5_VB5_Ro<63,  36, "xsmulqpo", []>;
+  def XSSUBQP   : X_VT5_VA5_VB5   <63, 516, "xssubqp" , []>;
+  def XSSUBQPO  : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo", []>;
+
+  // Square-Root
+  def XSSQRTQP  : X_VT5_XO5_VB5   <63, 27, 804, "xssqrtqp" , []>;
+  def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo", []>;
+
+  // (Negative) Multiply-{Add/Subtract}
+  def XSMADDQP  : X_VT5_VA5_VB5   <63, 388, "xsmaddqp"  , []>;
+  def XSMADDQPO : X_VT5_VA5_VB5_Ro<63, 388, "xsmaddqpo" , []>;
+  def XSMSUBQP  : X_VT5_VA5_VB5   <63, 420, "xsmsubqp"  , []>;
+  def XSMSUBQPO : X_VT5_VA5_VB5_Ro<63, 420, "xsmsubqpo" , []>;
+  def XSNMADDQP : X_VT5_VA5_VB5   <63, 452, "xsnmaddqp" , []>;
+  def XSNMADDQPO: X_VT5_VA5_VB5_Ro<63, 452, "xsnmaddqpo", []>;
+  def XSNMSUBQP : X_VT5_VA5_VB5   <63, 484, "xsnmsubqp" , []>;
+  def XSNMSUBQPO: X_VT5_VA5_VB5_Ro<63, 484, "xsnmsubqpo", []>;
+
+  //===--------------------------------------------------------------------===//
+  // Quad/Double-Precision Compare Instructions:
+
+  // [PO BF // VRA VRB XO /]
+  class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+                      list<dag> pattern>
+    : XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
+               !strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
+    let Pattern = pattern;
+  }
+
+  // QP Compare Ordered/Unordered
+  def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
+  def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
+
+  // DP/QP Compare Exponents
+  def XSCMPEXPDP : XX3Form_1<60, 59,
+                             (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                             "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>,
+                   UseVSXReg;
+  def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
+
+  // DP Compare ==, >=, >, !=
+  // Use vsrc for XT, because the entire register of XT is set.
+  // XT.dword[1] = 0x0000_0000_0000_0000
+  def XSCMPEQDP : XX3_XT5_XA5_XB5<60,  3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
+                                  IIC_FPCompare, []>;
+  def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
+                                  IIC_FPCompare, []>;
+  def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
+                                  IIC_FPCompare, []>;
+  def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
+                                  IIC_FPCompare, []>;
+  let UseVSXReg = 1 in {
+  // Vector Compare Not Equal
+  def XVCMPNEDP  : XX3Form_Rc<60, 123,
+                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvcmpnedp  $XT, $XA, $XB", IIC_VecFPCompare, []>;
+  let Defs = [CR6] in
+  def XVCMPNEDPo : XX3Form_Rc<60, 123,
+                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvcmpnedp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+                              isDOT;
+  def XVCMPNESP  : XX3Form_Rc<60,  91,
+                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvcmpnesp  $XT, $XA, $XB", IIC_VecFPCompare, []>;
+  let Defs = [CR6] in
+  def XVCMPNESPo : XX3Form_Rc<60,  91,
+                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
+                              isDOT;
+  } // UseVSXReg = 1
+
+  //===--------------------------------------------------------------------===//
+  // Quad-Precision Floating-Point Conversion Instructions:
+
+  // Convert DP -> QP
+  def XSCVDPQP  : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, []>;
+
+  // Round & Convert QP -> DP (dword[1] is set to zero)
+  def XSCVQPDP  : X_VT5_XO5_VB5   <63, 20, 836, "xscvqpdp" , []>;
+  def XSCVQPDPO : X_VT5_XO5_VB5_Ro<63, 20, 836, "xscvqpdpo", []>;
+
+  // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
+  def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
+  def XSCVQPSWZ : X_VT5_XO5_VB5<63,  9, 836, "xscvqpswz", []>;
+  def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
+  def XSCVQPUWZ : X_VT5_XO5_VB5<63,  1, 836, "xscvqpuwz", []>;
+
+  // Convert (Un)Signed DWord -> QP
+  def XSCVSDQP  : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+  def XSCVUDQP  : X_VT5_XO5_VB5_TyVB<63,  2, 836, "xscvudqp", vfrc, []>;
+
+  let UseVSXReg = 1 in {
+  //===--------------------------------------------------------------------===//
+  // Round to Floating-Point Integer Instructions
+
+  // (Round &) Convert DP <-> HP
+  // Note! xscvdphp's src and dest register both use the left 64 bits, so we use
+  // vsfrc for src and dest register. xscvhpdp's src only use the left 16 bits,
+  // but we still use vsfrc for it.
+  def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
+  def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
+
+  // Vector HP -> SP
+  def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
+  def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc,
+                                 [(set v4f32:$XT,
+                                     (int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
+
+  } // UseVSXReg = 1
+
+  // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
+  // seperate pattern so that it can convert the input register class from
+  // VRRC(v8i16) to VSRC.
+  def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
+            (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
+
+  class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
+                                list<dag> pattern>
+    : Z23Form_1<opcode, xo,
+                (outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
+                !strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
+    let RC = ex;
+  }
+
+  // Round to Quad-Precision Integer [with Inexact]
+  def XSRQPI   : Z23_VT5_R1_VB5_RMC2_EX1<63,  5, 0, "xsrqpi" , []>;
+  def XSRQPIX  : Z23_VT5_R1_VB5_RMC2_EX1<63,  5, 1, "xsrqpix", []>;
+
+  // Round Quad-Precision to Double-Extended Precision (fp80)
+  def XSRQPXP  : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
+
+  //===--------------------------------------------------------------------===//
+  // Insert/Extract Instructions
+
+  // Insert Exponent DP/QP
+  // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
+  def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+                          "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg;
+  // vB NOTE: only vB.dword[0] is used, that's why we don't use
+  //          X_VT5_VA5_VB5 form
+  def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
+                          "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
+
+  // Extract Exponent/Significand DP/QP
+  def XSXEXPDP : XX2_RT5_XO5_XB6<60,  0, 347, "xsxexpdp", []>;
+  def XSXSIGDP : XX2_RT5_XO5_XB6<60,  1, 347, "xsxsigdp", []>;
+
+  def XSXEXPQP : X_VT5_XO5_VB5  <63,  2, 804, "xsxexpqp", []>;
+  def XSXSIGQP : X_VT5_XO5_VB5  <63, 18, 804, "xsxsigqp", []>;
+
+  // Vector Insert Word
+  let UseVSXReg = 1 in {
+  // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
+  def XXINSERTW   :
+    XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
+                     (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
+                     "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
+                     [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB,
+                                                   imm32SExt16:$UIM))]>,
+                     RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+
+  // Vector Extract Unsigned Word
+  def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
+                                  (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
+                                  "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
+  } // UseVSXReg = 1
+
+  // Vector Insert Exponent DP/SP
+  def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
+    IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>;
+  def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
+    IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>;
+
+  // Vector Extract Exponent/Significand DP/SP
+  def XVXEXPDP : XX2_XT6_XO5_XB6<60,  0, 475, "xvxexpdp", vsrc,
+                                 [(set v2i64: $XT,
+                                  (int_ppc_vsx_xvxexpdp v2f64:$XB))]>;
+  def XVXEXPSP : XX2_XT6_XO5_XB6<60,  8, 475, "xvxexpsp", vsrc,
+                                 [(set v4i32: $XT,
+                                  (int_ppc_vsx_xvxexpsp v4f32:$XB))]>;
+  def XVXSIGDP : XX2_XT6_XO5_XB6<60,  1, 475, "xvxsigdp", vsrc,
+                                 [(set v2i64: $XT,
+                                  (int_ppc_vsx_xvxsigdp v2f64:$XB))]>;
+  def XVXSIGSP : XX2_XT6_XO5_XB6<60,  9, 475, "xvxsigsp", vsrc,
+                                 [(set v4i32: $XT,
+                                  (int_ppc_vsx_xvxsigsp v4f32:$XB))]>;
+
+  let AddedComplexity = 400, Predicates = [HasP9Vector] in {
+  // Extra patterns expanding to vector Extract Word/Insert Word
+  def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)),
+            (v4i32 (XXINSERTW $A, $B, imm:$IMM))>;
+  def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)),
+            (v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>;
+  } // AddedComplexity = 400, HasP9Vector
+
+  //===--------------------------------------------------------------------===//
+
+  // Test Data Class SP/DP/QP
+  let UseVSXReg = 1 in {
+  def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
+                              (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+                              "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
+  def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
+                              (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+                              "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
+  } // UseVSXReg = 1
+  def XSTSTDCQP : X_BF3_DCMX7_RS5  <63, 708,
+                              (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
+                              "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
+
+  // Vector Test Data Class SP/DP
+  let UseVSXReg = 1 in {
+  def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
+                              (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+                              "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
+                              [(set v4i32: $XT,
+                               (int_ppc_vsx_xvtstdcsp v4f32:$XB, imm:$DCMX))]>;
+  def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
+                              (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+                              "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
+                              [(set v2i64: $XT,
+                               (int_ppc_vsx_xvtstdcdp v2f64:$XB, imm:$DCMX))]>;
+  } // UseVSXReg = 1
+
+  //===--------------------------------------------------------------------===//
+
+  // Maximum/Minimum Type-C/Type-J DP
+  // XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU, so we use vsrc for XT
+  def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsrc, vsfrc, vsfrc,
+                                 IIC_VecFP, []>;
+  def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
+                                 IIC_VecFP, []>;
+  def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsrc, vsfrc, vsfrc,
+                                 IIC_VecFP, []>;
+  def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
+                                 IIC_VecFP, []>;
+
+  //===--------------------------------------------------------------------===//
+
+  // Vector Byte-Reverse H/W/D/Q Word
+  def XXBRH : XX2_XT6_XO5_XB6<60,  7, 475, "xxbrh", vsrc, []>;
+  def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>;
+  def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>;
+  def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+
+  // Vector Permute
+  def XXPERM  : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
+                                IIC_VecPerm, []>;
+  def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
+                                IIC_VecPerm, []>;
+
+  // Vector Splat Immediate Byte
+  def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
+                            "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg;
+
+  //===--------------------------------------------------------------------===//
+  // Vector/Scalar Load/Store Instructions
+
+  // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+  // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+  let mayLoad = 1 in {
+  // Load Vector
+  def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
+                            "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
+  // Load DWord
+  def LXSD  : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
+                       "lxsd $vD, $src", IIC_LdStLFD, []>;
+  // Load SP from src, convert it to DP, and place in dword[0]
+  def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
+                       "lxssp $vD, $src", IIC_LdStLFD, []>;
+
+  // [PO T RA RB XO TX] almost equal to [PO S RA RB XO SX], but has different
+  // "out" and "in" dag
+  class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+                      RegisterOperand vtype, list<dag> pattern>
+    : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+              !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;
+
+  // Load as Integer Byte/Halfword & Zero Indexed
+  def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
+                              [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
+  def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
+                              [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
+
+  // Load Vector Halfword*8/Byte*16 Indexed
+  def LXVH8X  : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
+  def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
+
+  // Load Vector Indexed
+  def LXVX    : X_XT6_RA5_RB5<31, 268, "lxvx"   , vsrc,
+                [(set v2f64:$XT, (load xoaddr:$src))]>;
+
+  // Load Vector (Left-justified) with Length
+  def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+                   "lxvl $XT, $src, $rB", IIC_LdStLoad,
+                   [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>,
+                    UseVSXReg;
+  def LXVLL : XX1Form<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+                   "lxvll $XT, $src, $rB", IIC_LdStLoad,
+                   [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>,
+                    UseVSXReg;
+
+  // Load Vector Word & Splat Indexed
+  def LXVWSX  : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
+  } // mayLoad
+
+  // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+  // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+  let mayStore = 1 in {
+  // Store Vector
+  def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
+                             "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
+  // Store DWord
+  def STXSD  : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
+                        "stxsd $vS, $dst", IIC_LdStSTFD, []>;
+  // Convert DP of dword[0] to SP, and Store to dst
+  def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
+                        "stxssp $vS, $dst", IIC_LdStSTFD, []>;
+
+  // [PO S RA RB XO SX]
+  class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+                      RegisterOperand vtype, list<dag> pattern>
+    : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+              !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;
+
+  // Store as Integer Byte/Halfword Indexed
+  def STXSIBX  : X_XS6_RA5_RB5<31,  909, "stxsibx" , vsfrc,
+                               [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
+  def STXSIHX  : X_XS6_RA5_RB5<31,  941, "stxsihx" , vsfrc,
+                               [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
+  let isCodeGenOnly = 1 in {
+    def STXSIBXv  : X_XS6_RA5_RB5<31,  909, "stxsibx" , vrrc, []>;
+    def STXSIHXv  : X_XS6_RA5_RB5<31,  941, "stxsihx" , vrrc, []>;
+  }
+
+  // Store Vector Halfword*8/Byte*16 Indexed
+  def STXVH8X  : X_XS6_RA5_RB5<31,  940, "stxvh8x" , vsrc, []>;
+  def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
+
+  // Store Vector Indexed
+  def STXVX    : X_XS6_RA5_RB5<31,  396, "stxvx"   , vsrc,
+                 [(store v2f64:$XT, xoaddr:$dst)]>;
+
+  // Store Vector (Left-justified) with Length
+  def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+                   "stxvl $XT, $dst, $rB", IIC_LdStLoad,
+                   [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst, i64:$rB)]>,
+                    UseVSXReg;
+  def STXVLL : XX1Form<31, 429, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+                   "stxvll $XT, $dst, $rB", IIC_LdStLoad,
+                   [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst, i64:$rB)]>,
+                    UseVSXReg;
+  } // mayStore
+
+  // Patterns for which instructions from ISA 3.0 are a better match
+  let Predicates = [IsLittleEndian, HasP9Vector] in {
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+  } // IsLittleEndian, HasP9Vector
+
+  let Predicates = [IsBigEndian, HasP9Vector] in {
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 0))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 1))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 2))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+  def : Pat<(f32 (PPCfcfidus (PPCmtvsrz (i32 (extractelt v4i32:$A, 3))))),
+            (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+  def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+            (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+  def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+            (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+  } // IsLittleEndian, HasP9Vector
+
+  def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+  def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+            (STXVX $rS, xoaddr:$dst)>;
+  def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+            (STXVX $rS, xoaddr:$dst)>;
+
+  def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+            (v4i32 (LXVWSX xoaddr:$src))>;
+  def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+            (v4f32 (LXVWSX xoaddr:$src))>;
+  def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (extloadf32 xoaddr:$src))))),
+            (v4f32 (LXVWSX xoaddr:$src))>;
+
+  // Build vectors from i8 loads
+  def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
+            (v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>;
+  def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)),
+            (v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
+           (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
+            (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
+            (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
+            (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
+
+  // Build vectors from i16 loads
+  def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
+            (v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
+            (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
+           (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
+            (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
+            (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
+
+  let Predicates = [IsBigEndian, HasP9Vector] in {
+  // Scalar stores of i8
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+            (STXSIBXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+
+  // Scalar stores of i16
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+            (STXSIHXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  } // IsBigEndian, HasP9Vector
+
+  let Predicates = [IsLittleEndian, HasP9Vector] in {
+  // Scalar stores of i8
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+            (STXSIBXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+
+  // Scalar stores of i16
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+            (STXSIHXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  } // IsLittleEndian, HasP9Vector
+
+
+  // Vector sign extensions
+  def : Pat<(f64 (PPCVexts f64:$A, 1)),
+            (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
+  def : Pat<(f64 (PPCVexts f64:$A, 2)),
+            (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
+
+  let isPseudo = 1 in {
+    def DFLOADf32  : Pseudo<(outs vssrc:$XT), (ins memrix:$src),
+                            "#DFLOADf32",
+                            [(set f32:$XT, (load iaddr:$src))]>;
+    def DFLOADf64  : Pseudo<(outs vsfrc:$XT), (ins memrix:$src),
+                            "#DFLOADf64",
+                            [(set f64:$XT, (load iaddr:$src))]>;
+    def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst),
+                            "#DFSTOREf32",
+                            [(store f32:$XT, iaddr:$dst)]>;
+    def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
+                            "#DFSTOREf64",
+                            [(store f64:$XT, iaddr:$dst)]>;
+  }
+  def : Pat<(f64 (extloadf32 iaddr:$src)),
+            (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>;
+  def : Pat<(f32 (fpround (extloadf32 iaddr:$src))),
+            (f32 (DFLOADf32 iaddr:$src))>;
+} // end HasP9Vector, AddedComplexity
+
+// Integer extend helper dags 32 -> 64
+def AnyExts {
+  dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32);
+  dag B = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $B, sub_32);
+  dag C = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $C, sub_32);
+  dag D = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $D, sub_32);
+}
+
+def DblToFlt {
+  dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0))));
+  dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1))));
+  dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
+  dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
+}
+def FltToIntLoad {
+  dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToUIntLoad {
+  dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToLongLoad {
+  dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToULongLoad {
+  dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
+}
+def FltToLong {
+  dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
+}
+def FltToULong {
+  dag A = (i64 (PPCmfvsr (PPCfctiduz (fpextend f32:$A))));
+}
+def DblToInt {
+  dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
+}
+def DblToUInt {
+  dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A))));
+}
+def DblToLong {
+  dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A))));
+}
+def DblToULong {
+  dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A))));
+}
+def DblToIntLoad {
+  dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
+}
+def DblToUIntLoad {
+  dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
+}
+def DblToLongLoad {
+  dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
+}
+def DblToULongLoad {
+  dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A)))));
+}
+
+// FP merge dags (for f32 -> v4f32)
+def MrgFP {
+  dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC),
+                               (COPY_TO_REGCLASS $C, VSRC), 0));
+  dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC),
+                               (COPY_TO_REGCLASS $D, VSRC), 0));
+  dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0));
+  dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3));
+  dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0));
+  dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3));
+}
+
+// Patterns for BUILD_VECTOR nodes.
+def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
+let AddedComplexity = 400 in {
+
+  let Predicates = [HasVSX] in {
+    // Build vectors of floating point converted to i32.
+    def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
+                                   DblToInt.A, DblToInt.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
+    def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
+                                   DblToUInt.A, DblToUInt.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
+    def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
+              (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
+                               (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
+    def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
+              (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
+                               (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
+    def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+    def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>;
+    def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
+              (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+    // Build vectors of floating point converted to i64.
+    def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
+              (v2i64 (XXPERMDIs
+                       (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
+    def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
+              (v2i64 (XXPERMDIs
+                       (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
+    def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)),
+              (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>;
+    def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)),
+              (v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>;
+  }
+
+  let Predicates = [HasVSX, NoP9Vector] in {
+    // Load-and-splat with fp-to-int conversion (using X-Form VSX loads).
+    def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+    def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>;
+    def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+              (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
+                                              (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+    def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+              (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
+                                              (LXSSPX xoaddr:$A), VSFRC)), 0))>;
+  }
+
+  // Big endian, available on all targets with VSX
+  let Predicates = [IsBigEndian, HasVSX] in {
+    def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+              (v2f64 (XXPERMDI
+                        (COPY_TO_REGCLASS $A, VSRC),
+                        (COPY_TO_REGCLASS $B, VSRC), 0))>;
+
+    def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
+              (VMRGEW MrgFP.AC, MrgFP.BD)>;
+    def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+                                   DblToFlt.B0, DblToFlt.B1)),
+              (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
+  }
+
+  let Predicates = [IsLittleEndian, HasVSX] in {
+  // Little endian, available on all targets with VSX
+    def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+              (v2f64 (XXPERMDI
+                        (COPY_TO_REGCLASS $B, VSRC),
+                        (COPY_TO_REGCLASS $A, VSRC), 0))>;
+
+    def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
+              (VMRGEW MrgFP.AC, MrgFP.BD)>;
+    def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+                                   DblToFlt.B0, DblToFlt.B1)),
+              (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
+  }
+
+  let Predicates = [HasDirectMove] in {
+    // Endianness-neutral constant splat on P8 and newer targets. The reason
+    // for this pattern is that on targets with direct moves, we don't expand
+    // BUILD_VECTOR nodes for v4i32.
+    def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
+                                   immSExt5NonZero:$A, immSExt5NonZero:$A)),
+              (v4i32 (VSPLTISW imm:$A))>;
+  }
+
+  let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in {
+    // Big endian integer vectors using direct moves.
+    def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+              (v2i64 (XXPERMDI
+                        (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
+                        (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC),
+                                   (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0),
+                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC),
+                                   (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+              (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+  }
+
+  let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in {
+    // Little endian integer vectors using direct moves.
+    def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+              (v2i64 (XXPERMDI
+                        (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
+                        (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC),
+                                   (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0),
+                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC),
+                                   (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+              (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+  }
+
+  let Predicates = [HasP9Vector] in {
+    // Endianness-neutral patterns for const splats with ISA 3.0 instructions.
+    def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+              (v4i32 (MTVSRWS $A))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+              (v4i32 (MTVSRWS $A))>;
+    def : Pat<(v16i8 (build_vector immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+                                   immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+                                   immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+                                   immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+                                   immAnyExt8:$A, immAnyExt8:$A, immAnyExt8:$A,
+                                   immAnyExt8:$A)),
+              (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
+    def : Pat<(v16i8 immAllOnesV),
+              (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+    def : Pat<(v8i16 immAllOnesV),
+              (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+    def : Pat<(v4i32 immAllOnesV),
+              (v4i32 (XXSPLTIB 255))>;
+    def : Pat<(v2i64 immAllOnesV),
+              (v2i64 (XXSPLTIB 255))>;
+    def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
+              (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
+    def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
+              (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
+    def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
+    def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+              (v4i32 (XXSPLTW (COPY_TO_REGCLASS
+                                (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
+    def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+              (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
+                                              (DFLOADf32 iaddr:$A),
+                                              VSFRC)), 0))>;
+    def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+              (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
+                                              (DFLOADf32 iaddr:$A),
+                                              VSFRC)), 0))>;
+  }
+
+  let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
+    def : Pat<(i64 (extractelt v2i64:$A, 1)),
+              (i64 (MFVSRLD $A))>;
+    // Better way to build integer vectors if we have MTVSRDD. Big endian.
+    def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+              (v2i64 (MTVSRDD $rB, $rA))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+              (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC),
+                      (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC))>;
+  }
+
+  let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
+    def : Pat<(i64 (extractelt v2i64:$A, 0)),
+              (i64 (MFVSRLD $A))>;
+    // Better way to build integer vectors if we have MTVSRDD. Little endian.
+    def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+              (v2i64 (MTVSRDD $rB, $rA))>;
+    def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+              (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
+                      (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
new file mode 100644
index 000000000000..2c3e75523e8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -0,0 +1,452 @@
+//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to prepare loops for pre-increment addressing
+// modes. Additional PHIs are created for loop induction variables used by
+// load/store instructions so that the pre-increment forms can be used.
+// Generically, this means transforming loops like this:
+//   for (int i = 0; i < n; ++i)
+//     array[i] = c;
+// to look like this:
+//   T *p = array[-1];
+//   for (int i = 0; i < n; ++i)
+//     *++p = c;
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-loop-preinc-prep"
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars",
+                                 cl::Hidden, cl::init(16),
+  cl::desc("Potential PHI threshold for PPC preinc loop prep"));
+
+namespace llvm {
+
+  void initializePPCLoopPreIncPrepPass(PassRegistry&);
+
+} // end namespace llvm
+
+namespace {
+
+  class PPCLoopPreIncPrep : public FunctionPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+
+    PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+    PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    bool runOnLoop(Loop *L);
+    void simplifyLoopLatch(Loop *L);
+    bool rotateLoop(Loop *L);
+
+  private:
+    PPCTargetMachine *TM;
+    DominatorTree *DT;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    bool PreserveLCSSA;
+  };
+
+} // end anonymous namespace
+
+char PPCLoopPreIncPrep::ID = 0;
+static const char *name = "Prepare loop for pre-inc. addressing modes";
+INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false)
+
+FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) {
+  return new PPCLoopPreIncPrep(TM);
+}
+
+namespace {
+
+  struct BucketElement {
+    BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+    BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
+
+    const SCEVConstant *Offset;
+    Instruction *Instr;
+  };
+
+  struct Bucket {
+    Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
+                                            Elements(1, BucketElement(I)) {}
+
+    const SCEV *BaseSCEV;
+    SmallVector<BucketElement, 16> Elements;
+  };
+
+} // end anonymous namespace
+
+static bool IsPtrInBounds(Value *BasePtr) {
+  Value *StrippedBasePtr = BasePtr;
+  while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
+    StrippedBasePtr = BC->getOperand(0);
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr))
+    return GEP->isInBounds();
+
+  return false;
+}
+
+static Value *GetPointerOperand(Value *MemI) {
+  if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
+    return LMemI->getPointerOperand();
+  } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
+    return SMemI->getPointerOperand();
+  } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
+    if (IMemI->getIntrinsicID() == Intrinsic::prefetch)
+      return IMemI->getArgOperand(0);
+  }
+
+  return nullptr;
+}
+
+bool PPCLoopPreIncPrep::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool MadeChange = false;
+
+  for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I)
+    for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L)
+      MadeChange |= runOnLoop(*L);
+
+  return MadeChange;
+}
+
+bool PPCLoopPreIncPrep::runOnLoop(Loop *L) {
+  bool MadeChange = false;
+
+  // Only prep. the inner-most loop
+  if (!L->empty())
+    return MadeChange;
+
+  DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
+
+  BasicBlock *Header = L->getHeader();
+
+  const PPCSubtarget *ST =
+    TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr;
+
+  unsigned HeaderLoopPredCount =
+    std::distance(pred_begin(Header), pred_end(Header));
+
+  // Collect buckets of comparable addresses used by loads and stores.
+  SmallVector<Bucket, 16> Buckets;
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
+        J != JE; ++J) {
+      Value *PtrValue;
+      Instruction *MemI;
+
+      if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) {
+        MemI = LMemI;
+        PtrValue = LMemI->getPointerOperand();
+      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) {
+        MemI = SMemI;
+        PtrValue = SMemI->getPointerOperand();
+      } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) {
+        if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+          MemI = IMemI;
+          PtrValue = IMemI->getArgOperand(0);
+        } else continue;
+      } else continue;
+
+      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      if (PtrAddrSpace)
+        continue;
+
+      // There are no update forms for Altivec vector load/stores.
+      if (ST && ST->hasAltivec() &&
+          PtrValue->getType()->getPointerElementType()->isVectorTy())
+        continue;
+
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
+      if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) {
+        if (LARSCEV->getLoop() != L)
+          continue;
+      } else {
+        continue;
+      }
+
+      bool FoundBucket = false;
+      for (auto &B : Buckets) {
+        const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
+        if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
+          B.Elements.push_back(BucketElement(CDiff, MemI));
+          FoundBucket = true;
+          break;
+        }
+      }
+
+      if (!FoundBucket) {
+        if (Buckets.size() == MaxVars)
+          return MadeChange;
+        Buckets.push_back(Bucket(LSCEV, MemI));
+      }
+    }
+  }
+
+  if (Buckets.empty())
+    return MadeChange;
+
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+  // If there is no loop predecessor, or the loop predecessor's terminator
+  // returns a value (which might contribute to determining the loop's
+  // iteration space), insert a new preheader for the loop.
+  if (!LoopPredecessor ||
+      !LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
+    LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+    if (LoopPredecessor)
+      MadeChange = true;
+  }
+  if (!LoopPredecessor)
+    return MadeChange;
+
+  DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n");
+
+  SmallSet<BasicBlock *, 16> BBChanged;
+  for (unsigned i = 0, e = Buckets.size(); i != e; ++i) {
+    // The base address of each bucket is transformed into a phi and the others
+    // are rewritten as offsets of that variable.
+
+    // We have a choice now of which instruction's memory operand we use as the
+    // base for the generated PHI. Always picking the first instruction in each
+    // bucket does not work well, specifically because that instruction might
+    // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
+    // the choice is somewhat arbitrary, because the backend will happily
+    // generate direct offsets from both the pre-incremented and
+    // post-incremented pointer values. Thus, we'll pick the first non-prefetch
+    // instruction in each bucket, and adjust the recurrence and other offsets
+    // accordingly. 
+    for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) {
+      if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr))
+        if (II->getIntrinsicID() == Intrinsic::prefetch)
+          continue;
+
+      // If we'd otherwise pick the first element anyway, there's nothing to do.
+      if (j == 0)
+        break;
+
+      // If our chosen element has no offset from the base pointer, there's
+      // nothing to do.
+      if (!Buckets[i].Elements[j].Offset ||
+          Buckets[i].Elements[j].Offset->isZero())
+        break;
+
+      const SCEV *Offset = Buckets[i].Elements[j].Offset;
+      Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset);
+      for (auto &E : Buckets[i].Elements) {
+        if (E.Offset)
+          E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+        else
+          E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+      }
+
+      std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]);
+      break;
+    }
+
+    const SCEVAddRecExpr *BasePtrSCEV =
+      cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV);
+    if (!BasePtrSCEV->isAffine())
+      continue;
+
+    DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+    assert(BasePtrSCEV->getLoop() == L &&
+           "AddRec for the wrong loop?");
+
+    // The instruction corresponding to the Bucket's BaseSCEV must be the first
+    // in the vector of elements.
+    Instruction *MemI = Buckets[i].Elements.begin()->Instr;
+    Value *BasePtr = GetPointerOperand(MemI);
+    assert(BasePtr && "No pointer operand");
+
+    Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
+    Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
+      BasePtr->getType()->getPointerAddressSpace());
+
+    const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart();
+    if (!SE->isLoopInvariant(BasePtrStartSCEV, L))
+      continue;
+
+    const SCEVConstant *BasePtrIncSCEV =
+      dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
+    if (!BasePtrIncSCEV)
+      continue;
+    BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV);
+    if (!isSafeToExpand(BasePtrStartSCEV, *SE))
+      continue;
+
+    DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+
+    PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+      MemI->hasName() ? MemI->getName() + ".phi" : "",
+      Header->getFirstNonPHI());
+
+    SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
+    Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+      LoopPredecessor->getTerminator());
+
+    // Note that LoopPredecessor might occur in the predecessor list multiple
+    // times, and we need to add it the right number of times.
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI != LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+    }
+
+    Instruction *InsPoint = &*Header->getFirstInsertionPt();
+    GetElementPtrInst *PtrInc = GetElementPtrInst::Create(
+        I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
+        MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint);
+    PtrInc->setIsInBounds(IsPtrInBounds(BasePtr));
+    for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+         PI != PE; ++PI) {
+      if (*PI == LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(PtrInc, *PI);
+    }
+
+    Instruction *NewBasePtr;
+    if (PtrInc->getType() != BasePtr->getType())
+      NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(),
+        PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint);
+    else
+      NewBasePtr = PtrInc;
+
+    if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
+      BBChanged.insert(IDel->getParent());
+    BasePtr->replaceAllUsesWith(NewBasePtr);
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
+
+    // Keep track of the replacement pointer values we've inserted so that we
+    // don't generate more pointer values than necessary.
+    SmallPtrSet<Value *, 16> NewPtrs;
+    NewPtrs.insert( NewBasePtr);
+
+    for (auto I = std::next(Buckets[i].Elements.begin()),
+         IE = Buckets[i].Elements.end(); I != IE; ++I) {
+      Value *Ptr = GetPointerOperand(I->Instr);
+      assert(Ptr && "No pointer operand");
+      if (NewPtrs.count(Ptr))
+        continue;
+
+      Instruction *RealNewPtr;
+      if (!I->Offset || I->Offset->getValue()->isZero()) {
+        RealNewPtr = NewBasePtr;
+      } else {
+        Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+        if (PtrIP && isa<Instruction>(NewBasePtr) &&
+            cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+          PtrIP = nullptr;
+        else if (isa<PHINode>(PtrIP))
+          PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+        else if (!PtrIP)
+          PtrIP = I->Instr;
+
+        GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
+            I8Ty, PtrInc, I->Offset->getValue(),
+            I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP);
+        if (!PtrIP)
+          NewPtr->insertAfter(cast<Instruction>(PtrInc));
+        NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+        RealNewPtr = NewPtr;
+      }
+
+      if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+        BBChanged.insert(IDel->getParent());
+
+      Instruction *ReplNewPtr;
+      if (Ptr->getType() != RealNewPtr->getType()) {
+        ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+          Ptr->hasName() ? Ptr->getName() + ".cast" : "");
+        ReplNewPtr->insertAfter(RealNewPtr);
+      } else
+        ReplNewPtr = RealNewPtr;
+
+      Ptr->replaceAllUsesWith(ReplNewPtr);
+      RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+
+      NewPtrs.insert(RealNewPtr);
+    }
+
+    MadeChange = true;
+  }
+
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I) {
+    if (BBChanged.count(*I))
+      DeleteDeadPHIs(*I);
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
new file mode 100644
index 000000000000..e527b018d4fb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -0,0 +1,187 @@
+//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PPC MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
+  return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
+                                      AsmPrinter &AP) {
+  const TargetMachine &TM = AP.TM;
+  Mangler &Mang = TM.getObjFileLowering()->getMangler();
+  const DataLayout &DL = AP.getDataLayout();
+  MCContext &Ctx = AP.OutContext;
+
+  SmallString<128> Name;
+  StringRef Suffix;
+  if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+    Suffix = "$non_lazy_ptr";
+
+  if (!Suffix.empty())
+    Name += DL.getPrivateGlobalPrefix();
+
+  if (!MO.isGlobal()) {
+    assert(MO.isSymbol() && "Isn't a symbol reference");
+    Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+  } else {
+    const GlobalValue *GV = MO.getGlobal();
+    TM.getNameWithPrefix(Name, GV, Mang);
+  }
+
+  Name += Suffix;
+  MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+
+  // If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
+  // then add the suffix.
+  if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
+    MachineModuleInfoMachO &MachO = getMachOMMI(AP);
+
+    MachineModuleInfoImpl::StubValueTy &StubSym = MachO.getGVStubEntry(Sym);
+
+    if (!StubSym.getPointer()) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym = MachineModuleInfoImpl::
+                   StubValueTy(AP.getSymbol(MO.getGlobal()),
+                               !MO.getGlobal()->hasInternalLinkage());
+    }
+    return Sym;
+  }
+  
+  return Sym;
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              AsmPrinter &Printer, bool isDarwin) {
+  MCContext &Ctx = Printer.OutContext;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
+
+  switch (access) {
+    case PPCII::MO_TPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_LO;
+      break;
+    case PPCII::MO_TPREL_HA:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_HA;
+      break;
+    case PPCII::MO_DTPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_DTPREL_LO;
+      break;
+    case PPCII::MO_TLSLD_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO;
+      break;
+    case PPCII::MO_TOC_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO;
+      break;
+    case PPCII::MO_TLS:
+      RefKind = MCSymbolRefExpr::VK_PPC_TLS;
+      break;
+  }
+
+  if (MO.getTargetFlags() == PPCII::MO_PLT)
+    RefKind = MCSymbolRefExpr::VK_PLT;
+
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
+                                   Ctx);
+
+  // Subtract off the PIC base if required.
+  if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) {
+    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+    
+    const MCExpr *PB = MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+    Expr = MCBinaryExpr::createSub(Expr, PB, Ctx);
+  }
+
+  // Add ha16() / lo16() markers if required.
+  switch (access) {
+    case PPCII::MO_LO:
+      Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx);
+      break;
+    case PPCII::MO_HA:
+      Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx);
+      break;
+  }
+
+  return MCOperand::createExpr(Expr);
+}
+
+void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        AsmPrinter &AP, bool isDarwin) {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      assert(MO.getReg() > PPC::NoRegister &&
+             MO.getReg() < PPC::NUM_TARGET_REGS &&
+             "Invalid register for this target!");
+      MCOp = MCOperand::createReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
+                                      MO.getMBB()->getSymbol(), AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
+                          isDarwin);
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
new file mode 100644
index 000000000000..2413af3f7042
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -0,0 +1,392 @@
+//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass performs peephole optimizations to clean up ugly code
+// sequences at the MachineInstruction layer.  It runs at the end of
+// the SSA phases, following VSX swap removal.  A pass of dead code
+// elimination follows this one for quick clean-up of any dead
+// instructions introduced here.  Although we could do this as callbacks
+// from the generic peephole pass, this would have a couple of bad
+// effects:  it might remove optimization opportunities for VSX swap
+// removal, and it would miss cleanups made possible following VSX
+// swap removal.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-mi-peepholes"
+
+namespace llvm {
+  void initializePPCMIPeepholePass(PassRegistry&);
+}
+
+namespace {
+
+struct PPCMIPeephole : public MachineFunctionPass {
+
+  static char ID;
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+
+  PPCMIPeephole() : MachineFunctionPass(ID) {
+    initializePPCMIPeepholePass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize class variables.
+  void initialize(MachineFunction &MFParm);
+
+  // Perform peepholes.
+  bool simplifyCode(void);
+
+  // Find the "true" register represented by SrcReg (following chains
+  // of copies and subreg_to_reg operations).
+  unsigned lookThruCopyLike(unsigned SrcReg);
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+    initialize(MF);
+    return simplifyCode();
+  }
+};
+
+// Initialize class variables.
+void PPCMIPeephole::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+  DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
+  DEBUG(MF->dump());
+}
+
+// Perform peephole optimizations.
+bool PPCMIPeephole::simplifyCode(void) {
+  bool Simplified = false;
+  MachineInstr* ToErase = nullptr;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+
+      // If the previous instruction was marked for elimination,
+      // remove it now.
+      if (ToErase) {
+        ToErase->eraseFromParent();
+        ToErase = nullptr;
+      }
+
+      // Ignore debug instructions.
+      if (MI.isDebugValue())
+        continue;
+
+      // Per-opcode peepholes.
+      switch (MI.getOpcode()) {
+
+      default:
+        break;
+
+      case PPC::XXPERMDI: {
+        // Perform simplifications of 2x64 vector swaps and splats.
+        // A swap is identified by an immediate value of 2, and a splat
+        // is identified by an immediate value of 0 or 3.
+        int Immed = MI.getOperand(3).getImm();
+
+        if (Immed != 1) {
+
+          // For each of these simplifications, we need the two source
+          // regs to match.  Unfortunately, MachineCSE ignores COPY and
+          // SUBREG_TO_REG, so for example we can see
+          //   XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
+          // We have to look through chains of COPY and SUBREG_TO_REG
+          // to find the real source values for comparison.
+          unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
+          unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());
+
+          if (TrueReg1 == TrueReg2
+              && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
+            MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
+            unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0;
+
+            // If this is a splat fed by a splatting load, the splat is
+            // redundant. Replace with a copy. This doesn't happen directly due
+            // to code in PPCDAGToDAGISel.cpp, but it can happen when converting
+            // a load of a double to a vector of 64-bit integers.
+            auto isConversionOfLoadAndSplat = [=]() -> bool {
+              if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
+                return false;
+              unsigned DefReg = lookThruCopyLike(DefMI->getOperand(1).getReg());
+              if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+                MachineInstr *LoadMI = MRI->getVRegDef(DefReg);
+                if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX)
+                  return true;
+              }
+              return false;
+            };
+            if (DefMI && (Immed == 0 || Immed == 3)) {
+              if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) {
+                DEBUG(dbgs()
+                      << "Optimizing load-and-splat/splat "
+                      "to load-and-splat/copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(MI.getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+            }
+
+            // If this is a splat or a swap fed by another splat, we
+            // can replace it with a copy.
+            if (DefOpc == PPC::XXPERMDI) {
+              unsigned FeedImmed = DefMI->getOperand(3).getImm();
+              unsigned FeedReg1
+                = lookThruCopyLike(DefMI->getOperand(1).getReg());
+              unsigned FeedReg2
+                = lookThruCopyLike(DefMI->getOperand(2).getReg());
+
+              if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs()
+                      << "Optimizing splat/swap or splat/splat "
+                      "to splat/copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(MI.getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+
+              // If this is a splat fed by a swap, we can simplify modify
+              // the splat to splat the other value from the swap's input
+              // parameter.
+              else if ((Immed == 0 || Immed == 3)
+                       && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
+                DEBUG(MI.dump());
+                MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
+                MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
+                MI.getOperand(3).setImm(3 - Immed);
+                Simplified = true;
+              }
+
+              // If this is a swap fed by a swap, we can replace it
+              // with a copy from the first swap's input.
+              else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
+                DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
+                DEBUG(MI.dump());
+                BuildMI(MBB, &MI, MI.getDebugLoc(),
+                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                  .addOperand(DefMI->getOperand(1));
+                ToErase = &MI;
+                Simplified = true;
+              }
+            } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+                       (DefMI->getOperand(2).getImm() == 0 ||
+                        DefMI->getOperand(2).getImm() == 3)) {
+              // Splat fed by another splat - switch the output of the first
+              // and remove the second.
+              DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+              ToErase = &MI;
+              Simplified = true;
+              DEBUG(dbgs() << "Removing redundant splat: ");
+              DEBUG(MI.dump());
+            }
+          }
+        }
+        break;
+      }
+      case PPC::VSPLTB:
+      case PPC::VSPLTH:
+      case PPC::XXSPLTW: {
+        unsigned MyOpcode = MI.getOpcode();
+        unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+        if (!DefMI)
+          break;
+        unsigned DefOpcode = DefMI->getOpcode();
+        auto isConvertOfSplat = [=]() -> bool {
+          if (DefOpcode != PPC::XVCVSPSXWS && DefOpcode != PPC::XVCVSPUXWS)
+            return false;
+          unsigned ConvReg = DefMI->getOperand(1).getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(ConvReg))
+            return false;
+          MachineInstr *Splt = MRI->getVRegDef(ConvReg);
+          return Splt && (Splt->getOpcode() == PPC::LXVWSX ||
+            Splt->getOpcode() == PPC::XXSPLTW);
+        };
+        bool AlreadySplat = (MyOpcode == DefOpcode) ||
+          (MyOpcode == PPC::VSPLTB && DefOpcode == PPC::VSPLTBs) ||
+          (MyOpcode == PPC::VSPLTH && DefOpcode == PPC::VSPLTHs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSPLTWs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::LXVWSX) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::MTVSRWS)||
+          (MyOpcode == PPC::XXSPLTW && isConvertOfSplat());
+        // If the instruction[s] that feed this splat have already splat
+        // the value, this splat is redundant.
+        if (AlreadySplat) {
+          DEBUG(dbgs() << "Changing redundant splat to a copy: ");
+          DEBUG(MI.dump());
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+                  MI.getOperand(0).getReg())
+              .addOperand(MI.getOperand(OpNo));
+          ToErase = &MI;
+          Simplified = true;
+        }
+        // Splat fed by a shift. Usually when we align value to splat into
+        // vector element zero.
+        if (DefOpcode == PPC::XXSLDWI) {
+          unsigned ShiftRes = DefMI->getOperand(0).getReg();
+          unsigned ShiftOp1 = DefMI->getOperand(1).getReg();
+          unsigned ShiftOp2 = DefMI->getOperand(2).getReg();
+          unsigned ShiftImm = DefMI->getOperand(3).getImm();
+          unsigned SplatImm = MI.getOperand(2).getImm();
+          if (ShiftOp1 == ShiftOp2) {
+            unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
+            if (MRI->hasOneNonDBGUse(ShiftRes)) {
+              DEBUG(dbgs() << "Removing redundant shift: ");
+              DEBUG(DefMI->dump());
+              ToErase = DefMI;
+            }
+            Simplified = true;
+            DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                  " to " << NewElem << " in instruction: ");
+            DEBUG(MI.dump());
+            MI.getOperand(1).setReg(ShiftOp1);
+            MI.getOperand(2).setImm(NewElem);
+          }
+        }
+        break;
+      }
+      case PPC::XVCVDPSP: {
+        // If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(1).getReg());
+        if (!TargetRegisterInfo::isVirtualRegister(TrueReg))
+          break;
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+
+        // This can occur when building a vector of single precision or integer
+        // values.
+        if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
+          unsigned DefsReg1 = lookThruCopyLike(DefMI->getOperand(1).getReg());
+          unsigned DefsReg2 = lookThruCopyLike(DefMI->getOperand(2).getReg());
+          if (!TargetRegisterInfo::isVirtualRegister(DefsReg1) ||
+              !TargetRegisterInfo::isVirtualRegister(DefsReg2))
+            break;
+          MachineInstr *P1 = MRI->getVRegDef(DefsReg1);
+          MachineInstr *P2 = MRI->getVRegDef(DefsReg2);
+
+          if (!P1 || !P2)
+            break;
+
+          // Remove the passed FRSP instruction if it only feeds this MI and
+          // set any uses of that FRSP (in this MI) to the source of the FRSP.
+          auto removeFRSPIfPossible = [&](MachineInstr *RoundInstr) {
+            if (RoundInstr->getOpcode() == PPC::FRSP &&
+                MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) {
+              Simplified = true;
+              unsigned ConvReg1 = RoundInstr->getOperand(1).getReg();
+              unsigned FRSPDefines = RoundInstr->getOperand(0).getReg();
+              MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines));
+              for (int i = 0, e = Use.getNumOperands(); i < e; ++i)
+                if (Use.getOperand(i).isReg() &&
+                    Use.getOperand(i).getReg() == FRSPDefines)
+                  Use.getOperand(i).setReg(ConvReg1);
+              DEBUG(dbgs() << "Removing redundant FRSP:\n");
+              DEBUG(RoundInstr->dump());
+              DEBUG(dbgs() << "As it feeds instruction:\n");
+              DEBUG(MI.dump());
+              DEBUG(dbgs() << "Through instruction:\n");
+              DEBUG(DefMI->dump());
+              RoundInstr->eraseFromParent();
+            }
+          };
+
+          // If the input to XVCVDPSP is a vector that was built (even
+          // partially) out of FRSP's, the FRSP(s) can safely be removed
+          // since this instruction performs the same operation.
+          if (P1 != P2) {
+            removeFRSPIfPossible(P1);
+            removeFRSPIfPossible(P2);
+            break;
+          }
+          removeFRSPIfPossible(P1);
+        }
+        break;
+      }
+      }
+    }
+    // If the last instruction was marked for elimination,
+    // remove it now.
+    if (ToErase) {
+      ToErase->eraseFromParent();
+      ToErase = nullptr;
+    }
+  }
+
+  return Simplified;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg).  Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards through all
+// such operations to the ultimate source register.  If a
+// physical register is encountered, we stop the search.
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
+
+  while (true) {
+
+    MachineInstr *MI = MRI->getVRegDef(SrcReg);
+    if (!MI->isCopyLike())
+      return SrcReg;
+
+    unsigned CopySrcReg;
+    if (MI->isCopy())
+      CopySrcReg = MI->getOperand(1).getReg();
+    else {
+      assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
+      CopySrcReg = MI->getOperand(2).getReg();
+    }
+
+    if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+      return CopySrcReg;
+
+    SrcReg = CopySrcReg;
+  }
+}
+
+} // end default namespace
+
+INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
+                      "PowerPC MI Peephole Optimization", false, false)
+INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
+                    "PowerPC MI Peephole Optimization", false, false)
+
+char PPCMIPeephole::ID = 0;
+FunctionPass*
+llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..9d91e31165de
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -0,0 +1,46 @@
+//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachineFunctionInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void PPCFunctionInfo::anchor() { }
+
+MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
+  const DataLayout &DL = MF.getDataLayout();
+  return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+                                           Twine(MF.getFunctionNumber()) +
+                                           "$poff");
+}
+
+MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const {
+  const DataLayout &DL = MF.getDataLayout();
+  return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+                                           "func_gep" +
+                                           Twine(MF.getFunctionNumber()));
+}
+
+MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const {
+  const DataLayout &DL = MF.getDataLayout();
+  return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+                                           "func_lep" +
+                                           Twine(MF.getFunctionNumber()));
+}
+
+MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
+  const DataLayout &DL = MF.getDataLayout();
+  return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+                                           "func_toc" +
+                                           Twine(MF.getFunctionNumber()));
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 000000000000..4c29aa06f048
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,217 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunction private
+/// PowerPC target-specific information for each MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored.  Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub.)
+  int FramePointerSaveIndex;
+  
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  ///
+  int ReturnAddrSaveIndex;
+
+  /// Frame index where the old base pointer is stored.
+  int BasePointerSaveIndex;
+
+  /// Frame index where the old PIC base pointer is stored.
+  int PICBasePointerSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function.  This is only valid after the initial scan of the function by
+  /// PEI.
+  bool MustSaveLR;
+
+  /// Does this function have any stack spills.
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms.
+  bool HasNonRISpills;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// Indicates whether VRSAVE is spilled in the current function.
+  bool SpillsVRSAVE;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code.  This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// This function makes use of the PPC64 ELF TOC base pointer (register r2).
+  bool UsesTOCBasePtr;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call. Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// VarArgsStackOffset - StackOffset for start of stack
+  /// arguments.
+  int VarArgsStackOffset;
+  /// VarArgsNumGPR - Index of the first unused integer
+  /// register for parameter passing.
+  unsigned VarArgsNumGPR;
+  /// VarArgsNumFPR - Index of the first unused double
+  /// register for parameter passing.
+  unsigned VarArgsNumFPR;
+
+  /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
+  int CRSpillFrameIndex;
+
+  /// If any of CR[2-4] need to be saved in the prologue and restored in the
+  /// epilogue then they are added to this array. This is used for the
+  /// 64-bit SVR4 ABI.
+  SmallVector<unsigned, 3> MustSaveCRs;
+
+  /// Hold onto our MachineFunction context.
+  MachineFunction &MF;
+
+  /// Whether this uses the PIC Base register or not.
+  bool UsesPICBase;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies
+  bool IsSplitCSR;
+
+public:
+  explicit PPCFunctionInfo(MachineFunction &MF) 
+    : FramePointerSaveIndex(0),
+      ReturnAddrSaveIndex(0),
+      BasePointerSaveIndex(0),
+      PICBasePointerSaveIndex(0),
+      HasSpills(false),
+      HasNonRISpills(false),
+      SpillsCR(false),
+      SpillsVRSAVE(false),
+      LRStoreRequired(false),
+      UsesTOCBasePtr(false),
+      MinReservedArea(0),
+      TailCallSPDelta(0),
+      HasFastCall(false),
+      VarArgsFrameIndex(0),
+      VarArgsStackOffset(0),
+      VarArgsNumGPR(0),
+      VarArgsNumFPR(0),
+      CRSpillFrameIndex(0),
+      MF(MF),
+      UsesPICBase(0),
+      IsSplitCSR(false) {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+  
+  int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+  void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+  int getBasePointerSaveIndex() const { return BasePointerSaveIndex; }
+  void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; }
+
+  int getPICBasePointerSaveIndex() const { return PICBasePointerSaveIndex; }
+  void setPICBasePointerSaveIndex(int Idx) { PICBasePointerSaveIndex = Idx; }
+
+  unsigned getMinReservedArea() const { return MinReservedArea; }
+  void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+  int getTailCallSPDelta() const { return TailCallSPDelta; }
+  void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+  /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+  /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+  /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+  /// which is used in PIC generation), or if the LR stack slot is explicitly
+  /// referenced by builtin_return_address.
+  void setMustSaveLR(bool U) { MustSaveLR = U; }
+  bool mustSaveLR() const    { return MustSaveLR; }
+
+  void setHasSpills()      { HasSpills = true; }
+  bool hasSpills() const   { return HasSpills; }
+
+  void setHasNonRISpills()    { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
+  void setSpillsCR()       { SpillsCR = true; }
+  bool isCRSpilled() const { return SpillsCR; }
+
+  void setSpillsVRSAVE()       { SpillsVRSAVE = true; }
+  bool isVRSAVESpilled() const { return SpillsVRSAVE; }
+
+  void setLRStoreRequired() { LRStoreRequired = true; }
+  bool isLRStoreRequired() const { return LRStoreRequired; }
+
+  void setUsesTOCBasePtr()    { UsesTOCBasePtr = true; }
+  bool usesTOCBasePtr() const { return UsesTOCBasePtr; }
+
+  void setHasFastCall() { HasFastCall = true; }
+  bool hasFastCall() const { return HasFastCall;}
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  int getVarArgsStackOffset() const { return VarArgsStackOffset; }
+  void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; }
+
+  unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
+  void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
+
+  unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
+  void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
+
+  int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
+  void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
+
+  const SmallVectorImpl<unsigned> &
+    getMustSaveCRs() const { return MustSaveCRs; }
+  void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
+
+  void setUsesPICBase(bool uses) { UsesPICBase = uses; }
+  bool usesPICBase() const { return UsesPICBase; }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  MCSymbol *getPICOffsetSymbol() const;
+
+  MCSymbol *getGlobalEPSymbol() const;
+  MCSymbol *getLocalEPSymbol() const;
+  MCSymbol *getTOCOffsetSymbol() const;
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 000000000000..8a1d68011c5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6591 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  202162278U,	// <0,0,0,0>: Cost 1 vspltisw0 LHS
+  1140850790U,	// <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+  2617247181U,	// <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+  2635163787U,	// <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+  1543507254U,	// <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+  2281701705U,	// <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+  2617250133U,	// <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+  2659054575U,	// <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,0,u>: Cost 1 vspltisw0 LHS
+  1141686282U,	// <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+  67944550U,	// <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+  1685241958U,	// <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2215870716U,	// <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1141727570U,	// <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  2215428562U,	// <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+  2215428589U,	// <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+  2659062768U,	// <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+  67945117U,	// <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+  2684356045U,	// <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+  2216009830U,	// <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2216009901U,	// <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+  2698290853U,	// <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+  3289751890U,	// <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+  3758098275U,	// <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+  2684356538U,	// <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+  3758098410U,	// <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+  2216010397U,	// <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2702272651U,	// <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+  2216656998U,	// <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  3844669704U,	// <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+  2216657148U,	// <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+  2684357122U,	// <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+  3732820066U,	// <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+  3778005624U,	// <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+  3374713464U,	// <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+  2216657565U,	// <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217361408U,	// <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+  1143619686U,	// <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  3291103405U,	// <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+  3827269988U,	// <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+  1143619922U,	// <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1610616118U,	// <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  3758099833U,	// <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+  3854107016U,	// <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+  1143620253U,	// <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2284396544U,	// <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+  2218025062U,	// <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+  3758100203U,	// <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+  3395966100U,	// <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+  3804549052U,	// <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+  2302314964U,	// <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+  2785821138U,	// <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  3395966428U,	// <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+  2787148260U,	// <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+  2684358997U,	// <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+  2218631270U,	// <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2684359162U,	// <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+  3758101042U,	// <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+  3732843830U,	// <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+  3758101227U,	// <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+  2684359480U,	// <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+  2724836173U,	// <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+  2725499806U,	// <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+  2726163439U,	// <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  2219311206U,	// <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+  3868557900U,	// <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+  3377400112U,	// <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+  2684360038U,	// <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+  3732852834U,	// <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+  3871507060U,	// <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+  2303658616U,	// <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+  2726163439U,	// <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,u,0>: Cost 1 vspltisw0 LHS
+  72589414U,	// <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+  1685242525U,	// <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2220073212U,	// <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1146331474U,	// <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  1610619034U,	// <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  2785821138U,	// <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  2659120119U,	// <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+  72589981U,	// <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+  2698297344U,	// <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+  1624555622U,	// <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  2758984428U,	// <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+  2635237524U,	// <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+  2693652818U,	// <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+  2281701714U,	// <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+  2698297846U,	// <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+  2659128312U,	// <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+  1624556189U,	// <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  1543585802U,	// <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+  1141728052U,	// <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1141728150U,	// <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2295644334U,	// <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+  1543589174U,	// <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+  2290999634U,	// <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+  2617332135U,	// <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+  2617332720U,	// <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+  1142171004U,	// <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+  1561509990U,	// <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+  2623308516U,	// <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+  2698298984U,	// <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1561513270U,	// <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+  2647199304U,	// <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+  2698299322U,	// <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+  1585402874U,	// <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2698299540U,	// <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+  3290399540U,	// <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+  2698299720U,	// <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+  2698299804U,	// <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+  2698299906U,	// <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+  3832726521U,	// <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+  2724842160U,	// <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+  2706926275U,	// <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+  2698300190U,	// <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+  2635268198U,	// <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+  2217362228U,	// <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+  2217362326U,	// <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+  2635270296U,	// <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+  2635271478U,	// <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+  1624558902U,	// <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2659160910U,	// <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+  2659161084U,	// <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+  1624559145U,	// <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  3832726639U,	// <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+  2714889871U,	// <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+  2302314646U,	// <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  3834717321U,	// <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+  3832726679U,	// <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+  2717544403U,	// <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+  2718208036U,	// <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+  3792613493U,	// <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+  2719535302U,	// <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+  2659172454U,	// <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+  3832726735U,	// <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+  2724844026U,	// <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+  3775361608U,	// <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+  2659175734U,	// <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+  3832726771U,	// <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+  2724844344U,	// <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+  1651102542U,	// <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+  1651766175U,	// <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+  2724844536U,	// <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+  3377397770U,	// <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+  2698302636U,	// <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+  2728162531U,	// <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+  2724844902U,	// <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+  3377398098U,	// <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+  2724845076U,	// <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+  2724845164U,	// <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+  2724845186U,	// <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+  1561559142U,	// <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+  1146331956U,	// <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1146332054U,	// <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1561562422U,	// <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+  1624561818U,	// <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2220074191U,	// <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+  1585452032U,	// <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2214593997U,	// <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+  2214675999U,	// <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+  2214594152U,	// <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+  1207959654U,	// <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  3709054262U,	// <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+  3375350836U,	// <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+  2214594490U,	// <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+  3288336362U,	// <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+  1207959659U,	// <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+  2215871994U,	// <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2215470623U,	// <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1141728872U,	// <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1141728934U,	// <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2215872323U,	// <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2215872405U,	// <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1141729210U,	// <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2215430122U,	// <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1141729368U,	// <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+  3289736698U,	// <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+  3289744927U,	// <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+  2216011368U,	// <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+  2216019622U,	// <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+  3289769795U,	// <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+  3289778069U,	// <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+  2216044474U,	// <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+  3732960259U,	// <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+  2216061016U,	// <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+  2758985382U,	// <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+  2758985392U,	// <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+  3290400360U,	// <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+  2758985408U,	// <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+  2758985422U,	// <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+  2785822424U,	// <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+  3290400698U,	// <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+  2765915876U,	// <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+  2758985453U,	// <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+  3291104762U,	// <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+  2217362979U,	// <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2217363048U,	// <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+  2217363110U,	// <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+  3291105087U,	// <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+  3291105173U,	// <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+  2217363386U,	// <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+  3788639688U,	// <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+  2217363515U,	// <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+  3376054371U,	// <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+  3788639888U,	// <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+  3376055912U,	// <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+  2302312550U,	// <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3376054375U,	// <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+  3374728244U,	// <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+  3805229154U,	// <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+  3376055512U,	// <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+  2302312555U,	// <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3709100134U,	// <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+  3709100950U,	// <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+  3709102010U,	// <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+  2758985658U,	// <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+  3709103414U,	// <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+  3732992098U,	// <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+  3292374970U,	// <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+  3798594383U,	// <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+  2758985703U,	// <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+  3788641274U,	// <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+  3377398508U,	// <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+  3377398590U,	// <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+  2303656038U,	// <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  3709111606U,	// <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+  3377398836U,	// <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+  3803903447U,	// <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+  3293054954U,	// <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+  2303656043U,	// <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2220074490U,	// <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2220074527U,	// <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1146332776U,	// <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1146332838U,	// <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2220074819U,	// <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2220074901U,	// <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1146333114U,	// <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2220074986U,	// <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1146333243U,	// <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+  2629410816U,	// <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+  2753530006U,	// <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+  2629412301U,	// <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+  2214594972U,	// <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+  2758985908U,	// <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+  3733016674U,	// <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+  3777364488U,	// <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+  2281703354U,	// <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+  2758985941U,	// <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+  1141729430U,	// <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2215471334U,	// <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2215471425U,	// <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1141729692U,	// <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1141729794U,	// <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2215430738U,	// <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+  2215430776U,	// <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+  2295646138U,	// <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+  1141730078U,	// <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2758986032U,	// <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+  3709141910U,	// <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+  3289753921U,	// <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+  2770929992U,	// <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+  3289754114U,	// <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+  3362095460U,	// <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+  3832727910U,	// <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+  3365414842U,	// <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+  2771298677U,	// <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+  2216659094U,	// <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+  3290409190U,	// <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+  2703624496U,	// <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+  2216683932U,	// <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+  2216692226U,	// <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+  3733041250U,	// <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+  3832727988U,	// <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+  3374712762U,	// <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+  2216725278U,	// <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+  2217363606U,	// <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+  3291105510U,	// <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+  3291105601U,	// <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+  2217363868U,	// <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+  2217363970U,	// <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+  2758986242U,	// <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+  3727077685U,	// <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+  3364767674U,	// <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+  2217364254U,	// <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+  3832728102U,	// <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+  3405916003U,	// <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+  3376055840U,	// <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+  3376055679U,	// <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+  3376055194U,	// <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+  3859565138U,	// <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+  2727514210U,	// <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  3376056250U,	// <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+  2727514210U,	// <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  2758986360U,	// <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  3709174678U,	// <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+  3795284411U,	// <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+  3709175980U,	// <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+  3833096860U,	// <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+  3376728235U,	// <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+  3859565229U,	// <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+  2773879472U,	// <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+  2758986360U,	// <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  2303656854U,	// <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+  3807229018U,	// <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+  2727515284U,	// <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+  3377399410U,	// <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+  3377398682U,	// <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+  3801257409U,	// <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+  3377399980U,	// <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+  3375409082U,	// <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+  2731497082U,	// <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+  1146333334U,	// <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2220075238U,	// <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2220075329U,	// <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1146333596U,	// <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1146333698U,	// <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2758986566U,	// <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+  2803739472U,	// <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+  2295703482U,	// <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+  1146333982U,	// <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2214595473U,	// <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+  2693677158U,	// <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+  3839437689U,	// <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+  3709200559U,	// <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+  2693677394U,	// <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+  1140854070U,	// <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  3767419409U,	// <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+  3854109604U,	// <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+  1140854313U,	// <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+  1141689234U,	// <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2215431114U,	// <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+  2215431221U,	// <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635466928U,	// <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+  1141689552U,	// <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  67947830U,	// <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2215431545U,	// <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2659357716U,	// <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+  67948073U,	// <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+  3767420369U,	// <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+  3767420451U,	// <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+  3767420520U,	// <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+  2698323625U,	// <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+  3709218102U,	// <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+  2216013110U,	// <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767420858U,	// <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+  3774719981U,	// <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+  2216013353U,	// <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767421078U,	// <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+  3776710880U,	// <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+  3833097325U,	// <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+  3767421340U,	// <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+  3767421442U,	// <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+  2216660278U,	// <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  3833097361U,	// <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+  3780692678U,	// <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+  2216660521U,	// <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2617573416U,	// <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+  2217364450U,	// <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+  3691316771U,	// <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+  3709233331U,	// <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+  2785823952U,	// <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+  1143622966U,	// <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  3691319723U,	// <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+  3854109932U,	// <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+  1143623209U,	// <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2635497574U,	// <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+  2635498390U,	// <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+  3709240936U,	// <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+  2635499700U,	// <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+  2635500854U,	// <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+  2785824044U,	// <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+  1685245238U,	// <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659390488U,	// <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+  1685245256U,	// <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  3839438161U,	// <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+  3798610347U,	// <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+  3798610426U,	// <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+  3795956237U,	// <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+  3733138742U,	// <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+  2218634550U,	// <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+  3798610744U,	// <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+  2724868945U,	// <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+  2725532578U,	// <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+  3383371465U,	// <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+  3800601668U,	// <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+  3775386826U,	// <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+  3801928934U,	// <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+  3721202998U,	// <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+  2780368328U,	// <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  3383372686U,	// <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+  3854110170U,	// <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+  2780368328U,	// <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  1146334098U,	// <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2220076002U,	// <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+  2220076085U,	// <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635524279U,	// <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+  1146334416U,	// <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  72592694U,	// <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+  1685245481U,	// <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659415067U,	// <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+  72592937U,	// <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+  2281704337U,	// <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+  2704965734U,	// <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  3778707666U,	// <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+  3778707708U,	// <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+  2687050057U,	// <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+  2214596612U,	// <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+  2785824372U,	// <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+  3854110332U,	// <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+  2704966301U,	// <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  1567768678U,	// <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+  2312236570U,	// <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+  2215431915U,	// <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641512598U,	// <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+  1567771538U,	// <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+  1141690372U,	// <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1141690466U,	// <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2641515514U,	// <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+  1141690615U,	// <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+  3772736973U,	// <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+  3778709024U,	// <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+  3778709096U,	// <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+  3778709158U,	// <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+  3772737275U,	// <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+  3859566351U,	// <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+  3778709434U,	// <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+  3805251562U,	// <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+  3775391807U,	// <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+  2704967830U,	// <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+  3776719073U,	// <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+  3777382706U,	// <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+  3778709887U,	// <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+  2704968148U,	// <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+  3857428317U,	// <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+  3364096514U,	// <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+  3780700871U,	// <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+  2707622680U,	// <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+  2728856466U,	// <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+  3697361674U,	// <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+  3697362601U,	// <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+  3364766635U,	// <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+  2217365428U,	// <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+  2704969014U,	// <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  2785824700U,	// <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+  3364766963U,	// <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+  2704969257U,	// <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  3846148050U,	// <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+  2326203282U,	// <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+  3291746027U,	// <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+  3376054482U,	// <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+  3790655366U,	// <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+  2785824772U,	// <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+  2724876386U,	// <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+  3858903057U,	// <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+  2736820484U,	// <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+  2659467366U,	// <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+  3859566643U,	// <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+  3798618618U,	// <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+  3852857410U,	// <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+  2659470646U,	// <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+  2659471458U,	// <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  3832729696U,	// <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+  1712083042U,	// <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+  1712156779U,	// <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+  2731512826U,	// <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+  3859566717U,	// <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+  3798619284U,	// <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+  3778712803U,	// <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+  2728858936U,	// <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+  3859566753U,	// <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+  3377398135U,	// <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+  3798619686U,	// <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+  2731513468U,	// <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+  1567826022U,	// <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+  2704971566U,	// <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  2220076779U,	// <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641569942U,	// <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+  1567828889U,	// <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+  1146335236U,	// <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1146335330U,	// <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  1713410308U,	// <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+  1713484045U,	// <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+  2214596949U,	// <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+  2214678951U,	// <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+  2214597114U,	// <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+  3852857653U,	// <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+  3832729919U,	// <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+  3721293427U,	// <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+  2214597432U,	// <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+  1207962934U,	// <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  1207962935U,	// <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+  2215432481U,	// <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+  2215432615U,	// <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1141690874U,	// <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2215432754U,	// <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2215432817U,	// <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+  2215432939U,	// <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+  1141691192U,	// <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221905718U,	// <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  1221905719U,	// <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+  3852857787U,	// <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+  3289764265U,	// <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+  3289690618U,	// <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+  3862589907U,	// <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+  3733253430U,	// <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+  3733254242U,	// <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+  3777390522U,	// <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+  2785825274U,	// <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+  2785825283U,	// <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+  3777390742U,	// <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+  3863106066U,	// <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+  3777390899U,	// <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+  3290436146U,	// <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+  3779381762U,	// <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+  3779381798U,	// <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+  3733262920U,	// <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+  2300972342U,	// <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2300972343U,	// <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+  3802606482U,	// <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+  2217365931U,	// <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2217366010U,	// <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+  3291107890U,	// <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+  3291099805U,	// <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+  3777391926U,	// <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+  2217366328U,	// <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+  2291027254U,	// <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  2291027255U,	// <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+  3852858033U,	// <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+  3395964532U,	// <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+  3864507069U,	// <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+  3376056678U,	// <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+  3721334070U,	// <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+  3395964860U,	// <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+  3864802017U,	// <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+  2302315830U,	// <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  2302315831U,	// <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+  3852858108U,	// <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+  3398624745U,	// <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+  2218668538U,	// <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+  3292418610U,	// <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+  3733286198U,	// <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+  3797299889U,	// <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+  2785825592U,	// <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+  2785825602U,	// <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+  2785825611U,	// <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+  2785825614U,	// <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+  2758988632U,	// <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+  3377400084U,	// <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+  2792166248U,	// <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+  2785825654U,	// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+  2785825664U,	// <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  3859567493U,	// <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+  2303659318U,	// <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303659319U,	// <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2785825695U,	// <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+  2220077479U,	// <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1146335738U,	// <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2792829881U,	// <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+  2785825735U,	// <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+  2785825664U,	// <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  1146336056U,	// <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221963062U,	// <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  1221963063U,	// <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+  2653593600U,	// <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+  2706309222U,	// <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  3709421498U,	// <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+  2281705978U,	// <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+  2785825816U,	// <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+  2785825826U,	// <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+  2653598037U,	// <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+  2214598252U,	// <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+  2706309789U,	// <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  1141691386U,	// <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2215433290U,	// <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+  2706310038U,	// <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+  2322190842U,	// <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+  1141691750U,	// <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2215433654U,	// <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+  2653606230U,	// <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+  1141692012U,	// <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1141692034U,	// <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  2785825940U,	// <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+  3768108576U,	// <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+  3780052584U,	// <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+  2794820780U,	// <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+  3859641528U,	// <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+  3733327970U,	// <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+  3778062266U,	// <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+  3733328944U,	// <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+  2795189465U,	// <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+  2324861026U,	// <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+  3780053233U,	// <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+  3780053296U,	// <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+  3778062725U,	// <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+  3780053506U,	// <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+  3803941469U,	// <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+  2706311800U,	// <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+  3398603586U,	// <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+  2707639066U,	// <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+  2217366522U,	// <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+  3727369110U,	// <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+  3291108500U,	// <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+  3727370872U,	// <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+  2217366886U,	// <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+  2706312502U,	// <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  3786026321U,	// <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+  2217367148U,	// <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+  2706312745U,	// <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2322223202U,	// <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+  3399946987U,	// <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+  3291780244U,	// <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+  3727378582U,	// <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+  3727379766U,	// <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+  3859568054U,	// <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+  2785826241U,	// <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+  3395965762U,	// <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+  2787153363U,	// <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+  2785826268U,	// <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+  3780055420U,	// <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+  3859568110U,	// <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+  3874534903U,	// <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+  3859641856U,	// <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+  3733360738U,	// <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+  3859568145U,	// <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+  2797770260U,	// <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+  2797843997U,	// <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+  2785826342U,	// <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+  3727393686U,	// <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+  3868563003U,	// <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+  3377397988U,	// <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+  2219349350U,	// <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+  3859568217U,	// <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+  2730202588U,	// <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+  2785826412U,	// <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+  2731529854U,	// <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+  1146336250U,	// <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2706315054U,	// <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  2653660845U,	// <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+  2322248186U,	// <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+  1146336614U,	// <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2706315418U,	// <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2653663581U,	// <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+  1146336876U,	// <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1146336898U,	// <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  202162278U,	// <0,u,0,0>: Cost 1 vspltisw0 LHS
+  1624612966U,	// <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+  2629780986U,	// <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+  1207959708U,	// <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  1544097078U,	// <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+  1140856986U,	// <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  2698355253U,	// <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+  1207962952U,	// <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  202162278U,	// <0,u,0,u>: Cost 1 vspltisw0 LHS
+  1142134483U,	// <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+  67950382U,	// <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+  1142175624U,	// <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  1142175676U,	// <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+  1142134847U,	// <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  67950746U,	// <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1142175952U,	// <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221905736U,	// <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  67950949U,	// <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+  1562026086U,	// <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+  2216015662U,	// <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2698356328U,	// <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1562029366U,	// <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+  2216016026U,	// <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  2698356666U,	// <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+  1585919033U,	// <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2758989756U,	// <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+  2216662830U,	// <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2703665461U,	// <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+  2758989782U,	// <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+  2758989796U,	// <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+  2216663194U,	// <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2706319993U,	// <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+  2300972360U,	// <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2216663397U,	// <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217367251U,	// <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+  1143625518U,	// <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2217367432U,	// <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+  2217367484U,	// <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+  1143619922U,	// <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1143625882U,	// <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2217367760U,	// <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+  2291027272U,	// <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  1143626085U,	// <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2635792486U,	// <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+  2635793302U,	// <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+  2302314646U,	// <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  2635794648U,	// <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+  2635795766U,	// <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+  2717601754U,	// <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+  1685248154U,	// <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2302315848U,	// <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  1685248172U,	// <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2759358645U,	// <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+  2218637102U,	// <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2724901370U,	// <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+  2758990032U,	// <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+  2659691830U,	// <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+  2659471458U,	// <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  2724901688U,	// <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+  1651159893U,	// <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+  1651823526U,	// <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+  2785827072U,	// <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+  2803964168U,	// <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+  2727556249U,	// <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+  2303656092U,	// <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2785827112U,	// <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+  2785827122U,	// <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+  2730210781U,	// <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+  2303659336U,	// <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303656097U,	// <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  202162278U,	// <0,u,u,0>: Cost 1 vspltisw0 LHS
+  72595246U,	// <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+  1146337160U,	// <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1146337343U,	// <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  72595610U,	// <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+  1146337488U,	// <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221963080U,	// <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2756853760U,	// <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+  1677803530U,	// <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+  3759497387U,	// <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+  2686419196U,	// <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+  2751766565U,	// <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+  2687746462U,	// <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+  3776086518U,	// <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+  2689073728U,	// <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+  1678319689U,	// <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+  2287091712U,	// <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+  1147568230U,	// <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+  1683112038U,	// <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  3294970108U,	// <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+  2623892790U,	// <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+  2647781007U,	// <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+  2791948430U,	// <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  3721524218U,	// <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+  1683112092U,	// <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2222112768U,	// <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+  1148371046U,	// <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  3356862524U,	// <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+  2702345894U,	// <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+  2222113106U,	// <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+  2299709908U,	// <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  3760162746U,	// <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+  3369470584U,	// <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+  1148371613U,	// <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  2686421142U,	// <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+  2283128486U,	// <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+  3296305326U,	// <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+  3760163199U,	// <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+  3760163330U,	// <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+  3779406377U,	// <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+  3865690416U,	// <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+  3366824568U,	// <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+  2707655452U,	// <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+  2734861202U,	// <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+  2756854098U,	// <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+  3830595931U,	// <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+  3296968960U,	// <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+  3830595949U,	// <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+  2686422326U,	// <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  3297378806U,	// <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+  3810594248U,	// <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+  2686422569U,	// <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2284470272U,	// <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+  2284471974U,	// <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+  3809267435U,	// <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+  3297968384U,	// <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+  2284471977U,	// <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+  3721555603U,	// <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+  3792679010U,	// <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+  3792679037U,	// <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+  2284471981U,	// <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+  3356893184U,	// <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+  2224676966U,	// <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+  3298295985U,	// <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+  3298345212U,	// <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+  2224972114U,	// <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  3808604907U,	// <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+  3799978808U,	// <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+  2726237006U,	// <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+  2224677522U,	// <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+  2726237176U,	// <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+  2285815462U,	// <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+  3805951193U,	// <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+  3807941859U,	// <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+  3799979366U,	// <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+  3803297165U,	// <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+  3799979540U,	// <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+  3799979628U,	// <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+  2731546240U,	// <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+  2284494848U,	// <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+  1683112594U,	// <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+  1683112605U,	// <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2734200772U,	// <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+  2757075629U,	// <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+  2686425242U,	// <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2791948430U,	// <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  2736855304U,	// <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+  1683112659U,	// <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1610694666U,	// <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+  1616003174U,	// <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+  2283767958U,	// <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+  3357507596U,	// <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+  2689745234U,	// <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+  3357507922U,	// <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+  3294397647U,	// <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+  3373433334U,	// <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+  1616003730U,	// <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+  1550221414U,	// <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,1,1>: Cost 1 vspltisw1 LHS
+  2287093910U,	// <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+  2287092615U,	// <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+  1550224694U,	// <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  2287092050U,	// <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+  2689746127U,	// <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+  2659800138U,	// <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+  269271142U,	// <1,1,1,u>: Cost 1 vspltisw1 LHS
+  2222113516U,	// <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+  2756854663U,	// <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+  1148371862U,	// <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689746598U,	// <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+  2618002742U,	// <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+  2299707730U,	// <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+  2689746874U,	// <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+  3361506511U,	// <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+  1148371862U,	// <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689747094U,	// <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+  2691074278U,	// <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+  3356870806U,	// <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+  2283126958U,	// <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+  2689747458U,	// <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+  3356868946U,	// <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+  3811265144U,	// <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+  3362841807U,	// <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+  2689747742U,	// <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+  2623987814U,	// <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+  2758181931U,	// <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+  2223408022U,	// <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  3697731734U,	// <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+  2283798784U,	// <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+  1616006454U,	// <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  3297379535U,	// <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+  3373466102U,	// <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+  1616006697U,	// <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2760762479U,	// <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+  2284470282U,	// <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+  2284472470U,	// <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+  3358212270U,	// <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+  2284470285U,	// <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+  1210728786U,	// <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2737524834U,	// <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+  3360867535U,	// <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+  1210728786U,	// <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  3697746022U,	// <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+  2756854991U,	// <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+  2737525242U,	// <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+  3839149281U,	// <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+  3697749302U,	// <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+  3356893522U,	// <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+  2283151537U,	// <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+  2791949566U,	// <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+  2792613127U,	// <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+  2737525754U,	// <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+  2291786386U,	// <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+  3365528292U,	// <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+  3365528455U,	// <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+  2737526118U,	// <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+  3365527890U,	// <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+  3365528377U,	// <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+  2291786959U,	// <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+  2737526402U,	// <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+  1550221414U,	// <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,u,1>: Cost 1 vspltisw1 LHS
+  1148371862U,	// <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689750972U,	// <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+  1550224694U,	// <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1616009370U,	// <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2689751248U,	// <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+  2736863497U,	// <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+  269271142U,	// <1,1,u,u>: Cost 1 vspltisw1 LHS
+  2702360576U,	// <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+  1628618854U,	// <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2685771949U,	// <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+  2283765862U,	// <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  2702360914U,	// <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+  3788046813U,	// <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+  2688426481U,	// <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+  2726249024U,	// <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1628619421U,	// <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2690417380U,	// <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+  2702361396U,	// <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+  2287093352U,	// <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+  1213349990U,	// <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  3764159522U,	// <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+  3295053672U,	// <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+  2221311930U,	// <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+  3799991593U,	// <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+  1213349995U,	// <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+  2624045158U,	// <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+  2702362144U,	// <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+  2283120232U,	// <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+  1225965670U,	// <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2624048438U,	// <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+  3356860763U,	// <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+  2222114746U,	// <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+  2299708632U,	// <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+  1225965675U,	// <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+  470597734U,	// <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544340276U,	// <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544341096U,	// <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544341916U,	// <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+  470601014U,	// <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592119300U,	// <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592119802U,	// <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592120314U,	// <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470603566U,	// <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708335471U,	// <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+  3838043908U,	// <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+  3357541992U,	// <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+  2283798630U,	// <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2726251728U,	// <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+  1628622134U,	// <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  3297077178U,	// <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+  2726251976U,	// <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1628622377U,	// <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  2714308168U,	// <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+  3297633827U,	// <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+  2284471912U,	// <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+  1210728550U,	// <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  3776106420U,	// <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+  2726252548U,	// <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+  2726252642U,	// <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+  3799994538U,	// <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+  1210728555U,	// <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720280865U,	// <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+  2702365096U,	// <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+  2726253050U,	// <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+  2283151462U,	// <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  3697823030U,	// <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+  3298715497U,	// <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+  2726253368U,	// <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+  2724926296U,	// <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+  2283151467U,	// <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652511738U,	// <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+  3371500916U,	// <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+  3365529192U,	// <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+  2291785830U,	// <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2726253926U,	// <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+  3788051845U,	// <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+  3794023894U,	// <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+  2726254119U,	// <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+  1657820802U,	// <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+  470638699U,	// <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1544381236U,	// <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544382056U,	// <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544382614U,	// <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  470641974U,	// <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1628625050U,	// <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  1592160762U,	// <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592161274U,	// <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470644526U,	// <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2769389708U,	// <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+  2685780070U,	// <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2685780142U,	// <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+  2686443775U,	// <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+  2769684656U,	// <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+  3357507940U,	// <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+  3759522294U,	// <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+  3357509562U,	// <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+  2685780637U,	// <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2287092630U,	// <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+  2221312230U,	// <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+  2691752839U,	// <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+  2287093362U,	// <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+  2287092634U,	// <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+  3360835107U,	// <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+  3759523041U,	// <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+  2287093690U,	// <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+  2287092638U,	// <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+  2222114966U,	// <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+  2222115057U,	// <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+  2630092320U,	// <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+  2685781670U,	// <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+  2222115330U,	// <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+  3373449572U,	// <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+  2222115448U,	// <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2299709370U,	// <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+  2222115614U,	// <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+  2771380607U,	// <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+  3356874468U,	// <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+  3759524168U,	// <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+  2283792796U,	// <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+  3356869530U,	// <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+  3721760428U,	// <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+  3296496248U,	// <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+  3356870586U,	// <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+  2771970503U,	// <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+  2772044240U,	// <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+  3362186135U,	// <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+  3297151280U,	// <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+  3357542002U,	// <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+  3357540626U,	// <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+  2685783350U,	// <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  3357546622U,	// <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+  3357542330U,	// <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+  2685783593U,	// <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2284471190U,	// <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+  3358213015U,	// <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+  2630116899U,	// <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+  2284471922U,	// <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+  2284471194U,	// <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+  2284471843U,	// <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+  3358218366U,	// <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+  2284472250U,	// <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+  2284471198U,	// <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+  2224752790U,	// <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+  3832736385U,	// <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+  3703866916U,	// <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+  3356894834U,	// <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+  3356894106U,	// <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+  3356894755U,	// <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+  3356899130U,	// <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+  2283153338U,	// <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2283153338U,	// <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2774035139U,	// <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+  3703874767U,	// <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+  3703875109U,	// <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+  3365529202U,	// <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+  3365528474U,	// <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+  3789387159U,	// <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+  3865692927U,	// <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+  3363538874U,	// <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+  2774625035U,	// <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+  2284495766U,	// <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+  2685785902U,	// <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2630141478U,	// <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+  2283169880U,	// <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+  2284495770U,	// <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+  2685786266U,	// <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2222115448U,	// <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2284496826U,	// <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+  2685786469U,	// <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2684461069U,	// <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+  2686451814U,	// <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  3759530159U,	// <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+  2686451968U,	// <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2684461394U,	// <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+  1701989266U,	// <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+  3776119286U,	// <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+  2689106500U,	// <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+  1702210477U,	// <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+  2221312914U,	// <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+  2691097399U,	// <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+  3760194454U,	// <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+  3766166489U,	// <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+  2334870736U,	// <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+  1147571510U,	// <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  3760194794U,	// <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+  3867315188U,	// <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+  1147571753U,	// <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2222115730U,	// <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+  2222115812U,	// <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  3760195176U,	// <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+  2702378662U,	// <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+  2323598544U,	// <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+  1148374326U,	// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  3760195514U,	// <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+  3373451932U,	// <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+  1148374569U,	// <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2702379160U,	// <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+  3760195840U,	// <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+  3776121160U,	// <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+  3760195996U,	// <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+  2686454274U,	// <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+  3356870350U,	// <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+  3800009392U,	// <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+  3366824604U,	// <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+  2707688224U,	// <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+  2775731368U,	// <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+  3830820018U,	// <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+  3691980454U,	// <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+  3357541282U,	// <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+  2781039824U,	// <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+  2686455094U,	// <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  3357541528U,	// <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+  3810627020U,	// <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+  2686455337U,	// <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  2624217190U,	// <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+  2284470309U,	// <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+  2618246822U,	// <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+  3358212297U,	// <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+  2284470312U,	// <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+  2284470637U,	// <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+  1683115318U,	// <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3721851898U,	// <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+  1683115336U,	// <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3794039075U,	// <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+  3830820186U,	// <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+  3800011258U,	// <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+  3807973938U,	// <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+  3298716880U,	// <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+  2224680246U,	// <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  3800011576U,	// <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+  2726269774U,	// <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+  2224680489U,	// <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726269948U,	// <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+  3383444141U,	// <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+  3805983961U,	// <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+  3807974667U,	// <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+  2736887142U,	// <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+  3365528403U,	// <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+  3800012308U,	// <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+  3800012396U,	// <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+  2731579012U,	// <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+  2624241766U,	// <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+  2686457646U,	// <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  2618271398U,	// <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+  2734233544U,	// <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+  2689775679U,	// <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+  1152355638U,	// <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+  1683115561U,	// <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2736888076U,	// <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+  1683115579U,	// <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2687123456U,	// <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+  1613381734U,	// <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759538352U,	// <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+  3760865532U,	// <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+  1613381970U,	// <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2687787427U,	// <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+  2781777524U,	// <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+  3733828717U,	// <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+  1613382301U,	// <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2781040271U,	// <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+  2687124276U,	// <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+  2687124374U,	// <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+  3760866297U,	// <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+  2693096491U,	// <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+  2687124591U,	// <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+  2687124723U,	// <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+  3360834803U,	// <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+  2687124860U,	// <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+  2323598792U,	// <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+  2687125027U,	// <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+  2687125096U,	// <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+  2687125158U,	// <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+  2642185188U,	// <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+  2323598554U,	// <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+  2687125434U,	// <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+  3373450483U,	// <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+  2687125563U,	// <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+  2687125654U,	// <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+  2312990234U,	// <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+  3760867649U,	// <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+  2687125916U,	// <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+  2687126018U,	// <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+  3386731738U,	// <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+  3356871170U,	// <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+  3808643779U,	// <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+  2687126302U,	// <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+  2642198630U,	// <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+  2687126498U,	// <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+  3715941923U,	// <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+  3709970701U,	// <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+  2687126736U,	// <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+  1613385014U,	// <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2283801090U,	// <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  3733861489U,	// <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+  1613385257U,	// <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2624290918U,	// <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+  2624291676U,	// <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+  3698034211U,	// <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+  2284471211U,	// <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+  2624294198U,	// <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+  2284471132U,	// <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+  2284472834U,	// <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+  2284471539U,	// <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+  2284471216U,	// <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+  2785316900U,	// <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+  2781040691U,	// <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+  2734903802U,	// <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+  3848736834U,	// <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+  3298717620U,	// <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+  3298717700U,	// <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+  2734904120U,	// <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+  2781040738U,	// <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+  2781040747U,	// <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+  2734904314U,	// <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+  2315677210U,	// <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+  3808646292U,	// <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+  3808646371U,	// <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+  2734904678U,	// <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+  3389418714U,	// <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+  3365528656U,	// <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+  2734904940U,	// <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+  2734904962U,	// <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+  2687129299U,	// <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+  1613387566U,	// <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2687129480U,	// <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+  2687129532U,	// <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+  1661163546U,	// <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+  1613387930U,	// <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2687129808U,	// <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+  2781040900U,	// <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+  1613388133U,	// <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759546368U,	// <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+  2685804646U,	// <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2685804721U,	// <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+  3861270834U,	// <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+  3759546706U,	// <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+  2687795620U,	// <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+  2688459253U,	// <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+  2283769142U,	// <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  2685805213U,	// <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  3698073702U,	// <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+  3759547188U,	// <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+  2221314554U,	// <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+  3759547401U,	// <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+  3698076982U,	// <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+  3767510141U,	// <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+  2334872376U,	// <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+  1213353270U,	// <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  1213353271U,	// <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+  3704053862U,	// <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+  3759547961U,	// <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+  2222117370U,	// <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+  3759548070U,	// <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+  3704057142U,	// <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+  3373451057U,	// <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+  2685806522U,	// <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+  1225968950U,	// <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1225968951U,	// <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+  3759548566U,	// <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+  3842912793U,	// <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+  3759548774U,	// <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+  3759548828U,	// <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+  3759548930U,	// <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+  3809315421U,	// <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+  3386733368U,	// <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+  2283130166U,	// <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+  2283130167U,	// <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+  3704070246U,	// <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+  3862229608U,	// <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+  3704071741U,	// <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+  3721988610U,	// <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+  3704073526U,	// <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+  2685807926U,	// <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3865621141U,	// <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+  2283801910U,	// <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  2685808169U,	// <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3710050406U,	// <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+  3710051571U,	// <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+  3405989597U,	// <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+  3358214502U,	// <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+  3710053686U,	// <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+  3721998025U,	// <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+  2332250936U,	// <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+  1210731830U,	// <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210731831U,	// <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+  2791289597U,	// <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+  3698115430U,	// <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+  3698116538U,	// <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+  3356894132U,	// <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+  3698117942U,	// <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+  3722006218U,	// <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+  2781041464U,	// <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+  2283154742U,	// <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283154743U,	// <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+  1718211406U,	// <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+  2792026967U,	// <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+  2765411170U,	// <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+  3854783336U,	// <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+  2781041526U,	// <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+  3365528664U,	// <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+  2791953290U,	// <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+  2291789110U,	// <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1718801302U,	// <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+  1718875039U,	// <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+  2685810478U,	// <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2792764337U,	// <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+  3759552444U,	// <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+  2781041607U,	// <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+  2685810842U,	// <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  2689792208U,	// <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+  1210756406U,	// <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  1210756407U,	// <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+  2793280496U,	// <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+  2694439014U,	// <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  3393343912U,	// <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+  3397325306U,	// <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+  2793575444U,	// <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+  3722030797U,	// <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+  2688467446U,	// <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+  2689131079U,	// <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+  2694439570U,	// <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+  2654265354U,	// <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+  2794017866U,	// <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+  3768181639U,	// <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+  2334872058U,	// <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+  2654268726U,	// <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+  3792069797U,	// <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+  2694440143U,	// <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+  2334872386U,	// <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+  2695767409U,	// <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+  2654273638U,	// <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2222117973U,	// <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+  2299711912U,	// <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2654275734U,	// <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+  2654276918U,	// <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+  3385397675U,	// <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+  2654278056U,	// <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+  2323599627U,	// <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+  2654279470U,	// <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2795271395U,	// <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+  3768183059U,	// <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+  3728025254U,	// <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+  3768183196U,	// <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+  3768183298U,	// <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+  3792071255U,	// <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+  3780127361U,	// <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+  3847779617U,	// <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+  2795861291U,	// <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+  2795935028U,	// <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+  3728032975U,	// <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+  3839153480U,	// <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+  3397358074U,	// <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+  3854783835U,	// <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+  2694442294U,	// <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  3786100058U,	// <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+  3722065254U,	// <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+  2694442537U,	// <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654298214U,	// <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+  3854783893U,	// <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+  3710126010U,	// <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+  2332250618U,	// <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+  2654301494U,	// <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+  2284474795U,	// <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+  2718330931U,	// <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+  2332250946U,	// <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+  2719658197U,	// <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+  2332921954U,	// <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+  3768185254U,	// <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+  3710134202U,	// <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+  3710134561U,	// <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+  3710135606U,	// <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+  3864884745U,	// <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+  3854784017U,	// <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+  2791953940U,	// <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+  2792617501U,	// <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+  2797925927U,	// <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+  3365528426U,	// <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+  3728058022U,	// <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+  3365528509U,	// <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+  3854784079U,	// <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+  3722088148U,	// <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+  3728060845U,	// <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+  2781042284U,	// <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+  2798515823U,	// <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+  2654322705U,	// <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+  2694444846U,	// <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  2299711912U,	// <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2323649018U,	// <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+  2654326070U,	// <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+  2694445210U,	// <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654327214U,	// <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+  2323649346U,	// <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+  2694445413U,	// <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  1610752017U,	// <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+  1613406310U,	// <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  2685821107U,	// <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+  2283765916U,	// <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  1613406549U,	// <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+  1725880054U,	// <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+  2688475639U,	// <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+  2283769160U,	// <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  1613406877U,	// <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  1550221414U,	// <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,u,1,1>: Cost 1 vspltisw1 LHS
+  1683117870U,	// <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1213350044U,	// <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  1550224694U,	// <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1147574426U,	// <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2687149326U,	// <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+  1213353288U,	// <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  269271142U,	// <1,u,1,u>: Cost 1 vspltisw1 LHS
+  2222118611U,	// <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+  1148376878U,	// <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  1148371862U,	// <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  1225965724U,	// <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2222118975U,	// <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+  1148377242U,	// <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2687150010U,	// <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+  1225968968U,	// <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1148377445U,	// <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  471040156U,	// <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544782644U,	// <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544783464U,	// <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544784022U,	// <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471043382U,	// <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592561668U,	// <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592562170U,	// <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592562682U,	// <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  471045934U,	// <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708384629U,	// <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+  2687151101U,	// <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+  2223408022U,	// <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  2283798684U,	// <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2642422785U,	// <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+  1613409590U,	// <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2283801090U,	// <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  2283801928U,	// <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  1613409833U,	// <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2284471235U,	// <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+  2284472046U,	// <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+  2284472533U,	// <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+  1210728604U,	// <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2284471239U,	// <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+  1210728786U,	// <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  1683118234U,	// <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210731848U,	// <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210728609U,	// <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720330023U,	// <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+  2757376190U,	// <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+  2726302202U,	// <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+  2283151516U,	// <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  2224972114U,	// <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  2224683162U,	// <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726302520U,	// <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+  2283154760U,	// <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283151521U,	// <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652560896U,	// <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+  2333590225U,	// <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+  2765412628U,	// <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+  2291785884U,	// <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2781042984U,	// <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+  3365527953U,	// <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+  2791954748U,	// <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+  2291789128U,	// <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1657869960U,	// <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+  471081121U,	// <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  269271142U,	// <1,u,u,1>: Cost 1 vspltisw1 LHS
+  1544824424U,	// <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544824982U,	// <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471084342U,	// <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1613412506U,	// <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  1683118477U,	// <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210756424U,	// <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  471086894U,	// <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2226757632U,	// <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+  2226757734U,	// <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+  3826622483U,	// <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+  3843211292U,	// <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+  3300499794U,	// <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+  3356256724U,	// <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+  3825664056U,	// <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+  3762889289U,	// <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+  2226758301U,	// <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+  2227429386U,	// <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+  2227429478U,	// <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+  1691156582U,	// <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2666358997U,	// <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+  2227462482U,	// <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+  3722186464U,	// <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+  3867099278U,	// <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+  3366881912U,	// <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+  1691156636U,	// <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2228027392U,	// <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+  1154285670U,	// <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  2228027565U,	// <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+  3301769468U,	// <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+  2228027730U,	// <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+  3301769635U,	// <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+  3780806586U,	// <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+  3368880760U,	// <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+  1154286237U,	// <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+  1213440000U,	// <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1213441702U,	// <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2228535470U,	// <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+  2636515632U,	// <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+  2287182962U,	// <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+  2660405346U,	// <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+  2228535798U,	// <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660406420U,	// <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+  1213441709U,	// <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3368894464U,	// <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+  2764898642U,	// <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+  3826622811U,	// <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+  3843211620U,	// <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+  3838640493U,	// <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+  2732944694U,	// <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+  3797396857U,	// <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+  3867099528U,	// <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+  2764898705U,	// <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+  3364257792U,	// <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+  2230124646U,	// <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  3304235184U,	// <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+  3364260144U,	// <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+  3303817554U,	// <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+  3364260146U,	// <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+  3867099602U,	// <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+  3364260472U,	// <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+  2230125213U,	// <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2230796288U,	// <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+  1157054566U,	// <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  2230796465U,	// <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+  3304538364U,	// <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+  2230796626U,	// <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+  3797398205U,	// <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+  3304538614U,	// <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+  3798725471U,	// <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+  1157055133U,	// <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  3371573248U,	// <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+  2231189606U,	// <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+  3801380003U,	// <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+  3802043636U,	// <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+  3806688614U,	// <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+  3356317308U,	// <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+  3804034535U,	// <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+  3806688876U,	// <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+  2231190173U,	// <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+  1208836096U,	// <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1208837798U,	// <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  1691157149U,	// <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2636556597U,	// <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+  2282579625U,	// <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2660446306U,	// <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+  2228535798U,	// <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660447385U,	// <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+  1208837805U,	// <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3692388523U,	// <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+  2757526244U,	// <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+  2330290974U,	// <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+  3843212020U,	// <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+  3692391734U,	// <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+  3300533362U,	// <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+  3794084337U,	// <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+  3374170614U,	// <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+  2758042403U,	// <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+  2690482924U,	// <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+  2764899124U,	// <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+  2695791510U,	// <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+  3362235271U,	// <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+  3692399926U,	// <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+  3832226649U,	// <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+  3301205235U,	// <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+  3768870179U,	// <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+  2695791988U,	// <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+  2618663085U,	// <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+  2228028212U,	// <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+  2618664552U,	// <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+  2759000984U,	// <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+  2618666294U,	// <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+  2295136594U,	// <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+  3769534376U,	// <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+  2793358266U,	// <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+  2618668846U,	// <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+  2282536969U,	// <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208795146U,	// <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1213442198U,	// <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2287181998U,	// <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2618674486U,	// <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+  1208795474U,	// <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2287182001U,	// <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287183055U,	// <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208795153U,	// <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  3692421295U,	// <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+  3838641195U,	// <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+  2330323742U,	// <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+  3692423318U,	// <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+  3692424502U,	// <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+  2695793974U,	// <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3799395705U,	// <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+  3368895695U,	// <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+  2695794217U,	// <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3692429488U,	// <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+  3364257802U,	// <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+  3692431253U,	// <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+  3692431874U,	// <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+  3692432694U,	// <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+  3364258130U,	// <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+  3303875827U,	// <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+  3867100333U,	// <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+  3692435246U,	// <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+  2618695857U,	// <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+  2230797108U,	// <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+  2618697658U,	// <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+  3692439702U,	// <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+  2618699062U,	// <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+  3364929874U,	// <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+  3692442424U,	// <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+  3798733664U,	// <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+  2618701614U,	// <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+  3799397370U,	// <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+  3371573258U,	// <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+  2330351234U,	// <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  3799397658U,	// <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+  3799397734U,	// <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+  3371573586U,	// <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+  3799397870U,	// <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+  3799397956U,	// <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+  2330351234U,	// <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  2282577929U,	// <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208836106U,	// <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1208838294U,	// <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282578094U,	// <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282577933U,	// <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1208836434U,	// <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282578097U,	// <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287224015U,	// <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208836113U,	// <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  2226759117U,	// <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+  1624047718U,	// <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  2697789613U,	// <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+  2226767526U,	// <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+  2697789778U,	// <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+  3300657000U,	// <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+  2226988986U,	// <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+  3734271139U,	// <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+  1624048285U,	// <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  3831268868U,	// <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+  2293138804U,	// <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+  2697790358U,	// <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+  2293137510U,	// <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  3771532331U,	// <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+  3767551106U,	// <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+  3301173178U,	// <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+  3372853169U,	// <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+  2293137515U,	// <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+  1556938854U,	// <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2295137733U,	// <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+  336380006U,	// <2,2,2,2>: Cost 1 vspltisw2 LHS
+  1221394534U,	// <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+  1556942134U,	// <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2228029370U,	// <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+  2660545701U,	// <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+  336380006U,	// <2,2,2,u>: Cost 1 vspltisw2 LHS
+  2697791638U,	// <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+  2765489840U,	// <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+  1213441640U,	// <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135053414U,	// <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+  2697792002U,	// <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+  2330313780U,	// <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+  2287183549U,	// <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+  2660553894U,	// <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+  135053419U,	// <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2630697062U,	// <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+  3771534282U,	// <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+  2764900109U,	// <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+  2295152742U,	// <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+  2295154282U,	// <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+  1624050998U,	// <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  2229675962U,	// <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+  3368896433U,	// <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+  1624051241U,	// <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  3771534920U,	// <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+  3364258540U,	// <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+  2296489576U,	// <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+  2290516070U,	// <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  3771535284U,	// <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+  2290517044U,	// <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+  2697793634U,	// <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+  3370231729U,	// <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+  2290516075U,	// <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2230797801U,	// <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+  3304539679U,	// <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+  2764900273U,	// <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+  2764900282U,	// <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+  2230798129U,	// <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+  3304540008U,	// <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+  1157056442U,	// <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725000033U,	// <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+  1157056442U,	// <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2793359338U,	// <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+  3371574725U,	// <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+  2297833064U,	// <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+  2297831526U,	// <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  2697794918U,	// <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+  3371575053U,	// <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+  3304933297U,	// <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+  2297833393U,	// <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+  2297831531U,	// <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1556938854U,	// <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1624053550U,	// <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  336380006U,	// <2,2,u,2>: Cost 1 vspltisw2 LHS
+  135094374U,	// <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+  1556942134U,	// <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1624053914U,	// <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  1157056442U,	// <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2660594859U,	// <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+  135094379U,	// <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611448320U,	// <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537706598U,	// <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2689835181U,	// <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2689835260U,	// <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611448658U,	// <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2732966354U,	// <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2732966390U,	// <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660603052U,	// <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+  537707165U,	// <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689835748U,	// <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611449140U,	// <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611449238U,	// <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  3763577805U,	// <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+  2689836112U,	// <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689836143U,	// <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689836239U,	// <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  3366881210U,	// <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+  1616094588U,	// <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2689836493U,	// <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+  2685191711U,	// <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611449960U,	// <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611450022U,	// <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2689836822U,	// <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+  2689836904U,	// <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+  1611450298U,	// <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2295138234U,	// <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+  1611450456U,	// <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+  1213440918U,	// <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282538527U,	// <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+  1557022322U,	// <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+  1208796786U,	// <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1213440922U,	// <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282538531U,	// <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2287188094U,	// <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1213441978U,	// <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  1208796791U,	// <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+  1551056998U,	// <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+  1551057818U,	// <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+  2624800360U,	// <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+  2624800918U,	// <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+  1551060278U,	// <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+  537709878U,	// <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2732969337U,	// <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2660635824U,	// <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+  537710121U,	// <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689838664U,	// <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2732969615U,	// <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2732969707U,	// <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+  3763580721U,	// <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+  2689839028U,	// <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659228164U,	// <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659228258U,	// <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  3364259770U,	// <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+  1659228420U,	// <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2230798486U,	// <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+  2732970407U,	// <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1659228666U,	// <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2230798748U,	// <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+  2230798850U,	// <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+  2732970731U,	// <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659228984U,	// <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659229006U,	// <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1659229087U,	// <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+  1659229178U,	// <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2726999125U,	// <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+  2727662758U,	// <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+  2732971235U,	// <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+  1659229542U,	// <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2732971446U,	// <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2732971484U,	// <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+  1659229804U,	// <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659229826U,	// <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1208837014U,	// <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  537712430U,	// <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1616099205U,	// <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+  1208837746U,	// <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1208837018U,	// <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  537712794U,	// <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1616099536U,	// <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1208838074U,	// <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  537712997U,	// <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  3771547648U,	// <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+  2697805926U,	// <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+  3770884269U,	// <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+  3806716164U,	// <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+  3771547986U,	// <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+  2226761014U,	// <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3853462427U,	// <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+  3867102116U,	// <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+  2226761257U,	// <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3849186231U,	// <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+  3301207010U,	// <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+  3766240150U,	// <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+  3766240226U,	// <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+  3301207248U,	// <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+  2227432758U,	// <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+  3758941400U,	// <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+  3768894758U,	// <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+  2227433001U,	// <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+  2228030354U,	// <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+  3770885657U,	// <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+  2697807466U,	// <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+  3368880468U,	// <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+  2228030672U,	// <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+  1154288950U,	// <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  3771549617U,	// <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+  3368880796U,	// <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+  1154289193U,	// <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+  2636808294U,	// <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+  2287181861U,	// <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+  2228866102U,	// <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+  2636810580U,	// <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+  1256574160U,	// <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1213441742U,	// <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2228866430U,	// <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+  2660701368U,	// <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+  1213441745U,	// <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3704586342U,	// <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+  3782831051U,	// <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+  3704587900U,	// <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+  3368896123U,	// <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+  2793360592U,	// <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+  2697809206U,	// <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  3303198078U,	// <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+  3867102444U,	// <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+  2697809449U,	// <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  2630852710U,	// <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+  2624881572U,	// <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+  2630854269U,	// <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+  2666686677U,	// <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+  2630855990U,	// <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+  2230127926U,	// <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+  1691159862U,	// <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  3867102520U,	// <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+  1691159880U,	// <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230799250U,	// <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+  3304541130U,	// <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+  2230799417U,	// <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+  3304541323U,	// <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+  2230799568U,	// <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+  1157057846U,	// <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3304541566U,	// <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+  3798758243U,	// <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+  1157058089U,	// <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3806721018U,	// <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+  3853831590U,	// <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+  3801412775U,	// <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+  3802076408U,	// <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+  3401436368U,	// <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+  2793360840U,	// <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+  3804067307U,	// <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+  3867102682U,	// <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+  2793360867U,	// <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+  2630877286U,	// <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+  2282580144U,	// <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2630878848U,	// <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+  2636851545U,	// <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+  1256615120U,	// <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1208837838U,	// <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  1691160105U,	// <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2660742333U,	// <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+  1208837841U,	// <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3766910976U,	// <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+  2693169254U,	// <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3760939181U,	// <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+  3843214936U,	// <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+  3760939355U,	// <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+  3867102827U,	// <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+  3867102836U,	// <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+  3867102844U,	// <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+  2693169821U,	// <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3766911724U,	// <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+  3766911796U,	// <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+  2693170070U,	// <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+  3384798262U,	// <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+  2693170228U,	// <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+  3301208068U,	// <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+  3366879607U,	// <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+  3867102925U,	// <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+  2695824760U,	// <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+  2642845798U,	// <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+  2295139218U,	// <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+  2699142760U,	// <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+  3766912678U,	// <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+  2699142925U,	// <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+  2228031492U,	// <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+  2295138818U,	// <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+  3368879347U,	// <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+  2295138820U,	// <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+  2287184866U,	// <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256573842U,	// <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642855630U,	// <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+  2287182763U,	// <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287184870U,	// <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256574170U,	// <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1213442562U,	// <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287183091U,	// <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1213442564U,	// <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3716604006U,	// <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+  3716604822U,	// <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+  3766914099U,	// <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+  3368895403U,	// <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+  3716607031U,	// <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+  2693172534U,	// <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3363588610U,	// <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+  3368895731U,	// <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+  2693172777U,	// <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3704668262U,	// <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+  3704669078U,	// <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+  3704669830U,	// <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+  3364259460U,	// <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+  3704671542U,	// <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+  2793361412U,	// <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  3364258167U,	// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+  3867103249U,	// <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+  2793361412U,	// <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  2642878566U,	// <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+  3386166810U,	// <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+  2723033594U,	// <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+  3848523842U,	// <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+  2723033713U,	// <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+  2230800388U,	// <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+  2230800482U,	// <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+  2785841252U,	// <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+  2785914989U,	// <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+  3796775930U,	// <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+  3800757335U,	// <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+  3853463689U,	// <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+  3796776218U,	// <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+  3796776294U,	// <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+  3803411867U,	// <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+  3371575081U,	// <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+  3796776516U,	// <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+  3371575083U,	// <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+  2287225826U,	// <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256614802U,	// <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642896590U,	// <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+  2287223723U,	// <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287225830U,	// <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256615130U,	// <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1208838658U,	// <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287224051U,	// <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1208838660U,	// <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3772227584U,	// <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+  2698485862U,	// <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3759620282U,	// <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+  3710675299U,	// <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+  3767583058U,	// <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+  3378153265U,	// <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+  3865186637U,	// <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+  2330291510U,	// <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+  2698486429U,	// <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3734569062U,	// <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+  3764929346U,	// <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+  3772228502U,	// <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+  3734571158U,	// <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+  3734572342U,	// <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+  3767583878U,	// <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+  3768247511U,	// <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+  2293140790U,	// <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  2293140791U,	// <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+  3704717414U,	// <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+  3395424589U,	// <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+  2228031993U,	// <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+  2698487485U,	// <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+  3704720694U,	// <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+  3773556575U,	// <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+  2698487738U,	// <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+  1221397814U,	// <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  1221397815U,	// <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+  2636955750U,	// <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+  2330314217U,	// <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2636957626U,	// <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+  2287184230U,	// <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636959030U,	// <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+  2648903448U,	// <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+  1256575800U,	// <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135056694U,	// <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135056695U,	// <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+  3710705766U,	// <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+  3698762677U,	// <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+  3710707389U,	// <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+  3710708071U,	// <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+  3710709046U,	// <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+  2698489142U,	// <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  3796782457U,	// <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+  2295156022U,	// <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  2295156023U,	// <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+  3303870753U,	// <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+  3788820134U,	// <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+  3779530520U,	// <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+  3303871026U,	// <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+  3303871117U,	// <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+  3791474666U,	// <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+  3792138299U,	// <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+  2290519350U,	// <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2290519351U,	// <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2631008358U,	// <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+  3372893673U,	// <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+  2791445264U,	// <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+  2230800968U,	// <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+  2631011638U,	// <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+  3372894001U,	// <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+  2793362232U,	// <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+  2295835958U,	// <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2295835959U,	// <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2793362254U,	// <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+  2792035160U,	// <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+  2792108897U,	// <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+  2769474408U,	// <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+  2793362294U,	// <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+  3371575089U,	// <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+  2792403845U,	// <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+  2297834806U,	// <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2297834807U,	// <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2636996710U,	// <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+  2698491694U,	// <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  2636998631U,	// <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+  2282580326U,	// <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636999990U,	// <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+  2698492058U,	// <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  1256616760U,	// <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135097654U,	// <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135097655U,	// <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+  2666864742U,	// <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+  1719620602U,	// <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+  3768254637U,	// <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+  3393417722U,	// <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+  2666868022U,	// <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+  3867104290U,	// <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+  3728667127U,	// <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+  2666869817U,	// <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+  1720136761U,	// <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+  3728670822U,	// <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+  3774227252U,	// <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+  3774227350U,	// <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+  2323001850U,	// <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  3728674102U,	// <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+  3774227567U,	// <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+  2694513880U,	// <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+  3396744002U,	// <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+  2323001850U,	// <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  2654937190U,	// <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+  3728679732U,	// <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+  2700486248U,	// <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+  2321682938U,	// <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+  2654940470U,	// <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+  3859584196U,	// <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+  2700486577U,	// <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+  2228033132U,	// <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+  2701813843U,	// <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+  1581203558U,	// <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  2654946100U,	// <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+  2637031354U,	// <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+  1256575482U,	// <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581206838U,	// <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+  2654949380U,	// <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+  1581208058U,	// <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+  1256575810U,	// <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581209390U,	// <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  3728695398U,	// <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+  3869758782U,	// <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+  3728696936U,	// <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+  3393450490U,	// <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+  3728698678U,	// <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+  2700487990U,	// <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3728699899U,	// <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+  3867104626U,	// <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+  2700488233U,	// <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3855160709U,	// <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+  3728704406U,	// <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+  3370233956U,	// <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+  2320380410U,	// <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  3728706870U,	// <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+  3867104694U,	// <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+  3792146492U,	// <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+  3394122562U,	// <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+  2320380410U,	// <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  2230801402U,	// <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+  3768258984U,	// <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+  2730349050U,	// <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  3372894575U,	// <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+  2230801766U,	// <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+  3304543670U,	// <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+  3728716285U,	// <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+  2230802028U,	// <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+  2730349050U,	// <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  2793362983U,	// <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+  3728721112U,	// <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+  3371574933U,	// <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+  2327695866U,	// <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+  3728723254U,	// <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+  3371574855U,	// <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+  2730350062U,	// <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+  2793363052U,	// <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+  2798671471U,	// <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+  1581244518U,	// <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1724929666U,	// <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+  2637072314U,	// <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+  1256616442U,	// <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581247798U,	// <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+  2700490906U,	// <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  1581249023U,	// <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+  1256616770U,	// <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581250350U,	// <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1611489280U,	// <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537747563U,	// <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685231277U,	// <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685231356U,	// <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611489618U,	// <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2226763930U,	// <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  2733007350U,	// <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660971737U,	// <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+  537748125U,	// <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689876708U,	// <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611490100U,	// <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611490198U,	// <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  2293137564U,	// <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  2689877072U,	// <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689877103U,	// <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689877199U,	// <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2293140808U,	// <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  1616135548U,	// <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  1556938854U,	// <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1154291502U,	// <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  336380006U,	// <2,u,2,2>: Cost 1 vspltisw2 LHS
+  1611490982U,	// <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  1556942134U,	// <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1154291866U,	// <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  1611491258U,	// <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1221397832U,	// <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  336380006U,	// <2,u,2,u>: Cost 1 vspltisw2 LHS
+  1611491478U,	// <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+  1213440073U,	// <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+  1213442261U,	// <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135053468U,	// <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+  1611491842U,	// <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+  1213440401U,	// <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+  1213442589U,	// <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135056712U,	// <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135053473U,	// <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1551425638U,	// <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+  1551426503U,	// <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+  2625169000U,	// <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+  2625169558U,	// <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+  1551428918U,	// <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+  537750838U,	// <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2733010297U,	// <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2295156040U,	// <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  537751081U,	// <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689879624U,	// <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2230130478U,	// <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2631149217U,	// <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+  2290516124U,	// <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2689879988U,	// <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659269124U,	// <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1691162778U,	// <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2290519368U,	// <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  1691162796U,	// <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230802131U,	// <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+  1157060398U,	// <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659269626U,	// <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2764904656U,	// <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+  2230802495U,	// <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+  1157060762U,	// <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  1659269944U,	// <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659269966U,	// <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1157060965U,	// <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659270138U,	// <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2727040090U,	// <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+  2727703723U,	// <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+  2297831580U,	// <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1659270502U,	// <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2733012406U,	// <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2730358255U,	// <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+  1659270764U,	// <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659270786U,	// <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1213481923U,	// <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+  537753390U,	// <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  336380006U,	// <2,u,u,2>: Cost 1 vspltisw2 LHS
+  135094428U,	// <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+  1213481927U,	// <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+  537753754U,	// <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1208838685U,	// <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135097672U,	// <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+  135094433U,	// <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+  1678557184U,	// <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1678557194U,	// <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2631181989U,	// <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+  2289223984U,	// <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+  2756943909U,	// <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+  3362965729U,	// <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+  3362966054U,	// <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+  2289224312U,	// <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+  1683202121U,	// <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+  1557446758U,	// <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+  2752741467U,	// <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+  604815462U,	// <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2631190676U,	// <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+  1557450038U,	// <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+  2667024388U,	// <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+  2800074894U,	// <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+  2661053667U,	// <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+  604815516U,	// <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696521165U,	// <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+  2752741549U,	// <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+  2691876456U,	// <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+  2691876518U,	// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+  3830685895U,	// <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+  3765618536U,	// <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+  2691876794U,	// <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+  2701166596U,	// <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+  2756944108U,	// <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+  2691877014U,	// <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+  1161003110U,	// <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2691877168U,	// <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+  2691877246U,	// <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+  2691877378U,	// <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+  3765619238U,	// <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+  2691877496U,	// <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  3368962680U,	// <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+  1161003677U,	// <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2289254400U,	// <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+  1678557522U,	// <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2631214761U,	// <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+  2235580672U,	// <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  2756944237U,	// <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+  1618136374U,	// <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  3309322742U,	// <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+  3362998904U,	// <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+  1683202449U,	// <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  3765620296U,	// <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+  2752299427U,	// <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+  3789508346U,	// <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+  3403486842U,	// <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+  3765620660U,	// <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+  2733682692U,	// <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+  2800075218U,	// <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+  3873817044U,	// <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+  2800075234U,	// <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+  2752299501U,	// <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+  2236547174U,	// <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2733683194U,	// <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+  3844473352U,	// <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+  3310289234U,	// <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+  3873817114U,	// <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+  2733683512U,	// <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+  2725057384U,	// <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+  2236547741U,	// <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2297905152U,	// <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+  2297906854U,	// <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+  2727711916U,	// <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+  3371649328U,	// <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+  2733684070U,	// <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+  3734843490U,	// <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+  3798799895U,	// <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+  2733684332U,	// <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+  2297906861U,	// <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+  1557504102U,	// <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+  1678557842U,	// <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+  604816029U,	// <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2691880892U,	// <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+  1557507382U,	// <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+  1618139290U,	// <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  2691881168U,	// <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+  2661111018U,	// <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+  604816083U,	// <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2619310332U,	// <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+  2756944612U,	// <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+  2289221724U,	// <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+  2619312278U,	// <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+  2619313462U,	// <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+  2289221970U,	// <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+  2232599768U,	// <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+  3362964687U,	// <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+  2619316014U,	// <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+  2756944683U,	// <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+  1678558004U,	// <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2691883927U,	// <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+  3826631496U,	// <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+  2756944723U,	// <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+  2756944732U,	// <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  3830686561U,	// <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+  3734869228U,	// <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+  1678558004U,	// <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2696529358U,	// <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+  2756944775U,	// <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  2294548630U,	// <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+  1678558102U,	// <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+  2631273782U,	// <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+  2756944811U,	// <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  3830686644U,	// <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+  2800075706U,	// <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+  1679000515U,	// <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+  2619334911U,	// <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+  2295218186U,	// <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+  2293229718U,	// <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+  2619337116U,	// <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+  2619338038U,	// <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+  2295218514U,	// <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+  3830686729U,	// <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+  3368961231U,	// <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+  2619340590U,	// <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+  2619343104U,	// <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+  2289254410U,	// <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+  2289256598U,	// <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+  2619345410U,	// <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+  2619346230U,	// <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+  2756944976U,	// <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+  3362996401U,	// <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+  3362997455U,	// <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+  2619348782U,	// <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+  2756945007U,	// <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+  3830686840U,	// <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+  3358361750U,	// <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+  3830686857U,	// <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+  2756945047U,	// <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+  2294571346U,	// <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+  3806105698U,	// <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+  3873817774U,	// <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+  2756945079U,	// <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+  3830686912U,	// <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+  2756945103U,	// <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2236547990U,	// <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+  3826631905U,	// <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+  3830686952U,	// <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+  2756945139U,	// <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  3830686972U,	// <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+  2800076030U,	// <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+  2756945166U,	// <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+  3699081318U,	// <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+  2297905162U,	// <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+  2297907350U,	// <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+  3365675182U,	// <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+  3699084598U,	// <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+  2297905490U,	// <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+  2297905329U,	// <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  3368330447U,	// <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+  2297905169U,	// <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+  2619375876U,	// <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+  1678558004U,	// <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2289289366U,	// <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+  1679000956U,	// <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+  2619378998U,	// <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+  2756945297U,	// <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+  2297905329U,	// <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  2800076192U,	// <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+  1683203497U,	// <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+  3362964203U,	// <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+  2289222380U,	// <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+  2289222462U,	// <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+  1215479910U,	// <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3362964207U,	// <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+  2289222708U,	// <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2232600506U,	// <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+  3396142296U,	// <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+  1215479915U,	// <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3699105894U,	// <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+  3765633844U,	// <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+  2691892120U,	// <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+  2752300575U,	// <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+  3699109174U,	// <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+  3830687280U,	// <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+  3830687289U,	// <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+  3874260548U,	// <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+  2752742988U,	// <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+  2631344230U,	// <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+  2697201184U,	// <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+  1678558824U,	// <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678558834U,	// <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  2631347510U,	// <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+  3368953613U,	// <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+  2234304442U,	// <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+  3368953777U,	// <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+  1679001247U,	// <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+  1678558886U,	// <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+  2752300719U,	// <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+  2752300729U,	// <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+  1221476454U,	// <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+  1678558926U,	// <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+  2800076503U,	// <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+  2234746810U,	// <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+  2800076516U,	// <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+  1678558958U,	// <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+  3699130470U,	// <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+  3362996972U,	// <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+  2289256040U,	// <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+  1215512678U,	// <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3362998676U,	// <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+  2691894582U,	// <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2235582394U,	// <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+  3734967544U,	// <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+  1215512683U,	// <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3705110630U,	// <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+  3368313985U,	// <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+  3368314472U,	// <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+  2756945768U,	// <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+  3705113910U,	// <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+  3310061416U,	// <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+  3310135226U,	// <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+  3370305457U,	// <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+  2752743317U,	// <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+  2631376998U,	// <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+  3705119540U,	// <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+  2631378621U,	// <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+  1678559162U,	// <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2631380278U,	// <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+  3370976956U,	// <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+  2237065146U,	// <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+  3798815594U,	// <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+  1679001575U,	// <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  2800076778U,	// <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+  3371647724U,	// <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+  2297906792U,	// <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+  1224163430U,	// <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  3705130294U,	// <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+  3371648052U,	// <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+  2297906877U,	// <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+  3371648702U,	// <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+  1224163435U,	// <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1679001659U,	// <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+  2752743492U,	// <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+  1678558824U,	// <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678559320U,	// <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+  1679001699U,	// <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+  2691897498U,	// <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2237908922U,	// <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+  2800519289U,	// <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+  1679001731U,	// <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+  1215480726U,	// <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1678559382U,	// <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+  2631403200U,	// <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+  2289223282U,	// <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+  2752301232U,	// <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+  3362965027U,	// <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+  3362965352U,	// <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+  2289223610U,	// <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+  1678559445U,	// <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+  3830687964U,	// <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+  2752301286U,	// <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+  2752301297U,	// <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+  2305157532U,	// <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+  3830688000U,	// <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+  3830688009U,	// <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+  3830688019U,	// <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+  3362973626U,	// <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+  2752743719U,	// <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+  2631417958U,	// <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+  3826043193U,	// <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+  1624131186U,	// <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+  2752301384U,	// <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+  2631421238U,	// <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+  3826485602U,	// <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+  2752301414U,	// <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+  2771249519U,	// <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+  1628112984U,	// <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+  1563656294U,	// <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+  2301855911U,	// <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+  2697873730U,	// <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+  403488870U,	// <3,3,3,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  2301856239U,	// <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+  2697874067U,	// <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+  2295220154U,	// <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+  403488870U,	// <3,3,3,u>: Cost 1 vspltisw3 LHS
+  2289255318U,	// <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+  2631435162U,	// <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+  2631435972U,	// <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+  2289256050U,	// <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+  1215513498U,	// <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679002114U,	// <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+  3362998120U,	// <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+  2289256378U,	// <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+  1679002141U,	// <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+  3831130657U,	// <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+  3376277671U,	// <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+  3771617012U,	// <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+  2302536092U,	// <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+  3831130697U,	// <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+  2294572579U,	// <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+  2800519773U,	// <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+  3368314810U,	// <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+  2800519791U,	// <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+  2800077432U,	// <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+  3310291185U,	// <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+  2789165706U,	// <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+  2764982931U,	// <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+  2800077468U,	// <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+  3873819301U,	// <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+  2297235304U,	// <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+  2725081963U,	// <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+  2725745596U,	// <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+  2631458918U,	// <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+  3705201460U,	// <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+  2631460551U,	// <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+  2297906802U,	// <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+  2631462198U,	// <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+  3371648547U,	// <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+  3371648548U,	// <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+  1224165306U,	// <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1224165306U,	// <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1215480726U,	// <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1679002398U,	// <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+  1659967368U,	// <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+  403488870U,	// <3,3,u,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  1679002438U,	// <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+  2756946764U,	// <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+  1224165306U,	// <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  403488870U,	// <3,3,u,u>: Cost 1 vspltisw3 LHS
+  2691907584U,	// <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+  1618165862U,	// <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631476937U,	// <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+  2232601732U,	// <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+  2691907922U,	// <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+  1158860086U,	// <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  3306343806U,	// <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+  3366947484U,	// <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+  1618166429U,	// <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631483494U,	// <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+  2691908404U,	// <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+  1618166682U,	// <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+  3765650393U,	// <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+  2631486774U,	// <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+  2756946914U,	// <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+  3765650639U,	// <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+  3735090439U,	// <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+  1622148480U,	// <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+  3765650893U,	// <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+  3831131154U,	// <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+  2691909224U,	// <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+  2691909286U,	// <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+  2699208469U,	// <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+  2233863478U,	// <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+  2691909562U,	// <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+  2701199368U,	// <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+  2691909691U,	// <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+  2691909782U,	// <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+  3765651686U,	// <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+  2691909972U,	// <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+  2691910044U,	// <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+  1161006390U,	// <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691910300U,	// <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  3368962716U,	// <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+  1161006633U,	// <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2631508070U,	// <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+  2631508890U,	// <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+  2631509709U,	// <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+  2289256788U,	// <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+  1726336208U,	// <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+  1618169142U,	// <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  3362998858U,	// <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+  2289257116U,	// <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+  1618169385U,	// <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  1557774438U,	// <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+  2631516980U,	// <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+  1557776078U,	// <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+  2631518358U,	// <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+  1557777718U,	// <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+  2296563406U,	// <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+  604818742U,	// <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2661381387U,	// <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+  604818760U,	// <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  3705266278U,	// <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+  3831131482U,	// <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+  2733715962U,	// <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+  3844771180U,	// <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+  2800078197U,	// <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+  2236550454U,	// <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716280U,	// <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+  2725090156U,	// <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+  2236550697U,	// <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716474U,	// <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+  3371647013U,	// <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+  2727744688U,	// <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+  3371649364U,	// <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+  2733716838U,	// <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+  2297906894U,	// <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+  3371647180U,	// <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+  2733717100U,	// <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+  2297906897U,	// <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+  1557799014U,	// <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+  1618171694U,	// <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  1557800657U,	// <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+  2691913660U,	// <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+  1557802294U,	// <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+  1618172058U,	// <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  604818985U,	// <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2661405966U,	// <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+  604819003U,	// <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2643492966U,	// <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+  2756947528U,	// <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+  2331029019U,	// <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2643495062U,	// <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+  2756947554U,	// <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+  2800078443U,	// <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+  2289224194U,	// <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+  3362964723U,	// <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+  2756947590U,	// <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+  2800078479U,	// <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+  2333027218U,	// <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+  2691916699U,	// <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+  3832901294U,	// <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+  2800078519U,	// <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+  3830689467U,	// <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+  3830689481U,	// <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+  3873820365U,	// <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+  2800078551U,	// <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+  3770967487U,	// <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+  2697225763U,	// <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+  3830689523U,	// <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+  2699216590U,	// <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+  2699216662U,	// <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+  2783047439U,	// <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+  2783121176U,	// <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+  3856936737U,	// <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+  2701871194U,	// <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+  2643517542U,	// <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+  2331052946U,	// <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+  3699345010U,	// <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+  2705189276U,	// <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+  2705189359U,	// <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+  2331053274U,	// <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+  2295220738U,	// <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+  3368961267U,	// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+  2295220740U,	// <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+  2643525734U,	// <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+  2331061138U,	// <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+  2235584280U,	// <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+  2643528194U,	// <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+  2735713498U,	// <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+  2756947892U,	// <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+  2289256962U,	// <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+  3362997491U,	// <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+  2756947919U,	// <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+  2800078803U,	// <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+  2800078812U,	// <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+  2631591639U,	// <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+  3832901616U,	// <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+  2800078843U,	// <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+  1726337028U,	// <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078862U,	// <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+  3368314099U,	// <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+  1726337028U,	// <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078884U,	// <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+  2800078899U,	// <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+  2631599832U,	// <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+  2800078914U,	// <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+  2800078924U,	// <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+  2800078935U,	// <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+  2297235970U,	// <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+  1726337122U,	// <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+  1726337131U,	// <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+  3699376230U,	// <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+  2333739922U,	// <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+  3699378106U,	// <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+  3371647915U,	// <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+  3699379510U,	// <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+  2333740250U,	// <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+  2297907714U,	// <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+  3370984691U,	// <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+  2297907716U,	// <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+  2800079046U,	// <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+  2756948176U,	// <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+  2331029019U,	// <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2800079076U,	// <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+  2800079085U,	// <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+  1726337028U,	// <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2289289730U,	// <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+  1726337284U,	// <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+  1726337293U,	// <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+  3773628416U,	// <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+  2699886694U,	// <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789167401U,	// <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+  3362965862U,	// <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+  3773628754U,	// <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+  3723284326U,	// <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+  2800079181U,	// <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+  1215483190U,	// <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1215483191U,	// <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+  3873821032U,	// <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+  3773629236U,	// <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+  2691924892U,	// <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+  3830690184U,	// <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+  3873821072U,	// <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+  3873821082U,	// <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+  3403453240U,	// <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+  2289233206U,	// <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2289233207U,	// <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2661498982U,	// <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+  3770975780U,	// <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+  2631640797U,	// <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+  3771639485U,	// <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+  2661502262U,	// <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+  2699888488U,	// <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+  2661503482U,	// <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+  1715425786U,	// <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+  1715499523U,	// <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+  3773630614U,	// <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+  3372942825U,	// <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+  2234749434U,	// <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+  3368962406U,	// <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+  2699889154U,	// <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+  3773631068U,	// <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+  2331054904U,	// <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+  1221479734U,	// <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  1221479735U,	// <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+  2235584801U,	// <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+  3717342106U,	// <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+  2789167729U,	// <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+  2235585074U,	// <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+  2235585165U,	// <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+  2699889974U,	// <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  2800079509U,	// <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+  1215515958U,	// <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1215515959U,	// <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+  3873821356U,	// <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+  3372959209U,	// <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+  3862909629U,	// <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+  3773632358U,	// <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+  3873821396U,	// <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+  3873821405U,	// <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+  3862909672U,	// <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+  2294574390U,	// <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2294574391U,	// <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2800079613U,	// <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+  3873821446U,	// <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+  2789167888U,	// <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+  3844920090U,	// <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+  2800079653U,	// <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+  3723333484U,	// <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+  1726337848U,	// <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726337858U,	// <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+  1726337867U,	// <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+  1726337870U,	// <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+  2297906665U,	// <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+  2792117090U,	// <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+  2297907558U,	// <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+  1726337910U,	// <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+  2297906993U,	// <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+  2297906832U,	// <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+  1224166710U,	// <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224166711U,	// <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1726337951U,	// <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+  2699892526U,	// <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789168049U,	// <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+  2792854460U,	// <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+  1726337991U,	// <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+  2699892890U,	// <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  1726337848U,	// <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1215548726U,	// <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  1215548727U,	// <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+  2700558336U,	// <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+  1626816614U,	// <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700558513U,	// <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+  2331030010U,	// <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+  2700558674U,	// <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+  2800079906U,	// <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+  2655588936U,	// <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+  2800079919U,	// <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+  1626817181U,	// <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  3774300899U,	// <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+  2700559156U,	// <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+  2700559254U,	// <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+  3774301148U,	// <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+  3774301227U,	// <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+  3774301295U,	// <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+  3768329441U,	// <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+  3403453250U,	// <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+  2700559740U,	// <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+  2700559849U,	// <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+  3770983973U,	// <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+  2700559976U,	// <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+  2698569415U,	// <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+  2700560177U,	// <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+  3773638505U,	// <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+  1626818490U,	// <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+  2795140307U,	// <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+  1628145756U,	// <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+  2700560534U,	// <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+  3774302438U,	// <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+  2700560742U,	// <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+  2700560796U,	// <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+  2700560898U,	// <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+  3774302821U,	// <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+  2700561079U,	// <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+  2700561091U,	// <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+  2700561182U,	// <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+  2655617126U,	// <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+  3774303178U,	// <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+  2655619002U,	// <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+  2331062778U,	// <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+  2655620406U,	// <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+  1626819894U,	// <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  2655621708U,	// <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+  2800080247U,	// <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+  1626820137U,	// <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  3774303816U,	// <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+  3873822093U,	// <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+  3774303998U,	// <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+  3862910368U,	// <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+  3774304180U,	// <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+  2800080310U,	// <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+  2800080321U,	// <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+  3873822147U,	// <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+  2800080339U,	// <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+  2800080348U,	// <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+  3873822181U,	// <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+  2789168622U,	// <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+  2700563016U,	// <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+  2800080384U,	// <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+  3862910472U,	// <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+  2700563256U,	// <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+  2800080404U,	// <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+  2793149988U,	// <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+  2637725798U,	// <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+  3371649227U,	// <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+  2637727674U,	// <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+  2297907567U,	// <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+  2637729078U,	// <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+  3371649312U,	// <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+  2655646287U,	// <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+  1726338668U,	// <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1726338668U,	// <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  2700564179U,	// <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+  1626822446U,	// <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700564357U,	// <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+  2700564412U,	// <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+  2700564543U,	// <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+  1626822810U,	// <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  1662654672U,	// <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+  1726338668U,	// <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1626823013U,	// <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  1678557184U,	// <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1679005395U,	// <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+  2289221787U,	// <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+  1215479964U,	// <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  2752747245U,	// <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+  1158863002U,	// <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  2289224221U,	// <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+  1215483208U,	// <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1679005458U,	// <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+  1558036582U,	// <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+  1678558004U,	// <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  604821294U,	// <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2752747317U,	// <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+  1558039862U,	// <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+  2756949830U,	// <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+  2800080726U,	// <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+  2289233224U,	// <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  604821348U,	// <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696586709U,	// <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+  2757392246U,	// <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+  1624172151U,	// <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+  1679005576U,	// <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+  2631789878U,	// <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+  2699904874U,	// <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+  1626826683U,	// <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+  1726338988U,	// <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+  1683208117U,	// <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+  1679005628U,	// <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+  1161008942U,	// <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2752747471U,	// <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+  403488870U,	// <3,u,3,3>: Cost 1 vspltisw3 LHS
+  1679005668U,	// <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+  1161009306U,	// <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691943104U,	// <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+  1221479752U,	// <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  403488870U,	// <3,u,3,u>: Cost 1 vspltisw3 LHS
+  2289255363U,	// <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+  1161844526U,	// <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+  2289256661U,	// <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+  1215512732U,	// <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  1215513498U,	// <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679005759U,	// <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+  2289256989U,	// <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+  1215515976U,	// <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1679005786U,	// <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+  1558069350U,	// <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+  2631811892U,	// <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+  1558071026U,	// <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+  2752747646U,	// <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+  1558072630U,	// <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+  1726337028U,	// <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  604821658U,	// <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2294574408U,	// <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  604821676U,	// <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2631819366U,	// <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+  2757392574U,	// <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+  2631821043U,	// <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+  1679005904U,	// <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  2631822646U,	// <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+  2236553370U,	// <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  1726337848U,	// <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726339309U,	// <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+  1683208445U,	// <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+  1726339328U,	// <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+  2297905225U,	// <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+  2631829236U,	// <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+  1224163484U,	// <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1726339368U,	// <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+  2297905553U,	// <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+  2297905392U,	// <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+  1224166728U,	// <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224163489U,	// <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1683208529U,	// <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+  1679006043U,	// <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+  604821861U,	// <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  403488870U,	// <3,u,u,3>: Cost 1 vspltisw3 LHS
+  1683208569U,	// <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+  1679006083U,	// <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+  604821901U,	// <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  1215548744U,	// <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  604821915U,	// <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2759016448U,	// <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+  1165115494U,	// <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+  3717531337U,	// <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+  3369675785U,	// <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+  2751791144U,	// <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+  2238857630U,	// <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+  3312591341U,	// <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+  3369676113U,	// <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+  1165116061U,	// <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+  2637824102U,	// <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+  2637824922U,	// <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+  1685274726U,	// <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637826512U,	// <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+  2637827382U,	// <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+  2661716070U,	// <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+  3729486427U,	// <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+  2661717300U,	// <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+  1685274780U,	// <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  3711574118U,	// <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+  2240200806U,	// <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  3771663992U,	// <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+  2698585801U,	// <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+  3373672105U,	// <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+  3810813795U,	// <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+  3772327866U,	// <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+  3386280568U,	// <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+  2701903966U,	// <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+  3699638374U,	// <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+  2753560832U,	// <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  3772328276U,	// <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+  3827302674U,	// <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+  3699641654U,	// <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+  3779627588U,	// <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+  3772328604U,	// <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+  3780954854U,	// <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+  2753560832U,	// <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  2725129106U,	// <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+  1167720550U,	// <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  3839172953U,	// <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+  3772329051U,	// <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+  2241462610U,	// <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+  2698587446U,	// <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  3772329297U,	// <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+  3735483703U,	// <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+  1167721117U,	// <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+  1168556032U,	// <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  94814310U,	// <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2242298029U,	// <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2637859284U,	// <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+  1168556370U,	// <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2242306530U,	// <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+  2242298358U,	// <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661750072U,	// <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+  94814877U,	// <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+  3316580362U,	// <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+  2242846822U,	// <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3798872570U,	// <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+  3796218413U,	// <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+  3834528273U,	// <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+  3798872811U,	// <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+  3316621876U,	// <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+  2725131121U,	// <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+  2242847389U,	// <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3377692672U,	// <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+  2243493990U,	// <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  3775648970U,	// <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+  3802191110U,	// <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+  3317236050U,	// <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+  3803518376U,	// <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+  3317236214U,	// <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+  3798873708U,	// <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+  2243494557U,	// <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+  1170546688U,	// <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  96804966U,	// <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+  1685275293U,	// <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637883863U,	// <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+  1170547026U,	// <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2698590362U,	// <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  2244289014U,	// <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661774651U,	// <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+  96805533U,	// <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+  2667749478U,	// <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+  2689966182U,	// <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+  2238571418U,	// <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+  3711633880U,	// <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+  2689966418U,	// <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+  3361046866U,	// <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+  3741495802U,	// <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+  3741496314U,	// <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+  2689966765U,	// <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+  3764372222U,	// <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+  2758206263U,	// <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+  2698593178U,	// <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+  3361057810U,	// <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+  3827303250U,	// <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+  2287313234U,	// <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+  3763709171U,	// <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+  3361058138U,	// <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+  2239759744U,	// <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+  2637906022U,	// <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+  2637906842U,	// <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+  3763709544U,	// <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+  1685275546U,	// <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+  2637909302U,	// <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+  3361063250U,	// <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+  3763709882U,	// <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+  3735541054U,	// <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+  1685644231U,	// <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+  2702575792U,	// <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+  3832759257U,	// <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+  3833349090U,	// <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+  3763710364U,	// <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+  2707884546U,	// <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+  3361071442U,	// <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+  3772336796U,	// <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+  3775654595U,	// <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+  2707884856U,	// <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+  2667782246U,	// <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+  2241463092U,	// <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+  2241553306U,	// <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+  3827303484U,	// <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+  2667785424U,	// <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+  2689969462U,	// <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  3763711322U,	// <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+  3867116636U,	// <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+  2689969705U,	// <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  1546273106U,	// <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+  1168556852U,	// <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1168556950U,	// <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2620016790U,	// <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+  1546276150U,	// <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+  2620018692U,	// <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+  2242299087U,	// <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667795450U,	// <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+  1546278702U,	// <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+  3781628193U,	// <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+  3832759503U,	// <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+  3316261786U,	// <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+  3781628466U,	// <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+  3827303658U,	// <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+  3361096018U,	// <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+  3788264248U,	// <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+  3788264270U,	// <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+  3832759566U,	// <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+  2726466580U,	// <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+  3377692682U,	// <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+  3377694870U,	// <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+  3802199303U,	// <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+  2731775334U,	// <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+  3377693010U,	// <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+  3365749804U,	// <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+  3788265068U,	// <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+  2731775644U,	// <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+  1546297685U,	// <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+  1170547508U,	// <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1170547606U,	// <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  1689257344U,	// <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+  1546300726U,	// <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+  2284716370U,	// <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+  2244289743U,	// <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667820026U,	// <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+  1546303278U,	// <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+  3729621094U,	// <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+  3763716198U,	// <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+  2238858856U,	// <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+  2295930982U,	// <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3763716434U,	// <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+  2238859107U,	// <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+  2238859194U,	// <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+  3312601066U,	// <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+  2295930987U,	// <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3699769446U,	// <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+  3313255971U,	// <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+  3361056360U,	// <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+  2287312998U,	// <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3788932148U,	// <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+  3313256290U,	// <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+  3838289469U,	// <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+  3369682865U,	// <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+  2287313003U,	// <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3838658133U,	// <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+  3711722394U,	// <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+  2759018088U,	// <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+  2759018098U,	// <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+  3838658168U,	// <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+  3369027341U,	// <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+  2240227258U,	// <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+  3735614791U,	// <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+  2759018143U,	// <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+  2759018150U,	// <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+  3831948975U,	// <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+  3832759993U,	// <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+  2759018180U,	// <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+  2759018185U,	// <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+  3839542998U,	// <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+  3314640826U,	// <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+  2765948648U,	// <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+  2759018222U,	// <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+  3838658295U,	// <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+  3315205667U,	// <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+  2241463912U,	// <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+  1234829414U,	// <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2241464085U,	// <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+  2241546087U,	// <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+  2241464250U,	// <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+  3741602873U,	// <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+  1234829419U,	// <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2626060390U,	// <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+  2626061364U,	// <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+  1168557672U,	// <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222230118U,	// <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  2626063670U,	// <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+  2242299752U,	// <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1168558010U,	// <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2242299882U,	// <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1222230123U,	// <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+  3711754342U,	// <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+  3711755162U,	// <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+  3838658481U,	// <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+  2759018426U,	// <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+  3838658499U,	// <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+  3735646310U,	// <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+  3316590522U,	// <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+  3798889331U,	// <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+  2759018471U,	// <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+  3874564074U,	// <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+  3800880230U,	// <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+  3371722344U,	// <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+  2303950950U,	// <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  3371722346U,	// <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+  3371722509U,	// <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+  3317237690U,	// <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+  3317237738U,	// <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+  2303950955U,	// <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2759018555U,	// <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+  2626085943U,	// <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+  1170548328U,	// <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222254694U,	// <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2759018595U,	// <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+  2244290408U,	// <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1170548666U,	// <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2769266813U,	// <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+  1222254699U,	// <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2238859414U,	// <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+  2759018646U,	// <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+  3312314708U,	// <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+  2238859676U,	// <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+  2295931802U,	// <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+  3735670886U,	// <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+  3312315036U,	// <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+  3369674682U,	// <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+  2759018709U,	// <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+  3361055638U,	// <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+  3831949542U,	// <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+  2703917978U,	// <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3361056370U,	// <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+  2295939994U,	// <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+  3361056291U,	// <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+  3378972520U,	// <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+  3361056698U,	// <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+  2703917978U,	// <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3832760624U,	// <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+  3711796122U,	// <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+  3832760641U,	// <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+  2770962764U,	// <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+  2759018836U,	// <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+  3827304802U,	// <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+  3832760678U,	// <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+  3859597679U,	// <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+  2771331449U,	// <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+  2240841878U,	// <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+  3776997635U,	// <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+  2703919444U,	// <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+  2759018908U,	// <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+  2759018918U,	// <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+  3386951446U,	// <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+  3777661596U,	// <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+  3375007674U,	// <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+  2707901242U,	// <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+  2759018960U,	// <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+  2759018970U,	// <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+  2632099605U,	// <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+  2241464732U,	// <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+  2759019000U,	// <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+  2753563138U,	// <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+  3777662316U,	// <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+  2308573114U,	// <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+  2759019032U,	// <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+  1168558230U,	// <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2242300134U,	// <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+  2632107798U,	// <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+  1168558492U,	// <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1168558594U,	// <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2295973654U,	// <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+  2242300536U,	// <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295973818U,	// <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+  1168558878U,	// <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  3832760952U,	// <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+  3711828890U,	// <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+  3316484436U,	// <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+  3711830512U,	// <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+  2759019164U,	// <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3361097251U,	// <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+  3316624045U,	// <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+  2773912244U,	// <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+  2759019164U,	// <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3377693590U,	// <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+  3365751680U,	// <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+  2727810232U,	// <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+  3377694322U,	// <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+  2303951770U,	// <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+  3741700198U,	// <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+  3377695216U,	// <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+  3375703994U,	// <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+  2731792030U,	// <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+  1170548886U,	// <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2759019294U,	// <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+  2632132377U,	// <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+  1170549148U,	// <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1170549250U,	// <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2759019334U,	// <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+  2244291192U,	// <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295998394U,	// <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+  1170549534U,	// <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  1165118354U,	// <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1637482598U,	// <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  3711854285U,	// <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+  3827305344U,	// <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+  2711224658U,	// <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+  1165118774U,	// <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3312602489U,	// <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+  3369675420U,	// <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+  1165119017U,	// <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3369682633U,	// <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+  2287313581U,	// <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+  2759019466U,	// <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+  3369683284U,	// <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+  2311204048U,	// <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+  2239319350U,	// <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+  3784967411U,	// <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+  3369683612U,	// <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+  2763000832U,	// <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+  3711869030U,	// <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+  3711869850U,	// <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+  2240203830U,	// <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+  2698618573U,	// <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+  2711226133U,	// <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+  2240204086U,	// <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2711226298U,	// <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+  3832761416U,	// <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+  2701936738U,	// <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+  2711226518U,	// <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+  3777005828U,	// <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+  3832761453U,	// <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+  2301266260U,	// <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  2705254903U,	// <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+  2240843062U,	// <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  3832761489U,	// <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+  3375008412U,	// <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+  2301266260U,	// <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  1570373734U,	// <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+  2308574089U,	// <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+  2644117096U,	// <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+  2638146039U,	// <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+  229035318U,	// <4,4,4,4>: Cost 1 vspltisw0 RHS
+  1167723830U,	// <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+  2644120058U,	// <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+  2662036827U,	// <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+  229035318U,	// <4,4,4,u>: Cost 1 vspltisw0 RHS
+  1168558994U,	// <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  2638152602U,	// <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+  2242300981U,	// <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638154232U,	// <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+  1168559322U,	// <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+  94817590U,	// <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+  1685278006U,	// <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2242309576U,	// <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  94817833U,	// <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+  3316591506U,	// <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+  3758428587U,	// <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+  2711228922U,	// <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+  3796251185U,	// <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+  2711229085U,	// <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+  2242850102U,	// <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2242850169U,	// <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+  2725163893U,	// <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+  2242850345U,	// <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2711229434U,	// <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+  3377694410U,	// <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+  3868593584U,	// <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+  3377695060U,	// <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+  2729145691U,	// <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+  2243497270U,	// <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  3871542744U,	// <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+  2303953564U,	// <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+  2243497513U,	// <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+  1170549650U,	// <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  1637488430U,	// <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  2244291637U,	// <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638178811U,	// <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+  229035318U,	// <4,4,u,4>: Cost 1 vspltisw0 RHS
+  96808246U,	// <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+  1685278249U,	// <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2244292040U,	// <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  96808489U,	// <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+  2698625024U,	// <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+  1624883302U,	// <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2638186190U,	// <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+  2638187004U,	// <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+  2687345005U,	// <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+  2238861316U,	// <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+  2662077302U,	// <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+  2662077792U,	// <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+  1624883869U,	// <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  3361057762U,	// <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+  2691326803U,	// <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+  2698625942U,	// <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+  3361055659U,	// <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+  3761087567U,	// <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+  2693981335U,	// <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+  2305231362U,	// <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  3361055987U,	// <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+  2695972234U,	// <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+  2638200934U,	// <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+  3761088035U,	// <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+  2697963133U,	// <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+  1624884942U,	// <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+  2698626838U,	// <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+  3772368744U,	// <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+  2698627002U,	// <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+  3775023122U,	// <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+  1628203107U,	// <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+  2698627222U,	// <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+  3765070057U,	// <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+  2698627404U,	// <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+  2698627484U,	// <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+  2698627580U,	// <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+  3779668553U,	// <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+  2725169844U,	// <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+  2707253995U,	// <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+  2698627870U,	// <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+  2638217318U,	// <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+  2308574098U,	// <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+  2698628150U,	// <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+  2638219776U,	// <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+  2698628314U,	// <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+  1624886582U,	// <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  2698628478U,	// <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+  2662110564U,	// <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+  1624886825U,	// <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1570455654U,	// <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+  2312564250U,	// <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+  2644199118U,	// <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+  2295974966U,	// <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+  1570458842U,	// <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+  1168568324U,	// <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+  1168568418U,	// <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2295975294U,	// <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+  1168716036U,	// <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+  1564491878U,	// <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+  2626290768U,	// <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+  2632263465U,	// <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+  1564494338U,	// <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+  1564495158U,	// <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+  2638237464U,	// <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+  2656154253U,	// <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2725172218U,	// <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+  3859599489U,	// <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+  2698630320U,	// <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+  2728490251U,	// <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+  2725172576U,	// <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+  3317239812U,	// <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+  2725172760U,	// <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+  2725172844U,	// <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+  2725172866U,	// <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+  1564508262U,	// <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+  1624889134U,	// <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2698631045U,	// <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+  1564510724U,	// <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+  1564511542U,	// <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+  1624889498U,	// <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1170550882U,	// <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  3312595285U,	// <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+  3763748966U,	// <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+  2238861818U,	// <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+  3767730432U,	// <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+  3763749202U,	// <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+  2238862059U,	// <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+  2238862136U,	// <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+  2295934262U,	// <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  2295934263U,	// <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+  3378973999U,	// <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+  3378974648U,	// <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+  3779675034U,	// <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+  3378974002U,	// <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+  3378974003U,	// <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+  3767731352U,	// <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+  3378974734U,	// <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+  2287316278U,	// <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  2287316279U,	// <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+  3735904358U,	// <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+  3763750435U,	// <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+  3313938937U,	// <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+  3772376782U,	// <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+  3852890591U,	// <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+  3735908454U,	// <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+  3801573306U,	// <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+  2785858042U,	// <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+  2785858051U,	// <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+  3863065101U,	// <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+  3314586024U,	// <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+  3863212575U,	// <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+  3863286312U,	// <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+  3767732738U,	// <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+  3779676746U,	// <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+  3398898488U,	// <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+  2301267254U,	// <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2301267255U,	// <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+  3852890715U,	// <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+  3315208615U,	// <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+  2241466874U,	// <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+  3852890745U,	// <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+  2241467037U,	// <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+  2241549039U,	// <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+  2241467192U,	// <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+  1234832694U,	// <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  1234832695U,	// <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+  2242302241U,	// <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2242310567U,	// <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1168568826U,	// <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2242302514U,	// <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2242302605U,	// <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2242310891U,	// <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1168569144U,	// <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222233398U,	// <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  1222233399U,	// <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+  3316576545U,	// <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+  3316584871U,	// <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+  2242851322U,	// <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+  3316601394U,	// <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+  3852890916U,	// <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+  3316617963U,	// <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+  2242884408U,	// <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+  2785858370U,	// <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+  2785858379U,	// <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+  2785858382U,	// <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+  3859600215U,	// <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+  3317240314U,	// <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+  2792199020U,	// <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+  2785858422U,	// <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+  3856651132U,	// <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+  3317240632U,	// <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+  2303954230U,	// <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303954231U,	// <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2244292897U,	// <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2244293031U,	// <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1170551290U,	// <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2244293170U,	// <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2244293261U,	// <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2244293355U,	// <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1170551608U,	// <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222257974U,	// <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+  1222257975U,	// <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+  2238862330U,	// <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+  2706604134U,	// <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3312604308U,	// <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+  3768402176U,	// <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+  2238862648U,	// <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+  3859600418U,	// <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+  3729994393U,	// <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+  2238862956U,	// <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+  2706604701U,	// <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3385610338U,	// <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+  3780346676U,	// <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+  2706604954U,	// <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3385610746U,	// <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+  3385610342U,	// <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+  3385610667U,	// <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+  3768403178U,	// <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+  3385611074U,	// <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+  2706604954U,	// <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3859600532U,	// <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+  3712091034U,	// <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+  3774375528U,	// <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+  2794853552U,	// <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+  2785858744U,	// <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+  3735982182U,	// <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+  3774375875U,	// <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+  3735983476U,	// <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+  2795222237U,	// <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+  3780348054U,	// <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+  3730015130U,	// <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+  3780348244U,	// <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+  3778357673U,	// <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+  2325155942U,	// <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+  3779684939U,	// <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+  2706606748U,	// <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+  3398898498U,	// <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+  2707934014U,	// <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+  2785858868U,	// <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+  3780348874U,	// <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+  3780349000U,	// <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+  2308575738U,	// <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+  2656283856U,	// <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+  2706607414U,	// <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2656285341U,	// <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+  2241468012U,	// <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+  2706607657U,	// <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  1168569338U,	// <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2242311242U,	// <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+  2242303178U,	// <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+  2242311395U,	// <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1168569702U,	// <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2242311606U,	// <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+  2242311662U,	// <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1168569964U,	// <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1168569986U,	// <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  3316593658U,	// <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+  3316593738U,	// <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+  3316634800U,	// <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+  3386978810U,	// <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+  2785859072U,	// <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+  3736014950U,	// <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+  3316594158U,	// <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+  2797803032U,	// <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+  2797876769U,	// <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+  2243499002U,	// <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+  3718103962U,	// <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+  3317257418U,	// <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+  3377695816U,	// <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+  2243532134U,	// <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+  3317282230U,	// <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+  2730497536U,	// <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+  2243556972U,	// <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+  2243565186U,	// <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+  1170551802U,	// <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2706609966U,	// <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  2244293797U,	// <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+  2244293859U,	// <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1170552166U,	// <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2706610330U,	// <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2244294126U,	// <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1170552428U,	// <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1170552450U,	// <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  1165118354U,	// <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1624907878U,	// <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638407377U,	// <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+  2295931036U,	// <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  2687369584U,	// <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+  1165121690U,	// <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  2662298489U,	// <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+  2295934280U,	// <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  1624908445U,	// <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638413926U,	// <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+  2691351382U,	// <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+  1685280558U,	// <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2287313052U,	// <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  2299257799U,	// <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+  2694005914U,	// <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+  2305231362U,	// <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  2287316296U,	// <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  1685280612U,	// <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2638422118U,	// <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+  2240206638U,	// <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  2697987712U,	// <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+  1624909521U,	// <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+  2759391121U,	// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+  2240207002U,	// <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2698651578U,	// <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+  2785859500U,	// <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+  1628227686U,	// <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+  2759022524U,	// <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+  2801342408U,	// <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+  2703960409U,	// <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+  2759022554U,	// <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+  2759022564U,	// <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+  2240845978U,	// <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  2706614941U,	// <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+  2301267272U,	// <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2759022596U,	// <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+  1570668646U,	// <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+  1167726382U,	// <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  2698652753U,	// <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+  1234829468U,	// <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  229035318U,	// <4,u,4,4>: Cost 1 vspltisw0 RHS
+  1624911158U,	// <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+  2698653081U,	// <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+  1234832712U,	// <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  229035318U,	// <4,u,4,u>: Cost 1 vspltisw0 RHS
+  1168561875U,	// <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+  94820142U,	// <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1168562053U,	// <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+  1222230172U,	// <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  1168562239U,	// <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+  94820506U,	// <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+  1685280922U,	// <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  1222233416U,	// <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  94820709U,	// <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+  1564713062U,	// <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+  2626511979U,	// <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+  2632484676U,	// <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+  1564715549U,	// <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+  1564716342U,	// <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+  2242853018U,	// <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2656375464U,	// <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2785859840U,	// <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+  2243499822U,	// <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  2727851197U,	// <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+  2303951004U,	// <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2785859880U,	// <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+  2243500186U,	// <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  2730505729U,	// <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+  2303954248U,	// <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303951009U,	// <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  1564729446U,	// <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+  96810798U,	// <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+  1685281125U,	// <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  1222254748U,	// <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  229035318U,	// <4,u,u,4>: Cost 1 vspltisw0 RHS
+  96811162U,	// <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+  1685281165U,	// <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2754232320U,	// <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+  2754232330U,	// <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+  3718194894U,	// <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+  3376385762U,	// <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+  2754232357U,	// <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+  3845816370U,	// <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+  3782353389U,	// <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+  3376386090U,	// <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+  2757402697U,	// <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+  2626543718U,	// <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+  2626544751U,	// <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+  1680490598U,	// <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3766428665U,	// <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+  2626546998U,	// <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+  2650435539U,	// <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+  3783017715U,	// <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+  3385019000U,	// <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+  1680490652U,	// <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3376398336U,	// <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+  2245877862U,	// <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  3773064808U,	// <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+  2705295054U,	// <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  3827974343U,	// <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+  3845816530U,	// <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+  3779037114U,	// <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+  3810887658U,	// <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+  2245878429U,	// <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2710603926U,	// <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+  3827974396U,	// <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+  3779037516U,	// <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+  3779037596U,	// <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+  2705295868U,	// <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+  3379726804U,	// <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+  3802925748U,	// <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+  3363138168U,	// <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+  2707950400U,	// <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+  2626568294U,	// <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+  1680490834U,	// <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  3828048219U,	// <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+  2710604932U,	// <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+  2754232685U,	// <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+  2705296694U,	// <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779038590U,	// <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+  2713259464U,	// <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1680490834U,	// <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  2311307264U,	// <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+  1174437990U,	// <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  3779038946U,	// <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+  3845816752U,	// <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+  2248180050U,	// <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+  2248180194U,	// <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+  3779039274U,	// <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+  3385051768U,	// <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+  1174438557U,	// <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2302689280U,	// <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+  1175208038U,	// <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  3787002362U,	// <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+  3376432160U,	// <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+  2248950098U,	// <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+  2248950180U,	// <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  3376433702U,	// <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+  2729186166U,	// <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+  1175208605U,	// <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2713261050U,	// <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+  3365823599U,	// <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+  3808900317U,	// <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+  3784348899U,	// <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+  2729186656U,	// <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+  3787003268U,	// <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+  3802928664U,	// <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+  3787003431U,	// <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+  2731841188U,	// <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+  2626601062U,	// <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+  1683145366U,	// <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+  1680491165U,	// <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2705295054U,	// <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  2754233005U,	// <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+  2705299610U,	// <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779041488U,	// <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+  2737150252U,	// <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+  1680491219U,	// <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2713927680U,	// <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+  1640185958U,	// <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2310607866U,	// <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  3787669756U,	// <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+  2713928018U,	// <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+  2306621778U,	// <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+  3787670006U,	// <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+  3736188301U,	// <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+  1640186525U,	// <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2650505318U,	// <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+  2754233140U,	// <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+  2311276694U,	// <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+  2311278315U,	// <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+  2758435667U,	// <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+  2754233180U,	// <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+  3385016497U,	// <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+  2311278643U,	// <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+  2758730615U,	// <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+  3700367462U,	// <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+  3830629255U,	// <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+  2713929320U,	// <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+  2754233238U,	// <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+  2759099300U,	// <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+  2754233259U,	// <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+  2713929658U,	// <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+  3872359354U,	// <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+  2754233283U,	// <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+  2713929878U,	// <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+  3363135498U,	// <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+  3363137686U,	// <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+  2713930140U,	// <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+  2713930242U,	// <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+  2289394002U,	// <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+  3787672184U,	// <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+  3787672259U,	// <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+  2713930526U,	// <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+  1634880402U,	// <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+  2760205355U,	// <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+  2760279092U,	// <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+  3787672708U,	// <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+  2713930960U,	// <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+  1640189238U,	// <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  3786345848U,	// <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+  3787009481U,	// <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+  1640189466U,	// <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+  2754233455U,	// <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+  2713931407U,	// <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+  2713931499U,	// <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+  3827975305U,	// <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+  2754233495U,	// <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+  2288746834U,	// <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+  2713931827U,	// <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+  3787673725U,	// <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+  2754233527U,	// <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+  2668462182U,	// <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+  2290746002U,	// <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+  2302691478U,	// <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+  3364488071U,	// <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+  2302689536U,	// <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+  2754233587U,	// <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+  2713932600U,	// <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+  2713932622U,	// <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+  2302689297U,	// <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+  2713932794U,	// <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+  3365822474U,	// <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+  3365824662U,	// <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+  3787674851U,	// <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+  2713933158U,	// <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+  2292080978U,	// <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+  3365823613U,	// <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+  2713933420U,	// <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+  2713933442U,	// <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+  1658771190U,	// <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+  1640191790U,	// <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2762933624U,	// <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+  2754233724U,	// <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+  2763081098U,	// <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+  1640192154U,	// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  2713934032U,	// <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+  2713934080U,	// <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+  1640192357U,	// <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  3779051520U,	// <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+  2705309798U,	// <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  3838813637U,	// <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+  2302640230U,	// <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3765117266U,	// <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+  3381027892U,	// <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+  3842794985U,	// <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+  3408232554U,	// <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+  2302640235U,	// <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3700432998U,	// <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+  3765117785U,	// <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+  2311276136U,	// <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+  1237532774U,	// <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700436278U,	// <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+  3381036084U,	// <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+  3385018045U,	// <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+  3385017560U,	// <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+  1237532779U,	// <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700441190U,	// <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+  3700442242U,	// <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+  2754233960U,	// <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+  2754233970U,	// <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+  2765071997U,	// <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+  3834021508U,	// <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+  3842795152U,	// <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+  3376402492U,	// <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+  2754234015U,	// <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+  2754234022U,	// <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+  3827975855U,	// <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+  2644625102U,	// <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393766U,	// <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1691993806U,	// <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+  2785052375U,	// <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+  3854812897U,	// <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+  3802942187U,	// <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+  1692288754U,	// <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+  3839846139U,	// <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+  2709294052U,	// <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+  2766251789U,	// <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+  2765735702U,	// <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+  3840141087U,	// <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+  2705313078U,	// <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2712612217U,	// <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+  3787017674U,	// <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+  2765735747U,	// <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+  3834021704U,	// <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+  3834021714U,	// <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+  2311308904U,	// <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+  1237565542U,	// <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3834021744U,	// <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+  3369124916U,	// <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+  2248181690U,	// <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+  3786354825U,	// <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+  1237565547U,	// <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3700473958U,	// <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+  3700475014U,	// <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+  2296718952U,	// <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+  1228947558U,	// <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3700477238U,	// <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+  3834021836U,	// <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+  2248951738U,	// <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+  3370461105U,	// <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+  1228947563U,	// <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3786355706U,	// <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+  3783038037U,	// <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+  3365824104U,	// <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+  2292080742U,	// <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+  3842131986U,	// <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+  3371795508U,	// <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+  3786356206U,	// <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+  3786356332U,	// <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+  2292080747U,	// <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+  2754234427U,	// <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+  2705315630U,	// <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  2296735336U,	// <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+  1228963942U,	// <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  1695311971U,	// <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+  2705315994U,	// <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2769201269U,	// <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+  3370477489U,	// <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+  1695606919U,	// <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+  3827976331U,	// <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+  2754234518U,	// <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+  3706472290U,	// <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+  3700500630U,	// <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+  2754234544U,	// <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+  3376383766U,	// <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+  3769770513U,	// <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+  3376383930U,	// <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+  2754234581U,	// <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+  2311275414U,	// <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+  2305967971U,	// <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+  2692047787U,	// <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+  2311276146U,	// <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+  2311275418U,	// <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+  3765789807U,	// <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+  3765789939U,	// <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+  2311276474U,	// <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+  2696029585U,	// <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+  2311288709U,	// <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  3765790243U,	// <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+  3827976513U,	// <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+  2765736268U,	// <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+  2246248962U,	// <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+  3765790563U,	// <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+  3827976550U,	// <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+  3842795887U,	// <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+  2769054073U,	// <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+  3827976575U,	// <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+  3765790963U,	// <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+  3839478162U,	// <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+  2754234780U,	// <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+  2771708327U,	// <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+  3363137059U,	// <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+  3375081320U,	// <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+  3363137466U,	// <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+  2772003275U,	// <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+  2772077012U,	// <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+  3765791714U,	// <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+  2709965878U,	// <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+  2772298223U,	// <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+  2772371960U,	// <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+  2754234882U,	// <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+  3839478282U,	// <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+  3376416698U,	// <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+  2754234909U,	// <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+  2311308182U,	// <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+  3765792421U,	// <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+  2715938575U,	// <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+  2311308914U,	// <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+  2311308186U,	// <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+  2248182354U,	// <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+  3765792837U,	// <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+  2311309242U,	// <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+  2311308190U,	// <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+  2632777830U,	// <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3706520372U,	// <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+  2632779624U,	// <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+  2632780290U,	// <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+  2632781110U,	// <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+  2248952413U,	// <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+  2302691176U,	// <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302691258U,	// <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+  2632783662U,	// <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3365823382U,	// <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+  3706529011U,	// <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+  3706529641U,	// <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+  3365824114U,	// <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+  2774362859U,	// <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+  3365824035U,	// <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+  3383740183U,	// <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+  3363833786U,	// <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+  2774657807U,	// <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+  2632794214U,	// <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+  2754235166U,	// <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+  2632796010U,	// <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+  2632796676U,	// <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+  2632797494U,	// <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+  2754235206U,	// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+  2302691176U,	// <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302707642U,	// <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+  2754235229U,	// <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+  3765133325U,	// <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+  2705326182U,	// <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  3718489806U,	// <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+  3718490624U,	// <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+  2709307730U,	// <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+  2302641870U,	// <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+  3376383695U,	// <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+  3384351018U,	// <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+  2705326749U,	// <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  2305971057U,	// <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+  3765134171U,	// <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+  3766461338U,	// <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+  3766461437U,	// <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+  2311277776U,	// <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+  2754235362U,	// <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+  3783050483U,	// <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+  3385019036U,	// <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+  2311276241U,	// <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+  3718504550U,	// <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+  3783050787U,	// <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+  3773097576U,	// <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+  2705327822U,	// <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  3773097767U,	// <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+  2765737014U,	// <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+  3779069882U,	// <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+  3376401052U,	// <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+  2245881370U,	// <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+  3779070102U,	// <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+  3363135525U,	// <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+  3779070284U,	// <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+  3779070364U,	// <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+  2705328640U,	// <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+  2307311310U,	// <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+  3866021012U,	// <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+  3363138204U,	// <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+  2707983172U,	// <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+  2708646805U,	// <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+  2709310438U,	// <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+  3779071030U,	// <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+  2710637704U,	// <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+  2754235600U,	// <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+  1704676570U,	// <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+  3779071358U,	// <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+  2713292236U,	// <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+  1704897781U,	// <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+  2626871398U,	// <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+  2626872471U,	// <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+  2765737230U,	// <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+  3700615318U,	// <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+  2626874678U,	// <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+  1174441270U,	// <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+  1680493878U,	// <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  3385051804U,	// <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+  1680493896U,	// <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2248952722U,	// <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+  2302692152U,	// <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  3382406107U,	// <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+  3700623874U,	// <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+  2248953040U,	// <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+  1175211318U,	// <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3376432280U,	// <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+  2729218934U,	// <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+  1175211561U,	// <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3787035642U,	// <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+  3365822501U,	// <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+  3808933085U,	// <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+  3784381707U,	// <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+  2713294182U,	// <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+  2309998286U,	// <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+  3383740111U,	// <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+  3787036239U,	// <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+  2731873960U,	// <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+  2626895974U,	// <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+  2626897050U,	// <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+  2644813518U,	// <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+  2705327822U,	// <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  2626899254U,	// <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+  1707331102U,	// <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+  1680494121U,	// <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2737183024U,	// <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+  1680494139U,	// <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2302642684U,	// <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+  1640218726U,	// <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  3376384510U,	// <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+  3376385078U,	// <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+  2754236002U,	// <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+  2717942242U,	// <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+  2244907106U,	// <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  3376385406U,	// <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+  1640219293U,	// <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2305969365U,	// <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+  1237536282U,	// <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2713961366U,	// <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+  3766469630U,	// <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+  2782326455U,	// <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+  2311277786U,	// <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+  2311277058U,	// <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+  3385017587U,	// <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+  1237536282U,	// <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  3376400892U,	// <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+  3827977963U,	// <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+  2302659070U,	// <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+  2765737726U,	// <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+  3839479558U,	// <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+  2781073167U,	// <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+  2713962426U,	// <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+  3376401790U,	// <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+  2769055531U,	// <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+  2713962646U,	// <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+  3765143786U,	// <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+  3839479621U,	// <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+  2289394603U,	// <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+  2713963010U,	// <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+  2313285150U,	// <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+  3363138050U,	// <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+  3363136755U,	// <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+  2713963294U,	// <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+  2713963410U,	// <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+  3827978127U,	// <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+  3839479704U,	// <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+  3376417846U,	// <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+  1637567706U,	// <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+  1640222006U,	// <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+  2310640998U,	// <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+  3376418174U,	// <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+  1640222238U,	// <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+  1577091174U,	// <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  2311310226U,	// <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+  2713964303U,	// <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+  2311311119U,	// <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+  1577094454U,	// <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,5,5>: Cost 1 vspltisw1 RHS
+  2311309826U,	// <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+  2311311447U,	// <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+  296144182U,	// <5,5,5,u>: Cost 1 vspltisw1 RHS
+  2248953460U,	// <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+  2326580114U,	// <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+  2713965050U,	// <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+  3700697602U,	// <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+  2785644620U,	// <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+  2781073495U,	// <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+  1228950018U,	// <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965390U,	// <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+  1228950018U,	// <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965562U,	// <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+  3383741330U,	// <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+  3718620878U,	// <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+  3365823403U,	// <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+  2713965926U,	// <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+  2717947318U,	// <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+  3365825026U,	// <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+  2292081907U,	// <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+  2713966210U,	// <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+  1577091174U,	// <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1640224558U,	// <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2713966469U,	// <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+  2713966524U,	// <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+  1577094454U,	// <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,u,5>: Cost 1 vspltisw1 RHS
+  1228950018U,	// <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713966848U,	// <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+  296144182U,	// <5,5,u,u>: Cost 1 vspltisw1 RHS
+  2705342464U,	// <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+  1631600742U,	// <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3773112493U,	// <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+  2705342720U,	// <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+  2705342802U,	// <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+  3779084708U,	// <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+  3779084790U,	// <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+  2302643510U,	// <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631601309U,	// <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3767141092U,	// <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+  2705343284U,	// <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+  2705343382U,	// <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+  3779085282U,	// <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+  2693399632U,	// <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+  3767805089U,	// <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+  2311279416U,	// <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+  1237536054U,	// <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1237536055U,	// <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+  3773113789U,	// <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+  3779085855U,	// <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+  2699372136U,	// <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+  2705344166U,	// <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+  2699372329U,	// <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+  2705344360U,	// <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+  2705344442U,	// <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+  2302659894U,	// <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2702026861U,	// <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+  2705344662U,	// <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+  3767142661U,	// <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+  3773114689U,	// <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+  2705344924U,	// <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+  1631603202U,	// <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+  3842945597U,	// <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+  3779086962U,	// <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+  2289397046U,	// <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634257734U,	// <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+  2644926566U,	// <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+  3779087306U,	// <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+  2790142577U,	// <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+  2644929026U,	// <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+  2711317723U,	// <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+  1631604022U,	// <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  2712644989U,	// <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+  2302676278U,	// <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631604265U,	// <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  3842945708U,	// <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+  3767144133U,	// <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+  2705346328U,	// <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+  3779088207U,	// <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+  2717290420U,	// <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+  2705346574U,	// <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+  2705346596U,	// <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+  1237568822U,	// <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  1237568823U,	// <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+  2650914918U,	// <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+  3364490949U,	// <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+  2248954362U,	// <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+  2302693144U,	// <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+  2650918198U,	// <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+  2650918926U,	// <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+  2302693390U,	// <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+  1228950838U,	// <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228950839U,	// <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+  497467494U,	// <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571210036U,	// <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571210856U,	// <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571211414U,	// <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497470774U,	// <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571213316U,	// <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571213818U,	// <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571214956U,	// <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+  497473326U,	// <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497475686U,	// <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631606574U,	// <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  1571219048U,	// <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571219606U,	// <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497478967U,	// <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631606938U,	// <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  1571222010U,	// <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1228967222U,	// <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497481518U,	// <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+  3768475648U,	// <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+  2694733926U,	// <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  3718711395U,	// <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+  3384349178U,	// <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+  2694734162U,	// <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+  3384347884U,	// <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+  3730658026U,	// <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+  3718714362U,	// <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+  2694734493U,	// <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2311278690U,	// <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+  2305970923U,	// <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+  3768476566U,	// <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+  2311279098U,	// <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+  2311278694U,	// <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+  3768476783U,	// <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+  2694735091U,	// <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+  2311279426U,	// <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+  2696062357U,	// <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+  3383701602U,	// <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+  3768477219U,	// <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+  3768477288U,	// <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+  2309960186U,	// <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3383701606U,	// <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+  3768477545U,	// <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+  3766486970U,	// <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+  3383702338U,	// <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+  2309960186U,	// <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3768477846U,	// <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+  3768477975U,	// <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+  3786393932U,	// <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+  3768478108U,	// <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+  2795599115U,	// <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+  3385037470U,	// <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+  3780422309U,	// <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+  3848107301U,	// <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+  2795894063U,	// <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+  2795967800U,	// <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+  3768478690U,	// <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+  3718744163U,	// <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+  3784404107U,	// <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+  2796262748U,	// <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+  2694737206U,	// <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2712653182U,	// <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+  2713316815U,	// <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+  2694737449U,	// <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2311311458U,	// <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+  3768479433U,	// <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+  3768479521U,	// <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+  2311311866U,	// <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+  2311311462U,	// <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+  2248185270U,	// <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+  2718625879U,	// <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+  2311312194U,	// <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+  2311311466U,	// <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+  2248954874U,	// <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+  3322696778U,	// <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+  2248955028U,	// <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2656963074U,	// <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+  2248955238U,	// <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+  2248955329U,	// <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+  2656965360U,	// <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+  2248955500U,	// <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+  2248955522U,	// <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+  3718766694U,	// <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+  3724739827U,	// <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+  3718768739U,	// <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+  3365826337U,	// <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+  2798253647U,	// <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+  3365826258U,	// <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+  3730715377U,	// <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+  2310665836U,	// <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+  2798548595U,	// <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+  2311336034U,	// <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+  2694739758U,	// <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2248955028U,	// <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2311336442U,	// <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+  2311336038U,	// <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+  2694740122U,	// <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2656981746U,	// <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+  2311336770U,	// <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+  2694740325U,	// <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2705358848U,	// <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+  1631617126U,	// <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2310607866U,	// <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  2302640284U,	// <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  2754238189U,	// <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+  2305296114U,	// <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+  2244907106U,	// <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  2302643528U,	// <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631617693U,	// <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2627133542U,	// <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+  1237536282U,	// <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  1680496430U,	// <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1237532828U,	// <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  2693416018U,	// <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+  2756892486U,	// <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+  2694743284U,	// <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+  1237536072U,	// <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1680496484U,	// <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2311288709U,	// <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  2245883694U,	// <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2699388520U,	// <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+  2754238344U,	// <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+  2699388715U,	// <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+  2757408666U,	// <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+  2705360826U,	// <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+  2302659912U,	// <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2754238389U,	// <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+  2754238396U,	// <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+  3827980229U,	// <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+  2644625102U,	// <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393820U,	// <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1631619588U,	// <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+  2785056749U,	// <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+  3363138077U,	// <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+  2289397064U,	// <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634274120U,	// <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+  1634937753U,	// <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+  1728272410U,	// <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+  2710006843U,	// <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+  2765740076U,	// <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+  1637592285U,	// <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+  1631620406U,	// <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  2712661375U,	// <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+  2302676296U,	// <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631620649U,	// <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  1577091174U,	// <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1174443822U,	// <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2766035058U,	// <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+  1237565596U,	// <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  1577094454U,	// <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,u,5,5>: Cost 1 vspltisw1 RHS
+  1680496794U,	// <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1237568840U,	// <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  296144182U,	// <5,u,5,u>: Cost 1 vspltisw1 RHS
+  2633146470U,	// <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+  1175213870U,	// <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2633148309U,	// <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+  1228947612U,	// <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  2633149750U,	// <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+  1175214234U,	// <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  1228950018U,	// <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  1228950856U,	// <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228947617U,	// <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  497614950U,	// <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571357492U,	// <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571358312U,	// <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571358870U,	// <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497618248U,	// <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571360772U,	// <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571361274U,	// <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571361786U,	// <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+  497620782U,	// <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497623142U,	// <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631622958U,	// <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  1680496997U,	// <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1228963996U,	// <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  497626441U,	// <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+  296144182U,	// <5,u,u,5>: Cost 1 vspltisw1 RHS
+  1680497037U,	// <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1228967240U,	// <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497628974U,	// <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+  2772451328U,	// <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+  2772451338U,	// <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+  3771146417U,	// <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+  3383095739U,	// <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+  3846193189U,	// <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+  3724832803U,	// <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+  3383095985U,	// <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+  3383096067U,	// <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+  2772451401U,	// <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+  2651095142U,	// <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+  2251612262U,	// <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+  1698709606U,	// <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2651097602U,	// <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+  2651098422U,	// <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+  2651099172U,	// <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+  2657071869U,	// <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+  3724841978U,	// <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+  1698709660U,	// <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2252292096U,	// <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+  1178550374U,	// <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3826655418U,	// <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+  3777783485U,	// <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+  2252292434U,	// <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+  3785746280U,	// <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+  2252292593U,	// <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+  3736794583U,	// <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+  1178550941U,	// <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3375153152U,	// <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+  2772451584U,	// <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+  3777784163U,	// <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+  3846193426U,	// <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+  2712005122U,	// <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+  3724857382U,	// <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+  3802335864U,	// <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+  3801672410U,	// <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+  2772451647U,	// <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+  3383123968U,	// <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+  2772451666U,	// <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+  3773803577U,	// <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+  3724864002U,	// <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+  3846193517U,	// <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+  2712005935U,	// <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+  3327009265U,	// <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+  3383126648U,	// <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+  2772451729U,	// <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+  3373178880U,	// <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+  2254266470U,	// <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+  3785748248U,	// <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+  3790393190U,	// <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+  3328000338U,	// <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+  3785748494U,	// <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+  3785748516U,	// <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+  3379153528U,	// <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+  2254267037U,	// <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+  2254897152U,	// <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+  1181155430U,	// <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  3785748923U,	// <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+  3785749042U,	// <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+  2254897490U,	// <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+  3785749169U,	// <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+  2724614962U,	// <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+  3787739982U,	// <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+  1181155997U,	// <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1235664896U,	// <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235666598U,	// <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  3712943720U,	// <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+  2639202936U,	// <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+  2639203638U,	// <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+  2309409236U,	// <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  3712946517U,	// <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+  2309409400U,	// <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235666605U,	// <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  1235673088U,	// <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235674790U,	// <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  1698710173U,	// <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2639211129U,	// <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+  2639211830U,	// <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+  2712008858U,	// <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+  2657129220U,	// <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+  2309417592U,	// <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1698710227U,	// <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  3775799296U,	// <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+  2702057574U,	// <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3373143763U,	// <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+  3695045122U,	// <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+  3775799634U,	// <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+  3383091538U,	// <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+  3368493233U,	// <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+  3362522319U,	// <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+  2702058141U,	// <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3834250027U,	// <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+  2772452148U,	// <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  3832038210U,	// <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+  3373150660U,	// <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+  3834250067U,	// <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+  3373146450U,	// <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+  3826656102U,	// <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+  3362530511U,	// <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+  2772452148U,	// <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  2669092966U,	// <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+  2252292916U,	// <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+  2252293014U,	// <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+  2772452246U,	// <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+  2669096246U,	// <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+  3846194091U,	// <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+  2702059450U,	// <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+  3870081978U,	// <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+  2702059633U,	// <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+  3775801494U,	// <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+  3777128723U,	// <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+  3775801702U,	// <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+  3775801756U,	// <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+  3775801858U,	// <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+  3375153490U,	// <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+  3826656265U,	// <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+  3775802051U,	// <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+  3775802142U,	// <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+  3846194206U,	// <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+  3846194219U,	// <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+  3846194228U,	// <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+  3846194236U,	// <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+  3846194246U,	// <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+  2760508496U,	// <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+  3368526001U,	// <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+  3870082144U,	// <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+  2760729707U,	// <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+  2714668660U,	// <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+  3834619005U,	// <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+  3834692742U,	// <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+  3846194317U,	// <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+  3834840216U,	// <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+  3834913953U,	// <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+  2719977570U,	// <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+  3367208143U,	// <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+  2719977724U,	// <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+  2669125734U,	// <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+  2254897972U,	// <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+  2254898070U,	// <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+  3775803929U,	// <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+  2669129014U,	// <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+  2322006354U,	// <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+  2725950264U,	// <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+  3793720142U,	// <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+  2254898556U,	// <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+  2627330150U,	// <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+  1235664906U,	// <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235667094U,	// <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309406894U,	// <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2627333430U,	// <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+  1235665234U,	// <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309406897U,	// <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309407222U,	// <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235664913U,	// <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  2627338342U,	// <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+  1235673098U,	// <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235675286U,	// <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2772452732U,	// <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+  2627341622U,	// <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+  1235673426U,	// <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309415089U,	// <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309415414U,	// <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235673105U,	// <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  3324683725U,	// <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+  2725290086U,	// <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+  3771162801U,	// <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+  2309349478U,	// <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3730951478U,	// <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+  3840738784U,	// <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+  3842655721U,	// <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+  3736925671U,	// <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+  2309349483U,	// <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3367840468U,	// <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+  3325355551U,	// <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+  3373147752U,	// <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+  2299404390U,	// <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  3701099830U,	// <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+  3767846054U,	// <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+  3826656825U,	// <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+  3373147838U,	// <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+  2299404395U,	// <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2657222758U,	// <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+  3771164219U,	// <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+  2766481000U,	// <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+  2772452978U,	// <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+  2657226038U,	// <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+  3790407528U,	// <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+  2252294074U,	// <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+  2252294148U,	// <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+  2772453023U,	// <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+  2772453030U,	// <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+  3834250930U,	// <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+  2765596349U,	// <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+  2301411430U,	// <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+  2772453070U,	// <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+  2765817560U,	// <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+  2252933050U,	// <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+  2796340968U,	// <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+  2766038771U,	// <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+  3725008998U,	// <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+  3368530217U,	// <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+  3840222989U,	// <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+  2309382246U,	// <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  3725012278U,	// <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+  2766481193U,	// <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+  3842656049U,	// <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+  3327010820U,	// <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+  2766702404U,	// <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+  3713073254U,	// <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+  3789082310U,	// <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+  3840665439U,	// <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+  2766997352U,	// <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+  3713076534U,	// <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+  3791736842U,	// <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+  3373180605U,	// <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+  3793064108U,	// <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+  2767366037U,	// <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+  3701137510U,	// <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+  3701138647U,	// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+  2254898792U,	// <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+  1248264294U,	// <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  3701140790U,	// <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+  3725029435U,	// <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+  2254899130U,	// <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+  2725294981U,	// <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+  1248264299U,	// <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+  2633375846U,	// <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+  2309407468U,	// <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235666536U,	// <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161923174U,	// <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2633379126U,	// <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+  2309407796U,	// <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309408445U,	// <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309407960U,	// <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161923179U,	// <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+  2633384038U,	// <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+  2309415660U,	// <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235674728U,	// <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161931366U,	// <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+  2633387318U,	// <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+  2769135725U,	// <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+  2309416637U,	// <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309416152U,	// <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161931371U,	// <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+  3777806336U,	// <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+  2704064614U,	// <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  3765862577U,	// <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+  3843393708U,	// <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+  2250516994U,	// <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+  3725054014U,	// <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+  3383093096U,	// <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+  3368495034U,	// <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+  2704065181U,	// <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  2251622550U,	// <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  3777807156U,	// <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+  3765863348U,	// <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+  3373147762U,	// <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+  3834251525U,	// <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+  3373147683U,	// <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+  3391727545U,	// <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+  2299406266U,	// <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+  2251622550U,	// <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  2252294294U,	// <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+  3326036198U,	// <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+  3771836045U,	// <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+  2252294556U,	// <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+  2252294658U,	// <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+  3840739677U,	// <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+  2704066490U,	// <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+  3368511418U,	// <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+  2252294942U,	// <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+  3707158630U,	// <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+  3765864692U,	// <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+  2704066918U,	// <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+  2772453788U,	// <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+  2772453799U,	// <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+  3789752888U,	// <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+  3840739770U,	// <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+  2301413306U,	// <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+  2775108043U,	// <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+  2651340902U,	// <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+  3846195674U,	// <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+  3845974503U,	// <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+  2651343362U,	// <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+  2651344182U,	// <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+  1698712066U,	// <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+  3383125864U,	// <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+  3368527802U,	// <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+  1698933277U,	// <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+  3373179798U,	// <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+  3707176179U,	// <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+  2716012312U,	// <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+  3373180530U,	// <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+  2254309890U,	// <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+  3785773070U,	// <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+  3840739932U,	// <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+  2299439034U,	// <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+  2719994110U,	// <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+  2254899350U,	// <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+  3328641254U,	// <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+  2633443257U,	// <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+  2254899612U,	// <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+  2254899714U,	// <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+  3785773772U,	// <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+  2725966648U,	// <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+  2322007994U,	// <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+  2254899998U,	// <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+  1559707750U,	// <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  2633450292U,	// <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+  1559709626U,	// <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+  1235666546U,	// <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559711030U,	// <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+  2309408291U,	// <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2633454152U,	// <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+  1235666874U,	// <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559713582U,	// <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  1559715942U,	// <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  2633458484U,	// <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+  1559717819U,	// <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+  1235674738U,	// <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559719222U,	// <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+  1701366598U,	// <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+  2633462353U,	// <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+  1235675066U,	// <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559721774U,	// <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  3785777152U,	// <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+  2712035430U,	// <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3771179185U,	// <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+  3846196096U,	// <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+  3785777490U,	// <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+  2250517814U,	// <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+  3324259703U,	// <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+  3383092458U,	// <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+  2712035997U,	// <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3325356946U,	// <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+  3785777972U,	// <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+  3846196170U,	// <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+  3325365380U,	// <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+  3852168155U,	// <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+  2251615542U,	// <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+  3325357432U,	// <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+  3870084088U,	// <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+  2251615785U,	// <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+  2252295058U,	// <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+  3771180605U,	// <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+  3785778792U,	// <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+  3777816253U,	// <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+  2252295376U,	// <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+  1178553654U,	// <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  2252295545U,	// <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+  3326037448U,	// <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+  1178553897U,	// <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+  3785779350U,	// <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+  3383118648U,	// <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+  3777816935U,	// <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+  3785779612U,	// <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+  2712037890U,	// <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+  2252754230U,	// <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3784452764U,	// <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+  3801705178U,	// <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+  2252754473U,	// <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3787770770U,	// <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+  3383126840U,	// <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+  3327380534U,	// <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+  3784453265U,	// <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+  2253630672U,	// <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+  2778426587U,	// <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+  3383128789U,	// <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+  3381799580U,	// <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+  2778647798U,	// <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+  2651422822U,	// <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+  3701277928U,	// <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+  3701278650U,	// <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+  2651425282U,	// <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+  2651426102U,	// <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+  2651426892U,	// <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+  1698712886U,	// <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3725169658U,	// <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+  1698712904U,	// <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2254900114U,	// <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+  3389115192U,	// <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+  3785781727U,	// <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+  3785781810U,	// <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+  2254900432U,	// <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+  1181158710U,	// <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2254900605U,	// <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+  3787772750U,	// <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+  1181158953U,	// <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2639495270U,	// <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+  2639496090U,	// <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+  3707267011U,	// <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+  2639497884U,	// <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+  1237658832U,	// <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235666638U,	// <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  3713241753U,	// <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+  2309409436U,	// <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235666641U,	// <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  2639503462U,	// <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+  2639504282U,	// <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+  3701303226U,	// <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+  2639506077U,	// <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+  1235676368U,	// <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235674830U,	// <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  1698713129U,	// <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2309417628U,	// <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1698713147U,	// <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3775832064U,	// <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+  2702090342U,	// <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3775832241U,	// <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+  3719227906U,	// <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+  3775832402U,	// <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+  3385085146U,	// <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+  2309351938U,	// <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+  3376459134U,	// <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+  2702090909U,	// <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3719233546U,	// <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+  3775832884U,	// <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+  3775832982U,	// <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+  3846196909U,	// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+  3719236984U,	// <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+  3856150209U,	// <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+  3834252997U,	// <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+  3870084817U,	// <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+  3769861532U,	// <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+  2645500006U,	// <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+  3719242548U,	// <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+  3775833704U,	// <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+  3775833766U,	// <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+  2645503353U,	// <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+  2252296196U,	// <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+  2702092218U,	// <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+  3719246842U,	// <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+  2702092405U,	// <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+  3775834262U,	// <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+  3777161495U,	// <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+  3775834470U,	// <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+  3775834524U,	// <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+  3775834626U,	// <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+  3385109722U,	// <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+  2309376514U,	// <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3775834819U,	// <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+  2309376514U,	// <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3719258214U,	// <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+  3385117586U,	// <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+  3327242008U,	// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+  3719260674U,	// <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+  3719261563U,	// <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+  2702093622U,	// <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  2309384706U,	// <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+  3870085060U,	// <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+  2702093865U,	// <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  3719266406U,	// <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+  3789106889U,	// <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+  3785789208U,	// <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+  3373183950U,	// <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+  2717355964U,	// <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+  2791772164U,	// <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+  2772455438U,	// <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+  3373183549U,	// <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+  2720010496U,	// <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+  2772455460U,	// <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+  2322008978U,	// <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+  3840225335U,	// <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+  2772455490U,	// <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+  2772455500U,	// <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+  2254901252U,	// <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+  2772455520U,	// <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+  2785874024U,	// <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+  2772455532U,	// <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+  2627625062U,	// <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+  1235667858U,	// <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309409278U,	// <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309407659U,	// <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627628342U,	// <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+  1235668186U,	// <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235667458U,	// <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309407987U,	// <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235667460U,	// <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2627633254U,	// <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+  1235676050U,	// <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309417470U,	// <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309415851U,	// <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627636534U,	// <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+  1235676378U,	// <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235675650U,	// <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309416179U,	// <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235675652U,	// <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2309352751U,	// <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+  1650917478U,	// <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  2250584570U,	// <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+  3846197554U,	// <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+  2724659538U,	// <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+  3725275225U,	// <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+  2791772493U,	// <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+  2309352758U,	// <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  1650918045U,	// <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  3325358368U,	// <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+  2299406449U,	// <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+  2724660118U,	// <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+  3373148518U,	// <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+  3834253712U,	// <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+  3373147953U,	// <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+  2323297080U,	// <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+  2299407670U,	// <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2299407671U,	// <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2252296489U,	// <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+  3326038394U,	// <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+  1178554874U,	// <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724660902U,	// <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+  2252296817U,	// <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+  3840741864U,	// <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+  2252296976U,	// <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+  2785874426U,	// <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+  1178554874U,	// <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724661398U,	// <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+  3375154665U,	// <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+  3375154909U,	// <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+  2301413734U,	// <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+  2772455986U,	// <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+  3375154993U,	// <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+  2323313464U,	// <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+  2301414710U,	// <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2301414711U,	// <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2724662162U,	// <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+  3326939559U,	// <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+  2253271546U,	// <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+  3383127346U,	// <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+  2309385523U,	// <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+  1650920758U,	// <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  2724662653U,	// <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+  2309385526U,	// <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  1650921001U,	// <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  3725312102U,	// <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+  3373180393U,	// <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+  3791769368U,	// <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+  3373181286U,	// <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+  3725315382U,	// <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+  2299439221U,	// <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+  2724663394U,	// <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+  2299440438U,	// <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  2299440439U,	// <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1583808614U,	// <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2254574074U,	// <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+  2322010609U,	// <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+  1583811894U,	// <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2322010773U,	// <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+  363253046U,	// <6,6,6,6>: Cost 1 vspltisw2 RHS
+  1248267574U,	// <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+  363253046U,	// <6,6,6,u>: Cost 1 vspltisw2 RHS
+  2309410095U,	// <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+  2309408233U,	// <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+  2311402373U,	// <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+  2309409126U,	// <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  2309410099U,	// <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+  2309408561U,	// <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+  1237660472U,	// <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  161926454U,	// <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+  161926455U,	// <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+  1583808614U,	// <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1650923310U,	// <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  1178554874U,	// <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2309417318U,	// <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  1583811894U,	// <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1650923674U,	// <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  363253046U,	// <6,6,u,6>: Cost 1 vspltisw2 RHS
+  161934646U,	// <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+  161934647U,	// <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+  1638318080U,	// <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564576358U,	// <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712060077U,	// <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712060156U,	// <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638318418U,	// <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577865314U,	// <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+  2712060406U,	// <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2651608058U,	// <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+  564576925U,	// <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712060643U,	// <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638318900U,	// <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638318998U,	// <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  3766559753U,	// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+  2712060971U,	// <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712061039U,	// <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712061135U,	// <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  3373148612U,	// <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+  1638319484U,	// <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712061373U,	// <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712061471U,	// <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638319720U,	// <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638319782U,	// <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712061709U,	// <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712061800U,	// <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1638320058U,	// <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252297836U,	// <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+  1638320187U,	// <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638320278U,	// <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712062182U,	// <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2712062256U,	// <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+  1638320540U,	// <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638320642U,	// <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712062546U,	// <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712062584U,	// <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2712062659U,	// <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+  1638320926U,	// <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638321042U,	// <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712062922U,	// <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712063029U,	// <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712063108U,	// <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638321360U,	// <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564579638U,	// <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712063357U,	// <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+  2712063439U,	// <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+  564579881U,	// <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712063560U,	// <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714054287U,	// <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712063742U,	// <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  3373181295U,	// <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+  2712063924U,	// <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638322180U,	// <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638322274U,	// <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  3373181380U,	// <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+  1640313092U,	// <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712064289U,	// <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712064423U,	// <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638322682U,	// <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712064562U,	// <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712064653U,	// <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712064747U,	// <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638323000U,	// <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638323022U,	// <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638323168U,	// <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+  1237659746U,	// <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309411158U,	// <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+  2639718330U,	// <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+  1235669498U,	// <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1237659750U,	// <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309411243U,	// <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+  1583895362U,	// <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+  1235669826U,	// <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  1235669503U,	// <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+  1638323923U,	// <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564582190U,	// <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638324101U,	// <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638324156U,	// <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638324287U,	// <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564582554U,	// <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638324432U,	// <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  1235678018U,	// <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  564582757U,	// <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  1638326272U,	// <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564584550U,	// <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712068269U,	// <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2309349532U,	// <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  1638326610U,	// <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577939051U,	// <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+  2712068598U,	// <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2309352776U,	// <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  564585117U,	// <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712068835U,	// <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638327092U,	// <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1698715438U,	// <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2299404444U,	// <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2712069163U,	// <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712069231U,	// <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712069327U,	// <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  2299407688U,	// <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  1698715492U,	// <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2712069565U,	// <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  1178556206U,	// <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  1638327912U,	// <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638327974U,	// <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712069901U,	// <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  1178556570U,	// <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  1638328250U,	// <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252298496U,	// <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+  1638328379U,	// <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638328470U,	// <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712070374U,	// <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2704107883U,	// <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+  1638328732U,	// <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638328834U,	// <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712070738U,	// <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712070776U,	// <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2301414728U,	// <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  1638329118U,	// <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638329234U,	// <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712071114U,	// <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712071221U,	// <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2309382300U,	// <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  1638329552U,	// <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564587831U,	// <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712071545U,	// <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2309385544U,	// <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  564588073U,	// <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712071752U,	// <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714062479U,	// <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712071934U,	// <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2299437212U,	// <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+  2712072116U,	// <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638330372U,	// <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1698715802U,	// <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2299440456U,	// <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1698715820U,	// <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  1583808614U,	// <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1181161262U,	// <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1638330874U,	// <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1248264348U,	// <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  1583811894U,	// <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1181161626U,	// <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  363253046U,	// <6,u,6,6>: Cost 1 vspltisw2 RHS
+  1638331214U,	// <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  363253046U,	// <6,u,6,u>: Cost 1 vspltisw2 RHS
+  1560076390U,	// <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+  1235664969U,	// <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1560078311U,	// <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+  161923228U,	// <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+  1560079670U,	// <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+  1235665297U,	// <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235667485U,	// <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  161926472U,	// <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+  161923233U,	// <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+  1560084582U,	// <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+  564590382U,	// <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1560086504U,	// <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+  161931420U,	// <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+  1560087862U,	// <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+  564590746U,	// <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+  363253046U,	// <6,u,u,6>: Cost 1 vspltisw2 RHS
+  161934664U,	// <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+  161931425U,	// <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+  1705426944U,	// <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705426954U,	// <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+  3713550266U,	// <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+  2316063892U,	// <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+  2779168805U,	// <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+  2663698530U,	// <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2657727309U,	// <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+  2316064220U,	// <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+  1705427017U,	// <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+  1583988838U,	// <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+  2779168859U,	// <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+  631685222U,	// <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2639817411U,	// <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+  1583992118U,	// <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+  2657734660U,	// <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+  1583993678U,	// <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+  2657735672U,	// <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+  631685276U,	// <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779168933U,	// <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+  2767667377U,	// <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+  2718713448U,	// <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+  2718713510U,	// <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+  3841409228U,	// <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+  3852910802U,	// <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+  2718713786U,	// <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+  3847160036U,	// <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+  2767667440U,	// <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+  2718714006U,	// <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+  2779169020U,	// <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+  3852910853U,	// <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+  2718714268U,	// <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+  2718714370U,	// <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+  2718714461U,	// <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+  2706770608U,	// <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+  3847160114U,	// <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+  2779169083U,	// <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+  2718714770U,	// <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+  1705427282U,	// <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+  3713583034U,	// <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+  3713583814U,	// <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+  2779169133U,	// <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+  1644973366U,	// <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  2657760081U,	// <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+  2259468868U,	// <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+  1705427345U,	// <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+  2718715508U,	// <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+  2260123750U,	// <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+  3792457451U,	// <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+  3852911024U,	// <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+  2718715836U,	// <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+  2718715908U,	// <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+  1644974178U,	// <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+  3792457853U,	// <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+  1646301444U,	// <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+  2720706901U,	// <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+  2779169270U,	// <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+  2718716410U,	// <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+  2722697800U,	// <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+  3852911121U,	// <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+  3852911130U,	// <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+  2718716728U,	// <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+  2718716750U,	// <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+  2779169333U,	// <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+  2718716922U,	// <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+  1187872870U,	// <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2718717076U,	// <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+  3847160408U,	// <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+  2718717286U,	// <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+  2718717377U,	// <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+  2718717404U,	// <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+  2718717478U,	// <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+  1187873437U,	// <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+  1584046182U,	// <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+  1705427602U,	// <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+  631685789U,	// <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+  2639874762U,	// <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+  1584049462U,	// <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+  1644976282U,	// <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  1584051029U,	// <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+  2718718208U,	// <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+  631685843U,	// <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+  2721374218U,	// <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+  2779169507U,	// <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+  2779169516U,	// <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+  3852911348U,	// <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+  2669743414U,	// <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+  2316058962U,	// <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+  2316059044U,	// <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+  2669745146U,	// <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+  2779169570U,	// <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+  2779169579U,	// <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+  1705427764U,	// <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169598U,	// <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+  3713632972U,	// <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+  2779169619U,	// <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+  2779169628U,	// <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+  2657809239U,	// <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+  3835290474U,	// <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+  1705427764U,	// <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169660U,	// <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+  2779169671U,	// <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+  2779169680U,	// <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+  1705427862U,	// <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+  2779169700U,	// <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+  2779169707U,	// <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+  2657817432U,	// <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+  2803057594U,	// <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+  1705427907U,	// <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+  3776538827U,	// <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+  2319400970U,	// <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+  2316085398U,	// <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+  3852911591U,	// <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+  3852911600U,	// <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+  2319401298U,	// <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+  3833668617U,	// <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+  3367265487U,	// <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+  2319400977U,	// <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+  2724031378U,	// <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+  2779169835U,	// <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+  2779169844U,	// <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+  3852911672U,	// <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+  2669776182U,	// <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+  2779169872U,	// <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+  3835290712U,	// <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+  2669778278U,	// <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+  2779169898U,	// <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+  2779169903U,	// <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+  3835585661U,	// <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+  3841410182U,	// <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+  3852911753U,	// <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+  2779169943U,	// <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+  2318754130U,	// <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+  2718724195U,	// <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+  3859178670U,	// <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+  2779169975U,	// <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+  2720715094U,	// <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+  2761549007U,	// <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+  2779170008U,	// <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+  3835438305U,	// <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+  3835512042U,	// <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+  2761843955U,	// <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+  3835659516U,	// <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+  2803057918U,	// <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+  2762065166U,	// <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+  2669797478U,	// <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+  2322087946U,	// <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+  2317448186U,	// <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+  3395829934U,	// <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+  2669800758U,	// <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+  2322088274U,	// <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+  3375923377U,	// <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+  2731996780U,	// <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+  2322087953U,	// <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+  2779170146U,	// <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+  1705427764U,	// <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779170164U,	// <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+  1705428348U,	// <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+  2779170186U,	// <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+  2763171221U,	// <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+  2657866590U,	// <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+  2803058080U,	// <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+  1705428393U,	// <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+  3713695846U,	// <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+  2779170237U,	// <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+  2779170245U,	// <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+  1242316902U,	// <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3713699126U,	// <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+  3852912096U,	// <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+  2767668713U,	// <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+  2256488426U,	// <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+  1242316907U,	// <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3852912132U,	// <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+  3852912141U,	// <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+  3852912149U,	// <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+  2779170335U,	// <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+  3852912172U,	// <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+  3840747062U,	// <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+  3841410617U,	// <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+  3795125538U,	// <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+  2779170380U,	// <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+  2779170389U,	// <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+  3852912222U,	// <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+  1705428584U,	// <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705428594U,	// <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+  2779170429U,	// <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+  3852912259U,	// <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+  2767668880U,	// <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+  3841336981U,	// <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+  1705428639U,	// <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+  1705428646U,	// <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+  2779170479U,	// <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+  2767668925U,	// <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+  1245659238U,	// <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705428686U,	// <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+  2779170519U,	// <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+  2657899362U,	// <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+  2319406574U,	// <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+  1705428718U,	// <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+  3713728614U,	// <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+  3852912388U,	// <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+  2779170573U,	// <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+  1242349670U,	// <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3713731894U,	// <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+  2779170601U,	// <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+  2767669041U,	// <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+  3389834456U,	// <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+  1242349675U,	// <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3852912456U,	// <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+  3852912466U,	// <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+  3852912475U,	// <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+  2779170664U,	// <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+  3852912496U,	// <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+  3792474116U,	// <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+  2718732388U,	// <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+  3841337228U,	// <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+  2779170709U,	// <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+  2640003174U,	// <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+  2721386920U,	// <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+  2767595441U,	// <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+  1693927354U,	// <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+  2640006454U,	// <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+  3841558476U,	// <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+  2657923941U,	// <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+  3841337310U,	// <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+  1694296039U,	// <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+  2803058666U,	// <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+  3852912632U,	// <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+  2322089576U,	// <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+  1248346214U,	// <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  3841337362U,	// <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+  3395830836U,	// <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+  2261616570U,	// <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+  3371943857U,	// <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+  1248346219U,	// <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705429051U,	// <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+  2779170884U,	// <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+  1705428584U,	// <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1695254620U,	// <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+  1705429091U,	// <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+  2779170924U,	// <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+  2767669361U,	// <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+  2803058809U,	// <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+  1695623305U,	// <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+  2779170955U,	// <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+  1705429142U,	// <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+  2634057732U,	// <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+  2779170983U,	// <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+  2779170992U,	// <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+  3852912829U,	// <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+  2657948520U,	// <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+  2316060602U,	// <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+  1705429205U,	// <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+  3852912860U,	// <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+  2779171046U,	// <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+  2779171057U,	// <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+  3852912887U,	// <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+  3852912896U,	// <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+  3852912905U,	// <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+  3835291923U,	// <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+  3841411356U,	// <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+  2779171111U,	// <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+  2779171120U,	// <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+  3852912952U,	// <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+  2779171137U,	// <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+  2779171144U,	// <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+  2779171156U,	// <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+  3852912989U,	// <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+  2767669606U,	// <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+  2767669615U,	// <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+  2779171189U,	// <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+  2779171198U,	// <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+  3852913032U,	// <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+  2704140655U,	// <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+  1705429404U,	// <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171238U,	// <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+  3852913070U,	// <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+  2657973099U,	// <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+  2767669700U,	// <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+  1705429404U,	// <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171280U,	// <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+  2779171290U,	// <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+  2634090504U,	// <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+  2779171311U,	// <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+  2779171319U,	// <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+  1705429506U,	// <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+  2722057593U,	// <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+  2316093370U,	// <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+  1705429533U,	// <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+  3852913185U,	// <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+  3795799695U,	// <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+  3852913203U,	// <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+  3852913214U,	// <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+  3852913225U,	// <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+  2779171410U,	// <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+  2718740581U,	// <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+  3841411685U,	// <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+  2720067847U,	// <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+  2773420664U,	// <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+  3847236225U,	// <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+  1648316922U,	// <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+  2773641875U,	// <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+  2773715612U,	// <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+  3847531173U,	// <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+  2722059024U,	// <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+  2767669943U,	// <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+  1652298720U,	// <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+  2767669955U,	// <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+  3841411788U,	// <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+  2767669978U,	// <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+  2722059546U,	// <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+  2767669995U,	// <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+  3852913396U,	// <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+  2722059758U,	// <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+  2302183354U,	// <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+  2767670027U,	// <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+  2774747930U,	// <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+  1705429790U,	// <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+  1660262316U,	// <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+  1705429404U,	// <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2775042878U,	// <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+  1705429830U,	// <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+  2779171660U,	// <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+  2767670101U,	// <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+  1705429853U,	// <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+  2718744576U,	// <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+  1645002854U,	// <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  3852913527U,	// <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+  3852913536U,	// <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+  2316061904U,	// <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+  1705429906U,	// <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658022257U,	// <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+  2256489928U,	// <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1707420589U,	// <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+  3852913590U,	// <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+  2718745396U,	// <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+  2779171786U,	// <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+  3852913616U,	// <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+  3852913627U,	// <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+  2779171810U,	// <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+  3792487631U,	// <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+  3394456220U,	// <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+  2779171837U,	// <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+  3852913673U,	// <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+  3852913682U,	// <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+  2718746216U,	// <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+  2718746278U,	// <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+  2779171885U,	// <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+  2779171893U,	// <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+  2718746554U,	// <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+  3847457864U,	// <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+  2779171921U,	// <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+  2718746774U,	// <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+  3852913762U,	// <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+  3852913772U,	// <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+  2718747036U,	// <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+  2718747138U,	// <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+  2779171972U,	// <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+  2706803380U,	// <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+  3847457946U,	// <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+  2781162655U,	// <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+  2718747538U,	// <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+  3852913842U,	// <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+  3852913852U,	// <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+  2316096696U,	// <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+  1705430224U,	// <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705430234U,	// <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+  2658055029U,	// <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+  2316097024U,	// <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+  1707420917U,	// <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+  1584316518U,	// <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+  2658059060U,	// <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+  2640144314U,	// <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+  2640145131U,	// <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+  1584319798U,	// <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+  2779172134U,	// <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+  631688502U,	// <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2658063354U,	// <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+  631688520U,	// <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+  3852914001U,	// <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+  3852914010U,	// <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+  2718749178U,	// <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+  2722730572U,	// <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+  2723394205U,	// <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+  2779172221U,	// <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+  2718749496U,	// <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+  2718749518U,	// <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+  2779172249U,	// <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+  2718749690U,	// <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+  3847458214U,	// <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+  2718749880U,	// <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+  3847458236U,	// <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+  2718750004U,	// <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+  1187876150U,	// <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2718750208U,	// <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+  2718750286U,	// <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+  1187876393U,	// <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+  1584341094U,	// <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+  1645008686U,	// <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  2640168890U,	// <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+  2640169710U,	// <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+  1584344374U,	// <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+  1705430554U,	// <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+  631688745U,	// <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+  2718750976U,	// <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+  631688763U,	// <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+  2646147174U,	// <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+  2779172424U,	// <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+  3852914258U,	// <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+  3852914268U,	// <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+  2779172450U,	// <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+  2316061914U,	// <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+  2316061186U,	// <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+  2646152186U,	// <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+  2779172486U,	// <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+  2781163151U,	// <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+  2321378194U,	// <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+  3852914339U,	// <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+  3852914350U,	// <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+  2781163191U,	// <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+  3852914363U,	// <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+  3835588297U,	// <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+  3835588306U,	// <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+  2781163223U,	// <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+  3852914400U,	// <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+  2781163243U,	// <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+  3852914419U,	// <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+  2779172606U,	// <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+  3780552497U,	// <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+  2781163279U,	// <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2779172632U,	// <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+  3835588385U,	// <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+  2779172650U,	// <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+  3852914481U,	// <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+  2319403922U,	// <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+  2319404409U,	// <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  3852914510U,	// <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+  3779226131U,	// <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+  2319404250U,	// <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+  2319403522U,	// <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+  3852914547U,	// <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+  2319403524U,	// <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+  2646179942U,	// <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+  2316094354U,	// <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+  3852914582U,	// <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+  3852914592U,	// <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+  2646183372U,	// <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+  2779172788U,	// <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+  2316093954U,	// <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+  2646185318U,	// <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+  2779172815U,	// <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+  2781163475U,	// <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+  2781163484U,	// <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+  3852914662U,	// <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+  3852914672U,	// <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+  2781163515U,	// <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+  1705431044U,	// <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172878U,	// <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+  3835588632U,	// <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+  1705431044U,	// <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172900U,	// <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+  2781163571U,	// <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+  3852914743U,	// <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+  2779172930U,	// <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+  2779172940U,	// <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+  2781163607U,	// <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  2779172960U,	// <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+  1705431138U,	// <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+  1705578603U,	// <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+  2646204518U,	// <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2322090898U,	// <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+  3719947880U,	// <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+  3719948438U,	// <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+  2646207951U,	// <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+  2322091226U,	// <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+  2322090498U,	// <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+  2646210156U,	// <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+  2646210350U,	// <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2779173062U,	// <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+  2779173072U,	// <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+  2319404409U,	// <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  2779173092U,	// <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+  2779173101U,	// <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+  1705431044U,	// <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779173118U,	// <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+  1705578756U,	// <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+  1707421965U,	// <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+  3852914966U,	// <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+  2779173153U,	// <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+  2256491002U,	// <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+  3852914994U,	// <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+  3852915003U,	// <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+  2316062652U,	// <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316063544U,	// <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+  1242320182U,	// <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1242320183U,	// <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+  3852915048U,	// <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+  3377866217U,	// <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+  3852915068U,	// <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+  3833672072U,	// <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+  3852915088U,	// <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+  3395122056U,	// <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+  3389813560U,	// <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+  2779173287U,	// <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+  2779320752U,	// <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+  2658181222U,	// <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+  3852915140U,	// <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+  2257973754U,	// <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+  3841413589U,	// <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+  2658184502U,	// <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+  3852915176U,	// <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+  2658186117U,	// <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+  1705431546U,	// <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+  1705579011U,	// <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+  3714015334U,	// <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+  3777243425U,	// <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+  2319405957U,	// <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+  3375229286U,	// <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+  2779173426U,	// <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+  3375228721U,	// <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+  2319405880U,	// <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+  1245662518U,	// <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1245662519U,	// <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+  3852915291U,	// <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+  3389834729U,	// <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+  2259472890U,	// <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+  3852915321U,	// <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+  3852915330U,	// <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+  2779173517U,	// <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+  2316096312U,	// <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+  1242352950U,	// <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1242352951U,	// <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+  3852915372U,	// <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+  3835294392U,	// <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+  3852915395U,	// <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+  3852915404U,	// <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+  3852915412U,	// <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+  3377899313U,	// <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+  2718765160U,	// <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+  2779173611U,	// <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+  2779321076U,	// <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+  2658213990U,	// <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+  3852915462U,	// <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+  2718765562U,	// <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+  3714042622U,	// <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+  2658217270U,	// <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+  2724074224U,	// <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+  1705431864U,	// <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705431874U,	// <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+  1705579339U,	// <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+  1705431886U,	// <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+  2779173719U,	// <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+  2779173729U,	// <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+  2779173736U,	// <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+  1705431926U,	// <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+  2779173759U,	// <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+  2779173765U,	// <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+  1248349494U,	// <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+  1705431958U,	// <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+  1705579423U,	// <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+  2779173801U,	// <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+  2779321266U,	// <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+  2779321273U,	// <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+  1705579463U,	// <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+  2779173841U,	// <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+  1705431864U,	// <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705432032U,	// <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+  1705579495U,	// <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+  1242320994U,	// <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705432058U,	// <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+  3841414146U,	// <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+  2316063226U,	// <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+  2779173908U,	// <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+  2658242658U,	// <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+  2658243468U,	// <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+  2316063554U,	// <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+  1705432121U,	// <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+  3852915777U,	// <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+  2779173962U,	// <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+  2779173973U,	// <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+  3389813242U,	// <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+  3852915813U,	// <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+  3852915821U,	// <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+  3835294839U,	// <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+  2329343596U,	// <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+  2779174027U,	// <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+  2803061908U,	// <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+  3852915869U,	// <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+  2779174053U,	// <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+  2779174060U,	// <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+  2803061944U,	// <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+  3852915905U,	// <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+  2767672522U,	// <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+  2791855315U,	// <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+  2768999644U,	// <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+  2779174115U,	// <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+  3852915948U,	// <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+  3841414394U,	// <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+  1245663738U,	// <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174155U,	// <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+  3852915988U,	// <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+  2706827959U,	// <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+  2319405890U,	// <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+  1245663738U,	// <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174200U,	// <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+  3852916030U,	// <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+  3714099130U,	// <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+  2316095994U,	// <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+  1242353766U,	// <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705432422U,	// <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+  2658276240U,	// <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+  2316096322U,	// <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+  1705432449U,	// <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+  3852916101U,	// <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+  3854906765U,	// <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+  3852916121U,	// <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+  3389846010U,	// <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+  3852916141U,	// <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+  2779174326U,	// <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+  2779174337U,	// <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+  2329376364U,	// <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+  2779321811U,	// <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+  2658287718U,	// <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+  3852916197U,	// <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+  2779174382U,	// <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+  2316112378U,	// <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+  2658290998U,	// <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+  3852916233U,	// <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+  1651004226U,	// <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+  2779174420U,	// <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+  1652331492U,	// <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+  1590526054U,	// <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+  2328728623U,	// <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+  2724746451U,	// <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+  2322092538U,	// <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+  1590529334U,	// <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+  2328728951U,	// <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+  2724746770U,	// <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+  430361910U,	// <7,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,7,u>: Cost 1 vspltisw3 RHS
+  1242320994U,	// <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705580162U,	// <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+  2779321996U,	// <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+  1245663738U,	// <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  1242353766U,	// <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705580202U,	// <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+  1662949620U,	// <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+  430361910U,	// <7,7,u,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,u,u>: Cost 1 vspltisw3 RHS
+  1705426944U,	// <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705432787U,	// <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+  2316060885U,	// <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+  1242316956U,	// <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  2779174637U,	// <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+  1182750874U,	// <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+  2316061213U,	// <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+  1242320200U,	// <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1705432850U,	// <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+  1584578662U,	// <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+  1705427764U,	// <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  631691054U,	// <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2640407307U,	// <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+  1584581942U,	// <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+  2779174726U,	// <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+  1584583574U,	// <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+  2779322201U,	// <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+  631691108U,	// <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779174763U,	// <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+  2779174774U,	// <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+  1705428584U,	// <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705432965U,	// <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+  2779174801U,	// <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+  2779174810U,	// <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+  2767673251U,	// <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+  1705580460U,	// <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+  1705433010U,	// <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+  1705433020U,	// <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+  2779174853U,	// <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+  2767673299U,	// <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+  1245659292U,	// <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705433060U,	// <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+  2779174893U,	// <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+  2706836152U,	// <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+  1245662536U,	// <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1705433092U,	// <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+  2779174925U,	// <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+  1185732398U,	// <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+  2316093653U,	// <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+  1242349724U,	// <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  1705430224U,	// <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705433151U,	// <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+  2316093981U,	// <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+  1242352968U,	// <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1705433178U,	// <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+  1584611430U,	// <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+  2781165670U,	// <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+  2640439226U,	// <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+  2640440079U,	// <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+  1584614710U,	// <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+  1705431044U,	// <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  631691418U,	// <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2779322525U,	// <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+  631691436U,	// <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+  2779175087U,	// <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+  2779175102U,	// <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+  1648357887U,	// <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+  1705433296U,	// <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+  2779175127U,	// <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+  2779175138U,	// <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+  1651012419U,	// <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+  1705580788U,	// <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+  1705433341U,	// <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+  1705580800U,	// <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+  1187878702U,	// <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2768042263U,	// <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+  1248346268U,	// <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705580840U,	// <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+  1187879066U,	// <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2779322679U,	// <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+  430361910U,	// <7,u,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,u,7,u>: Cost 1 vspltisw3 RHS
+  1705433425U,	// <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+  1705433435U,	// <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+  631691621U,	// <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+  1705433451U,	// <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+  1705433465U,	// <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+  1705433475U,	// <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+  631691661U,	// <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+  430361910U,	// <7,u,u,7>: Cost 1 vspltisw3 RHS
+  631691675U,	// <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+  202162278U,	// <u,0,0,0>: Cost 1 vspltisw0 LHS
+  1678598154U,	// <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2634500154U,	// <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+  2289596269U,	// <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+  1548815670U,	// <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+  2663698530U,	// <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2658390942U,	// <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+  2289596597U,	// <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+  202162278U,	// <u,0,0,u>: Cost 1 vspltisw0 LHS
+  1560764518U,	// <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+  115720294U,	// <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+  604856427U,	// <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2634508438U,	// <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+  1560767798U,	// <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+  2652426438U,	// <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+  1584657311U,	// <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+  2658399226U,	// <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+  604856476U,	// <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696889850U,	// <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+  1190174822U,	// <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  2692245096U,	// <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+  2692245158U,	// <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+  2263916882U,	// <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+  2299709908U,	// <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  2692245434U,	// <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+  2701535281U,	// <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+  1190175389U,	// <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+  1209237504U,	// <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1209239206U,	// <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2704189813U,	// <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+  2692245916U,	// <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+  2282981033U,	// <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2664386658U,	// <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+  2691877496U,	// <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  2664388218U,	// <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+  1209239213U,	// <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  2289623040U,	// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+  1678598482U,	// <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2634532926U,	// <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+  2235580672U,	// <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  1143619922U,	// <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1618505014U,	// <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  2658423714U,	// <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+  2713259464U,	// <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1683243409U,	// <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  1192443904U,	// <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  118702182U,	// <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2266185901U,	// <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2640513816U,	// <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+  1192444242U,	// <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2718789636U,	// <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+  1645047915U,	// <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+  2664404604U,	// <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+  118702749U,	// <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+  2302910464U,	// <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+  1192886374U,	// <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  2718790138U,	// <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+  2722771537U,	// <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+  2266628434U,	// <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+  2248950180U,	// <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  2718790456U,	// <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+  2718790478U,	// <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+  1192886941U,	// <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1235812352U,	// <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235814054U,	// <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  2728080601U,	// <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+  2640530202U,	// <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+  2640530742U,	// <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+  2309556692U,	// <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  2730735133U,	// <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+  2309556856U,	// <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235814061U,	// <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  202162278U,	// <u,0,u,0>: Cost 1 vspltisw0 LHS
+  120365158U,	// <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+  604856989U,	// <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2692249532U,	// <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+  1560825142U,	// <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+  1618507930U,	// <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  1584714662U,	// <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+  2309565048U,	// <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  604857043U,	// <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  1611210825U,	// <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+  1616519270U,	// <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+  2287605459U,	// <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+  2640546588U,	// <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+  2622631222U,	// <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+  2289590610U,	// <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+  2664436630U,	// <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+  2664437376U,	// <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+  1616519889U,	// <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+  1548894866U,	// <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+  269271142U,	// <u,1,1,1>: Cost 1 vspltisw1 LHS
+  1189462934U,	// <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2622638230U,	// <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+  1548897590U,	// <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+  2756985692U,	// <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  2658472872U,	// <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+  2287614142U,	// <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+  269271142U,	// <u,1,1,u>: Cost 1 vspltisw1 LHS
+  1566818406U,	// <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+  2756985735U,	// <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  1148371862U,	// <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1566821686U,	// <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+  2756985771U,	// <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  2690262970U,	// <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+  1590711938U,	// <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  2282979337U,	// <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1209237514U,	// <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1209239702U,	// <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282979502U,	// <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282979341U,	// <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1209237842U,	// <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282979505U,	// <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287625423U,	// <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1209237521U,	// <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  1635101613U,	// <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+  2289623050U,	// <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+  2289625238U,	// <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+  2640579360U,	// <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+  2622663990U,	// <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+  1616522550U,	// <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  2664469398U,	// <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+  2664470148U,	// <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+  1616522793U,	// <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  1548927638U,	// <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+  1192444724U,	// <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1192444822U,	// <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2622670998U,	// <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+  1548930358U,	// <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+  1210728786U,	// <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2714153058U,	// <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+  2670449658U,	// <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+  1548932910U,	// <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+  2622677655U,	// <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+  2756986063U,	// <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2302912662U,	// <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+  3696421014U,	// <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+  2622680374U,	// <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+  2756986099U,	// <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  2714153784U,	// <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+  1651692438U,	// <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+  1652356071U,	// <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+  2628657254U,	// <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+  1235812362U,	// <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235814550U,	// <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309554350U,	// <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2628660534U,	// <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+  1235812690U,	// <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309554353U,	// <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309554678U,	// <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235812369U,	// <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  1548952217U,	// <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+  269271142U,	// <u,1,u,1>: Cost 1 vspltisw1 LHS
+  1209280662U,	// <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1548954934U,	// <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+  1209278802U,	// <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2283020465U,	// <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  1590761096U,	// <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  2702876672U,	// <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+  1629134950U,	// <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+  2289591912U,	// <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+  1215848550U,	// <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2702877010U,	// <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+  2289222708U,	// <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2779178473U,	// <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+  2726249024U,	// <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1215848555U,	// <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2690933539U,	// <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+  2628683124U,	// <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+  1189463656U,	// <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1213866086U,	// <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  2628685110U,	// <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+  2263205736U,	// <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+  1189463994U,	// <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2263205866U,	// <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1213866091U,	// <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1556938854U,	// <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2697569869U,	// <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+  336380006U,	// <u,2,2,2>: Cost 1 vspltisw2 LHS
+  1678599794U,	// <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  1556942134U,	// <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2702878650U,	// <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+  2300229831U,	// <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+  336380006U,	// <u,2,2,u>: Cost 1 vspltisw2 LHS
+  475243165U,	// <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1548985140U,	// <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1209239144U,	// <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135495782U,	// <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+  475245878U,	// <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1596764164U,	// <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1596764666U,	// <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1596765178U,	// <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135495787U,	// <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2708851630U,	// <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+  2217362979U,	// <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2289624680U,	// <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+  1215881318U,	// <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2726767824U,	// <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+  1629138230U,	// <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  2779178801U,	// <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+  2726251976U,	// <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1215881323U,	// <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2628714598U,	// <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+  2628715896U,	// <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+  1192445544U,	// <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1213898854U,	// <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2628717878U,	// <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+  2726768644U,	// <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+  1192445882U,	// <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2266187754U,	// <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1213898859U,	// <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2634694758U,	// <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+  2721460657U,	// <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+  2296940136U,	// <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+  1678600122U,	// <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2634698038U,	// <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+  3370682125U,	// <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+  1157056442U,	// <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725442455U,	// <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+  1678600167U,	// <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  1653027897U,	// <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+  2309554924U,	// <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235813992U,	// <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  162070630U,	// <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2634706230U,	// <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+  2309555252U,	// <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309555901U,	// <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309555416U,	// <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  162070635U,	// <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+  475284130U,	// <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1549026100U,	// <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  336380006U,	// <u,2,u,2>: Cost 1 vspltisw2 LHS
+  135536742U,	// <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+  475286838U,	// <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1629141146U,	// <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  1194108858U,	// <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  1596806138U,	// <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135536747U,	// <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611890688U,	// <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  538149020U,	// <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685632685U,	// <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685632764U,	// <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611891026U,	// <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2733408722U,	// <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2658612153U,	// <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+  2289592250U,	// <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+  538149533U,	// <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1189464214U,	// <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  1611891508U,	// <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611891606U,	// <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  1189464476U,	// <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1189464578U,	// <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2690278511U,	// <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2690278607U,	// <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2287609786U,	// <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+  1611892092U,	// <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2685634042U,	// <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+  2685634079U,	// <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611892328U,	// <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611892390U,	// <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2685634371U,	// <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+  2685634453U,	// <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+  1611892666U,	// <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2300225466U,	// <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+  1611892795U,	// <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+  1209238422U,	// <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282980247U,	// <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+  1561004120U,	// <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+  403488870U,	// <u,3,3,3>: Cost 1 vspltisw3 LHS
+  1209238426U,	// <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282980899U,	// <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2282985598U,	// <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1209239482U,	// <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  403488870U,	// <u,3,3,u>: Cost 1 vspltisw3 LHS
+  1555038310U,	// <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+  1555039616U,	// <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+  2628781672U,	// <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+  2289624690U,	// <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+  1555041590U,	// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+  538152246U,	// <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2658644925U,	// <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+  2289625018U,	// <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+  538152489U,	// <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1192446102U,	// <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2733411983U,	// <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2634762330U,	// <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+  1192446364U,	// <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1192446466U,	// <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  1659670532U,	// <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659670626U,	// <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  2287642554U,	// <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+  1659670788U,	// <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2634768486U,	// <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+  2733412775U,	// <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1648390659U,	// <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+  2634770973U,	// <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+  2634771766U,	// <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+  2733413099U,	// <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659671352U,	// <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659671374U,	// <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1652372457U,	// <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+  1561034854U,	// <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  2634777396U,	// <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+  1561036892U,	// <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+  1235814002U,	// <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1561038134U,	// <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+  2309555747U,	// <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2309556072U,	// <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+  1235814330U,	// <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1561040686U,	// <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  1611896531U,	// <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+  538154798U,	// <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1611896712U,	// <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+  403488870U,	// <u,3,u,3>: Cost 1 vspltisw3 LHS
+  1611896895U,	// <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+  538155162U,	// <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1611897040U,	// <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1209280442U,	// <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  538155365U,	// <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  1165118354U,	// <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1618534502U,	// <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  2634795102U,	// <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+  2686451968U,	// <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2692276562U,	// <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+  1705438098U,	// <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658685890U,	// <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+  2256489928U,	// <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1618535069U,	// <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1189464978U,	// <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2692277044U,	// <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+  1618535367U,	// <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+  2640775992U,	// <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+  1189465296U,	// <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  115723574U,	// <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2263207289U,	// <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2664666780U,	// <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+  115723817U,	// <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+  2263919506U,	// <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+  2222115812U,	// <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  2692277864U,	// <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+  2692277926U,	// <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+  2324114640U,	// <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+  1190178102U,	// <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278202U,	// <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+  2701568053U,	// <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+  1190178345U,	// <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278422U,	// <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+  2282981552U,	// <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2704222585U,	// <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+  2692278684U,	// <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+  1257016528U,	// <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1209239246U,	// <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2691910300U,	// <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  2664683166U,	// <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+  1209239249U,	// <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  1573027942U,	// <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+  2634826695U,	// <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+  2634827874U,	// <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+  2289629073U,	// <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+  229035318U,	// <u,4,4,4>: Cost 1 vspltisw0 RHS
+  1618537782U,	// <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+  2658718662U,	// <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+  2289629401U,	// <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+  229035318U,	// <u,4,4,u>: Cost 1 vspltisw0 RHS
+  1561092198U,	// <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+  2628863370U,	// <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+  1561094243U,	// <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+  2634836118U,	// <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+  1561095478U,	// <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+  118705462U,	// <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+  604859702U,	// <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2658726906U,	// <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+  604859720U,	// <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2266631058U,	// <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+  2302692152U,	// <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  2718822906U,	// <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+  2722804309U,	// <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+  2723467942U,	// <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+  1192889654U,	// <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2718823224U,	// <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+  2718823246U,	// <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+  1192889897U,	// <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2640822374U,	// <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+  2640823194U,	// <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+  2728113373U,	// <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+  2640825150U,	// <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+  1235815632U,	// <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235814094U,	// <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  2730767905U,	// <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+  2309556892U,	// <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235814097U,	// <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  1561116774U,	// <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+  1618540334U,	// <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1561118822U,	// <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+  2692282300U,	// <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+  229035318U,	// <u,4,u,4>: Cost 1 vspltisw0 RHS
+  120368438U,	// <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+  604859945U,	// <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2309565084U,	// <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  604859963U,	// <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2690293760U,	// <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+  1616552038U,	// <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2640840434U,	// <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+  2640841536U,	// <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+  1613381970U,	// <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2316135642U,	// <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+  2289592834U,	// <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+  2664732324U,	// <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+  1616552661U,	// <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+  1573077094U,	// <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  1237536282U,	// <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2690294678U,	// <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+  2646821014U,	// <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+  1573080602U,	// <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+  1189466116U,	// <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1189466210U,	// <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2646823930U,	// <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+  1573082926U,	// <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  2640855142U,	// <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+  2697594448U,	// <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+  2690295400U,	// <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+  1625179890U,	// <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+  2699585347U,	// <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+  2781171471U,	// <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2690295738U,	// <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+  3775318070U,	// <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+  1628498055U,	// <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+  2287627234U,	// <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1257016210U,	// <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2646836942U,	// <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+  2287625131U,	// <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287627238U,	// <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1257016538U,	// <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1209240066U,	// <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287625459U,	// <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1209240068U,	// <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  2640871526U,	// <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+  2316168082U,	// <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+  2640873202U,	// <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+  2640874308U,	// <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+  1637788917U,	// <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+  1616555318U,	// <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  2287638591U,	// <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+  2664765096U,	// <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+  1616555561U,	// <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  1573109862U,	// <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+  2646852404U,	// <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+  2646853224U,	// <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+  2287646618U,	// <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+  1573113374U,	// <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+  296144182U,	// <u,5,5,5>: Cost 1 vspltisw1 RHS
+  1192448098U,	// <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2287646946U,	// <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+  296144182U,	// <u,5,5,u>: Cost 1 vspltisw1 RHS
+  1567146086U,	// <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+  2628945300U,	// <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+  2634917997U,	// <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+  1567148870U,	// <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+  1567149366U,	// <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+  2781171799U,	// <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  1228950018U,	// <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  2628952166U,	// <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+  1235815314U,	// <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309556734U,	// <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309555115U,	// <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2628955446U,	// <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+  1235815642U,	// <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235814914U,	// <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309555443U,	// <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235814916U,	// <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  1567162470U,	// <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+  1616557870U,	// <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2690299781U,	// <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+  1567165256U,	// <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+  1567165750U,	// <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+  296144182U,	// <u,5,u,5>: Cost 1 vspltisw1 RHS
+  1209281026U,	// <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2705563648U,	// <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+  1631821926U,	// <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  2262462970U,	// <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+  2646886941U,	// <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+  2705563986U,	// <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+  2316062652U,	// <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316137272U,	// <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+  1215851830U,	// <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  1215851831U,	// <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+  2634948710U,	// <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+  2705564468U,	// <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+  1189466618U,	// <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2263208498U,	// <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2693620843U,	// <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+  2652868860U,	// <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+  1189466936U,	// <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1213869366U,	// <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  1213869367U,	// <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+  2658844774U,	// <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+  3771344465U,	// <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+  1178554874U,	// <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2698929907U,	// <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+  2699593540U,	// <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+  2700257173U,	// <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+  2705565626U,	// <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+  1226485046U,	// <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  1226485047U,	// <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+  2705565846U,	// <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+  2330756585U,	// <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2330756829U,	// <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+  2282981734U,	// <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  1631824413U,	// <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+  2652885246U,	// <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+  1257018168U,	// <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135499062U,	// <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135499063U,	// <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+  2646917222U,	// <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+  2217365931U,	// <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2790167156U,	// <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+  2646919709U,	// <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+  2711538934U,	// <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+  1631825206U,	// <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  2316170040U,	// <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+  1215884598U,	// <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  1215884599U,	// <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+  2634981478U,	// <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+  2266190247U,	// <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1192448506U,	// <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2266190386U,	// <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2634984758U,	// <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+  2652901632U,	// <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+  1192448824U,	// <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1213902134U,	// <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1213902135U,	// <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1583808614U,	// <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2718839290U,	// <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+  2670823965U,	// <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+  1583811894U,	// <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2724147961U,	// <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+  363253046U,	// <u,6,6,6>: Cost 1 vspltisw2 RHS
+  1229172022U,	// <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+  363253046U,	// <u,6,6,u>: Cost 1 vspltisw2 RHS
+  499458150U,	// <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1573200692U,	// <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1573201512U,	// <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573202070U,	// <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499461673U,	// <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1573203972U,	// <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1235817272U,	// <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  162073910U,	// <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+  162073911U,	// <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+  499466342U,	// <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631827758U,	// <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  1573209704U,	// <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573210262U,	// <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499469866U,	// <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631828122U,	// <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  363253046U,	// <u,6,u,6>: Cost 1 vspltisw2 RHS
+  135540022U,	// <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135540023U,	// <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+  1638465536U,	// <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564723814U,	// <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712207533U,	// <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712207612U,	// <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638465874U,	// <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1579192580U,	// <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+  2712207862U,	// <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2316137282U,	// <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+  564724381U,	// <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  1189467130U,	// <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  1638466356U,	// <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638466454U,	// <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  2311500282U,	// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+  1189467494U,	// <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2712208495U,	// <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2694956302U,	// <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+  1189467756U,	// <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1638466940U,	// <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712208829U,	// <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712208927U,	// <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638467176U,	// <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638467238U,	// <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712209165U,	// <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712209256U,	// <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1627187175U,	// <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+  2324116290U,	// <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+  1628514441U,	// <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+  1638467734U,	// <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712209638U,	// <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2700929387U,	// <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+  1638467996U,	// <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638468098U,	// <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712210002U,	// <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  1585189856U,	// <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+  1257018178U,	// <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1638468382U,	// <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638468498U,	// <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712210378U,	// <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712210485U,	// <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712210564U,	// <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638468816U,	// <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564727112U,	// <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712210809U,	// <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2712210888U,	// <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+  564727337U,	// <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  1192449018U,	// <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2714201743U,	// <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712211198U,	// <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2311533050U,	// <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+  1192449382U,	// <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  1638469636U,	// <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638469730U,	// <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  1192449644U,	// <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1638469892U,	// <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712211745U,	// <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712211879U,	// <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638470138U,	// <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712212018U,	// <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712212109U,	// <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712212203U,	// <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638470456U,	// <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638470478U,	// <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638470559U,	// <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+  1235816546U,	// <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309558371U,	// <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+  2641045434U,	// <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+  1235816954U,	// <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1235816550U,	// <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309558375U,	// <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+  1585222628U,	// <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+  430361910U,	// <u,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <u,7,7,u>: Cost 1 vspltisw3 RHS
+  1638471379U,	// <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564729646U,	// <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638471557U,	// <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638471612U,	// <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638471743U,	// <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564730010U,	// <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638471888U,	// <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  430361910U,	// <u,7,u,7>: Cost 1 vspltisw3 RHS
+  564730213U,	// <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  202162278U,	// <u,u,0,0>: Cost 1 vspltisw0 LHS
+  538189985U,	// <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685673645U,	// <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  1215848604U,	// <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  1611931986U,	// <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  1579266317U,	// <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+  2289592861U,	// <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+  1215851848U,	// <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  538190493U,	// <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1549411025U,	// <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+  115726126U,	// <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+  604862254U,	// <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  1213866140U,	// <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1549413686U,	// <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+  115726490U,	// <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1585247207U,	// <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+  1213869384U,	// <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  604862308U,	// <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  1567334502U,	// <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+  1190180654U,	// <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  336380006U,	// <u,u,2,2>: Cost 1 vspltisw2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1567337782U,	// <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+  1190181018U,	// <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  1611933626U,	// <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1226485064U,	// <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  475685587U,	// <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1209239278U,	// <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+  1209239765U,	// <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135495836U,	// <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+  475688246U,	// <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1209239282U,	// <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+  1209240093U,	// <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135499080U,	// <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135495841U,	// <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1555406950U,	// <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+  1555408301U,	// <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+  2289625301U,	// <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+  1215881372U,	// <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  229035318U,	// <u,u,4,4>: Cost 1 vspltisw0 RHS
+  538193206U,	// <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2289625629U,	// <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+  1215884616U,	// <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  538193449U,	// <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1549443797U,	// <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+  118708014U,	// <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1561389191U,	// <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+  1213898908U,	// <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  1549446454U,	// <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+  118708378U,	// <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+  604862618U,	// <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  1213902152U,	// <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  604862636U,	// <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  1567367270U,	// <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+  1192892206U,	// <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1638478330U,	// <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1679046864U,	// <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  1567370550U,	// <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+  1192892570U,	// <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  363253046U,	// <u,u,6,6>: Cost 1 vspltisw2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  499605606U,	// <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1235812425U,	// <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1561405577U,	// <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+  162070684U,	// <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+  499609147U,	// <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1235812753U,	// <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235814941U,	// <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  162073928U,	// <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+  162070689U,	// <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+  475726552U,	// <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  538195758U,	// <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  604862821U,	// <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  475729206U,	// <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  538196122U,	// <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  604862861U,	// <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
new file mode 100644
index 000000000000..8a18ab9e0e9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp
@@ -0,0 +1,166 @@
+//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The QPX vector registers overlay the scalar floating-point registers, and
+// any scalar floating-point loads splat their value across all vector lanes.
+// Thus, if we have a scalar load followed by a splat, we can remove the splat
+// (i.e. replace the load with a load-and-splat pseudo instruction).
+//
+// This pass must run after anything that might do store-to-load forwarding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-qpx-load-splat"
+
+STATISTIC(NumSimplified, "Number of QPX load splats simplified");
+
+namespace llvm {
+  void initializePPCQPXLoadSplatPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCQPXLoadSplat : public MachineFunctionPass {
+    static char ID;
+    PPCQPXLoadSplat() : MachineFunctionPass(ID) {
+      initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    StringRef getPassName() const override {
+      return "PowerPC QPX Load Splat Simplification";
+    }
+  };
+  char PPCQPXLoadSplat::ID = 0;
+}
+
+INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat",
+                "PowerPC QPX Load Splat Simplification",
+                false, false)
+
+FunctionPass *llvm::createPPCQPXLoadSplatPass() {
+  return new PPCQPXLoadSplat();
+}
+
+bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  bool MadeChange = false;
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) {
+    MachineBasicBlock *MBB = &*MFI;
+    SmallVector<MachineInstr *, 4> Splats;
+
+    for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) {
+      MachineInstr *MI = &*MBBI;
+
+      if (MI->hasUnmodeledSideEffects() || MI->isCall()) {
+        Splats.clear();
+        continue;
+      }
+
+      // We're looking for a sequence like this:
+      // %F0<def> = LFD 0, %X3<kill>, %QF0<imp-def>; mem:LD8[%a](tbaa=!2)
+      // %QF1<def> = QVESPLATI %QF0<kill>, 0, %RM<imp-use>
+
+      for (auto SI = Splats.begin(); SI != Splats.end();) {
+        MachineInstr *SMI = *SI;
+        unsigned SplatReg = SMI->getOperand(0).getReg();
+        unsigned SrcReg = SMI->getOperand(1).getReg();
+
+        if (MI->modifiesRegister(SrcReg, TRI)) {
+          switch (MI->getOpcode()) {
+          default:
+            SI = Splats.erase(SI);
+            continue;
+          case PPC::LFS:
+          case PPC::LFD:
+          case PPC::LFSU:
+          case PPC::LFDU:
+          case PPC::LFSUX:
+          case PPC::LFDUX:
+          case PPC::LFSX:
+          case PPC::LFDX:
+          case PPC::LFIWAX:
+          case PPC::LFIWZX:
+            if (SplatReg != SrcReg) {
+              // We need to change the load to define the scalar subregister of
+              // the QPX splat source register.
+              unsigned SubRegIndex =
+                TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg());
+              unsigned SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex);
+
+              // Substitute both the explicit defined register, and also the
+              // implicit def of the containing QPX register.
+              MI->getOperand(0).setReg(SplatSubReg);
+              MI->substituteRegister(SrcReg, SplatReg, 0, *TRI);
+            }
+
+            SI = Splats.erase(SI);
+
+            // If SMI is directly after MI, then MBBI's base iterator is
+            // pointing at SMI.  Adjust MBBI around the call to erase SMI to
+            // avoid invalidating MBBI.
+            ++MBBI;
+            SMI->eraseFromParent();
+            --MBBI;
+
+            ++NumSimplified;
+            MadeChange = true;
+            continue;
+          }
+        }
+
+        // If this instruction defines the splat register, then we cannot move
+        // the previous definition above it. If it reads from the splat
+        // register, then it must already be alive from some previous
+        // definition, and if the splat register is different from the source
+        // register, then this definition must not be the load for which we're
+        // searching.
+        if (MI->modifiesRegister(SplatReg, TRI) ||
+            (SrcReg != SplatReg &&
+             MI->readsRegister(SplatReg, TRI))) {
+          SI = Splats.erase(SI);
+          continue;
+        }
+
+        ++SI;
+      }
+
+      if (MI->getOpcode() != PPC::QVESPLATI &&
+          MI->getOpcode() != PPC::QVESPLATIs &&
+          MI->getOpcode() != PPC::QVESPLATIb)
+        continue;
+      if (MI->getOperand(2).getImm() != 0)
+        continue;
+
+      // If there are other uses of the scalar value after this, replacing
+      // those uses might be non-trivial.
+      if (!MI->getOperand(1).isKill())
+        continue;
+
+      Splats.push_back(MI);
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 000000000000..e49201402861
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1078 @@
+//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCRegisterInfo.h"
+#include "PPC.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reginfo"
+
+#define GET_REGINFO_TARGET_DESC
+#include "PPCGenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
+         cl::desc("Enable use of a base pointer for complex stack frames"));
+
+static cl::opt<bool>
+AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
+         cl::desc("Force the use of a base pointer in every function"));
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
+  : PPCGenRegisterInfo(TM.isPPC64() ? PPC::LR8 : PPC::LR,
+                       TM.isPPC64() ? 0 : 1,
+                       TM.isPPC64() ? 0 : 1),
+    TM(TM) {
+  ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ]  = PPC::LWZX;   ImmToIdxMap[PPC::LWA]  = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS]  = PPC::LFSX;   ImmToIdxMap[PPC::LFD]  = PPC::LFDX;
+  ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
+
+  // 64-bit
+  ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+  ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+  ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+  ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+
+  // VSX
+  ImmToIdxMap[PPC::DFLOADf32] = PPC::LXSSPX;
+  ImmToIdxMap[PPC::DFLOADf64] = PPC::LXSDX;
+  ImmToIdxMap[PPC::DFSTOREf32] = PPC::STXSSPX;
+  ImmToIdxMap[PPC::DFSTOREf64] = PPC::STXSDX;
+  ImmToIdxMap[PPC::LXV] = PPC::LXVX;
+  ImmToIdxMap[PPC::LXSD] = PPC::LXSDX;
+  ImmToIdxMap[PPC::LXSSP] = PPC::LXSSPX;
+  ImmToIdxMap[PPC::STXV] = PPC::STXVX;
+  ImmToIdxMap[PPC::STXSD] = PPC::STXSDX;
+  ImmToIdxMap[PPC::STXSSP] = PPC::STXSSPX;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                       const {
+  // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value
+  // when it checks for ZERO folding.
+  if (Kind == 1) {
+    if (TM.isPPC64())
+      return &PPC::G8RC_NOX0RegClass;
+    return &PPC::GPRC_NOR0RegClass;
+  }
+
+  if (TM.isPPC64())
+    return &PPC::G8RCRegClass;
+  return &PPC::GPRCRegClass;
+}
+
+const MCPhysReg*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_SaveList;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_SaveList;
+    return CSR_64_AllRegs_SaveList;
+  }
+
+  if (Subtarget.isDarwinABI())
+    return TM.isPPC64()
+               ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_SaveList
+                                         : CSR_Darwin64_SaveList)
+               : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList
+                                         : CSR_Darwin32_SaveList);
+
+  if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+    return CSR_SRV464_TLS_PE_SaveList;
+
+  // On PPC64, we might need to save r2 (but only if it is not reserved).
+  bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+
+  return TM.isPPC64()
+             ? (Subtarget.hasAltivec()
+                    ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
+                              : CSR_SVR464_Altivec_SaveList)
+                    : (SaveR2 ? CSR_SVR464_R2_SaveList : CSR_SVR464_SaveList))
+             : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_SaveList
+                                       : CSR_SVR432_SaveList);
+}
+
+const MCPhysReg *
+PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
+  if (Subtarget.isDarwinABI())
+    return nullptr;
+  if (!TM.isPPC64())
+    return nullptr;
+  if (MF->getFunction()->getCallingConv() != CallingConv::CXX_FAST_TLS)
+    return nullptr;
+  if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
+    return nullptr;
+
+  // On PPC64, we might need to save r2 (but only if it is not reserved).
+  bool SaveR2 = !getReservedRegs(*MF).test(PPC::X2);
+  if (Subtarget.hasAltivec())
+    return SaveR2
+      ? CSR_SVR464_R2_Altivec_ViaCopy_SaveList
+      : CSR_SVR464_Altivec_ViaCopy_SaveList;
+  else
+    return SaveR2
+      ? CSR_SVR464_R2_ViaCopy_SaveList
+      : CSR_SVR464_ViaCopy_SaveList;
+}
+
+const uint32_t *
+PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (CC == CallingConv::AnyReg) {
+    if (Subtarget.hasVSX())
+      return CSR_64_AllRegs_VSX_RegMask;
+    if (Subtarget.hasAltivec())
+      return CSR_64_AllRegs_Altivec_RegMask;
+    return CSR_64_AllRegs_RegMask;
+  }
+
+  if (Subtarget.isDarwinABI())
+    return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_RegMask
+                                                  : CSR_Darwin64_RegMask)
+                        : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
+                                                  : CSR_Darwin32_RegMask);
+
+  return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
+                                                : CSR_SVR464_RegMask)
+                      : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
+                                                : CSR_SVR432_RegMask);
+}
+
+const uint32_t*
+PPCRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+void PPCRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+  for (unsigned PseudoReg : {PPC::ZERO, PPC::ZERO8, PPC::RM})
+    Mask[PseudoReg / 32] &= ~(1u << (PseudoReg % 32));
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+
+  // The ZERO register is not really a register, but the representation of r0
+  // when used in instructions that treat r0 as the constant 0.
+  Reserved.set(PPC::ZERO);
+  Reserved.set(PPC::ZERO8);
+
+  // The FP register is also not really a register, but is the representation
+  // of the frame pointer register used by ISD::FRAMEADDR.
+  Reserved.set(PPC::FP);
+  Reserved.set(PPC::FP8);
+
+  // The BP register is also not really a register, but is the representation
+  // of the base pointer register used by setjmp.
+  Reserved.set(PPC::BP);
+  Reserved.set(PPC::BP8);
+
+  // The counter registers must be reserved so that counter-based loops can
+  // be correctly formed (and the mtctr instructions are not DCE'd).
+  Reserved.set(PPC::CTR);
+  Reserved.set(PPC::CTR8);
+
+  Reserved.set(PPC::R1);
+  Reserved.set(PPC::LR);
+  Reserved.set(PPC::LR8);
+  Reserved.set(PPC::RM);
+
+  if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
+    Reserved.set(PPC::VRSAVE);
+
+  // The SVR4 ABI reserves r2 and r13
+  if (Subtarget.isSVR4ABI()) {
+    Reserved.set(PPC::R2);  // System-reserved register
+    Reserved.set(PPC::R13); // Small Data Area pointer register
+  }
+
+  // On PPC64, r13 is the thread pointer. Never allocate this register.
+  if (TM.isPPC64()) {
+    Reserved.set(PPC::R13);
+
+    Reserved.set(PPC::X1);
+    Reserved.set(PPC::X13);
+
+    if (TFI->needsFP(MF))
+      Reserved.set(PPC::X31);
+
+    if (hasBasePointer(MF))
+      Reserved.set(PPC::X30);
+
+    // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+    if (Subtarget.isSVR4ABI()) {
+      // We only reserve r2 if we need to use the TOC pointer. If we have no
+      // explicit uses of the TOC pointer (meaning we're a leaf function with
+      // no constant-pool loads, etc.) and we have no potential uses inside an
+      // inline asm block, then we can treat r2 has an ordinary callee-saved
+      // register.
+      const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+      if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+        Reserved.set(PPC::X2);
+      else
+        Reserved.reset(PPC::R2);
+    }
+  }
+
+  if (TFI->needsFP(MF))
+    Reserved.set(PPC::R31);
+
+  bool IsPositionIndependent = TM.isPositionIndependent();
+  if (hasBasePointer(MF)) {
+    if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
+      Reserved.set(PPC::R29);
+    else
+      Reserved.set(PPC::R30);
+  }
+
+  if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
+    Reserved.set(PPC::R30);
+
+  // Reserve Altivec registers when Altivec is unavailable.
+  if (!Subtarget.hasAltivec())
+    for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
+         IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+      Reserved.set(*I);
+
+  return Reserved;
+}
+
+unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                              MachineFunction &MF) const {
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  const unsigned DefaultSafety = 1;
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case PPC::G8RC_NOX0RegClassID:
+  case PPC::GPRC_NOR0RegClassID:
+  case PPC::G8RCRegClassID:
+  case PPC::GPRCRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 32 - FP - DefaultSafety;
+  }
+  case PPC::F8RCRegClassID:
+  case PPC::F4RCRegClassID:
+  case PPC::QFRCRegClassID:
+  case PPC::QSRCRegClassID:
+  case PPC::QBRCRegClassID:
+  case PPC::VRRCRegClassID:
+  case PPC::VFRCRegClassID:
+  case PPC::VSLRCRegClassID:
+    return 32 - DefaultSafety;
+  case PPC::VSRCRegClassID:
+  case PPC::VSFRCRegClassID:
+  case PPC::VSSRCRegClassID:
+    return 64 - DefaultSafety;
+  case PPC::CRRCRegClassID:
+    return 8 - DefaultSafety;
+  }
+}
+
+const TargetRegisterClass *
+PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (Subtarget.hasVSX()) {
+    // With VSX, we can inflate various sub-register classes to the full VSX
+    // register set.
+
+    if (RC == &PPC::F8RCRegClass)
+      return &PPC::VSFRCRegClass;
+    else if (RC == &PPC::VRRCRegClass)
+      return &PPC::VSRCRegClass;
+    else if (RC == &PPC::F4RCRegClass && Subtarget.hasP8Vector())
+      return &PPC::VSSRCRegClass;
+  }
+
+  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame.  The sequence of code will be in the general form
+///
+///   addi   R0, SP, \#frameSize ; get the address of the previous frame
+///   stwxu  R0, SP, Rnegsize   ; add and update the SP with the negated size
+///   addi   Rnew, SP, \#maxCalFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = TM.isPPC64();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI.getStackSize();
+
+  // Get stack alignments.
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  unsigned TargetAlign = TFI->getStackAlignment();
+  unsigned MaxAlign = MFI.getMaxAlignment();
+  assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
+         "Maximum call-frame size not sufficiently aligned");
+
+  // Determine the previous frame's address.  If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP).  Why not do an addis of the hi?
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
+  // Constructing the constant and adding would take 3 instructions.
+  // Fortunately, a frame greater than 32K is rare.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+  if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+      .addImm(0)
+      .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+
+  bool KillNegSizeReg = MI.getOperand(1).isKill();
+  unsigned NegSizeReg = MI.getOperand(1).getReg();
+
+  // Grow the stack and update the stack pointer link, then determine the
+  // address of new allocated space.
+  if (LP64) {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::X1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+      .addReg(PPC::X1)
+      .addImm(maxCallFrameSize);
+  } else {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
+    BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::R1)
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+      .addReg(PPC::R1)
+      .addImm(maxCallFrameSize);
+  }
+
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerDynamicAreaOffset(
+    MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+  DebugLoc dl = MI.getDebugLoc();
+  BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg())
+      .addImm(maxCallFrameSize);
+  MBB.erase(II);
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+///   mfcr rA                  ; Move the conditional register into GPR rA.
+///   rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+///   stw rA, FI               ; Store rA to the frame.
+///
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CR <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  // We need to store the CR in the low 4-bits of the saved value. First, issue
+  // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+      .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+  // If the saved register wasn't CR0, shift the bits left so that they are in
+  // CR0's slot.
+  if (SrcReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    // rlwinm rA, rA, ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+      .addReg(Reg1, RegState::Kill)
+      .addImm(getEncodingValue(SrcReg) * 4)
+      .addImm(0)
+      .addImm(31);
+  }
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_CR <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_CR does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+                              Reg), FrameIndex);
+
+  // If the reloaded register isn't CR0, shift the bits right so that they are
+  // in the right CR's slot.
+  if (DestReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    unsigned ShiftBits = getEncodingValue(DestReg)*4;
+    // rlwinm r11, r11, 32-ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+             .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0)
+             .addImm(31);
+  }
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
+                                         unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CRBIT <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL),
+          getCRFromCRBit(SrcReg))
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
+      .addReg(getCRFromCRBit(SrcReg));
+
+  // If the saved register wasn't CR0LT, shift the bits left so that the bit to
+  // store is the first one. Mask all but that bit.
+  unsigned Reg1 = Reg;
+  Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+  // rlwinm rA, rA, ShiftBits, 0, 0.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+    .addReg(Reg1, RegState::Kill)
+    .addImm(getEncodingValue(SrcReg))
+    .addImm(0).addImm(0);
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_CRBIT <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_CRBIT does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+                              Reg), FrameIndex);
+
+  BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg);
+
+  unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
+          .addReg(getCRFromCRBit(DestReg));
+
+  unsigned ShiftBits = getEncodingValue(DestReg);
+  // rlwimi r11, r10, 32-ShiftBits, ..., ...
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
+      .addReg(RegO, RegState::Kill)
+      .addReg(Reg, RegState::Kill)
+      .addImm(ShiftBits ? 32 - ShiftBits : 0)
+      .addImm(ShiftBits)
+      .addImm(ShiftBits);
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
+          getCRFromCRBit(DestReg))
+      .addReg(RegO, RegState::Kill)
+      // Make sure we have a use dependency all the way through this
+      // sequence of instructions. We can't have the other bits in the CR
+      // modified in between the mfocrf and the mtocrf.
+      .addReg(getCRFromCRBit(DestReg), RegState::Implicit);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                                          unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_VRSAVE <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
+      .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+
+  addFrameReference(
+      BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill),
+      FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                                         unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_VRSAVE <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_VRSAVE does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
+                              Reg), FrameIndex);
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+                                           unsigned Reg, int &FrameIdx) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+  // ABI, return true to prevent allocating an additional frame slot.
+  // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+  // is arbitrary and will be subsequently ignored.  For 32-bit, we have
+  // previously created the stack slot if needed, so return its FrameIdx.
+  if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+    if (TM.isPPC64())
+      FrameIdx = 0;
+    else {
+      const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+      FrameIdx = FI->getCRSpillFrameIndex();
+    }
+    return true;
+  }
+  return false;
+}
+
+// Figure out if the offset in the instruction must be a multiple of 4.
+// This is true for instructions like "STD".
+static bool usesIXAddr(const MachineInstr &MI) {
+  unsigned OpC = MI.getOpcode();
+
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::LWA:
+  case PPC::LWA_32:
+  case PPC::LD:
+  case PPC::STD:
+    return true;
+  }
+}
+
+// Return the OffsetOperandNo given the FIOperandNum (and the instruction).
+static unsigned getOffsetONFromFION(const MachineInstr &MI,
+                                    unsigned FIOperandNum) {
+  // Take into account whether it's an add or mem instruction
+  unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2;
+  if (MI.isInlineAsm())
+    OffsetOperandNo = FIOperandNum - 1;
+  else if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+           MI.getOpcode() == TargetOpcode::PATCHPOINT)
+    OffsetOperandNo = FIOperandNum + 1;
+
+  return OffsetOperandNo;
+}
+
+void
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  // Get the frame info.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+
+  // Get the frame index.
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Get the frame pointer save index.  Users of this index are primarily
+  // DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+  // Get the instruction opcode.
+  unsigned OpC = MI.getOpcode();
+
+  if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) {
+    lowerDynamicAreaOffset(II);
+    return;
+  }
+
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II);
+    return;
+  }
+
+  // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
+  if (OpC == PPC::SPILL_CR) {
+    lowerCRSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CR) {
+    lowerCRRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_CRBIT) {
+    lowerCRBitSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CRBIT) {
+    lowerCRBitRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_VRSAVE) {
+    lowerVRSAVESpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_VRSAVE) {
+    lowerVRSAVERestore(II, FrameIndex);
+    return;
+  }
+
+  // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+  MI.getOperand(FIOperandNum).ChangeToRegister(
+    FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
+
+  // Figure out if the offset in the instruction is shifted right two bits.
+  bool isIXAddr = usesIXAddr(MI);
+
+  // If the instruction is not present in ImmToIdxMap, then it has no immediate
+  // form (and must be r+r).
+  bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+                   OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
+
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI.getObjectOffset(FrameIndex);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack size
+  // to Offset to get the correct offset.
+  // Naked functions have stack size 0, although getStackSize may not reflect
+  // that because we didn't call all the pieces that compute it for naked
+  // functions.
+  if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) {
+    if (!(hasBasePointer(MF) && FrameIndex < 0))
+      Offset += MFI.getStackSize();
+  }
+
+  // If we can, encode the offset directly into the instruction.  If this is a
+  // normal PPC "ri" instruction, any 16-bit value can be safely encoded.  If
+  // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+  // clear can be encoded.  This is extremely uncommon, because normally you
+  // only "std" to a stack slot that is at least 4-byte aligned, but it can
+  // happen in invalid code.
+  assert(OpC != PPC::DBG_VALUE &&
+         "This should be handled in a target-independent way");
+  if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) ||
+                     OpC == TargetOpcode::STACKMAP ||
+                     OpC == TargetOpcode::PATCHPOINT)) {
+    MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // The offset doesn't fit into a single register, scavenge one to build the
+  // offset in.
+
+  bool is64Bit = TM.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
+  unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+           SReg = MF.getRegInfo().createVirtualRegister(RC);
+
+  // Insert a set of rA with the full offset value before the ld, st, or add
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+    .addImm(Offset >> 16);
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+    .addReg(SRegHi, RegState::Kill)
+    .addImm(Offset);
+
+  // Convert into indexed form of the instruction:
+  //
+  //   sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
+  unsigned OperandBase;
+
+  if (noImmForm)
+    OperandBase = 1;
+  else if (OpC != TargetOpcode::INLINEASM) {
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setDesc(TII.get(NewOpcode));
+    OperandBase = 1;
+  } else {
+    OperandBase = OffsetOperandNo;
+  }
+
+  unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+  MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+  MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+
+  if (!TM.isPPC64())
+    return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  if (!hasBasePointer(MF))
+    return getFrameRegister(MF);
+
+  if (TM.isPPC64())
+    return PPC::X30;
+
+  if (Subtarget.isSVR4ABI() && TM.isPositionIndependent())
+    return PPC::R29;
+
+  return PPC::R30;
+}
+
+bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  if (!EnableBasePointer)
+    return false;
+  if (AlwaysBasePointer)
+    return true;
+
+  // If we need to realign the stack, then the stack pointer can no longer
+  // serve as an offset into the caller's stack space. As a result, we need a
+  // base pointer.
+  return needsStackRealignment(MF);
+}
+
+/// Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool PPCRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  assert(Offset < 0 && "Local offset must be negative");
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores that have
+  // an r+i form. Return false for everything else.
+  unsigned OpC = MI->getOpcode();
+  if (!ImmToIdxMap.count(OpC))
+    return false;
+
+  // Don't generate a new virtual base register just to add zero to it.
+  if ((OpC == PPC::ADDI || OpC == PPC::ADDI8) &&
+      MI->getOperand(2).getImm() == 0)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCFrameLowering *TFI = getFrameLowering(MF);
+  unsigned StackEst = TFI->determineFrameLayout(MF, false, true);
+
+  // If we likely don't need a stack frame, then we probably don't need a
+  // virtual base register either.
+  if (!StackEst)
+    return false;
+
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += StackEst;
+
+  // The frame pointer will point to the end of the stack, so estimate the
+  // offset as the difference between the object offset and the FP location.
+  return !isFrameOffsetLegal(MI, getBaseRegister(MF), Offset);
+}
+
+/// Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void PPCRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  unsigned ADDriOpc = TM.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MachineFunction &MF = *MBB->getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
+}
+
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                        int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI.getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
+  unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
+  MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  const MCInstrDesc &MCID = MI.getDesc();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.constrainRegClass(BaseReg,
+                        TII.getRegClass(MCID, FIOperandNum, this, MF));
+}
+
+bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                         unsigned BaseReg,
+                                         int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI->getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
+  Offset += MI->getOperand(OffsetOperandNo).getImm();
+
+  return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+         MI->getOpcode() == TargetOpcode::STACKMAP ||
+         MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+         (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 000000000000..4a96327fe552
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,145 @@
+//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
+
+#include "PPC.h"
+#include "llvm/ADT/DenseMap.h"
+
+#define GET_REGINFO_HEADER
+#include "PPCGenRegisterInfo.inc"
+
+namespace llvm {
+
+inline static unsigned getCRFromCRBit(unsigned SrcReg) {
+  unsigned Reg = 0;
+  if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
+      SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
+    Reg = PPC::CR0;
+  else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
+           SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
+    Reg = PPC::CR1;
+  else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
+           SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
+    Reg = PPC::CR2;
+  else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
+           SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
+    Reg = PPC::CR3;
+  else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
+           SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
+    Reg = PPC::CR4;
+  else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
+           SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
+    Reg = PPC::CR5;
+  else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
+           SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
+    Reg = PPC::CR6;
+  else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
+           SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
+    Reg = PPC::CR7;
+
+  assert(Reg != 0 && "Invalid CR bit register");
+  return Reg;
+}
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+  DenseMap<unsigned, unsigned> ImmToIdxMap;
+  const PPCTargetMachine &TM;
+
+public:
+  PPCRegisterInfo(const PPCTargetMachine &TM);
+
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID CC) const override;
+  const uint32_t *getNoPreservedMask() const override;
+
+  void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  /// We require the register scavenger.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
+    return true;
+  }
+
+  void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+  void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
+  void lowerCRSpilling(MachineBasicBlock::iterator II,
+                       unsigned FrameIndex) const;
+  void lowerCRRestore(MachineBasicBlock::iterator II,
+                      unsigned FrameIndex) const;
+  void lowerCRBitSpilling(MachineBasicBlock::iterator II,
+                          unsigned FrameIndex) const;
+  void lowerCRBitRestore(MachineBasicBlock::iterator II,
+                         unsigned FrameIndex) const;
+  void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                           unsigned FrameIndex) const;
+  void lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                          unsigned FrameIndex) const;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+                            int &FrameIdx) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Support for virtual base registers.
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+                          int64_t Offset) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  // Base pointer (stack realignment) support.
+  unsigned getBaseRegister(const MachineFunction &MF) const;
+  bool hasBasePointer(const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 000000000000..896cec7e4f6e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,352 @@
+//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "PPC" in {
+def sub_lt : SubRegIndex<1>;
+def sub_gt : SubRegIndex<1, 1>;
+def sub_eq : SubRegIndex<1, 2>;
+def sub_un : SubRegIndex<1, 3>;
+def sub_32 : SubRegIndex<32>;
+def sub_64 : SubRegIndex<64>;
+}
+
+
+class PPCReg<string n> : Register<n> {
+  let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_32];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+  let HWEncoding{9-0} = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX)
+class QFPR<FPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
+// VF - One of the 32 64-bit floating-point subregisters of the vector
+// registers (used by VSX).
+class VF<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+  let HWEncoding{5} = 1;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<VF SubReg, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+  let HWEncoding{5} = 0;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
+// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
+// floating-point registers.
+class VSRL<FPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_64];
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// General-purpose registers
+foreach Index = 0-31 in {
+  def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
+}
+
+// 64-bit General-purpose registers
+foreach Index = 0-31 in {
+  def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>,
+                    DwarfRegNum<[Index, -2]>;
+}
+
+// Floating-point registers
+foreach Index = 0-31 in {
+  def F#Index : FPR<Index, "f"#Index>,
+                DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// 64-bit Floating-point subregisters of Altivec registers
+// Note: the register names are v0-v31 or vs32-vs63 depending on the use.
+//       Custom C++ code is used to produce the correct name and encoding.
+foreach Index = 0-31 in {
+  def VF#Index : VF<Index, "v" #Index>,
+                 DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// QPX Floating-point registers
+foreach Index = 0-31 in {
+  def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>,
+                 DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// Vector registers
+foreach Index = 0-31 in {
+  def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+                DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// VSX registers
+foreach Index = 0-31 in {
+  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+                  DwarfRegAlias<!cast<FPR>("F"#Index)>;
+}
+
+// Dummy VSX registers, this defines string: "vs32"-"vs63", and is only used for
+// asm printing.
+foreach Index = 32-63 in {
+  def VSX#Index : PPCReg<"vs"#Index>;
+}
+
+// The reprsentation of r0 when treated as the constant 0.
+def ZERO  : GPR<0, "0">,    DwarfRegAlias<R0>;
+def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
+
+// Representations of the frame pointer used by ISD::FRAMEADDR.
+def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8  : GP8<FP, "**FRAME POINTER**">;
+
+// Representations of the base pointer used by setjmp.
+def BP   : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8  : GP8<BP, "**BASE POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+
+// Carry bit.  In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1);  this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">, DwarfRegNum<[76]>;
+
+// FP rounding mode:  bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs.  The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers, it has to be done in code; to make this work all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+def RM: PPCReg<"**ROUNDING MODE**">;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, FP, BP)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub GPRC, R2), R2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+                                                (sequence "X%u", 30, 14),
+                                                X31, X13, X0, X1, FP8, BP8)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub G8RC, X2), X2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
+
+// For some instructions r0 is special (representing the value 0 instead of
+// the value in the r0 register), and we use these register subclasses to
+// prevent r0 from being allocated for use by those instructions.
+def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub GPRC_NOR0, R2), R2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
+
+def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
+  // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+  // put it at the end of the list.
+  let AltOrders = [(add (sub G8RC_NOX0, X2), X2)];
+  let AltOrderSelect = [{
+    const PPCSubtarget &S = MF.getSubtarget<PPCSubtarget>();
+    return S.isPPC64() && S.isSVR4ABI();
+  }];
+}
+
+// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
+// ABI the size of the Floating-point register save area is determined by the
+// allocated non-volatile register with the lowest register number, as FP
+// register N is spilled to offset 8 * (32 - N) below the back chain word of the
+// previous stack frame. By allocating non-volatiles in reverse order we make
+// sure that the Floating-point register save area is always as small as
+// possible because there aren't any unused spill slots.
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+                                                (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64], 128,
+                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
+
+// VSX register classes (the allocation order mirrors that of the corresponding
+// subregister classes).
+def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+                          (add (sequence "VSL%u", 0, 13),
+                               (sequence "VSL%u", 31, 14))>;
+def VSRC  : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
+                          (add VSLRC, VRRC)>;
+
+// Register classes for the 64-bit "scalar" VSX subregisters.
+def VFRC :  RegisterClass<"PPC", [f64], 64,
+                          (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
+                               VF8, VF9, VF10, VF11, VF12, VF13, VF14,
+                               VF15, VF16, VF17, VF18, VF19, VF31, VF30,
+                               VF29, VF28, VF27, VF26, VF25, VF24, VF23,
+                               VF22, VF21, VF20)>;
+def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+
+// Register class for single precision scalars in VSX registers
+def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
+
+// For QPX
+def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13),
+                                                (sequence "QF%u", 31, 14))>;
+def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>;
+def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> {
+  // These are actually stored as floating-point values where a positive
+  // number is true and anything else (including NaN) is false.
+  let Size = 256;
+}
+
+def CRBITRC : RegisterClass<"PPC", [i1], 32,
+  (add CR2LT, CR2GT, CR2EQ, CR2UN,
+       CR3LT, CR3GT, CR3EQ, CR3UN,
+       CR4LT, CR4GT, CR4EQ, CR4UN,
+       CR5LT, CR5GT, CR5EQ, CR5UN,
+       CR6LT, CR6GT, CR6EQ, CR6UN,
+       CR7LT, CR7GT, CR7EQ, CR7UN,
+       CR1LT, CR1GT, CR1EQ, CR1UN,
+       CR0LT, CR0GT, CR0EQ, CR0UN)> {
+  let Size = 32;
+}
+
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+                                                CR7, CR2, CR3, CR4)>;
+
+def CRRC0 : RegisterClass<"PPC", [i32], 32, (add CR0)>;
+
+// The CTR registers are not allocatable because they're used by the
+// decrement-and-branch instructions, and thus need to stay live across
+// multiple basic blocks.
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+  let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+  let isAllocatable = 0;
+}
+
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+  let CopyCost = -1;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 000000000000..edabe7748673
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,135 @@
+//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IIC_IntSimple    : InstrItinClass;
+def IIC_IntGeneral   : InstrItinClass;
+def IIC_IntCompare   : InstrItinClass;
+def IIC_IntISEL      : InstrItinClass;
+def IIC_IntDivD      : InstrItinClass;
+def IIC_IntDivW      : InstrItinClass;
+def IIC_IntMFFS      : InstrItinClass;
+def IIC_IntMFVSCR    : InstrItinClass;
+def IIC_IntMTFSB0    : InstrItinClass;
+def IIC_IntMTSRD     : InstrItinClass;
+def IIC_IntMulHD     : InstrItinClass;
+def IIC_IntMulHW     : InstrItinClass;
+def IIC_IntMulHWU    : InstrItinClass;
+def IIC_IntMulLI     : InstrItinClass;
+def IIC_IntRFID      : InstrItinClass;
+def IIC_IntRotateD   : InstrItinClass;
+def IIC_IntRotateDI  : InstrItinClass;
+def IIC_IntRotate    : InstrItinClass;
+def IIC_IntShift     : InstrItinClass;
+def IIC_IntTrapD     : InstrItinClass;
+def IIC_IntTrapW     : InstrItinClass;
+def IIC_BrB          : InstrItinClass;
+def IIC_BrCR         : InstrItinClass;
+def IIC_BrMCR        : InstrItinClass;
+def IIC_BrMCRX       : InstrItinClass;
+def IIC_LdStDCBA     : InstrItinClass;
+def IIC_LdStDCBF     : InstrItinClass;
+def IIC_LdStDCBI     : InstrItinClass;
+def IIC_LdStLoad     : InstrItinClass;
+def IIC_LdStLoadUpd  : InstrItinClass;
+def IIC_LdStLoadUpdX : InstrItinClass;
+def IIC_LdStStore    : InstrItinClass;
+def IIC_LdStStoreUpd : InstrItinClass;
+def IIC_LdStDSS      : InstrItinClass;
+def IIC_LdStICBI     : InstrItinClass;
+def IIC_LdStLD       : InstrItinClass;
+def IIC_LdStLDU      : InstrItinClass;
+def IIC_LdStLDUX     : InstrItinClass;
+def IIC_LdStLDARX    : InstrItinClass;
+def IIC_LdStLFD      : InstrItinClass;
+def IIC_LdStLFDU     : InstrItinClass;
+def IIC_LdStLFDUX    : InstrItinClass;
+def IIC_LdStLHA      : InstrItinClass;
+def IIC_LdStLHAU     : InstrItinClass;
+def IIC_LdStLHAUX    : InstrItinClass;
+def IIC_LdStLMW      : InstrItinClass;
+def IIC_LdStLVecX    : InstrItinClass;
+def IIC_LdStLWA      : InstrItinClass;
+def IIC_LdStLWARX    : InstrItinClass;
+def IIC_LdStSLBIA    : InstrItinClass;
+def IIC_LdStSLBIE    : InstrItinClass;
+def IIC_LdStSTD      : InstrItinClass;
+def IIC_LdStSTDCX    : InstrItinClass;
+def IIC_LdStSTDU     : InstrItinClass;
+def IIC_LdStSTDUX    : InstrItinClass;
+def IIC_LdStSTFD     : InstrItinClass;
+def IIC_LdStSTFDU    : InstrItinClass;
+def IIC_LdStSTVEBX   : InstrItinClass;
+def IIC_LdStSTWCX    : InstrItinClass;
+def IIC_LdStSync     : InstrItinClass;
+def IIC_LdStCOPY     : InstrItinClass;
+def IIC_LdStPASTE    : InstrItinClass;
+def IIC_SprISYNC     : InstrItinClass;
+def IIC_SprMFSR      : InstrItinClass;
+def IIC_SprMTMSR     : InstrItinClass;
+def IIC_SprMTSR      : InstrItinClass;
+def IIC_SprTLBSYNC   : InstrItinClass;
+def IIC_SprMFCR      : InstrItinClass;
+def IIC_SprMFCRF     : InstrItinClass;
+def IIC_SprMFMSR     : InstrItinClass;
+def IIC_SprMFSPR     : InstrItinClass;
+def IIC_SprMFTB      : InstrItinClass;
+def IIC_SprMTSPR     : InstrItinClass;
+def IIC_SprMTSRIN    : InstrItinClass;
+def IIC_SprRFI       : InstrItinClass;
+def IIC_SprSC        : InstrItinClass;
+def IIC_FPGeneral    : InstrItinClass;
+def IIC_FPAddSub     : InstrItinClass;
+def IIC_FPCompare    : InstrItinClass;
+def IIC_FPDivD       : InstrItinClass;
+def IIC_FPDivS       : InstrItinClass;
+def IIC_FPFused      : InstrItinClass;
+def IIC_FPRes        : InstrItinClass;
+def IIC_FPSqrtD      : InstrItinClass;
+def IIC_FPSqrtS      : InstrItinClass;
+def IIC_VecGeneral   : InstrItinClass;
+def IIC_VecFP        : InstrItinClass;
+def IIC_VecFPCompare : InstrItinClass;
+def IIC_VecComplex   : InstrItinClass;
+def IIC_VecPerm      : InstrItinClass;
+def IIC_VecFPRound   : InstrItinClass;
+def IIC_VecVSL       : InstrItinClass;
+def IIC_VecVSR       : InstrItinClass;
+def IIC_SprMTMSRD    : InstrItinClass;
+def IIC_SprSLIE      : InstrItinClass;
+def IIC_SprSLBIE     : InstrItinClass;
+def IIC_SprSLBIEG    : InstrItinClass;
+def IIC_SprSLBMTE    : InstrItinClass;
+def IIC_SprSLBMFEE   : InstrItinClass;
+def IIC_SprSLBMFEV   : InstrItinClass;
+def IIC_SprSLBIA     : InstrItinClass;
+def IIC_SprSLBSYNC   : InstrItinClass;
+def IIC_SprTLBIA     : InstrItinClass;
+def IIC_SprTLBIEL    : InstrItinClass;
+def IIC_SprTLBIE     : InstrItinClass;
+def IIC_SprABORT     : InstrItinClass;
+def IIC_SprMSGSYNC   : InstrItinClass;
+def IIC_SprSTOP      : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCSchedule440.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+include "PPCScheduleP7.td"
+include "PPCScheduleP8.td"
+include "PPCScheduleP9.td"
+include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
new file mode 100644
index 000000000000..2455e5e52de5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
@@ -0,0 +1,608 @@
+//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// PowerPC 440x6 Embedded Processor Core User's Manual.
+// IBM (as updated in) 2010.
+
+// The basic PPC 440 does not include a floating-point unit; the pipeline
+// timings here are constructed to match the FP2 unit shipped with the
+// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers.
+// References:
+// S. Chatterjee, et al. Design and exploitation of a high-performance
+// SIMD floating-point unit for Blue Gene/L.
+// IBM J. Res. & Dev. 49 (2/3) March/May 2005.
+// also:
+// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution:
+// Blue Gene/P Application Development.
+// IBM (as updated in) 2009.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC 440/450 chip sets
+//
+def P440_DISS1  : FuncUnit; // Issue unit 1
+def P440_DISS2  : FuncUnit; // Issue unit 2
+def P440_LRACC  : FuncUnit; // Register access and dispatch for
+                            // the simple integer (J-pipe) and
+                            // load/store (L-pipe) pipelines
+def P440_IRACC  : FuncUnit; // Register access and dispatch for
+                            // the complex integer (I-pipe) pipeline
+def P440_FRACC  : FuncUnit; // Register access and dispatch for
+                            // the floating-point execution (F-pipe) pipeline
+def P440_IEXE1  : FuncUnit; // Execution stage 1 for the I pipeline
+def P440_IEXE2  : FuncUnit; // Execution stage 2 for the I pipeline
+def P440_IWB    : FuncUnit; // Write-back unit for the I pipeline
+def P440_JEXE1  : FuncUnit; // Execution stage 1 for the J pipeline
+def P440_JEXE2  : FuncUnit; // Execution stage 2 for the J pipeline
+def P440_JWB    : FuncUnit; // Write-back unit for the J pipeline
+def P440_AGEN   : FuncUnit; // Address generation for the L pipeline
+def P440_CRD    : FuncUnit; // D-cache access for the L pipeline
+def P440_LWB    : FuncUnit; // Write-back unit for the L pipeline
+def P440_FEXE1  : FuncUnit; // Execution stage 1 for the F pipeline
+def P440_FEXE2  : FuncUnit; // Execution stage 2 for the F pipeline
+def P440_FEXE3  : FuncUnit; // Execution stage 3 for the F pipeline
+def P440_FEXE4  : FuncUnit; // Execution stage 4 for the F pipeline
+def P440_FEXE5  : FuncUnit; // Execution stage 5 for the F pipeline
+def P440_FEXE6  : FuncUnit; // Execution stage 6 for the F pipeline
+def P440_FWB    : FuncUnit; // Write-back unit for the F pipeline
+
+def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
+                                // to make sure that no lwarx/stwcx.
+                                // instructions are issued while another
+                                // lwarx/stwcx. is in the L pipe.
+
+def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs.
+
+// Notes:
+// Instructions are held in the FRACC, LRACC and IRACC pipeline
+// stages until their source operands become ready. Exceptions:
+//  - Store instructions will hold in the AGEN stage
+//  - The integer multiply-accumulate instruction will hold in
+//    the IEXE1 stage
+//
+// For most I-pipe operations, the result is available at the end of
+// the IEXE1 stage. Operations such as multiply and divide must
+// continue to execute in IEXE2 and IWB. Divide resides in IWB for
+// 33 cycles (multiply also calculates its result in IWB). For all
+// J-pipe instructions, the result is available
+// at the end of the JEXE1 stage. Loads have a 3-cycle latency
+// (data is not available until after the LWB stage).
+//
+// The L1 cache hit latency is four cycles for floating point loads
+// and three cycles for integer loads.
+//
+// The stwcx. instruction requires both the LRACC and the IRACC
+// dispatch stages. It must be issued from DISS0.
+//
+// All lwarx/stwcx. instructions hold in LRACC if another
+// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB.
+//
+// msync (a.k.a. sync) and mbar will hold in LWB until all load/store
+// resources are empty. AGEN and CRD are held empty until the msync/mbar
+// commits.
+//
+// Most floating-point instructions, computational and move,
+// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that
+// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above,
+// loads take 4 cycles (for L1 hit).
+
+//
+// This file defines the itinerary class data for the PPC 440 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPC440Itineraries : ProcessorItineraries<
+  [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2,
+   P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD,
+   P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5,
+   P440_FEXE6, P440_FWB, P440_LWARX_Hold],
+  [P440_GPR_Bypass, P440_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass, NoBypass]>,
+  InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<33, [P440_IWB]>],
+                                [36, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHW,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC, P440_LRACC]>,
+                                 InstrStage<1, [P440_IEXE1, P440_JEXE1]>,
+                                 InstrStage<1, [P440_IEXE2, P440_JEXE2]>,
+                                 InstrStage<1, [P440_IWB, P440_JWB]>],
+                                [2, 0, 0],
+                                [P440_GPR_Bypass,
+                                 P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,        [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,       [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrMCR,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0, 0],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStore,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStICBI,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFD,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [5, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [5, 2, 1, 1],
+                                [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHA,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLMW,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTD,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDU,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDUX,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<2, [P440_LWB]>],
+                                [2, 1, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDCX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTWCX,  [InstrStage<1, [P440_DISS1]>,
+                                 InstrStage<1, [P440_IRACC], 0>,
+                                 InstrStage<4, [P440_LWARX_Hold], 0>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<1, [P440_AGEN]>,
+                                 InstrStage<1, [P440_CRD]>,
+                                 InstrStage<1, [P440_LWB]>],
+                                [4, 1, 1],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_LRACC]>,
+                                 InstrStage<3, [P440_AGEN], 1>,
+                                 InstrStage<2, [P440_CRD],  1>,
+                                 InstrStage<1, [P440_LWB]>]>,
+  InstrItinData<IIC_SprISYNC,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC], 0>,
+                                 InstrStage<1, [P440_LRACC], 0>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_FEXE1], 0>,
+                                 InstrStage<1, [P440_AGEN],  0>,
+                                 InstrStage<1, [P440_JEXE1], 0>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_FEXE2], 0>,
+                                 InstrStage<1, [P440_CRD],   0>,
+                                 InstrStage<1, [P440_JEXE2], 0>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<6, [P440_FEXE3], 0>,
+                                 InstrStage<6, [P440_LWB],   0>,
+                                 InstrStage<6, [P440_JWB],   0>,
+                                 InstrStage<6, [P440_IWB]>]>,
+  InstrItinData<IIC_SprMFSR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTMSR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [2, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [5, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>]>,
+  InstrItinData<IIC_SprMFCR,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [3, 0],
+                                [P440_GPR_Bypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSRIN,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<3, [P440_IWB]>],
+                                [6, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprRFI,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_SprSC,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_IRACC]>,
+                                 InstrStage<1, [P440_IEXE1]>,
+                                 InstrStage<1, [P440_IEXE2]>,
+                                 InstrStage<1, [P440_IWB]>],
+                                [4, 0],
+                                [NoBypass, P440_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,   [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,  [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0],
+                                [P440_FPR_Bypass, P440_FPR_Bypass,
+                                 P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<25, [P440_FWB]>],
+                                [31, 0, 0],
+                                [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,     [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<13, [P440_FWB]>],
+                                [19, 0, 0],
+                                [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,    [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0, 0, 0],
+                                [P440_FPR_Bypass,
+                                 P440_FPR_Bypass, P440_FPR_Bypass,
+                                 P440_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,      [InstrStage<1, [P440_DISS1, P440_DISS2]>,
+                                 InstrStage<1, [P440_FRACC]>,
+                                 InstrStage<1, [P440_FEXE1]>,
+                                 InstrStage<1, [P440_FEXE2]>,
+                                 InstrStage<1, [P440_FEXE3]>,
+                                 InstrStage<1, [P440_FEXE4]>,
+                                 InstrStage<1, [P440_FEXE5]>,
+                                 InstrStage<1, [P440_FEXE6]>,
+                                 InstrStage<1, [P440_FWB]>],
+                                [6, 0],
+                                [P440_FPR_Bypass, P440_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// PPC440 machine model for scheduling and other instruction cost heuristics.
+
+def PPC440Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 instructions are dispatched per cycle.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let CompleteModel = 0;
+
+  let Itineraries = PPC440Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
new file mode 100644
index 000000000000..54cfae5d74b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
@@ -0,0 +1,172 @@
+//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// A2 Processor User's Manual.
+// IBM (as updated in) 2010.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC A2 chip sets
+//
+def A2_XU     : FuncUnit; // A2_XU pipeline
+def A2_FU     : FuncUnit; // FI pipeline
+
+//
+// This file defines the itinerary class data for the PPC A2 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPCA2Itineraries : ProcessorItineraries<
+  [A2_XU, A2_FU], [], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [A2_XU]>],
+                                 [39, 0, 0]>,
+  InstrItinData<IIC_IntDivD,     [InstrStage<1, [A2_XU]>],
+                                 [71, 0, 0]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateD,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_IntTrapD,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLDU,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLDUX,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0, 0]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [A2_XU]>],
+                                 [7, 0, 0]>,
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTD,     [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [A2_XU]>],
+                                 [6]>,
+  InstrItinData<IIC_SprISYNC,    [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprRFI,      [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprSC,       [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [A2_FU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [A2_FU]>],
+                                 [72, 0, 0]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [A2_FU]>],
+                                 [59, 0, 0]>,
+  InstrItinData<IIC_FPSqrtD,     [InstrStage<1, [A2_FU]>],
+                                 [69, 0, 0]>,
+  InstrItinData<IIC_FPSqrtS,     [InstrStage<1, [A2_FU]>],
+                                 [65, 0, 0]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0, 0]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [A2_FU]>],
+                                 [6, 0]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// A2 machine model for scheduling and other instruction cost heuristics.
+
+def PPCA2Model : SchedMachineModel {
+  let IssueWidth = 1;  // 1 instruction is dispatched per cycle.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13;
+
+  let CompleteModel = 0;
+
+  let Itineraries = PPCA2Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 000000000000..f687d326b52d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,321 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// Power processor.
+//
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1
+def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    Some instructions can only execute in SFX0 but not SFX1.
+//    The CFX has a bypass path, allowing non-divide instructions to execute
+//    while a divide instruction is executed.
+def E500_SFX0  : FuncUnit; // Simple unit 0
+def E500_SFX1  : FuncUnit; // Simple unit 1
+def E500_BU    : FuncUnit; // Branch unit
+def E500_CFX_DivBypass
+               : FuncUnit; // CFX divide bypass path
+def E500_CFX_0 : FuncUnit; // CFX pipeline
+def E500_LSU_0 : FuncUnit; // LSU pipeline
+def E500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E500_GPR_Bypass : Bypass;
+def E500_FPR_Bypass : Bypass;
+def E500_CR_Bypass  : Bypass;
+
+def PPCE500mcItineraries : ProcessorItineraries<
+  [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass,
+   E500_CFX_0, E500_LSU_0, E500_FPU_0],
+  [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass,
+                                  E500_CR_Bypass]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [5, 1, 1], // Latency = 1 or 2
+                                 [E500_CR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0], 0>,
+                                  InstrStage<14, [E500_CFX_DivBypass]>],
+                                 [17, 1, 1], // Latency=4..35, Repeat= 4..35
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<8, [E500_FPU_0]>],
+                                 [11], // Latency = 8
+                                 [E500_FPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<8, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8
+                                 [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_CFX_0]>],
+                                 [7, 1, 1], // Latency = 4, Repeat rate = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_SFX0]>],
+                                 [5, 1], // Latency = 2, Repeat rate = 2
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1], // Latency = 1
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_CR_Bypass,
+                                  E500_CR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_BU]>],
+                                 [4, 1], // Latency = 1
+                                 [E500_CR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1, 1], // Latency = 1
+                                 [E500_CR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3, Repeat rate = 1
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1, 1], // Latency = 4
+                                 [E500_FPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLMW,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [7, 1], // Latency = r+3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<3, [E500_LSU_0]>],
+                                 [6, 1, 1], // Latency = 3, Repeat rate = 3
+                                 [E500_GPR_Bypass,
+                                  E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>],
+                                 [6, 1], // Latency = 3
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0]>]>,
+  InstrItinData<IIC_SprMFSR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1],
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_SFX0, E500_SFX1]>],
+                                 [5, 1], // Latency = 2, Repeat rate = 4
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0]>],
+                                 [5, 1],
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC,  [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_LSU_0], 0>]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<5, [E500_SFX0]>],
+                                 [8, 1],
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<5, [E500_SFX0]>],
+                                 [8, 1],
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1], // Latency = 4, Repeat rate = 4
+                                 [E500_GPR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1], // Latency = 1, Repeat rate = 1
+                                 [E500_GPR_Bypass, E500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_SFX0]>],
+                                 [7, 1], // Latency = 4, Repeat rate = 4
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0, E500_SFX1]>],
+                                 [4, 1], // Latency = 1, Repeat rate = 1
+                                 [E500_CR_Bypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSRIN,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<1, [E500_SFX0]>],
+                                 [4, 1],
+                                 [NoBypass, E500_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8, Repeat rate = 2
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_FPU_0]>],
+                                 [13, 1, 1], // Latency = 10, Repeat rate = 4
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<2, [E500_FPU_0]>],
+                                 [11, 1, 1], // Latency = 8, Repeat rate = 2
+                                 [E500_CR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<68, [E500_FPU_0]>],
+                                 [71, 1, 1], // Latency = 68, Repeat rate = 68
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<38, [E500_FPU_0]>],
+                                 [41, 1, 1], // Latency = 38, Repeat rate = 38
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<4, [E500_FPU_0]>],
+                                 [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+                                 [E500_FPR_Bypass,
+                                  E500_FPR_Bypass, E500_FPR_Bypass,
+                                  E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                  InstrStage<38, [E500_FPU_0]>],
+                                 [41, 1], // Latency = 38, Repeat rate = 38
+                                 [E500_FPR_Bypass, E500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let CompleteModel = 0;
+
+  let Itineraries = PPCE500mcItineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 000000000000..5db886cf8f94
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,381 @@
+//===-- PPCScheduleE500mc.td - e5500 Scheduling Defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit
+// Power processor.
+//
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E5500_DIS0 : FuncUnit;
+def E5500_DIS1 : FuncUnit;
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    The CFX has a bypass path, allowing non-divide instructions to execute
+//    while a divide instruction is being executed.
+def E5500_SFX0  : FuncUnit; // Simple unit 0
+def E5500_SFX1  : FuncUnit; // Simple unit 1
+def E5500_BU    : FuncUnit; // Branch unit
+def E5500_CFX_DivBypass
+                : FuncUnit; // CFX divide bypass path
+def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1
+
+def E5500_LSU_0 : FuncUnit; // LSU pipeline
+def E5500_FPU_0 : FuncUnit; // FPU pipeline
+
+def E5500_GPR_Bypass : Bypass;
+def E5500_FPR_Bypass : Bypass;
+def E5500_CR_Bypass  : Bypass;
+
+def PPCE5500Itineraries : ProcessorItineraries<
+  [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU,
+   E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1,
+   E5500_LSU_0, E5500_FPU_0],
+  [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntISEL,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass,
+                                  E5500_CR_Bypass]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 1 or 2
+                                 [E5500_CR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<26, [E5500_CFX_DivBypass]>],
+                                 [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<16, [E5500_CFX_DivBypass]>],
+                                 [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMFFS,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass]>,
+  InstrItinData<IIC_IntMTFSB0,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<7, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 7
+                                 [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IIC_IntMulHD,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<2, [E5500_CFX_1]>],
+                                 [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<1, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<1, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0], 0>,
+                                  InstrStage<2, [E5500_CFX_1]>],
+                                 [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotateD,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5, 2, 2], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0, E5500_SFX1]>],
+                                 [6, 2, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_SFX0]>],
+                                 [6, 2], // Latency = 2, Repeat rate = 2
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2], // Latency = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_CR_Bypass,
+                                  E5500_CR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_BU]>],
+                                 [5, 2], // Latency = 1
+                                 [E5500_CR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0]>],
+                                 [5, 2, 2], // Latency = 1
+                                 [E5500_CR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLD,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLDARX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<3, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLDU,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLDUX,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [8, 2, 2], // Latency = 4, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStLMW,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_LSU_0]>],
+                                 [8, 2], // Latency = r+3, Repeat rate = r+3
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<3, [E5500_LSU_0]>],
+                                 [7, 2, 2], // Latency = 3, Repeat rate = 3
+                                 [E5500_GPR_Bypass,
+                                  E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTD,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass],
+                                 2>, // 2 micro-ops
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>],
+                                 [7, 2], // Latency = 3, Repeat rate = 1
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0]>]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_CFX_0]>],
+                                 [6, 2], // Latency = 2, Repeat rate = 4
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprTLBSYNC,  [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_LSU_0], 0>]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<5, [E5500_CFX_0]>],
+                                 [9, 2], // Latency = 5, Repeat rate = 5
+                                 [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<5, [E5500_CFX_0]>],
+                                 [9, 2], // Latency = 5, Repeat rate = 5
+                                 [E5500_GPR_Bypass, E5500_CR_Bypass]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_SFX0]>],
+                                 [8, 2], // Latency = 4, Repeat rate = 4
+                                 [E5500_GPR_Bypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_CFX_0]>],
+                                 [5], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<4, [E5500_CFX_0]>],
+                                 [8, 2], // Latency = 4, Repeat rate = 4
+                                 [NoBypass, E5500_GPR_Bypass]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_SFX0, E5500_SFX1]>],
+                                 [5], // Latency = 1, Repeat rate = 1
+                                 [E5500_GPR_Bypass]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_CR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<31, [E5500_FPU_0]>],
+                                 [39, 2, 2], // Latency = 35, Repeat rate = 31
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<16, [E5500_FPU_0]>],
+                                 [24, 2, 2], // Latency = 20, Repeat rate = 16
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<1, [E5500_FPU_0]>],
+                                 [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+                                 [E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass, E5500_FPR_Bypass,
+                                  E5500_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>,
+                                  InstrStage<2, [E5500_FPU_0]>],
+                                 [12, 2], // Latency = 8, Repeat rate = 2
+                                 [E5500_FPR_Bypass, E5500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let CompleteModel = 0;
+
+  let Itineraries = PPCE5500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 000000000000..21efd8f8f6c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,80 @@
+//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G3_BPU    : FuncUnit; // Branch unit
+def G3_SLU    : FuncUnit; // Store/load unit
+def G3_SRU    : FuncUnit; // special register unit
+def G3_IU1    : FuncUnit; // integer unit 1 (simple)
+def G3_IU2    : FuncUnit; // integer unit 2 (complex)
+def G3_FPU1   : FuncUnit; // floating point unit 1
+
+def G3Itineraries : ProcessorItineraries<
+  [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<19, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<3, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<6, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G3_IU1]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G3_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_LdStDCBA    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStStore   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<2, [G3_SLU]>]>,  
+  InstrItinData<IIC_LdStLMW     , [InstrStage<34, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<8, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<3, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<2, [G3_SRU]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<31, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<17, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<2, [G3_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<10, [G3_FPU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 000000000000..340773ef7876
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,96 @@
+//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4_BPU    : FuncUnit; // Branch unit
+def G4_SLU    : FuncUnit; // Store/load unit
+def G4_SRU    : FuncUnit; // special register unit
+def G4_IU1    : FuncUnit; // integer unit 1 (simple)
+def G4_IU2    : FuncUnit; // integer unit 2 (complex)
+def G4_FPU1   : FuncUnit; // floating point unit 1
+def G4_VPU    : FuncUnit; // vector permutation unit
+def G4_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G4_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G4_VFPU   : FuncUnit; // vector floating point unit
+
+def G4Itineraries : ProcessorItineraries<
+  [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1,
+   G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<19, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<3, [G4_FPU1]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<3, [G4_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<6, [G4_IU1]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G4_IU1]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G4_IU1, G4_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G4_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStDSS     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<2, [G4_SLU]>]>, 
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<2, [G4_SLU]>]>, 
+  InstrItinData<IIC_LdStLMW     , [InstrStage<34, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<2, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<5, [G4_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<8, [G4_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<8, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<1, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<2, [G4_SRU]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<31, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<17, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [G4_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<10, [G4_FPU1]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<4, [G4_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<3, [G4_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [G4_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<4, [G4_VFPU]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [G4_VIU1]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [G4_VIU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 000000000000..1d9f13fcb850
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,112 @@
+//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4P_BPU    : FuncUnit; // Branch unit
+def G4P_SLU    : FuncUnit; // Store/load unit
+def G4P_SRU    : FuncUnit; // special register unit
+def G4P_IU1    : FuncUnit; // integer unit 1 (simple)
+def G4P_IU2    : FuncUnit; // integer unit 2 (complex)
+def G4P_IU3    : FuncUnit; // integer unit 3 (simple)
+def G4P_IU4    : FuncUnit; // integer unit 4 (simple)
+def G4P_FPU1   : FuncUnit; // floating point unit 1
+def G4P_VPU    : FuncUnit; // vector permutation unit
+def G4P_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G4P_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G4P_VFPU   : FuncUnit; // vector floating point unit
+
+def G4PlusItineraries : ProcessorItineraries<
+  [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1,
+   G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<23, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<2, [G4P_VFPU]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<2, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G4P_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStDSS     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<4, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStLMW     , [InstrStage<37, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G4P_SLU]>]>,  
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<35, [G4P_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<0, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_SprMFSR     , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G4P_SLU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<3, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<4, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<5, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprMTSRIN   , [InstrStage<2, [G4P_IU2]>]>,
+  InstrItinData<IIC_SprRFI      , [InstrStage<1, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<0, [G4P_IU1, G4P_IU2,
+                                                  G4P_IU3, G4P_IU4]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<5, [G4P_FPU1]>]>,  
+  InstrItinData<IIC_FPCompare   , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<35, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<21, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<5, [G4P_FPU1]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<14, [G4P_FPU1]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [G4P_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<4, [G4P_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<4, [G4P_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<2, [G4P_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<4, [G4P_VIU1]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<2, [G4P_VPU]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<2, [G4P_VPU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 000000000000..b5a9f96d45ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,130 @@
+//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5_BPU    : FuncUnit; // Branch unit
+def G5_SLU    : FuncUnit; // Store/load unit
+def G5_SRU    : FuncUnit; // special register unit
+def G5_IU1    : FuncUnit; // integer unit 1 (simple)
+def G5_IU2    : FuncUnit; // integer unit 2 (complex)
+def G5_FPU1   : FuncUnit; // floating point unit 1
+def G5_FPU2   : FuncUnit; // floating point unit 2
+def G5_VPU    : FuncUnit; // vector permutation unit
+def G5_VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def G5_VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def G5_VFPU   : FuncUnit; // vector floating point unit
+
+def G5Itineraries : ProcessorItineraries<
+  [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2,
+   G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<3, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<68, [G5_IU1]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<36, [G5_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<6, [G5_IU2]>]>,
+  InstrItinData<IIC_IntMFVSCR   , [InstrStage<1, [G5_VFPU]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_IntMulHD    , [InstrStage<7, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<5, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntRFID     , [InstrStage<1, [G5_IU2]>]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<4, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<2, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [G5_IU1, G5_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G5_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<4, [G5_BPU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<2, [G5_BPU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<3, [G5_BPU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStDSS     , [InstrStage<10, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<40, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<4, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<4, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLD      , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLDARX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLMW     , [InstrStage<64, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSLBIA   , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_LdStSLBIE   , [InstrStage<2, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<35, [G5_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<10, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<8, [G5_IU2]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<1, [G5_IU2]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<5, [G5_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<3, [G5_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<3, [G5_VPU]>]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// G5 machine model for scheduling and other instruction cost heuristics.
+
+def G5Model : SchedMachineModel {
+  let IssueWidth = 4;  // 4 (non-branch) instructions are dispatched per cycle.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  let CompleteModel = 0;
+
+  let Itineraries = G5Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
new file mode 100644
index 000000000000..a8678f56900e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
@@ -0,0 +1,397 @@
+//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// IBM POWER7 multicore server processor
+// B. Sinharoy, et al.
+// IBM J. Res. & Dev. (55) 3. May/June 2011.
+
+// Scheduling for the P7 involves tracking two types of resources:
+//  1. The dispatch bundle slots
+//  2. The functional unit resources
+
+// Dispatch units:
+def P7_DU1    : FuncUnit;
+def P7_DU2    : FuncUnit;
+def P7_DU3    : FuncUnit;
+def P7_DU4    : FuncUnit;
+def P7_DU5    : FuncUnit;
+def P7_DU6    : FuncUnit;
+
+def P7_LS1    : FuncUnit; // Load/Store pipeline 1
+def P7_LS2    : FuncUnit; // Load/Store pipeline 2
+
+def P7_FX1    : FuncUnit; // FX pipeline 1
+def P7_FX2    : FuncUnit; // FX pipeline 2
+
+// VS pipeline 1 (vector integer ops. always here)
+def P7_VS1    : FuncUnit; // VS pipeline 1
+// VS pipeline 2 (128-bit stores and perms. here)
+def P7_VS2    : FuncUnit; // VS pipeline 2
+
+def P7_CRU    : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P7_BRU    : FuncUnit; // BR unit
+
+// Notes:
+// Each LSU pipeline can also execute FX add and logical instructions.
+// Each LSU pipeline can complete a load or store in one cycle.
+//
+// Each store is broken into two parts, AGEN goes to the LSU while a
+// "data steering" op. goes to the FXU or VSU.
+//
+// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
+// VSU loads have a three cycle load-to-use latency (so two "bubble" cycle).
+//
+// Frequent FX ops. take only one cycle and results can be used again in the
+// next cycle (there is a self-bypass). Getting results from the other FX
+// pipeline takes an additional cycle.
+//
+// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
+// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
+// Dispatch of an instruction to VS1 that uses four single prec. inputs
+// (either to a float or XC op). prevents dispatch in that cycle to VS2 of any
+// floating point instruction.
+//
+// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
+// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
+// (unlike on the POWER6).
+//
+// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
+// share the same write-back, and have a 5-cycle latency difference, so the
+// IFU/IDU will not dispatch an XS instructon 5 cycles after a vector FP
+// op. has been dispatched to VS1.
+//
+// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
+//
+// Instruction dispatch groups have (at most) four non-branch instructions, and
+// two branches. Unlike on the POWER4/5, a branch does not automatically
+// end the dispatch group, but a second branch must be the last in the group.
+
+def P7Itineraries : ProcessorItineraries<
+  [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
+   P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2,
+                                                  P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntISEL,      [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [1, 1, 1, 1]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  // FIXME: Add record-form itinerary data.
+  InstrItinData<IIC_IntDivW     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<36, [P7_FX1, P7_FX2]>],
+                                  [36, 1, 1]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<68, [P7_FX1, P7_FX2]>],
+                                  [68, 1, 1]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
+                                   InstrStage<1, [P7_BRU]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLD      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLWARX,    [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLDARX,    [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLMW     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
+                                   InstrStage<1, [P7_VS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_LS1, P7_LS2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_DU2], 0>,
+                                   InstrStage<1, [P7_DU3], 0>,
+                                   InstrStage<1, [P7_DU4], 0>,
+                                   InstrStage<1, [P7_CRU]>,
+                                   InstrStage<1, [P7_FX1, P7_FX2]>],
+                                  [3, 1]>, // mtcr
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [6, 1]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_CRU]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_FX1]>],
+                                  [4, 1]>, // mtctr
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [8, 1, 1]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [33, 1, 1]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [27, 1, 1]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [44, 1, 1]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [32, 1, 1]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1, 1]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<1, [P7_DU1, P7_DU2,
+                                                  P7_DU3, P7_DU4], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1, P7_VS2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<1, [P7_DU1], 0>,
+                                   InstrStage<1, [P7_VS1]>],
+                                  [7, 1, 1]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
+                                   InstrStage<1, [P7_VS2]>],
+                                  [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P7 machine model for scheduling and other instruction cost heuristics.
+
+def P7Model : SchedMachineModel {
+  let IssueWidth = 6;  // 4 (non-branch) instructions are dispatched per cycle.
+                       // Note that the dispatch bundle size is 6 (including
+                       // branches), but the total internal issue bandwidth per
+                       // cycle (from all queues) is 8.
+
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  // Try to make sure we have at least 10 dispatch groups in a loop.
+  let LoopMicroOpBufferSize = 40;
+
+  let CompleteModel = 0;
+
+  let Itineraries = P7Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
new file mode 100644
index 000000000000..8e52da583a0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
@@ -0,0 +1,406 @@
+//===-- PPCScheduleP8.td - PPC P8 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER8 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Scheduling for the P8 involves tracking two types of resources:
+//  1. The dispatch bundle slots
+//  2. The functional unit resources
+
+// Dispatch units:
+def P8_DU1    : FuncUnit;
+def P8_DU2    : FuncUnit;
+def P8_DU3    : FuncUnit;
+def P8_DU4    : FuncUnit;
+def P8_DU5    : FuncUnit;
+def P8_DU6    : FuncUnit;
+def P8_DU7    : FuncUnit; // Only branch instructions will use DU7,DU8
+def P8_DU8    : FuncUnit;
+
+// 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8_LU1     : FuncUnit; // Loads or fixed-point operations 1
+def P8_LU2     : FuncUnit; // Loads or fixed-point operations 2
+
+// Load/Store pipelines can handle Stores, fixed-point loads, and simple
+// fixed-point operations.
+def P8_LSU1    : FuncUnit; // Load/Store pipeline 1
+def P8_LSU2    : FuncUnit; // Load/Store pipeline 2
+
+// Fixed Point unit
+def P8_FXU1    : FuncUnit; // FX pipeline 1
+def P8_FXU2    : FuncUnit; // FX pipeline 2
+
+// The Floating-Point Unit (FPU) and Vector Media Extension (VMX) units
+// are combined on P7 and newer into a Vector Scalar Unit (VSU).
+// The P8 Instruction latency documents still refers to the unit as the
+// FPU, so keep in mind that FPU==VSU.
+// In contrast to the P7, the VMX units on P8 are symmetric, so no need to
+// split vector integer ops or 128-bit load/store/perms to the specific units.
+def P8_FPU1    : FuncUnit; // VS pipeline 1
+def P8_FPU2    : FuncUnit; // VS pipeline 2
+
+def P8_CRU    : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P8_BRU    : FuncUnit; // BR unit
+
+def P8Itineraries : ProcessorItineraries<
+  [P8_DU1, P8_DU2, P8_DU3, P8_DU4, P8_DU5, P8_DU6, P8_DU7, P8_DU8,
+   P8_LU1, P8_LU2, P8_LSU1, P8_LSU2, P8_FXU1, P8_FXU2,
+   P8_FPU1, P8_FPU2, P8_CRU, P8_BRU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2,
+                                                  P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2, P8_LU1,
+                                                  P8_LU2, P8_LSU1, P8_LSU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntISEL,      [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+                                   InstrStage<1, [P8_BRU]>],
+                                  [1, 1, 1, 1]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<15, [P8_FXU1, P8_FXU2]>],
+                                  [15, 1, 1]>,
+  InstrItinData<IIC_IntDivD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<23, [P8_FXU1, P8_FXU2]>],
+                                  [23, 1, 1]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1, 1]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntRotateD  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                   [1, 1, 1]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [1, 1]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [P8_DU7, P8_DU8], 0>,
+                                   InstrStage<1, [P8_BRU]>],
+                                  [3, 1, 1]>,
+  // FIXME - the Br* groups below are not branch related, so should probably
+  // be renamed.
+  // IIC_BrCR consists of the cr* instructions.  (crand,crnor,creqv, etc).
+  // and should be 'First' in dispatch.
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1, 1]>,
+  // IIC_BrMCR consists of the mcrf instruction.
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1, 1]>,
+  // IIC_BrMCRX consists of mcrxr (obsolete instruction) and mtcrf, which
+  // should be first in the dispatch group.
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2 ], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 2, 1, 1]>,
+  // Update-Indexed form loads/stores are no longer first and last in the
+  // dispatch group.  They are simply cracked, so require DU1,DU2.
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLD      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_LdStLDU     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 2, 1, 1]>,
+  InstrItinData<IIC_LdStLDUX    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLVecX   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 3, 1, 1]>,
+  InstrItinData<IIC_LdStLHA     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLHAU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 4, 1, 1]>,
+  // first+last in dispatch group.
+  InstrItinData<IIC_LdStLHAUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 4, 1, 1]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLWARX,    [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  // first+last
+  InstrItinData<IIC_LdStLDARX,    [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [3, 1, 1]>,
+  InstrItinData<IIC_LdStLMW     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2,
+                                                  P8_LU1, P8_LU2]>],
+                                  [2, 1, 1]>,
+// Stores are dual-issued from the issue queue, so may only take up one
+// dispatch slot.  The instruction will be broken into two IOPS. The agen
+// op is issued to the LSU, and the data op (register fetch) is issued
+// to either the LU (GPR store) or the VSU (FPR store).
+  InstrItinData<IIC_LdStStore   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2]>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2]>]
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2,
+                                                  P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 1, 1, 1]>,
+  // First+last
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1, 1]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_DU2], 0>,
+                                   InstrStage<1, [P8_DU3], 0>,
+                                   InstrStage<1, [P8_DU4], 0>,
+                                   InstrStage<1, [P8_DU5], 0>,
+                                   InstrStage<1, [P8_DU6], 0>,
+                                   InstrStage<1, [P8_LSU1, P8_LSU2], 0>,
+                                   InstrStage<1, [P8_LU1, P8_LU2]>],
+                                  [1, 1, 1]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [6, 1]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_CRU]>],
+                                  [3, 1]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FXU1, P8_FXU2]>],
+                                  [4, 1]>, // mtctr
+  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [8, 1, 1]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [33, 1, 1]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [27, 1, 1]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [44, 1, 1]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [32, 1, 1]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1, 1]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<1, [P8_DU1, P8_DU2, P8_DU3,
+                                                  P8_DU4, P8_DU5, P8_DU6], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [5, 1, 1]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [2, 1, 1]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [6, 1, 1]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<1, [P8_DU1], 0>,
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
+                                  [7, 1, 1]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
+                                   InstrStage<1, [P8_FPU2, P8_FPU2]>],
+                                  [3, 1, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// P8 machine model for scheduling and other instruction cost heuristics.
+// P8 has an 8 insn dispatch group (6 non-branch, 2 branch) and can issue up
+// to 10 insns per cycle (2-LU, 2-LSU, 2-FXU, 2-FPU, 1-CRU, 1-BRU).
+
+def P8Model : SchedMachineModel {
+  let IssueWidth = 8;  // up to 8 instructions dispatched per cycle.
+                       // up to six non-branch instructions.
+                       // up to two branches in a dispatch group.
+
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  // Try to make sure we have at least 10 dispatch groups in a loop.
+  let LoopMicroOpBufferSize = 60;
+
+  let CompleteModel = 0;
+
+  let Itineraries = P8Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
new file mode 100644
index 000000000000..a9c1bd78b05e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -0,0 +1,335 @@
+//===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER9 processor.
+//
+//===----------------------------------------------------------------------===//
+include "PPCInstrInfo.td"
+
+def P9Model : SchedMachineModel {
+  let IssueWidth = 8;
+
+  let LoadLatency = 5;
+
+  let MispredictPenalty = 16;
+
+  // Try to make sure we have at least 10 dispatch groups in a loop.
+  let LoopMicroOpBufferSize = 60;
+
+  let CompleteModel = 0;
+
+}
+
+let SchedModel = P9Model in {
+
+  // ***************** Processor Resources *****************
+
+  //Dispatcher:
+  def DISPATCHER : ProcResource<12>;
+
+  // Issue Ports
+  def IP_AGEN : ProcResource<4>;
+  def IP_EXEC : ProcResource<4>;
+  def IP_EXECE : ProcResource<2> {
+    //Even Exec Ports
+    let Super = IP_EXEC;
+  }
+  def IP_EXECO : ProcResource<2> {
+    //Odd Exec Ports
+    let Super = IP_EXEC;
+  }
+
+  // Pipeline Groups
+  def ALU : ProcResource<4>;
+  def ALUE : ProcResource<2> {
+    //Even ALU pipelines
+    let Super = ALU;
+  }
+  def ALUO : ProcResource<2> {
+    //Odd ALU pipelines
+    let Super = ALU;
+  }
+  def DIV : ProcResource<2>;
+  def DP : ProcResource<4>;
+  def DPE : ProcResource<2> {
+    //Even DP pipelines
+    let Super = DP;
+  }
+  def DPO : ProcResource<2> {
+    //Odd DP pipelines
+    let Super = DP;
+  }
+  def LS : ProcResource<4>;
+  def PM : ProcResource<2>;
+  def DFU : ProcResource<1>;
+
+  def TestGroup : ProcResGroup<[ALU, DP]>;
+
+  // ***************** SchedWriteRes Definitions *****************
+
+  //Dispatcher
+  def DISP_1C : SchedWriteRes<[DISPATCHER]> {
+    let NumMicroOps = 0;
+    let Latency = 1;
+  }
+
+  // Issue Ports
+  def IP_AGEN_1C : SchedWriteRes<[IP_AGEN]> {
+    let NumMicroOps = 0;
+    let Latency = 1;
+  }
+
+  def IP_EXEC_1C : SchedWriteRes<[IP_EXEC]> {
+    let NumMicroOps = 0;
+    let Latency = 1;
+  }
+
+  def IP_EXECE_1C : SchedWriteRes<[IP_EXECE]> {
+    let NumMicroOps = 0;
+    let Latency = 1;
+  }
+
+  def IP_EXECO_1C : SchedWriteRes<[IP_EXECO]> {
+    let NumMicroOps = 0;
+    let Latency = 1;
+  }
+
+  //Pipeline Groups
+  def P9_ALU_2C : SchedWriteRes<[ALU]> {
+    let Latency = 2;
+  }
+
+  def P9_ALUE_2C : SchedWriteRes<[ALUE]> {
+    let Latency = 2;
+  }
+
+  def P9_ALUO_2C : SchedWriteRes<[ALUO]> {
+    let Latency = 2;
+  }
+
+  def P9_ALU_3C : SchedWriteRes<[ALU]> {
+    let Latency = 3;
+  }
+
+  def P9_ALUE_3C : SchedWriteRes<[ALUE]> {
+    let Latency = 3;
+  }
+
+  def P9_ALUO_3C : SchedWriteRes<[ALUO]> {
+    let Latency = 3;
+  }
+
+  def P9_ALU_4C : SchedWriteRes<[ALU]> {
+    let Latency = 4;
+  }
+
+  def P9_ALUE_4C : SchedWriteRes<[ALUE]> {
+    let Latency = 4;
+  }
+
+  def P9_ALUO_4C : SchedWriteRes<[ALUO]> {
+    let Latency = 4;
+  }
+
+  def P9_ALU_5C : SchedWriteRes<[ALU]> {
+    let Latency = 5;
+  }
+
+  def P9_ALU_6C : SchedWriteRes<[ALU]> {
+    let Latency = 6;
+  }
+
+  def P9_DIV_16C_8 : SchedWriteRes<[DIV]> {
+    let ResourceCycles = [8];
+    let Latency = 16;
+  }
+
+  def P9_DIV_24C_8 : SchedWriteRes<[DIV]> {
+    let ResourceCycles = [8];
+    let Latency = 24;
+  }
+
+  def P9_DIV_40C_8 : SchedWriteRes<[DIV]> {
+    let ResourceCycles = [8];
+    let Latency = 40;
+  }
+
+  def P9_DP_2C : SchedWriteRes<[DP]> {
+    let Latency = 2;
+  }
+
+  def P9_DP_5C : SchedWriteRes<[DP]> {
+    let Latency = 5;
+  }
+
+  def P9_DP_7C : SchedWriteRes<[DP]> {
+    let Latency = 7;
+  }
+
+  def P9_DPE_7C : SchedWriteRes<[DPE]> {
+    let Latency = 7;
+  }
+
+  def P9_DPO_7C : SchedWriteRes<[DPO]> {
+    let Latency = 7;
+  }
+
+  def P9_DP_22C_5 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [5];
+    let Latency = 22;
+  }
+
+  def P9_DP_24C_8 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [8];
+    let Latency = 24;
+  }
+
+  def P9_DP_26C_5 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [5];
+    let Latency = 22;
+  }
+
+  def P9_DP_27C_7 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [7];
+    let Latency = 27;
+  }
+
+  def P9_DP_33C_8 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [8];
+    let Latency = 33;
+  }
+
+  def P9_DP_36C_10 : SchedWriteRes<[DP]> {
+    let ResourceCycles = [10];
+    let Latency = 36;
+  }
+
+  def P9_PM_3C : SchedWriteRes<[PM]> {
+    let Latency = 3;
+  }
+
+  def P9_PM_7C : SchedWriteRes<[PM]> {
+    let Latency = 3;
+  }
+
+  def P9_LS_1C : SchedWriteRes<[LS]> {
+    let Latency = 1;
+  }
+
+  def P9_LS_4C : SchedWriteRes<[LS]> {
+    let Latency = 4;
+  }
+
+  def P9_LS_5C : SchedWriteRes<[LS]> {
+    let Latency = 5;
+  }
+
+  def P9_DFU_12C : SchedWriteRes<[DFU]> {
+    let Latency = 12;
+  }
+
+  def P9_DFU_24C : SchedWriteRes<[DFU]> {
+    let Latency = 24;
+    let ResourceCycles = [12];
+  }
+
+  def P9_DFU_58C : SchedWriteRes<[DFU]> {
+    let Latency = 58;
+    let ResourceCycles = [44];
+  }
+
+  def P9_DFU_76C : SchedWriteRes<[TestGroup, DFU]> {
+    let Latency = 76;
+    let ResourceCycles = [62];
+  }
+  // ***************** WriteSeq Definitions *****************
+
+  def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
+  def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
+  def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
+  def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
+  def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
+  def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
+  def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
+
+  // ***************** Defining Itinerary Class Resources *****************
+
+  def : ItinRW<[P9_DFU_76C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple,
+                                         IIC_IntGeneral]>;
+
+  def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
+
+  def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntCompare]>;
+
+  def : ItinRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_IntMulHW, IIC_IntMulHWU, IIC_IntMulLI]>;
+
+  def : ItinRW<[P9_LS_5C, IP_EXEC_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLoad, IIC_LdStLD]>;
+
+  def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLoadUpd, IIC_LdStLDU]>;
+
+  def : ItinRW<[P9_LS_4C, P9_ALU_2C, IP_EXECE_1C, IP_EXECO_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLoadUpdX, IIC_LdStLDUX]>;
+
+  def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStSTFDU]>;
+
+  def : ItinRW<[P9_LoadAndALUOp_6C,
+                IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLHA, IIC_LdStLWA]>;
+
+  def : ItinRW<[P9_LoadAndALUOp_6C, P9_ALU_2C,
+                IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLHAU, IIC_LdStLHAUX]>;
+
+  // IIC_LdStLMW contains two microcoded insns. This is not accurate, but
+  // those insns are not used that much, if at all.
+  def : ItinRW<[P9_LS_4C, IP_EXEC_1C, DISP_1C, DISP_1C],
+               [IIC_LdStLWARX, IIC_LdStLDARX, IIC_LdStLMW]>;
+
+  def : ItinRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStSTFD, IIC_LdStSTD, IIC_LdStStore]>;
+
+  def : ItinRW<[P9_LS_1C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStSTDU, IIC_LdStSTDUX]>;
+
+  def : ItinRW<[P9_StoreAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C,
+                DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_LdStSTDCX, IIC_LdStSTWCX]>;
+
+  def : ItinRW<[P9_ALU_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
+               [IIC_BrCR, IIC_IntMTFSB0]>;
+
+  def : ItinRW<[P9_ALUOpAndALUOp_4C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C,
+                IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C,
+                DISP_1C, DISP_1C, DISP_1C], [IIC_SprMFCR, IIC_SprMFCRF]>;
+
+  // This class should be broken down to instruction level, once some missing
+  // info is obtained.
+  def : ItinRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C,
+                DISP_1C, DISP_1C, DISP_1C], [IIC_SprMTSPR]>;
+
+  def : ItinRW<[P9_DP_7C, IP_EXEC_1C,
+                DISP_1C, DISP_1C, DISP_1C], [IIC_FPGeneral, IIC_FPAddSub]>;
+
+  def : ItinRW<[P9_DP_36C_10, IP_EXEC_1C], [IIC_FPSqrtD]>;
+  def : ItinRW<[P9_DP_26C_5, P9_DP_26C_5, IP_EXEC_1C, IP_EXEC_1C], [IIC_FPSqrtS]>;
+
+  include "P9InstrResources.td"
+
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 000000000000..e8a87e7f4437
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,252 @@
+//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "PPCGenSubtargetInfo.inc"
+
+static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness",
+cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden);
+
+static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned",
+  cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"),
+  cl::Hidden);
+
+PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  initSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU,
+                           const std::string &FS, const PPCTargetMachine &TM)
+    : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+      IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
+              TargetTriple.getArch() == Triple::ppc64le),
+      TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)),
+      InstrInfo(*this), TLInfo(TM, *this) {}
+
+void PPCSubtarget::initializeEnvironment() {
+  StackAlignment = 16;
+  DarwinDirective = PPC::DIR_NONE;
+  HasMFOCRF = false;
+  Has64BitSupport = false;
+  Use64BitRegs = false;
+  UseCRBits = false;
+  HasHardFloat = false;
+  HasAltivec = false;
+  HasSPE = false;
+  HasQPX = false;
+  HasVSX = false;
+  HasP8Vector = false;
+  HasP8Altivec = false;
+  HasP8Crypto = false;
+  HasP9Vector = false;
+  HasP9Altivec = false;
+  HasFCPSGN = false;
+  HasFSQRT = false;
+  HasFRE = false;
+  HasFRES = false;
+  HasFRSQRTE = false;
+  HasFRSQRTES = false;
+  HasRecipPrec = false;
+  HasSTFIWX = false;
+  HasLFIWAX = false;
+  HasFPRND = false;
+  HasFPCVT = false;
+  HasISEL = false;
+  HasBPERMD = false;
+  HasExtDiv = false;
+  HasCMPB = false;
+  HasLDBRX = false;
+  IsBookE = false;
+  HasOnlyMSYNC = false;
+  IsPPC4xx = false;
+  IsPPC6xx = false;
+  IsE500 = false;
+  FeatureMFTB = false;
+  DeprecatedDST = false;
+  HasLazyResolverStubs = false;
+  HasICBT = false;
+  HasInvariantFunctionDescriptors = false;
+  HasPartwordAtomics = false;
+  HasDirectMove = false;
+  IsQPXStackUnaligned = false;
+  HasHTM = false;
+  HasFusion = false;
+  HasFloat128 = false;
+  IsISA3_0 = false;
+  UseLongCalls = false;
+
+  HasPOPCNTD = POPCNTD_Unavailable;
+}
+
+void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty() || CPU == "generic") {
+    // If cross-compiling with -march=ppc64le without -mcpu
+    if (TargetTriple.getArch() == Triple::ppc64le)
+      CPUName = "ppc64le";
+    else
+      CPUName = "generic";
+  }
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // If the user requested use of 64-bit regs, but the cpu selected doesn't
+  // support it, ignore.
+  if (IsPPC64 && has64BitSupport())
+    Use64BitRegs = true;
+
+  // Set up darwin-specific properties.
+  if (isDarwin())
+    HasLazyResolverStubs = true;
+
+  // QPX requires a 32-byte aligned stack. Note that we need to do this if
+  // we're compiling for a BG/Q system regardless of whether or not QPX
+  // is enabled because external functions will assume this alignment.
+  IsQPXStackUnaligned = QPXStackUnaligned;
+  StackAlignment = getPlatformStackAlignment();
+
+  // Determine endianness.
+  // FIXME: Part of the TargetMachine.
+  IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
+}
+
+/// Return true if accesses to the specified global have to go through a dyld
+/// lazy resolution stub.  This means that an extra load is required to get the
+/// address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
+  if (!HasLazyResolverStubs)
+    return false;
+  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+    return true;
+  // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
+  // the section that is being relocated. This means we have to use o load even
+  // for GVs that are known to be local to the dso.
+  if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
+    return true;
+  return false;
+}
+
+// Embedded cores need aggressive scheduling (and some others also benefit).
+static bool needsAggressiveScheduling(unsigned Directive) {
+  switch (Directive) {
+  default: return false;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+  case PPC::DIR_PWR7:
+  case PPC::DIR_PWR8:
+  // FIXME: Same as P8 until POWER9 scheduling info is available
+  case PPC::DIR_PWR9:
+    return true;
+  }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+  // Enable MI scheduling for the embedded cores.
+  // FIXME: Enable this for all cores (some additional modeling
+  // may be necessary).
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
+// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+bool PPCSubtarget::enablePostRAScheduler() const { return true; }
+
+PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const {
+  return TargetSubtargetInfo::ANTIDEP_ALL;
+}
+
+void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(isPPC64() ?
+                            &PPC::G8RCRegClass : &PPC::GPRCRegClass);
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                       unsigned NumRegionInstrs) const {
+  if (needsAggressiveScheduling(DarwinDirective)) {
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+
+  // Spilling is generally expensive on all PPC cores, so always enable
+  // register-pressure tracking.
+  Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+  // Use AA during code generation for the embedded cores.
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
+bool PPCSubtarget::enableSubRegLiveness() const {
+  return UseSubRegLiveness;
+}
+
+unsigned char PPCSubtarget::classifyGlobalReference(
+    const GlobalValue *GV) const {
+  // Note that currently we don't generate non-pic references.
+  // If a caller wants that, this will have to be updated.
+
+  // Large code model always uses the TOC even for local symbols.
+  if (TM.getCodeModel() == CodeModel::Large)
+    return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
+
+  unsigned char flags = PPCII::MO_PIC_FLAG;
+
+  // Only if the relocation mode is PIC do we have to worry about
+  // interposition. In all other cases we can use a slightly looser standard to
+  // decide how to access the symbol.
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // If it's local, or it's non-default, it can't be interposed.
+    if (!GV->hasLocalLinkage() &&
+        GV->hasDefaultVisibility()) {
+      flags |= PPCII::MO_NLP_FLAG;
+    }
+    return flags;
+  }
+
+  if (GV->isStrongDefinitionForLinker())
+    return flags;
+  return flags | PPCII::MO_NLP_FLAG;
+}
+
+bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
+bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 000000000000..7fd907990ceb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,322 @@
+//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
+#define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
+
+#include "PPCFrameLowering.h"
+#include "PPCISelLowering.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "PPCGenSubtargetInfo.inc"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+class StringRef;
+
+namespace PPC {
+  // -m directive values.
+  enum {
+    DIR_NONE,
+    DIR_32,
+    DIR_440,
+    DIR_601,
+    DIR_602,
+    DIR_603,
+    DIR_7400,
+    DIR_750,
+    DIR_970,
+    DIR_A2,
+    DIR_E500mc,
+    DIR_E5500,
+    DIR_PWR3,
+    DIR_PWR4,
+    DIR_PWR5,
+    DIR_PWR5X,
+    DIR_PWR6,
+    DIR_PWR6X,
+    DIR_PWR7,
+    DIR_PWR8,
+    DIR_PWR9,
+    DIR_64
+  };
+}
+
+class GlobalValue;
+class TargetMachine;
+
+class PPCSubtarget : public PPCGenSubtargetInfo {
+public:
+  enum POPCNTDKind {
+    POPCNTD_Unavailable,
+    POPCNTD_Slow,
+    POPCNTD_Fast
+  };
+
+protected:
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned StackAlignment;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Which cpu directive was used.
+  unsigned DarwinDirective;
+
+  /// Used by the ISel to turn in optimizations for POWER4-derived architectures
+  bool HasMFOCRF;
+  bool Has64BitSupport;
+  bool Use64BitRegs;
+  bool UseCRBits;
+  bool HasHardFloat;
+  bool IsPPC64;
+  bool HasAltivec;
+  bool HasSPE;
+  bool HasQPX;
+  bool HasVSX;
+  bool HasP8Vector;
+  bool HasP8Altivec;
+  bool HasP8Crypto;
+  bool HasP9Vector;
+  bool HasP9Altivec;
+  bool HasFCPSGN;
+  bool HasFSQRT;
+  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+  bool HasRecipPrec;
+  bool HasSTFIWX;
+  bool HasLFIWAX;
+  bool HasFPRND;
+  bool HasFPCVT;
+  bool HasISEL;
+  bool HasBPERMD;
+  bool HasExtDiv;
+  bool HasCMPB;
+  bool HasLDBRX;
+  bool IsBookE;
+  bool HasOnlyMSYNC;
+  bool IsE500;
+  bool IsPPC4xx;
+  bool IsPPC6xx;
+  bool FeatureMFTB;
+  bool DeprecatedDST;
+  bool HasLazyResolverStubs;
+  bool IsLittleEndian;
+  bool HasICBT;
+  bool HasInvariantFunctionDescriptors;
+  bool HasPartwordAtomics;
+  bool HasDirectMove;
+  bool HasHTM;
+  bool HasFusion;
+  bool HasFloat128;
+  bool IsISA3_0;
+  bool UseLongCalls;
+
+  POPCNTDKind HasPOPCNTD;
+
+  /// When targeting QPX running a stock PPC64 Linux kernel where the stack
+  /// alignment has not been changed, we need to keep the 16-byte alignment
+  /// of the stack.
+  bool IsQPXStackUnaligned;
+
+  const PPCTargetMachine &TM;
+  PPCFrameLowering FrameLowering;
+  PPCInstrInfo InstrInfo;
+  PPCTargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
+               const PPCTargetMachine &TM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return StackAlignment; }
+
+  /// getDarwinDirective - Returns the -m directive specified for the cpu.
+  ///
+  unsigned getDarwinDirective() const { return DarwinDirective; }
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
+  /// selection.
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  const PPCFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const PPCTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const PPCRegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  const PPCTargetMachine &getTargetMachine() const { return TM; }
+
+  /// initializeSubtargetDependencies - Initializes using a CPU and feature string
+  /// so that we can use initializer lists for subtarget initialization.
+  PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+private:
+  void initializeEnvironment();
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+public:
+  /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+  ///
+  bool isPPC64() const;
+
+  /// has64BitSupport - Return true if the selected CPU supports 64-bit
+  /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+  bool has64BitSupport() const { return Has64BitSupport; }
+  // useSoftFloat - Return true if soft-float option is turned on.
+  bool useSoftFloat() const { return !HasHardFloat; }
+
+  /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+  /// registers in 32-bit mode when possible.  This can only true if
+  /// has64BitSupport() returns true.
+  bool use64BitRegs() const { return Use64BitRegs; }
+
+  /// useCRBits - Return true if we should store and manipulate i1 values in
+  /// the individual condition register bits.
+  bool useCRBits() const { return UseCRBits; }
+
+  /// hasLazyResolverStub - Return true if accesses to the specified global have
+  /// to go through a dyld lazy resolution stub.  This means that an extra load
+  /// is required to get the address of the global.
+  bool hasLazyResolverStub(const GlobalValue *GV) const;
+
+  // isLittleEndian - True if generating little-endian code
+  bool isLittleEndian() const { return IsLittleEndian; }
+
+  // Specific obvious features.
+  bool hasFCPSGN() const { return HasFCPSGN; }
+  bool hasFSQRT() const { return HasFSQRT; }
+  bool hasFRE() const { return HasFRE; }
+  bool hasFRES() const { return HasFRES; }
+  bool hasFRSQRTE() const { return HasFRSQRTE; }
+  bool hasFRSQRTES() const { return HasFRSQRTES; }
+  bool hasRecipPrec() const { return HasRecipPrec; }
+  bool hasSTFIWX() const { return HasSTFIWX; }
+  bool hasLFIWAX() const { return HasLFIWAX; }
+  bool hasFPRND() const { return HasFPRND; }
+  bool hasFPCVT() const { return HasFPCVT; }
+  bool hasAltivec() const { return HasAltivec; }
+  bool hasSPE() const { return HasSPE; }
+  bool hasQPX() const { return HasQPX; }
+  bool hasVSX() const { return HasVSX; }
+  bool hasP8Vector() const { return HasP8Vector; }
+  bool hasP8Altivec() const { return HasP8Altivec; }
+  bool hasP8Crypto() const { return HasP8Crypto; }
+  bool hasP9Vector() const { return HasP9Vector; }
+  bool hasP9Altivec() const { return HasP9Altivec; }
+  bool hasMFOCRF() const { return HasMFOCRF; }
+  bool hasISEL() const { return HasISEL; }
+  bool hasBPERMD() const { return HasBPERMD; }
+  bool hasExtDiv() const { return HasExtDiv; }
+  bool hasCMPB() const { return HasCMPB; }
+  bool hasLDBRX() const { return HasLDBRX; }
+  bool isBookE() const { return IsBookE; }
+  bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
+  bool isPPC4xx() const { return IsPPC4xx; }
+  bool isPPC6xx() const { return IsPPC6xx; }
+  bool isE500() const { return IsE500; }
+  bool isFeatureMFTB() const { return FeatureMFTB; }
+  bool isDeprecatedDST() const { return DeprecatedDST; }
+  bool hasICBT() const { return HasICBT; }
+  bool hasInvariantFunctionDescriptors() const {
+    return HasInvariantFunctionDescriptors;
+  }
+  bool hasPartwordAtomics() const { return HasPartwordAtomics; }
+  bool hasDirectMove() const { return HasDirectMove; }
+
+  bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; }
+  unsigned getPlatformStackAlignment() const {
+    if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned())
+      return 32;
+
+    return 16;
+  }
+  bool hasHTM() const { return HasHTM; }
+  bool hasFusion() const { return HasFusion; }
+  bool hasFloat128() const { return HasFloat128; }
+  bool isISA3_0() const { return IsISA3_0; }
+  bool useLongCalls() const { return UseLongCalls; }
+  bool needsSwapsForVSXMemOps() const {
+    return hasVSX() && isLittleEndian() && !hasP9Vector();
+  }
+
+  POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return TargetTriple.isMacOSX(); }
+  /// isBGQ - True if this is a BG/Q platform.
+  bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+
+  bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwinABI(); }
+  bool isELFv2ABI() const;
+
+  bool enableEarlyIfConversion() const override { return hasISEL(); }
+
+  // Scheduling customization.
+  bool enableMachineScheduler() const override;
+  // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostRAScheduler() const override;
+  AntiDepBreakMode getAntiDepBreakMode() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           unsigned NumRegionInstrs) const override;
+  bool useAA() const override;
+
+  bool enableSubRegLiveness() const override;
+
+  /// classifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget accourding to how we should reference it.
+  unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
new file mode 100644
index 000000000000..0c1260a2965b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -0,0 +1,174 @@
+//===---------- PPCTLSDynamicCall.cpp - TLS Dynamic Call Fixup ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands ADDItls{ld,gd}LADDR[32] machine instructions into
+// separate ADDItls[gd]L[32] and GETtlsADDR[32] instructions, both of
+// which define GPR3.  A copy is added from GPR3 to the target virtual
+// register of the original instruction.  The GETtlsADDR[32] is really
+// a call instruction, so its target register is constrained to be GPR3.
+// This is not true of ADDItls[gd]L[32], but there is a legacy linker
+// optimization bug that requires the target register of the addi of
+// a local- or general-dynamic TLS access sequence to be GPR3.
+//
+// This is done in a late pass so that TLS variable accesses can be
+// fully commoned by MachineCSE.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-tls-dynamic-call"
+
+namespace llvm {
+  void initializePPCTLSDynamicCallPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCTLSDynamicCall : public MachineFunctionPass {
+    static char ID;
+    PPCTLSDynamicCall() : MachineFunctionPass(ID) {
+      initializePPCTLSDynamicCallPass(*PassRegistry::getPassRegistry());
+    }
+
+    const PPCInstrInfo *TII;
+    LiveIntervals *LIS;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+      bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
+
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE;) {
+        MachineInstr &MI = *I;
+
+        if (MI.getOpcode() != PPC::ADDItlsgdLADDR &&
+            MI.getOpcode() != PPC::ADDItlsldLADDR &&
+            MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
+            MI.getOpcode() != PPC::ADDItlsldLADDR32) {
+          ++I;
+          continue;
+        }
+
+        DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n    " << MI);
+
+        unsigned OutReg = MI.getOperand(0).getReg();
+        unsigned InReg = MI.getOperand(1).getReg();
+        DebugLoc DL = MI.getDebugLoc();
+        unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
+        unsigned Opc1, Opc2;
+        const unsigned OrigRegs[] = {OutReg, InReg, GPR3};
+
+        switch (MI.getOpcode()) {
+        default:
+          llvm_unreachable("Opcode inconsistency error");
+        case PPC::ADDItlsgdLADDR:
+          Opc1 = PPC::ADDItlsgdL;
+          Opc2 = PPC::GETtlsADDR;
+          break;
+        case PPC::ADDItlsldLADDR:
+          Opc1 = PPC::ADDItlsldL;
+          Opc2 = PPC::GETtlsldADDR;
+          break;
+        case PPC::ADDItlsgdLADDR32:
+          Opc1 = PPC::ADDItlsgdL32;
+          Opc2 = PPC::GETtlsADDR32;
+          break;
+        case PPC::ADDItlsldLADDR32:
+          Opc1 = PPC::ADDItlsldL32;
+          Opc2 = PPC::GETtlsldADDR32;
+          break;
+        }
+
+        // Don't really need to save data to the stack - the clobbered
+        // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
+        // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
+        BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+
+        // Expand into two ops built prior to the existing instruction.
+        MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
+          .addReg(InReg);
+        Addi->addOperand(MI.getOperand(2));
+
+        // The ADDItls* instruction is the first instruction in the
+        // repair range.
+        MachineBasicBlock::iterator First = I;
+        --First;
+
+        MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3)
+                              .addReg(GPR3));
+        Call->addOperand(MI.getOperand(3));
+
+        BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
+
+        BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
+          .addReg(GPR3);
+
+        // The COPY is the last instruction in the repair range.
+        MachineBasicBlock::iterator Last = I;
+        --Last;
+
+        // Move past the original instruction and remove it.
+        ++I;
+        MI.removeFromParent();
+
+        // Repair the live intervals.
+        LIS->repairIntervalsInRange(&MBB, First, Last, OrigRegs);
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo();
+      LIS = &getAnalysis<LiveIntervals>();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCTLSDynamicCall, DEBUG_TYPE,
+                      "PowerPC TLS Dynamic Call Fixup", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(PPCTLSDynamicCall, DEBUG_TYPE,
+                    "PowerPC TLS Dynamic Call Fixup", false, false)
+
+char PPCTLSDynamicCall::ID = 0;
+FunctionPass*
+llvm::createPPCTLSDynamicCallPass() { return new PPCTLSDynamicCall(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
new file mode 100644
index 000000000000..7c53a5601790
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -0,0 +1,155 @@
+//===-- PPCTOCRegDeps.cpp - Add Extra TOC Register Dependencies -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When resolving an address using the ELF ABI TOC pointer, two relocations are
+// generally required: one for the high part and one for the low part. Only
+// the high part generally explicitly depends on r2 (the TOC pointer). And, so,
+// we might produce code like this:
+//
+// .Ltmp526:
+//         addis 3, 2, .LC12@toc@ha
+// .Ltmp1628:
+//         std 2, 40(1)
+//         ld 5, 0(27)
+//         ld 2, 8(27)
+//         ld 11, 16(27)
+//         ld 3, .LC12@toc@l(3)
+//         rldicl 4, 4, 0, 32
+//         mtctr 5
+//         bctrl
+//         ld 2, 40(1)
+//
+// And there is nothing wrong with this code, as such, but there is a linker bug
+// in binutils (https://sourceware.org/bugzilla/show_bug.cgi?id=18414) that will
+// misoptimize this code sequence to this:
+//         nop
+//         std     r2,40(r1)
+//         ld      r5,0(r27)
+//         ld      r2,8(r27)
+//         ld      r11,16(r27)
+//         ld      r3,-32472(r2)
+//         clrldi  r4,r4,32
+//         mtctr   r5
+//         bctrl
+//         ld      r2,40(r1)
+// because the linker does not know (and does not check) that the value in r2
+// changed in between the instruction using the .LC12@toc@ha (TOC-relative)
+// relocation and the instruction using the .LC12@toc@l(3) relocation.
+// Because it finds these instructions using the relocations (and not by
+// scanning the instructions), it has been asserted that there is no good way
+// to detect the change of r2 in between. As a result, this bug may never be
+// fixed (i.e. it may become part of the definition of the ABI). GCC was
+// updated to add extra dependencies on r2 to instructions using the @toc@l
+// relocations to avoid this problem, and we'll do the same here.
+//
+// This is done as a separate pass because:
+//  1. These extra r2 dependencies are not really properties of the
+//     instructions, but rather due to a linker bug, and maybe one day we'll be
+//     able to get rid of them when targeting linkers without this bug (and,
+//     thus, keeping the logic centralized here will make that
+//     straightforward).
+//  2. There are ISel-level peephole optimizations that propagate the @toc@l
+//     relocations to some user instructions, and so the exta dependencies do
+//     not apply only to a fixed set of instructions (without undesirable
+//     definition replication).
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-toc-reg-deps"
+
+namespace llvm {
+  void initializePPCTOCRegDepsPass(PassRegistry&);
+}
+
+namespace {
+  // PPCTOCRegDeps pass - For simple functions without epilogue code, move
+  // returns up, and create conditional returns, to avoid unnecessary
+  // branch-to-blr sequences.
+  struct PPCTOCRegDeps : public MachineFunctionPass {
+    static char ID;
+    PPCTOCRegDeps() : MachineFunctionPass(ID) {
+      initializePPCTOCRegDepsPass(*PassRegistry::getPassRegistry());
+    }
+
+protected:
+    bool hasTOCLoReloc(const MachineInstr &MI) {
+      if (MI.getOpcode() == PPC::LDtocL ||
+          MI.getOpcode() == PPC::ADDItocL)
+        return true;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if ((MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) == PPCII::MO_TOC_LO)
+          return true;
+      }
+
+      return false;
+    }
+
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      for (auto &MI : MBB) {
+        if (!hasTOCLoReloc(MI))
+          continue;
+
+        MI.addOperand(MachineOperand::CreateReg(PPC::X2,
+                                                false  /*IsDef*/,
+                                                true  /*IsImp*/));
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCTOCRegDeps, DEBUG_TYPE,
+                "PowerPC TOC Register Dependencies", false, false)
+
+char PPCTOCRegDeps::ID = 0;
+FunctionPass*
+llvm::createPPCTOCRegDepsPass() { return new PPCTOCRegDeps(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 000000000000..91b1d24b2e41
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,429 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+#include "PPCTargetObjectFile.h"
+#include "PPCTargetTransformInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+                        cl::desc("Disable CTR loops for PPC"));
+
+static cl::
+opt<bool> DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden,
+                            cl::desc("Disable PPC loop preinc prep"));
+
+static cl::opt<bool>
+VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
+  cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
+
+static cl::
+opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
+                                cl::desc("Disable VSX Swap Removal for PPC"));
+
+static cl::
+opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden,
+                              cl::desc("Disable QPX load splat simplification"));
+
+static cl::
+opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
+                            cl::desc("Disable machine peepholes for PPC"));
+
+static cl::opt<bool>
+EnableGEPOpt("ppc-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on complex GEPs"),
+             cl::init(true));
+
+static cl::opt<bool>
+EnablePrefetch("enable-ppc-prefetching",
+                  cl::desc("disable software prefetching on PPC"),
+                  cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps",
+                      cl::desc("Add extra TOC register dependencies"),
+                      cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableMachineCombinerPass("ppc-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
+extern "C" void LLVMInitializePowerPCTarget() {
+  // Register the targets
+  RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target());
+  RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target());
+  RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget());
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializePPCBoolRetToIntPass(PR);
+}
+
+/// Return the datalayout string of a subtarget.
+static std::string getDataLayoutString(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
+  std::string Ret;
+
+  // Most PPC* platforms are big endian, PPC64LE is little endian.
+  if (T.getArch() == Triple::ppc64le)
+    Ret = "e";
+  else
+    Ret = "E";
+
+  Ret += DataLayout::getManglingComponent(T);
+
+  // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
+  // pointers.
+  if (!is64Bit || T.getOS() == Triple::Lv2)
+    Ret += "-p:32:32";
+
+  // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+  // documentation are wrong; these are correct (i.e. "what gcc does").
+  if (is64Bit || !T.isOSDarwin())
+    Ret += "-i64:64";
+  else
+    Ret += "-f64:32:64";
+
+  // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
+  if (is64Bit)
+    Ret += "-n32:64";
+  else
+    Ret += "-n32";
+
+  return Ret;
+}
+
+static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
+                                      const Triple &TT) {
+  std::string FullFS = FS;
+
+  // Make sure 64-bit features are available when CPUname is generic
+  if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
+  }
+
+  if (OL >= CodeGenOpt::Default) {
+    if (!FullFS.empty())
+      FullFS = "+crbits," + FullFS;
+    else
+      FullFS = "+crbits";
+  }
+
+  if (OL != CodeGenOpt::None) {
+    if (!FullFS.empty())
+      FullFS = "+invariant-function-descriptors," + FullFS;
+    else
+      FullFS = "+invariant-function-descriptors";
+  }
+
+  return FullFS;
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a linux ELF
+  // object file.
+  if (TT.isOSDarwin())
+    return make_unique<TargetLoweringObjectFileMachO>();
+
+  return make_unique<PPC64LinuxTargetObjectFile>();
+}
+
+static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
+                                                 const TargetOptions &Options) {
+  if (Options.MCOptions.getABIName().startswith("elfv1"))
+    return PPCTargetMachine::PPC_ABI_ELFv1;
+  else if (Options.MCOptions.getABIName().startswith("elfv2"))
+    return PPCTargetMachine::PPC_ABI_ELFv2;
+
+  assert(Options.MCOptions.getABIName().empty() &&
+         "Unknown target-abi option!");
+
+  if (!TT.isMacOSX()) {
+    switch (TT.getArch()) {
+    case Triple::ppc64le:
+      return PPCTargetMachine::PPC_ABI_ELFv2;
+    case Triple::ppc64:
+      return PPCTargetMachine::PPC_ABI_ELFv1;
+    default:
+      // Fallthrough.
+      ;
+    }
+  }
+  return PPCTargetMachine::PPC_ABI_UNKNOWN;
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  if (!RM.hasValue()) {
+    if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
+      if (!TT.isOSBinFormatMachO() && !TT.isMacOSX())
+        return Reloc::PIC_;
+    }
+    if (TT.isOSDarwin())
+      return Reloc::DynamicNoPIC;
+    return Reloc::Static;
+  }
+  return *RM;
+}
+
+// The FeatureString here is a little subtle. We are modifying the feature
+// string with what are (currently) non-function specific overrides as it goes
+// into the LLVMTargetMachine constructor and then using the stored value in the
+// Subtarget constructor below it.
+PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, getDataLayoutString(TT), TT, CPU,
+                        computeFSAdditions(FS, OL, TT), Options,
+                        getEffectiveRelocModel(TT, RM), CM, OL),
+      TLOF(createTLOF(getTargetTriple())),
+      TargetABI(computeTargetABI(TT, Options)),
+      Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
+
+  initAsmInfo();
+}
+
+PPCTargetMachine::~PPCTargetMachine() {}
+
+void PPC32TargetMachine::anchor() { }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+void PPC64TargetMachine::anchor() { }
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const PPCSubtarget *
+PPCTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  bool SoftFloat =
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+  // If the soft float attribute is set on the function turn on the soft float
+  // subtarget feature.
+  if (SoftFloat)
+    FS += FS.empty() ? "-hard-float" : ",-hard-float";
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<PPCSubtarget>(
+        TargetTriple, CPU,
+        // FIXME: It would be good to have the subtarget additions here
+        // not necessary. Anything that turns them on/off (overrides) ends
+        // up being put at the end of the feature string, but the defaults
+        // shouldn't require adding them. Fixing this means pulling Feature64Bit
+        // out of most of the target cpus in the .td file and making it set only
+        // as part of initialization via the TargetTriple.
+        computeFSAdditions(FS, getOptLevel(), getTargetTriple()), *this);
+  }
+  return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PPC Code Generator Pass Configuration Options.
+class PPCPassConfig : public TargetPassConfig {
+public:
+  PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  PPCTargetMachine &getPPCTargetMachine() const {
+    return getTM<PPCTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addPreISel() override;
+  bool addILPOpts() override;
+  bool addInstSelector() override;
+  void addMachineSSAOptimization() override;
+  void addPreRegAlloc() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new PPCPassConfig(this, PM);
+}
+
+void PPCPassConfig::addIRPasses() {
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCBoolRetToIntPass());
+  addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+
+  // For the BG/Q (or if explicitly requested), add explicit data prefetch
+  // intrinsics.
+  bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
+                        getOptLevel() != CodeGenOpt::None;
+  if (EnablePrefetch.getNumOccurrences() > 0)
+    UsePrefetching = EnablePrefetch;
+  if (UsePrefetching)
+    addPass(createLoopDataPrefetchPass());
+
+  if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
+    // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+    // and lower a GEP with multiple indices to either arithmetic operations or
+    // multiple GEPs with single index.
+    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+    // Call EarlyCSE pass to find and remove subexpressions in the lowered
+    // result.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case part of the lowered result is
+    // invariant.
+    addPass(createLICMPass());
+  }
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool PPCPassConfig::addPreISel() {
+  if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
+
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoops(getPPCTargetMachine()));
+
+  return false;
+}
+
+bool PPCPassConfig::addILPOpts() {
+  addPass(&EarlyIfConverterID);
+
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+
+  return true;
+}
+
+bool PPCPassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createPPCISelDag(getPPCTargetMachine()));
+
+#ifndef NDEBUG
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoopsVerify());
+#endif
+
+  addPass(createPPCVSXCopyPass());
+  return false;
+}
+
+void PPCPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+  // For little endian, remove where possible the vector swap instructions
+  // introduced at code generation to normalize vector element order.
+  if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
+      !DisableVSXSwapRemoval)
+    addPass(createPPCVSXSwapRemovalPass());
+  // Target-specific peephole cleanups performed after instruction
+  // selection.
+  if (!DisableMIPeephole) {
+    addPass(createPPCMIPeepholePass());
+    addPass(&DeadMachineInstructionElimID);
+  }
+}
+
+void PPCPassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+               &PPCVSXFMAMutateID);
+  }
+
+  // FIXME: We probably don't need to run these for -fPIE.
+  if (getPPCTargetMachine().isPositionIndependent()) {
+    // FIXME: LiveVariables should not be necessary here!
+    // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on
+    // LiveVariables. This (unnecessary) dependency has been removed now,
+    // however a stage-2 clang build fails without LiveVariables computed here.
+    addPass(&LiveVariablesID, false);
+    addPass(createPPCTLSDynamicCallPass());
+  }
+  if (EnableExtraTOCRegDeps)
+    addPass(createPPCTOCRegDepsPass());
+}
+
+void PPCPassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(&IfConverterID);
+
+    // This optimization must happen after anything that might do store-to-load
+    // forwarding. Here we're after RA (and, thus, when spills are inserted)
+    // but before post-RA scheduling.
+    if (!DisableQPXLoadSplat)
+      addPass(createPPCQPXLoadSplatPass());
+  }
+}
+
+void PPCPassConfig::addPreEmitPass() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCEarlyReturnPass(), false);
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createPPCBranchSelectionPass(), false);
+}
+
+TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(PPCTTIImpl(this, F));
+  });
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 000000000000..59b4f1e30c0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,85 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
+
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+public:
+  enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
+private:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  PPCABI TargetABI;
+  PPCSubtarget Subtarget;
+
+  mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
+
+public:
+  PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options,
+                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  ~PPCTargetMachine() override;
+
+  const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+  bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; }
+  bool isPPC64() const {
+    const Triple &TT = getTargetTriple();
+    return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
+  };
+};
+
+/// PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
new file mode 100644
index 000000000000..a049dc3fda93
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -0,0 +1,59 @@
+//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetObjectFile.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+void
+PPC64LinuxTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI
+  // when we have a constant that contains global relocations.  This is
+  // necessary because of this ABI's handling of pointers to functions in
+  // a shared library.  The address of a function is actually the address
+  // of a function descriptor, which resides in the .opd section.  Generated
+  // code uses the descriptor directly rather than going via the GOT as some
+  // other ABIs do, which means that initialized function pointers must
+  // reference the descriptor.  The linker must convert copy relocs of
+  // pointers to functions in shared libraries into dynamic relocations,
+  // because of an ordering problem with initialization of copy relocs and
+  // PLT entries.  The dynamic relocation will be initialized by the dynamic
+  // linker, so we must use DataRelROSection instead of ReadOnlySection.
+  // For more information, see the description of ELIMINATE_COPY_RELOCS in
+  // GNU ld.
+  if (Kind.isReadOnly()) {
+    const auto *GVar = dyn_cast<GlobalVariable>(GO);
+
+    if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation())
+      Kind = SectionKind::getReadOnlyWithRel();
+  }
+
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
+}
+
+const MCExpr *PPC64LinuxTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  const MCExpr *Expr =
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPREL, getContext());
+  return MCBinaryExpr::createAdd(Expr,
+                                 MCConstantExpr::create(0x8000, getContext()),
+                                 getContext());
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
new file mode 100644
index 000000000000..c8b9b2e9790b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -0,0 +1,34 @@
+//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// PPC64LinuxTargetObjectFile - This implementation is used for
+  /// 64-bit PowerPC Linux.
+  class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+    /// \brief Describe a TLS variable address within debug info.
+    const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+}  // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 000000000000..dbe7617d3542
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+  PPCTargetStreamer(MCStreamer &S);
+  ~PPCTargetStreamer() override;
+  virtual void emitTCEntry(const MCSymbol &S) = 0;
+  virtual void emitMachine(StringRef CPU) = 0;
+  virtual void emitAbiVersion(int AbiVersion) = 0;
+  virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
new file mode 100644
index 000000000000..f7785342b364
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -0,0 +1,443 @@
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
+cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+
+// This is currently only used for the data prefetch pass which is only enabled
+// for BG/Q by default.
+static cl::opt<unsigned>
+CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
+              cl::desc("The loop prefetch cache line size"));
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
+    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
+             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
+  return TTI::PSK_Software;
+}
+
+int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  if (DisablePPCConstHoist)
+    return BaseT::getIntImmCost(Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  if (Imm == 0)
+    return TTI::TCC_Free;
+
+  if (Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TTI::TCC_Basic;
+
+    if (isInt<32>(Imm.getSExtValue())) {
+      // A constant that can be materialized using lis.
+      if ((Imm.getZExtValue() & 0xFFFF) == 0)
+        return TTI::TCC_Basic;
+
+      return 2 * TTI::TCC_Basic;
+    }
+  }
+
+  return 4 * TTI::TCC_Basic;
+}
+
+int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  if (DisablePPCConstHoist)
+    return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  switch (IID) {
+  default:
+    return TTI::TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  }
+  return PPCTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  if (DisablePPCConstHoist)
+    return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  unsigned ImmIdx = ~0U;
+  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
+       ZeroFree = false;
+  switch (Opcode) {
+  default:
+    return TTI::TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  case Instruction::And:
+    RunFree = true; // (for the rotate-and-mask instructions)
+    LLVM_FALLTHROUGH;
+  case Instruction::Add:
+  case Instruction::Or:
+  case Instruction::Xor:
+    ShiftedFree = true;
+    LLVM_FALLTHROUGH;
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    ImmIdx = 1;
+    break;
+  case Instruction::ICmp:
+    UnsignedFree = true;
+    ImmIdx = 1;
+    // Zero comparisons can use record-form instructions.
+    LLVM_FALLTHROUGH;
+  case Instruction::Select:
+    ZeroFree = true;
+    break;
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Ret:
+  case Instruction::Load:
+  case Instruction::Store:
+    break;
+  }
+
+  if (ZeroFree && Imm == 0)
+    return TTI::TCC_Free;
+
+  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TTI::TCC_Free;
+
+    if (RunFree) {
+      if (Imm.getBitWidth() <= 32 &&
+          (isShiftedMask_32(Imm.getZExtValue()) ||
+           isShiftedMask_32(~Imm.getZExtValue())))
+        return TTI::TCC_Free;
+
+      if (ST->isPPC64() &&
+          (isShiftedMask_64(Imm.getZExtValue()) ||
+           isShiftedMask_64(~Imm.getZExtValue())))
+        return TTI::TCC_Free;
+    }
+
+    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
+      return TTI::TCC_Free;
+
+    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
+      return TTI::TCC_Free;
+  }
+
+  return PPCTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+                                         TTI::UnrollingPreferences &UP) {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Partial = UP.Runtime = true;
+
+    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
+    // often outweigh the cost of a division to compute the trip count.
+    UP.AllowExpensiveTripCount = true;
+  }
+
+  BaseT::getUnrollingPreferences(L, UP);
+}
+
+bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+  // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
+  // on combining the loads generated for consecutive accesses, and failure to
+  // do so is particularly expensive. This makes it much more likely (compared
+  // to only using concatenation unrolling).
+  if (ST->getDarwinDirective() == PPC::DIR_A2)
+    return true;
+
+  return LoopHasReductions;
+}
+
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+  return true;
+}
+
+unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (Vector && !ST->hasAltivec() && !ST->hasQPX())
+    return 0;
+  return ST->hasVSX() ? 64 : 32;
+}
+
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector) {
+    if (ST->hasQPX()) return 256;
+    if (ST->hasAltivec()) return 128;
+    return 0;
+  }
+
+  if (ST->isPPC64())
+    return 64;
+  return 32;
+
+}
+
+unsigned PPCTTIImpl::getCacheLineSize() {
+  // This is currently only used for the data prefetch pass which is only
+  // enabled for BG/Q by default.
+  return CacheLineSize;
+}
+
+unsigned PPCTTIImpl::getPrefetchDistance() {
+  // This seems like a reasonable default for the BG/Q (this pass is enabled, by
+  // default, only on the BG/Q).
+  return 300;
+}
+
+unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  unsigned Directive = ST->getDarwinDirective();
+  // The 440 has no SIMD support, but floating-point instructions
+  // have a 5-cycle latency, so unroll by 5x for latency hiding.
+  if (Directive == PPC::DIR_440)
+    return 5;
+
+  // The A2 has no SIMD support, but floating-point instructions
+  // have a 6-cycle latency, so unroll by 6x for latency hiding.
+  if (Directive == PPC::DIR_A2)
+    return 6;
+
+  // FIXME: For lack of any better information, do no harm...
+  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
+    return 1;
+
+  // For P7 and P8, floating-point instructions have a 6-cycle latency and
+  // there are two execution units, so unroll by 12x for latency hiding.
+  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
+      Directive == PPC::DIR_PWR9)
+    return 12;
+
+  // For most things, modern systems have two execution units (and
+  // out-of-order execution).
+  return 2;
+}
+
+int PPCTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  // Fallback to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo);
+}
+
+int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                               Type *SubTp) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). We need one such shuffle instruction for each actual
+  // register (this is not true for arbitrary shuffles, but is true for the
+  // structured types of shuffles covered by TTI::ShuffleKind).
+  return LT.first;
+}
+
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+    // Double-precision scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
+    // Floating point scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+  }
+
+  // Estimated cost of a load-hit-store delay.  This was obtained
+  // experimentally as a minimum needed to prevent unprofitable
+  // vectorization for the paq8p benchmark.  It may need to be
+  // raised further if other unprofitable cases remain.
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
+
+  // Vector element insert/extract with Altivec is very expensive,
+  // because they require store and reload with the attendant
+  // processor stall for load-hit-store.  Until VSX is available,
+  // these need to be estimated as very costly.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
+      ISD == ISD::INSERT_VECTOR_ELT)
+    return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  bool IsAltivecType = ST->hasAltivec() &&
+                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+  bool IsVSXType = ST->hasVSX() &&
+                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+  bool IsQPXType = ST->hasQPX() &&
+                   (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
+
+  // VSX has 32b/64b load instructions. Legalization can handle loading of
+  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
+  // PPCTargetLowering can't compute the cost appropriately. So here we
+  // explicitly check this case.
+  unsigned MemBytes = Src->getPrimitiveSizeInBits();
+  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
+      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
+    return 1;
+
+  // Aligned loads and stores are easy.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+    return Cost;
+
+  // If we can use the permutation-based load sequence, then this is also
+  // relatively cheap (not counting loop-invariant instructions): one load plus
+  // one permute (the last load in a series has extra cost, but we're
+  // neglecting that here). Note that on the P7, we could do unaligned loads
+  // for Altivec types using the VSX instructions, but that's more expensive
+  // than using the permutation-based load sequence. On the P8, that's no
+  // longer true.
+  if (Opcode == Instruction::Load &&
+      ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
+      Alignment >= LT.second.getScalarType().getStoreSize())
+    return Cost + LT.first; // Add the cost of the permutations.
+
+  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
+  // P7, unaligned vector loads are more expensive than the permutation-based
+  // load sequence, so that might be used instead, but regardless, the net cost
+  // is about the same (not counting loop-invariant instructions).
+  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
+    return Cost;
+
+  // PPC in general does not support unaligned loads and stores. They'll need
+  // to be decomposed based on the alignment factor.
+
+  // Add the cost of each scalar load or store.
+  Cost += LT.first*(SrcBytes/Alignment-1);
+
+  // For a vector type, there is also scalarization overhead (only for
+  // stores, loads are expanded using the vector-load + permutation sequence,
+  // which is much less expensive).
+  if (Src->isVectorTy() && Opcode == Instruction::Store)
+    for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+
+  return Cost;
+}
+
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+
+  // Firstly, the cost of load/store operation.
+  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). For each result vector, we need one shuffle per incoming
+  // vector (except that the first shuffle can take two incoming vectors
+  // because it does not need to take itself).
+  Cost += Factor*(LT.first-1);
+
+  return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
new file mode 100644
index 000000000000..8308086ccfaa
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -0,0 +1,92 @@
+//===-- PPCTargetTransformInfo.h - PPC specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// PPC target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETTRANSFORMINFO_H
+
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+  typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+  const PPCSubtarget *getST() const { return ST; }
+  const PPCTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  using BaseT::getIntImmCost;
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool enableInterleavedAccessVectorization();
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getCacheLineSize();
+  unsigned getPrefetchDistance();
+  unsigned getMaxInterleaveFactor(unsigned VF);
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
new file mode 100644
index 000000000000..3b5d8f094fd0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -0,0 +1,177 @@
+//===-------------- PPCVSXCopy.cpp - VSX Copy Legalization ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass which deals with the complexity of generating legal VSX register
+// copies to/from register classes which partially overlap with the VSX
+// register file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-copy"
+
+namespace llvm {
+  void initializePPCVSXCopyPass(PassRegistry&);
+}
+
+namespace {
+  // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXCopy : public MachineFunctionPass {
+    static char ID;
+    PPCVSXCopy() : MachineFunctionPass(ID) {
+      initializePPCVSXCopyPass(*PassRegistry::getPassRegistry());
+    }
+
+    const TargetInstrInfo *TII;
+
+    bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC,
+                      MachineRegisterInfo &MRI) {
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return RC->hasSubClassEq(MRI.getRegClass(Reg));
+      } else if (RC->contains(Reg)) {
+        return true;
+      }
+
+      return false;
+    }
+
+    bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI);
+    }
+
+    bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI);
+    }
+
+    bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI);
+    }
+
+    bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI);
+    }
+
+    bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) {
+      return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI);
+    }
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineInstr &MI : MBB) {
+        if (!MI.isFullCopy())
+          continue;
+
+        MachineOperand &DstMO = MI.getOperand(0);
+        MachineOperand &SrcMO = MI.getOperand(1);
+
+        if ( IsVSReg(DstMO.getReg(), MRI) &&
+            !IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *to* a VSX register from a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
+          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
+                  IsVSSReg(SrcMO.getReg(), MRI) ||
+                  IsVSFReg(SrcMO.getReg(), MRI)) &&
+                 "Unknown source for a VSX copy");
+
+          unsigned NewVReg = MRI.createVirtualRegister(SrcRC);
+          BuildMI(MBB, MI, MI.getDebugLoc(),
+                  TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
+              .addImm(1) // add 1, not 0, because there is no implicit clearing
+                         // of the high bits.
+              .addOperand(SrcMO)
+              .addImm(PPC::sub_64);
+
+          // The source of the original copy is now the new virtual register.
+          SrcMO.setReg(NewVReg);
+        } else if (!IsVSReg(DstMO.getReg(), MRI) &&
+                    IsVSReg(SrcMO.getReg(), MRI)) {
+          // This is a copy *from* a VSX register to a non-VSX register.
+          Changed = true;
+
+          const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
+          assert((IsF8Reg(DstMO.getReg(), MRI) ||
+                  IsVSFReg(DstMO.getReg(), MRI) ||
+                  IsVSSReg(DstMO.getReg(), MRI)) &&
+                 "Unknown destination for a VSX copy");
+
+          // Copy the VSX value into a new VSX register of the correct subclass.
+          unsigned NewVReg = MRI.createVirtualRegister(DstRC);
+          BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+                  NewVReg)
+              .addOperand(SrcMO);
+
+          // Transform the original copy into a subregister extraction copy.
+          SrcMO.setReg(NewVReg);
+          SrcMO.setSubReg(PPC::sub_64);
+        }
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // If we don't have VSX on the subtarget, don't do anything.
+      const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+      if (!STI.hasVSX())
+        return false;
+      TII = STI.getInstrInfo();
+
+      bool Changed = false;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE,
+                "PowerPC VSX Copy Legalization", false, false)
+
+char PPCVSXCopy::ID = 0;
+FunctionPass*
+llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
new file mode 100644
index 000000000000..f6d20ced15a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -0,0 +1,398 @@
+//===--------------- PPCVSXFMAMutate.cpp - VSX FMA Mutation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mutates the form of VSX FMA instructions to avoid unnecessary
+// copies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Temporarily disable FMA mutation by default, since it doesn't handle
+// cross-basic-block intervals well.
+// See: http://lists.llvm.org/pipermail/llvm-dev/2016-February/095669.html
+//      http://reviews.llvm.org/D17087
+static cl::opt<bool> DisableVSXFMAMutate(
+    "disable-ppc-vsx-fma-mutation",
+    cl::desc("Disable VSX FMA instruction mutation"), cl::init(true),
+    cl::Hidden);
+
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace llvm { namespace PPC {
+  int getAltVSXFMAOpcode(uint16_t Opcode);
+} }
+
+namespace {
+  // PPCVSXFMAMutate pass - For copies between VSX registers and non-VSX registers
+  // (Altivec and scalar floating-point registers), we need to transform the
+  // copies into subregister copies with other restrictions.
+  struct PPCVSXFMAMutate : public MachineFunctionPass {
+    static char ID;
+    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    }
+
+    LiveIntervals *LIS;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr &MI = *I;
+
+        // The default (A-type) VSX FMA form kills the addend (it is taken from
+        // the target register, which is then updated to reflect the result of
+        // the FMA). If the instruction, however, kills one of the registers
+        // used for the product, then we can use the M-form instruction (which
+        // will take that value from the to-be-defined register).
+
+        int AltOpc = PPC::getAltVSXFMAOpcode(MI.getOpcode());
+        if (AltOpc == -1)
+          continue;
+
+        // This pass is run after register coalescing, and so we're looking for
+        // a situation like this:
+        //   ...
+        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        //   ...
+        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+        //                         %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+        //   ...
+        // Where we can eliminate the copy by changing from the A-type to the
+        // M-type instruction. Specifically, for this example, this means:
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //                         %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        // is replaced by:
+        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg18, %vreg9,
+        //                         %RM<imp-use>; VSLRC:%vreg16,%vreg18,%vreg9
+        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+        VNInfo *AddendValNo =
+            LIS->getInterval(MI.getOperand(1).getReg()).Query(FMAIdx).valueIn();
+
+        // This can be null if the register is undef.
+        if (!AddendValNo)
+          continue;
+
+        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+        // The addend and this instruction must be in the same block.
+
+        if (!AddendMI || AddendMI->getParent() != MI.getParent())
+          continue;
+
+        // The addend must be a full copy within the same register class.
+
+        if (!AddendMI->isFullCopy())
+          continue;
+
+        unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+          if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+              MRI.getRegClass(AddendSrcReg))
+            continue;
+        } else {
+          // If AddendSrcReg is a physical register, make sure the destination
+          // register class contains it.
+          if (!MRI.getRegClass(AddendMI->getOperand(0).getReg())
+                ->contains(AddendSrcReg))
+            continue;
+        }
+
+        // In theory, there could be other uses of the addend copy before this
+        // fma.  We could deal with this, but that would require additional
+        // logic below and I suspect it will not occur in any relevant
+        // situations.  Additionally, check whether the copy source is killed
+        // prior to the fma.  In order to replace the addend here with the
+        // source of the copy, it must still be live here.  We can't use
+        // interval testing for a physical register, so as long as we're
+        // walking the MIs we may as well test liveness here.
+        //
+        // FIXME: There is a case that occurs in practice, like this:
+        //   %vreg9<def> = COPY %F1; VSSRC:%vreg9
+        //   ...
+        //   %vreg6<def> = COPY %vreg9; VSSRC:%vreg6,%vreg9
+        //   %vreg7<def> = COPY %vreg9; VSSRC:%vreg7,%vreg9
+        //   %vreg9<def,tied1> = XSMADDASP %vreg9<tied0>, %vreg1, %vreg4; VSSRC:
+        //   %vreg6<def,tied1> = XSMADDASP %vreg6<tied0>, %vreg1, %vreg2; VSSRC:
+        //   %vreg7<def,tied1> = XSMADDASP %vreg7<tied0>, %vreg1, %vreg3; VSSRC:
+        // which prevents an otherwise-profitable transformation.
+        bool OtherUsers = false, KillsAddendSrc = false;
+        for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
+             J != JE; --J) {
+          if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
+            OtherUsers = true;
+            break;
+          }
+          if (J->modifiesRegister(AddendSrcReg, TRI) ||
+              J->killsRegister(AddendSrcReg, TRI)) {
+            KillsAddendSrc = true;
+            break;
+          }
+        }
+
+        if (OtherUsers || KillsAddendSrc)
+          continue;
+
+
+        // The transformation doesn't work well with things like:
+        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+        // unless vreg11 is also a kill, so skip when it is not,
+        // and check operand 3 to see it is also a kill to handle the case:
+        //   %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
+        // where vreg5 and vreg11 are both kills. This case would be skipped
+        // otherwise.
+        unsigned OldFMAReg = MI.getOperand(0).getReg();
+
+        // Find one of the product operands that is killed by this instruction.
+        unsigned KilledProdOp = 0, OtherProdOp = 0;
+        unsigned Reg2 = MI.getOperand(2).getReg();
+        unsigned Reg3 = MI.getOperand(3).getReg();
+        if (LIS->getInterval(Reg2).Query(FMAIdx).isKill()
+            && Reg2 != OldFMAReg) {
+          KilledProdOp = 2;
+          OtherProdOp  = 3;
+        } else if (LIS->getInterval(Reg3).Query(FMAIdx).isKill()
+            && Reg3 != OldFMAReg) {
+          KilledProdOp = 3;
+          OtherProdOp  = 2;
+        }
+
+        // If there are no usable killed product operands, then this
+        // transformation is likely not profitable.
+        if (!KilledProdOp)
+          continue;
+
+        // If the addend copy is used only by this MI, then the addend source
+        // register is likely not live here. This could be fixed (based on the
+        // legality checks above, the live range for the addend source register
+        // could be extended), but it seems likely that such a trivial copy can
+        // be coalesced away later, and thus is not worth the effort.
+        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) &&
+            !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx))
+          continue;
+
+        // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
+
+        unsigned KilledProdReg = MI.getOperand(KilledProdOp).getReg();
+        unsigned OtherProdReg = MI.getOperand(OtherProdOp).getReg();
+
+        unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
+        unsigned KilledProdSubReg = MI.getOperand(KilledProdOp).getSubReg();
+        unsigned OtherProdSubReg = MI.getOperand(OtherProdOp).getSubReg();
+
+        bool AddRegKill = AddendMI->getOperand(1).isKill();
+        bool KilledProdRegKill = MI.getOperand(KilledProdOp).isKill();
+        bool OtherProdRegKill = MI.getOperand(OtherProdOp).isKill();
+
+        bool AddRegUndef = AddendMI->getOperand(1).isUndef();
+        bool KilledProdRegUndef = MI.getOperand(KilledProdOp).isUndef();
+        bool OtherProdRegUndef = MI.getOperand(OtherProdOp).isUndef();
+
+        // If there isn't a class that fits, we can't perform the transform.
+        // This is needed for correctness with a mixture of VSX and Altivec
+        // instructions to make sure that a low VSX register is not assigned to
+        // the Altivec instruction.
+        if (!MRI.constrainRegClass(KilledProdReg,
+                                   MRI.getRegClass(OldFMAReg)))
+          continue;
+
+        assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
+               "Addend copy not tied to old FMA output!");
+
+        DEBUG(dbgs() << "VSX FMA Mutation:\n    " << MI);
+
+        MI.getOperand(0).setReg(KilledProdReg);
+        MI.getOperand(1).setReg(KilledProdReg);
+        MI.getOperand(3).setReg(AddendSrcReg);
+
+        MI.getOperand(0).setSubReg(KilledProdSubReg);
+        MI.getOperand(1).setSubReg(KilledProdSubReg);
+        MI.getOperand(3).setSubReg(AddSubReg);
+
+        MI.getOperand(1).setIsKill(KilledProdRegKill);
+        MI.getOperand(3).setIsKill(AddRegKill);
+
+        MI.getOperand(1).setIsUndef(KilledProdRegUndef);
+        MI.getOperand(3).setIsUndef(AddRegUndef);
+
+        MI.setDesc(TII->get(AltOpc));
+
+        // If the addend is also a multiplicand, replace it with the addend
+        // source in both places.
+        if (OtherProdReg == AddendMI->getOperand(0).getReg()) {
+          MI.getOperand(2).setReg(AddendSrcReg);
+          MI.getOperand(2).setSubReg(AddSubReg);
+          MI.getOperand(2).setIsKill(AddRegKill);
+          MI.getOperand(2).setIsUndef(AddRegUndef);
+        } else {
+          MI.getOperand(2).setReg(OtherProdReg);
+          MI.getOperand(2).setSubReg(OtherProdSubReg);
+          MI.getOperand(2).setIsKill(OtherProdRegKill);
+          MI.getOperand(2).setIsUndef(OtherProdRegUndef);
+        }
+
+        DEBUG(dbgs() << " -> " << MI);
+
+        // The killed product operand was killed here, so we can reuse it now
+        // for the result of the fma.
+
+        LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
+        VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
+        for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
+             UI != UE;) {
+          MachineOperand &UseMO = *UI;
+          MachineInstr *UseMI = UseMO.getParent();
+          ++UI;
+
+          // Don't replace the result register of the copy we're about to erase.
+          if (UseMI == AddendMI)
+            continue;
+
+          UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI);
+        }
+
+        // Extend the live intervals of the killed product operand to hold the
+        // fma result.
+
+        LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
+        for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
+             AI != AE; ++AI) {
+          // Don't add the segment that corresponds to the original copy.
+          if (AI->valno == AddendValNo)
+            continue;
+
+          VNInfo *NewFMAValNo =
+            NewFMAInt.getNextValue(AI->start,
+                                   LIS->getVNInfoAllocator());
+
+          NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
+                                                     NewFMAValNo));
+        }
+        DEBUG(dbgs() << "  extended: " << NewFMAInt << '\n');
+
+        // Extend the live interval of the addend source (it might end at the
+        // copy to be removed, or somewhere in between there and here). This
+        // is necessary only if it is a physical register.
+        if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg))
+          for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid();
+               ++Units) {
+            unsigned Unit = *Units;
+
+            LiveRange &AddendSrcRange = LIS->getRegUnit(Unit);
+            AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB),
+                                         FMAIdx.getRegSlot());
+            DEBUG(dbgs() << "  extended: " << AddendSrcRange << '\n');
+          }
+
+        FMAInt.removeValNo(FMAValNo);
+        DEBUG(dbgs() << "  trimmed:  " << FMAInt << '\n');
+
+        // Remove the (now unused) copy.
+
+        DEBUG(dbgs() << "  removing: " << *AddendMI << '\n');
+        LIS->RemoveMachineInstrFromMaps(*AddendMI);
+        AddendMI->eraseFromParent();
+
+        Changed = true;
+      }
+
+      return Changed;
+    }
+
+public:
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      if (skipFunction(*MF.getFunction()))
+        return false;
+
+      // If we don't have VSX then go ahead and return without doing
+      // anything.
+      const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+      if (!STI.hasVSX())
+        return false;
+
+      LIS = &getAnalysis<LiveIntervals>();
+
+      TII = STI.getInstrInfo();
+
+      bool Changed = false;
+
+      if (DisableVSXFMAMutate)
+        return Changed;
+
+      for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
+        MachineBasicBlock &B = *I++;
+        if (processBlock(B))
+          Changed = true;
+      }
+
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LiveIntervals>();
+      AU.addPreserved<LiveIntervals>();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
+                      "PowerPC VSX FMA Mutation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
+                    "PowerPC VSX FMA Mutation", false, false)
+
+char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
+
+char PPCVSXFMAMutate::ID = 0;
+FunctionPass *llvm::createPPCVSXFMAMutatePass() {
+  return new PPCVSXFMAMutate();
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
new file mode 100644
index 000000000000..8197285b7b1f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -0,0 +1,1035 @@
+//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass analyzes vector computations and removes unnecessary
+// doubleword swaps (xxswapd instructions).  This pass is performed
+// only for little-endian VSX code generation.
+//
+// For this specific case, loads and stores of v4i32, v4f32, v2i64,
+// and v2f64 vectors are inefficient.  These are implemented using
+// the lxvd2x and stxvd2x instructions, which invert the order of
+// doublewords in a vector register.  Thus code generation inserts
+// an xxswapd after each such load, and prior to each such store.
+//
+// The extra xxswapd instructions reduce performance.  The purpose
+// of this pass is to reduce the number of xxswapd instructions
+// required for correctness.
+//
+// The primary insight is that much code that operates on vectors
+// does not care about the relative order of elements in a register,
+// so long as the correct memory order is preserved.  If we have a
+// computation where all input values are provided by lxvd2x/xxswapd,
+// all outputs are stored using xxswapd/lxvd2x, and all intermediate
+// computations are lane-insensitive (independent of element order),
+// then all the xxswapd instructions associated with the loads and
+// stores may be removed without changing observable semantics.
+//
+// This pass uses standard equivalence class infrastructure to create
+// maximal webs of computations fitting the above description.  Each
+// such web is then optimized by removing its unnecessary xxswapd
+// instructions.
+//
+// There are some lane-sensitive operations for which we can still
+// permit the optimization, provided we modify those operations
+// accordingly.  Such operations are identified as using "special
+// handling" within this module.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-swaps"
+
+namespace llvm {
+  void initializePPCVSXSwapRemovalPass(PassRegistry&);
+}
+
+namespace {
+
+// A PPCVSXSwapEntry is created for each machine instruction that
+// is relevant to a vector computation.
+struct PPCVSXSwapEntry {
+  // Pointer to the instruction.
+  MachineInstr *VSEMI;
+
+  // Unique ID (position in the swap vector).
+  int VSEId;
+
+  // Attributes of this node.
+  unsigned int IsLoad : 1;
+  unsigned int IsStore : 1;
+  unsigned int IsSwap : 1;
+  unsigned int MentionsPhysVR : 1;
+  unsigned int IsSwappable : 1;
+  unsigned int MentionsPartialVR : 1;
+  unsigned int SpecialHandling : 3;
+  unsigned int WebRejected : 1;
+  unsigned int WillRemove : 1;
+};
+
+enum SHValues {
+  SH_NONE = 0,
+  SH_EXTRACT,
+  SH_INSERT,
+  SH_NOSWAP_LD,
+  SH_NOSWAP_ST,
+  SH_SPLAT,
+  SH_XXPERMDI,
+  SH_COPYWIDEN
+};
+
+struct PPCVSXSwapRemoval : public MachineFunctionPass {
+
+  static char ID;
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+
+  // Swap entries are allocated in a vector for better performance.
+  std::vector<PPCVSXSwapEntry> SwapVector;
+
+  // A mapping is maintained between machine instructions and
+  // their swap entries.  The key is the address of the MI.
+  DenseMap<MachineInstr*, int> SwapMap;
+
+  // Equivalence classes are used to gather webs of related computation.
+  // Swap entries are represented by their VSEId fields.
+  EquivalenceClasses<int> *EC;
+
+  PPCVSXSwapRemoval() : MachineFunctionPass(ID) {
+    initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize data structures.
+  void initialize(MachineFunction &MFParm);
+
+  // Walk the machine instructions to gather vector usage information.
+  // Return true iff vector mentions are present.
+  bool gatherVectorInstructions();
+
+  // Add an entry to the swap vector and swap map.
+  int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
+
+  // Hunt backwards through COPY and SUBREG_TO_REG chains for a
+  // source register.  VecIdx indicates the swap vector entry to
+  // mark as mentioning a physical register if the search leads
+  // to one.
+  unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
+
+  // Generate equivalence classes for related computations (webs).
+  void formWebs();
+
+  // Analyze webs and determine those that cannot be optimized.
+  void recordUnoptimizableWebs();
+
+  // Record which swap instructions can be safely removed.
+  void markSwapsForRemoval();
+
+  // Remove swaps and update other instructions requiring special
+  // handling.  Return true iff any changes are made.
+  bool removeSwaps();
+
+  // Insert a swap instruction from SrcReg to DstReg at the given
+  // InsertPoint.
+  void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
+                  unsigned DstReg, unsigned SrcReg);
+
+  // Update instructions requiring special handling.
+  void handleSpecialSwappables(int EntryIdx);
+
+  // Dump a description of the entries in the swap vector.
+  void dumpSwapVector();
+
+  // Return true iff the given register is in the given class.
+  bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return RC->hasSubClassEq(MRI->getRegClass(Reg));
+    return RC->contains(Reg);
+  }
+
+  // Return true iff the given register is a full vector register.
+  bool isVecReg(unsigned Reg) {
+    return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
+            isRegInClass(Reg, &PPC::VRRCRegClass));
+  }
+
+  // Return true iff the given register is a partial vector register.
+  bool isScalarVecReg(unsigned Reg) {
+    return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
+            isRegInClass(Reg, &PPC::VSSRCRegClass));
+  }
+
+  // Return true iff the given register mentions all or part of a
+  // vector register.  Also sets Partial to true if the mention
+  // is for just the floating-point register overlap of the register.
+  bool isAnyVecReg(unsigned Reg, bool &Partial) {
+    if (isScalarVecReg(Reg))
+      Partial = true;
+    return isScalarVecReg(Reg) || isVecReg(Reg);
+  }
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(*MF.getFunction()))
+      return false;
+
+    // If we don't have VSX on the subtarget, don't do anything.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.hasVSX())
+      return false;
+
+    bool Changed = false;
+    initialize(MF);
+
+    if (gatherVectorInstructions()) {
+      formWebs();
+      recordUnoptimizableWebs();
+      markSwapsForRemoval();
+      Changed = removeSwaps();
+    }
+
+    // FIXME: See the allocation of EC in initialize().
+    delete EC;
+    return Changed;
+  }
+};
+
+// Initialize data structures for this pass.  In particular, clear the
+// swap vector and allocate the equivalence class mapping before
+// processing each function.
+void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MF->getRegInfo();
+  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
+
+  // An initial vector size of 256 appears to work well in practice.
+  // Small/medium functions with vector content tend not to incur a
+  // reallocation at this size.  Three of the vector tests in
+  // projects/test-suite reallocate, which seems like a reasonable rate.
+  const int InitialVectorSize(256);
+  SwapVector.clear();
+  SwapVector.reserve(InitialVectorSize);
+
+  // FIXME: Currently we allocate EC each time because we don't have
+  // access to the set representation on which to call clear().  Should
+  // consider adding a clear() method to the EquivalenceClasses class.
+  EC = new EquivalenceClasses<int>;
+}
+
+// Create an entry in the swap vector for each instruction that mentions
+// a full vector register, recording various characteristics of the
+// instructions there.
+bool PPCVSXSwapRemoval::gatherVectorInstructions() {
+  bool RelevantFunction = false;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+
+      if (MI.isDebugValue())
+        continue;
+
+      bool RelevantInstr = false;
+      bool Partial = false;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (isAnyVecReg(Reg, Partial)) {
+          RelevantInstr = true;
+          break;
+        }
+      }
+
+      if (!RelevantInstr)
+        continue;
+
+      RelevantFunction = true;
+
+      // Create a SwapEntry initialized to zeros, then fill in the
+      // instruction and ID fields before pushing it to the back
+      // of the swap vector.
+      PPCVSXSwapEntry SwapEntry{};
+      int VecIdx = addSwapEntry(&MI, SwapEntry);
+
+      switch(MI.getOpcode()) {
+      default:
+        // Unless noted otherwise, an instruction is considered
+        // safe for the optimization.  There are a large number of
+        // such true-SIMD instructions (all vector math, logical,
+        // select, compare, etc.).  However, if the instruction
+        // mentions a partial vector register and does not have
+        // special handling defined, it is not swappable.
+        if (Partial)
+          SwapVector[VecIdx].MentionsPartialVR = 1;
+        else
+          SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::XXPERMDI: {
+        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
+        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
+        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
+        // for example.  We have to look through chains of COPY and
+        // SUBREG_TO_REG to find the real source value for comparison.
+        // If the real source value is a physical register, then mark the
+        // XXPERMDI as mentioning a physical register.
+        int immed = MI.getOperand(3).getImm();
+        if (immed == 2) {
+          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
+                                               VecIdx);
+          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
+                                               VecIdx);
+          if (trueReg1 == trueReg2)
+            SwapVector[VecIdx].IsSwap = 1;
+          else {
+            // We can still handle these if the two registers are not
+            // identical, by adjusting the form of the XXPERMDI.
+            SwapVector[VecIdx].IsSwappable = 1;
+            SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+          }
+        // This is a doubleword splat if it is of the form
+        // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3.  As above we
+        // must look through chains of copy-likes to find the source
+        // register.  We turn off the marking for mention of a physical
+        // register, because splatting it is safe; the optimization
+        // will not swap the value in the physical register.  Whether
+        // or not the two input registers are identical, we can handle
+        // these by adjusting the form of the XXPERMDI.
+        } else if (immed == 0 || immed == 3) {
+
+          SwapVector[VecIdx].IsSwappable = 1;
+          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+
+          unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
+                                               VecIdx);
+          unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
+                                               VecIdx);
+          if (trueReg1 == trueReg2)
+            SwapVector[VecIdx].MentionsPhysVR = 0;
+
+        } else {
+          // We can still handle these by adjusting the form of the XXPERMDI.
+          SwapVector[VecIdx].IsSwappable = 1;
+          SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
+        }
+        break;
+      }
+      case PPC::LVX:
+        // Non-permuting loads are currently unsafe.  We can use special
+        // handling for this in the future.  By not marking these as
+        // IsSwap, we ensure computations containing them will be rejected
+        // for now.
+        SwapVector[VecIdx].IsLoad = 1;
+        break;
+      case PPC::LXVD2X:
+      case PPC::LXVW4X:
+        // Permuting loads are marked as both load and swap, and are
+        // safe for optimization.
+        SwapVector[VecIdx].IsLoad = 1;
+        SwapVector[VecIdx].IsSwap = 1;
+        break;
+      case PPC::LXSDX:
+      case PPC::LXSSPX:
+        // A load of a floating-point value into the high-order half of
+        // a vector register is safe, provided that we introduce a swap
+        // following the load, which will be done by the SUBREG_TO_REG
+        // support.  So just mark these as safe.
+        SwapVector[VecIdx].IsLoad = 1;
+        SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::STVX:
+        // Non-permuting stores are currently unsafe.  We can use special
+        // handling for this in the future.  By not marking these as
+        // IsSwap, we ensure computations containing them will be rejected
+        // for now.
+        SwapVector[VecIdx].IsStore = 1;
+        break;
+      case PPC::STXVD2X:
+      case PPC::STXVW4X:
+        // Permuting stores are marked as both store and swap, and are
+        // safe for optimization.
+        SwapVector[VecIdx].IsStore = 1;
+        SwapVector[VecIdx].IsSwap = 1;
+        break;
+      case PPC::COPY:
+        // These are fine provided they are moving between full vector
+        // register classes.
+        if (isVecReg(MI.getOperand(0).getReg()) &&
+            isVecReg(MI.getOperand(1).getReg()))
+          SwapVector[VecIdx].IsSwappable = 1;
+        // If we have a copy from one scalar floating-point register
+        // to another, we can accept this even if it is a physical
+        // register.  The only way this gets involved is if it feeds
+        // a SUBREG_TO_REG, which is handled by introducing a swap.
+        else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
+                 isScalarVecReg(MI.getOperand(1).getReg()))
+          SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::SUBREG_TO_REG: {
+        // These are fine provided they are moving between full vector
+        // register classes.  If they are moving from a scalar
+        // floating-point class to a vector class, we can handle those
+        // as well, provided we introduce a swap.  It is generally the
+        // case that we will introduce fewer swaps than we remove, but
+        // (FIXME) a cost model could be used.  However, introduced
+        // swaps could potentially be CSEd, so this is not trivial.
+        if (isVecReg(MI.getOperand(0).getReg()) &&
+            isVecReg(MI.getOperand(2).getReg()))
+          SwapVector[VecIdx].IsSwappable = 1;
+        else if (isVecReg(MI.getOperand(0).getReg()) &&
+                 isScalarVecReg(MI.getOperand(2).getReg())) {
+          SwapVector[VecIdx].IsSwappable = 1;
+          SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
+        }
+        break;
+      }
+      case PPC::VSPLTB:
+      case PPC::VSPLTH:
+      case PPC::VSPLTW:
+      case PPC::XXSPLTW:
+        // Splats are lane-sensitive, but we can use special handling
+        // to adjust the source lane for the splat.
+        SwapVector[VecIdx].IsSwappable = 1;
+        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
+        break;
+      // The presence of the following lane-sensitive operations in a
+      // web will kill the optimization, at least for now.  For these
+      // we do nothing, causing the optimization to fail.
+      // FIXME: Some of these could be permitted with special handling,
+      // and will be phased in as time permits.
+      // FIXME: There is no simple and maintainable way to express a set
+      // of opcodes having a common attribute in TableGen.  Should this
+      // change, this is a prime candidate to use such a mechanism.
+      case PPC::INLINEASM:
+      case PPC::EXTRACT_SUBREG:
+      case PPC::INSERT_SUBREG:
+      case PPC::COPY_TO_REGCLASS:
+      case PPC::LVEBX:
+      case PPC::LVEHX:
+      case PPC::LVEWX:
+      case PPC::LVSL:
+      case PPC::LVSR:
+      case PPC::LVXL:
+      case PPC::STVEBX:
+      case PPC::STVEHX:
+      case PPC::STVEWX:
+      case PPC::STVXL:
+        // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
+        // by adding special handling for narrowing copies as well as
+        // widening ones.  However, I've experimented with this, and in
+        // practice we currently do not appear to use STXSDX fed by 
+        // a narrowing copy from a full vector register.  Since I can't
+        // generate any useful test cases, I've left this alone for now.
+      case PPC::STXSDX:
+      case PPC::STXSSPX:
+      case PPC::VCIPHER:
+      case PPC::VCIPHERLAST:
+      case PPC::VMRGHB:
+      case PPC::VMRGHH:
+      case PPC::VMRGHW:
+      case PPC::VMRGLB:
+      case PPC::VMRGLH:
+      case PPC::VMRGLW:
+      case PPC::VMULESB:
+      case PPC::VMULESH:
+      case PPC::VMULESW:
+      case PPC::VMULEUB:
+      case PPC::VMULEUH:
+      case PPC::VMULEUW:
+      case PPC::VMULOSB:
+      case PPC::VMULOSH:
+      case PPC::VMULOSW:
+      case PPC::VMULOUB:
+      case PPC::VMULOUH:
+      case PPC::VMULOUW:
+      case PPC::VNCIPHER:
+      case PPC::VNCIPHERLAST:
+      case PPC::VPERM:
+      case PPC::VPERMXOR:
+      case PPC::VPKPX:
+      case PPC::VPKSHSS:
+      case PPC::VPKSHUS:
+      case PPC::VPKSDSS:
+      case PPC::VPKSDUS:
+      case PPC::VPKSWSS:
+      case PPC::VPKSWUS:
+      case PPC::VPKUDUM:
+      case PPC::VPKUDUS:
+      case PPC::VPKUHUM:
+      case PPC::VPKUHUS:
+      case PPC::VPKUWUM:
+      case PPC::VPKUWUS:
+      case PPC::VPMSUMB:
+      case PPC::VPMSUMD:
+      case PPC::VPMSUMH:
+      case PPC::VPMSUMW:
+      case PPC::VRLB:
+      case PPC::VRLD:
+      case PPC::VRLH:
+      case PPC::VRLW:
+      case PPC::VSBOX:
+      case PPC::VSHASIGMAD:
+      case PPC::VSHASIGMAW:
+      case PPC::VSL:
+      case PPC::VSLDOI:
+      case PPC::VSLO:
+      case PPC::VSR:
+      case PPC::VSRO:
+      case PPC::VSUM2SWS:
+      case PPC::VSUM4SBS:
+      case PPC::VSUM4SHS:
+      case PPC::VSUM4UBS:
+      case PPC::VSUMSWS:
+      case PPC::VUPKHPX:
+      case PPC::VUPKHSB:
+      case PPC::VUPKHSH:
+      case PPC::VUPKHSW:
+      case PPC::VUPKLPX:
+      case PPC::VUPKLSB:
+      case PPC::VUPKLSH:
+      case PPC::VUPKLSW:
+      case PPC::XXMRGHW:
+      case PPC::XXMRGLW:
+      // XXSLDWI could be replaced by a general permute with one of three
+      // permute control vectors (for shift values 1, 2, 3).  However,
+      // VPERM has a more restrictive register class.
+      case PPC::XXSLDWI:
+        break;
+      }
+    }
+  }
+
+  if (RelevantFunction) {
+    DEBUG(dbgs() << "Swap vector when first built\n\n");
+    dumpSwapVector();
+  }
+
+  return RelevantFunction;
+}
+
+// Add an entry to the swap vector and swap map, and make a
+// singleton equivalence class for the entry.
+int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
+                                  PPCVSXSwapEntry& SwapEntry) {
+  SwapEntry.VSEMI = MI;
+  SwapEntry.VSEId = SwapVector.size();
+  SwapVector.push_back(SwapEntry);
+  EC->insert(SwapEntry.VSEId);
+  SwapMap[MI] = SwapEntry.VSEId;
+  return SwapEntry.VSEId;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg).  Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards through all
+// such operations to the ultimate source register.  If a
+// physical register is encountered, we stop the search and
+// flag the swap entry indicated by VecIdx (the original
+// XXPERMDI) as mentioning a physical register.
+unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
+                                             unsigned VecIdx) {
+  MachineInstr *MI = MRI->getVRegDef(SrcReg);
+  if (!MI->isCopyLike())
+    return SrcReg;
+
+  unsigned CopySrcReg;
+  if (MI->isCopy())
+    CopySrcReg = MI->getOperand(1).getReg();
+  else {
+    assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
+    CopySrcReg = MI->getOperand(2).getReg();
+  }
+
+  if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
+    if (!isScalarVecReg(CopySrcReg))
+      SwapVector[VecIdx].MentionsPhysVR = 1;
+    return CopySrcReg;
+  }
+
+  return lookThruCopyLike(CopySrcReg, VecIdx);
+}
+
+// Generate equivalence classes for related computations (webs) by
+// def-use relationships of virtual registers.  Mention of a physical
+// register terminates the generation of equivalence classes as this
+// indicates a use of a parameter, definition of a return value, use
+// of a value returned from a call, or definition of a parameter to a
+// call.  Computations with physical register mentions are flagged
+// as such so their containing webs will not be optimized.
+void PPCVSXSwapRemoval::formWebs() {
+
+  DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+    DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
+    DEBUG(MI->dump());
+
+    // It's sufficient to walk vector uses and join them to their unique
+    // definitions.  In addition, check full vector register operands
+    // for physical regs.  We exclude partial-vector register operands
+    // because we can handle them if copied to a full vector.
+    for (const MachineOperand &MO : MI->operands()) {
+      if (!MO.isReg())
+        continue;
+
+      unsigned Reg = MO.getReg();
+      if (!isVecReg(Reg) && !isScalarVecReg(Reg))
+        continue;
+
+      if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+        if (!(MI->isCopy() && isScalarVecReg(Reg)))
+          SwapVector[EntryIdx].MentionsPhysVR = 1;
+        continue;
+      }
+
+      if (!MO.isUse())
+        continue;
+
+      MachineInstr* DefMI = MRI->getVRegDef(Reg);
+      assert(SwapMap.find(DefMI) != SwapMap.end() &&
+             "Inconsistency: def of vector reg not found in swap map!");
+      int DefIdx = SwapMap[DefMI];
+      (void)EC->unionSets(SwapVector[DefIdx].VSEId,
+                          SwapVector[EntryIdx].VSEId);
+
+      DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId,
+                             SwapVector[EntryIdx].VSEId));
+      DEBUG(dbgs() << "  Def: ");
+      DEBUG(DefMI->dump());
+    }
+  }
+}
+
+// Walk the swap vector entries looking for conditions that prevent their
+// containing computations from being optimized.  When such conditions are
+// found, mark the representative of the computation's equivalence class
+// as rejected.
+void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
+
+  DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+    // If representative is already rejected, don't waste further time.
+    if (SwapVector[Repr].WebRejected)
+      continue;
+
+    // Reject webs containing mentions of physical or partial registers, or
+    // containing operations that we don't know how to handle in a lane-
+    // permuted region.
+    if (SwapVector[EntryIdx].MentionsPhysVR ||
+        SwapVector[EntryIdx].MentionsPartialVR ||
+        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
+
+      SwapVector[Repr].WebRejected = 1;
+
+      DEBUG(dbgs() <<
+            format("Web %d rejected for physreg, partial reg, or not "
+                   "swap[pable]\n", Repr));
+      DEBUG(dbgs() << "  in " << EntryIdx << ": ");
+      DEBUG(SwapVector[EntryIdx].VSEMI->dump());
+      DEBUG(dbgs() << "\n");
+    }
+
+    // Reject webs than contain swapping loads that feed something other
+    // than a swap instruction.
+    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      unsigned DefReg = MI->getOperand(0).getReg();
+
+      // We skip debug instructions in the analysis.  (Note that debug
+      // location information is still maintained by this optimization
+      // because it remains on the LXVD2X and STXVD2X instructions after
+      // the XXPERMDIs are removed.)
+      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+        int UseIdx = SwapMap[&UseMI];
+
+        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
+            SwapVector[UseIdx].IsStore) {
+
+          SwapVector[Repr].WebRejected = 1;
+
+          DEBUG(dbgs() <<
+                format("Web %d rejected for load not feeding swap\n", Repr));
+          DEBUG(dbgs() << "  def " << EntryIdx << ": ");
+          DEBUG(MI->dump());
+          DEBUG(dbgs() << "  use " << UseIdx << ": ");
+          DEBUG(UseMI.dump());
+          DEBUG(dbgs() << "\n");
+        }
+      }
+
+    // Reject webs that contain swapping stores that are fed by something
+    // other than a swap instruction.
+    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      unsigned UseReg = MI->getOperand(0).getReg();
+      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+      unsigned DefReg = DefMI->getOperand(0).getReg();
+      int DefIdx = SwapMap[DefMI];
+
+      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
+          SwapVector[DefIdx].IsStore) {
+
+        SwapVector[Repr].WebRejected = 1;
+
+        DEBUG(dbgs() <<
+              format("Web %d rejected for store not fed by swap\n", Repr));
+        DEBUG(dbgs() << "  def " << DefIdx << ": ");
+        DEBUG(DefMI->dump());
+        DEBUG(dbgs() << "  use " << EntryIdx << ": ");
+        DEBUG(MI->dump());
+        DEBUG(dbgs() << "\n");
+      }
+
+      // Ensure all uses of the register defined by DefMI feed store
+      // instructions
+      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+        int UseIdx = SwapMap[&UseMI];
+
+        if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
+          SwapVector[Repr].WebRejected = 1;
+
+          DEBUG(dbgs() <<
+                format("Web %d rejected for swap not feeding only stores\n",
+                       Repr));
+          DEBUG(dbgs() << "  def " << " : ");
+          DEBUG(DefMI->dump());
+          DEBUG(dbgs() << "  use " << UseIdx << ": ");
+          DEBUG(SwapVector[UseIdx].VSEMI->dump());
+          DEBUG(dbgs() << "\n");
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
+  dumpSwapVector();
+}
+
+// Walk the swap vector entries looking for swaps fed by permuting loads
+// and swaps that feed permuting stores.  If the containing computation
+// has not been marked rejected, mark each such swap for removal.
+// (Removal is delayed in case optimization has disturbed the pattern,
+// such that multiple loads feed the same swap, etc.)
+void PPCVSXSwapRemoval::markSwapsForRemoval() {
+
+  DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected) {
+        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+        unsigned DefReg = MI->getOperand(0).getReg();
+
+        for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+          int UseIdx = SwapMap[&UseMI];
+          SwapVector[UseIdx].WillRemove = 1;
+
+          DEBUG(dbgs() << "Marking swap fed by load for removal: ");
+          DEBUG(UseMI.dump());
+        }
+      }
+
+    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected) {
+        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+        unsigned UseReg = MI->getOperand(0).getReg();
+        MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+        int DefIdx = SwapMap[DefMI];
+        SwapVector[DefIdx].WillRemove = 1;
+
+        DEBUG(dbgs() << "Marking swap feeding store for removal: ");
+        DEBUG(DefMI->dump());
+      }
+
+    } else if (SwapVector[EntryIdx].IsSwappable &&
+               SwapVector[EntryIdx].SpecialHandling != 0) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected)
+        handleSpecialSwappables(EntryIdx);
+    }
+  }
+}
+
+// Create an xxswapd instruction and insert it prior to the given point.
+// MI is used to determine basic block and debug loc information.
+// FIXME: When inserting a swap, we should check whether SrcReg is
+// defined by another swap:  SrcReg = XXPERMDI Reg, Reg, 2;  If so,
+// then instead we should generate a copy from Reg to DstReg.
+void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
+                                   MachineBasicBlock::iterator InsertPoint,
+                                   unsigned DstReg, unsigned SrcReg) {
+  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+          TII->get(PPC::XXPERMDI), DstReg)
+    .addReg(SrcReg)
+    .addReg(SrcReg)
+    .addImm(2);
+}
+
+// The identified swap entry requires special handling to allow its
+// containing computation to be optimized.  Perform that handling
+// here.
+// FIXME: Additional opportunities will be phased in with subsequent
+// patches.
+void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
+  switch (SwapVector[EntryIdx].SpecialHandling) {
+
+  default:
+    llvm_unreachable("Unexpected special handling type");
+
+  // For splats based on an index into a vector, add N/2 modulo N
+  // to the index, where N is the number of vector elements.
+  case SHValues::SH_SPLAT: {
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+    unsigned NElts;
+
+    DEBUG(dbgs() << "Changing splat: ");
+    DEBUG(MI->dump());
+
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected splat opcode");
+    case PPC::VSPLTB: NElts = 16; break;
+    case PPC::VSPLTH: NElts = 8;  break;
+    case PPC::VSPLTW:
+    case PPC::XXSPLTW: NElts = 4;  break;
+    }
+
+    unsigned EltNo;
+    if (MI->getOpcode() == PPC::XXSPLTW)
+      EltNo = MI->getOperand(2).getImm();
+    else
+      EltNo = MI->getOperand(1).getImm();
+
+    EltNo = (EltNo + NElts / 2) % NElts;
+    if (MI->getOpcode() == PPC::XXSPLTW)
+      MI->getOperand(2).setImm(EltNo);
+    else
+      MI->getOperand(1).setImm(EltNo);
+
+    DEBUG(dbgs() << "  Into: ");
+    DEBUG(MI->dump());
+    break;
+  }
+
+  // For an XXPERMDI that isn't handled otherwise, we need to
+  // reverse the order of the operands.  If the selector operand
+  // has a value of 0 or 3, we need to change it to 3 or 0,
+  // respectively.  Otherwise we should leave it alone.  (This
+  // is equivalent to reversing the two bits of the selector
+  // operand and complementing the result.)
+  case SHValues::SH_XXPERMDI: {
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+    DEBUG(dbgs() << "Changing XXPERMDI: ");
+    DEBUG(MI->dump());
+
+    unsigned Selector = MI->getOperand(3).getImm();
+    if (Selector == 0 || Selector == 3)
+      Selector = 3 - Selector;
+    MI->getOperand(3).setImm(Selector);
+
+    unsigned Reg1 = MI->getOperand(1).getReg();
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    MI->getOperand(1).setReg(Reg2);
+    MI->getOperand(2).setReg(Reg1);
+
+    DEBUG(dbgs() << "  Into: ");
+    DEBUG(MI->dump());
+    break;
+  }
+
+  // For a copy from a scalar floating-point register to a vector
+  // register, removing swaps will leave the copied value in the
+  // wrong lane.  Insert a swap following the copy to fix this.
+  case SHValues::SH_COPYWIDEN: {
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+    DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
+    DEBUG(MI->dump());
+
+    unsigned DstReg = MI->getOperand(0).getReg();
+    const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+    unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+
+    MI->getOperand(0).setReg(NewVReg);
+    DEBUG(dbgs() << "  Into: ");
+    DEBUG(MI->dump());
+
+    auto InsertPoint = ++MachineBasicBlock::iterator(MI);
+
+    // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
+    // is copying to a VRRC, we need to be careful to avoid a register
+    // assignment problem.  In this case we must copy from VRRC to VSRC
+    // prior to the swap, and from VSRC to VRRC following the swap.
+    // Coalescing will usually remove all this mess.
+    if (DstRC == &PPC::VRRCRegClass) {
+      unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+      unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
+
+      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+              TII->get(PPC::COPY), VSRCTmp1)
+        .addReg(NewVReg);
+      DEBUG(std::prev(InsertPoint)->dump());
+
+      insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
+      DEBUG(std::prev(InsertPoint)->dump());
+
+      BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
+              TII->get(PPC::COPY), DstReg)
+        .addReg(VSRCTmp2);
+      DEBUG(std::prev(InsertPoint)->dump());
+
+    } else {
+      insertSwap(MI, InsertPoint, DstReg, NewVReg);
+      DEBUG(std::prev(InsertPoint)->dump());
+    }
+    break;
+  }
+  }
+}
+
+// Walk the swap vector and replace each entry marked for removal with
+// a copy operation.
+bool PPCVSXSwapRemoval::removeSwaps() {
+
+  DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
+
+  bool Changed = false;
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+    if (SwapVector[EntryIdx].WillRemove) {
+      Changed = true;
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      MachineBasicBlock *MBB = MI->getParent();
+      BuildMI(*MBB, MI, MI->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+        .addOperand(MI->getOperand(1));
+
+      DEBUG(dbgs() << format("Replaced %d with copy: ",
+                             SwapVector[EntryIdx].VSEId));
+      DEBUG(MI->dump());
+
+      MI->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+// For debug purposes, dump the contents of the swap vector.
+void PPCVSXSwapRemoval::dumpSwapVector() {
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+    int ID = SwapVector[EntryIdx].VSEId;
+
+    DEBUG(dbgs() << format("%6d", ID));
+    DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID)));
+    DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber()));
+    DEBUG(dbgs() << format("  %14s  ",
+                           TII->getName(MI->getOpcode()).str().c_str()));
+
+    if (SwapVector[EntryIdx].IsLoad)
+      DEBUG(dbgs() << "load ");
+    if (SwapVector[EntryIdx].IsStore)
+      DEBUG(dbgs() << "store ");
+    if (SwapVector[EntryIdx].IsSwap)
+      DEBUG(dbgs() << "swap ");
+    if (SwapVector[EntryIdx].MentionsPhysVR)
+      DEBUG(dbgs() << "physreg ");
+    if (SwapVector[EntryIdx].MentionsPartialVR)
+      DEBUG(dbgs() << "partialreg ");
+
+    if (SwapVector[EntryIdx].IsSwappable) {
+      DEBUG(dbgs() << "swappable ");
+      switch(SwapVector[EntryIdx].SpecialHandling) {
+      default:
+        DEBUG(dbgs() << "special:**unknown**");
+        break;
+      case SH_NONE:
+        break;
+      case SH_EXTRACT:
+        DEBUG(dbgs() << "special:extract ");
+        break;
+      case SH_INSERT:
+        DEBUG(dbgs() << "special:insert ");
+        break;
+      case SH_NOSWAP_LD:
+        DEBUG(dbgs() << "special:load ");
+        break;
+      case SH_NOSWAP_ST:
+        DEBUG(dbgs() << "special:store ");
+        break;
+      case SH_SPLAT:
+        DEBUG(dbgs() << "special:splat ");
+        break;
+      case SH_XXPERMDI:
+        DEBUG(dbgs() << "special:xxpermdi ");
+        break;
+      case SH_COPYWIDEN:
+        DEBUG(dbgs() << "special:copywiden ");
+        break;
+      }
+    }
+
+    if (SwapVector[EntryIdx].WebRejected)
+      DEBUG(dbgs() << "rejected ");
+    if (SwapVector[EntryIdx].WillRemove)
+      DEBUG(dbgs() << "remove ");
+
+    DEBUG(dbgs() << "\n");
+
+    // For no-asserts builds.
+    (void)MI;
+    (void)ID;
+  }
+
+  DEBUG(dbgs() << "\n");
+}
+
+} // end default namespace
+
+INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
+                      "PowerPC VSX Swap Removal", false, false)
+INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
+                    "PowerPC VSX Swap Removal", false, false)
+
+char PPCVSXSwapRemoval::ID = 0;
+FunctionPass*
+llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/README_P9.txt b/contrib/llvm/lib/Target/PowerPC/README_P9.txt
new file mode 100644
index 000000000000..d56f7cca7b21
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/README_P9.txt
@@ -0,0 +1,605 @@
+//===- README_P9.txt - Notes for improving Power9 code gen ----------------===//
+
+TODO: Instructions Need Implement Instrinstics or Map to LLVM IR
+
+Altivec:
+- Vector Compare Not Equal (Zero):
+  vcmpneb(.) vcmpneh(.) vcmpnew(.)
+  vcmpnezb(.) vcmpnezh(.) vcmpnezw(.)
+  . Same as other VCMP*, use VCMP/VCMPo form (support intrinsic)
+
+- Vector Extract Unsigned: vextractub vextractuh vextractuw vextractd
+  . Don't use llvm extractelement because they have different semantics
+  . Use instrinstics:
+    (set v2i64:$vD, (int_ppc_altivec_vextractub v16i8:$vA, imm:$UIMM))
+    (set v2i64:$vD, (int_ppc_altivec_vextractuh v8i16:$vA, imm:$UIMM))
+    (set v2i64:$vD, (int_ppc_altivec_vextractuw v4i32:$vA, imm:$UIMM))
+    (set v2i64:$vD, (int_ppc_altivec_vextractd  v2i64:$vA, imm:$UIMM))
+
+- Vector Extract Unsigned Byte Left/Right-Indexed:
+  vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx
+  . Use instrinstics:
+    // Left-Indexed
+    (set i64:$rD, (int_ppc_altivec_vextublx i64:$rA, v16i8:$vB))
+    (set i64:$rD, (int_ppc_altivec_vextuhlx i64:$rA, v8i16:$vB))
+    (set i64:$rD, (int_ppc_altivec_vextuwlx i64:$rA, v4i32:$vB))
+
+    // Right-Indexed
+    (set i64:$rD, (int_ppc_altivec_vextubrx i64:$rA, v16i8:$vB))
+    (set i64:$rD, (int_ppc_altivec_vextuhrx i64:$rA, v8i16:$vB))
+    (set i64:$rD, (int_ppc_altivec_vextuwrx i64:$rA, v4i32:$vB))
+
+- Vector Insert Element Instructions: vinsertb vinsertd vinserth vinsertw
+    (set v16i8:$vD, (int_ppc_altivec_vinsertb v16i8:$vA, imm:$UIMM))
+    (set v8i16:$vD, (int_ppc_altivec_vinsertd v8i16:$vA, imm:$UIMM))
+    (set v4i32:$vD, (int_ppc_altivec_vinserth v4i32:$vA, imm:$UIMM))
+    (set v2i64:$vD, (int_ppc_altivec_vinsertw v2i64:$vA, imm:$UIMM))
+
+- Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]:
+  vclzlsbb vctzlsbb
+  . Use intrinsic:
+    (set i64:$rD, (int_ppc_altivec_vclzlsbb v16i8:$vB))
+    (set i64:$rD, (int_ppc_altivec_vctzlsbb v16i8:$vB))
+
+- Vector Count Trailing Zeros: vctzb vctzh vctzw vctzd
+  . Map to llvm cttz
+    (set v16i8:$vD, (cttz v16i8:$vB))     // vctzb
+    (set v8i16:$vD, (cttz v8i16:$vB))     // vctzh
+    (set v4i32:$vD, (cttz v4i32:$vB))     // vctzw
+    (set v2i64:$vD, (cttz v2i64:$vB))     // vctzd
+
+- Vector Extend Sign: vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+  . vextsb2w:
+    (set v4i32:$vD, (sext v4i8:$vB))
+
+    // PowerISA_V3.0:
+    do i = 0 to 3
+       VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].byte[3])
+    end
+
+  . vextsh2w:
+    (set v4i32:$vD, (sext v4i16:$vB))
+
+    // PowerISA_V3.0:
+    do i = 0 to 3
+       VR[VRT].word[i] ← EXTS32(VR[VRB].word[i].hword[1])
+    end
+
+  . vextsb2d
+    (set v2i64:$vD, (sext v2i8:$vB))
+
+    // PowerISA_V3.0:
+    do i = 0 to 1
+       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].byte[7])
+    end
+
+  . vextsh2d
+    (set v2i64:$vD, (sext v2i16:$vB))
+
+    // PowerISA_V3.0:
+    do i = 0 to 1
+       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].hword[3])
+    end
+
+  . vextsw2d
+    (set v2i64:$vD, (sext v2i32:$vB))
+
+    // PowerISA_V3.0:
+    do i = 0 to 1
+       VR[VRT].dword[i] ← EXTS64(VR[VRB].dword[i].word[1])
+    end
+
+- Vector Integer Negate: vnegw vnegd
+  . Map to llvm ineg
+    (set v4i32:$rT, (ineg v4i32:$rA))       // vnegw
+    (set v2i64:$rT, (ineg v2i64:$rA))       // vnegd
+
+- Vector Parity Byte: vprtybw vprtybd vprtybq
+  . Use intrinsic:
+    (set v4i32:$rD, (int_ppc_altivec_vprtybw v4i32:$vB))
+    (set v2i64:$rD, (int_ppc_altivec_vprtybd v2i64:$vB))
+    (set v1i128:$rD, (int_ppc_altivec_vprtybq v1i128:$vB))
+
+- Vector (Bit) Permute (Right-indexed):
+  . vbpermd: Same as "vbpermq", use VX1_Int_Ty2:
+    VX1_Int_Ty2<1484, "vbpermd", int_ppc_altivec_vbpermd, v2i64, v2i64>;
+
+  . vpermr: use VA1a_Int_Ty3
+    VA1a_Int_Ty3<59, "vpermr", int_ppc_altivec_vpermr, v16i8, v16i8, v16i8>;
+
+- Vector Rotate Left Mask/Mask-Insert: vrlwnm vrlwmi vrldnm vrldmi
+  . Use intrinsic:
+    VX1_Int_Ty<389, "vrlwnm", int_ppc_altivec_vrlwnm, v4i32>;
+    VX1_Int_Ty<133, "vrlwmi", int_ppc_altivec_vrlwmi, v4i32>;
+    VX1_Int_Ty<453, "vrldnm", int_ppc_altivec_vrldnm, v2i64>;
+    VX1_Int_Ty<197, "vrldmi", int_ppc_altivec_vrldmi, v2i64>;
+
+- Vector Shift Left/Right: vslv vsrv
+  . Use intrinsic, don't map to llvm shl and lshr, because they have different
+    semantics, e.g. vslv:
+
+      do i = 0 to 15
+         sh ← VR[VRB].byte[i].bit[5:7]
+         VR[VRT].byte[i] ← src.byte[i:i+1].bit[sh:sh+7]
+      end
+
+    VR[VRT].byte[i] is composed of 2 bytes from src.byte[i:i+1]
+
+  . VX1_Int_Ty<1860, "vslv", int_ppc_altivec_vslv, v16i8>;
+    VX1_Int_Ty<1796, "vsrv", int_ppc_altivec_vsrv, v16i8>;
+
+- Vector Multiply-by-10 (& Write Carry) Unsigned Quadword:
+  vmul10uq vmul10cuq
+  . Use intrinsic:
+    VX1_Int_Ty<513, "vmul10uq",   int_ppc_altivec_vmul10uq,  v1i128>;
+    VX1_Int_Ty<  1, "vmul10cuq",  int_ppc_altivec_vmul10cuq, v1i128>;
+
+- Vector Multiply-by-10 Extended (& Write Carry) Unsigned Quadword:
+  vmul10euq vmul10ecuq
+  . Use intrinsic:
+    VX1_Int_Ty<577, "vmul10euq",  int_ppc_altivec_vmul10euq, v1i128>;
+    VX1_Int_Ty< 65, "vmul10ecuq", int_ppc_altivec_vmul10ecuq, v1i128>;
+
+- Decimal Convert From/to National/Zoned/Signed-QWord:
+  bcdcfn. bcdcfz. bcdctn. bcdctz. bcdcfsq. bcdctsq.
+  . Use instrinstics:
+    (set v1i128:$vD, (int_ppc_altivec_bcdcfno  v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcdcfzo  v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcdctno  v1i128:$vB))
+    (set v1i128:$vD, (int_ppc_altivec_bcdctzo  v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcdcfsqo v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcdctsqo v1i128:$vB))
+
+- Decimal Copy-Sign/Set-Sign: bcdcpsgn. bcdsetsgn.
+  . Use instrinstics:
+    (set v1i128:$vD, (int_ppc_altivec_bcdcpsgno v1i128:$vA, v1i128:$vB))
+    (set v1i128:$vD, (int_ppc_altivec_bcdsetsgno v1i128:$vB, i1:$PS))
+
+- Decimal Shift/Unsigned-Shift/Shift-and-Round: bcds. bcdus. bcdsr.
+  . Use instrinstics:
+    (set v1i128:$vD, (int_ppc_altivec_bcdso  v1i128:$vA, v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
+    (set v1i128:$vD, (int_ppc_altivec_bcdsro v1i128:$vA, v1i128:$vB, i1:$PS))
+
+  . Note! Their VA is accessed only 1 byte, i.e. VA.byte[7]
+
+- Decimal (Unsigned) Truncate: bcdtrunc. bcdutrunc.
+  . Use instrinstics:
+    (set v1i128:$vD, (int_ppc_altivec_bcdso  v1i128:$vA, v1i128:$vB, i1:$PS))
+    (set v1i128:$vD, (int_ppc_altivec_bcduso v1i128:$vA, v1i128:$vB))
+
+  . Note! Their VA is accessed only 2 byte, i.e. VA.hword[3] (VA.bit[48:63])
+
+VSX:
+- QP Copy Sign: xscpsgnqp
+  . Similar to xscpsgndp
+  . (set f128:$vT, (fcopysign f128:$vB, f128:$vA)
+
+- QP Absolute/Negative-Absolute/Negate: xsabsqp xsnabsqp xsnegqp
+  . Similar to xsabsdp/xsnabsdp/xsnegdp
+  . (set f128:$vT, (fabs f128:$vB))             // xsabsqp
+    (set f128:$vT, (fneg (fabs f128:$vB)))      // xsnabsqp
+    (set f128:$vT, (fneg f128:$vB))             // xsnegqp
+
+- QP Add/Divide/Multiply/Subtract/Square-Root:
+  xsaddqp xsdivqp xsmulqp xssubqp xssqrtqp
+  . Similar to xsadddp
+  . isCommutable = 1
+    (set f128:$vT, (fadd f128:$vA, f128:$vB))   // xsaddqp
+    (set f128:$vT, (fmul f128:$vA, f128:$vB))   // xsmulqp
+
+  . isCommutable = 0
+    (set f128:$vT, (fdiv f128:$vA, f128:$vB))   // xsdivqp
+    (set f128:$vT, (fsub f128:$vA, f128:$vB))   // xssubqp
+    (set f128:$vT, (fsqrt f128:$vB)))           // xssqrtqp
+
+- Round to Odd of QP Add/Divide/Multiply/Subtract/Square-Root:
+  xsaddqpo xsdivqpo xsmulqpo xssubqpo xssqrtqpo
+  . Similar to xsrsqrtedp??
+      def XSRSQRTEDP : XX2Form<60, 74,
+                               (outs vsfrc:$XT), (ins vsfrc:$XB),
+                               "xsrsqrtedp $XT, $XB", IIC_VecFP,
+                               [(set f64:$XT, (PPCfrsqrte f64:$XB))]>;
+
+  . Define DAG Node in PPCInstrInfo.td:
+    def PPCfaddrto: SDNode<"PPCISD::FADDRTO", SDTFPBinOp, []>;
+    def PPCfdivrto: SDNode<"PPCISD::FDIVRTO", SDTFPBinOp, []>;
+    def PPCfmulrto: SDNode<"PPCISD::FMULRTO", SDTFPBinOp, []>;
+    def PPCfsubrto: SDNode<"PPCISD::FSUBRTO", SDTFPBinOp, []>;
+    def PPCfsqrtrto: SDNode<"PPCISD::FSQRTRTO", SDTFPUnaryOp, []>;
+
+    DAG patterns of each instruction (PPCInstrVSX.td):
+    . isCommutable = 1
+      (set f128:$vT, (PPCfaddrto f128:$vA, f128:$vB))   // xsaddqpo
+      (set f128:$vT, (PPCfmulrto f128:$vA, f128:$vB))   // xsmulqpo
+
+    . isCommutable = 0
+      (set f128:$vT, (PPCfdivrto f128:$vA, f128:$vB))   // xsdivqpo
+      (set f128:$vT, (PPCfsubrto f128:$vA, f128:$vB))   // xssubqpo
+      (set f128:$vT, (PPCfsqrtrto f128:$vB))            // xssqrtqpo
+
+- QP (Negative) Multiply-{Add/Subtract}: xsmaddqp xsmsubqp xsnmaddqp xsnmsubqp
+  . Ref: xsmaddadp/xsmsubadp/xsnmaddadp/xsnmsubadp
+
+  . isCommutable = 1
+    // xsmaddqp
+    [(set f128:$vT, (fma f128:$vA, f128:$vB, f128:$vTi))]>,
+    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    AltVSXFMARel;
+
+    // xsmsubqp
+    [(set f128:$vT, (fma f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    AltVSXFMARel;
+
+    // xsnmaddqp
+    [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, f128:$vTi)))]>,
+    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    AltVSXFMARel;
+
+    // xsnmsubqp
+    [(set f128:$vT, (fneg (fma f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+    RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+    AltVSXFMARel;
+
+- Round to Odd of QP (Negative) Multiply-{Add/Subtract}:
+  xsmaddqpo xsmsubqpo xsnmaddqpo xsnmsubqpo
+  . Similar to xsrsqrtedp??
+
+  . Define DAG Node in PPCInstrInfo.td:
+    def PPCfmarto: SDNode<"PPCISD::FMARTO", SDTFPTernaryOp, []>;
+
+    It looks like we only need to define "PPCfmarto" for these instructions,
+    because according to PowerISA_V3.0, these instructions perform RTO on
+    fma's result:
+        xsmaddqp(o)
+        v      ← bfp_MULTIPLY_ADD(src1, src3, src2)
+        rnd    ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+        result ← bfp_CONVERT_TO_BFP128(rnd)
+
+        xsmsubqp(o)
+        v      ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+        rnd    ← bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v)
+        result ← bfp_CONVERT_TO_BFP128(rnd)
+
+        xsnmaddqp(o)
+        v      ← bfp_MULTIPLY_ADD(src1,src3,src2)
+        rnd    ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+        result ← bfp_CONVERT_TO_BFP128(rnd)
+
+        xsnmsubqp(o)
+        v      ← bfp_MULTIPLY_ADD(src1, src3, bfp_NEGATE(src2))
+        rnd    ← bfp_NEGATE(bfp_ROUND_TO_BFP128(RO, FPSCR.RN, v))
+        result ← bfp_CONVERT_TO_BFP128(rnd)
+
+    DAG patterns of each instruction (PPCInstrVSX.td):
+    . isCommutable = 1
+      // xsmaddqpo
+      [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, f128:$vTi))]>,
+      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      AltVSXFMARel;
+
+      // xsmsubqpo
+      [(set f128:$vT, (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi)))]>,
+      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      AltVSXFMARel;
+
+      // xsnmaddqpo
+      [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, f128:$vTi)))]>,
+      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      AltVSXFMARel;
+
+      // xsnmsubqpo
+      [(set f128:$vT, (fneg (PPCfmarto f128:$vA, f128:$vB, (fneg f128:$vTi))))]>,
+      RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">,
+      AltVSXFMARel;
+
+- QP Compare Ordered/Unordered: xscmpoqp xscmpuqp
+  . ref: XSCMPUDP
+      def XSCMPUDP : XX3Form_1<60, 35,
+                               (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+                               "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>;
+
+  . No SDAG, intrinsic, builtin are required??
+    Or llvm fcmp order/unorder compare??
+
+- DP/QP Compare Exponents: xscmpexpdp xscmpexpqp
+  . No SDAG, intrinsic, builtin are required?
+
+- DP Compare ==, >=, >, !=: xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+  . I checked existing instruction "XSCMPUDP". They are different in target
+    register. "XSCMPUDP" write to CR field, xscmp*dp write to VSX register
+
+  . Use instrinsic:
+    (set i128:$XT, (int_ppc_vsx_xscmpeqdp f64:$XA, f64:$XB))
+    (set i128:$XT, (int_ppc_vsx_xscmpgedp f64:$XA, f64:$XB))
+    (set i128:$XT, (int_ppc_vsx_xscmpgtdp f64:$XA, f64:$XB))
+    (set i128:$XT, (int_ppc_vsx_xscmpnedp f64:$XA, f64:$XB))
+
+- Vector Compare Not Equal: xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+  . Similar to xvcmpeqdp:
+      defm XVCMPEQDP : XX3Form_Rcr<60, 99,
+                                 "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare,
+                                 int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>;
+
+  . So we should use "XX3Form_Rcr" to implement instrinsic
+
+- Convert DP -> QP: xscvdpqp
+  . Similar to XSCVDPSP:
+      def XSCVDPSP : XX2Form<60, 265,
+                          (outs vsfrc:$XT), (ins vsfrc:$XB),
+                          "xscvdpsp $XT, $XB", IIC_VecFP, []>;
+  . So, No SDAG, intrinsic, builtin are required??
+
+- Round & Convert QP -> DP (dword[1] is set to zero): xscvqpdp xscvqpdpo
+  . Similar to XSCVDPSP
+  . No SDAG, intrinsic, builtin are required??
+
+- Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero):
+  xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+  . According to PowerISA_V3.0, these are similar to "XSCVDPSXDS", "XSCVDPSXWS",
+    "XSCVDPUXDS", "XSCVDPUXWS"
+
+  . DAG patterns:
+    (set f128:$XT, (PPCfctidz f128:$XB))    // xscvqpsdz
+    (set f128:$XT, (PPCfctiwz f128:$XB))    // xscvqpswz
+    (set f128:$XT, (PPCfctiduz f128:$XB))   // xscvqpudz
+    (set f128:$XT, (PPCfctiwuz f128:$XB))   // xscvqpuwz
+
+- Convert (Un)Signed DWord -> QP: xscvsdqp xscvudqp
+  . Similar to XSCVSXDSP
+  . (set f128:$XT, (PPCfcfids f64:$XB))     // xscvsdqp
+    (set f128:$XT, (PPCfcfidus f64:$XB))    // xscvudqp
+
+- (Round &) Convert DP <-> HP: xscvdphp xscvhpdp
+  . Similar to XSCVDPSP
+  . No SDAG, intrinsic, builtin are required??
+
+- Vector HP -> SP: xvcvhpsp xvcvsphp
+  . Similar to XVCVDPSP:
+      def XVCVDPSP : XX2Form<60, 393,
+                          (outs vsrc:$XT), (ins vsrc:$XB),
+                          "xvcvdpsp $XT, $XB", IIC_VecFP, []>;
+  . No SDAG, intrinsic, builtin are required??
+
+- Round to Quad-Precision Integer: xsrqpi xsrqpix
+  . These are combination of "XSRDPI", "XSRDPIC", "XSRDPIM", .., because you
+    need to assign rounding mode in instruction
+  . Provide builtin?
+    (set f128:$vT, (int_ppc_vsx_xsrqpi f128:$vB))
+    (set f128:$vT, (int_ppc_vsx_xsrqpix f128:$vB))
+
+- Round Quad-Precision to Double-Extended Precision (fp80): xsrqpxp
+  . Provide builtin?
+    (set f128:$vT, (int_ppc_vsx_xsrqpxp f128:$vB))
+
+Fixed Point Facility:
+
+- Exploit cmprb and cmpeqb (perhaps for something like
+  isalpha/isdigit/isupper/islower and isspace respectivelly). This can
+  perhaps be done through a builtin.
+
+- Provide testing for cnttz[dw]
+- Insert Exponent DP/QP: xsiexpdp xsiexpqp
+  . Use intrinsic?
+  . xsiexpdp:
+    // Note: rA and rB are the unsigned integer value.
+    (set f128:$XT, (int_ppc_vsx_xsiexpdp i64:$rA, i64:$rB))
+
+  . xsiexpqp:
+    (set f128:$vT, (int_ppc_vsx_xsiexpqp f128:$vA, f64:$vB))
+
+- Extract Exponent/Significand DP/QP: xsxexpdp xsxsigdp xsxexpqp xsxsigqp
+  . Use intrinsic?
+  . (set i64:$rT, (int_ppc_vsx_xsxexpdp f64$XB))    // xsxexpdp
+    (set i64:$rT, (int_ppc_vsx_xsxsigdp f64$XB))    // xsxsigdp
+    (set f128:$vT, (int_ppc_vsx_xsxexpqp f128$vB))  // xsxexpqp
+    (set f128:$vT, (int_ppc_vsx_xsxsigqp f128$vB))  // xsxsigqp
+
+- Vector Insert Word: xxinsertw
+  - Useful for inserting f32/i32 elements into vectors (the element to be
+    inserted needs to be prepared)
+  . Note: llvm has insertelem in "Vector Operations"
+    ; yields <n x <ty>>
+    <result> = insertelement <n x <ty>> <val>, <ty> <elt>, <ty2> <idx>
+
+    But how to map to it??
+    [(set v1f128:$XT, (insertelement v1f128:$XTi, f128:$XB, i4:$UIMM))]>,
+    RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+
+  . Or use intrinsic?
+    (set v1f128:$XT, (int_ppc_vsx_xxinsertw v1f128:$XTi, f128:$XB, i4:$UIMM))
+
+- Vector Extract Unsigned Word: xxextractuw
+  - Not useful for extraction of f32 from v4f32 (the current pattern is better -
+    shift->convert)
+  - It is useful for (uint_to_fp (vector_extract v4i32, N))
+  - Unfortunately, it can't be used for (sint_to_fp (vector_extract v4i32, N))
+  . Note: llvm has extractelement in "Vector Operations"
+    ; yields <ty>
+    <result> = extractelement <n x <ty>> <val>, <ty2> <idx>
+
+    How to map to it??
+    [(set f128:$XT, (extractelement v1f128:$XB, i4:$UIMM))]
+
+  . Or use intrinsic?
+    (set f128:$XT, (int_ppc_vsx_xxextractuw v1f128:$XB, i4:$UIMM))
+
+- Vector Insert Exponent DP/SP: xviexpdp xviexpsp
+  . Use intrinsic
+    (set v2f64:$XT, (int_ppc_vsx_xviexpdp v2f64:$XA, v2f64:$XB))
+    (set v4f32:$XT, (int_ppc_vsx_xviexpsp v4f32:$XA, v4f32:$XB))
+
+- Vector Extract Exponent/Significand DP/SP: xvxexpdp xvxexpsp xvxsigdp xvxsigsp
+  . Use intrinsic
+    (set v2f64:$XT, (int_ppc_vsx_xvxexpdp v2f64:$XB))
+    (set v4f32:$XT, (int_ppc_vsx_xvxexpsp v4f32:$XB))
+    (set v2f64:$XT, (int_ppc_vsx_xvxsigdp v2f64:$XB))
+    (set v4f32:$XT, (int_ppc_vsx_xvxsigsp v4f32:$XB))
+
+- Test Data Class SP/DP/QP: xststdcsp xststdcdp xststdcqp
+  . No SDAG, intrinsic, builtin are required?
+    Because it seems that we have no way to map BF field?
+
+    Instruction Form: [PO T XO B XO BX TX]
+    Asm: xststd* BF,XB,DCMX
+
+    BF is an index to CR register field.
+
+- Vector Test Data Class SP/DP: xvtstdcsp xvtstdcdp
+  . Use intrinsic
+    (set v4f32:$XT, (int_ppc_vsx_xvtstdcsp v4f32:$XB, i7:$DCMX))
+    (set v2f64:$XT, (int_ppc_vsx_xvtstdcdp v2f64:$XB, i7:$DCMX))
+
+- Maximum/Minimum Type-C/Type-J DP: xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+  . PowerISA_V3.0:
+    "xsmaxcdp can be used to implement the C/C++/Java conditional operation
+     (x>y)?x:y for single-precision and double-precision arguments."
+
+    Note! c type and j type have different behavior when:
+    1. Either input is NaN
+    2. Both input are +-Infinity, +-Zero
+
+  . dtype map to llvm fmaxnum/fminnum
+    jtype use intrinsic
+
+  . xsmaxcdp xsmincdp
+    (set f64:$XT, (fmaxnum f64:$XA, f64:$XB))
+    (set f64:$XT, (fminnum f64:$XA, f64:$XB))
+
+  . xsmaxjdp xsminjdp
+    (set f64:$XT, (int_ppc_vsx_xsmaxjdp f64:$XA, f64:$XB))
+    (set f64:$XT, (int_ppc_vsx_xsminjdp f64:$XA, f64:$XB))
+
+- Vector Byte-Reverse H/W/D/Q Word: xxbrh xxbrw xxbrd xxbrq
+  . Use intrinsic
+    (set v8i16:$XT, (int_ppc_vsx_xxbrh v8i16:$XB))
+    (set v4i32:$XT, (int_ppc_vsx_xxbrw v4i32:$XB))
+    (set v2i64:$XT, (int_ppc_vsx_xxbrd v2i64:$XB))
+    (set v1i128:$XT, (int_ppc_vsx_xxbrq v1i128:$XB))
+
+- Vector Permute: xxperm xxpermr
+  . I have checked "PPCxxswapd" in PPCInstrVSX.td, but they are different
+  . Use intrinsic
+    (set v16i8:$XT, (int_ppc_vsx_xxperm v16i8:$XA, v16i8:$XB))
+    (set v16i8:$XT, (int_ppc_vsx_xxpermr v16i8:$XA, v16i8:$XB))
+
+- Vector Splat Immediate Byte: xxspltib
+  . Similar to XXSPLTW:
+      def XXSPLTW : XX2Form_2<60, 164,
+                           (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
+                           "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
+
+  . No SDAG, intrinsic, builtin are required?
+
+- Load/Store Vector: lxv stxv
+  . Has likely SDAG match:
+    (set v?:$XT, (load ix16addr:$src))
+    (set v?:$XT, (store ix16addr:$dst))
+
+  . Need define ix16addr in PPCInstrInfo.td
+    ix16addr: 16-byte aligned, see "def memrix16" in PPCInstrInfo.td
+
+- Load/Store Vector Indexed: lxvx stxvx
+  . Has likely SDAG match:
+    (set v?:$XT, (load xoaddr:$src))
+    (set v?:$XT, (store xoaddr:$dst))
+
+- Load/Store DWord: lxsd stxsd
+  . Similar to lxsdx/stxsdx:
+    def LXSDX : XX1Form<31, 588,
+                        (outs vsfrc:$XT), (ins memrr:$src),
+                        "lxsdx $XT, $src", IIC_LdStLFD,
+                        [(set f64:$XT, (load xoaddr:$src))]>;
+
+  . (set f64:$XT, (load ixaddr:$src))
+    (set f64:$XT, (store ixaddr:$dst))
+
+- Load/Store SP, with conversion from/to DP: lxssp stxssp
+  . Similar to lxsspx/stxsspx:
+    def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
+                         "lxsspx $XT, $src", IIC_LdStLFD,
+                         [(set f32:$XT, (load xoaddr:$src))]>;
+
+  . (set f32:$XT, (load ixaddr:$src))
+    (set f32:$XT, (store ixaddr:$dst))
+
+- Load as Integer Byte/Halfword & Zero Indexed: lxsibzx lxsihzx
+  . Similar to lxsiwzx:
+    def LXSIWZX : XX1Form<31, 12, (outs vsfrc:$XT), (ins memrr:$src),
+                          "lxsiwzx $XT, $src", IIC_LdStLFD,
+                          [(set f64:$XT, (PPClfiwzx xoaddr:$src))]>;
+
+  . (set f64:$XT, (PPClfiwzx xoaddr:$src))
+
+- Store as Integer Byte/Halfword Indexed: stxsibx stxsihx
+  . Similar to stxsiwx:
+    def STXSIWX : XX1Form<31, 140, (outs), (ins vsfrc:$XT, memrr:$dst),
+                          "stxsiwx $XT, $dst", IIC_LdStSTFD,
+                          [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
+
+  . (PPCstfiwx f64:$XT, xoaddr:$dst)
+
+- Load Vector Halfword*8/Byte*16 Indexed: lxvh8x lxvb16x
+  . Similar to lxvd2x/lxvw4x:
+    def LXVD2X : XX1Form<31, 844,
+                         (outs vsrc:$XT), (ins memrr:$src),
+                         "lxvd2x $XT, $src", IIC_LdStLFD,
+                         [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
+
+  . (set v8i16:$XT, (int_ppc_vsx_lxvh8x xoaddr:$src))
+    (set v16i8:$XT, (int_ppc_vsx_lxvb16x xoaddr:$src))
+
+- Store Vector Halfword*8/Byte*16 Indexed: stxvh8x stxvb16x
+  . Similar to stxvd2x/stxvw4x:
+    def STXVD2X : XX1Form<31, 972,
+                         (outs), (ins vsrc:$XT, memrr:$dst),
+                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
+                         [(store v2f64:$XT, xoaddr:$dst)]>;
+
+  . (store v8i16:$XT, xoaddr:$dst)
+    (store v16i8:$XT, xoaddr:$dst)
+
+- Load/Store Vector (Left-justified) with Length: lxvl lxvll stxvl stxvll
+  . Likely needs an intrinsic
+  . (set v?:$XT, (int_ppc_vsx_lxvl xoaddr:$src))
+    (set v?:$XT, (int_ppc_vsx_lxvll xoaddr:$src))
+
+  . (int_ppc_vsx_stxvl xoaddr:$dst))
+    (int_ppc_vsx_stxvll xoaddr:$dst))
+
+- Load Vector Word & Splat Indexed: lxvwsx
+  . Likely needs an intrinsic
+  . (set v?:$XT, (int_ppc_vsx_lxvwsx xoaddr:$src))
+
+Atomic operations (l[dw]at, st[dw]at):
+- Provide custom lowering for common atomic operations to use these
+  instructions with the correct Function Code
+- Ensure the operands are in the correct register (i.e. RT+1, RT+2)
+- Provide builtins since not all FC's necessarily have an existing LLVM
+  atomic operation
+
+Load Doubleword Monitored (ldmx):
+- Investigate whether there are any uses for this. It seems to be related to
+  Garbage Collection so it isn't likely to be all that useful for most
+  languages we deal with.
+
+Move to CR from XER Extended (mcrxrx):
+- Is there a use for this in LLVM?
+
+Fixed Point Facility:
+
+- Copy-Paste Facility: copy copy_first cp_abort paste paste. paste_last
+  . Use instrinstics:
+    (int_ppc_copy_first i32:$rA, i32:$rB)
+    (int_ppc_copy i32:$rA, i32:$rB)
+
+    (int_ppc_paste i32:$rA, i32:$rB)
+    (int_ppc_paste_last i32:$rA, i32:$rB)
+
+    (int_cp_abort)
+
+- Message Synchronize: msgsync
+- SLB*: slbieg slbsync
+- stop
+  . No instrinstics
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 000000000000..a637dd11f810
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,37 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getThePPC32Target() {
+  static Target ThePPC32Target;
+  return ThePPC32Target;
+}
+Target &llvm::getThePPC64Target() {
+  static Target ThePPC64Target;
+  return ThePPC64Target;
+}
+Target &llvm::getThePPC64LETarget() {
+  static Target ThePPC64LETarget;
+  return ThePPC64LETarget;
+}
+
+extern "C" void LLVMInitializePowerPCTargetInfo() {
+  RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32",
+                                                 "PowerPC 32");
+
+  RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64",
+                                                   "PowerPC 64");
+
+  RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> Z(
+      getThePPC64LETarget(), "ppc64le", "PowerPC 64 LE");
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt b/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt
new file mode 100644
index 000000000000..a70582aca398
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/p9-instrs.txt
@@ -0,0 +1,442 @@
+Content:
+========
+. Remaining Instructions (Total 56 Instructions, include 2 unknow instructions) 
+. Done (Total 155 Instructions: 101 VSX, 54 Altivec)
+
+//------------------------------------------------------------------------------
+//. Remaining Instructions
+//------------------------------------------------------------------------------
+GCC reference: https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+// Add PC Immediate Shifted  DX-form p69
+[PO RT d1 d0 XO d2]         addpcis     RT,D
+                            subpcis Rx,value = addpcis Rx,-value
+
+// 6.17.2 Decimal Integer Format Conversion Instructions
+
+// Decimal Convert From National VX-form p352
+[PO VRT EO VRB 1 PS XO]     bcdcfn.     VRT,VRB,PS
+
+// Decimal Convert From Zoned VX-form p353
+[PO VRT EO VRB 1 PS XO]     bcdcfz.     VRT,VRB,PS
+
+// Decimal Convert To National VX-form p354
+[PO VRT EO VRB 1 / XO]      bcdctn.     VRT,VRB
+
+// Decimal Convert To Zoned VX-form p355
+[PO VRT EO VRB 1 PS XO]     bcdctz.     VRT,VRB,PS
+
+// Decimal Convert From Signed Quadword VX-form p356
+[PO VRT EO VRB 1 PS XO]     bcdcfsq.    VRT,VRB,PS
+
+// Decimal Convert To Signed Quadword VX-form p356
+[PO VRT EO VRB 1 / XO]      bcdctsq.    VRT,VRB
+
+// 6.17.3 Decimal Integer Sign Manipulation Instructions
+
+// Decimal Copy Sign VX-form p358
+[PO VRT VRA VRB XO]         bcdcpsgn.   VRT,VRA,VRB
+
+// Decimal Set Sign VX-form p358
+[PO VRT EO VRB 1 PS XO]     bcdsetsgn.  VRT,VRB,PS
+
+// Decimal Shift VX-form p359
+[PO VRT VRA VRB 1 PS XO]    bcds.       VRT,VRA,VRB,PS
+
+// Decimal Unsigned Shift VX-form p360
+[PO VRT VRA VRB 1 / XO]     bcdus.      VRT,VRA,VRB
+
+// Decimal Shift and Round VX-form p361
+[PO VRT VRA VRB 1 PS XO]    bcdsr.      VRT,VRA,VRB,PS
+
+// 6.17.5 Decimal Integer Truncate Instructions
+
+// Decimal Truncate VX-form p362
+[PO VRT VRA VRB 1 PS XO]    bcdtrunc.   VRT,VRA,VRB,PS
+
+// Decimal Unsigned Truncate VX-form p363
+[PO VRT VRA VRB 1 / XO]     bcdutrunc.  VRT,VRA,VRB
+
+// 3.3.10.1 Character-Type Compare Instructions
+
+// Compare Ranged Byte X-form p87
+[PO BF / L RA RB XO /]      cmprb       BF,L,RA,RB
+
+// Compare Equal Byte X-form p88
+[PO BF // RA RB XO /]       cmpeqb      BF,RA,RB
+
+// 3.3.13 Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Word X-form p95
+[PO RS RA /// XO Rc]        cnttzw(.)   RA,RS
+
+// 3.3.13.1 64-bit Fixed-Point Logical Instructions
+
+// Count Trailing Zeros Doubleword  X-form p98
+[PO RS RA /// XO Rc]        cnttzd(.)   RA,RS
+
+// 4.4 Copy-Paste Facility
+
+// Copy X-form p858
+[PO /// L RA RB XO /]       copy        RA,RB,L
+                            copy_first = copy RA, RB, 1
+// CP_Abort p860
+[PO /// /// /// XO /]       cp_abort
+
+// Paste p859
+[PO /// L RA RB XO Rc]      paste(.)    RA,RB,L
+                            paste_last = paste RA,RB,1
+
+// 3.3.9 Fixed-Point Arithmetic Instructions
+
+// Deliver A Random Number X-form p79
+[PO RT /// L /// XO /]      darn        RT,L
+
+// Multiply-Add High Doubleword VA-form p81
+[PO RT RA RB RC XO]         maddhd      RT,RA.RB,RC
+
+// Multiply-Add High Doubleword Unsigned VA-form  p81
+[PO RT RA RB RC XO]         maddhdu     RT,RA.RB,RC
+
+// Multiply-Add Low Doubleword VA-form p81
+[PO RT RA RB RC XO]         maddld      RT,RA.RB,RC
+
+// Modulo Signed Word X-form p76
+[PO RT RA RB XO /]          modsw       RT,RA,RB
+
+// Modulo Unsigned Word X-form p76
+[PO RT RA RB XO /]          moduw       RT,RA,RB 
+
+// Modulo Signed Doubleword X-form p84
+[PO RT RA RB XO /]          modsd       RT,RA,RB
+
+// Modulo Unsigned Doubleword X-form p84
+[PO RT RA RB XO /]          modud       RT,RA,RB
+
+
+// DFP Test Significance Immediate [Quad] X-form p204
+[PO BF / UIM FRB XO /]      dtstsfi     BF,UIM,FRB
+[PO BF / UIM FRBp XO /]     dtstsfiq    BF,UIM,FRBp
+
+// 3.3.14.2.1 64-bit Fixed-Point Shift Instructions
+
+// Extend-Sign Word and Shift Left Immediate XS-form p109
+[PO RS RA sh XO sh Rc]      extswsli(.) RA,RS,SH
+
+// 4.5.1 Load Atomic
+
+// Load Word Atomic   X-form p864
+[PO RT RA FC XO /]          lwat        RT,RA,FC
+
+// Load Doubleword Atomic X-form p864
+[PO RT RA FC XO /]          ldat        RT,RA,FC
+
+// 4.5.2 Store Atomic
+
+// Store Word Atomic   X-form p866
+[PO RS RA FC XO /]          stwat       RS,RA,FC
+
+// Store Doubleword Atomic   X-form p866
+[PO RS RA FC XO /]          stdat       RS,RA,FC
+
+// 3.3.2.1 64-bit Fixed-Point Load Instructions 
+
+// Load Doubleword Monitored Indexed X-form p54
+[PO RT RA RB XO /]          ldmx        RT,RA,RB
+
+// 3.3.16 Move To/From Vector-Scalar Register Instructions
+
+// Move From VSR Lower Doubleword XX1-form p111
+[PO S RA /// XO SX]         mfvsrld     RA,XS
+
+// Move To VSR Double Doubleword XX1-form p114
+[PO T RA RB XO TX]          mtvsrdd     XT,RA,RB
+
+// Move To VSR Word & Splat XX1-form p115
+[PO T RA /// XO TX]         mtvsrws     XT,RA
+
+// Move to CR from XER Extended X-form p119
+[PO BF // /// /// XO /]     mcrxrx      BF
+
+// Set Boolean X-form p121
+[PO RT BFA // /// XO /]     setb        RT,BFA
+
+// Message Synchronize X-form p1126
+[PO /// /// /// XO /]       msgsync
+
+// SLB Invalidate Entry Global  X-form p1026
+[PO RS /// RB XO /]         slbieg      RS,RB 
+
+// SLB Synchronize  X-form p1031
+[PO /// /// /// XO /]       slbsync
+
+// 3.3.2.1 Power-Saving Mode Instruction
+
+// stop    XL-form p957
+[PO /// /// /// XO /]       stop
+
+// 4.6.4 Wait Instruction
+// Wait X-form p880
+[PO /// WC /// /// XO /]    wait
+
+// Unknow Instructions:
+urfid
+- gcc's implementation:
+    {"urfid",	XL(19,306),	0xffffffff,  POWER9,	PPCNONE,	{0}},
+    (4c 00 02 64|64 02 00 4c) 	urfid
+
+rmieg
+- gcc's implementation: 
+    {"rmieg",	X(31,882),	XRTRA_MASK,  POWER9,	PPCNONE,	{RB}},
+    (7c 00 f6 e4|e4 f6 00 7c) 	rmieg   r30
+
+//------------------------------------------------------------------------------
+//. Done:
+//------------------------------------------------------------------------------
+
+//======================================
+"vsx instructions"
+
+//--------------------------------------
+"7.6.1.2.1 VSX Scalar Move Instructions"
+// VSX Scalar Quad-Precision Move Instructions
+
+// VSX Scalar Copy Sign Quad-Precision X-form p.553
+[PO VRT VRA VRB XO /] xscpsgnqp
+
+// VSX Scalar Absolute Quad-Precision X-form 531
+// VSX Scalar Negate Quad-Precision X-form 627
+// VSX Scalar Negative Absolute Quad-Precision X-form 626
+[PO VRT XO VRB XO /] xsabsqp xsnegqp xsnabsqp
+
+//--------------------------------------
+"7.6.1.3 VSX Floating-Point Arithmetic Instructions"
+
+// VSX Scalar Quad-Precision Elementary Arithmetic
+
+// VSX Scalar Add Quad-Precision [using round to Odd] X-form 539
+// VSX Scalar Divide Quad-Precision [using round to Odd] X-form 584
+// VSX Scalar Multiply Quad-Precision [using round to Odd] X-form 622
+[PO VRT VRA VRB XO RO] xsaddqp xsaddqpo xsdivqp xsdivqpo xsmulqp xsmulqpo
+
+// VSX Scalar Square Root Quad-Precision [using round to Odd] X-form 662
+// VSX Scalar Subtract Quad-Precision [using round to Odd] X-form 667
+                       xssubqp xssubqpo
+
+[PO VRT XO VRB XO RO] xssqrtqp xssqrtqpo
+
+// VSX Scalar Quad-Precision Multiply-Add Arithmetic Instructions
+
+// VSX Scalar Multiply-Add Quad-Precision [using round to Odd] X-form 596
+// VSX Scalar Multiply-Subtract Quad-Precision [using round to Odd] X-form 617
+// VSX Scalar Negative Multiply-Add Quad-Precision [using round to Odd] X-form 636
+// VSX Scalar Negative Multiply-Subtract Quad-Precision [using round to Odd]
+// X-form 645
+[PO VRT VRA VRB XO RO] xsmaddqp xsmaddqpo xsmsubqp xsmsubqpo 
+                       xsnmaddqp xsnmaddqpo xsnmsubqp xsnmsubqpo
+
+22
+//--------------------------------------
+"7.6.1.4 VSX Floating-Point Compare Instructions"
+
+// VSX Scalar Quad-Precision Compare Instructions
+
+// VSX Scalar Compare Ordered Quad-Precision X-form 549
+// VSX Scalar Compare Unordered Quad-Precision X-form 552
+[PO BF // VRA VRB XO /] xscmpoqp xscmpuqp 
+
+"7.6.1.8 VSX Scalar Floating-Point Support Instructions"
+// VSX Scalar Compare Exponents Quad-Precision X-form p. 541 542
+[PO BF // A B XO AX BX /] xscmpexpdp 
+[PO BF // VRA VRB XO /] xscmpexpqp
+
+// VSX Scalar Compare DP, XX3-form, p.543 544 545
+// VSX Scalar Compare Equal Double-Precision, 
+[PO T A B XO AX BX TX]  xscmpeqdp xscmpgedp xscmpgtdp xscmpnedp
+
+// VSX Vector Compare Not Equal Double-Precision XX3-form 691
+[PO T A B Rc XO AX BX TX] xvcmpnedp xvcmpnedp. xvcmpnesp xvcmpnesp.
+
+//--------------------------------------
+"7.6.1.5 VSX FP-FP Conversion Instructions"
+// VSX Scalar Quad-Precision Floating-Point Conversion Instructions
+
+// VSX Scalar round & Convert Quad-Precision format to Double-Precision format
+// [using round to Odd] X-form 567
+[PO VRT XO VRB XO /] xscvqpdp xscvqpdpo (actually [PO VRT XO VRB XO RO])
+[PO VRT XO VRB XO /] xscvdpqp
+
+// VSX Scalar Quad-Precision Convert to Integer Instructions
+
+// VSX Scalar truncate & Convert Quad-Precision format to Signed Doubleword format
+// 568 570 572 574
+[PO VRT XO VRB XO /] xscvqpsdz xscvqpswz xscvqpudz xscvqpuwz
+576 = 580            xscvsdqp xscvudqp
+
+"7.6.1.7 VSX Round to Floating-Point Integer Instructions"
+// VSX Scalar round & Convert Double-Precision format to Half-Precision format
+// XX2-form 554 566
+[PO T XO B XO BX TX] xscvdphp xscvhpdp
+
+// VSX Vector Convert Half-Precision format to Single-Precision format
+// XX2-form 703 705
+[PO T XO B XO BX TX] xvcvhpsp xvcvsphp
+
+// VSX Scalar Round to Quad-Precision Integer [with Inexact] Z23-form 654
+[PO VRT /// R VRB RMC XO EX] xsrqpi xsrqpix
+
+// VSX Scalar Round Quad-Precision to Double-Extended Precision Z23-form 656
+[PO VRT /// R VRB RMC XO /] xsrqpxp 
+def XSRQPXP : Z23Form_1<63, 37,
+                        (outs vrrc:$vT), (ins u5imm:$R, vrrc:$vB, u2imm:$RMC),
+                        "xsrqpxp $vT, $R, $vB, $RMC"), IIC_VecFP, []>;
+
+27~28
+//--------------------------------------
+// VSX Scalar Insert Exponent Double-Precision X-form 588
+// VSX Scalar Insert Exponent Quad-Precision X-form 589
+[PO VT rA rB XO /]  xsiexpdp 
+[PO VRT VRA VRB XO /]  xsiexpqp
+
+// VSX Vector Insert Exponent Double-Precision XX3-form 722
+[PO T A B XO AX BX TX] xviexpdp xviexpsp
+
+// VSX Vector Extract Unsigned Word XX2-form 788
+// VSX Vector Insert Word XX2-form
+[PO T / UIM B XO BX TX] xxextractuw xxinsertw
+
+// VSX Scalar Extract Exponent Double-Precision XX2-form 676
+[PO BF DCMX B XO BX /]  
+[PO T XO B XO BX /] xsxexpdp xsxsigdp
+// X-form
+[PO VRT XO VRB XO /] xsxexpqp xsxsigqp
+
+// VSX Vector Extract Exponent Double-Precision XX2-form 784
+[PO T XO B XO BX TX] xvxexpdp xvxexpsp
+
+// VSX Vector Extract Significand Double-Precision XX2-form 785
+[PO T XO B XO BX TX] xvxsigdp xvxsigsp
+
+//--------------------------------------
+// VSX Scalar Test Data Class Double-Precision XX2-form p673
+// VSX Scalar Test Data Class Quad-Precision X-form 674
+// VSX Scalar Test Data Class Single-Precision XX2-form 675
+[PO BF DCMX B XO BX /]  xststdcdp xststdcsp
+[PO BF DCMX VRB XO /]   xststdcqp 
+
+// VSX Vector Test Data Class Double-Precision XX2-form 782 783
+[PO T dx B XO dc XO dm BX TX] xvtstdcdp xvtstdcsp 
+
+//--------------------------------------
+// VSX Scalar Maximum Type-C Double-Precision XX3-form 601 ~ 609
+[PO T A B XO AX BX TX] xsmaxcdp xsmaxjdp xsmincdp xsminjdp
+
+//--------------------------------------
+// VSX Vector Byte-Reverse Doubleword XX2-form 786 787
+[PO T XO B XO BX TX] xxbrd xxbrh xxbrq xxbrw
+
+// VSX Vector Permute XX3-form 794
+[PO T A B XO AX BX TX] xxperm xxpermr
+
+// VSX Vector Splat Immediate Byte 796 x-form
+[PO T EO IMM8 XO TX] xxspltib   <= sign or unsigned?
+
+30
+//--------------------------------------
+// Load VSX Vector DQ-form 511
+[PO T RA DQ TX XO] lxv 
+
+// Store VSX Vector DQ-form 526
+[PO S RA DQ SX XO] stxv
+
+// Load VSX Scalar Doubleword DS-form 499
+// Load VSX Scalar Single DS-form 504
+[PO VRT RA DS XO] lxsd lxssp 
+
+// Store VSX Scalar Doubleword DS-form 517
+// Store VSX Scalar Single DS-form 520
+[PO VRT RA DS XO] stxsd stxssp
+
+
+// Load VSX Vector Indexed X-form 511
+// Load VSX Scalar as Integer Byte & Zero Indexed X-form 501
+// Load VSX Vector Byte*16 Indexed X-form 506
+// Load VSX Vector with Length X-form 508
+// Load VSX Vector Left-justified with Length X-form 510
+// Load VSX Vector Halfword*8 Indexed X-form 514
+// Load VSX Vector Word & Splat Indexed X-form 516
+[PO T RA RB XO TX] lxvx lxsibzx lxsihzx lxvb16x lxvl lxvll lxvh8x lxvwsx
+
+// Store VSX Scalar as Integer Byte Indexed X-form 518
+// Store VSX Scalar as Integer Halfword Indexed X-form 518
+// Store VSX Vector Byte*16 Indexed X-form 522
+// Store VSX Vector Halfword*8 Indexed X-form 524
+// Store VSX Vector with Length X-form 526
+// Store VSX Vector Left-justified with Length X-form 528
+// Store VSX Vector Indexed X-form 529
+[PO S RA RB XO SX] stxsibx stxsihx stxvb16x stxvh8x stxvl stxvll stxvx
+
+21
+
+//--------------------------------------
+". vector instructions"
+
+[1] PowerISA-v3.0 p.933 - Table 1, and Chapter 6. Vector Facility (altivec)
+[2] https://sourceware.org/ml/binutils/2015-11/msg00071.html
+
+//--------------------------------------
+New patch:
+// vector bit, p.367, 6.16 Vector Bit Permute Instruction
+[PO VRT VRA VRB XO] vbpermd, (existing: vbpermq)
+
+// vector permute, p.280
+[PO VRT VRA VRB VRC XO] vpermr
+
+// vector rotate left, p.341
+[PO VRT VRA VRB XO] vrlwnm vrlwmi vrldnm vrldmi
+
+// vector shift, p.285
+[PO VRT VRA VRB XO] vslv vsrv
+
+// vector multiply-by-10, p.375
+[PO VRT VRA /// XO] vmul10cuq vmul10uq
+[PO VRT VRA VRB XO] vmul10ecuq vmul10euq 
+
+12
+//--------------------------------------
+http://reviews.llvm.org/D15887 + ext + neg + prty - vbpermd
+// vector count leading/trailing zero
+. new vx-form: p.31, 1.6.14 VX-FORM
+[PO RT EO VRB XO] vclzlsbb vctzlsbb (p.363)
+
+// Vector Count Trailing Zeros Instructions, 362
+[PO VRT EO VRB XO] vctzb vctzh vctzw vctzd (v16i8 v8i16 v4i32 v2i64)
+
+// vector extend sign (p.314)
+[PO VRT EO VRB XO] vextsb2w vextsh2w vextsb2d vextsh2d vextsw2d
+
+// vector negate, p.313
+[PO VRT EO VRB XO] vnegd vnegw
+
+// vector parity, p.335
+[PO VRT EO VRB XO] vprtybd vprtybq vprtybw
+
+16
+//--------------------------------------
+// vector compare, p.330
+[PO VRT VRA VRB RC XO] vcmpneb vcmpneb. vcmpneh vcmpneh. vcmpnew vcmpnew.
+                       vcmpnezb vcmpnezb. vcmpnezh vcmpnezh. vcmpnezw vcmpnezw.
+12
+//--------------------------------------
+http://reviews.llvm.org/D15917 + insert
+// vector extract (p.287) ref: vspltb (v2.07, p.227)
+// vector insert, p.288
+[PO VRT / UIM VRB XO] vinsertb vinsertd vinserth vinsertw
+
+// Vector Extract Unsigned
+[PO VRT / UIM VRB XO] vextractub vextractuh vextractuw vextractd
+
+// p.364: Vector Extract Unsigned Left/Right-Indexed
+[PO RT RA VRB XO] vextublx vextubrx vextuhlx vextuhrx vextuwlx vextuwrx 
+
+14
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
new file mode 100644
index 000000000000..f8ef142255c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -0,0 +1,91 @@
+//===-- RISCVAsmBackend.cpp - RISCV Assembler Backend ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class RISCVAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+  bool Is64Bit;
+
+public:
+  RISCVAsmBackend(uint8_t OSABI, bool Is64Bit)
+      : MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
+  ~RISCVAsmBackend() override {}
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override { return 1; }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+
+    llvm_unreachable("RISCVAsmBackend::relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // Once support for the compressed instruction set is added, we will be able
+  // to conditionally support 16-bit NOPs
+  if ((Count % 4) != 0)
+    return false;
+
+  // The canonical nop on RISC-V is addi x0, x0, 0
+  for (uint64_t i = 0; i < Count; i += 4)
+    OW->write32(0x13);
+
+  return true;
+}
+
+void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                 unsigned DataSize, uint64_t Value,
+                                 bool IsPCRel) const {
+  return;
+}
+
+MCObjectWriter *
+RISCVAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createRISCVELFObjectWriter(OS, OSABI, Is64Bit);
+}
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createRISCVAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  return new RISCVAsmBackend(OSABI, TT.isArch64Bit());
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
new file mode 100644
index 000000000000..4f085d31a267
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -0,0 +1,47 @@
+//===-- RISCVELFObjectWriter.cpp - RISCV ELF Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class RISCVELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  RISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit);
+
+  ~RISCVELFObjectWriter() override;
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+}
+
+RISCVELFObjectWriter::RISCVELFObjectWriter(uint8_t OSABI, bool Is64Bit)
+    : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_RISCV,
+                              /*HasRelocationAddend*/ false) {}
+
+RISCVELFObjectWriter::~RISCVELFObjectWriter() {}
+
+unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
+                                            const MCValue &Target,
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+  llvm_unreachable("invalid fixup kind!");
+}
+
+MCObjectWriter *llvm::createRISCVELFObjectWriter(raw_pwrite_stream &OS,
+                                                 uint8_t OSABI, bool Is64Bit) {
+  MCELFObjectTargetWriter *MOTW = new RISCVELFObjectWriter(OSABI, Is64Bit);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian*/ true);
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
new file mode 100644
index 000000000000..b164df8b595a
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -0,0 +1,25 @@
+//===-- RISCVMCAsmInfo.cpp - RISCV Asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the RISCVMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+void RISCVMCAsmInfo::anchor() {}
+
+RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
+  PointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4;
+  CommentString = "#";
+  AlignmentIsInBytes = false;
+  SupportsDebugInformation = true;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
new file mode 100644
index 000000000000..901a1eba8af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- RISCVMCAsmInfo.h - RISCV Asm Info ----------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the RISCVMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class RISCVMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit RISCVMCAsmInfo(const Triple &TargetTriple);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
new file mode 100644
index 000000000000..b2ed13758d41
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -0,0 +1,91 @@
+//===-- RISCVMCCodeEmitter.cpp - Convert RISCV code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RISCVMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class RISCVMCCodeEmitter : public MCCodeEmitter {
+  RISCVMCCodeEmitter(const RISCVMCCodeEmitter &) = delete;
+  void operator=(const RISCVMCCodeEmitter &) = delete;
+  MCContext &Ctx;
+
+public:
+  RISCVMCCodeEmitter(MCContext &ctx) : Ctx(ctx) {}
+
+  ~RISCVMCCodeEmitter() override {}
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  /// TableGen'erated function for getting the binary encoding for an
+  /// instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// Return binary encoding of operand. If the machine operand requires
+  /// relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new RISCVMCCodeEmitter(Ctx);
+}
+
+void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  // For now, we only support RISC-V instructions with 32-bit length
+  uint32_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+  support::endian::Writer<support::little>(OS).write(Bits);
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned
+RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const {
+
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  llvm_unreachable("Unhandled expression!");
+  return 0;
+}
+
+#include "RISCVGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
new file mode 100644
index 000000000000..4fc69a7fcaba
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -0,0 +1,59 @@
+//===-- RISCVMCTargetDesc.cpp - RISCV Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file provides RISCV-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RISCVMCTargetDesc.h"
+#include "RISCVMCAsmInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "RISCVGenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "RISCVGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createRISCVMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitRISCVMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitRISCVMCRegisterInfo(X, RISCV::X1_32);
+  return X;
+}
+
+static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
+  return MAI;
+}
+
+extern "C" void LLVMInitializeRISCVTargetMC() {
+  for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
+    RegisterMCAsmInfoFn X(*T, createRISCVMCAsmInfo);
+    TargetRegistry::RegisterMCInstrInfo(*T, createRISCVMCInstrInfo);
+    TargetRegistry::RegisterMCRegInfo(*T, createRISCVMCRegisterInfo);
+    TargetRegistry::RegisterMCAsmBackend(*T, createRISCVAsmBackend);
+    TargetRegistry::RegisterMCCodeEmitter(*T, createRISCVMCCodeEmitter);
+  }
+}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
new file mode 100644
index 000000000000..ddc3bf350452
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -0,0 +1,58 @@
+//===-- RISCVMCTargetDesc.h - RISCV Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides RISCV specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
+
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class Triple;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheRISCV32Target();
+Target &getTheRISCV64Target();
+
+MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+
+MCAsmBackend *createRISCVAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU,
+                                    const MCTargetOptions &Options);
+
+MCObjectWriter *createRISCVELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI,
+                                           bool Is64Bit);
+}
+
+// Defines symbolic names for RISC-V registers.
+#define GET_REGINFO_ENUM
+#include "RISCVGenRegisterInfo.inc"
+
+// Defines symbolic names for RISC-V instructions.
+#define GET_INSTRINFO_ENUM
+#include "RISCVGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/RISCV.td b/contrib/llvm/lib/Target/RISCV/RISCV.td
new file mode 100644
index 000000000000..14838309a1bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCV.td
@@ -0,0 +1,27 @@
+//===-- RISCV.td - Describe the RISCV Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "RISCVRegisterInfo.td"
+include "RISCVInstrInfo.td"
+
+
+def RISCVInstrInfo : InstrInfo;
+
+def Feature64Bit   : SubtargetFeature<"64bit", "HasRV64", "true",
+                                      "Implements RV64">;
+
+def : ProcessorModel<"generic-rv32", NoSchedModel, []>;
+
+def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
+
+def RISCV : Target {
+  let InstructionSet = RISCVInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
new file mode 100644
index 000000000000..1e9bc3bf9bc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -0,0 +1,152 @@
+//===-- RISCVInstrFormats.td - RISCV Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+//  These instruction format definitions are structured to match the
+//  description in the RISC-V User-Level ISA specification as closely as
+//  possible. For instance, the specification describes instructions with the
+//  MSB (31st bit) on the left and the LSB (0th bit) on the right. This is
+//  reflected in the order of parameters to each instruction class.
+//
+//  One area of divergence is in the description of immediates. The
+//  specification describes immediate encoding in terms of bit-slicing
+//  operations on the logical value represented. The immediate argument to
+//  these instruction formats instead represents the bit sequence that will be
+//  inserted into the instruction. e.g. although JAL's immediate is logically
+//  a 21-bit value (where the LSB is always zero), we describe it as an imm20
+//  to match how it is encoded.
+//
+//===----------------------------------------------------------------------===//
+
+class RISCVInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+  let Size = 4;
+
+  bits<7> Opcode = 0;
+
+  let Inst{6-0} = Opcode;
+
+  let Namespace = "RISCV";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern> {
+  let isPseudo = 1;
+}
+
+class FR<bits<7> funct7, bits<3> funct3, bits<7> opcode, dag outs, dag ins,
+         string asmstr, list<dag> pattern> : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<5> rs2;
+  bits<5> rs1;
+  bits<5> rd;
+
+  let Inst{31-25} = funct7;
+  let Inst{24-20} = rs2;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = funct3;
+  let Inst{11-7} = rd;
+  let Opcode = opcode;
+}
+
+class FI<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<12> imm12;
+  bits<5> rs1;
+  bits<5> rd;
+
+  let Inst{31-20} = imm12;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = funct3;
+  let Inst{11-7} = rd;
+  let Opcode = opcode;
+}
+
+class FI32Shift<bit arithshift, bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<5> shamt;
+  bits<5> rs1;
+  bits<5> rd;
+
+  let Inst{31} = 0;
+  let Inst{30} = arithshift;
+  let Inst{29-25} = 0;
+  let Inst{24-20} = shamt;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = funct3;
+  let Inst{11-7} = rd;
+  let Opcode = opcode;
+}
+
+class FS<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<12> imm12;
+  bits<5> rs2;
+  bits<5> rs1;
+
+  let Inst{31-25} = imm12{11-5};
+  let Inst{24-20} = rs2;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = funct3;
+  let Inst{11-7} = imm12{4-0};
+  let Opcode = opcode;
+}
+
+class FSB<bits<3> funct3, bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<12> imm12;
+  bits<5> rs2;
+  bits<5> rs1;
+
+  let Inst{31} = imm12{11};
+  let Inst{30-25} = imm12{9-4};
+  let Inst{24-20} = rs2;
+  let Inst{19-15} = rs1;
+  let Inst{14-12} = funct3;
+  let Inst{11-8} = imm12{3-0};
+  let Inst{7} = imm12{10};
+  let Opcode = opcode;
+}
+
+class FU<bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<20> imm20;
+  bits<5> rd;
+
+  let Inst{31-12} = imm20;
+  let Inst{11-7} = rd;
+  let Opcode = opcode;
+}
+
+class FUJ<bits<7> opcode, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : RISCVInst<outs, ins, asmstr, pattern>
+{
+  bits<20> imm20;
+  bits<5> rd;
+
+  let Inst{31} = imm20{19};
+  let Inst{30-21} = imm20{9-0};
+  let Inst{20} = imm20{10};
+  let Inst{19-12} = imm20{18-11};
+  let Inst{11-7} = rd;
+  let Opcode = opcode;
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
new file mode 100644
index 000000000000..52530c2f136c
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -0,0 +1,55 @@
+//===-- RISCVInstrInfo.td - Target Description for RISCV ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "RISCVInstrFormats.td"
+
+def simm12 : Operand<i32>;
+
+// As noted in RISCVRegisterInfo.td, the hope is that support for
+// variable-sized register classes will mean that instruction definitions do
+// not need to be duplicated for 32-bit and 64-bit register classes. For now
+// we use 'GPR', which is 32-bit. When codegen for both RV32 and RV64 is
+// added, we will need to duplicate instruction definitions unless a proposal
+// like <http://lists.llvm.org/pipermail/llvm-dev/2016-September/105027.html>
+// is adopted.
+
+class ALU_ri<bits<3> funct3, string OpcodeStr> :
+      FI<funct3, 0b0010011, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12),
+         OpcodeStr#"\t$rd, $rs1, $imm12", []>
+{
+}
+
+def ADDI  : ALU_ri<0b000, "addi">;
+def SLTI  : ALU_ri<0b010, "slti">;
+def SLTIU : ALU_ri<0b011, "sltiu">;
+def XORI  : ALU_ri<0b100, "xori">;
+def ORI   : ALU_ri<0b110, "ori">;
+def ANDI  : ALU_ri<0b111, "andi">;
+
+class ALU_rr<bits<7> funct7, bits<3> funct3, string OpcodeStr> :
+      FR<funct7, funct3, 0b0110011, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+         OpcodeStr#"\t$rd, $rs1, $rs2", []>
+{
+}
+
+def ADD  : ALU_rr<0b0000000, 0b000, "add">;
+def SUB  : ALU_rr<0b0100000, 0b000, "sub">;
+def SLL  : ALU_rr<0b0000000, 0b001, "sll">;
+def SLT  : ALU_rr<0b0000000, 0b010, "slt">;
+def SLTU : ALU_rr<0b0000000, 0b011, "sltu">;
+def XOR  : ALU_rr<0b0000000, 0b100, "xor">;
+def SRL  : ALU_rr<0b0000000, 0b101, "srl">;
+def SRA  : ALU_rr<0b0100000, 0b101, "sra">;
+def OR   : ALU_rr<0b0000000, 0b110, "or">;
+def AND  : ALU_rr<0b0000000, 0b111, "and">;
+
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
new file mode 100644
index 000000000000..f04de217bf0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -0,0 +1,90 @@
+//===-- RISCVRegisterInfo.td - RISC-V Register defs --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the RISC-V register file
+//===----------------------------------------------------------------------===//
+
+let Namespace = "RISCV" in {
+  def sub_32 : SubRegIndex<32>;
+
+  class RISCVReg32<bits<5> Enc, string n, list<string> alt = []> : Register<n> {
+    let HWEncoding{4-0} = Enc;
+    let AltNames = alt;
+  }
+
+  // RISCV64 registers don't define an AsmName or AltName. If they specified
+  // names aliasing the RISCVReg32 registers, the generation of the default
+  // MatchRegisterName/MatchRegisterAltName would fail. When necessary,
+  // RISCVAsmParser will need to convert a register number from a RISCVReg32
+  // to the equivalent RISCVReg64.
+  class RISCVReg64<RISCVReg32 subreg> : Register<""> {
+    let HWEncoding{4-0} = subreg.HWEncoding{4-0};
+    let SubRegs = [subreg];
+    let SubRegIndices = [sub_32];
+  }
+
+  def ABIRegAltName : RegAltNameIndex;
+}
+
+// Integer registers
+let RegAltNameIndices = [ABIRegAltName] in {
+  def X0_32    : RISCVReg32<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
+  def X1_32    : RISCVReg32<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
+  def X2_32    : RISCVReg32<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
+  def X3_32    : RISCVReg32<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
+  def X4_32    : RISCVReg32<4, "x4", ["tp"]>, DwarfRegNum<[4]>;
+  def X5_32    : RISCVReg32<5, "x5", ["t0"]>, DwarfRegNum<[5]>;
+  def X6_32    : RISCVReg32<6, "x6", ["t1"]>, DwarfRegNum<[6]>;
+  def X7_32    : RISCVReg32<7, "x7", ["t2"]>, DwarfRegNum<[7]>;
+  def X8_32    : RISCVReg32<8, "x8", ["s0"]>, DwarfRegNum<[8]>;
+  def X9_32    : RISCVReg32<9, "x9", ["s1"]>, DwarfRegNum<[9]>;
+  def X10_32   : RISCVReg32<10,"x10", ["a0"]>, DwarfRegNum<[10]>;
+  def X11_32   : RISCVReg32<11,"x11", ["a1"]>, DwarfRegNum<[11]>;
+  def X12_32   : RISCVReg32<12,"x12", ["a2"]>, DwarfRegNum<[12]>;
+  def X13_32   : RISCVReg32<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
+  def X14_32   : RISCVReg32<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
+  def X15_32   : RISCVReg32<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
+  def X16_32   : RISCVReg32<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
+  def X17_32   : RISCVReg32<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
+  def X18_32   : RISCVReg32<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
+  def X19_32   : RISCVReg32<19,"x19", ["s3"]>, DwarfRegNum<[19]>;
+  def X20_32   : RISCVReg32<20,"x20", ["s4"]>, DwarfRegNum<[20]>;
+  def X21_32   : RISCVReg32<21,"x21", ["s5"]>, DwarfRegNum<[21]>;
+  def X22_32   : RISCVReg32<22,"x22", ["s6"]>, DwarfRegNum<[22]>;
+  def X23_32   : RISCVReg32<23,"x23", ["s7"]>, DwarfRegNum<[23]>;
+  def X24_32   : RISCVReg32<24,"x24", ["s8"]>, DwarfRegNum<[24]>;
+  def X25_32   : RISCVReg32<25,"x25", ["s9"]>, DwarfRegNum<[25]>;
+  def X26_32   : RISCVReg32<26,"x26", ["s10"]>, DwarfRegNum<[26]>;
+  def X27_32   : RISCVReg32<27,"x27", ["s11"]>, DwarfRegNum<[27]>;
+  def X28_32   : RISCVReg32<28,"x28", ["t3"]>, DwarfRegNum<[28]>;
+  def X29_32   : RISCVReg32<29,"x29", ["t4"]>, DwarfRegNum<[29]>;
+  def X30_32   : RISCVReg32<30,"x30", ["t5"]>, DwarfRegNum<[30]>;
+  def X31_32   : RISCVReg32<31,"x31", ["t6"]>, DwarfRegNum<[31]>;
+}
+
+foreach Index = 0-31 in {
+  def X#Index#_64 : RISCVReg64<!cast<RISCVReg32>("X"#Index#"_32")>, DwarfRegNum<[Index]>;
+}
+
+// We currently define separate register classes for the 32-bit and 64-bit
+// GPRs. Once variable-sized register classes
+// <http://lists.llvm.org/pipermail/llvm-dev/2016-September/105027.html> or
+// similar are implemented, we can just use one 'GPR' class for most
+// instruction definitions.
+
+// TODO: once codegen is implemented, registers should be listed in an order
+// reflecting the preferred register allocation sequence.
+def GPR : RegisterClass<"RISCV", [i32], 32, (add
+  (sequence "X%u_32", 0, 31)
+)>;
+
+def GPR64 : RegisterClass<"RISCV", [i64], 64, (add
+  (sequence "X%u_64", 0, 31)
+)>;
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
new file mode 100644
index 000000000000..afbbe004186e
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -0,0 +1,58 @@
+//===-- RISCVTargetMachine.cpp - Define TargetMachine for RISCV -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about RISCV target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeRISCVTarget() {
+  RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
+  RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+  if (TT.isArch64Bit()) {
+    return "e-m:e-i64:64-n32:64-S128";
+  } else {
+    assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
+    return "e-m:e-i64:64-n32-S128";
+  }
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(TT, RM), CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()) {}
+
+TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new TargetPassConfig(this, PM);
+}
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h
new file mode 100644
index 000000000000..d13e574c9bf8
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.h
@@ -0,0 +1,40 @@
+//===-- RISCVTargetMachine.h - Define TargetMachine for RISCV ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the RISCV specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETMACHINE_H
+#define LLVM_LIB_TARGET_RISCV_RISCVTARGETMACHINE_H
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class RISCVTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+
+public:
+  RISCVTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
new file mode 100644
index 000000000000..34932c259156
--- /dev/null
+++ b/contrib/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -0,0 +1,30 @@
+//===-- RISCVTargetInfo.cpp - RISCV Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+namespace llvm {
+Target &getTheRISCV32Target() {
+  static Target TheRISCV32Target;
+  return TheRISCV32Target;
+}
+
+Target &getTheRISCV64Target() {
+  static Target TheRISCV64Target;
+  return TheRISCV64Target;
+}
+}
+
+extern "C" void LLVMInitializeRISCVTargetInfo() {
+  RegisterTarget<Triple::riscv32> X(getTheRISCV32Target(), "riscv32",
+                                    "32-bit RISC-V");
+  RegisterTarget<Triple::riscv64> Y(getTheRISCV64Target(), "riscv64",
+                                    "64-bit RISC-V");
+}
diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
new file mode 100644
index 000000000000..e775aa607b53
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -0,0 +1,1300 @@
+//===-- SparcAsmParser.cpp - Parse Sparc assembly to MCInst instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
+// namespace. But SPARC backend uses "SP" as its namespace.
+namespace llvm {
+  namespace Sparc {
+    using namespace SP;
+  }
+}
+
+namespace {
+class SparcOperand;
+class SparcAsmParser : public MCTargetAsmParser {
+
+  MCAsmParser &Parser;
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "SparcGenAsmMatcher.inc"
+
+  /// }
+
+  // public interface of the MCTargetAsmParser.
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  // Custom parse functions for Sparc specific operands.
+  OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
+
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
+
+  OperandMatchResultTy
+  parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Operand,
+                       bool isCall = false);
+
+  OperandMatchResultTy parseBranchModifiers(OperandVector &Operands);
+
+  // Helper function for dealing with %lo / %hi in PIC mode.
+  const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK,
+                                         const MCExpr *subExpr);
+
+  // returns true if Tok is matched to a register and returns register in RegNo.
+  bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+                         unsigned &RegKind);
+
+  bool matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc);
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+
+  bool is64Bit() const {
+    return getSTI().getTargetTriple().getArch() == Triple::sparcv9;
+  }
+
+  bool expandSET(MCInst &Inst, SMLoc IDLoc,
+                 SmallVectorImpl<MCInst> &Instructions);
+
+public:
+  SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII,
+                const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, sti), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+  }
+
+};
+
+  static const MCPhysReg IntRegs[32] = {
+    Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
+    Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
+    Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3,
+    Sparc::O4, Sparc::O5, Sparc::O6, Sparc::O7,
+    Sparc::L0, Sparc::L1, Sparc::L2, Sparc::L3,
+    Sparc::L4, Sparc::L5, Sparc::L6, Sparc::L7,
+    Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3,
+    Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 };
+
+  static const MCPhysReg FloatRegs[32] = {
+    Sparc::F0,  Sparc::F1,  Sparc::F2,  Sparc::F3,
+    Sparc::F4,  Sparc::F5,  Sparc::F6,  Sparc::F7,
+    Sparc::F8,  Sparc::F9,  Sparc::F10, Sparc::F11,
+    Sparc::F12, Sparc::F13, Sparc::F14, Sparc::F15,
+    Sparc::F16, Sparc::F17, Sparc::F18, Sparc::F19,
+    Sparc::F20, Sparc::F21, Sparc::F22, Sparc::F23,
+    Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27,
+    Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 };
+
+  static const MCPhysReg DoubleRegs[32] = {
+    Sparc::D0,  Sparc::D1,  Sparc::D2,  Sparc::D3,
+    Sparc::D4,  Sparc::D5,  Sparc::D6,  Sparc::D7,
+    Sparc::D8,  Sparc::D9,  Sparc::D10, Sparc::D11,
+    Sparc::D12, Sparc::D13, Sparc::D14, Sparc::D15,
+    Sparc::D16, Sparc::D17, Sparc::D18, Sparc::D19,
+    Sparc::D20, Sparc::D21, Sparc::D22, Sparc::D23,
+    Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27,
+    Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 };
+
+  static const MCPhysReg QuadFPRegs[32] = {
+    Sparc::Q0,  Sparc::Q1,  Sparc::Q2,  Sparc::Q3,
+    Sparc::Q4,  Sparc::Q5,  Sparc::Q6,  Sparc::Q7,
+    Sparc::Q8,  Sparc::Q9,  Sparc::Q10, Sparc::Q11,
+    Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 };
+
+  static const MCPhysReg ASRRegs[32] = {
+    SP::Y,     SP::ASR1,  SP::ASR2,  SP::ASR3,
+    SP::ASR4,  SP::ASR5,  SP::ASR6, SP::ASR7,
+    SP::ASR8,  SP::ASR9,  SP::ASR10, SP::ASR11,
+    SP::ASR12, SP::ASR13, SP::ASR14, SP::ASR15,
+    SP::ASR16, SP::ASR17, SP::ASR18, SP::ASR19,
+    SP::ASR20, SP::ASR21, SP::ASR22, SP::ASR23,
+    SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
+    SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+
+  static const MCPhysReg IntPairRegs[] = {
+    Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7,
+    Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7,
+    Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7,
+    Sparc::I0_I1, Sparc::I2_I3, Sparc::I4_I5, Sparc::I6_I7};
+
+  static const MCPhysReg CoprocRegs[32] = {
+    Sparc::C0,  Sparc::C1,  Sparc::C2,  Sparc::C3,
+    Sparc::C4,  Sparc::C5,  Sparc::C6,  Sparc::C7,
+    Sparc::C8,  Sparc::C9,  Sparc::C10, Sparc::C11,
+    Sparc::C12, Sparc::C13, Sparc::C14, Sparc::C15,
+    Sparc::C16, Sparc::C17, Sparc::C18, Sparc::C19,
+    Sparc::C20, Sparc::C21, Sparc::C22, Sparc::C23,
+    Sparc::C24, Sparc::C25, Sparc::C26, Sparc::C27,
+    Sparc::C28, Sparc::C29, Sparc::C30, Sparc::C31 };
+
+  static const MCPhysReg CoprocPairRegs[] = {
+    Sparc::C0_C1,   Sparc::C2_C3,   Sparc::C4_C5,   Sparc::C6_C7,
+    Sparc::C8_C9,   Sparc::C10_C11, Sparc::C12_C13, Sparc::C14_C15,
+    Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
+    Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
+  
+/// SparcOperand - Instances of this class represent a parsed Sparc machine
+/// instruction.
+class SparcOperand : public MCParsedAsmOperand {
+public:
+  enum RegisterKind {
+    rk_None,
+    rk_IntReg,
+    rk_IntPairReg,
+    rk_FloatReg,
+    rk_DoubleReg,
+    rk_QuadReg,
+    rk_CoprocReg,
+    rk_CoprocPairReg,
+    rk_Special,
+  };
+
+private:
+  enum KindTy {
+    k_Token,
+    k_Register,
+    k_Immediate,
+    k_MemoryReg,
+    k_MemoryImm
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    RegisterKind Kind;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    unsigned OffsetReg;
+    const MCExpr *Off;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+public:
+  SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  bool isToken() const override { return Kind == k_Token; }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return isMEMrr() || isMEMri(); }
+  bool isMEMrr() const { return Kind == k_MemoryReg; }
+  bool isMEMri() const { return Kind == k_MemoryImm; }
+
+  bool isIntReg() const {
+    return (Kind == k_Register && Reg.Kind == rk_IntReg);
+  }
+
+  bool isFloatReg() const {
+    return (Kind == k_Register && Reg.Kind == rk_FloatReg);
+  }
+
+  bool isFloatOrDoubleReg() const {
+    return (Kind == k_Register && (Reg.Kind == rk_FloatReg
+                                   || Reg.Kind == rk_DoubleReg));
+  }
+
+  bool isCoprocReg() const {
+    return (Kind == k_Register && Reg.Kind == rk_CoprocReg);
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const override {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getMemBase() const {
+    assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  unsigned getMemOffsetReg() const {
+    assert((Kind == k_MemoryReg) && "Invalid access!");
+    return Mem.OffsetReg;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_MemoryImm) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override {
+    return StartLoc;
+  }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override {
+    return EndLoc;
+  }
+
+  void print(raw_ostream &OS) const override {
+    switch (Kind) {
+    case k_Token:     OS << "Token: " << getToken() << "\n"; break;
+    case k_Register:  OS << "Reg: #" << getReg() << "\n"; break;
+    case k_Immediate: OS << "Imm: " << getImm() << "\n"; break;
+    case k_MemoryReg: OS << "Mem: " << getMemBase() << "+"
+                         << getMemOffsetReg() << "\n"; break;
+    case k_MemoryImm: assert(getMemOff() != nullptr);
+      OS << "Mem: " << getMemBase()
+         << "+" << *getMemOff()
+         << "\n"; break;
+    }
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst, Expr);
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addMEMrrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()));
+
+    assert(getMemOffsetReg() != 0 && "Invalid offset");
+    Inst.addOperand(MCOperand::createReg(getMemOffsetReg()));
+  }
+
+  void addMEMriOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(getMemBase()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst, Expr);
+  }
+
+  static std::unique_ptr<SparcOperand> CreateToken(StringRef Str, SMLoc S) {
+    auto Op = make_unique<SparcOperand>(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind,
+                                                 SMLoc S, SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.Kind   = (SparcOperand::RegisterKind)Kind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                 SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static bool MorphToIntPairReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    assert(Op.Reg.Kind == rk_IntReg);
+    unsigned regIdx = 32;
+    if (Reg >= Sparc::G0 && Reg <= Sparc::G7)
+      regIdx = Reg - Sparc::G0;
+    else if (Reg >= Sparc::O0 && Reg <= Sparc::O7)
+      regIdx = Reg - Sparc::O0 + 8;
+    else if (Reg >= Sparc::L0 && Reg <= Sparc::L7)
+      regIdx = Reg - Sparc::L0 + 16;
+    else if (Reg >= Sparc::I0 && Reg <= Sparc::I7)
+      regIdx = Reg - Sparc::I0 + 24;
+    if (regIdx % 2 || regIdx > 31)
+      return false;
+    Op.Reg.RegNum = IntPairRegs[regIdx / 2];
+    Op.Reg.Kind = rk_IntPairReg;
+    return true;
+  }
+
+  static bool MorphToDoubleReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    assert(Op.Reg.Kind == rk_FloatReg);
+    unsigned regIdx = Reg - Sparc::F0;
+    if (regIdx % 2 || regIdx > 31)
+      return false;
+    Op.Reg.RegNum = DoubleRegs[regIdx / 2];
+    Op.Reg.Kind = rk_DoubleReg;
+    return true;
+  }
+
+  static bool MorphToQuadReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    unsigned regIdx = 0;
+    switch (Op.Reg.Kind) {
+    default: llvm_unreachable("Unexpected register kind!");
+    case rk_FloatReg:
+      regIdx = Reg - Sparc::F0;
+      if (regIdx % 4 || regIdx > 31)
+        return false;
+      Reg = QuadFPRegs[regIdx / 4];
+      break;
+    case rk_DoubleReg:
+      regIdx =  Reg - Sparc::D0;
+      if (regIdx % 2 || regIdx > 31)
+        return false;
+      Reg = QuadFPRegs[regIdx / 2];
+      break;
+    }
+    Op.Reg.RegNum = Reg;
+    Op.Reg.Kind = rk_QuadReg;
+    return true;
+  }
+
+  static bool MorphToCoprocPairReg(SparcOperand &Op) {
+    unsigned Reg = Op.getReg();
+    assert(Op.Reg.Kind == rk_CoprocReg);
+    unsigned regIdx = 32;
+    if (Reg >= Sparc::C0 && Reg <= Sparc::C31)
+      regIdx = Reg - Sparc::C0;
+    if (regIdx % 2 || regIdx > 31)
+      return false;
+    Op.Reg.RegNum = CoprocPairRegs[regIdx / 2];
+    Op.Reg.Kind = rk_CoprocPairReg;
+    return true;
+  }
+  
+  static std::unique_ptr<SparcOperand>
+  MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+    unsigned offsetReg = Op->getReg();
+    Op->Kind = k_MemoryReg;
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = offsetReg;
+    Op->Mem.Off = nullptr;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand>
+  CreateMEMr(unsigned Base, SMLoc S, SMLoc E) {
+    auto Op = make_unique<SparcOperand>(k_MemoryReg);
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = Sparc::G0;  // always 0
+    Op->Mem.Off = nullptr;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static std::unique_ptr<SparcOperand>
+  MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) {
+    const MCExpr *Imm  = Op->getImm();
+    Op->Kind = k_MemoryImm;
+    Op->Mem.Base = Base;
+    Op->Mem.OffsetReg = 0;
+    Op->Mem.Off = Imm;
+    return Op;
+  }
+};
+
+} // end namespace
+
+bool SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
+                               SmallVectorImpl<MCInst> &Instructions) {
+  MCOperand MCRegOp = Inst.getOperand(0);
+  MCOperand MCValOp = Inst.getOperand(1);
+  assert(MCRegOp.isReg());
+  assert(MCValOp.isImm() || MCValOp.isExpr());
+
+  // the imm operand can be either an expression or an immediate.
+  bool IsImm = Inst.getOperand(1).isImm();
+  int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0;
+
+  // Allow either a signed or unsigned 32-bit immediate.
+  if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) {
+    return Error(IDLoc,
+                 "set: argument must be between -2147483648 and 4294967295");
+  }
+
+  // If the value was expressed as a large unsigned number, that's ok.
+  // We want to see if it "looks like" a small signed number.
+  int32_t ImmValue = RawImmValue;
+  // For 'set' you can't use 'or' with a negative operand on V9 because
+  // that would splat the sign bit across the upper half of the destination
+  // register, whereas 'set' is defined to zero the high 32 bits.
+  bool IsEffectivelyImm13 =
+      IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096);
+  const MCExpr *ValExpr;
+  if (IsImm)
+    ValExpr = MCConstantExpr::create(ImmValue, getContext());
+  else
+    ValExpr = MCValOp.getExpr();
+
+  MCOperand PrevReg = MCOperand::createReg(Sparc::G0);
+
+  // If not just a signed imm13 value, then either we use a 'sethi' with a
+  // following 'or', or a 'sethi' by itself if there are no more 1 bits.
+  // In either case, start with the 'sethi'.
+  if (!IsEffectivelyImm13) {
+    MCInst TmpInst;
+    const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr);
+    TmpInst.setLoc(IDLoc);
+    TmpInst.setOpcode(SP::SETHIi);
+    TmpInst.addOperand(MCRegOp);
+    TmpInst.addOperand(MCOperand::createExpr(Expr));
+    Instructions.push_back(TmpInst);
+    PrevReg = MCRegOp;
+  }
+
+  // The low bits require touching in 3 cases:
+  // * A non-immediate value will always require both instructions.
+  // * An effectively imm13 value needs only an 'or' instruction.
+  // * Otherwise, an immediate that is not effectively imm13 requires the
+  //   'or' only if bits remain after clearing the 22 bits that 'sethi' set.
+  // If the low bits are known zeros, there's nothing to do.
+  // In the second case, and only in that case, must we NOT clear
+  // bits of the immediate value via the %lo() assembler function.
+  // Note also, the 'or' instruction doesn't mind a large value in the case
+  // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean.
+  if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) {
+    MCInst TmpInst;
+    const MCExpr *Expr;
+    if (IsEffectivelyImm13)
+      Expr = ValExpr;
+    else
+      Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr);
+    TmpInst.setLoc(IDLoc);
+    TmpInst.setOpcode(SP::ORri);
+    TmpInst.addOperand(MCRegOp);
+    TmpInst.addOperand(PrevReg);
+    TmpInst.addOperand(MCOperand::createExpr(Expr));
+    Instructions.push_back(TmpInst);
+  }
+  return false;
+}
+
+bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                             OperandVector &Operands,
+                                             MCStreamer &Out,
+                                             uint64_t &ErrorInfo,
+                                             bool MatchingInlineAsm) {
+  MCInst Inst;
+  SmallVector<MCInst, 8> Instructions;
+  unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                              MatchingInlineAsm);
+  switch (MatchResult) {
+  case Match_Success: {
+    switch (Inst.getOpcode()) {
+    default:
+      Inst.setLoc(IDLoc);
+      Instructions.push_back(Inst);
+      break;
+    case SP::SET:
+      if (expandSET(Inst, IDLoc, Instructions))
+        return true;
+      break;
+    }
+
+    for (const MCInst &I : Instructions) {
+      Out.EmitInstruction(I, getSTI());
+    }
+    return false;
+  }
+
+  case Match_MissingFeature:
+    return Error(IDLoc,
+                 "instruction requires a CPU feature not currently enabled");
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((SparcOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction mnemonic");
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool SparcAsmParser::
+ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
+{
+  const AsmToken &Tok = Parser.getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
+  RegNo = 0;
+  if (getLexer().getKind() != AsmToken::Percent)
+    return false;
+  Parser.Lex();
+  unsigned regKind = SparcOperand::rk_None;
+  if (matchRegisterName(Tok, RegNo, regKind)) {
+    Parser.Lex();
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
+                                 unsigned VariantID);
+
+bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                      StringRef Name, SMLoc NameLoc,
+                                      OperandVector &Operands) {
+
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(SparcOperand::CreateToken(Name, NameLoc));
+
+  // apply mnemonic aliases, if any, so that we can parse operands correctly.
+  applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (getLexer().is(AsmToken::Comma)) {
+      if (parseBranchModifiers(Operands) != MatchOperand_Success) {
+        SMLoc Loc = getLexer().getLoc();
+        return Error(Loc, "unexpected token");
+      }
+    }
+    if (parseOperand(Operands, Name) != MatchOperand_Success) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token");
+    }
+
+    while (getLexer().is(AsmToken::Comma) || getLexer().is(AsmToken::Plus)) {
+      if (getLexer().is(AsmToken::Plus)) {
+      // Plus tokens are significant in software_traps (p83, sparcv8.pdf). We must capture them.
+        Operands.push_back(SparcOperand::CreateToken("+", Parser.getTok().getLoc()));
+      }
+      Parser.Lex(); // Eat the comma or plus.
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Name) != MatchOperand_Success) {
+        SMLoc Loc = getLexer().getLoc();
+        return Error(Loc, "unexpected token");
+      }
+    }
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    return Error(Loc, "unexpected token");
+  }
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool SparcAsmParser::
+ParseDirective(AsmToken DirectiveID)
+{
+  StringRef IDVal = DirectiveID.getString();
+
+  if (IDVal == ".byte")
+    return parseDirectiveWord(1, DirectiveID.getLoc());
+
+  if (IDVal == ".half")
+    return parseDirectiveWord(2, DirectiveID.getLoc());
+
+  if (IDVal == ".word")
+    return parseDirectiveWord(4, DirectiveID.getLoc());
+
+  if (IDVal == ".nword")
+    return parseDirectiveWord(is64Bit() ? 8 : 4, DirectiveID.getLoc());
+
+  if (is64Bit() && IDVal == ".xword")
+    return parseDirectiveWord(8, DirectiveID.getLoc());
+
+  if (IDVal == ".register") {
+    // For now, ignore .register directive.
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+  if (IDVal == ".proc") {
+    // For compatibility, ignore this directive.
+    // (It's supposed to be an "optimization" in the Sun assembler)
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  // Let the MC layer to handle other directives.
+  return true;
+}
+
+bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+  Parser.Lex();
+  return false;
+}
+
+OperandMatchResultTy
+SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
+
+  SMLoc S, E;
+  unsigned BaseReg = 0;
+
+  if (ParseRegister(BaseReg, S, E)) {
+    return MatchOperand_NoMatch;
+  }
+
+  switch (getLexer().getKind()) {
+  default: return MatchOperand_NoMatch;
+
+  case AsmToken::Comma:
+  case AsmToken::RBrac:
+  case AsmToken::EndOfStatement:
+    Operands.push_back(SparcOperand::CreateMEMr(BaseReg, S, E));
+    return MatchOperand_Success;
+
+  case AsmToken:: Plus:
+    Parser.Lex(); // Eat the '+'
+    break;
+  case AsmToken::Minus:
+    break;
+  }
+
+  std::unique_ptr<SparcOperand> Offset;
+  OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
+  if (ResTy != MatchOperand_Success || !Offset)
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(
+      Offset->isImm() ? SparcOperand::MorphToMEMri(BaseReg, std::move(Offset))
+                      : SparcOperand::MorphToMEMrr(BaseReg, std::move(Offset)));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+    return ResTy;
+
+  if (getLexer().is(AsmToken::LBrac)) {
+    // Memory operand
+    Operands.push_back(SparcOperand::CreateToken("[",
+                                                 Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the [
+
+    if (Mnemonic == "cas" || Mnemonic == "casx" || Mnemonic == "casa") {
+      SMLoc S = Parser.getTok().getLoc();
+      if (getLexer().getKind() != AsmToken::Percent)
+        return MatchOperand_NoMatch;
+      Parser.Lex(); // eat %
+
+      unsigned RegNo, RegKind;
+      if (!matchRegisterName(Parser.getTok(), RegNo, RegKind))
+        return MatchOperand_NoMatch;
+
+      Parser.Lex(); // Eat the identifier token.
+      SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer()-1);
+      Operands.push_back(SparcOperand::CreateReg(RegNo, RegKind, S, E));
+      ResTy = MatchOperand_Success;
+    } else {
+      ResTy = parseMEMOperand(Operands);
+    }
+
+    if (ResTy != MatchOperand_Success)
+      return ResTy;
+
+    if (!getLexer().is(AsmToken::RBrac))
+      return MatchOperand_ParseFail;
+
+    Operands.push_back(SparcOperand::CreateToken("]",
+                                                 Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the ]
+
+    // Parse an optional address-space identifier after the address.
+    if (getLexer().is(AsmToken::Integer)) {
+      std::unique_ptr<SparcOperand> Op;
+      ResTy = parseSparcAsmOperand(Op, false);
+      if (ResTy != MatchOperand_Success || !Op)
+        return MatchOperand_ParseFail;
+      Operands.push_back(std::move(Op));
+    }
+    return MatchOperand_Success;
+  }
+
+  std::unique_ptr<SparcOperand> Op;
+
+  ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
+  if (ResTy != MatchOperand_Success || !Op)
+    return MatchOperand_ParseFail;
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(std::move(Op));
+
+  return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
+                                     bool isCall) {
+
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  const MCExpr *EVal;
+
+  Op = nullptr;
+  switch (getLexer().getKind()) {
+  default:  break;
+
+  case AsmToken::Percent:
+    Parser.Lex(); // Eat the '%'.
+    unsigned RegNo;
+    unsigned RegKind;
+    if (matchRegisterName(Parser.getTok(), RegNo, RegKind)) {
+      StringRef name = Parser.getTok().getString();
+      Parser.Lex(); // Eat the identifier token.
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      switch (RegNo) {
+      default:
+        Op = SparcOperand::CreateReg(RegNo, RegKind, S, E);
+        break;
+      case Sparc::PSR:
+        Op = SparcOperand::CreateToken("%psr", S);
+        break;
+      case Sparc::FSR:
+        Op = SparcOperand::CreateToken("%fsr", S);
+        break;
+      case Sparc::FQ:
+        Op = SparcOperand::CreateToken("%fq", S);
+        break;
+      case Sparc::CPSR:
+        Op = SparcOperand::CreateToken("%csr", S);
+        break;
+      case Sparc::CPQ:
+        Op = SparcOperand::CreateToken("%cq", S);
+        break;
+      case Sparc::WIM:
+        Op = SparcOperand::CreateToken("%wim", S);
+        break;
+      case Sparc::TBR:
+        Op = SparcOperand::CreateToken("%tbr", S);
+        break;
+      case Sparc::ICC:
+        if (name == "xcc")
+          Op = SparcOperand::CreateToken("%xcc", S);
+        else
+          Op = SparcOperand::CreateToken("%icc", S);
+        break;
+      }
+      break;
+    }
+    if (matchSparcAsmModifiers(EVal, E)) {
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Op = SparcOperand::CreateImm(EVal, S, E);
+    }
+    break;
+
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::LParen:
+  case AsmToken::Dot:
+    if (!getParser().parseExpression(EVal, E))
+      Op = SparcOperand::CreateImm(EVal, S, E);
+    break;
+
+  case AsmToken::Identifier: {
+    StringRef Identifier;
+    if (!getParser().parseIdentifier(Identifier)) {
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+
+      const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
+                                                  getContext());
+      if (isCall && getContext().getObjectFileInfo()->isPositionIndependent())
+        Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
+                                  getContext());
+      Op = SparcOperand::CreateImm(Res, S, E);
+    }
+    break;
+  }
+  }
+  return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+OperandMatchResultTy
+SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
+
+  // parse (,a|,pn|,pt)+
+
+  while (getLexer().is(AsmToken::Comma)) {
+
+    Parser.Lex(); // Eat the comma
+
+    if (!getLexer().is(AsmToken::Identifier))
+      return MatchOperand_ParseFail;
+    StringRef modName = Parser.getTok().getString();
+    if (modName == "a" || modName == "pn" || modName == "pt") {
+      Operands.push_back(SparcOperand::CreateToken(modName,
+                                                   Parser.getTok().getLoc()));
+      Parser.Lex(); // eat the identifier.
+    }
+  }
+  return MatchOperand_Success;
+}
+
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
+                                       unsigned &RegNo,
+                                       unsigned &RegKind)
+{
+  int64_t intVal = 0;
+  RegNo = 0;
+  RegKind = SparcOperand::rk_None;
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef name = Tok.getString();
+
+    // %fp
+    if (name.equals("fp")) {
+      RegNo = Sparc::I6;
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %sp
+    if (name.equals("sp")) {
+      RegNo = Sparc::O6;
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+
+    if (name.equals("y")) {
+      RegNo = Sparc::Y;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.substr(0, 3).equals_lower("asr")
+        && !name.substr(3).getAsInteger(10, intVal)
+        && intVal > 0 && intVal < 32) {
+      RegNo = ASRRegs[intVal];
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    // %fprs is an alias of %asr6.
+    if (name.equals("fprs")) {
+      RegNo = ASRRegs[6];
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("icc")) {
+      RegNo = Sparc::ICC;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("psr")) {
+      RegNo = Sparc::PSR;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("fsr")) {
+      RegNo = Sparc::FSR;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("fq")) {
+      RegNo = Sparc::FQ;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("csr")) {
+      RegNo = Sparc::CPSR;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("cq")) {
+      RegNo = Sparc::CPQ;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    
+    if (name.equals("wim")) {
+      RegNo = Sparc::WIM;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("tbr")) {
+      RegNo = Sparc::TBR;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    if (name.equals("xcc")) {
+      // FIXME:: check 64bit.
+      RegNo = Sparc::ICC;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    // %fcc0 - %fcc3
+    if (name.substr(0, 3).equals_lower("fcc")
+        && !name.substr(3).getAsInteger(10, intVal)
+        && intVal < 4) {
+      // FIXME: check 64bit and  handle %fcc1 - %fcc3
+      RegNo = Sparc::FCC0 + intVal;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+
+    // %g0 - %g7
+    if (name.substr(0, 1).equals_lower("g")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %o0 - %o7
+    if (name.substr(0, 1).equals_lower("o")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[8 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    if (name.substr(0, 1).equals_lower("l")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[16 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    if (name.substr(0, 1).equals_lower("i")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 8) {
+      RegNo = IntRegs[24 + intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+    // %f0 - %f31
+    if (name.substr(0, 1).equals_lower("f")
+        && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) {
+      RegNo = FloatRegs[intVal];
+      RegKind = SparcOperand::rk_FloatReg;
+      return true;
+    }
+    // %f32 - %f62
+    if (name.substr(0, 1).equals_lower("f")
+        && !name.substr(1, 2).getAsInteger(10, intVal)
+        && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) {
+      // FIXME: Check V9
+      RegNo = DoubleRegs[intVal/2];
+      RegKind = SparcOperand::rk_DoubleReg;
+      return true;
+    }
+
+    // %r0 - %r31
+    if (name.substr(0, 1).equals_lower("r")
+        && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) {
+      RegNo = IntRegs[intVal];
+      RegKind = SparcOperand::rk_IntReg;
+      return true;
+    }
+
+    // %c0 - %c31
+    if (name.substr(0, 1).equals_lower("c")
+        && !name.substr(1).getAsInteger(10, intVal)
+        && intVal < 32) {
+      RegNo = CoprocRegs[intVal];
+      RegKind = SparcOperand::rk_CoprocReg;
+      return true;
+    }
+    
+    if (name.equals("tpc")) {
+      RegNo = Sparc::TPC;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tnpc")) {
+      RegNo = Sparc::TNPC;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tstate")) {
+      RegNo = Sparc::TSTATE;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tt")) {
+      RegNo = Sparc::TT;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tick")) {
+      RegNo = Sparc::TICK;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tba")) {
+      RegNo = Sparc::TBA;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("pstate")) {
+      RegNo = Sparc::PSTATE;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("tl")) {
+      RegNo = Sparc::TL;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("pil")) {
+      RegNo = Sparc::PIL;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("cwp")) {
+      RegNo = Sparc::CWP;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("cansave")) {
+      RegNo = Sparc::CANSAVE;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("canrestore")) {
+      RegNo = Sparc::CANRESTORE;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("cleanwin")) {
+      RegNo = Sparc::CLEANWIN;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("otherwin")) {
+      RegNo = Sparc::OTHERWIN;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+    if (name.equals("wstate")) {
+      RegNo = Sparc::WSTATE;
+      RegKind = SparcOperand::rk_Special;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Determine if an expression contains a reference to the symbol
+// "_GLOBAL_OFFSET_TABLE_".
+static bool hasGOTReference(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    if (const SparcMCExpr *SE = dyn_cast<SparcMCExpr>(Expr))
+      return hasGOTReference(SE->getSubExpr());
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    return hasGOTReference(BE->getLHS()) || hasGOTReference(BE->getRHS());
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    return (SymRef.getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_");
+  }
+
+  case MCExpr::Unary:
+    return hasGOTReference(cast<MCUnaryExpr>(Expr)->getSubExpr());
+  }
+  return false;
+}
+
+const SparcMCExpr *
+SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
+                                    const MCExpr *subExpr)
+{
+  // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
+  // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is
+  // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
+  // as %got10 or %got22 relocation.
+
+  if (getContext().getObjectFileInfo()->isPositionIndependent()) {
+    switch(VK) {
+    default: break;
+    case SparcMCExpr::VK_Sparc_LO:
+      VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC10
+                                     : SparcMCExpr::VK_Sparc_GOT10);
+      break;
+    case SparcMCExpr::VK_Sparc_HI:
+      VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC22
+                                     : SparcMCExpr::VK_Sparc_GOT22);
+      break;
+    }
+  }
+
+  return SparcMCExpr::create(VK, subExpr, getContext());
+}
+
+bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
+                                            SMLoc &EndLoc)
+{
+  AsmToken Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return false;
+
+  StringRef name = Tok.getString();
+
+  SparcMCExpr::VariantKind VK = SparcMCExpr::parseVariantKind(name);
+
+  if (VK == SparcMCExpr::VK_Sparc_None)
+    return false;
+
+  Parser.Lex(); // Eat the identifier.
+  if (Parser.getTok().getKind() != AsmToken::LParen)
+    return false;
+
+  Parser.Lex(); // Eat the LParen token.
+  const MCExpr *subExpr;
+  if (Parser.parseParenExpression(subExpr, EndLoc))
+    return false;
+
+  EVal = adjustPICRelocation(VK, subExpr);
+  return true;
+}
+
+extern "C" void LLVMInitializeSparcAsmParser() {
+  RegisterMCAsmParser<SparcAsmParser> A(getTheSparcTarget());
+  RegisterMCAsmParser<SparcAsmParser> B(getTheSparcV9Target());
+  RegisterMCAsmParser<SparcAsmParser> C(getTheSparcelTarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "SparcGenAsmMatcher.inc"
+
+unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
+                                                    unsigned Kind) {
+  SparcOperand &Op = (SparcOperand &)GOp;
+  if (Op.isFloatOrDoubleReg()) {
+    switch (Kind) {
+    default: break;
+    case MCK_DFPRegs:
+      if (!Op.isFloatReg() || SparcOperand::MorphToDoubleReg(Op))
+        return MCTargetAsmParser::Match_Success;
+      break;
+    case MCK_QFPRegs:
+      if (SparcOperand::MorphToQuadReg(Op))
+        return MCTargetAsmParser::Match_Success;
+      break;
+    }
+  }
+  if (Op.isIntReg() && Kind == MCK_IntPair) {
+    if (SparcOperand::MorphToIntPairReg(Op))
+      return MCTargetAsmParser::Match_Success;
+  }
+  if (Op.isCoprocReg() && Kind == MCK_CoprocPair) {
+     if (SparcOperand::MorphToCoprocPairReg(Op))
+       return MCTargetAsmParser::Match_Success;
+   }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 000000000000..6f9cc314e376
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,512 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that attempts to fill delay slots with useful
+// instructions. If no instructions can be moved into the delay slot, then a
+// NOP is placed.
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-sparc-delay-filler",
+  cl::init(false),
+  cl::desc("Disable the Sparc delay slot filler."),
+  cl::Hidden);
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+    const SparcSubtarget *Subtarget;
+
+    static char ID;
+    Filler() : MachineFunctionPass(ID) {}
+
+    StringRef getPassName() const override { return "SPARC Delay Slot Filler"; }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) override {
+      bool Changed = false;
+      Subtarget = &F.getSubtarget<SparcSubtarget>();
+
+      // This pass invalidates liveness information when it reorders
+      // instructions to fill delay slot.
+      F.getRegInfo().invalidateLiveness();
+
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    void insertCallDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses);
+
+    void insertDefsUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+                    unsigned Reg);
+
+    bool delayHasHazard(MachineBasicBlock::iterator candidate,
+                        bool &sawLoad, bool &sawStore,
+                        SmallSet<unsigned, 32> &RegDefs,
+                        SmallSet<unsigned, 32> &RegUses);
+
+    MachineBasicBlock::iterator
+    findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot);
+
+    bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
+
+    bool tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI);
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler;
+}
+
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  Subtarget = &MBB.getParent()->getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineBasicBlock::iterator MI = I;
+    ++I;
+
+    // If MI is restore, try combining it with previous inst.
+    if (!DisableDelaySlotFiller &&
+        (MI->getOpcode() == SP::RESTORErr
+         || MI->getOpcode() == SP::RESTOREri)) {
+      Changed |= tryCombineRestoreWithPrevInst(MBB, MI);
+      continue;
+    }
+
+    // TODO: If we ever want to support v7, this needs to be extended
+    // to cover all floating point operations.
+    if (!Subtarget->isV9() &&
+        (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+         || MI->getOpcode() == SP::FCMPQ)) {
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+      Changed = true;
+      continue;
+    }
+
+    // If MI has no delay slot, skip.
+    if (!MI->hasDelaySlot())
+      continue;
+
+    MachineBasicBlock::iterator D = MBB.end();
+
+    if (!DisableDelaySlotFiller)
+      D = findDelayInstr(MBB, MI);
+
+    ++FilledSlots;
+    Changed = true;
+
+    if (D == MBB.end())
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+    else
+      MBB.splice(I, &MBB, D);
+
+    unsigned structSize = 0;
+    if (needsUnimp(MI, structSize)) {
+      MachineBasicBlock::iterator J = MI;
+      ++J; // skip the delay filler.
+      assert (J != MBB.end() && "MI needs a delay instruction.");
+      BuildMI(MBB, ++J, MI->getDebugLoc(),
+              TII->get(SP::UNIMP)).addImm(structSize);
+      // Bundle the delay filler and unimp with the instruction.
+      MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), J);
+    } else {
+      MIBundleBuilder(MBB, MachineBasicBlock::iterator(MI), I);
+    }
+  }
+  return Changed;
+}
+
+MachineBasicBlock::iterator
+Filler::findDelayInstr(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator slot)
+{
+  SmallSet<unsigned, 32> RegDefs;
+  SmallSet<unsigned, 32> RegUses;
+  bool sawLoad = false;
+  bool sawStore = false;
+
+  if (slot == MBB.begin())
+    return MBB.end();
+
+  if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
+    return MBB.end();
+
+  if (slot->getOpcode() == SP::RETL) {
+    MachineBasicBlock::iterator J = slot;
+    --J;
+
+    if (J->getOpcode() == SP::RESTORErr
+        || J->getOpcode() == SP::RESTOREri) {
+      // change retl to ret.
+      slot->setDesc(Subtarget->getInstrInfo()->get(SP::RET));
+      return J;
+    }
+  }
+
+  // Call's delay filler can def some of call's uses.
+  if (slot->isCall())
+    insertCallDefsUses(slot, RegDefs, RegUses);
+  else
+    insertDefsUses(slot, RegDefs, RegUses);
+
+  bool done = false;
+
+  MachineBasicBlock::iterator I = slot;
+
+  while (!done) {
+    done = (I == MBB.begin());
+
+    if (!done)
+      --I;
+
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isPosition() ||
+        I->hasDelaySlot() || I->isBundledWithSucc())
+      break;
+
+    if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
+      insertDefsUses(I, RegDefs, RegUses);
+      continue;
+    }
+
+    return I;
+  }
+  return MBB.end();
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+                            bool &sawLoad,
+                            bool &sawStore,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses)
+{
+
+  if (candidate->isImplicitDef() || candidate->isKill())
+    return true;
+
+  if (candidate->mayLoad()) {
+    sawLoad = true;
+    if (sawStore)
+      return true;
+  }
+
+  if (candidate->mayStore()) {
+    if (sawStore)
+      return true;
+    sawStore = true;
+    if (sawLoad)
+      return true;
+  }
+
+  for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) {
+    const MachineOperand &MO = candidate->getOperand(i);
+    if (!MO.isReg())
+      continue; // skip
+
+    unsigned Reg = MO.getReg();
+
+    if (MO.isDef()) {
+      // check whether Reg is defined or used before delay slot.
+      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
+        return true;
+    }
+    if (MO.isUse()) {
+      // check whether Reg is defined before delay slot.
+      if (IsRegInSet(RegDefs, Reg))
+        return true;
+    }
+  }
+
+  unsigned Opcode = candidate->getOpcode();
+  // LD and LDD may have NOPs inserted afterwards in the case of some LEON
+  // processors, so we can't use the delay slot if this feature is switched-on.
+  if (Subtarget->insertNOPLoad()
+      &&
+      Opcode >=  SP::LDDArr && Opcode <= SP::LDrr)
+    return true;
+
+  // Same as above for FDIV and FSQRT on some LEON processors.
+  if (Subtarget->fixAllFDIVSQRT()
+      &&
+      Opcode >=  SP::FDIVD && Opcode <= SP::FSQRTD)
+    return true;
+
+
+  return false;
+}
+
+
+void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
+                                SmallSet<unsigned, 32>& RegDefs,
+                                SmallSet<unsigned, 32>& RegUses)
+{
+  // Call defines o7, which is visible to the instruction in delay slot.
+  RegDefs.insert(SP::O7);
+
+  switch(MI->getOpcode()) {
+  default: llvm_unreachable("Unknown opcode.");
+  case SP::CALL: break;
+  case SP::CALLrr:
+  case SP::CALLri:
+    assert(MI->getNumOperands() >= 2);
+    const MachineOperand &Reg = MI->getOperand(0);
+    assert(Reg.isReg() && "CALL first operand is not a register.");
+    assert(Reg.isUse() && "CALL first operand is not a use.");
+    RegUses.insert(Reg.getReg());
+
+    const MachineOperand &Operand1 = MI->getOperand(1);
+    if (Operand1.isImm() || Operand1.isGlobal())
+        break;
+    assert(Operand1.isReg() && "CALLrr second operand is not a register.");
+    assert(Operand1.isUse() && "CALLrr second operand is not a use.");
+    RegUses.insert(Operand1.getReg());
+    break;
+  }
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses)
+{
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    if (MO.isUse()) {
+      // Implicit register uses of retl are return values and
+      // retl does not use them.
+      if (MO.isImplicit() && MI->getOpcode() == SP::RETL)
+        continue;
+      RegUses.insert(Reg);
+    }
+  }
+}
+
+// returns true if the Reg or its alias is in the RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
+{
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, Subtarget->getRegisterInfo(), true);
+       AI.isValid(); ++AI)
+    if (RegSet.count(*AI))
+      return true;
+  return false;
+}
+
+bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
+{
+  if (!I->isCall())
+    return false;
+
+  unsigned structSizeOpNum = 0;
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unknown call opcode.");
+  case SP::CALL: structSizeOpNum = 1; break;
+  case SP::CALLrr:
+  case SP::CALLri: structSizeOpNum = 2; break;
+  case SP::TLS_CALL: return false;
+  }
+
+  const MachineOperand &MO = I->getOperand(structSizeOpNum);
+  if (!MO.isImm())
+    return false;
+  StructSize = MO.getImm();
+  return true;
+}
+
+static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
+                              MachineBasicBlock::iterator AddMI,
+                              const TargetInstrInfo *TII)
+{
+  // Before:  add  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = AddMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change ADD to RESTORE.
+  AddMI->setDesc(TII->get((AddMI->getOpcode() == SP::ADDrr)
+                          ? SP::RESTORErr
+                          : SP::RESTOREri));
+
+  // Map the destination register.
+  AddMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
+                             MachineBasicBlock::iterator OrMI,
+                             const TargetInstrInfo *TII)
+{
+  // Before:  or  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //    and <op0> or <op1> is zero,
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = OrMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // check whether it is a copy.
+  if (OrMI->getOpcode() == SP::ORrr
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && OrMI->getOperand(2).getReg() != SP::G0)
+    return false;
+
+  if (OrMI->getOpcode() == SP::ORri
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && (!OrMI->getOperand(2).isImm() || OrMI->getOperand(2).getImm() != 0))
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change OR to RESTORE.
+  OrMI->setDesc(TII->get((OrMI->getOpcode() == SP::ORrr)
+                         ? SP::RESTORErr
+                         : SP::RESTOREri));
+
+  // Map the destination register.
+  OrMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
+                                 MachineBasicBlock::iterator SetHiMI,
+                                 const TargetInstrInfo *TII)
+{
+  // Before:  sethi imm3, %i[0-7]
+  //          restore %g0, %g0, %g0
+  //
+  // After :  restore %g0, (imm3<<10), %o[0-7]
+
+  unsigned reg = SetHiMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  if (!SetHiMI->getOperand(1).isImm())
+    return false;
+
+  int64_t imm = SetHiMI->getOperand(1).getImm();
+
+  // Is it a 3 bit immediate?
+  if (!isInt<3>(imm))
+    return false;
+
+  // Make it a 13 bit immediate.
+  imm = (imm << 10) & 0x1FFF;
+
+  assert(RestoreMI->getOpcode() == SP::RESTORErr);
+
+  RestoreMI->setDesc(TII->get(SP::RESTOREri));
+
+  RestoreMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+  RestoreMI->getOperand(1).setReg(SP::G0);
+  RestoreMI->getOperand(2).ChangeToImmediate(imm);
+
+
+  // Erase the original SETHI.
+  SetHiMI->eraseFromParent();
+
+  return true;
+}
+
+bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI)
+{
+  // No previous instruction.
+  if (MBBI == MBB.begin())
+    return false;
+
+  // assert that MBBI is a "restore %g0, %g0, %g0".
+  assert(MBBI->getOpcode() == SP::RESTORErr
+         && MBBI->getOperand(0).getReg() == SP::G0
+         && MBBI->getOperand(1).getReg() == SP::G0
+         && MBBI->getOperand(2).getReg() == SP::G0);
+
+  MachineBasicBlock::iterator PrevInst = std::prev(MBBI);
+
+  // It cannot be combined with a bundled instruction.
+  if (PrevInst->isBundledWithSucc())
+    return false;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  switch (PrevInst->getOpcode()) {
+  default: break;
+  case SP::ADDrr:
+  case SP::ADDri: return combineRestoreADD(MBBI, PrevInst, TII); break;
+  case SP::ORrr:
+  case SP::ORri:  return combineRestoreOR(MBBI, PrevInst, TII); break;
+  case SP::SETHIi: return combineRestoreSETHIi(MBBI, PrevInst, TII); break;
+  }
+  // It cannot combine with the previous instruction.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
new file mode 100644
index 000000000000..da7e0b737e78
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -0,0 +1,670 @@
+//===- SparcDisassembler.cpp - Disassembler for Sparc -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Sparc Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// A disassembler class for Sparc.
+class SparcDisassembler : public MCDisassembler {
+public:
+  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+  virtual ~SparcDisassembler() {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+namespace llvm {
+Target &getTheSparcTarget();
+Target &getTheSparcV9Target();
+Target &getTheSparcelTarget();
+}
+
+static MCDisassembler *createSparcDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new SparcDisassembler(STI, Ctx);
+}
+
+
+extern "C" void LLVMInitializeSparcDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheSparcTarget(),
+                                         createSparcDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheSparcV9Target(),
+                                         createSparcDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheSparcelTarget(),
+                                         createSparcDisassembler);
+}
+
+static const unsigned IntRegDecoderTable[] = {
+  SP::G0,  SP::G1,  SP::G2,  SP::G3,
+  SP::G4,  SP::G5,  SP::G6,  SP::G7,
+  SP::O0,  SP::O1,  SP::O2,  SP::O3,
+  SP::O4,  SP::O5,  SP::O6,  SP::O7,
+  SP::L0,  SP::L1,  SP::L2,  SP::L3,
+  SP::L4,  SP::L5,  SP::L6,  SP::L7,
+  SP::I0,  SP::I1,  SP::I2,  SP::I3,
+  SP::I4,  SP::I5,  SP::I6,  SP::I7 };
+
+static const unsigned FPRegDecoderTable[] = {
+  SP::F0,   SP::F1,   SP::F2,   SP::F3,
+  SP::F4,   SP::F5,   SP::F6,   SP::F7,
+  SP::F8,   SP::F9,   SP::F10,  SP::F11,
+  SP::F12,  SP::F13,  SP::F14,  SP::F15,
+  SP::F16,  SP::F17,  SP::F18,  SP::F19,
+  SP::F20,  SP::F21,  SP::F22,  SP::F23,
+  SP::F24,  SP::F25,  SP::F26,  SP::F27,
+  SP::F28,  SP::F29,  SP::F30,  SP::F31 };
+
+static const unsigned DFPRegDecoderTable[] = {
+  SP::D0,   SP::D16,  SP::D1,   SP::D17,
+  SP::D2,   SP::D18,  SP::D3,   SP::D19,
+  SP::D4,   SP::D20,  SP::D5,   SP::D21,
+  SP::D6,   SP::D22,  SP::D7,   SP::D23,
+  SP::D8,   SP::D24,  SP::D9,   SP::D25,
+  SP::D10,  SP::D26,  SP::D11,  SP::D27,
+  SP::D12,  SP::D28,  SP::D13,  SP::D29,
+  SP::D14,  SP::D30,  SP::D15,  SP::D31 };
+
+static const unsigned QFPRegDecoderTable[] = {
+  SP::Q0,  SP::Q8,   ~0U,  ~0U,
+  SP::Q1,  SP::Q9,   ~0U,  ~0U,
+  SP::Q2,  SP::Q10,  ~0U,  ~0U,
+  SP::Q3,  SP::Q11,  ~0U,  ~0U,
+  SP::Q4,  SP::Q12,  ~0U,  ~0U,
+  SP::Q5,  SP::Q13,  ~0U,  ~0U,
+  SP::Q6,  SP::Q14,  ~0U,  ~0U,
+  SP::Q7,  SP::Q15,  ~0U,  ~0U } ;
+
+static const unsigned FCCRegDecoderTable[] = {
+  SP::FCC0, SP::FCC1, SP::FCC2, SP::FCC3 };
+
+static const unsigned ASRRegDecoderTable[] = {
+  SP::Y,     SP::ASR1,  SP::ASR2,  SP::ASR3,
+  SP::ASR4,  SP::ASR5,  SP::ASR6,  SP::ASR7,
+  SP::ASR8,  SP::ASR9,  SP::ASR10, SP::ASR11,
+  SP::ASR12, SP::ASR13, SP::ASR14, SP::ASR15,
+  SP::ASR16, SP::ASR17, SP::ASR18, SP::ASR19,
+  SP::ASR20, SP::ASR21, SP::ASR22, SP::ASR23,
+  SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27,
+  SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31};
+
+static const unsigned PRRegDecoderTable[] = {
+  SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE,
+  SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN,
+  SP::OTHERWIN, SP::WSTATE
+};
+
+static const uint16_t IntPairDecoderTable[] = {
+  SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7,
+  SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7,
+  SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7,
+  SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7,
+};
+
+static const unsigned CPRegDecoderTable[] = {
+  SP::C0,  SP::C1,  SP::C2,  SP::C3,
+  SP::C4,  SP::C5,  SP::C6,  SP::C7,
+  SP::C8,  SP::C9,  SP::C10, SP::C11,
+  SP::C12, SP::C13, SP::C14, SP::C15,
+  SP::C16, SP::C17, SP::C18, SP::C19,
+  SP::C20, SP::C21, SP::C22, SP::C23,
+  SP::C24, SP::C25, SP::C26, SP::C27,
+  SP::C28, SP::C29, SP::C30, SP::C31
+};
+
+
+static const uint16_t CPPairDecoderTable[] = {
+  SP::C0_C1,   SP::C2_C3,   SP::C4_C5,   SP::C6_C7,
+  SP::C8_C9,   SP::C10_C11, SP::C12_C13, SP::C14_C15,
+  SP::C16_C17, SP::C18_C19, SP::C20_C21, SP::C22_C23,
+  SP::C24_C25, SP::C26_C27, SP::C28_C29, SP::C30_C31
+};
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = IntRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeI64RegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = IntRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeFPRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = FPRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeDFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = DFPRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeQFPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = QFPRegDecoderTable[RegNo];
+  if (Reg == ~0U)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = CPRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(FCCRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(ASRRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo >= array_lengthof(PRRegDecoderTable))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  if ((RegNo & 1))
+    S = MCDisassembler::SoftFail;
+
+  unsigned RegisterPair = IntPairDecoderTable[RegNo/2];
+  Inst.addOperand(MCOperand::createReg(RegisterPair));
+  return S;
+}
+
+static DecodeStatus DecodeCPPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned RegisterPair = CPPairDecoderTable[RegNo/2];
+  Inst.addOperand(MCOperand::createReg(RegisterPair));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder);
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder);
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn,
+                                  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCall(MCInst &Inst, unsigned insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSIMM13(MCInst &Inst, unsigned insn,
+                                 uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeJMPL(MCInst &Inst, unsigned insn, uint64_t Address,
+                               const void *Decoder);
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const void *Decoder);
+static DecodeStatus DecodeSWAP(MCInst &Inst, unsigned insn, uint64_t Address,
+                               const void *Decoder);
+static DecodeStatus DecodeTRAP(MCInst &Inst, unsigned insn, uint64_t Address,
+                               const void *Decoder);
+
+#include "SparcGenDisassemblerTables.inc"
+
+/// Read four bytes from the ArrayRef and return 32 bit word.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsLittleEndian) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  Insn = IsLittleEndian
+             ? (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
+                   (Bytes[3] << 24)
+             : (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) |
+                   (Bytes[0] << 24);
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                               ArrayRef<uint8_t> Bytes,
+                                               uint64_t Address,
+                                               raw_ostream &VStream,
+                                               raw_ostream &CStream) const {
+  uint32_t Insn;
+  bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+  DecodeStatus Result =
+      readInstruction32(Bytes, Address, Size, Insn, isLittleEndian);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function.
+  
+  if (STI.getFeatureBits()[Sparc::FeatureV9])
+  {
+    Result = decodeInstruction(DecoderTableSparcV932, Instr, Insn, Address, this, STI);
+  }
+  else
+  {
+    Result = decodeInstruction(DecoderTableSparcV832, Instr, Insn, Address, this, STI);      
+  }
+  if (Result != MCDisassembler::Fail)
+    return Result;
+  
+  Result =
+      decodeInstruction(DecoderTableSparc32, Instr, Insn, Address, this, STI);
+
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned insn, uint64_t Address,
+                                   const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &MI, unsigned insn, uint64_t Address,
+                              const void *Decoder,
+                              bool isLoad, DecodeFunc DecodeRD) {
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  bool isImm = fieldFromInstruction(insn, 13, 1);
+  bool hasAsi = fieldFromInstruction(insn, 23, 1); // (in op3 field)
+  unsigned asi = fieldFromInstruction(insn, 5, 8);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  DecodeStatus status;
+  if (isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  // Decode rs1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode imm|rs2.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  if (hasAsi)
+    MI.addOperand(MCOperand::createImm(asi));
+
+  if (!isLoad) {
+    status = DecodeRD(MI, rd, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeIntPairRegisterClass);
+}
+
+static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeQFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadCP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeCPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeLoadCPPair(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, true,
+                   DecodeCPPairRegisterClass);
+}
+
+static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeIntRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeIntPairRegisterClass);
+}
+
+static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address,
+                                  const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeDFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreQFP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeQFPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreCP(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeCPRegsRegisterClass);
+}
+
+static DecodeStatus DecodeStoreCPPair(MCInst &Inst, unsigned insn,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeMem(Inst, insn, Address, Decoder, false,
+                   DecodeCPPairRegisterClass);
+}
+
+static bool tryAddingSymbolicOperand(int64_t Value,  bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
+}
+
+static DecodeStatus DecodeCall(MCInst &MI, unsigned insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned tgt = fieldFromInstruction(insn, 0, 30);
+  tgt <<= 2;
+  if (!tryAddingSymbolicOperand(tgt+Address, false, Address,
+                                0, 30, MI, Decoder))
+    MI.addOperand(MCOperand::createImm(tgt));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM13(MCInst &MI, unsigned insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned tgt = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  MI.addOperand(MCOperand::createImm(tgt));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJMPL(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeReturn(MCInst &MI, unsigned insn, uint64_t Address,
+                                 const void *Decoder) {
+
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RS1.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS2 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSWAP(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rd = fieldFromInstruction(insn, 25, 5);
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  bool hasAsi = fieldFromInstruction(insn, 23, 1); // (in op3 field)
+  unsigned asi = fieldFromInstruction(insn, 5, 8);
+  unsigned rs2 = 0;
+  unsigned simm13 = 0;
+  if (isImm)
+    simm13 = SignExtend32<13>(fieldFromInstruction(insn, 0, 13));
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RD.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rd, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1.
+  status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1 | SIMM13.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(simm13));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+
+  if (hasAsi)
+    MI.addOperand(MCOperand::createImm(asi));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTRAP(MCInst &MI, unsigned insn, uint64_t Address,
+                               const void *Decoder) {
+
+  unsigned rs1 = fieldFromInstruction(insn, 14, 5);
+  unsigned isImm = fieldFromInstruction(insn, 13, 1);
+  unsigned cc =fieldFromInstruction(insn, 25, 4);
+  unsigned rs2 = 0;
+  unsigned imm7 = 0;
+  if (isImm)
+    imm7 = fieldFromInstruction(insn, 0, 7);
+  else
+    rs2 = fieldFromInstruction(insn, 0, 5);
+
+  // Decode RS1.
+  DecodeStatus status = DecodeIntRegsRegisterClass(MI, rs1, Address, Decoder);
+  if (status != MCDisassembler::Success)
+    return status;
+
+  // Decode RS1 | IMM7.
+  if (isImm)
+    MI.addOperand(MCOperand::createImm(imm7));
+  else {
+    status = DecodeIntRegsRegisterClass(MI, rs2, Address, Decoder);
+    if (status != MCDisassembler::Success)
+      return status;
+  }
+  
+  // Decode CC
+  MI.addOperand(MCOperand::createImm(cc));
+
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
new file mode 100644
index 000000000000..4981deae6af6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -0,0 +1,197 @@
+//===-- SparcInstPrinter.cpp - Convert Sparc MCInst to assembly syntax -----==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstPrinter.h"
+#include "Sparc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target
+// namespace. But SPARC backend uses "SP" as its namespace.
+namespace llvm {
+namespace Sparc {
+  using namespace SP;
+}
+}
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "SparcGenAsmWriter.inc"
+
+bool SparcInstPrinter::isV9(const MCSubtargetInfo &STI) const {
+  return (STI.getFeatureBits()[Sparc::FeatureV9]) != 0;
+}
+
+void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const
+{
+  OS << '%' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void SparcInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  if (!printAliasInstr(MI, STI, O) && !printSparcAliasInstr(MI, STI, O))
+    printInstruction(MI, STI, O);
+  printAnnotation(O, Annot);
+}
+
+bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI,
+                                            const MCSubtargetInfo &STI,
+                                            raw_ostream &O) {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case SP::JMPLrr:
+  case SP::JMPLri: {
+    if (MI->getNumOperands() != 3)
+      return false;
+    if (!MI->getOperand(0).isReg())
+      return false;
+    switch (MI->getOperand(0).getReg()) {
+    default: return false;
+    case SP::G0: // jmp $addr | ret | retl
+      if (MI->getOperand(2).isImm() &&
+          MI->getOperand(2).getImm() == 8) {
+        switch(MI->getOperand(1).getReg()) {
+        default: break;
+        case SP::I7: O << "\tret"; return true;
+        case SP::O7: O << "\tretl"; return true;
+        }
+      }
+      O << "\tjmp "; printMemOperand(MI, 1, STI, O);
+      return true;
+    case SP::O7: // call $addr
+      O << "\tcall "; printMemOperand(MI, 1, STI, O);
+      return true;
+    }
+  }
+  case SP::V9FCMPS:  case SP::V9FCMPD:  case SP::V9FCMPQ:
+  case SP::V9FCMPES: case SP::V9FCMPED: case SP::V9FCMPEQ: {
+    if (isV9(STI)
+        || (MI->getNumOperands() != 3)
+        || (!MI->getOperand(0).isReg())
+        || (MI->getOperand(0).getReg() != SP::FCC0))
+      return false;
+    // if V8, skip printing %fcc0.
+    switch(MI->getOpcode()) {
+    default:
+    case SP::V9FCMPS:  O << "\tfcmps "; break;
+    case SP::V9FCMPD:  O << "\tfcmpd "; break;
+    case SP::V9FCMPQ:  O << "\tfcmpq "; break;
+    case SP::V9FCMPES: O << "\tfcmpes "; break;
+    case SP::V9FCMPED: O << "\tfcmped "; break;
+    case SP::V9FCMPEQ: O << "\tfcmpeq "; break;
+    }
+    printOperand(MI, 1, STI, O);
+    O << ", ";
+    printOperand(MI, 2, STI, O);
+    return true;
+  }
+  }
+}
+
+void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand (opNum);
+
+  if (MO.isReg()) {
+    printRegName(O, MO.getReg());
+    return ;
+  }
+
+  if (MO.isImm()) {
+    switch (MI->getOpcode()) {
+      default:
+        O << (int)MO.getImm(); 
+        return;
+        
+      case SP::TICCri: // Fall through
+      case SP::TICCrr: // Fall through
+      case SP::TRAPri: // Fall through
+      case SP::TRAPrr: // Fall through
+      case SP::TXCCri: // Fall through
+      case SP::TXCCrr: // Fall through
+        // Only seven-bit values up to 127.
+        O << ((int) MO.getImm() & 0x7f);  
+        return;
+    }
+  }
+
+  assert(MO.isExpr() && "Unknown operand kind in printOperand");
+  MO.getExpr()->print(O, &MAI);
+}
+
+void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
+                                       const MCSubtargetInfo &STI,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, STI, O);
+
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1, STI, O);
+    return;
+  }
+  const MCOperand &MO = MI->getOperand(opNum+1);
+
+  if (MO.isReg() && MO.getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MO.isImm() && MO.getImm() == 0)
+    return;   // don't print "+0"
+
+  O << "+";
+
+  printOperand(MI, opNum+1, STI, O);
+}
+
+void SparcInstPrinter::printCCOperand(const MCInst *MI, int opNum,
+                                      const MCSubtargetInfo &STI,
+                                      raw_ostream &O) {
+  int CC = (int)MI->getOperand(opNum).getImm();
+  switch (MI->getOpcode()) {
+  default: break;
+  case SP::FBCOND:
+  case SP::FBCONDA:
+  case SP::BPFCC:
+  case SP::BPFCCA:
+  case SP::BPFCCNT:
+  case SP::BPFCCANT:
+  case SP::MOVFCCrr:  case SP::V9MOVFCCrr:
+  case SP::MOVFCCri:  case SP::V9MOVFCCri:
+  case SP::FMOVS_FCC: case SP::V9FMOVS_FCC:
+  case SP::FMOVD_FCC: case SP::V9FMOVD_FCC:
+  case SP::FMOVQ_FCC: case SP::V9FMOVQ_FCC:
+    // Make sure CC is a fp conditional flag.
+    CC = (CC < 16) ? (CC + 16) : CC;
+    break;
+  case SP::CBCOND:
+  case SP::CBCONDA:
+    // Make sure CC is a cp conditional flag.
+    CC = (CC < 32) ? (CC + 32) : CC;
+    break;
+  }
+  O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+bool SparcInstPrinter::printGetPCX(const MCInst *MI, unsigned opNum,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  llvm_unreachable("FIXME: Implement SparcInstPrinter::printGetPCX.");
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
new file mode 100644
index 000000000000..6f06d1ddae32
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -0,0 +1,55 @@
+//===-- SparcInstPrinter.h - Convert Sparc MCInst to assembly syntax ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Sparc MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class SparcInstPrinter : public MCInstPrinter {
+public:
+  SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+  bool printSparcAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                            raw_ostream &OS);
+  bool isV9(const MCSubtargetInfo &STI) const;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
+                       raw_ostream &O);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx,
+                               const MCSubtargetInfo &STI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+                    raw_ostream &OS);
+  void printMemOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+                       raw_ostream &OS, const char *Modifier = nullptr);
+  void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+                      raw_ostream &OS);
+  bool printGetPCX(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+                   raw_ostream &OS);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/LeonFeatures.td b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
new file mode 100755
index 000000000000..d06e734b5a7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonFeatures.td
@@ -0,0 +1,82 @@
+//===-- LeonFeatures.td - Describe the Leon Features -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// UMAC and SMAC support for LEON3 and LEON4 processors.
+//===----------------------------------------------------------------------===//
+
+//support to casa instruction; for leon3 subtarget only
+def UMACSMACSupport : SubtargetFeature<
+  "hasumacsmac", 
+  "HasUmacSmac", 
+  "true", 
+  "Enable UMAC and SMAC for LEON3 and LEON4 processors"
+>;
+
+
+//===----------------------------------------------------------------------===//
+// CASA Support differs between LEON3-FT GR712RC and LEON3-FT UT699
+// We need to have the option to switch this on and off.
+//===----------------------------------------------------------------------===//
+
+//support to casa instruction; for leon3 subtarget only
+def LeonCASA : SubtargetFeature<
+  "hasleoncasa", 
+  "HasLeonCasa", 
+  "true", 
+  "Enable CASA instruction for LEON3 and LEON4 processors"
+>;
+
+
+def ReplaceSDIV : SubtargetFeature<
+  "replacesdiv",
+  "PerformSDIVReplace",
+  "true",
+  "AT697E erratum fix: Do not emit SDIV, emit SDIVCC instead"
+>;
+          
+def InsertNOPLoad: SubtargetFeature<
+  "insertnopload",
+  "InsertNOPLoad",
+  "true",
+  "LEON3 erratum fix: Insert a NOP instruction after every single-cycle load instruction when the next instruction is another load/store instruction" 
+>;
+
+def FixFSMULD : SubtargetFeature<
+  "fixfsmuld",
+  "FixFSMULD",
+  "true",
+  "LEON erratum fix: Do not use FSMULD" 
+>;
+
+def ReplaceFMULS : SubtargetFeature<
+  "replacefmuls",
+  "ReplaceFMULS",
+  "true",
+  "LEON erratum fix: Replace FMULS instruction with FMULD and relevant conversion instructions" 
+>;
+
+def DetectRoundChange : SubtargetFeature<
+  "detectroundchange",
+  "DetectRoundChange",
+  "true",
+  "LEON3 erratum detection: Detects any rounding mode change "
+  "request: use only the round-to-nearest rounding mode"
+>;
+
+def FixAllFDIVSQRT : SubtargetFeature<
+  "fixallfdivsqrt",
+  "FixAllFDIVSQRT",
+  "true",
+  "LEON erratum fix: Fix FDIVS/FDIVD/FSQRTS/FSQRTD instructions with NOPs and floating-point store" 
+>;
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
new file mode 100755
index 000000000000..0acc2875daa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
@@ -0,0 +1,374 @@
+//===------ LeonPasses.cpp - Define passes specific to LEON ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "LeonPasses.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID)
+    : MachineFunctionPass(ID) {}
+
+LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID)
+    : MachineFunctionPass(ID) {}
+
+int LEONMachineFunctionPass::GetRegIndexForOperand(MachineInstr &MI,
+                                                   int OperandIndex) {
+  if (MI.getNumOperands() > 0) {
+    if (OperandIndex == LAST_OPERAND) {
+      OperandIndex = MI.getNumOperands() - 1;
+    }
+
+    if (MI.getNumOperands() > (unsigned)OperandIndex &&
+        MI.getOperand(OperandIndex).isReg()) {
+      return (int)MI.getOperand(OperandIndex).getReg();
+    }
+  }
+
+  static int NotFoundIndex = -10;
+  // Return a different number each time to avoid any comparisons between the
+  // values returned.
+  NotFoundIndex -= 10;
+  return NotFoundIndex;
+}
+
+// finds a new free FP register
+// checks also the AllocatedRegisters vector
+int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) {
+  for (int RegisterIndex = SP::F0; RegisterIndex <= SP::F31; ++RegisterIndex) {
+    if (!MRI.isPhysRegUsed(RegisterIndex) &&
+        !is_contained(UsedRegisters, RegisterIndex)) {
+      return RegisterIndex;
+    }
+  }
+
+  return -1;
+}
+
+//*****************************************************************************
+//**** InsertNOPLoad pass
+//*****************************************************************************
+// This pass fixes the incorrectly working Load instructions that exists for
+// some earlier versions of the LEON processor line. NOP instructions must
+// be inserted after the load instruction to ensure that the Load instruction
+// behaves as expected for these processors.
+//
+// This pass inserts a NOP after any LD or LDF instruction.
+//
+char InsertNOPLoad::ID = 0;
+
+InsertNOPLoad::InsertNOPLoad(TargetMachine &tm)
+    : LEONMachineFunctionPass(tm, ID) {}
+
+bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = DebugLoc();
+
+  bool Modified = false;
+  for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+      unsigned Opcode = MI.getOpcode();
+      if (Opcode >= SP::LDDArr && Opcode <= SP::LDrr) {
+        MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+        BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+        Modified = true;
+      }
+    }
+  }
+
+  return Modified;
+}
+
+//*****************************************************************************
+//**** FixFSMULD pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FSMULD instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// The pass should convert the FSMULD operands to double precision in scratch
+// registers, then calculate the result with the FMULD instruction. Therefore,
+// the pass should replace operations of the form:
+// fsmuld %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+char FixFSMULD::ID = 0;
+
+FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+
+bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = DebugLoc();
+
+  bool Modified = false;
+  for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+
+      MachineInstr &MI = *MBBI;
+      unsigned Opcode = MI.getOpcode();
+
+      const int UNASSIGNED_INDEX = -1;
+      int Reg1Index = UNASSIGNED_INDEX;
+      int Reg2Index = UNASSIGNED_INDEX;
+      int Reg3Index = UNASSIGNED_INDEX;
+
+      if (Opcode == SP::FSMULD && MI.getNumOperands() == 3) {
+        // take the registers from fsmuld %f20,%f21,%f8
+        Reg1Index = MI.getOperand(0).getReg();
+        Reg2Index = MI.getOperand(1).getReg();
+        Reg3Index = MI.getOperand(2).getReg();
+      }
+
+      if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+          Reg3Index != UNASSIGNED_INDEX) {
+        clearUsedRegisterList();
+        MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+        // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+        markRegisterUsed(Reg3Index);
+        const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+        markRegisterUsed(ScratchReg1Index);
+        const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+        markRegisterUsed(ScratchReg2Index);
+
+        if (ScratchReg1Index == UNASSIGNED_INDEX ||
+            ScratchReg2Index == UNASSIGNED_INDEX) {
+          errs() << "Cannot allocate free scratch registers for the FixFSMULD "
+                    "pass."
+                 << "\n";
+        } else {
+          // create fstod %f20,%f0
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+              .addReg(ScratchReg1Index)
+              .addReg(Reg1Index);
+
+          // create fstod %f21,%f2
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+              .addReg(ScratchReg2Index)
+              .addReg(Reg2Index);
+
+          // create fmuld %f0,%f2,%f8
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+              .addReg(Reg3Index)
+              .addReg(ScratchReg1Index)
+              .addReg(ScratchReg2Index);
+
+          MI.eraseFromParent();
+          MBBI = NMBBI;
+
+          Modified = true;
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+//*****************************************************************************
+//**** ReplaceFMULS pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FMULS instruction that exists for
+// some earlier versions of the LEON processor line.
+//
+// This pass converts the FMULS operands to double precision in scratch
+// registers, then calculates the result with the FMULD instruction.
+// The pass should replace operations of the form:
+// fmuls %f20,%f21,%f8
+// with the sequence:
+// fstod %f20,%f0
+// fstod %f21,%f2
+// fmuld %f0,%f2,%f8
+//
+char ReplaceFMULS::ID = 0;
+
+ReplaceFMULS::ReplaceFMULS(TargetMachine &tm)
+    : LEONMachineFunctionPass(tm, ID) {}
+
+bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = DebugLoc();
+
+  bool Modified = false;
+  for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+      unsigned Opcode = MI.getOpcode();
+
+      const int UNASSIGNED_INDEX = -1;
+      int Reg1Index = UNASSIGNED_INDEX;
+      int Reg2Index = UNASSIGNED_INDEX;
+      int Reg3Index = UNASSIGNED_INDEX;
+
+      if (Opcode == SP::FMULS && MI.getNumOperands() == 3) {
+        // take the registers from fmuls %f20,%f21,%f8
+        Reg1Index = MI.getOperand(0).getReg();
+        Reg2Index = MI.getOperand(1).getReg();
+        Reg3Index = MI.getOperand(2).getReg();
+      }
+
+      if (Reg1Index != UNASSIGNED_INDEX && Reg2Index != UNASSIGNED_INDEX &&
+          Reg3Index != UNASSIGNED_INDEX) {
+        clearUsedRegisterList();
+        MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+        // Whatever Reg3Index is hasn't been used yet, so we need to reserve it.
+        markRegisterUsed(Reg3Index);
+        const int ScratchReg1Index = getUnusedFPRegister(MF.getRegInfo());
+        markRegisterUsed(ScratchReg1Index);
+        const int ScratchReg2Index = getUnusedFPRegister(MF.getRegInfo());
+        markRegisterUsed(ScratchReg2Index);
+
+        if (ScratchReg1Index == UNASSIGNED_INDEX ||
+            ScratchReg2Index == UNASSIGNED_INDEX) {
+          errs() << "Cannot allocate free scratch registers for the "
+                    "ReplaceFMULS pass."
+                 << "\n";
+        } else {
+          // create fstod %f20,%f0
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+              .addReg(ScratchReg1Index)
+              .addReg(Reg1Index);
+
+          // create fstod %f21,%f2
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FSTOD))
+              .addReg(ScratchReg2Index)
+              .addReg(Reg2Index);
+
+          // create fmuld %f0,%f2,%f8
+          BuildMI(MBB, MBBI, DL, TII.get(SP::FMULD))
+              .addReg(Reg3Index)
+              .addReg(ScratchReg1Index)
+              .addReg(ScratchReg2Index);
+
+          MI.eraseFromParent();
+          MBBI = NMBBI;
+
+          Modified = true;
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+
+//*****************************************************************************
+//**** DetectRoundChange pass
+//*****************************************************************************
+// To prevent any explicit change of the default rounding mode, this pass
+// detects any call of the fesetround function.
+// A warning is generated to ensure the user knows this has happened.
+//
+// Detects an erratum in UT699 LEON 3 processor
+
+char DetectRoundChange::ID = 0;
+
+DetectRoundChange::DetectRoundChange(TargetMachine &tm)
+    : LEONMachineFunctionPass(tm, ID) {}
+
+bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<SparcSubtarget>();
+
+  bool Modified = false;
+  for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+      unsigned Opcode = MI.getOpcode();
+      if (Opcode == SP::CALL && MI.getNumOperands() > 0) {
+        MachineOperand &MO = MI.getOperand(0);
+
+        if (MO.isGlobal()) {
+          StringRef FuncName = MO.getGlobal()->getName();
+          if (FuncName.compare_lower("fesetround") == 0) {
+            errs() << "Error: You are using the detectroundchange "
+                      "option to detect rounding changes that will "
+                      "cause LEON errata. The only way to fix this "
+                      "is to remove the call to fesetround from "
+                      "the source code.\n";
+          }
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+//*****************************************************************************
+//**** FixAllFDIVSQRT pass
+//*****************************************************************************
+// This pass fixes the incorrectly working FDIVx and FSQRTx instructions that
+// exist for some earlier versions of the LEON processor line. Five NOP
+// instructions need to be inserted after these instructions to ensure the
+// correct result is placed in the destination registers before they are used.
+//
+// This pass implements two fixes:
+//  1) fixing the FSQRTS and FSQRTD instructions.
+//  2) fixing the FDIVS and FDIVD instructions.
+//
+// FSQRTS and FDIVS are converted to FDIVD and FSQRTD respectively earlier in
+// the pipeline when this option is enabled, so this pass needs only to deal
+// with the changes that still need implementing for the "double" versions
+// of these instructions.
+//
+char FixAllFDIVSQRT::ID = 0;
+
+FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm)
+    : LEONMachineFunctionPass(tm, ID) {}
+
+bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<SparcSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc DL = DebugLoc();
+
+  bool Modified = false;
+  for (auto MFI = MF.begin(), E = MF.end(); MFI != E; ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E; ++MBBI) {
+      MachineInstr &MI = *MBBI;
+      unsigned Opcode = MI.getOpcode();
+
+      // Note: FDIVS and FSQRTS cannot be generated when this erratum fix is
+      // switched on so we don't need to check for them here. They will
+      // already have been converted to FSQRTD or FDIVD earlier in the
+      // pipeline.
+      if (Opcode == SP::FSQRTD || Opcode == SP::FDIVD) {
+        for (int InsertedCount = 0; InsertedCount < 5; InsertedCount++)
+          BuildMI(MBB, MBBI, DL, TII.get(SP::NOP));
+
+        MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+        for (int InsertedCount = 0; InsertedCount < 28; InsertedCount++)
+          BuildMI(MBB, NMBBI, DL, TII.get(SP::NOP));
+
+        Modified = true;
+      }
+    }
+  }
+
+  return Modified;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.h b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
new file mode 100755
index 000000000000..2158cb636bfc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
@@ -0,0 +1,115 @@
+//===------- LeonPasses.h - Define passes specific to LEON ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+#define LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+
+namespace llvm {
+class LLVM_LIBRARY_VISIBILITY LEONMachineFunctionPass
+    : public MachineFunctionPass {
+protected:
+  const SparcSubtarget *Subtarget;
+  const int LAST_OPERAND = -1;
+
+  // this vector holds free registers that we allocate in groups for some of the
+  // LEON passes
+  std::vector<int> UsedRegisters;
+
+protected:
+  LEONMachineFunctionPass(TargetMachine &tm, char &ID);
+  LEONMachineFunctionPass(char &ID);
+
+  int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
+  void clearUsedRegisterList() { UsedRegisters.clear(); }
+
+  void markRegisterUsed(int registerIndex) {
+    UsedRegisters.push_back(registerIndex);
+  }
+  int getUnusedFPRegister(MachineRegisterInfo &MRI);
+};
+
+class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass {
+public:
+  static char ID;
+
+  InsertNOPLoad(TargetMachine &tm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "InsertNOPLoad: Erratum Fix LBR35: insert a NOP instruction after "
+           "every single-cycle load instruction when the next instruction is "
+           "another load/store instruction";
+  }
+};
+
+class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass {
+public:
+  static char ID;
+
+  FixFSMULD(TargetMachine &tm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "FixFSMULD: Erratum Fix LBR31: do not select FSMULD";
+  }
+};
+
+class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass {
+public:
+  static char ID;
+
+  ReplaceFMULS(TargetMachine &tm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "ReplaceFMULS: Erratum Fix LBR32: replace FMULS instruction with a "
+           "routine using conversions/double precision operations to replace "
+           "FMULS";
+  }
+};
+
+class LLVM_LIBRARY_VISIBILITY DetectRoundChange
+    : public LEONMachineFunctionPass {
+public:
+  static char ID;
+
+  DetectRoundChange(TargetMachine &tm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "DetectRoundChange: Leon erratum detection: detect any rounding "
+           "mode change request: use only the round-to-nearest rounding mode";
+  }
+};
+
+class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass {
+public:
+  static char ID;
+
+  FixAllFDIVSQRT(TargetMachine &tm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "FixAllFDIVSQRT: Erratum Fix LBR34: fix FDIVS/FDIVD/FSQRTS/FSQRTD "
+           "instructions with NOPs and floating-point store";
+  }
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPARC_LEON_PASSES_H
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
new file mode 100644
index 000000000000..6106a6c32dc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -0,0 +1,306 @@
+//===-- SparcAsmBackend.cpp - Sparc Assembler Backend ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+    return Value;
+
+  case Sparc::fixup_sparc_wplt30:
+  case Sparc::fixup_sparc_call30:
+    return (Value >> 2) & 0x3fffffff;
+
+  case Sparc::fixup_sparc_br22:
+    return (Value >> 2) & 0x3fffff;
+
+  case Sparc::fixup_sparc_br19:
+    return (Value >> 2) & 0x7ffff;
+
+  case Sparc::fixup_sparc_br16_2:
+    return (Value >> 2) & 0xc000;
+
+  case Sparc::fixup_sparc_br16_14:
+    return (Value >> 2) & 0x3fff;
+
+  case Sparc::fixup_sparc_pc22:
+  case Sparc::fixup_sparc_got22:
+  case Sparc::fixup_sparc_tls_gd_hi22:
+  case Sparc::fixup_sparc_tls_ldm_hi22:
+  case Sparc::fixup_sparc_tls_ie_hi22:
+  case Sparc::fixup_sparc_hi22:
+    return (Value >> 10) & 0x3fffff;
+
+  case Sparc::fixup_sparc_pc10:
+  case Sparc::fixup_sparc_got10:
+  case Sparc::fixup_sparc_tls_gd_lo10:
+  case Sparc::fixup_sparc_tls_ldm_lo10:
+  case Sparc::fixup_sparc_tls_ie_lo10:
+  case Sparc::fixup_sparc_lo10:
+    return Value & 0x3ff;
+
+  case Sparc::fixup_sparc_tls_ldo_hix22:
+  case Sparc::fixup_sparc_tls_le_hix22:
+    return (~Value >> 10) & 0x3fffff;
+
+  case Sparc::fixup_sparc_tls_ldo_lox10:
+  case Sparc::fixup_sparc_tls_le_lox10:
+    return (~(~Value & 0x3ff)) & 0x1fff;
+
+  case Sparc::fixup_sparc_h44:
+    return (Value >> 22) & 0x3fffff;
+
+  case Sparc::fixup_sparc_m44:
+    return (Value >> 12) & 0x3ff;
+
+  case Sparc::fixup_sparc_l44:
+    return Value & 0xfff;
+
+  case Sparc::fixup_sparc_hh:
+    return (Value >> 42) & 0x3fffff;
+
+  case Sparc::fixup_sparc_hm:
+    return (Value >> 32) & 0x3ff;
+
+  case Sparc::fixup_sparc_tls_gd_add:
+  case Sparc::fixup_sparc_tls_gd_call:
+  case Sparc::fixup_sparc_tls_ldm_add:
+  case Sparc::fixup_sparc_tls_ldm_call:
+  case Sparc::fixup_sparc_tls_ldo_add:
+  case Sparc::fixup_sparc_tls_ie_ld:
+  case Sparc::fixup_sparc_tls_ie_ldx:
+  case Sparc::fixup_sparc_tls_ie_add:
+    return 0;
+  }
+}
+
+namespace {
+  class SparcAsmBackend : public MCAsmBackend {
+  protected:
+    const Target &TheTarget;
+    bool IsLittleEndian;
+    bool Is64Bit;
+
+  public:
+    SparcAsmBackend(const Target &T)
+        : MCAsmBackend(), TheTarget(T),
+          IsLittleEndian(StringRef(TheTarget.getName()) == "sparcel"),
+          Is64Bit(StringRef(TheTarget.getName()) == "sparcv9") {}
+
+    unsigned getNumFixupKinds() const override {
+      return Sparc::NumTargetFixupKinds;
+    }
+
+    const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+      const static MCFixupKindInfo InfosBE[Sparc::NumTargetFixupKinds] = {
+        // name                    offset bits  flags
+        { "fixup_sparc_call30",     2,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br22",      10,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br19",      13,     19,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_2",    10,      2,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_14",   18,     14,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_hi22",      10,     22,  0 },
+        { "fixup_sparc_lo10",      22,     10,  0 },
+        { "fixup_sparc_h44",       10,     22,  0 },
+        { "fixup_sparc_m44",       22,     10,  0 },
+        { "fixup_sparc_l44",       20,     12,  0 },
+        { "fixup_sparc_hh",        10,     22,  0 },
+        { "fixup_sparc_hm",        22,     10,  0 },
+        { "fixup_sparc_pc22",      10,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_pc10",      22,     10,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_got22",     10,     22,  0 },
+        { "fixup_sparc_got10",     22,     10,  0 },
+        { "fixup_sparc_wplt30",     2,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_tls_gd_hi22",   10, 22,  0 },
+        { "fixup_sparc_tls_gd_lo10",   22, 10,  0 },
+        { "fixup_sparc_tls_gd_add",     0,  0,  0 },
+        { "fixup_sparc_tls_gd_call",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_hi22",  10, 22,  0 },
+        { "fixup_sparc_tls_ldm_lo10",  22, 10,  0 },
+        { "fixup_sparc_tls_ldm_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_call",   0,  0,  0 },
+        { "fixup_sparc_tls_ldo_hix22", 10, 22,  0 },
+        { "fixup_sparc_tls_ldo_lox10", 22, 10,  0 },
+        { "fixup_sparc_tls_ldo_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ie_hi22",   10, 22,  0 },
+        { "fixup_sparc_tls_ie_lo10",   22, 10,  0 },
+        { "fixup_sparc_tls_ie_ld",      0,  0,  0 },
+        { "fixup_sparc_tls_ie_ldx",     0,  0,  0 },
+        { "fixup_sparc_tls_ie_add",     0,  0,  0 },
+        { "fixup_sparc_tls_le_hix22",   0,  0,  0 },
+        { "fixup_sparc_tls_le_lox10",   0,  0,  0 }
+      };
+
+      const static MCFixupKindInfo InfosLE[Sparc::NumTargetFixupKinds] = {
+        // name                    offset bits  flags
+        { "fixup_sparc_call30",     0,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br22",       0,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br19",       0,     19,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_2",    20,      2,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_br16_14",    0,     14,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_hi22",       0,     22,  0 },
+        { "fixup_sparc_lo10",       0,     10,  0 },
+        { "fixup_sparc_h44",        0,     22,  0 },
+        { "fixup_sparc_m44",        0,     10,  0 },
+        { "fixup_sparc_l44",        0,     12,  0 },
+        { "fixup_sparc_hh",         0,     22,  0 },
+        { "fixup_sparc_hm",         0,     10,  0 },
+        { "fixup_sparc_pc22",       0,     22,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_pc10",       0,     10,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_got22",      0,     22,  0 },
+        { "fixup_sparc_got10",      0,     10,  0 },
+        { "fixup_sparc_wplt30",      0,     30,  MCFixupKindInfo::FKF_IsPCRel },
+        { "fixup_sparc_tls_gd_hi22",    0, 22,  0 },
+        { "fixup_sparc_tls_gd_lo10",    0, 10,  0 },
+        { "fixup_sparc_tls_gd_add",     0,  0,  0 },
+        { "fixup_sparc_tls_gd_call",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_hi22",   0, 22,  0 },
+        { "fixup_sparc_tls_ldm_lo10",   0, 10,  0 },
+        { "fixup_sparc_tls_ldm_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ldm_call",   0,  0,  0 },
+        { "fixup_sparc_tls_ldo_hix22",  0, 22,  0 },
+        { "fixup_sparc_tls_ldo_lox10",  0, 10,  0 },
+        { "fixup_sparc_tls_ldo_add",    0,  0,  0 },
+        { "fixup_sparc_tls_ie_hi22",    0, 22,  0 },
+        { "fixup_sparc_tls_ie_lo10",    0, 10,  0 },
+        { "fixup_sparc_tls_ie_ld",      0,  0,  0 },
+        { "fixup_sparc_tls_ie_ldx",     0,  0,  0 },
+        { "fixup_sparc_tls_ie_add",     0,  0,  0 },
+        { "fixup_sparc_tls_le_hix22",   0,  0,  0 },
+        { "fixup_sparc_tls_le_lox10",   0,  0,  0 }
+      };
+
+      if (Kind < FirstTargetFixupKind)
+        return MCAsmBackend::getFixupKindInfo(Kind);
+
+      assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+             "Invalid kind!");
+      if (IsLittleEndian)
+        return InfosLE[Kind - FirstTargetFixupKind];
+
+      return InfosBE[Kind - FirstTargetFixupKind];
+    }
+
+    void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                           const MCFixup &Fixup, const MCFragment *DF,
+                           const MCValue &Target, uint64_t &Value,
+                           bool &IsResolved) override {
+      switch ((Sparc::Fixups)Fixup.getKind()) {
+      default: break;
+      case Sparc::fixup_sparc_wplt30:
+        if (Target.getSymA()->getSymbol().isTemporary())
+          return;
+      case Sparc::fixup_sparc_tls_gd_hi22:
+      case Sparc::fixup_sparc_tls_gd_lo10:
+      case Sparc::fixup_sparc_tls_gd_add:
+      case Sparc::fixup_sparc_tls_gd_call:
+      case Sparc::fixup_sparc_tls_ldm_hi22:
+      case Sparc::fixup_sparc_tls_ldm_lo10:
+      case Sparc::fixup_sparc_tls_ldm_add:
+      case Sparc::fixup_sparc_tls_ldm_call:
+      case Sparc::fixup_sparc_tls_ldo_hix22:
+      case Sparc::fixup_sparc_tls_ldo_lox10:
+      case Sparc::fixup_sparc_tls_ldo_add:
+      case Sparc::fixup_sparc_tls_ie_hi22:
+      case Sparc::fixup_sparc_tls_ie_lo10:
+      case Sparc::fixup_sparc_tls_ie_ld:
+      case Sparc::fixup_sparc_tls_ie_ldx:
+      case Sparc::fixup_sparc_tls_ie_add:
+      case Sparc::fixup_sparc_tls_le_hix22:
+      case Sparc::fixup_sparc_tls_le_lox10:  IsResolved = false; break;
+      }
+    }
+
+    bool mayNeedRelaxation(const MCInst &Inst) const override {
+      // FIXME.
+      return false;
+    }
+
+    /// fixupNeedsRelaxation - Target specific predicate for whether a given
+    /// fixup requires the associated instruction to be relaxed.
+    bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                              uint64_t Value,
+                              const MCRelaxableFragment *DF,
+                              const MCAsmLayout &Layout) const override {
+      // FIXME.
+      llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+      return false;
+    }
+    void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                          MCInst &Res) const override {
+      // FIXME.
+      llvm_unreachable("relaxInstruction() unimplemented");
+    }
+
+    bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+      // Cannot emit NOP with size not multiple of 32 bits.
+      if (Count % 4 != 0)
+        return false;
+
+      uint64_t NumNops = Count / 4;
+      for (uint64_t i = 0; i != NumNops; ++i)
+        OW->write32(0x01000000);
+
+      return true;
+    }
+  };
+
+  class ELFSparcAsmBackend : public SparcAsmBackend {
+    Triple::OSType OSType;
+  public:
+    ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
+      SparcAsmBackend(T), OSType(OSType) { }
+
+    void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                    uint64_t Value, bool IsPCRel) const override {
+
+      Value = adjustFixupValue(Fixup.getKind(), Value);
+      if (!Value) return;           // Doesn't change encoding.
+
+      unsigned Offset = Fixup.getOffset();
+
+      // For each byte of the fragment that the fixup touches, mask in the bits
+      // from the fixup value. The Value has been "split up" into the
+      // appropriate bitfields above.
+      for (unsigned i = 0; i != 4; ++i) {
+        unsigned Idx = IsLittleEndian ? i : 3 - i;
+        Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff);
+      }
+    }
+
+    MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+      uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+      return createSparcELFObjectWriter(OS, Is64Bit, IsLittleEndian, OSABI);
+    }
+  };
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createSparcAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI,
+                                          const Triple &TT, StringRef CPU,
+                                          const MCTargetOptions &Options) {
+  return new ELFSparcAsmBackend(T, TT.getOS());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
new file mode 100644
index 000000000000..d35e45e03466
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -0,0 +1,140 @@
+//===-- SparcELFObjectWriter.cpp - Sparc ELF Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class SparcELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    SparcELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+      : MCELFObjectTargetWriter(Is64Bit, OSABI,
+                                Is64Bit ?  ELF::EM_SPARCV9 : ELF::EM_SPARC,
+                                /*HasRelocationAddend*/ true) {}
+
+    ~SparcELFObjectWriter() override {}
+
+  protected:
+    unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                          const MCFixup &Fixup, bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                                 unsigned Type) const override;
+
+  };
+}
+
+unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx,
+                                            const MCValue &Target,
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
+    if (SExpr->getKind() == SparcMCExpr::VK_Sparc_R_DISP32)
+      return ELF::R_SPARC_DISP32;
+  }
+
+  if (IsPCRel) {
+    switch((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented fixup -> relocation");
+    case FK_Data_1:                  return ELF::R_SPARC_DISP8;
+    case FK_Data_2:                  return ELF::R_SPARC_DISP16;
+    case FK_Data_4:                  return ELF::R_SPARC_DISP32;
+    case FK_Data_8:                  return ELF::R_SPARC_DISP64;
+    case Sparc::fixup_sparc_call30:  return ELF::R_SPARC_WDISP30;
+    case Sparc::fixup_sparc_br22:    return ELF::R_SPARC_WDISP22;
+    case Sparc::fixup_sparc_br19:    return ELF::R_SPARC_WDISP19;
+    case Sparc::fixup_sparc_pc22:    return ELF::R_SPARC_PC22;
+    case Sparc::fixup_sparc_pc10:    return ELF::R_SPARC_PC10;
+    case Sparc::fixup_sparc_wplt30:  return ELF::R_SPARC_WPLT30;
+    }
+  }
+
+  switch((unsigned)Fixup.getKind()) {
+  default:
+    llvm_unreachable("Unimplemented fixup -> relocation");
+  case FK_Data_1:                return ELF::R_SPARC_8;
+  case FK_Data_2:                return ((Fixup.getOffset() % 2)
+                                         ? ELF::R_SPARC_UA16
+                                         : ELF::R_SPARC_16);
+  case FK_Data_4:                return ((Fixup.getOffset() % 4)
+                                         ? ELF::R_SPARC_UA32
+                                         : ELF::R_SPARC_32);
+  case FK_Data_8:                return ((Fixup.getOffset() % 8)
+                                         ? ELF::R_SPARC_UA64
+                                         : ELF::R_SPARC_64);
+  case Sparc::fixup_sparc_hi22:  return ELF::R_SPARC_HI22;
+  case Sparc::fixup_sparc_lo10:  return ELF::R_SPARC_LO10;
+  case Sparc::fixup_sparc_h44:   return ELF::R_SPARC_H44;
+  case Sparc::fixup_sparc_m44:   return ELF::R_SPARC_M44;
+  case Sparc::fixup_sparc_l44:   return ELF::R_SPARC_L44;
+  case Sparc::fixup_sparc_hh:    return ELF::R_SPARC_HH22;
+  case Sparc::fixup_sparc_hm:    return ELF::R_SPARC_HM10;
+  case Sparc::fixup_sparc_got22: return ELF::R_SPARC_GOT22;
+  case Sparc::fixup_sparc_got10: return ELF::R_SPARC_GOT10;
+  case Sparc::fixup_sparc_tls_gd_hi22:   return ELF::R_SPARC_TLS_GD_HI22;
+  case Sparc::fixup_sparc_tls_gd_lo10:   return ELF::R_SPARC_TLS_GD_LO10;
+  case Sparc::fixup_sparc_tls_gd_add:    return ELF::R_SPARC_TLS_GD_ADD;
+  case Sparc::fixup_sparc_tls_gd_call:   return ELF::R_SPARC_TLS_GD_CALL;
+  case Sparc::fixup_sparc_tls_ldm_hi22:  return ELF::R_SPARC_TLS_LDM_HI22;
+  case Sparc::fixup_sparc_tls_ldm_lo10:  return ELF::R_SPARC_TLS_LDM_LO10;
+  case Sparc::fixup_sparc_tls_ldm_add:   return ELF::R_SPARC_TLS_LDM_ADD;
+  case Sparc::fixup_sparc_tls_ldm_call:  return ELF::R_SPARC_TLS_LDM_CALL;
+  case Sparc::fixup_sparc_tls_ldo_hix22: return ELF::R_SPARC_TLS_LDO_HIX22;
+  case Sparc::fixup_sparc_tls_ldo_lox10: return ELF::R_SPARC_TLS_LDO_LOX10;
+  case Sparc::fixup_sparc_tls_ldo_add:   return ELF::R_SPARC_TLS_LDO_ADD;
+  case Sparc::fixup_sparc_tls_ie_hi22:   return ELF::R_SPARC_TLS_IE_HI22;
+  case Sparc::fixup_sparc_tls_ie_lo10:   return ELF::R_SPARC_TLS_IE_LO10;
+  case Sparc::fixup_sparc_tls_ie_ld:     return ELF::R_SPARC_TLS_IE_LD;
+  case Sparc::fixup_sparc_tls_ie_ldx:    return ELF::R_SPARC_TLS_IE_LDX;
+  case Sparc::fixup_sparc_tls_ie_add:    return ELF::R_SPARC_TLS_IE_ADD;
+  case Sparc::fixup_sparc_tls_le_hix22:  return ELF::R_SPARC_TLS_LE_HIX22;
+  case Sparc::fixup_sparc_tls_le_lox10:  return ELF::R_SPARC_TLS_LE_LOX10;
+  }
+
+  return ELF::R_SPARC_NONE;
+}
+
+bool SparcELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                 unsigned Type) const {
+  switch (Type) {
+    default:
+      return false;
+
+    // All relocations that use a GOT need a symbol, not an offset, as
+    // the offset of the symbol within the section is irrelevant to
+    // where the GOT entry is. Don't need to list all the TLS entries,
+    // as they're all marked as requiring a symbol anyways.
+    case ELF::R_SPARC_GOT10:
+    case ELF::R_SPARC_GOT13:
+    case ELF::R_SPARC_GOT22:
+    case ELF::R_SPARC_GOTDATA_HIX22:
+    case ELF::R_SPARC_GOTDATA_LOX10:
+    case ELF::R_SPARC_GOTDATA_OP_HIX22:
+    case ELF::R_SPARC_GOTDATA_OP_LOX10:
+      return true;
+  }
+}
+
+MCObjectWriter *llvm::createSparcELFObjectWriter(raw_pwrite_stream &OS,
+                                                 bool Is64Bit,
+                                                 bool IsLittleEndian,
+                                                 uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new SparcELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
new file mode 100644
index 000000000000..8d79396d936e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
@@ -0,0 +1,97 @@
+//===-- SparcFixupKinds.h - Sparc Specific Fixup Entries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+  namespace Sparc {
+    enum Fixups {
+      // fixup_sparc_call30 - 30-bit PC relative relocation for call
+      fixup_sparc_call30 = FirstTargetFixupKind,
+
+      /// fixup_sparc_br22 - 22-bit PC relative relocation for
+      /// branches
+      fixup_sparc_br22,
+
+      /// fixup_sparc_br19 - 19-bit PC relative relocation for
+      /// branches on icc/xcc
+      fixup_sparc_br19,
+
+      /// fixup_sparc_bpr  - 16-bit fixup for bpr
+      fixup_sparc_br16_2,
+      fixup_sparc_br16_14,
+
+      /// fixup_sparc_hi22  - 22-bit fixup corresponding to %hi(foo)
+      /// for sethi
+      fixup_sparc_hi22,
+
+      /// fixup_sparc_lo10  - 10-bit fixup corresponding to %lo(foo)
+      fixup_sparc_lo10,
+
+      /// fixup_sparc_h44  - 22-bit fixup corresponding to %h44(foo)
+      fixup_sparc_h44,
+
+      /// fixup_sparc_m44  - 10-bit fixup corresponding to %m44(foo)
+      fixup_sparc_m44,
+
+      /// fixup_sparc_l44  - 12-bit fixup corresponding to %l44(foo)
+      fixup_sparc_l44,
+
+      /// fixup_sparc_hh  -  22-bit fixup corresponding to %hh(foo)
+      fixup_sparc_hh,
+
+      /// fixup_sparc_hm  -  10-bit fixup corresponding to %hm(foo)
+      fixup_sparc_hm,
+
+      /// fixup_sparc_pc22 - 22-bit fixup corresponding to %pc22(foo)
+      fixup_sparc_pc22,
+
+      /// fixup_sparc_pc10 - 10-bit fixup corresponding to %pc10(foo)
+      fixup_sparc_pc10,
+
+      /// fixup_sparc_got22 - 22-bit fixup corresponding to %got22(foo)
+      fixup_sparc_got22,
+
+      /// fixup_sparc_got10 - 10-bit fixup corresponding to %got10(foo)
+      fixup_sparc_got10,
+
+      /// fixup_sparc_wplt30
+      fixup_sparc_wplt30,
+
+      /// fixups for Thread Local Storage
+      fixup_sparc_tls_gd_hi22,
+      fixup_sparc_tls_gd_lo10,
+      fixup_sparc_tls_gd_add,
+      fixup_sparc_tls_gd_call,
+      fixup_sparc_tls_ldm_hi22,
+      fixup_sparc_tls_ldm_lo10,
+      fixup_sparc_tls_ldm_add,
+      fixup_sparc_tls_ldm_call,
+      fixup_sparc_tls_ldo_hix22,
+      fixup_sparc_tls_ldo_lox10,
+      fixup_sparc_tls_ldo_add,
+      fixup_sparc_tls_ie_hi22,
+      fixup_sparc_tls_ie_lo10,
+      fixup_sparc_tls_ie_ld,
+      fixup_sparc_tls_ie_ldx,
+      fixup_sparc_tls_ie_add,
+      fixup_sparc_tls_le_hix22,
+      fixup_sparc_tls_le_lox10,
+
+      // Marker
+      LastTargetFixupKind,
+      NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
new file mode 100644
index 000000000000..280c6d7937b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -0,0 +1,70 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "SparcMCExpr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+void SparcELFMCAsmInfo::anchor() {}
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) {
+  bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+  IsLittleEndian = (TheTriple.getArch() == Triple::sparcel);
+
+  if (isV9) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  // .xword is only supported by V9.
+  Data64bitsDirective = (isV9) ? "\t.xword\t" : nullptr;
+  ZeroDirective = "\t.skip\t";
+  CommentString = "!";
+  SupportsDebugInformation = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  UseIntegratedAssembler = true;
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                               unsigned Encoding,
+                                               MCStreamer &Streamer) const {
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MCContext &Ctx = Streamer.getContext();
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+  }
+
+  return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
+}
+
+const MCExpr*
+SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+                                       unsigned Encoding,
+                                       MCStreamer &Streamer) const {
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MCContext &Ctx = Streamer.getContext();
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+  }
+  return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
new file mode 100644
index 000000000000..ad441227600e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -0,0 +1,38 @@
+//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class SparcELFMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit SparcELFMCAsmInfo(const Triple &TheTriple);
+  const MCExpr*
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+  const MCExpr* getExprForFDESymbol(const MCSymbol *Sym,
+                                    unsigned Encoding,
+                                    MCStreamer &Streamer) const override;
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
new file mode 100644
index 000000000000..86341c61d1e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -0,0 +1,229 @@
+//===-- SparcMCCodeEmitter.cpp - Convert Sparc code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class SparcMCCodeEmitter : public MCCodeEmitter {
+  SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
+  void operator=(const SparcMCCodeEmitter &) = delete;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+
+public:
+  SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : MCII(mcii), Ctx(ctx) {}
+
+  ~SparcMCCodeEmitter() override {}
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+  unsigned getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
+  unsigned getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       const MCSubtargetInfo &STI) const;
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new SparcMCCodeEmitter(MCII, Ctx);
+}
+
+void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  unsigned Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+
+  if (Ctx.getAsmInfo()->isLittleEndian()) {
+    // Output the bits in little-endian byte order.
+    support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
+  } else {
+    // Output the bits in big-endian byte order.
+    support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
+  }
+  unsigned tlsOpNo = 0;
+  switch (MI.getOpcode()) {
+  default: break;
+  case SP::TLS_CALL:   tlsOpNo = 1; break;
+  case SP::TLS_ADDrr:
+  case SP::TLS_ADDXrr:
+  case SP::TLS_LDrr:
+  case SP::TLS_LDXrr:  tlsOpNo = 3; break;
+  }
+  if (tlsOpNo != 0) {
+    const MCOperand &MO = MI.getOperand(tlsOpNo);
+    uint64_t op = getMachineOpValue(MI, MO, Fixups, STI);
+    assert(op == 0 && "Unexpected operand value!");
+    (void)op; // suppress warning.
+  }
+
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+
+unsigned SparcMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr());
+  const MCExpr *Expr = MO.getExpr();
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+    MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+    Fixups.push_back(MCFixup::create(0, Expr, Kind));
+    return 0;
+  }
+
+  int64_t Res;
+  if (Expr->evaluateAsAbsolute(Res))
+    return Res;
+
+  llvm_unreachable("Unhandled expression!");
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getCallTargetOpValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  if (MI.getOpcode() == SP::TLS_CALL) {
+    // No fixups for __tls_get_addr. Will emit for fixups for tls_symbol in
+    // encodeInstruction.
+#ifndef NDEBUG
+    // Verify that the callee is actually __tls_get_addr.
+    const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr());
+    assert(SExpr && SExpr->getSubExpr()->getKind() == MCExpr::SymbolRef &&
+           "Unexpected expression in TLS_CALL");
+    const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(SExpr->getSubExpr());
+    assert(SymExpr->getSymbol().getName() == "__tls_get_addr" &&
+           "Unexpected function for TLS_CALL");
+#endif
+    return 0;
+  }
+
+  MCFixupKind fixupKind = (MCFixupKind)Sparc::fixup_sparc_call30;
+
+  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(MO.getExpr())) {
+    if (SExpr->getKind() == SparcMCExpr::VK_Sparc_WPLT30)
+      fixupKind = (MCFixupKind)Sparc::fixup_sparc_wplt30;
+  }
+
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(), fixupKind));
+
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br22));
+  return 0;
+}
+
+unsigned SparcMCCodeEmitter::
+getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br19));
+  return 0;
+}
+unsigned SparcMCCodeEmitter::
+getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm())
+    return getMachineOpValue(MI, MO, Fixups, STI);
+
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br16_2));
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+                                   (MCFixupKind)Sparc::fixup_sparc_br16_14));
+
+  return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "SparcGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
new file mode 100644
index 000000000000..e85a8cd5e339
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -0,0 +1,221 @@
+//===-- SparcMCExpr.cpp - Sparc specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the Sparc architecture (e.g. "%hi", "%lo", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Object/ELF.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparcmcexpr"
+
+const SparcMCExpr*
+SparcMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+                      MCContext &Ctx) {
+    return new (Ctx) SparcMCExpr(Kind, Expr);
+}
+
+void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+
+  bool closeParen = printVariantKind(OS, Kind);
+
+  const MCExpr *Expr = getSubExpr();
+  Expr->print(OS, MAI);
+
+  if (closeParen)
+    OS << ')';
+}
+
+bool SparcMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind)
+{
+  bool closeParen = true;
+  switch (Kind) {
+  case VK_Sparc_None:     closeParen = false; break;
+  case VK_Sparc_LO:       OS << "%lo(";  break;
+  case VK_Sparc_HI:       OS << "%hi(";  break;
+  case VK_Sparc_H44:      OS << "%h44("; break;
+  case VK_Sparc_M44:      OS << "%m44("; break;
+  case VK_Sparc_L44:      OS << "%l44("; break;
+  case VK_Sparc_HH:       OS << "%hh(";  break;
+  case VK_Sparc_HM:       OS << "%hm(";  break;
+    // FIXME: use %pc22/%pc10, if system assembler supports them.
+  case VK_Sparc_PC22:     OS << "%hi("; break;
+  case VK_Sparc_PC10:     OS << "%lo("; break;
+    // FIXME: use %got22/%got10, if system assembler supports them.
+  case VK_Sparc_GOT22:    OS << "%hi("; break;
+  case VK_Sparc_GOT10:    OS << "%lo("; break;
+  case VK_Sparc_WPLT30:   closeParen = false; break;
+  case VK_Sparc_R_DISP32: OS << "%r_disp32("; break;
+  case VK_Sparc_TLS_GD_HI22:   OS << "%tgd_hi22(";   break;
+  case VK_Sparc_TLS_GD_LO10:   OS << "%tgd_lo10(";   break;
+  case VK_Sparc_TLS_GD_ADD:    OS << "%tgd_add(";    break;
+  case VK_Sparc_TLS_GD_CALL:   OS << "%tgd_call(";   break;
+  case VK_Sparc_TLS_LDM_HI22:  OS << "%tldm_hi22(";  break;
+  case VK_Sparc_TLS_LDM_LO10:  OS << "%tldm_lo10(";  break;
+  case VK_Sparc_TLS_LDM_ADD:   OS << "%tldm_add(";   break;
+  case VK_Sparc_TLS_LDM_CALL:  OS << "%tldm_call(";  break;
+  case VK_Sparc_TLS_LDO_HIX22: OS << "%tldo_hix22("; break;
+  case VK_Sparc_TLS_LDO_LOX10: OS << "%tldo_lox10("; break;
+  case VK_Sparc_TLS_LDO_ADD:   OS << "%tldo_add(";   break;
+  case VK_Sparc_TLS_IE_HI22:   OS << "%tie_hi22(";   break;
+  case VK_Sparc_TLS_IE_LO10:   OS << "%tie_lo10(";   break;
+  case VK_Sparc_TLS_IE_LD:     OS << "%tie_ld(";     break;
+  case VK_Sparc_TLS_IE_LDX:    OS << "%tie_ldx(";    break;
+  case VK_Sparc_TLS_IE_ADD:    OS << "%tie_add(";    break;
+  case VK_Sparc_TLS_LE_HIX22:  OS << "%tle_hix22(";  break;
+  case VK_Sparc_TLS_LE_LOX10:  OS << "%tle_lox10(";  break;
+  }
+  return closeParen;
+}
+
+SparcMCExpr::VariantKind SparcMCExpr::parseVariantKind(StringRef name)
+{
+  return StringSwitch<SparcMCExpr::VariantKind>(name)
+    .Case("lo",  VK_Sparc_LO)
+    .Case("hi",  VK_Sparc_HI)
+    .Case("h44", VK_Sparc_H44)
+    .Case("m44", VK_Sparc_M44)
+    .Case("l44", VK_Sparc_L44)
+    .Case("hh",  VK_Sparc_HH)
+    .Case("hm",  VK_Sparc_HM)
+    .Case("pc22",  VK_Sparc_PC22)
+    .Case("pc10",  VK_Sparc_PC10)
+    .Case("got22", VK_Sparc_GOT22)
+    .Case("got10", VK_Sparc_GOT10)
+    .Case("r_disp32",   VK_Sparc_R_DISP32)
+    .Case("tgd_hi22",   VK_Sparc_TLS_GD_HI22)
+    .Case("tgd_lo10",   VK_Sparc_TLS_GD_LO10)
+    .Case("tgd_add",    VK_Sparc_TLS_GD_ADD)
+    .Case("tgd_call",   VK_Sparc_TLS_GD_CALL)
+    .Case("tldm_hi22",  VK_Sparc_TLS_LDM_HI22)
+    .Case("tldm_lo10",  VK_Sparc_TLS_LDM_LO10)
+    .Case("tldm_add",   VK_Sparc_TLS_LDM_ADD)
+    .Case("tldm_call",  VK_Sparc_TLS_LDM_CALL)
+    .Case("tldo_hix22", VK_Sparc_TLS_LDO_HIX22)
+    .Case("tldo_lox10", VK_Sparc_TLS_LDO_LOX10)
+    .Case("tldo_add",   VK_Sparc_TLS_LDO_ADD)
+    .Case("tie_hi22",   VK_Sparc_TLS_IE_HI22)
+    .Case("tie_lo10",   VK_Sparc_TLS_IE_LO10)
+    .Case("tie_ld",     VK_Sparc_TLS_IE_LD)
+    .Case("tie_ldx",    VK_Sparc_TLS_IE_LDX)
+    .Case("tie_add",    VK_Sparc_TLS_IE_ADD)
+    .Case("tle_hix22",  VK_Sparc_TLS_LE_HIX22)
+    .Case("tle_lox10",  VK_Sparc_TLS_LE_LOX10)
+    .Default(VK_Sparc_None);
+}
+
+Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
+  switch (Kind) {
+  default: llvm_unreachable("Unhandled SparcMCExpr::VariantKind");
+  case VK_Sparc_LO:       return Sparc::fixup_sparc_lo10;
+  case VK_Sparc_HI:       return Sparc::fixup_sparc_hi22;
+  case VK_Sparc_H44:      return Sparc::fixup_sparc_h44;
+  case VK_Sparc_M44:      return Sparc::fixup_sparc_m44;
+  case VK_Sparc_L44:      return Sparc::fixup_sparc_l44;
+  case VK_Sparc_HH:       return Sparc::fixup_sparc_hh;
+  case VK_Sparc_HM:       return Sparc::fixup_sparc_hm;
+  case VK_Sparc_PC22:     return Sparc::fixup_sparc_pc22;
+  case VK_Sparc_PC10:     return Sparc::fixup_sparc_pc10;
+  case VK_Sparc_GOT22:    return Sparc::fixup_sparc_got22;
+  case VK_Sparc_GOT10:    return Sparc::fixup_sparc_got10;
+  case VK_Sparc_WPLT30:   return Sparc::fixup_sparc_wplt30;
+  case VK_Sparc_TLS_GD_HI22:   return Sparc::fixup_sparc_tls_gd_hi22;
+  case VK_Sparc_TLS_GD_LO10:   return Sparc::fixup_sparc_tls_gd_lo10;
+  case VK_Sparc_TLS_GD_ADD:    return Sparc::fixup_sparc_tls_gd_add;
+  case VK_Sparc_TLS_GD_CALL:   return Sparc::fixup_sparc_tls_gd_call;
+  case VK_Sparc_TLS_LDM_HI22:  return Sparc::fixup_sparc_tls_ldm_hi22;
+  case VK_Sparc_TLS_LDM_LO10:  return Sparc::fixup_sparc_tls_ldm_lo10;
+  case VK_Sparc_TLS_LDM_ADD:   return Sparc::fixup_sparc_tls_ldm_add;
+  case VK_Sparc_TLS_LDM_CALL:  return Sparc::fixup_sparc_tls_ldm_call;
+  case VK_Sparc_TLS_LDO_HIX22: return Sparc::fixup_sparc_tls_ldo_hix22;
+  case VK_Sparc_TLS_LDO_LOX10: return Sparc::fixup_sparc_tls_ldo_lox10;
+  case VK_Sparc_TLS_LDO_ADD:   return Sparc::fixup_sparc_tls_ldo_add;
+  case VK_Sparc_TLS_IE_HI22:   return Sparc::fixup_sparc_tls_ie_hi22;
+  case VK_Sparc_TLS_IE_LO10:   return Sparc::fixup_sparc_tls_ie_lo10;
+  case VK_Sparc_TLS_IE_LD:     return Sparc::fixup_sparc_tls_ie_ld;
+  case VK_Sparc_TLS_IE_LDX:    return Sparc::fixup_sparc_tls_ie_ldx;
+  case VK_Sparc_TLS_IE_ADD:    return Sparc::fixup_sparc_tls_ie_add;
+  case VK_Sparc_TLS_LE_HIX22:  return Sparc::fixup_sparc_tls_le_hix22;
+  case VK_Sparc_TLS_LE_LOX10:  return Sparc::fixup_sparc_tls_le_lox10;
+  }
+}
+
+bool
+SparcMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                       const MCAsmLayout *Layout,
+                                       const MCFixup *Fixup) const {
+  return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+
+}
+
+void SparcMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch(getKind()) {
+  default: return;
+  case VK_Sparc_TLS_GD_HI22:
+  case VK_Sparc_TLS_GD_LO10:
+  case VK_Sparc_TLS_GD_ADD:
+  case VK_Sparc_TLS_GD_CALL:
+  case VK_Sparc_TLS_LDM_HI22:
+  case VK_Sparc_TLS_LDM_LO10:
+  case VK_Sparc_TLS_LDM_ADD:
+  case VK_Sparc_TLS_LDM_CALL:
+  case VK_Sparc_TLS_LDO_HIX22:
+  case VK_Sparc_TLS_LDO_LOX10:
+  case VK_Sparc_TLS_LDO_ADD:
+  case VK_Sparc_TLS_IE_HI22:
+  case VK_Sparc_TLS_IE_LO10:
+  case VK_Sparc_TLS_IE_LD:
+  case VK_Sparc_TLS_IE_LDX:
+  case VK_Sparc_TLS_IE_ADD:
+  case VK_Sparc_TLS_LE_HIX22:
+  case VK_Sparc_TLS_LE_LOX10: break;
+  }
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
+void SparcMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  Streamer.visitUsedExpr(*getSubExpr());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
new file mode 100644
index 000000000000..13f08195c764
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -0,0 +1,112 @@
+//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Sparc-specific MCExprs, used for modifiers like
+// "%hi" or "%lo" etc.,
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
+
+#include "SparcFixupKinds.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class StringRef;
+class SparcMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_Sparc_None,
+    VK_Sparc_LO,
+    VK_Sparc_HI,
+    VK_Sparc_H44,
+    VK_Sparc_M44,
+    VK_Sparc_L44,
+    VK_Sparc_HH,
+    VK_Sparc_HM,
+    VK_Sparc_PC22,
+    VK_Sparc_PC10,
+    VK_Sparc_GOT22,
+    VK_Sparc_GOT10,
+    VK_Sparc_WPLT30,
+    VK_Sparc_R_DISP32,
+    VK_Sparc_TLS_GD_HI22,
+    VK_Sparc_TLS_GD_LO10,
+    VK_Sparc_TLS_GD_ADD,
+    VK_Sparc_TLS_GD_CALL,
+    VK_Sparc_TLS_LDM_HI22,
+    VK_Sparc_TLS_LDM_LO10,
+    VK_Sparc_TLS_LDM_ADD,
+    VK_Sparc_TLS_LDM_CALL,
+    VK_Sparc_TLS_LDO_HIX22,
+    VK_Sparc_TLS_LDO_LOX10,
+    VK_Sparc_TLS_LDO_ADD,
+    VK_Sparc_TLS_IE_HI22,
+    VK_Sparc_TLS_IE_LO10,
+    VK_Sparc_TLS_IE_LD,
+    VK_Sparc_TLS_IE_LDX,
+    VK_Sparc_TLS_IE_ADD,
+    VK_Sparc_TLS_LE_HIX22,
+    VK_Sparc_TLS_LE_LOX10
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit SparcMCExpr(VariantKind Kind, const MCExpr *Expr)
+      : Kind(Kind), Expr(Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const SparcMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+                                 MCContext &Ctx);
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// getFixupKind - Get the fixup kind of this expression.
+  Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
+
+  /// @}
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override {
+    return getSubExpr()->findAssociatedFragment();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const SparcMCExpr *) { return true; }
+
+  static VariantKind parseVariantKind(StringRef name);
+  static bool printVariantKind(raw_ostream &OS, VariantKind Kind);
+  static Sparc::Fixups getFixupKind(VariantKind Kind);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
new file mode 100644
index 000000000000..889e2fd19ba9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -0,0 +1,170 @@
+//===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCTargetDesc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "SparcMCAsmInfo.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SparcGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+  unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
+  unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCInstrInfo *createSparcMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSparcMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSparcMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSparcMCRegisterInfo(X, SP::O7);
+  return X;
+}
+
+static MCSubtargetInfo *
+createSparcMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  if (CPU.empty())
+    CPU = (TT.getArch() == Triple::sparcv9) ? "v9" : "v8";
+  return createSparcMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+// Code models. Some only make sense for 64-bit code.
+//
+// SunCC  Reloc   CodeModel  Constraints
+// abs32  Static  Small      text+data+bss linked below 2^32 bytes
+// abs44  Static  Medium     text+data+bss linked below 2^44 bytes
+// abs64  Static  Large      text smaller than 2^31 bytes
+// pic13  PIC_    Small      GOT < 2^13 bytes
+// pic32  PIC_    Medium     GOT < 2^32 bytes
+//
+// All code models require that the text segment is smaller than 2GB.
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  // The default 32-bit code model is abs32/pic32 and the default 32-bit
+  // code model for JIT is abs32.
+  switch (CM) {
+  default: break;
+  case CodeModel::Default:
+  case CodeModel::JITDefault: CM = CodeModel::Small; break;
+  }
+}
+
+static void adjustCodeGenOptsV9(const Triple &TT, Reloc::Model RM,
+                                CodeModel::Model &CM) {
+  // The default 64-bit code model is abs44/pic32 and the default 64-bit
+  // code model for JIT is abs64.
+  switch (CM) {
+  default:  break;
+  case CodeModel::Default:
+    CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+    break;
+  case CodeModel::JITDefault:
+    CM = CodeModel::Large;
+    break;
+  }
+}
+
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  return new SparcTargetELFStreamer(S);
+}
+
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new SparcTargetAsmStreamer(S, OS);
+}
+
+static MCInstPrinter *createSparcMCInstPrinter(const Triple &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  return new SparcInstPrinter(MAI, MII, MRI);
+}
+
+extern "C" void LLVMInitializeSparcTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(getTheSparcTarget(), createSparcMCAsmInfo);
+  RegisterMCAsmInfoFn Y(getTheSparcV9Target(), createSparcV9MCAsmInfo);
+  RegisterMCAsmInfoFn Z(getTheSparcelTarget(), createSparcMCAsmInfo);
+
+  for (Target *T :
+       {&getTheSparcTarget(), &getTheSparcV9Target(), &getTheSparcelTarget()}) {
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createSparcMCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createSparcMCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createSparcMCSubtargetInfo);
+
+    // Register the MC Code Emitter.
+    TargetRegistry::RegisterMCCodeEmitter(*T, createSparcMCCodeEmitter);
+
+    // Register the asm backend.
+    TargetRegistry::RegisterMCAsmBackend(*T, createSparcAsmBackend);
+
+    // Register the object target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(*T,
+                                                 createObjectTargetStreamer);
+
+    // Register the asm streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createTargetAsmStreamer);
+
+    // Register the MCInstPrinter
+    TargetRegistry::RegisterMCInstPrinter(*T, createSparcMCInstPrinter);
+  }
+
+  // Register the MC codegen info.
+  TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcTarget(),
+                                              adjustCodeGenOpts);
+  TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcV9Target(),
+                                              adjustCodeGenOptsV9);
+  TargetRegistry::registerMCAdjustCodeGenOpts(getTheSparcelTarget(),
+                                              adjustCodeGenOpts);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
new file mode 100644
index 000000000000..4e754c132d11
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -0,0 +1,62 @@
+//===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getTheSparcTarget();
+Target &getTheSparcV9Target();
+Target &getTheSparcelTarget();
+
+MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
+MCAsmBackend *createSparcAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    const Triple &TT, StringRef CPU,
+                                    const MCTargetOptions &Options);
+MCObjectWriter *createSparcELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                           bool IsLIttleEndian, uint8_t OSABI);
+} // End llvm namespace
+
+// Defines symbolic names for Sparc registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "SparcGenRegisterInfo.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SparcGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
new file mode 100644
index 000000000000..94af791e0e75
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcTargetStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- SparcTargetStreamer.cpp - Sparc Target Streamer Methods -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetStreamer.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+SparcTargetStreamer::SparcTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+void SparcTargetStreamer::anchor() {}
+
+SparcTargetAsmStreamer::SparcTargetAsmStreamer(MCStreamer &S,
+                                               formatted_raw_ostream &OS)
+    : SparcTargetStreamer(S), OS(OS) {}
+
+void SparcTargetAsmStreamer::emitSparcRegisterIgnore(unsigned reg) {
+  OS << "\t.register "
+     << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+     << ", #ignore\n";
+}
+
+void SparcTargetAsmStreamer::emitSparcRegisterScratch(unsigned reg) {
+  OS << "\t.register "
+     << "%" << StringRef(SparcInstPrinter::getRegisterName(reg)).lower()
+     << ", #scratch\n";
+}
+
+SparcTargetELFStreamer::SparcTargetELFStreamer(MCStreamer &S)
+    : SparcTargetStreamer(S) {}
+
+MCELFStreamer &SparcTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h
new file mode 100644
index 000000000000..0a8272d89297
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,167 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARC_H
+#define LLVM_LIB_TARGET_SPARC_SPARC_H
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class SparcTargetMachine;
+  class formatted_raw_ostream;
+  class AsmPrinter;
+  class MCInst;
+  class MachineInstr;
+
+  FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+  FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+
+  void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+                                      MCInst &OutMI,
+                                      AsmPrinter &AP);
+} // end namespace llvm;
+
+namespace llvm {
+  // Enums corresponding to Sparc condition codes, both icc's and fcc's.  These
+  // values must be kept in sync with the ones in the .td file.
+  namespace SPCC {
+    enum CondCodes {
+      ICC_A   =  8   ,  // Always
+      ICC_N   =  0   ,  // Never
+      ICC_NE  =  9   ,  // Not Equal
+      ICC_E   =  1   ,  // Equal
+      ICC_G   = 10   ,  // Greater
+      ICC_LE  =  2   ,  // Less or Equal
+      ICC_GE  = 11   ,  // Greater or Equal
+      ICC_L   =  3   ,  // Less
+      ICC_GU  = 12   ,  // Greater Unsigned
+      ICC_LEU =  4   ,  // Less or Equal Unsigned
+      ICC_CC  = 13   ,  // Carry Clear/Great or Equal Unsigned
+      ICC_CS  =  5   ,  // Carry Set/Less Unsigned
+      ICC_POS = 14   ,  // Positive
+      ICC_NEG =  6   ,  // Negative
+      ICC_VC  = 15   ,  // Overflow Clear
+      ICC_VS  =  7   ,  // Overflow Set
+
+      FCC_A   =  8+16,  // Always
+      FCC_N   =  0+16,  // Never
+      FCC_U   =  7+16,  // Unordered
+      FCC_G   =  6+16,  // Greater
+      FCC_UG  =  5+16,  // Unordered or Greater
+      FCC_L   =  4+16,  // Less
+      FCC_UL  =  3+16,  // Unordered or Less
+      FCC_LG  =  2+16,  // Less or Greater
+      FCC_NE  =  1+16,  // Not Equal
+      FCC_E   =  9+16,  // Equal
+      FCC_UE  = 10+16,  // Unordered or Equal
+      FCC_GE  = 11+16,  // Greater or Equal
+      FCC_UGE = 12+16,  // Unordered or Greater or Equal
+      FCC_LE  = 13+16,  // Less or Equal
+      FCC_ULE = 14+16,  // Unordered or Less or Equal
+      FCC_O   = 15+16,  // Ordered
+        
+      CPCC_A   =  8+32,  // Always
+      CPCC_N   =  0+32,  // Never
+      CPCC_3   =  7+32,
+      CPCC_2   =  6+32,
+      CPCC_23  =  5+32,
+      CPCC_1   =  4+32,
+      CPCC_13  =  3+32,
+      CPCC_12  =  2+32,
+      CPCC_123 =  1+32,
+      CPCC_0   =  9+32,
+      CPCC_03  = 10+32,
+      CPCC_02  = 11+32,
+      CPCC_023 = 12+32,
+      CPCC_01  = 13+32,
+      CPCC_013 = 14+32,
+      CPCC_012 = 15+32
+    };
+  }
+
+  inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+    switch (CC) {
+    case SPCC::ICC_A:   return "a";
+    case SPCC::ICC_N:   return "n";
+    case SPCC::ICC_NE:  return "ne";
+    case SPCC::ICC_E:   return "e";
+    case SPCC::ICC_G:   return "g";
+    case SPCC::ICC_LE:  return "le";
+    case SPCC::ICC_GE:  return "ge";
+    case SPCC::ICC_L:   return "l";
+    case SPCC::ICC_GU:  return "gu";
+    case SPCC::ICC_LEU: return "leu";
+    case SPCC::ICC_CC:  return "cc";
+    case SPCC::ICC_CS:  return "cs";
+    case SPCC::ICC_POS: return "pos";
+    case SPCC::ICC_NEG: return "neg";
+    case SPCC::ICC_VC:  return "vc";
+    case SPCC::ICC_VS:  return "vs";
+    case SPCC::FCC_A:   return "a";
+    case SPCC::FCC_N:   return "n";
+    case SPCC::FCC_U:   return "u";
+    case SPCC::FCC_G:   return "g";
+    case SPCC::FCC_UG:  return "ug";
+    case SPCC::FCC_L:   return "l";
+    case SPCC::FCC_UL:  return "ul";
+    case SPCC::FCC_LG:  return "lg";
+    case SPCC::FCC_NE:  return "ne";
+    case SPCC::FCC_E:   return "e";
+    case SPCC::FCC_UE:  return "ue";
+    case SPCC::FCC_GE:  return "ge";
+    case SPCC::FCC_UGE: return "uge";
+    case SPCC::FCC_LE:  return "le";
+    case SPCC::FCC_ULE: return "ule";
+    case SPCC::FCC_O:   return "o";
+    case SPCC::CPCC_A:   return "a";
+    case SPCC::CPCC_N:   return "n";
+    case SPCC::CPCC_3:   return "3";
+    case SPCC::CPCC_2:   return "2";
+    case SPCC::CPCC_23:  return "23";
+    case SPCC::CPCC_1:   return "1";
+    case SPCC::CPCC_13:  return "13";
+    case SPCC::CPCC_12:  return "12";
+    case SPCC::CPCC_123: return "123";
+    case SPCC::CPCC_0:   return "0";
+    case SPCC::CPCC_03:  return "03";
+    case SPCC::CPCC_02:  return "02";
+    case SPCC::CPCC_023: return "023";
+    case SPCC::CPCC_01:  return "01";
+    case SPCC::CPCC_013: return "013";
+    case SPCC::CPCC_012: return "012";
+    }
+    llvm_unreachable("Invalid cond code");
+  }
+
+  inline static unsigned HI22(int64_t imm) {
+    return (unsigned)((imm >> 10) & ((1 << 22)-1));
+  }
+
+  inline static unsigned LO10(int64_t imm) {
+    return (unsigned)(imm & 0x3FF);
+  }
+
+  inline static unsigned HIX22(int64_t imm) {
+    return HI22(~imm);
+  }
+
+  inline static unsigned LOX10(int64_t imm) {
+    return ~LO10(~imm);
+  }
+
+}  // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
new file mode 100644
index 000000000000..11004c5a952f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,159 @@
+//===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+
+def FeatureV9
+  : SubtargetFeature<"v9", "IsV9", "true",
+                     "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+  : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+                     "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+  : SubtargetFeature<"vis", "IsVIS", "true",
+                     "Enable UltraSPARC Visual Instruction Set extensions">;
+def FeatureVIS2
+  : SubtargetFeature<"vis2", "IsVIS2", "true",
+                     "Enable Visual Instruction Set extensions II">;
+def FeatureVIS3
+  : SubtargetFeature<"vis3", "IsVIS3", "true",
+                     "Enable Visual Instruction Set extensions III">;
+def FeatureLeon
+  : SubtargetFeature<"leon", "IsLeon", "true",
+                     "Enable LEON extensions">;
+
+def FeatureHardQuad
+  : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+                     "Enable quad-word floating point instructions">;
+
+def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
+                               "Use the popc (population count) instruction">;
+
+def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+                              "Use software emulation for floating point">;  
+
+//==== Features added predmoninantly for LEON subtarget support                               
+include "LeonFeatures.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcSchedule.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo;
+
+def SparcAsmParser : AsmParser {
+  bit ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"v7",              []>;
+def : Proc<"v8",              []>;
+def : Proc<"supersparc",      []>;
+def : Proc<"sparclite",       []>;
+def : Proc<"f934",            []>;
+def : Proc<"hypersparc",      []>;
+def : Proc<"sparclite86x",    []>;
+def : Proc<"sparclet",        []>;
+def : Proc<"tsc701",          []>;
+def : Proc<"myriad2",         [FeatureLeon, LeonCASA]>;
+def : Proc<"myriad2.1",       [FeatureLeon, LeonCASA]>;
+def : Proc<"myriad2.2",       [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2100",          [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2150",          [FeatureLeon, LeonCASA]>;
+def : Proc<"ma2450",          [FeatureLeon, LeonCASA]>;
+def : Proc<"v9",              [FeatureV9]>;
+def : Proc<"ultrasparc",      [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc3",     [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+                               FeatureVIS2]>;
+def : Proc<"niagara",         [FeatureV9, FeatureV8Deprecated, FeatureVIS,
+                               FeatureVIS2]>;
+def : Proc<"niagara2",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara3",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2]>;
+def : Proc<"niagara4",        [FeatureV9, FeatureV8Deprecated, UsePopc,
+                               FeatureVIS, FeatureVIS2, FeatureVIS3]>;
+
+// LEON 2 FT generic
+def : Processor<"leon2", LEON2Itineraries,
+                [FeatureLeon]>;
+
+// LEON 2 FT (AT697E)
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"at697e", LEON2Itineraries,
+                [FeatureLeon, ReplaceSDIV, InsertNOPLoad]>;
+
+// LEON 2 FT (AT697F)
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"at697f", LEON2Itineraries,
+                [FeatureLeon, InsertNOPLoad]>;
+
+
+// LEON 3 FT generic
+def : Processor<"leon3", LEON3Itineraries,
+                [FeatureLeon, UMACSMACSupport]>;
+
+// LEON 3 FT (UT699). Provides features for the UT699 processor
+// - covers all the erratum fixes for LEON3, but does not support the CASA instruction.
+def : Processor<"ut699", LEON3Itineraries, 
+                [FeatureLeon, InsertNOPLoad, FixFSMULD, ReplaceFMULS, FixAllFDIVSQRT]>;
+
+// LEON3 FT (GR712RC). Provides features for the GR712RC processor.
+// - covers all the erratum fixed for LEON3 and support for the CASA instruction. 
+def : Processor<"gr712rc", LEON3Itineraries,
+                [FeatureLeon, LeonCASA]>;
+
+// LEON 4 FT generic
+def : Processor<"leon4", LEON4Itineraries,
+                [FeatureLeon, UMACSMACSupport, LeonCASA]>;
+
+// LEON 4 FT (GR740) 
+// TO DO: Place-holder: Processor specific features will be added *very* soon here.
+def : Processor<"gr740", LEON4Itineraries, 
+                [FeatureLeon, UMACSMACSupport, LeonCASA]>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def SparcAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int PassSubtarget = 1;
+  int Variant = 0;
+}
+
+def Sparc : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = SparcInstrInfo;
+  let AssemblyParsers  = [SparcAsmParser];
+  let AssemblyWriters = [SparcAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
new file mode 100644
index 000000000000..31a128a5f271
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -0,0 +1,449 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "InstPrinter/SparcInstPrinter.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcInstrInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class SparcAsmPrinter : public AsmPrinter {
+    SparcTargetStreamer &getTargetStreamer() {
+      return static_cast<SparcTargetStreamer &>(
+          *OutStreamer->getTargetStreamer());
+    }
+  public:
+    explicit SparcAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)) {}
+
+    StringRef getPassName() const override { return "Sparc Assembly Printer"; }
+
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+    void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
+                         const char *Modifier = nullptr);
+
+    void EmitFunctionBodyStart() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+
+    static const char *getRegisterName(unsigned RegNo) {
+      return SparcInstPrinter::getRegisterName(RegNo);
+    }
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+
+    void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+                                   const MCSubtargetInfo &STI);
+
+  };
+} // end of anonymous namespace
+
+static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
+                                      MCSymbol *Sym, MCContext &OutContext) {
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym,
+                                                         OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym, OutContext);
+  return MCOperand::createExpr(expr);
+
+}
+static MCOperand createPCXCallOP(MCSymbol *Label,
+                                 MCContext &OutContext) {
+  return createSparcMCOperand(SparcMCExpr::VK_Sparc_None, Label, OutContext);
+}
+
+static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
+                                    MCSymbol *GOTLabel, MCSymbol *StartLabel,
+                                    MCSymbol *CurLabel,
+                                    MCContext &OutContext)
+{
+  const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
+  const MCSymbolRefExpr *Start = MCSymbolRefExpr::create(StartLabel,
+                                                         OutContext);
+  const MCSymbolRefExpr *Cur = MCSymbolRefExpr::create(CurLabel,
+                                                       OutContext);
+
+  const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
+  const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind,
+                                                Add, OutContext);
+  return MCOperand::createExpr(expr);
+}
+
+static void EmitCall(MCStreamer &OutStreamer,
+                     MCOperand &Callee,
+                     const MCSubtargetInfo &STI)
+{
+  MCInst CallInst;
+  CallInst.setOpcode(SP::CALL);
+  CallInst.addOperand(Callee);
+  OutStreamer.EmitInstruction(CallInst, STI);
+}
+
+static void EmitSETHI(MCStreamer &OutStreamer,
+                      MCOperand &Imm, MCOperand &RD,
+                      const MCSubtargetInfo &STI)
+{
+  MCInst SETHIInst;
+  SETHIInst.setOpcode(SP::SETHIi);
+  SETHIInst.addOperand(RD);
+  SETHIInst.addOperand(Imm);
+  OutStreamer.EmitInstruction(SETHIInst, STI);
+}
+
+static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
+                       MCOperand &RS1, MCOperand &Src2, MCOperand &RD,
+                       const MCSubtargetInfo &STI)
+{
+  MCInst Inst;
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(RD);
+  Inst.addOperand(RS1);
+  Inst.addOperand(Src2);
+  OutStreamer.EmitInstruction(Inst, STI);
+}
+
+static void EmitOR(MCStreamer &OutStreamer,
+                   MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+                   const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::ORri, RS1, Imm, RD, STI);
+}
+
+static void EmitADD(MCStreamer &OutStreamer,
+                    MCOperand &RS1, MCOperand &RS2, MCOperand &RD,
+                    const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::ADDrr, RS1, RS2, RD, STI);
+}
+
+static void EmitSHL(MCStreamer &OutStreamer,
+                    MCOperand &RS1, MCOperand &Imm, MCOperand &RD,
+                    const MCSubtargetInfo &STI) {
+  EmitBinary(OutStreamer, SP::SLLri, RS1, Imm, RD, STI);
+}
+
+
+static void EmitHiLo(MCStreamer &OutStreamer,  MCSymbol *GOTSym,
+                     SparcMCExpr::VariantKind HiKind,
+                     SparcMCExpr::VariantKind LoKind,
+                     MCOperand &RD,
+                     MCContext &OutContext,
+                     const MCSubtargetInfo &STI) {
+
+  MCOperand hi = createSparcMCOperand(HiKind, GOTSym, OutContext);
+  MCOperand lo = createSparcMCOperand(LoKind, GOTSym, OutContext);
+  EmitSETHI(OutStreamer, hi, RD, STI);
+  EmitOR(OutStreamer, RD, lo, RD, STI);
+}
+
+void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
+                                                const MCSubtargetInfo &STI)
+{
+  MCSymbol *GOTLabel   =
+    OutContext.getOrCreateSymbol(Twine("_GLOBAL_OFFSET_TABLE_"));
+
+  const MachineOperand &MO = MI->getOperand(0);
+  assert(MO.getReg() != SP::O7 &&
+         "%o7 is assigned as destination for getpcx!");
+
+  MCOperand MCRegOP = MCOperand::createReg(MO.getReg());
+
+
+  if (!isPositionIndependent()) {
+    // Just load the address of GOT to MCRegOP.
+    switch(TM.getCodeModel()) {
+    default:
+      llvm_unreachable("Unsupported absolute code model");
+    case CodeModel::Small:
+      EmitHiLo(*OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+               MCRegOP, OutContext, STI);
+      break;
+    case CodeModel::Medium: {
+      EmitHiLo(*OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_H44, SparcMCExpr::VK_Sparc_M44,
+               MCRegOP, OutContext, STI);
+      MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(12,
+                                                                   OutContext));
+      EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI);
+      MCOperand lo = createSparcMCOperand(SparcMCExpr::VK_Sparc_L44,
+                                          GOTLabel, OutContext);
+      EmitOR(*OutStreamer, MCRegOP, lo, MCRegOP, STI);
+      break;
+    }
+    case CodeModel::Large: {
+      EmitHiLo(*OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HH, SparcMCExpr::VK_Sparc_HM,
+               MCRegOP, OutContext, STI);
+      MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(32,
+                                                                   OutContext));
+      EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI);
+      // Use register %o7 to load the lower 32 bits.
+      MCOperand RegO7 = MCOperand::createReg(SP::O7);
+      EmitHiLo(*OutStreamer, GOTLabel,
+               SparcMCExpr::VK_Sparc_HI, SparcMCExpr::VK_Sparc_LO,
+               RegO7, OutContext, STI);
+      EmitADD(*OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+    }
+    }
+    return;
+  }
+
+  MCSymbol *StartLabel = OutContext.createTempSymbol();
+  MCSymbol *EndLabel   = OutContext.createTempSymbol();
+  MCSymbol *SethiLabel = OutContext.createTempSymbol();
+
+  MCOperand RegO7   = MCOperand::createReg(SP::O7);
+
+  // <StartLabel>:
+  //   call <EndLabel>
+  // <SethiLabel>:
+  //     sethi %hi(_GLOBAL_OFFSET_TABLE_+(<SethiLabel>-<StartLabel>)), <MO>
+  // <EndLabel>:
+  //   or  <MO>, %lo(_GLOBAL_OFFSET_TABLE_+(<EndLabel>-<StartLabel>))), <MO>
+  //   add <MO>, %o7, <MO>
+
+  OutStreamer->EmitLabel(StartLabel);
+  MCOperand Callee =  createPCXCallOP(EndLabel, OutContext);
+  EmitCall(*OutStreamer, Callee, STI);
+  OutStreamer->EmitLabel(SethiLabel);
+  MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
+                                       GOTLabel, StartLabel, SethiLabel,
+                                       OutContext);
+  EmitSETHI(*OutStreamer, hiImm, MCRegOP, STI);
+  OutStreamer->EmitLabel(EndLabel);
+  MCOperand loImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC10,
+                                       GOTLabel, StartLabel, EndLabel,
+                                       OutContext);
+  EmitOR(*OutStreamer, MCRegOP, loImm, MCRegOP, STI);
+  EmitADD(*OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
+}
+
+void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
+{
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case TargetOpcode::DBG_VALUE:
+    // FIXME: Debug Value.
+    return;
+  case SP::GETPCX:
+    LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo());
+    return;
+  }
+  MachineBasicBlock::const_instr_iterator I = MI->getIterator();
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+  do {
+    MCInst TmpInst;
+    LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this);
+    EmitToStreamer(*OutStreamer, TmpInst);
+  } while ((++I != E) && I->isInsideBundle()); // Delay slot check.
+}
+
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+  if (!MF->getSubtarget<SparcSubtarget>().is64Bit())
+    return;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 };
+  for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+    unsigned reg = globalRegs[i];
+    if (MRI.use_empty(reg))
+      continue;
+
+    if  (reg == SP::G6 || reg == SP::G7)
+      getTargetStreamer().emitSparcRegisterIgnore(reg);
+    else
+      getTargetStreamer().emitSparcRegisterScratch(reg);
+  }
+}
+
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+  const MachineOperand &MO = MI->getOperand (opNum);
+  SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
+
+#ifndef NDEBUG
+  // Verify the target flags.
+  if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
+    if (MI->getOpcode() == SP::CALL)
+      assert(TF == SparcMCExpr::VK_Sparc_None &&
+             "Cannot handle target flags on call address");
+    else if (MI->getOpcode() == SP::SETHIi || MI->getOpcode() == SP::SETHIXi)
+      assert((TF == SparcMCExpr::VK_Sparc_HI
+              || TF == SparcMCExpr::VK_Sparc_H44
+              || TF == SparcMCExpr::VK_Sparc_HH
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDO_HIX22
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_HI22
+              || TF == SparcMCExpr::VK_Sparc_TLS_LE_HIX22) &&
+             "Invalid target flags for address operand on sethi");
+    else if (MI->getOpcode() == SP::TLS_CALL)
+      assert((TF == SparcMCExpr::VK_Sparc_None
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_CALL
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_CALL) &&
+             "Cannot handle target flags on tls call address");
+    else if (MI->getOpcode() == SP::TLS_ADDrr)
+      assert((TF == SparcMCExpr::VK_Sparc_TLS_GD_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDO_ADD
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_ADD) &&
+             "Cannot handle target flags on add for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDrr)
+      assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LD &&
+             "Cannot handle target flags on ld for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDXrr)
+      assert(TF == SparcMCExpr::VK_Sparc_TLS_IE_LDX &&
+             "Cannot handle target flags on ldx for TLS");
+    else if (MI->getOpcode() == SP::XORri || MI->getOpcode() == SP::XORXri)
+      assert((TF == SparcMCExpr::VK_Sparc_TLS_LDO_LOX10
+              || TF == SparcMCExpr::VK_Sparc_TLS_LE_LOX10) &&
+             "Cannot handle target flags on xor for TLS");
+    else
+      assert((TF == SparcMCExpr::VK_Sparc_LO
+              || TF == SparcMCExpr::VK_Sparc_M44
+              || TF == SparcMCExpr::VK_Sparc_L44
+              || TF == SparcMCExpr::VK_Sparc_HM
+              || TF == SparcMCExpr::VK_Sparc_TLS_GD_LO10
+              || TF == SparcMCExpr::VK_Sparc_TLS_LDM_LO10
+              || TF == SparcMCExpr::VK_Sparc_TLS_IE_LO10 ) &&
+             "Invalid target flags for small address operand");
+  }
+#endif
+
+
+  bool CloseParen = SparcMCExpr::printVariantKind(O, TF);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int)MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    getSymbol(MO.getGlobal())->print(O, MAI);
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O <<  GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    break;
+  case MachineOperand::MO_Metadata:
+    MO.getMetadata()->printAsOperand(O, MMI->getModule());
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+  if (CloseParen) O << ")";
+}
+
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1, O);
+    return;
+  }
+
+  if (MI->getOperand(opNum+1).isReg() &&
+      MI->getOperand(opNum+1).getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MI->getOperand(opNum+1).isImm() &&
+      MI->getOperand(opNum+1).getImm() == 0)
+    return;   // don't print "+0"
+
+  O << "+";
+  printOperand(MI, opNum+1, O);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode,
+                                      raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'f':
+    case 'r':
+     break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true;  // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSparcAsmPrinter() {
+  RegisterAsmPrinter<SparcAsmPrinter> X(getTheSparcTarget());
+  RegisterAsmPrinter<SparcAsmPrinter> Y(getTheSparcV9Target());
+  RegisterAsmPrinter<SparcAsmPrinter> Z(getTheSparcelTarget());
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 000000000000..0aa29d186dc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,144 @@
+//===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SPARC v8 32-bit.
+//===----------------------------------------------------------------------===//
+
+def CC_Sparc32 : CallingConv<[
+  // Custom assign SRet to [sp+64].
+  CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
+  // i32 f32 arguments get passed in integer registers if there is space.
+  CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  // f64 arguments are split and passed through registers or through stack.
+  CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>,
+  // As are v2i32 arguments (this would be the default behavior for
+  // v2i32 if it wasn't allocated to the IntPair register-class)
+  CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>,
+
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+def RetCC_Sparc32 : CallingConv<[
+  CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>,
+  CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// SPARC v9 64-bit.
+//===----------------------------------------------------------------------===//
+//
+// The 64-bit ABI conceptually assigns all function arguments to a parameter
+// array starting at [%fp+BIAS+128] in the callee's stack frame. All arguments
+// occupy a multiple of 8 bytes in the array. Integer arguments are extended to
+// 64 bits by the caller. Floats are right-aligned in their 8-byte slot, the
+// first 4 bytes in the slot are undefined.
+//
+// The integer registers %i0 to %i5 shadow the first 48 bytes of the parameter
+// array at fixed offsets. Integer arguments are promoted to registers when
+// possible.
+//
+// The floating point registers %f0 to %f31 shadow the first 128 bytes of the
+// parameter array at fixed offsets. Float and double parameters are promoted
+// to these registers when possible.
+//
+// Structs up to 16 bytes in size are passed by value. They are right-aligned
+// in one or two 8-byte slots in the parameter array. Struct members are
+// promoted to both floating point and integer registers when possible. A
+// struct containing two floats would thus be passed in %f0 and %f1, while two
+// float function arguments would occupy 8 bytes each, and be passed in %f1 and
+// %f3.
+//
+// When a struct { int, float } is passed by value, the int goes in the high
+// bits of an integer register while the float goes in a floating point
+// register.
+//
+// The difference is encoded in LLVM IR using the inreg atttribute on function
+// arguments:
+//
+//   C:   void f(float, float);
+//   IR:  declare void f(float %f1, float %f3)
+//
+//   C:   void f(struct { float f0, f1; });
+//   IR:  declare void f(float inreg %f0, float inreg %f1)
+//
+//   C:   void f(int, float);
+//   IR:  declare void f(int signext %i0, float %f3)
+//
+//   C:   void f(struct { int i0high; float f1; });
+//   IR:  declare void f(i32 inreg %i0high, float inreg %f1)
+//
+// Two ints in a struct are simply coerced to i64:
+//
+//   C:   void f(struct { int i0high, i0low; });
+//   IR:  declare void f(i64 %i0.coerced)
+//
+// The frontend and backend divide the task of producing ABI compliant code for
+// C functions. The C frontend will:
+//
+//  - Annotate integer arguments with zeroext or signext attributes.
+//
+//  - Split structs into one or two 64-bit sized chunks, or 32-bit chunks with
+//    inreg attributes.
+//
+//  - Pass structs larger than 16 bytes indirectly with an explicit pointer
+//    argument. The byval attribute is not used.
+//
+// The backend will:
+//
+//  - Assign all arguments to 64-bit aligned stack slots, 32-bits for inreg.
+//
+//  - Promote to integer or floating point registers depending on type.
+//
+// Function return values are passed exactly like function arguments, except a
+// struct up to 32 bytes in size can be returned in registers.
+
+// Function arguments AND most return values.
+def CC_Sparc64 : CallingConv<[
+  // The frontend uses the inreg flag to indicate i32 and float arguments from
+  // structs. These arguments are not promoted to 64 bits, but they can still
+  // be assigned to integer and float registers.
+  CCIfInReg<CCIfType<[i32, f32], CCCustom<"CC_Sparc64_Half">>>,
+
+  // All integers are promoted to i64 by the caller.
+  CCIfType<[i32], CCPromoteToType<i64>>,
+
+  // Custom assignment is required because stack space is reserved for all
+  // arguments whether they are passed in registers or not.
+  CCCustom<"CC_Sparc64_Full">
+]>;
+
+def RetCC_Sparc64 : CallingConv<[
+  // A single f32 return value always goes in %f0. The ABI doesn't specify what
+  // happens to multiple f32 return values outside a struct.
+  CCIfType<[f32], CCCustom<"CC_Sparc64_Half">>,
+
+  // Otherwise, return values are passed exactly like arguments.
+  CCDelegateTo<CC_Sparc64>
+]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add (sequence "I%u", 0, 7),
+                            (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add I6, I7);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
new file mode 100644
index 000000000000..122f830e0dc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -0,0 +1,368 @@
+//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableLeafProc("disable-sparc-leaf-proc",
+                cl::init(false),
+                cl::desc("Disable Sparc leaf procedure optimization."),
+                cl::Hidden);
+
+SparcFrameLowering::SparcFrameLowering(const SparcSubtarget &ST)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+                          ST.is64Bit() ? 16 : 8, 0, ST.is64Bit() ? 16 : 8) {}
+
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          int NumBytes,
+                                          unsigned ADDrr,
+                                          unsigned ADDri) const {
+
+  DebugLoc dl;
+  const SparcInstrInfo &TII =
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  if (NumBytes >= -4096 && NumBytes < 4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+    return;
+  }
+
+  // Emit this the hard way.  This clobbers G1 which we always know is
+  // available here.
+  if (NumBytes >= 0) {
+    // Emit nonnegative numbers with sethi + or.
+    // sethi %hi(NumBytes), %g1
+    // or %g1, %lo(NumBytes), %g1
+    // add %sp, %g1, %sp
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(LO10(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+    return ;
+  }
+
+  // Emit negative numbers with sethi + xor.
+  // sethi %hix(NumBytes), %g1
+  // xor %g1, %lox(NumBytes), %g1
+  // add %sp, %g1, %sp
+  BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+    .addReg(SP::O6).addReg(SP::G1);
+}
+
+void SparcFrameLowering::emitPrologue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const SparcInstrInfo &TII =
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SparcRegisterInfo &RegInfo =
+      *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+  bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF);
+
+  // FIXME: unfortunately, returning false from canRealignStack
+  // actually just causes needsStackRealignment to return false,
+  // rather than reporting an error, as would be sensible. This is
+  // poor, but fixing that bogosity is going to be a large project.
+  // For now, just see if it's lied, and report an error here.
+  if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment())
+    report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
+                       "stack re-alignment, but LLVM couldn't handle it "
+                       "(probably because it has a dynamic alloca).");
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI.getStackSize();
+
+  unsigned SAVEri = SP::SAVEri;
+  unsigned SAVErr = SP::SAVErr;
+  if (FuncInfo->isLeafProc()) {
+    if (NumBytes == 0)
+      return;
+    SAVEri = SP::ADDri;
+    SAVErr = SP::ADDrr;
+  }
+
+  // The SPARC ABI is a bit odd in that it requires a reserved 92-byte
+  // (128 in v9) area in the user's stack, starting at %sp. Thus, the
+  // first part of the stack that can actually be used is located at
+  // %sp + 92.
+  //
+  // We therefore need to add that offset to the total stack size
+  // after all the stack objects are placed by
+  // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack needs to be
+  // aligned *after* the extra size is added, we need to disable
+  // calculateFrameObjectOffsets's built-in stack alignment, by having
+  // targetHandlesStackFrameRounding return true.
+
+
+  // Add the extra call frame stack size, if needed. (This is the same
+  // code as in PrologEpilogInserter, but also gets disabled by
+  // targetHandlesStackFrameRounding)
+  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
+    NumBytes += MFI.getMaxCallFrameSize();
+
+  // Adds the SPARC subtarget-specific spill area to the stack
+  // size. Also ensures target-required alignment.
+  NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes);
+
+  // Finally, ensure that the size is sufficiently aligned for the
+  // data on the stack.
+  if (MFI.getMaxAlignment() > 0) {
+    NumBytes = alignTo(NumBytes, MFI.getMaxAlignment());
+  }
+
+  // Update stack size with corrected value.
+  MFI.setStackSize(NumBytes);
+
+  emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri);
+
+  unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true);
+
+  // Emit ".cfi_def_cfa_register 30".
+  unsigned CFIIndex =
+      MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  // Emit ".cfi_window_save".
+  CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true);
+  unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true);
+  // Emit ".cfi_register 15, 31".
+  CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+
+  if (NeedsStackRealignment) {
+    // andn %o6, MaxAlign-1, %o6
+    int MaxAlign = MFI.getMaxAlignment();
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1);
+  }
+}
+
+MachineBasicBlock::iterator SparcFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    MachineInstr &MI = *I;
+    int Size = MI.getOperand(0).getImm();
+    if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+      Size = -Size;
+
+    if (Size)
+      emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
+  }
+  return MBB.erase(I);
+}
+
+
+void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
+                                  MachineBasicBlock &MBB) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const SparcInstrInfo &TII =
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  assert(MBBI->getOpcode() == SP::RETL &&
+         "Can only put epilog before 'retl' instruction!");
+  if (!FuncInfo->isLeafProc()) {
+    BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+      .addReg(SP::G0);
+    return;
+  }
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  int NumBytes = (int) MFI.getStackSize();
+  if (NumBytes == 0)
+    return;
+
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
+}
+
+bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // Reserve call frame if there are no variable sized objects on the stack.
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+      RegInfo->needsStackRealignment(MF) ||
+      MFI.hasVarSizedObjects() ||
+      MFI.isFrameAddressTaken();
+}
+
+
+int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                               unsigned &FrameReg) const {
+  const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  bool isFixed = MFI.isFixedObjectIndex(FI);
+
+  // Addressable stack objects are accessed using neg. offsets from
+  // %fp, or positive offsets from %sp.
+  bool UseFP;
+
+  // Sparc uses FP-based references in general, even when "hasFP" is
+  // false. That function is rather a misnomer, because %fp is
+  // actually always available, unless isLeafProc.
+  if (FuncInfo->isLeafProc()) {
+    // If there's a leaf proc, all offsets need to be %sp-based,
+    // because we haven't caused %fp to actually point to our frame.
+    UseFP = false;
+  } else if (isFixed) {
+    // Otherwise, argument access should always use %fp.
+    UseFP = true;
+  } else if (RegInfo->needsStackRealignment(MF)) {
+    // If there is dynamic stack realignment, all local object
+    // references need to be via %sp, to take account of the
+    // re-alignment.
+    UseFP = false;
+  } else {
+    // Finally, default to using %fp.
+    UseFP = true;
+  }
+
+  int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI) +
+      Subtarget.getStackPointerBias();
+
+  if (UseFP) {
+    FrameReg = RegInfo->getFrameRegister(MF);
+    return FrameOffset;
+  } else {
+    FrameReg = SP::O6; // %sp
+    return FrameOffset + MF.getFrameInfo().getStackSize();
+  }
+}
+
+static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
+{
+
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
+    if (!MRI->reg_nodbg_empty(reg))
+      return false;
+
+  for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
+    if (!MRI->reg_nodbg_empty(reg))
+      return false;
+
+  return true;
+}
+
+bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
+{
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo    &MFI = MF.getFrameInfo();
+
+  return !(MFI.hasCalls()                  // has calls
+           || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
+           || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+           || hasFP(MF));                  // need %FP
+}
+
+void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Remap %i[0-7] to %o[0-7].
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+    if (MRI.reg_nodbg_empty(reg))
+      continue;
+
+    unsigned mapped_reg = reg - SP::I0 + SP::O0;
+    assert(MRI.reg_nodbg_empty(mapped_reg));
+
+    // Replace I register with O register.
+    MRI.replaceRegWith(reg, mapped_reg);
+
+    // Also replace register pair super-registers.
+    if ((reg - SP::I0) % 2 == 0) {
+      unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1;
+      unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1;
+      MRI.replaceRegWith(preg, mapped_preg);
+    }
+  }
+
+  // Rewrite MBB's Live-ins.
+  for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+       MBB != E; ++MBB) {
+    for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) {
+      if (!MBB->isLiveIn(reg))
+        continue;
+      MBB->removeLiveIn(reg);
+      MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1);
+    }
+    for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+      if (!MBB->isLiveIn(reg))
+        continue;
+      MBB->removeLiveIn(reg);
+      MBB->addLiveIn(reg - SP::I0 + SP::O0);
+    }
+  }
+
+  assert(verifyLeafProcRegUse(&MRI));
+#ifdef EXPENSIVE_CHECKS
+  MF.verify(0, "After LeafProc Remapping");
+#endif
+}
+
+void SparcFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                              BitVector &SavedRegs,
+                                              RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+  if (!DisableLeafProc && isLeafProc(MF)) {
+    SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
+    MFI->setLeafProc(true);
+
+    remapRegsForLeafProc(MF);
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
new file mode 100644
index 000000000000..ac0e69ccde1e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -0,0 +1,68 @@
+//===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class SparcSubtarget;
+class SparcFrameLowering : public TargetFrameLowering {
+public:
+  explicit SparcFrameLowering(const SparcSubtarget &ST);
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const override { return true; }
+
+private:
+  // Remap input registers to output registers for leaf procedure.
+  void remapRegsForLeafProc(MachineFunction &MF) const;
+
+  // Returns true if MF is a leaf procedure.
+  bool isLeafProc(MachineFunction &MF) const;
+
+
+  // Emits code for adjusting SP in function prologue/epilogue.
+  void emitSPAdjustment(MachineFunction &MF,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        int NumBytes, unsigned ADDrr, unsigned ADDri) const;
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 000000000000..c36e75d1b076
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,405 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const SparcSubtarget *Subtarget;
+public:
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm) : SelectionDAGISel(tm) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<SparcSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  void Select(SDNode *N) override;
+
+  // Complex Pattern Selectors.
+  bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  StringRef getPassName() const override {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+
+private:
+  SDNode* getGlobalBaseReg();
+  bool tryInlineAsm(SDNode *N);
+};
+}  // end anonymous namespace
+
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg,
+                             TLI->getPointerTy(CurDAG->getDataLayout()))
+      .getNode();
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(
+        FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
+    Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (isInt<13>(CN->getSExtValue())) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          // Constant offset from frame ref.
+          Base = CurDAG->getTargetFrameIndex(
+              FIN->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
+        } else {
+          Base = Addr.getOperand(0);
+        }
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr),
+                                           MVT::i32);
+        return true;
+      }
+    }
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(1);
+      Offset = Addr.getOperand(0).getOperand(0);
+      return true;
+    }
+    if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1).getOperand(0);
+      return true;
+    }
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+  return true;
+}
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<13>(CN->getSExtValue()))
+        return false;  // Let the reg+imm pattern catch this!
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo ||
+        Addr.getOperand(1).getOpcode() == SPISD::Lo)
+      return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy(CurDAG->getDataLayout()));
+  return true;
+}
+
+
+// Re-assemble i64 arguments split up in SelectionDAGBuilder's
+// visitInlineAsm / GetRegistersForValue functions.
+//
+// Note: This function was copied from, and is essentially identical
+// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that
+// such hacking-up is necessary; a rethink of how inline asm operands
+// are handled may be in order to make doing this more sane.
+//
+// TODO: fix inline asm support so I can simply tell it that 'i64'
+// inputs to asm need to be allocated to the IntPair register type,
+// and have that work. Then, delete this function.
+bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  // Normally, i64 data is bounded to two arbitrary GPRs for "%r"
+  // constraint.  However, some instructions (e.g. ldd/std) require
+  // (even/even+1) GPRs.
+
+  // So, here, we check for this case, and mutate the inlineasm to use
+  // a single IntPair register instead, which guarantees such even/odd
+  // placement.
+
+  SDLoc dl(N);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+                                   : SDValue(nullptr,0);
+
+  SmallVector<bool, 8> OpChanged;
+  // Glue node will be appended late.
+  for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    }
+    else
+      continue;
+
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
+    if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+        && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+      continue;
+
+    unsigned RC;
+    bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+    if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID))
+        || NumRegs != 2)
+      continue;
+
+    assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
+    SDValue V0 = N->getOperand(i+1);
+    SDValue V1 = N->getOperand(i+2);
+    unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+    unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+    SDValue PairedReg;
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    if (Kind == InlineAsm::Kind_RegDef ||
+        Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+      // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+      // the original GPRs.
+
+      unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+      SDValue Chain = SDValue(N,0);
+
+      SDNode *GU = N->getGluedUser();
+      SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32,
+                                               Chain.getValue(1));
+
+      // Extract values from a GPRPair reg and copy to the original GPR reg.
+      SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+                                        RegCopy.getValue(1));
+      SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+      // Update the original glue user.
+      std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+      Ops.push_back(T1.getValue(1));
+      CurDAG->UpdateNodeOperands(GU, Ops);
+    }
+    else {
+      // For Kind  == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+      // GPRPair and then pass the GPRPair to the inline asm.
+      SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+      // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+      SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+                                          Chain.getValue(1));
+      SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+                                          T0.getValue(1));
+      SDValue Pair = SDValue(
+          CurDAG->getMachineNode(
+              TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32,
+              {
+                  CurDAG->getTargetConstant(SP::IntPairRegClassID, dl,
+                                            MVT::i32),
+                  T0,
+                  CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32),
+                  T1,
+                  CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32),
+              }),
+          0);
+
+      // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+      // i32 VRs of inline asm with it.
+      unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32);
+      Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+      AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+      Glue = Chain.getValue(1);
+    }
+
+    Changed = true;
+
+    if(PairedReg.getNode()) {
+      OpChanged[OpChanged.size() -1 ] = true;
+      Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+      if (IsTiedToChangedOp)
+        Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+      else
+        Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID);
+      // Replace the current flag.
+      AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+          Flag, dl, MVT::i32);
+      // Add the new register node and skip the original two GPRs.
+      AsmNodeOperands.push_back(PairedReg);
+      // Skip the next two GPRs.
+      i += 2;
+    }
+  }
+
+  if (Glue.getNode())
+    AsmNodeOperands.push_back(Glue);
+  if (!Changed)
+    return false;
+
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
+      CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+  New->setNodeId(-1);
+  ReplaceNode(N, New.getNode());
+  return true;
+}
+
+void SparcDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INLINEASM: {
+    if (tryInlineAsm(N))
+      return;
+    break;
+  }
+  case SPISD::GLOBAL_BASE_REG:
+    ReplaceNode(N, getGlobalBaseReg());
+    return;
+
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // sdivx / udivx handle 64-bit divides.
+    if (N->getValueType(0) == MVT::i64)
+      break;
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDValue DivLHS = N->getOperand(0);
+    SDValue DivRHS = N->getOperand(1);
+
+    // Set the Y register to the high-part.
+    SDValue TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, dl, MVT::i32)),
+                        0);
+    } else {
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    TopPart = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SP::Y, TopPart,
+                                   SDValue())
+                  .getValue(1);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    // SDIV is a hardware erratum on some LEON2 processors. Replace it with SDIVcc here.
+    if (((SparcTargetMachine&)TM).getSubtargetImpl()->performSDIVReplace()
+        &&
+        Opcode == SP::SDIVrr) {
+      Opcode = SP::SDIVCCrr;
+    }
+    CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, TopPart);
+    return;
+  }
+  }
+
+  SelectCode(N);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                unsigned ConstraintID,
+                                                std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintID) {
+  default: return true;
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_o:
+  case InlineAsm::Constraint_m: // memory
+   if (!SelectADDRrr(Op, Op0, Op1))
+     SelectADDRri(Op, Op0, Op1);
+   break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+/// createSparcISelDag - This pass converts a legalized DAG into a
+/// SPARC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+  return new SparcDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 000000000000..2ac9aae2471b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,3574 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
+#include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
+                                 MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  assert (ArgFlags.isSRet());
+
+  // Assign SRet argument.
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                         0,
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
+                                     MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                     ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const MCPhysReg RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  // Try to get first reg.
+  if (unsigned Reg = State.AllocateReg(RegList)) {
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  } else {
+    // Assign whole thing in stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8,4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get second reg.
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4,4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const MCPhysReg RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+
+  // Try to get first reg.
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    return false;
+
+  // Try to get second reg.
+  if (unsigned Reg = State.AllocateReg(RegList))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    return false;
+
+  return true;
+}
+
+// Allocate a full-sized argument for the 64-bit ABI.
+static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert((LocVT == MVT::f32 || LocVT == MVT::f128
+          || LocVT.getSizeInBits() == 64) &&
+         "Can't handle non-64 bits locations");
+
+  // Stack space is allocated for all arguments starting from [%fp+BIAS+128].
+  unsigned size      = (LocVT == MVT::f128) ? 16 : 8;
+  unsigned alignment = (LocVT == MVT::f128) ? 16 : 8;
+  unsigned Offset = State.AllocateStack(size, alignment);
+  unsigned Reg = 0;
+
+  if (LocVT == MVT::i64 && Offset < 6*8)
+    // Promote integers to %i0-%i5.
+    Reg = SP::I0 + Offset/8;
+  else if (LocVT == MVT::f64 && Offset < 16*8)
+    // Promote doubles to %d0-%d30. (Which LLVM calls D0-D15).
+    Reg = SP::D0 + Offset/8;
+  else if (LocVT == MVT::f32 && Offset < 16*8)
+    // Promote floats to %f1, %f3, ...
+    Reg = SP::F1 + Offset/4;
+  else if (LocVT == MVT::f128 && Offset < 16*8)
+    // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
+    Reg = SP::Q0 + Offset/16;
+
+  // Promote to register when possible, otherwise use the stack slot.
+  if (Reg) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // This argument goes on the stack in an 8-byte slot.
+  // When passing floats, LocVT is smaller than 8 bytes. Adjust the offset to
+  // the right-aligned float. The first 4 bytes of the stack slot are undefined.
+  if (LocVT == MVT::f32)
+    Offset += 4;
+
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
+// Allocate a half-sized argument for the 64-bit ABI.
+//
+// This is used when passing { float, int } structs by value in registers.
+static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
+                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
+  unsigned Offset = State.AllocateStack(4, 4);
+
+  if (LocVT == MVT::f32 && Offset < 16*8) {
+    // Promote floats to %f0-%f31.
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, SP::F0 + Offset/4,
+                                     LocVT, LocInfo));
+    return true;
+  }
+
+  if (LocVT == MVT::i32 && Offset < 6*8) {
+    // Promote integers to %i0-%i5, using half the register.
+    unsigned Reg = SP::I0 + Offset/8;
+    LocVT = MVT::i64;
+    LocInfo = CCValAssign::AExt;
+
+    // Set the Custom bit if this i32 goes in the high bits of a register.
+    if (Offset % 8 == 0)
+      State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg,
+                                             LocVT, LocInfo));
+    else
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
+#include "SparcGenCallingConv.inc"
+
+// The calling conventions in SparcCallingConv.td are described in terms of the
+// callee's register window. This function translates registers to the
+// corresponding caller window %o register.
+static unsigned toCallerWindow(unsigned Reg) {
+  static_assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7,
+                "Unexpected enum");
+  if (Reg >= SP::I0 && Reg <= SP::I7)
+    return Reg - SP::I0 + SP::O0;
+  return Reg;
+}
+
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool IsVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SDLoc &DL, SelectionDAG &DAG) const {
+  if (Subtarget->is64Bit())
+    return LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
+  return LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
+}
+
+SDValue
+SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+                                    bool IsVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  // Make room for the return address offset.
+  RetOps.push_back(SDValue());
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    if (VA.needsCustom()) {
+      assert(VA.getLocVT() == MVT::v2i32);
+      // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
+      // happen by default if this wasn't a legal type)
+
+      SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                  Arg,
+                                  DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+      SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                  Arg,
+                                  DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));
+
+      Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
+                               Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
+  // If the function returns a struct, copy the SRetReturnReg to I0
+  if (MF.getFunction()->hasStructRetAttr()) {
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    auto PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT);
+    Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(SP::I0, PtrVT));
+    RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+  RetOps[1] = DAG.getConstant(RetAddrOffset, DL, MVT::i32);
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+// Lower return values for the 64-bit ABI.
+// Return values are passed the exactly the same way as function arguments.
+SDValue
+SparcTargetLowering::LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+                                    bool IsVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SDLoc &DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // The second operand on the return instruction is the return address offset.
+  // The return address is always %i7+8 with the 64-bit ABI.
+  RetOps.push_back(DAG.getConstant(8, DL, MVT::i32));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue OutVal = OutVals[i];
+
+    // Integer return values must be sign or zero extended by the callee.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    case CCValAssign::ZExt:
+      OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    case CCValAssign::AExt:
+      OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
+    }
+
+    // The custom bit on an i32 return value indicates that it should be passed
+    // in the high bits of the register.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+      OutVal = DAG.getNode(ISD::SHL, DL, MVT::i64, OutVal,
+                           DAG.getConstant(32, DL, MVT::i32));
+
+      // The next value may go in the low bits of the same register.
+      // Handle both at once.
+      if (i+1 < RVLocs.size() && RVLocs[i+1].getLocReg() == VA.getLocReg()) {
+        SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, OutVals[i+1]);
+        OutVal = DAG.getNode(ISD::OR, DL, MVT::i64, OutVal, NV);
+        // Skip the next value, it's already done.
+        ++i;
+      }
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue SparcTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
+                                   DL, DAG, InVals);
+  return LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins,
+                                 DL, DAG, InVals);
+}
+
+/// LowerFormalArguments32 - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values.  TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue SparcTargetLowering::LowerFormalArguments_32(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
+  const unsigned StackOffset = 92;
+  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+  unsigned InIdx = 0;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (Ins[InIdx].Flags.isSRet()) {
+      if (InIdx != 0)
+        report_fatal_error("sparc only supports sret on the first parameter");
+      // Get SRet from [%fp+64].
+      int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, 64, true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+      SDValue Arg =
+          DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    if (VA.isRegLoc()) {
+      if (VA.needsCustom()) {
+        assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
+
+        unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+        MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
+        SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
+
+        assert(i+1 < e);
+        CCValAssign &NextVA = ArgLocs[++i];
+
+        SDValue LoVal;
+        if (NextVA.isMemLoc()) {
+          int FrameIdx = MF.getFrameInfo().
+            CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
+          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+        } else {
+          unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
+                                        &SP::IntRegsRegClass);
+          LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
+        }
+
+        if (IsLittleEndian)
+          std::swap(LoVal, HiVal);
+
+        SDValue WholeValue =
+          DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+        WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
+        InVals.push_back(WholeValue);
+        continue;
+      }
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
+      SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      if (VA.getLocVT() == MVT::f32)
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
+      else if (VA.getLocVT() != MVT::i32) {
+        Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
+                          DAG.getValueType(VA.getLocVT()));
+        Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
+      }
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    unsigned Offset = VA.getLocMemOffset()+StackOffset;
+    auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
+      // If it is double-word aligned, just load.
+      if (Offset % 8 == 0) {
+        int FI = MF.getFrameInfo().CreateFixedObject(8,
+                                                     Offset,
+                                                     true);
+        SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+        SDValue Load =
+            DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
+        InVals.push_back(Load);
+        continue;
+      }
+
+      int FI = MF.getFrameInfo().CreateFixedObject(4,
+                                                   Offset,
+                                                   true);
+      SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+      SDValue HiVal =
+          DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
+      int FI2 = MF.getFrameInfo().CreateFixedObject(4,
+                                                    Offset+4,
+                                                    true);
+      SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);
+
+      SDValue LoVal =
+          DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo());
+
+      if (IsLittleEndian)
+        std::swap(LoVal, HiVal);
+
+      SDValue WholeValue =
+        DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+      WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
+      InVals.push_back(WholeValue);
+      continue;
+    }
+
+    int FI = MF.getFrameInfo().CreateFixedObject(4,
+                                                 Offset,
+                                                 true);
+    SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
+    SDValue Load ;
+    if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
+      Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
+    } else if (VA.getValVT() == MVT::f128) {
+      report_fatal_error("SPARCv8 does not handle f128 in calls; "
+                         "pass indirectly");
+    } else {
+      // We shouldn't see any other value types here.
+      llvm_unreachable("Unexpected ValVT encountered in frame lowering.");
+    }
+    InVals.push_back(Load);
+  }
+
+  if (MF.getFunction()->hasStructRetAttr()) {
+    // Copy the SRet Argument to SRetReturnReg.
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
+      SFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  // Store remaining ArgRegs to the stack if this is a varargs function.
+  if (isVarArg) {
+    static const MCPhysReg ArgRegs[] = {
+      SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+    };
+    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs);
+    const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
+    unsigned ArgOffset = CCInfo.getNextStackOffset();
+    if (NumAllocated == 6)
+      ArgOffset += StackOffset;
+    else {
+      assert(!ArgOffset);
+      ArgOffset = 68+4*NumAllocated;
+    }
+
+    // Remember the vararg offset for the va_start implementation.
+    FuncInfo->setVarArgsFrameOffset(ArgOffset);
+
+    std::vector<SDValue> OutChains;
+
+    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+      SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+      int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, ArgOffset,
+                                                         true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+      OutChains.push_back(
+          DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, MachinePointerInfo()));
+      ArgOffset += 4;
+    }
+
+    if (!OutChains.empty()) {
+      OutChains.push_back(Chain);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+    }
+  }
+
+  return Chain;
+}
+
+// Lower formal arguments for the 64 bit ABI.
+SDValue SparcTargetLowering::LowerFormalArguments_64(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Analyze arguments according to CC_Sparc64.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
+
+  // The argument array begins at %fp+BIAS+128, after the register save area.
+  const unsigned ArgArea = 128;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // This argument is passed in a register.
+      // All integer register arguments are promoted by the caller to i64.
+
+      // Create a virtual register for the promoted live-in value.
+      unsigned VReg = MF.addLiveIn(VA.getLocReg(),
+                                   getRegClassFor(VA.getLocVT()));
+      SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
+
+      // Get the high bits for i32 struct elements.
+      if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+        Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
+                          DAG.getConstant(32, DL, MVT::i32));
+
+      // The caller promoted the argument, so insert an Assert?ext SDNode so we
+      // won't promote the value again in this function.
+      switch (VA.getLocInfo()) {
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      default:
+        break;
+      }
+
+      // Truncate the register down to the argument type.
+      if (VA.isExtInLoc())
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    // The registers are exhausted. This argument was passed on the stack.
+    assert(VA.isMemLoc());
+    // The CC_Sparc64_Full/Half functions compute stack offsets relative to the
+    // beginning of the arguments area at %fp+BIAS+128.
+    unsigned Offset = VA.getLocMemOffset() + ArgArea;
+    unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
+    // Adjust offset for extended arguments, SPARC is big-endian.
+    // The caller will have written the full slot with extended bytes, but we
+    // prefer our own extending loads.
+    if (VA.isExtInLoc())
+      Offset += 8 - ValSize;
+    int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
+    InVals.push_back(
+        DAG.getLoad(VA.getValVT(), DL, Chain,
+                    DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
+                    MachinePointerInfo::getFixedStack(MF, FI)));
+  }
+
+  if (!IsVarArg)
+    return Chain;
+
+  // This function takes variable arguments, some of which may have been passed
+  // in registers %i0-%i5. Variable floating point arguments are never passed
+  // in floating point registers. They go on %i0-%i5 or on the stack like
+  // integer arguments.
+  //
+  // The va_start intrinsic needs to know the offset to the first variable
+  // argument.
+  unsigned ArgOffset = CCInfo.getNextStackOffset();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  // Skip the 128 bytes of register save area.
+  FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgArea +
+                                  Subtarget->getStackPointerBias());
+
+  // Save the variable arguments that were passed in registers.
+  // The caller is required to reserve stack space for 6 arguments regardless
+  // of how many arguments were actually passed.
+  SmallVector<SDValue, 8> OutChains;
+  for (; ArgOffset < 6*8; ArgOffset += 8) {
+    unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
+    SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+    int FI = MF.getFrameInfo().CreateFixedObject(8, ArgOffset + ArgArea, true);
+    auto PtrVT = getPointerTy(MF.getDataLayout());
+    OutChains.push_back(
+        DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
+                     MachinePointerInfo::getFixedStack(MF, FI)));
+  }
+
+  if (!OutChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
+
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerCall_64(CLI, InVals);
+  return LowerCall_32(CLI, InVals);
+}
+
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+                                     ImmutableCallSite *CS) {
+  if (CS)
+    return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+  const Function *CalleeFn = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+  }
+
+  if (!CalleeFn)
+    return false;
+  return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
+// Lower a call for the 32-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // Sparc target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+  // Keep stack frames 8-byte aligned.
+  ArgsSize = (ArgsSize+7) & ~7;
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Create local copies for byval args.
+  SmallVector<SDValue, 8> ByValArgs;
+  for (unsigned i = 0,  e = Outs.size(); i != e; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    SDValue Arg = OutVals[i];
+    unsigned Size = Flags.getByValSize();
+    unsigned Align = Flags.getByValAlign();
+
+    if (Size > 0U) {
+      int FI = MFI.CreateStackObject(Size, Align, false);
+      SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
+
+      Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+                            false,        // isVolatile,
+                            (Size <= 32), // AlwaysInline if size <= 32,
+                            false,        // isTailCall
+                            MachinePointerInfo(), MachinePointerInfo());
+      ByValArgs.push_back(FIPtr);
+    }
+    else {
+      SDValue nullVal;
+      ByValArgs.push_back(nullVal);
+    }
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
+                               dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  const unsigned StackOffset = 92;
+  bool hasStructRetAttr = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    // Use local copy if it is a byval arg.
+    if (Flags.isByVal()) {
+      Arg = ByValArgs[byvalArgIdx++];
+      if (!Arg) {
+        continue;
+      }
+    }
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (Flags.isSRet()) {
+      assert(VA.needsCustom());
+      // store SRet argument in %sp+64
+      SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+      MemOpChains.push_back(
+          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+      hasStructRetAttr = true;
+      continue;
+    }
+
+    if (VA.needsCustom()) {
+      assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);
+
+      if (VA.isMemLoc()) {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        // if it is double-word aligned, just store.
+        if (Offset % 8 == 0) {
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(
+              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+          continue;
+        }
+      }
+
+      if (VA.getLocVT() == MVT::f64) {
+        // Move from the float value from float registers into the
+        // integer registers.
+
+        // TODO: The f64 -> v2i32 conversion is super-inefficient for
+        // constants: it sticks them in the constant pool, then loads
+        // to a fp register, then stores to temp memory, then loads to
+        // integer registers.
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
+      }
+
+      SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  Arg,
+                                  DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
+      SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  Arg,
+                                  DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));
+
+      if (VA.isRegLoc()) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
+        assert(i+1 != e);
+        CCValAssign &NextVA = ArgLocs[++i];
+        if (NextVA.isRegLoc()) {
+          RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
+        } else {
+          // Store the second part in stack.
+          unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(
+              DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
+        }
+      } else {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        // Store the first part.
+        SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(
+            DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo()));
+        // Store the second part.
+        PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(
+            DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      if (VA.getLocVT() != MVT::f32) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+        continue;
+      }
+      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + StackOffset,
+                                           dl);
+    PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+    MemOpChains.push_back(
+        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
+  }
+
+
+  // Emit all stores, make sure the occur before any copies into physregs.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    unsigned Reg = toCallerWindow(RegsToPass[i].first);
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);
+
+  // Returns a chain & a flag for retval copy to use
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  if (hasStructRetAttr)
+    Ops.push_back(DAG.getTargetConstant(SRetArgSize, dl, MVT::i32));
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *Mask =
+      ((hasReturnsTwice)
+           ? TRI->getRTCallPreservedMask(CallConv)
+           : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    if (RVLocs[i].getLocVT() == MVT::v2i32) {
+      SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2i32);
+      SDValue Lo = DAG.getCopyFromReg(
+          Chain, dl, toCallerWindow(RVLocs[i++].getLocReg()), MVT::i32, InFlag);
+      Chain = Lo.getValue(1);
+      InFlag = Lo.getValue(2);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Lo,
+                        DAG.getConstant(0, dl, MVT::i32));
+      SDValue Hi = DAG.getCopyFromReg(
+          Chain, dl, toCallerWindow(RVLocs[i].getLocReg()), MVT::i32, InFlag);
+      Chain = Hi.getValue(1);
+      InFlag = Hi.getValue(2);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Hi,
+                        DAG.getConstant(1, dl, MVT::i32));
+      InVals.push_back(Vec);
+    } else {
+      Chain =
+          DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
+                             RVLocs[i].getValVT(), InFlag)
+              .getValue(1);
+      InFlag = Chain.getValue(2);
+      InVals.push_back(Chain.getValue(0));
+    }
+  }
+
+  return Chain;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                               SelectionDAG &DAG) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+    .Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3)
+    .Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7)
+    .Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3)
+    .Case("o4", SP::O4).Case("o5", SP::O5).Case("o6", SP::O6).Case("o7", SP::O7)
+    .Case("l0", SP::L0).Case("l1", SP::L1).Case("l2", SP::L2).Case("l3", SP::L3)
+    .Case("l4", SP::L4).Case("l5", SP::L5).Case("l6", SP::L6).Case("l7", SP::L7)
+    .Case("g0", SP::G0).Case("g1", SP::G1).Case("g2", SP::G2).Case("g3", SP::G3)
+    .Case("g4", SP::G4).Case("g5", SP::G5).Case("g6", SP::G6).Case("g7", SP::G7)
+    .Default(0);
+
+  if (Reg)
+    return Reg;
+
+  report_fatal_error("Invalid register name global variable");
+}
+
+// This functions returns true if CalleeName is a ABI function that returns
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+  static const char *const ABICalls[] =
+    {  "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+       "_Q_sqrt", "_Q_neg",
+       "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+       "_Q_lltoq", "_Q_ulltoq",
+       nullptr
+    };
+  for (const char * const *I = ABICalls; *I != nullptr; ++I)
+    if (strcmp(CalleeName, *I) == 0)
+      return true;
+  return false;
+}
+
+unsigned
+SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
+{
+  const Function *CalleeFn = nullptr;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+    if (!CalleeFn && isFP128ABICall(CalleeName))
+      return 16; // Return sizeof(fp128)
+  }
+
+  if (!CalleeFn)
+    return 0;
+
+  // It would be nice to check for the sret attribute on CalleeFn here,
+  // but since it is not part of the function type, any check will misfire.
+
+  PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
+  Type *ElementTy = Ty->getElementType();
+  return DAG.getDataLayout().getTypeAllocSize(ElementTy);
+}
+
+
+// Fixup floating point arguments in the ... part of a varargs call.
+//
+// The SPARC v9 ABI requires that floating point arguments are treated the same
+// as integers when calling a varargs function. This does not apply to the
+// fixed arguments that are part of the function's prototype.
+//
+// This function post-processes a CCValAssign array created by
+// AnalyzeCallOperands().
+static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
+                                   ArrayRef<ISD::OutputArg> Outs) {
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = ArgLocs[i];
+    MVT ValTy = VA.getLocVT();
+    // FIXME: What about f32 arguments? C promotes them to f64 when calling
+    // varargs functions.
+    if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
+      continue;
+    // The fixed arguments to a varargs function still go in FP registers.
+    if (Outs[VA.getValNo()].IsFixed)
+      continue;
+
+    // This floating point argument should be reassigned.
+    CCValAssign NewVA;
+
+    // Determine the offset into the argument array.
+    unsigned firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
+    unsigned argSize  = (ValTy == MVT::f64) ? 8 : 16;
+    unsigned Offset = argSize * (VA.getLocReg() - firstReg);
+    assert(Offset < 16*8 && "Offset out of range, bad register enum?");
+
+    if (Offset < 6*8) {
+      // This argument should go in %i0-%i5.
+      unsigned IReg = SP::I0 + Offset/8;
+      if (ValTy == MVT::f64)
+        // Full register, just bitconvert into i64.
+        NewVA = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
+                                    IReg, MVT::i64, CCValAssign::BCvt);
+      else {
+        assert(ValTy == MVT::f128 && "Unexpected type!");
+        // Full register, just bitconvert into i128 -- We will lower this into
+        // two i64s in LowerCall_64.
+        NewVA = CCValAssign::getCustomReg(VA.getValNo(), VA.getValVT(),
+                                          IReg, MVT::i128, CCValAssign::BCvt);
+      }
+    } else {
+      // This needs to go to memory, we're out of integer registers.
+      NewVA = CCValAssign::getMem(VA.getValNo(), VA.getValVT(),
+                                  Offset, VA.getLocVT(), VA.getLocInfo());
+    }
+    ArgLocs[i] = NewVA;
+  }
+}
+
+// Lower a call for the 64-bit ABI.
+SDValue
+SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                                  SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc DL = CLI.DL;
+  SDValue Chain = CLI.Chain;
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Sparc target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  // The stack offset computed by CC_Sparc64 includes all arguments.
+  // Called functions expect 6 argument words to exist in the stack frame, used
+  // or not.
+  unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());
+
+  // Keep stack frames 16-byte aligned.
+  ArgsSize = alignTo(ArgsSize, 16);
+
+  // Varargs calls require special treatment.
+  if (CLI.IsVarArg)
+    fixupVariableFloatArgs(ArgLocs, CLI.Outs);
+
+  // Adjust the stack pointer to make room for the arguments.
+  // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
+  // with more than 6 arguments.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
+                               DL);
+
+  // Collect the set of registers to pass to the function and their values.
+  // This will be emitted as a sequence of CopyToReg nodes glued to the call
+  // instruction.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+  // Collect chains from all the memory opeations that copy arguments to the
+  // stack. They must follow the stack pointer adjustment above and precede the
+  // call instruction itself.
+  SmallVector<SDValue, 8> MemOpChains;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = CLI.OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown location info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      // fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
+      // SPARC does not support i128 natively. Lower it into two i64, see below.
+      if (!VA.needsCustom() || VA.getValVT() != MVT::f128
+          || VA.getLocVT() != MVT::i128)
+        Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      if (VA.needsCustom() && VA.getValVT() == MVT::f128
+          && VA.getLocVT() == MVT::i128) {
+        // Store and reload into the integer register reg and reg+1.
+        unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
+        unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
+        SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
+        SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL);
+        HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff);
+        SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL);
+        LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);
+
+        // Store to %sp+BIAS+128+Offset
+        SDValue Store =
+            DAG.getStore(Chain, DL, Arg, HiPtrOff, MachinePointerInfo());
+        // Load into Reg and Reg+1
+        SDValue Hi64 =
+            DAG.getLoad(MVT::i64, DL, Store, HiPtrOff, MachinePointerInfo());
+        SDValue Lo64 =
+            DAG.getLoad(MVT::i64, DL, Store, LoPtrOff, MachinePointerInfo());
+        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
+                                            Hi64));
+        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
+                                            Lo64));
+        continue;
+      }
+
+      // The custom bit on an i32 return value indicates that it should be
+      // passed in the high bits of the register.
+      if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
+        Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
+                          DAG.getConstant(32, DL, MVT::i32));
+
+        // The next value may go in the low bits of the same register.
+        // Handle both at once.
+        if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
+            ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
+          SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
+                                   CLI.OutVals[i+1]);
+          Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
+          // Skip the next value, it's already done.
+          ++i;
+        }
+      }
+      RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
+    // The argument area starts at %fp+BIAS+128 in the callee frame,
+    // %sp+BIAS+128 in ours.
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
+                                           Subtarget->getStackPointerBias() +
+                                           128, DL);
+    PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+    MemOpChains.push_back(
+        DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+  }
+
+  // Emit all stores, make sure they occur before the call.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Build a sequence of CopyToReg nodes glued together with token chain and
+  // glue operands which copy the outgoing args into registers. The InGlue is
+  // necessary since all emitted instructions must be stuck together in order
+  // to pass the live physical registers.
+  SDValue InGlue;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL,
+                             RegsToPass[i].first, RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  SDValue Callee = CLI.Callee;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+  unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF);
+
+  // Build the operands for the call instruction itself.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const uint32_t *Mask =
+      ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
+                         : TRI->getCallPreservedMask(DAG.getMachineFunction(),
+                                                     CLI.CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Make sure the CopyToReg nodes are glued to the call instruction which
+  // consumes the registers.
+  if (InGlue.getNode())
+    Ops.push_back(InGlue);
+
+  // Now the call itself.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
+  InGlue = Chain.getValue(1);
+
+  // Revert the stack pointer immediately after the call.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
+                             DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
+  InGlue = Chain.getValue(1);
+
+  // Now extract the return values. This is more or less the same as
+  // LowerFormalArguments_64.
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Set inreg flag manually for codegen generated library calls that
+  // return float.
+  if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == nullptr)
+    CLI.Ins[0].Flags.setInReg();
+
+  RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    unsigned Reg = toCallerWindow(VA.getLocReg());
+
+    // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
+    // reside in the same register in the high and low bits. Reuse the
+    // CopyFromReg previous node to avoid duplicate copies.
+    SDValue RV;
+    if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
+      if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
+        RV = Chain.getValue(0);
+
+    // But usually we'll create a new CopyFromReg for a different register.
+    if (!RV.getNode()) {
+      RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
+      Chain = RV.getValue(1);
+      InGlue = Chain.getValue(2);
+    }
+
+    // Get the high bits for i32 struct elements.
+    if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+      RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
+                       DAG.getConstant(32, DL, MVT::i32));
+
+    // The callee promoted the return value, so insert an Assert?ext SDNode so
+    // we won't promote the value again in this function.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::SExt:
+      RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    case CCValAssign::ZExt:
+      RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
+                       DAG.getValueType(VA.getValVT()));
+      break;
+    default:
+      break;
+    }
+
+    // Truncate the register down to the return value type.
+    if (VA.isExtInLoc())
+      RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
+
+    InVals.push_back(RV);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+TargetLowering::AtomicExpansionKind SparcTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  if (AI->getOperation() == AtomicRMWInst::Xchg &&
+      AI->getType()->getPrimitiveSizeInBits() == 32)
+    return AtomicExpansionKind::None; // Uses xchg instruction
+
+  return AtomicExpansionKind::CmpXChg;
+}
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ISD::SETEQ:  return SPCC::ICC_E;
+  case ISD::SETNE:  return SPCC::ICC_NE;
+  case ISD::SETLT:  return SPCC::ICC_L;
+  case ISD::SETGT:  return SPCC::ICC_G;
+  case ISD::SETLE:  return SPCC::ICC_LE;
+  case ISD::SETGE:  return SPCC::ICC_GE;
+  case ISD::SETULT: return SPCC::ICC_CS;
+  case ISD::SETULE: return SPCC::ICC_LEU;
+  case ISD::SETUGT: return SPCC::ICC_GU;
+  case ISD::SETUGE: return SPCC::ICC_CC;
+  }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floatingp oint condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return SPCC::FCC_E;
+  case ISD::SETNE:
+  case ISD::SETUNE: return SPCC::FCC_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return SPCC::FCC_L;
+  case ISD::SETGT:
+  case ISD::SETOGT: return SPCC::FCC_G;
+  case ISD::SETLE:
+  case ISD::SETOLE: return SPCC::FCC_LE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return SPCC::FCC_GE;
+  case ISD::SETULT: return SPCC::FCC_UL;
+  case ISD::SETULE: return SPCC::FCC_ULE;
+  case ISD::SETUGT: return SPCC::FCC_UG;
+  case ISD::SETUGE: return SPCC::FCC_UGE;
+  case ISD::SETUO:  return SPCC::FCC_U;
+  case ISD::SETO:   return SPCC::FCC_O;
+  case ISD::SETONE: return SPCC::FCC_LG;
+  case ISD::SETUEQ: return SPCC::FCC_UE;
+  }
+}
+
+SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
+                                         const SparcSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+  // Instructions which use registers as conditionals examine all the
+  // bits (as does the pseudo SELECT_CC expansion). I don't think it
+  // matters much whether it's ZeroOrOneBooleanContent, or
+  // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
+  // former.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
+  if (!Subtarget->useSoftFloat()) {
+    addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+    addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+    addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
+  }
+  if (Subtarget->is64Bit()) {
+    addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
+  } else {
+    // On 32bit sparc, we define a double-register 32bit register
+    // class, as well. This is modeled in LLVM as a 2-vector of i32.
+    addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);
+
+    // ...but almost all operations must be expanded, so set that as
+    // the default.
+    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+      setOperationAction(Op, MVT::v2i32, Expand);
+    }
+    // Truncating/extending stores/loads are also not supported.
+    for (MVT VT : MVT::integer_vector_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);
+
+      setTruncStoreAction(VT, MVT::v2i32, Expand);
+      setTruncStoreAction(MVT::v2i32, VT, Expand);
+    }
+    // However, load and store *are* legal.
+    setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
+    setOperationAction(ISD::STORE, MVT::v2i32, Legal);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);
+
+    // And we need to promote i64 loads/stores into vector load/store
+    setOperationAction(ISD::LOAD, MVT::i64, Custom);
+    setOperationAction(ISD::STORE, MVT::i64, Custom);
+
+    // Sadly, this doesn't work:
+    //    AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+    //    AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+  }
+
+  // Turn FP extload into load/fpextend
+  for (MVT VT : MVT::fp_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+  }
+
+  // Sparc doesn't have i1 sign extending load
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+  // Custom legalize GlobalAddress nodes into LO/HI parts.
+  setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+  setOperationAction(ISD::ConstantPool, PtrVT, Custom);
+  setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+
+  // Sparc doesn't have sext_inreg, replace them with shl/sra
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+  // Sparc has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  // ... nor does SparcV9.
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UREM, MVT::i64, Expand);
+    setOperationAction(ISD::SREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  }
+
+  // Custom expand fp<->sint
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+
+  // Custom Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+
+  // Sparc has no select or setcc: expand to SELECT_CC.
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Expand);
+
+  // Sparc doesn't have BRCOND either, it has BR_CC.
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ADDC, MVT::i64, Custom);
+    setOperationAction(ISD::ADDE, MVT::i64, Custom);
+    setOperationAction(ISD::SUBC, MVT::i64, Custom);
+    setOperationAction(ISD::SUBE, MVT::i64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::i64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+    setOperationAction(ISD::CTPOP, MVT::i64,
+                       Subtarget->usePopc() ? Legal : Expand);
+    setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+    setOperationAction(ISD::ROTL , MVT::i64, Expand);
+    setOperationAction(ISD::ROTR , MVT::i64, Expand);
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+  }
+
+  // ATOMICs.
+  // Atomics are supported on SparcV9. 32-bit atomics are also
+  // supported by some Leon SparcV8 variants. Otherwise, atomics
+  // are unsupported.
+  if (Subtarget->isV9())
+    setMaxAtomicSizeInBitsSupported(64);
+  else if (Subtarget->hasLeonCasa())
+    setMaxAtomicSizeInBitsSupported(32);
+  else
+    setMaxAtomicSizeInBitsSupported(0);
+
+  setMinCmpXchgSizeInBits(32);
+
+  setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);
+
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);
+
+  // Custom Lower Atomic LOAD/STORE
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  if (!Subtarget->isV9()) {
+    // SparcV8 does not have FNEGD and FABSD.
+    setOperationAction(ISD::FNEG, MVT::f64, Custom);
+    setOperationAction(ISD::FABS, MVT::f64, Custom);
+  }
+
+  setOperationAction(ISD::FSIN , MVT::f128, Expand);
+  setOperationAction(ISD::FCOS , MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FREM , MVT::f128, Expand);
+  setOperationAction(ISD::FMA  , MVT::f128, Expand);
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f128, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // Expands to [SU]MUL_LOHI.
+  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
+  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
+  setOperationAction(ISD::MUL,       MVT::i32, Expand);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHS,     MVT::i64, Expand);
+
+    setOperationAction(ISD::UMULO,     MVT::i64, Custom);
+    setOperationAction(ISD::SMULO,     MVT::i64, Custom);
+
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  }
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  // VAARG needs to be lowered to not do unaligned accesses for doubles.
+  setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP              , MVT::Other, Legal);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  setOperationAction(ISD::CTPOP, MVT::i32,
+                     Subtarget->usePopc() ? Legal : Expand);
+
+  if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::LOAD, MVT::f128, Legal);
+    setOperationAction(ISD::STORE, MVT::f128, Legal);
+  } else {
+    setOperationAction(ISD::LOAD, MVT::f128, Custom);
+    setOperationAction(ISD::STORE, MVT::f128, Custom);
+  }
+
+  if (Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::FADD,  MVT::f128, Legal);
+    setOperationAction(ISD::FSUB,  MVT::f128, Legal);
+    setOperationAction(ISD::FMUL,  MVT::f128, Legal);
+    setOperationAction(ISD::FDIV,  MVT::f128, Legal);
+    setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Legal);
+    if (Subtarget->isV9()) {
+      setOperationAction(ISD::FNEG, MVT::f128, Legal);
+      setOperationAction(ISD::FABS, MVT::f128, Legal);
+    } else {
+      setOperationAction(ISD::FNEG, MVT::f128, Custom);
+      setOperationAction(ISD::FABS, MVT::f128, Custom);
+    }
+
+    if (!Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+    }
+
+  } else {
+    // Custom legalize f128 operations.
+
+    setOperationAction(ISD::FADD,  MVT::f128, Custom);
+    setOperationAction(ISD::FSUB,  MVT::f128, Custom);
+    setOperationAction(ISD::FMUL,  MVT::f128, Custom);
+    setOperationAction(ISD::FDIV,  MVT::f128, Custom);
+    setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+    setOperationAction(ISD::FNEG,  MVT::f128, Custom);
+    setOperationAction(ISD::FABS,  MVT::f128, Custom);
+
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
+
+    // Setup Runtime library names.
+    if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
+      setLibcallName(RTLIB::ADD_F128,  "_Qp_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Qp_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Qp_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Qp_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+    } else if (!Subtarget->useSoftFloat()) {
+      setLibcallName(RTLIB::ADD_F128,  "_Q_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Q_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Q_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Q_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+    }
+  }
+
+  if (Subtarget->fixAllFDIVSQRT()) {
+    // Promote FDIVS and FSQRTS to FDIVD and FSQRTD instructions instead as
+    // the former instructions generate errata on LEON processors.
+    setOperationAction(ISD::FDIV, MVT::f32, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f32, Promote);
+  }
+
+  if (Subtarget->replaceFMULS()) {
+    // Promote FMULS to FMULD instructions instead as
+    // the former instructions generate errata on LEON processors.
+    setOperationAction(ISD::FMUL, MVT::f32, Promote);
+  }
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  setMinFunctionAlignment(2);
+
+  computeRegisterProperties(Subtarget->getRegisterInfo());
+}
+
+bool SparcTargetLowering::useSoftFloat() const {
+  return Subtarget->useSoftFloat();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((SPISD::NodeType)Opcode) {
+  case SPISD::FIRST_NUMBER:    break;
+  case SPISD::CMPICC:          return "SPISD::CMPICC";
+  case SPISD::CMPFCC:          return "SPISD::CMPFCC";
+  case SPISD::BRICC:           return "SPISD::BRICC";
+  case SPISD::BRXCC:           return "SPISD::BRXCC";
+  case SPISD::BRFCC:           return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC:      return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_XCC:      return "SPISD::SELECT_XCC";
+  case SPISD::SELECT_FCC:      return "SPISD::SELECT_FCC";
+  case SPISD::EH_SJLJ_SETJMP:  return "SPISD::EH_SJLJ_SETJMP";
+  case SPISD::EH_SJLJ_LONGJMP: return "SPISD::EH_SJLJ_LONGJMP";
+  case SPISD::Hi:              return "SPISD::Hi";
+  case SPISD::Lo:              return "SPISD::Lo";
+  case SPISD::FTOI:            return "SPISD::FTOI";
+  case SPISD::ITOF:            return "SPISD::ITOF";
+  case SPISD::FTOX:            return "SPISD::FTOX";
+  case SPISD::XTOF:            return "SPISD::XTOF";
+  case SPISD::CALL:            return "SPISD::CALL";
+  case SPISD::RET_FLAG:        return "SPISD::RET_FLAG";
+  case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
+  case SPISD::FLUSHW:          return "SPISD::FLUSHW";
+  case SPISD::TLS_ADD:         return "SPISD::TLS_ADD";
+  case SPISD::TLS_LD:          return "SPISD::TLS_LD";
+  case SPISD::TLS_CALL:        return "SPISD::TLS_CALL";
+  }
+  return nullptr;
+}
+
+EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                            EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
+/// be zero. Op is expected to be a target specific node. Used by DAG
+/// combiner.
+void SparcTargetLowering::computeKnownBitsForTargetNode
+                                (const SDValue Op,
+                                 APInt &KnownZero,
+                                 APInt &KnownOne,
+                                 const SelectionDAG &DAG,
+                                 unsigned Depth) const {
+  APInt KnownZero2, KnownOne2;
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case SPISD::SELECT_ICC:
+  case SPISD::SELECT_XCC:
+  case SPISD::SELECT_FCC:
+    DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction.  If so
+// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isNullConstant(RHS) &&
+      CC == ISD::SETNE &&
+      (((LHS.getOpcode() == SPISD::SELECT_ICC ||
+         LHS.getOpcode() == SPISD::SELECT_XCC) &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+       (LHS.getOpcode() == SPISD::SELECT_FCC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+      isOneConstant(LHS.getOperand(0)) &&
+      isNullConstant(LHS.getOperand(1))) {
+    SDValue CMPCC = LHS.getOperand(3);
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+// Convert to a target node and set target flags.
+SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
+                                             SelectionDAG &DAG) const {
+  if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
+    return DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                      SDLoc(GA),
+                                      GA->getValueType(0),
+                                      GA->getOffset(), TF);
+
+  if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
+    return DAG.getTargetConstantPool(CP->getConstVal(),
+                                     CP->getValueType(0),
+                                     CP->getAlignment(),
+                                     CP->getOffset(), TF);
+
+  if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
+    return DAG.getTargetBlockAddress(BA->getBlockAddress(),
+                                     Op.getValueType(),
+                                     0,
+                                     TF);
+
+  if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
+    return DAG.getTargetExternalSymbol(ES->getSymbol(),
+                                       ES->getValueType(0), TF);
+
+  llvm_unreachable("Unhandled address SDNode");
+}
+
+// Split Op into high and low parts according to HiTF and LoTF.
+// Return an ADD node combining the parts.
+SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
+                                          unsigned HiTF, unsigned LoTF,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
+  return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
+}
+
+// Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
+// or ExternalSymbol SDNode.
+SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = getPointerTy(DAG.getDataLayout());
+
+  // Handle PIC mode first. SPARC needs a got load for every variable!
+  if (isPositionIndependent()) {
+    // This is the pic32 code model, the GOT is known to be smaller than 4GB.
+    SDValue HiLo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
+                                SparcMCExpr::VK_Sparc_GOT10, DAG);
+    SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
+    SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI.setHasCalls(true);
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
+                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+  }
+
+  // This is one of the absolute code models.
+  switch(getTargetMachine().getCodeModel()) {
+  default:
+    llvm_unreachable("Unsupported absolute code model");
+  case CodeModel::Small:
+    // abs32.
+    return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                        SparcMCExpr::VK_Sparc_LO, DAG);
+  case CodeModel::Medium: {
+    // abs44.
+    SDValue H44 = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
+                               SparcMCExpr::VK_Sparc_M44, DAG);
+    H44 = DAG.getNode(ISD::SHL, DL, VT, H44, DAG.getConstant(12, DL, MVT::i32));
+    SDValue L44 = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
+    L44 = DAG.getNode(SPISD::Lo, DL, VT, L44);
+    return DAG.getNode(ISD::ADD, DL, VT, H44, L44);
+  }
+  case CodeModel::Large: {
+    // abs64.
+    SDValue Hi = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
+                              SparcMCExpr::VK_Sparc_HM, DAG);
+    Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, DAG.getConstant(32, DL, MVT::i32));
+    SDValue Lo = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
+                              SparcMCExpr::VK_Sparc_LO, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
+  }
+  }
+}
+
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    unsigned HiTF = ((model == TLSModel::GeneralDynamic)
+                     ? SparcMCExpr::VK_Sparc_TLS_GD_HI22
+                     : SparcMCExpr::VK_Sparc_TLS_LDM_HI22);
+    unsigned LoTF = ((model == TLSModel::GeneralDynamic)
+                     ? SparcMCExpr::VK_Sparc_TLS_GD_LO10
+                     : SparcMCExpr::VK_Sparc_TLS_LDM_LO10);
+    unsigned addTF = ((model == TLSModel::GeneralDynamic)
+                      ? SparcMCExpr::VK_Sparc_TLS_GD_ADD
+                      : SparcMCExpr::VK_Sparc_TLS_LDM_ADD);
+    unsigned callTF = ((model == TLSModel::GeneralDynamic)
+                       ? SparcMCExpr::VK_Sparc_TLS_GD_CALL
+                       : SparcMCExpr::VK_Sparc_TLS_LDM_CALL);
+
+    SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+    SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+                               withTargetFlags(Op, addTF, DAG));
+
+    SDValue Chain = DAG.getEntryNode();
+    SDValue InFlag;
+
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+    Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+    InFlag = Chain.getValue(1);
+    SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+    SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
+        DAG.getMachineFunction(), CallingConv::C);
+    assert(Mask && "Missing call preserved mask for calling convention");
+    SDValue Ops[] = {Chain,
+                     Callee,
+                     Symbol,
+                     DAG.getRegister(SP::O0, PtrVT),
+                     DAG.getRegisterMask(Mask),
+                     InFlag};
+    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
+    InFlag = Chain.getValue(1);
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, DL, true),
+                               DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+    SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
+    SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
+    HiLo =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+                   withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
+  }
+
+  if (model == TLSModel::InitialExec) {
+    unsigned ldTF     = ((PtrVT == MVT::i64)? SparcMCExpr::VK_Sparc_TLS_IE_LDX
+                         : SparcMCExpr::VK_Sparc_TLS_IE_LD);
+
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI.setHasCalls(true);
+
+    SDValue TGA = makeHiLoPair(Op,
+                               SparcMCExpr::VK_Sparc_TLS_IE_HI22,
+                               SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+    SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+                                 DL, PtrVT, Ptr,
+                                 withTargetFlags(Op, ldTF, DAG));
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+                       DAG.getRegister(SP::G7, PtrVT), Offset,
+                       withTargetFlags(Op,
+                                       SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
+  }
+
+  assert(model == TLSModel::LocalExec);
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
+  SDValue Offset =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
+                                                  ArgListTy &Args, SDValue Arg,
+                                                  const SDLoc &DL,
+                                                  SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListEntry Entry;
+  Entry.Node = Arg;
+  Entry.Ty   = ArgTy;
+
+  if (ArgTy->isFP128Ty()) {
+    // Create a stack object and pass the pointer to the library function.
+    int FI = MFI.CreateStackObject(16, 8, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
+                         /* Alignment = */ 8);
+
+    Entry.Node = FIPtr;
+    Entry.Ty   = PointerType::getUnqual(ArgTy);
+  }
+  Args.push_back(Entry);
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                                 const char *LibFuncName,
+                                 unsigned numArgs) const {
+
+  ArgListTy Args;
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT);
+  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+  Type *RetTyABI = RetTy;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue RetPtr;
+
+  if (RetTy->isFP128Ty()) {
+    // Create a Stack Object to receive the return value of type f128.
+    ArgListEntry Entry;
+    int RetFI = MFI.CreateStackObject(16, 8, false);
+    RetPtr = DAG.getFrameIndex(RetFI, PtrVT);
+    Entry.Node = RetPtr;
+    Entry.Ty   = PointerType::getUnqual(RetTy);
+    if (!Subtarget->is64Bit())
+      Entry.isSRet = true;
+    Entry.isReturned = false;
+    Args.push_back(Entry);
+    RetTyABI = Type::getVoidTy(*DAG.getContext());
+  }
+
+  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+  for (unsigned i = 0, e = numArgs; i != e; ++i) {
+    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
+  }
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
+    .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args));
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // chain is in second result.
+  if (RetTyABI == RetTy)
+    return CallInfo.first;
+
+  assert (RetTy->isFP128Ty() && "Unexpected return type!");
+
+  Chain = CallInfo.second;
+
+  // Load RetPtr to get the return value.
+  return DAG.getLoad(Op.getValueType(), SDLoc(Op), Chain, RetPtr,
+                     MachinePointerInfo(), /* Alignment = */ 8);
+}
+
+SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+                                              unsigned &SPCC, const SDLoc &DL,
+                                              SelectionDAG &DAG) const {
+
+  const char *LibCall = nullptr;
+  bool is64Bit = Subtarget->is64Bit();
+  switch(SPCC) {
+  default: llvm_unreachable("Unhandled conditional code!");
+  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+  case SPCC::FCC_UL :
+  case SPCC::FCC_ULE:
+  case SPCC::FCC_UG :
+  case SPCC::FCC_UGE:
+  case SPCC::FCC_U  :
+  case SPCC::FCC_O  :
+  case SPCC::FCC_LG :
+  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+  }
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT);
+  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+  ArgListTy Args;
+  SDValue Chain = DAG.getEntryNode();
+  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // result is in first, and chain is in second result.
+  SDValue Result =  CallInfo.first;
+
+  switch(SPCC) {
+  default: {
+    SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UL : {
+    SDValue Mask   = DAG.getTargetConstant(1, DL, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_ULE: {
+    SDValue RHS = DAG.getTargetConstant(2, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UG :  {
+    SDValue RHS = DAG.getTargetConstant(1, DL, Result.getValueType());
+    SPCC = SPCC::ICC_G;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UGE: {
+    SDValue RHS = DAG.getTargetConstant(1, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+
+  case SPCC::FCC_U  :  {
+    SDValue RHS = DAG.getTargetConstant(3, DL, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_O  :  {
+    SDValue RHS = DAG.getTargetConstant(3, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_LG :  {
+    SDValue Mask   = DAG.getTargetConstant(3, DL, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, DL, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UE : {
+    SDValue Mask   = DAG.getTargetConstant(3, DL, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, DL, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  }
+}
+
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+                   const SparcTargetLowering &TLI) {
+
+  if (Op.getOperand(0).getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F64_F128), 1);
+
+  if (Op.getOperand(0).getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
+
+  llvm_unreachable("fpextend with non-float operand!");
+  return SDValue();
+}
+
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+                  const SparcTargetLowering &TLI) {
+  // FP_ROUND on f64 and f32 are legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128)
+    return Op;
+
+  if (Op.getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F64), 1);
+  if (Op.getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
+
+  llvm_unreachable("fpround to non-float!");
+  return SDValue();
+}
+
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  // Expand f128 operations to fp128 abi calls.
+  if (Op.getOperand(0).getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+    const char *libName = TLI.getLibcallName(VT == MVT::i32
+                                             ? RTLIB::FPTOSINT_F128_I32
+                                             : RTLIB::FPTOSINT_F128_I64);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the resulting type is illegal.
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // Otherwise, Convert the fp value to integer in an FP register.
+  if (VT == MVT::i32)
+    Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  else
+    Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || (OpVT == MVT::i64));
+
+  EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+  // Expand f128 operations to fp128 ABI calls.
+  if (Op.getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+    const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+                                             ? RTLIB::SINTTOFP_I32_F128
+                                             : RTLIB::SINTTOFP_I64_F128);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the operand type is illegal.
+  if (!TLI.isTypeLegal(OpVT))
+    return SDValue();
+
+  // Otherwise, Convert the int value to FP in an FP register.
+  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+  unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+  return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the resulting type is legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128 ||
+      (hasHardQuad && TLI.isTypeLegal(VT)))
+    return SDValue();
+
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(VT == MVT::i32
+                                            ? RTLIB::FPTOUINT_F128_I32
+                                            : RTLIB::FPTOUINT_F128_I64),
+                         1);
+}
+
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the operand type is legal.
+  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+    return SDValue();
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(OpVT == MVT::i32
+                                            ? RTLIB::UINTTOFP_I32_F128
+                                            : RTLIB::UINTTOFP_I64_F128),
+                         1);
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+                          const SparcTargetLowering &TLI,
+                          bool hasHardQuad) {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc dl(Op);
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a br_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  // Get the condition flag.
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+    // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
+    Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
+  } else {
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
+  }
+  return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+                     DAG.getConstant(SPCC, dl, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const SparcTargetLowering &TLI,
+                              bool hasHardQuad) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  SDLoc dl(Op);
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a select_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
+    Opc = LHS.getValueType() == MVT::i32 ?
+          SPISD::SELECT_ICC : SPISD::SELECT_XCC;
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+  } else {
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::SELECT_ICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      Opc = SPISD::SELECT_FCC;
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    }
+  }
+  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+                     DAG.getConstant(SPCC, dl, MVT::i32), CompareFlag);
+}
+
+SDValue SparcTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+    const SparcTargetLowering &TLI) const {
+  SDLoc DL(Op);
+  return DAG.getNode(SPISD::EH_SJLJ_SETJMP, DL,
+      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1));
+
+}
+
+SDValue SparcTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+    const SparcTargetLowering &TLI) const {
+  SDLoc DL(Op);
+  return DAG.getNode(SPISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                            const SparcTargetLowering &TLI) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+  // Need frame address to find the address of VarArgsFrameIndex.
+  MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  SDLoc DL(Op);
+  SDValue Offset =
+      DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(SP::I6, PtrVT),
+                  DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
+                      MachinePointerInfo(SV));
+}
+
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc DL(Node);
+  SDValue VAList =
+      DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
+  // Increment the pointer, VAList, to the next vaarg.
+  SDValue NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+                                DAG.getIntPtrConstant(VT.getSizeInBits()/8,
+                                                      DL));
+  // Store the incremented VAList to the legalized pointer.
+  InChain = DAG.getStore(VAList.getValue(1), DL, NextPtr, VAListPtr,
+                         MachinePointerInfo(SV));
+  // Load the actual argument out of the pointer VAList.
+  // We can't count on greater alignment than the word size.
+  return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
+                     std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
+}
+
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                       const SparcSubtarget *Subtarget) {
+  SDValue Chain = Op.getOperand(0);  // Legalize the chain.
+  SDValue Size  = Op.getOperand(1);  // Legalize the size.
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned StackAlign = Subtarget->getFrameLowering()->getStackAlignment();
+  EVT VT = Size->getValueType(0);
+  SDLoc dl(Op);
+
+  // TODO: implement over-aligned alloca. (Note: also implies
+  // supporting support for overaligned function frames + dynamic
+  // allocations, at all, which currently isn't supported)
+  if (Align > StackAlign) {
+    const MachineFunction &MF = DAG.getMachineFunction();
+    report_fatal_error("Function \"" + Twine(MF.getName()) + "\": "
+                       "over-aligned dynamic alloca not supported.");
+  }
+
+  // The resultant pointer needs to be above the register spill area
+  // at the bottom of the stack.
+  unsigned regSpillArea;
+  if (Subtarget->is64Bit()) {
+    regSpillArea = 128;
+  } else {
+    // On Sparc32, the size of the spill area is 92. Unfortunately,
+    // that's only 4-byte aligned, not 8-byte aligned (the stack
+    // pointer is 8-byte aligned). So, if the user asked for an 8-byte
+    // aligned dynamic allocation, we actually need to add 96 to the
+    // bottom of the stack, instead of 92, to ensure 8-byte alignment.
+
+    // That also means adding 4 to the size of the allocation --
+    // before applying the 8-byte rounding. Unfortunately, we the
+    // value we get here has already had rounding applied. So, we need
+    // to add 8, instead, wasting a bit more memory.
+
+    // Further, this only actually needs to be done if the required
+    // alignment is > 4, but, we've lost that info by this point, too,
+    // so we always apply it.
+
+    // (An alternative approach would be to always reserve 96 bytes
+    // instead of the required 92, but then we'd waste 4 extra bytes
+    // in every frame, not just those with dynamic stack allocations)
+
+    // TODO: modify code in SelectionDAGBuilder to make this less sad.
+
+    Size = DAG.getNode(ISD::ADD, dl, VT, Size,
+                       DAG.getConstant(8, dl, VT));
+    regSpillArea = 96;
+  }
+
+  unsigned SPReg = SP::O6;
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+  Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
+
+  regSpillArea += Subtarget->getStackPointerBias();
+
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+                               DAG.getConstant(regSpillArea, dl, VT));
+  SDValue Ops[2] = { NewVal, Chain };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+
+static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue Chain = DAG.getNode(SPISD::FLUSHW,
+                              dl, MVT::Other, DAG.getEntryNode());
+  return Chain;
+}
+
+static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
+                            const SparcSubtarget *Subtarget) {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned FrameReg = SP::I6;
+  unsigned stackBias = Subtarget->getStackPointerBias();
+
+  SDValue FrameAddr;
+
+  if (depth == 0) {
+    FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+    if (Subtarget->is64Bit())
+      FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                              DAG.getIntPtrConstant(stackBias, dl));
+    return FrameAddr;
+  }
+
+  // flush first to make sure the windowed registers' values are in stack
+  SDValue Chain = getFLUSHW(Op, DAG);
+  FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+  unsigned Offset = (Subtarget->is64Bit()) ? (stackBias + 112) : 56;
+
+  while (depth--) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                              DAG.getIntPtrConstant(Offset, dl));
+    FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo());
+  }
+  if (Subtarget->is64Bit())
+    FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
+                            DAG.getIntPtrConstant(stackBias, dl));
+  return FrameAddr;
+}
+
+
+static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
+                              const SparcSubtarget *Subtarget) {
+
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  return getFRAMEADDR(depth, Op, DAG, Subtarget);
+
+}
+
+static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               const SparcSubtarget *Subtarget) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  SDValue RetAddr;
+  if (depth == 0) {
+    auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+    unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
+    RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
+    return RetAddr;
+  }
+
+  // Need frame address to find return address of the caller.
+  SDValue FrameAddr = getFRAMEADDR(depth - 1, Op, DAG, Subtarget);
+
+  unsigned Offset = (Subtarget->is64Bit()) ? 120 : 60;
+  SDValue Ptr = DAG.getNode(ISD::ADD,
+                            dl, VT,
+                            FrameAddr,
+                            DAG.getIntPtrConstant(Offset, dl));
+  RetAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Ptr, MachinePointerInfo());
+
+  return RetAddr;
+}
+
+static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
+                          unsigned opcode) {
+  assert(SrcReg64.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+  assert(opcode == ISD::FNEG || opcode == ISD::FABS);
+
+  // Lower fneg/fabs on f64 to fneg/fabs on f32.
+  // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
+  // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
+
+  // Note: in little-endian, the floating-point value is stored in the
+  // registers are in the opposite order, so the subreg with the sign
+  // bit is the highest-numbered (odd), rather than the
+  // lowest-numbered (even).
+
+  SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
+                                            SrcReg64);
+  SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
+                                            SrcReg64);
+
+  if (DAG.getDataLayout().isLittleEndian())
+    Lo32 = DAG.getNode(opcode, dl, MVT::f32, Lo32);
+  else
+    Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
+
+  SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                dl, MVT::f64), 0);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
+                                       DstReg64, Hi32);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
+                                       DstReg64, Lo32);
+  return DstReg64;
+}
+
+// Lower a f128 load into two f64 loads.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+  assert(LdNode && LdNode->getOffset().isUndef()
+         && "Unexpected node type");
+
+  unsigned alignment = LdNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue Hi64 =
+      DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LdNode->getBasePtr(),
+                  LdNode->getPointerInfo(), alignment);
+  EVT addrVT = LdNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              LdNode->getBasePtr(),
+                              DAG.getConstant(8, dl, addrVT));
+  SDValue Lo64 = DAG.getLoad(MVT::f64, dl, LdNode->getChain(), LoPtr,
+                             LdNode->getPointerInfo(), alignment);
+
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
+
+  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                       dl, MVT::f128);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Hi64,
+                               SubRegEven);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Lo64,
+                               SubRegOdd);
+  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+                           SDValue(Lo64.getNode(), 1) };
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
+{
+  LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
+
+  EVT MemVT = LdNode->getMemoryVT();
+  if (MemVT == MVT::f128)
+    return LowerF128Load(Op, DAG);
+
+  return Op;
+}
+
+// Lower a f128 store into two f64 stores.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+  assert(StNode && StNode->getOffset().isUndef()
+         && "Unexpected node type");
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);
+
+  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegEven);
+  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegOdd);
+
+  unsigned alignment = StNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue OutChains[2];
+  OutChains[0] =
+      DAG.getStore(StNode->getChain(), dl, SDValue(Hi64, 0),
+                   StNode->getBasePtr(), MachinePointerInfo(), alignment);
+  EVT addrVT = StNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              StNode->getBasePtr(),
+                              DAG.getConstant(8, dl, addrVT));
+  OutChains[1] = DAG.getStore(StNode->getChain(), dl, SDValue(Lo64, 0), LoPtr,
+                              MachinePointerInfo(), alignment);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+
+  EVT MemVT = St->getMemoryVT();
+  if (MemVT == MVT::f128)
+    return LowerF128Store(Op, DAG);
+
+  if (MemVT == MVT::i64) {
+    // Custom handling for i64 stores: turn it into a bitcast and a
+    // v2i32 store.
+    SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue());
+    SDValue Chain = DAG.getStore(
+        St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(),
+        St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo());
+    return Chain;
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+  assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS)
+         && "invalid opcode");
+
+  SDLoc dl(Op);
+
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op.getOperand(0), dl, DAG, Op.getOpcode());
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  // Lower fabs/fneg on f128 to fabs/fneg on f64
+  // fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
+  // (As with LowerF64Op, on little-endian, we need to negate the odd
+  // subreg)
+
+  SDValue SrcReg128 = Op.getOperand(0);
+  SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+                                            SrcReg128);
+  SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+                                            SrcReg128);
+
+  if (DAG.getDataLayout().isLittleEndian()) {
+    if (isV9)
+      Lo64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Lo64);
+    else
+      Lo64 = LowerF64Op(Lo64, dl, DAG, Op.getOpcode());
+  } else {
+    if (isV9)
+      Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+    else
+      Hi64 = LowerF64Op(Hi64, dl, DAG, Op.getOpcode());
+  }
+
+  SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, MVT::f128), 0);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+                                        DstReg128, Hi64);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+                                        DstReg128, Lo64);
+  return DstReg128;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+  if (Op.getValueType() != MVT::i64)
+    return Op;
+
+  SDLoc dl(Op);
+  SDValue Src1 = Op.getOperand(0);
+  SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+  SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+                               DAG.getConstant(32, dl, MVT::i64));
+  Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+  SDValue Src2 = Op.getOperand(1);
+  SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+  SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+                               DAG.getConstant(32, dl, MVT::i64));
+  Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+  bool hasChain = false;
+  unsigned hiOpc = Op.getOpcode();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode");
+  case ISD::ADDC: hiOpc = ISD::ADDE; break;
+  case ISD::ADDE: hasChain = true; break;
+  case ISD::SUBC: hiOpc = ISD::SUBE; break;
+  case ISD::SUBE: hasChain = true; break;
+  }
+  SDValue Lo;
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+  if (hasChain) {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+                     Op.getOperand(2));
+  } else {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+  }
+  SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+  SDValue Carry = Hi.getValue(1);
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+  Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+                   DAG.getConstant(32, dl, MVT::i64));
+
+  SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+  SDValue Ops[2] = { Dst, Carry };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
+// in LegalizeDAG.cpp except the order of arguments to the library function.
+static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
+                                const SparcTargetLowering &TLI)
+{
+  unsigned opcode = Op.getOpcode();
+  assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
+
+  bool isSigned = (opcode == ISD::SMULO);
+  EVT VT = MVT::i64;
+  EVT WideVT = MVT::i128;
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+
+  if (LHS.getValueType() != VT)
+    return Op;
+
+  SDValue ShiftAmt = DAG.getConstant(63, dl, VT);
+
+  SDValue RHS = Op.getOperand(1);
+  SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
+  SDValue HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
+  SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+
+  SDValue MulResult = TLI.makeLibCall(DAG,
+                                      RTLIB::MUL_I128, WideVT,
+                                      Args, isSigned, dl).first;
+  SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                   MulResult, DAG.getIntPtrConstant(0, dl));
+  SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
+                                MulResult, DAG.getIntPtrConstant(1, dl));
+  if (isSigned) {
+    SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
+  } else {
+    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT),
+                           ISD::SETNE);
+  }
+  // MulResult is a node with an illegal type. Because such things are not
+  // generally permitted during this phase of legalization, ensure that
+  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
+  // been folded.
+  assert(MulResult->use_empty() && "Illegally typed node still in use!");
+
+  SDValue Ops[2] = { BottomHalf, TopHalf } ;
+  return DAG.getMergeValues(Ops, dl);
+}
+
+static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
+  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
+  // Expand with a fence.
+  return SDValue();
+
+  // Monotonic load/stores are legal.
+  return Op;
+}
+
+SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::thread_pointer: {
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    return DAG.getRegister(SP::G7, PtrVT);
+  }
+  }
+}
+
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+  bool hasHardQuad = Subtarget->hasHardQuad();
+  bool isV9        = Subtarget->isV9();
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG, *this,
+                                                       Subtarget);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG,
+                                                      Subtarget);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG, *this,
+                                                  hasHardQuad);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG, *this,
+                                                      hasHardQuad);
+  case ISD::EH_SJLJ_SETJMP:     return LowerEH_SJLJ_SETJMP(Op, DAG, *this);
+  case ISD::EH_SJLJ_LONGJMP:    return LowerEH_SJLJ_LONGJMP(Op, DAG, *this);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+                                                               Subtarget);
+
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::FADD:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::ADD_F128), 2);
+  case ISD::FSUB:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SUB_F128), 2);
+  case ISD::FMUL:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::MUL_F128), 2);
+  case ISD::FDIV:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::DIV_F128), 2);
+  case ISD::FSQRT:              return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SQRT_F128),1);
+  case ISD::FABS:
+  case ISD::FNEG:               return LowerFNEGorFABS(Op, DAG, isV9);
+  case ISD::FP_EXTEND:          return LowerF128_FPEXTEND(Op, DAG, *this);
+  case ISD::FP_ROUND:           return LowerF128_FPROUND(Op, DAG, *this);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::UMULO:
+  case ISD::SMULO:              return LowerUMULO_SMULO(Op, DAG, *this);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_LOAD_STORE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  }
+}
+
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const {
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unknown SELECT_CC!");
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+  case SP::SELECT_CC_QFP_ICC:
+    return expandSelectCC(MI, BB, SP::BCOND);
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+  case SP::SELECT_CC_QFP_FCC:
+    return expandSelectCC(MI, BB, SP::FBCOND);
+  case SP::EH_SJLJ_SETJMP32ri:
+  case SP::EH_SJLJ_SETJMP32rr:
+    return emitEHSjLjSetJmp(MI, BB);
+  case SP::EH_SJLJ_LONGJMP32rr:
+  case SP::EH_SJLJ_LONGJMP32ri:
+    return emitEHSjLjLongJmp(MI, BB);
+
+  }
+}
+
+MachineBasicBlock *
+SparcTargetLowering::expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
+                                    unsigned BROpcode) const {
+  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned CC = (SPCC::CondCodes)MI.getOperand(3).getImm();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(copy0MBB)
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(thisMBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB;
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  unsigned RegSize = PVT.getStoreSize();
+  assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+  unsigned Buf = MI.getOperand(0).getReg();
+  unsigned JmpLoc = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+
+  // TO DO: If we do 64-bit handling, this perhaps should be FLUSHW, not TA 3
+  MIB = BuildMI(*MBB, MI, DL, TII->get(SP::TRAPri), SP::G0).addImm(3).addImm(SPCC::ICC_A);
+
+  // Instruction to restore FP
+  const unsigned FP  = SP::I6;
+  MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+            .addReg(FP)
+            .addReg(Buf)
+            .addImm(0);
+
+  // Instruction to load jmp location
+  MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+            .addReg(JmpLoc, RegState::Define)
+            .addReg(Buf)
+            .addImm(RegSize);
+
+  // Instruction to restore SP
+  const unsigned SP  = SP::O6;
+  MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+            .addReg(SP)
+            .addReg(Buf)
+            .addImm(2 * RegSize);
+
+  // Instruction to restore I7
+  MIB = BuildMI(*MBB, MI, DL, TII->get(SP::LDri))
+            .addReg(SP::I7)
+            .addReg(Buf, RegState::Kill)
+            .addImm(3 * RegSize);
+
+  // Jump to JmpLoc
+  BuildMI(*MBB, MI, DL, TII->get(SP::JMPLrr)).addReg(SP::G0).addReg(JmpLoc, RegState::Kill).addReg(SP::G0);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+                                      MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineInstrBuilder MIB;
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  unsigned RegSize = PVT.getStoreSize();
+  assert(PVT == MVT::i32 && "Invalid Pointer Size!");
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[0] = FP
+  //  buf[RegSize] = restoreMBB <-- takes address of restoreMBB
+  //  buf[RegSize * 2] = O6
+  //  buf[RegSize * 3] = I7
+  //  Ensure restoreMBB remains in the relocations list (done using a bn instruction)
+  //  b mainMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //  b sinkMBB
+  //
+  // restoreMBB:
+  //  v_restore = 1
+  //  --fall through--
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator It = ++MBB->getIterator();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+
+  MF->insert(It, mainMBB);
+  MF->insert(It, restoreMBB);
+  MF->insert(It, sinkMBB);
+  restoreMBB->setHasAddressTaken();
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)),
+                  MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  unsigned LabelReg = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+  unsigned LabelReg2 = MRI.createVirtualRegister(&SP::IntRegsRegClass);
+  unsigned BufReg = MI.getOperand(1).getReg();
+
+  // Instruction to store FP
+  const unsigned FP  = SP::I6;
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+            .addReg(BufReg)
+            .addImm(0)
+            .addReg(FP);
+
+  // Instructions to store jmp location
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::SETHIi))
+            .addReg(LabelReg, RegState::Define)
+            .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_HI);
+
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::ORri))
+            .addReg(LabelReg2, RegState::Define)
+            .addReg(LabelReg, RegState::Kill)
+            .addMBB(restoreMBB, SparcMCExpr::VK_Sparc_LO);
+
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+            .addReg(BufReg)
+            .addImm(RegSize)
+            .addReg(LabelReg2, RegState::Kill);
+
+  // Instruction to store SP
+  const unsigned SP  = SP::O6;
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+            .addReg(BufReg)
+            .addImm(2 * RegSize)
+            .addReg(SP);
+
+  // Instruction to store I7
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::STri))
+            .addReg(BufReg)
+            .addImm(3 * RegSize)
+            .addReg(SP::I7);
+
+
+  // FIX ME: This next instruction ensures that the restoreMBB block address remains
+  // valid through optimization passes and serves no other purpose. The ICC_N ensures
+  // that the branch is never taken. This commented-out code here was an alternative
+  // attempt to achieve this which brought myriad problems.
+  //MIB = BuildMI(thisMBB, DL, TII->get(SP::EH_SjLj_Setup)).addMBB(restoreMBB, SparcMCExpr::VK_Sparc_None);
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+              .addMBB(restoreMBB)
+              .addImm(SPCC::ICC_N);
+
+  MIB = BuildMI(thisMBB, DL, TII->get(SP::BCOND))
+              .addMBB(mainMBB)
+              .addImm(SPCC::ICC_A);
+
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
+
+
+  // mainMBB:
+  MIB = BuildMI(mainMBB, DL, TII->get(SP::ORrr))
+             .addReg(mainDstReg, RegState::Define)
+             .addReg(SP::G0)
+             .addReg(SP::G0);
+  MIB = BuildMI(mainMBB, DL, TII->get(SP::BCOND)).addMBB(sinkMBB).addImm(SPCC::ICC_A);
+
+  mainMBB->addSuccessor(sinkMBB);
+
+
+  // restoreMBB:
+  MIB = BuildMI(restoreMBB, DL, TII->get(SP::ORri))
+              .addReg(restoreDstReg, RegState::Define)
+              .addReg(SP::G0)
+              .addImm(1);
+  //MIB = BuildMI(restoreMBB, DL, TII->get(SP::BCOND)).addMBB(sinkMBB).addImm(SPCC::ICC_A);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  MIB = BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+                TII->get(SP::PHI), DstReg)
+             .addReg(mainDstReg).addMBB(mainMBB)
+             .addReg(restoreDstReg).addMBB(restoreMBB);
+
+  MI.eraseFromParent();
+  return sinkMBB;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'r': return C_RegisterClass;
+    case 'I': // SIMM13
+      return C_Other;
+    }
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight SparcTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'I': // SIMM13
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (isInt<13>(C->getSExtValue()))
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void SparcTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op,
+                             std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  SDValue Result(nullptr, 0);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<13>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+      return;
+    }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  StringRef Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT == MVT::v2i32)
+        return std::make_pair(0U, &SP::IntPairRegClass);
+      else
+        return std::make_pair(0U, &SP::IntRegsRegClass);
+    }
+  } else if (!Constraint.empty() && Constraint.size() <= 5
+              && Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
+    // constraint = '{r<d>}'
+    // Remove the braces from around the name.
+    StringRef name(Constraint.data()+1, Constraint.size()-2);
+    // Handle register aliases:
+    //       r0-r7   -> g0-g7
+    //       r8-r15  -> o0-o7
+    //       r16-r23 -> l0-l7
+    //       r24-r31 -> i0-i7
+    uint64_t intVal = 0;
+    if (name.substr(0, 1).equals("r")
+        && !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) {
+      const char regTypes[] = { 'g', 'o', 'l', 'i' };
+      char regType = regTypes[intVal/8];
+      char regIdx = '0' + (intVal % 8);
+      char tmp[] = { '{', regType, regIdx, '}', 0 };
+      std::string newConstraint = std::string(tmp);
+      return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
+                                                          VT);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Sparc target isn't yet aware of offsets.
+  return false;
+}
+
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>& Results,
+                                             SelectionDAG &DAG) const {
+
+  SDLoc dl(N);
+
+  RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getOperand(0).getValueType() != MVT::f128
+        || N->getValueType(0) != MVT::i64)
+      return;
+    libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+               ? RTLIB::FPTOSINT_F128_I64
+               : RTLIB::FPTOUINT_F128_I64);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getValueType(0) != MVT::f128
+        || N->getOperand(0).getValueType() != MVT::i64)
+      return;
+
+    libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+               ? RTLIB::SINTTOFP_I64_F128
+               : RTLIB::UINTTOFP_I64_F128);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+  case ISD::LOAD: {
+    LoadSDNode *Ld = cast<LoadSDNode>(N);
+    // Custom handling only for i64: turn i64 load into a v2i32 load,
+    // and a bitcast.
+    if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64)
+      return;
+
+    SDLoc dl(N);
+    SDValue LoadRes = DAG.getExtLoad(
+        Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
+        Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
+        Ld->getMemOperand()->getFlags(), Ld->getAAInfo());
+
+    SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
+    Results.push_back(Res);
+    Results.push_back(LoadRes.getValue(1));
+    return;
+  }
+  }
+}
+
+// Override to enable LOAD_STACK_GUARD lowering on Linux.
+bool SparcTargetLowering::useLoadStackGuardNode() const {
+  if (!Subtarget->isTargetLinux())
+    return TargetLowering::useLoadStackGuardNode();
+  return true;
+}
+
+// Override to disable global variable loading on Linux.
+void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
+  if (!Subtarget->isTargetLinux())
+    return TargetLowering::insertSSPDeclarations(M);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 000000000000..e0a421b83712
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,223 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  class SparcSubtarget;
+
+  namespace SPISD {
+    enum NodeType : unsigned {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CMPICC,      // Compare two GPR operands, set icc+xcc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRXCC,       // Branch to dest on xcc condition (64-bit only).
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_XCC,  // Select between two values using the current XCC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      EH_SJLJ_SETJMP,  // builtin setjmp operation
+      EH_SJLJ_LONGJMP, // builtin longjmp operation
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+      FTOX,        // FP to Int64 within a FP register.
+      XTOF,        // Int64 to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      GLOBAL_BASE_REG, // Global base reg for PIC.
+      FLUSHW,      // FLUSH register windows to stack.
+
+      TLS_ADD,     // For Thread Local Storage (TLS).
+      TLS_LD,
+      TLS_CALL
+    };
+  }
+
+  class SparcTargetLowering : public TargetLowering {
+    const SparcSubtarget *Subtarget;
+  public:
+    SparcTargetLowering(const TargetMachine &TM, const SparcSubtarget &STI);
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+    
+    bool useSoftFloat() const override;
+    
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+    ConstraintWeight
+    getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                   const char *constraint) const override;
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "o")
+        return InlineAsm::Constraint_o;
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+    MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+      return MVT::i32;
+    }
+
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+      return SP::I0;
+    }
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+      return SP::I1;
+    }
+
+    /// Override to support customized stack guard loading.
+    bool useLoadStackGuardNode() const override;
+    void insertSSPDeclarations(Module &M) const override;
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerFormalArguments_32(SDValue Chain, CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    const SDLoc &dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_64(SDValue Chain, CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    const SDLoc &dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
+                         SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+    SDValue LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
+                           bool IsVarArg,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SDLoc &DL, SelectionDAG &DAG) const;
+    SDValue LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
+                           bool IsVarArg,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SDLoc &DL, SelectionDAG &DAG) const;
+
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG,
+                                const SparcTargetLowering &TLI) const ;
+    SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG,
+                                 const SparcTargetLowering &TLI) const ;
+
+    unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
+    SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
+    SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
+                         SelectionDAG &DAG) const;
+    SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args, SDValue Arg,
+                                 const SDLoc &DL, SelectionDAG &DAG) const;
+    SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                        const char *LibFuncName,
+                        unsigned numArgs) const;
+    SDValue LowerF128Compare(SDValue LHS, SDValue RHS, unsigned &SPCC,
+                             const SDLoc &DL, SelectionDAG &DAG) const;
+
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
+    bool ShouldShrinkFPConstant(EVT VT) const override {
+      // Do not shrink FP constpool if VT == MVT::f128.
+      // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+      return VT != MVT::f128;
+    }
+
+    bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+      // FIXME: We insert fences for each atomics and generate
+      // sub-optimal code for PSO/TSO. (Approximately nobody uses any
+      // mode but TSO, which makes this even more silly)
+      return true;
+    }
+
+    AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+    void ReplaceNodeResults(SDNode *N,
+                            SmallVectorImpl<SDValue>& Results,
+                            SelectionDAG &DAG) const override;
+
+    MachineBasicBlock *expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
+                                      unsigned BROpcode) const;
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+                                        MachineBasicBlock *MBB) const;
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+                                         MachineBasicBlock *MBB) const;
+  };
+} // end namespace llvm
+
+#endif    // SPARC_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
new file mode 100644
index 000000000000..f6518c936ebc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -0,0 +1,541 @@
+//===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction definitions and patterns needed for 64-bit
+// code generation on SPARC v9.
+//
+// Some SPARC v9 instructions are defined in SparcInstrInfo.td because they can
+// also be used in 32-bit code running on a SPARC v9 CPU.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+// The same integer registers are used for i32 and i64 values.
+// When registers hold i32 values, the high bits are don't care.
+// This give us free trunc and anyext.
+def : Pat<(i64 (anyext i32:$val)), (COPY_TO_REGCLASS $val, I64Regs)>;
+def : Pat<(i32 (trunc i64:$val)), (COPY_TO_REGCLASS $val, IntRegs)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Shift Instructions.
+//===----------------------------------------------------------------------===//
+//
+// The 32-bit shift instructions are still available. The left shift srl
+// instructions shift all 64 bits, but it only accepts a 5-bit shift amount.
+//
+// The srl instructions only shift the low 32 bits and clear the high 32 bits.
+// Finally, sra shifts the low 32 bits and sign-extends to 64 bits.
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(i64 (zext i32:$val)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext i32:$val)), (SRAri $val, 0)>;
+
+def : Pat<(i64 (and i64:$val, 0xffffffff)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext_inreg i64:$val, i32)), (SRAri $val, 0)>;
+
+defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, I64Regs>;
+defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, I64Regs>;
+defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Immediates.
+//===----------------------------------------------------------------------===//
+//
+// All 32-bit immediates can be materialized with sethi+or, but 64-bit
+// immediates may require more code. There may be a point where it is
+// preferable to use a constant pool load instead, depending on the
+// microarchitecture.
+
+// Single-instruction patterns.
+
+// The ALU instructions want their simm13 operands as i32 immediates.
+def as_i32imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+def : Pat<(i64 simm13:$val), (ORri (i64 G0), (as_i32imm $val))>;
+def : Pat<(i64 SETHIimm:$val), (SETHIi (HI22 $val))>;
+
+// Double-instruction patterns.
+
+// All unsigned i32 immediates can be handled by sethi+or.
+def uimm32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+def : Pat<(i64 uimm32:$val), (ORri (SETHIi (HI22 $val)), (LO10 $val))>,
+      Requires<[Is64Bit]>;
+
+// All negative i33 immediates can be handled by sethi+xor.
+def nimm33 : PatLeaf<(imm), [{
+  int64_t Imm = N->getSExtValue();
+  return Imm < 0 && isInt<33>(Imm);
+}]>;
+// Bits 10-31 inverted. Same as assembler's %hix.
+def HIX22 : SDNodeXForm<imm, [{
+  uint64_t Val = (~N->getZExtValue() >> 10) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+// Bits 0-9 with ones in bits 10-31. Same as assembler's %lox.
+def LOX10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(~N->getZExtValue() & 0x3ff), SDLoc(N),
+                                   MVT::i32);
+}]>;
+def : Pat<(i64 nimm33:$val), (XORri (SETHIi (HIX22 $val)), (LOX10 $val))>,
+      Requires<[Is64Bit]>;
+
+// More possible patterns:
+//
+//   (sllx sethi, n)
+//   (sllx simm13, n)
+//
+// 3 instrs:
+//
+//   (xor (sllx sethi), simm13)
+//   (sllx (xor sethi, simm13))
+//
+// 4 instrs:
+//
+//   (or sethi, (sllx sethi))
+//   (xnor sethi, (sllx sethi))
+//
+// 5 instrs:
+//
+//   (or (sllx sethi), (or sethi, simm13))
+//   (xnor (sllx sethi), (or sethi, simm13))
+//   (or (sllx sethi), (sllx sethi))
+//   (xnor (sllx sethi), (sllx sethi))
+//
+// Worst case is 6 instrs:
+//
+//   (or (sllx (or sethi, simmm13)), (or sethi, simm13))
+
+// Bits 42-63, same as assembler's %hh.
+def HH22 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 42) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+// Bits 32-41, same as assembler's %hm.
+def HM10 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 32) & ((1u << 10) - 1);
+  return CurDAG->getTargetConstant(Val, SDLoc(N), MVT::i32);
+}]>;
+def : Pat<(i64 imm:$val),
+          (ORrr (SLLXri (ORri (SETHIi (HH22 $val)), (HM10 $val)), (i32 32)),
+                (ORri (SETHIi (HI22 $val)), (LO10 $val)))>,
+      Requires<[Is64Bit]>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Arithmetic and Logic.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+// Register-register instructions.
+let isCodeGenOnly = 1 in {
+defm ANDX    : F3_12<"and", 0b000001, and, I64Regs, i64, i64imm>;
+defm ORX     : F3_12<"or",  0b000010, or,  I64Regs, i64, i64imm>;
+defm XORX    : F3_12<"xor", 0b000011, xor, I64Regs, i64, i64imm>;
+
+def ANDXNrr  : F3_1<2, 0b000101,
+                 (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                 "andn $b, $c, $dst",
+                 [(set i64:$dst, (and i64:$b, (not i64:$c)))]>;
+def ORXNrr   : F3_1<2, 0b000110,
+                 (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                 "orn $b, $c, $dst",
+                 [(set i64:$dst, (or i64:$b, (not i64:$c)))]>;
+def XNORXrr  : F3_1<2, 0b000111,
+                   (outs I64Regs:$dst), (ins I64Regs:$b, I64Regs:$c),
+                   "xnor $b, $c, $dst",
+                   [(set i64:$dst, (not (xor i64:$b, i64:$c)))]>;
+
+defm ADDX    : F3_12<"add", 0b000000, add, I64Regs, i64, i64imm>;
+defm SUBX    : F3_12<"sub", 0b000100, sub, I64Regs, i64, i64imm>;
+
+def TLS_ADDXrr : F3_1<2, 0b000000, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, I64Regs:$rs2, TLSSym:$sym),
+                   "add $rs1, $rs2, $rd, $sym",
+                   [(set i64:$rd,
+                       (tlsadd i64:$rs1, i64:$rs2, tglobaltlsaddr:$sym))]>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+                     (outs I64Regs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
+}
+
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Multiply and Divide.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def MULXrr : F3_1<2, 0b001001,
+                  (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                  "mulx $rs1, $rs2, $rd",
+                  [(set i64:$rd, (mul i64:$rs1, i64:$rs2))]>;
+def MULXri : F3_2<2, 0b001001,
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                  "mulx $rs1, $simm13, $rd",
+                  [(set i64:$rd, (mul i64:$rs1, (i64 simm13:$simm13)))]>;
+
+// Division can trap.
+let hasSideEffects = 1 in {
+def SDIVXrr : F3_1<2, 0b101101,
+                   (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                   "sdivx $rs1, $rs2, $rd",
+                   [(set i64:$rd, (sdiv i64:$rs1, i64:$rs2))]>;
+def SDIVXri : F3_2<2, 0b101101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                   "sdivx $rs1, $simm13, $rd",
+                   [(set i64:$rd, (sdiv i64:$rs1, (i64 simm13:$simm13)))]>;
+
+def UDIVXrr : F3_1<2, 0b001101,
+                   (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2),
+                   "udivx $rs1, $rs2, $rd",
+                   [(set i64:$rd, (udiv i64:$rs1, i64:$rs2))]>;
+def UDIVXri : F3_2<2, 0b001101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, i64imm:$simm13),
+                   "udivx $rs1, $simm13, $rd",
+                   [(set i64:$rd, (udiv i64:$rs1, (i64 simm13:$simm13)))]>;
+} // hasSideEffects = 1
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Loads and Stores.
+//===----------------------------------------------------------------------===//
+//
+// All the 32-bit loads and stores are available. The extending loads are sign
+// or zero-extending to 64 bits. The LDrr and LDri instructions load 32 bits
+// zero-extended to i64. Their mnemonic is lduw in SPARC v9 (Load Unsigned
+// Word).
+//
+// SPARC v9 adds 64-bit loads as well as a sign-extending ldsw i32 loads.
+
+let Predicates = [Is64Bit] in {
+
+// 64-bit loads.
+let DecoderMethod = "DecodeLoadInt" in
+  defm LDX   : Load<"ldx", 0b001011, load, I64Regs, i64>;
+
+let mayLoad = 1, isCodeGenOnly = 1, isAsmParserOnly = 1 in
+  def TLS_LDXrr : F3_1<3, 0b001011,
+                       (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                       "ldx [$addr], $dst, $sym",
+                       [(set i64:$dst,
+                           (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+// Extending loads to i64.
+def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRrr:$addr)),  (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi8 ADDRri:$addr)),  (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRrr:$addr)), (LDSBrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRri:$addr)), (LDSBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi16 ADDRrr:$addr)), (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi16 ADDRri:$addr)), (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRrr:$addr)),  (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi16 ADDRri:$addr)),  (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRrr:$addr)), (LDSHrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRri:$addr)), (LDSHri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRrr:$addr)),  (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi32 ADDRri:$addr)),  (LDri ADDRri:$addr)>;
+
+// Sign-extending load of i32 into i64 is a new SPARC v9 instruction.
+let DecoderMethod = "DecodeLoadInt" in
+  defm LDSW   : Load<"ldsw", 0b001000, sextloadi32, I64Regs, i64>;
+
+// 64-bit stores.
+let DecoderMethod = "DecodeStoreInt" in
+  defm STX    : Store<"stx", 0b001110, store,  I64Regs, i64>;
+
+// Truncating stores from i64 are identical to the i32 stores.
+def : Pat<(truncstorei8  i64:$src, ADDRrr:$addr), (STBrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei8  i64:$src, ADDRri:$addr), (STBri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRrr:$addr), (STHrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr  ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri  ADDRri:$addr, $src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i64 0), ADDRrr:$dst), (STXrr ADDRrr:$dst, (i64 G0))>;
+def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Conditionals.
+//===----------------------------------------------------------------------===//
+
+//
+// Flag-setting instructions like subcc and addcc set both icc and xcc flags.
+// The icc flags correspond to the 32-bit result, and the xcc are for the
+// full 64-bit result.
+//
+// We reuse CMPICC SDNodes for compares, but use new BRXCC branch nodes for
+// 64-bit compares. See LowerBR_CC.
+
+let Predicates = [Is64Bit] in {
+
+let Uses = [ICC], cc = 0b10 in
+  defm BPX : IPredBranch<"%xcc", [(SPbrxcc bb:$imm19, imm:$cond)]>;
+
+// Conditional moves on %xcc.
+let Uses = [ICC], Constraints = "$f = $rd" in {
+let intcc = 1, cc = 0b10 in {
+def MOVXCCrr : F4_1<0b101100, (outs IntRegs:$rd),
+                      (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $rs2, $rd",
+                      [(set i32:$rd,
+                       (SPselectxcc i32:$rs2, i32:$f, imm:$cond))]>;
+def MOVXCCri : F4_2<0b101100, (outs IntRegs:$rd),
+                      (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $simm11, $rd",
+                      [(set i32:$rd,
+                       (SPselectxcc simm11:$simm11, i32:$f, imm:$cond))]>;
+} // cc
+
+let intcc = 1, opf_cc = 0b10 in {
+def FMOVS_XCC : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+                      (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+                      "fmovs$cond %xcc, $rs2, $rd",
+                      [(set f32:$rd,
+                       (SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
+def FMOVD_XCC : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+                      (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+                      "fmovd$cond %xcc, $rs2, $rd",
+                      [(set f64:$rd,
+                       (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
+def FMOVQ_XCC : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+                      (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+                      "fmovq$cond %xcc, $rs2, $rd",
+                      [(set f128:$rd,
+                       (SPselectxcc f128:$rs2, f128:$f, imm:$cond))]>;
+} // opf_cc
+} // Uses, Constraints
+
+// Branch On integer register with Prediction (BPr).
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in
+multiclass BranchOnReg<bits<3> cond, string OpcStr> {
+  def napt : F2_4<cond, 0, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, " $rs1, $imm16"), []>;
+  def apt  : F2_4<cond, 1, 1, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",a $rs1, $imm16"), []>;
+  def napn  : F2_4<cond, 0, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",pn $rs1, $imm16"), []>;
+  def apn : F2_4<cond, 1, 0, (outs), (ins I64Regs:$rs1, bprtarget16:$imm16),
+             !strconcat(OpcStr, ",a,pn $rs1, $imm16"), []>;
+}
+
+multiclass bpr_alias<string OpcStr, Instruction NAPT, Instruction APT> {
+  def : InstAlias<!strconcat(OpcStr, ",pt $rs1, $imm16"),
+                  (NAPT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+  def : InstAlias<!strconcat(OpcStr, ",a,pt $rs1, $imm16"),
+                  (APT I64Regs:$rs1, bprtarget16:$imm16), 0>;
+}
+
+defm BPZ   : BranchOnReg<0b001, "brz">;
+defm BPLEZ : BranchOnReg<0b010, "brlez">;
+defm BPLZ  : BranchOnReg<0b011, "brlz">;
+defm BPNZ  : BranchOnReg<0b101, "brnz">;
+defm BPGZ  : BranchOnReg<0b110, "brgz">;
+defm BPGEZ : BranchOnReg<0b111, "brgez">;
+
+defm : bpr_alias<"brz",   BPZnapt,   BPZapt  >;
+defm : bpr_alias<"brlez", BPLEZnapt, BPLEZapt>;
+defm : bpr_alias<"brlz",  BPLZnapt,  BPLZapt >;
+defm : bpr_alias<"brnz",  BPNZnapt,  BPNZapt >;
+defm : bpr_alias<"brgz",  BPGZnapt,  BPGZapt >;
+defm : bpr_alias<"brgez", BPGEZnapt, BPGEZapt>;
+
+// Move integer register on register condition (MOVr).
+multiclass MOVR< bits<3> rcond,  string OpcStr> {
+  def rr : F4_4r<0b101111, 0b00000, rcond, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, IntRegs:$rs2),
+                   !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+  def ri : F4_4i<0b101111, rcond, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs1, i64imm:$simm10),
+                   !strconcat(OpcStr, " $rs1, $simm10, $rd"), []>;
+}
+
+defm MOVRRZ  : MOVR<0b001, "movrz">;
+defm MOVRLEZ : MOVR<0b010, "movrlez">;
+defm MOVRLZ  : MOVR<0b011, "movrlz">;
+defm MOVRNZ  : MOVR<0b101, "movrnz">;
+defm MOVRGZ  : MOVR<0b110, "movrgz">;
+defm MOVRGEZ : MOVR<0b111, "movrgez">;
+
+// Move FP register on integer register condition (FMOVr).
+multiclass FMOVR<bits<3> rcond, string OpcStr> {
+
+  def S : F4_4r<0b110101, 0b00101, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrs", OpcStr)," $rs1, $rs2, $rd"),
+                []>;
+  def D : F4_4r<0b110101, 0b00110, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrd", OpcStr)," $rs1, $rs2, $rd"),
+                []>;
+  def Q : F4_4r<0b110101, 0b00111, rcond,
+                (outs FPRegs:$rd), (ins I64Regs:$rs1, FPRegs:$rs2),
+                !strconcat(!strconcat("fmovrq", OpcStr)," $rs1, $rs2, $rd"),
+                []>, Requires<[HasHardQuad]>;
+}
+
+let Predicates = [HasV9] in {
+  defm FMOVRZ   : FMOVR<0b001, "z">;
+  defm FMOVRLEZ : FMOVR<0b010, "lez">;
+  defm FMOVRLZ  : FMOVR<0b011, "lz">;
+  defm FMOVRNZ  : FMOVR<0b101, "nz">;
+  defm FMOVRGZ  : FMOVR<0b110, "gz">;
+  defm FMOVRGEZ : FMOVR<0b111, "gez">;
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtos $rs2, $rd",
+                 [(set FPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtod $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
+                 (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fxtoq $rs2, $rd",
+                 [(set QFPRegs:$rd, (SPxtof DFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox FPRegs:$rs2))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox DFPRegs:$rs2))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011,
+                 (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtox $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPftox QFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
+def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
+          (MOVXCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselecticc i64:$t, i64:$f, imm:$cond),
+          (MOVICCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselecticc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVICCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselectfcc i64:$t, i64:$f, imm:$cond),
+          (MOVFCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
+
+} // Predicates = [Is64Bit]
+
+
+// 64 bit SETHI
+let Predicates = [Is64Bit], isCodeGenOnly = 1 in {
+def SETHIXi : F2_1<0b100,
+                   (outs IntRegs:$rd), (ins i64imm:$imm22),
+                   "sethi $imm22, $rd",
+                   [(set i64:$rd, SETHIimm:$imm22)]>;
+}
+
+// ATOMICS.
+let Predicates = [Is64Bit], Constraints = "$swap = $rd", asi = 0b10000000 in {
+  def CASXrr: F3_1_asi<3, 0b111110,
+                (outs I64Regs:$rd), (ins I64Regs:$rs1, I64Regs:$rs2,
+                                     I64Regs:$swap),
+                 "casx [$rs1], $rs2, $rd",
+                 [(set i64:$rd,
+                     (atomic_cmp_swap_64 i64:$rs1, i64:$rs2, i64:$swap))]>;
+
+} // Predicates = [Is64Bit], Constraints = ...
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(atomic_fence imm, imm), (MEMBARi 0xf)>;
+
+// atomic_load_64 addr -> load addr
+def : Pat<(i64 (atomic_load_64 ADDRrr:$src)), (LDXrr ADDRrr:$src)>;
+def : Pat<(i64 (atomic_load_64 ADDRri:$src)), (LDXri ADDRri:$src)>;
+
+// atomic_store_64 val, addr -> store val, addr
+def : Pat<(atomic_store_64 ADDRrr:$dst, i64:$val), (STXrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_64 ADDRri:$dst, i64:$val), (STXri ADDRri:$dst, $val)>;
+
+} // Predicates = [Is64Bit]
+
+let Predicates = [Is64Bit], hasSideEffects = 1, Uses = [ICC], cc = 0b10 in
+ defm TXCC : TRAP<"%xcc">;
+
+// Global addresses, constant pool entries
+let Predicates = [Is64Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORXri (i64 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORXri (i64 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORXri (i64 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDXri (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORXri  (SETHIXi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORXri (i64 G0), tblockaddress:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDXri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)),  (ADDXri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+                        (ADDXri $r, tblockaddress:$in)>;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
new file mode 100644
index 000000000000..df570cea8da8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -0,0 +1,506 @@
+//===-- SparcInstrAliases.td - Instruction Aliases for Sparc Target -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction aliases for Sparc.
+//===----------------------------------------------------------------------===//
+
+// Instruction aliases for conditional moves.
+
+// mov<cond> <ccreg> rs2, rd
+multiclass intcond_mov_alias<string cond, int condVal, string ccreg,
+                          Instruction movrr, Instruction movri,
+                          Instruction fmovs, Instruction fmovd> {
+
+  // mov<cond> (%icc|%xcc), rs2, rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (movrr IntRegs:$rd, IntRegs:$rs2, condVal)>;
+
+  // mov<cond> (%icc|%xcc), simm11, rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("mov", cond), ccreg),
+                             ", $simm11, $rd"),
+                  (movri IntRegs:$rd, i32imm:$simm11, condVal)>;
+
+  // fmovs<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("fmovs", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FPRegs:$rs2, condVal)>;
+
+  // fmovd<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat(!strconcat("fmovd", cond), ccreg),
+                             ", $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, DFPRegs:$rs2, condVal)>;
+}
+
+// mov<cond> <ccreg> rs2, rd
+multiclass fpcond_mov_alias<string cond, int condVal,
+                           Instruction movrr, Instruction movri,
+                           Instruction fmovs, Instruction fmovd> {
+
+  // mov<cond> %fcc[0-3], rs2, rd
+  def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $rs2, $rd"),
+                  (movrr IntRegs:$rd, FCCRegs:$cc, IntRegs:$rs2, condVal)>;
+
+  // mov<cond> %fcc[0-3], simm11, rd
+  def : InstAlias<!strconcat(!strconcat("mov", cond), " $cc, $simm11, $rd"),
+                  (movri IntRegs:$rd, FCCRegs:$cc, i32imm:$simm11, condVal)>;
+
+  // fmovs<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovs", cond), " $cc, $rs2, $rd"),
+                  (fmovs FPRegs:$rd, FCCRegs:$cc, FPRegs:$rs2, condVal)>;
+
+  // fmovd<cond> %fcc[0-3], $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovd", cond), " $cc, $rs2, $rd"),
+                  (fmovd DFPRegs:$rd, FCCRegs:$cc, DFPRegs:$rs2, condVal)>;
+}
+
+// Instruction aliases for integer conditional branches and moves.
+multiclass int_cond_alias<string cond, int condVal> {
+
+  // b<cond> $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " $imm"),
+                  (BCOND brtarget:$imm, condVal)>;
+
+  // b<cond>,a $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a $imm"),
+                  (BCONDA brtarget:$imm, condVal)>;
+
+  // b<cond> %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " %icc, $imm"),
+                  (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,pt %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %icc, $imm"),
+                  (BPICC brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a %icc, $imm"),
+                  (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a,pt %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %icc, $imm"),
+                  (BPICCA brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,pn %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %icc, $imm"),
+                  (BPICCNT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond>,a,pn %icc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %icc, $imm"),
+                  (BPICCANT brtarget:$imm, condVal)>, Requires<[HasV9]>;
+
+  // b<cond> %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), " %xcc, $imm"),
+                  (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,pt %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pt %xcc, $imm"),
+                  (BPXCC brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a %xcc, $imm"),
+                  (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a,pt %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pt %xcc, $imm"),
+                  (BPXCCA brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,pn %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",pn %xcc, $imm"),
+                  (BPXCCNT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+  // b<cond>,a,pn %xcc, $imm
+  def : InstAlias<!strconcat(!strconcat("b", cond), ",a,pn %xcc, $imm"),
+                  (BPXCCANT brtarget:$imm, condVal)>, Requires<[Is64Bit]>;
+
+
+  defm : intcond_mov_alias<cond, condVal, " %icc",
+                            MOVICCrr, MOVICCri,
+                            FMOVS_ICC, FMOVD_ICC>, Requires<[HasV9]>;
+
+  defm : intcond_mov_alias<cond, condVal, " %xcc",
+                            MOVXCCrr, MOVXCCri,
+                            FMOVS_XCC, FMOVD_XCC>, Requires<[Is64Bit]>;
+
+  // fmovq<cond> (%icc|%xcc), $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %icc, $rs2, $rd"),
+                  (FMOVQ_ICC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+                  Requires<[HasV9, HasHardQuad]>;
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " %xcc, $rs2, $rd"),
+                  (FMOVQ_XCC QFPRegs:$rd, QFPRegs:$rs2, condVal)>,
+                  Requires<[Is64Bit, HasHardQuad]>;
+
+  // t<cond> %icc,  rs => t<cond> %icc, G0 + rs
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs2"),
+                  (TICCrr G0, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %icc, rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $rs2"),
+                  (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+
+  // t<cond> %xcc, rs => t<cond> %xcc, G0 + rs
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs2"),
+                  (TXCCrr G0, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %xcc, rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $rs2"),
+                  (TXCCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+                  Requires<[HasV9]>;
+
+
+  // t<cond> rs=> t<cond> %icc,  G0 + rs2
+  //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs2"),
+  //                (TICCrr G0, IntRegs:$rs2, condVal)>,
+  //                Requires<[HasV9]>;
+
+  // t<cond> rs1 + rs2 => t<cond> %icc, rs1 + rs2
+  //def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+  //                (TICCrr IntRegs:$rs1, IntRegs:$rs2, condVal)>,
+  //                Requires<[HasV9]>;
+
+  // t<cond> %icc, imm => t<cond> %icc, G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $imm"),
+                  (TICCri G0, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %icc, rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %icc, $rs1 + $imm"),
+                  (TICCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %xcc, imm => t<cond> %xcc, G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $imm"),
+                  (TXCCri G0, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+  // t<cond> %xcc, rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " %xcc, $rs1 + $imm"),
+                  (TXCCri IntRegs:$rs1, i32imm:$imm, condVal)>,
+                  Requires<[HasV9]>;
+
+  // t<cond> imm => t<cond> G0 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $imm"),
+                  (TRAPri G0, i32imm:$imm, condVal)>;
+
+  // t<cond> rs1 + imm => t<cond> rs1 + imm
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $imm"),
+                  (TRAPri IntRegs:$rs1, i32imm:$imm, condVal)>;
+
+  // t<cond> rs1 => t<cond> G0 + rs1
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1"),
+                  (TRAPrr G0, IntRegs:$rs1, condVal)>;
+
+  // t<cond> rs1 + rs2
+  def : InstAlias<!strconcat(!strconcat("t", cond), " $rs1 + $rs2"),
+                  (TRAPrr IntRegs:$rs1, IntRegs:$rs2, condVal)>;
+}
+
+
+// Instruction aliases for floating point conditional branches and moves.
+multiclass fp_cond_alias<string cond, int condVal> {
+
+  // fb<cond> $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), " $imm"),
+                  (FBCOND brtarget:$imm, condVal), 0>;
+
+  // fb<cond>,a $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $imm"),
+                  (FBCONDA brtarget:$imm, condVal), 0>;
+
+  // fb<cond> %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), " $cc, $imm"),
+                  (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,pt %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",pt $cc, $imm"),
+                  (BPFCC brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,a %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a $cc, $imm"),
+                  (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  // fb<cond>,a,pt %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pt $cc, $imm"),
+                  (BPFCCA brtarget:$imm, condVal, FCCRegs:$cc)>,
+                   Requires<[HasV9]>;
+
+  // fb<cond>,pn %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",pn $cc, $imm"),
+                  (BPFCCNT brtarget:$imm, condVal, FCCRegs:$cc)>,
+                   Requires<[HasV9]>;
+
+  // fb<cond>,a,pn %fcc0, $imm
+  def : InstAlias<!strconcat(!strconcat("fb", cond), ",a,pn $cc, $imm"),
+                  (BPFCCANT brtarget:$imm, condVal, FCCRegs:$cc)>,
+                  Requires<[HasV9]>;
+
+  defm : fpcond_mov_alias<cond, condVal,
+                          V9MOVFCCrr, V9MOVFCCri,
+                          V9FMOVS_FCC, V9FMOVD_FCC>, Requires<[HasV9]>;
+
+  // fmovq<cond> %fcc0, $rs2, $rd
+  def : InstAlias<!strconcat(!strconcat("fmovq", cond), " $cc, $rs2, $rd"),
+                  (V9FMOVQ_FCC QFPRegs:$rd, FCCRegs:$cc, QFPRegs:$rs2,
+                                                          condVal)>,
+                  Requires<[HasV9, HasHardQuad]>;
+}
+
+
+// Instruction aliases for co-processor conditional branches.
+multiclass cp_cond_alias<string cond, int condVal> {
+
+  // cb<cond> $imm
+  def : InstAlias<!strconcat(!strconcat("cb", cond), " $imm"),
+                  (CBCOND brtarget:$imm, condVal), 0>;
+
+  // cb<cond>,a $imm
+  def : InstAlias<!strconcat(!strconcat("cb", cond), ",a $imm"),
+                  (CBCONDA brtarget:$imm, condVal), 0>;
+}
+
+defm : int_cond_alias<"a",    0b1000>;
+defm : int_cond_alias<"n",    0b0000>;
+defm : int_cond_alias<"ne",   0b1001>;
+defm : int_cond_alias<"e",    0b0001>;
+defm : int_cond_alias<"g",    0b1010>;
+defm : int_cond_alias<"le",   0b0010>;
+defm : int_cond_alias<"ge",   0b1011>;
+defm : int_cond_alias<"l",    0b0011>;
+defm : int_cond_alias<"gu",   0b1100>;
+defm : int_cond_alias<"leu",  0b0100>;
+defm : int_cond_alias<"cc",   0b1101>;
+defm : int_cond_alias<"cs",   0b0101>;
+defm : int_cond_alias<"pos",  0b1110>;
+defm : int_cond_alias<"neg",  0b0110>;
+defm : int_cond_alias<"vc",   0b1111>;
+defm : int_cond_alias<"vs",   0b0111>;
+let EmitPriority = 0 in 
+{
+  defm : int_cond_alias<"",     0b1000>; // same as a; gnu asm, not in manual
+  defm : int_cond_alias<"nz",   0b1001>; // same as ne
+  defm : int_cond_alias<"eq",   0b0001>; // same as e
+  defm : int_cond_alias<"z",    0b0001>; // same as e
+  defm : int_cond_alias<"geu",  0b1101>; // same as cc
+  defm : int_cond_alias<"lu",   0b0101>; // same as cs
+}
+defm : fp_cond_alias<"a",     0b1000>;
+defm : fp_cond_alias<"n",     0b0000>;
+defm : fp_cond_alias<"u",     0b0111>;
+defm : fp_cond_alias<"g",     0b0110>;
+defm : fp_cond_alias<"ug",    0b0101>;
+defm : fp_cond_alias<"l",     0b0100>;
+defm : fp_cond_alias<"ul",    0b0011>;
+defm : fp_cond_alias<"lg",    0b0010>;
+defm : fp_cond_alias<"ne",    0b0001>;
+defm : fp_cond_alias<"e",     0b1001>;
+defm : fp_cond_alias<"ue",    0b1010>;
+defm : fp_cond_alias<"ge",    0b1011>;
+defm : fp_cond_alias<"uge",   0b1100>;
+defm : fp_cond_alias<"le",    0b1101>;
+defm : fp_cond_alias<"ule",   0b1110>;
+defm : fp_cond_alias<"o",     0b1111>;
+let EmitPriority = 0 in 
+{
+  defm : fp_cond_alias<"",      0b1000>; // same as a; gnu asm, not in manual
+  defm : fp_cond_alias<"nz",    0b0001>; // same as ne
+  defm : fp_cond_alias<"z",     0b1001>; // same as e
+}
+
+defm : cp_cond_alias<"a",     0b1000>;
+defm : cp_cond_alias<"n",     0b0000>;
+defm : cp_cond_alias<"3",     0b0111>;
+defm : cp_cond_alias<"2",     0b0110>;
+defm : cp_cond_alias<"23",    0b0101>;
+defm : cp_cond_alias<"1",     0b0100>;
+defm : cp_cond_alias<"13",    0b0011>;
+defm : cp_cond_alias<"12",    0b0010>;
+defm : cp_cond_alias<"123",   0b0001>;
+defm : cp_cond_alias<"0",     0b1001>;
+defm : cp_cond_alias<"03",    0b1010>;
+defm : cp_cond_alias<"02",    0b1011>;
+defm : cp_cond_alias<"023",   0b1100>;
+defm : cp_cond_alias<"01",    0b1101>;
+defm : cp_cond_alias<"013",   0b1110>;
+defm : cp_cond_alias<"012",   0b1111>;
+let EmitPriority = 0 in defm : cp_cond_alias<"",      0b1000>; // same as a; gnu asm, not in manual
+
+// Section A.3 Synthetic Instructions
+
+// Most are marked as Emit=0, so that they are not used for disassembly. This is
+// an aesthetic issue, but the chosen policy is to typically prefer using the
+// non-alias form, except for the most obvious and clarifying aliases: cmp, jmp,
+// call, tst, ret, retl.
+
+// Note: cmp is handled in SparcInstrInfo.
+//       jmp/call/ret/retl have special case handling for output in
+//       SparcInstPrinter.cpp
+
+// jmp addr -> jmpl addr, %g0
+def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>;
+def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>;
+
+// call addr -> jmpl addr, %o7
+def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>;
+def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>;
+
+// tst reg -> orcc %g0, reg, %g0
+def : InstAlias<"tst $rs2", (ORCCrr G0, IntRegs:$rs2, G0)>;
+
+// ret -> jmpl %i7+8, %g0 (aka RET 8)
+def : InstAlias<"ret", (RET 8)>;
+
+// retl -> jmpl %o7+8, %g0 (aka RETL 8)
+def : InstAlias<"retl", (RETL 8)>;
+
+// restore -> restore %g0, %g0, %g0
+def : InstAlias<"restore", (RESTORErr G0, G0, G0)>;
+
+// save -> restore %g0, %g0, %g0
+def : InstAlias<"save", (SAVErr G0, G0, G0)>;
+
+// set value, rd
+// (turns into a sequence of sethi+or, depending on the value)
+// def : InstAlias<"set $val, $rd", (ORri IntRegs:$rd, (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+def SET : AsmPseudoInst<(outs IntRegs:$rd), (ins i32imm:$val), "set $val, $rd">;
+
+// not rd -> xnor rd, %g0, rd
+def : InstAlias<"not $rd", (XNORrr IntRegs:$rd, IntRegs:$rd, G0), 0>;
+
+// not reg, rd -> xnor reg, %g0, rd
+def : InstAlias<"not $rs1, $rd", (XNORrr IntRegs:$rd, IntRegs:$rs1, G0), 0>;
+
+// neg rd -> sub %g0, rd, rd
+def : InstAlias<"neg $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rd), 0>;
+
+// neg reg, rd -> sub %g0, reg, rd
+def : InstAlias<"neg $rs2, $rd", (SUBrr IntRegs:$rd, G0, IntRegs:$rs2), 0>;
+
+// inc rd -> add rd, 1, rd
+def : InstAlias<"inc $rd", (ADDri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inc simm13, rd -> add rd, simm13, rd
+def : InstAlias<"inc $simm13, $rd", (ADDri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// inccc rd -> addcc rd, 1, rd
+def : InstAlias<"inccc $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// inccc simm13, rd -> addcc rd, simm13, rd
+def : InstAlias<"inccc $simm13, $rd", (ADDCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// dec rd -> sub rd, 1, rd
+def : InstAlias<"dec $rd", (SUBri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// dec simm13, rd -> sub rd, simm13, rd
+def : InstAlias<"dec $simm13, $rd", (SUBri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// deccc rd -> subcc rd, 1, rd
+def : InstAlias<"deccc $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, 1), 0>;
+
+// deccc simm13, rd -> subcc rd, simm13, rd
+def : InstAlias<"deccc $simm13, $rd", (SUBCCri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btst reg_or_imm, reg -> andcc reg,reg_or_imm,%g0
+def : InstAlias<"btst $rs2, $rs1", (ANDCCrr G0, IntRegs:$rs1, IntRegs:$rs2), 0>;
+def : InstAlias<"btst $simm13, $rs1", (ANDCCri G0, IntRegs:$rs1, i32imm:$simm13), 0>;
+
+// bset reg_or_imm, rd -> or rd,reg_or_imm,rd
+def : InstAlias<"bset $rs2, $rd", (ORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bset $simm13, $rd", (ORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// bclr reg_or_imm, rd -> andn rd,reg_or_imm,rd
+def : InstAlias<"bclr $rs2, $rd", (ANDNrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"bclr $simm13, $rd", (ANDNri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+// btog reg_or_imm, rd -> xor rd,reg_or_imm,rd
+def : InstAlias<"btog $rs2, $rd", (XORrr IntRegs:$rd, IntRegs:$rd, IntRegs:$rs2), 0>;
+def : InstAlias<"btog $simm13, $rd", (XORri IntRegs:$rd, IntRegs:$rd, i32imm:$simm13), 0>;
+
+
+// clr rd -> or %g0, %g0, rd
+def : InstAlias<"clr $rd", (ORrr IntRegs:$rd, G0, G0), 0>;
+
+// clr{b,h,} [addr] -> st{b,h,} %g0, [addr]
+def : InstAlias<"clrb [$addr]", (STBrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrb [$addr]", (STBri MEMri:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clrh [$addr]", (STHri MEMri:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STrr MEMrr:$addr, G0), 0>;
+def : InstAlias<"clr [$addr]", (STri MEMri:$addr, G0), 0>;
+
+
+// mov reg_or_imm, rd -> or %g0, reg_or_imm, rd
+def : InstAlias<"mov $rs2, $rd", (ORrr IntRegs:$rd, G0, IntRegs:$rs2)>;
+def : InstAlias<"mov $simm13, $rd", (ORri IntRegs:$rd, G0, i32imm:$simm13)>;
+
+// mov specialreg, rd -> rd specialreg, rd
+def : InstAlias<"mov $asr, $rd", (RDASR IntRegs:$rd, ASRRegs:$asr), 0>;
+def : InstAlias<"mov %psr, $rd", (RDPSR IntRegs:$rd), 0>;
+def : InstAlias<"mov %wim, $rd", (RDWIM IntRegs:$rd), 0>;
+def : InstAlias<"mov %tbr, $rd", (RDTBR IntRegs:$rd), 0>;
+
+// mov reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+def : InstAlias<"mov $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"mov $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"mov $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+// End of Section A.3
+
+// wr reg_or_imm, specialreg -> wr %g0, reg_or_imm, specialreg
+// (aka: omit the first arg when it's g0. This is not in the manual, but is
+// supported by gnu and solaris as)
+def : InstAlias<"wr $rs2, $asr", (WRASRrr ASRRegs:$asr, G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, $asr", (WRASRri ASRRegs:$asr, G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %psr", (WRPSRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %psr", (WRPSRri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %wim", (WRWIMrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %wim", (WRWIMri G0, i32imm:$simm13), 0>;
+def : InstAlias<"wr $rs2, %tbr", (WRTBRrr G0, IntRegs:$rs2), 0>;
+def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>;
+
+
+// flush -> flush %g0
+def : InstAlias<"flush", (FLUSH), 0>;
+
+
+def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>;
+def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>;
+def : MnemonicAlias<"addccc", "addxcc">, Requires<[HasV9]>;
+
+def : MnemonicAlias<"subc", "subx">, Requires<[HasV9]>;
+def : MnemonicAlias<"subccc", "subxcc">, Requires<[HasV9]>;
+
+
+def : InstAlias<"fcmps $rs1, $rs2", (V9FCMPS FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmpd $rs1, $rs2", (V9FCMPD FCC0, DFPRegs:$rs1, DFPRegs:$rs2)>;
+def : InstAlias<"fcmpq $rs1, $rs2", (V9FCMPQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>,
+                Requires<[HasHardQuad]>;
+
+def : InstAlias<"fcmpes $rs1, $rs2", (V9FCMPES FCC0, FPRegs:$rs1, FPRegs:$rs2)>;
+def : InstAlias<"fcmped $rs1, $rs2", (V9FCMPED FCC0, DFPRegs:$rs1,
+                                                     DFPRegs:$rs2)>;
+def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1,
+                                                     QFPRegs:$rs2)>,
+                Requires<[HasHardQuad]>;
+
+// signx rd -> sra rd, %g0, rd
+def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>;
+
+// signx reg, rd -> sra reg, %g0, rd
+def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 000000000000..76366c6695f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,369 @@
+//===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern,
+             InstrItinClass itin = NoItinerary>
+   : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "SP";
+  let Size = 4;
+
+  bits<2> op;
+  let Inst{31-30} = op;               // Top two bits are the 'op' field
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+
+  let DecoderNamespace = "Sparc";
+  field bits<32> SoftFail = 0;
+  
+  let Itinerary = itin;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin = NoItinerary>
+   : InstSP<outs, ins, asmstr, pattern, itin> {
+  bits<3>  op2;
+  bits<22> imm22;
+  let op          = 0;    // op = 0
+  let Inst{24-22} = op2;
+  let Inst{21-0}  = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : F2<outs, ins, asmstr, pattern, itin> {
+  bits<5>  rd;
+
+  let op2         = op2Val;
+
+  let Inst{29-25} = rd;
+}
+
+class F2_2<bits<3> op2Val, bit annul, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : F2<outs, ins, asmstr, pattern, itin> {
+  bits<4>   cond;
+  let op2         = op2Val;
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+}
+
+class F2_3<bits<3> op2Val, bit annul, bit pred,
+           dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : InstSP<outs, ins, asmstr, pattern, itin> {
+  bits<2>  cc;
+  bits<4>  cond;
+  bits<19> imm19;
+
+  let op          = 0;    // op = 0
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+  let Inst{24-22} = op2Val;
+  let Inst{21-20} = cc;
+  let Inst{19}    = pred;
+  let Inst{18-0}  = imm19;
+}
+
+class F2_4<bits<3> cond, bit annul, bit pred, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : InstSP<outs, ins, asmstr, pattern, itin> {
+  bits<16> imm16;
+  bits<5>  rs1;
+
+  let op          = 0;    // op = 0
+
+  let Inst{29}    = annul;
+  let Inst{28}    = 0;
+  let Inst{27-25} = cond;
+  let Inst{24-22} = 0b011;
+  let Inst{21-20} = imm16{15-14};
+  let Inst{19}    = pred;
+  let Inst{18-14} = rs1;
+  let Inst{13-0}  = imm16{13-0};
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin = NoItinerary>
+   : InstSP<outs, ins, asmstr, pattern, itin> {
+  bits<5> rd;
+  bits<6> op3;
+  bits<5> rs1;
+  let op{1} = 1;   // Op = 2 or 3
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+  let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+class F3_1_asi<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bits<8> asi;
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12-5} = asi;   // address space identifier
+  let Inst{4-0}  = rs2;
+}
+
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins, string asmstr,
+       list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+  : F3_1_asi<opVal, op3val, outs, ins, asmstr, pattern, itin> {
+  let asi = 0;
+}
+
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bits<13> simm13;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12-0} = simm13;
+}
+
+// floating-point
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point unary operations.
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rs1        = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point compares.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern, InstrItinClass itin = NoItinerary>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by register rs2.
+class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12}   = x;     // extended registers.
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by immediate.
+class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern, InstrItinClass itin = IIC_iu_instr>
+   : F3<outs, ins, asmstr, pattern, itin> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<6> shcnt;          // shcnt32 / shcnt64.
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12}   = x;     // extended registers.
+  let Inst{5-0}  = shcnt;
+}
+
+// Define rr and ri shift instructions with patterns.
+multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
+                ValueType VT, RegisterClass RC,
+                InstrItinClass itin = IIC_iu_instr> {
+  def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, IntRegs:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs1, i32:$rs2))],
+                 itin>;
+  def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs1, i32imm:$shcnt),
+                 !strconcat(OpcStr, " $rs1, $shcnt, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs1, (i32 imm:$shcnt)))],
+                 itin>;
+}
+
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : InstSP<outs, ins, asmstr, pattern, itin> {
+  bits<5> rd;
+
+  let op          = 2;
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+}
+
+
+class F4_1<bits<6> op3, dag outs, dag ins,
+           string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : F4<op3, outs, ins, asmstr, pattern, itin> {
+  bit    intcc;
+  bits<2> cc;
+  bits<4> cond;
+  bits<5> rs2;
+
+  let Inst{4-0}   = rs2;
+  let Inst{12-11} = cc;
+  let Inst{13}    = 0;
+  let Inst{17-14} = cond;
+  let Inst{18}    = intcc;
+}
+
+class F4_2<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern,
+            InstrItinClass itin = NoItinerary>
+   : F4<op3, outs, ins, asmstr, pattern, itin> {
+  bit      intcc;
+  bits<2>  cc;
+  bits<4>  cond;
+  bits<11> simm11;
+
+  let Inst{10-0}  = simm11;
+  let Inst{12-11} = cc;
+  let Inst{13}    = 1;
+  let Inst{17-14} = cond;
+  let Inst{18}    = intcc;
+}
+
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+           string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : F4<op3, outs, ins, asmstr, pattern, itin> {
+  bits<4> cond;
+  bit     intcc;
+  bits<2> opf_cc;
+  bits<5> rs2;
+
+  let Inst{18}     = 0;
+  let Inst{17-14}  = cond;
+  let Inst{13}     = intcc;
+  let Inst{12-11}  = opf_cc;
+  let Inst{10-5}   = opf_low;
+  let Inst{4-0}    = rs2;
+}
+
+class F4_4r<bits<6> op3, bits<5> opf_low, bits<3> rcond, dag outs, dag ins,
+            string asmstr, list<dag> pattern,
+            InstrItinClass itin = NoItinerary>
+   : F4<op3, outs, ins, asmstr, pattern, itin> {
+  bits <5> rs1;
+  bits <5> rs2;
+  let Inst{18-14} = rs1;
+  let Inst{13}    = 0;  // IsImm
+  let Inst{12-10} = rcond;
+  let Inst{9-5}   = opf_low;
+  let Inst{4-0}   = rs2;
+}
+
+
+class F4_4i<bits<6> op3, bits<3> rcond, dag outs, dag ins,
+            string asmstr, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+   : F4<op3, outs, ins, asmstr, pattern, itin> {
+  bits<5> rs1;
+  bits<10> simm10;
+  let Inst{18-14} = rs1;
+  let Inst{13}    = 1;  // IsImm
+  let Inst{12-10} = rcond;
+  let Inst{9-0}   = simm10;
+}
+
+
+class TRAPSP<bits<6> op3Val, bit isimm, dag outs, dag ins,
+             string asmstr, list<dag> pattern,
+             InstrItinClass itin = NoItinerary>
+   : F3<outs, ins, asmstr, pattern, itin> {
+   bits<4> cond;
+   bits<2> cc;
+
+   let op = 0b10;
+   let rd{4} = 0;
+   let rd{3-0} = cond;
+   let op3 = op3Val;
+   let Inst{13} = isimm;
+   let Inst{12-11} = cc;
+
+}
+
+class TRAPSPrr<bits<6> op3Val, dag outs, dag ins,
+               string asmstr, list<dag> pattern,
+               InstrItinClass itin = NoItinerary>
+   : TRAPSP<op3Val, 0, outs, ins, asmstr, pattern, itin> {
+   bits<5> rs2;
+
+   let Inst{10-5} = 0;
+   let Inst{4-0}  = rs2;
+}
+
+class TRAPSPri<bits<6> op3Val, dag outs, dag ins,
+               string asmstr, list<dag> pattern,
+               InstrItinClass itin = NoItinerary>
+   : TRAPSP<op3Val, 1, outs, ins, asmstr, pattern, itin> {
+   bits<8> imm;
+
+   let Inst{10-8} = 0;
+   let Inst{7-0}  = imm;
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class AsmPseudoInst<dag outs, dag ins, string asm>
+  : InstSP<outs, ins, asm, []> {
+  let isPseudo = 1;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 000000000000..ea8ed830bafc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,510 @@
+//===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "SparcGenInstrInfo.inc"
+
+// Pin the vtable to this file.
+void SparcInstrInfo::anchor() {}
+
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+    : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(),
+      Subtarget(ST) {}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  if (MI.getOpcode() == SP::LDri || MI.getOpcode() == SP::LDXri ||
+      MI.getOpcode() == SP::LDFri || MI.getOpcode() == SP::LDDFri ||
+      MI.getOpcode() == SP::LDQFri) {
+    if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+        MI.getOperand(2).getImm() == 0) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+  if (MI.getOpcode() == SP::STri || MI.getOpcode() == SP::STXri ||
+      MI.getOpcode() == SP::STFri || MI.getOpcode() == SP::STDFri ||
+      MI.getOpcode() == SP::STQFri) {
+    if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+        MI.getOperand(1).getImm() == 0) {
+      FrameIndex = MI.getOperand(0).getIndex();
+      return MI.getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+static bool IsIntegerCC(unsigned CC)
+{
+  return  (CC <= SPCC::ICC_VC);
+}
+
+static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
+{
+  switch(CC) {
+  case SPCC::ICC_A:    return SPCC::ICC_N;
+  case SPCC::ICC_N:    return SPCC::ICC_A;
+  case SPCC::ICC_NE:   return SPCC::ICC_E;
+  case SPCC::ICC_E:    return SPCC::ICC_NE;
+  case SPCC::ICC_G:    return SPCC::ICC_LE;
+  case SPCC::ICC_LE:   return SPCC::ICC_G;
+  case SPCC::ICC_GE:   return SPCC::ICC_L;
+  case SPCC::ICC_L:    return SPCC::ICC_GE;
+  case SPCC::ICC_GU:   return SPCC::ICC_LEU;
+  case SPCC::ICC_LEU:  return SPCC::ICC_GU;
+  case SPCC::ICC_CC:   return SPCC::ICC_CS;
+  case SPCC::ICC_CS:   return SPCC::ICC_CC;
+  case SPCC::ICC_POS:  return SPCC::ICC_NEG;
+  case SPCC::ICC_NEG:  return SPCC::ICC_POS;
+  case SPCC::ICC_VC:   return SPCC::ICC_VS;
+  case SPCC::ICC_VS:   return SPCC::ICC_VC;
+
+  case SPCC::FCC_A:    return SPCC::FCC_N;
+  case SPCC::FCC_N:    return SPCC::FCC_A;
+  case SPCC::FCC_U:    return SPCC::FCC_O;
+  case SPCC::FCC_O:    return SPCC::FCC_U;
+  case SPCC::FCC_G:    return SPCC::FCC_ULE;
+  case SPCC::FCC_LE:   return SPCC::FCC_UG;
+  case SPCC::FCC_UG:   return SPCC::FCC_LE;
+  case SPCC::FCC_ULE:  return SPCC::FCC_G;
+  case SPCC::FCC_L:    return SPCC::FCC_UGE;
+  case SPCC::FCC_GE:   return SPCC::FCC_UL;
+  case SPCC::FCC_UL:   return SPCC::FCC_GE;
+  case SPCC::FCC_UGE:  return SPCC::FCC_L;
+  case SPCC::FCC_LG:   return SPCC::FCC_UE;
+  case SPCC::FCC_UE:   return SPCC::FCC_LG;
+  case SPCC::FCC_NE:   return SPCC::FCC_E;
+  case SPCC::FCC_E:    return SPCC::FCC_NE;
+  
+  case SPCC::CPCC_A:   return SPCC::CPCC_N;
+  case SPCC::CPCC_N:   return SPCC::CPCC_A;
+  case SPCC::CPCC_3:   LLVM_FALLTHROUGH;
+  case SPCC::CPCC_2:   LLVM_FALLTHROUGH;
+  case SPCC::CPCC_23:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_1:   LLVM_FALLTHROUGH;
+  case SPCC::CPCC_13:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_12:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_123: LLVM_FALLTHROUGH;
+  case SPCC::CPCC_0:   LLVM_FALLTHROUGH;
+  case SPCC::CPCC_03:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_02:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_023: LLVM_FALLTHROUGH;
+  case SPCC::CPCC_01:  LLVM_FALLTHROUGH;
+  case SPCC::CPCC_013: LLVM_FALLTHROUGH;
+  case SPCC::CPCC_012:
+      // "Opposite" code is not meaningful, as we don't know
+      // what the CoProc condition means here. The cond-code will
+      // only be used in inline assembler, so this code should
+      // not be reached in a normal compilation pass.
+      llvm_unreachable("Meaningless inversion of co-processor cond code");
+  }
+  llvm_unreachable("Invalid cond code");
+}
+
+static bool isUncondBranchOpcode(int Opc) { return Opc == SP::BA; }
+
+static bool isCondBranchOpcode(int Opc) {
+  return Opc == SP::FBCOND || Opc == SP::BCOND;
+}
+
+static bool isIndirectBranchOpcode(int Opc) {
+  return Opc == SP::BINDrr || Opc == SP::BINDri;
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(1).getImm()));
+  Target = LastInst->getOperand(0).getMBB();
+}
+
+bool SparcInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false;
+
+  if (!isUnpredicatedTerminator(*I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      parseCondBranch(LastInst, TBB, Cond);
+      return false;
+    }
+    return true; // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = &*I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = &*I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    parseCondBranch(SecondLastInst, TBB, Cond);
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // ...likewise if it ends with an indirect branch followed by an unconditional
+  // branch.
+  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned SparcInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                                      ArrayRef<MachineOperand> Cond,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Sparc branch conditions should have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SP::BA)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch
+  unsigned CC = Cond[0].getImm();
+
+  if (IsIntegerCC(CC))
+    BuildMI(&MBB, DL, get(SP::BCOND)).addMBB(TBB).addImm(CC);
+  else
+    BuildMI(&MBB, DL, get(SP::FBCOND)).addMBB(TBB).addImm(CC);
+  if (!FBB)
+    return 1;
+
+  BuildMI(&MBB, DL, get(SP::BA)).addMBB(FBB);
+  return 2;
+}
+
+unsigned SparcInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                      int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+  while (I != MBB.begin()) {
+    --I;
+
+    if (I->isDebugValue())
+      continue;
+
+    if (I->getOpcode() != SP::BA
+        && I->getOpcode() != SP::BCOND
+        && I->getOpcode() != SP::FBCOND)
+      break; // Not a branch
+
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+  return Count;
+}
+
+bool SparcInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1);
+  SPCC::CondCodes CC = static_cast<SPCC::CondCodes>(Cond[0].getImm());
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 const DebugLoc &DL, unsigned DestReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+  unsigned numSubRegs = 0;
+  unsigned movOpc     = 0;
+  const unsigned *subRegIdx = nullptr;
+  bool ExtraG0 = false;
+
+  const unsigned DW_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
+  const unsigned DFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
+  const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+  const unsigned QFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd,
+                                          SP::sub_odd64_then_sub_even,
+                                          SP::sub_odd64_then_sub_odd };
+
+  if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) {
+    subRegIdx  = DW_SubRegsIdx;
+    numSubRegs = 2;
+    movOpc     = SP::ORrr;
+    ExtraG0 = true;
+  } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      BuildMI(MBB, I, DL, get(SP::FMOVD), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      // Use two FMOVS instructions.
+      subRegIdx  = DFP_FP_SubRegsIdx;
+      numSubRegs = 2;
+      movOpc     = SP::FMOVS;
+    }
+  } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      if (Subtarget.hasHardQuad()) {
+        BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // Use two FMOVD instructions.
+        subRegIdx  = QFP_DFP_SubRegsIdx;
+        numSubRegs = 2;
+        movOpc     = SP::FMOVD;
+      }
+    } else {
+      // Use four FMOVS instructions.
+      subRegIdx  = QFP_FP_SubRegsIdx;
+      numSubRegs = 4;
+      movOpc     = SP::FMOVS;
+    }
+  } else if (SP::ASRRegsRegClass.contains(DestReg) &&
+             SP::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(SP::WRASRrr), DestReg)
+        .addReg(SP::G0)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  } else if (SP::IntRegsRegClass.contains(DestReg) &&
+             SP::ASRRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(SP::RDASR), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  } else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  if (numSubRegs == 0 || subRegIdx == nullptr || movOpc == 0)
+    return;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstr *MovMI = nullptr;
+
+  for (unsigned i = 0; i != numSubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+    unsigned Src = TRI->getSubReg(SrcReg,  subRegIdx[i]);
+    assert(Dst && Src && "Bad sub-register");
+
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst);
+    if (ExtraG0)
+      MIB.addReg(SP::G0);
+    MIB.addReg(Src);
+    MovMI = MIB.getInstr();
+  }
+  // Add implicit super-register defs and kills to the last MovMI.
+  MovMI->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    MovMI->addRegisterKilled(SrcReg, TRI);
+}
+
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::IntPairRegClass)
+    BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+    // lowered into two STDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::STQFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+
+  if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::IntPairRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+    // lowered into two LDDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::LDQFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
+{
+  SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
+  unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  const TargetRegisterClass *PtrRC =
+    Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+  GlobalBaseReg = RegInfo.createVirtualRegister(PtrRC);
+
+  DebugLoc dl;
+
+  BuildMI(FirstMBB, MBBI, dl, get(SP::GETPCX), GlobalBaseReg);
+  SparcFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+bool SparcInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case TargetOpcode::LOAD_STACK_GUARD: {
+    assert(Subtarget.isTargetLinux() &&
+           "Only Linux target is expected to contain LOAD_STACK_GUARD");
+    // offsetof(tcbhead_t, stack_guard) from sysdeps/sparc/nptl/tls.h in glibc.
+    const int64_t Offset = Subtarget.is64Bit() ? 0x28 : 0x14;
+    MI.setDesc(get(Subtarget.is64Bit() ? SP::LDXri : SP::LDri));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+        .addReg(SP::G7)
+        .addImm(Offset);
+    return true;
+  }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 000000000000..c053cc4c475b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,108 @@
+//===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
+
+#include "SparcRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SparcGenInstrInfo.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+  enum {
+    Pseudo = (1<<0),
+    Load = (1<<1),
+    Store = (1<<2),
+    DelaySlot = (1<<3)
+  };
+}
+
+class SparcInstrInfo : public SparcGenInstrInfo {
+  const SparcRegisterInfo RI;
+  const SparcSubtarget& Subtarget;
+  virtual void anchor();
+public:
+  explicit SparcInstrInfo(SparcSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+  // Lower pseudo instructions after register allocation.
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 000000000000..5a19c624abb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,1692 @@
+//===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// True when generating 32-bit code.
+def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
+
+// True when generating 64-bit code. This also implies HasV9.
+def Is64Bit : Predicate<"Subtarget->is64Bit()">;
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions.  Note that the machine may be running in 32-bit mode.
+def HasV9   : Predicate<"Subtarget->isV9()">,
+              AssemblerPredicate<"FeatureV9">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions.  Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget->isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget->isVIS()">,
+             AssemblerPredicate<"FeatureVIS">;
+def HasVIS2 : Predicate<"Subtarget->isVIS2()">,
+             AssemblerPredicate<"FeatureVIS2">;
+def HasVIS3 : Predicate<"Subtarget->isVIS3()">,
+             AssemblerPredicate<"FeatureVIS3">;
+
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">;
+
+// HasLeonCASA - This is true when the target processor supports the CASA
+// instruction
+def HasLeonCASA : Predicate<"Subtarget->hasLeonCasa()">;
+
+// HasUMAC_SMAC - This is true when the target processor supports the
+// UMAC and SMAC instructions
+def HasUMAC_SMAC : Predicate<"Subtarget->hasUmacSmac()">;
+
+def HasNoFdivSqrtFix : Predicate<"!Subtarget->fixAllFDIVSQRT()">;
+def HasNoFmulsFix : Predicate<"!Subtarget->replaceFMULS()">;
+def HasNoFsmuldFix : Predicate<"!Subtarget->fixFSMULD()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate.  In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget->useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11  : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>;
+
+def simm13  : PatLeaf<(imm), [{ return isInt<13>(N->getSExtValue()); }]>;
+
+def LO10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+  return isShiftedUInt<22, 10>(N->getZExtValue());
+}], HI22>;
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def SparcMEMrrAsmOperand : AsmOperandClass {
+  let Name = "MEMrr";
+  let ParserMethod = "parseMEMOperand";
+}
+
+def SparcMEMriAsmOperand : AsmOperandClass {
+  let Name = "MEMri";
+  let ParserMethod = "parseMEMOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+  let ParserMatchClass = SparcMEMrrAsmOperand;
+}
+def MEMri : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, i32imm);
+  let ParserMatchClass = SparcMEMriAsmOperand;
+}
+
+def TLSSym : Operand<iPTR>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+}
+
+def bprtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchPredTargetOpValue";
+}
+
+def bprtarget16 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchOnRegTargetOpValue";
+}
+
+def calltarget : Operand<i32> {
+  let EncoderMethod = "getCallTargetOpValue";
+  let DecoderMethod = "DecodeCall";
+}
+
+def simm13Op : Operand<i32> {
+  let DecoderMethod = "DecodeSIMM13";
+}
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i32>;
+
+def SDTSPcmpicc :
+SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPcmpfcc :
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc :
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
+def SDTSPeh_sjlj_setjmp : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+def SDTSPeh_sjlj_longjmp: SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+
+def SPhi    : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox  : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof  : SDNode<"SPISD::XTOF", SDTSPXTOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInGlue]>;
+
+def SPsjlj_setjmp: SDNode<"SPISD::EH_SJLJ_SETJMP",
+                          SDTSPeh_sjlj_setjmp,
+                          [SDNPHasChain, SDNPSideEffect]>;
+def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
+                           SDTSPeh_sjlj_longjmp,
+                           [SDNPHasChain, SDNPSideEffect]>;
+
+//  These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_SPCall    : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"SPISD::CALL", SDT_SPCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                            SDNPVariadic]>;
+
+def SDT_SPRet     : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
+                           [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
+def tlsadd        : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld         : SDNode<"SPISD::TLS_LD",  SDTSPtlsld>;
+def tlscall       : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
+def getPCX        : Operand<iPTR> {
+  let PrintMethod = "printGetPCX";
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE  : ICC_VAL< 9>;  // Not Equal
+def ICC_E   : ICC_VAL< 1>;  // Equal
+def ICC_G   : ICC_VAL<10>;  // Greater
+def ICC_LE  : ICC_VAL< 2>;  // Less or Equal
+def ICC_GE  : ICC_VAL<11>;  // Greater or Equal
+def ICC_L   : ICC_VAL< 3>;  // Less
+def ICC_GU  : ICC_VAL<12>;  // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>;  // Less or Equal Unsigned
+def ICC_CC  : ICC_VAL<13>;  // Carry Clear/Great or Equal Unsigned
+def ICC_CS  : ICC_VAL< 5>;  // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>;  // Positive
+def ICC_NEG : ICC_VAL< 6>;  // Negative
+def ICC_VC  : ICC_VAL<15>;  // Overflow Clear
+def ICC_VS  : ICC_VAL< 7>;  // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U   : FCC_VAL<23>;  // Unordered
+def FCC_G   : FCC_VAL<22>;  // Greater
+def FCC_UG  : FCC_VAL<21>;  // Unordered or Greater
+def FCC_L   : FCC_VAL<20>;  // Less
+def FCC_UL  : FCC_VAL<19>;  // Unordered or Less
+def FCC_LG  : FCC_VAL<18>;  // Less or Greater
+def FCC_NE  : FCC_VAL<17>;  // Not Equal
+def FCC_E   : FCC_VAL<25>;  // Equal
+def FCC_UE  : FCC_VAL<26>;  // Unordered or Equal
+def FCC_GE  : FCC_VAL<27>;  // Greater or Equal
+def FCC_UGE : FCC_VAL<28>;  // Unordered or Greater or Equal
+def FCC_LE  : FCC_VAL<29>;  // Less or Equal
+def FCC_ULE : FCC_VAL<30>;  // Unordered or Less or Equal
+def FCC_O   : FCC_VAL<31>;  // Ordered
+
+class CPCC_VAL<int N> : PatLeaf<(i32 N)>;
+def CPCC_3   : CPCC_VAL<39>;  // 3
+def CPCC_2   : CPCC_VAL<38>;  // 2
+def CPCC_23  : CPCC_VAL<37>;  // 2 or 3
+def CPCC_1   : CPCC_VAL<36>;  // 1
+def CPCC_13  : CPCC_VAL<35>;  // 1 or 3
+def CPCC_12  : CPCC_VAL<34>;  // 1 or 2
+def CPCC_123 : CPCC_VAL<33>;  // 1 or 2 or 3
+def CPCC_0   : CPCC_VAL<41>;  // 0
+def CPCC_03  : CPCC_VAL<42>;  // 0 or 3
+def CPCC_02  : CPCC_VAL<43>;  // 0 or 2
+def CPCC_023 : CPCC_VAL<44>;  // 0 or 2 or 3
+def CPCC_01  : CPCC_VAL<45>;  // 0 or 1
+def CPCC_013 : CPCC_VAL<46>;  // 0 or 1 or 3
+def CPCC_012 : CPCC_VAL<47>;  // 0 or 1 or 2
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode,
+                 RegisterClass RC, ValueType Ty, Operand immOp,
+                 InstrItinClass itin = IIC_iu_instr> {
+  def rr  : F3_1<2, Op3Val,
+                 (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"),
+                 [(set Ty:$rd, (OpNode Ty:$rs1, Ty:$rs2))],
+                 itin>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs RC:$rd), (ins RC:$rs1, immOp:$simm13),
+                 !strconcat(OpcStr, " $rs1, $simm13, $rd"),
+                 [(set Ty:$rd, (OpNode Ty:$rs1, (Ty simm13:$simm13)))],
+                 itin>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val, InstrItinClass itin = IIC_iu_instr> {
+  def rr  : F3_1<2, Op3Val,
+                 (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                 !strconcat(OpcStr, " $rs1, $rs2, $rd"), [],
+                 itin>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                 !strconcat(OpcStr, " $rs1, $simm13, $rd"), [],
+                 itin>;
+}
+
+// Load multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+           RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_iu_instr> {
+  def rr  : F3_1<3, Op3Val,
+                 (outs RC:$dst), (ins MEMrr:$addr),
+                 !strconcat(OpcStr, " [$addr], $dst"),
+                 [(set Ty:$dst, (OpNode ADDRrr:$addr))],
+                 itin>;
+  def ri  : F3_2<3, Op3Val,
+                 (outs RC:$dst), (ins MEMri:$addr),
+                 !strconcat(OpcStr, " [$addr], $dst"),
+                 [(set Ty:$dst, (OpNode ADDRri:$addr))],
+                 itin>;
+}
+
+// TODO: Instructions of the LoadASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+              RegisterClass RC, ValueType Ty, InstrItinClass itin = NoItinerary> :
+  F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi),
+                !strconcat(OpcStr, "a [$addr] $asi, $dst"),
+                []>;
+
+// LoadA multiclass - As above, but also define alternate address space variant
+multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val,
+                 SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+                 InstrItinClass itin = NoItinerary> :
+             Load<OpcStr, Op3Val, OpNode, RC, Ty, itin> {
+  def Arr  : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>;
+}
+
+// The LDSTUB instruction is supported for asm only.
+// It is unlikely that general-purpose code could make use of it.
+// CAS is preferred for sparc v9.
+def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr),
+                    "ldstub [$addr], $dst", []>;
+def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr),
+                    "ldstub [$addr], $dst", []>;
+def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst),
+                         (ins MEMrr:$addr, i8imm:$asi),
+                         "ldstuba [$addr] $asi, $dst", []>;
+
+// Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot.
+multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode,
+           RegisterClass RC, ValueType Ty, InstrItinClass itin = IIC_st> {
+  def rr  : F3_1<3, Op3Val,
+                 (outs), (ins MEMrr:$addr, RC:$rd),
+                 !strconcat(OpcStr, " $rd, [$addr]"),
+                 [(OpNode Ty:$rd, ADDRrr:$addr)],
+                 itin>;
+  def ri  : F3_2<3, Op3Val,
+                 (outs), (ins MEMri:$addr, RC:$rd),
+                 !strconcat(OpcStr, " $rd, [$addr]"),
+                 [(OpNode Ty:$rd, ADDRri:$addr)],
+                 itin>;
+}
+
+// TODO: Instructions of the StoreASI class are currently asm only; hooking up
+// CodeGen's address spaces to use these is a future task.
+class StoreASI<string OpcStr, bits<6> Op3Val,
+               SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+               InstrItinClass itin = IIC_st> :
+  F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi),
+           !strconcat(OpcStr, "a $rd, [$addr] $asi"),
+           [],
+           itin>;
+
+multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val,
+                  SDPatternOperator OpNode, RegisterClass RC, ValueType Ty,
+                  InstrItinClass itin = IIC_st> :
+             Store<OpcStr, Op3Val, OpNode, RC, Ty> {
+  def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty, itin>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// GETPCX for PIC
+let Defs = [O7] in {
+  def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                               "!ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "!ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let hasSideEffects = 1, mayStore = 1 in {
+  let rd = 0, rs1 = 0, rs2 = 0 in
+    def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
+                      "flushw",
+                      [(flushw)]>, Requires<[HasV9]>;
+  let rd = 0, rs1 = 1, simm13 = 3 in
+    def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
+                   "ta 3",
+                   [(flushw)]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.  This has to handle all
+// permutations of selection between i32/f32/f64 on ICC and FCC.
+// Expanded after instruction selection.
+let Uses = [ICC], usesCustomInserter = 1 in {
+  def SELECT_CC_Int_ICC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set i32:$dst, (SPselecticc i32:$T, i32:$F, imm:$Cond))]>;
+  def SELECT_CC_FP_ICC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_ICC PSEUDO!",
+            [(set f32:$dst, (SPselecticc f32:$T, f32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_DFP_ICC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_ICC PSEUDO!",
+            [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+  def SELECT_CC_QFP_ICC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_ICC PSEUDO!",
+            [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+let usesCustomInserter = 1, Uses = [FCC0] in {
+
+  def SELECT_CC_Int_FCC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_FCC PSEUDO!",
+            [(set i32:$dst, (SPselectfcc i32:$T, i32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_FP_FCC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_FCC PSEUDO!",
+            [(set f32:$dst, (SPselectfcc f32:$T, f32:$F, imm:$Cond))]>;
+  def SELECT_CC_DFP_FCC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_FCC PSEUDO!",
+            [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+  def SELECT_CC_QFP_FCC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_FCC PSEUDO!",
+            [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
+}
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [WIM] in
+  def EH_SJLJ_SETJMP32ri  : Pseudo<(outs IntRegs:$dst), (ins MEMri:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set i32:$dst, (SPsjlj_setjmp ADDRri:$buf))]>,
+                            Requires<[Is32Bit]>;
+  def EH_SJLJ_SETJMP32rr  : Pseudo<(outs IntRegs:$dst), (ins MEMrr:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set i32:$dst, (SPsjlj_setjmp ADDRrr:$buf))]>,
+                            Requires<[Is32Bit]>;
+  let isTerminator = 1 in
+  def EH_SJLJ_LONGJMP32ri : Pseudo<(outs), (ins MEMri:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(SPsjlj_longjmp ADDRri:$buf)]>,
+                            Requires<[Is32Bit]>;
+  def EH_SJLJ_LONGJMP32rr : Pseudo<(outs), (ins MEMrr:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(SPsjlj_longjmp ADDRrr:$buf)]>,
+                            Requires<[Is32Bit]>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+let DecoderMethod = "DecodeLoadInt" in {
+  defm LDSB : LoadA<"ldsb", 0b001001, 0b011001, sextloadi8,  IntRegs, i32>;
+  defm LDSH : LoadA<"ldsh", 0b001010, 0b011010, sextloadi16, IntRegs, i32>;
+  defm LDUB : LoadA<"ldub", 0b000001, 0b010001, zextloadi8,  IntRegs, i32>;
+  defm LDUH : LoadA<"lduh", 0b000010, 0b010010, zextloadi16, IntRegs, i32>;
+  defm LD   : LoadA<"ld",   0b000000, 0b010000, load,        IntRegs, i32>;
+}
+
+let DecoderMethod = "DecodeLoadIntPair" in
+  defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32, IIC_ldd>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+let DecoderMethod = "DecodeLoadFP" in {
+  defm LDF   : Load<"ld",  0b100000, load,    FPRegs,  f32, IIC_iu_or_fpu_instr>;
+  def LDFArr : LoadASI<"ld",  0b110000, load, FPRegs,  f32, IIC_iu_or_fpu_instr>,
+                Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeLoadDFP" in {
+  defm LDDF   : Load<"ldd", 0b100011, load,    DFPRegs, f64, IIC_ldd>;
+  def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>,
+                 Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeLoadQFP" in
+  defm LDQF  : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>,
+               Requires<[HasV9, HasHardQuad]>;
+
+let DecoderMethod = "DecodeLoadCP" in 
+  defm LDC   : Load<"ld", 0b110000, load, CoprocRegs, i32>; 
+let DecoderMethod = "DecodeLoadCPPair" in 
+  defm LDDC   : Load<"ldd", 0b110011, load, CoprocPair, v2i32, IIC_ldd>;
+
+let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in {
+  let rd = 0 in {
+    def LDCSRrr : F3_1<3, 0b110001, (outs), (ins MEMrr:$addr),
+                       "ld [$addr], %csr", []>;
+    def LDCSRri : F3_2<3, 0b110001, (outs), (ins MEMri:$addr),
+                       "ld [$addr], %csr", []>;
+  }
+}
+  
+let DecoderMethod = "DecodeLoadFP" in
+  let Defs = [FSR] in {
+    let rd = 0 in {
+      def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+                     "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
+      def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+                     "ld [$addr], %fsr", [], IIC_iu_or_fpu_instr>;
+    }
+    let rd = 1 in {
+      def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr),
+                     "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+      def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr),
+                     "ldx [$addr], %fsr", []>, Requires<[HasV9]>;
+    }
+  }
+
+// Section B.4 - Store Integer Instructions, p. 95
+let DecoderMethod = "DecodeStoreInt" in {
+  defm STB   : StoreA<"stb", 0b000101, 0b010101, truncstorei8,  IntRegs, i32>;
+  defm STH   : StoreA<"sth", 0b000110, 0b010110, truncstorei16, IntRegs, i32>;
+  defm ST    : StoreA<"st",  0b000100, 0b010100, store,         IntRegs, i32>;
+}
+
+let DecoderMethod = "DecodeStoreIntPair" in
+  defm STD   : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32, IIC_std>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+let DecoderMethod = "DecodeStoreFP" in {
+  defm STF   : Store<"st",  0b100100, store,         FPRegs,  f32>;
+  def STFArr : StoreASI<"st",  0b110100, store,      FPRegs,  f32>,
+               Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeStoreDFP" in {
+  defm STDF   : Store<"std", 0b100111, store,         DFPRegs, f64, IIC_std>;
+  def STDFArr : StoreASI<"std", 0b110111, store,      DFPRegs, f64>,
+                Requires<[HasV9]>;
+}
+let DecoderMethod = "DecodeStoreQFP" in
+  defm STQF  : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>,
+               Requires<[HasV9, HasHardQuad]>;
+
+let DecoderMethod = "DecodeStoreCP" in 
+  defm STC   : Store<"st", 0b110100, store, CoprocRegs, i32>; 
+  
+let DecoderMethod = "DecodeStoreCPPair" in 
+  defm STDC   : Store<"std", 0b110111, store, CoprocPair, v2i32, IIC_std>;
+  
+let DecoderMethod = "DecodeStoreCP", rd = 0 in {
+  let Defs = [CPSR] in {
+    def STCSRrr : F3_1<3, 0b110101, (outs MEMrr:$addr), (ins),
+                       "st %csr, [$addr]", [], IIC_st>;
+    def STCSRri : F3_2<3, 0b110101, (outs MEMri:$addr), (ins),
+                       "st %csr, [$addr]", [], IIC_st>;
+  }
+  let Defs = [CPQ] in {
+    def STDCQrr : F3_1<3, 0b110110, (outs MEMrr:$addr), (ins),
+                       "std %cq, [$addr]", [], IIC_std>;
+    def STDCQri : F3_2<3, 0b110110, (outs MEMri:$addr), (ins),
+                       "std %cq, [$addr]", [], IIC_std>;
+  }
+}
+
+let DecoderMethod = "DecodeStoreFP" in {
+  let rd = 0 in {
+    let Defs = [FSR] in {
+      def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+                     "st %fsr, [$addr]", [], IIC_st>;
+      def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+                     "st %fsr, [$addr]", [], IIC_st>;
+    }
+    let Defs = [FQ] in {
+      def STDFQrr : F3_1<3, 0b100110, (outs MEMrr:$addr), (ins),
+                     "std %fq, [$addr]", [], IIC_std>;
+      def STDFQri : F3_2<3, 0b100110, (outs MEMri:$addr), (ins),
+                     "std %fq, [$addr]", [], IIC_std>;
+    }
+  }
+  let rd = 1, Defs = [FSR] in {
+    def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins),
+                   "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+    def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins),
+                   "stx %fsr, [$addr]", []>, Requires<[HasV9]>;
+  }
+}
+
+// Section B.8 - SWAP Register with Memory Instruction
+// (Atomic swap)
+let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in {
+  def SWAPrr : F3_1<3, 0b001111,
+                 (outs IntRegs:$dst), (ins MEMrr:$addr, IntRegs:$val),
+                 "swap [$addr], $dst",
+                 [(set i32:$dst, (atomic_swap_32 ADDRrr:$addr, i32:$val))]>;
+  def SWAPri : F3_2<3, 0b001111,
+                 (outs IntRegs:$dst), (ins MEMri:$addr, IntRegs:$val),
+                 "swap [$addr], $dst",
+                 [(set i32:$dst, (atomic_swap_32 ADDRri:$addr, i32:$val))]>;
+  def SWAPArr : F3_1_asi<3, 0b011111,
+                 (outs IntRegs:$dst), (ins MEMrr:$addr, i8imm:$asi, IntRegs:$val),
+                 "swapa [$addr] $asi, $dst",
+                 [/*FIXME: pattern?*/]>;
+}
+
+
+// Section B.9 - SETHI Instruction, p. 104
+def SETHIi: F2_1<0b100,
+                 (outs IntRegs:$rd), (ins i32imm:$imm22),
+                 "sethi $imm22, $rd",
+                 [(set i32:$rd, SETHIimm:$imm22)],
+                 IIC_iu_instr>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+  def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND    : F3_12<"and", 0b000001, and, IntRegs, i32, simm13Op>;
+
+def ANDNrr  : F3_1<2, 0b000101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "andn $rs1, $rs2, $rd",
+                   [(set i32:$rd, (and i32:$rs1, (not i32:$rs2)))]>;
+def ANDNri  : F3_2<2, 0b000101,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "andn $rs1, $simm13, $rd", []>;
+
+defm OR     : F3_12<"or", 0b000010, or, IntRegs, i32, simm13Op>;
+
+def ORNrr   : F3_1<2, 0b000110,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "orn $rs1, $rs2, $rd",
+                   [(set i32:$rd, (or i32:$rs1, (not i32:$rs2)))]>;
+def ORNri   : F3_2<2, 0b000110,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "orn $rs1, $simm13, $rd", []>;
+defm XOR    : F3_12<"xor", 0b000011, xor, IntRegs, i32, simm13Op>;
+
+def XNORrr  : F3_1<2, 0b000111,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "xnor $rs1, $rs2, $rd",
+                   [(set i32:$rd, (not (xor i32:$rs1, i32:$rs2)))]>;
+def XNORri  : F3_2<2, 0b000111,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "xnor $rs1, $simm13, $rd", []>;
+
+let Defs = [ICC] in {
+  defm ANDCC  : F3_12np<"andcc",  0b010001>;
+  defm ANDNCC : F3_12np<"andncc", 0b010101>;
+  defm ORCC   : F3_12np<"orcc",   0b010010>;
+  defm ORNCC  : F3_12np<"orncc",  0b010110>;
+  defm XORCC  : F3_12np<"xorcc",  0b010011>;
+  defm XNORCC : F3_12np<"xnorcc", 0b010111>;
+}
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl, IntRegs, i32, simm13Op>;
+defm SRL : F3_12<"srl", 0b100110, srl, IntRegs, i32, simm13Op>;
+defm SRA : F3_12<"sra", 0b100111, sra, IntRegs, i32, simm13Op>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD   : F3_12<"add", 0b000000, add, IntRegs, i32, simm13Op>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+let Predicates = [Is32Bit], isCodeGenOnly = 1 in
+  def LEA_ADDri   : F3_2<2, 0b000000,
+                     (outs IntRegs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
+
+let Defs = [ICC] in
+  defm ADDCC  : F3_12<"addcc", 0b010000, addc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+  defm ADDC   : F3_12np<"addx", 0b001000>;
+
+let Uses = [ICC], Defs = [ICC] in
+  defm ADDE  : F3_12<"addxcc", 0b011000, adde, IntRegs, i32, simm13Op>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB    : F3_12  <"sub"  , 0b000100, sub, IntRegs, i32, simm13Op>;
+let Uses = [ICC], Defs = [ICC] in
+  defm SUBE   : F3_12  <"subxcc" , 0b011100, sube, IntRegs, i32, simm13Op>;
+
+let Defs = [ICC] in
+  defm SUBCC  : F3_12  <"subcc", 0b010100, subc, IntRegs, i32, simm13Op>;
+
+let Uses = [ICC] in
+  defm SUBC   : F3_12np <"subx", 0b001100>;
+
+// cmp (from Section A.3) is a specialized alias for subcc
+let Defs = [ICC], rd = 0 in {
+  def CMPrr   : F3_1<2, 0b010100,
+                     (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+                     "cmp $rs1, $rs2",
+                     [(SPcmpicc i32:$rs1, i32:$rs2)]>;
+  def CMPri   : F3_2<2, 0b010100,
+                     (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+                     "cmp $rs1, $simm13",
+                     [(SPcmpicc i32:$rs1, (i32 simm13:$simm13))]>;
+}
+
+// Section B.18 - Multiply Instructions, p. 113
+let Defs = [Y] in {
+  defm UMUL : F3_12<"umul", 0b001010, umullohi, IntRegs, i32, simm13Op, IIC_iu_umul>;
+  defm SMUL : F3_12<"smul", 0b001011, smullohi, IntRegs, i32, simm13Op, IIC_iu_smul>;
+}
+
+let Defs = [Y, ICC] in {
+  defm UMULCC : F3_12np<"umulcc", 0b011010, IIC_iu_umul>;
+  defm SMULCC : F3_12np<"smulcc", 0b011011, IIC_iu_smul>;
+}
+
+let Defs = [Y, ICC], Uses = [Y, ICC] in {
+  defm MULSCC : F3_12np<"mulscc", 0b100100>;
+}
+
+// Section B.19 - Divide Instructions, p. 115
+let Uses = [Y], Defs = [Y] in {
+  defm UDIV : F3_12np<"udiv", 0b001110, IIC_iu_div>;
+  defm SDIV : F3_12np<"sdiv", 0b001111, IIC_iu_div>;
+}
+
+let Uses = [Y], Defs = [Y, ICC] in {
+  defm UDIVCC : F3_12np<"udivcc", 0b011110, IIC_iu_div>;
+  defm SDIVCC : F3_12np<"sdivcc", 0b011111, IIC_iu_div>;
+}
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE    : F3_12np<"save"   , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+  : F2_2<0b010, 0, (outs), ins, asmstr, pattern> {
+  let isBranch     = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let isBarrier    = 1;
+}
+
+let cond = 8 in
+  def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// conditional branch class:
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 0, (outs), ins, asmstr, pattern, IIC_iu_instr>;
+
+// conditional branch with annul class:
+class BranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, 1, (outs), ins, asmstr, pattern, IIC_iu_instr>;
+
+// Conditional branch class on %icc|%xcc with predication:
+multiclass IPredBranch<string regstr, list<dag> CCPattern> {
+  def CC    : F2_3<0b001, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond ", !strconcat(regstr, ", $imm19")),
+                   CCPattern,
+                   IIC_iu_instr>;
+  def CCA   : F2_3<0b001, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond,a ", !strconcat(regstr, ", $imm19")),
+                   [],
+                   IIC_iu_instr>;
+  def CCNT  : F2_3<0b001, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond,pn ", !strconcat(regstr, ", $imm19")),
+                   [],
+                   IIC_iu_instr>;
+  def CCANT : F2_3<0b001, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond),
+                   !strconcat("b$cond,a,pn ", !strconcat(regstr, ", $imm19")),
+                   [],
+                   IIC_iu_instr>;
+}
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+
+// Indirect branch instructions.
+let isTerminator = 1, isBarrier = 1,  hasDelaySlot = 1, isBranch =1,
+     isIndirectBranch = 1, rd = 0, isCodeGenOnly = 1 in {
+  def BINDrr  : F3_1<2, 0b111000,
+                   (outs), (ins MEMrr:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRrr:$ptr)]>;
+  def BINDri  : F3_2<2, 0b111000,
+                   (outs), (ins MEMri:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRri:$ptr)]>;
+}
+
+let Uses = [ICC] in {
+  def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond $imm22",
+                        [(SPbricc bb:$imm22, imm:$cond)]>;
+  def BCONDA : BranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond,a $imm22", []>;
+
+  let Predicates = [HasV9], cc = 0b00 in
+    defm BPI : IPredBranch<"%icc", []>;
+}
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// floating-point conditional branch class:
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 0, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
+
+// floating-point conditional branch with annul class:
+class FPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, 1, (outs), ins, asmstr, pattern, IIC_fpu_normal_instr>;
+
+// Conditional branch class on %fcc0-%fcc3 with predication:
+multiclass FPredBranch {
+  def CC    : F2_3<0b101, 0, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond $cc, $imm19", [], IIC_fpu_normal_instr>;
+  def CCA   : F2_3<0b101, 1, 1, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,a $cc, $imm19", [], IIC_fpu_normal_instr>;
+  def CCNT  : F2_3<0b101, 0, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
+  def CCANT : F2_3<0b101, 1, 0, (outs), (ins bprtarget:$imm19, CCOp:$cond,
+                                         FCCRegs:$cc),
+                  "fb$cond,a,pn $cc, $imm19", [], IIC_fpu_normal_instr>;
+}
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+let Uses = [FCC0] in {
+  def FBCOND  : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                              "fb$cond $imm22",
+                              [(SPbrfcc bb:$imm22, imm:$cond)]>;
+  def FBCONDA : FPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+                             "fb$cond,a $imm22", []>;
+}
+
+let Predicates = [HasV9] in
+  defm BPF : FPredBranch;
+
+// Section B.22 - Branch on Co-processor Condition Codes Instructions, p. 123
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in {
+
+// co-processor conditional branch class:
+class CPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 0, (outs), ins, asmstr, pattern>;
+
+// co-processor conditional branch with annul class:
+class CPBranchSPA<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b111, 1, (outs), ins, asmstr, pattern>;
+
+} // let isBranch = 1, isTerminator = 1, hasDelaySlot = 1
+
+def CBCOND  : CPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                          "cb$cond $imm22",
+                          [(SPbrfcc bb:$imm22, imm:$cond)]>;
+def CBCONDA : CPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
+                           "cb$cond,a $imm22", []>;
+                           
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+let Uses = [O6],
+    hasDelaySlot = 1, isCall = 1 in {
+  def CALL : InstSP<(outs), (ins calltarget:$disp, variable_ops),
+                    "call $disp",
+                    [],
+                    IIC_jmp_or_call> {
+    bits<30> disp;
+    let op = 1;
+    let Inst{29-0} = disp;
+  }
+
+  // indirect calls: special cases of JMPL.
+  let isCodeGenOnly = 1, rd = 15 in {
+    def CALLrr : F3_1<2, 0b111000,
+                      (outs), (ins MEMrr:$ptr, variable_ops),
+                      "call $ptr",
+                      [(call ADDRrr:$ptr)],
+                      IIC_jmp_or_call>;
+    def CALLri : F3_2<2, 0b111000,
+                      (outs), (ins MEMri:$ptr, variable_ops),
+                      "call $ptr",
+                      [(call ADDRri:$ptr)],
+                      IIC_jmp_or_call>;
+  }
+}
+
+// Section B.25 - Jump and Link Instruction
+
+// JMPL Instruction.
+let isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+    DecoderMethod = "DecodeJMPL" in {
+  def JMPLrr: F3_1<2, 0b111000,
+                   (outs IntRegs:$dst), (ins MEMrr:$addr),
+                   "jmpl $addr, $dst",
+                   [],
+                   IIC_jmp_or_call>;
+  def JMPLri: F3_2<2, 0b111000,
+                   (outs IntRegs:$dst), (ins MEMri:$addr),
+                   "jmpl $addr, $dst",
+                   [],
+                   IIC_jmp_or_call>;
+}
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
+    isCodeGenOnly = 1 in {
+  let rd = 0, rs1 = 15 in
+    def RETL: F3_2<2, 0b111000,
+                   (outs), (ins i32imm:$val),
+                   "jmp %o7+$val",
+                   [(retflag simm13:$val)],
+                   IIC_jmp_or_call>;
+
+  let rd = 0, rs1 = 31 in
+    def RET: F3_2<2, 0b111000,
+                  (outs), (ins i32imm:$val),
+                  "jmp %i7+$val",
+                  [],
+                  IIC_jmp_or_call>;
+}
+
+// Section B.26 - Return from Trap Instruction
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1,
+     isBarrier = 1, rd = 0, DecoderMethod = "DecodeReturn" in {
+  def RETTrr : F3_1<2, 0b111001,
+                   (outs), (ins MEMrr:$addr),
+                   "rett $addr",
+                   [],
+                   IIC_jmp_or_call>;
+  def RETTri : F3_2<2, 0b111001,
+                    (outs), (ins MEMri:$addr),
+                    "rett $addr",
+                    [],
+                    IIC_jmp_or_call>;
+}
+
+
+// Section B.27 - Trap on Integer Condition Codes Instruction
+// conditional branch class:
+let DecoderNamespace = "SparcV8", DecoderMethod = "DecodeTRAP", hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+{
+  def TRAPrr : TRAPSPrr<0b111010,
+                        (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+                        "t$cond $rs1 + $rs2",
+                        []>;
+  def TRAPri : TRAPSPri<0b111010,
+                        (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+                        "t$cond $rs1 + $imm",
+                        []>;
+}
+
+multiclass TRAP<string regStr> {
+  def rr : TRAPSPrr<0b111010,
+                    (outs), (ins IntRegs:$rs1, IntRegs:$rs2, CCOp:$cond),
+                    !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $rs2"),
+                    []>;
+  def ri : TRAPSPri<0b111010,
+                    (outs), (ins IntRegs:$rs1, i32imm:$imm, CCOp:$cond),
+                    !strconcat(!strconcat("t$cond ", regStr), ", $rs1 + $imm"),
+                    []>;
+}
+
+let DecoderNamespace = "SparcV9", DecoderMethod = "DecodeTRAP", Predicates = [HasV9], hasSideEffects = 1, Uses = [ICC], cc = 0b00 in
+  defm TICC : TRAP<"%icc">;
+
+
+let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
+  def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
+
+// Section B.28 - Read State Register Instructions
+let rs2 = 0 in
+  def RDASR : F3_1<2, 0b101000,
+                 (outs IntRegs:$rd), (ins ASRRegs:$rs1),
+                 "rd $rs1, $rd", []>;
+
+// PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+let Predicates = [HasNoV9] in {
+  let rs2 = 0, rs1 = 0, Uses=[PSR] in
+    def RDPSR : F3_1<2, 0b101001,
+		     (outs IntRegs:$rd), (ins),
+		     "rd %psr, $rd", []>;
+
+  let rs2 = 0, rs1 = 0, Uses=[WIM] in
+    def RDWIM : F3_1<2, 0b101010,
+		     (outs IntRegs:$rd), (ins),
+		     "rd %wim, $rd", []>;
+
+  let rs2 = 0, rs1 = 0, Uses=[TBR] in
+    def RDTBR : F3_1<2, 0b101011,
+		     (outs IntRegs:$rd), (ins),
+		     "rd %tbr, $rd", []>;
+}
+
+// Section B.29 - Write State Register Instructions
+def WRASRrr : F3_1<2, 0b110000,
+                 (outs ASRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                 "wr $rs1, $rs2, $rd", []>;
+def WRASRri : F3_2<2, 0b110000,
+                 (outs ASRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                 "wr $rs1, $simm13, $rd", []>;
+
+// PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+let Predicates = [HasNoV9] in {
+  let Defs = [PSR], rd=0 in {
+    def WRPSRrr : F3_1<2, 0b110001,
+		     (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+		     "wr $rs1, $rs2, %psr", []>;
+    def WRPSRri : F3_2<2, 0b110001,
+		     (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+		     "wr $rs1, $simm13, %psr", []>;
+  }
+
+  let Defs = [WIM], rd=0 in {
+    def WRWIMrr : F3_1<2, 0b110010,
+		     (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+		     "wr $rs1, $rs2, %wim", []>;
+    def WRWIMri : F3_2<2, 0b110010,
+		     (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+		     "wr $rs1, $simm13, %wim", []>;
+  }
+
+  let Defs = [TBR], rd=0 in {
+    def WRTBRrr : F3_1<2, 0b110011,
+		     (outs), (ins IntRegs:$rs1, IntRegs:$rs2),
+		     "wr $rs1, $rs2, %tbr", []>;
+    def WRTBRri : F3_2<2, 0b110011,
+		     (outs), (ins IntRegs:$rs1, simm13Op:$simm13),
+		     "wr $rs1, $simm13, %tbr", []>;
+  }
+}
+
+// Section B.30 - STBAR Instruction
+let hasSideEffects = 1, rd = 0, rs1 = 0b01111, rs2 = 0 in
+  def STBAR : F3_1<2, 0b101000, (outs), (ins), "stbar", []>;
+
+
+// Section B.31 - Unimplmented Instruction
+let rd = 0 in
+  def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
+                  "unimp $imm22", []>;
+
+// Section B.32 - Flush Instruction Memory
+let rd = 0 in {
+  def FLUSHrr : F3_1<2, 0b111011, (outs), (ins MEMrr:$addr),
+                       "flush $addr", []>;
+  def FLUSHri : F3_2<2, 0b111011, (outs), (ins MEMri:$addr),
+                       "flush $addr", []>;
+
+  // The no-arg FLUSH is only here for the benefit of the InstAlias
+  // "flush", which cannot seem to use FLUSHrr, due to the inability
+  // to construct a MEMrr with fixed G0 registers.
+  let rs1 = 0, rs2 = 0 in
+    def FLUSH   : F3_1<2, 0b111011, (outs), (ins), "flush %g0", []>;
+}
+
+// Section B.33 - Floating-point Operate (FPop) Instructions
+
+// Convert Integer to Floating-point Instructions, p. 141
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitos $rs2, $rd",
+                 [(set FPRegs:$rd, (SPitof FPRegs:$rs2))],
+                 IIC_fpu_fast_instr>;
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitod $rs2, $rd",
+                 [(set DFPRegs:$rd, (SPitof FPRegs:$rs2))],
+                 IIC_fpu_fast_instr>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+                 (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fitoq $rs2, $rd",
+                 [(set QFPRegs:$rd, (SPitof FPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi FPRegs:$rs2))],
+                 IIC_fpu_fast_instr>;
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi DFPRegs:$rs2))],
+                 IIC_fpu_fast_instr>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+                 (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtoi $rs2, $rd",
+                 [(set FPRegs:$rd, (SPftoi QFPRegs:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
+                 (outs DFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstod $rs2, $rd",
+                 [(set f64:$rd, (fpextend f32:$rs2))],
+                 IIC_fpu_stod>;
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+                 (outs QFPRegs:$rd), (ins FPRegs:$rs2),
+                 "fstoq $rs2, $rd",
+                 [(set f128:$rd, (fpextend f32:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
+                 (outs FPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtos $rs2, $rd",
+                 [(set f32:$rd, (fpround f64:$rs2))],
+                 IIC_fpu_fast_instr>;
+def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+                 (outs QFPRegs:$rd), (ins DFPRegs:$rs2),
+                 "fdtoq $rs2, $rd",
+                 [(set f128:$rd, (fpextend f64:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+                 (outs FPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtos $rs2, $rd",
+                 [(set f32:$rd, (fpround f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+                 (outs DFPRegs:$rd), (ins QFPRegs:$rs2),
+                 "fqtod $rs2, $rd",
+                 [(set f64:$rd, (fpround f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fmovs $rs2, $rd", []>;
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fnegs $rs2, $rd",
+                 [(set f32:$rd, (fneg f32:$rs2))],
+                 IIC_fpu_negs>;
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                 "fabss $rs2, $rd",
+                 [(set f32:$rd, (fabs f32:$rs2))],
+                 IIC_fpu_abs>;
+
+
+// Floating-point Square Root Instructions, p.145
+// FSQRTS generates an erratum on LEON processors, so by disabling this instruction
+// this will be promoted to use FSQRTD with doubles instead.
+let Predicates = [HasNoFdivSqrtFix] in 
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs2),
+                  "fsqrts $rs2, $rd",
+                  [(set f32:$rd, (fsqrt f32:$rs2))],
+                  IIC_fpu_sqrts>;
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                  "fsqrtd $rs2, $rd",
+                  [(set f64:$rd, (fsqrt f64:$rs2))],
+                  IIC_fpu_sqrtd>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                  "fsqrtq $rs2, $rd",
+                  [(set f128:$rd, (fsqrt f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS  : F3_3<2, 0b110100, 0b001000001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fadds $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fadd f32:$rs1, f32:$rs2))],
+                  IIC_fpu_fast_instr>;
+def FADDD  : F3_3<2, 0b110100, 0b001000010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "faddd $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fadd f64:$rs1, f64:$rs2))],
+                  IIC_fpu_fast_instr>;
+def FADDQ  : F3_3<2, 0b110100, 0b001000011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "faddq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fadd f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+def FSUBS  : F3_3<2, 0b110100, 0b001000101,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fsubs $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fsub f32:$rs1, f32:$rs2))],
+                  IIC_fpu_fast_instr>;
+def FSUBD  : F3_3<2, 0b110100, 0b001000110,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fsubd $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fsub f64:$rs1, f64:$rs2))],
+                  IIC_fpu_fast_instr>;
+def FSUBQ  : F3_3<2, 0b110100, 0b001000111,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "fsubq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fsub f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+
+// Floating-point Multiply and Divide Instructions, p. 147
+// FMULS generates an erratum on LEON processors, so by disabling this instruction
+// this will be promoted to use FMULD with doubles instead.
+let Predicates = [HasNoFmulsFix] in 
+def FMULS  : F3_3<2, 0b110100, 0b001001001,
+                  (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fmuls $rs1, $rs2, $rd",
+                  [(set f32:$rd, (fmul f32:$rs1, f32:$rs2))],
+                  IIC_fpu_muls>;
+def FMULD  : F3_3<2, 0b110100, 0b001001010,
+                  (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fmuld $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fmul f64:$rs1, f64:$rs2))],
+                  IIC_fpu_muld>;
+def FMULQ  : F3_3<2, 0b110100, 0b001001011,
+                  (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                  "fmulq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fmul f128:$rs1, f128:$rs2))]>,
+                  Requires<[HasHardQuad]>;
+
+let Predicates = [HasNoFsmuldFix] in
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                  (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                  "fsmuld $rs1, $rs2, $rd",
+                  [(set f64:$rd, (fmul (fpextend f32:$rs1),
+                                        (fpextend f32:$rs2)))],
+                  IIC_fpu_muld>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+                  (outs QFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                  "fdmulq $rs1, $rs2, $rd",
+                  [(set f128:$rd, (fmul (fpextend f64:$rs1),
+                                         (fpextend f64:$rs2)))]>,
+                  Requires<[HasHardQuad]>;
+
+// FDIVS generates an erratum on LEON processors, so by disabling this instruction
+// this will be promoted to use FDIVD with doubles instead.
+def FDIVS  : F3_3<2, 0b110100, 0b001001101,
+                 (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                 "fdivs $rs1, $rs2, $rd",
+                 [(set f32:$rd, (fdiv f32:$rs1, f32:$rs2))],
+                 IIC_fpu_divs>;
+def FDIVD  : F3_3<2, 0b110100, 0b001001110,
+                 (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                 "fdivd $rs1, $rs2, $rd",
+                 [(set f64:$rd, (fdiv f64:$rs1, f64:$rs2))],
+                 IIC_fpu_divd>;
+def FDIVQ  : F3_3<2, 0b110100, 0b001001111,
+                 (outs QFPRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                 "fdivq $rs1, $rs2, $rd",
+                 [(set f128:$rd, (fdiv f128:$rs1, f128:$rs2))]>,
+                 Requires<[HasHardQuad]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of a FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
+let Defs = [FCC0], rd = 0, isCodeGenOnly = 1 in {
+  def FCMPS  : F3_3c<2, 0b110101, 0b001010001,
+                   (outs), (ins FPRegs:$rs1, FPRegs:$rs2),
+                   "fcmps $rs1, $rs2",
+                   [(SPcmpfcc f32:$rs1, f32:$rs2)],
+                   IIC_fpu_fast_instr>;
+  def FCMPD  : F3_3c<2, 0b110101, 0b001010010,
+                   (outs), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                   "fcmpd $rs1, $rs2",
+                   [(SPcmpfcc f64:$rs1, f64:$rs2)],
+                   IIC_fpu_fast_instr>;
+  def FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                   (outs), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                   "fcmpq $rs1, $rs2",
+                   [(SPcmpfcc f128:$rs1, f128:$rs2)]>,
+                   Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage(TLS).
+//===----------------------------------------------------------------------===//
+let isCodeGenOnly = 1, isAsmParserOnly = 1 in {
+def TLS_ADDrr : F3_1<2, 0b000000,
+                    (outs IntRegs:$rd),
+                    (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+                    "add $rs1, $rs2, $rd, $sym",
+                    [(set i32:$rd,
+                        (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+  def TLS_LDrr : F3_1<3, 0b000000,
+                      (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                      "ld [$addr], $dst, $sym",
+                      [(set i32:$dst,
+                          (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+  def TLS_CALL : InstSP<(outs),
+                        (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+                        "call $disp, $sym",
+                        [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)],
+                        IIC_jmp_or_call> {
+  bits<30> disp;
+  let op = 1;
+  let Inst{29-0} = disp;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], Constraints = "$f = $rd" in {
+  // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+  let Uses = [ICC], intcc = 1, cc = 0b00 in {
+    def MOVICCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $rs2, $rd",
+             [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
+    def MOVICCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
+  }
+
+  let Uses = [FCC0], intcc = 0, cc = 0b00 in {
+    def MOVFCCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $rs2, $rd",
+             [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
+    def MOVFCCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
+  }
+
+  let Uses = [ICC], intcc = 1, opf_cc = 0b00 in {
+    def FMOVS_ICC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %icc, $rs2, $rd",
+             [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
+    def FMOVD_ICC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+               (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_ICC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+               (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+               "fmovq$cond %icc, $rs2, $rd",
+               [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>,
+               Requires<[HasHardQuad]>;
+  }
+
+  let Uses = [FCC0], intcc = 0, opf_cc = 0b00 in {
+    def FMOVS_FCC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %fcc0, $rs2, $rd",
+             [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
+    def FMOVD_FCC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovq$cond %fcc0, $rs2, $rd",
+             [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>,
+             Requires<[HasHardQuad]>;
+  }
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+  def FMOVD : F3_3u<2, 0b110100, 0b000000010,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fmovd $rs2, $rd", []>;
+  def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fmovq $rs2, $rd", []>,
+                   Requires<[HasHardQuad]>;
+  def FNEGD : F3_3u<2, 0b110100, 0b000000110,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fnegd $rs2, $rd",
+                   [(set f64:$rd, (fneg f64:$rs2))]>;
+  def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fnegq $rs2, $rd",
+                   [(set f128:$rd, (fneg f128:$rs2))]>,
+                   Requires<[HasHardQuad]>;
+  def FABSD : F3_3u<2, 0b110100, 0b000001010,
+                   (outs DFPRegs:$rd), (ins DFPRegs:$rs2),
+                   "fabsd $rs2, $rd",
+                   [(set f64:$rd, (fabs f64:$rs2))]>;
+  def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+                   (outs QFPRegs:$rd), (ins QFPRegs:$rs2),
+                   "fabsq $rs2, $rd",
+                   [(set f128:$rd, (fabs f128:$rs2))]>,
+                   Requires<[HasHardQuad]>;
+}
+
+// Floating-point compare instruction with %fcc0-%fcc3.
+def V9FCMPS  : F3_3c<2, 0b110101, 0b001010001,
+               (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+               "fcmps $rd, $rs1, $rs2", []>;
+def V9FCMPD  : F3_3c<2, 0b110101, 0b001010010,
+                (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                "fcmpd $rd, $rs1, $rs2", []>;
+def V9FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                "fcmpq $rd, $rs1, $rs2", []>,
+                 Requires<[HasHardQuad]>;
+
+let hasSideEffects = 1 in {
+  def V9FCMPES  : F3_3c<2, 0b110101, 0b001010101,
+                   (outs FCCRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2),
+                   "fcmpes $rd, $rs1, $rs2", []>;
+  def V9FCMPED  : F3_3c<2, 0b110101, 0b001010110,
+                   (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                   "fcmped $rd, $rs1, $rs2", []>;
+  def V9FCMPEQ  : F3_3c<2, 0b110101, 0b001010111,
+                   (outs FCCRegs:$rd), (ins QFPRegs:$rs1, QFPRegs:$rs2),
+                   "fcmpeq $rd, $rs1, $rs2", []>,
+                   Requires<[HasHardQuad]>;
+}
+
+// Floating point conditional move instrucitons with %fcc0-%fcc3.
+let Predicates = [HasV9] in {
+  let Constraints = "$f = $rd", intcc = 0 in {
+    def V9MOVFCCrr
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins FCCRegs:$cc, IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond $cc, $rs2, $rd", []>;
+    def V9MOVFCCri
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins FCCRegs:$cc, i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond $cc, $simm11, $rd", []>;
+    def V9FMOVS_FCC
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FCCRegs:$opf_cc, FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond $opf_cc, $rs2, $rd", []>;
+    def V9FMOVD_FCC
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins FCCRegs:$opf_cc, DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond $opf_cc, $rs2, $rd", []>;
+    def V9FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins FCCRegs:$opf_cc, QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovq$cond $opf_cc, $rs2, $rd", []>,
+             Requires<[HasHardQuad]>;
+  } // Constraints = "$f = $rd", ...
+} // let Predicates = [hasV9]
+
+
+// POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
+// the top 32-bits before using it.  To do this clearing, we use a SRLri X,0.
+let rs1 = 0 in
+  def POPCrr : F3_1<2, 0b101110,
+                    (outs IntRegs:$rd), (ins IntRegs:$rs2),
+                    "popc $rs2, $rd", []>, Requires<[HasV9]>;
+def : Pat<(ctpop i32:$src),
+          (POPCrr (SRLri $src, 0))>;
+
+let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
+ def MEMBARi : F3_2<2, 0b101000, (outs), (ins simm13Op:$simm13),
+                    "membar $simm13", []>;
+
+// The CAS instruction, unlike other instructions, only comes in a 
+// form which requires an ASI be provided. The ASI value hardcoded 
+// here is ASI_PRIMARY, the default unprivileged ASI for SparcV9.
+let Predicates = [HasV9], Constraints = "$swap = $rd", asi = 0b10000000 in
+  def CASrr: F3_1_asi<3, 0b111100,
+                (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+                                     IntRegs:$swap),
+                 "cas [$rs1], $rs2, $rd",
+                 [(set i32:$rd,
+                     (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+
+
+// CASA is supported as an instruction on some LEON3 and all LEON4 processors.
+// This version can be automatically lowered from C code, selecting ASI 10
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd", asi = 0b00001010 in
+  def CASAasi10: F3_1_asi<3, 0b111100,
+                (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+                                     IntRegs:$swap),
+                 "casa [$rs1] 10, $rs2, $rd",
+                 [(set i32:$rd,
+                     (atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
+                 
+// CASA supported on some LEON3 and all LEON4 processors. Same pattern as
+// CASrr, above, but with a different ASI. This version is supported for
+// inline assembly lowering only. 
+let Predicates = [HasLeonCASA], Constraints = "$swap = $rd" in
+  def CASArr: F3_1_asi<3, 0b111100,
+                (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
+                                     IntRegs:$swap, i8imm:$asi),
+                 "casa [$rs1] $asi, $rs2, $rd", []>;
+                
+// TODO: Add DAG sequence to lower these instructions. Currently, only provided
+// as inline assembler-supported instructions. 
+let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
+  def SMACrr :  F3_1<2, 0b111111,
+                   (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+                   "smac $rs1, $rs2, $rd",
+                   [], IIC_smac_umac>;
+
+  def SMACri :  F3_2<2, 0b111111,
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+                   "smac $rs1, $simm13, $rd",
+                   [], IIC_smac_umac>;
+                 
+  def UMACrr :  F3_1<2, 0b111110,
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
+                   "umac $rs1, $rs2, $rd",
+                   [], IIC_smac_umac>;
+                 
+  def UMACri :  F3_2<2, 0b111110,
+                  (outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
+                   "umac $rs1, $simm13, $rd",
+                   [], IIC_smac_umac>;
+}
+
+let Defs = [ICC] in {
+defm TADDCC   : F3_12np<"taddcc",   0b100000>;
+defm TSUBCC   : F3_12np<"tsubcc",   0b100001>;
+
+let hasSideEffects = 1 in {
+  defm TADDCCTV : F3_12np<"taddcctv", 0b100010>;
+  defm TSUBCCTV : F3_12np<"tsubcctv", 0b100011>;
+}
+}
+
+
+// Section A.43 - Read Privileged Register Instructions
+let Predicates = [HasV9] in {
+let rs2 = 0 in
+  def RDPR : F3_1<2, 0b101010,
+                 (outs IntRegs:$rd), (ins PRRegs:$rs1),
+                 "rdpr $rs1, $rd", []>;
+}
+
+// Section A.62 - Write Privileged Register Instructions
+let Predicates = [HasV9] in {
+  def WRPRrr : F3_1<2, 0b110010,
+                   (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2),
+                   "wrpr $rs1, $rs2, $rd", []>;
+  def WRPRri : F3_2<2, 0b110010,
+                   (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13),
+                   "wrpr $rs1, $simm13, $rd", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+          (ORri (i32 G0), imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+          (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+
+
+// Global addresses, constant pool entries
+let Predicates = [Is32Bit] in {
+
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)),  (ADDri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+                        (ADDri $r, tblockaddress:$in)>;
+}
+
+// Calls:
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
+def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+
+// store bar for all atomic_fence in V8.
+let Predicates = [HasNoV9] in
+  def : Pat<(atomic_fence imm, imm), (STBAR)>;
+
+// atomic_load addr -> load addr
+def : Pat<(i32 (atomic_load_8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRrr:$src)), (LDrr ADDRrr:$src)>;
+def : Pat<(i32 (atomic_load_32 ADDRri:$src)), (LDri ADDRri:$src)>;
+
+// atomic_store val, addr -> store val, addr
+def : Pat<(atomic_store_8 ADDRrr:$dst, i32:$val), (STBrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_8 ADDRri:$dst, i32:$val), (STBri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRrr:$dst, i32:$val), (STHrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_16 ADDRri:$dst, i32:$val), (STHri ADDRri:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>;
+def : Pat<(atomic_store_32 ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>;
+
+// extract_vector
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 0),
+          (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>;
+def : Pat<(extractelt (v2i32 IntPair:$Rn), 1),
+          (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>;
+
+// build_vector
+def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)),
+          (INSERT_SUBREG
+	    (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even),
+            (i32 IntRegs:$a2), sub_odd)>;
+
+
+include "SparcInstr64Bit.td"
+include "SparcInstrVIS.td"
+include "SparcInstrAliases.td"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
new file mode 100644
index 000000000000..d9adf3e8b0f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrVIS.td
@@ -0,0 +1,263 @@
+//===---- SparcInstrVIS.td - Visual Instruction Set extensions (VIS) -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction formats, definitions and patterns needed for
+// VIS, VIS II, VIS II instructions on SPARC.
+//===----------------------------------------------------------------------===//
+
+// VIS Instruction Format.
+class VISInstFormat<bits<9> opfval, dag outs, dag ins, string asmstr,
+      list<dag> pattern>
+      : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, pattern>;
+
+class VISInst<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs1, RC:$rs2),
+        !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// VIS Instruction with integer destination register.
+class VISInstID<bits<9> opfval, string OpcStr>
+       : VISInstFormat<opfval,
+        (outs I64Regs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+        !strconcat(OpcStr, " $rs1, $rs2, $rd"), []>;
+
+// For VIS Instructions with no operand.
+let rd = 0, rs1 = 0, rs2 = 0 in
+class VISInst0<bits<9> opfval, string asmstr>
+       : VISInstFormat<opfval, (outs), (ins), asmstr, []>;
+
+// For VIS Instructions with only rs1, rd operands.
+let rs2 = 0 in
+class VISInst1<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs1),
+        !strconcat(OpcStr, " $rs1, $rd"), []>;
+
+// For VIS Instructions with only rs2, rd operands.
+let rs1 = 0 in
+class VISInst2<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$rs2),
+        !strconcat(OpcStr, " $rs2, $rd"), []>;
+
+// For VIS Instructions with only rd operand.
+let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in
+class VISInstD<bits<9> opfval, string OpcStr, RegisterClass RC = DFPRegs>
+       : VISInstFormat<opfval,
+        (outs RC:$rd), (ins RC:$f),
+        !strconcat(OpcStr, " $rd"), []>;
+
+// VIS 1 Instructions
+let Predicates = [HasVIS] in {
+
+def FPADD16     : VISInst<0b001010000, "fpadd16">;
+def FPADD16S    : VISInst<0b001010001, "fpadd16s">;
+def FPADD32     : VISInst<0b001010010, "fpadd32">;
+def FPADD32S    : VISInst<0b001010011, "fpadd32s">;
+def FPSUB16     : VISInst<0b001010100, "fpsub16">;
+def FPSUB16S    : VISInst<0b001010101, "fpsub16S">;
+def FPSUB32     : VISInst<0b001010110, "fpsub32">;
+def FPSUB32S    : VISInst<0b001010111, "fpsub32S">;
+
+def FPACK16     : VISInst2<0b000111011, "fpack16">;
+def FPACK32     : VISInst <0b000111010, "fpack32">;
+def FPACKFIX    : VISInst2<0b000111101, "fpackfix">;
+def FEXPAND     : VISInst2<0b001001101, "fexpand">;
+def FPMERGE     : VISInst <0b001001011, "fpmerge">;
+
+def FMUL8X16    : VISInst<0b000110001, "fmul8x16">;
+def FMUL8X16AU  : VISInst<0b000110011, "fmul8x16au">;
+def FMUL8X16AL  : VISInst<0b000110101, "fmul8x16al">;
+def FMUL8SUX16  : VISInst<0b000110110, "fmul8sux16">;
+def FMUL8ULX16  : VISInst<0b000110111, "fmul8ulx16">;
+def FMULD8SUX16 : VISInst<0b000111000, "fmuld8sux16">;
+def FMULD8ULX16 : VISInst<0b000111001, "fmuld8ulx16">;
+
+def ALIGNADDR   : VISInst<0b000011000, "alignaddr", I64Regs>;
+def ALIGNADDRL  : VISInst<0b000011010, "alignaddrl", I64Regs>;
+def FALIGNADATA : VISInst<0b001001000, "faligndata">;
+
+def FZERO       : VISInstD<0b001100000, "fzero">;
+def FZEROS      : VISInstD<0b001100001, "fzeros", FPRegs>;
+def FONE        : VISInstD<0b001111110, "fone">;
+def FONES       : VISInstD<0b001111111, "fones", FPRegs>;
+def FSRC1       : VISInst1<0b001110100, "fsrc1">;
+def FSRC1S      : VISInst1<0b001110101, "fsrc1s", FPRegs>;
+def FSRC2       : VISInst2<0b001111000, "fsrc2">;
+def FSRC2S      : VISInst2<0b001111001, "fsrc2s", FPRegs>;
+def FNOT1       : VISInst1<0b001101010, "fnot1">;
+def FNOT1S      : VISInst1<0b001101011, "fnot1s", FPRegs>;
+def FNOT2       : VISInst2<0b001100110, "fnot2">;
+def FNOT2S      : VISInst2<0b001100111, "fnot2s", FPRegs>;
+def FOR         : VISInst<0b001111100,  "for">;
+def FORS        : VISInst<0b001111101,  "fors",  FPRegs>;
+def FNOR        : VISInst<0b001100010,  "fnor">;
+def FNORS       : VISInst<0b001100011,  "fnors", FPRegs>;
+def FAND        : VISInst<0b001110000,  "fand">;
+def FANDS       : VISInst<0b001110001,  "fands", FPRegs>;
+def FNAND       : VISInst<0b001101110,  "fnand">;
+def FNANDS      : VISInst<0b001101111,  "fnands", FPRegs>;
+def FXOR        : VISInst<0b001101100,  "fxor">;
+def FXORS       : VISInst<0b001101101,  "fxors", FPRegs>;
+def FXNOR       : VISInst<0b001110010,  "fxnor">;
+def FXNORS      : VISInst<0b001110011,  "fxnors", FPRegs>;
+
+def FORNOT1     : VISInst<0b001111010,  "fornot1">;
+def FORNOT1S    : VISInst<0b001111011,  "fornot1s",  FPRegs>;
+def FORNOT2     : VISInst<0b001110110,  "fornot2">;
+def FORNOT2S    : VISInst<0b001110111,  "fornot2s",  FPRegs>;
+def FANDNOT1    : VISInst<0b001101000,  "fandnot1">;
+def FANDNOT1S   : VISInst<0b001101001,  "fandnot1s", FPRegs>;
+def FANDNOT2    : VISInst<0b001100100,  "fandnot2">;
+def FANDNOT2S   : VISInst<0b001100101,  "fandnot2s", FPRegs>;
+
+def FCMPGT16    : VISInstID<0b000101000,  "fcmpgt16">;
+def FCMPGT32    : VISInstID<0b000101100,  "fcmpgt32">;
+def FCMPLE16    : VISInstID<0b000100000,  "fcmple16">;
+def FCMPLE32    : VISInstID<0b000100100,  "fcmple32">;
+def FCMPNE16    : VISInstID<0b000100010,  "fcmpne16">;
+def FCMPNE32    : VISInstID<0b000100110,  "fcmpne32">;
+def FCMPEQ16    : VISInstID<0b000101010,  "fcmpeq16">;
+def FCMPEQ32    : VISInstID<0b000101110,  "fcmpeq32">;
+
+
+def EDGE8       : VISInst<0b000000000,  "edge8",   I64Regs>;
+def EDGE8L      : VISInst<0b000000010,  "edge8l",  I64Regs>;
+def EDGE16      : VISInst<0b000000100,  "edge16",  I64Regs>;
+def EDGE16L     : VISInst<0b000000110,  "edge16l", I64Regs>;
+def EDGE32      : VISInst<0b000001000,  "edge32",  I64Regs>;
+def EDGE32L     : VISInst<0b000001010,  "edge32l", I64Regs>;
+
+def PDIST       : VISInst<0b000111110, "pdist">;
+
+def ARRAY8      : VISInst<0b000010000, "array8",  I64Regs>;
+def ARRAY16     : VISInst<0b000010010, "array16", I64Regs>;
+def ARRAY32     : VISInst<0b000010100, "array32", I64Regs>;
+
+def SHUTDOWN    : VISInst0<0b010000000, "shutdown">;
+
+} // Predicates = [HasVIS]
+
+
+// VIS 2 Instructions.
+let Predicates = [HasVIS2] in {
+
+def BMASK     : VISInst<0b000011001, "bmask", I64Regs>;
+def BSHUFFLE  : VISInst<0b000011100, "bshuffle">;
+
+def SIAM      : VISInst0<0b010000001, "siam">;
+
+def EDGE8N    : VISInst<0b000000001,  "edge8n",   I64Regs>;
+def EDGE8LN   : VISInst<0b000000011,  "edge8ln",  I64Regs>;
+def EDGE16N   : VISInst<0b000000101,  "edge16n",  I64Regs>;
+def EDGE16LN  : VISInst<0b000000111,  "edge16ln", I64Regs>;
+def EDGE32N   : VISInst<0b000001001,  "edge32n",  I64Regs>;
+def EDGE32LN  : VISInst<0b000001011,  "edge32ln", I64Regs>;
+} // Predicates = [HasVIS2]
+
+
+// VIS 3 Instructions.
+let Predicates = [HasVIS3] in {
+
+let Uses = [ICC] in
+def ADDXC : VISInst<0b000010001, "addxc", I64Regs>;
+
+let Defs = [ICC], Uses = [ICC] in
+def ADDXCCC : VISInst<0b000010011, "addxccc", I64Regs>;
+
+let rd = 0, rs1 = 0 in {
+def CMASK8  : VISInstFormat<0b000011011, (outs), (ins I64Regs:$rs2),
+              "cmask8 $rs2", []>;
+def CMASK16  : VISInstFormat<0b000011101, (outs), (ins I64Regs:$rs2),
+              "cmask16 $rs2", []>;
+def CMASK32  : VISInstFormat<0b000011111, (outs), (ins I64Regs:$rs2),
+              "cmask32 $rs2", []>;
+
+}
+
+def FCHKSM16 : VISInst<0b001000100, "fchksm16">;
+
+def FHADDS   : F3_3<0b10, 0b110100, 0b001100001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhadds $rs1, $rs2, $rd", []>;
+def FHADDD   : F3_3<0b10, 0b110100, 0b001100010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhaddd $rs1, $rs2, $rd", []>;
+def FHSUBS   : F3_3<0b10, 0b110100, 0b001100101,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhsubs $rs1, $rs2, $rd", []>;
+def FHSUBD   : F3_3<0b10, 0b110100, 0b001100110,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fhsubd $rs1, $rs2, $rd", []>;
+def FLCMPS   : VISInstFormat<0b101010001, (outs FCCRegs:$rd),
+                     (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                     "flcmps $rd, $rs1, $rs2", []>;
+def FLCMPD   : VISInstFormat<0b101010010, (outs FCCRegs:$rd),
+                     (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                     "flcmpd $rd, $rs1, $rs2", []>;
+
+def FMEAN16  : VISInst<0b001000000, "fmean16">;
+
+def FNADDS   : F3_3<0b10, 0b110100, 0b001010001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnadds $rs1, $rs2, $rd", []>;
+def FNADDD   : F3_3<0b10, 0b110100, 0b001010010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnaddd $rs1, $rs2, $rd", []>;
+def FNHADDS  : F3_3<0b10, 0b110100, 0b001110001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+def FNHADDD  : F3_3<0b10, 0b110100, 0b001110010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhaddd $rs1, $rs2, $rd", []>;
+
+def FNMULS   : F3_3<0b10, 0b110100, 0b001011001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+def FNMULD   : F3_3<0b10, 0b110100, 0b001011010,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhaddd $rs1, $rs2, $rd", []>;
+def FNSMULD  : F3_3<0b10, 0b110100, 0b001111001,
+                    (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
+                    "fnhadds $rs1, $rs2, $rd", []>;
+
+def FPADD64   : VISInst<0b001000010, "fpadd64">;
+
+def FSLL16    : VISInst<0b000100001, "fsll16">;
+def FSRL16    : VISInst<0b000100011, "fsrl16">;
+def FSLL32    : VISInst<0b000100101, "fsll32">;
+def FSRL32    : VISInst<0b000100111, "fsrl32">;
+def FSLAS16   : VISInst<0b000101001, "fslas16">;
+def FSRA16    : VISInst<0b000101011, "fsra16">;
+def FSLAS32   : VISInst<0b000101101, "fslas32">;
+def FSRA32    : VISInst<0b000101111, "fsra32">;
+
+let rs1 = 0 in
+def LZCNT     : VISInstFormat<0b000010111, (outs I64Regs:$rd),
+                   (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>;
+
+let rs1 = 0 in {
+def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>;
+def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>;
+def MOVDTOX  : VISInstFormat<0b100010000, (outs I64Regs:$rd),
+                   (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVWTOS  :  VISInstFormat<0b100011001, (outs DFPRegs:$rd),
+                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
+def MOVXTOD  :  VISInstFormat<0b100011000, (outs DFPRegs:$rd),
+                   (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>;
+}
+
+def PDISTN   : VISInst<0b000111111, "pdistn">;
+
+def UMULXHI  : VISInst<0b000010110, "umulxhi", I64Regs>;
+def XMULX    : VISInst<0b100010101, "xmulx",   I64Regs>;
+def XMULXHI  : VISInst<0b100010111, "xmulxhi", I64Regs>;
+} // Predicates = [IsVIS3]
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
new file mode 100644
index 000000000000..a3cedcbf9dd1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -0,0 +1,108 @@
+//===-- SparcMCInstLower.cpp - Convert Sparc MachineInstr to MCInst -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Sparc MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+
+static MCOperand LowerSymbolOperand(const MachineInstr *MI,
+                                    const MachineOperand &MO,
+                                    AsmPrinter &AP) {
+
+  SparcMCExpr::VariantKind Kind =
+    (SparcMCExpr::VariantKind)MO.getTargetFlags();
+  const MCSymbol *Symbol = nullptr;
+
+  switch(MO.getType()) {
+  default: llvm_unreachable("Unknown type in LowerSymbolOperand");
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AP.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AP.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AP.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AP.GetCPISymbol(MO.getIndex());
+    break;
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol,
+                                                         AP.OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym,
+                                                AP.OutContext);
+  return MCOperand::createExpr(expr);
+}
+
+static MCOperand LowerOperand(const MachineInstr *MI,
+                              const MachineOperand &MO,
+                              AsmPrinter &AP) {
+  switch(MO.getType()) {
+  default: llvm_unreachable("unknown operand type"); break;
+  case MachineOperand::MO_Register:
+    if (MO.isImplicit())
+      break;
+    return MCOperand::createReg(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm());
+
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_ConstantPoolIndex:
+    return LowerSymbolOperand(MI, MO, AP);
+
+  case MachineOperand::MO_RegisterMask:   break;
+
+  }
+  return MCOperand();
+}
+
+void llvm::LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
+                                          MCInst &OutMI,
+                                          AsmPrinter &AP)
+{
+
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MI, MO, AP);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e7442826e78b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void SparcMachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 000000000000..104744279d9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,56 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares  Sparc specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  class SparcMachineFunctionInfo : public MachineFunctionInfo {
+    virtual void anchor();
+  private:
+    unsigned GlobalBaseReg;
+
+    /// VarArgsFrameOffset - Frame offset to start of varargs area.
+    int VarArgsFrameOffset;
+
+    /// SRetReturnReg - Holds the virtual register into which the sret
+    /// argument is passed.
+    unsigned SRetReturnReg;
+
+    /// IsLeafProc - True if the function is a leaf procedure.
+    bool IsLeafProc;
+  public:
+    SparcMachineFunctionInfo()
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
+    explicit SparcMachineFunctionInfo(MachineFunction &MF)
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
+
+    unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+    void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+    void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
+
+    unsigned getSRetReturnReg() const { return SRetReturnReg; }
+    void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+    void setLeafProc(bool rhs) { IsLeafProc = rhs; }
+    bool isLeafProc() const { return IsLeafProc; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 000000000000..37a1fdf4d770
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,237 @@
+//===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcRegisterInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SparcGenRegisterInfo.inc"
+
+static cl::opt<bool>
+ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
+                    cl::desc("Reserve application registers (%g2-%g4)"));
+
+SparcRegisterInfo::SparcRegisterInfo() : SparcGenRegisterInfo(SP::O7) {}
+
+const MCPhysReg*
+SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_SaveList;
+}
+
+const uint32_t *
+SparcRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                        CallingConv::ID CC) const {
+  return CSR_RegMask;
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+  return RTCSR_RegMask;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
+
+  // G1-G4 can be used in applications.
+  if (ReserveAppRegisters) {
+    Reserved.set(SP::G2);
+    Reserved.set(SP::G3);
+    Reserved.set(SP::G4);
+  }
+  // G5 is not reserved in 64 bit mode.
+  if (!Subtarget.is64Bit())
+    Reserved.set(SP::G5);
+
+  Reserved.set(SP::O6);
+  Reserved.set(SP::I6);
+  Reserved.set(SP::I7);
+  Reserved.set(SP::G0);
+  Reserved.set(SP::G6);
+  Reserved.set(SP::G7);
+
+  // Also reserve the register pair aliases covering the above
+  // registers, with the same conditions.
+  Reserved.set(SP::G0_G1);
+  if (ReserveAppRegisters)
+    Reserved.set(SP::G2_G3);
+  if (ReserveAppRegisters || !Subtarget.is64Bit())
+    Reserved.set(SP::G4_G5);
+
+  Reserved.set(SP::O6_O7);
+  Reserved.set(SP::I6_I7);
+  Reserved.set(SP::G6_G7);
+
+  // Unaliased double registers are not available in non-V9 targets.
+  if (!Subtarget.isV9()) {
+    for (unsigned n = 0; n != 16; ++n) {
+      for (MCRegAliasIterator AI(SP::D16 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  return Reserved;
+}
+
+const TargetRegisterClass*
+SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+  return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+}
+
+static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
+                      MachineInstr &MI, const DebugLoc &dl,
+                      unsigned FIOperandNum, int Offset, unsigned FramePtr) {
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving G1 all of the time.
+  if (Offset >= 0) {
+    // Emit nonnegaive immediates with sethi + or.
+    // sethi %hi(Offset), %g1
+    // add %g1, %fp, %g1
+    // Insert G1+%lo(offset) into the user.
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(Offset));
+
+
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(FramePtr);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(LO10(Offset));
+    return;
+  }
+
+  // Emit Negative numbers with sethi + xor
+  // sethi %hix(Offset), %g1
+  // xor  %g1, %lox(offset), %g1
+  // add %g1, %fp, %g1
+  // Insert: G1 + 0 into the user.
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(Offset));
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(Offset));
+
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+    .addReg(FramePtr);
+  // Insert: G1+%lo(offset) into the user.
+  MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+}
+
+
+void
+SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
+  const SparcFrameLowering *TFI = getFrameLowering(MF);
+
+  unsigned FrameReg;
+  int Offset;
+  Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
+
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+    if (MI.getOpcode() == SP::STQFri) {
+      const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+      unsigned SrcOddReg  = getSubReg(SrcReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+        .addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
+      replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
+      MI.setDesc(TII.get(SP::STDFri));
+      MI.getOperand(2).setReg(SrcOddReg);
+      Offset += 8;
+    } else if (MI.getOpcode() == SP::LDQFri) {
+      const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+      unsigned DestReg     = MI.getOperand(0).getReg();
+      unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+      unsigned DestOddReg  = getSubReg(DestReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+        .addReg(FrameReg).addImm(0);
+      replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
+
+      MI.setDesc(TII.get(SP::LDDFri));
+      MI.getOperand(0).setReg(DestOddReg);
+      Offset += 8;
+    }
+  }
+
+  replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
+
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return SP::I6;
+}
+
+// Sparc has no architectural need for stack realignment support,
+// except that LLVM unfortunately currently implements overaligned
+// stack objects by depending upon stack realignment support.
+// If that ever changes, this can probably be deleted.
+bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+
+  // Sparc always has a fixed frame pointer register, so don't need to
+  // worry about needing to reserve it. [even if we don't have a frame
+  // pointer for our frame, it still cannot be used for other things,
+  // or register window traps will be SADNESS.]
+
+  // If there's a reserved call frame, we can use SP to access locals.
+  if (getFrameLowering(MF)->hasReservedCallFrame(MF))
+    return true;
+
+  // Otherwise, we'd need a base pointer, but those aren't implemented
+  // for SPARC at the moment.
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 000000000000..2ac51263957e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,50 @@
+//===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SparcGenRegisterInfo.inc"
+
+namespace llvm {
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+  SparcRegisterInfo();
+
+  /// Code Generation virtual methods...
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID CC) const override;
+
+  const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  bool canRealignStack(const MachineFunction &MF) const override;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 000000000000..6ecfddfc7d66
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,377 @@
+//===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Sparc register file
+//===----------------------------------------------------------------------===//
+
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "SP";
+}
+
+class SparcCtrlReg<bits<16> Enc, string n>: Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "SP";
+}
+
+let Namespace = "SP" in {
+def sub_even : SubRegIndex<32>;
+def sub_odd  : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64  : SubRegIndex<64, 64>;
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rdi - pairs of 32-bit integer registers
+class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even, sub_odd];
+  let CoveredBySubRegs = 1;
+}
+// Rf - 32-bit floating-point registers
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even, sub_odd];
+  let CoveredBySubRegs = 1;
+}
+
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even64, sub_odd64];
+  let CoveredBySubRegs = 1;
+}
+
+// Control Registers
+def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code.
+foreach I = 0-3 in
+  def FCC#I : SparcCtrlReg<I, "FCC"#I>;
+
+def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register.
+
+def FQ : SparcCtrlReg<0, "FQ">; // Floating-point deferred-trap queue.
+
+def CPSR : SparcCtrlReg<0, "CPSR">; // Co-processor state register.
+
+def CPQ : SparcCtrlReg<0, "CPQ">; // Co-processor queue.
+
+// Y register
+def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>;
+// Ancillary state registers (implementation defined)
+def ASR1 : SparcCtrlReg<1, "ASR1">;
+def ASR2 : SparcCtrlReg<2, "ASR2">;
+def ASR3 : SparcCtrlReg<3, "ASR3">;
+def ASR4 : SparcCtrlReg<4, "ASR4">;
+def ASR5 : SparcCtrlReg<5, "ASR5">;
+def ASR6 : SparcCtrlReg<6, "ASR6">;
+def ASR7 : SparcCtrlReg<7, "ASR7">;
+def ASR8 : SparcCtrlReg<8, "ASR8">;
+def ASR9 : SparcCtrlReg<9, "ASR9">;
+def ASR10 : SparcCtrlReg<10, "ASR10">;
+def ASR11 : SparcCtrlReg<11, "ASR11">;
+def ASR12 : SparcCtrlReg<12, "ASR12">;
+def ASR13 : SparcCtrlReg<13, "ASR13">;
+def ASR14 : SparcCtrlReg<14, "ASR14">;
+def ASR15 : SparcCtrlReg<15, "ASR15">;
+def ASR16 : SparcCtrlReg<16, "ASR16">;
+def ASR17 : SparcCtrlReg<17, "ASR17">;
+def ASR18 : SparcCtrlReg<18, "ASR18">;
+def ASR19 : SparcCtrlReg<19, "ASR19">;
+def ASR20 : SparcCtrlReg<20, "ASR20">;
+def ASR21 : SparcCtrlReg<21, "ASR21">;
+def ASR22 : SparcCtrlReg<22, "ASR22">;
+def ASR23 : SparcCtrlReg<23, "ASR23">;
+def ASR24 : SparcCtrlReg<24, "ASR24">;
+def ASR25 : SparcCtrlReg<25, "ASR25">;
+def ASR26 : SparcCtrlReg<26, "ASR26">;
+def ASR27 : SparcCtrlReg<27, "ASR27">;
+def ASR28 : SparcCtrlReg<28, "ASR28">;
+def ASR29 : SparcCtrlReg<29, "ASR29">;
+def ASR30 : SparcCtrlReg<30, "ASR30">;
+def ASR31 : SparcCtrlReg<31, "ASR31">;
+
+// Note that PSR, WIM, and TBR don't exist on the SparcV9, only the V8.
+def PSR : SparcCtrlReg<0, "PSR">;
+def WIM : SparcCtrlReg<0, "WIM">;
+def TBR : SparcCtrlReg<0, "TBR">;
+
+def TPC : SparcCtrlReg<0, "TPC">;
+def TNPC : SparcCtrlReg<1, "TNPC">;
+def TSTATE : SparcCtrlReg<2, "TSTATE">;
+def TT : SparcCtrlReg<3, "TT">;
+def TICK : SparcCtrlReg<4, "TICK">;
+def TBA : SparcCtrlReg<5, "TBA">;
+def PSTATE : SparcCtrlReg<6, "PSTATE">;
+def TL : SparcCtrlReg<7, "TL">;
+def PIL : SparcCtrlReg<8, "PIL">;
+def CWP : SparcCtrlReg<9, "CWP">;
+def CANSAVE : SparcCtrlReg<10, "CANSAVE">;
+def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">;
+def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">;
+def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">;
+def WSTATE : SparcCtrlReg<14, "WSTATE">;
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : Rf< 0,  "F0">, DwarfRegNum<[32]>;
+def F1  : Rf< 1,  "F1">, DwarfRegNum<[33]>;
+def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>;
+def F3  : Rf< 3,  "F3">, DwarfRegNum<[35]>;
+def F4  : Rf< 4,  "F4">, DwarfRegNum<[36]>;
+def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>;
+def F6  : Rf< 6,  "F6">, DwarfRegNum<[38]>;
+def F7  : Rf< 7,  "F7">, DwarfRegNum<[39]>;
+def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>;
+def F9  : Rf< 9,  "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[72]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[73]>;
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[74]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[75]>;
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+
+// Co-processor registers
+def C0 : Ri< 0, "C0">;
+def C1 : Ri< 1, "C1">;
+def C2 : Ri< 2, "C2">;
+def C3 : Ri< 3, "C3">;
+def C4 : Ri< 4, "C4">;
+def C5 : Ri< 5, "C5">;
+def C6 : Ri< 6, "C6">;
+def C7 : Ri< 7, "C7">;
+def C8 : Ri< 8, "C8">;
+def C9 : Ri< 9, "C9">;
+def C10 : Ri< 10, "C10">;
+def C11 : Ri< 11, "C11">;
+def C12 : Ri< 12, "C12">;
+def C13 : Ri< 13, "C13">;
+def C14 : Ri< 14, "C14">;
+def C15 : Ri< 15, "C15">;
+def C16 : Ri< 16, "C16">;
+def C17 : Ri< 17, "C17">;
+def C18 : Ri< 18, "C18">;
+def C19 : Ri< 19, "C19">;
+def C20 : Ri< 20, "C20">;
+def C21 : Ri< 21, "C21">;
+def C22 : Ri< 22, "C22">;
+def C23 : Ri< 23, "C23">;
+def C24 : Ri< 24, "C24">;
+def C25 : Ri< 25, "C25">;
+def C26 : Ri< 26, "C26">;
+def C27 : Ri< 27, "C27">;
+def C28 : Ri< 28, "C28">;
+def C29 : Ri< 29, "C29">;
+def C30 : Ri< 30, "C30">;
+def C31 : Ri< 31, "C31">;
+
+// Unaliased double precision floating point registers.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit for values (long doubles).
+def Q0  : Rq< 0,  "F0", [D0,   D1]>;
+def Q1  : Rq< 4,  "F4", [D2,   D3]>;
+def Q2  : Rq< 8,  "F8", [D4,   D5]>;
+def Q3  : Rq<12, "F12", [D6,   D7]>;
+def Q4  : Rq<16, "F16", [D8,   D9]>;
+def Q5  : Rq<20, "F20", [D10, D11]>;
+def Q6  : Rq<24, "F24", [D12, D13]>;
+def Q7  : Rq<28, "F28", [D14, D15]>;
+def Q8  : Rq< 1, "F32", [D16, D17]>;
+def Q9  : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
+// Aliases of the integer registers used for LDD/STD double-word operations
+def G0_G1 : Rdi<0, "G0", [G0, G1]>;
+def G2_G3 : Rdi<2, "G2", [G2, G3]>;
+def G4_G5 : Rdi<4, "G4", [G4, G5]>;
+def G6_G7 : Rdi<6, "G6", [G6, G7]>;
+def O0_O1 : Rdi<8, "O0", [O0, O1]>;
+def O2_O3 : Rdi<10, "O2", [O2, O3]>;
+def O4_O5 : Rdi<12, "O4", [O4, O5]>;
+def O6_O7 : Rdi<14, "O6", [O6, O7]>;
+def L0_L1 : Rdi<16, "L0", [L0, L1]>;
+def L2_L3 : Rdi<18, "L2", [L2, L3]>;
+def L4_L5 : Rdi<20, "L4", [L4, L5]>;
+def L6_L7 : Rdi<22, "L6", [L6, L7]>;
+def I0_I1 : Rdi<24, "I0", [I0, I1]>;
+def I2_I3 : Rdi<26, "I2", [I2, I3]>;
+def I4_I5 : Rdi<28, "I4", [I4, I5]>;
+def I6_I7 : Rdi<30, "I6", [I6, I7]>;
+
+// Aliases of the co-processor registers used for LDD/STD double-word operations
+def C0_C1 : Rdi<0, "C0", [C0, C1]>;
+def C2_C3 : Rdi<2, "C2", [C2, C3]>;
+def C4_C5 : Rdi<4, "C4", [C4, C5]>;
+def C6_C7 : Rdi<6, "C6", [C6, C7]>;
+def C8_C9 : Rdi<8, "C8", [C8, C9]>;
+def C10_C11 : Rdi<10, "C10", [C10, C11]>;
+def C12_C13 : Rdi<12, "C12", [C12, C13]>;
+def C14_C15 : Rdi<14, "C14", [C14, C15]>;
+def C16_C17 : Rdi<16, "C16", [C16, C17]>;
+def C18_C19 : Rdi<18, "C18", [C18, C19]>;
+def C20_C21 : Rdi<20, "C20", [C20, C21]>;
+def C22_C23 : Rdi<22, "C22", [C22, C23]>;
+def C24_C25 : Rdi<24, "C24", [C24, C25]>;
+def C26_C27 : Rdi<26, "C26", [C26, C27]>;
+def C28_C29 : Rdi<28, "C28", [C28, C29]>;
+def C30_C31 : Rdi<30, "C30", [C30, C31]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+// This register class should not be used to hold i64 values, use the I64Regs
+// register class for that. The i64 type is included here to allow i64 patterns
+// using the integer instructions.
+def IntRegs : RegisterClass<"SP", [i32, i64], 32,
+                            (add (sequence "I%u", 0, 7),
+                                 (sequence "G%u", 0, 7),
+                                 (sequence "L%u", 0, 7),
+                                 (sequence "O%u", 0, 7))>;
+
+// Should be in the same order as IntRegs.
+def IntPair : RegisterClass<"SP", [v2i32], 64,
+    (add I0_I1, I2_I3, I4_I5, I6_I7,
+         G0_G1, G2_G3, G4_G5, G6_G7,
+         L0_L1, L2_L3, L4_L5, L6_L7,
+         O0_O1, O2_O3, O4_O5, O6_O7)>;
+
+// Register class for 64-bit mode, with a 64-bit spill slot size.
+// These are the same as the 32-bit registers, so TableGen will consider this
+// to be a sub-class of IntRegs. That works out because requiring a 64-bit
+// spill slot is a stricter constraint than only requiring a 32-bit spill slot.
+def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
+
+// Floating point register classes.
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+
+// Floating point control register classes.
+def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
+
+let isAllocatable = 0 in {
+  // Ancillary state registers
+  def ASRRegs : RegisterClass<"SP", [i32], 32,
+                              (add Y, (sequence "ASR%u", 1, 31))>;
+                            
+  // This register class should not be used to hold i64 values.
+  def CoprocRegs : RegisterClass<"SP", [i32], 32,
+                                (add (sequence "C%u", 0, 31))>;
+
+  // Should be in the same order as CoprocRegs.
+  def CoprocPair : RegisterClass<"SP", [v2i32], 64,
+    (add C0_C1,   C2_C3,   C4_C5,   C6_C7,   
+         C8_C9,   C10_C11, C12_C13, C14_C15,
+         C16_C17, C18_C19, C20_C21, C22_C23,
+         C24_C25, C26_C27, C28_C29, C30_C31)>;
+}
+
+// Privileged Registers
+def PRRegs : RegisterClass<"SP", [i64], 64,
+    (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP,
+         CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSchedule.td b/contrib/llvm/lib/Target/Sparc/SparcSchedule.td
new file mode 100755
index 000000000000..f243546b029b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSchedule.td
@@ -0,0 +1,124 @@
+//===-- SparcSchedule.td - Describe the Sparc Itineries ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+def IIC_iu_or_fpu_instr : InstrItinClass;
+def IIC_iu_instr : InstrItinClass;
+def IIC_fpu_normal_instr : InstrItinClass;
+def IIC_fpu_fast_instr : InstrItinClass;
+def IIC_jmp_or_call : InstrItinClass;
+def IIC_ldd : InstrItinClass;
+def IIC_st : InstrItinClass;
+def IIC_std : InstrItinClass;
+def IIC_iu_smul : InstrItinClass;
+def IIC_iu_umul : InstrItinClass;
+def IIC_iu_div : InstrItinClass;
+def IIC_ticc : InstrItinClass;
+def IIC_ldstub : InstrItinClass;
+def IIC_fpu_muls : InstrItinClass;
+def IIC_fpu_muld : InstrItinClass;
+def IIC_fpu_divs : InstrItinClass;
+def IIC_fpu_divd : InstrItinClass;
+def IIC_fpu_sqrts : InstrItinClass;
+def IIC_fpu_sqrtd : InstrItinClass;
+def IIC_fpu_abs : InstrItinClass;
+def IIC_fpu_movs : InstrItinClass;
+def IIC_fpu_negs : InstrItinClass;
+def IIC_smac_umac : InstrItinClass;
+def IIC_fpu_stod : InstrItinClass;
+
+def LEONIU : FuncUnit; // integer unit
+def LEONFPU : FuncUnit; // floating-point unit
+
+// Ref: http://www.atmel.com/Images/doc4226.pdf
+
+def LEON2Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+  InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+  InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+  InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+  InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+  InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [5, 1]>,
+  InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+  InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+  InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [21, 1]>,
+  InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [20, 1]>,
+  InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [36, 1]>,
+  InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [37, 1]>,
+  InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [65, 1]>,
+  InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [2, 1]>
+]>;
+
+def LEON3Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+  InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+  InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+  InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+  InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+  InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+  InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+  InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+  InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+  InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+  InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+  InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+  InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+  InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
+
+def LEON4Itineraries : ProcessorItineraries<
+[LEONIU, LEONFPU], [], [
+  InstrItinData<IIC_iu_or_fpu_instr, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_iu_instr, [InstrStage<1, [LEONIU]>], [1, 1]>,
+  InstrItinData<IIC_fpu_normal_instr, [InstrStage<1, [LEONFPU]>], [7, 1]>,
+  InstrItinData<IIC_fpu_fast_instr, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_jmp_or_call, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_ldd, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_st, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_std, [InstrStage<1, [LEONIU, LEONFPU]>], [1, 1]>,
+  InstrItinData<IIC_iu_smul, [InstrStage<1, [LEONIU]>], [1, 1]>,
+  InstrItinData<IIC_iu_umul, [InstrStage<1, [LEONIU]>], [4, 1]>,
+  InstrItinData<IIC_iu_div, [InstrStage<1, [LEONIU]>], [35, 1]>,
+  InstrItinData<IIC_smac_umac, [InstrStage<1, [LEONIU]>], [2, 1]>,
+  InstrItinData<IIC_ticc, [InstrStage<1, [LEONIU, LEONFPU]>], [5, 1]>,
+  InstrItinData<IIC_ldstub, [InstrStage<1, [LEONIU, LEONFPU]>], [3, 1]>,
+  InstrItinData<IIC_fpu_muls, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_fpu_muld, [InstrStage<1, [LEONFPU]>], [4, 1]>,
+  InstrItinData<IIC_fpu_divs, [InstrStage<1, [LEONFPU]>], [16, 1]>,
+  InstrItinData<IIC_fpu_divd, [InstrStage<1, [LEONFPU]>], [17, 1]>,
+  InstrItinData<IIC_fpu_sqrts, [InstrStage<1, [LEONFPU]>], [24, 1]>,
+  InstrItinData<IIC_fpu_sqrtd, [InstrStage<1, [LEONFPU]>], [25, 1]>,
+  InstrItinData<IIC_fpu_abs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_movs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_negs, [InstrStage<1, [LEONFPU]>], [2, 1]>,
+  InstrItinData<IIC_fpu_stod, [InstrStage<1, [LEONFPU]>], [4, 1]>
+]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 000000000000..43ddef3cc96e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,99 @@
+//===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SparcGenSubtargetInfo.inc"
+
+void SparcSubtarget::anchor() { }
+
+SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                                StringRef FS) {
+  IsV9 = false;
+  IsLeon = false;
+  V8DeprecatedInsts = false;
+  IsVIS = false;
+  HasHardQuad = false;
+  UsePopc = false;
+  UseSoftFloat = false;
+
+  // Leon features
+  HasLeonCasa = false;
+  HasUmacSmac = false;
+  PerformSDIVReplace = false;
+  InsertNOPLoad = false;
+  FixFSMULD = false;
+  ReplaceFMULS = false;
+  FixAllFDIVSQRT = false;
+  DetectRoundChange = false;
+
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = (Is64Bit) ? "v9" : "v8";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // Popc is a v9-only instruction.
+  if (!IsV9)
+    UsePopc = false;
+
+  return *this;
+}
+
+SparcSubtarget::SparcSubtarget(const Triple &TT, const std::string &CPU,
+                               const std::string &FS, const TargetMachine &TM,
+                               bool is64Bit)
+    : SparcGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), Is64Bit(is64Bit),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+      FrameLowering(*this) {}
+
+int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
+
+  if (is64Bit()) {
+    // All 64-bit stack frames must be 16-byte aligned, and must reserve space
+    // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
+    frameSize += 128;
+    // Frames with calls must also reserve space for 6 outgoing arguments
+    // whether they are used or not. LowerCall_64 takes care of that.
+    frameSize = alignTo(frameSize, 16);
+  } else {
+    // Emit the correct save instruction based on the number of bytes in
+    // the frame. Minimum stack frame size according to V8 ABI is:
+    //   16 words for register window spill
+    //    1 word for address of returned aggregate-value
+    // +  6 words for passing parameters on the stack
+    // ----------
+    //   23 words * 4 bytes per word = 92 bytes
+    frameSize += 92;
+
+    // Round up to next doubleword boundary -- a double-word boundary
+    // is required by the ABI.
+    frameSize = alignTo(frameSize, 8);
+  }
+  return frameSize;
+}
+
+bool SparcSubtarget::enableMachineScheduler() const {
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 000000000000..fa42da425ff2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,122 @@
+//===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+#define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+
+#include "SparcFrameLowering.h"
+#include "SparcISelLowering.h"
+#include "SparcInstrInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SparcGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class SparcSubtarget : public SparcGenSubtargetInfo {
+  Triple TargetTriple;
+  virtual void anchor();
+  bool IsV9;
+  bool IsLeon;
+  bool V8DeprecatedInsts;
+  bool IsVIS, IsVIS2, IsVIS3;
+  bool Is64Bit;
+  bool HasHardQuad;
+  bool UsePopc;
+  bool UseSoftFloat;
+
+  // LEON features
+  bool HasUmacSmac;
+  bool HasLeonCasa;
+  bool InsertNOPLoad;
+  bool FixFSMULD;
+  bool ReplaceFMULS;
+  bool FixAllFDIVSQRT;
+  bool DetectRoundChange;
+  bool PerformSDIVReplace;
+
+  SparcInstrInfo InstrInfo;
+  SparcTargetLowering TLInfo;
+  SelectionDAGTargetInfo TSInfo;
+  SparcFrameLowering FrameLowering;
+
+public:
+  SparcSubtarget(const Triple &TT, const std::string &CPU,
+                 const std::string &FS, const TargetMachine &TM, bool is64bit);
+
+  const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const SparcRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const SparcTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  bool enableMachineScheduler() const override;
+
+  bool isV9() const { return IsV9; }
+  bool isLeon() const { return IsLeon; }
+  bool isVIS() const { return IsVIS; }
+  bool isVIS2() const { return IsVIS2; }
+  bool isVIS3() const { return IsVIS3; }
+  bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  bool hasHardQuad() const { return HasHardQuad; }
+  bool usePopc() const { return UsePopc; }
+  bool useSoftFloat() const { return UseSoftFloat; }
+
+  // Leon options
+  bool hasUmacSmac() const { return HasUmacSmac; }
+  bool performSDIVReplace() const { return PerformSDIVReplace; }
+  bool hasLeonCasa() const { return HasLeonCasa; }
+  bool insertNOPLoad() const { return InsertNOPLoad; }
+  bool fixFSMULD() const { return FixFSMULD; }
+  bool replaceFMULS() const { return ReplaceFMULS; }
+  bool fixAllFDIVSQRT() const { return FixAllFDIVSQRT; }
+  bool detectRoundChange() const { return DetectRoundChange; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  SparcSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+  bool is64Bit() const { return Is64Bit; }
+
+  /// The 64-bit ABI uses biased stack and frame pointers, so the stack frame
+  /// of the current function is the area from [%sp+BIAS] to [%fp+BIAS].
+  int64_t getStackPointerBias() const {
+    return is64Bit() ? 2047 : 0;
+  }
+
+  /// Given a actual stack size as determined by FrameInfo, this function
+  /// returns adjusted framesize which includes space for register window
+  /// spills and arguments.
+  int getAdjustedFrameSize(int stackSize) const;
+
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 000000000000..4ae64062d9e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,197 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
+#include "Sparc.h"
+#include "LeonPasses.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSparcTarget() {
+  // Register the target.
+  RegisterTargetMachine<SparcV8TargetMachine> X(getTheSparcTarget());
+  RegisterTargetMachine<SparcV9TargetMachine> Y(getTheSparcV9Target());
+  RegisterTargetMachine<SparcelTargetMachine> Z(getTheSparcelTarget());
+}
+
+static std::string computeDataLayout(const Triple &T, bool is64Bit) {
+  // Sparc is typically big endian, but some are little.
+  std::string Ret = T.getArch() == Triple::sparcel ? "e" : "E";
+  Ret += "-m:e";
+
+  // Some ABIs have 32bit pointers.
+  if (!is64Bit)
+    Ret += "-p:32:32";
+
+  // Alignments for 64 bit integers.
+  Ret += "-i64:64";
+
+  // On SparcV9 128 floats are aligned to 128 bits, on others only to 64.
+  // On SparcV9 registers can hold 64 or 32 bits, on others only 32.
+  if (is64Bit)
+    Ret += "-n32:64";
+  else
+    Ret += "-f128:64-n32";
+
+  if (is64Bit)
+    Ret += "-S128";
+  else
+    Ret += "-S64";
+
+  return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+/// Create an ILP32 architecture model
+SparcTargetMachine::SparcTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL, bool is64bit)
+    : LLVMTargetMachine(T, computeDataLayout(TT, is64bit), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM), CM, OL),
+      TLOF(make_unique<SparcELFTargetObjectFile>()),
+      Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
+  initAsmInfo();
+}
+
+SparcTargetMachine::~SparcTargetMachine() {}
+
+const SparcSubtarget * 
+SparcTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function, so we can enable it as a subtarget feature.
+  bool softFloat =
+      F.hasFnAttribute("use-soft-float") &&
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+
+  if (softFloat)         
+    FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<SparcSubtarget>(TargetTriple, CPU, FS, *this,
+                                          this->is64Bit);
+  }
+  return I.get();
+}
+
+namespace {
+/// Sparc Code Generator Pass Configuration Options.
+class SparcPassConfig : public TargetPassConfig {
+public:
+  SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SparcTargetMachine &getSparcTargetMachine() const {
+    return getTM<SparcTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SparcPassConfig(this, PM);
+}
+
+void SparcPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getSparcTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool SparcPassConfig::addInstSelector() {
+  addPass(createSparcISelDag(getSparcTargetMachine()));
+  return false;
+}
+
+void SparcPassConfig::addPreEmitPass(){
+  addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+
+  if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPLoad())
+  {
+    addPass(new InsertNOPLoad(getSparcTargetMachine()));
+  }
+  if (this->getSparcTargetMachine().getSubtargetImpl()->fixFSMULD())
+  {
+    addPass(new FixFSMULD(getSparcTargetMachine()));
+  }
+  if (this->getSparcTargetMachine().getSubtargetImpl()->replaceFMULS())
+  {
+    addPass(new ReplaceFMULS(getSparcTargetMachine()));
+  }
+  if (this->getSparcTargetMachine().getSubtargetImpl()->detectRoundChange()) {
+    addPass(new DetectRoundChange(getSparcTargetMachine()));
+  }
+  if (this->getSparcTargetMachine().getSubtargetImpl()->fixAllFDIVSQRT())
+  {
+    addPass(new FixAllFDIVSQRT(getSparcTargetMachine()));
+  }
+}
+
+void SparcV8TargetMachine::anchor() { }
+
+SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void SparcV9TargetMachine::anchor() { }
+
+SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+void SparcelTargetMachine::anchor() {}
+
+SparcelTargetMachine::SparcelTargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 000000000000..48193fe095be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,79 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
+
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SparcTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  SparcSubtarget Subtarget;
+  bool is64Bit;
+  mutable StringMap<std::unique_ptr<SparcSubtarget>> SubtargetMap;
+public:
+  SparcTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL, bool is64bit);
+  ~SparcTargetMachine() override;
+
+  const SparcSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const SparcSubtarget *getSubtargetImpl(const Function &) const override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+
+/// Sparc 32-bit target machine
+///
+class SparcV8TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV8TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+/// Sparc 64-bit target machine
+///
+class SparcV9TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV9TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class SparcelTargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+
+public:
+  SparcelTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
new file mode 100644
index 000000000000..8fdde15d8d27
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -0,0 +1,42 @@
+//===------- SparcTargetObjectFile.cpp - Sparc Object Info Impl -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetObjectFile.h"
+#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+
+  if (Encoding & dwarf::DW_EH_PE_pcrel) {
+    MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
+
+    // Add information about the stub reference to ELFMMI so that the stub
+    // gets emitted by the asmprinter.
+    MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+    if (!StubSym.getPointer()) {
+      MCSymbol *Sym = TM.getSymbol(GV);
+      StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+    }
+
+    MCContext &Ctx = getContext();
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(SSym, Ctx), Ctx);
+  }
+
+  return TargetLoweringObjectFileELF::getTTypeGlobalReference(GV, Encoding, TM,
+                                                              MMI, Streamer);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
new file mode 100644
index 000000000000..fe8800625a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- SparcTargetObjectFile.h - Sparc Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF {
+public:
+  SparcELFTargetObjectFile() :
+    TargetLoweringObjectFileELF()
+  {}
+
+  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                        unsigned Encoding,
+                                        const TargetMachine &TM,
+                                        MachineModuleInfo *MMI,
+                                        MCStreamer &Streamer) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
new file mode 100644
index 000000000000..3b503503abce
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetStreamer.h
@@ -0,0 +1,49 @@
+//===-- SparcTargetStreamer.h - Sparc Target Streamer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class SparcTargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+
+public:
+  SparcTargetStreamer(MCStreamer &S);
+  /// Emit ".register <reg>, #ignore".
+  virtual void emitSparcRegisterIgnore(unsigned reg) = 0;
+  /// Emit ".register <reg>, #scratch".
+  virtual void emitSparcRegisterScratch(unsigned reg) = 0;
+};
+
+// This part is for ascii assembly output
+class SparcTargetAsmStreamer : public SparcTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void emitSparcRegisterIgnore(unsigned reg) override;
+  void emitSparcRegisterScratch(unsigned reg) override;
+
+};
+
+// This part is for ELF object output
+class SparcTargetELFStreamer : public SparcTargetStreamer {
+public:
+  SparcTargetELFStreamer(MCStreamer &S);
+  MCELFStreamer &getStreamer();
+  void emitSparcRegisterIgnore(unsigned reg) override {}
+  void emitSparcRegisterScratch(unsigned reg) override {}
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 000000000000..66178acd52ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,35 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheSparcTarget() {
+  static Target TheSparcTarget;
+  return TheSparcTarget;
+}
+Target &llvm::getTheSparcV9Target() {
+  static Target TheSparcV9Target;
+  return TheSparcV9Target;
+}
+Target &llvm::getTheSparcelTarget() {
+  static Target TheSparcelTarget;
+  return TheSparcelTarget;
+}
+
+extern "C" void LLVMInitializeSparcTargetInfo() {
+  RegisterTarget<Triple::sparc, /*HasJIT=*/true> X(getTheSparcTarget(), "sparc",
+                                                   "Sparc");
+  RegisterTarget<Triple::sparcv9, /*HasJIT=*/true> Y(getTheSparcV9Target(),
+                                                     "sparcv9", "Sparc V9");
+  RegisterTarget<Triple::sparcel, /*HasJIT=*/true> Z(getTheSparcelTarget(),
+                                                     "sparcel", "Sparc LE");
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
new file mode 100644
index 000000000000..a94717c93456
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -0,0 +1,1259 @@
+//===-- SystemZAsmParser.cpp - Parse SystemZ assembly instructions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return true if Expr is in the range [MinValue, MaxValue].
+static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
+  if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+    int64_t Value = CE->getValue();
+    return Value >= MinValue && Value <= MaxValue;
+  }
+  return false;
+}
+
+namespace {
+enum RegisterKind {
+  GR32Reg,
+  GRH32Reg,
+  GR64Reg,
+  GR128Reg,
+  ADDR32Reg,
+  ADDR64Reg,
+  FP32Reg,
+  FP64Reg,
+  FP128Reg,
+  VR32Reg,
+  VR64Reg,
+  VR128Reg,
+  AR32Reg,
+};
+
+enum MemoryKind {
+  BDMem,
+  BDXMem,
+  BDLMem,
+  BDRMem,
+  BDVMem
+};
+
+class SystemZOperand : public MCParsedAsmOperand {
+public:
+private:
+  enum OperandKind {
+    KindInvalid,
+    KindToken,
+    KindReg,
+    KindImm,
+    KindImmTLS,
+    KindMem
+  };
+
+  OperandKind Kind;
+  SMLoc StartLoc, EndLoc;
+
+  // A string of length Length, starting at Data.
+  struct TokenOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  // LLVM register Num, which has kind Kind.  In some ways it might be
+  // easier for this class to have a register bank (general, floating-point
+  // or access) and a raw register number (0-15).  This would postpone the
+  // interpretation of the operand to the add*() methods and avoid the need
+  // for context-dependent parsing.  However, we do things the current way
+  // because of the virtual getReg() method, which needs to distinguish
+  // between (say) %r0 used as a single register and %r0 used as a pair.
+  // Context-dependent parsing can also give us slightly better error
+  // messages when invalid pairs like %r1 are used.
+  struct RegOp {
+    RegisterKind Kind;
+    unsigned Num;
+  };
+
+  // Base + Disp + Index, where Base and Index are LLVM registers or 0.
+  // MemKind says what type of memory this is and RegKind says what type
+  // the base register has (ADDR32Reg or ADDR64Reg).  Length is the operand
+  // length for D(L,B)-style operands, otherwise it is null.
+  struct MemOp {
+    unsigned Base : 12;
+    unsigned Index : 12;
+    unsigned MemKind : 4;
+    unsigned RegKind : 4;
+    const MCExpr *Disp;
+    union {
+      const MCExpr *Imm;
+      unsigned Reg;
+    } Length;
+  };
+
+  // Imm is an immediate operand, and Sym is an optional TLS symbol
+  // for use with a __tls_get_offset marker relocation.
+  struct ImmTLSOp {
+    const MCExpr *Imm;
+    const MCExpr *Sym;
+  };
+
+  union {
+    TokenOp Token;
+    RegOp Reg;
+    const MCExpr *Imm;
+    ImmTLSOp ImmTLS;
+    MemOp Mem;
+  };
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+public:
+  SystemZOperand(OperandKind kind, SMLoc startLoc, SMLoc endLoc)
+      : Kind(kind), StartLoc(startLoc), EndLoc(endLoc) {}
+
+  // Create particular kinds of operand.
+  static std::unique_ptr<SystemZOperand> createInvalid(SMLoc StartLoc,
+                                                       SMLoc EndLoc) {
+    return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
+  }
+  static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
+    auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
+    Op->Token.Data = Str.data();
+    Op->Token.Length = Str.size();
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
+    Op->Reg.Kind = Kind;
+    Op->Reg.Num = Num;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
+    Op->Imm = Expr;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
+            const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
+            unsigned LengthReg, SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindMem, StartLoc, EndLoc);
+    Op->Mem.MemKind = MemKind;
+    Op->Mem.RegKind = RegKind;
+    Op->Mem.Base = Base;
+    Op->Mem.Index = Index;
+    Op->Mem.Disp = Disp;
+    if (MemKind == BDLMem)
+      Op->Mem.Length.Imm = LengthImm;
+    if (MemKind == BDRMem)
+      Op->Mem.Length.Reg = LengthReg;
+    return Op;
+  }
+  static std::unique_ptr<SystemZOperand>
+  createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
+               SMLoc StartLoc, SMLoc EndLoc) {
+    auto Op = make_unique<SystemZOperand>(KindImmTLS, StartLoc, EndLoc);
+    Op->ImmTLS.Imm = Imm;
+    Op->ImmTLS.Sym = Sym;
+    return Op;
+  }
+
+  // Token operands
+  bool isToken() const override {
+    return Kind == KindToken;
+  }
+  StringRef getToken() const {
+    assert(Kind == KindToken && "Not a token");
+    return StringRef(Token.Data, Token.Length);
+  }
+
+  // Register operands.
+  bool isReg() const override {
+    return Kind == KindReg;
+  }
+  bool isReg(RegisterKind RegKind) const {
+    return Kind == KindReg && Reg.Kind == RegKind;
+  }
+  unsigned getReg() const override {
+    assert(Kind == KindReg && "Not a register");
+    return Reg.Num;
+  }
+
+  // Immediate operands.
+  bool isImm() const override {
+    return Kind == KindImm;
+  }
+  bool isImm(int64_t MinValue, int64_t MaxValue) const {
+    return Kind == KindImm && inRange(Imm, MinValue, MaxValue);
+  }
+  const MCExpr *getImm() const {
+    assert(Kind == KindImm && "Not an immediate");
+    return Imm;
+  }
+
+  // Immediate operands with optional TLS symbol.
+  bool isImmTLS() const {
+    return Kind == KindImmTLS;
+  }
+
+  // Memory operands.
+  bool isMem() const override {
+    return Kind == KindMem;
+  }
+  bool isMem(MemoryKind MemKind) const {
+    return (Kind == KindMem &&
+            (Mem.MemKind == MemKind ||
+             // A BDMem can be treated as a BDXMem in which the index
+             // register field is 0.
+             (Mem.MemKind == BDMem && MemKind == BDXMem)));
+  }
+  bool isMem(MemoryKind MemKind, RegisterKind RegKind) const {
+    return isMem(MemKind) && Mem.RegKind == RegKind;
+  }
+  bool isMemDisp12(MemoryKind MemKind, RegisterKind RegKind) const {
+    return isMem(MemKind, RegKind) && inRange(Mem.Disp, 0, 0xfff);
+  }
+  bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
+    return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
+  }
+  bool isMemDisp12Len8(RegisterKind RegKind) const {
+    return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
+  }
+
+  // Override MCParsedAsmOperand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  SMLoc getEndLoc() const override { return EndLoc; }
+  void print(raw_ostream &OS) const override;
+
+  // Used by the TableGen code to add particular types of operand
+  // to an instruction.
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    addExpr(Inst, getImm());
+  }
+  void addBDAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands");
+    assert(isMem(BDMem) && "Invalid operand type");
+    Inst.addOperand(MCOperand::createReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+  }
+  void addBDXAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(isMem(BDXMem) && "Invalid operand type");
+    Inst.addOperand(MCOperand::createReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    Inst.addOperand(MCOperand::createReg(Mem.Index));
+  }
+  void addBDLAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(isMem(BDLMem) && "Invalid operand type");
+    Inst.addOperand(MCOperand::createReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    addExpr(Inst, Mem.Length.Imm);
+  }
+  void addBDRAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(isMem(BDRMem) && "Invalid operand type");
+    Inst.addOperand(MCOperand::createReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    Inst.addOperand(MCOperand::createReg(Mem.Length.Reg));
+  }
+  void addBDVAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(isMem(BDVMem) && "Invalid operand type");
+    Inst.addOperand(MCOperand::createReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    Inst.addOperand(MCOperand::createReg(Mem.Index));
+  }
+  void addImmTLSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands");
+    assert(Kind == KindImmTLS && "Invalid operand type");
+    addExpr(Inst, ImmTLS.Imm);
+    if (ImmTLS.Sym)
+      addExpr(Inst, ImmTLS.Sym);
+  }
+
+  // Used by the TableGen code to check for particular operand types.
+  bool isGR32() const { return isReg(GR32Reg); }
+  bool isGRH32() const { return isReg(GRH32Reg); }
+  bool isGRX32() const { return false; }
+  bool isGR64() const { return isReg(GR64Reg); }
+  bool isGR128() const { return isReg(GR128Reg); }
+  bool isADDR32() const { return isReg(ADDR32Reg); }
+  bool isADDR64() const { return isReg(ADDR64Reg); }
+  bool isADDR128() const { return false; }
+  bool isFP32() const { return isReg(FP32Reg); }
+  bool isFP64() const { return isReg(FP64Reg); }
+  bool isFP128() const { return isReg(FP128Reg); }
+  bool isVR32() const { return isReg(VR32Reg); }
+  bool isVR64() const { return isReg(VR64Reg); }
+  bool isVF128() const { return false; }
+  bool isVR128() const { return isReg(VR128Reg); }
+  bool isAR32() const { return isReg(AR32Reg); }
+  bool isAnyReg() const { return (isReg() || isImm(0, 15)); }
+  bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); }
+  bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); }
+  bool isBDAddr64Disp12() const { return isMemDisp12(BDMem, ADDR64Reg); }
+  bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
+  bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
+  bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+  bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
+  bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
+  bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
+  bool isU1Imm() const { return isImm(0, 1); }
+  bool isU2Imm() const { return isImm(0, 3); }
+  bool isU3Imm() const { return isImm(0, 7); }
+  bool isU4Imm() const { return isImm(0, 15); }
+  bool isU6Imm() const { return isImm(0, 63); }
+  bool isU8Imm() const { return isImm(0, 255); }
+  bool isS8Imm() const { return isImm(-128, 127); }
+  bool isU12Imm() const { return isImm(0, 4095); }
+  bool isU16Imm() const { return isImm(0, 65535); }
+  bool isS16Imm() const { return isImm(-32768, 32767); }
+  bool isU32Imm() const { return isImm(0, (1LL << 32) - 1); }
+  bool isS32Imm() const { return isImm(-(1LL << 31), (1LL << 31) - 1); }
+  bool isU48Imm() const { return isImm(0, (1LL << 48) - 1); }
+};
+
+class SystemZAsmParser : public MCTargetAsmParser {
+#define GET_ASSEMBLER_HEADER
+#include "SystemZGenAsmMatcher.inc"
+
+private:
+  MCAsmParser &Parser;
+  enum RegisterGroup {
+    RegGR,
+    RegFP,
+    RegV,
+    RegAR
+  };
+  struct Register {
+    RegisterGroup Group;
+    unsigned Num;
+    SMLoc StartLoc, EndLoc;
+  };
+
+  bool parseRegister(Register &Reg);
+
+  bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
+                     bool IsAddress = false);
+
+  OperandMatchResultTy parseRegister(OperandVector &Operands,
+                                     RegisterGroup Group, const unsigned *Regs,
+                                     RegisterKind Kind);
+
+  OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
+
+  bool parseAddress(bool &HaveReg1, Register &Reg1,
+                    bool &HaveReg2, Register &Reg2,
+                    const MCExpr *&Disp, const MCExpr *&Length);
+  bool parseAddressRegister(Register &Reg);
+
+  bool ParseDirectiveInsn(SMLoc L);
+
+  OperandMatchResultTy parseAddress(OperandVector &Operands,
+                                    MemoryKind MemKind, const unsigned *Regs,
+                                    RegisterKind RegKind);
+
+  OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
+                                  int64_t MaxVal, bool AllowTLS);
+
+  bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
+
+public:
+  SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+                   const MCInstrInfo &MII,
+                   const MCTargetOptions &Options)
+    : MCTargetAsmParser(Options, sti), Parser(parser) {
+    MCAsmParserExtension::Initialize(Parser);
+
+    // Alias the .word directive to .short.
+    parser.addAliasForDirective(".word", ".short");
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+  }
+
+  // Override MCTargetAsmParser.
+  bool ParseDirective(AsmToken DirectiveID) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  // Used by the TableGen code to parse particular operand types.
+  OperandMatchResultTy parseGR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
+  }
+  OperandMatchResultTy parseGRH32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+  }
+  OperandMatchResultTy parseGRX32(OperandVector &Operands) {
+    llvm_unreachable("GRX32 should only be used for pseudo instructions");
+  }
+  OperandMatchResultTy parseGR64(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
+  }
+  OperandMatchResultTy parseGR128(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
+  }
+  OperandMatchResultTy parseADDR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
+  }
+  OperandMatchResultTy parseADDR64(OperandVector &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseADDR128(OperandVector &Operands) {
+    llvm_unreachable("Shouldn't be used as an operand");
+  }
+  OperandMatchResultTy parseFP32(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
+  }
+  OperandMatchResultTy parseFP64(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
+  }
+  OperandMatchResultTy parseFP128(OperandVector &Operands) {
+    return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
+  }
+  OperandMatchResultTy parseVR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegV, SystemZMC::VR32Regs, VR32Reg);
+  }
+  OperandMatchResultTy parseVR64(OperandVector &Operands) {
+    return parseRegister(Operands, RegV, SystemZMC::VR64Regs, VR64Reg);
+  }
+  OperandMatchResultTy parseVF128(OperandVector &Operands) {
+    llvm_unreachable("Shouldn't be used as an operand");
+  }
+  OperandMatchResultTy parseVR128(OperandVector &Operands) {
+    return parseRegister(Operands, RegV, SystemZMC::VR128Regs, VR128Reg);
+  }
+  OperandMatchResultTy parseAR32(OperandVector &Operands) {
+    return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg);
+  }
+  OperandMatchResultTy parseAnyReg(OperandVector &Operands) {
+    return parseAnyRegister(Operands);
+  }
+  OperandMatchResultTy parseBDAddr32(OperandVector &Operands) {
+    return parseAddress(Operands, BDMem, SystemZMC::GR32Regs, ADDR32Reg);
+  }
+  OperandMatchResultTy parseBDAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, BDMem, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, BDXMem, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, BDLMem, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseBDRAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, BDRMem, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parseBDVAddr64(OperandVector &Operands) {
+    return parseAddress(Operands, BDVMem, SystemZMC::GR64Regs, ADDR64Reg);
+  }
+  OperandMatchResultTy parsePCRel12(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 12), (1LL << 12) - 1, false);
+  }
+  OperandMatchResultTy parsePCRel16(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, false);
+  }
+  OperandMatchResultTy parsePCRel24(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 24), (1LL << 24) - 1, false);
+  }
+  OperandMatchResultTy parsePCRel32(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, false);
+  }
+  OperandMatchResultTy parsePCRelTLS16(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, true);
+  }
+  OperandMatchResultTy parsePCRelTLS32(OperandVector &Operands) {
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
+  }
+};
+} // end anonymous namespace
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "SystemZGenAsmMatcher.inc"
+
+// Used for the .insn directives; contains information needed to parse the
+// operands in the directive.
+struct InsnMatchEntry {
+  StringRef Format;
+  uint64_t Opcode;
+  int32_t NumOperands;
+  MatchClassKind OperandKinds[5];
+};
+
+// For equal_range comparison.
+struct CompareInsn {
+  bool operator() (const InsnMatchEntry &LHS, StringRef RHS) {
+    return LHS.Format < RHS;
+  }
+  bool operator() (StringRef LHS, const InsnMatchEntry &RHS) {
+    return LHS < RHS.Format;
+  }
+  bool operator() (const InsnMatchEntry &LHS, const InsnMatchEntry &RHS) {
+    return LHS.Format < RHS.Format;
+  }
+};
+
+// Table initializing information for parsing the .insn directive.
+static struct InsnMatchEntry InsnMatchTable[] = {
+  /* Format, Opcode, NumOperands, OperandKinds */
+  { "e", SystemZ::InsnE, 1,
+    { MCK_U16Imm } },
+  { "ri", SystemZ::InsnRI, 3,
+    { MCK_U32Imm, MCK_AnyReg, MCK_S16Imm } },
+  { "rie", SystemZ::InsnRIE, 4,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_PCRel16 } },
+  { "ril", SystemZ::InsnRIL, 3,
+    { MCK_U48Imm, MCK_AnyReg, MCK_PCRel32 } },
+  { "rilu", SystemZ::InsnRILU, 3,
+    { MCK_U48Imm, MCK_AnyReg, MCK_U32Imm } },
+  { "ris", SystemZ::InsnRIS, 5,
+    { MCK_U48Imm, MCK_AnyReg, MCK_S8Imm, MCK_U4Imm, MCK_BDAddr64Disp12 } },
+  { "rr", SystemZ::InsnRR, 3,
+    { MCK_U16Imm, MCK_AnyReg, MCK_AnyReg } },
+  { "rre", SystemZ::InsnRRE, 3,
+    { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg } },
+  { "rrf", SystemZ::InsnRRF, 5,
+    { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg, MCK_AnyReg, MCK_U4Imm } },
+  { "rrs", SystemZ::InsnRRS, 5,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_U4Imm, MCK_BDAddr64Disp12 } },
+  { "rs", SystemZ::InsnRS, 4,
+    { MCK_U32Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp12 } },
+  { "rse", SystemZ::InsnRSE, 4,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp12 } },
+  { "rsi", SystemZ::InsnRSI, 4,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_PCRel16 } },
+  { "rsy", SystemZ::InsnRSY, 4,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDAddr64Disp20 } },
+  { "rx", SystemZ::InsnRX, 3,
+    { MCK_U32Imm, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+  { "rxe", SystemZ::InsnRXE, 3,
+    { MCK_U48Imm, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+  { "rxf", SystemZ::InsnRXF, 4,
+    { MCK_U48Imm, MCK_AnyReg, MCK_AnyReg, MCK_BDXAddr64Disp12 } },
+  { "rxy", SystemZ::InsnRXY, 3,
+    { MCK_U48Imm, MCK_AnyReg, MCK_BDXAddr64Disp20 } },
+  { "s", SystemZ::InsnS, 2,
+    { MCK_U32Imm, MCK_BDAddr64Disp12 } },
+  { "si", SystemZ::InsnSI, 3,
+    { MCK_U32Imm, MCK_BDAddr64Disp12, MCK_S8Imm } },
+  { "sil", SystemZ::InsnSIL, 3,
+    { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_U16Imm } },
+  { "siy", SystemZ::InsnSIY, 3,
+    { MCK_U48Imm, MCK_BDAddr64Disp20, MCK_U8Imm } },
+  { "ss", SystemZ::InsnSS, 4,
+    { MCK_U48Imm, MCK_BDXAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } },
+  { "sse", SystemZ::InsnSSE, 3,
+    { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12 } },
+  { "ssf", SystemZ::InsnSSF, 4,
+    { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } }
+};
+
+void SystemZOperand::print(raw_ostream &OS) const {
+  llvm_unreachable("Not implemented");
+}
+
+// Parse one register of the form %<prefix><number>.
+bool SystemZAsmParser::parseRegister(Register &Reg) {
+  Reg.StartLoc = Parser.getTok().getLoc();
+
+  // Eat the % prefix.
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return Error(Parser.getTok().getLoc(), "register expected");
+  Parser.Lex();
+
+  // Expect a register name.
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // Check that there's a prefix.
+  StringRef Name = Parser.getTok().getString();
+  if (Name.size() < 2)
+    return Error(Reg.StartLoc, "invalid register");
+  char Prefix = Name[0];
+
+  // Treat the rest of the register name as a register number.
+  if (Name.substr(1).getAsInteger(10, Reg.Num))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // Look for valid combinations of prefix and number.
+  if (Prefix == 'r' && Reg.Num < 16)
+    Reg.Group = RegGR;
+  else if (Prefix == 'f' && Reg.Num < 16)
+    Reg.Group = RegFP;
+  else if (Prefix == 'v' && Reg.Num < 32)
+    Reg.Group = RegV;
+  else if (Prefix == 'a' && Reg.Num < 16)
+    Reg.Group = RegAR;
+  else
+    return Error(Reg.StartLoc, "invalid register");
+
+  Reg.EndLoc = Parser.getTok().getLoc();
+  Parser.Lex();
+  return false;
+}
+
+// Parse a register of group Group.  If Regs is nonnull, use it to map
+// the raw register number to LLVM numbering, with zero entries
+// indicating an invalid register.  IsAddress says whether the
+// register appears in an address context. Allow FP Group if expecting
+// RegV Group, since the f-prefix yields the FP group even while used
+// with vector instructions.
+bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
+                                     const unsigned *Regs, bool IsAddress) {
+  if (parseRegister(Reg))
+    return true;
+  if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV))
+    return Error(Reg.StartLoc, "invalid operand for instruction");
+  if (Regs && Regs[Reg.Num] == 0)
+    return Error(Reg.StartLoc, "invalid register pair");
+  if (Reg.Num == 0 && IsAddress)
+    return Error(Reg.StartLoc, "%r0 used in an address");
+  if (Regs)
+    Reg.Num = Regs[Reg.Num];
+  return false;
+}
+
+// Parse a register and add it to Operands.  The other arguments are as above.
+OperandMatchResultTy
+SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group,
+                                const unsigned *Regs, RegisterKind Kind) {
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return MatchOperand_NoMatch;
+
+  Register Reg;
+  bool IsAddress = (Kind == ADDR32Reg || Kind == ADDR64Reg);
+  if (parseRegister(Reg, Group, Regs, IsAddress))
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
+                                               Reg.StartLoc, Reg.EndLoc));
+  return MatchOperand_Success;
+}
+
+// Parse any type of register (including integers) and add it to Operands.
+OperandMatchResultTy
+SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
+  // Handle integer values.
+  if (Parser.getTok().is(AsmToken::Integer)) {
+    const MCExpr *Register;
+    SMLoc StartLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(Register))
+      return MatchOperand_ParseFail;
+
+    if (auto *CE = dyn_cast<MCConstantExpr>(Register)) {
+      int64_t Value = CE->getValue();
+      if (Value < 0 || Value > 15) {
+        Error(StartLoc, "invalid register");
+        return MatchOperand_ParseFail;
+      }
+    }
+
+    SMLoc EndLoc =
+      SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    Operands.push_back(SystemZOperand::createImm(Register, StartLoc, EndLoc));
+  }
+  else {
+    Register Reg;
+    if (parseRegister(Reg))
+      return MatchOperand_ParseFail;
+
+    // Map to the correct register kind.
+    RegisterKind Kind;
+    unsigned RegNo;
+    if (Reg.Group == RegGR) {
+      Kind = GR64Reg;
+      RegNo = SystemZMC::GR64Regs[Reg.Num];
+    }
+    else if (Reg.Group == RegFP) {
+      Kind = FP64Reg;
+      RegNo = SystemZMC::FP64Regs[Reg.Num];
+    }
+    else if (Reg.Group == RegV) {
+      Kind = VR128Reg;
+      RegNo = SystemZMC::VR128Regs[Reg.Num];
+    }
+    else if (Reg.Group == RegAR) {
+      Kind = AR32Reg;
+      RegNo = SystemZMC::AR32Regs[Reg.Num];
+    }
+    else {
+      return MatchOperand_ParseFail;
+    }
+
+    Operands.push_back(SystemZOperand::createReg(Kind, RegNo,
+                                                 Reg.StartLoc, Reg.EndLoc));
+  }
+  return MatchOperand_Success;
+}
+
+// Parse a memory operand into Reg1, Reg2, Disp, and Length.
+bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1,
+                                    bool &HaveReg2, Register &Reg2,
+                                    const MCExpr *&Disp,
+                                    const MCExpr *&Length) {
+  // Parse the displacement, which must always be present.
+  if (getParser().parseExpression(Disp))
+    return true;
+
+  // Parse the optional base and index.
+  HaveReg1 = false;
+  HaveReg2 = false;
+  Length = nullptr;
+  if (getLexer().is(AsmToken::LParen)) {
+    Parser.Lex();
+
+    if (getLexer().is(AsmToken::Percent)) {
+      // Parse the first register.
+      HaveReg1 = true;
+      if (parseRegister(Reg1))
+        return true;
+    } else {
+      // Parse the length.
+      if (getParser().parseExpression(Length))
+        return true;
+    }
+
+    // Check whether there's a second register.
+    if (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();
+      HaveReg2 = true;
+      if (parseRegister(Reg2))
+        return true;
+    }
+
+    // Consume the closing bracket.
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "unexpected token in address");
+    Parser.Lex();
+  }
+  return false;
+}
+
+// Verify that Reg is a valid address register (base or index).
+bool
+SystemZAsmParser::parseAddressRegister(Register &Reg) {
+  if (Reg.Group == RegV) {
+    Error(Reg.StartLoc, "invalid use of vector addressing");
+    return true;
+  } else if (Reg.Group != RegGR) {
+    Error(Reg.StartLoc, "invalid address register");
+    return true;
+  } else if (Reg.Num == 0) {
+    Error(Reg.StartLoc, "%r0 used in an address");
+    return true;
+  }
+  return false;
+}
+
+// Parse a memory operand and add it to Operands.  The other arguments
+// are as above.
+OperandMatchResultTy
+SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
+                               const unsigned *Regs, RegisterKind RegKind) {
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  unsigned Base = 0, Index = 0, LengthReg = 0;
+  Register Reg1, Reg2;
+  bool HaveReg1, HaveReg2;
+  const MCExpr *Disp;
+  const MCExpr *Length;
+  if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Disp, Length))
+    return MatchOperand_ParseFail;
+
+  switch (MemKind) {
+  case BDMem:
+    // If we have Reg1, it must be an address register.
+    if (HaveReg1) {
+      if (parseAddressRegister(Reg1))
+        return MatchOperand_ParseFail;
+      Base = Regs[Reg1.Num];
+    }
+    // There must be no Reg2 or length.
+    if (Length) {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+    if (HaveReg2) {
+      Error(StartLoc, "invalid use of indexed addressing");
+      return MatchOperand_ParseFail;
+    }
+    break;
+  case BDXMem:
+    // If we have Reg1, it must be an address register.
+    if (HaveReg1) {
+      if (parseAddressRegister(Reg1))
+        return MatchOperand_ParseFail;
+      // If the are two registers, the first one is the index and the
+      // second is the base.
+      if (HaveReg2)
+        Index = Regs[Reg1.Num];
+      else
+        Base = Regs[Reg1.Num];
+    }
+    // If we have Reg2, it must be an address register.
+    if (HaveReg2) {
+      if (parseAddressRegister(Reg2))
+        return MatchOperand_ParseFail;
+      Base = Regs[Reg2.Num];
+    }
+    // There must be no length.
+    if (Length) {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+    break;
+  case BDLMem:
+    // If we have Reg2, it must be an address register.
+    if (HaveReg2) {
+      if (parseAddressRegister(Reg2))
+        return MatchOperand_ParseFail;
+      Base = Regs[Reg2.Num];
+    }
+    // We cannot support base+index addressing.
+    if (HaveReg1 && HaveReg2) {
+      Error(StartLoc, "invalid use of indexed addressing");
+      return MatchOperand_ParseFail;
+    }
+    // We must have a length.
+    if (!Length) {
+      Error(StartLoc, "missing length in address");
+      return MatchOperand_ParseFail;
+    }
+    break;
+  case BDRMem:
+    // We must have Reg1, and it must be a GPR.
+    if (!HaveReg1 || Reg1.Group != RegGR) {
+      Error(StartLoc, "invalid operand for instruction");
+      return MatchOperand_ParseFail;
+    }
+    LengthReg = SystemZMC::GR64Regs[Reg1.Num];
+    // If we have Reg2, it must be an address register.
+    if (HaveReg2) {
+      if (parseAddressRegister(Reg2))
+        return MatchOperand_ParseFail;
+      Base = Regs[Reg2.Num];
+    }
+    // There must be no length.
+    if (Length) {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+    break;
+  case BDVMem:
+    // We must have Reg1, and it must be a vector register.
+    if (!HaveReg1 || Reg1.Group != RegV) {
+      Error(StartLoc, "vector index required in address");
+      return MatchOperand_ParseFail;
+    }
+    Index = SystemZMC::VR128Regs[Reg1.Num];
+    // If we have Reg2, it must be an address register.
+    if (HaveReg2) {
+      if (parseAddressRegister(Reg2))
+        return MatchOperand_ParseFail;
+      Base = Regs[Reg2.Num];
+    }
+    // There must be no length.
+    if (Length) {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+    break;
+  }
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(SystemZOperand::createMem(MemKind, RegKind, Base, Disp,
+                                               Index, Length, LengthReg,
+                                               StartLoc, EndLoc));
+  return MatchOperand_Success;
+}
+
+bool SystemZAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+
+  if (IDVal == ".insn")
+    return ParseDirectiveInsn(DirectiveID.getLoc());
+
+  return true;
+}
+
+/// ParseDirectiveInsn
+/// ::= .insn [ format, encoding, (operands (, operands)*) ]
+bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
+  // Expect instruction format as identifier.
+  StringRef Format;
+  SMLoc ErrorLoc = Parser.getTok().getLoc();
+  if (Parser.parseIdentifier(Format))
+    return Error(ErrorLoc, "expected instruction format");
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> Operands;
+
+  // Find entry for this format in InsnMatchTable.
+  auto EntryRange =
+    std::equal_range(std::begin(InsnMatchTable), std::end(InsnMatchTable),
+                     Format, CompareInsn());
+
+  // If first == second, couldn't find a match in the table.
+  if (EntryRange.first == EntryRange.second)
+    return Error(ErrorLoc, "unrecognized format");
+
+  struct InsnMatchEntry *Entry = EntryRange.first;
+
+  // Format should match from equal_range.
+  assert(Entry->Format == Format);
+
+  // Parse the following operands using the table's information.
+  for (int i = 0; i < Entry->NumOperands; i++) {
+    MatchClassKind Kind = Entry->OperandKinds[i];
+
+    SMLoc StartLoc = Parser.getTok().getLoc();
+
+    // Always expect commas as separators for operands.
+    if (getLexer().isNot(AsmToken::Comma))
+      return Error(StartLoc, "unexpected token in directive");
+    Lex();
+
+    // Parse operands.
+    OperandMatchResultTy ResTy;
+    if (Kind == MCK_AnyReg)
+      ResTy = parseAnyReg(Operands);
+    else if (Kind == MCK_BDXAddr64Disp12 || Kind == MCK_BDXAddr64Disp20)
+      ResTy = parseBDXAddr64(Operands);
+    else if (Kind == MCK_BDAddr64Disp12 || Kind == MCK_BDAddr64Disp20)
+      ResTy = parseBDAddr64(Operands);
+    else if (Kind == MCK_PCRel32)
+      ResTy = parsePCRel32(Operands);
+    else if (Kind == MCK_PCRel16)
+      ResTy = parsePCRel16(Operands);
+    else {
+      // Only remaining operand kind is an immediate.
+      const MCExpr *Expr;
+      SMLoc StartLoc = Parser.getTok().getLoc();
+
+      // Expect immediate expression.
+      if (Parser.parseExpression(Expr))
+        return Error(StartLoc, "unexpected token in directive");
+
+      SMLoc EndLoc =
+        SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+      Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+      ResTy = MatchOperand_Success;
+    }
+
+    if (ResTy != MatchOperand_Success)
+      return true;
+  }
+
+  // Build the instruction with the parsed operands.
+  MCInst Inst = MCInstBuilder(Entry->Opcode);
+
+  for (size_t i = 0; i < Operands.size(); i++) {
+    MCParsedAsmOperand &Operand = *Operands[i];
+    MatchClassKind Kind = Entry->OperandKinds[i];
+
+    // Verify operand.
+    unsigned Res = validateOperandClass(Operand, Kind);
+    if (Res != Match_Success)
+      return Error(Operand.getStartLoc(), "unexpected operand type");
+
+    // Add operands to instruction.
+    SystemZOperand &ZOperand = static_cast<SystemZOperand &>(Operand);
+    if (ZOperand.isReg())
+      ZOperand.addRegOperands(Inst, 1);
+    else if (ZOperand.isMem(BDMem))
+      ZOperand.addBDAddrOperands(Inst, 2);
+    else if (ZOperand.isMem(BDXMem))
+      ZOperand.addBDXAddrOperands(Inst, 3);
+    else if (ZOperand.isImm())
+      ZOperand.addImmOperands(Inst, 1);
+    else
+      llvm_unreachable("unexpected operand type");
+  }
+
+  // Emit as a regular instruction.
+  Parser.getStreamer().EmitInstruction(Inst, getSTI());
+
+  return false;
+}
+
+bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                     SMLoc &EndLoc) {
+  Register Reg;
+  if (parseRegister(Reg))
+    return true;
+  if (Reg.Group == RegGR)
+    RegNo = SystemZMC::GR64Regs[Reg.Num];
+  else if (Reg.Group == RegFP)
+    RegNo = SystemZMC::FP64Regs[Reg.Num];
+  else if (Reg.Group == RegV)
+    RegNo = SystemZMC::VR128Regs[Reg.Num];
+  else if (Reg.Group == RegAR)
+    RegNo = SystemZMC::AR32Regs[Reg.Num];
+  StartLoc = Reg.StartLoc;
+  EndLoc = Reg.EndLoc;
+  return false;
+}
+
+bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name, SMLoc NameLoc,
+                                        OperandVector &Operands) {
+  Operands.push_back(SystemZOperand::createToken(Name, NameLoc));
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Name)) {
+      return true;
+    }
+
+    // Read any subsequent operands.
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();
+      if (parseOperand(Operands, Name)) {
+        return true;
+      }
+    }
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      SMLoc Loc = getLexer().getLoc();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+
+  // Consume the EndOfStatement.
+  Parser.Lex();
+  return false;
+}
+
+bool SystemZAsmParser::parseOperand(OperandVector &Operands,
+                                    StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.  Force all
+  // features to be available during the operand check, or else we will fail to
+  // find the custom parser, and then we will later get an InvalidOperand error
+  // instead of a MissingFeature errror.
+  uint64_t AvailableFeatures = getAvailableFeatures();
+  setAvailableFeatures(~(uint64_t)0);
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  setAvailableFeatures(AvailableFeatures);
+  if (ResTy == MatchOperand_Success)
+    return false;
+
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  // Check for a register.  All real register operands should have used
+  // a context-dependent parse routine, which gives the required register
+  // class.  The code is here to mop up other cases, like those where
+  // the instruction isn't recognized.
+  if (Parser.getTok().is(AsmToken::Percent)) {
+    Register Reg;
+    if (parseRegister(Reg))
+      return true;
+    Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc));
+    return false;
+  }
+
+  // The only other type of operand is an immediate or address.  As above,
+  // real address operands should have used a context-dependent parse routine,
+  // so we treat any plain expression as an immediate.
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  Register Reg1, Reg2;
+  bool HaveReg1, HaveReg2;
+  const MCExpr *Expr;
+  const MCExpr *Length;
+  if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Expr, Length))
+    return true;
+  // If the register combination is not valid for any instruction, reject it.
+  // Otherwise, fall back to reporting an unrecognized instruction.
+  if (HaveReg1 && Reg1.Group != RegGR && Reg1.Group != RegV
+      && parseAddressRegister(Reg1))
+    return true;
+  if (HaveReg2 && parseAddressRegister(Reg2))
+    return true;
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  if (HaveReg1 || HaveReg2 || Length)
+    Operands.push_back(SystemZOperand::createInvalid(StartLoc, EndLoc));
+  else
+    Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+  return false;
+}
+
+bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               uint64_t &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                     MatchingInlineAsm);
+  switch (MatchResult) {
+  case Match_Success:
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, getSTI());
+    return false;
+
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing
+    std::string Msg = "instruction requires:";
+    uint64_t Mask = 1;
+    for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((SystemZOperand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  }
+
+  llvm_unreachable("Unexpected match type");
+}
+
+OperandMatchResultTy
+SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
+                             int64_t MaxVal, bool AllowTLS) {
+  MCContext &Ctx = getContext();
+  MCStreamer &Out = getStreamer();
+  const MCExpr *Expr;
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_NoMatch;
+
+  // For consistency with the GNU assembler, treat immediates as offsets
+  // from ".".
+  if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
+    int64_t Value = CE->getValue();
+    if ((Value & 1) || Value < MinVal || Value > MaxVal) {
+      Error(StartLoc, "offset out of range");
+      return MatchOperand_ParseFail;
+    }
+    MCSymbol *Sym = Ctx.createTempSymbol();
+    Out.EmitLabel(Sym);
+    const MCExpr *Base = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
+                                                 Ctx);
+    Expr = Value == 0 ? Base : MCBinaryExpr::createAdd(Base, Expr, Ctx);
+  }
+
+  // Optionally match :tls_gdcall: or :tls_ldcall: followed by a TLS symbol.
+  const MCExpr *Sym = nullptr;
+  if (AllowTLS && getLexer().is(AsmToken::Colon)) {
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+
+    MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+    StringRef Name = Parser.getTok().getString();
+    if (Name == "tls_gdcall")
+      Kind = MCSymbolRefExpr::VK_TLSGD;
+    else if (Name == "tls_ldcall")
+      Kind = MCSymbolRefExpr::VK_TLSLDM;
+    else {
+      Error(Parser.getTok().getLoc(), "unknown TLS tag");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(), "unexpected token");
+      return MatchOperand_ParseFail;
+    }
+
+    StringRef Identifier = Parser.getTok().getString();
+    Sym = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(Identifier),
+                                  Kind, Ctx);
+    Parser.Lex();
+  }
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  if (AllowTLS)
+    Operands.push_back(SystemZOperand::createImmTLS(Expr, Sym,
+                                                    StartLoc, EndLoc));
+  else
+    Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmParser() {
+  RegisterMCAsmParser<SystemZAsmParser> X(getTheSystemZTarget());
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
new file mode 100644
index 000000000000..1806e015f61e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -0,0 +1,451 @@
+//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class SystemZDisassembler : public MCDisassembler {
+public:
+  SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+  ~SystemZDisassembler() override {}
+
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createSystemZDisassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  return new SystemZDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeSystemZDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheSystemZTarget(),
+                                         createSystemZDisassembler);
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value      - The immediate Value, has had any PC adjustment made by
+///                     the caller.
+/// @param isBranch   - If the instruction is a branch instruction
+/// @param Address    - The starting address of the instruction
+/// @param Offset     - The byte offset to this immediate in the instruction
+/// @param Width      - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width.  If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created.  This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
+}
+
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned *Regs, unsigned Size) {
+  assert(RegNo < Size && "Invalid register");
+  RegNo = Regs[RegNo];
+  if (RegNo == 0)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createReg(RegNo));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs, 16);
+}
+
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs, 16);
+}
+
+static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16);
+}
+
+static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs, 16);
+}
+
+static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16);
+}
+
+static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs, 16);
+}
+
+static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs, 16);
+}
+
+static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs, 16);
+}
+
+static DecodeStatus DecodeVR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::VR32Regs, 32);
+}
+
+static DecodeStatus DecodeVR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::VR64Regs, 32);
+}
+
+static DecodeStatus DecodeVR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::VR128Regs, 32);
+}
+
+static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16);
+}
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
+  if (!isUInt<N>(Imm))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) {
+  if (!isUInt<N>(Imm))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::createImm(SignExtend64<N>(Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeU1ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<1>(Inst, Imm);
+}
+
+static DecodeStatus decodeU2ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<2>(Inst, Imm);
+}
+
+static DecodeStatus decodeU3ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<3>(Inst, Imm);
+}
+
+static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<6>(Inst, Imm);
+}
+
+static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeU12ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<12>(Inst, Imm);
+}
+
+static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<32>(Inst, Imm);
+}
+
+static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<32>(Inst, Imm);
+}
+
+template<unsigned N>
+static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address,
+                                       bool isBranch,
+                                       const void *Decoder) {
+  assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
+  uint64_t Value = SignExtend64<N>(Imm) * 2 + Address;
+
+  if (!tryAddingSymbolicOperand(Value, isBranch, Address, 2, N / 8,
+                                Inst, Decoder))
+    Inst.addOperand(MCOperand::createImm(Value));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePC12DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodePCDBLOperand<12>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC16DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodePCDBLOperand<16>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC24DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodePCDBLOperand<24>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC32DBLBranchOperand(MCInst &Inst, uint64_t Imm,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodePCDBLOperand<32>(Inst, Imm, Address, true, Decoder);
+}
+
+static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return decodePCDBLOperand<32>(Inst, Imm, Address, false, Decoder);
+}
+
+static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 12;
+  uint64_t Disp = Field & 0xfff;
+  assert(Base < 16 && "Invalid BDAddr12");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr20Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 20;
+  uint64_t Disp = ((Field << 12) & 0xff000) | ((Field >> 8) & 0xfff);
+  assert(Base < 16 && "Invalid BDAddr20");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(SignExtend64<20>(Disp)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr12Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Index < 16 && "Invalid BDXAddr12");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 24;
+  uint64_t Base = (Field >> 20) & 0xf;
+  uint64_t Disp = ((Field & 0xfff00) >> 8) | ((Field & 0xff) << 12);
+  assert(Index < 16 && "Invalid BDXAddr20");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(SignExtend64<20>(Disp)));
+  Inst.addOperand(MCOperand::createReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
+                                               const unsigned *Regs) {
+  uint64_t Length = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Length < 256 && "Invalid BDLAddr12Len8");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createImm(Length + 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDRAddr12Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Length = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Length < 16 && "Invalid BDRAddr12");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createReg(Regs[Length]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDVAddr12Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Index < 32 && "Invalid BDVAddr12");
+  Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::createImm(Disp));
+  Inst.addOperand(MCOperand::createReg(SystemZMC::VR128Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
+                                                     uint64_t Field,
+                                                     uint64_t Address,
+                                                     const void *Decoder) {
+  return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDRAddr64Disp12Operand(MCInst &Inst,
+                                                 uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDRAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDVAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDVAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+#include "SystemZGenDisassemblerTables.inc"
+
+DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &OS,
+                                                 raw_ostream &CS) const {
+  // Get the first two bytes of the instruction.
+  Size = 0;
+  if (Bytes.size() < 2)
+    return MCDisassembler::Fail;
+
+  // The top 2 bits of the first byte specify the size.
+  const uint8_t *Table;
+  if (Bytes[0] < 0x40) {
+    Size = 2;
+    Table = DecoderTable16;
+  } else if (Bytes[0] < 0xc0) {
+    Size = 4;
+    Table = DecoderTable32;
+  } else {
+    Size = 6;
+    Table = DecoderTable48;
+  }
+
+  // Read any remaining bytes.
+  if (Bytes.size() < Size)
+    return MCDisassembler::Fail;
+
+  // Construct the instruction.
+  uint64_t Inst = 0;
+  for (uint64_t I = 0; I < Size; ++I)
+    Inst = (Inst << 8) | Bytes[I];
+
+  return decodeInstruction(Table, MI, Inst, Address, this, STI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
new file mode 100644
index 000000000000..1207c7b327e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -0,0 +1,231 @@
+//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "SystemZGenAsmWriter.inc"
+
+void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
+                                      unsigned Index, raw_ostream &O) {
+  O << Disp;
+  if (Base || Index) {
+    O << '(';
+    if (Index) {
+      O << '%' << getRegisterName(Index);
+      if (Base)
+        O << ',';
+    }
+    if (Base)
+      O << '%' << getRegisterName(Base);
+    O << ')';
+  }
+}
+
+void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+                                      raw_ostream &O) {
+  if (MO.isReg())
+    O << '%' << getRegisterName(MO.getReg());
+  else if (MO.isImm())
+    O << MO.getImm();
+  else if (MO.isExpr())
+    MO.getExpr()->print(O, MAI);
+  else
+    llvm_unreachable("Invalid operand");
+}
+
+void SystemZInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot,
+                                   const MCSubtargetInfo &STI) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  O << '%' << getRegisterName(RegNo);
+}
+
+template <unsigned N>
+static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isUInt<N>(Value) && "Invalid uimm argument");
+  O << Value;
+}
+
+template <unsigned N>
+static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) {
+  int64_t Value = MI->getOperand(OpNum).getImm();
+  assert(isInt<N>(Value) && "Invalid simm argument");
+  O << Value;
+}
+
+void SystemZInstPrinter::printU1ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<1>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU2ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<2>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU3ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<3>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<4>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<6>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printSImmOperand<8>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  printUImmOperand<8>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU12ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printUImmOperand<12>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printSImmOperand<16>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printUImmOperand<16>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printSImmOperand<32>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printUImmOperand<32>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printU48ImmOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printUImmOperand<48>(MI, OpNum, O);
+}
+
+void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "0x";
+    O.write_hex(MO.getImm());
+  } else
+    MO.getExpr()->print(O, &MAI);
+}
+
+void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
+                                              raw_ostream &O) {
+  // Output the PC-relative operand.
+  printPCRelOperand(MI, OpNum, O);
+
+  // Output the TLS marker if present.
+  if ((unsigned)OpNum + 1 < MI->getNumOperands()) {
+    const MCOperand &MO = MI->getOperand(OpNum + 1);
+    const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
+    switch (refExp.getKind()) {
+      case MCSymbolRefExpr::VK_TLSGD:
+        O << ":tls_gdcall:";
+        break;
+      case MCSymbolRefExpr::VK_TLSLDM:
+        O << ":tls_ldcall:";
+        break;
+      default:
+        llvm_unreachable("Unexpected symbol kind");
+    }
+    O << refExp.getSymbol().getName();
+  }
+}
+
+void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
+                                      raw_ostream &O) {
+  printOperand(MI->getOperand(OpNum), &MAI, O);
+}
+
+void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
+                                            raw_ostream &O) {
+  printAddress(MI->getOperand(OpNum).getReg(),
+               MI->getOperand(OpNum + 1).getImm(), 0, O);
+}
+
+void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  printAddress(MI->getOperand(OpNum).getReg(),
+               MI->getOperand(OpNum + 1).getImm(),
+               MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  unsigned Base = MI->getOperand(OpNum).getReg();
+  uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+  uint64_t Length = MI->getOperand(OpNum + 2).getImm();
+  O << Disp << '(' << Length;
+  if (Base)
+    O << ",%" << getRegisterName(Base);
+  O << ')';
+}
+
+void SystemZInstPrinter::printBDRAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  unsigned Base = MI->getOperand(OpNum).getReg();
+  uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+  unsigned Length = MI->getOperand(OpNum + 2).getReg();
+  O << Disp << "(%" << getRegisterName(Length);
+  if (Base)
+    O << ",%" << getRegisterName(Base);
+  O << ')';
+}
+
+void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  printAddress(MI->getOperand(OpNum).getReg(),
+               MI->getOperand(OpNum + 1).getImm(),
+               MI->getOperand(OpNum + 2).getReg(), O);
+}
+
+void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  static const char *const CondNames[] = {
+    "o", "h", "nle", "l", "nhe", "lh", "ne",
+    "e", "nlh", "he", "nl", "le", "nh", "no"
+  };
+  uint64_t Imm = MI->getOperand(OpNum).getImm();
+  assert(Imm > 0 && Imm < 15 && "Invalid condition");
+  O << CondNames[Imm - 1];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
new file mode 100644
index 000000000000..6336f5ee0efa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -0,0 +1,75 @@
+//==- SystemZInstPrinter.h - Convert SystemZ MCInst to assembly --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a SystemZ MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+class MCOperand;
+
+class SystemZInstPrinter : public MCInstPrinter {
+public:
+  SystemZInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Automatically generated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  // Print an address with the given base, displacement and index.
+  static void printAddress(unsigned Base, int64_t Disp, unsigned Index,
+                           raw_ostream &O);
+
+  // Print the given operand.
+  static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+                           raw_ostream &O);
+
+  // Override MCInstPrinter.
+  void printRegName(raw_ostream &O, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+private:
+  // Print various types of operand.
+  void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDRAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU12ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printU48ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+
+  // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
+  // This forms part of the instruction name rather than the operand list.
+  void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
new file mode 100644
index 000000000000..9192448afd04
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -0,0 +1,126 @@
+//===-- SystemZMCAsmBackend.cpp - SystemZ assembler backend ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectWriter.h"
+
+using namespace llvm;
+
+// Value is a fully-resolved relocation value: Symbol + Addend [- Pivot].
+// Return the bits that should be installed in a relocation field for
+// fixup kind Kind.
+static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
+  if (Kind < FirstTargetFixupKind)
+    return Value;
+
+  switch (unsigned(Kind)) {
+  case SystemZ::FK_390_PC12DBL:
+  case SystemZ::FK_390_PC16DBL:
+  case SystemZ::FK_390_PC24DBL:
+  case SystemZ::FK_390_PC32DBL:
+    return (int64_t)Value / 2;
+
+  case SystemZ::FK_390_TLS_CALL:
+    return 0;
+  }
+
+  llvm_unreachable("Unknown fixup kind!");
+}
+
+namespace {
+class SystemZMCAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+public:
+  SystemZMCAsmBackend(uint8_t osABI)
+    : OSABI(osABI) {}
+
+  // Override MCAsmBackend
+  unsigned getNumFixupKinds() const override {
+    return SystemZ::NumTargetFixupKinds;
+  }
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    return false;
+  }
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *Fragment,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {
+    llvm_unreachable("SystemZ does do not have assembler relaxation");
+  }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createSystemZObjectWriter(OS, OSABI);
+  }
+};
+} // end anonymous namespace
+
+const MCFixupKindInfo &
+SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = {
+    { "FK_390_PC12DBL",  4, 12, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PC16DBL",  0, 16, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PC24DBL",  0, 24, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_PC32DBL",  0, 32, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_390_TLS_CALL", 0, 0, 0 }
+  };
+
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value,
+                                     bool IsPCRel) const {
+  MCFixupKind Kind = Fixup.getKind();
+  unsigned Offset = Fixup.getOffset();
+  unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
+  unsigned Size = (BitSize + 7) / 8;
+
+  assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+
+  // Big-endian insertion of Size bytes.
+  Value = extractBitsForFixup(Kind, Value);
+  if (BitSize < 64)
+    Value &= ((uint64_t)1 << BitSize) - 1;
+  unsigned ShiftValue = (Size * 8) - 8;
+  for (unsigned I = 0; I != Size; ++I) {
+    Data[Offset + I] |= uint8_t(Value >> ShiftValue);
+    ShiftValue -= 8;
+  }
+}
+
+bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
+                                       MCObjectWriter *OW) const {
+  for (uint64_t I = 0; I != Count; ++I)
+    OW->write8(7);
+  return true;
+}
+
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              const Triple &TT, StringRef CPU,
+                                              const MCTargetOptions &Options) {
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
+  return new SystemZMCAsmBackend(OSABI);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
new file mode 100644
index 000000000000..b17977d41be1
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -0,0 +1,29 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
+  PointerSize = 8;
+  CalleeSaveStackSlotSize = 8;
+  IsLittleEndian = false;
+
+  CommentString = "#";
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = "\t.quad\t";
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
new file mode 100644
index 000000000000..800f89232063
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -0,0 +1,26 @@
+//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class Triple;
+
+class SystemZMCAsmInfo : public MCAsmInfoELF {
+public:
+  explicit SystemZMCAsmInfo(const Triple &TT);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
new file mode 100644
index 000000000000..7082abad716d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -0,0 +1,284 @@
+//===-- SystemZMCCodeEmitter.cpp - Convert SystemZ code to machine code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class SystemZMCCodeEmitter : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+
+public:
+  SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+    : MCII(mcii), Ctx(ctx) {
+  }
+
+  ~SystemZMCCodeEmitter() override {}
+
+  // OVerride MCCodeEmitter.
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+private:
+  // Automatically generated by TableGen.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  // Called by the TableGen code to get the binary encoding of operand
+  // MO in MI.  Fixups is the list of fixups against MI.
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  // Called by the TableGen code to get the binary encoding of an address.
+  // The index or length, if any, is encoded first, followed by the base,
+  // followed by the displacement.  In a 20-bit displacement,
+  // the low 12 bits are encoded before the high 8 bits.
+  uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
+  uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+  uint64_t getBDRAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint64_t getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
+  // Offset bytes from the start of MI.  Add the fixup to Fixups
+  // and return the in-place addend, which since we're a RELA target
+  // is always 0.  If AllowTLS is true and optional operand OpNum + 1
+  // is present, also emit a TLS call fixup for it.
+  uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            unsigned Kind, int64_t Offset,
+                            bool AllowTLS) const;
+
+  uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC16DBL, 2, false);
+  }
+  uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC32DBL, 2, false);
+  }
+  uint64_t getPC16DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC16DBL, 2, true);
+  }
+  uint64_t getPC32DBLTLSEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC32DBL, 2, true);
+  }
+  uint64_t getPC12DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC12DBL, 1, false);
+  }
+  uint64_t getPC16DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC16DBL, 4, false);
+  }
+  uint64_t getPC24DBLBPPEncoding(const MCInst &MI, unsigned OpNum,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const {
+    return getPCRelEncoding(MI, OpNum, Fixups,
+                            SystemZ::FK_390_PC24DBL, 3, false);
+  }
+
+private:
+  uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
+  void verifyInstructionPredicates(const MCInst &MI,
+                                   uint64_t AvailableFeatures) const;
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                MCContext &Ctx) {
+  return new SystemZMCCodeEmitter(MCII, Ctx);
+}
+
+void SystemZMCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  verifyInstructionPredicates(MI,
+                              computeAvailableFeatures(STI.getFeatureBits()));
+
+  uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+  unsigned Size = MCII.get(MI.getOpcode()).getSize();
+  // Big-endian insertion of Size bytes.
+  unsigned ShiftValue = (Size * 8) - 8;
+  for (unsigned I = 0; I != Size; ++I) {
+    OS << uint8_t(Bits >> ShiftValue);
+    ShiftValue -= 8;
+  }
+}
+
+uint64_t SystemZMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  if (MO.isImm())
+    return static_cast<uint64_t>(MO.getImm());
+  llvm_unreachable("Unexpected operand type!");
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp));
+  return (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups,
+                    const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  assert(isUInt<4>(Base) && isInt<20>(Disp));
+  return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
+  return (Index << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
+  return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
+    | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Len  = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
+  return (Len << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDRAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Len  = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+  return (Len << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<5>(Index));
+  return (Index << 16) | (Base << 12) | Disp;
+}
+
+uint64_t
+SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
+                                       SmallVectorImpl<MCFixup> &Fixups,
+                                       unsigned Kind, int64_t Offset,
+                                       bool AllowTLS) const {
+  const MCOperand &MO = MI.getOperand(OpNum);
+  const MCExpr *Expr;
+  if (MO.isImm())
+    Expr = MCConstantExpr::create(MO.getImm() + Offset, Ctx);
+  else {
+    Expr = MO.getExpr();
+    if (Offset) {
+      // The operand value is relative to the start of MI, but the fixup
+      // is relative to the operand field itself, which is Offset bytes
+      // into MI.  Add Offset to the relocation value to cancel out
+      // this difference.
+      const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
+    }
+  }
+  Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind));
+
+  // Output the fixup for the TLS marker if present.
+  if (AllowTLS && OpNum + 1 < MI.getNumOperands()) {
+    const MCOperand &MOTLS = MI.getOperand(OpNum + 1);
+    Fixups.push_back(MCFixup::create(0, MOTLS.getExpr(),
+                                     (MCFixupKind)SystemZ::FK_390_TLS_CALL));
+  }
+  return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "SystemZGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
new file mode 100644
index 000000000000..c012accc14dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
@@ -0,0 +1,32 @@
+//===-- SystemZMCFixups.h - SystemZ-specific fixup entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace SystemZ {
+enum FixupKind {
+  // These correspond directly to R_390_* relocations.
+  FK_390_PC12DBL = FirstTargetFixupKind,
+  FK_390_PC16DBL,
+  FK_390_PC24DBL,
+  FK_390_PC32DBL,
+  FK_390_TLS_CALL,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace SystemZ
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
new file mode 100644
index 000000000000..43a96e84289c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -0,0 +1,164 @@
+//===-- SystemZMCObjectWriter.cpp - SystemZ ELF writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "MCTargetDesc/SystemZMCFixups.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+namespace {
+class SystemZObjectWriter : public MCELFObjectTargetWriter {
+public:
+  SystemZObjectWriter(uint8_t OSABI);
+
+  ~SystemZObjectWriter() override;
+
+protected:
+  // Override MCELFObjectTargetWriter.
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
+                            /*HasRelocationAddend=*/ true) {}
+
+SystemZObjectWriter::~SystemZObjectWriter() {
+}
+
+// Return the relocation type for an absolute value of MCFixupKind Kind.
+static unsigned getAbsoluteReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_1: return ELF::R_390_8;
+  case FK_Data_2: return ELF::R_390_16;
+  case FK_Data_4: return ELF::R_390_32;
+  case FK_Data_8: return ELF::R_390_64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the relocation type for a PC-relative value of MCFixupKind Kind.
+static unsigned getPCRelReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_2:                return ELF::R_390_PC16;
+  case FK_Data_4:                return ELF::R_390_PC32;
+  case FK_Data_8:                return ELF::R_390_PC64;
+  case SystemZ::FK_390_PC12DBL:  return ELF::R_390_PC12DBL;
+  case SystemZ::FK_390_PC16DBL:  return ELF::R_390_PC16DBL;
+  case SystemZ::FK_390_PC24DBL:  return ELF::R_390_PC24DBL;
+  case SystemZ::FK_390_PC32DBL:  return ELF::R_390_PC32DBL;
+  }
+  llvm_unreachable("Unsupported PC-relative address");
+}
+
+// Return the R_390_TLS_LE* relocation type for MCFixupKind Kind.
+static unsigned getTLSLEReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LE32;
+  case FK_Data_8: return ELF::R_390_TLS_LE64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the R_390_TLS_LDO* relocation type for MCFixupKind Kind.
+static unsigned getTLSLDOReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LDO32;
+  case FK_Data_8: return ELF::R_390_TLS_LDO64;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the R_390_TLS_LDM* relocation type for MCFixupKind Kind.
+static unsigned getTLSLDMReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_LDM32;
+  case FK_Data_8: return ELF::R_390_TLS_LDM64;
+  case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_LDCALL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the R_390_TLS_GD* relocation type for MCFixupKind Kind.
+static unsigned getTLSGDReloc(unsigned Kind) {
+  switch (Kind) {
+  case FK_Data_4: return ELF::R_390_TLS_GD32;
+  case FK_Data_8: return ELF::R_390_TLS_GD64;
+  case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_GDCALL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+// Return the PLT relocation counterpart of MCFixupKind Kind.
+static unsigned getPLTReloc(unsigned Kind) {
+  switch (Kind) {
+  case SystemZ::FK_390_PC12DBL: return ELF::R_390_PLT12DBL;
+  case SystemZ::FK_390_PC16DBL: return ELF::R_390_PLT16DBL;
+  case SystemZ::FK_390_PC24DBL: return ELF::R_390_PLT24DBL;
+  case SystemZ::FK_390_PC32DBL: return ELF::R_390_PLT32DBL;
+  }
+  llvm_unreachable("Unsupported absolute address");
+}
+
+unsigned SystemZObjectWriter::getRelocType(MCContext &Ctx,
+                                           const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+  unsigned Kind = Fixup.getKind();
+  switch (Modifier) {
+  case MCSymbolRefExpr::VK_None:
+    if (IsPCRel)
+      return getPCRelReloc(Kind);
+    return getAbsoluteReloc(Kind);
+
+  case MCSymbolRefExpr::VK_NTPOFF:
+    assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
+    return getTLSLEReloc(Kind);
+
+  case MCSymbolRefExpr::VK_INDNTPOFF:
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_TLS_IEENT;
+    llvm_unreachable("Only PC-relative INDNTPOFF accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_DTPOFF:
+    assert(!IsPCRel && "DTPOFF shouldn't be PC-relative");
+    return getTLSLDOReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSLDM:
+    assert(!IsPCRel && "TLSLDM shouldn't be PC-relative");
+    return getTLSLDMReloc(Kind);
+
+  case MCSymbolRefExpr::VK_TLSGD:
+    assert(!IsPCRel && "TLSGD shouldn't be PC-relative");
+    return getTLSGDReloc(Kind);
+
+  case MCSymbolRefExpr::VK_GOT:
+    if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
+      return ELF::R_390_GOTENT;
+    llvm_unreachable("Only PC-relative GOT accesses are supported for now");
+
+  case MCSymbolRefExpr::VK_PLT:
+    assert(IsPCRel && "@PLT shouldt be PC-relative");
+    return getPLTReloc(Kind);
+
+  default:
+    llvm_unreachable("Modifier not supported");
+  }
+}
+
+MCObjectWriter *llvm::createSystemZObjectWriter(raw_pwrite_stream &OS,
+                                                uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new SystemZObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
new file mode 100644
index 000000000000..dfea7e33fa15
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -0,0 +1,246 @@
+//===-- SystemZMCTargetDesc.cpp - SystemZ target descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCTargetDesc.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SystemZGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+const unsigned SystemZMC::GR32Regs[16] = {
+  SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+  SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+  SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+  SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+const unsigned SystemZMC::GRH32Regs[16] = {
+  SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+  SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+  SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+  SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
+};
+
+const unsigned SystemZMC::GR64Regs[16] = {
+  SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+  SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+  SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+  SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
+};
+
+const unsigned SystemZMC::GR128Regs[16] = {
+  SystemZ::R0Q, 0, SystemZ::R2Q, 0,
+  SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+  SystemZ::R8Q, 0, SystemZ::R10Q, 0,
+  SystemZ::R12Q, 0, SystemZ::R14Q, 0
+};
+
+const unsigned SystemZMC::FP32Regs[16] = {
+  SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+  SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+  SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+  SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
+};
+
+const unsigned SystemZMC::FP64Regs[16] = {
+  SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+  SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+  SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+  SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
+};
+
+const unsigned SystemZMC::FP128Regs[16] = {
+  SystemZ::F0Q, SystemZ::F1Q, 0, 0,
+  SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+  SystemZ::F8Q, SystemZ::F9Q, 0, 0,
+  SystemZ::F12Q, SystemZ::F13Q, 0, 0
+};
+
+const unsigned SystemZMC::VR32Regs[32] = {
+  SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+  SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+  SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+  SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S,
+  SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S,
+  SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S,
+  SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S,
+  SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S
+};
+
+const unsigned SystemZMC::VR64Regs[32] = {
+  SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+  SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+  SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+  SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D,
+  SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D,
+  SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D,
+  SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D,
+  SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D
+};
+
+const unsigned SystemZMC::VR128Regs[32] = {
+  SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3,
+  SystemZ::V4, SystemZ::V5, SystemZ::V6, SystemZ::V7,
+  SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11,
+  SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15,
+  SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19,
+  SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23,
+  SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
+  SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31
+};
+
+const unsigned SystemZMC::AR32Regs[16] = {
+  SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3,
+  SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7,
+  SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11,
+  SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15
+};
+
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+  static unsigned Map[SystemZ::NUM_TARGET_REGS];
+  static bool Initialized = false;
+  if (!Initialized) {
+    for (unsigned I = 0; I < 16; ++I) {
+      Map[GR32Regs[I]] = I;
+      Map[GRH32Regs[I]] = I;
+      Map[GR64Regs[I]] = I;
+      Map[GR128Regs[I]] = I;
+      Map[FP128Regs[I]] = I;
+      Map[AR32Regs[I]] = I;
+    }
+    for (unsigned I = 0; I < 32; ++I) {
+      Map[VR32Regs[I]] = I;
+      Map[VR64Regs[I]] = I;
+      Map[VR128Regs[I]] = I;
+    }
+  }
+  assert(Reg < SystemZ::NUM_TARGET_REGS);
+  return Map[Reg];
+}
+
+static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
+                                         const Triple &TT) {
+  MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr,
+                                     MRI.getDwarfRegNum(SystemZ::R15D, true),
+                                     SystemZMC::CFAOffsetFromInitialSP);
+  MAI->addInitialFrameState(Inst);
+  return MAI;
+}
+
+static MCInstrInfo *createSystemZMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSystemZMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSystemZMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSystemZMCRegisterInfo(X, SystemZ::R14D);
+  return X;
+}
+
+static MCSubtargetInfo *
+createSystemZMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createSystemZMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  // For SystemZ we define the models as follows:
+  //
+  // Small:  BRASL can call any function and will use a stub if necessary.
+  //         Locally-binding symbols will always be in range of LARL.
+  //
+  // Medium: BRASL can call any function and will use a stub if necessary.
+  //         GOT slots and locally-defined text will always be in range
+  //         of LARL, but other symbols might not be.
+  //
+  // Large:  Equivalent to Medium for now.
+  //
+  // Kernel: Equivalent to Medium for now.
+  //
+  // This means that any PIC module smaller than 4GB meets the
+  // requirements of Small, so Small seems like the best default there.
+  //
+  // All symbols bind locally in a non-PIC module, so the choice is less
+  // obvious.  There are two cases:
+  //
+  // - When creating an executable, PLTs and copy relocations allow
+  //   us to treat external symbols as part of the executable.
+  //   Any executable smaller than 4GB meets the requirements of Small,
+  //   so that seems like the best default.
+  //
+  // - When creating JIT code, stubs will be in range of BRASL if the
+  //   image is less than 4GB in size.  GOT entries will likewise be
+  //   in range of LARL.  However, the JIT environment has no equivalent
+  //   of copy relocs, so locally-binding data symbols might not be in
+  //   the range of LARL.  We need the Medium model in that case.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    CM = RM == Reloc::PIC_ ? CodeModel::Small : CodeModel::Medium;
+}
+
+static MCInstPrinter *createSystemZMCInstPrinter(const Triple &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI) {
+  return new SystemZInstPrinter(MAI, MII, MRI);
+}
+
+extern "C" void LLVMInitializeSystemZTargetMC() {
+  // Register the MCAsmInfo.
+  TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(),
+                                    createSystemZMCAsmInfo);
+
+  // Register the adjustCodeGenOpts.
+  TargetRegistry::registerMCAdjustCodeGenOpts(getTheSystemZTarget(),
+                                              adjustCodeGenOpts);
+
+  // Register the MCCodeEmitter.
+  TargetRegistry::RegisterMCCodeEmitter(getTheSystemZTarget(),
+                                        createSystemZMCCodeEmitter);
+
+  // Register the MCInstrInfo.
+  TargetRegistry::RegisterMCInstrInfo(getTheSystemZTarget(),
+                                      createSystemZMCInstrInfo);
+
+  // Register the MCRegisterInfo.
+  TargetRegistry::RegisterMCRegInfo(getTheSystemZTarget(),
+                                    createSystemZMCRegisterInfo);
+
+  // Register the MCSubtargetInfo.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheSystemZTarget(),
+                                          createSystemZMCSubtargetInfo);
+
+  // Register the MCAsmBackend.
+  TargetRegistry::RegisterMCAsmBackend(getTheSystemZTarget(),
+                                       createSystemZMCAsmBackend);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(getTheSystemZTarget(),
+                                        createSystemZMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
new file mode 100644
index 000000000000..d9926c7e4986
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -0,0 +1,108 @@
+//===-- SystemZMCTargetDesc.h - SystemZ target descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCTargetOptions;
+class StringRef;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+class raw_ostream;
+
+Target &getTheSystemZTarget();
+
+namespace SystemZMC {
+// How many bytes are in the ABI-defined, caller-allocated part of
+// a stack frame.
+const int64_t CallFrameSize = 160;
+
+// The offset of the DWARF CFA from the incoming stack pointer.
+const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+// Maps of asm register numbers to LLVM register numbers, with 0 indicating
+// an invalid register.  In principle we could use 32-bit and 64-bit register
+// classes directly, provided that we relegated the GPR allocation order
+// in SystemZRegisterInfo.td to an AltOrder and left the default order
+// as %r0-%r15.  It seems better to provide the same interface for
+// all classes though.
+extern const unsigned GR32Regs[16];
+extern const unsigned GRH32Regs[16];
+extern const unsigned GR64Regs[16];
+extern const unsigned GR128Regs[16];
+extern const unsigned FP32Regs[16];
+extern const unsigned FP64Regs[16];
+extern const unsigned FP128Regs[16];
+extern const unsigned VR32Regs[32];
+extern const unsigned VR64Regs[32];
+extern const unsigned VR128Regs[32];
+extern const unsigned AR32Regs[16];
+
+// Return the 0-based number of the first architectural register that
+// contains the given LLVM register.   E.g. R1D -> 1.
+unsigned getFirstReg(unsigned Reg);
+
+// Return the given register as a GR64.
+inline unsigned getRegAsGR64(unsigned Reg) {
+  return GR64Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a low GR32.
+inline unsigned getRegAsGR32(unsigned Reg) {
+  return GR32Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a high GR32.
+inline unsigned getRegAsGRH32(unsigned Reg) {
+  return GRH32Regs[getFirstReg(Reg)];
+}
+
+// Return the given register as a VR128.
+inline unsigned getRegAsVR128(unsigned Reg) {
+  return VR128Regs[getFirstReg(Reg)];
+}
+} // end namespace SystemZMC
+
+MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+                                          const MCRegisterInfo &MRI,
+                                          MCContext &Ctx);
+
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        const Triple &TT, StringRef CPU,
+                                        const MCTargetOptions &Options);
+
+MCObjectWriter *createSystemZObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "SystemZGenRegisterInfo.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#define GET_INSTRINFO_ENUM
+#include "SystemZGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SystemZGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt
new file mode 100644
index 000000000000..86a1322c9e23
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/README.txt
@@ -0,0 +1,154 @@
+//===---------------------------------------------------------------------===//
+// Random notes about and ideas for the SystemZ backend.
+//===---------------------------------------------------------------------===//
+
+The initial backend is deliberately restricted to z10.  We should add support
+for later architectures at some point.
+
+--
+
+If an inline asm ties an i32 "r" result to an i64 input, the input
+will be treated as an i32, leaving the upper bits uninitialised.
+For example:
+
+define void @f4(i32 *%dst) {
+  %val = call i32 asm "blah $0", "=r,0" (i64 103)
+  store i32 %val, i32 *%dst
+  ret void
+}
+
+from CodeGen/SystemZ/asm-09.ll will use LHI rather than LGHI.
+to load 103.  This seems to be a general target-independent problem.
+
+--
+
+The tuning of the choice between LOAD ADDRESS (LA) and addition in
+SystemZISelDAGToDAG.cpp is suspect.  It should be tweaked based on
+performance measurements.
+
+--
+
+There is no scheduling support.
+
+--
+
+We don't use the BRANCH ON INDEX instructions.
+
+--
+
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
+
+MVCIN, MVCLE and CLCLE may be worthwhile too.
+
+--
+
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations.  The TRANSLATE ones are probably more difficult to exploit.
+
+--
+
+We don't take full advantage of builtins like fabsl because the calling
+conventions require f128s to be returned by invisible reference.
+
+--
+
+ADD LOGICAL WITH SIGNED IMMEDIATE could be useful when we need to
+produce a carry.  SUBTRACT LOGICAL IMMEDIATE could be useful when we
+need to produce a borrow.  (Note that there are no memory forms of
+ADD LOGICAL WITH CARRY and SUBTRACT LOGICAL WITH BORROW, so the high
+part of 128-bit memory operations would probably need to be done
+via a register.)
+
+--
+
+We don't use ICM or STCM.
+
+--
+
+DAGCombiner doesn't yet fold truncations of extended loads.  Functions like:
+
+    unsigned long f (unsigned long x, unsigned short *y)
+    {
+      return (x << 32) | *y;
+    }
+
+therefore end up as:
+
+        sllg    %r2, %r2, 32
+        llgh    %r0, 0(%r3)
+        lr      %r2, %r0
+        br      %r14
+
+but truncating the load would give:
+
+        sllg    %r2, %r2, 32
+        lh      %r2, 0(%r3)
+        br      %r14
+
+--
+
+Functions like:
+
+define i64 @f1(i64 %a) {
+  %and = and i64 %a, 1
+  ret i64 %and
+}
+
+ought to be implemented as:
+
+        lhi     %r0, 1
+        ngr     %r2, %r0
+        br      %r14
+
+but two-address optimizations reverse the order of the AND and force:
+
+        lhi     %r0, 1
+        ngr     %r0, %r2
+        lgr     %r2, %r0
+        br      %r14
+
+CodeGen/SystemZ/and-04.ll has several examples of this.
+
+--
+
+Out-of-range displacements are usually handled by loading the full
+address into a register.  In many cases it would be better to create
+an anchor point instead.  E.g. for:
+
+define void @f4a(i128 *%aptr, i64 %base) {
+  %addr = add i64 %base, 524288
+  %bptr = inttoptr i64 %addr to i128 *
+  %a = load volatile i128 *%aptr
+  %b = load i128 *%bptr
+  %add = add i128 %a, %b
+  store i128 %add, i128 *%aptr
+  ret void
+}
+
+(from CodeGen/SystemZ/int-add-08.ll) we load %base+524288 and %base+524296
+into separate registers, rather than using %base+524288 as a base for both.
+
+--
+
+Dynamic stack allocations round the size to 8 bytes and then allocate
+that rounded amount.  It would be simpler to subtract the unrounded
+size from the copy of the stack pointer and then align the result.
+See CodeGen/SystemZ/alloca-01.ll for an example.
+
+--
+
+If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
+
+--
+
+We might want to model all access registers and use them to spill
+32-bit values.
+
+--
+
+We might want to use the 'overflow' condition of eg. AR to support
+llvm.sadd.with.overflow.i32 and related instructions - the generated code
+for signed overflow check is currently quite bad.  This would improve
+the results of using -ftrapv.
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.h b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 000000000000..9a8e508e4119
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,185 @@
+//==- SystemZ.h - Top-Level Interface for SystemZ representation -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
+
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class FunctionPass;
+
+namespace SystemZ {
+// Condition-code mask values.
+const unsigned CCMASK_0 = 1 << 3;
+const unsigned CCMASK_1 = 1 << 2;
+const unsigned CCMASK_2 = 1 << 1;
+const unsigned CCMASK_3 = 1 << 0;
+const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for integer and floating-point
+// comparisons.
+const unsigned CCMASK_CMP_EQ = CCMASK_0;
+const unsigned CCMASK_CMP_LT = CCMASK_1;
+const unsigned CCMASK_CMP_GT = CCMASK_2;
+const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
+const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
+const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+
+// Condition-code mask assignments for floating-point comparisons only.
+const unsigned CCMASK_CMP_UO = CCMASK_3;
+const unsigned CCMASK_CMP_O  = CCMASK_ANY ^ CCMASK_CMP_UO;
+
+// All condition-code values produced by comparisons.
+const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
+const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+// Condition-code mask assignments for CS.
+const unsigned CCMASK_CS_EQ = CCMASK_0;
+const unsigned CCMASK_CS_NE = CCMASK_1;
+const unsigned CCMASK_CS    = CCMASK_0 | CCMASK_1;
+
+// Condition-code mask assignments for a completed SRST loop.
+const unsigned CCMASK_SRST_FOUND    = CCMASK_1;
+const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+const unsigned CCMASK_SRST          = CCMASK_1 | CCMASK_2;
+
+// Condition-code mask assignments for TEST UNDER MASK.
+const unsigned CCMASK_TM_ALL_0       = CCMASK_0;
+const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+const unsigned CCMASK_TM_ALL_1       = CCMASK_3;
+const unsigned CCMASK_TM_SOME_0      = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_SOME_1      = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+const unsigned CCMASK_TM_MSB_0       = CCMASK_0 | CCMASK_1;
+const unsigned CCMASK_TM_MSB_1       = CCMASK_2 | CCMASK_3;
+const unsigned CCMASK_TM             = CCMASK_ANY;
+
+// Condition-code mask assignments for TRANSACTION_BEGIN.
+const unsigned CCMASK_TBEGIN_STARTED       = CCMASK_0;
+const unsigned CCMASK_TBEGIN_INDETERMINATE = CCMASK_1;
+const unsigned CCMASK_TBEGIN_TRANSIENT     = CCMASK_2;
+const unsigned CCMASK_TBEGIN_PERSISTENT    = CCMASK_3;
+const unsigned CCMASK_TBEGIN               = CCMASK_ANY;
+
+// Condition-code mask assignments for TRANSACTION_END.
+const unsigned CCMASK_TEND_TX   = CCMASK_0;
+const unsigned CCMASK_TEND_NOTX = CCMASK_2;
+const unsigned CCMASK_TEND      = CCMASK_TEND_TX | CCMASK_TEND_NOTX;
+
+// Condition-code mask assignments for vector comparisons (and similar
+// operations).
+const unsigned CCMASK_VCMP_ALL       = CCMASK_0;
+const unsigned CCMASK_VCMP_MIXED     = CCMASK_1;
+const unsigned CCMASK_VCMP_NONE      = CCMASK_3;
+const unsigned CCMASK_VCMP           = CCMASK_0 | CCMASK_1 | CCMASK_3;
+
+// Condition-code mask assignments for Test Data Class.
+const unsigned CCMASK_TDC_NOMATCH   = CCMASK_0;
+const unsigned CCMASK_TDC_MATCH     = CCMASK_1;
+const unsigned CCMASK_TDC           = CCMASK_TDC_NOMATCH | CCMASK_TDC_MATCH;
+
+// The position of the low CC bit in an IPM result.
+const unsigned IPM_CC = 28;
+
+// Mask assignments for PFD.
+const unsigned PFD_READ  = 1;
+const unsigned PFD_WRITE = 2;
+
+// Mask assignments for TDC
+const unsigned TDCMASK_ZERO_PLUS       = 0x800;
+const unsigned TDCMASK_ZERO_MINUS      = 0x400;
+const unsigned TDCMASK_NORMAL_PLUS     = 0x200;
+const unsigned TDCMASK_NORMAL_MINUS    = 0x100;
+const unsigned TDCMASK_SUBNORMAL_PLUS  = 0x080;
+const unsigned TDCMASK_SUBNORMAL_MINUS = 0x040;
+const unsigned TDCMASK_INFINITY_PLUS   = 0x020;
+const unsigned TDCMASK_INFINITY_MINUS  = 0x010;
+const unsigned TDCMASK_QNAN_PLUS       = 0x008;
+const unsigned TDCMASK_QNAN_MINUS      = 0x004;
+const unsigned TDCMASK_SNAN_PLUS       = 0x002;
+const unsigned TDCMASK_SNAN_MINUS      = 0x001;
+
+const unsigned TDCMASK_ZERO            = TDCMASK_ZERO_PLUS | TDCMASK_ZERO_MINUS;
+const unsigned TDCMASK_POSITIVE        = TDCMASK_NORMAL_PLUS |
+                                         TDCMASK_SUBNORMAL_PLUS |
+                                         TDCMASK_INFINITY_PLUS;
+const unsigned TDCMASK_NEGATIVE        = TDCMASK_NORMAL_MINUS |
+                                         TDCMASK_SUBNORMAL_MINUS |
+                                         TDCMASK_INFINITY_MINUS;
+const unsigned TDCMASK_NAN             = TDCMASK_QNAN_PLUS |
+                                         TDCMASK_QNAN_MINUS |
+                                         TDCMASK_SNAN_PLUS |
+                                         TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_PLUS            = TDCMASK_POSITIVE |
+                                         TDCMASK_ZERO_PLUS |
+                                         TDCMASK_QNAN_PLUS |
+                                         TDCMASK_SNAN_PLUS;
+const unsigned TDCMASK_MINUS           = TDCMASK_NEGATIVE |
+                                         TDCMASK_ZERO_MINUS |
+                                         TDCMASK_QNAN_MINUS |
+                                         TDCMASK_SNAN_MINUS;
+const unsigned TDCMASK_ALL             = TDCMASK_PLUS | TDCMASK_MINUS;
+
+// Number of bits in a vector register.
+const unsigned VectorBits = 128;
+
+// Number of bytes in a vector register (and consequently the number of
+// bytes in a general permute vector).
+const unsigned VectorBytes = VectorBits / 8;
+
+// Return true if Val fits an LLILL operand.
+static inline bool isImmLL(uint64_t Val) {
+  return (Val & ~0x000000000000ffffULL) == 0;
+}
+
+// Return true if Val fits an LLILH operand.
+static inline bool isImmLH(uint64_t Val) {
+  return (Val & ~0x00000000ffff0000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHL operand.
+static inline bool isImmHL(uint64_t Val) {
+  return (Val & ~0x00000ffff00000000ULL) == 0;
+}
+
+// Return true if Val fits an LLIHH operand.
+static inline bool isImmHH(uint64_t Val) {
+  return (Val & ~0xffff000000000000ULL) == 0;
+}
+
+// Return true if Val fits an LLILF operand.
+static inline bool isImmLF(uint64_t Val) {
+  return (Val & ~0x00000000ffffffffULL) == 0;
+}
+
+// Return true if Val fits an LLIHF operand.
+static inline bool isImmHF(uint64_t Val) {
+  return (Val & ~0xffffffff00000000ULL) == 0;
+}
+} // end namespace SystemZ
+
+FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZExpandPseudoPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZTDCPass();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.td b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 000000000000..6bdfd4d07edc
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,75 @@
+//===-- SystemZ.td - Describe the SystemZ target machine -----*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ subtarget features
+//===----------------------------------------------------------------------===//
+
+include "SystemZFeatures.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ subtarget scheduling models
+//===----------------------------------------------------------------------===//
+
+include "SystemZSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors
+//===----------------------------------------------------------------------===//
+
+include "SystemZProcessors.td"
+
+//===----------------------------------------------------------------------===//
+// Register file description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling convention description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZOperators.td"
+include "SystemZOperands.td"
+include "SystemZPatterns.td"
+include "SystemZInstrFormats.td"
+include "SystemZInstrInfo.td"
+include "SystemZInstrVector.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Assembly parser
+//===----------------------------------------------------------------------===//
+
+def SystemZAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Top-level target declaration
+//===----------------------------------------------------------------------===//
+
+def SystemZ : Target {
+  let InstructionSet = SystemZInstrInfo;
+  let AssemblyParsers = [SystemZAsmParser];
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
new file mode 100644
index 000000000000..b39245b20b3c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -0,0 +1,527 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly printer -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Streams SystemZ assembly language and associated data, in the form of
+// MCInsts and MCExprs respectively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZAsmPrinter.h"
+#include "InstPrinter/SystemZInstPrinter.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMCInstLower.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// R2 register turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+    .addImm(MI->getOperand(3).getImm())
+    .addImm(MI->getOperand(4).getImm())
+    .addImm(MI->getOperand(5).getImm());
+}
+
+static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
+  StringRef Name = "__tls_get_offset";
+  return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_PLT,
+                                 Context);
+}
+
+static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
+  StringRef Name = "_GLOBAL_OFFSET_TABLE_";
+  return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
+                                 MCSymbolRefExpr::VK_None,
+                                 Context);
+}
+
+// MI loads the high part of a vector from memory.  Return an instruction
+// that uses replicating vector load Opcode to do the same thing.
+static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg());
+}
+
+// MI stores the high part of a vector to memory.  Return an instruction
+// that uses elemental vector store Opcode to do the same thing.
+static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(MI->getOperand(2).getImm())
+    .addReg(MI->getOperand(3).getReg())
+    .addImm(0);
+}
+
+void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SystemZMCInstLower Lower(MF->getContext(), *this);
+  MCInst LoweredMI;
+  switch (MI->getOpcode()) {
+  case SystemZ::Return:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+    break;
+
+  case SystemZ::CondReturn:
+    LoweredMI = MCInstBuilder(SystemZ::BCR)
+      .addImm(MI->getOperand(0).getImm())
+      .addImm(MI->getOperand(1).getImm())
+      .addReg(SystemZ::R14D);
+    break;
+
+  case SystemZ::CRBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CGRBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CGRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CIBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CGIBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CGIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLRBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CLRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLGRBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLIBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CLIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLGIBReturn:
+    LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R14D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CallBRASL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBASR:
+    LoweredMI = MCInstBuilder(SystemZ::BASR)
+      .addReg(SystemZ::R14D)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+
+  case SystemZ::CallJG:
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBRCL:
+    LoweredMI = MCInstBuilder(SystemZ::BRCL)
+      .addImm(MI->getOperand(0).getImm())
+      .addImm(MI->getOperand(1).getImm())
+      .addExpr(Lower.getExpr(MI->getOperand(2), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBR:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+    break;
+
+  case SystemZ::CallBCR:
+    LoweredMI = MCInstBuilder(SystemZ::BCR)
+      .addImm(MI->getOperand(0).getImm())
+      .addImm(MI->getOperand(1).getImm())
+      .addReg(SystemZ::R1D);
+    break;
+
+  case SystemZ::CRBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CGRBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CGRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CIBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CGIBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CGIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLRBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CLRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLGRBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CLGRB)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLIBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CLIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::CLGIBCall:
+    LoweredMI = MCInstBuilder(SystemZ::CLGIB)
+      .addReg(MI->getOperand(0).getReg())
+      .addImm(MI->getOperand(1).getImm())
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(SystemZ::R1D)
+      .addImm(0);
+    break;
+
+  case SystemZ::TLS_GDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSGD));
+    break;
+
+  case SystemZ::TLS_LDCALL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(getTLSGetOffset(MF->getContext()))
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSLDM));
+    break;
+
+  case SystemZ::GOT:
+    LoweredMI = MCInstBuilder(SystemZ::LARL)
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(getGlobalOffsetTable(MF->getContext()));
+    break;
+
+  case SystemZ::IILF64:
+    LoweredMI = MCInstBuilder(SystemZ::IILF)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::IIHF64:
+    LoweredMI = MCInstBuilder(SystemZ::IIHF)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::RISBHH:
+  case SystemZ::RISBHL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+    break;
+
+  case SystemZ::RISBLH:
+  case SystemZ::RISBLL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+    break;
+
+  case SystemZ::VLVGP32:
+    LoweredMI = MCInstBuilder(SystemZ::VLVGP)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(1).getReg()))
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()));
+    break;
+
+  case SystemZ::VLR32:
+  case SystemZ::VLR64:
+    LoweredMI = MCInstBuilder(SystemZ::VLR)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()));
+    break;
+
+  case SystemZ::VL32:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF);
+    break;
+
+  case SystemZ::VL64:
+    LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG);
+    break;
+
+  case SystemZ::VST32:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF);
+    break;
+
+  case SystemZ::VST64:
+    LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG);
+    break;
+
+  case SystemZ::LFER:
+    LoweredMI = MCInstBuilder(SystemZ::VLGVF)
+      .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg()))
+      .addReg(0).addImm(0);
+    break;
+
+  case SystemZ::LEFR:
+    LoweredMI = MCInstBuilder(SystemZ::VLVGF)
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg()))
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(0).addImm(0);
+    break;
+
+#define LOWER_LOW(NAME)                                                 \
+  case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+  LOWER_LOW(IILL);
+  LOWER_LOW(IILH);
+  LOWER_LOW(TMLL);
+  LOWER_LOW(TMLH);
+  LOWER_LOW(NILL);
+  LOWER_LOW(NILH);
+  LOWER_LOW(NILF);
+  LOWER_LOW(OILL);
+  LOWER_LOW(OILH);
+  LOWER_LOW(OILF);
+  LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+  case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+  LOWER_HIGH(IIHL);
+  LOWER_HIGH(IIHH);
+  LOWER_HIGH(TMHL);
+  LOWER_HIGH(TMHH);
+  LOWER_HIGH(NIHL);
+  LOWER_HIGH(NIHH);
+  LOWER_HIGH(NIHF);
+  LOWER_HIGH(OIHL);
+  LOWER_HIGH(OIHH);
+  LOWER_HIGH(OIHF);
+  LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+  case SystemZ::Serialize:
+    if (MF->getSubtarget<SystemZSubtarget>().hasFastSerialization())
+      LoweredMI = MCInstBuilder(SystemZ::BCRAsm)
+        .addImm(14).addReg(SystemZ::R0D);
+    else
+      LoweredMI = MCInstBuilder(SystemZ::BCRAsm)
+        .addImm(15).addReg(SystemZ::R0D);
+    break;
+
+  // Emit nothing here but a comment if we can.
+  case SystemZ::MemBarrier:
+    OutStreamer->emitRawComment("MEMBARRIER");
+    return;
+
+  // We want to emit "j .+2" for traps, jumping to the relative immediate field
+  // of the jump instruction, which is an illegal instruction. We cannot emit a
+  // "." symbol, so create and emit a temp label before the instruction and use
+  // that instead.
+  case SystemZ::Trap: {
+    MCSymbol *DotSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(DotSym);
+
+    const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+    const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+    LoweredMI = MCInstBuilder(SystemZ::J)
+      .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+    }
+    break;
+
+  // Conditional traps will create a branch on condition instruction that jumps
+  // to the relative immediate field of the jump instruction. (eg. "jo .+2")
+  case SystemZ::CondTrap: {
+    MCSymbol *DotSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(DotSym);
+
+    const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
+    const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
+    LoweredMI = MCInstBuilder(SystemZ::BRC)
+      .addImm(MI->getOperand(0).getImm())
+      .addImm(MI->getOperand(1).getImm())
+      .addExpr(MCBinaryExpr::createAdd(Expr, ConstExpr, OutContext));
+    }
+    break;
+
+  default:
+    Lower.lower(MI, LoweredMI);
+    break;
+  }
+  EmitToStreamer(*OutStreamer, LoweredMI);
+}
+
+// Convert a SystemZ-specific constant pool modifier into the associated
+// MCSymbolRefExpr variant kind.
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
+  switch (Modifier) {
+  case SystemZCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD;
+  case SystemZCP::TLSLDM: return MCSymbolRefExpr::VK_TLSLDM;
+  case SystemZCP::DTPOFF: return MCSymbolRefExpr::VK_DTPOFF;
+  case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF;
+  }
+  llvm_unreachable("Invalid SystemCPModifier!");
+}
+
+void SystemZAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
+
+  const MCExpr *Expr =
+    MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()),
+                            getModifierVariantKind(ZCPV->getModifier()),
+                            OutContext);
+  uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType());
+
+  OutStreamer->EmitValue(Expr, Size);
+}
+
+bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+                                        unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  if (ExtraCode && *ExtraCode == 'n') {
+    if (!MI->getOperand(OpNo).isImm())
+      return true;
+    OS << -int64_t(MI->getOperand(OpNo).getImm());
+  } else {
+    SystemZMCInstLower Lower(MF->getContext(), *this);
+    MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
+    SystemZInstPrinter::printOperand(MO, MAI, OS);
+  }
+  return false;
+}
+
+bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNo,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &OS) {
+  SystemZInstPrinter::printAddress(MI->getOperand(OpNo).getReg(),
+                                   MI->getOperand(OpNo + 1).getImm(),
+                                   MI->getOperand(OpNo + 2).getReg(), OS);
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+  RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
new file mode 100644
index 000000000000..fe8c88fe23e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -0,0 +1,42 @@
+//===-- SystemZAsmPrinter.h - SystemZ LLVM assembly printer ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineBasicBlock;
+class MachineInstr;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
+public:
+  SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)) {}
+
+  // Override AsmPrinter.
+  StringRef getPassName() const override { return "SystemZ Assembly Printer"; }
+  void EmitInstruction(const MachineInstr *MI) override;
+  void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
new file mode 100644
index 000000000000..72da51f74b10
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -0,0 +1,21 @@
+//===-- SystemZCallingConv.cpp - Calling conventions for SystemZ ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZCallingConv.h"
+#include "SystemZRegisterInfo.h"
+
+using namespace llvm;
+
+const MCPhysReg SystemZ::ArgGPRs[SystemZ::NumArgGPRs] = {
+  SystemZ::R2D, SystemZ::R3D, SystemZ::R4D, SystemZ::R5D, SystemZ::R6D
+};
+
+const MCPhysReg SystemZ::ArgFPRs[SystemZ::NumArgFPRs] = {
+  SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
+};
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
new file mode 100644
index 000000000000..b5523e586f4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -0,0 +1,130 @@
+//===-- SystemZCallingConv.h - Calling conventions for SystemZ --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+namespace SystemZ {
+  const unsigned NumArgGPRs = 5;
+  extern const MCPhysReg ArgGPRs[NumArgGPRs];
+
+  const unsigned NumArgFPRs = 4;
+  extern const MCPhysReg ArgFPRs[NumArgFPRs];
+} // end namespace SystemZ
+
+class SystemZCCState : public CCState {
+private:
+  /// Records whether the value was a fixed argument.
+  /// See ISD::OutputArg::IsFixed.
+  SmallVector<bool, 4> ArgIsFixed;
+
+  /// Records whether the value was widened from a short vector type.
+  SmallVector<bool, 4> ArgIsShortVector;
+
+  // Check whether ArgVT is a short vector type.
+  bool IsShortVectorType(EVT ArgVT) {
+    return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
+  }
+
+public:
+  SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+      : CCState(CC, isVarArg, MF, locs, C) {}
+
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              CCAssignFn Fn) {
+    // Formal arguments are always fixed.
+    ArgIsFixed.clear();
+    for (unsigned i = 0; i < Ins.size(); ++i)
+      ArgIsFixed.push_back(true);
+    // Record whether the call operand was a short vector.
+    ArgIsShortVector.clear();
+    for (unsigned i = 0; i < Ins.size(); ++i)
+      ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
+
+    CCState::AnalyzeFormalArguments(Ins, Fn);
+  }
+
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           CCAssignFn Fn) {
+    // Record whether the call operand was a fixed argument.
+    ArgIsFixed.clear();
+    for (unsigned i = 0; i < Outs.size(); ++i)
+      ArgIsFixed.push_back(Outs[i].IsFixed);
+    // Record whether the call operand was a short vector.
+    ArgIsShortVector.clear();
+    for (unsigned i = 0; i < Outs.size(); ++i)
+      ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
+
+    CCState::AnalyzeCallOperands(Outs, Fn);
+  }
+
+  // This version of AnalyzeCallOperands in the base class is not usable
+  // since we must provide a means of accessing ISD::OutputArg::IsFixed.
+  void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           CCAssignFn Fn) = delete;
+
+  bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
+  bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
+};
+
+// Handle i128 argument types.  These need to be passed by implicit
+// reference.  This could be as simple as the following .td line:
+//    CCIfType<[i128], CCPassIndirect<i64>>,
+// except that i128 is not a legal type, and therefore gets split by
+// common code into a pair of i64 arguments.
+inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
+                                    MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // ArgFlags.isSplit() is true on the first part of a i128 argument;
+  // PendingMembers.empty() is false on all subsequent parts.
+  if (!ArgFlags.isSplit() && PendingMembers.empty())
+    return false;
+
+  // Push a pending Indirect value location for each part.
+  LocVT = MVT::i64;
+  LocInfo = CCValAssign::Indirect;
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT,
+                                                   LocVT, LocInfo));
+  if (!ArgFlags.isSplitEnd())
+    return true;
+
+  // OK, we've collected all parts in the pending list.  Allocate
+  // the location (register or stack slot) for the indirect pointer.
+  // (This duplicates the usual i64 calling convention rules.)
+  unsigned Reg = State.AllocateReg(SystemZ::ArgGPRs);
+  unsigned Offset = Reg ? 0 : State.AllocateStack(8, 8);
+
+  // Use that same location for all the pending parts.
+  for (auto &It : PendingMembers) {
+    if (Reg)
+      It.convertToReg(Reg);
+    else
+      It.convertToMem(Offset);
+    State.addLoc(It);
+  }
+
+  PendingMembers.clear();
+
+  return true;
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 000000000000..2bf5ac29865f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,122 @@
+//=- SystemZCallingConv.td - Calling conventions for SystemZ -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the SystemZ ABI.
+//===----------------------------------------------------------------------===//
+
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+
+class CCIfSubtarget<string F, CCAction A>
+  : CCIf<!strconcat("static_cast<const SystemZSubtarget&>"
+                    "(State.getMachineFunction().getSubtarget()).", F),
+         A>;
+
+// Match if this specific argument is a fixed (i.e. named) argument.
+class CCIfFixed<CCAction A>
+    : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
+
+// Match if this specific argument was widened from a short vector type.
+class CCIfShortVector<CCAction A>
+    : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
+
+
+//===----------------------------------------------------------------------===//
+// z/Linux return value calling convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+  // Promote i32 to i64 if it has an explicit extension type.
+  CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+  // A SwiftError is returned in R9.
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
+  // ABI-compliant code returns 64-bit integers in R2.  Make the other
+  // call-clobbered argument registers available for code that doesn't
+  // care about the ABI.  (R6 is an argument register too, but is
+  // call-saved and therefore not suitable for return values.)
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+  // ABI-complaint code returns float and double in F0.  Make the
+  // other floating-point argument registers available for code that
+  // doesn't care about the ABI.  All floating-point argument registers
+  // are call-clobbered, so we can use all of them here.
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+  // Similarly for vectors, with V24 being the ABI-compliant choice.
+  // Sub-128 vectors are returned in the same way, but they're widened
+  // to one of these types during type legalization.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux argument calling conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+  // Promote i32 to i64 if it has an explicit extension type.
+  // The convention is that true integer arguments that are smaller
+  // than 64 bits should be marked as extended, but structures that
+  // are smaller than 64 bits shouldn't.
+  CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
+
+  // A SwiftSelf is passed in callee-saved R10.
+  CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R10D]>>>,
+
+  // A SwiftError is passed in callee-saved R9.
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
+
+  // Force long double values to the stack and pass i64 pointers to them.
+  CCIfType<[f128], CCPassIndirect<i64>>,
+  // Same for i128 values.  These are already split into two i64 here,
+  // so we have to use a custom handler.
+  CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
+
+  // The first 5 integer arguments are passed in R2-R6.  Note that R6
+  // is call-saved.
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+  // The first 4 float and double arguments are passed in even registers F0-F6.
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+
+  // The first 8 named vector arguments are passed in V24-V31.  Sub-128 vectors
+  // are passed in the same way, but they're widened to one of these types
+  // during type legalization.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
+                                      V25, V27, V29, V31]>>>>,
+
+  // However, sub-128 vectors which need to go on the stack occupy just a
+  // single 8-byte-aligned 8-byte stack slot.  Pass as i64.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCIfShortVector<CCBitConvertToType<i64>>>>,
+
+  // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
+  CCIfSubtarget<"hasVector()",
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+             CCAssignToStack<16, 8>>>,
+
+  // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// z/Linux callee-saved registers
+//===----------------------------------------------------------------------===//
+def CSR_SystemZ : CalleeSavedRegs<(add (sequence "R%dD", 6, 15),
+                                       (sequence "F%dD", 8, 15))>;
+
+// R9 is used to return SwiftError; remove it from CSR.
+def CSR_SystemZ_SwiftError : CalleeSavedRegs<(sub CSR_SystemZ, R9D)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
new file mode 100644
index 000000000000..4a6beb67f182
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -0,0 +1,52 @@
+//===-- SystemZConstantPoolValue.cpp - SystemZ constant-pool value --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+SystemZConstantPoolValue::
+SystemZConstantPoolValue(const GlobalValue *gv,
+                         SystemZCP::SystemZCPModifier modifier)
+  : MachineConstantPoolValue(gv->getType()), GV(gv), Modifier(modifier) {}
+
+SystemZConstantPoolValue *
+SystemZConstantPoolValue::Create(const GlobalValue *GV,
+                                 SystemZCP::SystemZCPModifier Modifier) {
+  return new SystemZConstantPoolValue(GV, Modifier);
+}
+
+int SystemZConstantPoolValue::
+getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+  for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
+    if (Constants[I].isMachineConstantPoolEntry() &&
+        (Constants[I].getAlignment() & AlignMask) == 0) {
+      auto *ZCPV =
+        static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
+      if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
+        return I;
+    }
+  }
+  return -1;
+}
+
+void SystemZConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(GV);
+  ID.AddInteger(Modifier);
+}
+
+void SystemZConstantPoolValue::print(raw_ostream &O) const {
+  O << GV << "@" << int(Modifier);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
new file mode 100644
index 000000000000..a71b595560d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -0,0 +1,58 @@
+//===- SystemZConstantPoolValue.h - SystemZ constant-pool value -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+namespace SystemZCP {
+enum SystemZCPModifier {
+  TLSGD,
+  TLSLDM,
+  DTPOFF,
+  NTPOFF
+};
+} // end namespace SystemZCP
+
+/// A SystemZ-specific constant pool value.  At present, the only
+/// defined constant pool values are module IDs or offsets of
+/// thread-local variables (written x@TLSGD, x@TLSLDM, x@DTPOFF,
+/// or x@NTPOFF).
+class SystemZConstantPoolValue : public MachineConstantPoolValue {
+  const GlobalValue *GV;
+  SystemZCP::SystemZCPModifier Modifier;
+
+protected:
+  SystemZConstantPoolValue(const GlobalValue *GV,
+                           SystemZCP::SystemZCPModifier Modifier);
+
+public:
+  static SystemZConstantPoolValue *
+    Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier);
+
+  // Override MachineConstantPoolValue.
+  int getExistingMachineCPValue(MachineConstantPool *CP,
+                                unsigned Alignment) override;
+  void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+  void print(raw_ostream &O) const override;
+
+  // Access SystemZ-specific fields.
+  const GlobalValue *getGlobalValue() const { return GV; }
+  SystemZCP::SystemZCPModifier getModifier() const { return Modifier; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
new file mode 100644
index 000000000000..b4c843f658aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -0,0 +1,575 @@
+//===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass:
+// (1) tries to remove compares if CC already contains the required information
+// (2) fuses compares and branches into COMPARE AND BRANCH instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-elim-compare"
+
+STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
+STATISTIC(LoadAndTraps, "Number of load-and-trap instructions");
+STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
+STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
+
+namespace {
+// Represents the references to a particular register in one or more
+// instructions.
+struct Reference {
+  Reference()
+    : Def(false), Use(false) {}
+
+  Reference &operator|=(const Reference &Other) {
+    Def |= Other.Def;
+    Use |= Other.Use;
+    return *this;
+  }
+
+  explicit operator bool() const { return Def || Use; }
+
+  // True if the register is defined or used in some form, either directly or
+  // via a sub- or super-register.
+  bool Def;
+  bool Use;
+};
+
+class SystemZElimCompare : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZElimCompare(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+
+  StringRef getPassName() const override {
+    return "SystemZ Comparison Elimination";
+  }
+
+  bool processBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &F) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  Reference getRegReferences(MachineInstr &MI, unsigned Reg);
+  bool convertToBRCT(MachineInstr &MI, MachineInstr &Compare,
+                     SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool convertToLoadAndTrap(MachineInstr &MI, MachineInstr &Compare,
+                            SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool convertToLoadAndTest(MachineInstr &MI);
+  bool adjustCCMasksForInstr(MachineInstr &MI, MachineInstr &Compare,
+                             SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool optimizeCompareZero(MachineInstr &Compare,
+                           SmallVectorImpl<MachineInstr *> &CCUsers);
+  bool fuseCompareOperations(MachineInstr &Compare,
+                             SmallVectorImpl<MachineInstr *> &CCUsers);
+
+  const SystemZInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+};
+
+char SystemZElimCompare::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
+  return new SystemZElimCompare(TM);
+}
+
+// Return true if CC is live out of MBB.
+static bool isCCLiveOut(MachineBasicBlock &MBB) {
+  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
+    if ((*SI)->isLiveIn(SystemZ::CC))
+      return true;
+  return false;
+}
+
+// Return true if any CC result of MI would reflect the value of Reg.
+static bool resultTests(MachineInstr &MI, unsigned Reg) {
+  if (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() &&
+      MI.getOperand(0).isDef() && MI.getOperand(0).getReg() == Reg)
+    return true;
+
+  switch (MI.getOpcode()) {
+  case SystemZ::LR:
+  case SystemZ::LGR:
+  case SystemZ::LGFR:
+  case SystemZ::LTR:
+  case SystemZ::LTGR:
+  case SystemZ::LTGFR:
+  case SystemZ::LER:
+  case SystemZ::LDR:
+  case SystemZ::LXR:
+  case SystemZ::LTEBR:
+  case SystemZ::LTDBR:
+  case SystemZ::LTXBR:
+    if (MI.getOperand(1).getReg() == Reg)
+      return true;
+  }
+
+  return false;
+}
+
+// Describe the references to Reg or any of its aliases in MI.
+Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
+  Reference Ref;
+  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI.getOperand(I);
+    if (MO.isReg()) {
+      if (unsigned MOReg = MO.getReg()) {
+        if (TRI->regsOverlap(MOReg, Reg)) {
+          if (MO.isUse())
+            Ref.Use = true;
+          else if (MO.isDef())
+            Ref.Def = true;
+        }
+      }
+    }
+  }
+  return Ref;
+}
+
+// Return true if this is a load and test which can be optimized the
+// same way as compare instruction.
+static bool isLoadAndTestAsCmp(MachineInstr &MI) {
+  // If we during isel used a load-and-test as a compare with 0, the
+  // def operand is dead.
+  return (MI.getOpcode() == SystemZ::LTEBR ||
+          MI.getOpcode() == SystemZ::LTDBR ||
+          MI.getOpcode() == SystemZ::LTXBR) &&
+         MI.getOperand(0).isDead();
+}
+
+// Return the source register of Compare, which is the unknown value
+// being tested.
+static unsigned getCompareSourceReg(MachineInstr &Compare) {
+  unsigned reg = 0;
+  if (Compare.isCompare())
+    reg = Compare.getOperand(0).getReg();
+  else if (isLoadAndTestAsCmp(Compare))
+    reg = Compare.getOperand(1).getReg();
+  assert (reg);
+
+  return reg;
+}
+
+// Compare compares the result of MI against zero.  If MI is an addition
+// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
+// and convert the branch to a BRCT(G) or BRCTH.  Return true on success.
+bool SystemZElimCompare::convertToBRCT(
+    MachineInstr &MI, MachineInstr &Compare,
+    SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // Check whether we have an addition of -1.
+  unsigned Opcode = MI.getOpcode();
+  unsigned BRCT;
+  if (Opcode == SystemZ::AHI)
+    BRCT = SystemZ::BRCT;
+  else if (Opcode == SystemZ::AGHI)
+    BRCT = SystemZ::BRCTG;
+  else if (Opcode == SystemZ::AIH)
+    BRCT = SystemZ::BRCTH;
+  else
+    return false;
+  if (MI.getOperand(2).getImm() != -1)
+    return false;
+
+  // Check whether we have a single JLH.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::BRC ||
+      Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+      Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE)
+    return false;
+
+  // We already know that there are no references to the register between
+  // MI and Compare.  Make sure that there are also no references between
+  // Compare and Branch.
+  unsigned SrcReg = getCompareSourceReg(Compare);
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (getRegReferences(*MBBI, SrcReg))
+      return false;
+
+  // The transformation is OK.  Rebuild Branch as a BRCT(G) or BRCTH.
+  MachineOperand Target(Branch->getOperand(2));
+  while (Branch->getNumOperands())
+    Branch->RemoveOperand(0);
+  Branch->setDesc(TII->get(BRCT));
+  MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
+  MIB.addOperand(MI.getOperand(0))
+     .addOperand(MI.getOperand(1))
+     .addOperand(Target);
+  // Add a CC def to BRCT(G), since we may have to split them again if the
+  // branch displacement overflows.  BRCTH has a 32-bit displacement, so
+  // this is not necessary there.
+  if (BRCT != SystemZ::BRCTH)
+    MIB.addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+  MI.eraseFromParent();
+  return true;
+}
+
+// Compare compares the result of MI against zero.  If MI is a suitable load
+// instruction and if CCUsers is a single conditional trap on zero, eliminate
+// the load and convert the branch to a load-and-trap.  Return true on success.
+bool SystemZElimCompare::convertToLoadAndTrap(
+    MachineInstr &MI, MachineInstr &Compare,
+    SmallVectorImpl<MachineInstr *> &CCUsers) {
+  unsigned LATOpcode = TII->getLoadAndTrap(MI.getOpcode());
+  if (!LATOpcode)
+    return false;
+
+  // Check whether we have a single CondTrap that traps on zero.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::CondTrap ||
+      Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+      Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_EQ)
+    return false;
+
+  // We already know that there are no references to the register between
+  // MI and Compare.  Make sure that there are also no references between
+  // Compare and Branch.
+  unsigned SrcReg = getCompareSourceReg(Compare);
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (getRegReferences(*MBBI, SrcReg))
+      return false;
+
+  // The transformation is OK.  Rebuild Branch as a load-and-trap.
+  while (Branch->getNumOperands())
+    Branch->RemoveOperand(0);
+  Branch->setDesc(TII->get(LATOpcode));
+  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+      .addOperand(MI.getOperand(0))
+      .addOperand(MI.getOperand(1))
+      .addOperand(MI.getOperand(2))
+      .addOperand(MI.getOperand(3));
+  MI.eraseFromParent();
+  return true;
+}
+
+// If MI is a load instruction, try to convert it into a LOAD AND TEST.
+// Return true on success.
+bool SystemZElimCompare::convertToLoadAndTest(MachineInstr &MI) {
+  unsigned Opcode = TII->getLoadAndTest(MI.getOpcode());
+  if (!Opcode)
+    return false;
+
+  MI.setDesc(TII->get(Opcode));
+  MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addReg(SystemZ::CC, RegState::ImplicitDefine);
+  return true;
+}
+
+// The CC users in CCUsers are testing the result of a comparison of some
+// value X against zero and we know that any CC value produced by MI
+// would also reflect the value of X.  Try to adjust CCUsers so that
+// they test the result of MI directly, returning true on success.
+// Leave everything unchanged on failure.
+bool SystemZElimCompare::adjustCCMasksForInstr(
+    MachineInstr &MI, MachineInstr &Compare,
+    SmallVectorImpl<MachineInstr *> &CCUsers) {
+  int Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned MIFlags = Desc.TSFlags;
+
+  // See which compare-style condition codes are available.
+  unsigned ReusableCCMask = SystemZII::getCompareZeroCCMask(MIFlags);
+
+  // For unsigned comparisons with zero, only equality makes sense.
+  unsigned CompareFlags = Compare.getDesc().TSFlags;
+  if (CompareFlags & SystemZII::IsLogical)
+    ReusableCCMask &= SystemZ::CCMASK_CMP_EQ;
+
+  if (ReusableCCMask == 0)
+    return false;
+
+  unsigned CCValues = SystemZII::getCCValues(MIFlags);
+  assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
+
+  // Now check whether these flags are enough for all users.
+  SmallVector<MachineOperand *, 4> AlterMasks;
+  for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+    MachineInstr *MI = CCUsers[I];
+
+    // Fail if this isn't a use of CC that we understand.
+    unsigned Flags = MI->getDesc().TSFlags;
+    unsigned FirstOpNum;
+    if (Flags & SystemZII::CCMaskFirst)
+      FirstOpNum = 0;
+    else if (Flags & SystemZII::CCMaskLast)
+      FirstOpNum = MI->getNumExplicitOperands() - 2;
+    else
+      return false;
+
+    // Check whether the instruction predicate treats all CC values
+    // outside of ReusableCCMask in the same way.  In that case it
+    // doesn't matter what those CC values mean.
+    unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
+    unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
+    unsigned OutValid = ~ReusableCCMask & CCValid;
+    unsigned OutMask = ~ReusableCCMask & CCMask;
+    if (OutMask != 0 && OutMask != OutValid)
+      return false;
+
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum));
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+  }
+
+  // All users are OK.  Adjust the masks for MI.
+  for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+    AlterMasks[I]->setImm(CCValues);
+    unsigned CCMask = AlterMasks[I + 1]->getImm();
+    if (CCMask & ~ReusableCCMask)
+      AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+                                (CCValues & ~ReusableCCMask));
+  }
+
+  // CC is now live after MI.
+  int CCDef = MI.findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+  assert(CCDef >= 0 && "Couldn't find CC set");
+  MI.getOperand(CCDef).setIsDead(false);
+
+  // Clear any intervening kills of CC.
+  MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    MBBI->clearRegisterKills(SystemZ::CC, TRI);
+
+  return true;
+}
+
+// Return true if Compare is a comparison against zero.
+static bool isCompareZero(MachineInstr &Compare) {
+  switch (Compare.getOpcode()) {
+  case SystemZ::LTEBRCompare:
+  case SystemZ::LTDBRCompare:
+  case SystemZ::LTXBRCompare:
+    return true;
+
+  default:
+
+    if (isLoadAndTestAsCmp(Compare))
+      return true;
+
+    return Compare.getNumExplicitOperands() == 2 &&
+           Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
+  }
+}
+
+// Try to optimize cases where comparison instruction Compare is testing
+// a value against zero.  Return true on success and if Compare should be
+// deleted as dead.  CCUsers is the list of instructions that use the CC
+// value produced by Compare.
+bool SystemZElimCompare::optimizeCompareZero(
+    MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
+  if (!isCompareZero(Compare))
+    return false;
+
+  // Search back for CC results that are based on the first operand.
+  unsigned SrcReg = getCompareSourceReg(Compare);
+  MachineBasicBlock &MBB = *Compare.getParent();
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin();
+  Reference CCRefs;
+  Reference SrcRefs;
+  while (MBBI != MBBE) {
+    --MBBI;
+    MachineInstr &MI = *MBBI;
+    if (resultTests(MI, SrcReg)) {
+      // Try to remove both MI and Compare by converting a branch to BRCT(G).
+      // or a load-and-trap instruction.  We don't care in this case whether
+      // CC is modified between MI and Compare.
+      if (!CCRefs.Use && !SrcRefs) {
+        if (convertToBRCT(MI, Compare, CCUsers)) {
+          BranchOnCounts += 1;
+          return true;
+        }
+        if (convertToLoadAndTrap(MI, Compare, CCUsers)) {
+          LoadAndTraps += 1;
+          return true;
+        }
+      }
+      // Try to eliminate Compare by reusing a CC result from MI.
+      if ((!CCRefs && convertToLoadAndTest(MI)) ||
+          (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
+        EliminatedComparisons += 1;
+        return true;
+      }
+    }
+    SrcRefs |= getRegReferences(MI, SrcReg);
+    if (SrcRefs.Def)
+      return false;
+    CCRefs |= getRegReferences(MI, SystemZ::CC);
+    if (CCRefs.Use && CCRefs.Def)
+      return false;
+  }
+  return false;
+}
+
+// Try to fuse comparison instruction Compare into a later branch.
+// Return true on success and if Compare is therefore redundant.
+bool SystemZElimCompare::fuseCompareOperations(
+    MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // See whether we have a single branch with which to fuse.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  SystemZII::FusedCompareType Type;
+  switch (Branch->getOpcode()) {
+  case SystemZ::BRC:
+    Type = SystemZII::CompareAndBranch;
+    break;
+  case SystemZ::CondReturn:
+    Type = SystemZII::CompareAndReturn;
+    break;
+  case SystemZ::CallBCR:
+    Type = SystemZII::CompareAndSibcall;
+    break;
+  case SystemZ::CondTrap:
+    Type = SystemZII::CompareAndTrap;
+    break;
+  default:
+    return false;
+  }
+
+  // See whether we have a comparison that can be fused.
+  unsigned FusedOpcode =
+      TII->getFusedCompare(Compare.getOpcode(), Type, &Compare);
+  if (!FusedOpcode)
+    return false;
+
+  // Make sure that the operands are available at the branch.
+  // SrcReg2 is the register if the source operand is a register,
+  // 0 if the source operand is immediate, and the base register
+  // if the source operand is memory (index is not supported).
+  unsigned SrcReg = Compare.getOperand(0).getReg();
+  unsigned SrcReg2 =
+      Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0;
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (MBBI->modifiesRegister(SrcReg, TRI) ||
+        (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
+      return false;
+
+  // Read the branch mask, target (if applicable), regmask (if applicable).
+  MachineOperand CCMask(MBBI->getOperand(1));
+  assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
+         "Invalid condition-code mask for integer comparison");
+  // This is only valid for CompareAndBranch.
+  MachineOperand Target(MBBI->getOperand(
+    Type == SystemZII::CompareAndBranch ? 2 : 0));
+  const uint32_t *RegMask;
+  if (Type == SystemZII::CompareAndSibcall)
+    RegMask = MBBI->getOperand(2).getRegMask();
+
+  // Clear out all current operands.
+  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
+  assert(CCUse >= 0 && "BRC/BCR must use CC");
+  Branch->RemoveOperand(CCUse);
+  // Remove target (branch) or regmask (sibcall).
+  if (Type == SystemZII::CompareAndBranch ||
+      Type == SystemZII::CompareAndSibcall)
+    Branch->RemoveOperand(2);
+  Branch->RemoveOperand(1);
+  Branch->RemoveOperand(0);
+
+  // Rebuild Branch as a fused compare and branch.
+  // SrcNOps is the number of MI operands of the compare instruction
+  // that we need to copy over.
+  unsigned SrcNOps = 2;
+  if (FusedOpcode == SystemZ::CLT || FusedOpcode == SystemZ::CLGT)
+    SrcNOps = 3;
+  Branch->setDesc(TII->get(FusedOpcode));
+  MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
+  for (unsigned I = 0; I < SrcNOps; I++)
+    MIB.addOperand(Compare.getOperand(I));
+  MIB.addOperand(CCMask);
+
+  if (Type == SystemZII::CompareAndBranch) {
+    // Only conditional branches define CC, as they may be converted back
+    // to a non-fused branch because of a long displacement.  Conditional
+    // returns don't have that problem.
+    MIB.addOperand(Target)
+       .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+  }
+
+  if (Type == SystemZII::CompareAndSibcall)
+    MIB.addRegMask(RegMask);
+
+  // Clear any intervening kills of SrcReg and SrcReg2.
+  MBBI = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MBBI->clearRegisterKills(SrcReg, TRI);
+    if (SrcReg2)
+      MBBI->clearRegisterKills(SrcReg2, TRI);
+  }
+  FusedComparisons += 1;
+  return true;
+}
+
+// Process all comparison instructions in MBB.  Return true if something
+// changed.
+bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  // Walk backwards through the block looking for comparisons, recording
+  // all CC users as we go.  The subroutines can delete Compare and
+  // instructions before it.
+  bool CompleteCCUsers = !isCCLiveOut(MBB);
+  SmallVector<MachineInstr *, 4> CCUsers;
+  MachineBasicBlock::iterator MBBI = MBB.end();
+  while (MBBI != MBB.begin()) {
+    MachineInstr &MI = *--MBBI;
+    if (CompleteCCUsers && (MI.isCompare() || isLoadAndTestAsCmp(MI)) &&
+        (optimizeCompareZero(MI, CCUsers) ||
+         fuseCompareOperations(MI, CCUsers))) {
+      ++MBBI;
+      MI.eraseFromParent();
+      Changed = true;
+      CCUsers.clear();
+      continue;
+    }
+
+    if (MI.definesRegister(SystemZ::CC)) {
+      CCUsers.clear();
+      CompleteCCUsers = true;
+    }
+    if (MI.readsRegister(SystemZ::CC) && CompleteCCUsers)
+      CCUsers.push_back(&MI);
+  }
+  return Changed;
+}
+
+bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  TRI = &TII->getRegisterInfo();
+
+  bool Changed = false;
+  for (auto &MBB : F)
+    Changed |= processBlock(MBB);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
new file mode 100644
index 000000000000..92ce8089c24f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -0,0 +1,153 @@
+//==-- SystemZExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling and other late optimizations.  This
+// pass should be run after register allocation but before the post-regalloc
+// scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+#define SYSTEMZ_EXPAND_PSEUDO_NAME "SystemZ pseudo instruction expansion pass"
+
+namespace llvm {
+  void initializeSystemZExpandPseudoPass(PassRegistry&);
+}
+
+namespace {
+class SystemZExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZExpandPseudo() : MachineFunctionPass(ID) {
+    initializeSystemZExpandPseudoPass(*PassRegistry::getPassRegistry());
+  }
+
+  const SystemZInstrInfo *TII;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return SYSTEMZ_EXPAND_PSEUDO_NAME; }
+
+private:
+  bool expandMBB(MachineBasicBlock &MBB);
+  bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                MachineBasicBlock::iterator &NextMBBI);
+  bool expandLOCRMux(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     MachineBasicBlock::iterator &NextMBBI);
+};
+char SystemZExpandPseudo::ID = 0;
+}
+
+INITIALIZE_PASS(SystemZExpandPseudo, "systemz-expand-pseudo",
+                SYSTEMZ_EXPAND_PSEUDO_NAME, false, false)
+
+/// \brief Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createSystemZExpandPseudoPass(SystemZTargetMachine &TM) {
+  return new SystemZExpandPseudo();
+}
+
+// MI is a load-register-on-condition pseudo instruction that could not be
+// handled as a single hardware instruction.  Replace it by a branch sequence.
+bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        MachineBasicBlock::iterator &NextMBBI) {
+  MachineFunction &MF = *MBB.getParent();
+  const BasicBlock *BB = MBB.getBasicBlock();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  unsigned CCValid = MI.getOperand(3).getImm();
+  unsigned CCMask = MI.getOperand(4).getImm();
+
+  LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  for (auto I = std::prev(MBB.end()); I != MBBI; --I)
+    LiveRegs.stepBackward(*I);
+
+  // Splice MBB at MI, moving the rest of the block into RestMBB.
+  MachineBasicBlock *RestMBB = MF.CreateMachineBasicBlock(BB);
+  MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
+  RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end());
+  RestMBB->transferSuccessors(&MBB);
+  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+    RestMBB->addLiveIn(*I);
+
+  // Create a new block MoveMBB to hold the move instruction.
+  MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
+  MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB);
+  MoveMBB->addLiveIn(SrcReg);
+  for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
+    MoveMBB->addLiveIn(*I);
+
+  // At the end of MBB, create a conditional branch to RestMBB if the
+  // condition is false, otherwise fall through to MoveMBB.
+  BuildMI(&MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask ^ CCValid).addMBB(RestMBB);
+  MBB.addSuccessor(RestMBB);
+  MBB.addSuccessor(MoveMBB);
+
+  // In MoveMBB, emit an instruction to move SrcReg into DestReg,
+  // then fall through to RestMBB.
+  TII->copyPhysReg(*MoveMBB, MoveMBB->end(), DL, DestReg, SrcReg,
+                   MI.getOperand(2).isKill());
+  MoveMBB->addSuccessor(RestMBB);
+
+  NextMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded here,
+/// do the expansion and return true.  Otherwise return false.
+bool SystemZExpandPseudo::expandMI(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI) {
+  MachineInstr &MI = *MBBI;
+  switch (MI.getOpcode()) {
+  case SystemZ::LOCRMux:
+    return expandLOCRMux(MBB, MBBI, NextMBBI);
+  default:
+    break;
+  }
+  return false;
+}
+
+/// \brief Iterate over the instructions in basic block MBB and expand any
+/// pseudo instructions.  Return true if anything was modified.
+bool SystemZExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= expandMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool SystemZExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= expandMBB(MBB);
+  return Modified;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
new file mode 100644
index 000000000000..716e5add8051
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -0,0 +1,171 @@
+//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Feature definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class SystemZFeature<string extname, string intname, string desc>
+  : Predicate<"Subtarget->has"##intname##"()">,
+    AssemblerPredicate<"Feature"##intname, extname>,
+    SubtargetFeature<extname, "Has"##intname, "true", desc>;
+
+class SystemZMissingFeature<string intname>
+  : Predicate<"!Subtarget->has"##intname##"()">;
+
+class SystemZFeatureList<list<SystemZFeature> x> {
+  list<SystemZFeature> List = x;
+}
+
+class SystemZFeatureAdd<list<SystemZFeature> x, list<SystemZFeature> y>
+  : SystemZFeatureList<!listconcat(x, y)>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Ninth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureDistinctOps : SystemZFeature<
+  "distinct-ops", "DistinctOps",
+  "Assume that the distinct-operands facility is installed"
+>;
+
+def FeatureFastSerialization : SystemZFeature<
+  "fast-serialization", "FastSerialization",
+  "Assume that the fast-serialization facility is installed"
+>;
+
+def FeatureFPExtension : SystemZFeature<
+  "fp-extension", "FPExtension",
+  "Assume that the floating-point extension facility is installed"
+>;
+
+def FeatureHighWord : SystemZFeature<
+  "high-word", "HighWord",
+  "Assume that the high-word facility is installed"
+>;
+
+def FeatureInterlockedAccess1 : SystemZFeature<
+  "interlocked-access1", "InterlockedAccess1",
+  "Assume that interlocked-access facility 1 is installed"
+>;
+def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
+
+def FeatureLoadStoreOnCond : SystemZFeature<
+  "load-store-on-cond", "LoadStoreOnCond",
+  "Assume that the load/store-on-condition facility is installed"
+>;
+
+def FeaturePopulationCount : SystemZFeature<
+  "population-count", "PopulationCount",
+  "Assume that the population-count facility is installed"
+>;
+
+def Arch9NewFeatures : SystemZFeatureList<[
+    FeatureDistinctOps,
+    FeatureFastSerialization,
+    FeatureFPExtension,
+    FeatureHighWord,
+    FeatureInterlockedAccess1,
+    FeatureLoadStoreOnCond,
+    FeaturePopulationCount
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Tenth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureExecutionHint : SystemZFeature<
+  "execution-hint", "ExecutionHint",
+  "Assume that the execution-hint facility is installed"
+>;
+
+def FeatureLoadAndTrap : SystemZFeature<
+  "load-and-trap", "LoadAndTrap",
+  "Assume that the load-and-trap facility is installed"
+>;
+
+def FeatureMiscellaneousExtensions : SystemZFeature<
+  "miscellaneous-extensions", "MiscellaneousExtensions",
+  "Assume that the miscellaneous-extensions facility is installed"
+>;
+
+def FeatureProcessorAssist : SystemZFeature<
+  "processor-assist", "ProcessorAssist",
+  "Assume that the processor-assist facility is installed"
+>;
+
+def FeatureTransactionalExecution : SystemZFeature<
+  "transactional-execution", "TransactionalExecution",
+  "Assume that the transactional-execution facility is installed"
+>;
+
+def Arch10NewFeatures : SystemZFeatureList<[
+    FeatureExecutionHint,
+    FeatureLoadAndTrap,
+    FeatureMiscellaneousExtensions,
+    FeatureProcessorAssist,
+    FeatureTransactionalExecution
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Eleventh Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureLoadAndZeroRightmostByte : SystemZFeature<
+  "load-and-zero-rightmost-byte", "LoadAndZeroRightmostByte",
+  "Assume that the load-and-zero-rightmost-byte facility is installed"
+>;
+
+def FeatureLoadStoreOnCond2 : SystemZFeature<
+  "load-store-on-cond-2", "LoadStoreOnCond2",
+  "Assume that the load/store-on-condition facility 2 is installed"
+>;
+
+def FeatureVector : SystemZFeature<
+  "vector", "Vector",
+  "Assume that the vectory facility is installed"
+>;
+def FeatureNoVector : SystemZMissingFeature<"Vector">;
+
+def Arch11NewFeatures : SystemZFeatureList<[
+    FeatureLoadAndZeroRightmostByte,
+    FeatureLoadStoreOnCond2,
+    FeatureVector
+]>;
+
+//===----------------------------------------------------------------------===//
+//
+// Cumulative supported and unsupported feature sets
+//
+//===----------------------------------------------------------------------===//
+
+def Arch8SupportedFeatures
+  : SystemZFeatureList<[]>;
+def Arch9SupportedFeatures
+  : SystemZFeatureAdd<Arch8SupportedFeatures.List,  Arch9NewFeatures.List>;
+def Arch10SupportedFeatures
+  : SystemZFeatureAdd<Arch9SupportedFeatures.List,  Arch10NewFeatures.List>;
+def Arch11SupportedFeatures
+  : SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>;
+
+def Arch11UnsupportedFeatures
+  : SystemZFeatureList<[]>;
+def Arch10UnsupportedFeatures
+  : SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>;
+def Arch9UnsupportedFeatures
+  : SystemZFeatureAdd<Arch10UnsupportedFeatures.List, Arch10NewFeatures.List>;
+def Arch8UnsupportedFeatures
+  : SystemZFeatureAdd<Arch9UnsupportedFeatures.List,  Arch9NewFeatures.List>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
new file mode 100644
index 000000000000..a28a91e834f6
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -0,0 +1,549 @@
+//===-- SystemZFrameLowering.cpp - Frame lowering for SystemZ -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZFrameLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+namespace {
+// The ABI-defined register save slots, relative to the incoming stack
+// pointer.
+static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+  { SystemZ::R2D,  0x10 },
+  { SystemZ::R3D,  0x18 },
+  { SystemZ::R4D,  0x20 },
+  { SystemZ::R5D,  0x28 },
+  { SystemZ::R6D,  0x30 },
+  { SystemZ::R7D,  0x38 },
+  { SystemZ::R8D,  0x40 },
+  { SystemZ::R9D,  0x48 },
+  { SystemZ::R10D, 0x50 },
+  { SystemZ::R11D, 0x58 },
+  { SystemZ::R12D, 0x60 },
+  { SystemZ::R13D, 0x68 },
+  { SystemZ::R14D, 0x70 },
+  { SystemZ::R15D, 0x78 },
+  { SystemZ::F0D,  0x80 },
+  { SystemZ::F2D,  0x88 },
+  { SystemZ::F4D,  0x90 },
+  { SystemZ::F6D,  0x98 }
+};
+} // end anonymous namespace
+
+SystemZFrameLowering::SystemZFrameLowering()
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+                          -SystemZMC::CallFrameSize, 8,
+                          false /* StackRealignable */) {
+  // Create a mapping from register number to save slot offset.
+  RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+  for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+}
+
+const TargetFrameLowering::SpillSlot *
+SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = array_lengthof(SpillOffsetTable);
+  return SpillOffsetTable;
+}
+
+void SystemZFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                BitVector &SavedRegs,
+                                                RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  bool HasFP = hasFP(MF);
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool IsVarArg = MF.getFunction()->isVarArg();
+
+  // va_start stores incoming FPR varargs in the normal way, but delegates
+  // the saving of incoming GPR varargs to spillCalleeSavedRegisters().
+  // Record these pending uses, which typically include the call-saved
+  // argument register R6D.
+  if (IsVarArg)
+    for (unsigned I = MFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+      SavedRegs.set(SystemZ::ArgGPRs[I]);
+
+  // If there are any landing pads, entering them will modify r6/r7.
+  if (!MF.getLandingPads().empty()) {
+    SavedRegs.set(SystemZ::R6D);
+    SavedRegs.set(SystemZ::R7D);
+  }
+
+  // If the function requires a frame pointer, record that the hard
+  // frame pointer will be clobbered.
+  if (HasFP)
+    SavedRegs.set(SystemZ::R11D);
+
+  // If the function calls other functions, record that the return
+  // address register will be clobbered.
+  if (MFFrame.hasCalls())
+    SavedRegs.set(SystemZ::R14D);
+
+  // If we are saving GPRs other than the stack pointer, we might as well
+  // save and restore the stack pointer at the same time, via STMG and LMG.
+  // This allows the deallocation to be done by the LMG, rather than needing
+  // a separate %r15 addition.
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    unsigned Reg = CSRegs[I];
+    if (SystemZ::GR64BitRegClass.contains(Reg) && SavedRegs.test(Reg)) {
+      SavedRegs.set(SystemZ::R15D);
+      break;
+    }
+  }
+}
+
+// Add GPR64 to the save instruction being built by MIB, which is in basic
+// block MBB.  IsImplicit says whether this is an explicit operand to the
+// instruction, or an implicit one that comes between the explicit start
+// and end registers.
+static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
+                        unsigned GPR64, bool IsImplicit) {
+  const TargetRegisterInfo *RI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
+  bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
+  if (!IsLive || !IsImplicit) {
+    MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
+    if (!IsLive)
+      MBB.addLiveIn(GPR64);
+  }
+}
+
+bool SystemZFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool IsVarArg = MF.getFunction()->isVarArg();
+  DebugLoc DL;
+
+  // Scan the call-saved GPRs and find the bounds of the register spill area.
+  unsigned LowGPR = 0;
+  unsigned HighGPR = SystemZ::R15D;
+  unsigned StartOffset = -1U;
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::GR64BitRegClass.contains(Reg)) {
+      unsigned Offset = RegSpillOffsets[Reg];
+      assert(Offset && "Unexpected GPR save");
+      if (StartOffset > Offset) {
+        LowGPR = Reg;
+        StartOffset = Offset;
+      }
+    }
+  }
+
+  // Save the range of call-saved registers, for use by the epilogue inserter.
+  ZFI->setLowSavedGPR(LowGPR);
+  ZFI->setHighSavedGPR(HighGPR);
+
+  // Include the GPR varargs, if any.  R6D is call-saved, so would
+  // be included by the loop above, but we also need to handle the
+  // call-clobbered argument registers.
+  if (IsVarArg) {
+    unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+    if (FirstGPR < SystemZ::NumArgGPRs) {
+      unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+      unsigned Offset = RegSpillOffsets[Reg];
+      if (StartOffset > Offset) {
+        LowGPR = Reg; StartOffset = Offset;
+      }
+    }
+  }
+
+  // Save GPRs
+  if (LowGPR) {
+    assert(LowGPR != HighGPR && "Should be saving %r15 and something else");
+
+    // Build an STMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+    // Add the explicit register operands.
+    addSavedGPR(MBB, MIB, LowGPR, false);
+    addSavedGPR(MBB, MIB, HighGPR, false);
+
+    // Add the address.
+    MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+
+    // Make sure all call-saved GPRs are included as operands and are
+    // marked as live on entry.
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (SystemZ::GR64BitRegClass.contains(Reg))
+        addSavedGPR(MBB, MIB, Reg, true);
+    }
+
+    // ...likewise GPR varargs.
+    if (IsVarArg)
+      for (unsigned I = ZFI->getVarArgsFirstGPR(); I < SystemZ::NumArgGPRs; ++I)
+        addSavedGPR(MBB, MIB, SystemZ::ArgGPRs[I], true);
+  }
+
+  // Save FPRs in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::FP64BitRegClass, TRI);
+    }
+  }
+
+  return true;
+}
+
+bool SystemZFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  bool HasFP = hasFP(MF);
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Restore FPRs in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg))
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+                                &SystemZ::FP64BitRegClass, TRI);
+  }
+
+  // Restore call-saved GPRs (but not call-clobbered varargs, which at
+  // this point might hold return values).
+  unsigned LowGPR = ZFI->getLowSavedGPR();
+  unsigned HighGPR = ZFI->getHighSavedGPR();
+  unsigned StartOffset = RegSpillOffsets[LowGPR];
+  if (LowGPR) {
+    // If we saved any of %r2-%r5 as varargs, we should also be saving
+    // and restoring %r6.  If we're saving %r6 or above, we should be
+    // restoring it too.
+    assert(LowGPR != HighGPR && "Should be loading %r15 and something else");
+
+    // Build an LMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG));
+
+    // Add the explicit register operands.
+    MIB.addReg(LowGPR, RegState::Define);
+    MIB.addReg(HighGPR, RegState::Define);
+
+    // Add the address.
+    MIB.addReg(HasFP ? SystemZ::R11D : SystemZ::R15D);
+    MIB.addImm(StartOffset);
+
+    // Do a second scan adding regs as being defined by instruction
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (Reg != LowGPR && Reg != HighGPR &&
+          SystemZ::GR64BitRegClass.contains(Reg))
+        MIB.addReg(Reg, RegState::ImplicitDefine);
+    }
+  }
+
+  return true;
+}
+
+void SystemZFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                    RegScavenger *RS) const {
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  uint64_t MaxReach = (MFFrame.estimateStackSize(MF) +
+                       SystemZMC::CallFrameSize * 2);
+  if (!isUInt<12>(MaxReach)) {
+    // We may need register scavenging slots if some parts of the frame
+    // are outside the reach of an unsigned 12-bit displacement.
+    // Create 2 for the case where both addresses in an MVC are
+    // out of range.
+    RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
+    RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
+  }
+}
+
+// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
+static void emitIncrement(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          const DebugLoc &DL,
+                          unsigned Reg, int64_t NumBytes,
+                          const TargetInstrInfo *TII) {
+  while (NumBytes) {
+    unsigned Opcode;
+    int64_t ThisVal = NumBytes;
+    if (isInt<16>(NumBytes))
+      Opcode = SystemZ::AGHI;
+    else {
+      Opcode = SystemZ::AGFI;
+      // Make sure we maintain 8-byte stack alignment.
+      int64_t MinVal = -uint64_t(1) << 31;
+      int64_t MaxVal = (int64_t(1) << 31) - 8;
+      if (ThisVal < MinVal)
+        ThisVal = MinVal;
+      else if (ThisVal > MaxVal)
+        ThisVal = MaxVal;
+    }
+    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
+      .addReg(Reg).addImm(ThisVal);
+    // The CC implicit def is dead.
+    MI->getOperand(3).setIsDead();
+    NumBytes -= ThisVal;
+  }
+}
+
+void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  auto *ZII =
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFFrame.getCalleeSavedInfo();
+  bool HasFP = hasFP(MF);
+
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+
+  // The current offset of the stack pointer from the CFA.
+  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+
+  if (ZFI->getLowSavedGPR()) {
+    // Skip over the GPR saves.
+    if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
+      ++MBBI;
+    else
+      llvm_unreachable("Couldn't skip over GPR saves");
+
+    // Add CFI for the GPR saves.
+    for (auto &Save : CSI) {
+      unsigned Reg = Save.getReg();
+      if (SystemZ::GR64BitRegClass.contains(Reg)) {
+        int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
+        unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
+        BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+      }
+    }
+  }
+
+  uint64_t StackSize = getAllocatedStackSize(MF);
+  if (StackSize) {
+    // Determine if we want to store a backchain.
+    bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+    // If we need backchain, save current stack pointer.  R1 is free at this
+    // point.
+    if (StoreBackchain)
+      BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR))
+        .addReg(SystemZ::R1D, RegState::Define).addReg(SystemZ::R15D);
+
+    // Allocate StackSize bytes.
+    int64_t Delta = -int64_t(StackSize);
+    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+
+    // Add CFI for the allocation.
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+    SPOffsetFromCFA += Delta;
+
+    if (StoreBackchain)
+      BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
+        .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D).addImm(0).addReg(0);
+  }
+
+  if (HasFP) {
+    // Copy the base of the frame to R11.
+    BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
+      .addReg(SystemZ::R15D);
+
+    // Add CFI for the new frame location.
+    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
+    unsigned CFIIndex = MF.addFrameInst(
+        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+
+    // Mark the FramePtr as live at the beginning of every block except
+    // the entry block.  (We'll have marked R11 as live on entry when
+    // saving the GPRs.)
+    for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
+      I->addLiveIn(SystemZ::R11D);
+  }
+
+  // Skip over the FPR saves.
+  SmallVector<unsigned, 8> CFIIndexes;
+  for (auto &Save : CSI) {
+    unsigned Reg = Save.getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      if (MBBI != MBB.end() &&
+          (MBBI->getOpcode() == SystemZ::STD ||
+           MBBI->getOpcode() == SystemZ::STDY))
+        ++MBBI;
+      else
+        llvm_unreachable("Couldn't skip over FPR save");
+
+      // Add CFI for the this save.
+      unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+      unsigned IgnoredFrameReg;
+      int64_t Offset =
+          getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
+
+      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+          nullptr, DwarfReg, SPOffsetFromCFA + Offset));
+      CFIIndexes.push_back(CFIIndex);
+    }
+  }
+  // Complete the CFI for the FPR saves, modelling them as taking effect
+  // after the last save.
+  for (auto CFIIndex : CFIIndexes) {
+    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
+  }
+}
+
+void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  auto *ZII =
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+  // Skip the return instruction.
+  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
+
+  uint64_t StackSize = getAllocatedStackSize(MF);
+  if (ZFI->getLowSavedGPR()) {
+    --MBBI;
+    unsigned Opcode = MBBI->getOpcode();
+    if (Opcode != SystemZ::LMG)
+      llvm_unreachable("Expected to see callee-save register restore code");
+
+    unsigned AddrOpNo = 2;
+    DebugLoc DL = MBBI->getDebugLoc();
+    uint64_t Offset = StackSize + MBBI->getOperand(AddrOpNo + 1).getImm();
+    unsigned NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+
+    // If the offset is too large, use the largest stack-aligned offset
+    // and add the rest to the base register (the stack or frame pointer).
+    if (!NewOpcode) {
+      uint64_t NumBytes = Offset - 0x7fff8;
+      emitIncrement(MBB, MBBI, DL, MBBI->getOperand(AddrOpNo).getReg(),
+                    NumBytes, ZII);
+      Offset -= NumBytes;
+      NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
+      assert(NewOpcode && "No restore instruction available");
+    }
+
+    MBBI->setDesc(ZII->get(NewOpcode));
+    MBBI->getOperand(AddrOpNo + 1).ChangeToImmediate(Offset);
+  } else if (StackSize) {
+    DebugLoc DL = MBBI->getDebugLoc();
+    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, StackSize, ZII);
+  }
+}
+
+bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          MF.getFrameInfo().hasVarSizedObjects() ||
+          MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
+}
+
+int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+  // Fill in FrameReg output argument.
+  FrameReg = RI->getFrameRegister(MF);
+
+  // Start with the offset of FI from the top of the caller-allocated frame
+  // (i.e. the top of the 160 bytes allocated by the caller).  This initial
+  // offset is therefore negative.
+  int64_t Offset = (MFFrame.getObjectOffset(FI) +
+                    MFFrame.getOffsetAdjustment());
+
+  // Make the offset relative to the incoming stack pointer.
+  Offset -= getOffsetOfLocalArea();
+
+  // Make the offset relative to the bottom of the frame.
+  Offset += getAllocatedStackSize(MF);
+
+  return Offset;
+}
+
+uint64_t SystemZFrameLowering::
+getAllocatedStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+
+  // Start with the size of the local variables and spill slots.
+  uint64_t StackSize = MFFrame.getStackSize();
+
+  // We need to allocate the ABI-defined 160-byte base area whenever
+  // we allocate stack space for our own use and whenever we call another
+  // function.
+  if (StackSize || MFFrame.hasVarSizedObjects() || MFFrame.hasCalls())
+    StackSize += SystemZMC::CallFrameSize;
+
+  return StackSize;
+}
+
+bool
+SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // The ABI requires us to allocate 160 bytes of stack space for the callee,
+  // with any outgoing stack arguments being placed above that.  It seems
+  // better to make that area a permanent feature of the frame even if
+  // we're using a frame pointer.
+  return true;
+}
+
+MachineBasicBlock::iterator SystemZFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::ADJCALLSTACKDOWN:
+  case SystemZ::ADJCALLSTACKUP:
+    assert(hasReservedCallFrame(MF) &&
+           "ADJSTACKDOWN and ADJSTACKUP should be no-ops");
+    return MBB.erase(MI);
+    break;
+
+  default:
+    llvm_unreachable("Unexpected call frame instruction");
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
new file mode 100644
index 000000000000..d43a176ad874
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -0,0 +1,64 @@
+//===-- SystemZFrameLowering.h - Frame lowering for SystemZ -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class SystemZTargetMachine;
+class SystemZSubtarget;
+
+class SystemZFrameLowering : public TargetFrameLowering {
+  IndexedMap<unsigned> RegSpillOffsets;
+
+public:
+  SystemZFrameLowering();
+
+  // Override TargetFrameLowering.
+  bool isFPCloseToIncomingSP() const override { return false; }
+  const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
+    override;
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBII,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const
+    override;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS) const override;
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool hasFP(const MachineFunction &MF) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+
+  // Return the number of bytes in the callee-allocated part of the frame.
+  uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
+
+  // Return the byte offset from the incoming stack pointer of Reg's
+  // ABI-defined save slot.  Return 0 if no slot is defined for Reg.
+  unsigned getRegSpillOffset(unsigned Reg) const {
+    return RegSpillOffsets[Reg];
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
new file mode 100644
index 000000000000..fe4b52b515e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -0,0 +1,337 @@
+//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a hazard recognizer for the SystemZ scheduler.
+//
+// This class is used by the SystemZ scheduling strategy to maintain
+// the state during scheduling, and provide cost functions for
+// scheduling candidates. This includes:
+//
+// * Decoder grouping. A decoder group can maximally hold 3 uops, and
+// instructions that always begin a new group should be scheduled when
+// the current decoder group is empty.
+// * Processor resources usage. It is beneficial to balance the use of
+// resources.
+//
+// ===---------------------------------------------------------------------===//
+
+#include "SystemZHazardRecognizer.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This is the limit of processor resource usage at which the
+// scheduler should try to look for other instructions (not using the
+// critical resource).
+static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
+                                   cl::desc("The OOO window for processor "
+                                            "resources during scheduling."),
+                                   cl::init(8));
+
+SystemZHazardRecognizer::
+SystemZHazardRecognizer(const MachineSchedContext *C) : DAG(nullptr),
+                                                        SchedModel(nullptr) {}
+
+unsigned SystemZHazardRecognizer::
+getNumDecoderSlots(SUnit *SU) const {
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
+
+  if (SC->BeginGroup) {
+    if (!SC->EndGroup)
+      return 2; // Cracked instruction
+    else
+      return 3; // Expanded/group-alone instruction
+  }
+    
+  return 1; // Normal instruction
+}
+
+unsigned SystemZHazardRecognizer::getCurrCycleIdx() {
+  unsigned Idx = CurrGroupSize;
+  if (GrpCount % 2)
+    Idx += 3;
+  return Idx;
+}
+
+ScheduleHazardRecognizer::HazardType SystemZHazardRecognizer::
+getHazardType(SUnit *m, int Stalls) {
+  return (fitsIntoCurrentGroup(m) ? NoHazard : Hazard);
+}
+
+void SystemZHazardRecognizer::Reset() {
+  CurrGroupSize = 0;
+  clearProcResCounters();
+  GrpCount = 0;
+  LastFPdOpCycleIdx = UINT_MAX;
+  DEBUG(CurGroupDbg = "";);
+}
+
+bool
+SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return true;
+
+  // A cracked instruction only fits into schedule if the current
+  // group is empty.
+  if (SC->BeginGroup)
+    return (CurrGroupSize == 0);
+
+  // Since a full group is handled immediately in EmitInstruction(),
+  // SU should fit into current group. NumSlots should be 1 or 0,
+  // since it is not a cracked or expanded instruction.
+  assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
+          "Expected normal instruction to fit in non-full group!");
+
+  return true;
+}
+
+void SystemZHazardRecognizer::nextGroup(bool DbgOutput) {
+  if (CurrGroupSize > 0) {
+    DEBUG(dumpCurrGroup("Completed decode group"));
+    DEBUG(CurGroupDbg = "";);
+
+    GrpCount++;
+
+    // Reset counter for next group.
+    CurrGroupSize = 0;
+
+    // Decrease counters for execution units by one.
+    for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+      if (ProcResourceCounters[i] > 0)
+        ProcResourceCounters[i]--;
+
+    // Clear CriticalResourceIdx if it is now below the threshold.
+    if (CriticalResourceIdx != UINT_MAX &&
+        (ProcResourceCounters[CriticalResourceIdx] <=
+         ProcResCostLim))
+      CriticalResourceIdx = UINT_MAX;
+  }
+
+  DEBUG(if (DbgOutput)
+          dumpProcResourceCounters(););
+}
+
+#ifndef NDEBUG // Debug output
+void SystemZHazardRecognizer::dumpSU(SUnit *SU, raw_ostream &OS) const {
+  OS << "SU(" << SU->NodeNum << "):";
+  OS << SchedModel->getInstrInfo()->getName(SU->getInstr()->getOpcode());
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return;
+  
+  for (TargetSchedModel::ProcResIter
+         PI = SchedModel->getWriteProcResBegin(SC),
+         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+    const MCProcResourceDesc &PRD =
+      *SchedModel->getProcResource(PI->ProcResourceIdx);
+    std::string FU(PRD.Name);
+    // trim e.g. Z13_FXaUnit -> FXa
+    FU = FU.substr(FU.find("_") + 1);
+    FU.resize(FU.find("Unit"));
+    OS << "/" << FU;
+
+    if (PI->Cycles > 1)
+      OS << "(" << PI->Cycles << "cyc)";
+  }
+
+  if (SC->NumMicroOps > 1)
+    OS << "/" << SC->NumMicroOps << "uops";
+  if (SC->BeginGroup && SC->EndGroup)
+    OS << "/GroupsAlone";
+  else if (SC->BeginGroup)
+    OS << "/BeginsGroup";
+  else if (SC->EndGroup)
+    OS << "/EndsGroup";
+  if (SU->isUnbuffered)
+    OS << "/Unbuffered";
+}
+
+void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
+  dbgs() << "+++ " << Msg;
+  dbgs() << ": ";
+
+  if (CurGroupDbg.empty())
+    dbgs() << " <empty>\n";
+  else {
+    dbgs() << "{ " << CurGroupDbg << " }";
+    dbgs() << " (" << CurrGroupSize << " decoder slot"
+           << (CurrGroupSize > 1 ? "s":"")
+           << ")\n";
+  }
+}
+
+void SystemZHazardRecognizer::dumpProcResourceCounters() const {
+  bool any = false;
+
+  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+    if (ProcResourceCounters[i] > 0) {
+      any = true;
+      break;
+    }
+
+  if (!any)
+    return;
+
+  dbgs() << "+++ Resource counters:\n";
+  for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
+    if (ProcResourceCounters[i] > 0) {
+      dbgs() << "+++ Extra schedule for execution unit "
+             << SchedModel->getProcResource(i)->Name
+             << ": " << ProcResourceCounters[i] << "\n";
+      any = true;
+    }
+}
+#endif //NDEBUG
+
+void SystemZHazardRecognizer::clearProcResCounters() {
+  ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0);
+  CriticalResourceIdx = UINT_MAX;
+}
+
+// Update state with SU as the next scheduled unit.
+void SystemZHazardRecognizer::
+EmitInstruction(SUnit *SU) {
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  DEBUG( dumpCurrGroup("Decode group before emission"););
+
+  // If scheduling an SU that must begin a new decoder group, move on
+  // to next group.
+  if (!fitsIntoCurrentGroup(SU))
+    nextGroup();
+
+  DEBUG( dbgs() << "+++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
+         dbgs() << "\n";
+         raw_string_ostream cgd(CurGroupDbg);
+         if (CurGroupDbg.length())
+           cgd << ", ";
+         dumpSU(SU, cgd););
+
+  // After returning from a call, we don't know much about the state.
+  if (SU->getInstr()->isCall()) {
+    DEBUG (dbgs() << "+++ Clearing state after call.\n";);
+    clearProcResCounters();
+    LastFPdOpCycleIdx = UINT_MAX;
+    CurrGroupSize += getNumDecoderSlots(SU);
+    assert (CurrGroupSize <= 3);
+    nextGroup();
+    return;
+  }
+
+  // Increase counter for execution unit(s).
+  for (TargetSchedModel::ProcResIter
+         PI = SchedModel->getWriteProcResBegin(SC),
+         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+    // Don't handle FPd together with the other resources.
+    if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
+      continue;
+    int &CurrCounter =
+      ProcResourceCounters[PI->ProcResourceIdx];
+    CurrCounter += PI->Cycles;
+    // Check if this is now the new critical resource.
+    if ((CurrCounter > ProcResCostLim) &&
+        (CriticalResourceIdx == UINT_MAX ||
+         (PI->ProcResourceIdx != CriticalResourceIdx &&
+          CurrCounter >
+          ProcResourceCounters[CriticalResourceIdx]))) {
+      DEBUG( dbgs() << "+++ New critical resource: "
+             << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
+             << "\n";);
+      CriticalResourceIdx = PI->ProcResourceIdx;
+    }
+  }
+
+  // Make note of an instruction that uses a blocking resource (FPd).
+  if (SU->isUnbuffered) {
+    LastFPdOpCycleIdx = getCurrCycleIdx();
+    DEBUG (dbgs() << "+++ Last FPd cycle index: "
+           << LastFPdOpCycleIdx << "\n";);
+  }
+
+  // Insert SU into current group by increasing number of slots used
+  // in current group.
+  CurrGroupSize += getNumDecoderSlots(SU);
+  assert (CurrGroupSize <= 3);
+
+  // Check if current group is now full/ended. If so, move on to next
+  // group to be ready to evaluate more candidates.
+  if (CurrGroupSize == 3 || SC->EndGroup)
+    nextGroup();
+}
+
+int SystemZHazardRecognizer::groupingCost(SUnit *SU) const {
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return 0;
+  
+  // If SU begins new group, it can either break a current group early
+  // or fit naturally if current group is empty (negative cost).
+  if (SC->BeginGroup) {
+    if (CurrGroupSize)
+      return 3 - CurrGroupSize;
+    return -1;
+  }
+
+  // Similarly, a group-ending SU may either fit well (last in group), or
+  // end the group prematurely.
+  if (SC->EndGroup) {
+    unsigned resultingGroupSize =
+      (CurrGroupSize + getNumDecoderSlots(SU));
+    if (resultingGroupSize < 3)
+      return (3 - resultingGroupSize);
+    return -1;
+  }
+
+  // Most instructions can be placed in any decoder slot.
+  return 0;
+}
+
+bool SystemZHazardRecognizer::isFPdOpPreferred_distance(const SUnit *SU) {
+  assert (SU->isUnbuffered);
+  // If this is the first FPd op, it should be scheduled high.
+  if (LastFPdOpCycleIdx == UINT_MAX)
+    return true;
+  // If this is not the first PFd op, it should go into the other side
+  // of the processor to use the other FPd unit there. This should
+  // generally happen if two FPd ops are placed with 2 other
+  // instructions between them (modulo 6).
+  if (LastFPdOpCycleIdx > getCurrCycleIdx())
+    return ((LastFPdOpCycleIdx - getCurrCycleIdx()) == 3);
+  return ((getCurrCycleIdx() - LastFPdOpCycleIdx) == 3);
+}
+
+int SystemZHazardRecognizer::
+resourcesCost(SUnit *SU) {
+  int Cost = 0;
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  if (!SC->isValid())
+    return 0;
+
+  // For a FPd op, either return min or max value as indicated by the
+  // distance to any prior FPd op.
+  if (SU->isUnbuffered)
+    Cost = (isFPdOpPreferred_distance(SU) ? INT_MIN : INT_MAX);
+  // For other instructions, give a cost to the use of the critical resource.
+  else if (CriticalResourceIdx != UINT_MAX) {
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel->getWriteProcResBegin(SC),
+           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
+      if (PI->ProcResourceIdx == CriticalResourceIdx)
+        Cost = PI->Cycles;
+  }
+
+  return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
new file mode 100644
index 000000000000..8fa54ee434cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -0,0 +1,128 @@
+//=-- SystemZHazardRecognizer.h - SystemZ Hazard Recognizer -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a hazard recognizer for the SystemZ scheduler.
+//
+// This class is used by the SystemZ scheduling strategy to maintain
+// the state during scheduling, and provide cost functions for
+// scheduling candidates. This includes:
+//
+// * Decoder grouping. A decoder group can maximally hold 3 uops, and
+// instructions that always begin a new group should be scheduled when
+// the current decoder group is empty.
+// * Processor resources usage. It is beneficial to balance the use of
+// resources.
+//
+// ===---------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+namespace llvm {
+
+/// SystemZHazardRecognizer maintains the state during scheduling.
+class SystemZHazardRecognizer : public ScheduleHazardRecognizer {
+
+  ScheduleDAGMI *DAG;
+  const TargetSchedModel *SchedModel;
+
+  /// Keep track of the number of decoder slots used in the current
+  /// decoder group.
+  unsigned CurrGroupSize;
+
+  /// The tracking of resources here are quite similar to the common
+  /// code use of a critical resource. However, z13 differs in the way
+  /// that it has two processor sides which may be interesting to
+  /// model in the future (a work in progress).
+
+  /// Counters for the number of uops scheduled per processor
+  /// resource.
+  SmallVector<int, 0> ProcResourceCounters;
+
+  /// This is the resource with the greatest queue, which the
+  /// scheduler tries to avoid.
+  unsigned CriticalResourceIdx;
+
+  /// Return the number of decoder slots MI requires.
+  inline unsigned getNumDecoderSlots(SUnit *SU) const;
+
+  /// Return true if MI fits into current decoder group.
+  bool fitsIntoCurrentGroup(SUnit *SU) const;
+
+  /// Two decoder groups per cycle are formed (for z13), meaning 2x3
+  /// instructions. This function returns a number between 0 and 5,
+  /// representing the current decoder slot of the current cycle.
+  unsigned getCurrCycleIdx();
+  
+  /// LastFPdOpCycleIdx stores the numbeer returned by getCurrCycleIdx()
+  /// when a stalling operation is scheduled (which uses the FPd resource).
+  unsigned LastFPdOpCycleIdx;
+
+  /// A counter of decoder groups scheduled.
+  unsigned GrpCount;
+
+  unsigned getCurrGroupSize() {return CurrGroupSize;};
+
+  /// Start next decoder group.
+  void nextGroup(bool DbgOutput = true);
+
+  /// Clear all counters for processor resources.
+  void clearProcResCounters();
+
+  /// With the goal of alternating processor sides for stalling (FPd)
+  /// ops, return true if it seems good to schedule an FPd op next.
+  bool isFPdOpPreferred_distance(const SUnit *SU);
+
+public:
+  SystemZHazardRecognizer(const MachineSchedContext *C);
+
+  void setDAG(ScheduleDAGMI *dag) {
+    DAG = dag;
+    SchedModel = dag->getSchedModel();
+  }
+  
+  HazardType getHazardType(SUnit *m, int Stalls = 0) override;    
+  void Reset() override;
+  void EmitInstruction(SUnit *SU) override;
+
+  // Cost functions used by SystemZPostRASchedStrategy while
+  // evaluating candidates.
+
+  /// Return the cost of decoder grouping for SU. If SU must start a
+  /// new decoder group, this is negative if this fits the schedule or
+  /// positive if it would mean ending a group prematurely. For normal
+  /// instructions this returns 0.
+  int groupingCost(SUnit *SU) const; 
+
+  /// Return the cost of SU in regards to processor resources usage.
+  /// A positive value means it would be better to wait with SU, while
+  /// a negative value means it would be good to schedule SU next.
+  int resourcesCost(SUnit *SU);
+
+#ifndef NDEBUG
+  // Debug dumping.
+  std::string CurGroupDbg; // current group as text
+  void dumpSU(SUnit *SU, raw_ostream &OS) const;
+  void dumpCurrGroup(std::string Msg = "") const;
+  void dumpProcResourceCounters() const;
+#endif
+};
+
+} // namespace llvm
+
+#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H */
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 000000000000..920b6e430e8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,1419 @@
+//===-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-isel"
+
+namespace {
+// Used to build addressing modes.
+struct SystemZAddressingMode {
+  // The shape of the address.
+  enum AddrForm {
+    // base+displacement
+    FormBD,
+
+    // base+displacement+index for load and store operands
+    FormBDXNormal,
+
+    // base+displacement+index for load address operands
+    FormBDXLA,
+
+    // base+displacement+index+ADJDYNALLOC
+    FormBDXDynAlloc
+  };
+  AddrForm Form;
+
+  // The type of displacement.  The enum names here correspond directly
+  // to the definitions in SystemZOperand.td.  We could split them into
+  // flags -- single/pair, 128-bit, etc. -- but it hardly seems worth it.
+  enum DispRange {
+    Disp12Only,
+    Disp12Pair,
+    Disp20Only,
+    Disp20Only128,
+    Disp20Pair
+  };
+  DispRange DR;
+
+  // The parts of the address.  The address is equivalent to:
+  //
+  //     Base + Disp + Index + (IncludesDynAlloc ? ADJDYNALLOC : 0)
+  SDValue Base;
+  int64_t Disp;
+  SDValue Index;
+  bool IncludesDynAlloc;
+
+  SystemZAddressingMode(AddrForm form, DispRange dr)
+    : Form(form), DR(dr), Base(), Disp(0), Index(),
+      IncludesDynAlloc(false) {}
+
+  // True if the address can have an index register.
+  bool hasIndexField() { return Form != FormBD; }
+
+  // True if the address can (and must) include ADJDYNALLOC.
+  bool isDynAlloc() { return Form == FormBDXDynAlloc; }
+
+  void dump() {
+    errs() << "SystemZAddressingMode " << this << '\n';
+
+    errs() << " Base ";
+    if (Base.getNode())
+      Base.getNode()->dump();
+    else
+      errs() << "null\n";
+
+    if (hasIndexField()) {
+      errs() << " Index ";
+      if (Index.getNode())
+        Index.getNode()->dump();
+      else
+        errs() << "null\n";
+    }
+
+    errs() << " Disp " << Disp;
+    if (IncludesDynAlloc)
+      errs() << " + ADJDYNALLOC";
+    errs() << '\n';
+  }
+};
+
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  assert(Count <= 64);
+  if (Count > 63)
+    return UINT64_MAX;
+  return (uint64_t(1) << Count) - 1;
+}
+
+// Represents operands 2 to 5 of the ROTATE AND ... SELECTED BITS operation
+// given by Opcode.  The operands are: Input (R2), Start (I3), End (I4) and
+// Rotate (I5).  The combined operand value is effectively:
+//
+//   (or (rotl Input, Rotate), ~Mask)
+//
+// for RNSBG and:
+//
+//   (and (rotl Input, Rotate), Mask)
+//
+// otherwise.  The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't care), or wider (in which
+// case the result will be truncated as part of the operation).
+struct RxSBGOperands {
+  RxSBGOperands(unsigned Op, SDValue N)
+    : Opcode(Op), BitSize(N.getValueSizeInBits()),
+      Mask(allOnes(BitSize)), Input(N), Start(64 - BitSize), End(63),
+      Rotate(0) {}
+
+  unsigned Opcode;
+  unsigned BitSize;
+  uint64_t Mask;
+  SDValue Input;
+  unsigned Start;
+  unsigned End;
+  unsigned Rotate;
+};
+
+class SystemZDAGToDAGISel : public SelectionDAGISel {
+  const SystemZSubtarget *Subtarget;
+
+  // Used by SystemZOperands.td to create integer constants.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
+    return CurDAG->getTargetConstant(Imm, SDLoc(Node), Node->getValueType(0));
+  }
+
+  const SystemZTargetMachine &getTargetMachine() const {
+    return static_cast<const SystemZTargetMachine &>(TM);
+  }
+
+  const SystemZInstrInfo *getInstrInfo() const {
+    return Subtarget->getInstrInfo();
+  }
+
+  // Try to fold more of the base or index of AM into AM, where IsBase
+  // selects between the base and index.
+  bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
+
+  // Try to describe N in AM, returning true on success.
+  bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
+
+  // Extract individual target operands from matched address AM.
+  void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+                          SDValue &Base, SDValue &Disp) const;
+  void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
+                          SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+  // Try to match Addr as a FormBD address with displacement type DR.
+  // Return true on success, storing the base and displacement in
+  // Base and Disp respectively.
+  bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                    SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX address with displacement type DR.
+  // Return true on success and if the result had no index.  Store the
+  // base and displacement in Base and Disp respectively.
+  bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX* address of form Form with
+  // displacement type DR.  Return true on success, storing the base,
+  // displacement and index in Base, Disp and Index respectively.
+  bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+                     SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp, SDValue &Index) const;
+
+  // PC-relative address matching routines used by SystemZOperands.td.
+  bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+    if (SystemZISD::isPCREL(Addr.getOpcode())) {
+      Target = Addr.getOperand(0);
+      return true;
+    }
+    return false;
+  }
+
+  // BD matching routines used by SystemZOperands.td.
+  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
+  }
+  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
+  }
+  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
+  // MVI matching routines used by SystemZOperands.td.
+  bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
+  // BDX matching routines used by SystemZOperands.td.
+  bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp12Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp12Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                            SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
+                         SystemZAddressingMode::Disp12Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Only,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
+                              SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Only128,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                           SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
+                         SystemZAddressingMode::Disp20Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                          SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+                         SystemZAddressingMode::Disp12Pair,
+                         Addr, Base, Disp, Index);
+  }
+  bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
+                          SDValue &Index) const {
+    return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
+                         SystemZAddressingMode::Disp20Pair,
+                         Addr, Base, Disp, Index);
+  }
+
+  // Try to match Addr as an address with a base, 12-bit displacement
+  // and index, where the index is element Elem of a vector.
+  // Return true on success, storing the base, displacement and vector
+  // in Base, Disp and Index respectively.
+  bool selectBDVAddr12Only(SDValue Addr, SDValue Elem, SDValue &Base,
+                           SDValue &Disp, SDValue &Index) const;
+
+  // Check whether (or Op (and X InsertMask)) is effectively an insertion
+  // of X into bits InsertMask of some Y != Op.  Return true if so and
+  // set Op to that Y.
+  bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
+
+  // Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
+  // Return true on success.
+  bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
+
+  // Try to fold some of RxSBG.Input into other fields of RxSBG.
+  // Return true on success.
+  bool expandRxSBG(RxSBGOperands &RxSBG) const;
+
+  // Return an undefined value of type VT.
+  SDValue getUNDEF(const SDLoc &DL, EVT VT) const;
+
+  // Convert N to VT, if it isn't already.
+  SDValue convertTo(const SDLoc &DL, EVT VT, SDValue N) const;
+
+  // Try to implement AND or shift node N using RISBG with the zero flag set.
+  // Return the selected node on success, otherwise return null.
+  bool tryRISBGZero(SDNode *N);
+
+  // Try to use RISBG or Opcode to implement OR or XOR node N.
+  // Return the selected node on success, otherwise return null.
+  bool tryRxSBG(SDNode *N, unsigned Opcode);
+
+  // If Op0 is null, then Node is a constant that can be loaded using:
+  //
+  //   (Opcode UpperVal LowerVal)
+  //
+  // If Op0 is nonnull, then Node can be implemented using:
+  //
+  //   (Opcode (Opcode Op0 UpperVal) LowerVal)
+  void splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
+                           uint64_t UpperVal, uint64_t LowerVal);
+
+  // Try to use gather instruction Opcode to implement vector insertion N.
+  bool tryGather(SDNode *N, unsigned Opcode);
+
+  // Try to use scatter instruction Opcode to implement store Store.
+  bool tryScatter(StoreSDNode *Store, unsigned Opcode);
+
+  // Return true if Load and Store are loads and stores of the same size
+  // and are guaranteed not to overlap.  Such operations can be implemented
+  // using block (SS-format) instructions.
+  //
+  // Partial overlap would lead to incorrect code, since the block operations
+  // are logically bytewise, even though they have a fast path for the
+  // non-overlapping case.  We also need to avoid full overlap (i.e. two
+  // addresses that might be equal at run time) because although that case
+  // would be handled correctly, it might be implemented by millicode.
+  bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+  // N is a (store (load Y), X) pattern.  Return true if it can use an MVC
+  // from Y to X.
+  bool storeLoadCanUseMVC(SDNode *N) const;
+
+  // N is a (store (op (load A[0]), (load A[1])), X) pattern.  Return true
+  // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+  // to X.
+  bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
+public:
+  SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<SystemZSubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  // Override MachineFunctionPass.
+  StringRef getPassName() const override {
+    return "SystemZ DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Override SelectionDAGISel.
+  void Select(SDNode *Node) override;
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+  // Include the pieces autogenerated from the target description.
+  #include "SystemZGenDAGISel.inc"
+};
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+// Return true if Val should be selected as a displacement for an address
+// with range DR.  Here we're interested in the range of both the instruction
+// described by DR and of any pairing instruction.
+static bool selectDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+  switch (DR) {
+  case SystemZAddressingMode::Disp12Only:
+    return isUInt<12>(Val);
+
+  case SystemZAddressingMode::Disp12Pair:
+  case SystemZAddressingMode::Disp20Only:
+  case SystemZAddressingMode::Disp20Pair:
+    return isInt<20>(Val);
+
+  case SystemZAddressingMode::Disp20Only128:
+    return isInt<20>(Val) && isInt<20>(Val + 8);
+  }
+  llvm_unreachable("Unhandled displacement range");
+}
+
+// Change the base or index in AM to Value, where IsBase selects
+// between the base and index.
+static void changeComponent(SystemZAddressingMode &AM, bool IsBase,
+                            SDValue Value) {
+  if (IsBase)
+    AM.Base = Value;
+  else
+    AM.Index = Value;
+}
+
+// The base or index of AM is equivalent to Value + ADJDYNALLOC,
+// where IsBase selects between the base and index.  Try to fold the
+// ADJDYNALLOC into AM.
+static bool expandAdjDynAlloc(SystemZAddressingMode &AM, bool IsBase,
+                              SDValue Value) {
+  if (AM.isDynAlloc() && !AM.IncludesDynAlloc) {
+    changeComponent(AM, IsBase, Value);
+    AM.IncludesDynAlloc = true;
+    return true;
+  }
+  return false;
+}
+
+// The base of AM is equivalent to Base + Index.  Try to use Index as
+// the index register.
+static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
+                        SDValue Index) {
+  if (AM.hasIndexField() && !AM.Index.getNode()) {
+    AM.Base = Base;
+    AM.Index = Index;
+    return true;
+  }
+  return false;
+}
+
+// The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
+// between the base and index.  Try to fold Op1 into AM's displacement.
+static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
+                       SDValue Op0, uint64_t Op1) {
+  // First try adjusting the displacement.
+  int64_t TestDisp = AM.Disp + Op1;
+  if (selectDisp(AM.DR, TestDisp)) {
+    changeComponent(AM, IsBase, Op0);
+    AM.Disp = TestDisp;
+    return true;
+  }
+
+  // We could consider forcing the displacement into a register and
+  // using it as an index, but it would need to be carefully tuned.
+  return false;
+}
+
+bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
+                                        bool IsBase) const {
+  SDValue N = IsBase ? AM.Base : AM.Index;
+  unsigned Opcode = N.getOpcode();
+  if (Opcode == ISD::TRUNCATE) {
+    N = N.getOperand(0);
+    Opcode = N.getOpcode();
+  }
+  if (Opcode == ISD::ADD || CurDAG->isBaseWithConstantOffset(N)) {
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
+
+    unsigned Op0Code = Op0->getOpcode();
+    unsigned Op1Code = Op1->getOpcode();
+
+    if (Op0Code == SystemZISD::ADJDYNALLOC)
+      return expandAdjDynAlloc(AM, IsBase, Op1);
+    if (Op1Code == SystemZISD::ADJDYNALLOC)
+      return expandAdjDynAlloc(AM, IsBase, Op0);
+
+    if (Op0Code == ISD::Constant)
+      return expandDisp(AM, IsBase, Op1,
+                        cast<ConstantSDNode>(Op0)->getSExtValue());
+    if (Op1Code == ISD::Constant)
+      return expandDisp(AM, IsBase, Op0,
+                        cast<ConstantSDNode>(Op1)->getSExtValue());
+
+    if (IsBase && expandIndex(AM, Op0, Op1))
+      return true;
+  }
+  if (Opcode == SystemZISD::PCREL_OFFSET) {
+    SDValue Full = N.getOperand(0);
+    SDValue Base = N.getOperand(1);
+    SDValue Anchor = Base.getOperand(0);
+    uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+                       cast<GlobalAddressSDNode>(Anchor)->getOffset());
+    return expandDisp(AM, IsBase, Base, Offset);
+  }
+  return false;
+}
+
+// Return true if an instruction with displacement range DR should be
+// used for displacement value Val.  selectDisp(DR, Val) must already hold.
+static bool isValidDisp(SystemZAddressingMode::DispRange DR, int64_t Val) {
+  assert(selectDisp(DR, Val) && "Invalid displacement");
+  switch (DR) {
+  case SystemZAddressingMode::Disp12Only:
+  case SystemZAddressingMode::Disp20Only:
+  case SystemZAddressingMode::Disp20Only128:
+    return true;
+
+  case SystemZAddressingMode::Disp12Pair:
+    // Use the other instruction if the displacement is too large.
+    return isUInt<12>(Val);
+
+  case SystemZAddressingMode::Disp20Pair:
+    // Use the other instruction if the displacement is small enough.
+    return !isUInt<12>(Val);
+  }
+  llvm_unreachable("Unhandled displacement range");
+}
+
+// Return true if Base + Disp + Index should be performed by LA(Y).
+static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
+  // Don't use LA(Y) for constants.
+  if (!Base)
+    return false;
+
+  // Always use LA(Y) for frame addresses, since we know that the destination
+  // register is almost always (perhaps always) going to be different from
+  // the frame register.
+  if (Base->getOpcode() == ISD::FrameIndex)
+    return true;
+
+  if (Disp) {
+    // Always use LA(Y) if there is a base, displacement and index.
+    if (Index)
+      return true;
+
+    // Always use LA if the displacement is small enough.  It should always
+    // be no worse than AGHI (and better if it avoids a move).
+    if (isUInt<12>(Disp))
+      return true;
+
+    // For similar reasons, always use LAY if the constant is too big for AGHI.
+    // LAY should be no worse than AGFI.
+    if (!isInt<16>(Disp))
+      return true;
+  } else {
+    // Don't use LA for plain registers.
+    if (!Index)
+      return false;
+
+    // Don't use LA for plain addition if the index operand is only used
+    // once.  It should be a natural two-operand addition in that case.
+    if (Index->hasOneUse())
+      return false;
+
+    // Prefer addition if the second operation is sign-extended, in the
+    // hope of using AGF.
+    unsigned IndexOpcode = Index->getOpcode();
+    if (IndexOpcode == ISD::SIGN_EXTEND ||
+        IndexOpcode == ISD::SIGN_EXTEND_INREG)
+      return false;
+  }
+
+  // Don't use LA for two-operand addition if either operand is only
+  // used once.  The addition instructions are better in that case.
+  if (Base->hasOneUse())
+    return false;
+
+  return true;
+}
+
+// Return true if Addr is suitable for AM, updating AM if so.
+bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
+                                        SystemZAddressingMode &AM) const {
+  // Start out assuming that the address will need to be loaded separately,
+  // then try to extend it as much as we can.
+  AM.Base = Addr;
+
+  // First try treating the address as a constant.
+  if (Addr.getOpcode() == ISD::Constant &&
+      expandDisp(AM, true, SDValue(),
+                 cast<ConstantSDNode>(Addr)->getSExtValue()))
+    ;
+  // Also see if it's a bare ADJDYNALLOC.
+  else if (Addr.getOpcode() == SystemZISD::ADJDYNALLOC &&
+           expandAdjDynAlloc(AM, true, SDValue()))
+    ;
+  else
+    // Otherwise try expanding each component.
+    while (expandAddress(AM, true) ||
+           (AM.Index.getNode() && expandAddress(AM, false)))
+      continue;
+
+  // Reject cases where it isn't profitable to use LA(Y).
+  if (AM.Form == SystemZAddressingMode::FormBDXLA &&
+      !shouldUseLA(AM.Base.getNode(), AM.Disp, AM.Index.getNode()))
+    return false;
+
+  // Reject cases where the other instruction in a pair should be used.
+  if (!isValidDisp(AM.DR, AM.Disp))
+    return false;
+
+  // Make sure that ADJDYNALLOC is included where necessary.
+  if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
+    return false;
+
+  DEBUG(AM.dump());
+  return true;
+}
+
+// Insert a node into the DAG at least before Pos.  This will reposition
+// the node as needed, and will assign it a node ID that is <= Pos's ID.
+// Note that this does *not* preserve the uniqueness of node IDs!
+// The selection DAG must no longer depend on their uniqueness when this
+// function is used.
+static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos->getNodeId()) {
+    DAG->RepositionNode(Pos->getIterator(), N.getNode());
+    N.getNode()->setNodeId(Pos->getNodeId());
+  }
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+                                             EVT VT, SDValue &Base,
+                                             SDValue &Disp) const {
+  Base = AM.Base;
+  if (!Base.getNode())
+    // Register 0 means "no base".  This is mostly useful for shifts.
+    Base = CurDAG->getRegister(0, VT);
+  else if (Base.getOpcode() == ISD::FrameIndex) {
+    // Lower a FrameIndex to a TargetFrameIndex.
+    int64_t FrameIndex = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FrameIndex, VT);
+  } else if (Base.getValueType() != VT) {
+    // Truncate values from i64 to i32, for shifts.
+    assert(VT == MVT::i32 && Base.getValueType() == MVT::i64 &&
+           "Unexpected truncation");
+    SDLoc DL(Base);
+    SDValue Trunc = CurDAG->getNode(ISD::TRUNCATE, DL, VT, Base);
+    insertDAGNode(CurDAG, Base.getNode(), Trunc);
+    Base = Trunc;
+  }
+
+  // Lower the displacement to a TargetConstant.
+  Disp = CurDAG->getTargetConstant(AM.Disp, SDLoc(Base), VT);
+}
+
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
+                                             EVT VT, SDValue &Base,
+                                             SDValue &Disp,
+                                             SDValue &Index) const {
+  getAddressOperands(AM, VT, Base, Disp);
+
+  Index = AM.Index;
+  if (!Index.getNode())
+    // Register 0 means "no index".
+    Index = CurDAG->getRegister(0, VT);
+}
+
+bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
+                                       SDValue Addr, SDValue &Base,
+                                       SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
+  if (!selectAddress(Addr, AM))
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+  if (!selectAddress(Addr, AM) || AM.Index.getNode())
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
+                                        SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp, SDValue &Index) const {
+  SystemZAddressingMode AM(Form, DR);
+  if (!selectAddress(Addr, AM))
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp, Index);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::selectBDVAddr12Only(SDValue Addr, SDValue Elem,
+                                              SDValue &Base,
+                                              SDValue &Disp,
+                                              SDValue &Index) const {
+  SDValue Regs[2];
+  if (selectBDXAddr12Only(Addr, Regs[0], Disp, Regs[1]) &&
+      Regs[0].getNode() && Regs[1].getNode()) {
+    for (unsigned int I = 0; I < 2; ++I) {
+      Base = Regs[I];
+      Index = Regs[1 - I];
+      // We can't tell here whether the index vector has the right type
+      // for the access; the caller needs to do that instead.
+      if (Index.getOpcode() == ISD::ZERO_EXTEND)
+        Index = Index.getOperand(0);
+      if (Index.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          Index.getOperand(1) == Elem) {
+        Index = Index.getOperand(0);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
+                                               uint64_t InsertMask) const {
+  // We're only interested in cases where the insertion is into some operand
+  // of Op, rather than into Op itself.  The only useful case is an AND.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
+  // We need a constant mask.
+  auto *MaskNode = dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
+  if (!MaskNode)
+    return false;
+
+  // It's not an insertion of Op.getOperand(0) if the two masks overlap.
+  uint64_t AndMask = MaskNode->getZExtValue();
+  if (InsertMask & AndMask)
+    return false;
+
+  // It's only an insertion if all bits are covered or are known to be zero.
+  // The inner check covers all cases but is more expensive.
+  uint64_t Used = allOnes(Op.getValueSizeInBits());
+  if (Used != (AndMask | InsertMask)) {
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne);
+    if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
+      return false;
+  }
+
+  Op = Op.getOperand(0);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+                                          uint64_t Mask) const {
+  const SystemZInstrInfo *TII = getInstrInfo();
+  if (RxSBG.Rotate != 0)
+    Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
+  Mask &= RxSBG.Mask;
+  if (TII->isRxSBGMask(Mask, RxSBG.BitSize, RxSBG.Start, RxSBG.End)) {
+    RxSBG.Mask = Mask;
+    return true;
+  }
+  return false;
+}
+
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+  // Rotate the mask in the same way as RxSBG.Input is rotated.
+  if (RxSBG.Rotate != 0)
+    Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+  return (Mask & RxSBG.Mask) != 0;
+}
+
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
+  SDValue N = RxSBG.Input;
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
+  case ISD::TRUNCATE: {
+    if (RxSBG.Opcode == SystemZ::RNSBG)
+      return false;
+    uint64_t BitSize = N.getValueSizeInBits();
+    uint64_t Mask = allOnes(BitSize);
+    if (!refineRxSBGMask(RxSBG, Mask))
+      return false;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+  case ISD::AND: {
+    if (RxSBG.Opcode == SystemZ::RNSBG)
+      return false;
+
+    auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known zeros, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+      Mask |= KnownZero.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::OR: {
+    if (RxSBG.Opcode != SystemZ::RNSBG)
+      return false;
+
+    auto *MaskNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = ~MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known ones, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
+      Mask &= ~KnownOne.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::ROTL: {
+    // Any 64-bit rotate left can be merged into the RxSBG.
+    if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
+      return false;
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    RxSBG.Rotate = (RxSBG.Rotate + CountNode->getZExtValue()) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::ANY_EXTEND:
+    // Bits above the extended operand are don't-care.
+    RxSBG.Input = N.getOperand(0);
+    return true;
+
+  case ISD::ZERO_EXTEND:
+    if (RxSBG.Opcode != SystemZ::RNSBG) {
+      // Restrict the mask to the extended operand.
+      unsigned InnerBitSize = N.getOperand(0).getValueSizeInBits();
+      if (!refineRxSBGMask(RxSBG, allOnes(InnerBitSize)))
+        return false;
+
+      RxSBG.Input = N.getOperand(0);
+      return true;
+    }
+    LLVM_FALLTHROUGH;
+
+  case ISD::SIGN_EXTEND: {
+    // Check that the extension bits are don't-care (i.e. are masked out
+    // by the final mask).
+    unsigned InnerBitSize = N.getOperand(0).getValueSizeInBits();
+    if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+      return false;
+
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SHL: {
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG) {
+      // Treat (shl X, count) as (rotl X, size-count) as long as the bottom
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count)))
+        return false;
+    } else {
+      // Treat (shl X, count) as (and (rotl X, count), ~0<<count).
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate + Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SRL:
+  case ISD::SRA: {
+    auto *CountNode = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
+      // Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
+        return false;
+    } else {
+      // Treat (srl X, count), mask) as (and (rotl X, size-count), ~0>>count),
+      // which is similar to SLL above.
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate - Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+SDValue SystemZDAGToDAGISel::getUNDEF(const SDLoc &DL, EVT VT) const {
+  SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+  return SDValue(N, 0);
+}
+
+SDValue SystemZDAGToDAGISel::convertTo(const SDLoc &DL, EVT VT,
+                                       SDValue N) const {
+  if (N.getValueType() == MVT::i32 && VT == MVT::i64)
+    return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+                                         DL, VT, getUNDEF(DL, MVT::i64), N);
+  if (N.getValueType() == MVT::i64 && VT == MVT::i32)
+    return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
+  assert(N.getValueType() == VT && "Unexpected value types");
+  return N;
+}
+
+bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getSizeInBits() > 64)
+    return false;
+  RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
+  unsigned Count = 0;
+  while (expandRxSBG(RISBG))
+    // The widening or narrowing is expected to be free.
+    // Counting widening or narrowing as a saved operation will result in
+    // preferring an R*SBG over a simple shift/logical instruction.
+    if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND &&
+        RISBG.Input.getOpcode() != ISD::TRUNCATE)
+      Count += 1;
+  if (Count == 0)
+    return false;
+
+  // Prefer to use normal shift instructions over RISBG, since they can handle
+  // all cases and are sometimes shorter.
+  if (Count == 1 && N->getOpcode() != ISD::AND)
+    return false;
+
+  // Prefer register extensions like LLC over RISBG.  Also prefer to start
+  // out with normal ANDs if one instruction would be enough.  We can convert
+  // these ANDs into an RISBG later if a three-address instruction is useful.
+  if (RISBG.Rotate == 0) {
+    bool PreferAnd = false;
+    // Prefer AND for any 32-bit and-immediate operation.
+    if (VT == MVT::i32)
+      PreferAnd = true;
+    // As well as for any 64-bit operation that can be implemented via LLC(R),
+    // LLH(R), LLGT(R), or one of the and-immediate instructions.
+    else if (RISBG.Mask == 0xff ||
+             RISBG.Mask == 0xffff ||
+             RISBG.Mask == 0x7fffffff ||
+             SystemZ::isImmLF(~RISBG.Mask) ||
+             SystemZ::isImmHF(~RISBG.Mask))
+     PreferAnd = true;
+    // And likewise for the LLZRGF instruction, which doesn't have a register
+    // to register version.
+    else if (auto *Load = dyn_cast<LoadSDNode>(RISBG.Input)) {
+      if (Load->getMemoryVT() == MVT::i32 &&
+          (Load->getExtensionType() == ISD::EXTLOAD ||
+           Load->getExtensionType() == ISD::ZEXTLOAD) &&
+          RISBG.Mask == 0xffffff00 &&
+          Subtarget->hasLoadAndZeroRightmostByte())
+      PreferAnd = true;
+    }
+    if (PreferAnd) {
+      // Replace the current node with an AND.  Note that the current node
+      // might already be that same AND, in which case it is already CSE'd
+      // with it, and we must not call ReplaceNode.
+      SDValue In = convertTo(DL, VT, RISBG.Input);
+      SDValue Mask = CurDAG->getConstant(RISBG.Mask, DL, VT);
+      SDValue New = CurDAG->getNode(ISD::AND, DL, VT, In, Mask);
+      if (N != New.getNode()) {
+        insertDAGNode(CurDAG, N, Mask);
+        insertDAGNode(CurDAG, N, New);
+        ReplaceNode(N, New.getNode());
+        N = New.getNode();
+      }
+      // Now, select the machine opcode to implement this operation.
+      SelectCode(N);
+      return true;
+    }
+  }
+
+  unsigned Opcode = SystemZ::RISBG;
+  // Prefer RISBGN if available, since it does not clobber CC.
+  if (Subtarget->hasMiscellaneousExtensions())
+    Opcode = SystemZ::RISBGN;
+  EVT OpcodeVT = MVT::i64;
+  if (VT == MVT::i32 && Subtarget->hasHighWord()) {
+    Opcode = SystemZ::RISBMux;
+    OpcodeVT = MVT::i32;
+    RISBG.Start &= 31;
+    RISBG.End &= 31;
+  }
+  SDValue Ops[5] = {
+    getUNDEF(DL, OpcodeVT),
+    convertTo(DL, OpcodeVT, RISBG.Input),
+    CurDAG->getTargetConstant(RISBG.Start, DL, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.End | 128, DL, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.Rotate, DL, MVT::i32)
+  };
+  SDValue New = convertTo(
+      DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
+  ReplaceUses(N, New.getNode());
+  CurDAG->RemoveDeadNode(N);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getSizeInBits() > 64)
+    return false;
+  // Try treating each operand of N as the second operand of the RxSBG
+  // and see which goes deepest.
+  RxSBGOperands RxSBG[] = {
+    RxSBGOperands(Opcode, N->getOperand(0)),
+    RxSBGOperands(Opcode, N->getOperand(1))
+  };
+  unsigned Count[] = { 0, 0 };
+  for (unsigned I = 0; I < 2; ++I)
+    while (expandRxSBG(RxSBG[I]))
+      // The widening or narrowing is expected to be free.
+      // Counting widening or narrowing as a saved operation will result in
+      // preferring an R*SBG over a simple shift/logical instruction.
+      if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND &&
+          RxSBG[I].Input.getOpcode() != ISD::TRUNCATE)
+        Count[I] += 1;
+
+  // Do nothing if neither operand is suitable.
+  if (Count[0] == 0 && Count[1] == 0)
+    return false;
+
+  // Pick the deepest second operand.
+  unsigned I = Count[0] > Count[1] ? 0 : 1;
+  SDValue Op0 = N->getOperand(I ^ 1);
+
+  // Prefer IC for character insertions from memory.
+  if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
+    if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
+      if (Load->getMemoryVT() == MVT::i8)
+        return false;
+
+  // See whether we can avoid an AND in the first operand by converting
+  // ROSBG to RISBG.
+  if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask)) {
+    Opcode = SystemZ::RISBG;
+    // Prefer RISBGN if available, since it does not clobber CC.
+    if (Subtarget->hasMiscellaneousExtensions())
+      Opcode = SystemZ::RISBGN;
+  }
+
+  SDValue Ops[5] = {
+    convertTo(DL, MVT::i64, Op0),
+    convertTo(DL, MVT::i64, RxSBG[I].Input),
+    CurDAG->getTargetConstant(RxSBG[I].Start, DL, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].End, DL, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].Rotate, DL, MVT::i32)
+  };
+  SDValue New = convertTo(
+      DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, MVT::i64, Ops), 0));
+  ReplaceNode(N, New.getNode());
+  return true;
+}
+
+void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
+                                              SDValue Op0, uint64_t UpperVal,
+                                              uint64_t LowerVal) {
+  EVT VT = Node->getValueType(0);
+  SDLoc DL(Node);
+  SDValue Upper = CurDAG->getConstant(UpperVal, DL, VT);
+  if (Op0.getNode())
+    Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
+
+  {
+    // When we haven't passed in Op0, Upper will be a constant. In order to
+    // prevent folding back to the large immediate in `Or = getNode(...)` we run
+    // SelectCode first and end up with an opaque machine node. This means that
+    // we need to use a handle to keep track of Upper in case it gets CSE'd by
+    // SelectCode.
+    //
+    // Note that in the case where Op0 is passed in we could just call
+    // SelectCode(Upper) later, along with the SelectCode(Or), and avoid needing
+    // the handle at all, but it's fine to do it here.
+    //
+    // TODO: This is a pretty hacky way to do this. Can we do something that
+    // doesn't require a two paragraph explanation?
+    HandleSDNode Handle(Upper);
+    SelectCode(Upper.getNode());
+    Upper = Handle.getValue();
+  }
+
+  SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
+  SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
+
+  ReplaceUses(Node, Or.getNode());
+  CurDAG->RemoveDeadNode(Node);
+
+  SelectCode(Or.getNode());
+}
+
+bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
+  SDValue ElemV = N->getOperand(2);
+  auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+  if (!ElemN)
+    return false;
+
+  unsigned Elem = ElemN->getZExtValue();
+  EVT VT = N->getValueType(0);
+  if (Elem >= VT.getVectorNumElements())
+    return false;
+
+  auto *Load = dyn_cast<LoadSDNode>(N->getOperand(1));
+  if (!Load || !Load->hasOneUse())
+    return false;
+  if (Load->getMemoryVT().getSizeInBits() !=
+      Load->getValueType(0).getSizeInBits())
+    return false;
+
+  SDValue Base, Disp, Index;
+  if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) ||
+      Index.getValueType() != VT.changeVectorElementTypeToInteger())
+    return false;
+
+  SDLoc DL(Load);
+  SDValue Ops[] = {
+    N->getOperand(0), Base, Disp, Index,
+    CurDAG->getTargetConstant(Elem, DL, MVT::i32), Load->getChain()
+  };
+  SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops);
+  ReplaceUses(SDValue(Load, 1), SDValue(Res, 1));
+  ReplaceNode(N, Res);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) {
+  SDValue Value = Store->getValue();
+  if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return false;
+  if (Store->getMemoryVT().getSizeInBits() != Value.getValueSizeInBits())
+    return false;
+
+  SDValue ElemV = Value.getOperand(1);
+  auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
+  if (!ElemN)
+    return false;
+
+  SDValue Vec = Value.getOperand(0);
+  EVT VT = Vec.getValueType();
+  unsigned Elem = ElemN->getZExtValue();
+  if (Elem >= VT.getVectorNumElements())
+    return false;
+
+  SDValue Base, Disp, Index;
+  if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) ||
+      Index.getValueType() != VT.changeVectorElementTypeToInteger())
+    return false;
+
+  SDLoc DL(Store);
+  SDValue Ops[] = {
+    Vec, Base, Disp, Index, CurDAG->getTargetConstant(Elem, DL, MVT::i32),
+    Store->getChain()
+  };
+  ReplaceNode(Store, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+  return true;
+}
+
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+                                               LoadSDNode *Load) const {
+  // Check that the two memory operands have the same size.
+  if (Load->getMemoryVT() != Store->getMemoryVT())
+    return false;
+
+  // Volatility stops an access from being decomposed.
+  if (Load->isVolatile() || Store->isVolatile())
+    return false;
+
+  // There's no chance of overlap if the load is invariant.
+  if (Load->isInvariant() && Load->isDereferenceable())
+    return true;
+
+  // Otherwise we need to check whether there's an alias.
+  const Value *V1 = Load->getMemOperand()->getValue();
+  const Value *V2 = Store->getMemOperand()->getValue();
+  if (!V1 || !V2)
+    return false;
+
+  // Reject equality.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  int64_t End1 = Load->getSrcValueOffset() + Size;
+  int64_t End2 = Store->getSrcValueOffset() + Size;
+  if (V1 == V2 && End1 == End2)
+    return false;
+
+  return !AA->alias(MemoryLocation(V1, End1, Load->getAAInfo()),
+                    MemoryLocation(V2, End2, Store->getAAInfo()));
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+  auto *Store = cast<StoreSDNode>(N);
+  auto *Load = cast<LoadSDNode>(Store->getValue());
+
+  // Prefer not to use MVC if either address can use ... RELATIVE LONG
+  // instructions.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  if (Size > 1 && Size <= 8) {
+    // Prefer LHRL, LRL and LGRL.
+    if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+      return false;
+    // Prefer STHRL, STRL and STGRL.
+    if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+      return false;
+  }
+
+  return canUseBlockOperation(Store, Load);
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+                                                     unsigned I) const {
+  auto *StoreA = cast<StoreSDNode>(N);
+  auto *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+  auto *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+  return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+
+void SystemZDAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return;
+  }
+
+  unsigned Opcode = Node->getOpcode();
+  switch (Opcode) {
+  case ISD::OR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      if (tryRxSBG(Node, SystemZ::ROSBG))
+        return;
+    goto or_xor;
+
+  case ISD::XOR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      if (tryRxSBG(Node, SystemZ::RXSBG))
+        return;
+    // Fall through.
+  or_xor:
+    // If this is a 64-bit operation in which both 32-bit halves are nonzero,
+    // split the operation into two.
+    if (Node->getValueType(0) == MVT::i64)
+      if (auto *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+        uint64_t Val = Op1->getZExtValue();
+        if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val)) {
+          splitLargeImmediate(Opcode, Node, Node->getOperand(0),
+                              Val - uint32_t(Val), uint32_t(Val));
+          return;
+        }
+      }
+    break;
+
+  case ISD::AND:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      if (tryRxSBG(Node, SystemZ::RNSBG))
+        return;
+    LLVM_FALLTHROUGH;
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::ZERO_EXTEND:
+    if (tryRISBGZero(Node))
+      return;
+    break;
+
+  case ISD::Constant:
+    // If this is a 64-bit constant that is out of the range of LLILF,
+    // LLIHF and LGFI, split it into two 32-bit pieces.
+    if (Node->getValueType(0) == MVT::i64) {
+      uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue();
+      if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) {
+        splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val),
+                            uint32_t(Val));
+        return;
+      }
+    }
+    break;
+
+  case SystemZISD::SELECT_CCMASK: {
+    SDValue Op0 = Node->getOperand(0);
+    SDValue Op1 = Node->getOperand(1);
+    // Prefer to put any load first, so that it can be matched as a
+    // conditional load.  Likewise for constants in range for LOCHI.
+    if ((Op1.getOpcode() == ISD::LOAD && Op0.getOpcode() != ISD::LOAD) ||
+        (Subtarget->hasLoadStoreOnCond2() &&
+         Node->getValueType(0).isInteger() &&
+         Op1.getOpcode() == ISD::Constant &&
+         isInt<16>(cast<ConstantSDNode>(Op1)->getSExtValue()) &&
+         !(Op0.getOpcode() == ISD::Constant &&
+           isInt<16>(cast<ConstantSDNode>(Op0)->getSExtValue())))) {
+      SDValue CCValid = Node->getOperand(2);
+      SDValue CCMask = Node->getOperand(3);
+      uint64_t ConstCCValid =
+        cast<ConstantSDNode>(CCValid.getNode())->getZExtValue();
+      uint64_t ConstCCMask =
+        cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
+      // Invert the condition.
+      CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node),
+                                   CCMask.getValueType());
+      SDValue Op4 = Node->getOperand(4);
+      Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+    }
+    break;
+  }
+
+  case ISD::INSERT_VECTOR_ELT: {
+    EVT VT = Node->getValueType(0);
+    unsigned ElemBitSize = VT.getScalarSizeInBits();
+    if (ElemBitSize == 32) {
+      if (tryGather(Node, SystemZ::VGEF))
+        return;
+    } else if (ElemBitSize == 64) {
+      if (tryGather(Node, SystemZ::VGEG))
+        return;
+    }
+    break;
+  }
+
+  case ISD::STORE: {
+    auto *Store = cast<StoreSDNode>(Node);
+    unsigned ElemBitSize = Store->getValue().getValueSizeInBits();
+    if (ElemBitSize == 32) {
+      if (tryScatter(Store, SystemZ::VSCEF))
+        return;
+    } else if (ElemBitSize == 64) {
+      if (tryScatter(Store, SystemZ::VSCEG))
+        return;
+    }
+    break;
+  }
+  }
+
+  SelectCode(Node);
+}
+
+bool SystemZDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op,
+                             unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SystemZAddressingMode::AddrForm Form;
+  SystemZAddressingMode::DispRange DispRange;
+  SDValue Base, Disp, Index;
+
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_Q:
+    // Accept an address with a short displacement, but no index.
+    Form = SystemZAddressingMode::FormBD;
+    DispRange = SystemZAddressingMode::Disp12Only;
+    break;
+  case InlineAsm::Constraint_R:
+    // Accept an address with a short displacement and an index.
+    Form = SystemZAddressingMode::FormBDXNormal;
+    DispRange = SystemZAddressingMode::Disp12Only;
+    break;
+  case InlineAsm::Constraint_S:
+    // Accept an address with a long displacement, but no index.
+    Form = SystemZAddressingMode::FormBD;
+    DispRange = SystemZAddressingMode::Disp20Only;
+    break;
+  case InlineAsm::Constraint_T:
+  case InlineAsm::Constraint_m:
+    // Accept an address with a long displacement and an index.
+    // m works the same as T, as this is the most general case.
+    Form = SystemZAddressingMode::FormBDXNormal;
+    DispRange = SystemZAddressingMode::Disp20Only;
+    break;
+  }
+
+  if (selectBDXAddr(Form, DispRange, Op, Base, Disp, Index)) {
+    const TargetRegisterClass *TRC =
+      Subtarget->getRegisterInfo()->getPointerRegClass(*MF);
+    SDLoc DL(Base);
+    SDValue RC = CurDAG->getTargetConstant(TRC->getID(), DL, MVT::i32);
+
+    // Make sure that the base address doesn't go into %r0.
+    // If it's a TargetFrameIndex or a fixed register, we shouldn't do anything.
+    if (Base.getOpcode() != ISD::TargetFrameIndex &&
+        Base.getOpcode() != ISD::Register) {
+      Base =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                       DL, Base.getValueType(),
+                                       Base, RC), 0);
+    }
+
+    // Make sure that the index register isn't assigned to %r0 either.
+    if (Index.getOpcode() != ISD::Register) {
+      Index =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                       DL, Index.getValueType(),
+                                       Index, RC), 0);
+    }
+
+    OutOps.push_back(Base);
+    OutOps.push_back(Disp);
+    OutOps.push_back(Index);
+    return false;
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 000000000000..2081809def70
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,6294 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZISelLowering.h"
+#include "SystemZCallingConv.h"
+#include "SystemZConstantPoolValue.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/Intrinsics.h"
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-lower"
+
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+  IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+    : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+  int64_t XORValue;
+  int64_t AddValue;
+  unsigned Bit;
+};
+
+// Represents information about a comparison.
+struct Comparison {
+  Comparison(SDValue Op0In, SDValue Op1In)
+    : Op0(Op0In), Op1(Op1In), Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
+
+  // The operands to the comparison.
+  SDValue Op0, Op1;
+
+  // The opcode that should be used to compare Op0 and Op1.
+  unsigned Opcode;
+
+  // A SystemZICMP value.  Only used for integer comparisons.
+  unsigned ICmpType;
+
+  // The mask of CC values that Opcode can produce.
+  unsigned CCValid;
+
+  // The mask of CC values for which the original condition is true.
+  unsigned CCMask;
+};
+} // end anonymous namespace
+
+// Classify VT as either 32 or 64 bit.
+static bool is32Bit(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i32:
+    return true;
+  case MVT::i64:
+    return false;
+  default:
+    llvm_unreachable("Unsupported type");
+  }
+}
+
+// Return a version of MachineOperand that can be safely used before the
+// final use.
+static MachineOperand earlyUseOperand(MachineOperand Op) {
+  if (Op.isReg())
+    Op.setIsKill(false);
+  return Op;
+}
+
+SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
+                                             const SystemZSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
+  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+  // Set up the register classes.
+  if (Subtarget.hasHighWord())
+    addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+  else
+    addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
+  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+  } else {
+    addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+    addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+  }
+  addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+
+  if (Subtarget.hasVector()) {
+    addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
+    addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
+  }
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(Subtarget.getRegisterInfo());
+
+  // Set up special registers.
+  setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+
+  // TODO: It may be better to default to latency-oriented scheduling, however
+  // LLVM's current latency-oriented scheduler can't handle physreg definitions
+  // such as SystemZ has with CC, so set this to the register-pressure
+  // scheduler, because it can.
+  setSchedulingPreference(Sched::RegPressure);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Instructions are strings of 2-byte aligned 2-byte values.
+  setMinFunctionAlignment(2);
+
+  // Handle operations that are handled in a similar way for all types.
+  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+       I <= MVT::LAST_FP_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // Lower SET_CC into an IPM-based sequence.
+      setOperationAction(ISD::SETCC, VT, Custom);
+
+      // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
+      setOperationAction(ISD::SELECT, VT, Expand);
+
+      // Lower SELECT_CC and BR_CC into separate comparisons and branches.
+      setOperationAction(ISD::SELECT_CC, VT, Custom);
+      setOperationAction(ISD::BR_CC,     VT, Custom);
+    }
+  }
+
+  // Expand jump table branches as address arithmetic followed by an
+  // indirect jump.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+
+  // Expand BRCOND into a BR_CC (see above).
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  // Handle integer types.
+  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
+       I <= MVT::LAST_INTEGER_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // Expand individual DIV and REMs into DIVREMs.
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Custom);
+      setOperationAction(ISD::UDIVREM, VT, Custom);
+
+      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
+      // stores, putting a serialization instruction after the stores.
+      setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
+      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+
+      // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
+      // available, or if the operand is constant.
+      setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+
+      // Use POPCNT on z196 and above.
+      if (Subtarget.hasPopulationCount())
+        setOperationAction(ISD::CTPOP, VT, Custom);
+      else
+        setOperationAction(ISD::CTPOP, VT, Expand);
+
+      // No special instructions for these.
+      setOperationAction(ISD::CTTZ,            VT, Expand);
+      setOperationAction(ISD::ROTR,            VT, Expand);
+
+      // Use *MUL_LOHI where possible instead of MULH*.
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
+
+      // Only z196 and above have native support for conversions to unsigned.
+      if (!Subtarget.hasFPExtension())
+        setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    }
+  }
+
+  // Type legalization will convert 8- and 16-bit atomic operations into
+  // forms that operate on i32s (but still keeping the original memory VT).
+  // Lower them into full i32 operations.
+  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Custom);
+
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+  // Traps are legal, as we will convert them to "j .+2".
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // z10 has instructions for signed but not unsigned FP conversion.
+  // Handle unsigned 32-bit types as signed 64-bit types.
+  if (!Subtarget.hasFPExtension()) {
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+  }
+
+  // We have native support for a 64-bit CTLZ, via FLOGR.
+  setOperationAction(ISD::CTLZ, MVT::i32, Promote);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+
+  // Give LowerOperation the chance to replace 64-bit ORs with subregs.
+  setOperationAction(ISD::OR, MVT::i64, Custom);
+
+  // FIXME: Can we support these natively?
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+
+  // We have native instructions for i8, i16 and i32 extensions, but not i1.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1, Promote);
+  }
+
+  // Handle the various types of symbolic address.
+  setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
+  setOperationAction(ISD::GlobalAddress,    PtrVT, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+  setOperationAction(ISD::BlockAddress,     PtrVT, Custom);
+  setOperationAction(ISD::JumpTable,        PtrVT, Custom);
+
+  // We need to handle dynamic allocations specially because of the
+  // 160-byte area at the bottom of the stack.
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
+
+  // Use custom expanders so that we can force the function to use
+  // a frame pointer.
+  setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
+
+  // Handle prefetches with PFD or PFDRL.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+  for (MVT VT : MVT::vector_valuetypes()) {
+    // Assume by default that all vector operations need to be expanded.
+    for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
+      if (getOperationAction(Opcode, VT) == Legal)
+        setOperationAction(Opcode, VT, Expand);
+
+    // Likewise all truncating stores and extending loads.
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(VT, InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+    }
+
+    if (isTypeLegal(VT)) {
+      // These operations are legal for anything that can be stored in a
+      // vector register, even if there is no native support for the format
+      // as such.  In particular, we can do these for v4f32 even though there
+      // are no specific instructions for that format.
+      setOperationAction(ISD::LOAD, VT, Legal);
+      setOperationAction(ISD::STORE, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::BITCAST, VT, Legal);
+      setOperationAction(ISD::UNDEF, VT, Legal);
+
+      // Likewise, except that we need to replace the nodes with something
+      // more specific.
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+    }
+  }
+
+  // Handle integer vector types.
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    if (isTypeLegal(VT)) {
+      // These operations have direct equivalents.
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
+      setOperationAction(ISD::ADD, VT, Legal);
+      setOperationAction(ISD::SUB, VT, Legal);
+      if (VT != MVT::v2i64)
+        setOperationAction(ISD::MUL, VT, Legal);
+      setOperationAction(ISD::AND, VT, Legal);
+      setOperationAction(ISD::OR, VT, Legal);
+      setOperationAction(ISD::XOR, VT, Legal);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::CTTZ, VT, Legal);
+      setOperationAction(ISD::CTLZ, VT, Legal);
+
+      // Convert a GPR scalar to a vector by inserting it into element 0.
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+
+      // Use a series of unpacks for extensions.
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+
+      // Detect shifts by a scalar amount and convert them into
+      // V*_BY_SCALAR.
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::SRL, VT, Custom);
+
+      // At present ROTL isn't matched by DAGCombiner.  ROTR should be
+      // converted into ROTL.
+      setOperationAction(ISD::ROTL, VT, Expand);
+      setOperationAction(ISD::ROTR, VT, Expand);
+
+      // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
+      // and inverting the result as necessary.
+      setOperationAction(ISD::SETCC, VT, Custom);
+    }
+  }
+
+  if (Subtarget.hasVector()) {
+    // There should be no need to check for float types other than v2f64
+    // since <2 x f32> isn't a legal type.
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+  }
+
+  // Handle floating-point types.
+  for (unsigned I = MVT::FIRST_FP_VALUETYPE;
+       I <= MVT::LAST_FP_VALUETYPE;
+       ++I) {
+    MVT VT = MVT::SimpleValueType(I);
+    if (isTypeLegal(VT)) {
+      // We can use FI for FRINT.
+      setOperationAction(ISD::FRINT, VT, Legal);
+
+      // We can use the extended form of FI for other rounding operations.
+      if (Subtarget.hasFPExtension()) {
+        setOperationAction(ISD::FNEARBYINT, VT, Legal);
+        setOperationAction(ISD::FFLOOR, VT, Legal);
+        setOperationAction(ISD::FCEIL, VT, Legal);
+        setOperationAction(ISD::FTRUNC, VT, Legal);
+        setOperationAction(ISD::FROUND, VT, Legal);
+      }
+
+      // No special instructions for these.
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FSINCOS, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+    }
+  }
+
+  // Handle floating-point vector types.
+  if (Subtarget.hasVector()) {
+    // Scalar-to-vector conversion is just a subreg.
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
+
+    // Some insertions and extractions can be done directly but others
+    // need to go via integers.
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+    // These operations have direct equivalents.
+    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMA, MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+    setOperationAction(ISD::FABS, MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+  }
+
+  // We have fused multiply-addition for f32 and f64 but not f128.
+  setOperationAction(ISD::FMA, MVT::f32,  Legal);
+  setOperationAction(ISD::FMA, MVT::f64,  Legal);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+  // Needed so that we don't try to implement f128 constant loads using
+  // a load-and-extend of a f80 constant (in cases where the constant
+  // would fit in an f80).
+  for (MVT VT : MVT::fp_valuetypes())
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+
+  // Floating-point truncation and stores need to be done separately.
+  setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+
+  // We have 64-bit FPR<->GPR moves, but need special handling for
+  // 32-bit forms.
+  if (!Subtarget.hasVector()) {
+    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+  }
+
+  // VASTART and VACOPY need to deal with the SystemZ-specific varargs
+  // structure, but VAEND is a no-op.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+
+  // Codes for which we want to perform some z-specific combinations.
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::ROTL);
+
+  // Handle intrinsics.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // We want to use MVC in preference to even a single load/store pair.
+  MaxStoresPerMemcpy = 0;
+  MaxStoresPerMemcpyOptSize = 0;
+
+  // The main memset sequence is a byte store followed by an MVC.
+  // Two STC or MV..I stores win over that, but the kind of fused stores
+  // generated by target-independent code don't when the byte value is
+  // variable.  E.g.  "STC <reg>;MHI <reg>,257;STH <reg>" is not better
+  // than "STC;MVC".  Handle the choice in target-specific code instead.
+  MaxStoresPerMemset = 0;
+  MaxStoresPerMemsetOptSize = 0;
+}
+
+EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
+                                              LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
+  return Imm.isZero() || Imm.isNegZero();
+}
+
+bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // We can use CGFI or CLGFI.
+  return isInt<32>(Imm) || isUInt<32>(Imm);
+}
+
+bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // We can use ALGFI or SLGFI.
+  return isUInt<32>(Imm) || isUInt<32>(-Imm);
+}
+
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned,
+                                                           unsigned,
+                                                           bool *Fast) const {
+  // Unaligned accesses should never be slower than the expanded version.
+  // We check specifically for aligned accesses in the few cases where
+  // they are required.
+  if (Fast)
+    *Fast = true;
+  return true;
+}
+
+bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                  const AddrMode &AM, Type *Ty,
+                                                  unsigned AS) const {
+  // Punt on globals for now, although they can be used in limited
+  // RELATIVE LONG cases.
+  if (AM.BaseGV)
+    return false;
+
+  // Require a 20-bit signed offset.
+  if (!isInt<20>(AM.BaseOffs))
+    return false;
+
+  // Indexing is OK but no scale factor can be applied.
+  return AM.Scale == 0 || AM.Scale == 1;
+}
+
+bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
+                                                      int64_t Offset) const {
+  // This only applies to z13.
+  if (!Subtarget.hasVector())
+    return true;
+
+  // * Use LDE instead of LE/LEY to avoid partial register
+  //   dependencies (LDE only supports small offsets).
+  // * Utilize the vector registers to hold floating point
+  //   values (vector load / store instructions only support small
+  //   offsets).
+
+  assert (isa<LoadInst>(I) || isa<StoreInst>(I));
+  Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
+                       I->getOperand(0)->getType());
+  if (!isUInt<12>(Offset) &&
+      (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy()))
+    return false;
+
+  return true;
+}
+
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
+    return false;
+  unsigned FromBits = FromType->getPrimitiveSizeInBits();
+  unsigned ToBits = ToType->getPrimitiveSizeInBits();
+  return FromBits > ToBits;
+}
+
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+  if (!FromVT.isInteger() || !ToVT.isInteger())
+    return false;
+  unsigned FromBits = FromVT.getSizeInBits();
+  unsigned ToBits = ToVT.getSizeInBits();
+  return FromBits > ToBits;
+}
+
+//===----------------------------------------------------------------------===//
+// Inline asm support
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'a': // Address register
+    case 'd': // Data register (equivalent to 'r')
+    case 'f': // Floating-point register
+    case 'h': // High-part register
+    case 'r': // General-purpose register
+      return C_RegisterClass;
+
+    case 'Q': // Memory with base and unsigned 12-bit displacement
+    case 'R': // Likewise, plus an index
+    case 'S': // Memory with base and signed 20-bit displacement
+    case 'T': // Likewise, plus an index
+    case 'm': // Equivalent to 'T'.
+      return C_Memory;
+
+    case 'I': // Unsigned 8-bit constant
+    case 'J': // Unsigned 12-bit constant
+    case 'K': // Signed 16-bit constant
+    case 'L': // Signed 20-bit displacement (on all targets we support)
+    case 'M': // 0x7fffffff
+      return C_Other;
+
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight SystemZTargetLowering::
+getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                               const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+
+  case 'a': // Address register
+  case 'd': // Data register (equivalent to 'r')
+  case 'h': // High-part register
+  case 'r': // General-purpose register
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_Register;
+    break;
+
+  case 'f': // Floating-point register
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+
+  case 'I': // Unsigned 8-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isUInt<8>(C->getZExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'J': // Unsigned 12-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isUInt<12>(C->getZExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'K': // Signed 16-bit constant
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isInt<16>(C->getSExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'L': // Signed 20-bit displacement (on all targets we support)
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (isInt<20>(C->getSExtValue()))
+        weight = CW_Constant;
+    break;
+
+  case 'M': // 0x7fffffff
+    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
+      if (C->getZExtValue() == 0x7fffffff)
+        weight = CW_Constant;
+    break;
+  }
+  return weight;
+}
+
+// Parse a "{tNNN}" register constraint for which the register type "t"
+// has already been verified.  MC is the class associated with "t" and
+// Map maps 0-based register numbers to LLVM register numbers.
+static std::pair<unsigned, const TargetRegisterClass *>
+parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
+                    const unsigned *Map) {
+  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
+  if (isdigit(Constraint[2])) {
+    unsigned Index;
+    bool Failed =
+        Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
+    if (!Failed && Index < 16 && Map[Index])
+      return std::make_pair(Map[Index], RC);
+  }
+  return std::make_pair(0U, nullptr);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+SystemZTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'd': // Data register (equivalent to 'r')
+    case 'r': // General-purpose register
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &SystemZ::GR64BitRegClass);
+      else if (VT == MVT::i128)
+        return std::make_pair(0U, &SystemZ::GR128BitRegClass);
+      return std::make_pair(0U, &SystemZ::GR32BitRegClass);
+
+    case 'a': // Address register
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
+      else if (VT == MVT::i128)
+        return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
+      return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
+
+    case 'h': // High-part register (an LLVM extension)
+      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
+
+    case 'f': // Floating-point register
+      if (VT == MVT::f64)
+        return std::make_pair(0U, &SystemZ::FP64BitRegClass);
+      else if (VT == MVT::f128)
+        return std::make_pair(0U, &SystemZ::FP128BitRegClass);
+      return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+    }
+  }
+  if (Constraint.size() > 0 && Constraint[0] == '{') {
+    // We need to override the default register parsing for GPRs and FPRs
+    // because the interpretation depends on VT.  The internal names of
+    // the registers are also different from the external names
+    // (F0D and F0S instead of F0, etc.).
+    if (Constraint[1] == 'r') {
+      if (VT == MVT::i32)
+        return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
+                                   SystemZMC::GR32Regs);
+      if (VT == MVT::i128)
+        return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
+                                   SystemZMC::GR128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
+                                 SystemZMC::GR64Regs);
+    }
+    if (Constraint[1] == 'f') {
+      if (VT == MVT::f32)
+        return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
+                                   SystemZMC::FP32Regs);
+      if (VT == MVT::f128)
+        return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
+                                   SystemZMC::FP128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
+                                 SystemZMC::FP64Regs);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+void SystemZTargetLowering::
+LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                             std::vector<SDValue> &Ops,
+                             SelectionDAG &DAG) const {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() == 1) {
+    switch (Constraint[0]) {
+    case 'I': // Unsigned 8-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isUInt<8>(C->getZExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                              Op.getValueType()));
+      return;
+
+    case 'J': // Unsigned 12-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isUInt<12>(C->getZExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                              Op.getValueType()));
+      return;
+
+    case 'K': // Signed 16-bit constant
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isInt<16>(C->getSExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+                                              Op.getValueType()));
+      return;
+
+    case 'L': // Signed 20-bit displacement (on all targets we support)
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (isInt<20>(C->getSExtValue()))
+          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+                                              Op.getValueType()));
+      return;
+
+    case 'M': // 0x7fffffff
+      if (auto *C = dyn_cast<ConstantSDNode>(Op))
+        if (C->getZExtValue() == 0x7fffffff)
+          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                              Op.getValueType()));
+      return;
+    }
+  }
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling conventions
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+                                                     Type *ToType) const {
+  return isTruncateFree(FromType, ToType);
+}
+
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  return CI->isTailCall();
+}
+
+// We do not yet support 128-bit single-element vector types.  If the user
+// attempts to use such types as function argument or return type, prefer
+// to error out instead of emitting code violating the ABI.
+static void VerifyVectorType(MVT VT, EVT ArgVT) {
+  if (ArgVT.isVector() && !VT.isVector())
+    report_fatal_error("Unsupported vector argument or return type");
+}
+
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  for (unsigned i = 0; i < Ins.size(); ++i)
+    VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
+}
+
+static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (unsigned i = 0; i < Outs.size(); ++i)
+    VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
+}
+
+// Value is a value that has been passed to us in the location described by VA
+// (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
+// any loads onto Chain.
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
+                                   CCValAssign &VA, SDValue Chain,
+                                   SDValue Value) {
+  // If the argument has been promoted from a smaller type, insert an
+  // assertion to capture this.
+  if (VA.getLocInfo() == CCValAssign::SExt)
+    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+  else if (VA.getLocInfo() == CCValAssign::ZExt)
+    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
+                        DAG.getValueType(VA.getValVT()));
+
+  if (VA.isExtInLoc())
+    Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
+  else if (VA.getLocInfo() == CCValAssign::BCvt) {
+    // If this is a short vector argument loaded from the stack,
+    // extend from i64 to full vector size and then bitcast.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
+    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
+  } else
+    assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
+  return Value;
+}
+
+// Value is a value of type VA.getValVT() that we need to copy into
+// the location described by VA.  Return a copy of Value converted to
+// VA.getValVT().  The caller is responsible for handling indirect values.
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
+                                   CCValAssign &VA, SDValue Value) {
+  switch (VA.getLocInfo()) {
+  case CCValAssign::SExt:
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::ZExt:
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::AExt:
+    return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
+  case CCValAssign::BCvt:
+    // If this is a short vector argument to be stored to the stack,
+    // bitcast to v2i64 and then extract first element.
+    assert(VA.getLocVT() == MVT::i64);
+    assert(VA.getValVT().isVector());
+    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
+                       DAG.getConstant(0, DL, MVT::i32));
+  case CCValAssign::Full:
+    return Value;
+  default:
+    llvm_unreachable("Unhandled getLocInfo()");
+  }
+}
+
+SDValue SystemZTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  SystemZMachineFunctionInfo *FuncInfo =
+      MF.getInfo<SystemZMachineFunctionInfo>();
+  auto *TFL =
+      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Detect unsupported vector argument types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Ins);
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+  unsigned NumFixedGPRs = 0;
+  unsigned NumFixedFPRs = 0;
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    SDValue ArgValue;
+    CCValAssign &VA = ArgLocs[I];
+    EVT LocVT = VA.getLocVT();
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      const TargetRegisterClass *RC;
+      switch (LocVT.getSimpleVT().SimpleTy) {
+      default:
+        // Integers smaller than i64 should be promoted to i64.
+        llvm_unreachable("Unexpected argument type");
+      case MVT::i32:
+        NumFixedGPRs += 1;
+        RC = &SystemZ::GR32BitRegClass;
+        break;
+      case MVT::i64:
+        NumFixedGPRs += 1;
+        RC = &SystemZ::GR64BitRegClass;
+        break;
+      case MVT::f32:
+        NumFixedFPRs += 1;
+        RC = &SystemZ::FP32BitRegClass;
+        break;
+      case MVT::f64:
+        NumFixedFPRs += 1;
+        RC = &SystemZ::FP64BitRegClass;
+        break;
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        RC = &SystemZ::VR128BitRegClass;
+        break;
+      }
+
+      unsigned VReg = MRI.createVirtualRegister(RC);
+      MRI.addLiveIn(VA.getLocReg(), VReg);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
+    } else {
+      assert(VA.isMemLoc() && "Argument not register or memory");
+
+      // Create the frame index object for this incoming parameter.
+      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
+                                     VA.getLocMemOffset(), true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.  Unpromoted ints and floats are
+      // passed as right-justified 8-byte values.
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
+                          DAG.getIntPtrConstant(4, DL));
+      ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
+                             MachinePointerInfo::getFixedStack(MF, FI));
+    }
+
+    // Convert the value of the argument register into the value that's
+    // being passed.
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
+                                   MachinePointerInfo()));
+      // If the original argument was split (e.g. i128), we need
+      // to load all parts of it here (using the same address).
+      unsigned ArgIndex = Ins[I].OrigArgIndex;
+      assert (Ins[I].PartOffset == 0);
+      while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
+        CCValAssign &PartVA = ArgLocs[I + 1];
+        unsigned PartOffset = Ins[I + 1].PartOffset;
+        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
+                                      DAG.getIntPtrConstant(PartOffset, DL));
+        InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
+                                     MachinePointerInfo()));
+        ++I;
+      }
+    } else
+      InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
+  }
+
+  if (IsVarArg) {
+    // Save the number of non-varargs registers for later use by va_start, etc.
+    FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
+    FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
+
+    // Likewise the address (in the form of a frame index) of where the
+    // first stack vararg would be.  The 1-byte size here is arbitrary.
+    int64_t StackSize = CCInfo.getNextStackOffset();
+    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
+
+    // ...and a similar frame index for the caller-allocated save area
+    // that will be used to store the incoming registers.
+    int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
+    unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
+    FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
+
+    // Store the FPR varargs in the reserved frame slots.  (We store the
+    // GPRs as part of the prologue.)
+    if (NumFixedFPRs < SystemZ::NumArgFPRs) {
+      SDValue MemOps[SystemZ::NumArgFPRs];
+      for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
+        unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
+        int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+        unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
+                                     &SystemZ::FP64BitRegClass);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
+        MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
+                                 MachinePointerInfo::getFixedStack(MF, FI));
+      }
+      // Join the stores, which are independent of one another.
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                          makeArrayRef(&MemOps[NumFixedFPRs],
+                                       SystemZ::NumArgFPRs-NumFixedFPRs));
+    }
+  }
+
+  return Chain;
+}
+
+static bool canUseSiblingCall(const CCState &ArgCCInfo,
+                              SmallVectorImpl<CCValAssign> &ArgLocs,
+                              SmallVectorImpl<ISD::OutputArg> &Outs) {
+  // Punt if there are any indirect or stack arguments, or if the call
+  // needs the callee-saved argument register R6, or if the call uses
+  // the callee-saved register arguments SwiftSelf and SwiftError.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      return false;
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = VA.getLocReg();
+    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
+      return false;
+    if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
+      return false;
+  }
+  return true;
+}
+
+SDValue
+SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT PtrVT = getPointerTy(MF.getDataLayout());
+
+  // Detect unsupported vector argument and return types.
+  if (Subtarget.hasVector()) {
+    VerifyVectorTypes(Outs);
+    VerifyVectorTypes(Ins);
+  }
+
+  // Analyze the operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+  ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+  // We don't support GuaranteedTailCallOpt, only automatically-detected
+  // sibling calls.
+  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
+    IsTailCall = false;
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+  // Mark the start of the call.
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getConstant(NumBytes, DL, PtrVT, true),
+                                 DL);
+
+  // Copy argument values to their designated locations.
+  SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    SDValue ArgValue = OutVals[I];
+
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      // Store the argument in a stack slot and pass its address.
+      SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      MemOpChains.push_back(
+          DAG.getStore(Chain, DL, ArgValue, SpillSlot,
+                       MachinePointerInfo::getFixedStack(MF, FI)));
+      // If the original argument was split (e.g. i128), we need
+      // to store all parts of it here (and pass just one address).
+      unsigned ArgIndex = Outs[I].OrigArgIndex;
+      assert (Outs[I].PartOffset == 0);
+      while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
+        SDValue PartValue = OutVals[I + 1];
+        unsigned PartOffset = Outs[I + 1].PartOffset;
+        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
+                                      DAG.getIntPtrConstant(PartOffset, DL));
+        MemOpChains.push_back(
+            DAG.getStore(Chain, DL, PartValue, Address,
+                         MachinePointerInfo::getFixedStack(MF, FI)));
+        ++I;
+      }
+      ArgValue = SpillSlot;
+    } else
+      ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
+
+    if (VA.isRegLoc())
+      // Queue up the argument copies and emit them at the end.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+    else {
+      assert(VA.isMemLoc() && "Argument not register or memory");
+
+      // Work out the address of the stack slot.  Unpromoted ints and
+      // floats are passed as right-justified 8-byte values.
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
+      unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
+      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
+        Offset += 4;
+      SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
+                                    DAG.getIntPtrConstant(Offset, DL));
+
+      // Emit the store.
+      MemOpChains.push_back(
+          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+    }
+  }
+
+  // Join the stores, which are independent of one another.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+  // Accept direct calls by converting symbolic call addresses to the
+  // associated Target* opcodes.  Force %r1 to be used for indirect
+  // tail calls.
+  SDValue Glue;
+  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
+    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
+    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (IsTailCall) {
+    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+    Glue = Chain.getValue(1);
+    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+  }
+
+  // Build a sequence of copy-to-reg nodes, chained and glued together.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+                             RegsToPass[I].second, Glue);
+    Glue = Chain.getValue(1);
+  }
+
+  // The first call operand is the chain and the second is the target address.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+    Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+                                  RegsToPass[I].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies, if any.
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (IsTailCall)
+    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
+  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Mark the end of the call, which is glued to the call itself.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, DL, PtrVT, true),
+                             DAG.getConstant(0, DL, PtrVT, true),
+                             Glue, DL);
+  Glue = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
+  RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+    CCValAssign &VA = RetLocs[I];
+
+    // Copy the value out, gluing the copy to the end of the call sequence.
+    SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
+                                          VA.getLocVT(), Glue);
+    Chain = RetValue.getValue(1);
+    Glue = RetValue.getValue(2);
+
+    // Convert the value of the return register into the value that's
+    // being returned.
+    InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
+  }
+
+  return Chain;
+}
+
+bool SystemZTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv,
+               MachineFunction &MF, bool isVarArg,
+               const SmallVectorImpl<ISD::OutputArg> &Outs,
+               LLVMContext &Context) const {
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
+  // Special case that we cannot easily detect in RetCC_SystemZ since
+  // i128 is not a legal type.
+  for (auto &Out : Outs)
+    if (Out.ArgVT == MVT::i128)
+      return false;
+
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
+  return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
+}
+
+SDValue
+SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool IsVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   const SDLoc &DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Detect unsupported vector return types.
+  if (Subtarget.hasVector())
+    VerifyVectorTypes(Outs);
+
+  // Assign locations to each returned value.
+  SmallVector<CCValAssign, 16> RetLocs;
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
+  RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+  // Quick exit for void returns
+  if (RetLocs.empty())
+    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
+
+  // Copy the result values into the output registers.
+  SDValue Glue;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain);
+  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
+    CCValAssign &VA = RetLocs[I];
+    SDValue RetValue = OutVals[I];
+
+    // Make the return register live on exit.
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // Promote the value as required.
+    RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
+
+    // Chain and glue the copies together.
+    unsigned Reg = VA.getLocReg();
+    Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Glue.getNode())
+    RetOps.push_back(Glue);
+
+  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad(
+    SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const {
+  return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
+}
+
+// Return true if Op is an intrinsic node with chain that returns the CC value
+// as its only (other) argument.  Provide the associated SystemZISD opcode and
+// the mask of valid CC values if so.
+static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
+                                      unsigned &CCValid) {
+  unsigned Id = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  switch (Id) {
+  case Intrinsic::s390_tbegin:
+    Opcode = SystemZISD::TBEGIN;
+    CCValid = SystemZ::CCMASK_TBEGIN;
+    return true;
+
+  case Intrinsic::s390_tbegin_nofloat:
+    Opcode = SystemZISD::TBEGIN_NOFLOAT;
+    CCValid = SystemZ::CCMASK_TBEGIN;
+    return true;
+
+  case Intrinsic::s390_tend:
+    Opcode = SystemZISD::TEND;
+    CCValid = SystemZ::CCMASK_TEND;
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+// Return true if Op is an intrinsic node without chain that returns the
+// CC value as its final argument.  Provide the associated SystemZISD
+// opcode and the mask of valid CC values if so.
+static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
+  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (Id) {
+  case Intrinsic::s390_vpkshs:
+  case Intrinsic::s390_vpksfs:
+  case Intrinsic::s390_vpksgs:
+    Opcode = SystemZISD::PACKS_CC;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vpklshs:
+  case Intrinsic::s390_vpklsfs:
+  case Intrinsic::s390_vpklsgs:
+    Opcode = SystemZISD::PACKLS_CC;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vceqbs:
+  case Intrinsic::s390_vceqhs:
+  case Intrinsic::s390_vceqfs:
+  case Intrinsic::s390_vceqgs:
+    Opcode = SystemZISD::VICMPES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vchbs:
+  case Intrinsic::s390_vchhs:
+  case Intrinsic::s390_vchfs:
+  case Intrinsic::s390_vchgs:
+    Opcode = SystemZISD::VICMPHS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vchlbs:
+  case Intrinsic::s390_vchlhs:
+  case Intrinsic::s390_vchlfs:
+  case Intrinsic::s390_vchlgs:
+    Opcode = SystemZISD::VICMPHLS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vtm:
+    Opcode = SystemZISD::VTM;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfaebs:
+  case Intrinsic::s390_vfaehs:
+  case Intrinsic::s390_vfaefs:
+    Opcode = SystemZISD::VFAE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfaezbs:
+  case Intrinsic::s390_vfaezhs:
+  case Intrinsic::s390_vfaezfs:
+    Opcode = SystemZISD::VFAEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfeebs:
+  case Intrinsic::s390_vfeehs:
+  case Intrinsic::s390_vfeefs:
+    Opcode = SystemZISD::VFEE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfeezbs:
+  case Intrinsic::s390_vfeezhs:
+  case Intrinsic::s390_vfeezfs:
+    Opcode = SystemZISD::VFEEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfenebs:
+  case Intrinsic::s390_vfenehs:
+  case Intrinsic::s390_vfenefs:
+    Opcode = SystemZISD::VFENE_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfenezbs:
+  case Intrinsic::s390_vfenezhs:
+  case Intrinsic::s390_vfenezfs:
+    Opcode = SystemZISD::VFENEZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vistrbs:
+  case Intrinsic::s390_vistrhs:
+  case Intrinsic::s390_vistrfs:
+    Opcode = SystemZISD::VISTR_CC;
+    CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3;
+    return true;
+
+  case Intrinsic::s390_vstrcbs:
+  case Intrinsic::s390_vstrchs:
+  case Intrinsic::s390_vstrcfs:
+    Opcode = SystemZISD::VSTRC_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vstrczbs:
+  case Intrinsic::s390_vstrczhs:
+  case Intrinsic::s390_vstrczfs:
+    Opcode = SystemZISD::VSTRCZ_CC;
+    CCValid = SystemZ::CCMASK_ANY;
+    return true;
+
+  case Intrinsic::s390_vfcedbs:
+    Opcode = SystemZISD::VFCMPES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfchdbs:
+    Opcode = SystemZISD::VFCMPHS;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vfchedbs:
+    Opcode = SystemZISD::VFCMPHES;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_vftcidb:
+    Opcode = SystemZISD::VFTCI;
+    CCValid = SystemZ::CCMASK_VCMP;
+    return true;
+
+  case Intrinsic::s390_tdc:
+    Opcode = SystemZISD::TDC;
+    CCValid = SystemZ::CCMASK_TDC;
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+// Emit an intrinsic with chain with a glued value instead of its CC result.
+static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op,
+                                             unsigned Opcode) {
+  // Copy all operands except the intrinsic ID.
+  unsigned NumOps = Op.getNumOperands();
+  SmallVector<SDValue, 6> Ops;
+  Ops.reserve(NumOps - 1);
+  Ops.push_back(Op.getOperand(0));
+  for (unsigned I = 2; I < NumOps; ++I)
+    Ops.push_back(Op.getOperand(I));
+
+  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
+  SDVTList RawVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
+  SDValue OldChain = SDValue(Op.getNode(), 1);
+  SDValue NewChain = SDValue(Intr.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
+  return Intr;
+}
+
+// Emit an intrinsic with a glued value instead of its CC result.
+static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op,
+                                     unsigned Opcode) {
+  // Copy all operands except the intrinsic ID.
+  unsigned NumOps = Op.getNumOperands();
+  SmallVector<SDValue, 6> Ops;
+  Ops.reserve(NumOps - 1);
+  for (unsigned I = 1; I < NumOps; ++I)
+    Ops.push_back(Op.getOperand(I));
+
+  if (Op->getNumValues() == 1)
+    return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, Ops);
+  assert(Op->getNumValues() == 2 && "Expected exactly one non-CC result");
+  SDVTList RawVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue);
+  return DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
+}
+
+// CC is a comparison that will be implemented using an integer or
+// floating-point comparison.  Return the condition code mask for
+// a branch on true.  In the integer case, CCMASK_CMP_UO is set for
+// unsigned comparisons and clear for signed ones.  In the floating-point
+// case, CCMASK_CMP_UO has its normal mask meaning (unordered).
+static unsigned CCMaskForCondCode(ISD::CondCode CC) {
+#define CONV(X) \
+  case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
+  case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
+  case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
+
+  switch (CC) {
+  default:
+    llvm_unreachable("Invalid integer condition!");
+
+  CONV(EQ);
+  CONV(NE);
+  CONV(GT);
+  CONV(GE);
+  CONV(LT);
+  CONV(LE);
+
+  case ISD::SETO:  return SystemZ::CCMASK_CMP_O;
+  case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
+  }
+#undef CONV
+}
+
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+  // Deal with cases where the result can be taken directly from a bit
+  // of the IPM result.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+  // Deal with cases where we can add a value to force the sign bit
+  // to contain the right value.  Putting the bit in 31 means we can
+  // use SRL rather than RISBG(L), and also makes it easier to get a
+  // 0/-1 value, so it has priority over the other tests below.
+  //
+  // These sequences rely on the fact that the upper two bits of the
+  // IPM result are zero.
+  uint64_t TopBit = uint64_t(1) << 31;
+  if (CCMask == (CCValid & SystemZ::CCMASK_0))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+    return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2)))
+    return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_3))
+    return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  // Next try inverting the value and testing a bit.  0/1 could be
+  // handled this way too, but we dealt with that case above.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+    return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+  // Handle cases where adding a value forces a non-sign bit to contain
+  // the right value.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+    return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+  // The remaining cases are 1, 2, 0/1/3 and 0/2/3.  All these are
+  // can be done by inverting the low CC bit and applying one of the
+  // sign-based extractions above.
+  if (CCMask == (CCValid & SystemZ::CCMASK_1))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_2))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  llvm_unreachable("Unexpected CC combination");
+}
+
+// If C can be converted to a comparison against zero, adjust the operands
+// as necessary.
+static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
+  if (C.ICmpType == SystemZICMP::UnsignedOnly)
+    return;
+
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
+  if (!ConstOp1)
+    return;
+
+  int64_t Value = ConstOp1->getSExtValue();
+  if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
+      (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
+      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
+      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
+    C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
+  }
+}
+
+// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
+// adjust the operands as necessary.
+static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
+                             Comparison &C) {
+  // For us to make any changes, it must a comparison between a single-use
+  // load and a constant.
+  if (!C.Op0.hasOneUse() ||
+      C.Op0.getOpcode() != ISD::LOAD ||
+      C.Op1.getOpcode() != ISD::Constant)
+    return;
+
+  // We must have an 8- or 16-bit load.
+  auto *Load = cast<LoadSDNode>(C.Op0);
+  unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
+  if (NumBits != 8 && NumBits != 16)
+    return;
+
+  // The load must be an extending one and the constant must be within the
+  // range of the unextended value.
+  auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
+  uint64_t Value = ConstOp1->getZExtValue();
+  uint64_t Mask = (1 << NumBits) - 1;
+  if (Load->getExtensionType() == ISD::SEXTLOAD) {
+    // Make sure that ConstOp1 is in range of C.Op0.
+    int64_t SignedValue = ConstOp1->getSExtValue();
+    if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
+      return;
+    if (C.ICmpType != SystemZICMP::SignedOnly) {
+      // Unsigned comparison between two sign-extended values is equivalent
+      // to unsigned comparison between two zero-extended values.
+      Value &= Mask;
+    } else if (NumBits == 8) {
+      // Try to treat the comparison as unsigned, so that we can use CLI.
+      // Adjust CCMask and Value as necessary.
+      if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
+        // Test whether the high bit of the byte is set.
+        Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
+      else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
+        // Test whether the high bit of the byte is clear.
+        Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
+      else
+        // No instruction exists for this combination.
+        return;
+      C.ICmpType = SystemZICMP::UnsignedOnly;
+    }
+  } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
+    if (Value > Mask)
+      return;
+    // If the constant is in range, we can use any comparison.
+    C.ICmpType = SystemZICMP::Any;
+  } else
+    return;
+
+  // Make sure that the first operand is an i32 of the right extension type.
+  ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
+                              ISD::SEXTLOAD :
+                              ISD::ZEXTLOAD);
+  if (C.Op0.getValueType() != MVT::i32 ||
+      Load->getExtensionType() != ExtType)
+    C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
+                           Load->getBasePtr(), Load->getPointerInfo(),
+                           Load->getMemoryVT(), Load->getAlignment(),
+                           Load->getMemOperand()->getFlags());
+
+  // Make sure that the second operand is an i32 with the right value.
+  if (C.Op1.getValueType() != MVT::i32 ||
+      Value != ConstOp1->getZExtValue())
+    C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
+}
+
+// Return true if Op is either an unextended load, or a load suitable
+// for integer register-memory comparisons of type ICmpType.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  if (Load) {
+    // There are no instructions to compare a register with a memory byte.
+    if (Load->getMemoryVT() == MVT::i8)
+      return false;
+    // Otherwise decide on extension type.
+    switch (Load->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      return true;
+    case ISD::SEXTLOAD:
+      return ICmpType != SystemZICMP::UnsignedOnly;
+    case ISD::ZEXTLOAD:
+      return ICmpType != SystemZICMP::SignedOnly;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Return true if it is better to swap the operands of C.
+static bool shouldSwapCmpOperands(const Comparison &C) {
+  // Leave f128 comparisons alone, since they have no memory forms.
+  if (C.Op0.getValueType() == MVT::f128)
+    return false;
+
+  // Always keep a floating-point constant second, since comparisons with
+  // zero can use LOAD TEST and comparisons with other constants make a
+  // natural memory operand.
+  if (isa<ConstantFPSDNode>(C.Op1))
+    return false;
+
+  // Never swap comparisons with zero since there are many ways to optimize
+  // those later.
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+  if (ConstOp1 && ConstOp1->getZExtValue() == 0)
+    return false;
+
+  // Also keep natural memory operands second if the loaded value is
+  // only used here.  Several comparisons have memory forms.
+  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
+    return false;
+
+  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
+  // In that case we generally prefer the memory to be second.
+  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
+    // The only exceptions are when the second operand is a constant and
+    // we can use things like CHHSI.
+    if (!ConstOp1)
+      return true;
+    // The unsigned memory-immediate instructions can handle 16-bit
+    // unsigned integers.
+    if (C.ICmpType != SystemZICMP::SignedOnly &&
+        isUInt<16>(ConstOp1->getZExtValue()))
+      return false;
+    // The signed memory-immediate instructions can handle 16-bit
+    // signed integers.
+    if (C.ICmpType != SystemZICMP::UnsignedOnly &&
+        isInt<16>(ConstOp1->getSExtValue()))
+      return false;
+    return true;
+  }
+
+  // Try to promote the use of CGFR and CLGFR.
+  unsigned Opcode0 = C.Op0.getOpcode();
+  if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
+    return true;
+  if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
+    return true;
+  if (C.ICmpType != SystemZICMP::SignedOnly &&
+      Opcode0 == ISD::AND &&
+      C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
+    return true;
+
+  return false;
+}
+
+// Return a version of comparison CC mask CCMask in which the LT and GT
+// actions are swapped.
+static unsigned reverseCCMask(unsigned CCMask) {
+  return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+          (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+          (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+          (CCMask & SystemZ::CCMASK_CMP_UO));
+}
+
+// Check whether C tests for equality between X and Y and whether X - Y
+// or Y - X is also computed.  In that case it's better to compare the
+// result of the subtraction against zero.
+static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
+                                 Comparison &C) {
+  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+      C.CCMask == SystemZ::CCMASK_CMP_NE) {
+    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+      SDNode *N = *I;
+      if (N->getOpcode() == ISD::SUB &&
+          ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
+           (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
+        C.Op0 = SDValue(N, 0);
+        C.Op1 = DAG.getConstant(0, DL, N->getValueType(0));
+        return;
+      }
+    }
+  }
+}
+
+// Check whether C compares a floating-point value with zero and if that
+// floating-point value is also negated.  In this case we can use the
+// negation to set CC, so avoiding separate LOAD AND TEST and
+// LOAD (NEGATIVE/COMPLEMENT) instructions.
+static void adjustForFNeg(Comparison &C) {
+  auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
+  if (C1 && C1->isZero()) {
+    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
+      SDNode *N = *I;
+      if (N->getOpcode() == ISD::FNEG) {
+        C.Op0 = SDValue(N, 0);
+        C.CCMask = reverseCCMask(C.CCMask);
+        return;
+      }
+    }
+  }
+}
+
+// Check whether C compares (shl X, 32) with 0 and whether X is
+// also sign-extended.  In that case it is better to test the result
+// of the sign extension using LTGFR.
+//
+// This case is important because InstCombine transforms a comparison
+// with (sext (trunc X)) into a comparison with (shl X, 32).
+static void adjustForLTGFR(Comparison &C) {
+  // Check for a comparison between (shl X, 32) and 0.
+  if (C.Op0.getOpcode() == ISD::SHL &&
+      C.Op0.getValueType() == MVT::i64 &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
+    if (C1 && C1->getZExtValue() == 32) {
+      SDValue ShlOp0 = C.Op0.getOperand(0);
+      // See whether X has any SIGN_EXTEND_INREG uses.
+      for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
+        SDNode *N = *I;
+        if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
+            cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
+          C.Op0 = SDValue(N, 0);
+          return;
+        }
+      }
+    }
+  }
+}
+
+// If C compares the truncation of an extending load, try to compare
+// the untruncated value instead.  This exposes more opportunities to
+// reuse CC.
+static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
+                               Comparison &C) {
+  if (C.Op0.getOpcode() == ISD::TRUNCATE &&
+      C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
+    if (L->getMemoryVT().getStoreSizeInBits() <= C.Op0.getValueSizeInBits()) {
+      unsigned Type = L->getExtensionType();
+      if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
+          (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
+        C.Op0 = C.Op0.getOperand(0);
+        C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
+      }
+    }
+  }
+}
+
+// Return true if shift operation N has an in-range constant shift value.
+// Store it in ShiftVal if so.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+  auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!Shift)
+    return false;
+
+  uint64_t Amount = Shift->getZExtValue();
+  if (Amount >= N.getValueSizeInBits())
+    return false;
+
+  ShiftVal = Amount;
+  return true;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type Opcode between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is
+// the number of bits in the operands.  If TEST UNDER MASK can be used,
+// return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+                                     uint64_t Mask, uint64_t CmpVal,
+                                     unsigned ICmpType) {
+  assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+    return 0;
+
+  // Work out the masks for the lowest and highest bits.
+  unsigned HighShift = 63 - countLeadingZeros(Mask);
+  uint64_t High = uint64_t(1) << HighShift;
+  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
+
+  // Signed ordered comparisons are effectively unsigned if the sign
+  // bit is dropped.
+  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+  // Check for equality comparisons with 0, or the equivalent.
+  if (CmpVal == 0) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal < Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+
+  // Check for equality comparisons with the mask, or the equivalent.
+  if (CmpVal == Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+
+  // Check for ordered comparisons with the top bit.
+  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+
+  // If there are just two bits, we can do equality checks for Low and High
+  // as well.
+  if (Mask == Low + High) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+  }
+
+  // Looks like we've exhausted our options.
+  return 0;
+}
+
+// See whether C can be implemented as a TEST UNDER MASK instruction.
+// Update the arguments with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
+                                   Comparison &C) {
+  // Check that we have a comparison with a constant.
+  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
+  if (!ConstOp1)
+    return;
+  uint64_t CmpVal = ConstOp1->getZExtValue();
+
+  // Check whether the nonconstant input is an AND with a constant mask.
+  Comparison NewC(C);
+  uint64_t MaskVal;
+  ConstantSDNode *Mask = nullptr;
+  if (C.Op0.getOpcode() == ISD::AND) {
+    NewC.Op0 = C.Op0.getOperand(0);
+    NewC.Op1 = C.Op0.getOperand(1);
+    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
+    if (!Mask)
+      return;
+    MaskVal = Mask->getZExtValue();
+  } else {
+    // There is no instruction to compare with a 64-bit immediate
+    // so use TMHH instead if possible.  We need an unsigned ordered
+    // comparison with an i64 immediate.
+    if (NewC.Op0.getValueType() != MVT::i64 ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
+        NewC.ICmpType == SystemZICMP::SignedOnly)
+      return;
+    // Convert LE and GT comparisons into LT and GE.
+    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
+        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
+      if (CmpVal == uint64_t(-1))
+        return;
+      CmpVal += 1;
+      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    }
+    // If the low N bits of Op1 are zero than the low N bits of Op0 can
+    // be masked off without changing the result.
+    MaskVal = -(CmpVal & -CmpVal);
+    NewC.ICmpType = SystemZICMP::UnsignedOnly;
+  }
+  if (!MaskVal)
+    return;
+
+  // Check whether the combination of mask, comparison value and comparison
+  // type are suitable.
+  unsigned BitSize = NewC.Op0.getValueSizeInBits();
+  unsigned NewCCMask, ShiftVal;
+  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+      NewC.Op0.getOpcode() == ISD::SHL &&
+      isSimpleShift(NewC.Op0, ShiftVal) &&
+      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                        MaskVal >> ShiftVal,
+                                        CmpVal >> ShiftVal,
+                                        SystemZICMP::Any))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal >>= ShiftVal;
+  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
+             NewC.Op0.getOpcode() == ISD::SRL &&
+             isSimpleShift(NewC.Op0, ShiftVal) &&
+             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
+                                               MaskVal << ShiftVal,
+                                               CmpVal << ShiftVal,
+                                               SystemZICMP::UnsignedOnly))) {
+    NewC.Op0 = NewC.Op0.getOperand(0);
+    MaskVal <<= ShiftVal;
+  } else {
+    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
+                                     NewC.ICmpType);
+    if (!NewCCMask)
+      return;
+  }
+
+  // Go ahead and make the change.
+  C.Opcode = SystemZISD::TM;
+  C.Op0 = NewC.Op0;
+  if (Mask && Mask->getZExtValue() == MaskVal)
+    C.Op1 = SDValue(Mask, 0);
+  else
+    C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
+  C.CCValid = SystemZ::CCMASK_TM;
+  C.CCMask = NewCCMask;
+}
+
+// Return a Comparison that tests the condition-code result of intrinsic
+// node Call against constant integer CC using comparison code Cond.
+// Opcode is the opcode of the SystemZISD operation for the intrinsic
+// and CCValid is the set of possible condition-code results.
+static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
+                                  SDValue Call, unsigned CCValid, uint64_t CC,
+                                  ISD::CondCode Cond) {
+  Comparison C(Call, SDValue());
+  C.Opcode = Opcode;
+  C.CCValid = CCValid;
+  if (Cond == ISD::SETEQ)
+    // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
+    C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
+  else if (Cond == ISD::SETNE)
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
+  else if (Cond == ISD::SETLT || Cond == ISD::SETULT)
+    // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
+    // always true for CC>3.
+    C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
+  else if (Cond == ISD::SETGE || Cond == ISD::SETUGE)
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
+  else if (Cond == ISD::SETLE || Cond == ISD::SETULE)
+    // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
+    // always true for CC>3.
+    C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
+  else if (Cond == ISD::SETGT || Cond == ISD::SETUGT)
+    // ...and the inverse of that.
+    C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
+  else
+    llvm_unreachable("Unexpected integer comparison type");
+  C.CCMask &= CCValid;
+  return C;
+}
+
+// Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
+static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+                         ISD::CondCode Cond, const SDLoc &DL) {
+  if (CmpOp1.getOpcode() == ISD::Constant) {
+    uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
+    unsigned Opcode, CCValid;
+    if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
+        isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
+      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+    if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
+        isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
+      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+  }
+  Comparison C(CmpOp0, CmpOp1);
+  C.CCMask = CCMaskForCondCode(Cond);
+  if (C.Op0.getValueType().isFloatingPoint()) {
+    C.CCValid = SystemZ::CCMASK_FCMP;
+    C.Opcode = SystemZISD::FCMP;
+    adjustForFNeg(C);
+  } else {
+    C.CCValid = SystemZ::CCMASK_ICMP;
+    C.Opcode = SystemZISD::ICMP;
+    // Choose the type of comparison.  Equality and inequality tests can
+    // use either signed or unsigned comparisons.  The choice also doesn't
+    // matter if both sign bits are known to be clear.  In those cases we
+    // want to give the main isel code the freedom to choose whichever
+    // form fits best.
+    if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+        C.CCMask == SystemZ::CCMASK_CMP_NE ||
+        (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
+      C.ICmpType = SystemZICMP::Any;
+    else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
+      C.ICmpType = SystemZICMP::UnsignedOnly;
+    else
+      C.ICmpType = SystemZICMP::SignedOnly;
+    C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
+    adjustZeroCmp(DAG, DL, C);
+    adjustSubwordCmp(DAG, DL, C);
+    adjustForSubtraction(DAG, DL, C);
+    adjustForLTGFR(C);
+    adjustICmpTruncate(DAG, DL, C);
+  }
+
+  if (shouldSwapCmpOperands(C)) {
+    std::swap(C.Op0, C.Op1);
+    C.CCMask = reverseCCMask(C.CCMask);
+  }
+
+  adjustForTestUnderMask(DAG, DL, C);
+  return C;
+}
+
+// Emit the comparison instruction described by C.
+static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
+  if (!C.Op1.getNode()) {
+    SDValue Op;
+    switch (C.Op0.getOpcode()) {
+    case ISD::INTRINSIC_W_CHAIN:
+      Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
+      break;
+    case ISD::INTRINSIC_WO_CHAIN:
+      Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
+      break;
+    default:
+      llvm_unreachable("Invalid comparison operands");
+    }
+    return SDValue(Op.getNode(), Op->getNumValues() - 1);
+  }
+  if (C.Opcode == SystemZISD::ICMP)
+    return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+                       DAG.getConstant(C.ICmpType, DL, MVT::i32));
+  if (C.Opcode == SystemZISD::TM) {
+    bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+                         bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+    return DAG.getNode(SystemZISD::TM, DL, MVT::Glue, C.Op0, C.Op1,
+                       DAG.getConstant(RegisterOnly, DL, MVT::i32));
+  }
+  return DAG.getNode(C.Opcode, DL, MVT::Glue, C.Op0, C.Op1);
+}
+
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits.  Extend is the extension type to use.  Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
+                            SDValue Op0, SDValue Op1, SDValue &Hi,
+                            SDValue &Lo) {
+  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+                   DAG.getConstant(32, DL, MVT::i64));
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
+}
+
+// Lower a binary operation that produces two VT results, one in each
+// half of a GR128 pair.  Op0 and Op1 are the VT operands to the operation,
+// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
+// on the extended Op0 and (unextended) Op1.  Store the even register result
+// in Even and the odd register result in Odd.
+static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                             unsigned Extend, unsigned Opcode, SDValue Op0,
+                             SDValue Op1, SDValue &Even, SDValue &Odd) {
+  SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0);
+  SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
+                               SDValue(In128, 0), Op1);
+  bool Is32Bit = is32Bit(VT);
+  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+// Return an i32 value that is 1 if the CC value produced by Glue is
+// in the mask CCMask and 0 otherwise.  CC is known to have a value
+// in CCValid, so other values can be ignored.
+static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue Glue,
+                         unsigned CCValid, unsigned CCMask) {
+  IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
+  SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+  if (Conversion.XORValue)
+    Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.XORValue, DL, MVT::i32));
+
+  if (Conversion.AddValue)
+    Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.AddValue, DL, MVT::i32));
+
+  // The SHR/AND sequence should get optimized to an RISBG.
+  Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+                       DAG.getConstant(Conversion.Bit, DL, MVT::i32));
+  if (Conversion.Bit != 31)
+    Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                         DAG.getConstant(1, DL, MVT::i32));
+  return Result;
+}
+
+// Return the SystemISD vector comparison operation for CC, or 0 if it cannot
+// be done directly.  IsFP is true if CC is for a floating-point rather than
+// integer comparison.
+static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
+  switch (CC) {
+  case ISD::SETOEQ:
+  case ISD::SETEQ:
+    return IsFP ? SystemZISD::VFCMPE : SystemZISD::VICMPE;
+
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    return IsFP ? SystemZISD::VFCMPHE : static_cast<SystemZISD::NodeType>(0);
+
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    return IsFP ? SystemZISD::VFCMPH : SystemZISD::VICMPH;
+
+  case ISD::SETUGT:
+    return IsFP ? static_cast<SystemZISD::NodeType>(0) : SystemZISD::VICMPHL;
+
+  default:
+    return 0;
+  }
+}
+
+// Return the SystemZISD vector comparison operation for CC or its inverse,
+// or 0 if neither can be done directly.  Indicate in Invert whether the
+// result is for the inverse of CC.  IsFP is true if CC is for a
+// floating-point rather than integer comparison.
+static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
+                                            bool &Invert) {
+  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
+    Invert = false;
+    return Opcode;
+  }
+
+  CC = ISD::getSetCCInverse(CC, !IsFP);
+  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
+    Invert = true;
+    return Opcode;
+  }
+
+  return 0;
+}
+
+// Return a v2f64 that contains the extended form of elements Start and Start+1
+// of v4f32 value Op.
+static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
+                                  SDValue Op) {
+  int Mask[] = { Start, -1, Start + 1, -1 };
+  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
+  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
+}
+
+// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
+// producing a result of type VT.
+static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL,
+                            EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
+  // There is no hardware support for v4f32, so extend the vector into
+  // two v2f64s and compare those.
+  if (CmpOp0.getValueType() == MVT::v4f32) {
+    SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
+    SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
+    SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
+    SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
+    SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
+    SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
+    return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
+  }
+  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
+}
+
+// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
+// an integer mask of type VT.
+static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                ISD::CondCode CC, SDValue CmpOp0,
+                                SDValue CmpOp1) {
+  bool IsFP = CmpOp0.getValueType().isFloatingPoint();
+  bool Invert = false;
+  SDValue Cmp;
+  switch (CC) {
+    // Handle tests for order using (or (ogt y x) (oge x y)).
+  case ISD::SETUO:
+    Invert = true;
+  case ISD::SETO: {
+    assert(IsFP && "Unexpected integer comparison");
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
+    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
+    break;
+  }
+
+    // Handle <> tests using (or (ogt y x) (ogt x y)).
+  case ISD::SETUEQ:
+    Invert = true;
+  case ISD::SETONE: {
+    assert(IsFP && "Unexpected integer comparison");
+    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
+    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
+    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
+    break;
+  }
+
+    // Otherwise a single comparison is enough.  It doesn't really
+    // matter whether we try the inversion or the swap first, since
+    // there are no cases where both work.
+  default:
+    if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
+      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
+    else {
+      CC = ISD::getSetCCSwappedOperands(CC);
+      if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert))
+        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
+      else
+        llvm_unreachable("Unhandled comparison");
+    }
+    break;
+  }
+  if (Invert) {
+    SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                               DAG.getConstant(65535, DL, MVT::i32));
+    Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+    Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
+  }
+  return Cmp;
+}
+
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
+  SDValue Glue = emitCmp(DAG, DL, C);
+  return emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+}
+
+SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue CmpOp0   = Op.getOperand(2);
+  SDValue CmpOp1   = Op.getOperand(3);
+  SDValue Dest     = Op.getOperand(4);
+  SDLoc DL(Op);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
+  SDValue Glue = emitCmp(DAG, DL, C);
+  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
+                     Op.getOperand(0), DAG.getConstant(C.CCValid, DL, MVT::i32),
+                     DAG.getConstant(C.CCMask, DL, MVT::i32), Dest, Glue);
+}
+
+// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
+// allowing Pos and Neg to be wider than CmpOp.
+static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
+  return (Neg.getOpcode() == ISD::SUB &&
+          Neg.getOperand(0).getOpcode() == ISD::Constant &&
+          cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
+          Neg.getOperand(1) == Pos &&
+          (Pos == CmpOp ||
+           (Pos.getOpcode() == ISD::SIGN_EXTEND &&
+            Pos.getOperand(0) == CmpOp)));
+}
+
+// Return the absolute or negative absolute of Op; IsNegative decides which.
+static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
+                           bool IsNegative) {
+  Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
+  if (IsNegative)
+    Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
+                     DAG.getConstant(0, DL, Op.getValueType()), Op);
+  return Op;
+}
+
+SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  SDValue TrueOp   = Op.getOperand(2);
+  SDValue FalseOp  = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
+
+  // Check for absolute and negative-absolute selections, including those
+  // where the comparison value is sign-extended (for LPGFR and LNGFR).
+  // This check supplements the one in DAGCombiner.
+  if (C.Opcode == SystemZISD::ICMP &&
+      C.CCMask != SystemZ::CCMASK_CMP_EQ &&
+      C.CCMask != SystemZ::CCMASK_CMP_NE &&
+      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
+    if (isAbsolute(C.Op0, TrueOp, FalseOp))
+      return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
+    if (isAbsolute(C.Op0, FalseOp, TrueOp))
+      return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
+  }
+
+  SDValue Glue = emitCmp(DAG, DL, C);
+
+  // Special case for handling -1/0 results.  The shifts we use here
+  // should get optimized with the IPM conversion sequence.
+  auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
+  auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
+  if (TrueC && FalseC) {
+    int64_t TrueVal = TrueC->getSExtValue();
+    int64_t FalseVal = FalseC->getSExtValue();
+    if ((TrueVal == -1 && FalseVal == 0) || (TrueVal == 0 && FalseVal == -1)) {
+      // Invert the condition if we want -1 on false.
+      if (TrueVal == 0)
+        C.CCMask ^= C.CCValid;
+      SDValue Result = emitSETCC(DAG, DL, Glue, C.CCValid, C.CCMask);
+      EVT VT = Op.getValueType();
+      // Extend the result to VT.  Upper bits are ignored.
+      if (!is32Bit(VT))
+        Result = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Result);
+      // Sign-extend from the low bit.
+      SDValue ShAmt = DAG.getConstant(VT.getSizeInBits() - 1, DL, MVT::i32);
+      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Result, ShAmt);
+      return DAG.getNode(ISD::SRA, DL, VT, Shl, ShAmt);
+    }
+  }
+
+  SDValue Ops[] = {TrueOp, FalseOp, DAG.getConstant(C.CCValid, DL, MVT::i32),
+                   DAG.getConstant(C.CCMask, DL, MVT::i32), Glue};
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  const GlobalValue *GV = Node->getGlobal();
+  int64_t Offset = Node->getOffset();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  CodeModel::Model CM = DAG.getTarget().getCodeModel();
+
+  SDValue Result;
+  if (Subtarget.isPC32DBLSymbol(GV, CM)) {
+    // Assign anchors at 1<<12 byte boundaries.
+    uint64_t Anchor = Offset & ~uint64_t(0xfff);
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+    // The offset can be folded into the address if it is aligned to a halfword.
+    Offset -= Anchor;
+    if (Offset != 0 && (Offset & 1) == 0) {
+      SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+      Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
+      Offset = 0;
+    }
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+  }
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
+                         DAG.getConstant(Offset, DL, PtrVT));
+
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                                                 SelectionDAG &DAG,
+                                                 unsigned Opcode,
+                                                 SDValue GOTOffset) const {
+  SDLoc DL(Node);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Glue;
+
+  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
+  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
+  Glue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
+  Glue = Chain.getValue(1);
+
+  // The first call operand is the chain and the second is the TLS symbol.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
+                                           Node->getValueType(0),
+                                           0, 0));
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
+  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask =
+      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // Glue the call to the argument copies.
+  Ops.push_back(Glue);
+
+  // Emit the call.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
+  Glue = Chain.getValue(1);
+
+  // Copy the return value from %r2.
+  return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
+}
+
+SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
+                                                  SelectionDAG &DAG) const {
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // The high part of the thread pointer is in access register 0.
+  SDValue TPHi = DAG.getCopyFromReg(Chain, DL, SystemZ::A0, MVT::i32);
+  TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
+
+  // The low part of the thread pointer is in access register 1.
+  SDValue TPLo = DAG.getCopyFromReg(Chain, DL, SystemZ::A1, MVT::i32);
+  TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
+
+  // Merge them into a single 64-bit address.
+  SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
+                                    DAG.getConstant(32, DL, PtrVT));
+  return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
+}
+
+SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+                                                     SelectionDAG &DAG) const {
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(Node, DAG);
+  SDLoc DL(Node);
+  const GlobalValue *GV = Node->getGlobal();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+
+  SDValue TP = lowerThreadPointer(DL, DAG);
+
+  // Get the offset of GA from the thread pointer, based on the TLS model.
+  SDValue Offset;
+  switch (model) {
+    case TLSModel::GeneralDynamic: {
+      // Load the GOT offset of the tls_index (module ID / per-symbol offset).
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+      // Call __tls_get_offset to retrieve the offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
+      break;
+    }
+
+    case TLSModel::LocalDynamic: {
+      // Load the GOT offset of the module ID.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+      // Call __tls_get_offset to retrieve the module base offset.
+      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
+
+      // Note: The SystemZLDCleanupPass will remove redundant computations
+      // of the module base offset.  Count total number of local-dynamic
+      // accesses to trigger execution of that pass.
+      SystemZMachineFunctionInfo* MFI =
+        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
+      MFI->incNumLocalDynamicTLSAccesses();
+
+      // Add the per-symbol offset.
+      CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
+
+      SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+      DTPOffset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), DTPOffset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
+      break;
+    }
+
+    case TLSModel::InitialExec: {
+      // Load the offset from the GOT.
+      Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                          SystemZII::MO_INDNTPOFF);
+      Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
+      Offset =
+          DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
+                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+      break;
+    }
+
+    case TLSModel::LocalExec: {
+      // Force the offset into the constant pool and load it from there.
+      SystemZConstantPoolValue *CPV =
+        SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
+
+      Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+      Offset = DAG.getLoad(
+          PtrVT, DL, DAG.getEntryNode(), Offset,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+      break;
+    }
+  }
+
+  // Add the base and offset together.
+  return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
+}
+
+SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  const BlockAddress *BA = Node->getBlockAddress();
+  int64_t Offset = Node->getOffset();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
+  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(JT);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+
+  // Use LARL to load the address of the table.
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(CP);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result;
+  if (CP->isMachineConstantPoolEntry())
+    Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                       CP->getAlignment());
+  else
+    Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                       CP->getAlignment(), CP->getOffset());
+
+  // Use LARL to load the address of the constant pool entry.
+  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setFrameAddressIsTaken(true);
+
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // If the back chain frame index has not been allocated yet, do so.
+  SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
+  int BackChainIdx = FI->getFramePointerSaveIndex();
+  if (!BackChainIdx) {
+    // By definition, the frame address is the address of the back chain.
+    BackChainIdx = MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
+    FI->setFramePointerSaveIndex(BackChainIdx);
+  }
+  SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
+
+  // FIXME The frontend should detect this case.
+  if (Depth > 0) {
+    report_fatal_error("Unsupported stack frame traversal count");
+  }
+
+  return BackChain;
+}
+
+SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // FIXME The frontend should detect this case.
+  if (Depth > 0) {
+    report_fatal_error("Unsupported stack frame traversal count");
+  }
+
+  // Return R14D, which has the return address. Mark it an implicit live-in.
+  unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
+}
+
+SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue In = Op.getOperand(0);
+  EVT InVT = In.getValueType();
+  EVT ResVT = Op.getValueType();
+
+  // Convert loads directly.  This is normally done by DAGCombiner,
+  // but we need this case for bitcasts that are created during lowering
+  // and which are then lowered themselves.
+  if (auto *LoadN = dyn_cast<LoadSDNode>(In))
+    return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+                       LoadN->getMemOperand());
+
+  if (InVT == MVT::i32 && ResVT == MVT::f32) {
+    SDValue In64;
+    if (Subtarget.hasHighWord()) {
+      SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+                                       MVT::i64);
+      In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                       MVT::i64, SDValue(U64, 0), In);
+    } else {
+      In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+      In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
+                         DAG.getConstant(32, DL, MVT::i64));
+    }
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+    return DAG.getTargetExtractSubreg(SystemZ::subreg_r32,
+                                      DL, MVT::f32, Out64);
+  }
+  if (InVT == MVT::f32 && ResVT == MVT::i32) {
+    SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
+    SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_r32, DL,
+                                             MVT::f64, SDValue(U64, 0), In);
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+    if (Subtarget.hasHighWord())
+      return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+                                        MVT::i32, Out64);
+    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
+                                DAG.getConstant(32, DL, MVT::i64));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
+  }
+  llvm_unreachable("Unexpected bitcast combination");
+}
+
+SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SystemZMachineFunctionInfo *FuncInfo =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  SDValue Chain   = Op.getOperand(0);
+  SDValue Addr    = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDLoc DL(Op);
+
+  // The initial values of each field.
+  const unsigned NumFields = 4;
+  SDValue Fields[NumFields] = {
+    DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
+    DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
+    DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
+    DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
+  };
+
+  // Store each field into its respective slot.
+  SDValue MemOps[NumFields];
+  unsigned Offset = 0;
+  for (unsigned I = 0; I < NumFields; ++I) {
+    SDValue FieldAddr = Addr;
+    if (Offset != 0)
+      FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
+                              DAG.getIntPtrConstant(Offset, DL));
+    MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
+                             MachinePointerInfo(SV, Offset));
+    Offset += 8;
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue Chain      = Op.getOperand(0);
+  SDValue DstPtr     = Op.getOperand(1);
+  SDValue SrcPtr     = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  SDLoc DL(Op);
+
+  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL),
+                       /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
+                       /*isTailCall*/false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+SDValue SystemZTargetLowering::
+lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool RealignOpt = !MF.getFunction()-> hasFnAttribute("no-realign-stack");
+  bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+  SDLoc DL(Op);
+
+  // If user has set the no alignment function attribute, ignore
+  // alloca alignments.
+  uint64_t AlignVal = (RealignOpt ?
+                       dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0);
+
+  uint64_t StackAlign = TFI->getStackAlignment();
+  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
+  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+  SDValue NeededSpace = Size;
+
+  // Get a reference to the stack pointer.
+  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
+
+  // If we need a backchain, save it now.
+  SDValue Backchain;
+  if (StoreBackchain)
+    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+
+  // Add extra space for alignment if needed.
+  if (ExtraAlignSpace)
+    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
+                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+
+  // Get the new stack pointer value.
+  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
+
+  // Copy the new stack pointer back.
+  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+
+  // The allocated data lives above the 160 bytes allocated for the standard
+  // frame, plus any outgoing stack arguments.  We don't know how much that
+  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
+  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
+
+  // Dynamically realign if needed.
+  if (RequiredAlign > StackAlign) {
+    Result =
+      DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
+                  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
+    Result =
+      DAG.getNode(ISD::AND, DL, MVT::i64, Result,
+                  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
+  }
+
+  if (StoreBackchain)
+    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+
+  SDValue Ops[2] = { Result, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
+    SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
+}
+
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else {
+    // Do a full 128-bit multiplication based on UMUL_LOHI64:
+    //
+    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+    //
+    // but using the fact that the upper halves are either all zeros
+    // or all ones:
+    //
+    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+    //
+    // and grouping the right terms together since they are quicker than the
+    // multiplication:
+    //
+    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+    SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
+    SDValue LL = Op.getOperand(0);
+    SDValue RL = Op.getOperand(1);
+    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  SMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     LL, RL, Ops[1], Ops[0]);
+    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+  }
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  UMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Opcode;
+
+  // We use DSGF for 32-bit division.
+  if (is32Bit(VT)) {
+    Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
+    Opcode = SystemZISD::SDIVREM32;
+  } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+    Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+    Opcode = SystemZISD::SDIVREM32;
+  } else
+    Opcode = SystemZISD::SDIVREM64;
+
+  // DSG(F) takes a 64-bit dividend, so the even register in the GR128
+  // input is "don't care".  The instruction returns the remainder in
+  // the even register and the quotient in the odd register.
+  SDValue Ops[2];
+  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
+                   Op0, Op1, Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  // DL(G) uses a double-width dividend, so we need to clear the even
+  // register in the GR128 input.  The instruction returns the remainder
+  // in the even register and the quotient in the odd register.
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_32, SystemZISD::UDIVREM32,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
+
+  // Get the known-zero masks for each operand.
+  SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+  APInt KnownZero[2], KnownOne[2];
+  DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
+  DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
+
+  // See if the upper 32 bits of one operand and the lower 32 bits of the
+  // other are known zero.  They are the low and high operands respectively.
+  uint64_t Masks[] = { KnownZero[0].getZExtValue(),
+                       KnownZero[1].getZExtValue() };
+  unsigned High, Low;
+  if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
+    High = 1, Low = 0;
+  else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
+    High = 0, Low = 1;
+  else
+    return Op;
+
+  SDValue LowOp = Ops[Low];
+  SDValue HighOp = Ops[High];
+
+  // If the high part is a constant, we're better off using IILH.
+  if (HighOp.getOpcode() == ISD::Constant)
+    return Op;
+
+  // If the low part is a constant that is outside the range of LHI,
+  // then we're better off using IILF.
+  if (LowOp.getOpcode() == ISD::Constant) {
+    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
+    if (!isInt<16>(Value))
+      return Op;
+  }
+
+  // Check whether the high part is an AND that doesn't change the
+  // high 32 bits and just masks out low bits.  We can skip it if so.
+  if (HighOp.getOpcode() == ISD::AND &&
+      HighOp.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue HighOp0 = HighOp.getOperand(0);
+    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
+      HighOp = HighOp0;
+  }
+
+  // Take advantage of the fact that all GR32 operations only change the
+  // low 32 bits by truncating Low to an i32 and inserting it directly
+  // using a subreg.  The interesting cases are those where the truncation
+  // can be folded.
+  SDLoc DL(Op);
+  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
+  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+                                   MVT::i64, HighOp, Low32);
+}
+
+SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  Op = Op.getOperand(0);
+
+  // Handle vector types via VPOPCT.
+  if (VT.isVector()) {
+    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
+    Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
+    switch (VT.getScalarSizeInBits()) {
+    case 8:
+      break;
+    case 16: {
+      Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+      SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
+      SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
+      Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+      Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
+      break;
+    }
+    case 32: {
+      SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                                DAG.getConstant(0, DL, MVT::i32));
+      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+      break;
+    }
+    case 64: {
+      SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                                DAG.getConstant(0, DL, MVT::i32));
+      Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
+      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+      break;
+    }
+    default:
+      llvm_unreachable("Unexpected type");
+    }
+    return Op;
+  }
+
+  // Get the known-zero mask for the operand.
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
+  unsigned NumSignificantBits = (~KnownZero).getActiveBits();
+  if (NumSignificantBits == 0)
+    return DAG.getConstant(0, DL, VT);
+
+  // Skip known-zero high parts of the operand.
+  int64_t OrigBitSize = VT.getSizeInBits();
+  int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
+  BitSize = std::min(BitSize, OrigBitSize);
+
+  // The POPCNT instruction counts the number of bits in each byte.
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
+  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
+  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+
+  // Add up per-byte counts in a binary tree.  All bits of Op at
+  // position larger than BitSize remain zero throughout.
+  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
+    if (BitSize != OrigBitSize)
+      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
+                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
+    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+  }
+
+  // Extract overall result from high byte.
+  if (BitSize > 8)
+    Op = DAG.getNode(ISD::SRL, DL, VT, Op,
+                     DAG.getConstant(BitSize - 8, DL, VT));
+
+  return Op;
+}
+
+SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // The only fence that needs an instruction is a sequentially-consistent
+  // cross-thread fence.
+  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+      FenceScope == CrossThread) {
+    return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
+                                      Op.getOperand(0)),
+                   0);
+  }
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
+// Op is an atomic load.  Lower it into a normal volatile load.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
+                        Node->getChain(), Node->getBasePtr(),
+                        Node->getMemoryVT(), Node->getMemOperand());
+}
+
+// Op is an atomic store.  Lower it into a normal volatile store followed
+// by a serialization.
+SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
+                                    Node->getBasePtr(), Node->getMemoryVT(),
+                                    Node->getMemOperand());
+  return SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op), MVT::Other,
+                                    Chain), 0);
+}
+
+// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
+// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
+                                                   SelectionDAG &DAG,
+                                                   unsigned Opcode) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+  // 32-bit operations need no code outside the main loop.
+  EVT NarrowVT = Node->getMemoryVT();
+  EVT WideVT = MVT::i32;
+  if (NarrowVT == WideVT)
+    return Op;
+
+  int64_t BitSize = NarrowVT.getSizeInBits();
+  SDValue ChainIn = Node->getChain();
+  SDValue Addr = Node->getBasePtr();
+  SDValue Src2 = Node->getVal();
+  MachineMemOperand *MMO = Node->getMemOperand();
+  SDLoc DL(Node);
+  EVT PtrVT = Addr.getValueType();
+
+  // Convert atomic subtracts of constants into additions.
+  if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
+    if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
+      Opcode = SystemZISD::ATOMIC_LOADW_ADD;
+      Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
+    }
+
+  // Get the address of the containing word.
+  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+                                    DAG.getConstant(-4, DL, PtrVT));
+
+  // Get the number of bits that the word must be rotated left in order
+  // to bring the field to the top bits of a GR32.
+  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+                                 DAG.getConstant(3, DL, PtrVT));
+  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+  // Get the complementing shift amount, for rotating a field in the top
+  // bits back to its proper position.
+  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+                                    DAG.getConstant(0, DL, WideVT), BitShift);
+
+  // Extend the source operand to 32 bits and prepare it for the inner loop.
+  // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
+  // operations require the source to be shifted in advance.  (This shift
+  // can be folded if the source is constant.)  For AND and NAND, the lower
+  // bits must be set, while for other opcodes they should be left clear.
+  if (Opcode != SystemZISD::ATOMIC_SWAPW)
+    Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
+                       DAG.getConstant(32 - BitSize, DL, WideVT));
+  if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
+      Opcode == SystemZISD::ATOMIC_LOADW_NAND)
+    Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
+                       DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));
+
+  // Construct the ATOMIC_LOADW_* node.
+  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+  SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
+                    DAG.getConstant(BitSize, DL, WideVT) };
+  SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
+                                             NarrowVT, MMO);
+
+  // Rotate the result of the final CS so that the field is in the lower
+  // bits of a GR32, then truncate it.
+  SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
+                                    DAG.getConstant(BitSize, DL, WideVT));
+  SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
+
+  SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
+  return DAG.getMergeValues(RetOps, DL);
+}
+
+// Op is an ATOMIC_LOAD_SUB operation.  Lower 8- and 16-bit operations
+// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
+// operations into additions.
+SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+  EVT MemVT = Node->getMemoryVT();
+  if (MemVT == MVT::i32 || MemVT == MVT::i64) {
+    // A full-width operation.
+    assert(Op.getValueType() == MemVT && "Mismatched VTs");
+    SDValue Src2 = Node->getVal();
+    SDValue NegSrc2;
+    SDLoc DL(Src2);
+
+    if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
+      // Use an addition if the operand is constant and either LAA(G) is
+      // available or the negative value is in the range of A(G)FHI.
+      int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
+      if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
+        NegSrc2 = DAG.getConstant(Value, DL, MemVT);
+    } else if (Subtarget.hasInterlockedAccess1())
+      // Use LAA(G) if available.
+      NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
+                            Src2);
+
+    if (NegSrc2.getNode())
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
+                           Node->getChain(), Node->getBasePtr(), NegSrc2,
+                           Node->getMemOperand());
+
+    // Use the node as-is.
+    return Op;
+  }
+
+  return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
+}
+
+// Node is an 8- or 16-bit ATOMIC_CMP_SWAP operation.  Lower the first two
+// into a fullword ATOMIC_CMP_SWAPW operation.
+SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  auto *Node = cast<AtomicSDNode>(Op.getNode());
+
+  // We have native support for 32-bit compare and swap.
+  EVT NarrowVT = Node->getMemoryVT();
+  EVT WideVT = MVT::i32;
+  if (NarrowVT == WideVT)
+    return Op;
+
+  int64_t BitSize = NarrowVT.getSizeInBits();
+  SDValue ChainIn = Node->getOperand(0);
+  SDValue Addr = Node->getOperand(1);
+  SDValue CmpVal = Node->getOperand(2);
+  SDValue SwapVal = Node->getOperand(3);
+  MachineMemOperand *MMO = Node->getMemOperand();
+  SDLoc DL(Node);
+  EVT PtrVT = Addr.getValueType();
+
+  // Get the address of the containing word.
+  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
+                                    DAG.getConstant(-4, DL, PtrVT));
+
+  // Get the number of bits that the word must be rotated left in order
+  // to bring the field to the top bits of a GR32.
+  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
+                                 DAG.getConstant(3, DL, PtrVT));
+  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
+
+  // Get the complementing shift amount, for rotating a field in the top
+  // bits back to its proper position.
+  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
+                                    DAG.getConstant(0, DL, WideVT), BitShift);
+
+  // Construct the ATOMIC_CMP_SWAPW node.
+  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
+  SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
+                    NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
+  SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
+                                             VTList, Ops, NarrowVT, MMO);
+  return AtomicOp;
+}
+
+SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
+                            SystemZ::R15D, Op.getValueType());
+}
+
+SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
+  bool StoreBackchain = MF.getFunction()->hasFnAttribute("backchain");
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue NewSP = Op.getOperand(1);
+  SDValue Backchain;
+  SDLoc DL(Op);
+
+  if (StoreBackchain) {
+    SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
+    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
+  }
+
+  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
+
+  if (StoreBackchain)
+    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
+
+  return Chain;
+}
+
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (!IsData)
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  SDLoc DL(Op);
+  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
+  auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+  SDValue Ops[] = {
+    Op.getOperand(0),
+    DAG.getConstant(Code, DL, MVT::i32),
+    Op.getOperand(1)
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
+                                 Node->getVTList(), Ops,
+                                 Node->getMemoryVT(), Node->getMemOperand());
+}
+
+// Return an i32 that contains the value of CC immediately after After,
+// whose final operand must be MVT::Glue.
+static SDValue getCCResult(SelectionDAG &DAG, SDNode *After) {
+  SDLoc DL(After);
+  SDValue Glue = SDValue(After, After->getNumValues() - 1);
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                     DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
+}
+
+SDValue
+SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  unsigned Opcode, CCValid;
+  if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
+    assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
+    SDValue Glued = emitIntrinsicWithChainAndGlue(DAG, Op, Opcode);
+    SDValue CC = getCCResult(DAG, Glued.getNode());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
+    return SDValue();
+  }
+
+  return SDValue();
+}
+
+SDValue
+SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  unsigned Opcode, CCValid;
+  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
+    SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode);
+    SDValue CC = getCCResult(DAG, Glued.getNode());
+    if (Op->getNumValues() == 1)
+      return CC;
+    assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
+    return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued,
+                       CC);
+  }
+
+  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (Id) {
+  case Intrinsic::thread_pointer:
+    return lowerThreadPointer(SDLoc(Op), DAG);
+
+  case Intrinsic::s390_vpdi:
+    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::s390_vperm:
+    return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::s390_vuphb:
+  case Intrinsic::s390_vuphh:
+  case Intrinsic::s390_vuphf:
+    return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vuplhb:
+  case Intrinsic::s390_vuplhh:
+  case Intrinsic::s390_vuplhf:
+    return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vuplb:
+  case Intrinsic::s390_vuplhw:
+  case Intrinsic::s390_vuplf:
+    return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vupllb:
+  case Intrinsic::s390_vupllh:
+  case Intrinsic::s390_vupllf:
+    return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1));
+
+  case Intrinsic::s390_vsumb:
+  case Intrinsic::s390_vsumh:
+  case Intrinsic::s390_vsumgh:
+  case Intrinsic::s390_vsumgf:
+  case Intrinsic::s390_vsumqf:
+  case Intrinsic::s390_vsumqg:
+    return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  return SDValue();
+}
+
+namespace {
+// Says that SystemZISD operation Opcode can be used to perform the equivalent
+// of a VPERM with permute vector Bytes.  If Opcode takes three operands,
+// Operand is the constant third operand, otherwise it is the number of
+// bytes in each element of the result.
+struct Permute {
+  unsigned Opcode;
+  unsigned Operand;
+  unsigned char Bytes[SystemZ::VectorBytes];
+};
+}
+
+static const Permute PermuteForms[] = {
+  // VMRHG
+  { SystemZISD::MERGE_HIGH, 8,
+    { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
+  // VMRHF
+  { SystemZISD::MERGE_HIGH, 4,
+    { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
+  // VMRHH
+  { SystemZISD::MERGE_HIGH, 2,
+    { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
+  // VMRHB
+  { SystemZISD::MERGE_HIGH, 1,
+    { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
+  // VMRLG
+  { SystemZISD::MERGE_LOW, 8,
+    { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
+  // VMRLF
+  { SystemZISD::MERGE_LOW, 4,
+    { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
+  // VMRLH
+  { SystemZISD::MERGE_LOW, 2,
+    { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
+  // VMRLB
+  { SystemZISD::MERGE_LOW, 1,
+    { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
+  // VPKG
+  { SystemZISD::PACK, 4,
+    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
+  // VPKF
+  { SystemZISD::PACK, 2,
+    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
+  // VPKH
+  { SystemZISD::PACK, 1,
+    { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
+  // VPDI V1, V2, 4  (low half of V1, high half of V2)
+  { SystemZISD::PERMUTE_DWORDS, 4,
+    { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
+  // VPDI V1, V2, 1  (high half of V1, low half of V2)
+  { SystemZISD::PERMUTE_DWORDS, 1,
+    { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
+};
+
+// Called after matching a vector shuffle against a particular pattern.
+// Both the original shuffle and the pattern have two vector operands.
+// OpNos[0] is the operand of the original shuffle that should be used for
+// operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything.
+// OpNos[1] is the same for operand 1 of the pattern.  Resolve these -1s and
+// set OpNo0 and OpNo1 to the shuffle operands that should actually be used
+// for operands 0 and 1 of the pattern.
+static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
+  if (OpNos[0] < 0) {
+    if (OpNos[1] < 0)
+      return false;
+    OpNo0 = OpNo1 = OpNos[1];
+  } else if (OpNos[1] < 0) {
+    OpNo0 = OpNo1 = OpNos[0];
+  } else {
+    OpNo0 = OpNos[0];
+    OpNo1 = OpNos[1];
+  }
+  return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Return true if the VPERM can be implemented using P.
+// When returning true set OpNo0 to the VPERM operand that should be
+// used for operand 0 of P and likewise OpNo1 for operand 1 of P.
+//
+// For example, if swapping the VPERM operands allows P to match, OpNo0
+// will be 1 and OpNo1 will be 0.  If instead Bytes only refers to one
+// operand, but rewriting it to use two duplicated operands allows it to
+// match P, then OpNo0 and OpNo1 will be the same.
+static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
+                         unsigned &OpNo0, unsigned &OpNo1) {
+  int OpNos[] = { -1, -1 };
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+    int Elt = Bytes[I];
+    if (Elt >= 0) {
+      // Make sure that the two permute vectors use the same suboperand
+      // byte number.  Only the operand numbers (the high bits) are
+      // allowed to differ.
+      if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
+        return false;
+      int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
+      int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
+      // Make sure that the operand mappings are consistent with previous
+      // elements.
+      if (OpNos[ModelOpNo] == 1 - RealOpNo)
+        return false;
+      OpNos[ModelOpNo] = RealOpNo;
+    }
+  }
+  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// As above, but search for a matching permute.
+static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
+                                   unsigned &OpNo0, unsigned &OpNo1) {
+  for (auto &P : PermuteForms)
+    if (matchPermute(Bytes, P, OpNo0, OpNo1))
+      return &P;
+  return nullptr;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  This permute is an operand of an outer permute.
+// See whether redistributing the -1 bytes gives a shuffle that can be
+// implemented using P.  If so, set Transform to a VPERM-like permute vector
+// that, when applied to the result of P, gives the original permute in Bytes.
+static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+                               const Permute &P,
+                               SmallVectorImpl<int> &Transform) {
+  unsigned To = 0;
+  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
+    int Elt = Bytes[From];
+    if (Elt < 0)
+      // Byte number From of the result is undefined.
+      Transform[From] = -1;
+    else {
+      while (P.Bytes[To] != Elt) {
+        To += 1;
+        if (To == SystemZ::VectorBytes)
+          return false;
+      }
+      Transform[From] = To;
+    }
+  }
+  return true;
+}
+
+// As above, but search for a matching permute.
+static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
+                                         SmallVectorImpl<int> &Transform) {
+  for (auto &P : PermuteForms)
+    if (matchDoublePermute(Bytes, P, Transform))
+      return &P;
+  return nullptr;
+}
+
+// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask,
+// as if it had type vNi8.
+static void getVPermMask(ShuffleVectorSDNode *VSN,
+                         SmallVectorImpl<int> &Bytes) {
+  EVT VT = VSN->getValueType(0);
+  unsigned NumElements = VT.getVectorNumElements();
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+  Bytes.resize(NumElements * BytesPerElement, -1);
+  for (unsigned I = 0; I < NumElements; ++I) {
+    int Index = VSN->getMaskElt(I);
+    if (Index >= 0)
+      for (unsigned J = 0; J < BytesPerElement; ++J)
+        Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
+  }
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  See whether bytes [Start, Start + BytesPerElement) of
+// the result come from a contiguous sequence of bytes from one input.
+// Set Base to the selector for the first byte if so.
+static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
+                            unsigned BytesPerElement, int &Base) {
+  Base = -1;
+  for (unsigned I = 0; I < BytesPerElement; ++I) {
+    if (Bytes[Start + I] >= 0) {
+      unsigned Elem = Bytes[Start + I];
+      if (Base < 0) {
+        Base = Elem - I;
+        // Make sure the bytes would come from one input operand.
+        if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
+          return false;
+      } else if (unsigned(Base) != Elem - I)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Return true if it can be performed using VSLDI.
+// When returning true, set StartIndex to the shift amount and OpNo0
+// and OpNo1 to the VPERM operands that should be used as the first
+// and second shift operand respectively.
+static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
+                               unsigned &StartIndex, unsigned &OpNo0,
+                               unsigned &OpNo1) {
+  int OpNos[] = { -1, -1 };
+  int Shift = -1;
+  for (unsigned I = 0; I < 16; ++I) {
+    int Index = Bytes[I];
+    if (Index >= 0) {
+      int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
+      int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
+      int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
+      if (Shift < 0)
+        Shift = ExpectedShift;
+      else if (Shift != ExpectedShift)
+        return false;
+      // Make sure that the operand mappings are consistent with previous
+      // elements.
+      if (OpNos[ModelOpNo] == 1 - RealOpNo)
+        return false;
+      OpNos[ModelOpNo] = RealOpNo;
+    }
+  }
+  StartIndex = Shift;
+  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
+}
+
+// Create a node that performs P on operands Op0 and Op1, casting the
+// operands to the appropriate type.  The type of the result is determined by P.
+static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
+                              const Permute &P, SDValue Op0, SDValue Op1) {
+  // VPDI (PERMUTE_DWORDS) always operates on v2i64s.  The input
+  // elements of a PACK are twice as wide as the outputs.
+  unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
+                      P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
+                      P.Operand);
+  // Cast both operands to the appropriate type.
+  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
+                              SystemZ::VectorBytes / InBytes);
+  Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
+  Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
+  SDValue Op;
+  if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
+    SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32);
+    Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
+  } else if (P.Opcode == SystemZISD::PACK) {
+    MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
+                                 SystemZ::VectorBytes / P.Operand);
+    Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
+  } else {
+    Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
+  }
+  return Op;
+}
+
+// Bytes is a VPERM-like permute vector, except that -1 is used for
+// undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
+// VSLDI or VPERM.
+static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
+                                     SDValue *Ops,
+                                     const SmallVectorImpl<int> &Bytes) {
+  for (unsigned I = 0; I < 2; ++I)
+    Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
+
+  // First see whether VSLDI can be used.
+  unsigned StartIndex, OpNo0, OpNo1;
+  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
+    return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
+                       Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));
+
+  // Fall back on VPERM.  Construct an SDNode for the permute vector.
+  SDValue IndexNodes[SystemZ::VectorBytes];
+  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+    if (Bytes[I] >= 0)
+      IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
+    else
+      IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+  SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
+}
+
+namespace {
+// Describes a general N-operand vector shuffle.
+struct GeneralShuffle {
+  GeneralShuffle(EVT vt) : VT(vt) {}
+  void addUndef();
+  void add(SDValue, unsigned);
+  SDValue getNode(SelectionDAG &, const SDLoc &);
+
+  // The operands of the shuffle.
+  SmallVector<SDValue, SystemZ::VectorBytes> Ops;
+
+  // Index I is -1 if byte I of the result is undefined.  Otherwise the
+  // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
+  // Bytes[I] / SystemZ::VectorBytes.
+  SmallVector<int, SystemZ::VectorBytes> Bytes;
+
+  // The type of the shuffle result.
+  EVT VT;
+};
+}
+
+// Add an extra undefined element to the shuffle.
+void GeneralShuffle::addUndef() {
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+  for (unsigned I = 0; I < BytesPerElement; ++I)
+    Bytes.push_back(-1);
+}
+
+// Add an extra element to the shuffle, taking it from element Elem of Op.
+// A null Op indicates a vector input whose value will be calculated later;
+// there is at most one such input per shuffle and it always has the same
+// type as the result.
+void GeneralShuffle::add(SDValue Op, unsigned Elem) {
+  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
+
+  // The source vector can have wider elements than the result,
+  // either through an explicit TRUNCATE or because of type legalization.
+  // We want the least significant part.
+  EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
+  unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
+  assert(FromBytesPerElement >= BytesPerElement &&
+         "Invalid EXTRACT_VECTOR_ELT");
+  unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
+                   (FromBytesPerElement - BytesPerElement));
+
+  // Look through things like shuffles and bitcasts.
+  while (Op.getNode()) {
+    if (Op.getOpcode() == ISD::BITCAST)
+      Op = Op.getOperand(0);
+    else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
+      // See whether the bytes we need come from a contiguous part of one
+      // operand.
+      SmallVector<int, SystemZ::VectorBytes> OpBytes;
+      getVPermMask(cast<ShuffleVectorSDNode>(Op), OpBytes);
+      int NewByte;
+      if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
+        break;
+      if (NewByte < 0) {
+        addUndef();
+        return;
+      }
+      Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
+      Byte = unsigned(NewByte) % SystemZ::VectorBytes;
+    } else if (Op.isUndef()) {
+      addUndef();
+      return;
+    } else
+      break;
+  }
+
+  // Make sure that the source of the extraction is in Ops.
+  unsigned OpNo = 0;
+  for (; OpNo < Ops.size(); ++OpNo)
+    if (Ops[OpNo] == Op)
+      break;
+  if (OpNo == Ops.size())
+    Ops.push_back(Op);
+
+  // Add the element to Bytes.
+  unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
+  for (unsigned I = 0; I < BytesPerElement; ++I)
+    Bytes.push_back(Base + I);
+}
+
+// Return SDNodes for the completed shuffle.
+SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
+  assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
+
+  if (Ops.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Make sure that there are at least two shuffle operands.
+  if (Ops.size() == 1)
+    Ops.push_back(DAG.getUNDEF(MVT::v16i8));
+
+  // Create a tree of shuffles, deferring root node until after the loop.
+  // Try to redistribute the undefined elements of non-root nodes so that
+  // the non-root shuffles match something like a pack or merge, then adjust
+  // the parent node's permute vector to compensate for the new order.
+  // Among other things, this copes with vectors like <2 x i16> that were
+  // padded with undefined elements during type legalization.
+  //
+  // In the best case this redistribution will lead to the whole tree
+  // using packs and merges.  It should rarely be a loss in other cases.
+  unsigned Stride = 1;
+  for (; Stride * 2 < Ops.size(); Stride *= 2) {
+    for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
+      SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
+
+      // Create a mask for just these two operands.
+      SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
+      for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+        unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
+        unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
+        if (OpNo == I)
+          NewBytes[J] = Byte;
+        else if (OpNo == I + Stride)
+          NewBytes[J] = SystemZ::VectorBytes + Byte;
+        else
+          NewBytes[J] = -1;
+      }
+      // See if it would be better to reorganize NewMask to avoid using VPERM.
+      SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
+      if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
+        Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
+        // Applying NewBytesMap to Ops[I] gets back to NewBytes.
+        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
+          if (NewBytes[J] >= 0) {
+            assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
+                   "Invalid double permute");
+            Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
+          } else
+            assert(NewBytesMap[J] < 0 && "Invalid double permute");
+        }
+      } else {
+        // Just use NewBytes on the operands.
+        Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
+        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
+          if (NewBytes[J] >= 0)
+            Bytes[J] = I * SystemZ::VectorBytes + J;
+      }
+    }
+  }
+
+  // Now we just have 2 inputs.  Put the second operand in Ops[1].
+  if (Stride > 1) {
+    Ops[1] = Ops[Stride];
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+      if (Bytes[I] >= int(SystemZ::VectorBytes))
+        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
+  }
+
+  // Look for an instruction that can do the permute without resorting
+  // to VPERM.
+  unsigned OpNo0, OpNo1;
+  SDValue Op;
+  if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
+    Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
+  else
+    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
+static bool isScalarToVector(SDValue Op) {
+  for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
+    if (!Op.getOperand(I).isUndef())
+      return false;
+  return true;
+}
+
+// Return a vector of type VT that contains Value in the first element.
+// The other elements don't matter.
+static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                   SDValue Value) {
+  // If we have a constant, replicate it to all elements and let the
+  // BUILD_VECTOR lowering take care of it.
+  if (Value.getOpcode() == ISD::Constant ||
+      Value.getOpcode() == ISD::ConstantFP) {
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
+    return DAG.getBuildVector(VT, DL, Ops);
+  }
+  if (Value.isUndef())
+    return DAG.getUNDEF(VT);
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+}
+
+// Return a vector of type VT in which Op0 is in element 0 and Op1 is in
+// element 1.  Used for cases in which replication is cheap.
+static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                                 SDValue Op0, SDValue Op1) {
+  if (Op0.isUndef()) {
+    if (Op1.isUndef())
+      return DAG.getUNDEF(VT);
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
+  }
+  if (Op1.isUndef())
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);
+  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
+                     buildScalarToVector(DAG, DL, VT, Op0),
+                     buildScalarToVector(DAG, DL, VT, Op1));
+}
+
+// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
+// vector for them.
+static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                          SDValue Op1) {
+  if (Op0.isUndef() && Op1.isUndef())
+    return DAG.getUNDEF(MVT::v2i64);
+  // If one of the two inputs is undefined then replicate the other one,
+  // in order to avoid using another register unnecessarily.
+  if (Op0.isUndef())
+    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
+  else if (Op1.isUndef())
+    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+  else {
+    Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
+    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
+  }
+  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
+}
+
+// Try to represent constant BUILD_VECTOR node BVN using a
+// SystemZISD::BYTE_MASK-style mask.  Store the mask value in Mask
+// on success.
+static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) {
+  EVT ElemVT = BVN->getValueType(0).getVectorElementType();
+  unsigned BytesPerElement = ElemVT.getStoreSize();
+  for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) {
+    SDValue Op = BVN->getOperand(I);
+    if (!Op.isUndef()) {
+      uint64_t Value;
+      if (Op.getOpcode() == ISD::Constant)
+        Value = dyn_cast<ConstantSDNode>(Op)->getZExtValue();
+      else if (Op.getOpcode() == ISD::ConstantFP)
+        Value = (dyn_cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt()
+                 .getZExtValue());
+      else
+        return false;
+      for (unsigned J = 0; J < BytesPerElement; ++J) {
+        uint64_t Byte = (Value >> (J * 8)) & 0xff;
+        if (Byte == 0xff)
+          Mask |= 1ULL << ((E - I - 1) * BytesPerElement + J);
+        else if (Byte != 0)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Try to load a vector constant in which BitsPerElement-bit value Value
+// is replicated to fill the vector.  VT is the type of the resulting
+// constant, which may have elements of a different size from BitsPerElement.
+// Return the SDValue of the constant on success, otherwise return
+// an empty value.
+static SDValue tryBuildVectorReplicate(SelectionDAG &DAG,
+                                       const SystemZInstrInfo *TII,
+                                       const SDLoc &DL, EVT VT, uint64_t Value,
+                                       unsigned BitsPerElement) {
+  // Signed 16-bit values can be replicated using VREPI.
+  int64_t SignedValue = SignExtend64(Value, BitsPerElement);
+  if (isInt<16>(SignedValue)) {
+    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
+                                 SystemZ::VectorBits / BitsPerElement);
+    SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT,
+                             DAG.getConstant(SignedValue, DL, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+  // See whether rotating the constant left some N places gives a value that
+  // is one less than a power of 2 (i.e. all zeros followed by all ones).
+  // If so we can use VGM.
+  unsigned Start, End;
+  if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) {
+    // isRxSBGMask returns the bit numbers for a full 64-bit value,
+    // with 0 denoting 1 << 63 and 63 denoting 1.  Convert them to
+    // bit numbers for an BitsPerElement value, so that 0 denotes
+    // 1 << (BitsPerElement-1).
+    Start -= 64 - BitsPerElement;
+    End -= 64 - BitsPerElement;
+    MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement),
+                                 SystemZ::VectorBits / BitsPerElement);
+    SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT,
+                             DAG.getConstant(Start, DL, MVT::i32),
+                             DAG.getConstant(End, DL, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+  return SDValue();
+}
+
+// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
+// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
+// the non-EXTRACT_VECTOR_ELT elements.  See if the given BUILD_VECTOR
+// would benefit from this representation and return it if so.
+static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
+                                     BuildVectorSDNode *BVN) {
+  EVT VT = BVN->getValueType(0);
+  unsigned NumElements = VT.getVectorNumElements();
+
+  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
+  // on byte vectors.  If there are non-EXTRACT_VECTOR_ELT elements that still
+  // need a BUILD_VECTOR, add an additional placeholder operand for that
+  // BUILD_VECTOR and store its operands in ResidueOps.
+  GeneralShuffle GS(VT);
+  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
+  bool FoundOne = false;
+  for (unsigned I = 0; I < NumElements; ++I) {
+    SDValue Op = BVN->getOperand(I);
+    if (Op.getOpcode() == ISD::TRUNCATE)
+      Op = Op.getOperand(0);
+    if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op.getOperand(1).getOpcode() == ISD::Constant) {
+      unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      GS.add(Op.getOperand(0), Elem);
+      FoundOne = true;
+    } else if (Op.isUndef()) {
+      GS.addUndef();
+    } else {
+      GS.add(SDValue(), ResidueOps.size());
+      ResidueOps.push_back(BVN->getOperand(I));
+    }
+  }
+
+  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
+  if (!FoundOne)
+    return SDValue();
+
+  // Create the BUILD_VECTOR for the remaining elements, if any.
+  if (!ResidueOps.empty()) {
+    while (ResidueOps.size() < NumElements)
+      ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
+    for (auto &Op : GS.Ops) {
+      if (!Op.getNode()) {
+        Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
+        break;
+      }
+    }
+  }
+  return GS.getNode(DAG, SDLoc(BVN));
+}
+
+// Combine GPR scalar values Elems into a vector of type VT.
+static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+                           SmallVectorImpl<SDValue> &Elems) {
+  // See whether there is a single replicated value.
+  SDValue Single;
+  unsigned int NumElements = Elems.size();
+  unsigned int Count = 0;
+  for (auto Elem : Elems) {
+    if (!Elem.isUndef()) {
+      if (!Single.getNode())
+        Single = Elem;
+      else if (Elem != Single) {
+        Single = SDValue();
+        break;
+      }
+      Count += 1;
+    }
+  }
+  // There are three cases here:
+  //
+  // - if the only defined element is a loaded one, the best sequence
+  //   is a replicating load.
+  //
+  // - otherwise, if the only defined element is an i64 value, we will
+  //   end up with the same VLVGP sequence regardless of whether we short-cut
+  //   for replication or fall through to the later code.
+  //
+  // - otherwise, if the only defined element is an i32 or smaller value,
+  //   we would need 2 instructions to replicate it: VLVGP followed by VREPx.
+  //   This is only a win if the single defined element is used more than once.
+  //   In other cases we're better off using a single VLVGx.
+  if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
+    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
+
+  // The best way of building a v2i64 from two i64s is to use VLVGP.
+  if (VT == MVT::v2i64)
+    return joinDwords(DAG, DL, Elems[0], Elems[1]);
+
+  // Use a 64-bit merge high to combine two doubles.
+  if (VT == MVT::v2f64)
+    return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+
+  // Build v4f32 values directly from the FPRs:
+  //
+  //   <Axxx> <Bxxx> <Cxxxx> <Dxxx>
+  //         V              V         VMRHF
+  //      <ABxx>         <CDxx>
+  //                V                 VMRHG
+  //              <ABCD>
+  if (VT == MVT::v4f32) {
+    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
+    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
+    // Avoid unnecessary undefs by reusing the other operand.
+    if (Op01.isUndef())
+      Op01 = Op23;
+    else if (Op23.isUndef())
+      Op23 = Op01;
+    // Merging identical replications is a no-op.
+    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+      return Op01;
+    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
+    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
+    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+                             DL, MVT::v2i64, Op01, Op23);
+    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+  }
+
+  // Collect the constant terms.
+  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
+  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
+
+  unsigned NumConstants = 0;
+  for (unsigned I = 0; I < NumElements; ++I) {
+    SDValue Elem = Elems[I];
+    if (Elem.getOpcode() == ISD::Constant ||
+        Elem.getOpcode() == ISD::ConstantFP) {
+      NumConstants += 1;
+      Constants[I] = Elem;
+      Done[I] = true;
+    }
+  }
+  // If there was at least one constant, fill in the other elements of
+  // Constants with undefs to get a full vector constant and use that
+  // as the starting point.
+  SDValue Result;
+  if (NumConstants > 0) {
+    for (unsigned I = 0; I < NumElements; ++I)
+      if (!Constants[I].getNode())
+        Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
+    Result = DAG.getBuildVector(VT, DL, Constants);
+  } else {
+    // Otherwise try to use VLVGP to start the sequence in order to
+    // avoid a false dependency on any previous contents of the vector
+    // register.  This only makes sense if one of the associated elements
+    // is defined.
+    unsigned I1 = NumElements / 2 - 1;
+    unsigned I2 = NumElements - 1;
+    bool Def1 = !Elems[I1].isUndef();
+    bool Def2 = !Elems[I2].isUndef();
+    if (Def1 || Def2) {
+      SDValue Elem1 = Elems[Def1 ? I1 : I2];
+      SDValue Elem2 = Elems[Def2 ? I2 : I1];
+      Result = DAG.getNode(ISD::BITCAST, DL, VT,
+                           joinDwords(DAG, DL, Elem1, Elem2));
+      Done[I1] = true;
+      Done[I2] = true;
+    } else
+      Result = DAG.getUNDEF(VT);
+  }
+
+  // Use VLVGx to insert the other elements.
+  for (unsigned I = 0; I < NumElements; ++I)
+    if (!Done[I] && !Elems[I].isUndef())
+      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
+                           DAG.getConstant(I, DL, MVT::i32));
+  return Result;
+}
+
+SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const SystemZInstrInfo *TII =
+    static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  if (BVN->isConstant()) {
+    // Try using VECTOR GENERATE BYTE MASK.  This is the architecturally-
+    // preferred way of creating all-zero and all-one vectors so give it
+    // priority over other methods below.
+    uint64_t Mask = 0;
+    if (tryBuildVectorByteMask(BVN, Mask)) {
+      SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8,
+                               DAG.getConstant(Mask, DL, MVT::i32));
+      return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+    }
+
+    // Try using some form of replication.
+    APInt SplatBits, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, true) &&
+        SplatBitSize <= 64) {
+      // First try assuming that any undefined bits above the highest set bit
+      // and below the lowest set bit are 1s.  This increases the likelihood of
+      // being able to use a sign-extended element value in VECTOR REPLICATE
+      // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
+      uint64_t SplatBitsZ = SplatBits.getZExtValue();
+      uint64_t SplatUndefZ = SplatUndef.getZExtValue();
+      uint64_t Lower = (SplatUndefZ
+                        & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
+      uint64_t Upper = (SplatUndefZ
+                        & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
+      uint64_t Value = SplatBitsZ | Upper | Lower;
+      SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value,
+                                           SplatBitSize);
+      if (Op.getNode())
+        return Op;
+
+      // Now try assuming that any undefined bits between the first and
+      // last defined set bits are set.  This increases the chances of
+      // using a non-wraparound mask.
+      uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
+      Value = SplatBitsZ | Middle;
+      Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize);
+      if (Op.getNode())
+        return Op;
+    }
+
+    // Fall back to loading it from memory.
+    return SDValue();
+  }
+
+  // See if we should use shuffles to construct the vector from other vectors.
+  if (SDValue Res = tryBuildVectorShuffle(DAG, BVN))
+    return Res;
+
+  // Detect SCALAR_TO_VECTOR conversions.
+  if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
+    return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));
+
+  // Otherwise use buildVector to build the vector up from GPRs.
+  unsigned NumElements = Op.getNumOperands();
+  SmallVector<SDValue, SystemZ::VectorBytes> Ops(NumElements);
+  for (unsigned I = 0; I < NumElements; ++I)
+    Ops[I] = Op.getOperand(I);
+  return buildVector(DAG, DL, VT, Ops);
+}
+
+SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElements = VT.getVectorNumElements();
+
+  if (VSN->isSplat()) {
+    SDValue Op0 = Op.getOperand(0);
+    unsigned Index = VSN->getSplatIndex();
+    assert(Index < VT.getVectorNumElements() &&
+           "Splat index should be defined and in first operand");
+    // See whether the value we're splatting is directly available as a scalar.
+    if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+        Op0.getOpcode() == ISD::BUILD_VECTOR)
+      return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
+    // Otherwise keep it as a vector-to-vector operation.
+    return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
+                       DAG.getConstant(Index, DL, MVT::i32));
+  }
+
+  GeneralShuffle GS(VT);
+  for (unsigned I = 0; I < NumElements; ++I) {
+    int Elt = VSN->getMaskElt(I);
+    if (Elt < 0)
+      GS.addUndef();
+    else
+      GS.add(Op.getOperand(unsigned(Elt) / NumElements),
+             unsigned(Elt) % NumElements);
+  }
+  return GS.getNode(DAG, SDLoc(VSN));
+}
+
+SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  // Just insert the scalar into element 0 of an undefined vector.
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                     Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
+                     Op.getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+}
+
+SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  // Handle insertions of floating-point values.
+  SDLoc DL(Op);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  EVT VT = Op.getValueType();
+
+  // Insertions into constant indices of a v2f64 can be done using VPDI.
+  // However, if the inserted value is a bitcast or a constant then it's
+  // better to use GPRs, as below.
+  if (VT == MVT::v2f64 &&
+      Op1.getOpcode() != ISD::BITCAST &&
+      Op1.getOpcode() != ISD::ConstantFP &&
+      Op2.getOpcode() == ISD::Constant) {
+    uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
+    unsigned Mask = VT.getVectorNumElements() - 1;
+    if (Index <= Mask)
+      return Op;
+  }
+
+  // Otherwise bitcast to the equivalent integer form and insert via a GPR.
+  MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
+  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
+                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0),
+                            DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
+}
+
+SDValue
+SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // Handle extractions of floating-point values.
+  SDLoc DL(Op);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  EVT VecVT = Op0.getValueType();
+
+  // Extractions of constant indices can be done directly.
+  if (auto *CIndexN = dyn_cast<ConstantSDNode>(Op1)) {
+    uint64_t Index = CIndexN->getZExtValue();
+    unsigned Mask = VecVT.getVectorNumElements() - 1;
+    if (Index <= Mask)
+      return Op;
+  }
+
+  // Otherwise bitcast to the equivalent integer form and extract via a GPR.
+  MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+  MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
+  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT,
+                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
+}
+
+SDValue
+SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                              unsigned UnpackHigh) const {
+  SDValue PackedOp = Op.getOperand(0);
+  EVT OutVT = Op.getValueType();
+  EVT InVT = PackedOp.getValueType();
+  unsigned ToBits = OutVT.getScalarSizeInBits();
+  unsigned FromBits = InVT.getScalarSizeInBits();
+  do {
+    FromBits *= 2;
+    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
+                                 SystemZ::VectorBits / FromBits);
+    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+  } while (FromBits != ToBits);
+  return PackedOp;
+}
+
+SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
+                                          unsigned ByScalar) const {
+  // Look for cases where a vector shift can use the *_BY_SCALAR form.
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned ElemBitSize = VT.getScalarSizeInBits();
+
+  // See whether the shift vector is a splat represented as BUILD_VECTOR.
+  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) {
+    APInt SplatBits, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    // Check for constant splats.  Use ElemBitSize as the minimum element
+    // width and reject splats that need wider elements.
+    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             ElemBitSize, true) &&
+        SplatBitSize == ElemBitSize) {
+      SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
+                                      DL, MVT::i32);
+      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
+    }
+    // Check for variable splats.
+    BitVector UndefElements;
+    SDValue Splat = BVN->getSplatValue(&UndefElements);
+    if (Splat) {
+      // Since i32 is the smallest legal type, we either need a no-op
+      // or a truncation.
+      SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat);
+      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
+    }
+  }
+
+  // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
+  // and the shift amount is directly available in a GPR.
+  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) {
+    if (VSN->isSplat()) {
+      SDValue VSNOp0 = VSN->getOperand(0);
+      unsigned Index = VSN->getSplatIndex();
+      assert(Index < VT.getVectorNumElements() &&
+             "Splat index should be defined and in first operand");
+      if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
+          VSNOp0.getOpcode() == ISD::BUILD_VECTOR) {
+        // Since i32 is the smallest legal type, we either need a no-op
+        // or a truncation.
+        SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+                                    VSNOp0.getOperand(Index));
+        return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
+      }
+    }
+  }
+
+  // Otherwise just treat the current form as legal.
+  return Op;
+}
+
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::FRAMEADDR:
+    return lowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return lowerRETURNADDR(Op, DAG);
+  case ISD::BR_CC:
+    return lowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:
+    return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return lowerSETCC(Op, DAG);
+  case ISD::GlobalAddress:
+    return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
+  case ISD::GlobalTLSAddress:
+    return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
+  case ISD::BlockAddress:
+    return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
+  case ISD::JumpTable:
+    return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
+  case ISD::ConstantPool:
+    return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
+  case ISD::BITCAST:
+    return lowerBITCAST(Op, DAG);
+  case ISD::VASTART:
+    return lowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return lowerVACOPY(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC:
+    return lowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::GET_DYNAMIC_AREA_OFFSET:
+    return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return lowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:
+    return lowerUMUL_LOHI(Op, DAG);
+  case ISD::SDIVREM:
+    return lowerSDIVREM(Op, DAG);
+  case ISD::UDIVREM:
+    return lowerUDIVREM(Op, DAG);
+  case ISD::OR:
+    return lowerOR(Op, DAG);
+  case ISD::CTPOP:
+    return lowerCTPOP(Op, DAG);
+  case ISD::ATOMIC_FENCE:
+    return lowerATOMIC_FENCE(Op, DAG);
+  case ISD::ATOMIC_SWAP:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
+  case ISD::ATOMIC_STORE:
+    return lowerATOMIC_STORE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+    return lowerATOMIC_LOAD(Op, DAG);
+  case ISD::ATOMIC_LOAD_ADD:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
+  case ISD::ATOMIC_LOAD_SUB:
+    return lowerATOMIC_LOAD_SUB(Op, DAG);
+  case ISD::ATOMIC_LOAD_AND:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
+  case ISD::ATOMIC_LOAD_OR:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
+  case ISD::ATOMIC_LOAD_XOR:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
+  case ISD::ATOMIC_LOAD_NAND:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
+  case ISD::ATOMIC_LOAD_MIN:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
+  case ISD::ATOMIC_LOAD_MAX:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
+  case ISD::ATOMIC_LOAD_UMIN:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
+  case ISD::ATOMIC_LOAD_UMAX:
+    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
+  case ISD::ATOMIC_CMP_SWAP:
+    return lowerATOMIC_CMP_SWAP(Op, DAG);
+  case ISD::STACKSAVE:
+    return lowerSTACKSAVE(Op, DAG);
+  case ISD::STACKRESTORE:
+    return lowerSTACKRESTORE(Op, DAG);
+  case ISD::PREFETCH:
+    return lowerPREFETCH(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return lowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:
+    return lowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return lowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+  case ISD::SHL:
+    return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
+  case ISD::SRL:
+    return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
+  case ISD::SRA:
+    return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
+  default:
+    llvm_unreachable("Unexpected node to lower");
+  }
+}
+
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
+  switch ((SystemZISD::NodeType)Opcode) {
+    case SystemZISD::FIRST_NUMBER: break;
+    OPCODE(RET_FLAG);
+    OPCODE(CALL);
+    OPCODE(SIBCALL);
+    OPCODE(TLS_GDCALL);
+    OPCODE(TLS_LDCALL);
+    OPCODE(PCREL_WRAPPER);
+    OPCODE(PCREL_OFFSET);
+    OPCODE(IABS);
+    OPCODE(ICMP);
+    OPCODE(FCMP);
+    OPCODE(TM);
+    OPCODE(BR_CCMASK);
+    OPCODE(SELECT_CCMASK);
+    OPCODE(ADJDYNALLOC);
+    OPCODE(POPCNT);
+    OPCODE(UMUL_LOHI64);
+    OPCODE(SDIVREM32);
+    OPCODE(SDIVREM64);
+    OPCODE(UDIVREM32);
+    OPCODE(UDIVREM64);
+    OPCODE(MVC);
+    OPCODE(MVC_LOOP);
+    OPCODE(NC);
+    OPCODE(NC_LOOP);
+    OPCODE(OC);
+    OPCODE(OC_LOOP);
+    OPCODE(XC);
+    OPCODE(XC_LOOP);
+    OPCODE(CLC);
+    OPCODE(CLC_LOOP);
+    OPCODE(STPCPY);
+    OPCODE(STRCMP);
+    OPCODE(SEARCH_STRING);
+    OPCODE(IPM);
+    OPCODE(SERIALIZE);
+    OPCODE(MEMBARRIER);
+    OPCODE(TBEGIN);
+    OPCODE(TBEGIN_NOFLOAT);
+    OPCODE(TEND);
+    OPCODE(BYTE_MASK);
+    OPCODE(ROTATE_MASK);
+    OPCODE(REPLICATE);
+    OPCODE(JOIN_DWORDS);
+    OPCODE(SPLAT);
+    OPCODE(MERGE_HIGH);
+    OPCODE(MERGE_LOW);
+    OPCODE(SHL_DOUBLE);
+    OPCODE(PERMUTE_DWORDS);
+    OPCODE(PERMUTE);
+    OPCODE(PACK);
+    OPCODE(PACKS_CC);
+    OPCODE(PACKLS_CC);
+    OPCODE(UNPACK_HIGH);
+    OPCODE(UNPACKL_HIGH);
+    OPCODE(UNPACK_LOW);
+    OPCODE(UNPACKL_LOW);
+    OPCODE(VSHL_BY_SCALAR);
+    OPCODE(VSRL_BY_SCALAR);
+    OPCODE(VSRA_BY_SCALAR);
+    OPCODE(VSUM);
+    OPCODE(VICMPE);
+    OPCODE(VICMPH);
+    OPCODE(VICMPHL);
+    OPCODE(VICMPES);
+    OPCODE(VICMPHS);
+    OPCODE(VICMPHLS);
+    OPCODE(VFCMPE);
+    OPCODE(VFCMPH);
+    OPCODE(VFCMPHE);
+    OPCODE(VFCMPES);
+    OPCODE(VFCMPHS);
+    OPCODE(VFCMPHES);
+    OPCODE(VFTCI);
+    OPCODE(VEXTEND);
+    OPCODE(VROUND);
+    OPCODE(VTM);
+    OPCODE(VFAE_CC);
+    OPCODE(VFAEZ_CC);
+    OPCODE(VFEE_CC);
+    OPCODE(VFEEZ_CC);
+    OPCODE(VFENE_CC);
+    OPCODE(VFENEZ_CC);
+    OPCODE(VISTR_CC);
+    OPCODE(VSTRC_CC);
+    OPCODE(VSTRCZ_CC);
+    OPCODE(TDC);
+    OPCODE(ATOMIC_SWAPW);
+    OPCODE(ATOMIC_LOADW_ADD);
+    OPCODE(ATOMIC_LOADW_SUB);
+    OPCODE(ATOMIC_LOADW_AND);
+    OPCODE(ATOMIC_LOADW_OR);
+    OPCODE(ATOMIC_LOADW_XOR);
+    OPCODE(ATOMIC_LOADW_NAND);
+    OPCODE(ATOMIC_LOADW_MIN);
+    OPCODE(ATOMIC_LOADW_MAX);
+    OPCODE(ATOMIC_LOADW_UMIN);
+    OPCODE(ATOMIC_LOADW_UMAX);
+    OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(LRV);
+    OPCODE(STRV);
+    OPCODE(PREFETCH);
+  }
+  return nullptr;
+#undef OPCODE
+}
+
+// Return true if VT is a vector whose elements are a whole number of bytes
+// in width.
+static bool canTreatAsByteVector(EVT VT) {
+  return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0;
+}
+
+// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
+// producing a result of type ResVT.  Op is a possibly bitcast version
+// of the input vector and Index is the index (based on type VecVT) that
+// should be extracted.  Return the new extraction if a simplification
+// was possible or if Force is true.
+SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
+                                              EVT VecVT, SDValue Op,
+                                              unsigned Index,
+                                              DAGCombinerInfo &DCI,
+                                              bool Force) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  // The number of bytes being extracted.
+  unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
+
+  for (;;) {
+    unsigned Opcode = Op.getOpcode();
+    if (Opcode == ISD::BITCAST)
+      // Look through bitcasts.
+      Op = Op.getOperand(0);
+    else if (Opcode == ISD::VECTOR_SHUFFLE &&
+             canTreatAsByteVector(Op.getValueType())) {
+      // Get a VPERM-like permute mask and see whether the bytes covered
+      // by the extracted element are a contiguous sequence from one
+      // source operand.
+      SmallVector<int, SystemZ::VectorBytes> Bytes;
+      getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
+      int First;
+      if (!getShuffleInput(Bytes, Index * BytesPerElement,
+                           BytesPerElement, First))
+        break;
+      if (First < 0)
+        return DAG.getUNDEF(ResVT);
+      // Make sure the contiguous sequence starts at a multiple of the
+      // original element size.
+      unsigned Byte = unsigned(First) % Bytes.size();
+      if (Byte % BytesPerElement != 0)
+        break;
+      // We can get the extracted value directly from an input.
+      Index = Byte / BytesPerElement;
+      Op = Op.getOperand(unsigned(First) / Bytes.size());
+      Force = true;
+    } else if (Opcode == ISD::BUILD_VECTOR &&
+               canTreatAsByteVector(Op.getValueType())) {
+      // We can only optimize this case if the BUILD_VECTOR elements are
+      // at least as wide as the extracted value.
+      EVT OpVT = Op.getValueType();
+      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+      if (OpBytesPerElement < BytesPerElement)
+        break;
+      // Make sure that the least-significant bit of the extracted value
+      // is the least significant bit of an input.
+      unsigned End = (Index + 1) * BytesPerElement;
+      if (End % OpBytesPerElement != 0)
+        break;
+      // We're extracting the low part of one operand of the BUILD_VECTOR.
+      Op = Op.getOperand(End / OpBytesPerElement - 1);
+      if (!Op.getValueType().isInteger()) {
+        EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
+        Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+        DCI.AddToWorklist(Op.getNode());
+      }
+      EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
+      Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+      if (VT != ResVT) {
+        DCI.AddToWorklist(Op.getNode());
+        Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
+      }
+      return Op;
+    } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+                Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+                Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+               canTreatAsByteVector(Op.getValueType()) &&
+               canTreatAsByteVector(Op.getOperand(0).getValueType())) {
+      // Make sure that only the unextended bits are significant.
+      EVT ExtVT = Op.getValueType();
+      EVT OpVT = Op.getOperand(0).getValueType();
+      unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
+      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
+      unsigned Byte = Index * BytesPerElement;
+      unsigned SubByte = Byte % ExtBytesPerElement;
+      unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
+      if (SubByte < MinSubByte ||
+          SubByte + BytesPerElement > ExtBytesPerElement)
+        break;
+      // Get the byte offset of the unextended element
+      Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
+      // ...then add the byte offset relative to that element.
+      Byte += SubByte - MinSubByte;
+      if (Byte % BytesPerElement != 0)
+        break;
+      Op = Op.getOperand(0);
+      Index = Byte / BytesPerElement;
+      Force = true;
+    } else
+      break;
+  }
+  if (Force) {
+    if (Op.getValueType() != VecVT) {
+      Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
+      DCI.AddToWorklist(Op.getNode());
+    }
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
+                       DAG.getConstant(Index, DL, MVT::i32));
+  }
+  return SDValue();
+}
+
+// Optimize vector operations in scalar value Op on the basis that Op
+// is truncated to TruncVT.
+SDValue SystemZTargetLowering::combineTruncateExtract(
+    const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
+  // If we have (trunc (extract_vector_elt X, Y)), try to turn it into
+  // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
+  // of type TruncVT.
+  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      TruncVT.getSizeInBits() % 8 == 0) {
+    SDValue Vec = Op.getOperand(0);
+    EVT VecVT = Vec.getValueType();
+    if (canTreatAsByteVector(VecVT)) {
+      if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+        unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
+        unsigned TruncBytes = TruncVT.getStoreSize();
+        if (BytesPerElement % TruncBytes == 0) {
+          // Calculate the value of Y' in the above description.  We are
+          // splitting the original elements into Scale equal-sized pieces
+          // and for truncation purposes want the last (least-significant)
+          // of these pieces for IndexN.  This is easiest to do by calculating
+          // the start index of the following element and then subtracting 1.
+          unsigned Scale = BytesPerElement / TruncBytes;
+          unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
+
+          // Defer the creation of the bitcast from X to combineExtract,
+          // which might be able to optimize the extraction.
+          VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
+                                   VecVT.getStoreSize() / TruncBytes);
+          EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
+          return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSIGN_EXTEND(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  // Convert (sext (ashr (shl X, C1), C2)) to
+  // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
+  // cheap as narrower ones.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
+    auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    SDValue Inner = N0.getOperand(0);
+    if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
+      if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
+        unsigned Extra = (VT.getSizeInBits() - N0.getValueSizeInBits());
+        unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
+        unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
+        EVT ShiftVT = N0.getOperand(1).getValueType();
+        SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
+                                  Inner.getOperand(0));
+        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
+                                  DAG.getConstant(NewShlAmt, SDLoc(Inner),
+                                                  ShiftVT));
+        return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
+                           DAG.getConstant(NewSraAmt, SDLoc(N0), ShiftVT));
+      }
+    }
+  }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineMERGE(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Opcode = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+  if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
+      cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
+    // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
+    // for v4f32.
+    if (Op1 == N->getOperand(0))
+      return Op1;
+    // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
+    EVT VT = Op1.getValueType();
+    unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
+    if (ElemBytes <= 4) {
+      Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
+                SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
+      EVT InVT = VT.changeVectorElementTypeToInteger();
+      EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
+                                   SystemZ::VectorBytes / ElemBytes / 2);
+      if (VT != InVT) {
+        Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
+        DCI.AddToWorklist(Op1.getNode());
+      }
+      SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
+      DCI.AddToWorklist(Op.getNode());
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
+    }
+  }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSTORE(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  auto *SN = cast<StoreSDNode>(N);
+  auto &Op1 = N->getOperand(1);
+  EVT MemVT = SN->getMemoryVT();
+  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
+  // for the extraction to be done on a vMiN value, so that we can use VSTE.
+  // If X has wider elements then convert it to:
+  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
+  if (MemVT.isInteger()) {
+    if (SDValue Value =
+            combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
+      DCI.AddToWorklist(Value.getNode());
+
+      // Rewrite the store with the new form of stored value.
+      return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
+                               SN->getBasePtr(), SN->getMemoryVT(),
+                               SN->getMemOperand());
+    }
+  }
+  // Combine STORE (BSWAP) into STRVH/STRV/STRVG
+  // See comment in combineBSWAP about volatile accesses.
+  if (!SN->isVolatile() &&
+      Op1.getOpcode() == ISD::BSWAP &&
+      Op1.getNode()->hasOneUse() &&
+      (Op1.getValueType() == MVT::i16 ||
+       Op1.getValueType() == MVT::i32 ||
+       Op1.getValueType() == MVT::i64)) {
+
+      SDValue BSwapOp = Op1.getOperand(0);
+
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
+
+      SDValue Ops[] = {
+        N->getOperand(0), BSwapOp, N->getOperand(2),
+        DAG.getValueType(Op1.getValueType())
+      };
+
+      return
+        DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N), DAG.getVTList(MVT::Other),
+                                Ops, MemVT, SN->getMemOperand());
+    }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  // Try to simplify a vector extraction.
+  if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    SDValue Op0 = N->getOperand(0);
+    EVT VecVT = Op0.getValueType();
+    return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
+                          IndexN->getZExtValue(), DCI, false);
+  }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineJOIN_DWORDS(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  // (join_dwords X, X) == (replicate X)
+  if (N->getOperand(0) == N->getOperand(1))
+    return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
+                       N->getOperand(0));
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineFP_ROUND(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  // (fpround (extract_vector_elt X 0))
+  // (fpround (extract_vector_elt X 1)) ->
+  // (extract_vector_elt (VROUND X) 0)
+  // (extract_vector_elt (VROUND X) 1)
+  //
+  // This is a special case since the target doesn't really support v2f32s.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op0 = N->getOperand(0);
+  if (N->getValueType(0) == MVT::f32 &&
+      Op0.hasOneUse() &&
+      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      Op0.getOperand(0).getValueType() == MVT::v2f64 &&
+      Op0.getOperand(1).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
+    SDValue Vec = Op0.getOperand(0);
+    for (auto *U : Vec->uses()) {
+      if (U != Op0.getNode() &&
+          U->hasOneUse() &&
+          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          U->getOperand(0) == Vec &&
+          U->getOperand(1).getOpcode() == ISD::Constant &&
+          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
+        SDValue OtherRound = SDValue(*U->use_begin(), 0);
+        if (OtherRound.getOpcode() == ISD::FP_ROUND &&
+            OtherRound.getOperand(0) == SDValue(U, 0) &&
+            OtherRound.getValueType() == MVT::f32) {
+          SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
+                                       MVT::v4f32, Vec);
+          DCI.AddToWorklist(VRound.getNode());
+          SDValue Extract1 =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
+                        VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
+          DCI.AddToWorklist(Extract1.getNode());
+          DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
+          SDValue Extract0 =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
+                        VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
+          return Extract0;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineBSWAP(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  // Combine BSWAP (LOAD) into LRVH/LRV/LRVG
+  // These loads are allowed to access memory multiple times, and so we must check
+  // that the loads are not volatile before performing the combine.
+  if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+      N->getOperand(0).hasOneUse() &&
+      (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i32 ||
+       N->getValueType(0) == MVT::i64) &&
+       !cast<LoadSDNode>(N->getOperand(0))->isVolatile()) {
+      SDValue Load = N->getOperand(0);
+      LoadSDNode *LD = cast<LoadSDNode>(Load);
+
+      // Create the byte-swapping load.
+      SDValue Ops[] = {
+        LD->getChain(),    // Chain
+        LD->getBasePtr(),  // Ptr
+        DAG.getValueType(N->getValueType(0)) // VT
+      };
+      SDValue BSLoad =
+        DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
+                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                              MVT::i64 : MVT::i32, MVT::Other),
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
+
+      // If this is an i16 load, insert the truncate.
+      SDValue ResVal = BSLoad;
+      if (N->getValueType(0) == MVT::i16)
+        ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
+
+      // First, combine the bswap away.  This makes the value produced by the
+      // load dead.
+      DCI.CombineTo(N, ResVal);
+
+      // Next, combine the load away, we give it a bogus result value but a real
+      // chain result.  The result value is dead because the bswap is dead.
+      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+      // Return N so it doesn't get rechecked!
+      return SDValue(N, 0);
+    }
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::combineSHIFTROT(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Shift/rotate instructions only use the last 6 bits of the second operand
+  // register. If the second operand is the result of an AND with an immediate
+  // value that has its last 6 bits set, we can safely remove the AND operation.
+  //
+  // If the AND operation doesn't have the last 6 bits set, we can't remove it
+  // entirely, but we can still truncate it to a 16-bit value. This prevents
+  // us from ending up with a NILL with a signed operand, which will cause the
+  // instruction printer to abort.
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() == ISD::AND) {
+    SDValue AndMaskOp = N1->getOperand(1);
+    auto *AndMask = dyn_cast<ConstantSDNode>(AndMaskOp);
+
+    // The AND mask is constant
+    if (AndMask) {
+      auto AmtVal = AndMask->getZExtValue();
+      
+      // Bottom 6 bits are set
+      if ((AmtVal & 0x3f) == 0x3f) {
+        SDValue AndOp = N1->getOperand(0);
+
+        // This is the only use, so remove the node
+        if (N1.hasOneUse()) {
+          // Combine the AND away
+          DCI.CombineTo(N1.getNode(), AndOp);
+
+          // Return N so it isn't rechecked
+          return SDValue(N, 0);
+
+        // The node will be reused, so create a new node for this one use
+        } else {
+          SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
+                                        N->getValueType(0), N->getOperand(0),
+                                        AndOp);
+          DCI.AddToWorklist(Replace.getNode());
+
+          return Replace;
+        }
+
+      // We can't remove the AND, but we can use NILL here (normally we would
+      // use NILF). Only keep the last 16 bits of the mask. The actual
+      // transformation will be handled by .td definitions.
+      } else if (AmtVal >> 16 != 0) {
+        SDValue AndOp = N1->getOperand(0);
+
+        auto NewMask = DAG.getConstant(AndMask->getZExtValue() & 0x0000ffff,
+                                       SDLoc(AndMaskOp),
+                                       AndMaskOp.getValueType());
+
+        auto NewAnd = DAG.getNode(N1.getOpcode(), SDLoc(N1), N1.getValueType(),
+                                  AndOp, NewMask);
+
+        SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
+                                      N->getValueType(0), N->getOperand(0),
+                                      NewAnd);
+        DCI.AddToWorklist(Replace.getNode());
+
+        return Replace;
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  switch(N->getOpcode()) {
+  default: break;
+  case ISD::SIGN_EXTEND:        return combineSIGN_EXTEND(N, DCI);
+  case SystemZISD::MERGE_HIGH:
+  case SystemZISD::MERGE_LOW:   return combineMERGE(N, DCI);
+  case ISD::STORE:              return combineSTORE(N, DCI);
+  case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
+  case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
+  case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
+  case ISD::BSWAP:              return combineBSWAP(N, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:               return combineSHIFTROT(N, DCI);
+  }
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Custom insertion
+//===----------------------------------------------------------------------===//
+
+// Create a new basic block after MBB.
+static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+  return NewMBB;
+}
+
+// Split MBB after MI and return the new block (the one that contains
+// instructions after MI).
+static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
+                                          MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Force base value Base into a register before MI.  Return the register.
+static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (Base.isReg())
+    return Base.getReg();
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
+      .addOperand(Base)
+      .addImm(0)
+      .addReg(0);
+  return Reg;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitSelect(MachineInstr &MI,
+                                  MachineBasicBlock *MBB,
+                                  unsigned LOCROpcode) const {
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned TrueReg = MI.getOperand(1).getReg();
+  unsigned FalseReg = MI.getOperand(2).getReg();
+  unsigned CCValid = MI.getOperand(3).getImm();
+  unsigned CCMask = MI.getOperand(4).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Use LOCROpcode if possible.
+  if (LOCROpcode && Subtarget.hasLoadStoreOnCond()) {
+    BuildMI(*MBB, MI, DL, TII->get(LOCROpcode), DestReg)
+      .addReg(FalseReg).addReg(TrueReg)
+      .addImm(CCValid).addImm(CCMask);
+    MI.eraseFromParent();
+    return MBB;
+  }
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  MBB->addSuccessor(JoinMBB);
+
+  //  JoinMBB:
+  //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
+  //  ...
+  MBB = JoinMBB;
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
+    .addReg(TrueReg).addMBB(StartMBB)
+    .addReg(FalseReg).addMBB(FalseMBB);
+
+  MI.eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
+// StoreOpcode is the store to use and Invert says whether the store should
+// happen when the condition is false rather than true.  If a STORE ON
+// CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
+MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
+                                                        MachineBasicBlock *MBB,
+                                                        unsigned StoreOpcode,
+                                                        unsigned STOCOpcode,
+                                                        bool Invert) const {
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+
+  unsigned SrcReg = MI.getOperand(0).getReg();
+  MachineOperand Base = MI.getOperand(1);
+  int64_t Disp = MI.getOperand(2).getImm();
+  unsigned IndexReg = MI.getOperand(3).getReg();
+  unsigned CCValid = MI.getOperand(4).getImm();
+  unsigned CCMask = MI.getOperand(5).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Use STOCOpcode if possible.  We could use different store patterns in
+  // order to avoid matching the index register, but the performance trade-offs
+  // might be more complicated in that case.
+  if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
+    if (Invert)
+      CCMask ^= CCValid;
+    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
+      .addReg(SrcReg).addOperand(Base).addImm(Disp)
+      .addImm(CCValid).addImm(CCMask);
+    MI.eraseFromParent();
+    return MBB;
+  }
+
+  // Get the condition needed to branch around the store.
+  if (!Invert)
+    CCMask ^= CCValid;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  BuildMI(MBB, DL, TII->get(StoreOpcode))
+    .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+  MBB->addSuccessor(JoinMBB);
+
+  MI.eraseFromParent();
+  return JoinMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
+// or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
+// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
+// BitSize is the width of the field in bits, or 0 if this is a partword
+// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
+// is one of the operands.  Invert says whether the field should be
+// inverted after performing BinOpcode (e.g. for NAND).
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
+    unsigned BitSize, bool Invert) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool IsSubWord = (BitSize < 32);
+
+  // Extract the operands.  Base can be a register or a frame index.
+  // Src2 can be a register or immediate.
+  unsigned Dest = MI.getOperand(0).getReg();
+  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+  int64_t Disp = MI.getOperand(2).getImm();
+  MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
+  unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+  unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+  DebugLoc DL = MI.getDebugLoc();
+  if (IsSubWord)
+    BitSize = MI.getOperand(6).getImm();
+
+  // Subword operations use 32-bit registers.
+  const TargetRegisterClass *RC = (BitSize <= 32 ?
+                                   &SystemZ::GR32BitRegClass :
+                                   &SystemZ::GR64BitRegClass);
+  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
+  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+  // Get the right opcodes for the displacement.
+  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
+  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigVal       = MRI.createVirtualRegister(RC);
+  unsigned OldVal        = MRI.createVirtualRegister(RC);
+  unsigned NewVal        = (BinOpcode || IsSubWord ?
+                            MRI.createVirtualRegister(RC) : Src2.getReg());
+  unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+  unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+  // Insert a basic block for the main loop.
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigVal = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
+  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+  //   %RotatedNewVal = OP %RotatedOldVal, %Src2
+  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
+  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to DoneMMB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigVal).addMBB(StartMBB)
+    .addReg(Dest).addMBB(LoopMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+      .addReg(OldVal).addReg(BitShift).addImm(0);
+  if (Invert) {
+    // Perform the operation normally and then invert every bit of the field.
+    unsigned Tmp = MRI.createVirtualRegister(RC);
+    BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
+      .addReg(RotatedOldVal).addOperand(Src2);
+    if (BitSize <= 32)
+      // XILF with the upper BitSize bits set.
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
+        .addReg(Tmp).addImm(-1U << (32 - BitSize));
+    else {
+      // Use LCGR and add -1 to the result, which is more compact than
+      // an XILF, XILH pair.
+      unsigned Tmp2 = MRI.createVirtualRegister(RC);
+      BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
+      BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
+        .addReg(Tmp2).addImm(-1);
+    }
+  } else if (BinOpcode)
+    // A simply binary operation.
+    BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
+      .addReg(RotatedOldVal).addOperand(Src2);
+  else if (IsSubWord)
+    // Use RISBG to rotate Src2 into position and use it to replace the
+    // field in RotatedOldVal.
+    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
+      .addReg(RotatedOldVal).addReg(Src2.getReg())
+      .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+    .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI.eraseFromParent();
+  return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo
+// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI.  CompareOpcode is the
+// instruction that should be used to compare the current field with the
+// minimum or maximum value.  KeepOldMask is the BRC condition-code mask
+// for when the current field should be kept.  BitSize is the width of
+// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
+MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
+    unsigned KeepOldMask, unsigned BitSize) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool IsSubWord = (BitSize < 32);
+
+  // Extract the operands.  Base can be a register or a frame index.
+  unsigned Dest = MI.getOperand(0).getReg();
+  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+  int64_t Disp = MI.getOperand(2).getImm();
+  unsigned Src2 = MI.getOperand(3).getReg();
+  unsigned BitShift = (IsSubWord ? MI.getOperand(4).getReg() : 0);
+  unsigned NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : 0);
+  DebugLoc DL = MI.getDebugLoc();
+  if (IsSubWord)
+    BitSize = MI.getOperand(6).getImm();
+
+  // Subword operations use 32-bit registers.
+  const TargetRegisterClass *RC = (BitSize <= 32 ?
+                                   &SystemZ::GR32BitRegClass :
+                                   &SystemZ::GR64BitRegClass);
+  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
+  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
+
+  // Get the right opcodes for the displacement.
+  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
+  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigVal       = MRI.createVirtualRegister(RC);
+  unsigned OldVal        = MRI.createVirtualRegister(RC);
+  unsigned NewVal        = MRI.createVirtualRegister(RC);
+  unsigned RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
+  unsigned RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
+  unsigned RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
+
+  // Insert 3 basic blocks for the loop.
+  MachineBasicBlock *StartMBB  = MBB;
+  MachineBasicBlock *DoneMBB   = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB   = emitBlockAfter(StartMBB);
+  MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
+  MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigVal     = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
+  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
+  //   CompareOpcode %RotatedOldVal, %Src2
+  //   BRC KeepOldMask, UpdateMBB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigVal).addMBB(StartMBB)
+    .addReg(Dest).addMBB(UpdateMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
+      .addReg(OldVal).addReg(BitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CompareOpcode))
+    .addReg(RotatedOldVal).addReg(Src2);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
+  MBB->addSuccessor(UpdateMBB);
+  MBB->addSuccessor(UseAltMBB);
+
+  //  UseAltMBB:
+  //   %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
+  //   # fall through to UpdateMMB
+  MBB = UseAltMBB;
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
+      .addReg(RotatedOldVal).addReg(Src2)
+      .addImm(32).addImm(31 + BitSize).addImm(0);
+  MBB->addSuccessor(UpdateMBB);
+
+  //  UpdateMBB:
+  //   %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
+  //                        [ %RotatedAltVal, UseAltMBB ]
+  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
+  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to DoneMMB
+  MBB = UpdateMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
+    .addReg(RotatedOldVal).addMBB(LoopMBB)
+    .addReg(RotatedAltVal).addMBB(UseAltMBB);
+  if (IsSubWord)
+    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
+      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
+  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
+    .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI.eraseFromParent();
+  return DoneMBB;
+}
+
+// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
+// instruction MI.
+MachineBasicBlock *
+SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
+                                          MachineBasicBlock *MBB) const {
+
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Extract the operands.  Base can be a register or a frame index.
+  unsigned Dest = MI.getOperand(0).getReg();
+  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
+  int64_t Disp = MI.getOperand(2).getImm();
+  unsigned OrigCmpVal = MI.getOperand(3).getReg();
+  unsigned OrigSwapVal = MI.getOperand(4).getReg();
+  unsigned BitShift = MI.getOperand(5).getReg();
+  unsigned NegBitShift = MI.getOperand(6).getReg();
+  int64_t BitSize = MI.getOperand(7).getImm();
+  DebugLoc DL = MI.getDebugLoc();
+
+  const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
+
+  // Get the right opcodes for the displacement.
+  unsigned LOpcode  = TII->getOpcodeForOffset(SystemZ::L,  Disp);
+  unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
+  assert(LOpcode && CSOpcode && "Displacement out of range");
+
+  // Create virtual registers for temporary results.
+  unsigned OrigOldVal   = MRI.createVirtualRegister(RC);
+  unsigned OldVal       = MRI.createVirtualRegister(RC);
+  unsigned CmpVal       = MRI.createVirtualRegister(RC);
+  unsigned SwapVal      = MRI.createVirtualRegister(RC);
+  unsigned StoreVal     = MRI.createVirtualRegister(RC);
+  unsigned RetryOldVal  = MRI.createVirtualRegister(RC);
+  unsigned RetryCmpVal  = MRI.createVirtualRegister(RC);
+  unsigned RetrySwapVal = MRI.createVirtualRegister(RC);
+
+  // Insert 2 basic blocks for the loop.
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
+  MachineBasicBlock *SetMBB   = emitBlockAfter(LoopMBB);
+
+  //  StartMBB:
+  //   ...
+  //   %OrigOldVal     = L Disp(%Base)
+  //   # fall through to LoopMMB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
+    .addOperand(Base).addImm(Disp).addReg(0);
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %OldVal        = phi [ %OrigOldVal, EntryBB ], [ %RetryOldVal, SetMBB ]
+  //   %CmpVal        = phi [ %OrigCmpVal, EntryBB ], [ %RetryCmpVal, SetMBB ]
+  //   %SwapVal       = phi [ %OrigSwapVal, EntryBB ], [ %RetrySwapVal, SetMBB ]
+  //   %Dest          = RLL %OldVal, BitSize(%BitShift)
+  //                      ^^ The low BitSize bits contain the field
+  //                         of interest.
+  //   %RetryCmpVal   = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
+  //                      ^^ Replace the upper 32-BitSize bits of the
+  //                         comparison value with those that we loaded,
+  //                         so that we can use a full word comparison.
+  //   CR %Dest, %RetryCmpVal
+  //   JNE DoneMBB
+  //   # Fall through to SetMBB
+  MBB = LoopMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
+    .addReg(OrigOldVal).addMBB(StartMBB)
+    .addReg(RetryOldVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
+    .addReg(OrigCmpVal).addMBB(StartMBB)
+    .addReg(RetryCmpVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
+    .addReg(OrigSwapVal).addMBB(StartMBB)
+    .addReg(RetrySwapVal).addMBB(SetMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
+    .addReg(OldVal).addReg(BitShift).addImm(BitSize);
+  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
+    .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+  BuildMI(MBB, DL, TII->get(SystemZ::CR))
+    .addReg(Dest).addReg(RetryCmpVal);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
+  MBB->addSuccessor(DoneMBB);
+  MBB->addSuccessor(SetMBB);
+
+  //  SetMBB:
+  //   %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
+  //                      ^^ Replace the upper 32-BitSize bits of the new
+  //                         value with those that we loaded.
+  //   %StoreVal    = RLL %RetrySwapVal, -BitSize(%NegBitShift)
+  //                      ^^ Rotate the new field to its proper position.
+  //   %RetryOldVal = CS %Dest, %StoreVal, Disp(%Base)
+  //   JNE LoopMBB
+  //   # fall through to ExitMMB
+  MBB = SetMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
+    .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
+  BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
+    .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
+  BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
+    .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  MI.eraseFromParent();
+  return DoneMBB;
+}
+
+// Emit an extension from a GR32 or GR64 to a GR128.  ClearEven is true
+// if the high register of the GR128 value must be cleared or false if
+// it's "don't care".  SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64.
+MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
+                                                     MachineBasicBlock *MBB,
+                                                     bool ClearEven,
+                                                     unsigned SubReg) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  unsigned In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
+  if (ClearEven) {
+    unsigned NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
+    unsigned Zero64   = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+    BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
+      .addImm(0);
+    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
+      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
+    In128 = NewIn128;
+  }
+  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
+    .addReg(In128).addReg(Src).addImm(SubReg);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
+  uint64_t DestDisp = MI.getOperand(1).getImm();
+  MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
+  uint64_t SrcDisp = MI.getOperand(3).getImm();
+  uint64_t Length = MI.getOperand(4).getImm();
+
+  // When generating more than one CLC, all but the last will need to
+  // branch to the end when a difference is found.
+  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+                               splitBlockAfter(MI, MBB) : nullptr);
+
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI.getNumExplicitOperands() > 5) {
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+    uint64_t StartCountReg = MI.getOperand(5).getReg();
+    uint64_t StartSrcReg   = forceReg(MI, SrcBase, TII);
+    uint64_t StartDestReg  = (HaveSingleBase ? StartSrcReg :
+                              forceReg(MI, DestBase, TII));
+
+    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+    uint64_t ThisSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+                            MRI.createVirtualRegister(RC));
+    uint64_t NextSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+                            MRI.createVirtualRegister(RC));
+
+    RC = &SystemZ::GR64BitRegClass;
+    uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+    uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+    MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+
+    //  StartMBB:
+    //   # fall through to LoopMMB
+    MBB->addSuccessor(LoopMBB);
+
+    //  LoopMBB:
+    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+    //                      [ %NextDestReg, NextMBB ]
+    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+    //                     [ %NextSrcReg, NextMBB ]
+    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+    //                       [ %NextCountReg, NextMBB ]
+    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
+    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+    //   ( JLH EndMBB )
+    //
+    // The prefetch is used only for MVC.  The JLH is used only for CLC.
+    MBB = LoopMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+      .addReg(StartDestReg).addMBB(StartMBB)
+      .addReg(NextDestReg).addMBB(NextMBB);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+        .addReg(StartSrcReg).addMBB(StartMBB)
+        .addReg(NextSrcReg).addMBB(NextMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+      .addReg(StartCountReg).addMBB(StartMBB)
+      .addReg(NextCountReg).addMBB(NextMBB);
+    if (Opcode == SystemZ::MVC)
+      BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+        .addImm(SystemZ::PFD_WRITE)
+        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+    BuildMI(MBB, DL, TII->get(Opcode))
+      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+      .addReg(ThisSrcReg).addImm(SrcDisp);
+    if (EndMBB) {
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+    }
+
+    // NextMBB:
+    //   %NextDestReg = LA 256(%ThisDestReg)
+    //   %NextSrcReg = LA 256(%ThisSrcReg)
+    //   %NextCountReg = AGHI %ThisCountReg, -1
+    //   CGHI %NextCountReg, 0
+    //   JLH LoopMBB
+    //   # fall through to DoneMMB
+    //
+    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+    MBB = NextMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+      .addReg(ThisDestReg).addImm(256).addReg(0);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+        .addReg(ThisSrcReg).addImm(256).addReg(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+      .addReg(ThisCountReg).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(NextCountReg).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+      .addMBB(LoopMBB);
+    MBB->addSuccessor(LoopMBB);
+    MBB->addSuccessor(DoneMBB);
+
+    DestBase = MachineOperand::CreateReg(NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    Length &= 255;
+    MBB = DoneMBB;
+  }
+  // Handle any remaining bytes with straight-line code.
+  while (Length > 0) {
+    uint64_t ThisLength = std::min(Length, uint64_t(256));
+    // The previous iteration might have created out-of-range displacements.
+    // Apply them using LAY if so.
+    if (!isUInt<12>(DestDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+          .addOperand(DestBase)
+          .addImm(DestDisp)
+          .addReg(0);
+      DestBase = MachineOperand::CreateReg(Reg, false);
+      DestDisp = 0;
+    }
+    if (!isUInt<12>(SrcDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+          .addOperand(SrcBase)
+          .addImm(SrcDisp)
+          .addReg(0);
+      SrcBase = MachineOperand::CreateReg(Reg, false);
+      SrcDisp = 0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(Opcode))
+      .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+      .addOperand(SrcBase).addImm(SrcDisp);
+    DestDisp += ThisLength;
+    SrcDisp += ThisLength;
+    Length -= ThisLength;
+    // If there's another CLC to go, branch to the end if a difference
+    // was found.
+    if (EndMBB && Length > 0) {
+      MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+      MBB = NextMBB;
+    }
+  }
+  if (EndMBB) {
+    MBB->addSuccessor(EndMBB);
+    MBB = EndMBB;
+    MBB->addLiveIn(SystemZ::CC);
+  }
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3.
+MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  uint64_t End1Reg = MI.getOperand(0).getReg();
+  uint64_t Start1Reg = MI.getOperand(1).getReg();
+  uint64_t Start2Reg = MI.getOperand(2).getReg();
+  uint64_t CharReg = MI.getOperand(3).getReg();
+
+  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+  uint64_t This1Reg = MRI.createVirtualRegister(RC);
+  uint64_t This2Reg = MRI.createVirtualRegister(RC);
+  uint64_t End2Reg  = MRI.createVirtualRegister(RC);
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   # fall through to LoopMMB
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+  //   R0L = %CharReg
+  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
+  //   JO LoopMBB
+  //   # fall through to DoneMMB
+  //
+  // The load of R0L can be hoisted by post-RA LICM.
+  MBB = LoopMBB;
+
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+    .addReg(Start1Reg).addMBB(StartMBB)
+    .addReg(End1Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+    .addReg(Start2Reg).addMBB(StartMBB)
+    .addReg(End2Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+  BuildMI(MBB, DL, TII->get(Opcode))
+    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+    .addReg(This1Reg).addReg(This2Reg);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  DoneMBB->addLiveIn(SystemZ::CC);
+
+  MI.eraseFromParent();
+  return DoneMBB;
+}
+
+// Update TBEGIN instruction with final opcode and register clobbers.
+MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
+    bool NoFloat) const {
+  MachineFunction &MF = *MBB->getParent();
+  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+  const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
+
+  // Update opcode.
+  MI.setDesc(TII->get(Opcode));
+
+  // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
+  // Make sure to add the corresponding GRSM bits if they are missing.
+  uint64_t Control = MI.getOperand(2).getImm();
+  static const unsigned GPRControlBit[16] = {
+    0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
+    0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
+  };
+  Control |= GPRControlBit[15];
+  if (TFI->hasFP(MF))
+    Control |= GPRControlBit[11];
+  MI.getOperand(2).setImm(Control);
+
+  // Add GPR clobbers.
+  for (int I = 0; I < 16; I++) {
+    if ((Control & GPRControlBit[I]) == 0) {
+      unsigned Reg = SystemZMC::GR64Regs[I];
+      MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+    }
+  }
+
+  // Add FPR/VR clobbers.
+  if (!NoFloat && (Control & 4) != 0) {
+    if (Subtarget.hasVector()) {
+      for (int I = 0; I < 32; I++) {
+        unsigned Reg = SystemZMC::VR128Regs[I];
+        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
+    } else {
+      for (int I = 0; I < 16; I++) {
+        unsigned Reg = SystemZMC::FP64Regs[I];
+        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
+    }
+  }
+
+  return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  // Create new virtual register of the same class as source.
+  const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+  unsigned DstReg = MRI->createVirtualRegister(RC);
+
+  // Replace pseudo with a normal load-and-test that models the def as
+  // well.
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+    .addReg(SrcReg);
+  MI.eraseFromParent();
+
+  return MBB;
+}
+
+MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *MBB) const {
+  switch (MI.getOpcode()) {
+  case SystemZ::Select32Mux:
+    return emitSelect(MI, MBB,
+                      Subtarget.hasLoadStoreOnCond2()? SystemZ::LOCRMux : 0);
+  case SystemZ::Select32:
+    return emitSelect(MI, MBB, SystemZ::LOCR);
+  case SystemZ::Select64:
+    return emitSelect(MI, MBB, SystemZ::LOCGR);
+  case SystemZ::SelectF32:
+  case SystemZ::SelectF64:
+  case SystemZ::SelectF128:
+    return emitSelect(MI, MBB, 0);
+
+  case SystemZ::CondStore8Mux:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+  case SystemZ::CondStore8MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+  case SystemZ::CondStore16Mux:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+  case SystemZ::CondStore16MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
+  case SystemZ::CondStore32Mux:
+    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
+  case SystemZ::CondStore32MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
+  case SystemZ::CondStore8:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
+  case SystemZ::CondStore8Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
+  case SystemZ::CondStore16:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
+  case SystemZ::CondStore16Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
+  case SystemZ::CondStore32:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
+  case SystemZ::CondStore32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
+  case SystemZ::CondStore64:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
+  case SystemZ::CondStore64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
+  case SystemZ::CondStoreF32:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
+  case SystemZ::CondStoreF32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
+  case SystemZ::CondStoreF64:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
+  case SystemZ::CondStoreF64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
+
+  case SystemZ::AEXT128_64:
+    return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
+  case SystemZ::ZEXT128_32:
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
+  case SystemZ::ZEXT128_64:
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
+
+  case SystemZ::ATOMIC_SWAPW:
+    return emitAtomicLoadBinary(MI, MBB, 0, 0);
+  case SystemZ::ATOMIC_SWAP_32:
+    return emitAtomicLoadBinary(MI, MBB, 0, 32);
+  case SystemZ::ATOMIC_SWAP_64:
+    return emitAtomicLoadBinary(MI, MBB, 0, 64);
+
+  case SystemZ::ATOMIC_LOADW_AR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
+  case SystemZ::ATOMIC_LOADW_AFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
+  case SystemZ::ATOMIC_LOAD_AR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
+  case SystemZ::ATOMIC_LOAD_AHI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
+  case SystemZ::ATOMIC_LOAD_AFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
+  case SystemZ::ATOMIC_LOAD_AGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
+  case SystemZ::ATOMIC_LOAD_AGHI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
+  case SystemZ::ATOMIC_LOAD_AGFI:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
+
+  case SystemZ::ATOMIC_LOADW_SR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
+  case SystemZ::ATOMIC_LOAD_SR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
+  case SystemZ::ATOMIC_LOAD_SGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
+
+  case SystemZ::ATOMIC_LOADW_NR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
+  case SystemZ::ATOMIC_LOADW_NILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
+  case SystemZ::ATOMIC_LOAD_NR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
+  case SystemZ::ATOMIC_LOAD_NILL:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
+  case SystemZ::ATOMIC_LOAD_NILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
+  case SystemZ::ATOMIC_LOAD_NILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+  case SystemZ::ATOMIC_LOAD_NGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+  case SystemZ::ATOMIC_LOAD_NILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+  case SystemZ::ATOMIC_LOAD_NILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_NILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_OR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
+  case SystemZ::ATOMIC_LOADW_OILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
+  case SystemZ::ATOMIC_LOAD_OR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
+  case SystemZ::ATOMIC_LOAD_OILL:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
+  case SystemZ::ATOMIC_LOAD_OILH:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
+  case SystemZ::ATOMIC_LOAD_OILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+  case SystemZ::ATOMIC_LOAD_OGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+  case SystemZ::ATOMIC_LOAD_OILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+  case SystemZ::ATOMIC_LOAD_OILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_OILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_XR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
+  case SystemZ::ATOMIC_LOADW_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
+  case SystemZ::ATOMIC_LOAD_XR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
+  case SystemZ::ATOMIC_LOAD_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
+  case SystemZ::ATOMIC_LOAD_XGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
+  case SystemZ::ATOMIC_LOAD_XILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+  case SystemZ::ATOMIC_LOAD_XIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
+
+  case SystemZ::ATOMIC_LOADW_NRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
+  case SystemZ::ATOMIC_LOADW_NILHi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
+  case SystemZ::ATOMIC_LOAD_NRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILLi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILHi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
+  case SystemZ::ATOMIC_LOAD_NILFi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+  case SystemZ::ATOMIC_LOAD_NGRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
+
+  case SystemZ::ATOMIC_LOADW_MIN:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_LE, 0);
+  case SystemZ::ATOMIC_LOAD_MIN_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_LE, 32);
+  case SystemZ::ATOMIC_LOAD_MIN_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+                                SystemZ::CCMASK_CMP_LE, 64);
+
+  case SystemZ::ATOMIC_LOADW_MAX:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_GE, 0);
+  case SystemZ::ATOMIC_LOAD_MAX_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
+                                SystemZ::CCMASK_CMP_GE, 32);
+  case SystemZ::ATOMIC_LOAD_MAX_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
+                                SystemZ::CCMASK_CMP_GE, 64);
+
+  case SystemZ::ATOMIC_LOADW_UMIN:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_LE, 0);
+  case SystemZ::ATOMIC_LOAD_UMIN_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_LE, 32);
+  case SystemZ::ATOMIC_LOAD_UMIN_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+                                SystemZ::CCMASK_CMP_LE, 64);
+
+  case SystemZ::ATOMIC_LOADW_UMAX:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_GE, 0);
+  case SystemZ::ATOMIC_LOAD_UMAX_32:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
+                                SystemZ::CCMASK_CMP_GE, 32);
+  case SystemZ::ATOMIC_LOAD_UMAX_64:
+    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
+                                SystemZ::CCMASK_CMP_GE, 64);
+
+  case SystemZ::ATOMIC_CMP_SWAPW:
+    return emitAtomicCmpSwapW(MI, MBB);
+  case SystemZ::MVCSequence:
+  case SystemZ::MVCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+  case SystemZ::NCSequence:
+  case SystemZ::NCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+  case SystemZ::OCSequence:
+  case SystemZ::OCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+  case SystemZ::XCSequence:
+  case SystemZ::XCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+  case SystemZ::CLCSequence:
+  case SystemZ::CLCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+  case SystemZ::CLSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::CLST);
+  case SystemZ::MVSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::MVST);
+  case SystemZ::SRSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::SRST);
+  case SystemZ::TBEGIN:
+    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
+  case SystemZ::TBEGIN_nofloat:
+    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
+  case SystemZ::TBEGINC:
+    return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
+  case SystemZ::LTEBRCompare_VecPseudo:
+    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
+  case SystemZ::LTDBRCompare_VecPseudo:
+    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
+  case SystemZ::LTXBRCompare_VecPseudo:
+    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 000000000000..7a21a474c119
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,595 @@
+//===-- SystemZISelLowering.h - SystemZ DAG lowering interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
+
+#include "SystemZ.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace SystemZISD {
+enum NodeType : unsigned {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+  // Return with a flag operand.  Operand 0 is the chain operand.
+  RET_FLAG,
+
+  // Calls a function.  Operand 0 is the chain operand and operand 1
+  // is the target address.  The arguments start at operand 2.
+  // There is an optional glue operand at the end.
+  CALL,
+  SIBCALL,
+
+  // TLS calls.  Like regular calls, except operand 1 is the TLS symbol.
+  // (The call target is implicitly __tls_get_offset.)
+  TLS_GDCALL,
+  TLS_LDCALL,
+
+  // Wraps a TargetGlobalAddress that should be loaded using PC-relative
+  // accesses (LARL).  Operand 0 is the address.
+  PCREL_WRAPPER,
+
+  // Used in cases where an offset is applied to a TargetGlobalAddress.
+  // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+  // PCREL_WRAPPER for an anchor point.  This is used so that we can
+  // cheaply refer to either the full address or the anchor point
+  // as a register base.
+  PCREL_OFFSET,
+
+  // Integer absolute.
+  IABS,
+
+  // Integer comparisons.  There are three operands: the two values
+  // to compare, and an integer of type SystemZICMP.
+  ICMP,
+
+  // Floating-point comparisons.  The two operands are the values to compare.
+  FCMP,
+
+  // Test under mask.  The first operand is ANDed with the second operand
+  // and the condition codes are set on the result.  The third operand is
+  // a boolean that is true if the condition codes need to distinguish
+  // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+  // register forms do but the memory forms don't).
+  TM,
+
+  // Branches if a condition is true.  Operand 0 is the chain operand;
+  // operand 1 is the 4-bit condition-code mask, with bit N in
+  // big-endian order meaning "branch if CC=N"; operand 2 is the
+  // target block and operand 3 is the flag operand.
+  BR_CCMASK,
+
+  // Selects between operand 0 and operand 1.  Operand 2 is the
+  // mask of condition-code values for which operand 0 should be
+  // chosen over operand 1; it has the same form as BR_CCMASK.
+  // Operand 3 is the flag operand.
+  SELECT_CCMASK,
+
+  // Evaluates to the gap between the stack pointer and the
+  // base of the dynamically-allocatable area.
+  ADJDYNALLOC,
+
+  // Count number of bits set in operand 0 per byte.
+  POPCNT,
+
+  // Wrappers around the ISD opcodes of the same name.  The output and
+  // first input operands are GR128s.  The trailing numbers are the
+  // widths of the second operand in bits.
+  UMUL_LOHI64,
+  SDIVREM32,
+  SDIVREM64,
+  UDIVREM32,
+  UDIVREM64,
+
+  // Use a series of MVCs to copy bytes from one memory location to another.
+  // The operands are:
+  // - the target address
+  // - the source address
+  // - the constant length
+  //
+  // This isn't a memory opcode because we'd need to attach two
+  // MachineMemOperands rather than one.
+  MVC,
+
+  // Like MVC, but implemented as a loop that handles X*256 bytes
+  // followed by straight-line code to handle the rest (if any).
+  // The value of X is passed as an additional operand.
+  MVC_LOOP,
+
+  // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+  NC,
+  NC_LOOP,
+  OC,
+  OC_LOOP,
+  XC,
+  XC_LOOP,
+
+  // Use CLC to compare two blocks of memory, with the same comments
+  // as for MVC and MVC_LOOP.
+  CLC,
+  CLC_LOOP,
+
+  // Use an MVST-based sequence to implement stpcpy().
+  STPCPY,
+
+  // Use a CLST-based sequence to implement strcmp().  The two input operands
+  // are the addresses of the strings to compare.
+  STRCMP,
+
+  // Use an SRST-based sequence to search a block of memory.  The first
+  // operand is the end address, the second is the start, and the third
+  // is the character to search for.  CC is set to 1 on success and 2
+  // on failure.
+  SEARCH_STRING,
+
+  // Store the CC value in bits 29 and 28 of an integer.
+  IPM,
+
+  // Perform a serialization operation.  (BCR 15,0 or BCR 14,0.)
+  SERIALIZE,
+
+  // Compiler barrier only; generate a no-op.
+  MEMBARRIER,
+
+  // Transaction begin.  The first operand is the chain, the second
+  // the TDB pointer, and the third the immediate control field.
+  // Returns chain and glue.
+  TBEGIN,
+  TBEGIN_NOFLOAT,
+
+  // Transaction end.  Just the chain operand.  Returns chain and glue.
+  TEND,
+
+  // Create a vector constant by filling byte N of the result with bit
+  // 15-N of the single operand.
+  BYTE_MASK,
+
+  // Create a vector constant by replicating an element-sized RISBG-style mask.
+  // The first operand specifies the starting set bit and the second operand
+  // specifies the ending set bit.  Both operands count from the MSB of the
+  // element.
+  ROTATE_MASK,
+
+  // Replicate a GPR scalar value into all elements of a vector.
+  REPLICATE,
+
+  // Create a vector from two i64 GPRs.
+  JOIN_DWORDS,
+
+  // Replicate one element of a vector into all elements.  The first operand
+  // is the vector and the second is the index of the element to replicate.
+  SPLAT,
+
+  // Interleave elements from the high half of operand 0 and the high half
+  // of operand 1.
+  MERGE_HIGH,
+
+  // Likewise for the low halves.
+  MERGE_LOW,
+
+  // Concatenate the vectors in the first two operands, shift them left
+  // by the third operand, and take the first half of the result.
+  SHL_DOUBLE,
+
+  // Take one element of the first v2i64 operand and the one element of
+  // the second v2i64 operand and concatenate them to form a v2i64 result.
+  // The third operand is a 4-bit value of the form 0A0B, where A and B
+  // are the element selectors for the first operand and second operands
+  // respectively.
+  PERMUTE_DWORDS,
+
+  // Perform a general vector permute on vector operands 0 and 1.
+  // Each byte of operand 2 controls the corresponding byte of the result,
+  // in the same way as a byte-level VECTOR_SHUFFLE mask.
+  PERMUTE,
+
+  // Pack vector operands 0 and 1 into a single vector with half-sized elements.
+  PACK,
+
+  // Likewise, but saturate the result and set CC.  PACKS_CC does signed
+  // saturation and PACKLS_CC does unsigned saturation.
+  PACKS_CC,
+  PACKLS_CC,
+
+  // Unpack the first half of vector operand 0 into double-sized elements.
+  // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
+  UNPACK_HIGH,
+  UNPACKL_HIGH,
+
+  // Likewise for the second half.
+  UNPACK_LOW,
+  UNPACKL_LOW,
+
+  // Shift each element of vector operand 0 by the number of bits specified
+  // by scalar operand 1.
+  VSHL_BY_SCALAR,
+  VSRL_BY_SCALAR,
+  VSRA_BY_SCALAR,
+
+  // For each element of the output type, sum across all sub-elements of
+  // operand 0 belonging to the corresponding element, and add in the
+  // rightmost sub-element of the corresponding element of operand 1.
+  VSUM,
+
+  // Compare integer vector operands 0 and 1 to produce the usual 0/-1
+  // vector result.  VICMPE is for equality, VICMPH for "signed greater than"
+  // and VICMPHL for "unsigned greater than".
+  VICMPE,
+  VICMPH,
+  VICMPHL,
+
+  // Likewise, but also set the condition codes on the result.
+  VICMPES,
+  VICMPHS,
+  VICMPHLS,
+
+  // Compare floating-point vector operands 0 and 1 to preoduce the usual 0/-1
+  // vector result.  VFCMPE is for "ordered and equal", VFCMPH for "ordered and
+  // greater than" and VFCMPHE for "ordered and greater than or equal to".
+  VFCMPE,
+  VFCMPH,
+  VFCMPHE,
+
+  // Likewise, but also set the condition codes on the result.
+  VFCMPES,
+  VFCMPHS,
+  VFCMPHES,
+
+  // Test floating-point data class for vectors.
+  VFTCI,
+
+  // Extend the even f32 elements of vector operand 0 to produce a vector
+  // of f64 elements.
+  VEXTEND,
+
+  // Round the f64 elements of vector operand 0 to f32s and store them in the
+  // even elements of the result.
+  VROUND,
+
+  // AND the two vector operands together and set CC based on the result.
+  VTM,
+
+  // String operations that set CC as a side-effect.
+  VFAE_CC,
+  VFAEZ_CC,
+  VFEE_CC,
+  VFEEZ_CC,
+  VFENE_CC,
+  VFENEZ_CC,
+  VISTR_CC,
+  VSTRC_CC,
+  VSTRCZ_CC,
+
+  // Test Data Class.
+  //
+  // Operand 0: the value to test
+  // Operand 1: the bit mask
+  TDC,
+
+  // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+  // ATOMIC_LOAD_<op>.
+  //
+  // Operand 0: the address of the containing 32-bit-aligned field
+  // Operand 1: the second operand of <op>, in the high bits of an i32
+  //            for everything except ATOMIC_SWAPW
+  // Operand 2: how many bits to rotate the i32 left to bring the first
+  //            operand into the high bits
+  // Operand 3: the negative of operand 2, for rotating the other way
+  // Operand 4: the width of the field in bits (8 or 16)
+  ATOMIC_SWAPW = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  ATOMIC_LOADW_ADD,
+  ATOMIC_LOADW_SUB,
+  ATOMIC_LOADW_AND,
+  ATOMIC_LOADW_OR,
+  ATOMIC_LOADW_XOR,
+  ATOMIC_LOADW_NAND,
+  ATOMIC_LOADW_MIN,
+  ATOMIC_LOADW_MAX,
+  ATOMIC_LOADW_UMIN,
+  ATOMIC_LOADW_UMAX,
+
+  // A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+  //
+  // Operand 0: the address of the containing 32-bit-aligned field
+  // Operand 1: the compare value, in the low bits of an i32
+  // Operand 2: the swap value, in the low bits of an i32
+  // Operand 3: how many bits to rotate the i32 left to bring the first
+  //            operand into the high bits
+  // Operand 4: the negative of operand 2, for rotating the other way
+  // Operand 5: the width of the field in bits (8 or 16)
+  ATOMIC_CMP_SWAPW,
+
+  // Byte swapping load.
+  //
+  // Operand 0: the address to load from
+  // Operand 1: the type of load (i16, i32, i64)
+  LRV,
+
+  // Byte swapping store.
+  //
+  // Operand 0: the value to store
+  // Operand 1: the address to store to
+  // Operand 2: the type of store (i16, i32, i64)
+  STRV,
+
+  // Prefetch from the second operand using the 4-bit control code in
+  // the first operand.  The code is 1 for a load prefetch and 2 for
+  // a store prefetch.
+  PREFETCH
+};
+
+// Return true if OPCODE is some kind of PC-relative address.
+inline bool isPCREL(unsigned Opcode) {
+  return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+}
+} // end namespace SystemZISD
+
+namespace SystemZICMP {
+// Describes whether an integer comparison needs to be signed or unsigned,
+// or whether either type is OK.
+enum {
+  Any,
+  UnsignedOnly,
+  SignedOnly
+};
+} // end namespace SystemZICMP
+
+class SystemZSubtarget;
+class SystemZTargetMachine;
+
+class SystemZTargetLowering : public TargetLowering {
+public:
+  explicit SystemZTargetLowering(const TargetMachine &TM,
+                                 const SystemZSubtarget &STI);
+
+  // Override TargetLowering.
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+    return MVT::i32;
+  }
+  MVT getVectorIdxTy(const DataLayout &DL) const override {
+    // Only the lower 12 bits of an element index are used, so we don't
+    // want to clobber the upper 32 bits of a GPR unnecessarily.
+    return MVT::i32;
+  }
+  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    const override {
+    // Widen subvectors to the full width rather than promoting integer
+    // elements.  This is better because:
+    //
+    // (a) it means that we can handle the ABI for passing and returning
+    //     sub-128 vectors without having to handle them as legal types.
+    //
+    // (b) we don't have instructions to extend on load and truncate on store,
+    //     so promoting the integers is less efficient.
+    //
+    // (c) there are no multiplication instructions for the widest integer
+    //     type (v2i64).
+    if (VT.getScalarSizeInBits() % 8 == 0)
+      return TypeWidenVector;
+    return TargetLoweringBase::getPreferredVectorAction(VT);
+  }
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
+                         EVT) const override;
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+  bool isLegalICmpImmediate(int64_t Imm) const override;
+  bool isLegalAddImmediate(int64_t Imm) const override;
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const override;
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *Fast) const override;
+  bool isTruncateFree(Type *, Type *) const override;
+  bool isTruncateFree(EVT, EVT) const override;
+  const char *getTargetNodeName(unsigned Opcode) const override;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+  TargetLowering::ConstraintType
+  getConstraintType(StringRef Constraint) const override;
+  TargetLowering::ConstraintWeight
+    getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                   const char *constraint) const override;
+  void LowerAsmOperandForConstraint(SDValue Op,
+                                    std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+    if (ConstraintCode.size() == 1) {
+      switch(ConstraintCode[0]) {
+      default:
+        break;
+      case 'Q':
+        return InlineAsm::Constraint_Q;
+      case 'R':
+        return InlineAsm::Constraint_R;
+      case 'S':
+        return InlineAsm::Constraint_S;
+      case 'T':
+        return InlineAsm::Constraint_T;
+      }
+    }
+    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+  }
+
+  /// If a physical register, this returns the register that receives the
+  /// exception address on entry to an EH pad.
+  unsigned
+  getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+    return SystemZ::R6D;
+  }
+
+  /// If a physical register, this returns the register that receives the
+  /// exception typeid on entry to a landing pad.
+  unsigned
+  getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+    return SystemZ::R7D;
+  }
+
+  /// Override to support customized stack guard loading.
+  bool useLoadStackGuardNode() const override {
+    return true;
+  }
+  void insertSSPDeclarations(Module &M) const override {
+  }
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *BB) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  bool allowTruncateForTailCall(Type *, Type *) const override;
+  bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+                      SelectionDAG &DAG) const override;
+  SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
+                                      SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  ISD::NodeType getExtendForAtomicOps() const override {
+    return ISD::ANY_EXTEND;
+  }
+
+  bool supportSwiftError() const override {
+    return true;
+  }
+
+private:
+  const SystemZSubtarget &Subtarget;
+
+  // Implement LowerOperation for individual opcodes.
+  SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
+                             SelectionDAG &DAG) const;
+  SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node,
+                            SelectionDAG &DAG, unsigned Opcode,
+                            SDValue GOTOffset) const;
+  SDValue lowerThreadPointer(const SDLoc &DL, SelectionDAG &DAG) const;
+  SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
+                                SelectionDAG &DAG) const;
+  SDValue lowerBlockAddress(BlockAddressSDNode *Node,
+                            SelectionDAG &DAG) const;
+  SDValue lowerJumpTable(JumpTableSDNode *JT, SelectionDAG &DAG) const;
+  SDValue lowerConstantPool(ConstantPoolSDNode *CP, SelectionDAG &DAG) const;
+  SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
+                              unsigned Opcode) const;
+  SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
+                                 unsigned UnpackHigh) const;
+  SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+
+  SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
+                         unsigned Index, DAGCombinerInfo &DCI,
+                         bool Force) const;
+  SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
+                                 DAGCombinerInfo &DCI) const;
+  SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineMERGE(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineSTORE(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineEXTRACT_VECTOR_ELT(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  // If the last instruction before MBBI in MBB was some form of COMPARE,
+  // try to replace it with a COMPARE AND BRANCH just before MBBI.
+  // CCMask and Target are the BRC-like operands for the branch.
+  // Return true if the change was made.
+  bool convertPrevCompareToBranch(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  unsigned CCMask,
+                                  MachineBasicBlock *Target) const;
+
+  // Implement EmitInstrWithCustomInserter for individual operation types.
+  MachineBasicBlock *emitSelect(MachineInstr &MI, MachineBasicBlock *BB,
+                                unsigned LOCROpcode) const;
+  MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB,
+                                   unsigned StoreOpcode, unsigned STOCOpcode,
+                                   bool Invert) const;
+  MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB,
+                                bool ClearEven, unsigned SubReg) const;
+  MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned BinOpcode, unsigned BitSize,
+                                          bool Invert = false) const;
+  MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr &MI,
+                                          MachineBasicBlock *MBB,
+                                          unsigned CompareOpcode,
+                                          unsigned KeepOldMask,
+                                          unsigned BitSize) const;
+  MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+  MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+  MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
+                                          MachineBasicBlock *MBB,
+                                          unsigned Opcode, bool NoFloat) const;
+  MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 000000000000..896b665d25eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,46 @@
+//===-- SystemZInstrBuilder.h - Functions to aid building insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// Add a BDX memory reference for frame object FI to MIB.
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  auto Flags = MachineMemOperand::MONone;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  int64_t Offset = 0;
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+      MFFrame.getObjectSize(FI), MFFrame.getObjectAlignment(FI));
+  return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 000000000000..bb6d27e24828
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,507 @@
+//==- SystemZInstrFP.td - Floating-point SystemZ instructions --*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// C's ?: operator for floating-point operands.
+def SelectF32  : SelectWrapper<FP32>;
+def SelectF64  : SelectWrapper<FP64>;
+def SelectF128 : SelectWrapper<FP128>;
+
+defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+  def LZER : InherentRRE<"lzer", 0xB374, FP32,  fpimm0>;
+  def LZDR : InherentRRE<"lzdr", 0xB375, FP64,  fpimm0>;
+  def LZXR : InherentRRE<"lzxr", 0xB376, FP128, fpimm0>;
+}
+
+// Moves between two floating-point registers.
+let hasSideEffects = 0 in {
+  def LER : UnaryRR <"ler", 0x38,   null_frag, FP32,  FP32>;
+  def LDR : UnaryRR <"ldr", 0x28,   null_frag, FP64,  FP64>;
+  def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
+
+  // For z13 we prefer LDR over LER to avoid partial register dependencies.
+  let isCodeGenOnly = 1 in
+    def LDR32 : UnaryRR<"ldr", 0x28, null_frag, FP32, FP32>;
+}
+
+// Moves between two floating-point registers that also set the condition
+// codes.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  defm LTEBR : LoadAndTestRRE<"ltebr", 0xB302, FP32>;
+  defm LTDBR : LoadAndTestRRE<"ltdbr", 0xB312, FP64>;
+  defm LTXBR : LoadAndTestRRE<"ltxbr", 0xB342, FP128>;
+}
+// Note that LTxBRCompare is not available if we have vector support,
+// since load-and-test instructions will partially clobber the target
+// (vector) register.
+let Predicates = [FeatureNoVector] in {
+  defm : CompareZeroFP<LTEBRCompare, FP32>;
+  defm : CompareZeroFP<LTDBRCompare, FP64>;
+  defm : CompareZeroFP<LTXBRCompare, FP128>;
+}
+
+// Use a normal load-and-test for compare against zero in case of
+// vector support (via a pseudo to simplify instruction selection).
+let Defs = [CC], usesCustomInserter = 1 in {
+  def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>;
+  def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>;
+  def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>;
+}
+let Predicates = [FeatureVector] in {
+  defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>;
+  defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>;
+  defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>;
+}
+
+// Moves between 64-bit integer and floating-point registers.
+def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>;
+def LDGR : UnaryRRE<"ldgr", 0xB3C1, bitconvert, FP64, GR64>;
+
+// fcopysign with an FP32 result.
+let isCodeGenOnly = 1 in {
+  def CPSDRss : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP32>;
+  def CPSDRsd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP32, FP32, FP64>;
+}
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP32:$src1, FP128:$src2),
+          (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP64 result.
+let isCodeGenOnly = 1 in
+  def CPSDRds : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP32>;
+def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>;
+
+// The sign of an FP128 is in the high register.
+def : Pat<(fcopysign FP64:$src1, FP128:$src2),
+          (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+// fcopysign with an FP128 result.  Use "upper" as the high half and leave
+// the low half as-is.
+class CopySign128<RegisterOperand cls, dag upper>
+  : Pat<(fcopysign FP128:$src1, cls:$src2),
+        (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
+
+def : CopySign128<FP32,  (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP32:$src2)>;
+def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP64:$src2)>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+
+//===----------------------------------------------------------------------===//
+// Load instructions
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
+  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+
+  // For z13 we prefer LDE over LE to avoid partial register dependencies.
+  def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def LX : Pseudo<(outs FP128:$dst), (ins bdxaddr20only128:$src),
+                     [(set FP128:$dst, (load bdxaddr20only128:$src))]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Store instructions
+//===----------------------------------------------------------------------===//
+
+let SimpleBDXStore = 1 in {
+  defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
+  defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def STX : Pseudo<(outs), (ins FP128:$src, bdxaddr20only128:$dst),
+                     [(store FP128:$src, bdxaddr20only128:$dst)]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations, rounding
+// according to the current mode.  The destination of LEXBR and LDXBR
+// is a 128-bit value, but only the first register of the pair is used.
+def LEDBR : UnaryRRE<"ledbr", 0xB344, fpround,    FP32,  FP64>;
+def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
+def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
+
+def LEDBRA : TernaryRRFe<"ledbra", 0xB344, FP32,  FP64>,
+             Requires<[FeatureFPExtension]>;
+def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
+def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
+
+def : Pat<(f32 (fpround FP128:$src)),
+          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
+def : Pat<(f64 (fpround FP128:$src)),
+          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+
+// Extend register floating-point values to wider representations.
+def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64,  FP32>;
+def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>;
+
+// Extend memory floating-point values to wider representations.
+def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64,  4>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>;
+
+// Convert a signed integer register value to a floating-point one.
+def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32,  GR32>;
+def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64,  GR32>;
+def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
+
+def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32,  GR64>;
+def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64,  GR64>;
+def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
+
+// The FP extension feature provides versions of the above that allow
+// specifying rounding mode and inexact-exception suppression flags.
+let Predicates = [FeatureFPExtension] in {
+  def CEFBRA : TernaryRRFe<"cefbra", 0xB394, FP32,  GR32>;
+  def CDFBRA : TernaryRRFe<"cdfbra", 0xB395, FP64,  GR32>;
+  def CXFBRA : TernaryRRFe<"cxfbra", 0xB396, FP128, GR32>;
+
+  def CEGBRA : TernaryRRFe<"cegbra", 0xB3A4, FP32,  GR64>;
+  def CDGBRA : TernaryRRFe<"cdgbra", 0xB3A5, FP64,  GR64>;
+  def CXGBRA : TernaryRRFe<"cxgbra", 0xB3A6, FP128, GR64>;
+}
+
+// Convert am unsigned integer register value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+  def CELFBR : TernaryRRFe<"celfbr", 0xB390, FP32,  GR32>;
+  def CDLFBR : TernaryRRFe<"cdlfbr", 0xB391, FP64,  GR32>;
+  def CXLFBR : TernaryRRFe<"cxlfbr", 0xB392, FP128, GR32>;
+
+  def CELGBR : TernaryRRFe<"celgbr", 0xB3A0, FP32,  GR64>;
+  def CDLGBR : TernaryRRFe<"cdlgbr", 0xB3A1, FP64,  GR64>;
+  def CXLGBR : TernaryRRFe<"cxlgbr", 0xB3A2, FP128, GR64>;
+
+  def : Pat<(f32  (uint_to_fp GR32:$src)), (CELFBR 0, GR32:$src, 0)>;
+  def : Pat<(f64  (uint_to_fp GR32:$src)), (CDLFBR 0, GR32:$src, 0)>;
+  def : Pat<(f128 (uint_to_fp GR32:$src)), (CXLFBR 0, GR32:$src, 0)>;
+
+  def : Pat<(f32  (uint_to_fp GR64:$src)), (CELGBR 0, GR64:$src, 0)>;
+  def : Pat<(f64  (uint_to_fp GR64:$src)), (CDLGBR 0, GR64:$src, 0)>;
+  def : Pat<(f128 (uint_to_fp GR64:$src)), (CXLGBR 0, GR64:$src, 0)>;
+}
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
+let Defs = [CC] in {
+  def CFEBR : BinaryRRFe<"cfebr", 0xB398, GR32, FP32>;
+  def CFDBR : BinaryRRFe<"cfdbr", 0xB399, GR32, FP64>;
+  def CFXBR : BinaryRRFe<"cfxbr", 0xB39A, GR32, FP128>;
+
+  def CGEBR : BinaryRRFe<"cgebr", 0xB3A8, GR64, FP32>;
+  def CGDBR : BinaryRRFe<"cgdbr", 0xB3A9, GR64, FP64>;
+  def CGXBR : BinaryRRFe<"cgxbr", 0xB3AA, GR64, FP128>;
+}
+
+// fp_to_sint always rounds towards zero, which is modifier value 5.
+def : Pat<(i32 (fp_to_sint FP32:$src)),  (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (fp_to_sint FP64:$src)),  (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
+
+def : Pat<(i64 (fp_to_sint FP32:$src)),  (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (fp_to_sint FP64:$src)),  (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
+
+// The FP extension feature provides versions of the above that allow
+// also specifying the inexact-exception suppression flag.
+let Predicates = [FeatureFPExtension], Defs = [CC] in {
+  def CFEBRA : TernaryRRFe<"cfebra", 0xB398, GR32, FP32>;
+  def CFDBRA : TernaryRRFe<"cfdbra", 0xB399, GR32, FP64>;
+  def CFXBRA : TernaryRRFe<"cfxbra", 0xB39A, GR32, FP128>;
+
+  def CGEBRA : TernaryRRFe<"cgebra", 0xB3A8, GR64, FP32>;
+  def CGDBRA : TernaryRRFe<"cgdbra", 0xB3A9, GR64, FP64>;
+  def CGXBRA : TernaryRRFe<"cgxbra", 0xB3AA, GR64, FP128>;
+}
+
+// Convert a floating-point register value to an unsigned integer value.
+let Predicates = [FeatureFPExtension] in {
+  let Defs = [CC] in {
+    def CLFEBR : TernaryRRFe<"clfebr", 0xB39C, GR32, FP32>;
+    def CLFDBR : TernaryRRFe<"clfdbr", 0xB39D, GR32, FP64>;
+    def CLFXBR : TernaryRRFe<"clfxbr", 0xB39E, GR32, FP128>;
+
+    def CLGEBR : TernaryRRFe<"clgebr", 0xB3AC, GR64, FP32>;
+    def CLGDBR : TernaryRRFe<"clgdbr", 0xB3AD, GR64, FP64>;
+    def CLGXBR : TernaryRRFe<"clgxbr", 0xB3AE, GR64, FP128>;
+  }
+
+  def : Pat<(i32 (fp_to_uint FP32:$src)),  (CLFEBR 5, FP32:$src,  0)>;
+  def : Pat<(i32 (fp_to_uint FP64:$src)),  (CLFDBR 5, FP64:$src,  0)>;
+  def : Pat<(i32 (fp_to_uint FP128:$src)), (CLFXBR 5, FP128:$src, 0)>;
+
+  def : Pat<(i64 (fp_to_uint FP32:$src)),  (CLGEBR 5, FP32:$src,  0)>;
+  def : Pat<(i64 (fp_to_uint FP64:$src)),  (CLGDBR 5, FP64:$src,  0)>;
+  def : Pat<(i64 (fp_to_uint FP128:$src)), (CLGXBR 5, FP128:$src, 0)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// We prefer generic instructions during isel, because they do not
+// clobber CC and therefore give the scheduler more freedom. In cases
+// the CC is actually useful, the SystemZElimCompare pass will try to
+// convert generic instructions into opcodes that also set CC. Note
+// that lcdf / lpdf / lndf only affect the sign bit, and can therefore
+// be used with fp32 as well. This could be done for fp128, in which
+// case the operands would have to be tied.
+
+// Negation (Load Complement).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LCEBR : UnaryRRE<"lcebr", 0xB303, null_frag, FP32,  FP32>;
+  def LCDBR : UnaryRRE<"lcdbr", 0xB313, null_frag, FP64,  FP64>;
+  def LCXBR : UnaryRRE<"lcxbr", 0xB343, fneg, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LCDFR : UnaryRRE<"lcdfr", 0xB373, fneg, FP64,  FP64>;
+let isCodeGenOnly = 1 in
+  def LCDFR_32 : UnaryRRE<"lcdfr", 0xB373, fneg, FP32,  FP32>;
+
+// Absolute value (Load Positive).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LPEBR : UnaryRRE<"lpebr", 0xB300, null_frag, FP32,  FP32>;
+  def LPDBR : UnaryRRE<"lpdbr", 0xB310, null_frag, FP64,  FP64>;
+  def LPXBR : UnaryRRE<"lpxbr", 0xB340, fabs, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LPDFR : UnaryRRE<"lpdfr", 0xB370, fabs, FP64,  FP64>;
+let isCodeGenOnly = 1 in
+  def LPDFR_32 : UnaryRRE<"lpdfr", 0xB370, fabs, FP32,  FP32>;
+
+// Negative absolute value (Load Negative).
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LNEBR : UnaryRRE<"lnebr", 0xB301, null_frag, FP32,  FP32>;
+  def LNDBR : UnaryRRE<"lndbr", 0xB311, null_frag, FP64,  FP64>;
+  def LNXBR : UnaryRRE<"lnxbr", 0xB341, fnabs, FP128, FP128>;
+}
+// Generic form, which does not set CC.
+def LNDFR : UnaryRRE<"lndfr", 0xB371, fnabs, FP64,  FP64>;
+let isCodeGenOnly = 1 in
+  def LNDFR_32 : UnaryRRE<"lndfr", 0xB371, fnabs, FP32,  FP32>;
+
+// Square root.
+def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32,  FP32>;
+def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64,  FP64>;
+def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>;
+
+def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
+def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
+
+// Round to an integer, with the second operand (modifier M3) specifying
+// the rounding mode.  These forms always check for inexact conditions.
+def FIEBR : BinaryRRFe<"fiebr", 0xB357, FP32,  FP32>;
+def FIDBR : BinaryRRFe<"fidbr", 0xB35F, FP64,  FP64>;
+def FIXBR : BinaryRRFe<"fixbr", 0xB347, FP128, FP128>;
+
+// frint rounds according to the current mode (modifier 0) and detects
+// inexact conditions.
+def : Pat<(frint FP32:$src),  (FIEBR 0, FP32:$src)>;
+def : Pat<(frint FP64:$src),  (FIDBR 0, FP64:$src)>;
+def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+
+let Predicates = [FeatureFPExtension] in {
+  // Extended forms of the FIxBR instructions.  M4 can be set to 4
+  // to suppress detection of inexact conditions.
+  def FIEBRA : TernaryRRFe<"fiebra", 0xB357, FP32,  FP32>;
+  def FIDBRA : TernaryRRFe<"fidbra", 0xB35F, FP64,  FP64>;
+  def FIXBRA : TernaryRRFe<"fixbra", 0xB347, FP128, FP128>;
+
+  // fnearbyint is like frint but does not detect inexact conditions.
+  def : Pat<(fnearbyint FP32:$src),  (FIEBRA 0, FP32:$src,  4)>;
+  def : Pat<(fnearbyint FP64:$src),  (FIDBRA 0, FP64:$src,  4)>;
+  def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+  // floor is no longer allowed to raise an inexact condition,
+  // so restrict it to the cases where the condition can be suppressed.
+  // Mode 7 is round towards -inf.
+  def : Pat<(ffloor FP32:$src),  (FIEBRA 7, FP32:$src,  4)>;
+  def : Pat<(ffloor FP64:$src),  (FIDBRA 7, FP64:$src,  4)>;
+  def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+  // Same idea for ceil, where mode 6 is round towards +inf.
+  def : Pat<(fceil FP32:$src),  (FIEBRA 6, FP32:$src,  4)>;
+  def : Pat<(fceil FP64:$src),  (FIDBRA 6, FP64:$src,  4)>;
+  def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+  // Same idea for trunc, where mode 5 is round towards zero.
+  def : Pat<(ftrunc FP32:$src),  (FIEBRA 5, FP32:$src,  4)>;
+  def : Pat<(ftrunc FP64:$src),  (FIDBRA 5, FP64:$src,  4)>;
+  def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+  // Same idea for round, where mode 1 is round towards nearest with
+  // ties away from zero.
+  def : Pat<(fround FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
+  def : Pat<(fround FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
+  def : Pat<(fround FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  let isCommutable = 1 in {
+    def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32,  FP32>;
+    def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64,  FP64>;
+    def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>;
+  }
+  def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>;
+  def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>;
+}
+
+// Subtraction.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32,  FP32>;
+  def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64,  FP64>;
+  def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>;
+
+  def SEB : BinaryRXE<"seb",  0xED0B, fsub, FP32, load, 4>;
+  def SDB : BinaryRXE<"sdb",  0xED1B, fsub, FP64, load, 8>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+  def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32,  FP32>;
+  def MDBR  : BinaryRRE<"mdbr",  0xB31C, fmul, FP64,  FP64>;
+  def MXBR  : BinaryRRE<"mxbr",  0xB34C, fmul, FP128, FP128>;
+}
+def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>;
+def MDB  : BinaryRXE<"mdb",  0xED1C, fmul, FP64, load, 8>;
+
+// f64 multiplication of two FP32 registers.
+def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
+def : Pat<(fmul (f64 (fpextend FP32:$src1)), (f64 (fpextend FP32:$src2))),
+          (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                FP32:$src1, subreg_r32), FP32:$src2)>;
+
+// f64 multiplication of an FP32 register and an f32 memory.
+def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+def : Pat<(fmul (f64 (fpextend FP32:$src1)),
+                (f64 (extloadf32 bdxaddr12only:$addr))),
+          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_r32),
+                bdxaddr12only:$addr)>;
+
+// f128 multiplication of two FP64 registers.
+def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
+def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+          (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+                                FP64:$src1, subreg_h64), FP64:$src2)>;
+
+// f128 multiplication of an FP64 register and an f64 memory.
+def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+def : Pat<(fmul (f128 (fpextend FP64:$src1)),
+                (f128 (extloadf64 bdxaddr12only:$addr))),
+          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+                bdxaddr12only:$addr)>;
+
+// Fused multiply-add.
+def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
+def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
+
+// Fused multiply-subtract.
+def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
+def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
+
+// Division.
+def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32,  FP32>;
+def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64,  FP64>;
+def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
+
+def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
+def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC], CCValues = 0xF in {
+  def CEBR : CompareRRE<"cebr", 0xB309, z_fcmp, FP32,  FP32>;
+  def CDBR : CompareRRE<"cdbr", 0xB319, z_fcmp, FP64,  FP64>;
+  def CXBR : CompareRRE<"cxbr", 0xB349, z_fcmp, FP128, FP128>;
+
+  def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+}
+
+// Test Data Class.
+let Defs = [CC], CCValues = 0xC in {
+  def TCEB : TestRXE<"tceb", 0xED10, z_tdc, FP32>;
+  def TCDB : TestRXE<"tcdb", 0xED11, z_tdc, FP64>;
+  def TCXB : TestRXE<"tcxb", 0xED12, z_tdc, FP128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1 in {
+  def EFPC  : InherentRRE<"efpc", 0xB38C, GR32, int_s390_efpc>;
+  def STFPC : StoreInherentS<"stfpc", 0xB29C, storei<int_s390_efpc>, 4>;
+
+  def SFPC : SideEffectUnaryRRE<"sfpc", 0xB384, GR32, int_s390_sfpc>;
+  def LFPC : SideEffectUnaryS<"lfpc", 0xB29D, loadu<int_s390_sfpc>, 4>;
+
+  def SFASR : SideEffectUnaryRRE<"sfasr", 0xB385, GR32, null_frag>;
+  def LFAS  : SideEffectUnaryS<"lfas", 0xB2BD, null_frag, 4>;
+
+  def SRNMB : SideEffectAddressS<"srnmb", 0xB2B8, null_frag, shift12only>,
+              Requires<[FeatureFPExtension]>;
+  def SRNM  : SideEffectAddressS<"srnm", 0xB299, null_frag, shift12only>;
+  def SRNMT : SideEffectAddressS<"srnmt", 0xB2B9, null_frag, shift12only>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes
+//===----------------------------------------------------------------------===//
+
+def : Pat<(f32  fpimmneg0), (LCDFR_32 (LZER))>;
+def : Pat<(f64  fpimmneg0), (LCDFR (LZDR))>;
+def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 000000000000..c727f486087e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,4083 @@
+//==- SystemZInstrFormats.td - SystemZ Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Basic SystemZ instruction definition
+//===----------------------------------------------------------------------===//
+
+class InstSystemZ<int size, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : Instruction {
+  let Namespace = "SystemZ";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let Size = size;
+  let Pattern = pattern;
+  let AsmString = asmstr;
+
+  // Some instructions come in pairs, one having a 12-bit displacement
+  // and the other having a 20-bit displacement.  Both instructions in
+  // the pair have the same DispKey and their DispSizes are "12" and "20"
+  // respectively.
+  string DispKey = "";
+  string DispSize = "none";
+
+  // Many register-based <INSN>R instructions have a memory-based <INSN>
+  // counterpart.  OpKey uniquely identifies <INSN>R, while OpType is
+  // "reg" for <INSN>R and "mem" for <INSN>.
+  string OpKey = "";
+  string OpType = "none";
+
+  // Many distinct-operands instructions have older 2-operand equivalents.
+  // NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
+  // with NumOpsValue being "2" or "3" as appropriate.
+  string NumOpsKey = "";
+  string NumOpsValue = "none";
+
+  // True if this instruction is a simple D(X,B) load of a register
+  // (with no sign or zero extension).
+  bit SimpleBDXLoad = 0;
+
+  // True if this instruction is a simple D(X,B) store of a register
+  // (with no truncation).
+  bit SimpleBDXStore = 0;
+
+  // True if this instruction has a 20-bit displacement field.
+  bit Has20BitOffset = 0;
+
+  // True if addresses in this instruction have an index register.
+  bit HasIndex = 0;
+
+  // True if this is a 128-bit pseudo instruction that combines two 64-bit
+  // operations.
+  bit Is128Bit = 0;
+
+  // The access size of all memory operands in bytes, or 0 if not known.
+  bits<5> AccessBytes = 0;
+
+  // If the instruction sets CC to a useful value, this gives the mask
+  // of all possible CC results.  The mask has the same form as
+  // SystemZ::CCMASK_*.
+  bits<4> CCValues = 0;
+
+  // The subset of CCValues that have the same meaning as they would after
+  // a comparison of the first operand against zero.
+  bits<4> CompareZeroCCMask = 0;
+
+  // True if the instruction is conditional and if the CC mask operand
+  // comes first (as for BRC, etc.).
+  bit CCMaskFirst = 0;
+
+  // Similar, but true if the CC mask operand comes last (as for LOC, etc.).
+  bit CCMaskLast = 0;
+
+  // True if the instruction is the "logical" rather than "arithmetic" form,
+  // in cases where a distinction exists.
+  bit IsLogical = 0;
+
+  let TSFlags{0}     = SimpleBDXLoad;
+  let TSFlags{1}     = SimpleBDXStore;
+  let TSFlags{2}     = Has20BitOffset;
+  let TSFlags{3}     = HasIndex;
+  let TSFlags{4}     = Is128Bit;
+  let TSFlags{9-5}   = AccessBytes;
+  let TSFlags{13-10} = CCValues;
+  let TSFlags{17-14} = CompareZeroCCMask;
+  let TSFlags{18}    = CCMaskFirst;
+  let TSFlags{19}    = CCMaskLast;
+  let TSFlags{20}    = IsLogical;
+}
+
+//===----------------------------------------------------------------------===//
+// Mappings between instructions
+//===----------------------------------------------------------------------===//
+
+// Return the version of an instruction that has an unsigned 12-bit
+// displacement.
+def getDisp12Opcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
+  let KeyCol = ["20"];
+  let ValueCols = [["12"]];
+}
+
+// Return the version of an instruction that has a signed 20-bit displacement.
+def getDisp20Opcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
+  let KeyCol = ["12"];
+  let ValueCols = [["20"]];
+}
+
+// Return the memory form of a register instruction.
+def getMemOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["OpKey"];
+  let ColFields = ["OpType"];
+  let KeyCol = ["reg"];
+  let ValueCols = [["mem"]];
+}
+
+// Return the 3-operand form of a 2-operand instruction.
+def getThreeOperandOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["NumOpsKey"];
+  let ColFields = ["NumOpsValue"];
+  let KeyCol = ["2"];
+  let ValueCols = [["3"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+//
+// Formats are specified using operand field declarations of the form:
+//
+//   bits<4> Rn   : register input or output for operand n
+//   bits<5> Vn   : vector register input or output for operand n
+//   bits<m> In   : immediate value of width m for operand n
+//   bits<4> BDn  : address operand n, which has a base and a displacement
+//   bits<m> XBDn : address operand n, which has an index, a base and a
+//                  displacement
+//   bits<m> VBDn : address operand n, which has a vector index, a base and a
+//                  displacement
+//   bits<4> Xn   : index register for address operand n
+//   bits<4> Mn   : mode value for operand n
+//
+// The operand numbers ("n" in the list above) follow the architecture manual.
+// Assembly operands sometimes have a different order; in particular, R3 often
+// is often written between operands 1 and 2.
+//
+//===----------------------------------------------------------------------===//
+
+class InstE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<2, outs, ins, asmstr, pattern> {
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+
+  let Inst = op;
+}
+
+class InstI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<2, outs, ins, asmstr, pattern> {
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+
+  bits<8> I1;
+
+  let Inst{15-8} = op;
+  let Inst{7-0}  = I1;
+}
+
+class InstIE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> I1;
+  bits<4> I2;
+
+  let Inst{31-16} = op;
+  let Inst{15-8}  = 0;
+  let Inst{7-4}   = I1;
+  let Inst{3-0}   = I2;
+}
+
+class InstMII<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> M1;
+  bits<12> RI2;
+  bits<24> RI3;
+
+  let Inst{47-40} = op;
+  let Inst{39-36} = M1;
+  let Inst{35-24} = RI2;
+  let Inst{23-0}  = RI3;
+}
+
+class InstRIa<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> I2;
+
+  let Inst{31-24} = op{11-4};
+  let Inst{23-20} = R1;
+  let Inst{19-16} = op{3-0};
+  let Inst{15-0}  = I2;
+}
+
+class InstRIb<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> RI2;
+
+  let Inst{31-24} = op{11-4};
+  let Inst{23-20} = R1;
+  let Inst{19-16} = op{3-0};
+  let Inst{15-0}  = RI2;
+}
+
+class InstRIc<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> M1;
+  bits<16> RI2;
+
+  let Inst{31-24} = op{11-4};
+  let Inst{23-20} = M1;
+  let Inst{19-16} = op{3-0};
+  let Inst{15-0}  = RI2;
+}
+
+class InstRIEa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> I2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = 0;
+  let Inst{31-16} = I2;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-16} = RI4;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<8> I2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-16} = RI4;
+  let Inst{15-8}  = I2;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = I2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> RI2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = RI2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<8> I3;
+  bits<8> I4;
+  bits<8> I5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-24} = I3;
+  let Inst{23-16} = I4;
+  let Inst{15-8}  = I5;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> M3;
+  bits<16> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-16} = I2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRILa<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<32> I2;
+
+  let Inst{47-40} = op{11-4};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = op{3-0};
+  let Inst{31-0}  = I2;
+}
+
+class InstRILb<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<32> RI2;
+
+  let Inst{47-40} = op{11-4};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = op{3-0};
+  let Inst{31-0}  = RI2;
+}
+
+class InstRILc<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> M1;
+  bits<32> RI2;
+
+  let Inst{47-40} = op{11-4};
+  let Inst{39-36} = M1;
+  let Inst{35-32} = op{3-0};
+  let Inst{31-0}  = RI2;
+}
+
+class InstRIS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<8> I2;
+  bits<4> M3;
+  bits<16> BD4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-16} = BD4;
+  let Inst{15-8}  = I2;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<2, outs, ins, asmstr, pattern> {
+  field bits<16> Inst;
+  field bits<16> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+
+  let Inst{15-8} = op;
+  let Inst{7-4}  = R1;
+  let Inst{3-0}  = R2;
+}
+
+class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<4> R2;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = R1;
+  let Inst{11-8}  = 0;
+  let Inst{7-4}   = R3;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+
+  let Inst{31-16} = op;
+  let Inst{15-8}  = 0;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRFa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> R3;
+  bits<4> M4;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = R3;
+  let Inst{11-8}  = M4;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRFb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> R3;
+  bits<4> M4;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = R3;
+  let Inst{11-8}  = M4;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRFc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRFe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+  bits<4> M4;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = M4;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
+class InstRRS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+  bits<16> BD4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-16} = BD4;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRXa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<20> XBD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-0}  = XBD2;
+
+  let HasIndex = 1;
+}
+
+class InstRXb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> M1;
+  bits<20> XBD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = M1;
+  let Inst{19-0}  = XBD2;
+
+  let HasIndex = 1;
+}
+
+class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<20> XBD2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-16} = XBD2;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+
+  let HasIndex = 1;
+}
+
+class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<20> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R3;
+  let Inst{35-16} = XBD2;
+  let Inst{15-12} = R1;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+
+  let HasIndex = 1;
+}
+
+class InstRXYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<28> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-8}  = XBD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+}
+
+class InstRXYb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> M1;
+  bits<28> XBD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = M1;
+  let Inst{35-8}  = XBD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+}
+
+class InstRSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> BD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-16} = R3;
+  let Inst{15-0}  = BD2;
+}
+
+class InstRSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> M3;
+  bits<16> BD2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-16} = M3;
+  let Inst{15-0}  = BD2;
+}
+
+class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> RI2;
+
+  let Inst{31-24} = op;
+  let Inst{23-20} = R1;
+  let Inst{19-16} = R3;
+  let Inst{15-0}  = RI2;
+}
+
+class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<24> BD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-8}  = BD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+}
+
+class InstRSYb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> M3;
+  bits<24> BD2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-8}  = BD2;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+}
+
+class InstSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<8> I2;
+
+  let Inst{31-24} = op;
+  let Inst{23-16} = I2;
+  let Inst{15-0}  = BD1;
+}
+
+class InstSIL<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<16> I2;
+
+  let Inst{47-32} = op;
+  let Inst{31-16} = BD1;
+  let Inst{15-0}  = I2;
+}
+
+class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<24> BD1;
+  bits<8> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-32} = I2;
+  let Inst{31-8}  = BD1;
+  let Inst{7-0}   = op{7-0};
+
+  let Has20BitOffset = 1;
+}
+
+class InstSMI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> M1;
+  bits<16> RI2;
+  bits<16> BD3;
+
+  let Inst{47-40} = op;
+  let Inst{39-36} = M1;
+  let Inst{35-32} = 0;
+  let Inst{31-16} = BD3;
+  let Inst{15-0}  = RI2;
+}
+
+class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<24> BDL1;
+  bits<16> BD2;
+
+  let Inst{47-40} = op;
+  let Inst{39-16} = BDL1;
+  let Inst{15-0}  = BD2;
+}
+
+class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<20> RBD1;
+  bits<16> BD2;
+  bits<4> R3;
+
+  let Inst{47-40} = op;
+  let Inst{39-36} = RBD1{19-16};
+  let Inst{35-32} = R3;
+  let Inst{31-16} = RBD1{15-0};
+  let Inst{15-0}  = BD2;
+}
+
+class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> BD2;
+  bits<4> R3;
+  bits<16> BD4;
+
+  let Inst{47-40} = op;
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = BD2;
+  let Inst{15-0}  = BD4;
+}
+
+class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<16> BD2;
+
+  let Inst{47-32} = op;
+  let Inst{31-16} = BD1;
+  let Inst{15-0}  = BD2;
+}
+
+class InstSSF<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<16> BD1;
+  bits<16> BD2;
+  bits<4>  R3;
+
+  let Inst{47-40} = op{11-4};
+  let Inst{39-36} = R3;
+  let Inst{35-32} = op{3-0};
+  let Inst{31-16} = BD1;
+  let Inst{15-0}  = BD2;
+}
+
+class InstS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<16> BD2;
+
+  let Inst{31-16} = op;
+  let Inst{15-0}  = BD2;
+}
+
+class InstVRIa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> I2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = 0;
+  let Inst{31-16} = I2;
+  let Inst{15-12} = M3;
+  let Inst{11}    = V1{4};
+  let Inst{10-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRIb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<8> I2;
+  bits<8> I3;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = 0;
+  let Inst{31-24} = I2;
+  let Inst{23-16} = I3;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRIc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V3;
+  bits<16> I2;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V3{3-0};
+  let Inst{31-16} = I2;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V3{4};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRId<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<8> I4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = 0;
+  let Inst{23-16} = I4;
+  let Inst{15-12} = M5;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9}     = V3{4};
+  let Inst{8}     = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<12> I3;
+  bits<4> M4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-20} = I3;
+  let Inst{19-16} = M5;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M4 value provided as explicit operand.  These are passed as m4or.
+class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+               bits<4> m4or = 0>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<4> M3;
+  bits<4> M4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-24} = 0;
+  let Inst{23-20} = M5;
+  let Inst{19}    = !if (!eq (m4or{3}, 1), 1, M4{3});
+  let Inst{18}    = !if (!eq (m4or{2}, 1), 1, M4{2});
+  let Inst{17}    = !if (!eq (m4or{1}, 1), 1, M4{1});
+  let Inst{16}    = !if (!eq (m4or{0}, 1), 1, M4{0});
+  let Inst{15-12} = M3;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M5 value provided as explicit operand.  These are passed as m5or.
+class InstVRRb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+               bits<4> m5or = 0>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<4> M4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = 0;
+  let Inst{23}    = !if (!eq (m5or{3}, 1), 1, M5{3});
+  let Inst{22}    = !if (!eq (m5or{2}, 1), 1, M5{2});
+  let Inst{21}    = !if (!eq (m5or{1}, 1), 1, M5{1});
+  let Inst{20}    = !if (!eq (m5or{0}, 1), 1, M5{0});
+  let Inst{19-16} = 0;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9}     = V3{4};
+  let Inst{8}     = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRRc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<4> M4;
+  bits<4> M5;
+  bits<4> M6;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = 0;
+  let Inst{23-20} = M6;
+  let Inst{19-16} = M5;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9}     = V3{4};
+  let Inst{8}     = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+// Depending on the instruction mnemonic, certain bits may be or-ed into
+// the M6 value provided as explicit operand.  These are passed as m6or.
+class InstVRRd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+               bits<4> m6or = 0>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<5> V4;
+  bits<4> M5;
+  bits<4> M6;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = M5;
+  let Inst{23}    = !if (!eq (m6or{3}, 1), 1, M6{3});
+  let Inst{22}    = !if (!eq (m6or{2}, 1), 1, M6{2});
+  let Inst{21}    = !if (!eq (m6or{1}, 1), 1, M6{1});
+  let Inst{20}    = !if (!eq (m6or{0}, 1), 1, M6{0});
+  let Inst{19-16} = 0;
+  let Inst{15-12} = V4{3-0};
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9}     = V3{4};
+  let Inst{8}     = V4{4};
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRRe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<5> V4;
+  bits<4> M5;
+  bits<4> M6;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = M6;
+  let Inst{23-20} = 0;
+  let Inst{19-16} = M5;
+  let Inst{15-12} = V4{3-0};
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V2{4};
+  let Inst{9}     = V3{4};
+  let Inst{8}     = V4{4};
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<4> R2;
+  bits<4> R3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = R2;
+  let Inst{31-28} = R3;
+  let Inst{27-12} = 0;
+  let Inst{11}    = V1{4};
+  let Inst{10-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> BD2;
+  bits<5> V3;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V3{3-0};
+  let Inst{31-16} = BD2;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = V3{4};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRSb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> BD2;
+  bits<4> R3;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = R3;
+  let Inst{31-16} = BD2;
+  let Inst{15-12} = M4;
+  let Inst{11}    = V1{4};
+  let Inst{10-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<16> BD2;
+  bits<5> V3;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = V3{3-0};
+  let Inst{31-16} = BD2;
+  let Inst{15-12} = M4;
+  let Inst{11}    = 0;
+  let Inst{10}    = V3{4};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<21> VBD2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-16} = VBD2{19-0};
+  let Inst{15-12} = M3;
+  let Inst{11}    = V1{4};
+  let Inst{10}    = VBD2{20};
+  let Inst{9-8}   = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<20> XBD2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-16} = XBD2;
+  let Inst{15-12} = M3;
+  let Inst{11}    = V1{4};
+  let Inst{10-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction classes for .insn directives
+//===----------------------------------------------------------------------===//
+
+class DirectiveInsnE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstE<0, outs, ins, asmstr, pattern> {
+  bits<16> enc;
+
+  let Inst = enc;
+}
+
+class DirectiveInsnRI<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRIa<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-24} = enc{31-24};
+  let Inst{19-16} = enc{19-16};
+}
+
+class DirectiveInsnRIE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRIEd<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRIL<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRILa<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+  string type;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{35-32} = enc{35-32};
+}
+
+class DirectiveInsnRIS<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRIS<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRR<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRR<0, outs, ins, asmstr, pattern> {
+  bits<16> enc;
+
+  let Inst{15-8} = enc{15-8};
+}
+
+class DirectiveInsnRRE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRRE<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-16} = enc{31-16};
+}
+
+class DirectiveInsnRRF<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRRFa<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-16} = enc{31-16};
+}
+
+class DirectiveInsnRRS<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRRS<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRS<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRSa<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-24} = enc{31-24};
+}
+
+// RSE is like RSY except with a 12 bit displacement (instead of 20).
+class DirectiveInsnRSE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRSYa<6, outs, ins, asmstr, pattern> {
+  bits <48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{31-16} = BD2{15-0};
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRSI<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRSI<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-24} = enc{31-24};
+}
+
+class DirectiveInsnRSY<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRSYa<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRX<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRXa<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-24} = enc{31-24};
+}
+
+class DirectiveInsnRXE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRXE<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let M3 = 0;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRXF<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRXF<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnRXY<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstRXYa<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnS<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstS<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-16} = enc{31-16};
+}
+
+class DirectiveInsnSI<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSI<0, outs, ins, asmstr, pattern> {
+  bits<32> enc;
+
+  let Inst{31-24} = enc{31-24};
+}
+
+class DirectiveInsnSIY<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSIY<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{7-0}   = enc{7-0};
+}
+
+class DirectiveInsnSIL<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSIL<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-32} = enc{47-32};
+}
+
+class DirectiveInsnSS<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSSd<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+}
+
+class DirectiveInsnSSE<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSSE<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-32} = enc{47-32};
+}
+
+class DirectiveInsnSSF<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSSF<0, outs, ins, asmstr, pattern> {
+  bits<48> enc;
+
+  let Inst{47-40} = enc{47-40};
+  let Inst{35-32} = enc{35-32};
+}
+
+//===----------------------------------------------------------------------===//
+// Variants of instructions with condition mask
+//===----------------------------------------------------------------------===//
+//
+// For instructions using a condition mask (e.g. conditional branches,
+// compare-and-branch instructions, or conditional move instructions),
+// we generally need to create multiple instruction patterns:
+//
+// - One used for code generation, which encodes the condition mask as an
+//   MI operand, but writes out an extended mnemonic for better readability.
+// - One pattern for the base form of the instruction with an explicit
+//   condition mask (encoded as a plain integer MI operand).
+// - Specific patterns for each extended mnemonic, where the condition mask
+//   is implied by the pattern name and not otherwise encoded at all.
+//
+// We need the latter primarily for the assembler and disassembler, since the
+// assembler parser is not able to decode part of an instruction mnemonic
+// into an operand.  Thus we provide separate patterns for each mnemonic.
+//
+// Note that in some cases there are two different mnemonics for the same
+// condition mask.  In this case we cannot have both instructions available
+// to the disassembler at the same time since the encodings are not distinct.
+// Therefore the alternate forms are marked isAsmParserOnly.
+//
+// We don't make one of the two names an alias of the other because
+// we need the custom parsing routines to select the correct register class.
+//
+// This section provides helpers for generating the specific forms.
+//
+//===----------------------------------------------------------------------===//
+
+// A class to describe a variant of an instruction with condition mask.
+class CondVariant<bits<4> ccmaskin, string suffixin, bit alternatein> {
+  // The fixed condition mask to use.
+  bits<4> ccmask = ccmaskin;
+
+  // The suffix to use for the extended assembler mnemonic.
+  string suffix = suffixin;
+
+  // Whether this is an alternate that needs to be marked isAsmParserOnly.
+  bit alternate = alternatein;
+}
+
+// Condition mask 15 means "always true", which is used to define
+// unconditional branches as a variant of conditional branches.
+def CondAlways : CondVariant<15, "", 0>;
+
+// Condition masks for general instructions that can set all 4 bits.
+def CondVariantO   : CondVariant<1,  "o",   0>;
+def CondVariantH   : CondVariant<2,  "h",   0>;
+def CondVariantP   : CondVariant<2,  "p",   1>;
+def CondVariantNLE : CondVariant<3,  "nle", 0>;
+def CondVariantL   : CondVariant<4,  "l",   0>;
+def CondVariantM   : CondVariant<4,  "m",   1>;
+def CondVariantNHE : CondVariant<5,  "nhe", 0>;
+def CondVariantLH  : CondVariant<6,  "lh",  0>;
+def CondVariantNE  : CondVariant<7,  "ne",  0>;
+def CondVariantNZ  : CondVariant<7,  "nz",  1>;
+def CondVariantE   : CondVariant<8,  "e",   0>;
+def CondVariantZ   : CondVariant<8,  "z",   1>;
+def CondVariantNLH : CondVariant<9,  "nlh", 0>;
+def CondVariantHE  : CondVariant<10, "he",  0>;
+def CondVariantNL  : CondVariant<11, "nl",  0>;
+def CondVariantNM  : CondVariant<11, "nm",  1>;
+def CondVariantLE  : CondVariant<12, "le",  0>;
+def CondVariantNH  : CondVariant<13, "nh",  0>;
+def CondVariantNP  : CondVariant<13, "np",  1>;
+def CondVariantNO  : CondVariant<14, "no",  0>;
+
+// A helper class to look up one of the above by name.
+class CV<string name>
+  : CondVariant<!cast<CondVariant>("CondVariant"#name).ccmask,
+                !cast<CondVariant>("CondVariant"#name).suffix,
+                !cast<CondVariant>("CondVariant"#name).alternate>;
+
+// Condition masks for integer instructions (e.g. compare-and-branch).
+// This is like the list above, except that condition 3 is not possible
+// and that the low bit of the mask is therefore always 0.  This means
+// that each condition has two names.  Conditions "o" and "no" are not used.
+def IntCondVariantH   : CondVariant<2,  "h",   0>;
+def IntCondVariantNLE : CondVariant<2,  "nle", 1>;
+def IntCondVariantL   : CondVariant<4,  "l",   0>;
+def IntCondVariantNHE : CondVariant<4,  "nhe", 1>;
+def IntCondVariantLH  : CondVariant<6,  "lh",  0>;
+def IntCondVariantNE  : CondVariant<6,  "ne",  1>;
+def IntCondVariantE   : CondVariant<8,  "e",   0>;
+def IntCondVariantNLH : CondVariant<8,  "nlh", 1>;
+def IntCondVariantHE  : CondVariant<10, "he",  0>;
+def IntCondVariantNL  : CondVariant<10, "nl",  1>;
+def IntCondVariantLE  : CondVariant<12, "le",  0>;
+def IntCondVariantNH  : CondVariant<12, "nh",  1>;
+
+// A helper class to look up one of the above by name.
+class ICV<string name>
+  : CondVariant<!cast<CondVariant>("IntCondVariant"#name).ccmask,
+                !cast<CondVariant>("IntCondVariant"#name).suffix,
+                !cast<CondVariant>("IntCondVariant"#name).alternate>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions with semantics
+//===----------------------------------------------------------------------===//
+//
+// These classes have the form [Cond]<Category><Format>, where <Format> is one
+// of the formats defined above and where <Category> describes the inputs
+// and outputs.  "Cond" is used if the instruction is conditional,
+// in which case the 4-bit condition-code mask is added as a final operand.
+// <Category> can be one of:
+//
+//   Inherent:
+//     One register output operand and no input operands.
+//
+//   StoreInherent:
+//     One address operand.  The instruction stores to the address.
+//
+//   SideEffectInherent:
+//     No input or output operands, but causes some side effect.
+//
+//   Branch:
+//     One branch target.  The instruction branches to the target.
+//
+//   Call:
+//     One output operand and one branch target.  The instruction stores
+//     the return address to the output operand and branches to the target.
+//
+//   CmpBranch:
+//     Two input operands and one optional branch target.  The instruction
+//     compares the two input operands and branches or traps on the result.
+//
+//   BranchUnary:
+//     One register output operand, one register input operand and one branch
+//     target.  The instructions stores a modified form of the source register
+//     in the destination register and branches on the result.
+//
+//   BranchBinary:
+//     One register output operand, two register input operands and one branch
+//     target. The instructions stores a modified form of one of the source
+//     registers in the destination register and branches on the result.
+//
+//   LoadMultiple:
+//     One address input operand and two explicit output operands.
+//     The instruction loads a range of registers from the address,
+//     with the explicit operands giving the first and last register
+//     to load.  Other loaded registers are added as implicit definitions.
+//
+//   StoreMultiple:
+//     Two explicit input register operands and an address operand.
+//     The instruction stores a range of registers to the address,
+//     with the explicit operands giving the first and last register
+//     to store.  Other stored registers are added as implicit uses.
+//
+//   StoreLength:
+//     One value operand, one length operand and one address operand.
+//     The instruction stores the value operand to the address but
+//     doesn't write more than the number of bytes specified by the
+//     length operand.
+//
+//   LoadAddress:
+//     One register output operand and one address operand.
+//
+//   SideEffectAddress:
+//     One address operand.  No output operands, but causes some side effect.
+//
+//   Unary:
+//     One register output operand and one input operand.
+//
+//   Store:
+//     One address operand and one other input operand.  The instruction
+//     stores to the address.
+//
+//   SideEffectUnary:
+//     One input operand.  No output operands, but causes some side effect.
+//
+//   Binary:
+//     One register output operand and two input operands.
+//
+//   StoreBinary:
+//     One address operand and two other input operands.  The instruction
+//     stores to the address.
+//
+//   SideEffectBinary:
+//     Two input operands.  No output operands, but causes some side effect.
+//
+//   Compare:
+//     Two input operands and an implicit CC output operand.
+//
+//   Test:
+//     Two input operands and an implicit CC output operand.  The second
+//     input operand is an "address" operand used as a test class mask.
+//
+//   Ternary:
+//     One register output operand and three input operands.
+//
+//   SideEffectTernary:
+//     Three input operands.  No output operands, but causes some side effect.
+//
+//   Quaternary:
+//     One register output operand and four input operands.
+//
+//   LoadAndOp:
+//     One output operand and two input operands, one of which is an address.
+//     The instruction both reads from and writes to the address.
+//
+//   CmpSwap:
+//     One output operand and three input operands, one of which is an address.
+//     The instruction both reads from and writes to the address.
+//
+//   RotateSelect:
+//     One output operand and five input operands.  The first two operands
+//     are registers and the other three are immediates.
+//
+//   Prefetch:
+//     One 4-bit immediate operand and one address operand.  The immediate
+//     operand is 1 for a load prefetch and 2 for a store prefetch.
+//
+//   BranchPreload:
+//     One 4-bit immediate operand and two address operands.
+//
+// The format determines which input operands are tied to output operands,
+// and also determines the shape of any address operand.
+//
+// Multiclasses of the form <Category><Format>Pair define two instructions,
+// one with <Category><Format> and one with <Category><Format>Y.  The name
+// of the first instruction has no suffix, the name of the second has
+// an extra "y".
+//
+//===----------------------------------------------------------------------===//
+
+class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                  SDPatternOperator operator>
+  : InstRRE<opcode, (outs cls:$R1), (ins),
+            mnemonic#"\t$R1",
+            [(set cls:$R1, (operator))]> {
+  let R2 = 0;
+}
+
+class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
+  : InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> {
+  let I2 = value;
+  let M3 = 0;
+}
+
+class StoreInherentS<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator, bits<5> bytes>
+  : InstS<opcode, (outs), (ins bdaddr12only:$BD2),
+          mnemonic#"\t$BD2", [(operator bdaddr12only:$BD2)]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class SideEffectInherentE<string mnemonic, bits<16>opcode>
+  : InstE<opcode, (outs), (ins), mnemonic, []>;
+
+class SideEffectInherentS<string mnemonic, bits<16> opcode,
+                          SDPatternOperator operator>
+  : InstS<opcode, (outs), (ins), mnemonic, [(operator)]> {
+  let BD2 = 0;
+}
+
+// Allow an optional TLS marker symbol to generate TLS call relocations.
+class CallRI<string mnemonic, bits<12> opcode>
+  : InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2),
+            mnemonic#"\t$R1, $RI2", []>;
+
+// Allow an optional TLS marker symbol to generate TLS call relocations.
+class CallRIL<string mnemonic, bits<12> opcode>
+  : InstRILb<opcode, (outs), (ins GR64:$R1, brtarget32tls:$RI2),
+             mnemonic#"\t$R1, $RI2", []>;
+
+class CallRR<string mnemonic, bits<8> opcode>
+  : InstRR<opcode, (outs), (ins GR64:$R1, ADDR64:$R2),
+           mnemonic#"\t$R1, $R2", []>;
+
+class CallRX<string mnemonic, bits<8> opcode>
+  : InstRXa<opcode, (outs), (ins GR64:$R1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2", []>;
+
+class CondBranchRI<string mnemonic, bits<12> opcode,
+                   SDPatternOperator operator = null_frag>
+  : InstRIc<opcode, (outs), (ins cond4:$valid, cond4:$M1, brtarget16:$RI2),
+            !subst("#", "${M1}", mnemonic)#"\t$RI2",
+            [(operator cond4:$valid, cond4:$M1, bb:$RI2)]> {
+  let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRI<string mnemonic, bits<12> opcode>
+  : InstRIc<opcode, (outs), (ins imm32zx4:$M1, brtarget16:$RI2),
+            mnemonic#"\t$M1, $RI2", []>;
+
+class FixedCondBranchRI<CondVariant V, string mnemonic, bits<12> opcode,
+                        SDPatternOperator operator = null_frag>
+  : InstRIc<opcode, (outs), (ins brtarget16:$RI2),
+            !subst("#", V.suffix, mnemonic)#"\t$RI2", [(operator bb:$RI2)]> {
+  let isAsmParserOnly = V.alternate;
+  let M1 = V.ccmask;
+}
+
+class CondBranchRIL<string mnemonic, bits<12> opcode>
+  : InstRILc<opcode, (outs), (ins cond4:$valid, cond4:$M1, brtarget32:$RI2),
+             !subst("#", "${M1}", mnemonic)#"\t$RI2", []> {
+  let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRIL<string mnemonic, bits<12> opcode>
+  : InstRILc<opcode, (outs), (ins imm32zx4:$M1, brtarget32:$RI2),
+             mnemonic#"\t$M1, $RI2", []>;
+
+class FixedCondBranchRIL<CondVariant V, string mnemonic, bits<12> opcode>
+  : InstRILc<opcode, (outs), (ins brtarget32:$RI2),
+             !subst("#", V.suffix, mnemonic)#"\t$RI2", []> {
+  let isAsmParserOnly = V.alternate;
+  let M1 = V.ccmask;
+}
+
+class CondBranchRR<string mnemonic, bits<8> opcode>
+  : InstRR<opcode, (outs), (ins cond4:$valid, cond4:$R1, GR64:$R2),
+           !subst("#", "${R1}", mnemonic)#"\t$R2", []> {
+  let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRR<string mnemonic, bits<8> opcode>
+  : InstRR<opcode, (outs), (ins imm32zx4:$R1, GR64:$R2),
+           mnemonic#"\t$R1, $R2", []>;
+
+class FixedCondBranchRR<CondVariant V, string mnemonic, bits<8> opcode,
+                      SDPatternOperator operator = null_frag>
+  : InstRR<opcode, (outs), (ins ADDR64:$R2),
+           !subst("#", V.suffix, mnemonic)#"\t$R2", [(operator ADDR64:$R2)]> {
+  let isAsmParserOnly = V.alternate;
+  let R1 = V.ccmask;
+}
+
+class CondBranchRX<string mnemonic, bits<8> opcode>
+  : InstRXb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr12only:$XBD2),
+            !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> {
+  let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRX<string mnemonic, bits<8> opcode>
+  : InstRXb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$M1, $XBD2", []>;
+
+class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode>
+  : InstRXb<opcode, (outs), (ins bdxaddr12only:$XBD2),
+            !subst("#", V.suffix, mnemonic)#"\t$XBD2", []> {
+  let isAsmParserOnly = V.alternate;
+  let M1 = V.ccmask;
+}
+
+class CmpBranchRIEa<string mnemonic, bits<16> opcode,
+                    RegisterOperand cls, Immediate imm>
+  : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3),
+             mnemonic#"$M3\t$R1, $I2", []>;
+
+class AsmCmpBranchRIEa<string mnemonic, bits<16> opcode,
+                       RegisterOperand cls, Immediate imm>
+  : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $I2, $M3", []>;
+
+class FixedCmpBranchRIEa<CondVariant V, string mnemonic, bits<16> opcode,
+                          RegisterOperand cls, Immediate imm>
+  : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2),
+             mnemonic#V.suffix#"\t$R1, $I2", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CmpBranchRIEaPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls, Immediate imm> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRIEa<mnemonic, opcode, cls, imm>;
+  def Asm : AsmCmpBranchRIEa<mnemonic, opcode, cls, imm>;
+}
+
+class CmpBranchRIEb<string mnemonic, bits<16> opcode,
+                    RegisterOperand cls>
+  : InstRIEb<opcode, (outs),
+             (ins cls:$R1, cls:$R2, cond4:$M3, brtarget16:$RI4),
+             mnemonic#"$M3\t$R1, $R2, $RI4", []>;
+
+class AsmCmpBranchRIEb<string mnemonic, bits<16> opcode,
+                       RegisterOperand cls>
+  : InstRIEb<opcode, (outs),
+             (ins cls:$R1, cls:$R2, imm32zx4:$M3, brtarget16:$RI4),
+             mnemonic#"\t$R1, $R2, $M3, $RI4", []>;
+
+class FixedCmpBranchRIEb<CondVariant V, string mnemonic, bits<16> opcode,
+                         RegisterOperand cls>
+  : InstRIEb<opcode, (outs), (ins cls:$R1, cls:$R2, brtarget16:$RI4),
+             mnemonic#V.suffix#"\t$R1, $R2, $RI4", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CmpBranchRIEbPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRIEb<mnemonic, opcode, cls>;
+  def Asm : AsmCmpBranchRIEb<mnemonic, opcode, cls>;
+}
+
+class CmpBranchRIEc<string mnemonic, bits<16> opcode,
+                    RegisterOperand cls, Immediate imm>
+  : InstRIEc<opcode, (outs),
+             (ins cls:$R1, imm:$I2, cond4:$M3, brtarget16:$RI4),
+             mnemonic#"$M3\t$R1, $I2, $RI4", []>;
+
+class AsmCmpBranchRIEc<string mnemonic, bits<16> opcode,
+                       RegisterOperand cls, Immediate imm>
+  : InstRIEc<opcode, (outs),
+             (ins cls:$R1, imm:$I2, imm32zx4:$M3, brtarget16:$RI4),
+             mnemonic#"\t$R1, $I2, $M3, $RI4", []>;
+
+class FixedCmpBranchRIEc<CondVariant V, string mnemonic, bits<16> opcode,
+                         RegisterOperand cls, Immediate imm>
+  : InstRIEc<opcode, (outs), (ins cls:$R1, imm:$I2, brtarget16:$RI4),
+             mnemonic#V.suffix#"\t$R1, $I2, $RI4", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CmpBranchRIEcPair<string mnemonic, bits<16> opcode,
+                            RegisterOperand cls, Immediate imm> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRIEc<mnemonic, opcode, cls, imm>;
+  def Asm : AsmCmpBranchRIEc<mnemonic, opcode, cls, imm>;
+}
+
+class CmpBranchRRFc<string mnemonic, bits<16> opcode,
+                    RegisterOperand cls>
+  : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2, cond4:$M3),
+             mnemonic#"$M3\t$R1, $R2", []>;
+
+class AsmCmpBranchRRFc<string mnemonic, bits<16> opcode,
+                       RegisterOperand cls>
+  : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $R2, $M3", []>;
+
+multiclass CmpBranchRRFcPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRRFc<mnemonic, opcode, cls>;
+  def Asm : AsmCmpBranchRRFc<mnemonic, opcode, cls>;
+}
+
+class FixedCmpBranchRRFc<CondVariant V, string mnemonic, bits<16> opcode,
+                          RegisterOperand cls>
+  : InstRRFc<opcode, (outs), (ins cls:$R1, cls:$R2),
+             mnemonic#V.suffix#"\t$R1, $R2", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+class CmpBranchRRS<string mnemonic, bits<16> opcode,
+                   RegisterOperand cls>
+  : InstRRS<opcode, (outs),
+            (ins cls:$R1, cls:$R2, cond4:$M3, bdaddr12only:$BD4),
+            mnemonic#"$M3\t$R1, $R2, $BD4", []>;
+
+class AsmCmpBranchRRS<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls>
+  : InstRRS<opcode, (outs),
+            (ins cls:$R1, cls:$R2, imm32zx4:$M3, bdaddr12only:$BD4),
+            mnemonic#"\t$R1, $R2, $M3, $BD4", []>;
+
+class FixedCmpBranchRRS<CondVariant V, string mnemonic, bits<16> opcode,
+                        RegisterOperand cls>
+  : InstRRS<opcode, (outs), (ins cls:$R1, cls:$R2, bdaddr12only:$BD4),
+            mnemonic#V.suffix#"\t$R1, $R2, $BD4", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CmpBranchRRSPair<string mnemonic, bits<16> opcode,
+                            RegisterOperand cls> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRRS<mnemonic, opcode, cls>;
+  def Asm : AsmCmpBranchRRS<mnemonic, opcode, cls>;
+}
+
+class CmpBranchRIS<string mnemonic, bits<16> opcode,
+                   RegisterOperand cls, Immediate imm>
+  : InstRIS<opcode, (outs),
+            (ins cls:$R1, imm:$I2, cond4:$M3, bdaddr12only:$BD4),
+            mnemonic#"$M3\t$R1, $I2, $BD4", []>;
+
+class AsmCmpBranchRIS<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, Immediate imm>
+  : InstRIS<opcode, (outs),
+            (ins cls:$R1, imm:$I2, imm32zx4:$M3, bdaddr12only:$BD4),
+            mnemonic#"\t$R1, $I2, $M3, $BD4", []>;
+
+class FixedCmpBranchRIS<CondVariant V, string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, Immediate imm>
+  : InstRIS<opcode, (outs), (ins cls:$R1, imm:$I2, bdaddr12only:$BD4),
+            mnemonic#V.suffix#"\t$R1, $I2, $BD4", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CmpBranchRISPair<string mnemonic, bits<16> opcode,
+                            RegisterOperand cls, Immediate imm> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRIS<mnemonic, opcode, cls, imm>;
+  def Asm : AsmCmpBranchRIS<mnemonic, opcode, cls, imm>;
+}
+
+class CmpBranchRSYb<string mnemonic, bits<16> opcode,
+                    RegisterOperand cls>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2, cond4:$M3),
+             mnemonic#"$M3\t$R1, $BD2", []>;
+
+class AsmCmpBranchRSYb<string mnemonic, bits<16> opcode,
+                       RegisterOperand cls>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $M3, $BD2", []>;
+
+multiclass CmpBranchRSYbPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls> {
+  let isCodeGenOnly = 1 in
+    def "" : CmpBranchRSYb<mnemonic, opcode, cls>;
+  def Asm : AsmCmpBranchRSYb<mnemonic, opcode, cls>;
+}
+
+class FixedCmpBranchRSYb<CondVariant V, string mnemonic, bits<16> opcode,
+                          RegisterOperand cls>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, bdaddr20only:$BD2),
+             mnemonic#V.suffix#"\t$R1, $BD2", []> {
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2),
+            mnemonic##"\t$R1, $RI2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2),
+             mnemonic##"\t$R1, $RI2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls>
+  : InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
+           mnemonic##"\t$R1, $R2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
+            mnemonic##"\t$R1, $R2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
+  : InstRXa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+            mnemonic##"\t$R1, $XBD2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRXYa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr20only:$XBD2),
+             mnemonic##"\t$R1, $XBD2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
+  : InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
+            mnemonic##"\t$R1, $R3, $RI2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRIEe<opcode, (outs cls:$R1),
+             (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
+             mnemonic##"\t$R1, $R3, $RI2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
+  : InstRSa<opcode, (outs cls:$R1),
+            (ins cls:$R1src, cls:$R3, bdaddr12only:$BD2),
+            mnemonic##"\t$R1, $R3, $BD2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRSYa<opcode,
+             (outs cls:$R1), (ins cls:$R1src, cls:$R3, bdaddr20only:$BD2),
+             mnemonic##"\t$R1, $R3, $BD2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class LoadMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+                     AddressingMode mode = bdaddr12only>
+  : InstRSa<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayLoad = 1;
+}
+
+class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSYa<opcode, (outs cls:$R1, cls:$R3), (ins mode:$BD2),
+             mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayLoad = 1;
+}
+
+multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+                              bits<16> rsyOpcode, RegisterOperand cls> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : LoadMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : LoadMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+  }
+}
+
+class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
+  : InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
+             mnemonic#"\t$V1, $V3, $BD2", []> {
+  let M4 = 0;
+  let mayLoad = 1;
+}
+
+class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls>
+  : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
+             mnemonic#"\t$R1, $RI2",
+             [(operator cls:$R1, pcrel32:$RI2)]> {
+  let mayStore = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class StoreRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+             mnemonic#"\t$R1, $XBD2",
+             [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
+  }
+}
+
+class StoreVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               TypedReg tr, bits<5> bytes, bits<4> type = 0>
+  : InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$V1, $XBD2",
+            [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+  let M3 = type;
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreLengthVRSb<string mnemonic, bits<16> opcode,
+                      SDPatternOperator operator, bits<5> bytes>
+  : InstVRSb<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2),
+             mnemonic#"\t$V1, $R3, $BD2",
+             [(operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2)]> {
+  let M4 = 0;
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+                      AddressingMode mode = bdaddr12only>
+  : InstRSa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayStore = 1;
+}
+
+class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                       AddressingMode mode = bdaddr20only>
+  : InstRSYa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
+             mnemonic#"\t$R1, $R3, $BD2", []> {
+  let mayStore = 1;
+}
+
+multiclass StoreMultipleRSPair<string mnemonic, bits<8> rsOpcode,
+                               bits<16> rsyOpcode, RegisterOperand cls> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : StoreMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : StoreMultipleRSY<mnemonic#"y", rsyOpcode, cls, bdaddr20pair>;
+  }
+}
+
+class StoreMultipleVRSa<string mnemonic, bits<16> opcode>
+  : InstVRSa<opcode, (outs), (ins VR128:$V1, VR128:$V3, bdaddr12only:$BD2),
+             mnemonic#"\t$V1, $V3, $BD2", []> {
+  let M4 = 0;
+  let mayStore = 1;
+}
+
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores.  If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
+class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              Immediate imm>
+  : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               Immediate imm>
+  : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr20pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               Immediate imm>
+  : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr12pair:$BD1)]> {
+  let mayStore = 1;
+}
+
+multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+                       SDPatternOperator operator, Immediate imm> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
+    let DispSize = "20" in
+      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
+  }
+}
+
+class StoreSSE<string mnemonic, bits<16> opcode>
+  : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2),
+            mnemonic#"\t$BD1, $BD2", []> {
+  let mayStore = 1;
+}
+
+class CondStoreRSY<string mnemonic, bits<16> opcode,
+                   RegisterOperand cls, bits<5> bytes,
+                   AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$M3),
+            mnemonic#"$M3\t$R1, $BD2", []> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondStoreRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $BD2, $M3", []> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+// Like CondStoreRSY, but with a fixed CC mask.
+class FixedCondStoreRSY<CondVariant V, string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs), (ins cls:$R1, mode:$BD2),
+             mnemonic#V.suffix#"\t$R1, $BD2", []> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CondStoreRSYPair<string mnemonic, bits<16> opcode,
+                            RegisterOperand cls, bits<5> bytes,
+                            AddressingMode mode = bdaddr20only> {
+  let isCodeGenOnly = 1 in
+    def "" : CondStoreRSY<mnemonic, opcode, cls, bytes, mode>;
+  def Asm : AsmCondStoreRSY<mnemonic, opcode, cls, bytes, mode>;
+}
+
+class SideEffectUnaryI<string mnemonic, bits<8> opcode, Immediate imm>
+  : InstI<opcode, (outs), (ins imm:$I1),
+          mnemonic#"\t$I1", []>;
+
+class SideEffectUnaryRR<string mnemonic, bits<8>opcode, RegisterOperand cls>
+  : InstRR<opcode, (outs), (ins cls:$R1),
+           mnemonic#"\t$R1", []> {
+  let R2 = 0;
+}
+
+class SideEffectUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                         SDPatternOperator operator>
+  : InstRRE<opcode, (outs), (ins cls:$R1),
+            mnemonic#"\t$R1", [(operator cls:$R1)]> {
+  let R2 = 0;
+}
+
+class SideEffectUnaryS<string mnemonic, bits<16> opcode,
+                       SDPatternOperator operator, bits<5> bytes,
+                       AddressingMode mode = bdaddr12only>
+  : InstS<opcode, (outs), (ins mode:$BD2),
+          mnemonic#"\t$BD2", [(operator mode:$BD2)]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class SideEffectAddressS<string mnemonic, bits<16> opcode,
+                        SDPatternOperator operator,
+                        AddressingMode mode = bdaddr12only>
+  : InstS<opcode, (outs), (ins mode:$BD2),
+          mnemonic#"\t$BD2", [(operator mode:$BD2)]>;
+
+class LoadAddressRX<string mnemonic, bits<8> opcode,
+                    SDPatternOperator operator, AddressingMode mode>
+  : InstRXa<opcode, (outs GR64:$R1), (ins mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set GR64:$R1, (operator mode:$XBD2))]>;
+
+class LoadAddressRXY<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator, AddressingMode mode>
+  : InstRXYa<opcode, (outs GR64:$R1), (ins mode:$XBD2),
+             mnemonic#"\t$R1, $XBD2",
+             [(set GR64:$R1, (operator mode:$XBD2))]>;
+
+multiclass LoadAddressRXPair<string mnemonic, bits<8> rxOpcode,
+                             bits<16> rxyOpcode, SDPatternOperator operator> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : LoadAddressRX<mnemonic, rxOpcode, operator, laaddr12pair>;
+    let DispSize = "20" in
+      def Y  : LoadAddressRXY<mnemonic#"y", rxyOpcode, operator, laaddr20pair>;
+  }
+}
+
+class LoadAddressRIL<string mnemonic, bits<12> opcode,
+                     SDPatternOperator operator>
+  : InstRILb<opcode, (outs GR64:$R1), (ins pcrel32:$RI2),
+             mnemonic#"\t$R1, $RI2",
+             [(set GR64:$R1, (operator pcrel32:$RI2))]>;
+
+class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2),
+           mnemonic#"\t$R1, $R2",
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+}
+
+class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls2:$R2),
+            mnemonic#"\t$R1, $R2",
+            [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+}
+
+class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+              RegisterOperand cls, Immediate imm>
+  : InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+               RegisterOperand cls, Immediate imm>
+  : InstRILa<opcode, (outs cls:$R1), (ins imm:$I2),
+             mnemonic#"\t$R1, $I2",
+             [(set cls:$R1, (operator imm:$I2))]>;
+
+class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls>
+  : InstRILb<opcode, (outs cls:$R1), (ins pcrel32:$RI2),
+             mnemonic#"\t$R1, $RI2",
+             [(set cls:$R1, (operator pcrel32:$RI2))]> {
+  let mayLoad = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class CondUnaryRSY<string mnemonic, bits<16> opcode,
+                   SDPatternOperator operator, RegisterOperand cls,
+                   bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs cls:$R1),
+             (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$M3),
+             mnemonic#"$M3\t$R1, $BD2",
+             [(set cls:$R1,
+                   (z_select_ccmask (operator bdaddr20only:$BD2), cls:$R1src,
+                                    cond4:$valid, cond4:$M3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $BD2, $M3", []> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Like CondUnaryRSY, but with a fixed CC mask.
+class FixedCondUnaryRSY<CondVariant V, string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2),
+             mnemonic#V.suffix#"\t$R1, $BD2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CondUnaryRSYPair<string mnemonic, bits<16> opcode,
+                            SDPatternOperator operator,
+                            RegisterOperand cls, bits<5> bytes,
+                            AddressingMode mode = bdaddr20only> {
+  let isCodeGenOnly = 1 in
+    def "" : CondUnaryRSY<mnemonic, opcode, operator, cls, bytes, mode>;
+  def Asm : AsmCondUnaryRSY<mnemonic, opcode, cls, bytes, mode>;
+}
+
+
+class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode, (outs cls:$R1), (ins mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class UnaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let M3 = 0;
+}
+
+class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode, (outs cls:$R1), (ins mode:$XBD2),
+             mnemonic#"\t$R1, $XBD2",
+             [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
+  }
+}
+
+class UnaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                TypedReg tr, Immediate imm, bits<4> type = 0>
+  : InstVRIa<opcode, (outs tr.op:$V1), (ins imm:$I2),
+             mnemonic#"\t$V1, $I2",
+             [(set tr.op:$V1, (tr.vt (operator imm:$I2)))]> {
+  let M3 = type;
+}
+
+class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, Immediate imm>
+  : InstVRIa<opcode, (outs VR128:$V1), (ins imm:$I2, imm32zx4:$M3),
+             mnemonic#"\t$V1, $I2, $M3", []>;
+
+class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0,
+                bits<4> m5 = 0>
+  : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
+             mnemonic#"\t$V1, $V2",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]> {
+  let M3 = type;
+  let M4 = m4;
+  let M5 = m5;
+}
+
+class UnaryVRRaGeneric<string mnemonic, bits<16> opcode, bits<4> m4 = 0,
+                       bits<4> m5 = 0>
+  : InstVRRa<opcode, (outs VR128:$V1), (ins VR128:$V2, imm32zx4:$M3),
+             mnemonic#"\t$V1, $V2, $M3", []> {
+  let M4 = m4;
+  let M5 = m5;
+}
+
+class UnaryVRRaFloatGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0>
+  : InstVRRa<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V2, $M3, $M4", []> {
+  let M5 = m5;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+// The form that does not set CC has an extra operand to optionally allow
+// specifying arbitrary M5 values in assembler.
+multiclass UnaryExtraVRRaSPair<string mnemonic, bits<16> opcode,
+                               SDPatternOperator operator,
+                               SDPatternOperator operator_cc,
+                               TypedReg tr1, TypedReg tr2, bits<4> type> {
+  let M3 = type, M4 = 0 in
+    def "" : InstVRRa<opcode, (outs tr1.op:$V1),
+                      (ins tr2.op:$V2, imm32zx4:$M5),
+                      mnemonic#"\t$V1, $V2, $M5", []>;
+  def : Pat<(tr1.vt (operator (tr2.vt tr2.op:$V2))),
+            (!cast<Instruction>(NAME) tr2.op:$V2, 0)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, 0)>;
+  let Defs = [CC] in
+    def S : UnaryVRRa<mnemonic##"s", opcode, operator_cc, tr1, tr2,
+                      type, 0, 1>;
+}
+
+multiclass UnaryExtraVRRaSPairGeneric<string mnemonic, bits<16> opcode> {
+  let M4 = 0 in
+    def "" : InstVRRa<opcode, (outs VR128:$V1),
+                     (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M5),
+                     mnemonic#"\t$V1, $V2, $M3, $M5", []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $M3",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2,
+                                            imm32zx4:$M3, 0)>;
+}
+
+class UnaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+               TypedReg tr, bits<5> bytes, bits<4> type = 0>
+  : InstVRX<opcode, (outs tr.op:$V1), (ins bdxaddr12only:$XBD2),
+            mnemonic#"\t$V1, $XBD2",
+            [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2)))]> {
+  let M3 = type;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class UnaryVRXGeneric<string mnemonic, bits<16> opcode>
+  : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+            mnemonic#"\t$V1, $XBD2, $M3", []> {
+  let mayLoad = 1;
+}
+
+class SideEffectBinaryRX<string mnemonic, bits<8> opcode,
+                         RegisterOperand cls>
+  : InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+            mnemonic##"\t$R1, $XBD2", []>;
+
+class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
+                            RegisterOperand cls>
+  : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
+             mnemonic##"\t$R1, $RI2", []> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
+                         Immediate imm1, Immediate imm2>
+  : InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
+           mnemonic#"\t$I1, $I2", []>;
+
+class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
+                          SDPatternOperator operator, Immediate imm>
+  : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+
+class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+           mnemonic#"\t$R1, $R2",
+           [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+            mnemonic#"\t$R1, $R2",
+            [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2,
+                 RegisterOperand cls3>
+  : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3),
+             mnemonic#"\t$R1, $R2, $R3",
+             [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
+  let M4 = 0;
+}
+
+multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls1,
+                        RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
+
+multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2,
+                         SDPatternOperator operator, RegisterOperand cls1,
+                         RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
+
+class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2,
+                 RegisterOperand cls3>
+  : InstRRFb<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3),
+             mnemonic#"\t$R1, $R3, $R2",
+             [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
+  let M4 = 0;
+}
+
+class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                RegisterOperand cls2>
+  : InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
+             mnemonic#"\t$R1, $M3, $R2", []> {
+  let M4 = 0;
+}
+
+class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                   RegisterOperand cls2>
+  : InstRRFc<opcode, (outs cls1:$R1),
+             (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3),
+             mnemonic#"$M3\t$R1, $R2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let CCMaskLast = 1;
+}
+
+// Like CondBinaryRRF, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                       RegisterOperand cls2>
+  : InstRRFc<opcode, (outs cls1:$R1),
+             (ins cls1:$R1src, cls2:$R2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $R2, $M3", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Like CondBinaryRRF, but with a fixed CC mask.
+class FixedCondBinaryRRF<CondVariant V, string mnemonic, bits<16> opcode,
+                         RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRFc<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+             mnemonic#V.suffix#"\t$R1, $R2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CondBinaryRRFPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls1, RegisterOperand cls2> {
+  let isCodeGenOnly = 1 in
+    def "" : CondBinaryRRF<mnemonic, opcode, cls1, cls2>;
+  def Asm : AsmCondBinaryRRF<mnemonic, opcode, cls1, cls2>;
+}
+
+class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+               RegisterOperand cls, Immediate imm>
+  : InstRIa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRIEd<opcode, (outs cls:$R1), (ins cls:$R3, imm:$I2),
+             mnemonic#"\t$R1, $R3, $I2",
+             [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls,
+                        Immediate imm> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
+  }
+}
+
+class CondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                    Immediate imm>
+  : InstRIEg<opcode, (outs cls:$R1),
+             (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
+             mnemonic#"$M3\t$R1, $I2",
+             [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
+                                             cond4:$valid, cond4:$M3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let CCMaskLast = 1;
+}
+
+// Like CondBinaryRIE, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondBinaryRIE<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                       Immediate imm>
+  : InstRIEg<opcode, (outs cls:$R1),
+             (ins cls:$R1src, imm:$I2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $I2, $M3", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Like CondBinaryRIE, but with a fixed CC mask.
+class FixedCondBinaryRIE<CondVariant V, string mnemonic, bits<16> opcode,
+                         RegisterOperand cls, Immediate imm>
+  : InstRIEg<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+             mnemonic#V.suffix#"\t$R1, $I2", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let isAsmParserOnly = V.alternate;
+  let M3 = V.ccmask;
+}
+
+multiclass CondBinaryRIEPair<string mnemonic, bits<16> opcode,
+                             RegisterOperand cls, Immediate imm> {
+  let isCodeGenOnly = 1 in
+    def "" : CondBinaryRIE<mnemonic, opcode, cls, imm>;
+  def Asm : AsmCondBinaryRIE<mnemonic, opcode, cls, imm>;
+}
+
+class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRILa<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+             mnemonic#"\t$R1, $I2",
+             [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls>
+  : InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2),
+            mnemonic#"\t$R1, $BD2",
+            [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
+  let R3 = 0;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls>
+  : InstRSYa<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2),
+             mnemonic#"\t$R1, $R3, $BD2",
+             [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
+
+multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K  : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+               Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
+  }
+}
+
+class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+               AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let M3 = 0;
+}
+
+class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+             mnemonic#"\t$R1, $XBD2",
+             [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                        SDPatternOperator operator, RegisterOperand cls,
+                        SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+                        bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load, bytes,
+                         bdxaddr20pair>;
+  }
+}
+
+class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+               Operand imm, AddressingMode mode = bdaddr12only>
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                Operand imm, AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
+                        bits<16> siyOpcode, SDPatternOperator operator,
+                        Operand imm> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : BinarySIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+  }
+}
+
+class BinarySSF<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstSSF<opcode, (outs cls:$R3), (ins bdaddr12pair:$BD1, bdaddr12pair:$BD2),
+            mnemonic#"\t$R3, $BD1, $BD2", []> {
+  let mayLoad = 1;
+}
+
+class BinaryVRIb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr, bits<4> type>
+  : InstVRIb<opcode, (outs tr.op:$V1), (ins imm32zx8:$I2, imm32zx8:$I3),
+             mnemonic#"\t$V1, $I2, $I3",
+             [(set tr.op:$V1, (tr.vt (operator imm32zx8:$I2, imm32zx8:$I3)))]> {
+  let M4 = type;
+}
+
+class BinaryVRIbGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIb<opcode, (outs VR128:$V1),
+             (ins imm32zx8:$I2, imm32zx8:$I3, imm32zx4:$M4),
+             mnemonic#"\t$V1, $I2, $I3, $M4", []>;
+
+class BinaryVRIc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRIc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, imm32zx16:$I2),
+             mnemonic#"\t$V1, $V3, $I2",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
+                                                 imm32zx16:$I2)))]> {
+  let M4 = type;
+}
+
+class BinaryVRIcGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIc<opcode, (outs VR128:$V1),
+             (ins VR128:$V3, imm32zx16:$I2, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V3, $I2, $M4", []>;
+
+class BinaryVRIe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5>
+  : InstVRIe<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx12:$I3),
+             mnemonic#"\t$V1, $V2, $I3",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 imm32zx12:$I3)))]> {
+  let M4 = type;
+  let M5 = m5;
+}
+
+class BinaryVRIeFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRIe<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>;
+
+class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0>
+  : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $M5",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 imm32zx12:$M5)))]> {
+  let M3 = type;
+  let M4 = m4;
+}
+
+class BinaryVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $M3, $M4, $M5", []>;
+
+class BinaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type = 0,
+                 bits<4> modifier = 0>
+  : InstVRRb<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+             mnemonic#"\t$V1, $V2, $V3",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3))))]> {
+  let M4 = type;
+  let M5 = modifier;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+multiclass BinaryVRRbSPair<string mnemonic, bits<16> opcode,
+                           SDPatternOperator operator,
+                           SDPatternOperator operator_cc, TypedReg tr1,
+                           TypedReg tr2, bits<4> type, bits<4> modifier = 0> {
+  def "" : BinaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
+                      !and (modifier, 14)>;
+  let Defs = [CC] in
+    def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                       !add (!and (modifier, 14), 1)>;
+}
+
+class BinaryVRRbSPairGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRb<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []>;
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+// The form that does not set CC has an extra operand to optionally allow
+// specifying arbitrary M5 values in assembler.
+multiclass BinaryExtraVRRbSPair<string mnemonic, bits<16> opcode,
+                                SDPatternOperator operator,
+                                SDPatternOperator operator_cc,
+                                TypedReg tr1, TypedReg tr2, bits<4> type> {
+  let M4 = type in
+    def "" : InstVRRb<opcode, (outs tr1.op:$V1),
+                      (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M5),
+                      mnemonic#"\t$V1, $V2, $V3, $M5", []>;
+  def : Pat<(tr1.vt (operator (tr2.vt tr2.op:$V2), (tr2.vt tr2.op:$V3))),
+            (!cast<Instruction>(NAME) tr2.op:$V2, tr2.op:$V3, 0)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, 0)>;
+  let Defs = [CC] in
+    def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, 1>;
+}
+
+multiclass BinaryExtraVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
+  def "" : InstVRRb<opcode, (outs VR128:$V1),
+                   (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+                   mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $M4",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+                                            imm32zx4:$M4, 0)>;
+}
+
+class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
+                 bits<4> m6 = 0>
+  : InstVRRc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+             mnemonic#"\t$V1, $V2, $V3",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3))))]> {
+  let M4 = type;
+  let M5 = m5;
+  let M6 = m6;
+}
+
+class BinaryVRRcGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0,
+                        bits<4> m6 = 0>
+  : InstVRRc<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V2, $V3, $M4", []> {
+  let M5 = m5;
+  let M6 = m6;
+}
+
+class BinaryVRRcFloatGeneric<string mnemonic, bits<16> opcode, bits<4> m6 = 0>
+  : InstVRRc<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []> {
+  let M6 = m6;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+multiclass BinaryVRRcSPair<string mnemonic, bits<16> opcode,
+                           SDPatternOperator operator,
+                           SDPatternOperator operator_cc, TypedReg tr1,
+                           TypedReg tr2, bits<4> type, bits<4> m5,
+                           bits<4> modifier = 0> {
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type,
+                      m5, !and (modifier, 14)>;
+  let Defs = [CC] in
+    def S : BinaryVRRc<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                       m5, !add (!and (modifier, 14), 1)>;
+}
+
+class BinaryVRRcSPairFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRc<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5,
+                  imm32zx4:$M6),
+             mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>;
+
+class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr>
+  : InstVRRf<opcode, (outs tr.op:$V1), (ins GR64:$R2, GR64:$R3),
+             mnemonic#"\t$V1, $R2, $R3",
+             [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>;
+
+class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2),
+             mnemonic#"\t$V1, $V3, $BD2",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V3),
+                                                 shift12only:$BD2)))]> {
+  let M4 = type;
+}
+
+class BinaryVRSaGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSa<opcode, (outs VR128:$V1),
+             (ins VR128:$V3, shift12only:$BD2, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V3, $BD2, $M4", []>;
+
+class BinaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 bits<5> bytes>
+  : InstVRSb<opcode, (outs VR128:$V1), (ins GR32:$R3, bdaddr12only:$BD2),
+             mnemonic#"\t$V1, $R3, $BD2",
+             [(set VR128:$V1, (operator GR32:$R3, bdaddr12only:$BD2))]> {
+  let M4 = 0;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class BinaryVRSc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr, bits<4> type>
+  : InstVRSc<opcode, (outs GR64:$R1), (ins tr.op:$V3, shift12only:$BD2),
+           mnemonic#"\t$R1, $V3, $BD2",
+           [(set GR64:$R1, (operator (tr.vt tr.op:$V3), shift12only:$BD2))]> {
+  let M4 = type;
+}
+
+class BinaryVRScGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSc<opcode, (outs GR64:$R1),
+             (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4),
+             mnemonic#"\t$R1, $V3, $BD2, $M4", []>;
+
+class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                TypedReg tr, bits<5> bytes>
+  : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+            mnemonic#"\t$V1, $XBD2, $M3",
+            [(set tr.op:$V1, (tr.vt (operator bdxaddr12only:$XBD2,
+                                              imm32zx4:$M3)))]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
+                     Immediate index>
+  : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
+            mnemonic#"\t$V1, $VBD2, $M3", []> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class StoreBinaryVRX<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator, TypedReg tr, bits<5> bytes,
+                     Immediate index>
+  : InstVRX<opcode, (outs), (ins tr.op:$V1, bdxaddr12only:$XBD2, index:$M3),
+            mnemonic#"\t$V1, $XBD2, $M3",
+            [(operator (tr.vt tr.op:$V1), bdxaddr12only:$XBD2, index:$M3)]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+class MemoryBinarySSd<string mnemonic, bits<8> opcode,
+                      RegisterOperand cls>
+  : InstSSd<opcode, (outs),
+            (ins bdraddr12only:$RBD1, bdaddr12only:$BD2, cls:$R3),
+            mnemonic#"\t$RBD1, $BD2, $R3", []>;
+
+class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+           mnemonic#"\t$R1, $R2",
+           [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
+
+class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+            mnemonic#"\t$R1, $R2",
+            [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
+
+class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRIa<opcode, (outs), (ins cls:$R1, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, Immediate imm>
+  : InstRILa<opcode, (outs), (ins cls:$R1, imm:$I2),
+             mnemonic#"\t$R1, $I2",
+             [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
+                   RegisterOperand cls, SDPatternOperator load>
+  : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
+             mnemonic#"\t$R1, $RI2",
+             [(operator cls:$R1, (load pcrel32:$RI2))]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                AddressingMode mode = bdxaddr12only>
+  : InstRXa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let M3 = 0;
+}
+
+class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
+                 AddressingMode mode = bdxaddr20only>
+  : InstRXYa<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+             mnemonic#"\t$R1, $XBD2",
+             [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let isCompare = 1;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
+                         SDPatternOperator operator, RegisterOperand cls,
+                         SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
+                         load, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CompareRXY<mnemonic#"y", rxyOpcode, operator, cls,
+                          load, bytes, bdxaddr20pair>;
+  }
+}
+
+class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                SDPatternOperator load, Immediate imm,
+                AddressingMode mode = bdaddr12only>
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 SDPatternOperator load, Immediate imm>
+  : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 SDPatternOperator load, Immediate imm,
+                 AddressingMode mode = bdaddr20only>
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
+  let mayLoad = 1;
+}
+
+multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
+                         SDPatternOperator operator, SDPatternOperator load,
+                         Immediate imm> {
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CompareSIY<mnemonic#"y", siyOpcode, operator, load, imm,
+                          bdaddr20pair>;
+  }
+}
+
+class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr, bits<4> type>
+  : InstVRRa<opcode, (outs), (ins tr.op:$V1, tr.op:$V2),
+             mnemonic#"\t$V1, $V2",
+             [(operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2))]> {
+  let isCompare = 1;
+  let M3 = type;
+  let M4 = 0;
+  let M5 = 0;
+}
+
+class CompareVRRaGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode, (outs), (ins VR128:$V1, VR128:$V2, imm32zx4:$M3),
+             mnemonic#"\t$V1, $V2, $M3", []> {
+  let isCompare = 1;
+  let M4 = 0;
+  let M5 = 0;
+}
+
+class CompareVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode, (outs),
+             (ins VR64:$V1, VR64:$V2, imm32zx4:$M3, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V2, $M3, $M4", []> {
+  let isCompare = 1;
+  let M5 = 0;
+}
+
+class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+              RegisterOperand cls>
+  : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, bdxaddr12only:$XBD2)]> {
+  let M3 = 0;
+}
+
+class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
+                            RegisterOperand cls1, RegisterOperand cls2,
+                            Immediate imm>
+  : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
+             mnemonic#"\t$R1, $R2, $M3", []>;
+
+class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
+                           RegisterOperand cls>
+  : InstSSF<opcode, (outs),
+            (ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
+            mnemonic#"\t$BD1, $BD2, $R3", []>;
+
+class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                  RegisterOperand cls2>
+  : InstRRFe<opcode, (outs cls1:$R1),
+             (ins imm32zx4:$M3, cls2:$R2, imm32zx4:$M4),
+             mnemonic#"\t$R1, $M3, $R2, $M4", []>;
+
+class TernaryRRD<string mnemonic, bits<16> opcode,
+                 SDPatternOperator operator, RegisterOperand cls>
+  : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+            mnemonic#"\t$R1, $R3, $R2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+  let OpKey = mnemonic#cls;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class TernaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+                bits<5> bytes, AddressingMode mode = bdaddr12only>
+  : InstRSb<opcode, (outs cls:$R1),
+            (ins cls:$R1src, imm32zx4:$M3, mode:$BD2),
+            mnemonic#"\t$R1, $M3, $BD2", []> {
+
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+                bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : InstRSYb<opcode, (outs cls:$R1),
+             (ins cls:$R1src, imm32zx4:$M3, mode:$BD2),
+             mnemonic#"\t$R1, $M3, $BD2", []> {
+
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+                         RegisterOperand cls, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : TernaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : TernaryRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+  }
+}
+
+class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode, (outs cls:$R1),
+            (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $R3, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class TernaryVRIa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index>
+  : InstVRIa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V1src, imm:$I2, index:$M3),
+             mnemonic#"\t$V1, $I2, $M3",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                                                 imm:$I2, index:$M3)))]> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+}
+
+class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRId<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
+             mnemonic#"\t$V1, $V2, $V3, $I4",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 imm32zx8:$I4)))]> {
+  let M5 = type;
+}
+
+class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or>
+  : InstVRRa<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $M4, $M5",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 imm32zx4:$M4,
+                                                 imm32zx4:$M5)))],
+             m4or> {
+  let M3 = type;
+}
+
+class TernaryVRRaFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRa<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, imm32zx4:$M3, imm32zx4:$M4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $M3, $M4, $M5", []>;
+
+class TernaryVRRb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, bits<4> type,
+                  SDPatternOperator m5mask, bits<4> m5or>
+  : InstVRRb<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, m5mask:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $M5",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 m5mask:$M5)))],
+             m5or> {
+  let M4 = type;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M5.
+// Also create aliases to make use of M5 operand optional in assembler.
+multiclass TernaryOptVRRbSPair<string mnemonic, bits<16> opcode,
+                               SDPatternOperator operator,
+                               SDPatternOperator operator_cc,
+                               TypedReg tr1, TypedReg tr2, bits<4> type,
+                               bits<4> modifier = 0> {
+  def "" : TernaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
+                       imm32zx4even, !and (modifier, 14)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, 0)>;
+  let Defs = [CC] in
+    def S : TernaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                        imm32zx4even, !add(!and (modifier, 14), 1)>;
+  def : InstAlias<mnemonic#"s\t$V1, $V2, $V3",
+                  (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
+                                                tr2.op:$V3, 0)>;
+}
+
+multiclass TernaryOptVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
+  def "" : InstVRRb<opcode, (outs VR128:$V1),
+                   (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5),
+                   mnemonic#"\t$V1, $V2, $V3, $M4, $M5", []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $M4",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+                                            imm32zx4:$M4, 0)>;
+}
+
+class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2>
+  : InstVRRc<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M4),
+             mnemonic#"\t$V1, $V2, $V3, $M4",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 imm32zx4:$M4)))]> {
+  let M5 = 0;
+  let M6 = 0;
+}
+
+class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, bits<4> type = 0>
+  : InstVRRd<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+             mnemonic#"\t$V1, $V2, $V3, $V4",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 (tr1.vt tr1.op:$V4))))]> {
+  let M5 = type;
+  let M6 = 0;
+}
+
+class TernaryVRRdGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRd<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $V4, $M5", []> {
+  let M6 = 0;
+}
+
+class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0>
+  : InstVRRe<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+             mnemonic#"\t$V1, $V2, $V3, $V4",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 (tr1.vt tr1.op:$V4))))]> {
+  let M5 = m5;
+  let M6 = type;
+}
+
+class TernaryVRReFloatGeneric<string mnemonic, bits<16> opcode>
+  : InstVRRe<opcode, (outs VR128:$V1),
+             (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, imm32zx4:$M6),
+             mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+
+class TernaryVRSb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  TypedReg tr1, TypedReg tr2, RegisterOperand cls, bits<4> type>
+  : InstVRSb<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V1src, cls:$R3, shift12only:$BD2),
+             mnemonic#"\t$V1, $R3, $BD2",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                                                 cls:$R3,
+                                                 shift12only:$BD2)))]> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+  let M4 = type;
+}
+
+class TernaryVRSbGeneric<string mnemonic, bits<16> opcode>
+  : InstVRSb<opcode, (outs VR128:$V1),
+             (ins VR128:$V1src, GR64:$R3, shift12only:$BD2, imm32zx4:$M4),
+             mnemonic#"\t$V1, $R3, $BD2, $M4", []> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+}
+
+class TernaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
+                 Immediate index>
+  : InstVRV<opcode, (outs VR128:$V1),
+           (ins VR128:$V1src, bdvaddr12only:$VBD2, index:$M3),
+           mnemonic#"\t$V1, $VBD2, $M3", []> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class TernaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index>
+  : InstVRX<opcode, (outs tr1.op:$V1),
+           (ins tr2.op:$V1src, bdxaddr12only:$XBD2, index:$M3),
+           mnemonic#"\t$V1, $XBD2, $M3",
+           [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                                               bdxaddr12only:$XBD2,
+                                               index:$M3)))]> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
+class QuaternaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                     TypedReg tr1, TypedReg tr2, bits<4> type>
+  : InstVRId<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V1src, tr2.op:$V2, tr2.op:$V3, imm32zx8:$I4),
+             mnemonic#"\t$V1, $V2, $V3, $I4",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V1src),
+                                                 (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 imm32zx8:$I4)))]> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+  let M5 = type;
+}
+
+class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode>
+  : InstVRId<opcode, (outs VR128:$V1),
+             (ins VR128:$V1src, VR128:$V2, VR128:$V3,
+                  imm32zx8:$I4, imm32zx4:$M5),
+             mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []> {
+  let Constraints = "$V1 = $V1src";
+  let DisableEncoding = "$V1src";
+}
+
+class QuaternaryVRRd<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator, TypedReg tr1, TypedReg tr2,
+                     bits<4> type, SDPatternOperator m6mask, bits<4> m6or>
+  : InstVRRd<opcode, (outs tr1.op:$V1),
+             (ins tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, m6mask:$M6),
+             mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
+             [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+                                                 (tr2.vt tr2.op:$V3),
+                                                 (tr2.vt tr2.op:$V4),
+                                                 m6mask:$M6)))],
+             m6or> {
+  let M5 = type;
+}
+
+// Declare a pair of instructions, one which sets CC and one which doesn't.
+// The CC-setting form ends with "S" and sets the low bit of M6.
+// Also create aliases to make use of M6 operand optional in assembler.
+multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
+                                  SDPatternOperator operator,
+                                SDPatternOperator operator_cc,
+                                TypedReg tr1, TypedReg tr2, bits<4> type,
+                                bits<4> modifier = 0> {
+  def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, type,
+                          imm32zx4even, !and (modifier, 14)>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4",
+                  (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
+                                            tr2.op:$V3, tr2.op:$V4, 0)>;
+  let Defs = [CC] in
+    def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+                           imm32zx4even, !add (!and (modifier, 14), 1)>;
+  def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4",
+                  (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
+                                                tr2.op:$V3, tr2.op:$V4, 0)>;
+}
+
+multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> {
+  def "" : InstVRRd<opcode, (outs VR128:$V1),
+                   (ins VR128:$V2, VR128:$V3, VR128:$V4,
+                        imm32zx4:$M5, imm32zx4:$M6),
+                   mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+  def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
+                  (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
+                                            VR128:$V4, imm32zx4:$M5, 0)>;
+}
+
+class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode,
+                              RegisterOperand cls>
+  : InstSSe<opcode, (outs),
+            (ins cls:$R1, bdaddr12only:$BD2, cls:$R3, bdaddr12only:$BD4),
+            mnemonic#"\t$R1, $BD2, $R3, $BD4", []>;
+
+class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                  RegisterOperand cls, AddressingMode mode = bdaddr20only>
+  : InstRSYa<opcode, (outs cls:$R1), (ins cls:$R3, mode:$BD2),
+             mnemonic#"\t$R1, $R3, $BD2",
+             [(set cls:$R1, (operator mode:$BD2, cls:$R3))]> {
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
+                RegisterOperand cls, AddressingMode mode = bdaddr12only>
+  : InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls, AddressingMode mode = bdaddr20only>
+  : InstRSYa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+             mnemonic#"\t$R1, $R3, $BD2",
+             [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+                         SDPatternOperator operator, RegisterOperand cls> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
+    let DispSize = "20" in
+      def Y  : CmpSwapRSY<mnemonic#"y", rsyOpcode, operator, cls, bdaddr20pair>;
+  }
+}
+
+class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                       RegisterOperand cls2>
+  : InstRIEf<opcode, (outs cls1:$R1),
+             (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+                  imm32zx6:$I5),
+             mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+  : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
+             mnemonic##"\t$M1, $XBD2",
+             [(operator imm32zx4:$M1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+                    SDPatternOperator operator>
+  : InstRILc<opcode, (outs), (ins imm32zx4:$M1, pcrel32:$RI2),
+             mnemonic##"\t$M1, $RI2",
+             [(operator imm32zx4:$M1, pcrel32:$RI2)]> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+class BranchPreloadSMI<string mnemonic, bits<8> opcode>
+  : InstSMI<opcode, (outs),
+            (ins imm32zx4:$M1, brtarget16bpp:$RI2, bdxaddr12only:$BD3),
+            mnemonic#"\t$M1, $RI2, $BD3", []>;
+
+class BranchPreloadMII<string mnemonic, bits<8> opcode>
+  : InstMII<opcode, (outs),
+            (ins imm32zx4:$M1, brtarget12bpp:$RI2, brtarget24bpp:$RI3),
+            mnemonic#"\t$M1, $RI2, $RI3", []>;
+
+// A floating-point load-and test operation.  Create both a normal unary
+// operation and one that acts as a comparison against zero.
+// Note that the comparison against zero operation is not available if we
+// have vector support, since load-and-test instructions will partially
+// clobber the target (vector) register.
+multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
+                          RegisterOperand cls> {
+  def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
+  let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in
+    def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+//
+// Convenience instructions that get lowered to real instructions
+// by either SystemZTargetLowering::EmitInstrWithCustomInserter()
+// or SystemZInstrInfo::expandPostRAPseudo().
+//
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<0, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+// Like SideEffectBinarySIL, but expanded later.
+class SideEffectBinarySILPseudo<SDPatternOperator operator, Immediate imm>
+  : Pseudo<(outs), (ins bdaddr12only:$BD1, imm:$I2),
+           [(operator bdaddr12only:$BD1, imm:$I2)]>;
+
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins imm:$I2),
+           [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+                     RegisterOperand cls, bits<5> bytes,
+                     AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = key#"r"#cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+                    RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = key#cls1;
+  let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+           [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+                              RegisterOperand cls, Immediate imm> {
+  let NumOpsKey = key in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIEPseudo<null_frag, cls, imm>,
+              Requires<[FeatureHighWord, FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRIPseudo<operator, cls, imm>,
+               Requires<[FeatureHighWord]>;
+  }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                       SDPatternOperator load, bits<5> bytes,
+                       AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like CondBinaryRRF, but expanded after RA depending on the choice of
+// register.
+class CondBinaryRRFPseudo<RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1),
+           (ins cls1:$R1src, cls2:$R2, cond4:$valid, cond4:$M3), []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let CCMaskLast = 1;
+}
+
+// Like CondBinaryRIE, but expanded after RA depending on the choice of
+// register.
+class CondBinaryRIEPseudo<RegisterOperand cls, Immediate imm>
+  : Pseudo<(outs cls:$R1),
+           (ins cls:$R1src, imm:$I2, cond4:$valid, cond4:$M3),
+           [(set cls:$R1, (z_select_ccmask imm:$I2, cls:$R1src,
+                                           cond4:$valid, cond4:$M3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but expanded after RA depending on the choice of
+// register.
+class CondUnaryRSYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                         bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : Pseudo<(outs cls:$R1),
+           (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$R3),
+           [(set cls:$R1,
+                 (z_select_ccmask (operator mode:$BD2), cls:$R1src,
+                                  cond4:$valid, cond4:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but expanded after RA depending on the choice of
+// register.
+class CondStoreRSYPseudo<RegisterOperand cls, bits<5> bytes,
+                         AddressingMode mode = bdaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$R3), []> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     bits<5> bytes, AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let mayStore = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1),
+           (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+                imm32zx6:$I5),
+           []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
+// the value of the PSW's 2-bit condition code field.
+class SelectWrapper<RegisterOperand cls>
+  : Pseudo<(outs cls:$dst),
+           (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
+           [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
+                                            imm32zx4:$valid, imm32zx4:$cc))]> {
+  let usesCustomInserter = 1;
+  // Although the instructions used by these nodes do not in themselves
+  // change CC, the insertion requires new blocks, and CC cannot be live
+  // across them.
+  let Defs = [CC];
+  let Uses = [CC];
+}
+
+// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
+multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
+                      SDPatternOperator load, AddressingMode mode> {
+  let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
+    def "" : Pseudo<(outs),
+                    (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+                    [(store (z_select_ccmask cls:$new, (load mode:$addr),
+                                             imm32zx4:$valid, imm32zx4:$cc),
+                            mode:$addr)]>;
+    def Inv : Pseudo<(outs),
+                     (ins cls:$new, mode:$addr, imm32zx4:$valid, imm32zx4:$cc),
+                     [(store (z_select_ccmask (load mode:$addr), cls:$new,
+                                              imm32zx4:$valid, imm32zx4:$cc),
+                              mode:$addr)]>;
+  }
+}
+
+// OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation.  PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
+                       dag pat, DAGOperand operand>
+  : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2),
+           [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> {
+  let Defs = [CC];
+  let Has20BitOffset = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadBinaryReg32<SDPatternOperator operator>
+  : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>;
+class AtomicLoadBinaryImm32<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>;
+class AtomicLoadBinaryReg64<SDPatternOperator operator>
+  : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>;
+class AtomicLoadBinaryImm64<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>;
+
+// OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation.  PAT and OPERAND
+// describe the second (non-memory) operand.
+class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
+                        DAGOperand operand>
+  : Pseudo<(outs GR32:$dst),
+           (ins bdaddr20only:$ptr, operand:$src2, ADDR32:$bitshift,
+                ADDR32:$negbitshift, uimm32:$bitsize),
+           [(set GR32:$dst, (operator bdaddr20only:$ptr, pat, ADDR32:$bitshift,
+                                      ADDR32:$negbitshift, uimm32:$bitsize))]> {
+  let Defs = [CC];
+  let Has20BitOffset = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
+// Specializations of AtomicLoadWBinary.
+class AtomicLoadWBinaryReg<SDPatternOperator operator>
+  : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
+class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
+  : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+                    SDPatternOperator sequence, SDPatternOperator loop> {
+  def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+                                        bdaddr12only:$BD2),
+                   mnemonic##"\t$BDL1, $BD2", []>;
+  let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                       imm64:$length),
+                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+                                      imm64:$length)]>;
+    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                   imm64:$length, GR64:$count256),
+                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                             imm64:$length, GR64:$count256)]>;
+  }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0.  The instruction processes a CPU-determinated
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated.  Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator> {
+  def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+                   (ins GR64:$R1src, GR64:$R2src),
+                   mnemonic#"\t$R1, $R2", []> {
+    let Uses = [R0L];
+    let Constraints = "$R1 = $R1src, $R2 = $R2src";
+    let DisableEncoding = "$R1src, $R2src";
+  }
+  let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+    def Loop : Pseudo<(outs GR64:$end),
+                      (ins GR64:$start1, GR64:$start2, GR32:$char),
+                      [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+                                                 GR32:$char))]>;
+}
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<size, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+class UnaryAliasVRS<RegisterOperand cls1, RegisterOperand cls2>
+ : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>;
+
+// An alias of a UnaryVRR*, but with different register sizes.
+class UnaryAliasVRR<SDPatternOperator operator, TypedReg tr1, TypedReg tr2>
+  : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2),
+          [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>;
+
+// An alias of a UnaryVRX, but with different register sizes.
+class UnaryAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode = bdxaddr12only>
+  : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2),
+          [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>;
+
+// An alias of a StoreVRX, but with different register sizes.
+class StoreAliasVRX<SDPatternOperator operator, TypedReg tr,
+                    AddressingMode mode = bdxaddr12only>
+  : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2),
+          [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>;
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryVRRf, but with different register sizes.
+class BinaryAliasVRRf<RegisterOperand cls>
+  : Alias<6, (outs VR128:$V1), (ins cls:$R2, cls:$R3), []>;
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+  : Alias<6, (outs cls1:$R1),
+          (ins cls1:$R1src, cls2:$R2, imm32zx8:$I3, imm32zx8:$I4,
+               imm32zx6:$I5), []> {
+  let Constraints = "$R1 = $R1src";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 000000000000..3565d5f2c49c
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,1752 @@
+//===-- SystemZInstrInfo.cpp - SystemZ instruction information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "SystemZGenInstrInfo.inc"
+
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Reg should be a 32-bit GPR.  Return true if it is a high register rather
+// than a low register.
+static bool isHighReg(unsigned int Reg) {
+  if (SystemZ::GRH32BitRegClass.contains(Reg))
+    return true;
+  assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+  return false;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZSubtarget &sti)
+  : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
+    RI(), STI(sti) {
+}
+
+// MI is a 128-bit load or store.  Split it into two 64-bit loads or stores,
+// each having the opcode given by NewOpcode.
+void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
+                                 unsigned NewOpcode) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+
+  // Get two load or store instructions.  Use the original instruction for one
+  // of them (arbitrarily the second here) and create a clone for the other.
+  MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
+  MBB->insert(MI, EarlierMI);
+
+  // Set up the two 64-bit registers.
+  MachineOperand &HighRegOp = EarlierMI->getOperand(0);
+  MachineOperand &LowRegOp = MI->getOperand(0);
+  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
+
+  // The address in the first (high) instruction is already correct.
+  // Adjust the offset in the second (low) instruction.
+  MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
+  MachineOperand &LowOffsetOp = MI->getOperand(2);
+  LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
+
+  // Clear the kill flags for the base and index registers in the first
+  // instruction.
+  EarlierMI->getOperand(1).setIsKill(false);
+  EarlierMI->getOperand(3).setIsKill(false);
+
+  // Set the opcodes.
+  unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
+  unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
+  assert(HighOpcode && LowOpcode && "Both offsets should be in range");
+
+  EarlierMI->setDesc(get(HighOpcode));
+  MI->setDesc(get(LowOpcode));
+}
+
+// Split ADJDYNALLOC instruction MI.
+void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  MachineOperand &OffsetMO = MI->getOperand(2);
+
+  uint64_t Offset = (MFFrame.getMaxCallFrameSize() +
+                     SystemZMC::CallFrameSize +
+                     OffsetMO.getImm());
+  unsigned NewOpcode = getOpcodeForOffset(SystemZ::LA, Offset);
+  assert(NewOpcode && "No support for huge argument lists yet");
+  MI->setDesc(get(NewOpcode));
+  OffsetMO.setImm(Offset);
+}
+
+// MI is an RI-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.  ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand.  In those cases,
+// MI has the same kind of operand as LowOpcode, so needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                      unsigned HighOpcode,
+                                      bool ConvertHigh) const {
+  unsigned Reg = MI.getOperand(0).getReg();
+  bool IsHigh = isHighReg(Reg);
+  MI.setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+  if (IsHigh && ConvertHigh)
+    MI.getOperand(1).setImm(uint32_t(MI.getOperand(1).getImm()));
+}
+
+// MI is a three-operand RIE-style pseudo instruction.  Replace it with
+// LowOpcodeK if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                       unsigned LowOpcodeK,
+                                       unsigned HighOpcode) const {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(1).getReg();
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (!DestIsHigh && !SrcIsHigh)
+    MI.setDesc(get(LowOpcodeK));
+  else {
+    emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
+                  SystemZ::LR, 32, MI.getOperand(1).isKill());
+    MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+    MI.getOperand(1).setReg(DestReg);
+    MI.tieOperands(0, 1);
+  }
+}
+
+// MI is an RXY-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                       unsigned HighOpcode) const {
+  unsigned Reg = MI.getOperand(0).getReg();
+  unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+                                       MI.getOperand(2).getImm());
+  MI.setDesc(get(Opcode));
+}
+
+// MI is a load-on-condition pseudo instruction with a single register
+// (source or destination) operand.  Replace it with LowOpcode if the
+// register is a low GR32 and HighOpcode if the register is a high GR32.
+void SystemZInstrInfo::expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                       unsigned HighOpcode) const {
+  unsigned Reg = MI.getOperand(0).getReg();
+  unsigned Opcode = isHighReg(Reg) ? HighOpcode : LowOpcode;
+  MI.setDesc(get(Opcode));
+}
+
+// MI is a load-register-on-condition pseudo instruction.  Replace it with
+// LowOpcode if source and destination are both low GR32s and HighOpcode if
+// source and destination are both high GR32s.
+void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                        unsigned HighOpcode) const {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned SrcReg = MI.getOperand(2).getReg();
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+
+  if (!DestIsHigh && !SrcIsHigh)
+    MI.setDesc(get(LowOpcode));
+  else if (DestIsHigh && SrcIsHigh)
+    MI.setDesc(get(HighOpcode));
+
+  // If we were unable to implement the pseudo with a single instruction, we
+  // need to convert it back into a branch sequence.  This cannot be done here
+  // since the caller of expandPostRAPseudo does not handle changes to the CFG
+  // correctly.  This change is defered to the SystemZExpandPseudo pass.
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another.  Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
+                                        unsigned Size) const {
+  emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+                MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+                Size, MI.getOperand(1).isKill());
+  MI.eraseFromParent();
+}
+
+void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const unsigned Reg = MI->getOperand(0).getReg();
+
+  // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD,
+  // so they already have operand 0 set to reg.
+
+  // ear <reg>, %a0
+  MachineInstr *Ear1MI = MF.CloneMachineInstr(MI);
+  MBB->insert(MI, Ear1MI);
+  Ear1MI->setDesc(get(SystemZ::EAR));
+  MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0);
+
+  // sllg <reg>, <reg>, 32
+  MachineInstr *SllgMI = MF.CloneMachineInstr(MI);
+  MBB->insert(MI, SllgMI);
+  SllgMI->setDesc(get(SystemZ::SLLG));
+  MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32);
+
+  // ear <reg>, %a1
+  MachineInstr *Ear2MI = MF.CloneMachineInstr(MI);
+  MBB->insert(MI, Ear2MI);
+  Ear2MI->setDesc(get(SystemZ::EAR));
+  MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1);
+
+  // lg <reg>, 40(<reg>)
+  MI->setDesc(get(SystemZ::LG));
+  MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0);
+}
+
+// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg before MBBI in MBB.  Use LowLowOpcode when both DestReg and SrcReg
+// are low registers, otherwise use RISB[LH]G.  Size is the number of bits
+// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
+// KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     const DebugLoc &DL, unsigned DestReg,
+                                     unsigned SrcReg, unsigned LowLowOpcode,
+                                     unsigned Size, bool KillSrc) const {
+  unsigned Opcode;
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBHH;
+  else if (DestIsHigh && !SrcIsHigh)
+    Opcode = SystemZ::RISBHL;
+  else if (!DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBLH;
+  else {
+    BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(DestReg, RegState::Undef)
+    .addReg(SrcReg, getKillRegState(KillSrc))
+    .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
+}
+
+
+MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                       bool NewMI,
+                                                       unsigned OpIdx1,
+                                                       unsigned OpIdx2) const {
+  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+    if (NewMI)
+      return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+    return MI;
+  };
+
+  switch (MI.getOpcode()) {
+  case SystemZ::LOCRMux:
+  case SystemZ::LOCFHR:
+  case SystemZ::LOCR:
+  case SystemZ::LOCGR: {
+    auto &WorkingMI = cloneIfNew(MI);
+    // Invert condition.
+    unsigned CCValid = WorkingMI.getOperand(3).getImm();
+    unsigned CCMask = WorkingMI.getOperand(4).getImm();
+    WorkingMI.getOperand(4).setImm(CCMask ^ CCValid);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  default:
+    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  }
+}
+
+
+// If MI is a simple load or store for a frame object, return the register
+// it loads or stores and set FrameIndex to the index of the frame object.
+// Return 0 otherwise.
+//
+// Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static int isSimpleMove(const MachineInstr &MI, int &FrameIndex,
+                        unsigned Flag) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  if ((MCID.TSFlags & Flag) && MI.getOperand(1).isFI() &&
+      MI.getOperand(2).getImm() == 0 && MI.getOperand(3).getReg() == 0) {
+    FrameIndex = MI.getOperand(1).getIndex();
+    return MI.getOperand(0).getReg();
+  }
+  return 0;
+}
+
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                               int &FrameIndex) const {
+  return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXLoad);
+}
+
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                              int &FrameIndex) const {
+  return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
+}
+
+bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr &MI,
+                                       int &DestFrameIndex,
+                                       int &SrcFrameIndex) const {
+  // Check for MVC 0(Length,FI1),0(FI2)
+  const MachineFrameInfo &MFI = MI.getParent()->getParent()->getFrameInfo();
+  if (MI.getOpcode() != SystemZ::MVC || !MI.getOperand(0).isFI() ||
+      MI.getOperand(1).getImm() != 0 || !MI.getOperand(3).isFI() ||
+      MI.getOperand(4).getImm() != 0)
+    return false;
+
+  // Check that Length covers the full slots.
+  int64_t Length = MI.getOperand(2).getImm();
+  unsigned FI1 = MI.getOperand(0).getIndex();
+  unsigned FI2 = MI.getOperand(3).getIndex();
+  if (MFI.getObjectSize(FI1) != Length ||
+      MFI.getObjectSize(FI2) != Length)
+    return false;
+
+  DestFrameIndex = FI1;
+  SrcFrameIndex = FI2;
+  return true;
+}
+
+bool SystemZInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  // Most of the code and comments here are boilerplate.
+
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
+    if (!isUnpredicatedTerminator(*I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Can't handle indirect branches.
+    SystemZII::Branch Branch(getBranchInfo(*I));
+    if (!Branch.Target->isMBB())
+      return true;
+
+    // Punt on compound branches.
+    if (Branch.Type != SystemZII::BranchNormal)
+      return true;
+
+    if (Branch.CCMask == SystemZ::CCMASK_ANY) {
+      // Handle unconditional branches.
+      if (!AllowModify) {
+        TBB = Branch.Target->getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = Branch.Target->getMBB();
+      continue;
+    }
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      // FIXME: add X86-style branch swap
+      FBB = TBB;
+      TBB = Branch.Target->getMBB();
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCValid));
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
+      continue;
+    }
+
+    // Handle subsequent conditional branches.
+    assert(Cond.size() == 2 && TBB && "Should have seen a conditional branch");
+
+    // Only handle the case where all conditional branches branch to the same
+    // destination.
+    if (TBB != Branch.Target->getMBB())
+      return true;
+
+    // If the conditions are the same, we can leave them alone.
+    unsigned OldCCValid = Cond[0].getImm();
+    unsigned OldCCMask = Cond[1].getImm();
+    if (OldCCValid == Branch.CCValid && OldCCMask == Branch.CCMask)
+      continue;
+
+    // FIXME: Try combining conditions like X86 does.  Should be easy on Z!
+    return false;
+  }
+
+  return false;
+}
+
+unsigned SystemZInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                        int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  // Most of the code and comments here are boilerplate.
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isBranch())
+      break;
+    if (!getBranchInfo(*I).Target->isMBB())
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool SystemZInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid condition");
+  Cond[1].setImm(Cond[1].getImm() ^ Cond[0].getImm());
+  return false;
+}
+
+unsigned SystemZInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                        MachineBasicBlock *TBB,
+                                        MachineBasicBlock *FBB,
+                                        ArrayRef<MachineOperand> Cond,
+                                        const DebugLoc &DL,
+                                        int *BytesAdded) const {
+  // In this function we output 32-bit branches, which should always
+  // have enough range.  They can be shortened and relaxed by later code
+  // in the pipeline, if desired.
+
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "SystemZ branch conditions have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  unsigned CCValid = Cond[0].getImm();
+  unsigned CCMask = Cond[1].getImm();
+  BuildMI(&MBB, DL, get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(TBB);
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &Mask,
+                                      int &Value) const {
+  assert(MI.isCompare() && "Caller should have checked for a comparison");
+
+  if (MI.getNumExplicitOperands() == 2 && MI.getOperand(0).isReg() &&
+      MI.getOperand(1).isImm()) {
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    Value = MI.getOperand(1).getImm();
+    Mask = ~0;
+    return true;
+  }
+
+  return false;
+}
+
+// If Reg is a virtual register, return its definition, otherwise return null.
+static MachineInstr *getDef(unsigned Reg,
+                            const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+  return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, unsigned Opcode, int64_t Imm) {
+  return (MI->getOpcode() == Opcode &&
+          !MI->getOperand(2).getReg() &&
+          MI->getOperand(3).getImm() == Imm);
+}
+
+// If the destination of MI has no uses, delete it as dead.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+  if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
+    MI->eraseFromParent();
+}
+
+// Compare compares SrcReg against zero.  Check whether SrcReg contains
+// the result of an IPM sequence whose input CC survives until Compare,
+// and whether Compare is therefore redundant.  Delete it and return
+// true if so.
+static bool removeIPMBasedCompare(MachineInstr &Compare, unsigned SrcReg,
+                                  const MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo *TRI) {
+  MachineInstr *LGFR = nullptr;
+  MachineInstr *RLL = getDef(SrcReg, MRI);
+  if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+    LGFR = RLL;
+    RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+  }
+  if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
+  if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
+    return false;
+
+  MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
+  if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare,
+  if (IPM->getParent() != Compare.getParent())
+    return false;
+  MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare.getIterator();
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    if (MI.modifiesRegister(SystemZ::CC, TRI))
+      return false;
+  }
+
+  Compare.eraseFromParent();
+  if (LGFR)
+    eraseIfDead(LGFR, MRI);
+  eraseIfDead(RLL, MRI);
+  eraseIfDead(SRL, MRI);
+  eraseIfDead(IPM, MRI);
+
+  return true;
+}
+
+bool SystemZInstrInfo::optimizeCompareInstr(
+    MachineInstr &Compare, unsigned SrcReg, unsigned SrcReg2, int Mask,
+    int Value, const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+  bool IsLogical = (Compare.getDesc().TSFlags & SystemZII::IsLogical) != 0;
+  return Value == 0 && !IsLogical &&
+         removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
+}
+
+
+bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                                       ArrayRef<MachineOperand> Pred,
+                                       unsigned TrueReg, unsigned FalseReg,
+                                       int &CondCycles, int &TrueCycles,
+                                       int &FalseCycles) const {
+  // Not all subtargets have LOCR instructions.
+  if (!STI.hasLoadStoreOnCond())
+    return false;
+  if (Pred.size() != 2)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have LOCR instructions for 32 and 64 bit general purpose registers.
+  if ((STI.hasLoadStoreOnCond2() &&
+       SystemZ::GRX32BitRegClass.hasSubClassEq(RC)) ||
+      SystemZ::GR32BitRegClass.hasSubClassEq(RC) ||
+      SystemZ::GR64BitRegClass.hasSubClassEq(RC)) {
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do anything else.
+  return false;
+}
+
+void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    const DebugLoc &DL, unsigned DstReg,
+                                    ArrayRef<MachineOperand> Pred,
+                                    unsigned TrueReg,
+                                    unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+
+  assert(Pred.size() == 2 && "Invalid condition");
+  unsigned CCValid = Pred[0].getImm();
+  unsigned CCMask = Pred[1].getImm();
+
+  unsigned Opc;
+  if (SystemZ::GRX32BitRegClass.hasSubClassEq(RC)) {
+    if (STI.hasLoadStoreOnCond2())
+      Opc = SystemZ::LOCRMux;
+    else {
+      Opc = SystemZ::LOCR;
+      MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
+    }
+  } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC))
+    Opc = SystemZ::LOCGR;
+  else
+    llvm_unreachable("Invalid register class");
+
+  BuildMI(MBB, I, DL, get(Opc), DstReg)
+    .addReg(FalseReg).addReg(TrueReg)
+    .addImm(CCValid).addImm(CCMask);
+}
+
+bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+                                     unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  unsigned DefOpc = DefMI.getOpcode();
+  if (DefOpc != SystemZ::LHIMux && DefOpc != SystemZ::LHI &&
+      DefOpc != SystemZ::LGHI)
+    return false;
+  if (DefMI.getOperand(0).getReg() != Reg)
+    return false;
+  int32_t ImmVal = (int32_t)DefMI.getOperand(1).getImm();
+
+  unsigned UseOpc = UseMI.getOpcode();
+  unsigned NewUseOpc;
+  unsigned UseIdx;
+  int CommuteIdx = -1;
+  switch (UseOpc) {
+  case SystemZ::LOCRMux:
+    if (!STI.hasLoadStoreOnCond2())
+      return false;
+    NewUseOpc = SystemZ::LOCHIMux;
+    if (UseMI.getOperand(2).getReg() == Reg)
+      UseIdx = 2;
+    else if (UseMI.getOperand(1).getReg() == Reg)
+      UseIdx = 2, CommuteIdx = 1;
+    else
+      return false;
+    break;
+  case SystemZ::LOCGR:
+    if (!STI.hasLoadStoreOnCond2())
+      return false;
+    NewUseOpc = SystemZ::LOCGHI;
+    if (UseMI.getOperand(2).getReg() == Reg)
+      UseIdx = 2;
+    else if (UseMI.getOperand(1).getReg() == Reg)
+      UseIdx = 2, CommuteIdx = 1;
+    else
+      return false;
+    break;
+  default:
+    return false;
+  }
+
+  if (CommuteIdx != -1)
+    if (!commuteInstruction(UseMI, false, CommuteIdx, UseIdx))
+      return false;
+
+  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI.setDesc(get(NewUseOpc));
+  UseMI.getOperand(UseIdx).ChangeToImmediate(ImmVal);
+  if (DeleteDef)
+    DefMI.eraseFromParent();
+
+  return true;
+}
+
+bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  if (Opcode == SystemZ::Return ||
+      Opcode == SystemZ::Trap ||
+      Opcode == SystemZ::CallJG ||
+      Opcode == SystemZ::CallBR)
+    return true;
+  return false;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    BranchProbability Probability) const {
+  // Avoid using conditional returns at the end of a loop (since then
+  // we'd need to emit an unconditional branch to the beginning anyway,
+  // making the loop body longer).  This doesn't apply for low-probability
+  // loops (eg. compare-and-swap retry), so just decide based on branch
+  // probability instead of looping structure.
+  // However, since Compare and Trap instructions cost the same as a regular
+  // Compare instruction, we should allow the if conversion to convert this
+  // into a Conditional Compare regardless of the branch probability.
+  if (MBB.getLastNonDebugInstr()->getOpcode() != SystemZ::Trap &&
+      MBB.succ_empty() && Probability < BranchProbability(1, 8))
+    return false;
+  // For now only convert single instructions.
+  return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+                    MachineBasicBlock &FMBB,
+                    unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+                    BranchProbability Probability) const {
+  // For now avoid converting mutually-exclusive cases.
+  return false;
+}
+
+bool SystemZInstrInfo::
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                          BranchProbability Probability) const {
+  // For now only duplicate single instructions.
+  return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::PredicateInstruction(
+    MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+  assert(Pred.size() == 2 && "Invalid condition");
+  unsigned CCValid = Pred[0].getImm();
+  unsigned CCMask = Pred[1].getImm();
+  assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
+  unsigned Opcode = MI.getOpcode();
+  if (Opcode == SystemZ::Trap) {
+    MI.setDesc(get(SystemZ::CondTrap));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addImm(CCValid).addImm(CCMask)
+      .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+  if (Opcode == SystemZ::Return) {
+    MI.setDesc(get(SystemZ::CondReturn));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addImm(CCValid).addImm(CCMask)
+      .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+  if (Opcode == SystemZ::CallJG) {
+    MachineOperand FirstOp = MI.getOperand(0);
+    const uint32_t *RegMask = MI.getOperand(1).getRegMask();
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    MI.setDesc(get(SystemZ::CallBRCL));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addImm(CCValid).addImm(CCMask)
+      .addOperand(FirstOp)
+      .addRegMask(RegMask)
+      .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+  if (Opcode == SystemZ::CallBR) {
+    const uint32_t *RegMask = MI.getOperand(0).getRegMask();
+    MI.RemoveOperand(0);
+    MI.setDesc(get(SystemZ::CallBCR));
+    MachineInstrBuilder(*MI.getParent()->getParent(), MI)
+      .addImm(CCValid).addImm(CCMask)
+      .addRegMask(RegMask)
+      .addReg(SystemZ::CC, RegState::Implicit);
+    return true;
+  }
+  return false;
+}
+
+void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, unsigned DestReg,
+                                   unsigned SrcReg, bool KillSrc) const {
+  // Split 128-bit GPR moves into two 64-bit moves.  This handles ADDR128 too.
+  if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+    return;
+  }
+
+  if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+    emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
+    return;
+  }
+
+  // Everything else needs only one instruction.
+  unsigned Opcode;
+  if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LGR;
+  else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
+    // For z13 we prefer LDR over LER to avoid partial register dependencies.
+    Opcode = STI.hasVector() ? SystemZ::LDR32 : SystemZ::LER;
+  else if (SystemZ::FP64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LDR;
+  else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::LXR;
+  else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR32;
+  else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR64;
+  else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::VLR;
+  else if (SystemZ::AR32BitRegClass.contains(DestReg, SrcReg))
+    Opcode = SystemZ::CPYA;
+  else if (SystemZ::AR32BitRegClass.contains(DestReg) &&
+           SystemZ::GR32BitRegClass.contains(SrcReg))
+    Opcode = SystemZ::SAR;
+  else if (SystemZ::GR32BitRegClass.contains(DestReg) &&
+           SystemZ::AR32BitRegClass.contains(SrcReg))
+    Opcode = SystemZ::EAR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void SystemZInstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+  addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode))
+                        .addReg(SrcReg, getKillRegState(isKill)),
+                    FrameIdx);
+}
+
+void SystemZInstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+    int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Callers may expect a single instruction, so keep 128-bit moves
+  // together for now and lower them after register allocation.
+  unsigned LoadOpcode, StoreOpcode;
+  getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
+  addFrameReference(BuildMI(MBB, MBBI, DL, get(LoadOpcode), DestReg),
+                    FrameIdx);
+}
+
+// Return true if MI is a simple load or store with a 12-bit displacement
+// and no index.  Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  return ((MCID.TSFlags & Flag) &&
+          isUInt<12>(MI->getOperand(2).getImm()) &&
+          MI->getOperand(3).getReg() == 0);
+}
+
+namespace {
+struct LogicOp {
+  LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+  LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
+    : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
+
+  explicit operator bool() const { return RegSize; }
+
+  unsigned RegSize, ImmLSB, ImmSize;
+};
+} // end anonymous namespace
+
+static LogicOp interpretAndImmediate(unsigned Opcode) {
+  switch (Opcode) {
+  case SystemZ::NILMux: return LogicOp(32,  0, 16);
+  case SystemZ::NIHMux: return LogicOp(32, 16, 16);
+  case SystemZ::NILL64: return LogicOp(64,  0, 16);
+  case SystemZ::NILH64: return LogicOp(64, 16, 16);
+  case SystemZ::NIHL64: return LogicOp(64, 32, 16);
+  case SystemZ::NIHH64: return LogicOp(64, 48, 16);
+  case SystemZ::NIFMux: return LogicOp(32,  0, 32);
+  case SystemZ::NILF64: return LogicOp(64,  0, 32);
+  case SystemZ::NIHF64: return LogicOp(64, 32, 32);
+  default:              return LogicOp();
+  }
+}
+
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (OldMI->registerDefIsDead(SystemZ::CC)) {
+    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+    if (CCDef != nullptr)
+      CCDef->setIsDead(true);
+  }
+}
+
+// Used to return from convertToThreeAddress after replacing two-address
+// instruction OldMI with three-address instruction NewMI.
+static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
+                                                 MachineInstr *NewMI,
+                                                 LiveVariables *LV) {
+  if (LV) {
+    unsigned NumOps = OldMI->getNumOperands();
+    for (unsigned I = 1; I < NumOps; ++I) {
+      MachineOperand &Op = OldMI->getOperand(I);
+      if (Op.isReg() && Op.isKill())
+        LV->replaceKillInstruction(Op.getReg(), *OldMI, *NewMI);
+    }
+  }
+  transferDeadCC(OldMI, NewMI);
+  return NewMI;
+}
+
+MachineInstr *SystemZInstrInfo::convertToThreeAddress(
+    MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned Opcode = MI.getOpcode();
+  unsigned NumOps = MI.getNumOperands();
+
+  // Try to convert something like SLL into SLLK, if supported.
+  // We prefer to keep the two-operand form where possible both
+  // because it tends to be shorter and because some instructions
+  // have memory forms that can be used during spilling.
+  if (STI.hasDistinctOps()) {
+    MachineOperand &Dest = MI.getOperand(0);
+    MachineOperand &Src = MI.getOperand(1);
+    unsigned DestReg = Dest.getReg();
+    unsigned SrcReg = Src.getReg();
+    // AHIMux is only really a three-operand instruction when both operands
+    // are low registers.  Try to constrain both operands to be low if
+    // possible.
+    if (Opcode == SystemZ::AHIMux &&
+        TargetRegisterInfo::isVirtualRegister(DestReg) &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+        MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+        MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+      MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+      MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+    }
+    int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
+    if (ThreeOperandOpcode >= 0) {
+      // Create three address instruction without adding the implicit
+      // operands. Those will instead be copied over from the original
+      // instruction by the loop below.
+      MachineInstrBuilder MIB(
+          *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
+                                      /*NoImplicit=*/true));
+      MIB.addOperand(Dest);
+      // Keep the kill state, but drop the tied flag.
+      MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
+      // Keep the remaining operands as-is.
+      for (unsigned I = 2; I < NumOps; ++I)
+        MIB.addOperand(MI.getOperand(I));
+      MBB->insert(MI, MIB);
+      return finishConvertToThreeAddress(&MI, MIB, LV);
+    }
+  }
+
+  // Try to convert an AND into an RISBG-type instruction.
+  if (LogicOp And = interpretAndImmediate(Opcode)) {
+    uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB;
+    // AND IMMEDIATE leaves the other bits of the register unchanged.
+    Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+    unsigned Start, End;
+    if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+      unsigned NewOpcode;
+      if (And.RegSize == 64) {
+        NewOpcode = SystemZ::RISBG;
+        // Prefer RISBGN if available, since it does not clobber CC.
+        if (STI.hasMiscellaneousExtensions())
+          NewOpcode = SystemZ::RISBGN;
+      } else {
+        NewOpcode = SystemZ::RISBMux;
+        Start &= 31;
+        End &= 31;
+      }
+      MachineOperand &Dest = MI.getOperand(0);
+      MachineOperand &Src = MI.getOperand(1);
+      MachineInstrBuilder MIB =
+          BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode))
+              .addOperand(Dest)
+              .addReg(0)
+              .addReg(Src.getReg(), getKillRegState(Src.isKill()),
+                      Src.getSubReg())
+              .addImm(Start)
+              .addImm(End + 128)
+              .addImm(0);
+      return finishConvertToThreeAddress(&MI, MIB, LV);
+    }
+  }
+  return nullptr;
+}
+
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex,
+    LiveIntervals *LIS) const {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Size = MFI.getObjectSize(FrameIndex);
+  unsigned Opcode = MI.getOpcode();
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    if (LIS != nullptr && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+        isInt<8>(MI.getOperand(2).getImm()) && !MI.getOperand(3).getReg()) {
+
+      // Check CC liveness, since new instruction introduces a dead
+      // def of CC.
+      MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
+      LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
+      ++CCUnit;
+      assert (!CCUnit.isValid() && "CC only has one reg unit.");
+      SlotIndex MISlot =
+          LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
+      if (!CCLiveRange.liveAt(MISlot)) {
+        // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+        MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
+                                        MI.getDebugLoc(), get(SystemZ::AGSI))
+                                    .addFrameIndex(FrameIndex)
+                                    .addImm(0)
+                                    .addImm(MI.getOperand(2).getImm());
+        BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
+        CCLiveRange.createDeadDef(MISlot, LIS->getVNInfoAllocator());
+        return BuiltMI;
+      }
+    }
+    return nullptr;
+  }
+
+  // All other cases require a single operand.
+  if (Ops.size() != 1)
+    return nullptr;
+
+  unsigned OpNum = Ops[0];
+  assert(Size ==
+             MF.getRegInfo()
+                 .getRegClass(MI.getOperand(OpNum).getReg())
+                 ->getSize() &&
+         "Invalid size combination");
+
+  if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 &&
+      isInt<8>(MI.getOperand(2).getImm())) {
+    // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+    Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+    MachineInstr *BuiltMI =
+        BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(), get(Opcode))
+            .addFrameIndex(FrameIndex)
+            .addImm(0)
+            .addImm(MI.getOperand(2).getImm());
+    transferDeadCC(&MI, BuiltMI);
+    return BuiltMI;
+  }
+
+  if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
+    bool Op0IsGPR = (Opcode == SystemZ::LGDR);
+    bool Op1IsGPR = (Opcode == SystemZ::LDGR);
+    // If we're spilling the destination of an LDGR or LGDR, store the
+    // source register instead.
+    if (OpNum == 0) {
+      unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
+      return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+                     get(StoreOpcode))
+          .addOperand(MI.getOperand(1))
+          .addFrameIndex(FrameIndex)
+          .addImm(0)
+          .addReg(0);
+    }
+    // If we're spilling the source of an LDGR or LGDR, load the
+    // destination register instead.
+    if (OpNum == 1) {
+      unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
+      unsigned Dest = MI.getOperand(0).getReg();
+      return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+                     get(LoadOpcode), Dest)
+          .addFrameIndex(FrameIndex)
+          .addImm(0)
+          .addReg(0);
+    }
+  }
+
+  // Look for cases where the source of a simple store or the destination
+  // of a simple load is being spilled.  Try to use MVC instead.
+  //
+  // Although MVC is in practice a fast choice in these cases, it is still
+  // logically a bytewise copy.  This means that we cannot use it if the
+  // load or store is volatile.  We also wouldn't be able to use MVC if
+  // the two memories partially overlap, but that case cannot occur here,
+  // because we know that one of the memories is a full frame index.
+  //
+  // For performance reasons, we also want to avoid using MVC if the addresses
+  // might be equal.  We don't worry about that case here, because spill slot
+  // coloring happens later, and because we have special code to remove
+  // MVCs that turn out to be redundant.
+  if (OpNum == 0 && MI.hasOneMemOperand()) {
+    MachineMemOperand *MMO = *MI.memoperands_begin();
+    if (MMO->getSize() == Size && !MMO->isVolatile()) {
+      // Handle conversion of loads.
+      if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXLoad)) {
+        return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+                       get(SystemZ::MVC))
+            .addFrameIndex(FrameIndex)
+            .addImm(0)
+            .addImm(Size)
+            .addOperand(MI.getOperand(1))
+            .addImm(MI.getOperand(2).getImm())
+            .addMemOperand(MMO);
+      }
+      // Handle conversion of stores.
+      if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) {
+        return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+                       get(SystemZ::MVC))
+            .addOperand(MI.getOperand(1))
+            .addImm(MI.getOperand(2).getImm())
+            .addImm(Size)
+            .addFrameIndex(FrameIndex)
+            .addImm(0)
+            .addMemOperand(MMO);
+      }
+    }
+  }
+
+  // If the spilled operand is the final one, try to change <INSN>R
+  // into <INSN>.
+  int MemOpcode = SystemZ::getMemOpcode(Opcode);
+  if (MemOpcode >= 0) {
+    unsigned NumOps = MI.getNumExplicitOperands();
+    if (OpNum == NumOps - 1) {
+      const MCInstrDesc &MemDesc = get(MemOpcode);
+      uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
+      assert(AccessBytes != 0 && "Size of access should be known");
+      assert(AccessBytes <= Size && "Access outside the frame index");
+      uint64_t Offset = Size - AccessBytes;
+      MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+                                        MI.getDebugLoc(), get(MemOpcode));
+      for (unsigned I = 0; I < OpNum; ++I)
+        MIB.addOperand(MI.getOperand(I));
+      MIB.addFrameIndex(FrameIndex).addImm(Offset);
+      if (MemDesc.TSFlags & SystemZII::HasIndex)
+        MIB.addReg(0);
+      transferDeadCC(&MI, MIB);
+      return MIB;
+    }
+  }
+
+  return nullptr;
+}
+
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    LiveIntervals *LIS) const {
+  return nullptr;
+}
+
+bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case SystemZ::L128:
+    splitMove(MI, SystemZ::LG);
+    return true;
+
+  case SystemZ::ST128:
+    splitMove(MI, SystemZ::STG);
+    return true;
+
+  case SystemZ::LX:
+    splitMove(MI, SystemZ::LD);
+    return true;
+
+  case SystemZ::STX:
+    splitMove(MI, SystemZ::STD);
+    return true;
+
+  case SystemZ::LBMux:
+    expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+    return true;
+
+  case SystemZ::LHMux:
+    expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+    return true;
+
+  case SystemZ::LLCRMux:
+    expandZExtPseudo(MI, SystemZ::LLCR, 8);
+    return true;
+
+  case SystemZ::LLHRMux:
+    expandZExtPseudo(MI, SystemZ::LLHR, 16);
+    return true;
+
+  case SystemZ::LLCMux:
+    expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+    return true;
+
+  case SystemZ::LLHMux:
+    expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+    return true;
+
+  case SystemZ::LMux:
+    expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+    return true;
+
+  case SystemZ::LOCMux:
+    expandLOCPseudo(MI, SystemZ::LOC, SystemZ::LOCFH);
+    return true;
+
+  case SystemZ::LOCHIMux:
+    expandLOCPseudo(MI, SystemZ::LOCHI, SystemZ::LOCHHI);
+    return true;
+
+  case SystemZ::LOCRMux:
+    expandLOCRPseudo(MI, SystemZ::LOCR, SystemZ::LOCFHR);
+    return true;
+
+  case SystemZ::STCMux:
+    expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+    return true;
+
+  case SystemZ::STHMux:
+    expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+    return true;
+
+  case SystemZ::STMux:
+    expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+    return true;
+
+  case SystemZ::STOCMux:
+    expandLOCPseudo(MI, SystemZ::STOC, SystemZ::STOCFH);
+    return true;
+
+  case SystemZ::LHIMux:
+    expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true);
+    return true;
+
+  case SystemZ::IIFMux:
+    expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+    return true;
+
+  case SystemZ::IILMux:
+    expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+    return true;
+
+  case SystemZ::IIHMux:
+    expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+    return true;
+
+  case SystemZ::NIFMux:
+    expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+    return true;
+
+  case SystemZ::NILMux:
+    expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+    return true;
+
+  case SystemZ::NIHMux:
+    expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+    return true;
+
+  case SystemZ::OIFMux:
+    expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+    return true;
+
+  case SystemZ::OILMux:
+    expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+    return true;
+
+  case SystemZ::OIHMux:
+    expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+    return true;
+
+  case SystemZ::XIFMux:
+    expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+    return true;
+
+  case SystemZ::TMLMux:
+    expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+    return true;
+
+  case SystemZ::TMHMux:
+    expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+    return true;
+
+  case SystemZ::AHIMux:
+    expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::AHIMuxK:
+    expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+    return true;
+
+  case SystemZ::AFIMux:
+    expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::CHIMux:
+    expandRIPseudo(MI, SystemZ::CHI, SystemZ::CIH, false);
+    return true;
+
+  case SystemZ::CFIMux:
+    expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+    return true;
+
+  case SystemZ::CLFIMux:
+    expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+    return true;
+
+  case SystemZ::CMux:
+    expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+    return true;
+
+  case SystemZ::CLMux:
+    expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+    return true;
+
+  case SystemZ::RISBMux: {
+    bool DestIsHigh = isHighReg(MI.getOperand(0).getReg());
+    bool SrcIsHigh = isHighReg(MI.getOperand(2).getReg());
+    if (SrcIsHigh == DestIsHigh)
+      MI.setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+    else {
+      MI.setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+      MI.getOperand(5).setImm(MI.getOperand(5).getImm() ^ 32);
+    }
+    return true;
+  }
+
+  case SystemZ::ADJDYNALLOC:
+    splitAdjDynAlloc(MI);
+    return true;
+
+  case TargetOpcode::LOAD_STACK_GUARD:
+    expandLoadStackGuard(&MI);
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  if (MI.getOpcode() == TargetOpcode::INLINEASM) {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const char *AsmStr = MI.getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  return MI.getDesc().getSize();
+}
+
+SystemZII::Branch
+SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case SystemZ::BR:
+  case SystemZ::J:
+  case SystemZ::JG:
+    return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
+                             SystemZ::CCMASK_ANY, &MI.getOperand(0));
+
+  case SystemZ::BRC:
+  case SystemZ::BRCL:
+    return SystemZII::Branch(SystemZII::BranchNormal, MI.getOperand(0).getImm(),
+                             MI.getOperand(1).getImm(), &MI.getOperand(2));
+
+  case SystemZ::BRCT:
+  case SystemZ::BRCTH:
+    return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI.getOperand(2));
+
+  case SystemZ::BRCTG:
+    return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI.getOperand(2));
+
+  case SystemZ::CIJ:
+  case SystemZ::CRJ:
+    return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
+                             MI.getOperand(2).getImm(), &MI.getOperand(3));
+
+  case SystemZ::CLIJ:
+  case SystemZ::CLRJ:
+    return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
+                             MI.getOperand(2).getImm(), &MI.getOperand(3));
+
+  case SystemZ::CGIJ:
+  case SystemZ::CGRJ:
+    return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
+                             MI.getOperand(2).getImm(), &MI.getOperand(3));
+
+  case SystemZ::CLGIJ:
+  case SystemZ::CLGRJ:
+    return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
+                             MI.getOperand(2).getImm(), &MI.getOperand(3));
+
+  default:
+    llvm_unreachable("Unrecognized branch opcode");
+  }
+}
+
+void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
+                                           unsigned &LoadOpcode,
+                                           unsigned &StoreOpcode) const {
+  if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
+    LoadOpcode = SystemZ::L;
+    StoreOpcode = SystemZ::ST;
+  } else if (RC == &SystemZ::GRH32BitRegClass) {
+    LoadOpcode = SystemZ::LFH;
+    StoreOpcode = SystemZ::STFH;
+  } else if (RC == &SystemZ::GRX32BitRegClass) {
+    LoadOpcode = SystemZ::LMux;
+    StoreOpcode = SystemZ::STMux;
+  } else if (RC == &SystemZ::GR64BitRegClass ||
+             RC == &SystemZ::ADDR64BitRegClass) {
+    LoadOpcode = SystemZ::LG;
+    StoreOpcode = SystemZ::STG;
+  } else if (RC == &SystemZ::GR128BitRegClass ||
+             RC == &SystemZ::ADDR128BitRegClass) {
+    LoadOpcode = SystemZ::L128;
+    StoreOpcode = SystemZ::ST128;
+  } else if (RC == &SystemZ::FP32BitRegClass) {
+    LoadOpcode = SystemZ::LE;
+    StoreOpcode = SystemZ::STE;
+  } else if (RC == &SystemZ::FP64BitRegClass) {
+    LoadOpcode = SystemZ::LD;
+    StoreOpcode = SystemZ::STD;
+  } else if (RC == &SystemZ::FP128BitRegClass) {
+    LoadOpcode = SystemZ::LX;
+    StoreOpcode = SystemZ::STX;
+  } else if (RC == &SystemZ::VR32BitRegClass) {
+    LoadOpcode = SystemZ::VL32;
+    StoreOpcode = SystemZ::VST32;
+  } else if (RC == &SystemZ::VR64BitRegClass) {
+    LoadOpcode = SystemZ::VL64;
+    StoreOpcode = SystemZ::VST64;
+  } else if (RC == &SystemZ::VF128BitRegClass ||
+             RC == &SystemZ::VR128BitRegClass) {
+    LoadOpcode = SystemZ::VL;
+    StoreOpcode = SystemZ::VST;
+  } else
+    llvm_unreachable("Unsupported regclass to load or store");
+}
+
+unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
+                                              int64_t Offset) const {
+  const MCInstrDesc &MCID = get(Opcode);
+  int64_t Offset2 = (MCID.TSFlags & SystemZII::Is128Bit ? Offset + 8 : Offset);
+  if (isUInt<12>(Offset) && isUInt<12>(Offset2)) {
+    // Get the instruction to use for unsigned 12-bit displacements.
+    int Disp12Opcode = SystemZ::getDisp12Opcode(Opcode);
+    if (Disp12Opcode >= 0)
+      return Disp12Opcode;
+
+    // All address-related instructions can use unsigned 12-bit
+    // displacements.
+    return Opcode;
+  }
+  if (isInt<20>(Offset) && isInt<20>(Offset2)) {
+    // Get the instruction to use for signed 20-bit displacements.
+    int Disp20Opcode = SystemZ::getDisp20Opcode(Opcode);
+    if (Disp20Opcode >= 0)
+      return Disp20Opcode;
+
+    // Check whether Opcode allows signed 20-bit displacements.
+    if (MCID.TSFlags & SystemZII::Has20BitOffset)
+      return Opcode;
+  }
+  return 0;
+}
+
+unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
+  switch (Opcode) {
+  case SystemZ::L:      return SystemZ::LT;
+  case SystemZ::LY:     return SystemZ::LT;
+  case SystemZ::LG:     return SystemZ::LTG;
+  case SystemZ::LGF:    return SystemZ::LTGF;
+  case SystemZ::LR:     return SystemZ::LTR;
+  case SystemZ::LGFR:   return SystemZ::LTGFR;
+  case SystemZ::LGR:    return SystemZ::LTGR;
+  case SystemZ::LER:    return SystemZ::LTEBR;
+  case SystemZ::LDR:    return SystemZ::LTDBR;
+  case SystemZ::LXR:    return SystemZ::LTXBR;
+  case SystemZ::LCDFR:  return SystemZ::LCDBR;
+  case SystemZ::LPDFR:  return SystemZ::LPDBR;
+  case SystemZ::LNDFR:  return SystemZ::LNDBR;
+  case SystemZ::LCDFR_32:  return SystemZ::LCEBR;
+  case SystemZ::LPDFR_32:  return SystemZ::LPEBR;
+  case SystemZ::LNDFR_32:  return SystemZ::LNEBR;
+  // On zEC12 we prefer to use RISBGN.  But if there is a chance to
+  // actually use the condition code, we may turn it back into RISGB.
+  // Note that RISBG is not really a "load-and-test" instruction,
+  // but sets the same condition code values, so is OK to use here.
+  case SystemZ::RISBGN: return SystemZ::RISBG;
+  default:              return 0;
+  }
+}
+
+// Return true if Mask matches the regexp 0*1+0*, given that zero masks
+// have already been filtered out.  Store the first set bit in LSB and
+// the number of set bits in Length if so.
+static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) {
+  unsigned First = findFirstSet(Mask);
+  uint64_t Top = (Mask >> First) + 1;
+  if ((Top & -Top) == Top) {
+    LSB = First;
+    Length = findFirstSet(Top);
+    return true;
+  }
+  return false;
+}
+
+bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                                   unsigned &Start, unsigned &End) const {
+  // Reject trivial all-zero masks.
+  Mask &= allOnes(BitSize);
+  if (Mask == 0)
+    return false;
+
+  // Handle the 1+0+ or 0+1+0* cases.  Start then specifies the index of
+  // the msb and End specifies the index of the lsb.
+  unsigned LSB, Length;
+  if (isStringOfOnes(Mask, LSB, Length)) {
+    Start = 63 - (LSB + Length - 1);
+    End = 63 - LSB;
+    return true;
+  }
+
+  // Handle the wrap-around 1+0+1+ cases.  Start then specifies the msb
+  // of the low 1s and End specifies the lsb of the high 1s.
+  if (isStringOfOnes(Mask ^ allOnes(BitSize), LSB, Length)) {
+    assert(LSB > 0 && "Bottom bit must be set");
+    assert(LSB + Length < BitSize && "Top bit must be set");
+    Start = 63 - (LSB - 1);
+    End = 63 - (LSB + Length);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned SystemZInstrInfo::getFusedCompare(unsigned Opcode,
+                                           SystemZII::FusedCompareType Type,
+                                           const MachineInstr *MI) const {
+  switch (Opcode) {
+  case SystemZ::CHI:
+  case SystemZ::CGHI:
+    if (!(MI && isInt<8>(MI->getOperand(1).getImm())))
+      return 0;
+    break;
+  case SystemZ::CLFI:
+  case SystemZ::CLGFI:
+    if (!(MI && isUInt<8>(MI->getOperand(1).getImm())))
+      return 0;
+    break;
+  case SystemZ::CL:
+  case SystemZ::CLG:
+    if (!STI.hasMiscellaneousExtensions())
+      return 0;
+    if (!(MI && MI->getOperand(3).getReg() == 0))
+      return 0;
+    break;
+  }
+  switch (Type) {
+  case SystemZII::CompareAndBranch:
+    switch (Opcode) {
+    case SystemZ::CR:
+      return SystemZ::CRJ;
+    case SystemZ::CGR:
+      return SystemZ::CGRJ;
+    case SystemZ::CHI:
+      return SystemZ::CIJ;
+    case SystemZ::CGHI:
+      return SystemZ::CGIJ;
+    case SystemZ::CLR:
+      return SystemZ::CLRJ;
+    case SystemZ::CLGR:
+      return SystemZ::CLGRJ;
+    case SystemZ::CLFI:
+      return SystemZ::CLIJ;
+    case SystemZ::CLGFI:
+      return SystemZ::CLGIJ;
+    default:
+      return 0;
+    }
+  case SystemZII::CompareAndReturn:
+    switch (Opcode) {
+    case SystemZ::CR:
+      return SystemZ::CRBReturn;
+    case SystemZ::CGR:
+      return SystemZ::CGRBReturn;
+    case SystemZ::CHI:
+      return SystemZ::CIBReturn;
+    case SystemZ::CGHI:
+      return SystemZ::CGIBReturn;
+    case SystemZ::CLR:
+      return SystemZ::CLRBReturn;
+    case SystemZ::CLGR:
+      return SystemZ::CLGRBReturn;
+    case SystemZ::CLFI:
+      return SystemZ::CLIBReturn;
+    case SystemZ::CLGFI:
+      return SystemZ::CLGIBReturn;
+    default:
+      return 0;
+    }
+  case SystemZII::CompareAndSibcall:
+    switch (Opcode) {
+    case SystemZ::CR:
+      return SystemZ::CRBCall;
+    case SystemZ::CGR:
+      return SystemZ::CGRBCall;
+    case SystemZ::CHI:
+      return SystemZ::CIBCall;
+    case SystemZ::CGHI:
+      return SystemZ::CGIBCall;
+    case SystemZ::CLR:
+      return SystemZ::CLRBCall;
+    case SystemZ::CLGR:
+      return SystemZ::CLGRBCall;
+    case SystemZ::CLFI:
+      return SystemZ::CLIBCall;
+    case SystemZ::CLGFI:
+      return SystemZ::CLGIBCall;
+    default:
+      return 0;
+    }
+  case SystemZII::CompareAndTrap:
+    switch (Opcode) {
+    case SystemZ::CR:
+      return SystemZ::CRT;
+    case SystemZ::CGR:
+      return SystemZ::CGRT;
+    case SystemZ::CHI:
+      return SystemZ::CIT;
+    case SystemZ::CGHI:
+      return SystemZ::CGIT;
+    case SystemZ::CLR:
+      return SystemZ::CLRT;
+    case SystemZ::CLGR:
+      return SystemZ::CLGRT;
+    case SystemZ::CLFI:
+      return SystemZ::CLFIT;
+    case SystemZ::CLGFI:
+      return SystemZ::CLGIT;
+    case SystemZ::CL:
+      return SystemZ::CLT;
+    case SystemZ::CLG:
+      return SystemZ::CLGT;
+    default:
+      return 0;
+    }
+  }
+  return 0;
+}
+
+unsigned SystemZInstrInfo::getLoadAndTrap(unsigned Opcode) const {
+  if (!STI.hasLoadAndTrap())
+    return 0;
+  switch (Opcode) {
+  case SystemZ::L:
+  case SystemZ::LY:
+    return SystemZ::LAT;
+  case SystemZ::LG:
+    return SystemZ::LGAT;
+  case SystemZ::LFH:
+    return SystemZ::LFHAT;
+  case SystemZ::LLGF:
+    return SystemZ::LLGFAT;
+  case SystemZ::LLGT:
+    return SystemZ::LLGTAT;
+  }
+  return 0;
+}
+
+void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned Reg, uint64_t Value) const {
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned Opcode;
+  if (isInt<16>(Value))
+    Opcode = SystemZ::LGHI;
+  else if (SystemZ::isImmLL(Value))
+    Opcode = SystemZ::LLILL;
+  else if (SystemZ::isImmLH(Value)) {
+    Opcode = SystemZ::LLILH;
+    Value >>= 16;
+  } else {
+    assert(isInt<32>(Value) && "Huge values not handled yet");
+    Opcode = SystemZ::LGFI;
+  }
+  BuildMI(MBB, MBBI, DL, get(Opcode), Reg).addImm(Value);
+}
+
+bool SystemZInstrInfo::
+areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                AliasAnalysis *AA) const {
+
+  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand())
+    return false;
+
+  // If mem-operands show that the same address Value is used by both
+  // instructions, check for non-overlapping offsets and widths. Not
+  // sure if a register based analysis would be an improvement...
+
+  MachineMemOperand *MMOa = *MIa.memoperands_begin();
+  MachineMemOperand *MMOb = *MIb.memoperands_begin();
+  const Value *VALa = MMOa->getValue();
+  const Value *VALb = MMOb->getValue();
+  bool SameVal = (VALa && VALb && (VALa == VALb));
+  if (!SameVal) {
+    const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
+    const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
+    if (PSVa && PSVb && (PSVa == PSVb))
+      SameVal = true;
+  }
+  if (SameVal) {
+    int OffsetA = MMOa->getOffset(), OffsetB = MMOb->getOffset();
+    int WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
+    int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+    int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+    int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+    if (LowOffset + LowWidth <= HighOffset)
+      return true;
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 000000000000..794b193a501e
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,309 @@
+//===-- SystemZInstrInfo.h - SystemZ instruction information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SystemZGenInstrInfo.inc"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+namespace SystemZII {
+enum {
+  // See comments in SystemZInstrFormats.td.
+  SimpleBDXLoad          = (1 << 0),
+  SimpleBDXStore         = (1 << 1),
+  Has20BitOffset         = (1 << 2),
+  HasIndex               = (1 << 3),
+  Is128Bit               = (1 << 4),
+  AccessSizeMask         = (31 << 5),
+  AccessSizeShift        = 5,
+  CCValuesMask           = (15 << 10),
+  CCValuesShift          = 10,
+  CompareZeroCCMaskMask  = (15 << 14),
+  CompareZeroCCMaskShift = 14,
+  CCMaskFirst            = (1 << 18),
+  CCMaskLast             = (1 << 19),
+  IsLogical              = (1 << 20)
+};
+static inline unsigned getAccessSize(unsigned int Flags) {
+  return (Flags & AccessSizeMask) >> AccessSizeShift;
+}
+static inline unsigned getCCValues(unsigned int Flags) {
+  return (Flags & CCValuesMask) >> CCValuesShift;
+}
+static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
+  return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
+}
+
+// SystemZ MachineOperand target flags.
+enum {
+  // Masks out the bits for the access model.
+  MO_SYMBOL_MODIFIER = (3 << 0),
+
+  // @GOT (aka @GOTENT)
+  MO_GOT = (1 << 0),
+
+  // @INDNTPOFF
+  MO_INDNTPOFF = (2 << 0)
+};
+// Classifies a branch.
+enum BranchType {
+  // An instruction that branches on the current value of CC.
+  BranchNormal,
+
+  // An instruction that peforms a 32-bit signed comparison and branches
+  // on the result.
+  BranchC,
+
+  // An instruction that peforms a 32-bit unsigned comparison and branches
+  // on the result.
+  BranchCL,
+
+  // An instruction that peforms a 64-bit signed comparison and branches
+  // on the result.
+  BranchCG,
+
+  // An instruction that peforms a 64-bit unsigned comparison and branches
+  // on the result.
+  BranchCLG,
+
+  // An instruction that decrements a 32-bit register and branches if
+  // the result is nonzero.
+  BranchCT,
+
+  // An instruction that decrements a 64-bit register and branches if
+  // the result is nonzero.
+  BranchCTG
+};
+// Information about a branch instruction.
+struct Branch {
+  // The type of the branch.
+  BranchType Type;
+
+  // CCMASK_<N> is set if CC might be equal to N.
+  unsigned CCValid;
+
+  // CCMASK_<N> is set if the branch should be taken when CC == N.
+  unsigned CCMask;
+
+  // The target of the branch.
+  const MachineOperand *Target;
+
+  Branch(BranchType type, unsigned ccValid, unsigned ccMask,
+         const MachineOperand *target)
+    : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+};
+// Kinds of fused compares in compare-and-* instructions.  Together with type
+// of the converted compare, this identifies the compare-and-*
+// instruction.
+enum FusedCompareType {
+  // Relative branch - CRJ etc.
+  CompareAndBranch,
+
+  // Indirect branch, used for return - CRBReturn etc.
+  CompareAndReturn,
+
+  // Indirect branch, used for sibcall - CRBCall etc.
+  CompareAndSibcall,
+
+  // Trap
+  CompareAndTrap
+};
+} // end namespace SystemZII
+
+class SystemZSubtarget;
+class SystemZInstrInfo : public SystemZGenInstrInfo {
+  const SystemZRegisterInfo RI;
+  SystemZSubtarget &STI;
+
+  void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
+  void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
+  void expandRIPseudo(MachineInstr &MI, unsigned LowOpcode, unsigned HighOpcode,
+                      bool ConvertHigh) const;
+  void expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
+                       unsigned LowOpcodeK, unsigned HighOpcode) const;
+  void expandRXYPseudo(MachineInstr &MI, unsigned LowOpcode,
+                       unsigned HighOpcode) const;
+  void expandLOCPseudo(MachineInstr &MI, unsigned LowOpcode,
+                       unsigned HighOpcode) const;
+  void expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
+                        unsigned HighOpcode) const;
+  void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
+                        unsigned Size) const;
+  void expandLoadStackGuard(MachineInstr *MI) const;
+  void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                     unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+  virtual void anchor();
+
+protected:
+  /// Commutes the operands in the given instruction by changing the operands
+  /// order and/or changing the instruction's opcode and/or the immediate value
+  /// operand.
+  ///
+  /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+  /// to be commuted.
+  ///
+  /// Do not call this method for a non-commutable instruction or
+  /// non-commutable operands.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned CommuteOpIdx1,
+                                       unsigned CommuteOpIdx2) const override;
+
+public:
+  explicit SystemZInstrInfo(SystemZSubtarget &STI);
+
+  // Override TargetInstrInfo.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+  bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
+                       int &SrcFrameIndex) const override;
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const override;
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const override;
+  bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
+                       unsigned, unsigned, int&, int&, int&) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const DebugLoc &DL, unsigned DstReg,
+                    ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                    unsigned FalseReg) const override;
+  bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+                     MachineRegisterInfo *MRI) const override;
+  bool isPredicable(MachineInstr &MI) const override;
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           BranchProbability Probability) const override;
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+                           BranchProbability Probability) const override;
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                            BranchProbability Probability) const override;
+  bool PredicateInstruction(MachineInstr &MI,
+                            ArrayRef<MachineOperand> Pred) const override;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineInstr &MI,
+                                      LiveVariables *LV) const override;
+  MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                        ArrayRef<unsigned> Ops,
+                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        LiveIntervals *LIS = nullptr) const override;
+  MachineInstr *foldMemoryOperandImpl(
+      MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+      MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      LiveIntervals *LIS = nullptr) const override;
+  bool expandPostRAPseudo(MachineInstr &MBBI) const override;
+  bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+    override;
+
+  // Return the SystemZRegisterInfo, which this class owns.
+  const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+  // Return the size in bytes of MI.
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+  // Return true if MI is a conditional or unconditional branch.
+  // When returning true, set Cond to the mask of condition-code
+  // values on which the instruction will branch, and set Target
+  // to the operand that contains the branch target.  This target
+  // can be a register or a basic block.
+  SystemZII::Branch getBranchInfo(const MachineInstr &MI) const;
+
+  // Get the load and store opcodes for a given register class.
+  void getLoadStoreOpcodes(const TargetRegisterClass *RC,
+                           unsigned &LoadOpcode, unsigned &StoreOpcode) const;
+
+  // Opcode is the opcode of an instruction that has an address operand,
+  // and the caller wants to perform that instruction's operation on an
+  // address that has displacement Offset.  Return the opcode of a suitable
+  // instruction (which might be Opcode itself) or 0 if no such instruction
+  // exists.
+  unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
+
+  // If Opcode is a load instruction that has a LOAD AND TEST form,
+  // return the opcode for the testing form, otherwise return 0.
+  unsigned getLoadAndTest(unsigned Opcode) const;
+
+  // Return true if ROTATE AND ... SELECTED BITS can be used to select bits
+  // Mask of the R2 operand, given that only the low BitSize bits of Mask are
+  // significant.  Set Start and End to the I3 and I4 operands if so.
+  bool isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                   unsigned &Start, unsigned &End) const;
+
+  // If Opcode is a COMPARE opcode for which an associated fused COMPARE AND *
+  // operation exists, return the opcode for the latter, otherwise return 0.
+  // MI, if nonnull, is the compare instruction.
+  unsigned getFusedCompare(unsigned Opcode,
+                           SystemZII::FusedCompareType Type,
+                           const MachineInstr *MI = nullptr) const;
+
+  // If Opcode is a LOAD opcode for with an associated LOAD AND TRAP
+  // operation exists, returh the opcode for the latter, otherwise return 0.
+  unsigned getLoadAndTrap(unsigned Opcode) const;
+
+  // Emit code before MBBI in MI to move immediate value Value into
+  // physical register Reg.
+  void loadImmediate(MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator MBBI,
+                     unsigned Reg, uint64_t Value) const;
+
+  // Sometimes, it is possible for the target to tell, even without
+  // aliasing information, that two MIs access different memory
+  // addresses. This function returns true if two MIs access different
+  // memory addresses and false otherwise.
+  bool
+  areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
+                                  AliasAnalysis *AA = nullptr) const override;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 000000000000..d63525f29412
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1929 @@
+//===-- SystemZInstrInfo.td - General SystemZ instructions ----*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+let hasNoSchedulingInfo = 1 in {
+  def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+                                [(callseq_start timm:$amt)]>;
+  def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                                [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let hasSideEffects = 0 in {
+  // Takes as input the value of the stack pointer after a dynamic allocation
+  // has been made.  Sets the output to the address of the dynamically-
+  // allocated area itself, skipping the outgoing arguments.
+  //
+  // This expands to an LA or LAY instruction.  We restrict the offset
+  // to the range of LA and keep the LAY range in reserve for when
+  // the size of the outgoing arguments is added.
+  def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
+                           [(set GR64:$dst, dynalloc12only:$src)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Conditional branches.
+let isBranch = 1, isTerminator = 1, Uses = [CC] in {
+  // It's easier for LLVM to handle these branches in their raw BRC/BRCL form
+  // with the condition-code mask being the first operand.  It seems friendlier
+  // to use mnemonic forms like JE and JLH when writing out the assembly though.
+  let isCodeGenOnly = 1 in {
+    // An assembler extended mnemonic for BRC.
+    def BRC  : CondBranchRI <"j#",  0xA74, z_br_ccmask>;
+    // An assembler extended mnemonic for BRCL.  (The extension is "G"
+    // rather than "L" because "JL" is "Jump if Less".)
+    def BRCL : CondBranchRIL<"jg#", 0xC04>;
+    let isIndirectBranch = 1 in {
+      def BC  : CondBranchRX<"b#",  0x47>;
+      def BCR : CondBranchRR<"b#r", 0x07>;
+    }
+  }
+
+  // Allow using the raw forms directly from the assembler (and occasional
+  // special code generation needs) as well.
+  def BRCAsm  : AsmCondBranchRI <"brc",  0xA74>;
+  def BRCLAsm : AsmCondBranchRIL<"brcl", 0xC04>;
+  let isIndirectBranch = 1 in {
+    def BCAsm  : AsmCondBranchRX<"bc",  0x47>;
+    def BCRAsm : AsmCondBranchRR<"bcr", 0x07>;
+  }
+
+  // Define AsmParser extended mnemonics for each general condition-code mask
+  // (integer or floating-point)
+  foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+                "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+    def JAsm#V  : FixedCondBranchRI <CV<V>, "j#",  0xA74>;
+    def JGAsm#V : FixedCondBranchRIL<CV<V>, "jg#", 0xC04>;
+    let isIndirectBranch = 1 in {
+      def BAsm#V  : FixedCondBranchRX <CV<V>, "b#",  0x47>;
+      def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>;
+    }
+  }
+}
+
+// Unconditional branches.  These are in fact simply variants of the
+// conditional branches with the condition mask set to "always".
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  def J  : FixedCondBranchRI <CondAlways, "j",  0xA74, br>;
+  def JG : FixedCondBranchRIL<CondAlways, "jg", 0xC04>;
+  let isIndirectBranch = 1 in {
+    def B  : FixedCondBranchRX<CondAlways, "b",  0x47>;
+    def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>;
+  }
+}
+
+// NOPs.  These are again variants of the conditional branches,
+// with the condition mask set to "never".
+def NOP  : InstAlias<"nop\t$XBD", (BCAsm 0, bdxaddr12only:$XBD), 0>;
+def NOPR : InstAlias<"nopr\t$R", (BCRAsm 0, GR64:$R), 0>;
+
+// Fused compare-and-branch instructions.
+//
+// These instructions do not use or clobber the condition codes.
+// We nevertheless pretend that the relative compare-and-branch
+// instructions clobber CC, so that we can lower them to separate
+// comparisons and BRCLs if the branch ends up being out of range.
+let isBranch = 1, isTerminator = 1 in {
+  // As for normal branches, we handle these instructions internally in
+  // their raw CRJ-like form, but use assembly macros like CRJE when writing
+  // them out.  Using the *Pair multiclasses, we also create the raw forms.
+  let Defs = [CC] in {
+    defm CRJ   : CmpBranchRIEbPair<"crj",   0xEC76, GR32>;
+    defm CGRJ  : CmpBranchRIEbPair<"cgrj",  0xEC64, GR64>;
+    defm CIJ   : CmpBranchRIEcPair<"cij",   0xEC7E, GR32, imm32sx8>;
+    defm CGIJ  : CmpBranchRIEcPair<"cgij",  0xEC7C, GR64, imm64sx8>;
+    defm CLRJ  : CmpBranchRIEbPair<"clrj",  0xEC77, GR32>;
+    defm CLGRJ : CmpBranchRIEbPair<"clgrj", 0xEC65, GR64>;
+    defm CLIJ  : CmpBranchRIEcPair<"clij",  0xEC7F, GR32, imm32zx8>;
+    defm CLGIJ : CmpBranchRIEcPair<"clgij", 0xEC7D, GR64, imm64zx8>;
+  }
+  let isIndirectBranch = 1 in {
+    defm CRB   : CmpBranchRRSPair<"crb",   0xECF6, GR32>;
+    defm CGRB  : CmpBranchRRSPair<"cgrb",  0xECE4, GR64>;
+    defm CIB   : CmpBranchRISPair<"cib",   0xECFE, GR32, imm32sx8>;
+    defm CGIB  : CmpBranchRISPair<"cgib",  0xECFC, GR64, imm64sx8>;
+    defm CLRB  : CmpBranchRRSPair<"clrb",  0xECF7, GR32>;
+    defm CLGRB : CmpBranchRRSPair<"clgrb", 0xECE5, GR64>;
+    defm CLIB  : CmpBranchRISPair<"clib",  0xECFF, GR32, imm32zx8>;
+    defm CLGIB : CmpBranchRISPair<"clgib", 0xECFD, GR64, imm64zx8>;
+  }
+
+  // Define AsmParser mnemonics for each integer condition-code mask.
+  foreach V = [ "E", "H", "L", "HE", "LE", "LH",
+                "NE", "NH", "NL", "NHE", "NLE", "NLH" ] in {
+    let Defs = [CC] in {
+      def CRJAsm#V   : FixedCmpBranchRIEb<ICV<V>, "crj",   0xEC76, GR32>;
+      def CGRJAsm#V  : FixedCmpBranchRIEb<ICV<V>, "cgrj",  0xEC64, GR64>;
+      def CIJAsm#V   : FixedCmpBranchRIEc<ICV<V>, "cij",   0xEC7E, GR32,
+                                          imm32sx8>;
+      def CGIJAsm#V  : FixedCmpBranchRIEc<ICV<V>, "cgij",  0xEC7C, GR64,
+                                          imm64sx8>;
+      def CLRJAsm#V  : FixedCmpBranchRIEb<ICV<V>, "clrj",  0xEC77, GR32>;
+      def CLGRJAsm#V : FixedCmpBranchRIEb<ICV<V>, "clgrj", 0xEC65, GR64>;
+      def CLIJAsm#V  : FixedCmpBranchRIEc<ICV<V>, "clij",  0xEC7F, GR32,
+                                          imm32zx8>;
+      def CLGIJAsm#V : FixedCmpBranchRIEc<ICV<V>, "clgij", 0xEC7D, GR64,
+                                          imm64zx8>;
+    }
+    let isIndirectBranch = 1 in {
+      def CRBAsm#V   : FixedCmpBranchRRS<ICV<V>, "crb",   0xECF6, GR32>;
+      def CGRBAsm#V  : FixedCmpBranchRRS<ICV<V>, "cgrb",  0xECE4, GR64>;
+      def CIBAsm#V   : FixedCmpBranchRIS<ICV<V>, "cib",   0xECFE, GR32,
+                                         imm32sx8>;
+      def CGIBAsm#V  : FixedCmpBranchRIS<ICV<V>, "cgib",  0xECFC, GR64,
+                                         imm64sx8>;
+      def CLRBAsm#V  : FixedCmpBranchRRS<ICV<V>, "clrb",  0xECF7, GR32>;
+      def CLGRBAsm#V : FixedCmpBranchRRS<ICV<V>, "clgrb", 0xECE5, GR64>;
+      def CLIBAsm#V  : FixedCmpBranchRIS<ICV<V>, "clib",  0xECFF, GR32,
+                                         imm32zx8>;
+      def CLGIBAsm#V : FixedCmpBranchRIS<ICV<V>, "clgib", 0xECFD, GR64,
+                                         imm64zx8>;
+    }
+  }
+}
+
+// Decrement a register and branch if it is nonzero.  These don't clobber CC,
+// but we might need to split long relative branches into sequences that do.
+let isBranch = 1, isTerminator = 1 in {
+  let Defs = [CC] in {
+    def BRCT  : BranchUnaryRI<"brct",  0xA76, GR32>;
+    def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>;
+  }
+  // This doesn't need to clobber CC since we never need to split it.
+  def BRCTH : BranchUnaryRIL<"brcth", 0xCC6, GRH32>,
+              Requires<[FeatureHighWord]>;
+
+  def BCT   : BranchUnaryRX<"bct",  0x46,GR32>;
+  def BCTR  : BranchUnaryRR<"bctr", 0x06, GR32>;
+  def BCTG  : BranchUnaryRXY<"bctg",  0xE346, GR64>;
+  def BCTGR : BranchUnaryRRE<"bctgr", 0xB946, GR64>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  let Defs = [CC] in {
+    def BRXH  : BranchBinaryRSI<"brxh",  0x84, GR32>;
+    def BRXLE : BranchBinaryRSI<"brxle", 0x85, GR32>;
+    def BRXHG : BranchBinaryRIEe<"brxhg", 0xEC44, GR64>;
+    def BRXLG : BranchBinaryRIEe<"brxlg", 0xEC45, GR64>;
+  }
+  def BXH   : BranchBinaryRS<"bxh",  0x86, GR32>;
+  def BXLE  : BranchBinaryRS<"bxle", 0x87, GR32>;
+  def BXHG  : BranchBinaryRSY<"bxhg",  0xEB44, GR64>;
+  def BXLEG : BranchBinaryRSY<"bxleg", 0xEB45, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Unconditional trap.
+// FIXME: This trap instruction should be marked as isTerminator, but there is
+// currently a general bug that allows non-terminators to be placed between
+// terminators. Temporarily leave this unmarked until the bug is fixed.
+let isBarrier = 1, hasCtrlDep = 1 in
+  def Trap : Alias<4, (outs), (ins), [(trap)]>;
+
+// Conditional trap.
+let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in
+  def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
+
+// Fused compare-and-trap instructions.
+let isTerminator = 1, hasCtrlDep = 1 in {
+  // These patterns work the same way as for compare-and-branch.
+  defm CRT   : CmpBranchRRFcPair<"crt",   0xB972, GR32>;
+  defm CGRT  : CmpBranchRRFcPair<"cgrt",  0xB960, GR64>;
+  defm CLRT  : CmpBranchRRFcPair<"clrt",  0xB973, GR32>;
+  defm CLGRT : CmpBranchRRFcPair<"clgrt", 0xB961, GR64>;
+  defm CIT   : CmpBranchRIEaPair<"cit",   0xEC72, GR32, imm32sx16>;
+  defm CGIT  : CmpBranchRIEaPair<"cgit",  0xEC70, GR64, imm64sx16>;
+  defm CLFIT : CmpBranchRIEaPair<"clfit", 0xEC73, GR32, imm32zx16>;
+  defm CLGIT : CmpBranchRIEaPair<"clgit", 0xEC71, GR64, imm64zx16>;
+  let Predicates = [FeatureMiscellaneousExtensions] in {
+    defm CLT  : CmpBranchRSYbPair<"clt",  0xEB23, GR32>;
+    defm CLGT : CmpBranchRSYbPair<"clgt", 0xEB2B, GR64>;
+  }
+
+  foreach V = [ "E", "H", "L", "HE", "LE", "LH",
+                "NE", "NH", "NL", "NHE", "NLE", "NLH" ] in {
+    def CRTAsm#V   : FixedCmpBranchRRFc<ICV<V>, "crt",   0xB972, GR32>;
+    def CGRTAsm#V  : FixedCmpBranchRRFc<ICV<V>, "cgrt",  0xB960, GR64>;
+    def CLRTAsm#V  : FixedCmpBranchRRFc<ICV<V>, "clrt",  0xB973, GR32>;
+    def CLGRTAsm#V : FixedCmpBranchRRFc<ICV<V>, "clgrt", 0xB961, GR64>;
+    def CITAsm#V   : FixedCmpBranchRIEa<ICV<V>, "cit",   0xEC72, GR32,
+                                         imm32sx16>;
+    def CGITAsm#V  : FixedCmpBranchRIEa<ICV<V>, "cgit",  0xEC70, GR64,
+                                         imm64sx16>;
+    def CLFITAsm#V : FixedCmpBranchRIEa<ICV<V>, "clfit", 0xEC73, GR32,
+                                         imm32zx16>;
+    def CLGITAsm#V : FixedCmpBranchRIEa<ICV<V>, "clgit", 0xEC71, GR64,
+                                         imm64zx16>;
+    let Predicates = [FeatureMiscellaneousExtensions] in {
+      def CLTAsm#V  : FixedCmpBranchRSYb<ICV<V>, "clt",  0xEB23, GR32>;
+      def CLGTAsm#V : FixedCmpBranchRSYb<ICV<V>, "clgt", 0xEB2B, GR64>;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Define the general form of the call instructions for the asm parser.
+// These instructions don't hard-code %r14 as the return address register.
+let isCall = 1, Defs = [CC] in {
+  def BRAS  : CallRI <"bras", 0xA75>;
+  def BRASL : CallRIL<"brasl", 0xC05>;
+  def BAS   : CallRX <"bas", 0x4D>;
+  def BASR  : CallRR <"basr", 0x0D>;
+}
+
+// Regular calls.
+let isCall = 1, Defs = [R14D, CC] in {
+  def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+                        [(z_call pcrel32:$I2)]>;
+  def CallBASR  : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+                        [(z_call ADDR64:$R2)]>;
+}
+
+// TLS calls.  These will be lowered into a call to __tls_get_offset,
+// with an extra relocation specifying the TLS symbol.
+let isCall = 1, Defs = [R14D, CC] in {
+  def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_gdcall tglobaltlsaddr:$I2)]>;
+  def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops),
+                         [(z_tls_ldcall tglobaltlsaddr:$I2)]>;
+}
+
+// Sibling calls.  Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+                     [(z_sibcall pcrel32:$I2)]>;
+  let Uses = [R1D] in
+    def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
+}
+
+// Conditional sibling calls.
+let CCMaskFirst = 1, isCall = 1, isTerminator = 1, isReturn = 1 in {
+  def CallBRCL : Alias<6, (outs), (ins cond4:$valid, cond4:$R1,
+                                   pcrel32:$I2), []>;
+  let Uses = [R1D] in
+    def CallBCR : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+}
+
+// Fused compare and conditional sibling calls.
+let isCall = 1, isTerminator = 1, isReturn = 1, Uses = [R1D] in {
+  def CRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+  def CGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+  def CIBCall : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+  def CGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+  def CLRBCall : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+  def CLGRBCall : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+  def CLIBCall : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+  def CLGIBCall : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+  def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
+
+// A conditional return instruction (bcr <cond>, %r14).
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, CCMaskFirst = 1, Uses = [CC] in
+  def CondReturn : Alias<2, (outs), (ins cond4:$valid, cond4:$R1), []>;
+
+// Fused compare and conditional returns.
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
+  def CRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+  def CGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+  def CIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32sx8:$I2, cond4:$M3), []>;
+  def CGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64sx8:$I2, cond4:$M3), []>;
+  def CLRBReturn : Alias<6, (outs), (ins GR32:$R1, GR32:$R2, cond4:$M3), []>;
+  def CLGRBReturn : Alias<6, (outs), (ins GR64:$R1, GR64:$R2, cond4:$M3), []>;
+  def CLIBReturn : Alias<6, (outs), (ins GR32:$R1, imm32zx8:$I2, cond4:$M3), []>;
+  def CLGIBReturn : Alias<6, (outs), (ins GR64:$R1, imm64zx8:$I2, cond4:$M3), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32    : SelectWrapper<GR32>;
+def Select64    : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores if we don't have STOCFH, because the
+// low-only STOC should then always be used if possible.
+defm CondStore8Mux  : CondStores<GRX32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore32Mux : CondStores<GRX32, nonvolatile_store,
+                                 nonvolatile_load, bdxaddr20only>,
+                      Requires<[FeatureLoadStoreOnCond2]>;
+defm CondStore8     : CondStores<GR32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>;
+defm CondStore16    : CondStores<GR32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>;
+defm CondStore32    : CondStores<GR32, nonvolatile_store,
+                                 nonvolatile_load, bdxaddr20only>;
+
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+                    nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+                    nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+                    nonvolatile_anyextloadi32, bdxaddr20only>;
+defm CondStore64 : CondStores<GR64, nonvolatile_store,
+                              nonvolatile_load, bdxaddr20only>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Register moves.
+let hasSideEffects = 0 in {
+  // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+  def LRMux : UnaryRRPseudo<"lr", null_frag, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
+  def LR  : UnaryRR <"lr",  0x18,   null_frag, GR32, GR32>;
+  def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LTR  : UnaryRR <"ltr",  0x12,   null_frag, GR32, GR32>;
+  def LTGR : UnaryRRE<"ltgr", 0xB902, null_frag, GR64, GR64>;
+}
+
+// Immediate moves.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in {
+  // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
+  // deopending on the choice of register.
+  def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+               Requires<[FeatureHighWord]>;
+  def LHI  : UnaryRI<"lhi",  0xA78, bitconvert, GR32, imm32sx16>;
+  def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
+
+  // Other 16-bit immediates.
+  def LLILL : UnaryRI<"llill", 0xA5F, bitconvert, GR64, imm64ll16>;
+  def LLILH : UnaryRI<"llilh", 0xA5E, bitconvert, GR64, imm64lh16>;
+  def LLIHL : UnaryRI<"llihl", 0xA5D, bitconvert, GR64, imm64hl16>;
+  def LLIHH : UnaryRI<"llihh", 0xA5C, bitconvert, GR64, imm64hh16>;
+
+  // 32-bit immediates.
+  def LGFI  : UnaryRIL<"lgfi",  0xC01, bitconvert, GR64, imm64sx32>;
+  def LLILF : UnaryRIL<"llilf", 0xC0F, bitconvert, GR64, imm64lf32>;
+  def LLIHF : UnaryRIL<"llihf", 0xC0E, bitconvert, GR64, imm64hf32>;
+}
+
+// Register loads.
+let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+  // Expands to L, LY or LFH, depending on the choice of register.
+  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+             Requires<[FeatureHighWord]>;
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+            Requires<[FeatureHighWord]>;
+  def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def L128 : Pseudo<(outs GR128:$dst), (ins bdxaddr20only128:$src),
+                      [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
+  }
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LT  : UnaryRXY<"lt",  0xE312, load, GR32, 4>;
+  def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+}
+
+let canFoldAsLoad = 1 in {
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
+
+// Load and zero rightmost byte.
+let Predicates = [FeatureLoadAndZeroRightmostByte] in {
+  def LZRF : UnaryRXY<"lzrf", 0xE33B, null_frag, GR32, 4>;
+  def LZRG : UnaryRXY<"lzrg", 0xE32A, null_frag, GR64, 8>;
+  def : Pat<(and (i32 (load bdxaddr20only:$src)), 0xffffff00),
+            (LZRF bdxaddr20only:$src)>;
+  def : Pat<(and (i64 (load bdxaddr20only:$src)), 0xffffffffffffff00),
+            (LZRG bdxaddr20only:$src)>;
+}
+
+// Load and trap.
+let Predicates = [FeatureLoadAndTrap] in {
+  def LAT   : UnaryRXY<"lat",   0xE39F, null_frag, GR32, 4>;
+  def LFHAT : UnaryRXY<"lfhat", 0xE3C8, null_frag, GRH32, 4>;
+  def LGAT  : UnaryRXY<"lgat",  0xE385, null_frag, GR64, 8>;
+}
+
+// Register stores.
+let SimpleBDXStore = 1 in {
+  // Expands to ST, STY or STFH, depending on the choice of register.
+  def STMux : StoreRXYPseudo<store, GRX32, 4>,
+              Requires<[FeatureHighWord]>;
+  defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+  def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+             Requires<[FeatureHighWord]>;
+  def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
+
+  // These instructions are split after register allocation, so we don't
+  // want a custom inserter.
+  let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+    def ST128 : Pseudo<(outs), (ins GR128:$src, bdxaddr20only128:$dst),
+                       [(store GR128:$src, bdxaddr20only128:$dst)]>;
+  }
+}
+def STRL  : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+
+// 8-bit immediate stores to 8-bit fields.
+defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;
+
+// 16-bit immediate stores to 16-, 32- or 64-bit fields.
+def MVHHI : StoreSIL<"mvhhi", 0xE544, truncstorei16, imm32sx16trunc>;
+def MVHI  : StoreSIL<"mvhi",  0xE54C, store,         imm32sx16>;
+def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
+
+// Memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+  defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
+  // Load immediate on condition.  Matched via DAG pattern and created
+  // by the PeepholeOptimizer via FoldImmediate.
+  let hasSideEffects = 0 in {
+    // Expands to LOCHI or LOCHHI, depending on the choice of register.
+    def LOCHIMux : CondBinaryRIEPseudo<GRX32, imm32sx16>;
+    defm LOCHHI  : CondBinaryRIEPair<"lochhi", 0xEC4E, GRH32, imm32sx16>;
+    defm LOCHI   : CondBinaryRIEPair<"lochi",  0xEC42, GR32, imm32sx16>;
+    defm LOCGHI  : CondBinaryRIEPair<"locghi", 0xEC46, GR64, imm64sx16>;
+  }
+
+  // Move register on condition.  Expanded from Select* pseudos and
+  // created by early if-conversion.
+  let hasSideEffects = 0, isCommutable = 1 in {
+    // Expands to LOCR or LOCFHR or a branch-and-move sequence,
+    // depending on the choice of registers.
+    def LOCRMux : CondBinaryRRFPseudo<GRX32, GRX32>;
+    defm LOCFHR : CondBinaryRRFPair<"locfhr", 0xB9E0, GRH32, GRH32>;
+  }
+
+  // Load on condition.  Matched via DAG pattern.
+  // Expands to LOC or LOCFH, depending on the choice of register.
+  def LOCMux : CondUnaryRSYPseudo<nonvolatile_load, GRX32, 4>;
+  defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, nonvolatile_load, GRH32, 4>;
+
+  // Store on condition.  Expanded from CondStore* pseudos.
+  // Expands to STOC or STOCFH, depending on the choice of register.
+  def STOCMux : CondStoreRSYPseudo<GRX32, 4>;
+  defm STOCFH : CondStoreRSYPair<"stocfh", 0xEBE1, GRH32, 4>;
+
+  // Define AsmParser extended mnemonics for each general condition-code mask.
+  foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+                "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+    def LOCHIAsm#V  : FixedCondBinaryRIE<CV<V>, "lochi",  0xEC42, GR32,
+                                         imm32sx16>;
+    def LOCGHIAsm#V : FixedCondBinaryRIE<CV<V>, "locghi", 0xEC46, GR64,
+                                         imm64sx16>;
+    def LOCHHIAsm#V : FixedCondBinaryRIE<CV<V>, "lochhi", 0xEC4E, GRH32,
+                                         imm32sx16>;
+    def LOCFHRAsm#V : FixedCondBinaryRRF<CV<V>, "locfhr", 0xB9E0, GRH32, GRH32>;
+    def LOCFHAsm#V  : FixedCondUnaryRSY<CV<V>, "locfh",  0xEBE0, GRH32, 4>;
+    def STOCFHAsm#V : FixedCondStoreRSY<CV<V>, "stocfh", 0xEBE1, GRH32, 4>;
+  }
+}
+
+let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
+  // Move register on condition.  Expanded from Select* pseudos and
+  // created by early if-conversion.
+  let hasSideEffects = 0, isCommutable = 1 in {
+    defm LOCR  : CondBinaryRRFPair<"locr",  0xB9F2, GR32, GR32>;
+    defm LOCGR : CondBinaryRRFPair<"locgr", 0xB9E2, GR64, GR64>;
+  }
+
+  // Load on condition.  Matched via DAG pattern.
+  defm LOC  : CondUnaryRSYPair<"loc",  0xEBF2, nonvolatile_load, GR32, 4>;
+  defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, nonvolatile_load, GR64, 8>;
+
+  // Store on condition.  Expanded from CondStore* pseudos.
+  defm STOC  : CondStoreRSYPair<"stoc",  0xEBF3, GR32, 4>;
+  defm STOCG : CondStoreRSYPair<"stocg", 0xEBE3, GR64, 8>;
+
+  // Define AsmParser extended mnemonics for each general condition-code mask.
+  foreach V = [ "E", "NE", "H", "NH", "L", "NL", "HE", "NHE", "LE", "NLE",
+                "Z", "NZ", "P", "NP", "M", "NM", "LH", "NLH", "O", "NO" ] in {
+    def LOCRAsm#V   : FixedCondBinaryRRF<CV<V>, "locr",  0xB9F2, GR32, GR32>;
+    def LOCGRAsm#V  : FixedCondBinaryRRF<CV<V>, "locgr", 0xB9E2, GR64, GR64>;
+    def LOCAsm#V    : FixedCondUnaryRSY<CV<V>, "loc",   0xEBF2, GR32, 4>;
+    def LOCGAsm#V   : FixedCondUnaryRSY<CV<V>, "locg",  0xEBE2, GR64, 8>;
+    def STOCAsm#V   : FixedCondStoreRSY<CV<V>, "stoc",  0xEBF3, GR32, 4>;
+    def STOCGAsm#V  : FixedCondStoreRSY<CV<V>, "stocg", 0xEBE3, GR64, 8>;
+  }
+}
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions mean that we will prefer
+// them for anyextload*.  There's not really much to choose between the two
+// either way, but signed-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let hasSideEffects = 0 in {
+  def LBR : UnaryRRE<"lbr", 0xB926, sext8,  GR32, GR32>;
+  def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let hasSideEffects = 0 in {
+  def LGBR : UnaryRRE<"lgbr", 0xB906, sext8,  GR64, GR64>;
+  def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
+  def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGFR : UnaryRRE<"ltgfr", 0xB912, null_frag, GR64, GR32>;
+
+// Match 32-to-64-bit sign extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(sext_inreg GR64:$src, i32),
+          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+            Requires<[FeatureHighWord]>;
+def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+          Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+            Requires<[FeatureHighWord]>;
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+            Requires<[FeatureHighWord]>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+// 32-bit extensions from registers.
+let hasSideEffects = 0 in {
+  // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+  def LLCRMux : UnaryRRPseudo<"llcr", zext8, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLCR    : UnaryRRE<"llcr", 0xB994, zext8,  GR32, GR32>;
+  // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+  def LLHRMux : UnaryRRPseudo<"llhr", zext16, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLHR    : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
+}
+
+// 64-bit extensions from registers.
+let hasSideEffects = 0 in {
+  def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8,  GR64, GR64>;
+  def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
+  def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
+}
+
+// Match 32-to-64-bit zero extensions in which the source is already
+// in a 64-bit register.
+def : Pat<(and GR64:$src, 0xffffffff),
+          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+             Requires<[FeatureHighWord]>;
+def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
+            Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+
+// 64-bit extensions from memory.
+def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+
+// 31-to-64-bit zero extensions.
+def LLGTR : UnaryRRE<"llgtr", 0xB917, null_frag, GR64, GR64>;
+def LLGT  : UnaryRXY<"llgt",  0xE317, null_frag, GR64, 4>;
+def : Pat<(and GR64:$src, 0x7fffffff),
+          (LLGTR GR64:$src)>;
+def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
+          (LLGT bdxaddr20only:$src)>;
+
+// Load and zero rightmost byte.
+let Predicates = [FeatureLoadAndZeroRightmostByte] in {
+  def LLZRGF : UnaryRXY<"llzrgf", 0xE33A, null_frag, GR64, 4>;
+  def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0xffffff00),
+            (LLZRGF bdxaddr20only:$src)>;
+}
+
+// Load and trap.
+let Predicates = [FeatureLoadAndTrap] in {
+  def LLGFAT : UnaryRXY<"llgfat", 0xE39D, null_frag, GR64, 4>;
+  def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+// Truncations of 64-bit registers to 32-bit registers.
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
+
+// Truncations of 32-bit registers to 8-bit memory.  STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory.  STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+           Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
+
+// Truncations of 64-bit registers to memory.
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def  : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def  : StoreGR64PC<STRL, aligned_truncstorei32>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Multi-register loads.
+defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
+def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
+def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+
+// Multi-register stores.
+defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
+def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
+def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+// Byte-swapping register moves.
+let hasSideEffects = 0 in {
+  def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
+  def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
+}
+
+// Byte-swapping loads.  Unlike normal loads, these instructions are
+// allowed to access storage more than once.
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
+def LRV  : UnaryRXY<"lrv",  0xE31E, z_lrv,  GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
+
+// Likewise byte-swapping stores.
+def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
+def STRV  : StoreRXY<"strv",  0xE33E, z_strv,  GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+// Load BDX-style addresses.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isReMaterializable = 1 in
+  defm LA : LoadAddressRXPair<"la", 0x41, 0xE371, bitconvert>;
+
+// Load a PC-relative address.  There's no version of this instruction
+// with a 16-bit offset, so there's no relaxation.
+let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in
+  def LARL : LoadAddressRIL<"larl", 0xC00, bitconvert>;
+
+// Load the Global Offset Table address.  This will be lowered into a
+//     larl $R1, _GLOBAL_OFFSET_TABLE_
+// instruction.
+def GOT : Alias<6, (outs GR64:$R1), (ins),
+                [(set GR64:$R1, (global_offset_table))]>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LPR  : UnaryRR <"lpr",  0x10,   z_iabs, GR32, GR32>;
+    def LPGR : UnaryRRE<"lpgr", 0xB900, z_iabs, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LPGFR : UnaryRRE<"lpgfr", 0xB910, null_frag, GR64, GR32>;
+}
+def : Pat<(z_iabs32 GR32:$src), (LPR  GR32:$src)>;
+def : Pat<(z_iabs64 GR64:$src), (LPGR GR64:$src)>;
+defm : SXU<z_iabs,   LPGFR>;
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LNR  : UnaryRR <"lnr",  0x11,   z_inegabs, GR32, GR32>;
+    def LNGR : UnaryRRE<"lngr", 0xB901, z_inegabs, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LNGFR : UnaryRRE<"lngfr", 0xB911, null_frag, GR64, GR32>;
+}
+def : Pat<(z_inegabs32 GR32:$src), (LNR  GR32:$src)>;
+def : Pat<(z_inegabs64 GR64:$src), (LNGR GR64:$src)>;
+defm : SXU<z_inegabs,   LNGFR>;
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LCR  : UnaryRR <"lcr",  0x13,   ineg, GR32, GR32>;
+    def LCGR : UnaryRRE<"lcgr", 0xB903, ineg, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LCGFR : UnaryRRE<"lcgfr", 0xB913, null_frag, GR64, GR32>;
+}
+defm : SXU<ineg, LCGFR>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+
+defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+
+defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+
+let Defs = [CC] in {
+  defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
+  def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
+}
+
+// Insertions of a 16-bit immediate, leaving other bits unaffected.
+// We don't have or_as_insert equivalents of these operations because
+// OI is available instead.
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+             Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+             Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
+
+// ...likewise for 32-bit immediates.  For GR32s this is a general
+// full-width move.  (We use IILF rather than something like LLILF
+// for 32-bit moves because IILF leaves the upper 32 bits of the
+// GR64 unchanged.)
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+  def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+               Requires<[FeatureHighWord]>;
+  def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+  def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
+}
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
+
+// An alternative model of inserthf, with the first operand being
+// a zero-extended value.
+def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
+          (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+                  imm64hf32:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+// Plain addition.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+  // Addition of a register.
+  let isCommutable = 1 in {
+    defm AR : BinaryRRAndK<"ar", 0x1A, 0xB9F8, add, GR32, GR32>;
+    defm AGR : BinaryRREAndK<"agr", 0xB908, 0xB9E8, add, GR64, GR64>;
+  }
+  def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+
+  // Addition of signed 16-bit immediates.
+  defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
+  defm AHI  : BinaryRIAndK<"ahi",  0xA7A, 0xECD8, add, GR32, imm32sx16>;
+  defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
+
+  // Addition of signed 32-bit immediates.
+  def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def AFI  : BinaryRIL<"afi",  0xC29, add, GR32, simm32>;
+  def AIH  : BinaryRIL<"aih",  0xCC8, add, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
+
+  // Addition of memory.
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
+  defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, add, GR32, load, 4>;
+  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
+  def  AG  : BinaryRXY<"ag",  0xE308, add, GR64, load, 8>;
+
+  // Addition to memory.
+  def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
+  def AGSI : BinarySIY<"agsi", 0xEB7A, add, imm64sx8>;
+}
+defm : SXB<add, GR64, AGFR>;
+
+// Addition producing a carry.
+let Defs = [CC] in {
+  // Addition of a register.
+  let isCommutable = 1 in {
+    defm ALR : BinaryRRAndK<"alr", 0x1E, 0xB9FA, addc, GR32, GR32>;
+    defm ALGR : BinaryRREAndK<"algr", 0xB90A, 0xB9EA, addc, GR64, GR64>;
+  }
+  def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+
+  // Addition of signed 16-bit immediates.
+  def ALHSIK  : BinaryRIE<"alhsik",  0xECDA, addc, GR32, imm32sx16>,
+                Requires<[FeatureDistinctOps]>;
+  def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+                Requires<[FeatureDistinctOps]>;
+
+  // Addition of unsigned 32-bit immediates.
+  def ALFI  : BinaryRIL<"alfi",  0xC2B, addc, GR32, uimm32>;
+  def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+
+  // Addition of memory.
+  defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
+  def  ALG  : BinaryRXY<"alg",  0xE30A, addc, GR64, load, 8>;
+}
+defm : ZXB<addc, GR64, ALGFR>;
+
+// Addition producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+  // Addition of a register.
+  def ALCR  : BinaryRRE<"alcr",  0xB998, adde, GR32, GR32>;
+  def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+
+  // Addition of memory.
+  def ALC  : BinaryRXY<"alc",  0xE398, adde, GR32, load, 4>;
+  def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+// Plain subtraction.  Although immediate forms exist, we use the
+// add-immediate instruction instead.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+  // Subtraction of a register.
+  defm SR : BinaryRRAndK<"sr", 0x1B, 0xB9F9, sub, GR32, GR32>;
+  def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
+  defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+
+  // Subtraction of memory.
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
+  defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
+  def  SG  : BinaryRXY<"sg",  0xE309, sub, GR64, load, 8>;
+}
+defm : SXB<sub, GR64, SGFR>;
+
+// Subtraction producing a carry.
+let Defs = [CC] in {
+  // Subtraction of a register.
+  defm SLR : BinaryRRAndK<"slr", 0x1F, 0xB9FB, subc, GR32, GR32>;
+  def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
+  defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+
+  // Subtraction of unsigned 32-bit immediates.  These don't match
+  // subc because we prefer addc for constants.
+  def SLFI  : BinaryRIL<"slfi",  0xC25, null_frag, GR32, uimm32>;
+  def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
+
+  // Subtraction of memory.
+  defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
+  def  SLG  : BinaryRXY<"slg",  0xE30B, subc, GR64, load, 8>;
+}
+defm : ZXB<subc, GR64, SLGFR>;
+
+// Subtraction producing and using a carry.
+let Defs = [CC], Uses = [CC] in {
+  // Subtraction of a register.
+  def SLBR  : BinaryRRE<"slbr",  0xB999, sube, GR32, GR32>;
+  def SLBGR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+
+  // Subtraction of memory.
+  def SLB  : BinaryRXY<"slb",  0xE399, sube, GR32, load, 4>;
+  def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // ANDs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm NR : BinaryRRAndK<"nr", 0x14, 0xB9F4, and, GR32, GR32>;
+    defm NGR : BinaryRREAndK<"ngr", 0xB980, 0xB9E4, and, GR64, GR64>;
+  }
+
+  let isConvertibleToThreeAddress = 1 in {
+    // ANDs of a 16-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 16-bit field, not the full register.
+    //
+    // NIxMux expands to NI[LH]x, depending on the choice of register.
+    def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+                 Requires<[FeatureHighWord]>;
+    def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+                 Requires<[FeatureHighWord]>;
+    def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+    def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+    def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+    def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+    def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+    def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+    def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+    def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
+
+    // ANDs of a 32-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 32-bit field, which means we can
+    // use it as a zero indicator for i32 operations but not otherwise.
+    let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+      // Expands to NILF or NIHF, depending on the choice of register.
+      def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+                   Requires<[FeatureHighWord]>;
+      def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+      def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+    }
+    def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+    def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
+  }
+
+  // ANDs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm N  : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
+    def  NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
+  }
+
+  // AND to memory
+  defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, imm32zx8>;
+
+  // Block AND.
+  let mayLoad = 1, mayStore = 1 in
+    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+}
+defm : RMWIByte<and, bdaddr12pair, NI>;
+defm : RMWIByte<and, bdaddr20pair, NIY>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // ORs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm OR : BinaryRRAndK<"or", 0x16, 0xB9F6, or, GR32, GR32>;
+    defm OGR : BinaryRREAndK<"ogr", 0xB981, 0xB9E6, or, GR64, GR64>;
+  }
+
+  // ORs of a 16-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 16-bit field, not the full register.
+  //
+  // OIxMux expands to OI[LH]x, depending on the choice of register.
+  def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+  def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+  def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+  def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+  def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+  def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+  def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+  def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
+
+  // ORs of a 32-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to OILF or OIHF, depending on the choice of register.
+    def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+    def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+  }
+  def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+  def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
+
+  // ORs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm O  : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
+    def  OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+  }
+
+  // OR to memory
+  defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, imm32zx8>;
+
+  // Block OR.
+  let mayLoad = 1, mayStore = 1 in
+    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+}
+defm : RMWIByte<or, bdaddr12pair, OI>;
+defm : RMWIByte<or, bdaddr20pair, OIY>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  // XORs of a register.
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm XR : BinaryRRAndK<"xr", 0x17, 0xB9F7, xor, GR32, GR32>;
+    defm XGR : BinaryRREAndK<"xgr", 0xB982, 0xB9E7, xor, GR64, GR64>;
+  }
+
+  // XORs of a 32-bit immediate, leaving other bits unaffected.
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to XILF or XIHF, depending on the choice of register.
+    def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+    def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+  }
+  def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+  def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
+
+  // XORs of memory.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm X  : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
+    def  XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+  }
+
+  // XOR to memory
+  defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, imm32zx8>;
+
+  // Block XOR.
+  let mayLoad = 1, mayStore = 1 in
+    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+}
+defm : RMWIByte<xor, bdaddr12pair, XI>;
+defm : RMWIByte<xor, bdaddr20pair, XIY>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+// Multiplication of a register.
+let isCommutable = 1 in {
+  def MSR  : BinaryRRE<"msr",  0xB252, mul, GR32, GR32>;
+  def MSGR : BinaryRRE<"msgr", 0xB90C, mul, GR64, GR64>;
+}
+def MSGFR : BinaryRRE<"msgfr", 0xB91C, null_frag, GR64, GR32>;
+defm : SXB<mul, GR64, MSGFR>;
+
+// Multiplication of a signed 16-bit immediate.
+def MHI  : BinaryRI<"mhi",  0xA7C, mul, GR32, imm32sx16>;
+def MGHI : BinaryRI<"mghi", 0xA7D, mul, GR64, imm64sx16>;
+
+// Multiplication of a signed 32-bit immediate.
+def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
+def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
+
+// Multiplication of memory.
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
+defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
+def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
+
+// Multiplication of a register, producing two results.
+def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
+
+// Multiplication of memory, producing two results.
+def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1 in {  // Do not speculatively execute.
+  // Division and remainder, from registers.
+  def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
+  def DSGR  : BinaryRRE<"dsgr",  0xB90D, z_sdivrem64, GR128, GR64>;
+  def DLR   : BinaryRRE<"dlr",   0xB997, z_udivrem32, GR128, GR32>;
+  def DLGR  : BinaryRRE<"dlgr",  0xB987, z_udivrem64, GR128, GR64>;
+
+  // Division and remainder, from memory.
+  def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
+  def DSG  : BinaryRXY<"dsg",  0xE30D, z_sdivrem64, GR128, load, 8>;
+  def DL   : BinaryRXY<"dl",   0xE397, z_udivrem32, GR128, load, 4>;
+  def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+// Shift left.
+let hasSideEffects = 0 in {
+  defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+  defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+  def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+}
+
+// Logical shift right.
+let hasSideEffects = 0 in {
+  defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+  def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+}
+
+// Arithmetic shift right.
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
+  def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+}
+
+// Rotate left.
+let hasSideEffects = 0 in {
+  def RLL  : BinaryRSY<"rll",  0xEB1D, rotl, GR32>;
+  def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
+}
+
+// Rotate second operand left and inserted selected bits into first operand.
+// These can act like 32-bit operands provided that the constant start and
+// end bits (operands 2 and 3) are in the range [32, 64).
+let Defs = [CC] in {
+  let isCodeGenOnly = 1 in
+    def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>;
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>;
+}
+
+// On zEC12 we have a variant of RISBG that does not set CC.
+let Predicates = [FeatureMiscellaneousExtensions] in
+  def RISBGN : RotateSelectRIEf<"risbgn", 0xEC59, GR64, GR64>;
+
+// Forms of RISBG that only affect one word of the destination register.
+// They do not set CC.
+let Predicates = [FeatureHighWord] in {
+  def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>;
+  def RISBLL  : RotateSelectAliasRIEf<GR32,  GR32>;
+  def RISBLH  : RotateSelectAliasRIEf<GR32,  GRH32>;
+  def RISBHL  : RotateSelectAliasRIEf<GRH32, GR32>;
+  def RISBHH  : RotateSelectAliasRIEf<GRH32, GRH32>;
+  def RISBLG  : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>;
+  def RISBHG  : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>;
+}
+
+// Rotate second operand left and perform a logical operation with selected
+// bits of the first operand.  The CC result only describes the selected bits,
+// so isn't useful for a full comparison against zero.
+let Defs = [CC] in {
+  def RNSBG : RotateSelectRIEf<"rnsbg", 0xEC54, GR64, GR64>;
+  def ROSBG : RotateSelectRIEf<"rosbg", 0xEC56, GR64, GR64>;
+  def RXSBG : RotateSelectRIEf<"rxsbg", 0xEC57, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+// Signed comparisons.  We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
+let Defs = [CC], CCValues = 0xE in {
+  // Comparison with a register.
+  def CR   : CompareRR <"cr",   0x19,   z_scmp,    GR32, GR32>;
+  def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
+  def CGR  : CompareRRE<"cgr",  0xB920, z_scmp,    GR64, GR64>;
+
+  // Comparison with a signed 16-bit immediate.  CHIMux expands to CHI or CIH,
+  // depending on the choice of register.
+  def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>,
+               Requires<[FeatureHighWord]>;
+  def CHI  : CompareRI<"chi",  0xA7E, z_scmp, GR32, imm32sx16>;
+  def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+  // Comparison with a signed 32-bit immediate.  CFIMux expands to CFI or CIH,
+  // depending on the choice of register.
+  def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def CFI  : CompareRIL<"cfi",  0xC2D, z_scmp, GR32, simm32>;
+  def CIH  : CompareRIL<"cih",  0xCCD, z_scmp, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
+
+  // Comparison with memory.
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
+
+  // Comparison between memory and a signed 16-bit immediate.
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+}
+defm : SXB<z_scmp, GR64, CGFR>;
+
+// Unsigned comparisons.
+let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
+  // Comparison with a register.
+  def CLR   : CompareRR <"clr",   0x15,   z_ucmp,    GR32, GR32>;
+  def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
+  def CLGR  : CompareRRE<"clgr",  0xB921, z_ucmp,    GR64, GR64>;
+
+  // Comparison with an unsigned 32-bit immediate.  CLFIMux expands to CLFI
+  // or CLIH, depending on the choice of register.
+  def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+                Requires<[FeatureHighWord]>;
+  def CLFI  : CompareRIL<"clfi",  0xC2F, z_ucmp, GR32, uimm32>;
+  def CLIH  : CompareRIL<"clih",  0xCCF, z_ucmp, GRH32, uimm32>,
+              Requires<[FeatureHighWord]>;
+  def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
+
+  // Comparison with memory.
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
+  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
+  def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
+                             aligned_azextloadi16>;
+  def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
+                             aligned_load>;
+  def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
+                             aligned_azextloadi16>;
+  def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
+                             aligned_azextloadi32>;
+  def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
+                             aligned_load>;
+
+  // Comparison between memory and an unsigned 8-bit immediate.
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+
+  // Comparison between memory and an unsigned 16-bit immediate.
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+}
+defm : ZXB<z_ucmp, GR64, CLGFR>;
+
+// Memory-to-memory comparison.
+let mayLoad = 1, Defs = [CC] in
+  defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC] in
+  defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+let Defs = [CC] in {
+  // TMxMux expands to TM[LH]x, depending on the choice of register.
+  def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+  def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+  def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+  def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+  def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+  def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+  def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+  def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
+def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
+def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
+let Predicates = [FeatureExecutionHint] in {
+  // Branch Prediction Preload
+  def BPP : BranchPreloadSMI<"bpp", 0xC7>;
+  def BPRP : BranchPreloadMII<"bprp", 0xC5>;
+
+  // Next Instruction Access Intent
+  def NIAI : SideEffectBinaryIE<"niai", 0xB2FA, imm32zx4, imm32zx4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+// A serialization instruction that acts as a barrier for all memory
+// accesses, which expands to "bcr 14, 0".
+let hasSideEffects = 1 in
+def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+
+// A pseudo instruction that serves as a compiler barrier.
+let hasSideEffects = 1, hasNoSchedulingInfo = 1 in
+def MemBarrier : Pseudo<(outs), (ins), [(z_membarrier)]>;
+
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+  def LAA   : LoadAndOpRSY<"laa",   0xEBF8, atomic_load_add_32, GR32>;
+  def LAAG  : LoadAndOpRSY<"laag",  0xEBE8, atomic_load_add_64, GR64>;
+  def LAAL  : LoadAndOpRSY<"laal",  0xEBFA, null_frag, GR32>;
+  def LAALG : LoadAndOpRSY<"laalg", 0xEBEA, null_frag, GR64>;
+  def LAN   : LoadAndOpRSY<"lan",   0xEBF4, atomic_load_and_32, GR32>;
+  def LANG  : LoadAndOpRSY<"lang",  0xEBE4, atomic_load_and_64, GR64>;
+  def LAO   : LoadAndOpRSY<"lao",   0xEBF6, atomic_load_or_32, GR32>;
+  def LAOG  : LoadAndOpRSY<"laog",  0xEBE6, atomic_load_or_64, GR64>;
+  def LAX   : LoadAndOpRSY<"lax",   0xEBF7, atomic_load_xor_32, GR32>;
+  def LAXG  : LoadAndOpRSY<"laxg",  0xEBE7, atomic_load_xor_64, GR64>;
+}
+
+def ATOMIC_SWAPW   : AtomicLoadWBinaryReg<z_atomic_swapw>;
+def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32<atomic_swap_32>;
+def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64<atomic_swap_64>;
+
+def ATOMIC_LOADW_AR  : AtomicLoadWBinaryReg<z_atomic_loadw_add>;
+def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm<z_atomic_loadw_add, simm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_AR   : AtomicLoadBinaryReg32<atomic_load_add_32>;
+  def ATOMIC_LOAD_AHI  : AtomicLoadBinaryImm32<atomic_load_add_32, imm32sx16>;
+  def ATOMIC_LOAD_AFI  : AtomicLoadBinaryImm32<atomic_load_add_32, simm32>;
+  def ATOMIC_LOAD_AGR  : AtomicLoadBinaryReg64<atomic_load_add_64>;
+  def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx16>;
+  def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64<atomic_load_add_64, imm64sx32>;
+}
+
+def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg<z_atomic_loadw_sub>;
+def ATOMIC_LOAD_SR  : AtomicLoadBinaryReg32<atomic_load_sub_32>;
+def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
+
+def ATOMIC_LOADW_NR   : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
+def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_NR     : AtomicLoadBinaryReg32<atomic_load_and_32>;
+  def ATOMIC_LOAD_NILL   : AtomicLoadBinaryImm32<atomic_load_and_32,
+                                                 imm32ll16c>;
+  def ATOMIC_LOAD_NILH   : AtomicLoadBinaryImm32<atomic_load_and_32,
+                                                 imm32lh16c>;
+  def ATOMIC_LOAD_NILF   : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+  def ATOMIC_LOAD_NGR    : AtomicLoadBinaryReg64<atomic_load_and_64>;
+  def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64ll16c>;
+  def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64lh16c>;
+  def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hl16c>;
+  def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hh16c>;
+  def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64lf32c>;
+  def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64,
+                                                 imm64hf32c>;
+}
+
+def ATOMIC_LOADW_OR     : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
+def ATOMIC_LOADW_OILH   : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_OR     : AtomicLoadBinaryReg32<atomic_load_or_32>;
+  def ATOMIC_LOAD_OILL   : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+  def ATOMIC_LOAD_OILH   : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+  def ATOMIC_LOAD_OILF   : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+  def ATOMIC_LOAD_OGR    : AtomicLoadBinaryReg64<atomic_load_or_64>;
+  def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+  def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+  def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+  def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+  def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+  def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+}
+
+def ATOMIC_LOADW_XR     : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
+def ATOMIC_LOADW_XILF   : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
+let Predicates = [FeatureNoInterlockedAccess1] in {
+  def ATOMIC_LOAD_XR     : AtomicLoadBinaryReg32<atomic_load_xor_32>;
+  def ATOMIC_LOAD_XILF   : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+  def ATOMIC_LOAD_XGR    : AtomicLoadBinaryReg64<atomic_load_xor_64>;
+  def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+  def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+}
+
+def ATOMIC_LOADW_NRi    : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
+def ATOMIC_LOADW_NILHi  : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
+                                               imm32lh16c>;
+def ATOMIC_LOAD_NRi     : AtomicLoadBinaryReg32<atomic_load_nand_32>;
+def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
+                                                imm32ll16c>;
+def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
+                                                imm32lh16c>;
+def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NGRi    : AtomicLoadBinaryReg64<atomic_load_nand_64>;
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64ll16c>;
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64lh16c>;
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hl16c>;
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hh16c>;
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64lf32c>;
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
+                                                imm64hf32c>;
+
+def ATOMIC_LOADW_MIN    : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
+def ATOMIC_LOAD_MIN_32  : AtomicLoadBinaryReg32<atomic_load_min_32>;
+def ATOMIC_LOAD_MIN_64  : AtomicLoadBinaryReg64<atomic_load_min_64>;
+
+def ATOMIC_LOADW_MAX    : AtomicLoadWBinaryReg<z_atomic_loadw_max>;
+def ATOMIC_LOAD_MAX_32  : AtomicLoadBinaryReg32<atomic_load_max_32>;
+def ATOMIC_LOAD_MAX_64  : AtomicLoadBinaryReg64<atomic_load_max_64>;
+
+def ATOMIC_LOADW_UMIN   : AtomicLoadWBinaryReg<z_atomic_loadw_umin>;
+def ATOMIC_LOAD_UMIN_32 : AtomicLoadBinaryReg32<atomic_load_umin_32>;
+def ATOMIC_LOAD_UMIN_64 : AtomicLoadBinaryReg64<atomic_load_umin_64>;
+
+def ATOMIC_LOADW_UMAX   : AtomicLoadWBinaryReg<z_atomic_loadw_umax>;
+def ATOMIC_LOAD_UMAX_32 : AtomicLoadBinaryReg32<atomic_load_umax_32>;
+def ATOMIC_LOAD_UMAX_64 : AtomicLoadBinaryReg64<atomic_load_umax_64>;
+
+def ATOMIC_CMP_SWAPW
+  : Pseudo<(outs GR32:$dst), (ins bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+                                  ADDR32:$bitshift, ADDR32:$negbitshift,
+                                  uimm32:$bitsize),
+           [(set GR32:$dst,
+                 (z_atomic_cmp_swapw bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
+                                     ADDR32:$bitshift, ADDR32:$negbitshift,
+                                     uimm32:$bitsize))]> {
+  let Defs = [CC];
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
+// Test and set.
+let mayLoad = 1, Defs = [CC] in
+  def TS : StoreInherentS<"ts", 0x9300, null_frag, 1>;
+
+// Compare and swap.
+let Defs = [CC] in {
+  defm CS  : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
+  def  CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
+}
+
+// Compare double and swap.
+let Defs = [CC] in {
+  defm CDS  : CmpSwapRSPair<"cds", 0xBB, 0xEB31, null_frag, GR128>;
+  def  CDSG : CmpSwapRSY<"cdsg", 0xEB3E, null_frag, GR128>;
+}
+
+// Compare and swap and store.
+let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad = 1 in
+  def CSST : SideEffectTernarySSF<"csst", 0xC82, GR64>;
+
+// Perform locked operation.
+let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad =1 in
+  def PLO : SideEffectQuaternarySSe<"plo", 0xEE, GR64>;
+
+// Load/store pair from/to quadword.
+def LPQ  : UnaryRXY<"lpq", 0xE38F, null_frag, GR128, 16>;
+def STPQ : StoreRXY<"stpq", 0xE38E, null_frag, GR128, 16>;
+
+// Load pair disjoint.
+let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
+  def LPD  : BinarySSF<"lpd", 0xC84, GR128>;
+  def LPDG : BinarySSF<"lpdg", 0xC85, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Read a 32-bit access register into a GR32.  As with all GR32 operations,
+// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
+// when a 64-bit address is stored in a pair of access registers.
+def EAR : UnaryRRE<"ear", 0xB24F, null_frag, GR32, AR32>;
+
+// Set access register.
+def SAR : UnaryRRE<"sar", 0xB24E, null_frag, AR32, GR32>;
+
+// Copy access register.
+def CPYA : UnaryRRE<"cpya", 0xB24D, null_frag, AR32, AR32>;
+
+// Load address extended.
+defm LAE : LoadAddressRXPair<"lae", 0x51, 0xE375, null_frag>;
+
+// Load access multiple.
+defm LAM : LoadMultipleRSPair<"lam", 0x9A, 0xEB9A, AR32>;
+
+// Load access multiple.
+defm STAM : StoreMultipleRSPair<"stam", 0x9B, 0xEB9B, AR32>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Extract CC and program mask into a register.  CC ends up in bits 29 and 28.
+let Uses = [CC] in
+  def IPM : InherentRRE<"ipm", 0xB222, GR32, z_ipm>;
+
+// Set CC and program mask from a register.
+let hasSideEffects = 1, Defs = [CC] in
+  def SPM : SideEffectUnaryRR<"spm", 0x04, GR32>;
+
+// Branch and link - like BAS, but also extracts CC and program mask.
+let isCall = 1, Uses = [CC], Defs = [CC] in {
+  def BAL  : CallRX<"bal", 0x45>;
+  def BALR : CallRR<"balr", 0x05>;
+}
+
+// Test addressing mode.
+let Defs = [CC] in
+  def TAM : SideEffectInherentE<"tam", 0x010B>;
+
+// Set addressing mode.
+let hasSideEffects = 1 in {
+  def SAM24 : SideEffectInherentE<"sam24", 0x010C>;
+  def SAM31 : SideEffectInherentE<"sam31", 0x010D>;
+  def SAM64 : SideEffectInherentE<"sam64", 0x010E>;
+}
+
+// Branch and set mode.  Not really a call, but also sets an output register.
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in
+  def BSM : CallRR<"bsm", 0x0B>;
+
+// Branch and save and set mode.
+let isCall = 1, Defs = [CC] in
+  def BASSM : CallRR<"bassm", 0x0C>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1, Predicates = [FeatureTransactionalExecution] in {
+  // Transaction Begin
+  let mayStore = 1, usesCustomInserter = 1, Defs = [CC] in {
+    def TBEGIN : SideEffectBinarySIL<"tbegin", 0xE560, z_tbegin, imm32zx16>;
+    def TBEGIN_nofloat : SideEffectBinarySILPseudo<z_tbegin_nofloat, imm32zx16>;
+
+    def TBEGINC : SideEffectBinarySIL<"tbeginc", 0xE561,
+                                      int_s390_tbeginc, imm32zx16>;
+  }
+
+  // Transaction End
+  let Defs = [CC] in
+    def TEND : SideEffectInherentS<"tend", 0xB2F8, z_tend>;
+
+  // Transaction Abort
+  let isTerminator = 1, isBarrier = 1 in
+    def TABORT : SideEffectAddressS<"tabort", 0xB2FC, int_s390_tabort>;
+
+  // Nontransactional Store
+  def NTSTG : StoreRXY<"ntstg", 0xE325, int_s390_ntstg, GR64, 8>;
+
+  // Extract Transaction Nesting Depth
+  def ETND : InherentRRE<"etnd", 0xB2EC, GR32, int_s390_etnd>;
+}
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureProcessorAssist] in {
+  let hasSideEffects = 1 in
+    def PPA : SideEffectTernaryRRFc<"ppa", 0xB2E8, GR64, GR64, imm32zx4>;
+  def : Pat<(int_s390_ppa_txassist GR32:$src),
+            (PPA (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+                 0, 1)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one, AKA count leading zeros.  The instruction actually
+// returns a pair of GR64s, the first giving the number of leading zeros
+// and the second giving a copy of the source with the leftmost one bit
+// cleared.  We only use the first result here.
+let Defs = [CC] in
+  def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>;
+def : Pat<(ctlz GR64:$src),
+          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
+
+// Population count.  Counts bits set per byte.
+let Predicates = [FeaturePopulationCount], Defs = [CC] in
+  def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>;
+
+// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
+def : Pat<(i64 (anyext GR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
+
+// Extend GR32s and GR64s to GR128s.
+let usesCustomInserter = 1 in {
+  def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+  def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>;
+  def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+}
+
+// Search a block of memory for a character.
+let mayLoad = 1, Defs = [CC] in
+  defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+
+// Supervisor call.
+let hasSideEffects = 1, isCall = 1, Defs = [CC] in
+  def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+
+// Store clock.
+let hasSideEffects = 1, Defs = [CC] in {
+  def STCK  : StoreInherentS<"stck",  0xB205, null_frag, 8>;
+  def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
+  def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
+}
+
+// Store facility list.
+let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
+  def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+
+// Extract CPU time.
+let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
+  def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+
+// Execute.
+let hasSideEffects = 1 in {
+  def EX   : SideEffectBinaryRX<"ex", 0x44, GR64>;
+  def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
+}
+
+// Program return.
+let hasSideEffects = 1, Defs = [CC] in
+  def PR : SideEffectInherentE<"pr", 0x0101>;
+
+// Move with key.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+  def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
+
+// Store real address.
+def STRAG : StoreSSE<"strag", 0xE502>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+let isCodeGenOnly = 1 in {
+  def InsnE   : DirectiveInsnE<(outs), (ins imm64zx16:$enc), ".insn e,$enc", []>;
+  def InsnRI  : DirectiveInsnRI<(outs), (ins imm64zx32:$enc, AnyReg:$R1,
+                                             imm32sx16:$I2),
+                                ".insn ri,$enc,$R1,$I2", []>;
+  def InsnRIE : DirectiveInsnRIE<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+                                              AnyReg:$R3, brtarget16:$I2),
+                                 ".insn rie,$enc,$R1,$R3,$I2", []>;
+  def InsnRIL : DirectiveInsnRIL<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+                                              brtarget32:$I2),
+                                 ".insn ril,$enc,$R1,$I2", []>;
+  def InsnRILU : DirectiveInsnRIL<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+                                               uimm32:$I2),
+                                  ".insn rilu,$enc,$R1,$I2", []>;
+  def InsnRIS : DirectiveInsnRIS<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      imm32sx8:$I2, imm32zx4:$M3,
+                                      bdaddr12only:$BD4),
+                                 ".insn ris,$enc,$R1,$I2,$M3,$BD4", []>;
+  def InsnRR : DirectiveInsnRR<(outs),
+                               (ins imm64zx16:$enc, AnyReg:$R1, AnyReg:$R2),
+                               ".insn rr,$enc,$R1,$R2", []>;
+  def InsnRRE : DirectiveInsnRRE<(outs), (ins imm64zx32:$enc,
+                                              AnyReg:$R1, AnyReg:$R2),
+                                 ".insn rre,$enc,$R1,$R2", []>;
+  def InsnRRF : DirectiveInsnRRF<(outs),
+                                 (ins imm64zx32:$enc, AnyReg:$R1, AnyReg:$R2,
+                                      AnyReg:$R3, imm32zx4:$M4),
+                                 ".insn rrf,$enc,$R1,$R2,$R3,$M4", []>;
+  def InsnRRS : DirectiveInsnRRS<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      AnyReg:$R2, imm32zx4:$M3,
+                                      bdaddr12only:$BD4),
+                                 ".insn rrs,$enc,$R1,$R2,$M3,$BD4", []>;
+  def InsnRS  : DirectiveInsnRS<(outs),
+                                (ins imm64zx32:$enc, AnyReg:$R1,
+                                     AnyReg:$R3, bdaddr12only:$BD2),
+                                ".insn rs,$enc,$R1,$R3,$BD2", []>;
+  def InsnRSE : DirectiveInsnRSE<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      AnyReg:$R3, bdaddr12only:$BD2),
+                                 ".insn rse,$enc,$R1,$R3,$BD2", []>;
+  def InsnRSI : DirectiveInsnRSI<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      AnyReg:$R3, brtarget16:$RI2),
+                                 ".insn rsi,$enc,$R1,$R3,$RI2", []>;
+  def InsnRSY : DirectiveInsnRSY<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      AnyReg:$R3, bdaddr20only:$BD2),
+                                 ".insn rsy,$enc,$R1,$R3,$BD2", []>;
+  def InsnRX  : DirectiveInsnRX<(outs), (ins imm64zx32:$enc, AnyReg:$R1,
+                                             bdxaddr12only:$XBD2),
+                                ".insn rx,$enc,$R1,$XBD2", []>;
+  def InsnRXE : DirectiveInsnRXE<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+                                              bdxaddr12only:$XBD2),
+                                 ".insn rxe,$enc,$R1,$XBD2", []>;
+  def InsnRXF : DirectiveInsnRXF<(outs),
+                                 (ins imm64zx48:$enc, AnyReg:$R1,
+                                      AnyReg:$R3, bdxaddr12only:$XBD2),
+                                 ".insn rxf,$enc,$R1,$R3,$XBD2", []>;
+  def InsnRXY : DirectiveInsnRXY<(outs), (ins imm64zx48:$enc, AnyReg:$R1,
+                                              bdxaddr20only:$XBD2),
+                                 ".insn rxy,$enc,$R1,$XBD2", []>;
+  def InsnS : DirectiveInsnS<(outs),
+                             (ins imm64zx32:$enc, bdaddr12only:$BD2),
+                             ".insn s,$enc,$BD2", []>;
+  def InsnSI : DirectiveInsnSI<(outs),
+                               (ins imm64zx32:$enc, bdaddr12only:$BD1,
+                                    imm32sx8:$I2),
+                               ".insn si,$enc,$BD1,$I2", []>;
+  def InsnSIY : DirectiveInsnSIY<(outs),
+                                 (ins imm64zx48:$enc,
+                                      bdaddr20only:$BD1, imm32zx8:$I2),
+                                 ".insn siy,$enc,$BD1,$I2", []>;
+  def InsnSIL : DirectiveInsnSIL<(outs),
+                                 (ins imm64zx48:$enc, bdaddr12only:$BD1,
+                                      imm32zx16:$I2),
+                                 ".insn sil,$enc,$BD1,$I2", []>;
+  def InsnSS : DirectiveInsnSS<(outs),
+                               (ins imm64zx48:$enc, bdraddr12only:$RBD1,
+                                    bdaddr12only:$BD2, AnyReg:$R3),
+                               ".insn ss,$enc,$RBD1,$BD2,$R3", []>;
+  def InsnSSE : DirectiveInsnSSE<(outs),
+                                 (ins imm64zx48:$enc,
+                                      bdaddr12only:$BD1,bdaddr12only:$BD2),
+                                 ".insn sse,$enc,$BD1,$BD2", []>;
+  def InsnSSF : DirectiveInsnSSF<(outs),
+                                 (ins imm64zx48:$enc, bdaddr12only:$BD1,
+                                      bdaddr12only:$BD2, AnyReg:$R3),
+                                 ".insn ssf,$enc,$BD1,$BD2,$R3", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// Use AL* for GR64 additions of unsigned 32-bit values.
+defm : ZXB<add, GR64, ALGFR>;
+def  : Pat<(add GR64:$src1, imm64zx32:$src2),
+           (ALGFI GR64:$src1, imm64zx32:$src2)>;
+def  : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+           (ALGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Use SL* for GR64 subtractions of unsigned 32-bit values.
+defm : ZXB<sub, GR64, SLGFR>;
+def  : Pat<(add GR64:$src1, imm64zx32n:$src2),
+           (SLGFI GR64:$src1, imm64zx32n:$src2)>;
+def  : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
+           (SLGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Optimize sign-extended 1/0 selects to -1/0 selects.  This is important
+// for vector legalization.
+def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, imm32zx4:$valid, imm32zx4:$cc)),
+                         (i32 31)),
+                    (i32 31)),
+          (Select32 (LHI -1), (LHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, imm32zx4:$valid,
+                                                       imm32zx4:$cc)))),
+                    (i32 63)),
+               (i32 63)),
+          (Select64 (LGHI -1), (LGHI 0), imm32zx4:$valid, imm32zx4:$cc)>;
+
+// Avoid generating 2 XOR instructions. (xor (and x, y), y) is
+// equivalent to (and (xor x, -1), y)
+def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
+                          (XGR GR64:$y, (NGR GR64:$y, GR64:$x))>;
+
+// Shift/rotate instructions only use the last 6 bits of the second operand
+// register, so we can safely use NILL (16 fewer bits than NILF) to only AND the
+// last 16 bits.
+// Complexity is added so that we match this before we match NILF on the AND
+// operation alone.
+let AddedComplexity = 4 in {
+  def : Pat<(shl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+            (SLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(sra GR32:$val, (and GR32:$shift, uimm32:$imm)),
+            (SRA GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(srl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+            (SRL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(shl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+            (SLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(sra GR64:$val, (and GR32:$shift, uimm32:$imm)),
+            (SRAG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(srl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+            (SRLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(rotl GR32:$val, (and GR32:$shift, uimm32:$imm)),
+            (RLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+
+  def : Pat<(rotl GR64:$val, (and GR32:$shift, uimm32:$imm)),
+            (RLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+}
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+                      OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 8>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
new file mode 100644
index 000000000000..738ea7a33729
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -0,0 +1,1200 @@
+//==- SystemZInstrVector.td - SystemZ Vector instructions ------*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Register move.
+  def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
+  def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+  def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
+
+  // Load GR from VR element.
+  def VLGV  : BinaryVRScGeneric<"vlgv", 0xE721>;
+  def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>;
+  def VLGVH : BinaryVRSc<"vlgvh", 0xE721, null_frag, v128h, 1>;
+  def VLGVF : BinaryVRSc<"vlgvf", 0xE721, null_frag, v128f, 2>;
+  def VLGVG : BinaryVRSc<"vlgvg", 0xE721, z_vector_extract, v128g, 3>;
+
+  // Load VR element from GR.
+  def VLVG  : TernaryVRSbGeneric<"vlvg", 0xE722>;
+  def VLVGB : TernaryVRSb<"vlvgb", 0xE722, z_vector_insert,
+                          v128b, v128b, GR32, 0>;
+  def VLVGH : TernaryVRSb<"vlvgh", 0xE722, z_vector_insert,
+                          v128h, v128h, GR32, 1>;
+  def VLVGF : TernaryVRSb<"vlvgf", 0xE722, z_vector_insert,
+                          v128f, v128f, GR32, 2>;
+  def VLVGG : TernaryVRSb<"vlvgg", 0xE722, z_vector_insert,
+                          v128g, v128g, GR64, 3>;
+
+  // Load VR from GRs disjoint.
+  def VLVGP : BinaryVRRf<"vlvgp", 0xE762, z_join_dwords, v128g>;
+  def VLVGP32 : BinaryAliasVRRf<GR32>;
+}
+
+// Extractions always assign to the full GR64, even if the element would
+// fit in the lower 32 bits.  Sub-i64 extracts therefore need to take a
+// subreg of the result.
+class VectorExtractSubreg<ValueType type, Instruction insn>
+  : Pat<(i32 (z_vector_extract (type VR128:$vec), shift12only:$index)),
+        (EXTRACT_SUBREG (insn VR128:$vec, shift12only:$index), subreg_l32)>;
+
+def : VectorExtractSubreg<v16i8, VLGVB>;
+def : VectorExtractSubreg<v8i16, VLGVH>;
+def : VectorExtractSubreg<v4i32, VLGVF>;
+
+//===----------------------------------------------------------------------===//
+// Immediate instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Generate byte mask.
+  def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
+  def VONE  : InherentVRIa<"vone", 0xE744, 0xffff>;
+  def VGBM  : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
+
+  // Generate mask.
+  def VGM  : BinaryVRIbGeneric<"vgm", 0xE746>;
+  def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
+  def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
+  def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
+  def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+
+  // Load element immediate.
+  //
+  // We want these instructions to be used ahead of VLVG* where possible.
+  // However, VLVG* takes a variable BD-format index whereas VLEI takes
+  // a plain immediate index.  This means that VLVG* has an extra "base"
+  // register operand and is 3 units more complex.  Bumping the complexity
+  // of the VLEI* instructions by 4 means that they are strictly better
+  // than VLVG* in cases where both forms match.
+  let AddedComplexity = 4 in {
+    def VLEIB : TernaryVRIa<"vleib", 0xE740, z_vector_insert,
+                            v128b, v128b, imm32sx16trunc, imm32zx4>;
+    def VLEIH : TernaryVRIa<"vleih", 0xE741, z_vector_insert,
+                            v128h, v128h, imm32sx16trunc, imm32zx3>;
+    def VLEIF : TernaryVRIa<"vleif", 0xE743, z_vector_insert,
+                            v128f, v128f, imm32sx16, imm32zx2>;
+    def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert,
+                            v128g, v128g, imm64sx16, imm32zx1>;
+  }
+
+  // Replicate immediate.
+  def VREPI  : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
+  def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
+  def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
+  def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
+  def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
+}
+
+//===----------------------------------------------------------------------===//
+// Loads
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Load.
+  def VL : UnaryVRX<"vl", 0xE706, null_frag, v128any, 16>;
+
+  // Load to block boundary.  The number of loaded bytes is only known
+  // at run time.  The instruction is really polymorphic, but v128b matches
+  // the return type of the associated intrinsic.
+  def VLBB : BinaryVRX<"vlbb", 0xE707, int_s390_vlbb, v128b, 0>;
+
+  // Load count to block boundary.
+  let Defs = [CC] in
+    def LCBB : InstRXE<0xE727, (outs GR32:$R1),
+                               (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
+                       "lcbb\t$R1, $XBD2, $M3",
+                       [(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2,
+                                                      imm32zx4:$M3))]>;
+
+  // Load with length.  The number of loaded bytes is only known at run time.
+  def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>;
+
+  // Load multiple.
+  def VLM : LoadMultipleVRSa<"vlm", 0xE736>;
+
+  // Load and replicate
+  def VLREP  : UnaryVRXGeneric<"vlrep", 0xE705>;
+  def VLREPB : UnaryVRX<"vlrepb", 0xE705, z_replicate_loadi8,  v128b, 1, 0>;
+  def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>;
+  def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>;
+  def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)),
+            (VLREPF bdxaddr12only:$addr)>;
+  def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)),
+            (VLREPG bdxaddr12only:$addr)>;
+
+  // Use VLREP to load subvectors.  These patterns use "12pair" because
+  // LEY and LDY offer full 20-bit displacement fields.  It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+  def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+
+  // Load logical element and zero.
+  def VLLEZ  : UnaryVRXGeneric<"vllez", 0xE704>;
+  def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8,  v128b, 1, 0>;
+  def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>;
+  def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>;
+  def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>;
+  def : Pat<(v4f32 (z_vllezf32 bdxaddr12only:$addr)),
+            (VLLEZF bdxaddr12only:$addr)>;
+  def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
+            (VLLEZG bdxaddr12only:$addr)>;
+
+  // Load element.
+  def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8,  v128b, v128b, 1, imm32zx4>;
+  def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>;
+  def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>;
+  def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>;
+  def : Pat<(z_vlef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index),
+            (VLEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
+  def : Pat<(z_vlef64 (v2f64 VR128:$val), bdxaddr12only:$addr, imm32zx1:$index),
+            (VLEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
+
+  // Gather element.
+  def VGEF : TernaryVRV<"vgef", 0xE713, 4, imm32zx2>;
+  def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>;
+}
+
+// Use replicating loads if we're inserting a single element into an
+// undefined vector.  This avoids a false dependency on the previous
+// register contents.
+multiclass ReplicatePeephole<Instruction vlrep, ValueType vectype,
+                             SDPatternOperator load, ValueType scalartype> {
+  def : Pat<(vectype (z_vector_insert
+                      (undef), (scalartype (load bdxaddr12only:$addr)), 0)),
+            (vlrep bdxaddr12only:$addr)>;
+  def : Pat<(vectype (scalar_to_vector
+                      (scalartype (load bdxaddr12only:$addr)))),
+            (vlrep bdxaddr12only:$addr)>;
+}
+defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
+defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
+defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
+defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
+defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
+
+//===----------------------------------------------------------------------===//
+// Stores
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Store.
+  def VST : StoreVRX<"vst", 0xE70E, null_frag, v128any, 16>;
+
+  // Store with length.  The number of stored bytes is only known at run time.
+  def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>;
+
+  // Store multiple.
+  def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>;
+
+  // Store element.
+  def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8,  v128b, 1, imm32zx4>;
+  def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>;
+  def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>;
+  def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>;
+  def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr,
+                       imm32zx2:$index),
+            (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>;
+  def : Pat<(z_vstef64 (v2f64 VR128:$val), bdxaddr12only:$addr,
+                       imm32zx1:$index),
+            (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>;
+
+  // Use VSTE to store subvectors.  These patterns use "12pair" because
+  // STEY and STDY offer full 20-bit displacement fields.  It's often better
+  // to use those instructions rather than force a 20-bit displacement
+  // into a GPR temporary.
+  def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+  def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
+
+  // Scatter element.
+  def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>;
+  def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Selects and permutes
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Merge high.
+  def VMRH:   BinaryVRRcGeneric<"vmrh", 0xE761>;
+  def VMRHB : BinaryVRRc<"vmrhb", 0xE761, z_merge_high, v128b, v128b, 0>;
+  def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
+  def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
+  def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
+  def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
+
+  // Merge low.
+  def VMRL:   BinaryVRRcGeneric<"vmrl", 0xE760>;
+  def VMRLB : BinaryVRRc<"vmrlb", 0xE760, z_merge_low, v128b, v128b, 0>;
+  def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
+  def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
+  def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+  def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
+  def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
+
+  // Permute.
+  def VPERM : TernaryVRRe<"vperm", 0xE78C, z_permute, v128b, v128b>;
+
+  // Permute doubleword immediate.
+  def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>;
+
+  // Replicate.
+  def VREP:   BinaryVRIcGeneric<"vrep", 0xE74D>;
+  def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>;
+  def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>;
+  def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>;
+  def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>;
+  def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)),
+            (VREPF VR128:$vec, imm32zx16:$index)>;
+  def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)),
+            (VREPG VR128:$vec, imm32zx16:$index)>;
+
+  // Select.
+  def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>;
+}
+
+//===----------------------------------------------------------------------===//
+// Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Pack
+  def VPK  : BinaryVRRcGeneric<"vpk", 0xE794>;
+  def VPKH : BinaryVRRc<"vpkh", 0xE794, z_pack, v128b, v128h, 1>;
+  def VPKF : BinaryVRRc<"vpkf", 0xE794, z_pack, v128h, v128f, 2>;
+  def VPKG : BinaryVRRc<"vpkg", 0xE794, z_pack, v128f, v128g, 3>;
+
+  // Pack saturate.
+  def  VPKS  : BinaryVRRbSPairGeneric<"vpks", 0xE797>;
+  defm VPKSH : BinaryVRRbSPair<"vpksh", 0xE797, int_s390_vpksh, z_packs_cc,
+                               v128b, v128h, 1>;
+  defm VPKSF : BinaryVRRbSPair<"vpksf", 0xE797, int_s390_vpksf, z_packs_cc,
+                               v128h, v128f, 2>;
+  defm VPKSG : BinaryVRRbSPair<"vpksg", 0xE797, int_s390_vpksg, z_packs_cc,
+                               v128f, v128g, 3>;
+
+  // Pack saturate logical.
+  def  VPKLS  : BinaryVRRbSPairGeneric<"vpkls", 0xE795>;
+  defm VPKLSH : BinaryVRRbSPair<"vpklsh", 0xE795, int_s390_vpklsh, z_packls_cc,
+                                v128b, v128h, 1>;
+  defm VPKLSF : BinaryVRRbSPair<"vpklsf", 0xE795, int_s390_vpklsf, z_packls_cc,
+                                v128h, v128f, 2>;
+  defm VPKLSG : BinaryVRRbSPair<"vpklsg", 0xE795, int_s390_vpklsg, z_packls_cc,
+                                v128f, v128g, 3>;
+
+  // Sign-extend to doubleword.
+  def VSEG  : UnaryVRRaGeneric<"vseg", 0xE75F>;
+  def VSEGB : UnaryVRRa<"vsegb", 0xE75F, z_vsei8,  v128g, v128g, 0>;
+  def VSEGH : UnaryVRRa<"vsegh", 0xE75F, z_vsei16, v128g, v128g, 1>;
+  def VSEGF : UnaryVRRa<"vsegf", 0xE75F, z_vsei32, v128g, v128g, 2>;
+  def : Pat<(z_vsei8_by_parts  (v16i8 VR128:$src)), (VSEGB VR128:$src)>;
+  def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>;
+  def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>;
+
+  // Unpack high.
+  def VUPH  : UnaryVRRaGeneric<"vuph", 0xE7D7>;
+  def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>;
+  def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>;
+  def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>;
+
+  // Unpack logical high.
+  def VUPLH  : UnaryVRRaGeneric<"vuplh", 0xE7D5>;
+  def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>;
+  def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>;
+  def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>;
+
+  // Unpack low.
+  def VUPL   : UnaryVRRaGeneric<"vupl", 0xE7D6>;
+  def VUPLB  : UnaryVRRa<"vuplb",  0xE7D6, z_unpack_low, v128h, v128b, 0>;
+  def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>;
+  def VUPLF  : UnaryVRRa<"vuplf",  0xE7D6, z_unpack_low, v128g, v128f, 2>;
+
+  // Unpack logical low.
+  def VUPLL  : UnaryVRRaGeneric<"vupll", 0xE7D4>;
+  def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>;
+  def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>;
+  def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instantiating generic operations for specific types.
+//===----------------------------------------------------------------------===//
+
+multiclass GenericVectorOps<ValueType type, ValueType inttype> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (load bdxaddr12only:$addr)),
+              (VL bdxaddr12only:$addr)>;
+    def : Pat<(store (type VR128:$src), bdxaddr12only:$addr),
+              (VST VR128:$src, bdxaddr12only:$addr)>;
+    def : Pat<(type (vselect (inttype VR128:$x), VR128:$y, VR128:$z)),
+              (VSEL VR128:$y, VR128:$z, VR128:$x)>;
+    def : Pat<(type (vselect (inttype (z_vnot VR128:$x)), VR128:$y, VR128:$z)),
+              (VSEL VR128:$z, VR128:$y, VR128:$x)>;
+  }
+}
+
+defm : GenericVectorOps<v16i8, v16i8>;
+defm : GenericVectorOps<v8i16, v8i16>;
+defm : GenericVectorOps<v4i32, v4i32>;
+defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v4f32, v4i32>;
+defm : GenericVectorOps<v2f64, v2i64>;
+
+//===----------------------------------------------------------------------===//
+// Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Add.
+  def VA  : BinaryVRRcGeneric<"va", 0xE7F3>;
+  def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>;
+  def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>;
+  def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>;
+  def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>;
+  def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>;
+
+  // Add compute carry.
+  def VACC  : BinaryVRRcGeneric<"vacc", 0xE7F1>;
+  def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>;
+  def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>;
+  def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>;
+  def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>;
+  def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>;
+
+  // Add with carry.
+  def VAC  : TernaryVRRdGeneric<"vac", 0xE7BB>;
+  def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>;
+
+  // Add with carry compute carry.
+  def VACCC  : TernaryVRRdGeneric<"vaccc", 0xE7B9>;
+  def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>;
+
+  // And.
+  def VN : BinaryVRRc<"vn", 0xE768, null_frag, v128any, v128any>;
+
+  // And with complement.
+  def VNC : BinaryVRRc<"vnc", 0xE769, null_frag, v128any, v128any>;
+
+  // Average.
+  def VAVG  : BinaryVRRcGeneric<"vavg", 0xE7F2>;
+  def VAVGB : BinaryVRRc<"vavgb", 0xE7F2, int_s390_vavgb, v128b, v128b, 0>;
+  def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, v128h, v128h, 1>;
+  def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>;
+  def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>;
+
+  // Average logical.
+  def VAVGL  : BinaryVRRcGeneric<"vavgl", 0xE7F0>;
+  def VAVGLB : BinaryVRRc<"vavglb", 0xE7F0, int_s390_vavglb, v128b, v128b, 0>;
+  def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>;
+  def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>;
+  def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>;
+
+  // Checksum.
+  def VCKSM : BinaryVRRc<"vcksm", 0xE766, int_s390_vcksm, v128f, v128f>;
+
+  // Count leading zeros.
+  def VCLZ  : UnaryVRRaGeneric<"vclz", 0xE753>;
+  def VCLZB : UnaryVRRa<"vclzb", 0xE753, ctlz, v128b, v128b, 0>;
+  def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>;
+  def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>;
+  def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>;
+
+  // Count trailing zeros.
+  def VCTZ  : UnaryVRRaGeneric<"vctz", 0xE752>;
+  def VCTZB : UnaryVRRa<"vctzb", 0xE752, cttz, v128b, v128b, 0>;
+  def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>;
+  def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>;
+  def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>;
+
+  // Exclusive or.
+  def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
+
+  // Galois field multiply sum.
+  def VGFM  : BinaryVRRcGeneric<"vgfm", 0xE7B4>;
+  def VGFMB : BinaryVRRc<"vgfmb", 0xE7B4, int_s390_vgfmb, v128h, v128b, 0>;
+  def VGFMH : BinaryVRRc<"vgfmh", 0xE7B4, int_s390_vgfmh, v128f, v128h, 1>;
+  def VGFMF : BinaryVRRc<"vgfmf", 0xE7B4, int_s390_vgfmf, v128g, v128f, 2>;
+  def VGFMG : BinaryVRRc<"vgfmg", 0xE7B4, int_s390_vgfmg, v128q, v128g, 3>;
+
+  // Galois field multiply sum and accumulate.
+  def VGFMA  : TernaryVRRdGeneric<"vgfma", 0xE7BC>;
+  def VGFMAB : TernaryVRRd<"vgfmab", 0xE7BC, int_s390_vgfmab, v128h, v128b, 0>;
+  def VGFMAH : TernaryVRRd<"vgfmah", 0xE7BC, int_s390_vgfmah, v128f, v128h, 1>;
+  def VGFMAF : TernaryVRRd<"vgfmaf", 0xE7BC, int_s390_vgfmaf, v128g, v128f, 2>;
+  def VGFMAG : TernaryVRRd<"vgfmag", 0xE7BC, int_s390_vgfmag, v128q, v128g, 3>;
+
+  // Load complement.
+  def VLC  : UnaryVRRaGeneric<"vlc", 0xE7DE>;
+  def VLCB : UnaryVRRa<"vlcb", 0xE7DE, z_vneg, v128b, v128b, 0>;
+  def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>;
+  def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>;
+  def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>;
+
+  // Load positive.
+  def VLP  : UnaryVRRaGeneric<"vlp", 0xE7DF>;
+  def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8,  v128b, v128b, 0>;
+  def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>;
+  def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>;
+  def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>;
+
+  // Maximum.
+  def VMX  : BinaryVRRcGeneric<"vmx", 0xE7FF>;
+  def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
+  def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
+  def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
+  def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
+
+  // Maximum logical.
+  def VMXL  : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
+  def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
+  def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
+  def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
+  def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+
+  // Minimum.
+  def VMN  : BinaryVRRcGeneric<"vmn", 0xE7FE>;
+  def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
+  def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
+  def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
+  def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
+
+  // Minimum logical.
+  def VMNL  : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
+  def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
+  def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
+  def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
+  def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
+
+  // Multiply and add low.
+  def VMAL   : TernaryVRRdGeneric<"vmal", 0xE7AA>;
+  def VMALB  : TernaryVRRd<"vmalb",  0xE7AA, z_muladd, v128b, v128b, 0>;
+  def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>;
+  def VMALF  : TernaryVRRd<"vmalf",  0xE7AA, z_muladd, v128f, v128f, 2>;
+
+  // Multiply and add high.
+  def VMAH  : TernaryVRRdGeneric<"vmah", 0xE7AB>;
+  def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>;
+  def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>;
+  def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>;
+
+  // Multiply and add logical high.
+  def VMALH  : TernaryVRRdGeneric<"vmalh", 0xE7A9>;
+  def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>;
+  def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>;
+  def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>;
+
+  // Multiply and add even.
+  def VMAE  : TernaryVRRdGeneric<"vmae", 0xE7AE>;
+  def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>;
+  def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>;
+  def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>;
+
+  // Multiply and add logical even.
+  def VMALE  : TernaryVRRdGeneric<"vmale", 0xE7AC>;
+  def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>;
+  def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>;
+  def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>;
+
+  // Multiply and add odd.
+  def VMAO  : TernaryVRRdGeneric<"vmao", 0xE7AF>;
+  def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>;
+  def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>;
+  def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>;
+
+  // Multiply and add logical odd.
+  def VMALO  : TernaryVRRdGeneric<"vmalo", 0xE7AD>;
+  def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>;
+  def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>;
+  def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>;
+
+  // Multiply high.
+  def VMH  : BinaryVRRcGeneric<"vmh", 0xE7A3>;
+  def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>;
+  def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>;
+  def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>;
+
+  // Multiply logical high.
+  def VMLH  : BinaryVRRcGeneric<"vmlh", 0xE7A1>;
+  def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>;
+  def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>;
+  def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>;
+
+  // Multiply low.
+  def VML   : BinaryVRRcGeneric<"vml", 0xE7A2>;
+  def VMLB  : BinaryVRRc<"vmlb",  0xE7A2, mul, v128b, v128b, 0>;
+  def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>;
+  def VMLF  : BinaryVRRc<"vmlf",  0xE7A2, mul, v128f, v128f, 2>;
+
+  // Multiply even.
+  def VME  : BinaryVRRcGeneric<"vme", 0xE7A6>;
+  def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>;
+  def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>;
+  def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>;
+
+  // Multiply logical even.
+  def VMLE  : BinaryVRRcGeneric<"vmle", 0xE7A4>;
+  def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>;
+  def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>;
+  def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>;
+
+  // Multiply odd.
+  def VMO  : BinaryVRRcGeneric<"vmo", 0xE7A7>;
+  def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>;
+  def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>;
+  def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>;
+
+  // Multiply logical odd.
+  def VMLO  : BinaryVRRcGeneric<"vmlo", 0xE7A5>;
+  def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>;
+  def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
+  def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+
+  // Nor.
+  def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
+  def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>;
+
+  // Or.
+  def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
+
+  // Population count.
+  def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>;
+  def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>;
+
+  // Element rotate left logical (with vector shift amount).
+  def VERLLV  : BinaryVRRcGeneric<"verllv", 0xE773>;
+  def VERLLVB : BinaryVRRc<"verllvb", 0xE773, int_s390_verllvb,
+                           v128b, v128b, 0>;
+  def VERLLVH : BinaryVRRc<"verllvh", 0xE773, int_s390_verllvh,
+                           v128h, v128h, 1>;
+  def VERLLVF : BinaryVRRc<"verllvf", 0xE773, int_s390_verllvf,
+                           v128f, v128f, 2>;
+  def VERLLVG : BinaryVRRc<"verllvg", 0xE773, int_s390_verllvg,
+                           v128g, v128g, 3>;
+
+  // Element rotate left logical (with scalar shift amount).
+  def VERLL  : BinaryVRSaGeneric<"verll", 0xE733>;
+  def VERLLB : BinaryVRSa<"verllb", 0xE733, int_s390_verllb, v128b, v128b, 0>;
+  def VERLLH : BinaryVRSa<"verllh", 0xE733, int_s390_verllh, v128h, v128h, 1>;
+  def VERLLF : BinaryVRSa<"verllf", 0xE733, int_s390_verllf, v128f, v128f, 2>;
+  def VERLLG : BinaryVRSa<"verllg", 0xE733, int_s390_verllg, v128g, v128g, 3>;
+
+  // Element rotate and insert under mask.
+  def VERIM  : QuaternaryVRIdGeneric<"verim", 0xE772>;
+  def VERIMB : QuaternaryVRId<"verimb", 0xE772, int_s390_verimb, v128b, v128b, 0>;
+  def VERIMH : QuaternaryVRId<"verimh", 0xE772, int_s390_verimh, v128h, v128h, 1>;
+  def VERIMF : QuaternaryVRId<"verimf", 0xE772, int_s390_verimf, v128f, v128f, 2>;
+  def VERIMG : QuaternaryVRId<"verimg", 0xE772, int_s390_verimg, v128g, v128g, 3>;
+
+  // Element shift left (with vector shift amount).
+  def VESLV  : BinaryVRRcGeneric<"veslv", 0xE770>;
+  def VESLVB : BinaryVRRc<"veslvb", 0xE770, z_vshl, v128b, v128b, 0>;
+  def VESLVH : BinaryVRRc<"veslvh", 0xE770, z_vshl, v128h, v128h, 1>;
+  def VESLVF : BinaryVRRc<"veslvf", 0xE770, z_vshl, v128f, v128f, 2>;
+  def VESLVG : BinaryVRRc<"veslvg", 0xE770, z_vshl, v128g, v128g, 3>;
+
+  // Element shift left (with scalar shift amount).
+  def VESL  : BinaryVRSaGeneric<"vesl", 0xE730>;
+  def VESLB : BinaryVRSa<"veslb", 0xE730, z_vshl_by_scalar, v128b, v128b, 0>;
+  def VESLH : BinaryVRSa<"veslh", 0xE730, z_vshl_by_scalar, v128h, v128h, 1>;
+  def VESLF : BinaryVRSa<"veslf", 0xE730, z_vshl_by_scalar, v128f, v128f, 2>;
+  def VESLG : BinaryVRSa<"veslg", 0xE730, z_vshl_by_scalar, v128g, v128g, 3>;
+
+  // Element shift right arithmetic (with vector shift amount).
+  def VESRAV  : BinaryVRRcGeneric<"vesrav", 0xE77A>;
+  def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, z_vsra, v128b, v128b, 0>;
+  def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, z_vsra, v128h, v128h, 1>;
+  def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, z_vsra, v128f, v128f, 2>;
+  def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, z_vsra, v128g, v128g, 3>;
+
+  // Element shift right arithmetic (with scalar shift amount).
+  def VESRA  : BinaryVRSaGeneric<"vesra", 0xE73A>;
+  def VESRAB : BinaryVRSa<"vesrab", 0xE73A, z_vsra_by_scalar, v128b, v128b, 0>;
+  def VESRAH : BinaryVRSa<"vesrah", 0xE73A, z_vsra_by_scalar, v128h, v128h, 1>;
+  def VESRAF : BinaryVRSa<"vesraf", 0xE73A, z_vsra_by_scalar, v128f, v128f, 2>;
+  def VESRAG : BinaryVRSa<"vesrag", 0xE73A, z_vsra_by_scalar, v128g, v128g, 3>;
+
+  // Element shift right logical (with vector shift amount).
+  def VESRLV  : BinaryVRRcGeneric<"vesrlv", 0xE778>;
+  def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, z_vsrl, v128b, v128b, 0>;
+  def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, z_vsrl, v128h, v128h, 1>;
+  def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, z_vsrl, v128f, v128f, 2>;
+  def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, z_vsrl, v128g, v128g, 3>;
+
+  // Element shift right logical (with scalar shift amount).
+  def VESRL  : BinaryVRSaGeneric<"vesrl", 0xE738>;
+  def VESRLB : BinaryVRSa<"vesrlb", 0xE738, z_vsrl_by_scalar, v128b, v128b, 0>;
+  def VESRLH : BinaryVRSa<"vesrlh", 0xE738, z_vsrl_by_scalar, v128h, v128h, 1>;
+  def VESRLF : BinaryVRSa<"vesrlf", 0xE738, z_vsrl_by_scalar, v128f, v128f, 2>;
+  def VESRLG : BinaryVRSa<"vesrlg", 0xE738, z_vsrl_by_scalar, v128g, v128g, 3>;
+
+  // Shift left.
+  def VSL : BinaryVRRc<"vsl", 0xE774, int_s390_vsl, v128b, v128b>;
+
+  // Shift left by byte.
+  def VSLB : BinaryVRRc<"vslb", 0xE775, int_s390_vslb, v128b, v128b>;
+
+  // Shift left double by byte.
+  def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>;
+  def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z),
+            (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
+
+  // Shift right arithmetic.
+  def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>;
+
+  // Shift right arithmetic by byte.
+  def VSRAB : BinaryVRRc<"vsrab", 0xE77F, int_s390_vsrab, v128b, v128b>;
+
+  // Shift right logical.
+  def VSRL : BinaryVRRc<"vsrl", 0xE77C, int_s390_vsrl, v128b, v128b>;
+
+  // Shift right logical by byte.
+  def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>;
+
+  // Subtract.
+  def VS  : BinaryVRRcGeneric<"vs", 0xE7F7>;
+  def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>;
+  def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>;
+  def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>;
+  def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>;
+  def VSQ : BinaryVRRc<"vsq", 0xE7F7, int_s390_vsq, v128q, v128q, 4>;
+
+  // Subtract compute borrow indication.
+  def VSCBI  : BinaryVRRcGeneric<"vscbi", 0xE7F5>;
+  def VSCBIB : BinaryVRRc<"vscbib", 0xE7F5, int_s390_vscbib, v128b, v128b, 0>;
+  def VSCBIH : BinaryVRRc<"vscbih", 0xE7F5, int_s390_vscbih, v128h, v128h, 1>;
+  def VSCBIF : BinaryVRRc<"vscbif", 0xE7F5, int_s390_vscbif, v128f, v128f, 2>;
+  def VSCBIG : BinaryVRRc<"vscbig", 0xE7F5, int_s390_vscbig, v128g, v128g, 3>;
+  def VSCBIQ : BinaryVRRc<"vscbiq", 0xE7F5, int_s390_vscbiq, v128q, v128q, 4>;
+
+  // Subtract with borrow indication.
+  def VSBI  : TernaryVRRdGeneric<"vsbi", 0xE7BF>;
+  def VSBIQ : TernaryVRRd<"vsbiq", 0xE7BF, int_s390_vsbiq, v128q, v128q, 4>;
+
+  // Subtract with borrow compute borrow indication.
+  def VSBCBI  : TernaryVRRdGeneric<"vsbcbi", 0xE7BD>;
+  def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, int_s390_vsbcbiq,
+                            v128q, v128q, 4>;
+
+  // Sum across doubleword.
+  def VSUMG  : BinaryVRRcGeneric<"vsumg", 0xE765>;
+  def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, z_vsum, v128g, v128h, 1>;
+  def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, z_vsum, v128g, v128f, 2>;
+
+  // Sum across quadword.
+  def VSUMQ  : BinaryVRRcGeneric<"vsumq", 0xE767>;
+  def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, z_vsum, v128q, v128f, 2>;
+  def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, z_vsum, v128q, v128g, 3>;
+
+  // Sum across word.
+  def VSUM  : BinaryVRRcGeneric<"vsum", 0xE764>;
+  def VSUMB : BinaryVRRc<"vsumb", 0xE764, z_vsum, v128f, v128b, 0>;
+  def VSUMH : BinaryVRRc<"vsumh", 0xE764, z_vsum, v128f, v128h, 1>;
+}
+
+// Instantiate the bitwise ops for type TYPE.
+multiclass BitwiseVectorOps<ValueType type> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>;
+    def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))),
+              (VNC VR128:$x, VR128:$y)>;
+    def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>;
+    def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>;
+    def : Pat<(type (or (and VR128:$x, VR128:$z),
+                        (and VR128:$y, (z_vnot VR128:$z)))),
+              (VSEL VR128:$x, VR128:$y, VR128:$z)>;
+    def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))),
+              (VNO VR128:$x, VR128:$y)>;
+    def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>;
+  }
+}
+
+defm : BitwiseVectorOps<v16i8>;
+defm : BitwiseVectorOps<v8i16>;
+defm : BitwiseVectorOps<v4i32>;
+defm : BitwiseVectorOps<v2i64>;
+
+// Instantiate additional patterns for absolute-related expressions on
+// type TYPE.  LC is the negate instruction for TYPE and LP is the absolute
+// instruction.
+multiclass IntegerAbsoluteVectorOps<ValueType type, Instruction lc,
+                                    Instruction lp, int shift> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (vselect (type (z_vicmph_zero VR128:$x)),
+                             (z_vneg VR128:$x), VR128:$x)),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vnot (z_vicmph_zero VR128:$x))),
+                             VR128:$x, (z_vneg VR128:$x))),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vicmpl_zero VR128:$x)),
+                             VR128:$x, (z_vneg VR128:$x))),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (vselect (type (z_vnot (z_vicmpl_zero VR128:$x))),
+                             (z_vneg VR128:$x), VR128:$x)),
+              (lc (lp VR128:$x))>;
+    def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+                             (z_vneg VR128:$x)),
+                        (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+                             VR128:$x))),
+              (lp VR128:$x)>;
+    def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)),
+                             VR128:$x),
+                        (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))),
+                             (z_vneg VR128:$x)))),
+              (lc (lp VR128:$x))>;
+  }
+}
+
+defm : IntegerAbsoluteVectorOps<v16i8, VLCB, VLPB, 7>;
+defm : IntegerAbsoluteVectorOps<v8i16, VLCH, VLPH, 15>;
+defm : IntegerAbsoluteVectorOps<v4i32, VLCF, VLPF, 31>;
+defm : IntegerAbsoluteVectorOps<v2i64, VLCG, VLPG, 63>;
+
+// Instantiate minimum- and maximum-related patterns for TYPE.  CMPH is the
+// signed or unsigned "set if greater than" comparison instruction and
+// MIN and MAX are the associated minimum and maximum instructions.
+multiclass IntegerMinMaxVectorOps<ValueType type, SDPatternOperator cmph,
+                                  Instruction min, Instruction max> {
+  let Predicates = [FeatureVector] in {
+    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)),
+              (max VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)),
+              (min VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+                             VR128:$x, VR128:$y)),
+              (min VR128:$x, VR128:$y)>;
+    def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)),
+                             VR128:$y, VR128:$x)),
+              (max VR128:$x, VR128:$y)>;
+  }
+}
+
+// Signed min/max.
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmph, VMNB, VMXB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmph, VMNH, VMXH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmph, VMNF, VMXF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmph, VMNG, VMXG>;
+
+// Unsigned min/max.
+defm : IntegerMinMaxVectorOps<v16i8, z_vicmphl, VMNLB, VMXLB>;
+defm : IntegerMinMaxVectorOps<v8i16, z_vicmphl, VMNLH, VMXLH>;
+defm : IntegerMinMaxVectorOps<v4i32, z_vicmphl, VMNLF, VMXLF>;
+defm : IntegerMinMaxVectorOps<v2i64, z_vicmphl, VMNLG, VMXLG>;
+
+//===----------------------------------------------------------------------===//
+// Integer comparison
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Element compare.
+  let Defs = [CC] in {
+    def VEC  : CompareVRRaGeneric<"vec", 0xE7DB>;
+    def VECB : CompareVRRa<"vecb", 0xE7DB, null_frag, v128b, 0>;
+    def VECH : CompareVRRa<"vech", 0xE7DB, null_frag, v128h, 1>;
+    def VECF : CompareVRRa<"vecf", 0xE7DB, null_frag, v128f, 2>;
+    def VECG : CompareVRRa<"vecg", 0xE7DB, null_frag, v128g, 3>;
+  }
+
+  // Element compare logical.
+  let Defs = [CC] in {
+    def VECL  : CompareVRRaGeneric<"vecl", 0xE7D9>;
+    def VECLB : CompareVRRa<"veclb", 0xE7D9, null_frag, v128b, 0>;
+    def VECLH : CompareVRRa<"veclh", 0xE7D9, null_frag, v128h, 1>;
+    def VECLF : CompareVRRa<"veclf", 0xE7D9, null_frag, v128f, 2>;
+    def VECLG : CompareVRRa<"veclg", 0xE7D9, null_frag, v128g, 3>;
+  }
+
+  // Compare equal.
+  def  VCEQ  : BinaryVRRbSPairGeneric<"vceq", 0xE7F8>;
+  defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, z_vicmpe, z_vicmpes,
+                               v128b, v128b, 0>;
+  defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, z_vicmpe, z_vicmpes,
+                               v128h, v128h, 1>;
+  defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, z_vicmpe, z_vicmpes,
+                               v128f, v128f, 2>;
+  defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, z_vicmpes,
+                               v128g, v128g, 3>;
+
+  // Compare high.
+  def  VCH  : BinaryVRRbSPairGeneric<"vch", 0xE7FB>;
+  defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, z_vicmph, z_vicmphs,
+                              v128b, v128b, 0>;
+  defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, z_vicmph, z_vicmphs,
+                              v128h, v128h, 1>;
+  defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, z_vicmph, z_vicmphs,
+                              v128f, v128f, 2>;
+  defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, z_vicmphs,
+                              v128g, v128g, 3>;
+
+  // Compare high logical.
+  def  VCHL  : BinaryVRRbSPairGeneric<"vchl", 0xE7F9>;
+  defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, z_vicmphl, z_vicmphls,
+                               v128b, v128b, 0>;
+  defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, z_vicmphl, z_vicmphls,
+                               v128h, v128h, 1>;
+  defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, z_vicmphl, z_vicmphls,
+                               v128f, v128f, 2>;
+  defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, z_vicmphls,
+                               v128g, v128g, 3>;
+
+  // Test under mask.
+  let Defs = [CC] in
+    def VTM : CompareVRRa<"vtm", 0xE7D8, z_vtm, v128b, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+// See comments in SystemZInstrFP.td for the suppression flags and
+// rounding modes.
+multiclass VectorRounding<Instruction insn, TypedReg tr> {
+  def : FPConversion<insn, frint,      tr, tr, 0, 0>;
+  def : FPConversion<insn, fnearbyint, tr, tr, 4, 0>;
+  def : FPConversion<insn, ffloor,     tr, tr, 4, 7>;
+  def : FPConversion<insn, fceil,      tr, tr, 4, 6>;
+  def : FPConversion<insn, ftrunc,     tr, tr, 4, 5>;
+  def : FPConversion<insn, fround,     tr, tr, 4, 1>;
+}
+
+let Predicates = [FeatureVector] in {
+  // Add.
+  def VFA   : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+  def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
+  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
+
+  // Convert from fixed 64-bit.
+  def VCDG  : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
+  def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>;
+  def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>;
+  def : FPConversion<VCDGB, sint_to_fp, v128db, v128g, 0, 0>;
+
+  // Convert from logical 64-bit.
+  def VCDLG  : TernaryVRRaFloatGeneric<"vcdlg", 0xE7C1>;
+  def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>;
+  def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>;
+  def : FPConversion<VCDLGB, uint_to_fp, v128db, v128g, 0, 0>;
+
+  // Convert to fixed 64-bit.
+  def VCGD  : TernaryVRRaFloatGeneric<"vcgd", 0xE7C2>;
+  def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>;
+  def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>;
+  // Rounding mode should agree with SystemZInstrFP.td.
+  def : FPConversion<VCGDB, fp_to_sint, v128g, v128db, 0, 5>;
+
+  // Convert to logical 64-bit.
+  def VCLGD  : TernaryVRRaFloatGeneric<"vclgd", 0xE7C0>;
+  def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>;
+  def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>;
+  // Rounding mode should agree with SystemZInstrFP.td.
+  def : FPConversion<VCLGDB, fp_to_uint, v128g, v128db, 0, 5>;
+
+  // Divide.
+  def VFD   : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
+  def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
+  def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
+
+  // Load FP integer.
+  def VFI   : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
+  def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>;
+  def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
+  defm : VectorRounding<VFIDB, v128db>;
+  defm : VectorRounding<WFIDB, v64db>;
+
+  // Load lengthened.
+  def VLDE  : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
+  def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
+  def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>;
+
+  // Load rounded,
+  def VLED  : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
+  def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
+  def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+  def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
+  def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>;
+
+  // Multiply.
+  def VFM   : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+  def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
+  def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
+
+  // Multiply and add.
+  def VFMA   : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+  def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
+  def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
+
+  // Multiply and subtract.
+  def VFMS   : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
+  def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
+  def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
+
+  // Perform sign operation.
+  def VFPSO   : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>;
+  def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>;
+  def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>;
+
+  // Load complement.
+  def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
+  def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
+
+  // Load negative.
+  def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
+  def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
+
+  // Load positive.
+  def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
+  def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
+
+  // Square root.
+  def VFSQ   : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
+  def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
+  def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
+
+  // Subtract.
+  def VFS   : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+  def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
+  def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
+
+  // Test data class immediate.
+  let Defs = [CC] in {
+    def VFTCI   : BinaryVRIeFloatGeneric<"vftci", 0xE74A>;
+    def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>;
+    def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  // Compare scalar.
+  let Defs = [CC] in {
+    def WFC   : CompareVRRaFloatGeneric<"wfc", 0xE7CB>;
+    def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
+  }
+
+  // Compare and signal scalar.
+  let Defs = [CC] in {
+    def WFK   : CompareVRRaFloatGeneric<"wfk", 0xE7CA>;
+    def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>;
+  }
+
+  // Compare equal.
+  def  VFCE   : BinaryVRRcSPairFloatGeneric<"vfce", 0xE7E8>;
+  defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes,
+                                v128g, v128db, 3, 0>;
+  defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag,
+                                v64g, v64db, 3, 8>;
+
+  // Compare high.
+  def  VFCH   : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>;
+  defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs,
+                                v128g, v128db, 3, 0>;
+  defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag,
+                                v64g, v64db, 3, 8>;
+
+  // Compare high or equal.
+  def  VFCHE   : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>;
+  defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes,
+                                 v128g, v128db, 3, 0>;
+  defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
+                                 v64g, v64db, 3, 8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Conversions
+//===----------------------------------------------------------------------===//
+
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Replicating scalars
+//===----------------------------------------------------------------------===//
+
+// Define patterns for replicating a scalar GR32 into a vector of type TYPE.
+// INDEX is 8 minus the element size in bytes.
+class VectorReplicateScalar<ValueType type, Instruction insn, bits<16> index>
+  : Pat<(type (z_replicate GR32:$scalar)),
+        (insn (VLVGP32 GR32:$scalar, GR32:$scalar), index)>;
+
+def : VectorReplicateScalar<v16i8, VREPB, 7>;
+def : VectorReplicateScalar<v8i16, VREPH, 3>;
+def : VectorReplicateScalar<v4i32, VREPF, 1>;
+
+// i64 replications are just a single isntruction.
+def : Pat<(v2i64 (z_replicate GR64:$scalar)),
+          (VLVGP GR64:$scalar, GR64:$scalar)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+// Moving 32-bit values between GPRs and FPRs can be done using VLVGF
+// and VLGVF.
+let Predicates = [FeatureVector] in {
+  def LEFR : UnaryAliasVRS<VR32, GR32>;
+  def LFER : UnaryAliasVRS<GR64, VR32>;
+  def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>;
+  def : Pat<(i32 (bitconvert (f32 VR32:$src))),
+            (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>;
+}
+
+// Floating-point values are stored in element 0 of the corresponding
+// vector register.  Scalar to vector conversion is just a subreg and
+// scalar replication can just replicate element 0 of the vector register.
+multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
+                            SubRegIndex subreg> {
+  def : Pat<(vt (scalar_to_vector cls:$scalar)),
+            (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar, subreg)>;
+  def : Pat<(vt (z_replicate cls:$scalar)),
+            (vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
+                                 subreg), 0)>;
+}
+defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_r32>;
+defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_r64>;
+
+// Match v2f64 insertions.  The AddedComplexity counters the 3 added by
+// TableGen for the base register operand in VLVG-based integer insertions
+// and ensures that this version is strictly better.
+let AddedComplexity = 4 in {
+  def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 0),
+            (VPDI (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt,
+                                 subreg_r64), VR128:$vec, 1)>;
+  def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 1),
+            (VPDI VR128:$vec, (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt,
+                                             subreg_r64), 0)>;
+}
+
+// We extract floating-point element X by replicating (for elements other
+// than 0) and then taking a high subreg.  The AddedComplexity counters the
+// 3 added by TableGen for the base register operand in VLGV-based integer
+// extractions and ensures that this version is strictly better.
+let AddedComplexity = 4 in {
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
+            (EXTRACT_SUBREG VR128:$vec, subreg_r32)>;
+  def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
+            (EXTRACT_SUBREG (VREPF VR128:$vec, imm32zx2:$index), subreg_r32)>;
+
+  def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), 0)),
+            (EXTRACT_SUBREG VR128:$vec, subreg_r64)>;
+  def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), imm32zx1:$index)),
+            (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>;
+}
+
+//===----------------------------------------------------------------------===//
+// String instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVector] in {
+  defm VFAE  : TernaryOptVRRbSPairGeneric<"vfae", 0xE782>;
+  defm VFAEB : TernaryOptVRRbSPair<"vfaeb", 0xE782, int_s390_vfaeb,
+                                   z_vfae_cc, v128b, v128b, 0>;
+  defm VFAEH : TernaryOptVRRbSPair<"vfaeh", 0xE782, int_s390_vfaeh,
+                                   z_vfae_cc, v128h, v128h, 1>;
+  defm VFAEF : TernaryOptVRRbSPair<"vfaef", 0xE782, int_s390_vfaef,
+                                   z_vfae_cc, v128f, v128f, 2>;
+  defm VFAEZB : TernaryOptVRRbSPair<"vfaezb", 0xE782, int_s390_vfaezb,
+                                    z_vfaez_cc, v128b, v128b, 0, 2>;
+  defm VFAEZH : TernaryOptVRRbSPair<"vfaezh", 0xE782, int_s390_vfaezh,
+                                    z_vfaez_cc, v128h, v128h, 1, 2>;
+  defm VFAEZF : TernaryOptVRRbSPair<"vfaezf", 0xE782, int_s390_vfaezf,
+                                    z_vfaez_cc, v128f, v128f, 2, 2>;
+
+  defm VFEE  : BinaryExtraVRRbSPairGeneric<"vfee", 0xE780>;
+  defm VFEEB : BinaryExtraVRRbSPair<"vfeeb", 0xE780, int_s390_vfeeb,
+                                    z_vfee_cc, v128b, v128b, 0>;
+  defm VFEEH : BinaryExtraVRRbSPair<"vfeeh", 0xE780, int_s390_vfeeh,
+                                    z_vfee_cc, v128h, v128h, 1>;
+  defm VFEEF : BinaryExtraVRRbSPair<"vfeef", 0xE780, int_s390_vfeef,
+                                    z_vfee_cc, v128f, v128f, 2>;
+  defm VFEEZB : BinaryVRRbSPair<"vfeezb", 0xE780, int_s390_vfeezb,
+                                z_vfeez_cc, v128b, v128b, 0, 2>;
+  defm VFEEZH : BinaryVRRbSPair<"vfeezh", 0xE780, int_s390_vfeezh,
+                                z_vfeez_cc, v128h, v128h, 1, 2>;
+  defm VFEEZF : BinaryVRRbSPair<"vfeezf", 0xE780, int_s390_vfeezf,
+                                z_vfeez_cc, v128f, v128f, 2, 2>;
+
+  defm VFENE  : BinaryExtraVRRbSPairGeneric<"vfene", 0xE781>;
+  defm VFENEB : BinaryExtraVRRbSPair<"vfeneb", 0xE781, int_s390_vfeneb,
+                                     z_vfene_cc, v128b, v128b, 0>;
+  defm VFENEH : BinaryExtraVRRbSPair<"vfeneh", 0xE781, int_s390_vfeneh,
+                                     z_vfene_cc, v128h, v128h, 1>;
+  defm VFENEF : BinaryExtraVRRbSPair<"vfenef", 0xE781, int_s390_vfenef,
+                                     z_vfene_cc, v128f, v128f, 2>;
+  defm VFENEZB : BinaryVRRbSPair<"vfenezb", 0xE781, int_s390_vfenezb,
+                                 z_vfenez_cc, v128b, v128b, 0, 2>;
+  defm VFENEZH : BinaryVRRbSPair<"vfenezh", 0xE781, int_s390_vfenezh,
+                                 z_vfenez_cc, v128h, v128h, 1, 2>;
+  defm VFENEZF : BinaryVRRbSPair<"vfenezf", 0xE781, int_s390_vfenezf,
+                                 z_vfenez_cc, v128f, v128f, 2, 2>;
+
+  defm VISTR  : UnaryExtraVRRaSPairGeneric<"vistr", 0xE75C>;
+  defm VISTRB : UnaryExtraVRRaSPair<"vistrb", 0xE75C, int_s390_vistrb,
+                                    z_vistr_cc, v128b, v128b, 0>;
+  defm VISTRH : UnaryExtraVRRaSPair<"vistrh", 0xE75C, int_s390_vistrh,
+                                    z_vistr_cc, v128h, v128h, 1>;
+  defm VISTRF : UnaryExtraVRRaSPair<"vistrf", 0xE75C, int_s390_vistrf,
+                                    z_vistr_cc, v128f, v128f, 2>;
+
+  defm VSTRC  : QuaternaryOptVRRdSPairGeneric<"vstrc", 0xE78A>;
+  defm VSTRCB : QuaternaryOptVRRdSPair<"vstrcb", 0xE78A, int_s390_vstrcb,
+                                       z_vstrc_cc, v128b, v128b, 0>;
+  defm VSTRCH : QuaternaryOptVRRdSPair<"vstrch", 0xE78A, int_s390_vstrch,
+                                       z_vstrc_cc, v128h, v128h, 1>;
+  defm VSTRCF : QuaternaryOptVRRdSPair<"vstrcf", 0xE78A, int_s390_vstrcf,
+                                       z_vstrc_cc, v128f, v128f, 2>;
+  defm VSTRCZB : QuaternaryOptVRRdSPair<"vstrczb", 0xE78A, int_s390_vstrczb,
+                                        z_vstrcz_cc, v128b, v128b, 0, 2>;
+  defm VSTRCZH : QuaternaryOptVRRdSPair<"vstrczh", 0xE78A, int_s390_vstrczh,
+                                        z_vstrcz_cc, v128h, v128h, 1, 2>;
+  defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf,
+                                        z_vstrcz_cc, v128f, v128f, 2, 2>;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
new file mode 100644
index 000000000000..ec8ce6e911fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -0,0 +1,146 @@
+//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines multiple accesses to local-dynamic TLS variables so that
+// the TLS base address for the module is only fetched once per execution path
+// through the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZLDCleanup : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZLDCleanup(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {}
+
+  StringRef getPassName() const override {
+    return "SystemZ Local Dynamic TLS Access Clean-up";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg);
+  MachineInstr *ReplaceTLSCall(MachineInstr *I, unsigned TLSBaseAddrReg);
+  MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg);
+
+  const SystemZInstrInfo *TII;
+  MachineFunction *MF;
+};
+
+char SystemZLDCleanup::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
+  return new SystemZLDCleanup(TM);
+}
+
+void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  MF = &F;
+
+  SystemZMachineFunctionInfo* MFI = F.getInfo<SystemZMachineFunctionInfo>();
+  if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+    // No point folding accesses if there isn't at least two.
+    return false;
+  }
+
+  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+  return VisitNode(DT->getRootNode(), 0);
+}
+
+// Visit the dominator subtree rooted at Node in pre-order.
+// If TLSBaseAddrReg is non-null, then use that to replace any
+// TLS_LDCALL instructions. Otherwise, create the register
+// when the first such instruction is seen, and then use it
+// as we encounter more instructions.
+bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
+                                 unsigned TLSBaseAddrReg) {
+  MachineBasicBlock *BB = Node->getBlock();
+  bool Changed = false;
+
+  // Traverse the current block.
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+      case SystemZ::TLS_LDCALL:
+        if (TLSBaseAddrReg)
+          I = ReplaceTLSCall(&*I, TLSBaseAddrReg);
+        else
+          I = SetRegister(&*I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  // Visit the children of this block in the dominator tree.
+  for (auto I = Node->begin(), E = Node->end(); I != E; ++I)
+    Changed |= VisitNode(*I, TLSBaseAddrReg);
+
+  return Changed;
+}
+
+// Replace the TLS_LDCALL instruction I with a copy from TLSBaseAddrReg,
+// returning the new instruction.
+MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
+                                               unsigned TLSBaseAddrReg) {
+  // Insert a Copy from TLSBaseAddrReg to R2.
+  MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), SystemZ::R2D)
+                               .addReg(TLSBaseAddrReg);
+
+  // Erase the TLS_LDCALL instruction.
+  I->eraseFromParent();
+
+  return Copy;
+}
+
+// Create a virtal register in *TLSBaseAddrReg, and populate it by
+// inserting a copy instruction after I. Returns the new instruction.
+MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
+                                            unsigned *TLSBaseAddrReg) {
+  // Create a virtual register for the TLS base address.
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  *TLSBaseAddrReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
+
+  // Insert a copy from R2 to TLSBaseAddrReg.
+  MachineInstr *Next = I->getNextNode();
+  MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                               TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+                               .addReg(SystemZ::R2D);
+
+  return Copy;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
new file mode 100644
index 000000000000..14ff6afbd4ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -0,0 +1,465 @@
+//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that all branches are in range.  There are several ways
+// in which this could be done.  One aggressive approach is to assume that all
+// branches are in range and successively replace those that turn out not
+// to be in range with a longer form (branch relaxation).  A simple
+// implementation is to continually walk through the function relaxing
+// branches until no more changes are needed and a fixed point is reached.
+// However, in the pathological worst case, this implementation is
+// quadratic in the number of blocks; relaxing branch N can make branch N-1
+// go out of range, which in turn can make branch N-2 go out of range,
+// and so on.
+//
+// An alternative approach is to assume that all branches must be
+// converted to their long forms, then reinstate the short forms of
+// branches that, even under this pessimistic assumption, turn out to be
+// in range (branch shortening).  This too can be implemented as a function
+// walk that is repeated until a fixed point is reached.  In general,
+// the result of shortening is not as good as that of relaxation, and
+// shortening is also quadratic in the worst case; shortening branch N
+// can bring branch N-1 in range of the short form, which in turn can do
+// the same for branch N-2, and so on.  The main advantage of shortening
+// is that each walk through the function produces valid code, so it is
+// possible to stop at any point after the first walk.  The quadraticness
+// could therefore be handled with a maximum pass count, although the
+// question then becomes: what maximum count should be used?
+//
+// On SystemZ, long branches are only needed for functions bigger than 64k,
+// which are relatively rare to begin with, and the long branch sequences
+// are actually relatively cheap.  It therefore doesn't seem worth spending
+// much compilation time on the problem.  Instead, the approach we take is:
+//
+// (1) Work out the address that each block would have if no branches
+//     need relaxing.  Exit the pass early if all branches are in range
+//     according to this assumption.
+//
+// (2) Work out the address that each block would have if all branches
+//     need relaxing.
+//
+// (3) Walk through the block calculating the final address of each instruction
+//     and relaxing those that need to be relaxed.  For backward branches,
+//     this check uses the final address of the target block, as calculated
+//     earlier in the walk.  For forward branches, this check uses the
+//     address of the target block that was calculated in (2).  Both checks
+//     give a conservatively-correct range.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-long-branch"
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+namespace {
+// Represents positional information about a basic block.
+struct MBBInfo {
+  // The address that we currently assume the block has.
+  uint64_t Address;
+
+  // The size of the block in bytes, excluding terminators.
+  // This value never changes.
+  uint64_t Size;
+
+  // The minimum alignment of the block, as a log2 value.
+  // This value never changes.
+  unsigned Alignment;
+
+  // The number of terminators in this block.  This value never changes.
+  unsigned NumTerminators;
+
+  MBBInfo()
+    : Address(0), Size(0), Alignment(0), NumTerminators(0) {} 
+};
+
+// Represents the state of a block terminator.
+struct TerminatorInfo {
+  // If this terminator is a relaxable branch, this points to the branch
+  // instruction, otherwise it is null.
+  MachineInstr *Branch;
+
+  // The address that we currently assume the terminator has.
+  uint64_t Address;
+
+  // The current size of the terminator in bytes.
+  uint64_t Size;
+
+  // If Branch is nonnull, this is the number of the target block,
+  // otherwise it is unused.
+  unsigned TargetBlock;
+
+  // If Branch is nonnull, this is the length of the longest relaxed form,
+  // otherwise it is zero.
+  unsigned ExtraRelaxSize;
+
+  TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0),
+                     ExtraRelaxSize(0) {}
+};
+
+// Used to keep track of the current position while iterating over the blocks.
+struct BlockPosition {
+  // The address that we assume this position has.
+  uint64_t Address;
+
+  // The number of low bits in Address that are known to be the same
+  // as the runtime address.
+  unsigned KnownBits;
+
+  BlockPosition(unsigned InitialAlignment)
+    : Address(0), KnownBits(InitialAlignment) {}
+};
+
+class SystemZLongBranch : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZLongBranch(const SystemZTargetMachine &tm)
+    : MachineFunctionPass(ID), TII(nullptr) {}
+
+  StringRef getPassName() const override { return "SystemZ Long Branch"; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
+  void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+                      bool AssumeRelaxed);
+  TerminatorInfo describeTerminator(MachineInstr &MI);
+  uint64_t initMBBInfo();
+  bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
+  bool mustRelaxABranch();
+  void setWorstCaseAddresses();
+  void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
+  void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
+  void relaxBranch(TerminatorInfo &Terminator);
+  void relaxBranches();
+
+  const SystemZInstrInfo *TII;
+  MachineFunction *MF;
+  SmallVector<MBBInfo, 16> MBBs;
+  SmallVector<TerminatorInfo, 16> Terminators;
+};
+
+char SystemZLongBranch::ID = 0;
+
+const uint64_t MaxBackwardRange = 0x10000;
+const uint64_t MaxForwardRange = 0xfffe;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+  return new SystemZLongBranch(TM);
+}
+
+// Position describes the state immediately before Block.  Update Block
+// accordingly and move Position to the end of the block's non-terminator
+// instructions.
+void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
+                                           MBBInfo &Block) {
+  if (Block.Alignment > Position.KnownBits) {
+    // When calculating the address of Block, we need to conservatively
+    // assume that Block had the worst possible misalignment.
+    Position.Address += ((uint64_t(1) << Block.Alignment) -
+                         (uint64_t(1) << Position.KnownBits));
+    Position.KnownBits = Block.Alignment;
+  }
+
+  // Align the addresses.
+  uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
+  Position.Address = (Position.Address + AlignMask) & ~AlignMask;
+
+  // Record the block's position.
+  Block.Address = Position.Address;
+
+  // Move past the non-terminators in the block.
+  Position.Address += Block.Size;
+}
+
+// Position describes the state immediately before Terminator.
+// Update Terminator accordingly and move Position past it.
+// Assume that Terminator will be relaxed if AssumeRelaxed.
+void SystemZLongBranch::skipTerminator(BlockPosition &Position,
+                                       TerminatorInfo &Terminator,
+                                       bool AssumeRelaxed) {
+  Terminator.Address = Position.Address;
+  Position.Address += Terminator.Size;
+  if (AssumeRelaxed)
+    Position.Address += Terminator.ExtraRelaxSize;
+}
+
+// Return a description of terminator instruction MI.
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr &MI) {
+  TerminatorInfo Terminator;
+  Terminator.Size = TII->getInstSizeInBytes(MI);
+  if (MI.isConditionalBranch() || MI.isUnconditionalBranch()) {
+    switch (MI.getOpcode()) {
+    case SystemZ::J:
+      // Relaxes to JG, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRC:
+      // Relaxes to BRCL, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRCT:
+    case SystemZ::BRCTG:
+      // Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    case SystemZ::BRCTH:
+      // Never needs to be relaxed.
+      Terminator.ExtraRelaxSize = 0;
+      break;
+    case SystemZ::CRJ:
+    case SystemZ::CLRJ:
+      // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::CGRJ:
+    case SystemZ::CLGRJ:
+      // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CIJ:
+    case SystemZ::CGIJ:
+      // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CLIJ:
+    case SystemZ::CLGIJ:
+      // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    default:
+      llvm_unreachable("Unrecognized branch instruction");
+    }
+    Terminator.Branch = &MI;
+    Terminator.TargetBlock =
+      TII->getBranchInfo(MI).Target->getMBB()->getNumber();
+  }
+  return Terminator;
+}
+
+// Fill MBBs and Terminators, setting the addresses on the assumption
+// that no branches need relaxation.  Return the size of the function under
+// this assumption.
+uint64_t SystemZLongBranch::initMBBInfo() {
+  MF->RenumberBlocks();
+  unsigned NumBlocks = MF->size();
+
+  MBBs.clear();
+  MBBs.resize(NumBlocks);
+
+  Terminators.clear();
+  Terminators.reserve(NumBlocks);
+
+  BlockPosition Position(MF->getAlignment());
+  for (unsigned I = 0; I < NumBlocks; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+    MBBInfo &Block = MBBs[I];
+
+    // Record the alignment, for quick access.
+    Block.Alignment = MBB->getAlignment();
+
+    // Calculate the size of the fixed part of the block.
+    MachineBasicBlock::iterator MI = MBB->begin();
+    MachineBasicBlock::iterator End = MBB->end();
+    while (MI != End && !MI->isTerminator()) {
+      Block.Size += TII->getInstSizeInBytes(*MI);
+      ++MI;
+    }
+    skipNonTerminators(Position, Block);
+
+    // Add the terminators.
+    while (MI != End) {
+      if (!MI->isDebugValue()) {
+        assert(MI->isTerminator() && "Terminator followed by non-terminator");
+        Terminators.push_back(describeTerminator(*MI));
+        skipTerminator(Position, Terminators.back(), false);
+        ++Block.NumTerminators;
+      }
+      ++MI;
+    }
+  }
+
+  return Position.Address;
+}
+
+// Return true if, under current assumptions, Terminator would need to be
+// relaxed if it were placed at address Address.
+bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
+                                        uint64_t Address) {
+  if (!Terminator.Branch)
+    return false;
+
+  const MBBInfo &Target = MBBs[Terminator.TargetBlock];
+  if (Address >= Target.Address) {
+    if (Address - Target.Address <= MaxBackwardRange)
+      return false;
+  } else {
+    if (Target.Address - Address <= MaxForwardRange)
+      return false;
+  }
+
+  return true;
+}
+
+// Return true if, under current assumptions, any terminator needs
+// to be relaxed.
+bool SystemZLongBranch::mustRelaxABranch() {
+  for (auto &Terminator : Terminators)
+    if (mustRelaxBranch(Terminator, Terminator.Address))
+      return true;
+  return false;
+}
+
+// Set the address of each block on the assumption that all branches
+// must be long.
+void SystemZLongBranch::setWorstCaseAddresses() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (auto &Block : MBBs) {
+    skipNonTerminators(Position, Block);
+    for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+      skipTerminator(Position, *TI, true);
+      ++TI;
+    }
+  }
+}
+
+// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed
+// by a BRCL on the result.
+void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
+                                           unsigned AddOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1))
+    .addImm(-1);
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE)
+    .addOperand(MI->getOperand(2));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Split MI into the comparison given by CompareOpcode followed
+// a BRCL on the result.
+void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
+                                           unsigned CompareOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1));
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addOperand(MI->getOperand(2))
+    .addOperand(MI->getOperand(3));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Relax the branch described by Terminator.
+void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
+  MachineInstr *Branch = Terminator.Branch;
+  switch (Branch->getOpcode()) {
+  case SystemZ::J:
+    Branch->setDesc(TII->get(SystemZ::JG));
+    break;
+  case SystemZ::BRC:
+    Branch->setDesc(TII->get(SystemZ::BRCL));
+    break;
+  case SystemZ::BRCT:
+    splitBranchOnCount(Branch, SystemZ::AHI);
+    break;
+  case SystemZ::BRCTG:
+    splitBranchOnCount(Branch, SystemZ::AGHI);
+    break;
+  case SystemZ::CRJ:
+    splitCompareBranch(Branch, SystemZ::CR);
+    break;
+  case SystemZ::CGRJ:
+    splitCompareBranch(Branch, SystemZ::CGR);
+    break;
+  case SystemZ::CIJ:
+    splitCompareBranch(Branch, SystemZ::CHI);
+    break;
+  case SystemZ::CGIJ:
+    splitCompareBranch(Branch, SystemZ::CGHI);
+    break;
+  case SystemZ::CLRJ:
+    splitCompareBranch(Branch, SystemZ::CLR);
+    break;
+  case SystemZ::CLGRJ:
+    splitCompareBranch(Branch, SystemZ::CLGR);
+    break;
+  case SystemZ::CLIJ:
+    splitCompareBranch(Branch, SystemZ::CLFI);
+    break;
+  case SystemZ::CLGIJ:
+    splitCompareBranch(Branch, SystemZ::CLGFI);
+    break;
+  default:
+    llvm_unreachable("Unrecognized branch");
+  }
+
+  Terminator.Size += Terminator.ExtraRelaxSize;
+  Terminator.ExtraRelaxSize = 0;
+  Terminator.Branch = nullptr;
+
+  ++LongBranches;
+}
+
+// Run a shortening pass and relax any branches that need to be relaxed.
+void SystemZLongBranch::relaxBranches() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (auto &Block : MBBs) {
+    skipNonTerminators(Position, Block);
+    for (unsigned BTI = 0, BTE = Block.NumTerminators; BTI != BTE; ++BTI) {
+      assert(Position.Address <= TI->Address &&
+             "Addresses shouldn't go forwards");
+      if (mustRelaxBranch(*TI, Position.Address))
+        relaxBranch(*TI);
+      skipTerminator(Position, *TI, false);
+      ++TI;
+    }
+  }
+}
+
+bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+  MF = &F;
+  uint64_t Size = initMBBInfo();
+  if (Size <= MaxForwardRange || !mustRelaxABranch())
+    return false;
+
+  setWorstCaseAddresses();
+  relaxBranches();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
new file mode 100644
index 000000000000..2655e4866b20
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -0,0 +1,103 @@
+//===-- SystemZMCInstLower.cpp - Lower MachineInstr to MCInst -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCInstLower.h"
+#include "SystemZAsmPrinter.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// Return the VK_* enumeration for MachineOperand target flags Flags.
+static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
+  switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
+    case 0:
+      return MCSymbolRefExpr::VK_None;
+    case SystemZII::MO_GOT:
+      return MCSymbolRefExpr::VK_GOT;
+    case SystemZII::MO_INDNTPOFF:
+      return MCSymbolRefExpr::VK_INDNTPOFF;
+  }
+  llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
+}
+
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
+                                       SystemZAsmPrinter &asmprinter)
+  : Ctx(ctx), AsmPrinter(asmprinter) {}
+
+const MCExpr *
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+                            MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbol *Symbol;
+  bool HasOffset = true;
+  switch (MO.getType()) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  default:
+    llvm_unreachable("unknown operand type");
+  }
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, Ctx);
+  if (HasOffset)
+    if (int64_t Offset = MO.getOffset()) {
+      const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
+    }
+  return Expr;
+}
+
+MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    return MCOperand::createReg(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm());
+
+  default: {
+    MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+    return MCOperand::createExpr(getExpr(MO, Kind));
+  }
+  }
+}
+
+void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    // Ignore all implicit register operands.
+    if (!MO.isReg() || !MO.isImplicit())
+      OutMI.addOperand(lowerOperand(MO));
+  }
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
new file mode 100644
index 000000000000..7173cfa42959
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -0,0 +1,44 @@
+//===-- SystemZMCInstLower.h - Lower MachineInstr to MCInst ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCInst;
+class MCOperand;
+class MachineInstr;
+class MachineOperand;
+class Mangler;
+class SystemZAsmPrinter;
+
+class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
+  MCContext &Ctx;
+  SystemZAsmPrinter &AsmPrinter;
+
+public:
+  SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
+
+  // Lower MachineInstr MI to MCInst OutMI.
+  void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  // Return an MCOperand for MO.
+  MCOperand lowerOperand(const MachineOperand& MO) const;
+
+  // Return an MCExpr for symbolic operand MO with variant kind Kind.
+  const MCExpr *getExpr(const MachineOperand &MO,
+                        MCSymbolRefExpr::VariantKind Kind) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..1a7c0d7f687a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -0,0 +1,17 @@
+//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// pin vtable to this file
+void SystemZMachineFunctionInfo::anchor() {}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 000000000000..4f64f4c65f1d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,79 @@
+//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  unsigned LowSavedGPR;
+  unsigned HighSavedGPR;
+  unsigned VarArgsFirstGPR;
+  unsigned VarArgsFirstFPR;
+  unsigned VarArgsFrameIndex;
+  unsigned RegSaveFrameIndex;
+  int FramePointerSaveIndex;
+  bool ManipulatesSP;
+  unsigned NumLocalDynamics;
+
+public:
+  explicit SystemZMachineFunctionInfo(MachineFunction &MF)
+    : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
+      VarArgsFrameIndex(0), RegSaveFrameIndex(0), FramePointerSaveIndex(0),
+      ManipulatesSP(false), NumLocalDynamics(0) {}
+
+  // Get and set the first call-saved GPR that should be saved and restored
+  // by this function.  This is 0 if no GPRs need to be saved or restored.
+  unsigned getLowSavedGPR() const { return LowSavedGPR; }
+  void setLowSavedGPR(unsigned Reg) { LowSavedGPR = Reg; }
+
+  // Get and set the last call-saved GPR that should be saved and restored
+  // by this function.
+  unsigned getHighSavedGPR() const { return HighSavedGPR; }
+  void setHighSavedGPR(unsigned Reg) { HighSavedGPR = Reg; }
+
+  // Get and set the number of fixed (as opposed to variable) arguments
+  // that are passed in GPRs to this function.
+  unsigned getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
+  void setVarArgsFirstGPR(unsigned GPR) { VarArgsFirstGPR = GPR; }
+
+  // Likewise FPRs.
+  unsigned getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
+  void setVarArgsFirstFPR(unsigned FPR) { VarArgsFirstFPR = FPR; }
+
+  // Get and set the frame index of the first stack vararg.
+  unsigned getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(unsigned FI) { VarArgsFrameIndex = FI; }
+
+  // Get and set the frame index of the register save area
+  // (i.e. the incoming stack pointer).
+  unsigned getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+  void setRegSaveFrameIndex(unsigned FI) { RegSaveFrameIndex = FI; }
+
+  // Get and set the frame index of where the old frame pointer is stored.
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
+  // Get and set whether the function directly manipulates the stack pointer,
+  // e.g. through STACKSAVE or STACKRESTORE.
+  bool getManipulatesSP() const { return ManipulatesSP; }
+  void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
+
+  // Count number of local-dynamic TLS symbols used.
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
new file mode 100644
index 000000000000..ab6020f3f189
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -0,0 +1,153 @@
+//-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// -------------------------- Post RA scheduling ---------------------------- //
+// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
+// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
+// implementation that looks to optimize decoder grouping and balance the
+// usage of processor resources.
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineScheduler.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#ifndef NDEBUG
+// Print the set of SUs
+void SystemZPostRASchedStrategy::SUSet::
+dump(SystemZHazardRecognizer &HazardRec) {
+  dbgs() << "{";
+  for (auto &SU : *this) {
+    HazardRec.dumpSU(SU, dbgs());
+    if (SU != *rbegin())
+      dbgs() << ",  ";
+  }
+  dbgs() << "}\n";
+}
+#endif
+
+SystemZPostRASchedStrategy::
+SystemZPostRASchedStrategy(const MachineSchedContext *C)
+  : DAG(nullptr), HazardRec(C) {}
+
+void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+  DAG = dag;
+  HazardRec.setDAG(dag);
+  HazardRec.Reset();
+}
+
+// Pick the next node to schedule.
+SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
+  // Only scheduling top-down.
+  IsTopNode = true;
+
+  if (Available.empty())
+    return nullptr;
+
+  // If only one choice, return it.
+  if (Available.size() == 1) {
+    DEBUG (dbgs() << "+++ Only one: ";
+           HazardRec.dumpSU(*Available.begin(), dbgs()); dbgs() << "\n";);
+    return *Available.begin();
+  }
+
+  // All nodes that are possible to schedule are stored by in the
+  // Available set.
+  DEBUG(dbgs() << "+++ Available: "; Available.dump(HazardRec););
+
+  Candidate Best;
+  for (auto *SU : Available) {
+
+    // SU is the next candidate to be compared against current Best.
+    Candidate c(SU, HazardRec);
+
+    // Remeber which SU is the best candidate.
+    if (Best.SU == nullptr || c < Best) {
+      Best = c;
+      DEBUG(dbgs() << "+++ Best sofar: ";
+            HazardRec.dumpSU(Best.SU, dbgs());
+            if (Best.GroupingCost != 0)
+              dbgs() << "\tGrouping cost:" << Best.GroupingCost;
+            if (Best.ResourcesCost != 0)
+              dbgs() << " Resource cost:" << Best.ResourcesCost;
+            dbgs() << " Height:" << Best.SU->getHeight();
+            dbgs() << "\n";);
+    }
+
+    // Once we know we have seen all SUs that affect grouping or use unbuffered
+    // resources, we can stop iterating if Best looks good.
+    if (!SU->isScheduleHigh && Best.noCost())
+      break;
+  }
+
+  assert (Best.SU != nullptr);
+  return Best.SU;
+}
+
+SystemZPostRASchedStrategy::Candidate::
+Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec) : Candidate() {
+  SU = SU_;
+
+  // Check the grouping cost. For a node that must begin / end a
+  // group, it is positive if it would do so prematurely, or negative
+  // if it would fit naturally into the schedule.
+  GroupingCost = HazardRec.groupingCost(SU);
+
+    // Check the resources cost for this SU.
+  ResourcesCost = HazardRec.resourcesCost(SU);
+}
+
+bool SystemZPostRASchedStrategy::Candidate::
+operator<(const Candidate &other) {
+
+  // Check decoder grouping.
+  if (GroupingCost < other.GroupingCost)
+    return true;
+  if (GroupingCost > other.GroupingCost)
+    return false;
+
+  // Compare the use of resources.
+  if (ResourcesCost < other.ResourcesCost)
+    return true;
+  if (ResourcesCost > other.ResourcesCost)
+    return false;
+
+  // Higher SU is otherwise generally better.
+  if (SU->getHeight() > other.SU->getHeight())
+    return true;
+  if (SU->getHeight() < other.SU->getHeight())
+    return false;
+
+  // If all same, fall back to original order.
+  if (SU->NodeNum < other.SU->NodeNum)
+    return true;
+
+  return false;
+}
+
+void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  DEBUG(dbgs() << "+++ Scheduling SU(" << SU->NodeNum << ")\n";);
+
+  // Remove SU from Available set and update HazardRec.
+  Available.erase(SU);
+  HazardRec.EmitInstruction(SU);
+}
+
+void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) {
+  // Set isScheduleHigh flag on all SUs that we want to consider first in
+  // pickNode().
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  bool AffectsGrouping = (SC->isValid() && (SC->BeginGroup || SC->EndGroup));
+  SU->isScheduleHigh = (AffectsGrouping || SU->isUnbuffered);
+
+  // Put all released SUs in the Available set.
+  Available.insert(SU);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
new file mode 100644
index 000000000000..b919758b70e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -0,0 +1,112 @@
+//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// -------------------------- Post RA scheduling ---------------------------- //
+// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
+// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
+// implementation that looks to optimize decoder grouping and balance the
+// usage of processor resources.
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZHazardRecognizer.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
+
+using namespace llvm;
+
+namespace llvm {
+  
+/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
+class SystemZPostRASchedStrategy : public MachineSchedStrategy {
+    ScheduleDAGMI *DAG;
+  
+  /// A candidate during instruction evaluation.
+  struct Candidate {
+    SUnit *SU;
+
+    /// The decoding cost.
+    int GroupingCost;
+
+    /// The processor resources cost.
+    int ResourcesCost;
+
+    Candidate() : SU(nullptr), GroupingCost(0), ResourcesCost(0) {}
+    Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec);
+
+    // Compare two candidates.
+    bool operator<(const Candidate &other);
+
+    // Check if this node is free of cost ("as good as any").
+    bool inline noCost() {
+      return (GroupingCost <= 0 && !ResourcesCost);
+    }
+   };
+
+  // A sorter for the Available set that makes sure that SUs are considered
+  // in the best order.
+  struct SUSorter {
+    bool operator() (SUnit *lhs, SUnit *rhs) const {
+      if (lhs->isScheduleHigh && !rhs->isScheduleHigh)
+        return true;
+      if (!lhs->isScheduleHigh && rhs->isScheduleHigh)
+        return false;
+
+      if (lhs->getHeight() > rhs->getHeight())
+        return true;
+      else if (lhs->getHeight() < rhs->getHeight())
+        return false;
+
+      return (lhs->NodeNum < rhs->NodeNum);
+    }
+  };
+  // A set of SUs with a sorter and dump method.
+  struct SUSet : std::set<SUnit*, SUSorter> {
+    #ifndef NDEBUG
+    void dump(SystemZHazardRecognizer &HazardRec);
+    #endif
+  };
+
+  /// The set of available SUs to schedule next.
+  SUSet Available;
+
+  // HazardRecognizer that tracks the scheduler state for the current
+  // region.
+  SystemZHazardRecognizer HazardRec;
+  
+ public:
+  SystemZPostRASchedStrategy(const MachineSchedContext *C);
+
+  /// PostRA scheduling does not track pressure.
+  bool shouldTrackPressure() const override { return false; }
+
+  /// Initialize the strategy after building the DAG for a new region.
+  void initialize(ScheduleDAGMI *dag) override;
+
+  /// Pick the next node to schedule, or return NULL.
+  SUnit *pickNode(bool &IsTopNode) override;
+
+  /// ScheduleDAGMI has scheduled an instruction - tell HazardRec
+  /// about it.
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+
+  /// SU has had all predecessor dependencies resolved. Put it into
+  /// Available.
+  void releaseTopNode(SUnit *SU) override;
+
+  /// Currently only scheduling top-down, so this method is empty.
+  void releaseBottomNode(SUnit *SU) override {};
+};
+
+} // namespace llvm
+
+#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 000000000000..7bb4fe5afb3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,593 @@
+//===-- SystemZOperands.td - SystemZ instruction operands ----*- tblgen-*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions
+//===----------------------------------------------------------------------===//
+
+class ImmediateAsmOperand<string name>
+  : AsmOperandClass {
+  let Name = name;
+  let RenderMethod = "addImmOperands";
+}
+class ImmediateTLSAsmOperand<string name>
+  : AsmOperandClass {
+  let Name = name;
+  let RenderMethod = "addImmTLSOperands";
+}
+
+// Constructs both a DAG pattern and instruction operand for an immediate
+// of type VT.  PRED returns true if a node is acceptable and XFORM returns
+// the operand value associated with the node.  ASMOP is the name of the
+// associated asm operand, and also forms the basis of the asm print method.
+class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
+  : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
+  let PrintMethod = "print"##asmop##"Operand";
+  let DecoderMethod = "decode"##asmop##"Operand";
+  let ParserMatchClass = !cast<AsmOperandClass>(asmop);
+}
+
+// Constructs an asm operand for a PC-relative address.  SIZE says how
+// many bits there are.
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+  let PredicateMethod = "isImm";
+  let ParserMethod = "parsePCRel"##size;
+}
+class PCRelTLSAsmOperand<string size>
+  : ImmediateTLSAsmOperand<"PCRelTLS"##size> {
+  let PredicateMethod = "isImmTLS";
+  let ParserMethod = "parsePCRelTLS"##size;
+}
+
+// Constructs an operand for a PC-relative address with address type VT.
+// ASMOP is the associated asm operand.
+class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+  let PrintMethod = "printPCRelOperand";
+  let ParserMatchClass = asmop;
+}
+class PCRelTLSOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+  let PrintMethod = "printPCRelTLSOperand";
+  let ParserMatchClass = asmop;
+}
+
+// Constructs both a DAG pattern and instruction operand for a PC-relative
+// address with address size VT.  SELF is the name of the operand and
+// ASMOP is the associated asm operand.
+class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
+  : ComplexPattern<vt, 1, "selectPCRelAddress",
+                   [z_pcrel_wrapper, z_pcrel_offset]>,
+    PCRelOperand<vt, asmop> {
+  let MIOperandInfo = (ops !cast<Operand>(self));
+}
+
+// Constructs an AsmOperandClass for addressing mode FORMAT, treating the
+// registers as having BITSIZE bits and displacements as having DISPSIZE bits.
+// LENGTH is "LenN" for addresses with an N-bit length field, otherwise it
+// is "".
+class AddressAsmOperand<string format, string bitsize, string dispsize,
+                        string length = "">
+  : AsmOperandClass {
+  let Name = format##bitsize##"Disp"##dispsize##length;
+  let ParserMethod = "parse"##format##bitsize;
+  let RenderMethod = "add"##format##"Operands";
+}
+
+// Constructs an instruction operand for an addressing mode.  FORMAT,
+// BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand.  OPERANDS is a list of individual operands
+// (base register, displacement, etc.).
+class AddressOperand<string bitsize, string dispsize, string length,
+                     string format, dag operands>
+  : Operand<!cast<ValueType>("i"##bitsize)> {
+  let PrintMethod = "print"##format##"Operand";
+  let EncoderMethod = "get"##format##dispsize##length##"Encoding";
+  let DecoderMethod =
+    "decode"##format##bitsize##"Disp"##dispsize##length##"Operand";
+  let MIOperandInfo = operands;
+  let ParserMatchClass =
+    !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize##length);
+}
+
+// Constructs both a DAG pattern and instruction operand for an addressing mode.
+// FORMAT, BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand.  OPERANDS is a list of NUMOPS individual operands
+// (base register, displacement, etc.).  SELTYPE is the type of the memory
+// operand for selection purposes; sometimes we want different selection
+// choices for the same underlying addressing mode.  SUFFIX is similarly
+// a suffix appended to the displacement for selection purposes;
+// e.g. we want to reject small 20-bit displacements if a 12-bit form
+// also exists, but we want to accept them otherwise.
+class AddressingMode<string seltype, string bitsize, string dispsize,
+                     string suffix, string length, int numops, string format,
+                     dag operands>
+  : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
+                   "select"##seltype##dispsize##suffix##length,
+                   [add, sub, or, frameindex, z_adjdynalloc]>,
+    AddressOperand<bitsize, dispsize, length, format, operands>;
+
+// An addressing mode with a base and displacement but no index.
+class BDMode<string type, string bitsize, string dispsize, string suffix>
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
+
+// An addressing mode with a base, displacement and index.
+class BDXMode<string type, string bitsize, string dispsize, string suffix>
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<RegisterOperand>("ADDR"##bitsize))>;
+
+// A BDMode paired with an immediate length operand of LENSIZE bits.
+class BDLMode<string type, string bitsize, string dispsize, string suffix,
+              string lensize>
+  : AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
+                   "BDLAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<Immediate>("imm"##bitsize))>;
+
+// A BDMode paired with a register length operand.
+class BDRMode<string type, string bitsize, string dispsize, string suffix>
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDRAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<RegisterOperand>("GR"##bitsize))>;
+
+// An addressing mode with a base, displacement and a vector index.
+class BDVMode<string bitsize, string dispsize>
+  : AddressOperand<bitsize, dispsize, "", "BDVAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<RegisterOperand>("VR128"))>;
+
+//===----------------------------------------------------------------------===//
+// Extracting immediate operands from nodes
+// These all create MVT::i64 nodes to ensure the value is not sign-extended
+// when converted from an SDNode to a MachineOperand later on.
+//===----------------------------------------------------------------------===//
+
+// Bits 0-15 (counting from the lsb).
+def LL16 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() & 0x000000000000FFFFULL;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 16-31 (counting from the lsb).
+def LH16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 32-47 (counting from the lsb).
+def HL16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Bits 48-63 (counting from the lsb).
+def HH16 : SDNodeXForm<imm, [{
+  uint64_t Value = (N->getZExtValue() & 0xFFFF000000000000ULL) >> 48;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Low 32 bits.
+def LF32 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() & 0x00000000FFFFFFFFULL;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// High 32 bits.
+def HF32 : SDNodeXForm<imm, [{
+  uint64_t Value = N->getZExtValue() >> 32;
+  return CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit signed quantity.
+def SIMM8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int8_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit unsigned quantity.
+def UIMM8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 8-bit unsigned quantity and mask off low bit.
+def UIMM8EVEN : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0xfe, SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 12-bit unsigned quantity.
+def UIMM12 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0xfff, SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit signed quantity.
+def SIMM16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int16_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 16-bit unsigned quantity.
+def UIMM16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint16_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit signed quantity.
+def SIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(int32_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 32-bit unsigned quantity.
+def UIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint32_t(N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+// Truncate an immediate to a 48-bit unsigned quantity.
+def UIMM48 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint64_t(N->getZExtValue()) & 0xffffffffffff,
+                                   SDLoc(N), MVT::i64);
+}]>;
+
+// Negate and then truncate an immediate to a 32-bit unsigned quantity.
+def NEGIMM32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(uint32_t(-N->getZExtValue()), SDLoc(N),
+                                   MVT::i64);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Immediate asm operands.
+//===----------------------------------------------------------------------===//
+
+def U1Imm  : ImmediateAsmOperand<"U1Imm">;
+def U2Imm  : ImmediateAsmOperand<"U2Imm">;
+def U3Imm  : ImmediateAsmOperand<"U3Imm">;
+def U4Imm  : ImmediateAsmOperand<"U4Imm">;
+def U6Imm  : ImmediateAsmOperand<"U6Imm">;
+def S8Imm  : ImmediateAsmOperand<"S8Imm">;
+def U8Imm  : ImmediateAsmOperand<"U8Imm">;
+def U12Imm : ImmediateAsmOperand<"U12Imm">;
+def S16Imm : ImmediateAsmOperand<"S16Imm">;
+def U16Imm : ImmediateAsmOperand<"U16Imm">;
+def S32Imm : ImmediateAsmOperand<"S32Imm">;
+def U32Imm : ImmediateAsmOperand<"U32Imm">;
+def U48Imm : ImmediateAsmOperand<"U48Imm">;
+
+//===----------------------------------------------------------------------===//
+// i32 immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being zero.
+def imm32ll16 : Immediate<i32, [{
+  return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm32lh16 : Immediate<i32, [{
+  return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+// Immediates for the lower and upper 16 bits of an i32, with the other
+// bits of the i32 being one.
+def imm32ll16c : Immediate<i32, [{
+  return SystemZ::isImmLL(uint32_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm32lh16c : Immediate<i32, [{
+  return SystemZ::isImmLH(uint32_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+// Short immediates
+def imm32zx1 : Immediate<i32, [{
+  return isUInt<1>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U1Imm">;
+
+def imm32zx2 : Immediate<i32, [{
+  return isUInt<2>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U2Imm">;
+
+def imm32zx3 : Immediate<i32, [{
+  return isUInt<3>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U3Imm">;
+
+def imm32zx4 : Immediate<i32, [{
+  return isUInt<4>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U4Imm">;
+
+// Note: this enforces an even value during code generation only.
+// When used from the assembler, any 4-bit value is allowed.
+def imm32zx4even : Immediate<i32, [{
+  return isUInt<4>(N->getZExtValue());
+}], UIMM8EVEN, "U4Imm">;
+
+def imm32zx6 : Immediate<i32, [{
+  return isUInt<6>(N->getZExtValue());
+}], NOOP_SDNodeXForm, "U6Imm">;
+
+def imm32sx8 : Immediate<i32, [{
+  return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm32zx8 : Immediate<i32, [{
+  return isUInt<8>(N->getZExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">;
+
+def imm32zx12 : Immediate<i32, [{
+  return isUInt<12>(N->getZExtValue());
+}], UIMM12, "U12Imm">;
+
+def imm32sx16 : Immediate<i32, [{
+  return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm32zx16 : Immediate<i32, [{
+  return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+
+// Full 32-bit immediates.  we need both signed and unsigned versions
+// because the assembler is picky.  E.g. AFI requires signed operands
+// while NILF requires unsigned ones.
+def simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">;
+def uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">;
+
+def imm32 : ImmLeaf<i32, [{}]>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit immediates
+//===----------------------------------------------------------------------===//
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i32 being zero.
+def imm64ll16 : Immediate<i64, [{
+  return SystemZ::isImmLL(N->getZExtValue());
+}], LL16, "U16Imm">;
+
+def imm64lh16 : Immediate<i64, [{
+  return SystemZ::isImmLH(N->getZExtValue());
+}], LH16, "U16Imm">;
+
+def imm64hl16 : Immediate<i64, [{
+  return SystemZ::isImmHL(N->getZExtValue());
+}], HL16, "U16Imm">;
+
+def imm64hh16 : Immediate<i64, [{
+  return SystemZ::isImmHH(N->getZExtValue());
+}], HH16, "U16Imm">;
+
+// Immediates for 16-bit chunks of an i64, with the other bits of the
+// i32 being one.
+def imm64ll16c : Immediate<i64, [{
+  return SystemZ::isImmLL(uint64_t(~N->getZExtValue()));
+}], LL16, "U16Imm">;
+
+def imm64lh16c : Immediate<i64, [{
+  return SystemZ::isImmLH(uint64_t(~N->getZExtValue()));
+}], LH16, "U16Imm">;
+
+def imm64hl16c : Immediate<i64, [{
+  return SystemZ::isImmHL(uint64_t(~N->getZExtValue()));
+}], HL16, "U16Imm">;
+
+def imm64hh16c : Immediate<i64, [{
+  return SystemZ::isImmHH(uint64_t(~N->getZExtValue()));
+}], HH16, "U16Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i32 being zero.
+def imm64lf32 : Immediate<i64, [{
+  return SystemZ::isImmLF(N->getZExtValue());
+}], LF32, "U32Imm">;
+
+def imm64hf32 : Immediate<i64, [{
+  return SystemZ::isImmHF(N->getZExtValue());
+}], HF32, "U32Imm">;
+
+// Immediates for the lower and upper 32 bits of an i64, with the other
+// bits of the i32 being one.
+def imm64lf32c : Immediate<i64, [{
+  return SystemZ::isImmLF(uint64_t(~N->getZExtValue()));
+}], LF32, "U32Imm">;
+
+def imm64hf32c : Immediate<i64, [{
+  return SystemZ::isImmHF(uint64_t(~N->getZExtValue()));
+}], HF32, "U32Imm">;
+
+// Short immediates.
+def imm64sx8 : Immediate<i64, [{
+  return isInt<8>(N->getSExtValue());
+}], SIMM8, "S8Imm">;
+
+def imm64zx8 : Immediate<i64, [{
+  return isUInt<8>(N->getSExtValue());
+}], UIMM8, "U8Imm">;
+
+def imm64sx16 : Immediate<i64, [{
+  return isInt<16>(N->getSExtValue());
+}], SIMM16, "S16Imm">;
+
+def imm64zx16 : Immediate<i64, [{
+  return isUInt<16>(N->getZExtValue());
+}], UIMM16, "U16Imm">;
+
+def imm64sx32 : Immediate<i64, [{
+  return isInt<32>(N->getSExtValue());
+}], SIMM32, "S32Imm">;
+
+def imm64zx32 : Immediate<i64, [{
+  return isUInt<32>(N->getZExtValue());
+}], UIMM32, "U32Imm">;
+
+def imm64zx32n : Immediate<i64, [{
+  return isUInt<32>(-N->getSExtValue());
+}], NEGIMM32, "U32Imm">;
+
+def imm64zx48 : Immediate<i64, [{
+  return isUInt<64>(N->getZExtValue());
+}], UIMM48, "U48Imm">;
+
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediates
+//===----------------------------------------------------------------------===//
+
+// Floating-point zero.
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+// Floating point negative zero.
+def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
+
+//===----------------------------------------------------------------------===//
+// Symbolic address operands
+//===----------------------------------------------------------------------===//
+
+// PC-relative asm operands.
+def PCRel12 : PCRelAsmOperand<"12">;
+def PCRel16 : PCRelAsmOperand<"16">;
+def PCRel24 : PCRelAsmOperand<"24">;
+def PCRel32 : PCRelAsmOperand<"32">;
+def PCRelTLS16 : PCRelTLSAsmOperand<"16">;
+def PCRelTLS32 : PCRelTLSAsmOperand<"32">;
+
+// PC-relative offsets of a basic block.  The offset is sign-extended
+// and multiplied by 2.
+def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
+  let EncoderMethod = "getPC16DBLEncoding";
+  let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
+  let EncoderMethod = "getPC32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLBranchOperand";
+}
+
+// Variants of brtarget for use with branch prediction preload.
+def brtarget12bpp : PCRelOperand<OtherVT, PCRel12> {
+  let EncoderMethod = "getPC12DBLBPPEncoding";
+  let DecoderMethod = "decodePC12DBLBranchOperand";
+}
+def brtarget16bpp : PCRelOperand<OtherVT, PCRel16> {
+  let EncoderMethod = "getPC16DBLBPPEncoding";
+  let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget24bpp : PCRelOperand<OtherVT, PCRel24> {
+  let EncoderMethod = "getPC24DBLBPPEncoding";
+  let DecoderMethod = "decodePC24DBLBranchOperand";
+}
+
+// Variants of brtarget16/32 with an optional additional TLS symbol.
+// These are used to annotate calls to __tls_get_offset.
+def tlssym : Operand<i64> { }
+def brtarget16tls : PCRelTLSOperand<OtherVT, PCRelTLS16> {
+  let MIOperandInfo = (ops brtarget16:$func, tlssym:$sym);
+  let EncoderMethod = "getPC16DBLTLSEncoding";
+  let DecoderMethod = "decodePC16DBLBranchOperand";
+}
+def brtarget32tls : PCRelTLSOperand<OtherVT, PCRelTLS32> {
+  let MIOperandInfo = (ops brtarget32:$func, tlssym:$sym);
+  let EncoderMethod = "getPC32DBLTLSEncoding";
+  let DecoderMethod = "decodePC32DBLBranchOperand";
+}
+
+// A PC-relative offset of a global value.  The offset is sign-extended
+// and multiplied by 2.
+def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
+  let EncoderMethod = "getPC32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// Addressing modes
+//===----------------------------------------------------------------------===//
+
+// 12-bit displacement operands.
+def disp12imm32 : Operand<i32>;
+def disp12imm64 : Operand<i64>;
+
+// 20-bit displacement operands.
+def disp20imm32 : Operand<i32>;
+def disp20imm64 : Operand<i64>;
+
+def BDAddr32Disp12      : AddressAsmOperand<"BDAddr",   "32", "12">;
+def BDAddr32Disp20      : AddressAsmOperand<"BDAddr",   "32", "20">;
+def BDAddr64Disp12      : AddressAsmOperand<"BDAddr",   "64", "12">;
+def BDAddr64Disp20      : AddressAsmOperand<"BDAddr",   "64", "20">;
+def BDXAddr64Disp12     : AddressAsmOperand<"BDXAddr",  "64", "12">;
+def BDXAddr64Disp20     : AddressAsmOperand<"BDXAddr",  "64", "20">;
+def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr",  "64", "12", "Len8">;
+def BDRAddr64Disp12     : AddressAsmOperand<"BDRAddr",  "64", "12">;
+def BDVAddr64Disp12     : AddressAsmOperand<"BDVAddr",  "64", "12">;
+
+// DAG patterns and operands for addressing modes.  Each mode has
+// the form <type><range><group>[<len>] where:
+//
+// <type> is one of:
+//   shift    : base + displacement (32-bit)
+//   bdaddr   : base + displacement
+//   mviaddr  : like bdaddr, but reject cases with a natural index
+//   bdxaddr  : base + displacement + index
+//   laaddr   : like bdxaddr, but used for Load Address operations
+//   dynalloc : base + displacement + index + ADJDYNALLOC
+//   bdladdr  : base + displacement with a length field
+//   bdvaddr  : base + displacement with a vector index
+//
+// <range> is one of:
+//   12       : the displacement is an unsigned 12-bit value
+//   20       : the displacement is a signed 20-bit value
+//
+// <group> is one of:
+//   pair     : used when there is an equivalent instruction with the opposite
+//              range value (12 or 20)
+//   only     : used when there is no equivalent instruction with the opposite
+//              range value
+//
+// <len> is one of:
+//
+//   <empty>  : there is no length field
+//   len8     : the length field is 8 bits, with a range of [1, 0x100].
+def shift12only       : BDMode <"BDAddr",   "32", "12", "Only">;
+def shift20only       : BDMode <"BDAddr",   "32", "20", "Only">;
+def bdaddr12only      : BDMode <"BDAddr",   "64", "12", "Only">;
+def bdaddr12pair      : BDMode <"BDAddr",   "64", "12", "Pair">;
+def bdaddr20only      : BDMode <"BDAddr",   "64", "20", "Only">;
+def bdaddr20pair      : BDMode <"BDAddr",   "64", "20", "Pair">;
+def mviaddr12pair     : BDMode <"MVIAddr",  "64", "12", "Pair">;
+def mviaddr20pair     : BDMode <"MVIAddr",  "64", "20", "Pair">;
+def bdxaddr12only     : BDXMode<"BDXAddr",  "64", "12", "Only">;
+def bdxaddr12pair     : BDXMode<"BDXAddr",  "64", "12", "Pair">;
+def bdxaddr20only     : BDXMode<"BDXAddr",  "64", "20", "Only">;
+def bdxaddr20only128  : BDXMode<"BDXAddr",  "64", "20", "Only128">;
+def bdxaddr20pair     : BDXMode<"BDXAddr",  "64", "20", "Pair">;
+def dynalloc12only    : BDXMode<"DynAlloc", "64", "12", "Only">;
+def laaddr12pair      : BDXMode<"LAAddr",   "64", "12", "Pair">;
+def laaddr20pair      : BDXMode<"LAAddr",   "64", "20", "Pair">;
+def bdladdr12onlylen8 : BDLMode<"BDLAddr",  "64", "12", "Only", "8">;
+def bdraddr12only     : BDRMode<"BDRAddr",  "64", "12", "Only">;
+def bdvaddr12only     : BDVMode<            "64", "12">;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous
+//===----------------------------------------------------------------------===//
+
+// A 4-bit condition-code mask.
+def cond4 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() < 16); }]>,
+            Operand<i32> {
+  let PrintMethod = "printCond4Operand";
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
new file mode 100644
index 000000000000..fde26ed4e1c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -0,0 +1,684 @@
+//===-- SystemZOperators.td - SystemZ-specific operators ------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Type profiles
+//===----------------------------------------------------------------------===//
+def SDT_CallSeqStart        : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqEnd          : SDCallSeqEnd<[SDTCisVT<0, i64>,
+                                            SDTCisVT<1, i64>]>;
+def SDT_ZCall               : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_ZCmp                : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ZICmp               : SDTypeProfile<0, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZBRCCMask           : SDTypeProfile<0, 3,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZSelectCCMask       : SDTypeProfile<1, 4,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<1, 2>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>]>;
+def SDT_ZWrapPtr            : SDTypeProfile<1, 1,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset         : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisPtrTy<0>]>;
+def SDT_ZAdjDynAlloc        : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZGR128Binary32      : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisVT<1, untyped>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZGR128Binary64      : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, untyped>,
+                                             SDTCisVT<1, untyped>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZAtomicLoadBinaryW  : SDTypeProfile<1, 5,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i32>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>,
+                                             SDTCisVT<5, i32>]>;
+def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i32>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>,
+                                             SDTCisVT<5, i32>,
+                                             SDTCisVT<6, i32>]>;
+def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>,
+                                             SDTCisVT<3, i64>]>;
+def SDT_ZString             : SDTypeProfile<1, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisPtrTy<2>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic       : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZPrefetch           : SDTypeProfile<0, 2,
+                                            [SDTCisVT<0, i32>,
+                                             SDTCisPtrTy<1>]>;
+def SDT_ZLoadBSwap          : SDTypeProfile<1, 2,
+                                            [SDTCisInt<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZStoreBSwap         : SDTypeProfile<0, 3,
+                                            [SDTCisInt<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZTBegin             : SDTypeProfile<0, 2,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisVT<1, i32>]>;
+def SDT_ZInsertVectorElt    : SDTypeProfile<1, 3,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZExtractVectorElt   : SDTypeProfile<1, 2,
+                                            [SDTCisVec<1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZReplicate          : SDTypeProfile<1, 1,
+                                            [SDTCisVec<0>]>;
+def SDT_ZVecUnaryConv       : SDTypeProfile<1, 1,
+                                            [SDTCisVec<0>,
+                                             SDTCisVec<1>]>;
+def SDT_ZVecUnary           : SDTypeProfile<1, 1,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>]>;
+def SDT_ZVecBinary          : SDTypeProfile<1, 2,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>]>;
+def SDT_ZVecBinaryInt       : SDTypeProfile<1, 2,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZVecBinaryConv      : SDTypeProfile<1, 2,
+                                            [SDTCisVec<0>,
+                                             SDTCisVec<1>,
+                                             SDTCisSameAs<1, 2>]>;
+def SDT_ZVecBinaryConvInt   : SDTypeProfile<1, 2,
+                                            [SDTCisVec<0>,
+                                             SDTCisVec<1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZRotateMask         : SDTypeProfile<1, 2,
+                                            [SDTCisVec<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZJoinDwords         : SDTypeProfile<1, 2,
+                                            [SDTCisVT<0, v2i64>,
+                                             SDTCisVT<1, i64>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZVecTernary         : SDTypeProfile<1, 3,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>]>;
+def SDT_ZVecTernaryInt      : SDTypeProfile<1, 3,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZVecQuaternaryInt   : SDTypeProfile<1, 4,
+                                            [SDTCisVec<0>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisVT<4, i32>]>;
+def SDT_ZTest               : SDTypeProfile<0, 2, [SDTCisVT<1, i64>]>;
+
+//===----------------------------------------------------------------------===//
+// Node definitions
+//===----------------------------------------------------------------------===//
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start       : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
+                                 [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end         : SDNode<"ISD::CALLSEQ_END",   SDT_CallSeqEnd,
+                                 [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue,
+                                  SDNPOutGlue]>;
+def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>;
+
+// Nodes for SystemZISD::*.  See SystemZISelLowering.h for more details.
+def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+                                 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def z_call              : SDNode<"SystemZISD::CALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
+def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
+def z_tls_gdcall        : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
+def z_tls_ldcall        : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                  SDNPVariadic]>;
+def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
+def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
+                                 SDT_ZWrapOffset, []>;
+def z_iabs              : SDNode<"SystemZISD::IABS", SDTIntUnaryOp, []>;
+def z_icmp              : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
+def z_fcmp              : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm                : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
+def z_br_ccmask         : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
+                                 [SDNPHasChain, SDNPInGlue]>;
+def z_select_ccmask     : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
+    		                 [SDNPInGlue]>;
+def z_adjdynalloc       : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_popcnt            : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
+def z_umul_lohi64       : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
+def z_sdivrem32         : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
+def z_sdivrem64         : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
+def z_udivrem32         : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
+def z_udivrem64         : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
+
+def z_serialize         : SDNode<"SystemZISD::SERIALIZE", SDTNone,
+                                 [SDNPHasChain, SDNPMayStore]>;
+def z_membarrier        : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
+                                 [SDNPHasChain, SDNPSideEffect]>;
+
+def z_loadbswap        : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def z_storebswap       : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def z_tdc               : SDNode<"SystemZISD::TDC", SDT_ZTest, [SDNPOutGlue]>;
+
+// Defined because the index is an i32 rather than a pointer.
+def z_vector_insert     : SDNode<"ISD::INSERT_VECTOR_ELT",
+                                 SDT_ZInsertVectorElt>;
+def z_vector_extract    : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+                                 SDT_ZExtractVectorElt>;
+def z_byte_mask         : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>;
+def z_rotate_mask       : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>;
+def z_replicate         : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>;
+def z_join_dwords       : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>;
+def z_splat             : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>;
+def z_merge_high        : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>;
+def z_merge_low         : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>;
+def z_shl_double        : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>;
+def z_permute_dwords    : SDNode<"SystemZISD::PERMUTE_DWORDS",
+                                 SDT_ZVecTernaryInt>;
+def z_permute           : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;
+def z_pack              : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>;
+def z_packs_cc          : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConv,
+                                 [SDNPOutGlue]>;
+def z_packls_cc         : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConv,
+                                 [SDNPOutGlue]>;
+def z_unpack_high       : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>;
+def z_unpackl_high      : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>;
+def z_unpack_low        : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>;
+def z_unpackl_low       : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>;
+def z_vshl_by_scalar    : SDNode<"SystemZISD::VSHL_BY_SCALAR",
+                                 SDT_ZVecBinaryInt>;
+def z_vsrl_by_scalar    : SDNode<"SystemZISD::VSRL_BY_SCALAR",
+                                 SDT_ZVecBinaryInt>;
+def z_vsra_by_scalar    : SDNode<"SystemZISD::VSRA_BY_SCALAR",
+                                 SDT_ZVecBinaryInt>;
+def z_vsum              : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>;
+def z_vicmpe            : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>;
+def z_vicmph            : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>;
+def z_vicmphl           : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>;
+def z_vicmpes           : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vicmphs           : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vicmphls          : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vfcmpe            : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>;
+def z_vfcmph            : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>;
+def z_vfcmphe           : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>;
+def z_vfcmpes           : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConv,
+                                 [SDNPOutGlue]>;
+def z_vfcmphs           : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConv,
+                                 [SDNPOutGlue]>;
+def z_vfcmphes          : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConv,
+                                 [SDNPOutGlue]>;
+def z_vextend           : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
+def z_vround            : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
+def z_vtm               : SDNode<"SystemZISD::VTM", SDT_ZCmp, [SDNPOutGlue]>;
+def z_vfae_cc           : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryInt,
+                                 [SDNPOutGlue]>;
+def z_vfaez_cc          : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryInt,
+                                 [SDNPOutGlue]>;
+def z_vfee_cc           : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vfeez_cc          : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vfene_cc          : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vfenez_cc         : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinary,
+                                 [SDNPOutGlue]>;
+def z_vistr_cc          : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnary,
+                                 [SDNPOutGlue]>;
+def z_vstrc_cc          : SDNode<"SystemZISD::VSTRC_CC", SDT_ZVecQuaternaryInt,
+                                 [SDNPOutGlue]>;
+def z_vstrcz_cc         : SDNode<"SystemZISD::VSTRCZ_CC",
+                                 SDT_ZVecQuaternaryInt, [SDNPOutGlue]>;
+def z_vftci             : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvInt,
+                                 [SDNPOutGlue]>;
+
+class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
+  : SDNode<"SystemZISD::"##name, profile,
+           [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def z_atomic_swapw      : AtomicWOp<"ATOMIC_SWAPW">;
+def z_atomic_loadw_add  : AtomicWOp<"ATOMIC_LOADW_ADD">;
+def z_atomic_loadw_sub  : AtomicWOp<"ATOMIC_LOADW_SUB">;
+def z_atomic_loadw_and  : AtomicWOp<"ATOMIC_LOADW_AND">;
+def z_atomic_loadw_or   : AtomicWOp<"ATOMIC_LOADW_OR">;
+def z_atomic_loadw_xor  : AtomicWOp<"ATOMIC_LOADW_XOR">;
+def z_atomic_loadw_nand : AtomicWOp<"ATOMIC_LOADW_NAND">;
+def z_atomic_loadw_min  : AtomicWOp<"ATOMIC_LOADW_MIN">;
+def z_atomic_loadw_max  : AtomicWOp<"ATOMIC_LOADW_MAX">;
+def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
+def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
+def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
+
+def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string     : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm               : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+                                 [SDNPInGlue]>;
+def z_prefetch          : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                                  SDNPMemOperand]>;
+
+def z_tbegin            : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
+                                  SDNPSideEffect]>;
+def z_tbegin_nofloat    : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayStore,
+                                  SDNPSideEffect]>;
+def z_tend              : SDNode<"SystemZISD::TEND", SDTNone,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def z_vshl              : SDNode<"ISD::SHL", SDT_ZVecBinary>;
+def z_vsra              : SDNode<"ISD::SRA", SDT_ZVecBinary>;
+def z_vsrl              : SDNode<"ISD::SRL", SDT_ZVecBinary>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments
+//===----------------------------------------------------------------------===//
+
+def z_lrvh  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>;
+def z_lrv   : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
+def z_lrvg  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+
+def z_strvh : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i16)>;
+def z_strv  : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i32)>;
+def z_strvg : PatFrag<(ops node:$src, node:$addr),
+                      (z_storebswap node:$src, node:$addr, i64)>;
+
+// Signed and unsigned comparisons.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::UnsignedOnly;
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::SignedOnly;
+}]>;
+
+// Register- and memory-based TEST UNDER MASK.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
+
+// Register sign-extend operations.  Sub-32-bit values are represented as i32s.
+def sext8  : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
+def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
+def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>;
+
+// Match extensions of an i32 to an i64, followed by an in-register sign
+// extension from a sub-i32 value.
+def sext8dbl : PatFrag<(ops node:$src), (sext8 (anyext node:$src))>;
+def sext16dbl : PatFrag<(ops node:$src), (sext16 (anyext node:$src))>;
+
+// Register zero-extend operations.  Sub-32-bit values are represented as i32s.
+def zext8  : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
+def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
+def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
+
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type doesn't matter.
+def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
+}]>;
+def anyextloadi8 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def anyextloadi16 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Aligned loads.
+class AlignedLoad<SDPatternOperator load>
+  : PatFrag<(ops node:$addr), (load node:$addr), [{
+  auto *Load = cast<LoadSDNode>(N);
+  return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
+}]>;
+def aligned_load         : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+
+// Aligned stores.
+class AlignedStore<SDPatternOperator store>
+  : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+  auto *Store = cast<StoreSDNode>(N);
+  return Store->getAlignment() >= Store->getMemoryVT().getStoreSize();
+}]>;
+def aligned_store         : AlignedStore<store>;
+def aligned_truncstorei16 : AlignedStore<truncstorei16>;
+def aligned_truncstorei32 : AlignedStore<truncstorei32>;
+
+// Non-volatile loads.  Used for instructions that might access the storage
+// location multiple times.
+class NonvolatileLoad<SDPatternOperator load>
+  : PatFrag<(ops node:$addr), (load node:$addr), [{
+  auto *Load = cast<LoadSDNode>(N);
+  return !Load->isVolatile();
+}]>;
+def nonvolatile_load          : NonvolatileLoad<load>;
+def nonvolatile_anyextloadi8  : NonvolatileLoad<anyextloadi8>;
+def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
+def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
+
+// Non-volatile stores.
+class NonvolatileStore<SDPatternOperator store>
+  : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+  auto *Store = cast<StoreSDNode>(N);
+  return !Store->isVolatile();
+}]>;
+def nonvolatile_store         : NonvolatileStore<store>;
+def nonvolatile_truncstorei8  : NonvolatileStore<truncstorei8>;
+def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
+def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
+
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+                        (unindexedstore node:$value, node:$addr),
+                        [{ return storeLoadCanUseMVC(N); }]>;
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used.  There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+  def "1" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator node:$value,
+                                              (unindexedload node:$addr)),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 0); }]>;
+  def "2" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator (unindexedload node:$addr),
+                                              node:$value),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 1); }]>;
+}
+defm block_and : block_op<and>;
+defm block_or  : block_op<or>;
+defm block_xor : block_op<xor>;
+
+// Insertions.
+def inserti8 : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, -256), node:$src2)>;
+def insertll : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffffffff0000), node:$src2)>;
+def insertlh : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffff0000ffff), node:$src2)>;
+def inserthl : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffff0000ffffffff), node:$src2)>;
+def inserthh : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0x0000ffffffffffff), node:$src2)>;
+def insertlf : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0xffffffff00000000), node:$src2)>;
+def inserthf : PatFrag<(ops node:$src1, node:$src2),
+                       (or (and node:$src1, 0x00000000ffffffff), node:$src2)>;
+
+// ORs that can be treated as insertions.
+def or_as_inserti8 : PatFrag<(ops node:$src1, node:$src2),
+                             (or node:$src1, node:$src2), [{
+  unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+  return CurDAG->MaskedValueIsZero(N->getOperand(0),
+                                   APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// ORs that can be treated as reversed insertions.
+def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
+                                (or node:$src1, node:$src2), [{
+  unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+  return CurDAG->MaskedValueIsZero(N->getOperand(1),
+                                   APInt::getLowBitsSet(BitWidth, 8));
+}]>;
+
+// Negative integer absolute.
+def z_inegabs : PatFrag<(ops node:$src), (ineg (z_iabs node:$src))>;
+
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 31))),
+                            (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 63))),
+                            (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
+// Integer multiply-and-add
+def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                       (add (mul node:$src1, node:$src2), node:$src3)>;
+
+// Fused multiply-subtract, using the natural operand order.
+def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                  (fma node:$src1, node:$src2, (fneg node:$src3))>;
+
+// Fused multiply-add and multiply-subtract, but with the order of the
+// operands matching SystemZ's MA and MS instructions.
+def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                    (fma node:$src2, node:$src3, node:$src1)>;
+def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                    (fma node:$src2, node:$src3, (fneg node:$src1))>;
+
+// Floating-point negative absolute.
+def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
+
+// Create a unary operator that loads from memory and then performs
+// the given operation on it.
+class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+  : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
+
+// Create a store operator that performs the given unary operation
+// on the value before storing it.
+class storeu<SDPatternOperator operator, SDPatternOperator store = store>
+  : PatFrag<(ops node:$value, node:$addr),
+            (store (operator node:$value), node:$addr)>;
+
+// Create a store operator that performs the given inherent operation
+// and stores the resulting value.
+class storei<SDPatternOperator operator, SDPatternOperator store = store>
+  : PatFrag<(ops node:$addr),
+            (store (operator), node:$addr)>;
+
+// Vector representation of all-zeros and all-ones.
+def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>;
+def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>;
+
+// Load a scalar and replicate it in all elements of a vector.
+class z_replicate_load<ValueType scalartype, SDPatternOperator load>
+  : PatFrag<(ops node:$addr),
+            (z_replicate (scalartype (load node:$addr)))>;
+def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
+def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, load>;
+def z_replicate_loadi64 : z_replicate_load<i64, load>;
+def z_replicate_loadf32 : z_replicate_load<f32, load>;
+def z_replicate_loadf64 : z_replicate_load<f64, load>;
+
+// Load a scalar and insert it into a single element of a vector.
+class z_vle<ValueType scalartype, SDPatternOperator load>
+  : PatFrag<(ops node:$vec, node:$addr, node:$index),
+            (z_vector_insert node:$vec, (scalartype (load node:$addr)),
+                             node:$index)>;
+def z_vlei8  : z_vle<i32, anyextloadi8>;
+def z_vlei16 : z_vle<i32, anyextloadi16>;
+def z_vlei32 : z_vle<i32, load>;
+def z_vlei64 : z_vle<i64, load>;
+def z_vlef32 : z_vle<f32, load>;
+def z_vlef64 : z_vle<f64, load>;
+
+// Load a scalar and insert it into the low element of the high i64 of a
+// zeroed vector.
+class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
+  : PatFrag<(ops node:$addr),
+            (z_vector_insert (z_vzero),
+                             (scalartype (load node:$addr)), (i32 index))>;
+def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
+def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi64 : PatFrag<(ops node:$addr),
+                         (z_join_dwords (i64 (load node:$addr)), (i64 0))>;
+// We use high merges to form a v4f32 from four f32s.  Propagating zero
+// into all elements but index 1 gives this expression.
+def z_vllezf32 : PatFrag<(ops node:$addr),
+                         (bitconvert
+                          (z_merge_high
+                           (v2i64
+                            (z_unpackl_high
+                             (v4i32
+                              (bitconvert
+                               (v4f32 (scalar_to_vector
+                                       (f32 (load node:$addr)))))))),
+                           (v2i64 (z_vzero))))>;
+def z_vllezf64 : PatFrag<(ops node:$addr),
+                         (z_merge_high
+                          (scalar_to_vector (f64 (load node:$addr))),
+                          (z_vzero))>;
+
+// Store one element of a vector.
+class z_vste<ValueType scalartype, SDPatternOperator store>
+  : PatFrag<(ops node:$vec, node:$addr, node:$index),
+            (store (scalartype (z_vector_extract node:$vec, node:$index)),
+                   node:$addr)>;
+def z_vstei8  : z_vste<i32, truncstorei8>;
+def z_vstei16 : z_vste<i32, truncstorei16>;
+def z_vstei32 : z_vste<i32, store>;
+def z_vstei64 : z_vste<i64, store>;
+def z_vstef32 : z_vste<f32, store>;
+def z_vstef64 : z_vste<f64, store>;
+
+// Arithmetic negation on vectors.
+def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>;
+
+// Bitwise negation on vectors.
+def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>;
+
+// Signed "integer greater than zero" on vectors.
+def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>;
+
+// Signed "integer less than zero" on vectors.
+def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>;
+
+// Integer absolute on vectors.
+class z_viabs<int shift>
+  : PatFrag<(ops node:$src),
+            (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))),
+                 (z_vsra_by_scalar node:$src, (i32 shift)))>;
+def z_viabs8  : z_viabs<7>;
+def z_viabs16 : z_viabs<15>;
+def z_viabs32 : z_viabs<31>;
+def z_viabs64 : z_viabs<63>;
+
+// Sign-extend the i64 elements of a vector.
+class z_vse<int shift>
+  : PatFrag<(ops node:$src),
+            (z_vsra_by_scalar (z_vshl_by_scalar node:$src, shift), shift)>;
+def z_vsei8  : z_vse<56>;
+def z_vsei16 : z_vse<48>;
+def z_vsei32 : z_vse<32>;
+
+// ...and again with the extensions being done on individual i64 scalars.
+class z_vse_by_parts<SDPatternOperator operator, int index1, int index2>
+  : PatFrag<(ops node:$src),
+            (z_join_dwords
+             (operator (z_vector_extract node:$src, index1)),
+             (operator (z_vector_extract node:$src, index2)))>;
+def z_vsei8_by_parts  : z_vse_by_parts<sext8dbl, 7, 15>;
+def z_vsei16_by_parts : z_vse_by_parts<sext16dbl, 3, 7>;
+def z_vsei32_by_parts : z_vse_by_parts<sext32, 1, 3>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
new file mode 100644
index 000000000000..16a7ed784d70
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -0,0 +1,169 @@
+//===-- SystemZPatterns.td - SystemZ-specific pattern rules ---*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Record that INSN performs a 64-bit version of unary operator OPERATOR
+// in which the operand is sign-extended from 32 to 64 bits.
+multiclass SXU<SDPatternOperator operator, Instruction insn> {
+  def : Pat<(operator (sext (i32 GR32:$src))),
+            (insn GR32:$src)>;
+  def : Pat<(operator (sext_inreg GR64:$src, i32)),
+            (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+}
+
+// Record that INSN performs a 64-bit version of binary operator OPERATOR
+// in which the first operand has class CLS and which the second operand
+// is sign-extended from a 32-bit register.
+multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
+               Instruction insn> {
+  def : Pat<(operator cls:$src1, (sext GR32:$src2)),
+            (insn cls:$src1, GR32:$src2)>;
+  def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Like SXB, but for zero extension.
+multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
+               Instruction insn> {
+  def : Pat<(operator cls:$src1, (zext GR32:$src2)),
+            (insn cls:$src1, GR32:$src2)>;
+  def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
+}
+
+// Record that INSN performs a binary read-modify-write operation,
+// with LOAD, OPERATOR and STORE being the read, modify and write
+// respectively.  MODE is the addressing mode and IMM is the type
+// of the second operand.
+class RMWI<SDPatternOperator load, SDPatternOperator operator,
+           SDPatternOperator store, AddressingMode mode,
+           PatFrag imm, Instruction insn>
+  : Pat<(store (operator (load mode:$addr), imm:$src), mode:$addr),
+        (insn mode:$addr, (UIMM8 imm:$src))>;
+
+// Record that INSN performs binary operation OPERATION on a byte
+// memory location.  IMM is the type of the second operand.
+multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
+                    Instruction insn> {
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+}
+
+// Record that INSN performs insertion TYPE into a register of class CLS.
+// The inserted operand is loaded using LOAD from an address of mode MODE.
+multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
+                     SDPatternOperator load, AddressingMode mode> {
+  def : Pat<(!cast<SDPatternOperator>("or_as_"##type)
+              cls:$src1, (load mode:$src2)),
+            (insn cls:$src1, mode:$src2)>;
+  def : Pat<(!cast<SDPatternOperator>("or_as_rev"##type)
+              (load mode:$src2), cls:$src1),
+            (insn cls:$src1, mode:$src2)>;
+}
+
+// INSN stores the low 32 bits of a GPR to a memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+                AddressingMode mode>
+  : Pat<(operator GR64:$R1, mode:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
+
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory.  Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+                         SDPatternOperator operator> {
+  def : StoreGR64<insn, operator, bdxaddr12pair>;
+  def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+  : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false.  Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+                        SDPatternOperator store, SDPatternOperator load,
+                        AddressingMode mode> {
+  def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+                                    imm32zx4:$valid, imm32zx4:$cc),
+                   mode:$addr),
+            (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                  imm32zx4:$valid, imm32zx4:$cc)>;
+  def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+                                    imm32zx4:$valid, imm32zx4:$cc),
+                   mode:$addr),
+            (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                     imm32zx4:$valid, imm32zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size.  VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+                        bits<5> length> {
+  def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+                           ValueType vt, Instruction insn, bits<5> length> {
+  def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+                          Instruction mvc, Instruction nc, Instruction oc,
+                          Instruction xc, bits<5> length> {
+  defm : MVCLoadStore<load, vt, mvc, length>;
+  defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_or1,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_or2,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+  defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
+// Record that INSN is a LOAD AND TEST that can be used to compare
+// registers in CLS against zero.  The instruction has separate R1 and R2
+// operands, but they must be the same when the instruction is used like this.
+multiclass CompareZeroFP<Instruction insn, RegisterOperand cls> {
+  def : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+  // The sign of the zero makes no difference.
+  def : Pat<(z_fcmp cls:$reg, (fpimmneg0)), (insn cls:$reg, cls:$reg)>;
+}
+
+// Use INSN for performing binary operation OPERATION of type VT
+// on registers of class CLS.
+class BinaryRRWithType<Instruction insn, RegisterOperand cls,
+                       SDPatternOperator operator, ValueType vt>
+  : Pat<(vt (operator cls:$x, cls:$y)), (insn cls:$x, cls:$y)>;
+
+// Use INSN to perform conversion operation OPERATOR, with the input being
+// TR2 and the output being TR1.  SUPPRESS is 4 to suppress inexact conditions
+// and 0 to allow them.  MODE is the rounding mode to use.
+class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
+                   TypedReg tr2, bits<3> suppress, bits<4> mode>
+  : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
+        (insn tr2.op:$vec, suppress, mode)>;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
new file mode 100644
index 000000000000..1cdc0949ff4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -0,0 +1,35 @@
+//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Processor definitions.
+//
+// For compatibility with other compilers on the platform, each model can
+// be identifed either by the system name (e.g. z10) or the level of the
+// architecture the model supports, as identified by the edition level
+// of the z/Architecture Principles of Operation document (e.g. arch8).
+//
+// The minimum architecture level supported by LLVM is as defined in
+// the Eighth Edition of the PoP (i.e. as implemented on z10).
+//
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
+def : ProcessorModel<"arch8", NoSchedModel, Arch8SupportedFeatures.List>;
+def : ProcessorModel<"z10", NoSchedModel, Arch8SupportedFeatures.List>;
+
+def : ProcessorModel<"arch9", Z196Model, Arch9SupportedFeatures.List>;
+def : ProcessorModel<"z196", Z196Model, Arch9SupportedFeatures.List>;
+
+def : ProcessorModel<"arch10", ZEC12Model, Arch10SupportedFeatures.List>;
+def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>;
+
+def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>;
+def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 000000000000..6ef8000d6f43
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,159 @@
+//===-- SystemZRegisterInfo.cpp - SystemZ register information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "SystemZGenRegisterInfo.inc"
+
+SystemZRegisterInfo::SystemZRegisterInfo()
+    : SystemZGenRegisterInfo(SystemZ::R14D) {}
+
+const MCPhysReg *
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (MF->getSubtarget().getTargetLowering()->supportSwiftError() &&
+      MF->getFunction()->getAttributes().hasAttrSomewhere(
+          Attribute::SwiftError))
+    return CSR_SystemZ_SwiftError_SaveList;
+  return CSR_SystemZ_SaveList;
+}
+
+const uint32_t *
+SystemZRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                          CallingConv::ID CC) const {
+  if (MF.getSubtarget().getTargetLowering()->supportSwiftError() &&
+      MF.getFunction()->getAttributes().hasAttrSomewhere(
+          Attribute::SwiftError))
+    return CSR_SystemZ_SwiftError_RegMask;
+  return CSR_SystemZ_RegMask;
+}
+
+BitVector
+SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const SystemZFrameLowering *TFI = getFrameLowering(MF);
+
+  if (TFI->hasFP(MF)) {
+    // R11D is the frame pointer.  Reserve all aliases.
+    Reserved.set(SystemZ::R11D);
+    Reserved.set(SystemZ::R11L);
+    Reserved.set(SystemZ::R11H);
+    Reserved.set(SystemZ::R10Q);
+  }
+
+  // R15D is the stack pointer.  Reserve all aliases.
+  Reserved.set(SystemZ::R15D);
+  Reserved.set(SystemZ::R15L);
+  Reserved.set(SystemZ::R15H);
+  Reserved.set(SystemZ::R14Q);
+
+  // A0 and A1 hold the thread pointer.
+  Reserved.set(SystemZ::A0);
+  Reserved.set(SystemZ::A1);
+
+  return Reserved;
+}
+
+void
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Outgoing arguments should be part of the frame");
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SystemZFrameLowering *TFI = getFrameLowering(MF);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Decompose the frame index into a base and offset.
+  int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
+  unsigned BasePtr;
+  int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) +
+                    MI->getOperand(FIOperandNum + 1).getImm());
+
+  // Special handling of dbg_value instructions.
+  if (MI->isDebugValue()) {
+    MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
+    MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // See if the offset is in range, or if an equivalent instruction that
+  // accepts the offset exists.
+  unsigned Opcode = MI->getOpcode();
+  unsigned OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+  if (OpcodeForOffset) {
+    if (OpcodeForOffset == SystemZ::LE &&
+        MF.getSubtarget<SystemZSubtarget>().hasVector()) {
+      // If LE is ok for offset, use LDE instead on z13.
+      OpcodeForOffset = SystemZ::LDE32;
+    }
+    MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  }
+  else {
+    // Create an anchor point that is in range.  Start at 0xffff so that
+    // can use LLILH to load the immediate.
+    int64_t OldOffset = Offset;
+    int64_t Mask = 0xffff;
+    do {
+      Offset = OldOffset & Mask;
+      OpcodeForOffset = TII->getOpcodeForOffset(Opcode, Offset);
+      Mask >>= 1;
+      assert(Mask && "One offset must be OK");
+    } while (!OpcodeForOffset);
+
+    unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    int64_t HighOffset = OldOffset - Offset;
+
+    if (MI->getDesc().TSFlags & SystemZII::HasIndex
+        && MI->getOperand(FIOperandNum + 2).getReg() == 0) {
+      // Load the offset into the scratch register and use it as an index.
+      // The scratch register then dies here.
+      TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+      MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+      MI->getOperand(FIOperandNum + 2).ChangeToRegister(ScratchReg,
+                                                        false, false, true);
+    } else {
+      // Load the anchor address into a scratch register.
+      unsigned LAOpcode = TII->getOpcodeForOffset(SystemZ::LA, HighOffset);
+      if (LAOpcode)
+        BuildMI(MBB, MI, DL, TII->get(LAOpcode),ScratchReg)
+          .addReg(BasePtr).addImm(HighOffset).addReg(0);
+      else {
+        // Load the high offset into the scratch register and use it as
+        // an index.
+        TII->loadImmediate(MBB, MI, ScratchReg, HighOffset);
+        BuildMI(MBB, MI, DL, TII->get(SystemZ::AGR),ScratchReg)
+          .addReg(ScratchReg, RegState::Kill).addReg(BasePtr);
+      }
+
+      // Use the scratch register as the base.  It then dies here.
+      MI->getOperand(FIOperandNum).ChangeToRegister(ScratchReg,
+                                                    false, false, true);
+    }
+  }
+  MI->setDesc(TII->get(OpcodeForOffset));
+  MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned
+SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const SystemZFrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 000000000000..e41c06c98af2
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,67 @@
+//===-- SystemZRegisterInfo.h - SystemZ register information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
+
+#include "SystemZ.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SystemZGenRegisterInfo.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+// Return the subreg to use for referring to the even and odd registers
+// in a GR128 pair.  Is32Bit says whether we want a GR32 or GR64.
+inline unsigned even128(bool Is32bit) {
+  return Is32bit ? subreg_hl32 : subreg_h64;
+}
+inline unsigned odd128(bool Is32bit) {
+  return Is32bit ? subreg_l32 : subreg_l64;
+}
+} // end namespace SystemZ
+
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+public:
+  SystemZRegisterInfo();
+
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is currently only used by LOAD_STACK_GUARD, which requires a non-%r0
+  /// register, hence ADDR64.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind=0) const override {
+    return &SystemZ::ADDR64BitRegClass;
+  }
+
+  // Override TargetRegisterInfo.h.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
+    return true;
+  }
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+    return true;
+  }
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID CC) const override;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 000000000000..47d2f75cc11a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,306 @@
+//==- SystemZRegisterInfo.td - SystemZ register definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Class definitions.
+//===----------------------------------------------------------------------===//
+
+class SystemZReg<string n> : Register<n> {
+  let Namespace = "SystemZ";
+}
+
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "SystemZ";
+}
+
+let Namespace = "SystemZ" in {
+def subreg_l32   : SubRegIndex<32, 0>;  // Also acts as subreg_ll32.
+def subreg_h32   : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64   : SubRegIndex<64, 0>;
+def subreg_h64   : SubRegIndex<64, 64>;
+def subreg_r32   : SubRegIndex<32, 32>; // Reinterpret a wider reg as 32 bits.
+def subreg_r64   : SubRegIndex<64, 64>; // Reinterpret a wider reg as 64 bits.
+def subreg_hh32  : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32  : ComposedSubRegIndex<subreg_h64, subreg_l32>;
+def subreg_hr32  : ComposedSubRegIndex<subreg_h64, subreg_r32>;
+}
+
+// Define a register class that contains values of types TYPES and an
+// associated operand called NAME.  SIZE is the size and alignment
+// of the registers and REGLIST is the list of individual registers.
+multiclass SystemZRegClass<string name, list<ValueType> types, int size,
+                           dag regList, bit allocatable = 1> {
+  def AsmOperand : AsmOperandClass {
+    let Name = name;
+    let ParserMethod = "parse"##name;
+    let RenderMethod = "addRegOperands";
+  }
+  let isAllocatable = allocatable in
+    def Bit : RegisterClass<"SystemZ", types, size, regList> {
+      let Size = size;
+    }
+  def "" : RegisterOperand<!cast<RegisterClass>(name##"Bit")> {
+    let ParserMatchClass = !cast<AsmOperandClass>(name##"AsmOperand");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// General-purpose registers
+//===----------------------------------------------------------------------===//
+
+// Lower 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+
+// One of the 16 64-bit general-purpose registers.
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l32, subreg_h32];
+}
+
+// 8 even-odd pairs of GPR64s.
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// General-purpose registers
+foreach I = 0-15 in {
+  def R#I#L : GPR32<I, "r"#I>;
+  def R#I#H : GPR32<I, "r"#I>;
+  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+                    DwarfRegNum<[I]>;
+}
+
+foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
+  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+                     !cast<GPR64>("R"#I#"D")>;
+}
+
+/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+defm GR32  : SystemZRegClass<"GR32",  [i32], 32,
+                             (add (sequence "R%uL",  0, 5),
+                                  (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", [i32], 32,
+                             (add (sequence "R%uH",  0, 5),
+                                  (sequence "R%uH", 15, 6))>;
+defm GR64  : SystemZRegClass<"GR64",  [i64], 64,
+                             (add (sequence "R%uD",  0, 5),
+                                  (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class.  This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", [i32], 32,
+                             (add (sequence "R%uL",  0, 5),
+                                  (sequence "R%uH",  0, 5),
+                                  R15L, R15H, R14L, R14H, R13L, R13H,
+                                  R12L, R12H, R11L, R11H, R10L, R10H,
+                                  R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
+
+// The architecture doesn't really have any i128 support, so model the
+// register pairs as untyped instead.
+defm GR128 : SystemZRegClass<"GR128", [untyped], 128,
+                             (add R0Q, R2Q, R4Q, R12Q, R10Q, R8Q, R6Q, R14Q)>;
+
+// Base and index registers.  Everything except R0, which in an address
+// context evaluates as 0.
+defm ADDR32 : SystemZRegClass<"ADDR32", [i32], 32, (sub GR32Bit, R0L)>;
+defm ADDR64 : SystemZRegClass<"ADDR64", [i64], 64, (sub GR64Bit, R0D)>;
+
+// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
+// of a GR128.
+defm ADDR128 : SystemZRegClass<"ADDR128", [untyped], 128, (sub GR128Bit, R0Q)>;
+
+// Any type register. Used for .insn directives when we don't know what the
+// register types could be.
+defm AnyReg : SystemZRegClass<"AnyReg",
+                              [i64, f64, v8i8, v4i16, v2i32, v2f32], 64,
+                              (add (sequence "R%uD", 0, 15),
+                                   (sequence "F%uD", 0, 15),
+                                   (sequence "V%u", 0, 15))>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point registers
+//===----------------------------------------------------------------------===//
+
+// Maps FPR register numbers to their DWARF encoding.
+class DwarfMapping<int id> { int Id = id; }
+
+def F0Dwarf  : DwarfMapping<16>;
+def F2Dwarf  : DwarfMapping<17>;
+def F4Dwarf  : DwarfMapping<18>;
+def F6Dwarf  : DwarfMapping<19>;
+
+def F1Dwarf  : DwarfMapping<20>;
+def F3Dwarf  : DwarfMapping<21>;
+def F5Dwarf  : DwarfMapping<22>;
+def F7Dwarf  : DwarfMapping<23>;
+
+def F8Dwarf  : DwarfMapping<24>;
+def F10Dwarf : DwarfMapping<25>;
+def F12Dwarf : DwarfMapping<26>;
+def F14Dwarf : DwarfMapping<27>;
+
+def F9Dwarf  : DwarfMapping<28>;
+def F11Dwarf : DwarfMapping<29>;
+def F13Dwarf : DwarfMapping<30>;
+def F15Dwarf : DwarfMapping<31>;
+
+def F16Dwarf : DwarfMapping<68>;
+def F18Dwarf : DwarfMapping<69>;
+def F20Dwarf : DwarfMapping<70>;
+def F22Dwarf : DwarfMapping<71>;
+
+def F17Dwarf : DwarfMapping<72>;
+def F19Dwarf : DwarfMapping<73>;
+def F21Dwarf : DwarfMapping<74>;
+def F23Dwarf : DwarfMapping<75>;
+
+def F24Dwarf : DwarfMapping<76>;
+def F26Dwarf : DwarfMapping<77>;
+def F28Dwarf : DwarfMapping<78>;
+def F30Dwarf : DwarfMapping<79>;
+
+def F25Dwarf : DwarfMapping<80>;
+def F27Dwarf : DwarfMapping<81>;
+def F29Dwarf : DwarfMapping<82>;
+def F31Dwarf : DwarfMapping<83>;
+
+// Upper 32 bits of one of the floating-point registers
+class FPR32<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+
+// One of the floating-point registers.
+class FPR64<bits<16> num, string n, FPR32 high>
+ : SystemZRegWithSubregs<n, [high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_r32];
+}
+
+// 8 pairs of FPR64s, with a one-register gap inbetween.
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_l64, subreg_h64];
+}
+
+// Floating-point registers.  Registers 16-31 require the vector facility.
+foreach I = 0-15 in {
+  def F#I#S : FPR32<I, "f"#I>;
+  def F#I#D : FPR64<I, "f"#I, !cast<FPR32>("F"#I#"S")>,
+              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+foreach I = 16-31 in {
+  def F#I#S : FPR32<I, "v"#I>;
+  def F#I#D : FPR64<I, "v"#I, !cast<FPR32>("F"#I#"S")>,
+              DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+
+foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
+  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+                     !cast<FPR64>("F"#I#"D")>;
+}
+
+// There's no store-multiple instruction for FPRs, so we're not fussy
+// about the order in which call-saved registers are allocated.
+defm FP32  : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>;
+defm FP64  : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>;
+defm FP128 : SystemZRegClass<"FP128", [f128], 128,
+                             (add F0Q, F1Q, F4Q, F5Q, F8Q, F9Q, F12Q, F13Q)>;
+
+//===----------------------------------------------------------------------===//
+// Vector registers
+//===----------------------------------------------------------------------===//
+
+// A full 128-bit vector register, with an FPR64 as its high part.
+class VR128<bits<16> num, string n, FPR64 high>
+  : SystemZRegWithSubregs<n, [high]> {
+  let HWEncoding = num;
+  let SubRegIndices = [subreg_r64];
+}
+
+// Full vector registers.
+foreach I = 0-31 in {
+  def V#I : VR128<I, "v"#I, !cast<FPR64>("F"#I#"D")>,
+            DwarfRegNum<[!cast<DwarfMapping>("F"#I#"Dwarf").Id]>;
+}
+
+// Class used to store 32-bit values in the first element of a vector
+// register.  f32 scalars are used for the WLEDB and WLDEB instructions.
+defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32,
+                            (add (sequence "F%uS", 0, 7),
+                                 (sequence "F%uS", 16, 31),
+                                 (sequence "F%uS", 8, 15))>;
+
+// Class used to store 64-bit values in the upper half of a vector register.
+// The vector facility also includes scalar f64 instructions that operate
+// on the full vector register set.
+defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64,
+                            (add (sequence "F%uD", 0, 7),
+                                 (sequence "F%uD", 16, 31),
+                                 (sequence "F%uD", 8, 15))>;
+
+// The subset of vector registers that can be used for floating-point
+// operations too.
+defm VF128 : SystemZRegClass<"VF128",
+                             [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                             (sequence "V%u", 0, 15)>;
+
+// All vector registers.
+defm VR128 : SystemZRegClass<"VR128",
+                             [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                             (add (sequence "V%u", 0, 7),
+                                  (sequence "V%u", 16, 31),
+                                  (sequence "V%u", 8, 15))>;
+
+// Attaches a ValueType to a register operand, to make the instruction
+// definitions easier.
+class TypedReg<ValueType vtin, RegisterOperand opin> {
+  ValueType vt = vtin;
+  RegisterOperand op = opin;
+}
+
+def v32eb   : TypedReg<f32,     VR32>;
+def v64g    : TypedReg<i64,     VR64>;
+def v64db   : TypedReg<f64,     VR64>;
+def v128b   : TypedReg<v16i8,   VR128>;
+def v128h   : TypedReg<v8i16,   VR128>;
+def v128f   : TypedReg<v4i32,   VR128>;
+def v128g   : TypedReg<v2i64,   VR128>;
+def v128q   : TypedReg<v16i8,   VR128>;
+def v128eb  : TypedReg<v4f32,   VR128>;
+def v128db  : TypedReg<v2f64,   VR128>;
+def v128any : TypedReg<untyped, VR128>;
+
+//===----------------------------------------------------------------------===//
+// Other registers
+//===----------------------------------------------------------------------===//
+
+// The 2-bit condition code field of the PSW.  Every register named in an
+// inline asm needs a class associated with it.
+def CC : SystemZReg<"cc">;
+let isAllocatable = 0 in
+  def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
+
+// Access registers.
+class ACR32<bits<16> num, string n> : SystemZReg<n> {
+  let HWEncoding = num;
+}
+foreach I = 0-15 in {
+  def A#I : ACR32<I, "a"#I>, DwarfRegNum<[!add(I, 48)]>;
+}
+defm AR32 : SystemZRegClass<"AR32", [i32], 32,
+                            (add (sequence "A%u", 0, 15)), 0>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
new file mode 100644
index 000000000000..dbba8ab42b5a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
@@ -0,0 +1,77 @@
+//==-- SystemZSchedule.td - SystemZ Scheduling Definitions ----*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Scheduler resources
+// Resources ending with a '2' use that resource for 2 cycles. An instruction
+// using two such resources use the mapped unit for 4 cycles, and 2 is added
+// to the total number of uops of the sched class.
+
+// These three resources are used to express decoder grouping rules.
+// The number of decoder slots needed by an instructions is normally
+// one. For a cracked instruction (BeginGroup && !EndGroup) it is
+// two. Expanded instructions (BeginGroup && EndGroup) group alone.
+def GroupAlone : SchedWrite;
+def BeginGroup : SchedWrite;
+def EndGroup   : SchedWrite;
+
+// Latencies, to make code a bit neater. If more than one resource is
+// used for an instruction, the greatest latency (not the sum) will be
+// output by Tablegen. Therefore, in such cases one of these resources
+// is needed.
+def Lat2 : SchedWrite;
+def Lat3 : SchedWrite;
+def Lat4 : SchedWrite;
+def Lat5 : SchedWrite;
+def Lat6 : SchedWrite;
+def Lat7 : SchedWrite;
+def Lat8 : SchedWrite;
+def Lat9 : SchedWrite;
+def Lat10 : SchedWrite;
+def Lat11 : SchedWrite;
+def Lat12 : SchedWrite;
+def Lat15 : SchedWrite;
+def Lat20 : SchedWrite;
+def Lat30 : SchedWrite;
+
+// Fixed-point
+def FXa         : SchedWrite;
+def FXa2        : SchedWrite;
+def FXb         : SchedWrite;
+def FXU         : SchedWrite;
+
+// Load/store unit
+def LSU         : SchedWrite;
+
+// Model a return without latency, otherwise if-converter will model
+// extra cost and abort (currently there is an assert that checks that
+// all instructions have at least one uop).
+def LSU_lat1    : SchedWrite;
+
+// Floating point unit (zEC12 and earlier)
+def FPU  : SchedWrite;
+def FPU2 : SchedWrite;
+
+// Vector sub units (z13)
+def VecBF     : SchedWrite;
+def VecBF2    : SchedWrite;
+def VecDF     : SchedWrite;
+def VecDF2    : SchedWrite;
+def VecFPd    : SchedWrite; // Blocking BFP div/sqrt unit.
+def VecMul    : SchedWrite;
+def VecStr    : SchedWrite;
+def VecXsPm   : SchedWrite;
+
+// Virtual branching unit
+def VBU         : SchedWrite;
+
+
+include "SystemZScheduleZ13.td"
+include "SystemZScheduleZEC12.td"
+include "SystemZScheduleZ196.td"
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
new file mode 100644
index 000000000000..e97d61d8355d
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -0,0 +1,1064 @@
+//-- SystemZScheduleZ13.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z13 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z13Model : SchedMachineModel {
+
+    let UnsupportedFeatures = Arch11UnsupportedFeatures.List;
+    
+    let IssueWidth = 8;
+    let MicroOpBufferSize = 60;     // Issue queues
+    let LoadLatency = 1;            // Optimistic load latency.
+
+    let PostRAScheduler = 1;
+
+    // Extra cycles for a mispredicted branch.
+    let MispredictPenalty = 20;
+}
+
+let SchedModel = Z13Model in  {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of same file.
+def : WriteRes<GroupAlone, []> {
+  let NumMicroOps = 0;
+  let BeginGroup  = 1;
+  let EndGroup    = 1;
+}
+def : WriteRes<BeginGroup, []> {
+  let NumMicroOps = 0;
+  let BeginGroup  = 1;
+}
+def : WriteRes<EndGroup, []> {
+  let NumMicroOps = 0;
+  let EndGroup    = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Execution units.
+def Z13_FXaUnit     : ProcResource<2>;
+def Z13_FXbUnit     : ProcResource<2>;
+def Z13_LSUnit      : ProcResource<2>;
+def Z13_VecUnit     : ProcResource<2>;
+def Z13_VecFPdUnit  : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z13_VBUnit      : ProcResource<2>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXa,     [Z13_FXaUnit]> { let Latency = 1; }
+def : WriteRes<FXa2,    [Z13_FXaUnit, Z13_FXaUnit]> { let Latency = 2; }
+def : WriteRes<FXb,     [Z13_FXbUnit]> { let Latency = 1; }
+def : WriteRes<LSU,     [Z13_LSUnit]>  { let Latency = 4; }
+def : WriteRes<VecBF,   [Z13_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecBF2,  [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDF,   [Z13_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecDF2,  [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecFPd,  [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
+                         Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit]>
+                         { let Latency = 30; }
+def : WriteRes<VecMul,  [Z13_VecUnit]> { let Latency = 5; }
+def : WriteRes<VecStr,  [Z13_VecUnit]> { let Latency = 4; }
+def : WriteRes<VecXsPm, [Z13_VecUnit]> { let Latency = 3; }
+def : WriteRes<VBU,     [Z13_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+             (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+             (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[FXb], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo 
+def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Asm.*)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXa], (instregex "LTGFR$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LLZRGF$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+             (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of ceil(5/2) FXb ops)
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
+              GroupAlone], (instregex "STM(G|H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address ( -> larl )
+def : InstRW<[FXa], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXa, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "IILF(64)?$")>;
+def : InstRW<[FXa], (instregex "IILH(64)?$")>;
+def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
+def : InstRW<[FXa], (instregex "AIH$")>;
+def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
+def : InstRW<[FXa], (instregex "AGFI$")>;
+def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AGR(K)?$")>;
+def : InstRW<[FXa], (instregex "AHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXa], (instregex "ALGHSIK$")>;
+def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXa], (instregex "ALR(K)?$")>;
+def : InstRW<[FXa], (instregex "AR(K)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+
+// Logical addition with carry
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "AGF$")>;
+def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
+def : InstRW<[FXa], (instregex "SGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLFI$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLR(K)?$")>;
+def : InstRW<[FXa], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "SGF$")>;
+def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "NGR(K)?$")>;
+def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "NILF(64)?$")>;
+def : InstRW<[FXa], (instregex "NILH(64)?$")>;
+def : InstRW<[FXa], (instregex "NILL(64)?$")>;
+def : InstRW<[FXa], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "OGR(K)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "OILF(64)?$")>;
+def : InstRW<[FXa], (instregex "OILH(64)?$")>;
+def : InstRW<[FXa], (instregex "OILL(64)?$")>;
+def : InstRW<[FXa], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXa], (instregex "XIFMux$")>;
+def : InstRW<[FXa], (instregex "XGR(K)?$")>;
+def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "XILF(64)?$")>;
+def : InstRW<[FXa], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXa, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXa, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXa, FXa, Lat3, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXb], (instregex "C(G)?R$")>;
+def : InstRW<[FXb], (instregex "CIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXb], (instregex "CLGR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXb], (instregex "CLIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXb], (instregex "CLR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
+def : InstRW<[FXb], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+             (instregex "CDS(Y)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
+             (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXb], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
+              (instregex "TBEGIN(C|_nofloat)?$")>;
+
+// Transaction end
+def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[FXa], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXa], (instregex "AEXT128_64$")>;
+def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXb, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
+             (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
+             (instregex "STCKE$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[VecXsPm], (instregex "LER$")>;
+def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
+             (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[VecBF], (instregex "LDEBR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
+def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXa, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+// --------------------------------- Vector --------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// Vector: Move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
+def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Immediate instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VZERO$")>;
+def : InstRW<[VecXsPm], (instregex "VONE$")>;
+def : InstRW<[VecXsPm], (instregex "VGBM$")>;
+def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Loads
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
+def : InstRW<[LSU], (instregex "VL(32|64)$")>;
+def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H)?$")>;
+def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+              (instregex "VLM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Stores
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
+              (instregex "VSTM$")>;
+def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Selects and permutes
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPERM$")>;
+def : InstRW<[VecXsPm], (instregex "VPDI$")>;
+def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VN(C|O)?$")>;
+def : InstRW<[VecXsPm], (instregex "VO$")>;
+def : InstRW<[VecMul], (instregex "VCKSM$")>;
+def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VX$")>;
+def : InstRW<[VecMul], (instregex "VGFM?$")>;
+def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VPOPCT$")>;
+
+def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
+def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>;
+def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
+def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecBF2], (instregex "VCD(G|GB|LG|LGB)$")>;
+def : InstRW<[VecBF], (instregex "WCD(GB|LGB)$")>;
+def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>;
+def : InstRW<[VecBF2], (instregex "VFADB$")>;
+def : InstRW<[VecBF], (instregex "WFADB$")>;
+def : InstRW<[VecBF2], (instregex "VCGDB$")>;
+def : InstRW<[VecBF], (instregex "WCGDB$")>;
+def : InstRW<[VecBF2], (instregex "VF(I|M|A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VF(I|M|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WF(I|M|S)DB$")>;
+def : InstRW<[VecBF2], (instregex "VCLGDB$")>;
+def : InstRW<[VecBF], (instregex "WCLGDB$")>;
+def : InstRW<[VecXsPm], (instregex "VFL(C|N|P)DB$")>;
+def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)DB$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI(DB)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WFTCIDB$")>;
+def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>;
+def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
+
+// divide / square root
+def : InstRW<[VecFPd], (instregex "VFD$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
+def : InstRW<[VecFPd], (instregex "VFSQ$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>;
+def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
+def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LEFR$")>;
+def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: String instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
+def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
new file mode 100644
index 000000000000..a950e54e7601
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -0,0 +1,769 @@
+//=- SystemZScheduleZ196.td - SystemZ Scheduling Definitions ---*- tblgen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z196 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z196Model : SchedMachineModel {
+
+    let UnsupportedFeatures = Arch9UnsupportedFeatures.List;
+    
+    let IssueWidth = 5;
+    let MicroOpBufferSize = 40;     // Issue queues
+    let LoadLatency = 1;            // Optimistic load latency.
+
+    let PostRAScheduler = 1;
+
+    // Extra cycles for a mispredicted branch.
+    let MispredictPenalty = 16;
+}
+
+let SchedModel = Z196Model in  {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of same file.
+def : WriteRes<GroupAlone, []> {
+  let NumMicroOps = 0;
+  let BeginGroup  = 1;
+  let EndGroup    = 1;
+}
+def : WriteRes<EndGroup, []> {
+  let NumMicroOps = 0;
+  let EndGroup    = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Execution units.
+def Z196_FXUnit : ProcResource<2>;
+def Z196_LSUnit : ProcResource<2>;
+def Z196_FPUnit : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXU,       [Z196_FXUnit]> { let Latency = 1; }
+def : WriteRes<LSU,       [Z196_LSUnit]> { let Latency = 4; }
+def : WriteRes<LSU_lat1,  [Z196_LSUnit]> { let Latency = 1; }
+def : WriteRes<FPU,       [Z196_FPUnit]> { let Latency = 8; }
+def : WriteRes<FPU2,      [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCT(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+             (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+             (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+             (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[LSU, EndGroup], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[LSU, FXU, FXU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
+def : InstRW<[LSU_lat1, EndGroup], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo 
+def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+
+// Load and test
+def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2, EndGroup], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat6, EndGroup], (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "STOC(G)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXU], (instregex "LTGFR$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLG(C|F|H|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+             (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of 3 ops)
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
+             (instregex "STM(H|Y|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address
+def : InstRW<[FXU], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "IILF(64)?$")>;
+def : InstRW<[FXU], (instregex "IILH(64)?$")>;
+def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
+def : InstRW<[FXU], (instregex "AIH$")>;
+def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "AGFI$")>;
+def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AGR(K)?$")>;
+def : InstRW<[FXU], (instregex "AHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU], (instregex "ALGHSIK$")>;
+def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXU], (instregex "ALR(K)?$")>;
+def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+
+// Logical addition with carry
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AGF$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SH(Y)?$")>;
+def : InstRW<[FXU], (instregex "SGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLFI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLR(K)?$")>;
+def : InstRW<[FXU], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "SGF$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "NGR(K)?$")>;
+def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "NILF(64)?$")>;
+def : InstRW<[FXU], (instregex "NILH(64)?$")>;
+def : InstRW<[FXU], (instregex "NILL(64)?$")>;
+def : InstRW<[FXU], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "OGR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "OILF(64)?$")>;
+def : InstRW<[FXU], (instregex "OILH(64)?$")>;
+def : InstRW<[FXU], (instregex "OILL(64)?$")>;
+def : InstRW<[FXU], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXU], (instregex "XIFMux$")>;
+def : InstRW<[FXU], (instregex "XGR(K)?$")>;
+def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "XILF(64)?$")>;
+def : InstRW<[FXU], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DSG(F)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DSG(F)?$")>;
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DL(G)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXU], (instregex "RISBG(32)?$")>;
+def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXU], (instregex "C(G)?R$")>;
+def : InstRW<[FXU], (instregex "CIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "CLGR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXU], (instregex "CLIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXU], (instregex "CLR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXU, FXU, LSU, Lat6, Lat2, GroupAlone], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, GroupAlone], (instregex "PFD(RL)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+             (instregex "CDS(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+             (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXU], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXU], (instregex "AEXT128_64$")>;
+def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
+def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[FXU], (instregex "LER$")>;
+def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
+def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
+             (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[FPU], (instregex "LDEBR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A)?$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
+def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
+def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
new file mode 100644
index 000000000000..8ab6c826f1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -0,0 +1,807 @@
+//=- SystemZScheduleZEC12.td - SystemZ Scheduling Definitions --*- tblgen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ZEC12 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def ZEC12Model : SchedMachineModel {
+
+    let UnsupportedFeatures = Arch10UnsupportedFeatures.List;
+    
+    let IssueWidth = 5;
+    let MicroOpBufferSize = 40;     // Issue queues
+    let LoadLatency = 1;            // Optimistic load latency.
+
+    let PostRAScheduler = 1;
+
+    // Extra cycles for a mispredicted branch.
+    let MispredictPenalty = 16;
+}
+
+let SchedModel = ZEC12Model in  {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of same file.
+def : WriteRes<GroupAlone, []> {
+  let NumMicroOps = 0;
+  let BeginGroup  = 1;
+  let EndGroup    = 1;
+}
+def : WriteRes<EndGroup, []> {
+  let NumMicroOps = 0;
+  let EndGroup    = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
+// Execution units.
+def ZEC12_FXUnit : ProcResource<2>;
+def ZEC12_LSUnit : ProcResource<2>;
+def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_VBUnit : ProcResource<1>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXU,      [ZEC12_FXUnit]> { let Latency = 1; }
+def : WriteRes<LSU,      [ZEC12_LSUnit]> { let Latency = 4; }
+def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
+def : WriteRes<FPU,  [ZEC12_FPUnit]> { let Latency = 8; }
+def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
+def : WriteRes<VBU,  [ZEC12_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[LSU, Lat4], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[LSU, Lat4], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXU, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXU, FXU, FXU, LSU, Lat7, GroupAlone],
+             (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXU], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5, GroupAlone],
+             (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXU], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXU], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXU, FXU, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[LSU_lat1, EndGroup], (instregex "Return$")>;
+def : InstRW<[LSU_lat1], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo 
+def : InstRW<[FXU], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXU], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXU, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXU], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXU], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXU], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXU], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXU], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXU], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXU], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LR(Mux)?$")>;
+
+// Load and trap
+def : InstRW<[FXU, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[FXU, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXU], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXU, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves.
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LOC(G)?R(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "LOC(G)?(Asm.*)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STOC(G)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXU], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXU], (instregex "LTGFR$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXU], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and trap
+def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+             (instregex "LM(H|Y|G)?$")>;
+
+// Store multiple (estimated average of 3 ops)
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
+             (instregex "STM(H|Y|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address
+def : InstRW<[FXU], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute and Negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat2], (instregex "LP(G)?R$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXU, Lat2], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXU], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXU], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "IILF(64)?$")>;
+def : InstRW<[FXU], (instregex "IILH(64)?$")>;
+def : InstRW<[FXU], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
+def : InstRW<[FXU], (instregex "AIH$")>;
+def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXU], (instregex "AGFI$")>;
+def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AGR(K)?$")>;
+def : InstRW<[FXU], (instregex "AHI(K)?$")>;
+def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU], (instregex "ALGHSIK$")>;
+def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXU], (instregex "ALR(K)?$")>;
+def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+
+// Logical addition with carry
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "AGF$")>;
+def : InstRW<[FXU, Lat2], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "SH(Y)?$")>;
+def : InstRW<[FXU], (instregex "SGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLFI$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXU], (instregex "SLR(K)?$")>;
+def : InstRW<[FXU], (instregex "SR(K)?$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "SGF$")>;
+def : InstRW<[FXU, Lat2], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "NGR(K)?$")>;
+def : InstRW<[FXU], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXU], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "NILF(64)?$")>;
+def : InstRW<[FXU], (instregex "NILH(64)?$")>;
+def : InstRW<[FXU], (instregex "NILL(64)?$")>;
+def : InstRW<[FXU], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXU], (instregex "OGR(K)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXU], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXU], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXU], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXU], (instregex "OILF(64)?$")>;
+def : InstRW<[FXU], (instregex "OILH(64)?$")>;
+def : InstRW<[FXU], (instregex "OILL(64)?$")>;
+def : InstRW<[FXU], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXU], (instregex "XIFMux$")>;
+def : InstRW<[FXU], (instregex "XGR(K)?$")>;
+def : InstRW<[FXU], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXU], (instregex "XILF(64)?$")>;
+def : InstRW<[FXU], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat10], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
+def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
+def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
+def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DSG(F)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DSG(F)?$")>;
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DL(G)?R$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+              (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+
+// Rotate
+def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXU], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[FXU], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXU], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXU], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXU], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXU], (instregex "C(G)?R$")>;
+def : InstRW<[FXU], (instregex "CIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXU], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXU], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXU], (instregex "CLGR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXU], (instregex "CLIH$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXU], (instregex "CLR$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+
+// Compare halfword
+def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXU, LSU, Lat6], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXU, LSU, Lat6], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXU], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXU], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
+def : InstRW<[LSU], (instregex "BP(R)?P$")>;
+def : InstRW<[FXU], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXU, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, LSU, Lat10, GroupAlone],
+             (instregex "CDS(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
+             (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXU, Lat5, GroupAlone], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXU, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXU, FXU, LSU, Lat8, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXU], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[LSU, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode.
+def : InstRW<[FXU, LSU, Lat5, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[LSU, LSU, FXU, FXU, FXU, FXU, FXU, Lat15, GroupAlone],
+              (instregex "TBEGIN(C|_nofloat)?$")>;
+
+// Transaction end
+def : InstRW<[LSU, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[FXU], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[FXU, LSU, Lat5], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXU], (instregex "AEXT128_64$")>;
+def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+
+// String instructions
+def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+
+// Move with key
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+
+// Extract CPU Time
+def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+
+// Execute
+def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
+
+// Program return
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+
+// Inline assembly
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
+             (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+
+// Store real address
+def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "SelectF(32|64|128)$")>;
+def : InstRW<[FXU], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXU], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXU], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[FXU], (instregex "LER$")>;
+def : InstRW<[FXU], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXU, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)BR$")>;
+def : InstRW<[FPU], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone],
+             (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[FXU, FXU, Lat5, GroupAlone], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "LEDBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[FPU, FPU, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[FPU, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[FPU], (instregex "LDEBR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)BR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)BR(A?)$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XBR(A?)$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLF(E|D)BR$")>;
+def : InstRW<[FXU, FPU, Lat11, GroupAlone], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXU], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXU], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIEBR(A)?$")>;
+def : InstRW<[FPU], (instregex "FIDBR(A)?$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[FPU], (instregex "A(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[FPU], (instregex "S(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
+def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+
+// Test Data Class
+def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..657482504045
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -0,0 +1,275 @@
+//===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-selectiondag-info"
+
+// Decide whether it is best to use a loop or straight-line code for
+// a block operation of Size bytes with source address Src and destination
+// address Dest.  Sequence is the opcode to use for straight-line code
+// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
+// Return the chain for the completed operation.
+static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
+                          unsigned Loop, SDValue Chain, SDValue Dst,
+                          SDValue Src, uint64_t Size) {
+  EVT PtrVT = Src.getValueType();
+  // The heuristic we use is to prefer loops for anything that would
+  // require 7 or more MVCs.  With these kinds of sizes there isn't
+  // much to choose between straight-line code and looping code,
+  // since the time will be dominated by the MVCs themselves.
+  // However, the loop has 4 or 5 instructions (depending on whether
+  // the base addresses can be proved equal), so there doesn't seem
+  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
+  // the range (5 * 256, 6 * 256) will need another instruction after
+  // the loop, so it doesn't seem worth using a loop then either.
+  // The next value up, 6 * 256, can be implemented in the same
+  // number of straight-line MVCs as 6 * 256 - 1.
+  if (Size > 6 * 256)
+    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
+                       DAG.getConstant(Size, DL, PtrVT),
+                       DAG.getConstant(Size / 256, DL, PtrVT));
+  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size, DL, PtrVT));
+}
+
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  if (IsVolatile)
+    return SDValue();
+
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, Dst, Src, CSize->getZExtValue());
+  return SDValue();
+}
+
+// Handle a memset of 1, 2, 4 or 8 bytes with the operands given by
+// Chain, Dst, ByteVal and Size.  These cases are expected to use
+// MVI, MVHHI, MVHI and MVGHI respectively.
+static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                           SDValue Dst, uint64_t ByteVal, uint64_t Size,
+                           unsigned Align, MachinePointerInfo DstPtrInfo) {
+  uint64_t StoreVal = ByteVal;
+  for (unsigned I = 1; I < Size; ++I)
+    StoreVal |= ByteVal << (I * 8);
+  return DAG.getStore(
+      Chain, DL, DAG.getConstant(StoreVal, DL, MVT::getIntegerVT(Size * 8)),
+      Dst, DstPtrInfo, Align);
+}
+
+SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
+    SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  EVT PtrVT = Dst.getValueType();
+
+  if (IsVolatile)
+    return SDValue();
+
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    if (Bytes == 0)
+      return SDValue();
+    if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+      // Handle cases that can be done using at most two of
+      // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
+      // used if ByteVal is all zeros or all ones; in other casees,
+      // we can move at most 2 halfwords.
+      uint64_t ByteVal = CByte->getZExtValue();
+      if (ByteVal == 0 || ByteVal == 255 ?
+          Bytes <= 16 && countPopulation(Bytes) <= 2 :
+          Bytes <= 4) {
+        unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
+        unsigned Size2 = Bytes - Size1;
+        SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
+                                     Align, DstPtrInfo);
+        if (Size2 == 0)
+          return Chain1;
+        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                          DAG.getConstant(Size1, DL, PtrVT));
+        DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
+        SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
+                                     std::min(Align, Size1), DstPtrInfo);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    } else {
+      // Handle one and two bytes using STC.
+      if (Bytes <= 2) {
+        SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+        if (Bytes == 1)
+          return Chain1;
+        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, DL, PtrVT));
+        SDValue Chain2 =
+            DAG.getStore(Chain, DL, Byte, Dst2, DstPtrInfo.getWithOffset(1),
+                         /* Alignment = */ 1);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    }
+    assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
+
+    // Handle the special case of a memset of 0, which can use XC.
+    auto *CByte = dyn_cast<ConstantSDNode>(Byte);
+    if (CByte && CByte->getZExtValue() == 0)
+      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+                        Chain, Dst, Dst, Bytes);
+
+    // Copy the byte to the first location and then use MVC to copy
+    // it to the rest.
+    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, DL, PtrVT));
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, DstPlus1, Dst, Bytes - 1);
+  }
+  return SDValue();
+}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
+// deciding whether to use a loop or straight-line code.
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  EVT PtrVT = Src1.getValueType();
+  // A two-CLC sequence is a clear win over a loop, not least because it
+  // needs only one branch.  A three-CLC sequence needs the same number
+  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
+  // lengths greater than 768 bytes.  It seems relatively likely that
+  // a difference will be found within the first 768 bytes, so we just
+  // optimize for the smallest number of branch instructions, in order
+  // to avoid polluting the prediction buffer too much.  A loop only ever
+  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
+  if (Size > 3 * 256)
+    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
+                       DAG.getConstant(Size, DL, PtrVT),
+                       DAG.getConstant(Size / 256, DL, PtrVT));
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size, DL, PtrVT));
+}
+
+// Convert the current CC value into an integer that is 0 if CC == 0,
+// less than zero if CC == 1 and greater than zero if CC >= 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28
+// of an integer and clears bits 30 and 31.
+static SDValue addIPMSequence(const SDLoc &DL, SDValue Glue,
+                              SelectionDAG &DAG) {
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                            DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
+  SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+                             DAG.getConstant(31, DL, MVT::i32));
+  return ROTL;
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemcmp(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+    SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo,
+    MachinePointerInfo Op2PtrInfo) const {
+  if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    assert(Bytes > 0 && "Caller should have handled 0-size case");
+    Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+    SDValue Glue = Chain.getValue(1);
+    return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    SDValue Char, SDValue Length, MachinePointerInfo SrcPtrInfo) const {
+  // Use SRST to find the character.  End is its address on success.
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+  Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+  Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
+                     DAG.getConstant(255, DL, MVT::i32));
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, Char);
+  Chain = End.getValue(1);
+  SDValue Glue = End.getValue(2);
+
+  // Now select between End and null, depending on whether the character
+  // was found.
+  SDValue Ops[] = {End, DAG.getConstant(0, DL, PtrVT),
+                   DAG.getConstant(SystemZ::CCMASK_SRST, DL, MVT::i32),
+                   DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, DL, MVT::i32),
+                   Glue};
+  VTs = DAG.getVTList(PtrVT, MVT::Glue);
+  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
+  return std::make_pair(End, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+    SDValue Src, MachinePointerInfo DestPtrInfo, MachinePointerInfo SrcPtrInfo,
+    bool isStpcpy) const {
+  SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+  SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
+                                DAG.getConstant(0, DL, MVT::i32));
+  return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrcmp(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1,
+    SDValue Src2, MachinePointerInfo Op1PtrInfo,
+    MachinePointerInfo Op2PtrInfo) const {
+  SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+                               DAG.getConstant(0, DL, MVT::i32));
+  Chain = Unused.getValue(1);
+  SDValue Glue = Chain.getValue(2);
+  return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+}
+
+// Search from Src for a null character, stopping once Src reaches Limit.
+// Return a pair of values, the first being the number of nonnull characters
+// and the second being the out chain.
+//
+// This can be used for strlen by setting Limit to 0.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG,
+                                                    const SDLoc &DL,
+                                                    SDValue Chain, SDValue Src,
+                                                    SDValue Limit) {
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, DAG.getConstant(0, DL, MVT::i32));
+  Chain = End.getValue(1);
+  SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
+  return std::make_pair(Len, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrlen(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, DL, PtrVT));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForStrnlen(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src,
+    SDValue MaxLength, MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
+  return getBoundedStrlen(DAG, DL, Chain, Src, Limit);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
new file mode 100644
index 000000000000..93cd970c30c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -0,0 +1,74 @@
+//===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SystemZ subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  explicit SystemZSelectionDAGInfo() = default;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool IsVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
+                                  SDValue Chain, SDValue Dst, SDValue Byte,
+                                  SDValue Size, unsigned Align, bool IsVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2, SDValue Size,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const override;
+
+  std::pair<SDValue, SDValue> EmitTargetCodeForStrcpy(
+      SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dest,
+      SDValue Src, MachinePointerInfo DestPtrInfo,
+      MachinePointerInfo SrcPtrInfo, bool isStpcpy) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcmp(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                          SDValue Src,
+                          MachinePointerInfo SrcPtrInfo) const override;
+
+  std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrnlen(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                           SDValue Src, SDValue MaxLength,
+                           MachinePointerInfo SrcPtrInfo) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 000000000000..83882fc0310a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,285 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms.  For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+namespace {
+class SystemZShortenInst : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZShortenInst(const SystemZTargetMachine &tm);
+
+  StringRef getPassName() const override {
+    return "SystemZ Instruction Shortening";
+  }
+
+  bool processBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &F) override;
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH);
+  bool shortenOn0(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn01(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn001(MachineInstr &MI, unsigned Opcode);
+  bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode);
+  bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
+
+  const SystemZInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  LivePhysRegs LiveRegs;
+};
+
+char SystemZShortenInst::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+  return new SystemZShortenInst(TM);
+}
+
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+  : MachineFunctionPass(ID), TII(nullptr) {}
+
+// Tie operands if MI has become a two-address instruction.
+static void tieOpsIfNeeded(MachineInstr &MI) {
+  if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+      !MI.getOperand(0).isTied())
+    MI.tieOperands(0, 1);
+}
+
+// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
+// are the halfword immediate loads for the same word.  Try to use one of them
+// instead of IIxF.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned LLIxL,
+                                    unsigned LLIxH) {
+  unsigned Reg = MI.getOperand(0).getReg();
+  // The new opcode will clear the other half of the GR64 reg, so
+  // cancel if that is live.
+  unsigned thisSubRegIdx =
+      (SystemZ::GRH32BitRegClass.contains(Reg) ? SystemZ::subreg_h32
+                                               : SystemZ::subreg_l32);
+  unsigned otherSubRegIdx =
+      (thisSubRegIdx == SystemZ::subreg_l32 ? SystemZ::subreg_h32
+                                            : SystemZ::subreg_l32);
+  unsigned GR64BitReg =
+      TRI->getMatchingSuperReg(Reg, thisSubRegIdx, &SystemZ::GR64BitRegClass);
+  unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx);
+  if (LiveRegs.contains(OtherReg))
+    return false;
+
+  uint64_t Imm = MI.getOperand(1).getImm();
+  if (SystemZ::isImmLL(Imm)) {
+    MI.setDesc(TII->get(LLIxL));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    return true;
+  }
+  if (SystemZ::isImmLH(Imm)) {
+    MI.setDesc(TII->get(LLIxH));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    MI.getOperand(1).setImm(Imm >> 16);
+    return true;
+  }
+  return false;
+}
+
+// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding.
+bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    return true;
+  }
+  return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0 and 1 have a
+// 4-bit encoding.
+bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    return true;
+  }
+  return false;
+}
+
+// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a
+// 4-bit encoding and if operands 0 and 1 are tied. Also ties op 0
+// with op 1, if MI becomes 2-address.
+bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      MI.getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+      SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) {
+    MI.setDesc(TII->get(Opcode));
+    tieOpsIfNeeded(MI);
+    return true;
+  }
+  return false;
+}
+
+// Calls shortenOn001 if CCLive is false. CC def operand is added in
+// case of success.
+bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, unsigned Opcode) {
+  if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) {
+    MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+      .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+    return true;
+  }
+  return false;
+}
+
+// MI is a vector-style conversion instruction with the operand order:
+// destination, source, exact-suppress, rounding-mode.  If both registers
+// have a 4-bit encoding then change it to Opcode, which has operand order:
+// destination, rouding-mode, source, exact-suppress.
+bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
+  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
+      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
+    MachineOperand Dest(MI.getOperand(0));
+    MachineOperand Src(MI.getOperand(1));
+    MachineOperand Suppress(MI.getOperand(2));
+    MachineOperand Mode(MI.getOperand(3));
+    MI.RemoveOperand(3);
+    MI.RemoveOperand(2);
+    MI.RemoveOperand(1);
+    MI.RemoveOperand(0);
+    MI.setDesc(TII->get(Opcode));
+    MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+      .addOperand(Dest)
+      .addOperand(Mode)
+      .addOperand(Src)
+      .addOperand(Suppress);
+    return true;
+  }
+  return false;
+}
+
+// Process all instructions in MBB.  Return true if something changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  // Set up the set of live registers at the end of MBB (live out)
+  LiveRegs.clear();
+  LiveRegs.addLiveOuts(MBB);
+
+  // Iterate backwards through the block looking for instructions to change.
+  for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    switch (MI.getOpcode()) {
+    case SystemZ::IILF:
+      Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH);
+      break;
+
+    case SystemZ::IIHF:
+      Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH);
+      break;
+
+    case SystemZ::WFADB:
+      Changed |= shortenOn001AddCC(MI, SystemZ::ADBR);
+      break;
+
+    case SystemZ::WFDDB:
+      Changed |= shortenOn001(MI, SystemZ::DDBR);
+      break;
+
+    case SystemZ::WFIDB:
+      Changed |= shortenFPConv(MI, SystemZ::FIDBRA);
+      break;
+
+    case SystemZ::WLDEB:
+      Changed |= shortenOn01(MI, SystemZ::LDEBR);
+      break;
+
+    case SystemZ::WLEDB:
+      Changed |= shortenFPConv(MI, SystemZ::LEDBRA);
+      break;
+
+    case SystemZ::WFMDB:
+      Changed |= shortenOn001(MI, SystemZ::MDBR);
+      break;
+
+    case SystemZ::WFLCDB:
+      Changed |= shortenOn01(MI, SystemZ::LCDFR);
+      break;
+
+    case SystemZ::WFLNDB:
+      Changed |= shortenOn01(MI, SystemZ::LNDFR);
+      break;
+
+    case SystemZ::WFLPDB:
+      Changed |= shortenOn01(MI, SystemZ::LPDFR);
+      break;
+
+    case SystemZ::WFSQDB:
+      Changed |= shortenOn01(MI, SystemZ::SQDBR);
+      break;
+
+    case SystemZ::WFSDB:
+      Changed |= shortenOn001AddCC(MI, SystemZ::SDBR);
+      break;
+
+    case SystemZ::WFCDB:
+      Changed |= shortenOn01(MI, SystemZ::CDBR);
+      break;
+
+    case SystemZ::VL32:
+      // For z13 we prefer LDE over LE to avoid partial register dependencies.
+      Changed |= shortenOn0(MI, SystemZ::LDE32);
+      break;
+
+    case SystemZ::VST32:
+      Changed |= shortenOn0(MI, SystemZ::STE);
+      break;
+
+    case SystemZ::VL64:
+      Changed |= shortenOn0(MI, SystemZ::LD);
+      break;
+
+    case SystemZ::VST64:
+      Changed |= shortenOn0(MI, SystemZ::STD);
+      break;
+    }
+
+    LiveRegs.stepBackward(MI);
+  }
+
+  return Changed;
+}
+
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+  if (skipFunction(*F.getFunction()))
+    return false;
+
+  const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
+  LiveRegs.init(*TRI);
+
+  bool Changed = false;
+  for (auto &MBB : F)
+    Changed |= processBlock(MBB);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 000000000000..ce07ea3318a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,64 @@
+//===-- SystemZSubtarget.cpp - SystemZ subtarget information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/IR/GlobalValue.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SystemZGenSubtargetInfo.inc"
+
+// Pin the vtable to this file.
+void SystemZSubtarget::anchor() {}
+
+SystemZSubtarget &
+SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+  return *this;
+}
+
+SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
+                                   const std::string &FS,
+                                   const TargetMachine &TM)
+    : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
+      HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+      HasPopulationCount(false), HasFastSerialization(false),
+      HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+      HasExecutionHint(false), HasLoadAndTrap(false),
+      HasTransactionalExecution(false), HasProcessorAssist(false),
+      HasVector(false), HasLoadStoreOnCond2(false),
+      HasLoadAndZeroRightmostByte(false),
+      TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      TLInfo(TM, *this), TSInfo(), FrameLowering() {}
+
+bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
+                                       CodeModel::Model CM) const {
+  // PC32DBL accesses require the low bit to be clear.  Note that a zero
+  // value selects the default alignment and is therefore OK.
+  if (GV->getAlignment() == 1)
+    return false;
+
+  // For the small model, all locally-binding symbols are in range.
+  if (CM == CodeModel::Small)
+    return TLInfo.getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+  // For Medium and above, assume that the symbol is not within the 4GB range.
+  // Taking the address of locally-defined text would be OK, but that
+  // case isn't easy to detect.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 000000000000..cdb61327a16a
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,146 @@
+//===-- SystemZSubtarget.h - SystemZ subtarget information -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
+
+#include "SystemZFrameLowering.h"
+#include "SystemZISelLowering.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SystemZGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+
+class SystemZSubtarget : public SystemZGenSubtargetInfo {
+  virtual void anchor();
+protected:
+  bool HasDistinctOps;
+  bool HasLoadStoreOnCond;
+  bool HasHighWord;
+  bool HasFPExtension;
+  bool HasPopulationCount;
+  bool HasFastSerialization;
+  bool HasInterlockedAccess1;
+  bool HasMiscellaneousExtensions;
+  bool HasExecutionHint;
+  bool HasLoadAndTrap;
+  bool HasTransactionalExecution;
+  bool HasProcessorAssist;
+  bool HasVector;
+  bool HasLoadStoreOnCond2;
+  bool HasLoadAndZeroRightmostByte;
+
+private:
+  Triple TargetTriple;
+  SystemZInstrInfo InstrInfo;
+  SystemZTargetLowering TLInfo;
+  SystemZSelectionDAGInfo TSInfo;
+  SystemZFrameLowering FrameLowering;
+
+  SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
+                                                    StringRef FS);
+public:
+  SystemZSubtarget(const Triple &TT, const std::string &CPU,
+                   const std::string &FS, const TargetMachine &TM);
+
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const SystemZInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const SystemZRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+  const SystemZTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+
+  // This is important for reducing register pressure in vector code.
+  bool useAA() const override { return true; }
+
+  // Always enable the early if-conversion pass.
+  bool enableEarlyIfConversion() const override { return true; }
+
+  // Automatically generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  // Return true if the target has the distinct-operands facility.
+  bool hasDistinctOps() const { return HasDistinctOps; }
+
+  // Return true if the target has the load/store-on-condition facility.
+  bool hasLoadStoreOnCond() const { return HasLoadStoreOnCond; }
+
+  // Return true if the target has the load/store-on-condition facility 2.
+  bool hasLoadStoreOnCond2() const { return HasLoadStoreOnCond2; }
+
+  // Return true if the target has the high-word facility.
+  bool hasHighWord() const { return HasHighWord; }
+
+  // Return true if the target has the floating-point extension facility.
+  bool hasFPExtension() const { return HasFPExtension; }
+
+  // Return true if the target has the population-count facility.
+  bool hasPopulationCount() const { return HasPopulationCount; }
+
+  // Return true if the target has the fast-serialization facility.
+  bool hasFastSerialization() const { return HasFastSerialization; }
+
+  // Return true if the target has interlocked-access facility 1.
+  bool hasInterlockedAccess1() const { return HasInterlockedAccess1; }
+
+  // Return true if the target has the miscellaneous-extensions facility.
+  bool hasMiscellaneousExtensions() const {
+    return HasMiscellaneousExtensions;
+  }
+
+  // Return true if the target has the execution-hint facility.
+  bool hasExecutionHint() const { return HasExecutionHint; }
+
+  // Return true if the target has the load-and-trap facility.
+  bool hasLoadAndTrap() const { return HasLoadAndTrap; }
+
+  // Return true if the target has the transactional-execution facility.
+  bool hasTransactionalExecution() const { return HasTransactionalExecution; }
+
+  // Return true if the target has the processor-assist facility.
+  bool hasProcessorAssist() const { return HasProcessorAssist; }
+
+  // Return true if the target has the load-and-zero-rightmost-byte facility.
+  bool hasLoadAndZeroRightmostByte() const {
+    return HasLoadAndZeroRightmostByte;
+  }
+
+  // Return true if the target has the vector facility.
+  bool hasVector() const { return HasVector; }
+
+  // Return true if GV can be accessed using LARL for reloc model RM
+  // and code model CM.
+  bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
new file mode 100644
index 000000000000..96a9ef82c125
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -0,0 +1,382 @@
+//===-- SystemZTDC.cpp - Utilize Test Data Class instruction --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for instructions that can be replaced by a Test Data Class
+// instruction, and replaces them when profitable.
+//
+// Roughly, the following rules are recognized:
+//
+// 1: fcmp pred X, 0 -> tdc X, mask
+// 2: fcmp pred X, +-inf -> tdc X, mask
+// 3: fcmp pred X, +-minnorm -> tdc X, mask
+// 4: tdc (fabs X), mask -> tdc X, newmask
+// 5: icmp slt (bitcast float X to int), 0 -> tdc X, mask [ie. signbit]
+// 6: icmp sgt (bitcast float X to int), -1 -> tdc X, mask
+// 7: icmp ne/eq (call @llvm.s390.tdc.*(X, mask)) -> tdc X, mask/~mask
+// 8: and i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 & M2)
+// 9: or i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 | M2)
+// 10: xor i1 (tdc X, M1), (tdc X, M2) -> tdc X, (M1 ^ M2)
+//
+// The pass works in 4 steps:
+//
+// 1. All fcmp and icmp instructions in a function are checked for a match
+//    with rules 1-3 and 5-7.  Their TDC equivalents are stored in
+//    the ConvertedInsts mapping.  If the operand of a fcmp instruction is
+//    a fabs, it's also folded according to rule 4.
+// 2. All and/or/xor i1 instructions whose both operands have been already
+//    mapped are mapped according to rules 8-10.  LogicOpsWorklist is used
+//    as a queue of instructions to check.
+// 3. All mapped instructions that are considered worthy of conversion (ie.
+//    replacing them will actually simplify the final code) are replaced
+//    with a call to the s390.tdc intrinsic.
+// 4. All intermediate results of replaced instructions are removed if unused.
+//
+// Instructions that match rules 1-3 are considered unworthy of conversion
+// on their own (since a comparison instruction is superior), but are mapped
+// in the hopes of folding the result using rules 4 and 8-10 (likely removing
+// the original comparison in the process).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include <deque>
+#include <set>
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeSystemZTDCPassPass(PassRegistry&);
+}
+
+namespace {
+
+class SystemZTDCPass : public FunctionPass {
+public:
+  static char ID;
+  SystemZTDCPass() : FunctionPass(ID) {
+    initializeSystemZTDCPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+private:
+  // Maps seen instructions that can be mapped to a TDC, values are
+  // (TDC operand, TDC mask, worthy flag) triples.
+  MapVector<Instruction *, std::tuple<Value *, int, bool>> ConvertedInsts;
+  // The queue of and/or/xor i1 instructions to be potentially folded.
+  std::vector<BinaryOperator *> LogicOpsWorklist;
+  // Instructions matched while folding, to be removed at the end if unused.
+  std::set<Instruction *> PossibleJunk;
+
+  // Tries to convert a fcmp instruction.
+  void convertFCmp(CmpInst &I);
+
+  // Tries to convert an icmp instruction.
+  void convertICmp(CmpInst &I);
+
+  // Tries to convert an i1 and/or/xor instruction, whose both operands
+  // have been already converted.
+  void convertLogicOp(BinaryOperator &I);
+
+  // Marks an instruction as converted - adds it to ConvertedInsts and adds
+  // any and/or/xor i1 users to the queue.
+  void converted(Instruction *I, Value *V, int Mask, bool Worthy) {
+    ConvertedInsts[I] = std::make_tuple(V, Mask, Worthy);
+    auto &M = *I->getFunction()->getParent();
+    auto &Ctx = M.getContext();
+    for (auto *U : I->users()) {
+      auto *LI = dyn_cast<BinaryOperator>(U);
+      if (LI && LI->getType() == Type::getInt1Ty(Ctx) &&
+          (LI->getOpcode() == Instruction::And ||
+           LI->getOpcode() == Instruction::Or ||
+           LI->getOpcode() == Instruction::Xor)) {
+        LogicOpsWorklist.push_back(LI);
+      }
+    }
+  }
+};
+
+} // end anonymous namespace
+
+char SystemZTDCPass::ID = 0;
+INITIALIZE_PASS(SystemZTDCPass, "systemz-tdc",
+                "SystemZ Test Data Class optimization", false, false)
+
+FunctionPass *llvm::createSystemZTDCPass() {
+  return new SystemZTDCPass();
+}
+
+void SystemZTDCPass::convertFCmp(CmpInst &I) {
+  Value *Op0 = I.getOperand(0);
+  auto *Const = dyn_cast<ConstantFP>(I.getOperand(1));
+  auto Pred = I.getPredicate();
+  // Only comparisons with consts are interesting.
+  if (!Const)
+    return;
+  // Compute the smallest normal number (and its negation).
+  auto &Sem = Op0->getType()->getFltSemantics();
+  APFloat Smallest = APFloat::getSmallestNormalized(Sem);
+  APFloat NegSmallest = Smallest;
+  NegSmallest.changeSign();
+  // Check if Const is one of our recognized consts.
+  int WhichConst;
+  if (Const->isZero()) {
+    // All comparisons with 0 can be converted.
+    WhichConst = 0;
+  } else if (Const->isInfinity()) {
+    // Likewise for infinities.
+    WhichConst = Const->isNegative() ? 2 : 1;
+  } else if (Const->isExactlyValue(Smallest)) {
+    // For Smallest, we cannot do EQ separately from GT.
+    if ((Pred & CmpInst::FCMP_OGE) != CmpInst::FCMP_OGE &&
+        (Pred & CmpInst::FCMP_OGE) != 0)
+      return;
+    WhichConst = 3;
+  } else if (Const->isExactlyValue(NegSmallest)) {
+    // Likewise for NegSmallest, we cannot do EQ separately from LT.
+    if ((Pred & CmpInst::FCMP_OLE) != CmpInst::FCMP_OLE &&
+        (Pred & CmpInst::FCMP_OLE) != 0)
+      return;
+    WhichConst = 4;
+  } else {
+    // Not one of our special constants.
+    return;
+  }
+  // Partial masks to use for EQ, GT, LT, UN comparisons, respectively.
+  static const int Masks[][4] = {
+    { // 0
+      SystemZ::TDCMASK_ZERO,              // eq
+      SystemZ::TDCMASK_POSITIVE,          // gt
+      SystemZ::TDCMASK_NEGATIVE,          // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // inf
+      SystemZ::TDCMASK_INFINITY_PLUS,     // eq
+      0,                                  // gt
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_NEGATIVE |
+       SystemZ::TDCMASK_NORMAL_PLUS |
+       SystemZ::TDCMASK_SUBNORMAL_PLUS),  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // -inf
+      SystemZ::TDCMASK_INFINITY_MINUS,    // eq
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_POSITIVE |
+       SystemZ::TDCMASK_NORMAL_MINUS |
+       SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+      0,                                  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // minnorm
+      0,                                  // eq (unsupported)
+      (SystemZ::TDCMASK_NORMAL_PLUS |
+       SystemZ::TDCMASK_INFINITY_PLUS),   // gt (actually ge)
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_NEGATIVE |
+       SystemZ::TDCMASK_SUBNORMAL_PLUS),  // lt
+      SystemZ::TDCMASK_NAN,               // un
+    },
+    { // -minnorm
+      0,                                  // eq (unsupported)
+      (SystemZ::TDCMASK_ZERO |
+       SystemZ::TDCMASK_POSITIVE |
+       SystemZ::TDCMASK_SUBNORMAL_MINUS), // gt
+      (SystemZ::TDCMASK_NORMAL_MINUS |
+       SystemZ::TDCMASK_INFINITY_MINUS),  // lt (actually le)
+      SystemZ::TDCMASK_NAN,               // un
+    }
+  };
+  // Construct the mask as a combination of the partial masks.
+  int Mask = 0;
+  if (Pred & CmpInst::FCMP_OEQ)
+    Mask |= Masks[WhichConst][0];
+  if (Pred & CmpInst::FCMP_OGT)
+    Mask |= Masks[WhichConst][1];
+  if (Pred & CmpInst::FCMP_OLT)
+    Mask |= Masks[WhichConst][2];
+  if (Pred & CmpInst::FCMP_UNO)
+    Mask |= Masks[WhichConst][3];
+  // A lone fcmp is unworthy of tdc conversion on its own, but may become
+  // worthy if combined with fabs.
+  bool Worthy = false;
+  if (CallInst *CI = dyn_cast<CallInst>(Op0)) {
+    Function *F = CI->getCalledFunction();
+    if (F && F->getIntrinsicID() == Intrinsic::fabs) {
+      // Fold with fabs - adjust the mask appropriately.
+      Mask &= SystemZ::TDCMASK_PLUS;
+      Mask |= Mask >> 1;
+      Op0 = CI->getArgOperand(0);
+      // A combination of fcmp with fabs is a win, unless the constant
+      // involved is 0 (which is handled by later passes).
+      Worthy = WhichConst != 0;
+      PossibleJunk.insert(CI);
+    }
+  }
+  converted(&I, Op0, Mask, Worthy);
+}
+
+void SystemZTDCPass::convertICmp(CmpInst &I) {
+  Value *Op0 = I.getOperand(0);
+  auto *Const = dyn_cast<ConstantInt>(I.getOperand(1));
+  auto Pred = I.getPredicate();
+  // All our icmp rules involve comparisons with consts.
+  if (!Const)
+    return;
+  if (auto *Cast = dyn_cast<BitCastInst>(Op0)) {
+    // Check for icmp+bitcast used for signbit.
+    if (!Cast->getSrcTy()->isFloatTy() &&
+        !Cast->getSrcTy()->isDoubleTy() &&
+        !Cast->getSrcTy()->isFP128Ty())
+      return;
+    Value *V = Cast->getOperand(0);
+    int Mask;
+    if (Pred == CmpInst::ICMP_SLT && Const->isZero()) {
+      // icmp slt (bitcast X), 0 - set if sign bit true
+      Mask = SystemZ::TDCMASK_MINUS;
+    } else if (Pred == CmpInst::ICMP_SGT && Const->isMinusOne()) {
+      // icmp sgt (bitcast X), -1 - set if sign bit false
+      Mask = SystemZ::TDCMASK_PLUS;
+    } else {
+      // Not a sign bit check.
+      return;
+    }
+    PossibleJunk.insert(Cast);
+    converted(&I, V, Mask, true);
+  } else if (auto *CI = dyn_cast<CallInst>(Op0)) {
+    // Check if this is a pre-existing call of our tdc intrinsic.
+    Function *F = CI->getCalledFunction();
+    if (!F || F->getIntrinsicID() != Intrinsic::s390_tdc)
+      return;
+    if (!Const->isZero())
+      return;
+    Value *V = CI->getArgOperand(0);
+    auto *MaskC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    // Bail if the mask is not a constant.
+    if (!MaskC)
+      return;
+    int Mask = MaskC->getZExtValue();
+    Mask &= SystemZ::TDCMASK_ALL;
+    if (Pred == CmpInst::ICMP_NE) {
+      // icmp ne (call llvm.s390.tdc(...)), 0 -> simple TDC
+    } else if (Pred == CmpInst::ICMP_EQ) {
+      // icmp eq (call llvm.s390.tdc(...)), 0 -> TDC with inverted mask
+      Mask ^= SystemZ::TDCMASK_ALL;
+    } else {
+      // An unknown comparison - ignore.
+      return;
+    }
+    PossibleJunk.insert(CI);
+    converted(&I, V, Mask, false);
+  }
+}
+
+void SystemZTDCPass::convertLogicOp(BinaryOperator &I) {
+  Value *Op0, *Op1;
+  int Mask0, Mask1;
+  bool Worthy0, Worthy1;
+  std::tie(Op0, Mask0, Worthy0) = ConvertedInsts[cast<Instruction>(I.getOperand(0))];
+  std::tie(Op1, Mask1, Worthy1) = ConvertedInsts[cast<Instruction>(I.getOperand(1))];
+  if (Op0 != Op1)
+    return;
+  int Mask;
+  switch (I.getOpcode()) {
+    case Instruction::And:
+      Mask = Mask0 & Mask1;
+      break;
+    case Instruction::Or:
+      Mask = Mask0 | Mask1;
+      break;
+    case Instruction::Xor:
+      Mask = Mask0 ^ Mask1;
+      break;
+    default:
+      llvm_unreachable("Unknown op in convertLogicOp");
+  }
+  converted(&I, Op0, Mask, true);
+}
+
+bool SystemZTDCPass::runOnFunction(Function &F) {
+  ConvertedInsts.clear();
+  LogicOpsWorklist.clear();
+  PossibleJunk.clear();
+
+  // Look for icmp+fcmp instructions.
+  for (auto &I : instructions(F)) {
+    if (I.getOpcode() == Instruction::FCmp)
+      convertFCmp(cast<CmpInst>(I));
+    else if (I.getOpcode() == Instruction::ICmp)
+      convertICmp(cast<CmpInst>(I));
+  }
+
+  // If none found, bail already.
+  if (ConvertedInsts.empty())
+    return false;
+
+  // Process the queue of logic instructions.
+  while (!LogicOpsWorklist.empty()) {
+    BinaryOperator *Op = LogicOpsWorklist.back();
+    LogicOpsWorklist.pop_back();
+    // If both operands mapped, and the instruction itself not yet mapped,
+    // convert it.
+    if (ConvertedInsts.count(dyn_cast<Instruction>(Op->getOperand(0))) &&
+        ConvertedInsts.count(dyn_cast<Instruction>(Op->getOperand(1))) &&
+        !ConvertedInsts.count(Op))
+      convertLogicOp(*Op);
+  }
+
+  // Time to actually replace the instructions.  Do it in the reverse order
+  // of finding them, since there's a good chance the earlier ones will be
+  // unused (due to being folded into later ones).
+  Module &M = *F.getParent();
+  auto &Ctx = M.getContext();
+  Value *Zero32 = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+  bool MadeChange = false;
+  for (auto &It : reverse(ConvertedInsts)) {
+    Instruction *I = It.first;
+    Value *V;
+    int Mask;
+    bool Worthy;
+    std::tie(V, Mask, Worthy) = It.second;
+    if (!I->user_empty()) {
+      // If used and unworthy of conversion, skip it.
+      if (!Worthy)
+        continue;
+      // Call the intrinsic, compare result with 0.
+      Value *TDCFunc = Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc,
+                                                 V->getType());
+      IRBuilder<> IRB(I);
+      Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask);
+      Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal});
+      Value *ICmp = IRB.CreateICmp(CmpInst::ICMP_NE, TDC, Zero32);
+      I->replaceAllUsesWith(ICmp);
+    }
+    // If unused, or used and converted, remove it.
+    I->eraseFromParent();
+    MadeChange = true;
+  }
+
+  if (!MadeChange)
+    return false;
+
+  // We've actually done something - now clear misc accumulated junk (fabs,
+  // bitcast).
+  for (auto *I : PossibleJunk)
+    if (I->user_empty())
+      I->eraseFromParent();
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 000000000000..33fdb8f90825
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,209 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetMachine.h"
+#include "SystemZTargetTransformInfo.h"
+#include "SystemZMachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+  // Register the target.
+  RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
+}
+
+// Determine whether we use the vector ABI.
+static bool UsesVectorABI(StringRef CPU, StringRef FS) {
+  // We use the vector ABI whenever the vector facility is avaiable.
+  // This is the case by default if CPU is z13 or later, and can be
+  // overridden via "[+-]vector" feature string elements.
+  bool VectorABI = true;
+  if (CPU.empty() || CPU == "generic" ||
+      CPU == "z10" || CPU == "z196" || CPU == "zEC12")
+    VectorABI = false;
+
+  SmallVector<StringRef, 3> Features;
+  FS.split(Features, ',', -1, false /* KeepEmpty */);
+  for (auto &Feature : Features) {
+    if (Feature == "vector" || Feature == "+vector")
+      VectorABI = true;
+    if (Feature == "-vector")
+      VectorABI = false;
+  }
+
+  return VectorABI;
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+                                     StringRef FS) {
+  bool VectorABI = UsesVectorABI(CPU, FS);
+  std::string Ret = "";
+
+  // Big endian.
+  Ret += "E";
+
+  // Data mangling.
+  Ret += DataLayout::getManglingComponent(TT);
+
+  // Make sure that global data has at least 16 bits of alignment by
+  // default, so that we can refer to it using LARL.  We don't have any
+  // special requirements for stack variables though.
+  Ret += "-i1:8:16-i8:8:16";
+
+  // 64-bit integers are naturally aligned.
+  Ret += "-i64:64";
+
+  // 128-bit floats are aligned only to 64 bits.
+  Ret += "-f128:64";
+
+  // When using the vector ABI, 128-bit vectors are also aligned to 64 bits.
+  if (VectorABI)
+    Ret += "-v128:64";
+
+  // We prefer 16 bits of aligned for all globals; see above.
+  Ret += "-a:8:16";
+
+  // Integer registers are 32 or 64 bits.
+  Ret += "-n32:64";
+
+  return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  // Static code is suitable for use in a dynamic executable; there is no
+  // separate DynamicNoPIC model.
+  if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC)
+    return Reloc::Static;
+  return *RM;
+}
+
+SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Optional<Reloc::Model> RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(RM), CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+SystemZTargetMachine::~SystemZTargetMachine() {}
+
+namespace {
+/// SystemZ Code Generator Pass Configuration Options.
+class SystemZPassConfig : public TargetPassConfig {
+public:
+  SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SystemZTargetMachine &getSystemZTargetMachine() const {
+    return getTM<SystemZTargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    return new ScheduleDAGMI(C, make_unique<SystemZPostRASchedStrategy>(C),
+                             /*RemoveKillFlags=*/true);
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addILPOpts() override;
+  void addPreSched2() override;
+  void addPreEmitPass() override;
+};
+} // end anonymous namespace
+
+void SystemZPassConfig::addIRPasses() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZTDCPass());
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool SystemZPassConfig::addInstSelector() {
+  addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
+
+ if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZLDCleanupPass(getSystemZTargetMachine()));
+
+  return false;
+}
+
+bool SystemZPassConfig::addILPOpts() {
+  addPass(&EarlyIfConverterID);
+  return true;
+}
+
+void SystemZPassConfig::addPreSched2() {
+  addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine()));
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&IfConverterID);
+}
+
+void SystemZPassConfig::addPreEmitPass() {
+
+  // Do instruction shortening before compare elimination because some
+  // vector instructions will be shortened into opcodes that compare
+  // elimination recognizes.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false);
+
+  // We eliminate comparisons here rather than earlier because some
+  // transformations can change the set of available CC values and we
+  // generally want those transformations to have priority.  This is
+  // especially true in the commonest case where the result of the comparison
+  // is used by a single in-range branch instruction, since we will then
+  // be able to fuse the compare and the branch instead.
+  //
+  // For example, two-address NILF can sometimes be converted into
+  // three-address RISBLG.  NILF produces a CC value that indicates whether
+  // the low word is zero, but RISBLG does not modify CC at all.  On the
+  // other hand, 64-bit ANDs like NILL can sometimes be converted to RISBG.
+  // The CC value produced by NILL isn't useful for our purposes, but the
+  // value produced by RISBG can be used for any comparison with zero
+  // (not just equality).  So there are some transformations that lose
+  // CC values (while still being worthwhile) and others that happen to make
+  // the CC result more useful than it was originally.
+  //
+  // Another reason is that we only want to use BRANCH ON COUNT in cases
+  // where we know that the count register is not going to be spilled.
+  //
+  // Doing it so late makes it more likely that a register will be reused
+  // between the comparison and the branch, but it isn't clear whether
+  // preventing that would be a win or not.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false);
+  addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
+
+  // Do final scheduling after all other optimizations, to get an
+  // optimal input for the decoder (branch relaxation must happen
+  // after block placement).
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&PostMachineSchedulerID);
+}
+
+TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SystemZPassConfig(this, PM);
+}
+
+TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(SystemZTTIImpl(this, F));
+  });
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 000000000000..69cf9bc6e525
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,53 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
+
+#include "SystemZSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class TargetFrameLowering;
+
+class SystemZTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  SystemZSubtarget        Subtarget;
+
+public:
+  SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Optional<Reloc::Model> RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+  ~SystemZTargetMachine() override;
+
+  const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+  // Override LLVMTargetMachine
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  bool targetSchedulesPostRAScheduling() const override { return true; };
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
new file mode 100644
index 000000000000..b10c0e09a0d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -0,0 +1,315 @@
+//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a TargetTransformInfo analysis pass specific to the
+// SystemZ target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZTargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "systemztti"
+
+//===----------------------------------------------------------------------===//
+//
+// SystemZ cost model.
+//
+//===----------------------------------------------------------------------===//
+
+int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+  // No cost model for operations on integers larger than 64 bit implemented yet.
+  if (BitSize > 64)
+    return TTI::TCC_Free;
+
+  if (Imm == 0)
+    return TTI::TCC_Free;
+
+  if (Imm.getBitWidth() <= 64) {
+    // Constants loaded via lgfi.
+    if (isInt<32>(Imm.getSExtValue()))
+      return TTI::TCC_Basic;
+    // Constants loaded via llilf.
+    if (isUInt<32>(Imm.getZExtValue()))
+      return TTI::TCC_Basic;
+    // Constants loaded via llihf:
+    if ((Imm.getZExtValue() & 0xffffffff) == 0)
+      return TTI::TCC_Basic;
+
+    return 2 * TTI::TCC_Basic;
+  }
+
+  return 4 * TTI::TCC_Basic;
+}
+
+int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+  // No cost model for operations on integers larger than 64 bit implemented yet.
+  if (BitSize > 64)
+    return TTI::TCC_Free;
+
+  switch (Opcode) {
+  default:
+    return TTI::TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  case Instruction::Store:
+    if (Idx == 0 && Imm.getBitWidth() <= 64) {
+      // Any 8-bit immediate store can by implemented via mvi.
+      if (BitSize == 8)
+        return TTI::TCC_Free;
+      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
+      if (isInt<16>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::ICmp:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Comparisons against signed 32-bit immediates implemented via cgfi.
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // Or their negation, by swapping addition vs. subtraction.
+      if (isUInt<32>(-Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Mul:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // We use msgfi to multiply by 32-bit signed immediates.
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Or:
+  case Instruction::Xor:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Masks supported by oilf/xilf.
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // Masks supported by oihf/xihf.
+      if ((Imm.getZExtValue() & 0xffffffff) == 0)
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::And:
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      // Any 32-bit AND operation can by implemented via nilf.
+      if (BitSize <= 32)
+        return TTI::TCC_Free;
+      // 64-bit masks supported by nilf.
+      if (isUInt<32>(~Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      // 64-bit masks supported by nilh.
+      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
+        return TTI::TCC_Free;
+      // Some 64-bit AND operations can be implemented via risbg.
+      const SystemZInstrInfo *TII = ST->getInstrInfo();
+      unsigned Start, End;
+      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    // Always return TCC_Free for the shift value of a shift instruction.
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    break;
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+  // No cost model for operations on integers larger than 64 bit implemented yet.
+  if (BitSize > 64)
+    return TTI::TCC_Free;
+
+  switch (IID) {
+  default:
+    return TTI::TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    // These get expanded to include a normal addition/subtraction.
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      if (isUInt<32>(Imm.getZExtValue()))
+        return TTI::TCC_Free;
+      if (isUInt<32>(-Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    // These get expanded to include a normal multiplication.
+    if (Idx == 1 && Imm.getBitWidth() <= 64) {
+      if (isInt<32>(Imm.getSExtValue()))
+        return TTI::TCC_Free;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  }
+  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+}
+
+TargetTransformInfo::PopcntSupportKind
+SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
+  if (ST->hasPopulationCount() && TyWidth <= 64)
+    return TTI::PSK_FastHardware;
+  return TTI::PSK_Software;
+}
+
+void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  // Find out if L contains a call, what the machine instruction count
+  // estimate is, and how many stores there are.
+  bool HasCall = false;
+  unsigned NumStores = 0;
+  for (auto &BB : L->blocks())
+    for (auto &I : *BB) {
+      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+        ImmutableCallSite CS(&I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (isLoweredToCall(F))
+            HasCall = true;
+          if (F->getIntrinsicID() == Intrinsic::memcpy ||
+              F->getIntrinsicID() == Intrinsic::memset)
+            NumStores++;
+        } else { // indirect call.
+          HasCall = true;
+        }
+      }
+      if (isa<StoreInst>(&I)) {
+        NumStores++;
+        Type *MemAccessTy = I.getOperand(0)->getType();
+        if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
+           (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
+          NumStores++;  // 128 bit fp/int stores get split.
+      }
+    }
+
+  // The z13 processor will run out of store tags if too many stores
+  // are fed into it too quickly. Therefore make sure there are not
+  // too many stores in the resulting unrolled loop.
+  unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
+
+  if (HasCall) {
+    // Only allow full unrolling if loop has any calls.
+    UP.FullUnrollMaxCount = Max;
+    UP.MaxCount = 1;
+    return;
+  }
+
+  UP.MaxCount = Max;
+  if (UP.MaxCount <= 1)
+    return;
+
+  // Allow partial and runtime trip count unrolling.
+  UP.Partial = UP.Runtime = true;
+
+  UP.PartialThreshold = 75;
+  UP.DefaultUnrollRuntimeCount = 4;
+
+  // Allow expensive instructions in the pre-header of the loop.
+  UP.AllowExpensiveTripCount = true;
+
+  UP.Force = true;
+}
+
+unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+  if (!Vector)
+    // Discount the stack pointer.  Also leave out %r0, since it can't
+    // be used in an address.
+    return 14;
+  if (ST->hasVector())
+    return 32;
+  return 0;
+}
+
+unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (!Vector)
+    return 64;
+  if (ST->hasVector())
+    return 128;
+  return 0;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
new file mode 100644
index 000000000000..f7d2d827f11b
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -0,0 +1,61 @@
+//===-- SystemZTargetTransformInfo.h - SystemZ-specific TTI ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
+
+#include "SystemZTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
+  typedef BasicTTIImplBase<SystemZTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const SystemZSubtarget *ST;
+  const SystemZTargetLowering *TLI;
+
+  const SystemZSubtarget *getST() const { return ST; }
+  const SystemZTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 000000000000..d3c53a43b391
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- SystemZTargetInfo.cpp - SystemZ target implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target &llvm::getTheSystemZTarget() {
+  static Target TheSystemZTarget;
+  return TheSystemZTarget;
+}
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+  RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(getTheSystemZTarget(),
+                                                     "systemz", "SystemZ");
+}
diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp
new file mode 100644
index 000000000000..5d1616d03779
--- /dev/null
+++ b/contrib/llvm/lib/Target/Target.cpp
@@ -0,0 +1,141 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for 
+// libLLVMTarget.a, which implements target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include <cstring>
+
+using namespace llvm;
+
+// Avoid including "llvm-c/Core.h" for compile time, fwd-declare this instead.
+extern "C" LLVMContextRef LLVMGetGlobalContext(void);
+
+inline TargetLibraryInfoImpl *unwrap(LLVMTargetLibraryInfoRef P) {
+  return reinterpret_cast<TargetLibraryInfoImpl*>(P);
+}
+
+inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfoImpl *P) {
+  TargetLibraryInfoImpl *X = const_cast<TargetLibraryInfoImpl*>(P);
+  return reinterpret_cast<LLVMTargetLibraryInfoRef>(X);
+}
+
+void llvm::initializeTarget(PassRegistry &Registry) {
+  initializeTargetLibraryInfoWrapperPassPass(Registry);
+  initializeTargetTransformInfoWrapperPassPass(Registry);
+}
+
+void LLVMInitializeTarget(LLVMPassRegistryRef R) {
+  initializeTarget(*unwrap(R));
+}
+
+LLVMTargetDataRef LLVMGetModuleDataLayout(LLVMModuleRef M) {
+  return wrap(&unwrap(M)->getDataLayout());
+}
+
+void LLVMSetModuleDataLayout(LLVMModuleRef M, LLVMTargetDataRef DL) {
+  unwrap(M)->setDataLayout(*unwrap(DL));
+}
+
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+  return wrap(new DataLayout(StringRep));
+}
+
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+  delete unwrap(TD);
+}
+
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new TargetLibraryInfoWrapperPass(*unwrap(TLI)));
+}
+
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+  std::string StringRep = unwrap(TD)->getStringRepresentation();
+  return strdup(StringRep.c_str());
+}
+
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+  return unwrap(TD)->isLittleEndian() ? LLVMLittleEndian : LLVMBigEndian;
+}
+
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+  return unwrap(TD)->getPointerSize(0);
+}
+
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return unwrap(TD)->getPointerSize(AS);
+}
+
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(LLVMGetGlobalContext())));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(LLVMGetGlobalContext()), AS));
+}
+
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C), AS));
+}
+
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
+}
+
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeStoreSize(unwrap(Ty));
+}
+
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeAllocSize(unwrap(Ty));
+}
+
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getPrefTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+                                        LLVMValueRef GlobalVar) {
+  return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+}
+
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                             unsigned long long Offset) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
+}
+
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                                       unsigned Element) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
+}
diff --git a/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 000000000000..e8b71924e0d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,30 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+TargetIntrinsicInfo::TargetIntrinsicInfo() {
+}
+
+TargetIntrinsicInfo::~TargetIntrinsicInfo() {
+}
+
+unsigned TargetIntrinsicInfo::getIntrinsicID(const Function *F) const {
+  const ValueName *ValName = F->getValueName();
+  if (!ValName)
+    return 0;
+  return lookupName(ValName->getKeyData(), ValName->getKeyLength());
+}
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 000000000000..375f8511f7ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,326 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              Generic Code
+//===----------------------------------------------------------------------===//
+
+/// Initialize - this method must be called before any actual lowering is
+/// done.  This specifies the current context for codegen, and gives the
+/// lowering implementations a chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+                                          const TargetMachine &TM) {
+  Ctx = &ctx;
+  // `Initialize` can be called more than once.
+  if (Mang != nullptr) delete Mang;
+  Mang = new Mangler();
+  InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(),
+                       TM.getCodeModel(), *Ctx);
+}
+
+TargetLoweringObjectFile::~TargetLoweringObjectFile() {
+  delete Mang;
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+  const Constant *C = GV->getInitializer();
+
+  // Must have zero initializer.
+  if (!C->isNullValue())
+    return false;
+
+  // Leave constant zeros in readonly constant sections, so they can be shared.
+  if (GV->isConstant())
+    return false;
+
+  // If the global has an explicit section specified, don't put it in BSS.
+  if (GV->hasSection())
+    return false;
+
+  // If -nozero-initialized-in-bss is specified, don't ever use BSS.
+  if (NoZerosInBSS)
+    return false;
+
+  // Otherwise, put it in BSS!
+  return true;
+}
+
+/// IsNullTerminatedString - Return true if the specified constant (which is
+/// known to have a type that is an array of 1/2/4 byte elements) ends with a
+/// nul value and contains no other nuls in it.  Note that this is more general
+/// than ConstantDataSequential::isString because we allow 2 & 4 byte strings.
+static bool IsNullTerminatedString(const Constant *C) {
+  // First check: is we have constant array terminated with zero
+  if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    unsigned NumElts = CDS->getNumElements();
+    assert(NumElts != 0 && "Can't have an empty CDS");
+    
+    if (CDS->getElementAsInteger(NumElts-1) != 0)
+      return false; // Not null terminated.
+    
+    // Verify that the null doesn't occur anywhere else in the string.
+    for (unsigned i = 0; i != NumElts-1; ++i)
+      if (CDS->getElementAsInteger(i) == 0)
+        return false;
+    return true;
+  }
+
+  // Another possibility: [1 x i8] zeroinitializer
+  if (isa<ConstantAggregateZero>(C))
+    return cast<ArrayType>(C->getType())->getNumElements() == 1;
+
+  return false;
+}
+
+MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
+    const GlobalValue *GV, StringRef Suffix, const TargetMachine &TM) const {
+  assert(!Suffix.empty());
+
+  SmallString<60> NameStr;
+  NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix();
+  TM.getNameWithPrefix(NameStr, GV, *Mang);
+  NameStr.append(Suffix.begin(), Suffix.end());
+  return Ctx->getOrCreateSymbol(NameStr);
+}
+
+MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV);
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+                                                    const DataLayout &,
+                                                    const MCSymbol *Sym) const {
+}
+
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global variable.  Given an global variable and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler.  The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
+                                                       const TargetMachine &TM){
+  assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
+         "Can only be used for global definitions");
+
+  Reloc::Model ReloModel = TM.getRelocationModel();
+
+  // Early exit - functions should be always in text sections.
+  const auto *GVar = dyn_cast<GlobalVariable>(GO);
+  if (!GVar)
+    return SectionKind::getText();
+
+  // Handle thread-local data first.
+  if (GVar->isThreadLocal()) {
+    if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+      return SectionKind::getThreadBSS();
+    return SectionKind::getThreadData();
+  }
+
+  // Variables with common linkage always get classified as common.
+  if (GVar->hasCommonLinkage())
+    return SectionKind::getCommon();
+
+  // Variable can be easily put to BSS section.
+  if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+    if (GVar->hasLocalLinkage())
+      return SectionKind::getBSSLocal();
+    else if (GVar->hasExternalLinkage())
+      return SectionKind::getBSSExtern();
+    return SectionKind::getBSS();
+  }
+
+  const Constant *C = GVar->getInitializer();
+
+  // If the global is marked constant, we can put it into a mergable section,
+  // a mergable string section, or general .data if it contains relocations.
+  if (GVar->isConstant()) {
+    // If the initializer for the global contains something that requires a
+    // relocation, then we may have to drop this into a writable data section
+    // even though it is marked const.
+    if (!C->needsRelocation()) {
+      // If the global is required to have a unique address, it can't be put
+      // into a mergable section: just drop it into the general read-only
+      // section instead.
+      if (!GVar->hasGlobalUnnamedAddr())
+        return SectionKind::getReadOnly();
+
+      // If initializer is a null-terminated string, put it in a "cstring"
+      // section of the right width.
+      if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+        if (IntegerType *ITy =
+              dyn_cast<IntegerType>(ATy->getElementType())) {
+          if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+               ITy->getBitWidth() == 32) &&
+              IsNullTerminatedString(C)) {
+            if (ITy->getBitWidth() == 8)
+              return SectionKind::getMergeable1ByteCString();
+            if (ITy->getBitWidth() == 16)
+              return SectionKind::getMergeable2ByteCString();
+
+            assert(ITy->getBitWidth() == 32 && "Unknown width");
+            return SectionKind::getMergeable4ByteCString();
+          }
+        }
+      }
+
+      // Otherwise, just drop it into a mergable constant section.  If we have
+      // a section for this size, use it, otherwise use the arbitrary sized
+      // mergable section.
+      switch (
+          GVar->getParent()->getDataLayout().getTypeAllocSize(C->getType())) {
+      case 4:  return SectionKind::getMergeableConst4();
+      case 8:  return SectionKind::getMergeableConst8();
+      case 16: return SectionKind::getMergeableConst16();
+      case 32: return SectionKind::getMergeableConst32();
+      default:
+        return SectionKind::getReadOnly();
+      }
+
+    } else {
+      // In static, ROPI and RWPI relocation models, the linker will resolve
+      // all addresses, so the relocation entries will actually be constants by
+      // the time the app starts up.  However, we can't put this into a
+      // mergable section, because the linker doesn't take relocations into
+      // consideration when it tries to merge entries in the section.
+      if (ReloModel == Reloc::Static || ReloModel == Reloc::ROPI ||
+          ReloModel == Reloc::RWPI || ReloModel == Reloc::ROPI_RWPI)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel section.
+      return SectionKind::getReadOnlyWithRel();
+    }
+  }
+
+  // Okay, this isn't a constant.
+  return SectionKind::getData();
+}
+
+/// This method computes the appropriate section to emit the specified global
+/// variable or function definition.  This should not be passed external (or
+/// available externally) globals.
+MCSection *TargetLoweringObjectFile::SectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // Select section name.
+  if (GO->hasSection())
+    return getExplicitSectionGlobal(GO, Kind, TM);
+
+  // Use default section depending on the 'type' of global
+  return SelectSectionForGlobal(GO, Kind, TM);
+}
+
+MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
+    const Function &F, const TargetMachine &TM) const {
+  unsigned Align = 0;
+  return getSectionForConstant(F.getParent()->getDataLayout(),
+                               SectionKind::getReadOnly(), /*C=*/nullptr,
+                               Align);
+}
+
+bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // In PIC mode, we need to emit the jump table to the same section as the
+  // function body itself, otherwise the label differences won't make sense.
+  // FIXME: Need a better predicate for this: what about custom entries?
+  if (UsesLabelDifference)
+    return true;
+
+  // We should also do if the section name is NULL or function is declared
+  // in discardable section
+  // FIXME: this isn't the right predicate, should be based on the MCSection
+  // for the function.
+  if (F.isWeakForLinker())
+    return true;
+
+  return false;
+}
+
+/// Given a mergable constant with the specified size and relocation
+/// information, return a section that it should be placed in.
+MCSection *TargetLoweringObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C,
+    unsigned &Align) const {
+  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
+    return ReadOnlySection;
+
+  return DataSection;
+}
+
+/// getTTypeGlobalReference - Return an MCExpr to use for a
+/// reference to the specified global variable from exception
+/// handling information.
+const MCExpr *TargetLoweringObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  const MCSymbolRefExpr *Ref =
+      MCSymbolRefExpr::create(TM.getSymbol(GV), getContext());
+
+  return getTTypeReference(Ref, Encoding, Streamer);
+}
+
+const MCExpr *TargetLoweringObjectFile::
+getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
+                  MCStreamer &Streamer) const {
+  switch (Encoding & 0x70) {
+  default:
+    report_fatal_error("We do not support this DWARF encoding yet!");
+  case dwarf::DW_EH_PE_absptr:
+    // Do nothing special
+    return Sym;
+  case dwarf::DW_EH_PE_pcrel: {
+    // Emit a label to the streamer for the current position.  This gives us
+    // .-foo addressing.
+    MCSymbol *PCSym = getContext().createTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+    return MCBinaryExpr::createSub(Sym, PC, getContext());
+  }
+  }
+}
+
+const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  // FIXME: It's not clear what, if any, default this should have - perhaps a
+  // null return could mean 'no location' & we should just do that here.
+  return MCSymbolRefExpr::create(Sym, *Ctx);
+}
+
+void TargetLoweringObjectFile::getNameWithPrefix(
+    SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+    const TargetMachine &TM) const {
+  Mang->getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false);
+}
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
new file mode 100644
index 000000000000..e16ced1661a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -0,0 +1,221 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
+                         cl::desc("Enable interprocedural register allocation "
+                                  "to reduce load/store at procedure calls."));
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
+                             const Triple &TT, StringRef CPU, StringRef FS,
+                             const TargetOptions &Options)
+    : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
+      TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
+      RequireStructuredCFG(false), Options(Options) {
+  if (EnableIPRA.getNumOccurrences())
+    this->Options.EnableIPRA = EnableIPRA;
+}
+
+TargetMachine::~TargetMachine() {
+  delete AsmInfo;
+  delete MRI;
+  delete MII;
+  delete STI;
+}
+
+bool TargetMachine::isPositionIndependent() const {
+  return getRelocationModel() == Reloc::PIC_;
+}
+
+/// \brief Reset the target options based on the function's attributes.
+// FIXME: This function needs to go away for a number of reasons:
+// a) global state on the TargetMachine is terrible in general,
+// b) there's no default state here to keep,
+// c) these target options should be passed only on the function
+//    and not on the TargetMachine (via TargetOptions) at all.
+void TargetMachine::resetTargetOptions(const Function &F) const {
+#define RESET_OPTION(X, Y)                                                     \
+  do {                                                                         \
+    if (F.hasFnAttribute(Y))                                                   \
+      Options.X = (F.getFnAttribute(Y).getValueAsString() == "true");          \
+  } while (0)
+
+  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
+  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
+  RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
+  RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+  RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
+
+  StringRef Denormal =
+    F.getFnAttribute("denormal-fp-math").getValueAsString();
+  if (Denormal == "ieee")
+    Options.FPDenormalMode = FPDenormal::IEEE;
+  else if (Denormal == "preserve-sign")
+    Options.FPDenormalMode = FPDenormal::PreserveSign;
+  else if (Denormal == "positive-zero")
+    Options.FPDenormalMode = FPDenormal::PositiveZero;
+}
+
+/// Returns the code generation relocation model. The choices are static, PIC,
+/// and dynamic-no-pic.
+Reloc::Model TargetMachine::getRelocationModel() const { return RM; }
+
+/// Returns the code model. The choices are small, kernel, medium, large, and
+/// target default.
+CodeModel::Model TargetMachine::getCodeModel() const { return CMModel; }
+
+/// Get the IR-specified TLS model for Var.
+static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
+  switch (GV->getThreadLocalMode()) {
+  case GlobalVariable::NotThreadLocal:
+    llvm_unreachable("getSelectedTLSModel for non-TLS variable");
+    break;
+  case GlobalVariable::GeneralDynamicTLSModel:
+    return TLSModel::GeneralDynamic;
+  case GlobalVariable::LocalDynamicTLSModel:
+    return TLSModel::LocalDynamic;
+  case GlobalVariable::InitialExecTLSModel:
+    return TLSModel::InitialExec;
+  case GlobalVariable::LocalExecTLSModel:
+    return TLSModel::LocalExec;
+  }
+  llvm_unreachable("invalid TLS model");
+}
+
+bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
+                                         const GlobalValue *GV) const {
+  Reloc::Model RM = getRelocationModel();
+  const Triple &TT = getTargetTriple();
+
+  // DLLImport explicitly marks the GV as external.
+  if (GV && GV->hasDLLImportStorageClass())
+    return false;
+
+  // Every other GV is local on COFF.
+  // Make an exception for windows OS in the triple: Some firmwares builds use
+  // *-win32-macho triples. This (accidentally?) produced windows relocations
+  // without GOT tables in older clang versions; Keep this behaviour.
+  if (TT.isOSBinFormatCOFF() || (TT.isOSWindows() && TT.isOSBinFormatMachO()))
+    return true;
+
+  if (GV && (GV->hasLocalLinkage() || !GV->hasDefaultVisibility()))
+    return true;
+
+  if (TT.isOSBinFormatMachO()) {
+    if (RM == Reloc::Static)
+      return true;
+    return GV && GV->isStrongDefinitionForLinker();
+  }
+
+  assert(TT.isOSBinFormatELF());
+  assert(RM != Reloc::DynamicNoPIC);
+
+  bool IsExecutable =
+      RM == Reloc::Static || M.getPIELevel() != PIELevel::Default;
+  if (IsExecutable) {
+    // If the symbol is defined, it cannot be preempted.
+    if (GV && !GV->isDeclarationForLinker())
+      return true;
+
+    bool IsTLS = GV && GV->isThreadLocal();
+    bool IsAccessViaCopyRelocs =
+        Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
+    // Check if we can use copy relocations.
+    if (!IsTLS && (RM == Reloc::Static || IsAccessViaCopyRelocs))
+      return true;
+  }
+
+  // ELF supports preemption of other symbols.
+  return false;
+}
+
+TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
+  bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
+  Reloc::Model RM = getRelocationModel();
+  bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE;
+  bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+  TLSModel::Model Model;
+  if (IsSharedLibrary) {
+    if (IsLocal)
+      Model = TLSModel::LocalDynamic;
+    else
+      Model = TLSModel::GeneralDynamic;
+  } else {
+    if (IsLocal)
+      Model = TLSModel::LocalExec;
+    else
+      Model = TLSModel::InitialExec;
+  }
+
+  // If the user specified a more specific model, use that.
+  TLSModel::Model SelectedModel = getSelectedTLSModel(GV);
+  if (SelectedModel > Model)
+    return SelectedModel;
+
+  return Model;
+}
+
+/// Returns the optimization level: None, Less, Default, or Aggressive.
+CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
+
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
+
+TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(F.getParent()->getDataLayout());
+  });
+}
+
+void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
+                                      const GlobalValue *GV, Mangler &Mang,
+                                      bool MayAlwaysUsePrivate) const {
+  if (MayAlwaysUsePrivate || !GV->hasPrivateLinkage()) {
+    // Simple case: If GV is not private, it is not important to find out if
+    // private labels are legal in this case or not.
+    Mang.getNameWithPrefix(Name, GV, false);
+    return;
+  }
+  const TargetLoweringObjectFile *TLOF = getObjFileLowering();
+  TLOF->getNameWithPrefix(Name, GV, *this);
+}
+
+MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
+  const TargetLoweringObjectFile *TLOF = getObjFileLowering();
+  SmallString<128> NameStr;
+  getNameWithPrefix(NameStr, GV, TLOF->getMangler());
+  return TLOF->getContext().getOrCreateSymbol(NameStr);
+}
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
new file mode 100644
index 000000000000..5fb5b0227800
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -0,0 +1,243 @@
+//===-- TargetMachine.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM-C part of TargetMachine.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/TargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Target.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CodeGenCWrappers.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+static TargetMachine *unwrap(LLVMTargetMachineRef P) {
+  return reinterpret_cast<TargetMachine *>(P);
+}
+static Target *unwrap(LLVMTargetRef P) {
+  return reinterpret_cast<Target*>(P);
+}
+static LLVMTargetMachineRef wrap(const TargetMachine *P) {
+  return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P));
+}
+static LLVMTargetRef wrap(const Target * P) {
+  return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P));
+}
+
+LLVMTargetRef LLVMGetFirstTarget() {
+  if (TargetRegistry::targets().begin() == TargetRegistry::targets().end()) {
+    return nullptr;
+  }
+
+  const Target *target = &*TargetRegistry::targets().begin();
+  return wrap(target);
+}
+LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
+  return wrap(unwrap(T)->getNext());
+}
+
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+  StringRef NameRef = Name;
+  auto I = find_if(TargetRegistry::targets(),
+                   [&](const Target &T) { return T.getName() == NameRef; });
+  return I != TargetRegistry::targets().end() ? wrap(&*I) : nullptr;
+}
+
+LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
+                                 char **ErrorMessage) {
+  std::string Error;
+
+  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+
+  if (!*T) {
+    if (ErrorMessage)
+      *ErrorMessage = strdup(Error.c_str());
+
+    return 1;
+  }
+
+  return 0;
+}
+
+const char * LLVMGetTargetName(LLVMTargetRef T) {
+  return unwrap(T)->getName();
+}
+
+const char * LLVMGetTargetDescription(LLVMTargetRef T) {
+  return unwrap(T)->getShortDescription();
+}
+
+LLVMBool LLVMTargetHasJIT(LLVMTargetRef T) {
+  return unwrap(T)->hasJIT();
+}
+
+LLVMBool LLVMTargetHasTargetMachine(LLVMTargetRef T) {
+  return unwrap(T)->hasTargetMachine();
+}
+
+LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
+  return unwrap(T)->hasMCAsmBackend();
+}
+
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+        const char* Triple, const char* CPU, const char* Features,
+        LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+        LLVMCodeModel CodeModel) {
+  Optional<Reloc::Model> RM;
+  switch (Reloc){
+    case LLVMRelocStatic:
+      RM = Reloc::Static;
+      break;
+    case LLVMRelocPIC:
+      RM = Reloc::PIC_;
+      break;
+    case LLVMRelocDynamicNoPic:
+      RM = Reloc::DynamicNoPIC;
+      break;
+    default:
+      break;
+  }
+
+  CodeModel::Model CM = unwrap(CodeModel);
+
+  CodeGenOpt::Level OL;
+  switch (Level) {
+    case LLVMCodeGenLevelNone:
+      OL = CodeGenOpt::None;
+      break;
+    case LLVMCodeGenLevelLess:
+      OL = CodeGenOpt::Less;
+      break;
+    case LLVMCodeGenLevelAggressive:
+      OL = CodeGenOpt::Aggressive;
+      break;
+    default:
+      OL = CodeGenOpt::Default;
+      break;
+  }
+
+  TargetOptions opt;
+  return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
+    CM, OL));
+}
+
+void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); }
+
+LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) {
+  const Target* target = &(unwrap(T)->getTarget());
+  return wrap(target);
+}
+
+char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetTriple().str();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetCPU();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetFeatureString();
+  return strdup(StringRep.c_str());
+}
+
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm) {
+  unwrap(T)->Options.MCOptions.AsmVerbose = VerboseAsm;
+}
+
+LLVMTargetDataRef LLVMCreateTargetDataLayout(LLVMTargetMachineRef T) {
+  return wrap(new DataLayout(unwrap(T)->createDataLayout()));
+}
+
+static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
+                                      raw_pwrite_stream &OS,
+                                      LLVMCodeGenFileType codegen,
+                                      char **ErrorMessage) {
+  TargetMachine* TM = unwrap(T);
+  Module* Mod = unwrap(M);
+
+  legacy::PassManager pass;
+
+  std::string error;
+
+  Mod->setDataLayout(TM->createDataLayout());
+
+  TargetMachine::CodeGenFileType ft;
+  switch (codegen) {
+    case LLVMAssemblyFile:
+      ft = TargetMachine::CGFT_AssemblyFile;
+      break;
+    default:
+      ft = TargetMachine::CGFT_ObjectFile;
+      break;
+  }
+  if (TM->addPassesToEmitFile(pass, OS, ft)) {
+    error = "TargetMachine can't emit a file of this type";
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+
+  pass.run(*Mod);
+
+  OS.flush();
+  return false;
+}
+
+LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
+  char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
+  std::error_code EC;
+  raw_fd_ostream dest(Filename, EC, sys::fs::F_None);
+  if (EC) {
+    *ErrorMessage = strdup(EC.message().c_str());
+    return true;
+  }
+  bool Result = LLVMTargetMachineEmit(T, M, dest, codegen, ErrorMessage);
+  dest.flush();
+  return Result;
+}
+
+LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
+  LLVMModuleRef M, LLVMCodeGenFileType codegen, char** ErrorMessage,
+  LLVMMemoryBufferRef *OutMemBuf) {
+  SmallString<0> CodeString;
+  raw_svector_ostream OStream(CodeString);
+  bool Result = LLVMTargetMachineEmit(T, M, OStream, codegen, ErrorMessage);
+
+  StringRef Data = OStream.str();
+  *OutMemBuf =
+      LLVMCreateMemoryBufferWithMemoryRangeCopy(Data.data(), Data.size(), "");
+  return Result;
+}
+
+char *LLVMGetDefaultTargetTriple(void) {
+  return strdup(sys::getDefaultTargetTriple().c_str());
+}
+
+void LLVMAddAnalysisPasses(LLVMTargetMachineRef T, LLVMPassManagerRef PM) {
+  unwrap(PM)->add(
+      createTargetTransformInfoWrapperPass(unwrap(T)->getTargetIRAnalysis()));
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
new file mode 100644
index 000000000000..b4763ca60ab6
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -0,0 +1,151 @@
+//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the WebAssembly Disassembler.
+///
+/// It contains code to translate the data produced by the decoder into
+/// MCInsts.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-disassembler"
+
+namespace {
+class WebAssemblyDisassembler final : public MCDisassembler {
+  std::unique_ptr<const MCInstrInfo> MCII;
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+
+public:
+  WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                          std::unique_ptr<const MCInstrInfo> MCII)
+      : MCDisassembler(STI, Ctx), MCII(std::move(MCII)) {}
+};
+} // end anonymous namespace
+
+static MCDisassembler *createWebAssemblyDisassembler(const Target &T,
+                                                     const MCSubtargetInfo &STI,
+                                                     MCContext &Ctx) {
+  std::unique_ptr<const MCInstrInfo> MCII(T.createMCInstrInfo());
+  return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII));
+}
+
+extern "C" void LLVMInitializeWebAssemblyDisassembler() {
+  // Register the disassembler for each target.
+  TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget32(),
+                                         createWebAssemblyDisassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget64(),
+                                         createWebAssemblyDisassembler);
+}
+
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+    raw_ostream &OS, raw_ostream &CS) const {
+  Size = 0;
+  uint64_t Pos = 0;
+
+  // Read the opcode.
+  if (Pos + sizeof(uint64_t) > Bytes.size())
+    return MCDisassembler::Fail;
+  uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos);
+  Pos += sizeof(uint64_t);
+
+  if (Opcode >= WebAssembly::INSTRUCTION_LIST_END)
+    return MCDisassembler::Fail;
+
+  MI.setOpcode(Opcode);
+  const MCInstrDesc &Desc = MCII->get(Opcode);
+  unsigned NumFixedOperands = Desc.NumOperands;
+
+  // If it's variadic, read the number of extra operands.
+  unsigned NumExtraOperands = 0;
+  if (Desc.isVariadic()) {
+    if (Pos + sizeof(uint64_t) > Bytes.size())
+      return MCDisassembler::Fail;
+    NumExtraOperands = support::endian::read64le(Bytes.data() + Pos);
+    Pos += sizeof(uint64_t);
+  }
+
+  // Read the fixed operands. These are described by the MCInstrDesc.
+  for (unsigned i = 0; i < NumFixedOperands; ++i) {
+    const MCOperandInfo &Info = Desc.OpInfo[i];
+    switch (Info.OperandType) {
+    case MCOI::OPERAND_IMMEDIATE:
+    case WebAssembly::OPERAND_LOCAL:
+    case WebAssembly::OPERAND_P2ALIGN:
+    case WebAssembly::OPERAND_BASIC_BLOCK: {
+      if (Pos + sizeof(uint64_t) > Bytes.size())
+        return MCDisassembler::Fail;
+      uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
+      Pos += sizeof(uint64_t);
+      MI.addOperand(MCOperand::createImm(Imm));
+      break;
+    }
+    case MCOI::OPERAND_REGISTER: {
+      if (Pos + sizeof(uint64_t) > Bytes.size())
+        return MCDisassembler::Fail;
+      uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
+      Pos += sizeof(uint64_t);
+      MI.addOperand(MCOperand::createReg(Reg));
+      break;
+    }
+    case WebAssembly::OPERAND_F32IMM:
+    case WebAssembly::OPERAND_F64IMM: {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      if (Pos + sizeof(uint64_t) > Bytes.size())
+        return MCDisassembler::Fail;
+      uint64_t Bits = support::endian::read64le(Bytes.data() + Pos);
+      Pos += sizeof(uint64_t);
+      double Imm;
+      memcpy(&Imm, &Bits, sizeof(Imm));
+      MI.addOperand(MCOperand::createFPImm(Imm));
+      break;
+    }
+    default:
+      llvm_unreachable("unimplemented operand kind");
+    }
+  }
+
+  // Read the extra operands.
+  assert(NumExtraOperands == 0 || Desc.isVariadic());
+  for (unsigned i = 0; i < NumExtraOperands; ++i) {
+    if (Pos + sizeof(uint64_t) > Bytes.size())
+      return MCDisassembler::Fail;
+    if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) {
+      // Decode extra immediate operands.
+      uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
+      MI.addOperand(MCOperand::createImm(Imm));
+    } else {
+      // Decode extra register operands.
+      uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
+      MI.addOperand(MCOperand::createReg(Reg));
+    }
+    Pos += sizeof(uint64_t);
+  }
+
+  Size = Pos;
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
new file mode 100644
index 000000000000..0af13cffdb04
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -0,0 +1,244 @@
+//=- WebAssemblyInstPrinter.cpp - WebAssembly assembly instruction printing -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Print MCInst instructions to wasm format.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "WebAssemblyGenAsmWriter.inc"
+
+WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {}
+
+void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
+                                          unsigned RegNo) const {
+  assert(RegNo != WebAssemblyFunctionInfo::UnusedReg);
+  // Note that there's an implicit get_local/set_local here!
+  OS << "$" << RegNo;
+}
+
+void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                       StringRef Annot,
+                                       const MCSubtargetInfo & /*STI*/) {
+  // Print the instruction (this uses the AsmStrings from the .td files).
+  printInstruction(MI, OS);
+
+  // Print any additional variadic operands.
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  if (Desc.isVariadic())
+    for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) {
+      // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
+      // we have an extra flags operand which is not currently printed, for
+      // compatiblity reasons.
+      if (i != 0 &&
+          (MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID ||
+           i != Desc.getNumOperands()))
+        OS << ", ";
+      printOperand(MI, i, OS);
+    }
+
+  // Print any added annotation.
+  printAnnotation(OS, Annot);
+
+  if (CommentStream) {
+    // Observe any effects on the control flow stack, for use in annotating
+    // control flow label references.
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case WebAssembly::LOOP: {
+      printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
+      break;
+    }
+    case WebAssembly::BLOCK:
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
+      break;
+    case WebAssembly::END_LOOP:
+      ControlFlowStack.pop_back();
+      break;
+    case WebAssembly::END_BLOCK:
+      printAnnotation(
+          OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      break;
+    }
+
+    // Annotate any control flow label references.
+    unsigned NumFixedOperands = Desc.NumOperands;
+    SmallSet<uint64_t, 8> Printed;
+    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+      if (!(i < NumFixedOperands
+                ? (Desc.OpInfo[i].OperandType ==
+                   WebAssembly::OPERAND_BASIC_BLOCK)
+                : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel)))
+        continue;
+      uint64_t Depth = MI->getOperand(i).getImm();
+      if (!Printed.insert(Depth).second)
+        continue;
+      const auto &Pair = ControlFlowStack.rbegin()[Depth];
+      printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") +
+                              " to label" + utostr(Pair.first));
+    }
+  }
+}
+
+static std::string toString(const APFloat &FP) {
+  // Print NaNs with custom payloads specially.
+  if (FP.isNaN() &&
+      !FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) &&
+      !FP.bitwiseIsEqual(
+          APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) {
+    APInt AI = FP.bitcastToAPInt();
+    return
+        std::string(AI.isNegative() ? "-" : "") + "nan:0x" +
+        utohexstr(AI.getZExtValue() &
+                  (AI.getBitWidth() == 32 ? INT64_C(0x007fffff) :
+                                            INT64_C(0x000fffffffffffff)),
+                  /*LowerCase=*/true);
+  }
+
+  // Use C99's hexadecimal floating-point representation.
+  static const size_t BufBytes = 128;
+  char buf[BufBytes];
+  auto Written = FP.convertToHexString(
+      buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven);
+  (void)Written;
+  assert(Written != 0);
+  assert(Written < BufBytes);
+  return buf;
+}
+
+void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
+            MII.get(MI->getOpcode()).TSFlags == 0) &&
+           "WebAssembly variable_ops register ops don't use TSFlags");
+    unsigned WAReg = Op.getReg();
+    if (int(WAReg) >= 0)
+      printRegName(O, WAReg);
+    else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs())
+      O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
+    else if (WAReg != WebAssemblyFunctionInfo::UnusedReg)
+      O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
+    else
+      O << "$drop";
+    // Add a '=' suffix if this is a def.
+    if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
+      O << '=';
+  } else if (Op.isImm()) {
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    assert((OpNo < Desc.getNumOperands() ||
+            (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate)) &&
+           "WebAssemblyII::VariableOpIsImmediate should be set for "
+           "variable_ops immediate ops");
+    (void)Desc;
+    // TODO: (MII.get(MI->getOpcode()).TSFlags &
+    //        WebAssemblyII::VariableOpImmediateIsLabel)
+    // can tell us whether this is an immediate referencing a label in the
+    // control flow stack, and it may be nice to pretty-print.
+    O << Op.getImm();
+  } else if (Op.isFPImm()) {
+    const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+    assert(OpNo < Desc.getNumOperands() &&
+           "Unexpected floating-point immediate as a non-fixed operand");
+    assert(Desc.TSFlags == 0 &&
+           "WebAssembly variable_ops floating point ops don't use TSFlags");
+    const MCOperandInfo &Info = Desc.OpInfo[OpNo];
+    if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      O << toString(APFloat(float(Op.getFPImm())));
+    } else {
+      assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
+      O << toString(APFloat(Op.getFPImm()));
+    }
+  } else {
+    assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() ||
+            (MII.get(MI->getOpcode()).TSFlags &
+             WebAssemblyII::VariableOpIsImmediate)) &&
+           "WebAssemblyII::VariableOpIsImmediate should be set for "
+           "variable_ops expr ops");
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void
+WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
+                                                       unsigned OpNo,
+                                                       raw_ostream &O) {
+  int64_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
+    return;
+  O << ":p2align=" << Imm;
+}
+
+void
+WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
+                                                         unsigned OpNo,
+                                                         raw_ostream &O) {
+  int64_t Imm = MI->getOperand(OpNo).getImm();
+  switch (WebAssembly::ExprType(Imm)) {
+  case WebAssembly::ExprType::Void: break;
+  case WebAssembly::ExprType::I32: O << "i32"; break;
+  case WebAssembly::ExprType::I64: O << "i64"; break;
+  case WebAssembly::ExprType::F32: O << "f32"; break;
+  case WebAssembly::ExprType::F64: O << "f64"; break;
+  case WebAssembly::ExprType::I8x16: O << "i8x16"; break;
+  case WebAssembly::ExprType::I16x8: O << "i16x8"; break;
+  case WebAssembly::ExprType::I32x4: O << "i32x4"; break;
+  case WebAssembly::ExprType::F32x4: O << "f32x4"; break;
+  case WebAssembly::ExprType::B8x16: O << "b8x16"; break;
+  case WebAssembly::ExprType::B16x8: O << "b16x8"; break;
+  case WebAssembly::ExprType::B32x4: O << "b32x4"; break;
+  }
+}
+
+const char *llvm::WebAssembly::TypeToString(MVT Ty) {
+  switch (Ty.SimpleTy) {
+  case MVT::i32:
+    return "i32";
+  case MVT::i64:
+    return "i64";
+  case MVT::f32:
+    return "f32";
+  case MVT::f64:
+    return "f64";
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    return "v128";
+  default:
+    llvm_unreachable("unsupported type");
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
new file mode 100644
index 000000000000..d11f99c1ff39
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -0,0 +1,58 @@
+// WebAssemblyInstPrinter.h - Print wasm MCInst to assembly syntax -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This class prints an WebAssembly MCInst to wasm file syntax.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCSubtargetInfo;
+
+class WebAssemblyInstPrinter final : public MCInstPrinter {
+  uint64_t ControlFlowCounter;
+  SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack;
+
+public:
+  WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                         const MCRegisterInfo &MRI);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Used by tblegen code.
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O);
+  void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+};
+
+namespace WebAssembly {
+
+const char *TypeToString(MVT Ty);
+
+} // end namespace WebAssembly
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
new file mode 100644
index 000000000000..97454a824a34
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -0,0 +1,105 @@
+//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyAsmBackend class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+  bool Is64Bit;
+
+public:
+  explicit WebAssemblyAsmBackend(bool Is64Bit)
+      : MCAsmBackend(), Is64Bit(Is64Bit) {}
+  ~WebAssemblyAsmBackend() override {}
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  // No instruction requires relaxation
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    // We currently just use the generic fixups in MCFixup.h and don't have any
+    // target-specific fixups.
+    return 0;
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
+                                         MCObjectWriter *OW) const {
+  if (Count == 0)
+    return true;
+
+  for (uint64_t i = 0; i < Count; ++i)
+    OW->write8(WebAssembly::Nop);
+
+  return true;
+}
+
+void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                       unsigned DataSize, uint64_t Value,
+                                       bool IsPCRel) const {
+  const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
+  assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+  unsigned NumBytes = (Info.TargetSize + 7) / 8;
+  if (Value == 0)
+    return; // Doesn't change encoding.
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+MCObjectWriter *
+WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
+  return new WebAssemblyAsmBackend(TT.isArch64Bit());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
new file mode 100644
index 000000000000..2146f67959b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp
@@ -0,0 +1,67 @@
+//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles ELF-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter {
+public:
+  WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit,
+                                                       uint8_t OSABI)
+    : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY,
+                              /*HasRelocationAddend=*/false) {}
+
+unsigned WebAssemblyELFObjectWriter::getRelocType(MCContext &Ctx,
+                                                  const MCValue &Target,
+                                                  const MCFixup &Fixup,
+                                                  bool IsPCRel) const {
+  // WebAssembly functions are not allocated in the address space. To resolve a
+  // pointer to a function, we must use a special relocation type.
+  if (const MCSymbolRefExpr *SyExp =
+          dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+    if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION)
+      return ELF::R_WEBASSEMBLY_FUNCTION;
+
+  switch (Fixup.getKind()) {
+  case FK_Data_4:
+    assert(!is64Bit() && "4-byte relocations only supported on wasm32");
+    return ELF::R_WEBASSEMBLY_DATA;
+  case FK_Data_8:
+    assert(is64Bit() && "8-byte relocations only supported on wasm64");
+    return ELF::R_WEBASSEMBLY_DATA;
+  default:
+    llvm_unreachable("unimplemented fixup kind");
+  }
+}
+
+MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+                                                       bool Is64Bit,
+                                                       uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW =
+      new WebAssemblyELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
new file mode 100644
index 000000000000..d8c39216c53b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -0,0 +1,53 @@
+//===-- WebAssemblyMCAsmInfo.cpp - WebAssembly asm properties -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declarations of the WebAssemblyMCAsmInfo
+/// properties.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-mc-asm-info"
+
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+
+WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+  PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
+
+  // TODO: What should MaxInstLength be?
+
+  UseDataRegionDirectives = true;
+
+  // Use .skip instead of .zero because .zero is confusing when used with two
+  // arguments (it doesn't actually zero things out).
+  ZeroDirective = "\t.skip\t";
+
+  Data8bitsDirective = "\t.int8\t";
+  Data16bitsDirective = "\t.int16\t";
+  Data32bitsDirective = "\t.int32\t";
+  Data64bitsDirective = "\t.int64\t";
+
+  AlignmentIsInBytes = false;
+  COMMDirectiveAlignmentIsInBytes = false;
+  LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+
+  SupportsDebugInformation = true;
+
+  // For now, WebAssembly does not support exceptions.
+  ExceptionsType = ExceptionHandling::None;
+
+  // TODO: UseIntegratedAssembler?
+
+  // WebAssembly's stack is never executable.
+  UsesNonexecutableStackSection = false;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
new file mode 100644
index 000000000000..2dcf2cd3c892
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -0,0 +1,32 @@
+//===-- WebAssemblyMCAsmInfo.h - WebAssembly asm properties -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the WebAssemblyMCAsmInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+
+class Triple;
+
+class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
+public:
+  explicit WebAssemblyMCAsmInfo(const Triple &T);
+  ~WebAssemblyMCAsmInfo() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
new file mode 100644
index 000000000000..d0e0eecd3002
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -0,0 +1,121 @@
+//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyMCCodeEmitter class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
+namespace {
+class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
+  const MCInstrInfo &MCII;
+
+  // Implementation generated by tablegen.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+public:
+  explicit WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+};
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
+  return new WebAssemblyMCCodeEmitter(MCII);
+}
+
+void WebAssemblyMCCodeEmitter::encodeInstruction(
+    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  uint64_t Start = OS.tell();
+
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  assert(Binary < UINT8_MAX && "Multi-byte opcodes not supported yet");
+  OS << uint8_t(Binary);
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (MO.isReg()) {
+      /* nothing to encode */
+    } else if (MO.isImm()) {
+      if (i < Desc.getNumOperands()) {
+        assert(Desc.TSFlags == 0 &&
+               "WebAssembly non-variable_ops don't use TSFlags");
+        const MCOperandInfo &Info = Desc.OpInfo[i];
+        if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+          encodeSLEB128(int32_t(MO.getImm()), OS);
+        } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+          encodeSLEB128(int64_t(MO.getImm()), OS);
+        } else {
+          encodeULEB128(uint64_t(MO.getImm()), OS);
+        }
+      } else {
+        assert(Desc.TSFlags == (WebAssemblyII::VariableOpIsImmediate |
+                                WebAssemblyII::VariableOpImmediateIsLabel));
+        encodeULEB128(uint64_t(MO.getImm()), OS);
+      }
+    } else if (MO.isFPImm()) {
+      assert(i < Desc.getNumOperands() &&
+             "Unexpected floating-point immediate as a non-fixed operand");
+      assert(Desc.TSFlags == 0 &&
+             "WebAssembly variable_ops floating point ops don't use TSFlags");
+      const MCOperandInfo &Info = Desc.OpInfo[i];
+      if (Info.OperandType == WebAssembly::OPERAND_F32IMM) {
+        // TODO: MC converts all floating point immediate operands to double.
+        // This is fine for numeric values, but may cause NaNs to change bits.
+        float f = float(MO.getFPImm());
+        support::endian::Writer<support::little>(OS).write<float>(f);
+      } else {
+        assert(Info.OperandType == WebAssembly::OPERAND_F64IMM);
+        double d = MO.getFPImm();
+        support::endian::Writer<support::little>(OS).write<double>(d);
+      }
+    } else if (MO.isExpr()) {
+      Fixups.push_back(MCFixup::create(
+          OS.tell() - Start, MO.getExpr(),
+          STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
+          MI.getLoc()));
+      ++MCNumFixups;
+      encodeULEB128(STI.getTargetTriple().isArch64Bit() ? UINT64_MAX
+                                                        : uint64_t(UINT32_MAX),
+                    OS);
+    } else {
+      llvm_unreachable("unexpected operand kind");
+    }
+  }
+
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+#include "WebAssemblyGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
new file mode 100644
index 000000000000..3dc1ded17116
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -0,0 +1,146 @@
+//===-- WebAssemblyMCTargetDesc.cpp - WebAssembly Target Descriptions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides WebAssembly-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCTargetDesc.h"
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "WebAssemblyMCAsmInfo.h"
+#include "WebAssemblyTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-mc-target-desc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "WebAssemblyGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
+static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
+                                  const Triple &TT) {
+  return new WebAssemblyMCAsmInfo(TT);
+}
+
+static void adjustCodeGenOpts(const Triple & /*TT*/, Reloc::Model /*RM*/,
+                              CodeModel::Model &CM) {
+  CodeModel::Model M = (CM == CodeModel::Default || CM == CodeModel::JITDefault)
+                           ? CodeModel::Large
+                           : CM;
+  if (M != CodeModel::Large)
+    report_fatal_error("Non-large code models are not supported yet");
+}
+
+static MCInstrInfo *createMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitWebAssemblyMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitWebAssemblyMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
+                                          unsigned SyntaxVariant,
+                                          const MCAsmInfo &MAI,
+                                          const MCInstrInfo &MII,
+                                          const MCRegisterInfo &MRI) {
+  assert(SyntaxVariant == 0 && "WebAssembly only has one syntax variant");
+  return new WebAssemblyInstPrinter(MAI, MII, MRI);
+}
+
+static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo & /*MRI*/,
+                                        MCContext & /*Ctx*/) {
+  return createWebAssemblyMCCodeEmitter(MCII);
+}
+
+static MCAsmBackend *createAsmBackend(const Target & /*T*/,
+                                      const MCRegisterInfo & /*MRI*/,
+                                      const Triple &TT, StringRef /*CPU*/,
+                                      const MCTargetOptions & /*Options*/) {
+  return createWebAssemblyAsmBackend(TT);
+}
+
+static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
+                                              StringRef FS) {
+  return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCTargetStreamer *
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) {
+  return new WebAssemblyTargetELFStreamer(S);
+}
+
+static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter * /*InstPrint*/,
+                                                 bool /*isVerboseAsm*/) {
+  return new WebAssemblyTargetAsmStreamer(S, OS);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeWebAssemblyTargetMC() {
+  for (Target *T :
+       {&getTheWebAssemblyTarget32(), &getTheWebAssemblyTarget64()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn X(*T, createMCAsmInfo);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo);
+
+    // Register the MC codegen info.
+    TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter);
+
+    // Register the MC code emitter.
+    TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter);
+
+    // Register the ASM Backend.
+    TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo);
+
+    // Register the object target streamer.
+    TargetRegistry::RegisterObjectTargetStreamer(*T,
+                                                 createObjectTargetStreamer);
+    // Register the asm target streamer.
+    TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer);
+  }
+}
+
+WebAssembly::ValType WebAssembly::toValType(const MVT &Ty) {
+  switch (Ty.SimpleTy) {
+  case MVT::i32: return WebAssembly::ValType::I32;
+  case MVT::i64: return WebAssembly::ValType::I64;
+  case MVT::f32: return WebAssembly::ValType::F32;
+  case MVT::f64: return WebAssembly::ValType::F64;
+  default: llvm_unreachable("unexpected type");
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
new file mode 100644
index 000000000000..8583b772deab
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -0,0 +1,182 @@
+//==- WebAssemblyMCTargetDesc.h - WebAssembly Target Descriptions -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides WebAssembly-specific target descriptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCSubtargetInfo;
+class MVT;
+class Target;
+class Triple;
+class raw_pwrite_stream;
+
+Target &getTheWebAssemblyTarget32();
+Target &getTheWebAssemblyTarget64();
+
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
+
+MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
+
+MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
+                                                 bool Is64Bit, uint8_t OSABI);
+
+namespace WebAssembly {
+enum OperandType {
+  /// Basic block label in a branch construct.
+  OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
+  /// Local index.
+  OPERAND_LOCAL,
+  /// 32-bit integer immediates.
+  OPERAND_I32IMM,
+  /// 64-bit integer immediates.
+  OPERAND_I64IMM,
+  /// 32-bit floating-point immediates.
+  OPERAND_F32IMM,
+  /// 64-bit floating-point immediates.
+  OPERAND_F64IMM,
+  /// 32-bit unsigned function indices.
+  OPERAND_FUNCTION32,
+  /// 32-bit unsigned memory offsets.
+  OPERAND_OFFSET32,
+  /// p2align immediate for load and store address alignment.
+  OPERAND_P2ALIGN,
+  /// signature immediate for block/loop.
+  OPERAND_SIGNATURE
+};
+} // end namespace WebAssembly
+
+namespace WebAssemblyII {
+enum {
+  // For variadic instructions, this flag indicates whether an operand
+  // in the variable_ops range is an immediate value.
+  VariableOpIsImmediate = (1 << 0),
+  // For immediate values in the variable_ops range, this flag indicates
+  // whether the value represents a control-flow label.
+  VariableOpImmediateIsLabel = (1 << 1)
+};
+} // end namespace WebAssemblyII
+
+} // end namespace llvm
+
+// Defines symbolic names for WebAssembly registers. This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "WebAssemblyGenRegisterInfo.inc"
+
+// Defines symbolic names for the WebAssembly instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "WebAssemblyGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+namespace llvm {
+namespace WebAssembly {
+
+/// Return the default p2align value for a load or store with the given opcode.
+inline unsigned GetDefaultP2Align(unsigned Opcode) {
+  switch (Opcode) {
+  case WebAssembly::LOAD8_S_I32:
+  case WebAssembly::LOAD8_U_I32:
+  case WebAssembly::LOAD8_S_I64:
+  case WebAssembly::LOAD8_U_I64:
+  case WebAssembly::STORE8_I32:
+  case WebAssembly::STORE8_I64:
+    return 0;
+  case WebAssembly::LOAD16_S_I32:
+  case WebAssembly::LOAD16_U_I32:
+  case WebAssembly::LOAD16_S_I64:
+  case WebAssembly::LOAD16_U_I64:
+  case WebAssembly::STORE16_I32:
+  case WebAssembly::STORE16_I64:
+    return 1;
+  case WebAssembly::LOAD_I32:
+  case WebAssembly::LOAD_F32:
+  case WebAssembly::STORE_I32:
+  case WebAssembly::STORE_F32:
+  case WebAssembly::LOAD32_S_I64:
+  case WebAssembly::LOAD32_U_I64:
+  case WebAssembly::STORE32_I64:
+    return 2;
+  case WebAssembly::LOAD_I64:
+  case WebAssembly::LOAD_F64:
+  case WebAssembly::STORE_I64:
+  case WebAssembly::STORE_F64:
+    return 3;
+  default:
+    llvm_unreachable("Only loads and stores have p2align values");
+  }
+}
+
+/// The operand number of the load or store address in load/store instructions.
+static const unsigned LoadAddressOperandNo = 3;
+static const unsigned StoreAddressOperandNo = 2;
+
+/// The operand number of the load or store p2align in load/store instructions.
+static const unsigned LoadP2AlignOperandNo = 1;
+static const unsigned StoreP2AlignOperandNo = 0;
+
+/// This is used to indicate block signatures.
+enum class ExprType {
+  Void    = 0x40,
+  I32     = 0x7f,
+  I64     = 0x7e,
+  F32     = 0x7d,
+  F64     = 0x7c,
+  I8x16   = 0x7b,
+  I16x8   = 0x7a,
+  I32x4   = 0x79,
+  F32x4   = 0x78,
+  B8x16   = 0x77,
+  B16x8   = 0x76,
+  B32x4   = 0x75
+};
+
+/// This is used to indicate local types.
+enum class ValType {
+  I32     = 0x7f,
+  I64     = 0x7e,
+  F32     = 0x7d,
+  F64     = 0x7c,
+  I8x16   = 0x7b,
+  I16x8   = 0x7a,
+  I32x4   = 0x79,
+  F32x4   = 0x78,
+  B8x16   = 0x77,
+  B16x8   = 0x76,
+  B32x4   = 0x75
+};
+
+/// Instruction opcodes emitted via means other than CodeGen.
+static const unsigned Nop = 0x01;
+static const unsigned End = 0x0b;
+
+ValType toValType(const MVT &Ty);
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
new file mode 100644
index 000000000000..3cee8b2a1844
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -0,0 +1,120 @@
+//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines WebAssembly-specific target streamer classes.
+/// These are for implementing support for target-specific assembly directives.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetStreamer.h"
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "WebAssemblyMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
+    : MCTargetStreamer(S) {}
+
+WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
+    MCStreamer &S, formatted_raw_ostream &OS)
+    : WebAssemblyTargetStreamer(S), OS(OS) {}
+
+WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
+    : WebAssemblyTargetStreamer(S) {}
+
+static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
+  bool First = true;
+  for (MVT Type : Types) {
+    if (First)
+      First = false;
+    else
+      OS << ", ";
+    OS << WebAssembly::TypeToString(Type);
+  }
+  OS << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) {
+  OS << "\t.param  \t";
+  PrintTypes(OS, Types);
+}
+
+void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) {
+  OS << "\t.result \t";
+  PrintTypes(OS, Types);
+}
+
+void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
+  if (!Types.empty()) {
+    OS << "\t.local  \t";
+    PrintTypes(OS, Types);
+  }
+}
+
+void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
+
+void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
+    StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+  OS << "\t.functype\t" << name;
+  if (Results.empty())
+    OS << ", void";
+  else {
+    assert(Results.size() == 1);
+    OS << ", " << WebAssembly::TypeToString(Results.front());
+  }
+  for (auto Ty : Params)
+    OS << ", " << WebAssembly::TypeToString(Ty);
+  OS << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
+  OS << "\t.import_global\t" << name << '\n';
+}
+
+void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
+  OS << "\t.indidx  \t" << *Value << '\n';
+}
+
+void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) {
+  // Nothing to emit; params are declared as part of the function signature.
+}
+
+void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) {
+  // Nothing to emit; results are declared as part of the function signature.
+}
+
+void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
+  Streamer.EmitULEB128IntValue(Types.size());
+  for (MVT Type : Types)
+    Streamer.EmitIntValue(int64_t(WebAssembly::toValType(Type)), 1);
+}
+
+void WebAssemblyTargetELFStreamer::emitEndFunc() {
+  Streamer.EmitIntValue(WebAssembly::End, 1);
+}
+
+void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) {
+  llvm_unreachable(".indidx encoding not yet implemented");
+}
+
+void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
+    StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+  // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
+  // whether it's necessary for .o files to declare indirect function types.
+}
+
+void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
+}
+\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
new file mode 100644
index 000000000000..23ac3190243a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -0,0 +1,88 @@
+//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares WebAssembly-specific target streamer classes.
+/// These are for implementing support for target-specific assembly directives.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
+
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+class MCELFStreamer;
+
+/// WebAssembly-specific streamer interface, to implement support
+/// WebAssembly-specific assembly directives.
+class WebAssemblyTargetStreamer : public MCTargetStreamer {
+public:
+  explicit WebAssemblyTargetStreamer(MCStreamer &S);
+
+  /// .param
+  virtual void emitParam(ArrayRef<MVT> Types) = 0;
+  /// .result
+  virtual void emitResult(ArrayRef<MVT> Types) = 0;
+  /// .local
+  virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+  /// .endfunc
+  virtual void emitEndFunc() = 0;
+  /// .functype
+  virtual void emitIndirectFunctionType(StringRef name,
+                                        SmallVectorImpl<MVT> &Params,
+                                        SmallVectorImpl<MVT> &Results) {
+    llvm_unreachable("emitIndirectFunctionType not implemented");
+  }
+  /// .indidx
+  virtual void emitIndIdx(const MCExpr *Value) = 0;
+  /// .import_global
+  virtual void emitGlobalImport(StringRef name) = 0;
+};
+
+/// This part is for ascii assembly output
+class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
+  void emitParam(ArrayRef<MVT> Types) override;
+  void emitResult(ArrayRef<MVT> Types) override;
+  void emitLocal(ArrayRef<MVT> Types) override;
+  void emitEndFunc() override;
+  void emitIndirectFunctionType(StringRef name,
+                                SmallVectorImpl<MVT> &Params,
+                                SmallVectorImpl<MVT> &Results) override;
+  void emitIndIdx(const MCExpr *Value) override;
+  void emitGlobalImport(StringRef name) override;
+};
+
+/// This part is for ELF object output
+class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
+public:
+  explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
+
+  void emitParam(ArrayRef<MVT> Types) override;
+  void emitResult(ArrayRef<MVT> Types) override;
+  void emitLocal(ArrayRef<MVT> Types) override;
+  void emitEndFunc() override;
+  void emitIndirectFunctionType(StringRef name,
+                                SmallVectorImpl<MVT> &Params,
+                                SmallVectorImpl<MVT> &Results) override;
+  void emitIndIdx(const MCExpr *Value) override;
+  void emitGlobalImport(StringRef name) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt
new file mode 100644
index 000000000000..64991ad14071
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/README.txt
@@ -0,0 +1,147 @@
+//===-- README.txt - Notes for WebAssembly code gen -----------------------===//
+
+This WebAssembly backend is presently under development.
+
+Currently the easiest way to use it is through Emscripten, which provides a
+compilation environment that includes standard libraries, tools, and packaging
+for producing WebAssembly applications that can run in browsers and other
+environments. For more information, see the Emscripten documentation in
+general, and this page in particular:
+  * https://github.com/kripken/emscripten/wiki/New-WebAssembly-Backend
+
+Other ways of using this backend, such as via a standalone "clang", are also
+under development, though they are not generally usable yet.
+
+For more information on WebAssembly itself, see the home page:
+  * https://webassembly.github.io/
+
+The following documents contain some information on the semantics and binary
+encoding of WebAssembly itself:
+  * https://github.com/WebAssembly/design/blob/master/Semantics.md
+  * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md
+
+The backend is built, tested and archived on the following waterfall:
+  https://wasm-stat.us
+
+The backend's bringup is done in part by using the GCC torture test suite, since
+it doesn't require C library support. Current known failures are in
+known_gcc_test_failures.txt, all other tests should pass. The waterfall will
+turn red if not. Once most of these pass, further testing will use LLVM's own
+test suite. The tests can be run locally using:
+  https://github.com/WebAssembly/waterfall/blob/master/src/compile_torture_tests.py
+
+//===---------------------------------------------------------------------===//
+
+Br, br_if, and br_table instructions can support having a value on the value
+stack across the jump (sometimes). We should (a) model this, and (b) extend
+the stackifier to utilize it.
+
+//===---------------------------------------------------------------------===//
+
+The min/max instructions aren't exactly a<b?a:b because of NaN and negative zero
+behavior. The ARM target has the same kind of min/max instructions and has
+implemented optimizations for them; we should do similar optimizations for
+WebAssembly.
+
+//===---------------------------------------------------------------------===//
+
+AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM.
+Would these be useful to run for WebAssembly too? Also, it has an option to
+run SimplifyCFG after running the AtomicExpand pass. Would this be useful for
+us too?
+
+//===---------------------------------------------------------------------===//
+
+Register stackification uses the VALUE_STACK physical register to impose
+ordering dependencies on instructions with stack operands. This is pessimistic;
+we should consider alternate ways to model stack dependencies.
+
+//===---------------------------------------------------------------------===//
+
+Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly,
+there are numerous optimization-related hooks that can be overridden in
+WebAssemblyTargetLowering.
+
+//===---------------------------------------------------------------------===//
+
+Instead of the OptimizeReturned pass, which should consider preserving the
+"returned" attribute through to MachineInstrs and extending the StoreResults
+pass to do this optimization on calls too. That would also let the
+WebAssemblyPeephole pass clean up dead defs for such calls, as it does for
+stores.
+
+//===---------------------------------------------------------------------===//
+
+Consider implementing optimizeSelect, optimizeCompareInstr, optimizeCondBranch,
+optimizeLoadInstr, and/or getMachineCombinerPatterns.
+
+//===---------------------------------------------------------------------===//
+
+Find a clean way to fix the problem which leads to the Shrink Wrapping pass
+being run after the WebAssembly PEI pass.
+
+//===---------------------------------------------------------------------===//
+
+When setting multiple local variables to the same constant, we currently get
+code like this:
+
+    i32.const   $4=, 0
+    i32.const   $3=, 0
+
+It could be done with a smaller encoding like this:
+
+    i32.const   $push5=, 0
+    tee_local   $push6=, $4=, $pop5
+    copy_local  $3=, $pop6
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly registers are implicitly initialized to zero. Explicit zeroing is
+therefore often redundant and could be optimized away.
+
+//===---------------------------------------------------------------------===//
+
+Small indices may use smaller encodings than large indices.
+WebAssemblyRegColoring and/or WebAssemblyRegRenumbering should sort registers
+according to their usage frequency to maximize the usage of smaller encodings.
+
+//===---------------------------------------------------------------------===//
+
+Many cases of irreducible control flow could be transformed more optimally
+than via the transform in WebAssemblyFixIrreducibleControlFlow.cpp.
+
+It may also be worthwhile to do transforms before register coloring,
+particularly when duplicating code, to allow register coloring to be aware of
+the duplication.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more
+aggressively.
+
+//===---------------------------------------------------------------------===//
+
+WebAssemblyRegStackify is currently a greedy algorithm. This means that, for
+example, a binary operator will stackify with its user before its operands.
+However, if moving the binary operator to its user moves it to a place where
+its operands can't be moved to, it would be better to leave it in place, or
+perhaps move it up, so that it can stackify its operands. A binary operator
+has two operands and one result, so in such cases there could be a net win by
+prefering the operands.
+
+//===---------------------------------------------------------------------===//
+
+Instruction ordering has a significant influence on register stackification and
+coloring. Consider experimenting with the MachineScheduler (enable via
+enableMachineScheduler) and determine if it can be configured to schedule
+instructions advantageously for this purpose.
+
+//===---------------------------------------------------------------------===//
+
+WebAssembly is now officially a stack machine, rather than an AST, and this
+comes with additional opportunities for WebAssemblyRegStackify. Specifically,
+the stack doesn't need to be empty after an instruction with no return values.
+WebAssemblyRegStackify could be extended, or possibly rewritten, to take
+advantage of the new opportunities.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
new file mode 100644
index 000000000000..f310f0a44461
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -0,0 +1,36 @@
+//===-- WebAssemblyTargetInfo.cpp - WebAssembly Target Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file registers the WebAssembly target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-target-info"
+
+Target &llvm::getTheWebAssemblyTarget32() {
+  static Target TheWebAssemblyTarget32;
+  return TheWebAssemblyTarget32;
+}
+Target &llvm::getTheWebAssemblyTarget64() {
+  static Target TheWebAssemblyTarget64;
+  return TheWebAssemblyTarget64;
+}
+
+extern "C" void LLVMInitializeWebAssemblyTargetInfo() {
+  RegisterTarget<Triple::wasm32> X(getTheWebAssemblyTarget32(), "wasm32",
+                                   "WebAssembly 32-bit");
+  RegisterTarget<Triple::wasm64> Y(getTheWebAssemblyTarget64(), "wasm64",
+                                   "WebAssembly 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
new file mode 100644
index 000000000000..09c35b4825fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -0,0 +1,56 @@
+//===-- WebAssembly.h - Top-level interface for WebAssembly  ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the entry points for global functions defined in
+/// the LLVM WebAssembly back-end.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLY_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLY_H
+
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class WebAssemblyTargetMachine;
+class ModulePass;
+class FunctionPass;
+
+// LLVM IR passes.
+ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
+void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+FunctionPass *createWebAssemblyOptimizeReturned();
+
+// ISel and immediate followup passes.
+FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel);
+FunctionPass *createWebAssemblyArgumentMove();
+FunctionPass *createWebAssemblySetP2AlignOperands();
+
+// Late passes.
+FunctionPass *createWebAssemblyReplacePhysRegs();
+FunctionPass *createWebAssemblyPrepareForLiveIntervals();
+FunctionPass *createWebAssemblyOptimizeLiveIntervals();
+FunctionPass *createWebAssemblyStoreResults();
+FunctionPass *createWebAssemblyRegStackify();
+FunctionPass *createWebAssemblyRegColoring();
+FunctionPass *createWebAssemblyExplicitLocals();
+FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyCFGStackify();
+FunctionPass *createWebAssemblyLowerBrUnless();
+FunctionPass *createWebAssemblyRegNumbering();
+FunctionPass *createWebAssemblyPeephole();
+FunctionPass *createWebAssemblyCallIndirectFixup();
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
new file mode 100644
index 000000000000..f647349d759b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -0,0 +1,66 @@
+//- WebAssembly.td - Describe the WebAssembly Target Machine --*- tablegen -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This is a target description file for the WebAssembly architecture,
+/// which is also known as "wasm".
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "true",
+                                      "Enable 128-bit SIMD">;
+
+//===----------------------------------------------------------------------===//
+// Architectures.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrInfo.td"
+
+def WebAssemblyInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Processors supported.
+//===----------------------------------------------------------------------===//
+
+// Minimal Viable Product.
+def : ProcessorModel<"mvp", NoSchedModel, []>;
+
+// Generic processor: latest stable version.
+def : ProcessorModel<"generic", NoSchedModel, []>;
+
+// Latest and greatest experimental version of WebAssembly. Bugs included!
+def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>;
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def WebAssembly : Target {
+  let InstructionSet = WebAssemblyInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
new file mode 100644
index 000000000000..5fadca38b820
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -0,0 +1,95 @@
+//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling.
+///
+/// Arguments are really live-in registers, however, since we use virtual
+/// registers and LLVM doesn't support live-in virtual registers, we're
+/// currently making do with ARGUMENT instructions which are placed at the top
+/// of the entry block. The trick is to get them to *stay* at the top of the
+/// entry block.
+///
+/// The ARGUMENTS physical register keeps these instructions pinned in place
+/// during liveness-aware CodeGen passes, however one thing which does not
+/// respect this is the ScheduleDAG scheduler. This pass is therefore run
+/// immediately after that.
+///
+/// This is all hopefully a temporary solution until we find a better solution
+/// for describing the live-in nature of arguments.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-argument-move"
+
+namespace {
+class WebAssemblyArgumentMove final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyArgumentMove() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "WebAssembly Argument Move"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyArgumentMove::ID = 0;
+FunctionPass *llvm::createWebAssemblyArgumentMove() {
+  return new WebAssemblyArgumentMove();
+}
+
+bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Argument Move **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+  MachineBasicBlock &EntryMBB = MF.front();
+  MachineBasicBlock::iterator InsertPt = EntryMBB.end();
+
+  // Look for the first NonArg instruction.
+  for (MachineInstr &MI : EntryMBB) {
+    if (!WebAssembly::isArgument(MI)) {
+      InsertPt = MI;
+      break;
+    }
+  }
+
+  // Now move any argument instructions later in the block
+  // to before our first NonArg instruction.
+  for (MachineInstr &MI : llvm::make_range(InsertPt, EntryMBB.end())) {
+    if (WebAssembly::isArgument(MI)) {
+      EntryMBB.insert(InsertPt, MI.removeFromParent());
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
new file mode 100644
index 000000000000..5b4b82eb5603
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -0,0 +1,322 @@
+//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a printer that converts from our internal
+/// representation of machine-dependent LLVM code to the WebAssembly assembly
+/// language.
+///
+//===----------------------------------------------------------------------===//
+
+#include "InstPrinter/WebAssemblyInstPrinter.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRegisterInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class WebAssemblyAsmPrinter final : public AsmPrinter {
+  const MachineRegisterInfo *MRI;
+  WebAssemblyFunctionInfo *MFI;
+
+public:
+  WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Assembly Printer";
+  }
+
+  //===------------------------------------------------------------------===//
+  // MachineFunctionPass Implementation.
+  //===------------------------------------------------------------------===//
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    MRI = &MF.getRegInfo();
+    MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+    return AsmPrinter::runOnMachineFunction(MF);
+  }
+
+  //===------------------------------------------------------------------===//
+  // AsmPrinter Implementation.
+  //===------------------------------------------------------------------===//
+
+  void EmitEndOfAsmFile(Module &M) override;
+  void EmitJumpTableInfo() override;
+  void EmitConstantPool() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void EmitInstruction(const MachineInstr *MI) override;
+  const MCExpr *lowerConstant(const Constant *CV) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+
+  MVT getRegType(unsigned RegNo) const;
+  std::string regToString(const MachineOperand &MO);
+  WebAssemblyTargetStreamer *getTargetStreamer();
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Helpers.
+//===----------------------------------------------------------------------===//
+
+MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
+  const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
+  for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
+                MVT::v4i32, MVT::v4f32})
+    if (TRC->hasType(T))
+      return T;
+  DEBUG(errs() << "Unknown type for register number: " << RegNo);
+  llvm_unreachable("Unknown register type");
+  return MVT::Other;
+}
+
+std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) {
+  unsigned RegNo = MO.getReg();
+  assert(TargetRegisterInfo::isVirtualRegister(RegNo) &&
+         "Unlowered physical register encountered during assembly printing");
+  assert(!MFI->isVRegStackified(RegNo));
+  unsigned WAReg = MFI->getWAReg(RegNo);
+  assert(WAReg != WebAssemblyFunctionInfo::UnusedReg);
+  return '$' + utostr(WAReg);
+}
+
+WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
+  MCTargetStreamer *TS = OutStreamer->getTargetStreamer();
+  return static_cast<WebAssemblyTargetStreamer *>(TS);
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssemblyAsmPrinter Implementation.
+//===----------------------------------------------------------------------===//
+
+void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  for (const auto &F : M) {
+    // Emit function type info for all undefined functions
+    if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
+      SmallVector<MVT, 4> Results;
+      SmallVector<MVT, 4> Params;
+      ComputeSignatureVTs(F, TM, Params, Results);
+      getTargetStreamer()->emitIndirectFunctionType(F.getName(), Params,
+                                                    Results);
+    }
+  }
+  for (const auto &G : M.globals()) {
+    if (!G.hasInitializer() && G.hasExternalLinkage()) {
+      getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
+    }
+  }
+}
+
+void WebAssemblyAsmPrinter::EmitConstantPool() {
+  assert(MF->getConstantPool()->getConstants().empty() &&
+         "WebAssembly disables constant pools");
+}
+
+void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+  // Nothing to do; jump tables are incorporated into the instruction stream.
+}
+
+void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
+  if (!MFI->getParams().empty())
+    getTargetStreamer()->emitParam(MFI->getParams());
+
+  SmallVector<MVT, 4> ResultVTs;
+  const Function &F(*MF->getFunction());
+
+  // Emit the function index.
+  if (MDNode *Idx = F.getMetadata("wasm.index")) {
+    assert(Idx->getNumOperands() == 1);
+
+    getTargetStreamer()->emitIndIdx(AsmPrinter::lowerConstant(
+        cast<ConstantAsMetadata>(Idx->getOperand(0))->getValue()));
+  }
+
+  ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs);
+
+  // If the return type needs to be legalized it will get converted into
+  // passing a pointer.
+  if (ResultVTs.size() == 1)
+    getTargetStreamer()->emitResult(ResultVTs);
+
+  // FIXME: When ExplicitLocals is enabled by default, we won't need
+  // to define the locals here (and MFI can go back to being pointer-to-const).
+  for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+    unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+    unsigned WAReg = MFI->getWAReg(VReg);
+    // Don't declare unused registers.
+    if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+      continue;
+    // Don't redeclare parameters.
+    if (WAReg < MFI->getParams().size())
+      continue;
+    // Don't declare stackified registers.
+    if (int(WAReg) < 0)
+      continue;
+    MFI->addLocal(getRegType(VReg));
+  }
+
+  getTargetStreamer()->emitLocal(MFI->getLocals());
+
+  AsmPrinter::EmitFunctionBodyStart();
+}
+
+void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
+  getTargetStreamer()->emitEndFunc();
+}
+
+void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
+
+  switch (MI->getOpcode()) {
+  case WebAssembly::ARGUMENT_I32:
+  case WebAssembly::ARGUMENT_I64:
+  case WebAssembly::ARGUMENT_F32:
+  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
+    // These represent values which are live into the function entry, so there's
+    // no instruction to emit.
+    break;
+  case WebAssembly::FALLTHROUGH_RETURN_I32:
+  case WebAssembly::FALLTHROUGH_RETURN_I64:
+  case WebAssembly::FALLTHROUGH_RETURN_F32:
+  case WebAssembly::FALLTHROUGH_RETURN_F64:
+  case WebAssembly::FALLTHROUGH_RETURN_v16i8:
+  case WebAssembly::FALLTHROUGH_RETURN_v8i16:
+  case WebAssembly::FALLTHROUGH_RETURN_v4i32:
+  case WebAssembly::FALLTHROUGH_RETURN_v4f32: {
+    // These instructions represent the implicit return at the end of a
+    // function body. The operand is always a pop.
+    assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
+
+    if (isVerbose()) {
+      OutStreamer->AddComment("fallthrough-return: $pop" +
+                              utostr(MFI->getWARegStackId(
+                                  MFI->getWAReg(MI->getOperand(0).getReg()))));
+      OutStreamer->AddBlankLine();
+    }
+    break;
+  }
+  case WebAssembly::FALLTHROUGH_RETURN_VOID:
+    // This instruction represents the implicit return at the end of a
+    // function body with no return value.
+    if (isVerbose()) {
+      OutStreamer->AddComment("fallthrough-return");
+      OutStreamer->AddBlankLine();
+    }
+    break;
+  default: {
+    WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
+    MCInst TmpInst;
+    MCInstLowering.Lower(MI, TmpInst);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    break;
+  }
+  }
+}
+
+const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) {
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+    if (GV->getValueType()->isFunctionTy())
+      return MCSymbolRefExpr::create(
+          getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
+  return AsmPrinter::lowerConstant(CV);
+}
+
+bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &OS) {
+  if (AsmVariant != 0)
+    report_fatal_error("There are no defined alternate asm variants");
+
+  // First try the generic code, which knows about modifiers like 'c' and 'n'.
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS))
+    return false;
+
+  if (!ExtraCode) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    switch (MO.getType()) {
+    case MachineOperand::MO_Immediate:
+      OS << MO.getImm();
+      return false;
+    case MachineOperand::MO_Register:
+      OS << regToString(MO);
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+      getSymbol(MO.getGlobal())->print(OS, MAI);
+      printOffset(MO.getOffset(), OS);
+      return false;
+    case MachineOperand::MO_ExternalSymbol:
+      GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI);
+      printOffset(MO.getOffset(), OS);
+      return false;
+    case MachineOperand::MO_MachineBasicBlock:
+      MO.getMBB()->getSymbol()->print(OS, MAI);
+      return false;
+    default:
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                                  unsigned OpNo,
+                                                  unsigned AsmVariant,
+                                                  const char *ExtraCode,
+                                                  raw_ostream &OS) {
+  if (AsmVariant != 0)
+    report_fatal_error("There are no defined alternate asm variants");
+
+  if (!ExtraCode) {
+    // TODO: For now, we just hard-code 0 as the constant offset; teach
+    // SelectInlineAsmMemoryOperand how to do address mode matching.
+    OS << "0(" + regToString(MI->getOperand(OpNo)) + ')';
+    return false;
+  }
+
+  return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() {
+  RegisterAsmPrinter<WebAssemblyAsmPrinter> X(getTheWebAssemblyTarget32());
+  RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(getTheWebAssemblyTarget64());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
new file mode 100644
index 000000000000..49b9754e6b62
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -0,0 +1,579 @@
+//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG stacking pass.
+///
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
+///
+/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// scope boundaries serve as the labels for WebAssembly's control transfers.
+///
+/// This is sufficient to convert arbitrary CFGs into a form that works on
+/// WebAssembly, provided that all loops are single-entry.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-stackify"
+
+namespace {
+class WebAssemblyCFGStackify final : public MachineFunctionPass {
+  StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyCFGStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCFGStackify::ID = 0;
+FunctionPass *llvm::createWebAssemblyCFGStackify() {
+  return new WebAssemblyCFGStackify();
+}
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
+  MachineBasicBlock *Bottom = Loop->getHeader();
+  for (MachineBasicBlock *MBB : Loop->blocks())
+    if (MBB->getNumber() > Bottom->getNumber())
+      Bottom = MBB;
+  return Bottom;
+}
+
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+  bool AnyBarrier = false;
+#endif
+  bool AllAnalyzable = true;
+  for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+    AnyBarrier |= Term.isBarrier();
+#endif
+    AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+  }
+  assert((AnyBarrier || AllAnalyzable) &&
+         "AnalyzeBranch needs to analyze any block with a fallthrough");
+  if (AllAnalyzable)
+    MBB->updateTerminator();
+}
+
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+  bool operator()(const MachineBasicBlock *A,
+                  const MachineBasicBlock *B) const {
+    return A->getNumber() > B->getNumber();
+  }
+};
+/// Sort blocks by their number in the opposite order..
+struct CompareBlockNumbersBackwards {
+  bool operator()(const MachineBasicBlock *A,
+                  const MachineBasicBlock *B) const {
+    return A->getNumber() < B->getNumber();
+  }
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+  const MachineLoop *Loop;
+  unsigned NumBlocksLeft;
+
+  /// List of blocks not dominated by Loop's header that are deferred until
+  /// after all of Loop's blocks have been seen.
+  std::vector<MachineBasicBlock *> Deferred;
+
+  explicit Entry(const MachineLoop *L)
+      : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+}
+
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+                       const MachineDominatorTree &MDT) {
+  // Prepare for a topological sort: Record the number of predecessors each
+  // block has, ignoring loop backedges.
+  MF.RenumberBlocks();
+  SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+  for (MachineBasicBlock &MBB : MF) {
+    unsigned N = MBB.pred_size();
+    if (MachineLoop *L = MLI.getLoopFor(&MBB))
+      if (L->getHeader() == &MBB)
+        for (const MachineBasicBlock *Pred : MBB.predecessors())
+          if (L->contains(Pred))
+            --N;
+    NumPredsLeft[MBB.getNumber()] = N;
+  }
+
+  // Topological sort the CFG, with additional constraints:
+  //  - Between a loop header and the last block in the loop, there can be
+  //    no blocks not dominated by the loop header.
+  //  - It's desirable to preserve the original block order when possible.
+  // We use two ready lists; Preferred and Ready. Preferred has recently
+  // processed sucessors, to help preserve block sequences from the original
+  // order. Ready has the remaining ready blocks.
+  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+                CompareBlockNumbers>
+      Preferred;
+  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+                CompareBlockNumbersBackwards>
+      Ready;
+  SmallVector<Entry, 4> Loops;
+  for (MachineBasicBlock *MBB = &MF.front();;) {
+    const MachineLoop *L = MLI.getLoopFor(MBB);
+    if (L) {
+      // If MBB is a loop header, add it to the active loop list. We can't put
+      // any blocks that it doesn't dominate until we see the end of the loop.
+      if (L->getHeader() == MBB)
+        Loops.push_back(Entry(L));
+      // For each active loop the block is in, decrement the count. If MBB is
+      // the last block in an active loop, take it off the list and pick up any
+      // blocks deferred because the header didn't dominate them.
+      for (Entry &E : Loops)
+        if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+          for (auto DeferredBlock : E.Deferred)
+            Ready.push(DeferredBlock);
+      while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+        Loops.pop_back();
+    }
+    // The main topological sort logic.
+    for (MachineBasicBlock *Succ : MBB->successors()) {
+      // Ignore backedges.
+      if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+        if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+          continue;
+      // Decrement the predecessor count. If it's now zero, it's ready.
+      if (--NumPredsLeft[Succ->getNumber()] == 0)
+        Preferred.push(Succ);
+    }
+    // Determine the block to follow MBB. First try to find a preferred block,
+    // to preserve the original block order when possible.
+    MachineBasicBlock *Next = nullptr;
+    while (!Preferred.empty()) {
+      Next = Preferred.top();
+      Preferred.pop();
+      // If X isn't dominated by the top active loop header, defer it until that
+      // loop is done.
+      if (!Loops.empty() &&
+          !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+        Loops.back().Deferred.push_back(Next);
+        Next = nullptr;
+        continue;
+      }
+      // If Next was originally ordered before MBB, and it isn't because it was
+      // loop-rotated above the header, it's not preferred.
+      if (Next->getNumber() < MBB->getNumber() &&
+          (!L || !L->contains(Next) ||
+           L->getHeader()->getNumber() < Next->getNumber())) {
+        Ready.push(Next);
+        Next = nullptr;
+        continue;
+      }
+      break;
+    }
+    // If we didn't find a suitable block in the Preferred list, check the
+    // general Ready list.
+    if (!Next) {
+      // If there are no more blocks to process, we're done.
+      if (Ready.empty()) {
+        MaybeUpdateTerminator(MBB);
+        break;
+      }
+      for (;;) {
+        Next = Ready.top();
+        Ready.pop();
+        // If Next isn't dominated by the top active loop header, defer it until
+        // that loop is done.
+        if (!Loops.empty() &&
+            !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+          Loops.back().Deferred.push_back(Next);
+          continue;
+        }
+        break;
+      }
+    }
+    // Move the next block into place and iterate.
+    Next->moveAfter(MBB);
+    MaybeUpdateTerminator(MBB);
+    MBB = Next;
+  }
+  assert(Loops.empty() && "Active loop list not finished");
+  MF.RenumberBlocks();
+
+#ifndef NDEBUG
+  SmallSetVector<MachineLoop *, 8> OnStack;
+
+  // Insert a sentinel representing the degenerate loop that starts at the
+  // function entry block and includes the entire function as a "loop" that
+  // executes once.
+  OnStack.insert(nullptr);
+
+  for (auto &MBB : MF) {
+    assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+    MachineLoop *Loop = MLI.getLoopFor(&MBB);
+    if (Loop && &MBB == Loop->getHeader()) {
+      // Loop header. The loop predecessor should be sorted above, and the other
+      // predecessors should be backedges below.
+      for (auto Pred : MBB.predecessors())
+        assert(
+            (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+            "Loop header predecessors must be loop predecessors or backedges");
+      assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+    } else {
+      // Not a loop header. All predecessors should be sorted above.
+      for (auto Pred : MBB.predecessors())
+        assert(Pred->getNumber() < MBB.getNumber() &&
+               "Non-loop-header predecessors should be topologically sorted");
+      assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+             "Blocks must be nested in their loops");
+    }
+    while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+      OnStack.pop_back();
+  }
+  assert(OnStack.pop_back_val() == nullptr &&
+         "The function entry block shouldn't actually be a loop header");
+  assert(OnStack.empty() &&
+         "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+/// Test whether Pred has any terminators explicitly branching to MBB, as
+/// opposed to falling through. Note that it's possible (eg. in unoptimized
+/// code) for a branch instruction to both branch to a block and fallthrough
+/// to it, so we check the actual branch operands to see if there are any
+/// explicit mentions.
+static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred,
+                                 MachineBasicBlock *MBB) {
+  for (MachineInstr &MI : Pred->terminators())
+    for (MachineOperand &MO : MI.explicit_operands())
+      if (MO.isMBB() && MO.getMBB() == MBB)
+        return true;
+  return false;
+}
+
+/// Insert a BLOCK marker for branches to MBB (if needed).
+static void PlaceBlockMarker(
+    MachineBasicBlock &MBB, MachineFunction &MF,
+    SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+    DenseMap<const MachineInstr *, MachineInstr *> &BlockTops,
+    DenseMap<const MachineInstr *, MachineInstr *> &LoopTops,
+    const WebAssemblyInstrInfo &TII,
+    const MachineLoopInfo &MLI,
+    MachineDominatorTree &MDT,
+    WebAssemblyFunctionInfo &MFI) {
+  // First compute the nearest common dominator of all forward non-fallthrough
+  // predecessors so that we minimize the time that the BLOCK is on the stack,
+  // which reduces overall stack height.
+  MachineBasicBlock *Header = nullptr;
+  bool IsBranchedTo = false;
+  int MBBNumber = MBB.getNumber();
+  for (MachineBasicBlock *Pred : MBB.predecessors())
+    if (Pred->getNumber() < MBBNumber) {
+      Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+      if (ExplicitlyBranchesTo(Pred, &MBB))
+        IsBranchedTo = true;
+    }
+  if (!Header)
+    return;
+  if (!IsBranchedTo)
+    return;
+
+  assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+  MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(&MBB));
+
+  // If the nearest common dominator is inside a more deeply nested context,
+  // walk out to the nearest scope which isn't more deeply nested.
+  for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+    if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+      if (ScopeTop->getNumber() > Header->getNumber()) {
+        // Skip over an intervening scope.
+        I = std::next(MachineFunction::iterator(ScopeTop));
+      } else {
+        // We found a scope level at an appropriate depth.
+        Header = ScopeTop;
+        break;
+      }
+    }
+  }
+
+  // Decide where in Header to put the BLOCK.
+  MachineBasicBlock::iterator InsertPos;
+  MachineLoop *HeaderLoop = MLI.getLoopFor(Header);
+  if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) {
+    // Header is the header of a loop that does not lexically contain MBB, so
+    // the BLOCK needs to be above the LOOP, after any END constructs.
+    InsertPos = Header->begin();
+    while (InsertPos->getOpcode() == WebAssembly::END_BLOCK ||
+           InsertPos->getOpcode() == WebAssembly::END_LOOP)
+      ++InsertPos;
+  } else {
+    // Otherwise, insert the BLOCK as late in Header as we can, but before the
+    // beginning of the local expression tree and any nested BLOCKs.
+    InsertPos = Header->getFirstTerminator();
+    while (InsertPos != Header->begin() &&
+           WebAssembly::isChild(*std::prev(InsertPos), MFI) &&
+           std::prev(InsertPos)->getOpcode() != WebAssembly::LOOP &&
+           std::prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK &&
+           std::prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP)
+      --InsertPos;
+  }
+
+  // Add the BLOCK.
+  MachineInstr *Begin = BuildMI(*Header, InsertPos, DebugLoc(),
+                                TII.get(WebAssembly::BLOCK))
+      .addImm(int64_t(WebAssembly::ExprType::Void));
+
+  // Mark the end of the block.
+  InsertPos = MBB.begin();
+  while (InsertPos != MBB.end() &&
+         InsertPos->getOpcode() == WebAssembly::END_LOOP &&
+         LoopTops[&*InsertPos]->getParent()->getNumber() >= Header->getNumber())
+    ++InsertPos;
+  MachineInstr *End = BuildMI(MBB, InsertPos, DebugLoc(),
+                              TII.get(WebAssembly::END_BLOCK));
+  BlockTops[End] = Begin;
+
+  // Track the farthest-spanning scope that ends at this point.
+  int Number = MBB.getNumber();
+  if (!ScopeTops[Number] ||
+      ScopeTops[Number]->getNumber() > Header->getNumber())
+    ScopeTops[Number] = Header;
+}
+
+/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header).
+static void PlaceLoopMarker(
+    MachineBasicBlock &MBB, MachineFunction &MF,
+    SmallVectorImpl<MachineBasicBlock *> &ScopeTops,
+    DenseMap<const MachineInstr *, MachineInstr *> &LoopTops,
+    const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) {
+  MachineLoop *Loop = MLI.getLoopFor(&MBB);
+  if (!Loop || Loop->getHeader() != &MBB)
+    return;
+
+  // The operand of a LOOP is the first block after the loop. If the loop is the
+  // bottom of the function, insert a dummy block at the end.
+  MachineBasicBlock *Bottom = LoopBottom(Loop);
+  auto Iter = std::next(MachineFunction::iterator(Bottom));
+  if (Iter == MF.end()) {
+    MachineBasicBlock *Label = MF.CreateMachineBasicBlock();
+    // Give it a fake predecessor so that AsmPrinter prints its label.
+    Label->addSuccessor(Label);
+    MF.push_back(Label);
+    Iter = std::next(MachineFunction::iterator(Bottom));
+  }
+  MachineBasicBlock *AfterLoop = &*Iter;
+
+  // Mark the beginning of the loop (after the end of any existing loop that
+  // ends here).
+  auto InsertPos = MBB.begin();
+  while (InsertPos != MBB.end() &&
+         InsertPos->getOpcode() == WebAssembly::END_LOOP)
+    ++InsertPos;
+  MachineInstr *Begin = BuildMI(MBB, InsertPos, DebugLoc(),
+                                TII.get(WebAssembly::LOOP))
+      .addImm(int64_t(WebAssembly::ExprType::Void));
+
+  // Mark the end of the loop.
+  MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(),
+                              TII.get(WebAssembly::END_LOOP));
+  LoopTops[End] = Begin;
+
+  assert((!ScopeTops[AfterLoop->getNumber()] ||
+          ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) &&
+         "With block sorting the outermost loop for a block should be first.");
+  if (!ScopeTops[AfterLoop->getNumber()])
+    ScopeTops[AfterLoop->getNumber()] = &MBB;
+}
+
+static unsigned
+GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack,
+         const MachineBasicBlock *MBB) {
+  unsigned Depth = 0;
+  for (auto X : reverse(Stack)) {
+    if (X == MBB)
+      break;
+    ++Depth;
+  }
+  assert(Depth < Stack.size() && "Branch destination should be in scope");
+  return Depth;
+}
+
+/// In normal assembly languages, when the end of a function is unreachable,
+/// because the function ends in an infinite loop or a noreturn call or similar,
+/// it isn't necessary to worry about the function return type at the end of
+/// the function, because it's never reached. However, in WebAssembly, blocks
+/// that end at the function end need to have a return type signature that
+/// matches the function signature, even though it's unreachable. This function
+/// checks for such cases and fixes up the signatures.
+static void FixEndsAtEndOfFunction(
+    MachineFunction &MF,
+    const WebAssemblyFunctionInfo &MFI,
+    DenseMap<const MachineInstr *, MachineInstr *> &BlockTops,
+    DenseMap<const MachineInstr *, MachineInstr *> &LoopTops) {
+  assert(MFI.getResults().size() <= 1);
+
+  if (MFI.getResults().empty())
+    return;
+
+  WebAssembly::ExprType retType;
+  switch (MFI.getResults().front().SimpleTy) {
+  case MVT::i32: retType = WebAssembly::ExprType::I32; break;
+  case MVT::i64: retType = WebAssembly::ExprType::I64; break;
+  case MVT::f32: retType = WebAssembly::ExprType::F32; break;
+  case MVT::f64: retType = WebAssembly::ExprType::F64; break;
+  case MVT::v16i8: retType = WebAssembly::ExprType::I8x16; break;
+  case MVT::v8i16: retType = WebAssembly::ExprType::I16x8; break;
+  case MVT::v4i32: retType = WebAssembly::ExprType::I32x4; break;
+  case MVT::v4f32: retType = WebAssembly::ExprType::F32x4; break;
+  default: llvm_unreachable("unexpected return type");
+  }
+
+  for (MachineBasicBlock &MBB : reverse(MF)) {
+    for (MachineInstr &MI : reverse(MBB)) {
+      if (MI.isPosition() || MI.isDebugValue())
+        continue;
+      if (MI.getOpcode() == WebAssembly::END_BLOCK) {
+        BlockTops[&MI]->getOperand(0).setImm(int32_t(retType));
+        continue;
+      }
+      if (MI.getOpcode() == WebAssembly::END_LOOP) {
+        LoopTops[&MI]->getOperand(0).setImm(int32_t(retType));
+        continue;
+      }
+      // Something other than an `end`. We're done.
+      return;
+    }
+  }
+}
+
+/// Insert LOOP and BLOCK markers at appropriate places.
+static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
+                         const WebAssemblyInstrInfo &TII,
+                         MachineDominatorTree &MDT,
+                         WebAssemblyFunctionInfo &MFI) {
+  // For each block whose label represents the end of a scope, record the block
+  // which holds the beginning of the scope. This will allow us to quickly skip
+  // over scoped regions when walking blocks. We allocate one more than the
+  // number of blocks in the function to accommodate for the possible fake block
+  // we may insert at the end.
+  SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1);
+
+  // For each LOOP_END, the corresponding LOOP.
+  DenseMap<const MachineInstr *, MachineInstr *> LoopTops;
+
+  // For each END_BLOCK, the corresponding BLOCK.
+  DenseMap<const MachineInstr *, MachineInstr *> BlockTops;
+
+  for (auto &MBB : MF) {
+    // Place the LOOP for MBB if MBB is the header of a loop.
+    PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI);
+
+    // Place the BLOCK for MBB if MBB is branched to from above.
+    PlaceBlockMarker(MBB, MF, ScopeTops, BlockTops, LoopTops, TII, MLI, MDT, MFI);
+  }
+
+  // Now rewrite references to basic blocks to be depth immediates.
+  SmallVector<const MachineBasicBlock *, 8> Stack;
+  for (auto &MBB : reverse(MF)) {
+    for (auto &MI : reverse(MBB)) {
+      switch (MI.getOpcode()) {
+      case WebAssembly::BLOCK:
+        assert(ScopeTops[Stack.back()->getNumber()]->getNumber() <= MBB.getNumber() &&
+               "Block should be balanced");
+        Stack.pop_back();
+        break;
+      case WebAssembly::LOOP:
+        assert(Stack.back() == &MBB && "Loop top should be balanced");
+        Stack.pop_back();
+        break;
+      case WebAssembly::END_BLOCK:
+        Stack.push_back(&MBB);
+        break;
+      case WebAssembly::END_LOOP:
+        Stack.push_back(LoopTops[&MI]->getParent());
+        break;
+      default:
+        if (MI.isTerminator()) {
+          // Rewrite MBB operands to be depth immediates.
+          SmallVector<MachineOperand, 4> Ops(MI.operands());
+          while (MI.getNumOperands() > 0)
+            MI.RemoveOperand(MI.getNumOperands() - 1);
+          for (auto MO : Ops) {
+            if (MO.isMBB())
+              MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB()));
+            MI.addOperand(MF, MO);
+          }
+        }
+        break;
+      }
+    }
+  }
+  assert(Stack.empty() && "Control flow should be balanced");
+
+  // Fix up block/loop signatures at the end of the function to conform to
+  // WebAssembly's rules.
+  FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
+}
+
+bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** CFG Stackifying **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  const auto &MLI = getAnalysis<MachineLoopInfo>();
+  auto &MDT = getAnalysis<MachineDominatorTree>();
+  // Liveness is not tracked for VALUE_STACK physreg.
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  MF.getRegInfo().invalidateLiveness();
+
+  // Sort the blocks, with contiguous loops.
+  SortBlocks(MF, MLI, MDT);
+
+  // Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
+  PlaceMarkers(MF, MLI, TII, MDT, MFI);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
new file mode 100644
index 000000000000..fc0a01ca30e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -0,0 +1,120 @@
+//===-- WebAssemblyCallIndirectFixup.cpp - Fix call_indirects -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file converts pseudo call_indirect instructions into real
+/// call_indirects.
+///
+/// The order of arguments for a call_indirect is the arguments to the function
+/// call, followed by the function pointer. There's no natural way to express
+/// a machineinstr with varargs followed by one more arg, so we express it as
+/// the function pointer followed by varargs, then rewrite it here.
+///
+/// We need to rewrite the order of the arguments on the machineinstrs
+/// themselves so that register stackification knows the order they'll be
+/// executed in.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-call-indirect-fixup"
+
+namespace {
+class WebAssemblyCallIndirectFixup final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly CallIndirect Fixup";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyCallIndirectFixup() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCallIndirectFixup::ID = 0;
+FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
+  return new WebAssemblyCallIndirectFixup();
+}
+
+static unsigned GetNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+    using namespace WebAssembly;
+  case PCALL_INDIRECT_VOID: return CALL_INDIRECT_VOID;
+  case PCALL_INDIRECT_I32: return CALL_INDIRECT_I32;
+  case PCALL_INDIRECT_I64: return CALL_INDIRECT_I64;
+  case PCALL_INDIRECT_F32: return CALL_INDIRECT_F32;
+  case PCALL_INDIRECT_F64: return CALL_INDIRECT_F64;
+  case PCALL_INDIRECT_v16i8: return CALL_INDIRECT_v16i8;
+  case PCALL_INDIRECT_v8i16: return CALL_INDIRECT_v8i16;
+  case PCALL_INDIRECT_v4i32: return CALL_INDIRECT_v4i32;
+  case PCALL_INDIRECT_v4f32: return CALL_INDIRECT_v4f32;
+  default: return INSTRUCTION_LIST_END;
+  }
+}
+
+static bool IsPseudoCallIndirect(const MachineInstr &MI) {
+  return GetNonPseudoCallIndirectOpcode(MI) !=
+         WebAssembly::INSTRUCTION_LIST_END;
+}
+
+bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
+               << MF.getName() << '\n');
+
+  bool Changed = false;
+  const WebAssemblyInstrInfo *TII =
+      MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (IsPseudoCallIndirect(MI)) {
+        DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
+
+        // Rewrite pseudo to non-pseudo
+        const MCInstrDesc &Desc = TII->get(GetNonPseudoCallIndirectOpcode(MI));
+        MI.setDesc(Desc);
+
+        // Rewrite argument order
+        auto Uses = MI.explicit_uses();
+        MachineInstr::mop_iterator it = Uses.begin();
+        const MachineOperand MO = *it;
+
+        // Set up the flags immediate, which currently has no defined flags
+        // so it's always zero.
+        it->ChangeToImmediate(0);
+
+        MI.addOperand(MF, MO);
+
+        DEBUG(dbgs() << "  After transform: " << MI);
+        Changed = true;
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
+
+  return Changed;
+}
+
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
new file mode 100644
index 000000000000..04ede7ff110c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -0,0 +1,308 @@
+//===-- WebAssemblyExplicitLocals.cpp - Make Locals Explicit --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file converts any remaining registers into WebAssembly locals.
+///
+/// After register stackification and register coloring, convert non-stackified
+/// registers into locals, inserting explicit get_local and set_local
+/// instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-explicit-locals"
+
+namespace {
+class WebAssemblyExplicitLocals final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Explicit Locals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyExplicitLocals() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyExplicitLocals::ID = 0;
+FunctionPass *llvm::createWebAssemblyExplicitLocals() {
+  return new WebAssemblyExplicitLocals();
+}
+
+/// Return a local id number for the given register, assigning it a new one
+/// if it doesn't yet have one.
+static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
+                           unsigned &CurLocal, unsigned Reg) {
+  return Reg2Local.insert(std::make_pair(Reg, CurLocal++)).first->second;
+}
+
+/// Get the appropriate get_local opcode for the given register class.
+static unsigned getGetLocalOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::GET_LOCAL_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::GET_LOCAL_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::GET_LOCAL_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::GET_LOCAL_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::GET_LOCAL_V128;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Get the appropriate set_local opcode for the given register class.
+static unsigned getSetLocalOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::SET_LOCAL_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::SET_LOCAL_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::SET_LOCAL_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::SET_LOCAL_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::SET_LOCAL_V128;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Get the appropriate tee_local opcode for the given register class.
+static unsigned getTeeLocalOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::TEE_LOCAL_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::TEE_LOCAL_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::TEE_LOCAL_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::TEE_LOCAL_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::TEE_LOCAL_V128;
+  llvm_unreachable("Unexpected register class");
+}
+
+/// Get the type associated with the given register class.
+static MVT typeForRegClass(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return MVT::i32;
+  if (RC == &WebAssembly::I64RegClass)
+    return MVT::i64;
+  if (RC == &WebAssembly::F32RegClass)
+    return MVT::f32;
+  if (RC == &WebAssembly::F64RegClass)
+    return MVT::f64;
+  llvm_unreachable("unrecognized register class");
+}
+
+/// Given a MachineOperand of a stackified vreg, return the instruction at the
+/// start of the expression tree.
+static MachineInstr *FindStartOfTree(MachineOperand &MO,
+                                     MachineRegisterInfo &MRI,
+                                     WebAssemblyFunctionInfo &MFI) {
+  unsigned Reg = MO.getReg();
+  assert(MFI.isVRegStackified(Reg));
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+
+  // Find the first stackified use and proceed from there.
+  for (MachineOperand &DefMO : Def->explicit_uses()) {
+    if (!DefMO.isReg())
+      continue;
+    return FindStartOfTree(DefMO, MRI, MFI);
+  }
+
+  // If there were no stackified uses, we've reached the start.
+  return Def;
+}
+
+bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Make Locals Explicit **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  // Disable this pass if we aren't doing direct wasm object emission.
+  if (MF.getSubtarget<WebAssemblySubtarget>()
+        .getTargetTriple().isOSBinFormatELF())
+    return false;
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  // Map non-stackified virtual registers to their local ids.
+  DenseMap<unsigned, unsigned> Reg2Local;
+
+  // Handle ARGUMENTS first to ensure that they get the designated numbers.
+  for (MachineBasicBlock::iterator I = MF.begin()->begin(),
+                                   E = MF.begin()->end();
+       I != E;) {
+    MachineInstr &MI = *I++;
+    if (!WebAssembly::isArgument(MI))
+      break;
+    unsigned Reg = MI.getOperand(0).getReg();
+    assert(!MFI.isVRegStackified(Reg));
+    Reg2Local[Reg] = MI.getOperand(1).getImm();
+    MI.eraseFromParent();
+    Changed = true;
+  }
+
+  // Start assigning local numbers after the last parameter.
+  unsigned CurLocal = MFI.getParams().size();
+
+  // Visit each instruction in the function.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
+      MachineInstr &MI = *I++;
+      assert(!WebAssembly::isArgument(MI));
+
+      if (MI.isDebugValue() || MI.isLabel())
+        continue;
+
+      // Replace tee instructions with tee_local. The difference is that tee
+      // instructins have two defs, while tee_local instructions have one def
+      // and an index of a local to write to.
+      if (WebAssembly::isTee(MI)) {
+        assert(MFI.isVRegStackified(MI.getOperand(0).getReg()));
+        assert(!MFI.isVRegStackified(MI.getOperand(1).getReg()));
+        unsigned OldReg = MI.getOperand(2).getReg();
+        const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+
+        // Stackify the input if it isn't stackified yet.
+        if (!MFI.isVRegStackified(OldReg)) {
+          unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+          unsigned NewReg = MRI.createVirtualRegister(RC);
+          unsigned Opc = getGetLocalOpcode(RC);
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg)
+              .addImm(LocalId);
+          MI.getOperand(2).setReg(NewReg);
+          MFI.stackifyVReg(NewReg);
+        }
+
+        // Replace the TEE with a TEE_LOCAL.
+        unsigned LocalId =
+            getLocalId(Reg2Local, CurLocal, MI.getOperand(1).getReg());
+        unsigned Opc = getTeeLocalOpcode(RC);
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc),
+                MI.getOperand(0).getReg())
+            .addImm(LocalId)
+            .addReg(MI.getOperand(2).getReg());
+
+        MI.eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
+      // Insert set_locals for any defs that aren't stackified yet. Currently
+      // we handle at most one def.
+      assert(MI.getDesc().getNumDefs() <= 1);
+      if (MI.getDesc().getNumDefs() == 1) {
+        unsigned OldReg = MI.getOperand(0).getReg();
+        if (!MFI.isVRegStackified(OldReg) && !MRI.use_empty(OldReg)) {
+          unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+          const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+          unsigned NewReg = MRI.createVirtualRegister(RC);
+          auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
+          unsigned Opc = getSetLocalOpcode(RC);
+          BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+              .addImm(LocalId)
+              .addReg(NewReg);
+          MI.getOperand(0).setReg(NewReg);
+          MFI.stackifyVReg(NewReg);
+          Changed = true;
+        }
+      }
+
+      // Insert get_locals for any uses that aren't stackified yet.
+      MachineInstr *InsertPt = &MI;
+      for (MachineOperand &MO : reverse(MI.explicit_uses())) {
+        if (!MO.isReg())
+          continue;
+
+        unsigned OldReg = MO.getReg();
+
+        // If we see a stackified register, prepare to insert subsequent
+        // get_locals before the start of its tree.
+        if (MFI.isVRegStackified(OldReg)) {
+          InsertPt = FindStartOfTree(MO, MRI, MFI);
+          continue;
+        }
+
+        // Insert a get_local.
+        unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+        const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
+        unsigned NewReg = MRI.createVirtualRegister(RC);
+        unsigned Opc = getGetLocalOpcode(RC);
+        InsertPt =
+            BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
+                .addImm(LocalId);
+        MO.setReg(NewReg);
+        MFI.stackifyVReg(NewReg);
+        Changed = true;
+      }
+
+      // Coalesce and eliminate COPY instructions.
+      if (WebAssembly::isCopy(MI)) {
+        MRI.replaceRegWith(MI.getOperand(1).getReg(),
+                           MI.getOperand(0).getReg());
+        MI.eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  // Define the locals.
+  for (size_t i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    auto I = Reg2Local.find(Reg);
+    if (I == Reg2Local.end() || I->second < MFI.getParams().size())
+      continue;
+
+    MFI.addLocal(typeForRegClass(MRI.getRegClass(Reg)));
+    Changed = true;
+  }
+
+#ifndef NDEBUG
+  // Assert that all registers have been stackified at this point.
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (MI.isDebugValue() || MI.isLabel())
+        continue;
+      for (const MachineOperand &MO : MI.explicit_operands()) {
+        assert(
+            (!MO.isReg() || MRI.use_empty(MO.getReg()) ||
+             MFI.isVRegStackified(MO.getReg())) &&
+            "WebAssemblyExplicitLocals failed to stackify a register operand");
+      }
+    }
+  }
+#endif
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
new file mode 100644
index 000000000000..529540ea4ed2
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -0,0 +1,1274 @@
+//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific support for the FastISel
+/// class. Some of the target-specific code is generated by tablegen in the file
+/// WebAssemblyGenFastISel.inc, which is #included here.
+///
+/// TODO: kill flags
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fastisel"
+
+namespace {
+
+class WebAssemblyFastISel final : public FastISel {
+  // All possible address modes.
+  class Address {
+  public:
+    typedef enum { RegBase, FrameIndexBase } BaseKind;
+
+  private:
+    BaseKind Kind;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int64_t Offset;
+
+    const GlobalValue *GV;
+
+  public:
+    // Innocuous defaults for our address.
+    Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    bool isFIBase() const { return Kind == FrameIndexBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setFI(unsigned FI) {
+      assert(isFIBase() && "Invalid base frame index access!");
+      Base.FI = FI;
+    }
+    unsigned getFI() const {
+      assert(isFIBase() && "Invalid base frame index access!");
+      return Base.FI;
+    }
+
+    void setOffset(int64_t Offset_) {
+      assert(Offset_ >= 0 && "Offsets must be non-negative");
+      Offset = Offset_;
+    }
+    int64_t getOffset() const { return Offset; }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() const { return GV; }
+  };
+
+  /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+  /// right decision when generating code for different targets.
+  const WebAssemblySubtarget *Subtarget;
+  LLVMContext *Context;
+
+private:
+  // Utility helper routines
+  MVT::SimpleValueType getSimpleType(Type *Ty) {
+    EVT VT = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+    return VT.isSimple() ? VT.getSimpleVT().SimpleTy :
+                           MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+  MVT::SimpleValueType getLegalType(MVT::SimpleValueType VT) {
+    switch (VT) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      return MVT::i32;
+    case MVT::i32:
+    case MVT::i64:
+    case MVT::f32:
+    case MVT::f64:
+      return VT;
+    case MVT::v16i8:
+    case MVT::v8i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      if (Subtarget->hasSIMD128())
+        return VT;
+      break;
+    default:
+      break;
+    }
+    return MVT::INVALID_SIMPLE_VALUE_TYPE;
+  }
+  bool computeAddress(const Value *Obj, Address &Addr);
+  void materializeLoadStoreOperands(Address &Addr);
+  void addLoadStoreOperands(const Address &Addr, const MachineInstrBuilder &MIB,
+                            MachineMemOperand *MMO);
+  unsigned maskI1Value(unsigned Reg, const Value *V);
+  unsigned getRegForI1Value(const Value *V, bool &Not);
+  unsigned zeroExtendToI32(unsigned Reg, const Value *V,
+                           MVT::SimpleValueType From);
+  unsigned signExtendToI32(unsigned Reg, const Value *V,
+                           MVT::SimpleValueType From);
+  unsigned zeroExtend(unsigned Reg, const Value *V,
+                      MVT::SimpleValueType From,
+                      MVT::SimpleValueType To);
+  unsigned signExtend(unsigned Reg, const Value *V,
+                      MVT::SimpleValueType From,
+                      MVT::SimpleValueType To);
+  unsigned getRegForUnsignedValue(const Value *V);
+  unsigned getRegForSignedValue(const Value *V);
+  unsigned getRegForPromotedValue(const Value *V, bool IsSigned);
+  unsigned notValue(unsigned Reg);
+  unsigned copyValue(unsigned Reg);
+
+  // Backend specific FastISel code.
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  bool fastLowerArguments() override;
+
+  // Selection routines.
+  bool selectCall(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectZExt(const Instruction *I);
+  bool selectSExt(const Instruction *I);
+  bool selectICmp(const Instruction *I);
+  bool selectFCmp(const Instruction *I);
+  bool selectBitCast(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBr(const Instruction *I);
+  bool selectRet(const Instruction *I);
+  bool selectUnreachable(const Instruction *I);
+
+public:
+  // Backend specific FastISel code.
+  WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo,
+                      const TargetLibraryInfo *LibInfo)
+      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
+    Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>();
+    Context = &FuncInfo.Fn->getContext();
+  }
+
+  bool fastSelectInstruction(const Instruction *I) override;
+
+#include "WebAssemblyGenFastISel.inc"
+};
+
+} // end anonymous namespace
+
+bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
+
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(Obj)) {
+    if (Addr.getGlobalValue())
+      return false;
+    Addr.setGlobalValue(GV);
+    return true;
+  }
+
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast: {
+    // Look through bitcasts.
+    return computeAddress(U->getOperand(0), Addr);
+  }
+  case Instruction::IntToPtr: {
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+        TLI.getPointerTy(DL))
+      return computeAddress(U->getOperand(0), Addr);
+    break;
+  }
+  case Instruction::PtrToInt: {
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return computeAddress(U->getOperand(0), Addr);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    Address SavedAddr = Addr;
+    uint64_t TmpOffset = Addr.getOffset();
+    // Non-inbounds geps can wrap; wasm's offsets can't.
+    if (!cast<GEPOperator>(U)->isInBounds())
+      goto unsupported_gep;
+    // Iterate through the GEP folding the constants into offsets where
+    // we can.
+    for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
+         GTI != E; ++GTI) {
+      const Value *Op = GTI.getOperand();
+      if (StructType *STy = GTI.getStructTypeOrNull()) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        TmpOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+        for (;;) {
+          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+            // Constant-offset addressing.
+            TmpOffset += CI->getSExtValue() * S;
+            break;
+          }
+          if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
+            // An unscaled add of a register. Set it as the new base.
+            Addr.setReg(getRegForValue(Op));
+            break;
+          }
+          if (canFoldAddIntoGEP(U, Op)) {
+            // A compatible add with a constant operand. Fold the constant.
+            ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+            TmpOffset += CI->getSExtValue() * S;
+            // Iterate on the other operand.
+            Op = cast<AddOperator>(Op)->getOperand(0);
+            continue;
+          }
+          // Unsupported
+          goto unsupported_gep;
+        }
+      }
+    }
+    // Don't fold in negative offsets.
+    if (int64_t(TmpOffset) >= 0) {
+      // Try to grab the base operand now.
+      Addr.setOffset(TmpOffset);
+      if (computeAddress(U->getOperand(0), Addr))
+        return true;
+    }
+    // We failed, restore everything and try the other options.
+    Addr = SavedAddr;
+  unsupported_gep:
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst *AI = cast<AllocaInst>(Obj);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      Addr.setKind(Address::FrameIndexBase);
+      Addr.setFI(SI->second);
+      return true;
+    }
+    break;
+  }
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (isa<ConstantInt>(LHS))
+      std::swap(LHS, RHS);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      uint64_t TmpOffset = Addr.getOffset() + CI->getSExtValue();
+      if (int64_t(TmpOffset) >= 0) {
+        Addr.setOffset(TmpOffset);
+        return computeAddress(LHS, Addr);
+      }
+    }
+
+    Address Backup = Addr;
+    if (computeAddress(LHS, Addr) && computeAddress(RHS, Addr))
+      return true;
+    Addr = Backup;
+
+    break;
+  }
+  case Instruction::Sub: {
+    // Subs of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      int64_t TmpOffset = Addr.getOffset() - CI->getSExtValue();
+      if (TmpOffset >= 0) {
+        Addr.setOffset(TmpOffset);
+        return computeAddress(LHS, Addr);
+      }
+    }
+    break;
+  }
+  }
+  Addr.setReg(getRegForValue(Obj));
+  return Addr.getReg() != 0;
+}
+
+void WebAssemblyFastISel::materializeLoadStoreOperands(Address &Addr) {
+  if (Addr.isRegBase()) {
+    unsigned Reg = Addr.getReg();
+    if (Reg == 0) {
+      Reg = createResultReg(Subtarget->hasAddr64() ?
+                            &WebAssembly::I64RegClass :
+                            &WebAssembly::I32RegClass);
+      unsigned Opc = Subtarget->hasAddr64() ?
+                     WebAssembly::CONST_I64 :
+                     WebAssembly::CONST_I32;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), Reg)
+         .addImm(0);
+      Addr.setReg(Reg);
+    }
+  }
+}
+
+void WebAssemblyFastISel::addLoadStoreOperands(const Address &Addr,
+                                               const MachineInstrBuilder &MIB,
+                                               MachineMemOperand *MMO) {
+  // Set the alignment operand (this is rewritten in SetP2AlignOperands).
+  // TODO: Disable SetP2AlignOperands for FastISel and just do it here.
+  MIB.addImm(0);
+
+  if (const GlobalValue *GV = Addr.getGlobalValue())
+    MIB.addGlobalAddress(GV, Addr.getOffset());
+  else
+    MIB.addImm(Addr.getOffset());
+
+  if (Addr.isRegBase())
+    MIB.addReg(Addr.getReg());
+  else
+    MIB.addFrameIndex(Addr.getFI());
+
+  MIB.addMemOperand(MMO);
+}
+
+unsigned WebAssemblyFastISel::maskI1Value(unsigned Reg, const Value *V) {
+  return zeroExtendToI32(Reg, V, MVT::i1);
+}
+
+unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
+  if (const ICmpInst *ICmp = dyn_cast<ICmpInst>(V))
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(ICmp->getOperand(1)))
+      if (ICmp->isEquality() && C->isZero() && C->getType()->isIntegerTy(32)) {
+        Not = ICmp->isTrueWhenEqual();
+        return getRegForValue(ICmp->getOperand(0));
+      }
+
+  if (BinaryOperator::isNot(V)) {
+    Not = true;
+    return getRegForValue(BinaryOperator::getNotArgument(V));
+  }
+
+  Not = false;
+  return maskI1Value(getRegForValue(V), V);
+}
+
+unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
+                                              MVT::SimpleValueType From) {
+  if (Reg == 0)
+    return 0;
+
+  switch (From) {
+  case MVT::i1:
+    // If the value is naturally an i1, we don't need to mask it.
+    // TODO: Recursively examine selects, phis, and, or, xor, constants.
+    if (From == MVT::i1 && V != nullptr) {
+      if (isa<CmpInst>(V) ||
+          (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr()))
+        return copyValue(Reg);
+    }
+  case MVT::i8:
+  case MVT::i16:
+    break;
+  case MVT::i32:
+    return copyValue(Reg);
+  default:
+    return 0;
+  }
+
+  unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::CONST_I32), Imm)
+    .addImm(~(~uint64_t(0) << MVT(From).getSizeInBits()));
+
+  unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::AND_I32), Result)
+    .addReg(Reg)
+    .addReg(Imm);
+
+  return Result;
+}
+
+unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V,
+                                              MVT::SimpleValueType From) {
+  if (Reg == 0)
+    return 0;
+
+  switch (From) {
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    break;
+  case MVT::i32:
+    return copyValue(Reg);
+  default:
+    return 0;
+  }
+
+  unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::CONST_I32), Imm)
+    .addImm(32 - MVT(From).getSizeInBits());
+
+  unsigned Left = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::SHL_I32), Left)
+    .addReg(Reg)
+    .addReg(Imm);
+
+  unsigned Right = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::SHR_S_I32), Right)
+    .addReg(Left)
+    .addReg(Imm);
+
+  return Right;
+}
+
+unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
+                                         MVT::SimpleValueType From,
+                                         MVT::SimpleValueType To) {
+  if (To == MVT::i64) {
+    if (From == MVT::i64)
+      return copyValue(Reg);
+
+    Reg = zeroExtendToI32(Reg, V, From);
+
+    unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I64_EXTEND_U_I32), Result)
+        .addReg(Reg);
+    return Result;
+  }
+
+  return zeroExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
+                                         MVT::SimpleValueType From,
+                                         MVT::SimpleValueType To) {
+  if (To == MVT::i64) {
+    if (From == MVT::i64)
+      return copyValue(Reg);
+
+    Reg = signExtendToI32(Reg, V, From);
+
+    unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I64_EXTEND_S_I32), Result)
+        .addReg(Reg);
+    return Result;
+  }
+
+  return signExtendToI32(Reg, V, From);
+}
+
+unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
+  MVT::SimpleValueType From = getSimpleType(V->getType());
+  MVT::SimpleValueType To = getLegalType(From);
+  return zeroExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
+  MVT::SimpleValueType From = getSimpleType(V->getType());
+  MVT::SimpleValueType To = getLegalType(From);
+  return zeroExtend(getRegForValue(V), V, From, To);
+}
+
+unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
+                                                     bool IsSigned) {
+  return IsSigned ? getRegForSignedValue(V) :
+                    getRegForUnsignedValue(V);
+}
+
+unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
+  assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass);
+
+  unsigned NotReg = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::EQZ_I32), NotReg)
+    .addReg(Reg);
+  return NotReg;
+}
+
+unsigned WebAssemblyFastISel::copyValue(unsigned Reg) {
+  unsigned ResultReg = createResultReg(MRI.getRegClass(Reg));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::COPY), ResultReg)
+    .addReg(Reg);
+  return ResultReg;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
+  DenseMap<const AllocaInst *, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+                                         &WebAssembly::I64RegClass :
+                                         &WebAssembly::I32RegClass);
+    unsigned Opc = Subtarget->hasAddr64() ?
+                   WebAssembly::COPY_I64 :
+                   WebAssembly::COPY_I32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addFrameIndex(SI->second);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
+    unsigned ResultReg = createResultReg(Subtarget->hasAddr64() ?
+                                         &WebAssembly::I64RegClass :
+                                         &WebAssembly::I32RegClass);
+    unsigned Opc = Subtarget->hasAddr64() ?
+                   WebAssembly::CONST_I64 :
+                   WebAssembly::CONST_I32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+       .addGlobalAddress(GV);
+    return ResultReg;
+  }
+
+  // Let target-independent code handle it.
+  return 0;
+}
+
+bool WebAssemblyFastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  unsigned i = 0;
+  for (auto const &Arg : F->args()) {
+    const AttributeSet &Attrs = F->getAttributes();
+    if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+        Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+        Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+        Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+        Attrs.hasAttribute(i+1, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+    if (!Subtarget->hasSIMD128() && ArgTy->isVectorTy())
+      return false;
+
+    unsigned Opc;
+    const TargetRegisterClass *RC;
+    switch (getSimpleType(ArgTy)) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      Opc = WebAssembly::ARGUMENT_I32;
+      RC = &WebAssembly::I32RegClass;
+      break;
+    case MVT::i64:
+      Opc = WebAssembly::ARGUMENT_I64;
+      RC = &WebAssembly::I64RegClass;
+      break;
+    case MVT::f32:
+      Opc = WebAssembly::ARGUMENT_F32;
+      RC = &WebAssembly::F32RegClass;
+      break;
+    case MVT::f64:
+      Opc = WebAssembly::ARGUMENT_F64;
+      RC = &WebAssembly::F64RegClass;
+      break;
+    case MVT::v16i8:
+      Opc = WebAssembly::ARGUMENT_v16i8;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v8i16:
+      Opc = WebAssembly::ARGUMENT_v8i16;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4i32:
+      Opc = WebAssembly::ARGUMENT_v4i32;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4f32:
+      Opc = WebAssembly::ARGUMENT_v4f32;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    default:
+      return false;
+    }
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addImm(i);
+    updateValueMap(&Arg, ResultReg);
+
+    ++i;
+  }
+
+  MRI.addLiveIn(WebAssembly::ARGUMENTS);
+
+  auto *MFI = MF->getInfo<WebAssemblyFunctionInfo>();
+  for (auto const &Arg : F->args())
+    MFI->addParam(getLegalType(getSimpleType(Arg.getType())));
+
+  return true;
+}
+
+bool WebAssemblyFastISel::selectCall(const Instruction *I) {
+  const CallInst *Call = cast<CallInst>(I);
+
+  if (Call->isMustTailCall() || Call->isInlineAsm() ||
+      Call->getFunctionType()->isVarArg())
+    return false;
+
+  Function *Func = Call->getCalledFunction();
+  if (Func && Func->isIntrinsic())
+    return false;
+
+  FunctionType *FuncTy = Call->getFunctionType();
+  unsigned Opc;
+  bool IsDirect = Func != nullptr;
+  bool IsVoid = FuncTy->getReturnType()->isVoidTy();
+  unsigned ResultReg;
+  if (IsVoid) {
+    Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::PCALL_INDIRECT_VOID;
+  } else {
+    if (!Subtarget->hasSIMD128() && Call->getType()->isVectorTy())
+      return false;
+
+    MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
+    switch (RetTy) {
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      Opc = IsDirect ? WebAssembly::CALL_I32 : WebAssembly::PCALL_INDIRECT_I32;
+      ResultReg = createResultReg(&WebAssembly::I32RegClass);
+      break;
+    case MVT::i64:
+      Opc = IsDirect ? WebAssembly::CALL_I64 : WebAssembly::PCALL_INDIRECT_I64;
+      ResultReg = createResultReg(&WebAssembly::I64RegClass);
+      break;
+    case MVT::f32:
+      Opc = IsDirect ? WebAssembly::CALL_F32 : WebAssembly::PCALL_INDIRECT_F32;
+      ResultReg = createResultReg(&WebAssembly::F32RegClass);
+      break;
+    case MVT::f64:
+      Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::PCALL_INDIRECT_F64;
+      ResultReg = createResultReg(&WebAssembly::F64RegClass);
+      break;
+    case MVT::v16i8:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v16i8 : WebAssembly::PCALL_INDIRECT_v16i8;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v8i16:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v8i16 : WebAssembly::PCALL_INDIRECT_v8i16;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4i32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4i32 : WebAssembly::PCALL_INDIRECT_v4i32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4f32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::PCALL_INDIRECT_v4f32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    default:
+      return false;
+    }
+  }
+
+  SmallVector<unsigned, 8> Args;
+  for (unsigned i = 0, e = Call->getNumArgOperands(); i < e; ++i) {
+    Value *V = Call->getArgOperand(i);
+    MVT::SimpleValueType ArgTy = getSimpleType(V->getType());
+    if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
+      return false;
+
+    const AttributeSet &Attrs = Call->getAttributes();
+    if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
+        Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
+        Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
+        Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
+        Attrs.hasAttribute(i+1, Attribute::Nest))
+      return false;
+
+    unsigned Reg;
+
+    if (Attrs.hasAttribute(i+1, Attribute::SExt))
+      Reg = getRegForSignedValue(V);
+    else if (Attrs.hasAttribute(i+1, Attribute::ZExt))
+      Reg = getRegForUnsignedValue(V);
+    else
+      Reg = getRegForValue(V);
+
+    if (Reg == 0)
+      return false;
+
+    Args.push_back(Reg);
+  }
+
+  auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+
+  if (!IsVoid)
+    MIB.addReg(ResultReg, RegState::Define);
+
+  if (IsDirect)
+    MIB.addGlobalAddress(Func);
+  else
+    MIB.addReg(getRegForValue(Call->getCalledValue()));
+
+  for (unsigned ArgReg : Args)
+    MIB.addReg(ArgReg);
+
+  if (!IsVoid)
+    updateValueMap(Call, ResultReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
+  const SelectInst *Select = cast<SelectInst>(I);
+
+  bool Not;
+  unsigned CondReg  = getRegForI1Value(Select->getCondition(), Not);
+  if (CondReg == 0)
+    return false;
+
+  unsigned TrueReg  = getRegForValue(Select->getTrueValue());
+  if (TrueReg == 0)
+    return false;
+
+  unsigned FalseReg = getRegForValue(Select->getFalseValue());
+  if (FalseReg == 0)
+    return false;
+
+  if (Not)
+    std::swap(TrueReg, FalseReg);
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  switch (getSimpleType(Select->getType())) {
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    Opc = WebAssembly::SELECT_I32;
+    RC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i64:
+    Opc = WebAssembly::SELECT_I64;
+    RC = &WebAssembly::I64RegClass;
+    break;
+  case MVT::f32:
+    Opc = WebAssembly::SELECT_F32;
+    RC = &WebAssembly::F32RegClass;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::SELECT_F64;
+    RC = &WebAssembly::F64RegClass;
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+    .addReg(TrueReg)
+    .addReg(FalseReg)
+    .addReg(CondReg);
+
+  updateValueMap(Select, ResultReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
+  const TruncInst *Trunc = cast<TruncInst>(I);
+
+  unsigned Reg = getRegForValue(Trunc->getOperand(0));
+  if (Reg == 0)
+    return false;
+
+  if (Trunc->getOperand(0)->getType()->isIntegerTy(64)) {
+    unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::I32_WRAP_I64), Result)
+        .addReg(Reg);
+    Reg = Result;
+  }
+
+  updateValueMap(Trunc, Reg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
+  const ZExtInst *ZExt = cast<ZExtInst>(I);
+
+  const Value *Op = ZExt->getOperand(0);
+  MVT::SimpleValueType From = getSimpleType(Op->getType());
+  MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType()));
+  unsigned Reg = zeroExtend(getRegForValue(Op), Op, From, To);
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(ZExt, Reg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
+  const SExtInst *SExt = cast<SExtInst>(I);
+
+  const Value *Op = SExt->getOperand(0);
+  MVT::SimpleValueType From = getSimpleType(Op->getType());
+  MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType()));
+  unsigned Reg = signExtend(getRegForValue(Op), Op, From, To);
+  if (Reg == 0)
+    return false;
+
+  updateValueMap(SExt, Reg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
+  const ICmpInst *ICmp = cast<ICmpInst>(I);
+
+  bool I32 = getSimpleType(ICmp->getOperand(0)->getType()) != MVT::i64;
+  unsigned Opc;
+  bool isSigned = false;
+  switch (ICmp->getPredicate()) {
+  case ICmpInst::ICMP_EQ:
+    Opc = I32 ? WebAssembly::EQ_I32 : WebAssembly::EQ_I64;
+    break;
+  case ICmpInst::ICMP_NE:
+    Opc = I32 ? WebAssembly::NE_I32 : WebAssembly::NE_I64;
+    break;
+  case ICmpInst::ICMP_UGT:
+    Opc = I32 ? WebAssembly::GT_U_I32 : WebAssembly::GT_U_I64;
+    break;
+  case ICmpInst::ICMP_UGE:
+    Opc = I32 ? WebAssembly::GE_U_I32 : WebAssembly::GE_U_I64;
+    break;
+  case ICmpInst::ICMP_ULT:
+    Opc = I32 ? WebAssembly::LT_U_I32 : WebAssembly::LT_U_I64;
+    break;
+  case ICmpInst::ICMP_ULE:
+    Opc = I32 ? WebAssembly::LE_U_I32 : WebAssembly::LE_U_I64;
+    break;
+  case ICmpInst::ICMP_SGT:
+    Opc = I32 ? WebAssembly::GT_S_I32 : WebAssembly::GT_S_I64;
+    isSigned = true;
+    break;
+  case ICmpInst::ICMP_SGE:
+    Opc = I32 ? WebAssembly::GE_S_I32 : WebAssembly::GE_S_I64;
+    isSigned = true;
+    break;
+  case ICmpInst::ICMP_SLT:
+    Opc = I32 ? WebAssembly::LT_S_I32 : WebAssembly::LT_S_I64;
+    isSigned = true;
+    break;
+  case ICmpInst::ICMP_SLE:
+    Opc = I32 ? WebAssembly::LE_S_I32 : WebAssembly::LE_S_I64;
+    isSigned = true;
+    break;
+  default: return false;
+  }
+
+  unsigned LHS = getRegForPromotedValue(ICmp->getOperand(0), isSigned);
+  if (LHS == 0)
+    return false;
+
+  unsigned RHS = getRegForPromotedValue(ICmp->getOperand(1), isSigned);
+  if (RHS == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(LHS)
+      .addReg(RHS);
+  updateValueMap(ICmp, ResultReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
+  const FCmpInst *FCmp = cast<FCmpInst>(I);
+
+  unsigned LHS = getRegForValue(FCmp->getOperand(0));
+  if (LHS == 0)
+    return false;
+
+  unsigned RHS = getRegForValue(FCmp->getOperand(1));
+  if (RHS == 0)
+    return false;
+
+  bool F32 = getSimpleType(FCmp->getOperand(0)->getType()) != MVT::f64;
+  unsigned Opc;
+  bool Not = false;
+  switch (FCmp->getPredicate()) {
+  case FCmpInst::FCMP_OEQ:
+    Opc = F32 ? WebAssembly::EQ_F32 : WebAssembly::EQ_F64;
+    break;
+  case FCmpInst::FCMP_UNE:
+    Opc = F32 ? WebAssembly::NE_F32 : WebAssembly::NE_F64;
+    break;
+  case FCmpInst::FCMP_OGT:
+    Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+    break;
+  case FCmpInst::FCMP_OGE:
+    Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+    break;
+  case FCmpInst::FCMP_OLT:
+    Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+    break;
+  case FCmpInst::FCMP_OLE:
+    Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+    break;
+  case FCmpInst::FCMP_UGT:
+    Opc = F32 ? WebAssembly::LE_F32 : WebAssembly::LE_F64;
+    Not = true;
+    break;
+  case FCmpInst::FCMP_UGE:
+    Opc = F32 ? WebAssembly::LT_F32 : WebAssembly::LT_F64;
+    Not = true;
+    break;
+  case FCmpInst::FCMP_ULT:
+    Opc = F32 ? WebAssembly::GE_F32 : WebAssembly::GE_F64;
+    Not = true;
+    break;
+  case FCmpInst::FCMP_ULE:
+    Opc = F32 ? WebAssembly::GT_F32 : WebAssembly::GT_F64;
+    Not = true;
+    break;
+  default:
+    return false;
+  }
+
+  unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(LHS)
+      .addReg(RHS);
+
+  if (Not)
+    ResultReg = notValue(ResultReg);
+
+  updateValueMap(FCmp, ResultReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
+  // Target-independent code can handle this, except it doesn't set the dead
+  // flag on the ARGUMENTS clobber, so we have to do that manually in order
+  // to satisfy code that expects this of isBitcast() instructions.
+  EVT VT = TLI.getValueType(DL, I->getOperand(0)->getType());
+  EVT RetVT = TLI.getValueType(DL, I->getType());
+  if (!VT.isSimple() || !RetVT.isSimple())
+    return false;
+
+  if (VT == RetVT) {
+    // No-op bitcast.
+    updateValueMap(I, getRegForValue(I->getOperand(0)));
+    return true;
+  }
+
+  unsigned Reg = fastEmit_ISD_BITCAST_r(VT.getSimpleVT(), RetVT.getSimpleVT(),
+                                        getRegForValue(I->getOperand(0)),
+                                        I->getOperand(0)->hasOneUse());
+  if (!Reg)
+    return false;
+  MachineBasicBlock::iterator Iter = FuncInfo.InsertPt;
+  --Iter;
+  assert(Iter->isBitcast());
+  Iter->setPhysRegsDeadExcept(ArrayRef<unsigned>(), TRI);
+  updateValueMap(I, Reg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
+  const LoadInst *Load = cast<LoadInst>(I);
+  if (Load->isAtomic())
+    return false;
+  if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy())
+    return false;
+
+  Address Addr;
+  if (!computeAddress(Load->getPointerOperand(), Addr))
+    return false;
+
+  // TODO: Fold a following sign-/zero-extend into the load instruction.
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  switch (getSimpleType(Load->getType())) {
+  case MVT::i1:
+  case MVT::i8:
+    Opc = WebAssembly::LOAD8_U_I32;
+    RC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i16:
+    Opc = WebAssembly::LOAD16_U_I32;
+    RC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i32:
+    Opc = WebAssembly::LOAD_I32;
+    RC = &WebAssembly::I32RegClass;
+    break;
+  case MVT::i64:
+    Opc = WebAssembly::LOAD_I64;
+    RC = &WebAssembly::I64RegClass;
+    break;
+  case MVT::f32:
+    Opc = WebAssembly::LOAD_F32;
+    RC = &WebAssembly::F32RegClass;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::LOAD_F64;
+    RC = &WebAssembly::F64RegClass;
+    break;
+  default:
+    return false;
+  }
+
+  materializeLoadStoreOperands(Addr);
+
+  unsigned ResultReg = createResultReg(RC);
+  auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                     ResultReg);
+
+  addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Load));
+
+  updateValueMap(Load, ResultReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectStore(const Instruction *I) {
+  const StoreInst *Store = cast<StoreInst>(I);
+  if (Store->isAtomic())
+    return false;
+  if (!Subtarget->hasSIMD128() &&
+      Store->getValueOperand()->getType()->isVectorTy())
+    return false;
+
+  Address Addr;
+  if (!computeAddress(Store->getPointerOperand(), Addr))
+    return false;
+
+  unsigned Opc;
+  bool VTIsi1 = false;
+  switch (getSimpleType(Store->getValueOperand()->getType())) {
+  case MVT::i1:
+    VTIsi1 = true;
+  case MVT::i8:
+    Opc = WebAssembly::STORE8_I32;
+    break;
+  case MVT::i16:
+    Opc = WebAssembly::STORE16_I32;
+    break;
+  case MVT::i32:
+    Opc = WebAssembly::STORE_I32;
+    break;
+  case MVT::i64:
+    Opc = WebAssembly::STORE_I64;
+    break;
+  case MVT::f32:
+    Opc = WebAssembly::STORE_F32;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::STORE_F64;
+    break;
+  default: return false;
+  }
+
+  materializeLoadStoreOperands(Addr);
+
+  unsigned ValueReg = getRegForValue(Store->getValueOperand());
+  if (ValueReg == 0)
+    return false;
+  if (VTIsi1)
+    ValueReg = maskI1Value(ValueReg, Store->getValueOperand());
+
+  auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+
+  addLoadStoreOperands(Addr, MIB, createMachineMemOperandFor(Store));
+
+  MIB.addReg(ValueReg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectBr(const Instruction *I) {
+  const BranchInst *Br = cast<BranchInst>(I);
+  if (Br->isUnconditional()) {
+    MachineBasicBlock *MSucc = FuncInfo.MBBMap[Br->getSuccessor(0)];
+    fastEmitBranch(MSucc, Br->getDebugLoc());
+    return true;
+  }
+
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[Br->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[Br->getSuccessor(1)];
+
+  bool Not;
+  unsigned CondReg = getRegForI1Value(Br->getCondition(), Not);
+  if (CondReg == 0)
+    return false;
+
+  unsigned Opc = WebAssembly::BR_IF;
+  if (Not)
+    Opc = WebAssembly::BR_UNLESS;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addMBB(TBB)
+      .addReg(CondReg);
+
+  finishCondBranch(Br->getParent(), TBB, FBB);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectRet(const Instruction *I) {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  if (Ret->getNumOperands() == 0) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(WebAssembly::RETURN_VOID));
+    return true;
+  }
+
+  Value *RV = Ret->getOperand(0);
+  if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy())
+    return false;
+
+  unsigned Opc;
+  switch (getSimpleType(RV->getType())) {
+  case MVT::i1: case MVT::i8:
+  case MVT::i16: case MVT::i32:
+    Opc = WebAssembly::RETURN_I32;
+    break;
+  case MVT::i64:
+    Opc = WebAssembly::RETURN_I64;
+    break;
+  case MVT::f32:
+    Opc = WebAssembly::RETURN_F32;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::RETURN_F64;
+    break;
+  case MVT::v16i8:
+    Opc = WebAssembly::RETURN_v16i8;
+    break;
+  case MVT::v8i16:
+    Opc = WebAssembly::RETURN_v8i16;
+    break;
+  case MVT::v4i32:
+    Opc = WebAssembly::RETURN_v4i32;
+    break;
+  case MVT::v4f32:
+    Opc = WebAssembly::RETURN_v4f32;
+    break;
+  default: return false;
+  }
+
+  unsigned Reg;
+  if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::SExt))
+    Reg = getRegForSignedValue(RV);
+  else if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::ZExt))
+    Reg = getRegForUnsignedValue(RV);
+  else
+    Reg = getRegForValue(RV);
+
+  if (Reg == 0)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)).addReg(Reg);
+  return true;
+}
+
+bool WebAssemblyFastISel::selectUnreachable(const Instruction *I) {
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(WebAssembly::UNREACHABLE));
+  return true;
+}
+
+bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Call:
+    if (selectCall(I))
+      return true;
+    break;
+  case Instruction::Select:      return selectSelect(I);
+  case Instruction::Trunc:       return selectTrunc(I);
+  case Instruction::ZExt:        return selectZExt(I);
+  case Instruction::SExt:        return selectSExt(I);
+  case Instruction::ICmp:        return selectICmp(I);
+  case Instruction::FCmp:        return selectFCmp(I);
+  case Instruction::BitCast:     return selectBitCast(I);
+  case Instruction::Load:        return selectLoad(I);
+  case Instruction::Store:       return selectStore(I);
+  case Instruction::Br:          return selectBr(I);
+  case Instruction::Ret:         return selectRet(I);
+  case Instruction::Unreachable: return selectUnreachable(I);
+  default: break;
+  }
+
+  // Fall back to target-independent instruction selection.
+  return selectOperator(I, I->getOpcode());
+}
+
+FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                      const TargetLibraryInfo *LibInfo) {
+  return new WebAssemblyFastISel(FuncInfo, LibInfo);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
new file mode 100644
index 000000000000..2bbf7a2b42f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -0,0 +1,296 @@
+//=- WebAssemblyFixIrreducibleControlFlow.cpp - Fix irreducible control flow -//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that transforms irreducible control flow
+/// into reducible control flow. Irreducible control flow means multiple-entry
+/// loops; they appear as CFG cycles that are not recorded in MachineLoopInfo
+/// due to being unnatural.
+///
+/// Note that LLVM has a generic pass that lowers irreducible control flow, but
+/// it linearizes control flow, turning diamonds into two triangles, which is
+/// both unnecessary and undesirable for WebAssembly.
+///
+/// TODO: The transformation implemented here handles all irreducible control
+/// flow, without exponential code-size expansion, though it does so by creating
+/// inefficient code in many cases. Ideally, we should add other
+/// transformations, including code-duplicating cases, which can be more
+/// efficient in common cases, and they can fall back to this conservative
+/// implementation as needed.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-irreducible-control-flow"
+
+namespace {
+class WebAssemblyFixIrreducibleControlFlow final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Fix Irreducible Control Flow";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  bool VisitLoop(MachineFunction &MF, MachineLoopInfo &MLI, MachineLoop *Loop);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyFixIrreducibleControlFlow() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyFixIrreducibleControlFlow::ID = 0;
+FunctionPass *llvm::createWebAssemblyFixIrreducibleControlFlow() {
+  return new WebAssemblyFixIrreducibleControlFlow();
+}
+
+namespace {
+
+/// A utility for walking the blocks of a loop, handling a nested inner
+/// loop as a monolithic conceptual block.
+class MetaBlock {
+  MachineBasicBlock *Block;
+  SmallVector<MachineBasicBlock *, 2> Preds;
+  SmallVector<MachineBasicBlock *, 2> Succs;
+
+public:
+  explicit MetaBlock(MachineBasicBlock *MBB)
+      : Block(MBB), Preds(MBB->pred_begin(), MBB->pred_end()),
+        Succs(MBB->succ_begin(), MBB->succ_end()) {}
+
+  explicit MetaBlock(MachineLoop *Loop) : Block(Loop->getHeader()) {
+    Loop->getExitBlocks(Succs);
+    for (MachineBasicBlock *Pred : Block->predecessors())
+      if (!Loop->contains(Pred))
+        Preds.push_back(Pred);
+  }
+
+  MachineBasicBlock *getBlock() const { return Block; }
+
+  const SmallVectorImpl<MachineBasicBlock *> &predecessors() const {
+    return Preds;
+  }
+  const SmallVectorImpl<MachineBasicBlock *> &successors() const {
+    return Succs;
+  }
+
+  bool operator==(const MetaBlock &MBB) { return Block == MBB.Block; }
+  bool operator!=(const MetaBlock &MBB) { return Block != MBB.Block; }
+};
+
+class SuccessorList final : public MetaBlock {
+  size_t Index;
+  size_t Num;
+
+public:
+  explicit SuccessorList(MachineBasicBlock *MBB)
+      : MetaBlock(MBB), Index(0), Num(successors().size()) {}
+
+  explicit SuccessorList(MachineLoop *Loop)
+      : MetaBlock(Loop), Index(0), Num(successors().size()) {}
+
+  bool HasNext() const { return Index != Num; }
+
+  MachineBasicBlock *Next() {
+    assert(HasNext());
+    return successors()[Index++];
+  }
+};
+
+} // end anonymous namespace
+
+bool WebAssemblyFixIrreducibleControlFlow::VisitLoop(MachineFunction &MF,
+                                                     MachineLoopInfo &MLI,
+                                                     MachineLoop *Loop) {
+  MachineBasicBlock *Header = Loop ? Loop->getHeader() : &*MF.begin();
+  SetVector<MachineBasicBlock *> RewriteSuccs;
+
+  // DFS through Loop's body, looking for for irreducible control flow. Loop is
+  // natural, and we stay in its body, and we treat any nested loops
+  // monolithically, so any cycles we encounter indicate irreducibility.
+  SmallPtrSet<MachineBasicBlock *, 8> OnStack;
+  SmallPtrSet<MachineBasicBlock *, 8> Visited;
+  SmallVector<SuccessorList, 4> LoopWorklist;
+  LoopWorklist.push_back(SuccessorList(Header));
+  OnStack.insert(Header);
+  Visited.insert(Header);
+  while (!LoopWorklist.empty()) {
+    SuccessorList &Top = LoopWorklist.back();
+    if (Top.HasNext()) {
+      MachineBasicBlock *Next = Top.Next();
+      if (Next == Header || (Loop && !Loop->contains(Next)))
+        continue;
+      if (LLVM_LIKELY(OnStack.insert(Next).second)) {
+        if (!Visited.insert(Next).second) {
+          OnStack.erase(Next);
+          continue;
+        }
+        MachineLoop *InnerLoop = MLI.getLoopFor(Next);
+        if (InnerLoop != Loop)
+          LoopWorklist.push_back(SuccessorList(InnerLoop));
+        else
+          LoopWorklist.push_back(SuccessorList(Next));
+      } else {
+        RewriteSuccs.insert(Top.getBlock());
+      }
+      continue;
+    }
+    OnStack.erase(Top.getBlock());
+    LoopWorklist.pop_back();
+  }
+
+  // Most likely, we didn't find any irreducible control flow.
+  if (LLVM_LIKELY(RewriteSuccs.empty()))
+    return false;
+
+  DEBUG(dbgs() << "Irreducible control flow detected!\n");
+
+  // Ok. We have irreducible control flow! Create a dispatch block which will
+  // contains a jump table to any block in the problematic set of blocks.
+  MachineBasicBlock *Dispatch = MF.CreateMachineBasicBlock();
+  MF.insert(MF.end(), Dispatch);
+  MLI.changeLoopFor(Dispatch, Loop);
+
+  // Add the jump table.
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*Dispatch, Dispatch->end(), DebugLoc(),
+                                    TII.get(WebAssembly::BR_TABLE_I32));
+
+  // Add the register which will be used to tell the jump table which block to
+  // jump to.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+  MIB.addReg(Reg);
+
+  // Collect all the blocks which need to have their successors rewritten,
+  // add the successors to the jump table, and remember their index.
+  DenseMap<MachineBasicBlock *, unsigned> Indices;
+  SmallVector<MachineBasicBlock *, 4> SuccWorklist(RewriteSuccs.begin(),
+                                                   RewriteSuccs.end());
+  while (!SuccWorklist.empty()) {
+    MachineBasicBlock *MBB = SuccWorklist.pop_back_val();
+    auto Pair = Indices.insert(std::make_pair(MBB, 0));
+    if (!Pair.second)
+      continue;
+
+    unsigned Index = MIB.getInstr()->getNumExplicitOperands() - 1;
+    DEBUG(dbgs() << "MBB#" << MBB->getNumber() << " has index " << Index
+                 << "\n");
+
+    Pair.first->second = Index;
+    for (auto Pred : MBB->predecessors())
+      RewriteSuccs.insert(Pred);
+
+    MIB.addMBB(MBB);
+    Dispatch->addSuccessor(MBB);
+
+    MetaBlock Meta(MBB);
+    for (auto *Succ : Meta.successors())
+      if (Succ != Header && (!Loop || Loop->contains(Succ)))
+        SuccWorklist.push_back(Succ);
+  }
+
+  // Rewrite the problematic successors for every block in RewriteSuccs.
+  // For simplicity, we just introduce a new block for every edge we need to
+  // rewrite. Fancier things are possible.
+  for (MachineBasicBlock *MBB : RewriteSuccs) {
+    DenseMap<MachineBasicBlock *, MachineBasicBlock *> Map;
+    for (auto *Succ : MBB->successors()) {
+      if (!Indices.count(Succ))
+        continue;
+
+      MachineBasicBlock *Split = MF.CreateMachineBasicBlock();
+      MF.insert(MBB->isLayoutSuccessor(Succ) ? MachineFunction::iterator(Succ)
+                                             : MF.end(),
+                Split);
+      MLI.changeLoopFor(Split, Loop);
+
+      // Set the jump table's register of the index of the block we wish to
+      // jump to, and jump to the jump table.
+      BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::CONST_I32),
+              Reg)
+          .addImm(Indices[Succ]);
+      BuildMI(*Split, Split->end(), DebugLoc(), TII.get(WebAssembly::BR))
+          .addMBB(Dispatch);
+      Split->addSuccessor(Dispatch);
+      Map[Succ] = Split;
+    }
+    // Remap the terminator operands and the successor list.
+    for (MachineInstr &Term : MBB->terminators())
+      for (auto &Op : Term.explicit_uses())
+        if (Op.isMBB() && Indices.count(Op.getMBB()))
+          Op.setMBB(Map[Op.getMBB()]);
+    for (auto Rewrite : Map)
+      MBB->replaceSuccessor(Rewrite.first, Rewrite.second);
+  }
+
+  // Create a fake default label, because br_table requires one.
+  MIB.addMBB(MIB.getInstr()
+                 ->getOperand(MIB.getInstr()->getNumExplicitOperands() - 1)
+                 .getMBB());
+
+  return true;
+}
+
+bool WebAssemblyFixIrreducibleControlFlow::runOnMachineFunction(
+    MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Fixing Irreducible Control Flow **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  bool Changed = false;
+  auto &MLI = getAnalysis<MachineLoopInfo>();
+
+  // Visit the function body, which is identified as a null loop.
+  Changed |= VisitLoop(MF, MLI, nullptr);
+
+  // Visit all the loops.
+  SmallVector<MachineLoop *, 8> Worklist(MLI.begin(), MLI.end());
+  while (!Worklist.empty()) {
+    MachineLoop *CurLoop = Worklist.pop_back_val();
+    Worklist.append(CurLoop->begin(), CurLoop->end());
+    Changed |= VisitLoop(MF, MLI, CurLoop);
+  }
+
+  // If we made any changes, completely recompute everything.
+  if (LLVM_UNLIKELY(Changed)) {
+    DEBUG(dbgs() << "Recomputing dominators and loops.\n");
+    MF.getRegInfo().invalidateLiveness();
+    MF.RenumberBlocks();
+    getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
+    MLI.runOnMachineFunction(MF);
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
new file mode 100644
index 000000000000..a6a2c0bf06ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -0,0 +1,257 @@
+//===-- WebAssemblyFrameLowering.cpp - WebAssembly Frame Lowering ----------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of
+/// TargetFrameLowering class.
+///
+/// On WebAssembly, there aren't a lot of things to do here. There are no
+/// callee-saved registers to save, and no spill slots.
+///
+/// The stack grows downward.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyFrameLowering.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-frame-info"
+
+// TODO: wasm64
+// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions
+
+/// We need a base pointer in the case of having items on the stack that
+/// require stricter alignment than the stack pointer itself.  Because we need
+/// to shift the stack pointer by some unknown amount to force the alignment,
+/// we need to record the value of the stack pointer on entry to the function.
+bool WebAssemblyFrameLowering::hasBP(
+    const MachineFunction &MF) const {
+  const auto *RegInfo =
+      MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+  return RegInfo->needsStackRealignment(MF);
+}
+
+/// Return true if the specified function should have a dedicated frame pointer
+/// register.
+bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // When we have var-sized objects, we move the stack pointer by an unknown
+  // amount, and need to emit a frame pointer to restore the stack to where we
+  // were on function entry.
+  // If we already need a base pointer, we use that to fix up the stack pointer.
+  // If there are no fixed-size objects, we would have no use of a frame
+  // pointer, and thus should not emit one.
+  bool HasFixedSizedObjects = MFI.getStackSize() > 0;
+  bool NeedsFixedReference = !hasBP(MF) || HasFixedSizedObjects;
+
+  return MFI.isFrameAddressTaken() ||
+         (MFI.hasVarSizedObjects() && NeedsFixedReference) ||
+         MFI.hasStackMap() || MFI.hasPatchPoint();
+}
+
+/// Under normal circumstances, when a frame pointer is not required, we reserve
+/// argument space for call sites in the function immediately on entry to the
+/// current function. This eliminates the need for add/sub sp brackets around
+/// call sites. Returns true if the call frame is included as part of the stack
+/// frame.
+bool WebAssemblyFrameLowering::hasReservedCallFrame(
+    const MachineFunction &MF) const {
+  return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+
+/// Returns true if this function needs a local user-space stack pointer.
+/// Unlike a machine stack pointer, the wasm user stack pointer is a global
+/// variable, so it is loaded into a register in the prolog.
+bool WebAssemblyFrameLowering::needsSP(const MachineFunction &MF,
+                                       const MachineFrameInfo &MFI) const {
+  return MFI.getStackSize() || MFI.adjustsStack() || hasFP(MF);
+}
+
+/// Returns true if the local user-space stack pointer needs to be written back
+/// to memory by this function (this is not meaningful if needsSP is false). If
+/// false, the stack red zone can be used and only a local SP is needed.
+bool WebAssemblyFrameLowering::needsSPWriteback(
+    const MachineFunction &MF, const MachineFrameInfo &MFI) const {
+  assert(needsSP(MF, MFI));
+  return MFI.getStackSize() > RedZoneSize || MFI.hasCalls() ||
+         MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
+                            MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &InsertAddr,
+                            MachineBasicBlock::iterator &InsertStore,
+                            const DebugLoc &DL) {
+  const char *ES = "__stack_pointer";
+  auto *SPSymbol = MF.createExternalSymbolName(ES);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *PtrRC =
+      MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+  unsigned Zero = MRI.createVirtualRegister(PtrRC);
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+      .addImm(0);
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+      MachineMemOperand::MOStore, 4, 4);
+  BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
+      .addImm(2)  // p2align
+      .addExternalSymbol(SPSymbol)
+      .addReg(Zero)
+      .addReg(SrcReg)
+      .addMemOperand(MMO);
+}
+
+MachineBasicBlock::iterator
+WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  assert(!I->getOperand(0).getImm() && (hasFP(MF) || hasBP(MF)) &&
+         "Call frame pseudos should only be used for dynamic stack adjustment");
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  if (I->getOpcode() == TII->getCallFrameDestroyOpcode() &&
+      needsSPWriteback(MF, MF.getFrameInfo())) {
+    DebugLoc DL = I->getDebugLoc();
+    writeSPToMemory(WebAssembly::SP32, MF, MBB, I, I, DL);
+  }
+  return MBB.erase(I);
+}
+
+void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions
+  auto &MFI = MF.getFrameInfo();
+  assert(MFI.getCalleeSavedInfo().empty() &&
+         "WebAssembly should not have callee-saved registers");
+
+  if (!needsSP(MF, MFI)) return;
+  uint64_t StackSize = MFI.getStackSize();
+
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+
+  auto InsertPt = MBB.begin();
+  DebugLoc DL;
+
+  const TargetRegisterClass *PtrRC =
+      MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+  unsigned Zero = MRI.createVirtualRegister(PtrRC);
+  unsigned SPReg = WebAssembly::SP32;
+  if (StackSize)
+    SPReg = MRI.createVirtualRegister(PtrRC);
+  const char *ES = "__stack_pointer";
+  auto *SPSymbol = MF.createExternalSymbolName(ES);
+  BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+      .addImm(0);
+  MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+      MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+      MachineMemOperand::MOLoad, 4, 4);
+  // Load the SP value.
+  BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+      .addImm(2)       // p2align
+      .addExternalSymbol(SPSymbol)
+      .addReg(Zero)    // addr
+      .addMemOperand(LoadMMO);
+
+  bool HasBP = hasBP(MF);
+  if (HasBP) {
+    auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
+    unsigned BasePtr = MRI.createVirtualRegister(PtrRC);
+    FI->setBasePointerVreg(BasePtr);
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), BasePtr)
+        .addReg(SPReg);
+  }
+  if (StackSize) {
+    // Subtract the frame size
+    unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+        .addImm(StackSize);
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
+            WebAssembly::SP32)
+        .addReg(SPReg)
+        .addReg(OffsetReg);
+  }
+  if (HasBP) {
+    unsigned BitmaskReg = MRI.createVirtualRegister(PtrRC);
+    unsigned Alignment = MFI.getMaxAlignment();
+    assert((1u << countTrailingZeros(Alignment)) == Alignment &&
+      "Alignment must be a power of 2");
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), BitmaskReg)
+        .addImm((int)~(Alignment - 1));
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::AND_I32),
+            WebAssembly::SP32)
+        .addReg(WebAssembly::SP32)
+        .addReg(BitmaskReg);
+  }
+  if (hasFP(MF)) {
+    // Unlike most conventional targets (where FP points to the saved FP),
+    // FP points to the bottom of the fixed-size locals, so we can use positive
+    // offsets in load/store instructions.
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY),
+            WebAssembly::FP32)
+        .addReg(WebAssembly::SP32);
+  }
+  if (StackSize && needsSPWriteback(MF, MFI)) {
+    writeSPToMemory(WebAssembly::SP32, MF, MBB, InsertPt, InsertPt, DL);
+  }
+}
+
+void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  auto &MFI = MF.getFrameInfo();
+  uint64_t StackSize = MFI.getStackSize();
+  if (!needsSP(MF, MFI) || !needsSPWriteback(MF, MFI)) return;
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+  auto InsertPt = MBB.getFirstTerminator();
+  DebugLoc DL;
+
+  if (InsertPt != MBB.end())
+    DL = InsertPt->getDebugLoc();
+
+  // Restore the stack pointer. If we had fixed-size locals, add the offset
+  // subtracted in the prolog.
+  unsigned SPReg = 0;
+  MachineBasicBlock::iterator InsertAddr = InsertPt;
+  if (hasBP(MF)) {
+    auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
+    SPReg = FI->getBasePointerVreg();
+  } else if (StackSize) {
+    const TargetRegisterClass *PtrRC =
+        MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+    unsigned OffsetReg = MRI.createVirtualRegister(PtrRC);
+    InsertAddr =
+        BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+            .addImm(StackSize);
+    // In the epilog we don't need to write the result back to the SP32 physreg
+    // because it won't be used again. We can use a stackified register instead.
+    SPReg = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
+        .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
+        .addReg(OffsetReg);
+  } else {
+    SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
+  }
+
+  writeSPToMemory(SPReg, MF, MBB, InsertAddr, InsertPt, DL);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
new file mode 100644
index 000000000000..bf326fce88fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -0,0 +1,57 @@
+// WebAssemblyFrameLowering.h - TargetFrameLowering for WebAssembly -*- C++ -*-/
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This class implements WebAssembly-specific bits of
+/// TargetFrameLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class MachineFrameInfo;
+
+class WebAssemblyFrameLowering final : public TargetFrameLowering {
+ public:
+  /// Size of the red zone for the user stack (leaf functions can use this much
+  /// space below the stack pointer without writing it back to memory).
+  // TODO: (ABI) Revisit and decide how large it should be.
+  static const size_t RedZoneSize = 128;
+
+  WebAssemblyFrameLowering()
+      : TargetFrameLowering(StackGrowsDown, /*StackAlignment=*/16,
+                            /*LocalAreaOffset=*/0,
+                            /*TransientStackAlignment=*/16,
+                            /*StackRealignable=*/true) {}
+
+  MachineBasicBlock::iterator eliminateCallFramePseudoInstr(
+      MachineFunction &MF, MachineBasicBlock &MBB,
+      MachineBasicBlock::iterator I) const override;
+
+  /// These methods insert prolog and epilog code into the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ private:
+  bool hasBP(const MachineFunction &MF) const;
+  bool needsSP(const MachineFunction &MF, const MachineFrameInfo &MFI) const;
+  bool needsSPWriteback(const MachineFunction &MF,
+                        const MachineFrameInfo &MFI) const;
+};
+
+}  // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
new file mode 100644
index 000000000000..2f0f106ef5b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -0,0 +1,25 @@
+//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file describes the various WebAssembly ISD node types.
+///
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+HANDLE_NODETYPE(CALL1)
+HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(RETURN)
+HANDLE_NODETYPE(ARGUMENT)
+HANDLE_NODETYPE(Wrapper)
+HANDLE_NODETYPE(BR_IF)
+HANDLE_NODETYPE(BR_TABLE)
+
+// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
new file mode 100644
index 000000000000..a67137f867e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -0,0 +1,118 @@
+//- WebAssemblyISelDAGToDAG.cpp - A dag to dag inst selector for WebAssembly -//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines an instruction selector for the WebAssembly target.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-isel"
+
+//===--------------------------------------------------------------------===//
+/// WebAssembly-specific code to select WebAssembly machine instructions for
+/// SelectionDAG operations.
+///
+namespace {
+class WebAssemblyDAGToDAGISel final : public SelectionDAGISel {
+  /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+  /// right decision when generating code for different targets.
+  const WebAssemblySubtarget *Subtarget;
+
+  bool ForCodeSize;
+
+public:
+  WebAssemblyDAGToDAGISel(WebAssemblyTargetMachine &tm,
+                          CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {
+  }
+
+  StringRef getPassName() const override {
+    return "WebAssembly Instruction Selection";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    ForCodeSize =
+        MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) ||
+        MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+    Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
+  }
+
+  void Select(SDNode *Node) override;
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                    std::vector<SDValue> &OutOps) override;
+
+// Include the pieces autogenerated from the target description.
+#include "WebAssemblyGenDAGISel.inc"
+
+private:
+  // add select functions here...
+};
+} // end anonymous namespace
+
+void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected.
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return;
+  }
+
+  // Few custom selection stuff.
+  EVT VT = Node->getValueType(0);
+
+  switch (Node->getOpcode()) {
+  default:
+    break;
+    // If we need WebAssembly-specific selection, it would go here.
+    (void)VT;
+  }
+
+  // Select the default instruction.
+  SelectCode(Node);
+}
+
+bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
+  switch (ConstraintID) {
+  case InlineAsm::Constraint_i:
+  case InlineAsm::Constraint_m:
+    // We just support simple memory operands that just have a single address
+    // operand and need no special handling.
+    OutOps.push_back(Op);
+    return false;
+  default:
+    break;
+  }
+
+  return true;
+}
+
+/// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready
+/// for instruction scheduling.
+FunctionPass *llvm::createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
+                                             CodeGenOpt::Level OptLevel) {
+  return new WebAssemblyDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
new file mode 100644
index 000000000000..6a7f75a6b3a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -0,0 +1,707 @@
+//=- WebAssemblyISelLowering.cpp - WebAssembly DAG Lowering Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblyTargetLowering class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyISelLowering.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower"
+
+WebAssemblyTargetLowering::WebAssemblyTargetLowering(
+    const TargetMachine &TM, const WebAssemblySubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
+  auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;
+
+  // Booleans always contain 0 or 1.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // WebAssembly does not produce floating-point exceptions on normal floating
+  // point operations.
+  setHasFloatingPointExceptions(false);
+  // We don't know the microarchitecture here, so just reduce register pressure.
+  setSchedulingPreference(Sched::RegPressure);
+  // Tell ISel that we have a stack pointer.
+  setStackPointerRegisterToSaveRestore(
+      Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32);
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &WebAssembly::I32RegClass);
+  addRegisterClass(MVT::i64, &WebAssembly::I64RegClass);
+  addRegisterClass(MVT::f32, &WebAssembly::F32RegClass);
+  addRegisterClass(MVT::f64, &WebAssembly::F64RegClass);
+  if (Subtarget->hasSIMD128()) {
+    addRegisterClass(MVT::v16i8, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
+  }
+  // Compute derived properties from the register classes.
+  computeRegisterProperties(Subtarget->getRegisterInfo());
+
+  setOperationAction(ISD::GlobalAddress, MVTPtr, Custom);
+  setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom);
+  setOperationAction(ISD::JumpTable, MVTPtr, Custom);
+  setOperationAction(ISD::BlockAddress, MVTPtr, Custom);
+  setOperationAction(ISD::BRIND, MVT::Other, Custom);
+
+  // Take the default expansion for va_arg, va_copy, and va_end. There is no
+  // default action for va_start, so we do that custom.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+  for (auto T : {MVT::f32, MVT::f64}) {
+    // Don't expand the floating-point types to constant pools.
+    setOperationAction(ISD::ConstantFP, T, Legal);
+    // Expand floating-point comparisons.
+    for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE,
+                    ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE})
+      setCondCodeAction(CC, T, Expand);
+    // Expand floating-point library function operators.
+    for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW,
+                    ISD::FREM, ISD::FMA})
+      setOperationAction(Op, T, Expand);
+    // Note supported floating-point library function operators that otherwise
+    // default to expand.
+    for (auto Op :
+         {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT})
+      setOperationAction(Op, T, Legal);
+    // Support minnan and maxnan, which otherwise default to expand.
+    setOperationAction(ISD::FMINNAN, T, Legal);
+    setOperationAction(ISD::FMAXNAN, T, Legal);
+  }
+
+  for (auto T : {MVT::i32, MVT::i64}) {
+    // Expand unavailable integer operations.
+    for (auto Op :
+         {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI,
+          ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS,
+          ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC,
+          ISD::SUBE}) {
+      setOperationAction(Op, T, Expand);
+    }
+  }
+
+  // As a special case, these operators use the type to mean the type to
+  // sign-extend from.
+  for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32})
+    setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
+
+  // Dynamic stack allocation: use the default expansion.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
+
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+  setOperationAction(ISD::CopyToReg, MVT::Other, Custom);
+
+  // Expand these forms; we pattern-match the forms that we can handle in isel.
+  for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+    for (auto Op : {ISD::BR_CC, ISD::SELECT_CC})
+      setOperationAction(Op, T, Expand);
+
+  // We have custom switch handling.
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+  // WebAssembly doesn't have:
+  //  - Floating-point extending loads.
+  //  - Floating-point truncating stores.
+  //  - i1 extending loads.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  for (auto T : MVT::integer_valuetypes())
+    for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
+      setLoadExtAction(Ext, T, MVT::i1, Promote);
+
+  // Trap lowers to wasm unreachable
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+}
+
+FastISel *WebAssemblyTargetLowering::createFastISel(
+    FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const {
+  return WebAssembly::createFastISel(FuncInfo, LibInfo);
+}
+
+bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode * /*GA*/) const {
+  // All offsets can be folded.
+  return true;
+}
+
+MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/,
+                                                      EVT VT) const {
+  unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1);
+  if (BitWidth > 1 && BitWidth < 8) BitWidth = 8;
+
+  if (BitWidth > 64) {
+    // The shift will be lowered to a libcall, and compiler-rt libcalls expect
+    // the count to be an i32.
+    BitWidth = 32;
+    assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) &&
+           "32-bit shift counts ought to be enough for anyone");
+  }
+
+  MVT Result = MVT::getIntegerVT(BitWidth);
+  assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE &&
+         "Unable to represent scalar shift amount type");
+  return Result;
+}
+
+const char *WebAssemblyTargetLowering::getTargetNodeName(
+    unsigned Opcode) const {
+  switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) {
+    case WebAssemblyISD::FIRST_NUMBER:
+      break;
+#define HANDLE_NODETYPE(NODE) \
+  case WebAssemblyISD::NODE:  \
+    return "WebAssemblyISD::" #NODE;
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
+  }
+  return nullptr;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+WebAssemblyTargetLowering::getRegForInlineAsmConstraint(
+    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
+  // First, see if this is a constraint that directly corresponds to a
+  // WebAssembly register class.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      case 'r':
+        assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+        if (Subtarget->hasSIMD128() && VT.isVector()) {
+          if (VT.getSizeInBits() == 128)
+            return std::make_pair(0U, &WebAssembly::V128RegClass);
+        }
+        if (VT.isInteger() && !VT.isVector()) {
+          if (VT.getSizeInBits() <= 32)
+            return std::make_pair(0U, &WebAssembly::I32RegClass);
+          if (VT.getSizeInBits() <= 64)
+            return std::make_pair(0U, &WebAssembly::I64RegClass);
+        }
+        break;
+      default:
+        break;
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
+
+bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const {
+  // Assume ctz is a relatively cheap operation.
+  return true;
+}
+
+bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const {
+  // Assume clz is a relatively cheap operation.
+  return true;
+}
+
+bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                      const AddrMode &AM,
+                                                      Type *Ty,
+                                                      unsigned AS) const {
+  // WebAssembly offsets are added as unsigned without wrapping. The
+  // isLegalAddressingMode gives us no way to determine if wrapping could be
+  // happening, so we approximate this by accepting only non-negative offsets.
+  if (AM.BaseOffs < 0) return false;
+
+  // WebAssembly has no scale register operands.
+  if (AM.Scale != 0) return false;
+
+  // Everything else is legal.
+  return true;
+}
+
+bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
+    EVT /*VT*/, unsigned /*AddrSpace*/, unsigned /*Align*/, bool *Fast) const {
+  // WebAssembly supports unaligned accesses, though it should be declared
+  // with the p2align attribute on loads and stores which do so, and there
+  // may be a performance impact. We tell LLVM they're "fast" because
+  // for the kinds of things that LLVM uses this for (merging adjacent stores
+  // of constants, etc.), WebAssembly implementations will either want the
+  // unaligned access or they'll split anyway.
+  if (Fast) *Fast = true;
+  return true;
+}
+
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  // The current thinking is that wasm engines will perform this optimization,
+  // so we can save on code size.
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Lowering private implementation.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *msg) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  DAG.getContext()->diagnose(
+      DiagnosticInfoUnsupported(*MF.getFunction(), msg, DL.getDebugLoc()));
+}
+
+// Test whether the given calling convention is supported.
+static bool CallingConvSupported(CallingConv::ID CallConv) {
+  // We currently support the language-independent target-independent
+  // conventions. We don't yet have a way to annotate calls with properties like
+  // "cold", and we don't have any call-clobbered registers, so these are mostly
+  // all handled the same.
+  return CallConv == CallingConv::C || CallConv == CallingConv::Fast ||
+         CallConv == CallingConv::Cold ||
+         CallConv == CallingConv::PreserveMost ||
+         CallConv == CallingConv::PreserveAll ||
+         CallConv == CallingConv::CXX_FAST_TLS;
+}
+
+SDValue WebAssemblyTargetLowering::LowerCall(
+    CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc DL = CLI.DL;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto Layout = MF.getDataLayout();
+
+  CallingConv::ID CallConv = CLI.CallConv;
+  if (!CallingConvSupported(CallConv))
+    fail(DL, DAG,
+         "WebAssembly doesn't support language-specific or target-specific "
+         "calling conventions yet");
+  if (CLI.IsPatchPoint)
+    fail(DL, DAG, "WebAssembly doesn't support patch point yet");
+
+  // WebAssembly doesn't currently support explicit tail calls. If they are
+  // required, fail. Otherwise, just disable them.
+  if ((CallConv == CallingConv::Fast && CLI.IsTailCall &&
+       MF.getTarget().Options.GuaranteedTailCallOpt) ||
+      (CLI.CS && CLI.CS->isMustTailCall()))
+    fail(DL, DAG, "WebAssembly doesn't support tail call yet");
+  CLI.IsTailCall = false;
+
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+  if (Ins.size() > 1)
+    fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
+
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    const ISD::OutputArg &Out = Outs[i];
+    SDValue &OutVal = OutVals[i];
+    if (Out.Flags.isNest())
+      fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+    if (Out.Flags.isInAlloca())
+      fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+    if (Out.Flags.isInConsecutiveRegs())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+    if (Out.Flags.isInConsecutiveRegsLast())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+    if (Out.Flags.isByVal() && Out.Flags.getByValSize() != 0) {
+      auto &MFI = MF.getFrameInfo();
+      int FI = MFI.CreateStackObject(Out.Flags.getByValSize(),
+                                     Out.Flags.getByValAlign(),
+                                     /*isSS=*/false);
+      SDValue SizeNode =
+          DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32);
+      SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+      Chain = DAG.getMemcpy(
+          Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getByValAlign(),
+          /*isVolatile*/ false, /*AlwaysInline=*/false,
+          /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
+      OutVal = FINode;
+    }
+  }
+
+  bool IsVarArg = CLI.IsVarArg;
+  unsigned NumFixedArgs = CLI.NumFixedArgs;
+
+  auto PtrVT = getPointerTy(Layout);
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+  if (IsVarArg) {
+    // Outgoing non-fixed arguments are placed in a buffer. First
+    // compute their offsets and the total amount of buffer space needed.
+    for (SDValue Arg :
+         make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+      EVT VT = Arg.getValueType();
+      assert(VT != MVT::iPTR && "Legalized args should be concrete");
+      Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+      unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
+                                             Layout.getABITypeAlignment(Ty));
+      CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
+                                        Offset, VT.getSimpleVT(),
+                                        CCValAssign::Full));
+    }
+  }
+
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+  SDValue FINode;
+  if (IsVarArg && NumBytes) {
+    // For non-fixed arguments, next emit stores to store the argument values
+    // to the stack buffer at the offsets computed above.
+    int FI = MF.getFrameInfo().CreateStackObject(NumBytes,
+                                                 Layout.getStackAlignment(),
+                                                 /*isSS=*/false);
+    unsigned ValNo = 0;
+    SmallVector<SDValue, 8> Chains;
+    for (SDValue Arg :
+         make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+      assert(ArgLocs[ValNo].getValNo() == ValNo &&
+             "ArgLocs should remain in order and only hold varargs args");
+      unsigned Offset = ArgLocs[ValNo++].getLocMemOffset();
+      FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
+      SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, FINode,
+                                DAG.getConstant(Offset, DL, PtrVT));
+      Chains.push_back(DAG.getStore(
+          Chain, DL, Arg, Add,
+          MachinePointerInfo::getFixedStack(MF, FI, Offset), 0));
+    }
+    if (!Chains.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  } else if (IsVarArg) {
+    FINode = DAG.getIntPtrConstant(0, DL);
+  }
+
+  // Compute the operands for the CALLn node.
+  SmallVector<SDValue, 16> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs
+  // isn't reliable.
+  Ops.append(OutVals.begin(),
+             IsVarArg ? OutVals.begin() + NumFixedArgs : OutVals.end());
+  // Add a pointer to the vararg buffer.
+  if (IsVarArg) Ops.push_back(FINode);
+
+  SmallVector<EVT, 8> InTys;
+  for (const auto &In : Ins) {
+    assert(!In.Flags.isByVal() && "byval is not valid for return values");
+    assert(!In.Flags.isNest() && "nest is not valid for return values");
+    if (In.Flags.isInAlloca())
+      fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values");
+    if (In.Flags.isInConsecutiveRegs())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values");
+    if (In.Flags.isInConsecutiveRegsLast())
+      fail(DL, DAG,
+           "WebAssembly hasn't implemented cons regs last return values");
+    // Ignore In.getOrigAlign() because all our arguments are passed in
+    // registers.
+    InTys.push_back(In.VT);
+  }
+  InTys.push_back(MVT::Other);
+  SDVTList InTyList = DAG.getVTList(InTys);
+  SDValue Res =
+      DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
+                  DL, InTyList, Ops);
+  if (Ins.empty()) {
+    Chain = Res;
+  } else {
+    InVals.push_back(Res);
+    Chain = Res.getValue(1);
+  }
+
+  return Chain;
+}
+
+bool WebAssemblyTargetLowering::CanLowerReturn(
+    CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    LLVMContext & /*Context*/) const {
+  // WebAssembly can't currently handle returning tuples.
+  return Outs.size() <= 1;
+}
+
+SDValue WebAssemblyTargetLowering::LowerReturn(
+    SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+    SelectionDAG &DAG) const {
+  assert(Outs.size() <= 1 && "WebAssembly can only return up to one value");
+  if (!CallingConvSupported(CallConv))
+    fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  RetOps.append(OutVals.begin(), OutVals.end());
+  Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps);
+
+  // Record the number and types of the return values.
+  for (const ISD::OutputArg &Out : Outs) {
+    assert(!Out.Flags.isByVal() && "byval is not valid for return values");
+    assert(!Out.Flags.isNest() && "nest is not valid for return values");
+    assert(Out.IsFixed && "non-fixed return value is not valid");
+    if (Out.Flags.isInAlloca())
+      fail(DL, DAG, "WebAssembly hasn't implemented inalloca results");
+    if (Out.Flags.isInConsecutiveRegs())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs results");
+    if (Out.Flags.isInConsecutiveRegsLast())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results");
+  }
+
+  return Chain;
+}
+
+SDValue WebAssemblyTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  if (!CallingConvSupported(CallConv))
+    fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto *MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+
+  // Set up the incoming ARGUMENTS value, which serves to represent the liveness
+  // of the incoming values before they're represented by virtual registers.
+  MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
+
+  for (const ISD::InputArg &In : Ins) {
+    if (In.Flags.isInAlloca())
+      fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
+    if (In.Flags.isNest())
+      fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
+    if (In.Flags.isInConsecutiveRegs())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
+    if (In.Flags.isInConsecutiveRegsLast())
+      fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
+    // Ignore In.getOrigAlign() because all our arguments are passed in
+    // registers.
+    InVals.push_back(
+        In.Used
+            ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT,
+                          DAG.getTargetConstant(InVals.size(), DL, MVT::i32))
+            : DAG.getUNDEF(In.VT));
+
+    // Record the number and types of arguments.
+    MFI->addParam(In.VT);
+  }
+
+  // Varargs are copied into a buffer allocated by the caller, and a pointer to
+  // the buffer is passed as an argument.
+  if (IsVarArg) {
+    MVT PtrVT = getPointerTy(MF.getDataLayout());
+    unsigned VarargVreg =
+        MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrVT));
+    MFI->setVarargBufferVreg(VarargVreg);
+    Chain = DAG.getCopyToReg(
+        Chain, DL, VarargVreg,
+        DAG.getNode(WebAssemblyISD::ARGUMENT, DL, PtrVT,
+                    DAG.getTargetConstant(Ins.size(), DL, MVT::i32)));
+    MFI->addParam(PtrVT);
+  }
+
+  // Record the number and types of results.
+  SmallVector<MVT, 4> Params;
+  SmallVector<MVT, 4> Results;
+  ComputeSignatureVTs(*MF.getFunction(), DAG.getTarget(), Params, Results);
+  for (MVT VT : Results)
+    MFI->addResult(VT);
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//  Custom lowering hooks.
+//===----------------------------------------------------------------------===//
+
+SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("unimplemented operation lowering");
+      return SDValue();
+    case ISD::FrameIndex:
+      return LowerFrameIndex(Op, DAG);
+    case ISD::GlobalAddress:
+      return LowerGlobalAddress(Op, DAG);
+    case ISD::ExternalSymbol:
+      return LowerExternalSymbol(Op, DAG);
+    case ISD::JumpTable:
+      return LowerJumpTable(Op, DAG);
+    case ISD::BR_JT:
+      return LowerBR_JT(Op, DAG);
+    case ISD::VASTART:
+      return LowerVASTART(Op, DAG);
+    case ISD::BlockAddress:
+    case ISD::BRIND:
+      fail(DL, DAG, "WebAssembly hasn't implemented computed gotos");
+      return SDValue();
+    case ISD::RETURNADDR: // Probably nothing meaningful can be returned here.
+      fail(DL, DAG, "WebAssembly hasn't implemented __builtin_return_address");
+      return SDValue();
+    case ISD::FRAMEADDR:
+      return LowerFRAMEADDR(Op, DAG);
+    case ISD::CopyToReg:
+      return LowerCopyToReg(Op, DAG);
+  }
+}
+
+SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(2);
+  if (isa<FrameIndexSDNode>(Src.getNode())) {
+    // CopyToReg nodes don't support FrameIndex operands. Other targets select
+    // the FI to some LEA-like instruction, but since we don't have that, we
+    // need to insert some kind of instruction that can take an FI operand and
+    // produces a value usable by CopyToReg (i.e. in a vreg). So insert a dummy
+    // copy_local between Op and its FI operand.
+    SDValue Chain = Op.getOperand(0);
+    SDLoc DL(Op);
+    unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+    EVT VT = Src.getValueType();
+    SDValue Copy(
+        DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_I32
+                                          : WebAssembly::COPY_I64,
+                           DL, VT, Src),
+        0);
+    return Op.getNode()->getNumValues() == 1
+               ? DAG.getCopyToReg(Chain, DL, Reg, Copy)
+               : DAG.getCopyToReg(Chain, DL, Reg, Copy, Op.getNumOperands() == 4
+                                                            ? Op.getOperand(3)
+                                                            : SDValue());
+  }
+  return SDValue();
+}
+
+SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  int FI = cast<FrameIndexSDNode>(Op)->getIndex();
+  return DAG.getTargetFrameIndex(FI, Op.getValueType());
+}
+
+SDValue WebAssemblyTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // Non-zero depths are not supported by WebAssembly currently. Use the
+  // legalizer's default expansion, which is to return 0 (what this function is
+  // documented to do).
+  if (Op.getConstantOperandVal(0) > 0)
+    return SDValue();
+
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  unsigned FP =
+      Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction());
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), FP, VT);
+}
+
+SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const auto *GA = cast<GlobalAddressSDNode>(Op);
+  EVT VT = Op.getValueType();
+  assert(GA->getTargetFlags() == 0 &&
+         "Unexpected target flags on generic GlobalAddressSDNode");
+  if (GA->getAddressSpace() != 0)
+    fail(DL, DAG, "WebAssembly only expects the 0 address space");
+  return DAG.getNode(
+      WebAssemblyISD::Wrapper, DL, VT,
+      DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset()));
+}
+
+SDValue WebAssemblyTargetLowering::LowerExternalSymbol(
+    SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  const auto *ES = cast<ExternalSymbolSDNode>(Op);
+  EVT VT = Op.getValueType();
+  assert(ES->getTargetFlags() == 0 &&
+         "Unexpected target flags on generic ExternalSymbolSDNode");
+  // Set the TargetFlags to 0x1 which indicates that this is a "function"
+  // symbol rather than a data symbol. We do this unconditionally even though
+  // we don't know anything about the symbol other than its name, because all
+  // external symbols used in target-independent SelectionDAG code are for
+  // functions.
+  return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+                     DAG.getTargetExternalSymbol(ES->getSymbol(), VT,
+                                                 /*TargetFlags=*/0x1));
+}
+
+SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // There's no need for a Wrapper node because we always incorporate a jump
+  // table operand into a BR_TABLE instruction, rather than ever
+  // materializing it in a register.
+  const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(),
+                                JT->getTargetFlags());
+}
+
+SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
+  SDValue Index = Op.getOperand(2);
+  assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags");
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Index);
+
+  MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
+  const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs;
+
+  // Add an operand for each case.
+  for (auto MBB : MBBs) Ops.push_back(DAG.getBasicBlock(MBB));
+
+  // TODO: For now, we just pick something arbitrary for a default case for now.
+  // We really want to sniff out the guard and put in the real default case (and
+  // delete the guard).
+  Ops.push_back(DAG.getBasicBlock(MBBs[0]));
+
+  return DAG.getNode(WebAssemblyISD::BR_TABLE, DL, MVT::Other, Ops);
+}
+
+SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout());
+
+  auto *MFI = DAG.getMachineFunction().getInfo<WebAssemblyFunctionInfo>();
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  SDValue ArgN = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+                                    MFI->getVarargBufferVreg(), PtrVT);
+  return DAG.getStore(Op.getOperand(0), DL, ArgN, Op.getOperand(1),
+                      MachinePointerInfo(SV), 0);
+}
+
+//===----------------------------------------------------------------------===//
+//                          WebAssembly Optimization Hooks
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
new file mode 100644
index 000000000000..5bc723028e63
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -0,0 +1,98 @@
+//- WebAssemblyISelLowering.h - WebAssembly DAG Lowering Interface -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the interfaces that WebAssembly uses to lower LLVM
+/// code into a selection DAG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYISELLOWERING_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+namespace WebAssemblyISD {
+
+enum NodeType : unsigned {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+#define HANDLE_NODETYPE(NODE) NODE,
+#include "WebAssemblyISD.def"
+#undef HANDLE_NODETYPE
+};
+
+}  // end namespace WebAssemblyISD
+
+class WebAssemblySubtarget;
+class WebAssemblyTargetMachine;
+
+class WebAssemblyTargetLowering final : public TargetLowering {
+ public:
+  WebAssemblyTargetLowering(const TargetMachine &TM,
+                            const WebAssemblySubtarget &STI);
+
+ private:
+  /// Keep a pointer to the WebAssemblySubtarget around so that we can make the
+  /// right decision when generating code for different targets.
+  const WebAssemblySubtarget *Subtarget;
+
+  FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                           const TargetLibraryInfo *LibInfo) const override;
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+  MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
+  const char *getTargetNodeName(unsigned Opcode) const override;
+  std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(
+      const TargetRegisterInfo *TRI, StringRef Constraint,
+      MVT VT) const override;
+  bool isCheapToSpeculateCttz() const override;
+  bool isCheapToSpeculateCtlz() const override;
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+  bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
+                                      bool *Fast) const override;
+  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                      SelectionDAG &DAG) const override;
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool IsVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+
+  // Custom lowering hooks.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
+};
+
+namespace WebAssembly {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo);
+}  // end namespace WebAssembly
+
+}  // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
new file mode 100644
index 000000000000..64415658ed81
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -0,0 +1,47 @@
+// WebAssemblyInstrAtomics.td-WebAssembly Atomic codegen support-*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Atomic operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: Implement atomic instructions.
+
+//===----------------------------------------------------------------------===//
+// Atomic fences
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic fences here...
+
+//===----------------------------------------------------------------------===//
+// Atomic loads
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic loads here...
+
+//===----------------------------------------------------------------------===//
+// Atomic stores
+//===----------------------------------------------------------------------===//
+
+// TODO: add atomic stores here...
+
+//===----------------------------------------------------------------------===//
+// Low-level exclusive operations
+//===----------------------------------------------------------------------===//
+
+// TODO: add exclusive operations here...
+
+// Load-exclusives.
+
+// Store-exclusives.
+
+// Store-release-exclusives.
+
+// And clear exclusive.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
new file mode 100644
index 000000000000..047f4be066c0
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -0,0 +1,130 @@
+//===- WebAssemblyInstrCall.td-WebAssembly Call codegen support -*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Call operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: addr64: These currently assume the callee address is 32-bit.
+
+let Defs = [ARGUMENTS] in {
+
+// Call sequence markers. These have an immediate which represents the amount of
+// stack space to allocate or free, which is used for varargs lowering.
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
+                         [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+                       [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
+} // isCodeGenOnly = 1
+
+multiclass CALL<WebAssemblyRegClass vt, string prefix> {
+  def CALL_#vt : I<(outs vt:$dst), (ins function32_op:$callee, variable_ops),
+                   [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
+                   !strconcat(prefix, "call\t$dst, $callee"),
+                   0x10>;
+  let isCodeGenOnly = 1 in {
+    def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
+                              [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
+                              "PSEUDO CALL INDIRECT\t$callee">;
+  } // isCodeGenOnly = 1
+
+  def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins i32imm:$flags, variable_ops),
+                            [],
+                            !strconcat(prefix, "call_indirect\t$dst"),
+                            0x11>;
+}
+
+multiclass SIMD_CALL<ValueType vt, string prefix> {
+  def CALL_#vt : SIMD_I<(outs V128:$dst), (ins function32_op:$callee, variable_ops),
+                         [(set (vt V128:$dst),
+                               (WebAssemblycall1 (i32 imm:$callee)))],
+                         !strconcat(prefix, "call\t$dst, $callee"),
+                         0x10>;
+  let isCodeGenOnly = 1 in {
+    def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+                                    (ins I32:$callee, variable_ops),
+                                    [(set (vt V128:$dst),
+                                          (WebAssemblycall1 I32:$callee))],
+                                    "PSEUDO CALL INDIRECT\t$callee">;
+  } // isCodeGenOnly = 1
+
+  def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+                                  (ins i32imm:$flags, variable_ops),
+                                  [],
+                                  !strconcat(prefix, "call_indirect\t$dst"),
+                                  0x11>;
+}
+
+let Uses = [SP32, SP64], isCall = 1 in {
+  defm : CALL<I32, "i32.">;
+  defm : CALL<I64, "i64.">;
+  defm : CALL<F32, "f32.">;
+  defm : CALL<F64, "f64.">;
+  defm : SIMD_CALL<v16i8, "i8x16.">;
+  defm : SIMD_CALL<v8i16, "i16x8.">;
+  defm : SIMD_CALL<v4i32, "i32x4.">;
+  defm : SIMD_CALL<v4f32, "f32x4.">;
+
+  def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
+                    [(WebAssemblycall0 (i32 imm:$callee))],
+                    "call    \t$callee", 0x10>;
+  let isCodeGenOnly = 1 in {
+    def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
+                      [(WebAssemblycall0 I32:$callee)],
+                      "PSEUDO CALL INDIRECT\t$callee">;
+  } // isCodeGenOnly = 1
+
+  def CALL_INDIRECT_VOID : I<(outs), (ins i32imm:$flags, variable_ops),
+                             [],
+                             "call_indirect\t", 0x11>;
+} // Uses = [SP32,SP64], isCall = 1
+
+} // Defs = [ARGUMENTS]
+
+// Patterns for matching a direct call to a global address.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_I32 tglobaladdr:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_I64 tglobaladdr:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_F32 tglobaladdr:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_F64 tglobaladdr:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v8i16 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
+          (CALL_VOID tglobaladdr:$callee)>;
+
+// Patterns for matching a direct call to an external symbol.
+def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_I32 texternalsym:$callee)>;
+def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_I64 texternalsym:$callee)>;
+def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_F32 texternalsym:$callee)>;
+def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_F64 texternalsym:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v8i16 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
+          (CALL_VOID texternalsym:$callee)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
new file mode 100644
index 000000000000..1146431e6b77
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -0,0 +1,114 @@
+//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly control-flow code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+// The condition operand is a boolean value which WebAssembly represents as i32.
+def BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
+              [(brcond I32:$cond, bb:$dst)],
+               "br_if   \t$dst, $cond", 0x0d>;
+let isCodeGenOnly = 1 in
+def BR_UNLESS : I<(outs), (ins bb_op:$dst, I32:$cond), []>;
+let isBarrier = 1 in {
+def BR   : I<(outs), (ins bb_op:$dst),
+             [(br bb:$dst)],
+             "br      \t$dst", 0x0c>;
+} // isBarrier = 1
+} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
+          (BR_IF bb_op:$dst, I32:$cond)>;
+def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
+          (BR_UNLESS bb_op:$dst, I32:$cond)>;
+
+let Defs = [ARGUMENTS] in {
+
+// TODO: SelectionDAG's lowering insists on using a pointer as the index for
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
+// currently.
+// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates.
+// Set TSFlags{1} to 1 to indicate that the immediates represent labels.
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+def BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
+                     [(WebAssemblybr_table I32:$index)],
+                     "br_table \t$index", 0x0e> {
+  let TSFlags{0} = 1;
+  let TSFlags{1} = 1;
+}
+def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
+                     [(WebAssemblybr_table I64:$index)],
+                     "br_table \t$index"> {
+  let TSFlags{0} = 1;
+  let TSFlags{1} = 1;
+}
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+// Placemarkers to indicate the start or end of a block or loop scope. These
+// use/clobber VALUE_STACK to prevent them from being moved into the middle of
+// an expression tree.
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+def BLOCK     : I<(outs), (ins Signature:$sig), [], "block   \t$sig", 0x02>;
+def LOOP      : I<(outs), (ins Signature:$sig), [], "loop    \t$sig", 0x03>;
+
+// END_BLOCK and END_LOOP are represented with the same opcode in wasm.
+def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
+def END_LOOP  : I<(outs), (ins), [], "end_loop", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
+
+multiclass RETURN<WebAssemblyRegClass vt> {
+  def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)],
+                     "return  \t$val", 0x0f>;
+  // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+  // semantics return by falling off the end of the block.
+  let isCodeGenOnly = 1 in
+  def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
+}
+
+multiclass SIMD_RETURN<ValueType vt> {
+  def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
+                          [(WebAssemblyreturn (vt V128:$val))],
+                          "return  \t$val", 0x0f>;
+  // Equivalent to RETURN_#vt, for use at the end of a function when wasm
+  // semantics return by falling off the end of the block.
+  let isCodeGenOnly = 1 in
+  def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+}
+
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+
+let isReturn = 1 in {
+  defm : RETURN<I32>;
+  defm : RETURN<I64>;
+  defm : RETURN<F32>;
+  defm : RETURN<F64>;
+  defm : SIMD_RETURN<v16i8>;
+  defm : SIMD_RETURN<v8i16>;
+  defm : SIMD_RETURN<v4i32>;
+  defm : SIMD_RETURN<v4f32>;
+
+  def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return", 0x0f>;
+
+  // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
+  let isCodeGenOnly = 1 in
+  def FALLTHROUGH_RETURN_VOID : I<(outs), (ins), []>;
+} // isReturn = 1
+
+def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
new file mode 100644
index 000000000000..29483ba663d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -0,0 +1,111 @@
+//===-- WebAssemblyInstrConv.td-WebAssembly Conversion support -*- tablegen -*-=
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly datatype conversions, truncations, reinterpretations,
+/// promotions, and demotions operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src),
+                      [(set I32:$dst, (trunc I64:$src))],
+                      "i32.wrap/i64\t$dst, $src", 0xa7>;
+
+def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src),
+                          [(set I64:$dst, (sext I32:$src))],
+                          "i64.extend_s/i32\t$dst, $src", 0xac>;
+def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src),
+                         [(set I64:$dst, (zext I32:$src))],
+                         "i64.extend_u/i32\t$dst, $src", 0xad>;
+
+} // defs = [ARGUMENTS]
+
+// Expand a "don't care" extend into zero-extend (chosen over sign-extend
+// somewhat arbitrarily, although it favors popular hardware architectures
+// and is conceptually a simpler operation).
+def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Conversion from floating point to integer traps on overflow and invalid.
+let hasSideEffects = 1 in {
+def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src),
+                        [(set I32:$dst, (fp_to_sint F32:$src))],
+                        "i32.trunc_s/f32\t$dst, $src", 0xa8>;
+def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src),
+                        [(set I32:$dst, (fp_to_uint F32:$src))],
+                        "i32.trunc_u/f32\t$dst, $src", 0xa9>;
+def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src),
+                        [(set I64:$dst, (fp_to_sint F32:$src))],
+                        "i64.trunc_s/f32\t$dst, $src", 0xae>;
+def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src),
+                        [(set I64:$dst, (fp_to_uint F32:$src))],
+                        "i64.trunc_u/f32\t$dst, $src", 0xaf>;
+def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src),
+                        [(set I32:$dst, (fp_to_sint F64:$src))],
+                        "i32.trunc_s/f64\t$dst, $src", 0xaa>;
+def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src),
+                        [(set I32:$dst, (fp_to_uint F64:$src))],
+                        "i32.trunc_u/f64\t$dst, $src", 0xab>;
+def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src),
+                        [(set I64:$dst, (fp_to_sint F64:$src))],
+                        "i64.trunc_s/f64\t$dst, $src", 0xb0>;
+def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src),
+                        [(set I64:$dst, (fp_to_uint F64:$src))],
+                        "i64.trunc_u/f64\t$dst, $src", 0xb1>;
+} // hasSideEffects = 1
+
+def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src),
+                          [(set F32:$dst, (sint_to_fp I32:$src))],
+                          "f32.convert_s/i32\t$dst, $src", 0xb2>;
+def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src),
+                          [(set F32:$dst, (uint_to_fp I32:$src))],
+                          "f32.convert_u/i32\t$dst, $src", 0xb3>;
+def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src),
+                          [(set F64:$dst, (sint_to_fp I32:$src))],
+                          "f64.convert_s/i32\t$dst, $src", 0xb7>;
+def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src),
+                          [(set F64:$dst, (uint_to_fp I32:$src))],
+                          "f64.convert_u/i32\t$dst, $src", 0xb8>;
+def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src),
+                          [(set F32:$dst, (sint_to_fp I64:$src))],
+                          "f32.convert_s/i64\t$dst, $src", 0xb4>;
+def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src),
+                          [(set F32:$dst, (uint_to_fp I64:$src))],
+                          "f32.convert_u/i64\t$dst, $src", 0xb5>;
+def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src),
+                          [(set F64:$dst, (sint_to_fp I64:$src))],
+                          "f64.convert_s/i64\t$dst, $src", 0xb9>;
+def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src),
+                          [(set F64:$dst, (uint_to_fp I64:$src))],
+                          "f64.convert_u/i64\t$dst, $src", 0xba>;
+
+def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src),
+                        [(set F64:$dst, (fpextend F32:$src))],
+                        "f64.promote/f32\t$dst, $src", 0xbb>;
+def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src),
+                       [(set F32:$dst, (fpround F64:$src))],
+                       "f32.demote/f64\t$dst, $src", 0xb6>;
+
+def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src),
+                            [(set I32:$dst, (bitconvert F32:$src))],
+                            "i32.reinterpret/f32\t$dst, $src", 0xbc>;
+def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src),
+                            [(set F32:$dst, (bitconvert I32:$src))],
+                            "f32.reinterpret/i32\t$dst, $src", 0xbe>;
+def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src),
+                            [(set I64:$dst, (bitconvert F64:$src))],
+                            "i64.reinterpret/f64\t$dst, $src", 0xbd>;
+def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src),
+                            [(set F64:$dst, (bitconvert I64:$src))],
+                            "f64.reinterpret/i64\t$dst, $src", 0xbf>;
+
+} // Defs = [ARGUMENTS]
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
new file mode 100644
index 000000000000..030be0862a56
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -0,0 +1,101 @@
+// WebAssemblyInstrFloat.td-WebAssembly Float codegen support ---*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Floating-point operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in
+defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
+defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
+let isCommutable = 1 in
+defm MUL : BinaryFP<fmul, "mul ", 0x94, 0xa2>;
+defm DIV : BinaryFP<fdiv, "div ", 0x95, 0xa3>;
+defm SQRT : UnaryFP<fsqrt, "sqrt", 0x91, 0x9f>;
+
+defm ABS : UnaryFP<fabs, "abs ", 0x8b, 0x99>;
+defm NEG : UnaryFP<fneg, "neg ", 0x8c, 0x9a>;
+defm COPYSIGN : BinaryFP<fcopysign, "copysign", 0x98, 0xa6>;
+
+let isCommutable = 1 in {
+defm MIN : BinaryFP<fminnan, "min ", 0x96, 0xa4>;
+defm MAX : BinaryFP<fmaxnan, "max ", 0x97, 0xa5>;
+} // isCommutable = 1
+
+defm CEIL : UnaryFP<fceil, "ceil", 0x8d, 0x9b>;
+defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
+defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
+defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
+
+} // Defs = [ARGUMENTS]
+
+// DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
+def : Pat<(fcopysign F64:$lhs, F32:$rhs),
+          (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
+def : Pat<(fcopysign F32:$lhs, F64:$rhs),
+          (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>;
+
+// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint.
+def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
+def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
+
+let Defs = [ARGUMENTS] in {
+
+let isCommutable = 1 in {
+defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
+defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
+} // isCommutable = 1
+defm LT : ComparisonFP<SETOLT, "lt  ", 0x5d, 0x63>;
+defm LE : ComparisonFP<SETOLE, "le  ", 0x5e, 0x64>;
+defm GT : ComparisonFP<SETOGT, "gt  ", 0x5f, 0x65>;
+defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
+
+} // Defs = [ARGUMENTS]
+
+// Don't care floating-point comparisons, supported via other comparisons.
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>;
+def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
+def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
+                   [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
+                   "f32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+def SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
+                   [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
+                   "f64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs),
+          (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs),
+          (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
+          (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
+          (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
new file mode 100644
index 000000000000..5b2498402571
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -0,0 +1,102 @@
+//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly instruction format definitions.
+///
+//===----------------------------------------------------------------------===//
+
+// WebAssembly Instruction Format.
+class WebAssemblyInst<bits<32> inst, string asmstr> : Instruction {
+  field bits<32> Inst = inst; // Instruction encoding.
+  let Namespace   = "WebAssembly";
+  let Pattern     = [];
+  let AsmString   = asmstr;
+}
+
+// Normal instructions.
+class I<dag oops, dag iops, list<dag> pattern, string asmstr = "", bits<32> inst = -1>
+    : WebAssemblyInst<inst, asmstr> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let Pattern        = pattern;
+}
+
+class SIMD_I<dag oops, dag iops, list<dag> pattern,
+             string asmstr = "", bits<32> inst = -1>
+    : I<oops, iops, pattern, asmstr, inst>, Requires<[HasSIMD128]>;
+
+// Unary and binary instructions, for the local types that WebAssembly supports.
+multiclass UnaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
+  def _I32 : I<(outs I32:$dst), (ins I32:$src),
+               [(set I32:$dst, (node I32:$src))],
+               !strconcat("i32.", !strconcat(name, "\t$dst, $src")), i32Inst>;
+  def _I64 : I<(outs I64:$dst), (ins I64:$src),
+               [(set I64:$dst, (node I64:$src))],
+               !strconcat("i64.", !strconcat(name, "\t$dst, $src")), i64Inst>;
+}
+multiclass BinaryInt<SDNode node, string name, bits<32> i32Inst, bits<32> i64Inst> {
+  def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+               [(set I32:$dst, (node I32:$lhs, I32:$rhs))],
+               !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), i32Inst>;
+  def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs),
+               [(set I64:$dst, (node I64:$lhs, I64:$rhs))],
+               !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), i64Inst>;
+}
+multiclass UnaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
+  def _F32 : I<(outs F32:$dst), (ins F32:$src),
+               [(set F32:$dst, (node F32:$src))],
+               !strconcat("f32.", !strconcat(name, "\t$dst, $src")), f32Inst>;
+  def _F64 : I<(outs F64:$dst), (ins F64:$src),
+               [(set F64:$dst, (node F64:$src))],
+               !strconcat("f64.", !strconcat(name, "\t$dst, $src")), f64Inst>;
+}
+multiclass BinaryFP<SDNode node, string name, bits<32> f32Inst, bits<32> f64Inst> {
+  def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs),
+               [(set F32:$dst, (node F32:$lhs, F32:$rhs))],
+               !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")), f32Inst>;
+  def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs),
+               [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
+               !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")), f64Inst>;
+}
+multiclass SIMDBinary<SDNode node, SDNode fnode, string name> {
+  def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+                      !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+
+}
+multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32> i64Inst> {
+  def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
+               [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
+               !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+               i32Inst>;
+  def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs),
+               [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))],
+               !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+               i64Inst>;
+}
+multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f64Inst> {
+  def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs),
+               [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))],
+               !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+               f32Inst>;
+  def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs),
+               [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))],
+               !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs")),
+               f64Inst>;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
new file mode 100644
index 000000000000..0e2d8bbaf64c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -0,0 +1,204 @@
+//===-- WebAssemblyInstrInfo.cpp - WebAssembly Instruction Information ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyInstrInfo.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "WebAssemblyGenInstrInfo.inc"
+
+WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
+    : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN,
+                              WebAssembly::ADJCALLSTACKUP),
+      RI(STI.getTargetTriple()) {}
+
+bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
+    const MachineInstr &MI, AliasAnalysis *AA) const {
+  switch (MI.getOpcode()) {
+  case WebAssembly::CONST_I32:
+  case WebAssembly::CONST_I64:
+  case WebAssembly::CONST_F32:
+  case WebAssembly::CONST_F64:
+    // isReallyTriviallyReMaterializableGeneric misses these because of the
+    // ARGUMENTS implicit def, so we manualy override it here.
+    return true;
+  default:
+    return false;
+  }
+}
+
+void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       const DebugLoc &DL, unsigned DestReg,
+                                       unsigned SrcReg, bool KillSrc) const {
+  // This method is called by post-RA expansion, which expects only pregs to
+  // exist. However we need to handle both here.
+  auto &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      TargetRegisterInfo::isVirtualRegister(DestReg)
+          ? MRI.getRegClass(DestReg)
+          : MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(DestReg);
+
+  unsigned CopyOpcode;
+  if (RC == &WebAssembly::I32RegClass)
+    CopyOpcode = WebAssembly::COPY_I32;
+  else if (RC == &WebAssembly::I64RegClass)
+    CopyOpcode = WebAssembly::COPY_I64;
+  else if (RC == &WebAssembly::F32RegClass)
+    CopyOpcode = WebAssembly::COPY_F32;
+  else if (RC == &WebAssembly::F64RegClass)
+    CopyOpcode = WebAssembly::COPY_F64;
+  else
+    llvm_unreachable("Unexpected register class");
+
+  BuildMI(MBB, I, DL, get(CopyOpcode), DestReg)
+      .addReg(SrcReg, KillSrc ? RegState::Kill : 0);
+}
+
+MachineInstr *
+WebAssemblyInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                             unsigned OpIdx1,
+                                             unsigned OpIdx2) const {
+  // If the operands are stackified, we can't reorder them.
+  WebAssemblyFunctionInfo &MFI =
+      *MI.getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+  if (MFI.isVRegStackified(MI.getOperand(OpIdx1).getReg()) ||
+      MFI.isVRegStackified(MI.getOperand(OpIdx2).getReg()))
+    return nullptr;
+
+  // Otherwise use the default implementation.
+  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+// Branch analysis.
+bool WebAssemblyInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                         MachineBasicBlock *&TBB,
+                                         MachineBasicBlock *&FBB,
+                                         SmallVectorImpl<MachineOperand> &Cond,
+                                         bool /*AllowModify*/) const {
+  bool HaveCond = false;
+  for (MachineInstr &MI : MBB.terminators()) {
+    switch (MI.getOpcode()) {
+    default:
+      // Unhandled instruction; bail out.
+      return true;
+    case WebAssembly::BR_IF:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(0).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(true));
+      Cond.push_back(MI.getOperand(1));
+      TBB = MI.getOperand(0).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR_UNLESS:
+      if (HaveCond)
+        return true;
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(0).isMBB())
+        return true;
+      Cond.push_back(MachineOperand::CreateImm(false));
+      Cond.push_back(MI.getOperand(1));
+      TBB = MI.getOperand(0).getMBB();
+      HaveCond = true;
+      break;
+    case WebAssembly::BR:
+      // If we're running after CFGStackify, we can't optimize further.
+      if (!MI.getOperand(0).isMBB())
+        return true;
+      if (!HaveCond)
+        TBB = MI.getOperand(0).getMBB();
+      else
+        FBB = MI.getOperand(0).getMBB();
+      break;
+    }
+    if (MI.isBarrier())
+      break;
+  }
+
+  return false;
+}
+
+unsigned WebAssemblyInstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                            int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::instr_iterator I = MBB.instr_end();
+  unsigned Count = 0;
+
+  while (I != MBB.instr_begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isTerminator())
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.instr_end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned WebAssemblyInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                            MachineBasicBlock *TBB,
+                                            MachineBasicBlock *FBB,
+                                            ArrayRef<MachineOperand> Cond,
+                                            const DebugLoc &DL,
+                                            int *BytesAdded) const {
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    if (!TBB)
+      return 0;
+
+    BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB);
+    return 1;
+  }
+
+  assert(Cond.size() == 2 && "Expected a flag and a successor block");
+
+  if (Cond[0].getImm()) {
+    BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
+  } else {
+    BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
+        .addMBB(TBB)
+        .addOperand(Cond[1]);
+  }
+  if (!FBB)
+    return 1;
+
+  BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB);
+  return 2;
+}
+
+bool WebAssemblyInstrInfo::reverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Expected a flag and a successor block");
+  Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm());
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
new file mode 100644
index 000000000000..df6c937a364b
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -0,0 +1,63 @@
+//=- WebAssemblyInstrInfo.h - WebAssembly Instruction Information -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetInstrInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYINSTRINFO_H
+
+#include "WebAssemblyRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "WebAssemblyGenInstrInfo.inc"
+
+namespace llvm {
+
+class WebAssemblySubtarget;
+
+class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo {
+  const WebAssemblyRegisterInfo RI;
+
+public:
+  explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI);
+
+  const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+                                         AliasAnalysis *AA) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned OpIdx1,
+                                       unsigned OpIdx2) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override;
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
new file mode 100644
index 000000000000..dcfd1a42c6aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -0,0 +1,222 @@
+// WebAssemblyInstrInfo.td-Describe the WebAssembly Instructions-*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Instruction definitions.
+///
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+
+def HasAddr32 : Predicate<"!Subtarget->hasAddr64()">;
+def HasAddr64 : Predicate<"Subtarget->hasAddr64()">;
+def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
+                           AssemblerPredicate<"FeatureSIMD128", "simd128">;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific DAG Node Types.
+//===----------------------------------------------------------------------===//
+
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqEnd :
+    SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_WebAssemblyCall0    : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyCall1    : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>;
+def SDT_WebAssemblyBrTable  : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>;
+def SDT_WebAssemblyReturn   : SDTypeProfile<0, -1, []>;
+def SDT_WebAssemblyWrapper  : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                   SDTCisPtrTy<0>]>;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def WebAssemblycallseq_start :
+    SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart,
+           [SDNPHasChain, SDNPOutGlue]>;
+def WebAssemblycallseq_end :
+    SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
+                              SDT_WebAssemblyCall0,
+                              [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
+                              SDT_WebAssemblyCall1,
+                              [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
+                                 SDT_WebAssemblyBrTable,
+                                 [SDNPHasChain, SDNPVariadic]>;
+def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT",
+                                 SDT_WebAssemblyArgument>;
+def WebAssemblyreturn   : SDNode<"WebAssemblyISD::RETURN",
+                                 SDT_WebAssemblyReturn, [SDNPHasChain]>;
+def WebAssemblywrapper  : SDNode<"WebAssemblyISD::Wrapper",
+                                 SDT_WebAssemblyWrapper>;
+
+//===----------------------------------------------------------------------===//
+// WebAssembly-specific Operands.
+//===----------------------------------------------------------------------===//
+
+let OperandNamespace = "WebAssembly" in {
+
+let OperandType = "OPERAND_BASIC_BLOCK" in
+def bb_op : Operand<OtherVT>;
+
+let OperandType = "OPERAND_LOCAL" in
+def local_op : Operand<i32>;
+
+let OperandType = "OPERAND_I32IMM" in
+def i32imm_op : Operand<i32>;
+
+let OperandType = "OPERAND_I64IMM" in
+def i64imm_op : Operand<i64>;
+
+let OperandType = "OPERAND_F32IMM" in
+def f32imm_op : Operand<f32>;
+
+let OperandType = "OPERAND_F64IMM" in
+def f64imm_op : Operand<f64>;
+
+let OperandType = "OPERAND_FUNCTION32" in
+def function32_op : Operand<i32>;
+
+let OperandType = "OPERAND_OFFSET32" in
+def offset32_op : Operand<i32>;
+
+let OperandType = "OPERAND_P2ALIGN" in {
+def P2Align : Operand<i32> {
+  let PrintMethod = "printWebAssemblyP2AlignOperand";
+}
+} // OperandType = "OPERAND_P2ALIGN"
+
+let OperandType = "OPERAND_SIGNATURE" in {
+def Signature : Operand<i32> {
+  let PrintMethod = "printWebAssemblySignatureOperand";
+}
+} // OperandType = "OPERAND_SIGNATURE"
+
+} // OperandNamespace = "WebAssembly"
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Instruction Format Definitions.
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Additional instructions.
+//===----------------------------------------------------------------------===//
+
+multiclass ARGUMENT<WebAssemblyRegClass vt> {
+  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
+                       [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+}
+multiclass SIMD_ARGUMENT<ValueType vt> {
+  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+                            [(set (vt V128:$res),
+                                  (WebAssemblyargument timm:$argno))]>;
+}
+defm : ARGUMENT<I32>;
+defm : ARGUMENT<I64>;
+defm : ARGUMENT<F32>;
+defm : ARGUMENT<F64>;
+defm : SIMD_ARGUMENT<v16i8>;
+defm : SIMD_ARGUMENT<v8i16>;
+defm : SIMD_ARGUMENT<v4i32>;
+defm : SIMD_ARGUMENT<v4f32>;
+
+let Defs = [ARGUMENTS] in {
+
+// get_local and set_local are not generated by instruction selection; they
+// are implied by virtual register uses and defs.
+multiclass LOCAL<WebAssemblyRegClass vt> {
+let hasSideEffects = 0 in {
+  // COPY is not an actual instruction in wasm, but since we allow get_local and
+  // set_local to be implicit during most of codegen, we can have a COPY which
+  // is actually a no-op because all the work is done in the implied get_local
+  // and set_local. COPYs are eliminated (and replaced with
+  // get_local/set_local) in the ExplicitLocals pass.
+  let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+  def COPY_#vt : I<(outs vt:$res), (ins vt:$src), [], "copy_local\t$res, $src">;
+
+  // TEE is similar to COPY, but writes two copies of its result. Typically
+  // this would be used to stackify one result and write the other result to a
+  // local.
+  let isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+  def TEE_#vt : I<(outs vt:$res, vt:$also), (ins vt:$src), [],
+                  "tee_local\t$res, $also, $src">;
+
+  // This is the actual get_local instruction in wasm. These are made explicit
+  // by the ExplicitLocals pass. It has mayLoad because it reads from a wasm
+  // local, which is a side effect not otherwise modeled in LLVM.
+  let mayLoad = 1, isAsCheapAsAMove = 1 in
+  def GET_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local), [],
+                        "get_local\t$res, $local", 0x20>;
+
+  // This is the actual set_local instruction in wasm. These are made explicit
+  // by the ExplicitLocals pass. It has mayStore because it writes to a wasm
+  // local, which is a side effect not otherwise modeled in LLVM.
+  let mayStore = 1, isAsCheapAsAMove = 1 in
+  def SET_LOCAL_#vt : I<(outs), (ins local_op:$local, vt:$src), [],
+                        "set_local\t$local, $src", 0x21>;
+
+  // This is the actual tee_local instruction in wasm. TEEs are turned into
+  // TEE_LOCALs by the ExplicitLocals pass. It has mayStore for the same reason
+  // as SET_LOCAL.
+  let mayStore = 1, isAsCheapAsAMove = 1 in
+  def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
+                         "tee_local\t$res, $local, $src", 0x22>;
+
+} // hasSideEffects = 0
+}
+defm : LOCAL<I32>;
+defm : LOCAL<I64>;
+defm : LOCAL<F32>;
+defm : LOCAL<F64>;
+defm : LOCAL<V128>, Requires<[HasSIMD128]>;
+
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+def CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
+                  [(set I32:$res, imm:$imm)],
+                  "i32.const\t$res, $imm", 0x41>;
+def CONST_I64 : I<(outs I64:$res), (ins i64imm_op:$imm),
+                  [(set I64:$res, imm:$imm)],
+                  "i64.const\t$res, $imm", 0x42>;
+def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm),
+                  [(set F32:$res, fpimm:$imm)],
+                  "f32.const\t$res, $imm", 0x43>;
+def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
+                  [(set F64:$res, fpimm:$imm)],
+                  "f64.const\t$res, $imm", 0x44>;
+} // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
+          (CONST_I32 tglobaladdr:$addr)>;
+def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
+          (CONST_I32 texternalsym:$addr)>;
+
+//===----------------------------------------------------------------------===//
+// Additional sets of instructions.
+//===----------------------------------------------------------------------===//
+
+include "WebAssemblyInstrMemory.td"
+include "WebAssemblyInstrCall.td"
+include "WebAssemblyInstrControl.td"
+include "WebAssemblyInstrInteger.td"
+include "WebAssemblyInstrConv.td"
+include "WebAssemblyInstrFloat.td"
+include "WebAssemblyInstrAtomics.td"
+include "WebAssemblyInstrSIMD.td"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
new file mode 100644
index 000000000000..8a3248ee669e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -0,0 +1,97 @@
+// WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Integer operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let Defs = [ARGUMENTS] in {
+
+// The spaces after the names are for aesthetic purposes only, to make
+// operands line up vertically after tab expansion.
+let isCommutable = 1 in
+defm ADD : BinaryInt<add, "add ", 0x6a, 0x7c>;
+defm SUB : BinaryInt<sub, "sub ", 0x6b, 0x7d>;
+let isCommutable = 1 in
+defm MUL : BinaryInt<mul, "mul ", 0x6c, 0x7e>;
+// Divide and remainder trap on a zero denominator.
+let hasSideEffects = 1 in {
+defm DIV_S : BinaryInt<sdiv, "div_s", 0x6d, 0x7f>;
+defm DIV_U : BinaryInt<udiv, "div_u", 0x6e, 0x80>;
+defm REM_S : BinaryInt<srem, "rem_s", 0x6f, 0x81>;
+defm REM_U : BinaryInt<urem, "rem_u", 0x70, 0x82>;
+} // hasSideEffects = 1
+let isCommutable = 1 in {
+defm AND : BinaryInt<and, "and ", 0x71, 0x83>;
+defm OR : BinaryInt<or, "or  ", 0x72, 0x84>;
+defm XOR : BinaryInt<xor, "xor ", 0x73, 0x85>;
+} // isCommutable = 1
+defm SHL : BinaryInt<shl, "shl ", 0x74, 0x86>;
+defm SHR_S : BinaryInt<sra, "shr_s", 0x75, 0x87>;
+defm SHR_U : BinaryInt<srl, "shr_u", 0x76, 0x88>;
+defm ROTL : BinaryInt<rotl, "rotl", 0x77, 0x89>;
+defm ROTR : BinaryInt<rotr, "rotr", 0x78, 0x8a>;
+
+let isCommutable = 1 in {
+defm EQ : ComparisonInt<SETEQ, "eq  ", 0x46, 0x68>;
+defm NE : ComparisonInt<SETNE, "ne  ", 0x47, 0x69>;
+} // isCommutable = 1
+defm LT_S : ComparisonInt<SETLT,  "lt_s", 0x48, 0x53>;
+defm LT_U : ComparisonInt<SETULT, "lt_u", 0x49, 0x54>;
+defm GT_S : ComparisonInt<SETGT,  "gt_s", 0x4a, 0x55>;
+defm GT_U : ComparisonInt<SETUGT, "gt_u", 0x4b, 0x56>;
+defm LE_S : ComparisonInt<SETLE,  "le_s", 0x4c, 0x57>;
+defm LE_U : ComparisonInt<SETULE, "le_u", 0x4d, 0x58>;
+defm GE_S : ComparisonInt<SETGE,  "ge_s", 0x4e, 0x59>;
+defm GE_U : ComparisonInt<SETUGE, "ge_u", 0x4f, 0x5a>;
+
+defm CLZ : UnaryInt<ctlz, "clz ", 0x67, 0x79>;
+defm CTZ : UnaryInt<cttz, "ctz ", 0x68, 0x7a>;
+defm POPCNT : UnaryInt<ctpop, "popcnt", 0x69, 0x7b>;
+
+def EQZ_I32 : I<(outs I32:$dst), (ins I32:$src),
+                [(set I32:$dst, (setcc I32:$src, 0, SETEQ))],
+                "i32.eqz \t$dst, $src", 0x45>;
+def EQZ_I64 : I<(outs I32:$dst), (ins I64:$src),
+                [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
+                "i64.eqz \t$dst, $src", 0x50>;
+
+} // Defs = [ARGUMENTS]
+
+// Optimize away an explicit mask on a rotate count.
+def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
+def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
+def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
+
+let Defs = [ARGUMENTS] in {
+
+def SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
+                   [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
+                   "i32.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+def SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
+                   [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
+                   "i64.select\t$dst, $lhs, $rhs, $cond", 0x1b>;
+
+} // Defs = [ARGUMENTS]
+
+// ISD::SELECT requires its operand to conform to getBooleanContents, but
+// WebAssembly's select interprets any non-zero value as true, so we can fold
+// a setne with 0 into a select.
+def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs),
+          (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs),
+          (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
+
+// And again, this time with seteq instead of setne and the arms reversed.
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
+          (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
+          (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
new file mode 100644
index 000000000000..b606ebb0a68d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -0,0 +1,686 @@
+// WebAssemblyInstrMemory.td-WebAssembly Memory codegen support -*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly Memory operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO:
+//  - HasAddr64
+//  - WebAssemblyTargetLowering having to do with atomics
+//  - Each has optional alignment.
+
+// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16
+// local types. These memory-only types instead zero- or sign-extend into local
+// types when loading, and truncate when storing.
+
+// WebAssembly constant offsets are performed as unsigned with infinite
+// precision, so we need to check for NoUnsignedWrap so that we don't fold an
+// offset for an add that needs wrapping.
+def regPlusImm : PatFrag<(ops node:$addr, node:$off),
+                         (add node:$addr, node:$off),
+                         [{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  APInt KnownZero0, KnownOne0;
+  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+// GlobalAddresses are conceptually unsigned values, so we can also fold them
+// into immediate values as long as the add is 'nuw'.
+// TODO: We'd like to also match GA offsets but there are cases where the
+// register can have a negative value. Find out what more we can do.
+def regPlusGA : PatFrag<(ops node:$addr, node:$off),
+                        (add node:$addr, node:$off),
+                        [{
+  return N->getFlags()->hasNoUnsignedWrap();
+}]>;
+
+// We don't need a regPlusES because external symbols never have constant
+// offsets folded into them, so we can just use add.
+
+let Defs = [ARGUMENTS] in {
+
+// Basic load.
+// FIXME: When we can break syntax compatibility, reorder the fields in the
+// asmstrings to match the binary encoding.
+def LOAD_I32 : I<(outs I32:$dst),
+                 (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                 [], "i32.load\t$dst, ${off}(${addr})${p2align}", 0x28>;
+def LOAD_I64 : I<(outs I64:$dst),
+                 (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                 [], "i64.load\t$dst, ${off}(${addr})${p2align}", 0x29>;
+def LOAD_F32 : I<(outs F32:$dst),
+                 (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                 [], "f32.load\t$dst, ${off}(${addr})${p2align}", 0x2a>;
+def LOAD_F64 : I<(outs F64:$dst),
+                 (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                 [], "f64.load\t$dst, ${off}(${addr})${p2align}", 0x2b>;
+
+} // Defs = [ARGUMENTS]
+
+// Select loads with no constant offset.
+def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, 0, $addr)>;
+def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, 0, $addr)>;
+def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, 0, $addr)>;
+def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, 0, $addr)>;
+
+// Select loads with a constant offset.
+def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))),
+          (LOAD_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))),
+          (LOAD_I64 0, imm:$off, $addr)>;
+def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))),
+          (LOAD_F32 0, imm:$off, $addr)>;
+def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))),
+          (LOAD_F64 0, imm:$off, $addr)>;
+def : Pat<(i32 (load (or_is_add I32:$addr, imm:$off))),
+          (LOAD_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (load (or_is_add I32:$addr, imm:$off))),
+          (LOAD_I64 0, imm:$off, $addr)>;
+def : Pat<(f32 (load (or_is_add I32:$addr, imm:$off))),
+          (LOAD_F32 0, imm:$off, $addr)>;
+def : Pat<(f64 (load (or_is_add I32:$addr, imm:$off))),
+          (LOAD_F64 0, imm:$off, $addr)>;
+def : Pat<(i32 (load (regPlusGA I32:$addr,
+                                (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (load (regPlusGA I32:$addr,
+                                (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(f32 (load (regPlusGA I32:$addr,
+                                (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD_F32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(f64 (load (regPlusGA I32:$addr,
+                                (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD_F64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD_F32 0, texternalsym:$off, $addr)>;
+def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD_F64 0, texternalsym:$off, $addr)>;
+
+// Select loads with just a constant offset.
+def : Pat<(i32 (load imm:$off)), (LOAD_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load imm:$off)), (LOAD_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load imm:$off)), (LOAD_F32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load imm:$off)), (LOAD_F64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD_F32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD_F64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))),
+          (LOAD_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))),
+          (LOAD_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))),
+          (LOAD_F32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))),
+          (LOAD_F64 0, texternalsym:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Extending load.
+def LOAD8_S_I32  : I<(outs I32:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i32.load8_s\t$dst, ${off}(${addr})${p2align}", 0x2c>;
+def LOAD8_U_I32  : I<(outs I32:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i32.load8_u\t$dst, ${off}(${addr})${p2align}", 0x2d>;
+def LOAD16_S_I32 : I<(outs I32:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i32.load16_s\t$dst, ${off}(${addr})${p2align}", 0x2e>;
+def LOAD16_U_I32 : I<(outs I32:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i32.load16_u\t$dst, ${off}(${addr})${p2align}", 0x2f>;
+def LOAD8_S_I64  : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load8_s\t$dst, ${off}(${addr})${p2align}", 0x30>;
+def LOAD8_U_I64  : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load8_u\t$dst, ${off}(${addr})${p2align}", 0x31>;
+def LOAD16_S_I64 : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load16_s\t$dst, ${off}(${addr})${p2align}", 0x32>;
+def LOAD16_U_I64 : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load16_u\t$dst, ${off}(${addr})${p2align}", 0x33>;
+def LOAD32_S_I64 : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load32_s\t$dst, ${off}(${addr})${p2align}", 0x34>;
+def LOAD32_U_I64 : I<(outs I64:$dst),
+                     (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+                     [], "i64.load32_u\t$dst, ${off}(${addr})${p2align}", 0x35>;
+
+} // Defs = [ARGUMENTS]
+
+// Select extending loads with no constant offset.
+def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, 0, $addr)>;
+def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, 0, $addr)>;
+def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, 0, $addr)>;
+def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, 0, $addr)>;
+def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, 0, $addr)>;
+
+// Select extending loads with a constant offset.
+def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD32_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_S_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (or_is_add I32:$addr, imm:$off))),
+          (LOAD32_S_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (or_is_add I32:$addr, imm:$off))),
+          (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_S_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_S_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD32_S_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr,
+                                       (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD32_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (sextloadi8 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_S_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi8 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (sextloadi16 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_S_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (zextloadi16 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi8 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi8 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi16 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi16 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (sextloadi32 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD32_S_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (zextloadi32 (add I32:$addr,
+                                 (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD32_U_I64 0, texternalsym:$off, $addr)>;
+
+// Select extending loads with just a constant offset.
+def : Pat<(i32 (sextloadi8 imm:$off)),
+          (LOAD8_S_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 imm:$off)),
+          (LOAD8_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 imm:$off)),
+          (LOAD16_S_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 imm:$off)),
+          (LOAD16_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 imm:$off)),
+          (LOAD8_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 imm:$off)),
+          (LOAD8_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 imm:$off)),
+          (LOAD16_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 imm:$off)),
+          (LOAD16_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 imm:$off)),
+          (LOAD32_S_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 imm:$off)),
+          (LOAD32_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_S_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_S_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD32_S_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD32_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_S_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_S_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD32_S_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD32_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+
+// Resolve "don't care" extending loads to zero-extending loads. This is
+// somewhat arbitrary, but zero-extending is conceptually simpler.
+
+// Select "don't care" extending loads with no constant offset.
+def : Pat<(i32 (extloadi8 I32:$addr)),  (LOAD8_U_I32 0, 0, $addr)>;
+def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, 0, $addr)>;
+def : Pat<(i64 (extloadi8 I32:$addr)),  (LOAD8_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, 0, $addr)>;
+def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, 0, $addr)>;
+
+// Select "don't care" extending loads with a constant offset.
+def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))),
+          (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_U_I32 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (or_is_add I32:$addr, imm:$off))),
+          (LOAD8_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (or_is_add I32:$addr, imm:$off))),
+          (LOAD16_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (or_is_add I32:$addr, imm:$off))),
+          (LOAD32_U_I64 0, imm:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr,
+                                     (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_U_I32 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr,
+                                     (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD8_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD16_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off)))),
+          (LOAD32_U_I64 0, tglobaladdr:$off, $addr)>;
+def : Pat<(i32 (extloadi8 (add I32:$addr,
+                               (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i32 (extloadi16 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_U_I32 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi8 (add I32:$addr,
+                               (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD8_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi16 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD16_U_I64 0, texternalsym:$off, $addr)>;
+def : Pat<(i64 (extloadi32 (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off)))),
+          (LOAD32_U_I64 0, texternalsym:$off, $addr)>;
+
+// Select "don't care" extending loads with just a constant offset.
+def : Pat<(i32 (extloadi8 imm:$off)),
+          (LOAD8_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 imm:$off)),
+          (LOAD16_U_I32 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 imm:$off)),
+          (LOAD8_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 imm:$off)),
+          (LOAD16_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 imm:$off)),
+          (LOAD32_U_I64 0, imm:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I32 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD8_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD16_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))),
+          (LOAD32_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I32 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD8_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD16_U_I64 0, texternalsym:$off, (CONST_I32 0))>;
+def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))),
+          (LOAD32_U_I64 0, tglobaladdr:$off, (CONST_I32 0))>;
+
+let Defs = [ARGUMENTS] in {
+
+// Basic store.
+// Note: WebAssembly inverts SelectionDAG's usual operand order.
+def STORE_I32  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                            I32:$val), [],
+                   "i32.store\t${off}(${addr})${p2align}, $val", 0x36>;
+def STORE_I64  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                            I64:$val), [],
+                   "i64.store\t${off}(${addr})${p2align}, $val", 0x37>;
+def STORE_F32  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                            F32:$val), [],
+                   "f32.store\t${off}(${addr})${p2align}, $val", 0x38>;
+def STORE_F64  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                            F64:$val), [],
+                   "f64.store\t${off}(${addr})${p2align}, $val", 0x39>;
+
+} // Defs = [ARGUMENTS]
+
+// Select stores with no constant offset.
+def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, 0, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, 0, I32:$addr, F64:$val)>;
+
+// Select stores with a constant offset.
+def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE_F32 0, imm:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE_F64 0, imm:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE_F32 0, tglobaladdr:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (regPlusGA I32:$addr,
+                                      (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE_F64 0, tglobaladdr:$off, I32:$addr, F64:$val)>;
+def : Pat<(store I32:$val, (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off))),
+          (STORE_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(store I64:$val, (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off))),
+          (STORE_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(store F32:$val, (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off))),
+          (STORE_F32 0, texternalsym:$off, I32:$addr, F32:$val)>;
+def : Pat<(store F64:$val, (add I32:$addr,
+                                (WebAssemblywrapper texternalsym:$off))),
+          (STORE_F64 0, texternalsym:$off, I32:$addr, F64:$val)>;
+
+// Select stores with just a constant offset.
+def : Pat<(store I32:$val, imm:$off),
+          (STORE_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, imm:$off),
+          (STORE_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, imm:$off),
+          (STORE_F32 0, imm:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, imm:$off),
+          (STORE_F64 0, imm:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE_F32 0, tglobaladdr:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE_F64 0, tglobaladdr:$off, (CONST_I32 0), F64:$val)>;
+def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE_F32 0, texternalsym:$off, (CONST_I32 0), F32:$val)>;
+def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE_F64 0, texternalsym:$off, (CONST_I32 0), F64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Truncating store.
+def STORE8_I32  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                             I32:$val), [],
+                    "i32.store8\t${off}(${addr})${p2align}, $val", 0x3a>;
+def STORE16_I32 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                             I32:$val), [],
+                    "i32.store16\t${off}(${addr})${p2align}, $val", 0x3b>;
+def STORE8_I64  : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                             I64:$val), [],
+                    "i64.store8\t${off}(${addr})${p2align}, $val", 0x3c>;
+def STORE16_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                             I64:$val), [],
+                    "i64.store16\t${off}(${addr})${p2align}, $val", 0x3d>;
+def STORE32_I64 : I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr,
+                             I64:$val), [],
+                    "i64.store32\t${off}(${addr})${p2align}, $val", 0x3e>;
+
+} // Defs = [ARGUMENTS]
+
+// Select truncating stores with no constant offset.
+def : Pat<(truncstorei8 I32:$val, I32:$addr),
+          (STORE8_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, I32:$addr),
+          (STORE16_I32 0, 0, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, I32:$addr),
+          (STORE8_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, I32:$addr),
+          (STORE16_I64 0, 0, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, I32:$addr),
+          (STORE32_I64 0, 0, I32:$addr, I64:$val)>;
+
+// Select truncating stores with a constant offset.
+def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)),
+          (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE8_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE16_I32 0, imm:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE8_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE16_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (or_is_add I32:$addr, imm:$off)),
+          (STORE32_I64 0, imm:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val,
+                        (regPlusGA I32:$addr,
+                                   (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE8_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val,
+                         (regPlusGA I32:$addr,
+                                    (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE16_I32 0, tglobaladdr:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val,
+                        (regPlusGA I32:$addr,
+                                   (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE8_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val,
+                         (regPlusGA I32:$addr,
+                                    (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE16_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val,
+                         (regPlusGA I32:$addr,
+                                    (WebAssemblywrapper tglobaladdr:$off))),
+          (STORE32_I64 0, tglobaladdr:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (add I32:$addr,
+                                       (WebAssemblywrapper texternalsym:$off))),
+          (STORE8_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei16 I32:$val,
+                         (add I32:$addr,
+                              (WebAssemblywrapper texternalsym:$off))),
+          (STORE16_I32 0, texternalsym:$off, I32:$addr, I32:$val)>;
+def : Pat<(truncstorei8 I64:$val,
+                        (add I32:$addr,
+                             (WebAssemblywrapper texternalsym:$off))),
+          (STORE8_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei16 I64:$val,
+                         (add I32:$addr,
+                              (WebAssemblywrapper texternalsym:$off))),
+          (STORE16_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+def : Pat<(truncstorei32 I64:$val,
+                         (add I32:$addr,
+                              (WebAssemblywrapper texternalsym:$off))),
+          (STORE32_I64 0, texternalsym:$off, I32:$addr, I64:$val)>;
+
+// Select truncating stores with just a constant offset.
+def : Pat<(truncstorei8 I32:$val, imm:$off),
+          (STORE8_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, imm:$off),
+          (STORE16_I32 0, imm:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, imm:$off),
+          (STORE8_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, imm:$off),
+          (STORE16_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, imm:$off),
+          (STORE32_I64 0, imm:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE8_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE16_I32 0, tglobaladdr:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE8_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE16_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)),
+          (STORE32_I64 0, tglobaladdr:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE8_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE16_I32 0, texternalsym:$off, (CONST_I32 0), I32:$val)>;
+def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE8_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE16_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)),
+          (STORE32_I64 0, texternalsym:$off, (CONST_I32 0), I64:$val)>;
+
+let Defs = [ARGUMENTS] in {
+
+// Current memory size.
+def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+                           [],
+                           "current_memory\t$dst", 0x3f>,
+                         Requires<[HasAddr32]>;
+
+// Grow memory.
+def GROW_MEMORY_I32 : I<(outs), (ins i32imm:$flags, I32:$delta),
+                        [],
+                        "grow_memory\t$delta", 0x40>,
+                      Requires<[HasAddr32]>;
+
+} // Defs = [ARGUMENTS]
+
+def : Pat<(int_wasm_current_memory),
+          (CURRENT_MEMORY_I32 0)>;
+def : Pat<(int_wasm_grow_memory I32:$delta),
+          (GROW_MEMORY_I32 0, $delta)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
new file mode 100644
index 000000000000..e403534d580a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -0,0 +1,19 @@
+// WebAssemblyInstrSIMD.td - WebAssembly SIMD codegen support -*- tablegen -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief WebAssembly SIMD operand code-gen constructs.
+///
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1 in {
+defm ADD : SIMDBinary<add, fadd, "add ">;
+defm MUL: SIMDBinary<mul, fmul, "mul ">;
+} // isCommutable = 1
+defm SUB: SIMDBinary<sub, fsub, "sub ">;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
new file mode 100644
index 000000000000..7ea5d05a1b21
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -0,0 +1,128 @@
+//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file lowers br_unless into br_if with an inverted condition.
+///
+/// br_unless is not currently in the spec, but it's very convenient for LLVM
+/// to use. This pass allows LLVM to use it, for now.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-br_unless"
+
+namespace {
+class WebAssemblyLowerBrUnless final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Lower br_unless";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyLowerBrUnless::ID = 0;
+FunctionPass *llvm::createWebAssemblyLowerBrUnless() {
+  return new WebAssemblyLowerBrUnless();
+}
+
+bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Lowering br_unless **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+
+  for (auto &MBB : MF) {
+    for (auto MII = MBB.begin(); MII != MBB.end();) {
+      MachineInstr *MI = &*MII++;
+      if (MI->getOpcode() != WebAssembly::BR_UNLESS)
+        continue;
+
+      unsigned Cond = MI->getOperand(1).getReg();
+      bool Inverted = false;
+
+      // Attempt to invert the condition in place.
+      if (MFI.isVRegStackified(Cond)) {
+        assert(MRI.hasOneDef(Cond));
+        MachineInstr *Def = MRI.getVRegDef(Cond);
+        switch (Def->getOpcode()) {
+          using namespace WebAssembly;
+        case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break;
+        case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break;
+        case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break;
+        case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break;
+        case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break;
+        case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break;
+        case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break;
+        case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break;
+        case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break;
+        case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break;
+        case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break;
+        case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break;
+        case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break;
+        case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break;
+        case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break;
+        case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break;
+        case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break;
+        case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break;
+        case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break;
+        case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break;
+        case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break;
+        case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break;
+        case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break;
+        case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break;
+        default: break;
+        }
+      }
+
+      // If we weren't able to invert the condition in place. Insert an
+      // instruction to invert it.
+      if (!Inverted) {
+        unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
+        BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
+            .addReg(Cond);
+        MFI.stackifyVReg(Tmp);
+        Cond = Tmp;
+        Inverted = true;
+      }
+
+      // The br_unless condition has now been inverted. Insert a br_if and
+      // delete the br_unless.
+      assert(Inverted);
+      BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
+          .addOperand(MI->getOperand(0))
+          .addReg(Cond);
+      MBB.erase(MI);
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
new file mode 100644
index 000000000000..72cb1ccbe668
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -0,0 +1,1184 @@
+//=== WebAssemblyLowerEmscriptenEHSjLj.cpp - Lower exceptions for Emscripten =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file lowers exception-related instructions and setjmp/longjmp
+/// function calls in order to use Emscripten's JavaScript try and catch
+/// mechanism.
+///
+/// To handle exceptions and setjmp/longjmps, this scheme relies on JavaScript's
+/// try and catch syntax and relevant exception-related libraries implemented
+/// in JavaScript glue code that will be produced by Emscripten. This is similar
+/// to the current Emscripten asm.js exception handling in fastcomp. For
+/// fastcomp's EH / SjLj scheme, see these files in fastcomp LLVM branch:
+/// (Location: https://github.com/kripken/emscripten-fastcomp)
+/// lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp
+/// lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp
+/// lib/Target/JSBackend/JSBackend.cpp
+/// lib/Target/JSBackend/CallHandlers.h
+///
+/// * Exception handling
+/// This pass lowers invokes and landingpads into library functions in JS glue
+/// code. Invokes are lowered into function wrappers called invoke wrappers that
+/// exist in JS side, which wraps the original function call with JS try-catch.
+/// If an exception occurred, cxa_throw() function in JS side sets some
+/// variables (see below) so we can check whether an exception occurred from
+/// wasm code and handle it appropriately.
+///
+/// * Setjmp-longjmp handling
+/// This pass lowers setjmp to a reasonably-performant approach for emscripten.
+/// The idea is that each block with a setjmp is broken up into two parts: the
+/// part containing setjmp and the part right after the setjmp. The latter part
+/// is either reached from the setjmp, or later from a longjmp. To handle the
+/// longjmp, all calls that might longjmp are also called using invoke wrappers
+/// and thus JS / try-catch. JS longjmp() function also sets some variables so
+/// we can check / whether a longjmp occurred from wasm code. Each block with a
+/// function call that might longjmp is also split up after the longjmp call.
+/// After the longjmp call, we check whether a longjmp occurred, and if it did,
+/// which setjmp it corresponds to, and jump to the right post-setjmp block.
+/// We assume setjmp-longjmp handling always run after EH handling, which means
+/// we don't expect any exception-related instructions when SjLj runs.
+/// FIXME Currently this scheme does not support indirect call of setjmp,
+/// because of the limitation of the scheme itself. fastcomp does not support it
+/// either.
+///
+/// In detail, this pass does following things:
+///
+/// 1) Create three global variables: __THREW__, __threwValue, and __tempRet0.
+///    __tempRet0 will be set within __cxa_find_matching_catch() function in
+///    JS library, and __THREW__ and __threwValue will be set in invoke wrappers
+///    in JS glue code. For what invoke wrappers are, refer to 3). These
+///    variables are used for both exceptions and setjmp/longjmps.
+///    __THREW__ indicates whether an exception or a longjmp occurred or not. 0
+///    means nothing occurred, 1 means an exception occurred, and other numbers
+///    mean a longjmp occurred. In the case of longjmp, __threwValue variable
+///    indicates the corresponding setjmp buffer the longjmp corresponds to.
+///    In exception handling, __tempRet0 indicates the type of an exception
+///    caught, and in setjmp/longjmp, it means the second argument to longjmp
+///    function.
+///
+/// * Exception handling
+///
+/// 2) Create setThrew and setTempRet0 functions.
+///    The global variables created in 1) will exist in wasm address space,
+///    but their values should be set in JS code, so we provide these functions
+///    as interfaces to JS glue code. These functions are equivalent to the
+///    following JS functions, which actually exist in asm.js version of JS
+///    library.
+///
+///    function setThrew(threw, value) {
+///      if (__THREW__ == 0) {
+///        __THREW__ = threw;
+///        __threwValue = value;
+///      }
+///    }
+///
+///    function setTempRet0(value) {
+///      __tempRet0 = value;
+///    }
+///
+/// 3) Lower
+///      invoke @func(arg1, arg2) to label %invoke.cont unwind label %lpad
+///    into
+///      __THREW__ = 0;
+///      call @__invoke_SIG(func, arg1, arg2)
+///      %__THREW__.val = __THREW__;
+///      __THREW__ = 0;
+///      if (%__THREW__.val == 1)
+///        goto %lpad
+///      else
+///         goto %invoke.cont
+///    SIG is a mangled string generated based on the LLVM IR-level function
+///    signature. After LLVM IR types are lowered to the target wasm types,
+///    the names for these wrappers will change based on wasm types as well,
+///    as in invoke_vi (function takes an int and returns void). The bodies of
+///    these wrappers will be generated in JS glue code, and inside those
+///    wrappers we use JS try-catch to generate actual exception effects. It
+///    also calls the original callee function. An example wrapper in JS code
+///    would look like this:
+///      function invoke_vi(index,a1) {
+///        try {
+///          Module["dynCall_vi"](index,a1); // This calls original callee
+///        } catch(e) {
+///          if (typeof e !== 'number' && e !== 'longjmp') throw e;
+///          asm["setThrew"](1, 0); // setThrew is called here
+///        }
+///      }
+///    If an exception is thrown, __THREW__ will be set to true in a wrapper,
+///    so we can jump to the right BB based on this value.
+///
+/// 4) Lower
+///      %val = landingpad catch c1 catch c2 catch c3 ...
+///      ... use %val ...
+///    into
+///      %fmc = call @__cxa_find_matching_catch_N(c1, c2, c3, ...)
+///      %val = {%fmc, __tempRet0}
+///      ... use %val ...
+///    Here N is a number calculated based on the number of clauses.
+///    Global variable __tempRet0 is set within __cxa_find_matching_catch() in
+///    JS glue code.
+///
+/// 5) Lower
+///      resume {%a, %b}
+///    into
+///      call @__resumeException(%a)
+///    where __resumeException() is a function in JS glue code.
+///
+/// 6) Lower
+///      call @llvm.eh.typeid.for(type) (intrinsic)
+///    into
+///      call @llvm_eh_typeid_for(type)
+///    llvm_eh_typeid_for function will be generated in JS glue code.
+///
+/// * Setjmp / Longjmp handling
+///
+/// 7) In the function entry that calls setjmp, initialize setjmpTable and
+///    sejmpTableSize as follows:
+///      setjmpTableSize = 4;
+///      setjmpTable = (int *) malloc(40);
+///      setjmpTable[0] = 0;
+///    setjmpTable and setjmpTableSize are used in saveSetjmp() function in JS
+///    code.
+///
+/// 8) Lower
+///      setjmp(buf)
+///    into
+///      setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
+///      setjmpTableSize = __tempRet0;
+///    For each dynamic setjmp call, setjmpTable stores its ID (a number which
+///    is incrementally assigned from 0) and its label (a unique number that
+///    represents each callsite of setjmp). When we need more entries in
+///    setjmpTable, it is reallocated in saveSetjmp() in JS code and it will
+///    return the new table address, and assign the new table size in
+///    __tempRet0. saveSetjmp also stores the setjmp's ID into the buffer buf.
+///    A BB with setjmp is split into two after setjmp call in order to make the
+///    post-setjmp BB the possible destination of longjmp BB.
+///
+/// 9) Lower
+///      longjmp(buf, value)
+///    into
+///      emscripten_longjmp_jmpbuf(buf, value)
+///    emscripten_longjmp_jmpbuf will be lowered to emscripten_longjmp later.
+///
+/// 10) Lower every call that might longjmp into
+///      __THREW__ = 0;
+///      call @__invoke_SIG(func, arg1, arg2)
+///      %__THREW__.val = __THREW__;
+///      __THREW__ = 0;
+///      if (%__THREW__.val != 0 & __threwValue != 0) {
+///        %label = testSetjmp(mem[%__THREW__.val], setjmpTable,
+///                            setjmpTableSize);
+///        if (%label == 0)
+///          emscripten_longjmp(%__THREW__.val, __threwValue);
+///        __tempRet0 = __threwValue;
+///      } else {
+///        %label = -1;
+///      }
+///      longjmp_result = __tempRet0;
+///      switch label {
+///        label 1: goto post-setjmp BB 1
+///        label 2: goto post-setjmp BB 2
+///        ...
+///        default: goto splitted next BB
+///      }
+///     testSetjmp examines setjmpTable to see if there is a matching setjmp
+///     call. After calling an invoke wrapper, if a longjmp occurred, __THREW__
+///     will be the address of matching jmp_buf buffer and __threwValue be the
+///     second argument to longjmp. mem[__THREW__.val] is a setjmp ID that is
+///     stored in saveSetjmp. testSetjmp returns a setjmp label, a unique ID to
+///     each setjmp callsite. Label 0 means this longjmp buffer does not
+///     correspond to one of the setjmp callsites in this function, so in this
+///     case we just chain the longjmp to the caller. (Here we call
+///     emscripten_longjmp, which is different from emscripten_longjmp_jmpbuf.
+///     emscripten_longjmp_jmpbuf takes jmp_buf as its first argument, while
+///     emscripten_longjmp takes an int. Both of them will eventually be lowered
+///     to emscripten_longjmp in s2wasm, but here we need two signatures - we
+///     can't translate an int value to a jmp_buf.)
+///     Label -1 means no longjmp occurred. Otherwise we jump to the right
+///     post-setjmp BB based on the label.
+///
+///===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-lower-em-ehsjlj"
+
+static cl::list<std::string>
+    EHWhitelist("emscripten-cxx-exceptions-whitelist",
+                cl::desc("The list of function names in which Emscripten-style "
+                         "exception handling is enabled (see emscripten "
+                         "EMSCRIPTEN_CATCHING_WHITELIST options)"),
+                cl::CommaSeparated);
+
+namespace {
+class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
+  static const char *ThrewGVName;
+  static const char *ThrewValueGVName;
+  static const char *TempRet0GVName;
+  static const char *ResumeFName;
+  static const char *EHTypeIDFName;
+  static const char *SetThrewFName;
+  static const char *SetTempRet0FName;
+  static const char *EmLongjmpFName;
+  static const char *EmLongjmpJmpbufFName;
+  static const char *SaveSetjmpFName;
+  static const char *TestSetjmpFName;
+  static const char *FindMatchingCatchPrefix;
+  static const char *InvokePrefix;
+
+  bool EnableEH;   // Enable exception handling
+  bool EnableSjLj; // Enable setjmp/longjmp handling
+
+  GlobalVariable *ThrewGV;
+  GlobalVariable *ThrewValueGV;
+  GlobalVariable *TempRet0GV;
+  Function *ResumeF;
+  Function *EHTypeIDF;
+  Function *EmLongjmpF;
+  Function *EmLongjmpJmpbufF;
+  Function *SaveSetjmpF;
+  Function *TestSetjmpF;
+
+  // __cxa_find_matching_catch_N functions.
+  // Indexed by the number of clauses in an original landingpad instruction.
+  DenseMap<int, Function *> FindMatchingCatches;
+  // Map of <function signature string, invoke_ wrappers>
+  StringMap<Function *> InvokeWrappers;
+  // Set of whitelisted function names for exception handling
+  std::set<std::string> EHWhitelistSet;
+
+  StringRef getPassName() const override {
+    return "WebAssembly Lower Emscripten Exceptions";
+  }
+
+  bool runEHOnFunction(Function &F);
+  bool runSjLjOnFunction(Function &F);
+  Function *getFindMatchingCatch(Module &M, unsigned NumClauses);
+
+  template <typename CallOrInvoke> Value *wrapInvoke(CallOrInvoke *CI);
+  void wrapTestSetjmp(BasicBlock *BB, Instruction *InsertPt, Value *Threw,
+                      Value *SetjmpTable, Value *SetjmpTableSize, Value *&Label,
+                      Value *&LongjmpResult, BasicBlock *&EndBB);
+  template <typename CallOrInvoke> Function *getInvokeWrapper(CallOrInvoke *CI);
+
+  bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); }
+  bool canLongjmp(Module &M, const Value *Callee) const;
+
+  void createSetThrewFunction(Module &M);
+  void createSetTempRet0Function(Module &M);
+
+  void rebuildSSA(Function &F);
+
+public:
+  static char ID;
+
+  WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
+      : ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj),
+        ThrewGV(nullptr), ThrewValueGV(nullptr), TempRet0GV(nullptr),
+        ResumeF(nullptr), EHTypeIDF(nullptr), EmLongjmpF(nullptr),
+        EmLongjmpJmpbufF(nullptr), SaveSetjmpF(nullptr), TestSetjmpF(nullptr) {
+    EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
+  }
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+};
+} // End anonymous namespace
+
+const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewGVName = "__THREW__";
+const char *WebAssemblyLowerEmscriptenEHSjLj::ThrewValueGVName = "__threwValue";
+const char *WebAssemblyLowerEmscriptenEHSjLj::TempRet0GVName = "__tempRet0";
+const char *WebAssemblyLowerEmscriptenEHSjLj::ResumeFName = "__resumeException";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EHTypeIDFName =
+    "llvm_eh_typeid_for";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SetThrewFName = "setThrew";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SetTempRet0FName = "setTempRet0";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpFName =
+    "emscripten_longjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::EmLongjmpJmpbufFName =
+    "emscripten_longjmp_jmpbuf";
+const char *WebAssemblyLowerEmscriptenEHSjLj::SaveSetjmpFName = "saveSetjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::TestSetjmpFName = "testSetjmp";
+const char *WebAssemblyLowerEmscriptenEHSjLj::FindMatchingCatchPrefix =
+    "__cxa_find_matching_catch_";
+const char *WebAssemblyLowerEmscriptenEHSjLj::InvokePrefix = "__invoke_";
+
+char WebAssemblyLowerEmscriptenEHSjLj::ID = 0;
+INITIALIZE_PASS(WebAssemblyLowerEmscriptenEHSjLj, DEBUG_TYPE,
+                "WebAssembly Lower Emscripten Exceptions / Setjmp / Longjmp",
+                false, false)
+
+ModulePass *llvm::createWebAssemblyLowerEmscriptenEHSjLj(bool EnableEH,
+                                                         bool EnableSjLj) {
+  return new WebAssemblyLowerEmscriptenEHSjLj(EnableEH, EnableSjLj);
+}
+
+static bool canThrow(const Value *V) {
+  if (const auto *F = dyn_cast<const Function>(V)) {
+    // Intrinsics cannot throw
+    if (F->isIntrinsic())
+      return false;
+    StringRef Name = F->getName();
+    // leave setjmp and longjmp (mostly) alone, we process them properly later
+    if (Name == "setjmp" || Name == "longjmp")
+      return false;
+    return !F->doesNotThrow();
+  }
+  // not a function, so an indirect call - can throw, we can't tell
+  return true;
+}
+
+// Returns an available name for a global value.
+// If the proposed name already exists in the module, adds '_' at the end of
+// the name until the name is available.
+static inline std::string createGlobalValueName(const Module &M,
+                                                const std::string &Propose) {
+  std::string Name = Propose;
+  while (M.getNamedGlobal(Name))
+    Name += "_";
+  return Name;
+}
+
+// Simple function name mangler.
+// This function simply takes LLVM's string representation of parameter types
+// and concatenate them with '_'. There are non-alphanumeric characters but llc
+// is ok with it, and we need to postprocess these names after the lowering
+// phase anyway.
+static std::string getSignature(FunctionType *FTy) {
+  std::string Sig;
+  raw_string_ostream OS(Sig);
+  OS << *FTy->getReturnType();
+  for (Type *ParamTy : FTy->params())
+    OS << "_" << *ParamTy;
+  if (FTy->isVarArg())
+    OS << "_...";
+  Sig = OS.str();
+  Sig.erase(remove_if(Sig, isspace), Sig.end());
+  // When s2wasm parses .s file, a comma means the end of an argument. So a
+  // mangled function name can contain any character but a comma.
+  std::replace(Sig.begin(), Sig.end(), ',', '.');
+  return Sig;
+}
+
+// Returns __cxa_find_matching_catch_N function, where N = NumClauses + 2.
+// This is because a landingpad instruction contains two more arguments, a
+// personality function and a cleanup bit, and __cxa_find_matching_catch_N
+// functions are named after the number of arguments in the original landingpad
+// instruction.
+Function *
+WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M,
+                                                       unsigned NumClauses) {
+  if (FindMatchingCatches.count(NumClauses))
+    return FindMatchingCatches[NumClauses];
+  PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+  SmallVector<Type *, 16> Args(NumClauses, Int8PtrTy);
+  FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false);
+  Function *F =
+      Function::Create(FTy, GlobalValue::ExternalLinkage,
+                       FindMatchingCatchPrefix + Twine(NumClauses + 2), &M);
+  FindMatchingCatches[NumClauses] = F;
+  return F;
+}
+
+// Generate invoke wrapper seqence with preamble and postamble
+// Preamble:
+// __THREW__ = 0;
+// Postamble:
+// %__THREW__.val = __THREW__; __THREW__ = 0;
+// Returns %__THREW__.val, which indicates whether an exception is thrown (or
+// whether longjmp occurred), for future use.
+template <typename CallOrInvoke>
+Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
+  LLVMContext &C = CI->getModule()->getContext();
+
+  // If we are calling a function that is noreturn, we must remove that
+  // attribute. The code we insert here does expect it to return, after we
+  // catch the exception.
+  if (CI->doesNotReturn()) {
+    if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
+      F->removeFnAttr(Attribute::NoReturn);
+    CI->removeAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+  }
+
+  IRBuilder<> IRB(C);
+  IRB.SetInsertPoint(CI);
+
+  // Pre-invoke
+  // __THREW__ = 0;
+  IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+
+  // Invoke function wrapper in JavaScript
+  SmallVector<Value *, 16> Args;
+  // Put the pointer to the callee as first argument, so it can be called
+  // within the invoke wrapper later
+  Args.push_back(CI->getCalledValue());
+  Args.append(CI->arg_begin(), CI->arg_end());
+  CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args);
+  NewCall->takeName(CI);
+  NewCall->setCallingConv(CI->getCallingConv());
+  NewCall->setDebugLoc(CI->getDebugLoc());
+
+  // Because we added the pointer to the callee as first argument, all
+  // argument attribute indices have to be incremented by one.
+  SmallVector<AttributeSet, 8> AttributesVec;
+  const AttributeSet &InvokePAL = CI->getAttributes();
+  CallSite::arg_iterator AI = CI->arg_begin();
+  unsigned i = 1; // Argument attribute index starts from 1
+  for (unsigned e = CI->getNumArgOperands(); i <= e; ++AI, ++i) {
+    if (InvokePAL.hasAttributes(i)) {
+      AttrBuilder B(InvokePAL, i);
+      AttributesVec.push_back(AttributeSet::get(C, i + 1, B));
+    }
+  }
+  // Add any return attributes.
+  if (InvokePAL.hasAttributes(AttributeSet::ReturnIndex))
+    AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getRetAttributes()));
+  // Add any function attributes.
+  if (InvokePAL.hasAttributes(AttributeSet::FunctionIndex))
+    AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getFnAttributes()));
+  // Reconstruct the AttributesList based on the vector we constructed.
+  AttributeSet NewCallPAL = AttributeSet::get(C, AttributesVec);
+  NewCall->setAttributes(NewCallPAL);
+
+  CI->replaceAllUsesWith(NewCall);
+
+  // Post-invoke
+  // %__THREW__.val = __THREW__; __THREW__ = 0;
+  Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+  IRB.CreateStore(IRB.getInt32(0), ThrewGV);
+  return Threw;
+}
+
+// Get matching invoke wrapper based on callee signature
+template <typename CallOrInvoke>
+Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) {
+  Module *M = CI->getModule();
+  SmallVector<Type *, 16> ArgTys;
+  Value *Callee = CI->getCalledValue();
+  FunctionType *CalleeFTy;
+  if (auto *F = dyn_cast<Function>(Callee))
+    CalleeFTy = F->getFunctionType();
+  else {
+    auto *CalleeTy = cast<PointerType>(Callee->getType())->getElementType();
+    CalleeFTy = dyn_cast<FunctionType>(CalleeTy);
+  }
+
+  std::string Sig = getSignature(CalleeFTy);
+  if (InvokeWrappers.find(Sig) != InvokeWrappers.end())
+    return InvokeWrappers[Sig];
+
+  // Put the pointer to the callee as first argument
+  ArgTys.push_back(PointerType::getUnqual(CalleeFTy));
+  // Add argument types
+  ArgTys.append(CalleeFTy->param_begin(), CalleeFTy->param_end());
+
+  FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys,
+                                        CalleeFTy->isVarArg());
+  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                 InvokePrefix + Sig, M);
+  InvokeWrappers[Sig] = F;
+  return F;
+}
+
+bool WebAssemblyLowerEmscriptenEHSjLj::canLongjmp(Module &M,
+                                                  const Value *Callee) const {
+  if (auto *CalleeF = dyn_cast<Function>(Callee))
+    if (CalleeF->isIntrinsic())
+      return false;
+
+  // The reason we include malloc/free here is to exclude the malloc/free
+  // calls generated in setjmp prep / cleanup routines.
+  Function *SetjmpF = M.getFunction("setjmp");
+  Function *MallocF = M.getFunction("malloc");
+  Function *FreeF = M.getFunction("free");
+  if (Callee == SetjmpF || Callee == MallocF || Callee == FreeF)
+    return false;
+
+  // There are functions in JS glue code
+  if (Callee == ResumeF || Callee == EHTypeIDF || Callee == SaveSetjmpF ||
+      Callee == TestSetjmpF)
+    return false;
+
+  // __cxa_find_matching_catch_N functions cannot longjmp
+  if (Callee->getName().startswith(FindMatchingCatchPrefix))
+    return false;
+
+  // Exception-catching related functions
+  Function *BeginCatchF = M.getFunction("__cxa_begin_catch");
+  Function *EndCatchF = M.getFunction("__cxa_end_catch");
+  Function *AllocExceptionF = M.getFunction("__cxa_allocate_exception");
+  Function *ThrowF = M.getFunction("__cxa_throw");
+  Function *TerminateF = M.getFunction("__clang_call_terminate");
+  if (Callee == BeginCatchF || Callee == EndCatchF ||
+      Callee == AllocExceptionF || Callee == ThrowF || Callee == TerminateF)
+    return false;
+
+  // Otherwise we don't know
+  return true;
+}
+
+// Generate testSetjmp function call seqence with preamble and postamble.
+// The code this generates is equivalent to the following JavaScript code:
+// if (%__THREW__.val != 0 & threwValue != 0) {
+//   %label = _testSetjmp(mem[%__THREW__.val], setjmpTable, setjmpTableSize);
+//   if (%label == 0)
+//     emscripten_longjmp(%__THREW__.val, threwValue);
+//   __tempRet0 = threwValue;
+// } else {
+//   %label = -1;
+// }
+// %longjmp_result = __tempRet0;
+//
+// As output parameters. returns %label, %longjmp_result, and the BB the last
+// instruction (%longjmp_result = ...) is in.
+void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
+    BasicBlock *BB, Instruction *InsertPt, Value *Threw, Value *SetjmpTable,
+    Value *SetjmpTableSize, Value *&Label, Value *&LongjmpResult,
+    BasicBlock *&EndBB) {
+  Function *F = BB->getParent();
+  LLVMContext &C = BB->getModule()->getContext();
+  IRBuilder<> IRB(C);
+  IRB.SetInsertPoint(InsertPt);
+
+  // if (%__THREW__.val != 0 & threwValue != 0)
+  IRB.SetInsertPoint(BB);
+  BasicBlock *ThenBB1 = BasicBlock::Create(C, "if.then1", F);
+  BasicBlock *ElseBB1 = BasicBlock::Create(C, "if.else1", F);
+  BasicBlock *EndBB1 = BasicBlock::Create(C, "if.end", F);
+  Value *ThrewCmp = IRB.CreateICmpNE(Threw, IRB.getInt32(0));
+  Value *ThrewValue =
+      IRB.CreateLoad(ThrewValueGV, ThrewValueGV->getName() + ".val");
+  Value *ThrewValueCmp = IRB.CreateICmpNE(ThrewValue, IRB.getInt32(0));
+  Value *Cmp1 = IRB.CreateAnd(ThrewCmp, ThrewValueCmp, "cmp1");
+  IRB.CreateCondBr(Cmp1, ThenBB1, ElseBB1);
+
+  // %label = _testSetjmp(mem[%__THREW__.val], _setjmpTable, _setjmpTableSize);
+  // if (%label == 0)
+  IRB.SetInsertPoint(ThenBB1);
+  BasicBlock *ThenBB2 = BasicBlock::Create(C, "if.then2", F);
+  BasicBlock *EndBB2 = BasicBlock::Create(C, "if.end2", F);
+  Value *ThrewInt = IRB.CreateIntToPtr(Threw, Type::getInt32PtrTy(C),
+                                       Threw->getName() + ".i32p");
+  Value *LoadedThrew =
+      IRB.CreateLoad(ThrewInt, ThrewInt->getName() + ".loaded");
+  Value *ThenLabel = IRB.CreateCall(
+      TestSetjmpF, {LoadedThrew, SetjmpTable, SetjmpTableSize}, "label");
+  Value *Cmp2 = IRB.CreateICmpEQ(ThenLabel, IRB.getInt32(0));
+  IRB.CreateCondBr(Cmp2, ThenBB2, EndBB2);
+
+  // emscripten_longjmp(%__THREW__.val, threwValue);
+  IRB.SetInsertPoint(ThenBB2);
+  IRB.CreateCall(EmLongjmpF, {Threw, ThrewValue});
+  IRB.CreateUnreachable();
+
+  // __tempRet0 = threwValue;
+  IRB.SetInsertPoint(EndBB2);
+  IRB.CreateStore(ThrewValue, TempRet0GV);
+  IRB.CreateBr(EndBB1);
+
+  IRB.SetInsertPoint(ElseBB1);
+  IRB.CreateBr(EndBB1);
+
+  // longjmp_result = __tempRet0;
+  IRB.SetInsertPoint(EndBB1);
+  PHINode *LabelPHI = IRB.CreatePHI(IRB.getInt32Ty(), 2, "label");
+  LabelPHI->addIncoming(ThenLabel, EndBB2);
+
+  LabelPHI->addIncoming(IRB.getInt32(-1), ElseBB1);
+
+  // Output parameter assignment
+  Label = LabelPHI;
+  EndBB = EndBB1;
+  LongjmpResult = IRB.CreateLoad(TempRet0GV, "longjmp_result");
+}
+
+// Create setThrew function
+// function setThrew(threw, value) {
+//   if (__THREW__ == 0) {
+//     __THREW__ = threw;
+//     __threwValue = value;
+//   }
+// }
+void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  assert(!M.getNamedGlobal(SetThrewFName) && "setThrew already exists");
+  Type *Params[] = {IRB.getInt32Ty(), IRB.getInt32Ty()};
+  FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
+  Function *F =
+      Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
+  Argument *Arg1 = &*(F->arg_begin());
+  Argument *Arg2 = &*(++F->arg_begin());
+  Arg1->setName("threw");
+  Arg2->setName("value");
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  BasicBlock *ThenBB = BasicBlock::Create(C, "if.then", F);
+  BasicBlock *EndBB = BasicBlock::Create(C, "if.end", F);
+
+  IRB.SetInsertPoint(EntryBB);
+  Value *Threw = IRB.CreateLoad(ThrewGV, ThrewGV->getName() + ".val");
+  Value *Cmp = IRB.CreateICmpEQ(Threw, IRB.getInt32(0), "cmp");
+  IRB.CreateCondBr(Cmp, ThenBB, EndBB);
+
+  IRB.SetInsertPoint(ThenBB);
+  IRB.CreateStore(Arg1, ThrewGV);
+  IRB.CreateStore(Arg2, ThrewValueGV);
+  IRB.CreateBr(EndBB);
+
+  IRB.SetInsertPoint(EndBB);
+  IRB.CreateRetVoid();
+}
+
+// Create setTempRet0 function
+// function setTempRet0(value) {
+//   __tempRet0 = value;
+// }
+void WebAssemblyLowerEmscriptenEHSjLj::createSetTempRet0Function(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  assert(!M.getNamedGlobal(SetTempRet0FName) && "setTempRet0 already exists");
+  Type *Params[] = {IRB.getInt32Ty()};
+  FunctionType *FTy = FunctionType::get(IRB.getVoidTy(), Params, false);
+  Function *F =
+      Function::Create(FTy, GlobalValue::ExternalLinkage, SetTempRet0FName, &M);
+  F->arg_begin()->setName("value");
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRB.SetInsertPoint(EntryBB);
+  IRB.CreateStore(&*F->arg_begin(), TempRet0GV);
+  IRB.CreateRetVoid();
+}
+
+void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+  DT.recalculate(F); // CFG has been changed
+  SSAUpdater SSA;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      for (auto UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
+        Use &U = *UI;
+        ++UI;
+        SSA.Initialize(I.getType(), I.getName());
+        SSA.AddAvailableValue(&BB, &I);
+        Instruction *User = cast<Instruction>(U.getUser());
+        if (User->getParent() == &BB)
+          continue;
+
+        if (PHINode *UserPN = dyn_cast<PHINode>(User))
+          if (UserPN->getIncomingBlock(U) == &BB)
+            continue;
+
+        if (DT.dominates(&I, User))
+          continue;
+        SSA.RewriteUseAfterInsertions(U);
+      }
+    }
+  }
+}
+
+bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
+  LLVMContext &C = M.getContext();
+  IRBuilder<> IRB(C);
+
+  Function *SetjmpF = M.getFunction("setjmp");
+  Function *LongjmpF = M.getFunction("longjmp");
+  bool SetjmpUsed = SetjmpF && !SetjmpF->use_empty();
+  bool LongjmpUsed = LongjmpF && !LongjmpF->use_empty();
+  bool DoSjLj = EnableSjLj && (SetjmpUsed || LongjmpUsed);
+
+  // Create global variables __THREW__, threwValue, and __tempRet0, which are
+  // used in common for both exception handling and setjmp/longjmp handling
+  ThrewGV = new GlobalVariable(M, IRB.getInt32Ty(), false,
+                               GlobalValue::ExternalLinkage, IRB.getInt32(0),
+                               createGlobalValueName(M, ThrewGVName));
+  ThrewValueGV = new GlobalVariable(
+      M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage, IRB.getInt32(0),
+      createGlobalValueName(M, ThrewValueGVName));
+  TempRet0GV = new GlobalVariable(M, IRB.getInt32Ty(), false,
+                                  GlobalValue::ExternalLinkage, IRB.getInt32(0),
+                                  createGlobalValueName(M, TempRet0GVName));
+
+  bool Changed = false;
+
+  // Exception handling
+  if (EnableEH) {
+    // Register __resumeException function
+    FunctionType *ResumeFTy =
+        FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false);
+    ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage,
+                               ResumeFName, &M);
+
+    // Register llvm_eh_typeid_for function
+    FunctionType *EHTypeIDTy =
+        FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false);
+    EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage,
+                                 EHTypeIDFName, &M);
+
+    for (Function &F : M) {
+      if (F.isDeclaration())
+        continue;
+      Changed |= runEHOnFunction(F);
+    }
+  }
+
+  // Setjmp/longjmp handling
+  if (DoSjLj) {
+    Changed = true; // We have setjmp or longjmp somewhere
+
+    Function *MallocF = M.getFunction("malloc");
+    Function *FreeF = M.getFunction("free");
+    if (!MallocF || !FreeF)
+      report_fatal_error(
+          "malloc and free must be linked into the module if setjmp is used");
+
+    // Register saveSetjmp function
+    FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
+    SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
+                                     IRB.getInt32Ty(), Type::getInt32PtrTy(C),
+                                     IRB.getInt32Ty()};
+    FunctionType *FTy =
+        FunctionType::get(Type::getInt32PtrTy(C), Params, false);
+    SaveSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                   SaveSetjmpFName, &M);
+
+    // Register testSetjmp function
+    Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()};
+    FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
+    TestSetjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                   TestSetjmpFName, &M);
+
+    if (LongjmpF) {
+      // Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is
+      // defined in JS code
+      EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(),
+                                          GlobalValue::ExternalLinkage,
+                                          EmLongjmpJmpbufFName, &M);
+
+      LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
+    }
+    FTy = FunctionType::get(IRB.getVoidTy(),
+                            {IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
+    EmLongjmpF =
+        Function::Create(FTy, GlobalValue::ExternalLinkage, EmLongjmpFName, &M);
+
+    // Only traverse functions that uses setjmp in order not to insert
+    // unnecessary prep / cleanup code in every function
+    SmallPtrSet<Function *, 8> SetjmpUsers;
+    for (User *U : SetjmpF->users()) {
+      auto *UI = cast<Instruction>(U);
+      SetjmpUsers.insert(UI->getFunction());
+    }
+    for (Function *F : SetjmpUsers)
+      runSjLjOnFunction(*F);
+  }
+
+  if (!Changed) {
+    // Delete unused global variables and functions
+    ThrewGV->eraseFromParent();
+    ThrewValueGV->eraseFromParent();
+    TempRet0GV->eraseFromParent();
+    if (ResumeF)
+      ResumeF->eraseFromParent();
+    if (EHTypeIDF)
+      EHTypeIDF->eraseFromParent();
+    if (EmLongjmpF)
+      EmLongjmpF->eraseFromParent();
+    if (SaveSetjmpF)
+      SaveSetjmpF->eraseFromParent();
+    if (TestSetjmpF)
+      TestSetjmpF->eraseFromParent();
+    return false;
+  }
+
+  // If we have made any changes while doing exception handling or
+  // setjmp/longjmp handling, we have to create these functions for JavaScript
+  // to call.
+  createSetThrewFunction(M);
+  createSetTempRet0Function(M);
+
+  return true;
+}
+
+bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
+  Module &M = *F.getParent();
+  LLVMContext &C = F.getContext();
+  IRBuilder<> IRB(C);
+  bool Changed = false;
+  SmallVector<Instruction *, 64> ToErase;
+  SmallPtrSet<LandingPadInst *, 32> LandingPads;
+  bool AllowExceptions =
+      areAllExceptionsAllowed() || EHWhitelistSet.count(F.getName());
+
+  for (BasicBlock &BB : F) {
+    auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
+    if (!II)
+      continue;
+    Changed = true;
+    LandingPads.insert(II->getLandingPadInst());
+    IRB.SetInsertPoint(II);
+
+    bool NeedInvoke = AllowExceptions && canThrow(II->getCalledValue());
+    if (NeedInvoke) {
+      // Wrap invoke with invoke wrapper and generate preamble/postamble
+      Value *Threw = wrapInvoke(II);
+      ToErase.push_back(II);
+
+      // Insert a branch based on __THREW__ variable
+      Value *Cmp = IRB.CreateICmpEQ(Threw, IRB.getInt32(1), "cmp");
+      IRB.CreateCondBr(Cmp, II->getUnwindDest(), II->getNormalDest());
+
+    } else {
+      // This can't throw, and we don't need this invoke, just replace it with a
+      // call+branch
+      SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
+      CallInst *NewCall = IRB.CreateCall(II->getCalledValue(), Args);
+      NewCall->takeName(II);
+      NewCall->setCallingConv(II->getCallingConv());
+      NewCall->setDebugLoc(II->getDebugLoc());
+      NewCall->setAttributes(II->getAttributes());
+      II->replaceAllUsesWith(NewCall);
+      ToErase.push_back(II);
+
+      IRB.CreateBr(II->getNormalDest());
+
+      // Remove any PHI node entries from the exception destination
+      II->getUnwindDest()->removePredecessor(&BB);
+    }
+  }
+
+  // Process resume instructions
+  for (BasicBlock &BB : F) {
+    // Scan the body of the basic block for resumes
+    for (Instruction &I : BB) {
+      auto *RI = dyn_cast<ResumeInst>(&I);
+      if (!RI)
+        continue;
+
+      // Split the input into legal values
+      Value *Input = RI->getValue();
+      IRB.SetInsertPoint(RI);
+      Value *Low = IRB.CreateExtractValue(Input, 0, "low");
+      // Create a call to __resumeException function
+      IRB.CreateCall(ResumeF, {Low});
+      // Add a terminator to the block
+      IRB.CreateUnreachable();
+      ToErase.push_back(RI);
+    }
+  }
+
+  // Process llvm.eh.typeid.for intrinsics
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *CI = dyn_cast<CallInst>(&I);
+      if (!CI)
+        continue;
+      const Function *Callee = CI->getCalledFunction();
+      if (!Callee)
+        continue;
+      if (Callee->getIntrinsicID() != Intrinsic::eh_typeid_for)
+        continue;
+
+      IRB.SetInsertPoint(CI);
+      CallInst *NewCI =
+          IRB.CreateCall(EHTypeIDF, CI->getArgOperand(0), "typeid");
+      CI->replaceAllUsesWith(NewCI);
+      ToErase.push_back(CI);
+    }
+  }
+
+  // Look for orphan landingpads, can occur in blocks with no predecesors
+  for (BasicBlock &BB : F) {
+    Instruction *I = BB.getFirstNonPHI();
+    if (auto *LPI = dyn_cast<LandingPadInst>(I))
+      LandingPads.insert(LPI);
+  }
+
+  // Handle all the landingpad for this function together, as multiple invokes
+  // may share a single lp
+  for (LandingPadInst *LPI : LandingPads) {
+    IRB.SetInsertPoint(LPI);
+    SmallVector<Value *, 16> FMCArgs;
+    for (unsigned i = 0, e = LPI->getNumClauses(); i < e; ++i) {
+      Constant *Clause = LPI->getClause(i);
+      // As a temporary workaround for the lack of aggregate varargs support
+      // in the interface between JS and wasm, break out filter operands into
+      // their component elements.
+      if (LPI->isFilter(i)) {
+        auto *ATy = cast<ArrayType>(Clause->getType());
+        for (unsigned j = 0, e = ATy->getNumElements(); j < e; ++j) {
+          Value *EV = IRB.CreateExtractValue(Clause, makeArrayRef(j), "filter");
+          FMCArgs.push_back(EV);
+        }
+      } else
+        FMCArgs.push_back(Clause);
+    }
+
+    // Create a call to __cxa_find_matching_catch_N function
+    Function *FMCF = getFindMatchingCatch(M, FMCArgs.size());
+    CallInst *FMCI = IRB.CreateCall(FMCF, FMCArgs, "fmc");
+    Value *Undef = UndefValue::get(LPI->getType());
+    Value *Pair0 = IRB.CreateInsertValue(Undef, FMCI, 0, "pair0");
+    Value *TempRet0 =
+        IRB.CreateLoad(TempRet0GV, TempRet0GV->getName() + ".val");
+    Value *Pair1 = IRB.CreateInsertValue(Pair0, TempRet0, 1, "pair1");
+
+    LPI->replaceAllUsesWith(Pair1);
+    ToErase.push_back(LPI);
+  }
+
+  // Erase everything we no longer need in this function
+  for (Instruction *I : ToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
+  Module &M = *F.getParent();
+  LLVMContext &C = F.getContext();
+  IRBuilder<> IRB(C);
+  SmallVector<Instruction *, 64> ToErase;
+  // Vector of %setjmpTable values
+  std::vector<Instruction *> SetjmpTableInsts;
+  // Vector of %setjmpTableSize values
+  std::vector<Instruction *> SetjmpTableSizeInsts;
+
+  // Setjmp preparation
+
+  // This instruction effectively means %setjmpTableSize = 4.
+  // We create this as an instruction intentionally, and we don't want to fold
+  // this instruction to a constant 4, because this value will be used in
+  // SSAUpdater.AddAvailableValue(...) later.
+  BasicBlock &EntryBB = F.getEntryBlock();
+  BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
+      Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
+      &*EntryBB.getFirstInsertionPt());
+  // setjmpTable = (int *) malloc(40);
+  Instruction *SetjmpTable = CallInst::CreateMalloc(
+      SetjmpTableSize, IRB.getInt32Ty(), IRB.getInt32Ty(), IRB.getInt32(40),
+      nullptr, nullptr, "setjmpTable");
+  // setjmpTable[0] = 0;
+  IRB.SetInsertPoint(SetjmpTableSize);
+  IRB.CreateStore(IRB.getInt32(0), SetjmpTable);
+  SetjmpTableInsts.push_back(SetjmpTable);
+  SetjmpTableSizeInsts.push_back(SetjmpTableSize);
+
+  // Setjmp transformation
+  std::vector<PHINode *> SetjmpRetPHIs;
+  Function *SetjmpF = M.getFunction("setjmp");
+  for (User *U : SetjmpF->users()) {
+    auto *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      report_fatal_error("Does not support indirect calls to setjmp");
+
+    BasicBlock *BB = CI->getParent();
+    if (BB->getParent() != &F) // in other function
+      continue;
+
+    // The tail is everything right after the call, and will be reached once
+    // when setjmp is called, and later when longjmp returns to the setjmp
+    BasicBlock *Tail = SplitBlock(BB, CI->getNextNode());
+    // Add a phi to the tail, which will be the output of setjmp, which
+    // indicates if this is the first call or a longjmp back. The phi directly
+    // uses the right value based on where we arrive from
+    IRB.SetInsertPoint(Tail->getFirstNonPHI());
+    PHINode *SetjmpRet = IRB.CreatePHI(IRB.getInt32Ty(), 2, "setjmp.ret");
+
+    // setjmp initial call returns 0
+    SetjmpRet->addIncoming(IRB.getInt32(0), BB);
+    // The proper output is now this, not the setjmp call itself
+    CI->replaceAllUsesWith(SetjmpRet);
+    // longjmp returns to the setjmp will add themselves to this phi
+    SetjmpRetPHIs.push_back(SetjmpRet);
+
+    // Fix call target
+    // Our index in the function is our place in the array + 1 to avoid index
+    // 0, because index 0 means the longjmp is not ours to handle.
+    IRB.SetInsertPoint(CI);
+    Value *Args[] = {CI->getArgOperand(0), IRB.getInt32(SetjmpRetPHIs.size()),
+                     SetjmpTable, SetjmpTableSize};
+    Instruction *NewSetjmpTable =
+        IRB.CreateCall(SaveSetjmpF, Args, "setjmpTable");
+    Instruction *NewSetjmpTableSize =
+        IRB.CreateLoad(TempRet0GV, "setjmpTableSize");
+    SetjmpTableInsts.push_back(NewSetjmpTable);
+    SetjmpTableSizeInsts.push_back(NewSetjmpTableSize);
+    ToErase.push_back(CI);
+  }
+
+  // Update each call that can longjmp so it can return to a setjmp where
+  // relevant.
+
+  // Because we are creating new BBs while processing and don't want to make
+  // all these newly created BBs candidates again for longjmp processing, we
+  // first make the vector of candidate BBs.
+  std::vector<BasicBlock *> BBs;
+  for (BasicBlock &BB : F)
+    BBs.push_back(&BB);
+
+  // BBs.size() will change within the loop, so we query it every time
+  for (unsigned i = 0; i < BBs.size(); i++) {
+    BasicBlock *BB = BBs[i];
+    for (Instruction &I : *BB) {
+      assert(!isa<InvokeInst>(&I));
+      auto *CI = dyn_cast<CallInst>(&I);
+      if (!CI)
+        continue;
+
+      const Value *Callee = CI->getCalledValue();
+      if (!canLongjmp(M, Callee))
+        continue;
+
+      Value *Threw = nullptr;
+      BasicBlock *Tail;
+      if (Callee->getName().startswith(InvokePrefix)) {
+        // If invoke wrapper has already been generated for this call in
+        // previous EH phase, search for the load instruction
+        // %__THREW__.val = __THREW__;
+        // in postamble after the invoke wrapper call
+        LoadInst *ThrewLI = nullptr;
+        StoreInst *ThrewResetSI = nullptr;
+        for (auto I = std::next(BasicBlock::iterator(CI)), IE = BB->end();
+             I != IE; ++I) {
+          if (auto *LI = dyn_cast<LoadInst>(I))
+            if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
+              if (GV == ThrewGV) {
+                Threw = ThrewLI = LI;
+                break;
+              }
+        }
+        // Search for the store instruction after the load above
+        // __THREW__ = 0;
+        for (auto I = std::next(BasicBlock::iterator(ThrewLI)), IE = BB->end();
+             I != IE; ++I) {
+          if (auto *SI = dyn_cast<StoreInst>(I))
+            if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
+              if (GV == ThrewGV && SI->getValueOperand() == IRB.getInt32(0)) {
+                ThrewResetSI = SI;
+                break;
+              }
+        }
+        assert(Threw && ThrewLI && "Cannot find __THREW__ load after invoke");
+        assert(ThrewResetSI && "Cannot find __THREW__ store after invoke");
+        Tail = SplitBlock(BB, ThrewResetSI->getNextNode());
+
+      } else {
+        // Wrap call with invoke wrapper and generate preamble/postamble
+        Threw = wrapInvoke(CI);
+        ToErase.push_back(CI);
+        Tail = SplitBlock(BB, CI->getNextNode());
+      }
+
+      // We need to replace the terminator in Tail - SplitBlock makes BB go
+      // straight to Tail, we need to check if a longjmp occurred, and go to the
+      // right setjmp-tail if so
+      ToErase.push_back(BB->getTerminator());
+
+      // Generate a function call to testSetjmp function and preamble/postamble
+      // code to figure out (1) whether longjmp occurred (2) if longjmp
+      // occurred, which setjmp it corresponds to
+      Value *Label = nullptr;
+      Value *LongjmpResult = nullptr;
+      BasicBlock *EndBB = nullptr;
+      wrapTestSetjmp(BB, CI, Threw, SetjmpTable, SetjmpTableSize, Label,
+                     LongjmpResult, EndBB);
+      assert(Label && LongjmpResult && EndBB);
+
+      // Create switch instruction
+      IRB.SetInsertPoint(EndBB);
+      SwitchInst *SI = IRB.CreateSwitch(Label, Tail, SetjmpRetPHIs.size());
+      // -1 means no longjmp happened, continue normally (will hit the default
+      // switch case). 0 means a longjmp that is not ours to handle, needs a
+      // rethrow. Otherwise the index is the same as the index in P+1 (to avoid
+      // 0).
+      for (unsigned i = 0; i < SetjmpRetPHIs.size(); i++) {
+        SI->addCase(IRB.getInt32(i + 1), SetjmpRetPHIs[i]->getParent());
+        SetjmpRetPHIs[i]->addIncoming(LongjmpResult, EndBB);
+      }
+
+      // We are splitting the block here, and must continue to find other calls
+      // in the block - which is now split. so continue to traverse in the Tail
+      BBs.push_back(Tail);
+    }
+  }
+
+  // Erase everything we no longer need in this function
+  for (Instruction *I : ToErase)
+    I->eraseFromParent();
+
+  // Free setjmpTable buffer before each return instruction
+  for (BasicBlock &BB : F) {
+    TerminatorInst *TI = BB.getTerminator();
+    if (isa<ReturnInst>(TI))
+      CallInst::CreateFree(SetjmpTable, TI);
+  }
+
+  // Every call to saveSetjmp can change setjmpTable and setjmpTableSize
+  // (when buffer reallocation occurs)
+  // entry:
+  //   setjmpTableSize = 4;
+  //   setjmpTable = (int *) malloc(40);
+  //   setjmpTable[0] = 0;
+  // ...
+  // somebb:
+  //   setjmpTable = saveSetjmp(buf, label, setjmpTable, setjmpTableSize);
+  //   setjmpTableSize = __tempRet0;
+  // So we need to make sure the SSA for these variables is valid so that every
+  // saveSetjmp and testSetjmp calls have the correct arguments.
+  SSAUpdater SetjmpTableSSA;
+  SSAUpdater SetjmpTableSizeSSA;
+  SetjmpTableSSA.Initialize(Type::getInt32PtrTy(C), "setjmpTable");
+  SetjmpTableSizeSSA.Initialize(Type::getInt32Ty(C), "setjmpTableSize");
+  for (Instruction *I : SetjmpTableInsts)
+    SetjmpTableSSA.AddAvailableValue(I->getParent(), I);
+  for (Instruction *I : SetjmpTableSizeInsts)
+    SetjmpTableSizeSSA.AddAvailableValue(I->getParent(), I);
+
+  for (auto UI = SetjmpTable->use_begin(), UE = SetjmpTable->use_end();
+       UI != UE;) {
+    // Grab the use before incrementing the iterator.
+    Use &U = *UI;
+    // Increment the iterator before removing the use from the list.
+    ++UI;
+    if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+      if (I->getParent() != &EntryBB)
+        SetjmpTableSSA.RewriteUse(U);
+  }
+  for (auto UI = SetjmpTableSize->use_begin(), UE = SetjmpTableSize->use_end();
+       UI != UE;) {
+    Use &U = *UI;
+    ++UI;
+    if (Instruction *I = dyn_cast<Instruction>(U.getUser()))
+      if (I->getParent() != &EntryBB)
+        SetjmpTableSizeSSA.RewriteUse(U);
+  }
+
+  // Finally, our modifications to the cfg can break dominance of SSA variables.
+  // For example, in this code,
+  // if (x()) { .. setjmp() .. }
+  // if (y()) { .. longjmp() .. }
+  // We must split the longjmp block, and it can jump into the block splitted
+  // from setjmp one. But that means that when we split the setjmp block, it's
+  // first part no longer dominates its second part - there is a theoretically
+  // possible control flow path where x() is false, then y() is true and we
+  // reach the second part of the setjmp block, without ever reaching the first
+  // part. So, we rebuild SSA form here.
+  rebuildSSA(F);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
new file mode 100644
index 000000000000..022a448590ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -0,0 +1,115 @@
+// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst //
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower WebAssembly MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSymbol *
+WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
+    const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
+                                                     int64_t Offset,
+                                                     bool IsFunc) const {
+  MCSymbolRefExpr::VariantKind VK =
+      IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
+             : MCSymbolRefExpr::VK_None;
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
+
+  if (Offset != 0) {
+    if (IsFunc)
+      report_fatal_error("Function addresses with offsets not supported");
+    Expr =
+        MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx);
+  }
+
+  return MCOperand::createExpr(Expr);
+}
+
+void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
+                                   MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_MachineBasicBlock:
+      MI->dump();
+      llvm_unreachable("MachineBasicBlock operand should have been rewritten");
+    case MachineOperand::MO_Register: {
+      // Ignore all implicit register operands.
+      if (MO.isImplicit())
+        continue;
+      const WebAssemblyFunctionInfo &MFI =
+          *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>();
+      unsigned WAReg = MFI.getWAReg(MO.getReg());
+      MCOp = MCOperand::createReg(WAReg);
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::createImm(MO.getImm());
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      // TODO: MC converts all floating point immediate operands to double.
+      // This is fine for numeric values, but may cause NaNs to change bits.
+      const ConstantFP *Imm = MO.getFPImm();
+      if (Imm->getType()->isFloatTy())
+        MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat());
+      else if (Imm->getType()->isDoubleTy())
+        MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble());
+      else
+        llvm_unreachable("unknown floating point immediate type");
+      break;
+    }
+    case MachineOperand::MO_GlobalAddress:
+      assert(MO.getTargetFlags() == 0 &&
+             "WebAssembly does not use target flags on GlobalAddresses");
+      MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(),
+                                MO.getGlobal()->getValueType()->isFunctionTy());
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      // The target flag indicates whether this is a symbol for a
+      // variable or a function.
+      assert((MO.getTargetFlags() & -2) == 0 &&
+             "WebAssembly uses only one target flag bit on ExternalSymbols");
+      MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0,
+                                MO.getTargetFlags() & 1);
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
new file mode 100644
index 000000000000..ab4ba1c28d53
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -0,0 +1,46 @@
+//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the class to lower WebAssembly MachineInstrs to
+/// their corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+class AsmPrinter;
+class MCContext;
+class MCSymbol;
+class MachineInstr;
+class MachineOperand;
+
+/// This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
+  MCContext &Ctx;
+  AsmPrinter &Printer;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset,
+                               bool IsFunc) const;
+
+public:
+  WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+      : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..ccf6a18b32ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -0,0 +1,62 @@
+//=- WebAssemblyMachineFunctionInfo.cpp - WebAssembly Machine Function Info -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements WebAssembly-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyISelLowering.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/Analysis.h"
+using namespace llvm;
+
+WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {}
+
+void WebAssemblyFunctionInfo::initWARegs() {
+  assert(WARegs.empty());
+  unsigned Reg = UnusedReg;
+  WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
+}
+
+void llvm::ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+                                Type *Ty, SmallVectorImpl<MVT> &ValueVTs) {
+  const DataLayout &DL(F.getParent()->getDataLayout());
+  const WebAssemblyTargetLowering &TLI =
+      *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering();
+  SmallVector<EVT, 4> VTs;
+  ComputeValueVTs(TLI, DL, Ty, VTs);
+
+  for (EVT VT : VTs) {
+    unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT);
+    MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT);
+    for (unsigned i = 0; i != NumRegs; ++i)
+      ValueVTs.push_back(RegisterVT);
+  }
+}
+
+void llvm::ComputeSignatureVTs(const Function &F, const TargetMachine &TM,
+                               SmallVectorImpl<MVT> &Params,
+                               SmallVectorImpl<MVT> &Results) {
+  ComputeLegalValueVTs(F, TM, F.getReturnType(), Results);
+
+  if (Results.size() > 1) {
+    // WebAssembly currently can't lower returns of multiple values without
+    // demoting to sret (see WebAssemblyTargetLowering::CanLowerReturn). So
+    // replace multiple return values with a pointer parameter.
+    Results.clear();
+    Params.push_back(
+        MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()));
+  }
+
+  for (auto &Arg : F.args())
+    ComputeLegalValueVTs(F, TM, Arg.getType(), Params);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
new file mode 100644
index 000000000000..756619bebbed
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -0,0 +1,119 @@
+// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares WebAssembly-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+/// This class is derived from MachineFunctionInfo and contains private
+/// WebAssembly-specific information for each MachineFunction.
+class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
+  MachineFunction &MF;
+
+  std::vector<MVT> Params;
+  std::vector<MVT> Results;
+  std::vector<MVT> Locals;
+
+  /// A mapping from CodeGen vreg index to WebAssembly register number.
+  std::vector<unsigned> WARegs;
+
+  /// A mapping from CodeGen vreg index to a boolean value indicating whether
+  /// the given register is considered to be "stackified", meaning it has been
+  /// determined or made to meet the stack requirements:
+  ///   - single use (per path)
+  ///   - single def (per path)
+  ///   - defined and used in LIFO order with other stack registers
+  BitVector VRegStackified;
+
+  // A virtual register holding the pointer to the vararg buffer for vararg
+  // functions. It is created and set in TLI::LowerFormalArguments and read by
+  // TLI::LowerVASTART
+  unsigned VarargVreg = -1U;
+
+  // A virtual register holding the base pointer for functions that have
+  // overaligned values on the user stack.
+  unsigned BasePtrVreg = -1U;
+
+ public:
+  explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+  ~WebAssemblyFunctionInfo() override;
+
+  void addParam(MVT VT) { Params.push_back(VT); }
+  const std::vector<MVT> &getParams() const { return Params; }
+
+  void addResult(MVT VT) { Results.push_back(VT); }
+  const std::vector<MVT> &getResults() const { return Results; }
+
+  void addLocal(MVT VT) { Locals.push_back(VT); }
+  const std::vector<MVT> &getLocals() const { return Locals; }
+
+  unsigned getVarargBufferVreg() const {
+    assert(VarargVreg != -1U && "Vararg vreg hasn't been set");
+    return VarargVreg;
+  }
+  void setVarargBufferVreg(unsigned Reg) { VarargVreg = Reg; }
+
+  unsigned getBasePointerVreg() const {
+    assert(BasePtrVreg != -1U && "Base ptr vreg hasn't been set");
+    return BasePtrVreg;
+  }
+  void setBasePointerVreg(unsigned Reg) { BasePtrVreg = Reg; }
+
+  static const unsigned UnusedReg = -1u;
+
+  void stackifyVReg(unsigned VReg) {
+    assert(MF.getRegInfo().getUniqueVRegDef(VReg));
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1);
+    VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+  bool isVRegStackified(unsigned VReg) const {
+    if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size())
+      return false;
+    return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg));
+  }
+
+  void initWARegs();
+  void setWAReg(unsigned VReg, unsigned WAReg) {
+    assert(WAReg != UnusedReg);
+    assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size());
+    WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg;
+  }
+  unsigned getWAReg(unsigned Reg) const {
+    assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size());
+    return WARegs[TargetRegisterInfo::virtReg2Index(Reg)];
+  }
+
+  // For a given stackified WAReg, return the id number to print with push/pop.
+  static unsigned getWARegStackId(unsigned Reg) {
+    assert(Reg & INT32_MIN);
+    return Reg & INT32_MAX;
+  }
+};
+
+void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM,
+                          Type *Ty, SmallVectorImpl<MVT> &ValueVTs);
+
+void ComputeSignatureVTs(const Function &F, const TargetMachine &TM,
+                         SmallVectorImpl<MVT> &Params,
+                         SmallVectorImpl<MVT> &Results);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
new file mode 100644
index 000000000000..5a3a7411ed46
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -0,0 +1,105 @@
+//===--- WebAssemblyOptimizeLiveIntervals.cpp - LiveInterval processing ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize LiveIntervals for use in a post-RA context.
+//
+/// LiveIntervals normally runs before register allocation when the code is
+/// only recently lowered out of SSA form, so it's uncommon for registers to
+/// have multiple defs, and then they do, the defs are usually closely related.
+/// Later, after coalescing, tail duplication, and other optimizations, it's
+/// more common to see registers with multiple unrelated defs. This pass
+/// updates LiveIntervalAnalysis to distribute the value numbers across separate
+/// LiveIntervals.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-live-intervals"
+
+namespace {
+class WebAssemblyOptimizeLiveIntervals final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Optimize Live Intervals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyOptimizeLiveIntervals() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyOptimizeLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeLiveIntervals() {
+  return new WebAssemblyOptimizeLiveIntervals();
+}
+
+bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Optimize LiveIntervals **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() &&
+         "OptimizeLiveIntervals expects liveness");
+
+  // Split multiple-VN LiveIntervals into multiple LiveIntervals.
+  SmallVector<LiveInterval*, 4> SplitLIs;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+
+    LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+    SplitLIs.clear();
+  }
+
+  // In PrepareForLiveIntervals, we conservatively inserted IMPLICIT_DEF
+  // instructions to satisfy LiveIntervals' requirement that all uses be
+  // dominated by defs. Now that LiveIntervals has computed which of these
+  // defs are actually needed and which are dead, remove the dead ones.
+  for (auto MII = MF.begin()->begin(), MIE = MF.begin()->end(); MII != MIE; ) {
+    MachineInstr *MI = &*MII++;
+    if (MI->isImplicitDef() && MI->getOperand(0).isDead()) {
+      LiveInterval &LI = LIS.getInterval(MI->getOperand(0).getReg());
+      LIS.removeVRegDefAt(LI, LIS.getInstructionIndex(*MI).getRegSlot());
+      LIS.RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
new file mode 100644
index 000000000000..96520aa5d28c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -0,0 +1,76 @@
+//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Optimize calls with "returned" attributes for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-optimize-returned"
+
+namespace {
+class OptimizeReturned final : public FunctionPass,
+                               public InstVisitor<OptimizeReturned> {
+  StringRef getPassName() const override {
+    return "WebAssembly Optimize Returned";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  DominatorTree *DT;
+
+public:
+  static char ID;
+  OptimizeReturned() : FunctionPass(ID), DT(nullptr) {}
+
+  void visitCallSite(CallSite CS);
+};
+} // End anonymous namespace
+
+char OptimizeReturned::ID = 0;
+FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
+  return new OptimizeReturned();
+}
+
+void OptimizeReturned::visitCallSite(CallSite CS) {
+  for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
+    if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+      Instruction *Inst = CS.getInstruction();
+      Value *Arg = CS.getArgOperand(i);
+      // Ignore constants, globals, undef, etc.
+      if (isa<Constant>(Arg))
+        continue;
+      // Like replaceDominatedUsesWith but using Instruction/Use dominance.
+      for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) {
+        Use &U = *UI++;
+        if (DT->dominates(Inst, U))
+          U.set(Inst);
+      }
+    }
+}
+
+bool OptimizeReturned::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  visit(F);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
new file mode 100644
index 000000000000..32dde88c2234
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -0,0 +1,198 @@
+//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimiztions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Late peephole optimizations for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-peephole"
+
+static cl::opt<bool> DisableWebAssemblyFallthroughReturnOpt(
+    "disable-wasm-fallthrough-return-opt", cl::Hidden,
+    cl::desc("WebAssembly: Disable fallthrough-return optimizations."),
+    cl::init(false));
+
+namespace {
+class WebAssemblyPeephole final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly late peephole optimizer";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID;
+  WebAssemblyPeephole() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyPeephole::ID = 0;
+FunctionPass *llvm::createWebAssemblyPeephole() {
+  return new WebAssemblyPeephole();
+}
+
+/// If desirable, rewrite NewReg to a drop register.
+static bool MaybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
+                               MachineOperand &MO, WebAssemblyFunctionInfo &MFI,
+                               MachineRegisterInfo &MRI) {
+  bool Changed = false;
+  if (OldReg == NewReg) {
+    Changed = true;
+    unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+    MO.setReg(NewReg);
+    MO.setIsDead();
+    MFI.stackifyVReg(NewReg);
+  }
+  return Changed;
+}
+
+static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
+                                      const MachineFunction &MF,
+                                      WebAssemblyFunctionInfo &MFI,
+                                      MachineRegisterInfo &MRI,
+                                      const WebAssemblyInstrInfo &TII,
+                                      unsigned FallthroughOpc,
+                                      unsigned CopyLocalOpc) {
+  if (DisableWebAssemblyFallthroughReturnOpt)
+    return false;
+  if (&MBB != &MF.back())
+    return false;
+  if (&MI != &MBB.back())
+    return false;
+
+  // If the operand isn't stackified, insert a COPY to read the operand and
+  // stackify it.
+  MachineOperand &MO = MI.getOperand(0);
+  unsigned Reg = MO.getReg();
+  if (!MFI.isVRegStackified(Reg)) {
+    unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+        .addReg(Reg);
+    MO.setReg(NewReg);
+    MFI.stackifyVReg(NewReg);
+  }
+
+  // Rewrite the return.
+  MI.setDesc(TII.get(FallthroughOpc));
+  return true;
+}
+
+bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Peephole **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const WebAssemblyTargetLowering &TLI =
+      *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+  auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  bool Changed = false;
+
+  for (auto &MBB : MF)
+    for (auto &MI : MBB)
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case WebAssembly::CALL_I32:
+      case WebAssembly::CALL_I64: {
+        MachineOperand &Op1 = MI.getOperand(1);
+        if (Op1.isSymbol()) {
+          StringRef Name(Op1.getSymbolName());
+          if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+              Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+              Name == TLI.getLibcallName(RTLIB::MEMSET)) {
+            LibFunc::Func Func;
+            if (LibInfo.getLibFunc(Name, Func)) {
+              const auto &Op2 = MI.getOperand(2);
+              if (!Op2.isReg())
+                report_fatal_error("Peephole: call to builtin function with "
+                                   "wrong signature, not consuming reg");
+              MachineOperand &MO = MI.getOperand(0);
+              unsigned OldReg = MO.getReg();
+              unsigned NewReg = Op2.getReg();
+
+              if (MRI.getRegClass(NewReg) != MRI.getRegClass(OldReg))
+                report_fatal_error("Peephole: call to builtin function with "
+                                   "wrong signature, from/to mismatch");
+              Changed |= MaybeRewriteToDrop(OldReg, NewReg, MO, MFI, MRI);
+            }
+          }
+        }
+        break;
+      }
+      // Optimize away an explicit void return at the end of the function.
+      case WebAssembly::RETURN_I32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I32,
+            WebAssembly::COPY_I32);
+        break;
+      case WebAssembly::RETURN_I64:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_I64,
+            WebAssembly::COPY_I64);
+        break;
+      case WebAssembly::RETURN_F32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F32,
+            WebAssembly::COPY_F32);
+        break;
+      case WebAssembly::RETURN_F64:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
+            WebAssembly::COPY_F64);
+        break;
+      case WebAssembly::RETURN_v16i8:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v16i8,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v8i16:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v8i16,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v4i32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4i32,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_v4f32:
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_v4f32,
+            WebAssembly::COPY_V128);
+        break;
+      case WebAssembly::RETURN_VOID:
+        if (!DisableWebAssemblyFallthroughReturnOpt &&
+            &MBB == &MF.back() && &MI == &MBB.back())
+          MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+        break;
+      }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
new file mode 100644
index 000000000000..473dcb7a33fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -0,0 +1,124 @@
+//===- WebAssemblyPrepareForLiveIntervals.cpp - Prepare for LiveIntervals -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix up code to meet LiveInterval's requirements.
+///
+/// Some CodeGen passes don't preserve LiveInterval's requirements, because
+/// they run after register allocation and it isn't important. However,
+/// WebAssembly runs LiveIntervals in a late pass. This pass transforms code
+/// to meet LiveIntervals' requirements; primarily, it ensures that all
+/// virtual register uses have definitions (IMPLICIT_DEF definitions if
+/// nothing else).
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-prepare-for-live-intervals"
+
+namespace {
+class WebAssemblyPrepareForLiveIntervals final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyPrepareForLiveIntervals() : MachineFunctionPass(ID) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Prepare For LiveIntervals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyPrepareForLiveIntervals::ID = 0;
+FunctionPass *llvm::createWebAssemblyPrepareForLiveIntervals() {
+  return new WebAssemblyPrepareForLiveIntervals();
+}
+
+// Test whether the given register has an ARGUMENT def.
+static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
+  for (const auto &Def : MRI.def_instructions(Reg))
+    if (WebAssembly::isArgument(Def))
+      return true;
+  return false;
+}
+
+bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Prepare For LiveIntervals **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  MachineBasicBlock &Entry = *MF.begin();
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  // BranchFolding and perhaps other passes don't preserve IMPLICIT_DEF
+  // instructions. LiveIntervals requires that all paths to virtual register
+  // uses provide a definition. Insert IMPLICIT_DEFs in the entry block to
+  // conservatively satisfy this.
+  //
+  // TODO: This is fairly heavy-handed; find a better approach.
+  //
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+    // Skip unused registers.
+    if (MRI.use_nodbg_empty(Reg))
+      continue;
+
+    // Skip registers that have an ARGUMENT definition.
+    if (HasArgumentDef(Reg, MRI))
+      continue;
+
+    BuildMI(Entry, Entry.begin(), DebugLoc(),
+            TII.get(WebAssembly::IMPLICIT_DEF), Reg);
+    Changed = true;
+  }
+
+  // Move ARGUMENT_* instructions to the top of the entry block, so that their
+  // liveness reflects the fact that these really are live-in values.
+  for (auto MII = Entry.begin(), MIE = Entry.end(); MII != MIE; ) {
+    MachineInstr &MI = *MII++;
+    if (WebAssembly::isArgument(MI)) {
+      MI.removeFromParent();
+      Entry.insert(Entry.begin(), &MI);
+    }
+  }
+
+  // Ok, we're now ready to run LiveIntervalAnalysis again.
+  MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
new file mode 100644
index 000000000000..5fd4a8d1949e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -0,0 +1,175 @@
+//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a virtual register coloring pass.
+///
+/// WebAssembly doesn't have a fixed number of registers, but it is still
+/// desirable to minimize the total number of registers used in each function.
+///
+/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-coloring"
+
+namespace {
+class WebAssemblyRegColoring final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyRegColoring() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "WebAssembly Register Coloring";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyRegColoring::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegColoring() {
+  return new WebAssemblyRegColoring();
+}
+
+// Compute the total spill weight for VReg.
+static float computeWeight(const MachineRegisterInfo *MRI,
+                           const MachineBlockFrequencyInfo *MBFI,
+                           unsigned VReg) {
+  float weight = 0.0f;
+  for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg))
+    weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI,
+                                            *MO.getParent());
+  return weight;
+}
+
+bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Register Coloring **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual
+  // registers could be modified before the longjmp is executed, resulting in
+  // the wrong value being used afterwards. (See <rdar://problem/8007500>.)
+  // TODO: Does WebAssembly need to care about setjmp for register coloring?
+  if (MF.exposesReturnsTwice())
+    return false;
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  LiveIntervals *Liveness = &getAnalysis<LiveIntervals>();
+  const MachineBlockFrequencyInfo *MBFI =
+      &getAnalysis<MachineBlockFrequencyInfo>();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+  // Gather all register intervals into a list and sort them.
+  unsigned NumVRegs = MRI->getNumVirtRegs();
+  SmallVector<LiveInterval *, 0> SortedIntervals;
+  SortedIntervals.reserve(NumVRegs);
+
+  DEBUG(dbgs() << "Interesting register intervals:\n");
+  for (unsigned i = 0; i < NumVRegs; ++i) {
+    unsigned VReg = TargetRegisterInfo::index2VirtReg(i);
+    if (MFI.isVRegStackified(VReg))
+      continue;
+    // Skip unused registers, which can use $drop.
+    if (MRI->use_empty(VReg))
+      continue;
+
+    LiveInterval *LI = &Liveness->getInterval(VReg);
+    assert(LI->weight == 0.0f);
+    LI->weight = computeWeight(MRI, MBFI, VReg);
+    DEBUG(LI->dump());
+    SortedIntervals.push_back(LI);
+  }
+  DEBUG(dbgs() << '\n');
+
+  // Sort them to put arguments first (since we don't want to rename live-in
+  // registers), by weight next, and then by position.
+  // TODO: Investigate more intelligent sorting heuristics. For starters, we
+  // should try to coalesce adjacent live intervals before non-adjacent ones.
+  std::sort(SortedIntervals.begin(), SortedIntervals.end(),
+            [MRI](LiveInterval *LHS, LiveInterval *RHS) {
+              if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg))
+                return MRI->isLiveIn(LHS->reg);
+              if (LHS->weight != RHS->weight)
+                return LHS->weight > RHS->weight;
+              if (LHS->empty() || RHS->empty())
+                return !LHS->empty() && RHS->empty();
+              return *LHS < *RHS;
+            });
+
+  DEBUG(dbgs() << "Coloring register intervals:\n");
+  SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u);
+  SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments(
+      SortedIntervals.size());
+  BitVector UsedColors(SortedIntervals.size());
+  bool Changed = false;
+  for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+    LiveInterval *LI = SortedIntervals[i];
+    unsigned Old = LI->reg;
+    size_t Color = i;
+    const TargetRegisterClass *RC = MRI->getRegClass(Old);
+
+    // Check if it's possible to reuse any of the used colors.
+    if (!MRI->isLiveIn(Old))
+      for (int C(UsedColors.find_first()); C != -1;
+           C = UsedColors.find_next(C)) {
+        if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
+          continue;
+        for (LiveInterval *OtherLI : Assignments[C])
+          if (!OtherLI->empty() && OtherLI->overlaps(*LI))
+            goto continue_outer;
+        Color = C;
+        break;
+      continue_outer:;
+      }
+
+    unsigned New = SortedIntervals[Color]->reg;
+    SlotMapping[i] = New;
+    Changed |= Old != New;
+    UsedColors.set(Color);
+    Assignments[Color].push_back(LI);
+    DEBUG(dbgs() << "Assigning vreg"
+                 << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg"
+                 << TargetRegisterInfo::virtReg2Index(New) << "\n");
+  }
+  if (!Changed)
+    return false;
+
+  // Rewrite register operands.
+  for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) {
+    unsigned Old = SortedIntervals[i]->reg;
+    unsigned New = SlotMapping[i];
+    if (Old != New)
+      MRI->replaceRegWith(Old, New);
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
new file mode 100644
index 000000000000..e3470825940c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -0,0 +1,107 @@
+//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass which assigns WebAssembly register
+/// numbers for CodeGen virtual registers.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-numbering"
+
+namespace {
+class WebAssemblyRegNumbering final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Register Numbering";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyRegNumbering() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyRegNumbering::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegNumbering() {
+  return new WebAssemblyRegNumbering();
+}
+
+bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Register Numbering **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  MFI.initWARegs();
+
+  // WebAssembly argument registers are in the same index space as local
+  // variables. Assign the numbers for them first.
+  MachineBasicBlock &EntryMBB = MF.front();
+  for (MachineInstr &MI : EntryMBB) {
+    if (!WebAssembly::isArgument(MI))
+      break;
+
+    int64_t Imm = MI.getOperand(1).getImm();
+    DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
+                 << Imm << "\n");
+    MFI.setWAReg(MI.getOperand(0).getReg(), Imm);
+  }
+
+  // Then assign regular WebAssembly registers for all remaining used
+  // virtual registers. TODO: Consider sorting the registers by frequency of
+  // use, to maximize usage of small immediate fields.
+  unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs();
+  unsigned NumStackRegs = 0;
+  // Start the numbering for locals after the arg regs
+  unsigned CurReg = MFI.getParams().size();
+  for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
+    unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx);
+    // Skip unused registers.
+    if (MRI.use_empty(VReg))
+      continue;
+    // Handle stackified registers.
+    if (MFI.isVRegStackified(VReg)) {
+      DEBUG(dbgs() << "VReg " << VReg << " -> WAReg "
+                   << (INT32_MIN | NumStackRegs) << "\n");
+      MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++);
+      continue;
+    }
+    if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) {
+      DEBUG(dbgs() << "VReg " << VReg << " -> WAReg " << CurReg << "\n");
+      MFI.setWAReg(VReg, CurReg++);
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
new file mode 100644
index 000000000000..32ee09e45796
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -0,0 +1,884 @@
+//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a register stacking pass.
+///
+/// This pass reorders instructions to put register uses and defs in an order
+/// such that they form single-use expression trees. Registers fitting this form
+/// are then marked as "stackified", meaning references to them are replaced by
+/// "push" and "pop" from the value stack.
+///
+/// This is primarily a code size optimization, since temporary values on the
+/// value stack don't need to be named.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-stackify"
+
+namespace {
+class WebAssemblyRegStackify final : public MachineFunctionPass {
+  StringRef getPassName() const override {
+    return "WebAssembly Register Stackify";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreserved<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyRegStackify() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyRegStackify::ID = 0;
+FunctionPass *llvm::createWebAssemblyRegStackify() {
+  return new WebAssemblyRegStackify();
+}
+
+// Decorate the given instruction with implicit operands that enforce the
+// expression stack ordering constraints for an instruction which is on
+// the expression stack.
+static void ImposeStackOrdering(MachineInstr *MI) {
+  // Write the opaque VALUE_STACK register.
+  if (!MI->definesRegister(WebAssembly::VALUE_STACK))
+    MI->addOperand(MachineOperand::CreateReg(WebAssembly::VALUE_STACK,
+                                             /*isDef=*/true,
+                                             /*isImp=*/true));
+
+  // Also read the opaque VALUE_STACK register.
+  if (!MI->readsRegister(WebAssembly::VALUE_STACK))
+    MI->addOperand(MachineOperand::CreateReg(WebAssembly::VALUE_STACK,
+                                             /*isDef=*/false,
+                                             /*isImp=*/true));
+}
+
+// Convert an IMPLICIT_DEF instruction into an instruction which defines
+// a constant zero value.
+static void ConvertImplicitDefToConstZero(MachineInstr *MI,
+                                          MachineRegisterInfo &MRI,
+                                          const TargetInstrInfo *TII,
+                                          MachineFunction &MF) {
+  assert(MI->getOpcode() == TargetOpcode::IMPLICIT_DEF);
+
+  const auto *RegClass =
+      MRI.getRegClass(MI->getOperand(0).getReg());
+  if (RegClass == &WebAssembly::I32RegClass) {
+    MI->setDesc(TII->get(WebAssembly::CONST_I32));
+    MI->addOperand(MachineOperand::CreateImm(0));
+  } else if (RegClass == &WebAssembly::I64RegClass) {
+    MI->setDesc(TII->get(WebAssembly::CONST_I64));
+    MI->addOperand(MachineOperand::CreateImm(0));
+  } else if (RegClass == &WebAssembly::F32RegClass) {
+    MI->setDesc(TII->get(WebAssembly::CONST_F32));
+    ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
+        Type::getFloatTy(MF.getFunction()->getContext())));
+    MI->addOperand(MachineOperand::CreateFPImm(Val));
+  } else if (RegClass == &WebAssembly::F64RegClass) {
+    MI->setDesc(TII->get(WebAssembly::CONST_F64));
+    ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
+        Type::getDoubleTy(MF.getFunction()->getContext())));
+    MI->addOperand(MachineOperand::CreateFPImm(Val));
+  } else {
+    llvm_unreachable("Unexpected reg class");
+  }
+}
+
+// Determine whether a call to the callee referenced by
+// MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side
+// effects.
+static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
+                        bool &Write, bool &Effects, bool &StackPointer) {
+  // All calls can use the stack pointer.
+  StackPointer = true;
+
+  const MachineOperand &MO = MI.getOperand(CalleeOpNo);
+  if (MO.isGlobal()) {
+    const Constant *GV = MO.getGlobal();
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      if (!GA->isInterposable())
+        GV = GA->getAliasee();
+
+    if (const Function *F = dyn_cast<Function>(GV)) {
+      if (!F->doesNotThrow())
+        Effects = true;
+      if (F->doesNotAccessMemory())
+        return;
+      if (F->onlyReadsMemory()) {
+        Read = true;
+        return;
+      }
+    }
+  }
+
+  // Assume the worst.
+  Write = true;
+  Read = true;
+  Effects = true;
+}
+
+// Determine whether MI reads memory, writes memory, has side effects,
+// and/or uses the __stack_pointer value.
+static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
+                  bool &Write, bool &Effects, bool &StackPointer) {
+  assert(!MI.isPosition());
+  assert(!MI.isTerminator());
+
+  if (MI.isDebugValue())
+    return;
+
+  // Check for loads.
+  if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(&AA))
+    Read = true;
+
+  // Check for stores.
+  if (MI.mayStore()) {
+    Write = true;
+
+    // Check for stores to __stack_pointer.
+    for (auto MMO : MI.memoperands()) {
+      const MachinePointerInfo &MPI = MMO->getPointerInfo();
+      if (MPI.V.is<const PseudoSourceValue *>()) {
+        auto PSV = MPI.V.get<const PseudoSourceValue *>();
+        if (const ExternalSymbolPseudoSourceValue *EPSV =
+                dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+          if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+            StackPointer = true;
+      }
+    }
+  } else if (MI.hasOrderedMemoryRef()) {
+    switch (MI.getOpcode()) {
+    case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+    case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+    case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+    case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+    case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+    case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+    case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+    case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+      // These instruction have hasUnmodeledSideEffects() returning true
+      // because they trap on overflow and invalid so they can't be arbitrarily
+      // moved, however hasOrderedMemoryRef() interprets this plus their lack
+      // of memoperands as having a potential unknown memory reference.
+      break;
+    default:
+      // Record volatile accesses, unless it's a call, as calls are handled
+      // specially below.
+      if (!MI.isCall()) {
+        Write = true;
+        Effects = true;
+      }
+      break;
+    }
+  }
+
+  // Check for side effects.
+  if (MI.hasUnmodeledSideEffects()) {
+    switch (MI.getOpcode()) {
+    case WebAssembly::DIV_S_I32: case WebAssembly::DIV_S_I64:
+    case WebAssembly::REM_S_I32: case WebAssembly::REM_S_I64:
+    case WebAssembly::DIV_U_I32: case WebAssembly::DIV_U_I64:
+    case WebAssembly::REM_U_I32: case WebAssembly::REM_U_I64:
+    case WebAssembly::I32_TRUNC_S_F32: case WebAssembly::I64_TRUNC_S_F32:
+    case WebAssembly::I32_TRUNC_S_F64: case WebAssembly::I64_TRUNC_S_F64:
+    case WebAssembly::I32_TRUNC_U_F32: case WebAssembly::I64_TRUNC_U_F32:
+    case WebAssembly::I32_TRUNC_U_F64: case WebAssembly::I64_TRUNC_U_F64:
+      // These instructions have hasUnmodeledSideEffects() returning true
+      // because they trap on overflow and invalid so they can't be arbitrarily
+      // moved, however in the specific case of register stackifying, it is safe
+      // to move them because overflow and invalid are Undefined Behavior.
+      break;
+    default:
+      Effects = true;
+      break;
+    }
+  }
+
+  // Analyze calls.
+  if (MI.isCall()) {
+    switch (MI.getOpcode()) {
+    case WebAssembly::CALL_VOID:
+    case WebAssembly::CALL_INDIRECT_VOID:
+      QueryCallee(MI, 0, Read, Write, Effects, StackPointer);
+      break;
+    case WebAssembly::CALL_I32: case WebAssembly::CALL_I64:
+    case WebAssembly::CALL_F32: case WebAssembly::CALL_F64:
+    case WebAssembly::CALL_INDIRECT_I32: case WebAssembly::CALL_INDIRECT_I64:
+    case WebAssembly::CALL_INDIRECT_F32: case WebAssembly::CALL_INDIRECT_F64:
+      QueryCallee(MI, 1, Read, Write, Effects, StackPointer);
+      break;
+    default:
+      llvm_unreachable("unexpected call opcode");
+    }
+  }
+}
+
+// Test whether Def is safe and profitable to rematerialize.
+static bool ShouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+                                const WebAssemblyInstrInfo *TII) {
+  return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
+}
+
+// Identify the definition for this register at this point. This is a
+// generalization of MachineRegisterInfo::getUniqueVRegDef that uses
+// LiveIntervals to handle complex cases.
+static MachineInstr *GetVRegDef(unsigned Reg, const MachineInstr *Insert,
+                                const MachineRegisterInfo &MRI,
+                                const LiveIntervals &LIS)
+{
+  // Most registers are in SSA form here so we try a quick MRI query first.
+  if (MachineInstr *Def = MRI.getUniqueVRegDef(Reg))
+    return Def;
+
+  // MRI doesn't know what the Def is. Try asking LIS.
+  if (const VNInfo *ValNo = LIS.getInterval(Reg).getVNInfoBefore(
+          LIS.getInstructionIndex(*Insert)))
+    return LIS.getInstructionFromIndex(ValNo->def);
+
+  return nullptr;
+}
+
+// Test whether Reg, as defined at Def, has exactly one use. This is a
+// generalization of MachineRegisterInfo::hasOneUse that uses LiveIntervals
+// to handle complex cases.
+static bool HasOneUse(unsigned Reg, MachineInstr *Def,
+                      MachineRegisterInfo &MRI, MachineDominatorTree &MDT,
+                      LiveIntervals &LIS) {
+  // Most registers are in SSA form here so we try a quick MRI query first.
+  if (MRI.hasOneUse(Reg))
+    return true;
+
+  bool HasOne = false;
+  const LiveInterval &LI = LIS.getInterval(Reg);
+  const VNInfo *DefVNI = LI.getVNInfoAt(
+      LIS.getInstructionIndex(*Def).getRegSlot());
+  assert(DefVNI);
+  for (auto &I : MRI.use_nodbg_operands(Reg)) {
+    const auto &Result = LI.Query(LIS.getInstructionIndex(*I.getParent()));
+    if (Result.valueIn() == DefVNI) {
+      if (!Result.isKill())
+        return false;
+      if (HasOne)
+        return false;
+      HasOne = true;
+    }
+  }
+  return HasOne;
+}
+
+// Test whether it's safe to move Def to just before Insert.
+// TODO: Compute memory dependencies in a way that doesn't require always
+// walking the block.
+// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
+// more precise.
+static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
+                         AliasAnalysis &AA, const MachineRegisterInfo &MRI) {
+  assert(Def->getParent() == Insert->getParent());
+
+  // Check for register dependencies.
+  SmallVector<unsigned, 4> MutableRegisters;
+  for (const MachineOperand &MO : Def->operands()) {
+    if (!MO.isReg() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+
+    // If the register is dead here and at Insert, ignore it.
+    if (MO.isDead() && Insert->definesRegister(Reg) &&
+        !Insert->readsRegister(Reg))
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // Ignore ARGUMENTS; it's just used to keep the ARGUMENT_* instructions
+      // from moving down, and we've already checked for that.
+      if (Reg == WebAssembly::ARGUMENTS)
+        continue;
+      // If the physical register is never modified, ignore it.
+      if (!MRI.isPhysRegModified(Reg))
+        continue;
+      // Otherwise, it's a physical register with unknown liveness.
+      return false;
+    }
+
+    // If one of the operands isn't in SSA form, it has different values at
+    // different times, and we need to make sure we don't move our use across
+    // a different def.
+    if (!MO.isDef() && !MRI.hasOneDef(Reg))
+      MutableRegisters.push_back(Reg);
+  }
+
+  bool Read = false, Write = false, Effects = false, StackPointer = false;
+  Query(*Def, AA, Read, Write, Effects, StackPointer);
+
+  // If the instruction does not access memory and has no side effects, it has
+  // no additional dependencies.
+  bool HasMutableRegisters = !MutableRegisters.empty();
+  if (!Read && !Write && !Effects && !StackPointer && !HasMutableRegisters)
+    return true;
+
+  // Scan through the intervening instructions between Def and Insert.
+  MachineBasicBlock::const_iterator D(Def), I(Insert);
+  for (--I; I != D; --I) {
+    bool InterveningRead = false;
+    bool InterveningWrite = false;
+    bool InterveningEffects = false;
+    bool InterveningStackPointer = false;
+    Query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+          InterveningStackPointer);
+    if (Effects && InterveningEffects)
+      return false;
+    if (Read && InterveningWrite)
+      return false;
+    if (Write && (InterveningRead || InterveningWrite))
+      return false;
+    if (StackPointer && InterveningStackPointer)
+      return false;
+
+    for (unsigned Reg : MutableRegisters)
+      for (const MachineOperand &MO : I->operands())
+        if (MO.isReg() && MO.isDef() && MO.getReg() == Reg)
+          return false;
+  }
+
+  return true;
+}
+
+/// Test whether OneUse, a use of Reg, dominates all of Reg's other uses.
+static bool OneUseDominatesOtherUses(unsigned Reg, const MachineOperand &OneUse,
+                                     const MachineBasicBlock &MBB,
+                                     const MachineRegisterInfo &MRI,
+                                     const MachineDominatorTree &MDT,
+                                     LiveIntervals &LIS,
+                                     WebAssemblyFunctionInfo &MFI) {
+  const LiveInterval &LI = LIS.getInterval(Reg);
+
+  const MachineInstr *OneUseInst = OneUse.getParent();
+  VNInfo *OneUseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*OneUseInst));
+
+  for (const MachineOperand &Use : MRI.use_nodbg_operands(Reg)) {
+    if (&Use == &OneUse)
+      continue;
+
+    const MachineInstr *UseInst = Use.getParent();
+    VNInfo *UseVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(*UseInst));
+
+    if (UseVNI != OneUseVNI)
+      continue;
+
+    const MachineInstr *OneUseInst = OneUse.getParent();
+    if (UseInst == OneUseInst) {
+      // Another use in the same instruction. We need to ensure that the one
+      // selected use happens "before" it.
+      if (&OneUse > &Use)
+        return false;
+    } else {
+      // Test that the use is dominated by the one selected use.
+      while (!MDT.dominates(OneUseInst, UseInst)) {
+        // Actually, dominating is over-conservative. Test that the use would
+        // happen after the one selected use in the stack evaluation order.
+        //
+        // This is needed as a consequence of using implicit get_locals for
+        // uses and implicit set_locals for defs.
+        if (UseInst->getDesc().getNumDefs() == 0)
+          return false;
+        const MachineOperand &MO = UseInst->getOperand(0);
+        if (!MO.isReg())
+          return false;
+        unsigned DefReg = MO.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(DefReg) ||
+            !MFI.isVRegStackified(DefReg))
+          return false;
+        assert(MRI.hasOneUse(DefReg));
+        const MachineOperand &NewUse = *MRI.use_begin(DefReg);
+        const MachineInstr *NewUseInst = NewUse.getParent();
+        if (NewUseInst == OneUseInst) {
+          if (&OneUse > &NewUse)
+            return false;
+          break;
+        }
+        UseInst = NewUseInst;
+      }
+    }
+  }
+  return true;
+}
+
+/// Get the appropriate tee opcode for the given register class.
+static unsigned GetTeeOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::TEE_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::TEE_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::TEE_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::TEE_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::TEE_V128;
+  llvm_unreachable("Unexpected register class");
+}
+
+// Shrink LI to its uses, cleaning up LI.
+static void ShrinkToUses(LiveInterval &LI, LiveIntervals &LIS) {
+  if (LIS.shrinkToUses(&LI)) {
+    SmallVector<LiveInterval*, 4> SplitLIs;
+    LIS.splitSeparateComponents(LI, SplitLIs);
+  }
+}
+
+/// A single-use def in the same block with no intervening memory or register
+/// dependencies; move the def down and nest it with the current instruction.
+static MachineInstr *MoveForSingleUse(unsigned Reg, MachineOperand& Op,
+                                      MachineInstr *Def,
+                                      MachineBasicBlock &MBB,
+                                      MachineInstr *Insert, LiveIntervals &LIS,
+                                      WebAssemblyFunctionInfo &MFI,
+                                      MachineRegisterInfo &MRI) {
+  DEBUG(dbgs() << "Move for single use: "; Def->dump());
+
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(*Def);
+
+  if (MRI.hasOneDef(Reg) && MRI.hasOneUse(Reg)) {
+    // No one else is using this register for anything so we can just stackify
+    // it in place.
+    MFI.stackifyVReg(Reg);
+  } else {
+    // The register may have unrelated uses or defs; create a new register for
+    // just our one def and use so that we can stackify it.
+    unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+    Def->getOperand(0).setReg(NewReg);
+    Op.setReg(NewReg);
+
+    // Tell LiveIntervals about the new register.
+    LIS.createAndComputeVirtRegInterval(NewReg);
+
+    // Tell LiveIntervals about the changes to the old register.
+    LiveInterval &LI = LIS.getInterval(Reg);
+    LI.removeSegment(LIS.getInstructionIndex(*Def).getRegSlot(),
+                     LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
+                     /*RemoveDeadValNo=*/true);
+
+    MFI.stackifyVReg(NewReg);
+
+    DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+  }
+
+  ImposeStackOrdering(Def);
+  return Def;
+}
+
+/// A trivially cloneable instruction; clone it and nest the new copy with the
+/// current instruction.
+static MachineInstr *RematerializeCheapDef(
+    unsigned Reg, MachineOperand &Op, MachineInstr &Def, MachineBasicBlock &MBB,
+    MachineBasicBlock::instr_iterator Insert, LiveIntervals &LIS,
+    WebAssemblyFunctionInfo &MFI, MachineRegisterInfo &MRI,
+    const WebAssemblyInstrInfo *TII, const WebAssemblyRegisterInfo *TRI) {
+  DEBUG(dbgs() << "Rematerializing cheap def: "; Def.dump());
+  DEBUG(dbgs() << " - for use in "; Op.getParent()->dump());
+
+  unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+  TII->reMaterialize(MBB, Insert, NewReg, 0, Def, *TRI);
+  Op.setReg(NewReg);
+  MachineInstr *Clone = &*std::prev(Insert);
+  LIS.InsertMachineInstrInMaps(*Clone);
+  LIS.createAndComputeVirtRegInterval(NewReg);
+  MFI.stackifyVReg(NewReg);
+  ImposeStackOrdering(Clone);
+
+  DEBUG(dbgs() << " - Cloned to "; Clone->dump());
+
+  // Shrink the interval.
+  bool IsDead = MRI.use_empty(Reg);
+  if (!IsDead) {
+    LiveInterval &LI = LIS.getInterval(Reg);
+    ShrinkToUses(LI, LIS);
+    IsDead = !LI.liveAt(LIS.getInstructionIndex(Def).getDeadSlot());
+  }
+
+  // If that was the last use of the original, delete the original.
+  if (IsDead) {
+    DEBUG(dbgs() << " - Deleting original\n");
+    SlotIndex Idx = LIS.getInstructionIndex(Def).getRegSlot();
+    LIS.removePhysRegDefAt(WebAssembly::ARGUMENTS, Idx);
+    LIS.removeInterval(Reg);
+    LIS.RemoveMachineInstrFromMaps(Def);
+    Def.eraseFromParent();
+  }
+
+  return Clone;
+}
+
+/// A multiple-use def in the same block with no intervening memory or register
+/// dependencies; move the def down, nest it with the current instruction, and
+/// insert a tee to satisfy the rest of the uses. As an illustration, rewrite
+/// this:
+///
+///    Reg = INST ...        // Def
+///    INST ..., Reg, ...    // Insert
+///    INST ..., Reg, ...
+///    INST ..., Reg, ...
+///
+/// to this:
+///
+///    DefReg = INST ...     // Def (to become the new Insert)
+///    TeeReg, Reg = TEE_... DefReg
+///    INST ..., TeeReg, ... // Insert
+///    INST ..., Reg, ...
+///    INST ..., Reg, ...
+///
+/// with DefReg and TeeReg stackified. This eliminates a get_local from the
+/// resulting code.
+static MachineInstr *MoveAndTeeForMultiUse(
+    unsigned Reg, MachineOperand &Op, MachineInstr *Def, MachineBasicBlock &MBB,
+    MachineInstr *Insert, LiveIntervals &LIS, WebAssemblyFunctionInfo &MFI,
+    MachineRegisterInfo &MRI, const WebAssemblyInstrInfo *TII) {
+  DEBUG(dbgs() << "Move and tee for multi-use:"; Def->dump());
+
+  // Move Def into place.
+  MBB.splice(Insert, &MBB, Def);
+  LIS.handleMove(*Def);
+
+  // Create the Tee and attach the registers.
+  const auto *RegClass = MRI.getRegClass(Reg);
+  unsigned TeeReg = MRI.createVirtualRegister(RegClass);
+  unsigned DefReg = MRI.createVirtualRegister(RegClass);
+  MachineOperand &DefMO = Def->getOperand(0);
+  MachineInstr *Tee = BuildMI(MBB, Insert, Insert->getDebugLoc(),
+                              TII->get(GetTeeOpcode(RegClass)), TeeReg)
+                          .addReg(Reg, RegState::Define)
+                          .addReg(DefReg, getUndefRegState(DefMO.isDead()));
+  Op.setReg(TeeReg);
+  DefMO.setReg(DefReg);
+  SlotIndex TeeIdx = LIS.InsertMachineInstrInMaps(*Tee).getRegSlot();
+  SlotIndex DefIdx = LIS.getInstructionIndex(*Def).getRegSlot();
+
+  // Tell LiveIntervals we moved the original vreg def from Def to Tee.
+  LiveInterval &LI = LIS.getInterval(Reg);
+  LiveInterval::iterator I = LI.FindSegmentContaining(DefIdx);
+  VNInfo *ValNo = LI.getVNInfoAt(DefIdx);
+  I->start = TeeIdx;
+  ValNo->def = TeeIdx;
+  ShrinkToUses(LI, LIS);
+
+  // Finish stackifying the new regs.
+  LIS.createAndComputeVirtRegInterval(TeeReg);
+  LIS.createAndComputeVirtRegInterval(DefReg);
+  MFI.stackifyVReg(DefReg);
+  MFI.stackifyVReg(TeeReg);
+  ImposeStackOrdering(Def);
+  ImposeStackOrdering(Tee);
+
+  DEBUG(dbgs() << " - Replaced register: "; Def->dump());
+  DEBUG(dbgs() << " - Tee instruction: "; Tee->dump());
+  return Def;
+}
+
+namespace {
+/// A stack for walking the tree of instructions being built, visiting the
+/// MachineOperands in DFS order.
+class TreeWalkerState {
+  typedef MachineInstr::mop_iterator mop_iterator;
+  typedef std::reverse_iterator<mop_iterator> mop_reverse_iterator;
+  typedef iterator_range<mop_reverse_iterator> RangeTy;
+  SmallVector<RangeTy, 4> Worklist;
+
+public:
+  explicit TreeWalkerState(MachineInstr *Insert) {
+    const iterator_range<mop_iterator> &Range = Insert->explicit_uses();
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  bool Done() const { return Worklist.empty(); }
+
+  MachineOperand &Pop() {
+    RangeTy &Range = Worklist.back();
+    MachineOperand &Op = *Range.begin();
+    Range = drop_begin(Range, 1);
+    if (Range.begin() == Range.end())
+      Worklist.pop_back();
+    assert((Worklist.empty() ||
+            Worklist.back().begin() != Worklist.back().end()) &&
+           "Empty ranges shouldn't remain in the worklist");
+    return Op;
+  }
+
+  /// Push Instr's operands onto the stack to be visited.
+  void PushOperands(MachineInstr *Instr) {
+    const iterator_range<mop_iterator> &Range(Instr->explicit_uses());
+    if (Range.begin() != Range.end())
+      Worklist.push_back(reverse(Range));
+  }
+
+  /// Some of Instr's operands are on the top of the stack; remove them and
+  /// re-insert them starting from the beginning (because we've commuted them).
+  void ResetTopOperands(MachineInstr *Instr) {
+    assert(HasRemainingOperands(Instr) &&
+           "Reseting operands should only be done when the instruction has "
+           "an operand still on the stack");
+    Worklist.back() = reverse(Instr->explicit_uses());
+  }
+
+  /// Test whether Instr has operands remaining to be visited at the top of
+  /// the stack.
+  bool HasRemainingOperands(const MachineInstr *Instr) const {
+    if (Worklist.empty())
+      return false;
+    const RangeTy &Range = Worklist.back();
+    return Range.begin() != Range.end() && Range.begin()->getParent() == Instr;
+  }
+
+  /// Test whether the given register is present on the stack, indicating an
+  /// operand in the tree that we haven't visited yet. Moving a definition of
+  /// Reg to a point in the tree after that would change its value.
+  ///
+  /// This is needed as a consequence of using implicit get_locals for
+  /// uses and implicit set_locals for defs.
+  bool IsOnStack(unsigned Reg) const {
+    for (const RangeTy &Range : Worklist)
+      for (const MachineOperand &MO : Range)
+        if (MO.isReg() && MO.getReg() == Reg)
+          return true;
+    return false;
+  }
+};
+
+/// State to keep track of whether commuting is in flight or whether it's been
+/// tried for the current instruction and didn't work.
+class CommutingState {
+  /// There are effectively three states: the initial state where we haven't
+  /// started commuting anything and we don't know anything yet, the tenative
+  /// state where we've commuted the operands of the current instruction and are
+  /// revisting it, and the declined state where we've reverted the operands
+  /// back to their original order and will no longer commute it further.
+  bool TentativelyCommuting;
+  bool Declined;
+
+  /// During the tentative state, these hold the operand indices of the commuted
+  /// operands.
+  unsigned Operand0, Operand1;
+
+public:
+  CommutingState() : TentativelyCommuting(false), Declined(false) {}
+
+  /// Stackification for an operand was not successful due to ordering
+  /// constraints. If possible, and if we haven't already tried it and declined
+  /// it, commute Insert's operands and prepare to revisit it.
+  void MaybeCommute(MachineInstr *Insert, TreeWalkerState &TreeWalker,
+                    const WebAssemblyInstrInfo *TII) {
+    if (TentativelyCommuting) {
+      assert(!Declined &&
+             "Don't decline commuting until you've finished trying it");
+      // Commuting didn't help. Revert it.
+      TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+      TentativelyCommuting = false;
+      Declined = true;
+    } else if (!Declined && TreeWalker.HasRemainingOperands(Insert)) {
+      Operand0 = TargetInstrInfo::CommuteAnyOperandIndex;
+      Operand1 = TargetInstrInfo::CommuteAnyOperandIndex;
+      if (TII->findCommutedOpIndices(*Insert, Operand0, Operand1)) {
+        // Tentatively commute the operands and try again.
+        TII->commuteInstruction(*Insert, /*NewMI=*/false, Operand0, Operand1);
+        TreeWalker.ResetTopOperands(Insert);
+        TentativelyCommuting = true;
+        Declined = false;
+      }
+    }
+  }
+
+  /// Stackification for some operand was successful. Reset to the default
+  /// state.
+  void Reset() {
+    TentativelyCommuting = false;
+    Declined = false;
+  }
+};
+} // end anonymous namespace
+
+bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** Register Stackifying **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  bool Changed = false;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+  // Walk the instructions from the bottom up. Currently we don't look past
+  // block boundaries, and the blocks aren't ordered so the block visitation
+  // order isn't significant, but we may want to change this in the future.
+  for (MachineBasicBlock &MBB : MF) {
+    // Don't use a range-based for loop, because we modify the list as we're
+    // iterating over it and the end iterator may change.
+    for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) {
+      MachineInstr *Insert = &*MII;
+      // Don't nest anything inside an inline asm, because we don't have
+      // constraints for $push inputs.
+      if (Insert->getOpcode() == TargetOpcode::INLINEASM)
+        continue;
+
+      // Ignore debugging intrinsics.
+      if (Insert->getOpcode() == TargetOpcode::DBG_VALUE)
+        continue;
+
+      // Iterate through the inputs in reverse order, since we'll be pulling
+      // operands off the stack in LIFO order.
+      CommutingState Commuting;
+      TreeWalkerState TreeWalker(Insert);
+      while (!TreeWalker.Done()) {
+        MachineOperand &Op = TreeWalker.Pop();
+
+        // We're only interested in explicit virtual register operands.
+        if (!Op.isReg())
+          continue;
+
+        unsigned Reg = Op.getReg();
+        assert(Op.isUse() && "explicit_uses() should only iterate over uses");
+        assert(!Op.isImplicit() &&
+               "explicit_uses() should only iterate over explicit operands");
+        if (TargetRegisterInfo::isPhysicalRegister(Reg))
+          continue;
+
+        // Identify the definition for this register at this point.
+        MachineInstr *Def = GetVRegDef(Reg, Insert, MRI, LIS);
+        if (!Def)
+          continue;
+
+        // Don't nest an INLINE_ASM def into anything, because we don't have
+        // constraints for $pop outputs.
+        if (Def->getOpcode() == TargetOpcode::INLINEASM)
+          continue;
+
+        // Argument instructions represent live-in registers and not real
+        // instructions.
+        if (WebAssembly::isArgument(*Def))
+          continue;
+
+        // Decide which strategy to take. Prefer to move a single-use value
+        // over cloning it, and prefer cloning over introducing a tee.
+        // For moving, we require the def to be in the same block as the use;
+        // this makes things simpler (LiveIntervals' handleMove function only
+        // supports intra-block moves) and it's MachineSink's job to catch all
+        // the sinking opportunities anyway.
+        bool SameBlock = Def->getParent() == &MBB;
+        bool CanMove = SameBlock && IsSafeToMove(Def, Insert, AA, MRI) &&
+                       !TreeWalker.IsOnStack(Reg);
+        if (CanMove && HasOneUse(Reg, Def, MRI, MDT, LIS)) {
+          Insert = MoveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+        } else if (ShouldRematerialize(*Def, AA, TII)) {
+          Insert =
+              RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+                                    LIS, MFI, MRI, TII, TRI);
+        } else if (CanMove &&
+                   OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
+          Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+                                         MRI, TII);
+        } else {
+          // We failed to stackify the operand. If the problem was ordering
+          // constraints, Commuting may be able to help.
+          if (!CanMove && SameBlock)
+            Commuting.MaybeCommute(Insert, TreeWalker, TII);
+          // Proceed to the next operand.
+          continue;
+        }
+
+        // If the instruction we just stackified is an IMPLICIT_DEF, convert it
+        // to a constant 0 so that the def is explicit, and the push/pop
+        // correspondence is maintained.
+        if (Insert->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+          ConvertImplicitDefToConstZero(Insert, MRI, TII, MF);
+
+        // We stackified an operand. Add the defining instruction's operands to
+        // the worklist stack now to continue to build an ever deeper tree.
+        Commuting.Reset();
+        TreeWalker.PushOperands(Insert);
+      }
+
+      // If we stackified any operands, skip over the tree to start looking for
+      // the next instruction we can build a tree on.
+      if (Insert != &*MII) {
+        ImposeStackOrdering(&*MII);
+        MII = MachineBasicBlock::iterator(Insert).getReverse();
+        Changed = true;
+      }
+    }
+  }
+
+  // If we used VALUE_STACK anywhere, add it to the live-in sets everywhere so
+  // that it never looks like a use-before-def.
+  if (Changed) {
+    MF.getRegInfo().addLiveIn(WebAssembly::VALUE_STACK);
+    for (MachineBasicBlock &MBB : MF)
+      MBB.addLiveIn(WebAssembly::VALUE_STACK);
+  }
+
+#ifndef NDEBUG
+  // Verify that pushes and pops are performed in LIFO order.
+  SmallVector<unsigned, 0> Stack;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
+      for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+        if (!MO.isReg())
+          continue;
+        unsigned Reg = MO.getReg();
+
+        if (MFI.isVRegStackified(Reg)) {
+          if (MO.isDef())
+            Stack.push_back(Reg);
+          else
+            assert(Stack.pop_back_val() == Reg &&
+                   "Register stack pop should be paired with a push");
+        }
+      }
+    }
+    // TODO: Generalize this code to support keeping values on the stack across
+    // basic block boundaries.
+    assert(Stack.empty() &&
+           "Register stack pushes and pops should be balanced");
+  }
+#endif
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
new file mode 100644
index 000000000000..9367464c806e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -0,0 +1,148 @@
+//===-- WebAssemblyRegisterInfo.cpp - WebAssembly Register Information ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// TargetRegisterInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyRegisterInfo.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyFrameLowering.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "WebAssemblyGenRegisterInfo.inc"
+
+WebAssemblyRegisterInfo::WebAssemblyRegisterInfo(const Triple &TT)
+    : WebAssemblyGenRegisterInfo(0), TT(TT) {}
+
+const MCPhysReg *
+WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const {
+  static const MCPhysReg CalleeSavedRegs[] = {0};
+  return CalleeSavedRegs;
+}
+
+BitVector
+WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const {
+  BitVector Reserved(getNumRegs());
+  for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32,
+                   WebAssembly::FP64})
+    Reserved.set(Reg);
+  return Reserved;
+}
+
+void WebAssemblyRegisterInfo::eliminateFrameIndex(
+    MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum,
+    RegScavenger * /*RS*/) const {
+  assert(SPAdj == 0);
+  MachineInstr &MI = *II;
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex);
+
+  assert(MFI.getObjectSize(FrameIndex) != 0 &&
+         "We assume that variable-sized objects have already been lowered, "
+         "and don't use FrameIndex operands.");
+  unsigned FrameRegister = getFrameRegister(MF);
+
+  // If this is the address operand of a load or store, make it relative to SP
+  // and fold the frame offset directly in.
+  if ((MI.mayLoad() && FIOperandNum == WebAssembly::LoadAddressOperandNo) ||
+      (MI.mayStore() && FIOperandNum == WebAssembly::StoreAddressOperandNo)) {
+    assert(FrameOffset >= 0 && MI.getOperand(FIOperandNum - 1).getImm() >= 0);
+    int64_t Offset = MI.getOperand(FIOperandNum - 1).getImm() + FrameOffset;
+
+    if (static_cast<uint64_t>(Offset) <= std::numeric_limits<uint32_t>::max()) {
+      MI.getOperand(FIOperandNum - 1).setImm(Offset);
+      MI.getOperand(FIOperandNum)
+          .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+      return;
+    }
+  }
+
+  // If this is an address being added to a constant, fold the frame offset
+  // into the constant.
+  if (MI.getOpcode() == WebAssembly::ADD_I32) {
+    MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
+    if (OtherMO.isReg()) {
+      unsigned OtherMOReg = OtherMO.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(OtherMOReg)) {
+        MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
+        // TODO: For now we just opportunistically do this in the case where
+        // the CONST_I32 happens to have exactly one def and one use. We
+        // should generalize this to optimize in more cases.
+        if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+            MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
+          MachineOperand &ImmMO = Def->getOperand(1);
+          ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
+          MI.getOperand(FIOperandNum)
+              .ChangeToRegister(FrameRegister, /*IsDef=*/false);
+          return;
+        }
+      }
+    }
+  }
+
+  // Otherwise create an i32.add SP, offset and make it the operand.
+  const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+  unsigned FIRegOperand = FrameRegister;
+  if (FrameOffset) {
+    // Create i32.add SP, offset and make it the operand.
+    const TargetRegisterClass *PtrRC =
+        MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+    unsigned OffsetOp = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+            OffsetOp)
+        .addImm(FrameOffset);
+    FIRegOperand = MRI.createVirtualRegister(PtrRC);
+    BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+            FIRegOperand)
+        .addReg(FrameRegister)
+        .addReg(OffsetOp);
+  }
+  MI.getOperand(FIOperandNum).ChangeToRegister(FIRegOperand, /*IsDef=*/false);
+}
+
+unsigned
+WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  static const unsigned Regs[2][2] = {
+      /*            !isArch64Bit       isArch64Bit      */
+      /* !hasFP */ {WebAssembly::SP32, WebAssembly::SP64},
+      /*  hasFP */ {WebAssembly::FP32, WebAssembly::FP64}};
+  const WebAssemblyFrameLowering *TFI = getFrameLowering(MF);
+  return Regs[TFI->hasFP(MF)][TT.isArch64Bit()];
+}
+
+const TargetRegisterClass *
+WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                            unsigned Kind) const {
+  assert(Kind == 0 && "Only one kind of pointer on WebAssembly");
+  if (MF.getSubtarget<WebAssemblySubtarget>().hasAddr64())
+    return &WebAssembly::I64RegClass;
+  return &WebAssembly::I32RegClass;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
new file mode 100644
index 000000000000..ad1d71eebf22
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h
@@ -0,0 +1,52 @@
+// WebAssemblyRegisterInfo.h - WebAssembly Register Information Impl -*- C++ -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the WebAssembly implementation of the
+/// WebAssemblyRegisterInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYREGISTERINFO_H
+
+#define GET_REGINFO_HEADER
+#include "WebAssemblyGenRegisterInfo.inc"
+
+namespace llvm {
+
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
+class Triple;
+
+class WebAssemblyRegisterInfo final : public WebAssemblyGenRegisterInfo {
+  const Triple &TT;
+
+public:
+  explicit WebAssemblyRegisterInfo(const Triple &TT);
+
+  // Code Generation virtual methods.
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
new file mode 100644
index 000000000000..90888100be17
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -0,0 +1,62 @@
+//WebAssemblyRegisterInfo.td-Describe the WebAssembly Registers -*- tablegen -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file describes the WebAssembly register classes and some nominal
+/// physical registers.
+///
+//===----------------------------------------------------------------------===//
+
+class WebAssemblyReg<string n> : Register<n> {
+  let Namespace = "WebAssembly";
+}
+
+class WebAssemblyRegClass<list<ValueType> regTypes, int alignment, dag regList>
+     : RegisterClass<"WebAssembly", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+// Special registers used as the frame and stack pointer.
+//
+// WebAssembly may someday supports mixed 32-bit and 64-bit heaps in the same
+// application, which requires separate width FP and SP.
+def FP32 : WebAssemblyReg<"%FP32">;
+def FP64 : WebAssemblyReg<"%FP64">;
+def SP32 : WebAssemblyReg<"%SP32">;
+def SP64 : WebAssemblyReg<"%SP64">;
+
+// The register allocation framework requires register classes have at least
+// one register, so we define a few for the floating point register classes
+// since we otherwise don't need a physical register in those classes.
+def F32_0 : WebAssemblyReg<"%f32.0">;
+def F64_0 : WebAssemblyReg<"%f64.0">;
+
+def V128_0: WebAssemblyReg<"%v128">;
+
+// The value stack "register". This is an opaque entity which serves to order
+// uses and defs that must remain in LIFO order.
+def VALUE_STACK : WebAssemblyReg<"STACK">;
+
+// The incoming arguments "register". This is an opaque entity which serves to
+// order the ARGUMENT instructions that are emulating live-in registers and
+// must not be scheduled below other instructions.
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">;
+
+//===----------------------------------------------------------------------===//
+//  Register classes
+//===----------------------------------------------------------------------===//
+
+def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>;
+def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
+def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
+def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
+def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
new file mode 100644
index 000000000000..9e944df637d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -0,0 +1,99 @@
+//===-- WebAssemblyReplacePhysRegs.cpp - Replace phys regs with virt regs -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a pass that replaces physical registers with
+/// virtual registers.
+///
+/// LLVM expects certain physical registers, such as a stack pointer. However,
+/// WebAssembly doesn't actually have such physical registers. This pass is run
+/// once LLVM no longer needs these registers, and replaces them with virtual
+/// registers, so they can participate in register stackifying and coloring in
+/// the normal way.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-replace-phys-regs"
+
+namespace {
+class WebAssemblyReplacePhysRegs final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyReplacePhysRegs() : MachineFunctionPass(ID) {}
+
+private:
+  StringRef getPassName() const override {
+    return "WebAssembly Replace Physical Registers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyReplacePhysRegs::ID = 0;
+FunctionPass *llvm::createWebAssemblyReplacePhysRegs() {
+  return new WebAssemblyReplacePhysRegs();
+}
+
+bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Replace Physical Registers **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+  bool Changed = false;
+
+  assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
+         "LiveIntervals shouldn't be active yet!");
+  // We don't preserve SSA or liveness.
+  MRI.leaveSSA();
+  MRI.invalidateLiveness();
+
+  for (unsigned PReg = WebAssembly::NoRegister + 1;
+       PReg < WebAssembly::NUM_TARGET_REGS; ++PReg) {
+    // Skip fake registers that are never used explicitly.
+    if (PReg == WebAssembly::VALUE_STACK || PReg == WebAssembly::ARGUMENTS)
+      continue;
+
+    // Replace explicit uses of the physical register with a virtual register.
+    const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PReg);
+    unsigned VReg = WebAssembly::NoRegister;
+    for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E; ) {
+      MachineOperand &MO = *I++;
+      if (!MO.isImplicit()) {
+        if (VReg == WebAssembly::NoRegister)
+          VReg = MRI.createVirtualRegister(RC);
+        MO.setReg(VReg);
+        if (MO.getParent()->isDebugValue())
+          MO.setIsDebug();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
new file mode 100644
index 000000000000..fae9c6100510
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -0,0 +1,20 @@
+//===-- WebAssemblySelectionDAGInfo.cpp - WebAssembly SelectionDAG Info ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssemblySelectionDAGInfo class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-selectiondag-info"
+
+WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() {}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
new file mode 100644
index 000000000000..533c66b7a22f
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -0,0 +1,30 @@
+//=- WebAssemblySelectionDAGInfo.h - WebAssembly SelectionDAG Info -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly subclass for
+/// SelectionDAGTargetInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo {
+public:
+  ~WebAssemblySelectionDAGInfo() override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
new file mode 100644
index 000000000000..2441ead7cb27
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -0,0 +1,119 @@
+//=- WebAssemblySetP2AlignOperands.cpp - Set alignments on loads and stores -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file sets the p2align operands on load and store instructions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-set-p2align-operands"
+
+namespace {
+class WebAssemblySetP2AlignOperands final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblySetP2AlignOperands() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "WebAssembly Set p2align Operands";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblySetP2AlignOperands::ID = 0;
+FunctionPass *llvm::createWebAssemblySetP2AlignOperands() {
+  return new WebAssemblySetP2AlignOperands();
+}
+
+static void RewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
+  assert(MI.getOperand(OperandNo).getImm() == 0 &&
+         "ISel should set p2align operands to 0");
+  assert(MI.hasOneMemOperand() &&
+         "Load and store instructions have exactly one mem operand");
+  assert((*MI.memoperands_begin())->getSize() ==
+             (UINT64_C(1)
+              << WebAssembly::GetDefaultP2Align(MI.getOpcode())) &&
+         "Default p2align value should be natural");
+  assert(MI.getDesc().OpInfo[OperandNo].OperandType ==
+             WebAssembly::OPERAND_P2ALIGN &&
+         "Load and store instructions should have a p2align operand");
+  uint64_t P2Align = Log2_64((*MI.memoperands_begin())->getAlignment());
+
+  // WebAssembly does not currently support supernatural alignment.
+  P2Align = std::min(
+      P2Align, uint64_t(WebAssembly::GetDefaultP2Align(MI.getOpcode())));
+
+  MI.getOperand(OperandNo).setImm(P2Align);
+}
+
+bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Set p2align Operands **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  bool Changed = false;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      switch (MI.getOpcode()) {
+      case WebAssembly::LOAD_I32:
+      case WebAssembly::LOAD_I64:
+      case WebAssembly::LOAD_F32:
+      case WebAssembly::LOAD_F64:
+      case WebAssembly::LOAD8_S_I32:
+      case WebAssembly::LOAD8_U_I32:
+      case WebAssembly::LOAD16_S_I32:
+      case WebAssembly::LOAD16_U_I32:
+      case WebAssembly::LOAD8_S_I64:
+      case WebAssembly::LOAD8_U_I64:
+      case WebAssembly::LOAD16_S_I64:
+      case WebAssembly::LOAD16_U_I64:
+      case WebAssembly::LOAD32_S_I64:
+      case WebAssembly::LOAD32_U_I64:
+        RewriteP2Align(MI, WebAssembly::LoadP2AlignOperandNo);
+        break;
+      case WebAssembly::STORE_I32:
+      case WebAssembly::STORE_I64:
+      case WebAssembly::STORE_F32:
+      case WebAssembly::STORE_F64:
+      case WebAssembly::STORE8_I32:
+      case WebAssembly::STORE16_I32:
+      case WebAssembly::STORE8_I64:
+      case WebAssembly::STORE16_I64:
+      case WebAssembly::STORE32_I64:
+        RewriteP2Align(MI, WebAssembly::StoreP2AlignOperandNo);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
new file mode 100644
index 000000000000..34ec6f2d34a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -0,0 +1,202 @@
+//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements an optimization pass using store result values.
+///
+/// WebAssembly's store instructions return the stored value. This is to enable
+/// an optimization wherein uses of the stored value can be replaced by uses of
+/// the store's result value, making the stored value register more likely to
+/// be single-use, thus more likely to be useful to register stackifying, and
+/// potentially also exposing the store to register stackifying. These both can
+/// reduce get_local/set_local traffic.
+///
+/// This pass also performs this optimization for memcpy, memmove, and memset
+/// calls, since the LLVM intrinsics for these return void so they can't use the
+/// returned attribute and consequently aren't handled by the OptimizeReturned
+/// pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-store-results"
+
+namespace {
+class WebAssemblyStoreResults final : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyStoreResults() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "WebAssembly Store Results"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+};
+} // end anonymous namespace
+
+char WebAssemblyStoreResults::ID = 0;
+FunctionPass *llvm::createWebAssemblyStoreResults() {
+  return new WebAssemblyStoreResults();
+}
+
+// Replace uses of FromReg with ToReg if they are dominated by MI.
+static bool ReplaceDominatedUses(MachineBasicBlock &MBB, MachineInstr &MI,
+                                 unsigned FromReg, unsigned ToReg,
+                                 const MachineRegisterInfo &MRI,
+                                 MachineDominatorTree &MDT,
+                                 LiveIntervals &LIS) {
+  bool Changed = false;
+
+  LiveInterval *FromLI = &LIS.getInterval(FromReg);
+  LiveInterval *ToLI = &LIS.getInterval(ToReg);
+
+  SlotIndex FromIdx = LIS.getInstructionIndex(MI).getRegSlot();
+  VNInfo *FromVNI = FromLI->getVNInfoAt(FromIdx);
+
+  SmallVector<SlotIndex, 4> Indices;
+
+  for (auto I = MRI.use_nodbg_begin(FromReg), E = MRI.use_nodbg_end(); I != E;) {
+    MachineOperand &O = *I++;
+    MachineInstr *Where = O.getParent();
+
+    // Check that MI dominates the instruction in the normal way.
+    if (&MI == Where || !MDT.dominates(&MI, Where))
+      continue;
+
+    // If this use gets a different value, skip it.
+    SlotIndex WhereIdx = LIS.getInstructionIndex(*Where);
+    VNInfo *WhereVNI = FromLI->getVNInfoAt(WhereIdx);
+    if (WhereVNI && WhereVNI != FromVNI)
+      continue;
+
+    // Make sure ToReg isn't clobbered before it gets there.
+    VNInfo *ToVNI = ToLI->getVNInfoAt(WhereIdx);
+    if (ToVNI && ToVNI != FromVNI)
+      continue;
+
+    Changed = true;
+    DEBUG(dbgs() << "Setting operand " << O << " in " << *Where << " from "
+                 << MI << "\n");
+    O.setReg(ToReg);
+
+    // If the store's def was previously dead, it is no longer.
+    if (!O.isUndef()) {
+      MI.getOperand(0).setIsDead(false);
+
+      Indices.push_back(WhereIdx.getRegSlot());
+    }
+  }
+
+  if (Changed) {
+    // Extend ToReg's liveness.
+    LIS.extendToIndices(*ToLI, Indices);
+
+    // Shrink FromReg's liveness.
+    LIS.shrinkToUses(FromLI);
+
+    // If we replaced all dominated uses, FromReg is now killed at MI.
+    if (!FromLI->liveAt(FromIdx.getDeadSlot()))
+      MI.addRegisterKilled(FromReg,
+                           MBB.getParent()->getSubtarget<WebAssemblySubtarget>()
+                                 .getRegisterInfo());
+  }
+
+  return Changed;
+}
+
+static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
+                         const MachineRegisterInfo &MRI,
+                         MachineDominatorTree &MDT,
+                         LiveIntervals &LIS,
+                         const WebAssemblyTargetLowering &TLI,
+                         const TargetLibraryInfo &LibInfo) {
+  MachineOperand &Op1 = MI.getOperand(1);
+  if (!Op1.isSymbol())
+    return false;
+
+  StringRef Name(Op1.getSymbolName());
+  bool callReturnsInput = Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
+                          Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
+                          Name == TLI.getLibcallName(RTLIB::MEMSET);
+  if (!callReturnsInput)
+    return false;
+
+  LibFunc::Func Func;
+  if (!LibInfo.getLibFunc(Name, Func))
+    return false;
+
+  unsigned FromReg = MI.getOperand(2).getReg();
+  unsigned ToReg = MI.getOperand(0).getReg();
+  if (MRI.getRegClass(FromReg) != MRI.getRegClass(ToReg))
+    report_fatal_error("Store results: call to builtin function with wrong "
+                       "signature, from/to mismatch");
+  return ReplaceDominatedUses(MBB, MI, FromReg, ToReg, MRI, MDT, LIS);
+}
+
+bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+    dbgs() << "********** Store Results **********\n"
+           << "********** Function: " << MF.getName() << '\n';
+  });
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  const WebAssemblyTargetLowering &TLI =
+      *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
+  const auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  bool Changed = false;
+
+  // We don't preserve SSA form.
+  MRI.leaveSSA();
+
+  assert(MRI.tracksLiveness() && "StoreResults expects liveness tracking");
+
+  for (auto &MBB : MF) {
+    DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n');
+    for (auto &MI : MBB)
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case WebAssembly::CALL_I32:
+      case WebAssembly::CALL_I64:
+        Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
+        break;
+      }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
new file mode 100644
index 000000000000..ce39051b0555
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -0,0 +1,55 @@
+//===-- WebAssemblySubtarget.cpp - WebAssembly Subtarget Information ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements the WebAssembly-specific subclass of
+/// TargetSubtarget.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblySubtarget.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyInstrInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-subtarget"
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+WebAssemblySubtarget &
+WebAssemblySubtarget::initializeSubtargetDependencies(StringRef FS) {
+  // Determine default and user-specified characteristics
+
+  if (CPUString.empty())
+    CPUString = "generic";
+
+  ParseSubtargetFeatures(CPUString, FS);
+  return *this;
+}
+
+WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
+                                           const std::string &CPU,
+                                           const std::string &FS,
+                                           const TargetMachine &TM)
+    : WebAssemblyGenSubtargetInfo(TT, CPU, FS), HasSIMD128(false),
+      CPUString(CPU), TargetTriple(TT), FrameLowering(),
+      InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
+      TLInfo(TM, *this) {}
+
+bool WebAssemblySubtarget::enableMachineScheduler() const {
+  // Disable the MachineScheduler for now. Even with ShouldTrackPressure set and
+  // enableMachineSchedDefaultSched overridden, it appears to have an overall
+  // negative effect for the kinds of register optimizations we're doing.
+  return false;
+}
+
+bool WebAssemblySubtarget::useAA() const { return true; }
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
new file mode 100644
index 000000000000..f530a290fa0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -0,0 +1,85 @@
+//=- WebAssemblySubtarget.h - Define Subtarget for the WebAssembly -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetSubtarget.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSUBTARGET_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYSUBTARGET_H
+
+#include "WebAssemblyFrameLowering.h"
+#include "WebAssemblyISelLowering.h"
+#include "WebAssemblyInstrInfo.h"
+#include "WebAssemblySelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "WebAssemblyGenSubtargetInfo.inc"
+
+namespace llvm {
+
+class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
+  bool HasSIMD128;
+
+  /// String name of used CPU.
+  std::string CPUString;
+
+  /// What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  WebAssemblyFrameLowering FrameLowering;
+  WebAssemblyInstrInfo InstrInfo;
+  WebAssemblySelectionDAGInfo TSInfo;
+  WebAssemblyTargetLowering TLInfo;
+
+  /// Initializes using CPUString and the passed in feature string so that we
+  /// can use initializer lists for subtarget initialization.
+  WebAssemblySubtarget &initializeSubtargetDependencies(StringRef FS);
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  WebAssemblySubtarget(const Triple &TT, const std::string &CPU,
+                       const std::string &FS, const TargetMachine &TM);
+
+  const WebAssemblySelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const WebAssemblyFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const WebAssemblyTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const WebAssemblyInstrInfo *getInstrInfo() const override {
+    return &InstrInfo;
+  }
+  const WebAssemblyRegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  const Triple &getTargetTriple() const { return TargetTriple; }
+  bool enableMachineScheduler() const override;
+  bool useAA() const override;
+
+  // Predicates used by WebAssemblyInstrInfo.td.
+  bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
+  bool hasSIMD128() const { return HasSIMD128; }
+
+  /// Parses features string setting specified subtarget options. Definition of
+  /// function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
new file mode 100644
index 000000000000..b61bc0a08143
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -0,0 +1,277 @@
+//===- WebAssemblyTargetMachine.cpp - Define TargetMachine for WebAssembly -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific subclass of TargetMachine.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyTargetObjectFile.h"
+#include "WebAssemblyTargetTransformInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm"
+
+// Emscripten's asm.js-style exception handling
+static cl::opt<bool> EnableEmException(
+    "enable-emscripten-cxx-exceptions",
+    cl::desc("WebAssembly Emscripten-style exception handling"),
+    cl::init(false));
+
+// Emscripten's asm.js-style setjmp/longjmp handling
+static cl::opt<bool> EnableEmSjLj(
+    "enable-emscripten-sjlj",
+    cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"),
+    cl::init(false));
+
+extern "C" void LLVMInitializeWebAssemblyTarget() {
+  // Register the target.
+  RegisterTargetMachine<WebAssemblyTargetMachine> X(
+      getTheWebAssemblyTarget32());
+  RegisterTargetMachine<WebAssemblyTargetMachine> Y(
+      getTheWebAssemblyTarget64());
+
+  // Register exception handling pass to opt
+  initializeWebAssemblyLowerEmscriptenEHSjLjPass(
+      *PassRegistry::getPassRegistry());
+}
+
+//===----------------------------------------------------------------------===//
+// WebAssembly Lowering public interface.
+//===----------------------------------------------------------------------===//
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::PIC_;
+  return *RM;
+}
+
+/// Create an WebAssembly architecture model.
+///
+WebAssemblyTargetMachine::WebAssemblyTargetMachine(
+    const Target &T, const Triple &TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Optional<Reloc::Model> RM,
+    CodeModel::Model CM, CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T,
+                        TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128"
+                                         : "e-m:e-p:32:32-i64:64-n32:64-S128",
+                        TT, CPU, FS, Options, getEffectiveRelocModel(RM),
+                        CM, OL),
+      TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+  // WebAssembly type-checks instructions, but a noreturn function with a return
+  // type that doesn't match the context will cause a check failure. So we lower
+  // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
+  // 'unreachable' instructions which is meant for that case.
+  this->Options.TrapUnreachable = true;
+
+  initAsmInfo();
+
+  // Note that we don't use setRequiresStructuredCFG(true). It disables
+  // optimizations than we're ok with, and want, such as critical edge
+  // splitting and tail merging.
+}
+
+WebAssemblyTargetMachine::~WebAssemblyTargetMachine() {}
+
+const WebAssemblySubtarget *
+WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this);
+  }
+  return I.get();
+}
+
+namespace {
+/// WebAssembly Code Generator Pass Configuration Options.
+class WebAssemblyPassConfig final : public TargetPassConfig {
+public:
+  WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const {
+    return getTM<WebAssemblyTargetMachine>();
+  }
+
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addPostRegAlloc() override;
+  bool addGCPasses() override { return false; }
+  void addPreEmitPass() override;
+};
+} // end anonymous namespace
+
+TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(WebAssemblyTTIImpl(this, F));
+  });
+}
+
+TargetPassConfig *
+WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new WebAssemblyPassConfig(this, PM);
+}
+
+FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
+  return nullptr; // No reg alloc
+}
+
+//===----------------------------------------------------------------------===//
+// The following functions are called from lib/CodeGen/Passes.cpp to modify
+// the CodeGen pass sequence.
+//===----------------------------------------------------------------------===//
+
+void WebAssemblyPassConfig::addIRPasses() {
+  if (TM->Options.ThreadModel == ThreadModel::Single)
+    // In "single" mode, atomics get lowered to non-atomics.
+    addPass(createLowerAtomicPass());
+  else
+    // Expand some atomic operations. WebAssemblyTargetLowering has hooks which
+    // control specifically what gets lowered.
+    addPass(createAtomicExpandPass(TM));
+
+  // Optimize "returned" function attributes.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createWebAssemblyOptimizeReturned());
+
+  // If exception handling is not enabled and setjmp/longjmp handling is
+  // enabled, we lower invokes into calls and delete unreachable landingpad
+  // blocks. Lowering invokes when there is no EH support is done in
+  // TargetPassConfig::addPassesToHandleExceptions, but this runs after this
+  // function and SjLj handling expects all invokes to be lowered before.
+  if (!EnableEmException) {
+    addPass(createLowerInvokePass());
+    // The lower invoke pass may create unreachable code. Remove it in order not
+    // to process dead blocks in setjmp/longjmp handling.
+    addPass(createUnreachableBlockEliminationPass());
+  }
+
+  // Handle exceptions and setjmp/longjmp if enabled.
+  if (EnableEmException || EnableEmSjLj)
+    addPass(createWebAssemblyLowerEmscriptenEHSjLj(EnableEmException,
+                                                   EnableEmSjLj));
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool WebAssemblyPassConfig::addInstSelector() {
+  (void)TargetPassConfig::addInstSelector();
+  addPass(
+      createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel()));
+  // Run the argument-move pass immediately after the ScheduleDAG scheduler
+  // so that we can fix up the ARGUMENT instructions before anything else
+  // sees them in the wrong place.
+  addPass(createWebAssemblyArgumentMove());
+  // Set the p2align operands. This information is present during ISel, however
+  // it's inconvenient to collect. Collect it now, and update the immediate
+  // operands.
+  addPass(createWebAssemblySetP2AlignOperands());
+  return false;
+}
+
+void WebAssemblyPassConfig::addPostRegAlloc() {
+  // TODO: The following CodeGen passes don't currently support code containing
+  // virtual registers. Consider removing their restrictions and re-enabling
+  // them.
+
+  // Has no asserts of its own, but was not written to handle virtual regs.
+  disablePass(&ShrinkWrapID);
+
+  // These functions all require the NoVRegs property.
+  disablePass(&MachineCopyPropagationID);
+  disablePass(&PostRASchedulerID);
+  disablePass(&FuncletLayoutID);
+  disablePass(&StackMapLivenessID);
+  disablePass(&LiveDebugValuesID);
+  disablePass(&PatchableFunctionID);
+
+  TargetPassConfig::addPostRegAlloc();
+}
+
+void WebAssemblyPassConfig::addPreEmitPass() {
+  TargetPassConfig::addPreEmitPass();
+
+  // Now that we have a prologue and epilogue and all frame indices are
+  // rewritten, eliminate SP and FP. This allows them to be stackified,
+  // colored, and numbered with the rest of the registers.
+  addPass(createWebAssemblyReplacePhysRegs());
+
+  // Rewrite pseudo call_indirect instructions as real instructions.
+  // This needs to run before register stackification, because we change the
+  // order of the arguments.
+  addPass(createWebAssemblyCallIndirectFixup());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    // LiveIntervals isn't commonly run this late. Re-establish preconditions.
+    addPass(createWebAssemblyPrepareForLiveIntervals());
+
+    // Depend on LiveIntervals and perform some optimizations on it.
+    addPass(createWebAssemblyOptimizeLiveIntervals());
+
+    // Prepare store instructions for register stackifying.
+    addPass(createWebAssemblyStoreResults());
+
+    // Mark registers as representing wasm's value stack. This is a key
+    // code-compression technique in WebAssembly. We run this pass (and
+    // StoreResults above) very late, so that it sees as much code as possible,
+    // including code emitted by PEI and expanded by late tail duplication.
+    addPass(createWebAssemblyRegStackify());
+
+    // Run the register coloring pass to reduce the total number of registers.
+    // This runs after stackification so that it doesn't consider registers
+    // that become stackified.
+    addPass(createWebAssemblyRegColoring());
+  }
+
+  // Insert explicit get_local and set_local operators.
+  addPass(createWebAssemblyExplicitLocals());
+
+  // Eliminate multiple-entry loops.
+  addPass(createWebAssemblyFixIrreducibleControlFlow());
+
+  // Put the CFG in structured form; insert BLOCK and LOOP markers.
+  addPass(createWebAssemblyCFGStackify());
+
+  // Lower br_unless into br_if.
+  addPass(createWebAssemblyLowerBrUnless());
+
+  // Perform the very last peephole optimizations on the code.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createWebAssemblyPeephole());
+
+  // Create a mapping from LLVM CodeGen virtual registers to wasm registers.
+  addPass(createWebAssemblyRegNumbering());
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
new file mode 100644
index 000000000000..52a2ef78736a
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -0,0 +1,53 @@
+// WebAssemblyTargetMachine.h - Define TargetMachine for WebAssembly -*- C++ -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetMachine.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETMACHINE_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETMACHINE_H
+
+#include "WebAssemblySubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class WebAssemblyTargetMachine final : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<WebAssemblySubtarget>> SubtargetMap;
+
+public:
+  WebAssemblyTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                           StringRef FS, const TargetOptions &Options,
+                           Optional<Reloc::Model> RM, CodeModel::Model CM,
+                           CodeGenOpt::Level OL);
+
+  ~WebAssemblyTargetMachine() override;
+  const WebAssemblySubtarget *
+  getSubtargetImpl(const Function &F) const override;
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  /// \brief Get the TargetIRAnalysis for this target.
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  bool usesPhysRegsForPEI() const override { return false; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
new file mode 100644
index 000000000000..74e33b93e00d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -0,0 +1,24 @@
+//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the functions of the WebAssembly-specific subclass
+/// of TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetObjectFile.h"
+#include "WebAssemblyTargetMachine.h"
+using namespace llvm;
+
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
new file mode 100644
index 000000000000..39e50c9c575d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -0,0 +1,30 @@
+//===-- WebAssemblyTargetObjectFile.h - WebAssembly Object Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the WebAssembly-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
+public:
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
new file mode 100644
index 000000000000..bf546dab5fbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -0,0 +1,83 @@
+//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines the WebAssembly-specific TargetTransformInfo
+/// implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyTargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasmtti"
+
+TargetTransformInfo::PopcntSupportKind
+WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  return TargetTransformInfo::PSK_FastHardware;
+}
+
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
+  unsigned Result = BaseT::getNumberOfRegisters(Vector);
+
+  // For SIMD, use at least 16 registers, as a rough guess.
+  if (Vector)
+    Result = std::max(Result, 16u);
+
+  return Result;
+}
+
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector && getST()->hasSIMD128())
+    return 128;
+
+  return 64;
+}
+
+unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+
+  unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
+      Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    switch (Opcode) {
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::Shl:
+      // SIMD128's shifts currently only accept a scalar shift count. For each
+      // element, we'll need to extract, op, insert. The following is a rough
+      // approxmation.
+      if (Opd2Info != TTI::OK_UniformValue &&
+          Opd2Info != TTI::OK_UniformConstantValue)
+        Cost = VTy->getNumElements() *
+               (TargetTransformInfo::TCC_Basic +
+                getArithmeticInstrCost(Opcode, VTy->getElementType()) +
+                TargetTransformInfo::TCC_Basic);
+      break;
+    }
+  }
+  return Cost;
+}
+
+unsigned WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                unsigned Index) {
+  unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index);
+
+  // SIMD128's insert/extract currently only take constant indices.
+  if (Index == -1u)
+    return Cost + 25 * TargetTransformInfo::TCC_Expensive;
+
+  return Cost;
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
new file mode 100644
index 000000000000..2a2e3941f82d
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -0,0 +1,72 @@
+//==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file a TargetTransformInfo::Concept conforming object specific
+/// to the WebAssembly target machine.
+///
+/// It uses the target's detailed information to provide more precise answers to
+/// certain TTI queries, while letting the target independent and default TTI
+/// implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
+
+#include "WebAssemblyTargetMachine.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include <algorithm>
+
+namespace llvm {
+
+class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
+  typedef BasicTTIImplBase<WebAssemblyTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const WebAssemblySubtarget *ST;
+  const WebAssemblyTargetLowering *TLI;
+
+  const WebAssemblySubtarget *getST() const { return ST; }
+  const WebAssemblyTargetLowering *getTLI() const { return TLI; }
+
+public:
+  WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  // TODO: Implement more Scalar TTI for WebAssembly
+
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
new file mode 100644
index 000000000000..a0049c147d2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -0,0 +1,71 @@
+//===-- WebAssemblyUtilities.cpp - WebAssembly Utility Functions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements several utility functions for WebAssembly.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyUtilities.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+using namespace llvm;
+
+bool WebAssembly::isArgument(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case WebAssembly::ARGUMENT_I32:
+  case WebAssembly::ARGUMENT_I64:
+  case WebAssembly::ARGUMENT_F32:
+  case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool WebAssembly::isCopy(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case WebAssembly::COPY_I32:
+  case WebAssembly::COPY_I64:
+  case WebAssembly::COPY_F32:
+  case WebAssembly::COPY_F64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool WebAssembly::isTee(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case WebAssembly::TEE_I32:
+  case WebAssembly::TEE_I64:
+  case WebAssembly::TEE_F32:
+  case WebAssembly::TEE_F64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Test whether MI is a child of some other node in an expression tree.
+bool WebAssembly::isChild(const MachineInstr &MI,
+                          const WebAssemblyFunctionInfo &MFI) {
+  if (MI.getNumOperands() == 0)
+    return false;
+  const MachineOperand &MO = MI.getOperand(0);
+  if (!MO.isReg() || MO.isImplicit() || !MO.isDef())
+    return false;
+  unsigned Reg = MO.getReg();
+  return TargetRegisterInfo::isVirtualRegister(Reg) &&
+         MFI.isVRegStackified(Reg);
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
new file mode 100644
index 000000000000..eb114403d14e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -0,0 +1,34 @@
+//===-- WebAssemblyUtilities - WebAssembly Utility Functions ---*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the WebAssembly-specific
+/// utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYUTILITIES_H
+
+namespace llvm {
+
+class MachineInstr;
+class WebAssemblyFunctionInfo;
+
+namespace WebAssembly {
+
+bool isArgument(const MachineInstr &MI);
+bool isCopy(const MachineInstr &MI);
+bool isTee(const MachineInstr &MI);
+bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
new file mode 100644
index 000000000000..8dd5e8a03e2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -0,0 +1,68 @@
+# Tests which are known to fail from the GCC torture test suite.
+
+# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
+20040302-1.c
+20071210-1.c
+920501-4.c
+920501-5.c
+comp-goto-1.c
+980526-1.c
+990208-1.c
+
+# WebAssembly hasn't implemented (will never?) __builtin_return_address
+20010122-1.c
+20030323-1.c
+20030811-1.c
+pr17377.c
+
+# Error: invalid output constraint '=t' in asm.
+990413-2.c
+
+# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target.
+built-in-setjmp.c
+pr60003.c
+
+# Error in the program / unsupported by Clang.
+scal-to-vec1.c
+scal-to-vec2.c
+scal-to-vec3.c
+20000822-1.c
+20010209-1.c
+20010605-1.c
+20030501-1.c
+20040520-1.c
+20061220-1.c
+20090219-1.c
+920415-1.c
+920428-2.c
+920501-7.c
+920612-2.c
+920721-4.c
+921017-1.c
+921215-1.c
+931002-1.c
+comp-goto-2.c
+nest-align-1.c
+nest-stdar-1.c
+nestfunc-1.c
+nestfunc-2.c
+nestfunc-3.c
+nestfunc-5.c
+nestfunc-6.c
+nestfunc-7.c
+pr22061-3.c
+pr22061-4.c
+pr24135.c
+pr51447.c
+20020412-1.c
+20040308-1.c
+20040423-1.c
+20041218-2.c
+20070919-1.c
+align-nest.c
+pr41935.c
+920302-1.c
+920501-3.c
+920728-1.c
+pr28865.c
+widechar-2.c
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
new file mode 100644
index 000000000000..c38a7d1dd44d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -0,0 +1,1077 @@
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmInstrumentation.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86Operand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// Following comment describes how assembly instrumentation works.
+// Currently we have only AddressSanitizer instrumentation, but we're
+// planning to implement MemorySanitizer for inline assembly too. If
+// you're not familiar with AddressSanitizer algorithm, please, read
+// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+//
+// When inline assembly is parsed by an instance of X86AsmParser, all
+// instructions are emitted via EmitInstruction method. That's the
+// place where X86AsmInstrumentation analyzes an instruction and
+// decides, whether the instruction should be emitted as is or
+// instrumentation is required. The latter case happens when an
+// instruction reads from or writes to memory. Now instruction opcode
+// is explicitly checked, and if an instruction has a memory operand
+// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be
+// instrumented.  There're also exist instructions that modify
+// memory but don't have an explicit memory operands, for instance,
+// movs.
+//
+// Let's consider at first 8-byte memory accesses when an instruction
+// has an explicit memory operand. In this case we need two registers -
+// AddressReg to compute address of a memory cells which are accessed
+// and ShadowReg to compute corresponding shadow address. So, we need
+// to spill both registers before instrumentation code and restore them
+// after instrumentation. Thus, in general, instrumentation code will
+// look like this:
+// PUSHF  # Store flags, otherwise they will be overwritten
+// PUSH AddressReg  # spill AddressReg
+// PUSH ShadowReg   # spill ShadowReg
+// LEA MemOp, AddressReg  # compute address of the memory operand
+// MOV AddressReg, ShadowReg
+// SHR ShadowReg, 3
+// # ShadowOffset(AddressReg >> 3) contains address of a shadow
+// # corresponding to MemOp.
+// CMP ShadowOffset(ShadowReg), 0  # test shadow value
+// JZ .Done  # when shadow equals to zero, everything is fine
+// MOV AddressReg, RDI
+// # Call __asan_report function with AddressReg as an argument
+// CALL __asan_report
+// .Done:
+// POP ShadowReg  # Restore ShadowReg
+// POP AddressReg  # Restore AddressReg
+// POPF  # Restore flags
+//
+// Memory accesses with different size (1-, 2-, 4- and 16-byte) are
+// handled in a similar manner, but small memory accesses (less than 8
+// byte) require an additional ScratchReg, which is used for shadow value.
+//
+// If, suppose, we're instrumenting an instruction like movs, only
+// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize *
+// RCX are checked.  In this case there're no need to spill and restore
+// AddressReg , ShadowReg or flags four times, they're saved on stack
+// just once, before instrumentation of these four addresses, and restored
+// at the end of the instrumentation.
+//
+// There exist several things which complicate this simple algorithm.
+// * Instrumented memory operand can have RSP as a base or an index
+//   register.  So we need to add a constant offset before computation
+//   of memory address, since flags, AddressReg, ShadowReg, etc. were
+//   already stored on stack and RSP was modified.
+// * Debug info (usually, DWARF) should be adjusted, because sometimes
+//   RSP is used as a frame register. So, we need to select some
+//   register as a frame register and temprorary override current CFA
+//   register.
+
+namespace llvm {
+namespace {
+
+static cl::opt<bool> ClAsanInstrumentAssembly(
+    "asan-instrument-assembly",
+    cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+    cl::init(false));
+
+const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
+const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+
+int64_t ApplyDisplacementBounds(int64_t Displacement) {
+  return std::max(std::min(MaxAllowedDisplacement, Displacement),
+                  MinAllowedDisplacement);
+}
+
+void CheckDisplacementBounds(int64_t Displacement) {
+  assert(Displacement >= MinAllowedDisplacement &&
+         Displacement <= MaxAllowedDisplacement);
+}
+
+bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+
+bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
+class X86AddressSanitizer : public X86AsmInstrumentation {
+public:
+  struct RegisterContext {
+  private:
+    enum RegOffset {
+      REG_OFFSET_ADDRESS = 0,
+      REG_OFFSET_SHADOW,
+      REG_OFFSET_SCRATCH
+    };
+
+  public:
+    RegisterContext(unsigned AddressReg, unsigned ShadowReg,
+                    unsigned ScratchReg) {
+      BusyRegs.push_back(convReg(AddressReg, 64));
+      BusyRegs.push_back(convReg(ShadowReg, 64));
+      BusyRegs.push_back(convReg(ScratchReg, 64));
+    }
+
+    unsigned AddressReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size);
+    }
+
+    unsigned ShadowReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_SHADOW], Size);
+    }
+
+    unsigned ScratchReg(unsigned Size) const {
+      return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size);
+    }
+
+    void AddBusyReg(unsigned Reg) {
+      if (Reg != X86::NoRegister)
+        BusyRegs.push_back(convReg(Reg, 64));
+    }
+
+    void AddBusyRegs(const X86Operand &Op) {
+      AddBusyReg(Op.getMemBaseReg());
+      AddBusyReg(Op.getMemIndexReg());
+    }
+
+    unsigned ChooseFrameReg(unsigned Size) const {
+      static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
+                                              X86::RCX, X86::RDX, X86::RDI,
+                                              X86::RSI };
+      for (unsigned Reg : Candidates) {
+        if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
+          return convReg(Reg, Size);
+      }
+      return X86::NoRegister;
+    }
+
+  private:
+    unsigned convReg(unsigned Reg, unsigned Size) const {
+      return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size);
+    }
+
+    std::vector<unsigned> BusyRegs;
+  };
+
+  X86AddressSanitizer(const MCSubtargetInfo *&STI)
+      : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
+
+  ~X86AddressSanitizer() override {}
+
+  // X86AsmInstrumentation implementation:
+  void InstrumentAndEmitInstruction(const MCInst &Inst,
+                                    OperandVector &Operands,
+                                    MCContext &Ctx,
+                                    const MCInstrInfo &MII,
+                                    MCStreamer &Out) override {
+    InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
+    if (RepPrefix)
+      EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
+
+    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
+
+    RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
+    if (!RepPrefix)
+      EmitInstruction(Out, Inst);
+  }
+
+  // Adjusts up stack and saves all registers used in instrumentation.
+  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
+
+  // Restores all registers used in instrumentation and adjusts stack.
+  virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
+
+  virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+  virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+
+  virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                                  MCStreamer &Out) = 0;
+
+  void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
+                            const RegisterContext &RegCtx, MCContext &Ctx,
+                            MCStreamer &Out);
+  void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
+                          unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
+
+  void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
+                      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+  void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
+                     MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+  void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
+  void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) {
+    assert(Size == 32 || Size == 64);
+    MCInst Inst;
+    Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r);
+    Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size)));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  void ComputeMemOperandAddress(X86Operand &Op, unsigned Size,
+                                unsigned Reg, MCContext &Ctx, MCStreamer &Out);
+
+  // Creates new memory operand with Displacement added to an original
+  // displacement. Residue will contain a residue which could happen when the
+  // total displacement exceeds 32-bit limitation.
+  std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
+                                              int64_t Displacement,
+                                              MCContext &Ctx, int64_t *Residue);
+
+  bool is64BitMode() const {
+    return STI->getFeatureBits()[X86::Mode64Bit];
+  }
+  bool is32BitMode() const {
+    return STI->getFeatureBits()[X86::Mode32Bit];
+  }
+  bool is16BitMode() const {
+    return STI->getFeatureBits()[X86::Mode16Bit];
+  }
+
+  unsigned getPointerWidth() {
+    if (is16BitMode()) return 16;
+    if (is32BitMode()) return 32;
+    if (is64BitMode()) return 64;
+    llvm_unreachable("invalid mode");
+  }
+
+  // True when previous instruction was actually REP prefix.
+  bool RepPrefix;
+
+  // Offset from the original SP register.
+  int64_t OrigSPOffset;
+};
+
+void X86AddressSanitizer::InstrumentMemOperand(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  assert(Op.isMem() && "Op should be a memory operand.");
+  assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
+         "AccessSize should be a power of two, less or equal than 16.");
+  // FIXME: take into account load/store alignment.
+  if (IsSmallMemAccess(AccessSize))
+    InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+  else
+    InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
+                                             unsigned CntReg,
+                                             unsigned AccessSize,
+                                             MCContext &Ctx, MCStreamer &Out) {
+  // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
+  // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
+  RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
+                         IsSmallMemAccess(AccessSize)
+                             ? X86::RBX
+                             : X86::NoRegister /* ScratchReg */);
+  RegCtx.AddBusyReg(DstReg);
+  RegCtx.AddBusyReg(SrcReg);
+  RegCtx.AddBusyReg(CntReg);
+
+  InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+
+  // Test (%SrcReg)
+  {
+    const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+                         Out);
+  }
+
+  // Test -1(%SrcReg, %CntReg, AccessSize)
+  {
+    const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
+        SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+                         Out);
+  }
+
+  // Test (%DstReg)
+  {
+    const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+  }
+
+  // Test -1(%DstReg, %CntReg, AccessSize)
+  {
+    const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
+        SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+  }
+
+  InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
+                                         OperandVector &Operands,
+                                         MCContext &Ctx, const MCInstrInfo &MII,
+                                         MCStreamer &Out) {
+  // Access size in bytes.
+  unsigned AccessSize = 0;
+
+  switch (Inst.getOpcode()) {
+  case X86::MOVSB:
+    AccessSize = 1;
+    break;
+  case X86::MOVSW:
+    AccessSize = 2;
+    break;
+  case X86::MOVSL:
+    AccessSize = 4;
+    break;
+  case X86::MOVSQ:
+    AccessSize = 8;
+    break;
+  default:
+    return;
+  }
+
+  InstrumentMOVSImpl(AccessSize, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
+                                        OperandVector &Operands, MCContext &Ctx,
+                                        const MCInstrInfo &MII,
+                                        MCStreamer &Out) {
+  // Access size in bytes.
+  unsigned AccessSize = 0;
+
+  switch (Inst.getOpcode()) {
+  case X86::MOV8mi:
+  case X86::MOV8mr:
+  case X86::MOV8rm:
+    AccessSize = 1;
+    break;
+  case X86::MOV16mi:
+  case X86::MOV16mr:
+  case X86::MOV16rm:
+    AccessSize = 2;
+    break;
+  case X86::MOV32mi:
+  case X86::MOV32mr:
+  case X86::MOV32rm:
+    AccessSize = 4;
+    break;
+  case X86::MOV64mi32:
+  case X86::MOV64mr:
+  case X86::MOV64rm:
+    AccessSize = 8;
+    break;
+  case X86::MOVAPDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDrm:
+  case X86::MOVAPSrm:
+    AccessSize = 16;
+    break;
+  default:
+    return;
+  }
+
+  const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+
+  for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+    assert(Operands[Ix]);
+    MCParsedAsmOperand &Op = *Operands[Ix];
+    if (Op.isMem()) {
+      X86Operand &MemOp = static_cast<X86Operand &>(Op);
+      RegisterContext RegCtx(
+          X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */,
+          IsSmallMemAccess(AccessSize) ? X86::RCX
+                                       : X86::NoRegister /* ScratchReg */);
+      RegCtx.AddBusyRegs(MemOp);
+      InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+      InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
+      InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+    }
+  }
+}
+
+void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
+                                                   unsigned Size,
+                                                   unsigned Reg, MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  int64_t Displacement = 0;
+  if (IsStackReg(Op.getMemBaseReg()))
+    Displacement -= OrigSPOffset;
+  if (IsStackReg(Op.getMemIndexReg()))
+    Displacement -= OrigSPOffset * Op.getMemScale();
+
+  assert(Displacement >= 0);
+
+  // Emit Op as is.
+  if (Displacement == 0) {
+    EmitLEA(Op, Size, Reg, Out);
+    return;
+  }
+
+  int64_t Residue;
+  std::unique_ptr<X86Operand> NewOp =
+      AddDisplacement(Op, Displacement, Ctx, &Residue);
+  EmitLEA(*NewOp, Size, Reg, Out);
+
+  while (Residue != 0) {
+    const MCConstantExpr *Disp =
+        MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx);
+    std::unique_ptr<X86Operand> DispOp =
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
+                              SMLoc());
+    EmitLEA(*DispOp, Size, Reg, Out);
+    Residue -= Disp->getValue();
+  }
+}
+
+std::unique_ptr<X86Operand>
+X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
+                                     MCContext &Ctx, int64_t *Residue) {
+  assert(Displacement >= 0);
+
+  if (Displacement == 0 ||
+      (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
+    *Residue = Displacement;
+    return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(),
+                                 Op.getMemDisp(), Op.getMemBaseReg(),
+                                 Op.getMemIndexReg(), Op.getMemScale(),
+                                 SMLoc(), SMLoc());
+  }
+
+  int64_t OrigDisplacement =
+      static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue();
+  CheckDisplacementBounds(OrigDisplacement);
+  Displacement += OrigDisplacement;
+
+  int64_t NewDisplacement = ApplyDisplacementBounds(Displacement);
+  CheckDisplacementBounds(NewDisplacement);
+
+  *Residue = Displacement - NewDisplacement;
+  const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
+  return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
+                               Op.getMemBaseReg(), Op.getMemIndexReg(),
+                               Op.getMemScale(), SMLoc(), SMLoc());
+}
+
+class X86AddressSanitizer32 : public X86AddressSanitizer {
+public:
+  static const long kShadowOffset = 0x20000000;
+
+  X86AddressSanitizer32(const MCSubtargetInfo *&STI)
+      : X86AddressSanitizer(STI) {}
+
+  ~X86AddressSanitizer32() override {}
+
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, 32);
+  }
+
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
+    OrigSPOffset -= 4;
+  }
+
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
+    OrigSPOffset += 4;
+  }
+
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+    OrigSPOffset -= 4;
+  }
+
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+    OrigSPOffset += 4;
+  }
+
+  void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      SpillReg(Out, LocalFrameReg);
+      if (FrameReg == X86::ESP) {
+        Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    SpillReg(Out, RegCtx.AddressReg(32));
+    SpillReg(Out, RegCtx.ShadowReg(32));
+    if (RegCtx.ScratchReg(32) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(32));
+    StoreFlags(Out);
+  }
+
+  void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(32) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(32));
+    RestoreReg(Out, RegCtx.ShadowReg(32));
+    RestoreReg(Out, RegCtx.AddressReg(32));
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::ESP)
+        Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                          MCStreamer &Out) override;
+
+private:
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+                             .addReg(X86::ESP)
+                             .addReg(X86::ESP)
+                             .addImm(-16));
+    EmitInstruction(
+        Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
+
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+                                            (IsWrite ? "store" : "load") +
+                                            llvm::Twine(AccessSize));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
+  }
+};
+
+void X86AddressSanitizer32::InstrumentMemOperandSmall(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI32 = RegCtx.AddressReg(32);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+  unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+  assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+  unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+  ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+                           .addReg(ShadowRegI32)
+                           .addReg(ShadowRegI32)
+                           .addImm(3));
+
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::MOV8rm);
+    Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+                           .addReg(ScratchRegI32)
+                           .addReg(ScratchRegI32)
+                           .addImm(7));
+
+  switch (AccessSize) {
+  default: llvm_unreachable("Incorrect access size");
+  case 1:
+    break;
+  case 2: {
+    const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
+    EmitLEA(*Op, 32, ScratchRegI32, Out);
+    break;
+  }
+  case 4:
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+                             .addReg(ScratchRegI32)
+                             .addReg(ScratchRegI32)
+                             .addImm(3));
+    break;
+  }
+
+  EmitInstruction(
+      Out,
+      MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+  EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+                           ShadowRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer32::InstrumentMemOperandLarge(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI32 = RegCtx.AddressReg(32);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+
+  ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+                           .addReg(ShadowRegI32)
+                           .addReg(ShadowRegI32)
+                           .addImm(3));
+  {
+    MCInst Inst;
+    switch (AccessSize) {
+    default: llvm_unreachable("Incorrect access size");
+    case 8:
+      Inst.setOpcode(X86::CMP8mi);
+      break;
+    case 16:
+      Inst.setOpcode(X86::CMP16mi);
+      break;
+    }
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    Inst.addOperand(MCOperand::createImm(0));
+    EmitInstruction(Out, Inst);
+  }
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  StoreFlags(Out);
+
+  // No need to test when ECX is equals to zero.
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
+                     X86::ECX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, DoneSym);
+  RestoreFlags(Out);
+}
+
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+  static const long kShadowOffset = 0x7fff8000;
+
+  X86AddressSanitizer64(const MCSubtargetInfo *&STI)
+      : X86AddressSanitizer(STI) {}
+
+  ~X86AddressSanitizer64() override {}
+
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, 64);
+  }
+
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
+    OrigSPOffset -= 8;
+  }
+
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
+    OrigSPOffset += 8;
+  }
+
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+    OrigSPOffset -= 8;
+  }
+
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+    OrigSPOffset += 8;
+  }
+
+  void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      SpillReg(Out, X86::RBP);
+      if (FrameReg == X86::RSP) {
+        Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    EmitAdjustRSP(Ctx, Out, -128);
+    SpillReg(Out, RegCtx.ShadowReg(64));
+    SpillReg(Out, RegCtx.AddressReg(64));
+    if (RegCtx.ScratchReg(64) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(64));
+    StoreFlags(Out);
+  }
+
+  void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                    MCContext &Ctx,
+                                    MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(64) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(64));
+    RestoreReg(Out, RegCtx.AddressReg(64));
+    RestoreReg(Out, RegCtx.ShadowReg(64));
+    EmitAdjustRSP(Ctx, Out, 128);
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::RSP)
+        Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                 bool IsWrite,
+                                 const RegisterContext &RegCtx,
+                                 MCContext &Ctx,
+                                 MCStreamer &Out) override;
+  void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                          MCStreamer &Out) override;
+
+private:
+  void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
+    const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
+                              SMLoc(), SMLoc()));
+    EmitLEA(*Op, 64, X86::RSP, Out);
+    OrigSPOffset += Offset;
+  }
+
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+                             .addReg(X86::RSP)
+                             .addReg(X86::RSP)
+                             .addImm(-16));
+
+    if (RegCtx.AddressReg(64) != X86::RDI) {
+      EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
+                               RegCtx.AddressReg(64)));
+    }
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+                                            (IsWrite ? "store" : "load") +
+                                            llvm::Twine(AccessSize));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+  }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmall(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI64 = RegCtx.AddressReg(64);
+  unsigned AddressRegI32 = RegCtx.AddressReg(32);
+  unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(32);
+  unsigned ShadowRegI8 = RegCtx.ShadowReg(8);
+
+  assert(RegCtx.ScratchReg(32) != X86::NoRegister);
+  unsigned ScratchRegI32 = RegCtx.ScratchReg(32);
+
+  ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+                           AddressRegI64));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+                           .addReg(ShadowRegI64)
+                           .addReg(ShadowRegI64)
+                           .addImm(3));
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::MOV8rm);
+    Inst.addOperand(MCOperand::createReg(ShadowRegI8));
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+                              SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+                           .addReg(ScratchRegI32)
+                           .addReg(ScratchRegI32)
+                           .addImm(7));
+
+  switch (AccessSize) {
+  default: llvm_unreachable("Incorrect access size");
+  case 1:
+    break;
+  case 2: {
+    const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
+                              SMLoc(), SMLoc()));
+    EmitLEA(*Op, 32, ScratchRegI32, Out);
+    break;
+  }
+  case 4:
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+                             .addReg(ScratchRegI32)
+                             .addReg(ScratchRegI32)
+                             .addImm(3));
+    break;
+  }
+
+  EmitInstruction(
+      Out,
+      MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+  EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+                           ShadowRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLarge(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI64 = RegCtx.AddressReg(64);
+  unsigned ShadowRegI64 = RegCtx.ShadowReg(64);
+
+  ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+                           AddressRegI64));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+                           .addReg(ShadowRegI64)
+                           .addReg(ShadowRegI64)
+                           .addImm(3));
+  {
+    MCInst Inst;
+    switch (AccessSize) {
+    default: llvm_unreachable("Incorrect access size");
+    case 8:
+      Inst.setOpcode(X86::CMP8mi);
+      break;
+    case 16:
+      Inst.setOpcode(X86::CMP16mi);
+      break;
+    }
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
+                              SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    Inst.addOperand(MCOperand::createImm(0));
+    EmitInstruction(Out, Inst);
+  }
+
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  StoreFlags(Out);
+
+  // No need to test when RCX is equals to zero.
+  MCSymbol *DoneSym = Ctx.createTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
+                     X86::RCX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, DoneSym);
+  RestoreFlags(Out);
+}
+
+} // End anonymous namespace
+
+X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
+    : STI(STI), InitialFrameReg(0) {}
+
+X86AsmInstrumentation::~X86AsmInstrumentation() {}
+
+void X86AsmInstrumentation::InstrumentAndEmitInstruction(
+    const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
+    const MCInstrInfo &MII, MCStreamer &Out) {
+  EmitInstruction(Out, Inst);
+}
+
+void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
+                                            const MCInst &Inst) {
+  Out.EmitInstruction(Inst, *STI);
+}
+
+unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  if (!Out.getNumFrameInfos()) // No active dwarf frame
+    return X86::NoRegister;
+  const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back();
+  if (Frame.End) // Active dwarf frame is closed
+    return X86::NoRegister;
+  const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+  if (!MRI) // No register info
+    return X86::NoRegister;
+
+  if (InitialFrameReg) {
+    // FrameReg is set explicitly, we're instrumenting a MachineFunction.
+    return InitialFrameReg;
+  }
+
+  return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */);
+}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+  Triple T(STI->getTargetTriple());
+  const bool hasCompilerRTSupport = T.isOSLinux();
+  if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+      MCOptions.SanitizeAddress) {
+    if (STI->getFeatureBits()[X86::Mode32Bit] != 0)
+      return new X86AddressSanitizer32(STI);
+    if (STI->getFeatureBits()[X86::Mode64Bit] != 0)
+      return new X86AddressSanitizer64(STI);
+  }
+  return new X86AsmInstrumentation(STI);
+}
+
+} // end llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
new file mode 100644
index 000000000000..470ceadb0aa6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -0,0 +1,68 @@
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <memory>
+
+namespace llvm {
+
+class MCContext;
+class MCInst;
+class MCInstrInfo;
+class MCParsedAsmOperand;
+class MCStreamer;
+class MCSubtargetInfo;
+class MCTargetOptions;
+
+class X86AsmInstrumentation;
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx,
+                            const MCSubtargetInfo *&STI);
+
+class X86AsmInstrumentation {
+public:
+  virtual ~X86AsmInstrumentation();
+
+  // Sets frame register corresponding to a current frame.
+  void SetInitialFrameRegister(unsigned RegNo) {
+    InitialFrameReg = RegNo;
+  }
+
+  // Tries to instrument and emit instruction.
+  virtual void InstrumentAndEmitInstruction(
+      const MCInst &Inst,
+      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+
+protected:
+  friend X86AsmInstrumentation *
+  CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                              const MCContext &Ctx,
+                              const MCSubtargetInfo *&STI);
+
+  X86AsmInstrumentation(const MCSubtargetInfo *&STI);
+
+  unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
+
+  void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
+
+  const MCSubtargetInfo *&STI;
+
+  unsigned InitialFrameReg;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 000000000000..e692118f47fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,3184 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86AsmInstrumentation.h"
+#include "X86AsmParserCommon.h"
+#include "X86Operand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+static const char OpPrecedence[] = {
+  0, // IC_OR
+  1, // IC_XOR
+  2, // IC_AND
+  3, // IC_LSHIFT
+  3, // IC_RSHIFT
+  4, // IC_PLUS
+  4, // IC_MINUS
+  5, // IC_MULTIPLY
+  5, // IC_DIVIDE
+  6, // IC_RPAREN
+  7, // IC_LPAREN
+  0, // IC_IMM
+  0  // IC_REGISTER
+};
+
+class X86AsmParser : public MCTargetAsmParser {
+  const MCInstrInfo &MII;
+  ParseInstructionInfo *InstInfo;
+  std::unique_ptr<X86AsmInstrumentation> Instrumentation;
+  bool Code16GCC;
+
+private:
+  SMLoc consumeToken() {
+    MCAsmParser &Parser = getParser();
+    SMLoc Result = Parser.getTok().getLoc();
+    Parser.Lex();
+    return Result;
+  }
+
+  unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
+                            uint64_t &ErrorInfo, bool matchingInlineAsm,
+                            unsigned VariantID = 0) {
+    // In Code16GCC mode, match as 32-bit.
+    if (Code16GCC)
+      SwitchMode(X86::Mode32Bit);
+    unsigned rv = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                       matchingInlineAsm, VariantID);
+    if (Code16GCC)
+      SwitchMode(X86::Mode16Bit);
+    return rv;
+  }
+
+  enum InfixCalculatorTok {
+    IC_OR = 0,
+    IC_XOR,
+    IC_AND,
+    IC_LSHIFT,
+    IC_RSHIFT,
+    IC_PLUS,
+    IC_MINUS,
+    IC_MULTIPLY,
+    IC_DIVIDE,
+    IC_RPAREN,
+    IC_LPAREN,
+    IC_IMM,
+    IC_REGISTER
+  };
+
+  class InfixCalculator {
+    typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
+    SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
+    SmallVector<ICToken, 4> PostfixStack;
+
+  public:
+    int64_t popOperand() {
+      assert (!PostfixStack.empty() && "Poped an empty stack!");
+      ICToken Op = PostfixStack.pop_back_val();
+      assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
+              && "Expected and immediate or register!");
+      return Op.second;
+    }
+    void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
+      assert ((Op == IC_IMM || Op == IC_REGISTER) &&
+              "Unexpected operand!");
+      PostfixStack.push_back(std::make_pair(Op, Val));
+    }
+
+    void popOperator() { InfixOperatorStack.pop_back(); }
+    void pushOperator(InfixCalculatorTok Op) {
+      // Push the new operator if the stack is empty.
+      if (InfixOperatorStack.empty()) {
+        InfixOperatorStack.push_back(Op);
+        return;
+      }
+
+      // Push the new operator if it has a higher precedence than the operator
+      // on the top of the stack or the operator on the top of the stack is a
+      // left parentheses.
+      unsigned Idx = InfixOperatorStack.size() - 1;
+      InfixCalculatorTok StackOp = InfixOperatorStack[Idx];
+      if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) {
+        InfixOperatorStack.push_back(Op);
+        return;
+      }
+
+      // The operator on the top of the stack has higher precedence than the
+      // new operator.
+      unsigned ParenCount = 0;
+      while (1) {
+        // Nothing to process.
+        if (InfixOperatorStack.empty())
+          break;
+
+        Idx = InfixOperatorStack.size() - 1;
+        StackOp = InfixOperatorStack[Idx];
+        if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount))
+          break;
+
+        // If we have an even parentheses count and we see a left parentheses,
+        // then stop processing.
+        if (!ParenCount && StackOp == IC_LPAREN)
+          break;
+
+        if (StackOp == IC_RPAREN) {
+          ++ParenCount;
+          InfixOperatorStack.pop_back();
+        } else if (StackOp == IC_LPAREN) {
+          --ParenCount;
+          InfixOperatorStack.pop_back();
+        } else {
+          InfixOperatorStack.pop_back();
+          PostfixStack.push_back(std::make_pair(StackOp, 0));
+        }
+      }
+      // Push the new operator.
+      InfixOperatorStack.push_back(Op);
+    }
+
+    int64_t execute() {
+      // Push any remaining operators onto the postfix stack.
+      while (!InfixOperatorStack.empty()) {
+        InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val();
+        if (StackOp != IC_LPAREN && StackOp != IC_RPAREN)
+          PostfixStack.push_back(std::make_pair(StackOp, 0));
+      }
+
+      if (PostfixStack.empty())
+        return 0;
+
+      SmallVector<ICToken, 16> OperandStack;
+      for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) {
+        ICToken Op = PostfixStack[i];
+        if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
+          OperandStack.push_back(Op);
+        } else {
+          assert (OperandStack.size() > 1 && "Too few operands.");
+          int64_t Val;
+          ICToken Op2 = OperandStack.pop_back_val();
+          ICToken Op1 = OperandStack.pop_back_val();
+          switch (Op.first) {
+          default:
+            report_fatal_error("Unexpected operator!");
+            break;
+          case IC_PLUS:
+            Val = Op1.second + Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_MINUS:
+            Val = Op1.second - Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_MULTIPLY:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Multiply operation with an immediate and a register!");
+            Val = Op1.second * Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_DIVIDE:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Divide operation with an immediate and a register!");
+            assert (Op2.second != 0 && "Division by zero!");
+            Val = Op1.second / Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_OR:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Or operation with an immediate and a register!");
+            Val = Op1.second | Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_XOR:
+            assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+              "Xor operation with an immediate and a register!");
+            Val = Op1.second ^ Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_AND:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "And operation with an immediate and a register!");
+            Val = Op1.second & Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_LSHIFT:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Left shift operation with an immediate and a register!");
+            Val = Op1.second << Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          case IC_RSHIFT:
+            assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+                    "Right shift operation with an immediate and a register!");
+            Val = Op1.second >> Op2.second;
+            OperandStack.push_back(std::make_pair(IC_IMM, Val));
+            break;
+          }
+        }
+      }
+      assert (OperandStack.size() == 1 && "Expected a single result.");
+      return OperandStack.pop_back_val().second;
+    }
+  };
+
+  enum IntelExprState {
+    IES_OR,
+    IES_XOR,
+    IES_AND,
+    IES_LSHIFT,
+    IES_RSHIFT,
+    IES_PLUS,
+    IES_MINUS,
+    IES_NOT,
+    IES_MULTIPLY,
+    IES_DIVIDE,
+    IES_LBRAC,
+    IES_RBRAC,
+    IES_LPAREN,
+    IES_RPAREN,
+    IES_REGISTER,
+    IES_INTEGER,
+    IES_IDENTIFIER,
+    IES_ERROR
+  };
+
+  class IntelExprStateMachine {
+    IntelExprState State, PrevState;
+    unsigned BaseReg, IndexReg, TmpReg, Scale;
+    int64_t Imm;
+    const MCExpr *Sym;
+    StringRef SymName;
+    bool StopOnLBrac, AddImmPrefix;
+    InfixCalculator IC;
+    InlineAsmIdentifierInfo Info;
+
+  public:
+    IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
+      State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
+      Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
+      AddImmPrefix(addimmprefix) { Info.clear(); }
+
+    unsigned getBaseReg() { return BaseReg; }
+    unsigned getIndexReg() { return IndexReg; }
+    unsigned getScale() { return Scale; }
+    const MCExpr *getSym() { return Sym; }
+    StringRef getSymName() { return SymName; }
+    int64_t getImm() { return Imm + IC.execute(); }
+    bool isValidEndState() {
+      return State == IES_RBRAC || State == IES_INTEGER;
+    }
+    bool getStopOnLBrac() { return StopOnLBrac; }
+    bool getAddImmPrefix() { return AddImmPrefix; }
+    bool hadError() { return State == IES_ERROR; }
+
+    InlineAsmIdentifierInfo &getIdentifierInfo() {
+      return Info;
+    }
+
+    void onOr() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_OR;
+        IC.pushOperator(IC_OR);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onXor() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_XOR;
+        IC.pushOperator(IC_XOR);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onAnd() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_AND;
+        IC.pushOperator(IC_AND);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onLShift() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_LSHIFT;
+        IC.pushOperator(IC_LSHIFT);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRShift() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_RSHIFT;
+        IC.pushOperator(IC_RSHIFT);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onPlus() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+      case IES_REGISTER:
+        State = IES_PLUS;
+        IC.pushOperator(IC_PLUS);
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onMinus() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_NOT:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+      case IES_RPAREN:
+      case IES_LBRAC:
+      case IES_RBRAC:
+      case IES_INTEGER:
+      case IES_REGISTER:
+        State = IES_MINUS;
+        // Only push the minus operator if it is not a unary operator.
+        if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
+              CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
+              CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+          IC.pushOperator(IC_MINUS);
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onNot() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_NOT:
+        State = IES_NOT;
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRegister(unsigned Reg) {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_LPAREN:
+        State = IES_REGISTER;
+        TmpReg = Reg;
+        IC.pushOperand(IC_REGISTER);
+        break;
+      case IES_MULTIPLY:
+        // Index Register - Scale * Register
+        if (PrevState == IES_INTEGER) {
+          assert (!IndexReg && "IndexReg already set!");
+          State = IES_REGISTER;
+          IndexReg = Reg;
+          // Get the scale and replace the 'Scale * Register' with '0'.
+          Scale = IC.popOperand();
+          IC.pushOperand(IC_IMM);
+          IC.popOperator();
+        } else {
+          State = IES_ERROR;
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+        State = IES_INTEGER;
+        Sym = SymRef;
+        SymName = SymRefName;
+        IC.pushOperand(IC_IMM);
+        break;
+      }
+    }
+    bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_XOR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_DIVIDE:
+      case IES_MULTIPLY:
+      case IES_LPAREN:
+        State = IES_INTEGER;
+        if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+          // Index Register - Register * Scale
+          assert (!IndexReg && "IndexReg already set!");
+          IndexReg = TmpReg;
+          Scale = TmpInt;
+          if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+            ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+            return true;
+          }
+          // Get the scale and replace the 'Register * Scale' with '0'.
+          IC.popOperator();
+        } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+                    PrevState == IES_OR || PrevState == IES_AND ||
+                    PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+                    PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+                    PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+                    PrevState == IES_NOT || PrevState == IES_XOR) &&
+                   CurrState == IES_MINUS) {
+          // Unary minus.  No need to pop the minus operand because it was never
+          // pushed.
+          IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
+        } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+                    PrevState == IES_OR || PrevState == IES_AND ||
+                    PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+                    PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+                    PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+                    PrevState == IES_NOT || PrevState == IES_XOR) &&
+                   CurrState == IES_NOT) {
+          // Unary not.  No need to pop the not operand because it was never
+          // pushed.
+          IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
+        } else {
+          IC.pushOperand(IC_IMM, TmpInt);
+        }
+        break;
+      }
+      PrevState = CurrState;
+      return false;
+    }
+    void onStar() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_MULTIPLY;
+        IC.pushOperator(IC_MULTIPLY);
+        break;
+      }
+    }
+    void onDivide() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_RPAREN:
+        State = IES_DIVIDE;
+        IC.pushOperator(IC_DIVIDE);
+        break;
+      }
+    }
+    void onLBrac() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_RBRAC:
+        State = IES_PLUS;
+        IC.pushOperator(IC_PLUS);
+        break;
+      }
+    }
+    void onRBrac() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_RBRAC;
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // a scale of 1.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            assert (!IndexReg && "BaseReg/IndexReg already set!");
+            IndexReg = TmpReg;
+            Scale = 1;
+          }
+        }
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onLParen() {
+      IntelExprState CurrState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_PLUS:
+      case IES_MINUS:
+      case IES_NOT:
+      case IES_OR:
+      case IES_XOR:
+      case IES_AND:
+      case IES_LSHIFT:
+      case IES_RSHIFT:
+      case IES_MULTIPLY:
+      case IES_DIVIDE:
+      case IES_LPAREN:
+        // FIXME: We don't handle this type of unary minus or not, yet.
+        if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
+            PrevState == IES_OR || PrevState == IES_AND ||
+            PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
+            PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
+            PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
+            PrevState == IES_NOT || PrevState == IES_XOR) &&
+            (CurrState == IES_MINUS || CurrState == IES_NOT)) {
+          State = IES_ERROR;
+          break;
+        }
+        State = IES_LPAREN;
+        IC.pushOperator(IC_LPAREN);
+        break;
+      }
+      PrevState = CurrState;
+    }
+    void onRParen() {
+      PrevState = State;
+      switch (State) {
+      default:
+        State = IES_ERROR;
+        break;
+      case IES_INTEGER:
+      case IES_REGISTER:
+      case IES_RPAREN:
+        State = IES_RPAREN;
+        IC.pushOperator(IC_RPAREN);
+        break;
+      }
+    }
+  };
+
+  bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
+             bool MatchingInlineAsm = false) {
+    MCAsmParser &Parser = getParser();
+    if (MatchingInlineAsm) {
+      if (!getLexer().isAtStartOfStatement())
+        Parser.eatToEndOfStatement();
+      return false;
+    }
+    return Parser.Error(L, Msg, Range);
+  }
+
+  std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
+    Error(Loc, Msg);
+    return nullptr;
+  }
+
+  std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
+  std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
+  bool IsSIReg(unsigned Reg);
+  unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
+  void
+  AddDefaultSrcDestOperands(OperandVector &Operands,
+                            std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+                            std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
+  bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
+                               OperandVector &FinalOperands);
+  std::unique_ptr<X86Operand> ParseOperand();
+  std::unique_ptr<X86Operand> ParseATTOperand();
+  std::unique_ptr<X86Operand> ParseIntelOperand();
+  std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+  std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+  std::unique_ptr<X86Operand>
+  ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+  std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+  bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+  std::unique_ptr<X86Operand>
+  ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
+                           bool isSymbol, unsigned Size);
+  bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+                            InlineAsmIdentifierInfo &Info,
+                            bool IsUnevaluatedOperand, SMLoc &End);
+
+  std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+  std::unique_ptr<X86Operand>
+  CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+                        unsigned IndexReg, unsigned Scale, SMLoc Start,
+                        SMLoc End, unsigned Size, StringRef Identifier,
+                        InlineAsmIdentifierInfo &Info,
+                        bool AllowBetterSizeMatch = false);
+
+  bool parseDirectiveEven(SMLoc L);
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops);
+
+  /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+  /// instrumentation around Inst.
+  void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
+                         MCStreamer &Out, bool MatchingInlineAsm);
+
+  bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+                           bool MatchingInlineAsm);
+
+  bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                  OperandVector &Operands, MCStreamer &Out,
+                                  uint64_t &ErrorInfo,
+                                  bool MatchingInlineAsm);
+
+  bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                    OperandVector &Operands, MCStreamer &Out,
+                                    uint64_t &ErrorInfo,
+                                    bool MatchingInlineAsm);
+
+  bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
+  /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
+  /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
+  /// return false if no parsing errors occurred, true otherwise.
+  bool HandleAVX512Operand(OperandVector &Operands,
+                           const MCParsedAsmOperand &Op);
+
+  bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
+
+  /// MS-compatibility:
+  /// Obtain an appropriate size qualifier, when facing its absence,
+  /// upon AVX512 vector/broadcast memory operand
+  unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext);
+
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return getSTI().getFeatureBits()[X86::Mode64Bit];
+  }
+  bool is32BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return getSTI().getFeatureBits()[X86::Mode32Bit];
+  }
+  bool is16BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return getSTI().getFeatureBits()[X86::Mode16Bit];
+  }
+  void SwitchMode(unsigned mode) {
+    MCSubtargetInfo &STI = copySTI();
+    FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
+    FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
+    unsigned FB = ComputeAvailableFeatures(
+      STI.ToggleFeature(OldMode.flip(mode)));
+    setAvailableFeatures(FB);
+
+    assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes));
+  }
+
+  unsigned getPointerWidth() {
+    if (is16BitMode()) return 16;
+    if (is32BitMode()) return 32;
+    if (is64BitMode()) return 64;
+    llvm_unreachable("invalid mode");
+  }
+
+  bool isParsingIntelSyntax() {
+    return getParser().getAssemblerDialect();
+  }
+
+  /// @name Auto-generated Matcher Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+  /// }
+
+public:
+  X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
+               const MCInstrInfo &mii, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
+        Code16GCC(false) {
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+    Instrumentation.reset(
+        CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
+  }
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  void SetFrameRegister(unsigned RegNo) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
+                                    StringRef &ErrMsg) {
+  // If we have both a base register and an index register make sure they are
+  // both 64-bit or 32-bit registers.
+  // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+
+  if ((BaseReg == X86::RIP && IndexReg != 0) || (IndexReg == X86::RIP)) {
+    ErrMsg = "invalid base+index expression";
+    return true;
+  }
+  if (BaseReg != 0 && IndexReg != 0) {
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::RIZ) {
+      ErrMsg = "base register is 64-bit, but index register is not";
+      return true;
+    }
+    if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::EIZ){
+      ErrMsg = "base register is 32-bit, but index register is not";
+      return true;
+    }
+    if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) {
+      if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
+          X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) {
+        ErrMsg = "base register is 16-bit, but index register is not";
+        return true;
+      }
+      if (((BaseReg == X86::BX || BaseReg == X86::BP) &&
+           IndexReg != X86::SI && IndexReg != X86::DI) ||
+          ((BaseReg == X86::SI || BaseReg == X86::DI) &&
+           IndexReg != X86::BX && IndexReg != X86::BP)) {
+        ErrMsg = "invalid 16-bit base/index register combination";
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
+  RegNo = 0;
+  const AsmToken &PercentTok = Parser.getTok();
+  StartLoc = PercentTok.getLoc();
+
+  // If we encounter a %, ignore it. This code handles registers with and
+  // without the prefix, unprefixed registers can occur in cfi directives.
+  if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+    Parser.Lex(); // Eat percent token.
+
+  const AsmToken &Tok = Parser.getTok();
+  EndLoc = Tok.getEndLoc();
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  RegNo = MatchRegisterName(Tok.getString());
+
+  // If the match failed, try the register name as lowercase.
+  if (RegNo == 0)
+    RegNo = MatchRegisterName(Tok.getString().lower());
+
+  // The "flags" register cannot be referenced directly.
+  // Treat it as an identifier instead.
+  if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
+    RegNo = 0;
+
+  if (!is64BitMode()) {
+    // FIXME: This should be done using Requires<Not64BitMode> and
+    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+    // checked.
+    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+    // REX prefix.
+    if (RegNo == X86::RIZ ||
+        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+        X86II::isX86_64NonExtLowByteReg(RegNo) ||
+        X86II::isX86_64ExtendedReg(RegNo))
+      return Error(StartLoc, "register %"
+                   + Tok.getString() + " is only available in 64-bit mode",
+                   SMRange(StartLoc, EndLoc));
+  } else if (!getSTI().getFeatureBits()[X86::FeatureAVX512]) {
+    if (X86II::is32ExtendedReg(RegNo))
+      return Error(StartLoc, "register %"
+                   + Tok.getString() + " is only available with AVX512",
+                   SMRange(StartLoc, EndLoc));
+  }
+
+  // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+  if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+    RegNo = X86::ST0;
+    Parser.Lex(); // Eat 'st'
+
+    // Check to see if we have '(4)' after %st.
+    if (getLexer().isNot(AsmToken::LParen))
+      return false;
+    // Lex the paren.
+    getParser().Lex();
+
+    const AsmToken &IntTok = Parser.getTok();
+    if (IntTok.isNot(AsmToken::Integer))
+      return Error(IntTok.getLoc(), "expected stack index");
+    switch (IntTok.getIntVal()) {
+    case 0: RegNo = X86::ST0; break;
+    case 1: RegNo = X86::ST1; break;
+    case 2: RegNo = X86::ST2; break;
+    case 3: RegNo = X86::ST3; break;
+    case 4: RegNo = X86::ST4; break;
+    case 5: RegNo = X86::ST5; break;
+    case 6: RegNo = X86::ST6; break;
+    case 7: RegNo = X86::ST7; break;
+    default: return Error(IntTok.getLoc(), "invalid stack index");
+    }
+
+    if (getParser().Lex().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat ')'
+    return false;
+  }
+
+  EndLoc = Parser.getTok().getEndLoc();
+
+  // If this is "db[0-7]", match it as an alias
+  // for dr[0-7].
+  if (RegNo == 0 && Tok.getString().size() == 3 &&
+      Tok.getString().startswith("db")) {
+    switch (Tok.getString()[2]) {
+    case '0': RegNo = X86::DR0; break;
+    case '1': RegNo = X86::DR1; break;
+    case '2': RegNo = X86::DR2; break;
+    case '3': RegNo = X86::DR3; break;
+    case '4': RegNo = X86::DR4; break;
+    case '5': RegNo = X86::DR5; break;
+    case '6': RegNo = X86::DR6; break;
+    case '7': RegNo = X86::DR7; break;
+    }
+
+    if (RegNo != 0) {
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat it.
+      return false;
+    }
+  }
+
+  if (RegNo == 0) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  return false;
+}
+
+void X86AsmParser::SetFrameRegister(unsigned RegNo) {
+  Instrumentation->SetInitialFrameRegister(RegNo);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
+  bool Parse32 = is32BitMode() || Code16GCC;
+  unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
+  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+                               /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
+                               Loc, Loc, 0);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
+  bool Parse32 = is32BitMode() || Code16GCC;
+  unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ? X86::EDI : X86::DI);
+  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+  return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+                               /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
+                               Loc, Loc, 0);
+}
+
+bool X86AsmParser::IsSIReg(unsigned Reg) {
+  switch (Reg) {
+  default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!");
+  case X86::RSI:
+  case X86::ESI:
+  case X86::SI:
+    return true;
+  case X86::RDI:
+  case X86::EDI:
+  case X86::DI:
+    return false;
+  }
+}
+
+unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg,
+                                          bool IsSIReg) {
+  switch (RegClassID) {
+  default: llvm_unreachable("Unexpected register class");
+  case X86::GR64RegClassID:
+    return IsSIReg ? X86::RSI : X86::RDI;
+  case X86::GR32RegClassID:
+    return IsSIReg ? X86::ESI : X86::EDI;
+  case X86::GR16RegClassID:
+    return IsSIReg ? X86::SI : X86::DI;
+  }
+}
+
+void X86AsmParser::AddDefaultSrcDestOperands(
+    OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src,
+    std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) {
+  if (isParsingIntelSyntax()) {
+    Operands.push_back(std::move(Dst));
+    Operands.push_back(std::move(Src));
+  }
+  else {
+    Operands.push_back(std::move(Src));
+    Operands.push_back(std::move(Dst));
+  }
+}
+
+bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
+                                           OperandVector &FinalOperands) {
+
+  if (OrigOperands.size() > 1) {
+    // Check if sizes match, OrigOperands also contains the instruction name
+    assert(OrigOperands.size() == FinalOperands.size() + 1 &&
+           "Operand size mismatch");
+
+    SmallVector<std::pair<SMLoc, std::string>, 2> Warnings;
+    // Verify types match
+    int RegClassID = -1;
+    for (unsigned int i = 0; i < FinalOperands.size(); ++i) {
+      X86Operand &OrigOp = static_cast<X86Operand &>(*OrigOperands[i + 1]);
+      X86Operand &FinalOp = static_cast<X86Operand &>(*FinalOperands[i]);
+
+      if (FinalOp.isReg() &&
+          (!OrigOp.isReg() || FinalOp.getReg() != OrigOp.getReg()))
+        // Return false and let a normal complaint about bogus operands happen
+        return false;
+
+      if (FinalOp.isMem()) {
+
+        if (!OrigOp.isMem())
+          // Return false and let a normal complaint about bogus operands happen
+          return false;
+
+        unsigned OrigReg = OrigOp.Mem.BaseReg;
+        unsigned FinalReg = FinalOp.Mem.BaseReg;
+
+        // If we've already encounterd a register class, make sure all register
+        // bases are of the same register class
+        if (RegClassID != -1 &&
+            !X86MCRegisterClasses[RegClassID].contains(OrigReg)) {
+          return Error(OrigOp.getStartLoc(),
+                       "mismatching source and destination index registers");
+        }
+
+        if (X86MCRegisterClasses[X86::GR64RegClassID].contains(OrigReg))
+          RegClassID = X86::GR64RegClassID;
+        else if (X86MCRegisterClasses[X86::GR32RegClassID].contains(OrigReg))
+          RegClassID = X86::GR32RegClassID;
+        else if (X86MCRegisterClasses[X86::GR16RegClassID].contains(OrigReg))
+          RegClassID = X86::GR16RegClassID;
+        else
+          // Unexpected register class type
+          // Return false and let a normal complaint about bogus operands happen
+          return false;
+
+        bool IsSI = IsSIReg(FinalReg);
+        FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI);
+
+        if (FinalReg != OrigReg) {
+          std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI";
+          Warnings.push_back(std::make_pair(
+              OrigOp.getStartLoc(),
+              "memory operand is only for determining the size, " + RegName +
+                  " will be used for the location"));
+        }
+
+        FinalOp.Mem.Size = OrigOp.Mem.Size;
+        FinalOp.Mem.SegReg = OrigOp.Mem.SegReg;
+        FinalOp.Mem.BaseReg = FinalReg;
+      }
+    }
+
+    // Produce warnings only if all the operands passed the adjustment - prevent
+    // legal cases like "movsd (%rax), %xmm0" mistakenly produce warnings
+    for (auto &WarningMsg : Warnings) {
+      Warning(WarningMsg.first, WarningMsg.second);
+    }
+
+    // Remove old operands
+    for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+      OrigOperands.pop_back();
+  }
+  // OrigOperands.append(FinalOperands.begin(), FinalOperands.end());
+  for (unsigned int i = 0; i < FinalOperands.size(); ++i)
+    OrigOperands.push_back(std::move(FinalOperands[i]));
+
+  return false;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
+  if (isParsingIntelSyntax())
+    return ParseIntelOperand();
+  return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return intel memory operand size.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+  unsigned Size = StringSwitch<unsigned>(OpStr)
+    .Cases("BYTE", "byte", 8)
+    .Cases("WORD", "word", 16)
+    .Cases("DWORD", "dword", 32)
+    .Cases("FWORD", "fword", 48)
+    .Cases("QWORD", "qword", 64)
+    .Cases("MMWORD","mmword", 64)
+    .Cases("XWORD", "xword", 80)
+    .Cases("TBYTE", "tbyte", 80)
+    .Cases("XMMWORD", "xmmword", 128)
+    .Cases("YMMWORD", "ymmword", 256)
+    .Cases("ZMMWORD", "zmmword", 512)
+    .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
+    .Default(0);
+  return Size;
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+    unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
+    unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
+    InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) {
+  // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
+  // some other label reference.
+  if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
+    // Insert an explicit size if the user didn't have one.
+    if (!Size) {
+      Size = getPointerWidth();
+      InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+                                          /*Len=*/0, Size);
+    }
+
+    // Create an absolute memory reference in order to match against
+    // instructions taking a PC relative operand.
+    return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
+                                 Identifier, Info.OpDecl);
+  }
+
+  // We either have a direct symbol reference, or an offset from a symbol.  The
+  // parser always puts the symbol on the LHS, so look there for size
+  // calculation purposes.
+  const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
+  bool IsSymRef =
+      isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
+  if (IsSymRef) {
+    if (!Size) {
+      Size = Info.Type * 8; // Size is in terms of bits in this context.
+      if (Size)
+        InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
+                                            /*Len=*/0, Size);
+    if (AllowBetterSizeMatch)
+      // Handle cases where size qualifier is absent, upon an indirect symbol
+      // reference - e.g. "vaddps zmm1, zmm2, [var]"
+      // set Size to zero to allow matching mechansim to try and find a better
+      // size qualifier than our initial guess, based on available variants of
+      // the given instruction
+      Size = 0;
+    }
+  }
+
+  // When parsing inline assembly we set the base register to a non-zero value
+  // if we don't know the actual value at this time.  This is necessary to
+  // get the matching correct in some cases.
+  BaseReg = BaseReg ? BaseReg : 1;
+  return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                               IndexReg, Scale, Start, End, Size, Identifier,
+                               Info.OpDecl);
+}
+
+static void
+RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
+                           StringRef SymName, int64_t ImmDisp,
+                           int64_t FinalImmDisp, SMLoc &BracLoc,
+                           SMLoc &StartInBrac, SMLoc &End) {
+  // Remove the '[' and ']' from the IR string.
+  AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1);
+  AsmRewrites.emplace_back(AOK_Skip, End, 1);
+
+  // If ImmDisp is non-zero, then we parsed a displacement before the
+  // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
+  // If ImmDisp doesn't match the displacement computed by the state machine
+  // then we have an additional displacement in the bracketed expression.
+  if (ImmDisp != FinalImmDisp) {
+    if (ImmDisp) {
+      // We have an immediate displacement before the bracketed expression.
+      // Adjust this to match the final immediate displacement.
+      bool Found = false;
+      for (AsmRewrite &AR : AsmRewrites) {
+        if (AR.Loc.getPointer() > BracLoc.getPointer())
+          continue;
+        if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) {
+          assert (!Found && "ImmDisp already rewritten.");
+          AR.Kind = AOK_Imm;
+          AR.Len = BracLoc.getPointer() - AR.Loc.getPointer();
+          AR.Val = FinalImmDisp;
+          Found = true;
+          break;
+        }
+      }
+      assert (Found && "Unable to rewrite ImmDisp.");
+      (void)Found;
+    } else {
+      // We have a symbolic and an immediate displacement, but no displacement
+      // before the bracketed expression.  Put the immediate displacement
+      // before the bracketed expression.
+      AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp);
+    }
+  }
+  // Remove all the ImmPrefix rewrites within the brackets.
+  for (AsmRewrite &AR : AsmRewrites) {
+    if (AR.Loc.getPointer() < StartInBrac.getPointer())
+      continue;
+    if (AR.Kind == AOK_ImmPrefix)
+      AR.Kind = AOK_Delete;
+  }
+  const char *SymLocPtr = SymName.data();
+  // Skip everything before the symbol.
+  if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
+    assert(Len > 0 && "Expected a non-negative length.");
+    AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len);
+  }
+  // Skip everything after the symbol.
+  if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
+    SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
+    assert(Len > 0 && "Expected a non-negative length.");
+    AsmRewrites.emplace_back(AOK_Skip, Loc, Len);
+  }
+}
+
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+
+  AsmToken::TokenKind PrevTK = AsmToken::Error;
+  bool Done = false;
+  while (!Done) {
+    bool UpdateLocLex = true;
+
+    // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+    // identifier.  Don't try an parse it as a register.
+    if (PrevTK != AsmToken::Error && Tok.getString().startswith("."))
+      break;
+
+    // If we're parsing an immediate expression, we don't expect a '['.
+    if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
+      break;
+
+    AsmToken::TokenKind TK = getLexer().getKind();
+    switch (TK) {
+    default: {
+      if (SM.isValidEndState()) {
+        Done = true;
+        break;
+      }
+      return Error(Tok.getLoc(), "unknown token in expression");
+    }
+    case AsmToken::EndOfStatement: {
+      Done = true;
+      break;
+    }
+    case AsmToken::String:
+    case AsmToken::Identifier: {
+      // This could be a register or a symbolic displacement.
+      unsigned TmpReg;
+      const MCExpr *Val;
+      SMLoc IdentLoc = Tok.getLoc();
+      StringRef Identifier = Tok.getString();
+      if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
+        SM.onRegister(TmpReg);
+        UpdateLocLex = false;
+        break;
+      } else {
+        if (!isParsingInlineAsm()) {
+          if (getParser().parsePrimaryExpr(Val, End))
+            return Error(Tok.getLoc(), "Unexpected identifier!");
+        } else {
+          // This is a dot operator, not an adjacent identifier.
+          if (Identifier.find('.') != StringRef::npos &&
+              PrevTK == AsmToken::RBrac) {
+            return false;
+          } else {
+            InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+            if (ParseIntelIdentifier(Val, Identifier, Info,
+                                     /*Unevaluated=*/false, End))
+              return true;
+          }
+        }
+        SM.onIdentifierExpr(Val, Identifier);
+        UpdateLocLex = false;
+        break;
+      }
+      return Error(Tok.getLoc(), "Unexpected identifier!");
+    }
+    case AsmToken::Integer: {
+      StringRef ErrMsg;
+      if (isParsingInlineAsm() && SM.getAddImmPrefix())
+        InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc());
+      // Look for 'b' or 'f' following an Integer as a directional label
+      SMLoc Loc = getTok().getLoc();
+      int64_t IntVal = getTok().getIntVal();
+      End = consumeToken();
+      UpdateLocLex = false;
+      if (getLexer().getKind() == AsmToken::Identifier) {
+        StringRef IDVal = getTok().getString();
+        if (IDVal == "f" || IDVal == "b") {
+          MCSymbol *Sym =
+              getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
+          MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+          const MCExpr *Val =
+              MCSymbolRefExpr::create(Sym, Variant, getContext());
+          if (IDVal == "b" && Sym->isUndefined())
+            return Error(Loc, "invalid reference to undefined symbol");
+          StringRef Identifier = Sym->getName();
+          SM.onIdentifierExpr(Val, Identifier);
+          End = consumeToken();
+        } else {
+          if (SM.onInteger(IntVal, ErrMsg))
+            return Error(Loc, ErrMsg);
+        }
+      } else {
+        if (SM.onInteger(IntVal, ErrMsg))
+          return Error(Loc, ErrMsg);
+      }
+      break;
+    }
+    case AsmToken::Plus:    SM.onPlus(); break;
+    case AsmToken::Minus:   SM.onMinus(); break;
+    case AsmToken::Tilde:   SM.onNot(); break;
+    case AsmToken::Star:    SM.onStar(); break;
+    case AsmToken::Slash:   SM.onDivide(); break;
+    case AsmToken::Pipe:    SM.onOr(); break;
+    case AsmToken::Caret:   SM.onXor(); break;
+    case AsmToken::Amp:     SM.onAnd(); break;
+    case AsmToken::LessLess:
+                            SM.onLShift(); break;
+    case AsmToken::GreaterGreater:
+                            SM.onRShift(); break;
+    case AsmToken::LBrac:   SM.onLBrac(); break;
+    case AsmToken::RBrac:   SM.onRBrac(); break;
+    case AsmToken::LParen:  SM.onLParen(); break;
+    case AsmToken::RParen:  SM.onRParen(); break;
+    }
+    if (SM.hadError())
+      return Error(Tok.getLoc(), "unknown token in expression");
+
+    if (!Done && UpdateLocLex)
+      End = consumeToken();
+
+    PrevTK = TK;
+  }
+  return false;
+}
+
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
+                                       int64_t ImmDisp, bool isSymbol,
+                                       unsigned Size) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
+  if (getLexer().isNot(AsmToken::LBrac))
+    return ErrorOperand(BracLoc, "Expected '[' token!");
+  Parser.Lex(); // Eat '['
+
+  SMLoc StartInBrac = Parser.getTok().getLoc();
+  // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ].  We
+  // may have already parsed an immediate displacement before the bracketed
+  // expression.
+  IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
+  if (ParseIntelExpression(SM, End))
+    return nullptr;
+
+  const MCExpr *Disp = nullptr;
+  if (const MCExpr *Sym = SM.getSym()) {
+    // A symbolic displacement.
+    Disp = Sym;
+    if (isParsingInlineAsm())
+      RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(),
+                                 ImmDisp, SM.getImm(), BracLoc, StartInBrac,
+                                 End);
+  }
+
+  if (SM.getImm() || !Disp) {
+    const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext());
+    if (Disp)
+      Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext());
+    else
+      Disp = Imm;  // An immediate displacement only.
+  }
+
+  // Parse struct field access.  Intel requires a dot, but MSVC doesn't.  MSVC
+  // will in fact do global lookup the field name inside all global typedefs,
+  // but we don't emulate that.
+  if ((Parser.getTok().getKind() == AsmToken::Identifier ||
+       Parser.getTok().getKind() == AsmToken::Dot ||
+       Parser.getTok().getKind() == AsmToken::Real) &&
+      Parser.getTok().getString().find('.') != StringRef::npos) {
+    const MCExpr *NewDisp;
+    if (ParseIntelDotOperator(Disp, NewDisp))
+      return nullptr;
+
+    End = Tok.getEndLoc();
+    Parser.Lex();  // Eat the field.
+    Disp = NewDisp;
+  }
+
+  if (isSymbol) {
+    if (SM.getSym()) {
+      Error(Start, "cannot use more than one symbol in memory operand");
+      return nullptr;
+    }
+    if (SM.getBaseReg()) {
+      Error(Start, "cannot use base register with variable reference");
+      return nullptr;
+    }
+    if (SM.getIndexReg()) {
+      Error(Start, "cannot use index register with variable reference");
+      return nullptr;
+    }
+  }
+
+  int BaseReg = SM.getBaseReg();
+  int IndexReg = SM.getIndexReg();
+  int Scale = SM.getScale();
+  if (!isParsingInlineAsm()) {
+    // handle [-42]
+    if (!BaseReg && !IndexReg) {
+      if (!SegReg)
+        return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                   Start, End, Size);
+    }
+    StringRef ErrMsg;
+    if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+      Error(StartInBrac, ErrMsg);
+      return nullptr;
+    }
+    return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                                 IndexReg, Scale, Start, End, Size);
+  }
+
+  InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+  return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
+                               End, Size, SM.getSymName(), Info,
+                               isParsingInlineAsm());
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+                                        StringRef &Identifier,
+                                        InlineAsmIdentifierInfo &Info,
+                                        bool IsUnevaluatedOperand, SMLoc &End) {
+  MCAsmParser &Parser = getParser();
+  assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+  Val = nullptr;
+
+  StringRef LineBuf(Identifier.data());
+  void *Result =
+    SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
+
+  // Advance the token stream until the end of the current token is
+  // after the end of what the frontend claimed.
+  const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size();
+  do {
+    End = Tok.getEndLoc();
+    getLexer().Lex();
+  } while (End.getPointer() < EndPtr);
+  Identifier = LineBuf;
+
+  // The frontend should end parsing on an assembler token boundary, unless it
+  // failed parsing.
+  assert((End.getPointer() == EndPtr || !Result) &&
+         "frontend claimed part of a token?");
+
+  // If the identifier lookup was unsuccessful, assume that we are dealing with
+  // a label.
+  if (!Result) {
+    StringRef InternalName =
+      SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
+                                         Loc, false);
+    assert(InternalName.size() && "We should have an internal name here.");
+    // Push a rewrite for replacing the identifier name with the internal name.
+    InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+                                        InternalName);
+  }
+
+  // Create the symbol reference.
+  MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
+  return false;
+}
+
+/// \brief Parse intel style segment override.
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
+                                        unsigned Size) {
+  MCAsmParser &Parser = getParser();
+  assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+  const AsmToken &Tok = Parser.getTok(); // Eat colon.
+  if (Tok.isNot(AsmToken::Colon))
+    return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+  Parser.Lex(); // Eat ':'
+
+  int64_t ImmDisp = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    ImmDisp = Tok.getIntVal();
+    AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
+    if (isParsingInlineAsm())
+      InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc());
+
+    if (getLexer().isNot(AsmToken::LBrac)) {
+      // An immediate following a 'segment register', 'colon' token sequence can
+      // be followed by a bracketed expression.  If it isn't we know we have our
+      // final segment override.
+      const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext());
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+                                   /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
+                                   Start, ImmDispToken.getEndLoc(), Size);
+    }
+  }
+
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size);
+
+  const MCExpr *Val;
+  SMLoc End;
+  if (!isParsingInlineAsm()) {
+    if (getParser().parsePrimaryExpr(Val, End))
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+    return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
+  }
+
+  InlineAsmIdentifierInfo Info;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return nullptr;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+                               /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
+std::unique_ptr<X86Operand>
+X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  // Eat "{" and mark the current place.
+  const SMLoc consumedToken = consumeToken();
+  if (Tok.getIdentifier().startswith("r")){
+    int rndMode = StringSwitch<int>(Tok.getIdentifier())
+      .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
+      .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF)
+      .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF)
+      .Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
+      .Default(-1);
+    if (-1 == rndMode)
+      return ErrorOperand(Tok.getLoc(), "Invalid rounding mode.");
+     Parser.Lex();  // Eat "r*" of r*-sae
+    if (!getLexer().is(AsmToken::Minus))
+      return ErrorOperand(Tok.getLoc(), "Expected - at this point");
+    Parser.Lex();  // Eat "-"
+    Parser.Lex();  // Eat the sae
+    if (!getLexer().is(AsmToken::RCurly))
+      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+    Parser.Lex();  // Eat "}"
+    const MCExpr *RndModeOp =
+      MCConstantExpr::create(rndMode, Parser.getContext());
+    return X86Operand::CreateImm(RndModeOp, Start, End);
+  }
+  if(Tok.getIdentifier().equals("sae")){
+    Parser.Lex();  // Eat the sae
+    if (!getLexer().is(AsmToken::RCurly))
+      return ErrorOperand(Tok.getLoc(), "Expected } at this point");
+    Parser.Lex();  // Eat "}"
+    return X86Operand::CreateToken("{sae}", consumedToken);
+  }
+  return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+}
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+                                                const MCExpr *&NewDisp) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  int64_t OrigDispVal, DotDispVal;
+
+  // FIXME: Handle non-constant expressions.
+  if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
+    OrigDispVal = OrigDisp->getValue();
+  else
+    return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
+
+  // Drop the optional '.'.
+  StringRef DotDispStr = Tok.getString();
+  if (DotDispStr.startswith("."))
+    DotDispStr = DotDispStr.drop_front(1);
+
+  // .Imm gets lexed as a real.
+  if (Tok.is(AsmToken::Real)) {
+    APInt DotDisp;
+    DotDispStr.getAsInteger(10, DotDisp);
+    DotDispVal = DotDisp.getZExtValue();
+  } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    unsigned DotDisp;
+    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+                                           DotDisp))
+      return Error(Tok.getLoc(), "Unable to lookup field reference!");
+    DotDispVal = DotDisp;
+  } else
+    return Error(Tok.getLoc(), "Unexpected token type!");
+
+  if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+    unsigned Len = DotDispStr.size();
+    unsigned Val = OrigDispVal + DotDispVal;
+    InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+  }
+
+  NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
+  return false;
+}
+
+/// Parse the 'offset' operator.  This operator is used to specify the
+/// location rather then the content of a variable.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc OffsetOfLoc = Tok.getLoc();
+  Parser.Lex(); // Eat offset.
+
+  const MCExpr *Val;
+  InlineAsmIdentifierInfo Info;
+  SMLoc Start = Tok.getLoc(), End;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return nullptr;
+
+  // Don't emit the offset operator.
+  InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);
+
+  // The offset operator will have an 'r' constraint, thus we need to create
+  // register operand to ensure proper matching.  Just pick a GPR based on
+  // the size of a pointer.
+  bool Parse32 = is32BitMode() || Code16GCC;
+  unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);
+
+  return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+                               OffsetOfLoc, Identifier, Info.OpDecl);
+}
+
+enum IntelOperatorKind {
+  IOK_LENGTH,
+  IOK_SIZE,
+  IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
+/// returns the number of elements in an array.  It returns the value 1 for
+/// non-array variables.  The SIZE operator returns the size of a C or C++
+/// variable.  A variable's size is the product of its LENGTH and TYPE.  The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc TypeLoc = Tok.getLoc();
+  Parser.Lex(); // Eat operator.
+
+  const MCExpr *Val = nullptr;
+  InlineAsmIdentifierInfo Info;
+  SMLoc Start = Tok.getLoc(), End;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/true, End))
+    return nullptr;
+
+  if (!Info.OpDecl)
+    return ErrorOperand(Start, "unable to lookup expression");
+
+  unsigned CVal = 0;
+  switch(OpKind) {
+  default: llvm_unreachable("Unexpected operand kind!");
+  case IOK_LENGTH: CVal = Info.Length; break;
+  case IOK_SIZE: CVal = Info.Size; break;
+  case IOK_TYPE: CVal = Info.Type; break;
+  }
+
+  // Rewrite the type operator and the C or C++ type or variable in terms of an
+  // immediate.  E.g. TYPE foo -> $$4
+  unsigned Len = End.getPointer() - TypeLoc.getPointer();
+  InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
+
+  const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
+  return X86Operand::CreateImm(Imm, Start, End);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
+  MCAsmParser &Parser = getParser();
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Start, End;
+
+  // Offset, length, type and size operators.
+  if (isParsingInlineAsm()) {
+    StringRef AsmTokStr = Tok.getString();
+    if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+      return ParseIntelOffsetOfOperator();
+    if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+      return ParseIntelOperator(IOK_LENGTH);
+    if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+      return ParseIntelOperator(IOK_SIZE);
+    if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+      return ParseIntelOperator(IOK_TYPE);
+  }
+
+  bool PtrInOperand = false;
+  unsigned Size = getIntelMemOperandSize(Tok.getString());
+  if (Size) {
+    Parser.Lex(); // Eat operand size (e.g., byte, word).
+    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+      return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
+    Parser.Lex(); // Eat ptr.
+    PtrInOperand = true;
+  }
+
+  Start = Tok.getLoc();
+
+  // rounding mode token
+  if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
+      getLexer().is(AsmToken::LCurly))
+    return ParseRoundingModeOp(Start, End);
+
+  // Register.
+  unsigned RegNo = 0;
+  if (getLexer().is(AsmToken::Identifier) &&
+      !ParseRegister(RegNo, Start, End)) {
+    // If this is a segment register followed by a ':', then this is the start
+    // of a segment override, otherwise this is a normal register reference.
+    // In case it is a normal register and there is ptr in the operand this
+    // is an error
+    if (RegNo == X86::RIP)
+      return ErrorOperand(Start, "rip can only be used as a base register");
+    if (getLexer().isNot(AsmToken::Colon)) {
+      if (PtrInOperand) {
+        return ErrorOperand(Start, "expected memory operand after "
+                                   "'ptr', found register operand instead");
+      }
+      return X86Operand::CreateReg(RegNo, Start, End);
+    }
+    return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
+  }
+
+  // Immediates and Memory
+
+  // Parse [ BaseReg + Scale*IndexReg + Disp ].
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false,
+                                    Size);
+
+  AsmToken StartTok = Tok;
+  IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
+                           /*AddImmPrefix=*/false);
+  if (ParseIntelExpression(SM, End))
+    return nullptr;
+
+  bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant;
+  int64_t Imm = SM.getImm();
+  if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant)
+    SM.getSym()->evaluateAsAbsolute(Imm);
+
+  if (StartTok.isNot(AsmToken::Identifier) &&
+      StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) {
+    unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
+    if (StartTok.getString().size() == Len)
+      // Just add a prefix if this wasn't a complex immediate expression.
+      InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
+    else
+      // Otherwise, rewrite the complex expression as a single immediate.
+      InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
+  }
+
+  if (getLexer().isNot(AsmToken::LBrac)) {
+    // If a directional label (ie. 1f or 2b) was parsed above from
+    // ParseIntelExpression() then SM.getSym() was set to a pointer to
+    // to the MCExpr with the directional local symbol and this is a
+    // memory operand not an immediate operand.
+    if (isSymbol) {
+      if (isParsingInlineAsm())
+        return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0,
+                                     /*IndexReg=*/0,
+                                     /*Scale=*/1, Start, End, Size,
+                                     SM.getSymName(), SM.getIdentifierInfo());
+      return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
+                                   Size);
+    }
+
+    const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
+    return X86Operand::CreateImm(ImmExpr, Start, End);
+  }
+
+  // Only positive immediates are valid.
+  if (Imm < 0)
+    return ErrorOperand(Start, "expected a positive immediate displacement "
+                               "before bracketed expr.");
+
+  return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size);
+}
+
+std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
+  MCAsmParser &Parser = getParser();
+  switch (getLexer().getKind()) {
+  default:
+    // Parse a memory operand with no segment register.
+    return ParseMemOperand(0, Parser.getTok().getLoc());
+  case AsmToken::Percent: {
+    // Read the register.
+    unsigned RegNo;
+    SMLoc Start, End;
+    if (ParseRegister(RegNo, Start, End)) return nullptr;
+    if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+      Error(Start, "%eiz and %riz can only be used as index registers",
+            SMRange(Start, End));
+      return nullptr;
+    }
+    if (RegNo == X86::RIP) {
+      Error(Start, "%rip can only be used as a base register",
+            SMRange(Start, End));
+      return nullptr;
+    }
+
+    // If this is a segment register followed by a ':', then this is the start
+    // of a memory reference, otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, End);
+
+    if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+      return ErrorOperand(Start, "invalid segment register");
+
+    getParser().Lex(); // Eat the colon.
+    return ParseMemOperand(RegNo, Start);
+  }
+  case AsmToken::Dollar: {
+    // $42 -> immediate.
+    SMLoc Start = Parser.getTok().getLoc(), End;
+    Parser.Lex();
+    const MCExpr *Val;
+    if (getParser().parseExpression(Val, End))
+      return nullptr;
+    return X86Operand::CreateImm(Val, Start, End);
+  }
+  case AsmToken::LCurly:{
+    SMLoc Start = Parser.getTok().getLoc(), End;
+    if (getSTI().getFeatureBits()[X86::FeatureAVX512])
+      return ParseRoundingModeOp(Start, End);
+    return ErrorOperand(Start, "Unexpected '{' in expression");
+  }
+  }
+}
+
+// true on failure, false otherwise
+// If no {z} mark was found - Parser doesn't advance
+bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
+                          const SMLoc &StartLoc) {
+  MCAsmParser &Parser = getParser();
+  // Assuming we are just pass the '{' mark, quering the next token
+  // Searched for {z}, but none was found. Return false, as no parsing error was
+  // encountered
+  if (!(getLexer().is(AsmToken::Identifier) &&
+        (getLexer().getTok().getIdentifier() == "z")))
+    return false;
+  Parser.Lex(); // Eat z
+  // Query and eat the '}' mark
+  if (!getLexer().is(AsmToken::RCurly))
+    return Error(getLexer().getLoc(), "Expected } at this point");
+  Parser.Lex(); // Eat '}'
+  // Assign Z with the {z} mark opernad
+  Z = X86Operand::CreateToken("{z}", StartLoc);
+  return false;
+}
+
+// true on failure, false otherwise
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
+                                       const MCParsedAsmOperand &Op) {
+  MCAsmParser &Parser = getParser();
+  if(getSTI().getFeatureBits()[X86::FeatureAVX512]) {
+    if (getLexer().is(AsmToken::LCurly)) {
+      // Eat "{" and mark the current place.
+      const SMLoc consumedToken = consumeToken();
+      // Distinguish {1to<NUM>} from {%k<NUM>}.
+      if(getLexer().is(AsmToken::Integer)) {
+        // Parse memory broadcasting ({1to<NUM>}).
+        if (getLexer().getTok().getIntVal() != 1)
+          return TokError("Expected 1to<NUM> at this point");
+        Parser.Lex();  // Eat "1" of 1to8
+        if (!getLexer().is(AsmToken::Identifier) ||
+            !getLexer().getTok().getIdentifier().startswith("to"))
+          return TokError("Expected 1to<NUM> at this point");
+        // Recognize only reasonable suffixes.
+        const char *BroadcastPrimitive =
+          StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+            .Case("to2",  "{1to2}")
+            .Case("to4",  "{1to4}")
+            .Case("to8",  "{1to8}")
+            .Case("to16", "{1to16}")
+            .Default(nullptr);
+        if (!BroadcastPrimitive)
+          return TokError("Invalid memory broadcast primitive.");
+        Parser.Lex();  // Eat "toN" of 1toN
+        if (!getLexer().is(AsmToken::RCurly))
+          return TokError("Expected } at this point");
+        Parser.Lex();  // Eat "}"
+        Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive,
+                                                   consumedToken));
+        // No AVX512 specific primitives can pass
+        // after memory broadcasting, so return.
+        return false;
+      } else {
+        // Parse either {k}{z}, {z}{k}, {k} or {z}
+        // last one have no meaning, but GCC accepts it
+        // Currently, we're just pass a '{' mark
+        std::unique_ptr<X86Operand> Z;
+        if (ParseZ(Z, consumedToken))
+          return true;
+        // Reaching here means that parsing of the allegadly '{z}' mark yielded
+        // no errors.
+        // Query for the need of further parsing for a {%k<NUM>} mark
+        if (!Z || getLexer().is(AsmToken::LCurly)) {
+          const SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+          // Parse an op-mask register mark ({%k<NUM>}), which is now to be
+          // expected
+          if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+            if (!getLexer().is(AsmToken::RCurly))
+              return Error(getLexer().getLoc(), "Expected } at this point");
+            Operands.push_back(X86Operand::CreateToken("{", StartLoc));
+            Operands.push_back(std::move(Op));
+            Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
+          } else
+            return Error(getLexer().getLoc(),
+                          "Expected an op-mask register at this point");
+          // {%k<NUM>} mark is found, inquire for {z}
+          if (getLexer().is(AsmToken::LCurly) && !Z) {
+            // Have we've found a parsing error, or found no (expected) {z} mark
+            // - report an error
+            if (ParseZ(Z, consumeToken()) || !Z)
+              return true;
+
+          }
+          // '{z}' on its own is meaningless, hence should be ignored.
+          // on the contrary - have it been accompanied by a K register,
+          // allow it.
+          if (Z)
+            Operands.push_back(std::move(Z));
+        }
+      }
+    }
+  }
+  return false;
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale).  The '%ds:' prefix
+/// has already been parsed if present.
+std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
+                                                          SMLoc MemStart) {
+
+  MCAsmParser &Parser = getParser();
+  // We have to disambiguate a parenthesized expression "(4+5)" from the start
+  // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
+  // only way to do this without lookahead is to eat the '(' and see what is
+  // after it.
+  const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext());
+  if (getLexer().isNot(AsmToken::LParen)) {
+    SMLoc ExprEnd;
+    if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
+
+    // After parsing the base expression we could either have a parenthesized
+    // memory address or not.  If not, return now.  If so, eat the (.
+    if (getLexer().isNot(AsmToken::LParen)) {
+      // Unless we have a segment register, treat this as an immediate.
+      if (SegReg == 0)
+        return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd);
+      return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                   MemStart, ExprEnd);
+    }
+
+    // Eat the '('.
+    Parser.Lex();
+  } else {
+    // Okay, we have a '('.  We don't know if this is an expression or not, but
+    // so we have to eat the ( to see beyond it.
+    SMLoc LParenLoc = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the '('.
+
+    if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+      // Nothing to do here, fall into the code below with the '(' part of the
+      // memory operand consumed.
+    } else {
+      SMLoc ExprEnd;
+
+      // It must be an parenthesized expression, parse it now.
+      if (getParser().parseParenExpression(Disp, ExprEnd))
+        return nullptr;
+
+      // After parsing the base expression we could either have a parenthesized
+      // memory address or not.  If not, return now.  If so, eat the (.
+      if (getLexer().isNot(AsmToken::LParen)) {
+        // Unless we have a segment register, treat this as an immediate.
+        if (SegReg == 0)
+          return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc,
+                                       ExprEnd);
+        return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
+                                     MemStart, ExprEnd);
+      }
+
+      // Eat the '('.
+      Parser.Lex();
+    }
+  }
+
+  // If we reached here, then we just ate the ( of the memory operand.  Process
+  // the rest of the memory operand.
+  unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+  SMLoc IndexLoc, BaseLoc;
+
+  if (getLexer().is(AsmToken::Percent)) {
+    SMLoc StartLoc, EndLoc;
+    BaseLoc = Parser.getTok().getLoc();
+    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
+    if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+      Error(StartLoc, "eiz and riz can only be used as index registers",
+            SMRange(StartLoc, EndLoc));
+      return nullptr;
+    }
+  }
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+    IndexLoc = Parser.getTok().getLoc();
+
+    // Following the comma we should have either an index register, or a scale
+    // value. We don't support the later form, but we want to parse it
+    // correctly.
+    //
+    // Not that even though it would be completely consistent to support syntax
+    // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+    if (getLexer().is(AsmToken::Percent)) {
+      SMLoc L;
+      if (ParseRegister(IndexReg, L, L))
+        return nullptr;
+      if (BaseReg == X86::RIP) {
+        Error(IndexLoc, "%rip as base register can not have an index register");
+        return nullptr;
+      }
+      if (IndexReg == X86::RIP) {
+        Error(IndexLoc, "%rip is not allowed as an index register");
+        return nullptr;
+      }
+
+      if (getLexer().isNot(AsmToken::RParen)) {
+        // Parse the scale amount:
+        //  ::= ',' [scale-expression]
+        if (getLexer().isNot(AsmToken::Comma)) {
+          Error(Parser.getTok().getLoc(),
+                "expected comma in scale expression");
+          return nullptr;
+        }
+        Parser.Lex(); // Eat the comma.
+
+        if (getLexer().isNot(AsmToken::RParen)) {
+          SMLoc Loc = Parser.getTok().getLoc();
+
+          int64_t ScaleVal;
+          if (getParser().parseAbsoluteExpression(ScaleVal)){
+            Error(Loc, "expected scale expression");
+            return nullptr;
+          }
+
+          // Validate the scale amount.
+          if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+              ScaleVal != 1) {
+            Error(Loc, "scale factor in 16-bit address must be 1");
+            return nullptr;
+          }
+          if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 &&
+              ScaleVal != 8) {
+            Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+            return nullptr;
+          }
+          Scale = (unsigned)ScaleVal;
+        }
+      }
+    } else if (getLexer().isNot(AsmToken::RParen)) {
+      // A scale amount without an index is ignored.
+      // index.
+      SMLoc Loc = Parser.getTok().getLoc();
+
+      int64_t Value;
+      if (getParser().parseAbsoluteExpression(Value))
+        return nullptr;
+
+      if (Value != 1)
+        Warning(Loc, "scale factor without index register is ignored");
+      Scale = 1;
+    }
+  }
+
+  // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+  if (getLexer().isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+    return nullptr;
+  }
+  SMLoc MemEnd = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the ')'.
+
+  // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed,
+  // and then only in non-64-bit modes. Except for DX, which is a special case
+  // because an unofficial form of in/out instructions uses it.
+  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
+      (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP &&
+                         BaseReg != X86::SI && BaseReg != X86::DI)) &&
+      BaseReg != X86::DX) {
+    Error(BaseLoc, "invalid 16-bit base register");
+    return nullptr;
+  }
+  if (BaseReg == 0 &&
+      X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
+    Error(IndexLoc, "16-bit memory operand may not include only index register");
+    return nullptr;
+  }
+
+  StringRef ErrMsg;
+  if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+    Error(BaseLoc, ErrMsg);
+    return nullptr;
+  }
+
+  if (SegReg || BaseReg || IndexReg)
+    return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+                                 IndexReg, Scale, MemStart, MemEnd);
+  return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd);
+}
+
+bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  InstInfo = &Info;
+  StringRef PatchedName = Name;
+
+  if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) {
+    StringRef NextTok = Parser.getTok().getString();
+    if (NextTok == "short") {
+      SMLoc NameEndLoc =
+          NameLoc.getFromPointer(NameLoc.getPointer() + Name.size());
+      // Eat the short keyword
+      Parser.Lex();
+      // MS ignores the short keyword, it determines the jmp type based
+      // on the distance of the label
+      InstInfo->AsmRewrites->emplace_back(AOK_Skip, NameEndLoc,
+                                          NextTok.size() + 1);
+    }
+  }
+
+  // FIXME: Hack to recognize setneb as setne.
+  if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+      PatchedName != "setb" && PatchedName != "setnb")
+    PatchedName = PatchedName.substr(0, Name.size()-1);
+
+  // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+  if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+      (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+       PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+    bool IsVCMP = PatchedName[0] == 'v';
+    unsigned CCIdx = IsVCMP ? 4 : 3;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(CCIdx, PatchedName.size() - 2))
+      .Case("eq",       0x00)
+      .Case("eq_oq",    0x00)
+      .Case("lt",       0x01)
+      .Case("lt_os",    0x01)
+      .Case("le",       0x02)
+      .Case("le_os",    0x02)
+      .Case("unord",    0x03)
+      .Case("unord_q",  0x03)
+      .Case("neq",      0x04)
+      .Case("neq_uq",   0x04)
+      .Case("nlt",      0x05)
+      .Case("nlt_us",   0x05)
+      .Case("nle",      0x06)
+      .Case("nle_us",   0x06)
+      .Case("ord",      0x07)
+      .Case("ord_q",    0x07)
+      /* AVX only from here */
+      .Case("eq_uq",    0x08)
+      .Case("nge",      0x09)
+      .Case("nge_us",   0x09)
+      .Case("ngt",      0x0A)
+      .Case("ngt_us",   0x0A)
+      .Case("false",    0x0B)
+      .Case("false_oq", 0x0B)
+      .Case("neq_oq",   0x0C)
+      .Case("ge",       0x0D)
+      .Case("ge_os",    0x0D)
+      .Case("gt",       0x0E)
+      .Case("gt_os",    0x0E)
+      .Case("true",     0x0F)
+      .Case("true_uq",  0x0F)
+      .Case("eq_os",    0x10)
+      .Case("lt_oq",    0x11)
+      .Case("le_oq",    0x12)
+      .Case("unord_s",  0x13)
+      .Case("neq_us",   0x14)
+      .Case("nlt_uq",   0x15)
+      .Case("nle_uq",   0x16)
+      .Case("ord_s",    0x17)
+      .Case("eq_us",    0x18)
+      .Case("nge_uq",   0x19)
+      .Case("ngt_uq",   0x1A)
+      .Case("false_os", 0x1B)
+      .Case("neq_os",   0x1C)
+      .Case("ge_oq",    0x1D)
+      .Case("gt_oq",    0x1E)
+      .Case("true_us",  0x1F)
+      .Default(~0U);
+    if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) {
+
+      Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
+                                                 NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - 2);
+    }
+  }
+
+  // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+  if (PatchedName.startswith("vpcmp") &&
+      (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+       PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+    unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(5, PatchedName.size() - CCIdx))
+      .Case("eq",    0x0) // Only allowed on unsigned. Checked below.
+      .Case("lt",    0x1)
+      .Case("le",    0x2)
+      //.Case("false", 0x3) // Not a documented alias.
+      .Case("neq",   0x4)
+      .Case("nlt",   0x5)
+      .Case("nle",   0x6)
+      //.Case("true",  0x7) // Not a documented alias.
+      .Default(~0U);
+    if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
+      Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+    }
+  }
+
+  // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}.
+  if (PatchedName.startswith("vpcom") &&
+      (PatchedName.endswith("b") || PatchedName.endswith("w") ||
+       PatchedName.endswith("d") || PatchedName.endswith("q"))) {
+    unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1;
+    unsigned ComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(5, PatchedName.size() - CCIdx))
+      .Case("lt",    0x0)
+      .Case("le",    0x1)
+      .Case("gt",    0x2)
+      .Case("ge",    0x3)
+      .Case("eq",    0x4)
+      .Case("neq",   0x5)
+      .Case("false", 0x6)
+      .Case("true",  0x7)
+      .Default(~0U);
+    if (ComparisonCode != ~0U) {
+      Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
+
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
+                                                   getParser().getContext());
+      Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
+
+      PatchedName = PatchedName.substr(PatchedName.size() - CCIdx);
+    }
+  }
+
+  Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+  // Determine whether this is an instruction prefix.
+  bool isPrefix =
+    Name == "lock" || Name == "rep" ||
+    Name == "repe" || Name == "repz" ||
+    Name == "repne" || Name == "repnz" ||
+    Name == "rex64" || Name == "data16";
+
+  bool CurlyAsEndOfStatement = false;
+  // This does the actual operand parsing.  Don't parse any more if we have a
+  // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+  // just want to parse the "lock" as the first instruction and the "incl" as
+  // the next one.
+  if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+    // Parse '*' modifier.
+    if (getLexer().is(AsmToken::Star))
+      Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+
+    // Read the operands.
+    while(1) {
+      if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+        Operands.push_back(std::move(Op));
+        if (HandleAVX512Operand(Operands, *Operands.back()))
+          return true;
+      } else {
+         return true;
+      }
+      // check for comma and eat it
+      if (getLexer().is(AsmToken::Comma))
+        Parser.Lex();
+      else
+        break;
+     }
+
+    // In MS inline asm curly braces mark the begining/end of a block, therefore
+    // they should be interepreted as end of statement
+    CurlyAsEndOfStatement =
+        isParsingIntelSyntax() && isParsingInlineAsm() &&
+        (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
+    if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
+      return TokError("unexpected token in argument list");
+   }
+
+  // Consume the EndOfStatement or the prefix separator Slash
+  if (getLexer().is(AsmToken::EndOfStatement) ||
+      (isPrefix && getLexer().is(AsmToken::Slash)))
+    Parser.Lex();
+  else if (CurlyAsEndOfStatement)
+    // Add an actual EndOfStatement before the curly brace
+    Info.AsmRewrites->emplace_back(AOK_EndOfStatement,
+                                   getLexer().getTok().getLoc(), 0);
+
+  // This is for gas compatibility and cannot be done in td.
+  // Adding "p" for some floating point with no argument.
+  // For example: fsub --> fsubp
+  bool IsFp =
+    Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr";
+  if (IsFp && Operands.size() == 1) {
+    const char *Repl = StringSwitch<const char *>(Name)
+      .Case("fsub", "fsubp")
+      .Case("fdiv", "fdivp")
+      .Case("fsubr", "fsubrp")
+      .Case("fdivr", "fdivrp");
+    static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl);
+  }
+
+  // Moving a 32 or 16 bit value into a segment register has the same
+  // behavior. Modify such instructions to always take shorter form.
+  if ((Name == "mov" || Name == "movw" || Name == "movl") &&
+      (Operands.size() == 3)) {
+    X86Operand &Op1 = (X86Operand &)*Operands[1];
+    X86Operand &Op2 = (X86Operand &)*Operands[2];
+    SMLoc Loc = Op1.getEndLoc();
+    if (Op1.isReg() && Op2.isReg() &&
+        X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(
+            Op2.getReg()) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(Op1.getReg()) ||
+         X86MCRegisterClasses[X86::GR32RegClassID].contains(Op1.getReg()))) {
+      // Change instruction name to match new instruction.
+      if (Name != "mov" && Name[3] == (is16BitMode() ? 'l' : 'w')) {
+        Name = is16BitMode() ? "movw" : "movl";
+        Operands[0] = X86Operand::CreateToken(Name, NameLoc);
+      }
+      // Select the correct equivalent 16-/32-bit source register.
+      unsigned Reg =
+          getX86SubSuperRegisterOrZero(Op1.getReg(), is16BitMode() ? 16 : 32);
+      Operands[1] = X86Operand::CreateReg(Reg, Loc, Loc);
+    }
+  }
+
+  // This is a terrible hack to handle "out[s]?[bwl]? %al, (%dx)" ->
+  // "outb %al, %dx".  Out doesn't take a memory form, but this is a widely
+  // documented form in various unofficial manuals, so a lot of code uses it.
+  if ((Name == "outb" || Name == "outsb" || Name == "outw" || Name == "outsw" ||
+       Name == "outl" || Name == "outsl" || Name == "out" || Name == "outs") &&
+      Operands.size() == 3) {
+    X86Operand &Op = (X86Operand &)*Operands.back();
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+    }
+  }
+  // Same hack for "in[s]?[bwl]? (%dx), %al" -> "inb %dx, %al".
+  if ((Name == "inb" || Name == "insb" || Name == "inw" || Name == "insw" ||
+       Name == "inl" || Name == "insl" || Name == "in" || Name == "ins") &&
+      Operands.size() == 3) {
+    X86Operand &Op = (X86Operand &)*Operands[1];
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+    }
+  }
+
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 2> TmpOperands;
+  bool HadVerifyError = false;
+
+  // Append default arguments to "ins[bwld]"
+  if (Name.startswith("ins") && 
+      (Operands.size() == 1 || Operands.size() == 3) &&
+      (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd" ||
+       Name == "ins")) {
+    
+    AddDefaultSrcDestOperands(TmpOperands,
+                              X86Operand::CreateReg(X86::DX, NameLoc, NameLoc),
+                              DefaultMemDIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Append default arguments to "outs[bwld]"
+  if (Name.startswith("outs") && 
+      (Operands.size() == 1 || Operands.size() == 3) &&
+      (Name == "outsb" || Name == "outsw" || Name == "outsl" ||
+       Name == "outsd" || Name == "outs")) {
+    AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+                              X86Operand::CreateReg(X86::DX, NameLoc, NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate
+  // values of $SIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("lods") &&
+      (Operands.size() == 1 || Operands.size() == 2) &&
+      (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+       Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) {
+    TmpOperands.push_back(DefaultMemSIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate
+  // values of $DIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("stos") &&
+      (Operands.size() == 1 || Operands.size() == 2) &&
+      (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+       Name == "stosl" || Name == "stosd" || Name == "stosq")) {
+    TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate
+  // values of $DIREG according to the mode. It would be nice if this
+  // could be achieved with InstAlias in the tables.
+  if (Name.startswith("scas") &&
+      (Operands.size() == 1 || Operands.size() == 2) &&
+      (Name == "scas" || Name == "scasb" || Name == "scasw" ||
+       Name == "scasl" || Name == "scasd" || Name == "scasq")) {
+    TmpOperands.push_back(DefaultMemDIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Add default SI and DI operands to "cmps[bwlq]".
+  if (Name.startswith("cmps") &&
+      (Operands.size() == 1 || Operands.size() == 3) &&
+      (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" ||
+       Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) {
+    AddDefaultSrcDestOperands(TmpOperands, DefaultMemDIOperand(NameLoc),
+                              DefaultMemSIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Add default SI and DI operands to "movs[bwlq]".
+  if (((Name.startswith("movs") &&
+        (Name == "movs" || Name == "movsb" || Name == "movsw" ||
+         Name == "movsl" || Name == "movsd" || Name == "movsq")) ||
+       (Name.startswith("smov") &&
+        (Name == "smov" || Name == "smovb" || Name == "smovw" ||
+         Name == "smovl" || Name == "smovd" || Name == "smovq"))) &&
+      (Operands.size() == 1 || Operands.size() == 3)) {
+    if (Name == "movsd" && Operands.size() == 1 && !isParsingIntelSyntax())
+      Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+    AddDefaultSrcDestOperands(TmpOperands, DefaultMemSIOperand(NameLoc),
+                              DefaultMemDIOperand(NameLoc));
+    HadVerifyError = VerifyAndAdjustOperands(Operands, TmpOperands);
+  }
+
+  // Check if we encountered an error for one the string insturctions
+  if (HadVerifyError) {
+    return HadVerifyError;
+  }
+
+  // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>.  Canonicalize to
+  // "shift <op>".
+  if ((Name.startswith("shr") || Name.startswith("sar") ||
+       Name.startswith("shl") || Name.startswith("sal") ||
+       Name.startswith("rcl") || Name.startswith("rcr") ||
+       Name.startswith("rol") || Name.startswith("ror")) &&
+      Operands.size() == 3) {
+    if (isParsingIntelSyntax()) {
+      // Intel syntax
+      X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]);
+      if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+          cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+        Operands.pop_back();
+    } else {
+      X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+      if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) &&
+          cast<MCConstantExpr>(Op1.getImm())->getValue() == 1)
+        Operands.erase(Operands.begin() + 1);
+    }
+  }
+
+  // Transforms "int $3" into "int3" as a size optimization.  We can't write an
+  // instalias with an immediate operand yet.
+  if (Name == "int" && Operands.size() == 2) {
+    X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+    if (Op1.isImm())
+      if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm()))
+        if (CE->getValue() == 3) {
+          Operands.erase(Operands.begin() + 1);
+          static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3");
+        }
+  }
+
+  // Transforms "xlat mem8" into "xlatb"
+  if ((Name == "xlat" || Name == "xlatb") && Operands.size() == 2) {
+    X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]);
+    if (Op1.isMem8()) {
+      Warning(Op1.getStartLoc(), "memory operand is only for determining the "
+                                 "size, (R|E)BX will be used for the location");
+      Operands.pop_back();
+      static_cast<X86Operand &>(*Operands[0]).setTokenValue("xlatb");
+    }
+  }
+
+  return false;
+}
+
+bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
+  return false;
+}
+
+static const char *getSubtargetFeatureName(uint64_t Val);
+
+void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+                                   MCStreamer &Out) {
+  Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(),
+                                                MII, Out);
+}
+
+bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                           OperandVector &Operands,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
+                                           bool MatchingInlineAsm) {
+  if (isParsingIntelSyntax())
+    return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+                                        MatchingInlineAsm);
+  return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+                                    MatchingInlineAsm);
+}
+
+void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
+                                     OperandVector &Operands, MCStreamer &Out,
+                                     bool MatchingInlineAsm) {
+  // FIXME: This should be replaced with a real .td file alias mechanism.
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+  // call.
+  const char *Repl = StringSwitch<const char *>(Op.getToken())
+                         .Case("finit", "fninit")
+                         .Case("fsave", "fnsave")
+                         .Case("fstcw", "fnstcw")
+                         .Case("fstcww", "fnstcw")
+                         .Case("fstenv", "fnstenv")
+                         .Case("fstsw", "fnstsw")
+                         .Case("fstsww", "fnstsw")
+                         .Case("fclex", "fnclex")
+                         .Default(nullptr);
+  if (Repl) {
+    MCInst Inst;
+    Inst.setOpcode(X86::WAIT);
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+  }
+}
+
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+                                       bool MatchingInlineAsm) {
+  assert(ErrorInfo && "Unknown missing feature!");
+  SmallString<126> Msg;
+  raw_svector_ostream OS(Msg);
+  OS << "instruction requires:";
+  uint64_t Mask = 1;
+  for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+    if (ErrorInfo & Mask)
+      OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
+    Mask <<= 1;
+  }
+  return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
+}
+
+bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+  SMRange EmptyRange = None;
+
+  // First, handle aliases that expand to multiple instructions.
+  MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+  bool WasOriginallyInvalidOperand = false;
+  MCInst Inst;
+
+  // First, try a direct match.
+  switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
+                           isParsingIntelSyntax())) {
+  default: llvm_unreachable("Unexpected match result!");
+  case Match_Success:
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the
+    // individual transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature:
+    return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
+  case Match_InvalidOperand:
+    WasOriginallyInvalidOperand = true;
+    break;
+  case Match_MnemonicFail:
+    break;
+  }
+
+  // FIXME: Ideally, we would only attempt suffix matches for things which are
+  // valid prefixes, and we could just infer the right unambiguous
+  // type. However, that requires substantially more matcher support than the
+  // following hack.
+
+  // Change the operand to point to a temporary token.
+  StringRef Base = Op.getToken();
+  SmallString<16> Tmp;
+  Tmp += Base;
+  Tmp += ' ';
+  Op.setTokenValue(Tmp);
+
+  // If this instruction starts with an 'f', then it is a floating point stack
+  // instruction.  These come in up to three forms for 32-bit, 64-bit, and
+  // 80-bit floating point, which use the suffixes s,l,t respectively.
+  //
+  // Otherwise, we assume that this may be an integer instruction, which comes
+  // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+  const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+  // Check for the various suffix matches.
+  uint64_t ErrorInfoIgnore;
+  uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+  unsigned Match[4];
+
+  for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
+    Tmp.back() = Suffixes[I];
+    Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+                                MatchingInlineAsm, isParsingIntelSyntax());
+    // If this returned as a missing feature failure, remember that.
+    if (Match[I] == Match_MissingFeature)
+      ErrorInfoMissingFeature = ErrorInfoIgnore;
+  }
+
+  // Restore the old token.
+  Op.setTokenValue(Base);
+
+  // If exactly one matched, then we treat that as a successful match (and the
+  // instruction will already have been filled in correctly, since the failing
+  // matches won't have modified it).
+  unsigned NumSuccessfulMatches =
+      std::count(std::begin(Match), std::end(Match), Match_Success);
+  if (NumSuccessfulMatches == 1) {
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  }
+
+  // Otherwise, the match failed, try to produce a decent error message.
+
+  // If we had multiple suffix matches, then identify this as an ambiguous
+  // match.
+  if (NumSuccessfulMatches > 1) {
+    char MatchChars[4];
+    unsigned NumMatches = 0;
+    for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
+      if (Match[I] == Match_Success)
+        MatchChars[NumMatches++] = Suffixes[I];
+
+    SmallString<126> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "ambiguous instructions require an explicit suffix (could be ";
+    for (unsigned i = 0; i != NumMatches; ++i) {
+      if (i != 0)
+        OS << ", ";
+      if (i + 1 == NumMatches)
+        OS << "or ";
+      OS << "'" << Base << MatchChars[i] << "'";
+    }
+    OS << ")";
+    Error(IDLoc, OS.str(), EmptyRange, MatchingInlineAsm);
+    return true;
+  }
+
+  // Okay, we know that none of the variants matched successfully.
+
+  // If all of the instructions reported an invalid mnemonic, then the original
+  // mnemonic was invalid.
+  if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
+    if (!WasOriginallyInvalidOperand) {
+      return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+                   Op.getLocRange(), MatchingInlineAsm);
+    }
+
+    // Recover location info for the operand if we know which was the problem.
+    if (ErrorInfo != ~0ULL) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction", EmptyRange,
+                     MatchingInlineAsm);
+
+      X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo];
+      if (Operand.getStartLoc().isValid()) {
+        SMRange OperandRange = Operand.getLocRange();
+        return Error(Operand.getStartLoc(), "invalid operand for instruction",
+                     OperandRange, MatchingInlineAsm);
+      }
+    }
+
+    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+                 MatchingInlineAsm);
+  }
+
+  // If one instruction matched with a missing feature, report this as a
+  // missing feature.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_MissingFeature) == 1) {
+    ErrorInfo = ErrorInfoMissingFeature;
+    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+                               MatchingInlineAsm);
+  }
+
+  // If one instruction matched with an invalid operand, report this as an
+  // operand failure.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_InvalidOperand) == 1) {
+    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+                 MatchingInlineAsm);
+  }
+
+  // If all of these were an outright failure, report it in a useless way.
+  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+        EmptyRange, MatchingInlineAsm);
+  return true;
+}
+
+unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size,
+    X86Operand* UnsizedMemOpNext) {
+  // Check for the existence of an AVX512 platform
+  if (!getSTI().getFeatureBits()[X86::FeatureAVX512])
+    return 0;
+  // Allow adjusting upon a (x|y|z)mm
+  if (Size == 512 || Size == 256 || Size == 128)
+    return Size;
+  // This is an allegadly broadcasting mem op adjustment,
+  // allow some more inquiring to validate it
+  if (Size == 64 || Size == 32)
+    return UnsizedMemOpNext && UnsizedMemOpNext->isToken() &&
+      UnsizedMemOpNext->getToken().substr(0, 4).equals("{1to") ? Size : 0;
+  // Do not allow any other type of adjustments
+  return 0;
+}
+
+bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                                OperandVector &Operands,
+                                                MCStreamer &Out,
+                                                uint64_t &ErrorInfo,
+                                                bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+  StringRef Mnemonic = Op.getToken();
+  SMRange EmptyRange = None;
+  StringRef Base = Op.getToken();
+
+  // First, handle aliases that expand to multiple instructions.
+  MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+  MCInst Inst;
+
+  // Find one unsized memory operand, if present.
+  X86Operand *UnsizedMemOp = nullptr;
+  // If unsized memory operand was found - obtain following operand.
+  // For use in AdjustAVX512Mem
+  X86Operand *UnsizedMemOpNext = nullptr;
+  for (const auto &Op : Operands) {
+    X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+    if (UnsizedMemOp) {
+      UnsizedMemOpNext = X86Op;
+      // Have we found an unqualified memory operand,
+      // break. IA allows only one memory operand.
+      break;
+    }
+    if (X86Op->isMemUnsized())
+      UnsizedMemOp = X86Op;
+  }
+
+  // Allow some instructions to have implicitly pointer-sized operands.  This is
+  // compatible with gas.
+  if (UnsizedMemOp) {
+    static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
+    for (const char *Instr : PtrSizedInstrs) {
+      if (Mnemonic == Instr) {
+        UnsizedMemOp->Mem.Size = getPointerWidth();
+        break;
+      }
+    }
+  }
+
+  SmallVector<unsigned, 8> Match;
+  uint64_t ErrorInfoMissingFeature = 0;
+
+  // If unsized push has immediate operand we should default the default pointer
+  // size for the size.
+  if (Mnemonic == "push" && Operands.size() == 2) {
+    auto *X86Op = static_cast<X86Operand *>(Operands[1].get());
+    if (X86Op->isImm()) {
+      // If it's not a constant fall through and let remainder take care of it.
+      const auto *CE = dyn_cast<MCConstantExpr>(X86Op->getImm());
+      unsigned Size = getPointerWidth();
+      if (CE &&
+          (isIntN(Size, CE->getValue()) || isUIntN(Size, CE->getValue()))) {
+        SmallString<16> Tmp;
+        Tmp += Base;
+        Tmp += (is64BitMode())
+                   ? "q"
+                   : (is32BitMode()) ? "l" : (is16BitMode()) ? "w" : " ";
+        Op.setTokenValue(Tmp);
+        // Do match in ATT mode to allow explicit suffix usage.
+        Match.push_back(MatchInstruction(Operands, Inst, ErrorInfo,
+                                         MatchingInlineAsm,
+                                         false /*isParsingIntelSyntax()*/));
+        Op.setTokenValue(Base);
+      }
+    }
+  }
+
+  // If an unsized memory operand is present, try to match with each memory
+  // operand size.  In Intel assembly, the size is not part of the instruction
+  // mnemonic.
+  unsigned MatchedSize = 0;
+  if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
+    static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
+    for (unsigned Size : MopSizes) {
+      UnsizedMemOp->Mem.Size = Size;
+      uint64_t ErrorInfoIgnore;
+      unsigned LastOpcode = Inst.getOpcode();
+      unsigned M = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
+                                    MatchingInlineAsm, isParsingIntelSyntax());
+      if (Match.empty() || LastOpcode != Inst.getOpcode())
+        Match.push_back(M);
+
+      // If this returned as a missing feature failure, remember that.
+      if (Match.back() == Match_MissingFeature)
+        ErrorInfoMissingFeature = ErrorInfoIgnore;
+      if (M == Match_Success)
+        // MS-compatability:
+        // Adjust AVX512 vector/broadcast memory operand,
+        // when facing the absence of a size qualifier.
+        // Match GCC behavior on respective cases.
+        MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext);
+    }
+
+    // Restore the size of the unsized memory operand if we modified it.
+    if (UnsizedMemOp)
+      UnsizedMemOp->Mem.Size = 0;
+  }
+
+  // If we haven't matched anything yet, this is not a basic integer or FPU
+  // operation.  There shouldn't be any ambiguity in our mnemonic table, so try
+  // matching with the unsized operand.
+  if (Match.empty()) {
+    Match.push_back(MatchInstruction(
+        Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax()));
+    // If this returned as a missing feature failure, remember that.
+    if (Match.back() == Match_MissingFeature)
+      ErrorInfoMissingFeature = ErrorInfo;
+  }
+
+  // Restore the size of the unsized memory operand if we modified it.
+  if (UnsizedMemOp)
+    UnsizedMemOp->Mem.Size = 0;
+
+  // If it's a bad mnemonic, all results will be the same.
+  if (Match.back() == Match_MnemonicFail) {
+    return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
+                 Op.getLocRange(), MatchingInlineAsm);
+  }
+
+  // If exactly one matched, then we treat that as a successful match (and the
+  // instruction will already have been filled in correctly, since the failing
+  // matches won't have modified it).
+  unsigned NumSuccessfulMatches =
+      std::count(std::begin(Match), std::end(Match), Match_Success);
+  if (NumSuccessfulMatches == 1) {
+    if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax())
+      // MS compatibility -
+      // Fix the rewrite according to the matched memory size
+      // MS inline assembly only
+      for (AsmRewrite &AR : *InstInfo->AsmRewrites)
+        if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) &&
+            (AR.Kind == AOK_SizeDirective))
+          AR.Val = MatchedSize;
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the individual
+    // transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  } else if (NumSuccessfulMatches > 1) {
+    assert(UnsizedMemOp &&
+           "multiple matches only possible with unsized memory operands");
+    return Error(UnsizedMemOp->getStartLoc(),
+                 "ambiguous operand size for instruction '" + Mnemonic + "\'",
+                 UnsizedMemOp->getLocRange(), MatchingInlineAsm);
+  }
+
+  // If one instruction matched with a missing feature, report this as a
+  // missing feature.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_MissingFeature) == 1) {
+    ErrorInfo = ErrorInfoMissingFeature;
+    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+                               MatchingInlineAsm);
+  }
+
+  // If one instruction matched with an invalid operand, report this as an
+  // operand failure.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_InvalidOperand) == 1) {
+    return Error(IDLoc, "invalid operand for instruction", EmptyRange,
+                 MatchingInlineAsm);
+  }
+
+  // If all of these were an outright failure, report it in a useless way.
+  return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
+               MatchingInlineAsm);
+}
+
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+  return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo);
+}
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+  MCAsmParser &Parser = getParser();
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  else if (IDVal.startswith(".code"))
+    return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+  else if (IDVal.startswith(".att_syntax")) {
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (Parser.getTok().getString() == "prefix")
+        Parser.Lex();
+      else if (Parser.getTok().getString() == "noprefix")
+        return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not "
+                                           "supported: registers must have a "
+                                           "'%' prefix in .att_syntax");
+    }
+    getParser().setAssemblerDialect(0);
+    return false;
+  } else if (IDVal.startswith(".intel_syntax")) {
+    getParser().setAssemblerDialect(1);
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (Parser.getTok().getString() == "noprefix")
+        Parser.Lex();
+      else if (Parser.getTok().getString() == "prefix")
+        return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not "
+                                           "supported: registers must not have "
+                                           "a '%' prefix in .intel_syntax");
+    }
+    return false;
+  } else if (IDVal == ".even")
+    return parseDirectiveEven(DirectiveID.getLoc());
+  return true;
+}
+
+/// parseDirectiveEven
+///  ::= .even
+bool X86AsmParser::parseDirectiveEven(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    TokError("unexpected token in directive");
+    return false;  
+  }
+  const MCSection *Section = getStreamer().getCurrentSectionOnly();
+  if (!Section) {
+    getStreamer().InitSections(false);
+    Section = getStreamer().getCurrentSectionOnly();
+  }
+  if (Section->UseCodeAlign())
+    getStreamer().EmitCodeAlignment(2, 0);
+  else
+    getStreamer().EmitValueToAlignment(2, 0, 1, 0);
+  return false;
+}
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      SMLoc ExprLoc = getLexer().getLoc();
+      if (getParser().parseExpression(Value))
+        return false;
+
+      if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) {
+        assert(Size <= 8 && "Invalid size");
+        uint64_t IntValue = MCE->getValue();
+        if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+          return Error(ExprLoc, "literal value out of range for directive");
+        getStreamer().EmitIntValue(IntValue, Size);
+      } else {
+        getStreamer().EmitValue(Value, Size, ExprLoc);
+      }
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma)) {
+        Error(L, "unexpected token in directive");
+        return false;
+      }
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code16 | .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  Code16GCC = false;
+  if (IDVal == ".code16") {
+    Parser.Lex();
+    if (!is16BitMode()) {
+      SwitchMode(X86::Mode16Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+    }
+  } else if (IDVal == ".code16gcc") {
+    // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
+    Parser.Lex();
+    Code16GCC = true;
+    if (!is16BitMode()) {
+      SwitchMode(X86::Mode16Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+    }
+  } else if (IDVal == ".code32") {
+    Parser.Lex();
+    if (!is32BitMode()) {
+      SwitchMode(X86::Mode32Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+    }
+  } else if (IDVal == ".code64") {
+    Parser.Lex();
+    if (!is64BitMode()) {
+      SwitchMode(X86::Mode64Bit);
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+    }
+  } else {
+    Error(L, "unknown directive " + IDVal);
+    return false;
+  }
+
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+  RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
+  RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
new file mode 100644
index 000000000000..c45a3f14ef11
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -0,0 +1,41 @@
+//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+
+inline bool isImmSExti16i8Value(uint64_t Value) {
+  return isInt<8>(Value) ||
+         (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value)));
+}
+
+inline bool isImmSExti32i8Value(uint64_t Value) {
+  return isInt<8>(Value) ||
+         (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value)));
+}
+
+inline bool isImmSExti64i8Value(uint64_t Value) {
+  return isInt<8>(Value);
+}
+
+inline bool isImmSExti64i32Value(uint64_t Value) {
+  return isInt<32>(Value);
+}
+
+inline bool isImmUnsignedi8Value(uint64_t Value) {
+  return isUInt<8>(Value) || isInt<8>(Value);
+}
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
new file mode 100644
index 000000000000..9db1a8483bee
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -0,0 +1,546 @@
+//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+
+#include "X86AsmParserCommon.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+
+namespace llvm {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  SMLoc OffsetOfLoc;
+  StringRef SymName;
+  void *OpDecl;
+  bool AddressOf;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned SegReg;
+    const MCExpr *Disp;
+    unsigned BaseReg;
+    unsigned IndexReg;
+    unsigned Scale;
+    unsigned Size;
+    unsigned ModeSize;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  X86Operand(KindTy K, SMLoc Start, SMLoc End)
+    : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+  StringRef getSymName() override { return SymName; }
+  void *getOpDecl() override { return OpDecl; }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+  /// getOffsetOfLoc - Get the location of the offset operator.
+  SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
+
+  void print(raw_ostream &OS) const override {}
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  void setTokenValue(StringRef Value) {
+    assert(Kind == Token && "Invalid access!");
+    Tok.Data = Value.data();
+    Tok.Length = Value.size();
+  }
+
+  unsigned getReg() const override {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNo;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getMemDisp() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Disp;
+  }
+  unsigned getMemSegReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.SegReg;
+  }
+  unsigned getMemBaseReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.BaseReg;
+  }
+  unsigned getMemIndexReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg;
+  }
+  unsigned getMemScale() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Scale;
+  }
+  unsigned getMemModeSize() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.ModeSize;
+  }
+
+  bool isToken() const override {return Kind == Token; }
+
+  bool isImm() const override { return Kind == Immediate; }
+
+  bool isImmSExti16i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti16i8Value(CE->getValue());
+  }
+  bool isImmSExti32i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti32i8Value(CE->getValue());
+  }
+  bool isImmSExti64i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i8Value(CE->getValue());
+  }
+  bool isImmSExti64i32() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i32Value(CE->getValue());
+  }
+
+  bool isImmUnsignedi8() const {
+    if (!isImm()) return false;
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return true;
+    return isImmUnsignedi8Value(CE->getValue());
+  }
+
+  bool isOffsetOf() const override {
+    return OffsetOfLoc.getPointer();
+  }
+
+  bool needAddressOf() const override {
+    return AddressOf;
+  }
+
+  bool isMem() const override { return Kind == Memory; }
+  bool isMemUnsized() const {
+    return Kind == Memory && Mem.Size == 0;
+  }
+  bool isMem8() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMem16() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMem32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMem64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMem80() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+  }
+  bool isMem128() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+  }
+  bool isMem256() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+  }
+  bool isMem512() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+  }
+  bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
+  }
+
+  bool isMem64_RC128() const {
+    return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem128_RC128() const {
+    return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem128_RC256() const {
+    return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM15);
+  }
+  bool isMem256_RC128() const {
+    return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM15);
+  }
+  bool isMem256_RC256() const {
+    return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM15);
+  }
+
+  bool isMem64_RC128X() const {
+    return isMem64() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem128_RC128X() const {
+    return isMem128() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem128_RC256X() const {
+    return isMem128() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem256_RC128X() const {
+    return isMem256() && isMemIndexReg(X86::XMM0, X86::XMM31);
+  }
+  bool isMem256_RC256X() const {
+    return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem512_RC256X() const {
+    return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
+  }
+  bool isMem512_RC512() const {
+    return isMem512() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+  }
+
+  bool isAbsMem() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1;
+  }
+  bool isAVX512RC() const{
+      return isImm();
+  }
+
+  bool isAbsMem16() const {
+    return isAbsMem() && Mem.ModeSize == 16;
+  }
+
+  bool isSrcIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+      (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI ||
+       getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) &&
+      cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isSrcIdx8() const {
+    return isMem8() && isSrcIdx();
+  }
+  bool isSrcIdx16() const {
+    return isMem16() && isSrcIdx();
+  }
+  bool isSrcIdx32() const {
+    return isMem32() && isSrcIdx();
+  }
+  bool isSrcIdx64() const {
+    return isMem64() && isSrcIdx();
+  }
+
+  bool isDstIdx() const {
+    return !getMemIndexReg() && getMemScale() == 1 &&
+      (getMemSegReg() == 0 || getMemSegReg() == X86::ES) &&
+      (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI ||
+       getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) &&
+      cast<MCConstantExpr>(getMemDisp())->getValue() == 0;
+  }
+  bool isDstIdx8() const {
+    return isMem8() && isDstIdx();
+  }
+  bool isDstIdx16() const {
+    return isMem16() && isDstIdx();
+  }
+  bool isDstIdx32() const {
+    return isMem32() && isDstIdx();
+  }
+  bool isDstIdx64() const {
+    return isMem64() && isDstIdx();
+  }
+
+  bool isMemOffs() const {
+    return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() &&
+      getMemScale() == 1;
+  }
+
+  bool isMemOffs16_8() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16_16() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs16_32() const {
+    return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs32_8() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs32_16() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32_32() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs32_64() const {
+    return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMemOffs64_8() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs64_16() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs64_32() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64_64() const {
+    return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
+  }
+
+  bool isReg() const override { return Kind == Register; }
+
+  bool isGR32orGR64() const {
+    return Kind == Register &&
+      (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+      X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getReg()));
+  }
+
+  static unsigned getGR32FromGR64(unsigned RegNo) {
+    switch (RegNo) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::RAX: return X86::EAX;
+    case X86::RCX: return X86::ECX;
+    case X86::RDX: return X86::EDX;
+    case X86::RBX: return X86::EBX;
+    case X86::RBP: return X86::EBP;
+    case X86::RSP: return X86::ESP;
+    case X86::RSI: return X86::ESI;
+    case X86::RDI: return X86::EDI;
+    case X86::R8: return X86::R8D;
+    case X86::R9: return X86::R9D;
+    case X86::R10: return X86::R10D;
+    case X86::R11: return X86::R11D;
+    case X86::R12: return X86::R12D;
+    case X86::R13: return X86::R13D;
+    case X86::R14: return X86::R14D;
+    case X86::R15: return X86::R15D;
+    case X86::RIP: return X86::EIP;
+    }
+  }
+
+  void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned RegNo = getReg();
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+      RegNo = getGR32FromGR64(RegNo);
+    Inst.addOperand(MCOperand::createReg(RegNo));
+  }
+  void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 5) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::createImm(getMemScale()));
+    Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+    addExpr(Inst, getMemDisp());
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+
+  void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+  }
+
+  void addSrcIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+  void addDstIdxOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+  }
+
+  void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 2) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(getMemDisp()));
+    Inst.addOperand(MCOperand::createReg(getMemSegReg()));
+  }
+
+  static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
+    SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+    auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  static std::unique_ptr<X86Operand>
+  CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+            bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
+            StringRef SymName = StringRef(), void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+    Res->Reg.RegNo = RegNo;
+    Res->AddressOf = AddressOf;
+    Res->OffsetOfLoc = OffsetOfLoc;
+    Res->SymName = SymName;
+    Res->OpDecl = OpDecl;
+    return Res;
+  }
+
+  static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
+                                               SMLoc StartLoc, SMLoc EndLoc) {
+    auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+    Res->Imm.Val = Val;
+    return Res;
+  }
+
+  /// Create an absolute memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+            unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = 0;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = 0;
+    Res->Mem.IndexReg = 0;
+    Res->Mem.Scale    = 1;
+    Res->Mem.Size     = Size;
+    Res->Mem.ModeSize = ModeSize;
+    Res->SymName      = SymName;
+    Res->OpDecl       = OpDecl;
+    Res->AddressOf    = false;
+    return Res;
+  }
+
+  /// Create a generalized memory operand.
+  static std::unique_ptr<X86Operand>
+  CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
+            unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
+            SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
+            void *OpDecl = nullptr) {
+    // We should never just have a displacement, that should be parsed as an
+    // absolute memory operand.
+    assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+    // The scale should always be one of {1,2,4,8}.
+    assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+           "Invalid scale!");
+    auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = SegReg;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = BaseReg;
+    Res->Mem.IndexReg = IndexReg;
+    Res->Mem.Scale    = Scale;
+    Res->Mem.Size     = Size;
+    Res->Mem.ModeSize = ModeSize;
+    Res->SymName      = SymName;
+    Res->OpDecl       = OpDecl;
+    Res->AddressOf    = false;
+    return Res;
+  }
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 000000000000..0871888bbfcd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,1083 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+//  MCInsts.
+//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets.  The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+//    These attributes, recorded in enum attributeBits
+//    (X86DisassemblerDecoderCommon.h), form a bitmask.  The table CONTEXTS_SYM
+//    provides a mapping from bitmasks to contexts, which are represented by
+//    enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is.  The
+//    disassembler distinguishes four kinds of opcodes, which are enumerated in
+//    OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+//    (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+//    (0x0f 0x3a 0xnn).  Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+//    (X86DisassemblerDecoderCommon.h).  Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode in.  Look up the opcode, to get
+//    a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+//    instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+//    ModR/M byte to complete decode.  The ModRMDecision's type is an entry from
+//    ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+//    ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+//    of type InstrUID (X86DisassemblerDecoderCommon.h).  Looking this ID up in
+//    INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+//    meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+//    (X86DisassemblerDecoderCommon.h) and its type is an entry from
+//    OperandType (ibid.).  The encoding indicates how to read it from the
+//    instruction; the type indicates how to interpret the value once it has
+//    been read.  For example, a register operand could be stored in the R/M
+//    field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode.  This is orthogonal from its meaning (an GPR or an XMM
+//    register, for instance).  Given this information, the operands can be
+//    extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+//    and operands into a format understandable by the client - in this case, an
+//    MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself.  The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+//   invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+//   table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+//   factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+//   responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86DisassemblerDecoder.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+                                  const char *s) {
+  dbgs() << file << ":" << line << ": " << s;
+}
+
+StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+                                                const void *mii) {
+  const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+  return MII->getName(Opcode);
+}
+
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
+
+namespace llvm {
+
+// Fill-ins to make the compiler happy.  These constants are never actually
+//   assigned; they are just filler to make an automatically-generated switch
+//   statement work.
+namespace X86 {
+  enum {
+    BX_SI = 500,
+    BX_DI = 501,
+    BP_SI = 502,
+    BP_DI = 503,
+    sib   = 504,
+    sib64 = 505
+  };
+}
+
+}
+
+static bool translateInstruction(MCInst &target,
+                                InternalInstruction &source,
+                                const MCDisassembler *Dis);
+
+namespace {
+
+/// Generic disassembler for all X86 platforms. All each platform class should
+/// have to do is subclass the constructor, and provide a different
+/// disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+  std::unique_ptr<const MCInstrInfo> MII;
+public:
+  X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                         std::unique_ptr<const MCInstrInfo> MII);
+public:
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
+
+private:
+  DisassemblerMode              fMode;
+};
+
+}
+
+X86GenericDisassembler::X86GenericDisassembler(
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx,
+                                         std::unique_ptr<const MCInstrInfo> MII)
+  : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
+  const FeatureBitset &FB = STI.getFeatureBits();
+  if (FB[X86::Mode16Bit]) {
+    fMode = MODE_16BIT;
+    return;
+  } else if (FB[X86::Mode32Bit]) {
+    fMode = MODE_32BIT;
+    return;
+  } else if (FB[X86::Mode64Bit]) {
+    fMode = MODE_64BIT;
+    return;
+  }
+
+  llvm_unreachable("Invalid CPU mode");
+}
+
+namespace {
+struct Region {
+  ArrayRef<uint8_t> Bytes;
+  uint64_t Base;
+  Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {}
+};
+} // end anonymous namespace
+
+/// A callback function that wraps the readByte method from Region.
+///
+/// @param Arg      - The generic callback parameter.  In this case, this should
+///                   be a pointer to a Region.
+/// @param Byte     - A pointer to the byte to be read.
+/// @param Address  - The address to be read.
+static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
+  auto *R = static_cast<const Region *>(Arg);
+  ArrayRef<uint8_t> Bytes = R->Bytes;
+  unsigned Index = Address - R->Base;
+  if (Bytes.size() <= Index)
+    return -1;
+  *Byte = Bytes[Index];
+  return 0;
+}
+
+/// logger - a callback function that wraps the operator<< method from
+///   raw_ostream.
+///
+/// @param arg      - The generic callback parameter.  This should be a pointe
+///                   to a raw_ostream.
+/// @param log      - A string to be logged.  logger() adds a newline.
+static void logger(void* arg, const char* log) {
+  if (!arg)
+    return;
+
+  raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+  vStream << log << "\n";
+}
+
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
+    MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &VStream, raw_ostream &CStream) const {
+  CommentStream = &CStream;
+
+  InternalInstruction InternalInstr;
+
+  dlog_t LoggerFn = logger;
+  if (&VStream == &nulls())
+    LoggerFn = nullptr; // Disable logging completely if it's going to nulls().
+
+  Region R(Bytes, Address);
+
+  int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R,
+                              LoggerFn, (void *)&VStream,
+                              (const void *)MII.get(), Address, fMode);
+
+  if (Ret) {
+    Size = InternalInstr.readerCursor - Address;
+    return Fail;
+  } else {
+    Size = InternalInstr.length;
+    return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail;
+  }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+///   register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst     - The MCInst to append to.
+/// @param reg        - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+  uint8_t llvmRegnums[] = {
+    ALL_REGS
+    0
+  };
+#undef ENTRY
+
+  uint8_t llvmRegnum = llvmRegnums[reg];
+  mcInst.addOperand(MCOperand::createReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.
+///
+/// @param Value      - The immediate Value, has had any PC adjustment made by
+///                     the caller.
+/// @param isBranch   - If the instruction is a branch instruction
+/// @param Address    - The starting address of the instruction
+/// @param Offset     - The byte offset to this immediate in the instruction
+/// @param Width      - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width.  If that
+/// returns non-zero then the symbolic information it returns is used to create
+/// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created.  This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI,
+                                     const MCDisassembler *Dis) {
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool.  The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry.  The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
+}
+
+static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+  0,        // SEG_OVERRIDE_NONE
+  X86::CS,
+  X86::SS,
+  X86::DS,
+  X86::ES,
+  X86::FS,
+  X86::GS
+};
+
+/// translateSrcIndex   - Appends a source index operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction.
+static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
+  unsigned baseRegNo;
+
+  if (insn.mode == MODE_64BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI;
+  else if (insn.mode == MODE_32BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI;
+  else {
+    assert(insn.mode == MODE_16BIT);
+    baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI;
+  }
+  MCOperand baseReg = MCOperand::createReg(baseRegNo);
+  mcInst.addOperand(baseReg);
+
+  MCOperand segmentReg;
+  segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+  mcInst.addOperand(segmentReg);
+  return false;
+}
+
+/// translateDstIndex   - Appends a destination index operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction.
+
+static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
+  unsigned baseRegNo;
+
+  if (insn.mode == MODE_64BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI;
+  else if (insn.mode == MODE_32BIT)
+    baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI;
+  else {
+    assert(insn.mode == MODE_16BIT);
+    baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI;
+  }
+  MCOperand baseReg = MCOperand::createReg(baseRegNo);
+  mcInst.addOperand(baseReg);
+  return false;
+}
+
+/// translateImmediate  - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param immediate    - The immediate value to append.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+                               const OperandSpecifier &operand,
+                               InternalInstruction &insn,
+                               const MCDisassembler *Dis) {
+  // Sign-extend the immediate if necessary.
+
+  OperandType type = (OperandType)operand.type;
+
+  bool isBranch = false;
+  uint64_t pcrel = 0;
+  if (type == TYPE_RELv) {
+    isBranch = true;
+    pcrel = insn.startLocation +
+            insn.immediateOffset + insn.immediateSize;
+    switch (insn.displacementSize) {
+    default:
+      break;
+    case 1:
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
+      break;
+    case 2:
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case 4:
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+    case 8:
+      break;
+    }
+  }
+  // By default sign-extend all X86 immediates based on their encoding.
+  else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+           type == TYPE_IMM64 || type == TYPE_IMMv) {
+    switch (operand.encoding) {
+    default:
+      break;
+    case ENCODING_IB:
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
+      break;
+    case ENCODING_IW:
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
+      break;
+    case ENCODING_ID:
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
+      break;
+    case ENCODING_IO:
+      break;
+    }
+  } else if (type == TYPE_IMM3) {
+    // Check for immediates that printSSECC can't handle.
+    if (immediate >= 8) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::CMPPDrmi:  NewOpc = X86::CMPPDrmi_alt;  break;
+      case X86::CMPPDrri:  NewOpc = X86::CMPPDrri_alt;  break;
+      case X86::CMPPSrmi:  NewOpc = X86::CMPPSrmi_alt;  break;
+      case X86::CMPPSrri:  NewOpc = X86::CMPPSrri_alt;  break;
+      case X86::CMPSDrm:   NewOpc = X86::CMPSDrm_alt;   break;
+      case X86::CMPSDrr:   NewOpc = X86::CMPSDrr_alt;   break;
+      case X86::CMPSSrm:   NewOpc = X86::CMPSSrm_alt;   break;
+      case X86::CMPSSrr:   NewOpc = X86::CMPSSrr_alt;   break;
+      case X86::VPCOMBri:  NewOpc = X86::VPCOMBri_alt;  break;
+      case X86::VPCOMBmi:  NewOpc = X86::VPCOMBmi_alt;  break;
+      case X86::VPCOMWri:  NewOpc = X86::VPCOMWri_alt;  break;
+      case X86::VPCOMWmi:  NewOpc = X86::VPCOMWmi_alt;  break;
+      case X86::VPCOMDri:  NewOpc = X86::VPCOMDri_alt;  break;
+      case X86::VPCOMDmi:  NewOpc = X86::VPCOMDmi_alt;  break;
+      case X86::VPCOMQri:  NewOpc = X86::VPCOMQri_alt;  break;
+      case X86::VPCOMQmi:  NewOpc = X86::VPCOMQmi_alt;  break;
+      case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break;
+      case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break;
+      case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break;
+      case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break;
+      case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break;
+      case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break;
+      case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break;
+      case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
+  } else if (type == TYPE_IMM5) {
+    // Check for immediates that printAVXCC can't handle.
+    if (immediate >= 32) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::VCMPPDrmi:   NewOpc = X86::VCMPPDrmi_alt;   break;
+      case X86::VCMPPDrri:   NewOpc = X86::VCMPPDrri_alt;   break;
+      case X86::VCMPPSrmi:   NewOpc = X86::VCMPPSrmi_alt;   break;
+      case X86::VCMPPSrri:   NewOpc = X86::VCMPPSrri_alt;   break;
+      case X86::VCMPSDrm:    NewOpc = X86::VCMPSDrm_alt;    break;
+      case X86::VCMPSDrr:    NewOpc = X86::VCMPSDrr_alt;    break;
+      case X86::VCMPSSrm:    NewOpc = X86::VCMPSSrm_alt;    break;
+      case X86::VCMPSSrr:    NewOpc = X86::VCMPSSrr_alt;    break;
+      case X86::VCMPPDYrmi:  NewOpc = X86::VCMPPDYrmi_alt;  break;
+      case X86::VCMPPDYrri:  NewOpc = X86::VCMPPDYrri_alt;  break;
+      case X86::VCMPPSYrmi:  NewOpc = X86::VCMPPSYrmi_alt;  break;
+      case X86::VCMPPSYrri:  NewOpc = X86::VCMPPSYrri_alt;  break;
+      case X86::VCMPPDZrmi:  NewOpc = X86::VCMPPDZrmi_alt;  break;
+      case X86::VCMPPDZrri:  NewOpc = X86::VCMPPDZrri_alt;  break;
+      case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break;
+      case X86::VCMPPSZrmi:  NewOpc = X86::VCMPPSZrmi_alt;  break;
+      case X86::VCMPPSZrri:  NewOpc = X86::VCMPPSZrri_alt;  break;
+      case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break;
+      case X86::VCMPPDZ128rmi:  NewOpc = X86::VCMPPDZ128rmi_alt;  break;
+      case X86::VCMPPDZ128rri:  NewOpc = X86::VCMPPDZ128rri_alt;  break;
+      case X86::VCMPPSZ128rmi:  NewOpc = X86::VCMPPSZ128rmi_alt;  break;
+      case X86::VCMPPSZ128rri:  NewOpc = X86::VCMPPSZ128rri_alt;  break;
+      case X86::VCMPPDZ256rmi:  NewOpc = X86::VCMPPDZ256rmi_alt;  break;
+      case X86::VCMPPDZ256rri:  NewOpc = X86::VCMPPDZ256rri_alt;  break;
+      case X86::VCMPPSZ256rmi:  NewOpc = X86::VCMPPSZ256rmi_alt;  break;
+      case X86::VCMPPSZ256rri:  NewOpc = X86::VCMPPSZ256rri_alt;  break;
+      case X86::VCMPSDZrm_Int:  NewOpc = X86::VCMPSDZrmi_alt;  break;
+      case X86::VCMPSDZrr_Int:  NewOpc = X86::VCMPSDZrri_alt;  break;
+      case X86::VCMPSDZrrb_Int: NewOpc = X86::VCMPSDZrrb_alt;  break;
+      case X86::VCMPSSZrm_Int:  NewOpc = X86::VCMPSSZrmi_alt;  break;
+      case X86::VCMPSSZrr_Int:  NewOpc = X86::VCMPSSZrri_alt;  break;
+      case X86::VCMPSSZrrb_Int: NewOpc = X86::VCMPSSZrrb_alt;  break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
+  } else if (type == TYPE_AVX512ICC) {
+    if (immediate >= 8 || ((immediate & 0x3) == 3)) {
+      unsigned NewOpc;
+      switch (mcInst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case X86::VPCMPBZ128rmi:    NewOpc = X86::VPCMPBZ128rmi_alt;    break;
+      case X86::VPCMPBZ128rmik:   NewOpc = X86::VPCMPBZ128rmik_alt;   break;
+      case X86::VPCMPBZ128rri:    NewOpc = X86::VPCMPBZ128rri_alt;    break;
+      case X86::VPCMPBZ128rrik:   NewOpc = X86::VPCMPBZ128rrik_alt;   break;
+      case X86::VPCMPBZ256rmi:    NewOpc = X86::VPCMPBZ256rmi_alt;    break;
+      case X86::VPCMPBZ256rmik:   NewOpc = X86::VPCMPBZ256rmik_alt;   break;
+      case X86::VPCMPBZ256rri:    NewOpc = X86::VPCMPBZ256rri_alt;    break;
+      case X86::VPCMPBZ256rrik:   NewOpc = X86::VPCMPBZ256rrik_alt;   break;
+      case X86::VPCMPBZrmi:       NewOpc = X86::VPCMPBZrmi_alt;       break;
+      case X86::VPCMPBZrmik:      NewOpc = X86::VPCMPBZrmik_alt;      break;
+      case X86::VPCMPBZrri:       NewOpc = X86::VPCMPBZrri_alt;       break;
+      case X86::VPCMPBZrrik:      NewOpc = X86::VPCMPBZrrik_alt;      break;
+      case X86::VPCMPDZ128rmi:    NewOpc = X86::VPCMPDZ128rmi_alt;    break;
+      case X86::VPCMPDZ128rmib:   NewOpc = X86::VPCMPDZ128rmib_alt;   break;
+      case X86::VPCMPDZ128rmibk:  NewOpc = X86::VPCMPDZ128rmibk_alt;  break;
+      case X86::VPCMPDZ128rmik:   NewOpc = X86::VPCMPDZ128rmik_alt;   break;
+      case X86::VPCMPDZ128rri:    NewOpc = X86::VPCMPDZ128rri_alt;    break;
+      case X86::VPCMPDZ128rrik:   NewOpc = X86::VPCMPDZ128rrik_alt;   break;
+      case X86::VPCMPDZ256rmi:    NewOpc = X86::VPCMPDZ256rmi_alt;    break;
+      case X86::VPCMPDZ256rmib:   NewOpc = X86::VPCMPDZ256rmib_alt;   break;
+      case X86::VPCMPDZ256rmibk:  NewOpc = X86::VPCMPDZ256rmibk_alt;  break;
+      case X86::VPCMPDZ256rmik:   NewOpc = X86::VPCMPDZ256rmik_alt;   break;
+      case X86::VPCMPDZ256rri:    NewOpc = X86::VPCMPDZ256rri_alt;    break;
+      case X86::VPCMPDZ256rrik:   NewOpc = X86::VPCMPDZ256rrik_alt;   break;
+      case X86::VPCMPDZrmi:       NewOpc = X86::VPCMPDZrmi_alt;       break;
+      case X86::VPCMPDZrmib:      NewOpc = X86::VPCMPDZrmib_alt;      break;
+      case X86::VPCMPDZrmibk:     NewOpc = X86::VPCMPDZrmibk_alt;     break;
+      case X86::VPCMPDZrmik:      NewOpc = X86::VPCMPDZrmik_alt;      break;
+      case X86::VPCMPDZrri:       NewOpc = X86::VPCMPDZrri_alt;       break;
+      case X86::VPCMPDZrrik:      NewOpc = X86::VPCMPDZrrik_alt;      break;
+      case X86::VPCMPQZ128rmi:    NewOpc = X86::VPCMPQZ128rmi_alt;    break;
+      case X86::VPCMPQZ128rmib:   NewOpc = X86::VPCMPQZ128rmib_alt;   break;
+      case X86::VPCMPQZ128rmibk:  NewOpc = X86::VPCMPQZ128rmibk_alt;  break;
+      case X86::VPCMPQZ128rmik:   NewOpc = X86::VPCMPQZ128rmik_alt;   break;
+      case X86::VPCMPQZ128rri:    NewOpc = X86::VPCMPQZ128rri_alt;    break;
+      case X86::VPCMPQZ128rrik:   NewOpc = X86::VPCMPQZ128rrik_alt;   break;
+      case X86::VPCMPQZ256rmi:    NewOpc = X86::VPCMPQZ256rmi_alt;    break;
+      case X86::VPCMPQZ256rmib:   NewOpc = X86::VPCMPQZ256rmib_alt;   break;
+      case X86::VPCMPQZ256rmibk:  NewOpc = X86::VPCMPQZ256rmibk_alt;  break;
+      case X86::VPCMPQZ256rmik:   NewOpc = X86::VPCMPQZ256rmik_alt;   break;
+      case X86::VPCMPQZ256rri:    NewOpc = X86::VPCMPQZ256rri_alt;    break;
+      case X86::VPCMPQZ256rrik:   NewOpc = X86::VPCMPQZ256rrik_alt;   break;
+      case X86::VPCMPQZrmi:       NewOpc = X86::VPCMPQZrmi_alt;       break;
+      case X86::VPCMPQZrmib:      NewOpc = X86::VPCMPQZrmib_alt;      break;
+      case X86::VPCMPQZrmibk:     NewOpc = X86::VPCMPQZrmibk_alt;     break;
+      case X86::VPCMPQZrmik:      NewOpc = X86::VPCMPQZrmik_alt;      break;
+      case X86::VPCMPQZrri:       NewOpc = X86::VPCMPQZrri_alt;       break;
+      case X86::VPCMPQZrrik:      NewOpc = X86::VPCMPQZrrik_alt;      break;
+      case X86::VPCMPUBZ128rmi:   NewOpc = X86::VPCMPUBZ128rmi_alt;   break;
+      case X86::VPCMPUBZ128rmik:  NewOpc = X86::VPCMPUBZ128rmik_alt;  break;
+      case X86::VPCMPUBZ128rri:   NewOpc = X86::VPCMPUBZ128rri_alt;   break;
+      case X86::VPCMPUBZ128rrik:  NewOpc = X86::VPCMPUBZ128rrik_alt;  break;
+      case X86::VPCMPUBZ256rmi:   NewOpc = X86::VPCMPUBZ256rmi_alt;   break;
+      case X86::VPCMPUBZ256rmik:  NewOpc = X86::VPCMPUBZ256rmik_alt;  break;
+      case X86::VPCMPUBZ256rri:   NewOpc = X86::VPCMPUBZ256rri_alt;   break;
+      case X86::VPCMPUBZ256rrik:  NewOpc = X86::VPCMPUBZ256rrik_alt;  break;
+      case X86::VPCMPUBZrmi:      NewOpc = X86::VPCMPUBZrmi_alt;      break;
+      case X86::VPCMPUBZrmik:     NewOpc = X86::VPCMPUBZrmik_alt;     break;
+      case X86::VPCMPUBZrri:      NewOpc = X86::VPCMPUBZrri_alt;      break;
+      case X86::VPCMPUBZrrik:     NewOpc = X86::VPCMPUBZrrik_alt;     break;
+      case X86::VPCMPUDZ128rmi:   NewOpc = X86::VPCMPUDZ128rmi_alt;   break;
+      case X86::VPCMPUDZ128rmib:  NewOpc = X86::VPCMPUDZ128rmib_alt;  break;
+      case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break;
+      case X86::VPCMPUDZ128rmik:  NewOpc = X86::VPCMPUDZ128rmik_alt;  break;
+      case X86::VPCMPUDZ128rri:   NewOpc = X86::VPCMPUDZ128rri_alt;   break;
+      case X86::VPCMPUDZ128rrik:  NewOpc = X86::VPCMPUDZ128rrik_alt;  break;
+      case X86::VPCMPUDZ256rmi:   NewOpc = X86::VPCMPUDZ256rmi_alt;   break;
+      case X86::VPCMPUDZ256rmib:  NewOpc = X86::VPCMPUDZ256rmib_alt;  break;
+      case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break;
+      case X86::VPCMPUDZ256rmik:  NewOpc = X86::VPCMPUDZ256rmik_alt;  break;
+      case X86::VPCMPUDZ256rri:   NewOpc = X86::VPCMPUDZ256rri_alt;   break;
+      case X86::VPCMPUDZ256rrik:  NewOpc = X86::VPCMPUDZ256rrik_alt;  break;
+      case X86::VPCMPUDZrmi:      NewOpc = X86::VPCMPUDZrmi_alt;      break;
+      case X86::VPCMPUDZrmib:     NewOpc = X86::VPCMPUDZrmib_alt;     break;
+      case X86::VPCMPUDZrmibk:    NewOpc = X86::VPCMPUDZrmibk_alt;    break;
+      case X86::VPCMPUDZrmik:     NewOpc = X86::VPCMPUDZrmik_alt;     break;
+      case X86::VPCMPUDZrri:      NewOpc = X86::VPCMPUDZrri_alt;      break;
+      case X86::VPCMPUDZrrik:     NewOpc = X86::VPCMPUDZrrik_alt;     break;
+      case X86::VPCMPUQZ128rmi:   NewOpc = X86::VPCMPUQZ128rmi_alt;   break;
+      case X86::VPCMPUQZ128rmib:  NewOpc = X86::VPCMPUQZ128rmib_alt;  break;
+      case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break;
+      case X86::VPCMPUQZ128rmik:  NewOpc = X86::VPCMPUQZ128rmik_alt;  break;
+      case X86::VPCMPUQZ128rri:   NewOpc = X86::VPCMPUQZ128rri_alt;   break;
+      case X86::VPCMPUQZ128rrik:  NewOpc = X86::VPCMPUQZ128rrik_alt;  break;
+      case X86::VPCMPUQZ256rmi:   NewOpc = X86::VPCMPUQZ256rmi_alt;   break;
+      case X86::VPCMPUQZ256rmib:  NewOpc = X86::VPCMPUQZ256rmib_alt;  break;
+      case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break;
+      case X86::VPCMPUQZ256rmik:  NewOpc = X86::VPCMPUQZ256rmik_alt;  break;
+      case X86::VPCMPUQZ256rri:   NewOpc = X86::VPCMPUQZ256rri_alt;   break;
+      case X86::VPCMPUQZ256rrik:  NewOpc = X86::VPCMPUQZ256rrik_alt;  break;
+      case X86::VPCMPUQZrmi:      NewOpc = X86::VPCMPUQZrmi_alt;      break;
+      case X86::VPCMPUQZrmib:     NewOpc = X86::VPCMPUQZrmib_alt;     break;
+      case X86::VPCMPUQZrmibk:    NewOpc = X86::VPCMPUQZrmibk_alt;    break;
+      case X86::VPCMPUQZrmik:     NewOpc = X86::VPCMPUQZrmik_alt;     break;
+      case X86::VPCMPUQZrri:      NewOpc = X86::VPCMPUQZrri_alt;      break;
+      case X86::VPCMPUQZrrik:     NewOpc = X86::VPCMPUQZrrik_alt;     break;
+      case X86::VPCMPUWZ128rmi:   NewOpc = X86::VPCMPUWZ128rmi_alt;   break;
+      case X86::VPCMPUWZ128rmik:  NewOpc = X86::VPCMPUWZ128rmik_alt;  break;
+      case X86::VPCMPUWZ128rri:   NewOpc = X86::VPCMPUWZ128rri_alt;   break;
+      case X86::VPCMPUWZ128rrik:  NewOpc = X86::VPCMPUWZ128rrik_alt;  break;
+      case X86::VPCMPUWZ256rmi:   NewOpc = X86::VPCMPUWZ256rmi_alt;   break;
+      case X86::VPCMPUWZ256rmik:  NewOpc = X86::VPCMPUWZ256rmik_alt;  break;
+      case X86::VPCMPUWZ256rri:   NewOpc = X86::VPCMPUWZ256rri_alt;   break;
+      case X86::VPCMPUWZ256rrik:  NewOpc = X86::VPCMPUWZ256rrik_alt;  break;
+      case X86::VPCMPUWZrmi:      NewOpc = X86::VPCMPUWZrmi_alt;      break;
+      case X86::VPCMPUWZrmik:     NewOpc = X86::VPCMPUWZrmik_alt;     break;
+      case X86::VPCMPUWZrri:      NewOpc = X86::VPCMPUWZrri_alt;      break;
+      case X86::VPCMPUWZrrik:     NewOpc = X86::VPCMPUWZrrik_alt;     break;
+      case X86::VPCMPWZ128rmi:    NewOpc = X86::VPCMPWZ128rmi_alt;    break;
+      case X86::VPCMPWZ128rmik:   NewOpc = X86::VPCMPWZ128rmik_alt;   break;
+      case X86::VPCMPWZ128rri:    NewOpc = X86::VPCMPWZ128rri_alt;    break;
+      case X86::VPCMPWZ128rrik:   NewOpc = X86::VPCMPWZ128rrik_alt;   break;
+      case X86::VPCMPWZ256rmi:    NewOpc = X86::VPCMPWZ256rmi_alt;    break;
+      case X86::VPCMPWZ256rmik:   NewOpc = X86::VPCMPWZ256rmik_alt;   break;
+      case X86::VPCMPWZ256rri:    NewOpc = X86::VPCMPWZ256rri_alt;    break;
+      case X86::VPCMPWZ256rrik:   NewOpc = X86::VPCMPWZ256rrik_alt;   break;
+      case X86::VPCMPWZrmi:       NewOpc = X86::VPCMPWZrmi_alt;       break;
+      case X86::VPCMPWZrmik:      NewOpc = X86::VPCMPWZrmik_alt;      break;
+      case X86::VPCMPWZrri:       NewOpc = X86::VPCMPWZrri_alt;       break;
+      case X86::VPCMPWZrrik:      NewOpc = X86::VPCMPWZrrik_alt;      break;
+      }
+      // Switch opcode to the one that doesn't get special printing.
+      mcInst.setOpcode(NewOpc);
+    }
+  }
+
+  switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+    mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM256:
+    mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM512:
+    mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
+    return;
+  case TYPE_BNDR:
+    mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
+  case TYPE_REL8:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if (immediate & 0x80)
+      immediate |= ~(0xffull);
+    break;
+  case TYPE_REL16:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if (immediate & 0x8000)
+      immediate |= ~(0xffffull);
+    break;
+  case TYPE_REL32:
+  case TYPE_REL64:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    if(immediate & 0x80000000)
+      immediate |= ~(0xffffffffull);
+    break;
+  default:
+    // operand is 64 bits wide.  Do nothing.
+    break;
+  }
+
+  if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                               insn.immediateOffset, insn.immediateSize,
+                               mcInst, Dis))
+    mcInst.addOperand(MCOperand::createImm(immediate));
+
+  if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
+      type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+    MCOperand segmentReg;
+    segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+    mcInst.addOperand(segmentReg);
+  }
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+///   ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction to extract the R/M field
+///                       from.
+/// @return             - 0 on success; -1 otherwise
+static bool translateRMRegister(MCInst &mcInst,
+                                InternalInstruction &insn) {
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    debug("A R/M register operand may not have a SIB byte");
+    return true;
+  }
+
+  switch (insn.eaBase) {
+  default:
+    debug("Unexpected EA base register");
+    return true;
+  case EA_BASE_NONE:
+    debug("EA_BASE_NONE for ModR/M base");
+    return true;
+#define ENTRY(x) case EA_BASE_##x:
+  ALL_EA_BASES
+#undef ENTRY
+    debug("A R/M register operand may not have a base; "
+          "the operand must be a register.");
+    return true;
+#define ENTRY(x)                                                      \
+  case EA_REG_##x:                                                    \
+    mcInst.addOperand(MCOperand::createReg(X86::x)); break;
+  ALL_REGS
+#undef ENTRY
+  }
+
+  return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+///   fields of an internal instruction (and possibly its SIB byte) to a memory
+///   operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+                              const MCDisassembler *Dis) {
+  // Addresses in an MCInst are represented as five operands:
+  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the
+  //                                SIB base
+  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified
+  //                                scale amount
+  //   3. indexreg      (register)  x86_registerNONE, or (if there is a SIB)
+  //                                the index (which is multiplied by the
+  //                                scale amount)
+  //   4. displacement  (immediate) 0, or the displacement if there is one
+  //   5. segmentreg    (register)  x86_registerNONE for now, but could be set
+  //                                if we have segment overrides
+
+  MCOperand baseReg;
+  MCOperand scaleAmount;
+  MCOperand indexReg;
+  MCOperand displacement;
+  MCOperand segmentReg;
+  uint64_t pcrel = 0;
+
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    if (insn.sibBase != SIB_BASE_NONE) {
+      switch (insn.sibBase) {
+      default:
+        debug("Unexpected sibBase");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_BASE_##x:                                  \
+        baseReg = MCOperand::createReg(X86::x); break;
+      ALL_SIB_BASES
+#undef ENTRY
+      }
+    } else {
+      baseReg = MCOperand::createReg(0);
+    }
+
+    // Check whether we are handling VSIB addressing mode for GATHER.
+    // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+    // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+    // I don't see a way to get the correct IndexReg in readSIB:
+    //   We can tell whether it is VSIB or SIB after instruction ID is decoded,
+    //   but instruction ID may not be decoded yet when calling readSIB.
+    uint32_t Opcode = mcInst.getOpcode();
+    bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+                       Opcode == X86::VGATHERDPDYrm ||
+                       Opcode == X86::VGATHERQPDrm ||
+                       Opcode == X86::VGATHERDPSrm ||
+                       Opcode == X86::VGATHERQPSrm ||
+                       Opcode == X86::VPGATHERDQrm ||
+                       Opcode == X86::VPGATHERDQYrm ||
+                       Opcode == X86::VPGATHERQQrm ||
+                       Opcode == X86::VPGATHERDDrm ||
+                       Opcode == X86::VPGATHERQDrm);
+    bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+                       Opcode == X86::VGATHERDPSYrm ||
+                       Opcode == X86::VGATHERQPSYrm ||
+                       Opcode == X86::VGATHERDPDZrm ||
+                       Opcode == X86::VPGATHERDQZrm ||
+                       Opcode == X86::VPGATHERQQYrm ||
+                       Opcode == X86::VPGATHERDDYrm ||
+                       Opcode == X86::VPGATHERQDYrm);
+    bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
+                       Opcode == X86::VGATHERDPSZrm ||
+                       Opcode == X86::VGATHERQPSZrm ||
+                       Opcode == X86::VPGATHERQQZrm ||
+                       Opcode == X86::VPGATHERDDZrm ||
+                       Opcode == X86::VPGATHERQDZrm);
+    if (IndexIs128 || IndexIs256 || IndexIs512) {
+      unsigned IndexOffset = insn.sibIndex -
+                         (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+      SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
+                           IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+      insn.sibIndex = (SIBIndex)(IndexBase +
+                           (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+    }
+
+    if (insn.sibIndex != SIB_INDEX_NONE) {
+      switch (insn.sibIndex) {
+      default:
+        debug("Unexpected sibIndex");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_INDEX_##x:                                 \
+        indexReg = MCOperand::createReg(X86::x); break;
+      EA_BASES_32BIT
+      EA_BASES_64BIT
+      REGS_XMM
+      REGS_YMM
+      REGS_ZMM
+#undef ENTRY
+      }
+    } else {
+      indexReg = MCOperand::createReg(0);
+    }
+
+    scaleAmount = MCOperand::createImm(insn.sibScale);
+  } else {
+    switch (insn.eaBase) {
+    case EA_BASE_NONE:
+      if (insn.eaDisplacement == EA_DISP_NONE) {
+        debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+        return true;
+      }
+      if (insn.mode == MODE_64BIT){
+        pcrel = insn.startLocation +
+                insn.displacementOffset + insn.displacementSize;
+        tryAddingPcLoadReferenceComment(insn.startLocation +
+                                        insn.displacementOffset,
+                                        insn.displacement + pcrel, Dis);
+        baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6
+      }
+      else
+        baseReg = MCOperand::createReg(0);
+
+      indexReg = MCOperand::createReg(0);
+      break;
+    case EA_BASE_BX_SI:
+      baseReg = MCOperand::createReg(X86::BX);
+      indexReg = MCOperand::createReg(X86::SI);
+      break;
+    case EA_BASE_BX_DI:
+      baseReg = MCOperand::createReg(X86::BX);
+      indexReg = MCOperand::createReg(X86::DI);
+      break;
+    case EA_BASE_BP_SI:
+      baseReg = MCOperand::createReg(X86::BP);
+      indexReg = MCOperand::createReg(X86::SI);
+      break;
+    case EA_BASE_BP_DI:
+      baseReg = MCOperand::createReg(X86::BP);
+      indexReg = MCOperand::createReg(X86::DI);
+      break;
+    default:
+      indexReg = MCOperand::createReg(0);
+      switch (insn.eaBase) {
+      default:
+        debug("Unexpected eaBase");
+        return true;
+        // Here, we will use the fill-ins defined above.  However,
+        //   BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+        //   sib and sib64 were handled in the top-level if, so they're only
+        //   placeholders to keep the compiler happy.
+#define ENTRY(x)                                        \
+      case EA_BASE_##x:                                 \
+        baseReg = MCOperand::createReg(X86::x); break;
+      ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+      ALL_REGS
+#undef ENTRY
+        debug("A R/M memory operand may not be a register; "
+              "the base field must be a base.");
+        return true;
+      }
+    }
+
+    scaleAmount = MCOperand::createImm(1);
+  }
+
+  displacement = MCOperand::createImm(insn.displacement);
+
+  segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
+
+  mcInst.addOperand(baseReg);
+  mcInst.addOperand(scaleAmount);
+  mcInst.addOperand(indexReg);
+  if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+                               insn.startLocation, insn.displacementOffset,
+                               insn.displacementSize, mcInst, Dis))
+    mcInst.addOperand(displacement);
+  mcInst.addOperand(segmentReg);
+  return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+///   byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+                        InternalInstruction &insn, const MCDisassembler *Dis) {
+  switch (operand.type) {
+  default:
+    debug("Unexpected type for a R/M operand");
+    return true;
+  case TYPE_R8:
+  case TYPE_R16:
+  case TYPE_R32:
+  case TYPE_R64:
+  case TYPE_Rv:
+  case TYPE_MM64:
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+  case TYPE_XMM256:
+  case TYPE_XMM512:
+  case TYPE_VK1:
+  case TYPE_VK2:
+  case TYPE_VK4:
+  case TYPE_VK8:
+  case TYPE_VK16:
+  case TYPE_VK32:
+  case TYPE_VK64:
+  case TYPE_DEBUGREG:
+  case TYPE_CONTROLREG:
+  case TYPE_BNDR:
+    return translateRMRegister(mcInst, insn);
+  case TYPE_M:
+  case TYPE_M8:
+  case TYPE_M16:
+  case TYPE_M32:
+  case TYPE_M64:
+  case TYPE_M128:
+  case TYPE_M256:
+  case TYPE_M512:
+  case TYPE_Mv:
+  case TYPE_M32FP:
+  case TYPE_M64FP:
+  case TYPE_M80FP:
+  case TYPE_M1616:
+  case TYPE_M1632:
+  case TYPE_M1664:
+  case TYPE_LEA:
+    return translateRMMemory(mcInst, insn, Dis);
+  }
+}
+
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param stackPos     - The stack position to translate.
+static void translateFPRegister(MCInst &mcInst,
+                                uint8_t stackPos) {
+  mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos));
+}
+
+/// translateMaskRegister - Translates a 3-bit mask register number to
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param maskRegNum   - Number of mask register from 0 to 7.
+/// @return             - false on success; true otherwise.
+static bool translateMaskRegister(MCInst &mcInst,
+                                uint8_t maskRegNum) {
+  if (maskRegNum >= 8) {
+    debug("Invalid mask register number");
+    return true;
+  }
+
+  mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum));
+  return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+///   to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+                             InternalInstruction &insn,
+                             const MCDisassembler *Dis) {
+  switch (operand.encoding) {
+  default:
+    debug("Unhandled operand encoding during translation");
+    return true;
+  case ENCODING_REG:
+    translateRegister(mcInst, insn.reg);
+    return false;
+  case ENCODING_WRITEMASK:
+    return translateMaskRegister(mcInst, insn.writemask);
+  CASE_ENCODING_RM:
+    return translateRM(mcInst, operand, insn, Dis);
+  case ENCODING_IB:
+  case ENCODING_IW:
+  case ENCODING_ID:
+  case ENCODING_IO:
+  case ENCODING_Iv:
+  case ENCODING_Ia:
+    translateImmediate(mcInst,
+                       insn.immediates[insn.numImmediatesTranslated++],
+                       operand,
+                       insn,
+                       Dis);
+    return false;
+  case ENCODING_SI:
+    return translateSrcIndex(mcInst, insn);
+  case ENCODING_DI:
+    return translateDstIndex(mcInst, insn);
+  case ENCODING_RB:
+  case ENCODING_RW:
+  case ENCODING_RD:
+  case ENCODING_RO:
+  case ENCODING_Rv:
+    translateRegister(mcInst, insn.opcodeRegister);
+    return false;
+  case ENCODING_FP:
+    translateFPRegister(mcInst, insn.modRM & 7);
+    return false;
+  case ENCODING_VVVV:
+    translateRegister(mcInst, insn.vvvv);
+    return false;
+  case ENCODING_DUP:
+    return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+                            insn, Dis);
+  }
+}
+
+/// translateInstruction - Translates an internal instruction and all its
+///   operands to an MCInst.
+///
+/// @param mcInst       - The MCInst to populate with the instruction's data.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+                                InternalInstruction &insn,
+                                const MCDisassembler *Dis) {
+  if (!insn.spec) {
+    debug("Instruction has no specification");
+    return true;
+  }
+
+  mcInst.clear();
+  mcInst.setOpcode(insn.instructionID);
+  // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
+  // prefix bytes should be disassembled as xrelease and xacquire then set the
+  // opcode to those instead of the rep and repne opcodes.
+  if (insn.xAcquireRelease) {
+    if(mcInst.getOpcode() == X86::REP_PREFIX)
+      mcInst.setOpcode(X86::XRELEASE_PREFIX);
+    else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+      mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+  }
+
+  insn.numImmediatesTranslated = 0;
+
+  for (const auto &Op : insn.operands) {
+    if (Op.encoding != ENCODING_NONE) {
+      if (translateOperand(mcInst, Op, insn, Dis)) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+static MCDisassembler *createX86Disassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+  return new X86GenericDisassembler(STI, Ctx, std::move(MII));
+}
+
+extern "C" void LLVMInitializeX86Disassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(),
+                                         createX86Disassembler);
+  TargetRegistry::RegisterMCDisassembler(getTheX86_64Target(),
+                                         createX86Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
new file mode 100644
index 000000000000..ab64d6fcf70b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -0,0 +1,1901 @@
+//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdarg>   /* for va_*()       */
+#include <cstdio>    /* for vsnprintf()  */
+#include <cstdlib>   /* for exit()       */
+#include <cstring>   /* for memset()     */
+
+#include "X86DisassemblerDecoder.h"
+
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to.  Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+  uint8_t modrm_type;
+  uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+  ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes).  Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes.  Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+  OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+#ifndef NDEBUG
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+/*
+ * contextForAttrs - Client for the instruction context table.  Takes a set of
+ *   attributes and returns the appropriate decode context.
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits.
+ * @return          - The InstructionContext to use when looking up an
+ *                    an instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint16_t attrMask) {
+  return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ *   the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - true if the ModR/M byte is required, false otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                         InstructionContext insnContext,
+                         uint16_t opcode) {
+  const struct ContextDecision* decision = nullptr;
+
+  switch (type) {
+  case ONEBYTE:
+    decision = &ONEBYTE_SYM;
+    break;
+  case TWOBYTE:
+    decision = &TWOBYTE_SYM;
+    break;
+  case THREEBYTE_38:
+    decision = &THREEBYTE38_SYM;
+    break;
+  case THREEBYTE_3A:
+    decision = &THREEBYTE3A_SYM;
+    break;
+  case XOP8_MAP:
+    decision = &XOP8_MAP_SYM;
+    break;
+  case XOP9_MAP:
+    decision = &XOP9_MAP_SYM;
+    break;
+  case XOPA_MAP:
+    decision = &XOPA_MAP_SYM;
+    break;
+  }
+
+  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+    modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ *   an instruction.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+                       InstructionContext insnContext,
+                       uint8_t opcode,
+                       uint8_t modRM) {
+  const struct ModRMDecision* dec = nullptr;
+
+  switch (type) {
+  case ONEBYTE:
+    dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case TWOBYTE:
+    dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_38:
+    dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_3A:
+    dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP8_MAP:
+    dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP9_MAP:
+    dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOPA_MAP:
+    dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  }
+
+  switch (dec->modrm_type) {
+  default:
+    debug("Corrupt table!  Unknown modrm_type");
+    return 0;
+  case MODRM_ONEENTRY:
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITRM:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+1];
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITREG:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_SPLITMISC:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_FULL:
+    return modRMTable[dec->instructionIDs+modRM];
+  }
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ *   that instruction.
+ *
+ * @param uid - The unique ID for the instruction.  This should be returned by
+ *              decode(); specifierForUID will not check bounds.
+ * @return    - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+  return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ *   byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn  - The instruction with the reader function to use.  The cursor
+ *                for this instruction is advanced.
+ * @param byte  - A pointer to a pre-allocated memory buffer to be populated
+ *                with the data read.
+ * @return      - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+  int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+  if (!ret)
+    ++(insn->readerCursor);
+
+  return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn  - See consumeByte().
+ * @param byte  - See consumeByte().
+ * @return      - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+  return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+  insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type)                                  \
+  static int name(struct InternalInstruction* insn, type* ptr) {  \
+    type combined = 0;                                            \
+    unsigned offset;                                              \
+    for (offset = 0; offset < sizeof(type); ++offset) {           \
+      uint8_t byte;                                               \
+      int ret = insn->reader(insn->readerArg,                     \
+                             &byte,                               \
+                             insn->readerCursor + offset);        \
+      if (ret)                                                    \
+        return ret;                                               \
+      combined = combined | ((uint64_t)byte << (offset * 8));     \
+    }                                                             \
+    *ptr = combined;                                              \
+    insn->readerCursor += sizeof(type);                           \
+    return 0;                                                     \
+  }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ *   values of various sizes from the instruction's memory and advance the
+ *   cursor appropriately.  These readers perform endian conversion.
+ *
+ * @param insn    - See consumeByte().
+ * @param ptr     - A pointer to a pre-allocated memory of appropriate size to
+ *                  be populated with the data read.
+ * @return        - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ *   message, typically without a carriage-return.
+ *
+ * @param insn    - The instruction containing the logging function.
+ * @param format  - See printf().
+ * @param ...     - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+                      const char* format,
+                      ...) {
+  char buffer[256];
+  va_list ap;
+
+  if (!insn->dlog)
+    return;
+
+  va_start(ap, format);
+  (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+  va_end(ap);
+
+  insn->dlog(insn->dlogArg, buffer);
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ *   location.
+ *
+ * @param insn      - The instruction to be marked as having the prefix.
+ * @param prefix    - The prefix that is present.
+ * @param location  - The location where the prefix is located (in the address
+ *                    space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+                                    uint8_t prefix,
+                                    uint64_t location)
+{
+  insn->prefixPresent[prefix] = 1;
+  insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ *   present at a given location.
+ *
+ * @param insn      - The instruction to be queried.
+ * @param prefix    - The prefix.
+ * @param location  - The location to query.
+ * @return          - Whether the prefix is at that location.
+ */
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
+                               uint8_t prefix,
+                               uint64_t location)
+{
+  return insn->prefixPresent[prefix] == 1 &&
+     insn->prefixLocations[prefix] == location;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ *   instruction as having them.  Also sets the instruction's default operand,
+ *   address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn  - The instruction whose prefixes are to be read.
+ * @return      - 0 if the instruction could be read until the end of the prefix
+ *                bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+  bool isPrefix = true;
+  bool prefixGroups[4] = { false };
+  uint64_t prefixLocation;
+  uint8_t byte = 0;
+  uint8_t nextByte;
+
+  bool hasAdSize = false;
+  bool hasOpSize = false;
+
+  dbgprintf(insn, "readPrefixes()");
+
+  while (isPrefix) {
+    prefixLocation = insn->readerCursor;
+
+    /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
+    if (consumeByte(insn, &byte))
+      break;
+
+    /*
+     * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+     * break and let it be disassembled as a normal "instruction".
+     */
+    if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+      break;
+
+    if (insn->readerCursor - 1 == insn->startLocation
+        && (byte == 0xf2 || byte == 0xf3)
+        && !lookAtByte(insn, &nextByte))
+    {
+      /*
+       * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+       * met:
+       * - it is followed by a LOCK (0xf0) prefix
+       * - it is followed by an xchg instruction
+       * then it should be disassembled as a xacquire/xrelease not repne/rep.
+       */
+      if ((byte == 0xf2 || byte == 0xf3) &&
+          ((nextByte == 0xf0) ||
+          ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+        insn->xAcquireRelease = true;
+      /*
+       * Also if the byte is 0xf3, and the following condition is met:
+       * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+       *                       "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+       * then it should be disassembled as an xrelease not rep.
+       */
+      if (byte == 0xf3 &&
+          (nextByte == 0x88 || nextByte == 0x89 ||
+           nextByte == 0xc6 || nextByte == 0xc7))
+        insn->xAcquireRelease = true;
+      if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+        if (consumeByte(insn, &nextByte))
+          return -1;
+        if (lookAtByte(insn, &nextByte))
+          return -1;
+        unconsumeByte(insn);
+      }
+      if (nextByte != 0x0f && nextByte != 0x90)
+        break;
+    }
+
+    switch (byte) {
+    case 0xf0:  /* LOCK */
+    case 0xf2:  /* REPNE/REPNZ */
+    case 0xf3:  /* REP or REPE/REPZ */
+      if (prefixGroups[0])
+        dbgprintf(insn, "Redundant Group 1 prefix");
+      prefixGroups[0] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x2e:  /* CS segment override -OR- Branch not taken */
+    case 0x36:  /* SS segment override -OR- Branch taken */
+    case 0x3e:  /* DS segment override */
+    case 0x26:  /* ES segment override */
+    case 0x64:  /* FS segment override */
+    case 0x65:  /* GS segment override */
+      switch (byte) {
+      case 0x2e:
+        insn->segmentOverride = SEG_OVERRIDE_CS;
+        break;
+      case 0x36:
+        insn->segmentOverride = SEG_OVERRIDE_SS;
+        break;
+      case 0x3e:
+        insn->segmentOverride = SEG_OVERRIDE_DS;
+        break;
+      case 0x26:
+        insn->segmentOverride = SEG_OVERRIDE_ES;
+        break;
+      case 0x64:
+        insn->segmentOverride = SEG_OVERRIDE_FS;
+        break;
+      case 0x65:
+        insn->segmentOverride = SEG_OVERRIDE_GS;
+        break;
+      default:
+        debug("Unhandled override");
+        return -1;
+      }
+      if (prefixGroups[1])
+        dbgprintf(insn, "Redundant Group 2 prefix");
+      prefixGroups[1] = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x66:  /* Operand-size override */
+      if (prefixGroups[2])
+        dbgprintf(insn, "Redundant Group 3 prefix");
+      prefixGroups[2] = true;
+      hasOpSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x67:  /* Address-size override */
+      if (prefixGroups[3])
+        dbgprintf(insn, "Redundant Group 4 prefix");
+      prefixGroups[3] = true;
+      hasAdSize = true;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    default:    /* Not a prefix byte */
+      isPrefix = false;
+      break;
+    }
+
+    if (isPrefix)
+      dbgprintf(insn, "Found prefix 0x%hhx", byte);
+  }
+
+  insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+  if (byte == 0x62) {
+    uint8_t byte1, byte2;
+
+    if (consumeByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
+      return -1;
+    }
+
+    if (lookAtByte(insn, &byte2)) {
+      dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+      return -1;
+    }
+
+    if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+       ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+      insn->vectorExtensionType = TYPE_EVEX;
+    } else {
+      unconsumeByte(insn); /* unconsume byte1 */
+      unconsumeByte(insn); /* unconsume byte  */
+      insn->necessaryPrefixLocation = insn->readerCursor - 2;
+    }
+
+    if (insn->vectorExtensionType == TYPE_EVEX) {
+      insn->vectorExtensionPrefix[0] = byte;
+      insn->vectorExtensionPrefix[1] = byte1;
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
+        dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
+        return -1;
+      }
+      if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
+        dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
+        return -1;
+      }
+
+      /* We simulate the REX prefix for simplicity's sake */
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+              insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+              insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
+    }
+  } else if (byte == 0xc4) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_3B;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_3B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+      consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  } else if (byte == 0xc5) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vectorExtensionType = TYPE_VEX_2B;
+    } else {
+      unconsumeByte(insn);
+    }
+
+    if (insn->vectorExtensionType == TYPE_VEX_2B) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+      }
+
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0],
+                insn->vectorExtensionPrefix[1]);
+    }
+  } else if (byte == 0x8f) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of XOP");
+      return -1;
+    }
+
+    if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+      insn->vectorExtensionType = TYPE_XOP;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vectorExtensionType == TYPE_XOP) {
+      insn->vectorExtensionPrefix[0] = byte;
+      consumeByte(insn, &insn->vectorExtensionPrefix[1]);
+      consumeByte(insn, &insn->vectorExtensionPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
+                        | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
+                        | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
+                        | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+      }
+
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = true;
+        break;
+      }
+
+      dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+                insn->vectorExtensionPrefix[2]);
+    }
+  } else {
+    if (insn->mode == MODE_64BIT) {
+      if ((byte & 0xf0) == 0x40) {
+        uint8_t opcodeByte;
+
+        if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+          dbgprintf(insn, "Redundant REX prefix");
+          return -1;
+        }
+
+        insn->rexPrefix = byte;
+        insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+        dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+      } else {
+        unconsumeByte(insn);
+        insn->necessaryPrefixLocation = insn->readerCursor - 1;
+      }
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+  }
+
+  if (insn->mode == MODE_16BIT) {
+    insn->registerSize       = (hasOpSize ? 4 : 2);
+    insn->addressSize        = (hasAdSize ? 4 : 2);
+    insn->displacementSize   = (hasAdSize ? 4 : 2);
+    insn->immediateSize      = (hasOpSize ? 4 : 2);
+  } else if (insn->mode == MODE_32BIT) {
+    insn->registerSize       = (hasOpSize ? 2 : 4);
+    insn->addressSize        = (hasAdSize ? 2 : 4);
+    insn->displacementSize   = (hasAdSize ? 2 : 4);
+    insn->immediateSize      = (hasOpSize ? 2 : 4);
+  } else if (insn->mode == MODE_64BIT) {
+    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+      insn->registerSize       = 8;
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = 4;
+      insn->immediateSize      = 4;
+    } else if (insn->rexPrefix) {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    } else {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ *   extended or escape opcodes).
+ *
+ * @param insn  - The instruction whose opcode is to be read.
+ * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  } else if (insn->vectorExtensionType == TYPE_XOP) {
+    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *                        instruction.
+ * @param insn          - The instruction whose ID is to be determined.
+ * @param attrMask      - The attribute mask to search.
+ * @return              - 0 if the ModR/M could be read when needed or was not
+ *                        needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+                             struct InternalInstruction* insn,
+                             uint16_t attrMask) {
+  bool hasModRMExtension;
+
+  InstructionContext instructionClass = contextForAttrs(attrMask);
+
+  hasModRMExtension = modRMRequired(insn->opcodeType,
+                                    instructionClass,
+                                    insn->opcode);
+
+  if (hasModRMExtension) {
+    if (readModRM(insn))
+      return -1;
+
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            insn->modRM);
+  } else {
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            0);
+  }
+
+  return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig  - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static bool is16BitEquivalent(const char *orig, const char *equiv) {
+  off_t i;
+
+  for (i = 0;; i++) {
+    if (orig[i] == '\0' && equiv[i] == '\0')
+      return true;
+    if (orig[i] == '\0' || equiv[i] == '\0')
+      return false;
+    if (orig[i] != equiv[i]) {
+      if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+        continue;
+      if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+        continue;
+      if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+        continue;
+      return false;
+    }
+  }
+}
+
+/*
+ * is64Bit - Determines whether this instruction is a 64-bit instruction.
+ *
+ * @param name - The instruction that is not 16-bit
+ */
+static bool is64Bit(const char *name) {
+  off_t i;
+
+  for (i = 0;; ++i) {
+    if (name[i] == '\0')
+      return false;
+    if (name[i] == '6' && name[i+1] == '4')
+      return true;
+  }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ *   appropriate for extended and escape opcodes.  Determines the attributes and
+ *   context for the instruction before doing so.
+ *
+ * @param insn  - The instruction whose ID is to be determined.
+ * @return      - 0 if the ModR/M could be read when needed or was not needed;
+ *                nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+  uint16_t attrMask;
+  uint16_t instructionID;
+
+  dbgprintf(insn, "getID()");
+
+  attrMask = ATTR_NONE;
+
+  if (insn->mode == MODE_64BIT)
+    attrMask |= ATTR_64BIT;
+
+  if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+    if (insn->vectorExtensionType == TYPE_EVEX) {
+      switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXKZ;
+      if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXB;
+      if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXK;
+      if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXL;
+      if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+        attrMask |= ATTR_EVEXL2;
+    } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+      switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+        attrMask |= ATTR_VEXL;
+    } else if (insn->vectorExtensionType == TYPE_XOP) {
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    } else {
+      return -1;
+    }
+  } else {
+    if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_OPSIZE;
+    else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_ADSIZE;
+    else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XS;
+    else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XD;
+  }
+
+  if (insn->rexPrefix & 0x08)
+    attrMask |= ATTR_REXW;
+
+  /*
+   * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+   * of the AdSize prefix is inverted w.r.t. 32-bit mode.
+   */
+  if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE &&
+      insn->opcode == 0xE3)
+    attrMask ^= ATTR_ADSIZE;
+
+  /*
+   * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix
+   * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes
+   */
+
+  if (insn->mode == MODE_64BIT &&
+      isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) {
+    switch (insn->opcode) {
+    case 0xE8:
+    case 0xE9:
+      // Take care of psubsb and other mmx instructions.
+      if (insn->opcodeType == ONEBYTE) {
+        attrMask ^= ATTR_OPSIZE;
+        insn->immediateSize = 4;
+        insn->displacementSize = 4;
+      }
+      break;
+    case 0x82:
+    case 0x83:
+    case 0x84:
+    case 0x85:
+    case 0x86:
+    case 0x87:
+    case 0x88:
+    case 0x89:
+    case 0x8A:
+    case 0x8B:
+    case 0x8C:
+    case 0x8D:
+    case 0x8E:
+    case 0x8F:
+      // Take care of lea and three byte ops.
+      if (insn->opcodeType == TWOBYTE) {
+        attrMask ^= ATTR_OPSIZE;
+        insn->immediateSize = 4;
+        insn->displacementSize = 4;
+      }
+      break;
+    }
+  }
+
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+
+  /* The following clauses compensate for limitations of the tables. */
+
+  if (insn->mode != MODE_64BIT &&
+      insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    /*
+     * The tables can't distinquish between cases where the W-bit is used to
+     * select register size and cases where its a required part of the opcode.
+     */
+    if ((insn->vectorExtensionType == TYPE_EVEX &&
+         wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_VEX_3B &&
+         wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_XOP &&
+         wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+      uint16_t instructionIDWithREXW;
+      if (getIDWithAttrMask(&instructionIDWithREXW,
+                            insn, attrMask | ATTR_REXW)) {
+        insn->instructionID = instructionID;
+        insn->spec = specifierForUID(instructionID);
+        return 0;
+      }
+
+      auto SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+      // If not a 64-bit instruction. Switch the opcode.
+      if (!is64Bit(SpecName.data())) {
+        insn->instructionID = instructionIDWithREXW;
+        insn->spec = specifierForUID(instructionIDWithREXW);
+        return 0;
+      }
+    }
+  }
+
+  /*
+   * Absolute moves need special handling.
+   * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
+   *  inverted w.r.t.
+   * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
+   *  any position.
+   */
+  if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+    /* Make sure we observed the prefixes in any position. */
+    if (insn->prefixPresent[0x67])
+      attrMask |= ATTR_ADSIZE;
+    if (insn->prefixPresent[0x66])
+      attrMask |= ATTR_OPSIZE;
+
+    /* In 16-bit, invert the attributes. */
+    if (insn->mode == MODE_16BIT)
+      attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+    if (getIDWithAttrMask(&instructionID, insn, attrMask))
+      return -1;
+
+    insn->instructionID = instructionID;
+    insn->spec = specifierForUID(instructionID);
+    return 0;
+  }
+
+  if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+      !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+     * particular spot (i.e., many MMX operations).  In general we're
+     * conservative, but in the specific case where OpSize is present but not
+     * in the right place we check if there's a 16-bit operation.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithOpsize;
+    llvm::StringRef specName, specWithOpSizeName;
+
+    spec = specifierForUID(instructionID);
+
+    if (getIDWithAttrMask(&instructionIDWithOpsize,
+                          insn,
+                          attrMask | ATTR_OPSIZE)) {
+      /*
+       * ModRM required with OpSize but not present; give up and return version
+       * without OpSize set
+       */
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specName = GetInstrName(instructionID, miiArg);
+    specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
+
+    if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
+        (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
+      insn->instructionID = instructionIDWithOpsize;
+      insn->spec = specifierForUID(instructionIDWithOpsize);
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+
+  if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+      insn->rexPrefix & 0x01) {
+    /*
+     * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+     * it should decode as XCHG %r8, %eax.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithNewOpcode;
+    const struct InstructionSpecifier *specWithNewOpcode;
+
+    spec = specifierForUID(instructionID);
+
+    /* Borrow opcode from one of the other XCHGar opcodes */
+    insn->opcode = 0x91;
+
+    if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+                          insn,
+                          attrMask)) {
+      insn->opcode = 0x90;
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+    /* Change back */
+    insn->opcode = 0x90;
+
+    insn->instructionID = instructionIDWithNewOpcode;
+    insn->spec = specWithNewOpcode;
+
+    return 0;
+  }
+
+  insn->instructionID = instructionID;
+  insn->spec = specifierForUID(insn->instructionID);
+
+  return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ *   instruction.
+ *
+ * @param insn  - The instruction whose SIB byte is to be read.
+ * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+  SIBIndex sibIndexBase = SIB_INDEX_NONE;
+  SIBBase sibBaseBase = SIB_BASE_NONE;
+  uint8_t index, base;
+
+  dbgprintf(insn, "readSIB()");
+
+  if (insn->consumedSIB)
+    return 0;
+
+  insn->consumedSIB = true;
+
+  switch (insn->addressSize) {
+  case 2:
+    dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+    return -1;
+  case 4:
+    sibIndexBase = SIB_INDEX_EAX;
+    sibBaseBase = SIB_BASE_EAX;
+    break;
+  case 8:
+    sibIndexBase = SIB_INDEX_RAX;
+    sibBaseBase = SIB_BASE_RAX;
+    break;
+  }
+
+  if (consumeByte(insn, &insn->sib))
+    return -1;
+
+  index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+  // FIXME: The fifth bit (bit index 4) is only to be used for instructions
+  // that understand VSIB indexing. ORing the bit in here is mildy dangerous
+  // because performing math on an 'enum SIBIndex' can produce garbage.
+  // Excluding the "none" value, it should cover 6 spaces of register names:
+  //   - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI
+  //   - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX
+  //   - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX
+  //   - 32 possibilities for each of XMM, YMM, ZMM registers
+  // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode,
+  // summing in a fully decoded index between 0 and 31 can end up with a value
+  // that looks like something in the low half of the XMM range.
+  // translateRMMemory() tries to reverse the damage, with only partial success,
+  // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt"
+  if (insn->vectorExtensionType == TYPE_EVEX)
+    index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
+
+  if (index == 0x4) {
+    insn->sibIndex = SIB_INDEX_NONE;
+  } else {
+    insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+  }
+
+  insn->sibScale = 1 << scaleFromSIB(insn->sib);
+
+  base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+  switch (base) {
+  case 0x5:
+  case 0xd:
+    switch (modFromModRM(insn->modRM)) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = SIB_BASE_NONE;
+      break;
+    case 0x1:
+      insn->eaDisplacement = EA_DISP_8;
+      insn->sibBase = (SIBBase)(sibBaseBase + base);
+      break;
+    case 0x2:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = (SIBBase)(sibBaseBase + base);
+      break;
+    case 0x3:
+      debug("Cannot have Mod = 0b11 and a SIB byte");
+      return -1;
+    }
+    break;
+  default:
+    insn->sibBase = (SIBBase)(sibBaseBase + base);
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn  - The instruction whose displacement is to be read.
+ * @return      - 0 if the displacement byte was successfully read; nonzero
+ *                otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+  int8_t d8;
+  int16_t d16;
+  int32_t d32;
+
+  dbgprintf(insn, "readDisplacement()");
+
+  if (insn->consumedDisplacement)
+    return 0;
+
+  insn->consumedDisplacement = true;
+  insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+  switch (insn->eaDisplacement) {
+  case EA_DISP_NONE:
+    insn->consumedDisplacement = false;
+    break;
+  case EA_DISP_8:
+    if (consumeInt8(insn, &d8))
+      return -1;
+    insn->displacement = d8;
+    break;
+  case EA_DISP_16:
+    if (consumeInt16(insn, &d16))
+      return -1;
+    insn->displacement = d16;
+    break;
+  case EA_DISP_32:
+    if (consumeInt32(insn, &d32))
+      return -1;
+    insn->displacement = d32;
+    break;
+  }
+
+  insn->consumedDisplacement = true;
+  return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ *   displacement) for an instruction and interprets it.
+ *
+ * @param insn  - The instruction whose addressing information is to be read.
+ * @return      - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+  uint8_t mod, rm, reg;
+
+  dbgprintf(insn, "readModRM()");
+
+  if (insn->consumedModRM)
+    return 0;
+
+  if (consumeByte(insn, &insn->modRM))
+    return -1;
+  insn->consumedModRM = true;
+
+  mod     = modFromModRM(insn->modRM);
+  rm      = rmFromModRM(insn->modRM);
+  reg     = regFromModRM(insn->modRM);
+
+  /*
+   * This goes by insn->registerSize to pick the correct register, which messes
+   * up if we're using (say) XMM or 8-bit register operands.  That gets fixed in
+   * fixupReg().
+   */
+  switch (insn->registerSize) {
+  case 2:
+    insn->regBase = MODRM_REG_AX;
+    insn->eaRegBase = EA_REG_AX;
+    break;
+  case 4:
+    insn->regBase = MODRM_REG_EAX;
+    insn->eaRegBase = EA_REG_EAX;
+    break;
+  case 8:
+    insn->regBase = MODRM_REG_RAX;
+    insn->eaRegBase = EA_REG_RAX;
+    break;
+  }
+
+  reg |= rFromREX(insn->rexPrefix) << 3;
+  rm  |= bFromREX(insn->rexPrefix) << 3;
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+    rm  |=  xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+  }
+
+  insn->reg = (Reg)(insn->regBase + reg);
+
+  switch (insn->addressSize) {
+  case 2:
+    insn->eaBaseBase = EA_BASE_BX_SI;
+
+    switch (mod) {
+    case 0x0:
+      if (rm == 0x6) {
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_16;
+        if (readDisplacement(insn))
+          return -1;
+      } else {
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        insn->eaDisplacement = EA_DISP_NONE;
+      }
+      break;
+    case 0x1:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_8;
+      insn->displacementSize = 1;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x2:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_16;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x3:
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    }
+    break;
+  case 4:
+  case 8:
+    insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+    switch (mod) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+      // In determining whether RIP-relative mode is used (rm=5),
+      // or whether a SIB byte is present (rm=4),
+      // the extension bits (REX.b and EVEX.x) are ignored.
+      switch (rm & 7) {
+      case 0x4: // SIB byte is present
+        insn->eaBase = (insn->addressSize == 4 ?
+                        EA_BASE_sib : EA_BASE_sib64);
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      case 0x5: // RIP-relative
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_32;
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        break;
+      }
+      break;
+    case 0x1:
+      insn->displacementSize = 1;
+      /* FALLTHROUGH */
+    case 0x2:
+      insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+      switch (rm & 7) {
+      case 0x4: // SIB byte is present
+        insn->eaBase = EA_BASE_sib;
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      }
+      break;
+    case 0x3:
+      insn->eaDisplacement = EA_DISP_NONE;
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      break;
+    }
+    break;
+  } /* switch (insn->addressSize) */
+
+  return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
+  static uint16_t name(struct InternalInstruction *insn,  \
+                       OperandType type,                  \
+                       uint8_t index,                     \
+                       uint8_t *valid) {                  \
+    *valid = 1;                                           \
+    switch (type) {                                       \
+    default:                                              \
+      debug("Unhandled register type");                   \
+      *valid = 0;                                         \
+      return 0;                                           \
+    case TYPE_Rv:                                         \
+      return base + index;                                \
+    case TYPE_R8:                                         \
+      if (insn->rexPrefix &&                              \
+         index >= 4 && index <= 7) {                      \
+        return prefix##_SPL + (index - 4);                \
+      } else {                                            \
+        return prefix##_AL + index;                       \
+      }                                                   \
+    case TYPE_R16:                                        \
+      return prefix##_AX + index;                         \
+    case TYPE_R32:                                        \
+      return prefix##_EAX + index;                        \
+    case TYPE_R64:                                        \
+      return prefix##_RAX + index;                        \
+    case TYPE_XMM512:                                     \
+      return prefix##_ZMM0 + index;                       \
+    case TYPE_XMM256:                                     \
+      return prefix##_YMM0 + index;                       \
+    case TYPE_XMM128:                                     \
+    case TYPE_XMM64:                                      \
+    case TYPE_XMM32:                                      \
+      return prefix##_XMM0 + index;                       \
+    case TYPE_VK1:                                        \
+    case TYPE_VK2:                                        \
+    case TYPE_VK4:                                        \
+    case TYPE_VK8:                                        \
+    case TYPE_VK16:                                       \
+    case TYPE_VK32:                                       \
+    case TYPE_VK64:                                       \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_K0 + index;                         \
+    case TYPE_MM64:                                       \
+      return prefix##_MM0 + (index & 0x7);                \
+    case TYPE_SEGMENTREG:                                 \
+      if (index > 5)                                      \
+        *valid = 0;                                       \
+      return prefix##_ES + index;                         \
+    case TYPE_DEBUGREG:                                   \
+      return prefix##_DR0 + index;                        \
+    case TYPE_CONTROLREG:                                 \
+      return prefix##_CR0 + index;                        \
+    case TYPE_BNDR:                                       \
+      if (index > 3)                                      \
+        *valid = 0;                                       \
+      return prefix##_BND0 + index;                       \
+    }                                                     \
+  }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field.  If the operand is an XMM operand, for example, an
+ *   operand would be XMM0 instead of AX, which readModRM() would otherwise
+ *   misinterpret it as.
+ *
+ * @param insn  - The instruction containing the operand.
+ * @param type  - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t.  The target is set to 1 if the
+ *                field is valid for the register class; 0 if not.
+ * @return      - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,    MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase,  EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ *   fixup*Value functions to use in correcting readModRM()'ss interpretation.
+ *
+ * @param insn  - See fixup*Value().
+ * @param op    - The operand specifier.
+ * @return      - 0 if fixup was successful; -1 if the register returned was
+ *                invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+                    const struct OperandSpecifier *op) {
+  uint8_t valid;
+
+  dbgprintf(insn, "fixupReg()");
+
+  switch ((OperandEncoding)op->encoding) {
+  default:
+    debug("Expected a REG or R/M encoding in fixupReg");
+    return -1;
+  case ENCODING_VVVV:
+    insn->vvvv = (Reg)fixupRegValue(insn,
+                                    (OperandType)op->type,
+                                    insn->vvvv,
+                                    &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_REG:
+    insn->reg = (Reg)fixupRegValue(insn,
+                                   (OperandType)op->type,
+                                   insn->reg - insn->regBase,
+                                   &valid);
+    if (!valid)
+      return -1;
+    break;
+  CASE_ENCODING_RM:
+    if (insn->eaBase >= insn->eaRegBase) {
+      insn->eaBase = (EABase)fixupRMValue(insn,
+                                          (OperandType)op->type,
+                                          insn->eaBase - insn->eaRegBase,
+                                          &valid);
+      if (!valid)
+        return -1;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ *   instruction and interprets it appropriately given the operand width.
+ *   Handles AddRegFrm instructions.
+ *
+ * @param insn  - the instruction whose opcode field is to be read.
+ * @param size  - The width (in bytes) of the register being specified.
+ *                1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ *                RAX.
+ * @return      - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readOpcodeRegister()");
+
+  if (size == 0)
+    size = insn->registerSize;
+
+  switch (size) {
+  case 1:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+                                                  | (insn->opcode & 7)));
+    if (insn->rexPrefix &&
+        insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+        insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
+    }
+
+    break;
+  case 2:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  case 4:
+    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  case 8:
+    insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | (insn->opcode & 7)));
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ *   desired operand size.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @param size  - The width (in bytes) of the operand.
+ * @return      - 0 if the immediate was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+  uint8_t imm8;
+  uint16_t imm16;
+  uint32_t imm32;
+  uint64_t imm64;
+
+  dbgprintf(insn, "readImmediate()");
+
+  if (insn->numImmediatesConsumed == 2) {
+    debug("Already consumed two immediates");
+    return -1;
+  }
+
+  if (size == 0)
+    size = insn->immediateSize;
+  else
+    insn->immediateSize = size;
+  insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+  switch (size) {
+  case 1:
+    if (consumeByte(insn, &imm8))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm8;
+    break;
+  case 2:
+    if (consumeUInt16(insn, &imm16))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm16;
+    break;
+  case 4:
+    if (consumeUInt32(insn, &imm32))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm32;
+    break;
+  case 8:
+    if (consumeUInt64(insn, &imm64))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm64;
+    break;
+  }
+
+  insn->numImmediatesConsumed++;
+
+  return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @return      - 0 if the vvvv was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readVVVV()");
+
+  int vvvv;
+  if (insn->vectorExtensionType == TYPE_EVEX)
+    vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+            vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+  else if (insn->vectorExtensionType == TYPE_VEX_3B)
+    vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+  else if (insn->vectorExtensionType == TYPE_VEX_2B)
+    vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+  else if (insn->vectorExtensionType == TYPE_XOP)
+    vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+  else
+    return -1;
+
+  if (insn->mode != MODE_64BIT)
+    vvvv &= 0x7;
+
+  insn->vvvv = static_cast<Reg>(vvvv);
+  return 0;
+}
+
+/*
+ * readMaskRegister - Reads an mask register from the opcode field of an
+ *   instruction.
+ *
+ * @param insn    - The instruction whose opcode field is to be read.
+ * @return        - 0 on success; nonzero otherwise.
+ */
+static int readMaskRegister(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readMaskRegister()");
+
+  if (insn->vectorExtensionType != TYPE_EVEX)
+    return -1;
+
+  insn->writemask =
+      static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+  return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ *   operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn  - The instruction whose operands are to be read and interpreted.
+ * @return      - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+  int hasVVVV, needVVVV;
+  int sawRegImm = 0;
+
+  dbgprintf(insn, "readOperands()");
+
+  /* If non-zero vvvv specified, need to make sure one of the operands
+     uses it. */
+  hasVVVV = !readVVVV(insn);
+  needVVVV = hasVVVV && (insn->vvvv != 0);
+
+  for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+    switch (Op.encoding) {
+    case ENCODING_NONE:
+    case ENCODING_SI:
+    case ENCODING_DI:
+      break;
+    case ENCODING_REG:
+    CASE_ENCODING_RM:
+      if (readModRM(insn))
+        return -1;
+      if (fixupReg(insn, &Op))
+        return -1;
+      // Apply the AVX512 compressed displacement scaling factor.
+      if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+        insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+      break;
+    case ENCODING_IB:
+      if (sawRegImm) {
+        /* Saw a register immediate so don't read again and instead split the
+           previous immediate.  FIXME: This is a hack. */
+        insn->immediates[insn->numImmediatesConsumed] =
+          insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+        ++insn->numImmediatesConsumed;
+        break;
+      }
+      if (readImmediate(insn, 1))
+        return -1;
+      if (Op.type == TYPE_XMM128 ||
+          Op.type == TYPE_XMM256)
+        sawRegImm = 1;
+      break;
+    case ENCODING_IW:
+      if (readImmediate(insn, 2))
+        return -1;
+      break;
+    case ENCODING_ID:
+      if (readImmediate(insn, 4))
+        return -1;
+      break;
+    case ENCODING_IO:
+      if (readImmediate(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Iv:
+      if (readImmediate(insn, insn->immediateSize))
+        return -1;
+      break;
+    case ENCODING_Ia:
+      if (readImmediate(insn, insn->addressSize))
+        return -1;
+      break;
+    case ENCODING_RB:
+      if (readOpcodeRegister(insn, 1))
+        return -1;
+      break;
+    case ENCODING_RW:
+      if (readOpcodeRegister(insn, 2))
+        return -1;
+      break;
+    case ENCODING_RD:
+      if (readOpcodeRegister(insn, 4))
+        return -1;
+      break;
+    case ENCODING_RO:
+      if (readOpcodeRegister(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Rv:
+      if (readOpcodeRegister(insn, 0))
+        return -1;
+      break;
+    case ENCODING_FP:
+      break;
+    case ENCODING_VVVV:
+      needVVVV = 0; /* Mark that we have found a VVVV operand. */
+      if (!hasVVVV)
+        return -1;
+      if (fixupReg(insn, &Op))
+        return -1;
+      break;
+    case ENCODING_WRITEMASK:
+      if (readMaskRegister(insn))
+        return -1;
+      break;
+    case ENCODING_DUP:
+      break;
+    default:
+      dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+      return -1;
+    }
+  }
+
+  /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+  if (needVVVV) return -1;
+
+  return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ *   user.
+ *
+ * @param insn      - A pointer to the instruction to be populated.  Must be
+ *                    pre-allocated.
+ * @param reader    - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ *                    any internal state.
+ * @param logger    - If non-NULL, the function to be used to write log messages
+ *                    and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ *                    any internal state.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ *                    decode the instruction in.
+ * @return          - 0 if the instruction's memory could be read; nonzero if
+ *                    not.
+ */
+int llvm::X86Disassembler::decodeInstruction(
+    struct InternalInstruction *insn, byteReader_t reader,
+    const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+    uint64_t startLoc, DisassemblerMode mode) {
+  memset(insn, 0, sizeof(struct InternalInstruction));
+
+  insn->reader = reader;
+  insn->readerArg = readerArg;
+  insn->dlog = logger;
+  insn->dlogArg = loggerArg;
+  insn->startLocation = startLoc;
+  insn->readerCursor = startLoc;
+  insn->mode = mode;
+  insn->numImmediatesConsumed = 0;
+
+  if (readPrefixes(insn)       ||
+      readOpcode(insn)         ||
+      getID(insn, miiArg)      ||
+      insn->instructionID == 0 ||
+      readOperands(insn))
+    return -1;
+
+  insn->operands = x86OperandSets[insn->spec->operands];
+
+  insn->length = insn->readerCursor - insn->startLocation;
+
+  dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+            startLoc, insn->readerCursor, insn->length);
+
+  if (insn->length > 15)
+    dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 000000000000..b07fd0b17d35
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,682 @@
+//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+
+#include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+// Accessor functions for various fields of an Intel instruction
+#define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM)   ((modRM) & 0x7)
+#define scaleFromSIB(sib)    (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib)    (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib)     ((sib) & 0x7)
+#define wFromREX(rex)        (((rex) & 0x8) >> 3)
+#define rFromREX(rex)        (((rex) & 0x4) >> 2)
+#define xFromREX(rex)        (((rex) & 0x2) >> 1)
+#define bFromREX(rex)        ((rex) & 0x1)
+
+#define rFromEVEX2of4(evex)     (((~(evex)) & 0x80) >> 7)
+#define xFromEVEX2of4(evex)     (((~(evex)) & 0x40) >> 6)
+#define bFromEVEX2of4(evex)     (((~(evex)) & 0x20) >> 5)
+#define r2FromEVEX2of4(evex)    (((~(evex)) & 0x10) >> 4)
+#define mmFromEVEX2of4(evex)    ((evex) & 0x3)
+#define wFromEVEX3of4(evex)     (((evex) & 0x80) >> 7)
+#define vvvvFromEVEX3of4(evex)  (((~(evex)) & 0x78) >> 3)
+#define ppFromEVEX3of4(evex)    ((evex) & 0x3)
+#define zFromEVEX4of4(evex)     (((evex) & 0x80) >> 7)
+#define l2FromEVEX4of4(evex)    (((evex) & 0x40) >> 6)
+#define lFromEVEX4of4(evex)     (((evex) & 0x20) >> 5)
+#define bFromEVEX4of4(evex)     (((evex) & 0x10) >> 4)
+#define v2FromEVEX4of4(evex)    (((~evex) & 0x8) >> 3)
+#define aaaFromEVEX4of4(evex)   ((evex) & 0x7)
+
+#define rFromVEX2of3(vex)       (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex)       (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex)       (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex)   ((vex) & 0x1f)
+#define wFromVEX3of3(vex)       (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex)      ((vex) & 0x3)
+
+#define rFromVEX2of2(vex)       (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex)      ((vex) & 0x3)
+
+#define rFromXOP2of3(xop)       (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop)       (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop)       (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop)   ((xop) & 0x1f)
+#define wFromXOP3of3(xop)       (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop)       (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop)      ((xop) & 0x3)
+
+// These enums represent Intel registers for use by the decoder.
+#define REGS_8BIT     \
+  ENTRY(AL)           \
+  ENTRY(CL)           \
+  ENTRY(DL)           \
+  ENTRY(BL)           \
+  ENTRY(AH)           \
+  ENTRY(CH)           \
+  ENTRY(DH)           \
+  ENTRY(BH)           \
+  ENTRY(R8B)          \
+  ENTRY(R9B)          \
+  ENTRY(R10B)         \
+  ENTRY(R11B)         \
+  ENTRY(R12B)         \
+  ENTRY(R13B)         \
+  ENTRY(R14B)         \
+  ENTRY(R15B)         \
+  ENTRY(SPL)          \
+  ENTRY(BPL)          \
+  ENTRY(SIL)          \
+  ENTRY(DIL)
+
+#define EA_BASES_16BIT  \
+  ENTRY(BX_SI)          \
+  ENTRY(BX_DI)          \
+  ENTRY(BP_SI)          \
+  ENTRY(BP_DI)          \
+  ENTRY(SI)             \
+  ENTRY(DI)             \
+  ENTRY(BP)             \
+  ENTRY(BX)             \
+  ENTRY(R8W)            \
+  ENTRY(R9W)            \
+  ENTRY(R10W)           \
+  ENTRY(R11W)           \
+  ENTRY(R12W)           \
+  ENTRY(R13W)           \
+  ENTRY(R14W)           \
+  ENTRY(R15W)
+
+#define REGS_16BIT    \
+  ENTRY(AX)           \
+  ENTRY(CX)           \
+  ENTRY(DX)           \
+  ENTRY(BX)           \
+  ENTRY(SP)           \
+  ENTRY(BP)           \
+  ENTRY(SI)           \
+  ENTRY(DI)           \
+  ENTRY(R8W)          \
+  ENTRY(R9W)          \
+  ENTRY(R10W)         \
+  ENTRY(R11W)         \
+  ENTRY(R12W)         \
+  ENTRY(R13W)         \
+  ENTRY(R14W)         \
+  ENTRY(R15W)
+
+#define EA_BASES_32BIT  \
+  ENTRY(EAX)            \
+  ENTRY(ECX)            \
+  ENTRY(EDX)            \
+  ENTRY(EBX)            \
+  ENTRY(sib)            \
+  ENTRY(EBP)            \
+  ENTRY(ESI)            \
+  ENTRY(EDI)            \
+  ENTRY(R8D)            \
+  ENTRY(R9D)            \
+  ENTRY(R10D)           \
+  ENTRY(R11D)           \
+  ENTRY(R12D)           \
+  ENTRY(R13D)           \
+  ENTRY(R14D)           \
+  ENTRY(R15D)
+
+#define REGS_32BIT  \
+  ENTRY(EAX)        \
+  ENTRY(ECX)        \
+  ENTRY(EDX)        \
+  ENTRY(EBX)        \
+  ENTRY(ESP)        \
+  ENTRY(EBP)        \
+  ENTRY(ESI)        \
+  ENTRY(EDI)        \
+  ENTRY(R8D)        \
+  ENTRY(R9D)        \
+  ENTRY(R10D)       \
+  ENTRY(R11D)       \
+  ENTRY(R12D)       \
+  ENTRY(R13D)       \
+  ENTRY(R14D)       \
+  ENTRY(R15D)
+
+#define EA_BASES_64BIT  \
+  ENTRY(RAX)            \
+  ENTRY(RCX)            \
+  ENTRY(RDX)            \
+  ENTRY(RBX)            \
+  ENTRY(sib64)          \
+  ENTRY(RBP)            \
+  ENTRY(RSI)            \
+  ENTRY(RDI)            \
+  ENTRY(R8)             \
+  ENTRY(R9)             \
+  ENTRY(R10)            \
+  ENTRY(R11)            \
+  ENTRY(R12)            \
+  ENTRY(R13)            \
+  ENTRY(R14)            \
+  ENTRY(R15)
+
+#define REGS_64BIT  \
+  ENTRY(RAX)        \
+  ENTRY(RCX)        \
+  ENTRY(RDX)        \
+  ENTRY(RBX)        \
+  ENTRY(RSP)        \
+  ENTRY(RBP)        \
+  ENTRY(RSI)        \
+  ENTRY(RDI)        \
+  ENTRY(R8)         \
+  ENTRY(R9)         \
+  ENTRY(R10)        \
+  ENTRY(R11)        \
+  ENTRY(R12)        \
+  ENTRY(R13)        \
+  ENTRY(R14)        \
+  ENTRY(R15)
+
+#define REGS_MMX  \
+  ENTRY(MM0)      \
+  ENTRY(MM1)      \
+  ENTRY(MM2)      \
+  ENTRY(MM3)      \
+  ENTRY(MM4)      \
+  ENTRY(MM5)      \
+  ENTRY(MM6)      \
+  ENTRY(MM7)
+
+#define REGS_XMM  \
+  ENTRY(XMM0)     \
+  ENTRY(XMM1)     \
+  ENTRY(XMM2)     \
+  ENTRY(XMM3)     \
+  ENTRY(XMM4)     \
+  ENTRY(XMM5)     \
+  ENTRY(XMM6)     \
+  ENTRY(XMM7)     \
+  ENTRY(XMM8)     \
+  ENTRY(XMM9)     \
+  ENTRY(XMM10)    \
+  ENTRY(XMM11)    \
+  ENTRY(XMM12)    \
+  ENTRY(XMM13)    \
+  ENTRY(XMM14)    \
+  ENTRY(XMM15)    \
+  ENTRY(XMM16)    \
+  ENTRY(XMM17)    \
+  ENTRY(XMM18)    \
+  ENTRY(XMM19)    \
+  ENTRY(XMM20)    \
+  ENTRY(XMM21)    \
+  ENTRY(XMM22)    \
+  ENTRY(XMM23)    \
+  ENTRY(XMM24)    \
+  ENTRY(XMM25)    \
+  ENTRY(XMM26)    \
+  ENTRY(XMM27)    \
+  ENTRY(XMM28)    \
+  ENTRY(XMM29)    \
+  ENTRY(XMM30)    \
+  ENTRY(XMM31)
+
+#define REGS_YMM  \
+  ENTRY(YMM0)     \
+  ENTRY(YMM1)     \
+  ENTRY(YMM2)     \
+  ENTRY(YMM3)     \
+  ENTRY(YMM4)     \
+  ENTRY(YMM5)     \
+  ENTRY(YMM6)     \
+  ENTRY(YMM7)     \
+  ENTRY(YMM8)     \
+  ENTRY(YMM9)     \
+  ENTRY(YMM10)    \
+  ENTRY(YMM11)    \
+  ENTRY(YMM12)    \
+  ENTRY(YMM13)    \
+  ENTRY(YMM14)    \
+  ENTRY(YMM15)    \
+  ENTRY(YMM16)    \
+  ENTRY(YMM17)    \
+  ENTRY(YMM18)    \
+  ENTRY(YMM19)    \
+  ENTRY(YMM20)    \
+  ENTRY(YMM21)    \
+  ENTRY(YMM22)    \
+  ENTRY(YMM23)    \
+  ENTRY(YMM24)    \
+  ENTRY(YMM25)    \
+  ENTRY(YMM26)    \
+  ENTRY(YMM27)    \
+  ENTRY(YMM28)    \
+  ENTRY(YMM29)    \
+  ENTRY(YMM30)    \
+  ENTRY(YMM31)
+
+#define REGS_ZMM  \
+  ENTRY(ZMM0)     \
+  ENTRY(ZMM1)     \
+  ENTRY(ZMM2)     \
+  ENTRY(ZMM3)     \
+  ENTRY(ZMM4)     \
+  ENTRY(ZMM5)     \
+  ENTRY(ZMM6)     \
+  ENTRY(ZMM7)     \
+  ENTRY(ZMM8)     \
+  ENTRY(ZMM9)     \
+  ENTRY(ZMM10)    \
+  ENTRY(ZMM11)    \
+  ENTRY(ZMM12)    \
+  ENTRY(ZMM13)    \
+  ENTRY(ZMM14)    \
+  ENTRY(ZMM15)    \
+  ENTRY(ZMM16)    \
+  ENTRY(ZMM17)    \
+  ENTRY(ZMM18)    \
+  ENTRY(ZMM19)    \
+  ENTRY(ZMM20)    \
+  ENTRY(ZMM21)    \
+  ENTRY(ZMM22)    \
+  ENTRY(ZMM23)    \
+  ENTRY(ZMM24)    \
+  ENTRY(ZMM25)    \
+  ENTRY(ZMM26)    \
+  ENTRY(ZMM27)    \
+  ENTRY(ZMM28)    \
+  ENTRY(ZMM29)    \
+  ENTRY(ZMM30)    \
+  ENTRY(ZMM31)
+
+#define REGS_MASKS \
+  ENTRY(K0)        \
+  ENTRY(K1)        \
+  ENTRY(K2)        \
+  ENTRY(K3)        \
+  ENTRY(K4)        \
+  ENTRY(K5)        \
+  ENTRY(K6)        \
+  ENTRY(K7)
+
+#define REGS_SEGMENT \
+  ENTRY(ES)          \
+  ENTRY(CS)          \
+  ENTRY(SS)          \
+  ENTRY(DS)          \
+  ENTRY(FS)          \
+  ENTRY(GS)
+
+#define REGS_DEBUG  \
+  ENTRY(DR0)        \
+  ENTRY(DR1)        \
+  ENTRY(DR2)        \
+  ENTRY(DR3)        \
+  ENTRY(DR4)        \
+  ENTRY(DR5)        \
+  ENTRY(DR6)        \
+  ENTRY(DR7)        \
+  ENTRY(DR8)        \
+  ENTRY(DR9)        \
+  ENTRY(DR10)       \
+  ENTRY(DR11)       \
+  ENTRY(DR12)       \
+  ENTRY(DR13)       \
+  ENTRY(DR14)       \
+  ENTRY(DR15)
+
+#define REGS_CONTROL  \
+  ENTRY(CR0)          \
+  ENTRY(CR1)          \
+  ENTRY(CR2)          \
+  ENTRY(CR3)          \
+  ENTRY(CR4)          \
+  ENTRY(CR5)          \
+  ENTRY(CR6)          \
+  ENTRY(CR7)          \
+  ENTRY(CR8)          \
+  ENTRY(CR9)          \
+  ENTRY(CR10)         \
+  ENTRY(CR11)         \
+  ENTRY(CR12)         \
+  ENTRY(CR13)         \
+  ENTRY(CR14)         \
+  ENTRY(CR15)
+
+#define REGS_BOUND    \
+  ENTRY(BND0)         \
+  ENTRY(BND1)         \
+  ENTRY(BND2)         \
+  ENTRY(BND3)
+
+#define ALL_EA_BASES  \
+  EA_BASES_16BIT      \
+  EA_BASES_32BIT      \
+  EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+  REGS_32BIT          \
+  REGS_64BIT
+
+#define ALL_REGS      \
+  REGS_8BIT           \
+  REGS_16BIT          \
+  REGS_32BIT          \
+  REGS_64BIT          \
+  REGS_MMX            \
+  REGS_XMM            \
+  REGS_YMM            \
+  REGS_ZMM            \
+  REGS_MASKS          \
+  REGS_SEGMENT        \
+  REGS_DEBUG          \
+  REGS_CONTROL        \
+  REGS_BOUND          \
+  ENTRY(RIP)
+
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
+  EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+  ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  EA_max
+};
+
+/// \brief All possible values of the SIB index field.
+/// borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM or YMM.
+enum SIBIndex {
+  SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+  ALL_EA_BASES
+  REGS_XMM
+  REGS_YMM
+  REGS_ZMM
+#undef ENTRY
+  SIB_INDEX_max
+};
+
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
+  SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+  ALL_SIB_BASES
+#undef ENTRY
+  SIB_BASE_max
+};
+
+/// \brief Possible displacement types for effective-address computations.
+typedef enum {
+  EA_DISP_NONE,
+  EA_DISP_8,
+  EA_DISP_16,
+  EA_DISP_32
+} EADisplacement;
+
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
+#define ENTRY(x) MODRM_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  MODRM_REG_max
+};
+
+/// \brief All possible segment overrides.
+enum SegmentOverride {
+  SEG_OVERRIDE_NONE,
+  SEG_OVERRIDE_CS,
+  SEG_OVERRIDE_SS,
+  SEG_OVERRIDE_DS,
+  SEG_OVERRIDE_ES,
+  SEG_OVERRIDE_FS,
+  SEG_OVERRIDE_GS,
+  SEG_OVERRIDE_max
+};
+
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
+  VEX_LOB_0F = 0x1,
+  VEX_LOB_0F38 = 0x2,
+  VEX_LOB_0F3A = 0x3
+};
+
+enum XOPMapSelect {
+  XOP_MAP_SELECT_8 = 0x8,
+  XOP_MAP_SELECT_9 = 0x9,
+  XOP_MAP_SELECT_A = 0xA
+};
+
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
+  VEX_PREFIX_NONE = 0x0,
+  VEX_PREFIX_66 = 0x1,
+  VEX_PREFIX_F3 = 0x2,
+  VEX_PREFIX_F2 = 0x3
+};
+
+enum VectorExtensionType {
+  TYPE_NO_VEX_XOP   = 0x0,
+  TYPE_VEX_2B       = 0x1,
+  TYPE_VEX_3B       = 0x2,
+  TYPE_EVEX         = 0x3,
+  TYPE_XOP          = 0x4
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg     A baton that the consumer can associate with any internal
+///                state that it needs.
+/// \param byte    A pointer to a single byte in memory that should be set to
+///                contain the value at address.
+/// \param address The address in the instruction's address space that should
+///                be read from.
+/// \return        -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+///            state that it needs.
+/// \param log A string that contains the message.  Will be reused after
+///            the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+  uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
+struct InternalInstruction {
+  // Reader interface (C)
+  byteReader_t reader;
+  // Opaque value passed to the reader
+  const void* readerArg;
+  // The address of the next byte to read via the reader
+  uint64_t readerCursor;
+
+  // Logger interface (C)
+  dlog_t dlog;
+  // Opaque value passed to the logger
+  void* dlogArg;
+
+  // General instruction information
+
+  // The mode to disassemble for (64-bit, protected, real)
+  DisassemblerMode mode;
+  // The start of the instruction, usable with the reader
+  uint64_t startLocation;
+  // The length of the instruction, in bytes
+  size_t length;
+
+  // Prefix state
+
+  // 1 if the prefix byte corresponding to the entry is present; 0 if not
+  uint8_t prefixPresent[0x100];
+  // contains the location (for use with the reader) of the prefix byte
+  uint64_t prefixLocations[0x100];
+  // The value of the vector extension prefix(EVEX/VEX/XOP), if present
+  uint8_t vectorExtensionPrefix[4];
+  // The type of the vector extension prefix
+  VectorExtensionType vectorExtensionType;
+  // The value of the REX prefix, if present
+  uint8_t rexPrefix;
+  // The location where a mandatory prefix would have to be (i.e., right before
+  // the opcode, or right before the REX prefix if one is present).
+  uint64_t necessaryPrefixLocation;
+  // The segment override type
+  SegmentOverride segmentOverride;
+  // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease
+  bool xAcquireRelease;
+
+  // Sizes of various critical pieces of data, in bytes
+  uint8_t registerSize;
+  uint8_t addressSize;
+  uint8_t displacementSize;
+  uint8_t immediateSize;
+
+  // Offsets from the start of the instruction to the pieces of data, which is
+  // needed to find relocation entries for adding symbolic operands.
+  uint8_t displacementOffset;
+  uint8_t immediateOffset;
+
+  // opcode state
+
+  // The last byte of the opcode, not counting any ModR/M extension
+  uint8_t opcode;
+
+  // decode state
+
+  // The type of opcode, used for indexing into the array of decode tables
+  OpcodeType opcodeType;
+  // The instruction ID, extracted from the decode table
+  uint16_t instructionID;
+  // The specifier for the instruction, from the instruction info table
+  const InstructionSpecifier *spec;
+
+  // state for additional bytes, consumed during operand decode.  Pattern:
+  // consumed___ indicates that the byte was already consumed and does not
+  // need to be consumed again.
+
+  // The VEX.vvvv field, which contains a third register operand for some AVX
+  // instructions.
+  Reg                           vvvv;
+
+  // The writemask for AVX-512 instructions which is contained in EVEX.aaa
+  Reg                           writemask;
+
+  // The ModR/M byte, which contains most register operands and some portion of
+  // all memory operands.
+  bool                          consumedModRM;
+  uint8_t                       modRM;
+
+  // The SIB byte, used for more complex 32- or 64-bit memory operands
+  bool                          consumedSIB;
+  uint8_t                       sib;
+
+  // The displacement, used for memory operands
+  bool                          consumedDisplacement;
+  int32_t                       displacement;
+
+  // Immediates.  There can be two in some cases
+  uint8_t                       numImmediatesConsumed;
+  uint8_t                       numImmediatesTranslated;
+  uint64_t                      immediates[2];
+
+  // A register or immediate operand encoded into the opcode
+  Reg                           opcodeRegister;
+
+  // Portions of the ModR/M byte
+
+  // These fields determine the allowable values for the ModR/M fields, which
+  // depend on operand and address widths.
+  EABase                        eaBaseBase;
+  EABase                        eaRegBase;
+  Reg                           regBase;
+
+  // The Mod and R/M fields can encode a base for an effective address, or a
+  // register.  These are separated into two fields here.
+  EABase                        eaBase;
+  EADisplacement                eaDisplacement;
+  // The reg field always encodes a register
+  Reg                           reg;
+
+  // SIB state
+  SIBIndex                      sibIndex;
+  uint8_t                       sibScale;
+  SIBBase                       sibBase;
+
+  ArrayRef<OperandSpecifier> operands;
+};
+
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn      The buffer to store the instruction in.  Allocated by the
+///                  consumer.
+/// \param reader    The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+///                  specific to the consumer.  May be NULL.
+/// \param logger    The dlog_t to be used in printing status messages from the
+///                  disassembler.  May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+///                  specific to the logger.  May be NULL.
+/// \param startLoc  The address (in the reader's address space) of the first
+///                  byte in the instruction.
+/// \param mode      The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return          Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
+                      byteReader_t reader,
+                      const void *readerArg,
+                      dlog_t logger,
+                      void *loggerArg,
+                      const void *miiArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode);
+
+/// \brief Print a message to debugs()
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s    The message to print.
+void Debug(const char *file, unsigned line, const char *s);
+
+StringRef GetInstrName(unsigned Opcode, const void *mii);
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 000000000000..0a835b876d90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,493 @@
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+//  generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace X86Disassembler {
+
+#define INSTRUCTIONS_SYM  x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM      x86DisassemblerContexts
+#define ONEBYTE_SYM       x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM       x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM   x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
+#define XOP8_MAP_SYM      x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM      x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM      x86DisassemblerXOPAOpcodes
+
+#define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR      "x86DisassemblerContexts"
+#define ONEBYTE_STR       "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR       "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR   "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
+#define XOP8_MAP_STR      "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR      "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR      "x86DisassemblerXOPAOpcodes"
+
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly.  Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+#define ATTRIBUTE_BITS                  \
+  ENUM_ENTRY(ATTR_NONE,   0x00)         \
+  ENUM_ENTRY(ATTR_64BIT,  (0x1 << 0))   \
+  ENUM_ENTRY(ATTR_XS,     (0x1 << 1))   \
+  ENUM_ENTRY(ATTR_XD,     (0x1 << 2))   \
+  ENUM_ENTRY(ATTR_REXW,   (0x1 << 3))   \
+  ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4))   \
+  ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5))   \
+  ENUM_ENTRY(ATTR_VEX,    (0x1 << 6))   \
+  ENUM_ENTRY(ATTR_VEXL,   (0x1 << 7))   \
+  ENUM_ENTRY(ATTR_EVEX,   (0x1 << 8))   \
+  ENUM_ENTRY(ATTR_EVEXL,  (0x1 << 9))   \
+  ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10))  \
+  ENUM_ENTRY(ATTR_EVEXK,  (0x1 << 11))  \
+  ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12))  \
+  ENUM_ENTRY(ATTR_EVEXB,  (0x1 << 13))
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+  ATTRIBUTE_BITS
+  ATTR_max
+};
+#undef ENUM_ENTRY
+
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
+
+//           Class name           Rank  Rationale for rank assignment
+#define INSTRUCTION_CONTEXTS                                                   \
+  ENUM_ENTRY(IC,                    0,  "says nothing about the instruction")  \
+  ENUM_ENTRY(IC_64BIT,              1,  "says the instruction applies in "     \
+                                        "64-bit mode but no more")             \
+  ENUM_ENTRY(IC_OPSIZE,             3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_ADSIZE,             3,  "requires an ADSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_OPSIZE_ADSIZE,      4,  "requires ADSIZE and OPSIZE prefixes") \
+  ENUM_ENTRY(IC_XD,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XD_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XS_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_64BIT_REXW,         5,  "requires a REX.W prefix, so operands "\
+                                        "change width; overrides IC_OPSIZE")   \
+  ENUM_ENTRY(IC_64BIT_REXW_ADSIZE,  6,  "requires a REX.W prefix and 0x67 "    \
+                                        "prefix")                              \
+  ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
+  ENUM_ENTRY(IC_64BIT_ADSIZE,       3,  "Just as meaningful as IC_ADSIZE")     \
+  ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/"     \
+                                        "IC_ADSIZE")                           \
+  ENUM_ENTRY(IC_64BIT_XD,           6,  "XD instructions are SSE; REX.W is "   \
+                                        "secondary")                           \
+  ENUM_ENTRY(IC_64BIT_XS,           6,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_XD_OPSIZE,    3,  "Just as meaningful as IC_XD_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_XS_OPSIZE,    3,  "Just as meaningful as IC_XS_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_REXW_XS,      7,  "OPSIZE could mean a different "       \
+                                        "opcode")                              \
+  ENUM_ENTRY(IC_64BIT_REXW_XD,      7,  "Just as meaningful as "               \
+                                        "IC_64BIT_REXW_XS")                    \
+  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  8,  "The Dynamic Duo!  Prefer over all "   \
+                                        "else because this changes most "      \
+                                        "operands' meaning")                   \
+  ENUM_ENTRY(IC_VEX,                1,  "requires a VEX prefix")               \
+  ENUM_ENTRY(IC_VEX_XS,             2,  "requires VEX and the XS prefix")      \
+  ENUM_ENTRY(IC_VEX_XD,             2,  "requires VEX and the XD prefix")      \
+  ENUM_ENTRY(IC_VEX_OPSIZE,         2,  "requires VEX and the OpSize prefix")  \
+  ENUM_ENTRY(IC_VEX_W,              3,  "requires VEX and the W prefix")       \
+  ENUM_ENTRY(IC_VEX_W_XS,           4,  "requires VEX, W, and XS prefix")      \
+  ENUM_ENTRY(IC_VEX_W_XD,           4,  "requires VEX, W, and XD prefix")      \
+  ENUM_ENTRY(IC_VEX_W_OPSIZE,       4,  "requires VEX, W, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L,              3,  "requires VEX and the L prefix")       \
+  ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L_W,            4,  "requires VEX, L and W")               \
+  ENUM_ENTRY(IC_VEX_L_W_XS,         5,  "requires VEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_XD,         5,  "requires VEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX,               1,  "requires an EVEX prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS,            2,  "requires EVEX and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD,            2,  "requires EVEX and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE,        2,  "requires EVEX and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W,             3,  "requires EVEX and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS,          4,  "requires EVEX, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD,          4,  "requires EVEX, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE,      4,  "requires EVEX, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L,             3,  "requires EVEX and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS,          4,  "requires EVEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD,          4,  "requires EVEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE,      4,  "requires EVEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W,           3,  "requires EVEX, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS,        4,  "requires EVEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD,        4,  "requires EVEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE,    4,  "requires EVEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2,            3,  "requires EVEX and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS,         4,  "requires EVEX and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD,         4,  "requires EVEX and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE,     4,  "requires EVEX, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W,          3,  "requires EVEX, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS,       4,  "requires EVEX, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD,       4,  "requires EVEX, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE,   4,  "requires EVEX, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K,             1,  "requires an EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K,          2,  "requires EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K,          2,  "requires EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K,      2,  "requires EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K,           3,  "requires EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K,        4,  "requires EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K,        4,  "requires EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K,    4,  "requires EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K,           3,  "requires EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K,        4,  "requires EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K,        4,  "requires EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K,    4,  "requires EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K,         3,  "requires EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K,      4,  "requires EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K,      4,  "requires EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K,  4,  "requires EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K,          3,  "requires EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K,       4,  "requires EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K,       4,  "requires EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K,   4,  "requires EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K,        3,  "requires EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K,     4,  "requires EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K,     4,  "requires EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4,  "requires EVEX_K, L2, W and OpSize")     \
+  ENUM_ENTRY(IC_EVEX_B,             1,  "requires an EVEX_B prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_B,          2,  "requires EVEX_B and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_B,          2,  "requires EVEX_B and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_B,      2,  "requires EVEX_B and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_B,           3,  "requires EVEX_B and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_B,        4,  "requires EVEX_B, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_B,        4,  "requires EVEX_B, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_B,    4,  "requires EVEX_B, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_B,           3,  "requires EVEX_B and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_B,        4,  "requires EVEX_B and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_B,        4,  "requires EVEX_B and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_B,    4,  "requires EVEX_B, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_B,         3,  "requires EVEX_B, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_B,      4,  "requires EVEX_B, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_B,      4,  "requires EVEX_B, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B,  4,  "requires EVEX_B, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_B,          3,  "requires EVEX_B and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_B,       4,  "requires EVEX_B and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_B,       4,  "requires EVEX_B and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B,   4,  "requires EVEX_B, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_B,        3,  "requires EVEX_B, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_B,     4,  "requires EVEX_B, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_B,     4,  "requires EVEX_B, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4,  "requires EVEX_B, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K_B,           1,  "requires EVEX_B and EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K_B,        2,  "requires EVEX_B, EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K_B,        2,  "requires EVEX_B, EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K_B,    2,  "requires EVEX_B, EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K_B,         3,  "requires EVEX_B, EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K_B,      4,  "requires EVEX_B, EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K_B,      4,  "requires EVEX_B, EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B,  4,  "requires EVEX_B, EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K_B,         3,  "requires EVEX_B, EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K_B,      4,  "requires EVEX_B, EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K_B,      4,  "requires EVEX_B, EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B,  4,  "requires EVEX_B, EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K_B,       3,  "requires EVEX_B, EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K_B,    4,  "requires EVEX_B, EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K_B,    4,  "requires EVEX_B, EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4,  "requires EVEX_B, EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K_B,        3,  "requires EVEX_B, EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K_B,     4,  "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K_B,     4,  "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4,  "requires EVEX_B, EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K_B,      3,  "requires EVEX_B, EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B,   4,  "requires EVEX_B, EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B,   4,  "requires EVEX_B, EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4,  "requires EVEX_B, EVEX_K, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ_B,           1,  "requires EVEX_B and EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ_B,        2,  "requires EVEX_B, EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ_B,        2,  "requires EVEX_B, EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B,    2,  "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ_B,          3,  "requires EVEX_B, EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B,   4,  "requires EVEX_B, EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ_B,        3,  "requires EVEX_B, EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4,  "requires EVEX_B, EVEX_KZ, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ,             1,  "requires an EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ,          2,  "requires EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ,          2,  "requires EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ,      2,  "requires EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ,           3,  "requires EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ,        4,  "requires EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ,        4,  "requires EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ,    4,  "requires EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ,           3,  "requires EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ,        4,  "requires EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ,        4,  "requires EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ,    4,  "requires EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ,         3,  "requires EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ,      4,  "requires EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ,      4,  "requires EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ,  4,  "requires EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ,          3,  "requires EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ,       4,  "requires EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ,       4,  "requires EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ,   4,  "requires EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ,        3,  "requires EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ,     4,  "requires EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ,     4,  "requires EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")
+
+#define ENUM_ENTRY(n, r, d) n,
+enum InstructionContext {
+  INSTRUCTION_CONTEXTS
+  IC_max
+};
+#undef ENUM_ENTRY
+
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and also for the decoder.
+enum OpcodeType {
+  ONEBYTE       = 0,
+  TWOBYTE       = 1,
+  THREEBYTE_38  = 2,
+  THREEBYTE_3A  = 3,
+  XOP8_MAP      = 4,
+  XOP9_MAP      = 5,
+  XOPA_MAP      = 6
+};
+
+// The following structs are used for the hierarchical decode table.  After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode.  Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
+typedef uint16_t InstrUID;
+
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+//                  instruction is the same.
+// MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+//                  corresponds to one instruction; otherwise, it corresponds to
+//                  a different instruction.
+// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+//                  divided by 8 is used to select instruction; otherwise, each
+//                  value of the ModR/M byte could correspond to a different
+//                  instruction.
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+//                  corresponds to instructions that use reg field as opcode
+// MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
+//                  to a different instruction.
+#define MODRMTYPES            \
+  ENUM_ENTRY(MODRM_ONEENTRY)  \
+  ENUM_ENTRY(MODRM_SPLITRM)   \
+  ENUM_ENTRY(MODRM_SPLITMISC)  \
+  ENUM_ENTRY(MODRM_SPLITREG)  \
+  ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+enum ModRMDecisionType {
+  MODRMTYPES
+  MODRM_max
+};
+#undef ENUM_ENTRY
+
+#define CASE_ENCODING_RM     \
+    case ENCODING_RM:        \
+    case ENCODING_RM_CD2:    \
+    case ENCODING_RM_CD4:    \
+    case ENCODING_RM_CD8:    \
+    case ENCODING_RM_CD16:   \
+    case ENCODING_RM_CD32:   \
+    case ENCODING_RM_CD64
+
+// Physical encodings of instruction operands.
+#define ENCODINGS                                                              \
+  ENUM_ENTRY(ENCODING_NONE,   "")                                              \
+  ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
+  ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
+  ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2")           \
+  ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4")           \
+  ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8")           \
+  ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16")          \
+  ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32")          \
+  ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64")          \
+  ENUM_ENTRY(ENCODING_VVVV,   "Register operand in VEX.vvvv byte.")            \
+  ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.")         \
+  ENUM_ENTRY(ENCODING_IB,     "1-byte immediate")                              \
+  ENUM_ENTRY(ENCODING_IW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_ID,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_IO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8L..R15L) Register code added to "   \
+                              "the opcode byte")                               \
+  ENUM_ENTRY(ENCODING_RW,     "(AX..DI, R8W..R15W)")                           \
+  ENUM_ENTRY(ENCODING_RD,     "(EAX..EDI, R8D..R15D)")                         \
+  ENUM_ENTRY(ENCODING_RO,     "(RAX..RDI, R8..R15)")                           \
+  ENUM_ENTRY(ENCODING_FP,     "Position on floating-point stack in ModR/M "    \
+                              "byte.")                                         \
+                                                                               \
+  ENUM_ENTRY(ENCODING_Iv,     "Immediate of operand size")                     \
+  ENUM_ENTRY(ENCODING_Ia,     "Immediate of address size")                     \
+  ENUM_ENTRY(ENCODING_Rv,     "Register code of operand size added to the "    \
+                              "opcode byte")                                   \
+  ENUM_ENTRY(ENCODING_DUP,    "Duplicate of another operand; ID is encoded "   \
+                              "in type")                                       \
+  ENUM_ENTRY(ENCODING_SI,     "Source index; encoded in OpSize/Adsize prefix") \
+  ENUM_ENTRY(ENCODING_DI,     "Destination index; encoded in prefixes")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandEncoding {
+  ENCODINGS
+  ENCODING_max
+};
+#undef ENUM_ENTRY
+
+// Semantic interpretations of instruction operands.
+#define TYPES                                                                  \
+  ENUM_ENTRY(TYPE_NONE,       "")                                              \
+  ENUM_ENTRY(TYPE_REL8,       "1-byte immediate address")                      \
+  ENUM_ENTRY(TYPE_REL16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_REL32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_REL64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_PTR1616,    "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_PTR1632,    "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_PTR1664,    "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_R8,         "1-byte register operand")                       \
+  ENUM_ENTRY(TYPE_R16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_R32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_R64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM8,       "1-byte immediate operand")                      \
+  ENUM_ENTRY(TYPE_IMM16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM3,       "1-byte immediate operand between 0 and 7")      \
+  ENUM_ENTRY(TYPE_IMM5,       "1-byte immediate operand between 0 and 31")     \
+  ENUM_ENTRY(TYPE_AVX512ICC,  "1-byte immediate operand for AVX512 icmp")      \
+  ENUM_ENTRY(TYPE_UIMM8,      "1-byte unsigned immediate operand")             \
+  ENUM_ENTRY(TYPE_RM8,        "1-byte register or memory operand")             \
+  ENUM_ENTRY(TYPE_RM16,       "2-byte")                                        \
+  ENUM_ENTRY(TYPE_RM32,       "4-byte")                                        \
+  ENUM_ENTRY(TYPE_RM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_M,          "Memory operand")                                \
+  ENUM_ENTRY(TYPE_M8,         "1-byte")                                        \
+  ENUM_ENTRY(TYPE_M16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_M32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_LEA,        "Effective address")                             \
+  ENUM_ENTRY(TYPE_M128,       "16-byte (SSE/SSE2)")                            \
+  ENUM_ENTRY(TYPE_M256,       "256-byte (AVX)")                                \
+  ENUM_ENTRY(TYPE_M1616,      "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_M1632,      "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_M1664,      "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_SRCIDX8,    "1-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX16,   "2-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX32,   "4-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_SRCIDX64,   "8-byte memory at source index")                 \
+  ENUM_ENTRY(TYPE_DSTIDX8,    "1-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX16,   "2-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX32,   "4-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_DSTIDX64,   "8-byte memory at destination index")            \
+  ENUM_ENTRY(TYPE_MOFFS8,     "1-byte memory offset (relative to segment "     \
+                              "base)")                                         \
+  ENUM_ENTRY(TYPE_MOFFS16,    "2-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS32,    "4-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS64,    "8-byte")                                        \
+  ENUM_ENTRY(TYPE_M32FP,      "32-bit IEE754 memory floating-point operand")   \
+  ENUM_ENTRY(TYPE_M64FP,      "64-bit")                                        \
+  ENUM_ENTRY(TYPE_M80FP,      "80-bit extended")                               \
+  ENUM_ENTRY(TYPE_ST,         "Position on the floating-point stack")          \
+  ENUM_ENTRY(TYPE_MM64,       "8-byte MMX register")                           \
+  ENUM_ENTRY(TYPE_XMM32,      "4-byte XMM register or memory operand")         \
+  ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM128,     "16-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM256,     "32-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM512,     "64-byte")                                       \
+  ENUM_ENTRY(TYPE_VK1,        "1-bit")                                         \
+  ENUM_ENTRY(TYPE_VK2,        "2-bit")                                         \
+  ENUM_ENTRY(TYPE_VK4,        "4-bit")                                         \
+  ENUM_ENTRY(TYPE_VK8,        "8-bit")                                         \
+  ENUM_ENTRY(TYPE_VK16,       "16-bit")                                        \
+  ENUM_ENTRY(TYPE_VK32,       "32-bit")                                        \
+  ENUM_ENTRY(TYPE_VK64,       "64-bit")                                        \
+  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
+  ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
+  ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
+  ENUM_ENTRY(TYPE_BNDR,       "MPX bounds register")                           \
+                                                                               \
+  ENUM_ENTRY(TYPE_Mv,         "Memory operand of operand size")                \
+  ENUM_ENTRY(TYPE_Rv,         "Register operand of operand size")              \
+  ENUM_ENTRY(TYPE_IMMv,       "Immediate operand of operand size")             \
+  ENUM_ENTRY(TYPE_RELv,       "Immediate address of operand size")             \
+  ENUM_ENTRY(TYPE_DUP0,       "Duplicate of operand 0")                        \
+  ENUM_ENTRY(TYPE_DUP1,       "operand 1")                                     \
+  ENUM_ENTRY(TYPE_DUP2,       "operand 2")                                     \
+  ENUM_ENTRY(TYPE_DUP3,       "operand 3")                                     \
+  ENUM_ENTRY(TYPE_DUP4,       "operand 4")                                     \
+  ENUM_ENTRY(TYPE_M512,       "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+enum OperandType {
+  TYPES
+  TYPE_max
+};
+#undef ENUM_ENTRY
+
+/// \brief The specification for how to extract and interpret one operand.
+struct OperandSpecifier {
+  uint8_t encoding;
+  uint8_t type;
+};
+
+static const unsigned X86_MAX_OPERANDS = 6;
+
+/// Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
+/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+/// respectively.
+enum DisassemblerMode {
+  MODE_16BIT,
+  MODE_32BIT,
+  MODE_64BIT
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..10b7e6ff5ee2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,299 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot, const MCSubtargetInfo &STI) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    HasCustomInstComment =
+        EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\t";
+
+  // Output CALLpcrel32 as "callq" in 64-bit mode.
+  // In Intel annotation it's always emitted as "call".
+  //
+  // TODO: Probably this hack should be redesigned via InstAlias in
+  // InstrInfo.td as soon as Requires clause is supported properly
+  // for InstAlias.
+  if (MI->getOpcode() == X86::CALLpcrel32 &&
+      (STI.getFeatureBits()[X86::Mode64Bit])) {
+    OS << "\tcallq\t";
+    printPCRelImm(MI, 0, OS);
+  }
+  // Try to print any aliases first.
+  else if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+}
+
+void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc/avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid xopcc argument!");
+  case 0: O << "lt"; break;
+  case 1: O << "le"; break;
+  case 2: O << "gt"; break;
+  case 3: O << "ge"; break;
+  case 4: O << "eq"; break;
+  case 5: O << "neq"; break;
+  case 6: O << "false"; break;
+  case 7: O << "true"; break;
+  }
+}
+
+void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+  switch (Imm) {
+  case 0: O << "{rn-sae}"; break;
+  case 1: O << "{rd-sae}"; break;
+  case 2: O << "{ru-sae}"; break;
+  case 3: O << "{rz-sae}"; break;
+  }
+}
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls).  These
+/// print slightly differently than normal immediates.  For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+    } else {
+      // Otherwise, just print the expression.
+      Op.getExpr()->print(O, &MAI);
+    }
+  }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    // Print immediates as signed values.
+    int64_t Imm = Op.getImm();
+    O << markup("<imm:") << '$' << formatImm(Imm) << markup(">");
+
+    // TODO: This should be in a helper function in the base class, so it can
+    // be used by other printers.
+
+    // If there are no instruction-specific comments, add a comment clarifying
+    // the hex value of the immediate operand when it isn't in the range
+    // [-256,255].
+    if (CommentStream && !HasCustomInstComment && (Imm > 255 || Imm < -256)) {
+      // Don't print unnecessary hex sign bits. 
+      if (Imm == (int16_t)(Imm))
+        *CommentStream << format("imm = 0x%" PRIX16 "\n", (uint16_t)Imm);
+      else if (Imm == (int32_t)(Imm))
+        *CommentStream << format("imm = 0x%" PRIX32 "\n", (uint32_t)Imm);
+      else
+        *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Imm);
+    }
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << markup("<imm:") << '$';
+    Op.getExpr()->print(O, &MAI);
+    O << markup(">");
+  }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                          raw_ostream &O) {
+  const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
+  const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
+  const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op + X86::AddrSegmentReg, O);
+    O << ':';
+  }
+
+  if (DispSpec.isImm()) {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << formatImm(DispVal);
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    DispSpec.getExpr()->print(O, &MAI);
+  }
+
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    O << '(';
+    if (BaseReg.getReg())
+      printOperand(MI, Op + X86::AddrBaseReg, O);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(MI, Op + X86::AddrIndexReg, O);
+      unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm();
+      if (ScaleVal != 1) {
+        O << ',' << markup("<imm:") << ScaleVal // never printed in hex.
+          << markup(">");
+      }
+    }
+    O << ')';
+  }
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  const MCOperand &SegReg = MI->getOperand(Op + 1);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op + 1, O);
+    O << ':';
+  }
+
+  O << "(";
+  printOperand(MI, Op, O);
+  O << ")";
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                    raw_ostream &O) {
+  O << markup("<mem:");
+
+  O << "%es:(";
+  printOperand(MI, Op, O);
+  O << ")";
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+  const MCOperand &SegReg = MI->getOperand(Op + 1);
+
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op + 1, O);
+    O << ':';
+  }
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    DispSpec.getExpr()->print(O, &MAI);
+  }
+
+  O << markup(">");
+}
+
+void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  if (MI->getOperand(Op).isExpr())
+    return printOperand(MI, Op, O);
+
+  O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff)
+    << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 000000000000..bbb309076610
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,142 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class X86ATTInstPrinter final : public MCInstPrinter {
+public:
+  X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                    const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &OS);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
+
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printDstIdx(MI, OpNo, O);
+  }
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+
+private:
+  bool HasCustomInstComment;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 000000000000..8594addb5dd4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,1197 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define CASE_SSE_INS_COMMON(Inst, src)            \
+  case X86::Inst##src:
+
+#define CASE_AVX_INS_COMMON(Inst, Suffix, src)    \
+  case X86::V##Inst##Suffix##src:
+
+#define CASE_MASK_INS_COMMON(Inst, Suffix, src)   \
+  case X86::V##Inst##Suffix##src##k:
+
+#define CASE_MASKZ_INS_COMMON(Inst, Suffix, src)  \
+  case X86::V##Inst##Suffix##src##kz:
+
+#define CASE_AVX512_INS_COMMON(Inst, Suffix, src) \
+  CASE_AVX_INS_COMMON(Inst, Suffix, src)          \
+  CASE_MASK_INS_COMMON(Inst, Suffix, src)         \
+  CASE_MASKZ_INS_COMMON(Inst, Suffix, src)
+
+#define CASE_MOVDUP(Inst, src)                    \
+  CASE_AVX512_INS_COMMON(Inst, Z, r##src)         \
+  CASE_AVX512_INS_COMMON(Inst, Z256, r##src)      \
+  CASE_AVX512_INS_COMMON(Inst, Z128, r##src)      \
+  CASE_AVX_INS_COMMON(Inst, , r##src)             \
+  CASE_AVX_INS_COMMON(Inst, Y, r##src)            \
+  CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_MOVDUP(Inst, src)               \
+  CASE_MASK_INS_COMMON(Inst, Z, r##src)           \
+  CASE_MASK_INS_COMMON(Inst, Z256, r##src)        \
+  CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_MOVDUP(Inst, src)              \
+  CASE_MASKZ_INS_COMMON(Inst, Z, r##src)          \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, r##src)       \
+  CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_PMOVZX(Inst, src)                    \
+  CASE_AVX512_INS_COMMON(Inst, Z, r##src)         \
+  CASE_AVX512_INS_COMMON(Inst, Z256, r##src)      \
+  CASE_AVX512_INS_COMMON(Inst, Z128, r##src)      \
+  CASE_AVX_INS_COMMON(Inst, , r##src)             \
+  CASE_AVX_INS_COMMON(Inst, Y, r##src)            \
+  CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_PMOVZX(Inst, src)               \
+  CASE_MASK_INS_COMMON(Inst, Z, r##src)           \
+  CASE_MASK_INS_COMMON(Inst, Z256, r##src)        \
+  CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_PMOVZX(Inst, src)              \
+  CASE_MASKZ_INS_COMMON(Inst, Z, r##src)          \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, r##src)       \
+  CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_UNPCK(Inst, src)                     \
+  CASE_AVX512_INS_COMMON(Inst, Z, r##src)         \
+  CASE_AVX512_INS_COMMON(Inst, Z256, r##src)      \
+  CASE_AVX512_INS_COMMON(Inst, Z128, r##src)      \
+  CASE_AVX_INS_COMMON(Inst, , r##src)             \
+  CASE_AVX_INS_COMMON(Inst, Y, r##src)            \
+  CASE_SSE_INS_COMMON(Inst, r##src)
+
+#define CASE_MASK_UNPCK(Inst, src)                \
+  CASE_MASK_INS_COMMON(Inst, Z, r##src)           \
+  CASE_MASK_INS_COMMON(Inst, Z256, r##src)        \
+  CASE_MASK_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_MASKZ_UNPCK(Inst, src)               \
+  CASE_MASKZ_INS_COMMON(Inst, Z, r##src)          \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, r##src)       \
+  CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
+
+#define CASE_SHUF(Inst, suf)                      \
+  CASE_AVX512_INS_COMMON(Inst, Z, suf)            \
+  CASE_AVX512_INS_COMMON(Inst, Z256, suf)         \
+  CASE_AVX512_INS_COMMON(Inst, Z128, suf)         \
+  CASE_AVX_INS_COMMON(Inst, , suf)                \
+  CASE_AVX_INS_COMMON(Inst, Y, suf)               \
+  CASE_SSE_INS_COMMON(Inst, suf)
+
+#define CASE_MASK_SHUF(Inst, src)                 \
+  CASE_MASK_INS_COMMON(Inst, Z, r##src##i)        \
+  CASE_MASK_INS_COMMON(Inst, Z256, r##src##i)     \
+  CASE_MASK_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_MASKZ_SHUF(Inst, src)                \
+  CASE_MASKZ_INS_COMMON(Inst, Z, r##src##i)       \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, r##src##i)    \
+  CASE_MASKZ_INS_COMMON(Inst, Z128, r##src##i)
+
+#define CASE_VPERMILPI(Inst, src)                 \
+  CASE_AVX512_INS_COMMON(Inst, Z, src##i)         \
+  CASE_AVX512_INS_COMMON(Inst, Z256, src##i)      \
+  CASE_AVX512_INS_COMMON(Inst, Z128, src##i)      \
+  CASE_AVX_INS_COMMON(Inst, , src##i)             \
+  CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERMILPI(Inst, src)            \
+  CASE_MASK_INS_COMMON(Inst, Z, src##i)           \
+  CASE_MASK_INS_COMMON(Inst, Z256, src##i)        \
+  CASE_MASK_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_MASKZ_VPERMILPI(Inst, src)           \
+  CASE_MASKZ_INS_COMMON(Inst, Z, src##i)          \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, src##i)       \
+  CASE_MASKZ_INS_COMMON(Inst, Z128, src##i)
+
+#define CASE_VPERM(Inst, src)                     \
+  CASE_AVX512_INS_COMMON(Inst, Z, src##i)         \
+  CASE_AVX512_INS_COMMON(Inst, Z256, src##i)      \
+  CASE_AVX_INS_COMMON(Inst, Y, src##i)
+
+#define CASE_MASK_VPERM(Inst, src)                \
+  CASE_MASK_INS_COMMON(Inst, Z, src##i)           \
+  CASE_MASK_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_MASKZ_VPERM(Inst, src)               \
+  CASE_MASKZ_INS_COMMON(Inst, Z, src##i)          \
+  CASE_MASKZ_INS_COMMON(Inst, Z256, src##i)
+
+#define CASE_VSHUF(Inst, src)                          \
+  CASE_AVX512_INS_COMMON(SHUFF##Inst, Z, r##src##i)    \
+  CASE_AVX512_INS_COMMON(SHUFI##Inst, Z, r##src##i)    \
+  CASE_AVX512_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+  CASE_AVX512_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASK_VSHUF(Inst, src)                    \
+  CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i)     \
+  CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i)     \
+  CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i)  \
+  CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+#define CASE_MASKZ_VSHUF(Inst, src)                   \
+  CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z, r##src##i)    \
+  CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z, r##src##i)    \
+  CASE_MASKZ_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \
+  CASE_MASKZ_INS_COMMON(SHUFI##Inst, Z256, r##src##i)
+
+static unsigned getVectorRegSize(unsigned RegNo) {
+  if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
+    return 512;
+  if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31)
+    return 256;
+  if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31)
+    return 128;
+  if (X86::MM0 <= RegNo && RegNo <= X86::MM7)
+    return 64;
+
+  llvm_unreachable("Unknown vector reg!");
+}
+
+static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
+                                 unsigned OperandIndex) {
+  unsigned OpReg = MI->getOperand(OperandIndex).getReg();
+  return MVT::getVectorVT(ScalarVT,
+                          getVectorRegSize(OpReg)/ScalarVT.getSizeInBits());
+}
+
+/// \brief Extracts the dst type for a given zero extension instruction.
+static MVT getZeroExtensionResultType(const MCInst *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown zero extension instruction");
+  // zero extension to i16
+  CASE_PMOVZX(PMOVZXBW, m)
+  CASE_PMOVZX(PMOVZXBW, r)
+    return getRegOperandVectorVT(MI, MVT::i16, 0);
+  // zero extension to i32
+  CASE_PMOVZX(PMOVZXBD, m)
+  CASE_PMOVZX(PMOVZXBD, r)
+  CASE_PMOVZX(PMOVZXWD, m)
+  CASE_PMOVZX(PMOVZXWD, r)
+    return getRegOperandVectorVT(MI, MVT::i32, 0);
+  // zero extension to i64
+  CASE_PMOVZX(PMOVZXBQ, m)
+  CASE_PMOVZX(PMOVZXBQ, r)
+  CASE_PMOVZX(PMOVZXWQ, m)
+  CASE_PMOVZX(PMOVZXWQ, r)
+  CASE_PMOVZX(PMOVZXDQ, m)
+  CASE_PMOVZX(PMOVZXDQ, r)
+    return getRegOperandVectorVT(MI, MVT::i64, 0);
+  }
+}
+
+/// Wraps the destination register name with AVX512 mask/maskz filtering.
+static std::string getMaskName(const MCInst *MI, const char *DestName,
+                               const char *(*getRegName)(unsigned)) {
+  std::string OpMaskName(DestName);
+
+  bool MaskWithZero = false;
+  const char *MaskRegName = nullptr;
+
+  switch (MI->getOpcode()) {
+  default:
+    return OpMaskName;
+  CASE_MASKZ_MOVDUP(MOVDDUP, m)
+  CASE_MASKZ_MOVDUP(MOVDDUP, r)
+  CASE_MASKZ_MOVDUP(MOVSHDUP, m)
+  CASE_MASKZ_MOVDUP(MOVSHDUP, r)
+  CASE_MASKZ_MOVDUP(MOVSLDUP, m)
+  CASE_MASKZ_MOVDUP(MOVSLDUP, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBD, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBD, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBQ, r)
+  CASE_MASKZ_PMOVZX(PMOVZXBW, m)
+  CASE_MASKZ_PMOVZX(PMOVZXBW, r)
+  CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXDQ, r)
+  CASE_MASKZ_PMOVZX(PMOVZXWD, m)
+  CASE_MASKZ_PMOVZX(PMOVZXWD, r)
+  CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
+  CASE_MASKZ_PMOVZX(PMOVZXWQ, r)
+  CASE_MASKZ_UNPCK(PUNPCKHBW, m)
+  CASE_MASKZ_UNPCK(PUNPCKHBW, r)
+  CASE_MASKZ_UNPCK(PUNPCKHWD, m)
+  CASE_MASKZ_UNPCK(PUNPCKHWD, r)
+  CASE_MASKZ_UNPCK(PUNPCKHDQ, m)
+  CASE_MASKZ_UNPCK(PUNPCKHDQ, r)
+  CASE_MASKZ_UNPCK(PUNPCKLBW, m)
+  CASE_MASKZ_UNPCK(PUNPCKLBW, r)
+  CASE_MASKZ_UNPCK(PUNPCKLWD, m)
+  CASE_MASKZ_UNPCK(PUNPCKLWD, r)
+  CASE_MASKZ_UNPCK(PUNPCKLDQ, m)
+  CASE_MASKZ_UNPCK(PUNPCKLDQ, r)
+  CASE_MASKZ_UNPCK(UNPCKHPD, m)
+  CASE_MASKZ_UNPCK(UNPCKHPD, r)
+  CASE_MASKZ_UNPCK(UNPCKHPS, m)
+  CASE_MASKZ_UNPCK(UNPCKHPS, r)
+  CASE_MASKZ_UNPCK(UNPCKLPD, m)
+  CASE_MASKZ_UNPCK(UNPCKLPD, r)
+  CASE_MASKZ_UNPCK(UNPCKLPS, m)
+  CASE_MASKZ_UNPCK(UNPCKLPS, r)
+  CASE_MASKZ_SHUF(PALIGNR, r)
+  CASE_MASKZ_SHUF(PALIGNR, m)
+  CASE_MASKZ_SHUF(ALIGNQ, r)
+  CASE_MASKZ_SHUF(ALIGNQ, m)
+  CASE_MASKZ_SHUF(ALIGND, r)
+  CASE_MASKZ_SHUF(ALIGND, m)
+  CASE_MASKZ_SHUF(SHUFPD, m)
+  CASE_MASKZ_SHUF(SHUFPD, r)
+  CASE_MASKZ_SHUF(SHUFPS, m)
+  CASE_MASKZ_SHUF(SHUFPS, r)
+  CASE_MASKZ_VPERMILPI(PERMILPD, m)
+  CASE_MASKZ_VPERMILPI(PERMILPD, r)
+  CASE_MASKZ_VPERMILPI(PERMILPS, m)
+  CASE_MASKZ_VPERMILPI(PERMILPS, r)
+  CASE_MASKZ_VPERMILPI(PSHUFD, m)
+  CASE_MASKZ_VPERMILPI(PSHUFD, r)
+  CASE_MASKZ_VPERMILPI(PSHUFHW, m)
+  CASE_MASKZ_VPERMILPI(PSHUFHW, r)
+  CASE_MASKZ_VPERMILPI(PSHUFLW, m)
+  CASE_MASKZ_VPERMILPI(PSHUFLW, r)
+  CASE_MASKZ_VPERM(PERMPD, m)
+  CASE_MASKZ_VPERM(PERMPD, r)
+  CASE_MASKZ_VPERM(PERMQ, m)
+  CASE_MASKZ_VPERM(PERMQ, r)
+  CASE_MASKZ_VSHUF(64X2, m)
+  CASE_MASKZ_VSHUF(64X2, r)
+  CASE_MASKZ_VSHUF(32X4, m)
+  CASE_MASKZ_VSHUF(32X4, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X2, Z128, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X2, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI64X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X4, Z256, rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, r)
+  CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z, m)
+    MaskWithZero = true;
+    MaskRegName = getRegName(MI->getOperand(1).getReg());
+    break;
+  CASE_MASK_MOVDUP(MOVDDUP, m)
+  CASE_MASK_MOVDUP(MOVDDUP, r)
+  CASE_MASK_MOVDUP(MOVSHDUP, m)
+  CASE_MASK_MOVDUP(MOVSHDUP, r)
+  CASE_MASK_MOVDUP(MOVSLDUP, m)
+  CASE_MASK_MOVDUP(MOVSLDUP, r)
+  CASE_MASK_PMOVZX(PMOVZXBD, m)
+  CASE_MASK_PMOVZX(PMOVZXBD, r)
+  CASE_MASK_PMOVZX(PMOVZXBQ, m)
+  CASE_MASK_PMOVZX(PMOVZXBQ, r)
+  CASE_MASK_PMOVZX(PMOVZXBW, m)
+  CASE_MASK_PMOVZX(PMOVZXBW, r)
+  CASE_MASK_PMOVZX(PMOVZXDQ, m)
+  CASE_MASK_PMOVZX(PMOVZXDQ, r)
+  CASE_MASK_PMOVZX(PMOVZXWD, m)
+  CASE_MASK_PMOVZX(PMOVZXWD, r)
+  CASE_MASK_PMOVZX(PMOVZXWQ, m)
+  CASE_MASK_PMOVZX(PMOVZXWQ, r)
+  CASE_MASK_UNPCK(PUNPCKHBW, m)
+  CASE_MASK_UNPCK(PUNPCKHBW, r)
+  CASE_MASK_UNPCK(PUNPCKHWD, m)
+  CASE_MASK_UNPCK(PUNPCKHWD, r)
+  CASE_MASK_UNPCK(PUNPCKHDQ, m)
+  CASE_MASK_UNPCK(PUNPCKHDQ, r)
+  CASE_MASK_UNPCK(PUNPCKLBW, m)
+  CASE_MASK_UNPCK(PUNPCKLBW, r)
+  CASE_MASK_UNPCK(PUNPCKLWD, m)
+  CASE_MASK_UNPCK(PUNPCKLWD, r)
+  CASE_MASK_UNPCK(PUNPCKLDQ, m)
+  CASE_MASK_UNPCK(PUNPCKLDQ, r)
+  CASE_MASK_UNPCK(UNPCKHPD, m)
+  CASE_MASK_UNPCK(UNPCKHPD, r)
+  CASE_MASK_UNPCK(UNPCKHPS, m)
+  CASE_MASK_UNPCK(UNPCKHPS, r)
+  CASE_MASK_UNPCK(UNPCKLPD, m)
+  CASE_MASK_UNPCK(UNPCKLPD, r)
+  CASE_MASK_UNPCK(UNPCKLPS, m)
+  CASE_MASK_UNPCK(UNPCKLPS, r)
+  CASE_MASK_SHUF(PALIGNR, r)
+  CASE_MASK_SHUF(PALIGNR, m)
+  CASE_MASK_SHUF(ALIGNQ, r)
+  CASE_MASK_SHUF(ALIGNQ, m)
+  CASE_MASK_SHUF(ALIGND, r)
+  CASE_MASK_SHUF(ALIGND, m)
+  CASE_MASK_SHUF(SHUFPD, m)
+  CASE_MASK_SHUF(SHUFPD, r)
+  CASE_MASK_SHUF(SHUFPS, m)
+  CASE_MASK_SHUF(SHUFPS, r)
+  CASE_MASK_VPERMILPI(PERMILPD, m)
+  CASE_MASK_VPERMILPI(PERMILPD, r)
+  CASE_MASK_VPERMILPI(PERMILPS, m)
+  CASE_MASK_VPERMILPI(PERMILPS, r)
+  CASE_MASK_VPERMILPI(PSHUFD, m)
+  CASE_MASK_VPERMILPI(PSHUFD, r)
+  CASE_MASK_VPERMILPI(PSHUFHW, m)
+  CASE_MASK_VPERMILPI(PSHUFHW, r)
+  CASE_MASK_VPERMILPI(PSHUFLW, m)
+  CASE_MASK_VPERMILPI(PSHUFLW, r)
+  CASE_MASK_VPERM(PERMPD, m)
+  CASE_MASK_VPERM(PERMPD, r)
+  CASE_MASK_VPERM(PERMQ, m)
+  CASE_MASK_VPERM(PERMQ, r)
+  CASE_MASK_VSHUF(64X2, m)
+  CASE_MASK_VSHUF(64X2, r)
+  CASE_MASK_VSHUF(32X4, m)
+  CASE_MASK_VSHUF(32X4, r)
+  CASE_MASK_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X2, Z128, rm)
+  CASE_MASK_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X2, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI64X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X4, Z256, rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, m)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, r)
+  CASE_MASK_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_MASK_INS_COMMON(BROADCASTI32X2, Z, m)
+    MaskRegName = getRegName(MI->getOperand(2).getReg());
+    break;
+  }
+
+  // MASK: zmmX {%kY}
+  OpMaskName += " {%";
+  OpMaskName += MaskRegName;
+  OpMaskName += "}";
+
+  // MASKZ: zmmX {%kY} {z}
+  if (MaskWithZero)
+    OpMaskName += " {z}";
+
+  return OpMaskName;
+}
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline terminated strings to the specified string if desired.  This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  SmallVector<int, 8> ShuffleMask;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
+  unsigned NumOperands = MI->getNumOperands();
+  bool RegForm = false;
+
+  switch (MI->getOpcode()) {
+  default:
+    // Not an instruction for which we can decode comments.
+    return false;
+
+  case X86::BLENDPDrri:
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPDrmi:
+  case X86::VBLENDPDrmi:
+  case X86::VBLENDPDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::BLENDPSrri:
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::BLENDPSrmi:
+  case X86::VBLENDPSrmi:
+  case X86::VBLENDPSYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PBLENDWrri:
+  case X86::VPBLENDWrri:
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::PBLENDWrmi:
+  case X86::VPBLENDWrmi:
+  case X86::VPBLENDWYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDrri:
+  case X86::VPBLENDDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::VPBLENDDrmi:
+  case X86::VPBLENDDYrmi:
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+  case X86::INSERTPSrm:
+  case X86::VINSERTPSrm:
+  case X86::VINSERTPSZrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeINSERTPSMask(MI->getOperand(NumOperands - 1).getImm(),
+                         ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+  case X86::VMOVLHPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVLHPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+  case X86::VMOVHLPSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVHLPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHPDrm:
+  case X86::VMOVHPDrm:
+  case X86::VMOVHPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 1, 1, ShuffleMask);
+    break;
+
+  case X86::MOVHPSrm:
+  case X86::VMOVHPSrm:
+  case X86::VMOVHPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 2, 2, ShuffleMask);
+    break;
+
+  case X86::MOVLPDrm:
+  case X86::VMOVLPDrm:
+  case X86::VMOVLPDZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v2f64, 0, 1, ShuffleMask);
+    break;
+
+  case X86::MOVLPSrm:
+  case X86::VMOVLPSrm:
+  case X86::VMOVLPSZ128rm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeInsertElementMask(MVT::v4f32, 0, 2, ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVSLDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVSLDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVSHDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVSHDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    break;
+
+  CASE_MOVDUP(MOVDDUP, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_MOVDUP(MOVDDUP, m)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    break;
+
+  case X86::PSLLDQri:
+  case X86::VPSLLDQri:
+  case X86::VPSLLDQYri:
+  case X86::VPSLLDQZ128rr:
+  case X86::VPSLLDQZ256rr:
+  case X86::VPSLLDQZ512rr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+  case X86::VPSLLDQZ128rm:
+  case X86::VPSLLDQZ256rm:
+  case X86::VPSLLDQZ512rm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::PSRLDQri:
+  case X86::VPSRLDQri:
+  case X86::VPSRLDQYri:
+  case X86::VPSRLDQZ128rr:
+  case X86::VPSRLDQZ256rr:
+  case X86::VPSRLDQZ512rr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+  case X86::VPSRLDQZ128rm:
+  case X86::VPSRLDQZ256rm:
+  case X86::VPSRLDQZ512rm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_SHUF(PALIGNR, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PALIGNR, rmi)
+    Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z, rri)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rri)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z, rmi)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z256, rmi)
+  CASE_AVX512_INS_COMMON(ALIGNQ, Z128, rmi)
+    Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_AVX512_INS_COMMON(ALIGND, Z, rri)
+  CASE_AVX512_INS_COMMON(ALIGND, Z256, rri)
+  CASE_AVX512_INS_COMMON(ALIGND, Z128, rri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_AVX512_INS_COMMON(ALIGND, Z, rmi)
+  CASE_AVX512_INS_COMMON(ALIGND, Z256, rmi)
+  CASE_AVX512_INS_COMMON(ALIGND, Z128, rmi)
+    Src2Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVALIGNMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                       MI->getOperand(NumOperands - 1).getImm(),
+                       ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFD, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFD, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFHW, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFHW, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  CASE_SHUF(PSHUFLW, ri)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(PSHUFLW, mi)
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0),
+                        MI->getOperand(NumOperands - 1).getImm(),
+                        ShuffleMask);
+    break;
+
+  case X86::MMX_PSHUFWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MMX_PSHUFWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(MVT::v4i16,
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    break;
+
+  case X86::PSWAPDrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::PSWAPDrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSWAPMask(MVT::v2i32, ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHBW, r)
+  case X86::MMX_PUNPCKHBWirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHBW, m)
+  case X86::MMX_PUNPCKHBWirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHWD, r)
+  case X86::MMX_PUNPCKHWDirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHWD, m)
+  case X86::MMX_PUNPCKHWDirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHDQ, r)
+  case X86::MMX_PUNPCKHDQirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHDQ, m)
+  case X86::MMX_PUNPCKHDQirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKHQDQ, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKHQDQ, m)
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLBW, r)
+  case X86::MMX_PUNPCKLBWirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLBW, m)
+  case X86::MMX_PUNPCKLBWirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLWD, r)
+  case X86::MMX_PUNPCKLWDirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLWD, m)
+  case X86::MMX_PUNPCKLWDirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLDQ, r)
+  case X86::MMX_PUNPCKLDQirr:
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLDQ, m)
+  case X86::MMX_PUNPCKLDQirm:
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask);
+    break;
+
+  CASE_UNPCK(PUNPCKLQDQ, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(PUNPCKLQDQ, m)
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask);
+    break;
+
+  CASE_SHUF(SHUFPD, rri)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(SHUFPD, rmi)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_SHUF(SHUFPS, rri)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_SHUF(SHUFPS, rmi)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VSHUF(64X2, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_VSHUF(64X2, m)
+    decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                              MI->getOperand(NumOperands - 1).getImm(),
+                              ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VSHUF(32X4, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_VSHUF(32X4, m)
+    decodeVSHUF64x2FamilyMask(getRegOperandVectorVT(MI, MVT::i32, 0),
+                              MI->getOperand(NumOperands - 1).getImm(),
+                              ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?3:7)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKLPD, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKLPD, m)
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKLPS, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKLPS, m)
+    DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKHPD, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKHPD, m)
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_UNPCK(UNPCKHPS, r)
+    Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    RegForm = true;
+    LLVM_FALLTHROUGH;
+
+  CASE_UNPCK(UNPCKHPS, m)
+    DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERMILPI(PERMILPS, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERMILPI(PERMILPS, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERMILPI(PERMILPD, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERMILPI(PERMILPD, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::VPERM2F128rm:
+  case X86::VPERM2I128rm:
+    // For instruction comments purpose, assume the 256-bit vector is v4i64.
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERM2X128Mask(MVT::v4i64,
+                           MI->getOperand(NumOperands - 1).getImm(),
+                           ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERM(PERMPD, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERM(PERMPD, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::f64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_VPERM(PERMQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 2).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_VPERM(PERMQ, m)
+    if (MI->getOperand(NumOperands - 1).isImm())
+      DecodeVPERMMask(getRegOperandVectorVT(MI, MVT::i64, 0),
+                      MI->getOperand(NumOperands - 1).getImm(),
+                      ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSDZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+    DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+  case X86::VMOVSSZrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+    DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVPQI2QIrr:
+  case X86::MOVZPQILo2PQIrr:
+  case X86::VMOVPQI2QIrr:
+  case X86::VMOVZPQILo2PQIrr:
+  case X86::VMOVZPQILo2PQIZrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    LLVM_FALLTHROUGH;
+
+  case X86::MOVQI2PQIrm:
+  case X86::VMOVQI2PQIrm:
+  case X86::VMOVQI2PQIZrm:
+    DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::MOVDI2PDIrm:
+  case X86::VMOVDI2PDIrm:
+  case X86::VMOVDI2PDIZrm:
+    DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::EXTRQI:
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isImm())
+      DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+                       MI->getOperand(3).getImm(),
+                       ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    break;
+
+  case X86::INSERTQI:
+    if (MI->getOperand(3).isImm() &&
+        MI->getOperand(4).isImm())
+      DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+                         MI->getOperand(4).getImm(),
+                         ShuffleMask);
+
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    break;
+
+  case X86::VBROADCASTF128:
+  case X86::VBROADCASTI128:
+  CASE_AVX512_INS_COMMON(BROADCASTF64X2, Z128, rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X2, Z128, rm)
+    DecodeSubVectorBroadcast(MVT::v4f64, MVT::v2f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF64X2, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X2, , rm)
+    DecodeSubVectorBroadcast(MVT::v8f64, MVT::v2f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF64X4, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI64X4, , rm)
+    DecodeSubVectorBroadcast(MVT::v8f64, MVT::v4f64, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X4, Z256, rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X4, Z256, rm)
+    DecodeSubVectorBroadcast(MVT::v8f32, MVT::v4f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X4, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X4, , rm)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v4f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X8, , rm)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X8, , rm)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
+    DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+  CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
+  CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
+    DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXBW, r)
+  CASE_PMOVZX(PMOVZXBD, r)
+  CASE_PMOVZX(PMOVZXBQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXBW, m)
+  CASE_PMOVZX(PMOVZXBD, m)
+  CASE_PMOVZX(PMOVZXBQ, m)
+    DecodeZeroExtendMask(MVT::i8, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXWD, r)
+  CASE_PMOVZX(PMOVZXWQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXWD, m)
+  CASE_PMOVZX(PMOVZXWQ, m)
+    DecodeZeroExtendMask(MVT::i16, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  CASE_PMOVZX(PMOVZXDQ, r)
+    Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+    LLVM_FALLTHROUGH;
+
+  CASE_PMOVZX(PMOVZXDQ, m)
+    DecodeZeroExtendMask(MVT::i32, getZeroExtensionResultType(MI), ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+
+  // The only comments we decode are shuffles, so give up if we were unable to
+  // decode a shuffle mask.
+  if (ShuffleMask.empty())
+    return false;
+
+  if (!DestName) DestName = Src1Name;
+  OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = ";
+
+  // If the two sources are the same, canonicalize the input elements to be
+  // from the first src so that we get larger element spans.
+  if (Src1Name == Src2Name) {
+    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+      if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+          ShuffleMask[i] >= (int)e)   // From second mask.
+        ShuffleMask[i] -= e;
+    }
+  }
+
+  // The shuffle mask specifies which elements of the src1/src2 fill in the
+  // destination, with a few sentinel values.  Loop through and print them
+  // out.
+  for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+    if (i != 0)
+      OS << ',';
+    if (ShuffleMask[i] == SM_SentinelZero) {
+      OS << "zero";
+      continue;
+    }
+
+    // Otherwise, it must come from src1 or src2.  Print the span of elements
+    // that comes from this src.
+    bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+    OS << (SrcName ? SrcName : "mem") << '[';
+    bool IsFirst = true;
+    while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+           (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+      if (!IsFirst)
+        OS << ',';
+      else
+        IsFirst = false;
+      if (ShuffleMask[i] == SM_SentinelUndef)
+        OS << "u";
+      else
+        OS << ShuffleMask[i] % ShuffleMask.size();
+      ++i;
+    }
+    OS << ']';
+    --i; // For loop increments element #.
+  }
+  //MI->print(OS, 0);
+  OS << "\n";
+
+  // We successfully added a comment to this instruction.
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 000000000000..c6d0d85a7d3d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,30 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+
+namespace llvm {
+
+  enum AsmComments {
+    AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX.
+  };
+
+  class MCInst;
+  class raw_ostream;
+  bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                              const char *(*getRegName)(unsigned));
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..4443edb8e342
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,260 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot,
+                                    const MCSubtargetInfo &STI) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
+                                        raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm();
+  switch (Imm) {
+  default: llvm_unreachable("Invalid xopcc argument!");
+  case 0: O << "lt"; break;
+  case 1: O << "le"; break;
+  case 2: O << "gt"; break;
+  case 3: O << "ge"; break;
+  case 4: O << "eq"; break;
+  case 5: O << "neq"; break;
+  case 6: O << "false"; break;
+  case 7: O << "true"; break;
+  }
+}
+
+void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
+                                               raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
+  switch (Imm) {
+  case 0: O << "{rn-sae}"; break;
+  case 1: O << "{rd-sae}"; break;
+  case 2: O << "{ru-sae}"; break;
+  case 3: O << "{rz-sae}"; break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
+      O << formatHex((uint64_t)Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      Op.getExpr()->print(O, &MAI);
+    }
+  }
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    O << formatImm((int64_t)Op.getImm());
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
+  const MCOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  unsigned ScaleVal         = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+  const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+  const MCOperand &SegReg   = MI->getOperand(Op+X86::AddrSegmentReg);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+X86::AddrSegmentReg, O);
+    O << ':';
+  }
+
+  O << '[';
+
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op+X86::AddrBaseReg, O);
+    NeedPlus = true;
+  }
+
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+X86::AddrIndexReg, O);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    DispSpec.getExpr()->print(O, &MAI);
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << formatImm(DispVal);
+    }
+  }
+
+  O << ']';
+}
+
+void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  const MCOperand &SegReg   = MI->getOperand(Op+1);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+  O << '[';
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op,
+                                      raw_ostream &O) {
+  // DI accesses are always ES-based.
+  O << "es:[";
+  printOperand(MI, Op, O);
+  O << ']';
+}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+  const MCOperand &SegReg   = MI->getOperand(Op+1);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+1, O);
+    O << ':';
+  }
+
+  O << '[';
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    DispSpec.getExpr()->print(O, &MAI);
+  }
+
+  O << ']';
+}
+
+void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  if (MI->getOperand(Op).isExpr())
+    return MI->getOperand(Op).getExpr()->print(O, &MAI);
+
+  O << formatImm(MI->getOperand(Op).getImm() & 0xff);
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 000000000000..20cd7ffb2e63
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,162 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class X86IntelInstPrinter final : public MCInstPrinter {
+public:
+  X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
+
+  void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "opaque ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "xmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+
+
+  void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printSrcIdx(MI, OpNo, O);
+  }
+  void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printDstIdx(MI, OpNo, O);
+  }
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 000000000000..e83ec9f4045a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,881 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_SecRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_SecRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_signed_4byte_relax:
+  case X86::reloc_global_offset_table:
+  case FK_SecRel_4:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_SecRel_8:
+  case FK_Data_8:
+  case X86::reloc_global_offset_table8:
+    return 3;
+  }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+                     bool HasRelocationAddend, bool foobar)
+    : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+  const StringRef CPU;
+  bool HasNopl;
+  const uint64_t MaxNopLength;
+public:
+  X86AsmBackend(const Target &T, StringRef CPU)
+      : MCAsmBackend(), CPU(CPU),
+        MaxNopLength((CPU == "slm") ? 7 : 15) {
+    HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+              CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+              CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+              CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+              CPU != "c3" && CPU != "c3-2" && CPU != "lakemont";
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return X86::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+        {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"reloc_signed_4byte", 0, 32, 0},
+        {"reloc_signed_4byte_relax", 0, 32, 0},
+        {"reloc_global_offset_table", 0, 32, 0},
+        {"reloc_global_offset_table8", 0, 64, 0},
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override {
+    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that uppper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
+    assert(isIntN(Size * 8 + 1, Value) &&
+           "Value does not fit in the Fixup field");
+
+    for (unsigned i = 0; i != Size; ++i)
+      Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
+  unsigned Op = Inst.getOpcode();
+  switch (Op) {
+  default:
+    return Op;
+  case X86::JAE_1:
+    return (is16BitMode) ? X86::JAE_2 : X86::JAE_4;
+  case X86::JA_1:
+    return (is16BitMode) ? X86::JA_2 : X86::JA_4;
+  case X86::JBE_1:
+    return (is16BitMode) ? X86::JBE_2 : X86::JBE_4;
+  case X86::JB_1:
+    return (is16BitMode) ? X86::JB_2 : X86::JB_4;
+  case X86::JE_1:
+    return (is16BitMode) ? X86::JE_2 : X86::JE_4;
+  case X86::JGE_1:
+    return (is16BitMode) ? X86::JGE_2 : X86::JGE_4;
+  case X86::JG_1:
+    return (is16BitMode) ? X86::JG_2 : X86::JG_4;
+  case X86::JLE_1:
+    return (is16BitMode) ? X86::JLE_2 : X86::JLE_4;
+  case X86::JL_1:
+    return (is16BitMode) ? X86::JL_2 : X86::JL_4;
+  case X86::JMP_1:
+    return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
+  case X86::JNE_1:
+    return (is16BitMode) ? X86::JNE_2 : X86::JNE_4;
+  case X86::JNO_1:
+    return (is16BitMode) ? X86::JNO_2 : X86::JNO_4;
+  case X86::JNP_1:
+    return (is16BitMode) ? X86::JNP_2 : X86::JNP_4;
+  case X86::JNS_1:
+    return (is16BitMode) ? X86::JNS_2 : X86::JNS_4;
+  case X86::JO_1:
+    return (is16BitMode) ? X86::JO_2 : X86::JO_4;
+  case X86::JP_1:
+    return (is16BitMode) ? X86::JP_2 : X86::JP_4;
+  case X86::JS_1:
+    return (is16BitMode) ? X86::JS_2 : X86::JS_4;
+  }
+}
+
+static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
+  unsigned Op = Inst.getOpcode();
+  switch (Op) {
+  default:
+    return Op;
+
+    // IMUL
+  case X86::IMUL16rri8: return X86::IMUL16rri;
+  case X86::IMUL16rmi8: return X86::IMUL16rmi;
+  case X86::IMUL32rri8: return X86::IMUL32rri;
+  case X86::IMUL32rmi8: return X86::IMUL32rmi;
+  case X86::IMUL64rri8: return X86::IMUL64rri32;
+  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+    // AND
+  case X86::AND16ri8: return X86::AND16ri;
+  case X86::AND16mi8: return X86::AND16mi;
+  case X86::AND32ri8: return X86::AND32ri;
+  case X86::AND32mi8: return X86::AND32mi;
+  case X86::AND64ri8: return X86::AND64ri32;
+  case X86::AND64mi8: return X86::AND64mi32;
+
+    // OR
+  case X86::OR16ri8: return X86::OR16ri;
+  case X86::OR16mi8: return X86::OR16mi;
+  case X86::OR32ri8: return X86::OR32ri;
+  case X86::OR32mi8: return X86::OR32mi;
+  case X86::OR64ri8: return X86::OR64ri32;
+  case X86::OR64mi8: return X86::OR64mi32;
+
+    // XOR
+  case X86::XOR16ri8: return X86::XOR16ri;
+  case X86::XOR16mi8: return X86::XOR16mi;
+  case X86::XOR32ri8: return X86::XOR32ri;
+  case X86::XOR32mi8: return X86::XOR32mi;
+  case X86::XOR64ri8: return X86::XOR64ri32;
+  case X86::XOR64mi8: return X86::XOR64mi32;
+
+    // ADD
+  case X86::ADD16ri8: return X86::ADD16ri;
+  case X86::ADD16mi8: return X86::ADD16mi;
+  case X86::ADD32ri8: return X86::ADD32ri;
+  case X86::ADD32mi8: return X86::ADD32mi;
+  case X86::ADD64ri8: return X86::ADD64ri32;
+  case X86::ADD64mi8: return X86::ADD64mi32;
+
+   // ADC
+  case X86::ADC16ri8: return X86::ADC16ri;
+  case X86::ADC16mi8: return X86::ADC16mi;
+  case X86::ADC32ri8: return X86::ADC32ri;
+  case X86::ADC32mi8: return X86::ADC32mi;
+  case X86::ADC64ri8: return X86::ADC64ri32;
+  case X86::ADC64mi8: return X86::ADC64mi32;
+
+    // SUB
+  case X86::SUB16ri8: return X86::SUB16ri;
+  case X86::SUB16mi8: return X86::SUB16mi;
+  case X86::SUB32ri8: return X86::SUB32ri;
+  case X86::SUB32mi8: return X86::SUB32mi;
+  case X86::SUB64ri8: return X86::SUB64ri32;
+  case X86::SUB64mi8: return X86::SUB64mi32;
+
+   // SBB
+  case X86::SBB16ri8: return X86::SBB16ri;
+  case X86::SBB16mi8: return X86::SBB16mi;
+  case X86::SBB32ri8: return X86::SBB32ri;
+  case X86::SBB32mi8: return X86::SBB32mi;
+  case X86::SBB64ri8: return X86::SBB64ri32;
+  case X86::SBB64mi8: return X86::SBB64mi32;
+
+    // CMP
+  case X86::CMP16ri8: return X86::CMP16ri;
+  case X86::CMP16mi8: return X86::CMP16mi;
+  case X86::CMP32ri8: return X86::CMP32ri;
+  case X86::CMP32mi8: return X86::CMP32mi;
+  case X86::CMP64ri8: return X86::CMP64ri32;
+  case X86::CMP64mi8: return X86::CMP64mi32;
+
+    // PUSH
+  case X86::PUSH32i8:  return X86::PUSHi32;
+  case X86::PUSH16i8:  return X86::PUSHi16;
+  case X86::PUSH64i8:  return X86::PUSH64i32;
+  }
+}
+
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
+  unsigned R = getRelaxedOpcodeArith(Inst);
+  if (R != Inst.getOpcode())
+    return R;
+  return getRelaxedOpcodeBranch(Inst, is16BitMode);
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  // Branches can always be relaxed in either mode.
+  if (getRelaxedOpcodeBranch(Inst, false) != Inst.getOpcode())
+    return true;
+
+  // Check if this instruction is ever relaxable.
+  if (getRelaxedOpcodeArith(Inst) == Inst.getOpcode())
+    return false;
+
+
+  // Check if the relaxable operand has an expression. For the current set of
+  // relaxable instructions, the relaxable operand is always the last operand.
+  unsigned RelaxableOp = Inst.getNumOperands() - 1;
+  if (Inst.getOperand(RelaxableOp).isExpr())
+    return true;
+
+  return false;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst,
+                                     const MCSubtargetInfo &STI,
+                                     MCInst &Res) const {
+  // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
+  bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+  unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode);
+
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // This CPU doesn't support long nops. If needed add more.
+  // FIXME: Can we get this from the subtarget somehow?
+  // FIXME: We could generated something better than plain 0x90.
+  if (!HasNopl) {
+    for (uint64_t i = 0; i < Count; ++i)
+      OW->write8(0x90);
+    return true;
+  }
+
+  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
+    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+    for (uint8_t i = 0; i < Prefixes; i++)
+      OW->write8(0x66);
+    const uint8_t Rest = ThisNopLength - Prefixes;
+    for (uint8_t i = 0; i < Rest; i++)
+      OW->write8(Nops[Rest - 1][i]);
+    Count -= ThisNopLength;
+  } while (Count != 0);
+
+  return true;
+}
+
+/* *** */
+
+namespace {
+
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+  uint8_t OSABI;
+  ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : X86AsmBackend(T, CPU), OSABI(OSABI) {}
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+  }
+};
+
+class ELFX86_X32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+                                    ELF::EM_X86_64);
+  }
+};
+
+class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+      : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
+                                    ELF::EM_IAMCU);
+  }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+  }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+  bool Is64Bit;
+
+public:
+  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+    : X86AsmBackend(T, CPU)
+    , Is64Bit(is64Bit) {
+  }
+
+  Optional<MCFixupKind> getFixupKind(StringRef Name) const override {
+    return StringSwitch<Optional<MCFixupKind>>(Name)
+        .Case("dir32", FK_Data_4)
+        .Case("secrel32", FK_SecRel_4)
+        .Case("secidx", FK_SecRel_2)
+        .Default(MCAsmBackend::getFixupKind(Name));
+  }
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86WinCOFFObjectWriter(OS, Is64Bit);
+  }
+};
+
+namespace CU {
+
+  /// Compact unwind encoding values.
+  enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+    /// the return address, then [RE]SP is moved to [RE]BP.
+    UNWIND_MODE_BP_FRAME                   = 0x01000000,
+
+    /// A frameless function with a small constant stack size.
+    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
+
+    /// A frameless function with a large constant stack size.
+    UNWIND_MODE_STACK_IND                  = 0x03000000,
+
+    /// No compact unwind encoding is available.
+    UNWIND_MODE_DWARF                      = 0x04000000,
+
+    /// Mask for encoding the frame registers.
+    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
+
+    /// Mask for encoding the frameless registers.
+    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+  };
+
+} // end CU namespace
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Number of registers that can be saved in a compact unwind encoding.
+  enum { CU_NUM_SAVED_REGS = 6 };
+
+  mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+  bool Is64Bit;
+
+  unsigned OffsetSize;                   ///< Offset of a "push" instruction.
+  unsigned MoveInstrSize;                ///< Size of a "move" instruction.
+  unsigned StackDivide;                  ///< Amount to adjust stack size by.
+protected:
+  /// \brief Size of a "push" instruction for the given register.
+  unsigned PushInstrSize(unsigned Reg) const {
+    switch (Reg) {
+      case X86::EBX:
+      case X86::ECX:
+      case X86::EDX:
+      case X86::EDI:
+      case X86::ESI:
+      case X86::EBP:
+      case X86::RBX:
+      case X86::RBP:
+        return 1;
+      case X86::R12:
+      case X86::R13:
+      case X86::R14:
+      case X86::R15:
+        return 2;
+    }
+    return 1;
+  }
+
+  /// \brief Implementation of algorithm to generate the compact unwind encoding
+  /// for the CFI instructions.
+  uint32_t
+  generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+    if (Instrs.empty()) return 0;
+
+    // Reset the saved registers.
+    unsigned SavedRegIdx = 0;
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+
+    bool HasFP = false;
+
+    // Encode that we are using EBP/RBP as the frame pointer.
+    uint32_t CompactUnwindEncoding = 0;
+
+    unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+    unsigned InstrOffset = 0;
+    unsigned StackAdjust = 0;
+    unsigned StackSize = 0;
+    unsigned PrevStackSize = 0;
+    unsigned NumDefCFAOffsets = 0;
+
+    for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Any other CFI directives indicate a frame that we aren't prepared
+        // to represent via compact unwind, so just bail out.
+        return 0;
+      case MCCFIInstruction::OpDefCfaRegister: {
+        // Defines a frame pointer. E.g.
+        //
+        //     movq %rsp, %rbp
+        //  L0:
+        //     .cfi_def_cfa_register %rbp
+        //
+        HasFP = true;
+
+        // If the frame pointer is other than esp/rsp, we do not have a way to
+        // generate a compact unwinding representation, so bail out.
+        if (MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+            (Is64Bit ? X86::RBP : X86::EBP))
+          return 0;
+
+        // Reset the counts.
+        memset(SavedRegs, 0, sizeof(SavedRegs));
+        StackAdjust = 0;
+        SavedRegIdx = 0;
+        InstrOffset += MoveInstrSize;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        // Defines a new offset for the CFA. E.g.
+        //
+        //  With frame:
+        //
+        //     pushq %rbp
+        //  L0:
+        //     .cfi_def_cfa_offset 16
+        //
+        //  Without frame:
+        //
+        //     subq $72, %rsp
+        //  L0:
+        //     .cfi_def_cfa_offset 80
+        //
+        PrevStackSize = StackSize;
+        StackSize = std::abs(Inst.getOffset()) / StackDivide;
+        ++NumDefCFAOffsets;
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Defines a "push" of a callee-saved register. E.g.
+        //
+        //     pushq %r15
+        //     pushq %r14
+        //     pushq %rbx
+        //  L0:
+        //     subq $120, %rsp
+        //  L1:
+        //     .cfi_offset %rbx, -40
+        //     .cfi_offset %r14, -32
+        //     .cfi_offset %r15, -24
+        //
+        if (SavedRegIdx == CU_NUM_SAVED_REGS)
+          // If there are too many saved registers, we cannot use a compact
+          // unwind encoding.
+          return CU::UNWIND_MODE_DWARF;
+
+        unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        SavedRegs[SavedRegIdx++] = Reg;
+        StackAdjust += OffsetSize;
+        InstrOffset += PushInstrSize(Reg);
+        break;
+      }
+      }
+    }
+
+    StackAdjust /= StackDivide;
+
+    if (HasFP) {
+      if ((StackAdjust & 0xFF) != StackAdjust)
+        // Offset was too big for a compact unwind encoding.
+        return CU::UNWIND_MODE_DWARF;
+
+      // Get the encoding of the saved registers when we have a frame pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+      CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+      CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+    } else {
+      // If the amount of the stack allocation is the size of a register, then
+      // we "push" the RAX/EAX register onto the stack instead of adjusting the
+      // stack pointer with a SUB instruction. We don't support the push of the
+      // RAX/EAX register with compact unwind. So we check for that situation
+      // here.
+      if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+           StackSize - PrevStackSize == 1) ||
+          (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+        return CU::UNWIND_MODE_DWARF;
+
+      SubtractInstrIdx += InstrOffset;
+      ++StackAdjust;
+
+      if ((StackSize & 0xFF) == StackSize) {
+        // Frameless stack with a small stack size.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+        // Encode the stack size.
+        CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+      } else {
+        if ((StackAdjust & 0x7) != StackAdjust)
+          // The extra stack adjustments are too big for us to handle.
+          return CU::UNWIND_MODE_DWARF;
+
+        // Frameless stack with an offset too large for us to encode compactly.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+        // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+        // instruction.
+        CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+        // Encode any extra stack stack adjustments (done via push
+        // instructions).
+        CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+      }
+
+      // Encode the number of registers saved. (Reverse the list first.)
+      std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+      CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+      // Get the encoding of the saved registers when we don't have a frame
+      // pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      // Encode the register encoding.
+      CompactUnwindEncoding |=
+        RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+    }
+
+    return CompactUnwindEncoding;
+  }
+
+private:
+  /// \brief Get the compact unwind number for a given register. The number
+  /// corresponds to the enum lists in compact_unwind_encoding.h.
+  int getCompactUnwindRegNum(unsigned Reg) const {
+    static const MCPhysReg CU32BitRegs[7] = {
+      X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+    };
+    static const MCPhysReg CU64BitRegs[] = {
+      X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+    };
+    const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+    for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+      if (*CURegs == Reg)
+        return Idx;
+
+    return -1;
+  }
+
+  /// \brief Return the registers encoded for a compact encoding with a frame
+  /// pointer.
+  uint32_t encodeCompactUnwindRegistersWithFrame() const {
+    // Encode the registers in the order they were saved --- 3-bits per
+    // register. The list of saved registers is assumed to be in reverse
+    // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+    uint32_t RegEnc = 0;
+    for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      unsigned Reg = SavedRegs[i];
+      if (Reg == 0) break;
+
+      int CURegNum = getCompactUnwindRegNum(Reg);
+      if (CURegNum == -1) return ~0U;
+
+      // Encode the 3-bit register number in order, skipping over 3-bits for
+      // each register.
+      RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+    }
+
+    assert((RegEnc & 0x3FFFF) == RegEnc &&
+           "Invalid compact register encoding!");
+    return RegEnc;
+  }
+
+  /// \brief Create the permutation encoding used with frameless stacks. It is
+  /// passed the number of registers to be saved and an array of the registers
+  /// saved.
+  uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+    // The saved registers are numbered from 1 to 6. In order to encode the
+    // order in which they were saved, we re-number them according to their
+    // place in the register order. The re-numbering is relative to the last
+    // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+    // that order:
+    //
+    //    Orig  Re-Num
+    //    ----  ------
+    //     6       6
+    //     2       2
+    //     4       3
+    //     5       3
+    //
+    for (unsigned i = 0; i < RegCount; ++i) {
+      int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+      if (CUReg == -1) return ~0U;
+      SavedRegs[i] = CUReg;
+    }
+
+    // Reverse the list.
+    std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+    uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+    for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+      unsigned Countless = 0;
+      for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+        if (SavedRegs[j] < SavedRegs[i])
+          ++Countless;
+
+      RenumRegs[i] = SavedRegs[i] - Countless - 1;
+    }
+
+    // Take the renumbered values and encode them into a 10-bit number.
+    uint32_t permutationEncoding = 0;
+    switch (RegCount) {
+    case 6:
+      permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                             + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                             +     RenumRegs[4];
+      break;
+    case 5:
+      permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                             + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 4:
+      permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                             + 3 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 3:
+      permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 2:
+      permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 1:
+      permutationEncoding |=       RenumRegs[5];
+      break;
+    }
+
+    assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+           "Invalid compact register encoding!");
+    return permutationEncoding;
+  }
+
+public:
+  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+                      bool Is64Bit)
+    : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+    OffsetSize = Is64Bit ? 8 : 4;
+    MoveInstrSize = Is64Bit ? 3 : 2;
+    StackDivide = Is64Bit ? 8 : 4;
+  }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU)
+      : DarwinX86AsmBackend(T, MRI, CPU, false) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+                                     MachO::CPU_TYPE_I386,
+                                     MachO::CPU_SUBTYPE_I386_ALL);
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    return generateCompactUnwindEncodingImpl(Instrs);
+  }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+  const MachO::CPUSubTypeX86 Subtype;
+public:
+  DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, MachO::CPUSubTypeX86 st)
+      : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+                                     MachO::CPU_TYPE_X86_64, Subtype);
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    return generateCompactUnwindEncodingImpl(Instrs);
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           const Triple &TheTriple,
+                                           StringRef CPU,
+                                           const MCTargetOptions &Options) {
+  if (TheTriple.isOSBinFormatMachO())
+    return new DarwinX86_32AsmBackend(T, MRI, CPU);
+
+  if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+    return new WindowsX86AsmBackend(T, false, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+  if (TheTriple.isOSIAMCU())
+    return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU);
+
+  return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           const Triple &TheTriple,
+                                           StringRef CPU,
+                                           const MCTargetOptions &Options) {
+  if (TheTriple.isOSBinFormatMachO()) {
+    MachO::CPUSubTypeX86 CS =
+        StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+            .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+            .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+    return new DarwinX86_64AsmBackend(T, MRI, CPU, CS);
+  }
+
+  if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
+    return new WindowsX86AsmBackend(T, true, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+
+  if (TheTriple.getEnvironment() == Triple::GNUX32)
+    return new ELFX86_X32AsmBackend(T, OSABI, CPU);
+  return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 000000000000..aab552547fac
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,784 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+  // Enums for memory operand decoding.  Each memory operand is represented with
+  // a 5 operand sequence in the form:
+  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+  // These enums help decode this.
+  enum {
+    AddrBaseReg = 0,
+    AddrScaleAmt = 1,
+    AddrIndexReg = 2,
+    AddrDisp = 3,
+
+    /// AddrSegmentReg - The operand # of the segment in the memory operand.
+    AddrSegmentReg = 4,
+
+    /// AddrNumOperands - Total number of operands in a memory reference.
+    AddrNumOperands = 5
+  };
+
+  /// AVX512 static rounding constants.  These need to match the values in
+  /// avx512fintrin.h.
+  enum STATIC_ROUNDING {
+    TO_NEAREST_INT = 0,
+    TO_NEG_INF = 1,
+    TO_POS_INF = 2,
+    TO_ZERO = 3,
+    CUR_DIRECTION = 4
+  };
+} // end namespace X86;
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // X86 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+    /// relocation of:
+    ///    SYMBOL_LABEL + [. - PICBASELABEL]
+    MO_GOT_ABSOLUTE_ADDRESS,
+
+    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+    /// immediate should get the value of the symbol minus the PIC base label:
+    ///    SYMBOL_LABEL - PICBASELABEL
+    MO_PIC_BASE_OFFSET,
+
+    /// MO_GOT - On a symbol operand this indicates that the immediate is the
+    /// offset to the GOT entry for the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOT
+    MO_GOT,
+
+    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+    /// the offset to the location of the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTOFF
+    MO_GOTOFF,
+
+    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+    /// offset to the GOT entry for the symbol name from the current code
+    /// location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTPCREL
+    MO_GOTPCREL,
+
+    /// MO_PLT - On a symbol operand this indicates that the immediate is
+    /// offset to the PLT entry of symbol name from the current code location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @PLT
+    MO_PLT,
+
+    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index structure that contains
+    /// the module number and variable offset for the symbol. Used in the
+    /// general dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSGD
+    MO_TLSGD,
+
+    /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// __tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLD
+    MO_TLSLD,
+
+    /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// ___tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLDM
+    MO_TLSLDM,
+
+    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the thread-pointer offset for the
+    /// symbol. Used in the x86-64 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTTPOFF
+    MO_GOTTPOFF,
+
+    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the absolute address of the GOT entry with the negative thread-pointer
+    /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+    /// model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @INDNTPOFF
+    MO_INDNTPOFF,
+
+    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+    /// the thread-pointer offset for the symbol. Used in the x86-64 local
+    /// exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TPOFF
+    MO_TPOFF,
+
+    /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS offset of the symbol. Used
+    /// in the local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @DTPOFF
+    MO_DTPOFF,
+
+    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+    /// the negative thread-pointer offset for the symbol. Used in the IA32
+    /// local exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @NTPOFF
+    MO_NTPOFF,
+
+    /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the negative thread-pointer offset for
+    /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTNTPOFF
+    MO_GOTNTPOFF,
+
+    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "__imp_FOO" symbol.  This is used for
+    /// dllimport linkage on windows.
+    MO_DLLIMPORT,
+
+    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY,
+
+    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY_PIC_BASE,
+
+    /// MO_TLVP - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// This is the TLS offset for the Darwin TLS mechanism.
+    MO_TLVP,
+
+    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+    /// is some TLS offset from the picbase.
+    ///
+    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+    MO_TLVP_PIC_BASE,
+
+    /// MO_SECREL - On a symbol operand this indicates that the immediate is
+    /// the offset from beginning of section.
+    ///
+    /// This is the TLS offset for the COFF/Windows TLS mechanism.
+    MO_SECREL
+  };
+
+  enum : uint64_t {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// RawFrmMemOffs - This form is for instructions that store an absolute
+    /// memory offset as an immediate with a possible segment override.
+    RawFrmMemOffs  = 3,
+
+    /// RawFrmSrc - This form is for instructions that use the source index
+    /// register SI/ESI/RSI with a possible segment override.
+    RawFrmSrc      = 4,
+
+    /// RawFrmDst - This form is for instructions that use the destination index
+    /// register DI/EDI/ESI.
+    RawFrmDst      = 5,
+
+    /// RawFrmSrc - This form is for instructions that use the source index
+    /// register SI/ESI/ERI with a possible segment override, and also the
+    /// destination index register DI/ESI/RDI.
+    RawFrmDstSrc   = 6,
+
+    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+    /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is a 8-bit fixed value.
+    RawFrmImm8 = 7,
+
+    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+    /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+    /// the imm encoding) and the second is a 16-bit fixed value.  In the AMD
+    /// manual, this operand is described as pntr16:32 and pntr16:16
+    RawFrmImm16 = 8,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 32,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 33,
+
+    /// MRMSrcMem4VOp3 - This form is used for instructions that encode
+    /// operand 3 with VEX.VVVV and load from memory.
+    ///
+    MRMSrcMem4VOp3 = 34,
+
+    /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
+    /// byte to specify the fourth source, which in this case is memory.
+    ///
+    MRMSrcMemOp4   = 35,
+
+    /// MRMXm - This form is used for instructions that use the Mod/RM byte
+    /// to specify a memory source, but doesn't use the middle field.
+    ///
+    MRMXm = 39, // Instruction that uses Mod/RM but not the middle field.
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 40,  MRM1m = 41,  MRM2m = 42,  MRM3m = 43, // Format /0 /1 /2 /3
+    MRM4m = 44,  MRM5m = 45,  MRM6m = 46,  MRM7m = 47, // Format /4 /5 /6 /7
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 48,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 49,
+
+    /// MRMSrcReg4VOp3 - This form is used for instructions that encode
+    /// operand 3 with VEX.VVVV and do not load from memory.
+    ///
+    MRMSrcReg4VOp3 = 50,
+
+    /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
+    /// byte to specify the fourth source, which in this case is a register.
+    ///
+    MRMSrcRegOp4   = 51,
+
+    /// MRMXr - This form is used for instructions that use the Mod/RM byte
+    /// to specify a register source, but doesn't use the middle field.
+    ///
+    MRMXr = 55, // Instruction that uses Mod/RM but not the middle field.
+
+    // Instructions that operate on a register r/m operand...
+    MRM0r = 56,  MRM1r = 57,  MRM2r = 58,  MRM3r = 59, // Format /0 /1 /2 /3
+    MRM4r = 60,  MRM5r = 61,  MRM6r = 62,  MRM7r = 63, // Format /4 /5 /6 /7
+
+    /// MRM_XX - A mod/rm byte of exactly 0xXX.
+    MRM_C0 = 64,  MRM_C1 = 65,  MRM_C2 = 66,  MRM_C3 = 67,
+    MRM_C4 = 68,  MRM_C5 = 69,  MRM_C6 = 70,  MRM_C7 = 71,
+    MRM_C8 = 72,  MRM_C9 = 73,  MRM_CA = 74,  MRM_CB = 75,
+    MRM_CC = 76,  MRM_CD = 77,  MRM_CE = 78,  MRM_CF = 79,
+    MRM_D0 = 80,  MRM_D1 = 81,  MRM_D2 = 82,  MRM_D3 = 83,
+    MRM_D4 = 84,  MRM_D5 = 85,  MRM_D6 = 86,  MRM_D7 = 87,
+    MRM_D8 = 88,  MRM_D9 = 89,  MRM_DA = 90,  MRM_DB = 91,
+    MRM_DC = 92,  MRM_DD = 93,  MRM_DE = 94,  MRM_DF = 95,
+    MRM_E0 = 96,  MRM_E1 = 97,  MRM_E2 = 98,  MRM_E3 = 99,
+    MRM_E4 = 100, MRM_E5 = 101, MRM_E6 = 102, MRM_E7 = 103,
+    MRM_E8 = 104, MRM_E9 = 105, MRM_EA = 106, MRM_EB = 107,
+    MRM_EC = 108, MRM_ED = 109, MRM_EE = 110, MRM_EF = 111,
+    MRM_F0 = 112, MRM_F1 = 113, MRM_F2 = 114, MRM_F3 = 115,
+    MRM_F4 = 116, MRM_F5 = 117, MRM_F6 = 118, MRM_F7 = 119,
+    MRM_F8 = 120, MRM_F9 = 121, MRM_FA = 122, MRM_FB = 123,
+    MRM_FC = 124, MRM_FD = 125, MRM_FE = 126, MRM_FF = 127,
+
+    FormMask       = 127,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
+    // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
+    // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
+    // prefix in 16-bit mode.
+    OpSizeShift = 7,
+    OpSizeMask = 0x3 << OpSizeShift,
+
+    OpSizeFixed = 0 << OpSizeShift,
+    OpSize16    = 1 << OpSizeShift,
+    OpSize32    = 2 << OpSizeShift,
+
+    // AsSize - AdSizeX implies this instruction determines its need of 0x67
+    // prefix from a normal ModRM memory operand. The other types indicate that
+    // an operand is encoded with a specific width and a prefix is needed if
+    // it differs from the current mode.
+    AdSizeShift = OpSizeShift + 2,
+    AdSizeMask  = 0x3 << AdSizeShift,
+
+    AdSizeX  = 1 << AdSizeShift,
+    AdSize16 = 1 << AdSizeShift,
+    AdSize32 = 2 << AdSizeShift,
+    AdSize64 = 3 << AdSizeShift,
+
+    //===------------------------------------------------------------------===//
+    // OpPrefix - There are several prefix bytes that are used as opcode
+    // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is
+    // no prefix.
+    //
+    OpPrefixShift = AdSizeShift + 2,
+    OpPrefixMask  = 0x7 << OpPrefixShift,
+
+    // PS, PD - Prefix code for packed single and double precision vector
+    // floating point operations performed in the SSE registers.
+    PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XS = 3 << OpPrefixShift,  XD = 4 << OpPrefixShift,
+
+    //===------------------------------------------------------------------===//
+    // OpMap - This field determines which opcode map this instruction
+    // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc.
+    //
+    OpMapShift = OpPrefixShift + 3,
+    OpMapMask  = 0x7 << OpMapShift,
+
+    // OB - OneByte - Set if this instruction has a one byte opcode.
+    OB = 0 << OpMapShift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB = 1 << OpMapShift,
+
+    // T8, TA - Prefix after the 0x0F prefix.
+    T8 = 2 << OpMapShift,  TA = 3 << OpMapShift,
+
+    // XOP8 - Prefix to include use of imm byte.
+    XOP8 = 4 << OpMapShift,
+
+    // XOP9 - Prefix to exclude use of imm byte.
+    XOP9 = 5 << OpMapShift,
+
+    // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+    XOPA = 6 << OpMapShift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only cares about REX.W and REX.R bits and only the former is
+    // statically determined.
+    //
+    REXShift    = OpMapShift + 3,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = REXShift + 1,
+    ImmMask    = 15 << ImmShift,
+    Imm8       = 1 << ImmShift,
+    Imm8PCRel  = 2 << ImmShift,
+    Imm8Reg    = 3 << ImmShift,
+    Imm16      = 4 << ImmShift,
+    Imm16PCRel = 5 << ImmShift,
+    Imm32      = 6 << ImmShift,
+    Imm32PCRel = 7 << ImmShift,
+    Imm32S     = 8 << ImmShift,
+    Imm64      = 9 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = ImmShift + 4,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+    // result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Lock prefix
+    LOCKShift = FPTypeShift + 3,
+    LOCK = 1 << LOCKShift,
+
+    // REP prefix
+    REPShift = LOCKShift + 1,
+    REP = 1 << REPShift,
+
+    // Execution domain for SSE instructions.
+    // 0 means normal, non-SSE instruction.
+    SSEDomainShift = REPShift + 1,
+
+    // Encoding
+    EncodingShift = SSEDomainShift + 2,
+    EncodingMask = 0x3 << EncodingShift,
+
+    // VEX - encoding using 0xC4/0xC5
+    VEX = 1 << EncodingShift,
+
+    /// XOP - Opcode prefix used by XOP instructions.
+    XOP = 2 << EncodingShift,
+
+    // VEX_EVEX - Specifies that this instruction use EVEX form which provides
+    // syntax support up to 32 512-bit register operands and up to 7 16-bit
+    // mask operands as well as source operand data swizzling/memory operand
+    // conversion, eviction hint, and rounding mode.
+    EVEX = 3 << EncodingShift,
+
+    // Opcode
+    OpcodeShift   = EncodingShift + 2,
+
+    /// VEX_W - Has a opcode specific functionality, but is used in the same
+    /// way as REX_W is for regular SSE instructions.
+    VEX_WShift  = OpcodeShift + 8,
+    VEX_W       = 1ULL << VEX_WShift,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    /// address instructions in SSE are represented as 3 address ones in AVX
+    /// and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4VShift = VEX_WShift + 1,
+    VEX_4V      = 1ULL << VEX_4VShift,
+
+    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using a f256 memory references.
+    VEX_LShift = VEX_4VShift + 1,
+    VEX_L       = 1ULL << VEX_LShift,
+
+    // EVEX_K - Set if this instruction requires masking
+    EVEX_KShift = VEX_LShift + 1,
+    EVEX_K      = 1ULL << EVEX_KShift,
+
+    // EVEX_Z - Set if this instruction has EVEX.Z field set.
+    EVEX_ZShift = EVEX_KShift + 1,
+    EVEX_Z      = 1ULL << EVEX_ZShift,
+
+    // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+    EVEX_L2Shift = EVEX_ZShift + 1,
+    EVEX_L2     = 1ULL << EVEX_L2Shift,
+
+    // EVEX_B - Set if this instruction has EVEX.B field set.
+    EVEX_BShift = EVEX_L2Shift + 1,
+    EVEX_B      = 1ULL << EVEX_BShift,
+
+    // The scaling factor for the AVX512's 8-bit compressed displacement.
+    CD8_Scale_Shift = EVEX_BShift + 1,
+    CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
+
+    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
+    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+    /// storing a classifier in the imm8 field.  To simplify our implementation,
+    /// we handle this by storeing the classifier in the opcode field and using
+    /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+    Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
+    Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
+
+    /// Explicitly specified rounding control
+    EVEX_RCShift = Has3DNow0F0FOpcodeShift + 1,
+    EVEX_RC = 1ULL << EVEX_RCShift
+  };
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified machine instruction.
+  //
+  inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+    return TSFlags >> X86II::OpcodeShift;
+  }
+
+  inline bool hasImm(uint64_t TSFlags) {
+    return (TSFlags & X86II::ImmMask) != 0;
+  }
+
+  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+  /// of the specified instruction.
+  inline unsigned getSizeOfImm(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:
+    case X86II::Imm8Reg:    return 1;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
+    case X86II::Imm32:
+    case X86II::Imm32S:
+    case X86II::Imm32PCRel: return 4;
+    case X86II::Imm64:      return 8;
+    }
+  }
+
+  /// isImmPCRel - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is pc relative.
+  inline unsigned isImmPCRel(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm8Reg:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm32S:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// isImmSigned - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is signed.
+  inline unsigned isImmSigned(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate signedness");
+    case X86II::Imm32S:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:
+    case X86II::Imm8Reg:
+    case X86II::Imm16:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32:
+    case X86II::Imm32PCRel:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getOperandBias - compute any additional adjustment needed to
+  ///                  the offset to the start of the memory operand
+  ///                  in this instruction.
+  /// If this is a two-address instruction,skip one of the register operands.
+  /// FIXME: This should be handled during MCInst lowering.
+  inline unsigned getOperandBias(const MCInstrDesc& Desc)
+  {
+    unsigned NumOps = Desc.getNumOperands();
+    if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+      return 1;
+    if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+        Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+      // Special case for AVX-512 GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      return 2;
+    if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+        Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
+      // Special case for GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      return 2;
+    if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+      // SCATTER
+      return 1;
+    return 0;
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  inline int getMemoryOperandNo(uint64_t TSFlags) {
+    bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+    bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+
+    switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::RawFrmImm8:
+    case X86II::RawFrmImm16:
+    case X86II::RawFrmMemOffs:
+    case X86II::RawFrmSrc:
+    case X86II::RawFrmDst:
+    case X86II::RawFrmDstSrc:
+      return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem:
+      // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
+      // mask register.
+      return 1 + HasVEX_4V + HasEVEX_K;
+    case X86II::MRMSrcMem4VOp3:
+      // Skip registers encoded in reg.
+      return 1 + HasEVEX_K;
+    case X86II::MRMSrcMemOp4:
+      // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
+      return 3;
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+    case X86II::MRMSrcReg4VOp3:
+    case X86II::MRMSrcRegOp4:
+    case X86II::MRMXr:
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      return -1;
+    case X86II::MRMXm:
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+      // Start from 0, skip registers encoded in VEX_VVVV or a mask register.
+      return 0 + HasVEX_4V + HasEVEX_K;
+    case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+    case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+    case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+    case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+    case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+    case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+    case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+    case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+    case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+    case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+    case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+    case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+    case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+    case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+    case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+    case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+    case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+    case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+    case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+    case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+    case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+    case X86II::MRM_FF:
+      return -1;
+    }
+  }
+
+  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
+  inline bool isX86_64ExtendedReg(unsigned RegNo) {
+    if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) ||
+        (RegNo >= X86::YMM8 && RegNo <= X86::YMM31) ||
+        (RegNo >= X86::ZMM8 && RegNo <= X86::ZMM31))
+      return true;
+
+    switch (RegNo) {
+    default: break;
+    case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
+    case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
+    case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
+    case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
+    case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
+    case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
+    case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
+    case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
+    case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
+    case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
+    case X86::DR8:   case X86::DR9:   case X86::DR10:  case X86::DR11:
+    case X86::DR12:  case X86::DR13:  case X86::DR14:  case X86::DR15:
+      return true;
+    }
+    return false;
+  }
+
+  /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
+  /// registers? e.g. zmm21, etc.
+  static inline bool is32ExtendedReg(unsigned RegNo) {
+    return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) ||
+            (RegNo >= X86::YMM16 && RegNo <= X86::YMM31) ||
+            (RegNo >= X86::ZMM16 && RegNo <= X86::ZMM31));
+  }
+
+
+  inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+    return (reg == X86::SPL || reg == X86::BPL ||
+            reg == X86::SIL || reg == X86::DIL);
+  }
+
+  /// isKMasked - Is this a masked instruction.
+  inline bool isKMasked(uint64_t TSFlags) {
+    return (TSFlags & X86II::EVEX_K) != 0;
+  }
+
+  /// isKMergedMasked - Is this a merge masked instruction.
+  inline bool isKMergeMasked(uint64_t TSFlags) {
+    return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0;
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 000000000000..da69da51df10
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,303 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+    ~X86ELFObjectWriter() override;
+
+  protected:
+    unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                          const MCFixup &Fixup, bool IsPCRel) const override;
+  };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+                                       uint16_t EMachine)
+    : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+                              // Only i386 and IAMCU use Rel instead of RelA.
+                              /*HasRelocationAddend*/
+                              (EMachine != ELF::EM_386) &&
+                                  (EMachine != ELF::EM_IAMCU)) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter()
+{}
+
+enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
+
+static X86_64RelType getType64(unsigned Kind,
+                               MCSymbolRefExpr::VariantKind &Modifier,
+                               bool &IsPCRel) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unimplemented");
+  case X86::reloc_global_offset_table8:
+    Modifier = MCSymbolRefExpr::VK_GOT;
+    IsPCRel = true;
+    return RT64_64;
+  case FK_Data_8:
+    return RT64_64;
+  case X86::reloc_signed_4byte:
+  case X86::reloc_signed_4byte_relax:
+    if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel)
+      return RT64_32S;
+    return RT64_32;
+  case X86::reloc_global_offset_table:
+    Modifier = MCSymbolRefExpr::VK_GOT;
+    IsPCRel = true;
+    return RT64_32;
+  case FK_Data_4:
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+  case X86::reloc_riprel_4byte_movq_load:
+    return RT64_32;
+  case FK_PCRel_2:
+  case FK_Data_2:
+    return RT64_16;
+  case FK_PCRel_1:
+  case FK_Data_1:
+    return RT64_8;
+  }
+}
+
+static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+  if (Type != RT64_32)
+    Ctx.reportError(Loc,
+                    "32 bit reloc applied to a field with a different size");
+}
+
+static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
+                               MCSymbolRefExpr::VariantKind Modifier,
+                               X86_64RelType Type, bool IsPCRel,
+                               unsigned Kind) {
+  switch (Modifier) {
+  default:
+    llvm_unreachable("Unimplemented");
+  case MCSymbolRefExpr::VK_None:
+    switch (Type) {
+    case RT64_64:
+      return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
+    case RT64_32:
+      return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32;
+    case RT64_32S:
+      return ELF::R_X86_64_32S;
+    case RT64_16:
+      return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16;
+    case RT64_8:
+      return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8;
+    }
+  case MCSymbolRefExpr::VK_GOT:
+    switch (Type) {
+    case RT64_64:
+      return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64;
+    case RT64_32:
+      return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32;
+    case RT64_32S:
+    case RT64_16:
+    case RT64_8:
+      llvm_unreachable("Unimplemented");
+    }
+  case MCSymbolRefExpr::VK_GOTOFF:
+    assert(Type == RT64_64);
+    assert(!IsPCRel);
+    return ELF::R_X86_64_GOTOFF64;
+  case MCSymbolRefExpr::VK_TPOFF:
+    assert(!IsPCRel);
+    switch (Type) {
+    case RT64_64:
+      return ELF::R_X86_64_TPOFF64;
+    case RT64_32:
+      return ELF::R_X86_64_TPOFF32;
+    case RT64_32S:
+    case RT64_16:
+    case RT64_8:
+      llvm_unreachable("Unimplemented");
+    }
+  case MCSymbolRefExpr::VK_DTPOFF:
+    assert(!IsPCRel);
+    switch (Type) {
+    case RT64_64:
+      return ELF::R_X86_64_DTPOFF64;
+    case RT64_32:
+      return ELF::R_X86_64_DTPOFF32;
+    case RT64_32S:
+    case RT64_16:
+    case RT64_8:
+      llvm_unreachable("Unimplemented");
+    }
+  case MCSymbolRefExpr::VK_SIZE:
+    assert(!IsPCRel);
+    switch (Type) {
+    case RT64_64:
+      return ELF::R_X86_64_SIZE64;
+    case RT64_32:
+      return ELF::R_X86_64_SIZE32;
+    case RT64_32S:
+    case RT64_16:
+    case RT64_8:
+      llvm_unreachable("Unimplemented");
+    }
+  case MCSymbolRefExpr::VK_TLSCALL:
+    return ELF::R_X86_64_TLSDESC_CALL;
+  case MCSymbolRefExpr::VK_TLSDESC:
+    return ELF::R_X86_64_GOTPC32_TLSDESC;
+  case MCSymbolRefExpr::VK_TLSGD:
+    checkIs32(Ctx, Loc, Type);
+    return ELF::R_X86_64_TLSGD;
+  case MCSymbolRefExpr::VK_GOTTPOFF:
+    checkIs32(Ctx, Loc, Type);
+    return ELF::R_X86_64_GOTTPOFF;
+  case MCSymbolRefExpr::VK_TLSLD:
+    checkIs32(Ctx, Loc, Type);
+    return ELF::R_X86_64_TLSLD;
+  case MCSymbolRefExpr::VK_PLT:
+    checkIs32(Ctx, Loc, Type);
+    return ELF::R_X86_64_PLT32;
+  case MCSymbolRefExpr::VK_GOTPCREL:
+    checkIs32(Ctx, Loc, Type);
+    // Older versions of ld.bfd/ld.gold/lld
+    // do not support GOTPCRELX/REX_GOTPCRELX,
+    // and we want to keep back-compatibility.
+    if (!Ctx.getAsmInfo()->canRelaxRelocations())
+      return ELF::R_X86_64_GOTPCREL;
+    switch (Kind) {
+    default:
+      return ELF::R_X86_64_GOTPCREL;
+    case X86::reloc_riprel_4byte_relax:
+      return ELF::R_X86_64_GOTPCRELX;
+    case X86::reloc_riprel_4byte_relax_rex:
+    case X86::reloc_riprel_4byte_movq_load:
+      return ELF::R_X86_64_REX_GOTPCRELX;
+    }
+  }
+}
+
+enum X86_32RelType { RT32_32, RT32_16, RT32_8 };
+
+static X86_32RelType getType32(X86_64RelType T) {
+  switch (T) {
+  case RT64_64:
+    llvm_unreachable("Unimplemented");
+  case RT64_32:
+  case RT64_32S:
+    return RT32_32;
+  case RT64_16:
+    return RT32_16;
+  case RT64_8:
+    return RT32_8;
+  }
+  llvm_unreachable("unexpected relocation type!");
+}
+
+static unsigned getRelocType32(MCContext &Ctx,
+                               MCSymbolRefExpr::VariantKind Modifier,
+                               X86_32RelType Type, bool IsPCRel,
+                               unsigned Kind) {
+  switch (Modifier) {
+  default:
+    llvm_unreachable("Unimplemented");
+  case MCSymbolRefExpr::VK_None:
+    switch (Type) {
+    case RT32_32:
+      return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
+    case RT32_16:
+      return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16;
+    case RT32_8:
+      return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8;
+    }
+  case MCSymbolRefExpr::VK_GOT:
+    assert(Type == RT32_32);
+    if (IsPCRel)
+      return ELF::R_386_GOTPC;
+    // Older versions of ld.bfd/ld.gold/lld do not support R_386_GOT32X and we
+    // want to maintain compatibility.
+    if (!Ctx.getAsmInfo()->canRelaxRelocations())
+      return ELF::R_386_GOT32;
+
+    return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X
+                                                 : ELF::R_386_GOT32;
+  case MCSymbolRefExpr::VK_GOTOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_GOTOFF;
+  case MCSymbolRefExpr::VK_TPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_LE_32;
+  case MCSymbolRefExpr::VK_DTPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_LDO_32;
+  case MCSymbolRefExpr::VK_TLSGD:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_GD;
+  case MCSymbolRefExpr::VK_GOTTPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_IE_32;
+  case MCSymbolRefExpr::VK_PLT:
+    assert(Type == RT32_32);
+    return ELF::R_386_PLT32;
+  case MCSymbolRefExpr::VK_INDNTPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_IE;
+  case MCSymbolRefExpr::VK_NTPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_LE;
+  case MCSymbolRefExpr::VK_GOTNTPOFF:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_GOTIE;
+  case MCSymbolRefExpr::VK_TLSLDM:
+    assert(Type == RT32_32);
+    assert(!IsPCRel);
+    return ELF::R_386_TLS_LDM;
+  }
+}
+
+unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
+  unsigned Kind = Fixup.getKind();
+  X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
+  if (getEMachine() == ELF::EM_X86_64)
+    return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
+
+  assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) &&
+         "Unsupported ELF machine type.");
+  return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS,
+                                               bool IsELF64, uint8_t OSABI,
+                                               uint16_t EMachine) {
+  MCELFObjectTargetWriter *MOTW =
+    new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 000000000000..dfdc9ec29aec
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,40 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+  reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+  reloc_riprel_4byte_movq_load,              // 32-bit rip-relative in movq
+  reloc_riprel_4byte_relax,                  // 32-bit rip-relative in relaxable
+                                             // instruction
+  reloc_riprel_4byte_relax_rex,              // 32-bit rip-relative in relaxable
+                                             // instruction with rex prefix
+  reloc_signed_4byte,                        // 32-bit signed. Unlike FK_Data_4
+                                             // this will be sign extended at
+                                             // runtime.
+  reloc_signed_4byte_relax,                  // like reloc_signed_4byte, but
+                                             // in a relaxable instruction.
+  reloc_global_offset_table,                 // 32-bit, relative to the start
+                                             // of the instruction. Used only
+                                             // for _GLOBAL_OFFSET_TABLE_.
+  reloc_global_offset_table8,                // 64-bit variant.
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 000000000000..48a1d8f1330c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,171 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+  // Note: This numbering has to match the GCC assembler dialects for inline
+  // asm alternatives to work right.
+  ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(clEnumValN(ATT,   "att",   "Emit AT&T-style assembly"),
+             clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
+
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(true),
+  cl::desc("Mark code section jump table data regions."),
+  cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  if (is64Bit)
+    PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  if (!is64Bit)
+    Data64bitsDirective = nullptr;       // we can't emit a 64-bit unit
+
+  // Use ## as a comment string so that .s files generated by llvm can go
+  // through the GCC preprocessor without causing an error.  This is needed
+  // because "clang foo.s" runs the C preprocessor, which is usually reserved
+  // for .S files on other systems.  Perhaps this is because the file system
+  // wasn't always case preserving or something.
+  CommentString = "##";
+
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = MarkedJTDataRegions;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // old assembler lacks some directives
+  // FIXME: this should really be a check on the assembler characteristics
+  // rather than OS version
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
+    HasWeakDefCanBeHiddenDirective = false;
+
+  // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+  // (actually, must, since otherwise the non-extern relocations we produce
+  // overwhelm ld64's tiny little mind and it fails).
+  DwarfFDESymbolsUseAbsDiff = true;
+
+  UseIntegratedAssembler = true;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+  : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+  // For ELF, x86-64 pointer size depends on the ABI.
+  // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+  // with the x32 ABI, pointer size remains the default 4.
+  PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+  // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+  CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // Always enable the integrated assembler by default.
+  // Clang also enabled it when the OS is Solaris but that is redundant here.
+  UseIntegratedAssembler = true;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                                   unsigned Encoding,
+                                                   MCStreamer &Streamer) const {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+  const MCExpr *Four = MCConstantExpr::create(4, Context);
+  return MCBinaryExpr::createAdd(Res, Four, Context);
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+  if (Triple.getArch() == Triple::x86_64) {
+    PrivateGlobalPrefix = ".L";
+    PrivateLabelPrefix = ".L";
+    PointerSize = 8;
+    WinEHEncodingType = WinEH::EncodingType::Itanium;
+  } else {
+    // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just
+    // a place holder that the Windows EHStreamer looks for to suppress CFI
+    // output. In particular, usesWindowsCFI() returns false.
+    WinEHEncodingType = WinEH::EncodingType::X86;
+  }
+
+  ExceptionsType = ExceptionHandling::WinEH;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  AllowAtInName = true;
+
+  UseIntegratedAssembler = true;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+  assert(Triple.isOSWindows() && "Windows is the only supported COFF target");
+  if (Triple.getArch() == Triple::x86_64) {
+    PrivateGlobalPrefix = ".L";
+    PrivateLabelPrefix = ".L";
+    PointerSize = 8;
+    WinEHEncodingType = WinEH::EncodingType::Itanium;
+    ExceptionsType = ExceptionHandling::WinEH;
+  } else {
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+  }
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  UseIntegratedAssembler = true;
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 000000000000..30d5c802d1ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,61 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+  virtual void anchor();
+
+public:
+  explicit X86MCAsmInfoDarwin(const Triple &Triple);
+};
+
+struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+  explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+  const MCExpr *
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+};
+
+class X86ELFMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit X86ELFMCAsmInfo(const Triple &Triple);
+};
+
+class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+  void anchor() override;
+
+public:
+  explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+};
+
+class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+  void anchor() override;
+
+public:
+  explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 000000000000..8045e7c6d872
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1529 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+  void operator=(const X86MCCodeEmitter &) = delete;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+public:
+  X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+    : MCII(mcii), Ctx(ctx) {
+  }
+
+  ~X86MCCodeEmitter() override {}
+
+  bool is64BitMode(const MCSubtargetInfo &STI) const {
+    return STI.getFeatureBits()[X86::Mode64Bit];
+  }
+
+  bool is32BitMode(const MCSubtargetInfo &STI) const {
+    return STI.getFeatureBits()[X86::Mode32Bit];
+  }
+
+  bool is16BitMode(const MCSubtargetInfo &STI) const {
+    return STI.getFeatureBits()[X86::Mode16Bit];
+  }
+
+  /// Is16BitMemOperand - Return true if the specified instruction has
+  /// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+  bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
+                         const MCSubtargetInfo &STI) const {
+    const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+    const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+    const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+
+    if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
+        Disp.isImm() && Disp.getImm() < 0x10000)
+      return true;
+    if ((BaseReg.getReg() != 0 &&
+         X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+        (IndexReg.getReg() != 0 &&
+         X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+      return true;
+    return false;
+  }
+
+  unsigned GetX86RegNum(const MCOperand &MO) const {
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+  }
+
+  unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
+    return Ctx.getRegisterInfo()->getEncodingValue(
+                                                 MI.getOperand(OpNum).getReg());
+  }
+
+  // Does this register require a bit to be set in REX prefix.
+  bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
+    return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
+  }
+
+  void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+                     unsigned ImmSize, MCFixupKind FixupKind,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int ImmOffset = 0) const;
+
+  inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                  unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+
+  void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
+                        uint64_t TSFlags, bool Rex, unsigned &CurByte,
+                        raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const;
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const MCInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+                                 const MCInst &MI, raw_ostream &OS) const;
+
+  bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const MCInstrDesc &Desc,
+                        const MCSubtargetInfo &STI, raw_ostream &OS) const;
+
+  uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                             int MemOperand, const MCInstrDesc &Desc) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (int8_t)Value;
+}
+
+/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
+/// compressed dispacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+  assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
+         "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+  unsigned CD8_Scale =
+    (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+  if (CD8_Scale == 0) {
+    CValue = Value;
+    return isDisp8(Value);
+  }
+
+  unsigned Mask = CD8_Scale - 1;
+  assert((CD8_Scale & Mask) == 0 && "Invalid memory object size.");
+  if (Value & Mask) // Unaligned offset
+    return false;
+  Value /= (int)CD8_Scale;
+  bool Ret = (Value == (int8_t)Value);
+
+  if (Ret)
+    CValue = Value;
+  return Ret;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+  unsigned Size = X86II::getSizeOfImm(TSFlags);
+  bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+  if (X86II::isImmSigned(TSFlags)) {
+    switch (Size) {
+    default: llvm_unreachable("Unsupported signed fixup size!");
+    case 4: return MCFixupKind(X86::reloc_signed_4byte);
+    }
+  }
+  return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  if (BaseReg.getReg() == X86::EIP) {
+    assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
+    return true;
+  }
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+///  _GLOBAL_OFFSET_TABLE_ and if it is of the form
+///  _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that
+/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+  GOT_None,
+  GOT_Normal,
+  GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+  const MCExpr *RHS = nullptr;
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BE->getLHS();
+    RHS = BE->getRHS();
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    return GOT_None;
+
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+  const MCSymbol &S = Ref->getSymbol();
+  if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+    return GOT_None;
+  if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+    return GOT_SymDiff;
+  return GOT_Normal;
+}
+
+static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+  if (Expr->getKind() == MCExpr::SymbolRef) {
+    const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+    return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
+  }
+  return false;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+              MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+  const MCExpr *Expr = nullptr;
+  if (DispOp.isImm()) {
+    // If this is a simple integer displacement that doesn't require a
+    // relocation, emit it now.
+    if (FixupKind != FK_PCRel_1 &&
+        FixupKind != FK_PCRel_2 &&
+        FixupKind != FK_PCRel_4) {
+      EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+      return;
+    }
+    Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
+  } else {
+    Expr = DispOp.getExpr();
+  }
+
+  // If we have an immoffset, add it to the expression.
+  if ((FixupKind == FK_Data_4 ||
+       FixupKind == FK_Data_8 ||
+       FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+    GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+    if (Kind != GOT_None) {
+      assert(ImmOffset == 0);
+
+      if (Size == 8) {
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+      } else {
+        assert(Size == 4);
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+      }
+
+      if (Kind == GOT_Normal)
+        ImmOffset = CurByte;
+    } else if (Expr->getKind() == MCExpr::SymbolRef) {
+      if (HasSecRelSymbolRef(Expr)) {
+        FixupKind = MCFixupKind(FK_SecRel_4);
+      }
+    } else if (Expr->getKind() == MCExpr::Binary) {
+      const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
+      if (HasSecRelSymbolRef(Bin->getLHS())
+          || HasSecRelSymbolRef(Bin->getRHS())) {
+        FixupKind = MCFixupKind(FK_SecRel_4);
+      }
+    }
+  }
+
+  // If the fixup is pc-relative, we need to bias the value to be relative to
+  // the start of the field, not the end of the field.
+  if (FixupKind == FK_PCRel_4 ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
+    ImmOffset -= 4;
+  if (FixupKind == FK_PCRel_2)
+    ImmOffset -= 2;
+  if (FixupKind == FK_PCRel_1)
+    ImmOffset -= 1;
+
+  if (ImmOffset)
+    Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
+                                   Ctx);
+
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
+  EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
+                                        unsigned RegOpcodeField,
+                                        uint64_t TSFlags, bool Rex,
+                                        unsigned &CurByte, raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+  const MCOperand &Base     = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+  unsigned BaseReg = Base.getReg();
+  bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP ||
+      BaseReg == X86::EIP) {    // [disp32+rIP] in X86-64 mode
+    assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
+    assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+    EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+    unsigned Opcode = MI.getOpcode();
+    // movq loads are handled with a special relocation form which allows the
+    // linker to eliminate some loads for GOT references which end up in the
+    // same linkage unit.
+    unsigned FixupKind = [=]() {
+      switch (Opcode) {
+      default:
+        return X86::reloc_riprel_4byte;
+      case X86::MOV64rm:
+        assert(Rex);
+        return X86::reloc_riprel_4byte_movq_load;
+      case X86::CALL64m:
+      case X86::JMP64m:
+      case X86::TEST64rm:
+      case X86::ADC64rm:
+      case X86::ADD64rm:
+      case X86::AND64rm:
+      case X86::CMP64rm:
+      case X86::OR64rm:
+      case X86::SBB64rm:
+      case X86::SUB64rm:
+      case X86::XOR64rm:
+        return Rex ? X86::reloc_riprel_4byte_relax_rex
+                   : X86::reloc_riprel_4byte_relax;
+      }
+    }();
+
+    // rip-relative addressing is actually relative to the *next* instruction.
+    // Since an immediate can follow the mod/rm byte for an instruction, this
+    // means that we need to bias the immediate field of the instruction with
+    // the size of the immediate field.  If we have this case, add it into the
+    // expression to emit.
+    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+                  CurByte, OS, Fixups, -ImmSize);
+    return;
+  }
+
+  unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+  // 16-bit addressing forms of the ModR/M byte have a different encoding for
+  // the R/M field and are far more limited in which registers can be used.
+  if (Is16BitMemOperand(MI, Op, STI)) {
+    if (BaseReg) {
+      // For 32-bit addressing, the row and column values in Table 2-2 are
+      // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
+      // some special cases. And GetX86RegNum reflects that numbering.
+      // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
+      // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
+      // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
+      // while values 0-3 indicate the allowed combinations (base+index) of
+      // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
+      //
+      // R16Table[] is a lookup from the normal RegNo, to the row values from
+      // Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
+      static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+      unsigned RMfield = R16Table[BaseRegNo];
+
+      assert(RMfield && "invalid 16-bit base register");
+
+      if (IndexReg.getReg()) {
+        unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+
+        assert(IndexReg16 && "invalid 16-bit index register");
+        // We must have one of SI/DI (4,5), and one of BP/BX (6,7).
+        assert(((IndexReg16 ^ RMfield) & 2) &&
+               "invalid 16-bit base/index register combination");
+        assert(Scale.getImm() == 1 &&
+               "invalid scale for 16-bit memory reference");
+
+        // Allow base/index to appear in either order (although GAS doesn't).
+        if (IndexReg16 & 2)
+          RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1);
+        else
+          RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
+      }
+
+      if (Disp.isImm() && isDisp8(Disp.getImm())) {
+        if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+          // There is no displacement; just the register.
+          EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+          return;
+        }
+        // Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
+        EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+        return;
+      }
+      // This is the [REG]+disp16 case.
+      EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+    } else {
+      // There is no BaseReg; this is the plain [disp16] case.
+      EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+    }
+
+    // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
+    EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+    return;
+  }
+
+  // Determine whether a SIB byte is needed.
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!is64BitMode(STI) || BaseReg != 0)) {
+
+    if (BaseReg == 0) {          // [disp32]     in X86-32 mode
+      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+      EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (Disp.isImm()) {
+      if (!HasEVEX && isDisp8(Disp.getImm())) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+        return;
+      }
+      // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+      // 32-bit displacement.
+      int CDisp8 = 0;
+      if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+                      CDisp8 - Disp.getImm());
+        return;
+      }
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+    unsigned Opcode = MI.getOpcode();
+    unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
+                                                : X86::reloc_signed_4byte;
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+                  Fixups);
+    return;
+  }
+
+  // We need a SIB byte, so start by outputting the ModR/M byte first
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  int CDisp8 = 0;
+  int ImmOffset = 0;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (!Disp.isImm()) {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (Disp.getImm() == 0 &&
+             // Base reg can't be anything that ends up with '5' as the base
+             // reg, it is the magic [*] nomenclature that indicates no base.
+             BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+  } else if (!HasEVEX && isDisp8(Disp.getImm())) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+    ImmOffset = CDisp8 - Disp.getImm();
+  } else {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+  } else {
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8)
+    EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
+  else if (ForceDisp32 || Disp.getImm() != 0)
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+                  CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                           int MemOperand, const MCInst &MI,
+                                           const MCInstrDesc &Desc,
+                                           raw_ostream &OS) const {
+  assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
+
+  uint64_t Encoding = TSFlags & X86II::EncodingMask;
+  bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+  bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+  bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  uint8_t VEX_R = 0x1;
+  uint8_t EVEX_R2 = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  uint8_t VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  uint8_t VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  uint8_t VEX_W = (TSFlags & X86II::VEX_W) ? 1 : 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
+  uint8_t VEX_5M;
+  switch (TSFlags & X86II::OpMapMask) {
+  default: llvm_unreachable("Invalid prefix!");
+  case X86II::TB:   VEX_5M = 0x1; break; // 0F
+  case X86II::T8:   VEX_5M = 0x2; break; // 0F 38
+  case X86II::TA:   VEX_5M = 0x3; break; // 0F 3A
+  case X86II::XOP8: VEX_5M = 0x8; break;
+  case X86II::XOP9: VEX_5M = 0x9; break;
+  case X86II::XOPA: VEX_5M = 0xA; break;
+  }
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  uint8_t VEX_4V = 0xf;
+  uint8_t EVEX_V2 = 0x1;
+
+  // EVEX_L2/VEX_L (Vector Length):
+  //
+  // L2 L
+  //  0 0: scalar or 128-bit vector
+  //  0 1: 256-bit vector
+  //  1 0: 512-bit vector
+  //
+  uint8_t VEX_L = (TSFlags & X86II::VEX_L) ? 1 : 0;
+  uint8_t EVEX_L2 = (TSFlags & X86II::EVEX_L2) ? 1 : 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  uint8_t VEX_PP;
+  switch (TSFlags & X86II::OpPrefixMask) {
+  default: llvm_unreachable("Invalid op prefix!");
+  case X86II::PS: VEX_PP = 0x0; break; // none
+  case X86II::PD: VEX_PP = 0x1; break; // 66
+  case X86II::XS: VEX_PP = 0x2; break; // F3
+  case X86II::XD: VEX_PP = 0x3; break; // F2
+  }
+
+  // EVEX_U
+  uint8_t EVEX_U = 1; // Always '1' so far
+
+  // EVEX_z
+  uint8_t EVEX_z = (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) ? 1 : 0;
+
+  // EVEX_b
+  uint8_t EVEX_b = (TSFlags & X86II::EVEX_B) ? 1 : 0;
+
+  // EVEX_rc
+  uint8_t EVEX_rc = 0;
+
+  // EVEX_aaa
+  uint8_t EVEX_aaa = 0;
+
+  bool EncodeRC = false;
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = X86II::getOperandBias(Desc);
+
+  switch (TSFlags & X86II::FormMask) {
+  default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+  case X86II::RawFrm:
+    break;
+  case X86II::MRMDestMem: {
+    // MRMDestMem instructions forms:
+    //  MemAddr, src1(ModR/M)
+    //  MemAddr, src1(VEX_4V), src2(ModR/M)
+    //  MemAddr, src1(ModR/M), imm8
+    //
+    unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+    VEX_B = ~(BaseRegEnc >> 3) & 1;
+    unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+    VEX_X = ~(IndexRegEnc >> 3) & 1;
+    if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
+      EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+    CurOp += X86::AddrNumOperands;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+    EVEX_R2 = ~(RegEnc >> 4) & 1;
+    break;
+  }
+  case X86II::MRMSrcMem: {
+    // MRMSrcMem instructions forms:
+    //  src1(ModR/M), MemAddr
+    //  src1(ModR/M), src2(VEX_4V), MemAddr
+    //  src1(ModR/M), MemAddr, imm8
+    //  src1(ModR/M), MemAddr, src2(Imm[7:4])
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+    EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+
+    unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+    VEX_B = ~(BaseRegEnc >> 3) & 1;
+    unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+    VEX_X = ~(IndexRegEnc >> 3) & 1;
+    if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
+      EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
+    break;
+  }
+  case X86II::MRMSrcMem4VOp3: {
+    // Instruction format for 4VOp3:
+    //   src1(ModR/M), MemAddr, src3(VEX_4V)
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+
+    unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+    VEX_B = ~(BaseRegEnc >> 3) & 1;
+    unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+    VEX_X = ~(IndexRegEnc >> 3) & 1;
+
+    VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
+    break;
+  }
+  case X86II::MRMSrcMemOp4: {
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+
+    unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_4V = ~VRegEnc & 0xf;
+
+    unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+    VEX_B = ~(BaseRegEnc >> 3) & 1;
+    unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+    VEX_X = ~(IndexRegEnc >> 3) & 1;
+    break;
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    // MRM[0-9]m instructions forms:
+    //  MemAddr
+    //  src1(VEX_4V), MemAddr
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
+    VEX_B = ~(BaseRegEnc >> 3) & 1;
+    unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+    VEX_X = ~(IndexRegEnc >> 3) & 1;
+    break;
+  }
+  case X86II::MRMSrcReg: {
+    // MRMSrcReg instructions forms:
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
+    //  dst(ModR/M), src1(ModR/M)
+    //  dst(ModR/M), src1(ModR/M), imm8
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+    EVEX_R2 = ~(RegEnc >> 4) & 1;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+
+    RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_B = ~(RegEnc >> 3) & 1;
+    VEX_X = ~(RegEnc >> 4) & 1;
+
+    if (EVEX_b) {
+      if (HasEVEX_RC) {
+        unsigned RcOperand = NumOps-1;
+        assert(RcOperand >= CurOp);
+        EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
+      }
+      EncodeRC = true;
+    }
+    break;
+  }
+  case X86II::MRMSrcReg4VOp3: {
+    // Instruction format for 4VOp3:
+    //   src1(ModR/M), src2(ModR/M), src3(VEX_4V)
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+
+    RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_B = ~(RegEnc >> 3) & 1;
+
+    VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
+    break;
+  }
+  case X86II::MRMSrcRegOp4: {
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+
+    unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_4V = ~VRegEnc & 0xf;
+
+    // Skip second register source (encoded in Imm[7:4])
+    ++CurOp;
+
+    RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_B = ~(RegEnc >> 3) & 1;
+    VEX_X = ~(RegEnc >> 4) & 1;
+    break;
+  }
+  case X86II::MRMDestReg: {
+    // MRMDestReg instructions forms:
+    //  dst(ModR/M), src(ModR/M)
+    //  dst(ModR/M), src(ModR/M), imm8
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_B = ~(RegEnc >> 3) & 1;
+    VEX_X = ~(RegEnc >> 4) & 1;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+
+    RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_R = ~(RegEnc >> 3) & 1;
+    EVEX_R2 = ~(RegEnc >> 4) & 1;
+    if (EVEX_b)
+      EncodeRC = true;
+    break;
+  }
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    // MRM0r-MRM7r instructions forms:
+    //  dst(VEX_4V), src(ModR/M), imm8
+    if (HasVEX_4V) {
+      unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
+      VEX_4V = ~VRegEnc & 0xf;
+      EVEX_V2 = ~(VRegEnc >> 4) & 1;
+    }
+    if (HasEVEX_K)
+      EVEX_aaa = getX86RegEncoding(MI, CurOp++);
+
+    unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+    VEX_B = ~(RegEnc >> 3) & 1;
+    VEX_X = ~(RegEnc >> 4) & 1;
+    break;
+  }
+  }
+
+  if (Encoding == X86II::VEX || Encoding == X86II::XOP) {
+    // VEX opcode prefix can have 2 or 3 bytes
+    //
+    //  3 bytes:
+    //    +-----+ +--------------+ +-------------------+
+    //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+    //    +-----+ +--------------+ +-------------------+
+    //  2 bytes:
+    //    +-----+ +-------------------+
+    //    | C5h | | R | vvvv | L | pp |
+    //    +-----+ +-------------------+
+    //
+    //  XOP uses a similar prefix:
+    //    +-----+ +--------------+ +-------------------+
+    //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
+    //    +-----+ +--------------+ +-------------------+
+    uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+    // Can we use the 2 byte VEX prefix?
+    if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+      EmitByte(0xC5, CurByte, OS);
+      EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+      return;
+    }
+
+    // 3 byte VEX prefix
+    EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
+    EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+    EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+  } else {
+    assert(Encoding == X86II::EVEX && "unknown encoding!");
+    // EVEX opcode prefix can have 4 bytes
+    //
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    assert((VEX_5M & 0x3) == VEX_5M
+           && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+    EmitByte(0x62, CurByte, OS);
+    EmitByte((VEX_R   << 7) |
+             (VEX_X   << 6) |
+             (VEX_B   << 5) |
+             (EVEX_R2 << 4) |
+             VEX_5M, CurByte, OS);
+    EmitByte((VEX_W   << 7) |
+             (VEX_4V  << 3) |
+             (EVEX_U  << 2) |
+             VEX_PP, CurByte, OS);
+    if (EncodeRC)
+      EmitByte((EVEX_z  << 7) |
+               (EVEX_rc << 5) |
+               (EVEX_b  << 4) |
+               (EVEX_V2 << 3) |
+               EVEX_aaa, CurByte, OS);
+    else
+      EmitByte((EVEX_z  << 7) |
+               (EVEX_L2 << 6) |
+               (VEX_L   << 5) |
+               (EVEX_b  << 4) |
+               (EVEX_V2 << 3) |
+               EVEX_aaa, CurByte, OS);
+  }
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                                             int MemOperand,
+                                             const MCInstrDesc &Desc) const {
+  uint8_t REX = 0;
+  bool UsesHighByteReg = false;
+
+  if (TSFlags & X86II::REX_W)
+    REX |= 1 << 3; // set REX.W
+
+  if (MI.getNumOperands() == 0) return REX;
+
+  unsigned NumOps = MI.getNumOperands();
+  unsigned CurOp = X86II::getOperandBias(Desc);
+
+  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+  for (unsigned i = CurOp; i != NumOps; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+      UsesHighByteReg = true;
+    if (X86II::isX86_64NonExtLowByteReg(Reg))
+      // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+      // that returns non-zero.
+      REX |= 0x40; // REX fixed encoding prefix
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::AddRegFrm:
+    REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+    break;
+  case X86II::MRMSrcReg:
+    REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+    REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+    break;
+  case X86II::MRMSrcMem: {
+    REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+    CurOp += X86::AddrNumOperands;
+    break;
+  }
+  case X86II::MRMDestReg:
+    REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+    REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+    break;
+  case X86II::MRMDestMem:
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+    CurOp += X86::AddrNumOperands;
+    REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+    break;
+  case X86II::MRMXm:
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
+    REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+    break;
+  case X86II::MRMXr:
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+    break;
+  }
+  if (REX && UsesHighByteReg)
+    report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");
+
+  return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+                                                 unsigned SegOperand,
+                                                 const MCInst &MI,
+                                                 raw_ostream &OS) const {
+  // Check for explicit segment override on memory operand.
+  switch (MI.getOperand(SegOperand).getReg()) {
+  default: llvm_unreachable("Unknown segment register!");
+  case 0: break;
+  case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+  case X86::SS: EmitByte(0x36, CurByte, OS); break;
+  case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+  case X86::ES: EmitByte(0x26, CurByte, OS); break;
+  case X86::FS: EmitByte(0x64, CurByte, OS); break;
+  case X86::GS: EmitByte(0x65, CurByte, OS); break;
+  }
+}
+
+/// Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present.  If
+/// Not present, it is -1.
+///
+/// Returns true if a REX prefix was used.
+bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                        int MemOperand, const MCInst &MI,
+                                        const MCInstrDesc &Desc,
+                                        const MCSubtargetInfo &STI,
+                                        raw_ostream &OS) const {
+  bool Ret = false;
+  // Emit the operand size opcode prefix as needed.
+  if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
+                                                         : X86II::OpSize16))
+    EmitByte(0x66, CurByte, OS);
+
+  // Emit the LOCK opcode prefix.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+
+  switch (TSFlags & X86II::OpPrefixMask) {
+  case X86II::PD:   // 66
+    EmitByte(0x66, CurByte, OS);
+    break;
+  case X86II::XS:   // F3
+    EmitByte(0xF3, CurByte, OS);
+    break;
+  case X86II::XD:   // F2
+    EmitByte(0xF2, CurByte, OS);
+    break;
+  }
+
+  // Handle REX prefix.
+  // FIXME: Can this come before F2 etc to simplify emission?
+  if (is64BitMode(STI)) {
+    if (uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
+      EmitByte(0x40 | REX, CurByte, OS);
+      Ret = true;
+    }
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  switch (TSFlags & X86II::OpMapMask) {
+  case X86II::TB:  // Two-byte opcode map
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+    EmitByte(0x0F, CurByte, OS);
+    break;
+  }
+
+  switch (TSFlags & X86II::OpMapMask) {
+  case X86II::T8:    // 0F 38
+    EmitByte(0x38, CurByte, OS);
+    break;
+  case X86II::TA:    // 0F 3A
+    EmitByte(0x3A, CurByte, OS);
+    break;
+  }
+  return Ret;
+}
+
+void X86MCCodeEmitter::
+encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // Pseudo instructions don't get encoded.
+  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return;
+
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = X86II::getOperandBias(Desc);
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Encoding type for this instruction.
+  uint64_t Encoding = TSFlags & X86II::EncodingMask;
+
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+  bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg;
+
+  // It uses the EVEX.aaa field?
+  bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+  bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
+  // Used if a register is encoded in 7:4 of immediate.
+  unsigned I8RegNum = 0;
+
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  // Emit segment override opcode prefix as needed.
+  if (MemoryOperand >= 0)
+    EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
+                              MI, OS);
+
+  // Emit the repeat opcode prefix as needed.
+  if (TSFlags & X86II::REP)
+    EmitByte(0xF3, CurByte, OS);
+
+  // Emit the address size opcode prefix as needed.
+  bool need_address_override;
+  uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+  if ((is16BitMode(STI) && AdSize == X86II::AdSize32) ||
+      (is32BitMode(STI) && AdSize == X86II::AdSize16) ||
+      (is64BitMode(STI) && AdSize == X86II::AdSize32)) {
+    need_address_override = true;
+  } else if (MemoryOperand < 0) {
+    need_address_override = false;
+  } else if (is64BitMode(STI)) {
+    assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
+    need_address_override = Is32BitMemOperand(MI, MemoryOperand);
+  } else if (is32BitMode(STI)) {
+    assert(!Is64BitMemOperand(MI, MemoryOperand));
+    need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
+  } else {
+    assert(is16BitMode(STI));
+    assert(!Is64BitMemOperand(MI, MemoryOperand));
+    need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
+  }
+
+  if (need_address_override)
+    EmitByte(0x67, CurByte, OS);
+
+  bool Rex = false;
+  if (Encoding == 0)
+    Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+  else
+    EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+  uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+  if (TSFlags & X86II::Has3DNow0F0FOpcode)
+    BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
+
+  uint64_t Form = TSFlags & X86II::FormMask;
+  switch (Form) {
+  default: errs() << "FORM: " << Form << "\n";
+    llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+  case X86II::Pseudo:
+    llvm_unreachable("Pseudo instruction shouldn't be emitted");
+  case X86II::RawFrmDstSrc: {
+    unsigned siReg = MI.getOperand(1).getReg();
+    assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+            (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+            (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+           "SI and DI register sizes do not match");
+    // Emit segment override opcode prefix as needed (not for %ds).
+    if (MI.getOperand(2).getReg() != X86::DS)
+      EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+        (is32BitMode(STI) && siReg == X86::SI))
+      EmitByte(0x67, CurByte, OS);
+    CurOp += 3; // Consume operands.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrmSrc: {
+    unsigned siReg = MI.getOperand(0).getReg();
+    // Emit segment override opcode prefix as needed (not for %ds).
+    if (MI.getOperand(1).getReg() != X86::DS)
+      EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::ESI) ||
+        (is32BitMode(STI) && siReg == X86::SI))
+      EmitByte(0x67, CurByte, OS);
+    CurOp += 2; // Consume operands.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrmDst: {
+    unsigned siReg = MI.getOperand(0).getReg();
+    // Emit AdSize prefix as needed.
+    if ((!is32BitMode(STI) && siReg == X86::EDI) ||
+        (is32BitMode(STI) && siReg == X86::DI))
+      EmitByte(0x67, CurByte, OS);
+    ++CurOp; // Consume operand.
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  }
+  case X86II::RawFrm:
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  case X86II::RawFrmMemOffs:
+    // Emit segment override opcode prefix as needed.
+    EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    ++CurOp; // skip segment operand
+    break;
+  case X86II::RawFrmImm8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+                  OS, Fixups);
+    break;
+  case X86II::RawFrmImm16:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+                  OS, Fixups);
+    break;
+
+  case X86II::AddRegFrm:
+    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+    break;
+
+  case X86II::MRMDestReg: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    unsigned SrcRegNum = CurOp + 1;
+
+    if (HasEVEX_K) // Skip writemask
+      ++SrcRegNum;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(CurOp),
+                     GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+  case X86II::MRMDestMem: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+
+    if (HasEVEX_K) // Skip writemask
+      ++SrcRegNum;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
+                     Rex, CurByte, OS, Fixups, STI);
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+  case X86II::MRMSrcReg: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    unsigned SrcRegNum = CurOp + 1;
+
+    if (HasEVEX_K) // Skip writemask
+      ++SrcRegNum;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    if (HasVEX_I8Reg)
+      I8RegNum = getX86RegEncoding(MI, CurOp++);
+    // do not count the rounding control operand
+    if (HasEVEX_RC)
+      --NumOps;
+    break;
+  }
+  case X86II::MRMSrcReg4VOp3: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    unsigned SrcRegNum = CurOp + 1;
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    ++CurOp; // Encoded in VEX.VVVV
+    break;
+  }
+  case X86II::MRMSrcRegOp4: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    unsigned SrcRegNum = CurOp + 1;
+
+    // Skip 1st src (which is encoded in VEX_VVVV)
+    ++SrcRegNum;
+
+    // Capture 2nd src (which is encoded in Imm[7:4])
+    assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
+    I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+  case X86II::MRMSrcMem: {
+    unsigned FirstMemOp = CurOp+1;
+
+    if (HasEVEX_K) // Skip writemask
+      ++FirstMemOp;
+
+    if (HasVEX_4V)
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, Rex, CurByte, OS, Fixups, STI);
+    CurOp = FirstMemOp + X86::AddrNumOperands;
+    if (HasVEX_I8Reg)
+      I8RegNum = getX86RegEncoding(MI, CurOp++);
+    break;
+  }
+  case X86II::MRMSrcMem4VOp3: {
+    unsigned FirstMemOp = CurOp+1;
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, Rex, CurByte, OS, Fixups, STI);
+    CurOp = FirstMemOp + X86::AddrNumOperands;
+    ++CurOp; // Encoded in VEX.VVVV.
+    break;
+  }
+  case X86II::MRMSrcMemOp4: {
+    unsigned FirstMemOp = CurOp+1;
+
+    ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+
+    // Capture second register source (encoded in Imm[7:4])
+    assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
+    I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, Rex, CurByte, OS, Fixups, STI);
+    CurOp = FirstMemOp + X86::AddrNumOperands;
+    break;
+  }
+
+  case X86II::MRMXr:
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp++),
+                     (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
+                     CurByte, OS);
+    break;
+  }
+
+  case X86II::MRMXm:
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    emitMemModRMByte(MI, CurOp,
+                     (Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
+                     Rex, CurByte, OS, Fixups, STI);
+    CurOp += X86::AddrNumOperands;
+    break;
+  }
+  case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
+  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
+  case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
+  case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
+  case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
+  case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+  case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
+  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
+  case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+  case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+  case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
+  case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
+  case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
+  case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
+  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
+  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
+  case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
+  case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
+  case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
+  case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
+  case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+  case X86II::MRM_FF:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+    break;
+  }
+
+  if (HasVEX_I8Reg) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte.
+    assert(I8RegNum < 16 && "Register encoding out of range");
+    I8RegNum <<= 4;
+    if (CurOp != NumOps) {
+      unsigned Val = MI.getOperand(CurOp++).getImm();
+      assert(Val < 16 && "Immediate operand value out of range");
+      I8RegNum |= Val;
+    }
+    EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
+                  CurByte, OS, Fixups);
+  } else {
+    // If there is a remaining operand, it must be a trailing immediate. Emit it
+    // according to the right size for the instruction. Some instructions
+    // (SSE4a extrq and insertq) have two trailing immediates.
+    while (CurOp != NumOps && NumOps - CurOp <= 2) {
+      EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                    X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                    CurByte, OS, Fixups);
+    }
+  }
+
+  if (TSFlags & X86II::Has3DNow0F0FOpcode)
+    EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 000000000000..22cb0fac33cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,456 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+std::string X86_MC::ParseX86Triple(const Triple &TT) {
+  std::string FS;
+  if (TT.getArch() == Triple::x86_64)
+    FS = "+64bit-mode,-32bit-mode,-16bit-mode";
+  else if (TT.getEnvironment() != Triple::CODE16)
+    FS = "-64bit-mode,+32bit-mode,-16bit-mode";
+  else
+    FS = "-64bit-mode,-32bit-mode,+16bit-mode";
+
+  return FS;
+}
+
+unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
+  if (TT.getArch() == Triple::x86_64)
+    return DWARFFlavour::X86_64;
+
+  if (TT.isOSDarwin())
+    return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+  if (TT.isOSCygMing())
+    // Unsupported by now, just quick fallback
+    return DWARFFlavour::X86_32_Generic;
+  return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
+  // FIXME: TableGen these.
+  for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+    unsigned SEH = MRI->getEncodingValue(Reg);
+    MRI->mapLLVMRegToSEHReg(Reg, SEH);
+  }
+
+  // These CodeView registers are numbered sequentially starting at value 1.
+  static const MCPhysReg LowCVRegs[] = {
+      X86::AL,  X86::CL,  X86::DL,  X86::BL,  X86::AH,  X86::CH,
+      X86::DH,  X86::BH,  X86::AX,  X86::CX,  X86::DX,  X86::BX,
+      X86::SP,  X86::BP,  X86::SI,  X86::DI,  X86::EAX, X86::ECX,
+      X86::EDX, X86::EBX, X86::ESP, X86::EBP, X86::ESI, X86::EDI,
+  };
+  unsigned CVLowRegStart = 1;
+  for (unsigned I = 0; I < array_lengthof(LowCVRegs); ++I)
+    MRI->mapLLVMRegToCVReg(LowCVRegs[I], I + CVLowRegStart);
+
+  MRI->mapLLVMRegToCVReg(X86::EFLAGS, 34);
+
+  // The x87 registers start at 128 and are numbered sequentially.
+  unsigned FP0Start = 128;
+  for (unsigned I = 0; I < 8; ++I)
+    MRI->mapLLVMRegToCVReg(X86::FP0 + I, FP0Start + I);
+
+  // The low 8 XMM registers start at 154 and are numbered sequentially.
+  unsigned CVXMM0Start = 154;
+  for (unsigned I = 0; I < 8; ++I)
+    MRI->mapLLVMRegToCVReg(X86::XMM0 + I, CVXMM0Start + I);
+
+  // The high 8 XMM registers start at 252 and are numbered sequentially.
+  unsigned CVXMM8Start = 252;
+  for (unsigned I = 0; I < 8; ++I)
+    MRI->mapLLVMRegToCVReg(X86::XMM8 + I, CVXMM8Start + I);
+
+  // FIXME: XMM16 and above from AVX512 not yet documented.
+
+  // AMD64 registers start at 324 and count up.
+  unsigned CVX64RegStart = 324;
+  static const MCPhysReg CVX64Regs[] = {
+      X86::SIL,   X86::DIL,   X86::BPL,   X86::SPL,   X86::RAX,   X86::RBX,
+      X86::RCX,   X86::RDX,   X86::RSI,   X86::RDI,   X86::RBP,   X86::RSP,
+      X86::R8,    X86::R9,    X86::R10,   X86::R11,   X86::R12,   X86::R13,
+      X86::R14,   X86::R15,   X86::R8B,   X86::R9B,   X86::R10B,  X86::R11B,
+      X86::R12B,  X86::R13B,  X86::R14B,  X86::R15B,  X86::R8W,   X86::R9W,
+      X86::R10W,  X86::R11W,  X86::R12W,  X86::R13W,  X86::R14W,  X86::R15W,
+      X86::R8D,   X86::R9D,   X86::R10D,  X86::R11D,  X86::R12D,  X86::R13D,
+      X86::R14D,  X86::R15D,  X86::YMM0,  X86::YMM1,  X86::YMM2,  X86::YMM3,
+      X86::YMM4,  X86::YMM5,  X86::YMM6,  X86::YMM7,  X86::YMM8,  X86::YMM9,
+      X86::YMM10, X86::YMM11, X86::YMM12, X86::YMM13, X86::YMM14, X86::YMM15,
+  };
+  for (unsigned I = 0; I < array_lengthof(CVX64Regs); ++I)
+    MRI->mapLLVMRegToCVReg(CVX64Regs[I], CVX64RegStart + I);
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
+                                                  StringRef CPU, StringRef FS) {
+  std::string ArchFS = X86_MC::ParseX86Triple(TT);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = (Twine(ArchFS) + "," + FS).str();
+    else
+      ArchFS = FS;
+  }
+
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS);
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitX86MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
+  unsigned RA = (TT.getArch() == Triple::x86_64)
+                    ? X86::RIP  // Should have dwarf #16.
+                    : X86::EIP; // Should have dwarf #8.
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false),
+                        X86_MC::getDwarfRegFlavour(TT, true), RA);
+  X86_MC::initLLVMToSEHAndCVRegMapping(X);
+  return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
+  bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSBinFormatMachO()) {
+    if (is64Bit)
+      MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+    else
+      MAI = new X86MCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.isOSBinFormatELF()) {
+    // Force the use of an ELF container.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  } else if (TheTriple.isWindowsMSVCEnvironment() ||
+             TheTriple.isWindowsCoreCLREnvironment()) {
+    MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+  } else if (TheTriple.isOSCygMing() ||
+             TheTriple.isWindowsItaniumEnvironment()) {
+    MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+  } else {
+    // The default is ELF.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  }
+
+  // Initialize initial frame state.
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth = is64Bit ? -8 : -4;
+
+  // Initial state of the frame pointer is esp+stackGrowth.
+  unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+      nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+  MAI->addInitialFrameState(Inst);
+
+  // Add return address to move list
+  unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+  MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+      nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+  MAI->addInitialFrameState(Inst2);
+
+  return MAI;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  bool is64Bit = TT.getArch() == Triple::x86_64;
+
+  // For static codegen, if we're not already set, use Small codegen.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    // 64-bit JIT places everything in the same buffer except external funcs.
+    CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI) {
+  if (SyntaxVariant == 0)
+    return new X86ATTInstPrinter(MAI, MII, MRI);
+  if (SyntaxVariant == 1)
+    return new X86IntelInstPrinter(MAI, MII, MRI);
+  return nullptr;
+}
+
+static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple,
+                                                   MCContext &Ctx) {
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TheTriple, Ctx);
+}
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+  return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+  for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
+
+    // Register the MC codegen info.
+    RegisterMCAdjustCodeGenOptsFn Y(*T, adjustCodeGenOpts);
+
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo);
+
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo);
+
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T,
+                                            X86_MC::createX86MCSubtargetInfo);
+
+    // Register the MC instruction analyzer.
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis);
+
+    // Register the code emitter.
+    TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter);
+
+    // Register the object streamer.
+    TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer);
+
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter);
+
+    // Register the MC relocation info.
+    TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo);
+  }
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(getTheX86_32Target(),
+                                       createX86_32AsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(getTheX86_64Target(),
+                                       createX86_64AsmBackend);
+}
+
+unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
+                                            bool High) {
+  switch (Size) {
+  default: return 0;
+  case 8:
+    if (High) {
+      switch (Reg) {
+      default: return getX86SubSuperRegisterOrZero(Reg, 64);
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SI;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DI;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BP;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SP;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AH;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DH;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CH;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BH;
+      }
+    } else {
+      switch (Reg) {
+      default: return 0;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AL;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DL;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CL;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BL;
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SIL;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DIL;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BPL;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SPL;
+      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+        return X86::R8B;
+      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+        return X86::R9B;
+      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+        return X86::R10B;
+      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+        return X86::R11B;
+      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+        return X86::R12B;
+      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+        return X86::R13B;
+      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+        return X86::R14B;
+      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+        return X86::R15B;
+      }
+    }
+  case 16:
+    switch (Reg) {
+    default: return 0;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::AX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::DX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::CX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::BX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::SI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::DI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::BP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::SP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8W;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9W;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10W;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11W;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12W;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13W;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14W;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15W;
+    }
+  case 32:
+    switch (Reg) {
+    default: return 0;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::EAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::EDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::ECX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::EBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::ESI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::EDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::EBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::ESP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8D;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9D;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10D;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11D;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12D;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13D;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14D;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15D;
+    }
+  case 64:
+    switch (Reg) {
+    default: return 0;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::RAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::RDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::RCX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::RBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::RSI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::RDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::RBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::RSP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15;
+    }
+  }
+}
+
+unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
+  unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+  assert(Res != 0 && "Unexpected register or VT");
+  return Res;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 000000000000..f73e734b9b0e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,127 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class MCRelocationInfo;
+class MCTargetOptions;
+class Target;
+class Triple;
+class StringRef;
+class raw_ostream;
+class raw_pwrite_stream;
+
+Target &getTheX86_32Target();
+Target &getTheX86_64Target();
+
+/// Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+  };
+}
+
+///  Native X86 register numbers
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
+namespace X86_MC {
+std::string ParseX86Triple(const Triple &TT);
+
+unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
+
+void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
+
+/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
+/// do not need to go through TargetRegistry.
+MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
+                                          StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     const Triple &TT, StringRef CPU,
+                                     const MCTargetOptions &Options);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     const Triple &TT, StringRef CPU,
+                                     const MCTargetOptions &Options);
+
+/// Construct an X86 Windows COFF machine code streamer which will generate
+/// PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                     raw_pwrite_stream &OS, MCCodeEmitter *CE,
+                                     bool RelaxAll, bool IncrementalLinkerCompatible);
+
+/// Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
+                                         uint8_t OSABI, uint16_t EMachine);
+/// Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                             bool Is64Bit);
+
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
+/// Aborts on error.
+unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error.
+unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
+                                      bool High = false);
+
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 000000000000..297926ddcfda
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,610 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+  bool recordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 unsigned Log2Size,
+                                 uint64_t &FixedValue);
+  void recordTLVPRelocation(MachObjectWriter *Writer,
+                            const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment,
+                            const MCFixup &Fixup,
+                            MCValue Target,
+                            uint64_t &FixedValue);
+
+  void RecordX86Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+  void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment, const MCFixup &Fixup,
+                              MCValue Target, uint64_t &FixedValue);
+
+public:
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) override {
+    if (Writer->is64Bit())
+      RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                             FixedValue);
+    else
+      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+  return Kind == X86::reloc_riprel_4byte ||
+         Kind == X86::reloc_riprel_4byte_movq_load ||
+         Kind == X86::reloc_riprel_4byte_relax ||
+         Kind == X86::reloc_riprel_4byte_relax_rex;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+    // FIXME: Remove these!!!
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_relax:
+  case X86::reloc_riprel_4byte_relax_rex:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_signed_4byte_relax:
+  case FK_Data_4: return 2;
+  case FK_Data_8: return 3;
+  }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(
+    MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset =
+    Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  uint32_t FixupAddress =
+    Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+  const MCSymbol *RelSymbol = nullptr;
+
+  Value = Target.getConstant();
+
+  if (IsPCRel) {
+    // Compensate for the relocation offset, Darwin x86_64 relocations only have
+    // the addend and appear to have attempted to define it to be the actual
+    // expression addend without the PCrel bias. However, instructions with data
+    // following the relocation are not accommodated for (see comment below
+    // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+    Value += 1LL << Log2Size;
+  }
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    // FIXME: I believe this is broken, I don't think the linker can understand
+    // it. I think it would require a local relocation, but I'm not sure if that
+    // would work either. The official way to get an absolute PCrel relocation
+    // is to use an absolute symbol (which we don't support yet).
+    if (IsPCRel) {
+      IsExtern = 1;
+      Type = MachO::X86_64_RELOC_BRANCH;
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    if (A->isTemporary())
+      A = &Writer->findAliasedSymbol(*A);
+    const MCSymbol *A_Base = Asm.getAtom(*A);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    if (B->isTemporary())
+      B = &Writer->findAliasedSymbol(*B);
+    const MCSymbol *B_Base = Asm.getAtom(*B);
+
+    // Neither symbol can be modified.
+    if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+        Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+      Asm.getContext().reportError(Fixup.getLoc(),
+                                   "unsupported relocation of modified symbol");
+      return;
+    }
+
+    // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+    // implement most of these correctly.
+    if (IsPCRel) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(), "unsupported pc-relative relocation of difference");
+      return;
+    }
+
+    // The support for the situation where one or both of the symbols would
+    // require a local relocation is handled just like if the symbols were
+    // external.  This is certainly used in the case of debug sections where the
+    // section has only temporary symbols and thus the symbols don't have base
+    // symbols.  This is encoded using the section ordinal and non-extern
+    // relocation entries.
+
+    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+    // single SIGNED relocation); reject it for now.  Except the case where both
+    // symbols don't have a base, equal but both NULL.
+    if (A_Base == B_Base && A_Base) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(), "unsupported relocation with identical base");
+      return;
+    }
+
+    // A subtraction expression where either symbol is undefined is a
+    // non-relocatable expression.
+    if (A->isUndefined() || B->isUndefined()) {
+      StringRef Name = A->isUndefined() ? A->getName() : B->getName();
+      Asm.getContext().reportError(Fixup.getLoc(),
+        "unsupported relocation with subtraction expression, symbol '" +
+        Name + "' can not be undefined in a subtraction expression");
+      return;
+    }
+
+    Value += Writer->getSymbolAddress(*A, Layout) -
+             (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout));
+    Value -= Writer->getSymbolAddress(*B, Layout) -
+             (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout));
+
+    if (!A_Base)
+      Index = A->getFragment()->getParent()->getOrdinal() + 1;
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 =
+        (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+    Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
+
+    if (B_Base)
+      RelSymbol = B_Base;
+    else
+      Index = B->getFragment()->getParent()->getOrdinal() + 1;
+    Type = MachO::X86_64_RELOC_SUBTRACTOR;
+  } else {
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    if (Symbol->isTemporary() && Value) {
+      const MCSection &Sec = Symbol->getSection();
+      if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+        Symbol->setUsedInReloc();
+    }
+    RelSymbol = Asm.getAtom(*Symbol);
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand x86_64 relocation entries, and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      const MCSectionMachO &Section =
+          static_cast<const MCSectionMachO &>(*Fragment->getParent());
+      if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+        RelSymbol = nullptr;
+    }
+
+    // x86_64 almost always uses external relocations, except when there is no
+    // symbol to use as a base address (a local symbol with no preceding
+    // non-local symbol).
+    if (RelSymbol) {
+      // Add the local offset, if needed.
+      if (RelSymbol != Symbol)
+        Value += Layout.getSymbolOffset(*Symbol) -
+                 Layout.getSymbolOffset(*RelSymbol);
+    } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+      // The index is the section ordinal (1-based).
+      Index = Symbol->getFragment()->getParent()->getOrdinal() + 1;
+      Value += Writer->getSymbolAddress(*Symbol, Layout);
+
+      if (IsPCRel)
+        Value -= FixupAddress + (1 << Log2Size);
+    } else if (Symbol->isVariable()) {
+      const MCExpr *Value = Symbol->getVariableValue();
+      int64_t Res;
+      bool isAbs = Value->evaluateAsAbsolute(Res, Layout,
+                                             Writer->getSectionAddressMap());
+      if (isAbs) {
+        FixedValue = Res;
+        return;
+      } else {
+        Asm.getContext().reportError(Fixup.getLoc(),
+                                     "unsupported relocation of variable '" +
+                                         Symbol->getName() + "'");
+        return;
+      }
+    } else {
+      Asm.getContext().reportError(
+          Fixup.getLoc(), "unsupported relocation of undefined symbol '" +
+                              Symbol->getName() + "'");
+      return;
+    }
+
+    MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+    if (IsPCRel) {
+      if (IsRIPRel) {
+        if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+          // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+          // rewrite the movq to an leaq at link time if the symbol ends up in
+          // the same linkage unit.
+          if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+            Type = MachO::X86_64_RELOC_GOT_LOAD;
+          else
+            Type = MachO::X86_64_RELOC_GOT;
+        }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+          Type = MachO::X86_64_RELOC_TLV;
+        }  else if (Modifier != MCSymbolRefExpr::VK_None) {
+          Asm.getContext().reportError(
+              Fixup.getLoc(), "unsupported symbol modifier in relocation");
+          return;
+        } else {
+          Type = MachO::X86_64_RELOC_SIGNED;
+
+          // The Darwin x86_64 relocation format has a problem where it cannot
+          // encode an address (L<foo> + <constant>) which is outside the atom
+          // containing L<foo>. Generally, this shouldn't occur but it does
+          // happen when we have a RIPrel instruction with data following the
+          // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+          // adjustment Darwin x86_64 uses, the offset is still negative and the
+          // linker has no way to recognize this.
+          //
+          // To work around this, Darwin uses several special relocation types
+          // to indicate the offsets. However, the specification or
+          // implementation of these seems to also be incomplete; they should
+          // adjust the addend as well based on the actual encoded instruction
+          // (the additional bias), but instead appear to just look at the final
+          // offset.
+          switch (-(Target.getConstant() + (1LL << Log2Size))) {
+          case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+          case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+          case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
+          }
+        }
+      } else {
+        if (Modifier != MCSymbolRefExpr::VK_None) {
+          Asm.getContext().reportError(
+              Fixup.getLoc(),
+              "unsupported symbol modifier in branch relocation");
+          return;
+        }
+
+        Type = MachO::X86_64_RELOC_BRANCH;
+      }
+    } else {
+      if (Modifier == MCSymbolRefExpr::VK_GOT) {
+        Type = MachO::X86_64_RELOC_GOT;
+      } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+        // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+        // case all we do is set the PCrel bit in the relocation entry; this is
+        // used with exception handling, for example. The source is required to
+        // include any necessary offset directly.
+        Type = MachO::X86_64_RELOC_GOT;
+        IsPCRel = 1;
+      } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+        Asm.getContext().reportError(
+            Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel");
+        return;
+      } else if (Modifier != MCSymbolRefExpr::VK_None) {
+        Asm.getContext().reportError(
+            Fixup.getLoc(), "unsupported symbol modifier in relocation");
+        return;
+      } else {
+        Type = MachO::X86_64_RELOC_UNSIGNED;
+        unsigned Kind = Fixup.getKind();
+        if (Kind == X86::reloc_signed_4byte) {
+          Asm.getContext().reportError(
+              Fixup.getLoc(),
+              "32-bit absolute addressing is not supported in 64-bit mode");
+          return;
+        }
+      }
+    }
+  }
+
+  // x86_64 always writes custom values into the fixups.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
+                (IsExtern << 27) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint64_t OriginalFixedValue = FixedValue;
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+
+  if (!A->getFragment()) {
+    Asm.getContext().reportError(
+        Fixup.getLoc(),
+        "symbol '" + A->getName() +
+            "' can not be undefined in a subtraction expression");
+    return false;
+  }
+
+  uint32_t Value = Writer->getSymbolAddress(*A, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbol *SB = &B->getSymbol();
+
+    if (!SB->getFragment()) {
+      Asm.getContext().reportError(
+          Fixup.getLoc(),
+          "symbol '" + B->getSymbol().getName() +
+              "' can not be undefined in a subtraction expression");
+      return false;
+    }
+
+    // Select the appropriate difference relocation type.
+    //
+    // Note that there is no longer any semantic difference between these two
+    // relocation types from the linkers point of view, this is done solely for
+    // pedantic compatibility with 'as'.
+    Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
+                           : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+    Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+      Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().reportError(Fixup.getLoc(),
+                         Twine("Section too large, can't encode "
+                                "r_address (") + Buffer +
+                         ") into 24 bits of scattered "
+                         "relocation entry.");
+      return false;
+    }
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                         <<  0) | // r_address
+                   (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+                   (Log2Size                  << 28) |
+                   (IsPCRel                   << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
+    Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff) {
+      FixedValue = OriginalFixedValue;
+      return false;
+    }
+  }
+
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
+  Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
+  return true;
+}
+
+void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
+                                               const MCAssembler &Asm,
+                                               const MCAsmLayout &Layout,
+                                               const MCFragment *Fragment,
+                                               const MCFixup &Fixup,
+                                               MCValue Target,
+                                               uint64_t &FixedValue) {
+  assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+         !is64Bit() &&
+         "Should only be called with a 32-bit TLVP relocation!");
+
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+  uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = 0;
+
+  // We're only going to have a second symbol in pic mode and it'll be a
+  // subtraction from the picbase. For 32-bit pic the addend is the difference
+  // between the picbase and the next address.  For 32-bit static the addend is
+  // zero.
+  if (Target.getSymB()) {
+    // If this is a subtraction then we're pcrel.
+    uint32_t FixupAddress =
+      Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+    IsPCRel = 1;
+    FixedValue =
+        FixupAddress -
+        Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) +
+        Target.getConstant();
+    FixedValue += 1ULL << Log2Size;
+  } else {
+    FixedValue = 0;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = Value;
+  MRE.r_word1 =
+      (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
+  Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(),
+                        MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+                                              const MCAssembler &Asm,
+                                              const MCAsmLayout &Layout,
+                                              const MCFragment *Fragment,
+                                              const MCFixup &Fixup,
+                                              MCValue Target,
+                                              uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // If this is a 32-bit TLVP reloc it's handled a bit differently.
+  if (Target.getSymA() &&
+      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+    recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                         FixedValue);
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                              Target, Log2Size, FixedValue);
+    return;
+  }
+
+  // Get the symbol data, if any.
+  const MCSymbol *A = nullptr;
+  if (Target.getSymA())
+    A = &Target.getSymA()->getSymbol();
+
+  // If this is an internal relocation with an offset, it also needs a scattered
+  // relocation entry.
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel)
+    Offset += 1 << Log2Size;
+  // Try to record the scattered relocation if needed. Fall back to non
+  // scattered if necessary (see comments in recordScatteredRelocation()
+  // for details).
+  if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
+      recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                                Log2Size, FixedValue))
+    return;
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned Type = 0;
+  const MCSymbol *RelSymbol = nullptr;
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    Type = MachO::GENERIC_RELOC_VANILLA;
+  } else {
+    // Resolve constant variables.
+    if (A->isVariable()) {
+      int64_t Res;
+      if (A->getVariableValue()->evaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(*A)) {
+      RelSymbol = A;
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!A->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(*A);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSection &Sec = A->getSection();
+      Index = Sec.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&Sec);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    Type = MachO::GENERIC_RELOC_VANILLA;
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 =
+      (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+  Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS,
+                                                bool Is64Bit, uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..33376b6d1b90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,105 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+  class MCObjectWriter;
+}
+
+namespace {
+  class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+  public:
+    X86WinCOFFObjectWriter(bool Is64Bit);
+    ~X86WinCOFFObjectWriter() override;
+
+    unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsCrossSection,
+                          const MCAsmBackend &MAB) const override;
+  };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+                                          : COFF::IMAGE_FILE_MACHINE_I386) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection,
+                                              const MCAsmBackend &MAB) const {
+  unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+    case X86::reloc_riprel_4byte_relax:
+    case X86::reloc_riprel_4byte_relax_rex:
+      return COFF::IMAGE_REL_AMD64_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+    case X86::reloc_signed_4byte_relax:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_AMD64_ADDR32NB;
+      if (Modifier == MCSymbolRefExpr::VK_SECREL)
+        return COFF::IMAGE_REL_AMD64_SECREL;
+      return COFF::IMAGE_REL_AMD64_ADDR32;
+    case FK_Data_8:
+      return COFF::IMAGE_REL_AMD64_ADDR64;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_AMD64_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_AMD64_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_I386_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+    case X86::reloc_signed_4byte_relax:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_I386_DIR32NB;
+      if (Modifier == MCSymbolRefExpr::VK_SECREL)
+        return COFF::IMAGE_REL_AMD64_SECREL;
+      return COFF::IMAGE_REL_I386_DIR32;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_I386_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_I386_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else
+    llvm_unreachable("Unsupported COFF machine type.");
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
+                                                   bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 000000000000..d04511873b46
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,60 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+  Win64EH::UnwindEmitter EHStreamer;
+public:
+  X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+                     raw_pwrite_stream &OS)
+      : MCWinCOFFStreamer(C, AB, *CE, OS) {}
+
+  void EmitWinEHHandlerData() override;
+  void EmitWindowsUnwindTables() override;
+  void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWinEHHandlerData() {
+  MCStreamer::EmitWinEHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+  if (!getNumWinFrameInfos())
+    return;
+  EHStreamer.Emit(*this);
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+  EmitFrames(nullptr);
+  EmitWindowsUnwindTables();
+
+  MCWinCOFFStreamer::FinishImpl();
+}
+}
+
+MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                           raw_pwrite_stream &OS,
+                                           MCCodeEmitter *CE, bool RelaxAll,
+                                           bool IncrementalLinkerCompatible) {
+  X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+  S->getAssembler().setRelaxAll(RelaxAll);
+  S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+  return S;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 000000000000..d2654fc67ed5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,29 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheX86_32Target() {
+  static Target TheX86_32Target;
+  return TheX86_32Target;
+}
+Target &llvm::getTheX86_64Target() {
+  static Target TheX86_64Target;
+  return TheX86_64Target;
+}
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+  RegisterTarget<Triple::x86, /*HasJIT=*/true> X(
+      getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above");
+
+  RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y(
+      getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 000000000000..1be5aec849fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,606 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  // Defaults the copying the dest value.
+  ShuffleMask.push_back(0);
+  ShuffleMask.push_back(1);
+  ShuffleMask.push_back(2);
+  ShuffleMask.push_back(3);
+
+  // Decode the immediate.
+  unsigned ZMask = Imm & 15;
+  unsigned CountD = (Imm >> 4) & 3;
+  unsigned CountS = (Imm >> 6) & 3;
+
+  // CountS selects which input element to use.
+  unsigned InVal = 4 + CountS;
+  // CountD specifies which element of destination to update.
+  ShuffleMask[CountD] = InVal;
+  // ZMask zaps values, potentially overriding the CountD elt.
+  if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+  if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+  if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+  if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+                             SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((Idx + Len) <= NumElts && "Insertion out of range");
+
+  for (unsigned i = 0; i != NumElts; ++i)
+    ShuffleMask.push_back(i);
+  for (unsigned i = 0; i != Len; ++i)
+    ShuffleMask[Idx + i] = NumElts + i;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = NElts / 2; i != NElts; ++i)
+    ShuffleMask.push_back(NElts + i);
+
+  for (unsigned i = NElts / 2; i != NElts; ++i)
+    ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != NElts / 2; ++i)
+    ShuffleMask.push_back(i);
+
+  for (unsigned i = 0; i != NElts / 2; ++i)
+    ShuffleMask.push_back(NElts + i);
+}
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i);
+    ShuffleMask.push_back(2 * i);
+  }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i + 1);
+    ShuffleMask.push_back(2 * i + 1);
+  }
+}
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  unsigned NumLaneSubElts = 64 / ScalarSizeInBits;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts)
+      for (unsigned s = 0; s != NumLaneSubElts; s++)
+        ShuffleMask.push_back(l + s);
+}
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned NumElts = VectorSizeInBits / 8;
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; ++i) {
+      int M = SM_SentinelZero;
+      if (i >= Imm) M = i - Imm + l;
+      ShuffleMask.push_back(M);
+    }
+}
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VectorSizeInBits = VT.getSizeInBits();
+  unsigned NumElts = VectorSizeInBits / 8;
+  unsigned NumLanes = VectorSizeInBits / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l < NumElts; l += NumLaneElts)
+    for (unsigned i = 0; i < NumLaneElts; ++i) {
+      unsigned Base = i + Imm;
+      int M = Base + l;
+      if (Base >= NumLaneElts) M = SM_SentinelZero;
+      ShuffleMask.push_back(M);
+    }
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      unsigned Base = i + Offset;
+      // if i+offset is out of this lane then we actually need the other source
+      if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+      ShuffleMask.push_back(Base + l);
+    }
+  }
+}
+
+void DecodeVALIGNMask(MVT VT, unsigned Imm,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  int NumElts = VT.getVectorNumElements();
+  // Not all bits of the immediate are used so mask it.
+  assert(isPowerOf2_32(NumElts) && "NumElts should be power of 2");
+  Imm = Imm & (NumElts - 1);
+  for (int i = 0; i != NumElts; ++i)
+    ShuffleMask.push_back(i + Imm);
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      ShuffleMask.push_back(NewImm % NumLaneElts + l);
+      NewImm /= NumLaneElts;
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + 4 + (NewImm & 3));
+      NewImm >>= 2;
+    }
+  }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + (NewImm & 3));
+      NewImm >>= 2;
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+  }
+}
+
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumHalfElts = NumElts / 2;
+
+  for (unsigned l = 0; l != NumHalfElts; ++l)
+    ShuffleMask.push_back(l + NumHalfElts);
+  for (unsigned h = 0; h != NumHalfElts; ++h)
+    ShuffleMask.push_back(h);
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    // each half of a lane comes from different source
+    for (unsigned s = 0; s != NumElts * 2; s += NumElts) {
+      for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
+        ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+        NewImm /= NumLaneElts;
+      }
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) {
+      ShuffleMask.push_back(i);           // Reads from dest/src1
+      ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+    }
+  }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) {
+      ShuffleMask.push_back(i);           // Reads from dest/src1
+      ShuffleMask.push_back(i + NumElts); // Reads from src/src2
+    }
+  }
+}
+
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = DstVT.getVectorNumElements();
+  ShuffleMask.append(NumElts, 0);
+}
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+                              SmallVectorImpl<int> &ShuffleMask) {
+  assert(SrcVT.getScalarType() == DstVT.getScalarType() &&
+         "Non matching vector element types");
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  unsigned Scale = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
+
+  for (unsigned i = 0; i != Scale; ++i)
+    for (unsigned j = 0; j != NumElts; ++j)
+      ShuffleMask.push_back(j);
+}
+
+/// \brief Decode a shuffle packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+  unsigned ControlBitsMask = NumLanes - 1;
+  unsigned NumControlBits  = NumLanes / 2;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+    // We actually need the other source.
+    if (l >= NumLanes / 2)
+      LaneMask += NumLanes;
+    for (unsigned i = 0; i != NumElementsInLane; ++i)
+      ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+  }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask) {
+  unsigned HalfSize = VT.getVectorNumElements() / 2;
+
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned HalfMask = Imm >> (l * 4);
+    unsigned HalfBegin = (HalfMask & 0x3) * HalfSize;
+    for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i)
+      ShuffleMask.push_back(HalfMask & 8 ? SM_SentinelZero : (int)i);
+  }
+}
+
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    if (M == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(M);
+      continue;
+    }
+    // For 256/512-bit vectors the base of the shuffle is the 128-bit
+    // subvector we're inside.
+    int Base = (i / 16) * 16;
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (M & (1 << 7))
+      ShuffleMask.push_back(SM_SentinelZero);
+    else {
+      // Only the least significant 4 bits of the byte are used.
+      int Index = Base + (M & 0xf);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  int ElementBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  for (int i = 0; i < NumElements; ++i) {
+    // If there are more than 8 elements in the vector, then any immediate blend
+    // mask applies to each 128-bit lane. There can never be more than
+    // 8 elements in a 128-bit lane with an immediate blend.
+    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+    assert(Bit < 8 &&
+           "Immediate blends only operate over 8 elements at a time!");
+    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+  }
+}
+
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
+
+  // VPPERM Operation
+  // Bits[4:0] - Byte Index (0 - 31)
+  // Bits[7:5] - Permute Operation
+  //
+  // Permute Operation:
+  // 0 - Source byte (no logical operation).
+  // 1 - Invert source byte.
+  // 2 - Bit reverse of source byte.
+  // 3 - Bit reverse of inverted source byte.
+  // 4 - 00h (zero - fill).
+  // 5 - FFh (ones - fill).
+  // 6 - Most significant bit of source byte replicated in all bit positions.
+  // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    if (M == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(M);
+      continue;
+    }
+
+    uint64_t PermuteOp = (M >> 5) & 0x7;
+    if (PermuteOp == 4) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+    if (PermuteOp != 0) {
+      ShuffleMask.clear();
+      return;
+    }
+
+    uint64_t Index = M & 0x1F;
+    ShuffleMask.push_back((int)Index);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  assert((VT.is256BitVector() || VT.is512BitVector()) &&
+         (VT.getScalarSizeInBits() == 64) && "Unexpected vector value type");
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned l = 0; l != NumElts; l += 4)
+    for (unsigned i = 0; i != 4; ++i)
+      ShuffleMask.push_back(l + ((Imm >> (2 * i)) & 3));
+}
+
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
+  unsigned NumDstElts = DstVT.getVectorNumElements();
+  unsigned SrcScalarBits = SrcScalarVT.getSizeInBits();
+  unsigned DstScalarBits = DstVT.getScalarSizeInBits();
+  unsigned Scale = DstScalarBits / SrcScalarBits;
+  assert(SrcScalarBits < DstScalarBits &&
+         "Expected zero extension mask to increase scalar size");
+
+  for (unsigned i = 0; i != NumDstElts; i++) {
+    Mask.push_back(i);
+    for (unsigned j = 1; j != Scale; j++)
+      Mask.push_back(SM_SentinelZero);
+  }
+}
+
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ShuffleMask.push_back(0);
+  for (unsigned i = 1; i < NumElts; i++)
+    ShuffleMask.push_back(SM_SentinelZero);
+}
+
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
+  // First element comes from the first element of second source.
+  // Remaining elements: Load zero extends / Move copies from first source.
+  unsigned NumElts = VT.getVectorNumElements();
+  Mask.push_back(NumElts);
+  for (unsigned i = 1; i < NumElts; i++)
+    Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+}
+
+void DecodeEXTRQIMask(int Len, int Idx,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  // Only the bottom 6 bits are valid for each immediate.
+  Len &= 0x3F;
+  Idx &= 0x3F;
+
+  // We can only decode this bit extraction instruction as a shuffle if both the
+  // length and index work with whole bytes.
+  if (0 != (Len % 8) || 0 != (Idx % 8))
+    return;
+
+  // A length of zero is equivalent to a bit length of 64.
+  if (Len == 0)
+    Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits the result is undefined.
+  if ((Len + Idx) > 64) {
+    ShuffleMask.append(16, SM_SentinelUndef);
+    return;
+  }
+
+  // Convert index and index to work with bytes.
+  Len /= 8;
+  Idx /= 8;
+
+  // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
+  // of the lower 64-bits. The upper 64-bits are undefined.
+  for (int i = 0; i != Len; ++i)
+    ShuffleMask.push_back(i + Idx);
+  for (int i = Len; i != 8; ++i)
+    ShuffleMask.push_back(SM_SentinelZero);
+  for (int i = 8; i != 16; ++i)
+    ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeINSERTQIMask(int Len, int Idx,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  // Only the bottom 6 bits are valid for each immediate.
+  Len &= 0x3F;
+  Idx &= 0x3F;
+
+  // We can only decode this bit insertion instruction as a shuffle if both the
+  // length and index work with whole bytes.
+  if (0 != (Len % 8) || 0 != (Idx % 8))
+    return;
+
+  // A length of zero is equivalent to a bit length of 64.
+  if (Len == 0)
+    Len = 64;
+
+  // If the length + index exceeds the bottom 64 bits the result is undefined.
+  if ((Len + Idx) > 64) {
+    ShuffleMask.append(16, SM_SentinelUndef);
+    return;
+  }
+
+  // Convert index and index to work with bytes.
+  Len /= 8;
+  Idx /= 8;
+
+  // INSERTQ: Extract lowest Len bytes from lower half of second source and
+  // insert over first source starting at Idx byte. The upper 64-bits are
+  // undefined.
+  for (int i = 0; i != Idx; ++i)
+    ShuffleMask.push_back(i);
+  for (int i = 0; i != Len; ++i)
+    ShuffleMask.push_back(i + 16);
+  for (int i = Idx + Len; i != 8; ++i)
+    ShuffleMask.push_back(i);
+  for (int i = 8; i != 16; ++i)
+    ShuffleMask.push_back(SM_SentinelUndef);
+}
+
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VecSize = VT.getSizeInBits();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  unsigned NumLanes = VecSize / 128;
+  unsigned NumEltsPerLane = VT.getVectorNumElements() / NumLanes;
+  assert((VecSize == 128 || VecSize == 256 || VecSize == 512) &&
+         "Unexpected vector size");
+  assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+
+  for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    M = (EltSize == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
+    unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
+    ShuffleMask.push_back((int)(LaneOffset + M));
+  }
+}
+
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+                         SmallVectorImpl<int> &ShuffleMask) {
+  unsigned VecSize = VT.getSizeInBits();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  unsigned NumLanes = VecSize / 128;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumEltsPerLane = NumElts / NumLanes;
+  assert((VecSize == 128 || VecSize == 256) && "Unexpected vector size");
+  assert((EltSize == 32 || EltSize == 64) && "Unexpected element size");
+  assert((NumElts == RawMask.size()) && "Unexpected mask size");
+
+  for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    // VPERMIL2 Operation.
+    // Bits[3] - Match Bit.
+    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+    uint64_t Selector = RawMask[i];
+    unsigned MatchBit = (Selector >> 3) & 0x1;
+
+    // M2Z[0:1]     MatchBit
+    //   0Xb           X        Source selected by Selector index.
+    //   10b           0        Source selected by Selector index.
+    //   10b           1        Zero.
+    //   11b           0        Zero.
+    //   11b           1        Source selected by Selector index.
+    if ((M2Z & 0x2) != 0 && MatchBit != (M2Z & 0x1)) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+
+    int Index = i & ~(NumEltsPerLane - 1);
+    if (EltSize == 64)
+      Index += (Selector >> 1) & 0x1;
+    else
+      Index += Selector & 0x3;
+
+    int Src = (Selector >> 2) & 0x1;
+    Index += Src * NumElts;
+    ShuffleMask.push_back(Index);
+  }
+}
+
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  uint64_t EltMaskSize = RawMask.size() - 1;
+  for (auto M : RawMask) {
+    M &= EltMaskSize;
+    ShuffleMask.push_back((int)M);
+  }
+}
+
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
+  for (auto M : RawMask) {
+    M &= EltMaskSize;
+    ShuffleMask.push_back((int)M);
+  }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 000000000000..17619d09d059
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,162 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+template <typename T> class ArrayRef;
+class MVT;
+
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
+
+/// Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// Insert the bottom Len elements from a second source into a vector starting at
+// element Idx.
+void DecodeInsertElementMask(MVT VT, unsigned Idx, unsigned Len,
+                             SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
+/// i.e. <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVALIGNMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshufhw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for pshuflw.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a PSWAPD 3DNow! instruction.
+void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for shufp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of the first element of a vector.
+void DecodeVectorBroadcast(MVT DstVT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes a broadcast of a subvector to a larger vector type.
+void DecodeSubVectorBroadcast(MVT DstVT, MVT SrcVT,
+                              SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a shuffle packed values at 128-bit granularity
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+                               SmallVectorImpl<int> &ShuffleMask);
+
+/// Decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPERMMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+/// This can only basic masks (permutes + zeros), not any of the other
+/// operations that VPPERM can perform.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a zero extension instruction as a shuffle mask.
+void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a move lower and zero upper instruction as a shuffle mask.
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a scalar float move instruction as a shuffle mask.
+void DecodeScalarMoveMask(MVT VT, bool IsLoad,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
+void DecodeEXTRQIMask(int Len, int Idx,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
+void DecodeINSERTQIMask(int Len, int Idx,
+                        SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
+void DecodeVPERMILPMask(MVT VT, ArrayRef<uint64_t> RawMask,
+                        SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
+void DecodeVPERMIL2PMask(MVT VT, unsigned M2Z, ArrayRef<uint64_t> RawMask,
+                         SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 000000000000..2cb80a482d06
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,99 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86_H
+#define LLVM_LIB_TARGET_X86_X86_H
+
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class FunctionPass;
+class ImmutablePass;
+class PassRegistry;
+class X86TargetMachine;
+
+/// This pass converts a legalized DAG into a X86-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+/// This pass initializes a global base register for PIC on x86-32.
+FunctionPass *createX86GlobalBaseRegPass();
+
+/// This pass combines multiple accesses to local-dynamic TLS variables so that
+/// the TLS base address for the module is only fetched once per execution path
+/// through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// This function returns a pass which converts floating-point register
+/// references and pseudo instructions into floating-point stack references and
+/// physical instructions.
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// This pass inserts AVX vzeroupper instructions before each call to avoid
+/// transition penalty between functions encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// Return a pass that pads short functions with NOOPs.
+/// This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
+/// Return a pass that selectively replaces certain instructions (like add,
+/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA
+/// instructions, in order to eliminate execution delays in some processors.
+FunctionPass *createX86FixupLEAs();
+
+/// Return a pass that removes redundant LEA instructions and redundant address
+/// recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
+/// Return a pass that transforms setcc + movzx pairs into xor + setcc.
+FunctionPass *createX86FixupSetCC();
+
+/// Return a pass that expands WinAlloca pseudo-instructions.
+FunctionPass *createX86WinAllocaExpander();
+
+/// Return a pass that optimizes the code-size of x86 call sequences. This is
+/// done by replacing esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
+/// Return an IR pass that inserts EH registration stack objects and explicit
+/// EH state updates. This pass must run after EH preparation, which does
+/// Windows-specific but architecture-neutral preparation.
+FunctionPass *createX86WinEHStatePass();
+
+/// Return a Machine IR pass that expands X86-specific pseudo
+/// instructions into a sequence of actual instructions. This pass
+/// must run after prologue/epilogue insertion and before lowering
+/// the MachineInstr to MC.
+FunctionPass *createX86ExpandPseudoPass();
+
+/// Return a Machine IR pass that selectively replaces
+/// certain byte and word instructions by equivalent 32 bit instructions,
+/// in order to eliminate partial register usage, false dependences on
+/// the upper portions of registers, and to save code size.
+FunctionPass *createX86FixupBWInsts();
+
+void initializeFixupBWInstPassPass(PassRegistry &);
+
+/// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX 
+/// encoding when possible in order to reduce code size.
+FunctionPass *createX86EvexToVexInsts();
+
+void initializeEvexToVexInstPassPass(PassRegistry &);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 000000000000..dc18a59a30ba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,856 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+                                  "64-bit mode (x86_64)">;
+def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
+                                  "32-bit mode (80386)">;
+def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
+                                  "16-bit mode (i8086)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureX87     : SubtargetFeature<"x87","HasX87", "true",
+                                      "Enable X87 float instructions">;
+
+def FeatureCMOV    : SubtargetFeature<"cmov","HasCMov", "true",
+                                      "Enable conditional move instructions">;
+
+def FeaturePOPCNT   : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+                                       "Support POPCNT instruction">;
+
+def FeatureFXSR    : SubtargetFeature<"fxsr", "HasFXSR", "true",
+                                      "Support fxsave/fxrestore instructions">;
+
+def FeatureXSAVE   : SubtargetFeature<"xsave", "HasXSAVE", "true",
+                                       "Support xsave instructions">;
+
+def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
+                                       "Support xsaveopt instructions">;
+
+def FeatureXSAVEC  : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
+                                       "Support xsavec instructions">;
+
+def FeatureXSAVES  : SubtargetFeature<"xsaves", "HasXSAVES", "true",
+                                       "Support xsaves instructions">;
+
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      // SSE codegen depends on cmovs, and all
+                                      // SSE1+ processors support them.
+                                      [FeatureCMOV]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def FeatureSSE41   : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
+                                      "Enable SSE 4.1 instructions",
+                                      [FeatureSSSE3]>;
+def FeatureSSE42   : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
+                                      "Enable SSE 4.2 instructions",
+                                      [FeatureSSE41]>;
+// The MMX subtarget feature is separate from the rest of the SSE features
+// because it's important (for odd compatibility reasons) to be able to
+// turn it off explicitly while allowing SSE+ to be on.
+def FeatureMMX     : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+                                      "Enable MMX instructions">;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions",
+                                      [FeatureMMX]>;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions",
+                                      [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
+                                      "64-bit with cmpxchg16b",
+                                      [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+                                       "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+                                       "SHLD instruction is slow">;
+def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
+                                        "PMULLD instruction is slow">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+                                "IsUAMem16Slow", "true",
+                                "Slow unaligned 16-byte memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+                                "IsUAMem32Slow", "true",
+                                "Slow unaligned 32-byte memory access">;
+def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+                                      "Support SSE 4a instructions",
+                                      [FeatureSSE3]>;
+
+def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+                                      "Enable AVX instructions",
+                                      [FeatureSSE42]>;
+def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+                                      "Enable AVX2 instructions",
+                                      [FeatureAVX]>;
+def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+                                      "Enable AVX-512 instructions",
+                                      [FeatureAVX2]>;
+def FeatureERI      : SubtargetFeature<"avx512er", "HasERI", "true",
+                      "Enable AVX-512 Exponential and Reciprocal Instructions",
+                                      [FeatureAVX512]>;
+def FeatureCDI      : SubtargetFeature<"avx512cd", "HasCDI", "true",
+                      "Enable AVX-512 Conflict Detection Instructions",
+                                      [FeatureAVX512]>;
+def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
+                      "Enable AVX-512 PreFetch Instructions",
+                                      [FeatureAVX512]>;
+def FeaturePREFETCHWT1  : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1",
+                                   "true",
+                                   "Prefetch with Intent to Write and T1 Hint">;
+def FeatureDQI     : SubtargetFeature<"avx512dq", "HasDQI", "true",
+                      "Enable AVX-512 Doubleword and Quadword Instructions",
+                                      [FeatureAVX512]>;
+def FeatureBWI     : SubtargetFeature<"avx512bw", "HasBWI", "true",
+                      "Enable AVX-512 Byte and Word Instructions",
+                                      [FeatureAVX512]>;
+def FeatureVLX     : SubtargetFeature<"avx512vl", "HasVLX", "true",
+                      "Enable AVX-512 Vector Length eXtensions",
+                                      [FeatureAVX512]>;
+def FeatureVBMI     : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
+                      "Enable AVX-512 Vector Byte Manipulation Instructions",
+                                      [FeatureBWI]>;
+def FeatureIFMA     : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
+                      "Enable AVX-512 Integer Fused Multiple-Add",
+                                      [FeatureAVX512]>;
+def FeaturePKU   : SubtargetFeature<"pku", "HasPKU", "true",
+                      "Enable protection keys">;
+def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+                         "Enable packed carry-less multiplication instructions",
+                               [FeatureSSE2]>;
+def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
+                                      "Enable three-operand fused multiple-add",
+                                      [FeatureAVX]>;
+def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
+                                      "Enable four-operand fused multiple-add",
+                                      [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
+                                      "Enable XOP instructions",
+                                      [FeatureFMA4]>;
+def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
+                                          "HasSSEUnalignedMem", "true",
+                      "Allow unaligned memory operands with SSE instructions">;
+def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
+                                      "Enable AES instructions",
+                                      [FeatureSSE2]>;
+def FeatureTBM     : SubtargetFeature<"tbm", "HasTBM", "true",
+                                      "Enable TBM instructions">;
+def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
+                                      "Support MOVBE instruction">;
+def FeatureRDRAND  : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
+                                      "Support RDRAND instruction">;
+def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
+                       "Support 16-bit floating point conversion instructions",
+                       [FeatureAVX]>;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+                                       "Support FS/GS Base instructions">;
+def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+                                      "Support LZCNT instruction">;
+def FeatureBMI     : SubtargetFeature<"bmi", "HasBMI", "true",
+                                      "Support BMI instructions">;
+def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
+                                      "Support BMI2 instructions">;
+def FeatureRTM     : SubtargetFeature<"rtm", "HasRTM", "true",
+                                      "Support RTM instructions">;
+def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
+                                      "Support HLE">;
+def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
+                                      "Support ADX instructions">;
+def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
+                                      "Enable SHA instructions",
+                                      [FeatureSSE2]>;
+def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+                                      "Support PRFCHW instructions">;
+def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+                                      "Support RDSEED instruction">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
+                                       "Support LAHF and SAHF instructions">;
+def FeatureMWAITX  : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
+                                      "Enable MONITORX/MWAITX timer functionality">;
+def FeatureMPX     : SubtargetFeature<"mpx", "HasMPX", "true",
+                                      "Support MPX instructions">;
+def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+                                     "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
+                                     "HasSlowDivide32", "true",
+                                     "Use 8-bit divide for positive values less than 256">;
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+                                     "HasSlowDivide64", "true",
+                                     "Use 16-bit divide for positive values less than 65536">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+                                     "PadShortFunctions", "true",
+                                     "Pad short functions">;
+def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
+                                      "Invalidate Process-Context Identifier">;
+def FeatureVMFUNC  : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
+                                      "VM Functions">;
+def FeatureSMAP    : SubtargetFeature<"smap", "HasSMAP", "true",
+                                      "Supervisor Mode Access Protection">;
+def FeatureSGX     : SubtargetFeature<"sgx", "HasSGX", "true",
+                                      "Enable Software Guard Extensions">;
+def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
+                                      "Flush A Cache Line Optimized">;
+def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
+                                      "Enable Persistent Commit">;
+def FeatureCLWB    : SubtargetFeature<"clwb", "HasCLWB", "true",
+                                      "Cache Line Write Back">;
+// TODO: This feature ought to be renamed.
+// What it really refers to are CPUs for which certain instructions
+// (which ones besides the example below?) are microcoded.
+// The best examples of this are the memory forms of CALL and PUSH
+// instructions, which should be avoided in favor of a MOV + register CALL/PUSH.
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+                                     "CallRegIndirect", "true",
+                                     "Call register indirect">;
+def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
+                                   "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+                                   "LEA instruction with certain arguments is slow">;
+def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
+                                   "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureSoftFloat
+    : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
+                       "Use software floating point features.">;
+// On at least some AMD processors, there is no performance hazard to writing
+// only the lower parts of a YMM register without clearing the upper part.
+def FeatureFastPartialYMMWrite
+    : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
+                       "true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+    : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+                       "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+    : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+                       "true", "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
+// be used to replace test/set sequences.
+def FeatureFastLZCNT
+    : SubtargetFeature<
+          "fast-lzcnt", "HasFastLZCNT", "true",
+          "LZCNT instructions are as fast as most simple integer ops">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+                    "Intel Atom processors">;
+def ProcIntelSLM  : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+                    "Intel Silvermont processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic",         [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i386",            [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i486",            [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"i586",            [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium",         [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx",     [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686",            [FeatureX87, FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro",      [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                               FeatureCMOV, FeatureFXSR]>;
+def : Proc<"pentium3",        [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                               FeatureSSE1, FeatureFXSR]>;
+def : Proc<"pentium3m",       [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                               FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
+// The intent is to enable it for pentium4 which is the current default
+// processor in a vanilla 32-bit clang compilation when no specific
+// architecture is specified.  This generally gives a nice performance
+// increase on silvermont, with largely neutral behavior on other
+// contemporary large core processors.
+// pentium-m, pentium4m, prescott and nocona are included as a preventative
+// measure to avoid performance surprises, in case clang's default cpu
+// changes slightly.
+
+def : ProcessorModel<"pentium-m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+def : ProcessorModel<"pentium4", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR]>;
+
+def : ProcessorModel<"pentium4m", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                      FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+
+// Intel Quark.
+def : Proc<"lakemont",        []>;
+
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+                      FeatureFXSR, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : ProcessorModel<"prescott", GenericPostRAModel,
+                     [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
+                      FeatureFXSR, FeatureSlowBTMem]>;
+def : ProcessorModel<"nocona", GenericPostRAModel, [
+  FeatureX87,
+  FeatureSlowUAMem16,
+  FeatureMMX,
+  FeatureSSE3,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureSlowBTMem
+]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel, [
+  FeatureX87,
+  FeatureSlowUAMem16,
+  FeatureMMX,
+  FeatureSSSE3,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureSlowBTMem,
+  FeatureLAHFSAHF
+]>;
+def : ProcessorModel<"penryn", SandyBridgeModel, [
+  FeatureX87,
+  FeatureSlowUAMem16,
+  FeatureMMX,
+  FeatureSSE41,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureSlowBTMem,
+  FeatureLAHFSAHF
+]>;
+
+// Atom CPUs.
+class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
+  ProcIntelAtom,
+  FeatureX87,
+  FeatureSlowUAMem16,
+  FeatureMMX,
+  FeatureSSSE3,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureMOVBE,
+  FeatureSlowBTMem,
+  FeatureLEAForSP,
+  FeatureSlowDivide32,
+  FeatureSlowDivide64,
+  FeatureCallRegIndirect,
+  FeatureLEAUsesAG,
+  FeaturePadShortFunctions,
+  FeatureLAHFSAHF
+]>;
+def : BonnellProc<"bonnell">;
+def : BonnellProc<"atom">; // Pin the generic name to the baseline.
+
+class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
+  ProcIntelSLM,
+  FeatureX87,
+  FeatureMMX,
+  FeatureSSE42,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureMOVBE,
+  FeaturePOPCNT,
+  FeaturePCLMUL,
+  FeatureAES,
+  FeatureSlowDivide64,
+  FeatureCallRegIndirect,
+  FeaturePRFCHW,
+  FeatureSlowLEA,
+  FeatureSlowIncDec,
+  FeatureSlowBTMem,
+  FeatureSlowPMULLD,
+  FeatureLAHFSAHF
+]>;
+def : SilvermontProc<"silvermont">;
+def : SilvermontProc<"slm">; // Legacy alias.
+
+// "Arrandale" along with corei3 and corei5
+class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+  FeatureX87,
+  FeatureMMX,
+  FeatureSSE42,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureSlowBTMem,
+  FeaturePOPCNT,
+  FeatureLAHFSAHF
+]>;
+def : NehalemProc<"nehalem">;
+def : NehalemProc<"corei7">;
+
+// Westmere is a similar machine to nehalem with some additional features.
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
+  FeatureX87,
+  FeatureMMX,
+  FeatureSSE42,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeatureSlowBTMem,
+  FeaturePOPCNT,
+  FeatureAES,
+  FeaturePCLMUL,
+  FeatureLAHFSAHF
+]>;
+def : WestmereProc<"westmere">;
+
+class ProcessorFeatures<list<SubtargetFeature> Inherited,
+                        list<SubtargetFeature> NewFeatures> {
+  list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures);
+}
+
+class ProcModel<string Name, SchedMachineModel Model,
+                list<SubtargetFeature> ProcFeatures,
+                list<SubtargetFeature> OtherFeatures> :
+  ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>;
+
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def SNBFeatures : ProcessorFeatures<[], [
+  FeatureX87,
+  FeatureMMX,
+  FeatureAVX,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeaturePOPCNT,
+  FeatureAES,
+  FeaturePCLMUL,
+  FeatureXSAVE,
+  FeatureXSAVEOPT,
+  FeatureLAHFSAHF,
+  FeatureFastScalarFSQRT
+]>;
+
+class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+                                               SNBFeatures.Value, [
+  FeatureSlowBTMem,
+  FeatureSlowUAMem32
+]>;
+def : SandyBridgeProc<"sandybridge">;
+def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
+
+def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
+  FeatureRDRAND,
+  FeatureF16C,
+  FeatureFSGSBase
+]>;
+
+class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
+                                             IVBFeatures.Value, [
+  FeatureSlowBTMem,
+  FeatureSlowUAMem32
+]>;
+def : IvyBridgeProc<"ivybridge">;
+def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
+
+def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
+  FeatureAVX2,
+  FeatureBMI,
+  FeatureBMI2,
+  FeatureFMA,
+  FeatureLZCNT,
+  FeatureMOVBE,
+  FeatureINVPCID,
+  FeatureVMFUNC,
+  FeatureRTM,
+  FeatureHLE,
+  FeatureSlowIncDec
+]>;
+
+class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
+                                           HSWFeatures.Value, []>;
+def : HaswellProc<"haswell">;
+def : HaswellProc<"core-avx2">; // Legacy alias.
+
+def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
+  FeatureADX,
+  FeatureRDSEED,
+  FeatureSMAP
+]>;
+class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
+                                             BDWFeatures.Value, []>;
+def : BroadwellProc<"broadwell">;
+
+def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
+  FeatureMPX,
+  FeatureXSAVEC,
+  FeatureXSAVES,
+  FeatureSGX,
+  FeatureCLFLUSHOPT,
+  FeatureFastVectorFSQRT
+]>;
+
+// FIXME: define SKL model
+class SkylakeClientProc<string Name> : ProcModel<Name, HaswellModel,
+                                                 SKLFeatures.Value, []>;
+def : SkylakeClientProc<"skylake">;
+
+// FIXME: define KNL model
+class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
+                                                  IVBFeatures.Value, [
+  FeatureAVX512,
+  FeatureERI,
+  FeatureCDI,
+  FeaturePFI,
+  FeaturePREFETCHWT1,
+  FeatureADX,
+  FeatureRDSEED,
+  FeatureMOVBE,
+  FeatureLZCNT,
+  FeatureBMI,
+  FeatureBMI2,
+  FeatureFMA
+]>;
+def : KnightsLandingProc<"knl">;
+
+def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
+  FeatureAVX512,
+  FeatureCDI,
+  FeatureDQI,
+  FeatureBWI,
+  FeatureVLX,
+  FeaturePKU,
+  FeaturePCOMMIT,
+  FeatureCLWB
+]>;
+
+// FIXME: define SKX model
+class SkylakeServerProc<string Name> : ProcModel<Name, HaswellModel,
+                                                 SKXFeatures.Value, []>;
+def : SkylakeServerProc<"skylake-avx512">;
+def : SkylakeServerProc<"skx">; // Legacy alias.
+
+def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
+  FeatureVBMI,
+  FeatureIFMA,
+  FeatureSHA
+]>;
+
+class CannonlakeProc<string Name> : ProcModel<Name, HaswellModel,
+                                              CNLFeatures.Value, []>;
+def : CannonlakeProc<"cannonlake">;
+
+// AMD CPUs.
+
+def : Proc<"k6",              [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon",          [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird",    [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-4",        [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+                               Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-xp",       [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+                               Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-mp",       [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+                               Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"k8",              [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+                               Feature3DNowA, FeatureFXSR, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron",         [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+                               Feature3DNowA, FeatureFXSR, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64",        [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+                               Feature3DNowA, FeatureFXSR, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon-fx",       [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
+                               Feature3DNowA, FeatureFXSR, Feature64Bit,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"k8-sse3",         [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+                               Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"opteron-sse3",    [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+                               Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"athlon64-sse3",   [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
+                               Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
+def : Proc<"amdfam10",        [FeatureX87, FeatureSSE4A, Feature3DNowA,
+                               FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+                               FeatureLAHFSAHF]>;
+def : Proc<"barcelona",       [FeatureX87, FeatureSSE4A, Feature3DNowA,
+                               FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
+                               FeatureLAHFSAHF]>;
+
+// Bobcat
+def : Proc<"btver1", [
+  FeatureX87,
+  FeatureMMX,
+  FeatureSSSE3,
+  FeatureSSE4A,
+  FeatureFXSR,
+  FeatureCMPXCHG16B,
+  FeaturePRFCHW,
+  FeatureLZCNT,
+  FeaturePOPCNT,
+  FeatureSlowSHLD,
+  FeatureLAHFSAHF
+]>;
+
+// Jaguar
+def : ProcessorModel<"btver2", BtVer2Model, [
+  FeatureX87,
+  FeatureMMX,
+  FeatureAVX,
+  FeatureFXSR,
+  FeatureSSE4A,
+  FeatureCMPXCHG16B,
+  FeaturePRFCHW,
+  FeatureAES,
+  FeaturePCLMUL,
+  FeatureBMI,
+  FeatureF16C,
+  FeatureMOVBE,
+  FeatureLZCNT,
+  FeatureFastLZCNT,
+  FeaturePOPCNT,
+  FeatureXSAVE,
+  FeatureXSAVEOPT,
+  FeatureSlowSHLD,
+  FeatureLAHFSAHF,
+  FeatureFastPartialYMMWrite
+]>;
+
+// Bulldozer
+def : Proc<"bdver1", [
+  FeatureX87,
+  FeatureXOP,
+  FeatureFMA4,
+  FeatureCMPXCHG16B,
+  FeatureAES,
+  FeaturePRFCHW,
+  FeaturePCLMUL,
+  FeatureMMX,
+  FeatureAVX,
+  FeatureFXSR,
+  FeatureSSE4A,
+  FeatureLZCNT,
+  FeaturePOPCNT,
+  FeatureXSAVE,
+  FeatureSlowSHLD,
+  FeatureLAHFSAHF
+]>;
+// Piledriver
+def : Proc<"bdver2", [
+  FeatureX87,
+  FeatureXOP,
+  FeatureFMA4,
+  FeatureCMPXCHG16B,
+  FeatureAES,
+  FeaturePRFCHW,
+  FeaturePCLMUL,
+  FeatureMMX,
+  FeatureAVX,
+  FeatureFXSR,
+  FeatureSSE4A,
+  FeatureF16C,
+  FeatureLZCNT,
+  FeaturePOPCNT,
+  FeatureXSAVE,
+  FeatureBMI,
+  FeatureTBM,
+  FeatureFMA,
+  FeatureSlowSHLD,
+  FeatureLAHFSAHF
+]>;
+
+// Steamroller
+def : Proc<"bdver3", [
+  FeatureX87,
+  FeatureXOP,
+  FeatureFMA4,
+  FeatureCMPXCHG16B,
+  FeatureAES,
+  FeaturePRFCHW,
+  FeaturePCLMUL,
+  FeatureMMX,
+  FeatureAVX,
+  FeatureFXSR,
+  FeatureSSE4A,
+  FeatureF16C,
+  FeatureLZCNT,
+  FeaturePOPCNT,
+  FeatureXSAVE,
+  FeatureBMI,
+  FeatureTBM,
+  FeatureFMA,
+  FeatureXSAVEOPT,
+  FeatureSlowSHLD,
+  FeatureFSGSBase,
+  FeatureLAHFSAHF
+]>;
+
+// Excavator
+def : Proc<"bdver4", [
+  FeatureX87,
+  FeatureMMX,
+  FeatureAVX2,
+  FeatureFXSR,
+  FeatureXOP,
+  FeatureFMA4,
+  FeatureCMPXCHG16B,
+  FeatureAES,
+  FeaturePRFCHW,
+  FeaturePCLMUL,
+  FeatureF16C,
+  FeatureLZCNT,
+  FeaturePOPCNT,
+  FeatureXSAVE,
+  FeatureBMI,
+  FeatureBMI2,
+  FeatureTBM,
+  FeatureFMA,
+  FeatureXSAVEOPT,
+  FeatureSlowSHLD,
+  FeatureFSGSBase,
+  FeatureLAHFSAHF,
+  FeatureMWAITX
+]>;
+
+def : Proc<"geode",           [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>;
+
+def : Proc<"winchip-c6",      [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3",              [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+                               FeatureSSE1, FeatureFXSR]>;
+
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+//
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+                     [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR,
+                      Feature64Bit, FeatureSlowBTMem ]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Variant name.
+  string Name = "att";
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = "#";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+
+  // Variant name.
+  string Name = "intel";
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = ";";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel}
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTInstPrinter";
+  int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelInstPrinter";
+  int Variant = 1;
+}
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+  let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 000000000000..d42e1187ce64
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,661 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &MF.getSubtarget<X86Subtarget>();
+
+  SMShadowTracker.startFunction(MF);
+  CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+      *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegisterInfo(),
+      MF.getContext()));
+
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF()) {
+    bool Local = MF.getFunction()->hasLocalLinkage();
+    OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer->EmitCOFFSymbolStorageClass(
+        Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+    OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                               << COFF::SCT_COMPLEX_TYPE_SHIFT);
+    OutStreamer->EndCOFFSymbolDef();
+  }
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // Emit the XRay table for this function.
+  EmitXRayTable();
+
+  // We didn't modify anything.
+  return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand.  This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
+                               raw_ostream &O) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown symbol type!");
+  case MachineOperand::MO_ConstantPoolIndex:
+    P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
+    P.printOffset(MO.getOffset(), O);
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+
+    MCSymbol *GVSym;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
+      GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    else
+      GVSym = P.getSymbol(GV);
+
+    // Handle dllimport linkage.
+    if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+      GVSym =
+          P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+      MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+          P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+      if (!StubSym.getPointer())
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
+    }
+
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (GVSym->getName()[0] != '$')
+      GVSym->print(O, P.MAI);
+    else {
+      O << '(';
+      GVSym->print(O, P.MAI);
+      O << ')';
+    }
+    P.printOffset(MO.getOffset(), O);
+    break;
+  }
+  }
+
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+    // These affect the name of the symbol, not any suffix.
+    break;
+  case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+    O << " + [.-";
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
+    O << ']';
+    break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+    O << '-';
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
+    break;
+  case X86II::MO_TLSGD:     O << "@TLSGD";     break;
+  case X86II::MO_TLSLD:     O << "@TLSLD";     break;
+  case X86II::MO_TLSLDM:    O << "@TLSLDM";    break;
+  case X86II::MO_GOTTPOFF:  O << "@GOTTPOFF";  break;
+  case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+  case X86II::MO_TPOFF:     O << "@TPOFF";     break;
+  case X86II::MO_DTPOFF:    O << "@DTPOFF";    break;
+  case X86II::MO_NTPOFF:    O << "@NTPOFF";    break;
+  case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+  case X86II::MO_GOTPCREL:  O << "@GOTPCREL";  break;
+  case X86II::MO_GOT:       O << "@GOT";       break;
+  case X86II::MO_GOTOFF:    O << "@GOTOFF";    break;
+  case X86II::MO_PLT:       O << "@PLT";       break;
+  case X86II::MO_TLVP:      O << "@TLVP";      break;
+  case X86II::MO_TLVP_PIC_BASE:
+    O << "@TLVP" << '-';
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
+    break;
+  case X86II::MO_SECREL:    O << "@SECREL32";  break;
+  }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+                         unsigned OpNo, raw_ostream &O,
+                         const char *Modifier = nullptr, unsigned AsmVariant = 0);
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.  These print slightly differently, for
+/// example, a $ is not emitted.
+static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
+                          unsigned OpNo, raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Unknown pcrel immediate operand");
+  case MachineOperand::MO_Register:
+    // pc-relativeness was handled when computing the value in the reg.
+    printOperand(P, MI, OpNo, O);
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    printSymbolOperand(P, MO, O);
+    return;
+  }
+}
+
+static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
+                         unsigned OpNo, raw_ostream &O, const char *Modifier,
+                         unsigned AsmVariant) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type!");
+  case MachineOperand::MO_Register: {
+    // FIXME: Enumerating AsmVariant, so we can remove magic number.
+    if (AsmVariant == 0) O << '%';
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+      unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
+                      (strcmp(Modifier+6,"32") == 0) ? 32 :
+                      (strcmp(Modifier+6,"16") == 0) ? 16 : 8;
+      Reg = getX86SubSuperRegister(Reg, Size);
+    }
+    O << X86ATTInstPrinter::getRegisterName(Reg);
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (AsmVariant == 0) O << '$';
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_GlobalAddress: {
+    if (AsmVariant == 0) O << '$';
+    printSymbolOperand(P, MO, O);
+    break;
+  }
+  }
+}
+
+static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                 unsigned Op, raw_ostream &O,
+                                 const char *Modifier = nullptr) {
+  const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+
+  // If we really don't want to print out (rip), don't.
+  bool HasBaseReg = BaseReg.getReg() != 0;
+  if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+      BaseReg.getReg() == X86::RIP)
+    HasBaseReg = false;
+
+  // HasParenPart - True if we will print out the () part of the mem ref.
+  bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+  switch (DispSpec.getType()) {
+  default:
+    llvm_unreachable("unknown operand type!");
+  case MachineOperand::MO_Immediate: {
+    int DispVal = DispSpec.getImm();
+    if (DispVal || !HasParenPart)
+      O << DispVal;
+    break;
+  }
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ConstantPoolIndex:
+    printSymbolOperand(P, DispSpec, O);
+  }
+
+  if (Modifier && strcmp(Modifier, "H") == 0)
+    O << "+8";
+
+  if (HasParenPart) {
+    assert(IndexReg.getReg() != X86::ESP &&
+           "X86 doesn't allow scaling by ESP");
+
+    O << '(';
+    if (HasBaseReg)
+      printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
+      unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+      if (ScaleVal != 1)
+        O << ',' << ScaleVal;
+    }
+    O << ')';
+  }
+}
+
+static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                              unsigned Op, raw_ostream &O,
+                              const char *Modifier = nullptr) {
+  assert(isMem(*MI, Op) && "Invalid memory reference!");
+  const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
+  if (Segment.getReg()) {
+    printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier);
+    O << ':';
+  }
+  printLeaMemReference(P, MI, Op, O, Modifier);
+}
+
+static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
+                                   unsigned Op, raw_ostream &O,
+                                   const char *Modifier = nullptr,
+                                   unsigned AsmVariant = 1) {
+  const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
+  unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
+  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
+  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
+  const MachineOperand &SegReg   = MI->getOperand(Op+X86::AddrSegmentReg);
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
+    O << ':';
+  }
+
+  O << '[';
+
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  O << ']';
+}
+
+static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
+                              char Mode, raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, 8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, 8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, 16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, 32);
+    break;
+  case 'q':
+    // Print 64-bit register names if 64-bit integer registers are available.
+    // Otherwise, print 32-bit register names.
+    Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32);
+    break;
+  }
+
+  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNo);
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'a': // This is an address.  Currently only 'i' and 'r' are expected.
+      switch (MO.getType()) {
+      default:
+        return true;
+      case MachineOperand::MO_Immediate:
+        O << MO.getImm();
+        return false;
+      case MachineOperand::MO_ConstantPoolIndex:
+      case MachineOperand::MO_JumpTableIndex:
+      case MachineOperand::MO_ExternalSymbol:
+        llvm_unreachable("unexpected operand type!");
+      case MachineOperand::MO_GlobalAddress:
+        printSymbolOperand(*this, MO, O);
+        if (Subtarget->isPICStyleRIPRel())
+          O << "(%rip)";
+        return false;
+      case MachineOperand::MO_Register:
+        O << '(';
+        printOperand(*this, MI, OpNo, O);
+        O << ')';
+        return false;
+      }
+
+    case 'c': // Don't print "$" before a global var name or constant.
+      switch (MO.getType()) {
+      default:
+        printOperand(*this, MI, OpNo, O);
+        break;
+      case MachineOperand::MO_Immediate:
+        O << MO.getImm();
+        break;
+      case MachineOperand::MO_ConstantPoolIndex:
+      case MachineOperand::MO_JumpTableIndex:
+      case MachineOperand::MO_ExternalSymbol:
+        llvm_unreachable("unexpected operand type!");
+      case MachineOperand::MO_GlobalAddress:
+        printSymbolOperand(*this, MO, O);
+        break;
+      }
+      return false;
+
+    case 'A': // Print '*' before a register (it must be a register)
+      if (MO.isReg()) {
+        O << '*';
+        printOperand(*this, MI, OpNo, O);
+        return false;
+      }
+      return true;
+
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print DImode register
+      if (MO.isReg())
+        return printAsmMRegister(*this, MO, ExtraCode[0], O);
+      printOperand(*this, MI, OpNo, O);
+      return false;
+
+    case 'P': // This is the operand of a call, treat specially.
+      printPCRelImm(*this, MI, OpNo, O);
+      return false;
+
+    case 'n':  // Negate the immediate or print a '-' before the operand.
+      // Note: this is a temporary solution. It should be handled target
+      // independently as part of the 'MC' work.
+      if (MO.isImm()) {
+        O << -MO.getImm();
+        return false;
+      }
+      O << '-';
+    }
+  }
+
+  printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
+  return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNo, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (AsmVariant) {
+    printIntelMemReference(*this, MI, OpNo, O);
+    return false;
+  }
+
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print SImode register
+      // These only apply to registers, ignore on mem.
+      break;
+    case 'H':
+      printMemReference(*this, MI, OpNo, O, "H");
+      return false;
+    case 'P': // Don't print @PLT, but do print as memory.
+      printMemReference(*this, MI, OpNo, O, "no-rip");
+      return false;
+    }
+  }
+  printMemReference(*this, MI, OpNo, O);
+  return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+
+  if (TT.isOSBinFormatMachO())
+    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+
+  if (TT.isOSBinFormatCOFF()) {
+    // Emit an absolute @feat.00 symbol.  This appears to be some kind of
+    // compiler features bitfield read by link.exe.
+    if (TT.getArch() == Triple::x86) {
+      MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
+      OutStreamer->BeginCOFFSymbolDef(S);
+      OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+      OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+      OutStreamer->EndCOFFSymbolDef();
+      // According to the PE-COFF spec, the LSB of this value marks the object
+      // for "registered SEH".  This means that all SEH handler entry points
+      // must be registered in .sxdata.  Use of any unregistered handlers will
+      // cause the process to terminate immediately.  LLVM does not know how to
+      // register any SEH handlers, so its object files should be safe.
+      OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+      OutStreamer->EmitAssignment(
+          S, MCConstantExpr::create(int64_t(1), MMI->getContext()));
+    }
+  }
+  OutStreamer->EmitSyntaxDirective();
+
+  // If this is not inline asm and we're in 16-bit
+  // mode prefix assembly with .code16.
+  bool is16 = TT.getEnvironment() == Triple::CODE16;
+  if (M.getModuleInlineAsm().empty() && is16)
+    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+}
+
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
+MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  if (Subtarget->isTargetKnownWindowsMSVC()) {
+    const MachineConstantPoolEntry &CPE =
+        MF->getConstantPool()->getConstants()[CPID];
+    if (!CPE.isMachineConstantPoolEntry()) {
+      const DataLayout &DL = MF->getDataLayout();
+      SectionKind Kind = CPE.getSectionKind(&DL);
+      const Constant *C = CPE.Val.ConstVal;
+      unsigned Align = CPE.Alignment;
+      if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
+              getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
+        if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+          if (Sym->isUndefined())
+            OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
+          return Sym;
+        }
+      }
+    }
+  }
+
+  return AsmPrinter::GetCPISymbol(CPID);
+}
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  const Triple &TT = TM.getTargetTriple();
+
+  if (TT.isOSBinFormatMachO()) {
+    // All darwin targets use mach-o.
+    MachineModuleInfoMachO &MMIMacho =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs;
+
+    // Output stubs for external and common global variables.
+    Stubs = MMIMacho.GetGVStubList();
+    if (!Stubs.empty()) {
+      MCSection *TheSection = OutContext.getMachOSection(
+          "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
+          SectionKind::getMetadata());
+      OutStreamer->SwitchSection(TheSection);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
+
+      Stubs.clear();
+      OutStreamer->AddBlankLine();
+    }
+
+    SM.serializeToStackMapSection();
+    FM.serializeToFaultMapSection();
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) {
+    StringRef SymbolName =
+        (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused";
+    MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
+    OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+  }
+
+  if (TT.isOSBinFormatCOFF()) {
+    const TargetLoweringObjectFileCOFF &TLOFCOFF =
+        static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+    std::string Flags;
+    raw_string_ostream FlagsOS(Flags);
+
+    for (const auto &Function : M)
+      TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function);
+    for (const auto &Global : M.globals())
+      TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global);
+    for (const auto &Alias : M.aliases())
+      TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias);
+
+    FlagsOS.flush();
+
+    // Output collected flags.
+    if (!Flags.empty()) {
+      OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection());
+      OutStreamer->EmitBytes(Flags);
+    }
+
+    SM.serializeToStackMapSection();
+  }
+
+  if (TT.isOSBinFormatELF()) {
+    SM.serializeToStackMapSection();
+    FM.serializeToFaultMapSection();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+  RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target());
+  RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target());
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 000000000000..6798253d0f6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,141 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/Target/TargetMachine.h"
+
+// Implemented in X86MCInstLower.cpp
+namespace {
+  class X86MCInstLower;
+}
+
+namespace llvm {
+class MCStreamer;
+class MCSymbol;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+  const X86Subtarget *Subtarget;
+  StackMaps SM;
+  FaultMaps FM;
+  std::unique_ptr<MCCodeEmitter> CodeEmitter;
+
+  // This utility class tracks the length of a stackmap instruction's 'shadow'.
+  // It is used by the X86AsmPrinter to ensure that the stackmap shadow
+  // invariants (i.e. no other stackmaps, patchpoints, or control flow within
+  // the shadow) are met, while outputting a minimal number of NOPs for padding.
+  //
+  // To minimise the number of NOPs used, the shadow tracker counts the number
+  // of instruction bytes output since the last stackmap. Only if there are too
+  // few instruction bytes to cover the shadow are NOPs used for padding.
+  class StackMapShadowTracker {
+  public:
+    void startFunction(MachineFunction &MF) {
+      this->MF = &MF;
+    }
+    void count(MCInst &Inst, const MCSubtargetInfo &STI,
+               MCCodeEmitter *CodeEmitter);
+
+    // Called to signal the start of a shadow of RequiredSize bytes.
+    void reset(unsigned RequiredSize) {
+      RequiredShadowSize = RequiredSize;
+      CurrentShadowSize = 0;
+      InShadow = true;
+    }
+
+    // Called before every stackmap/patchpoint, and at the end of basic blocks,
+    // to emit any necessary padding-NOPs.
+    void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
+  private:
+    const MachineFunction *MF;
+    bool InShadow = false;
+
+    // RequiredShadowSize holds the length of the shadow specified in the most
+    // recently encountered STACKMAP instruction.
+    // CurrentShadowSize counts the number of bytes encoded since the most
+    // recently encountered STACKMAP, stopping when that number is greater than
+    // or equal to RequiredShadowSize.
+    unsigned RequiredShadowSize = 0, CurrentShadowSize = 0;
+  };
+
+  StackMapShadowTracker SMShadowTracker;
+
+  // All instructions emitted by the X86AsmPrinter should use this helper
+  // method.
+  //
+  // This helper function invokes the SMShadowTracker on each instruction before
+  // outputting it to the OutStream. This allows the shadow tracker to minimise
+  // the number of NOPs used for stackmap padding.
+  void EmitAndCountInstruction(MCInst &Inst);
+  void LowerSTACKMAP(const MachineInstr &MI);
+  void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+  void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
+
+  // XRay-specific lowering for X86.
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+                                     X86MCInstLower &MCIL);
+  void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+  // Helper function that emits the XRay sleds we've collected for a particular
+  // function.
+  void EmitXRayTable();
+
+public:
+  explicit X86AsmPrinter(TargetMachine &TM,
+                         std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
+
+  StringRef getPassName() const override {
+    return "X86 Assembly Printer";
+  }
+
+  const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+  void EmitStartOfAsmFile(Module &M) override;
+
+  void EmitEndOfAsmFile(Module &M) override;
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  }
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+
+  /// \brief Return the symbol for the specified constant pool entry.
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
+  bool doInitialization(Module &M) override {
+    SMShadowTracker.reset(0);
+    SM.reset();
+    return AsmPrinter::doInitialization(M);
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 000000000000..844c66d5a462
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,591 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
+// 2) It is possible to push memory arguments directly. So, if the
+//    the transformation is performed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+    NoX86CFOpt("no-x86-call-frame-opt",
+               cl::desc("Avoid optimizing x86 call frames for size"),
+               cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Information we know about a particular call site
+  struct CallContext {
+    CallContext()
+        : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+          MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+
+    // Iterator referring to the frame setup instruction
+    MachineBasicBlock::iterator FrameSetup;
+
+    // Actual call instruction
+    MachineInstr *Call;
+
+    // A copy of the stack pointer
+    MachineInstr *SPCopy;
+
+    // The total displacement of all passed parameters
+    int64_t ExpectedDist;
+
+    // The sequence of movs used to pass the parameters
+    SmallVector<MachineInstr *, 4> MovVector;
+
+    // True if this call site has no stack parameters
+    bool NoStackParams;
+
+    // True if this call site can use push instructions
+    bool UsePush;
+  };
+
+  typedef SmallVector<CallContext, 8> ContextVector;
+
+  bool isLegal(MachineFunction &MF);
+
+  bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
+
+  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, CallContext &Context);
+
+  void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  enum InstClassification { Convert, Skip, Exit };
+
+  InstClassification classifyInstruction(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         const X86RegisterInfo &RegInfo,
+                                         DenseSet<unsigned int> &UsedRegs);
+
+  StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
+
+  const TargetInstrInfo *TII;
+  const X86FrameLowering *TFL;
+  const X86Subtarget *STI;
+  MachineRegisterInfo *MRI;
+  unsigned SlotSize;
+  unsigned Log2SlotSize;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal.
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // Work around LLVM PR30879 (bad interaction between CFO and libunwind)
+  if (STI->isTargetFreeBSD() && STI->is32Bit() &&
+      STI->getTargetTriple().getOSMajorVersion() >= 12)
+    return false;
+
+  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+  // in the compact unwind encoding that Darwin uses. So, bail if there
+  // is a danger of that being generated.
+  if (STI->isTargetDarwin() &&
+      (!MF.getLandingPads().empty() ||
+       (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
+    return false;
+
+  // It is not valid to change the stack pointer outside the prolog/epilog
+  // on 64-bit Windows.
+  if (STI->isTargetWin64())
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      } else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+                                            ContextVector &CallSeqVector) {
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  unsigned StackAlign = TFL->getStackAlignment();
+
+  int64_t Advantage = 0;
+  for (auto CC : CallSeqVector) {
+    // Call sites where no parameters are passed on the stack
+    // do not affect the cost, since there needs to be no
+    // stack adjustment.
+    if (CC.NoStackParams)
+      continue;
+
+    if (!CC.UsePush) {
+      // If we don't use pushes for a particular call site,
+      // we pay for not having a reserved call frame with an
+      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+      // depending on the size of the constant.
+      // TODO: Callee-pop functions should have a smaller penalty, because
+      // an add is needed even with a reserved call frame.
+      Advantage -= 6;
+    } else {
+      // We can use pushes. First, account for the fixed costs.
+      // We'll need a add after the call.
+      Advantage -= 3;
+      // If we have to realign the stack, we'll also need a sub before
+      if (CC.ExpectedDist % StackAlign)
+        Advantage -= 3;
+      // Now, for each push, we save ~3 bytes. For small constants, we actually,
+      // save more (up to 5 bytes), but 3 should be a good approximation.
+      Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
+    }
+  }
+
+  return Advantage >= 0;
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TFL = STI->getFrameLowering();
+  MRI = &MF.getRegInfo();
+
+  const X86RegisterInfo &RegInfo =
+      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+  SlotSize = RegInfo.getSlotSize();
+  assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
+  Log2SlotSize = Log2_32(SlotSize);
+
+  if (skipFunction(*MF.getFunction()) || !isLegal(MF))
+    return false;
+
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+  bool Changed = false;
+
+  ContextVector CallSeqVector;
+
+  for (auto &MBB : MF)
+    for (auto &MI : MBB)
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        CallContext Context;
+        collectCallInfo(MF, MBB, MI, Context);
+        CallSeqVector.push_back(Context);
+      }
+
+  if (!isProfitable(MF, CallSeqVector))
+    return false;
+
+  for (auto CC : CallSeqVector) {
+    if (CC.UsePush) {
+      adjustCallSequence(MF, CC);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+X86CallFrameOptimization::InstClassification
+X86CallFrameOptimization::classifyInstruction(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
+  if (MI == MBB.end())
+    return Exit;
+
+  // The instructions we actually care about are movs onto the stack
+  int Opcode = MI->getOpcode();
+  if (Opcode == X86::MOV32mi   || Opcode == X86::MOV32mr ||
+      Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
+    return Convert;
+
+  // Not all calling conventions have only stack MOVs between the stack
+  // adjust and the call.
+
+  // We want to tolerate other instructions, to cover more cases.
+  // In particular:
+  // a) PCrel calls, where we expect an additional COPY of the basereg.
+  // b) Passing frame-index addresses.
+  // c) Calling conventions that have inreg parameters. These generate
+  //    both copies and movs into registers.
+  // To avoid creating lots of special cases, allow any instruction
+  // that does not write into memory, does not def or use the stack
+  // pointer, and does not def any register that was used by a preceding
+  // push.
+  // (Reading from memory is allowed, even if referenced through a
+  // frame index, since these will get adjusted properly in PEI)
+
+  // The reason for the last condition is that the pushes can't replace
+  // the movs in place, because the order must be reversed.
+  // So if we have a MOV32mr that uses EDX, then an instruction that defs
+  // EDX, and then the call, after the transformation the push will use
+  // the modified version of EDX, and not the original one.
+  // Since we are still in SSA form at this point, we only need to
+  // make sure we don't clobber any *physical* registers that were
+  // used by an earlier mov that will become a push.
+
+  if (MI->isCall() || MI->mayStore())
+    return Exit;
+
+  for (const MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    unsigned int Reg = MO.getReg();
+    if (!RegInfo.isPhysicalRegister(Reg))
+      continue;
+    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
+      return Exit;
+    if (MO.isDef()) {
+      for (unsigned int U : UsedRegs)
+        if (RegInfo.regsOverlap(Reg, U))
+          return Exit;
+    }
+  }
+
+  return Skip;
+}
+
+void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator I,
+                                               CallContext &Context) {
+  // Check that this particular call sequence is amenable to the
+  // transformation.
+  const X86RegisterInfo &RegInfo =
+      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+  // We expect to enter this at the beginning of a call sequence
+  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+  MachineBasicBlock::iterator FrameSetup = I++;
+  Context.FrameSetup = FrameSetup;
+
+  // How much do we adjust the stack? This puts an upper bound on
+  // the number of parameters actually passed on it.
+  unsigned int MaxAdjust =
+      FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
+
+  // A zero adjustment means no stack parameters
+  if (!MaxAdjust) {
+    Context.NoStackParams = true;
+    return;
+  }
+
+  // Skip over DEBUG_VALUE.
+  // For globals in PIC mode, we can have some LEAs here. Skip them as well.
+  // TODO: Extend this to something that covers more cases.
+  while (I->getOpcode() == X86::LEA32r || I->isDebugValue())
+    ++I;
+
+  unsigned StackPtr = RegInfo.getStackRegister();
+  // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
+  // register here.  If it's there, use that virtual register as stack pointer
+  // instead.
+  if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() &&
+      I->getOperand(1).getReg() == StackPtr) {
+    Context.SPCopy = &*I++;
+    StackPtr = Context.SPCopy->getOperand(0).getReg();
+  }
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case - a sequence of store instructions that
+  // push a sequence of stack-slot-aligned values onto the stack, with
+  // no gaps between them.
+  if (MaxAdjust > 4)
+    Context.MovVector.resize(MaxAdjust, nullptr);
+
+  InstClassification Classification;
+  DenseSet<unsigned int> UsedRegs;
+
+  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
+         Exit) {
+    if (Classification == Skip) {
+      ++I;
+      continue;
+    }
+
+    // We know the instruction has a supported store opcode.
+    // We only want movs of the form:
+    // mov imm/reg, k(%StackPtr)
+    // If we run into something else, bail.
+    // Note that AddrBaseReg may, counter to its name, not be a register,
+    // but rather a frame index.
+    // TODO: Support the fi case. This should probably work now that we
+    // have the infrastructure to track the stack pointer within a call
+    // sequence.
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+    assert(StackDisp >= 0 &&
+           "Negative stack displacement when passing parameters");
+
+    // We really don't want to consider the unaligned case.
+    if (StackDisp & (SlotSize - 1))
+      return;
+    StackDisp >>= Log2SlotSize;
+
+    assert((size_t)StackDisp < Context.MovVector.size() &&
+           "Function call has more parameters than the stack is adjusted for.");
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (Context.MovVector[StackDisp] != nullptr)
+      return;
+    Context.MovVector[StackDisp] = &*I;
+
+    for (const MachineOperand &MO : I->uses()) {
+      if (!MO.isReg())
+        continue;
+      unsigned int Reg = MO.getReg();
+      if (RegInfo.isPhysicalRegister(Reg))
+        UsedRegs.insert(Reg);
+    }
+
+    ++I;
+  }
+
+  // We now expect the end of the sequence. If we stopped early,
+  // or reached the end of the block without finding a call, bail.
+  if (I == MBB.end() || !I->isCall())
+    return;
+
+  Context.Call = &*I;
+  if ((++I)->getOpcode() != FrameDestroyOpcode)
+    return;
+
+  // Now, go through the vector, and see that we don't have any gaps,
+  // but only a series of MOVs.
+  auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
+  for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
+    if (*MMI == nullptr)
+      break;
+
+  // If the call had no parameters, do nothing
+  if (MMI == Context.MovVector.begin())
+    return;
+
+  // We are either at the last parameter, or a gap.
+  // Make sure it's not a gap
+  for (; MMI != MME; ++MMI)
+    if (*MMI != nullptr)
+      return;
+
+  Context.UsePush = true;
+}
+
+void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+                                                  const CallContext &Context) {
+  // Ok, we can in fact do the transformation for this call.
+  // Do not remove the FrameSetup instruction, but adjust the parameters.
+  // PEI will end up finalizing the handling of this.
+  MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
+  MachineBasicBlock &MBB = *(FrameSetup->getParent());
+  FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+
+  DebugLoc DL = FrameSetup->getDebugLoc();
+  bool Is64Bit = STI->is64Bit();
+  // Now, iterate through the vector in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+  // replace uses.
+  for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
+    MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    MachineBasicBlock::iterator Push = nullptr;
+    unsigned PushOpcode;
+    switch (MOV->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected Opcode!");
+    case X86::MOV32mi:
+    case X86::MOV64mi32:
+      PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
+      // If the operand is a small (8-bit) immediate, we can use a
+      // PUSH instruction with a shorter encoding.
+      // Note that isImm() may fail even though this is a MOVmi, because
+      // the operand can also be a symbol.
+      if (PushOp.isImm()) {
+        int64_t Val = PushOp.getImm();
+        if (isInt<8>(Val))
+          PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
+      }
+      Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+                 .addOperand(PushOp);
+      break;
+    case X86::MOV32mr:
+    case X86::MOV64mr:
+      unsigned int Reg = PushOp.getReg();
+
+      // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
+      // in preparation for the PUSH64. The upper 32 bits can be undef.
+      if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+        unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+        Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
+        BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
+        BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
+          .addReg(UndefReg)
+          .addOperand(PushOp)
+          .addImm(X86::sub_32bit);
+      }
+
+      // If PUSHrmm is not slow on this target, try to fold the source of the
+      // push into the instruction.
+      bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+
+      // Check that this is legal to fold. Right now, we're extremely
+      // conservative about that.
+      MachineInstr *DefMov = nullptr;
+      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+        PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
+        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));
+
+        unsigned NumOps = DefMov->getDesc().getNumOperands();
+        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+          Push->addOperand(DefMov->getOperand(i));
+
+        DefMov->eraseFromParent();
+      } else {
+        PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+                   .addReg(Reg)
+                   .getInstr();
+      }
+      break;
+    }
+
+    // For debugging, when using SP-based CFA, we need to adjust the CFA
+    // offset after each push.
+    // TODO: This is needed only if we require precise CFA.
+    if (!TFL->hasFP(MF))
+      TFL->BuildCFI(
+          MBB, std::next(Push), DL,
+          MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
+
+    MBB.erase(MOV);
+  }
+
+  // The stack-pointer copy is no longer used in the call sequences.
+  // There should not be any other users, but we can't commit to that, so:
+  if (Context.SPCopy && MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+    Context.SPCopy->eraseFromParent();
+
+  // Once we've done this, we need to make sure PEI doesn't assume a reserved
+  // frame.
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  FuncInfo->setHasPushSequences(true);
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+  // Do an extremely restricted form of load folding.
+  // ISel will often create patterns like:
+  // movl    4(%edi), %eax
+  // movl    8(%edi), %ecx
+  // movl    12(%edi), %edx
+  // movl    %edx, 8(%esp)
+  // movl    %ecx, 4(%esp)
+  // movl    %eax, (%esp)
+  // call
+  // Get rid of those with prejudice.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return nullptr;
+
+  // Make sure this is the only use of Reg.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineInstr &DefMI = *MRI->getVRegDef(Reg);
+
+  // Make sure the def is a MOV from memory.
+  // If the def is in another block, give up.
+  if ((DefMI.getOpcode() != X86::MOV32rm &&
+       DefMI.getOpcode() != X86::MOV64rm) ||
+      DefMI.getParent() != FrameSetup->getParent())
+    return nullptr;
+
+  // Make sure we don't have any instructions between DefMI and the
+  // push that make folding the load illegal.
+  for (MachineBasicBlock::iterator I = DefMI; I != FrameSetup; ++I)
+    if (I->isLoadFoldBarrier())
+      return nullptr;
+
+  return &DefMI;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
new file mode 100644
index 000000000000..5ae4962378d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -0,0 +1,46 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering of LLVM calls to machine code calls for
+/// GlobalISel.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CallLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "This shouldn't be built without GISel"
+#endif
+
+X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
+    : CallLowering(&TLI) {}
+
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+                                  const Value *Val, unsigned VReg) const {
+  // TODO: handle functions returning non-void values.
+  if (Val)
+    return false;
+
+  MIRBuilder.buildInstr(X86::RET).addImm(0);
+
+  return true;
+}
+
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+                                           const Function &F,
+                                           ArrayRef<unsigned> VRegs) const {
+  // TODO: handle functions with one or more arguments.
+  return F.arg_empty();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h
new file mode 100644
index 000000000000..f2672f09d855
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h
@@ -0,0 +1,39 @@
+//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file describes how to lower LLVM calls to machine code calls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING
+#define LLVM_LIB_TARGET_X86_X86CALLLOWERING
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+
+namespace llvm {
+
+class Function;
+class MachineIRBuilder;
+class X86TargetLowering;
+class Value;
+
+class X86CallLowering : public CallLowering {
+public:
+  X86CallLowering(const X86TargetLowering &TLI);
+
+  bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
+                   unsigned VReg) const override;
+
+  bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+                            ArrayRef<unsigned> VRegs) const override;
+};
+} // End of namespace llvm;
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
new file mode 100644
index 000000000000..59dde982f512
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -0,0 +1,208 @@
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl   -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // List of GPR registers that are available to store values in regcall
+  // calling convention.
+  static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+                                      X86::ESI};
+
+  // The vector will save all the available registers for allocation.
+  SmallVector<unsigned, 5> AvailableRegs;
+
+  // searching for the available registers.
+  for (auto Reg : RegList) {
+    if (!State.isAllocated(Reg))
+      AvailableRegs.push_back(Reg);
+  }
+
+  const size_t RequiredGprsUponSplit = 2;
+  if (AvailableRegs.size() < RequiredGprsUponSplit)
+    return false; // Not enough free registers - continue the search.
+
+  // Allocating the available registers.
+  for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+    // Marking the register as located.
+    unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+    // Since we previously made sure that 2 registers are available
+    // we expect that a real register number will be returned.
+    assert(Reg && "Expecting a register will be available");
+
+    // Assign the value to the allocated register
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  }
+
+  // Successful in allocating regsiters - stop scanning next rules.
+  return true;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+  if (ValVT.is512BitVector()) {
+    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};
+    return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+  }
+
+  if (ValVT.is256BitVector()) {
+    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+                                           X86::YMM3, X86::YMM4, X86::YMM5};
+    return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+  }
+
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+                                         X86::XMM3, X86::XMM4, X86::XMM5};
+  return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+  return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+                                            MVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+
+  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+  bool Is64bit = static_cast<const X86Subtarget &>(
+                     State.getMachineFunction().getSubtarget())
+                     .is64Bit();
+
+  for (auto Reg : RegList) {
+    // If the register is not marked as allocated - assign to it.
+    if (!State.isAllocated(Reg)) {
+      unsigned AssigedReg = State.AllocateReg(Reg);
+      assert(AssigedReg == Reg && "Expecting a valid register allocation");
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));
+      return true;
+    }
+    // If the register is marked as shadow allocated - assign to it.
+    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return true;
+    }
+  }
+
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "
+                   "an available register.");
+  return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    // If R9 was already assigned it means that we are after the fourth element
+    // and because this is not an HVA / Vector type, we need to allocate
+    // shadow XMM register.
+    if (State.isAllocated(X86::R9)) {
+      // Assign shadow XMM register.
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+    }
+
+    return false;
+  }
+
+  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+    // Assign shadow GPR register.
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+    // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+      // In Vectorcall Calling convention, additional shadow stack can be
+      // created on top of the basic 32 bytes of win64.
+      // It can happen if the fifth or sixth argument is vector type or HVA.
+      // At that case for each argument a shadow stack of 8 bytes is allocated.
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)
+        State.AllocateStack(8, 8);
+
+      if (!ArgFlags.isHva()) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true; // Allocated a register - Stop the search.
+      }
+    }
+  }
+
+  // If this is an HVA - Stop the search,
+  // otherwise continue the search.
+  return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating point type, for example,
+  //  a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    return false;
+  }
+
+  if (ArgFlags.isHva())
+    return true; // If this is an HVA - Stop the search.
+
+  // Assign XMM register.
+  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // In case we did not find an available XMM register for a vector -
+  // pass it indirectly.
+  // It is similar to CCPassIndirect, with the addition of inreg.
+  if (!ValVT.isFloatingPoint()) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::Indirect;
+    ArgFlags.setInReg();
+  }
+
+  return false; // No register was assigned - Continue the search.
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 000000000000..c49a6838fa44
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,121 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+/// When regcall calling convention compiled to 32 bit arch, special treatment
+/// is required for 64 bit masks.
+/// The value should be assigned to two GPRs.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to X86 C calling convention on Release builds.
+  return false;
+}
+
+inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State) {
+  // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure
+  // not to split i64 and double between a register and stack
+  static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX};
+  static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]);
+  
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // If this is the first part of an double/i64/i128, or if we're already
+  // in the middle of a split, add to the pending list. If this is not
+  // the end of the split, return, otherwise go on to process the pending
+  // list
+  if (ArgFlags.isSplit() || !PendingMembers.empty()) {
+    PendingMembers.push_back(
+        CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+    if (!ArgFlags.isSplitEnd())
+      return true;
+  }
+
+  // If there are no pending members, we are not in the middle of a split,
+  // so do the usual inreg stuff.
+  if (PendingMembers.empty()) {
+    if (unsigned Reg = State.AllocateReg(RegList)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return true;
+    }
+    return false;
+  }
+
+  assert(ArgFlags.isSplitEnd());
+
+  // We now have the entire original argument in PendingMembers, so decide
+  // whether to use registers or the stack.
+  // Per the MCU ABI:
+  // a) To use registers, we need to have enough of them free to contain
+  // the entire argument.
+  // b) We never want to use more than 2 registers for a single argument.
+
+  unsigned FirstFree = State.getFirstUnallocated(RegList);
+  bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree);
+
+  for (auto &It : PendingMembers) {
+    if (UseRegs)
+      It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
+    else
+      It.convertToMem(State.AllocateStack(4, 4));
+    State.addLoc(It);
+  }
+
+  PendingMembers.clear();
+
+  return true;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 000000000000..cf7bc981b8a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,1121 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+    : CCIf<!strconcat("static_cast<const X86Subtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).", F),
+           A>;
+
+// Register classes for RegCall
+class RC_X86_RegCall {
+  list<Register> GPR_8 = [];
+  list<Register> GPR_16 = [];
+  list<Register> GPR_32 = [];
+  list<Register> GPR_64 = [];
+  list<Register> FP_CALL = [FP0];
+  list<Register> FP_RET = [FP0, FP1];
+  list<Register> XMM = [];
+  list<Register> YMM = [];
+  list<Register> ZMM = [];
+}
+
+// RegCall register classes for 32 bits
+def RC_X86_32_RegCall : RC_X86_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL];
+  let GPR_16 = [AX, CX, DX, DI, SI];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI];
+  let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle []
+                      ///< \todo Fix AssignToReg to enable empty lists
+  let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+  let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+  let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
+class RC_X86_64_RegCall : RC_X86_RegCall {
+  let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
+  let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+             YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15];
+  let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7,
+             ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15];
+}
+
+def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
+}
+
+def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15];
+}
+
+// X86-64 Intel regcall calling convention.
+multiclass X86_RegCall_base<RC_X86_RegCall RC> {
+def CC_#NAME : CallingConv<[
+  // Handles byval parameters.
+    CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
+    CCIfByVal<CCPassByVal<4, 4>>,
+
+    // Promote i1/i8/i16 arguments to i32.
+    CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+    // Promote v8i1/v16i1/v32i1 arguments to i32.
+    CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
+
+    // bool, char, int, enum, long, pointer --> GPR
+    CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+    // long long, __int64 --> GPR
+    CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+    // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+    CCIfType<[v64i1], CCPromoteToType<i64>>,
+    CCIfSubtarget<"is64Bit()", CCIfType<[i64], 
+      CCAssignToReg<RC.GPR_64>>>,
+    CCIfSubtarget<"is32Bit()", CCIfType<[i64], 
+      CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+    // float, double, float128 --> XMM
+    // In the case of SSE disabled --> save to stack
+    CCIfType<[f32, f64, f128], 
+      CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+    // long double --> FP
+    CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>,
+
+    // __m128, __m128i, __m128d --> XMM
+    // In the case of SSE disabled --> save to stack
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 
+      CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+    // __m256, __m256i, __m256d --> YMM
+    // In the case of SSE disabled --> save to stack
+    CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 
+      CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+    // __m512, __m512i, __m512d --> ZMM
+    // In the case of SSE disabled --> save to stack
+    CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], 
+      CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>,
+
+    // If no register was found -> assign to stack
+
+    // In 64 bit, assign 64/32 bit values to 8 byte stack
+    CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64], 
+      CCAssignToStack<8, 8>>>,
+
+    // In 32 bit, assign 64/32 bit values to 8/4 byte stack
+    CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+    CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
+
+    // MMX type gets 8 byte slot in stack , while alignment depends on target
+    CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
+    CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+    // float 128 get stack slots whose size and alignment depends 
+    // on the subtarget.
+    CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+    // Vectors get 16-byte stack slots that are 16-byte aligned.
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 
+      CCAssignToStack<16, 16>>,
+
+    // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+    CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 
+      CCAssignToStack<32, 32>>,
+
+    // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+    CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+]>;
+
+def RetCC_#NAME : CallingConv<[
+    // Promote i1, v8i1 arguments to i8.
+    CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+
+    // Promote v16i1 arguments to i16.
+    CCIfType<[v16i1], CCPromoteToType<i16>>,
+
+    // Promote v32i1 arguments to i32.
+    CCIfType<[v32i1], CCPromoteToType<i32>>,
+
+    // bool, char, int, enum, long, pointer --> GPR
+    CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
+    CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
+    CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+    // long long, __int64 --> GPR
+    CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+    // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32)
+    CCIfType<[v64i1], CCPromoteToType<i64>>,
+    CCIfSubtarget<"is64Bit()", CCIfType<[i64], 
+      CCAssignToReg<RC.GPR_64>>>,
+    CCIfSubtarget<"is32Bit()", CCIfType<[i64], 
+      CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>,
+
+    // long double --> FP
+    CCIfType<[f80], CCAssignToReg<RC.FP_RET>>,
+
+    // float, double, float128 --> XMM
+    CCIfType<[f32, f64, f128], 
+      CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+    // __m128, __m128i, __m128d --> XMM
+    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 
+      CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>,
+
+    // __m256, __m256i, __m256d --> YMM
+    CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 
+      CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>,
+
+    // __m512, __m512i, __m512d --> ZMM
+    CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], 
+      CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>>
+]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.  For i8, the ABI
+  // requires the values to be in AL and AH, however this code uses AL and DL
+  // instead. This is because using AH for the second register conflicts with
+  // the way LLVM does multiple return values -- a return of {i16,i8} would end
+  // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+  // for functions that return two i8 values are currently expected to pack the
+  // values into an i16 (which uses AX, and thus AL:AH).
+  //
+  // For code that doesn't care about the ABI, we allow returning more than two
+  // integer values in registers.
+  CCIfType<[i1],  CCPromoteToType<i8>>,
+  CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+  // Boolean vectors of AVX-512 are returned in SIMD registers.
+  // The call from AVX to AVX-512 function should work,
+  // since the boolean types in AVX/AVX2 are promoted by default.
+  CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i32>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i16>>,
+  CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+  CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+  CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+  // Vector types are returned in XMM0 and XMM1, when they fit.  XMM2 and XMM3
+  // can only be used by ABI non-compliant code. If the target doesn't have XMM
+  // registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX target feature.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX-512 target feature.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+  // Long double types are always returned in FP0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in FP0, unless marked
+  // with "inreg" (used here to distinguish one kind of reg from another,
+  // weirdly; this is really the sse-regparm calling convention) in which
+  // case they use XMM0, otherwise it is the same as the common X86 calling
+  // conv.
+  CCIfInReg<CCIfSubtarget<"hasSSE2()",
+    CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+  CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+  // SSE2.
+  // This can happen when a float, 2 x float, or 3 x float vector is split by
+  // target lowering, and is returned in 1-3 sse regs.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+  // For integers, ECX can be used as an extra return register
+  CCIfType<[i8],  CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+  // Otherwise, it is the same as the common X86 calling convention.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  // No more than 4 registers
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // i32, i64 in the standard way
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+  // Promote all types to i32
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-32 Vectorcall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+  // Floating Point types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, f128],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // Return integers in the standard way.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
+
+  // MMX vector types are always returned in XMM0.
+  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+  // The X86-Win64 calling convention always returns __m64 values in RAX.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // Otherwise, everything is the same as 'normal' X86-64 C CC.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128], 
+    CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Otherwise, everything is the same as Windows X86-64 C CC.
+  CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: RAX
+  CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+def RetCC_X86_64_Swift : CallingConv<[
+
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+  // For integers, ECX, R8D can be used as extra return registers.
+  CCIfType<[i1],  CCPromoteToType<i8>>,
+  CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX, R8]>>,
+
+  // XMM0, XMM1, XMM2 and XMM3 can be used to return FP values.
+  CCIfType<[f32], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+  CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+// X86-64 HHVM return-value convention.
+def RetCC_X86_64_HHVM: CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: could return in any GP register save RSP and R12.
+  CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9,
+                                 RAX, R10, R11, R13, R14, R15]>>
+]>;
+
+
+defm X86_32_RegCall :
+	 X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_Win64_RegCall :
+     X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_SysV64_RegCall :
+     X86_RegCall_base<RC_X86_64_RegCall_SysV>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // If HiPE, use RetCC_X86_32_HiPE.
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
+
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+  // HiPE uses RetCC_X86_64_HiPE
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+  // Handle JavaScript calls.
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+  // Handle Swift calls.
+  CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
+
+  // Handle explicit CC selection
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
+  // Handle Vectorcall CC
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
+  // Handle HHVM calls.
+  CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+
+  CCIfCC<"CallingConv::X86_RegCall",
+          CCIfSubtarget<"isTargetWin64()",
+                        CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>,
+          
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+  // Check if this is the Intel OpenCL built-ins calling convention
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // Pass SwiftSelf in a callee saved register.
+  CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>,
+
+  // A SwiftError is passed in R12.
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
+  // For Swift Calling Convention, pass sret in %RAX.
+  CCIfCC<"CallingConv::Swift",
+    CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+  CCIfType<[x86mmx],
+            CCIfSubtarget<"isTargetDarwin()",
+            CCIfSubtarget<"hasSSE2()",
+            CCPromoteToType<v2i64>>>>,
+
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
+  // The call from AVX to AVX-512 function should work,
+  // since the boolean types in AVX/AVX2 are promoted by default.
+  CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i32>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i16>>,
+  CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+  CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+  CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+  // The first 8 256-bit vector arguments are passed in YMM registers, unless
+  // this is a vararg function.
+  // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+  // fixed arguments to vararg functions are supposed to be passed in
+  // registers.  Actually modeling that would be a lot of work, though.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          CCIfSubtarget<"hasFp256()",
+                          CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+                                         YMM4, YMM5, YMM6, YMM7]>>>>,
+
+  // The first 8 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCIfSubtarget<"hasAVX512()",
+            CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
+]>;
+
+// Calling convention for X86-64 HHVM.
+def CC_X86_64_HHVM : CallingConv<[
+  // Use all/any GP registers for args, except RSP.
+  CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15,
+                                 RDI, RSI, RDX, RCX, R8, R9,
+                                 RAX, R10, R11, R13, R14]>>
+]>;
+
+// Calling convention for helper functions in HHVM.
+def CC_X86_64_HHVM_C : CallingConv<[
+  // Pass the first argument in RBP.
+  CCIfType<[i64], CCAssignToReg<[RBP]>>,
+
+  // Otherwise it's the same as the regular C calling convention.
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+  // FIXME: Handle byval stuff.
+  // FIXME: Handle varargs.
+
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // 128 bit vectors are passed by pointer
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+  // 256 bit vectors are passed by pointer
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+  // 512 bit vectors are passed by pointer
+  CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
+  // The first 4 MMX vector arguments are passed in GPRs.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Do not pass the sret argument in RCX, the Win64 thiscall calling
+  // convention requires "this" to be passed in RCX.
+  CCIfCC<"CallingConv::X86_ThisCall",
+    CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8  , R9  ],
+                                                     [XMM1, XMM2, XMM3]>>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8  , R9  ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The first 4 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+                                   [RCX , RDX , R8  , R9  ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
+
+def CC_X86_Win64_VectorCall : CallingConv<[
+  CCCustom<"CC_X86_64_VectorCall">,
+
+  // Delegate to fastcall to handle integer types.
+  CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
+def CC_X86_64_GHC : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64],
+            CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+  // Pass in STG registers: F1, F2, F3, F4, D1, D2
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_WebKit_JS : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Only the first integer argument is passed in register.
+  CCIfType<[i32], CCAssignToReg<[EAX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX]>>,
+
+  // The remaining integer arguments are passed on the stack. 32bit integer and
+  // floating-point arguments are aligned to 4 byte and stored in 4 byte slots.
+  // 64bit integer and floating-point arguments are aligned to 8 byte and stored
+  // in 8 byte stack slots.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_32_Vector_Common : CallingConv<[
+  // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
+]>;
+
+// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
+// vector registers
+def CC_X86_32_Vector_Standard : CallingConv<[
+  // SSE vector arguments are passed in XMM registers.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2]>>>,
+
+  // AVX 256-bit vector arguments are passed in YMM registers.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasFp256()",
+                CCAssignToReg<[YMM0, YMM1, YMM2]>>>>,
+
+  // AVX 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+                CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
+
+  CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in
+// vector registers.
+def CC_X86_32_Vector_Darwin : CallingConv<[
+  // SSE vector arguments are passed in XMM registers.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+  // AVX 256-bit vector arguments are passed in YMM registers.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasFp256()",
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+  // AVX 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+                CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
+
+  CCDelegateTo<CC_X86_32_Vector_Common>
+]>;
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack.
+def CC_X86_32_Common : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // The first 3 float or double arguments, if marked 'inreg' and if the call
+  // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+  // The first 3 __m64 vector arguments are passed in mmx registers if the
+  // call is not a vararg call.
+  CCIfNotVarArg<CCIfType<[x86mmx],
+                CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+  // Long doubles get slots whose size depends on the subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
+  // The call from AVX to AVX-512 function should work,
+  // since the boolean types in AVX/AVX2 are promoted by default.
+  CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i32>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i16>>,
+  CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+  CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+  CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
+  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+  // Darwin passes vectors in a form that differs from the i386 psABI
+  CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
+
+  // Otherwise, drop to 'normal' X86-32 CC
+  CCDelegateTo<CC_X86_32_Vector_Standard>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_MCU : CallingConv<[
+  // Handles byval parameters.  Note that, like FastCC, we can't rely on
+  // the delegation to CC_X86_32_Common because that happens after code that
+  // puts arguments in registers.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // If the call is not a vararg call, some arguments may be passed
+  // in integer registers.
+  CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_Win32_VectorCall : CallingConv<[
+  // Pass floating point in XMMs
+  CCCustom<"CC_X86_32_VectorCall">,
+
+  // Delegate to fastcall to handle integer types.
+  CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
+def CC_X86_32_ThisCall_Common : CallingConv<[
+  // The first integer argument is passed in ECX
+  CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall_Mingw : CallingConv<[
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall_Win : CallingConv<[
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Pass sret arguments indirectly through stack.
+  CCIfSRet<CCAssignToStack<4, 4>>,
+
+  CCDelegateTo<CC_X86_32_ThisCall_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+  CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
+  CCDelegateTo<CC_X86_32_ThisCall_Win>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+  // Handles byval parameters.  Note that we can't rely on the delegation
+  // to CC_X86_32_Common for this because that happens after code that
+  // puts arguments in registers.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+  // The first 3 float or double arguments, if the call is not a vararg
+  // call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+  // Doubles get 8-byte slots that are 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1
+  CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+  CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8,  R9 ]>>>,
+
+  CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+  CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
+  // Pass masks in mask registers
+  CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfSubtarget<"is64Bit()",       CCDelegateTo<CC_X86_64_C>>,
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+def CC_X86_32_Intr : CallingConv<[
+  CCAssignToStack<4, 4>
+]>;
+
+def CC_X86_64_Intr : CallingConv<[
+  CCAssignToStack<8, 8>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+  // X86_INTR calling convention is valid in MCU target and should override the
+  // MCU calling convention. Thus, this should be checked before isTargetMCU().
+  CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
+  CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
+  CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
+
+  // Otherwise, drop to normal X86-32 CC
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+  CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
+  CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+    CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,
+  CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
+
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+  CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE,
+                                     (sequence "XMM%u", 6, 15))>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// uses rdi to pass a single parameter and rax for the return value. All other
+// GPRs are preserved.
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
+                                             R8, R9, R10, R11)>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;
+
+// All GPRs - except r11
+def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
+                                              R8, R9, R10, RSP)>;
+
+// All registers - except r11
+def CSR_64_RT_AllRegs     : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+                                                 (sequence "XMM%u", 0, 15))>;
+def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
+                                                 (sequence "YMM%u", 0, 15))>;
+
+def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+                                           R11, R12, R13, R14, R15, RBP,
+                                           (sequence "XMM%u", 0, 15))>;
+
+def CSR_32_AllRegs     : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI,
+                                              EDI)>;
+def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs,
+                                              (sequence "XMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX : CalleeSavedRegs<(add CSR_32_AllRegs,
+                                              (sequence "YMM%u", 0, 7))>;
+def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
+                                                 (sequence "ZMM%u", 0, 7),
+                                                 (sequence "K%u", 0, 7))>;
+
+def CSR_64_AllRegs     : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+                                                   (sequence "YMM%u", 0, 15)),
+                                              (sequence "XMM%u", 0, 15))>;
+def CSR_64_AllRegs_AVX512 : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
+                                                      (sequence "ZMM%u", 0, 31),
+                                                      (sequence "K%u", 0, 7)),
+                                                 (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+                                                  R13, R14, R15,
+                                                  (sequence "YMM%u", 6, 15))>;
+
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+                                                     R12, R13, R14, R15,
+                                                     (sequence "ZMM%u", 6, 21),
+                                                     K4, K5, K6, K7)>;
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
+                                                 (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+                                                  (sequence "ZMM%u", 16, 31),
+                                                  K4, K5, K6, K7)>;
+
+// Only R12 is preserved for PHP calls in HHVM.
+def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
+
+// Register calling convention preserves few GPR and XMM8-15
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>;
+def CSR_32_RegCall       : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
+                                           (sequence "XMM%u", 4, 7))>;                                            
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+                                              (sequence "R%u", 10, 15))>;
+def CSR_Win64_RegCall       : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,                                  
+                                              (sequence "XMM%u", 8, 15))>;
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
+                                               (sequence "R%u", 12, 15))>;
+def CSR_SysV64_RegCall       : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,               
+                                               (sequence "XMM%u", 8, 15))>;
+                                               
diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
new file mode 100755
index 000000000000..bdd1ab537bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -0,0 +1,213 @@
+//===----------------------- X86EvexToVex.cpp ----------------------------===//
+// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that goes over all AVX-512 instructions which
+/// are encoded using the EVEX prefix and if possible replaces them by their
+/// corresponding VEX encoding which is usually shorter by 2 bytes.
+/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
+/// instruction has a corresponding AVX/AVX2 opcode and when it does not
+/// use the xmm or the mask registers or xmm/ymm registers wuith indexes
+/// higher than 15.
+/// The pass applies code reduction on the generated code for AVX-512 instrs.
+///
+//===---------------------------------------------------------------------===//
+
+#include "InstPrinter/X86InstComments.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86InstrTablesInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
+#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+
+#define DEBUG_TYPE EVEX2VEX_NAME
+
+namespace {
+
+class EvexToVexInstPass : public MachineFunctionPass {
+
+  /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
+  typedef DenseMap<unsigned, uint16_t> EvexToVexTableType;
+  EvexToVexTableType EvexToVex128Table;
+  EvexToVexTableType EvexToVex256Table;
+
+  /// For EVEX instructions that can be encoded using VEX encoding, replace
+  /// them by the VEX encoding in order to reduce size.
+  bool CompressEvexToVexImpl(MachineInstr &MI) const;
+
+  /// For initializing the hash map tables of all AVX-512 EVEX
+  /// corresponding to AVX/AVX2 opcodes.
+  void AddTableEntry(EvexToVexTableType &EvexToVexTable, uint16_t EvexOp,
+                     uint16_t VexOp);
+
+public:
+  static char ID;
+
+  StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
+  EvexToVexInstPass() : MachineFunctionPass(ID) {
+    initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
+
+    // Initialize the EVEX to VEX 128 table map.
+    for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex128CompressTable) {
+      AddTableEntry(EvexToVex128Table, Entry.EvexOpcode, Entry.VexOpcode);
+    }
+
+    // Initialize the EVEX to VEX 256 table map.
+    for (X86EvexToVexCompressTableEntry Entry : X86EvexToVex256CompressTable) {
+      AddTableEntry(EvexToVex256Table, Entry.EvexOpcode, Entry.VexOpcode);
+    }
+  }
+
+  /// Loop over all of the basic blocks, replacing EVEX instructions
+  /// by equivalent VEX instructions when possible for reducing code size.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // This pass runs after regalloc and doesn't support VReg operands.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  /// Machine instruction info used throughout the class.
+  const X86InstrInfo *TII;
+};
+
+char EvexToVexInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+  return new EvexToVexInstPass();
+}
+
+bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  if (!ST.hasAVX512())
+    return false;
+
+  bool Changed = false;
+
+  /// Go over all basic blocks in function and replace
+  /// EVEX encoded instrs by VEX encoding when possible.
+  for (MachineBasicBlock &MBB : MF) {
+
+    // Traverse the basic block. 
+    for (MachineInstr &MI : MBB)      
+      Changed |= CompressEvexToVexImpl(MI);
+  }
+
+  return Changed;
+}
+
+void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
+                                      uint16_t EvexOp, uint16_t VexOp) {
+  EvexToVexTable[EvexOp] = VexOp;
+}
+
+// For EVEX instructions that can be encoded using VEX encoding
+// replace them by the VEX encoding in order to reduce size.
+bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
+
+  // VEX format.
+  // # of bytes: 0,2,3  1      1      0,1   0,1,2,4  0,1
+  //  [Prefixes] [VEX]  OPCODE ModR/M [SIB] [DISP]  [IMM]
+  //
+  // EVEX format.
+  //  # of bytes: 4    1      1      1      4       / 1         1
+  //  [Prefixes]  EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate]
+
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Check for EVEX instructions only.
+  if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
+    return false;
+ 
+  // Check for EVEX instructions with mask or broadcast as in these cases 
+  // the EVEX prefix is needed in order to carry this information 
+  // thus preventing the transformation to VEX encoding.
+  if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
+    return false;
+ 
+  // Check for non EVEX_V512 instrs only.
+  // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
+  if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
+    return false;  
+        
+  // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
+  bool IsEVEX_V128 =
+      (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
+
+  // EVEX_V256 instr: bit EVEX_L2 = 0, bit VEX_L = 1.
+  bool IsEVEX_V256 =
+      (!(Desc.TSFlags & X86II::EVEX_L2) && (Desc.TSFlags & X86II::VEX_L));
+
+  unsigned NewOpc = 0;
+
+  // Check for EVEX_V256 instructions.
+  if (IsEVEX_V256) {
+    // Search for opcode in the EvexToVex256 table.
+    auto It = EvexToVex256Table.find(MI.getOpcode());
+    if (It != EvexToVex256Table.end())
+      NewOpc = It->second;
+  }
+
+  // Check for EVEX_V128 or Scalar instructions.
+  else if (IsEVEX_V128) {
+    // Search for opcode in the EvexToVex128 table.
+    auto It = EvexToVex128Table.find(MI.getOpcode());
+    if (It != EvexToVex128Table.end())
+      NewOpc = It->second;
+  }
+
+  if (!NewOpc)
+    return false;
+
+  auto isHiRegIdx = [](unsigned Reg) {
+    // Check for XMM register with indexes between 16 - 31.
+    if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
+      return true;
+
+    // Check for YMM register with indexes between 16 - 31.
+    if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
+      return true;
+
+    return false;
+  };
+
+  // Check that operands are not ZMM regs or
+  // XMM/YMM regs with hi indexes between 16 - 31.
+  for (const MachineOperand &MO : MI.explicit_operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+
+    assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31));
+
+    if (isHiRegIdx(Reg))
+      return false;
+  }
+ 
+  const MCInstrDesc &MCID = TII->get(NewOpc);
+  MI.setDesc(MCID);
+  MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
+  return true; 
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
new file mode 100644
index 000000000000..192b942490e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -0,0 +1,297 @@
+//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, other late
+// optimizations, or simply the encoding of the instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
+#include "llvm/IR/GlobalValue.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pseudo"
+
+namespace {
+class X86ExpandPseudo : public MachineFunctionPass {
+public:
+  static char ID;
+  X86ExpandPseudo() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const X86Subtarget *STI;
+  const X86InstrInfo *TII;
+  const X86RegisterInfo *TRI;
+  const X86MachineFunctionInfo *X86FI;
+  const X86FrameLowering *X86FL;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override {
+    return "X86 pseudo instruction expansion pass";
+  }
+
+private:
+  bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+  bool ExpandMBB(MachineBasicBlock &MBB);
+};
+char X86ExpandPseudo::ID = 0;
+} // End anonymous namespace.
+
+/// If \p MBBI is a pseudo instruction, this method expands
+/// it to the corresponding (sequence of) actual instruction(s).
+/// \returns true if \p MBBI has been expanded.
+bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::TCRETURNdi:
+  case X86::TCRETURNdicc:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNdi64cc:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64: {
+    bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+    int Offset = 0;
+    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+    // Incoporate the retaddr area.
+    Offset = StackAdj - MaxTCDelta;
+    assert(Offset >= 0 && "Offset should never be negative");
+
+    if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
+      assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
+    }
+
+    if (Offset) {
+      // Check for possible merge with preceding ADD instruction.
+      Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
+      X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+    }
+
+    // Jump to label or value in register.
+    bool IsWin64 = STI->isTargetWin64();
+    if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
+        Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
+      unsigned Op;
+      switch (Opcode) {
+      case X86::TCRETURNdi:
+        Op = X86::TAILJMPd;
+        break;
+      case X86::TCRETURNdicc:
+        Op = X86::TAILJMPd_CC;
+        break;
+      case X86::TCRETURNdi64cc:
+        assert(!IsWin64 && "Conditional tail calls confuse the Win64 unwinder.");
+        // TODO: We could do it for Win64 "leaf" functions though; PR30337.
+        Op = X86::TAILJMPd64_CC;
+        break;
+      default:
+        // Note: Win64 uses REX prefixes indirect jumps out of functions, but
+        // not direct ones.
+        Op = X86::TAILJMPd64;
+        break;
+      }
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+      if (JumpTarget.isGlobal()) {
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      } else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+      if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
+        MIB.addImm(MBBI->getOperand(2).getImm());
+      }
+
+    } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
+      unsigned Op = (Opcode == X86::TCRETURNmi)
+                        ? X86::TAILJMPm
+                        : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
+      for (unsigned i = 0; i != 5; ++i)
+        MIB.addOperand(MBBI->getOperand(i));
+    } else if (Opcode == X86::TCRETURNri64) {
+      BuildMI(MBB, MBBI, DL,
+              TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
+          .addReg(JumpTarget.getReg(), RegState::Kill);
+    } else {
+      BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
+          .addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr &NewMI = *std::prev(MBBI);
+    NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+
+    return true;
+  }
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    MachineOperand &DestAddr = MBBI->getOperand(0);
+    assert(DestAddr.isReg() && "Offset should be in register!");
+    const bool Uses64BitFramePtr =
+        STI->isTarget64BitLP64() || STI->isTargetNaCl64();
+    unsigned StackPtr = TRI->getStackRegister();
+    BuildMI(MBB, MBBI, DL,
+            TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+        .addReg(DestAddr.getReg());
+    // The EH_RETURN pseudo is really removed during the MC Lowering.
+    return true;
+  }
+  case X86::IRET: {
+    // Adjust stack to erase error code
+    int64_t StackAdj = MBBI->getOperand(0).getImm();
+    X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
+    // Replace pseudo with machine iret
+    BuildMI(MBB, MBBI, DL,
+            TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
+    MBB.erase(MBBI);
+    return true;
+  }
+  case X86::RET: {
+    // Adjust stack to erase error code
+    int64_t StackAdj = MBBI->getOperand(0).getImm();
+    MachineInstrBuilder MIB;
+    if (StackAdj == 0) {
+      MIB = BuildMI(MBB, MBBI, DL,
+                    TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
+    } else if (isUInt<16>(StackAdj)) {
+      MIB = BuildMI(MBB, MBBI, DL,
+                    TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
+                .addImm(StackAdj);
+    } else {
+      assert(!STI->is64Bit() &&
+             "shouldn't need to do this for x86_64 targets!");
+      // A ret can only handle immediates as big as 2**16-1.  If we need to pop
+      // off bytes before the return address, we must do it manually.
+      BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
+      X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
+      BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
+      MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
+    }
+    for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
+      MIB.addOperand(MBBI->getOperand(I));
+    MBB.erase(MBBI);
+    return true;
+  }
+  case X86::EH_RESTORE: {
+    // Restore ESP and EBP, and optionally ESI if required.
+    bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
+        MBB.getParent()->getFunction()->getPersonalityFn()));
+    X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
+    MBBI->eraseFromParent();
+    return true;
+  }
+  case X86::LCMPXCHG8B_SAVE_EBX:
+  case X86::LCMPXCHG16B_SAVE_RBX: {
+    // Perform the following transformation.
+    // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
+    // =>
+    // [E|R]BX = InArg
+    // actualcmpxchg Addr
+    // [E|R]BX = SaveRbx
+    const MachineOperand &InArg = MBBI->getOperand(6);
+    unsigned SaveRbx = MBBI->getOperand(7).getReg();
+
+    unsigned ActualInArg =
+        Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+    // Copy the input argument of the pseudo into the argument of the
+    // actual instruction.
+    TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
+                     InArg.isKill());
+    // Create the actual instruction.
+    unsigned ActualOpc =
+        Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
+    MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
+    // Copy the operands related to the address.
+    for (unsigned Idx = 1; Idx < 6; ++Idx)
+      NewInstr->addOperand(MBBI->getOperand(Idx));
+    // Finally, restore the value of RBX.
+    TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
+                     /*SrcIsKill*/ true);
+
+    // Delete the pseudo.
+    MBBI->eraseFromParent();
+    return true;
+  }
+  }
+  llvm_unreachable("Previous switch has a fallthrough?");
+}
+
+/// Expand all pseudo instructions contained in \p MBB.
+/// \returns true if any expansion occurred for \p MBB.
+bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  // MBBI may be invalidated by the expansion.
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= ExpandMI(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  X86FL = STI->getFrameLowering();
+
+  bool Modified = false;
+  for (MachineBasicBlock &MBB : MF)
+    Modified |= ExpandMBB(MBB);
+  return Modified;
+}
+
+/// Returns an instance of the pseudo instruction expansion pass.
+FunctionPass *llvm::createX86ExpandPseudoPass() {
+  return new X86ExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 000000000000..c890fdd1e519
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,3933 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86CallingConv.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel final : public FastISel {
+  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget *Subtarget;
+
+  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+  /// floating point ops.
+  /// When SSE is available, use it for f32 operations.
+  /// When SSE2 is available, use it for f64 operations.
+  bool X86ScalarSSEf64;
+  bool X86ScalarSSEf32;
+
+public:
+  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+                       const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo) {
+    Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
+    X86ScalarSSEf64 = Subtarget->hasSSE2();
+    X86ScalarSSEf32 = Subtarget->hasSSE1();
+  }
+
+  bool fastSelectInstruction(const Instruction *I) override;
+
+  /// \brief The specified machine instr operand is a vreg, and that
+  /// vreg is being provided by the specified load instruction.  If possible,
+  /// try to fold the load as an operand to the instruction, returning true if
+  /// possible.
+  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                           const LoadInst *LI) override;
+
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
+#include "X86GenFastISel.inc"
+
+private:
+  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
+                          const DebugLoc &DL);
+
+  bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
+                       unsigned &ResultReg, unsigned Alignment = 1);
+
+  bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
+                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
+  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                        X86AddressMode &AM,
+                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
+
+  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                         unsigned &ResultReg);
+
+  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+  bool X86SelectLoad(const Instruction *I);
+
+  bool X86SelectStore(const Instruction *I);
+
+  bool X86SelectRet(const Instruction *I);
+
+  bool X86SelectCmp(const Instruction *I);
+
+  bool X86SelectZExt(const Instruction *I);
+
+  bool X86SelectBranch(const Instruction *I);
+
+  bool X86SelectShift(const Instruction *I);
+
+  bool X86SelectDivRem(const Instruction *I);
+
+  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
+
+  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
+
+  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
+
+  bool X86SelectSelect(const Instruction *I);
+
+  bool X86SelectTrunc(const Instruction *I);
+
+  bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
+                               const TargetRegisterClass *RC);
+
+  bool X86SelectFPExt(const Instruction *I);
+  bool X86SelectFPTrunc(const Instruction *I);
+  bool X86SelectSIToFP(const Instruction *I);
+
+  const X86InstrInfo *getInstrInfo() const {
+    return Subtarget->getInstrInfo();
+  }
+  const X86TargetMachine *getTargetMachine() const {
+    return static_cast<const X86TargetMachine *>(&TM);
+  }
+
+  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
+  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
+  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
+  unsigned fastMaterializeConstant(const Constant *C) override;
+
+  unsigned fastMaterializeAlloca(const AllocaInst *C) override;
+
+  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
+
+  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+  /// computed in an SSE register, not on the X87 floating point stack.
+  bool isScalarFPTypeInSSEReg(EVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+  }
+
+  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+  bool IsMemcpySmall(uint64_t Len);
+
+  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+                          X86AddressMode SrcAM, uint64_t Len);
+
+  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                            const Value *Cond);
+
+  const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+                                            X86AddressMode &AM);
+
+  unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC, unsigned Op0,
+                             bool Op0IsKill, unsigned Op1, bool Op1IsKill,
+                             unsigned Op2, bool Op2IsKill, unsigned Op3,
+                             bool Op3IsKill);
+};
+
+} // end anonymous namespace.
+
+static std::pair<X86::CondCode, bool>
+getX86ConditionCode(CmpInst::Predicate Predicate) {
+  X86::CondCode CC = X86::COND_INVALID;
+  bool NeedSwap = false;
+  switch (Predicate) {
+  default: break;
+  // Floating-point Predicates
+  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
+  case CmpInst::FCMP_OLT: NeedSwap = true;        LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
+  case CmpInst::FCMP_OLE: NeedSwap = true;        LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
+  case CmpInst::FCMP_UGT: NeedSwap = true;        LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
+  case CmpInst::FCMP_UGE: NeedSwap = true;        LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
+  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
+  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
+  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
+  case CmpInst::FCMP_OEQ:                         LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+  // Integer Predicates
+  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
+  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
+  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
+  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
+  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
+  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
+  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
+  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
+  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
+  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
+static std::pair<unsigned, bool>
+getX86SSEConditionCode(CmpInst::Predicate Predicate) {
+  unsigned CC;
+  bool NeedSwap = false;
+
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  switch (Predicate) {
+  default: llvm_unreachable("Unexpected predicate");
+  case CmpInst::FCMP_OEQ: CC = 0;          break;
+  case CmpInst::FCMP_OGT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OLT: CC = 1;          break;
+  case CmpInst::FCMP_OGE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OLE: CC = 2;          break;
+  case CmpInst::FCMP_UNO: CC = 3;          break;
+  case CmpInst::FCMP_UNE: CC = 4;          break;
+  case CmpInst::FCMP_ULE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_UGE: CC = 5;          break;
+  case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_UGT: CC = 6;          break;
+  case CmpInst::FCMP_ORD: CC = 7;          break;
+  case CmpInst::FCMP_UEQ:
+  case CmpInst::FCMP_ONE: CC = 8;          break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
+/// \brief Adds a complex addressing mode to the given machine instr builder.
+/// Note, this will constrain the index register.  If its not possible to
+/// constrain the given index register, then a new one will be created.  The
+/// IndexReg field of the addressing mode will be updated to match in this case.
+const MachineInstrBuilder &
+X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
+                            X86AddressMode &AM) {
+  // First constrain the index register.  It needs to be a GR64_NOSP.
+  AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
+                                         MIB->getNumOperands() +
+                                         X86::AddrIndexReg);
+  return ::addFullAddress(MIB, AM);
+}
+
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
+                                       const Value *Cond) {
+  if (!isa<ExtractValueInst>(Cond))
+    return false;
+
+  const auto *EV = cast<ExtractValueInst>(Cond);
+  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+    return false;
+
+  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+  MVT RetVT;
+  const Function *Callee = II->getCalledFunction();
+  Type *RetTy =
+    cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+  if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return false;
+
+  X86::CondCode TmpCC;
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
+  }
+
+  // Check if both instructions are in the same basic block.
+  if (II->getParent() != I->getParent())
+    return false;
+
+  // Make sure nothing is in the way
+  BasicBlock::const_iterator Start(I);
+  BasicBlock::const_iterator End(II);
+  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+    // We only expect extractvalue instructions between the intrinsic and the
+    // instruction to be selected.
+    if (!isa<ExtractValueInst>(Itr))
+      return false;
+
+    // Check that the extractvalue operand comes from the intrinsic.
+    const auto *EVI = cast<ExtractValueInst>(Itr);
+    if (EVI->getAggregateOperand() != II)
+      return false;
+  }
+
+  CC = TmpCC;
+  return true;
+}
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+  EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true);
+  if (evt == MVT::Other || !evt.isSimple())
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  VT = evt.getSimpleVT();
+  // For now, require SSE/SSE2 for performing floating-point operations,
+  // since x87 requires additional work.
+  if (VT == MVT::f64 && !X86ScalarSSEf64)
+    return false;
+  if (VT == MVT::f32 && !X86ScalarSSEf32)
+    return false;
+  // Similarly, no f80 support yet.
+  if (VT == MVT::f80)
+    return false;
+  // We only handle legal types. For example, on x86-32 the instruction
+  // selector contains all of the 64-bit instructions from x86-64,
+  // under the assumption that i64 won't be used if the target doesn't
+  // support it.
+  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
+                                  MachineMemOperand *MMO, unsigned &ResultReg,
+                                  unsigned Alignment) {
+  bool HasSSE41 = Subtarget->hasSSE41();
+  bool HasAVX = Subtarget->hasAVX();
+  bool HasAVX2 = Subtarget->hasAVX2();
+  bool HasAVX512 = Subtarget->hasAVX512();
+  bool HasVLX = Subtarget->hasVLX();
+  bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
+  case MVT::v4f32:
+    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
+      Opc = HasVLX ? X86::VMOVAPSZ128rm :
+            HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+    else
+      Opc = HasVLX ? X86::VMOVUPSZ128rm :
+            HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+    RC  = &X86::VR128RegClass;
+    break;
+  case MVT::v2f64:
+    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
+      Opc = HasVLX ? X86::VMOVAPDZ128rm :
+            HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+    else
+      Opc = HasVLX ? X86::VMOVUPDZ128rm :
+            HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+    RC  = &X86::VR128RegClass;
+    break;
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    if (IsNonTemporal && Alignment >= 16)
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
+      Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+            HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+    else
+      Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+            HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+    RC  = &X86::VR128RegClass;
+    break;
+  case MVT::v8f32:
+    assert(HasAVX);
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
+    else
+      Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
+    RC  = &X86::VR256RegClass;
+    break;
+  case MVT::v4f64:
+    assert(HasAVX);
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
+    else
+      Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
+    RC  = &X86::VR256RegClass;
+    break;
+  case MVT::v8i32:
+  case MVT::v4i64:
+  case MVT::v16i16:
+  case MVT::v32i8:
+    assert(HasAVX);
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
+    else
+      Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
+    RC  = &X86::VR256RegClass;
+    break;
+  case MVT::v16f32:
+    assert(HasAVX512);
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
+    RC  = &X86::VR512RegClass;
+    break;
+  case MVT::v8f64:
+    assert(HasAVX512);
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
+    RC  = &X86::VR512RegClass;
+    break;
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8:
+    assert(HasAVX512);
+    // Note: There are a lot more choices based on type with AVX-512, but
+    // there's really no advantage when the load isn't masked.
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
+    RC  = &X86::VR512RegClass;
+    break;
+  }
+
+  ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  addFullAddress(MIB, AM);
+  if (MMO)
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+  return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
+                                   X86AddressMode &AM,
+                                   MachineMemOperand *MMO, bool Aligned) {
+  bool HasSSE2 = Subtarget->hasSSE2();
+  bool HasSSE4A = Subtarget->hasSSE4A();
+  bool HasAVX = Subtarget->hasAVX();
+  bool HasAVX512 = Subtarget->hasAVX512();
+  bool HasVLX = Subtarget->hasVLX();
+  bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
+  // Get opcode and regclass of the output for the given store instruction.
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f80: // No f80 support yet.
+  default: return false;
+  case MVT::i1: {
+    // Mask out all but lowest bit.
+    unsigned AndResult = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(X86::AND8ri), AndResult)
+      .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
+    ValReg = AndResult;
+    LLVM_FALLTHROUGH; // handle i1 as i8.
+  }
+  case MVT::i8:  Opc = X86::MOV8mr;  break;
+  case MVT::i16: Opc = X86::MOV16mr; break;
+  case MVT::i32:
+    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      if (IsNonTemporal && HasSSE4A)
+        Opc = X86::MOVNTSS;
+      else
+        Opc = HasAVX512 ? X86::VMOVSSZmr :
+              HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+    } else
+      Opc = X86::ST_Fp32m;
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf32) {
+      if (IsNonTemporal && HasSSE4A)
+        Opc = X86::MOVNTSD;
+      else
+        Opc = HasAVX512 ? X86::VMOVSDZmr :
+              HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+    } else
+      Opc = X86::ST_Fp64m;
+    break;
+  case MVT::v4f32:
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+              HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPSZ128mr :
+              HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+    } else
+      Opc = HasVLX ? X86::VMOVUPSZ128mr :
+            HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+    break;
+  case MVT::v2f64:
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+              HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPDZ128mr :
+              HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+    } else
+      Opc = HasVLX ? X86::VMOVUPDZ128mr :
+            HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+    break;
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+              HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+      else
+        Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+              HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+    } else
+      Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+            HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+    break;
+  case MVT::v8f32:
+    assert(HasAVX);
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+    } else
+      Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
+    break;
+  case MVT::v4f64:
+    assert(HasAVX);
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
+    } else
+      Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
+    break;
+  case MVT::v8i32:
+  case MVT::v4i64:
+  case MVT::v16i16:
+  case MVT::v32i8:
+    assert(HasAVX);
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+      else
+        Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
+    } else
+      Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
+    break;
+  case MVT::v16f32:
+    assert(HasAVX512);
+    if (Aligned)
+      Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
+    else
+      Opc = X86::VMOVUPSZmr;
+    break;
+  case MVT::v8f64:
+    assert(HasAVX512);
+    if (Aligned) {
+      Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
+    } else
+      Opc = X86::VMOVUPDZmr;
+    break;
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8:
+    assert(HasAVX512);
+    // Note: There are a lot more choices based on type with AVX-512, but
+    // there's really no advantage when the store isn't masked.
+    if (Aligned)
+      Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
+    else
+      Opc = X86::VMOVDQU64Zmr;
+    break;
+  }
+
+  const MCInstrDesc &Desc = TII.get(Opc);
+  // Some of the instructions in the previous switch use FR128 instead
+  // of FR32 for ValReg. Make sure the register we feed the instruction
+  // matches its register class constraints.
+  // Note: This is fine to do a copy from FR32 to FR128, this is the
+  // same registers behind the scene and actually why it did not trigger
+  // any bugs before.
+  ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc);
+  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
+  if (MMO)
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+
+  return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+                                   X86AddressMode &AM,
+                                   MachineMemOperand *MMO, bool Aligned) {
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Val))
+    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
+
+  // If this is a store of a simple constant, fold the constant into the store.
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    unsigned Opc = 0;
+    bool Signed = true;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: break;
+    case MVT::i1:
+      Signed = false;
+      LLVM_FALLTHROUGH; // Handle as i8.
+    case MVT::i8:  Opc = X86::MOV8mi;  break;
+    case MVT::i16: Opc = X86::MOV16mi; break;
+    case MVT::i32: Opc = X86::MOV32mi; break;
+    case MVT::i64:
+      // Must be a 32-bit sign extended value.
+      if (isInt<32>(CI->getSExtValue()))
+        Opc = X86::MOV64mi32;
+      break;
+    }
+
+    if (Opc) {
+      MachineInstrBuilder MIB =
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
+                                            : CI->getZExtValue());
+      if (MMO)
+        MIB->addMemOperand(*FuncInfo.MF, MMO);
+      return true;
+    }
+  }
+
+  unsigned ValReg = getRegForValue(Val);
+  if (ValReg == 0)
+    return false;
+
+  bool ValKill = hasTrivialKill(Val);
+  return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+                                    unsigned Src, EVT SrcVT,
+                                    unsigned &ResultReg) {
+  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+                           Src, /*TODO: Kill=*/false);
+  if (RR == 0)
+    return false;
+
+  ResultReg = RR;
+  return true;
+}
+
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
+    if (GV->isThreadLocal())
+      return false;
+
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
+
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
+      }
+
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
+      } else {
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = nullptr;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy(DL) == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = &X86::GR64RegClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = &X86::GR32RegClass;
+        }
+
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
+
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
+
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
+
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = nullptr;
+      return true;
+    }
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+  SmallVector<const Value *, 32> GEPs;
+redo_gep:
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Don't walk into other basic blocks; it's possible we haven't
+    // visited them yet, so the instructions may not yet be assigned
+    // virtual registers.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+        TLI.getPointerTy(DL))
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::Alloca: {
+    // Do static allocas.
+    const AllocaInst *A = cast<AllocaInst>(V);
+    DenseMap<const AllocaInst *, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(A);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = SI->second;
+      return true;
+    }
+    break;
+  }
+
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+      // They have to fit in the 32-bit signed displacement field though.
+      if (isInt<32>(Disp)) {
+        AM.Disp = (uint32_t)Disp;
+        return X86SelectAddress(U->getOperand(0), AM);
+      }
+    }
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    X86AddressMode SavedAM = AM;
+
+    // Pattern-match simple GEPs.
+    uint64_t Disp = (int32_t)AM.Disp;
+    unsigned IndexReg = AM.IndexReg;
+    unsigned Scale = AM.Scale;
+    gep_type_iterator GTI = gep_type_begin(U);
+    // Iterate through the indices, folding what we can. Constants can be
+    // folded, and one dynamic index can be handled, if the scale is supported.
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+         i != e; ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = GTI.getStructTypeOrNull()) {
+        const StructLayout *SL = DL.getStructLayout(STy);
+        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+        continue;
+      }
+
+      // A array/variable index is always of the form i*S where S is the
+      // constant scale size.  See if we can push the scale into immediates.
+      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+      for (;;) {
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+          break;
+        }
+        if (canFoldAddIntoGEP(U, Op)) {
+          // A compatible add with a constant operand. Fold the constant.
+          ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+          Disp += CI->getSExtValue() * S;
+          // Iterate on the other operand.
+          Op = cast<AddOperator>(Op)->getOperand(0);
+          continue;
+        }
+        if (IndexReg == 0 &&
+            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+            (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op).first;
+          if (IndexReg == 0)
+            return false;
+          break;
+        }
+        // Unsupported.
+        goto unsupported_gep;
+      }
+    }
+
+    // Check for displacement overflow.
+    if (!isInt<32>(Disp))
+      break;
+
+    AM.IndexReg = IndexReg;
+    AM.Scale = Scale;
+    AM.Disp = (uint32_t)Disp;
+    GEPs.push_back(V);
+
+    if (const GetElementPtrInst *GEP =
+          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+      // Ok, the GEP indices were covered by constant-offset and scaled-index
+      // addressing. Update the address state and move on to examining the base.
+      V = GEP;
+      goto redo_gep;
+    } else if (X86SelectAddress(U->getOperand(0), AM)) {
+      return true;
+    }
+
+    // If we couldn't merge the gep value into this addr mode, revert back to
+    // our address and just match the value instead of completely failing.
+    AM = SavedAM;
+
+    for (const Value *I : reverse(GEPs))
+      if (handleConstantAddresses(I, AM))
+        return true;
+
+    return false;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
+  }
+  }
+
+  return handleConstantAddresses(V, AM);
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set between across basic block, even
+  // if different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, this is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guarantee for X86.
+  // Moreover, things like hasOneUse could not be used accurately, if we
+  // allow to reference values across basic blocks whereas they are not
+  // alive across basic blocks initially.
+  bool InMBB = true;
+  if (I) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+            TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle DLL Import.
+    if (GV->hasDLLImportStorageClass())
+      return false;
+
+    // Can't handle TLS.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.  Above we verified that the
+      // base and index registers are unused.
+      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+      AM.Base.Reg = X86::RIP;
+    } else {
+      AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
+    }
+
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+  // Atomic stores need special handling.
+  const StoreInst *S = cast<StoreInst>(I);
+
+  if (S->isAtomic())
+    return false;
+
+  const Value *PtrV = I->getOperand(1);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  const Value *Val = S->getValueOperand();
+  const Value *Ptr = S->getPointerOperand();
+
+  MVT VT;
+  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  unsigned Alignment = S->getAlignment();
+  unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
+  if (Alignment == 0) // Ensure that codegen never sees alignment 0
+    Alignment = ABIAlignment;
+  bool Aligned = Alignment >= ABIAlignment;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+  const X86MachineFunctionInfo *X86MFInfo =
+      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (TLI.supportSwiftError() &&
+      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    return false;
+
+  if (TLI.supportSplitCSR(FuncInfo.MF))
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::C &&
+      CC != CallingConv::Fast &&
+      CC != CallingConv::X86_FastCall &&
+      CC != CallingConv::X86_StdCall &&
+      CC != CallingConv::X86_ThisCall &&
+      CC != CallingConv::X86_64_SysV &&
+      CC != CallingConv::X86_64_Win64)
+    return false;
+
+  // Don't handle popping bytes if they don't fit the ret's immediate.
+  if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
+    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    // The calling-convention tables for x87 returns don't tell
+    // the whole story.
+    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT SrcVT = TLI.getValueType(DL, RV->getType());
+    EVT DstVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (SrcVT != DstVT) {
+      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+      if (SrcVT == MVT::i1) {
+        if (Outs[0].Flags.isSExt())
+          return false;
+        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcVT = MVT::i8;
+      }
+      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+                                             ISD::SIGN_EXTEND;
+      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+                          SrcReg, /*TODO: Kill=*/false);
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  // Swift calling convention does not require we copy the sret argument
+  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+  // All x86 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
+  if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
+    unsigned Reg = X86MFInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments()!");
+    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
+    RetRegs.push_back(RetReg);
+  }
+
+  // Now emit the RET.
+  MachineInstrBuilder MIB;
+  if (X86MFInfo->getBytesToPopOnReturn()) {
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(Subtarget->is64Bit() ? X86::RETIQ : X86::RETIL))
+              .addImm(X86MFInfo->getBytesToPopOnReturn());
+  } else {
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
+  }
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+  const LoadInst *LI = cast<LoadInst>(I);
+
+  // Atomic loads need special handling.
+  if (LI->isAtomic())
+    return false;
+
+  const Value *SV = I->getOperand(0);
+  if (TLI.supportSwiftError()) {
+    // Swifterror values can come from either a function parameter with
+    // swifterror attribute or an alloca with swifterror attribute.
+    if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+      if (Arg->hasSwiftErrorAttr())
+        return false;
+    }
+
+    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+      if (Alloca->isSwiftError())
+        return false;
+    }
+  }
+
+  MVT VT;
+  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  const Value *Ptr = LI->getPointerOperand();
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  unsigned Alignment = LI->getAlignment();
+  unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
+  if (Alignment == 0) // Ensure that codegen never sees alignment 0
+    Alignment = ABIAlignment;
+
+  unsigned ResultReg = 0;
+  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
+                       Alignment))
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+  bool HasAVX = Subtarget->hasAVX();
+  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:       return 0;
+  case MVT::i8:  return X86::CMP8rr;
+  case MVT::i16: return X86::CMP16rr;
+  case MVT::i32: return X86::CMP32rr;
+  case MVT::i64: return X86::CMP64rr;
+  case MVT::f32:
+    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+  case MVT::f64:
+    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+  }
+}
+
+/// If we have a comparison with RHS as the RHS  of the comparison, return an
+/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+  int64_t Val = RHSC->getSExtValue();
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Otherwise, we can't fold the immediate into this comparison.
+  default:
+    return 0;
+  case MVT::i8:
+    return X86::CMP8ri;
+  case MVT::i16:
+    if (isInt<8>(Val))
+      return X86::CMP16ri8;
+    return X86::CMP16ri;
+  case MVT::i32:
+    if (isInt<8>(Val))
+      return X86::CMP32ri8;
+    return X86::CMP32ri;
+  case MVT::i64:
+    if (isInt<8>(Val))
+      return X86::CMP64ri8;
+    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+    // field.
+    if (isInt<32>(Val))
+      return X86::CMP64ri32;
+    return 0;
+  }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
+                                     const DebugLoc &CurDbgLoc) {
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (Op0Reg == 0) return false;
+
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Op1))
+    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
+
+  // We have two options: compare with register or immediate.  If the RHS of
+  // the compare is an immediate that we can fold into this compare, use
+  // CMPri, otherwise use CMPrr.
+  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
+        .addReg(Op0Reg)
+        .addImm(Op1C->getSExtValue());
+      return true;
+    }
+  }
+
+  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+  if (CompareOpc == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(Op1);
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
+    .addReg(Op0Reg)
+    .addReg(Op1Reg);
+
+  return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  MVT VT;
+  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512())
+    return false;
+
+  // Try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+  unsigned ResultReg = 0;
+  switch (Predicate) {
+  default: break;
+  case CmpInst::FCMP_FALSE: {
+    ResultReg = createResultReg(&X86::GR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
+            ResultReg);
+    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+                                           X86::sub_8bit);
+    if (!ResultReg)
+      return false;
+    break;
+  }
+  case CmpInst::FCMP_TRUE: {
+    ResultReg = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+            ResultReg).addImm(1);
+    break;
+  }
+  }
+
+  if (ResultReg) {
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  const Value *LHS = CI->getOperand(0);
+  const Value *RHS = CI->getOperand(1);
+
+  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+  // We don't have to materialize a zero constant for this case and can just use
+  // %x again on the RHS.
+  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+    const auto *RHSC = dyn_cast<ConstantFP>(RHS);
+    if (RHSC && RHSC->isNullValue())
+      RHS = LHS;
+  }
+
+  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+  static const uint16_t SETFOpcTable[2][3] = {
+    { X86::SETEr,  X86::SETNPr, X86::AND8rr },
+    { X86::SETNEr, X86::SETPr,  X86::OR8rr  }
+  };
+  const uint16_t *SETFOpc = nullptr;
+  switch (Predicate) {
+  default: break;
+  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
+  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
+  }
+
+  ResultReg = createResultReg(&X86::GR8RegClass);
+  if (SETFOpc) {
+    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+      return false;
+
+    unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+    unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+            FlagReg1);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+            FlagReg2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
+            ResultReg).addReg(FlagReg1).addReg(FlagReg2);
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  X86::CondCode CC;
+  bool SwapArgs;
+  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+  unsigned Opc = X86::getSETFromCond(CC);
+
+  if (SwapArgs)
+    std::swap(LHS, RHS);
+
+  // Emit a compare of LHS/RHS.
+  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+  EVT DstVT = TLI.getValueType(DL, I->getType());
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  unsigned ResultReg = getRegForValue(I->getOperand(0));
+  if (ResultReg == 0)
+    return false;
+
+  // Handle zero-extension from i1 to i8, which is common.
+  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+  if (SrcVT == MVT::i1) {
+    // Set the high bits to zero.
+    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+    SrcVT = MVT::i8;
+
+    if (ResultReg == 0)
+      return false;
+  }
+
+  if (DstVT == MVT::i64) {
+    // Handle extension to 64-bits via sub-register shenanigans.
+    unsigned MovInst;
+
+    switch (SrcVT.SimpleTy) {
+    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
+    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+    case MVT::i32: MovInst = X86::MOV32rr;     break;
+    default: llvm_unreachable("Unexpected zext to i64 source type");
+    }
+
+    unsigned Result32 = createResultReg(&X86::GR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
+      .addReg(ResultReg);
+
+    ResultReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
+            ResultReg)
+      .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+  } else if (DstVT != MVT::i8) {
+    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+                           ResultReg, /*Kill=*/true);
+    if (ResultReg == 0)
+      return false;
+  }
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+  // Unconditional branches are selected by tablegen-generated code.
+  // Handle a conditional branch.
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Fold the common case of a conditional branch with a comparison
+  // in the same block (values defined on other blocks may not have
+  // initialized registers).
+  X86::CondCode CC;
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+      EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+
+      // Try to optimize or fold the cmp.
+      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+      switch (Predicate) {
+      default: break;
+      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
+      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
+      }
+
+      const Value *CmpLHS = CI->getOperand(0);
+      const Value *CmpRHS = CI->getOperand(1);
+
+      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
+      // 0.0.
+      // We don't have to materialize a zero constant for this case and can just
+      // use %x again on the RHS.
+      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+        if (CmpRHSC && CmpRHSC->isNullValue())
+          CmpRHS = CmpLHS;
+      }
+
+      // Try to take advantage of fallthrough opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+        std::swap(TrueMBB, FalseMBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
+      // code check. Instead two branch instructions are required to check all
+      // the flags. First we change the predicate to a supported condition code,
+      // which will be the first branch. Later one we will emit the second
+      // branch.
+      bool NeedExtraBranch = false;
+      switch (Predicate) {
+      default: break;
+      case CmpInst::FCMP_OEQ:
+        std::swap(TrueMBB, FalseMBB);
+        LLVM_FALLTHROUGH;
+      case CmpInst::FCMP_UNE:
+        NeedExtraBranch = true;
+        Predicate = CmpInst::FCMP_ONE;
+        break;
+      }
+
+      bool SwapArgs;
+      unsigned BranchOpc;
+      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+      BranchOpc = X86::GetCondBranchFromCond(CC);
+      if (SwapArgs)
+        std::swap(CmpLHS, CmpRHS);
+
+      // Emit a compare of the LHS and RHS, setting the flags.
+      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
+        return false;
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+        .addMBB(TrueMBB);
+
+      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
+      // to UNE above).
+      if (NeedExtraBranch) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
+          .addMBB(TrueMBB);
+      }
+
+      finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+    // typically happen for _Bool and C++ bools.
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+      unsigned TestOpc = 0;
+      switch (SourceVT.SimpleTy) {
+      default: break;
+      case MVT::i8:  TestOpc = X86::TEST8ri; break;
+      case MVT::i16: TestOpc = X86::TEST16ri; break;
+      case MVT::i32: TestOpc = X86::TEST32ri; break;
+      case MVT::i64: TestOpc = X86::TEST64ri32; break;
+      }
+      if (TestOpc) {
+        unsigned OpReg = getRegForValue(TI->getOperand(0));
+        if (OpReg == 0) return false;
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
+          .addReg(OpReg).addImm(1);
+
+        unsigned JmpOpc = X86::JNE_1;
+        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+          std::swap(TrueMBB, FalseMBB);
+          JmpOpc = X86::JE_1;
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
+          .addMBB(TrueMBB);
+
+        finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+        return true;
+      }
+    }
+  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned TmpReg = getRegForValue(BI->getCondition());
+    if (TmpReg == 0)
+      return false;
+
+    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
+      .addMBB(TrueMBB);
+    finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+    return true;
+  }
+
+  // Otherwise do a clumsy setcc and re-test it.
+  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+  // in an explicit cast, so make sure to handle that correctly.
+  unsigned OpReg = getRegForValue(BI->getCondition());
+  if (OpReg == 0) return false;
+
+  // In case OpReg is a K register, COPY to a GPR
+  if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
+    unsigned KOpReg = OpReg;
+    OpReg = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), OpReg)
+        .addReg(KOpReg);
+  }
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+      .addReg(OpReg)
+      .addImm(1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
+    .addMBB(TrueMBB);
+  finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
+  return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+  unsigned CReg = 0, OpReg = 0;
+  const TargetRegisterClass *RC = nullptr;
+  if (I->getType()->isIntegerTy(8)) {
+    CReg = X86::CL;
+    RC = &X86::GR8RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(16)) {
+    CReg = X86::CX;
+    RC = &X86::GR16RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(32)) {
+    CReg = X86::ECX;
+    RC = &X86::GR32RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(64)) {
+    CReg = X86::RCX;
+    RC = &X86::GR64RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
+    default: return false;
+    }
+  } else {
+    return false;
+  }
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+          CReg).addReg(Op1Reg);
+
+  // The shift instruction uses X86::CL. If we defined a super-register
+  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+  if (CReg != X86::CL)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::KILL), X86::CL)
+      .addReg(CReg, RegState::Kill);
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
+    .addReg(Op0Reg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectDivRem(const Instruction *I) {
+  const static unsigned NumTypes = 4; // i8, i16, i32, i64
+  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
+  const static bool S = true;  // IsSigned
+  const static bool U = false; // !IsSigned
+  const static unsigned Copy = TargetOpcode::COPY;
+  // For the X86 DIV/IDIV instruction, in most cases the dividend
+  // (numerator) must be in a specific register pair highreg:lowreg,
+  // producing the quotient in lowreg and the remainder in highreg.
+  // For most data types, to set up the instruction, the dividend is
+  // copied into lowreg, and lowreg is sign-extended or zero-extended
+  // into highreg.  The exception is i8, where the dividend is defined
+  // as a single register rather than a register pair, and we
+  // therefore directly sign-extend or zero-extend the dividend into
+  // lowreg, instead of copying, and ignore the highreg.
+  const static struct DivRemEntry {
+    // The following portion depends only on the data type.
+    const TargetRegisterClass *RC;
+    unsigned LowInReg;  // low part of the register pair
+    unsigned HighInReg; // high part of the register pair
+    // The following portion depends on both the data type and the operation.
+    struct DivRemResult {
+    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
+    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
+                              // highreg, or copying a zero into highreg.
+    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
+                              // zero/sign-extending into lowreg for i8.
+    unsigned DivRemResultReg; // Register containing the desired result.
+    bool IsOpSigned;          // Whether to use signed or unsigned form.
+    } ResultTable[NumOps];
+  } OpTable[NumTypes] = {
+    { &X86::GR8RegClass,  X86::AX,  0, {
+        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
+        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
+        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
+        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
+      }
+    }, // i8
+    { &X86::GR16RegClass, X86::AX,  X86::DX, {
+        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
+        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
+      }
+    }, // i16
+    { &X86::GR32RegClass, X86::EAX, X86::EDX, {
+        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
+        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
+        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
+        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
+      }
+    }, // i32
+    { &X86::GR64RegClass, X86::RAX, X86::RDX, {
+        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
+        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
+      }
+    }, // i64
+  };
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  unsigned TypeIndex, OpIndex;
+  switch (VT.SimpleTy) {
+  default: return false;
+  case MVT::i8:  TypeIndex = 0; break;
+  case MVT::i16: TypeIndex = 1; break;
+  case MVT::i32: TypeIndex = 2; break;
+  case MVT::i64: TypeIndex = 3;
+    if (!Subtarget->is64Bit())
+      return false;
+    break;
+  }
+
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unexpected div/rem opcode");
+  case Instruction::SDiv: OpIndex = 0; break;
+  case Instruction::SRem: OpIndex = 1; break;
+  case Instruction::UDiv: OpIndex = 2; break;
+  case Instruction::URem: OpIndex = 3; break;
+  }
+
+  const DivRemEntry &TypeEntry = OpTable[TypeIndex];
+  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0)
+    return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0)
+    return false;
+
+  // Move op0 into low-order input register.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
+  // Zero-extend or sign-extend into high-order input register.
+  if (OpEntry.OpSignExtend) {
+    if (OpEntry.IsOpSigned)
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(OpEntry.OpSignExtend));
+    else {
+      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::MOV32r0), Zero32);
+
+      // Copy the zero into the appropriate sub/super/identical physical
+      // register. Unfortunately the operations needed are not uniform enough
+      // to fit neatly into the table above.
+      if (VT == MVT::i16) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Copy), TypeEntry.HighInReg)
+          .addReg(Zero32, 0, X86::sub_16bit);
+      } else if (VT == MVT::i32) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Copy), TypeEntry.HighInReg)
+            .addReg(Zero32);
+      } else if (VT == MVT::i64) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+      }
+    }
+  }
+  // Generate the DIV/IDIV instruction.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
+  // For i8 remainder, we can't reference AH directly, as we'll end
+  // up with bogus copies like %R9B = COPY %AH. Reference AX
+  // instead to prevent AH references in a REX instruction.
+  //
+  // The current assumption of the fast register allocator is that isel
+  // won't generate explicit references to the GPR8_NOREX registers. If
+  // the allocator and/or the backend get enhanced to be more robust in
+  // that regard, this can be, and should be, removed.
+  unsigned ResultReg = 0;
+  if ((I->getOpcode() == Instruction::SRem ||
+       I->getOpcode() == Instruction::URem) &&
+      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+    // Shift AX right by 8 bits instead of using AH.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
+            ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+    // Now reference the 8-bit subreg of the result.
+    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+                                           /*Kill=*/true, X86::sub_8bit);
+  }
+  // Copy the result out of the physreg if we haven't already.
+  if (!ResultReg) {
+    ResultReg = createResultReg(TypeEntry.RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
+        .addReg(OpEntry.DivRemResultReg);
+  }
+  updateValueMap(I, ResultReg);
+
+  return true;
+}
+
+/// \brief Emit a conditional move instruction (if the are supported) to lower
+/// the select.
+bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
+  // Check if the subtarget supports these instructions.
+  if (!Subtarget->hasCMov())
+    return false;
+
+  // FIXME: Add support for i8.
+  if (RetVT < MVT::i16 || RetVT > MVT::i64)
+    return false;
+
+  const Value *Cond = I->getOperand(0);
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  bool NeedTest = true;
+  X86::CondCode CC = X86::COND_NE;
+
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<CmpInst>(Cond);
+  if (CI && (CI->getParent() == I->getParent())) {
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
+    static const uint16_t SETFOpcTable[2][3] = {
+      { X86::SETNPr, X86::SETEr , X86::TEST8rr },
+      { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
+    };
+    const uint16_t *SETFOpc = nullptr;
+    switch (Predicate) {
+    default: break;
+    case CmpInst::FCMP_OEQ:
+      SETFOpc = &SETFOpcTable[0][0];
+      Predicate = CmpInst::ICMP_NE;
+      break;
+    case CmpInst::FCMP_UNE:
+      SETFOpc = &SETFOpcTable[1][0];
+      Predicate = CmpInst::ICMP_NE;
+      break;
+    }
+
+    bool NeedSwap;
+    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
+
+    const Value *CmpLHS = CI->getOperand(0);
+    const Value *CmpRHS = CI->getOperand(1);
+    if (NeedSwap)
+      std::swap(CmpLHS, CmpRHS);
+
+    EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+    // Emit a compare of the LHS and RHS, setting the flags.
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+      return false;
+
+    if (SETFOpc) {
+      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
+      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
+              FlagReg1);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
+              FlagReg2);
+      auto const &II = TII.get(SETFOpc[2]);
+      if (II.getNumDefs()) {
+        unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      } else {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+          .addReg(FlagReg2).addReg(FlagReg1);
+      }
+    }
+    NeedTest = false;
+  } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
+    // Fake request the condition, otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned TmpReg = getRegForValue(Cond);
+    if (TmpReg == 0)
+      return false;
+
+    NeedTest = false;
+  }
+
+  if (NeedTest) {
+    // Selects operate on i1, however, CondReg is 8 bits width and may contain
+    // garbage. Indeed, only the less significant bit is supposed to be
+    // accurate. If we read more than the lsb, we may see non-zero values
+    // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
+    // the select. This is achieved by performing TEST against 1.
+    unsigned CondReg = getRegForValue(Cond);
+    if (CondReg == 0)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
+
+    // In case OpReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+      unsigned KCondReg = CondReg;
+      CondReg = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), CondReg)
+          .addReg(KCondReg, getKillRegState(CondIsKill));
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+        .addReg(CondReg, getKillRegState(CondIsKill))
+        .addImm(1);
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  if (!LHSReg || !RHSReg)
+    return false;
+
+  unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+                                       LHSReg, LHSIsKill);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Emit SSE or AVX instructions to lower the select.
+///
+/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
+/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
+/// SSE instructions are available. If AVX is available, try to use a VBLENDV.
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
+  if (!CI || (CI->getParent() != I->getParent()))
+    return false;
+
+  if (I->getType() != CI->getOperand(0)->getType() ||
+      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
+        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
+    return false;
+
+  const Value *CmpLHS = CI->getOperand(0);
+  const Value *CmpRHS = CI->getOperand(1);
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
+  // We don't have to materialize a zero constant for this case and can just use
+  // %x again on the RHS.
+  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
+    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
+    if (CmpRHSC && CmpRHSC->isNullValue())
+      CmpRHS = CmpLHS;
+  }
+
+  unsigned CC;
+  bool NeedSwap;
+  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
+  if (CC > 7)
+    return false;
+
+  if (NeedSwap)
+    std::swap(CmpLHS, CmpRHS);
+
+  // Choose the SSE instruction sequence based on data type (float or double).
+  static const uint16_t OpcTable[2][4] = {
+    { X86::CMPSSrr,  X86::ANDPSrr,  X86::ANDNPSrr,  X86::ORPSrr  },
+    { X86::CMPSDrr,  X86::ANDPDrr,  X86::ANDNPDrr,  X86::ORPDrr  }
+  };
+
+  const uint16_t *Opc = nullptr;
+  switch (RetVT.SimpleTy) {
+  default: return false;
+  case MVT::f32: Opc = &OpcTable[0][0]; break;
+  case MVT::f64: Opc = &OpcTable[1][0]; break;
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  unsigned CmpLHSReg = getRegForValue(CmpLHS);
+  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
+
+  unsigned CmpRHSReg = getRegForValue(CmpRHS);
+  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
+
+  if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+  unsigned ResultReg;
+
+  if (Subtarget->hasAVX512()) {
+    // If we have AVX512 we can use a mask compare and masked movss/sd.
+    const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
+    const TargetRegisterClass *VK1 = &X86::VK1RegClass;
+
+    unsigned CmpOpcode =
+      (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+
+    // Need an IMPLICIT_DEF for the input that is used to generate the upper
+    // bits of the result register since its not based on any of the inputs.
+    unsigned ImplicitDefReg = createResultReg(VR128X);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+
+    // Place RHSReg is the passthru of the masked movss/sd operation and put
+    // LHS in the input. The mask input comes from the compare.
+    unsigned MovOpcode =
+      (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
+    unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, RHSIsKill,
+                                        CmpReg, true, ImplicitDefReg, true,
+                                        LHSReg, LHSIsKill);
+
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);
+
+  } else if (Subtarget->hasAVX()) {
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+
+    // If we have AVX, create 1 blendv instead of 3 logic instructions.
+    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
+    // uses XMM0 as the selection register. That may need just as many
+    // instructions as the AND/ANDN/OR sequence due to register moves, so
+    // don't bother.
+    unsigned CmpOpcode =
+      (RetVT == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr;
+    unsigned BlendOpcode =
+      (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+
+    unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+                                          LHSReg, LHSIsKill, CmpReg, true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);
+  } else {
+    const TargetRegisterClass *VR128 = &X86::VR128RegClass;
+    unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+                                       CmpRHSReg, CmpRHSIsKill, CC);
+    unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
+                                      LHSReg, LHSIsKill);
+    unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
+                                       RHSReg, RHSIsKill);
+    unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+                                     AndReg, /*IsKill=*/true);
+    ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
+  }
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
+  // These are pseudo CMOV instructions and will be later expanded into control-
+  // flow.
+  unsigned Opc;
+  switch (RetVT.SimpleTy) {
+  default: return false;
+  case MVT::i8:  Opc = X86::CMOV_GR8;  break;
+  case MVT::i16: Opc = X86::CMOV_GR16; break;
+  case MVT::i32: Opc = X86::CMOV_GR32; break;
+  case MVT::f32: Opc = X86::CMOV_FR32; break;
+  case MVT::f64: Opc = X86::CMOV_FR64; break;
+  }
+
+  const Value *Cond = I->getOperand(0);
+  X86::CondCode CC = X86::COND_NE;
+
+  // Optimize conditions coming from a compare if both instructions are in the
+  // same basic block (values defined in other basic blocks may not have
+  // initialized registers).
+  const auto *CI = dyn_cast<CmpInst>(Cond);
+  if (CI && (CI->getParent() == I->getParent())) {
+    bool NeedSwap;
+    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+    if (CC > X86::LAST_VALID_COND)
+      return false;
+
+    const Value *CmpLHS = CI->getOperand(0);
+    const Value *CmpRHS = CI->getOperand(1);
+
+    if (NeedSwap)
+      std::swap(CmpLHS, CmpRHS);
+
+    EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());
+    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
+      return false;
+  } else {
+    unsigned CondReg = getRegForValue(Cond);
+    if (CondReg == 0)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
+
+    // In case OpReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
+      unsigned KCondReg = CondReg;
+      CondReg = createResultReg(&X86::GR8RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), CondReg)
+          .addReg(KCondReg, getKillRegState(CondIsKill));
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
+        .addReg(CondReg, getKillRegState(CondIsKill))
+        .addImm(1);
+  }
+
+  const Value *LHS = I->getOperand(1);
+  const Value *RHS = I->getOperand(2);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned RHSReg = getRegForValue(RHS);
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  if (!LHSReg || !RHSReg)
+    return false;
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+
+  unsigned ResultReg =
+    fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  // Check if we can fold the select.
+  if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+    const Value *Opnd = nullptr;
+    switch (Predicate) {
+    default:                              break;
+    case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
+    case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
+    }
+    // No need for a select anymore - this is an unconditional move.
+    if (Opnd) {
+      unsigned OpReg = getRegForValue(Opnd);
+      if (OpReg == 0)
+        return false;
+      bool OpIsKill = hasTrivialKill(Opnd);
+      const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(OpReg, getKillRegState(OpIsKill));
+      updateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  // First try to use real conditional move instructions.
+  if (X86FastEmitCMoveSelect(RetVT, I))
+    return true;
+
+  // Try to use a sequence of SSE instructions to simulate a conditional move.
+  if (X86FastEmitSSESelect(RetVT, I))
+    return true;
+
+  // Fall-back to pseudo conditional move instructions, which will be later
+  // converted to control-flow.
+  if (X86FastEmitPseudoSelect(RetVT, I))
+    return true;
+
+  return false;
+}
+
+bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
+  // The target-independent selection algorithm in FastISel already knows how
+  // to select a SINT_TO_FP if the target is SSE but not AVX.
+  // Early exit if the subtarget doesn't have AVX.
+  if (!Subtarget->hasAVX())
+    return false;
+
+  if (!I->getOperand(0)->getType()->isIntegerTy(32))
+    return false;
+
+  // Select integer to float/double conversion.
+  unsigned OpReg = getRegForValue(I->getOperand(0));
+  if (OpReg == 0)
+    return false;
+
+  const TargetRegisterClass *RC = nullptr;
+  unsigned Opcode;
+
+  if (I->getType()->isDoubleTy()) {
+    // sitofp int -> double
+    Opcode = X86::VCVTSI2SDrr;
+    RC = &X86::FR64RegClass;
+  } else if (I->getType()->isFloatTy()) {
+    // sitofp int -> float
+    Opcode = X86::VCVTSI2SSrr;
+    RC = &X86::FR32RegClass;
+  } else
+    return false;
+
+  unsigned ImplicitDefReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+  unsigned ResultReg =
+      fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Helper method used by X86SelectFPExt and X86SelectFPTrunc.
+bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
+                                          unsigned TargetOpc,
+                                          const TargetRegisterClass *RC) {
+  assert((I->getOpcode() == Instruction::FPExt ||
+          I->getOpcode() == Instruction::FPTrunc) &&
+         "Instruction must be an FPExt or FPTrunc!");
+
+  unsigned OpReg = getRegForValue(I->getOperand(0));
+  if (OpReg == 0)
+    return false;
+
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
+                ResultReg);
+  if (Subtarget->hasAVX())
+    MIB.addReg(OpReg);
+  MIB.addReg(OpReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+  if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+      I->getOperand(0)->getType()->isFloatTy()) {
+    // fpext from float to double.
+    unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
+    return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass);
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+  if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+      I->getOperand(0)->getType()->isDoubleTy()) {
+    // fptrunc from double to float.
+    unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
+    return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass);
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+  EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(DL, I->getType());
+
+  // This code only handles truncation to byte.
+  if (DstVT != MVT::i8 && DstVT != MVT::i1)
+    return false;
+  if (!TLI.isTypeLegal(SrcVT))
+    return false;
+
+  unsigned InputReg = getRegForValue(I->getOperand(0));
+  if (!InputReg)
+    // Unhandled operand.  Halt "fast" selection and bail.
+    return false;
+
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    updateValueMap(I, InputReg);
+    return true;
+  }
+
+  bool KillInputReg = false;
+  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32; we can't extract an i8 from a general register.
+    // First issue a copy to GR16_ABCD or GR32_ABCD.
+    const TargetRegisterClass *CopyRC =
+      (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
+    unsigned CopyReg = createResultReg(CopyRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
+    InputReg = CopyReg;
+    KillInputReg = true;
+  }
+
+  // Issue an extract_subreg.
+  unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
+                                                  InputReg, KillInputReg,
+                                                  X86::sub_8bit);
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+                                     X86AddressMode SrcAM, uint64_t Len) {
+
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 8 && i64Legal)
+      VT = MVT::i64;
+    else if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else
+      VT = MVT::i8;
+
+    unsigned Reg;
+    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
+    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
+    assert(RV && "Failed to emit load or store??");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    DestAM.Disp += Size;
+    SrcAM.Disp += Size;
+  }
+
+  return true;
+}
+
+bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  // FIXME: Handle more intrinsics.
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::convert_from_fp16:
+  case Intrinsic::convert_to_fp16: {
+    if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())
+      return false;
+
+    const Value *Op = II->getArgOperand(0);
+    unsigned InputReg = getRegForValue(Op);
+    if (InputReg == 0)
+      return false;
+
+    // F16C only allows converting from float to half and from half to float.
+    bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;
+    if (IsFloatToHalf) {
+      if (!Op->getType()->isFloatTy())
+        return false;
+    } else {
+      if (!II->getType()->isFloatTy())
+        return false;
+    }
+
+    unsigned ResultReg = 0;
+    const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);
+    if (IsFloatToHalf) {
+      // 'InputReg' is implicitly promoted from register class FR32 to
+      // register class VR128 by method 'constrainOperandRegClass' which is
+      // directly called by 'fastEmitInst_ri'.
+      // Instruction VCVTPS2PHrr takes an extra immediate operand which is
+      // used to provide rounding control: use MXCSR.RC, encoded as 0b100.
+      // It's consistent with the other FP instructions, which are usually
+      // controlled by MXCSR.
+      InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
+
+      // Move the lower 32-bits of ResultReg to another register of class GR32.
+      ResultReg = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::VMOVPDI2DIrr), ResultReg)
+          .addReg(InputReg, RegState::Kill);
+      
+      // The result value is in the lower 16-bits of ResultReg.
+      unsigned RegIdx = X86::sub_16bit;
+      ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
+    } else {
+      assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
+      // Explicitly sign-extend the input to 32-bit.
+      InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+                            /*Kill=*/false);
+
+      // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
+      InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
+                            InputReg, /*Kill=*/true);
+
+      InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+
+      // The result value is in the lower 32-bits of ResultReg.
+      // Emit an explicit copy from register class VR128 to register class FR32.
+      ResultReg = createResultReg(&X86::FR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(InputReg, RegState::Kill);
+    }
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::frameaddress: {
+    MachineFunction *MF = FuncInfo.MF;
+    if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())
+      return false;
+
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    unsigned Opc;
+    const TargetRegisterClass *RC = nullptr;
+
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Invalid result type for frameaddress.");
+    case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
+    case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
+    }
+
+    // This needs to be set before we call getPtrSizedFrameRegister, otherwise
+    // we get the wrong frame register.
+    MachineFrameInfo &MFI = MF->getFrameInfo();
+    MFI.setFrameAddressIsTaken(true);
+
+    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+    unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);
+    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+            (FrameReg == X86::EBP && VT == MVT::i32)) &&
+           "Invalid Frame Register!");
+
+    // Always make a copy of the frame register to to a vreg first, so that we
+    // never directly reference the frame register (the TwoAddressInstruction-
+    // Pass doesn't like that).
+    unsigned SrcReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
+
+    // Now recursively load from the frame address.
+    // movq (%rbp), %rax
+    // movq (%rax), %rax
+    // movq (%rax), %rax
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = createResultReg(RC);
+      addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                           TII.get(Opc), DestReg), SrcReg);
+      SrcReg = DestReg;
+    }
+
+    updateValueMap(II, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy: {
+    const MemCpyInst *MCI = cast<MemCpyInst>(II);
+    // Don't handle volatile or variable length memcpys.
+    if (MCI->isVolatile())
+      return false;
+
+    if (isa<ConstantInt>(MCI->getLength())) {
+      // Small memcpy's are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+      if (IsMemcpySmall(Len)) {
+        X86AddressMode DestAM, SrcAM;
+        if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI->getRawSource(), SrcAM))
+          return false;
+        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+        return true;
+      }
+    }
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
+      return false;
+
+    return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+
+    if (MSI->isVolatile())
+      return false;
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MSI->getDestAddressSpace() > 255)
+      return false;
+
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::stackprotector: {
+    // Emit code to store the stack guard onto the stack.
+    EVT PtrTy = TLI.getPointerTy(DL);
+
+    const Value *Op1 = II->getArgOperand(0); // The guard's value.
+    const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
+
+    MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
+
+    // Grab the frame index.
+    X86AddressMode AM;
+    if (!X86SelectAddress(Slot, AM)) return false;
+    if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+    return true;
+  }
+  case Intrinsic::dbg_declare: {
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+    X86AddressMode AM;
+    assert(DI->getAddress() && "Null address should be checked earlier!");
+    if (!X86SelectAddress(DI->getAddress(), AM))
+      return false;
+    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+    // FIXME may need to add RegState::Debug to any registers produced,
+    // although ESP/EBP should be the only ones at the moment.
+    assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+           "Expected inlined-at fields to agree");
+    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
+        .addImm(0)
+        .addMetadata(DI->getVariable())
+        .addMetadata(DI->getExpression());
+    return true;
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
+    return true;
+  }
+  case Intrinsic::sqrt: {
+    if (!Subtarget->hasSSE1())
+      return false;
+
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
+    // is not generated by FastISel yet.
+    // FIXME: Update this code once tablegen can handle it.
+    static const uint16_t SqrtOpc[2][2] = {
+      {X86::SQRTSSr, X86::VSQRTSSr},
+      {X86::SQRTSDr, X86::VSQRTSDr}
+    };
+    bool HasAVX = Subtarget->hasAVX();
+    unsigned Opc;
+    const TargetRegisterClass *RC;
+    switch (VT.SimpleTy) {
+    default: return false;
+    case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
+    case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
+    }
+
+    const Value *SrcVal = II->getArgOperand(0);
+    unsigned SrcReg = getRegForValue(SrcVal);
+
+    if (SrcReg == 0)
+      return false;
+
+    unsigned ImplicitDefReg = 0;
+    if (HasAVX) {
+      ImplicitDefReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+    }
+
+    unsigned ResultReg = createResultReg(RC);
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                  ResultReg);
+
+    if (ImplicitDefReg)
+      MIB.addReg(ImplicitDefReg);
+
+    MIB.addReg(SrcReg);
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics
+    // into add/sub/mul followed by either seto or setb.
+    const Function *Callee = II->getCalledFunction();
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+    assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&
+           Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&
+           "Overflow value expected to be an i1");
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    if (VT < MVT::i8 || VT > MVT::i64)
+      return false;
+
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
+
+    // Canonicalize immediate to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(II))
+      std::swap(LHS, RHS);
+
+    bool UseIncDec = false;
+    if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
+      UseIncDec = true;
+
+    unsigned BaseOpc, CondOpc;
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
+      CondOpc = X86::SETOr;
+      break;
+    case Intrinsic::uadd_with_overflow:
+      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
+    case Intrinsic::ssub_with_overflow:
+      BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
+      CondOpc = X86::SETOr;
+      break;
+    case Intrinsic::usub_with_overflow:
+      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
+    case Intrinsic::smul_with_overflow:
+      BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
+    case Intrinsic::umul_with_overflow:
+      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
+    }
+
+    unsigned LHSReg = getRegForValue(LHS);
+    if (LHSReg == 0)
+      return false;
+    bool LHSIsKill = hasTrivialKill(LHS);
+
+    unsigned ResultReg = 0;
+    // Check if we have an immediate version.
+    if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+      static const uint16_t Opc[2][4] = {
+        { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
+        { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
+      };
+
+      if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
+        ResultReg = createResultReg(TLI.getRegClassFor(VT));
+        bool IsDec = BaseOpc == X86ISD::DEC;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+          .addReg(LHSReg, getKillRegState(LHSIsKill));
+      } else
+        ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+                                CI->getZExtValue());
+    }
+
+    unsigned RHSReg;
+    bool RHSIsKill;
+    if (!ResultReg) {
+      RHSReg = getRegForValue(RHS);
+      if (RHSReg == 0)
+        return false;
+      RHSIsKill = hasTrivialKill(RHS);
+      ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+                              RHSIsKill);
+    }
+
+    // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
+    // it manually.
+    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
+      static const uint16_t MULOpc[] =
+        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
+      static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+      // First copy the first operand into RAX, which is an implicit input to
+      // the X86::MUL*r instruction.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
+        .addReg(LHSReg, getKillRegState(LHSIsKill));
+      ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
+    } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
+      static const uint16_t MULOpc[] =
+        { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
+      if (VT == MVT::i8) {
+        // Copy the first operand into AL, which is an implicit input to the
+        // X86::IMUL8r instruction.
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+               TII.get(TargetOpcode::COPY), X86::AL)
+          .addReg(LHSReg, getKillRegState(LHSIsKill));
+        ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+                                   RHSIsKill);
+      } else
+        ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+                                    TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
+                                    RHSReg, RHSIsKill);
+    }
+
+    if (!ResultReg)
+      return false;
+
+    // Assign to a GPR since the overflow return value is lowered to a SETcc.
+    unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
+    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
+            ResultReg2);
+
+    updateValueMap(II, ResultReg, 2);
+    return true;
+  }
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64: {
+    bool IsInputDouble;
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic.");
+    case Intrinsic::x86_sse_cvttss2si:
+    case Intrinsic::x86_sse_cvttss2si64:
+      if (!Subtarget->hasSSE1())
+        return false;
+      IsInputDouble = false;
+      break;
+    case Intrinsic::x86_sse2_cvttsd2si:
+    case Intrinsic::x86_sse2_cvttsd2si64:
+      if (!Subtarget->hasSSE2())
+        return false;
+      IsInputDouble = true;
+      break;
+    }
+
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    static const uint16_t CvtOpc[2][2][2] = {
+      { { X86::CVTTSS2SIrr,   X86::VCVTTSS2SIrr   },
+        { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr }  },
+      { { X86::CVTTSD2SIrr,   X86::VCVTTSD2SIrr   },
+        { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr }  }
+    };
+    bool HasAVX = Subtarget->hasAVX();
+    unsigned Opc;
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected result type.");
+    case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
+    case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
+    }
+
+    // Check if we can fold insertelement instructions into the convert.
+    const Value *Op = II->getArgOperand(0);
+    while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
+      const Value *Index = IE->getOperand(2);
+      if (!isa<ConstantInt>(Index))
+        break;
+      unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+
+      if (Idx == 0) {
+        Op = IE->getOperand(1);
+        break;
+      }
+      Op = IE->getOperand(0);
+    }
+
+    unsigned Reg = getRegForValue(Op);
+    if (Reg == 0)
+      return false;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(Reg);
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  }
+}
+
+bool X86FastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C)
+    return false;
+
+  if (Subtarget->isCallingConvWin64(CC))
+    return false;
+
+  if (!Subtarget->is64Bit())
+    return false;
+
+  // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
+  unsigned GPRCnt = 0;
+  unsigned FPRCnt = 0;
+  unsigned Idx = 0;
+  for (auto const &Arg : F->args()) {
+    // The first argument is at index 1.
+    ++Idx;
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(DL, ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i32:
+    case MVT::i64:
+      ++GPRCnt;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (!Subtarget->hasSSE1())
+        return false;
+      ++FPRCnt;
+      break;
+    }
+
+    if (GPRCnt > 6)
+      return false;
+
+    if (FPRCnt > 8)
+      return false;
+  }
+
+  static const MCPhysReg GPR32ArgRegs[] = {
+    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+  };
+  static const MCPhysReg GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+  };
+  static const MCPhysReg XMMArgRegs[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+
+  unsigned GPRIdx = 0;
+  unsigned FPRIdx = 0;
+  for (auto const &Arg : F->args()) {
+    MVT VT = TLI.getSimpleValueType(DL, Arg.getType());
+    const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+    unsigned SrcReg;
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type.");
+    case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
+    case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
+    case MVT::f32: LLVM_FALLTHROUGH;
+    case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
+    }
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(DstReg, getKillRegState(true));
+    updateValueMap(&Arg, ResultReg);
+  }
+  return true;
+}
+
+static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
+                                                  CallingConv::ID CC,
+                                                  ImmutableCallSite *CS) {
+  if (Subtarget->is64Bit())
+    return 0;
+  if (Subtarget->getTargetTriple().isOSMSVCRT())
+    return 0;
+  if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+      CC == CallingConv::HiPE)
+    return 0;
+
+  if (CS)
+    if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) ||
+        CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU())
+      return 0;
+
+  return 4;
+}
+
+bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  auto &OutVals       = CLI.OutVals;
+  auto &OutFlags      = CLI.OutFlags;
+  auto &OutRegs       = CLI.OutRegs;
+  auto &Ins           = CLI.Ins;
+  auto &InRegs        = CLI.InRegs;
+  CallingConv::ID CC  = CLI.CallConv;
+  bool &IsTailCall    = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  MCSymbol *Symbol = CLI.Symbol;
+
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsWin64        = Subtarget->isCallingConvWin64(CC);
+
+  // Handle only C, fastcc, and webkit_js calling conventions for now.
+  switch (CC) {
+  default: return false;
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::WebKit_JS:
+  case CallingConv::Swift:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_ThisCall:
+  case CallingConv::X86_64_Win64:
+  case CallingConv::X86_64_SysV:
+    break;
+  }
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Don't know how to handle Win64 varargs yet.  Nothing special needed for
+  // x86-32. Special handling for x86-64 is implemented.
+  if (IsVarArg && IsWin64)
+    return false;
+
+  // Don't know about inalloca yet.
+  if (CLI.CS && CLI.CS->hasInAllocaArgument())
+    return false;
+
+  for (auto Flag : CLI.OutFlags)
+    if (Flag.isSwiftError())
+      return false;
+
+  SmallVector<MVT, 16> OutVTs;
+  SmallVector<unsigned, 16> ArgRegs;
+
+  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+  // instruction. This is safe because it is common to all FastISel supported
+  // calling conventions on x86.
+  for (int i = 0, e = OutVals.size(); i != e; ++i) {
+    Value *&Val = OutVals[i];
+    ISD::ArgFlagsTy Flags = OutFlags[i];
+    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+      if (CI->getBitWidth() < 32) {
+        if (Flags.isSExt())
+          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
+        else
+          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
+      }
+    }
+
+    // Passing bools around ends up doing a trunc to i1 and passing it.
+    // Codegen this as an argument + "and 1".
+    MVT VT;
+    auto *TI = dyn_cast<TruncInst>(Val);
+    unsigned ResultReg;
+    if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
+              (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+              TI->hasOneUse()) {
+      Value *PrevVal = TI->getOperand(0);
+      ResultReg = getRegForValue(PrevVal);
+
+      if (!ResultReg)
+        return false;
+
+      if (!isTypeLegal(PrevVal->getType(), VT))
+        return false;
+
+      ResultReg =
+        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
+    } else {
+      if (!isTypeLegal(Val->getType(), VT))
+        return false;
+      ResultReg = getRegForValue(Val);
+    }
+
+    if (!ResultReg)
+      return false;
+
+    ArgRegs.push_back(ResultReg);
+    OutVTs.push_back(VT);
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
+    .addImm(NumBytes).addImm(0);
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign const &VA = ArgLocs[i];
+    const Value *ArgVal = OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (ArgVT == MVT::x86mmx)
+      return false;
+
+    unsigned ArgReg = ArgRegs[VA.getValNo()];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+
+      if (ArgVT == MVT::i1)
+        return false;
+
+      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::ZExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+
+      // Handle zero-extension from i1 to i8, which is common.
+      if (ArgVT == MVT::i1) {
+        // Set the high bits to zero.
+        ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
+        ArgVT = MVT::i8;
+
+        if (ArgReg == 0)
+          return false;
+      }
+
+      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::AExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
+
+      assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::BCvt: {
+      ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+                          /*TODO: Kill=*/false);
+      assert(ArgReg && "Failed to emit a bitcast!");
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::VExt:
+      // VExt has not been implemented, so this should be impossible to reach
+      // for now.  However, fallback to Selection DAG isel once implemented.
+      return false;
+    case CCValAssign::AExtUpper:
+    case CCValAssign::SExtUpper:
+    case CCValAssign::ZExtUpper:
+    case CCValAssign::FPExt:
+      llvm_unreachable("Unexpected loc info!");
+    case CCValAssign::Indirect:
+      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+      // support this.
+      return false;
+    }
+
+    if (VA.isRegLoc()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      OutRegs.push_back(VA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      X86AddressMode AM;
+      AM.Base.Reg = RegInfo->getStackRegister();
+      AM.Disp = LocMemOffset;
+      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+          MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
+          MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+      if (Flags.isByVal()) {
+        X86AddressMode SrcAM;
+        SrcAM.Base.Reg = ArgReg;
+        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+          return false;
+      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+        // If this is a really simple value, emit this with the Value* version
+        // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // as it can cause us to reevaluate the argument.
+        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
+          return false;
+      } else {
+        bool ValIsKill = hasTrivialKill(ArgVal);
+        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
+          return false;
+      }
+    }
+  }
+
+  // ELF / PIC requires GOT in the EBX register before function calls via PLT
+  // GOT pointer.
+  if (Subtarget->isPICStyleGOT()) {
+    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
+  }
+
+  if (Is64Bit && IsVarArg && !IsWin64) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an ubound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const MCPhysReg XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
+            X86::AL).addImm(NumXMMRegs);
+  }
+
+  // Materialize callee address in a register. FIXME: GV address can be
+  // handled with a CALLpcrel32 instead.
+  X86AddressMode CalleeAM;
+  if (!X86SelectCallAddress(Callee, CalleeAM))
+    return false;
+
+  unsigned CalleeOp = 0;
+  const GlobalValue *GV = nullptr;
+  if (CalleeAM.GV != nullptr) {
+    GV = CalleeAM.GV;
+  } else if (CalleeAM.Base.Reg != 0) {
+    CalleeOp = CalleeAM.Base.Reg;
+  } else
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (CalleeOp) {
+    // Register-indirect call.
+    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
+      .addReg(CalleeOp);
+  } else {
+    // Direct call.
+    assert(GV && "Not a direct call");
+    unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+    // See if we need any target-specific flags on the GV operand.
+    unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
+    // Ignore NonLazyBind attribute in FastISel
+    if (OpFlags == X86II::MO_GOTPCREL)
+      OpFlags = 0;
+
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+    if (Symbol)
+      MIB.addSym(Symbol, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
+  }
+
+  // Add a register mask operand representing the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (Subtarget->isPICStyleGOT())
+    MIB.addReg(X86::EBX, RegState::Implicit);
+
+  if (Is64Bit && IsVarArg && !IsWin64)
+    MIB.addReg(X86::AL, RegState::Implicit);
+
+  // Add implicit physical register uses to the call.
+  for (auto Reg : OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
+
+  // Issue CALLSEQ_END
+  unsigned NumBytesForCalleeToPop =
+      X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
+                       TM.Options.GuaranteedTailCallOpt)
+          ? NumBytes // Callee pops everything.
+          : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+    .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
+
+  // Now handle call return values.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
+                    CLI.RetTy->getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
+    unsigned CopyReg = ResultReg + i;
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      CopyVT = MVT::f80;
+      CopyReg = createResultReg(&X86::RFP80RegClass);
+    }
+
+    // Copy out the result.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+    InRegs.push_back(VA.getLocReg());
+
+    // Round the f80 to the right size, which also moves it to the appropriate
+    // xmm register. This is accomplished by storing the f80 value in memory
+    // and then loading it back.
+    if (CopyVT != VA.getValVT()) {
+      EVT ResVT = VA.getValVT();
+      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+      unsigned MemSize = ResVT.getSizeInBits()/8;
+      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc)), FI)
+        .addReg(CopyReg);
+      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                TII.get(Opc), ResultReg + i), FI);
+    }
+  }
+
+  CLI.ResultReg = ResultReg;
+  CLI.NumResultRegs = RVLocs.size();
+  CLI.Call = MIB;
+
+  return true;
+}
+
+bool
+X86FastISel::fastSelectInstruction(const Instruction *I)  {
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::Load:
+    return X86SelectLoad(I);
+  case Instruction::Store:
+    return X86SelectStore(I);
+  case Instruction::Ret:
+    return X86SelectRet(I);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return X86SelectCmp(I);
+  case Instruction::ZExt:
+    return X86SelectZExt(I);
+  case Instruction::Br:
+    return X86SelectBranch(I);
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::Shl:
+    return X86SelectShift(I);
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    return X86SelectDivRem(I);
+  case Instruction::Select:
+    return X86SelectSelect(I);
+  case Instruction::Trunc:
+    return X86SelectTrunc(I);
+  case Instruction::FPExt:
+    return X86SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return X86SelectFPTrunc(I);
+  case Instruction::SIToFP:
+    return X86SelectSIToFP(I);
+  case Instruction::IntToPtr: // Deliberate fall-through.
+  case Instruction::PtrToInt: {
+    EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(DL, I->getType());
+    if (DstVT.bitsGT(SrcVT))
+      return X86SelectZExt(I);
+    if (DstVT.bitsLT(SrcVT))
+      return X86SelectTrunc(I);
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0) return false;
+    updateValueMap(I, Reg);
+    return true;
+  }
+  case Instruction::BitCast: {
+    // Select SSE2/AVX bitcasts between 128/256 bit vector types.
+    if (!Subtarget->hasSSE2())
+      return false;
+
+    EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(DL, I->getType());
+
+    if (!SrcVT.isSimple() || !DstVT.isSimple())
+      return false;
+
+    MVT SVT = SrcVT.getSimpleVT();
+    MVT DVT = DstVT.getSimpleVT();
+
+    if (!SVT.is128BitVector() &&
+        !(Subtarget->hasAVX() && SVT.is256BitVector()) &&
+        !(Subtarget->hasAVX512() && SVT.is512BitVector() &&
+          (Subtarget->hasBWI() || (SVT.getScalarSizeInBits() >= 32 &&
+                                   DVT.getScalarSizeInBits() >= 32))))
+      return false;
+
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0)
+      return false;
+      
+    // No instruction is needed for conversion. Reuse the register used by
+    // the fist operand.
+    updateValueMap(I, Reg);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
+  if (VT > MVT::i64)
+    return 0;
+
+  uint64_t Imm = CI->getZExtValue();
+  if (Imm == 0) {
+    unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type");
+    case MVT::i1:
+    case MVT::i8:
+      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+                                        X86::sub_8bit);
+    case MVT::i16:
+      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
+                                        X86::sub_16bit);
+    case MVT::i32:
+      return SrcReg;
+    case MVT::i64: {
+      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+      return ResultReg;
+    }
+    }
+  }
+
+  unsigned Opc = 0;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type");
+  case MVT::i1:  VT = MVT::i8;       LLVM_FALLTHROUGH;
+  case MVT::i8:  Opc = X86::MOV8ri;  break;
+  case MVT::i16: Opc = X86::MOV16ri; break;
+  case MVT::i32: Opc = X86::MOV32ri; break;
+  case MVT::i64: {
+    if (isUInt<32>(Imm))
+      Opc = X86::MOV32ri;
+    else if (isInt<32>(Imm))
+      Opc = X86::MOV64ri32;
+    else
+      Opc = X86::MOV64ri;
+    break;
+  }
+  }
+  if (VT == MVT::i64 && Opc == X86::MOV32ri) {
+    unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
+    unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+      .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+    return ResultReg;
+  }
+  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+}
+
+unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  if (CFP->isNullValue())
+    return fastMaterializeFloatZero(CFP);
+
+  // Can't handle alternate code models yet.
+  CodeModel::Model CM = TM.getCodeModel();
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
+    return 0;
+
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // Alignment of vector types. FIXME!
+    Align = DL.getTypeAllocSize(CFP->getType());
+  }
+
+  // x86-32 PIC requires a PIC base register for constant pools.
+  unsigned PICBase = 0;
+  unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
+  if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  else if (OpFlag == X86II::MO_GOTOFF)
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  else if (Subtarget->is64Bit() && TM.getCodeModel() == CodeModel::Small)
+    PICBase = X86::RIP;
+
+  // Create the load from the constant pool.
+  unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
+  unsigned ResultReg = createResultReg(RC);
+
+  if (CM == CodeModel::Large) {
+    unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+            AddrReg)
+      .addConstantPoolIndex(CPI, 0, OpFlag);
+    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                      TII.get(Opc), ResultReg);
+    addDirectMem(MIB, AddrReg);
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getConstantPool(*FuncInfo.MF),
+        MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+    return ResultReg;
+  }
+
+  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                   TII.get(Opc), ResultReg),
+                           CPI, PICBase, OpFlag);
+  return ResultReg;
+}
+
+unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
+
+  // Materialize addresses with LEA/MOV instructions.
+  X86AddressMode AM;
+  if (X86SelectAddress(GV, AM)) {
+    // If the expression is just a basereg, then we're done, otherwise we need
+    // to emit an LEA.
+    if (AM.BaseType == X86AddressMode::RegBase &&
+        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
+      return AM.Base.Reg;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    if (TM.getRelocationModel() == Reloc::Static &&
+        TLI.getPointerTy(DL) == MVT::i64) {
+      // The displacement code could be more than 32 bits away so we need to use
+      // an instruction with a 64 bit immediate
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+              ResultReg)
+        .addGlobalAddress(GV);
+    } else {
+      unsigned Opc =
+          TLI.getPointerTy(DL) == MVT::i32
+              ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+              : X86::LEA64r;
+      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                             TII.get(Opc), ResultReg), AM);
+    }
+    return ResultReg;
+  }
+  return 0;
+}
+
+unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(DL, C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const auto *CI = dyn_cast<ConstantInt>(C))
+    return X86MaterializeInt(CI, VT);
+  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return X86MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return X86MaterializeGV(GV, VT);
+
+  return 0;
+}
+
+unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
+  // Fail on dynamic allocas. At this point, getRegForValue has already
+  // checked its CSE maps, so if we're here trying to handle a dynamic
+  // alloca, we're not going to succeed. X86SelectAddress has a
+  // check for dynamic allocas, because it's called directly from
+  // various places, but targetMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddrss, and targetMaterializeAlloca.
+  if (!FuncInfo.StaticAllocaMap.count(C))
+    return 0;
+  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(C, AM))
+    return 0;
+  unsigned Opc =
+      TLI.getPointerTy(DL) == MVT::i32
+          ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
+          : X86::LEA64r;
+  const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+  unsigned ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                         TII.get(Opc), ResultReg), AM);
+  return ResultReg;
+}
+
+unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return 0;
+
+  // Get opcode and regclass for the given zero.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = X86::FsFLD0SS;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp032;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = X86::FsFLD0SD;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp064;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
+  return ResultReg;
+}
+
+
+bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  const Value *Ptr = LI->getPointerOperand();
+  X86AddressMode AM;
+  if (!X86SelectAddress(Ptr, AM))
+    return false;
+
+  const X86InstrInfo &XII = (const X86InstrInfo &)TII;
+
+  unsigned Size = DL.getTypeAllocSize(LI->getType());
+  unsigned Alignment = LI->getAlignment();
+
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = DL.getABITypeAlignment(LI->getType());
+
+  SmallVector<MachineOperand, 8> AddrOps;
+  AM.getFullAddress(AddrOps);
+
+  MachineInstr *Result = XII.foldMemoryOperandImpl(
+      *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+      /*AllowCommute=*/true);
+  if (!Result)
+    return false;
+
+  // The index register could be in the wrong register class.  Unfortunately,
+  // foldMemoryOperandImpl could have commuted the instruction so its not enough
+  // to just look at OpNo + the offset to the index reg.  We actually need to
+  // scan the instruction to find the index reg and see if its the correct reg
+  // class.
+  unsigned OperandNo = 0;
+  for (MachineInstr::mop_iterator I = Result->operands_begin(),
+       E = Result->operands_end(); I != E; ++I, ++OperandNo) {
+    MachineOperand &MO = *I;
+    if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
+      continue;
+    // Found the index reg, now try to rewrite it.
+    unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
+                                                 MO.getReg(), OperandNo);
+    if (IndexReg == MO.getReg())
+      continue;
+    MO.setReg(IndexReg);
+  }
+
+  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
+  MI->eraseFromParent();
+  return true;
+}
+
+unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
+                                        const TargetRegisterClass *RC,
+                                        unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill,
+                                        unsigned Op2, bool Op2IsKill,
+                                        unsigned Op3, bool Op3IsKill) {
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  unsigned ResultReg = createResultReg(RC);
+  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+  Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 3);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill))
+        .addReg(Op3, getKillRegState(Op3IsKill));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+
+namespace llvm {
+  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    return new X86FastISel(funcInfo, libInfo);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
new file mode 100644
index 000000000000..8bde4bf98d66
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -0,0 +1,367 @@
+//===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines the pass that looks through the machine instructions
+/// late in the compilation, and finds byte or word instructions that
+/// can be profitably replaced with 32 bit instructions that give equivalent
+/// results for the bits of the results that are used. There are two possible
+/// reasons to do this.
+///
+/// One reason is to avoid false-dependences on the upper portions
+/// of the registers.  Only instructions that have a destination register
+/// which is not in any of the source registers can be affected by this.
+/// Any instruction where one of the source registers is also the destination
+/// register is unaffected, because it has a true dependence on the source
+/// register already.  So, this consideration primarily affects load
+/// instructions and register-to-register moves.  It would
+/// seem like cmov(s) would also be affected, but because of the way cmov is
+/// really implemented by most machines as reading both the destination and
+/// and source regsters, and then "merging" the two based on a condition,
+/// it really already should be considered as having a true dependence on the
+/// destination register as well.
+///
+/// The other reason to do this is for potential code size savings.  Word
+/// operations need an extra override byte compared to their 32 bit
+/// versions. So this can convert many word operations to their larger
+/// size, saving a byte in encoding. This could introduce partial register
+/// dependences where none existed however.  As an example take:
+///   orw  ax, $0x1000
+///   addw ax, $3
+/// now if this were to get transformed into
+///   orw  ax, $1000
+///   addl eax, $3
+/// because the addl encodes shorter than the addw, this would introduce
+/// a use of a register that was only partially written earlier.  On older
+/// Intel processors this can be quite a performance penalty, so this should
+/// probably only be done when it can be proven that a new partial dependence
+/// wouldn't be created, or when your know a newer processor is being
+/// targeted, or when optimizing for minimum code size.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
+#define FIXUPBW_NAME "x86-fixup-bw-insts"
+
+#define DEBUG_TYPE FIXUPBW_NAME
+
+// Option to allow this optimization pass to have fine-grained control.
+static cl::opt<bool>
+    FixupBWInsts("fixup-byte-word-insts",
+                 cl::desc("Change byte and word instructions to larger sizes"),
+                 cl::init(true), cl::Hidden);
+
+namespace {
+class FixupBWInstPass : public MachineFunctionPass {
+  /// Loop over all of the instructions in the basic block replacing applicable
+  /// byte or word instructions with better alternatives.
+  void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+  /// This sets the \p SuperDestReg to the 32 bit super reg of the original
+  /// destination register of the MachineInstr passed in. It returns true if
+  /// that super register is dead just prior to \p OrigMI, and false if not.
+  bool getSuperRegDestIfDead(MachineInstr *OrigMI,
+                             unsigned &SuperDestReg) const;
+
+  /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
+  /// register if it is safe to do so.  Return the replacement instruction if
+  /// OK, otherwise return nullptr.
+  MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const;
+
+  /// Change the MachineInstr \p MI into the equivalent 32-bit copy if it is
+  /// safe to do so.  Return the replacement instruction if OK, otherwise return
+  /// nullptr.
+  MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+
+  // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if
+  // possible.  Return the replacement instruction if OK, return nullptr
+  // otherwise. Set WasCandidate to true or false depending on whether the
+  // MI was a candidate for this sort of transformation.
+  MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
+                                bool &WasCandidate) const;
+public:
+  static char ID;
+
+  StringRef getPassName() const override { return FIXUPBW_DESC; }
+
+  FixupBWInstPass() : MachineFunctionPass(ID) {
+    initializeFixupBWInstPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
+                                       // guide some heuristics.
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// Loop over all of the basic blocks, replacing byte and word instructions by
+  /// equivalent 32 bit instructions where performance or code size can be
+  /// improved.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  MachineFunction *MF;
+
+  /// Machine instruction info used throughout the class.
+  const X86InstrInfo *TII;
+
+  /// Local member for function's OptForSize attribute.
+  bool OptForSize;
+
+  /// Machine loop info used for guiding some heruistics.
+  MachineLoopInfo *MLI;
+
+  /// Register Liveness information after the current instruction.
+  LivePhysRegs LiveRegs;
+};
+char FixupBWInstPass::ID = 0;
+}
+
+INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
+
+FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
+
+bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
+  if (!FixupBWInsts || skipFunction(*MF.getFunction()))
+    return false;
+
+  this->MF = &MF;
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  OptForSize = MF.getFunction()->optForSize();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  LiveRegs.init(TII->getRegisterInfo());
+
+  DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
+
+  // Process all basic blocks.
+  for (auto &MBB : MF)
+    processBasicBlock(MF, MBB);
+
+  DEBUG(dbgs() << "End X86FixupBWInsts\n";);
+
+  return true;
+}
+
+// TODO: This method of analysis can miss some legal cases, because the
+// super-register could be live into the address expression for a memory
+// reference for the instruction, and still be killed/last used by the
+// instruction. However, the existing query interfaces don't seem to
+// easily allow that to be checked.
+//
+// What we'd really like to know is whether after OrigMI, the
+// only portion of SuperDestReg that is alive is the portion that
+// was the destination register of OrigMI.
+bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
+                                            unsigned &SuperDestReg) const {
+  auto *TRI = &TII->getRegisterInfo();
+
+  unsigned OrigDestReg = OrigMI->getOperand(0).getReg();
+  SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
+
+  const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
+
+  // Make sure that the sub-register that this instruction has as its
+  // destination is the lowest order sub-register of the super-register.
+  // If it isn't, then the register isn't really dead even if the
+  // super-register is considered dead.
+  if (SubRegIdx == X86::sub_8bit_hi)
+    return false;
+
+  if (LiveRegs.contains(SuperDestReg))
+    return false;
+
+  if (SubRegIdx == X86::sub_8bit) {
+    // In the case of byte registers, we also have to check that the upper
+    // byte register is also dead. That is considered to be independent of
+    // whether the super-register is dead.
+    unsigned UpperByteReg =
+        getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
+
+    if (LiveRegs.contains(UpperByteReg))
+      return false;
+  }
+
+  return true;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
+                                              MachineInstr *MI) const {
+  unsigned NewDestReg;
+
+  // We are going to try to rewrite this load to a larger zero-extending
+  // load.  This is safe if all portions of the 32 bit super-register
+  // of the original destination register, except for the original destination
+  // register are dead. getSuperRegDestIfDead checks that.
+  if (!getSuperRegDestIfDead(MI, NewDestReg))
+    return nullptr;
+
+  // Safe to change the instruction.
+  MachineInstrBuilder MIB =
+      BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+  unsigned NumArgs = MI->getNumOperands();
+  for (unsigned i = 1; i < NumArgs; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
+  assert(MI->getNumExplicitOperands() == 2);
+  auto &OldDest = MI->getOperand(0);
+  auto &OldSrc = MI->getOperand(1);
+
+  unsigned NewDestReg;
+  if (!getSuperRegDestIfDead(MI, NewDestReg))
+    return nullptr;
+
+  unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
+
+  // This is only correct if we access the same subregister index: otherwise,
+  // we could try to replace "movb %ah, %al" with "movl %eax, %eax".
+  auto *TRI = &TII->getRegisterInfo();
+  if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
+      TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
+    return nullptr;
+
+  // Safe to change the instruction.
+  // Don't set src flags, as we don't know if we're also killing the superreg.
+  // However, the superregister might not be defined; make it explicit that
+  // we don't care about the higher bits by reading it as Undef, and adding
+  // an imp-use on the original subregister.
+  MachineInstrBuilder MIB =
+      BuildMI(*MF, MI->getDebugLoc(), TII->get(X86::MOV32rr), NewDestReg)
+          .addReg(NewSrcReg, RegState::Undef)
+          .addReg(OldSrc.getReg(), RegState::Implicit);
+
+  // Drop imp-defs/uses that would be redundant with the new def/use.
+  for (auto &Op : MI->implicit_operands())
+    if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
+      MIB.addOperand(Op);
+
+  return MIB;
+}
+
+MachineInstr *FixupBWInstPass::tryReplaceInstr(
+                  MachineInstr *MI, MachineBasicBlock &MBB,
+                  bool &WasCandidate) const {
+  MachineInstr *NewMI = nullptr;
+  WasCandidate = false;
+
+  // See if this is an instruction of the type we are currently looking for.
+  switch (MI->getOpcode()) {
+
+  case X86::MOV8rm:
+    // Only replace 8 bit loads with the zero extending versions if
+    // in an inner most loop and not optimizing for size. This takes
+    // an extra byte to encode, and provides limited performance upside.
+    if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
+      if (ML->begin() == ML->end() && !OptForSize) {
+        NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI);
+        WasCandidate = true;
+      }
+    }
+    break;
+
+  case X86::MOV16rm:
+    // Always try to replace 16 bit load with 32 bit zero extending.
+    // Code size is the same, and there is sometimes a perf advantage
+    // from eliminating a false dependence on the upper portion of
+    // the register.
+    NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI);
+    WasCandidate = true;
+    break;
+
+  case X86::MOV8rr:
+  case X86::MOV16rr:
+    // Always try to replace 8/16 bit copies with a 32 bit copy.
+    // Code size is either less (16) or equal (8), and there is sometimes a
+    // perf advantage from eliminating a false dependence on the upper portion
+    // of the register.
+    NewMI = tryReplaceCopy(MI);
+    WasCandidate = true;
+    break;
+
+  default:
+    // nothing to do here.
+    break;
+  }
+
+  return NewMI;
+}
+
+void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) {
+
+  // This algorithm doesn't delete the instructions it is replacing
+  // right away.  By leaving the existing instructions in place, the
+  // register liveness information doesn't change, and this makes the
+  // analysis that goes on be better than if the replaced instructions
+  // were immediately removed.
+  //
+  // This algorithm always creates a replacement instruction
+  // and notes that and the original in a data structure, until the
+  // whole BB has been analyzed.  This keeps the replacement instructions
+  // from making it seem as if the larger register might be live.
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements;
+
+  // Start computing liveness for this block. We iterate from the end to be able
+  // to update this for each instruction.
+  LiveRegs.clear();
+  // We run after PEI, so we need to AddPristinesAndCSRs.
+  LiveRegs.addLiveOuts(MBB);
+
+  bool WasCandidate = false;
+
+  for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
+    MachineInstr *MI = &*I;
+    
+    MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate);
+
+    // Add this to replacements if it was a candidate, even if NewMI is
+    // nullptr.  We will revisit that in a bit.
+    if (WasCandidate) {
+      MIReplacements.push_back(std::make_pair(MI, NewMI));
+    }
+
+    // We're done with this instruction, update liveness for the next one.
+    LiveRegs.stepBackward(*MI);
+  }
+
+  while (!MIReplacements.empty()) {
+    MachineInstr *MI = MIReplacements.back().first;
+    MachineInstr *NewMI = MIReplacements.back().second;
+    MIReplacements.pop_back();
+    if (NewMI) {
+      MBB.insert(MI, NewMI);
+      MBB.erase(MI);
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
new file mode 100644
index 000000000000..12095917ca30
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -0,0 +1,418 @@
+//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that finds instructions that can be
+// re-written as LEA instructions in order to reduce pipeline delays.
+// When optimizing for size it replaces suitable LEAs with INC or DEC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
+STATISTIC(NumLEAs, "Number of LEA instructions created");
+
+namespace {
+class FixupLEAPass : public MachineFunctionPass {
+  enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
+  static char ID;
+  /// \brief Loop over all of the instructions in the basic block
+  /// replacing applicable instructions with LEA instructions,
+  /// where appropriate.
+  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+
+  StringRef getPassName() const override { return "X86 LEA Fixup"; }
+
+  /// \brief Given a machine register, look for the instruction
+  /// which writes it in the current basic block. If found,
+  /// try to replace it with an equivalent LEA instruction.
+  /// If replacement succeeds, then also process the newly created
+  /// instruction.
+  void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
+                    MachineFunction::iterator MFI);
+
+  /// \brief Given a memory access or LEA instruction
+  /// whose address mode uses a base and/or index register, look for
+  /// an opportunity to replace the instruction which sets the base or index
+  /// register with an equivalent LEA instruction.
+  void processInstruction(MachineBasicBlock::iterator &I,
+                          MachineFunction::iterator MFI);
+
+  /// \brief Given a LEA instruction which is unprofitable
+  /// on Silvermont try to replace it with an equivalent ADD instruction
+  void processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI);
+
+  /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
+  /// and convert them to INC or DEC respectively.
+  bool fixupIncDec(MachineBasicBlock::iterator &I,
+                   MachineFunction::iterator MFI) const;
+
+  /// \brief Determine if an instruction references a machine register
+  /// and, if so, whether it reads or writes the register.
+  RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I);
+
+  /// \brief Step backwards through a basic block, looking
+  /// for an instruction which writes a register within
+  /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
+  MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
+                                              MachineBasicBlock::iterator &I,
+                                              MachineFunction::iterator MFI);
+
+  /// \brief if an instruction can be converted to an
+  /// equivalent LEA, insert the new instruction into the basic block
+  /// and return a pointer to it. Otherwise, return zero.
+  MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                   MachineBasicBlock::iterator &MBBI) const;
+
+public:
+  FixupLEAPass() : MachineFunctionPass(ID) {}
+
+  /// \brief Loop over all of the basic blocks,
+  /// replacing instructions by equivalent LEA instructions
+  /// if needed and when possible.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // This pass runs after regalloc and doesn't support VReg operands.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  MachineFunction *MF;
+  const X86InstrInfo *TII; // Machine instruction info.
+  bool OptIncDec;
+  bool OptLEA;
+};
+char FixupLEAPass::ID = 0;
+}
+
+MachineInstr *
+FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
+                                 MachineBasicBlock::iterator &MBBI) const {
+  MachineInstr &MI = *MBBI;
+  switch (MI.getOpcode()) {
+  case X86::MOV32rr:
+  case X86::MOV64rr: {
+    const MachineOperand &Src = MI.getOperand(1);
+    const MachineOperand &Dest = MI.getOperand(0);
+    MachineInstr *NewMI =
+        BuildMI(*MF, MI.getDebugLoc(),
+                TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
+                                                        : X86::LEA64r))
+            .addOperand(Dest)
+            .addOperand(Src)
+            .addImm(1)
+            .addReg(0)
+            .addImm(0)
+            .addReg(0);
+    MFI->insert(MBBI, NewMI); // Insert the new inst
+    return NewMI;
+  }
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8_DB:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri_DB:
+  case X86::ADD32ri8_DB:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    if (!MI.getOperand(2).isImm()) {
+      // convertToThreeAddress will call getImm()
+      // which requires isImm() to be true
+      return nullptr;
+    }
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+    if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg()) {
+      // if src1 != src2, then convertToThreeAddress will
+      // need to create a Virtual register, which we cannot do
+      // after register allocation.
+      return nullptr;
+    }
+  }
+  return TII->convertToThreeAddress(MFI, MI, nullptr);
+}
+
+FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
+
+bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
+  if (skipFunction(*Func.getFunction()))
+    return false;
+
+  MF = &Func;
+  const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+  OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
+  OptLEA = ST.LEAusesAG() || ST.slowLEA();
+
+  if (!OptLEA && !OptIncDec)
+    return false;
+
+  TII = ST.getInstrInfo();
+
+  DEBUG(dbgs() << "Start X86FixupLEAs\n";);
+  // Process all basic blocks.
+  for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
+    processBasicBlock(Func, I);
+  DEBUG(dbgs() << "End X86FixupLEAs\n";);
+
+  return true;
+}
+
+FixupLEAPass::RegUsageState
+FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) {
+  RegUsageState RegUsage = RU_NotUsed;
+  MachineInstr &MI = *I;
+
+  for (unsigned int i = 0; i < MI.getNumOperands(); ++i) {
+    MachineOperand &opnd = MI.getOperand(i);
+    if (opnd.isReg() && opnd.getReg() == p.getReg()) {
+      if (opnd.isDef())
+        return RU_Write;
+      RegUsage = RU_Read;
+    }
+  }
+  return RegUsage;
+}
+
+/// getPreviousInstr - Given a reference to an instruction in a basic
+/// block, return a reference to the previous instruction in the block,
+/// wrapping around to the last instruction of the block if the block
+/// branches to itself.
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I,
+                                    MachineFunction::iterator MFI) {
+  if (I == MFI->begin()) {
+    if (MFI->isPredecessor(&*MFI)) {
+      I = --MFI->end();
+      return true;
+    } else
+      return false;
+  }
+  --I;
+  return true;
+}
+
+MachineBasicBlock::iterator
+FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
+                              MachineFunction::iterator MFI) {
+  int InstrDistance = 1;
+  MachineBasicBlock::iterator CurInst;
+  static const int INSTR_DISTANCE_THRESHOLD = 5;
+
+  CurInst = I;
+  bool Found;
+  Found = getPreviousInstr(CurInst, MFI);
+  while (Found && I != CurInst) {
+    if (CurInst->isCall() || CurInst->isInlineAsm())
+      break;
+    if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
+      break; // too far back to make a difference
+    if (usesRegister(p, CurInst) == RU_Write) {
+      return CurInst;
+    }
+    InstrDistance += TII->getInstrLatency(
+        MF->getSubtarget().getInstrItineraryData(), *CurInst);
+    Found = getPreviousInstr(CurInst, MFI);
+  }
+  return MachineBasicBlock::iterator();
+}
+
+static inline bool isLEA(const int opcode) {
+  return opcode == X86::LEA16r || opcode == X86::LEA32r ||
+         opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+}
+
+/// isLEASimpleIncOrDec - Does this LEA have one these forms:
+/// lea  %reg, 1(%reg)
+/// lea  %reg, -1(%reg)
+static inline bool isLEASimpleIncOrDec(MachineInstr &LEA) {
+  unsigned SrcReg = LEA.getOperand(1 + X86::AddrBaseReg).getReg();
+  unsigned DstReg = LEA.getOperand(0).getReg();
+  unsigned AddrDispOp = 1 + X86::AddrDisp;
+  return SrcReg == DstReg &&
+         LEA.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+         LEA.getOperand(1 + X86::AddrSegmentReg).getReg() == 0 &&
+         LEA.getOperand(AddrDispOp).isImm() &&
+         (LEA.getOperand(AddrDispOp).getImm() == 1 ||
+          LEA.getOperand(AddrDispOp).getImm() == -1);
+}
+
+bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
+                               MachineFunction::iterator MFI) const {
+  MachineInstr &MI = *I;
+  int Opcode = MI.getOpcode();
+  if (!isLEA(Opcode))
+    return false;
+
+  if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) {
+    int NewOpcode;
+    bool isINC = MI.getOperand(4).getImm() == 1;
+    switch (Opcode) {
+    case X86::LEA16r:
+      NewOpcode = isINC ? X86::INC16r : X86::DEC16r;
+      break;
+    case X86::LEA32r:
+    case X86::LEA64_32r:
+      NewOpcode = isINC ? X86::INC32r : X86::DEC32r;
+      break;
+    case X86::LEA64r:
+      NewOpcode = isINC ? X86::INC64r : X86::DEC64r;
+      break;
+    }
+
+    MachineInstr *NewMI =
+        BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
+            .addOperand(MI.getOperand(0))
+            .addOperand(MI.getOperand(1));
+    MFI->erase(I);
+    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+    return true;
+  }
+  return false;
+}
+
+void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I,
+                                      MachineFunction::iterator MFI) {
+  // Process a load, store, or LEA instruction.
+  MachineInstr &MI = *I;
+  const MCInstrDesc &Desc = MI.getDesc();
+  int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags);
+  if (AddrOffset >= 0) {
+    AddrOffset += X86II::getOperandBias(Desc);
+    MachineOperand &p = MI.getOperand(AddrOffset + X86::AddrBaseReg);
+    if (p.isReg() && p.getReg() != X86::ESP) {
+      seekLEAFixup(p, I, MFI);
+    }
+    MachineOperand &q = MI.getOperand(AddrOffset + X86::AddrIndexReg);
+    if (q.isReg() && q.getReg() != X86::ESP) {
+      seekLEAFixup(q, I, MFI);
+    }
+  }
+}
+
+void FixupLEAPass::seekLEAFixup(MachineOperand &p,
+                                MachineBasicBlock::iterator &I,
+                                MachineFunction::iterator MFI) {
+  MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
+  if (MBI != MachineBasicBlock::iterator()) {
+    MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI);
+    if (NewMI) {
+      ++NumLEAs;
+      DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
+      // now to replace with an equivalent LEA...
+      DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
+      MFI->erase(MBI);
+      MachineBasicBlock::iterator J =
+          static_cast<MachineBasicBlock::iterator>(NewMI);
+      processInstruction(J, MFI);
+    }
+  }
+}
+
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                            MachineFunction::iterator MFI) {
+  MachineInstr &MI = *I;
+  const int opcode = MI.getOpcode();
+  if (!isLEA(opcode))
+    return;
+  if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
+      !TII->isSafeToClobberEFLAGS(*MFI, I))
+    return;
+  const unsigned DstR = MI.getOperand(0).getReg();
+  const unsigned SrcR1 = MI.getOperand(1).getReg();
+  const unsigned SrcR2 = MI.getOperand(3).getReg();
+  if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+    return;
+  if (MI.getOperand(2).getImm() > 1)
+    return;
+  int addrr_opcode, addri_opcode;
+  switch (opcode) {
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  case X86::LEA16r:
+    addrr_opcode = X86::ADD16rr;
+    addri_opcode = X86::ADD16ri;
+    break;
+  case X86::LEA32r:
+    addrr_opcode = X86::ADD32rr;
+    addri_opcode = X86::ADD32ri;
+    break;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    addrr_opcode = X86::ADD64rr;
+    addri_opcode = X86::ADD64ri32;
+    break;
+  }
+  DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+  DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+  MachineInstr *NewMI = nullptr;
+  const MachineOperand &Dst = MI.getOperand(0);
+  // Make ADD instruction for two registers writing to LEA's destination
+  if (SrcR1 != 0 && SrcR2 != 0) {
+    const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
+    const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
+                .addOperand(Dst)
+                .addOperand(Src1)
+                .addOperand(Src2);
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  // Make ADD instruction for immediate
+  if (MI.getOperand(4).getImm() != 0) {
+    const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
+    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
+                .addOperand(Dst)
+                .addOperand(SrcR)
+                .addImm(MI.getOperand(4).getImm());
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  if (NewMI) {
+    MFI->erase(I);
+    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+  }
+}
+
+bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
+                                     MachineFunction::iterator MFI) {
+
+  for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+    if (OptIncDec)
+      if (fixupIncDec(I, MFI))
+        continue;
+
+    if (OptLEA) {
+      if (MF.getSubtarget<X86Subtarget>().isSLM())
+        processInstructionForSLM(I, MFI);
+      else
+        processInstruction(I, MFI);
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
new file mode 100644
index 000000000000..a86eb997635e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -0,0 +1,187 @@
+//===---- X86FixupSetCC.cpp - optimize usage of LEA instructions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that fixes zero-extension of setcc patterns.
+// X86 setcc instructions are modeled to have no input arguments, and a single
+// GR8 output argument. This is consistent with other similar instructions
+// (e.g. movb), but means it is impossible to directly generate a setcc into
+// the lower GR8 of a specified GR32.
+// This means that ISel must select (zext (setcc)) into something like
+// seta %al; movzbl %al, %eax.
+// Unfortunately, this can cause a stall due to the partial register write
+// performed by the setcc. Instead, we can use:
+// xor %eax, %eax; seta %al
+// This both avoids the stall, and encodes shorter.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-setcc"
+
+STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
+
+namespace {
+class X86FixupSetCCPass : public MachineFunctionPass {
+public:
+  X86FixupSetCCPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "X86 Fixup SetCC"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Find the preceding instruction that imp-defs eflags.
+  MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,
+                                MachineBasicBlock::reverse_iterator MI);
+
+  // Return true if MI imp-uses eflags.
+  bool impUsesFlags(MachineInstr *MI);
+
+  // Return true if this is the opcode of a SetCC instruction with a register
+  // output.
+  bool isSetCCr(unsigned Opode);
+
+  MachineRegisterInfo *MRI;
+  const X86InstrInfo *TII;
+
+  enum { SearchBound = 16 };
+
+  static char ID;
+};
+
+char X86FixupSetCCPass::ID = 0;
+}
+
+FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
+
+bool X86FixupSetCCPass::isSetCCr(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::SETOr:
+  case X86::SETNOr:
+  case X86::SETBr:
+  case X86::SETAEr:
+  case X86::SETEr:
+  case X86::SETNEr:
+  case X86::SETBEr:
+  case X86::SETAr:
+  case X86::SETSr:
+  case X86::SETNSr:
+  case X86::SETPr:
+  case X86::SETNPr:
+  case X86::SETLr:
+  case X86::SETGEr:
+  case X86::SETLEr:
+  case X86::SETGr:
+    return true;
+  }
+}
+
+// We expect the instruction *immediately* before the setcc to imp-def
+// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
+// scheduling, look backwards until we hit the beginning of the
+// basic-block, or a small bound (to avoid quadratic behavior).
+MachineInstr *
+X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
+                                   MachineBasicBlock::reverse_iterator MI) {
+  // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?
+  auto MBBStart = MBB->rend();
+  for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
+    for (auto &Op : MI->implicit_operands())
+      if ((Op.getReg() == X86::EFLAGS) && (Op.isDef()))
+        return &*MI;
+
+  return nullptr;
+}
+
+bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
+  for (auto &Op : MI->implicit_operands())
+    if ((Op.getReg() == X86::EFLAGS) && (Op.isUse()))
+      return true;
+
+  return false;
+}
+
+bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+
+  SmallVector<MachineInstr*, 4> ToErase;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // Find a setcc that is used by a zext.
+      // This doesn't have to be the only use, the transformation is safe
+      // regardless.
+      if (!isSetCCr(MI.getOpcode()))
+        continue;
+
+      MachineInstr *ZExt = nullptr;
+      for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg()))
+        if (Use.getOpcode() == X86::MOVZX32rr8)
+          ZExt = &Use;
+
+      if (!ZExt)
+        continue;
+
+      // Find the preceding instruction that imp-defs eflags.
+      MachineInstr *FlagsDefMI = findFlagsImpDef(
+          MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));
+      if (!FlagsDefMI)
+        continue;
+
+      // We'd like to put something that clobbers eflags directly before
+      // FlagsDefMI. This can't hurt anything after FlagsDefMI, because
+      // it, itself, by definition, clobbers eflags. But it may happen that
+      // FlagsDefMI also *uses* eflags, in which case the transformation is
+      // invalid.
+      if (impUsesFlags(FlagsDefMI))
+        continue;
+
+      ++NumSubstZexts;
+      Changed = true;
+
+      // On 32-bit, we need to be careful to force an ABCD register.
+      const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
+                                          ? &X86::GR32RegClass
+                                          : &X86::GR32_ABCDRegClass;
+      unsigned ZeroReg = MRI->createVirtualRegister(RC);
+      unsigned InsertReg = MRI->createVirtualRegister(RC);
+
+      // Initialize a register with 0. This must go before the eflags def
+      BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
+              ZeroReg);
+
+      // X86 setcc only takes an output GR8, so fake a GR32 input by inserting
+      // the setcc result into the low byte of the zeroed register.
+      BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
+              TII->get(X86::INSERT_SUBREG), InsertReg)
+          .addReg(ZeroReg)
+          .addReg(MI.getOperand(0).getReg())
+          .addImm(X86::sub_8bit);
+      MRI->replaceRegWith(ZExt->getOperand(0).getReg(), InsertReg);
+      ToErase.push_back(ZExt);
+    }
+  }
+
+  for (auto &I : ToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 000000000000..a5489b9aa8b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1696 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <bitset>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-codegen"
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  const unsigned ScratchFPReg = 7;
+
+  struct FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass(ID) {
+      initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+      // This is really only to keep valgrind quiet.
+      // The logic in isLive() is too much for it.
+      memset(Stack, 0, sizeof(Stack));
+      memset(RegMap, 0, sizeof(RegMap));
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<EdgeBundles>();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override { return "X86 FP Stackifier"; }
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+
+    // Two CFG edges are related if they leave the same block, or enter the same
+    // block. The transitive closure of an edge under this relation is a
+    // LiveBundle. It represents a set of CFG edges where the live FP stack
+    // registers must be allocated identically in the x87 stack.
+    //
+    // A LiveBundle is usually all the edges leaving a block, or all the edges
+    // entering a block, but it can contain more edges if critical edges are
+    // present.
+    //
+    // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+    // but the exact mapping of FP registers to stack slots is fixed later.
+    struct LiveBundle {
+      // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+      unsigned Mask;
+
+      // Number of pre-assigned live registers in FixStack. This is 0 when the
+      // stack order has not yet been fixed.
+      unsigned FixCount;
+
+      // Assigned stack order for live-in registers.
+      // FixStack[i] == getStackEntry(i) for all i < FixCount.
+      unsigned char FixStack[8];
+
+      LiveBundle() : Mask(0), FixCount(0) {}
+
+      // Have the live registers been assigned a stack order yet?
+      bool isFixed() const { return !Mask || FixCount; }
+    };
+
+    // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+    // with no live FP registers.
+    SmallVector<LiveBundle, 8> LiveBundles;
+
+    // The edge bundle analysis provides indices into the LiveBundles vector.
+    EdgeBundles *Bundles;
+
+    // Return a bitmask of FP registers in block's live-in list.
+    static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+      unsigned Mask = 0;
+      for (const auto &LI : MBB->liveins()) {
+        if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6)
+          continue;
+        Mask |= 1 << (LI.PhysReg - X86::FP0);
+      }
+      return Mask;
+    }
+
+    // Partition all the CFG edges into LiveBundles.
+    void bundleCFG(MachineFunction &MF);
+
+    MachineBasicBlock *MBB;     // Current basic block
+
+    // The hardware keeps track of how many FP registers are live, so we have
+    // to model that exactly. Usually, each live register corresponds to an
+    // FP<n> register, but when dealing with calls, returns, and inline
+    // assembly, it is sometimes necessary to have live scratch registers.
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned StackTop;          // The current top of the FP stack.
+
+    enum {
+      NumFPRegs = 8             // Including scratch pseudo-registers.
+    };
+
+    // For each live FP<n> register, point to its Stack[] entry.
+    // The first entries correspond to FP0-FP6, the rest are scratch registers
+    // used when we need slightly different live registers than what the
+    // register allocator thinks.
+    unsigned RegMap[NumFPRegs];
+
+    // Set up our stack model to match the incoming registers to MBB.
+    void setupBlockStack();
+
+    // Shuffle live registers to match the expectations of successor blocks.
+    void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+    }
+#endif
+
+    /// getSlot - Return the stack slot number a particular register number is
+    /// in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < NumFPRegs && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    /// isLive - Is RegNo currently live in the stack?
+    bool isLive(unsigned RegNo) const {
+      unsigned Slot = getSlot(RegNo);
+      return Slot < StackTop && Stack[Slot] == RegNo;
+    }
+
+    /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+    unsigned getStackEntry(unsigned STi) const {
+      if (STi >= StackTop)
+        report_fatal_error("Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    /// getSTReg - Return the X86::ST(i) register which contains the specified
+    /// FP<RegNo> register.
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack.
+    void pushReg(unsigned Reg) {
+      assert(Reg < NumFPRegs && "Register number out of range!");
+      if (StackTop >= 8)
+        report_fatal_error("Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    // popReg - Pop a register from the stack.
+    void popReg() {
+      if (StackTop == 0)
+        report_fatal_error("Cannot pop empty stack!");
+      RegMap[Stack[--StackTop]] = ~0;     // Update state
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      if (isAtTop(RegNo)) return;
+
+      unsigned STReg = getSTReg(RegNo);
+      unsigned RegOnTop = getStackEntry(0);
+
+      // Swap the slots the regs are in.
+      std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+      // Swap stack slot contents.
+      if (RegMap[RegOnTop] >= StackTop)
+        report_fatal_error("Access past stack top!");
+      std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processors version of the state.
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+      ++NumFXCH;
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg,
+                        MachineBasicBlock::iterator I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    /// popStackAfter - Pop the current value off of the top of the FP stack
+    /// after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    /// freeStackSlotAfter - Free the specified register from the register
+    /// stack, so that it is no longer in a register.  If the register is
+    /// currently at the top of the stack, we just pop the current instruction,
+    /// otherwise we store the current top-of-stack into the specified slot,
+    /// then pop the top of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+    /// instruction.
+    MachineBasicBlock::iterator
+    freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+    /// Adjust the live registers to be the set in Mask.
+    void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+    /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+    /// st(0), FP reg FixStack[1] is st(1) etc.
+    void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+                         MachineBasicBlock::iterator I);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    void handleCall(MachineBasicBlock::iterator &I);
+    void handleReturn(MachineBasicBlock::iterator &I);
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+    // Check if a COPY instruction is using FP registers.
+    static bool isFPCopy(MachineInstr &MI) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+
+      return X86::RFP80RegClass.contains(DstReg) ||
+        X86::RFP80RegClass.contains(SrcReg);
+    }
+
+    void setKillFlags(MachineBasicBlock &MBB) const;
+  };
+  char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isReg() && "Expected an FP register!");
+  unsigned Reg = MO.getReg();
+  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+  return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!");
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (unsigned i = 0; i <= 6; ++i)
+    if (!MRI.reg_nodbg_empty(X86::FP0 + i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  Bundles = &getAnalysis<EdgeBundles>();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  // Prepare cross-MBB liveness.
+  bundleCFG(MF);
+
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  df_iterator_default_set<MachineBasicBlock*> Processed;
+  MachineBasicBlock *Entry = &MF.front();
+
+  LiveBundle &Bundle =
+    LiveBundles[Bundles->getBundle(Entry->getNumber(), false)];
+  
+  // In regcall convention, some FP registers may not be passed through
+  // the stack, so they will need to be assigned to the stack first
+  if ((Entry->getParent()->getFunction()->getCallingConv() ==
+    CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
+    // In the register calling convention, up to one FP argument could be 
+    // saved in the first FP register.
+    // If bundle.mask is non-zero and Bundle.FixCount is zero, it means
+    // that the FP registers contain arguments.
+    // The actual value is passed in FP0.
+    // Here we fix the stack and mark FP0 as pre-assigned register.
+    assert((Bundle.Mask & 0xFE) == 0 &&
+      "Only FP0 could be passed as an argument");
+    Bundle.FixCount = 1;
+    Bundle.FixStack[0] = 0;
+  }
+
+  bool Changed = false;
+  for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
+    Changed |= processBasicBlock(MF, *BB);
+
+  // Process any unreachable blocks in arbitrary order now.
+  if (MF.size() != Processed.size())
+    for (MachineBasicBlock &BB : MF)
+      if (Processed.insert(&BB).second)
+        Changed |= processBasicBlock(MF, BB);
+
+  LiveBundles.clear();
+
+  return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+  assert(LiveBundles.empty() && "Stale data in LiveBundles");
+  LiveBundles.resize(Bundles->getNumBundles());
+
+  // Gather the actual live-in masks for all MBBs.
+  for (MachineBasicBlock &MBB : MF) {
+    const unsigned Mask = calcLiveInMask(&MBB);
+    if (!Mask)
+      continue;
+    // Update MBB ingoing bundle mask.
+    LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask;
+  }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  setKillFlags(BB);
+  setupBlockStack();
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr &MI = *I;
+    uint64_t Flags = MI.getDesc().TSFlags;
+
+    unsigned FPInstClass = Flags & X86II::FPTypeMask;
+    if (MI.isInlineAsm())
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isCopy() && isFPCopy(MI))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isImplicitDef() &&
+        X86::RFP80RegClass.contains(MI.getOperand(0).getReg()))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI.isCall())
+      FPInstClass = X86II::SpecialFP;
+
+    if (FPInstClass == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    MachineInstr *PrevMI = nullptr;
+    if (I != BB.begin())
+      PrevMI = &*std::prev(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DEBUG(dbgs() << "\nFPInst:\t" << MI);
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (FPInstClass) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: llvm_unreachable("Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      // Check if Reg is live on the stack. An inline-asm register operand that
+      // is in the clobber list and marked dead might not be live on the stack.
+      if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
+        DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    DEBUG({
+      MachineBasicBlock::iterator PrevI = PrevMI;
+      if (I == PrevI) {
+        dbgs() << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && std::prev(Start) != PrevI)
+          --Start;
+        dbgs() << "Inserted instructions:\n\t";
+        Start->print(dbgs());
+        while (++Start != std::next(I)) {
+        }
+      }
+      dumpStack();
+    });
+    (void)PrevMI;
+
+    Changed = true;
+  }
+
+  finishBlockStack();
+
+  return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+  DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+  StackTop = 0;
+  // Get the live-in bundle for MBB.
+  const LiveBundle &Bundle =
+    LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "Block has no FP live-ins.\n");
+    return;
+  }
+
+  // Depth-first iteration should ensure that we always have an assigned stack.
+  assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+  // Push the fixed live-in registers.
+  for (unsigned i = Bundle.FixCount; i > 0; --i) {
+    MBB->addLiveIn(X86::ST0+i-1);
+    DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+                 << unsigned(Bundle.FixStack[i-1]) << '\n');
+    pushReg(Bundle.FixStack[i-1]);
+  }
+
+  // Kill off unwanted live-ins. This can happen with a critical edge.
+  // FIXME: We could keep these live registers around as zombies. They may need
+  // to be revived at the end of a short block. It might save a few instrs.
+  adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+  DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+  // The RET handling below takes care of return blocks for us.
+  if (MBB->succ_empty())
+    return;
+
+  DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+
+  // Get MBB's live-out bundle.
+  unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+  LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+  // We may need to kill and define some registers to match successors.
+  // FIXME: This can probably be combined with the shuffle below.
+  MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+  adjustLiveRegs(Bundle.Mask, Term);
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "No live-outs.\n");
+    return;
+  }
+
+  // Has the stack order been fixed yet?
+  DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+  if (Bundle.isFixed()) {
+    DEBUG(dbgs() << "Shuffling stack to match.\n");
+    shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+  } else {
+    // Not fixed yet, we get to choose.
+    DEBUG(dbgs() << "Fixing stack order now.\n");
+    Bundle.FixCount = StackTop;
+    for (unsigned i = 0; i < StackTop; ++i)
+      Bundle.FixStack[i] = getStackEntry(i);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    uint16_t from;
+    uint16_t to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+                                                const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode);
+  if (I != Table.end() && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) &&       \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is an register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ABS_Fp80     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m64  , X86::ADD_F64m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CHS_Fp80     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp80  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F  },
+  { X86::CMOVB_Fp80   , X86::CMOVB_F  },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F  },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp80   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp80  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp80  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp80  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp80   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::COS_Fp80     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m64  , X86::DIV_F64m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m80  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m80  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m80  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp16m80  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp32m80  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::IST_Fp64m80  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp080     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp180     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp32m64   , X86::LD_F32m   },
+  { X86::LD_Fp32m80   , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::LD_Fp64m80   , X86::LD_F64m   },
+  { X86::LD_Fp80m     , X86::LD_F80m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m64  , X86::MUL_F64m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SIN_Fp80     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::SQRT_Fp80    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m64   , X86::ST_F64m   },
+  { X86::ST_FpP80m    , X86::ST_FP80m  },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m64  , X86::SUB_F64m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::TST_Fp80     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr80  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr80   , X86::UCOM_Fr   },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  int Opc = Lookup(OpcodeTable, Opcode);
+  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  const DebugLoc &dl = MI.getDebugLoc();
+  ASSERT_SORTED(PopTable);
+
+  popReg();
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, I->getOpcode());
+  if (Opcode != -1) {
+    I->setDesc(TII->get(Opcode));
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register.  If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add in an explicit xchg then pop.
+  //
+  I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
+  return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
+      .addReg(STReg)
+      .getInstr();
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+  unsigned Defs = Mask;
+  unsigned Kills = 0;
+  for (unsigned i = 0; i < StackTop; ++i) {
+    unsigned RegNo = Stack[i];
+    if (!(Defs & (1 << RegNo)))
+      // This register is live, but we don't want it.
+      Kills |= (1 << RegNo);
+    else
+      // We don't need to imp-def this live register.
+      Defs &= ~(1 << RegNo);
+  }
+  assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+  // Produce implicit-defs for free by using killed registers.
+  while (Kills && Defs) {
+    unsigned KReg = countTrailingZeros(Kills);
+    unsigned DReg = countTrailingZeros(Defs);
+    DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+    std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+    std::swap(RegMap[KReg], RegMap[DReg]);
+    Kills &= ~(1 << KReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Kill registers by popping.
+  if (Kills && I != MBB->begin()) {
+    MachineBasicBlock::iterator I2 = std::prev(I);
+    while (StackTop) {
+      unsigned KReg = getStackEntry(0);
+      if (!(Kills & (1 << KReg)))
+        break;
+      DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+      popStackAfter(I2);
+      Kills &= ~(1 << KReg);
+    }
+  }
+
+  // Manually kill the rest.
+  while (Kills) {
+    unsigned KReg = countTrailingZeros(Kills);
+    DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+    freeStackSlotBefore(I, KReg);
+    Kills &= ~(1 << KReg);
+  }
+
+  // Load zeros for all the imp-defs.
+  while(Defs) {
+    unsigned DReg = countTrailingZeros(Defs);
+    DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+    BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+    pushReg(DReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Now we should have the correct registers live.
+  DEBUG(dumpStack());
+  assert(StackTop == countPopulation(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+                          unsigned FixCount,
+                          MachineBasicBlock::iterator I) {
+  // Move items into place, starting from the desired stack bottom.
+  while (FixCount--) {
+    // Old register at position FixCount.
+    unsigned OldReg = getStackEntry(FixCount);
+    // Desired register at position FixCount.
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+  const MachineFunction* MF = I->getParent()->getParent();
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) {
+        assert(MO.isDef() && MO.isImplicit());
+      }
+
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = countTrailingOnes(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0
+  assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+  // Reset the FP Stack - It is required because of possible leftovers from
+  // passed arguments. The caller should assume that the FP stack is 
+  // returned empty (unless the callee returns values on FP stack).
+  while (StackTop > 0)
+    popReg();
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
+/// If RET has an FP register use operand, pass the first one in ST(0) and
+/// the second one in ST(1).
+void FPS::handleReturn(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+
+  // Find the register operands.
+  unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+  unsigned LiveMask = 0;
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+      continue;
+    // FP Register uses must be kills unless there are two uses of the same
+    // register, in which case only one will be a kill.
+    assert(Op.isUse() &&
+           (Op.isKill() ||                    // Marked kill.
+            getFPReg(Op) == FirstFPRegOp ||   // Second instance.
+            MI.killsRegister(Op.getReg())) && // Later use is marked kill.
+           "Ret only defs operands, and values aren't live beyond it");
+
+    if (FirstFPRegOp == ~0U)
+      FirstFPRegOp = getFPReg(Op);
+    else {
+      assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+      SecondFPRegOp = getFPReg(Op);
+    }
+    LiveMask |= (1 << getFPReg(Op));
+
+    // Remove the operand so that later passes don't see it.
+    MI.RemoveOperand(i);
+    --i;
+    --e;
+  }
+
+  // We may have been carrying spurious live-ins, so make sure only the
+  // returned registers are left live.
+  adjustLiveRegs(LiveMask, MI);
+  if (!LiveMask) return;  // Quick check to see if any are possible.
+
+  // There are only four possibilities here:
+  // 1) we are returning a single FP value.  In this case, it has to be in
+  //    ST(0) already, so just declare success by removing the value from the
+  //    FP Stack.
+  if (SecondFPRegOp == ~0U) {
+    // Assert that the top of stack contains the right FP register.
+    assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+           "Top of stack not the right register for RET!");
+
+    // Ok, everything is good, mark the value as not being on the stack
+    // anymore so that our assertion about the stack being empty at end of
+    // block doesn't fire.
+    StackTop = 0;
+    return;
+  }
+
+  // Otherwise, we are returning two values:
+  // 2) If returning the same value for both, we only have one thing in the FP
+  //    stack.  Consider:  RET FP1, FP1
+  if (StackTop == 1) {
+    assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+           "Stack misconfiguration for RET!");
+
+    // Duplicate the TOS so that we return it twice.  Just pick some other FPx
+    // register to hold it.
+    unsigned NewReg = ScratchFPReg;
+    duplicateToTop(FirstFPRegOp, NewReg, MI);
+    FirstFPRegOp = NewReg;
+  }
+
+  /// Okay we know we have two different FPx operands now:
+  assert(StackTop == 2 && "Must have two values live!");
+
+  /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+  ///    in ST(1).  In this case, emit an fxch.
+  if (getStackEntry(0) == SecondFPRegOp) {
+    assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+    moveToTop(FirstFPRegOp, MI);
+  }
+
+  /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+  /// ST(1).  Just remove both from our understanding of the stack and return.
+  assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+  assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+  StackTop = 0;
+}
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  unsigned DestReg = getFPReg(MI.getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI.RemoveOperand(0); // Remove the explicit ST(0) operand
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  unsigned NumOps = MI.getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI.getOperand(NumOps - 1));
+  bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
+
+  // FISTP64m is strange because there isn't a non-popping versions.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensure that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc && (MI.getOpcode() == X86::IST_Fp64m32 ||
+                    MI.getOpcode() == X86::ISTT_Fp16m32 ||
+                    MI.getOpcode() == X86::ISTT_Fp32m32 ||
+                    MI.getOpcode() == X86::ISTT_Fp64m32 ||
+                    MI.getOpcode() == X86::IST_Fp64m64 ||
+                    MI.getOpcode() == X86::ISTT_Fp16m64 ||
+                    MI.getOpcode() == X86::ISTT_Fp32m64 ||
+                    MI.getOpcode() == X86::ISTT_Fp64m64 ||
+                    MI.getOpcode() == X86::IST_Fp64m80 ||
+                    MI.getOpcode() == X86::ISTT_Fp16m80 ||
+                    MI.getOpcode() == X86::ISTT_Fp32m80 ||
+                    MI.getOpcode() == X86::ISTT_Fp64m80 ||
+                    MI.getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, ScratchFPReg, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI.RemoveOperand(NumOps - 1); // Remove explicit ST(0) operand
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  if (MI.getOpcode() == X86::IST_FP64m || MI.getOpcode() == X86::ISTT_FP16m ||
+      MI.getOpcode() == X86::ISTT_FP32m || MI.getOpcode() == X86::ISTT_FP64m ||
+      MI.getOpcode() == X86::ST_FP80m) {
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+#ifndef NDEBUG
+  unsigned NumOps = MI.getDesc().getNumOperands();
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI.getOperand(1));
+  bool KillsSrc = MI.killsRegister(X86::FP0 + Reg);
+
+  if (KillsSrc) {
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    if (StackTop == 0)
+      report_fatal_error("Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI.getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI.getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI.RemoveOperand(1); // Drop the source operand.
+  MI.RemoveOperand(0); // Drop the destination operand.
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::ADD_Fp80  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::DIV_Fp80  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::MUL_Fp80  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+  { X86::SUB_Fp80  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::DIV_Fp80  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+  { X86::SUB_Fp80  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp80  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp80  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::ADD_Fp80  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::DIV_Fp80  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::MUL_Fp80  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+  { X86::SUB_Fp80  , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr &MI = *I;
+
+  unsigned NumOperands = MI.getDesc().getNumOperands();
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI.getOperand(0));
+  unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+  unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+  bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+  bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+  DebugLoc dl = MI.getDebugLoc();
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
+  ArrayRef<TableEntry> InstTable;
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward)
+      InstTable = ForwardST0Table;
+    else
+      InstTable = ReverseST0Table;
+  } else {
+    if (isForward)
+      InstTable = ForwardSTiTable;
+    else
+      InstTable = ReverseSTiTable;
+  }
+
+  int Opcode = Lookup(InstTable, MI.getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(&*I++);
+  I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr &MI = *I;
+
+  unsigned NumOperands = MI.getDesc().getNumOperands();
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI.getOperand(NumOperands - 2));
+  unsigned Op1 = getFPReg(MI.getOperand(NumOperands - 1));
+  bool KillsOp0 = MI.killsRegister(X86::FP0 + Op0);
+  bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI.getOperand(0).setReg(getSTReg(Op1));
+  MI.RemoveOperand(1);
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+
+  unsigned Op0 = getFPReg(MI.getOperand(0));
+  unsigned Op1 = getFPReg(MI.getOperand(2));
+  bool KillsOp1 = MI.killsRegister(X86::FP0 + Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  MI.RemoveOperand(0);
+  MI.RemoveOperand(1);
+  MI.getOperand(0).setReg(getSTReg(Op1));
+  MI.setDesc(TII->get(getConcreteOpcode(MI.getOpcode())));
+
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
+  MachineInstr &MI = *Inst;
+
+  if (MI.isCall()) {
+    handleCall(Inst);
+    return;
+  }
+
+  if (MI.isReturn()) {
+    handleReturn(Inst);
+    return;
+  }
+
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unknown SpecialFP instruction!");
+  case TargetOpcode::COPY: {
+    // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
+    const MachineOperand &MO1 = MI.getOperand(1);
+    const MachineOperand &MO0 = MI.getOperand(0);
+    bool KillsSrc = MI.killsRegister(MO1.getReg());
+
+    // FP <- FP copy.
+    unsigned DstFP = getFPReg(MO0);
+    unsigned SrcFP = getFPReg(MO1);
+    assert(isLive(SrcFP) && "Cannot copy dead register");
+    if (KillsSrc) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcFP);
+      Stack[Slot] = DstFP;
+      RegMap[DstFP] = Slot;
+    } else {
+      // For COPY we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcFP, DstFP, Inst);
+    }
+    break;
+  }
+
+  case TargetOpcode::IMPLICIT_DEF: {
+    // All FP registers must be explicitly defined, so load a 0 instead.
+    unsigned Reg = MI.getOperand(0).getReg() - X86::FP0;
+    DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+    BuildMI(*MBB, Inst, MI.getDebugLoc(), TII->get(X86::LD_F0));
+    pushReg(Reg);
+    break;
+  }
+
+  case TargetOpcode::INLINEASM: {
+    // The inline asm MachineInstr currently only *uses* FP registers for the
+    // 'f' constraint.  These should be turned into the current ST(x) register
+    // in the machine instr.
+    //
+    // There are special rules for x87 inline assembly. The compiler must know
+    // exactly how many registers are popped and pushed implicitly by the asm.
+    // Otherwise it is not possible to restore the stack state after the inline
+    // asm.
+    //
+    // There are 3 kinds of input operands:
+    //
+    // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+    //    popped input operand must be in a fixed stack slot, and it is either
+    //    tied to an output operand, or in the clobber list. The MI has ST use
+    //    and def operands for these inputs.
+    //
+    // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+    //    preserved by the inline asm. The fixed stack slots must be STn-STm
+    //    following the popped inputs. A fixed input operand cannot be tied to
+    //    an output or appear in the clobber list. The MI has ST use operands
+    //    and no defs for these inputs.
+    //
+    // 3. Preserved inputs. These inputs use the "f" constraint which is
+    //    represented as an FP register. The inline asm won't change these
+    //    stack slots.
+    //
+    // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+    // registers do not count as output operands. The inline asm changes the
+    // stack as if it popped all the popped inputs and then pushed all the
+    // output operands.
+
+    // Scan the assembly for ST registers used, defined and clobbered. We can
+    // only tell clobbers from defs by looking at the asm descriptor.
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI.getNumOperands();
+         i != e && MI.getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI.getOperand(i).getImm();
+
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI.getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::FP0;
+      if (STReg >= 8)
+        continue;
+
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI.emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = countTrailingOnes(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI.emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI.emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI.emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI.getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill afer the instruction.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI.getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isUse() && Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
+
+    // Now we can rearrange the live registers to match what was requested.
+    unsigned char STUsesArray[8];
+
+    for (unsigned I = 0; I < NumSTUses; ++I)
+      STUsesArray[I] = I;
+
+    shuffleStackTop(STUsesArray, NumSTUses, Inst);
+    DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+    // With the stack layout fixed, rewrite the FP registers.
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI.getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+
+      unsigned FPReg = getFPReg(Op);
+
+      if (FRegIdx.count(i))
+        // Operand with constraint "f".
+        Op.setReg(getSTReg(FPReg));
+      else
+        // Operand with a single register class constraint ("t" or "u").
+        Op.setReg(X86::ST0 + FPReg);
+    }
+
+    // Simulate the inline asm popping its inputs and pushing its outputs.
+    StackTop -= NumSTPopped;
+
+    for (unsigned i = 0; i < NumSTDefs; ++i)
+      pushReg(NumSTDefs - i - 1);
+
+    // If this asm kills any FP registers (is the last use of them) we must
+    // explicitly emit pop instructions for them.  Do this now after the asm has
+    // executed so that the ST(x) numbers are not off (which would happen if we
+    // did this inline with operand rewriting).
+    //
+    // Note: this might be a non-optimal pop sequence.  We might be able to do
+    // better by trying to pop in stack order or something.
+    while (FPKills) {
+      unsigned FPReg = countTrailingZeros(FPKills);
+      if (isLive(FPReg))
+        freeStackSlotAfter(Inst, FPReg);
+      FPKills &= ~(1U << FPReg);
+    }
+
+    // Don't delete the inline asm!
+    return;
+  }
+  }
+
+  Inst = MBB->erase(Inst);  // Remove the pseudo instruction
+
+  // We want to leave I pointing to the previous instruction, but what if we
+  // just erased the first instruction?
+  if (Inst == MBB->begin()) {
+    DEBUG(dbgs() << "Inserting dummy KILL\n");
+    Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
+  } else
+    --Inst;
+}
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  LivePhysRegs LPR(TRI);
+
+  LPR.addLiveOuts(MBB);
+
+  for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+       I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+
+    std::bitset<8> Defs;
+    SmallVector<MachineOperand *, 2> Uses;
+    MachineInstr &MI = *I;
+
+    for (auto &MO : I->operands()) {
+      if (!MO.isReg())
+        continue;
+
+      unsigned Reg = MO.getReg() - X86::FP0;
+
+      if (Reg >= 8)
+        continue;
+
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        if (!LPR.contains(MO.getReg()))
+          MO.setIsDead();
+      } else
+        Uses.push_back(&MO);
+    }
+
+    for (auto *MO : Uses)
+      if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+        MO->setIsKill();
+
+    LPR.stepBackward(MI);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 000000000000..1deefe1231ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,2998 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
+                                   unsigned StackAlignOverride)
+    : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
+                          STI.is64Bit() ? -8 : -4),
+      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+  // Cache a bunch of frame-related predicates for this subtarget.
+  SlotSize = TRI->getSlotSize();
+  Is64Bit = STI.is64Bit();
+  IsLP64 = STI.isTarget64BitLP64();
+  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+  Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+  StackPtr = TRI->getStackRegister();
+}
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo().hasVarSizedObjects() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) ||
+         (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
+         TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function. Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the title - it resolves callframesetup/destroy 
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+  return MF.getFrameInfo().hasStackObjects() ||
+         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          TRI->needsStackRealignment(MF) ||
+          MFI.hasVarSizedObjects() ||
+          MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
+          MFI.hasStackMap() || MFI.hasPatchPoint() ||
+          MFI.hasCopyImplyingStackAdjustment());
+}
+
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::SUB64ri8;
+    return X86::SUB64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::SUB32ri8;
+    return X86::SUB32ri;
+  }
+}
+
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::ADD64ri8;
+    return X86::ADD64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::ADD32ri8;
+    return X86::ADD32ri;
+  }
+}
+
+static unsigned getSUBrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::SUB64rr : X86::SUB32rr;
+}
+
+static unsigned getADDrrOpcode(unsigned isLP64) {
+  return isLP64 ? X86::ADD64rr : X86::ADD32rr;
+}
+
+static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::AND64ri8;
+    return X86::AND64ri32;
+  }
+  if (isInt<8>(Imm))
+    return X86::AND32ri8;
+  return X86::AND32ri;
+}
+
+static unsigned getLEArOpcode(unsigned IsLP64) {
+  return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worry about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator &MBBI,
+                                       const X86RegisterInfo *TRI,
+                                       bool Is64Bit) {
+  const MachineFunction *MF = MBB.getParent();
+  const Function *F = MF->getFunction();
+  if (!F || MF->callsEHReturn())
+    return 0;
+
+  const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
+
+  if (MBBI == MBB.end())
+    return 0;
+
+  switch (MBBI->getOpcode()) {
+  default: return 0;
+  case TargetOpcode::PATCHABLE_RET:
+  case X86::RET:
+  case X86::RETL:
+  case X86::RETQ:
+  case X86::RETIL:
+  case X86::RETIQ:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    SmallSet<uint16_t, 8> Uses;
+    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MBBI->getOperand(i);
+      if (!MO.isReg() || MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        Uses.insert(*AI);
+    }
+
+    for (auto CS : AvailableRegs)
+      if (!Uses.count(CS) && CS != X86::RIP)
+        return CS;
+  }
+  }
+
+  return 0;
+}
+
+static bool isEAXLiveIn(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
+    unsigned Reg = RegMask.PhysReg;
+
+    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
+        Reg == X86::AH || Reg == X86::AL)
+      return true;
+  }
+
+  return false;
+}
+
+/// Check if the flags need to be preserved before the terminators.
+/// This would be the case, if the eflags is live-in of the region
+/// composed by the terminators or live-out of that region, without
+/// being defined by a terminator.
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB.terminators()) {
+    bool BreakNext = false;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != X86::EFLAGS)
+        continue;
+
+      // This terminator needs an eflags that is not defined
+      // by a previous another terminator:
+      // EFLAGS is live-in of the region composed by the terminators.
+      if (!MO.isDef())
+        return true;
+      // This terminator defines the eflags, i.e., we don't need to preserve it.
+      // However, we still need to check this specific terminator does not
+      // read a live-in value.
+      BreakNext = true;
+    }
+    // We found a definition of the eflags, no need to preserve them.
+    if (BreakNext)
+      return false;
+  }
+
+  // None of the terminators use or define the eflags.
+  // Check if they are live-out, that would imply we need to preserve them.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    if (Succ->isLiveIn(X86::EFLAGS))
+      return true;
+
+  return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    int64_t NumBytes, bool InEpilogue) const {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    if (Offset > Chunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add
+      unsigned Reg = 0;
+
+      if (isSub && !isEAXLiveIn(MBB))
+        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+      else
+        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+      if (Reg) {
+        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+          .addImm(Offset);
+        Opc = isSub
+          ? getSUBrrOpcode(Is64Bit)
+          : getADDrrOpcode(Is64Bit);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+          .addReg(StackPtr)
+          .addReg(Reg);
+        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        Offset = 0;
+        continue;
+      }
+    }
+
+    uint64_t ThisVal = std::min(Offset, Chunk);
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        unsigned Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r  : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        else
+          MI->setFlag(MachineInstr::FrameDestroy);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstrBuilder MI = BuildStackAdjustment(
+        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
+    if (isSub)
+      MI.setMIFlag(MachineInstr::FrameSetup);
+    else
+      MI.setMIFlag(MachineInstr::FrameDestroy);
+
+    Offset -= ThisVal;
+  }
+}
+
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, int64_t Offset, bool InEpilogue) const {
+  assert(Offset != 0 && "zero offset stack adjustment requested");
+
+  // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+  // is tricky.
+  bool UseLEA;
+  if (!InEpilogue) {
+    // Check if inserting the prologue at the beginning
+    // of MBB would require to use LEA operations.
+    // We need to use LEA operations if EFLAGS is live in, because
+    // it means an instruction will read it before it gets defined.
+    UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS);
+  } else {
+    // If we can use LEA for SP but we shouldn't, check that none
+    // of the terminators uses the eflags. Otherwise we will insert
+    // a ADD that will redefine the eflags and break the condition.
+    // Alternatively, we could move the ADD, but this may not be possible
+    // and is an optimization anyway.
+    UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent());
+    if (UseLEA && !STI.useLeaForSP())
+      UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB);
+    // If that assert breaks, that means we do not do the right thing
+    // in canUseAsEpilogue.
+    assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) &&
+           "We shouldn't have allowed this insertion point");
+  }
+
+  MachineInstrBuilder MI;
+  if (UseLEA) {
+    MI = addRegOffset(BuildMI(MBB, MBBI, DL,
+                              TII.get(getLEArOpcode(Uses64BitFramePtr)),
+                              StackPtr),
+                      StackPtr, false, Offset);
+  } else {
+    bool IsSub = Offset < 0;
+    uint64_t AbsOffset = IsSub ? -Offset : Offset;
+    unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+                         : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+    MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+             .addReg(StackPtr)
+             .addImm(AbsOffset);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+  }
+  return MI;
+}
+
+int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     bool doMergeWithPrevious) const {
+  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+      (!doMergeWithPrevious && MBBI == MBB.end()))
+    return 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+                                                       : std::next(MBBI);
+  unsigned Opc = PI->getOpcode();
+  int Offset = 0;
+
+  if (!doMergeWithPrevious && NI != MBB.end() &&
+      NI->getOpcode() == TargetOpcode::CFI_INSTRUCTION) {
+    // Don't merge with the next instruction if it has CFI.
+    return Offset;
+  }
+
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      PI->getOperand(0).getReg() == StackPtr){
+    assert(PI->getOperand(1).getReg() == StackPtr);
+    Offset += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+             PI->getOperand(0).getReg() == StackPtr &&
+             PI->getOperand(1).getReg() == StackPtr &&
+             PI->getOperand(2).getImm() == 1 &&
+             PI->getOperand(3).getReg() == X86::NoRegister &&
+             PI->getOperand(5).getReg() == X86::NoRegister) {
+    // For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
+    Offset += PI->getOperand(4).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    assert(PI->getOperand(1).getReg() == StackPtr);
+    Offset -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  }
+
+  return Offset;
+}
+
+void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const DebugLoc &DL,
+                                const MCCFIInstruction &CFIInst) const {
+  MachineFunction &MF = *MBB.getParent();
+  unsigned CFIIndex = MF.addFrameInst(CFIInst);
+  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  if (CSI.empty()) return;
+
+  // Calculate offsets.
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
+    unsigned Reg = I->getReg();
+
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    BuildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+  }
+}
+
+void X86FrameLowering::emitStackProbe(MachineFunction &MF,
+                                      MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      const DebugLoc &DL, bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  if (STI.isTargetWindowsCoreCLR()) {
+    if (InProlog) {
+      emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+    } else {
+      emitStackProbeInline(MF, MBB, MBBI, DL, false);
+    }
+  } else {
+    emitStackProbeCall(MF, MBB, MBBI, DL, InProlog);
+  }
+}
+
+void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &PrologMBB) const {
+  const StringRef ChkStkStubSymbol = "__chkstk_stub";
+  MachineInstr *ChkStkStub = nullptr;
+
+  for (MachineInstr &MI : PrologMBB) {
+    if (MI.isCall() && MI.getOperand(0).isSymbol() &&
+        ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
+      ChkStkStub = &MI;
+      break;
+    }
+  }
+
+  if (ChkStkStub != nullptr) {
+    assert(!ChkStkStub->isBundled() &&
+           "Not expecting bundled instructions here");
+    MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
+    assert(std::prev(MBBI) == ChkStkStub &&
+           "MBBI expected after __chkstk_stub.");
+    DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
+    emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
+    ChkStkStub->eraseFromParent();
+  }
+}
+
+void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
+                                            MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI,
+                                            const DebugLoc &DL,
+                                            bool InProlog) const {
+  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+  assert(STI.is64Bit() && "different expansion needed for 32 bit");
+  assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+  // RAX contains the number of bytes of desired stack adjustment.
+  // The handling here assumes this value has already been updated so as to
+  // maintain stack alignment.
+  //
+  // We need to exit with RSP modified by this amount and execute suitable
+  // page touches to notify the OS that we're growing the stack responsibly.
+  // All stack probing must be done without modifying RSP.
+  //
+  // MBB:
+  //    SizeReg = RAX;
+  //    ZeroReg = 0
+  //    CopyReg = RSP
+  //    Flags, TestReg = CopyReg - SizeReg
+  //    FinalReg = !Flags.Ovf ? TestReg : ZeroReg
+  //    LimitReg = gs magic thread env access
+  //    if FinalReg >= LimitReg goto ContinueMBB
+  // RoundBB:
+  //    RoundReg = page address of FinalReg
+  // LoopMBB:
+  //    LoopReg = PHI(LimitReg,ProbeReg)
+  //    ProbeReg = LoopReg - PageSize
+  //    [ProbeReg] = 0
+  //    if (ProbeReg > RoundReg) goto LoopMBB
+  // ContinueMBB:
+  //    RSP = RSP - RAX
+  //    [rest of original MBB]
+
+  // Set up the new basic blocks
+  MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+  MachineFunction::iterator MBBIter = std::next(MBB.getIterator());
+  MF.insert(MBBIter, RoundMBB);
+  MF.insert(MBBIter, LoopMBB);
+  MF.insert(MBBIter, ContinueMBB);
+
+  // Split MBB and move the tail portion down to ContinueMBB.
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Some useful constants
+  const int64_t ThreadEnvironmentStackLimit = 0x10;
+  const int64_t PageSize = 0x1000;
+  const int64_t PageMask = ~(PageSize - 1);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+  const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ZeroReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 CopyReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 TestReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 FinalReg = InProlog ? (unsigned)X86::RDX
+                                     : MRI.createVirtualRegister(RegClass),
+                 RoundedReg = InProlog ? (unsigned)X86::RDX
+                                       : MRI.createVirtualRegister(RegClass),
+                 LimitReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass),
+                 JoinReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ProbeReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass);
+
+  // SP-relative offsets where we can save RCX and RDX.
+  int64_t RCXShadowSlot = 0;
+  int64_t RDXShadowSlot = 0;
+
+  // If inlining in the prolog, save RCX and RDX.     
+  // Future optimization: don't save or restore if not live in.
+  if (InProlog) {
+    // Compute the offsets. We need to account for things already
+    // pushed onto the stack at this point: return address, frame
+    // pointer (if used), and callee saves.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+    const bool HasFP = hasFP(MF);
+    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+    RDXShadowSlot = RCXShadowSlot + 8;
+    // Emit the saves.
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RCXShadowSlot)
+        .addReg(X86::RCX);
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RDXShadowSlot)
+        .addReg(X86::RDX);
+  } else {
+    // Not in the prolog. Copy RAX to a virtual reg.
+    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+  }
+
+  // Add code to MBB to check for overflow and set the new target stack pointer
+  // to zero if so.
+  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+      .addReg(ZeroReg, RegState::Undef)
+      .addReg(ZeroReg, RegState::Undef);
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+      .addReg(CopyReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+      .addReg(TestReg)
+      .addReg(ZeroReg);
+
+  // FinalReg now holds final stack pointer value, or zero if
+  // allocation would overflow. Compare against the current stack
+  // limit from the thread environment block. Note this limit is the
+  // lowest touched page on the stack, not the point at which the OS
+  // will cause an overflow exception, so this is just an optimization
+  // to avoid unnecessarily touching pages that are below the current
+  // SP but already committed to the stack by the OS.
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg)
+      .addReg(0)
+      .addImm(1)
+      .addReg(0)
+      .addImm(ThreadEnvironmentStackLimit)
+      .addReg(X86::GS);
+  BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg);
+  // Jump if the desired stack pointer is at or above the stack limit.
+  BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB);
+
+  // Add code to roundMBB to round the final stack pointer to a page boundary.
+  BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg)
+      .addReg(FinalReg)
+      .addImm(PageMask);
+  BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB);
+
+  // LimitReg now holds the current stack limit, RoundedReg page-rounded
+  // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page
+  // and probe until we reach RoundedReg.
+  if (!InProlog) {
+    BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg)
+        .addReg(LimitReg)
+        .addMBB(RoundMBB)
+        .addReg(ProbeReg)
+        .addMBB(LoopMBB);
+  }
+
+  addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg,
+               false, -PageSize);
+
+  // Probe by storing a byte onto the stack.
+  BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi))
+      .addReg(ProbeReg)
+      .addImm(1)
+      .addReg(0)
+      .addImm(0)
+      .addReg(0)
+      .addImm(0);
+  BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr))
+      .addReg(RoundedReg)
+      .addReg(ProbeReg);
+  BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB);
+
+  MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI();
+
+  // If in prolog, restore RDX and RCX.
+  if (InProlog) {
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RCX),
+                 X86::RSP, false, RCXShadowSlot);
+    addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm),
+                         X86::RDX),
+                 X86::RSP, false, RDXShadowSlot);
+  }
+
+  // Now that the probing is done, add code to continueMBB to update
+  // the stack pointer for real.
+  BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+      .addReg(X86::RSP)
+      .addReg(SizeReg);
+
+  // Add the control flow edges we need.
+  MBB.addSuccessor(ContinueMBB);
+  MBB.addSuccessor(RoundMBB);
+  RoundMBB->addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ContinueMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+
+  // Mark all the instructions added to the prolog as frame setup.
+  if (InProlog) {
+    for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) {
+      BeforeMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *RoundMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineInstr &MI : *LoopMBB) {
+      MI.setFlag(MachineInstr::FrameSetup);
+    }
+    for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin();
+         CMBBI != ContinueMBBI; ++CMBBI) {
+      CMBBI->setFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  // Possible TODO: physreg liveness for InProlog case.
+}
+
+void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          const DebugLoc &DL,
+                                          bool InProlog) const {
+  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
+
+  unsigned CallOp;
+  if (Is64Bit)
+    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
+  else
+    CallOp = X86::CALLpcrel32;
+
+  const char *Symbol;
+  if (Is64Bit) {
+    if (STI.isTargetCygMing()) {
+      Symbol = "___chkstk_ms";
+    } else {
+      Symbol = "__chkstk";
+    }
+  } else if (STI.isTargetCygMing())
+    Symbol = "_alloca";
+  else
+    Symbol = "_chkstk";
+
+  MachineInstrBuilder CI;
+  MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
+
+  // All current stack probes take AX and SP as input, clobber flags, and
+  // preserve all registers. x86_64 probes leave RSP unmodified.
+  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+    // For the large code model, we have to call through a register. Use R11,
+    // as it is scratch in all supported calling conventions.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
+        .addExternalSymbol(Symbol);
+    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
+  } else {
+    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+  }
+
+  unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
+  unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
+  CI.addReg(AX, RegState::Implicit)
+      .addReg(SP, RegState::Implicit)
+      .addReg(AX, RegState::Define | RegState::Implicit)
+      .addReg(SP, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+
+  if (Is64Bit) {
+    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+    // themselves. It also does not clobber %rax so we can reuse it when
+    // adjusting %rsp.
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+  }
+
+  if (InProlog) {
+    // Apply the frame setup flag to all inserted instrs.
+    for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI)
+      ExpansionMBBI->setFlag(MachineInstr::FrameSetup);
+  }
+}
+
+void X86FrameLowering::emitStackProbeInlineStub(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+
+  assert(InProlog && "ChkStkStub called outside prolog!");
+
+  BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__chkstk_stub");
+}
+
+static unsigned calculateSetFPREG(uint64_t SPAdjust) {
+  // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
+  // and might require smaller successive adjustments.
+  const uint64_t Win64MaxSEHOffset = 128;
+  uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
+  // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode.
+  return SEHFrameOffset & -16;
+}
+
+// If we're forcing a stack realignment we can't rely on just the frame
+// info, we need to know the ABI stack alignment as well in case we
+// have a call out.  Otherwise just make sure we have some alignment - we'll
+// go with the minimum SlotSize.
+uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
+  unsigned StackAlign = getStackAlignment();
+  if (MF.getFunction()->hasFnAttribute("stackrealign")) {
+    if (MFI.hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else if (MaxAlign < SlotSize)
+      MaxAlign = SlotSize;
+  }
+  return MaxAlign;
+}
+
+void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          const DebugLoc &DL, unsigned Reg,
+                                          uint64_t MaxAlign) const {
+  uint64_t Val = -MaxAlign;
+  unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
+  MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
+                         .addReg(Reg)
+                         .addImm(Val)
+                         .setMIFlag(MachineInstr::FrameSetup);
+
+  // The EFLAGS implicit def is dead.
+  MI->getOperand(3).setIsDead();
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+
+/*
+  Here's a gist of what gets emitted:
+
+  ; Establish frame pointer, if needed
+  [if needs FP]
+      push  %rbp
+      .cfi_def_cfa_offset 16
+      .cfi_offset %rbp, -16
+      .seh_pushreg %rpb
+      mov  %rsp, %rbp
+      .cfi_def_cfa_register %rbp
+
+  ; Spill general-purpose registers
+  [for all callee-saved GPRs]
+      pushq %<reg>
+      [if not needs FP]
+         .cfi_def_cfa_offset (offset from RETADDR)
+      .seh_pushreg %<reg>
+
+  ; If the required stack alignment > default stack alignment
+  ; rsp needs to be re-aligned.  This creates a "re-alignment gap"
+  ; of unknown size in the stack frame.
+  [if stack needs re-alignment]
+      and  $MASK, %rsp
+
+  ; Allocate space for locals
+  [if target is Windows and allocated space > 4096 bytes]
+      ; Windows needs special care for allocations larger
+      ; than one page.
+      mov $NNN, %rax
+      call ___chkstk_ms/___chkstk
+      sub  %rax, %rsp
+  [else]
+      sub  $NNN, %rsp
+
+  [if needs FP]
+      .seh_stackalloc (size of XMM spill slots)
+      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
+  [else]
+      .seh_stackalloc NNN
+
+  ; Spill XMMs
+  ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
+  ; they may get spilled on any platform, if the current function
+  ; calls @llvm.eh.unwind.init
+  [if needs FP]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, -MMM(%rbp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
+              ; i.e. the offset relative to (%rbp - SEHFrameOffset)
+  [else]
+      [for all callee-saved XMM registers]
+          movaps  %<xmm reg>, KKK(%rsp)
+      [for all callee-saved XMM registers]
+          .seh_savexmm %<xmm reg>, KKK
+
+  .seh_endprologue
+
+  [if needs base pointer]
+      mov  %rsp, %rbx
+      [if needs to restore base pointer]
+          mov %rsp, -MMM(%rbp)
+
+  ; Emit CFI info
+  [if needs FP]
+      [for all callee-saved registers]
+          .cfi_offset %<reg>, (offset from %rbp)
+  [else]
+       .cfi_def_cfa_offset (offset from RETADDR)
+      [for all callee-saved registers]
+          .cfi_offset %<reg>, (offset from %rsp)
+
+  Notes:
+  - .seh directives are emitted only for Windows 64 ABI
+  - .cfi directives are emitted for all other ABIs
+  - for 32-bit code, substitute %e?? registers for %r??
+*/
+
+void X86FrameLowering::emitPrologue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  assert(&STI == &MF.getSubtarget<X86Subtarget>() &&
+         "MF used frame lowering for wrong subtarget");
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  MachineModuleInfo &MMI = MF.getMMI();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
+  uint64_t StackSize = MFI.getStackSize();    // Number of bytes to allocate.
+  bool IsFunclet = MBB.isEHFuncletEntry();
+  EHPersonality Personality = EHPersonality::Unknown;
+  if (Fn->hasPersonalityFn())
+    Personality = classifyEHPersonality(Fn->getPersonalityFn());
+  bool FnHasClrFunclet =
+      MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
+  bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
+  bool HasFP = hasFP(MF);
+  bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
+  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry();
+  bool NeedsDwarfCFI =
+      !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+  unsigned FramePtr = TRI->getFrameRegister(MF);
+  const unsigned MachineFramePtr =
+      STI.isTarget64BitILP32()
+          ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+  unsigned BasePtr = TRI->getBaseRegister();
+  bool HasWinCFI = false;
+  
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+
+  // Add RETADDR move area to callee saved frame size.
+  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta && IsWin64Prologue)
+    report_fatal_error("Can't handle guaranteed tail call under win64 yet");
+
+  if (TailCallReturnAddrDelta < 0)
+    X86FI->setCalleeSavedFrameSize(
+      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+
+  // The default stack probe size is 4096 if the function has no stackprobesize
+  // attribute.
+  unsigned StackProbeSize = 4096;
+  if (Fn->hasFnAttribute("stack-probe-size"))
+    Fn->getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+
+  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
+  // function, and use up to 128 bytes of stack space, don't have a frame
+  // pointer, calls, or dynamic alloca then we do not need to adjust the
+  // stack pointer (we fit in the Red Zone). We also check that we don't
+  // push and pop from the stack.
+  if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
+      !TRI->needsStackRealignment(MF) &&
+      !MFI.hasVarSizedObjects() &&             // No dynamic alloca.
+      !MFI.adjustsStack() &&                   // No calls.
+      !IsWin64CC &&                            // Win64 has no Red Zone
+      !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
+      !MF.shouldSplitStack()) {                // Regular stack
+    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+    if (HasFP) MinSize += SlotSize;
+    X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
+    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+    MFI.setStackSize(StackSize);
+  }
+
+  // Insert stack pointer adjustment for later moving of return addr.  Only
+  // applies to tail call optimized functions where the callee argument stack
+  // size is bigger than the callers.
+  if (TailCallReturnAddrDelta < 0) {
+    BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta,
+                         /*InEpilogue=*/false)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Mapping for machine moves:
+  //
+  //   DST: VirtualFP AND
+  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
+  //        ELSE                        => DW_CFA_def_cfa
+  //
+  //   SRC: VirtualFP AND
+  //        DST: Register               => DW_CFA_def_cfa_register
+  //
+  //   ELSE
+  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
+  //        REG < 64                    => DW_CFA_offset + Reg
+  //        ELSE                        => DW_CFA_offset_extended
+
+  uint64_t NumBytes = 0;
+  int stackGrowth = -SlotSize;
+
+  // Find the funclet establisher parameter
+  unsigned Establisher = X86::NoRegister;
+  if (IsClrFunclet)
+    Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
+  else if (IsFunclet)
+    Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX;
+
+  if (IsWin64Prologue && IsFunclet && !IsClrFunclet) {
+    // Immediately spill establisher into the home slot.
+    // The runtime cares about this.
+    // MOV64mr %rdx, 16(%rsp)
+    unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16)
+        .addReg(Establisher)
+        .setMIFlag(MachineInstr::FrameSetup);
+    MBB.addLiveIn(Establisher);
+  }
+
+  if (HasFP) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    // If required, include space for extra hidden slot for stashing base pointer.
+    if (X86FI->getRestoreBasePointer())
+      FrameSize += SlotSize;
+
+    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+    // Callee-saved registers are pushed on stack before the stack is realigned.
+    if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+      NumBytes = alignTo(NumBytes, MaxAlign);
+
+    // Get the offset of the stack slot for the EBP register, which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    if (!IsFunclet)
+      MFI.setOffsetAdjustment(-NumBytes);
+    else
+      assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
+             "should calculate same local variable offset for funclets");
+
+    // Save EBP/RBP into the appropriate stack slot.
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(MachineFramePtr, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    if (NeedsDwarfCFI) {
+      // Mark the place where EBP/RBP was saved.
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      BuildCFI(MBB, MBBI, DL,
+               MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+
+      // Change the rule for the FramePtr to be an "offset" rule.
+      unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset(
+                                  nullptr, DwarfFramePtr, 2 * stackGrowth));
+    }
+
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+          .addImm(FramePtr)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    if (!IsWin64Prologue && !IsFunclet) {
+      // Update EBP with the new base value.
+      BuildMI(MBB, MBBI, DL,
+              TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+              FramePtr)
+          .addReg(StackPtr)
+          .setMIFlag(MachineInstr::FrameSetup);
+
+      if (NeedsDwarfCFI) {
+        // Mark effective beginning of when frame pointer becomes valid.
+        // Define the current CFA to use the EBP/RBP register.
+        unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+        BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
+                                    nullptr, DwarfFramePtr));
+      }
+    }
+
+    // Mark the FramePtr as live-in in every block. Don't do this again for
+    // funclet prologues.
+    if (!IsFunclet) {
+      for (MachineBasicBlock &EveryMBB : MF)
+        EveryMBB.addLiveIn(MachineFramePtr);
+    }
+  } else {
+    assert(!IsFunclet && "funclets without FPs not yet implemented");
+    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+  }
+
+  // For EH funclets, only allocate enough space for outgoing calls. Save the
+  // NumBytes value that we would've used for the parent frame.
+  unsigned ParentFrameNumBytes = NumBytes;
+  if (IsFunclet)
+    NumBytes = getWinEHFuncletFrameSize(MF);
+
+  // Skip the callee-saved push instructions.
+  bool PushedRegs = false;
+  int StackOffset = 2 * stackGrowth;
+
+  while (MBBI != MBB.end() &&
+         MBBI->getFlag(MachineInstr::FrameSetup) &&
+         (MBBI->getOpcode() == X86::PUSH32r ||
+          MBBI->getOpcode() == X86::PUSH64r)) {
+    PushedRegs = true;
+    unsigned Reg = MBBI->getOperand(0).getReg();
+    ++MBBI;
+
+    if (!HasFP && NeedsDwarfCFI) {
+      // Mark callee-saved push instruction.
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      BuildCFI(MBB, MBBI, DL,
+               MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+      StackOffset += stackGrowth;
+    }
+
+    if (NeedsWinCFI) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
+          MachineInstr::FrameSetup);
+    }
+  }
+
+  // Realign stack after we pushed callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+  // Don't do this for Win64, it needs to realign the stack after the prologue.
+  if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign);
+  }
+
+  // If there is an SUB32ri of ESP immediately before this instruction, merge
+  // the two. This can be the case when tail call elimination is enabled and
+  // the callee has more arguments then the caller.
+  NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+
+  // Adjust stack pointer: ESP -= numbytes.
+
+  // Windows and cygwin/mingw require a prologue helper routine when allocating
+  // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
+  // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
+  // stack and adjust the stack pointer in one go.  The 64-bit version of
+  // __chkstk is only responsible for probing the stack.  The 64-bit prologue is
+  // responsible for adjusting the stack pointer.  Touching the stack at 4K
+  // increments is necessary to ensure that the guard pages used by the OS
+  // virtual memory manager are allocated in correct sequence.
+  uint64_t AlignedNumBytes = NumBytes;
+  if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
+    AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
+  if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+    // Check whether EAX is livein for this block.
+    bool isEAXAlive = isEAXLiveIn(MBB);
+
+    if (isEAXAlive) {
+      // Sanity check that EAX is not livein for this function.
+      // It should not be, so throw an assert.
+      assert(!Is64Bit && "EAX is livein in x64 case!");
+
+      // Save EAX
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+        .addReg(X86::EAX, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    if (Is64Bit) {
+      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+      // Function prologue is responsible for adjusting the stack pointer.
+      if (isUInt<32>(NumBytes)) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      } else if (isInt<32>(NumBytes)) {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      } else {
+        BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+    } else {
+      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+      // We'll also use 4 already allocated bytes for EAX.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+          .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    // Call __chkstk, __chkstk_ms, or __alloca.
+    emitStackProbe(MF, MBB, MBBI, DL, true);
+
+    if (isEAXAlive) {
+      // Restore EAX
+      MachineInstr *MI =
+          addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX),
+                       StackPtr, false, NumBytes - 4);
+      MI->setFlag(MachineInstr::FrameSetup);
+      MBB.insert(MBBI, MI);
+    }
+  } else if (NumBytes) {
+    emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false);
+  }
+
+  if (NeedsWinCFI && NumBytes) {
+    HasWinCFI = true;
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  int SEHFrameOffset = 0;
+  unsigned SPOrEstablisher;
+  if (IsFunclet) {
+    if (IsClrFunclet) {
+      // The establisher parameter passed to a CLR funclet is actually a pointer
+      // to the (mostly empty) frame of its nearest enclosing funclet; we have
+      // to find the root function establisher frame by loading the PSPSym from
+      // the intermediate frame.
+      unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+      MachinePointerInfo NoInfo;
+      MBB.addLiveIn(Establisher);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
+                   Establisher, false, PSPSlotOffset)
+          .addMemOperand(MF.getMachineMemOperand(
+              NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+      ;
+      // Save the root establisher back into the current funclet's (mostly
+      // empty) frame, in case a sub-funclet or the GC needs it.
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
+                   false, PSPSlotOffset)
+          .addReg(Establisher)
+          .addMemOperand(
+              MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
+                                                  MachineMemOperand::MOVolatile,
+                                      SlotSize, SlotSize));
+    }
+    SPOrEstablisher = Establisher;
+  } else {
+    SPOrEstablisher = StackPtr;
+  }
+
+  if (IsWin64Prologue && HasFP) {
+    // Set RBP to a small fixed offset from RSP. In the funclet case, we base
+    // this calculation on the incoming establisher, which holds the value of
+    // RSP from the parent frame at the end of the prologue.
+    SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes);
+    if (SEHFrameOffset)
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr),
+                   SPOrEstablisher, false, SEHFrameOffset);
+    else
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr)
+          .addReg(SPOrEstablisher);
+
+    // If this is not a funclet, emit the CFI describing our frame pointer.
+    if (NeedsWinCFI && !IsFunclet) {
+      HasWinCFI = true;
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+          .addImm(FramePtr)
+          .addImm(SEHFrameOffset)
+          .setMIFlag(MachineInstr::FrameSetup);
+      if (isAsynchronousEHPersonality(Personality))
+        MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset;
+    }
+  } else if (IsFunclet && STI.is32Bit()) {
+    // Reset EBP / ESI to something good for funclets.
+    MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL);
+    // If we're a catch funclet, we can be returned to via catchret. Save ESP
+    // into the registration node so that the runtime will restore it for us.
+    if (!MBB.isCleanupFuncletEntry()) {
+      assert(Personality == EHPersonality::MSVC_CXX);
+      unsigned FrameReg;
+      int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
+      int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
+      // ESP is the first field, so no extra displacement is needed.
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
+                   false, EHRegOffset)
+          .addReg(X86::ESP);
+    }
+  }
+
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+    const MachineInstr &FrameInstr = *MBBI;
+    ++MBBI;
+
+    if (NeedsWinCFI) {
+      int FI;
+      if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
+        if (X86::FR64RegClass.contains(Reg)) {
+          unsigned IgnoredFrameReg;
+          int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
+          Offset += SEHFrameOffset;
+
+          HasWinCFI = true;
+          BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
+              .addImm(Reg)
+              .addImm(Offset)
+              .setMIFlag(MachineInstr::FrameSetup);
+        }
+      }
+    }
+  }
+
+  if (NeedsWinCFI && HasWinCFI)
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  if (FnHasClrFunclet && !IsFunclet) {
+    // Save the so-called Initial-SP (i.e. the value of the stack pointer
+    // immediately after the prolog)  into the PSPSlot so that funclets
+    // and the GC can recover it.
+    unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF);
+    auto PSPInfo = MachinePointerInfo::getFixedStack(
+        MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx);
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false,
+                 PSPSlotOffset)
+        .addReg(StackPtr)
+        .addMemOperand(MF.getMachineMemOperand(
+            PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+            SlotSize, SlotSize));
+  }
+
+  // Realign stack after we spilled callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+  // Win64 requires aligning the stack after the prologue.
+  if (IsWin64Prologue && TRI->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign);
+  }
+
+  // We already dealt with stack realignment and funclets above.
+  if (IsFunclet && STI.is32Bit())
+    return;
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (TRI->hasBasePointer(MF)) {
+    // Update the base pointer with the current stack pointer.
+    unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+      .addReg(SPOrEstablisher)
+      .setMIFlag(MachineInstr::FrameSetup);
+    if (X86FI->getRestoreBasePointer()) {
+      // Stash value of base pointer.  Saving RSP instead of EBP shortens
+      // dependence chain. Used by SjLj EH.
+      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
+                   FramePtr, true, X86FI->getRestoreBasePointerOffset())
+        .addReg(SPOrEstablisher)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) {
+      // Stash the value of the frame pointer relative to the base pointer for
+      // Win32 EH. This supports Win32 EH, which does the inverse of the above:
+      // it recovers the frame pointer from the base pointer rather than the
+      // other way around.
+      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
+      unsigned UsedReg;
+      int Offset =
+          getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+      assert(UsedReg == BasePtr);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
+          .addReg(FramePtr)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
+    // Mark end of stack pointer adjustment.
+    if (!HasFP && NumBytes) {
+      // Define the current CFA rule to use the provided offset.
+      assert(StackSize);
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
+                                  nullptr, -StackSize + stackGrowth));
+    }
+
+    // Emit DWARF info specifying the offsets of the callee-saved registers.
+    if (PushedRegs)
+      emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+  }
+
+  // X86 Interrupt handling function cannot assume anything about the direction
+  // flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
+  // in each prologue of interrupt handler function.
+  //
+  // FIXME: Create "cld" instruction only in these cases:
+  // 1. The interrupt handling function uses any of the "rep" instructions.
+  // 2. Interrupt handling function calls another function.
+  //
+  if (Fn->getCallingConv() == CallingConv::X86_INTR)
+    BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+  // At this point we know if the function has WinCFI or not.
+  MF.setHasWinCFI(HasWinCFI);
+}
+
+bool X86FrameLowering::canUseLEAForSPInEpilogue(
+    const MachineFunction &MF) const {
+  // We can't use LEA instructions for adjusting the stack pointer if we don't
+  // have a frame pointer in the Win64 ABI.  Only ADD instructions may be used
+  // to deallocate the stack.
+  // This means that we can use LEA for SP in two situations:
+  // 1. We *aren't* using the Win64 ABI which means we are free to use LEA.
+  // 2. We *have* a frame pointer which means we are permitted to use LEA.
+  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF);
+}
+
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::CATCHRET:
+  case X86::CLEANUPRET:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("impossible");
+}
+
+// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the
+// stack. It holds a pointer to the bottom of the root function frame.  The
+// establisher frame pointer passed to a nested funclet may point to the
+// (mostly empty) frame of its parent funclet, but it will need to find
+// the frame of the root function to access locals.  To facilitate this,
+// every funclet copies the pointer to the bottom of the root function
+// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the
+// same offset for the PSPSym in the root function frame that's used in the
+// funclets' frames allows each funclet to dynamically accept any ancestor
+// frame as its establisher argument (the runtime doesn't guarantee the
+// immediate parent for some reason lost to history), and also allows the GC,
+// which uses the PSPSym for some bookkeeping, to find it in any funclet's
+// frame with only a single offset reported for the entire method.
+unsigned
+X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
+  const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
+  unsigned SPReg;
+  int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
+                                              /*IgnoreSPUpdates*/ true);
+  assert(Offset >= 0 && SPReg == TRI->getStackRegister());
+  return static_cast<unsigned>(Offset);
+}
+
+unsigned
+X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+  // This is the size of the pushed CSRs.
+  unsigned CSSize =
+      MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+  // This is the amount of stack a funclet needs to allocate.
+  unsigned UsedSize;
+  EHPersonality Personality =
+      classifyEHPersonality(MF.getFunction()->getPersonalityFn());
+  if (Personality == EHPersonality::CoreCLR) {
+    // CLR funclets need to hold enough space to include the PSPSym, at the
+    // same offset from the stack pointer (immediately after the prolog) as it
+    // resides at in the main function.
+    UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize;
+  } else {
+    // Other funclets just need enough stack for outgoing call arguments.
+    UsedSize = MF.getFrameInfo().getMaxCallFrameSize();
+  }
+  // RBP is not included in the callee saved register block. After pushing RBP,
+  // everything is 16 byte aligned. Everything we allocate before an outgoing
+  // call must also be 16 byte aligned.
+  unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
+  // Subtract out the size of the callee saved registers. This is how much stack
+  // each funclet will allocate.
+  return FrameSizeMinusRBP - CSSize;
+}
+
+static bool isTailCallOpcode(unsigned Opc) {
+    return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+        Opc == X86::TCRETURNmi ||
+        Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNdi64 ||
+        Opc == X86::TCRETURNmi64;
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  Optional<unsigned> RetOpcode;
+  if (MBBI != MBB.end())
+    RetOpcode = MBBI->getOpcode();
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+  const bool Is64BitILP32 = STI.isTarget64BitILP32();
+  unsigned FramePtr = TRI->getFrameRegister(MF);
+  unsigned MachineFramePtr =
+      Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+
+  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool NeedsWinCFI =
+      IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
+  bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
+  MachineBasicBlock *TargetMBB = nullptr;
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI.getStackSize();
+  uint64_t MaxAlign = calculateMaxStackAlign(MF);
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (RetOpcode && *RetOpcode == X86::CATCHRET) {
+    // SEH shouldn't use catchret.
+    assert(!isAsynchronousEHPersonality(
+               classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
+           "SEH should not use CATCHRET");
+
+    NumBytes = getWinEHFuncletFrameSize(MF);
+    assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+    TargetMBB = MBBI->getOperand(0).getMBB();
+
+    // Pop EBP.
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+            MachineFramePtr)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) {
+    NumBytes = getWinEHFuncletFrameSize(MF);
+    assert(hasFP(MF) && "EH funclets without FP not yet implemented");
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+            MachineFramePtr)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  } else if (hasFP(MF)) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    NumBytes = FrameSize - CSSize;
+
+    // Callee-saved registers were pushed on stack before the stack was
+    // realigned.
+    if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
+      NumBytes = alignTo(FrameSize, MaxAlign);
+
+    // Pop EBP.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  } else {
+    NumBytes = StackSize - CSSize;
+  }
+  uint64_t SEHStackAllocAmt = NumBytes;
+
+  MachineBasicBlock::iterator FirstCSPop = MBBI;
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = std::prev(MBBI);
+    unsigned Opc = PI->getOpcode();
+
+    if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
+      if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
+          (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)))
+        break;
+      FirstCSPop = PI;
+    }
+
+    --MBBI;
+  }
+  MBBI = FirstCSPop;
+
+  if (TargetMBB) {
+    // Fill EAX/RAX with the address of the target block.
+    unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX;
+    if (STI.is64Bit()) {
+      // LEA64r TargetMBB(%rip), %rax
+      BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg)
+          .addReg(X86::RIP)
+          .addImm(0)
+          .addReg(0)
+          .addMBB(TargetMBB)
+          .addReg(0);
+    } else {
+      // MOV32ri $TargetMBB, %eax
+      BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg)
+          .addMBB(TargetMBB);
+    }
+    // Record that we've taken the address of TargetMBB and no longer just
+    // reference it in a terminator.
+    TargetMBB->setHasAddressTaken();
+  }
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  // If there is an ADD32ri or SUB32ri of ESP immediately before this
+  // instruction, merge the two instructions.
+  if (NumBytes || MFI.hasVarSizedObjects())
+    NumBytes += mergeSPUpdates(MBB, MBBI, true);
+
+  // If dynamic alloca is used, then reset esp to point to the last callee-saved
+  // slot before popping them off! Same applies for the case, when stack was
+  // realigned. Don't do this if this was a funclet epilogue, since the funclets
+  // will not do realignment or dynamic stack allocation.
+  if ((TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) &&
+      !IsFunclet) {
+    if (TRI->needsStackRealignment(MF))
+      MBBI = FirstCSPop;
+    unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
+    uint64_t LEAAmount =
+        IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize;
+
+    // There are only two legal forms of epilogue:
+    // - add SEHAllocationSize, %rsp
+    // - lea SEHAllocationSize(%FramePtr), %rsp
+    //
+    // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence.
+    // However, we may use this sequence if we have a frame pointer because the
+    // effects of the prologue can safely be undone.
+    if (LEAAmount != 0) {
+      unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                   FramePtr, false, LEAAmount);
+      --MBBI;
+    } else {
+      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+        .addReg(FramePtr);
+      --MBBI;
+    }
+  } else if (NumBytes) {
+    // Adjust stack pointer back: ESP += numbytes.
+    emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true);
+    --MBBI;
+  }
+
+  // Windows unwinder will not invoke function's exception handler if IP is
+  // either in prologue or in epilogue.  This behavior causes a problem when a
+  // call immediately precedes an epilogue, because the return address points
+  // into the epilogue.  To cope with that, we insert an epilogue marker here,
+  // then replace it with a 'nop' if it ends up immediately after a CALL in the
+  // final emitted code.
+  if (NeedsWinCFI && MF.hasWinCFI())
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+
+  if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) {
+    // Add the return addr area delta back since we are not tail calling.
+    int Offset = -1 * X86FI->getTCReturnAddrDelta();
+    assert(Offset >= 0 && "TCDelta should never be positive");
+    if (Offset) {
+      MBBI = MBB.getFirstTerminator();
+
+      // Check for possible merge with preceding ADD instruction.
+      Offset += mergeSPUpdates(MBB, MBBI, true);
+      emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+    }
+  }
+}
+
+// NOTE: this only has a subset of the full frame index logic. In
+// particular, the FI < 0 and AfterFPPop logic is handled in
+// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly
+// (probably?) it should be moved into here.
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                             unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // We can't calculate offset from frame pointer if the stack is realigned,
+  // so enforce usage of stack/base pointer.  The base pointer is used when we
+  // have dynamic allocas in addition to dynamic realignment.
+  if (TRI->hasBasePointer(MF))
+    FrameReg = TRI->getBaseRegister();
+  else if (TRI->needsStackRealignment(MF))
+    FrameReg = TRI->getStackRegister();
+  else
+    FrameReg = TRI->getFrameRegister(MF);
+
+  // Offset will hold the offset from the stack pointer at function entry to the
+  // object.
+  // We need to factor in additional offsets applied during the prologue to the
+  // frame, base, and stack pointer depending on which is used.
+  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t StackSize = MFI.getStackSize();
+  bool HasFP = hasFP(MF);
+  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  int64_t FPDelta = 0;
+
+  if (IsWin64Prologue) {
+    assert(!MFI.hasCalls() || (StackSize % 16) == 8);
+
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    // If required, include space for extra hidden slot for stashing base pointer.
+    if (X86FI->getRestoreBasePointer())
+      FrameSize += SlotSize;
+    uint64_t NumBytes = FrameSize - CSSize;
+
+    uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
+    if (FI && FI == X86FI->getFAIndex())
+      return -SEHFrameOffset;
+
+    // FPDelta is the offset from the "traditional" FP location of the old base
+    // pointer followed by return address and the location required by the
+    // restricted Win64 prologue.
+    // Add FPDelta to all offsets below that go through the frame pointer.
+    FPDelta = FrameSize - SEHFrameOffset;
+    assert((!MFI.hasCalls() || (FPDelta % 16) == 0) &&
+           "FPDelta isn't aligned per the Win64 ABI!");
+  }
+
+
+  if (TRI->hasBasePointer(MF)) {
+    assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + SlotSize + FPDelta;
+    } else {
+      assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+  } else if (TRI->needsStackRealignment(MF)) {
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + SlotSize + FPDelta;
+    } else {
+      assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+    // FIXME: Support tail calls
+  } else {
+    if (!HasFP)
+      return Offset + StackSize;
+
+    // Skip the saved EBP.
+    Offset += SlotSize;
+
+    // Skip the RETADDR move area
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0)
+      Offset -= TailCallReturnAddrDelta;
+  }
+
+  return Offset + FPDelta;
+}
+
+int
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+                                                 int FI, unsigned &FrameReg,
+                                                 bool IgnoreSPUpdates) const {
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  // Does not include any dynamic realign.
+  const uint64_t StackSize = MFI.getStackSize();
+  // LLVM arranges the stack as follows:
+  //   ...
+  //   ARG2
+  //   ARG1
+  //   RETADDR
+  //   PUSH RBP   <-- RBP points here
+  //   PUSH CSRs
+  //   ~~~~~~~    <-- possible stack realignment (non-win64)
+  //   ...
+  //   STACK OBJECTS
+  //   ...        <-- RSP after prologue points here
+  //   ~~~~~~~    <-- possible stack realignment (win64)
+  //
+  // if (hasVarSizedObjects()):
+  //   ...        <-- "base pointer" (ESI/RBX) points here
+  //   DYNAMIC ALLOCAS
+  //   ...        <-- RSP points here
+  //
+  // Case 1: In the simple case of no stack realignment and no dynamic
+  // allocas, both "fixed" stack objects (arguments and CSRs) are addressable
+  // with fixed offsets from RSP.
+  //
+  // Case 2: In the case of stack realignment with no dynamic allocas, fixed
+  // stack objects are addressed with RBP and regular stack objects with RSP.
+  //
+  // Case 3: In the case of dynamic allocas and stack realignment, RSP is used
+  // to address stack arguments for outgoing calls and nothing else. The "base
+  // pointer" points to local variables, and RBP points to fixed objects.
+  //
+  // In cases 2 and 3, we can only answer for non-fixed stack objects, and the
+  // answer we give is relative to the SP after the prologue, and not the
+  // SP in the middle of the function.
+
+  if (MFI.isFixedObjectIndex(FI) && TRI->needsStackRealignment(MF) &&
+      !STI.isTargetWin64())
+    return getFrameIndexReference(MF, FI, FrameReg);
+
+  // If !hasReservedCallFrame the function might have SP adjustement in the
+  // body.  So, even though the offset is statically known, it depends on where
+  // we are in the function.
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (!IgnoreSPUpdates && !TFI->hasReservedCallFrame(MF))
+    return getFrameIndexReference(MF, FI, FrameReg);
+
+  // We don't handle tail calls, and shouldn't be seeing them either.
+  assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
+         "we don't handle this case!");
+
+  // Fill in FrameReg output argument.
+  FrameReg = TRI->getStackRegister();
+
+  // This is how the math works out:
+  //
+  //  %rsp grows (i.e. gets lower) left to right. Each box below is
+  //  one word (eight bytes).  Obj0 is the stack slot we're trying to
+  //  get to.
+  //
+  //    ----------------------------------
+  //    | BP | Obj0 | Obj1 | ... | ObjN |
+  //    ----------------------------------
+  //    ^    ^      ^                   ^
+  //    A    B      C                   E
+  //
+  // A is the incoming stack pointer.
+  // (B - A) is the local area offset (-8 for x86-64) [1]
+  // (C - A) is the Offset returned by MFI.getObjectOffset for Obj0 [2]
+  //
+  // |(E - B)| is the StackSize (absolute value, positive).  For a
+  // stack that grown down, this works out to be (B - E). [3]
+  //
+  // E is also the value of %rsp after stack has been set up, and we
+  // want (C - E) -- the value we can add to %rsp to get to Obj0.  Now
+  // (C - E) == (C - A) - (B - A) + (B - E)
+  //            { Using [1], [2] and [3] above }
+  //         == getObjectOffset - LocalAreaOffset + StackSize
+  //
+
+  // Get the Offset from the StackPointer
+  int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
+
+  return Offset + StackSize;
+}
+
+bool X86FrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+  unsigned CalleeSavedFrameSize = 0;
+  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
+
+  if (hasFP(MF)) {
+    // emitPrologue always spills frame register the first thing.
+    SpillSlotOffset -= SlotSize;
+    MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+
+    // Since emitPrologue and emitEpilogue will handle spilling and restoring of
+    // the frame register, we can delete it from CSI list and not have to worry
+    // about avoiding it later.
+    unsigned FPReg = TRI->getFrameRegister(MF);
+    for (unsigned i = 0; i < CSI.size(); ++i) {
+      if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
+        CSI.erase(CSI.begin() + i);
+        break;
+      }
+    }
+  }
+
+  // Assign slots for GPRs. It increases frame size.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    SpillSlotOffset -= SlotSize;
+    CalleeSavedFrameSize += SlotSize;
+
+    int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+    CSI[i - 1].setFrameIdx(SlotIndex);
+  }
+
+  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
+
+  // Assign slots for XMMs.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    // ensure alignment
+    SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
+    // spill into slot
+    SpillSlotOffset -= RC->getSize();
+    int SlotIndex =
+        MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+    CSI[i - 1].setFrameIdx(SlotIndex);
+    MFI.ensureMaxAlignment(RC->getAlignment());
+  }
+
+  return true;
+}
+
+bool X86FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
+  // for us, and there are no XMM CSRs on Win32.
+  if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows())
+    return true;
+
+  // Push GPRs. It increases frame size.
+  const MachineFunction &MF = *MBB.getParent();
+  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i - 1].getReg();
+
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    bool isLiveIn = MRI.isLiveIn(Reg);
+    if (!isLiveIn)
+      MBB.addLiveIn(Reg);
+
+    // Decide whether we can add a kill flag to the use.
+    bool CanKill = !isLiveIn;
+    // Check if any subregister is live-in
+    if (CanKill) {
+      for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
+        if (MRI.isLiveIn(*AReg)) {
+          CanKill = false;
+          break;
+        }
+      }
+    }
+
+    // Do not set a kill flag on values that are also marked as live-in. This
+    // happens with the @llvm-returnaddress intrinsic and with arguments
+    // passed in callee saved registers.
+    // Omitting the kill flags is conservatively correct even if the live-in
+    // is not used after all.
+    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, getKillRegState(CanKill))
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
+  // It can be done by spilling XMMs to stack frame.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
+                            TRI);
+    --MI;
+    MI->setFlag(MachineInstr::FrameSetup);
+    ++MI;
+  }
+
+  return true;
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  if (MI != MBB.end() && isFuncletReturnInstr(*MI) && STI.isOSWindows()) {
+    // Don't restore CSRs in 32-bit EH funclets. Matches
+    // spillCalleeSavedRegisters.
+    if (STI.is32Bit())
+      return true;
+    // Don't restore CSRs before an SEH catchret. SEH except blocks do not form
+    // funclets. emitEpilogue transforms these to normal jumps.
+    if (MI->getOpcode() == X86::CATCHRET) {
+      const Function *Func = MBB.getParent()->getFunction();
+      bool IsSEH = isAsynchronousEHPersonality(
+          classifyEHPersonality(Func->getPersonalityFn()));
+      if (IsSEH)
+        return true;
+    }
+  }
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  // Reload XMMs from stack frame.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  // POP GPRs.
+  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  }
+  return true;
+}
+
+void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                            BitVector &SavedRegs,
+                                            RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+  if (TailCallReturnAddrDelta < 0) {
+    // create RETURNADDR area
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    MFI.CreateFixedObject(-TailCallReturnAddrDelta,
+                           TailCallReturnAddrDelta - SlotSize, true);
+  }
+
+  // Spill the BasePtr if it's used.
+  if (TRI->hasBasePointer(MF)) {
+    SavedRegs.set(TRI->getBaseRegister());
+
+    // Allocate a spill slot for EBP if we have a base pointer and EH funclets.
+    if (MF.hasEHFunclets()) {
+      int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
+      X86FI->setHasSEHFramePtrSave(true);
+      X86FI->setSEHFramePtrSaveIndex(FI);
+    }
+  }
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+  const Function *F = MF->getFunction();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; I++) {
+    if (I->hasNestAttr())
+      return true;
+  }
+  return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
+  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+  // Erlang stuff.
+  if (CallingConvention == CallingConv::HiPE) {
+    if (Is64Bit)
+      return Primary ? X86::R14 : X86::R13;
+    else
+      return Primary ? X86::EBX : X86::EDI;
+  }
+
+  if (Is64Bit) {
+    if (IsLP64)
+      return Primary ? X86::R11 : X86::R12;
+    else
+      return Primary ? X86::R11D : X86::R12D;
+  }
+
+  bool IsNested = HasNestArgument(&MF);
+
+  if (CallingConvention == CallingConv::X86_FastCall ||
+      CallingConvention == CallingConv::Fast) {
+    if (IsNested)
+      report_fatal_error("Segmented stacks does not support fastcall with "
+                         "nested function.");
+    return Primary ? X86::EAX : X86::ECX;
+  }
+  if (IsNested)
+    return Primary ? X86::EDX : X86::EAX;
+  return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void X86FrameLowering::adjustForSegmentedStacks(
+    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  uint64_t StackSize;
+  unsigned TlsReg, TlsOffset;
+  DebugLoc DL;
+
+  // To support shrink-wrapping we would need to insert the new blocks
+  // at the right place and update the branches to PrologueMBB.
+  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+         "Scratch register is live-in");
+
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
+      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
+      !STI.isTargetDragonFly())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  // Eventually StackSize will be calculated by a link-time pass; which will
+  // also decide whether checking code needs to be injected into this particular
+  // prologue.
+  StackSize = MFI.getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
+  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  bool IsNested = false;
+
+  // We need to know if the function has a nest argument only in 64 bit mode.
+  if (Is64Bit)
+    IsNested = HasNestArgument(&MF);
+
+  // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+  // allocMBB needs to be last (terminating) instruction.
+
+  for (const auto &LI : PrologueMBB.liveins()) {
+    allocMBB->addLiveIn(LI);
+    checkMBB->addLiveIn(LI);
+  }
+
+  if (IsNested)
+    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
+
+  MF.push_front(allocMBB);
+  MF.push_front(checkMBB);
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+  // Read the limit off the current stacklet off the stack_guard location.
+  if (Is64Bit) {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::FS;
+      TlsOffset = IsLP64 ? 0x70 : 0x40;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+    } else if (STI.isTargetWin64()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x28; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetFreeBSD()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x18;
+    } else if (STI.isTargetDragonFly()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
+    else
+      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
+      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+  } else {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x30;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x48 + 90*4;
+    } else if (STI.isTargetWin32()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x14; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetDragonFly()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
+    } else if (STI.isTargetFreeBSD()) {
+      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = X86::ESP;
+    else
+      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
+        STI.isTargetDragonFly()) {
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    } else if (STI.isTargetDarwin()) {
+
+      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
+      unsigned ScratchReg2;
+      bool SaveScratch2;
+      if (CompareStackPointer) {
+        // The primary scratch register is available for holding the TLS offset.
+        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+        SaveScratch2 = false;
+      } else {
+        // Need to use a second register to hold the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
+
+        // Unfortunately, with fastcc the second scratch register may hold an
+        // argument.
+        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+      }
+
+      // If Scratch2 is live-in then it needs to be saved.
+      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+             "Scratch register is live-in and not saved");
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+          .addReg(ScratchReg2, RegState::Kill);
+
+      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+        .addImm(TlsOffset);
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+        .addReg(ScratchReg)
+        .addReg(ScratchReg2).addImm(1).addReg(0)
+        .addImm(0)
+        .addReg(TlsReg);
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+    }
+  }
+
+  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+  // It jumps to normal execution of the function body.
+  BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB);
+
+  // On 32 bit we first push the arguments size and then the frame size. On 64
+  // bit, we pass the stack frame size in r10 and the argument size in r11.
+  if (Is64Bit) {
+    // Functions with nested arguments use R10, so it needs to be saved across
+    // the call to _morestack
+
+    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
+    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
+    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
+    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
+    const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
+
+    if (IsNested)
+      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
+
+    BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
+      .addImm(StackSize);
+    BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
+      .addImm(X86FI->getArgumentStackSize());
+  } else {
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(X86FI->getArgumentStackSize());
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(StackSize);
+  }
+
+  // __morestack is in libgcc
+  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
+    // Under the large code model, we cannot assume that __morestack lives
+    // within 2^31 bytes of the call site, so we cannot use pc-relative
+    // addressing. We cannot perform the call via a temporary register,
+    // as the rax register may be used to store the static chain, and all
+    // other suitable registers may be either callee-save or used for
+    // parameter passing. We cannot use the stack at this point either
+    // because __morestack manipulates the stack directly.
+    //
+    // To avoid these issues, perform an indirect call via a read-only memory
+    // location containing the address.
+    //
+    // This solution is not perfect, as it assumes that the .rodata section
+    // is laid out within 2^31 bytes of each function body, but this seems
+    // to be sufficient for JIT.
+    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
+        .addReg(X86::RIP)
+        .addImm(0)
+        .addReg(0)
+        .addExternalSymbol("__morestack_addr")
+        .addReg(0);
+    MF.getMMI().setUsesMorestackAddr(true);
+  } else {
+    if (Is64Bit)
+      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+        .addExternalSymbol("__morestack");
+    else
+      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+        .addExternalSymbol("__morestack");
+  }
+
+  if (IsNested)
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+  allocMBB->addSuccessor(&PrologueMBB);
+
+  checkMBB->addSuccessor(allocMBB);
+  checkMBB->addSuccessor(&PrologueMBB);
+
+#ifdef EXPENSIVE_CHECKS
+  MF.verify();
+#endif
+}
+
+/// Lookup an ERTS parameter in the !hipe.literals named metadata node.
+/// HiPE provides Erlang Runtime System-internal parameters, such as PCB offsets
+/// to fields it needs, through a named metadata node "hipe.literals" containing
+/// name-value pairs.
+static unsigned getHiPELiteral(
+    NamedMDNode *HiPELiteralsMD, const StringRef LiteralName) {
+  for (int i = 0, e = HiPELiteralsMD->getNumOperands(); i != e; ++i) {
+    MDNode *Node = HiPELiteralsMD->getOperand(i);
+    if (Node->getNumOperands() != 2) continue;
+    MDString *NodeName = dyn_cast<MDString>(Node->getOperand(0));
+    ValueAsMetadata *NodeVal = dyn_cast<ValueAsMetadata>(Node->getOperand(1));
+    if (!NodeName || !NodeVal) continue;
+    ConstantInt *ValConst = dyn_cast_or_null<ConstantInt>(NodeVal->getValue());
+    if (ValConst && NodeName->getString() == LiteralName) {
+      return ValConst->getZExtValue();
+    }
+  }
+
+  report_fatal_error("HiPE literal " + LiteralName
+                     + " required but not provided");
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+///       ...
+/// IncStack:
+///       call inc_stack   # doubles the stack space
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(
+    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  DebugLoc DL;
+
+  // To support shrink-wrapping we would need to insert the new blocks
+  // at the right place and update the branches to PrologueMBB.
+  assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
+
+  // HiPE-specific values
+  NamedMDNode *HiPELiteralsMD = MF.getMMI().getModule()
+    ->getNamedMetadata("hipe.literals");
+  if (!HiPELiteralsMD)
+    report_fatal_error(
+        "Can't generate HiPE prologue without runtime parameters");
+  const unsigned HipeLeafWords
+    = getHiPELiteral(HiPELiteralsMD,
+                     Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
+  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+  const unsigned Guaranteed = HipeLeafWords * SlotSize;
+  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+                            MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+  unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+  assert(STI.isTargetLinux() &&
+         "HiPE prologue is only supported on Linux operating systems.");
+
+  // Compute the largest caller's frame that is needed to fit the callees'
+  // frames. This 'MaxStack' is computed from:
+  //
+  // a) the fixed frame size, which is the space needed for all spilled temps,
+  // b) outgoing on-stack parameter areas, and
+  // c) the minimum stack space this function needs to make available for the
+  //    functions it calls (a tunable ABI property).
+  if (MFI.hasCalls()) {
+    unsigned MoreStackForCalls = 0;
+
+    for (auto &MBB : MF) {
+      for (auto &MI : MBB) {
+        if (!MI.isCall())
+          continue;
+
+        // Get callee operand.
+        const MachineOperand &MO = MI.getOperand(0);
+
+        // Only take account of global function calls (no closures etc.).
+        if (!MO.isGlobal())
+          continue;
+
+        const Function *F = dyn_cast<Function>(MO.getGlobal());
+        if (!F)
+          continue;
+
+        // Do not update 'MaxStack' for primitive and built-in functions
+        // (encoded with names either starting with "erlang."/"bif_" or not
+        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+        // "_", such as the BIF "suspend_0") as they are executed on another
+        // stack.
+        if (F->getName().find("erlang.") != StringRef::npos ||
+            F->getName().find("bif_") != StringRef::npos ||
+            F->getName().find_first_of("._") == StringRef::npos)
+          continue;
+
+        unsigned CalleeStkArity =
+          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+        if (HipeLeafWords - 1 > CalleeStkArity)
+          MoreStackForCalls = std::max(MoreStackForCalls,
+                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+      }
+    }
+    MaxStack += MoreStackForCalls;
+  }
+
+  // If the stack frame needed is larger than the guaranteed then runtime checks
+  // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
+  if (MaxStack > Guaranteed) {
+    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+    for (const auto &LI : PrologueMBB.liveins()) {
+      stackCheckMBB->addLiveIn(LI);
+      incStackMBB->addLiveIn(LI);
+    }
+
+    MF.push_front(incStackMBB);
+    MF.push_front(stackCheckMBB);
+
+    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+    unsigned LEAop, CMPop, CALLop;
+    SPLimitOffset = getHiPELiteral(HiPELiteralsMD, "P_NSP_LIMIT");
+    if (Is64Bit) {
+      SPReg = X86::RSP;
+      PReg  = X86::RBP;
+      LEAop = X86::LEA64r;
+      CMPop = X86::CMP64rm;
+      CALLop = X86::CALL64pcrel32;
+    } else {
+      SPReg = X86::ESP;
+      PReg  = X86::EBP;
+      LEAop = X86::LEA32r;
+      CMPop = X86::CMP32rm;
+      CALLop = X86::CALLpcrel32;
+    }
+
+    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
+    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+           "HiPE prologue scratch register is live-in");
+
+    // Create new MBB for StackCheck:
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed by BP).
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB);
+
+    // Create new MBB for IncStack:
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).
+      addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
+
+    stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100});
+    stackCheckMBB->addSuccessor(incStackMBB, {1, 100});
+    incStackMBB->addSuccessor(&PrologueMBB, {99, 100});
+    incStackMBB->addSuccessor(incStackMBB, {1, 100});
+  }
+#ifdef EXPENSIVE_CHECKS
+  MF.verify();
+#endif
+}
+
+bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           const DebugLoc &DL,
+                                           int Offset) const {
+
+  if (Offset <= 0)
+    return false;
+
+  if (Offset % SlotSize)
+    return false;
+
+  int NumPops = Offset / SlotSize;
+  // This is only worth it if we have at most 2 pops.
+  if (NumPops != 1 && NumPops != 2)
+    return false;
+
+  // Handle only the trivial case where the adjustment directly follows
+  // a call. This is the most common one, anyway.
+  if (MBBI == MBB.begin())
+    return false;
+  MachineBasicBlock::iterator Prev = std::prev(MBBI);
+  if (!Prev->isCall() || !Prev->getOperand(1).isRegMask())
+    return false;
+
+  unsigned Regs[2];
+  unsigned FoundRegs = 0;
+
+  auto RegMask = Prev->getOperand(1);
+
+  auto &RegClass =
+      Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
+  // Try to find up to NumPops free registers.
+  for (auto Candidate : RegClass) {
+
+    // Poor man's liveness:
+    // Since we're immediately after a call, any register that is clobbered
+    // by the call and not defined by it can be considered dead.
+    if (!RegMask.clobbersPhysReg(Candidate))
+      continue;
+
+    bool IsDef = false;
+    for (const MachineOperand &MO : Prev->implicit_operands()) {
+      if (MO.isReg() && MO.isDef() &&
+          TRI->isSuperOrSubRegisterEq(MO.getReg(), Candidate)) {
+        IsDef = true;
+        break;
+      }
+    }
+
+    if (IsDef)
+      continue;
+
+    Regs[FoundRegs++] = Candidate;
+    if (FoundRegs == (unsigned)NumPops)
+      break;
+  }
+
+  if (FoundRegs == 0)
+    return false;
+
+  // If we found only one free register, but need two, reuse the same one twice.
+  while (FoundRegs < (unsigned)NumPops)
+    Regs[FoundRegs++] = Regs[0];
+
+  for (int i = 0; i < NumPops; ++i)
+    BuildMI(MBB, MBBI, DL, 
+            TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]);
+
+  return true;
+}
+
+MachineBasicBlock::iterator X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  bool reserveCallFrame = hasReservedCallFrame(MF);
+  unsigned Opcode = I->getOpcode();
+  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+  DebugLoc DL = I->getDebugLoc();
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+  I = MBB.erase(I);
+
+  if (!reserveCallFrame) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackdown instruction into 'add ESP, <amt>'
+
+    // We need to keep the stack aligned properly.  To do this, we round the
+    // amount of space needed for the outgoing arguments up to the next
+    // alignment boundary.
+    unsigned StackAlign = getStackAlignment();
+    Amount = alignTo(Amount, StackAlign);
+
+    MachineModuleInfo &MMI = MF.getMMI();
+    const Function *Fn = MF.getFunction();
+    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+    bool DwarfCFI = !WindowsCFI && 
+                    (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+
+    // If we have any exception handlers in this function, and we adjust
+    // the SP before calls, we may need to indicate this to the unwinder
+    // using GNU_ARGS_SIZE. Note that this may be necessary even when
+    // Amount == 0, because the preceding function may have set a non-0
+    // GNU_ARGS_SIZE.
+    // TODO: We don't need to reset this between subsequent functions,
+    // if it didn't change.
+    bool HasDwarfEHHandlers = !WindowsCFI && !MF.getLandingPads().empty();
+
+    if (HasDwarfEHHandlers && !isDestroy &&
+        MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
+      BuildCFI(MBB, I, DL,
+               MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
+
+    if (Amount == 0)
+      return I;
+
+    // Factor out the amount that gets handled inside the sequence
+    // (Pushes of argument for frame setup, callee pops for frame destroy)
+    Amount -= InternalAmt;
+
+    // TODO: This is needed only if we require precise CFA.
+    // If this is a callee-pop calling convention, emit a CFA adjust for
+    // the amount the callee popped.
+    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
+      BuildCFI(MBB, I, DL, 
+               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
+    // Add Amount to SP to destroy a frame, or subtract to setup.
+    int64_t StackAdjustment = isDestroy ? Amount : -Amount;
+    int64_t CfaAdjustment = -StackAdjustment;
+
+    if (StackAdjustment) {
+      // Merge with any previous or following adjustment instruction. Note: the
+      // instructions merged with here do not have CFI, so their stack
+      // adjustments do not feed into CfaAdjustment.
+      StackAdjustment += mergeSPUpdates(MBB, I, true);
+      StackAdjustment += mergeSPUpdates(MBB, I, false);
+
+      if (StackAdjustment) {
+        if (!(Fn->optForMinSize() &&
+              adjustStackWithPops(MBB, I, DL, StackAdjustment)))
+          BuildStackAdjustment(MBB, I, DL, StackAdjustment,
+                               /*InEpilogue=*/false);
+      }
+    }
+
+    if (DwarfCFI && !hasFP(MF)) {
+      // If we don't have FP, but need to generate unwind information,
+      // we need to set the correct CFA offset after the stack adjustment.
+      // How much we adjust the CFA offset depends on whether we're emitting
+      // CFI only for EH purposes or for debugging. EH only requires the CFA
+      // offset to be correct at each call site, while for debugging we want
+      // it to be more precise.
+
+      // TODO: When not using precise CFA, we also need to adjust for the
+      // InternalAmt here.
+      if (CfaAdjustment) {
+        BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
+                                 nullptr, CfaAdjustment));
+      }
+    }
+
+    return I;
+  }
+
+  if (isDestroy && InternalAmt) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    // We are not tracking the stack pointer adjustment by the callee, so make
+    // sure we restore the stack pointer immediately after the call, there may
+    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+    MachineBasicBlock::iterator CI = I;
+    MachineBasicBlock::iterator B = MBB.begin();
+    while (CI != B && !std::prev(CI)->isCall())
+      --CI;
+    BuildStackAdjustment(MBB, CI, DL, -InternalAmt, /*InEpilogue=*/false);
+  }
+
+  return I;
+}
+
+bool X86FrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+  assert(MBB.getParent() && "Block is not attached to a function!");
+  const MachineFunction &MF = *MBB.getParent();
+  return !TRI->needsStackRealignment(MF) || !MBB.isLiveIn(X86::EFLAGS);
+}
+
+bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+  assert(MBB.getParent() && "Block is not attached to a function!");
+
+  // Win64 has strict requirements in terms of epilogue and we are
+  // not taking a chance at messing with them.
+  // I.e., unless this block is already an exit block, we can't use
+  // it as an epilogue.
+  if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock())
+    return false;
+
+  if (canUseLEAForSPInEpilogue(*MBB.getParent()))
+    return true;
+
+  // If we cannot use LEA to adjust SP, we may need to use ADD, which
+  // clobbers the EFLAGS. Check that we do not need to preserve it,
+  // otherwise, conservatively assume this is not
+  // safe to insert the epilogue here.
+  return !flagsNeedToBePreservedBeforeTheTerminators(MBB);
+}
+
+bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+  // If we may need to emit frameless compact unwind information, give
+  // up as this is currently broken: PR25614.
+  return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
+         // The lowering of segmented stack and HiPE only support entry blocks
+         // as prologue blocks: PR26107.
+         // This limitation may be lifted if we fix:
+         // - adjustForSegmentedStacks
+         // - adjustForHiPEPrologue
+         MF.getFunction()->getCallingConv() != CallingConv::HiPE &&
+         !MF.shouldSplitStack();
+}
+
+MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, bool RestoreSP) const {
+  assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env");
+  assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32");
+  assert(STI.is32Bit() && !Uses64BitFramePtr &&
+         "restoring EBP/ESI on non-32-bit target");
+
+  MachineFunction &MF = *MBB.getParent();
+  unsigned FramePtr = TRI->getFrameRegister(MF);
+  unsigned BasePtr = TRI->getBaseRegister();
+  WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // FIXME: Don't set FrameSetup flag in catchret case.
+
+  int FI = FuncInfo.EHRegNodeFrameIndex;
+  int EHRegSize = MFI.getObjectSize(FI);
+
+  if (RestoreSP) {
+    // MOV32rm -EHRegSize(%ebp), %esp
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP),
+                 X86::EBP, true, -EHRegSize)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  unsigned UsedReg;
+  int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
+  int EndOffset = -EHRegOffset - EHRegSize;
+  FuncInfo.EHRegNodeEndOffset = EndOffset;
+
+  if (UsedReg == FramePtr) {
+    // ADD $offset, %ebp
+    unsigned ADDri = getADDriOpcode(false, EndOffset);
+    BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr)
+        .addReg(FramePtr)
+        .addImm(EndOffset)
+        .setMIFlag(MachineInstr::FrameSetup)
+        ->getOperand(3)
+        .setIsDead();
+    assert(EndOffset >= 0 &&
+           "end of registration object above normal EBP position!");
+  } else if (UsedReg == BasePtr) {
+    // LEA offset(%ebp), %esi
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr),
+                 FramePtr, false, EndOffset)
+        .setMIFlag(MachineInstr::FrameSetup);
+    // MOV32rm SavedEBPOffset(%esi), %ebp
+    assert(X86FI->getHasSEHFramePtrSave());
+    int Offset =
+        getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
+    assert(UsedReg == BasePtr);
+    addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
+                 UsedReg, true, Offset)
+        .setMIFlag(MachineInstr::FrameSetup);
+  } else {
+    llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr");
+  }
+  return MBBI;
+}
+
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct X86FrameSortingObject {
+  bool IsValid = false;         // true if we care about this Object.
+  unsigned ObjectIndex = 0;     // Index of Object into MFI list.
+  unsigned ObjectSize = 0;      // Size of Object in bytes.
+  unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+  unsigned ObjectNumUses = 0;   // Object static number of uses.
+};
+
+// The comparison function we use for std::sort to order our local
+// stack symbols. The current algorithm is to use an estimated
+// "density". This takes into consideration the size and number of
+// uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example : 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list.
+struct X86FrameSortingComparator {
+  inline bool operator()(const X86FrameSortingObject &A,
+                         const X86FrameSortingObject &B) {
+    uint64_t DensityAScaled, DensityBScaled;
+
+    // For consistency in our comparison, all invalid objects are placed
+    // at the end. This also allows us to stop walking when we hit the
+    // first invalid item after it's all sorted.
+    if (!A.IsValid)
+      return false;
+    if (!B.IsValid)
+      return true;
+
+    // The density is calculated by doing :
+    //     (double)DensityA = A.ObjectNumUses / A.ObjectSize
+    //     (double)DensityB = B.ObjectNumUses / B.ObjectSize
+    // Since this approach may cause inconsistencies in
+    // the floating point <, >, == comparisons, depending on the floating
+    // point model with which the compiler was built, we're going
+    // to scale both sides by multiplying with
+    // A.ObjectSize * B.ObjectSize. This ends up factoring away
+    // the division and, with it, the need for any floating point
+    // arithmetic.
+    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+      static_cast<uint64_t>(B.ObjectSize);
+    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+      static_cast<uint64_t>(A.ObjectSize);
+
+    // If the two densities are equal, prioritize highest alignment
+    // objects. This allows for similar alignment objects
+    // to be packed together (given the same density).
+    // There's room for improvement here, also, since we can pack
+    // similar alignment (different density) objects next to each
+    // other to save padding. This will also require further
+    // complexity/iterations, and the overall gain isn't worth it,
+    // in general. Something to keep in mind, though.
+    if (DensityAScaled == DensityBScaled)
+      return A.ObjectAlignment < B.ObjectAlignment;
+    
+    return DensityAScaled < DensityBScaled;
+  }
+};
+} // namespace
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+void X86FrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Don't waste time if there's nothing to do.
+  if (ObjectsToAllocate.empty())
+    return;
+
+  // Create an array of all MFI objects. We won't need all of these
+  // objects, but we're going to create a full array of them to make
+  // it easier to index into when we're counting "uses" down below.
+  // We want to be able to easily/cheaply access an object by simply
+  // indexing into it, instead of having to search for it every time.
+  std::vector<X86FrameSortingObject> SortingObjects(MFI.getObjectIndexEnd());
+
+  // Walk the objects we care about and mark them as such in our working
+  // struct.
+  for (auto &Obj : ObjectsToAllocate) {
+    SortingObjects[Obj].IsValid = true;
+    SortingObjects[Obj].ObjectIndex = Obj;
+    SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
+    // Set the size.
+    int ObjectSize = MFI.getObjectSize(Obj);
+    if (ObjectSize == 0)
+      // Variable size. Just use 4.
+      SortingObjects[Obj].ObjectSize = 4;
+    else      
+      SortingObjects[Obj].ObjectSize = ObjectSize;
+  }
+
+  // Count the number of uses for each object.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
+      for (const MachineOperand &MO : MI.operands()) {
+        // Check to see if it's a local stack symbol.
+        if (!MO.isFI())
+          continue;
+        int Index = MO.getIndex();
+        // Check to see if it falls within our range, and is tagged
+        // to require ordering.
+        if (Index >= 0 && Index < MFI.getObjectIndexEnd() &&
+            SortingObjects[Index].IsValid)
+          SortingObjects[Index].ObjectNumUses++;
+      }
+    }
+  }
+
+  // Sort the objects using X86FrameSortingAlgorithm (see its comment for
+  // info).
+  std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
+                   X86FrameSortingComparator());
+
+  // Now modify the original list to represent the final order that
+  // we want. The order will depend on whether we're going to access them
+  // from the stack pointer or the frame pointer. For SP, the list should
+  // end up with the END containing objects that we want with smaller offsets.
+  // For FP, it should be flipped.
+  int i = 0;
+  for (auto &Obj : SortingObjects) {
+    // All invalid items are sorted at the end, so it's safe to stop.
+    if (!Obj.IsValid)
+      break;
+    ObjectsToAllocate[i++] = Obj.ObjectIndex;
+  }
+
+  // Flip it if we're accessing off of the FP.
+  if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
+
+
+unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
+  // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
+  unsigned Offset = 16;
+  // RBP is immediately pushed.
+  Offset += SlotSize;
+  // All callee-saved registers are then pushed.
+  Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+  // Every funclet allocates enough stack space for the largest outgoing call.
+  Offset += getWinEHFuncletFrameSize(MF);
+  return Offset;
+}
+
+void X86FrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  // If this function isn't doing Win64-style C++ EH, we don't need to do
+  // anything.
+  const Function *Fn = MF.getFunction();
+  if (!STI.is64Bit() || !MF.hasEHFunclets() ||
+      classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
+    return;
+
+  // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
+  // relative to RSP after the prologue.  Find the offset of the last fixed
+  // object, so that we can allocate a slot immediately following it. If there
+  // were no fixed objects, use offset -SlotSize, which is immediately after the
+  // return address. Fixed objects have negative frame indices.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
+  int64_t MinFixedObjOffset = -SlotSize;
+  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I)
+    MinFixedObjOffset = std::min(MinFixedObjOffset, MFI.getObjectOffset(I));
+
+  for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+    for (WinEHHandlerType &H : TBME.HandlerArray) {
+      int FrameIndex = H.CatchObj.FrameIndex;
+      if (FrameIndex != INT_MAX) {
+        // Ensure alignment.
+        unsigned Align = MFI.getObjectAlignment(FrameIndex);
+        MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
+        MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
+        MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
+      }
+    }
+  }
+
+  // Ensure alignment.
+  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % 8;
+  int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize;
+  int UnwindHelpFI =
+      MFI.CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false);
+  EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
+  // Store -2 into UnwindHelp on function entry. We have to scan forwards past
+  // other frame setup instructions.
+  MachineBasicBlock &MBB = MF.front();
+  auto MBBI = MBB.begin();
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+    ++MBBI;
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
+                    UnwindHelpFI)
+      .addImm(-2);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 000000000000..e1b04d6dc300
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,218 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCCFIInstruction;
+class X86Subtarget;
+class X86RegisterInfo;
+
+class X86FrameLowering : public TargetFrameLowering {
+public:
+  X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride);
+
+  // Cached subtarget predicates.
+
+  const X86Subtarget &STI;
+  const TargetInstrInfo &TII;
+  const X86RegisterInfo *TRI;
+
+  unsigned SlotSize;
+
+  /// Is64Bit implies that x86_64 instructions are available.
+  bool Is64Bit;
+
+  bool IsLP64;
+
+  /// True if the 64-bit frame or stack pointer should be used. True for most
+  /// 64-bit targets with the exception of x32. If this is false, 32-bit
+  /// instruction operands should be used to manipulate StackPtr and FramePtr.
+  bool Uses64BitFramePtr;
+
+  unsigned StackPtr;
+
+  /// Emit target stack probe code. This is required for all
+  /// large stack allocations on Windows. The caller is required to materialize
+  /// the number of bytes to probe in RAX/EAX.
+  void emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                      bool InProlog) const;
+
+  /// Replace a StackProbe inline-stub with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologMBB) const override;
+
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const DebugLoc &DL) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  void adjustForSegmentedStacks(MachineFunction &MF,
+                                MachineBasicBlock &PrologueMBB) const override;
+
+  void adjustForHiPEPrologue(MachineFunction &MF,
+                             MachineBasicBlock &PrologueMBB) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS = nullptr) const override;
+
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
+
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+
+  int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+                                     unsigned &FrameReg,
+                                     bool IgnoreSPUpdates) const override;
+
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+
+  unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS) const override;
+
+  /// Check the instruction before/after the passed instruction. If
+  /// it is an ADD/SUB/LEA instruction it is deleted argument and the
+  /// stack adjustment is returned as a positive value for ADD/LEA and
+  /// a negative for SUB.
+  int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                     bool doMergeWithPrevious) const;
+
+  /// Emit a series of instructions to increment / decrement the stack
+  /// pointer by a constant value.
+  void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                    int64_t NumBytes, bool InEpilogue) const;
+
+  /// Check that LEA can be used on SP in an epilogue sequence for \p MF.
+  bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const;
+
+  /// Check whether or not the given \p MBB can be used as a prologue
+  /// for the target.
+  /// The prologue will be inserted first in this basic block.
+  /// This method is used by the shrink-wrapping pass to decide if
+  /// \p MBB will be correctly handled by the target.
+  /// As soon as the target enable shrink-wrapping without overriding
+  /// this method, we assume that each basic block is a valid
+  /// prologue.
+  bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+
+  /// Check whether or not the given \p MBB can be used as a epilogue
+  /// for the target.
+  /// The epilogue will be inserted before the first terminator of that block.
+  /// This method is used by the shrink-wrapping pass to decide if
+  /// \p MBB will be correctly handled by the target.
+  bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
+  /// Returns true if the target will correctly handle shrink wrapping.
+  bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
+  /// Order the symbols in the local stack.
+  /// We want to place the local stack objects in some sort of sensible order.
+  /// The heuristic we use is to try and pack them according to static number
+  /// of uses and size in order to minimize code size.
+  void orderFrameObjects(const MachineFunction &MF,
+                         SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
+  /// convertArgMovsToPushes - This method tries to convert a call sequence
+  /// that uses sub and mov instructions to put the argument onto the stack
+  /// into a series of pushes.
+  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF, 
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I, 
+                              uint64_t Amount) const;
+
+  /// Wraps up getting a CFI index and building a MachineInstr for it.
+  void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+
+  /// Sets up EBP and optionally ESI based on the incoming EBP value.  Only
+  /// needed for 32-bit. Used in funclet prologues and at catchret destinations.
+  MachineBasicBlock::iterator
+  restoreWin32EHStackPointers(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              const DebugLoc &DL, bool RestoreSP = false) const;
+
+private:
+  uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
+  /// Emit target stack probe as a call to a helper function
+  void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                          bool InProlog) const;
+
+  /// Emit target stack probe as an inline sequence.
+  void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            const DebugLoc &DL, bool InProlog) const;
+
+  /// Emit a stub to later inline the target stack probe.
+  void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const DebugLoc &DL, bool InProlog) const;
+
+  /// Aligns the stack pointer by ANDing it with -MaxAlign.
+  void BuildStackAlignAND(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                          unsigned Reg, uint64_t MaxAlign) const;
+
+  /// Make small positive stack adjustments using POPs.
+  bool adjustStackWithPops(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+                           int Offset) const;
+
+  /// Adjusts the stack pointer using LEA, SUB, or ADD.
+  MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           const DebugLoc &DL, int64_t Offset,
+                                           bool InEpilogue) const;
+
+  unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
+
+  unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..8b66790679d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2798 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <stdint.h>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
+  /// numbers for the leaves of the matched tree.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    // This is really a union, discriminated by BaseType!
+    SDValue Base_Reg;
+    int Base_FrameIndex;
+
+    unsigned Scale;
+    SDValue IndexReg;
+    int32_t Disp;
+    SDValue Segment;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    MCSymbol *MCSym;
+    int JT;
+    unsigned Align;    // CP alignment.
+    unsigned char SymbolFlags;  // X86II::MO_*
+
+    X86ISelAddressMode()
+        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
+
+    bool hasSymbolicDisplacement() const {
+      return GV != nullptr || CP != nullptr || ES != nullptr ||
+             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
+    }
+
+    bool hasBaseOrIndexReg() const {
+      return BaseType == FrameIndexBase ||
+             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
+    }
+
+    /// Return true if this addressing mode is already RIP-relative.
+    bool isRIPRelative() const {
+      if (BaseType != RegBase) return false;
+      if (RegisterSDNode *RegNode =
+            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+        return RegNode->getReg() == X86::RIP;
+      return false;
+    }
+
+    void setBaseReg(SDValue Reg) {
+      BaseType = RegBase;
+      Base_Reg = Reg;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() {
+      dbgs() << "X86ISelAddressMode " << this << '\n';
+      dbgs() << "Base_Reg ";
+      if (Base_Reg.getNode())
+        Base_Reg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+             << " Scale" << Scale << '\n'
+             << "IndexReg ";
+      if (IndexReg.getNode())
+        IndexReg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Disp " << Disp << '\n'
+             << "GV ";
+      if (GV)
+        GV->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " CP ";
+      if (CP)
+        CP->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << '\n'
+             << "ES ";
+      if (ES)
+        dbgs() << ES;
+      else
+        dbgs() << "nul";
+      dbgs() << " MCSym ";
+      if (MCSym)
+        dbgs() << MCSym;
+      else
+        dbgs() << "nul";
+      dbgs() << " JT" << JT << " Align" << Align << '\n';
+    }
+#endif
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86-specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class X86DAGToDAGISel final : public SelectionDAGISel {
+    /// Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// If true, selector should try to optimize for code size instead of
+    /// performance.
+    bool OptForSize;
+
+    /// If true, selector should try to optimize for minimum code size.
+    bool OptForMinSize;
+
+  public:
+    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+        : SelectionDAGISel(tm, OptLevel), OptForSize(false),
+          OptForMinSize(false) {}
+
+    StringRef getPassName() const override {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Reset the subtarget each time through.
+      Subtarget = &MF.getSubtarget<X86Subtarget>();
+      SelectionDAGISel::runOnMachineFunction(MF);
+      return true;
+    }
+
+    void EmitFunctionEntryCode() override;
+
+    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
+
+    void PreprocessISelDAG() override;
+
+    inline bool immSext8(SDNode *N) const {
+      return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
+    }
+
+    // True if the 64-bit immediate fits in a 32-bit sign-extended field.
+    inline bool i64immSExt32(SDNode *N) const {
+      uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
+      return (int64_t)v == (int32_t)v;
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    void Select(SDNode *N) override;
+    bool tryGather(SDNode *N, unsigned Opc);
+
+    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
+    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+    bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
+    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                 unsigned Depth);
+    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
+    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                    SDValue &Scale, SDValue &Index, SDValue &Disp,
+                    SDValue &Segment);
+    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                          SDValue &Scale, SDValue &Index, SDValue &Disp,
+                          SDValue &Segment);
+    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
+    bool selectLEAAddr(SDValue N, SDValue &Base,
+                       SDValue &Scale, SDValue &Index, SDValue &Disp,
+                       SDValue &Segment);
+    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
+                            SDValue &Scale, SDValue &Index, SDValue &Disp,
+                            SDValue &Segment);
+    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
+                           SDValue &Scale, SDValue &Index, SDValue &Disp,
+                           SDValue &Segment);
+    bool selectScalarSSELoad(SDNode *Root, SDValue N,
+                             SDValue &Base, SDValue &Scale,
+                             SDValue &Index, SDValue &Disp,
+                             SDValue &Segment,
+                             SDValue &NodeWithChain);
+    bool selectRelocImm(SDValue N, SDValue &Op);
+
+    bool tryFoldLoad(SDNode *P, SDValue N,
+                     SDValue &Base, SDValue &Scale,
+                     SDValue &Index, SDValue &Disp,
+                     SDValue &Segment);
+
+    /// Implement addressing mode selection for inline asm expressions.
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      unsigned ConstraintID,
+                                      std::vector<SDValue> &OutOps) override;
+
+    void emitSpecialCodeForMain();
+
+    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
+                                   SDValue &Base, SDValue &Scale,
+                                   SDValue &Index, SDValue &Disp,
+                                   SDValue &Segment) {
+      Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+                 ? CurDAG->getTargetFrameIndex(
+                       AM.Base_FrameIndex,
+                       TLI->getPointerTy(CurDAG->getDataLayout()))
+                 : AM.Base_Reg;
+      Scale = getI8Imm(AM.Scale, DL);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP-relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
+                                              MVT::i32, AM.Disp,
+                                              AM.SymbolFlags);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+                                             AM.Align, AM.Disp, AM.SymbolFlags);
+      else if (AM.ES) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+      } else if (AM.MCSym) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
+        assert(AM.SymbolFlags == 0 && "oo");
+        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
+      } else if (AM.JT != -1) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+      } else if (AM.BlockAddr)
+        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+                                             AM.SymbolFlags);
+      else
+        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
+
+      if (AM.Segment.getNode())
+        Segment = AM.Segment;
+      else
+        Segment = CurDAG->getRegister(0, MVT::i32);
+    }
+
+    // Utility function to determine whether we should avoid selecting
+    // immediate forms of instructions for better code size or not.
+    // At a high level, we'd like to avoid such instructions when
+    // we have similar constants used within the same basic block
+    // that can be kept in a register.
+    //
+    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
+      uint32_t UseCount = 0;
+
+      // Do not want to hoist if we're not optimizing for size.
+      // TODO: We'd like to remove this restriction.
+      // See the comment in X86InstrInfo.td for more info.
+      if (!OptForSize)
+        return false;
+
+      // Walk all the users of the immediate.
+      for (SDNode::use_iterator UI = N->use_begin(),
+           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
+
+        SDNode *User = *UI;
+
+        // This user is already selected. Count it as a legitimate use and
+        // move on.
+        if (User->isMachineOpcode()) {
+          UseCount++;
+          continue;
+        }
+
+        // We want to count stores of immediates as real uses.
+        if (User->getOpcode() == ISD::STORE &&
+            User->getOperand(1).getNode() == N) {
+          UseCount++;
+          continue;
+        }
+
+        // We don't currently match users that have > 2 operands (except
+        // for stores, which are handled above)
+        // Those instruction won't match in ISEL, for now, and would
+        // be counted incorrectly.
+        // This may change in the future as we add additional instruction
+        // types.
+        if (User->getNumOperands() != 2)
+          continue;
+
+        // Immediates that are used for offsets as part of stack
+        // manipulation should be left alone. These are typically
+        // used to indicate SP offsets for argument passing and
+        // will get pulled into stores/pushes (implicitly).
+        if (User->getOpcode() == X86ISD::ADD ||
+            User->getOpcode() == ISD::ADD    ||
+            User->getOpcode() == X86ISD::SUB ||
+            User->getOpcode() == ISD::SUB) {
+
+          // Find the other operand of the add/sub.
+          SDValue OtherOp = User->getOperand(0);
+          if (OtherOp.getNode() == N)
+            OtherOp = User->getOperand(1);
+
+          // Don't count if the other operand is SP.
+          RegisterSDNode *RegNode;
+          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
+              (RegNode = dyn_cast_or_null<RegisterSDNode>(
+                 OtherOp->getOperand(1).getNode())))
+            if ((RegNode->getReg() == X86::ESP) ||
+                (RegNode->getReg() == X86::RSP))
+              continue;
+        }
+
+        // ... otherwise, count this and move on.
+        UseCount++;
+      }
+
+      // If we have more than 1 use, then recommend for hoisting.
+      return (UseCount > 1);
+    }
+
+    /// Return a target constant with the specified value of type i8.
+    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
+      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+    }
+
+    /// Return a target constant with the specified value, of type i32.
+    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
+      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
+    }
+
+    /// Return an SDNode that returns the value of the global base register.
+    /// Output instructions required to initialize the global base register,
+    /// if necessary.
+    SDNode *getGlobalBaseReg();
+
+    /// Return a reference to the TargetMachine, casted to the target-specific
+    /// type.
+    const X86TargetMachine &getTargetMachine() const {
+      return static_cast<const X86TargetMachine &>(TM);
+    }
+
+    /// Return a reference to the TargetInstrInfo, casted to the target-specific
+    /// type.
+    const X86InstrInfo *getInstrInfo() const {
+      return Subtarget->getInstrInfo();
+    }
+
+    /// \brief Address-mode matching performs shift-of-and to and-of-shift
+    /// reassociation in order to expose more scaled addressing
+    /// opportunities.
+    bool ComplexPatternFuncMutatesDAG() const override {
+      return true;
+    }
+  };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  if (!N.hasOneUse())
+    return false;
+
+  if (N.getOpcode() != ISD::LOAD)
+    return true;
+
+  // If N is a load, do additional profitability checks.
+  if (U == Root) {
+    switch (U->getOpcode()) {
+    default: break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::AND:
+    case X86ISD::XOR:
+    case X86ISD::OR:
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is a 8-bit immediate we should fold the immediate
+      // instead. This reduces code size.
+      // e.g.
+      // movl 4(%esp), %eax
+      // addl $4, %eax
+      // vs.
+      // movl $4, %eax
+      // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In case where the increment is 1, then
+      // the saving can be 4 bytes (by using incl %eax).
+      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+        if (Imm->getAPIntValue().isSignedIntN(8))
+          return false;
+
+      // If the other operand is a TLS address, we should fold it instead.
+      // This produces
+      // movl    %gs:0, %eax
+      // leal    i@NTPOFF(%eax), %eax
+      // instead of
+      // movl    $i@NTPOFF, %eax
+      // addl    %gs:0, %eax
+      // if the block also has an access to a second TLS address this will save
+      // a load.
+      // FIXME: This is probably also true for non-TLS addresses.
+      if (Op1.getOpcode() == X86ISD::Wrapper) {
+        SDValue Val = Op1.getOperand(0);
+        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+          return false;
+      }
+    }
+    }
+  }
+
+  return true;
+}
+
+/// Replace the original chain operand of the call with
+/// load's chain operand and move load below the call's chain operand.
+static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+                               SDValue Call, SDValue OrigChain) {
+  SmallVector<SDValue, 8> Ops;
+  SDValue Chain = OrigChain.getOperand(0);
+  if (Chain.getNode() == Load.getNode())
+    Ops.push_back(Load.getOperand(0));
+  else {
+    assert(Chain.getOpcode() == ISD::TokenFactor &&
+           "Unexpected chain operand");
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+      if (Chain.getOperand(i).getNode() == Load.getNode())
+        Ops.push_back(Load.getOperand(0));
+      else
+        Ops.push_back(Chain.getOperand(i));
+    SDValue NewChain =
+      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
+    Ops.clear();
+    Ops.push_back(NewChain);
+  }
+  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
+  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
+  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+                             Load.getOperand(1), Load.getOperand(2));
+
+  Ops.clear();
+  Ops.push_back(SDValue(Load.getNode(), 1));
+  Ops.append(Call->op_begin() + 1, Call->op_end());
+  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
+}
+
+/// Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After MoveBelowOrigChain the load is moved between the call and
+  // the chain, this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD ||
+      LD->isVolatile() ||
+      LD->getAddressingMode() != ISD::UNINDEXED ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // Now let's find the callseq_start.
+  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+    if (!Chain.hasOneUse())
+      return false;
+    Chain = Chain.getOperand(0);
+  }
+
+  if (!Chain.getNumOperands())
+    return false;
+  // Since we are not checking for AA here, conservatively abort if the chain
+  // writes to memory. It's not safe to move the callee (a load) across a store.
+  if (isa<MemSDNode>(Chain.getNode()) &&
+      cast<MemSDNode>(Chain.getNode())->writeMem())
+    return false;
+  if (Chain.getOperand(0).getNode() == Callee.getNode())
+    return true;
+  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+      Callee.getValue(1).hasOneUse())
+    return true;
+  return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+  // OptFor[Min]Size are used in pattern predicates that isel is matching.
+  OptForSize = MF->getFunction()->optForSize();
+  OptForMinSize = MF->getFunction()->optForMinSize();
+  assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
+
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+
+    if (OptLevel != CodeGenOpt::None &&
+        // Only does this when target favors doesn't favor register indirect
+        // call.
+        ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+         (N->getOpcode() == X86ISD::TC_RETURN &&
+          // Only does this if load can be folded into TC_RETURN.
+          (Subtarget->is64Bit() ||
+           !getTargetMachine().isPositionIndependent())))) {
+      /// Also try moving call address load from outside callseq_start to just
+      /// before the call to allow it to be folded.
+      ///
+      ///     [Load chain]
+      ///         ^
+      ///         |
+      ///       [Load]
+      ///       ^    ^
+      ///       |    |
+      ///      /      \--
+      ///     /          |
+      ///[CALLSEQ_START] |
+      ///     ^          |
+      ///     |          |
+      /// [LOAD/C2Reg]   |
+      ///     |          |
+      ///      \        /
+      ///       \      /
+      ///       [CALL]
+      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+      SDValue Chain = N->getOperand(0);
+      SDValue Load  = N->getOperand(1);
+      if (!isCalleeLoad(Load, Chain, HasCallSeq))
+        continue;
+      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+      ++NumLoadMoved;
+      continue;
+    }
+
+    // Lower fpround and fpextend nodes that target the FP stack to be store and
+    // load to the stack.  This is a gross hack.  We would like to simply mark
+    // these as being illegal, but when we do that, legalize produces these when
+    // it expands calls, then expands these in the same legalize pass.  We would
+    // like dag combine to be able to hack on these between the call expansion
+    // and the node legalization.  As such this pass basically does "really
+    // late" legalization of these inline with the X86 isel pass.
+    // FIXME: This should only happen when not compiled with -O0.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+
+    MVT SrcVT = N->getOperand(0).getSimpleValueType();
+    MVT DstVT = N->getSimpleValueType(0);
+
+    // If any of the sources are vectors, no fp stack involved.
+    if (SrcVT.isVector() || DstVT.isVector())
+      continue;
+
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    const X86TargetLowering *X86Lowering =
+        static_cast<const X86TargetLowering *>(TLI);
+    bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    if (!SrcIsSSE && !DstIsSSE) {
+      // If this is an FPStack extension, it is a noop.
+      if (N->getOpcode() == ISD::FP_EXTEND)
+        continue;
+      // If this is a value-preserving FPStack truncation, it is a noop.
+      if (N->getConstantOperandVal(1))
+        continue;
+    }
+
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    MVT MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+    SDLoc dl(N);
+
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDValue Store =
+        CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
+                              MemTmp, MachinePointerInfo(), MemVT);
+    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+                                        MachinePointerInfo(), MemVT);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    CurDAG->DeleteNode(N);
+  }
+}
+
+
+/// Emit any code that needs to be executed only in the main function.
+void X86DAGToDAGISel::emitSpecialCodeForMain() {
+  if (Subtarget->isTargetCygMing()) {
+    TargetLowering::ArgListTy Args;
+    auto &DL = CurDAG->getDataLayout();
+
+    TargetLowering::CallLoweringInfo CLI(*CurDAG);
+    CLI.setChain(CurDAG->getRoot())
+        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
+                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
+                   std::move(Args));
+    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
+    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+    CurDAG->setRoot(Result.second);
+  }
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+  // If this is main, emit special code for main.
+  if (const Function *Fn = MF->getFunction())
+    if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+      emitSpecialCodeForMain();
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+  // On 64-bit platforms, we can run into an issue where a frame index
+  // includes a displacement that, when added to the explicit displacement,
+  // will overflow the displacement field. Assuming that the frame index
+  // displacement fits into a 31-bit integer  (which is only slightly more
+  // aggressive than the current fundamental assumption that it fits into
+  // a 32-bit integer), a 31-bit disp should always be safe.
+  return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
+                                            X86ISelAddressMode &AM) {
+  // Cannot combine ExternalSymbol displacements with integer offsets.
+  if (Offset != 0 && (AM.ES || AM.MCSym))
+    return true;
+  int64_t Val = AM.Disp + Offset;
+  CodeModel::Model M = TM.getCodeModel();
+  if (Subtarget->is64Bit()) {
+    if (!X86::isOffsetSuitableForCodeModel(Val, M,
+                                           AM.hasSymbolicDisplacement()))
+      return true;
+    // In addition to the checks required for a register base, check that
+    // we do not try to use an unsafe Disp with a frame index.
+    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+        !isDispSafeForFrameIndex(Val))
+      return true;
+  }
+  AM.Disp = Val;
+  return false;
+
+}
+
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+  SDValue Address = N->getOperand(1);
+
+  // load gs:0 -> GS segment register.
+  // load fs:0 -> FS segment register.
+  //
+  // This optimization is valid because the GNU TLS model defines that
+  // gs:0 (or fs:0 on X86-64) contains its own address.
+  // For more information see http://people.redhat.com/drepper/tls.pdf
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+        Subtarget->isTargetGlibc())
+      switch (N->getPointerInfo().getAddrSpace()) {
+      case 256:
+        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+        return false;
+      case 257:
+        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+        return false;
+      // Address space 258 is not handled here, because it is not used to
+      // address TLS areas.
+      }
+
+  return true;
+}
+
+/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
+/// mode. These wrap things that will resolve down into a symbol reference.
+/// If no match is possible, this returns true, otherwise it returns false.
+bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+  CodeModel::Model M = TM.getCodeModel();
+
+  // Handle X86-64 rip-relative addresses.  We check this before checking direct
+  // folding because RIP is preferable to non-RIP accesses.
+  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+      // they cannot be folded into immediate fields.
+      // FIXME: This can be improved for kernel and other models?
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    // Base and index reg must be 0 in order to use %rip as base.
+    if (AM.hasBaseOrIndexReg())
+      return true;
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.GV = G->getGlobal();
+      AM.SymbolFlags = G->getTargetFlags();
+      if (foldOffsetIntoAddress(G->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.SymbolFlags = CP->getTargetFlags();
+      if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+      AM.MCSym = S->getMCSymbol();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.SymbolFlags = BA->getTargetFlags();
+      if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+
+    if (N.getOpcode() == X86ISD::WrapperRIP)
+      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+    return false;
+  }
+
+  // Handle the case when globals fit in our immediate field: This is true for
+  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
+  // mode, this only applies to a non-RIP-relative computation.
+  if (!Subtarget->is64Bit() ||
+      M == CodeModel::Small || M == CodeModel::Kernel) {
+    assert(N.getOpcode() != X86ISD::WrapperRIP &&
+           "RIP-relative addressing already handled");
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      AM.GV = G->getGlobal();
+      AM.Disp += G->getOffset();
+      AM.SymbolFlags = G->getTargetFlags();
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.Disp += CP->getOffset();
+      AM.SymbolFlags = CP->getTargetFlags();
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+      AM.MCSym = S->getMCSymbol();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.Disp += BA->getOffset();
+      AM.SymbolFlags = BA->getTargetFlags();
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+    return false;
+  }
+
+  return true;
+}
+
+/// Add the specified node to the specified addressing mode, returning true if
+/// it cannot be done. This just pattern matches for the addressing mode.
+bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
+  if (matchAddressRecursively(N, AM, 0))
+    return true;
+
+  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+  // a smaller encoding and avoids a scaled-index.
+  if (AM.Scale == 2 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == nullptr) {
+    AM.Base_Reg = AM.IndexReg;
+    AM.Scale = 1;
+  }
+
+  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+  // because it has a smaller encoding.
+  // TODO: Which other code models can use this?
+  if (TM.getCodeModel() == CodeModel::Small &&
+      Subtarget->is64Bit() &&
+      AM.Scale == 1 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == nullptr &&
+      AM.IndexReg.getNode() == nullptr &&
+      AM.SymbolFlags == X86II::MO_NO_FLAG &&
+      AM.hasSymbolicDisplacement())
+    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+  return false;
+}
+
+bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
+                               unsigned Depth) {
+  // Add an artificial use to this node so that we can keep track of
+  // it if it gets CSE'd with a different node.
+  HandleSDNode Handle(N);
+
+  X86ISelAddressMode Backup = AM;
+  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+    return false;
+  AM = Backup;
+
+  // Try again after commuting the operands.
+  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
+      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+    return false;
+  AM = Backup;
+
+  // If we couldn't fold both operands into the address at the same time,
+  // see if we can just put each operand into a register and fold at least
+  // the add.
+  if (AM.BaseType == X86ISelAddressMode::RegBase &&
+      !AM.Base_Reg.getNode() &&
+      !AM.IndexReg.getNode()) {
+    N = Handle.getValue();
+    AM.Base_Reg = N.getOperand(0);
+    AM.IndexReg = N.getOperand(1);
+    AM.Scale = 1;
+    return false;
+  }
+  N = Handle.getValue();
+  return true;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+    DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
+    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+  }
+}
+
+// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
+// safe. This allows us to convert the shift and and into an h-register
+// extract and a scaled index. Returns false if the simplification is
+// performed.
+static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+                                      uint64_t Mask,
+                                      SDValue Shift, SDValue X,
+                                      X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse())
+    return true;
+
+  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+  if (ScaleLog <= 0 || ScaleLog >= 4 ||
+      Mask != (0xffu << ScaleLog))
+    return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
+  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
+  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, Eight);
+  insertDAGNode(DAG, N, Srl);
+  insertDAGNode(DAG, N, NewMask);
+  insertDAGNode(DAG, N, And);
+  insertDAGNode(DAG, N, ShlCount);
+  insertDAGNode(DAG, N, Shl);
+  DAG.ReplaceAllUsesWith(N, Shl);
+  AM.IndexReg = And;
+  AM.Scale = (1 << ScaleLog);
+  return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
+static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+                                        uint64_t Mask,
+                                        SDValue Shift, SDValue X,
+                                        X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SHL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  // Not likely to be profitable if either the AND or SHIFT node has more
+  // than one use (unless all uses are for address computation). Besides,
+  // isel mechanism requires their node ids to be reused.
+  if (!N.hasOneUse() || !Shift.hasOneUse())
+    return true;
+
+  // Verify that the shift amount is something we can fold.
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+    return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewMask);
+  insertDAGNode(DAG, N, NewAnd);
+  insertDAGNode(DAG, N, NewShift);
+  DAG.ReplaceAllUsesWith(N, NewShift);
+
+  AM.Scale = 1 << ShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+//   int f(short *y, int *lookup_table) {
+//     ...
+//     return *y + lookup_table[*y >> 11];
+//   }
+//
+// Turning into:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $11, %ecx
+//   addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $9, %ecx
+//   andl $124, %rcx
+//   addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+                                    uint64_t Mask,
+                                    SDValue Shift, SDValue X,
+                                    X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  unsigned MaskLZ = countLeadingZeros(Mask);
+  unsigned MaskTZ = countTrailingZeros(Mask);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = MaskTZ;
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that mask is a continuous run of bits.
+  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+  // Scale the leading zero count down based on the actual size of the value.
+  // Also scale it down based on the size of the shift.
+  MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+
+  // The final check is to ensure that any masked out high bits of X are
+  // already known to be zero. Otherwise, the mask has a semantic impact
+  // other than masking out a couple of low bits. Unfortunately, because of
+  // the mask, zero extensions will be removed from operands in some cases.
+  // This code works extra hard to look through extensions because we can
+  // replace them with zero extensions cheaply if necessary.
+  bool ReplacingAnyExtend = false;
+  if (X.getOpcode() == ISD::ANY_EXTEND) {
+    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+                          X.getOperand(0).getSimpleValueType().getSizeInBits();
+    // Assume that we'll replace the any-extend with a zero-extend, and
+    // narrow the search to the extended value.
+    X = X.getOperand(0);
+    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+    ReplacingAnyExtend = true;
+  }
+  APInt MaskedHighBits =
+    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(X, KnownZero, KnownOne);
+  if (MaskedHighBits != KnownZero) return true;
+
+  // We've identified a pattern that can be transformed into a single shift
+  // and an addressing mode. Make it so.
+  MVT VT = N.getSimpleValueType();
+  if (ReplacingAnyExtend) {
+    assert(X.getValueType() != VT);
+    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
+    insertDAGNode(DAG, N, NewX);
+    X = NewX;
+  }
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewSRLAmt);
+  insertDAGNode(DAG, N, NewSRL);
+  insertDAGNode(DAG, N, NewSHLAmt);
+  insertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewSRL;
+  return false;
+}
+
+bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                              unsigned Depth) {
+  SDLoc dl(N);
+  DEBUG({
+      dbgs() << "MatchAddress: ";
+      AM.dump();
+    });
+  // Limit recursion.
+  if (Depth > 5)
+    return matchAddressBase(N, AM);
+
+  // If this is already a %rip relative address, we can only merge immediates
+  // into it.  Instead of handling this in every case, we handle it here.
+  // RIP relative addressing: %rip + 32-bit displacement!
+  if (AM.isRIPRelative()) {
+    // FIXME: JumpTable and ExternalSymbol address currently don't like
+    // displacements.  It isn't very important, but this should be fixed for
+    // consistency.
+    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
+      return true;
+
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
+        return false;
+    return true;
+  }
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::LOCAL_RECOVER: {
+    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
+      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
+        // Use the symbol and don't prefix it.
+        AM.MCSym = ESNode->getMCSymbol();
+        return false;
+      }
+    break;
+  }
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    if (!foldOffsetIntoAddress(Val, AM))
+      return false;
+    break;
+  }
+
+  case X86ISD::Wrapper:
+  case X86ISD::WrapperRIP:
+    if (!matchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::LOAD:
+    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == nullptr &&
+        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SHL:
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
+      break;
+
+    if (ConstantSDNode
+          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+      unsigned Val = CN->getZExtValue();
+      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+      // that the base operand remains free for further matching. If
+      // the base doesn't end up getting used, a post-processing step
+      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+      if (Val == 1 || Val == 2 || Val == 3) {
+        AM.Scale = 1 << Val;
+        SDValue ShVal = N.getNode()->getOperand(0);
+
+        // Okay, we know that we have a scale by now.  However, if the scaled
+        // value is an add of something and a constant, we can fold the
+        // constant into the disp field here.
+        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+          AM.IndexReg = ShVal.getNode()->getOperand(0);
+          ConstantSDNode *AddVal =
+            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+          if (!foldOffsetIntoAddress(Disp, AM))
+            return false;
+        }
+
+        AM.IndexReg = ShVal;
+        return false;
+      }
+    }
+    break;
+
+  case ISD::SRL: {
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+    SDValue And = N.getOperand(0);
+    if (And.getOpcode() != ISD::AND) break;
+    SDValue X = And.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+    // The mask used for the transform is expected to be post-shift, but we
+    // found the shift first so just apply the shift to the mask before passing
+    // it down.
+    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+        !isa<ConstantSDNode>(And.getOperand(1)))
+      break;
+    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into the scale, and return false if we
+    // succeed.
+    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+      return false;
+    break;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI:
+    // A mul_lohi where we need the low part can be folded as a plain multiply.
+    if (N.getResNo() != 0) break;
+    LLVM_FALLTHROUGH;
+  case ISD::MUL:
+  case X86ISD::MUL_IMM:
+    // X*[3,5,9] -> X+X*[2,4,8]
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == nullptr &&
+        AM.IndexReg.getNode() == nullptr) {
+      if (ConstantSDNode
+            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+            CN->getZExtValue() == 9) {
+          AM.Scale = unsigned(CN->getZExtValue())-1;
+
+          SDValue MulVal = N.getNode()->getOperand(0);
+          SDValue Reg;
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+            Reg = MulVal.getNode()->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+            if (foldOffsetIntoAddress(Disp, AM))
+              Reg = N.getNode()->getOperand(0);
+          } else {
+            Reg = N.getNode()->getOperand(0);
+          }
+
+          AM.IndexReg = AM.Base_Reg = Reg;
+          return false;
+        }
+    }
+    break;
+
+  case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address and
+    // the index field with the index field unused, use -B as the index.
+    // This is a win if a has multiple parts that can be folded into
+    // the address. Also, this saves a mov if the base register has
+    // other uses, since it avoids a two-address sub instruction, however
+    // it costs an additional mov if the index register has other uses.
+
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    // Test if the LHS of the sub can be folded.
+    X86ISelAddressMode Backup = AM;
+    if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+      AM = Backup;
+      break;
+    }
+    // Test if the index field is free for use.
+    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+      AM = Backup;
+      break;
+    }
+
+    int Cost = 0;
+    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+    // If the RHS involves a register with multiple uses, this
+    // transformation incurs an extra mov, due to the neg instruction
+    // clobbering its operand.
+    if (!RHS.getNode()->hasOneUse() ||
+        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+      ++Cost;
+    // If the base is a register with multiple uses, this
+    // transformation may save a mov.
+    if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+         AM.Base_Reg.getNode() &&
+         !AM.Base_Reg.getNode()->hasOneUse()) ||
+        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+      --Cost;
+    // If the folded LHS was interesting, this transformation saves
+    // address arithmetic.
+    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+        ((AM.Disp != 0) && (Backup.Disp == 0)) +
+        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+      --Cost;
+    // If it doesn't look like it may be an overall win, don't do it.
+    if (Cost >= 0) {
+      AM = Backup;
+      break;
+    }
+
+    // Ok, the transformation is legal and appears profitable. Go for it.
+    SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
+    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+    AM.IndexReg = Neg;
+    AM.Scale = 1;
+
+    // Insert the new nodes into the topological ordering.
+    insertDAGNode(*CurDAG, N, Zero);
+    insertDAGNode(*CurDAG, N, Neg);
+    return false;
+  }
+
+  case ISD::ADD:
+    if (!matchAdd(N, AM, Depth))
+      return false;
+    break;
+
+  case ISD::OR:
+    // We want to look through a transform in InstCombine and DAGCombiner that
+    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
+    // An 'lea' can then be used to match the shift (multiply) and add:
+    // and $1, %esi
+    // lea (%rsi, %rdi, 8), %rax
+    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
+        !matchAdd(N, AM, Depth))
+      return false;
+    break;
+
+  case ISD::AND: {
+    // Perform some heroic transforms on an and of a constant-count shift
+    // with a constant to enable use of the scaled offset field.
+
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
+
+    SDValue Shift = N.getOperand(0);
+    if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+    SDValue X = Shift.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
+
+    if (!isa<ConstantSDNode>(N.getOperand(1)))
+      break;
+    uint64_t Mask = N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into an extract and scale.
+    if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to fold the mask and shift directly into the scale.
+    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to swap the mask and shift to place shifts which can be done as
+    // a scale on the outside of the mask.
+    if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+    break;
+  }
+  }
+
+  return matchAddressBase(N, AM);
+}
+
+/// Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+    // If so, check to see if the scale index register is set.
+    if (!AM.IndexReg.getNode()) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base_Reg = N;
+  return false;
+}
+
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                      SDValue &Scale, SDValue &Index,
+                                      SDValue &Disp, SDValue &Segment) {
+
+  MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
+  if (!Mgs)
+    return false;
+  X86ISelAddressMode AM;
+  unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
+  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
+  if (AddrSpace == 256)
+    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+  if (AddrSpace == 257)
+    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+  if (AddrSpace == 258)
+    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+
+  SDLoc DL(N);
+  Base = Mgs->getBasePtr();
+  Index = Mgs->getIndex();
+  unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits();
+  Scale = getI8Imm(ScalarSize/8, DL);
+
+  // If Base is 0, the whole address is in index and the Scale is 1
+  if (isa<ConstantSDNode>(Base)) {
+    assert(cast<ConstantSDNode>(Base)->isNullValue() &&
+           "Unexpected base in gather/scatter");
+    Scale = getI8Imm(1, DL);
+    Base = CurDAG->getRegister(0, MVT::i32);
+  }
+  if (AM.Segment.getNode())
+    Segment = AM.Segment;
+  else
+    Segment = CurDAG->getRegister(0, MVT::i32);
+  Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+/// Returns true if it is able to pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched.  It
+/// is always a load, store, atomic node, or null.  It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  if (Parent &&
+      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+      // that are not a MemSDNode, and thus don't have proper addrspace info.
+      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+    unsigned AddrSpace =
+      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
+    if (AddrSpace == 256)
+      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+    if (AddrSpace == 257)
+      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+    if (AddrSpace == 258)
+      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
+  }
+
+  if (matchAddress(N, AM))
+    return false;
+
+  MVT VT = N.getSimpleValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base_Reg.getNode())
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// Match a scalar SSE load. In particular, we want to match a load whose top
+/// elements are either undef or zeros. The load flavor is derived from the
+/// type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+///   PatternChainNode: this is the matched node that has a chain input and
+///   output.
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &PatternNodeWithChain) {
+  // We can allow a full vector load here since narrowing a load is ok.
+  if (ISD::isNON_EXTLoad(N.getNode())) {
+    PatternNodeWithChain = N;
+    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+                        Segment);
+    }
+  }
+
+  // We can also match the special zero extended load opcode.
+  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
+    PatternNodeWithChain = N;
+    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+      auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
+      return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
+                        Segment);
+    }
+  }
+
+  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
+  // once. Otherwise the load might get duplicated and the chain output of the
+  // duplicate load will not be observed by all dependencies.
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
+    PatternNodeWithChain = N.getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+                        Segment);
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).getNode()->hasOneUse()) {
+    PatternNodeWithChain = N.getOperand(0).getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
+        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
+      // Okay, this is a zero extending load.  Fold it.
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
+                        Segment);
+    }
+  }
+
+  return false;
+}
+
+
+bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
+  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    uint64_t ImmVal = CN->getZExtValue();
+    if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+      return false;
+
+    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
+    return true;
+  }
+
+  // In static codegen with small code model, we can get the address of a label
+  // into a register with 'movl'. TableGen has already made sure we're looking
+  // at a label of some kind.
+  assert(N->getOpcode() == X86ISD::Wrapper &&
+         "Unexpected node type for MOV32ri64");
+  N = N.getOperand(0);
+
+  // At least GNU as does not accept 'movl' for TPOFF relocations.
+  // FIXME: We could use 'movl' when we know we are targeting MC.
+  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
+    return false;
+
+  Imm = N;
+  if (N->getOpcode() != ISD::TargetGlobalAddress)
+    return TM.getCodeModel() == CodeModel::Small;
+
+  Optional<ConstantRange> CR =
+      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
+  if (!CR)
+    return TM.getCodeModel() == CodeModel::Small;
+
+  return CR->getUnsignedMax().ult(1ull << 32);
+}
+
+bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
+                                         SDValue &Scale, SDValue &Index,
+                                         SDValue &Disp, SDValue &Segment) {
+  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
+  SDLoc DL(N);
+
+  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+    return false;
+
+  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+  if (RN && RN->getReg() == 0)
+    Base = CurDAG->getRegister(0, MVT::i64);
+  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
+    // Base could already be %rip, particularly in the x32 ABI.
+    Base = SDValue(CurDAG->getMachineNode(
+                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                       CurDAG->getTargetConstant(0, DL, MVT::i64),
+                       Base,
+                       CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
+                   0);
+  }
+
+  RN = dyn_cast<RegisterSDNode>(Index);
+  if (RN && RN->getReg() == 0)
+    Index = CurDAG->getRegister(0, MVT::i64);
+  else {
+    assert(Index.getValueType() == MVT::i32 &&
+           "Expect to be extending 32-bit registers for use in LEA");
+    Index = SDValue(CurDAG->getMachineNode(
+                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                        CurDAG->getTargetConstant(0, DL, MVT::i64),
+                        Index,
+                        CurDAG->getTargetConstant(X86::sub_32bit, DL,
+                                                  MVT::i32)),
+                    0);
+  }
+
+  return true;
+}
+
+/// Calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp,
+                                    SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  // Save the DL and VT before calling matchAddress, it can invalidate N.
+  SDLoc DL(N);
+  MVT VT = N.getSimpleValueType();
+
+  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+  // segments.
+  SDValue Copy = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (matchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = Copy;
+
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base_Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.getNode())
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some experimentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, always use LEA to materialize RIP-relative addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity++;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+
+  getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp, SDValue &Segment) {
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (N.getValueType() == MVT::i32) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+
+  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
+  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
+                                   N.getValueType());
+    return true;
+  }
+
+  // Keep track of the original value type and whether this value was
+  // truncated. If we see a truncation from pointer type to VT that truncates
+  // bits that are known to be zero, we can use a narrow reference.
+  EVT VT = N.getValueType();
+  bool WasTruncated = false;
+  if (N.getOpcode() == ISD::TRUNCATE) {
+    WasTruncated = true;
+    N = N.getOperand(0);
+  }
+
+  if (N.getOpcode() != X86ISD::Wrapper)
+    return false;
+
+  // We can only use non-GlobalValues as immediates if they were not truncated,
+  // as we do not have any range information. If we have a GlobalValue and the
+  // address was not truncated, we can select it as an operand directly.
+  unsigned Opc = N.getOperand(0)->getOpcode();
+  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
+    Op = N.getOperand(0);
+    // We can only select the operand directly if we didn't have to look past a
+    // truncate.
+    return !WasTruncated;
+  }
+
+  // Check that the global's range fits into VT.
+  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
+  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
+    return false;
+
+  // Okay, we can use a narrow reference.
+  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
+                                      GA->getOffset(), GA->getTargetFlags());
+  return true;
+}
+
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      !IsProfitableToFold(N, P, P) ||
+      !IsLegalToFold(N, P, P, OptLevel))
+    return false;
+
+  return selectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// Return an SDNode that returns the value of the global base register.
+/// Output instructions required to initialize the global base register,
+/// if necessary.
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  auto &DL = MF->getDataLayout();
+  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
+}
+
+/// Test whether the given X86ISD::CMP node has any uses which require the SF
+/// or OF bits to be accurate.
+static bool hasNoSignedComparisonUses(SDNode *N) {
+  // Examine each user of the node.
+  for (SDNode::use_iterator UI = N->use_begin(),
+         UE = N->use_end(); UI != UE; ++UI) {
+    // Only examine CopyToReg uses.
+    if (UI->getOpcode() != ISD::CopyToReg)
+      return false;
+    // Only examine CopyToReg uses that copy to EFLAGS.
+    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+          X86::EFLAGS)
+      return false;
+    // Examine each user of the CopyToReg use.
+    for (SDNode::use_iterator FlagUI = UI->use_begin(),
+           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+      // Only examine the Flag result.
+      if (FlagUI.getUse().getResNo() != 1) continue;
+      // Anything unusual: assume conservatively.
+      if (!FlagUI->isMachineOpcode()) return false;
+      // Examine the opcode of the user.
+      switch (FlagUI->getMachineOpcode()) {
+      // These comparisons don't treat the most significant bit specially.
+      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+      case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
+      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
+      case X86::CMOVA16rr: case X86::CMOVA16rm:
+      case X86::CMOVA32rr: case X86::CMOVA32rm:
+      case X86::CMOVA64rr: case X86::CMOVA64rm:
+      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+      case X86::CMOVB16rr: case X86::CMOVB16rm:
+      case X86::CMOVB32rr: case X86::CMOVB32rm:
+      case X86::CMOVB64rr: case X86::CMOVB64rm:
+      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+      case X86::CMOVE16rr: case X86::CMOVE16rm:
+      case X86::CMOVE32rr: case X86::CMOVE32rm:
+      case X86::CMOVE64rr: case X86::CMOVE64rm:
+      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+      case X86::CMOVP16rr: case X86::CMOVP16rm:
+      case X86::CMOVP32rr: case X86::CMOVP32rm:
+      case X86::CMOVP64rr: case X86::CMOVP64rm:
+        continue;
+      // Anything else: assume conservatively.
+      default: return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// Check whether or not the chain ending in StoreNode is suitable for doing
+/// the {load; increment or decrement; store} to modify transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the load?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // are there other uses of the loaded value than the inc or dec?
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
+                                   MVT::Other, ChainOps);
+  }
+  if (!ChainCheck)
+    return false;
+
+  return true;
+}
+
+/// Get the appropriate X86 opcode for an in-memory increment or decrement.
+/// Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  if (Opc == X86ISD::DEC) {
+    if (LdVT == MVT::i64) return X86::DEC64m;
+    if (LdVT == MVT::i32) return X86::DEC32m;
+    if (LdVT == MVT::i16) return X86::DEC16m;
+    if (LdVT == MVT::i8)  return X86::DEC8m;
+  } else {
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+    if (LdVT == MVT::i64) return X86::INC64m;
+    if (LdVT == MVT::i32) return X86::INC32m;
+    if (LdVT == MVT::i16) return X86::INC16m;
+    if (LdVT == MVT::i8)  return X86::INC8m;
+  }
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// Customized ISel for GATHER operations.
+bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
+  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!Scale)
+    return false;
+
+  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+                                   MVT::Other);
+
+  SDLoc DL(Node);
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
+                          Disp, Segment, VMask, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+  // Node has 2 outputs: VDst and MVT::Other.
+  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+  // of ResNode.
+  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+  CurDAG->RemoveDeadNode(Node);
+  return true;
+}
+
+void X86DAGToDAGISel::Select(SDNode *Node) {
+  MVT NVT = Node->getSimpleValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+  SDLoc dl(Node);
+
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
+    Node->setNodeId(-1);
+    return;   // Already selected.
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::BRIND: {
+    if (Subtarget->isTargetNaCl())
+      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
+      // leave the instruction alone.
+      break;
+    if (Subtarget->isTarget64BitILP32()) {
+      // Converts a 32-bit register to a 64-bit, zero-extended version of
+      // it. This is needed because x86-64 can do many things, but jmp %r32
+      // ain't one of them.
+      const SDValue &Target = Node->getOperand(1);
+      assert(Target.getSimpleValueType() == llvm::MVT::i32);
+      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+      SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
+                                      Node->getOperand(0), ZextTarget);
+      ReplaceNode(Node, Brind.getNode());
+      SelectCode(ZextTarget.getNode());
+      SelectCode(Brind.getNode());
+      return;
+    }
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_avx2_gather_d_pd:
+    case Intrinsic::x86_avx2_gather_d_pd_256:
+    case Intrinsic::x86_avx2_gather_q_pd:
+    case Intrinsic::x86_avx2_gather_q_pd_256:
+    case Intrinsic::x86_avx2_gather_d_ps:
+    case Intrinsic::x86_avx2_gather_d_ps_256:
+    case Intrinsic::x86_avx2_gather_q_ps:
+    case Intrinsic::x86_avx2_gather_q_ps_256:
+    case Intrinsic::x86_avx2_gather_d_q:
+    case Intrinsic::x86_avx2_gather_d_q_256:
+    case Intrinsic::x86_avx2_gather_q_q:
+    case Intrinsic::x86_avx2_gather_q_q_256:
+    case Intrinsic::x86_avx2_gather_d_d:
+    case Intrinsic::x86_avx2_gather_d_d_256:
+    case Intrinsic::x86_avx2_gather_q_d:
+    case Intrinsic::x86_avx2_gather_q_d_256: {
+      if (!Subtarget->hasAVX2())
+        break;
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Impossible intrinsic");
+      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
+      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
+      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
+      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
+      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
+      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
+      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
+      }
+      if (tryGather(Node, Opc))
+        return;
+      break;
+    }
+    }
+    break;
+  }
+  case X86ISD::GlobalBaseReg:
+    ReplaceNode(Node, getGlobalBaseReg());
+    return;
+
+  case X86ISD::SHRUNKBLEND: {
+    // SHRUNKBLEND selects like a regular VSELECT.
+    SDValue VSelect = CurDAG->getNode(
+        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+        Node->getOperand(1), Node->getOperand(2));
+    ReplaceUses(SDValue(Node, 0), VSelect);
+    SelectCode(VSelect.getNode());
+    // We already called ReplaceUses.
+    return;
+  }
+
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    // For operations of the form (x << C1) op C2, check if we can use a smaller
+    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+      break;
+
+    // i8 is unshrinkable, i16 should be promoted to i32.
+    if (NVT != MVT::i32 && NVT != MVT::i64)
+      break;
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!Cst || !ShlCst)
+      break;
+
+    int64_t Val = Cst->getSExtValue();
+    uint64_t ShlVal = ShlCst->getZExtValue();
+
+    // Make sure that we don't change the operation by removing bits.
+    // This only matters for OR and XOR, AND is unaffected.
+    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+      break;
+
+    unsigned ShlOp, AddOp, Op;
+    MVT CstVT = NVT;
+
+    // Check the minimum bitwidth for the new constant.
+    // TODO: AND32ri is the same as AND64ri32 with zext imm.
+    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+      CstVT = MVT::i8;
+    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+      CstVT = MVT::i32;
+
+    // Bail if there is no smaller encoding.
+    if (NVT == CstVT)
+      break;
+
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i32:
+      assert(CstVT == MVT::i8);
+      ShlOp = X86::SHL32ri;
+      AddOp = X86::ADD32rr;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = X86::AND32ri8; break;
+      case ISD::OR:  Op =  X86::OR32ri8; break;
+      case ISD::XOR: Op = X86::XOR32ri8; break;
+      }
+      break;
+    case MVT::i64:
+      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+      ShlOp = X86::SHL64ri;
+      AddOp = X86::ADD64rr;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
+      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+      }
+      break;
+    }
+
+    // Emit the smaller op and the shift.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    if (ShlVal == 1)
+      CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
+                           SDValue(New, 0));
+    else
+      CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                           getI8Imm(ShlVal, dl));
+    return;
+  }
+  case X86ISD::UMUL8:
+  case X86ISD::SMUL8: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
+                                          N0, SDValue()).getValue(1);
+
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
+    SDValue Ops[] = {N1, InFlag};
+    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+    ReplaceNode(Node, CNode);
+    return;
+  }
+
+  case X86ISD::UMUL: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    unsigned LoReg;
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
+    case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
+    case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+    case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+                                          N0, SDValue()).getValue(1);
+
+    SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+    SDValue Ops[] = {N1, InFlag};
+    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+    ReplaceNode(Node, CNode);
+    return;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SMUL_LOHI;
+    bool hasBMI2 = Subtarget->hasBMI2();
+    if (!isSigned) {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+      case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+      }
+    } else {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+      }
+    }
+
+    unsigned SrcReg, LoReg, HiReg;
+    switch (Opc) {
+    default: llvm_unreachable("Unknown MUL opcode!");
+    case X86::IMUL8r:
+    case X86::MUL8r:
+      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+      break;
+    case X86::IMUL16r:
+    case X86::MUL16r:
+      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+      break;
+    case X86::IMUL32r:
+    case X86::MUL32r:
+      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+      break;
+    case X86::IMUL64r:
+    case X86::MUL64r:
+      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+      break;
+    case X86::MULX32rr:
+      SrcReg = X86::EDX; LoReg = HiReg = 0;
+      break;
+    case X86::MULX64rr:
+      SrcReg = X86::RDX; LoReg = HiReg = 0;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commmutative.
+    if (!foldedLoad) {
+      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (foldedLoad)
+        std::swap(N0, N1);
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+                                          N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
+
+    if (foldedLoad) {
+      SDValue Chain;
+      MachineSDNode *CNode = nullptr;
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+        InFlag = SDValue(CNode, 3);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
+
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), Chain);
+      // Record the mem-refs
+      LoadSDNode *LoadNode = cast<LoadSDNode>(N1);
+      if (LoadNode) {
+        MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+        MemOp[0] = LoadNode->getMemOperand();
+        CNode->setMemRefs(MemOp, MemOp + 1);
+      }
+    } else {
+      SDValue Ops[] = { N1, InFlag };
+      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        InFlag = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        InFlag = SDValue(CNode, 0);
+      }
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+      // Get the low part if needed. Don't use getCopyFromReg for aliasing
+      // registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 1),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX down 8 bits.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                              Result,
+                                     CurDAG->getTargetConstant(8, dl, MVT::i8)),
+                       0);
+      // Then truncate it down to i8.
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the low half of the result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      if (!ResLo.getNode()) {
+        assert(LoReg && "Register for low half is not defined!");
+        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+                                       InFlag);
+        InFlag = ResLo.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 0), ResLo);
+      DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the high half of the result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      if (!ResHi.getNode()) {
+        assert(HiReg && "Register for high half is not defined!");
+        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+                                       InFlag);
+        InFlag = ResHi.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), ResHi);
+      DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+
+    return;
+  }
+
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+  case X86ISD::SDIVREM8_SEXT_HREG:
+  case X86ISD::UDIVREM8_ZEXT_HREG: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = (Opcode == ISD::SDIVREM ||
+                     Opcode == X86ISD::SDIVREM8_SEXT_HREG);
+    if (!isSigned) {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+      }
+    } else {
+      switch (NVT.SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+      }
+    }
+
+    unsigned LoReg, HiReg, ClrReg;
+    unsigned SExtOpcode;
+    switch (NVT.SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:
+      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
+      SExtOpcode = X86::CBW;
+      break;
+    case MVT::i16:
+      LoReg = X86::AX;  HiReg = X86::DX;
+      ClrReg = X86::DX;
+      SExtOpcode = X86::CWD;
+      break;
+    case MVT::i32:
+      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+      SExtOpcode = X86::CDQ;
+      break;
+    case MVT::i64:
+      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+      SExtOpcode = X86::CQO;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+    SDValue InFlag;
+    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+      // Special case for div8, just use a move with zero extension to AX to
+      // clear the upper 8 bits (AH).
+      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+                                         MVT::Other, Ops), 0);
+        Chain = Move.getValue(1);
+        ReplaceUses(N0.getValue(1), Chain);
+      } else {
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+        Chain = CurDAG->getEntryNode();
+      }
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+      InFlag = Chain.getValue(1);
+    } else {
+      InFlag =
+        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+                             LoReg, N0, SDValue()).getValue(1);
+      if (isSigned && !signBitIsZero) {
+        // Sign extend the low part into the high part.
+        InFlag =
+          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+      } else {
+        // Zero out the high part, effectively zero extending the input.
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+        switch (NVT.SimpleTy) {
+        case MVT::i16:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
+                                                    MVT::i32)),
+                      0);
+          break;
+        case MVT::i32:
+          break;
+        case MVT::i64:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
+                                                    MVT::i32)),
+                      0);
+          break;
+        default:
+          llvm_unreachable("Unexpected division source");
+        }
+
+        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+                                      ClrNode, InFlag).getValue(1);
+      }
+    }
+
+    if (foldedLoad) {
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      SDNode *CNode =
+        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
+      InFlag = SDValue(CNode, 1);
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+    } else {
+      InFlag =
+        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+    }
+
+    // Prevent use of AH in a REX instruction by explicitly copying it to
+    // an ABCD_L register.
+    //
+    // The current assumption of the register allocator is that isel
+    // won't generate explicit references to the GR8_ABCD_H registers. If
+    // the allocator and/or the backend get enhanced to be more robust in
+    // that regard, this can be, and should be, removed.
+    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+      unsigned AHExtOpcode =
+          isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+
+      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+                                             MVT::Glue, AHCopy, InFlag);
+      SDValue Result(RNode, 0);
+      InFlag = SDValue(RNode, 1);
+
+      if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
+          Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
+        if (Node->getValueType(1) == MVT::i64) {
+          // It's not possible to directly movsx AH to a 64bit register, because
+          // the latter needs the REX prefix, but the former can't have it.
+          assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
+                 "Unexpected i64 sext of h-register");
+          Result =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, dl, MVT::i64), Result,
+                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
+                                                    MVT::i32)),
+                      0);
+        }
+      } else {
+        Result =
+            CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+      }
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the division (low) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                LoReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 0), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the remainder (high) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              HiReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    return;
+  }
+
+  case X86ISD::CMP:
+  case X86ISD::SUB: {
+    // Sometimes a SUB is used to perform comparison.
+    if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+      // This node is not a CMP.
+      break;
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+        hasNoSignedComparisonUses(Node))
+      N0 = N0.getOperand(0);
+
+    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+    // use a smaller encoding.
+    // Look past the truncate if CMP is the only use of it.
+    if ((N0.getNode()->getOpcode() == ISD::AND ||
+         (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+        N0.getNode()->hasOneUse() &&
+        N0.getValueType() != MVT::i8 &&
+        X86::isZeroNode(N1)) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      if (!C) break;
+
+      // For example, convert "testl %eax, $8" to "testb %al, $8"
+      if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+          (!(C->getZExtValue() & 0x80) ||
+           hasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // On x86-32, only the ABCD registers have 8-bit subregisters.
+        if (!Subtarget->is64Bit()) {
+          const TargetRegisterClass *TRC;
+          switch (N0.getSimpleValueType().SimpleTy) {
+          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+          case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+          default: llvm_unreachable("Unsupported TEST operand type!");
+          }
+          SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+          Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                               Reg.getValueType(), Reg, RC), 0);
+        }
+
+        // Extract the l-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return;
+      }
+
+      // For example, "testl %eax, $2048" to "testb %ah, $8".
+      if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           hasNoSignedComparisonUses(Node))) {
+        // Shift the immediate right by 8 bits.
+        SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+                                                       dl, MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Put the value in an ABCD register.
+        const TargetRegisterClass *TRC;
+        switch (N0.getSimpleValueType().SimpleTy) {
+        case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+        case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+        case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+        default: llvm_unreachable("Unsupported TEST operand type!");
+        }
+        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
+        Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                             Reg.getValueType(), Reg, RC), 0);
+
+        // Extract the h-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
+        // target GR8_NOREX registers, so make sure the register class is
+        // forced.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+                                                 MVT::i32, Subreg, ShiftedImm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return;
+      }
+
+      // For example, "testl %eax, $32776" to "testw %ax, $32776".
+      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+          N0.getValueType() != MVT::i16 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           hasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+                                                MVT::i16);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 16-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+                                                        MVT::i16, Reg);
+
+        // Emit a testw.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return;
+      }
+
+      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+          N0.getValueType() == MVT::i64 &&
+          (!(C->getZExtValue() & 0x80000000) ||
+           hasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
+                                                MVT::i32);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 32-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+                                                        MVT::i32, Reg);
+
+        // Emit a testl.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return;
+      }
+    }
+    break;
+  }
+  case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
+    // The DEC64m tablegen pattern is currently not able to match the case where
+    // the EFLAGS on the original DEC are used. (This also applies to
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node.  probably with a new keyword
+    // for example, we have this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (implicit EFLAGS)]>;
+    // but maybe need something like this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (transferrable EFLAGS)]>;
+
+    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+    SDValue StoredVal = StoreNode->getOperand(1);
+    unsigned Opc = StoredVal->getOpcode();
+
+    LoadSDNode *LoadNode = nullptr;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
+      break;
+
+    SDValue Base, Scale, Index, Disp, Segment;
+    if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
+                    Base, Scale, Index, Disp, Segment))
+      break;
+
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+    MemOp[0] = StoreNode->getMemOperand();
+    MemOp[1] = LoadNode->getMemOperand();
+    const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+    EVT LdVT = LoadNode->getMemoryVT();
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+                                                   SDLoc(Node),
+                                                   MVT::i32, MVT::Other, Ops);
+    Result->setMemRefs(MemOp, MemOp + 2);
+
+    ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+    ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+  }
+
+  SelectCode(Node);
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1, Op2, Op3, Op4;
+  switch (ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+    // FIXME: It seems strange that 'i' is needed here since it's supposed to
+    //        be an immediate and not a memory constraint.
+    LLVM_FALLTHROUGH;
+  case InlineAsm::Constraint_o: // offsetable        ??
+  case InlineAsm::Constraint_v: // not offsetable    ??
+  case InlineAsm::Constraint_m: // memory
+  case InlineAsm::Constraint_X:
+    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  OutOps.push_back(Op4);
+  return false;
+}
+
+/// This pass converts a legalized DAG into a X86-specific DAG,
+/// ready for instruction scheduling.
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 000000000000..b293dfa98f82
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,34395 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <bitset>
+#include <cctype>
+#include <numeric>
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-isel"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(false),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+                                     const X86Subtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
+  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
+  X86ScalarSSEf64 = Subtarget.hasSSE2();
+  X86ScalarSSEf32 = Subtarget.hasSSE1();
+  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
+
+  // Set up the TargetLowering object.
+
+  // X86 is weird. It always uses i8 for shift amounts and setcc results.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use the register pressure specific scheduling.
+  // For Atom, always use ILP scheduling.
+  if (Subtarget.isAtom())
+    setSchedulingPreference(Sched::ILP);
+  else if (Subtarget.is64Bit())
+    setSchedulingPreference(Sched::ILP);
+  else
+    setSchedulingPreference(Sched::RegPressure);
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+  // Bypass expensive divides on Atom when compiling with O2.
+  if (TM.getOptLevel() >= CodeGenOpt::Default) {
+    if (Subtarget.hasSlowDivide32())
+      addBypassSlowDiv(32, 8);
+    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
+      addBypassSlowDiv(64, 16);
+  }
+
+  if (Subtarget.isTargetKnownWindowsMSVC() ||
+      Subtarget.isTargetWindowsItanium()) {
+    // Setup Windows compiler runtime calls.
+    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+    setLibcallName(RTLIB::SREM_I64, "_allrem");
+    setLibcallName(RTLIB::UREM_I64, "_aullrem");
+    setLibcallName(RTLIB::MUL_I64, "_allmul");
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+  }
+
+  if (Subtarget.isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget.isTargetWindowsGNU()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, &X86::GR8RegClass);
+  addRegisterClass(MVT::i16, &X86::GR16RegClass);
+  addRegisterClass(MVT::i32, &X86::GR32RegClass);
+  if (Subtarget.is64Bit())
+    addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+  for (MVT VT : MVT::integer_valuetypes())
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+  // We don't accept any truncstore of integer registers.
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // SETOEQ and SETUNE require checking two conditions.
+  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget.is64Bit()) {
+    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
+      // f32/f64 are legal, f80 is custom.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
+    else
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+  } else if (!Subtarget.useSoftFloat()) {
+    // We have an algorithm for SSE2->double, and we turn this into a
+    // 64-bit FILD followed by conditional FADD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+    // We have an algorithm for SSE2, and we turn this into a 64-bit
+    // FILD or VCVTUSI2SS/SD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+
+  if (!Subtarget.useSoftFloat()) {
+    // SSE has no i16 to fp conversion, only i32.
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    }
+  } else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
+  }
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (!Subtarget.useSoftFloat()) {
+    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
+    // are Legal, f80 is custom lowered.
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+    }
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget.is64Bit()) {
+    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
+    } else {
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
+    }
+  } else if (!Subtarget.useSoftFloat()) {
+    // Since AVX is a superset of SSE3, only check for SSE here.
+    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
+      // With SSE3 we can use fisttpll to convert to a signed i64; without
+      // SSE, we're stuck with a fistpll.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
+
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSEf64) {
+    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
+    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
+    if (Subtarget.is64Bit()) {
+      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
+      // Without SSE, i64->f64 goes through memory.
+      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
+    }
+  } else if (!Subtarget.is64Bit())
+    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
+
+  // Scalar integer divide and remainder are lowered to use operations that
+  // produce two results, to match the available instructions. This exposes
+  // the two-result form to trivial CSE, which is able to combine x/y and x%y
+  // into a single instruction.
+  //
+  // Scalar integer multiply-high is also lowered to use two-result
+  // operations, to match the available instructions. However, plain multiply
+  // (low) operations are left as Legal, as there are single-result
+  // instructions for this in x86. Using the two-result multiply instructions
+  // when both high and low results are needed must be arranged by dagcombine.
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+  }
+
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget.is64Bit())
+      continue;
+    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
+    setOperationAction(ISD::ADDC, VT, Custom);
+    setOperationAction(ISD::ADDE, VT, Custom);
+    setOperationAction(ISD::SUBC, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Custom);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
+                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
+    setOperationAction(ISD::BR_CC,     VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+  }
+  if (Subtarget.is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+
+  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
+  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+
+  // Promote the i8 variants and force them on up to i32 which has a shorter
+  // encoding.
+  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+  if (!Subtarget.hasBMI()) {
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
+    if (Subtarget.is64Bit()) {
+      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
+    }
+  }
+
+  if (Subtarget.hasLZCNT()) {
+    // When promoting the i8 variants, force them to i32 for a shorter
+    // encoding.
+    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
+    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+  } else {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
+    if (Subtarget.is64Bit()) {
+      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
+  }
+
+  // Special handling for half-precision floating point conversions.
+  // If we don't have F16C support, then lower half float conversions
+  // into library calls.
+  if (Subtarget.useSoftFloat() ||
+      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+  }
+
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
+  if (Subtarget.hasPOPCNT()) {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
+    if (Subtarget.is64Bit())
+      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+
+  if (!Subtarget.hasMOVBE())
+    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
+  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+    setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::SETCC, VT, Custom);
+  }
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget.is64Bit())
+      continue;
+    setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::SETCC,  VT, Custom);
+    setOperationAction(ISD::SETCCE, VT, Custom);
+  }
+  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
+  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+
+  // Darwin ABI issue.
+  for (auto VT : { MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget.is64Bit())
+      continue;
+    setOperationAction(ISD::ConstantPool    , VT, Custom);
+    setOperationAction(ISD::JumpTable       , VT, Custom);
+    setOperationAction(ISD::GlobalAddress   , VT, Custom);
+    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
+    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
+    setOperationAction(ISD::BlockAddress    , VT, Custom);
+  }
+  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+  for (auto VT : { MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget.is64Bit())
+      continue;
+    setOperationAction(ISD::SHL_PARTS, VT, Custom);
+    setOperationAction(ISD::SRA_PARTS, VT, Custom);
+    setOperationAction(ISD::SRL_PARTS, VT, Custom);
+  }
+
+  if (Subtarget.hasSSE1())
+    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
+
+  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+
+  // Expand certain atomics
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+  }
+
+  if (Subtarget.hasCmpxchg16b()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
+  }
+
+  // FIXME - use subtarget debug flags
+  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
+      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
+      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  bool Is64Bit = Subtarget.is64Bit();
+  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
+
+  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
+  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
+  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
+
+  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
+    // f32 and f64 use SSE.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+                                                     : &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
+                                                     : &X86::FR64RegClass);
+
+    for (auto VT : { MVT::f32, MVT::f64 }) {
+      // Use ANDPD to simulate FABS.
+      setOperationAction(ISD::FABS, VT, Custom);
+
+      // Use XORP to simulate FNEG.
+      setOperationAction(ISD::FNEG, VT, Custom);
+
+      // Use ANDPD and ORPD to simulate FCOPYSIGN.
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+
+      // We don't support sin/cos/fmod
+      setOperationAction(ISD::FSIN   , VT, Expand);
+      setOperationAction(ISD::FCOS   , VT, Expand);
+      setOperationAction(ISD::FSINCOS, VT, Expand);
+    }
+
+    // Lower this to MOVMSK plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    addLegalFPImmediate(APFloat(+0.0)); // xorpd
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+  } else if (UseX87 && X86ScalarSSEf32) {
+    // Use SSE for f32, x87 for f64.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
+                                                     : &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+    // Use ANDPS to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+
+    // Use ANDPS and ORPS to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // Special cases we handle for FP constants.
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    }
+  } else if (UseX87) {
+    // f32 and f64 in x87.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+    for (auto VT : { MVT::f32, MVT::f64 }) {
+      setOperationAction(ISD::UNDEF,     VT, Expand);
+      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+
+      if (!TM.Options.UnsafeFPMath) {
+        setOperationAction(ISD::FSIN   , VT, Expand);
+        setOperationAction(ISD::FCOS   , VT, Expand);
+        setOperationAction(ISD::FSINCOS, VT, Expand);
+      }
+    }
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+  }
+
+  // We don't support FMA.
+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+  // Long double always uses X87, except f128 in MMX.
+  if (UseX87) {
+    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
+      addRegisterClass(MVT::f128, &X86::FR128RegClass);
+      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+      setOperationAction(ISD::FABS , MVT::f128, Custom);
+      setOperationAction(ISD::FNEG , MVT::f128, Custom);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+    }
+
+    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+    {
+      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
+      addLegalFPImmediate(TmpFlt);  // FLD0
+      TmpFlt.changeSign();
+      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+
+      bool ignored;
+      APFloat TmpFlt2(+1.0);
+      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
+                      &ignored);
+      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.changeSign();
+      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+    }
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+    }
+
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+    setOperationAction(ISD::FMA, MVT::f80, Expand);
+  }
+
+  // Always use a library call for pow.
+  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
+
+  setOperationAction(ISD::FLOG, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
+
+  // Some FP actions are always expanded for vector types.
+  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
+                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+    setOperationAction(ISD::FSIN,      VT, Expand);
+    setOperationAction(ISD::FSINCOS,   VT, Expand);
+    setOperationAction(ISD::FCOS,      VT, Expand);
+    setOperationAction(ISD::FREM,      VT, Expand);
+    setOperationAction(ISD::FPOWI,     VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    setOperationAction(ISD::FPOW,      VT, Expand);
+    setOperationAction(ISD::FLOG,      VT, Expand);
+    setOperationAction(ISD::FLOG2,     VT, Expand);
+    setOperationAction(ISD::FLOG10,    VT, Expand);
+    setOperationAction(ISD::FEXP,      VT, Expand);
+    setOperationAction(ISD::FEXP2,     VT, Expand);
+  }
+
+  // First set operation action for all vector types to either promote
+  // (for widening) or expand (for scalarization). Then we will selectively
+  // turn on ones that can be effectively codegen'd.
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::FMA,  VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+    setOperationAction(ISD::TRUNCATE, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    for (MVT InnerVT : MVT::vector_valuetypes()) {
+      setTruncStoreAction(InnerVT, VT, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
+
+      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+      // types, we have to deal with them whether we ask for Expansion or not.
+      // Setting Expand causes its own optimisation problems though, so leave
+      // them legal.
+      if (VT.getVectorElementType() == MVT::i1)
+        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
+      // split/scalarized right now.
+      if (VT.getVectorElementType() == MVT::f16)
+        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+    }
+  }
+
+  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+  // with -msoft-float, disable use of MMX as well.
+  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
+    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+    // No operations on x86mmx supported, everything uses intrinsics.
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
+    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
+    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+
+    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
+    // registers cannot be used even for integer operations.
+    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+
+    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
+    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
+
+    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
+
+    setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
+    setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
+    setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
+    setOperationAction(ISD::CTTZ,               MVT::v2i64, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    // We support custom legalizing of sext and anyext loads for specific
+    // memory vector types which we can load as a scalar (or sequence of
+    // scalars) and extend in-register to a legal 128-bit vector type. For sext
+    // loads these must work with a single scalar load.
+    for (MVT VT : MVT::integer_vector_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+    }
+
+    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Custom);
+
+      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
+        continue;
+
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
+      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
+      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
+      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
+    }
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
+
+    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
+
+    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
+
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
+
+    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
+
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
+    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+    }
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+    }
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+    setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
+    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
+    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
+    setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
+    setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
+    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+      setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
+      setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
+      setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
+      setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
+      setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
+    }
+
+    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
+
+    // FIXME: Do we need to handle scalar-to-vector here?
+    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+
+    // We directly match byte blends in the backend as they match the VSELECT
+    // condition form.
+    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+
+    // SSE41 brings specific instructions for doing vector sign extend even in
+    // cases where we don't have SRA.
+    for (MVT VT : MVT::integer_vector_valuetypes()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+    }
+
+    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+    // i8 vectors are custom because the source register and source
+    // source memory operand types are not the same width.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
+    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
+                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+      setOperationAction(ISD::ROTL, VT, Custom);
+
+    // XOP can efficiently perform BITREVERSE with VPPERM.
+    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
+
+    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
+                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
+    bool HasInt256 = Subtarget.hasInt256();
+
+    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                     : &X86::VR256RegClass);
+
+    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
+      setOperationAction(ISD::FFLOOR,     VT, Legal);
+      setOperationAction(ISD::FCEIL,      VT, Legal);
+      setOperationAction(ISD::FTRUNC,     VT, Legal);
+      setOperationAction(ISD::FRINT,      VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
+      setOperationAction(ISD::FNEG,       VT, Custom);
+      setOperationAction(ISD::FABS,       VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
+    }
+
+    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
+    // even though v8i16 is a legal type.
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
+
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
+
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
+
+    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+    }
+
+    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
+    setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
+
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+      setOperationAction(ISD::CTPOP,           VT, Custom);
+      setOperationAction(ISD::CTTZ,            VT, Custom);
+      setOperationAction(ISD::CTLZ,            VT, Custom);
+    }
+
+    if (Subtarget.hasAnyFMA()) {
+      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
+                       MVT::v2f64, MVT::v4f64 })
+        setOperationAction(ISD::FMA, VT, Legal);
+    }
+
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
+      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
+    }
+
+    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
+    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
+
+    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
+    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
+
+    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
+    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
+
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
+      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
+      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
+      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+    }
+
+    if (HasInt256) {
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
+
+      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+      // when we have a 256bit-wide blend with immediate.
+      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
+
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
+    }
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+    }
+
+    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+      setOperationAction(ISD::MLOAD,  VT, Legal);
+      setOperationAction(ISD::MSTORE, VT, Legal);
+    }
+
+    // Extract subvector is special because the value type
+    // (result) is 128-bit but the source is 256-bit wide.
+    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
+                     MVT::v4f32, MVT::v2f64 }) {
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    }
+
+    // Custom lower several nodes for 256-bit types.
+    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+                    MVT::v8f32, MVT::v4f64 }) {
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VSELECT,            VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
+    }
+
+    if (HasInt256)
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+
+    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
+      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
+      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
+      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
+    }
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
+    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
+
+    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
+    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
+    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
+
+    for (MVT VT : MVT::fp_vector_valuetypes())
+      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
+      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
+      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
+      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
+      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
+      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
+    }
+    setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
+    setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
+    setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
+    setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
+    setOperationAction(ISD::XOR,                MVT::i1,    Legal);
+    setOperationAction(ISD::OR,                 MVT::i1,    Legal);
+    setOperationAction(ISD::AND,                MVT::i1,    Legal);
+    setOperationAction(ISD::SUB,                MVT::i1,    Custom);
+    setOperationAction(ISD::ADD,                MVT::i1,    Custom);
+    setOperationAction(ISD::MUL,                MVT::i1,    Custom);
+
+    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
+                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
+                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
+      setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
+      setTruncStoreAction(VT, MaskVT, Custom);
+    }
+
+    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+      setOperationAction(ISD::FNEG,  VT, Custom);
+      setOperationAction(ISD::FABS,  VT, Custom);
+      setOperationAction(ISD::FMA,   VT, Legal);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+    }
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i1, Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i1,  Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i1,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i1,  Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i1,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i1,  Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
+    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
+
+    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
+    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
+    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
+    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
+    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
+    if (Subtarget.hasVLX()){
+      setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
+      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+      setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
+      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+      setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
+      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+      setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
+      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+    } else {
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+           MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD,  VT, Custom);
+        setOperationAction(ISD::MSTORE, VT, Custom);
+      }
+    }
+    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i1,  Expand);
+    setOperationAction(ISD::VSELECT,            MVT::v16i1, Expand);
+    if (Subtarget.hasDQI()) {
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v4i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v2i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v4i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v4i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v2i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v4i64, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v2i64, Legal);
+
+      if (Subtarget.hasVLX()) {
+        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
+        setOperationAction(ISD::SINT_TO_FP,    MVT::v2f32, Custom);
+        setOperationAction(ISD::FP_TO_SINT,    MVT::v2f32, Custom);
+        setOperationAction(ISD::FP_TO_UINT,    MVT::v2f32, Custom);
+      }
+    }
+    if (Subtarget.hasVLX()) {
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
+      setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
+      setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
+      setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
+
+      // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
+      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+    }
+
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
+    if (Subtarget.hasDQI()) {
+      setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
+      setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
+    }
+    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
+      setOperationAction(ISD::FFLOOR,     VT, Legal);
+      setOperationAction(ISD::FCEIL,      VT, Legal);
+      setOperationAction(ISD::FTRUNC,     VT, Legal);
+      setOperationAction(ISD::FRINT,      VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
+    }
+
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
+
+    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
+
+    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
+
+    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
+
+    setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
+
+    setOperationAction(ISD::ADD,                MVT::v8i1,  Expand);
+    setOperationAction(ISD::ADD,                MVT::v16i1, Expand);
+    setOperationAction(ISD::SUB,                MVT::v8i1,  Expand);
+    setOperationAction(ISD::SUB,                MVT::v16i1, Expand);
+    setOperationAction(ISD::MUL,                MVT::v8i1,  Expand);
+    setOperationAction(ISD::MUL,                MVT::v16i1, Expand);
+
+    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
+
+    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::CTTZ, VT, Custom);
+    }
+
+    // Need to promote to 64-bit even though we have 32-bit masked instructions
+    // because the IR optimizers rearrange bitcasts around logic ops leaving
+    // too many variations to handle if we don't promote them.
+    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
+    setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
+    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
+
+    if (Subtarget.hasCDI()) {
+      setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
+      setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
+
+      setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
+      setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
+      setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
+      setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
+
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
+
+      if (Subtarget.hasVLX()) {
+        setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
+        setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
+        setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
+        setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
+      } else {
+        setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
+        setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
+        setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
+        setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
+      }
+
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
+    } // Subtarget.hasCDI()
+
+    if (Subtarget.hasDQI()) {
+      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+      setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
+      setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
+      setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
+    }
+
+    // Custom lower several nodes.
+    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
+      setOperationAction(ISD::MGATHER,  VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
+    }
+    // Extract subvector is special because the value type
+    // (result) is 256-bit but the source is 512-bit wide.
+    // 128-bit was made Custom under AVX1.
+    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+                     MVT::v8f32, MVT::v4f64 })
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
+                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
+      setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
+      setOperationAction(ISD::VSELECT,             VT, Legal);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+      setOperationAction(ISD::MLOAD,               VT, Legal);
+      setOperationAction(ISD::MSTORE,              VT, Legal);
+      setOperationAction(ISD::MGATHER,             VT, Legal);
+      setOperationAction(ISD::MSCATTER,            VT, Custom);
+    }
+    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
+      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
+      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
+    }
+  }// has  AVX-512
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
+    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
+
+    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+
+    setOperationAction(ISD::ADD,                MVT::v32i1, Expand);
+    setOperationAction(ISD::ADD,                MVT::v64i1, Expand);
+    setOperationAction(ISD::SUB,                MVT::v32i1, Expand);
+    setOperationAction(ISD::SUB,                MVT::v64i1, Expand);
+    setOperationAction(ISD::MUL,                MVT::v32i1, Expand);
+    setOperationAction(ISD::MUL,                MVT::v64i1, Expand);
+
+    setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
+    setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
+    setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
+    setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1,  Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
+    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
+    setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
+    setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
+    setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
+
+    setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
+    setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
+    setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
+    setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
+    setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
+
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+
+    setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
+    if (Subtarget.hasVLX()) {
+      setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
+      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
+    }
+
+    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
+    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+      setOperationAction(ISD::MLOAD,               VT, Action);
+      setOperationAction(ISD::MSTORE,              VT, Action);
+    }
+
+    if (Subtarget.hasCDI()) {
+      setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
+      setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
+    }
+
+    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VSELECT,      VT, Legal);
+      setOperationAction(ISD::SRL,          VT, Custom);
+      setOperationAction(ISD::SHL,          VT, Custom);
+      setOperationAction(ISD::SRA,          VT, Custom);
+      setOperationAction(ISD::MLOAD,        VT, Legal);
+      setOperationAction(ISD::MSTORE,       VT, Legal);
+      setOperationAction(ISD::CTPOP,        VT, Custom);
+      setOperationAction(ISD::CTTZ,         VT, Custom);
+
+      setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
+      setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
+      setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
+    }
+
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
+      if (Subtarget.hasVLX()) {
+        // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
+        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
+        setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
+      }
+    }
+  }
+
+  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
+    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
+    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
+
+    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
+      setOperationAction(ISD::ADD,                VT, Expand);
+      setOperationAction(ISD::SUB,                VT, Expand);
+      setOperationAction(ISD::MUL,                VT, Expand);
+      setOperationAction(ISD::VSELECT,            VT, Expand);
+
+      setOperationAction(ISD::TRUNCATE,           VT, Custom);
+      setOperationAction(ISD::SETCC,              VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+    }
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
+
+    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+      setOperationAction(ISD::SMAX, VT, Legal);
+      setOperationAction(ISD::UMAX, VT, Legal);
+      setOperationAction(ISD::SMIN, VT, Legal);
+      setOperationAction(ISD::UMIN, VT, Legal);
+    }
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  if (!Subtarget.is64Bit()) {
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  }
+
+  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+  // handle type legalization for these operations here.
+  //
+  // FIXME: We really should do custom legalization for addition and
+  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
+  // than generic legalization for 64-bit multiplication-with-overflow, though.
+  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+    if (VT == MVT::i64 && !Subtarget.is64Bit())
+      continue;
+    // Add/Sub/Mul with overflow operations are custom lowered.
+    setOperationAction(ISD::SADDO, VT, Custom);
+    setOperationAction(ISD::UADDO, VT, Custom);
+    setOperationAction(ISD::SSUBO, VT, Custom);
+    setOperationAction(ISD::USUBO, VT, Custom);
+    setOperationAction(ISD::SMULO, VT, Custom);
+    setOperationAction(ISD::UMULO, VT, Custom);
+  }
+
+  if (!Subtarget.is64Bit()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget.hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget.isTargetDarwin()) {
+      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  if (Subtarget.isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
+  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
+  // is. We should promote the value to 64-bits to solve this.
+  // This is what the CRT headers do - `fmodf` is an inline header
+  // function casting to f64 and calling `fmod`.
+  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
+                              Subtarget.isTargetWindowsItanium()))
+    for (ISD::NodeType Op :
+         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
+          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+      if (isOperationExpand(Op, MVT::f32))
+        setOperationAction(Op, MVT::f32, Promote);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BITCAST);
+  setTargetDAGCombine(ISD::VSELECT);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FNEG);
+  setTargetDAGCombine(ISD::FMA);
+  setTargetDAGCombine(ISD::FMINNUM);
+  setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::MLOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::MSTORE);
+  setTargetDAGCombine(ISD::TRUNCATE);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+  setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::MSCATTER);
+  setTargetDAGCombine(ISD::MGATHER);
+
+  computeRegisterProperties(Subtarget.getRegisterInfo());
+
+  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = 4;
+  setPrefLoopAlignment(4); // 2^4 bytes.
+
+  // An out-of-order CPU can speculatively execute past a predictable branch,
+  // but a conditional move could be stalled by an expensive earlier operation.
+  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
+  EnableExtLdPromotion = true;
+  setPrefFunctionAlignment(4); // 2^4 bytes.
+
+  verifyIntrinsicTables();
+}
+
+// This has so far only been implemented for 64-bit MachO.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
+}
+
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
+                                          LLVMContext& Context,
+                                          EVT VT) const {
+  if (!VT.isVector())
+    return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+
+  if (VT.isSimple()) {
+    MVT VVT = VT.getSimpleVT();
+    const unsigned NumElts = VVT.getVectorNumElements();
+    MVT EltVT = VVT.getVectorElementType();
+    if (VVT.is512BitVector()) {
+      if (Subtarget.hasAVX512())
+        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+            EltVT == MVT::f32 || EltVT == MVT::f64)
+          switch(NumElts) {
+          case  8: return MVT::v8i1;
+          case 16: return MVT::v16i1;
+        }
+      if (Subtarget.hasBWI())
+        if (EltVT == MVT::i8 || EltVT == MVT::i16)
+          switch(NumElts) {
+          case 32: return MVT::v32i1;
+          case 64: return MVT::v64i1;
+        }
+    }
+
+    if (Subtarget.hasBWI() && Subtarget.hasVLX())
+      return MVT::getVectorVT(MVT::i1, NumElts);
+
+    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
+      EVT LegalVT = getTypeToTransformTo(Context, VT);
+      EltVT = LegalVT.getVectorElementType().getSimpleVT();
+    }
+
+    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
+      switch(NumElts) {
+      case 2: return MVT::v2i1;
+      case 4: return MVT::v4i1;
+      case 8: return MVT::v8i1;
+      }
+  }
+
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+  if (MaxAlign == 16)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (VTy->getBitWidth() == 128)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (auto *EltTy : STy->elements()) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(EltTy, EltAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == 16)
+        break;
+    }
+  }
+}
+
+/// Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
+                                                  const DataLayout &DL) const {
+  if (Subtarget.is64Bit()) {
+    // Max of 8 and alignment of type.
+    unsigned TyAlign = DL.getABITypeAlignment(Ty);
+    if (TyAlign > 8)
+      return TyAlign;
+    return 8;
+  }
+
+  unsigned Align = 4;
+  if (Subtarget.hasSSE1())
+    getMaxByValAlign(Ty, Align);
+  return Align;
+}
+
+/// Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
+                                       bool IsMemset, bool ZeroMemset,
+                                       bool MemcpyStrSrc,
+                                       MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+    if (Size >= 16 &&
+        (!Subtarget.isUnalignedMem16Slow() ||
+         ((DstAlign == 0 || DstAlign >= 16) &&
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      // FIXME: Check if unaligned 32-byte accesses are slow.
+      if (Size >= 32 && Subtarget.hasAVX()) {
+        // Although this isn't a well-supported type for AVX1, we'll let
+        // legalization and shuffle lowering produce the optimal codegen. If we
+        // choose an optimal type with a vector element larger than a byte,
+        // getMemsetStores() may create an intermediate splat (using an integer
+        // multiply) before we splat as a vector.
+        return MVT::v32i8;
+      }
+      if (Subtarget.hasSSE2())
+        return MVT::v16i8;
+      // TODO: Can SSE1 handle a byte vector?
+      if (Subtarget.hasSSE1())
+        return MVT::v4f32;
+    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
+               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
+      // Do not use f64 to lower memcpy if source is string constant. It's
+      // better to use i32 to avoid the loads.
+      // Also, do not use f64 to lower memset unless this is a memset of zeros.
+      // The gymnastics of splatting a byte value into an XMM register and then
+      // only using 8-byte stores (because this is a CPU with slow unaligned
+      // 16-byte accesses) makes that a loser.
+      return MVT::f64;
+    }
+  }
+  // This is a compromise. If we reach here, unaligned accesses may be slow on
+  // this target. However, creating smaller, aligned accesses could be even
+  // slower and would certainly be a lot more code.
+  if (Subtarget.is64Bit() && Size >= 8)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  if (VT == MVT::f32)
+    return X86ScalarSSEf32;
+  else if (VT == MVT::f64)
+    return X86ScalarSSEf64;
+  return true;
+}
+
+bool
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
+  if (Fast) {
+    switch (VT.getSizeInBits()) {
+    default:
+      // 8-byte and under are always assumed to be fast.
+      *Fast = true;
+      break;
+    case 128:
+      *Fast = !Subtarget.isUnalignedMem16Slow();
+      break;
+    case 256:
+      *Fast = !Subtarget.isUnalignedMem32Slow();
+      break;
+    // TODO: What about AVX-512 (512-bit) accesses?
+    }
+  }
+  // Misaligned accesses of any size are always allowed.
+  return true;
+}
+
+/// Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol.
+  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
+    return MachineJumpTableInfo::EK_Custom32;
+
+  // Otherwise, use the normal jump table encoding heuristics.
+  return TargetLowering::getJumpTableEncoding();
+}
+
+bool X86TargetLowering::useSoftFloat() const {
+  return Subtarget.useSoftFloat();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid,MCContext &Ctx) const{
+  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  return MCSymbolRefExpr::create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// Returns relocation base for the given PIC jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget.is64Bit())
+    // This doesn't have SDLoc associated with it, but is not really the
+    // same as a Register.
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                       getPointerTy(DAG.getDataLayout()));
+  return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget.isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+}
+
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+    break;
+  case MVT::x86mmx:
+    RRC = &X86::VR64RegClass;
+    break;
+  case MVT::f32: case MVT::f64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
+  case MVT::v8f32: case MVT::v4f64:
+  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
+  case MVT::v16f32: case MVT::v8f64:
+    RRC = &X86::VR128XRegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+unsigned X86TargetLowering::getAddressSpace() const {
+  if (Subtarget.is64Bit())
+    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
+  return 256;
+}
+
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  // glibc has a special slot for the stack guard in tcbhead_t, use it instead
+  // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
+  if (!Subtarget.isTargetGlibc())
+    return TargetLowering::getIRStackGuard(IRB);
+
+  // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+  // %gs:0x14 on i386
+  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+  unsigned AddressSpace = getAddressSpace();
+  return ConstantExpr::getIntToPtr(
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+void X86TargetLowering::insertSSPDeclarations(Module &M) const {
+  // MSVC CRT provides functionalities for stack protection.
+  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+    // MSVC CRT has a global variable holding security cookie.
+    M.getOrInsertGlobal("__security_cookie",
+                        Type::getInt8PtrTy(M.getContext()));
+
+    // MSVC CRT has a function to validate security cookie.
+    auto *SecurityCheckCookie = cast<Function>(
+        M.getOrInsertFunction("__security_check_cookie",
+                              Type::getVoidTy(M.getContext()),
+                              Type::getInt8PtrTy(M.getContext()), nullptr));
+    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
+    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
+    return;
+  }
+  // glibc has a special slot for the stack guard.
+  if (Subtarget.isTargetGlibc())
+    return;
+  TargetLowering::insertSSPDeclarations(M);
+}
+
+Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
+  // MSVC CRT has a global variable holding security cookie.
+  if (Subtarget.getTargetTriple().isOSMSVCRT())
+    return M.getGlobalVariable("__security_cookie");
+  return TargetLowering::getSDagStackGuard(M);
+}
+
+Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+  // MSVC CRT has a function to validate security cookie.
+  if (Subtarget.getTargetTriple().isOSMSVCRT())
+    return M.getFunction("__security_check_cookie");
+  return TargetLowering::getSSPStackGuardCheck(M);
+}
+
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (Subtarget.getTargetTriple().isOSContiki())
+    return getDefaultSafeStackPointerLocation(IRB, false);
+
+  if (!Subtarget.isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  unsigned AddressSpace, Offset;
+
+  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+  // %gs:0x24 on i386
+  Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+  AddressSpace = getAddressSpace();
+  return ConstantExpr::getIntToPtr(
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool X86TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
+/// Lowers masks values (v*i1) to the local register values
+/// \returns DAG node after lowering to register type
+static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
+                               const SDLoc &Dl, SelectionDAG &DAG) {
+  EVT ValVT = ValArg.getValueType();
+
+  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
+      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
+    // Two stage lowering might be required
+    // bitcast:   v8i1 -> i8 / v16i1 -> i16
+    // anyextend: i8   -> i32 / i16   -> i32
+    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
+    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
+    if (ValLoc == MVT::i32)
+      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
+    return ValToCopy;
+  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
+             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
+    // One stage lowering is required
+    // bitcast:   v32i1 -> i32 / v64i1 -> i64
+    return DAG.getBitcast(ValLoc, ValArg);
+  } else
+    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
+}
+
+/// Breaks v64i1 value into two registers and adds the new node to the DAG
+static void Passv64i1ArgInRegs(
+    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
+    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
+    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
+  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
+         "Expected AVX512BW or AVX512BMI target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The value should reside in two registers");
+
+  // Before splitting the value we cast it to i64
+  Arg = DAG.getBitcast(MVT::i64, Arg);
+
+  // Splitting the value into two i32 types
+  SDValue Lo, Hi;
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(0, Dl, MVT::i32));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
+                   DAG.getConstant(1, Dl, MVT::i32));
+
+  // Attach the two i32 types into corresponding registers
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               const SDLoc &dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+    report_fatal_error("X86 interrupts may not return any value");
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+                   MVT::i32));
+
+  // Copy the result values into the output registers.
+  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
+       ++I, ++OutsIndex) {
+    CCValAssign &VA = RVLocs[I];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[OutsIndex];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::AExt) {
+      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
+        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
+      else
+        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    }
+    else if (VA.getLocInfo() == CCValAssign::BCvt)
+      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
+
+    assert(VA.getLocInfo() != CCValAssign::FPExt &&
+           "Unexpected FP-extend for return value.");
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values,
+    // or SSE or MMX vectors.
+    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+          (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
+    // llvm-gcc has never done it right and no one has noticed, so this
+    // should be OK for now.
+    if (ValVT == MVT::f64 &&
+        (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
+      report_fatal_error("SSE2 register return with SSE2 disabled");
+
+    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+    // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
+      // If this is a copy from an xmm register to ST(0), use an FPExtend to
+      // change the value to the FP stack register class.
+      if (isScalarFPTypeInSSEReg(VA.getValVT()))
+        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+      RetOps.push_back(ValToCopy);
+      // Don't emit a copytoreg.
+      continue;
+    }
+
+    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+    // which is returned in RAX / RDX.
+    if (Subtarget.is64Bit()) {
+      if (ValVT == MVT::x86mmx) {
+        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                  ValToCopy);
+          // If we don't have SSE2 available, convert to v4f32 so the generated
+          // register is legal.
+          if (!Subtarget.hasSSE2())
+            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
+        }
+      }
+    }
+
+    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+
+      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+                         Subtarget);
+
+      assert(2 == RegsToPass.size() &&
+             "Expecting two registers after Pass64BitArgInRegs");
+    } else {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+    }
+
+    // Add nodes to the DAG and add the values into the RetOps list
+    for (auto &Reg : RegsToPass) {
+      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+    }
+  }
+
+  // Swift calling convention does not require we copy the sret argument
+  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
+
+  // All x86 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
+  //
+  // Checking Function.hasStructRetAttr() here is insufficient because the IR
+  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+  // false, then an sret argument may be implicitly inserted in the SelDAG. In
+  // either case FuncInfo->setSRetReturnReg() will have been called.
+  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+    // When we have both sret and another return value, we should use the
+    // original Chain stored in RetOps[0], instead of the current Chain updated
+    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
+
+    // For the case of sret and another return value, we have
+    //   Chain_0 at the function entry
+    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
+    // If we use Chain_1 in getCopyFromReg, we will have
+    //   Val = getCopyFromReg(Chain_1)
+    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
+
+    // getCopyToReg(Chain_0) will be glued together with
+    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
+    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
+    //   Data dependency from Unit B to Unit A due to usage of Val in
+    //     getCopyToReg(Chain_1, Val)
+    //   Chain dependency from Unit A to Unit B
+
+    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
+    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
+                                     getPointerTy(MF.getDataLayout()));
+
+    unsigned RetValReg
+        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
+          X86::RAX : X86::EAX;
+    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+    Flag = Chain.getValue(1);
+
+    // RAX/EAX now acts like a return value.
+    RetOps.push_back(
+        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+  }
+
+  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (X86::GR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+  if (CallConv == CallingConv::X86_INTR)
+    opcode = X86ISD::IRET;
+  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != X86ISD::RET_FLAG)
+      return false;
+    // If we are returning more than one value, we can definitely
+    // not make a tail call see PR19530
+    if (UI->getNumOperands() > 4)
+      return false;
+    if (UI->getNumOperands() == 4 &&
+        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+                                           ISD::NodeType ExtendKind) const {
+  MVT ReturnMVT = MVT::i32;
+
+  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
+  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
+    // The ABI does not require i1, i8 or i16 to be extended.
+    //
+    // On Darwin, there is code in the wild relying on Clang's old behaviour of
+    // always extending i8/i16 return values, so keep doing that for now.
+    // (PR26665).
+    ReturnMVT = MVT::i8;
+  }
+
+  EVT MinVT = getRegisterType(Context, ReturnMVT);
+  return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// Reads two 32 bit registers and creates a 64 bit mask value.
+/// \param VA The current 32 bit value that need to be assigned.
+/// \param NextVA The next 32 bit value that need to be assigned.
+/// \param Root The parent DAG node.
+/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
+///                        glue purposes. In the case the DAG is already using
+///                        physical register instead of virtual, we should glue
+///                        our new SDValue to InFlag SDvalue.
+/// \return a new SDvalue of size 64bit.
+static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
+                                SDValue &Root, SelectionDAG &DAG,
+                                const SDLoc &Dl, const X86Subtarget &Subtarget,
+                                SDValue *InFlag = nullptr) {
+  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
+  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
+  assert(VA.getValVT() == MVT::v64i1 &&
+         "Expecting first location of 64 bit width type");
+  assert(NextVA.getValVT() == VA.getValVT() &&
+         "The locations should have the same type");
+  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+         "The values should reside in two registers");
+
+  SDValue Lo, Hi;
+  unsigned Reg;
+  SDValue ArgValueLo, ArgValueHi;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = &X86::GR32RegClass;
+
+  // Read a 32 bit value from the registers
+  if (nullptr == InFlag) {
+    // When no physical register is present,
+    // create an intermediate virtual register
+    Reg = MF.addLiveIn(VA.getLocReg(), RC);
+    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
+  } else {
+    // When a physical register is available read the value from it and glue
+    // the reads together.
+    ArgValueLo =
+      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = ArgValueLo.getValue(2);
+    ArgValueHi =
+      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
+    *InFlag = ArgValueHi.getValue(2);
+  }
+
+  // Convert the i32 type into v32i1 type
+  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
+
+  // Convert the i32 type into v32i1 type
+  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
+
+  // Concantenate the two values together
+  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
+}
+
+/// The function will lower a register of various sizes (8/16/32/64)
+/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
+/// \returns a DAG node contains the operand after lowering to mask type.
+static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
+                               const EVT &ValLoc, const SDLoc &Dl,
+                               SelectionDAG &DAG) {
+  SDValue ValReturned = ValArg;
+
+  if (ValVT == MVT::v64i1) {
+    // In 32 bit machine, this case is handled by getv64i1Argument
+    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
+    // In 64 bit machine, There is no need to truncate the value only bitcast
+  } else {
+    MVT maskLen;
+    switch (ValVT.getSimpleVT().SimpleTy) {
+    case MVT::v8i1:
+      maskLen = MVT::i8;
+      break;
+    case MVT::v16i1:
+      maskLen = MVT::i16;
+      break;
+    case MVT::v32i1:
+      maskLen = MVT::i32;
+      break;
+    default:
+      llvm_unreachable("Expecting a vector of i1 types");
+    }
+
+    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
+  }
+
+  return DAG.getBitcast(ValVT, ValReturned);
+}
+
+/// Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue X86TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool Is64Bit = Subtarget.is64Bit();
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
+       ++I, ++InsIndex) {
+    CCValAssign &VA = RVLocs[I];
+    EVT CopyVT = VA.getLocVT();
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
+        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    bool RoundAfterCopy = false;
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      if (!Subtarget.hasX87())
+        report_fatal_error("X87 register return with X87 disabled");
+      CopyVT = MVT::f80;
+      RoundAfterCopy = (CopyVT != VA.getLocVT());
+    }
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+      Val =
+          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
+                  .getValue(1);
+      Val = Chain.getValue(0);
+      InFlag = Chain.getValue(2);
+    }
+
+    if (RoundAfterCopy)
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1, dl));
+
+    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
+      if (VA.getValVT().isVector() &&
+          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
+      } else
+        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  StdCall calling convention seems to be standard for many Windows' API
+//  routines and around. It differs from C calling convention just a little:
+//  callee should clean up the stack, not caller. Symbols should be also
+//  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on fast calling convention see Fast Calling Convention (tail call)
+//  implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+enum StructReturnType {
+  NotStructReturn,
+  RegStructReturn,
+  StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
+  if (Outs.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg() || IsMCU)
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// Determines whether a function uses struct return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
+  if (Ins.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg() || IsMCU)
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// Make a copy of an aggregate at address specified by "Src" to address
+/// "Dst" with size and alignment information specified by the specific
+/// parameter attribute. The copy will be passed as a byval function parameter.
+static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
+                                         SDValue Chain, ISD::ArgFlagsTy Flags,
+                                         SelectionDAG &DAG, const SDLoc &dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
+
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile*/false, /*AlwaysInline=*/true,
+                       /*isTailCall*/false,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
+          CC == CallingConv::HHVM);
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  // C calling conventions:
+  case CallingConv::C:
+  case CallingConv::X86_64_Win64:
+  case CallingConv::X86_64_SysV:
+  // Callee pop conventions:
+  case CallingConv::X86_ThisCall:
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_VectorCall:
+  case CallingConv::X86_FastCall:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  auto Attr =
+      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
+  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+    return false;
+
+  CallSite CS(CI);
+  CallingConv::ID CalleeCC = CS.getCallingConv();
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  return true;
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    const SDLoc &dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo &MFI, unsigned i) const {
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  bool AlwaysUseMutable = shouldGuaranteeTCO(
+      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+  EVT ValVT;
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself. No need to extend if the mask value and location share the same
+  // absolute size.
+  bool ExtendedInMem =
+      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
+      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
+
+  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
+    ValVT = VA.getLocVT();
+  else
+    ValVT = VA.getValVT();
+
+  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
+  // taken by a return address.
+  int Offset = 0;
+  if (CallConv == CallingConv::X86_INTR) {
+    const X86Subtarget& Subtarget =
+        static_cast<const X86Subtarget&>(DAG.getSubtarget());
+    // X86 interrupts may take one or two arguments.
+    // On the stack there will be no return address as in regular call.
+    // Offset of last argument need to be set to -4/-8 bytes.
+    // Where offset of the first argument out of two, should be set to 0 bytes.
+    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+  }
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can be
+  // changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  if (Flags.isByVal()) {
+    unsigned Bytes = Flags.getByValSize();
+    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+    // Adjust SP offset of interrupt parameter.
+    if (CallConv == CallingConv::X86_INTR) {
+      MFI.setObjectOffset(FI, Offset);
+    }
+    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+  } else {
+    int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
+                                   VA.getLocMemOffset(), isImmutable);
+
+    // Set SExt or ZExt flag.
+    if (VA.getLocInfo() == CCValAssign::ZExt) {
+      MFI.setObjectZExt(FI, true);
+    } else if (VA.getLocInfo() == CCValAssign::SExt) {
+      MFI.setObjectSExt(FI, true);
+    }
+
+    // Adjust SP offset of interrupt parameter.
+    if (CallConv == CallingConv::X86_INTR) {
+      MFI.setObjectOffset(FI, Offset);
+    }
+
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+    SDValue Val = DAG.getLoad(
+        ValVT, dl, Chain, FIN,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+    return ExtendedInMem ?
+      DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+  }
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8,  X86::R9
+    };
+    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // in their paired GPR.  So we only need to save the GPR to their home
+    // slots.
+    // TODO: __vectorcall will change this.
+    return None;
+  }
+
+  const Function *Fn = MF.getFunction();
+  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
+  bool isSoftFloat = Subtarget.useSoftFloat();
+  assert(!(isSoftFloat && NoImplicitFloatOps) &&
+         "SSE register cannot be used when SSE is disabled!");
+  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
+    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+    // registers.
+    return None;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
+
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                          return A.getValNo() < B.getValNo();
+                        });
+}
+
+SDValue X86TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+
+  const Function *Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget.isTargetCygMing() &&
+      Fn->getName() == "main")
+    FuncInfo->setForceFramePointer(true);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool Is64Bit = Subtarget.is64Bit();
+  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
+
+  assert(
+      !(isVarArg && canGuaranteeTCO(CallConv)) &&
+      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
+
+  if (CallConv == CallingConv::X86_INTR) {
+    bool isLegal = Ins.size() == 1 ||
+                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
+    if (!isLegal)
+      report_fatal_error("X86 interrupts may take one or two arguments");
+  }
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64.
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  if (!isSortedByValueNo(ArgLocs))
+    llvm_unreachable("Argument Location list must be sorted before lowering");
+
+  SDValue ArgValue;
+  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
+       ++I, ++InsIndex) {
+    assert(InsIndex < Ins.size() && "Invalid Ins index");
+    CCValAssign &VA = ArgLocs[I];
+
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      if (VA.needsCustom()) {
+        assert(
+            VA.getValVT() == MVT::v64i1 &&
+            "Currently the only custom case is when we split v64i1 to 2 regs");
+
+        // v64i1 values, in regcall calling convention, that are
+        // compiled to 32 bit arch, are splited up into two registers.
+        ArgValue =
+            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
+      } else {
+        const TargetRegisterClass *RC;
+        if (RegVT == MVT::i32)
+          RC = &X86::GR32RegClass;
+        else if (Is64Bit && RegVT == MVT::i64)
+          RC = &X86::GR64RegClass;
+        else if (RegVT == MVT::f32)
+          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+        else if (RegVT == MVT::f64)
+          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+        else if (RegVT == MVT::f80)
+          RC = &X86::RFP80RegClass;
+        else if (RegVT == MVT::f128)
+          RC = &X86::FR128RegClass;
+        else if (RegVT.is512BitVector())
+          RC = &X86::VR512RegClass;
+        else if (RegVT.is256BitVector())
+          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+        else if (RegVT.is128BitVector())
+          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+        else if (RegVT == MVT::x86mmx)
+          RC = &X86::VR64RegClass;
+        else if (RegVT == MVT::i1)
+          RC = &X86::VK1RegClass;
+        else if (RegVT == MVT::v8i1)
+          RC = &X86::VK8RegClass;
+        else if (RegVT == MVT::v16i1)
+          RC = &X86::VK16RegClass;
+        else if (RegVT == MVT::v32i1)
+          RC = &X86::VK32RegClass;
+        else if (RegVT == MVT::v64i1)
+          RC = &X86::VK64RegClass;
+        else
+          llvm_unreachable("Unknown argument type!");
+
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      }
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::BCvt)
+        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
+
+      if (VA.isExtInLoc()) {
+        // Handle MMX values passed in XMM regs.
+        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
+          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+        else if (VA.getValVT().isVector() &&
+                 VA.getValVT().getScalarType() == MVT::i1 &&
+                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
+                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
+          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
+          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
+        } else
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+    } else {
+      assert(VA.isMemLoc());
+      ArgValue =
+          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
+    }
+
+    // If value is passed via pointer - do a load.
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      ArgValue =
+          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
+
+    InVals.push_back(ArgValue);
+  }
+
+  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
+    // Swift calling convention does not require we copy the sret argument
+    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
+    if (CallConv == CallingConv::Swift)
+      continue;
+
+    // All x86 ABIs require that for returning structs by value we copy the
+    // sret argument into %rax/%eax (depending on ABI) for the return. Save
+    // the argument into a virtual register so that we can access it from the
+    // return points.
+    if (Ins[I].Flags.isSRet()) {
+      unsigned Reg = FuncInfo->getSRetReturnReg();
+      if (!Reg) {
+        MVT PtrTy = getPointerTy(DAG.getDataLayout());
+        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+        FuncInfo->setSRetReturnReg(Reg);
+      }
+      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+      break;
+    }
+  }
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align stack specially for tail calls.
+  if (shouldGuaranteeTCO(CallConv,
+                         MF.getTarget().Options.GuaranteedTailCallOpt))
+    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start. We
+  // can skip this if there are no va_start calls.
+  if (MFI.hasVAStart() &&
+      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                   CallConv != CallingConv::X86_ThisCall))) {
+    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
+  }
+
+  // Figure out if XMM registers are in use.
+  assert(!(Subtarget.useSoftFloat() &&
+           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+         "SSE register cannot be used when SSE is disabled!");
+
+  // 64-bit calling conventions support varargs and register parameters, so we
+  // have to do extra work to spill them in the prologue.
+  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
+    // Find the first unallocated argument registers.
+    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+           "SSE register cannot be used when SSE is disabled!");
+
+    // Gather all the live in physical registers.
+    SmallVector<SDValue, 6> LiveGPRs;
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    SDValue ALVal;
+    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+      LiveGPRs.push_back(
+          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
+    }
+    if (!ArgXMMs.empty()) {
+      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
+      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+        LiveXMMRegs.push_back(
+            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
+      }
+    }
+
+    if (IsWin64) {
+      // Get to the caller-allocated home save location.  Add 8 to account
+      // for the return address.
+      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+      FuncInfo->setRegSaveFrameIndex(
+          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+      // Fixup to set vararg frame on shadow area (4 x i64).
+      if (NumIntRegs < 4)
+        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+    } else {
+      // For X86-64, if there are vararg parameters that are passed via
+      // registers, then we must store them to their spots on the stack so
+      // they may be loaded by dereferencing the result of va_next.
+      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
+          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+    }
+
+    // Store the integer parameter registers.
+    SmallVector<SDValue, 8> MemOps;
+    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                      getPointerTy(DAG.getDataLayout()));
+    unsigned Offset = FuncInfo->getVarArgsGPOffset();
+    for (SDValue Val : LiveGPRs) {
+      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(),
+                           FuncInfo->getRegSaveFrameIndex(), Offset));
+      MemOps.push_back(Store);
+      Offset += 8;
+    }
+
+    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+      // Now store the XMM (fp + vector) parameter registers.
+      SmallVector<SDValue, 12> SaveXMMOps;
+      SaveXMMOps.push_back(Chain);
+      SaveXMMOps.push_back(ALVal);
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                             FuncInfo->getRegSaveFrameIndex(), dl));
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                             FuncInfo->getVarArgsFPOffset(), dl));
+      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+                        LiveXMMRegs.end());
+      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+                                   MVT::Other, SaveXMMOps));
+    }
+
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  }
+
+  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
+    // Find the largest legal vector type.
+    MVT VecVT = MVT::Other;
+    // FIXME: Only some x86_32 calling conventions support AVX512.
+    if (Subtarget.hasAVX512() &&
+        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+                     CallConv == CallingConv::Intel_OCL_BI)))
+      VecVT = MVT::v16f32;
+    else if (Subtarget.hasAVX())
+      VecVT = MVT::v8f32;
+    else if (Subtarget.hasSSE2())
+      VecVT = MVT::v4f32;
+
+    // We forward some GPRs and some vector types.
+    SmallVector<MVT, 2> RegParmTypes;
+    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+    RegParmTypes.push_back(IntVT);
+    if (VecVT != MVT::Other)
+      RegParmTypes.push_back(VecVT);
+
+    // Compute the set of forwarded registers. The rest are scratch.
+    SmallVectorImpl<ForwardedRegister> &Forwards =
+        FuncInfo->getForwardedMustTailRegParms();
+    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+    // Conservatively forward AL on x86_64, since it might be used for varargs.
+    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+    }
+
+    // Copy all forwards from physical to virtual registers.
+    for (ForwardedRegister &F : Forwards) {
+      // FIXME: Can we use a less constrained schedule?
+      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
+    }
+  }
+
+  // Some CCs need callee pop.
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+    // X86 interrupts must pop the error code if present
+    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+  } else {
+    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+        !Subtarget.getTargetTriple().isOSMSVCRT() &&
+        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
+      FuncInfo->setBytesToPopOnReturn(4);
+  }
+
+  if (!Is64Bit) {
+    // RegSaveFrameIndex is X86-64 only.
+    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+    if (CallConv == CallingConv::X86_FastCall ||
+        CallConv == CallingConv::X86_ThisCall)
+      // fastcc functions can't have varargs.
+      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+  }
+
+  FuncInfo->setArgumentStackSize(StackSize);
+
+  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
+    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+    if (Personality == EHPersonality::CoreCLR) {
+      assert(Is64Bit);
+      // TODO: Add a mechanism to frame lowering that will allow us to indicate
+      // that we'd prefer this slot be allocated towards the bottom of the frame
+      // (i.e. near the stack pointer after allocating the frame).  Every
+      // funclet needs a copy of this slot in its (mostly empty) frame, and the
+      // offset from the bottom of this and each funclet's frame must be the
+      // same, so the size of funclets' (mostly empty) frames is dictated by
+      // how far this slot is from the bottom (since they allocate just enough
+      // space to accommodate holding this slot at the correct offset).
+      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
+      EHInfo->PSPSymFrameIdx = PSPSymFI;
+    }
+  }
+
+  return Chain;
+}
+
+SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+                                            SDValue Arg, const SDLoc &dl,
+                                            SelectionDAG &DAG,
+                                            const CCValAssign &VA,
+                                            ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                       StackPtr, PtrOff);
+  if (Flags.isByVal())
+    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+  return DAG.getStore(
+      Chain, dl, Arg, PtrOff,
+      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
+}
+
+/// Emit a load of return address if tail call
+/// optimization is performed and it is required.
+SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
+    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
+    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
+  // Adjust the Return address stack slot.
+  EVT VT = getPointerTy(DAG.getDataLayout());
+  OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+  // Load the "old" Return address.
+  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
+  return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, const SDLoc &dl) {
+  // Store the return address to the appropriate stack slot.
+  if (!FPDiff) return Chain;
+  // Calculate the new stack slot for the return address.
+  int NewReturnAddrFI =
+    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+                                         false);
+  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+                       MachinePointerInfo::getFixedStack(
+                           DAG.getMachineFunction(), NewReturnAddrFI));
+  return Chain;
+}
+
+/// Returns a vector_shuffle mask for an movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
+                       SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool &isTailCall                      = CLI.IsTailCall;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool Is64Bit        = Subtarget.is64Bit();
+  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
+  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
+  bool IsSibcall      = false;
+  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+
+  if (CallConv == CallingConv::X86_INTR)
+    report_fatal_error("X86 interrupts may not be called directly");
+
+  if (Attr.getValueAsString() == "true")
+    isTailCall = false;
+
+  if (Subtarget.isPICStyleGOT() &&
+      !MF.getTarget().Options.GuaranteedTailCallOpt) {
+    // If we are using a GOT, disable tail calls to external symbols with
+    // default visibility. Tail calling such a symbol requires using a GOT
+    // relocation, which forces early binding of the symbol. This breaks code
+    // that require lazy function symbol resolution. Using musttail or
+    // GuaranteedTailCallOpt will override this.
+    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+               G->getGlobal()->hasDefaultVisibility()))
+      isTailCall = false;
+  }
+
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call.  The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, SR != NotStructReturn,
+                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+                    Outs, OutVals, Ins, DAG);
+
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
+  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64.
+  if (IsWin64)
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
+  if (IsSibcall)
+    // This is a sibcall. The memory operands are available in caller's
+    // own caller's stack.
+    NumBytes = 0;
+  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+           canGuaranteeTCO(CallConv))
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+  int FPDiff = 0;
+  if (isTailCall && !IsSibcall && !IsMustTail) {
+    // Lower arguments at fp - stackoffset + fpdiff.
+    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+    FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only set if delta is greater than previous delta.
+    if (FPDiff < X86Info->getTCReturnAddrDelta())
+      X86Info->setTCReturnAddrDelta(FPDiff);
+  }
+
+  unsigned NumBytesToPush = NumBytes;
+  unsigned NumBytesToPop = NumBytes;
+
+  // If we have an inalloca argument, all stack space has already been allocated
+  // for us and be right at the top of the stack.  We don't support multiple
+  // arguments passed in memory when using inalloca.
+  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
+    NumBytesToPush = 0;
+    if (!ArgLocs.back().isMemLoc())
+      report_fatal_error("cannot use inalloca attribute on a register "
+                         "parameter");
+    if (ArgLocs.back().getLocMemOffset() != 0)
+      report_fatal_error("any parameter with the inalloca attribute must be "
+                         "the only memory argument");
+  }
+
+  if (!IsSibcall)
+    Chain = DAG.getCALLSEQ_START(
+        Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+
+  SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+
+  // The next loop assumes that the locations are in the same order of the
+  // input arguments.
+  if (!isSortedByValueNo(ArgLocs))
+    llvm_unreachable("Argument Location list must be sorted before lowering");
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization arguments are handle later.
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
+       ++I, ++OutIndex) {
+    assert(OutIndex < Outs.size() && "Invalid Out index");
+    // Skip inalloca arguments, they have already been written.
+    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
+    if (Flags.isInAlloca())
+      continue;
+
+    CCValAssign &VA = ArgLocs[I];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[OutIndex];
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      if (Arg.getValueType().isVector() &&
+          Arg.getValueType().getVectorElementType() == MVT::i1)
+        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
+      else if (RegVT.is128BitVector()) {
+        // Special case: passing MMX values in XMM registers.
+        Arg = DAG.getBitcast(MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+      } else
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getBitcast(RegVT, Arg);
+      break;
+    case CCValAssign::Indirect: {
+      // Store the argument.
+      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Chain = DAG.getStore(
+          Chain, dl, Arg, SpillSlot,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      Arg = SpillSlot;
+      break;
+    }
+    }
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::v64i1 &&
+             "Currently the only custom case is when we split v64i1 to 2 regs");
+      // Split v64i1 value into two registers
+      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
+                         Subtarget);
+    } else if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      if (isVarArg && IsWin64) {
+        // Win64 ABI requires argument XMM reg to be copied to the corresponding
+        // shadow reg if callee is a varargs function.
+        unsigned ShadowReg = 0;
+        switch (VA.getLocReg()) {
+        case X86::XMM0: ShadowReg = X86::RCX; break;
+        case X86::XMM1: ShadowReg = X86::RDX; break;
+        case X86::XMM2: ShadowReg = X86::R8; break;
+        case X86::XMM3: ShadowReg = X86::R9; break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+      }
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      assert(VA.isMemLoc());
+      if (!StackPtr.getNode())
+        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                      getPointerTy(DAG.getDataLayout()));
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  if (Subtarget.isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer.
+    if (!isTailCall) {
+      RegsToPass.push_back(std::make_pair(
+          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                                          getPointerTy(DAG.getDataLayout()))));
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+      if (G && !G->getGlobal()->hasLocalLinkage() &&
+          G->getGlobal()->hasDefaultVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
+  }
+
+  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an ubound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const MCPhysReg XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
+    assert((Subtarget.hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+
+    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+                                        DAG.getConstant(NumXMMRegs, dl,
+                                                        MVT::i8)));
+  }
+
+  if (isVarArg && IsMustTail) {
+    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+    SmallVector<SDValue, 8> MemOpChains2;
+    SDValue FIN;
+    int FI = 0;
+    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
+         ++I, ++OutsIndex) {
+      CCValAssign &VA = ArgLocs[I];
+
+      if (VA.isRegLoc()) {
+        if (VA.needsCustom()) {
+          assert((CallConv == CallingConv::X86_RegCall) &&
+                 "Expecting custome case only in regcall calling convention");
+          // This means that we are in special case where one argument was
+          // passed through two register locations - Skip the next location
+          ++I;
+        }
+
+        continue;
+      }
+
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[OutsIndex];
+      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
+      // Skip inalloca arguments.  They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                        getPointerTy(DAG.getDataLayout()));
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                             StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(DAG.getStore(
+            ArgChain, dl, Arg, FIN,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+      }
+    }
+
+    if (!MemOpChains2.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
+
+    // Store the return address to the appropriate stack slot.
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+                                     getPointerTy(DAG.getDataLayout()),
+                                     RegInfo->getSlotSize(), FPDiff, dl);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
+    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+    // In the 64-bit large code model, we have to make all calls
+    // through a register, since the call instruction's 32-bit
+    // pc-relative offset may not be large enough to hold the whole
+    // address.
+  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
+    // If the callee is a GlobalAddress node (quite common, every direct call
+    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+    // it.
+    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
+
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    const GlobalValue *GV = G->getGlobal();
+    if (!GV->hasDLLImportStorageClass()) {
+      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
+
+      Callee = DAG.getTargetGlobalAddress(
+          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
+
+      if (OpFlags == X86II::MO_GOTPCREL) {
+        // Add a wrapper.
+        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
+          getPointerTy(DAG.getDataLayout()), Callee);
+        // Add extra indirection
+        Callee = DAG.getLoad(
+            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
+            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+      }
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+    unsigned char OpFlags =
+        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
+
+    Callee = DAG.getTargetExternalSymbol(
+        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
+  } else if (Subtarget.isTarget64BitILP32() &&
+             Callee->getValueType(0) == MVT::i32) {
+    // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
+    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+
+  if (!IsSibcall && isTailCall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+    InFlag = Chain.getValue(1);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+
+  // If this is an invoke in a 32-bit function using a funclet-based
+  // personality, assume the function clobbers all registers. If an exception
+  // is thrown, the runtime will not restore CSRs.
+  // FIXME: Model this more precisely so that we can register allocate across
+  // the normal edge and spill and fill across the exceptional edge.
+  if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
+    const Function *CallerFn = MF.getFunction();
+    EHPersonality Pers =
+        CallerFn->hasPersonalityFn()
+            ? classifyEHPersonality(CallerFn->getPersonalityFn())
+            : EHPersonality::Unknown;
+    if (isFuncletEHPersonality(Pers))
+      Mask = RegInfo->getNoPreservedMask();
+  }
+
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  if (isTailCall) {
+    // We used to do:
+    //// If this is the first return lowered for this function, add the regs
+    //// to the liveout set for the function.
+    // This isn't right, although it's probably harmless on x86; liveouts
+    // should be computed from returns not tail calls.  Consider a void
+    // function making a tail call to a function returning int.
+    MF.getFrameInfo().setHasTailCall();
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
+  }
+
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPop;
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       DAG.getTarget().Options.GuaranteedTailCallOpt))
+    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
+           !Subtarget.getTargetTriple().isOSMSVCRT() &&
+           SR == StackStructReturn)
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+    NumBytesForCalleeToPop = 4;
+  else
+    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
+
+  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
+    // No need to reset the stack after the call if the call doesn't return. To
+    // make the MI verify, we'll pretend the callee does it for us.
+    NumBytesForCalleeToPop = NumBytes;
+  }
+
+  // Returns a flag for retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
+                                                     true),
+                               InFlag, dl);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like std call, callee cleans arguments, convention except that ECX is
+//  reserved for storing the tail called function address. Only 2 registers are
+//  free for argument passing (inreg). Tail call optimization is performed
+//  provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//  On X86_64 architecture with GOT-style position independent code only local
+//  (within module) calls are supported at the moment.
+//  To keep the stack aligned according to platform abi the function
+//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
+//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+//  If a tail called function callee has more arguments than the caller the
+//  caller needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after the
+//  original RETADDR, but before the saved framepointer or the spilled registers
+//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
+/// requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                               SelectionDAG& DAG) const {
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  uint64_t AlignMask = StackAlignment - 1;
+  int64_t Offset = StackSize;
+  unsigned SlotSize = RegInfo->getSlotSize();
+  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+    // Number smaller than 12 so just add the difference.
+    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+  } else {
+    // Mask out lower bits, add stackalignment once plus the 12 bytes.
+    Offset = ((~AlignMask) & Offset) + StackAlignment +
+      (StackAlignment-SlotSize);
+  }
+  return Offset;
+}
+
+/// Return true if the given stack call argument is already available in the
+/// same position (relatively) of the caller's incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII, const CCValAssign &VA) {
+  unsigned Bytes = Arg.getValueSizeInBits() / 8;
+
+  for (;;) {
+    // Look through nodes that don't alter the bits of the incoming value.
+    unsigned Op = Arg.getOpcode();
+    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
+      Arg = Arg.getOperand(0);
+      continue;
+    }
+    if (Op == ISD::TRUNCATE) {
+      const SDValue &TruncInput = Arg.getOperand(0);
+      if (TruncInput.getOpcode() == ISD::AssertZext &&
+          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
+              Arg.getValueType()) {
+        Arg = TruncInput.getOperand(0);
+        continue;
+      }
+    }
+    break;
+  }
+
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(*Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+           Opcode == X86::LEA64_32r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+    FI = FINode->getIndex();
+    Bytes = Flags.getByValSize();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI.isFixedObjectIndex(FI))
+    return false;
+
+  if (Offset != MFI.getObjectOffset(FI))
+    return false;
+
+  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
+    // If the argument location is wider than the argument type, check that any
+    // extension flags match.
+    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
+        Flags.isSExt() != MFI.isObjectSExt(FI)) {
+      return false;
+    }
+  }
+
+  return Bytes == MFI.getObjectSize(FI);
+}
+
+/// Check whether the call is eligible for tail call optimization. Targets
+/// that want to do tail call optimization should implement this function.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  // If -tailcallopt is specified, make fastcc functions tail-callable.
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+
+  // If the function return type is x86_fp80 and the callee return type is not,
+  // then the FP_EXTEND of the call result is not a nop. It's not safe to
+  // perform a tailcall optimization here.
+  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+    return false;
+
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
+  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+
+  // Win64 functions have extra shadow space for argument homing. Don't do the
+  // sibcall if the caller and callee have mismatched expectations for this
+  // space.
+  if (IsCalleeWin64 != IsCallerWin64)
+    return false;
+
+  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+    if (canGuaranteeTCO(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+  // emit a special epilogue.
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // Do not sibcall optimize vararg calls unless all arguments are passed via
+  // registers.
+  LLVMContext &C = *DAG.getContext();
+  if (isVarArg && !Outs.empty()) {
+    // Optimizing for varargs on Win64 is unlikely to be safe without
+    // additional testing.
+    if (IsCalleeWin64 || IsCallerWin64)
+      return false;
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack.  Therefore, if it's not used by the call it is not safe to optimize
+  // this into a sibcall.
+  bool Unused = false;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (!Ins[i].Used) {
+      Unused = true;
+      break;
+    }
+  }
+  if (Unused) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
+    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+      CCValAssign &VA = RVLocs[i];
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
+        return false;
+    }
+  }
+
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
+                                  RetCC_X86, RetCC_X86))
+    return false;
+  // The callee has to preserve all registers the caller needs to preserve.
+  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+  if (!CCMatch) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  unsigned StackArgsSize = 0;
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
+
+    // Allocate shadow area for Win64
+    if (IsCalleeWin64)
+      CCInfo.AllocateStack(32, 8);
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    StackArgsSize = CCInfo.getNextStackOffset();
+
+    if (CCInfo.getNextStackOffset()) {
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const X86InstrInfo *TII = Subtarget.getInstrInfo();
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        SDValue Arg = OutVals[i];
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII, VA))
+            return false;
+        }
+      }
+    }
+
+    bool PositionIndependent = isPositionIndependent();
+    // If the tailcall address may be in a register, then make sure it's
+    // possible to register allocate for it. In 32-bit, the call address can
+    // only target EAX, EDX, or ECX since the tail call must be scheduled after
+    // callee-saved registers are restored. These happen to be the same
+    // registers used to pass 'inreg' arguments so watch out for those.
+    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
+                                  !isa<ExternalSymbolSDNode>(Callee)) ||
+                                 PositionIndependent)) {
+      unsigned NumInRegs = 0;
+      // In PIC we need an extra register to formulate the address computation
+      // for the callee.
+      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
+
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        if (!VA.isRegLoc())
+          continue;
+        unsigned Reg = VA.getLocReg();
+        switch (Reg) {
+        default: break;
+        case X86::EAX: case X86::EDX: case X86::ECX:
+          if (++NumInRegs == MaxInRegs)
+            return false;
+          break;
+        }
+      }
+    }
+
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
+      return false;
+  }
+
+  bool CalleeWillPop =
+      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt);
+
+  if (unsigned BytesToPop =
+          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+    // If we have bytes to pop, the callee must pop them.
+    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+    if (!CalleePopMatches)
+      return false;
+  } else if (CalleeWillPop && StackArgsSize > 0) {
+    // If we don't have bytes to pop, make sure the callee doesn't pop any.
+    return false;
+  }
+
+  return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool MayFoldIntoZeroExtend(SDValue Op) {
+  if (Op.hasOneUse()) {
+    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
+    return (ISD::ZERO_EXTEND == Opcode);
+  }
+  return false;
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+  switch(Opcode) {
+  default: return false;
+  case X86ISD::BLENDI:
+  case X86ISD::PSHUFB:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::SHUFP:
+  case X86ISD::INSERTPS:
+  case X86ISD::PALIGNR:
+  case X86ISD::VSHLDQ:
+  case X86ISD::VSRLDQ:
+  case X86ISD::MOVLHPS:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+  case X86ISD::VBROADCAST:
+  case X86ISD::VPERMILPI:
+  case X86ISD::VPERMILPV:
+  case X86ISD::VPERM2X128:
+  case X86ISD::VPERMIL2:
+  case X86ISD::VPERMI:
+  case X86ISD::VPPERM:
+  case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:
+  case X86ISD::VPERMIV3:
+  case X86ISD::VZEXT_MOVL:
+    return true;
+  }
+}
+
+static bool isTargetShuffleVariableMask(unsigned Opcode) {
+  switch (Opcode) {
+  default: return false;
+  // Target Shuffles.
+  case X86ISD::PSHUFB:
+  case X86ISD::VPERMILPV:
+  case X86ISD::VPERMIL2:
+  case X86ISD::VPPERM:
+  case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:
+  case X86ISD::VPERMIV3:
+    return true;
+  // 'Faux' Target Shuffles.
+  case ISD::AND:
+    return true;
+  }
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    unsigned SlotSize = RegInfo->getSlotSize();
+    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
+                                                          -(int64_t)SlotSize,
+                                                          false);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                       bool hasSymbolicDisplacement) {
+  // Offset should fit into 32 bit immediate field.
+  if (!isInt<32>(Offset))
+    return false;
+
+  // If we don't have a symbolic displacement - we don't have any extra
+  // restrictions.
+  if (!hasSymbolicDisplacement)
+    return true;
+
+  // FIXME: Some tweaks might be needed for medium code model.
+  if (M != CodeModel::Small && M != CodeModel::Kernel)
+    return false;
+
+  // For small code model we assume that latest object is 16MB before end of 31
+  // bits boundary. We may also accept pretty large negative constants knowing
+  // that all objects are in the positive half of address space.
+  if (M == CodeModel::Small && Offset < 16*1024*1024)
+    return true;
+
+  // For kernel code model we know that all object resist in the negative half
+  // of 32bits address space. We may not accept negative offsets, since they may
+  // be just off and we may accept pretty large positive ones.
+  if (M == CodeModel::Kernel && Offset >= 0)
+    return true;
+
+  return false;
+}
+
+/// Determines whether the callee is required to pop its own arguments.
+/// Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
+  // can guarantee TCO.
+  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+    return true;
+
+  switch (CallingConv) {
+  default:
+    return false;
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_ThisCall:
+  case CallingConv::X86_VectorCall:
+    return !is64Bit;
+  }
+}
+
+/// \brief Return true if the condition is an unsigned comparison operation.
+static bool isX86CCUnsigned(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    llvm_unreachable("Invalid integer condition!");
+  case X86::COND_E:
+  case X86::COND_NE:
+  case X86::COND_B:
+  case X86::COND_A:
+  case X86::COND_BE:
+  case X86::COND_AE:
+    return true;
+  case X86::COND_G:
+  case X86::COND_GE:
+  case X86::COND_L:
+  case X86::COND_LE:
+    return false;
+  }
+}
+
+static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:  return X86::COND_E;
+  case ISD::SETGT:  return X86::COND_G;
+  case ISD::SETGE:  return X86::COND_GE;
+  case ISD::SETLT:  return X86::COND_L;
+  case ISD::SETLE:  return X86::COND_LE;
+  case ISD::SETNE:  return X86::COND_NE;
+  case ISD::SETULT: return X86::COND_B;
+  case ISD::SETUGT: return X86::COND_A;
+  case ISD::SETULE: return X86::COND_BE;
+  case ISD::SETUGE: return X86::COND_AE;
+  }
+}
+
+/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
+/// condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
+                               bool isFP, SDValue &LHS, SDValue &RHS,
+                               SelectionDAG &DAG) {
+  if (!isFP) {
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1   -> X == 0, jump !sign.
+        RHS = DAG.getConstant(0, DL, RHS.getValueType());
+        return X86::COND_NS;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0   -> X == 0, jump on sign.
+        return X86::COND_S;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+        // X < 1   -> X <= 0
+        RHS = DAG.getConstant(0, DL, RHS.getValueType());
+        return X86::COND_LE;
+      }
+    }
+
+    return TranslateIntegerX86CC(SetCCOpcode);
+  }
+
+  // First determine if it is required or is profitable to flip the operands.
+
+  // If LHS is a foldable load, but RHS is not, flip the condition.
+  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+      !ISD::isNON_EXTLoad(RHS.getNode())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    std::swap(LHS, RHS);
+    break;
+  }
+
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Condcode should be pre-legalized away");
+  case ISD::SETUEQ:
+  case ISD::SETEQ:   return X86::COND_E;
+  case ISD::SETOLT:              // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:   return X86::COND_A;
+  case ISD::SETOLE:              // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:   return X86::COND_AE;
+  case ISD::SETUGT:              // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:   return X86::COND_B;
+  case ISD::SETUGE:              // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:   return X86::COND_BE;
+  case ISD::SETONE:
+  case ISD::SETNE:   return X86::COND_NE;
+  case ISD::SETUO:   return X86::COND_P;
+  case ISD::SETO:    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:  return X86::COND_INVALID;
+  }
+}
+
+/// Is there a floating point cmov for the specific X86 condition code?
+/// Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+
+bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  if (!IntrData)
+    return false;
+
+  Info.opc = ISD::INTRINSIC_W_CHAIN;
+  Info.readMem = false;
+  Info.writeMem = false;
+  Info.vol = false;
+  Info.offset = 0;
+
+  switch (IntrData->Type) {
+  case EXPAND_FROM_MEM: {
+    Info.ptrVal = I.getArgOperand(0);
+    Info.memVT = MVT::getVT(I.getType());
+    Info.align = 1;
+    Info.readMem = true;
+    break;
+  }
+  case COMPRESS_TO_MEM: {
+    Info.ptrVal = I.getArgOperand(0);
+    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
+    Info.align = 1;
+    Info.writeMem = true;
+    break;
+  }
+  case TRUNCATE_TO_MEM_VI8:
+  case TRUNCATE_TO_MEM_VI16:
+  case TRUNCATE_TO_MEM_VI32: {
+    Info.ptrVal = I.getArgOperand(0);
+    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
+    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
+      ScalarVT = MVT::i8;
+    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
+      ScalarVT = MVT::i16;
+    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
+      ScalarVT = MVT::i32;
+
+    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
+    Info.align = 1;
+    Info.writeMem = true;
+    break;
+  }
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+/// Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+      return true;
+  }
+  return false;
+}
+
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+                                              ISD::LoadExtType ExtTy,
+                                              EVT NewVT) const {
+  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+  // relocation target a movq or addq instruction: don't let the load shrink.
+  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+  return true;
+}
+
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0 || BitSize > 64)
+    return false;
+  return true;
+}
+
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+                                                unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+  // Speculate cttz only if we can directly use TZCNT.
+  return Subtarget.hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+  // Speculate ctlz only if we can directly use LZCNT.
+  return Subtarget.hasLZCNT();
+}
+
+bool X86TargetLowering::isCtlzFast() const {
+  return Subtarget.hasFastLZCNT();
+}
+
+bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
+  if (!Subtarget.hasBMI())
+    return false;
+
+  // There are only 32-bit and 64-bit forms for 'andn'.
+  EVT VT = Y.getValueType();
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return false;
+
+  return true;
+}
+
+/// Val is the undef sentinel value or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
+}
+
+/// Val is either the undef or zero sentinel value.
+static bool isUndefOrZero(int Val) {
+  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is the undef sentinel value.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (Mask[i] != SM_SentinelUndef)
+      return false;
+  return true;
+}
+
+/// Return true if Val is undef or if its value falls within the
+/// specified range (L, H].
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
+}
+
+/// Return true if every element in Mask is undef or if its value
+/// falls within the specified range (L, H].
+static bool isUndefOrInRange(ArrayRef<int> Mask,
+                             int Low, int Hi) {
+  for (int M : Mask)
+    if (!isUndefOrInRange(M, Low, Hi))
+      return false;
+  return true;
+}
+
+/// Return true if Val is undef, zero or if its value falls within the
+/// specified range (L, H].
+static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
+  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+}
+
+/// Return true if every element in Mask is undef, zero or if its value
+/// falls within the specified range (L, H].
+static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+  for (int M : Mask)
+    if (!isUndefOrZeroOrInRange(M, Low, Hi))
+      return false;
+  return true;
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (Low, Low+Size]. or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (Low, Low+Size], or is undef or is zero.
+static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+                                             unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
+    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
+      return false;
+  return true;
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef or is zero.
+static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
+                                 unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (!isUndefOrZero(Mask[i]))
+      return false;
+  return true;
+}
+
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  WidenedMask.assign(Mask.size() / 2, 0);
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // If both elements are undef, its trivial.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+      WidenedMask[i / 2] = SM_SentinelUndef;
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
+        Mask[i + 1] % 2 == 1) {
+      WidenedMask[i / 2] = Mask[i + 1] / 2;
+      continue;
+    }
+    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+      WidenedMask[i / 2] = Mask[i] / 2;
+      continue;
+    }
+
+    // When zeroing, we need to spread the zeroing across both lanes to widen.
+    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+        WidenedMask[i / 2] = SM_SentinelZero;
+        continue;
+      }
+      return false;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
+        Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask[i / 2] = Mask[i] / 2;
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
+
+  return true;
+}
+
+/// Helper function to scale a shuffle or target shuffle mask, replacing each
+/// mask index with the scaled sequential indices for an equivalent narrowed
+/// mask. This is the reverse process to canWidenShuffleElements, but can always
+/// succeed.
+static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
+                             SmallVectorImpl<int> &ScaledMask) {
+  assert(0 < Scale && "Unexpected scaling factor");
+  int NumElts = Mask.size();
+  ScaledMask.assign(NumElts * Scale, -1);
+
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+
+    // Repeat sentinel values in every mask element.
+    if (M < 0) {
+      for (int s = 0; s != Scale; ++s)
+        ScaledMask[(Scale * i) + s] = M;
+      continue;
+    }
+
+    // Scale mask element and increment across each mask element.
+    for (int s = 0; s != Scale; ++s)
+      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+  }
+}
+
+/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
+/// extract that is suitable for instruction that extract 128 or 256 bit vectors
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    return false;
+
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getScalarSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+/// Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// insertion of 128 or 256-bit subvectors
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    return false;
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getScalarSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+bool X86::isVINSERT128Index(SDNode *N) {
+  return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+  return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 256);
+}
+
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
+         "Illegal extract subvector for VEXTRACT");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VecVT = N->getOperand(0).getSimpleValueType();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
+  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
+         "Illegal insert subvector for VINSERT");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VecVT = N->getSimpleValueType(0);
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 128);
+}
+
+/// Return the appropriate immediate to extract the specified
+/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 256);
+}
+
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 128);
+}
+
+/// Return the appropriate immediate to insert at the specified
+/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 256);
+}
+
+/// Returns true if Elt is a constant zero or a floating point constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+  return isNullConstant(Elt) || isNullFPConstant(Elt);
+}
+
+// Build a vector of constants
+// Use an UNDEF node if MaskElt == -1.
+// Spilt 64-bit constants in the 32-bit mode.
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
+                              const SDLoc &dl, bool IsMask = false) {
+
+  SmallVector<SDValue, 32>  Ops;
+  bool Split = false;
+
+  MVT ConstVecVT = VT;
+  unsigned NumElts = VT.getVectorNumElements();
+  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+    Split = true;
+  }
+
+  MVT EltVT = ConstVecVT.getVectorElementType();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    bool IsUndef = Values[i] < 0 && IsMask;
+    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
+      DAG.getConstant(Values[i], dl, EltVT);
+    Ops.push_back(OpNode);
+    if (Split)
+      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
+                    DAG.getConstant(0, dl, EltVT));
+  }
+  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+  if (Split)
+    ConstsNode = DAG.getBitcast(VT, ConstsNode);
+  return ConstsNode;
+}
+
+static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+  SmallVector<SDValue, 32> Ops;
+  bool Split = false;
+
+  MVT ConstVecVT = VT;
+  unsigned NumElts = VT.getVectorNumElements();
+  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
+    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+    Split = true;
+  }
+
+  MVT EltVT = ConstVecVT.getVectorElementType();
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+    if (Undefs[i]) {
+      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
+      continue;
+    }
+    const APInt &V = Bits[i];
+    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
+    if (Split) {
+      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
+      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
+    } else if (EltVT == MVT::f32) {
+      APFloat FV(APFloat::IEEEsingle(), V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+    } else if (EltVT == MVT::f64) {
+      APFloat FV(APFloat::IEEEdouble(), V);
+      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
+    } else {
+      Ops.push_back(DAG.getConstant(V, dl, EltVT));
+    }
+  }
+
+  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
+  return DAG.getBitcast(VT, ConstsNode);
+}
+
+/// Returns a vector of specified type with all zero elements.
+static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG, const SDLoc &dl) {
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
+          VT.getVectorElementType() == MVT::i1) &&
+         "Unexpected vector type");
+
+  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
+  // type. This ensures they get CSE'd. But if the integer type is not
+  // available, use a floating-point +0.0 instead.
+  SDValue Vec;
+  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
+    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
+  } else if (VT.getVectorElementType() == MVT::i1) {
+    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
+           "Unexpected vector type");
+    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
+           "Unexpected vector type");
+    Vec = DAG.getConstant(0, dl, VT);
+  } else {
+    unsigned Num32BitElts = VT.getSizeInBits() / 32;
+    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
+  }
+  return DAG.getBitcast(VT, Vec);
+}
+
+static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
+                                const SDLoc &dl, unsigned vectorWidth) {
+  EVT VT = Vec.getValueType();
+  EVT ElVT = VT.getVectorElementType();
+  unsigned Factor = VT.getSizeInBits()/vectorWidth;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
+
+  // Extract from UNDEF is UNDEF.
+  if (Vec.isUndef())
+    return DAG.getUNDEF(ResultVT);
+
+  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
+  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+  IdxVal &= ~(ElemsPerChunk - 1);
+
+  // If the input is a buildvector just emit a smaller one.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+
+  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, const SDLoc &dl) {
+  assert((Vec.getValueType().is256BitVector() ||
+          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                               SelectionDAG &DAG, const SDLoc &dl,
+                               unsigned vectorWidth) {
+  assert((vectorWidth == 128 || vectorWidth == 256) &&
+         "Unsupported vector width");
+  // Inserting UNDEF is Result
+  if (Vec.isUndef())
+    return Result;
+  EVT VT = Vec.getValueType();
+  EVT ElVT = VT.getVectorElementType();
+  EVT ResultVT = Result.getValueType();
+
+  // Insert the relevant vectorWidth bits.
+  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+  // This is the index of the first element of the vectorWidth-bit chunk
+  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+  IdxVal &= ~(ElemsPerChunk - 1);
+
+  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference.  Idx is an index in the 128 bits
+/// we want.  It need not be aligned to a 128-bit boundary.  That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+      !Result.isUndef()) {
+    EVT ResultVT = Result.getValueType();
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+    SDValue Undef = DAG.getUNDEF(ResultVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                                 Vec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depend on the data type.
+    MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+    }
+
+    const X86Subtarget &Subtarget =
+    static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used for this because it has a handicapped mask.
+
+    // If we don't have AVX2, then cast to float. Using a wrong domain blend
+    // is still more efficient than using the wrong domain vinsertf128 that
+    // will be created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+    Result = DAG.getBitcast(CastVT, Result);
+    Vec256 = DAG.getBitcast(CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+    return DAG.getBitcast(ResultVT, Vec256);
+  }
+
+  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, const SDLoc &dl) {
+  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Insert i1-subvector to i1-vector.
+static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
+    return Op;
+
+  MVT OpVT = Op.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+  unsigned NumElems = OpVT.getVectorNumElements();
+  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
+
+  assert(IdxVal + SubVecNumElems <= NumElems &&
+         IdxVal % SubVecVT.getSizeInBits() == 0 &&
+         "Unexpected index value in INSERT_SUBVECTOR");
+
+  // There are 3 possible cases:
+  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
+  // 2. Subvector should be inserted in the upper part
+  //    (IdxVal + SubVecNumElems == NumElems)
+  // 3. Subvector should be inserted in the middle (for example v2i1
+  //    to v16i1, index 2)
+
+  // extend to natively supported kshift
+  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+  MVT WideOpVT = OpVT;
+  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
+    WideOpVT = MinVT;
+
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+  SDValue Undef = DAG.getUNDEF(WideOpVT);
+  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+                                   Undef, SubVec, ZeroIdx);
+
+  // Extract sub-vector if require.
+  auto ExtractSubVec = [&](SDValue V) {
+    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+                                                OpVT, V, ZeroIdx);
+  };
+
+  if (Vec.isUndef()) {
+    if (IdxVal != 0) {
+      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
+      WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+    }
+    return ExtractSubVec(WideSubVec);
+  }
+
+  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+    NumElems = WideOpVT.getVectorNumElements();
+    unsigned ShiftLeft = NumElems - SubVecNumElems;
+    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+                             DAG.getConstant(ShiftLeft, dl, MVT::i8));
+    Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+      DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
+    return ExtractSubVec(Vec);
+  }
+
+  if (IdxVal == 0) {
+    // Zero lower bits of the Vec
+    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+    // Merge them together, SubVec should be zero extended.
+    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
+                             SubVec, ZeroIdx);
+    Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+    return ExtractSubVec(Vec);
+  }
+
+  // Simple case when we put subvector in the upper part
+  if (IdxVal + SubVecNumElems == NumElems) {
+    // Zero upper bits of the Vec
+    WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+                             DAG.getConstant(IdxVal, dl, MVT::i8));
+    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
+    return ExtractSubVec(Vec);
+  }
+  // Subvector should be inserted in the middle - use shuffle
+  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+                           SubVec, ZeroIdx);
+  SmallVector<int, 64> Mask;
+  for (unsigned i = 0; i < NumElems; ++i)
+    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
+                    i : i + NumElems);
+  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   const SDLoc &dl) {
+  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
+}
+
+static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   const SDLoc &dl) {
+  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
+}
+
+/// Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG, const SDLoc &dl) {
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+         "Expected a 128/256/512-bit vector type");
+
+  APInt Ones = APInt::getAllOnesValue(32);
+  unsigned NumElts = VT.getSizeInBits() / 32;
+  SDValue Vec;
+  if (!Subtarget.hasInt256() && NumElts == 8) {
+    Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
+    Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+  } else {
+    Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+  }
+  return DAG.getBitcast(VT, Vec);
+}
+
+/// Generate unpacklo/unpackhi shuffle mask.
+static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                                    bool Unary) {
+  assert(Mask.empty() && "Expected an empty shuffle mask vector");
+  int NumElts = VT.getVectorNumElements();
+  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+
+  for (int i = 0; i < NumElts; ++i) {
+    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+    Pos += (Unary ? 0 : NumElts * (i % 2));
+    Pos += (Lo ? 0 : NumEltsInLane / 2);
+    Mask.push_back(Pos);
+  }
+}
+
+/// Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+                          SDValue V1, SDValue V2) {
+  SmallVector<int, 8> Mask;
+  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+/// Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
+                          SDValue V1, SDValue V2) {
+  SmallVector<int, 8> Mask;
+  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
+}
+
+/// Return a vector_shuffle of the specified vector of zero or undef vector.
+/// This produces a shuffle where the low element of V2 is swizzled into the
+/// zero/undef vector, landing at element Idx.
+/// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
+                                           bool IsZero,
+                                           const X86Subtarget &Subtarget,
+                                           SelectionDAG &DAG) {
+  MVT VT = V2.getSimpleValueType();
+  SDValue V1 = IsZero
+    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+  int NumElems = VT.getVectorNumElements();
+  SmallVector<int, 16> MaskVec(NumElems);
+  for (int i = 0; i != NumElems; ++i)
+    // If this is the insertion idx, put the low elt of V2 here.
+    MaskVec[i] = (i == Idx) ? NumElems : i;
+  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
+}
+
+static SDValue peekThroughBitcasts(SDValue V) {
+  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+  return V;
+}
+
+static SDValue peekThroughOneUseBitcasts(SDValue V) {
+  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
+         V.getOperand(0).hasOneUse())
+    V = V.getOperand(0);
+  return V;
+}
+
+static const Constant *getTargetConstantFromNode(SDValue Op) {
+  Op = peekThroughBitcasts(Op);
+
+  auto *Load = dyn_cast<LoadSDNode>(Op);
+  if (!Load)
+    return nullptr;
+
+  SDValue Ptr = Load->getBasePtr();
+  if (Ptr->getOpcode() == X86ISD::Wrapper ||
+      Ptr->getOpcode() == X86ISD::WrapperRIP)
+    Ptr = Ptr->getOperand(0);
+
+  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+  if (!CNode || CNode->isMachineConstantPoolEntry())
+    return nullptr;
+
+  return dyn_cast<Constant>(CNode->getConstVal());
+}
+
+// Extract raw constant bits from constant pools.
+static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
+                                          SmallBitVector &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits) {
+  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+  assert(EltBits.empty() && "Expected an empty EltBits vector");
+
+  Op = peekThroughBitcasts(Op);
+
+  EVT VT = Op.getValueType();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
+  unsigned NumElts = SizeInBits / EltSizeInBits;
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(SizeInBits, 0);
+  APInt MaskBits(SizeInBits, 0);
+
+  // Split the undef/constant single bitset data into the target elements.
+  auto SplitBitData = [&]() {
+    UndefElts = SmallBitVector(NumElts, false);
+    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
+      UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+
+      // Only treat an element as UNDEF if all bits are UNDEF, otherwise
+      // treat it as zero.
+      if (UndefEltBits.isAllOnesValue()) {
+        UndefElts[i] = true;
+        continue;
+      }
+
+      APInt Bits = MaskBits.lshr(i * EltSizeInBits);
+      Bits = Bits.zextOrTrunc(EltSizeInBits);
+      EltBits[i] = Bits.getZExtValue();
+    }
+    return true;
+  };
+
+  auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
+                                          APInt &Undefs) {
+    if (!Cst)
+      return false;
+    unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
+    if (isa<UndefValue>(Cst)) {
+      Mask = APInt::getNullValue(SizeInBits);
+      Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+      return true;
+    }
+    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+      Mask = CInt->getValue().zextOrTrunc(SizeInBits);
+      Undefs = APInt::getNullValue(SizeInBits);
+      return true;
+    }
+    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+      Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
+      Undefs = APInt::getNullValue(SizeInBits);
+      return true;
+    }
+    return false;
+  };
+
+  // Extract constant bits from constant pool vector.
+  if (auto *Cst = getTargetConstantFromNode(Op)) {
+    Type *CstTy = Cst->getType();
+    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+      return false;
+
+    unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
+      APInt Bits, Undefs;
+      if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+        return false;
+      MaskBits |= Bits.shl(i * CstEltSizeInBits);
+      UndefBits |= Undefs.shl(i * CstEltSizeInBits);
+    }
+
+    return SplitBitData();
+  }
+
+  // Extract constant bits from a broadcasted constant pool scalar.
+  if (Op.getOpcode() == X86ISD::VBROADCAST &&
+      EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
+      APInt Bits, Undefs;
+      if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
+        unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
+        unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
+        for (unsigned i = 0; i != NumBroadcastElts; ++i) {
+          MaskBits |= Bits.shl(i * NumBroadcastBits);
+          UndefBits |= Undefs.shl(i * NumBroadcastBits);
+        }
+        return SplitBitData();
+      }
+    }
+  }
+
+  return false;
+}
+
+// TODO: Merge more of this with getTargetConstantBitsFromNode.
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+                                        unsigned MaskEltSizeInBits,
+                                        SmallVectorImpl<uint64_t> &RawMask) {
+  MaskNode = peekThroughBitcasts(MaskNode);
+
+  MVT VT = MaskNode.getSimpleValueType();
+  assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
+  unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
+
+  // Split an APInt element into MaskEltSizeInBits sized pieces and
+  // insert into the shuffle mask.
+  auto SplitElementToMask = [&](APInt Element) {
+    // Note that this is x86 and so always little endian: the low byte is
+    // the first byte of the mask.
+    int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+    for (int i = 0; i < Split; ++i) {
+      APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
+      Element = Element.lshr(MaskEltSizeInBits);
+      RawMask.push_back(RawElt.getZExtValue());
+    }
+  };
+
+  if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
+    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+    // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
+    if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
+      return false;
+    if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
+      const APInt &MaskElement = CN->getAPIntValue();
+      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+        APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
+        RawMask.push_back(RawElt.getZExtValue());
+      }
+    }
+    return false;
+  }
+
+  if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
+      MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
+    if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
+      if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
+        RawMask.push_back(CN->getZExtValue());
+        RawMask.append(NumMaskElts - 1, 0);
+        return true;
+      }
+
+      if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
+        unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
+        SplitElementToMask(CN->getAPIntValue());
+        RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  // We can always decode if the buildvector is all zero constants,
+  // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
+  if (all_of(MaskNode->ops(), X86::isZeroNode)) {
+    RawMask.append(NumMaskElts, 0);
+    return true;
+  }
+
+  // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
+  if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+    return false;
+
+  for (SDValue Op : MaskNode->ops()) {
+    if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
+      SplitElementToMask(CN->getAPIntValue());
+    else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
+      SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
+    else
+      return false;
+  }
+
+  return true;
+}
+
+/// Calculates the shuffle mask corresponding to the target-specific opcode.
+/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
+/// operands in \p Ops, and returns true.
+/// Sets \p IsUnary to true if only one source is used. Note that this will set
+/// IsUnary for shuffles which use a single input multiple times, and in those
+/// cases it will adjust the mask to only have indices within that single input.
+/// It is an error to call this with non-empty Mask/Ops vectors.
+static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
+                                 SmallVectorImpl<SDValue> &Ops,
+                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue ImmN;
+
+  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
+  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
+
+  IsUnary = false;
+  bool IsFakeUnary = false;
+  switch(N->getOpcode()) {
+  case X86ISD::BLENDI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::SHUFP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::INSERTPS:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::UNPCKH:
+    DecodeUNPCKHMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::UNPCKL:
+    DecodeUNPCKLMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::MOVHLPS:
+    DecodeMOVHLPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::MOVLHPS:
+    DecodeMOVLHPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::PALIGNR:
+    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(N->getOperand(0));
+    break;
+  case X86ISD::VSHLDQ:
+    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VSRLDQ:
+    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFD:
+  case X86ISD::VPERMILPI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFHW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFLW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VZEXT_MOVL:
+    DecodeZeroMoveLowMask(VT, Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VBROADCAST: {
+    // We only decode broadcasts of same-sized vectors at the moment.
+    if (N->getOperand(0).getValueType() == VT) {
+      DecodeVectorBroadcast(VT, Mask);
+      IsUnary = true;
+      break;
+    }
+    return false;
+  }
+  case X86ISD::VPERMILPV: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    unsigned MaskEltSize = VT.getScalarSizeInBits();
+    SmallVector<uint64_t, 32> RawMask;
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+      DecodeVPERMILPMask(VT, RawMask, Mask);
+      break;
+    }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+      break;
+    }
+    return false;
+  }
+  case X86ISD::PSHUFB: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    SmallVector<uint64_t, 32> RawMask;
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+      DecodePSHUFBMask(RawMask, Mask);
+      break;
+    }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
+    return false;
+  }
+  case X86ISD::VPERMI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
+    break;
+  case X86ISD::VPERM2X128:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    break;
+  case X86ISD::MOVSLDUP:
+    DecodeMOVSLDUPMask(VT, Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSHDUP:
+    DecodeMOVSHDUPMask(VT, Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVDDUP:
+    DecodeMOVDDUPMask(VT, Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVLPS:
+    // Not yet implemented
+    return false;
+  case X86ISD::VPERMIL2: {
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    unsigned MaskEltSize = VT.getScalarSizeInBits();
+    SDValue MaskNode = N->getOperand(2);
+    SDValue CtrlNode = N->getOperand(3);
+    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
+      unsigned CtrlImm = CtrlOp->getZExtValue();
+      SmallVector<uint64_t, 32> RawMask;
+      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
+        break;
+      }
+      if (auto *C = getTargetConstantFromNode(MaskNode)) {
+        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+        break;
+      }
+    }
+    return false;
+  }
+  case X86ISD::VPPERM: {
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    SDValue MaskNode = N->getOperand(2);
+    SmallVector<uint64_t, 32> RawMask;
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+      DecodeVPPERMMask(RawMask, Mask);
+      break;
+    }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPPERMMask(C, Mask);
+      break;
+    }
+    return false;
+  }
+  case X86ISD::VPERMV: {
+    IsUnary = true;
+    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
+    Ops.push_back(N->getOperand(1));
+    SDValue MaskNode = N->getOperand(0);
+    SmallVector<uint64_t, 32> RawMask;
+    unsigned MaskEltSize = VT.getScalarSizeInBits();
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
+      DecodeVPERMVMask(RawMask, Mask);
+      break;
+    }
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPERMVMask(C, MaskEltSize, Mask);
+      break;
+    }
+    return false;
+  }
+  case X86ISD::VPERMV3: {
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
+    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(N->getOperand(2));
+    SDValue MaskNode = N->getOperand(1);
+    unsigned MaskEltSize = VT.getScalarSizeInBits();
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      break;
+    }
+    return false;
+  }
+  case X86ISD::VPERMIV3: {
+    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
+    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(N->getOperand(2));
+    SDValue MaskNode = N->getOperand(0);
+    unsigned MaskEltSize = VT.getScalarSizeInBits();
+    if (auto *C = getTargetConstantFromNode(MaskNode)) {
+      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      break;
+    }
+    return false;
+  }
+  default: llvm_unreachable("unknown target shuffle node");
+  }
+
+  // Empty mask indicates the decode failed.
+  if (Mask.empty())
+    return false;
+
+  // Check if we're getting a shuffle mask with zero'd elements.
+  if (!AllowSentinelZero)
+    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+      return false;
+
+  // If we have a fake unary shuffle, the shuffle mask is spread across two
+  // inputs that are actually the same node. Re-map the mask to always point
+  // into the first input.
+  if (IsFakeUnary)
+    for (int &M : Mask)
+      if (M >= (int)Mask.size())
+        M -= Mask.size();
+
+  // If we didn't already add operands in the opcode-specific code, default to
+  // adding 1 or 2 operands starting at 0.
+  if (Ops.empty()) {
+    Ops.push_back(N->getOperand(0));
+    if (!IsUnary || IsFakeUnary)
+      Ops.push_back(N->getOperand(1));
+  }
+
+  return true;
+}
+
+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that are known to be zero
+/// (not just zeroable) from their inputs.
+/// Returns true if the target shuffle mask was decoded.
+static bool setTargetShuffleZeroElements(SDValue N,
+                                         SmallVectorImpl<int> &Mask,
+                                         SmallVectorImpl<SDValue> &Ops) {
+  bool IsUnary;
+  if (!isTargetShuffle(N.getOpcode()))
+    return false;
+
+  MVT VT = N.getSimpleValueType();
+  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
+    return false;
+
+  SDValue V1 = Ops[0];
+  SDValue V2 = IsUnary ? V1 : Ops[1];
+
+  V1 = peekThroughBitcasts(V1);
+  V2 = peekThroughBitcasts(V2);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+
+    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+    if (M < 0)
+      continue;
+
+    // Determine shuffle input and normalize the mask.
+    SDValue V = M < Size ? V1 : V2;
+    M %= Size;
+
+    // We are referencing an UNDEF input.
+    if (V.isUndef()) {
+      Mask[i] = SM_SentinelUndef;
+      continue;
+    }
+
+    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    // If the BUILD_VECTOR has fewer elements then the (larger) source
+    // element must be UNDEF/ZERO.
+    // TODO: Is it worth testing the individual bits of a constant?
+    if ((Size % V.getNumOperands()) == 0) {
+      int Scale = Size / V->getNumOperands();
+      SDValue Op = V.getOperand(M / Scale);
+      if (Op.isUndef())
+        Mask[i] = SM_SentinelUndef;
+      else if (X86::isZeroNode(Op))
+        Mask[i] = SM_SentinelZero;
+      continue;
+    }
+
+    // If the BUILD_VECTOR has more elements then all the (smaller) source
+    // elements must be all UNDEF or all ZERO.
+    if ((V.getNumOperands() % Size) == 0) {
+      int Scale = V->getNumOperands() / Size;
+      bool AllUndef = true;
+      bool AllZero = true;
+      for (int j = 0; j < Scale; ++j) {
+        SDValue Op = V.getOperand((M * Scale) + j);
+        AllUndef &= Op.isUndef();
+        AllZero &= X86::isZeroNode(Op);
+      }
+      if (AllUndef)
+        Mask[i] = SM_SentinelUndef;
+      else if (AllZero)
+        Mask[i] = SM_SentinelZero;
+      continue;
+    }
+  }
+
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Different mask size from vector size!");
+  return true;
+}
+
+// Attempt to decode ops that could be represented as a shuffle mask.
+// The decoded shuffle mask may contain a different number of elements to the
+// destination value type.
+static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+                               SmallVectorImpl<SDValue> &Ops) {
+  Mask.clear();
+  Ops.clear();
+
+  MVT VT = N.getSimpleValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumSizeInBits = VT.getSizeInBits();
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
+         "Expected byte aligned value types");
+
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
+  case ISD::AND: {
+    // Attempt to decode as a per-byte mask.
+    SmallBitVector UndefElts;
+    SmallVector<APInt, 32> EltBits;
+    if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+      return false;
+    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
+      if (UndefElts[i]) {
+        Mask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      uint64_t ByteBits = EltBits[i].getZExtValue();
+      if (ByteBits != 0 && ByteBits != 255)
+        return false;
+      Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+    }
+    Ops.push_back(N.getOperand(0));
+    return true;
+  }
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI: {
+    uint64_t ShiftVal = N.getConstantOperandVal(1);
+    // Out of range bit shifts are guaranteed to be zero.
+    if (NumBitsPerElt <= ShiftVal) {
+      Mask.append(NumElts, SM_SentinelZero);
+      return true;
+    }
+
+    // We can only decode 'whole byte' bit shifts as shuffles.
+    if ((ShiftVal % 8) != 0)
+      break;
+
+    uint64_t ByteShift = ShiftVal / 8;
+    unsigned NumBytes = NumSizeInBits / 8;
+    unsigned NumBytesPerElt = NumBitsPerElt / 8;
+    Ops.push_back(N.getOperand(0));
+
+    // Clear mask to all zeros and insert the shifted byte indices.
+    Mask.append(NumBytes, SM_SentinelZero);
+
+    if (X86ISD::VSHLI == Opcode) {
+      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j] = i + j - ByteShift;
+    } else {
+      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j - ByteShift] = i + j;
+    }
+    return true;
+  }
+  case X86ISD::VZEXT: {
+    // TODO - add support for VPMOVZX with smaller input vector types.
+    SDValue Src = N.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    if (NumSizeInBits != SrcVT.getSizeInBits())
+      break;
+    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
+    Ops.push_back(Src);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
+/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
+/// remaining input indices in case we now have a unary shuffle and adjust the
+/// Op0/Op1 inputs accordingly.
+/// Returns true if the target shuffle mask was decoded.
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
+                                       SmallVectorImpl<int> &Mask) {
+  SmallVector<SDValue, 2> Ops;
+  if (!setTargetShuffleZeroElements(Op, Mask, Ops))
+    if (!getFauxShuffleMask(Op, Mask, Ops))
+      return false;
+
+  int NumElts = Mask.size();
+  bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
+    return 0 <= Idx && Idx < NumElts;
+  });
+  bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
+
+  Op0 = Op0InUse ? Ops[0] : SDValue();
+  Op1 = Op1InUse ? Ops[1] : SDValue();
+
+  // We're only using Op1 - commute the mask and inputs.
+  if (!Op0InUse && Op1InUse) {
+    for (int &M : Mask)
+      if (NumElts <= M)
+        M -= NumElts;
+    Op0 = Op1;
+    Op1 = SDValue();
+  }
+
+  return true;
+}
+
+/// Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+                                   unsigned Depth) {
+  if (Depth == 6)
+    return SDValue();  // Limit search depth.
+
+  SDValue V = SDValue(N, 0);
+  EVT VT = V.getValueType();
+  unsigned Opcode = V.getOpcode();
+
+  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    int Elt = SV->getMaskElt(Index);
+
+    if (Elt < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+                                         : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+  }
+
+  // Recurse into target specific vector shuffles to find scalars.
+  if (isTargetShuffle(Opcode)) {
+    MVT ShufVT = V.getSimpleValueType();
+    MVT ShufSVT = ShufVT.getVectorElementType();
+    int NumElems = (int)ShufVT.getVectorNumElements();
+    SmallVector<int, 16> ShuffleMask;
+    SmallVector<SDValue, 16> ShuffleOps;
+    bool IsUnary;
+
+    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
+      return SDValue();
+
+    int Elt = ShuffleMask[Index];
+    if (Elt == SM_SentinelZero)
+      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
+                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
+    if (Elt == SM_SentinelUndef)
+      return DAG.getUNDEF(ShufSVT);
+
+    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
+    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+                               Depth+1);
+  }
+
+  // Actual nodes that may contain scalar elements
+  if (Opcode == ISD::BITCAST) {
+    V = V.getOperand(0);
+    EVT SrcVT = V.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+      return SDValue();
+  }
+
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : DAG.getUNDEF(VT.getVectorElementType());
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+
+  return SDValue();
+}
+
+/// Custom lower build_vector of v16i8.
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue V;
+  bool First = true;
+
+  // SSE4.1 - use PINSRB to insert each byte directly.
+  if (Subtarget.hasSSE41()) {
+    for (unsigned i = 0; i < 16; ++i) {
+      bool isNonZero = (NonZeros & (1 << i)) != 0;
+      if (isNonZero) {
+        if (First) {
+          if (NumZero)
+            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+          else
+            V = DAG.getUNDEF(MVT::v16i8);
+          First = false;
+        }
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                        MVT::v16i8, V, Op.getOperand(i),
+                        DAG.getIntPtrConstant(i, dl));
+      }
+    }
+
+    return V;
+  }
+
+  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
+  for (unsigned i = 0; i < 16; ++i) {
+    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+    if (ThisIsNonZero && First) {
+      if (NumZero)
+        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+      else
+        V = DAG.getUNDEF(MVT::v8i16);
+      First = false;
+    }
+
+    if ((i & 1) != 0) {
+      SDValue ThisElt, LastElt;
+      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      if (LastIsNonZero) {
+        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+                              MVT::i16, Op.getOperand(i-1));
+      }
+      if (ThisIsNonZero) {
+        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+                              ThisElt, DAG.getConstant(8, dl, MVT::i8));
+        if (LastIsNonZero)
+          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+      } else
+        ThisElt = LastElt;
+
+      if (ThisElt.getNode())
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+                        DAG.getIntPtrConstant(i/2, dl));
+    }
+  }
+
+  return DAG.getBitcast(MVT::v16i8, V);
+}
+
+/// Custom lower build_vector of v8i16.
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     const TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue V;
+  bool First = true;
+  for (unsigned i = 0; i < 8; ++i) {
+    bool isNonZero = (NonZeros & (1 << i)) != 0;
+    if (isNonZero) {
+      if (First) {
+        if (NumZero)
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+        else
+          V = DAG.getUNDEF(MVT::v8i16);
+        First = false;
+      }
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                      MVT::v8i16, V, Op.getOperand(i),
+                      DAG.getIntPtrConstant(i, dl));
+    }
+  }
+
+  return V;
+}
+
+/// Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget,
+                                     const TargetLowering &TLI) {
+  // Find all zeroable elements.
+  std::bitset<4> Zeroable;
+  for (int i=0; i < 4; ++i) {
+    SDValue Elt = Op->getOperand(i);
+    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
+  }
+  assert(Zeroable.size() - Zeroable.count() > 1 &&
+         "We expect at least two non-zero elements!");
+
+  // We only know how to deal with build_vector nodes where elements are either
+  // zeroable or extract_vector_elt with constant index.
+  SDValue FirstNonZero;
+  unsigned FirstNonZeroIdx;
+  for (unsigned i=0; i < 4; ++i) {
+    if (Zeroable[i])
+      continue;
+    SDValue Elt = Op->getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elt.getOperand(1)))
+      return SDValue();
+    // Make sure that this node is extracting from a 128-bit vector.
+    MVT VT = Elt.getOperand(0).getSimpleValueType();
+    if (!VT.is128BitVector())
+      return SDValue();
+    if (!FirstNonZero.getNode()) {
+      FirstNonZero = Elt;
+      FirstNonZeroIdx = i;
+    }
+  }
+
+  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+  SDValue V1 = FirstNonZero.getOperand(0);
+  MVT VT = V1.getSimpleValueType();
+
+  // See if this build_vector can be lowered as a blend with zero.
+  SDValue Elt;
+  unsigned EltMaskIdx, EltIdx;
+  int Mask[4];
+  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+    if (Zeroable[EltIdx]) {
+      // The zero vector will be on the right hand side.
+      Mask[EltIdx] = EltIdx+4;
+      continue;
+    }
+
+    Elt = Op->getOperand(EltIdx);
+    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
+    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+      break;
+    Mask[EltIdx] = EltIdx;
+  }
+
+  if (EltIdx == 4) {
+    // Let the shuffle legalizer deal with blend operations.
+    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+    if (V1.getSimpleValueType() != VT)
+      V1 = DAG.getBitcast(VT, V1);
+    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
+  }
+
+  // See if we can lower this build_vector to a INSERTPS.
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  SDValue V2 = Elt.getOperand(0);
+  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+    V1 = SDValue();
+
+  bool CanFold = true;
+  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+    if (Zeroable[i])
+      continue;
+
+    SDValue Current = Op->getOperand(i);
+    SDValue SrcVector = Current->getOperand(0);
+    if (!V1.getNode())
+      V1 = SrcVector;
+    CanFold = SrcVector == V1 &&
+      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+  }
+
+  if (!CanFold)
+    return SDValue();
+
+  assert(V1.getNode() && "Expected at least two non-zero elements!");
+  if (V1.getSimpleValueType() != MVT::v4f32)
+    V1 = DAG.getBitcast(MVT::v4f32, V1);
+  if (V2.getSimpleValueType() != MVT::v4f32)
+    V2 = DAG.getBitcast(MVT::v4f32, V2);
+
+  // Ok, we can emit an INSERTPS instruction.
+  unsigned ZMask = Zeroable.to_ulong();
+
+  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+  SDLoc DL(Op);
+  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                               DAG.getIntPtrConstant(InsertPSMask, DL));
+  return DAG.getBitcast(VT, Result);
+}
+
+/// Return a vector logical shift node.
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
+                         SelectionDAG &DAG, const TargetLowering &TLI,
+                         const SDLoc &dl) {
+  assert(VT.is128BitVector() && "Unknown type for VShift");
+  MVT ShVT = MVT::v16i8;
+  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+  SrcOp = DAG.getBitcast(ShVT, SrcOp);
+  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
+  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
+  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
+}
+
+static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
+                                      SelectionDAG &DAG) {
+
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16 or 32.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+      if (MFI.isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute
+        // the exact stack offset and reference FI + adjust offset instead.
+        // If someone *really* cares about this. That's the way to implement it.
+        return SDValue();
+      } else {
+        MFI.setObjectAlignment(FI, RequiredAlign);
+      }
+    }
+
+    // (Offset % 16 or 32) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % RequiredAlign) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+    if (StartOffset) {
+      SDLoc DL(Ptr);
+      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
+    }
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    unsigned NumElems = VT.getVectorNumElements();
+
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+                             LD->getPointerInfo().getWithOffset(StartOffset));
+
+    SmallVector<int, 8> Mask(NumElems, EltNo);
+
+    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
+  }
+
+  return SDValue();
+}
+
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
+  unsigned NumElems = Elts.size();
+
+  int LastLoadedElt = -1;
+  SmallBitVector LoadMask(NumElems, false);
+  SmallBitVector ZeroMask(NumElems, false);
+  SmallBitVector UndefMask(NumElems, false);
+
+  // For each element in the initializer, see if we've found a load, zero or an
+  // undef.
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = peekThroughBitcasts(Elts[i]);
+    if (!Elt.getNode())
+      return SDValue();
+
+    if (Elt.isUndef())
+      UndefMask[i] = true;
+    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
+      ZeroMask[i] = true;
+    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      LoadMask[i] = true;
+      LastLoadedElt = i;
+      // Each loaded element must be the correct fractional portion of the
+      // requested vector load.
+      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+        return SDValue();
+    } else
+      return SDValue();
+  }
+  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
+         "Incomplete element masks");
+
+  // Handle Special Cases - all undef or undef/zero.
+  if (UndefMask.count() == NumElems)
+    return DAG.getUNDEF(VT);
+
+  // FIXME: Should we return this as a BUILD_VECTOR instead?
+  if ((ZeroMask | UndefMask).count() == NumElems)
+    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
+                          : DAG.getConstantFP(0.0, DL, VT);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  int FirstLoadedElt = LoadMask.find_first();
+  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
+  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
+  EVT LDBaseVT = EltBase.getValueType();
+
+  // Consecutive loads can contain UNDEFS but not ZERO elements.
+  // Consecutive loads with UNDEFs and ZEROs elements require a
+  // an additional shuffle stage to clear the ZERO elements.
+  bool IsConsecutiveLoad = true;
+  bool IsConsecutiveLoadWithZeros = true;
+  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
+    if (LoadMask[i]) {
+      SDValue Elt = peekThroughBitcasts(Elts[i]);
+      LoadSDNode *LD = cast<LoadSDNode>(Elt);
+      if (!DAG.areNonVolatileConsecutiveLoads(
+              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
+              i - FirstLoadedElt)) {
+        IsConsecutiveLoad = false;
+        IsConsecutiveLoadWithZeros = false;
+        break;
+      }
+    } else if (ZeroMask[i]) {
+      IsConsecutiveLoad = false;
+    }
+  }
+
+  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+    auto MMOFlags = LDBase->getMemOperand()->getFlags();
+    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
+           "Cannot merge volatile loads.");
+    SDValue NewLd =
+        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain =
+          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                      SDValue(NewLd.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(NewLd.getNode(), 1));
+    }
+
+    return NewLd;
+  };
+
+  // LOAD - all consecutive load/undefs (must start/end with a load).
+  // If we have found an entire vector of loads and undefs, then return a large
+  // load of the entire vector width starting at the base pointer.
+  // If the vector contains zeros, then attempt to shuffle those elements.
+  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
+    assert(LDBase && "Did not find base load for merging consecutive loads");
+    EVT EltVT = LDBase->getValueType(0);
+    // Ensure that the input vector size for the merged loads matches the
+    // cumulative size of the input elements.
+    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+      return SDValue();
+
+    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
+    if (IsConsecutiveLoad)
+      return CreateLoad(VT, LDBase);
+
+    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
+    // vector and a zero vector to clear out the zero elements.
+    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
+      SmallVector<int, 4> ClearMask(NumElems, -1);
+      for (unsigned i = 0; i < NumElems; ++i) {
+        if (ZeroMask[i])
+          ClearMask[i] = i + NumElems;
+        else if (LoadMask[i])
+          ClearMask[i] = i;
+      }
+      SDValue V = CreateLoad(VT, LDBase);
+      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+                                 : DAG.getConstantFP(0.0, DL, VT);
+      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
+    }
+  }
+
+  int LoadSize =
+      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
+
+  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
+  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
+      (LoadSize == 32 || LoadSize == 64) &&
+      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
+                                      : MVT::getIntegerVT(LoadSize);
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
+    if (TLI.isTypeLegal(VecVT)) {
+      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+      SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+                                  LDBase->getPointerInfo(),
+                                  LDBase->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+
+      // Make sure the newly-created LOAD is in the same position as LDBase in
+      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+      // and update uses of LDBase's output chain to use the TokenFactor.
+      if (LDBase->hasAnyUseOfValue(1)) {
+        SDValue NewChain =
+            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                        SDValue(ResNode.getNode(), 1));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                               SDValue(ResNode.getNode(), 1));
+      }
+
+      return DAG.getBitcast(VT, ResNode);
+    }
+  }
+
+  return SDValue();
+}
+
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  unsigned NumElm = SplatBitSize / ScalarSize;
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned i = 0; i < NumElm; i++) {
+    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+    Constant *Const;
+    if (VT.isFloatingPoint()) {
+      assert((ScalarSize == 32 || ScalarSize == 64) &&
+             "Unsupported floating point scalar size");
+      if (ScalarSize == 32)
+        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+      else
+        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+    } else
+      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+    ConstantVec.push_back(Const);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
+static bool isUseOfShuffle(SDNode *N) {
+  for (auto *U : N->uses()) {
+    if (isTargetShuffle(U->getOpcode()))
+      return true;
+    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
+      return isUseOfShuffle(U);
+  }
+  return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
+/// 1. A splat BUILD_VECTOR which uses:
+///    a. A single scalar load, or a constant.
+///    b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+///
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  // VBROADCAST requires AVX.
+  // TODO: Splats could be generated for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget.hasAVX())
+    return SDValue();
+
+  MVT VT = BVOp->getSimpleValueType(0);
+  SDLoc dl(BVOp);
+
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
+         "Unsupported vector type for broadcast.");
+
+  BitVector UndefElements;
+  SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
+  // We need a splat of a single value to use broadcast, and it doesn't
+  // make any sense if the value is only in one element of the vector.
+  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+    APInt SplatValue, Undef;
+    unsigned SplatBitSize;
+    bool HasUndef;
+    // Check if this is a repeated constant pattern suitable for broadcasting.
+    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+        SplatBitSize > VT.getScalarSizeInBits() &&
+        SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing with broadcast when it's a use of a shuffle
+      // instruction to preserve the present custom lowering of shuffles.
+      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+        return SDValue();
+      // replace BUILD_VECTOR with broadcast of the repeated constants.
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      LLVMContext *Ctx = DAG.getContext();
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      if (Subtarget.hasAVX()) {
+        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+          // Splatted value can fit in one INTEGER constant in constant pool.
+          // Load the constant and broadcast it.
+          MVT CVT = MVT::getIntegerVT(SplatBitSize);
+          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+          // Splatted value can fit in one FLOAT constant in constant pool.
+          // Load the constant and broadcast it.
+          // AVX have support for 32 and 64 bit broadcast for floats only.
+          // No 64bit integer in 32bit subtarget.
+          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+          Constant *C = SplatBitSize == 32
+                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
+                                              SplatValue.bitsToFloat())
+                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
+                                              SplatValue.bitsToDouble());
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize > 64) {
+          // Load the vector of constants and broadcast it.
+          MVT CVT = VT.getScalarType();
+          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+                                             *Ctx);
+          SDValue VCP = DAG.getConstantPool(VecC, PVT);
+          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+          Ld = DAG.getLoad(
+              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        }
+      }
+    }
+    return SDValue();
+  }
+
+  bool ConstSplatVal =
+      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+
+  // Make sure that all of the users of a non-constant load are from the
+  // BUILD_VECTOR node.
+  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+    return SDValue();
+
+  unsigned ScalarSize = Ld.getValueSizeInBits();
+  bool IsGE256 = (VT.getSizeInBits() >= 256);
+
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  // TODO: If multiple splats are generated to load the same constant,
+  // it may be detrimental to overall size. There needs to be a way to detect
+  // that condition to know if this is truly a size win.
+  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
+  // from the constant pool and not to broadcast it from a scalar.
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
+    EVT CVT = Ld.getValueType();
+    assert(!CVT.isVector() && "Must not broadcast a vector type");
+
+    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+    // For size optimization, also splat v2f64 and v2i64, and for size opt
+    // with AVX2, also splat i8 and i16.
+    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
+      const Constant *C = nullptr;
+      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+        C = CI->getConstantIntValue();
+      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+        C = CF->getConstantFPValue();
+
+      assert(C && "Invalid constant type");
+
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      SDValue CP =
+          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
+      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+      Ld = DAG.getLoad(
+          CVT, dl, DAG.getEntryNode(), CP,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+          Alignment);
+
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+    }
+  }
+
+  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+
+  // Handle AVX2 in-register broadcasts.
+  if (!IsLoad && Subtarget.hasInt256() &&
+      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The scalar source must be a normal load.
+  if (!IsLoad)
+    return SDValue();
+
+  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+      (Subtarget.hasVLX() && ScalarSize == 64))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
+  // double since there is no vbroadcastsd xmm
+  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
+    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+  }
+
+  // Unsupported broadcast.
+  return SDValue();
+}
+
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+    return Idx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+  // lowered this:
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // to:
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                           undef)
+  //                       Constant<0>)
+  // In this case the vector is the extract_subvector expression and the index
+  // is 2, as specified by the shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  SDValue ShuffleVec = SVOp->getOperand(0);
+  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+  assert(ShuffleVecVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  int ShuffleIdx = SVOp->getMaskElt(Idx);
+  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+    ExtractedFromVec = ShuffleVec;
+    return ShuffleIdx;
+  }
+  return Idx;
+}
+
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  // Skip if insert_vec_elt is not supported.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  SDLoc DL(Op);
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 elements need inserting.
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
+
+    // Quit if extracted from vector of different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    if (!VecIn1.getNode())
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (!VecIn2.getNode())
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than 2 vectors to shuffle
+        return SDValue();
+    }
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (!VecIn1.getNode())
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx, DL));
+  }
+
+  return NV;
+}
+
+static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
+  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+         Op.getScalarValueSizeInBits() == 1 &&
+         "Can not convert non-constant vector");
+  uint64_t Immediate = 0;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (!In.isUndef())
+      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+  }
+  SDLoc dl(Op);
+  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
+  return DAG.getConstant(Immediate, dl, VT);
+}
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+  assert((VT.getVectorElementType() == MVT::i1) &&
+         "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+  SDLoc dl(Op);
+  if (ISD::isBuildVectorAllZeros(Op.getNode()))
+    return DAG.getTargetConstant(0, dl, VT);
+
+  if (ISD::isBuildVectorAllOnes(Op.getNode()))
+    return DAG.getTargetConstant(1, dl, VT);
+
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
+    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+      return DAG.getBitcast(VT, Imm);
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                        DAG.getIntPtrConstant(0, dl));
+  }
+
+  // Vector has one or more non-const elements
+  uint64_t Immediate = 0;
+  SmallVector<unsigned, 16> NonConstIdx;
+  bool IsSplat = true;
+  bool HasConstElts = false;
+  int SplatIdx = -1;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (In.isUndef())
+      continue;
+    if (!isa<ConstantSDNode>(In))
+      NonConstIdx.push_back(idx);
+    else {
+      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      HasConstElts = true;
+    }
+    if (SplatIdx < 0)
+      SplatIdx = idx;
+    else if (In != Op.getOperand(SplatIdx))
+      IsSplat = false;
+  }
+
+  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
+  if (IsSplat)
+    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
+                       DAG.getConstant(1, dl, VT),
+                       DAG.getConstant(0, dl, VT));
+
+  // insert elements one by one
+  SDValue DstVec;
+  SDValue Imm;
+  if (Immediate) {
+    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
+    Imm = DAG.getConstant(Immediate, dl, ImmVT);
+  }
+  else if (HasConstElts)
+    Imm = DAG.getConstant(0, dl, VT);
+  else
+    Imm = DAG.getUNDEF(VT);
+  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+    DstVec = DAG.getBitcast(VT, Imm);
+  else {
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                         DAG.getIntPtrConstant(0, dl));
+  }
+
+  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
+    unsigned InsertIdx = NonConstIdx[i];
+    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                         Op.getOperand(InsertIdx),
+                         DAG.getIntPtrConstant(InsertIdx, dl));
+  }
+  return DstVec;
+}
+
+/// \brief Return true if \p N implements a horizontal binop and return the
+/// operands for the horizontal binop into V0 and V1.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function checks that the build_vector \p N in input implements a
+/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
+/// operation to match.
+/// For example, if \p Opcode is equal to ISD::ADD, then this function
+/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
+/// is equal to ISD::SUB, then this function checks if this is a horizontal
+/// arithmetic sub.
+///
+/// This function only analyzes elements of \p N whose indices are
+/// in range [BaseIdx, LastIdx).
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
+                              SelectionDAG &DAG,
+                              unsigned BaseIdx, unsigned LastIdx,
+                              SDValue &V0, SDValue &V1) {
+  EVT VT = N->getValueType(0);
+
+  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
+  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
+         "Invalid Vector in input!");
+
+  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
+  bool CanFold = true;
+  unsigned ExpectedVExtractIdx = BaseIdx;
+  unsigned NumElts = LastIdx - BaseIdx;
+  V0 = DAG.getUNDEF(VT);
+  V1 = DAG.getUNDEF(VT);
+
+  // Check if N implements a horizontal binop.
+  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
+    SDValue Op = N->getOperand(i + BaseIdx);
+
+    // Skip UNDEFs.
+    if (Op->isUndef()) {
+      // Update the expected vector extract index.
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+      ExpectedVExtractIdx += 2;
+      continue;
+    }
+
+    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
+
+    if (!CanFold)
+      break;
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
+    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0) == Op1.getOperand(0) &&
+        isa<ConstantSDNode>(Op0.getOperand(1)) &&
+        isa<ConstantSDNode>(Op1.getOperand(1)));
+    if (!CanFold)
+      break;
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+
+    if (i * 2 < NumElts) {
+      if (V0.isUndef()) {
+        V0 = Op0.getOperand(0);
+        if (V0.getValueType() != VT)
+          return false;
+      }
+    } else {
+      if (V1.isUndef()) {
+        V1 = Op0.getOperand(0);
+        if (V1.getValueType() != VT)
+          return false;
+      }
+      if (i * 2 == NumElts)
+        ExpectedVExtractIdx = BaseIdx;
+    }
+
+    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
+    if (I0 == ExpectedVExtractIdx)
+      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
+    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
+      // Try to match the following dag sequence:
+      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
+      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
+    } else
+      CanFold = false;
+
+    ExpectedVExtractIdx += 2;
+  }
+
+  return CanFold;
+}
+
+/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
+/// a concat_vector.
+///
+/// This is a helper function of LowerToHorizontalOp().
+/// This function expects two 256-bit vectors called V0 and V1.
+/// At first, each vector is split into two separate 128-bit vectors.
+/// Then, the resulting 128-bit vectors are used to implement two
+/// horizontal binary operations.
+///
+/// The kind of horizontal binary operation is defined by \p X86Opcode.
+///
+/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
+/// the two new horizontal binop.
+/// When Mode is set, the first horizontal binop dag node would take as input
+/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
+/// horizontal binop dag node would take as input the lower 128-bit of V1
+/// and the upper 128-bit of V1.
+///   Example:
+///     HADD V0_LO, V0_HI
+///     HADD V1_LO, V1_HI
+///
+/// Otherwise, the first horizontal binop dag node takes as input the lower
+/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
+/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
+///   Example:
+///     HADD V0_LO, V1_LO
+///     HADD V0_HI, V1_HI
+///
+/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
+/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
+/// the upper 128-bits of the result.
+static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
+                                     const SDLoc &DL, SelectionDAG &DAG,
+                                     unsigned X86Opcode, bool Mode,
+                                     bool isUndefLO, bool isUndefHI) {
+  MVT VT = V0.getSimpleValueType();
+  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
+         "Invalid nodes in input!");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
+  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
+  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
+  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
+  MVT NewVT = V0_LO.getSimpleValueType();
+
+  SDValue LO = DAG.getUNDEF(NewVT);
+  SDValue HI = DAG.getUNDEF(NewVT);
+
+  if (Mode) {
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && !V0->isUndef())
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
+    if (!isUndefHI && !V1->isUndef())
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
+  } else {
+    // Don't emit a horizontal binop if the result is expected to be UNDEF.
+    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
+      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
+
+    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
+      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
+  }
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
+}
+
+/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
+/// node.
+static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
+                             const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  MVT VT = BV->getSimpleValueType(0);
+  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+    return SDValue();
+
+  SDLoc DL(BV);
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue InVec0 = DAG.getUNDEF(VT);
+  SDValue InVec1 = DAG.getUNDEF(VT);
+
+  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+  // Odd-numbered elements in the input build vector are obtained from
+  // adding two integer/float elements.
+  // Even-numbered elements in the input build vector are obtained from
+  // subtracting two integer/float elements.
+  unsigned ExpectedOpcode = ISD::FSUB;
+  unsigned NextExpectedOpcode = ISD::FADD;
+  bool AddFound = false;
+  bool SubFound = false;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Op = BV->getOperand(i);
+
+    // Skip 'undef' values.
+    unsigned Opcode = Op.getOpcode();
+    if (Opcode == ISD::UNDEF) {
+      std::swap(ExpectedOpcode, NextExpectedOpcode);
+      continue;
+    }
+
+    // Early exit if we found an unexpected opcode.
+    if (Opcode != ExpectedOpcode)
+      return SDValue();
+
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    // Try to match the following pattern:
+    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+    // Early exit if we cannot match that sequence.
+    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+        Op0.getOperand(1) != Op1.getOperand(1))
+      return SDValue();
+
+    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+    if (I0 != i)
+      return SDValue();
+
+    // We found a valid add/sub node. Update the information accordingly.
+    if (i & 1)
+      AddFound = true;
+    else
+      SubFound = true;
+
+    // Update InVec0 and InVec1.
+    if (InVec0.isUndef()) {
+      InVec0 = Op0.getOperand(0);
+      if (InVec0.getSimpleValueType() != VT)
+        return SDValue();
+    }
+    if (InVec1.isUndef()) {
+      InVec1 = Op1.getOperand(0);
+      if (InVec1.getSimpleValueType() != VT)
+        return SDValue();
+    }
+
+    // Make sure that operands in input to each add/sub node always
+    // come from a same pair of vectors.
+    if (InVec0 != Op0.getOperand(0)) {
+      if (ExpectedOpcode == ISD::FSUB)
+        return SDValue();
+
+      // FADD is commutable. Try to commute the operands
+      // and then test again.
+      std::swap(Op0, Op1);
+      if (InVec0 != Op0.getOperand(0))
+        return SDValue();
+    }
+
+    if (InVec1 != Op1.getOperand(0))
+      return SDValue();
+
+    // Update the pair of expected opcodes.
+    std::swap(ExpectedOpcode, NextExpectedOpcode);
+  }
+
+  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
+  if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
+    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+
+  return SDValue();
+}
+
+/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
+  MVT VT = BV->getSimpleValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumUndefsLO = 0;
+  unsigned NumUndefsHI = 0;
+  unsigned Half = NumElts/2;
+
+  // Count the number of UNDEF operands in the build_vector in input.
+  for (unsigned i = 0, e = Half; i != e; ++i)
+    if (BV->getOperand(i)->isUndef())
+      NumUndefsLO++;
+
+  for (unsigned i = Half, e = NumElts; i != e; ++i)
+    if (BV->getOperand(i)->isUndef())
+      NumUndefsHI++;
+
+  // Early exit if this is either a build_vector of all UNDEFs or all the
+  // operands but one are UNDEF.
+  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
+    return SDValue();
+
+  SDLoc DL(BV);
+  SDValue InVec0, InVec1;
+  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
+    // Try to match an SSE3 float HADD/HSUB.
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
+    // Try to match an SSSE3 integer HADD/HSUB.
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
+
+    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
+  }
+
+  if (!Subtarget.hasAVX())
+    return SDValue();
+
+  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
+    // Try to match an AVX horizontal add/sub of packed single/double
+    // precision floating point values from 256-bit vectors.
+    SDValue InVec2, InVec3;
+    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
+
+    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
+  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
+    // Try to match an AVX2 horizontal add/sub of signed integers.
+    SDValue InVec2, InVec3;
+    unsigned X86Opcode;
+    bool CanFold = true;
+
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+      X86Opcode = X86ISD::HADD;
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
+        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
+        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
+        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
+      X86Opcode = X86ISD::HSUB;
+    else
+      CanFold = false;
+
+    if (CanFold) {
+      // Fold this build_vector into a single horizontal add/sub.
+      // Do this only if the target has AVX2.
+      if (Subtarget.hasAVX2())
+        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
+
+      // Do not try to expand this build_vector into a pair of horizontal
+      // add/sub if we can emit a pair of scalar add/sub.
+      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+        return SDValue();
+
+      // Convert this build_vector into a pair of horizontal binop followed by
+      // a concat vector.
+      bool isUndefLO = NumUndefsLO == Half;
+      bool isUndefHI = NumUndefsHI == Half;
+      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
+                                   isUndefLO, isUndefHI);
+    }
+  }
+
+  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
+       VT == MVT::v16i16) && Subtarget.hasAVX()) {
+    unsigned X86Opcode;
+    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::HADD;
+    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::HSUB;
+    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::FHADD;
+    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
+      X86Opcode = X86ISD::FHSUB;
+    else
+      return SDValue();
+
+    // Don't try to expand this build_vector into a pair of horizontal add/sub
+    // if we can simply emit a pair of scalar add/sub.
+    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
+      return SDValue();
+
+    // Convert this build_vector into two horizontal add/sub followed by
+    // a concat vector.
+    bool isUndefLO = NumUndefsLO == Half;
+    bool isUndefHI = NumUndefsHI == Half;
+    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
+                                 isUndefLO, isUndefHI);
+  }
+
+  return SDValue();
+}
+
+/// If a BUILD_VECTOR's source elements all apply the same bit operation and
+/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
+/// just apply the bit to the vectors.
+/// NOTE: Its not in our interest to start make a general purpose vectorizer
+/// from this, but enough scalar bit operations are created from the later
+/// legalization + scalarization stages to need basic support.
+static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Check that all elements have the same opcode.
+  // TODO: Should we allow UNDEFS and if so how many?
+  unsigned Opcode = Op->getOperand(0).getOpcode();
+  for (unsigned i = 1; i < NumElems; ++i)
+    if (Opcode != Op->getOperand(i).getOpcode())
+      return SDValue();
+
+  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
+  switch (Opcode) {
+  default:
+    return SDValue();
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR:
+    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
+      return SDValue();
+    break;
+  }
+
+  SmallVector<SDValue, 4> LHSElts, RHSElts;
+  for (SDValue Elt : Op->ops()) {
+    SDValue LHS = Elt.getOperand(0);
+    SDValue RHS = Elt.getOperand(1);
+
+    // We expect the canonicalized RHS operand to be the constant.
+    if (!isa<ConstantSDNode>(RHS))
+      return SDValue();
+    LHSElts.push_back(LHS);
+    RHSElts.push_back(RHS);
+  }
+
+  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
+  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
+  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+}
+
+/// Create a vector constant without a load. SSE/AVX provide the bare minimum
+/// functionality to do this, so it's all zeros, all ones, or some derivation
+/// that is cheap to calculate.
+static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // Vectors containing all zeros can be matched by pxor and xorps.
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
+      return Op;
+
+    return getZeroVector(VT, Subtarget, DAG, DL);
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
+        (VT == MVT::v8i32 && Subtarget.hasInt256()))
+      return Op;
+
+    return getOnesVector(VT, Subtarget, DAG, DL);
+  }
+
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+
+  MVT VT = Op.getSimpleValueType();
+  MVT ExtVT = VT.getVectorElementType();
+  unsigned NumElems = Op.getNumOperands();
+
+  // Generate vectors for predicate vectors.
+  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
+    return LowerBUILD_VECTORvXi1(Op, DAG);
+
+  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
+    return VectorConstant;
+
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+    return AddSub;
+  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+    return HorizontalOp;
+  if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
+    return Broadcast;
+  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+    return BitOp;
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  uint64_t NonZeros = 0;
+  bool IsAllConstants = true;
+  SmallSet<SDValue, 8> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.isUndef())
+      continue;
+    Values.insert(Elt);
+    if (Elt.getOpcode() != ISD::Constant &&
+        Elt.getOpcode() != ISD::ConstantFP)
+      IsAllConstants = false;
+    if (X86::isZeroNode(Elt))
+      NumZero++;
+    else {
+      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
+      NonZeros |= ((uint64_t)1 << i);
+      NumNonZero++;
+    }
+  }
+
+  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+  if (NumNonZero == 0)
+    return DAG.getUNDEF(VT);
+
+  // Special case for single non-zero, non-undef, element.
+  if (NumNonZero == 1) {
+    unsigned Idx = countTrailingZeros(NonZeros);
+    SDValue Item = Op.getOperand(Idx);
+
+    // If this is an insertion of an i64 value on x86-32, and if the top bits of
+    // the value are obviously zero, truncate the value to i32 and do the
+    // insertion that way.  Only do this if the value is non-constant or if the
+    // value is a constant being inserted into element 0.  It is cheaper to do
+    // a constant pool load than it is to do a movd + shuffle.
+    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
+        (!IsAllConstants || Idx == 0)) {
+      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+        // Handle SSE only.
+        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+        MVT VecVT = MVT::v4i32;
+
+        // Truncate the value (which may itself be a constant) to i32, and
+        // convert it to a vector with movd (S2V+shuffle to zero extend).
+        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+                                      Item, Idx * 2, true, Subtarget, DAG));
+      }
+    }
+
+    // If we have a constant or non-constant insertion into the low element of
+    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
+    // depending on what the source datatype is.
+    if (Idx == 0) {
+      if (NumZero == 0)
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
+        assert((VT.is128BitVector() || VT.is256BitVector() ||
+                VT.is512BitVector()) &&
+               "Expected an SSE value type!");
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+      }
+
+      // We can't directly insert an i8 or i16 into a vector, so zero extend
+      // it to i32 first.
+      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+        if (VT.getSizeInBits() >= 256) {
+          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+          if (Subtarget.hasAVX()) {
+            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
+            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+          } else {
+            // Without AVX, we need to extend to a 128-bit vector and then
+            // insert into the 256-bit vector.
+            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
+            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+          }
+        } else {
+          assert(VT.is128BitVector() && "Expected an SSE value type!");
+          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+        }
+        return DAG.getBitcast(VT, Item);
+      }
+    }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        X86::isZeroNode(Op.getOperand(0)) &&
+        !X86::isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = VT.getSizeInBits();
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this, dl);
+    }
+
+    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+      return SDValue();
+
+    // Otherwise, if this is a vector with i32 or f32 elements, and the element
+    // is a non-constant being inserted into an element other than the low one,
+    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
+    // movd/movss) to move this into the low element, then shuffle it into
+    // place.
+    if (EVTBits == 32) {
+      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+    }
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = countTrailingZeros(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
+    return SDValue();
+  }
+
+  // A vector full of immediates; various special cases are already
+  // handled, so this is best done with a single constant-pool load.
+  if (IsAllConstants)
+    return SDValue();
+
+  // See if we can use a vector load to get all of the elements.
+  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+    if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+      return LD;
+  }
+
+  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
+
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+    // Build both the lower and upper subvector.
+    SDValue Lower =
+        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
+    SDValue Upper = DAG.getBuildVector(
+        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
+
+    // Recreate the wider vector with the lower and upper part.
+    if (VT.is256BitVector())
+      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = countTrailingZeros(NonZeros);
+      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+                               Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+    }
+    return SDValue();
+  }
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16)
+    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
+                                          DAG, Subtarget, *this))
+      return V;
+
+  if (EVTBits == 16 && NumElems == 8)
+    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
+                                          DAG, Subtarget, *this))
+      return V;
+
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4)
+    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+      return V;
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  if (NumElems == 4 && NumZero > 0) {
+    SmallVector<SDValue, 8> Ops(NumElems);
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1ULL << i));
+      if (isZero)
+        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          Ops[i] = Ops[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
+          break;
+        case 2:
+          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+          break;
+        case 3:
+          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
+          break;
+      }
+    }
+
+    bool Reverse1 = (NonZeros & 0x3) == 2;
+    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    int MaskVec[] = {
+      Reverse1 ? 1 : 0,
+      Reverse1 ? 0 : 1,
+      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
+    };
+    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
+  }
+
+  if (Values.size() > 1 && VT.is128BitVector()) {
+    // Check for a build vector from mostly shuffle plus few inserting.
+    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
+      return Sh;
+
+    // For SSE 4.1, use insertps to put the high elements into the low element.
+    if (Subtarget.hasSSE41()) {
+      SDValue Result;
+      if (!Op.getOperand(0).isUndef())
+        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+      else
+        Result = DAG.getUNDEF(VT);
+
+      for (unsigned i = 1; i < NumElems; ++i) {
+        if (Op.getOperand(i).isUndef()) continue;
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
+      }
+      return Result;
+    }
+
+    // Otherwise, expand into a number of unpckl*, start by extending each of
+    // our (non-undef) elements to the full vector width with the element in the
+    // bottom slot of the vector (which generates no code for SSE).
+    SmallVector<SDValue, 8> Ops(NumElems);
+    for (unsigned i = 0; i < NumElems; ++i) {
+      if (!Op.getOperand(i).isUndef())
+        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+      else
+        Ops[i] = DAG.getUNDEF(VT);
+    }
+
+    // Next, we iteratively mix elements, e.g. for v4f32:
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    unsigned EltStride = NumElems >> 1;
+    while (EltStride != 0) {
+      for (unsigned i = 0; i < EltStride; ++i) {
+        // If Ops[i+EltStride] is undef and this is the first round of mixing,
+        // then it is safe to just drop this shuffle: V[i] is already in the
+        // right place, the one element (since it's the first round) being
+        // inserted as undef can be dropped.  This isn't safe for successive
+        // rounds because they will permute elements within both vectors.
+        if (Ops[i+EltStride].isUndef() &&
+            EltStride == NumElems/2)
+          continue;
+
+        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
+      }
+      EltStride >>= 1;
+    }
+    return Ops[0];
+  }
+  return SDValue();
+}
+
+// 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
+
+  assert((ResVT.is256BitVector() ||
+          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  if (ResVT.is256BitVector())
+    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+
+  if (Op.getNumOperands() == 4) {
+    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+                                  ResVT.getVectorNumElements()/2);
+    SDValue V3 = Op.getOperand(2);
+    SDValue V4 = Op.getOperand(3);
+    return concat256BitVectors(
+        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
+        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
+        NumElems, DAG, dl);
+  }
+  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG & DAG) {
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
+  unsigned NumOfOperands = Op.getNumOperands();
+
+  assert(isPowerOf2_32(NumOfOperands) &&
+         "Unexpected number of operands in CONCAT_VECTORS");
+
+  SDValue Undef = DAG.getUNDEF(ResVT);
+  if (NumOfOperands > 2) {
+    // Specialize the cases when all, or all but one, of the operands are undef.
+    unsigned NumOfDefinedOps = 0;
+    unsigned OpIdx = 0;
+    for (unsigned i = 0; i < NumOfOperands; i++)
+      if (!Op.getOperand(i).isUndef()) {
+        NumOfDefinedOps++;
+        OpIdx = i;
+      }
+    if (NumOfDefinedOps == 0)
+      return Undef;
+    if (NumOfDefinedOps == 1) {
+      unsigned SubVecNumElts =
+        Op.getOperand(OpIdx).getValueType().getVectorNumElements();
+      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
+                         Op.getOperand(OpIdx), IdxVal);
+    }
+
+    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
+                                  ResVT.getVectorNumElements()/2);
+    SmallVector<SDValue, 2> Ops;
+    for (unsigned i = 0; i < NumOfOperands/2; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    Ops.clear();
+    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+      Ops.push_back(Op.getOperand(i));
+    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+  }
+
+  // 2 operands
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+  assert(V1.getValueType() == V2.getValueType() &&
+         V1.getValueType().getVectorNumElements() == NumElems/2 &&
+         "Unexpected operands in CONCAT_VECTORS");
+
+  if (ResVT.getSizeInBits() >= 16)
+    return Op; // The operation is legal with KUNPCK
+
+  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
+  if (IsZeroV1 && IsZeroV2)
+    return ZeroVec;
+
+  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+  if (V2.isUndef())
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  if (IsZeroV2)
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
+
+  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
+  if (V1.isUndef())
+    V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+
+  if (IsZeroV1)
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
+
+  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  if (VT.getVectorElementType() == MVT::i1)
+    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
+  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
+          Op.getNumOperands() == 4)));
+
+  // AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+
+  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
+  return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle lowering
+//
+// This is an experimental code path for lowering vector shuffles on x86. It is
+// designed to handle arbitrary vector shuffles and blends, gracefully
+// degrading performance as necessary. It works hard to recognize idiomatic
+// shuffles and lower them to optimal instruction patterns without leaving
+// a framework that allows reasonably efficient handling of all vector shuffle
+// patterns.
+//===----------------------------------------------------------------------===//
+
+/// \brief Tiny helper function to identify a no-op mask.
+///
+/// This is a somewhat boring predicate function. It checks whether the mask
+/// array input, which is assumed to be a single-input shuffle mask of the kind
+/// used by the X86 shuffle instructions (not a fully general
+/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
+/// in-place shuffle are 'no-op's.
+static bool isNoopShuffleMask(ArrayRef<int> Mask) {
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    assert(Mask[i] >= -1 && "Out of bound mask element!");
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return false;
+  }
+  return true;
+}
+
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      return true;
+  return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// lane-relative shuffle in each sub-lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// suitable for use with existing 128-bit shuffles as entries from the second
+/// vector have been remapped to [LaneSize, 2*LaneSize).
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                  ArrayRef<int> Mask,
+                                  SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  RepeatedMask.assign(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    // Adjust second vector indices to start at LaneSize instead of Size.
+    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
+                                : Mask[i] % LaneSize + LaneSize;
+    if (RepeatedMask[i % LaneSize] < 0)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] = LocalM;
+    else if (RepeatedMask[i % LaneSize] != LocalM)
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
+/// Test whether a shuffle mask is equivalent within each 128-bit lane.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a shuffle mask is equivalent within each 256-bit lane.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
+}
+
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+                                        ArrayRef<int> Mask,
+                                        SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    if (Mask[i] == SM_SentinelZero) {
+      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+        return false;
+      RepeatedMask[i % LaneSize] = SM_SentinelZero;
+      continue;
+    }
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    // Adjust second vector indices to start at LaneSize instead of Size.
+    int LocalM =
+        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] = LocalM;
+    else if (RepeatedMask[i % LaneSize] != LocalM)
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
+/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// arguments.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
+///
+/// It returns true if the mask is exactly as wide as the argument list, and
+/// each element of the mask is either -1 (signifying undef) or the value given
+/// in the argument.
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+                                ArrayRef<int> ExpectedMask) {
+  if (Mask.size() != ExpectedMask.size())
+    return false;
+
+  int Size = Mask.size();
+
+  // If the values are build vectors, we can look through them to find
+  // equivalent inputs that make the shuffles equivalent.
+  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+  for (int i = 0; i < Size; ++i) {
+    assert(Mask[i] >= -1 && "Out of bound mask element!");
+    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
+      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+      if (!MaskBV || !ExpectedBV ||
+          MaskBV->getOperand(Mask[i] % Size) !=
+              ExpectedBV->getOperand(ExpectedMask[i] % Size))
+        return false;
+    }
+}
+
+  return true;
+}
+
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in both.
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+                                      ArrayRef<int> ExpectedMask) {
+  int Size = Mask.size();
+  if (Size != (int)ExpectedMask.size())
+    return false;
+
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+      return false;
+    else if (Mask[i] != ExpectedMask[i])
+      return false;
+
+  return true;
+}
+
+/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
+/// example.
+///
+/// NB: We rely heavily on "undef" masks preserving the input lane.
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
+  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
+  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
+
+  unsigned Imm = 0;
+  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
+  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
+  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
+  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
+  return Imm;
+}
+
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+                                          SelectionDAG &DAG) {
+  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
+}
+
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                                     SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable(Mask.size(), false);
+  V1 = peekThroughBitcasts(V1);
+  V2 = peekThroughBitcasts(V2);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  int VectorSizeInBits = V1.getValueSizeInBits();
+  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
+  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy cases.
+    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // Determine shuffle input and normalize the mask.
+    SDValue V = M < Size ? V1 : V2;
+    M %= Size;
+
+    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
+    // the (larger) source element must be UNDEF/ZERO.
+    if ((Size % V.getNumOperands()) == 0) {
+      int Scale = Size / V->getNumOperands();
+      SDValue Op = V.getOperand(M / Scale);
+      if (Op.isUndef() || X86::isZeroNode(Op))
+        Zeroable[i] = true;
+      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+        APInt Val = Cst->getAPIntValue();
+        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+        Val = Val.getLoBits(ScalarSizeInBits);
+        Zeroable[i] = (Val == 0);
+      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+        APInt Val = Cst->getValueAPF().bitcastToAPInt();
+        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+        Val = Val.getLoBits(ScalarSizeInBits);
+        Zeroable[i] = (Val == 0);
+      }
+      continue;
+    }
+
+    // If the BUILD_VECTOR has more elements then all the (smaller) source
+    // elements must be UNDEF or ZERO.
+    if ((V.getNumOperands() % Size) == 0) {
+      int Scale = V->getNumOperands() / Size;
+      bool AllZeroable = true;
+      for (int j = 0; j < Scale; ++j) {
+        SDValue Op = V.getOperand((M * Scale) + j);
+        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
+      }
+      Zeroable[i] = AllZeroable;
+      continue;
+    }
+  }
+
+  return Zeroable;
+}
+
+/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
+static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2,
+                                            const SmallBitVector &Zeroable,
+                                            const X86Subtarget &Subtarget,
+                                            SelectionDAG &DAG) {
+  int Size = Mask.size();
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  const int NumBytes = VT.getSizeInBits() / 8;
+  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
+
+  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
+         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
+         (Subtarget.hasBWI() && VT.is512BitVector()));
+
+  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
+  // Sign bit set in i8 mask means zero element.
+  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
+
+  SDValue V;
+  for (int i = 0; i < NumBytes; ++i) {
+    int M = Mask[i / NumEltBytes];
+    if (M < 0) {
+      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+      continue;
+    }
+    if (Zeroable[i / NumEltBytes]) {
+      PSHUFBMask[i] = ZeroMask;
+      continue;
+    }
+
+    // We can only use a single input of V1 or V2.
+    SDValue SrcV = (M >= Size ? V2 : V1);
+    if (V && V != SrcV)
+      return SDValue();
+    V = SrcV;
+    M %= Size;
+
+    // PSHUFB can't cross lanes, ensure this doesn't happen.
+    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
+      return SDValue();
+
+    M = M % LaneSize;
+    M = M * NumEltBytes + (i % NumEltBytes);
+    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
+  }
+  assert(V && "Failed to find a source input");
+
+  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
+  return DAG.getBitcast(
+      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
+                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
+}
+
+// X86 has dedicated unpack instructions that can handle specific blend
+// operations: UNPCKH and UNPCKL.
+static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
+  SmallVector<int, 8> Unpckl;
+  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
+  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+
+  SmallVector<int, 8> Unpckh;
+  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
+  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+  // Commute and try again.
+  ShuffleVectorSDNode::commuteMask(Unpckl);
+  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
+
+  ShuffleVectorSDNode::commuteMask(Unpckh);
+  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
+
+  return SDValue();
+}
+
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           const SmallBitVector &Zeroable,
+                                           SelectionDAG &DAG) {
+  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+  MVT EltVT = VT.getVectorElementType();
+  SDValue Zero = DAG.getConstant(0, DL, EltVT);
+  SDValue AllOnes =
+      DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+  SDValue V;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+    if (Mask[i] % Size != i)
+      return SDValue(); // Not a blend.
+    if (!V)
+      V = Mask[i] < Size ? V1 : V2;
+    else if (V != (Mask[i] < Size ? V1 : V2))
+      return SDValue(); // Can only let one input through the mask.
+
+    VMaskOps[i] = AllOnes;
+  }
+  if (!V)
+    return SDValue(); // No non-zeroable elements!
+
+  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
+  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
+}
+
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
+static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            SelectionDAG &DAG) {
+  assert(VT.isInteger() && "Only supports integer vector types!");
+  MVT EltVT = VT.getVectorElementType();
+  int NumEltBits = EltVT.getSizeInBits();
+  SDValue Zero = DAG.getConstant(0, DL, EltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+                                    EltVT);
+  SmallVector<SDValue, 16> MaskOps;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
+      return SDValue(); // Shuffled input!
+    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+  }
+
+  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
+  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+  // We have to cast V2 around.
+  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+                                      DAG.getBitcast(MaskVT, V1Mask),
+                                      DAG.getBitcast(MaskVT, V2)));
+  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Original,
+                                         const SmallBitVector &Zeroable,
+                                         const X86Subtarget &Subtarget,
+                                         SelectionDAG &DAG) {
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+  SmallVector<int, 8> Mask(Original.begin(), Original.end());
+  bool ForceV1Zero = false, ForceV2Zero = false;
+
+  // Attempt to generate the binary blend mask. If an input is zero then
+  // we can use any lane.
+  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
+  unsigned BlendMask = 0;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+    if (M == i)
+      continue;
+    if (M == i + Size) {
+      BlendMask |= 1u << i;
+      continue;
+    }
+    if (Zeroable[i]) {
+      if (V1IsZero) {
+        ForceV1Zero = true;
+        Mask[i] = i;
+        continue;
+      }
+      if (V2IsZero) {
+        ForceV2Zero = true;
+        BlendMask |= 1u << i;
+        Mask[i] = i + Size;
+        continue;
+      }
+    }
+    return SDValue(); // Shuffled input!
+  }
+
+  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+  if (ForceV1Zero)
+    V1 = getZeroVector(VT, Subtarget, DAG, DL);
+  if (ForceV2Zero)
+    V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+    unsigned ScaledMask = 0;
+    for (int i = 0; i != Size; ++i)
+      if (BlendMask & (1u << i))
+        for (int j = 0; j != Scale; ++j)
+          ScaledMask |= 1u << (i * Scale + j);
+    return ScaledMask;
+  };
+
+  switch (VT.SimpleTy) {
+  case MVT::v2f64:
+  case MVT::v4f32:
+  case MVT::v4f64:
+  case MVT::v8f32:
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, DL, MVT::i8));
+
+  case MVT::v4i64:
+  case MVT::v8i32:
+    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+    LLVM_FALLTHROUGH;
+  case MVT::v2i64:
+  case MVT::v4i32:
+    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+    // that instruction.
+    if (Subtarget.hasAVX2()) {
+      // Scale the blend by the number of 32-bit dwords per element.
+      int Scale =  VT.getScalarSizeInBits() / 32;
+      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+      V1 = DAG.getBitcast(BlendVT, V1);
+      V2 = DAG.getBitcast(BlendVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                          DAG.getConstant(BlendMask, DL, MVT::i8)));
+    }
+    LLVM_FALLTHROUGH;
+  case MVT::v8i16: {
+    // For integer shuffles we need to expand the mask and cast the inputs to
+    // v8i16s prior to blending.
+    int Scale = 8 / VT.getVectorNumElements();
+    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = DAG.getBitcast(MVT::v8i16, V2);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
+  }
+
+  case MVT::v16i16: {
+    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+      BlendMask = 0;
+      for (int i = 0; i < 8; ++i)
+        if (RepeatedMask[i] >= 8)
+          BlendMask |= 1u << i;
+      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                         DAG.getConstant(BlendMask, DL, MVT::i8));
+    }
+    LLVM_FALLTHROUGH;
+  }
+  case MVT::v16i8:
+  case MVT::v32i8: {
+    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
+           "256-bit byte-blends require AVX2 support!");
+
+    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
+    if (SDValue Masked =
+            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+      return Masked;
+
+    // Scale the blend by the number of bytes per element.
+    int Scale = VT.getScalarSizeInBits() / 8;
+
+    // This form of blend is always done on bytes. Compute the byte vector
+    // type.
+    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+
+    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+    // mix of LLVM's code generator and the x86 backend. We tell the code
+    // generator that boolean values in the elements of an x86 vector register
+    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+    // mapping a select to operand #1, and 'false' mapping to operand #2. The
+    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+    // of the element (the remaining are ignored) and 0 in that high bit would
+    // mean operand #1 while 1 in the high bit would mean operand #2. So while
+    // the LLVM model for boolean values in vector elements gets the relevant
+    // bit set, it is set backwards and over constrained relative to x86's
+    // actual model.
+    SmallVector<SDValue, 32> VSELECTMask;
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      for (int j = 0; j < Scale; ++j)
+        VSELECTMask.push_back(
+            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
+                                          MVT::i8));
+
+    V1 = DAG.getBitcast(BlendVT, V1);
+    V2 = DAG.getBitcast(BlendVT, V2);
+    return DAG.getBitcast(
+        VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+                        DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+  }
+
+  default:
+    llvm_unreachable("Not a supported integer vector type!");
+  }
+}
+
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
+                                                   SDValue V1, SDValue V2,
+                                                   ArrayRef<int> Mask,
+                                                   SelectionDAG &DAG) {
+  // We build up the blend mask while checking whether a blend is a viable way
+  // to reduce the shuffle.
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+    if (BlendMask[Mask[i] % Size] < 0)
+      BlendMask[Mask[i] % Size] = Mask[i];
+    else if (BlendMask[Mask[i] % Size] != Mask[i])
+      return SDValue(); // Can't blend in the needed input!
+
+    PermuteMask[i] = Mask[i] % Size;
+  }
+
+  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// blends and permutes.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
+                                                          MVT VT, SDValue V1,
+                                                          SDValue V2,
+                                                          ArrayRef<int> Mask,
+                                                          SelectionDAG &DAG) {
+  // Shuffle the input elements into the desired positions in V1 and V2 and
+  // blend them together.
+  SmallVector<int, 32> V1Mask(Mask.size(), -1);
+  SmallVector<int, 32> V2Mask(Mask.size(), -1);
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] < Size) {
+      V1Mask[i] = Mask[i];
+      BlendMask[i] = i;
+    } else if (Mask[i] >= Size) {
+      V2Mask[i] = Mask[i] - Size;
+      BlendMask[i] = i + Size;
+    }
+
+  // Try to lower with the simpler initial blend strategy unless one of the
+  // input shuffles would be a no-op. We prefer to shuffle inputs as the
+  // shuffle may be able to fold with a load or other benefit. However, when
+  // we'll have to do 2x as many shuffles in order to achieve this, blending
+  // first is a better strategy.
+  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+    if (SDValue BlendPerm =
+            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+      return BlendPerm;
+
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+}
+
+/// \brief Try to lower a vector shuffle as a rotation.
+///
+/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+                                      ArrayRef<int> Mask) {
+  int NumElts = Mask.size();
+
+  // We need to detect various ways of spelling a rotation:
+  //   [11, 12, 13, 14, 15,  0,  1,  2]
+  //   [-1, 12, 13, 14, -1, -1,  1, -1]
+  //   [-1, -1, -1, -1, -1, -1,  1,  2]
+  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
+  //   [-1,  4,  5,  6, -1, -1,  9, -1]
+  //   [-1,  4,  5,  6, -1, -1, -1, -1]
+  int Rotation = 0;
+  SDValue Lo, Hi;
+  for (int i = 0; i < NumElts; ++i) {
+    int M = Mask[i];
+    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+           "Unexpected mask index.");
+    if (M < 0)
+      continue;
+
+    // Determine where a rotated vector would have started.
+    int StartIdx = i - (M % NumElts);
+    if (StartIdx == 0)
+      // The identity rotation isn't interesting, stop.
+      return -1;
+
+    // If we found the tail of a vector the rotation must be the missing
+    // front. If we found the head of a vector, it must be how much of the
+    // head.
+    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
+
+    if (Rotation == 0)
+      Rotation = CandidateRotation;
+    else if (Rotation != CandidateRotation)
+      // The rotations don't match, so we can't match this mask.
+      return -1;
+
+    // Compute which value this mask is pointing at.
+    SDValue MaskV = M < NumElts ? V1 : V2;
+
+    // Compute which of the two target values this index should be assigned
+    // to. This reflects whether the high elements are remaining or the low
+    // elements are remaining.
+    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+    // Either set up this value if we've not encountered it before, or check
+    // that it remains consistent.
+    if (!TargetV)
+      TargetV = MaskV;
+    else if (TargetV != MaskV)
+      // This may be a rotation, but it pulls from the inputs in some
+      // unsupported interleaving.
+      return -1;
+  }
+
+  // Check that we successfully analyzed the mask, and normalize the results.
+  assert(Rotation != 0 && "Failed to locate a viable rotation!");
+  assert((Lo || Hi) && "Failed to find a rotated input vector!");
+  if (!Lo)
+    Lo = Hi;
+  else if (!Hi)
+    Hi = Lo;
+
+  V1 = Lo;
+  V2 = Hi;
+
+  return Rotation;
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such an pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
+  // Don't accept any shuffles with zero elements.
+  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+    return -1;
+
+  // PALIGNR works on 128-bit lanes.
+  SmallVector<int, 16> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+    return -1;
+
+  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+  if (Rotation <= 0)
+    return -1;
+
+  // PALIGNR rotates bytes, so we need to scale the
+  // rotation based on how many bytes are in the vector lane.
+  int NumElts = RepeatedMask.size();
+  int Scale = 16 / NumElts;
+  return Rotation * Scale;
+}
+
+static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
+                                              SDValue V1, SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              const X86Subtarget &Subtarget,
+                                              SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  SDValue Lo = V1, Hi = V2;
+  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
+  if (ByteRotation <= 0)
+    return SDValue();
+
+  // Cast the inputs to i8 vector of correct length to match PALIGNR or
+  // PSLLDQ/PSRLDQ.
+  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+  Lo = DAG.getBitcast(ByteVT, Lo);
+  Hi = DAG.getBitcast(ByteVT, Hi);
+
+  // SSSE3 targets can use the palignr instruction.
+  if (Subtarget.hasSSSE3()) {
+    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
+           "512-bit PALIGNR requires BWI instructions");
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
+                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
+  }
+
+  assert(VT.is128BitVector() &&
+         "Rotate-based lowering only supports 128-bit lowering!");
+  assert(Mask.size() <= 16 &&
+         "Can shuffle at most 16 bytes in a 128-bit vector!");
+  assert(ByteVT == MVT::v16i8 &&
+         "SSE2 rotate lowering only needed for v16i8!");
+
+  // Default SSE2 implementation
+  int LoByteShift = 16 - ByteRotation;
+  int HiByteShift = ByteRotation;
+
+  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
+                                DAG.getConstant(LoByteShift, DL, MVT::i8));
+  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
+                                DAG.getConstant(HiByteShift, DL, MVT::i8));
+  return DAG.getBitcast(VT,
+                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
+}
+
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors; This routine will
+/// try to generically lower a vector shuffle through such an pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+                                          SDValue V1, SDValue V2,
+                                          ArrayRef<int> Mask,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) {
+  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+         "Only 32-bit and 64-bit elements are supported!");
+
+  // 128/256-bit vectors are only supported with VLX.
+  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+         && "VLX required for 128/256-bit vectors");
+
+  SDValue Lo = V1, Hi = V2;
+  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+  if (Rotation <= 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+                     DAG.getConstant(Rotation, DL, MVT::i8));
+}
+
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSHL : (little-endian) left bit shift.
+/// [ zz, 0, zz,  2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [  1, zz,  3, zz]
+/// [ -1, -1,  7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz,  0,  1,  2,  3,  4,  5,  6]
+/// [ zz, zz, -1, -1,  2,  3,  4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1,  1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [  5, 6,  7, zz, zz, zz, zz, zz]
+/// [ -1, 5,  6,  7, zz, zz, zz, zz]
+/// [  1, 2, -1, -1, -1, -1, zz, zz]
+static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
+                                     unsigned ScalarSizeInBits,
+                                     ArrayRef<int> Mask, int MaskOffset,
+                                     const SmallBitVector &Zeroable,
+                                     const X86Subtarget &Subtarget) {
+  int Size = Mask.size();
+  unsigned SizeInBits = Size * ScalarSizeInBits;
+
+  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i < Size; i += Scale)
+      for (int j = 0; j < Shift; ++j)
+        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+          return false;
+
+    return true;
+  };
+
+  auto MatchShift = [&](int Shift, int Scale, bool Left) {
+    for (int i = 0; i != Size; i += Scale) {
+      unsigned Pos = Left ? i + Shift : i;
+      unsigned Low = Left ? i : i + Shift;
+      unsigned Len = Scale - Shift;
+      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
+        return -1;
+    }
+
+    int ShiftEltBits = ScalarSizeInBits * Scale;
+    bool ByteShift = ShiftEltBits > 64;
+    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
+
+    // Normalize the scale for byte shifts to still produce an i64 element
+    // type.
+    Scale = ByteShift ? Scale / 2 : Scale;
+
+    // We need to round trip through the appropriate type for the shift.
+    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
+    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
+                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
+    return (int)ShiftAmt;
+  };
+
+  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+  // keep doubling the size of the integer elements up to that. We can
+  // then shift the elements of the integer vector by whole multiples of
+  // their width within the elements of the larger integer vector. Test each
+  // multiple to see if we can find a match with the moved element indices
+  // and that the shifted in elements are all zeroable.
+  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
+  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
+    for (int Shift = 1; Shift != Scale; ++Shift)
+      for (bool Left : {true, false})
+        if (CheckZeros(Shift, Scale, Left)) {
+          int ShiftAmt = MatchShift(Shift, Scale, Left);
+          if (0 < ShiftAmt)
+            return ShiftAmt;
+        }
+
+  // no match
+  return -1;
+}
+
+static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         const SmallBitVector &Zeroable,
+                                         const X86Subtarget &Subtarget,
+                                         SelectionDAG &DAG) {
+  int Size = Mask.size();
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+  MVT ShiftVT;
+  SDValue V = V1;
+  unsigned Opcode;
+
+  // Try to match shuffle against V1 shift.
+  int ShiftAmt = matchVectorShuffleAsShift(
+      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
+
+  // If V1 failed, try to match shuffle against V2 shift.
+  if (ShiftAmt < 0) {
+    ShiftAmt =
+        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
+                                  Mask, Size, Zeroable, Subtarget);
+    V = V2;
+  }
+
+  if (ShiftAmt < 0)
+    return SDValue();
+
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+         "Illegal integer vector type");
+  V = DAG.getBitcast(ShiftVT, V);
+  V = DAG.getNode(Opcode, DL, ShiftVT, V,
+                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
+  return DAG.getBitcast(VT, V);
+}
+
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           const SmallBitVector &Zeroable,
+                                           SelectionDAG &DAG) {
+  int Size = Mask.size();
+  int HalfSize = Size / 2;
+  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+  // Upper half must be undefined.
+  if (!isUndefInRange(Mask, HalfSize, HalfSize))
+    return SDValue();
+
+  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+  // Remainder of lower half result is zero and upper half is all undef.
+  auto LowerAsEXTRQ = [&]() {
+    // Determine the extraction length from the part of the
+    // lower half that isn't zeroable.
+    int Len = HalfSize;
+    for (; Len > 0; --Len)
+      if (!Zeroable[Len - 1])
+        break;
+    assert(Len > 0 && "Zeroable shuffle mask");
+
+    // Attempt to match first Len sequential elements from the lower half.
+    SDValue Src;
+    int Idx = -1;
+    for (int i = 0; i != Len; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue;
+      SDValue &V = (M < Size ? V1 : V2);
+      M = M % Size;
+
+      // The extracted elements must start at a valid index and all mask
+      // elements must be in the lower half.
+      if (i > M || M >= HalfSize)
+        return SDValue();
+
+      if (Idx < 0 || (Src == V && Idx == (M - i))) {
+        Src = V;
+        Idx = M - i;
+        continue;
+      }
+      return SDValue();
+    }
+
+    if (Idx < 0)
+      return SDValue();
+
+    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+                       DAG.getConstant(BitLen, DL, MVT::i8),
+                       DAG.getConstant(BitIdx, DL, MVT::i8));
+  };
+
+  if (SDValue ExtrQ = LowerAsEXTRQ())
+    return ExtrQ;
+
+  // INSERTQ: Extract lowest Len elements from lower half of second source and
+  // insert over first source, starting at Idx.
+  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+  auto LowerAsInsertQ = [&]() {
+    for (int Idx = 0; Idx != HalfSize; ++Idx) {
+      SDValue Base;
+
+      // Attempt to match first source from mask before insertion point.
+      if (isUndefInRange(Mask, 0, Idx)) {
+        /* EMPTY */
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+        Base = V1;
+      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+        Base = V2;
+      } else {
+        continue;
+      }
+
+      // Extend the extraction length looking to match both the insertion of
+      // the second source and the remaining elements of the first.
+      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+        SDValue Insert;
+        int Len = Hi - Idx;
+
+        // Match insertion.
+        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+          Insert = V1;
+        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+          Insert = V2;
+        } else {
+          continue;
+        }
+
+        // Match the remaining elements of the lower half.
+        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
+          /* EMPTY */
+        } else if ((!Base || (Base == V1)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
+          Base = V1;
+        } else if ((!Base || (Base == V2)) &&
+                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+                                              Size + Hi)) {
+          Base = V2;
+        } else {
+          continue;
+        }
+
+        // We may not have a base (first source) - this can safely be undefined.
+        if (!Base)
+          Base = DAG.getUNDEF(VT);
+
+        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
+                           DAG.getConstant(BitLen, DL, MVT::i8),
+                           DAG.getConstant(BitIdx, DL, MVT::i8));
+      }
+    }
+
+    return SDValue();
+  };
+
+  if (SDValue InsertQ = LowerAsInsertQ())
+    return InsertQ;
+
+  return SDValue();
+}
+
+/// \brief Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget. The extended elements are consecutive and
+/// begin and can start from an offseted element index in the input; to
+/// avoid excess shuffling the offset must either being in the bottom lane
+/// or at the start of a higher lane. All extended elements must be from
+/// the same lane.
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
+    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(Scale > 1 && "Need a scale to extend.");
+  int EltBits = VT.getScalarSizeInBits();
+  int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = 128 / EltBits;
+  int OffsetLane = Offset / NumEltsPerLane;
+  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+         "Only 8, 16, and 32 bit elements can be extended.");
+  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+  assert(0 <= Offset && "Extension offset must be positive.");
+  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+         "Extension offset must be in the first lane or start an upper lane.");
+
+  // Check that an index is in same lane as the base offset.
+  auto SafeOffset = [&](int Idx) {
+    return OffsetLane == (Idx / NumEltsPerLane);
+  };
+
+  // Shift along an input so that the offset base moves to the first element.
+  auto ShuffleOffset = [&](SDValue V) {
+    if (!Offset)
+      return V;
+
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = 0; i * Scale < NumElements; ++i) {
+      int SrcIdx = i + Offset;
+      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+    }
+    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+  };
+
+  // Found a valid zext mask! Try various lowering strategies based on the
+  // input type and available ISA extensions.
+  if (Subtarget.hasSSE41()) {
+    // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+    // PUNPCK will catch this in a later shuffle match.
+    if (Offset && Scale == 2 && VT.is128BitVector())
+      return SDValue();
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+                                 NumElements / Scale);
+    InputV = ShuffleOffset(InputV);
+
+    // For 256-bit vectors, we only need the lower (128-bit) input half.
+    // For 512-bit vectors, we only need the lower input half or quarter.
+    if (VT.getSizeInBits() > 128)
+      InputV = extractSubVector(InputV, 0, DAG, DL,
+                                std::max(128, (int)VT.getSizeInBits() / Scale));
+
+    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+    return DAG.getBitcast(VT, InputV);
+  }
+
+  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
+  // For any extends we can cheat for larger element sizes and use shuffle
+  // instructions that can fold with a load and/or copy.
+  if (AnyExt && EltBits == 32) {
+    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+                         -1};
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                        DAG.getBitcast(MVT::v4i32, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+  }
+  if (AnyExt && EltBits == 16 && Scale > 2) {
+    int PSHUFDMask[4] = {Offset / 2, -1,
+                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
+    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                         DAG.getBitcast(MVT::v4i32, InputV),
+                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+    int PSHUFWMask[4] = {1, -1, -1, -1};
+    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
+    return DAG.getBitcast(
+        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
+                        DAG.getBitcast(MVT::v8i16, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
+  }
+
+  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
+  // to 64-bits.
+  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
+    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
+    assert(VT.is128BitVector() && "Unexpected vector width!");
+
+    int LoIdx = Offset * EltBits;
+    SDValue Lo = DAG.getBitcast(
+        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+        !SafeOffset(Offset + 1))
+      return DAG.getBitcast(VT, Lo);
+
+    int HiIdx = (Offset + 1) * EltBits;
+    SDValue Hi = DAG.getBitcast(
+        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+                                DAG.getConstant(EltBits, DL, MVT::i8),
+                                DAG.getConstant(HiIdx, DL, MVT::i8)));
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
+  }
+
+  // If this would require more than 2 unpack instructions to expand, use
+  // pshufb when available. We can only use more than 2 unpack instructions
+  // when zero extending i8 elements which also makes it easier to use pshufb.
+  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
+    assert(NumElements == 16 && "Unexpected byte vector width!");
+    SDValue PSHUFBMask[16];
+    for (int i = 0; i < 16; ++i) {
+      int Idx = Offset + (i / Scale);
+      PSHUFBMask[i] = DAG.getConstant(
+          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+    }
+    InputV = DAG.getBitcast(MVT::v16i8, InputV);
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
+  }
+
+  // If we are extending from an offset, ensure we start on a boundary that
+  // we can unpack from.
+  int AlignToUnpack = Offset % (NumElements / Scale);
+  if (AlignToUnpack) {
+    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+    for (int i = AlignToUnpack; i < NumElements; ++i)
+      ShMask[i - AlignToUnpack] = i;
+    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+    Offset -= AlignToUnpack;
+  }
+
+  // Otherwise emit a sequence of unpacks.
+  do {
+    unsigned UnpackLoHi = X86ISD::UNPCKL;
+    if (Offset >= (NumElements / 2)) {
+      UnpackLoHi = X86ISD::UNPCKH;
+      Offset -= (NumElements / 2);
+    }
+
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                         : getZeroVector(InputVT, Subtarget, DAG, DL);
+    InputV = DAG.getBitcast(InputVT, InputV);
+    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
+    Scale /= 2;
+    EltBits *= 2;
+    NumElements /= 2;
+  } while (Scale > 1);
+  return DAG.getBitcast(VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering,  it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+    SelectionDAG &DAG) {
+  int Bits = VT.getSizeInBits();
+  int NumLanes = Bits / 128;
+  int NumElements = VT.getVectorNumElements();
+  int NumEltsPerLane = NumElements / NumLanes;
+  assert(VT.getScalarSizeInBits() <= 32 &&
+         "Exceeds 32-bit integer zero extension limit");
+  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
+
+  // Define a helper function to check a particular ext-scale and lower to it if
+  // valid.
+  auto Lower = [&](int Scale) -> SDValue {
+    SDValue InputV;
+    bool AnyExt = true;
+    int Offset = 0;
+    int Matches = 0;
+    for (int i = 0; i < NumElements; ++i) {
+      int M = Mask[i];
+      if (M < 0)
+        continue; // Valid anywhere but doesn't tell us anything.
+      if (i % Scale != 0) {
+        // Each of the extended elements need to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+
+        // We no longer are in the anyext case.
+        AnyExt = false;
+        continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = M < NumElements ? V1 : V2;
+      M = M % NumElements;
+      if (!InputV) {
+        InputV = V;
+        Offset = M - (i / Scale);
+      } else if (InputV != V)
+        return SDValue(); // Flip-flopping inputs.
+
+      // Offset must start in the lowest 128-bit lane or at the start of an
+      // upper lane.
+      // FIXME: Is it ever worth allowing a negative base offset?
+      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+            (Offset % NumEltsPerLane) == 0))
+        return SDValue();
+
+      // If we are offsetting, all referenced entries must come from the same
+      // lane.
+      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+        return SDValue();
+
+      if ((M % NumElements) != (Offset + (i / Scale)))
+        return SDValue(); // Non-consecutive strided elements.
+      Matches++;
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should always
+    // have already been handled.
+    // FIXME: Maybe handle this here in case during blending we end up with one?
+    if (!InputV)
+      return SDValue();
+
+    // If we are offsetting, don't extend if we only match a single input, we
+    // can always do better by using a basic PSHUF or PUNPCK.
+    if (Offset != 0 && Matches < 2)
+      return SDValue();
+
+    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
+  };
+
+  // The widest scale possible for extending is to a 64-bit integer.
+  assert(Bits % 64 == 0 &&
+         "The number of bits in a vector must be divisible by 64 on x86!");
+  int NumExtElements = Bits / 64;
+
+  // Each iteration, try extending the elements half as much, but into twice as
+  // many elements.
+  for (; NumExtElements < NumElements; NumExtElements *= 2) {
+    assert(NumElements % NumExtElements == 0 &&
+           "The input vector size must be divisible by the extended size.");
+    if (SDValue V = Lower(NumElements / NumExtElements))
+      return V;
+  }
+
+  // General extends failed, but 128-bit vectors may be able to use MOVQ.
+  if (Bits != 128)
+    return SDValue();
+
+  // Returns one of the source operands if the shuffle can be reduced to a
+  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+  auto CanZExtLowHalf = [&]() {
+    for (int i = NumElements / 2; i != NumElements; ++i)
+      if (!Zeroable[i])
+        return SDValue();
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+      return V1;
+    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+      return V2;
+    return SDValue();
+  };
+
+  if (SDValue V = CanZExtLowHalf()) {
+    V = DAG.getBitcast(MVT::v2i64, V);
+    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+    return DAG.getBitcast(VT, V);
+  }
+
+  // No viable ext lowering found.
+  return SDValue();
+}
+
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  V = peekThroughBitcasts(V);
+
+  // If the bitcasts shift the element size, we can't extract an equivalent
+  // element from it.
+  MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
+    // Ensure the scalar operand is the same size as the destination.
+    // FIXME: Add support for scalar truncation where possible.
+    SDValue S = V.getOperand(Idx);
+    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
+      return DAG.getBitcast(EltVT, S);
+  }
+
+  return SDValue();
+}
+
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+  V = peekThroughBitcasts(V);
+  return ISD::isNON_EXTLoad(V.getNode());
+}
+
+/// \brief Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern that we have especially efficient patterns to lower
+/// across all subtarget feature sets.
+static SDValue lowerVectorShuffleAsElementInsertion(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+    SelectionDAG &DAG) {
+  MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
+
+  int V2Index =
+      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
+      Mask.begin();
+  bool IsV1Zeroable = true;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (i != V2Index && !Zeroable[i]) {
+      IsV1Zeroable = false;
+      break;
+    }
+
+  // Check for a single input from a SCALAR_TO_VECTOR node.
+  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+  // all the smarts here sunk into that routine. However, the current
+  // lowering of BUILD_VECTOR makes that nearly impossible until the old
+  // vector shuffle lowering is dead.
+  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
+                                               DAG);
+  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
+    // We need to zext the scalar if it is smaller than an i32.
+    V2S = DAG.getBitcast(EltVT, V2S);
+    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+      // Using zext to expand a narrow element won't work for non-zero
+      // insertions.
+      if (!IsV1Zeroable)
+        return SDValue();
+
+      // Zero-extend directly to i32.
+      ExtVT = MVT::v4i32;
+      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+    }
+    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+             EltVT == MVT::i16) {
+    // Either not inserting from the low element of the input or the input
+    // element size is too small to use VZEXT_MOVL to clear the high bits.
+    return SDValue();
+  }
+
+  if (!IsV1Zeroable) {
+    // If V1 can't be treated as a zero vector we have fewer options to lower
+    // this. We can't support integer vectors or non-zero targets cheaply, and
+    // the V1 elements can't be permuted in any way.
+    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+    if (!VT.isFloatingPoint() || V2Index != 0)
+      return SDValue();
+    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+    V1Mask[V2Index] = -1;
+    if (!isNoopShuffleMask(V1Mask))
+      return SDValue();
+    // This is essentially a special case blend operation, but if we have
+    // general purpose blend operations, they are always faster. Bail and let
+    // the rest of the lowering handle these as blends.
+    if (Subtarget.hasSSE41())
+      return SDValue();
+
+    // Otherwise, use MOVSD or MOVSS.
+    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+           "Only two types of floating point element types to handle!");
+    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+                       ExtVT, V1, V2);
+  }
+
+  // This lowering only works for the low element with floating point vectors.
+  if (VT.isFloatingPoint() && V2Index != 0)
+    return SDValue();
+
+  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
+  if (ExtVT != VT)
+    V2 = DAG.getBitcast(VT, V2);
+
+  if (V2Index != 0) {
+    // If we have 4 or fewer lanes we can cheaply shuffle the element into
+    // the desired position. Otherwise it is more efficient to do a vector
+    // shift left. We know that we can do a vector shift left because all
+    // the inputs are zero.
+    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+      V2Shuffle[V2Index] = 0;
+      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+    } else {
+      V2 = DAG.getBitcast(MVT::v16i8, V2);
+      V2 = DAG.getNode(
+          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
+                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
+                              DAG.getDataLayout(), VT)));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+  }
+  return V2;
+}
+
+/// Try to lower broadcast of a single - truncated - integer element,
+/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
+///
+/// This assumes we have AVX2.
+static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
+                                                  SDValue V0, int BroadcastIdx,
+                                                  const X86Subtarget &Subtarget,
+                                                  SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX2() &&
+         "We can only lower integer broadcasts with AVX2!");
+
+  EVT EltVT = VT.getVectorElementType();
+  EVT V0VT = V0.getValueType();
+
+  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
+
+  EVT V0EltVT = V0VT.getVectorElementType();
+  if (!V0EltVT.isInteger())
+    return SDValue();
+
+  const unsigned EltSize = EltVT.getSizeInBits();
+  const unsigned V0EltSize = V0EltVT.getSizeInBits();
+
+  // This is only a truncation if the original element type is larger.
+  if (V0EltSize <= EltSize)
+    return SDValue();
+
+  assert(((V0EltSize % EltSize) == 0) &&
+         "Scalar type sizes must all be powers of 2 on x86!");
+
+  const unsigned V0Opc = V0.getOpcode();
+  const unsigned Scale = V0EltSize / EltSize;
+  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
+
+  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
+      V0Opc != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+
+  // If we're extracting non-least-significant bits, shift so we can truncate.
+  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
+  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
+  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
+  if (const int OffsetIdx = BroadcastIdx % Scale)
+    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
+            DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
+}
+
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
+static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget &Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+        (Subtarget.hasAVX2() && VT.isInteger())))
+    return SDValue();
+
+  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
+  // we can only broadcast from a register with AVX2.
+  unsigned NumElts = Mask.size();
+  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int i = 0; i != (int)NumElts; ++i) {
+    SmallVector<int, 8> BroadcastMask(NumElts, i);
+    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
+      BroadcastIdx = i;
+      break;
+    }
+  }
+
+  if (BroadcastIdx < 0)
+    return SDValue();
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Go up the chain of (vector) values to find a scalar load that we can
+  // combine with the broadcast.
+  SDValue V = V1;
+  for (;;) {
+    switch (V.getOpcode()) {
+    case ISD::BITCAST: {
+      SDValue VSrc = V.getOperand(0);
+      MVT SrcVT = VSrc.getSimpleValueType();
+      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
+        break;
+      V = VSrc;
+      continue;
+    }
+    case ISD::CONCAT_VECTORS: {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+      continue;
+    }
+    case ISD::INSERT_SUBVECTOR: {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+      if (!ConstantIdx)
+        break;
+
+      int BeginIdx = (int)ConstantIdx->getZExtValue();
+      int EndIdx =
+          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
+      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+        BroadcastIdx -= BeginIdx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+      continue;
+    }
+    }
+    break;
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
+  // First, look through bitcast: if the original value has a larger element
+  // type than the shuffle, the broadcast element is in essence truncated.
+  // Make that explicit to ease folding.
+  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
+    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
+            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
+      return TruncBroadcast;
+
+  MVT BroadcastVT = VT;
+
+  // Peek through any bitcast (only useful for loads).
+  SDValue BC = peekThroughBitcasts(V);
+
+  // Also check the simpler case, where we can directly reuse the scalar.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    V = V.getOperand(BroadcastIdx);
+
+    // If we can't broadcast from a register, check that the input is a load.
+    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
+      return SDValue();
+  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
+    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
+      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
+      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+    }
+
+    // If we are broadcasting a load that is only used by the shuffle
+    // then we can reduce the vector load to the broadcasted scalar load.
+    LoadSDNode *Ld = cast<LoadSDNode>(BC);
+    SDValue BaseAddr = Ld->getOperand(1);
+    EVT SVT = BroadcastVT.getScalarType();
+    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+
+    // Make sure the newly-created LOAD is in the same position as Ld in
+    // terms of dependency. We create a TokenFactor for Ld and V,
+    // and update uses of Ld's output chain to use the TokenFactor.
+    if (Ld->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+                             SDValue(V.getNode(), 1));
+    }
+  } else if (!BroadcastFromReg) {
+    // We can't broadcast from a vector register.
+    return SDValue();
+  } else if (BroadcastIdx != 0) {
+    // We can only broadcast from the zero-element of a vector register,
+    // but it can be advantageous to broadcast from the zero-element of a
+    // subvector.
+    if (!VT.is256BitVector() && !VT.is512BitVector())
+      return SDValue();
+
+    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
+    if (VT == MVT::v4f64 || VT == MVT::v4i64)
+      return SDValue();
+
+    // Only broadcast the zero-element of a 128-bit subvector.
+    unsigned EltSize = VT.getScalarSizeInBits();
+    if (((BroadcastIdx * EltSize) % 128) != 0)
+      return SDValue();
+
+    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
+                    DAG.getIntPtrConstant(BroadcastIdx, DL));
+  }
+
+  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+                    DAG.getBitcast(MVT::f64, V));
+
+  // Bitcast back to the same scalar type as BroadcastVT.
+  MVT SrcVT = V.getSimpleValueType();
+  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
+    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+           "Unexpected vector element size");
+    if (SrcVT.isVector()) {
+      unsigned NumSrcElts = SrcVT.getVectorNumElements();
+      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
+    } else {
+      SrcVT = BroadcastVT.getScalarType();
+    }
+    V = DAG.getBitcast(SrcVT, V);
+  }
+
+  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
+  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
+    V = DAG.getBitcast(MVT::f64, V);
+    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
+    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+  }
+
+  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+}
+
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
+                                         unsigned &InsertPSMask,
+                                         const SmallBitVector &Zeroable,
+                                         ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
+  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  // Attempt to match INSERTPS with one element from VA or VB being
+  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
+  // are updated.
+  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
+                             ArrayRef<int> CandidateMask) {
+    unsigned ZMask = 0;
+    int VADstIndex = -1;
+    int VBDstIndex = -1;
+    bool VAUsedInPlace = false;
+
+    for (int i = 0; i < 4; ++i) {
+      // Synthesize a zero mask from the zeroable elements (includes undefs).
+      if (Zeroable[i]) {
+        ZMask |= 1 << i;
+        continue;
+      }
+
+      // Flag if we use any VA inputs in place.
+      if (i == CandidateMask[i]) {
+        VAUsedInPlace = true;
+        continue;
+      }
+
+      // We can only insert a single non-zeroable element.
+      if (VADstIndex >= 0 || VBDstIndex >= 0)
+        return false;
+
+      if (CandidateMask[i] < 4) {
+        // VA input out of place for insertion.
+        VADstIndex = i;
+      } else {
+        // VB input for insertion.
+        VBDstIndex = i;
+      }
+    }
+
+    // Don't bother if we have no (non-zeroable) element for insertion.
+    if (VADstIndex < 0 && VBDstIndex < 0)
+      return false;
+
+    // Determine element insertion src/dst indices. The src index is from the
+    // start of the inserted vector, not the start of the concatenated vector.
+    unsigned VBSrcIndex = 0;
+    if (VADstIndex >= 0) {
+      // If we have a VA input out of place, we use VA as the V2 element
+      // insertion and don't use the original V2 at all.
+      VBSrcIndex = CandidateMask[VADstIndex];
+      VBDstIndex = VADstIndex;
+      VB = VA;
+    } else {
+      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
+    }
+
+    // If no V1 inputs are used in place, then the result is created only from
+    // the zero mask and the V2 insertion - so remove V1 dependency.
+    if (!VAUsedInPlace)
+      VA = DAG.getUNDEF(MVT::v4f32);
+
+    // Update V1, V2 and InsertPSMask accordingly.
+    V1 = VA;
+    V2 = VB;
+
+    // Insert the V2 element into the desired position.
+    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
+    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+    return true;
+  };
+
+  if (matchAsInsertPS(V1, V2, Mask))
+    return true;
+
+  // Commute and try again.
+  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
+  ShuffleVectorSDNode::commuteMask(CommutedMask);
+  if (matchAsInsertPS(V2, V1, CommutedMask))
+    return true;
+
+  return false;
+}
+
+static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            const SmallBitVector &Zeroable,
+                                            SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+
+  // Attempt to match the insertps pattern.
+  unsigned InsertPSMask;
+  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
+    return SDValue();
+
+  // Insert the V2 element into the desired position.
+  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up with alternating between
+/// the two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
+                                                    SDValue V1, SDValue V2,
+                                                    ArrayRef<int> Mask,
+                                                    SelectionDAG &DAG) {
+  assert(!VT.isFloatingPoint() &&
+         "This routine only supports integer vectors.");
+  assert(VT.is128BitVector() &&
+         "This routine only works on 128-bit vectors.");
+  assert(!V2.isUndef() &&
+         "This routine should only be used when blending two inputs.");
+  assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+  int Size = Mask.size();
+
+  int NumLoInputs =
+      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
+  int NumHiInputs =
+      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
+
+  bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+  auto TryUnpack = [&](int ScalarSize, int Scale) {
+    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
+    SmallVector<int, 16> V2Mask((unsigned)Size, -1);
+
+    for (int i = 0; i < Size; ++i) {
+      if (Mask[i] < 0)
+        continue;
+
+      // Each element of the unpack contains Scale elements from this mask.
+      int UnpackIdx = i / Scale;
+
+      // We only handle the case where V1 feeds the first slots of the unpack.
+      // We rely on canonicalization to ensure this is the case.
+      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+        return SDValue();
+
+      // Setup the mask for this input. The indexing is tricky as we have to
+      // handle the unpack stride.
+      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+          Mask[i] % Size;
+    }
+
+    // If we will have to shuffle both inputs to use the unpack, check whether
+    // we can just unpack first and shuffle the result. If so, skip this unpack.
+    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+        !isNoopShuffleMask(V2Mask))
+      return SDValue();
+
+    // Shuffle the inputs into place.
+    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+    // Cast the inputs to the type we will use to unpack them.
+    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
+    V1 = DAG.getBitcast(UnpackVT, V1);
+    V2 = DAG.getBitcast(UnpackVT, V2);
+
+    // Unpack the inputs and cast the result back to the desired type.
+    return DAG.getBitcast(
+        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                        UnpackVT, V1, V2));
+  };
+
+  // We try each unpack from the largest to the smallest to try and find one
+  // that fits this mask.
+  int OrigScalarSize = VT.getScalarSizeInBits();
+  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
+    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
+      return Unpack;
+
+  // If none of the unpack-rooted lowerings worked (or were profitable) try an
+  // initial unpack.
+  if (NumLoInputs == 0 || NumHiInputs == 0) {
+    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+           "We have to have *some* inputs!");
+    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+    // FIXME: We could consider the total complexity of the permute of each
+    // possible unpacking. Or at the least we should consider how many
+    // half-crossings are created.
+    // FIXME: We could consider commuting the unpacks.
+
+    SmallVector<int, 32> PermMask((unsigned)Size, -1);
+    for (int i = 0; i < Size; ++i) {
+      if (Mask[i] < 0)
+        continue;
+
+      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+      PermMask[i] =
+          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+    }
+    return DAG.getVectorShuffle(
+        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+                            DL, VT, V1, V2),
+        DAG.getUNDEF(VT), PermMask);
+  }
+
+  return SDValue();
+}
+
+/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
+///
+/// This is the basis function for the 2-lane 64-bit shuffles as we have full
+/// support for floating point shuffles but not integer shuffles. These
+/// instructions will incur a domain crossing penalty on some chips though so
+/// it is better to avoid lowering through this for integer vectors where
+/// possible.
+static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (V2.isUndef()) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Straight shuffle of a single input vector. Simulate this by using the
+    // single input as both of the "inputs" to this instruction..
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+
+    if (Subtarget.hasAVX()) {
+      // If we have AVX, we can use VPERMILPS which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+    }
+
+    return DAG.getNode(
+        X86ISD::SHUFP, DL, MVT::v2f64,
+        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+  }
+  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
+  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+
+  // If we have a single input, insert that into V1 if we can do so cheaply.
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return Insertion;
+    // Try inverting the insertion since for v2 masks it is easy to do and we
+    // can't reliably sort the mask one way or the other.
+    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Try to use one of the special instruction patterns to handle two common
+  // blend patterns if a zero-blend above didn't work.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+      // We can either use a special instruction to load over the low double or
+      // to move just the low double.
+      return DAG.getNode(
+          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
+          DL, MVT::v2f64, V2,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
+
+  if (Subtarget.hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+    return V;
+
+  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
+  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
+                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+}
+
+/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
+///
+/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
+/// the integer unit to minimize domain crossing penalties. However, for blends
+/// it falls back to the floating point shuffle operation with appropriate bit
+/// casting.
+static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
+  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
+
+  if (V2.isUndef()) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    // We have to map the mask as it is actually a v4i32 shuffle instruction.
+    V1 = DAG.getBitcast(MVT::v4i32, V1);
+    int WidenedMask[4] = {
+        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
+        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
+    return DAG.getBitcast(
+        MVT::v2i64,
+        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
+  }
+  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+  // If we have a blend of two same-type PACKUS operations and the blend aligns
+  // with the low and high halves, we can just merge the PACKUS operations.
+  // This is particularly important as it lets us merge shuffles that this
+  // routine itself creates.
+  auto GetPackNode = [](SDValue V) {
+    V = peekThroughBitcasts(V);
+    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+  };
+  if (SDValue V1Pack = GetPackNode(V1))
+    if (SDValue V2Pack = GetPackNode(V2)) {
+      EVT PackVT = V1Pack.getValueType();
+      if (PackVT == V2Pack.getValueType())
+        return DAG.getBitcast(MVT::v2i64,
+                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+                                          Mask[0] == 0 ? V1Pack.getOperand(0)
+                                                       : V1Pack.getOperand(1),
+                                          Mask[1] == 2 ? V2Pack.getOperand(0)
+                                                       : V2Pack.getOperand(1)));
+    }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // When loading a scalar and then shuffling it into a vector we can often do
+  // the insertion cheaply.
+  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return Insertion;
+  // Try inverting the insertion since for v2 masks it is easy to do and we
+  // can't reliably sort the mask one way or the other.
+  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
+    return Insertion;
+
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget.hasSSE41();
+  if (IsBlendSupported)
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use byte rotation instructions.
+  // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
+  if (Subtarget.hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+      return Rotate;
+
+  // If we have direct support for blends, we should lower by decomposing into
+  // a permute. That will be faster than the domain cross.
+  if (IsBlendSupported)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+                                                      Mask, DAG);
+
+  // We implement this with SHUFPD which is pretty lame because it will likely
+  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
+  // However, all the alternatives are still more cycles and newer chips don't
+  // have this problem. It would be really nice if x86 had better shuffles here.
+  V1 = DAG.getBitcast(MVT::v2f64, V1);
+  V2 = DAG.getBitcast(MVT::v2f64, V2);
+  return DAG.getBitcast(MVT::v2i64,
+                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+  // This routine only handles 128-bit shufps.
+  assert(Mask.size() == 4 && "Unsupported mask size!");
+  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
+  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
+  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
+  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
+
+  // To lower with a single SHUFPS we need to have the low half and high half
+  // each requiring a single input.
+  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
+    return false;
+  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
+    return false;
+
+  return true;
+}
+
+/// \brief Lower a vector shuffle using the SHUFPS instruction.
+///
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  SDValue LowV = V1, HighV = V2;
+  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 1) {
+    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
+
+    // Compute the index adjacent to V2Index and in the same half by toggling
+    // the low bit.
+    int V2AdjIndex = V2Index ^ 1;
+
+    if (Mask[V2AdjIndex] < 0) {
+      // Handles all the cases where we have a single V2 element and an undef.
+      // This will only ever happen in the high lanes because we commute the
+      // vector otherwise.
+      if (V2Index < 2)
+        std::swap(LowV, HighV);
+      NewMask[V2Index] -= 4;
+    } else {
+      // Handle the case where the V2 element ends up adjacent to a V1 element.
+      // To make this work, blend them together as the first step.
+      int V1Index = V2AdjIndex;
+      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+      // Now proceed to reconstruct the final blend as we have the necessary
+      // high or low half formed.
+      if (V2Index < 2) {
+        LowV = V2;
+        HighV = V1;
+      } else {
+        HighV = V2;
+      }
+      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+    }
+  } else if (NumV2Elements == 2) {
+    if (Mask[0] < 4 && Mask[1] < 4) {
+      // Handle the easy case where we have V1 in the low lanes and V2 in the
+      // high lanes.
+      NewMask[2] -= 4;
+      NewMask[3] -= 4;
+    } else if (Mask[2] < 4 && Mask[3] < 4) {
+      // We also handle the reversed case because this utility may get called
+      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
+      // arrange things in the right direction.
+      NewMask[0] -= 4;
+      NewMask[1] -= 4;
+      HighV = V1;
+      LowV = V2;
+    } else {
+      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+      // trying to place elements directly, just blend them and set up the final
+      // shuffle to place them.
+
+      // The first two blend mask elements are for V1, the second two are for
+      // V2.
+      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+                          Mask[2] < 4 ? Mask[2] : Mask[3],
+                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
+
+      // Now we do a normal shuffle of V1 by giving V1 as both operands to
+      // a blend.
+      LowV = HighV = V1;
+      NewMask[0] = Mask[0] < 4 ? 0 : 2;
+      NewMask[1] = Mask[0] < 4 ? 2 : 0;
+      NewMask[2] = Mask[2] < 4 ? 1 : 3;
+      NewMask[3] = Mask[2] < 4 ? 3 : 1;
+    }
+  }
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
+                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
+}
+
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (Subtarget.hasSSE3()) {
+      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+    }
+
+    if (Subtarget.hasAVX()) {
+      // If we have AVX, we can use VPERMILPS which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+    }
+
+    // Otherwise, use a straight shuffle of a single input vector. We pass the
+    // input vector to both operands to simulate this with a SHUFPS.
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+  }
+
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return V;
+
+  if (Subtarget.hasSSE41()) {
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+    // Use INSERTPS if we can complete the shuffle efficiently.
+    if (SDValue V =
+            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+      return V;
+
+    if (!isSingleSHUFPSMask(Mask))
+      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+              DL, MVT::v4f32, V1, V2, Mask, DAG))
+        return BlendPerm;
+  }
+
+  // Use low/high mov instructions.
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+    return V;
+
+  // Otherwise fall back to a SHUFPS lowering strategy.
+  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Lower 4-lane i32 vector shuffles.
+///
+/// We try to handle these with integer-domain shuffles where we can, but for
+/// blends we use the floating point domain blend instructions.
+static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Straight shuffle of a single input vector. For everything from SSE2
+    // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
+      Mask = UnpackHiMask;
+
+    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+  }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return V;
+
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget.hasSSE41();
+  if (IsBlendSupported)
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
+                                                   Zeroable, DAG))
+    return Masked;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use byte rotation instructions.
+  // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
+  if (Subtarget.hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Assume that a single SHUFPS is faster than an alternative sequence of
+  // multiple instructions (even if the CPU has a domain penalty).
+  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+  if (!isSingleSHUFPSMask(Mask)) {
+    // If we have direct support for blends, we should lower by decomposing into
+    // a permute. That will be faster than the domain cross.
+    if (IsBlendSupported)
+      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+                                                        Mask, DAG);
+
+    // Try to lower by permuting the inputs into an unpack instruction.
+    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+            DL, MVT::v4i32, V1, V2, Mask, DAG))
+      return Unpack;
+  }
+
+  // We implement this with SHUFPS because it can blend from two vectors.
+  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
+  // up the inputs, bypassing domain shift penalties that we would encur if we
+  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
+  // relevant.
+  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
+  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
+  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
+  return DAG.getBitcast(MVT::v4i32, ShufPS);
+}
+
+/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
+/// shuffle lowering, and the most complex part.
+///
+/// The lowering strategy is to try to form pairs of input lanes which are
+/// targeted at the same half of the final vector, and then use a dword shuffle
+/// to place them onto the right half, and finally unpack the paired lanes into
+/// their final position.
+///
+/// The exact breakdown of how to form these dword pairs and align them on the
+/// correct sides is really tricky. See the comments within the function for
+/// more of the details.
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
+  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+  assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
+  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
+
+  SmallVector<int, 4> LoInputs;
+  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
+               [](int M) { return M >= 0; });
+  std::sort(LoInputs.begin(), LoInputs.end());
+  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
+  SmallVector<int, 4> HiInputs;
+  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
+               [](int M) { return M >= 0; });
+  std::sort(HiInputs.begin(), HiInputs.end());
+  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
+  int NumLToL =
+      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
+  int NumHToL = LoInputs.size() - NumLToL;
+  int NumLToH =
+      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
+  int NumHToH = HiInputs.size() - NumLToH;
+  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
+  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
+  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
+  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+
+  // If we are splatting two values from one half - one to each half, then
+  // we can shuffle that half so each is splatted to a dword, then splat those
+  // to their respective halves.
+  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
+                        int DOffset) {
+    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
+    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
+    V = DAG.getNode(ShufWOp, DL, VT, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+    V = DAG.getBitcast(PSHUFDVT, V);
+    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
+    return DAG.getBitcast(VT, V);
+  };
+
+  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
+    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
+  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
+    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
+
+  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
+  // such inputs we can swap two of the dwords across the half mark and end up
+  // with <=2 inputs to each half in each half. Once there, we can fall through
+  // to the generic code below. For example:
+  //
+  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
+  //
+  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
+  // and an existing 2-into-2 on the other half. In this case we may have to
+  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
+  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
+  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
+  // because any other situation (including a 3-into-1 or 1-into-3 in the other
+  // half than the one we target for fixing) will be fixed when we re-enter this
+  // path. We will also combine away any sequence of PSHUFD instructions that
+  // result into a single instruction. Here is an example of the tricky case:
+  //
+  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
+  //
+  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
+  //
+  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
+  //
+  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
+  //
+  // The result is fine to be handled by the generic logic.
+  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
+                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
+                          int AOffset, int BOffset) {
+    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
+           "Must call this with A having 3 or 1 inputs from the A half.");
+    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
+           "Must call this with B having 1 or 3 inputs from the B half.");
+    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
+           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+
+    bool ThreeAInputs = AToAInputs.size() == 3;
+
+    // Compute the index of dword with only one word among the three inputs in
+    // a half by taking the sum of the half with three inputs and subtracting
+    // the sum of the actual three inputs. The difference is the remaining
+    // slot.
+    int ADWord, BDWord;
+    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
+    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
+    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
+    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
+    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
+    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
+    int TripleNonInputIdx =
+        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+    TripleDWord = TripleNonInputIdx / 2;
+
+    // We use xor with one to compute the adjacent DWord to whichever one the
+    // OneInput is in.
+    OneInputDWord = (OneInput / 2) ^ 1;
+
+    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
+    // and BToA inputs. If there is also such a problem with the BToB and AToB
+    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
+    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
+    // is essential that we don't *create* a 3<-1 as then we might oscillate.
+    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
+      // Compute how many inputs will be flipped by swapping these DWords. We
+      // need
+      // to balance this to ensure we don't form a 3-1 shuffle in the other
+      // half.
+      int NumFlippedAToBInputs =
+          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
+          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
+      int NumFlippedBToBInputs =
+          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
+          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
+      if ((NumFlippedAToBInputs == 1 &&
+           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
+          (NumFlippedBToBInputs == 1 &&
+           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
+        // We choose whether to fix the A half or B half based on whether that
+        // half has zero flipped inputs. At zero, we may not be able to fix it
+        // with that half. We also bias towards fixing the B half because that
+        // will more commonly be the high half, and we have to bias one way.
+        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
+                                                       ArrayRef<int> Inputs) {
+          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
+          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
+          // Determine whether the free index is in the flipped dword or the
+          // unflipped dword based on where the pinned index is. We use this bit
+          // in an xor to conditionally select the adjacent dword.
+          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
+          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+          if (IsFixIdxInput == IsFixFreeIdxInput)
+            FixFreeIdx += 1;
+          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
+          assert(IsFixIdxInput != IsFixFreeIdxInput &&
+                 "We need to be changing the number of flipped inputs!");
+          int PSHUFHalfMask[] = {0, 1, 2, 3};
+          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
+          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+                          MVT::v8i16, V,
+                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+
+          for (int &M : Mask)
+            if (M >= 0 && M == FixIdx)
+              M = FixFreeIdx;
+            else if (M >= 0 && M == FixFreeIdx)
+              M = FixIdx;
+        };
+        if (NumFlippedBToBInputs != 0) {
+          int BPinnedIdx =
+              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
+        } else {
+          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
+          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
+          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
+        }
+      }
+    }
+
+    int PSHUFDMask[] = {0, 1, 2, 3};
+    PSHUFDMask[ADWord] = BDWord;
+    PSHUFDMask[BDWord] = ADWord;
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+    // Adjust the mask to match the new locations of A and B.
+    for (int &M : Mask)
+      if (M >= 0 && M/2 == ADWord)
+        M = 2 * BDWord + M % 2;
+      else if (M >= 0 && M/2 == BDWord)
+        M = 2 * ADWord + M % 2;
+
+    // Recurse back into this routine to re-compute state now that this isn't
+    // a 3 and 1 problem.
+    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+                                                     DAG);
+  };
+  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
+    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
+  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
+
+  // At this point there are at most two inputs to the low and high halves from
+  // each half. That means the inputs can always be grouped into dwords and
+  // those dwords can then be moved to the correct half with a dword shuffle.
+  // We use at most one low and one high word shuffle to collect these paired
+  // inputs into dwords, and finally a dword shuffle to place them.
+  int PSHUFLMask[4] = {-1, -1, -1, -1};
+  int PSHUFHMask[4] = {-1, -1, -1, -1};
+  int PSHUFDMask[4] = {-1, -1, -1, -1};
+
+  // First fix the masks for all the inputs that are staying in their
+  // original halves. This will then dictate the targets of the cross-half
+  // shuffles.
+  auto fixInPlaceInputs =
+      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+                    MutableArrayRef<int> SourceHalfMask,
+                    MutableArrayRef<int> HalfMask, int HalfOffset) {
+    if (InPlaceInputs.empty())
+      return;
+    if (InPlaceInputs.size() == 1) {
+      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+          InPlaceInputs[0] - HalfOffset;
+      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+      return;
+    }
+    if (IncomingInputs.empty()) {
+      // Just fix all of the in place inputs.
+      for (int Input : InPlaceInputs) {
+        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+        PSHUFDMask[Input / 2] = Input / 2;
+      }
+      return;
+    }
+
+    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+        InPlaceInputs[0] - HalfOffset;
+    // Put the second input next to the first so that they are packed into
+    // a dword. We find the adjacent index by toggling the low bit.
+    int AdjIndex = InPlaceInputs[0] ^ 1;
+    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
+    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+  };
+  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
+
+  // Now gather the cross-half inputs and place them into a free dword of
+  // their target half.
+  // FIXME: This operation could almost certainly be simplified dramatically to
+  // look more like the 3-1 fixing operation.
+  auto moveInputsToRightHalf = [&PSHUFDMask](
+      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
+      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
+      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+      int DestOffset) {
+    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
+      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
+    };
+    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
+                                               int Word) {
+      int LowWord = Word & ~1;
+      int HighWord = Word | 1;
+      return isWordClobbered(SourceHalfMask, LowWord) ||
+             isWordClobbered(SourceHalfMask, HighWord);
+    };
+
+    if (IncomingInputs.empty())
+      return;
+
+    if (ExistingInputs.empty()) {
+      // Map any dwords with inputs from them into the right half.
+      for (int Input : IncomingInputs) {
+        // If the source half mask maps over the inputs, turn those into
+        // swaps and use the swapped lane.
+        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
+          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
+            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
+                Input - SourceOffset;
+            // We have to swap the uses in our half mask in one sweep.
+            for (int &M : HalfMask)
+              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
+                M = Input;
+              else if (M == Input)
+                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+          } else {
+            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
+                       Input - SourceOffset &&
+                   "Previous placement doesn't match!");
+          }
+          // Note that this correctly re-maps both when we do a swap and when
+          // we observe the other side of the swap above. We rely on that to
+          // avoid swapping the members of the input list directly.
+          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
+        }
+
+        // Map the input's dword into the correct half.
+        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
+          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
+        else
+          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
+                     Input / 2 &&
+                 "Previous placement doesn't match!");
+      }
+
+      // And just directly shift any other-half mask elements to be same-half
+      // as we will have mirrored the dword containing the element into the
+      // same position within that half.
+      for (int &M : HalfMask)
+        if (M >= SourceOffset && M < SourceOffset + 4) {
+          M = M - SourceOffset + DestOffset;
+          assert(M >= 0 && "This should never wrap below zero!");
+        }
+      return;
+    }
+
+    // Ensure we have the input in a viable dword of its current half. This
+    // is particularly tricky because the original position may be clobbered
+    // by inputs being moved and *staying* in that half.
+    if (IncomingInputs.size() == 1) {
+      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
+                         SourceOffset;
+        SourceHalfMask[InputFixed - SourceOffset] =
+            IncomingInputs[0] - SourceOffset;
+        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
+                     InputFixed);
+        IncomingInputs[0] = InputFixed;
+      }
+    } else if (IncomingInputs.size() == 2) {
+      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
+          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
+        // We have two non-adjacent or clobbered inputs we need to extract from
+        // the source half. To do this, we need to map them into some adjacent
+        // dword slot in the source mask.
+        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+                              IncomingInputs[1] - SourceOffset};
+
+        // If there is a free slot in the source half mask adjacent to one of
+        // the inputs, place the other input in it. We use (Index XOR 1) to
+        // compute an adjacent index.
+        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
+          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+          InputsFixed[0] = InputsFixed[1] ^ 1;
+        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
+                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
+          // The two inputs are in the same DWord but it is clobbered and the
+          // adjacent DWord isn't used at all. Move both inputs to the free
+          // slot.
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+        } else {
+          // The only way we hit this point is if there is no clobbering
+          // (because there are no off-half inputs to this half) and there is no
+          // free slot adjacent to one of the inputs. In this case, we have to
+          // swap an input with a non-input.
+          for (int i = 0; i < 4; ++i)
+            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
+                   "We can't handle any clobbers here!");
+          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+                 "Cannot have adjacent inputs here!");
+
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+          // We also have to update the final source mask in this case because
+          // it may need to undo the above swap.
+          for (int &M : FinalSourceHalfMask)
+            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+              M = InputsFixed[1] + SourceOffset;
+            else if (M == InputsFixed[1] + SourceOffset)
+              M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        }
+
+        // Point everything at the fixed inputs.
+        for (int &M : HalfMask)
+          if (M == IncomingInputs[0])
+            M = InputsFixed[0] + SourceOffset;
+          else if (M == IncomingInputs[1])
+            M = InputsFixed[1] + SourceOffset;
+
+        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
+      }
+    } else {
+      llvm_unreachable("Unhandled input size!");
+    }
+
+    // Now hoist the DWord down to the right half.
+    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
+    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
+    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
+    for (int &M : HalfMask)
+      for (int Input : IncomingInputs)
+        if (M == Input)
+          M = FreeDWord * 2 + Input % 2;
+  };
+  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
+                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
+  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
+                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
+
+  // Now enact all the shuffles we've computed to move the inputs into their
+  // target half.
+  if (!isNoopShuffleMask(PSHUFLMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
+  if (!isNoopShuffleMask(PSHUFHMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
+  if (!isNoopShuffleMask(PSHUFDMask))
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+
+  // At this point, each half should contain all its inputs, and we can then
+  // just shuffle them into their final position.
+  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
+         "Failed to lift all the high half inputs to the low mask!");
+  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
+         "Failed to lift all the low half inputs to the high mask!");
+
+  // Do a half shuffle for the low mask.
+  if (!isNoopShuffleMask(LoMask))
+    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
+
+  // Do a half shuffle with the high mask after shifting its values down.
+  for (int &M : HiMask)
+    if (M >= 0)
+      M -= 4;
+  if (!isNoopShuffleMask(HiMask))
+    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
+
+  return V;
+}
+
+/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
+/// blend if only one input is used.
+static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+    bool &V2InUse) {
+  SDValue V1Mask[16];
+  SDValue V2Mask[16];
+  V1InUse = false;
+  V2InUse = false;
+
+  int Size = Mask.size();
+  int Scale = 16 / Size;
+  for (int i = 0; i < 16; ++i) {
+    if (Mask[i / Scale] < 0) {
+      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
+    } else {
+      const int ZeroMask = 0x80;
+      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+                                          : ZeroMask;
+      int V2Idx = Mask[i / Scale] < Size
+                      ? ZeroMask
+                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
+      if (Zeroable[i / Scale])
+        V1Idx = V2Idx = ZeroMask;
+      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
+      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
+      V1InUse |= (ZeroMask != V1Idx);
+      V2InUse |= (ZeroMask != V2Idx);
+    }
+  }
+
+  if (V1InUse)
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getBitcast(MVT::v16i8, V1),
+                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
+  if (V2InUse)
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                     DAG.getBitcast(MVT::v16i8, V2),
+                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
+
+  // If we need shuffled inputs from both, blend the two.
+  SDValue V;
+  if (V1InUse && V2InUse)
+    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  else
+    V = V1InUse ? V1 : V2;
+
+  // Cast the result back to the correct type.
+  return DAG.getBitcast(VT, V);
+}
+
+/// \brief Generic lowering of 8-lane i16 shuffles.
+///
+/// This handles both single-input shuffles and combined shuffle/blends with
+/// two inputs. The single input shuffles are immediately delegated to
+/// a dedicated lowering routine.
+///
+/// The blends are lowered in one of three fundamental ways. If there are few
+/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
+/// of the input is significantly cheaper when lowered as an interleaving of
+/// the two inputs, try to interleave them. Otherwise, blend the low and high
+/// halves of the inputs separately (making them have relatively few inputs)
+/// and then concatenate them.
+static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
+
+  if (NumV2Inputs == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Try to use shift instructions.
+    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Shift;
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+      return V;
+
+    // Try to use byte rotation instructions.
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+                                                        Mask, Subtarget, DAG))
+      return Rotate;
+
+    // Make a copy of the mask so it can be modified.
+    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
+    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
+                                                     MutableMask, Subtarget,
+                                                     DAG);
+  }
+
+  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
+         "All single-input shuffles should be canonicalized to be V1-input "
+         "shuffles.");
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget.hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
+                                                Zeroable, DAG))
+      return V;
+
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Inputs == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return V;
+
+  // We have different paths for blend lowering, but they all must use the
+  // *exact* same predicate.
+  bool IsBlendSupported = Subtarget.hasSSE41();
+  if (IsBlendSupported)
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
+  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
+                                                   Zeroable, DAG))
+    return Masked;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  if (SDValue BitBlend =
+          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return BitBlend;
+
+  // Try to lower by permuting the inputs into an unpack instruction.
+  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
+                                                            V2, Mask, DAG))
+    return Unpack;
+
+  // If we can't directly blend but can use PSHUFB, that will be better as it
+  // can both shuffle and set up the inefficient blend.
+  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
+    bool V1InUse, V2InUse;
+    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
+                                              Zeroable, DAG, V1InUse, V2InUse);
+  }
+
+  // We can always bit-blend if we have to so the fallback strategy is to
+  // decompose into single-input permutes and blends.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+                                                    Mask, DAG);
+}
+
+/// \brief Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
+///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
+///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
+///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+                                          bool IsSingleInput) {
+  // The modulus for the shuffle vector entries is based on whether this is
+  // a single input or not.
+  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+         "We should only be called with masks with a power-of-2 size!");
+
+  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+  // and 2^3 simultaneously. This is because we may have ambiguity with
+  // partially undef inputs.
+  bool ViableForN[3] = {true, true, true};
+
+  for (int i = 0, e = Mask.size(); i < e; ++i) {
+    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+    // want.
+    if (Mask[i] < 0)
+      continue;
+
+    bool IsAnyViable = false;
+    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+      if (ViableForN[j]) {
+        uint64_t N = j + 1;
+
+        // The shuffle mask must be equal to (i * 2^N) % M.
+        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+          IsAnyViable = true;
+        else
+          ViableForN[j] = false;
+      }
+    // Early exit if we exhaust the possible powers of two.
+    if (!IsAnyViable)
+      break;
+  }
+
+  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+    if (ViableForN[j])
+      return j + 1;
+
+  // Return 0 as there is no viable power of two.
+  return 0;
+}
+
+/// \brief Generic lowering of v16i8 shuffles.
+///
+/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
+/// detect any complexity reducing interleaving. If that doesn't help, it uses
+/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
+/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
+/// back together.
+static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use a zext lowering.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // See if we can use SSE4A Extraction / Insertion.
+  if (Subtarget.hasSSE4A())
+    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
+                                                Zeroable, DAG))
+      return V;
+
+  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
+
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2)
+        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
+          return false;
+
+      return true;
+    };
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int I : InPlaceInputs) {
+        PreDupI16Shuffle[I/2] = I/2;
+        LaneMap[I] = I;
+      }
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] >= 0)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
+      }
+      V1 = DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; ++i)
+        if (Mask[i] >= 0) {
+          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+          if (PostDupI16Shuffle[i / 2] < 0)
+            PostDupI16Shuffle[i / 2] = MappedMask;
+          else
+            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+                   "Conflicting entrties in the original shuffle!");
+        }
+      return DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
+  }
+
+  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
+                                                   Zeroable, DAG))
+    return Masked;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+    return V;
+
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+  // want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently don't
+  // handle those well here.
+  if (Subtarget.hasSSSE3()) {
+    bool V1InUse = false;
+    bool V2InUse = false;
+
+    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
+        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
+
+    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+    // do so. This avoids using them to handle blends-with-zero which is
+    // important as a single pshufb is significantly faster for that.
+    if (V1InUse && V2InUse) {
+      if (Subtarget.hasSSE41())
+        if (SDValue Blend = lowerVectorShuffleAsBlend(
+                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+          return Blend;
+
+      // We can use an unpack to do the blending rather than an or in some
+      // cases. Even though the or may be (very minorly) more efficient, we
+      // preference this lowering because there are common cases where part of
+      // the complexity of the shuffles goes away when we do the final blend as
+      // an unpack.
+      // FIXME: It might be worth trying to detect if the unpack-feeding
+      // shuffles will both be pshufb, in which case we shouldn't bother with
+      // this.
+      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
+              DL, MVT::v16i8, V1, V2, Mask, DAG))
+        return Unpack;
+    }
+
+    return PSHUFB;
+  }
+
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(
+            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return V;
+
+  if (SDValue BitBlend =
+          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+    return BitBlend;
+
+  // Check whether a compaction lowering can be done. This handles shuffles
+  // which take every Nth element for some even N. See the helper function for
+  // details.
+  //
+  // We special case these as they can be particularly efficiently handled with
+  // the PACKUSB instruction on x86 and they show up in common patterns of
+  // rearranging bytes to truncate wide elements.
+  bool IsSingleInput = V2.isUndef();
+  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+    // NumEvenDrops is the power of two stride of the elements. Another way of
+    // thinking about it is that we need to drop the even elements this many
+    // times to get the original input.
+
+    // First we need to zero all the dropped bytes.
+    assert(NumEvenDrops <= 3 &&
+           "No support for dropping even elements more than 3 times.");
+    // We use the mask type to pick which bytes are preserved based on how many
+    // elements are dropped.
+    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+    SDValue ByteClearMask = DAG.getBitcast(
+        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
+    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+    if (!IsSingleInput)
+      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+    // Now pack things back together.
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
+    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+    for (int i = 1; i < NumEvenDrops; ++i) {
+      Result = DAG.getBitcast(MVT::v8i16, Result);
+      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+    }
+
+    return Result;
+  }
+
+  // Handle multi-input cases by blending single-input shuffles.
+  if (NumV2Elements > 0)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+                                                      Mask, DAG);
+
+  // The fallback path for single-input shuffles widens this into two v8i16
+  // vectors with unpacks, shuffles those, and then pulls them back together
+  // with a pack.
+  SDValue V = V1;
+
+  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
+  for (int i = 0; i < 16; ++i)
+    if (Mask[i] >= 0)
+      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
+
+  SDValue VLoHalf, VHiHalf;
+  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+  // them out and avoid using UNPCK{L,H} to extract the elements of V as
+  // i16s.
+  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
+      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
+    // Use a mask to drop the high bytes.
+    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
+    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+                          DAG.getConstant(0x00FF, DL, MVT::v8i16));
+
+    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+    VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+    // Squash the masks to point directly into VLoHalf.
+    for (int &M : LoBlendMask)
+      if (M >= 0)
+        M /= 2;
+    for (int &M : HiBlendMask)
+      if (M >= 0)
+        M /= 2;
+  } else {
+    // Otherwise just unpack the low half of V into VLoHalf and the high half into
+    // VHiHalf so that we can blend them as i16s.
+    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+
+    VLoHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+    VHiHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+  }
+
+  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+
+  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
+}
+
+/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        MVT VT, SDValue V1, SDValue V2,
+                                        const SmallBitVector &Zeroable,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  switch (VT.SimpleTy) {
+  case MVT::v2i64:
+    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v2f64:
+    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v4i32:
+    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v4f32:
+    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v8i16:
+    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v16i8:
+    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Unimplemented!");
+  }
+}
+
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
+///
+/// This routine just extracts two subvectors, shuffles them independently, and
+/// then concatenates them back together. This should work effectively with all
+/// AVX vector shuffle types.
+static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+                                          SDValue V2, ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(VT.getSizeInBits() >= 256 &&
+         "Only for 256-bit or wider vector shuffles!");
+  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+
+  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
+  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
+
+  int NumElements = VT.getVectorNumElements();
+  int SplitNumElements = NumElements / 2;
+  MVT ScalarVT = VT.getVectorElementType();
+  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+
+  // Rather than splitting build-vectors, just build two narrower build
+  // vectors. This helps shuffling with splats and zeros.
+  auto SplitVector = [&](SDValue V) {
+    V = peekThroughBitcasts(V);
+
+    MVT OrigVT = V.getSimpleValueType();
+    int OrigNumElements = OrigVT.getVectorNumElements();
+    int OrigSplitNumElements = OrigNumElements / 2;
+    MVT OrigScalarVT = OrigVT.getVectorElementType();
+    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+    SDValue LoV, HiV;
+
+    auto *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV) {
+      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(0, DL));
+      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
+    } else {
+
+      SmallVector<SDValue, 16> LoOps, HiOps;
+      for (int i = 0; i < OrigSplitNumElements; ++i) {
+        LoOps.push_back(BV->getOperand(i));
+        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+      }
+      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
+      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
+    }
+    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+                          DAG.getBitcast(SplitVT, HiV));
+  };
+
+  SDValue LoV1, HiV1, LoV2, HiV2;
+  std::tie(LoV1, HiV1) = SplitVector(V1);
+  std::tie(LoV2, HiV2) = SplitVector(V2);
+
+  // Now create two 4-way blends of these half-width vectors.
+  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
+    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+    for (int i = 0; i < SplitNumElements; ++i) {
+      int M = HalfMask[i];
+      if (M >= NumElements) {
+        if (M >= NumElements + SplitNumElements)
+          UseHiV2 = true;
+        else
+          UseLoV2 = true;
+        V2BlendMask[i] = M - NumElements;
+        BlendMask[i] = SplitNumElements + i;
+      } else if (M >= 0) {
+        if (M >= SplitNumElements)
+          UseHiV1 = true;
+        else
+          UseLoV1 = true;
+        V1BlendMask[i] = M;
+        BlendMask[i] = i;
+      }
+    }
+
+    // Because the lowering happens after all combining takes place, we need to
+    // manually combine these blend masks as much as possible so that we create
+    // a minimal number of high-level vector shuffle nodes.
+
+    // First try just blending the halves of V1 or V2.
+    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+      return DAG.getUNDEF(SplitVT);
+    if (!UseLoV2 && !UseHiV2)
+      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+    if (!UseLoV1 && !UseHiV1)
+      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+    SDValue V1Blend, V2Blend;
+    if (UseLoV1 && UseHiV1) {
+      V1Blend =
+        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+    } else {
+      // We only use half of V1 so map the usage down into the final blend mask.
+      V1Blend = UseLoV1 ? LoV1 : HiV1;
+      for (int i = 0; i < SplitNumElements; ++i)
+        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+    }
+    if (UseLoV2 && UseHiV2) {
+      V2Blend =
+        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+    } else {
+      // We only use half of V2 so map the usage down into the final blend mask.
+      V2Blend = UseLoV2 ? LoV2 : HiV2;
+      for (int i = 0; i < SplitNumElements; ++i)
+        if (BlendMask[i] >= SplitNumElements)
+          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+    }
+    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+  };
+  SDValue Lo = HalfBlend(LoMask);
+  SDValue Hi = HalfBlend(HiMask);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
+                                                SDValue V1, SDValue V2,
+                                                ArrayRef<int> Mask,
+                                                SelectionDAG &DAG) {
+  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
+         "shuffles as it could then recurse on itself.");
+  int Size = Mask.size();
+
+  // If this can be modeled as a broadcast of two elements followed by a blend,
+  // prefer that lowering. This is especially important because broadcasts can
+  // often fold with memory operands.
+  auto DoBothBroadcast = [&] {
+    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+    for (int M : Mask)
+      if (M >= Size) {
+        if (V2BroadcastIdx < 0)
+          V2BroadcastIdx = M - Size;
+        else if (M - Size != V2BroadcastIdx)
+          return false;
+      } else if (M >= 0) {
+        if (V1BroadcastIdx < 0)
+          V1BroadcastIdx = M;
+        else if (M != V1BroadcastIdx)
+          return false;
+      }
+    return true;
+  };
+  if (DoBothBroadcast())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // If the inputs all stem from a single 128-bit lane of each input, then we
+  // split them rather than blending because the split will decompose to
+  // unusually few instructions.
+  int LaneCount = VT.getSizeInBits() / 128;
+  int LaneSize = Size / LaneCount;
+  SmallBitVector LaneInputs[2];
+  LaneInputs[0].resize(LaneCount, false);
+  LaneInputs[1].resize(LaneCount, false);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+  // that the decomposed single-input shuffles don't end up here.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a permutation and blend of those lanes.
+///
+/// This essentially blends the out-of-lane inputs to each lane into the lane
+/// from a permuted copy of the vector. This lowering strategy results in four
+/// instructions in the worst case for a single-input cross lane shuffle which
+/// is lower than any other fully general cross-lane shuffle strategy I'm aware
+/// of. Special cases for each particular shuffle pattern should be handled
+/// prior to trying this lowering.
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
+                                                       SDValue V1, SDValue V2,
+                                                       ArrayRef<int> Mask,
+                                                       SelectionDAG &DAG) {
+  // FIXME: This should probably be generalized for 512-bit vectors as well.
+  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
+  int Size = Mask.size();
+  int LaneSize = Size / 2;
+
+  // If there are only inputs from one 128-bit lane, splitting will in fact be
+  // less expensive. The flags track whether the given lane contains an element
+  // that crosses to another lane.
+  bool LaneCrossing[2] = {false, false};
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+  if (!LaneCrossing[0] || !LaneCrossing[1])
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  assert(V2.isUndef() &&
+         "This last part of this routine only works on single input shuffles");
+
+  SmallVector<int, 32> FlippedBlendMask(Size);
+  for (int i = 0; i < Size; ++i)
+    FlippedBlendMask[i] =
+        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+                                ? Mask[i]
+                                : Mask[i] % LaneSize +
+                                      (i / LaneSize) * LaneSize + Size);
+
+  // Flip the vector, and blend the results which should now be in-lane. The
+  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+  // 5 for the high source. The value 3 selects the high half of source 2 and
+  // the value 2 selects the low half of source 2. We only use source 2 to
+  // allow folding it into a memory operand.
+  unsigned PERMMask = 3 | 2 << 4;
+  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+}
+
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  // TODO: If minimizing size and one of the inputs is a zero vector and the
+  // the zero vector has only one use, we could use a VPERM2X128 to save the
+  // instruction bytes needed to explicitly generate the zero vector.
+
+  // Blends are faster and handle all the non-lane-crossing cases.
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // If either input operand is a zero vector, use VPERM2X128 because its mask
+  // allows us to replace the zero input with an implicit zero.
+  if (!IsV1Zero && !IsV2Zero) {
+    // Check for patterns which can be matched with a single insert of a 128-bit
+    // subvector.
+    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+      if (Subtarget.hasAVX2() && V2.isUndef())
+        return SDValue();
+
+      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                   VT.getVectorNumElements() / 2);
+      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                                DAG.getIntPtrConstant(0, DL));
+      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                                OnlyUsesV1 ? V1 : V2,
+                                DAG.getIntPtrConstant(0, DL));
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+    }
+  }
+
+  // Otherwise form a 128-bit permutation. After accounting for undefs,
+  // convert the 64-bit shuffle mask selection values into 128-bit
+  // selection bits by dividing the indexes by 2 and shifting into positions
+  // defined by a vperm2*128 instruction's immediate control byte.
+
+  // The immediate permute control byte looks like this:
+  //    [1:0] - select 128 bits from sources for low half of destination
+  //    [2]   - ignore
+  //    [3]   - zero low half of destination
+  //    [5:4] - select 128 bits from sources for high half of destination
+  //    [6]   - ignore
+  //    [7]   - zero high half of destination
+
+  int MaskLO = Mask[0];
+  if (MaskLO == SM_SentinelUndef)
+    MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
+
+  int MaskHI = Mask[2];
+  if (MaskHI == SM_SentinelUndef)
+    MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+
+  unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+  // If either input is a zero vector, replace it with an undef input.
+  // Shuffle mask values <  4 are selecting elements of V1.
+  // Shuffle mask values >= 4 are selecting elements of V2.
+  // Adjust each half of the permute mask by clearing the half that was
+  // selecting the zero vector and setting the zero mask bit.
+  if (IsV1Zero) {
+    V1 = DAG.getUNDEF(VT);
+    if (MaskLO < 4)
+      PermMask = (PermMask & 0xf0) | 0x08;
+    if (MaskHI < 4)
+      PermMask = (PermMask & 0x0f) | 0x80;
+  }
+  if (IsV2Zero) {
+    V2 = DAG.getUNDEF(VT);
+    if (MaskLO >= 4)
+      PermMask = (PermMask & 0xf0) | 0x08;
+    if (MaskHI >= 4)
+      PermMask = (PermMask & 0x0f) | 0x80;
+  }
+
+  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lanes. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(!V2.isUndef() && "This is only useful with multiple inputs.");
+
+  int Size = Mask.size();
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int NumLanes = Size / LaneSize;
+  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+  // check whether the in-128-bit lane shuffles share a repeating pattern.
+  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
+  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    int j = i / LaneSize;
+
+    if (Lanes[j] < 0) {
+      // First entry we've seen for this lane.
+      Lanes[j] = Mask[i] / LaneSize;
+    } else if (Lanes[j] != Mask[i] / LaneSize) {
+      // This doesn't match the lane selected previously!
+      return SDValue();
+    }
+
+    // Check that within each lane we have a consistent shuffle mask.
+    int k = i % LaneSize;
+    if (InLaneMask[k] < 0) {
+      InLaneMask[k] = Mask[i] % LaneSize;
+    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+      // This doesn't fit a repeating in-lane mask.
+      return SDValue();
+    }
+  }
+
+  // First shuffle the lanes into place.
+  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+                                VT.getSizeInBits() / 64);
+  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
+  for (int i = 0; i < NumLanes; ++i)
+    if (Lanes[i] >= 0) {
+      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+    }
+
+  V1 = DAG.getBitcast(LaneVT, V1);
+  V2 = DAG.getBitcast(LaneVT, V2);
+  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+  // Cast it back to the type we actually want.
+  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
+
+  // Now do a simple shuffle that isn't lane crossing.
+  SmallVector<int, 8> NewMask((unsigned)Size, -1);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+         "Must not introduce lane crosses at this point!");
+
+  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
+                                               SDValue V1, SDValue V2,
+                                               ArrayRef<int> Mask,
+                                               const X86Subtarget &Subtarget,
+                                               SelectionDAG &DAG) {
+  assert(VT.is256BitVector() && "Expected 256-bit vector");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+  if (!UndefLower && !UndefUpper)
+    return SDValue();
+
+  // Upper half is undef and lower half is whole upper subvector.
+  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (UndefUpper &&
+      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(HalfNumElts, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // Lower half is undef and upper half is whole lower subvector.
+  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (UndefLower &&
+      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(HalfNumElts, DL));
+  }
+
+  // If the shuffle only uses two of the four halves of the input operands,
+  // then extract them and perform the 'half' shuffle at half width.
+  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+  int HalfIdx1 = -1, HalfIdx2 = -1;
+  SmallVector<int, 8> HalfMask(HalfNumElts);
+  unsigned Offset = UndefLower ? HalfNumElts : 0;
+  for (unsigned i = 0; i != HalfNumElts; ++i) {
+    int M = Mask[i + Offset];
+    if (M < 0) {
+      HalfMask[i] = M;
+      continue;
+    }
+
+    // Determine which of the 4 half vectors this element is from.
+    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+    int HalfIdx = M / HalfNumElts;
+
+    // Determine the element index into its half vector source.
+    int HalfElt = M % HalfNumElts;
+
+    // We can shuffle with up to 2 half vectors, set the new 'half'
+    // shuffle mask accordingly.
+    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
+      HalfMask[i] = HalfElt;
+      HalfIdx1 = HalfIdx;
+      continue;
+    }
+    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
+      HalfMask[i] = HalfElt + HalfNumElts;
+      HalfIdx2 = HalfIdx;
+      continue;
+    }
+
+    // Too many half vectors referenced.
+    return SDValue();
+  }
+  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+  // Only shuffle the halves of the inputs when useful.
+  int NumLowerHalves =
+      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+  int NumUpperHalves =
+      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+
+  // uuuuXXXX - don't extract uppers just to insert again.
+  if (UndefLower && NumUpperHalves != 0)
+    return SDValue();
+
+  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
+  if (UndefUpper && NumUpperHalves == 2)
+    return SDValue();
+
+  // AVX2 - XXXXuuuu - always extract lowers.
+  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
+    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+    if (VT == MVT::v4f64 || VT == MVT::v4i64)
+      return SDValue();
+    // AVX2 supports variable 32-bit element cross-lane shuffles.
+    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
+      // XXXXuuuu - don't extract lowers and uppers.
+      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
+        return SDValue();
+    }
+  }
+
+  auto GetHalfVector = [&](int HalfIdx) {
+    if (HalfIdx < 0)
+      return DAG.getUNDEF(HalfVT);
+    SDValue V = (HalfIdx < 2 ? V1 : V2);
+    HalfIdx = (HalfIdx % 2) * HalfNumElts;
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+                       DAG.getIntPtrConstant(HalfIdx, DL));
+  };
+
+  SDValue Half1 = GetHalfVector(HalfIdx1);
+  SDValue Half2 = GetHalfVector(HalfIdx2);
+  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+                     DAG.getIntPtrConstant(Offset, DL));
+}
+
+/// \brief Test whether the specified input (0 or 1) is in-place blended by the
+/// given mask.
+///
+/// This returns true if the elements from a particular input are already in the
+/// slot required by the given mask and require no permutation.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
+      return false;
+
+  return true;
+}
+
+/// Handle case where shuffle sources are coming from the same 128-bit lane and
+/// every lane can be represented as the same repeating mask - allowing us to
+/// shuffle the sources with the repeating shuffle and then permute the result
+/// to the destination lanes.
+static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+
+  // On AVX2 we may be able to just shuffle the lowest elements and then
+  // broadcast the result.
+  if (Subtarget.hasAVX2()) {
+    for (unsigned BroadcastSize : {16, 32, 64}) {
+      if (BroadcastSize <= VT.getScalarSizeInBits())
+        continue;
+      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
+
+      // Attempt to match a repeating pattern every NumBroadcastElts,
+      // accounting for UNDEFs but only references the lowest 128-bit
+      // lane of the inputs.
+      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
+        for (int i = 0; i != NumElts; i += NumBroadcastElts)
+          for (int j = 0; j != NumBroadcastElts; ++j) {
+            int M = Mask[i + j];
+            if (M < 0)
+              continue;
+            int &R = RepeatMask[j];
+            if (0 != ((M % NumElts) / NumLaneElts))
+              return false;
+            if (0 <= R && R != M)
+              return false;
+            R = M;
+          }
+        return true;
+      };
+
+      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
+      if (!FindRepeatingBroadcastMask(RepeatMask))
+        continue;
+
+      // Shuffle the (lowest) repeated elements in place for broadcast.
+      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
+
+      // Shuffle the actual broadcast.
+      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
+      for (int i = 0; i != NumElts; i += NumBroadcastElts)
+        for (int j = 0; j != NumBroadcastElts; ++j)
+          BroadcastMask[i + j] = j;
+      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
+                                  BroadcastMask);
+    }
+  }
+
+  // Bail if the shuffle mask doesn't cross 128-bit lanes.
+  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+    return SDValue();
+
+  // Bail if we already have a repeated lane shuffle mask.
+  SmallVector<int, 8> RepeatedShuffleMask;
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
+    return SDValue();
+
+  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
+  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
+  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+  int NumSubLanes = NumLanes * SubLaneScale;
+  int NumSubLaneElts = NumLaneElts / SubLaneScale;
+
+  // Check that all the sources are coming from the same lane and see if we can
+  // form a repeating shuffle mask (local to each sub-lane). At the same time,
+  // determine the source sub-lane for each destination sub-lane.
+  int TopSrcSubLane = -1;
+  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
+  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
+      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
+      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+
+  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
+    // Extract the sub-lane mask, check that it all comes from the same lane
+    // and normalize the mask entries to come from the first lane.
+    int SrcLane = -1;
+    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
+    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
+      if (M < 0)
+        continue;
+      int Lane = (M % NumElts) / NumLaneElts;
+      if ((0 <= SrcLane) && (SrcLane != Lane))
+        return SDValue();
+      SrcLane = Lane;
+      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
+      SubLaneMask[Elt] = LocalM;
+    }
+
+    // Whole sub-lane is UNDEF.
+    if (SrcLane < 0)
+      continue;
+
+    // Attempt to match against the candidate repeated sub-lane masks.
+    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
+      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
+        for (int i = 0; i != NumSubLaneElts; ++i) {
+          if (M1[i] < 0 || M2[i] < 0)
+            continue;
+          if (M1[i] != M2[i])
+            return false;
+        }
+        return true;
+      };
+
+      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
+      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
+        continue;
+
+      // Merge the sub-lane mask into the matching repeated sub-lane mask.
+      for (int i = 0; i != NumSubLaneElts; ++i) {
+        int M = SubLaneMask[i];
+        if (M < 0)
+          continue;
+        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
+               "Unexpected mask element");
+        RepeatedSubLaneMask[i] = M;
+      }
+
+      // Track the top most source sub-lane - by setting the remaining to UNDEF
+      // we can greatly simplify shuffle matching.
+      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
+      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
+      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
+      break;
+    }
+
+    // Bail if we failed to find a matching repeated sub-lane mask.
+    if (Dst2SrcSubLanes[DstSubLane] < 0)
+      return SDValue();
+  }
+  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
+         "Unexpected source lane");
+
+  // Create a repeating shuffle mask for the entire vector.
+  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
+  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
+    int Lane = SubLane / SubLaneScale;
+    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
+    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
+      int M = RepeatedSubLaneMask[Elt];
+      if (M < 0)
+        continue;
+      int Idx = (SubLane * NumSubLaneElts) + Elt;
+      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
+    }
+  }
+  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
+
+  // Shuffle each source sub-lane to its destination.
+  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
+  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
+    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
+    if (SrcSubLane < 0)
+      continue;
+    for (int j = 0; j != NumSubLaneElts; ++j)
+      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
+  }
+
+  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
+                              SubLaneMask);
+}
+
+static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
+                                         unsigned &ShuffleImm,
+                                         ArrayRef<int> Mask) {
+  int NumElts = VT.getVectorNumElements();
+  assert(VT.getScalarType() == MVT::f64 &&
+         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+         "Unexpected data type for VSHUFPD");
+
+  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
+  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
+  ShuffleImm = 0;
+  bool ShufpdMask = true;
+  bool CommutableMask = true;
+  for (int i = 0; i < NumElts; ++i) {
+    if (Mask[i] == SM_SentinelUndef)
+      continue;
+    if (Mask[i] < 0)
+      return false;
+    int Val = (i & 6) + NumElts * (i & 1);
+    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
+    if (Mask[i] < Val || Mask[i] > Val + 1)
+      ShufpdMask = false;
+    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+      CommutableMask = false;
+    ShuffleImm |= (Mask[i] % 2) << i;
+  }
+
+  if (ShufpdMask)
+    return true;
+  if (CommutableMask) {
+    std::swap(V1, V2);
+    return true;
+  }
+
+  return false;
+}
+
+static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  unsigned Immediate = 0;
+  if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+    return SDValue();
+
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                     DAG.getConstant(Immediate, DL, MVT::i8));
+}
+
+static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
+  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
+  if (V2.isUndef())
+    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+                                             Zeroable, Subtarget, DAG))
+      return V;
+
+  if (V2.isUndef()) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
+    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
+      // Non-half-crossing single input shuffles can be lowered with an
+      // interleaved permutation.
+      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
+                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+    }
+
+    // With AVX2 we have direct support for this permutation.
+    if (Subtarget.hasAVX2())
+      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+    // Try to create an in-lane repeating shuffle mask and then shuffle the
+    // the results into the target lanes.
+    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return V;
+
+    // Otherwise, fall back.
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
+                                                   DAG);
+  }
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+    return V;
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Check if the blend happens to exactly fit that of SHUFPD.
+  if (SDValue Op =
+      lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+    return Op;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either inputs are already in place,
+  // we will be able to shuffle even across lanes the other input in a single
+  // instruction so skip this pattern.
+  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+                                isShuffleMaskInputInPlace(1, Mask))))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
+  // If we have AVX2 then we always want to lower with a blend because an v4 we
+  // can fully permute the elements.
+  if (Subtarget.hasAVX2())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+                                                      Mask, DAG);
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v4i64 shuffling..
+static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
+
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+                                             Zeroable, Subtarget, DAG))
+      return V;
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  if (V2.isUndef()) {
+    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+    // can use lower latency instructions that will operate on both lanes.
+    SmallVector<int, 2> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+      SmallVector<int, 4> PSHUFDMask;
+      scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
+      return DAG.getBitcast(
+          MVT::v4i64,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+                      DAG.getBitcast(MVT::v8i32, V1),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+    }
+
+    // AVX2 provides a direct instruction for permuting a single input across
+    // lanes.
+    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+  }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Try to use PALIGNR.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
+                                                      Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either inputs are already in place,
+  // we will be able to shuffle even across lanes the other input in a single
+  // instruction so skip this pattern.
+  if (!isShuffleMaskInputInPlace(0, Mask) &&
+      !isShuffleMaskInputInPlace(1, Mask))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
+  // Otherwise fall back on generic blend lowering.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
+                                                    Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // If the shuffle mask is repeated in each 128-bit lane, we have many more
+  // options to efficiently lower the shuffle.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 &&
+           "Repeated masks must be half the mask width!");
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
+    if (V2.isUndef())
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+      return V;
+
+    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+    // have already handled any direct blends.
+    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+  }
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  // If we have a single input shuffle with different shuffle patterns in the
+  // two 128-bit lanes use the variable mask to VPERMILPS.
+  if (V2.isUndef()) {
+    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
+
+    if (Subtarget.hasAVX2())
+      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
+
+    // Otherwise, fall back.
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+                                                   DAG);
+  }
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
+  // If we have AVX2 then we always want to lower with a blend because at v8 we
+  // can fully permute the elements.
+  if (Subtarget.hasAVX2())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+                                                      Mask, DAG);
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling..
+static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the two 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  bool Is128BitLaneRepeatedShuffle =
+      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+  if (Is128BitLaneRepeatedShuffle) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+    if (V2.isUndef())
+      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+      return V;
+  }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  // If the shuffle patterns aren't repeated but it is a single input, directly
+  // generate a cross-lane VPERMD instruction.
+  if (V2.isUndef()) {
+    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
+    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
+  }
+
+  // Assume that a single SHUFPS is faster than an alternative sequence of
+  // multiple instructions (even if the CPU has a domain penalty).
+  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
+                                                  CastV1, CastV2, DAG);
+    return DAG.getBitcast(MVT::v8i32, ShufPS);
+  }
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
+  // Otherwise fall back on generic blend lowering.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
+                                                    Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v16i16 shuffling..
+static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
+                                        SDValue V1, SDValue V2,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  if (V2.isUndef()) {
+    // There are no generalized cross-lane shuffle operations available on i16
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+                                                     Mask, DAG);
+
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // As this is a single-input shuffle, the repeated mask should be
+      // a strictly valid v8i16 mask that we can pass through to the v8i16
+      // lowering to handle even the v16 case.
+      return lowerV8I16GeneralSingleInputVectorShuffle(
+          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+    }
+  }
+
+  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
+    return PSHUFB;
+
+  // AVX512BWVL can lower to VPERMW.
+  if (Subtarget.hasBWI() && Subtarget.hasVLX())
+    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v32i8 shuffling..
+static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // the results into the target lanes.
+  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  // There are no generalized cross-lane shuffle operations available on i8
+  // element types.
+  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
+                                                   DAG);
+
+  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+    return PSHUFB;
+
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 256-bit x86 vector
+/// shuffle or splits it into two 128-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        MVT VT, SDValue V1, SDValue V2,
+                                        const SmallBitVector &Zeroable,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = VT.getVectorNumElements();
+  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return Insertion;
+
+  // Handle special cases where the lower or upper half is UNDEF.
+  if (SDValue V =
+          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+  // can check for those subtargets here and avoid much of the subtarget
+  // querying in the per-vector-type lowering routines. With AVX1 we have
+  // essentially *zero* ability to manipulate a 256-bit vector with integer
+  // types. Since we'll use floating point types there eventually, just
+  // immediately cast everything to a float and operate entirely in that domain.
+  if (VT.isInteger() && !Subtarget.hasAVX2()) {
+    int ElementBits = VT.getScalarSizeInBits();
+    if (ElementBits < 32) {
+      // No floating point type available, if we can't use the bit operations
+      // for masking/blending then decompose into 128-bit vectors.
+      if (SDValue V =
+              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
+        return V;
+      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+        return V;
+      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+    }
+
+    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
+                                VT.getVectorNumElements());
+    V1 = DAG.getBitcast(FpVT, V1);
+    V2 = DAG.getBitcast(FpVT, V2);
+    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+  }
+
+  switch (VT.SimpleTy) {
+  case MVT::v4f64:
+    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v4i64:
+    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v8f32:
+    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v8i32:
+    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v16i16:
+    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v32i8:
+    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Not a valid 256-bit x86 vector type!");
+  }
+}
+
+/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
+                                        ArrayRef<int> Mask, SDValue V1,
+                                        SDValue V2, SelectionDAG &DAG) {
+  assert(VT.getScalarSizeInBits() == 64 &&
+         "Unexpected element type size for 128bit shuffle.");
+
+  // To handle 256 bit vector requires VLX and most probably
+  // function lowerV2X128VectorShuffle() is better solution.
+  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
+
+  SmallVector<int, 4> WidenedMask;
+  if (!canWidenShuffleElements(Mask, WidenedMask))
+    return SDValue();
+
+  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+  // Insure elements came from the same Op.
+  int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    if (WidenedMask[i] == SM_SentinelZero)
+      return SDValue();
+    if (WidenedMask[i] == SM_SentinelUndef)
+      continue;
+
+    SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
+    unsigned OpIndex = (i < Size/2) ? 0 : 1;
+    if (Ops[OpIndex].isUndef())
+      Ops[OpIndex] = Op;
+    else if (Ops[OpIndex] != Op)
+      return SDValue();
+  }
+
+  // Form a 128-bit permutation.
+  // Convert the 64-bit shuffle mask selection values into 128-bit selection
+  // bits defined by a vshuf64x2 instruction's immediate control byte.
+  unsigned PermMask = 0, Imm = 0;
+  unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    // Use first element in place of undef mask.
+    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+    PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
+/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  if (V2.isUndef()) {
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
+
+    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
+      // Non-half-crossing single input shuffles can be lowered with an
+      // interleaved permutation.
+      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
+                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
+                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
+                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+    }
+
+    SmallVector<int, 4> RepeatedMask;
+    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
+      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+  }
+
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+    return Shuf128;
+
+  if (SDValue Unpck =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
+    return Unpck;
+
+  // Check if the blend happens to exactly fit that of SHUFPD.
+  if (SDValue Op =
+      lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+    return Op;
+
+  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        SDValue V1, SDValue V2,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // If the shuffle mask is repeated in each 128-bit lane, we have many more
+  // options to efficiently lower the shuffle.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+
+    // Use even/odd duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
+      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
+    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
+      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
+
+    if (V2.isUndef())
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue Unpck =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
+      return Unpck;
+
+    // Otherwise, fall back to a SHUFPS sequence.
+    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
+  }
+
+  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
+  if (V2.isUndef()) {
+    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
+    // can use lower latency instructions that will operate on all four
+    // 128-bit lanes.
+    SmallVector<int, 2> Repeated128Mask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
+      SmallVector<int, 4> PSHUFDMask;
+      scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
+      return DAG.getBitcast(
+          MVT::v8i64,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+                      DAG.getBitcast(MVT::v16i32, V1),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+    }
+
+    SmallVector<int, 4> Repeated256Mask;
+    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
+                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+  }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use PALIGNR.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
+                                                      Mask, Subtarget, DAG))
+    return Rotate;
+
+  if (SDValue Unpck =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Unpck;
+
+  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
+static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
+                                        SDValue V1, SDValue V2,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the four 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  bool Is128BitLaneRepeatedShuffle =
+      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
+  if (Is128BitLaneRepeatedShuffle) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+    if (V2.isUndef())
+      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (SDValue V =
+            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+      return V;
+  }
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use byte rotation instructions.
+  if (Subtarget.hasBWI())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Assume that a single SHUFPS is faster than using a permv shuffle.
+  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
+    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
+    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
+                                                  CastV1, CastV2, DAG);
+    return DAG.getBitcast(MVT::v16i32, ShufPS);
+  }
+
+  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
+static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
+                                        SDValue V1, SDValue V2,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  if (V2.isUndef()) {
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
+      // As this is a single-input shuffle, the repeated mask should be
+      // a strictly valid v8i16 mask that we can pass through to the v8i16
+      // lowering to handle even the v32 case.
+      return lowerV8I16GeneralSingleInputVectorShuffle(
+          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+    }
+  }
+
+  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
+}
+
+/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable,
+                                       SDValue V1, SDValue V2,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return ZExt;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (SDValue V =
+          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
+    return V;
+
+  // Try to use shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
+  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
+          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
+    return PSHUFB;
+
+  // VBMI can use VPERMV/VPERMV3 byte shuffles.
+  if (Subtarget.hasVBMI())
+    return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+
+  // FIXME: Implement direct support for this type!
+  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 512-bit x86 vector
+/// shuffle or splits it into two 256-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                        MVT VT, SDValue V1, SDValue V2,
+                                        const SmallBitVector &Zeroable,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX512() &&
+         "Cannot lower 512-bit vectors w/ basic ISA!");
+
+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = Mask.size();
+  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
+      return Insertion;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // Dispatch to each element type for lowering. If we don't have support for
+  // specific element type shuffles at 512 bits, immediately split them and
+  // lower them. Each lowering routine of a given type is allowed to assume that
+  // the requisite ISA extensions for that element type are available.
+  switch (VT.SimpleTy) {
+  case MVT::v8f64:
+    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+  case MVT::v16f32:
+    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+  case MVT::v8i64:
+    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v16i32:
+    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v32i16:
+    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+  case MVT::v64i8:
+    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Not a valid 512-bit x86 vector type!");
+  }
+}
+
+// Lower vXi1 vector shuffles.
+// There is no a dedicated instruction on AVX-512 that shuffles the masks.
+// The only way to shuffle bits is to sign-extend the mask vector to SIMD
+// vector, shuffle and then truncate it back.
+static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                      MVT VT, SDValue V1, SDValue V2,
+                                      const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX512() &&
+         "Cannot lower 512-bit vectors w/o basic ISA!");
+  MVT ExtVT;
+  switch (VT.SimpleTy) {
+  default:
+    llvm_unreachable("Expected a vector of i1 elements");
+  case MVT::v2i1:
+    ExtVT = MVT::v2i64;
+    break;
+  case MVT::v4i1:
+    ExtVT = MVT::v4i32;
+    break;
+  case MVT::v8i1:
+    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+    break;
+  case MVT::v16i1:
+    ExtVT = MVT::v16i32;
+    break;
+  case MVT::v32i1:
+    ExtVT = MVT::v32i16;
+    break;
+  case MVT::v64i1:
+    ExtVT = MVT::v64i8;
+    break;
+  }
+
+  if (ISD::isBuildVectorAllZeros(V1.getNode()))
+    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
+    V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+  else
+    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+
+  if (V2.isUndef())
+    V2 = DAG.getUNDEF(ExtVT);
+  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
+    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
+    V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+  else
+    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+
+  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
+  // i1 was sign extended we can use X86ISD::CVT2MASK.
+  int NumElems = VT.getVectorNumElements();
+  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
+      (Subtarget.hasDQI() && (NumElems < 32)))
+    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
+
+  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
+}
+
+/// Helper function that returns true if the shuffle mask should be
+/// commuted to improve canonicalization.
+static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
+  int NumElements = Mask.size();
+
+  int NumV1Elements = 0, NumV2Elements = 0, NumSentinelElements = 0;
+  for (int M : Mask)
+    if (M < 0)
+      ++NumSentinelElements;
+    else if (M < NumElements)
+      ++NumV1Elements;
+    else
+      ++NumV2Elements;
+
+  // Commute the shuffle as needed such that more elements come from V1 than
+  // V2. This allows us to match the shuffle pattern strictly on how many
+  // elements come from V1 without handling the symmetric cases.
+  if (NumV2Elements > NumV1Elements)
+    return true;
+
+  assert(NumV1Elements > 0 && "No V1 indices");
+
+  if (NumV2Elements == 0)
+    return false;
+
+  // When the number of V1 and V2 elements are the same, try to minimize the
+  // number of uses of V2 in the low half of the vector. When that is tied,
+  // ensure that the sum of indices for V1 is equal to or lower than the sum
+  // indices for V2. When those are equal, try to ensure that the number of odd
+  // indices for V1 is lower than the number of odd indices for V2.
+  if (NumV1Elements == NumV2Elements) {
+    int LowV1Elements = 0, LowV2Elements = 0;
+    for (int M : Mask.slice(0, NumElements / 2))
+      if (M >= NumElements)
+        ++LowV2Elements;
+      else if (M >= 0)
+        ++LowV1Elements;
+    if (LowV2Elements > LowV1Elements)
+      return true;
+    if (LowV2Elements == LowV1Elements) {
+      int SumV1Indices = 0, SumV2Indices = 0;
+      for (int i = 0, Size = Mask.size(); i < Size; ++i)
+        if (Mask[i] >= NumElements)
+          SumV2Indices += i;
+        else if (Mask[i] >= 0)
+          SumV1Indices += i;
+      if (SumV2Indices < SumV1Indices)
+        return true;
+      if (SumV2Indices == SumV1Indices) {
+        int NumV1OddIndices = 0, NumV2OddIndices = 0;
+        for (int i = 0, Size = Mask.size(); i < Size; ++i)
+          if (Mask[i] >= NumElements)
+            NumV2OddIndices += i % 2;
+          else if (Mask[i] >= 0)
+            NumV1OddIndices += i % 2;
+        if (NumV2OddIndices < NumV1OddIndices)
+          return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// \brief Top-level lowering for x86 vector shuffles.
+///
+/// This handles decomposition, canonicalization, and lowering of all x86
+/// vector shuffles. Most of the specific lowering strategies are encapsulated
+/// above in helper routines. The canonicalization attempts to widen shuffles
+/// to involve fewer lanes of wider elements, consolidate symmetric patterns
+/// s.t. only one of the two inputs needs to be tested, etc.
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
+                                  SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc DL(Op);
+  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
+
+  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+         "Can't lower MMX shuffles");
+
+  bool V1IsUndef = V1.isUndef();
+  bool V2IsUndef = V2.isUndef();
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef)
+    for (int M : Mask)
+      if (M >= NumElements) {
+        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+        for (int &M : NewMask)
+          if (M >= NumElements)
+            M = -1;
+        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+      }
+
+  // Check for illegal shuffle mask element index values.
+  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
+  assert(llvm::all_of(Mask,
+                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+         "Out of bounds shuffle index");
+
+  // We actually see shuffles that are entirely re-arrangements of a set of
+  // zero inputs. This mostly happens while decomposing complex shuffles into
+  // simple ones. Directly lower these as a buildvector of zeros.
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  if (Zeroable.all())
+    return getZeroVector(VT, Subtarget, DAG, DL);
+
+  // Try to collapse shuffles into using a vector type with fewer elements but
+  // wider element types. We cap this to not form integers or floating point
+  // elements wider than 64 bits, but it might be interesting to form i128
+  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
+  SmallVector<int, 16> WidenedMask;
+  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+      canWidenShuffleElements(Mask, WidenedMask)) {
+    MVT NewEltVT = VT.isFloatingPoint()
+                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    // Make sure that the new vector type is legal. For example, v2f64 isn't
+    // legal on SSE1.
+    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      V1 = DAG.getBitcast(NewVT, V1);
+      V2 = DAG.getBitcast(NewVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
+    }
+  }
+
+  // Commute the shuffle if it will improve canonicalization.
+  if (canonicalizeShuffleMaskWithCommute(Mask))
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.is128BitVector())
+    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+                                    DAG);
+
+  if (VT.is256BitVector())
+    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+                                    DAG);
+
+  if (VT.is512BitVector())
+    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
+                                    DAG);
+
+  if (Is1BitVector)
+    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+                                           const X86Subtarget &Subtarget,
+                                           SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
+
+  // Only non-legal VSELECTs reach this lowering, convert those into generic
+  // shuffles and re-use the shuffle lowering path for blends.
+  SmallVector<int, 32> Mask;
+  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+    SDValue CondElt = CondBV->getOperand(i);
+    Mask.push_back(
+        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
+                                     : -1);
+  }
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  // A vselect where all conditions and data are constants can be optimized into
+  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+    return SDValue();
+
+  // Try to lower this to a blend-style vector shuffle. This can handle all
+  // constant condition cases.
+  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
+    return BlendOp;
+
+  // Variable blends are only legal from SSE4.1 onward.
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  // Only some types will be legal on some subtargets. If we can emit a legal
+  // VSELECT-matching blend, return Op, and but if we need to expand, return
+  // a null value.
+  switch (Op.getSimpleValueType().SimpleTy) {
+  default:
+    // Most of the vector types have blends past SSE4.1.
+    return Op;
+
+  case MVT::v32i8:
+    // The byte blends for AVX vectors were introduced only in AVX2.
+    if (Subtarget.hasAVX2())
+      return Op;
+
+    return SDValue();
+
+  case MVT::v8i16:
+  case MVT::v16i16:
+    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
+    if (Subtarget.hasBWI() && Subtarget.hasVLX())
+      return Op;
+
+    // FIXME: We should custom lower this by fixing the condition and using i8
+    // blends.
+    return SDValue();
+  }
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+    return SDValue();
+
+  if (VT.getSizeInBits() == 8) {
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT == MVT::f32) {
+    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+    // the result back to FR32 register. It's only worth matching if the
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
+    if (!Op.hasOneUse())
+      return SDValue();
+    SDNode *User = *Op.getNode()->use_begin();
+    if ((User->getOpcode() != ISD::STORE ||
+         isNullConstant(Op.getOperand(1))) &&
+        (User->getOpcode() != ISD::BITCAST ||
+         User->getValueType(0) != MVT::i32))
+      return SDValue();
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+                                  Op.getOperand(1));
+    return DAG.getBitcast(MVT::f32, Extract);
+  }
+
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    // ExtractPS/pextrq works with constant index.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return Op;
+  }
+
+  return SDValue();
+}
+
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  MVT EltVT = Op.getSimpleValueType();
+
+  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
+         "Unexpected vector type in ExtractBitFromMaskVector");
+
+  // variable index can't be handled in mask registers,
+  // extend vector to VR512
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext, Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
+      (VecVT.getVectorNumElements() < 8)) {
+    // Use kshiftlw/rw instruction.
+    VecVT = MVT::v16i1;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
+                      DAG.getUNDEF(VecVT),
+                      Vec,
+                      DAG.getIntPtrConstant(0, dl));
+  }
+  unsigned MaxSift = VecVT.getVectorNumElements() - 1;
+  if (MaxSift - IdxVal)
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                      DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                    DAG.getConstant(MaxSift, dl, MVT::i8));
+  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
+                       DAG.getIntPtrConstant(0, dl));
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+
+  if (Op.getSimpleValueType() == MVT::i1)
+    return ExtractBitFromMaskVector(Op, DAG);
+
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
+         VecVT.getScalarSizeInBits() == 32)) {
+
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getScalarSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
+
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      auto PtrVT = getPointerTy(DAG.getDataLayout());
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
+                                 DAG.getConstant(0, dl, PtrVT));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
+                         DAG.getConstant(0, dl, PtrVT));
+    }
+    return SDValue();
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector and
+  // then extract the element from the 128-bit vector.
+  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
+    // Get the 128-bit vector.
+    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
+    MVT EltVT = VecVT.getVectorElementType();
+
+    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
+
+    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+    // this can be done with a mask.
+    IdxVal &= ElemsPerChunk - 1;
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                       DAG.getConstant(IdxVal, dl, MVT::i32));
+  }
+
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.getSizeInBits() == 16) {
+    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
+    // we're going to zero extend the register or fold the store (SSE41 only).
+    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
+        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+    // Transform it so it match pextrw which produces a 32-bit result.
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (Subtarget.hasSSE41())
+    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
+      return Res;
+
+  // TODO: handle v16i8.
+
+  if (VT.getSizeInBits() == 32) {
+    if (IdxVal == 0)
+      return Op;
+
+    // SHUFPS the element to the lowest double word, then movss.
+    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
+    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    if (IdxVal == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  return SDValue();
+}
+
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  MVT VecVT = Vec.getSimpleValueType();
+
+  if (!isa<ConstantSDNode>(Idx)) {
+    // Non constant index. Extend source and destination,
+    // insert element and then truncate the result.
+    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  unsigned NumElems = VecVT.getVectorNumElements();
+
+  if(Vec.isUndef()) {
+    if (IdxVal)
+      EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                             DAG.getConstant(IdxVal, dl, MVT::i8));
+    return EltInVec;
+  }
+
+  // Insertion of one bit into first or last position
+  // can be done with two SHIFTs + OR.
+  if (IdxVal == 0 ) {
+    // EltInVec already at correct index and other bits are 0.
+    // Clean the first bit in source vector.
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                      DAG.getConstant(1 , dl, MVT::i8));
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                      DAG.getConstant(1, dl, MVT::i8));
+
+    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+  }
+  if (IdxVal == NumElems -1) {
+    // Move the bit to the last position inside the vector.
+    EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                           DAG.getConstant(IdxVal, dl, MVT::i8));
+    // Clean the last bit in the source vector.
+    Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+                           DAG.getConstant(1, dl, MVT::i8));
+    Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+                           DAG.getConstant(1 , dl, MVT::i8));
+
+    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+  }
+
+  // Use shuffle to insert element.
+  SmallVector<int, 64> MaskVec(NumElems);
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = (i == IdxVal) ? NumElems : i;
+
+  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
+}
+
+SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if (EltVT == MVT::i1)
+    return InsertBitToMaskVector(Op, DAG);
+
+  SDLoc dl(Op);
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+  if (!isa<ConstantSDNode>(N2))
+    return SDValue();
+  auto *N2C = cast<ConstantSDNode>(N2);
+  unsigned IdxVal = N2C->getZExtValue();
+
+  // If we are clearing out a element, we do this more efficiently with a
+  // blend shuffle than a costly integer insertion.
+  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
+  // be beneficial if we are inserting several zeros and can combine the masks.
+  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
+    SmallVector<int, 8> ClearMask;
+    for (unsigned i = 0; i != NumElts; ++i)
+      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
+    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
+    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+  }
+
+  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
+  // into that, and then insert the subvector back into the result.
+  if (VT.is256BitVector() || VT.is512BitVector()) {
+    // With a 256-bit vector, we can insert into the zero element efficiently
+    // using a blend if we have AVX or AVX2 and the right data type.
+    if (VT.is256BitVector() && IdxVal == 0) {
+      // TODO: It is worthwhile to cast integer to floating point and back
+      // and incur a domain crossing penalty if that's what we'll end up
+      // doing anyway after extracting to a 128-bit vector.
+      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
+        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+        N2 = DAG.getIntPtrConstant(1, dl);
+        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+      }
+    }
+
+    // Get the desired 128-bit vector chunk.
+    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
+
+    // Insert the element into the desired chunk.
+    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+    assert(isPowerOf2_32(NumEltsIn128));
+    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
+
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                    DAG.getConstant(IdxIn128, dl, MVT::i32));
+
+    // Insert the changed part back into the bigger vector
+    return insert128BitVector(N0, V, IdxVal, DAG, dl);
+  }
+  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
+
+  if (Subtarget.hasSSE41()) {
+    if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
+      unsigned Opc;
+      if (VT == MVT::v8i16) {
+        Opc = X86ISD::PINSRW;
+      } else {
+        assert(VT == MVT::v16i8);
+        Opc = X86ISD::PINSRB;
+      }
+
+      // Transform it so it match pinsr{b,w} which expects a GR32 as its second
+      // argument.
+      if (N1.getValueType() != MVT::i32)
+        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+      if (N2.getValueType() != MVT::i32)
+        N2 = DAG.getIntPtrConstant(IdxVal, dl);
+      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+    }
+
+    if (EltVT == MVT::f32) {
+      // Bits [7:6] of the constant are the source select. This will always be
+      //   zero here. The DAG Combiner may combine an extract_elt index into
+      //   these bits. For example (insert (extract, 3), 2) could be matched by
+      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+      // Bits [5:4] of the constant are the destination select. This is the
+      //   value of the incoming immediate.
+      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+      //   combine either bitwise AND or insert of float 0.0 to set these bits.
+
+      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+        // If this is an insertion of 32-bits into the low 32-bits of
+        // a vector, we prefer to generate a blend with immediate rather
+        // than an insertps. Blends are simpler operations in hardware and so
+        // will always have equal or better performance than insertps.
+        // But if optimizing for size and there's a load folding opportunity,
+        // generate insertps because blendps does not have a 32-bit memory
+        // operand form.
+        N2 = DAG.getIntPtrConstant(1, dl);
+        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+      }
+      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
+      // Create this as a scalar to vector..
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+    }
+
+    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
+      // PINSR* works with constant index.
+      return Op;
+    }
+  }
+
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (EltVT.getSizeInBits() == 16) {
+    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
+    // as its second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(IdxVal, dl);
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  }
+  return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT OpVT = Op.getSimpleValueType();
+
+  // If this is a 256-bit vector result, first insert into a 128-bit
+  // vector and then insert into the 256-bit vector.
+  if (!OpVT.is128BitVector()) {
+    // Insert into a 128-bit vector.
+    unsigned SizeFactor = OpVT.getSizeInBits()/128;
+    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / SizeFactor);
+
+    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+    // Insert the 128-bit vector.
+    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+  }
+
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getBitcast(
+      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
+
+  SDLoc dl(Op);
+  SDValue In =  Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT ResVT = Op.getSimpleValueType();
+
+  assert((In.getSimpleValueType().is256BitVector() ||
+          In.getSimpleValueType().is512BitVector()) &&
+         "Can only extract from 256-bit or 512-bit vectors");
+
+  if (ResVT.is128BitVector())
+    return extract128BitVector(In, IdxVal, DAG, dl);
+  if (ResVT.is256BitVector())
+    return extract256BitVector(In, IdxVal, DAG, dl);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
+    if (llvm::all_of(ValidUsers,
+                     [&I](SDValue V) { return V.getNode() != *I; }))
+      return false;
+  return true;
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
+
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT OpVT = Op.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return insert1BitVector(Op, DAG, Subtarget);
+
+  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+         "Can only insert into 256-bit or 512-bit vectors");
+
+  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+  // load:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr + 16), Elts/2)
+  // --> load32 addr
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr + 32), Elts/2)
+  // --> load64 addr
+  // or a 16-byte or 32-byte broadcast:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr), Elts/2)
+  // --> X86SubVBroadcast(load16 addr)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr), Elts/2)
+  // --> X86SubVBroadcast(load32 addr)
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+    if (Idx2 && Idx2->getZExtValue() == 0) {
+      SDValue SubVec2 = Vec.getOperand(1);
+      // If needed, look through bitcasts to get to the load.
+      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+        bool Fast;
+        unsigned Alignment = FirstLd->getAlignment();
+        unsigned AS = FirstLd->getAddressSpace();
+        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                    OpVT, AS, Alignment, &Fast) && Fast) {
+          SDValue Ops[] = {SubVec2, SubVec};
+          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+            return Ld;
+        }
+      }
+      // If lower/upper loads are the same and the only users of the load, then
+      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
+      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+            areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
+          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+        }
+      }
+      // If this is subv_broadcast insert into both halves, use a larger
+      // subv_broadcast.
+      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+                           SubVec.getOperand(0));
+      }
+    }
+  }
+
+  if (SubVecVT.is128BitVector())
+    return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+  if (SubVecVT.is256BitVector())
+    return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+  llvm_unreachable("Unimplemented!");
+}
+
+// Returns the appropriate wrapper opcode for a global reference.
+unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
+  // References to absolute symbols are never PC-relative.
+  if (GV && GV->isAbsoluteSymbolRef())
+    return X86ISD::Wrapper;
+
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  if (Subtarget.isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    return X86ISD::WrapperRIP;
+
+  return X86ISD::Wrapper;
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetConstantPool(
+      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
+  SDLoc DL(CP);
+  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+  SDLoc DL(JT);
+  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
+
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
+
+  SDLoc DL(Op);
+  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isPositionIndependent() && !Subtarget.is64Bit()) {
+    Result =
+        DAG.getNode(ISD::ADD, DL, PtrVT,
+                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
+  }
+
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Create the TargetBlockAddressAddress node.
+  unsigned char OpFlags =
+    Subtarget.classifyBlockAddressReference();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
+  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+                                              const SDLoc &dl, int64_t Offset,
+                                              SelectionDAG &DAG) const {
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
+  CodeModel::Model M = DAG.getTarget().getCodeModel();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    // A direct static reference to a global.
+    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
+  }
+
+  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
+  }
+
+  // For globals that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
+                         DAG.getConstant(Offset, dl, PtrVT));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags, bool LocalDynamic = false) {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDLoc dl(GA);
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
+
+  if (InFlag) {
+    SDValue Ops[] = { Chain,  TGA, *InFlag };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  } else {
+    SDValue Ops[]  = { Chain, TGA };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
+  }
+
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI.setAdjustsStack(true);
+  MFI.setHasCalls(true);
+
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  SDLoc dl(GA);  // ? function entry point might be better
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               SDLoc(), PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  SDLoc dl(GA);
+
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
+      .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+  }
+
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
+
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit, bool isPIC) {
+  SDLoc dl(GA);
+
+  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+                                                         is64Bit ? 257 : 256));
+
+  SDValue ThreadPointer =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+                  MachinePointerInfo(Ptr));
+
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
+  } else {
+    llvm_unreachable("Unexpected model");
+  }
+
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+  SDValue TGA =
+      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                 GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+                           Offset);
+    }
+
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  if (DAG.getTarget().Options.EmulatedTLS)
+    return LowerToTLSEmulatedModel(GA, DAG);
+
+  const GlobalValue *GV = GA->getGlobal();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  bool PositionIndependent = isPositionIndependent();
+
+  if (Subtarget.isTargetELF()) {
+    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
+    switch (model) {
+      case TLSModel::GeneralDynamic:
+        if (Subtarget.is64Bit())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
+                                           Subtarget.is64Bit());
+      case TLSModel::InitialExec:
+      case TLSModel::LocalExec:
+        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+                                   PositionIndependent);
+    }
+    llvm_unreachable("Unknown TLS model.");
+  }
+
+  if (Subtarget.isTargetDarwin()) {
+    // Darwin only has one model of TLS.  Lower to that.
+    unsigned char OpFlag = 0;
+    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
+                           X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+    // global base reg.
+    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
+    if (PIC32)
+      OpFlag = X86II::MO_TLVP_PIC_BASE;
+    else
+      OpFlag = X86II::MO_TLVP;
+    SDLoc DL(Op);
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                                GA->getValueType(0),
+                                                GA->getOffset(), OpFlag);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
+
+    // With PIC32, the address is actually $g + Offset.
+    if (PIC32)
+      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+                           Offset);
+
+    // Lowering the machine isd will make sure everything is in the right
+    // location.
+    SDValue Chain = DAG.getEntryNode();
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+    SDValue Args[] = { Chain, Offset };
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+                               DAG.getIntPtrConstant(0, DL, true),
+                               Chain.getValue(1), DL);
+
+    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI.setAdjustsStack(true);
+
+    // And our return value (tls address) is in the standard call return value
+    // location.
+    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
+  }
+
+  if (Subtarget.isTargetKnownWindowsMSVC() ||
+      Subtarget.isTargetWindowsItanium() ||
+      Subtarget.isTargetWindowsGNU()) {
+    // Just use the implicit TLS architecture
+    // Need to generate someting similar to:
+    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                                  ; from TEB
+    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
+    //   mov     rcx, qword [rdx+rcx*8]
+    //   mov     eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
+
+    SDLoc dl(GA);
+    SDValue Chain = DAG.getEntryNode();
+
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
+    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
+
+    SDValue TlsArray = Subtarget.is64Bit()
+                           ? DAG.getIntPtrConstant(0x58, dl)
+                           : (Subtarget.isTargetWindowsGNU()
+                                  ? DAG.getIntPtrConstant(0x2C, dl)
+                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
+
+    SDValue ThreadPointer =
+        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
+
+    SDValue res;
+    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
+      res = ThreadPointer;
+    } else {
+      // Load the _tls_index variable
+      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
+      if (Subtarget.is64Bit())
+        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
+                             MachinePointerInfo(), MVT::i32);
+      else
+        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
+
+      auto &DL = DAG.getDataLayout();
+      SDValue Scale =
+          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
+      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
+
+      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
+    }
+
+    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
+
+    // Get the offset of start of .tls section
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
+  }
+
+  llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  MVT VT = Op.getSimpleValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
+  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
+                       : DAG.getConstant(0, dl, VT);
+
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+  }
+
+  // If the shift amount is larger or equal than the width of a part we can't
+  // rely on the results of shld/shrd. Insert a test and select the appropriate
+  // values for large shift amounts.
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, dl, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                             AndNode, DAG.getConstant(0, dl, MVT::i8));
+
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+  } else {
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
+  }
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+  MVT SrcVT = Src.getSimpleValueType();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (SrcVT.isVector()) {
+    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+                                     DAG.getUNDEF(SrcVT)));
+    }
+    if (SrcVT.getVectorElementType() == MVT::i1) {
+      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
+        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
+      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
+    }
+    return SDValue();
+  }
+
+  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget.is64Bit()) {
+    return Op;
+  }
+
+  SDValue ValueToStore = Op.getOperand(0);
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      !Subtarget.is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Chain = DAG.getStore(
+      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) const {
+  // Build the FILD
+  SDLoc DL(Op);
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+  unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO = DAG.getMachineFunction().getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+        MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+                                           X86ISD::FILD, DL,
+                                           Tys, Ops, SrcVT, MMO);
+
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned SSFISize = Op.getValueSizeInBits()/8;
+    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
+    auto PtrVT = getPointerTy(MF.getDataLayout());
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+        MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+                                    Ops, Op.getValueType(), MMO);
+    Result = DAG.getLoad(
+        Op.getValueType(), DL, Chain, StackSlot,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+  }
+
+  return Result;
+}
+
+/// 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // This algorithm is not obvious. Here it is what we're trying to output:
+  /*
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
+  */
+
+  SDLoc dl(Op);
+  LLVMContext *Context = DAG.getContext();
+
+  // Build some magic constants.
+  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+
+  SmallVector<Constant*,2> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                      APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                      APInt(64, 0x4530000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
+  SDValue CLod0 =
+      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+                  /* Alignment = */ 16);
+  SDValue Unpck1 =
+      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
+
+  SDValue CLod1 =
+      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+                  /* Alignment = */ 16);
+  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+  // TODO: Are there any fast-math-flags to propagate here?
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget.hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
+    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
+/// 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // FP constant to bias correct the final result.
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getBitcast(MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0, dl));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(
+      ISD::OR, dl, MVT::v2i64,
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+  Or =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+
+  // Subtract the bias.
+  // TODO: Are there any fast-math-flags to propagate here?
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding.
+  MVT DestVT = Op.getSimpleValueType();
+
+  if (DestVT.bitsLT(MVT::f64))
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0, dl));
+  if (DestVT.bitsGT(MVT::f64))
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+  // Handle final rounding.
+  return Sub;
+}
+
+static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget, SDLoc &DL) {
+  if (Op.getSimpleValueType() != MVT::v2f64)
+    return SDValue();
+
+  SDValue N0 = Op.getOperand(0);
+  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
+
+  // Legalize to v4i32 type.
+  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+                   DAG.getUNDEF(MVT::v2i32));
+
+  if (Subtarget.hasAVX512())
+    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+
+  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
+  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
+  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
+  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
+
+  // Two to the power of half-word-size.
+  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
+
+  // Clear upper part of LO, lower HI.
+  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
+  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
+
+  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
+          fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
+  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+
+  // Add the two halves.
+  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+}
+
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  // The algorithm is the following:
+  // #ifdef __SSE4_1__
+  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+  //                                 (uint4) 0x53000000, 0xaa);
+  // #else
+  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+  // #endif
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  //     return (float4) lo + fhi;
+
+  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+  // reassociate the two FADDs, and if we do that, the algorithm fails
+  // spectacularly (PR24512).
+  // FIXME: If we ever have some kind of Machine FMF, this should be marked
+  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+  // there's also the MachineCombiner reassociations happening on Machine IR.
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue V = Op->getOperand(0);
+  MVT VecIntVT = V.getSimpleValueType();
+  bool Is128 = VecIntVT == MVT::v4i32;
+  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+  // If we convert to something else than the supported type, e.g., to v4f64,
+  // abort early.
+  if (VecFloatVT != Op->getSimpleValueType(0))
+    return SDValue();
+
+  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+         "Unsupported custom type");
+
+  // In the #idef/#else code, we have in common:
+  // - The vector of constants:
+  // -- 0x4b000000
+  // -- 0x53000000
+  // - A shift:
+  // -- v >> 16
+
+  // Create the splat vector for 0x4b000000.
+  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
+  // Create the splat vector for 0x53000000.
+  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
+
+  // Create the right shift.
+  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
+  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+  SDValue Low, High;
+  if (Subtarget.hasSSE41()) {
+    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
+    // Low will be bitcasted right away, so do not bother bitcasting back to its
+    // original type.
+    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+    //                                 (uint4) 0x53000000, 0xaa);
+    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
+    // High will be bitcasted right away, so do not bother bitcasting back to
+    // its original type.
+    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+  } else {
+    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
+    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+  }
+
+  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+  SDValue VecCstFAdd = DAG.getConstantFP(
+      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
+
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
+  // TODO: Are there any fast-math-flags to propagate here?
+  SDValue FHigh =
+      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+  //     return (float4) lo + fhi;
+  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  MVT SrcVT = N0.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (SrcVT.getVectorElementType() == MVT::i1) {
+    if (SrcVT == MVT::v2i1)
+      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
+    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
+  }
+
+  switch (SrcVT.SimpleTy) {
+  default:
+    llvm_unreachable("Custom UINT_TO_FP is not supported!");
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v8i8:
+  case MVT::v8i16: {
+    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+  }
+  case MVT::v2i32:
+    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
+  case MVT::v4i32:
+  case MVT::v8i32:
+    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+  case MVT::v16i8:
+  case MVT::v16i16:
+    assert(Subtarget.hasAVX512());
+    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
+  }
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  if (Op.getSimpleValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  MVT SrcVT = N0.getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+
+  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
+      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
+    // Conversions from unsigned i32 to f32/f64 are legal,
+    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
+    return Op;
+  }
+
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
+
+  // Make a 64-bit buffer, and use it to build an FILD.
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcVT == MVT::i32) {
+    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                  StackSlot, MachinePointerInfo());
+    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
+                                  OffsetSlot, MachinePointerInfo());
+    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    return Fild;
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue ValueToStore = Op.getOperand(0);
+  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
+    // Bitcasting to f64 here allows us to do a single 64-bit store from
+    // an SSE register, avoiding the store forwarding penalty that would come
+    // with two 32-bit stores.
+    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+                               MachinePointerInfo());
+  // For i64 source, we need to add the appropriate power of 2 if the input
+  // was negative.  This is the same as the optimization in
+  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
+  // we must be careful to do the computation in x87 extended precision, not
+  // in SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
+      MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
+                                         MVT::i64, MMO);
+
+  APInt FF(32, 0x5F800000ULL);
+
+  // Check whether the sign bit is set.
+  SDValue SignSet = DAG.getSetCC(
+      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+
+  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+
+  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0, dl);
+  SDValue Four = DAG.getIntPtrConstant(4, dl);
+  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
+
+  // Load the value out, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(
+      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+      /* Alignment = */ 4);
+  // Extend everything to 80 bits to force it to be done on x87.
+  // TODO: Are there any fast-math-flags to propagate here?
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
+// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
+// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
+// just return an <SDValue(), SDValue()> pair.
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
+// to i16, i32 or i64, and we lower it to a legal sequence.
+// If lowered to the final integer result we return a <result, SDValue()> pair.
+// Otherwise we lower it to a sequence ending with a FIST, return a
+// <FIST, StackSlot> pair, and the caller is responsible for loading
+// the final integer result from StackSlot.
+std::pair<SDValue,SDValue>
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                   bool IsSigned, bool IsReplace) const {
+  SDLoc DL(Op);
+
+  EVT DstTy = Op.getValueType();
+  EVT TheVT = Op.getOperand(0).getValueType();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
+    // f16 must be promoted before using the lowering in this routine.
+    // fp128 does not use this lowering.
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  // If using FIST to compute an unsigned i64, we'll need some fixup
+  // to handle values above the maximum signed i64.  A FIST is always
+  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
+  bool UnsignedFixup = !IsSigned &&
+                       DstTy == MVT::i64 &&
+                       (!Subtarget.is64Bit() ||
+                        !isScalarFPTypeInSSEReg(TheVT));
+
+  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
+    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
+    // The low 32 bits of the fist result will have the correct uint32 result.
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal.
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget.is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->int64 into FISTP64 followed by a load from a temporary
+  // stack slot.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+
+  unsigned Opc;
+  switch (DstTy.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+  }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
+
+  if (UnsignedFixup) {
+    //
+    // Conversion to unsigned i64 is implemented with a select,
+    // depending on whether the source value fits in the range
+    // of a signed i64.  Let Thresh be the FP equivalent of
+    // 0x8000000000000000ULL.
+    //
+    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
+    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
+    //  Fist-to-mem64 FistSrc
+    //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
+    //  to XOR'ing the high 32 bits with Adjust.
+    //
+    // Being a power of 2, Thresh is exactly representable in all FP formats.
+    // For X87 we'd like to use the smallest FP type for this constant, but
+    // for DAG type consistency we have to match the FP operand type.
+
+    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
+    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+    bool LosesInfo = false;
+    if (TheVT == MVT::f64)
+      // The rounding mode is irrelevant as the conversion should be exact.
+      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+                              &LosesInfo);
+    else if (TheVT == MVT::f80)
+      Status = Thresh.convert(APFloat::x87DoubleExtended(),
+                              APFloat::rmNearestTiesToEven, &LosesInfo);
+
+    assert(Status == APFloat::opOK && !LosesInfo &&
+           "FP conversion should have been exact");
+
+    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
+
+    SDValue Cmp = DAG.getSetCC(DL,
+                               getSetCCResultType(DAG.getDataLayout(),
+                                                  *DAG.getContext(), TheVT),
+                               Value, ThreshVal, ISD::SETLT);
+    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
+                           DAG.getConstant(0, DL, MVT::i32),
+                           DAG.getConstant(0x80000000, DL, MVT::i32));
+    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
+    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
+                                              *DAG.getContext(), TheVT),
+                       Value, ThreshVal, ISD::SETLT);
+    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+  }
+
+  // FIXME This causes a redundant load/store if the SSE-class value is already
+  // in memory, such as if it is on the callstack.
+  if (isScalarFPTypeInSSEReg(TheVT)) {
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+                         MachinePointerInfo::getFixedStack(MF, SSFI));
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(TheVT)
+    };
+
+    MachineMemOperand *MMO =
+        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+                                MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  }
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+                              MachineMemOperand::MOStore, MemSize, MemSize);
+
+  if (UnsignedFixup) {
+
+    // Insert the FIST, load its result as two i32's,
+    // and XOR the high i32 with Adjust.
+
+    SDValue FistOps[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           FistOps, DstTy, MMO);
+
+    SDValue Low32 =
+        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
+    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
+
+    SDValue High32 =
+        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
+    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
+
+    if (Subtarget.is64Bit()) {
+      // Join High32 and Low32 into a 64-bit result.
+      // (High32 << 32) | Low32
+      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
+      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
+      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
+                           DAG.getConstant(32, DL, MVT::i8));
+      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
+      return std::make_pair(Result, SDValue());
+    }
+
+    SDValue ResultOps[] = { Low32, High32 };
+
+    SDValue pair = IsReplace
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
+      : DAG.getMergeValues(ResultOps, DL);
+    return std::make_pair(pair, SDValue());
+  } else {
+    // Build the FP_TO_INT*_IN_MEM
+    SDValue Ops[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           Ops, DstTy, MMO);
+    return std::make_pair(FIST, StackSlot);
+  }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  //   Concat upper and lower parts.
+  //
+  //   v4i32 -> v4i64
+  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  //   Concat upper and lower parts.
+  //
+
+  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+    return SDValue();
+
+  if (Subtarget.hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+                             VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getBitcast(HVT, OpLo);
+  OpHi = DAG.getBitcast(HVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                  const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc DL(Op);
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  assert(InVT.getVectorElementType() == MVT::i1);
+
+  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
+  MVT ExtVT = VT;
+  if (!VT.is512BitVector() && !Subtarget.hasVLX())
+    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+
+  SDValue One =
+   DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
+  SDValue Zero =
+   DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+
+  SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+  if (VT == ExtVT)
+    return SelectedVal;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  if (Subtarget.hasFp256())
+    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+      return Res;
+
+  return SDValue();
+}
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
+
+  if (Subtarget.hasFp256())
+    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
+      return Res;
+
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
+}
+
+/// Helper to recursively truncate vector elements in half with PACKSS.
+/// It makes use of the fact that vector comparison results will be all-zeros
+/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
+/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
+/// within each 128-bit lane.
+static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
+                                               const SDLoc &DL,
+                                               SelectionDAG &DAG,
+                                               const X86Subtarget &Subtarget) {
+  // Requires SSE2 but AVX512 has fast truncate.
+  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+    return SDValue();
+
+  EVT SrcVT = In.getValueType();
+
+  // No truncation required, we might get here due to recursive calls.
+  if (SrcVT == DstVT)
+    return In;
+
+  // We only support vector truncation to 128bits or greater from a
+  // 256bits or greater source.
+  if ((DstVT.getSizeInBits() % 128) != 0)
+    return SDValue();
+  if ((SrcVT.getSizeInBits() % 256) != 0)
+    return SDValue();
+
+  unsigned NumElems = SrcVT.getVectorNumElements();
+  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
+  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
+
+  EVT PackedSVT =
+      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
+
+  // Extract lower/upper subvectors.
+  unsigned NumSubElts = NumElems / 2;
+  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
+  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+
+  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
+  if (SrcVT.is256BitVector()) {
+    Lo = DAG.getBitcast(MVT::v8i16, Lo);
+    Hi = DAG.getBitcast(MVT::v8i16, Hi);
+    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
+    return DAG.getBitcast(DstVT, Res);
+  }
+
+  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
+  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
+  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
+    Lo = DAG.getBitcast(MVT::v16i16, Lo);
+    Hi = DAG.getBitcast(MVT::v16i16, Hi);
+    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
+
+    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
+    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
+    Res = DAG.getBitcast(MVT::v4i64, Res);
+    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
+
+    if (DstVT.is256BitVector())
+      return DAG.getBitcast(DstVT, Res);
+
+    // If 512bit -> 128bit truncate another stage.
+    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+    Res = DAG.getBitcast(PackedVT, Res);
+    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+  }
+
+  // Recursively pack lower/upper subvectors, concat result and pack again.
+  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
+  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
+  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
+  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
+
+  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
+  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+}
+
+static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
+
+  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
+  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
+  if (InVT.getScalarSizeInBits() <= 16) {
+    if (Subtarget.hasBWI()) {
+      // legal, will go to VPMOVB2M, VPMOVW2M
+      // Shift packed bytes not supported natively, bitcast to word
+      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
+      SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
+                                       DAG.getBitcast(ExtVT, In),
+                                       DAG.getConstant(ShiftInx, DL, ExtVT));
+      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
+      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
+    }
+    // Use TESTD/Q, extended vector to packed dword/qword.
+    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
+           "Unexpected vector type.");
+    unsigned NumElts = InVT.getVectorNumElements();
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+    InVT = ExtVT;
+    ShiftInx = InVT.getScalarSizeInBits() - 1;
+  }
+
+  SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
+                                   DAG.getConstant(ShiftInx, DL, InVT));
+  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+
+  if (VT == MVT::i1) {
+    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
+           "Invalid scalar TRUNCATE operation");
+    if (InVT.getSizeInBits() >= 32)
+      return SDValue();
+    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
+  }
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (VT.getVectorElementType() == MVT::i1)
+    return LowerTruncateVecI1(Op, DAG, Subtarget);
+
+  // vpmovqb/w/d, vpmovdb/w, vpmovwb
+  if (Subtarget.hasAVX512()) {
+    // word to byte only under BWI
+    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
+      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
+                         DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+  }
+
+  // Truncate with PACKSS if we are truncating a vector comparison result.
+  // TODO: We should be able to support other operations as long as we
+  // we are saturating+packing zero/all bits only.
+  auto IsPackableComparison = [](SDValue V) {
+    unsigned Opcode = V.getOpcode();
+    return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
+            Opcode == X86ISD::CMPP);
+  };
+
+  if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
+                                   all_of(In->ops(), IsPackableComparison))) {
+    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
+      return V;
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget.hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      In = DAG.getBitcast(MVT::v8i32, In);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(0, DL));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(2, DL));
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+    static const int ShufMask[] = {0, 2, 4, 6};
+    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
+  }
+
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+    if (Subtarget.hasInt256()) {
+      In = DAG.getBitcast(MVT::v32i8, In);
+
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
+      }
+      SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
+      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      In = DAG.getBitcast(MVT::v4i64, In);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
+                                ShufMask);
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0, DL));
+      return DAG.getBitcast(VT, In);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(0, DL));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4, DL));
+
+    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
+    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
+
+    // The PSHUFB mask:
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
+
+    // The MOVLHPS Mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+    return DAG.getBitcast(MVT::v8i16, res);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
+    return SDValue();
+
+  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
+
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare truncation shuffle mask
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
+                                   DAG.getUNDEF(NVT), MaskVec);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) const {
+  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.isVector()) {
+    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+    SDValue Src = Op.getOperand(0);
+    SDLoc dl(Op);
+    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
+      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
+                         dl, VT,
+                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+                                     DAG.getUNDEF(MVT::v2f32)));
+    }
+
+    return SDValue();
+  }
+
+  assert(!VT.isVector());
+
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    IsSigned, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (!FIST.getNode())
+    return Op;
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
+
+  // The node is the result.
+  return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+         "Wrong opcode for lowering FABS or FNEG.");
+
+  bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+  // If this is a FABS and it has an FNEG user, bail out to fold the combination
+  // into an FNABS. We'll lower the FABS after that if it is still in use.
+  if (IsFABS)
+    for (SDNode *User : Op->uses())
+      if (User->getOpcode() == ISD::FNEG)
+        return Op;
+
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  bool IsF128 = (VT == MVT::f128);
+
+  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+  // decide if we should generate a 16-byte constant mask when we only need 4 or
+  // 8 bytes for the scalar case.
+
+  MVT LogicVT;
+  MVT EltVT;
+
+  if (VT.isVector()) {
+    LogicVT = VT;
+    EltVT = VT.getVectorElementType();
+  } else if (IsF128) {
+    // SSE instructions are used for optimized f128 logical operations.
+    LogicVT = MVT::f128;
+    EltVT = VT;
+  } else {
+    // There are no scalar bitwise logical SSE/AVX instructions, so we
+    // generate a 16-byte vector constant and logic op even for the scalar case.
+    // Using a 16-byte mask allows folding the load of the mask with
+    // the logic op, so it can save (~4 bytes) on code size.
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+    EltVT = VT;
+  }
+
+  unsigned EltBits = EltVT.getSizeInBits();
+  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+  APInt MaskElt =
+    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+  const fltSemantics &Sem =
+      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
+          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
+
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+  unsigned LogicOp =
+    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+
+  if (VT.isVector() || IsF128)
+    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+  // For the scalar case extend to a 128-bit vector, perform the logic op,
+  // and extract the scalar result back out.
+  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+                     DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue Mag = Op.getOperand(0);
+  SDValue Sign = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  // If the sign operand is smaller, extend it first.
+  MVT VT = Op.getSimpleValueType();
+  if (Sign.getSimpleValueType().bitsLT(VT))
+    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
+
+  // And if it is bigger, shrink it first.
+  if (Sign.getSimpleValueType().bitsGT(VT))
+    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+  bool IsF128 = (VT == MVT::f128);
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+         "Unexpected type in LowerFCOPYSIGN");
+
+  MVT EltVT = VT.getScalarType();
+  const fltSemantics &Sem =
+      EltVT == MVT::f64 ? APFloat::IEEEdouble()
+                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+
+  // Perform all scalar logic operations as 16-byte vectors because there are no
+  // scalar FP logic instructions in SSE.
+  // TODO: This isn't necessary. If we used scalar types, we might avoid some
+  // unnecessary splats, but we might miss load folding opportunities. Should
+  // this decision be based on OptimizeForSize?
+  bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+
+  // The mask constants are automatically splatted for vector types.
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  SDValue SignMask = DAG.getConstantFP(
+      APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+  SDValue MagMask = DAG.getConstantFP(
+      APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+
+  // First, clear all bits but the sign bit from the second operand (sign).
+  if (IsFakeVector)
+    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
+
+  // Next, clear the sign bit from the first operand (magnitude).
+  // TODO: If we had general constant folding for FP logic ops, this check
+  // wouldn't be necessary.
+  SDValue MagBits;
+  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
+    APFloat APF = Op0CN->getValueAPF();
+    APF.clearSign();
+    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
+  } else {
+    // If the magnitude operand wasn't a constant, we need to AND out the sign.
+    if (IsFakeVector)
+      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
+    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
+  }
+
+  // OR the magnitude value with the sign bit.
+  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
+  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
+                                          DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  MVT OpVT = N0.getSimpleValueType();
+  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
+         "Unexpected type for FGETSIGN");
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
+  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
+  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
+  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
+  Res = DAG.getZExtOrTrunc(Res, dl, VT);
+  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
+  return Res;
+}
+
+// Check whether an OR'd tree is PTEST-able.
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  SDLoc DL(N);
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  SmallVector<SDValue, 8> VecIns;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+      VecIns.push_back(ExtractedFromVec);
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+  }
+
+  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// \brief return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    unsigned UOpNo = UI.getOperandNo();
+    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look pass truncate.
+      UOpNo = User->use_begin().getOperandNo();
+      User = *User->use_begin();
+    }
+
+    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+      return true;
+  }
+  return false;
+}
+
+// Emit KTEST instruction for bit vectors on AVX-512
+static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
+                         const X86Subtarget &Subtarget) {
+  if (Op.getOpcode() == ISD::BITCAST) {
+    auto hasKTEST = [&](MVT VT) {
+      unsigned SizeInBits = VT.getSizeInBits();
+      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
+        (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
+    };
+    SDValue Op0 = Op.getOperand(0);
+    MVT Op0VT = Op0.getValueType().getSimpleVT();
+    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
+        hasKTEST(Op0VT))
+      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
+  }
+  return SDValue();
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
+                                    SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1) {
+    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+                       DAG.getConstant(0, dl, MVT::i8));
+  }
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  default: break;
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO: {
+    // Check if we really need to set the
+    // Overflow flag. If NoSignedWrap is present
+    // that is not actually needed.
+    switch (Op->getOpcode()) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::SHL: {
+      const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
+      if (BinNode->Flags.hasNoSignedWrap())
+        break;
+    }
+    default:
+      NeedOF = true;
+      break;
+    }
+    break;
+  }
+  }
+  // See if we can use the EFLAGS value from the operand instead of
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
+    // Emit KTEST for bit vectors
+    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+      return Node;
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, dl, Op.getValueType()));
+  }
+  unsigned Opcode = 0;
+  unsigned NumOperands = 0;
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic instruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted variable when we check for possible users.
+  switch (ArithOp.getOpcode()) {
+  case ISD::ADD:
+    // Due to an isel shortcoming, be conservative if this add is likely to be
+    // selected as part of a load-modify-store instruction. When the root node
+    // in a match is a store, isel doesn't know how to remap non-chain non-flag
+    // uses of other nodes in the match, such as the ADD in this case. This
+    // leads to the ADD being left around and reselected, with the result being
+    // two adds in the output.  Alas, even if none our users are stores, that
+    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
+    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
+    // climbing the DAG back to the root, and it doesn't seem to be worth the
+    // effort.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() != ISD::CopyToReg &&
+          UI->getOpcode() != ISD::SETCC &&
+          UI->getOpcode() != ISD::STORE)
+        goto default_case;
+
+    if (ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
+      // An add of one will be selected as an INC.
+      if (C->isOne() && !Subtarget.slowIncDec()) {
+        Opcode = X86ISD::INC;
+        NumOperands = 1;
+        break;
+      }
+
+      // An add of negative one (subtract of one) will be selected as a DEC.
+      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
+        Opcode = X86ISD::DEC;
+        NumOperands = 1;
+        break;
+      }
+    }
+
+    // Otherwise use a regular EFLAGS-setting add.
+    Opcode = X86ISD::ADD;
+    NumOperands = 2;
+    break;
+  case ISD::SHL:
+  case ISD::SRL:
+    // If we have a constant logical shift that's only used in a comparison
+    // against zero turn it into an equivalent AND. This allows turning it into
+    // a TEST instruction later.
+    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
+        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+      EVT VT = Op.getValueType();
+      unsigned BitWidth = VT.getSizeInBits();
+      unsigned ShAmt = Op->getConstantOperandVal(1);
+      if (ShAmt >= BitWidth) // Avoid undefined shifts.
+        break;
+      APInt Mask = ArithOp.getOpcode() == ISD::SRL
+                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+        break;
+      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+                       DAG.getConstant(Mask, dl, VT));
+    }
+    break;
+
+  case ISD::AND:
+    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
+    // because a TEST instruction will be better.
+    if (!hasNonFlagsUse(Op)) {
+      SDValue Op0 = ArithOp->getOperand(0);
+      SDValue Op1 = ArithOp->getOperand(1);
+      EVT VT = ArithOp.getValueType();
+      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
+      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
+
+      // But if we can combine this into an ANDN operation, then create an AND
+      // now and allow it to be pattern matched into an ANDN.
+      if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
+        break;
+    }
+    LLVM_FALLTHROUGH;
+  case ISD::SUB:
+  case ISD::OR:
+  case ISD::XOR:
+    // Due to the ISEL shortcoming noted above, be conservative if this op is
+    // likely to be selected as part of a load-modify-store instruction.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() == ISD::STORE)
+        goto default_case;
+
+    // Otherwise use a regular EFLAGS-setting instruction.
+    switch (ArithOp.getOpcode()) {
+    default: llvm_unreachable("unexpected operator!");
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
+    case ISD::XOR: Opcode = X86ISD::XOR; break;
+    case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
+    }
+
+    NumOperands = 2;
+    break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return SDValue(Op.getNode(), 1);
+  default:
+  default_case:
+    break;
+  }
+
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
+
+  if (Opcode == 0) {
+    // Emit KTEST for bit vectors
+    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
+      return Node;
+
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, dl, Op.getValueType()));
+  }
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
+
+  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
+  DAG.ReplaceAllUsesWith(Op, New);
+  return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   const SDLoc &dl, SelectionDAG &DAG) const {
+  if (isNullConstant(Op1))
+    return EmitTest(Op0, X86CC, dl, DAG);
+
+  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
+         "Unexpected comparison operation for MVT::i1 operands");
+
+  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Only promote the compare up to I32 if it is a 16 bit operation
+    // with an immediate.  16 bit immediates are to be avoided.
+    if ((Op0.getValueType() == MVT::i16 &&
+         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
+        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
+        !Subtarget.isAtom()) {
+      unsigned ExtendOp =
+          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
+      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
+    }
+    // Use SUB instead of CMP to enable CSE between SUB and CMP.
+    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+                              Op0, Op1);
+    return SDValue(Sub.getNode(), 1);
+  }
+  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+                                                 SelectionDAG &DAG) const {
+  // If the subtarget does not support the FUCOMI instruction, floating-point
+  // comparisons have to be converted.
+  if (Subtarget.hasCMov() ||
+      Cmp.getOpcode() != X86ISD::CMP ||
+      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+      !Cmp.getOperand(1).getValueType().isFloatingPoint())
+    return Cmp;
+
+  // The instruction selector will select an FUCOM instruction instead of
+  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  SDLoc dl(Cmp);
+  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+                            DAG.getConstant(8, dl, MVT::i8));
+  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+
+  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // We never want to use both SQRT and RSQRT instructions for the same input.
+  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+    return false;
+
+  if (VT.isVector())
+    return Subtarget.hasFastVectorFSQRT();
+  return Subtarget.hasFastScalarFSQRT();
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
+                                           SelectionDAG &DAG, int Enabled,
+                                           int &RefinementSteps,
+                                           bool &UseOneConstNR,
+                                           bool Reciprocal) const {
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+    if (RefinementSteps == ReciprocalEstimate::Unspecified)
+      RefinementSteps = 1;
+
+    UseOneConstNR = false;
+    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
+                                            int Enabled,
+                                            int &RefinementSteps) const {
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+
+  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
+    // Enable estimate codegen with 1 refinement step for vector division.
+    // Scalar division estimates are disabled because they break too much
+    // real-world code. These defaults are intended to match GCC behavior.
+    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
+      return SDValue();
+
+    if (RefinementSteps == ReciprocalEstimate::Unspecified)
+      RefinementSteps = 1;
+
+    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
+/// If we have at least two divisions that use the same divisor, convert to
+/// multplication by a reciprocal. This may need to be adjusted for a given
+/// CPU if a division's cost is not at least twice the cost of a multiplication.
+/// This is because we still need one division to calculate the reciprocal and
+/// then we need two multiplies by that reciprocal as replacements for the
+/// original divisions.
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
+  return 2;
+}
+
+/// Helper for creating a X86ISD::SETCC node.
+static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
+                        SelectionDAG &DAG) {
+  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+}
+
+/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
+/// according to equal/not-equal condition code \p CC.
+static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
+                                   const SDLoc &dl, SelectionDAG &DAG) {
+  // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
+  // instruction.  Since the shift amount is in-range-or-undefined, we know
+  // that doing a bittest on the i32 value is ok.  We extend to i32 because
+  // the encoding for the i16 version is larger than the i32 version.
+  // Also promote i16 to i32 for performance / code size reason.
+  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
+    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
+
+  // See if we can use the 32-bit instruction instead of the 64-bit one for a
+  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
+  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
+  // known to be zero.
+  if (Src.getValueType() == MVT::i64 &&
+      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
+    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+
+  // If the operand types disagree, extend the shift amount to match.  Since
+  // BT ignores high bits (like shifts) we can use anyextend.
+  if (Src.getValueType() != BitNo.getValueType())
+    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
+
+  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
+  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+  return getSETCC(Cond, BT, dl , DAG);
+}
+
+/// Result of 'and' is compared against zero. Change to a BT node if possible.
+static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
+                            const SDLoc &dl, SelectionDAG &DAG) {
+  SDValue Op0 = And.getOperand(0);
+  SDValue Op1 = And.getOperand(1);
+  if (Op0.getOpcode() == ISD::TRUNCATE)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::TRUNCATE)
+    Op1 = Op1.getOperand(0);
+
+  SDValue LHS, RHS;
+  if (Op1.getOpcode() == ISD::SHL)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() == ISD::SHL) {
+    if (isOneConstant(Op0.getOperand(0))) {
+      // If we looked past a truncate, check that it's only truncating away
+      // known zeros.
+      unsigned BitWidth = Op0.getValueSizeInBits();
+      unsigned AndBitWidth = And.getValueSizeInBits();
+      if (BitWidth > AndBitWidth) {
+        APInt Zeros, Ones;
+        DAG.computeKnownBits(Op0, Zeros, Ones);
+        if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+          return SDValue();
+      }
+      LHS = Op1;
+      RHS = Op0.getOperand(1);
+    }
+  } else if (Op1.getOpcode() == ISD::Constant) {
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+    uint64_t AndRHSVal = AndRHS->getZExtValue();
+    SDValue AndLHS = Op0;
+
+    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+      LHS = AndLHS.getOperand(0);
+      RHS = AndLHS.getOperand(1);
+    }
+
+    // Use BT if the immediate can't be encoded in a TEST instruction.
+    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+      LHS = AndLHS;
+      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
+    }
+  }
+
+  if (LHS.getNode())
+    return getBitTestCondition(LHS, RHS, CC, dl, DAG);
+
+  return SDValue();
+}
+
+// Convert (truncate (srl X, N) to i1) to (bt X, N)
+static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
+                                 const SDLoc &dl, SelectionDAG &DAG) {
+
+  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
+         "Expected TRUNCATE to i1 node");
+
+  if (Op.getOperand(0).getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue ShiftRight = Op.getOperand(0);
+  return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
+                             CC, dl, DAG);
+}
+
+/// Result of 'and' or 'trunc to i1' is compared against zero.
+/// Change to a BT node if possible.
+SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
+                                     const SDLoc &dl, SelectionDAG &DAG) const {
+  if (Op.getOpcode() == ISD::AND)
+    return LowerAndToBT(Op, CC, dl, DAG);
+  if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
+    return LowerTruncateToBT(Op, CC, dl, DAG);
+  return SDValue();
+}
+
+/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
+/// CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+                              SDValue &Op1) {
+  unsigned SSECC;
+  bool Swap = false;
+
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETOGT:
+  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
+  case ISD::SETLT:
+  case ISD::SETOLT: SSECC = 1; break;
+  case ISD::SETOGE:
+  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
+  case ISD::SETLE:
+  case ISD::SETOLE: SSECC = 2; break;
+  case ISD::SETUO:  SSECC = 3; break;
+  case ISD::SETUNE:
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
+  case ISD::SETUGE: SSECC = 5; break;
+  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
+  case ISD::SETUGT: SSECC = 6; break;
+  case ISD::SETO:   SSECC = 7; break;
+  case ISD::SETUEQ:
+  case ISD::SETONE: SSECC = 8; break;
+  }
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  return SSECC;
+}
+
+/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
+/// concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+  SDValue CC = Op.getOperand(2);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+         "Unexpected type for boolean compare operation");
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
+                               DAG.getConstant(-1, dl, VT));
+  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
+                               DAG.getConstant(-1, dl, VT));
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETEQ:
+    // (x == y) -> ~(x ^ y)
+    return DAG.getNode(ISD::XOR, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
+                       DAG.getConstant(-1, dl, VT));
+  case ISD::SETNE:
+    // (x != y) -> (x ^ y)
+    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    // (x > y) -> (x & ~y)
+    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
+  case ISD::SETULT:
+  case ISD::SETLT:
+    // (x < y) -> (~x & y)
+    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
+  case ISD::SETULE:
+  case ISD::SETLE:
+    // (x <= y) -> (~x | y)
+    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
+  case ISD::SETUGE:
+  case ISD::SETGE:
+    // (x >=y) -> (x | ~y)
+    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
+  }
+}
+
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+
+  assert(VT.getVectorElementType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  unsigned  Opc = 0;
+  bool Unsigned = false;
+  bool Swap = false;
+  unsigned SSECC;
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
+  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
+  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
+  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
+  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
+  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
+  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
+  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
+  case ISD::SETLE:  SSECC = 2; break;
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+  if (Opc)
+    return DAG.getNode(Opc, dl, VT, Op0, Op1);
+  Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+  return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                     DAG.getConstant(SSECC, dl, MVT::i8));
+}
+
+/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
+/// operand \p Op1.  If non-trivial (for example because it's not constant)
+/// return an empty value.
+static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
+                                      SelectionDAG &DAG) {
+  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
+  if (!BV)
+    return SDValue();
+
+  MVT VT = Op1.getSimpleValueType();
+  MVT EVT = VT.getVectorElementType();
+  unsigned n = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> ULTOp1;
+
+  for (unsigned i = 0; i < n; ++i) {
+    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
+      return SDValue();
+
+    // Avoid underflow.
+    APInt Val = Elt->getAPIntValue();
+    if (Val == 0)
+      return SDValue();
+
+    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
+  }
+
+  return DAG.getBuildVector(VT, dl, ULTOp1);
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+  SDLoc dl(Op);
+
+  if (isFP) {
+#ifndef NDEBUG
+    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+    unsigned Opc;
+    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+      assert(VT.getVectorNumElements() <= 16);
+      Opc = X86ISD::CMPM;
+    } else {
+      Opc = X86ISD::CMPP;
+      // The SSE/AVX packed FP comparison nodes are defined with a
+      // floating-point vector result that matches the operand type. This allows
+      // them to work with an SSE1 target (integer vector types are not legal).
+      VT = Op0.getSimpleValueType();
+    }
+
+    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+    // emit two comparisons and a logic op to tie them together.
+    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
+    // available.
+    SDValue Cmp;
+    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+    if (SSECC == 8) {
+      // LLVM predicate is SETUEQ or SETONE.
+      unsigned CC0, CC1;
+      unsigned CombineOpc;
+      if (SetCCOpcode == ISD::SETUEQ) {
+        CC0 = 3; // UNORD
+        CC1 = 0; // EQ
+        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
+                                           static_cast<unsigned>(ISD::OR);
+      } else {
+        assert(SetCCOpcode == ISD::SETONE);
+        CC0 = 7; // ORD
+        CC1 = 4; // NEQ
+        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
+                                           static_cast<unsigned>(ISD::AND);
+      }
+
+      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC0, dl, MVT::i8));
+      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC1, dl, MVT::i8));
+      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+    } else {
+      // Handle all other FP comparisons here.
+      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                        DAG.getConstant(SSECC, dl, MVT::i8));
+    }
+
+    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
+    // result type of SETCC. The bitcast is expected to be optimized away
+    // during combining/isel.
+    if (Opc == X86ISD::CMPP)
+      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+    return Cmp;
+  }
+
+  MVT VTOp0 = Op0.getSimpleValueType();
+  assert(VTOp0 == Op1.getSimpleValueType() &&
+         "Expected operands with same type!");
+  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+         "Invalid number of packed elements for source and destination!");
+
+  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
+    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
+    // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
+    // legalizer firstly checks if the first operand in input to the setcc has
+    // a legal type. If so, then it promotes the return type to that same type.
+    // Otherwise, the return type is promoted to the 'next legal type' which,
+    // for a vector of MVT::i1 is always a 128-bit integer vector type.
+    //
+    // We reach this code only if the following two conditions are met:
+    // 1. Both return type and operand type have been promoted to wider types
+    //    by the type legalizer.
+    // 2. The original operand type has been promoted to a 256-bit vector.
+    //
+    // Note that condition 2. only applies for AVX targets.
+    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+    return DAG.getZExtOrTrunc(NewOp, dl, VT);
+  }
+
+  // The non-AVX512 code below works under the assumption that source and
+  // destination types are the same.
+  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
+         "Value types for source and destination must be the same!");
+
+  // Break 256-bit integer vector compare into smaller ones.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntVSETCC(Op, DAG);
+
+  // Operands are boolean (vectors of i1)
+  MVT OpVT = Op1.getSimpleValueType();
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return LowerBoolVSETCC_AVX512(Op, DAG);
+
+  // The result is boolean, but operands are int/float
+  if (VT.getVectorElementType() == MVT::i1) {
+    // In AVX-512 architecture setcc returns mask with i1 elements,
+    // But there is no compare instruction for i8 and i16 elements in KNL.
+    // In this case use SSE compare
+    bool UseAVX512Inst =
+      (OpVT.is512BitVector() ||
+       OpVT.getScalarSizeInBits() >= 32 ||
+       (Subtarget.hasBWI() && Subtarget.hasVLX()));
+
+    if (UseAVX512Inst)
+      return LowerIntVSETCC_AVX512(Op, DAG);
+
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                        DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+  }
+
+  // Lower using XOP integer comparisons.
+  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
+    // Translate compare code to XOP PCOM compare mode.
+    unsigned CmpMode = 0;
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Unexpected SETCC condition");
+    case ISD::SETULT:
+    case ISD::SETLT: CmpMode = 0x00; break;
+    case ISD::SETULE:
+    case ISD::SETLE: CmpMode = 0x01; break;
+    case ISD::SETUGT:
+    case ISD::SETGT: CmpMode = 0x02; break;
+    case ISD::SETUGE:
+    case ISD::SETGE: CmpMode = 0x03; break;
+    case ISD::SETEQ: CmpMode = 0x04; break;
+    case ISD::SETNE: CmpMode = 0x05; break;
+    }
+
+    // Are we comparing unsigned or signed integers?
+    unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+      ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                       DAG.getConstant(CmpMode, dl, MVT::i8));
+  }
+
+  // We are handling one of the integer comparisons here.  Since SSE only has
+  // GT and EQ comparisons for integer, swapping operands and multiple
+  // operations may be required for some comparisons.
+  unsigned Opc;
+  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+  bool Subus = false;
+
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  Invert = true;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETLT:  Swap = true;
+  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGE:  Swap = true;
+  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
+                    Invert = true; break;
+  case ISD::SETULT: Swap = true;
+  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
+                    FlipSigns = true; break;
+  case ISD::SETUGE: Swap = true;
+  case ISD::SETULE: Opc = X86ISD::PCMPGT;
+                    FlipSigns = true; Invert = true; break;
+  }
+
+  // Special case: Use min/max operations for SETULE/SETUGE
+  MVT VET = VT.getVectorElementType();
+  bool hasMinMax =
+       (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+    || (Subtarget.hasSSE2()  && (VET == MVT::i8));
+
+  if (hasMinMax) {
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
+    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+    }
+
+    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+  }
+
+  bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+  if (!MinMax && hasSubus) {
+    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
+    // Op0 u<= Op1:
+    //   t = psubus Op0, Op1
+    //   pcmpeq t, <0..0>
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULT: {
+      // If the comparison is against a constant we can turn this into a
+      // setule.  With psubus, setule does not require a swap.  This is
+      // beneficial because the constant in the register is no longer
+      // destructed as the destination so it can be hoisted out of a loop.
+      // Only do this pre-AVX since vpcmp* is no longer destructive.
+      if (Subtarget.hasAVX())
+        break;
+      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
+        Op1 = ULEOp1;
+        Subus = true; Invert = false; Swap = false;
+      }
+      break;
+    }
+    // Psubus is better than flip-sign because it requires no inversion.
+    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
+    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
+    }
+
+    if (Subus) {
+      Opc = X86ISD::SUBUS;
+      FlipSigns = false;
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Check that the operation in question is available (most are plain SSE2,
+  // but PCMPGTQ and PCMPEQQ have different requirements).
+  if (VT == MVT::v2i64) {
+    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
+      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+      // Since SSE has no unsigned integer comparisons, we need to flip the sign
+      // bits of the inputs before performing those operations. The lower
+      // compare is always unsigned.
+      SDValue SB;
+      if (FlipSigns) {
+        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
+      } else {
+        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
+        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
+        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
+      }
+      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
+      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
+      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
+
+      // Create masks for only the low parts/high parts of the 64 bit integers.
+      static const int MaskHi[] = { 1, 1, 3, 3 };
+      static const int MaskLo[] = { 0, 0, 2, 2 };
+      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
+      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
+      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
+      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getBitcast(VT, Result);
+    }
+
+    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+      // pcmpeqd + pshufd + pand.
+      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+      // First cast everything to the right type.
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+      // Do the compare.
+      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+      // Make sure the lower and upper halves are both all-ones.
+      static const int Mask[] = { 1, 0, 3, 2 };
+      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getBitcast(VT, Result);
+    }
+  }
+
+  // Since SSE has no unsigned integer comparisons, we need to flip the sign
+  // bits of the inputs before performing those operations.
+  if (FlipSigns) {
+    MVT EltVT = VT.getVectorElementType();
+    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+                                 VT);
+    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+  }
+
+  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+  // If the logical-not of the result is required, perform that now.
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  if (MinMax)
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+  if (Subus)
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
+                         getZeroVector(VT, Subtarget, DAG, dl));
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+  assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
+         && "SetCC type must be 8-bit or 1-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDLoc dl(Op);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Optimize to BT if possible.
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+  // Lower (trunc (X >> N) to i1) to BT(X, N).
+  if (Op0.hasOneUse() && isNullConstant(Op1) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
+      if (VT == MVT::i1)
+        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+      return NewSetCC;
+    }
+  }
+
+  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
+  // these.
+  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    // If the input is a setcc, then reuse the input setcc or use a new one with
+    // the inverted condition.
+    if (Op0.getOpcode() == X86ISD::SETCC) {
+      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
+      if (!Invert)
+        return Op0;
+
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
+      if (VT == MVT::i1)
+        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+      return SetCC;
+    }
+  }
+  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    if (isOneConstant(Op1)) {
+      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
+      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
+    }
+    if (!isNullConstant(Op1)) {
+      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
+      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
+    }
+  }
+
+  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
+  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
+  if (X86CC == X86::COND_INVALID)
+    return SDValue();
+
+  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
+  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+  SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
+  if (VT == MVT::i1)
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
+  return SetCC;
+}
+
+SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Carry = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(3);
+  SDLoc DL(Op);
+
+  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
+
+  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+  SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
+  if (Op.getSimpleValueType() == MVT::i1)
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+  return SetCC;
+}
+
+/// Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+  unsigned Opc = Op.getOpcode();
+  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+      Opc == X86ISD::SAHF)
+    return true;
+  if (Op.getResNo() == 1 &&
+      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
+       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
+       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
+       Opc == X86ISD::XOR || Opc == X86ISD::AND))
+    return true;
+
+  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
+    return true;
+
+  return false;
+}
+
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+  if (V.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue VOp0 = V.getOperand(0);
+  unsigned InBits = VOp0.getValueSizeInBits();
+  unsigned Bits = V.getValueSizeInBits();
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  bool AddTest = true;
+  SDValue Cond  = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  SDLoc DL(Op);
+  MVT VT = Op1.getSimpleValueType();
+  SDValue CC;
+
+  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+  // are available or VBLENDV if AVX is available.
+  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
+  if (Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
+      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
+    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+    int SSECC = translateX86FSETCC(
+        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+    if (SSECC != 8) {
+      if (Subtarget.hasAVX512()) {
+        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+                                  CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+        return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
+                           DL, VT, Cmp, Op1, Op2);
+      }
+
+      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
+                                DAG.getConstant(SSECC, DL, MVT::i8));
+
+      // If we have AVX, we can use a variable vector select (VBLENDV) instead
+      // of 3 logic instructions for size savings and potentially speed.
+      // Unfortunately, there is no scalar form of VBLENDV.
+
+      // If either operand is a constant, don't try this. We can expect to
+      // optimize away at least one of the logic instructions later in that
+      // case, so that sequence would be faster than a variable blend.
+
+      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
+      // uses XMM0 as the selection register. That may need just as many
+      // instructions as the AND/ANDN/OR sequence due to register moves, so
+      // don't bother.
+
+      if (Subtarget.hasAVX() &&
+          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+        // Convert to vectors, do a VSELECT, and convert back to scalar.
+        // All of the conversions should be optimized away.
+
+        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+        VCmp = DAG.getBitcast(VCmpVT, VCmp);
+
+        SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                           VSel, DAG.getIntPtrConstant(0, DL));
+      }
+      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+    }
+  }
+
+  // AVX512 fallback is to lower selects of scalar floats to masked moves.
+  if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
+      Subtarget.hasAVX512())
+    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+
+  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
+    SDValue Op1Scalar;
+    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
+    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+      Op1Scalar = Op1.getOperand(0);
+    SDValue Op2Scalar;
+    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
+    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+      Op2Scalar = Op2.getOperand(0);
+    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+      SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
+                                      Op1Scalar.getValueType(),
+                                      Cond, Op1Scalar, Op2Scalar);
+      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+        return DAG.getBitcast(VT, newSelect);
+      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+  }
+
+  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
+    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
+    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
+    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
+    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
+                                    Cond, Op1, Op2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
+  }
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    if (SDValue NewCond = LowerSETCC(Cond, DAG))
+      Cond = NewCond;
+  }
+
+  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+  if (Cond.getOpcode() == X86ISD::SETCC &&
+      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+      isNullConstant(Cond.getOperand(1).getOperand(1))) {
+    SDValue Cmp = Cond.getOperand(1);
+
+    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
+
+      SDValue CmpOp0 = Cmp.getOperand(0);
+      // Apply further optimizations for special cases
+      // (select (x != 0), -1, 0) -> neg & sbb
+      // (select (x == 0), 0, -1) -> neg & sbb
+      if (isNullConstant(Y) &&
+            (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+                                    DAG.getConstant(0, DL,
+                                                    CmpOp0.getValueType()),
+                                    CmpOp0);
+          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                    SDValue(Neg.getNode(), 1));
+          return Res;
+        }
+
+      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
+      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+      SDValue Res =   // Res = 0 or -1.
+        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                    DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
+
+      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
+        Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+      if (!isNullConstant(Op2))
+        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+      return Res;
+    }
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    MVT VT = Op.getSimpleValueType();
+
+    bool IllegalFPCMov = false;
+    if (VT.isFloatingPoint() && !VT.isVector() &&
+        !isScalarFPTypeInSSEReg(VT))  // FPStack?
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+        Opc == X86ISD::BT) { // FIXME
+      Cond = Cmp;
+      AddTest = false;
+    }
+  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+              Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
+    AddTest = false;
+  }
+
+  if (AddTest) {
+    // Look past the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+      Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        AddTest = false;
+      }
+    }
+  }
+
+  if (AddTest) {
+    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
+  }
+
+  // a <  b ? -1 :  0 -> RES = ~setcc_carry
+  // a <  b ?  0 : -1 -> RES = setcc_carry
+  // a >= b ? -1 :  0 -> RES = setcc_carry
+  // a >= b ?  0 : -1 -> RES = ~setcc_carry
+  if (Cond.getOpcode() == X86ISD::SUB) {
+    Cond = ConvertCmpIfNecessary(Cond, DAG);
+    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+        (isNullConstant(Op1) || isNullConstant(Op2))) {
+      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                Cond);
+      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
+        return DAG.getNOT(DL, Res, Res.getValueType());
+      return Res;
+    }
+  }
+
+  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+  // widen the cmov and push the truncate through. This avoids introducing a new
+  // branch during isel and doesn't add any extensions.
+  if (Op.getValueType() == MVT::i8 &&
+      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+    if (T1.getValueType() == T2.getValueType() &&
+        // Blacklist CopyFromReg to avoid partial register stalls.
+        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+    }
+  }
+
+  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+  // condition is true.
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
+}
+
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  MVT VTElt = VT.getVectorElementType();
+  MVT InVTElt = InVT.getVectorElementType();
+  SDLoc dl(Op);
+
+  // SKX processor
+  if ((InVTElt == MVT::i1) &&
+      (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget.hasBWI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+
+       ((Subtarget.hasDQI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() >= 32))))
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
+      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+  }
+
+  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+  MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+  SDValue NegOne = DAG.getConstant(
+      APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+  SDValue Zero = DAG.getConstant(
+      APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+
+  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+  if (VT.is512BitVector())
+    return V;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
+}
+
+// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
+// For sign extend this needs to handle all vector sizes and SSE4.1 and
+// non-SSE4.1 targets. For zero extend this should only handle inputs of
+// MVT::v64i8 when BWI is not supported, but AVX512 is.
+static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  SDValue In = Op->getOperand(0);
+  MVT VT = Op->getSimpleValueType(0);
+  MVT InVT = In.getSimpleValueType();
+  assert(VT.getSizeInBits() == InVT.getSizeInBits());
+
+  MVT SVT = VT.getVectorElementType();
+  MVT InSVT = InVT.getVectorElementType();
+  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
+
+  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+    return SDValue();
+  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+    return SDValue();
+  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
+      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
+      !(VT.is512BitVector() && Subtarget.hasAVX512()))
+    return SDValue();
+
+  SDLoc dl(Op);
+
+  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
+  // For 512-bit vectors, we need 128-bits or 256-bits.
+  if (VT.getSizeInBits() > 128) {
+    // Input needs to be at least the same number of elements as output, and
+    // at least 128-bits.
+    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
+    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
+  }
+
+  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
+          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
+
+  // SSE41 targets can use the pmovsx* instructions directly.
+  unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+                      X86ISD::VSEXT : X86ISD::VZEXT;
+  if (Subtarget.hasSSE41())
+    return DAG.getNode(ExtOpc, dl, VT, In);
+
+  // We should only get here for sign extend.
+  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
+         "Unexpected opcode!");
+
+  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
+  SDValue Curr = In;
+  MVT CurrVT = InVT;
+
+  // As SRAI is only available on i16/i32 types, we expand only up to i32
+  // and handle i64 separately.
+  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
+    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
+    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
+    Curr = DAG.getBitcast(CurrVT, Curr);
+  }
+
+  SDValue SignExt = Curr;
+  if (CurrVT != InVT) {
+    unsigned SignExtShift =
+        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
+    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+                          DAG.getConstant(SignExtShift, dl, MVT::i8));
+  }
+
+  if (CurrVT == VT)
+    return SignExt;
+
+  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
+    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+                               DAG.getConstant(31, dl, MVT::i8));
+    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
+    return DAG.getBitcast(VT, Ext);
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
+
+  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+      (VT != MVT::v16i16 || InVT != MVT::v16i8))
+    return SDValue();
+
+  if (Subtarget.hasInt256())
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  // Optimize vectors in AVX mode
+  // Sign extend  v8i16 to v8i32 and
+  //              v4i32 to v4i64
+  //
+  // Divide input vector into two parts
+  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+  // concat the vectors to original VT
+
+  unsigned NumElems = InVT.getVectorNumElements();
+  SDValue Undef = DAG.getUNDEF(InVT);
+
+  SmallVector<int,8> ShufMask1(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask1[i] = i;
+
+  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
+
+  SmallVector<int,8> ShufMask2(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask2[i] = i + NumElems/2;
+
+  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+
+  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
+  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// Lower truncating store. We need a special lowering to vXi1 vectors
+static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
+  SDLoc dl(St);
+  EVT MemVT = St->getMemoryVT();
+  assert(St->isTruncatingStore() && "We only custom truncating store.");
+  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
+         "Expected truncstore of i1 vector");
+
+  SDValue Op = St->getValue();
+  MVT OpVT = Op.getValueType().getSimpleVT();
+  unsigned NumElts = OpVT.getVectorNumElements();
+  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+      NumElts == 16) {
+    // Truncate and store - everything is legal
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
+    if (MemVT.getSizeInBits() < 8)
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                       DAG.getUNDEF(MVT::v8i1), Op,
+                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+
+  // A subset, assume that we have only AVX-512F
+  if (NumElts <= 8) {
+    if (NumElts < 8) {
+      // Extend to 8-elts vector
+      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
+      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
+                        DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
+    }
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
+    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
+                        St->getMemOperand());
+  }
+  // v32i8
+  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
+  // Divide the vector into 2 parts and store each part separately
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                            DAG.getIntPtrConstant(0, dl));
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
+                              St->getMemOperand());
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
+                            DAG.getIntPtrConstant(16, dl));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
+
+  SDValue BasePtrHi =
+    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
+                              BasePtrHi, St->getMemOperand());
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
+}
+
+static SDValue LowerExtended1BitVectorLoad(SDValue Op,
+                                           const X86Subtarget &Subtarget,
+                                           SelectionDAG &DAG) {
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
+         "Expected i1 vector load");
+  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
+    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+  MVT VT = Op.getValueType().getSimpleVT();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+      NumElts == 16) {
+    // Load and extend - everything is legal
+    if (NumElts < 8) {
+      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
+                                 Ld->getBasePtr(),
+                                 Ld->getMemOperand());
+      // Replace chain users with the new chain.
+      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                                   DAG.getIntPtrConstant(0, dl));
+    }
+    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
+                               Ld->getBasePtr(),
+                               Ld->getMemOperand());
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal sign-extend to the desired register.
+    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
+  }
+
+  if (NumElts <= 8) {
+    // A subset, assume that we have only AVX-512F
+    unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
+    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
+                              Ld->getBasePtr(),
+                              Ld->getMemOperand());
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
+    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
+
+    if (NumElts == 8)
+      return DAG.getNode(ExtOpcode, dl, VT, BitVec);
+
+      // we should take care to v4i1 and v2i1
+
+    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
+    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+                        DAG.getIntPtrConstant(0, dl));
+  }
+
+  assert(VT == MVT::v32i8 && "Unexpected extload type");
+
+  SmallVector<SDValue, 2> Chains;
+
+  SDValue BasePtr = Ld->getBasePtr();
+  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+                               Ld->getBasePtr(),
+                               Ld->getMemOperand());
+  Chains.push_back(LoadLo.getValue(1));
+
+  SDValue BasePtrHi =
+    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                DAG.getConstant(2, dl, BasePtr.getValueType()));
+
+  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
+                               BasePtrHi,
+                               Ld->getMemOperand());
+  Chains.push_back(LoadHi.getValue(1));
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+
+  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
+  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
+}
+
+// Lower vector extended loads using a shuffle. If SSSE3 is not available we
+// may emit an illegal shuffle but the expansion is still better than scalar
+// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT RegVT = Op.getSimpleValueType();
+  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+  assert(RegVT.isInteger() &&
+         "We only custom lower integer vector sext loads.");
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  if (MemVT.getScalarType() == MVT::i1)
+    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
+         && "Only anyext and sext are currently implemented.");
+  assert(MemVT != RegVT && "Cannot extend to the same type");
+  assert(MemVT.isVector() && "Must load a vector from memory");
+
+  unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned MemSz = MemVT.getSizeInBits();
+  assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
+    // The only way in which we have a legal 256-bit vector result but not the
+    // integer 256-bit operations needed to directly lower a sextload is if we
+    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+    // a 128-bit vector and a normal sign_extend to 256-bits that should get
+    // correctly legalized. We do this late to allow the canonical form of
+    // sextload to persist throughout the rest of the DAG combiner -- it wants
+    // to fold together any extensions it can, and so will fuse a sign_extend
+    // of an sextload into a sextload targeting a wider value.
+    SDValue Load;
+    if (MemSz == 128) {
+      // Just switch this to a normal load.
+      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+                                       "it must be a legal 128-bit vector "
+                                       "type!");
+      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), Ld->getAlignment(),
+                         Ld->getMemOperand()->getFlags());
+    } else {
+      assert(MemSz < 128 &&
+             "Can't extend a type wider than 128 bits to a 256 bit vector!");
+      // Do an sext load to a 128-bit vector type. We want to use the same
+      // number of elements, but elements half as wide. This will end up being
+      // recursively lowered by this routine, but will succeed as we definitely
+      // have all the necessary features if we're using AVX1.
+      EVT HalfEltVT =
+          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+      Load =
+          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+                         Ld->getMemOperand()->getFlags());
+    }
+
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal sign-extend to the desired register.
+    return DAG.getSExtOrTrunc(Load, dl, RegVT);
+  }
+
+  // All sizes must be a power of two.
+  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+         "Non-power-of-two elements are not custom lowered!");
+
+  // Attempt to load the original value using scalar loads.
+  // Find the largest scalar type that divides the total loaded size.
+  MVT SclrLoadTy = MVT::i8;
+  for (MVT Tp : MVT::integer_valuetypes()) {
+    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+      SclrLoadTy = Tp;
+    }
+  }
+
+  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+      (64 <= MemSz))
+    SclrLoadTy = MVT::f64;
+
+  // Calculate the number of scalar loads that we need to perform
+  // in order to load our vector from memory.
+  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+         "Can only lower sext loads with a single scalar load!");
+
+  unsigned loadRegZize = RegSz;
+  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
+    loadRegZize = 128;
+
+  // Represent our vector as a sequence of elements which are the
+  // largest scalar that we can load.
+  EVT LoadUnitVecVT = EVT::getVectorVT(
+      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
+
+  // Represent the data using the same element type that is stored in
+  // memory. In practice, we ''widen'' MemVT.
+  EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize / MemVT.getScalarSizeInBits());
+
+  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+         "Invalid vector type");
+
+  // We can't shuffle using an illegal type.
+  assert(TLI.isTypeLegal(WideVecVT) &&
+         "We only lower types that form legal widened vector types");
+
+  SmallVector<SDValue, 8> Chains;
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
+                                      TLI.getPointerTy(DAG.getDataLayout()));
+  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    // Perform a single load.
+    SDValue ScalarLoad =
+        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());
+    Chains.push_back(ScalarLoad.getValue(1));
+    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+    // another round of DAGCombining.
+    if (i == 0)
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+    else
+      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                        ScalarLoad, DAG.getIntPtrConstant(i, dl));
+
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+  }
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+  // Bitcast the loaded value to a vector of the original element type, in
+  // the size of the target vector type.
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
+  unsigned SizeRatio = RegSz / MemSz;
+
+  if (Ext == ISD::SEXTLOAD) {
+    // If we have SSE4.1, we can directly emit a VSEXT node.
+    if (Subtarget.hasSSE41()) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+      return Sext;
+    }
+
+    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+    // lanes.
+    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
+
+    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Shuff;
+  }
+
+  // Redistribute the loaded elements into the different locations.
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                       DAG.getUNDEF(WideVecVT), ShuffleVec);
+
+  // Bitcast to the requested type.
+  Shuff = DAG.getBitcast(RegVT, Shuff);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+  return Shuff;
+}
+
+/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
+/// each of which has no other use apart from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  if (Opc != ISD::OR && Opc != ISD::AND)
+    return false;
+  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(0).hasOneUse() &&
+          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(1).hasOneUse());
+}
+
+/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
+/// SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+  if (Op.getOpcode() != ISD::XOR)
+    return false;
+  if (isOneConstant(Op.getOperand(1)))
+    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+           Op.getOperand(0).hasOneUse();
+  return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  bool addTest = true;
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Dest  = Op.getOperand(2);
+  SDLoc dl(Op);
+  SDValue CC;
+  bool Inverted = false;
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    // Check for setcc([su]{add,sub,mul}o == 0).
+    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+        isNullConstant(Cond.getOperand(1)) &&
+        Cond.getOperand(0).getResNo() == 1 &&
+        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+      Inverted = true;
+      Cond = Cond.getOperand(0);
+    } else {
+      if (SDValue NewCond = LowerSETCC(Cond, DAG))
+        Cond = NewCond;
+    }
+  }
+#if 0
+  // FIXME: LowerXALUO doesn't handle these!!
+  else if (Cond.getOpcode() == X86ISD::ADD  ||
+           Cond.getOpcode() == X86ISD::SUB  ||
+           Cond.getOpcode() == X86ISD::SMUL ||
+           Cond.getOpcode() == X86ISD::UMUL)
+    Cond = LowerXALUO(Cond, DAG);
+#endif
+
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+      Cond = Cmp;
+      addTest = false;
+    } else {
+      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+      default: break;
+      case X86::COND_O:
+      case X86::COND_B:
+        // These can only come from an arithmetic instruction with overflow,
+        // e.g. SADDO, UADDO.
+        Cond = Cond.getOperand(1);
+        addTest = false;
+        break;
+      }
+    }
+  }
+  CondOpcode = Cond.getOpcode();
+  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+       Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    // Keep this in sync with LowerXALUO, otherwise we might create redundant
+    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
+    // X86ISD::INC).
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO:
+      if (isOneConstant(RHS)) {
+          X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
+          break;
+        }
+      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO:
+      if (isOneConstant(RHS)) {
+          X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
+          break;
+        }
+      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (Inverted)
+      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+    addTest = false;
+  } else {
+    unsigned CondOpc;
+    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+      SDValue Cmp = Cond.getOperand(0).getOperand(1);
+      if (CondOpc == ISD::OR) {
+        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+        // two branches instead of an explicit OR instruction with a
+        // separate test.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp)) {
+          CC = Cond.getOperand(0).getOperand(0);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = Cond.getOperand(1).getOperand(0);
+          Cond = Cmp;
+          addTest = false;
+        }
+      } else { // ISD::AND
+        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+        // two branches instead of an explicit AND instruction with a
+        // separate test. However, we only do this if this block doesn't
+        // have a fall-through edge, because this requires an explicit
+        // jmp when the condition is false.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp) &&
+            Op.getNode()->hasOneUse()) {
+          X86::CondCode CCode =
+            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+          CCode = X86::GetOppositeBranchCondition(CCode);
+          CC = DAG.getConstant(CCode, dl, MVT::i8);
+          SDNode *User = *Op.getNode()->use_begin();
+          // Look for an unconditional branch following this conditional branch.
+          // We need this because we need to reverse the successors in order
+          // to implement FCMP_OEQ.
+          if (User->getOpcode() == ISD::BR) {
+            SDValue FalseBB = User->getOperand(1);
+            SDNode *NewBR =
+              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+            assert(NewBR == User);
+            (void)NewBR;
+            Dest = FalseBB;
+
+            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                                Chain, Dest, CC, Cmp);
+            X86::CondCode CCode =
+              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+            CCode = X86::GetOppositeBranchCondition(CCode);
+            CC = DAG.getConstant(CCode, dl, MVT::i8);
+            Cond = Cmp;
+            addTest = false;
+          }
+        }
+      }
+    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
+      // It should be transformed during dag combiner except when the condition
+      // is set by a arithmetics with overflow node.
+      X86::CondCode CCode =
+        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      CC = DAG.getConstant(CCode, dl, MVT::i8);
+      Cond = Cond.getOperand(0).getOperand(1);
+      addTest = false;
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+      // For FCMP_OEQ, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_OEQ.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+          Dest = FalseBB;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+        }
+      }
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+      // For FCMP_UNE, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_UNE.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+          Dest = FalseBB;
+        }
+      }
+    }
+  }
+
+  if (addTest) {
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+    // We know the result is compared against zero. Try to match it to BT.
+    if (Cond.hasOneUse()) {
+      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
+    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+    Cond = EmitTest(Cond, X86Cond, dl, DAG);
+  }
+  Cond = ConvertCmpIfNecessary(Cond, DAG);
+  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                     Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool SplitStack = MF.shouldSplitStack();
+  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
+               SplitStack;
+  SDLoc dl(Op);
+
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Node->getValueType(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+  bool Is64Bit = Subtarget.is64Bit();
+  MVT SPTy = getPointerTy(DAG.getDataLayout());
+
+  SDValue Result;
+  if (!Lower) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+                    " not tell us which reg is the stack pointer!");
+
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+    Chain = SP.getValue(1);
+    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+    unsigned StackAlign = TFI.getStackAlignment();
+    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    if (Align > StackAlign)
+      Result = DAG.getNode(ISD::AND, dl, VT, Result,
+                         DAG.getConstant(-(uint64_t)Align, dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
+  } else if (SplitStack) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    if (Is64Bit) {
+      // The 64 bit implementation of segmented stacks needs to clobber both r10
+      // r11. This makes it impossible to use it along with nested parameters.
+      const Function *F = MF.getFunction();
+      for (const auto &A : F->args()) {
+        if (A.hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+      }
+    }
+
+    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+                                DAG.getRegister(Vreg, SPTy));
+  } else {
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
+    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
+
+    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+    unsigned SPReg = RegInfo->getStackRegister();
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+    Chain = SP.getValue(1);
+
+    if (Align) {
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align, dl, VT));
+      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+    }
+
+    Result = SP;
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+  SDValue Ops[2] = {Result, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto PtrVT = getPointerTy(MF.getDataLayout());
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SDLoc DL(Op);
+
+  if (!Subtarget.is64Bit() ||
+      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV));
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  SmallVector<SDValue, 8> MemOps;
+  SDValue FIN = Op.getOperand(1);
+  // Store gp_offset
+  SDValue Store = DAG.getStore(
+      Op.getOperand(0), DL,
+      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
+      MachinePointerInfo(SV));
+  MemOps.push_back(Store);
+
+  // Store fp_offset
+  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
+  Store = DAG.getStore(
+      Op.getOperand(0), DL,
+      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
+      MachinePointerInfo(SV, 4));
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area
+  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
+  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  Store =
+      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area.
+  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
+      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
+  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
+  Store = DAG.getStore(
+      Op.getOperand(0), DL, RSFIN, FIN,
+      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
+  MemOps.push_back(Store);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget.is64Bit() &&
+         "LowerVAARG only handles 64-bit va_arg!");
+  assert(Op.getNumOperands() == 4);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
+    // The Win64 ABI uses char* instead of a structure.
+    return DAG.expandVAArg(Op.getNode());
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue SrcPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  unsigned Align = Op.getConstantOperandVal(3);
+  SDLoc dl(Op);
+
+  EVT ArgVT = Op.getNode()->getValueType(0);
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
+  uint8_t ArgMode;
+
+  // Decide which area this value should be read from.
+  // TODO: Implement the AMD64 ABI in its entirety. This simple
+  // selection mechanism works only for the basic types.
+  if (ArgVT == MVT::f80) {
+    llvm_unreachable("va_arg for f80 not yet implemented");
+  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
+  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
+    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
+  } else {
+    llvm_unreachable("Unhandled argument type in LowerVAARG");
+  }
+
+  if (ArgMode == 2) {
+    // Sanity Check: Make sure using fp_offset makes sense.
+    assert(!Subtarget.useSoftFloat() &&
+           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+           Subtarget.hasSSE1());
+  }
+
+  // Insert VAARG_64 node into the DAG
+  // VAARG_64 returns two values: Variable Argument Address, Chain
+  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
+                       DAG.getConstant(ArgMode, dl, MVT::i8),
+                       DAG.getConstant(Align, dl, MVT::i32)};
+  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
+  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
+                                          VTs, InstOps, MVT::i64,
+                                          MachinePointerInfo(SV),
+                                          /*Align=*/0,
+                                          /*Volatile=*/false,
+                                          /*ReadMem=*/true,
+                                          /*WriteMem=*/true);
+  Chain = VAARG.getValue(1);
+
+  // Load the next argument and return it
+  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
+  // where a va_list is still an i8*.
+  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
+  if (Subtarget.isCallingConvWin64(
+        DAG.getMachineFunction().getFunction()->getCallingConv()))
+    // Probably a Win64 va_copy.
+    return DAG.expandVACopy(Op.getNode());
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue DstPtr = Op.getOperand(1);
+  SDValue SrcPtr = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  SDLoc DL(Op);
+
+  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
+                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
+                       false, false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+/// Handle vector element shifts where the shift amount is a constant.
+/// Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
+                                          SDValue SrcOp, uint64_t ShiftAmt,
+                                          SelectionDAG &DAG) {
+  MVT ElementType = VT.getVectorElementType();
+
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
+  // Check for ShiftAmt >= element width
+  if (ShiftAmt >= ElementType.getSizeInBits()) {
+    if (Opc == X86ISD::VSRAI)
+      ShiftAmt = ElementType.getSizeInBits() - 1;
+    else
+      return DAG.getConstant(0, dl, VT);
+  }
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+         && "Unknown target vector shift-by-constant node");
+
+  // Fold this packed vector shift into a build vector if SrcOp is a
+  // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
+  if (VT == SrcOp.getSimpleValueType() &&
+      ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+    SmallVector<SDValue, 8> Elts;
+    unsigned NumElts = SrcOp->getNumOperands();
+    ConstantSDNode *ND;
+
+    switch(Opc) {
+    default: llvm_unreachable("Unknown opcode!");
+    case X86ISD::VSHLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->isUndef()) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
+      }
+      break;
+    case X86ISD::VSRLI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->isUndef()) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
+      }
+      break;
+    case X86ISD::VSRAI:
+      for (unsigned i=0; i!=NumElts; ++i) {
+        SDValue CurrentOp = SrcOp->getOperand(i);
+        if (CurrentOp->isUndef()) {
+          Elts.push_back(CurrentOp);
+          continue;
+        }
+        ND = cast<ConstantSDNode>(CurrentOp);
+        const APInt &C = ND->getAPIntValue();
+        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
+      }
+      break;
+    }
+
+    return DAG.getBuildVector(VT, dl, Elts);
+  }
+
+  return DAG.getNode(Opc, dl, VT, SrcOp,
+                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
+}
+
+/// Handle vector element shifts where the shift amount may or may not be a
+/// constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
+                                   SDValue SrcOp, SDValue ShAmt,
+                                   SelectionDAG &DAG) {
+  MVT SVT = ShAmt.getSimpleValueType();
+  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
+
+  // Catch shift-by-constant.
+  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+                                      CShAmt->getZExtValue(), DAG);
+
+  // Change opcode to non-immediate version
+  switch (Opc) {
+    default: llvm_unreachable("Unknown target vector shift node");
+    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+  }
+
+  const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+      ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+    // Let the shuffle legalizer expand this shift amount node.
+    SDValue Op0 = ShAmt.getOperand(0);
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+    ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
+  } else {
+    // Need to build a vector containing shift amount.
+    // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
+    SmallVector<SDValue, 4> ShOps;
+    ShOps.push_back(ShAmt);
+    if (SVT == MVT::i32) {
+      ShOps.push_back(DAG.getConstant(0, dl, SVT));
+      ShOps.push_back(DAG.getUNDEF(SVT));
+    }
+    ShOps.push_back(DAG.getUNDEF(SVT));
+
+    MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+    ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
+  }
+
+  // The return type has to be a 128-bit type with the same element
+  // type as the input type.
+  MVT EltVT = VT.getVectorElementType();
+  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+  ShAmt = DAG.getBitcast(ShVT, ShAmt);
+  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+/// \brief Return Mask with the necessary casting or extending
+/// for \p Mask according to \p MaskVT when lowering masking intrinsics
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl) {
+
+  if (isAllOnesConstant(Mask))
+    return DAG.getTargetConstant(1, dl, MaskVT);
+  if (X86::isZeroNode(Mask))
+    return DAG.getTargetConstant(0, dl, MaskVT);
+
+  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+    // Mask should be extended
+    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
+                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
+  }
+
+  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
+    if (MaskVT == MVT::v64i1) {
+      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+      // In case 32bit mode, bitcast i64 is illegal, extend/split it.
+      SDValue Lo, Hi;
+      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                          DAG.getConstant(0, dl, MVT::i32));
+      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+                          DAG.getConstant(1, dl, MVT::i32));
+
+      Lo = DAG.getBitcast(MVT::v32i1, Lo);
+      Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+    } else {
+      // MaskVT require < 64bit. Truncate mask (should succeed in any case),
+      // and bitcast.
+      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+      return DAG.getBitcast(MaskVT,
+                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+    }
+
+  } else {
+    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+                                     Mask.getSimpleValueType().getSizeInBits());
+    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+    // are extracted by EXTRACT_SUBVECTOR.
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                       DAG.getBitcast(BitcastVT, Mask),
+                       DAG.getIntPtrConstant(0, dl));
+  }
+}
+
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// necessary casting or extending for \p Mask when lowering masking intrinsics
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+                  SDValue PreservedSrc,
+                  const X86Subtarget &Subtarget,
+                  SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+  unsigned OpcodeSelect = ISD::VSELECT;
+  SDLoc dl(Op);
+
+  if (isAllOnesConstant(Mask))
+    return Op;
+
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case X86ISD::PCMPEQM:
+  case X86ISD::PCMPGTM:
+  case X86ISD::CMPM:
+  case X86ISD::CMPMU:
+    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+  case X86ISD::VFPCLASS:
+    case X86ISD::VFPCLASSS:
+    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+  case X86ISD::VTRUNC:
+  case X86ISD::VTRUNCS:
+  case X86ISD::VTRUNCUS:
+  case X86ISD::CVTPS2PH:
+    // We can't use ISD::VSELECT here because it is not always "Legal"
+    // for the destination type. For example vpmovqb require only AVX512
+    // and vselect that can operate on byte element type require BWI
+    OpcodeSelect = X86ISD::SELECT;
+    break;
+  }
+  if (PreservedSrc.isUndef())
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
+}
+
+/// \brief Creates an SDNode for a predicated scalar operation.
+/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
+/// The mask is coming as MVT::i8 and it should be truncated
+/// to MVT::i1 while lowering masking intrinsics.
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
+/// "X86select" instead of "vselect". We just can't create the "vselect" node
+/// for a scalar instruction.
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  if (isAllOnesConstant(Mask))
+    return Op;
+
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  // The mask should be of type MVT::i1
+  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+  if (Op.getOpcode() == X86ISD::FSETCCM ||
+      Op.getOpcode() == X86ISD::FSETCCM_RND)
+    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+  if (Op.getOpcode() == X86ISD::VFPCLASS ||
+      Op.getOpcode() == X86ISD::VFPCLASSS)
+    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
+
+  if (PreservedSrc.isUndef())
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
+}
+
+static int getSEHRegistrationNodeSize(const Function *Fn) {
+  if (!Fn->hasPersonalityFn())
+    report_fatal_error(
+        "querying registration node size for function without personality");
+  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
+  // WinEHStatePass for the full struct definition.
+  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
+  case EHPersonality::MSVC_X86SEH: return 24;
+  case EHPersonality::MSVC_CXX: return 16;
+  default: break;
+  }
+  report_fatal_error(
+      "can only recover FP for 32-bit MSVC EH personality functions");
+}
+
+/// When the MSVC runtime transfers control to us, either to an outlined
+/// function or when returning to a parent frame after catching an exception, we
+/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
+/// Here's the math:
+///   RegNodeBase = EntryEBP - RegNodeSize
+///   ParentFP = RegNodeBase - ParentFrameOffset
+/// Subtracting RegNodeSize takes us to the offset of the registration node, and
+/// subtracting the offset (negative on x86) takes us back to the parent FP.
+static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
+                                   SDValue EntryEBP) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc dl;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+
+  // It's possible that the parent function no longer has a personality function
+  // if the exceptional code was optimized away, in which case we just return
+  // the incoming EBP.
+  if (!Fn->hasPersonalityFn())
+    return EntryEBP;
+
+  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
+  // registration, or the .set_setframe offset.
+  MCSymbol *OffsetSym =
+      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
+          GlobalValue::getRealLinkageName(Fn->getName()));
+  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
+  SDValue ParentFrameOffset =
+      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+
+  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
+  // prologue to RBP in the parent function.
+  const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  if (Subtarget.is64Bit())
+    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
+  // RegNodeBase = EntryEBP - RegNodeSize
+  // ParentFP = RegNodeBase - ParentFrameOffset
+  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
+                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
+  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
+  auto isRoundModeCurDirection = [](SDValue Rnd) {
+    if (!isa<ConstantSDNode>(Rnd))
+      return false;
+
+    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
+  };
+
+  SDLoc dl(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  MVT VT = Op.getSimpleValueType();
+  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+  if (IntrData) {
+    switch(IntrData->Type) {
+    case INTR_TYPE_1OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+    case INTR_TYPE_2OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2));
+    case INTR_TYPE_3OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_4OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
+    case INTR_TYPE_1OP_MASK_RM: {
+      SDValue Src = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      SDValue RoundingMode;
+      // We always add rounding mode to the Node.
+      // If the rounding mode is not specified, we add the
+      // "current direction" mode.
+      if (Op.getNumOperands() == 4)
+        RoundingMode =
+          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      else
+        RoundingMode = Op.getOperand(4);
+      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+                                              RoundingMode),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_1OP_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      // We add rounding mode to the Node when
+      //   - RM Opcode is specified and
+      //   - RM is not "current direction".
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(4);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_SCALAR_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue passThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+                                  Mask, passThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src0 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // There are 2 kinds of intrinsics in this group:
+      // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
+      // (2) With rounding mode and sae - 7 operands.
+      if (Op.getNumOperands() == 6) {
+        SDValue Sae  = Op.getOperand(5);
+        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                                Sae),
+                                    Mask, Src0, Subtarget, DAG);
+      }
+      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+      SDValue RoundingMode  = Op.getOperand(5);
+      SDValue Sae  = Op.getOperand(6);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                              RoundingMode, Sae),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK:
+    case INTR_TYPE_2OP_IMM8_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+
+      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      // TODO: Intrinsics should have fast-math-flags to propagate.
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      // We specify 2 possible modes for intrinsics, with/without rounding
+      // modes.
+      // First, we check if the intrinsic have rounding mode (6 operands),
+      // if not, we set rounding mode to "current".
+      SDValue Rnd;
+      if (Op.getNumOperands() == 6)
+        Rnd = Op.getOperand(5);
+      else
+        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Rnd),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      SDValue Sae  = Op.getOperand(6);
+
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+                                              Src2, Src3, Sae),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_3OP_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Imm = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      // We specify 2 possible modes for intrinsics, with/without rounding
+      // modes.
+      // First, we check if the intrinsic have rounding mode (7 operands),
+      // if not, we set rounding mode to "current".
+      SDValue Rnd;
+      if (Op.getNumOperands() == 7)
+        Rnd = Op.getOperand(6);
+      else
+        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Imm, Rnd),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_3OP_IMM8_MASK:
+    case INTR_TYPE_3OP_MASK:
+    case INSERT_SUBVEC: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+
+      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
+        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
+      else if (IntrData->Type == INSERT_SUBVEC) {
+        // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
+        assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
+        unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
+        Imm *= Src2.getSimpleValueType().getVectorNumElements();
+        Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
+      }
+
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(6);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case VPERM_2OP_MASK : {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+
+      // Swap Src1 and Src2 in the node creation
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case VPERM_3OP_MASKZ:
+    case VPERM_3OP_MASK:{
+      MVT VT = Op.getSimpleValueType();
+      // Src2 is the PassThru
+      SDValue Src1 = Op.getOperand(1);
+      // PassThru needs to be the same type as the destination in order
+      // to pattern match correctly.
+      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == VPERM_3OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        PassThru = Src2;
+
+      // Swap Src1 and Src2 in the node creation
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+                                              dl, Op.getValueType(),
+                                              Src2, Src1, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case FMA_OP_MASK3:
+    case FMA_OP_MASKZ:
+    case FMA_OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == FMA_OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else if (IntrData->Type == FMA_OP_MASK3)
+        PassThru = Src3;
+      else
+        PassThru = Src1;
+
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                                  dl, Op.getValueType(),
+                                                  Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+                                              dl, Op.getValueType(),
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case FMA_OP_SCALAR_MASK:
+    case FMA_OP_SCALAR_MASK3:
+    case FMA_OP_SCALAR_MASKZ: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = SDValue();
+
+      // set PassThru element
+      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
+        PassThru = Src3;
+      else
+        PassThru = Src1;
+
+      SDValue Rnd = Op.getOperand(5);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
+                                              Op.getValueType(), Src1, Src2,
+                                              Src3, Rnd),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case TERLOG_OP_MASK:
+    case TERLOG_OP_MASKZ: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
+      SDValue Mask = Op.getOperand(5);
+      MVT VT = Op.getSimpleValueType();
+      SDValue PassThru = Src1;
+      // Set PassThru element.
+      if (IntrData->Type == TERLOG_OP_MASKZ)
+        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3, Src4),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case CVTPD2PS:
+      // ISD::FP_ROUND has a second argument that indicates if the truncation
+      // does not change the value. Set it to 0 since it can change.
+      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+                         DAG.getIntPtrConstant(0, dl));
+    case CVTPD2PS_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      // We add rounding mode to the Node when
+      //   - RM Opcode is specified and
+      //   - RM is not "current direction".
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(4);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
+      // ISD::FP_ROUND has a second argument that indicates if the truncation
+      // does not change the value. Set it to 0 since it can change.
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+                                              DAG.getIntPtrConstant(0, dl)),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case FPCLASS: {
+      // FPclass intrinsics with mask
+       SDValue Src1 = Op.getOperand(1);
+       MVT VT = Src1.getSimpleValueType();
+       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+       SDValue Imm = Op.getOperand(2);
+       SDValue Mask = Op.getOperand(3);
+       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+                                     Mask.getSimpleValueType().getSizeInBits());
+       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+       SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+                                                 DAG.getTargetConstant(0, dl, MaskVT),
+                                                 Subtarget, DAG);
+       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                 DAG.getUNDEF(BitcastVT), FPclassMask,
+                                 DAG.getIntPtrConstant(0, dl));
+       return DAG.getBitcast(Op.getValueType(), Res);
+    }
+    case FPCLASSS: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Imm = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
+        DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+    }
+    case CMP_MASK:
+    case CMP_MASK_CC: {
+      // Comparison intrinsics with masks.
+      // Example of transformation:
+      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+      // (i8 (bitcast
+      //   (v8i1 (insert_subvector undef,
+      //           (v2i1 (and (PCMPEQM %a, %b),
+      //                      (extract_subvector
+      //                         (v8i1 (bitcast %mask)), 0))), 0))))
+      MVT VT = Op.getOperand(1).getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+                                       Mask.getSimpleValueType().getSizeInBits());
+      SDValue Cmp;
+      if (IntrData->Type == CMP_MASK_CC) {
+        SDValue CC = Op.getOperand(3);
+        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+        // We specify 2 possible opcodes for intrinsics with rounding modes.
+        // First, we check if the intrinsic may have non-default rounding mode,
+        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+        if (IntrData->Opc1 != 0) {
+          SDValue Rnd = Op.getOperand(5);
+          if (!isRoundModeCurDirection(Rnd))
+            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+                              Op.getOperand(2), CC, Rnd);
+        }
+        //default rounding mode
+        if(!Cmp.getNode())
+            Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                              Op.getOperand(2), CC);
+
+      } else {
+        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                          Op.getOperand(2));
+      }
+      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+                                             DAG.getTargetConstant(0, dl,
+                                                                   MaskVT),
+                                             Subtarget, DAG);
+      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                DAG.getUNDEF(BitcastVT), CmpMask,
+                                DAG.getIntPtrConstant(0, dl));
+      return DAG.getBitcast(Op.getValueType(), Res);
+    }
+    case CMP_MASK_SCALAR_CC: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+      SDValue Mask = Op.getOperand(4);
+
+      SDValue Cmp;
+      if (IntrData->Opc1 != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+      }
+      //default rounding mode
+      if(!Cmp.getNode())
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+                                             DAG.getTargetConstant(0, dl,
+                                                                   MVT::i1),
+                                             Subtarget, DAG);
+
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+    }
+    case COMI: { // Comparison intrinsics
+      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
+      SDValue SetCC;
+      switch (CC) {
+      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
+        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
+        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
+        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
+        break;
+      }
+      case ISD::SETNE: { // (ZF = 1 or PF = 1)
+        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
+        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
+        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
+        break;
+      }
+      case ISD::SETGT: // (CF = 0 and ZF = 0)
+        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
+        break;
+      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
+        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
+        break;
+      }
+      case ISD::SETGE: // CF = 0
+        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
+        break;
+      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
+        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
+        break;
+      default:
+        llvm_unreachable("Unexpected illegal condition!");
+      }
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+    }
+    case COMI_RM: { // Comparison intrinsics with Sae
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+      SDValue Sae = Op.getOperand(4);
+
+      SDValue FCmp;
+      if (isRoundModeCurDirection(Sae))
+        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
+                                  DAG.getConstant(CondVal, dl, MVT::i8));
+      else
+        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
+                                  DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+      // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
+      return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+    }
+    case VSHIFT:
+      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+                                 Op.getOperand(1), Op.getOperand(2), DAG);
+    case COMPRESS_EXPAND_IN_REG: {
+      SDValue Mask = Op.getOperand(3);
+      SDValue DataToCompress = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      if (isAllOnesConstant(Mask)) // return data as is
+        return Op.getOperand(1);
+
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              DataToCompress),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case BROADCASTM: {
+      SDValue Mask = Op.getOperand(1);
+      MVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                    Mask.getSimpleValueType().getSizeInBits());
+      Mask = DAG.getBitcast(MaskVT, Mask);
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
+    }
+    case KUNPCK: {
+      MVT VT = Op.getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+      // Arguments should be swapped.
+      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+                                Src2, Src1);
+      return DAG.getBitcast(VT, Res);
+    }
+    case FIXUPIMMS:
+    case FIXUPIMMS_MASKZ:
+    case FIXUPIMM:
+    case FIXUPIMM_MASKZ:{
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue Imm = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
+                                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
+      // We specify 2 possible modes for intrinsics, with/without rounding
+      // modes.
+      // First, we check if the intrinsic have rounding mode (7 operands),
+      // if not, we set rounding mode to "current".
+      SDValue Rnd;
+      if (Op.getNumOperands() == 7)
+        Rnd = Op.getOperand(6);
+      else
+        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
+        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                                Src1, Src2, Src3, Imm, Rnd),
+                                    Mask, Passthru, Subtarget, DAG);
+      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
+        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                       Src1, Src2, Src3, Imm, Rnd),
+                                    Mask, Passthru, Subtarget, DAG);
+    }
+    case CONVERT_TO_MASK: {
+      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
+                                    Op.getOperand(1));
+      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                DAG.getUNDEF(BitcastVT), CvtMask,
+                                DAG.getIntPtrConstant(0, dl));
+      return DAG.getBitcast(Op.getValueType(), Res);
+    }
+    case CONVERT_MASK_TO_VEC: {
+      SDValue Mask = Op.getOperand(1);
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+      return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
+    }
+    case BRCST_SUBVEC_TO_VEC: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Passthru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      EVT resVT = Passthru.getValueType();
+      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
+                                       DAG.getUNDEF(resVT), Src,
+                                       DAG.getIntPtrConstant(0, dl));
+      SDValue immVal;
+      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
+        immVal = DAG.getConstant(0x44, dl, MVT::i8);
+      else
+        immVal = DAG.getConstant(0, dl, MVT::i8);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              subVec, subVec, immVal),
+                                  Mask, Passthru, Subtarget, DAG);
+    }
+    case BRCST32x2_TO_VEC: {
+      SDValue Src = Op.getOperand(1);
+      SDValue PassThru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+
+      assert((VT.getScalarType() == MVT::i32 ||
+              VT.getScalarType() == MVT::f32) && "Unexpected type!");
+      //bitcast Src to packed 64
+      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
+      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
+      Src = DAG.getBitcast(BitcastVT, Src);
+
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    default:
+      break;
+    }
+  }
+
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    // Operands intentionally swapped. Mask is last operand to intrinsic,
+    // but second operand for node/instruction.
+    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(1));
+
+  // ptest and testp intrinsics. The intrinsic these come from are designed to
+  // return an integer value, not just an instruction so lower it to the ptest
+  // or testp pattern and a setcc for the result.
+  case Intrinsic::x86_sse41_ptestz:
+  case Intrinsic::x86_sse41_ptestc:
+  case Intrinsic::x86_sse41_ptestnzc:
+  case Intrinsic::x86_avx_ptestz_256:
+  case Intrinsic::x86_avx_ptestc_256:
+  case Intrinsic::x86_avx_ptestnzc_256:
+  case Intrinsic::x86_avx_vtestz_ps:
+  case Intrinsic::x86_avx_vtestc_ps:
+  case Intrinsic::x86_avx_vtestnzc_ps:
+  case Intrinsic::x86_avx_vtestz_pd:
+  case Intrinsic::x86_avx_vtestc_pd:
+  case Intrinsic::x86_avx_vtestnzc_pd:
+  case Intrinsic::x86_avx_vtestz_ps_256:
+  case Intrinsic::x86_avx_vtestc_ps_256:
+  case Intrinsic::x86_avx_vtestnzc_ps_256:
+  case Intrinsic::x86_avx_vtestz_pd_256:
+  case Intrinsic::x86_avx_vtestc_pd_256:
+  case Intrinsic::x86_avx_vtestnzc_pd_256: {
+    bool IsTestPacked = false;
+    X86::CondCode X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+    case Intrinsic::x86_avx_vtestz_ps:
+    case Intrinsic::x86_avx_vtestz_pd:
+    case Intrinsic::x86_avx_vtestz_ps_256:
+    case Intrinsic::x86_avx_vtestz_pd_256:
+      IsTestPacked = true;
+      LLVM_FALLTHROUGH;
+    case Intrinsic::x86_sse41_ptestz:
+    case Intrinsic::x86_avx_ptestz_256:
+      // ZF = 1
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_avx_vtestc_ps:
+    case Intrinsic::x86_avx_vtestc_pd:
+    case Intrinsic::x86_avx_vtestc_ps_256:
+    case Intrinsic::x86_avx_vtestc_pd_256:
+      IsTestPacked = true;
+      LLVM_FALLTHROUGH;
+    case Intrinsic::x86_sse41_ptestc:
+    case Intrinsic::x86_avx_ptestc_256:
+      // CF = 1
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_avx_vtestnzc_ps:
+    case Intrinsic::x86_avx_vtestnzc_pd:
+    case Intrinsic::x86_avx_vtestnzc_ps_256:
+    case Intrinsic::x86_avx_vtestnzc_pd_256:
+      IsTestPacked = true;
+      LLVM_FALLTHROUGH;
+    case Intrinsic::x86_sse41_ptestnzc:
+    case Intrinsic::x86_avx_ptestnzc_256:
+      // ZF and CF = 0
+      X86CC = X86::COND_A;
+      break;
+    }
+
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+  case Intrinsic::x86_avx512_kortestz_w:
+  case Intrinsic::x86_avx512_kortestc_w: {
+    X86::CondCode X86CC =
+        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistria128:
+  case Intrinsic::x86_sse42_pcmpestria128:
+  case Intrinsic::x86_sse42_pcmpistric128:
+  case Intrinsic::x86_sse42_pcmpestric128:
+  case Intrinsic::x86_sse42_pcmpistrio128:
+  case Intrinsic::x86_sse42_pcmpestrio128:
+  case Intrinsic::x86_sse42_pcmpistris128:
+  case Intrinsic::x86_sse42_pcmpestris128:
+  case Intrinsic::x86_sse42_pcmpistriz128:
+  case Intrinsic::x86_sse42_pcmpestriz128: {
+    unsigned Opcode;
+    X86::CondCode X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse42_pcmpistria128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpestria128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpistric128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpestric128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpistrio128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpestrio128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpistris128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpestris128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpistriz128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_sse42_pcmpestriz128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_E;
+      break;
+    }
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
+    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistri128:
+  case Intrinsic::x86_sse42_pcmpestri128: {
+    unsigned Opcode;
+    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+      Opcode = X86ISD::PCMPISTRI;
+    else
+      Opcode = X86ISD::PCMPESTRI;
+
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    return DAG.getNode(Opcode, dl, VTs, NewOps);
+  }
+
+  case Intrinsic::eh_sjlj_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+    auto &Context = MF.getMMI().getContext();
+    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+                                            Twine(MF.getFunctionNumber()));
+    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
+  }
+
+  case Intrinsic::x86_seh_lsda: {
+    // Compute the symbol for the LSDA. We know it'll get emitted later.
+    MachineFunction &MF = DAG.getMachineFunction();
+    SDValue Op1 = Op.getOperand(1);
+    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
+    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+        GlobalValue::getRealLinkageName(Fn->getName()));
+
+    // Generate a simple absolute symbol reference. This intrinsic is only
+    // supported on 32-bit Windows, which isn't PIC.
+    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
+    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
+  }
+
+  case Intrinsic::x86_seh_recoverfp: {
+    SDValue FnOp = Op.getOperand(1);
+    SDValue IncomingFPOp = Op.getOperand(2);
+    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
+    if (!Fn)
+      report_fatal_error(
+          "llvm.x86.seh.recoverfp must take a function as the first argument");
+    return recoverFramePointer(DAG, Fn, IncomingFPOp);
+  }
+
+  case Intrinsic::localaddress: {
+    // Returns one of the stack, base, or frame pointer registers, depending on
+    // which is used to reference local variables.
+    MachineFunction &MF = DAG.getMachineFunction();
+    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+    unsigned Reg;
+    if (RegInfo->hasBasePointer(MF))
+      Reg = RegInfo->getBaseRegister();
+    else // This function handles the SP or FP case.
+      Reg = RegInfo->getPtrSizedFrameRegister(MF);
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+  }
+  }
+}
+
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  auto *C = cast<ConstantSDNode>(ScaleOp);
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+  MVT MaskVT = MVT::getVectorVT(MVT::i1,
+                             Index.getSimpleValueType().getVectorNumElements());
+
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (Src.isUndef())
+    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain,
+                               const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  auto *C = cast<ConstantSDNode>(ScaleOp);
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  MVT MaskVT = MVT::getVectorVT(MVT::i1,
+                             Index.getSimpleValueType().getVectorNumElements());
+
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Mask, SDValue Base, SDValue Index,
+                               SDValue ScaleOp, SDValue Chain,
+                               const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  auto *C = cast<ConstantSDNode>(ScaleOp);
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  MVT MaskVT =
+    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+  //SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
+/// Handles the lowering of builtin intrinsic that return the value
+/// of the extended control register.
+static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       SmallVectorImpl<SDValue> &Results) {
+  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue LO, HI;
+
+  // The ECX register is used to select the index of the XCR register to
+  // return.
+  SDValue Chain =
+      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
+  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
+  Chain = SDValue(N1, 0);
+
+  // Reads the content of XCR and returns it in registers EDX:EAX.
+  if (Subtarget.is64Bit()) {
+    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  Chain = HI.getValue(1);
+
+  if (Subtarget.is64Bit()) {
+    // Merge the two 32-bit values into a 64-bit one..
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, DL, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+/// Handles the lowering of builtin intrinsics that read performance monitor
+/// counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
+                                      SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget,
+                                      SmallVectorImpl<SDValue> &Results) {
+  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue LO, HI;
+
+  // The ECX register is used to select the index of the performance counter
+  // to read.
+  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+                                   N->getOperand(2));
+  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+  // Reads the content of a 64-bit performance counter and returns it in the
+  // registers EDX:EAX.
+  if (Subtarget.is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  Chain = HI.getValue(1);
+
+  if (Subtarget.is64Bit()) {
+    // The EAX register is loaded with the low-order 32 bits. The EDX register
+    // is loaded with the supported high-order bits of the counter.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, DL, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+/// Handles the lowering of builtin intrinsics that read the time stamp counter
+/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
+/// READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
+                                    SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget,
+                                    SmallVectorImpl<SDValue> &Results) {
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+  SDValue LO, HI;
+
+  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+  // and the EAX register is loaded with the low-order 32 bits.
+  if (Subtarget.is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  SDValue Chain = HI.getValue(1);
+
+  if (Opcode == X86ISD::RDTSCP_DAG) {
+    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+    // the ECX register. Add 'ecx' explicitly to the chain.
+    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+                                     HI.getValue(2));
+    // Explicitly store the content of ECX at the location passed in input
+    // to the 'rdtscp' intrinsic.
+    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+                         MachinePointerInfo());
+  }
+
+  if (Subtarget.is64Bit()) {
+    // The EDX register is loaded with the high-order 32 bits of the MSR, and
+    // the EAX register is loaded with the low-order 32 bits.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, DL, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
+  SmallVector<SDValue, 2> Results;
+  SDLoc DL(Op);
+  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                          Results);
+  return DAG.getMergeValues(Results, DL);
+}
+
+static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = Op.getOperand(0);
+  SDValue RegNode = Op.getOperand(2);
+  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+  if (!EHInfo)
+    report_fatal_error("EH registrations only live in functions using WinEH");
+
+  // Cast the operand to an alloca, and remember the frame index.
+  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
+  if (!FINode)
+    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
+  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
+
+  // Return the chain operand without making any DAG nodes.
+  return Chain;
+}
+
+static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = Op.getOperand(0);
+  SDValue EHGuard = Op.getOperand(2);
+  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+  if (!EHInfo)
+    report_fatal_error("EHGuard only live in functions using WinEH");
+
+  // Cast the operand to an alloca, and remember the frame index.
+  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
+  if (!FINode)
+    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
+  EHInfo->EHGuardFrameIndex = FINode->getIndex();
+
+  // Return the chain operand without making any DAG nodes.
+  return Chain;
+}
+
+/// Emit Truncating Store with signed or unsigned saturation.
+static SDValue
+EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
+                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
+                SelectionDAG &DAG) {
+
+  SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
+  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  return SignedSat ?
+    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
+    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+}
+
+/// Emit Masked Truncating Store with signed or unsigned saturation.
+static SDValue
+EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
+                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
+                      MachineMemOperand *MMO, SelectionDAG &DAG) {
+
+  SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Val };
+  return SignedSat ?
+    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
+    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+  if (!IntrData) {
+    if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+      return MarkEHRegistrationNode(Op, DAG);
+    if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+      return MarkEHGuard(Op, DAG);
+    if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+        IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+      // We need a frame pointer because this will get lowered to a PUSH/POP
+      // sequence.
+      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+      MFI.setHasCopyImplyingStackAdjustment(true);
+      // Don't do anything here, we will expand these intrinsics out later
+      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+      return SDValue();
+    }
+    return SDValue();
+  }
+
+  SDLoc dl(Op);
+  switch(IntrData->Type) {
+  default: llvm_unreachable("Unknown Intrinsic Type");
+  case RDSEED:
+  case RDRAND: {
+    // Emit the node with the right value type.
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+    // Otherwise return the value from Rand, which is always 0, casted to i32.
+    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+                      DAG.getConstant(1, dl, Op->getValueType(1)),
+                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
+                      SDValue(Result.getNode(), 1) };
+    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
+                                  Ops);
+
+    // Return { result, isValid, chain }.
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+                       SDValue(Result.getNode(), 2));
+  }
+  case GATHER: {
+  //gather(v1, mask, index, base, scale);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src   = Op.getOperand(2);
+    SDValue Base  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Mask  = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+                         Chain, Subtarget);
+  }
+  case SCATTER: {
+  //scatter(base, mask, index, v1, scale);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Src   = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+                          Scale, Chain, Subtarget);
+  }
+  case PREFETCH: {
+    SDValue Hint = Op.getOperand(6);
+    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+    assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Mask  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
+                           Subtarget);
+  }
+  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+  case RDTSC: {
+    SmallVector<SDValue, 2> Results;
+    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
+                            Results);
+    return DAG.getMergeValues(Results, dl);
+  }
+  // Read Performance Monitoring Counters.
+  case RDPMC: {
+    SmallVector<SDValue, 2> Results;
+    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
+  // Get Extended Control Register.
+  case XGETBV: {
+    SmallVector<SDValue, 2> Results;
+    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
+  // XTEST intrinsics.
+  case XTEST: {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
+
+    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
+    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+                       Ret, SDValue(InTrans.getNode(), 1));
+  }
+  // ADC/ADCX/SBB
+  case ADX: {
+    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
+                                DAG.getConstant(-1, dl, MVT::i8));
+    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
+                              Op.getOperand(4), GenCF.getValue(1));
+    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
+                                 Op.getOperand(5), MachinePointerInfo());
+    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
+    SDValue Results[] = { SetCC, Store };
+    return DAG.getMergeValues(Results, dl);
+  }
+  case COMPRESS_TO_MEM: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue DataToCompress = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+    MVT VT = DataToCompress.getSimpleValueType();
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+    if (isAllOnesConstant(Mask)) // return just a store
+      return DAG.getStore(Chain, dl, DataToCompress, Addr,
+                          MemIntr->getMemOperand());
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
+                              MemIntr->getMemOperand(),
+                              false /* truncating */, true /* compressing */);
+  }
+  case TRUNCATE_TO_MEM_VI8:
+  case TRUNCATE_TO_MEM_VI16:
+  case TRUNCATE_TO_MEM_VI32: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue DataToTruncate = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+    EVT MemVT  = MemIntr->getMemoryVT();
+
+    uint16_t TruncationOp = IntrData->Opc0;
+    switch (TruncationOp) {
+    case X86ISD::VTRUNC: {
+      if (isAllOnesConstant(Mask)) // return just a truncate store
+        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
+                                 MemIntr->getMemOperand());
+
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
+                                MemIntr->getMemOperand(), true /* truncating */);
+    }
+    case X86ISD::VTRUNCUS:
+    case X86ISD::VTRUNCS: {
+      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
+      if (isAllOnesConstant(Mask))
+        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
+                               MemIntr->getMemOperand(), DAG);
+
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
+      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
+                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
+    }
+    default:
+      llvm_unreachable("Unsupported truncstore intrinsic");
+    }
+  }
+
+  case EXPAND_FROM_MEM: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue PassThru = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+    MVT VT = Op.getSimpleValueType();
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
+      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
+    if (X86::isZeroNode(Mask))
+      return DAG.getUNDEF(VT);
+
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
+                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
+                             true /* expanding */);
+  }
+  }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI.setReturnAddressIsTaken(true);
+
+  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+    return SDValue();
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDLoc dl(Op);
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
+                       MachinePointerInfo());
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
+                     MachinePointerInfo());
+}
+
+SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
+  return getReturnAddressFrameIndex(DAG);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  EVT VT = Op.getValueType();
+
+  MFI.setFrameAddressIsTaken(true);
+
+  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
+    // is not possible to crawl up the stack without looking at the unwind codes
+    // simultaneously.
+    int FrameAddrIndex = FuncInfo->getFAIndex();
+    if (!FrameAddrIndex) {
+      // Set up a frame object for the return address.
+      unsigned SlotSize = RegInfo->getSlotSize();
+      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
+          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+      FuncInfo->setFAIndex(FrameAddrIndex);
+    }
+    return DAG.getFrameIndex(FrameAddrIndex, VT);
+  }
+
+  unsigned FrameReg =
+      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+  SDLoc dl(Op);  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
+          (FrameReg == X86::EBP && VT == MVT::i32)) &&
+         "Invalid Frame Register!");
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo());
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+                                              SelectionDAG &DAG) const {
+  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+  const MachineFunction &MF = DAG.getMachineFunction();
+
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("esp", X86::ESP)
+                       .Case("rsp", X86::RSP)
+                       .Case("ebp", X86::EBP)
+                       .Case("rbp", X86::RBP)
+                       .Default(0);
+
+  if (Reg == X86::EBP || Reg == X86::RBP) {
+    if (!TFI.hasFP(MF))
+      report_fatal_error("register " + StringRef(RegName) +
+                         " is allocatable: function has no frame pointer");
+#ifndef NDEBUG
+    else {
+      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+      unsigned FrameReg =
+          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
+             "Invalid Frame Register!");
+    }
+#endif
+  }
+
+  if (Reg)
+    return Reg;
+
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
+}
+
+unsigned X86TargetLowering::getExceptionPointerRegister(
+    const Constant *PersonalityFn) const {
+  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
+    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+
+  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
+}
+
+unsigned X86TargetLowering::getExceptionSelectorRegister(
+    const Constant *PersonalityFn) const {
+  // Funclet personalities don't use selectors (the runtime does the selection).
+  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
+  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
+}
+
+bool X86TargetLowering::needsFixedCatchObjects() const {
+  return Subtarget.isTargetWin64();
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl      (Op);
+
+  EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
+          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
+         "Invalid Frame Register!");
+  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
+  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+
+  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
+                                                       dl));
+  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
+  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
+                     DAG.getRegister(StoreAddrReg, PtrVT));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  // If the subtarget is not 64bit, we may need the global base reg
+  // after isel expand pseudo, i.e., after CGBR pass ran.
+  // Therefore, ask for the GlobalBaseReg now, so that the pass
+  // inserts the code for us in case we need it.
+  // Otherwise, we will end up in a situation where we will
+  // reference a virtual register that is not defined!
+  if (!Subtarget.is64Bit()) {
+    const X86InstrInfo *TII = Subtarget.getInstrInfo();
+    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
+  }
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                                       SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
+                     Op.getOperand(0));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+  return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Root = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl (Op);
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  if (Subtarget.is64Bit()) {
+    SDValue OutChains[6];
+
+    // Large code-model.
+    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+    // Load the pointer to the nested function into R11.
+    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+    SDValue Addr = Trmp;
+    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr));
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(2, dl, MVT::i64));
+    OutChains[1] =
+        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
+                     /* Alignment = */ 2);
+
+    // Load the 'nest' parameter value into R10.
+    // R10 is specified in X86CallingConv.td
+    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(10, dl, MVT::i64));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 10));
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(12, dl, MVT::i64));
+    OutChains[3] =
+        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
+                     /* Alignment = */ 2);
+
+    // Jump to the nested function.
+    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(20, dl, MVT::i64));
+    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 20));
+
+    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(22, dl, MVT::i64));
+    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
+                                Addr, MachinePointerInfo(TrmpAddr, 22));
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  } else {
+    const Function *Func =
+      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+    CallingConv::ID CC = Func->getCallingConv();
+    unsigned NestReg;
+
+    switch (CC) {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::X86_StdCall: {
+      // Pass 'nest' parameter in ECX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::ECX;
+
+      // Check that ECX wasn't needed by an 'inreg' parameter.
+      FunctionType *FTy = Func->getFunctionType();
+      const AttributeSet &Attrs = Func->getAttributes();
+
+      if (!Attrs.isEmpty() && !Func->isVarArg()) {
+        unsigned InRegCount = 0;
+        unsigned Idx = 1;
+
+        for (FunctionType::param_iterator I = FTy->param_begin(),
+             E = FTy->param_end(); I != E; ++I, ++Idx)
+          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
+            auto &DL = DAG.getDataLayout();
+            // FIXME: should only count parameters that are lowered to integers.
+            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
+          }
+
+        if (InRegCount > 2) {
+          report_fatal_error("Nest register in use - reduce number of inreg"
+                             " parameters!");
+        }
+      }
+      break;
+    }
+    case CallingConv::X86_FastCall:
+    case CallingConv::X86_ThisCall:
+    case CallingConv::Fast:
+      // Pass 'nest' parameter in EAX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::EAX;
+      break;
+    }
+
+    SDValue OutChains[4];
+    SDValue Addr, Disp;
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(10, dl, MVT::i32));
+    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
+    // This is storing the opcode for MOV32ri.
+    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+    OutChains[0] =
+        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
+                     Trmp, MachinePointerInfo(TrmpAddr));
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(1, dl, MVT::i32));
+    OutChains[1] =
+        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
+                     /* Alignment = */ 1);
+
+    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(5, dl, MVT::i32));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
+                                Addr, MachinePointerInfo(TrmpAddr, 5),
+                                /* Alignment = */ 1);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(6, dl, MVT::i32));
+    OutChains[3] =
+        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
+                     /* Alignment = */ 1);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+  }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  /*
+   The rounding mode is in bits 11:10 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to -inf
+     10 Round to +inf
+     11 Round to 0
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+
+  // Save FP Control Word to stack slot
+  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
+                              MachineMemOperand::MOStore, 2, 2);
+
+  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
+  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                          DAG.getVTList(MVT::Other),
+                                          Ops, MVT::i16, MMO);
+
+  // Load FP Control Word from stack slot
+  SDValue CWD =
+      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x800, DL, MVT::i16)),
+                DAG.getConstant(11, DL, MVT::i8));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+                DAG.getConstant(9, DL, MVT::i8));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::AND, DL, MVT::i16,
+                DAG.getNode(ISD::ADD, DL, MVT::i16,
+                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
+                            DAG.getConstant(1, DL, MVT::i16)),
+                DAG.getConstant(3, DL, MVT::i16));
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
+}
+
+/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
+//
+// 1. i32/i64 128/256-bit vector (native support require VLX) are expended
+//    to 512-bit vector.
+// 2. i8/i16 vector implemented using dword LZCNT vector instruction
+//    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+//    split the vector, perform operation on it's Lo a Hi part and
+//    concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getOpcode() == ISD::CTLZ);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (EltVT == MVT::i64 || EltVT == MVT::i32) {
+    // Extend to 512 bit vector.
+    assert((VT.is256BitVector() || VT.is128BitVector()) &&
+              "Unsupported value type for operation");
+
+    MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
+    SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
+                                 DAG.getUNDEF(NewVT),
+                                 Op.getOperand(0),
+                                 DAG.getIntPtrConstant(0, dl));
+    SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
+
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+          "Unsupported element type");
+
+  if (16 < NumElems) {
+    // Split vector, it's Lo and Hi parts will be handled in next iteration.
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+    MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
+    Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+  }
+
+  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+
+  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
+          "Unsupported value type for operation");
+
+  // Use native supported vector instruction vplzcntd.
+  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
+  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
+  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
+  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
+}
+
+// Lower CTLZ using a PSHUFB lookup table implementation.
+static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  int NumElts = VT.getVectorNumElements();
+  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
+  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
+
+  // Per-nibble leading zero PSHUFB lookup table.
+  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
+                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
+                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
+
+  SmallVector<SDValue, 64> LUTVec;
+  for (int i = 0; i < NumBytes; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
+
+  // Begin by bitcasting the input to byte vector, then split those bytes
+  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
+  // If the hi input nibble is zero then we add both results together, otherwise
+  // we just take the hi result (by masking the lo result to zero before the
+  // add).
+  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
+  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
+
+  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
+  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
+  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
+  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
+  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+
+  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
+  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
+  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
+  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
+
+  // Merge result back from vXi8 back to VT, working on the lo/hi halves
+  // of the current vector width in the same way we did for the nibbles.
+  // If the upper half of the input element is zero then add the halves'
+  // leading zero counts together, otherwise just use the upper half's.
+  // Double the width of the result until we are at target width.
+  while (CurrVT != VT) {
+    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
+    int CurrNumElts = CurrVT.getVectorNumElements();
+    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
+    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
+    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
+
+    // Check if the upper half of the input element is zero.
+    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+    HiZ = DAG.getBitcast(NextVT, HiZ);
+
+    // Move the upper/lower halves to the lower bits as we'll be extending to
+    // NextVT. Mask the lower result to zero if HiZ is true and add the results
+    // together.
+    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
+    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
+    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
+    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
+    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
+    CurrVT = NextVT;
+  }
+
+  return Res;
+}
+
+static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
+                               const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue Op0 = Op.getOperand(0);
+
+  if (Subtarget.hasAVX512())
+    return LowerVectorCTLZ_AVX512(Op, DAG);
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, perform ctlz and concat the result.
+    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
+                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
+  }
+
+  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
+  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  unsigned Opc = Op.getOpcode();
+
+  if (VT.isVector())
+    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  if (Opc == ISD::CTLZ) {
+    // If src is zero (i.e. bsr sets ZF), returns NumBits.
+    SDValue Ops[] = {
+      Op,
+      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+      DAG.getConstant(X86::COND_E, dl, MVT::i8),
+      Op.getValue(1)
+    };
+    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
+  }
+
+  // Finally xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+                   DAG.getConstant(NumBits - 1, dl, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumBits = VT.getScalarSizeInBits();
+  SDLoc dl(Op);
+
+  if (VT.isVector()) {
+    SDValue N0 = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+
+    // lsb(x) = (x & -x)
+    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
+                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
+
+    // cttz_undef(x) = (width - 1) - ctlz(lsb)
+    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
+      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
+      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
+                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
+    }
+
+    // cttz(x) = ctpop(lsb - 1)
+    SDValue One = DAG.getConstant(1, dl, VT);
+    return DAG.getNode(ISD::CTPOP, dl, VT,
+                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
+  }
+
+  assert(Op.getOpcode() == ISD::CTTZ &&
+         "Only scalar CTTZ requires custom lowering");
+
+  // Issue a bsf (scan bits forward) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
+
+  // If src is zero (i.e. bsf sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits, dl, VT),
+    DAG.getConstant(X86::COND_E, dl, MVT::i8),
+    Op.getValue(1)
+  };
+  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
+}
+
+/// Break a 256-bit integer operation into two new 128-bit ones and then
+/// concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is256BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+/// Break a 512-bit integer operation into two new 256-bit ones and then
+/// concatenate the result back.
+static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  assert(VT.is512BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  SDLoc dl(Op);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() == MVT::i1)
+    return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(0), Op.getOperand(1));
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() == MVT::i1)
+    return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+                       Op.getOperand(0), Op.getOperand(1));
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
+                        SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  if (VT == MVT::i1)
+    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntArith(Op, DAG);
+
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
+  // vector pairs, multiply and truncate.
+  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
+    if (Subtarget.hasInt256()) {
+      // For 512-bit vectors, split into 256-bit vectors to allow the
+      // sign-extension to occur.
+      if (VT == MVT::v64i8)
+        return Lower512IntArith(Op, DAG);
+
+      // For 256-bit vectors, split into 128-bit vectors to allow the
+      // sign-extension to occur. We don't need this on AVX512BW as we can
+      // safely sign-extend to v32i16.
+      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
+        return Lower256IntArith(Op, DAG);
+
+      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+      return DAG.getNode(
+          ISD::TRUNCATE, dl, VT,
+          DAG.getNode(ISD::MUL, dl, ExVT,
+                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
+                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+    }
+
+    assert(VT == MVT::v16i8 &&
+           "Pre-AVX2 support only supports v16i8 multiplication");
+    MVT ExVT = MVT::v8i16;
+
+    // Extract the lo parts and sign extend to i16
+    SDValue ALo, BLo;
+    if (Subtarget.hasSSE41()) {
+      ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
+      BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+    } else {
+      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+                              -1, 4, -1, 5, -1, 6, -1, 7};
+      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+      ALo = DAG.getBitcast(ExVT, ALo);
+      BLo = DAG.getBitcast(ExVT, BLo);
+      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+    }
+
+    // Extract the hi parts and sign extend to i16
+    SDValue AHi, BHi;
+    if (Subtarget.hasSSE41()) {
+      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
+                              -1, -1, -1, -1, -1, -1, -1, -1};
+      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+      AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
+      BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+    } else {
+      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
+                              -1, 12, -1, 13, -1, 14, -1, 15};
+      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+      AHi = DAG.getBitcast(ExVT, AHi);
+      BHi = DAG.getBitcast(ExVT, BHi);
+      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+    }
+
+    // Multiply, mask the lower 8bits of the lo/hi results and pack
+    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
+    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+  }
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    static const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getBitcast(VT, Evens);
+    Odds = DAG.getBitcast(VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    static const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+         "Only know how to lower V2I64/V4I64/V8I64 multiply");
+
+  // 32-bit vector types used for MULDQ/MULUDQ.
+  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+
+  // MULDQ returns the 64-bit result of the signed multiplication of the lower
+  // 32-bits. We can lower with this if the sign bits stretch that far.
+  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
+      DAG.ComputeNumSignBits(B) > 32) {
+    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
+                       DAG.getBitcast(MulVT, B));
+  }
+
+  //  Ahi = psrlqi(a, 32);
+  //  Bhi = psrlqi(b, 32);
+  //
+  //  AloBlo = pmuludq(a, b);
+  //  AloBhi = pmuludq(a, Bhi);
+  //  AhiBlo = pmuludq(Ahi, b);
+  //
+  //  Hi = psllqi(AloBhi + AhiBlo, 32);
+  //  return AloBlo + Hi;
+  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
+  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
+  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
+
+  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
+  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
+  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
+
+  // Bit cast to 32-bit vectors for MULUDQ.
+  SDValue Alo = DAG.getBitcast(MulVT, A);
+  SDValue Blo = DAG.getBitcast(MulVT, B);
+
+  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+
+  // Only multiply lo/hi halves that aren't known to be zero.
+  SDValue AloBlo = Zero;
+  if (!ALoIsZero && !BLoIsZero)
+    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
+
+  SDValue AloBhi = Zero;
+  if (!ALoIsZero && !BHiIsZero) {
+    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
+    Bhi = DAG.getBitcast(MulVT, Bhi);
+    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
+  }
+
+  SDValue AhiBlo = Zero;
+  if (!AHiIsZero && !BLoIsZero) {
+    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+    Ahi = DAG.getBitcast(MulVT, Ahi);
+    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
+  }
+
+  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
+  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
+
+  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
+}
+
+static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntArith(Op, DAG);
+
+  // Only i8 vectors should need custom lowering after this.
+  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+         "Unsupported vector type");
+
+  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
+  // logical shift down the upper half and pack back to i8.
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
+  // and then ashr/lshr the upper bits down to the lower bits before multiply.
+  unsigned Opcode = Op.getOpcode();
+  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
+  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+
+  // AVX2 implementations - extend xmm subvectors to ymm.
+  if (Subtarget.hasInt256()) {
+    SDValue Lo = DAG.getIntPtrConstant(0, dl);
+    SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+
+    if (VT == MVT::v32i8) {
+      SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
+      SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
+      SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
+      SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
+      ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
+      BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
+      AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
+      BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
+      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
+                       DAG.getConstant(8, dl, MVT::v16i16));
+      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
+                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
+                       DAG.getConstant(8, dl, MVT::v16i16));
+      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
+      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
+      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
+                            16, 17, 18, 19, 20, 21, 22, 23};
+      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
+                            24, 25, 26, 27, 28, 29, 30, 31};
+      return DAG.getNode(X86ISD::PACKUS, dl, VT,
+                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
+                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
+    }
+
+    SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
+    SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
+    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+                               DAG.getConstant(8, dl, MVT::v16i16));
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+  }
+
+  assert(VT == MVT::v16i8 &&
+         "Pre-AVX2 support only supports v16i8 multiplication");
+  MVT ExVT = MVT::v8i16;
+
+  // Extract the lo parts and zero/sign extend to i16.
+  SDValue ALo, BLo;
+  if (Subtarget.hasSSE41()) {
+    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
+    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+  } else {
+    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+                            -1, 4, -1, 5, -1, 6, -1, 7};
+    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+    ALo = DAG.getBitcast(ExVT, ALo);
+    BLo = DAG.getBitcast(ExVT, BLo);
+    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+  }
+
+  // Extract the hi parts and zero/sign extend to i16.
+  SDValue AHi, BHi;
+  if (Subtarget.hasSSE41()) {
+    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
+                            -1, -1, -1, -1, -1, -1, -1, -1};
+    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
+    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+  } else {
+    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
+                            -1, 12, -1, 13, -1, 14, -1, 15};
+    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+    AHi = DAG.getBitcast(ExVT, AHi);
+    BHi = DAG.getBitcast(ExVT, BHi);
+    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+  }
+
+  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
+  // pack back to v16i8.
+  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
+  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
+  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+}
+
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget.isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  RTLIB::Libcall LC;
+  bool isSigned;
+  switch (Op->getOpcode()) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
+  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
+  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
+  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
+  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
+  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+  }
+
+  SDLoc dl(Op);
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+    Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
+                           MachinePointerInfo(), /* Alignment = */ 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Ty = PointerType::get(ArgTy,0);
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy(DAG.getDataLayout()));
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC),
+               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+               Callee, std::move(Args))
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return DAG.getBitcast(VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+  MVT VT = Op0.getSimpleValueType();
+  SDLoc dl(Op);
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    unsigned Opcode = Op.getOpcode();
+    unsigned NumElems = VT.getVectorNumElements();
+    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
+    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
+    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
+    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
+    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
+    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
+    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
+    SDValue Ops[] = {
+      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
+      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
+    };
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+         (VT == MVT::v8i32 && Subtarget.hasInt256()));
+
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widen result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other word, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, with need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+                             makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+                             makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+
+  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+  // ints.
+  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+  unsigned Opcode =
+      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
+  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+
+  // Shuffle it back into the right order.
+  SDValue Highs, Lows;
+  if (VT == MVT::v8i32) {
+    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  } else {
+    const int HighMask[] = {1, 5, 3, 7};
+    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+    const int LowMask[] = {0, 4, 2, 6};
+    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+  }
+
+  // If we have a signed multiply but no PMULDQ fix up the high parts of a
+  // unsigned multiply.
+  if (IsSigned && !Subtarget.hasSSE41()) {
+    SDValue ShAmt = DAG.getConstant(
+        31, dl,
+        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
+    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
+  }
+
+  // The first result of MUL_LOHI is actually the low value, followed by the
+  // high value.
+  SDValue Ops[] = {Lows, Highs};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+// Return true if the required (according to Opcode) shift-imm form is natively
+// supported by the Subtarget
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
+                                        unsigned Opcode) {
+  if (VT.getScalarSizeInBits() < 16)
+    return false;
+
+  if (VT.is512BitVector() &&
+      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
+    return true;
+
+  bool LShift = VT.is128BitVector() ||
+    (VT.is256BitVector() && Subtarget.hasInt256());
+
+  bool AShift = LShift && (Subtarget.hasVLX() ||
+    (VT != MVT::v2i64 && VT != MVT::v4i64));
+  return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+// The shift amount is a variable, but it is the same for all vector lanes.
+// These instructions are defined together with shift-immediate.
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
+                                      unsigned Opcode) {
+  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+}
+
+// Return true if the required (according to Opcode) variable-shift form is
+// natively supported by the Subtarget
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
+                                    unsigned Opcode) {
+
+  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
+    return false;
+
+  // vXi16 supported only on AVX-512, BWI
+  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
+    return false;
+
+  if (VT.is512BitVector() || Subtarget.hasVLX())
+    return true;
+
+  bool LShift = VT.is128BitVector() || VT.is256BitVector();
+  bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
+  return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
+    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
+    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+    SDValue Ex = DAG.getBitcast(ExVT, R);
+
+    if (ShiftAmt >= 32) {
+      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
+      SDValue Upper =
+          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
+      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt - 32, DAG);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {9, 1, 11, 3, 13, 5, 15, 7});
+    } else {
+      // SRA upper i32, SHL whole i64 and select lower i32.
+      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
+                                                 ShiftAmt, DAG);
+      SDValue Lower =
+          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
+      Lower = DAG.getBitcast(ExVT, Lower);
+      if (VT == MVT::v2i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
+      if (VT == MVT::v4i64)
+        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
+                                  {8, 1, 10, 3, 12, 5, 14, 7});
+    }
+    return DAG.getBitcast(VT, Ex);
+  };
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
+      uint64_t ShiftAmt = ShiftConst->getZExtValue();
+
+      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+      // i64 SRA needs to be performed as partial shifts.
+      if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+          Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
+        return ArithmeticShiftRight64(ShiftAmt);
+
+      if (VT == MVT::v16i8 ||
+          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
+          VT == MVT::v64i8) {
+        unsigned NumElts = VT.getVectorNumElements();
+        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+
+        // Simple i8 add case
+        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+          return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+        // ashr(R, 7)  === cmp_slt(R, 0)
+        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+          if (VT.is512BitVector()) {
+            assert(VT == MVT::v64i8 && "Unexpected element type!");
+            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
+            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
+          }
+          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+        }
+
+        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+        if (VT == MVT::v16i8 && Subtarget.hasXOP())
+          return SDValue();
+
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
+          SHL = DAG.getBitcast(VT, SHL);
+          // Zero out the rightmost bits.
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+                                                   R, ShiftAmt, DAG);
+          SRL = DAG.getBitcast(VT, SRL);
+          // Zero out the leftmost bits.
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+
+          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
+      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
+       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+
+    // Peek through any splat that was introduced for i64 shift vectorization.
+    int SplatIndex = -1;
+    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
+      if (SVN->isSplat()) {
+        SplatIndex = SVN->getSplatIndex();
+        Amt = Amt.getOperand(0);
+        assert(SplatIndex < (int)VT.getVectorNumElements() &&
+               "Splat shuffle referencing second operand");
+      }
+
+    if (Amt.getOpcode() != ISD::BITCAST ||
+        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
+      return SDValue();
+
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+    uint64_t ShiftAmt = 0;
+    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
+    for (unsigned i = 0; i != Ratio; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
+      if (!C)
+        return SDValue();
+      // 6 == Log2(64)
+      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+    }
+
+    // Check remaining shift amounts (if not a splat).
+    if (SplatIndex < 0) {
+      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+        uint64_t ShAmt = 0;
+        for (unsigned j = 0; j != Ratio; ++j) {
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+          if (!C)
+            return SDValue();
+          // 6 == Log2(64)
+          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+        }
+        if (ShAmt != ShiftAmt)
+          return SDValue();
+      }
+    }
+
+    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+
+    if (Op.getOpcode() == ISD::SRA)
+      return ArithmeticShiftRight64(ShiftAmt);
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
+    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
+
+  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
+    SDValue BaseShAmt;
+    MVT EltVT = VT.getVectorElementType();
+
+    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+      // Check if this build_vector node is doing a splat.
+      // If so, then set BaseShAmt equal to the splat value.
+      BaseShAmt = BV->getSplatValue();
+      if (BaseShAmt && BaseShAmt.isUndef())
+        BaseShAmt = SDValue();
+    } else {
+      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+        Amt = Amt.getOperand(0);
+
+      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
+      if (SVN && SVN->isSplat()) {
+        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
+        SDValue InVec = Amt.getOperand(0);
+        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
+                 "Unexpected shuffle index found!");
+          BaseShAmt = InVec.getOperand(SplatIdx);
+        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+           if (ConstantSDNode *C =
+               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+             if (C->getZExtValue() == SplatIdx)
+               BaseShAmt = InVec.getOperand(1);
+           }
+        }
+
+        if (!BaseShAmt)
+          // Avoid introducing an extract element from a shuffle.
+          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
+                                  DAG.getIntPtrConstant(SplatIdx, dl));
+      }
+    }
+
+    if (BaseShAmt.getNode()) {
+      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
+      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
+      else if (EltVT.bitsLT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
+      Amt.getOpcode() == ISD::BITCAST &&
+      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    std::vector<SDValue> Vals(Ratio);
+    for (unsigned i = 0; i != Ratio; ++i)
+      Vals[i] = Amt.getOperand(i);
+    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+      for (unsigned j = 0; j != Ratio; ++j)
+        if (Vals[j] != Amt.getOperand(i + j))
+          return SDValue();
+    }
+
+    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
+      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
+  }
+  return SDValue();
+}
+
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
+
+  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
+    return V;
+
+  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
+    return V;
+
+  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
+    return Op;
+
+  // XOP has 128-bit variable logical/arithmetic shifts.
+  // +ve/-ve Amt = shift left/right.
+  if (Subtarget.hasXOP() &&
+      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+       VT == MVT::v8i16 || VT == MVT::v16i8)) {
+    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+    }
+    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+    if (Op.getOpcode() == ISD::SRA)
+      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+  }
+
+  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+  // shifts per-lane and then shuffle the partial results back together.
+  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+    // Splat the shift amounts so the scalar shifts above will catch it.
+    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+  }
+
+  // i64 vector arithmetic shift can be emulated with the transform:
+  // M = lshr(SIGN_BIT, Amt)
+  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
+  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
+      Op.getOpcode() == ISD::SRA) {
+    SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
+    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+    return R;
+  }
+
+  // If possible, lower this packed shift into a vector multiply instead of
+  // expanding it into a sequence of scalar shifts.
+  // Do this only if the vector shift count is a constant build_vector.
+  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
+    SmallVector<SDValue, 8> Elts;
+    MVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+    APInt One(SVTBits, 1);
+    unsigned NumElems = VT.getVectorNumElements();
+
+    for (unsigned i=0; i !=NumElems; ++i) {
+      SDValue Op = Amt->getOperand(i);
+      if (Op->isUndef()) {
+        Elts.push_back(Op);
+        continue;
+      }
+
+      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
+      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+      uint64_t ShAmt = C.getZExtValue();
+      if (ShAmt >= SVTBits) {
+        Elts.push_back(DAG.getUNDEF(SVT));
+        continue;
+      }
+      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
+    }
+    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
+    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
+  }
+
+  // Lower SHL with variable shift amount.
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
+
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
+                     DAG.getConstant(0x3f800000U, dl, VT));
+    Op = DAG.getBitcast(MVT::v4f32, Op);
+    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+  }
+
+  // If possible, lower this shift as a sequence of two shifts by
+  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
+  // Example:
+  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+  //
+  // Could be rewritten as:
+  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+  //
+  // The advantage is that the two shifts from the example would be
+  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+  // the vector shift into four scalar shifts plus four pairs of vector
+  // insert/extract.
+  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
+    unsigned TargetOpcode = X86ISD::MOVSS;
+    bool CanBeSimplified;
+    // The splat value for the first packed shift (the 'X' from the example).
+    SDValue Amt1 = Amt->getOperand(0);
+    // The splat value for the second packed shift (the 'Y' from the example).
+    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
+
+    // See if it is possible to replace this node with a sequence of
+    // two shifts followed by a MOVSS/MOVSD/PBLEND.
+    if (VT == MVT::v4i32) {
+      // Check if it is legal to use a MOVSS.
+      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+                        Amt2 == Amt->getOperand(3);
+      if (!CanBeSimplified) {
+        // Otherwise, check if we can still simplify this node using a MOVSD.
+        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+                          Amt->getOperand(2) == Amt->getOperand(3);
+        TargetOpcode = X86ISD::MOVSD;
+        Amt2 = Amt->getOperand(2);
+      }
+    } else {
+      // Do similar checks for the case where the machine value type
+      // is MVT::v8i16.
+      CanBeSimplified = Amt1 == Amt->getOperand(1);
+      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+        CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+      if (!CanBeSimplified) {
+        TargetOpcode = X86ISD::MOVSD;
+        CanBeSimplified = true;
+        Amt2 = Amt->getOperand(4);
+        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+          CanBeSimplified = Amt1 == Amt->getOperand(i);
+        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+          CanBeSimplified = Amt2 == Amt->getOperand(j);
+      }
+    }
+
+    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+        isa<ConstantSDNode>(Amt2)) {
+      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
+      MVT CastVT = MVT::v4i32;
+      SDValue Splat1 =
+          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
+      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+      SDValue Splat2 =
+          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
+      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
+      if (TargetOpcode == X86ISD::MOVSD)
+        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+                                                       BitCast2, {0, 1, 6, 7}));
+      return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+                                                     BitCast2, {0, 5, 6, 7}));
+    }
+  }
+
+  // v4i32 Non Uniform Shifts.
+  // If the shift amount is constant we can shift each lane using the SSE2
+  // immediate shifts, else we need to zero-extend each lane to the lower i64
+  // and shift using the SSE2 variable shifts.
+  // The separate results can then be blended together.
+  if (VT == MVT::v4i32) {
+    unsigned Opc = Op.getOpcode();
+    SDValue Amt0, Amt1, Amt2, Amt3;
+    if (ConstantAmt) {
+      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
+      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
+      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
+      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
+    } else {
+      // ISD::SHL is handled above but we include it here for completeness.
+      switch (Opc) {
+      default:
+        llvm_unreachable("Unknown target vector shift node");
+      case ISD::SHL:
+        Opc = X86ISD::VSHL;
+        break;
+      case ISD::SRL:
+        Opc = X86ISD::VSRL;
+        break;
+      case ISD::SRA:
+        Opc = X86ISD::VSRA;
+        break;
+      }
+      // The SSE2 shifts use the lower i64 as the same shift amount for
+      // all lanes and the upper i64 is ignored. These shuffle masks
+      // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
+      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
+      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
+      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
+      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
+    }
+
+    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
+    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
+    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
+    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
+    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
+    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
+  }
+
+  if (VT == MVT::v16i8 ||
+      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
+    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+    unsigned ShiftOpcode = Op->getOpcode();
+
+    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB which selects bytes based just on the sign bit.
+      if (Subtarget.hasSSE41()) {
+        V0 = DAG.getBitcast(VT, V0);
+        V1 = DAG.getBitcast(VT, V1);
+        Sel = DAG.getBitcast(VT, Sel);
+        return DAG.getBitcast(SelVT,
+                              DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+      }
+      // On pre-SSE41 targets we test for the sign bit by comparing to
+      // zero - a negative value will set all bits of the lanes to true
+      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
+      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
+      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
+      return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+    };
+
+    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
+    // We can safely do this using i16 shifts as we're only interested in
+    // the 3 lower bits of each byte.
+    Amt = DAG.getBitcast(ExtVT, Amt);
+    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
+    Amt = DAG.getBitcast(VT, Amt);
+
+    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
+      // r = VSELECT(r, shift(r, 4), a);
+      SDValue M =
+          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+
+      // a += a
+      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+      // r = VSELECT(r, shift(r, 2), a);
+      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+
+      // a += a
+      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+      // return VSELECT(r, shift(r, 1), a);
+      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+      R = SignBitSelect(VT, Amt, M, R);
+      return R;
+    }
+
+    if (Op->getOpcode() == ISD::SRA) {
+      // For SRA we need to unpack each byte to the higher byte of a i16 vector
+      // so we can correctly sign extend. We don't care what happens to the
+      // lower byte.
+      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
+      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
+      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
+      ALo = DAG.getBitcast(ExtVT, ALo);
+      AHi = DAG.getBitcast(ExtVT, AHi);
+      RLo = DAG.getBitcast(ExtVT, RLo);
+      RHi = DAG.getBitcast(ExtVT, RHi);
+
+      // r = VSELECT(r, shift(r, 4), a);
+      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                                DAG.getConstant(4, dl, ExtVT));
+      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                                DAG.getConstant(4, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // a += a
+      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+      // r = VSELECT(r, shift(r, 2), a);
+      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                        DAG.getConstant(2, dl, ExtVT));
+      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                        DAG.getConstant(2, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // a += a
+      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
+      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
+
+      // r = VSELECT(r, shift(r, 1), a);
+      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
+                        DAG.getConstant(1, dl, ExtVT));
+      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
+                        DAG.getConstant(1, dl, ExtVT));
+      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
+      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
+
+      // Logical shift the result back to the lower byte, leaving a zero upper
+      // byte
+      // meaning that we can safely pack with PACKUSWB.
+      RLo =
+          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
+      RHi =
+          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
+      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+    }
+  }
+
+  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+  // solution better.
+  if (Subtarget.hasInt256() && VT == MVT::v8i16) {
+    MVT ExtVT = MVT::v8i32;
+    unsigned ExtOpc =
+        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+  }
+
+  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
+    MVT ExtVT = MVT::v8i32;
+    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
+    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
+    ALo = DAG.getBitcast(ExtVT, ALo);
+    AHi = DAG.getBitcast(ExtVT, AHi);
+    RLo = DAG.getBitcast(ExtVT, RLo);
+    RHi = DAG.getBitcast(ExtVT, RHi);
+    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+  }
+
+  if (VT == MVT::v8i16) {
+    unsigned ShiftOpcode = Op->getOpcode();
+
+    // If we have a constant shift amount, the non-SSE41 path is best as
+    // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
+    bool UseSSE41 = Subtarget.hasSSE41() &&
+                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
+    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
+      // On SSE41 targets we make use of the fact that VSELECT lowers
+      // to PBLENDVB which selects bytes based just on the sign bit.
+      if (UseSSE41) {
+        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
+        V0 = DAG.getBitcast(ExtVT, V0);
+        V1 = DAG.getBitcast(ExtVT, V1);
+        Sel = DAG.getBitcast(ExtVT, Sel);
+        return DAG.getBitcast(
+            VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+      }
+      // On pre-SSE41 targets we splat the sign bit - a negative value will
+      // set all bits of the lanes to true and VSELECT uses that in
+      // its OR(AND(V0,C),AND(V1,~C)) lowering.
+      SDValue C =
+          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
+      return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+    };
+
+    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
+    if (UseSSE41) {
+      // On SSE41 targets we need to replicate the shift mask in both
+      // bytes for PBLENDVB.
+      Amt = DAG.getNode(
+          ISD::OR, dl, VT,
+          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
+          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
+    } else {
+      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
+    }
+
+    // r = VSELECT(r, shift(r, 8), a);
+    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // r = VSELECT(r, shift(r, 4), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // r = VSELECT(r, shift(r, 2), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+
+    // a += a
+    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
+
+    // return VSELECT(r, shift(r, 1), a);
+    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
+    R = SignBitSelect(Amt, M, R);
+    return R;
+  }
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.is256BitVector())
+    return Lower256IntArith(Op, DAG);
+
+  return SDValue();
+}
+
+static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  assert(VT.isVector() && "Custom lowering only for vector rotates!");
+  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
+  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+
+  // XOP has 128-bit vector variable + immediate rotates.
+  // +ve/-ve Amt = rotate left/right.
+
+  // Split 256-bit integers.
+  if (VT.is256BitVector())
+    return Lower256IntArith(Op, DAG);
+
+  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+
+  // Attempt to rotate by immediate.
+  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+                         DAG.getConstant(RotateAmt, DL, MVT::i8));
+    }
+  }
+
+  // Use general rotate by variable (per-element).
+  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+  // looks for this combo and may remove the "setcc" instruction if the "setcc"
+  // has only one use.
+  SDNode *N = Op.getNode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned BaseOp = 0;
+  X86::CondCode Cond;
+  SDLoc DL(Op);
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown ovf instruction!");
+  case ISD::SADDO:
+    // A subtract of one will be selected as a INC. Note that INC doesn't
+    // set CF, so we can't do this for UADDO.
+    if (isOneConstant(RHS)) {
+        BaseOp = X86ISD::INC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SSUBO:
+    // A subtract of one will be selected as a DEC. Note that DEC doesn't
+    // set CF, so we can't do this for USUBO.
+    if (isOneConstant(RHS)) {
+        BaseOp = X86ISD::DEC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SMULO:
+    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    if (N->getValueType(0) == MVT::i8) {
+      BaseOp = X86ISD::UMUL8;
+      Cond = X86::COND_O;
+      break;
+    }
+    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
+                                 MVT::i32);
+    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
+
+    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
+
+    if (N->getValueType(1) == MVT::i1)
+      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+  }
+  }
+
+  // Also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
+
+  if (N->getValueType(1) == MVT::i1)
+    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+/// Returns true if the operand type is exactly twice the native width, and
+/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
+/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
+/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
+bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
+  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+
+  if (OpWidth == 64)
+    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+  else if (OpWidth == 128)
+    return Subtarget.hasCmpxchg16b();
+  else
+    return false;
+}
+
+bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  return needsCmpXchgNb(SI->getValueOperand()->getType());
+}
+
+// Note: this turns large loads into lock cmpxchg8b/16b.
+// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
+                                               : AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  Type *MemType = AI->getType();
+
+  // If the operand is too big, we must see if cmpxchg8/16b is available
+  // and default to library calls otherwise.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
+                                   : AtomicExpansionKind::None;
+  }
+
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+  switch (Op) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Xchg:
+  case AtomicRMWInst::Add:
+  case AtomicRMWInst::Sub:
+    // It's better to use xadd, xsub or xchg for these in all cases.
+    return AtomicExpansionKind::None;
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Xor:
+    // If the atomicrmw's result isn't actually used, we can just add a "lock"
+    // prefix to a normal instruction for these operations.
+    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
+                            : AtomicExpansionKind::None;
+  case AtomicRMWInst::Nand:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+    // These always require a non-trivial set of data operations on x86. We must
+    // use a cmpxchg loop.
+    return AtomicExpansionKind::CmpXChg;
+  }
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  Type *MemType = AI->getType();
+  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+  // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces a mfence.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return nullptr;
+
+  auto Builder = IRBuilder<>(AI);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  auto SynchScope = AI->getSynchScope();
+  // We must restrict the ordering to avoid generating loads with Release or
+  // ReleaseAcquire orderings.
+  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Ptr = AI->getPointerOperand();
+
+  // Before the load we need a fence. Here is an example lifted from
+  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+  // is required:
+  // Thread 0:
+  //   x.store(1, relaxed);
+  //   r1 = y.fetch_add(0, release);
+  // Thread 1:
+  //   y.fetch_add(42, acquire);
+  //   r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. A mfence flushes the store buffer,
+  // making the optimization clearly correct.
+  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
+  // otherwise, we might be able to be more aggressive on relaxed idempotent
+  // rmw. In practice, they do not look useful, so we don't try to be
+  // especially clever.
+  if (SynchScope == SingleThread)
+    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+    // the IR level, so we must wrap it in an intrinsic.
+    return nullptr;
+
+  if (!Subtarget.hasMFence())
+    // FIXME: it might make sense to use a locked operation here but on a
+    // different cache-line to prevent cache-line bouncing. In practice it
+    // is probably a small win, and x86 processors without mfence are rare
+    // enough that we do not bother.
+    return nullptr;
+
+  Function *MFence =
+      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
+  Builder.CreateCall(MFence, {});
+
+  // Finally we can emit the atomic load.
+  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
+          AI->getType()->getPrimitiveSizeInBits());
+  Loaded->setAtomic(Order, SynchScope);
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+  return Loaded;
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
+                                 SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // The only fence that needs an instruction is a sequentially-consistent
+  // cross-thread fence.
+  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
+      FenceScope == CrossThread) {
+    if (Subtarget.hasMFence())
+      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+    SDValue Chain = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32),     // Base
+      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i32),            // Index
+      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),            // Segment.
+      Zero,
+      Chain
+    };
+    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
+    return SDValue(Res, 0);
+  }
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG) {
+  MVT T = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  unsigned Reg = 0;
+  unsigned size = 0;
+  switch(T.SimpleTy) {
+  default: llvm_unreachable("Invalid value type!");
+  case MVT::i8:  Reg = X86::AL;  size = 1; break;
+  case MVT::i16: Reg = X86::AX;  size = 2; break;
+  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  case MVT::i64:
+    assert(Subtarget.is64Bit() && "Node not type legal!");
+    Reg = X86::RAX; size = 8;
+    break;
+  }
+  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+                                  Op.getOperand(2), SDValue());
+  SDValue Ops[] = { cpIn.getValue(0),
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, DL, MVT::i8),
+                    cpIn.getValue(1) };
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+                                           Ops, T, MMO);
+
+  SDValue cpOut =
+    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+                                      MVT::i32, cpOut.getValue(2));
+  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
+
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+  return SDValue();
+}
+
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
+                            SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+      SrcVT == MVT::i64) {
+    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+    if (DstVT != MVT::f64)
+      // This conversion needs to be expanded.
+      return SDValue();
+
+    SDValue Op0 = Op->getOperand(0);
+    SmallVector<SDValue, 16> Elts;
+    SDLoc dl(Op);
+    unsigned NumElts;
+    MVT SVT;
+    if (SrcVT.isVector()) {
+      NumElts = SrcVT.getVectorNumElements();
+      SVT = SrcVT.getVectorElementType();
+
+      // Widen the vector in input in the case of MVT::v2i32.
+      // Example: from MVT::v2i32 to MVT::v4i32.
+      for (unsigned i = 0, e = NumElts; i != e; ++i)
+        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
+                                   DAG.getIntPtrConstant(i, dl)));
+    } else {
+      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
+             "Unexpected source type in LowerBITCAST");
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(0, dl)));
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
+                                 DAG.getIntPtrConstant(1, dl)));
+      NumElts = 2;
+      SVT = MVT::i32;
+    }
+    // Explicitly mark the extra elements as Undef.
+    Elts.append(NumElts, DAG.getUNDEF(SVT));
+
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
+    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
+  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
+         Subtarget.hasMMX() && "Unexpected custom BITCAST");
+  assert((DstVT == MVT::i64 ||
+          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+         "Unexpected custom BITCAST");
+  // i64 <=> MMX conversions are Legal.
+  if (SrcVT==MVT::i64 && DstVT.isVector())
+    return Op;
+  if (DstVT==MVT::i64 && SrcVT.isVector())
+    return Op;
+  // MMX <=> MMX conversions are Legal.
+  if (SrcVT.isVector() && DstVT.isVector())
+    return Op;
+  // All other conversions need to be expanded.
+  return SDValue();
+}
+
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+                                      const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(V);
+  MVT ByteVecVT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+         "Expected value to have byte element type.");
+  assert(EltVT != MVT::i8 &&
+         "Horizontal byte sum only makes sense for wider elements!");
+  unsigned VecSize = VT.getSizeInBits();
+  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // PSADBW instruction horizontally add all bytes and leave the result in i64
+  // chunks, thus directly computes the pop count for v2i64 and v4i64.
+  if (EltVT == MVT::i64) {
+    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
+    return DAG.getBitcast(VT, V);
+  }
+
+  if (EltVT == MVT::i32) {
+    // We unpack the low half and high half into i32s interleaved with zeros so
+    // that we can use PSADBW to horizontally sum them. The most useful part of
+    // this is that it lines up the results of two PSADBW instructions to be
+    // two v2i64 vectors which concatenated are the 4 population counts. We can
+    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue V32 = DAG.getBitcast(VT, V);
+    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
+    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
+
+    // Do the horizontal sums into two v2i64s.
+    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
+    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+                      DAG.getBitcast(ByteVecVT, Low), Zeros);
+    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
+                       DAG.getBitcast(ByteVecVT, High), Zeros);
+
+    // Merge them together.
+    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+                    DAG.getBitcast(ShortVecVT, Low),
+                    DAG.getBitcast(ShortVecVT, High));
+
+    return DAG.getBitcast(VT, V);
+  }
+
+  // The only element type left is i16.
+  assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+  // To obtain pop count for each i16 element starting from the pop count for
+  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+  // right by 8. It is important to shift as i16s as i8 vector shift isn't
+  // directly supported.
+  SDValue ShifterV = DAG.getConstant(8, DL, VT);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+                  DAG.getBitcast(ByteVecVT, V));
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
+                                        const X86Subtarget &Subtarget,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned VecSize = VT.getSizeInBits();
+
+  // Implement a lookup table in register by using an algorithm based on:
+  // http://wm.ite.pl/articles/sse-popcount.html
+  //
+  // The general idea is that every lower byte nibble in the input vector is an
+  // index into a in-register pre-computed pop count table. We then split up the
+  // input vector in two new ones: (1) a vector with only the shifted-right
+  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+  // masked out higher ones) for each byte. PSHUB is used separately with both
+  // to index the in-register table. Next, both are added and the result is a
+  // i8 vector where each element contains the pop count for input byte.
+  //
+  // To obtain the pop count for elements != i8, we follow up with the same
+  // approach and use additional tricks as described below.
+  //
+  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+  int NumByteElts = VecSize / 8;
+  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+  SDValue In = DAG.getBitcast(ByteVecVT, Op);
+  SmallVector<SDValue, 64> LUTVec;
+  for (int i = 0; i < NumByteElts; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
+  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+
+  // High nibbles
+  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
+  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+  // Low nibbles
+  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+  // The input vector is used as the shuffle mask that index elements into the
+  // LUT. After counting low and high nibbles, add the vector to obtain the
+  // final pop count per i8 element.
+  SDValue HighPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+  SDValue LowPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
+
+  if (EltVT == MVT::i8)
+    return PopCnt;
+
+  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitmath lowering supported.");
+
+  int VecSize = VT.getSizeInBits();
+  MVT EltVT = VT.getVectorElementType();
+  int Len = EltVT.getSizeInBits();
+
+  // This is the vectorized version of the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // with a minor tweak to use a series of adds + shifts instead of vector
+  // multiplications. Implemented for all integer vector types. We only use
+  // this when we don't have SSSE3 which allows a LUT-based lowering that is
+  // much faster, even faster than using native popcnt instructions.
+
+  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+    MVT VT = V.getSimpleValueType();
+    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
+    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
+  };
+  auto GetMask = [&](SDValue V, APInt Mask) {
+    MVT VT = V.getSimpleValueType();
+    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
+    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
+  };
+
+  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+  // x86, so set the SRL type to have elements at least i16 wide. This is
+  // correct because all of our SRLs are followed immediately by a mask anyways
+  // that handles any bits that sneak into the high bits of the byte elements.
+  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
+
+  SDValue V = Op;
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  SDValue Srl =
+      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+
+  // At this point, V contains the byte-wise population count, and we are
+  // merely doing a horizontal sum if necessary to get the wider element
+  // counts.
+  if (EltVT == MVT::i8)
+    return V;
+
+  return LowerHorizontalByteSum(
+      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+      DAG);
+}
+
+// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
+// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
+         "Unknown CTPOP type to handle");
+  SDLoc DL(Op.getNode());
+  SDValue Op0 = Op.getOperand(0);
+
+  if (!Subtarget.hasSSSE3()) {
+    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+  }
+
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, compute pop count and concat the result.
+    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
+  }
+
+  if (VT.is512BitVector() && !Subtarget.hasBWI()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 256-bit vector, compute pop count and concat the result.
+    SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
+  }
+
+  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().isVector() &&
+         "We only do custom lowering for vector population count.");
+  return LowerVectorCTPOP(Op, Subtarget, DAG);
+}
+
+static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // For scalars, its still beneficial to transfer to/from the SIMD unit to
+  // perform the BITREVERSE.
+  if (!VT.isVector()) {
+    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
+    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
+    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  MVT SVT = VT.getVectorElementType();
+  int NumElts = VT.getVectorNumElements();
+  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector()) {
+    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+
+    MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
+                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
+  }
+
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitreverse lowering supported.");
+
+  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
+  // perform the BSWAP in the shuffle.
+  // Its best to shuffle using the second operand as this will implicitly allow
+  // memory folding for multiple vectors.
+  SmallVector<SDValue, 16> MaskElts;
+  for (int i = 0; i != NumElts; ++i) {
+    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
+      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
+      int PermuteByte = SourceByte | (2 << 5);
+      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
+    }
+  }
+
+  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
+  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
+  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
+                    Res, Mask);
+  return DAG.getBitcast(VT, Res);
+}
+
+static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
+                               SelectionDAG &DAG) {
+  if (Subtarget.hasXOP())
+    return LowerBITREVERSE_XOP(Op, DAG);
+
+  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(VT.getScalarType() == MVT::i8 &&
+         "Only byte vector BITREVERSE supported");
+
+  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
+    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
+    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
+    Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
+    Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+  }
+
+  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
+  // two nibbles and a PSHUFB lookup to find the bitreverse of each
+  // 0-15 value (moved to the other nibble).
+  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
+  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
+  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
+
+  const int LoLUT[16] = {
+      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
+      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
+      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
+      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
+  const int HiLUT[16] = {
+      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
+      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
+      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
+      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
+
+  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
+    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
+  }
+
+  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
+  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
+  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
+  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
+  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+}
+
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+  unsigned NewOpc = 0;
+  switch (N->getOpcode()) {
+  case ISD::ATOMIC_LOAD_ADD:
+    NewOpc = X86ISD::LADD;
+    break;
+  case ISD::ATOMIC_LOAD_SUB:
+    NewOpc = X86ISD::LSUB;
+    break;
+  case ISD::ATOMIC_LOAD_OR:
+    NewOpc = X86ISD::LOR;
+    break;
+  case ISD::ATOMIC_LOAD_XOR:
+    NewOpc = X86ISD::LXOR;
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+    NewOpc = X86ISD::LAND;
+    break;
+  default:
+    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
+  }
+
+  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+  return DAG.getMemIntrinsicNode(
+      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
+      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
+      /*MemVT=*/N->getSimpleValueType(0), MMO);
+}
+
+/// Lower atomic_load_ops into LOCK-prefixed operations.
+static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  SDValue Chain = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  unsigned Opc = N->getOpcode();
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
+  // can only be lowered when the result is unused.  They should have already
+  // been transformed into a cmpxchg loop in AtomicExpand.
+  if (N->hasAnyUseOfValue(0)) {
+    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
+    // select LXADD if LOCK_SUB can't be selected.
+    if (Opc == ISD::ATOMIC_LOAD_SUB) {
+      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
+      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
+      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
+                           RHS, AN->getMemOperand());
+    }
+    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
+           "Used AtomicRMW ops other than Add should have been expanded!");
+    return N;
+  }
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+  // RAUW the chain, but don't worry about the result, as it's unused.
+  assert(!N->hasAnyUseOfValue(0));
+  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
+  return SDValue();
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert seq_cst store -> xchg
+  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+  // FIXME: On 32-bit, store -> fist or movq would be more efficient
+  //        (The only way to get a 16-byte store is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+  if (cast<AtomicSDNode>(Node)->getOrdering() ==
+          AtomicOrdering::SequentiallyConsistent ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Node->getOperand(2),
+                                 cast<AtomicSDNode>(Node)->getMemOperand());
+    return Swap.getValue(1);
+  }
+  // Other atomic stores have a simple pattern.
+  return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getNode()->getSimpleValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = X86ISD::ADD; break;
+  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
+  case ISD::SUBC: Opc = X86ISD::SUB; break;
+  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
+                            SelectionDAG &DAG) {
+  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
+
+  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values as { float, float } (in XMM0) or
+  // { double, double } (which is returned in XMM0, XMM1).
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  bool isF64 = ArgVT == MVT::f64;
+  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+  // the small struct {f32, f32} is returned in (eax, edx). For f64,
+  // the results are returned via SRet in memory.
+  const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Callee =
+      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
+
+  Type *RetTy = isF64
+    ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
+    : (Type*)VectorType::get(ArgTy, 4);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+  if (isF64)
+    // Returned in xmm0 and xmm1.
+    return CallResult.first;
+
+  // Returned in bits 0:31 and 32:64 xmm0.
+  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(0, dl));
+  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
+                               CallResult.first, DAG.getIntPtrConstant(1, dl));
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
+}
+
+/// Widen a vector input to a vector of NVT.  The
+/// input vector must have the same element type as NVT.
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+                            bool FillWithZeroes = false) {
+  // Check if InOp already has the right width.
+  MVT InVT = InOp.getSimpleValueType();
+  if (InVT == NVT)
+    return InOp;
+
+  if (InOp.isUndef())
+    return DAG.getUNDEF(NVT);
+
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+         "Unexpected request for vector widening");
+
+  EVT EltVT = NVT.getVectorElementType();
+
+  SDLoc dl(InOp);
+  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+      InOp.getNumOperands() == 2) {
+    SDValue N1 = InOp.getOperand(1);
+    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+        N1.isUndef()) {
+      InOp = InOp.getOperand(0);
+      InVT = InOp.getSimpleValueType();
+      InNumElts = InVT.getVectorNumElements();
+    }
+  }
+  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned i = 0; i < InNumElts; ++i)
+      Ops.push_back(InOp.getOperand(i));
+
+    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+      DAG.getUNDEF(EltVT);
+    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+      Ops.push_back(FillVal);
+    return DAG.getBuildVector(NVT, dl, Ops);
+  }
+  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+    DAG.getUNDEF(NVT);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+                     InOp, DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
+                             SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX512() &&
+         "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+  // X86 scatter kills mask register, so its type should be added to
+  // the list of return values.
+  // If the "scatter" has 2 return values, it is already handled.
+  if (Op.getNode()->getNumValues() == 2)
+    return Op;
+
+  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
+  SDValue Src = N->getValue();
+  MVT VT = Src.getSimpleValueType();
+  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
+  SDLoc dl(Op);
+
+  SDValue NewScatter;
+  SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Chain = N->getChain();
+  SDValue BasePtr = N->getBasePtr();
+  MVT MemVT = N->getMemoryVT().getSimpleVT();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+    // The v2i32 value was promoted to v2i64.
+    // Now we "redo" the type legalizer's work and widen the original
+    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+    // with a shuffle.
+    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+           "Unexpected memory type");
+    int ShuffleMask[] = {0, 2, -1, -1};
+    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+    // Now we have 4 elements instead of 2.
+    // Expand the index.
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+    Index = ExtendToType(Index, NewIndexVT, DAG);
+
+    // Expand the mask with zeroes
+    // Mask may be <2 x i64> or <2 x i1> at this moment
+    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+           "Unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+    VT = MVT::v4i32;
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+      !Index.getSimpleValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors. Or data or index should
+    // be 512 bit wide. If now the both index and data are 256-bit, but
+    // the vector contains 8 elements, we just sign-extend the index
+    if (IndexVT == MVT::v8i32)
+      // Just extend index
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+    else {
+      // The minimal number of elts in scatter is 8
+      NumElts = 8;
+      // Index
+      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+      // Use original index here, do not modify the index twice
+      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+      if (IndexVT.getScalarType() == MVT::i32)
+        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+      // Mask
+      // At this point we have promoted mask operand
+      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+      // Use the original mask here, do not modify the mask twice
+      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+      // The value that should be stored
+      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+      Src = ExtendToType(Src, NewVT, DAG);
+    }
+  }
+  // If the mask is "wide" at this point - truncate it to i1 vector
+  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+  // The mask is killed by scatter, add it to the values
+  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+  return SDValue(NewScatter.getNode(), 1);
+}
+
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
+         "Expanding masked load is supported on AVX-512 target only!");
+
+  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
+         "Expanding masked load is supported for 32 and 64-bit types only!");
+
+  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
+  // VLX. These types for exp-loads are handled here.
+  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
+    return Op;
+
+  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+         "Cannot lower masked load op.");
+
+  assert((ScalarVT.getSizeInBits() >= 32 ||
+          (Subtarget.hasBWI() &&
+              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+         "Unsupported masked load op.");
+
+  // This operation is legal for targets with VLX, but without
+  // VLX the vector should be widened to 512 bit
+  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
+  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+  SDValue Src0 = N->getSrc0();
+  Src0 = ExtendToType(Src0, WideDataVT, DAG);
+
+  // Mask element has to be i1.
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
+  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
+  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+                                      N->getBasePtr(), Mask, Src0,
+                                      N->getMemoryVT(), N->getMemOperand(),
+                                      N->getExtensionType(),
+                                      N->isExpandingLoad());
+
+  SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                               NewLoad.getValue(0),
+                               DAG.getIntPtrConstant(0, dl));
+  SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+  return DAG.getMergeValues(RetOps, dl);
+}
+
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) {
+  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+  SDValue DataToStore = N->getValue();
+  MVT VT = DataToStore.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
+         "Expanding masked load is supported on AVX-512 target only!");
+
+  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
+         "Expanding masked load is supported for 32 and 64-bit types only!");
+
+  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
+  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
+    return Op;
+
+  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
+         "Cannot lower masked store op.");
+
+  assert((ScalarVT.getSizeInBits() >= 32 ||
+          (Subtarget.hasBWI() &&
+              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+          "Unsupported masked store op.");
+
+  // This operation is legal for targets with VLX, but without
+  // VLX the vector should be widened to 512 bit
+  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
+
+  // Mask element has to be i1.
+  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
+  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
+         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+
+  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
+
+  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+  if (MaskEltTy != MVT::i1)
+    Mask = DAG.getNode(ISD::TRUNCATE, dl,
+                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
+  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+                            Mask, N->getMemoryVT(), N->getMemOperand(),
+                            N->isTruncatingStore(), N->isCompressingStore());
+}
+
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
+                            SelectionDAG &DAG) {
+  assert(Subtarget.hasAVX512() &&
+         "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Src0 = N->getValue();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+
+  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+      !Index.getSimpleValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors. Or data or index should
+    // be 512 bit wide. If now the both index and data are 256-bit, but
+    // the vector contains 8 elements, we just sign-extend the index
+    if (NumElts == 8) {
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
+                        N->getOperand(3), Index };
+      DAG.UpdateNodeOperands(N, Ops);
+      return Op;
+    }
+
+    // Minimal number of elements in Gather
+    NumElts = 8;
+    // Index
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+    Index = ExtendToType(Index, NewIndexVT, DAG);
+    if (IndexVT.getScalarType() == MVT::i32)
+      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+    // Mask
+    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+    // At this point we have promoted mask operand
+    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+    // The pass-thru value
+    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+    Src0 = ExtendToType(Src0, NewVT, DAG);
+
+    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+                                            N->getMemoryVT(), dl, Ops,
+                                            N->getMemOperand());
+    SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                 NewGather.getValue(0),
+                                 DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Exract, NewGather.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+  return Op;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  // TODO: Eventually, the lowering of these nodes should be informed by or
+  // deferred to the GC strategy for the function in which they appear. For
+  // now, however, they must be lowered to something. Since they are logically
+  // no-ops in the case of a null GC strategy (or a GC strategy which does not
+  // require special handling for these nodes), lower them as literal NOOPs for
+  // the time being.
+  SmallVector<SDValue, 2> Ops;
+
+  Ops.push_back(Op.getOperand(0));
+  if (Op->getGluedNode())
+    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+  SDLoc OpDL(Op);
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+
+  return NOOP;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  // TODO: Eventually, the lowering of these nodes should be informed by or
+  // deferred to the GC strategy for the function in which they appear. For
+  // now, however, they must be lowered to something. Since they are logically
+  // no-ops in the case of a null GC strategy (or a GC strategy which does not
+  // require special handling for these nodes), lower them as literal NOOPs for
+  // the time being.
+  SmallVector<SDValue, 2> Ops;
+
+  Ops.push_back(Op.getOperand(0));
+  if (Op->getGluedNode())
+    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+  SDLoc OpDL(Op);
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+
+  return NOOP;
+}
+
+/// Provide custom lowering hooks for some operations.
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+    return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
+  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, Subtarget, DAG);
+  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
+  case ISD::FABS:
+  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::EH_SJLJ_SETUP_DISPATCH:
+    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
+  case ISD::MULHS:
+  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
+  case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:              return LowerXALUO(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ADD:                return LowerADD(Op, DAG);
+  case ISD::SUB:                return LowerSUB(Op, DAG);
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
+  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
+  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
+  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
+  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
+  case ISD::GC_TRANSITION_START:
+                                return LowerGC_TRANSITION_START(Op, DAG);
+  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
+  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
+  }
+}
+
+/// Places new result values for the node in Results (their number
+/// and types must exactly match those of the original return values of
+/// the node), or leaves Results empty, which indicates that the node is not
+/// to be custom lowered after all.
+void X86TargetLowering::LowerOperationWrapper(SDNode *N,
+                                              SmallVectorImpl<SDValue> &Results,
+                                              SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  if (!Res.getNode())
+    return;
+
+  assert((N->getNumValues() <= Res->getNumValues()) &&
+      "Lowering returned the wrong number of results!");
+
+  // Places new result values base on N result number.
+  // In some cases (LowerSINT_TO_FP for example) Res has more result values
+  // than original node, chain should be dropped(last value).
+  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+/// Replace a node with an illegal result type with a new node built out of
+/// custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::AVG: {
+    // Legalize types for X86ISD::AVG by expanding vectors.
+    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+
+    auto InVT = N->getValueType(0);
+    auto InVTSize = InVT.getSizeInBits();
+    const unsigned RegSize =
+        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((Subtarget.hasBWI() || RegSize < 512) &&
+           "512-bit vector requires AVX512BW");
+    assert((Subtarget.hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+    auto ElemVT = InVT.getVectorElementType();
+    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                  RegSize / ElemVT.getSizeInBits());
+    assert(RegSize % InVT.getSizeInBits() == 0);
+    unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+    Ops[0] = N->getOperand(0);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    Ops[0] = N->getOperand(1);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+                                  DAG.getIntPtrConstant(0, dl)));
+    return;
+  }
+  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+  case X86ISD::FMINC:
+  case X86ISD::FMIN:
+  case X86ISD::FMAXC:
+  case X86ISD::FMAX: {
+    EVT VT = N->getValueType(0);
+    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
+    SDValue UNDEF = DAG.getUNDEF(VT);
+    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                              N->getOperand(0), UNDEF);
+    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                              N->getOperand(1), UNDEF);
+    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+    return;
+  }
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    Results.push_back(V);
+    return;
+  }
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: {
+    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+    if (N->getValueType(0) == MVT::v2i32) {
+      assert((IsSigned || Subtarget.hasAVX512()) &&
+             "Can only handle signed conversion without AVX512");
+      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+      SDValue Src = N->getOperand(0);
+      if (Src.getValueType() == MVT::v2f64) {
+        SDValue Idx = DAG.getIntPtrConstant(0, dl);
+        SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
+                                           : X86ISD::CVTTP2UI,
+                                  dl, MVT::v4i32, Src);
+        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+        Results.push_back(Res);
+        return;
+      }
+      if (Src.getValueType() == MVT::v2f32) {
+        SDValue Idx = DAG.getIntPtrConstant(0, dl);
+        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+                                  DAG.getUNDEF(MVT::v2f32));
+        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
+                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
+        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+        Results.push_back(Res);
+        return;
+      }
+
+      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
+      // so early out here.
+      return;
+    }
+
+    std::pair<SDValue,SDValue> Vals =
+        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+    SDValue FIST = Vals.first, StackSlot = Vals.second;
+    if (FIST.getNode()) {
+      EVT VT = N->getValueType(0);
+      // Return a load from the stack slot.
+      if (StackSlot.getNode())
+        Results.push_back(
+            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
+      else
+        Results.push_back(FIST);
+    }
+    return;
+  }
+  case ISD::SINT_TO_FP: {
+    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
+    SDValue Src = N->getOperand(0);
+    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
+      return;
+    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
+    return;
+  }
+  case ISD::UINT_TO_FP: {
+    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::v2f32)
+      return;
+    SDValue Src = N->getOperand(0);
+    EVT SrcVT = Src.getValueType();
+    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
+      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+      return;
+    }
+    if (SrcVT != MVT::v2i32)
+      return;
+    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
+    SDValue VBias =
+        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+                             DAG.getBitcast(MVT::v2i64, VBias));
+    Or = DAG.getBitcast(MVT::v2f64, Or);
+    // TODO: Are there any fast-math-flags to propagate here?
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+    return;
+  }
+  case ISD::FP_ROUND: {
+    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+        return;
+    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+    Results.push_back(V);
+    return;
+  }
+  case ISD::FP_EXTEND: {
+    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
+    // No other ValueType for FP_EXTEND should reach this point.
+    assert(N->getValueType(0) == MVT::v2f32 &&
+           "Do not know how to legalize this Node");
+    return;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default : llvm_unreachable("Do not know how to custom type "
+                               "legalize this intrinsic operation!");
+    case Intrinsic::x86_rdtsc:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                     Results);
+    case Intrinsic::x86_rdtscp:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+                                     Results);
+    case Intrinsic::x86_rdpmc:
+      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
+
+    case Intrinsic::x86_xgetbv:
+      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
+    }
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+      Results.push_back(V);
+    return;
+  }
+  case ISD::READCYCLECOUNTER: {
+    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                   Results);
+  }
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+    EVT T = N->getValueType(0);
+    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+    bool Regs64bit = T == MVT::i128;
+    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+    SDValue cpInL, cpInH;
+    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(0, dl, HalfT));
+    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(1, dl, HalfT));
+    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+                             Regs64bit ? X86::RAX : X86::EAX,
+                             cpInL, SDValue());
+    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+                             Regs64bit ? X86::RDX : X86::EDX,
+                             cpInH, cpInL.getValue(1));
+    SDValue swapInL, swapInH;
+    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(0, dl, HalfT));
+    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(1, dl, HalfT));
+    swapInH =
+        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
+                         swapInH, cpInH.getValue(1));
+    // If the current function needs the base pointer, RBX,
+    // we shouldn't use cmpxchg directly.
+    // Indeed the lowering of that instruction will clobber
+    // that register and since RBX will be a reserved register
+    // the register allocator will not make sure its value will
+    // be properly saved and restored around this live-range.
+    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+    SDValue Result;
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    unsigned BasePtr = TRI->getBaseRegister();
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+      // ISel prefers the LCMPXCHG64 variant.
+      // If that assert breaks, that means it is not the case anymore,
+      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
+      // not just EBX. This is a matter of accepting i64 input for that
+      // pseudo, and restoring into the register of the right wide
+      // in expand pseudo. Everything else should just work.
+      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
+             "Saving only half of the RBX");
+      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
+                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
+      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
+                                           Regs64bit ? X86::RBX : X86::EBX,
+                                           HalfT, swapInH.getValue(1));
+      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
+                       RBXSave,
+                       /*Glue*/ RBXSave.getValue(2)};
+      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+    } else {
+      unsigned Opcode =
+          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
+      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
+                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
+                                 swapInH.getValue(1));
+      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
+                       swapInL.getValue(1)};
+      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
+    }
+    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+                                        Regs64bit ? X86::RAX : X86::EAX,
+                                        HalfT, Result.getValue(1));
+    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+                                        Regs64bit ? X86::RDX : X86::EDX,
+                                        HalfT, cpOutL.getValue(2));
+    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+                                        MVT::i32, cpOutH.getValue(2));
+    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
+    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
+    Results.push_back(Success);
+    Results.push_back(EFLAGS.getValue(1));
+    return;
+  }
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD: {
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by AtomicExpandPass.cpp.
+    break;
+  }
+  case ISD::BITCAST: {
+    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+    EVT DstVT = N->getValueType(0);
+    EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+    if (SrcVT != MVT::f64 ||
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+      return;
+
+    unsigned NumElts = DstVT.getVectorNumElements();
+    EVT SVT = DstVT.getVectorElementType();
+    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   MVT::v2f64, N->getOperand(0));
+    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
+
+    if (ExperimentalVectorWideningLegalization) {
+      // If we are legalizing vectors by widening, we already have the desired
+      // legal vector type, just return it.
+      Results.push_back(ToVecInt);
+      return;
+    }
+
+    SmallVector<SDValue, 8> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+                                   ToVecInt, DAG.getIntPtrConstant(i, dl)));
+
+    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
+  }
+  }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch ((X86ISD::NodeType)Opcode) {
+  case X86ISD::FIRST_NUMBER:       break;
+  case X86ISD::BSF:                return "X86ISD::BSF";
+  case X86ISD::BSR:                return "X86ISD::BSR";
+  case X86ISD::SHLD:               return "X86ISD::SHLD";
+  case X86ISD::SHRD:               return "X86ISD::SHRD";
+  case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FANDN:              return "X86ISD::FANDN";
+  case X86ISD::FOR:                return "X86ISD::FOR";
+  case X86ISD::FXOR:               return "X86ISD::FXOR";
+  case X86ISD::FILD:               return "X86ISD::FILD";
+  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
+  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+  case X86ISD::FLD:                return "X86ISD::FLD";
+  case X86ISD::FST:                return "X86ISD::FST";
+  case X86ISD::CALL:               return "X86ISD::CALL";
+  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
+  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
+  case X86ISD::BT:                 return "X86ISD::BT";
+  case X86ISD::CMP:                return "X86ISD::CMP";
+  case X86ISD::COMI:               return "X86ISD::COMI";
+  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
+  case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
+  case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
+  case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
+  case X86ISD::FSETCCM_RND:        return "X86ISD::FSETCCM_RND";
+  case X86ISD::CMOV:               return "X86ISD::CMOV";
+  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
+  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+  case X86ISD::IRET:               return "X86ISD::IRET";
+  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
+  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
+  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
+  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
+  case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
+  case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
+  case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
+  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
+  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
+  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
+  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
+  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
+  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
+  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
+  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
+  case X86ISD::ADDUS:              return "X86ISD::ADDUS";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
+  case X86ISD::HADD:               return "X86ISD::HADD";
+  case X86ISD::HSUB:               return "X86ISD::HSUB";
+  case X86ISD::FHADD:              return "X86ISD::FHADD";
+  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
+  case X86ISD::ABS:                return "X86ISD::ABS";
+  case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
+  case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
+  case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRSQRTS:             return "X86ISD::FRSQRTS";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
+  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
+  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
+  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
+  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
+  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
+  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
+  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
+    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
+  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
+  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
+  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
+  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
+  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
+    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
+  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
+    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
+  case X86ISD::LADD:               return "X86ISD::LADD";
+  case X86ISD::LSUB:               return "X86ISD::LSUB";
+  case X86ISD::LOR:                return "X86ISD::LOR";
+  case X86ISD::LXOR:               return "X86ISD::LXOR";
+  case X86ISD::LAND:               return "X86ISD::LAND";
+  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
+  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
+  case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
+  case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
+  case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
+  case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
+  case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
+  case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
+  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
+  case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
+  case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
+  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
+  case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
+  case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
+  case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
+  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
+  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
+  case X86ISD::VSRA:               return "X86ISD::VSRA";
+  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
+  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
+  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+  case X86ISD::VSRAV:              return "X86ISD::VSRAV";
+  case X86ISD::VROTLI:             return "X86ISD::VROTLI";
+  case X86ISD::VROTRI:             return "X86ISD::VROTRI";
+  case X86ISD::VPPERM:             return "X86ISD::VPPERM";
+  case X86ISD::CMPP:               return "X86ISD::CMPP";
+  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
+  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
+  case X86ISD::ADD:                return "X86ISD::ADD";
+  case X86ISD::SUB:                return "X86ISD::SUB";
+  case X86ISD::ADC:                return "X86ISD::ADC";
+  case X86ISD::SBB:                return "X86ISD::SBB";
+  case X86ISD::SMUL:               return "X86ISD::SMUL";
+  case X86ISD::UMUL:               return "X86ISD::UMUL";
+  case X86ISD::SMUL8:              return "X86ISD::SMUL8";
+  case X86ISD::UMUL8:              return "X86ISD::UMUL8";
+  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
+  case X86ISD::INC:                return "X86ISD::INC";
+  case X86ISD::DEC:                return "X86ISD::DEC";
+  case X86ISD::OR:                 return "X86ISD::OR";
+  case X86ISD::XOR:                return "X86ISD::XOR";
+  case X86ISD::AND:                return "X86ISD::AND";
+  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
+  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
+  case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
+  case X86ISD::PTEST:              return "X86ISD::PTEST";
+  case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::KTEST:              return "X86ISD::KTEST";
+  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
+  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
+  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
+  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
+  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
+  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
+  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
+  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
+  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
+  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
+  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
+  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
+  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
+  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
+  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
+  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
+  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
+  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
+  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
+  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
+  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
+  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
+  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
+  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
+  case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
+  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
+  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
+  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+  case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
+  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
+  case X86ISD::VFIXUPIMMS:          return "X86ISD::VFIXUPIMMS";
+  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
+  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
+  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
+  case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
+  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
+  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
+  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
+  case X86ISD::MFENCE:             return "X86ISD::MFENCE";
+  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
+  case X86ISD::SAHF:               return "X86ISD::SAHF";
+  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
+  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
+  case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
+  case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
+  case X86ISD::VPROT:              return "X86ISD::VPROT";
+  case X86ISD::VPROTI:             return "X86ISD::VPROTI";
+  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
+  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
+  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
+  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
+  case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
+  case X86ISD::FMADD:              return "X86ISD::FMADD";
+  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
+  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
+  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
+  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
+  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
+  case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
+  case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
+  case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
+  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
+  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
+  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
+  case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
+  case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
+  case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
+  case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
+  case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
+  case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
+  case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
+  case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
+  case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
+  case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
+  case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
+  case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
+  case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
+  case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
+  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
+  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
+  case X86ISD::XTEST:              return "X86ISD::XTEST";
+  case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
+  case X86ISD::EXPAND:             return "X86ISD::EXPAND";
+  case X86ISD::SELECT:             return "X86ISD::SELECT";
+  case X86ISD::SELECTS:            return "X86ISD::SELECTS";
+  case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
+  case X86ISD::RCP28:              return "X86ISD::RCP28";
+  case X86ISD::RCP28S:             return "X86ISD::RCP28S";
+  case X86ISD::EXP2:               return "X86ISD::EXP2";
+  case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
+  case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
+  case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
+  case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
+  case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
+  case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
+  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
+  case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
+  case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
+  case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
+  case X86ISD::SCALEF:             return "X86ISD::SCALEF";
+  case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
+  case X86ISD::ADDS:               return "X86ISD::ADDS";
+  case X86ISD::SUBS:               return "X86ISD::SUBS";
+  case X86ISD::AVG:                return "X86ISD::AVG";
+  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
+  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
+  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
+  case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
+  case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
+  case X86ISD::CVTTP2SI_RND:       return "X86ISD::CVTTP2SI_RND";
+  case X86ISD::CVTTP2UI_RND:       return "X86ISD::CVTTP2UI_RND";
+  case X86ISD::CVTTS2SI_RND:       return "X86ISD::CVTTS2SI_RND";
+  case X86ISD::CVTTS2UI_RND:       return "X86ISD::CVTTS2UI_RND";
+  case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
+  case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
+  case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
+  case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
+  case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
+  case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
+  case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
+  case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
+  case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
+  case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
+  case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
+  case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
+  case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
+  case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
+  case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
+  }
+  return nullptr;
+}
+
+/// Return true if the addressing mode represented by AM is legal for this
+/// target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                              const AddrMode &AM, Type *Ty,
+                                              unsigned AS) const {
+  // X86 supports extremely general addressing modes.
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
+    return false;
+
+  if (AM.BaseGV) {
+    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
+
+    // If a reference to this global requires an extra load, we can't fold it.
+    if (isGlobalStubReference(GVFlags))
+      return false;
+
+    // If BaseGV requires a register for the PIC base, we cannot also have a
+    // BaseReg specified.
+    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+      return false;
+
+    // If lower 4G is not available, then we must use rip-relative addressing.
+    if ((M != CodeModel::Small || isPositionIndependent()) &&
+        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+      return false;
+  }
+
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+
+  return true;
+}
+
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  unsigned Bits = Ty->getScalarSizeInBits();
+
+  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
+  // particularly cheaper than those without.
+  if (Bits == 8)
+    return false;
+
+  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
+  // variable shifts just as cheap as scalar ones.
+  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
+    return false;
+
+  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
+  // fully general vector.
+  return true;
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Can also use sub to handle negated immediates.
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2))
+    return true;
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // X86 has 8, 16, and 32-bit zero-extending loads.
+    return true;
+  }
+
+  return false;
+}
+
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!Subtarget.hasAnyFMA())
+    return false;
+
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// Targets can use this to indicate that they only support *some*
+/// VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  // Not for i1 vectors
+  if (VT.getSimpleVT().getScalarType() == MVT::i1)
+    return false;
+
+  // Very little shuffling can be done for 64-bit vectors right now.
+  if (VT.getSimpleVT().getSizeInBits() == 64)
+    return false;
+
+  // We only care that the types being shuffled are legal. The lowering can
+  // handle any possible shuffle mask that results.
+  return isTypeLegal(VT.getSimpleVT());
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                          EVT VT) const {
+  // Just delegate to the generic legality, clear masks aren't special.
+  return isShuffleMaskLegal(Mask, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
+                                     const TargetInstrInfo *TII) {
+  DebugLoc DL = MI.getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = ++MBB->getIterator();
+
+  // For the v = xbegin(), we generate
+  //
+  // thisMBB:
+  //  xbegin sinkMBB
+  //
+  // mainMBB:
+  //  eax = -1
+  //
+  // sinkMBB:
+  //  v = eax
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  //  xbegin sinkMBB
+  //  # fallthrough to mainMBB
+  //  # abortion to sinkMBB
+  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(sinkMBB);
+
+  // mainMBB:
+  //  EAX = -1
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  // EAX is live into the sinkMBB
+  sinkMBB->addLiveIn(X86::EAX);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+      .addReg(X86::EAX);
+
+  MI.eraseFromParent();
+  return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX all of this code can be replaced with that
+// in the .td file.
+static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
+  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
+  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
+  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
+  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+  }
+
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI.getNumOperands();
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI.hasOneMemOperand())
+    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+      .addReg(X86::XMM0);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
+  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
+  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
+  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
+  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
+  }
+
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI.getNumOperands(); // remove the results
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI.hasOneMemOperand())
+    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+      .addReg(X86::ECX);
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+                                     const X86Subtarget &Subtarget) {
+  DebugLoc dl = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  // insert input VAL into EAX
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+      .addReg(MI.getOperand(0).getReg());
+  // insert zero to ECX
+  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
+  // insert zero to EDX
+  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
+
+  // insert WRPKRU instruction
+  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
+                                     const X86Subtarget &Subtarget) {
+  DebugLoc dl = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  // insert zero to ECX
+  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
+
+  // insert RDPKRU instruction
+  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+      .addReg(X86::EAX);
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Opc) {
+  DebugLoc dl = MI.getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  // Address into RAX/EAX, other two args into ECX, EDX.
+  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI.getOperand(i));
+
+  unsigned ValOps = X86::AddrNumOperands;
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+      .addReg(MI.getOperand(ValOps).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+      .addReg(MI.getOperand(ValOps + 1).getReg());
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
+                                                 MachineBasicBlock *MBB) const {
+  // Emit va_arg instruction on X86-64.
+
+  // Operands to this pseudo-instruction:
+  // 0  ) Output        : destination address (reg)
+  // 1-5) Input         : va_list address (addr, i64mem)
+  // 6  ) ArgSize       : Size (in bytes) of vararg type
+  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+  // 8  ) Align         : Alignment of type
+  // 9  ) EFLAGS (implicit-def)
+
+  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+  static_assert(X86::AddrNumOperands == 5,
+                "VAARG_64 assumes 5 address operands");
+
+  unsigned DestReg = MI.getOperand(0).getReg();
+  MachineOperand &Base = MI.getOperand(1);
+  MachineOperand &Scale = MI.getOperand(2);
+  MachineOperand &Index = MI.getOperand(3);
+  MachineOperand &Disp = MI.getOperand(4);
+  MachineOperand &Segment = MI.getOperand(5);
+  unsigned ArgSize = MI.getOperand(6).getImm();
+  unsigned ArgMode = MI.getOperand(7).getImm();
+  unsigned Align = MI.getOperand(8).getImm();
+
+  // Memory Reference
+  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+  // Machine Information
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+  DebugLoc DL = MI.getDebugLoc();
+
+  // struct va_list {
+  //   i32   gp_offset
+  //   i32   fp_offset
+  //   i64   overflow_area (address)
+  //   i64   reg_save_area (address)
+  // }
+  // sizeof(va_list) = 24
+  // alignment(va_list) = 8
+
+  unsigned TotalNumIntRegs = 6;
+  unsigned TotalNumXMMRegs = 8;
+  bool UseGPOffset = (ArgMode == 1);
+  bool UseFPOffset = (ArgMode == 2);
+  unsigned MaxOffset = TotalNumIntRegs * 8 +
+                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+  /* Align ArgSize to a multiple of 8 */
+  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+  bool NeedsAlign = (Align > 8);
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *overflowMBB;
+  MachineBasicBlock *offsetMBB;
+  MachineBasicBlock *endMBB;
+
+  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
+  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
+  unsigned OffsetReg = 0;
+
+  if (!UseGPOffset && !UseFPOffset) {
+    // If we only pull from the overflow region, we don't create a branch.
+    // We don't need to alter control flow.
+    OffsetDestReg = 0; // unused
+    OverflowDestReg = DestReg;
+
+    offsetMBB = nullptr;
+    overflowMBB = thisMBB;
+    endMBB = thisMBB;
+  } else {
+    // First emit code to check if gp_offset (or fp_offset) is below the bound.
+    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+    // If not, pull from overflow_area. (branch to overflowMBB)
+    //
+    //       thisMBB
+    //         |     .
+    //         |        .
+    //     offsetMBB   overflowMBB
+    //         |        .
+    //         |     .
+    //        endMBB
+
+    // Registers for the PHI in endMBB
+    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+    MachineFunction *MF = MBB->getParent();
+    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+    MachineFunction::iterator MBBIter = ++MBB->getIterator();
+
+    // Insert the new basic blocks
+    MF->insert(MBBIter, offsetMBB);
+    MF->insert(MBBIter, overflowMBB);
+    MF->insert(MBBIter, endMBB);
+
+    // Transfer the remainder of MBB and its successor edges to endMBB.
+    endMBB->splice(endMBB->begin(), thisMBB,
+                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+    // Make offsetMBB and overflowMBB successors of thisMBB
+    thisMBB->addSuccessor(offsetMBB);
+    thisMBB->addSuccessor(overflowMBB);
+
+    // endMBB is a successor of both offsetMBB and overflowMBB
+    offsetMBB->addSuccessor(endMBB);
+    overflowMBB->addSuccessor(endMBB);
+
+    // Load the offset value into a register
+    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Check if there is enough room left to pull this argument.
+    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+      .addReg(OffsetReg)
+      .addImm(MaxOffset + 8 - ArgSizeA8);
+
+    // Branch to "overflowMBB" if offset >= max
+    // Fall through to "offsetMBB" otherwise
+    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+      .addMBB(overflowMBB);
+  }
+
+  // In offsetMBB, emit code to use the reg_save_area.
+  if (offsetMBB) {
+    assert(OffsetReg != 0);
+
+    // Read the reg_save_area address.
+    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, 16)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Zero-extend the offset
+    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+        .addImm(0)
+        .addReg(OffsetReg)
+        .addImm(X86::sub_32bit);
+
+    // Add the offset to the reg_save_area to get the final address.
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+      .addReg(OffsetReg64)
+      .addReg(RegSaveReg);
+
+    // Compute the offset for the next argument
+    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+      .addReg(OffsetReg)
+      .addImm(UseFPOffset ? 16 : 8);
+
+    // Store it back into the va_list.
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .addReg(NextOffsetReg)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Jump to endMBB
+    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
+      .addMBB(endMBB);
+  }
+
+  //
+  // Emit code to use overflow area
+  //
+
+  // Load the overflow_area address into a register.
+  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we need to align it, do so. Otherwise, just copy the address
+  // to OverflowDestReg.
+  if (NeedsAlign) {
+    // Align the overflow address
+    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
+    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+    // aligned_addr = (addr + (align-1)) & ~(align-1)
+    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+      .addReg(OverflowAddrReg)
+      .addImm(Align-1);
+
+    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+      .addReg(TmpReg)
+      .addImm(~(uint64_t)(Align-1));
+  } else {
+    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+      .addReg(OverflowAddrReg);
+  }
+
+  // Compute the next overflow address after this argument.
+  // (the overflow address should be kept 8-byte aligned)
+  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+    .addReg(OverflowDestReg)
+    .addImm(ArgSizeA8);
+
+  // Store the new overflow address.
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .addReg(NextAddrReg)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we branched, emit the PHI to the front of endMBB.
+  if (offsetMBB) {
+    BuildMI(*endMBB, endMBB->begin(), DL,
+            TII->get(X86::PHI), DestReg)
+      .addReg(OffsetDestReg).addMBB(offsetMBB)
+      .addReg(OverflowDestReg).addMBB(overflowMBB);
+  }
+
+  // Erase the pseudo instruction
+  MI.eraseFromParent();
+
+  return endMBB;
+}
+
+MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+    MachineInstr &MI, MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
+
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = ++MBB->getIterator();
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
+
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned CountReg = MI.getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
+
+  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
+
+  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
+  // that was just emitted, but clearly shouldn't be "saved".
+  assert((MI.getNumOperands() <= 3 ||
+          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
+          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
+         "Expected last argument to be EFLAGS");
+  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  // In the XMM save block, save all the XMM argument registers.
+  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    MachineMemOperand *MMO = F->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
+        MachineMemOperand::MOStore,
+        /*Size=*/16, /*Align=*/16);
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+        .addFrameIndex(RegSaveFrameIndex)
+        .addImm(/*Scale=*/1)
+        .addReg(/*IndexReg=*/0)
+        .addImm(/*Disp=*/Offset)
+        .addReg(/*Segment=*/0)
+        .addReg(MI.getOperand(i).getReg())
+        .addMemOperand(MMO);
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+  return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(std::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
+// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
+// together with other CMOV pseudo-opcodes into a single basic-block with
+// conditional jump around it.
+static bool isCMOVPseudo(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_GR8:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  case X86::CMOV_V8I1:
+  case X86::CMOV_V16I1:
+  case X86::CMOV_V32I1:
+  case X86::CMOV_V64I1:
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
+                                     MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+
+  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+  // as described above, by inserting a BB, and then making a PHI at the join
+  // point to select the true and false operands of the CMOV in the PHI.
+  //
+  // The code also handles two different cases of multiple CMOV opcodes
+  // in a row.
+  //
+  // Case 1:
+  // In this case, there are multiple CMOVs in a row, all which are based on
+  // the same condition setting (or the exact opposite condition setting).
+  // In this case we can lower all the CMOVs using a single inserted BB, and
+  // then make a number of PHIs at the join point to model the CMOVs. The only
+  // trickiness here, is that in a case like:
+  //
+  // t2 = CMOV cond1 t1, f1
+  // t3 = CMOV cond1 t2, f2
+  //
+  // when rewriting this into PHIs, we have to perform some renaming on the
+  // temps since you cannot have a PHI operand refer to a PHI result earlier
+  // in the same block.  The "simple" but wrong lowering would be:
+  //
+  // t2 = PHI t1(BB1), f1(BB2)
+  // t3 = PHI t2(BB1), f2(BB2)
+  //
+  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+  // renaming is to note that on the path through BB1, t2 is really just a
+  // copy of t1, and do that renaming, properly generating:
+  //
+  // t2 = PHI t1(BB1), f1(BB2)
+  // t3 = PHI t1(BB1), f2(BB2)
+  //
+  // Case 2, we lower cascaded CMOVs such as
+  //
+  //   (CMOV (CMOV F, T, cc1), T, cc2)
+  //
+  // to two successives branches.  For that, we look for another CMOV as the
+  // following instruction.
+  //
+  // Without this, we would add a PHI between the two jumps, which ends up
+  // creating a few copies all around. For instance, for
+  //
+  //    (sitofp (zext (fcmp une)))
+  //
+  // we would generate:
+  //
+  //         ucomiss %xmm1, %xmm0
+  //         movss  <1.0f>, %xmm0
+  //         movaps  %xmm0, %xmm1
+  //         jne     .LBB5_2
+  //         xorps   %xmm1, %xmm1
+  // .LBB5_2:
+  //         jp      .LBB5_4
+  //         movaps  %xmm1, %xmm0
+  // .LBB5_4:
+  //         retq
+  //
+  // because this custom-inserter would have generated:
+  //
+  //   A
+  //   | \
+  //   |  B
+  //   | /
+  //   C
+  //   | \
+  //   |  D
+  //   | /
+  //   E
+  //
+  // A: X = ...; Y = ...
+  // B: empty
+  // C: Z = PHI [X, A], [Y, B]
+  // D: empty
+  // E: PHI [X, C], [Z, D]
+  //
+  // If we lower both CMOVs in a single step, we can instead generate:
+  //
+  //   A
+  //   | \
+  //   |  C
+  //   | /|
+  //   |/ |
+  //   |  |
+  //   |  D
+  //   | /
+  //   E
+  //
+  // A: X = ...; Y = ...
+  // D: empty
+  // E: PHI [X, A], [X, C], [Y, D]
+  //
+  // Which, in our sitofp/fcmp example, gives us something like:
+  //
+  //         ucomiss %xmm1, %xmm0
+  //         movss  <1.0f>, %xmm0
+  //         jne     .LBB5_4
+  //         jp      .LBB5_4
+  //         xorps   %xmm0, %xmm0
+  // .LBB5_4:
+  //         retq
+  //
+  MachineInstr *CascadedCMOV = nullptr;
+  MachineInstr *LastCMOV = &MI;
+  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+  MachineBasicBlock::iterator NextMIIt =
+      std::next(MachineBasicBlock::iterator(MI));
+
+  // Check for case 1, where there are multiple CMOVs with the same condition
+  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
+  // number of jumps the most.
+
+  if (isCMOVPseudo(MI)) {
+    // See if we have a string of CMOVS with the same condition.
+    while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
+           (NextMIIt->getOperand(3).getImm() == CC ||
+            NextMIIt->getOperand(3).getImm() == OppCC)) {
+      LastCMOV = &*NextMIIt;
+      ++NextMIIt;
+    }
+  }
+
+  // This checks for case 2, but only do this if we didn't already find
+  // case 1, as indicated by LastCMOV == MI.
+  if (LastCMOV == &MI && NextMIIt != BB->end() &&
+      NextMIIt->getOpcode() == MI.getOpcode() &&
+      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
+      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
+      NextMIIt->getOperand(1).isKill()) {
+    CascadedCMOV = &*NextMIIt;
+  }
+
+  MachineBasicBlock *jcc1MBB = nullptr;
+
+  // If we have a cascaded CMOV, we lower it to two successive branches to
+  // the same block.  EFLAGS is used by both, so mark it as live in the second.
+  if (CascadedCMOV) {
+    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, jcc1MBB);
+    jcc1MBB->addLiveIn(X86::EFLAGS);
+  }
+
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // If the EFLAGS register isn't dead in the terminator, then claim that it's
+  // live into the sink and copy blocks.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
+  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  if (CascadedCMOV) {
+    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
+    BB->addSuccessor(jcc1MBB);
+
+    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
+    // jump to the sinkMBB.
+    jcc1MBB->addSuccessor(copy0MBB);
+    jcc1MBB->addSuccessor(sinkMBB);
+  } else {
+    BB->addSuccessor(copy0MBB);
+  }
+
+  // The true block target of the first (or only) branch is always sinkMBB.
+  BB->addSuccessor(sinkMBB);
+
+  // Create the conditional branch instruction.
+  unsigned Opc = X86::GetCondBranchFromCond(CC);
+  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+  if (CascadedCMOV) {
+    unsigned Opc2 = X86::GetCondBranchFromCond(
+        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
+    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+  }
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  copy0MBB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+  MachineBasicBlock::iterator MIItEnd =
+    std::next(MachineBasicBlock::iterator(LastCMOV));
+  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+  MachineInstrBuilder MIB;
+
+  // As we are creating the PHIs, we have to be careful if there is more than
+  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
+  // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from earlier PHI's
+  // destination registers, and the registers that went into the PHI.
+
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+    unsigned DestReg = MIIt->getOperand(0).getReg();
+    unsigned Op1Reg = MIIt->getOperand(1).getReg();
+    unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+    // If this CMOV we are generating is the opposite condition from
+    // the jump we generated, then we have to swap the operands for the
+    // PHI that is going to be generated.
+    if (MIIt->getOperand(3).getImm() == OppCC)
+        std::swap(Op1Reg, Op2Reg);
+
+    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+      Op1Reg = RegRewriteTable[Op1Reg].first;
+
+    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+      Op2Reg = RegRewriteTable[Op2Reg].second;
+
+    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
+                  TII->get(X86::PHI), DestReg)
+          .addReg(Op1Reg).addMBB(copy0MBB)
+          .addReg(Op2Reg).addMBB(thisMBB);
+
+    // Add this PHI to the rewrite table.
+    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+  }
+
+  // If we have a cascaded CMOV, the second Jcc provides the same incoming
+  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+  if (CascadedCMOV) {
+    MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
+    // Copy the PHI result to the register defined by the second CMOV.
+    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+            DL, TII->get(TargetOpcode::COPY),
+            CascadedCMOV->getOperand(0).getReg())
+        .addReg(MI.getOperand(0).getReg());
+    CascadedCMOV->eraseFromParent();
+  }
+
+  // Now remove the CMOV(s).
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
+    (MIIt++)->eraseFromParent();
+
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  // Combine the following atomic floating-point modification pattern:
+  //   a.store(reg OP a.load(acquire), release)
+  // Transform them into:
+  //   OPss (%gpr), %xmm
+  //   movss %xmm, (%gpr)
+  // Or sd equivalent for 64-bit operations.
+  unsigned MOp, FOp;
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
+  case X86::RELEASE_FADD32mr:
+    FOp = X86::ADDSSrm;
+    MOp = X86::MOVSSmr;
+    break;
+  case X86::RELEASE_FADD64mr:
+    FOp = X86::ADDSDrm;
+    MOp = X86::MOVSDmr;
+    break;
+  }
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned ValOpIdx = X86::AddrNumOperands;
+  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
+  MachineInstrBuilder MIB =
+      BuildMI(*BB, MI, DL, TII->get(FOp),
+              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
+          .addReg(VSrc);
+  for (int i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand &Operand = MI.getOperand(i);
+    // Clear any kill flags on register operands as we'll create a second
+    // instruction using the same address operands.
+    if (Operand.isReg())
+      Operand.setIsKill(false);
+    MIB.addOperand(Operand);
+  }
+  MachineInstr *FOpMI = MIB;
+  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI.getOperand(i));
+  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+  assert(MF->shouldSplitStack());
+
+  const bool Is64Bit = Subtarget.is64Bit();
+  const bool IsLP64 = Subtarget.isTarget64BitLP64();
+
+  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
+
+  // BB:
+  //  ... [Till the alloca]
+  // If stacklet is not large enough, jump to mallocMBB
+  //
+  // bumpMBB:
+  //  Allocate by subtracting from RSP
+  //  Jump to continueMBB
+  //
+  // mallocMBB:
+  //  Allocate by call to runtime
+  //
+  // continueMBB:
+  //  ...
+  //  [rest of original BB]
+  //
+
+  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *AddrRegClass =
+      getRegClassFor(getPointerTy(MF->getDataLayout()));
+
+  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+           sizeVReg = MI.getOperand(1).getReg(),
+           physSPReg =
+               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
+
+  MachineFunction::iterator MBBIter = ++BB->getIterator();
+
+  MF->insert(MBBIter, bumpMBB);
+  MF->insert(MBBIter, mallocMBB);
+  MF->insert(MBBIter, continueMBB);
+
+  continueMBB->splice(continueMBB->begin(), BB,
+                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add code to the main basic block to check if the stack limit has been hit,
+  // and if so, jump to mallocMBB otherwise to bumpMBB.
+  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+    .addReg(tmpSPVReg).addReg(sizeVReg);
+  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(SPLimitVReg);
+  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
+
+  // bumpMBB simply decreases the stack pointer, since we know the current
+  // stacklet has enough space.
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+  // Calls into a routine in libgcc to allocate more space from the heap.
+  const uint32_t *RegMask =
+      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
+  if (IsLP64) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::RDI, RegState::Implicit)
+      .addReg(X86::RAX, RegState::ImplicitDefine);
+  } else if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EDI, RegState::Implicit)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
+  } else {
+    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+      .addImm(12);
+    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
+  }
+
+  if (!Is64Bit)
+    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+      .addImm(16);
+
+  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+    .addReg(IsLP64 ? X86::RAX : X86::EAX);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
+
+  // Set up the CFG correctly.
+  BB->addSuccessor(bumpMBB);
+  BB->addSuccessor(mallocMBB);
+  mallocMBB->addSuccessor(continueMBB);
+  bumpMBB->addSuccessor(continueMBB);
+
+  // Take care of the PHI nodes.
+  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+          MI.getOperand(0).getReg())
+      .addReg(mallocPtrVReg)
+      .addMBB(mallocMBB)
+      .addReg(bumpSPPtrVReg)
+      .addMBB(bumpMBB);
+
+  // Delete the original pseudo instruction.
+  MI.eraseFromParent();
+
+  // And we're done.
+  return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(!isAsynchronousEHPersonality(
+             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+         "SEH does not use catchret!");
+
+  // Only 32-bit EH needs to worry about manually restoring stack pointers.
+  if (!Subtarget.is32Bit())
+    return BB;
+
+  // C++ EH creates a new target block to hold the restore code, and wires up
+  // the new block to the return destination with a normal JMP_4.
+  MachineBasicBlock *RestoreMBB =
+      MF->CreateMachineBasicBlock(BB->getBasicBlock());
+  assert(BB->succ_size() == 1);
+  MF->insert(std::next(BB->getIterator()), RestoreMBB);
+  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
+  BB->addSuccessor(RestoreMBB);
+  MI.getOperand(0).setMBB(RestoreMBB);
+
+  auto RestoreMBBI = RestoreMBB->begin();
+  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
+  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
+  // Only 32-bit SEH requires special handling for catchpad.
+  if (IsSEH && Subtarget.is32Bit()) {
+    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
+  }
+  MI.eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
+                                      MachineBasicBlock *BB) const {
+  // So, here we replace TLSADDR with the sequence:
+  // adjust_stackdown -> TLSADDR -> adjust_stackup.
+  // We need this because TLSADDR is lowered into calls
+  // inside MC, therefore without the two markers shrink-wrapping
+  // may push the prologue/epilogue pass them.
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction &MF = *BB->getParent();
+
+  // Emit CALLSEQ_START right before the instruction.
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  MachineInstrBuilder CallseqStart =
+    BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
+
+  // Emit CALLSEQ_END right after the instruction.
+  // We don't call erase from parent because we want to keep the
+  // original instruction around.
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  MachineInstrBuilder CallseqEnd =
+    BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
+  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
+
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
+                                      MachineBasicBlock *BB) const {
+  // This is pretty easy.  We're taking the value that we received from
+  // our load from the relocation, sticking it in either RDI (x86-64)
+  // or EAX and doing an indirect call.  The return value will then
+  // be in the normal return register.
+  MachineFunction *F = BB->getParent();
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
+  assert(MI.getOperand(3).isGlobal() && "This should be a global");
+
+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *RegMask =
+      Subtarget.is64Bit() ?
+      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
+      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+  if (Subtarget.is64Bit()) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
+            .addReg(X86::RIP)
+            .addImm(0)
+            .addReg(0)
+            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+                              MI.getOperand(3).getTargetFlags())
+            .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI);
+    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else if (!isPositionIndependent()) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+            .addReg(0)
+            .addImm(0)
+            .addReg(0)
+            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+                              MI.getOperand(3).getTargetFlags())
+            .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
+            .addReg(TII->getGlobalBaseReg(F))
+            .addImm(0)
+            .addReg(0)
+            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
+                              MI.getOperand(3).getTargetFlags())
+            .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  }
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = ++MBB->getIterator();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+  unsigned DstReg;
+  unsigned MemOpndSlot = 0;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI.getOperand(CurOp++).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MemOpndSlot = CurOp;
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
+  //  SjLjSetup restoreMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+  // restoreMBB:
+  //  if base pointer being used, load it from frame
+  //  v_restore = 1
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+  MF->push_back(restoreMBB);
+  restoreMBB->setHasAddressTaken();
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  unsigned PtrStoreOpc = 0;
+  unsigned LabelReg = 0;
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+                     !isPositionIndependent();
+
+  // Prepare IP either in reg or imm.
+  if (!UseImmLabel) {
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+    LabelReg = MRI.createVirtualRegister(PtrRC);
+    if (Subtarget.is64Bit()) {
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB)
+              .addReg(0);
+    } else {
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+              .addReg(XII->getGlobalBaseReg(MF))
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+              .addReg(0);
+    }
+  } else
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  // Store IP
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
+    else
+      MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+  }
+  if (!UseImmLabel)
+    MIB.addReg(LabelReg);
+  else
+    MIB.addMBB(restoreMBB);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+          .addMBB(restoreMBB);
+
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  MIB.addRegMask(RegInfo->getNoPreservedMask());
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
+
+  // mainMBB:
+  //  EAX = 0
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(restoreMBB);
+
+  // restoreMBB:
+  if (RegInfo->hasBasePointer(*MF)) {
+    const bool Uses64BitFramePtr =
+        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+    unsigned BasePtr = RegInfo->getBaseRegister();
+    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  MI.eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+  unsigned SP = RegInfo->getStackRegister();
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+  // Reload FP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI.getOperand(i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload IP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI.getOperand(i), LabelOffset);
+    else
+      MIB.addOperand(MI.getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload SP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI.getOperand(i), SPOffset);
+    else
+      MIB.addOperand(MI.getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Jump
+  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+  MI.eraseFromParent();
+  return MBB;
+}
+
+void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
+                                               MachineBasicBlock *MBB,
+                                               MachineBasicBlock *DispatchBB,
+                                               int FI) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MVT PVT = getPointerTy(MF->getDataLayout());
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
+
+  unsigned Op = 0;
+  unsigned VR = 0;
+
+  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
+                     !isPositionIndependent();
+
+  if (UseImmLabel) {
+    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  } else {
+    const TargetRegisterClass *TRC =
+        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+    VR = MRI->createVirtualRegister(TRC);
+    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+
+    /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
+
+    if (Subtarget.is64Bit())
+      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
+          .addReg(X86::RIP)
+          .addImm(1)
+          .addReg(0)
+          .addMBB(DispatchBB)
+          .addReg(0);
+    else
+      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
+          .addReg(0) /* XII->getGlobalBaseReg(MF) */
+          .addImm(1)
+          .addReg(0)
+          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
+          .addReg(0);
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
+  addFrameReference(MIB, FI, 36);
+  if (UseImmLabel)
+    MIB.addMBB(DispatchBB);
+  else
+    MIB.addReg(VR);
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = BB->getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  int FI = MFI.getFunctionContextIndex();
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  for (auto &MBB : *MF) {
+    if (!MBB.isEHPad())
+      continue;
+
+    MCSymbol *Sym = nullptr;
+    for (const auto &MI : MBB) {
+      if (MI.isDebugValue())
+        continue;
+
+      assert(MI.isEHLabel() && "expected EH_LABEL");
+      Sym = MI.getOperand(0).getMCSymbol();
+      break;
+    }
+
+    if (!MF->hasCallSiteLandingPad(Sym))
+      continue;
+
+    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
+      CallSiteNumToLPad[CSI].push_back(&MBB);
+      MaxCSNum = std::max(MaxCSNum, CSI);
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock *> LPadList;
+  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+
+  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
+    for (auto &LP : CallSiteNumToLPad[CSI]) {
+      LPadList.push_back(LP);
+      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsEHPad(true);
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert MBBs.
+  MF->push_back(DispatchBB);
+  MF->push_back(DispContBB);
+  MF->push_back(TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
+
+  // Create the jump table and associated information
+  MachineJumpTableInfo *JTI =
+      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+
+  const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
+  const X86RegisterInfo &RI = XII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers.  This results in all
+  // registers being marked as clobbered.
+  if (RI.hasBasePointer(*MF)) {
+    const bool FPIs64Bit =
+        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
+    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setRestoreBasePointer(MF);
+
+    unsigned FP = RI.getFrameRegister(*MF);
+    unsigned BP = RI.getBaseRegister();
+    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
+                 MFI->getRestoreBasePointerOffset())
+        .addRegMask(RI.getNoPreservedMask());
+  } else {
+    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
+        .addRegMask(RI.getNoPreservedMask());
+  }
+
+  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
+                    4);
+  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
+      .addReg(IReg)
+      .addImm(LPadList.size());
+  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
+
+  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
+      .addReg(IReg)
+      .addImm(1);
+  BuildMI(DispContBB, DL,
+          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
+      .addReg(0)
+      .addImm(Subtarget.is64Bit() ? 8 : 4)
+      .addReg(JReg)
+      .addJumpTableIndex(MJTI)
+      .addReg(0);
+
+  // Add the jump table entries as successors to the MBB.
+  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
+  for (auto &LP : LPadList)
+    if (SeenMBBs.insert(LP).second)
+      DispContBB->addSuccessor(LP);
+
+  // N.B. the order the invoke BBs are processed in doesn't matter here.
+  SmallVector<MachineBasicBlock *, 64> MBBLPads;
+  const MCPhysReg *SavedRegs =
+      Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+  for (MachineBasicBlock *MBB : InvokeBBs) {
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    // Keep a copy of Successors since it's modified inside the loop.
+    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
+                                                   MBB->succ_rend());
+    // FIXME: Avoid quadratic complexity.
+    for (auto MBBS : Successors) {
+      if (MBBS->isEHPad()) {
+        MBB->removeSuccessor(MBBS);
+        MBBLPads.push_back(MBBS);
+      }
+    }
+
+    MBB->addSuccessor(DispatchBB);
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled.  This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (auto &II : reverse(*MBB)) {
+      if (!II.isCall())
+        continue;
+
+      DenseMap<unsigned, bool> DefRegs;
+      for (auto &MOp : II.operands())
+        if (MOp.isReg())
+          DefRegs[MOp.getReg()] = true;
+
+      MachineInstrBuilder MIB(*MF, &II);
+      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
+        unsigned Reg = SavedRegs[RI];
+        if (!DefRegs[Reg])
+          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      break;
+    }
+  }
+
+  // Mark all former landing pads as non-landing pads.  The dispatch is the only
+  // landing pad now.
+  for (auto &LP : MBBLPads)
+    LP->setIsEHPad(false);
+
+  // The instruction is gone now.
+  MI.eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                               MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unexpected instr type to insert");
+  case X86::TAILJMPd64:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+  case X86::TAILJMPr64_REX:
+  case X86::TAILJMPm64_REX:
+    llvm_unreachable("TAILJMP64 would not be touched here.");
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    return BB;
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return EmitLoweredTLSAddr(MI, BB);
+  case X86::CATCHRET:
+    return EmitLoweredCatchRet(MI, BB);
+  case X86::CATCHPAD:
+    return EmitLoweredCatchPad(MI, BB);
+  case X86::SEG_ALLOCA_32:
+  case X86::SEG_ALLOCA_64:
+    return EmitLoweredSegAlloca(MI, BB);
+  case X86::TLSCall_32:
+  case X86::TLSCall_64:
+    return EmitLoweredTLSCall(MI, BB);
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_FR128:
+  case X86::CMOV_GR8:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
+  case X86::CMOV_V8I1:
+  case X86::CMOV_V16I1:
+  case X86::CMOV_V32I1:
+  case X86::CMOV_V64I1:
+    return EmitLoweredSelect(MI, BB);
+
+  case X86::RDFLAGS32:
+  case X86::RDFLAGS64: {
+    unsigned PushF =
+        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
+    // Permit reads of the FLAGS register without it being defined.
+    // This intrinsic exists to read external processor state in flags, such as
+    // the trap flag, interrupt flag, and direction flag, none of which are
+    // modeled by the backend.
+    Push->getOperand(2).setIsUndef();
+    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
+
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+
+  case X86::WRFLAGS32:
+  case X86::WRFLAGS64: {
+    unsigned Push =
+        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+    unsigned PopF =
+        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
+    BuildMI(*BB, MI, DL, TII->get(PopF));
+
+    MI.eraseFromParent(); // The pseudo is gone now.
+    return BB;
+  }
+
+  case X86::RELEASE_FADD32mr:
+  case X86::RELEASE_FADD64mr:
+    return EmitLoweredAtomicFP(MI, BB);
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM:
+  case X86::FP80_TO_INT16_IN_MEM:
+  case X86::FP80_TO_INT32_IN_MEM:
+  case X86::FP80_TO_INT64_IN_MEM: {
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the high byte of the control word...
+    unsigned OldCW =
+      MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+                      CWFrameIdx);
+
+    // Set the high part to be round to zero...
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+    }
+
+    X86AddressMode AM = getAddressFromInstr(&MI, 0);
+    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
+    // String/text processing lowering.
+  case X86::PCMPISTRM128REG:
+  case X86::VPCMPISTRM128REG:
+  case X86::PCMPISTRM128MEM:
+  case X86::VPCMPISTRM128MEM:
+  case X86::PCMPESTRM128REG:
+  case X86::VPCMPESTRM128REG:
+  case X86::PCMPESTRM128MEM:
+  case X86::VPCMPESTRM128MEM:
+    assert(Subtarget.hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
+
+  // String/text processing lowering.
+  case X86::PCMPISTRIREG:
+  case X86::VPCMPISTRIREG:
+  case X86::PCMPISTRIMEM:
+  case X86::VPCMPISTRIMEM:
+  case X86::PCMPESTRIREG:
+  case X86::VPCMPESTRIREG:
+  case X86::PCMPESTRIMEM:
+  case X86::VPCMPESTRIMEM:
+    assert(Subtarget.hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
+
+  // Thread synchronization.
+  case X86::MONITOR:
+    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
+  case X86::MONITORX:
+    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+  // PKU feature
+  case X86::WRPKRU:
+    return emitWRPKRU(MI, BB, Subtarget);
+  case X86::RDPKRU:
+    return emitRDPKRU(MI, BB, Subtarget);
+  // xbegin
+  case X86::XBEGIN:
+    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
+
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+  case X86::VAARG_64:
+    return EmitVAARG64WithCustomInserter(MI, BB);
+
+  case X86::EH_SjLj_SetJmp32:
+  case X86::EH_SjLj_SetJmp64:
+    return emitEHSjLjSetJmp(MI, BB);
+
+  case X86::EH_SjLj_LongJmp32:
+  case X86::EH_SjLj_LongJmp64:
+    return emitEHSjLjLongJmp(MI, BB);
+
+  case X86::Int_eh_sjlj_setup_dispatch:
+    return EmitSjLjDispatchBlock(MI, BB);
+
+  case TargetOpcode::STATEPOINT:
+    // As an implementation detail, STATEPOINT shares the STACKMAP format at
+    // this point in the process.  We diverge later.
+    return emitPatchPoint(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
+
+  case X86::LCMPXCHG8B: {
+    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
+    // requires a memory operand. If it happens that current architecture is
+    // i686 and for current function we need a base pointer
+    // - which is ESI for i686 - register allocator would not be able to
+    // allocate registers for an address in form of X(%reg, %reg, Y)
+    // - there never would be enough unreserved registers during regalloc
+    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
+    // We are giving a hand to register allocator by precomputing the address in
+    // a new vreg using LEA.
+
+    // If it is not i686 or there is no base pointer - nothing to do here.
+    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
+      return BB;
+
+    // Even though this code does not necessarily needs the base pointer to
+    // be ESI, we check for that. The reason: if this assert fails, there are
+    // some changes happened in the compiler base pointer handling, which most
+    // probably have to be addressed somehow here.
+    assert(TRI->getBaseRegister() == X86::ESI &&
+           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
+           "base pointer in mind");
+
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    MVT SPTy = getPointerTy(MF->getDataLayout());
+    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+
+    X86AddressMode AM = getAddressFromInstr(&MI, 0);
+    // Regalloc does not need any help when the memory operand of CMPXCHG8B
+    // does not use index register.
+    if (AM.IndexReg == X86::NoRegister)
+      return BB;
+
+    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
+    // four operand definitions that are E[ABCD] registers. We skip them and
+    // then insert the LEA.
+    MachineBasicBlock::iterator MBBI(MI);
+    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
+           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
+      --MBBI;
+    addFullAddress(
+        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
+
+    setDirectAddressInInstr(&MI, 0, computedAddrVReg);
+
+    return BB;
+  }
+  case X86::LCMPXCHG16B:
+    return BB;
+  case X86::LCMPXCHG8B_SAVE_EBX:
+  case X86::LCMPXCHG16B_SAVE_RBX: {
+    unsigned BasePtr =
+        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
+    if (!BB->isLiveIn(BasePtr))
+      BB->addLiveIn(BasePtr);
+    return BB;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean.
+    if (Op.getResNo() == 0)
+      break;
+    LLVM_FALLTHROUGH;
+  case X86ISD::SETCC:
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case X86ISD::MOVMSK: {
+    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+    break;
+  }
+  case X86ISD::VZEXT: {
+    SDValue N0 = Op.getOperand(0);
+    unsigned NumElts = Op.getValueType().getVectorNumElements();
+    unsigned InNumElts = N0.getValueType().getVectorNumElements();
+    unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
+
+    KnownZero = KnownOne = APInt(InBitWidth, 0);
+    APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
+    DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+    KnownOne = KnownOne.zext(BitWidth);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+    break;
+  }
+  }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+    SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
+  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+    return Op.getScalarValueSizeInBits();
+
+  if (Op.getOpcode() == X86ISD::VSEXT) {
+    EVT VT = Op.getValueType();
+    EVT SrcVT = Op.getOperand(0).getValueType();
+    unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+    return Tmp;
+  }
+
+  // Fallback case.
+  return 1;
+}
+
+/// Returns true (and the GlobalValue and the offset) if the node is a
+/// GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       const GlobalValue* &GA,
+                                       int64_t &Offset) const {
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+      return true;
+    }
+  }
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+// Attempt to match a combined shuffle mask against supported unary shuffle
+// instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+                                    const X86Subtarget &Subtarget,
+                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
+  bool FloatDomain = MaskVT.isFloatingPoint() ||
+                     (!Subtarget.hasAVX2() && MaskVT.is256BitVector());
+
+  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+      isUndefOrEqual(Mask[0], 0) &&
+      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+    Shuffle = X86ISD::VZEXT_MOVL;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    return true;
+  }
+
+  // Match against a VZEXT instruction.
+  // TODO: Add 256/512-bit vector support.
+  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+    unsigned MaxScale = 64 / MaskEltSize;
+    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+      bool Match = true;
+      unsigned NumDstElts = NumMaskElts / Scale;
+      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
+        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
+        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+      }
+      if (Match) {
+        SrcVT = MaskVT;
+        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
+        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
+        Shuffle = X86ISD::VZEXT;
+        return true;
+      }
+    }
+  }
+
+  // Check if we have SSE3 which will let us use MOVDDUP etc. The
+  // instructions are no slower than UNPCKLPD but has the option to
+  // fold the input operand into even an unaligned memory load.
+  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+      Shuffle = X86ISD::MOVDDUP;
+      SrcVT = DstVT = MVT::v2f64;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+      Shuffle = X86ISD::MOVSLDUP;
+      SrcVT = DstVT = MVT::v4f32;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+      Shuffle = X86ISD::MOVSHDUP;
+      SrcVT = DstVT = MVT::v4f32;
+      return true;
+    }
+  }
+
+  if (MaskVT.is256BitVector() && FloatDomain) {
+    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+      Shuffle = X86ISD::MOVDDUP;
+      SrcVT = DstVT = MVT::v4f64;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+      Shuffle = X86ISD::MOVSLDUP;
+      SrcVT = DstVT = MVT::v8f32;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+      Shuffle = X86ISD::MOVSHDUP;
+      SrcVT = DstVT = MVT::v8f32;
+      return true;
+    }
+  }
+
+  if (MaskVT.is512BitVector() && FloatDomain) {
+    assert(Subtarget.hasAVX512() &&
+           "AVX512 required for 512-bit vector shuffles");
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+      Shuffle = X86ISD::MOVDDUP;
+      SrcVT = DstVT = MVT::v8f64;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(
+            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+      Shuffle = X86ISD::MOVSLDUP;
+      SrcVT = DstVT = MVT::v16f32;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(
+            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+      Shuffle = X86ISD::MOVSHDUP;
+      SrcVT = DstVT = MVT::v16f32;
+      return true;
+    }
+  }
+
+  // Attempt to match against broadcast-from-vector.
+  if (Subtarget.hasAVX2()) {
+    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
+    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
+      SrcVT = DstVT = MaskVT;
+      Shuffle = X86ISD::VBROADCAST;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+                                           const X86Subtarget &Subtarget,
+                                           unsigned &Shuffle, MVT &ShuffleVT,
+                                           unsigned &PermuteImm) {
+  unsigned NumMaskElts = Mask.size();
+  bool FloatDomain = MaskVT.isFloatingPoint();
+
+  bool ContainsZeros = false;
+  SmallBitVector Zeroable(NumMaskElts, false);
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    Zeroable[i] = isUndefOrZero(M);
+    ContainsZeros |= (M == SM_SentinelZero);
+  }
+
+  // Attempt to match against byte/bit shifts.
+  // FIXME: Add 512-bit support.
+  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+                                             MaskVT.getScalarSizeInBits(), Mask,
+                                             0, Zeroable, Subtarget);
+    if (0 < ShiftAmt) {
+      PermuteImm = (unsigned)ShiftAmt;
+      return true;
+    }
+  }
+
+  // Ensure we don't contain any zero elements.
+  if (ContainsZeros)
+    return false;
+
+  assert(llvm::all_of(Mask, [&](int M) {
+                        return SM_SentinelUndef <= M && M < (int)NumMaskElts;
+                      }) && "Expected unary shuffle");
+
+  unsigned InputSizeInBits = MaskVT.getSizeInBits();
+  unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
+  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+  // Handle PSHUFLW/PSHUFHW repeated patterns.
+  if (MaskScalarSizeInBits == 16) {
+    SmallVector<int, 4> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+      ArrayRef<int> LoMask(Mask.data() + 0, 4);
+      ArrayRef<int> HiMask(Mask.data() + 4, 4);
+
+      // PSHUFLW: permute lower 4 elements only.
+      if (isUndefOrInRange(LoMask, 0, 4) &&
+          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+        Shuffle = X86ISD::PSHUFLW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+        PermuteImm = getV4X86ShuffleImm(LoMask);
+        return true;
+      }
+
+      // PSHUFHW: permute upper 4 elements only.
+      if (isUndefOrInRange(HiMask, 4, 8) &&
+          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+        // Offset the HiMask so that we can create the shuffle immediate.
+        int OffsetHiMask[4];
+        for (int i = 0; i != 4; ++i)
+          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+        Shuffle = X86ISD::PSHUFHW;
+        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+        return true;
+      }
+
+      return false;
+    }
+    return false;
+  }
+
+  // We only support permutation of 32/64 bit elements after this.
+  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
+    return false;
+
+  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+  if (FloatDomain && !Subtarget.hasAVX())
+    return false;
+
+  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
+  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
+    FloatDomain = true;
+
+  // Check for lane crossing permutes.
+  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+    if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
+      Shuffle = X86ISD::VPERMI;
+      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+      PermuteImm = getV4X86ShuffleImm(Mask);
+      return true;
+    }
+    if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
+      SmallVector<int, 4> RepeatedMask;
+      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+        Shuffle = X86ISD::VPERMI;
+        ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // VPERMILPD can permute with a non-repeating shuffle.
+  if (FloatDomain && MaskScalarSizeInBits == 64) {
+    Shuffle = X86ISD::VPERMILPI;
+    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+    PermuteImm = 0;
+    for (int i = 0, e = Mask.size(); i != e; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef)
+        continue;
+      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+      PermuteImm |= (M & 1) << i;
+    }
+    return true;
+  }
+
+  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
+  SmallVector<int, 4> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
+    return false;
+
+  // Narrow the repeated mask for 32-bit element permutes.
+  SmallVector<int, 4> WordMask = RepeatedMask;
+  if (MaskScalarSizeInBits == 64)
+    scaleShuffleMask(2, RepeatedMask, WordMask);
+
+  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+  PermuteImm = getV4X86ShuffleImm(WordMask);
+  return true;
+}
+
+// Attempt to match a combined unary shuffle mask against supported binary
+// shuffle instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+                                     SDValue &V1, SDValue &V2,
+                                     const X86Subtarget &Subtarget,
+                                     unsigned &Shuffle, MVT &ShuffleVT,
+                                     bool IsUnary) {
+  bool FloatDomain = MaskVT.isFloatingPoint();
+  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+
+  if (MaskVT.is128BitVector()) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+      V2 = V1;
+      Shuffle = X86ISD::MOVLHPS;
+      ShuffleVT = MVT::v4f32;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+      V2 = V1;
+      Shuffle = X86ISD::MOVHLPS;
+      ShuffleVT = MVT::v4f32;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
+        (FloatDomain || !Subtarget.hasSSE41())) {
+      std::swap(V1, V2);
+      Shuffle = X86ISD::MOVSD;
+      ShuffleVT = MaskVT;
+      return true;
+    }
+    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
+        (FloatDomain || !Subtarget.hasSSE41())) {
+      Shuffle = X86ISD::MOVSS;
+      ShuffleVT = MaskVT;
+      return true;
+    }
+  }
+
+  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
+  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
+      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
+    MVT LegalVT = MaskVT;
+    if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
+      LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+
+    SmallVector<int, 64> Unpckl, Unpckh;
+    if (IsUnary) {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, true);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        V2 = V1;
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+    } else {
+      createUnpackShuffleMask(MaskVT, Unpckl, true, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      createUnpackShuffleMask(MaskVT, Unpckh, false, false);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckl);
+      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKL;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+
+      ShuffleVectorSDNode::commuteMask(Unpckh);
+      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
+        std::swap(V1, V2);
+        Shuffle = X86ISD::UNPCKH;
+        ShuffleVT = LegalVT;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+                                            SDValue &V1, SDValue &V2,
+                                            SDLoc &DL, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget,
+                                            unsigned &Shuffle, MVT &ShuffleVT,
+                                            unsigned &PermuteImm) {
+  unsigned NumMaskElts = Mask.size();
+  bool FloatDomain = MaskVT.isFloatingPoint();
+
+  // Attempt to match against PALIGNR byte rotate.
+  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
+    if (0 < ByteRotation) {
+      Shuffle = X86ISD::PALIGNR;
+      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
+      PermuteImm = ByteRotation;
+      return true;
+    }
+  }
+
+  // Attempt to combine to X86ISD::BLENDI.
+  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+                           (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
+    // Determine a type compatible with X86ISD::BLENDI.
+    // TODO - add 16i16 support (requires lane duplication).
+    MVT BlendVT = MaskVT;
+    if (Subtarget.hasAVX2()) {
+      if (BlendVT == MVT::v4i64)
+        BlendVT = MVT::v8i32;
+      else if (BlendVT == MVT::v2i64)
+        BlendVT = MVT::v4i32;
+    } else {
+      if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
+        BlendVT = MVT::v8i16;
+      else if (BlendVT == MVT::v4i64)
+        BlendVT = MVT::v4f64;
+      else if (BlendVT == MVT::v8i32)
+        BlendVT = MVT::v8f32;
+    }
+
+    unsigned BlendSize = BlendVT.getVectorNumElements();
+    unsigned MaskRatio = BlendSize / NumMaskElts;
+
+    // Can we blend with zero?
+    if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
+                                         /*Low*/ 0) &&
+        NumMaskElts <= BlendVT.getVectorNumElements()) {
+      PermuteImm = 0;
+      for (unsigned i = 0; i != BlendSize; ++i)
+        if (Mask[i / MaskRatio] < 0)
+          PermuteImm |= 1u << i;
+
+      V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
+      Shuffle = X86ISD::BLENDI;
+      ShuffleVT = BlendVT;
+      return true;
+    }
+
+    // Attempt to match as a binary blend.
+    if (NumMaskElts <= BlendVT.getVectorNumElements()) {
+      bool MatchBlend = true;
+      for (int i = 0; i != (int)NumMaskElts; ++i) {
+        int M = Mask[i];
+        if (M == SM_SentinelUndef)
+          continue;
+        else if (M == SM_SentinelZero)
+          MatchBlend = false;
+        else if ((M != i) && (M != (i + (int)NumMaskElts)))
+          MatchBlend = false;
+      }
+
+      if (MatchBlend) {
+        PermuteImm = 0;
+        for (unsigned i = 0; i != BlendSize; ++i)
+          if ((int)NumMaskElts <= Mask[i / MaskRatio])
+            PermuteImm |= 1u << i;
+
+        Shuffle = X86ISD::BLENDI;
+        ShuffleVT = BlendVT;
+        return true;
+      }
+    }
+  }
+
+  // Attempt to combine to INSERTPS.
+  if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
+    SmallBitVector Zeroable(4, false);
+    for (unsigned i = 0; i != NumMaskElts; ++i)
+      if (Mask[i] < 0)
+        Zeroable[i] = true;
+
+    if (Zeroable.any() &&
+        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+      Shuffle = X86ISD::INSERTPS;
+      ShuffleVT = MVT::v4f32;
+      return true;
+    }
+  }
+
+  // Attempt to combine to SHUFPD.
+  if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
+      (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
+      (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+      Shuffle = X86ISD::SHUFP;
+      ShuffleVT = MaskVT;
+      return true;
+    }
+  }
+
+  // Attempt to combine to SHUFPS.
+  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+      (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+      (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+    SmallVector<int, 4> RepeatedMask;
+    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+        int M0 = RepeatedMask[Offset];
+        int M1 = RepeatedMask[Offset + 1];
+
+        if (isUndefInRange(RepeatedMask, Offset, 2)) {
+          return DAG.getUNDEF(MaskVT);
+        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+          return getZeroVector(MaskVT, Subtarget, DAG, DL);
+        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+          return V1;
+        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+          return V2;
+        }
+
+        return SDValue();
+      };
+
+      int ShufMask[4] = {-1, -1, -1, -1};
+      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+      if (Lo && Hi) {
+        V1 = Lo;
+        V2 = Hi;
+        Shuffle = X86ISD::SHUFP;
+        ShuffleVT = MaskVT;
+        PermuteImm = getV4X86ShuffleImm(ShufMask);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
+                                   ArrayRef<int> BaseMask, int Depth,
+                                   bool HasVariableMask, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
+  assert((Inputs.size() == 1 || Inputs.size() == 2) &&
+         "Unexpected number of shuffle inputs!");
+
+  // Find the inputs that enter the chain. Note that multiple uses are OK
+  // here, we're not going to remove the operands we find.
+  bool UnaryShuffle = (Inputs.size() == 1);
+  SDValue V1 = peekThroughBitcasts(Inputs[0]);
+  SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+
+  MVT VT1 = V1.getSimpleValueType();
+  MVT VT2 = V2.getSimpleValueType();
+  MVT RootVT = Root.getSimpleValueType();
+  assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
+         VT2.getSizeInBits() == RootVT.getSizeInBits() &&
+         "Vector size mismatch");
+
+  SDLoc DL(Root);
+  SDValue Res;
+
+  unsigned NumBaseMaskElts = BaseMask.size();
+  if (NumBaseMaskElts == 1) {
+    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  unsigned RootSizeInBits = RootVT.getSizeInBits();
+  unsigned NumRootElts = RootVT.getVectorNumElements();
+  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
+  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
+                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());
+
+  // Don't combine if we are a AVX512/EVEX target and the mask element size
+  // is different from the root element size - this would prevent writemasks
+  // from being reused.
+  // TODO - this currently prevents all lane shuffles from occurring.
+  // TODO - check for writemasks usage instead of always preventing combining.
+  // TODO - attempt to narrow Mask back to writemask size.
+  bool IsEVEXShuffle =
+      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+  if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
+    return false;
+
+  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+
+  // Handle 128-bit lane shuffles of 256-bit vectors.
+  // TODO - this should support binary shuffles.
+  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
+      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+    if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+      return false; // Nothing to do!
+    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+    unsigned PermMask = 0;
+    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+    Res = DAG.getBitcast(ShuffleVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+                      DAG.getUNDEF(ShuffleVT),
+                      DAG.getConstant(PermMask, DL, MVT::i8));
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // For masks that have been widened to 128-bit elements or more,
+  // narrow back down to 64-bit elements.
+  SmallVector<int, 64> Mask;
+  if (BaseMaskEltSizeInBits > 64) {
+    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
+    int MaskScale = BaseMaskEltSizeInBits / 64;
+    scaleShuffleMask(MaskScale, BaseMask, Mask);
+  } else {
+    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+  }
+
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
+
+  // Determine the effective mask value type.
+  FloatDomain &= (32 <= MaskEltSizeInBits);
+  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+                           : MVT::getIntegerVT(MaskEltSizeInBits);
+  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
+
+  // Only allow legal mask types.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+    return false;
+
+  // Attempt to match the mask against known shuffle patterns.
+  MVT ShuffleSrcVT, ShuffleVT;
+  unsigned Shuffle, PermuteImm;
+
+  if (UnaryShuffle) {
+    // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
+    // directly if we don't shuffle the lower element and we shuffle the upper
+    // (zero) elements within themselves.
+    if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
+        (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
+      unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+      ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
+      if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
+          isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
+        DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+                      /*AddTo*/ true);
+        return true;
+      }
+    }
+
+    if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
+                                ShuffleVT)) {
+      if (Depth == 1 && Root.getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
+      Res = DAG.getBitcast(ShuffleSrcVT, V1);
+      DCI.AddToWorklist(Res.getNode());
+      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+
+    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle,
+                                       ShuffleVT, PermuteImm)) {
+      if (Depth == 1 && Root.getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+        return false; // AVX512 Writemask clash.
+      Res = DAG.getBitcast(ShuffleVT, V1);
+      DCI.AddToWorklist(Res.getNode());
+      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+                        DAG.getConstant(PermuteImm, DL, MVT::i8));
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+  }
+
+  if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
+                               ShuffleVT, UnaryShuffle)) {
+    if (Depth == 1 && Root.getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
+    V1 = DAG.getBitcast(ShuffleVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(ShuffleVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget,
+                                      Shuffle, ShuffleVT, PermuteImm)) {
+    if (Depth == 1 && Root.getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
+      return false; // AVX512 Writemask clash.
+    V1 = DAG.getBitcast(ShuffleVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(ShuffleVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+                      DAG.getConstant(PermuteImm, DL, MVT::i8));
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // Don't try to re-form single instruction chains under any circumstances now
+  // that we've done encoding canonicalization for them.
+  if (Depth < 2)
+    return false;
+
+  bool MaskContainsZeros =
+      any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+
+  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
+    // If we have a single input lane-crossing shuffle then lower to VPERMV.
+    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+        ((Subtarget.hasAVX2() &&
+          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+         (Subtarget.hasAVX512() &&
+          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+      DCI.AddToWorklist(VPermMask.getNode());
+      Res = DAG.getBitcast(MaskVT, V1);
+      DCI.AddToWorklist(Res.getNode());
+      Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+
+    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
+    // vector as the second source.
+    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+        ((Subtarget.hasAVX512() &&
+          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+         (Subtarget.hasVLX() &&
+          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
+      for (unsigned i = 0; i != NumMaskElts; ++i)
+        if (Mask[i] == SM_SentinelZero)
+          Mask[i] = NumMaskElts + i;
+
+      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+      DCI.AddToWorklist(VPermMask.getNode());
+      Res = DAG.getBitcast(MaskVT, V1);
+      DCI.AddToWorklist(Res.getNode());
+      SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
+      DCI.AddToWorklist(Zero.getNode());
+      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+
+    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
+    if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+        ((Subtarget.hasAVX512() &&
+          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+         (Subtarget.hasVLX() &&
+          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
+         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
+         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
+         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
+         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
+      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
+      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
+      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+      DCI.AddToWorklist(VPermMask.getNode());
+      V1 = DAG.getBitcast(MaskVT, V1);
+      DCI.AddToWorklist(V1.getNode());
+      V2 = DAG.getBitcast(MaskVT, V2);
+      DCI.AddToWorklist(V2.getNode());
+      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
+      DCI.AddToWorklist(Res.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                    /*AddTo*/ true);
+      return true;
+    }
+    return false;
+  }
+
+  // See if we can combine a single input shuffle with zeros to a bit-mask,
+  // which is much simpler than any shuffle.
+  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
+    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
+    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
+    SmallBitVector UndefElts(NumMaskElts, false);
+    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef) {
+        UndefElts[i] = true;
+        continue;
+      }
+      if (M == SM_SentinelZero)
+        continue;
+      EltBits[i] = AllOnes;
+    }
+    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
+    DCI.AddToWorklist(BitMask.getNode());
+    Res = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    unsigned AndOpcode =
+        FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // If we have a single input shuffle with different shuffle patterns in the
+  // the 128-bit lanes use the variable mask to VPERMILPS.
+  // TODO Combine other mask types at higher depths.
+  if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
+      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
+    SmallVector<SDValue, 16> VPermIdx;
+    for (int M : Mask) {
+      SDValue Idx =
+          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
+      VPermIdx.push_back(Idx);
+    }
+    MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
+    SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+    DCI.AddToWorklist(VPermMask.getNode());
+    Res = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
+  // to VPERMIL2PD/VPERMIL2PS.
+  if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
+       MaskVT == MVT::v8f32)) {
+    // VPERMIL2 Operation.
+    // Bits[3] - Match Bit.
+    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
+    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
+    SmallVector<int, 8> VPerm2Idx;
+    MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
+    MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
+    unsigned M2ZImm = 0;
+    for (int M : Mask) {
+      if (M == SM_SentinelUndef) {
+        VPerm2Idx.push_back(-1);
+        continue;
+      }
+      if (M == SM_SentinelZero) {
+        M2ZImm = 2;
+        VPerm2Idx.push_back(8);
+        continue;
+      }
+      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
+      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
+      VPerm2Idx.push_back(Index);
+    }
+    V1 = DAG.getBitcast(MaskVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(MaskVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+    DCI.AddToWorklist(VPerm2MaskOp.getNode());
+    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
+                      DAG.getConstant(M2ZImm, DL, MVT::i8));
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // If we have 3 or more shuffle instructions or a chain involving a variable
+  // mask, we can replace them with a single PSHUFB instruction profitably.
+  // Intel's manuals suggest only using PSHUFB if doing so replacing 5
+  // instructions, but in practice PSHUFB tends to be *very* fast so we're
+  // more aggressive.
+  if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
+       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
+    SmallVector<SDValue, 16> PSHUFBMask;
+    int NumBytes = RootVT.getSizeInBits() / 8;
+    int Ratio = NumBytes / NumMaskElts;
+    for (int i = 0; i < NumBytes; ++i) {
+      int M = Mask[i / Ratio];
+      if (M == SM_SentinelUndef) {
+        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+        continue;
+      }
+      if (M == SM_SentinelZero) {
+        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+        continue;
+      }
+      M = Ratio * M + i % Ratio;
+      assert ((M / 16) == (i / 16) && "Lane crossing detected");
+      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+    }
+    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+    Res = DAG.getBitcast(ByteVT, V1);
+    DCI.AddToWorklist(Res.getNode());
+    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
+    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
+    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // With XOP, if we have a 128-bit binary input shuffle we can always combine
+  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
+  // slower than PSHUFB on targets that support both.
+  if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
+      Subtarget.hasXOP()) {
+    // VPPERM Mask Operation
+    // Bits[4:0] - Byte Index (0 - 31)
+    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
+    SmallVector<SDValue, 16> VPPERMMask;
+    int NumBytes = 16;
+    int Ratio = NumBytes / NumMaskElts;
+    for (int i = 0; i < NumBytes; ++i) {
+      int M = Mask[i / Ratio];
+      if (M == SM_SentinelUndef) {
+        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
+        continue;
+      }
+      if (M == SM_SentinelZero) {
+        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+        continue;
+      }
+      M = Ratio * M + i % Ratio;
+      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
+    }
+    MVT ByteVT = MVT::v16i8;
+    V1 = DAG.getBitcast(ByteVT, V1);
+    DCI.AddToWorklist(V1.getNode());
+    V2 = DAG.getBitcast(ByteVT, V2);
+    DCI.AddToWorklist(V2.getNode());
+    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
+    DCI.AddToWorklist(VPPERMMaskOp.getNode());
+    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // Failed to find any combines.
+  return false;
+}
+
+// Attempt to constant fold all of the constant source ops.
+// Returns true if the entire shuffle is folded to a constant.
+// TODO: Extend this to merge multiple constant Ops and update the mask.
+static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+                                        ArrayRef<int> Mask, SDValue Root,
+                                        bool HasVariableMask, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget &Subtarget) {
+  MVT VT = Root.getSimpleValueType();
+
+  unsigned SizeInBits = VT.getSizeInBits();
+  unsigned NumMaskElts = Mask.size();
+  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
+  unsigned NumOps = Ops.size();
+
+  // Extract constant bits from each source op.
+  bool OneUseConstantOp = false;
+  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
+  SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue SrcOp = Ops[i];
+    OneUseConstantOp |= SrcOp.hasOneUse();
+    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
+                                       RawBitsOps[i]))
+      return false;
+  }
+
+  // Only fold if at least one of the constants is only used once or
+  // the combined shuffle has included a variable mask shuffle, this
+  // is to avoid constant pool bloat.
+  if (!OneUseConstantOp && !HasVariableMask)
+    return false;
+
+  // Shuffle the constant bits according to the mask.
+  SmallBitVector UndefElts(NumMaskElts, false);
+  SmallBitVector ZeroElts(NumMaskElts, false);
+  SmallBitVector ConstantElts(NumMaskElts, false);
+  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
+                                        APInt::getNullValue(MaskSizeInBits));
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int M = Mask[i];
+    if (M == SM_SentinelUndef) {
+      UndefElts[i] = true;
+      continue;
+    } else if (M == SM_SentinelZero) {
+      ZeroElts[i] = true;
+      continue;
+    }
+    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
+
+    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
+    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
+
+    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
+    if (SrcUndefElts[SrcMaskIdx]) {
+      UndefElts[i] = true;
+      continue;
+    }
+
+    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
+    APInt &Bits = SrcEltBits[SrcMaskIdx];
+    if (!Bits) {
+      ZeroElts[i] = true;
+      continue;
+    }
+
+    ConstantElts[i] = true;
+    ConstantBitData[i] = Bits;
+  }
+  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+
+  // Create the constant data.
+  MVT MaskSVT;
+  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
+    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
+  else
+    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
+
+  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+
+  SDLoc DL(Root);
+  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
+  DCI.AddToWorklist(CstOp.getNode());
+  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
+  return true;
+}
+
+/// \brief Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consider all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+///    equivalent. In most cases, this is just an encoding size win, but
+///    sometimes we will collapse multiple generic shuffles into a single
+///    special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+///    instructions, and replace them with the slightly more expensive SSSE3
+///    PSHUFB instruction if available. We do this as the last combining step
+///    to ensure we avoid using PSHUFB if we can implement the shuffle with
+///    a suitable short sequence of other instructions. The PSHUFB will either
+///    use a register or have to read from memory and so is slightly (but only
+///    slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
+static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
+                                          int SrcOpIndex, SDValue Root,
+                                          ArrayRef<int> RootMask,
+                                          int Depth, bool HasVariableMask,
+                                          SelectionDAG &DAG,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const X86Subtarget &Subtarget) {
+  // Bound the depth of our recursive combine because this is ultimately
+  // quadratic in nature.
+  if (Depth > 8)
+    return false;
+
+  // Directly rip through bitcasts to find the underlying operand.
+  SDValue Op = SrcOps[SrcOpIndex];
+  Op = peekThroughOneUseBitcasts(Op);
+
+  MVT VT = Op.getSimpleValueType();
+  if (!VT.isVector())
+    return false; // Bail if we hit a non-vector.
+
+  assert(Root.getSimpleValueType().isVector() &&
+         "Shuffles operate on vector types!");
+  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+         "Can only combine shuffles of the same vector register size.");
+
+  // Extract target shuffle mask and resolve sentinels and inputs.
+  SDValue Input0, Input1;
+  SmallVector<int, 16> OpMask;
+  if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+    return false;
+
+  // Add the inputs to the Ops list, avoiding duplicates.
+  SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+
+  int InputIdx0 = -1, InputIdx1 = -1;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    SDValue BC = peekThroughBitcasts(Ops[i]);
+    if (Input0 && BC == peekThroughBitcasts(Input0))
+      InputIdx0 = i;
+    if (Input1 && BC == peekThroughBitcasts(Input1))
+      InputIdx1 = i;
+  }
+
+  if (Input0 && InputIdx0 < 0) {
+    InputIdx0 = SrcOpIndex;
+    Ops[SrcOpIndex] = Input0;
+  }
+  if (Input1 && InputIdx1 < 0) {
+    InputIdx1 = Ops.size();
+    Ops.push_back(Input1);
+  }
+
+  assert(((RootMask.size() > OpMask.size() &&
+           RootMask.size() % OpMask.size() == 0) ||
+          (OpMask.size() > RootMask.size() &&
+           OpMask.size() % RootMask.size() == 0) ||
+          OpMask.size() == RootMask.size()) &&
+         "The smaller number of elements must divide the larger.");
+  int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
+  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
+  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
+  assert(((RootRatio == 1 && OpRatio == 1) ||
+          (RootRatio == 1) != (OpRatio == 1)) &&
+         "Must not have a ratio for both incoming and op masks!");
+
+  SmallVector<int, 16> Mask;
+  Mask.reserve(MaskWidth);
+
+  // Merge this shuffle operation's mask into our accumulated mask. Note that
+  // this shuffle's mask will be the first applied to the input, followed by the
+  // root mask to get us all the way to the root value arrangement. The reason
+  // for this order is that we are recursing up the operation chain.
+  for (int i = 0; i < MaskWidth; ++i) {
+    int RootIdx = i / RootRatio;
+    if (RootMask[RootIdx] < 0) {
+      // This is a zero or undef lane, we're done.
+      Mask.push_back(RootMask[RootIdx]);
+      continue;
+    }
+
+    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+
+    // Just insert the scaled root mask value if it references an input other
+    // than the SrcOp we're currently inserting.
+    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+      Mask.push_back(RootMaskedIdx);
+      continue;
+    }
+
+    RootMaskedIdx %= MaskWidth;
+
+    int OpIdx = RootMaskedIdx / OpRatio;
+    if (OpMask[OpIdx] < 0) {
+      // The incoming lanes are zero or undef, it doesn't matter which ones we
+      // are using.
+      Mask.push_back(OpMask[OpIdx]);
+      continue;
+    }
+
+    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+    int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
+    OpMaskedIdx %= MaskWidth;
+
+    if (OpMask[OpIdx] < (int)OpMask.size()) {
+      assert(0 <= InputIdx0 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx0 * MaskWidth;
+    } else {
+      assert(0 <= InputIdx1 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx1 * MaskWidth;
+    }
+
+    Mask.push_back(OpMaskedIdx);
+  }
+
+  // Handle the all undef/zero cases early.
+  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
+    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
+    return true;
+  }
+  if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
+    // TODO - should we handle the mixed zero/undef case as well? Just returning
+    // a zero mask will lose information on undef elements possibly reducing
+    // future combine possibilities.
+    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
+                                                Subtarget, DAG, SDLoc(Root)));
+    return true;
+  }
+
+  // Remove unused shuffle source ops.
+  SmallVector<SDValue, 8> UsedOps;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    int lo = UsedOps.size() * MaskWidth;
+    int hi = lo + MaskWidth;
+    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+      UsedOps.push_back(Ops[i]);
+      continue;
+    }
+    for (int &M : Mask)
+      if (lo <= M)
+        M -= MaskWidth;
+  }
+  assert(!UsedOps.empty() && "Shuffle with no inputs detected");
+  Ops = UsedOps;
+
+  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+
+  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  for (int i = 0, e = Ops.size(); i < e; ++i)
+    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
+                                        HasVariableMask, DAG, DCI, Subtarget))
+        return true;
+
+  // Attempt to constant fold all of the constant source ops.
+  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
+                                  Subtarget))
+    return true;
+
+  // We can only combine unary and binary shuffle mask cases.
+  if (Ops.size() > 2)
+    return false;
+
+  // Minor canonicalization of the accumulated shuffle mask to make it easier
+  // to match below. All this does is detect masks with sequential pairs of
+  // elements, and shrink them to the half-width mask. It does this in a loop
+  // so it will reduce the size of the mask to the minimal width mask which
+  // performs an equivalent shuffle.
+  SmallVector<int, 16> WidenedMask;
+  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+    Mask = std::move(WidenedMask);
+  }
+
+  // Canonicalization of binary shuffle masks to improve pattern matching by
+  // commuting the inputs.
+  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(Ops[0], Ops[1]);
+  }
+
+  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
+                                DCI, Subtarget);
+}
+
+/// \brief Get the PSHUF-style mask from PSHUF node.
+///
+/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
+/// PSHUF-style masks that can be reused with such instructions.
+static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> Mask;
+  SmallVector<SDValue, 2> Ops;
+  bool IsUnary;
+  bool HaveMask =
+      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
+  (void)HaveMask;
+  assert(HaveMask);
+
+  // If we have more than 128-bits, only the low 128-bits of shuffle mask
+  // matter. Check that the upper masks are repeats and remove them.
+  if (VT.getSizeInBits() > 128) {
+    int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+      for (int j = 0; j < LaneElts; ++j)
+        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+               "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+    Mask.resize(LaneElts);
+  }
+
+  switch (N.getOpcode()) {
+  case X86ISD::PSHUFD:
+    return Mask;
+  case X86ISD::PSHUFLW:
+    Mask.resize(4);
+    return Mask;
+  case X86ISD::PSHUFHW:
+    Mask.erase(Mask.begin(), Mask.begin() + 4);
+    for (int &M : Mask)
+      M -= 4;
+    return Mask;
+  default:
+    llvm_unreachable("No valid shuffle instruction found!");
+  }
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static SDValue
+combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                             SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
+  // of the shuffles in the chain so that we can form a fresh chain to replace
+  // this one.
+  SmallVector<SDValue, 8> Chain;
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return SDValue(); // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return SDValue();
+
+      Chain.push_back(V);
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return SDValue();
+
+      Chain.push_back(V);
+      continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
+          V.getSimpleValueType().getVectorElementType() != MVT::i16)
+        return SDValue();
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return SDValue();
+      Chain.push_back(V);
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return SDValue(); // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          Chain.push_back(V);
+
+          LLVM_FALLTHROUGH;
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return SDValue();
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+  // Rebuild the chain around this new shuffle.
+  while (!Chain.empty()) {
+    SDValue W = Chain.pop_back_val();
+
+    if (V.getValueType() != W.getOperand(0).getValueType())
+      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
+
+    switch (W.getOpcode()) {
+    default:
+      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
+      break;
+
+    case X86ISD::PSHUFD:
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
+      break;
+    }
+  }
+  if (V.getValueType() != N.getValueType())
+    V = DAG.getBitcast(N.getValueType(), V);
+
+  // Return the new chain to replace N.
+  return V;
+}
+
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                        SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  assert(
+      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+      "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+  unsigned CombineOpcode = N.getOpcode();
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      if (V.getOpcode() == CombineOpcode)
+        break;
+
+      // Other-half shuffles are no-ops.
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Combine away the bottom node as its shuffle will be accumulated into
+  // a preceding shuffle.
+  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Record the old value.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask (adjusted to account for all
+  // the pshufd instructions encountered).
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+  // Check that the shuffles didn't cancel each other out. If not, we need to
+  // combine to the new one.
+  if (Old != V)
+    // Replace the combinable shuffle with the combined one, updating all users
+    // so that we re-evaluate the chain here.
+    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+
+  return true;
+}
+
+/// \brief Try to combine x86 target specific shuffles.
+static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N.getSimpleValueType();
+  SmallVector<int, 4> Mask;
+
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    Mask = getPSHUFShuffleMask(N);
+    assert(Mask.size() == 4);
+    break;
+  case X86ISD::UNPCKL: {
+    auto Op0 = N.getOperand(0);
+    auto Op1 = N.getOperand(1);
+    unsigned Opcode0 = Op0.getOpcode();
+    unsigned Opcode1 = Op1.getOpcode();
+
+    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
+    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
+    // TODO: Add other horizontal operations as required.
+    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
+      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
+
+    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
+    // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
+    // moves upper half elements into the lower half part. For example:
+    //
+    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
+    //     undef:v16i8
+    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
+    //
+    // will be combined to:
+    //
+    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
+
+    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
+    // happen due to advanced instructions.
+    if (!VT.is128BitVector())
+      return SDValue();
+
+    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
+      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
+
+      unsigned NumElts = VT.getVectorNumElements();
+      SmallVector<int, 8> ExpectedMask(NumElts, -1);
+      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
+                NumElts / 2);
+
+      auto ShufOp = Op1.getOperand(0);
+      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
+        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
+    }
+    return SDValue();
+  }
+  case X86ISD::BLENDI: {
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+           "Unexpected input vector types");
+
+    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+    // operands and changing the mask to 1. This saves us a bunch of
+    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+    // x86InstrInfo knows how to commute this back after instruction selection
+    // if it would help register allocation.
+
+    // TODO: If optimizing for size or a processor that doesn't suffer from
+    // partial register update stalls, this should be transformed into a MOVSD
+    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+    if (VT == MVT::v2f64)
+      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+        }
+
+    return SDValue();
+  }
+  case X86ISD::MOVSD:
+  case X86ISD::MOVSS: {
+    bool isFloat = VT.isFloatingPoint();
+    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
+    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
+    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
+    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
+    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
+    assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+
+    // We often lower to MOVSD/MOVSS from integer as well as native float
+    // types; remove unnecessary domain-crossing bitcasts if we can to make it
+    // easier to combine shuffles later on. We've already accounted for the
+    // domain switching cost when we decided to lower with it.
+    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+      V0 = DAG.getBitcast(NewVT, V0);
+      V1 = DAG.getBitcast(NewVT, V1);
+      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+    }
+
+    return SDValue();
+  }
+  case X86ISD::INSERTPS: {
+    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
+    SDValue Op2 = N.getOperand(2);
+    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
+    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
+    unsigned ZeroMask = InsertPSMask & 0xF;
+
+    // If we zero out all elements from Op0 then we don't need to reference it.
+    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
+      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+    // If we zero out the element from Op1 then we don't need to reference it.
+    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
+      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+    // Attempt to merge insertps Op1 with an inner target shuffle node.
+    SmallVector<int, 8> TargetMask1;
+    SmallVector<SDValue, 2> Ops1;
+    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
+      int M = TargetMask1[SrcIdx];
+      if (isUndefOrZero(M)) {
+        // Zero/UNDEF insertion - zero out element and remove dependency.
+        InsertPSMask |= (1u << DstIdx);
+        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
+                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
+      }
+      // Update insertps mask srcidx and reference the source input directly.
+      assert(0 <= M && M < 8 && "Shuffle index out of range");
+      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
+      Op1 = Ops1[M < 4 ? 0 : 1];
+      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+    }
+
+    // Attempt to merge insertps Op0 with an inner target shuffle node.
+    SmallVector<int, 8> TargetMask0;
+    SmallVector<SDValue, 2> Ops0;
+    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
+      return SDValue();
+
+    bool Updated = false;
+    bool UseInput00 = false;
+    bool UseInput01 = false;
+    for (int i = 0; i != 4; ++i) {
+      int M = TargetMask0[i];
+      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+        // No change if element is already zero or the inserted element.
+        continue;
+      } else if (isUndefOrZero(M)) {
+        // If the target mask is undef/zero then we must zero the element.
+        InsertPSMask |= (1u << i);
+        Updated = true;
+        continue;
+      }
+
+      // The input vector element must be inline.
+      if (M != i && M != (i + 4))
+        return SDValue();
+
+      // Determine which inputs of the target shuffle we're using.
+      UseInput00 |= (0 <= M && M < 4);
+      UseInput01 |= (4 <= M);
+    }
+
+    // If we're not using both inputs of the target shuffle then use the
+    // referenced input directly.
+    if (UseInput00 && !UseInput01) {
+      Updated = true;
+      Op0 = Ops0[0];
+    } else if (!UseInput00 && UseInput01) {
+      Updated = true;
+      Op0 = Ops0[1];
+    }
+
+    if (Updated)
+      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
+    return SDValue();
+  }
+  default:
+    return SDValue();
+  }
+
+  // Nuke no-op shuffles that show up after combining.
+  if (isNoopShuffleMask(Mask))
+    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Look for simplifications involving one or two shuffle instructions.
+  SDValue V = N.getOperand(0);
+  switch (N.getOpcode()) {
+  default:
+    break;
+  case X86ISD::PSHUFLW:
+  case X86ISD::PSHUFHW:
+    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
+
+    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle, so we're done.
+
+    // See if this reduces to a PSHUFD which is no more expensive and can
+    // combine with more operations. Note that it has to at least flip the
+    // dwords as otherwise it would have been removed as a no-op.
+    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
+      int DMask[] = {0, 1, 2, 3};
+      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
+      DMask[DOffset + 0] = DOffset + 1;
+      DMask[DOffset + 1] = DOffset + 0;
+      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      V = DAG.getBitcast(DVT, V);
+      DCI.AddToWorklist(V.getNode());
+      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
+                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
+      DCI.AddToWorklist(V.getNode());
+      return DAG.getBitcast(VT, V);
+    }
+
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getBitcast(VT, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, VT, V, V);
+        }
+      }
+    }
+
+    break;
+
+  case X86ISD::PSHUFD:
+    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return NewN;
+
+    break;
+  }
+
+  return SDValue();
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub node.
+///
+/// We combine this directly on the abstract vector shuffle nodes so it is
+/// easier to generically match. We also insert dummy vector shuffle nodes for
+/// the operands which explicitly discard the lanes which are unused by this
+/// operation to try to flow through the rest of the combiner the fact that
+/// they're unused.
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+    return SDValue();
+
+  // We only handle target-independent shuffles.
+  // FIXME: It would be easy and harmless to use the target shuffle mask
+  // extraction tool to support more.
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
+  SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  // We require the first shuffle operand to be the FSUB node, and the second to
+  // be the FADD node.
+  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
+    return SDValue();
+
+  // If there are other uses of these operations we can't fold them.
+  if (!V1->hasOneUse() || !V2->hasOneUse())
+    return SDValue();
+
+  // Ensure that both operations have the same operands. Note that we can
+  // commute the FADD operands.
+  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
+  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+    return SDValue();
+
+  // We're looking for blends between FADD and FSUB nodes. We insist on these
+  // nodes being lined up in a specific expected pattern.
+  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+    return SDValue();
+
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+}
+
+// We are looking for a shuffle where both sources are concatenated with undef
+// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
+// if we can express this as a single-source shuffle, that's preferable.
+static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
+  if (!VT.is128BitVector() && !VT.is256BitVector())
+    return SDValue();
+
+  if (VT.getVectorElementType() != MVT::i32 &&
+      VT.getVectorElementType() != MVT::i64 &&
+      VT.getVectorElementType() != MVT::f32 &&
+      VT.getVectorElementType() != MVT::f64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check that both sources are concats with undef.
+  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
+      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
+      !N1.getOperand(1).isUndef())
+    return SDValue();
+
+  // Construct the new shuffle mask. Elements from the first source retain their
+  // index, but elements from the second source no longer need to skip an undef.
+  SmallVector<int, 8> Mask;
+  int NumElts = VT.getVectorNumElements();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  for (int Elt : SVOp->getMask())
+    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
+
+  SDLoc DL(N);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
+                               N1.getOperand(0));
+  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
+}
+
+static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI,
+                              const X86Subtarget &Subtarget) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  // Don't create instructions with illegal types after legalize types has run.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+    return SDValue();
+
+  // If we have legalized the vector types, look for blends of FADD and FSUB
+  // nodes that we can fuse into an ADDSUB node.
+  if (TLI.isTypeLegal(VT))
+    if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+      return AddSub;
+
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+      N->getOperand(0).getOpcode() == ISD::BITCAST &&
+      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD:
+      case ISD::SUB:
+      case ISD::MUL:
+        // isOperationLegal lies for integer ops on floating point types.
+        CanFold = VT.isInteger();
+        break;
+      case ISD::FADD:
+      case ISD::FSUB:
+      case ISD::FMUL:
+        // isOperationLegal lies for floating point ops on integer types.
+        CanFold = VT.isFloatingPoint();
+        break;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
+      }
+    }
+  }
+
+  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+  // consecutive, non-overlapping, and in the right order.
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+  if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+    return LD;
+
+  // For AVX2, we sometimes want to combine
+  // (vector_shuffle <mask> (concat_vectors t1, undef)
+  //                        (concat_vectors t2, undef))
+  // Into:
+  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
+  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
+  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
+    return ShufConcat;
+
+  if (isTargetShuffle(N->getOpcode())) {
+    SDValue Op(N, 0);
+    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
+      return Shuffle;
+
+    // Try recursively combining arbitrary sequences of x86 shuffle
+    // instructions into higher-order shuffles. We do this after combining
+    // specific PSHUF instruction sequences into their minimal form so that we
+    // can evaluate how many specialized shuffle instructions are involved in
+    // a particular chain.
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
+  return SDValue();
+}
+
+/// Check if a vector extract from a target-specific shuffle of a load can be
+/// folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been custom lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+  EVT EltVT = N->getValueType(0);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT OriginalVT = InVec.getValueType();
+
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (!BCVT.isVector() ||
+        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+  }
+
+  EVT CurrentVT = InVec.getValueType();
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  SmallVector<SDValue, 2> ShuffleOps;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
+                            ShuffleOps, ShuffleMask, UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = CurrentVT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+
+  if (Idx == SM_SentinelZero)
+    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
+                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
+  if (Idx == SM_SentinelUndef)
+    return DAG.getUNDEF(EltVT);
+
+  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
+  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
+                                         : ShuffleOps[1];
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses =
+      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  // If there's a bitcast before the shuffle, check if the load type and
+  // alignment is valid.
+  unsigned Align = LN0->getAlignment();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+      EltVT.getTypeForEVT(*DAG.getContext()));
+
+  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
+    return SDValue();
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  SDLoc dl(N);
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
+  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
+                                 ShuffleMask);
+  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
+static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+                              const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+  // special and don't usually play with other vector types, it's better to
+  // handle them early to be sure we emit efficient code by avoiding
+  // store-load conversions.
+  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+      N0.getValueType() == MVT::v2i32 &&
+      isNullConstant(N0.getOperand(1))) {
+    SDValue N00 = N0->getOperand(0);
+    if (N00.getValueType() == MVT::i32)
+      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+  }
+
+  // Convert a bitcasted integer logic operation that has one bitcasted
+  // floating-point operand into a floating-point logic operation. This may
+  // create a load of a constant, but that is cheaper than materializing the
+  // constant in an integer register and transferring it to an SSE register or
+  // transferring the SSE operand to integer register and back.
+  unsigned FPOpcode;
+  switch (N0.getOpcode()) {
+    case ISD::AND: FPOpcode = X86ISD::FAND; break;
+    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
+    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+    default: return SDValue();
+  }
+
+  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
+        (Subtarget.hasSSE2() && VT == MVT::f64)))
+    return SDValue();
+
+  SDValue LogicOp0 = N0.getOperand(0);
+  SDValue LogicOp1 = N0.getOperand(1);
+  SDLoc DL0(N0);
+
+  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
+      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
+      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
+    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
+    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
+  }
+  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
+  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
+      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
+      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
+    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
+    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
+  }
+
+  return SDValue();
+}
+
+// Match a binop + shuffle pyramid that represents a horizontal reduction over
+// the elements of a vector.
+// Returns the vector that is being reduced on, or SDValue() if a reduction
+// was not matched.
+static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
+  // The pattern must end in an extract from index 0.
+  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
+      !isNullConstant(Extract->getOperand(1)))
+    return SDValue();
+
+  unsigned Stages =
+      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
+
+  SDValue Op = Extract->getOperand(0);
+  // At each stage, we're looking for something that looks like:
+  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+  //                               i32 undef, i32 undef, i32 undef, i32 undef>
+  // %a = binop <8 x i32> %op, %s
+  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+  // we expect something like:
+  // <4,5,6,7,u,u,u,u>
+  // <2,3,u,u,u,u,u,u>
+  // <1,u,u,u,u,u,u,u>
+  for (unsigned i = 0; i < Stages; ++i) {
+    if (Op.getOpcode() != BinOp)
+      return SDValue();
+
+    ShuffleVectorSDNode *Shuffle =
+        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
+    if (Shuffle) {
+      Op = Op.getOperand(1);
+    } else {
+      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
+      Op = Op.getOperand(0);
+    }
+
+    // The first operand of the shuffle should be the same as the other operand
+    // of the add.
+    if (!Shuffle || (Shuffle->getOperand(0) != Op))
+      return SDValue();
+
+    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
+      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
+        return SDValue();
+  }
+
+  return Op;
+}
+
+// Given a select, detect the following pattern:
+// 1:    %2 = zext <N x i8> %0 to <N x i32>
+// 2:    %3 = zext <N x i8> %1 to <N x i32>
+// 3:    %4 = sub nsw <N x i32> %2, %3
+// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
+// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// This is useful as it is the input into a SAD pattern.
+static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
+                              SDValue &Op1) {
+  // Check the condition of the select instruction is greater-than.
+  SDValue SetCC = Select->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC)
+    return false;
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  if (CC != ISD::SETGT)
+    return false;
+
+  SDValue SelectOp1 = Select->getOperand(1);
+  SDValue SelectOp2 = Select->getOperand(2);
+
+  // The second operand of the select should be the negation of the first
+  // operand, which is implemented as 0 - SelectOp1.
+  if (!(SelectOp2.getOpcode() == ISD::SUB &&
+        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
+        SelectOp2.getOperand(1) == SelectOp1))
+    return false;
+
+  // The first operand of SetCC is the first operand of the select, which is the
+  // difference between the two input vectors.
+  if (SetCC.getOperand(0) != SelectOp1)
+    return false;
+
+  // The second operand of the comparison can be either -1 or 0.
+  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+    return false;
+
+  // The first operand of the select is the difference between the two input
+  // vectors.
+  if (SelectOp1.getOpcode() != ISD::SUB)
+    return false;
+
+  Op0 = SelectOp1.getOperand(0);
+  Op1 = SelectOp1.getOperand(1);
+
+  // Check if the operands of the sub are zero-extended from vectors of i8.
+  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
+      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
+      Op1.getOpcode() != ISD::ZERO_EXTEND ||
+      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
+    return false;
+
+  return true;
+}
+
+// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
+// to these zexts.
+static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
+                            const SDValue &Zext1, const SDLoc &DL) {
+
+  // Find the appropriate width for the PSADBW.
+  EVT InVT = Zext0.getOperand(0).getValueType();
+  unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+
+  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
+  // fill in the missing vector elements with 0.
+  unsigned NumConcat = RegSize / InVT.getSizeInBits();
+  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
+  Ops[0] = Zext0.getOperand(0);
+  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+  Ops[0] = Zext1.getOperand(0);
+  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+  // Actually build the SAD
+  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
+  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+}
+
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  // PSADBW is only supported on SSE2 and up.
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Verify the type we're extracting from is appropriate
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  EVT VT = Extract->getOperand(0).getValueType();
+  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+    return SDValue();
+
+  unsigned RegSize = 128;
+  if (Subtarget.hasBWI())
+    RegSize = 512;
+  else if (Subtarget.hasAVX2())
+    RegSize = 256;
+
+  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getSizeInBits() / 4 > RegSize)
+    return SDValue();
+
+  // Match shuffle + add pyramid.
+  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+
+  // If there was a match, we want Root to be a select that is the root of an
+  // abs-diff pattern.
+  if (!Root || (Root.getOpcode() != ISD::VSELECT))
+    return SDValue();
+
+  // Check whether we have an abs-diff pattern feeding into the select.
+  SDValue Zext0, Zext1;
+  if (!detectZextAbsDiff(Root, Zext0, Zext1))
+    return SDValue();
+
+  // Create the SAD instruction
+  SDLoc DL(Extract);
+  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+
+  // If the original vector was wider than 8 elements, sum over the results
+  // in the SAD vector.
+  unsigned Stages = Log2_32(VT.getVectorNumElements());
+  MVT SadVT = SAD.getSimpleValueType();
+  if (Stages > 3) {
+    unsigned SadElems = SadVT.getVectorNumElements();
+
+    for(unsigned i = Stages - 3; i > 0; --i) {
+      SmallVector<int, 16> Mask(SadElems, -1);
+      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+        Mask[j] = MaskEnd + j;
+
+      SDValue Shuffle =
+          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
+      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
+    }
+  }
+
+  // Return the lowest i32.
+  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+                     Extract->getOperand(1));
+}
+
+/// Detect vector gather/scatter index generation and convert it from being a
+/// bunch of shuffles and extracts into a somewhat faster sequence.
+/// For i686, the best sequence is apparently storing the value and loading
+/// scalars back, while for x64 we should use 64-bit extracts and shifts.
+static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const X86Subtarget &Subtarget) {
+  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+    return NewOp;
+
+  SDValue InputVector = N->getOperand(0);
+  SDLoc dl(InputVector);
+  // Detect mmx to i32 conversion through a v2i32 elt extract.
+  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+      N->getValueType(0) == MVT::i32 &&
+      InputVector.getValueType() == MVT::v2i32 &&
+      isa<ConstantSDNode>(N->getOperand(1)) &&
+      N->getConstantOperandVal(1) == 0) {
+    SDValue MMXSrc = InputVector.getOperand(0);
+
+    // The bitcast source is a direct mmx result.
+    if (MMXSrc.getValueType() == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
+  }
+
+  EVT VT = N->getValueType(0);
+
+  if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
+      InputVector.getOpcode() == ISD::BITCAST &&
+      isa<ConstantSDNode>(InputVector.getOperand(0))) {
+    uint64_t ExtractedElt =
+        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    uint64_t InputValue =
+        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+    uint64_t Res = (InputValue >> ExtractedElt) & 1;
+    return DAG.getConstant(Res, dl, MVT::i1);
+  }
+
+  // Check whether this extract is the root of a sum of absolute differences
+  // pattern. This has to be done here because we really want it to happen
+  // pre-legalization,
+  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+    return SAD;
+
+  // Only operate on vectors of 4 elements, where the alternative shuffling
+  // gets to be more expensive.
+  if (InputVector.getValueType() != MVT::v4i32)
+    return SDValue();
+
+  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+  // single use which is a sign-extend or zero-extend, and all elements are
+  // used.
+  SmallVector<SDNode *, 4> Uses;
+  unsigned ExtractedElements = 0;
+  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+    if (UI.getUse().getResNo() != InputVector.getResNo())
+      return SDValue();
+
+    SDNode *Extract = *UI;
+    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    if (Extract->getValueType(0) != MVT::i32)
+      return SDValue();
+    if (!Extract->hasOneUse())
+      return SDValue();
+    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+      return SDValue();
+
+    // Record which element was extracted.
+    ExtractedElements |=
+      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+    Uses.push_back(Extract);
+  }
+
+  // If not all the elements were used, this may not be worthwhile.
+  if (ExtractedElements != 15)
+    return SDValue();
+
+  // Ok, we've now decided to do the transformation.
+  // If 64-bit shifts are legal, use the extract-shift sequence,
+  // otherwise bounce the vector off the cache.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Vals[4];
+
+  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
+    auto &DL = DAG.getDataLayout();
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
+    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(0, dl, VecIdxTy));
+    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(1, dl, VecIdxTy));
+
+    SDValue ShAmt = DAG.getConstant(
+        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
+    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+  } else {
+    // Store the value to a temporary stack slot.
+    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+                              MachinePointerInfo());
+
+    EVT ElementType = InputVector.getValueType().getVectorElementType();
+    unsigned EltSize = ElementType.getSizeInBits() / 8;
+
+    // Replace each use (extract) with a load of the appropriate element.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t Offset = EltSize * i;
+      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
+
+      SDValue ScalarAddr =
+          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
+
+      // Load the scalar.
+      Vals[i] =
+          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
+    }
+  }
+
+  // Replace the extracts
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+    UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    SDValue Idx = Extract->getOperand(1);
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
+  }
+
+  // The replacement was made in place; don't return anything.
+  return SDValue();
+}
+
+/// If a vector select has an operand that is -1 or 0, simplify the select to a
+/// bitwise logic operation.
+static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+  EVT CondVT = Cond.getValueType();
+  SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (N->getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+  // Check if the first operand is all zeros.This situation only
+  // applies to avx512.
+  if (FValIsAllZeros  && Subtarget.hasAVX512() && Cond.hasOneUse()) {
+      //Invert the cond to not(cond) : xor(op,allones)=not(op)
+      SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+        DAG.getConstant(1, DL, Cond.getValueType()));
+      //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+      return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+  }
+  assert(CondVT.isVector() && "Vector select expects a vector selector!");
+
+  // To use the condition operand as a bitwise mask, it must have elements that
+  // are the same size as the select elements. Ie, the condition operand must
+  // have already been promoted from the IR select condition type <N x i1>.
+  // Don't check if the types themselves are equal because that excludes
+  // vector floating-point selects.
+  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+  FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+  // Try to invert the condition if true value is not all 1s and false value is
+  // not all 0s.
+  if (!TValIsAllOnes && !FValIsAllZeros &&
+      // Check if the selector will be produced by CMPP*/PCMP*.
+      Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted.
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
+          CondVT) {
+    bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+    if (TValIsAllZeros || FValIsAllOnes) {
+      SDValue CC = Cond.getOperand(2);
+      ISD::CondCode NewCC =
+          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                               Cond.getOperand(0).getValueType().isInteger());
+      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
+                          NewCC);
+      std::swap(LHS, RHS);
+      TValIsAllOnes = FValIsAllOnes;
+      FValIsAllZeros = TValIsAllZeros;
+    }
+  }
+
+  if (!TValIsAllOnes && !FValIsAllZeros)
+    return SDValue();
+
+  SDValue Ret;
+  if (TValIsAllOnes && FValIsAllZeros)
+    Ret = Cond;
+  else if (TValIsAllOnes)
+    Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
+  else if (FValIsAllZeros)
+    Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
+
+  return DAG.getBitcast(VT, Ret);
+}
+
+static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  SDLoc DL(N);
+
+  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
+  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
+  if (!TrueC || !FalseC)
+    return SDValue();
+
+  // Don't do this for crazy integer types.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
+    return SDValue();
+
+  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+  // so that TrueC (the true value) is larger than FalseC.
+  bool NeedsCondInvert = false;
+  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+      // Efficiently invertible.
+      (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+       (Cond.getOpcode() == ISD::XOR &&  // xor(X, C) -> invertible.
+        isa<ConstantSDNode>(Cond.getOperand(1))))) {
+    NeedsCondInvert = true;
+    std::swap(TrueC, FalseC);
+  }
+
+  // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
+  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+    if (NeedsCondInvert) // Invert the condition if needed.
+      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                         DAG.getConstant(1, DL, Cond.getValueType()));
+
+    // Zero extend the condition if needed.
+    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+    unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+                       DAG.getConstant(ShAmt, DL, MVT::i8));
+  }
+
+  // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+  if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
+    if (NeedsCondInvert) // Invert the condition if needed.
+      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                         DAG.getConstant(1, DL, Cond.getValueType()));
+
+    // Zero extend the condition if needed.
+    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
+    return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                       SDValue(FalseC, 0));
+  }
+
+  // Optimize cases that will turn into an LEA instruction.  This requires
+  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+    uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
+    if (N->getValueType(0) == MVT::i32)
+      Diff = (unsigned)Diff;
+
+    bool isFastMultiplier = false;
+    if (Diff < 10) {
+      switch ((unsigned char)Diff) {
+      default:
+        break;
+      case 1: // result = add base, cond
+      case 2: // result = lea base(    , cond*2)
+      case 3: // result = lea base(cond, cond*2)
+      case 4: // result = lea base(    , cond*4)
+      case 5: // result = lea base(cond, cond*4)
+      case 8: // result = lea base(    , cond*8)
+      case 9: // result = lea base(cond, cond*8)
+        isFastMultiplier = true;
+        break;
+      }
+    }
+
+    if (isFastMultiplier) {
+      APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
+      if (NeedsCondInvert) // Invert the condition if needed.
+        Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(1, DL, Cond.getValueType()));
+
+      // Zero extend the condition if needed.
+      Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
+      // Scale the condition by the difference.
+      if (Diff != 1)
+        Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(Diff, DL, Cond.getValueType()));
+
+      // Add the base if non-zero.
+      if (FalseC->getAPIntValue() != 0)
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+      return Cond;
+    }
+  }
+
+  return SDValue();
+}
+
+// If this is a bitcasted op that can be represented as another type, push the
+// the bitcast to the inputs. This allows more opportunities for pattern
+// matching masked instructions. This is called when we know that the operation
+// is used as one of the inputs of a vselect.
+static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // Make sure we have a bitcast.
+  if (OrigOp.getOpcode() != ISD::BITCAST)
+    return false;
+
+  SDValue Op = OrigOp.getOperand(0);
+
+  // If the operation is used by anything other than the bitcast, we shouldn't
+  // do this combine as that would replicate the operation.
+  if (!Op.hasOneUse())
+    return false;
+
+  MVT VT = OrigOp.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  SDLoc DL(Op.getNode());
+
+  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
+                                      SDValue Op2) {
+    Op0 = DAG.getBitcast(VT, Op0);
+    DCI.AddToWorklist(Op0.getNode());
+    Op1 = DAG.getBitcast(VT, Op1);
+    DCI.AddToWorklist(Op1.getNode());
+    DCI.CombineTo(OrigOp.getNode(),
+                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
+    return true;
+  };
+
+  unsigned Opcode = Op.getOpcode();
+  switch (Opcode) {
+  case X86ISD::PALIGNR:
+    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
+    if (!VT.is128BitVector())
+      return false;
+    Opcode = X86ISD::VALIGN;
+    LLVM_FALLTHROUGH;
+  case X86ISD::VALIGN: {
+    if (EltVT != MVT::i32 && EltVT != MVT::i64)
+      return false;
+    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
+    unsigned EltSize = EltVT.getSizeInBits();
+    // Make sure we can represent the same shift with the new VT.
+    if ((ShiftAmt % EltSize) != 0)
+      return false;
+    Imm = ShiftAmt / EltSize;
+    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+                                    DAG.getConstant(Imm, DL, MVT::i8));
+  }
+  case X86ISD::SHUF128: {
+    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
+      return false;
+    // Only change element size, not type.
+    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+      return false;
+    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
+                                    Op.getOperand(2));
+  }
+  }
+
+  return false;
+}
+
+/// Do target-specific dag combines on SELECT and VSELECT nodes.
+static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  SDValue Cond = N->getOperand(0);
+  // Get the LHS/RHS of the select.
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+  EVT CondVT = Cond.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
+  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
+  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+      VT != MVT::f80 && VT != MVT::f128 &&
+      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+      (Subtarget.hasSSE2() ||
+       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    unsigned Opcode = 0;
+    // Check for x CC y ? x : y.
+    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETULT:
+        // Converting this to a min would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETULE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOGE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a max would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    // Check for x CC y ? y : x -- a min/max with reversed arms.
+    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOGE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a min would handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETULT:
+        // Converting this to a max would handle NaNs incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETULE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    }
+
+    if (Opcode)
+      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+  }
+
+  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+  // lowering on KNL. In this case we convert it to
+  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+  // The same situation for all 128 and 256-bit vectors of i8 and i16.
+  // Since SKX these selects have a proper lowering.
+  if (Subtarget.hasAVX512() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1 &&
+      (VT.is128BitVector() || VT.is256BitVector()) &&
+      (VT.getVectorElementType() == MVT::i8 ||
+       VT.getVectorElementType() == MVT::i16) &&
+      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
+    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+    DCI.AddToWorklist(Cond.getNode());
+    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
+  }
+
+  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
+    return V;
+
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare
+  // against zero. e.g.
+  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
+  // subl   %esi, %edi
+  // testl  %edi, %edi
+  // movl   $0, %eax
+  // cmovgl %edi, %eax
+  // =>
+  // xorl   %eax, %eax
+  // subl   %esi, $edi
+  // cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
+  // Match VSELECTs into subs with unsigned saturation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x >  y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
+        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
+          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
+            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
+              // If the RHS is a constant we have to reverse the const
+              // canonicalization.
+              // x > C-1 ? x+-C : 0 --> subus x, C
+              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+                  CondRHSConst->getAPIntValue() ==
+                      (-OpRHSConst->getAPIntValue() - 1))
+                return DAG.getNode(
+                    X86ISD::SUBUS, DL, VT, OpLHS,
+                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
+
+          // Another special case: If C was a sign bit, the sub has been
+          // canonicalized into a xor.
+          // FIXME: Would it be better to use computeKnownBits to determine
+          //        whether it's safe to decanonicalize the xor?
+          // x s< 0 ? x^C : 0 --> subus x, C
+          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+              OpRHSConst->getAPIntValue().isSignBit())
+            // Note that we have to rebuild the RHS constant here to ensure we
+            // don't rely on particular values of undef lanes.
+            return DAG.getNode(
+                X86ISD::SUBUS, DL, VT, OpLHS,
+                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
+        }
+    }
+  }
+
+  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
+    return V;
+
+  // If this is a *dynamic* select (non-constant condition) and we can match
+  // this node with one of the variable blend instructions, restructure the
+  // condition so that the blends can use the high bit of each element and use
+  // SimplifyDemandedBits to simplify the condition operand.
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() &&
+      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+    unsigned BitWidth = Cond.getScalarValueSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
+    // We can only handle the cases where VSELECT is directly legal on the
+    // subtarget. We custom lower VSELECT nodes with constant conditions and
+    // this makes it hard to see whether a dynamic VSELECT will correctly
+    // lower, so we both check the operation's status and explicitly handle the
+    // cases where a *dynamic* blend will fail even though a constant-condition
+    // blend could be custom lowered.
+    // FIXME: We should find a better way to handle this class of problems.
+    // Potentially, we should combine constant-condition vselect nodes
+    // pre-legalization into shuffles and not mark as many types as custom
+    // lowered.
+    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+      return SDValue();
+    // FIXME: We don't support i16-element blends currently. We could and
+    // should support them by making *all* the bits in the condition be set
+    // rather than just the high bit and using an i8-element blend.
+    if (VT.getVectorElementType() == MVT::i16)
+      return SDValue();
+    // Dynamic blending was only available from SSE4.1 onward.
+    if (VT.is128BitVector() && !Subtarget.hasSSE41())
+      return SDValue();
+    // Byte blends are only available in AVX2
+    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
+      return SDValue();
+
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                 TLO)) {
+      // If we changed the computation somewhere in the DAG, this change
+      // will affect all users of Cond.
+      // Make sure it is fine and update all the nodes so that we do not
+      // use the generic VSELECT anymore. Otherwise, we may perform
+      // wrong optimizations as we messed up with the actual expectation
+      // for the vector boolean values.
+      if (Cond != TLO.Old) {
+        // Check all uses of that condition operand to check whether it will be
+        // consumed by non-BLEND instructions, which may depend on all bits are
+        // set properly.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          if (I->getOpcode() != ISD::VSELECT)
+            // TODO: Add other opcodes eventually lowered into BLEND.
+            return SDValue();
+
+        // Update all the users of the condition, before committing the change,
+        // so that the VSELECT optimizations that expect the correct vector
+        // boolean value will not be triggered.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          DAG.ReplaceAllUsesOfValueWith(
+              SDValue(*I, 0),
+              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+                          Cond, I->getOperand(1), I->getOperand(2)));
+        DCI.CommitTargetLoweringOpt(TLO);
+        return SDValue();
+      }
+      // At this point, only Cond is changed. Change the condition
+      // just for N to keep the opportunity to optimize all other
+      // users their own way.
+      DAG.ReplaceAllUsesOfValueWith(
+          SDValue(N, 0),
+          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      return SDValue();
+    }
+  }
+
+  // Look for vselects with LHS/RHS being bitcasted from an operation that
+  // can be executed on another type. Push the bitcast to the inputs of
+  // the operation. This exposes opportunities for using masking instructions.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
+      return SDValue(N, 0);
+    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
+      return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
+/// Combine:
+///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
+/// to:
+///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
+                                       SelectionDAG &DAG) {
+  // This combine only operates on CMP-like nodes.
+  if (!(Cmp.getOpcode() == X86ISD::CMP ||
+        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+    return SDValue();
+
+  // This only applies to variations of the common case:
+  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
+  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
+  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
+  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
+  // Using the proper condcodes (see below), overflow is checked for.
+
+  // FIXME: We can generalize both constraints:
+  // - XOR/OR/AND (if they were made to survive AtomicExpand)
+  // - LHS != 1
+  // if the result is compared.
+
+  SDValue CmpLHS = Cmp.getOperand(0);
+  SDValue CmpRHS = Cmp.getOperand(1);
+
+  if (!CmpLHS.hasOneUse())
+    return SDValue();
+
+  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
+  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
+    return SDValue();
+
+  const unsigned Opc = CmpLHS.getOpcode();
+
+  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
+    return SDValue();
+
+  SDValue OpRHS = CmpLHS.getOperand(2);
+  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
+  if (!OpRHSC)
+    return SDValue();
+
+  APInt Addend = OpRHSC->getAPIntValue();
+  if (Opc == ISD::ATOMIC_LOAD_SUB)
+    Addend = -Addend;
+
+  if (CC == X86::COND_S && Addend == 1)
+    CC = X86::COND_LE;
+  else if (CC == X86::COND_NS && Addend == 1)
+    CC = X86::COND_G;
+  else if (CC == X86::COND_G && Addend == -1)
+    CC = X86::COND_GE;
+  else if (CC == X86::COND_LE && Addend == -1)
+    CC = X86::COND_L;
+  else
+    return SDValue();
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
+  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+                                DAG.getUNDEF(CmpLHS.getValueType()));
+  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+  return LockOp;
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // This combine only operates on CMP-like nodes.
+  if (!(Cmp.getOpcode() == X86ISD::CMP ||
+        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+    return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // an SetCC or extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = nullptr;
+  bool needOppositeCond = (CC == X86::COND_E);
+  bool checkAgainstTrue = false; // Is it a comparison against 1?
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if all operands are not constants.
+    return SDValue();
+
+  if (C->getZExtValue() == 1) {
+    needOppositeCond = !needOppositeCond;
+    checkAgainstTrue = true;
+  } else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 or 1.
+    return SDValue();
+
+  bool truncatedToBoolWithAnd = false;
+  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
+  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+         SetCC.getOpcode() == ISD::TRUNCATE ||
+         SetCC.getOpcode() == ISD::AND) {
+    if (SetCC.getOpcode() == ISD::AND) {
+      int OpIdx = -1;
+      if (isOneConstant(SetCC.getOperand(0)))
+        OpIdx = 1;
+      if (isOneConstant(SetCC.getOperand(1)))
+        OpIdx = 0;
+      if (OpIdx < 0)
+        break;
+      SetCC = SetCC.getOperand(OpIdx);
+      truncatedToBoolWithAnd = true;
+    } else
+      SetCC = SetCC.getOperand(0);
+  }
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
+    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
+    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
+    // truncated to i1 using 'and'.
+    if (checkAgainstTrue && !truncatedToBoolWithAnd)
+      break;
+    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
+           "Invalid use of SETCC_CARRY!");
+    LLVM_FALLTHROUGH;
+  case X86ISD::SETCC:
+    // Set the condition code or opposite one if necessary.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // Check whether false/true value has canonical one, i.e. 0 or 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // Quit if true value is not a constant.
+    if (!TVal)
+      return SDValue();
+    // Quit if false value is not a constant.
+    if (!FVal) {
+      SDValue Op = SetCC.getOperand(0);
+      // Skip 'zext' or 'trunc' node.
+      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+          Op.getOpcode() == ISD::TRUNCATE)
+        Op = Op.getOperand(0);
+      // A special case for rdrand/rdseed, where 0 is set if false cond is
+      // found.
+      if ((Op.getOpcode() != X86ISD::RDRAND &&
+           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+        return SDValue();
+    }
+    // Quit if false value is not the constant 0 or 1.
+    bool FValIsFalse = true;
+    if (FVal && FVal->getZExtValue() != 0) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      // If FVal is 1, opposite cond is needed.
+      needOppositeCond = !needOppositeCond;
+      FValIsFalse = false;
+    }
+    // Quit if TVal is not the constant opposite of FVal.
+    if (FValIsFalse && TVal->getZExtValue() != 1)
+      return SDValue();
+    if (!FValIsFalse && TVal->getZExtValue() != 0)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
+
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
+/// Match:
+///   (X86or (X86setcc) (X86setcc))
+///   (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+                                           X86::CondCode &CC1, SDValue &Flags,
+                                           bool &isAnd) {
+  if (Cond->getOpcode() == X86ISD::CMP) {
+    if (!isNullConstant(Cond->getOperand(1)))
+      return false;
+
+    Cond = Cond->getOperand(0);
+  }
+
+  isAnd = false;
+
+  SDValue SetCC0, SetCC1;
+  switch (Cond->getOpcode()) {
+  default: return false;
+  case ISD::AND:
+  case X86ISD::AND:
+    isAnd = true;
+    LLVM_FALLTHROUGH;
+  case ISD::OR:
+  case X86ISD::OR:
+    SetCC0 = Cond->getOperand(0);
+    SetCC1 = Cond->getOperand(1);
+    break;
+  };
+
+  // Make sure we have SETCC nodes, using the same flags value.
+  if (SetCC0.getOpcode() != X86ISD::SETCC ||
+      SetCC1.getOpcode() != X86ISD::SETCC ||
+      SetCC0->getOperand(1) != SetCC1->getOperand(1))
+    return false;
+
+  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
+  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+  Flags = SetCC0->getOperand(1);
+  return true;
+}
+
+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG) {
+  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+    return R;
+  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+
+  // If the flag operand isn't dead, don't touch this CMOV.
+  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+    return SDValue();
+
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    switch (Cond.getOpcode()) {
+    default: break;
+    case X86ISD::BSR:
+    case X86ISD::BSF:
+      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+        return (CC == X86::COND_E) ? FalseOp : TrueOp;
+    }
+  }
+
+  // Try to simplify the EFLAGS and condition code operands.
+  // We can't always do this as FCMOV only supports a subset of X86 cond.
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+        Flags};
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+    }
+  }
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.  Note that the operands are ordered the opposite of SELECT
+  // operands.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+      // larger than FalseC (the false value).
+      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueC, FalseC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
+      // This is efficient for any integer data type (including i8/i16) and
+      // shift amount.
+      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+        Cond = getSETCC(CC, Cond, DL, DAG);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(ShAmt, DL, MVT::i8));
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
+      // for any integer data type, including i8/i16.
+      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+        Cond = getSETCC(CC, Cond, DL, DAG);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize cases that will turn into an LEA instruction.  This requires
+      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+        bool isFastMultiplier = false;
+        if (Diff < 10) {
+          switch ((unsigned char)Diff) {
+          default: break;
+          case 1:  // result = add base, cond
+          case 2:  // result = lea base(    , cond*2)
+          case 3:  // result = lea base(cond, cond*2)
+          case 4:  // result = lea base(    , cond*4)
+          case 5:  // result = lea base(cond, cond*4)
+          case 8:  // result = lea base(    , cond*8)
+          case 9:  // result = lea base(cond, cond*8)
+            isFastMultiplier = true;
+            break;
+          }
+        }
+
+        if (isFastMultiplier) {
+          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+          Cond = getSETCC(CC, Cond, DL ,DAG);
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                             Cond);
+          // Scale the condition by the difference.
+          if (Diff != 1)
+            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(Diff, DL, Cond.getValueType()));
+
+          // Add the base if non-zero.
+          if (FalseC->getAPIntValue() != 0)
+            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                               SDValue(FalseC, 0));
+          if (N->getNumValues() == 2)  // Dead flag value?
+            return DCI.CombineTo(N, Cond, SDValue());
+          return Cond;
+        }
+      }
+    }
+  }
+
+  // Handle these cases:
+  //   (select (x != c), e, c) -> select (x != c), e, x),
+  //   (select (x == c), c, e) -> select (x == c), x, e)
+  // where the c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
+  // The rationale for this change is that the conditional-move from a constant
+  // needs two instructions, however, conditional-move from a register needs
+  // only one instruction.
+  //
+  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+  //  some instruction-combining opportunities. This opt needs to be
+  //  postponed as late as possible.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    // the DCI.xxxx conditions are provided to postpone the optimization as
+    // late as possible.
+
+    ConstantSDNode *CmpAgainst = nullptr;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+                          DAG.getConstant(CC, DL, MVT::i8), Cond };
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
+      }
+    }
+  }
+
+  // Fold and/or of setcc's to double CMOV:
+  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+  //
+  // This combine lets us generate:
+  //   cmovcc1 (jcc1 if we don't have CMOV)
+  //   cmovcc2 (same)
+  // instead of:
+  //   setcc1
+  //   setcc2
+  //   and/or
+  //   cmovne (jne if we don't have CMOV)
+  // When we can't use the CMOV instruction, it might increase branch
+  // mispredicts.
+  // When we can use CMOV, or when there is no mispredict, this improves
+  // throughput and reduces register pressure.
+  //
+  if (CC == X86::COND_NE) {
+    SDValue Flags;
+    X86::CondCode CC0, CC1;
+    bool isAndSetCC;
+    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+      if (isAndSetCC) {
+        std::swap(FalseOp, TrueOp);
+        CC0 = X86::GetOppositeBranchCondition(CC0);
+        CC1 = X86::GetOppositeBranchCondition(CC1);
+      }
+
+      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
+        Flags};
+      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
+      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+      return CMOV;
+    }
+  }
+
+  return SDValue();
+}
+
+/// Different mul shrinking modes.
+enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+
+static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
+  EVT VT = N->getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() != 32)
+    return false;
+
+  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
+  unsigned SignBits[2] = {1, 1};
+  bool IsPositive[2] = {false, false};
+  for (unsigned i = 0; i < 2; i++) {
+    SDValue Opd = N->getOperand(i);
+
+    // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
+    // compute signbits for it separately.
+    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
+      // For anyextend, it is safe to assume an appropriate number of leading
+      // sign/zero bits.
+      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
+        SignBits[i] = 25;
+      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
+               MVT::i16)
+        SignBits[i] = 17;
+      else
+        return false;
+      IsPositive[i] = true;
+    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
+      // All the operands of BUILD_VECTOR need to be int constant.
+      // Find the smallest value range which all the operands belong to.
+      SignBits[i] = 32;
+      IsPositive[i] = true;
+      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
+        if (SubOp.isUndef())
+          continue;
+        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
+        if (!CN)
+          return false;
+        APInt IntVal = CN->getAPIntValue();
+        if (IntVal.isNegative())
+          IsPositive[i] = false;
+        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
+      }
+    } else {
+      SignBits[i] = DAG.ComputeNumSignBits(Opd);
+      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
+        IsPositive[i] = true;
+    }
+  }
+
+  bool AllPositive = IsPositive[0] && IsPositive[1];
+  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
+  // When ranges are from -128 ~ 127, use MULS8 mode.
+  if (MinSignBits >= 25)
+    Mode = MULS8;
+  // When ranges are from 0 ~ 255, use MULU8 mode.
+  else if (AllPositive && MinSignBits >= 24)
+    Mode = MULU8;
+  // When ranges are from -32768 ~ 32767, use MULS16 mode.
+  else if (MinSignBits >= 17)
+    Mode = MULS16;
+  // When ranges are from 0 ~ 65535, use MULU16 mode.
+  else if (AllPositive && MinSignBits >= 16)
+    Mode = MULU16;
+  else
+    return false;
+  return true;
+}
+
+/// When the operands of vector mul are extended from smaller size values,
+/// like i8 and i16, the type of mul may be shrinked to generate more
+/// efficient code. Two typical patterns are handled:
+/// Pattern1:
+///     %2 = sext/zext <N x i8> %1 to <N x i32>
+///     %4 = sext/zext <N x i8> %3 to <N x i32>
+//   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// Pattern2:
+///     %2 = zext/sext <N x i16> %1 to <N x i32>
+///     %4 = zext/sext <N x i16> %3 to <N x i32>
+///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
+///     %5 = mul <N x i32> %2, %4
+///
+/// There are four mul shrinking modes:
+/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
+/// generate pmullw+sext32 for it (MULS8 mode).
+/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
+/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
+/// generate pmullw+zext32 for it (MULU8 mode).
+/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
+/// generate pmullw+pmulhw for it (MULS16 mode).
+/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
+/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
+/// generate pmullw+pmulhuw for it (MULU16 mode).
+static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // Check for legality
+  // pmullw/pmulhw are not supported by SSE.
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Check for profitability
+  // pmulld is supported since SSE41. It is better to use pmulld
+  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
+  // the expansion.
+  bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(N, DAG, Mode))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getOperand(0).getValueType();
+  unsigned RegSize = 128;
+  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
+  EVT ReducedVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+  // Shrink the operands of mul.
+  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
+  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
+
+  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+    // lower part is needed.
+    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+    if (Mode == MULU8 || Mode == MULS8) {
+      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+                         DL, VT, MulLo);
+    } else {
+      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+      // the higher part is also needed.
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  ReducedVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result.
+      // Generate shuffle functioning as punpcklwd.
+      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+      }
+      SDValue ResLo =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+      // Generate shuffle functioning as punpckhwd.
+      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
+        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+      }
+      SDValue ResHi =
+          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
+    }
+  } else {
+    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
+    // to legalize the mul explicitly because implicit legalization for type
+    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
+    // instructions which will not exist when we explicitly legalize it by
+    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
+    // <4 x i16> undef).
+    //
+    // Legalize the operands of mul.
+    // FIXME: We may be able to handle non-concatenated vectors by insertion.
+    unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
+    if ((RegSize % ReducedSizeInBits) != 0)
+      return SDValue();
+
+    SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
+                                 DAG.getUNDEF(ReducedVT));
+    Ops[0] = NewN0;
+    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+    Ops[0] = NewN1;
+    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
+
+    if (Mode == MULU8 || Mode == MULS8) {
+      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
+      // part is needed.
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+
+      // convert the type of mul result to VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
+                                DL, ResVT, Mul);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    } else {
+      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
+      // MULU16/MULS16, both parts are needed.
+      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+                                  OpsVT, NewN0, NewN1);
+
+      // Repack the lower part and higher part result of mul into a wider
+      // result. Make sure the type of mul result is VT.
+      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
+      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                         DAG.getIntPtrConstant(0, DL));
+    }
+  }
+}
+
+/// Optimize a single multiply with constant into two operations in order to
+/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
+static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalize() && VT.isVector())
+    return reduceVMULWidth(N, DAG, Subtarget);
+
+  // An imul is usually smaller than the alternative sequence.
+  if (DAG.getMachineFunction().getFunction()->optForMinSize())
+    return SDValue();
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  if (VT != MVT::i64 && VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+  uint64_t MulAmt = C->getZExtValue();
+  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+    return SDValue();
+
+  uint64_t MulAmt1 = 0;
+  uint64_t MulAmt2 = 0;
+  if ((MulAmt % 9) == 0) {
+    MulAmt1 = 9;
+    MulAmt2 = MulAmt / 9;
+  } else if ((MulAmt % 5) == 0) {
+    MulAmt1 = 5;
+    MulAmt2 = MulAmt / 5;
+  } else if ((MulAmt % 3) == 0) {
+    MulAmt1 = 3;
+    MulAmt2 = MulAmt / 3;
+  }
+
+  SDLoc DL(N);
+  SDValue NewMul;
+  if (MulAmt2 &&
+      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+
+    if (isPowerOf2_64(MulAmt2) &&
+        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If second multiplifer is pow2, issue it first. We want the multiply by
+      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+      // is an add.
+      std::swap(MulAmt1, MulAmt2);
+
+    if (isPowerOf2_64(MulAmt1))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                           DAG.getConstant(MulAmt1, DL, VT));
+
+    if (isPowerOf2_64(MulAmt2))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                           DAG.getConstant(MulAmt2, DL, VT));
+  }
+
+  if (!NewMul) {
+    assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+           && "Both cases that could cause potential overflows should have "
+              "already been handled.");
+    if (isPowerOf2_64(MulAmt - 1))
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                                DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                                DAG.getConstant(Log2_64(MulAmt - 1), DL,
+                                MVT::i8)));
+
+    else if (isPowerOf2_64(MulAmt + 1))
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+                                N->getOperand(0),
+                                DAG.getConstant(Log2_64(MulAmt + 1),
+                                DL, MVT::i8)), N->getOperand(0));
+  }
+
+  if (NewMul)
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, NewMul, false);
+
+  return SDValue();
+}
+
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zero's or all ones.
+  if (VT.isInteger() && !VT.isVector() &&
+      N1C && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue N00 = N0.getOperand(0);
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    const APInt &ShAmt = N1C->getAPIntValue();
+    Mask = Mask.shl(ShAmt);
+    bool MaskOK = false;
+    // We can handle cases concerning bit-widening nodes containing setcc_c if
+    // we carefully interrogate the mask to make sure we are semantics
+    // preserving.
+    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
+    // of the underlying setcc_c operation if the setcc_c was zero extended.
+    // Consider the following example:
+    //   zext(setcc_c)                 -> i32 0x0000FFFF
+    //   c1                            -> i32 0x0000FFFF
+    //   c2                            -> i32 0x00000001
+    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
+    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      MaskOK = true;
+    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
+               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+      MaskOK = true;
+    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
+                N00.getOpcode() == ISD::ANY_EXTEND) &&
+               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
+    }
+    if (MaskOK && Mask != 0) {
+      SDLoc DL(N);
+      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
+    }
+  }
+
+  // Hardware support for vector shifts is sparse which makes us scalarize the
+  // vector operations in many cases. Also, on sandybridge ADD is faster than
+  // shl.
+  // (shl V, 1) -> add V,V
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
+      assert(N0.getValueType().isVector() && "Invalid vector shift type");
+      // We shift all of the values by one. In many cases we do not have
+      // hardware support for this operation. This is better expressed as an ADD
+      // of two values.
+      if (N1SplatC->getAPIntValue() == 1)
+        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+    }
+
+  return SDValue();
+}
+
+static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  unsigned Size = VT.getSizeInBits();
+
+  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
+  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
+  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
+  // depending on sign of (SarConst - [56,48,32,24,16])
+
+  // sexts in X86 are MOVs. The MOVs have the same code size
+  // as above SHIFTs (only SHIFT on 1 has lower code size).
+  // However the MOVs have 2 advantages to a SHIFT:
+  // 1. MOVs can write to a register that differs from source
+  // 2. MOVs accept memory operands
+
+  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+      N0.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+  EVT CVT = N1.getValueType();
+
+  if (SarConst.isNegative())
+    return SDValue();
+
+  for (MVT SVT : MVT::integer_valuetypes()) {
+    unsigned ShiftSize = SVT.getSizeInBits();
+    // skipping types without corresponding sext/zext and
+    // ShlConst that is not one of [56,48,32,24,16]
+    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+      continue;
+    SDLoc DL(N);
+    SDValue NN =
+        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+    SarConst = SarConst - (Size - ShiftSize);
+    if (SarConst == 0)
+      return NN;
+    else if (SarConst.isNegative())
+      return DAG.getNode(ISD::SHL, DL, VT, NN,
+                         DAG.getConstant(-SarConst, DL, CVT));
+    else
+      return DAG.getNode(ISD::SRA, DL, VT, NN,
+                         DAG.getConstant(SarConst, DL, CVT));
+  }
+  return SDValue();
+}
+
+/// \brief Returns a vector of 0s if the node in input is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+      (!Subtarget.hasInt256() ||
+       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+    return SDValue();
+
+  SDValue Amt = N->getOperand(1);
+  SDLoc DL(N);
+  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
+    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
+      const APInt &ShiftAmt = AmtSplat->getAPIntValue();
+      unsigned MaxAmount =
+        VT.getSimpleVT().getScalarSizeInBits();
+
+      // SSE2/AVX2 logical shifts always return a vector of 0s
+      // if the shift amount is bigger than or equal to
+      // the element size. The constant shift amount will be
+      // encoded as a 8-bit immediate.
+      if (ShiftAmt.trunc(8).uge(MaxAmount))
+        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
+    }
+
+  return SDValue();
+}
+
+static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget &Subtarget) {
+  if (N->getOpcode() == ISD::SHL)
+    if (SDValue V = combineShiftLeft(N, DAG))
+      return V;
+
+  if (N->getOpcode() == ISD::SRA)
+    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
+      return V;
+
+  // Try to fold this logical shift into a zero vector.
+  if (N->getOpcode() != ISD::SRA)
+    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
+      return V;
+
+  return SDValue();
+}
+
+static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget &Subtarget) {
+  assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
+         "Unexpected opcode");
+  EVT VT = N->getValueType(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // This fails for mask register (vXi1) shifts.
+  if ((NumBitsPerElt % 8) != 0)
+    return SDValue();
+
+  // Out of range logical bit shifts are guaranteed to be zero.
+  APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
+    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+  // Shift N0 by zero -> N0.
+  if (!ShiftVal)
+    return N->getOperand(0);
+
+  // Shift zero -> zero.
+  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+
+  // We can decode 'whole byte' logical bit shifts as shuffles.
+  if ((ShiftVal.getZExtValue() % 8) == 0) {
+    SDValue Op(N, 0);
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
+  return SDValue();
+}
+
+/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
+/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
+/// OR -> CMPNEQSS.
+static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  unsigned opcode;
+
+  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+  // we're requiring SSE2 for both.
+  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue CMP0 = N0->getOperand(1);
+    SDValue CMP1 = N1->getOperand(1);
+    SDLoc DL(N);
+
+    // The SETCCs should both refer to the same CMP.
+    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+      return SDValue();
+
+    SDValue CMP00 = CMP0->getOperand(0);
+    SDValue CMP01 = CMP0->getOperand(1);
+    EVT     VT    = CMP00.getValueType();
+
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      bool ExpectingFlags = false;
+      // Check for any users that want flags:
+      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+           !ExpectingFlags && UI != UE; ++UI)
+        switch (UI->getOpcode()) {
+        default:
+        case ISD::BR_CC:
+        case ISD::BRCOND:
+        case ISD::SELECT:
+          ExpectingFlags = true;
+          break;
+        case ISD::CopyToReg:
+        case ISD::SIGN_EXTEND:
+        case ISD::ZERO_EXTEND:
+        case ISD::ANY_EXTEND:
+          break;
+        }
+
+      if (!ExpectingFlags) {
+        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+          X86::CondCode tmp = cc0;
+          cc0 = cc1;
+          cc1 = tmp;
+        }
+
+        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          if (Subtarget.hasAVX512()) {
+            SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
+                                         CMP01,
+                                         DAG.getConstant(x86cc, DL, MVT::i8));
+            if (N->getValueType(0) != MVT::i1)
+              return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
+                                 FSetCC);
+            return FSetCC;
+          }
+          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
+                                              CMP00.getValueType(), CMP00, CMP01,
+                                              DAG.getConstant(x86cc, DL,
+                                                              MVT::i8));
+
+          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
+
+          if (is64BitFP && !Subtarget.is64Bit()) {
+            // On a 32-bit target, we cannot bitcast the 64-bit float to a
+            // 64-bit integer, since that's not a legal type. Since
+            // OnesOrZeroesF is all ones of all zeroes, we don't need all the
+            // bits, but can do this little dance to extract the lowest 32 bits
+            // and work with those going forward.
+            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
+                                           OnesOrZeroesF);
+            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
+            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
+                                        Vector32, DAG.getIntPtrConstant(0, DL));
+            IntVT = MVT::i32;
+          }
+
+          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
+                                      DAG.getConstant(1, DL, IntVT));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                              ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::AND);
+
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
+    return SDValue();
+
+  // Canonicalize XOR to the left.
+  if (N1.getOpcode() == ISD::XOR)
+    std::swap(N0, N1);
+
+  if (N0.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+
+  N01 = peekThroughBitcasts(N01);
+
+  // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
+  // insert_subvector building a 256-bit AllOnes vector.
+  if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
+    if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
+      return SDValue();
+
+    SDValue V1 = N01->getOperand(0);
+    SDValue V2 = N01->getOperand(1);
+    if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
+        !V1.getOperand(0).isUndef() ||
+        !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
+        !ISD::isBuildVectorAllOnes(V2.getNode()))
+      return SDValue();
+  }
+  return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::ANY_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+  SDValue Narrow = N->getOperand(0);
+  EVT NarrowVT = Narrow->getValueType(0);
+  if (!NarrowVT.is128BitVector())
+    return SDValue();
+
+  if (Narrow->getOpcode() != ISD::XOR &&
+      Narrow->getOpcode() != ISD::AND &&
+      Narrow->getOpcode() != ISD::OR)
+    return SDValue();
+
+  SDValue N0  = Narrow->getOperand(0);
+  SDValue N1  = Narrow->getOperand(1);
+  SDLoc DL(Narrow);
+
+  // The Left side has to be a trunc.
+  if (N0.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // The type of the truncated inputs.
+  EVT WideVT = N0->getOperand(0)->getValueType(0);
+  if (WideVT != VT)
+    return SDValue();
+
+  // The right side has to be a 'trunc' or a constant vector.
+  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+  ConstantSDNode *RHSConstSplat = nullptr;
+  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
+    RHSConstSplat = RHSBV->getConstantSplatNode();
+  if (!RHSTrunc && !RHSConstSplat)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+    return SDValue();
+
+  // Set N0 and N1 to hold the inputs to the new wide operation.
+  N0 = N0->getOperand(0);
+  if (RHSConstSplat) {
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
+                     SDValue(RHSConstSplat, 0));
+    N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
+  } else if (RHSTrunc) {
+    N1 = N1->getOperand(0);
+  }
+
+  // Generate the wide operation.
+  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  case ISD::ANY_EXTEND:
+    return Op;
+  case ISD::ZERO_EXTEND: {
+    unsigned InBits = NarrowVT.getScalarSizeInBits();
+    APInt Mask = APInt::getAllOnesValue(InBits);
+    Mask = Mask.zext(VT.getScalarSizeInBits());
+    return DAG.getNode(ISD::AND, DL, VT,
+                       Op, DAG.getConstant(Mask, DL, VT));
+  }
+  case ISD::SIGN_EXTEND:
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+                       Op, DAG.getValueType(NarrowVT));
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  unsigned FPOpcode = ISD::DELETED_NODE;
+  if (N->getOpcode() == ISD::AND)
+    FPOpcode = X86ISD::FAND;
+  else if (N->getOpcode() == ISD::OR)
+    FPOpcode = X86ISD::FOR;
+  else if (N->getOpcode() == ISD::XOR)
+    FPOpcode = X86ISD::FXOR;
+
+  assert(FPOpcode != ISD::DELETED_NODE &&
+         "Unexpected input node for FP logic conversion");
+
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
+       (Subtarget.hasSSE2() && VT == MVT::i64))) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N10 = N1.getOperand(0);
+    EVT N00Type = N00.getValueType();
+    EVT N10Type = N10.getValueType();
+    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
+      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+      return DAG.getBitcast(VT, FPLogic);
+    }
+  }
+  return SDValue();
+}
+
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+
+  // TODO: Use AssertSext to mark any nodes that have the property of producing
+  // all-ones or all-zeros. Then check for that node rather than particular
+  // opcodes.
+  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+    return SDValue();
+
+  // The existence of the PCMP node guarantees that we have the required SSE2 or
+  // AVX2 for a shift of this vector type, but there is no vector shift by
+  // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
+  // masked compare nodes, so they should not make it here.
+  EVT VT0 = Op0.getValueType();
+  EVT VT1 = Op1.getValueType();
+  unsigned EltBitWidth = VT0.getScalarSizeInBits();
+  if (VT0 != VT1 || EltBitWidth == 8)
+    return SDValue();
+
+  assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+  return DAG.getBitcast(N->getValueType(0), Shift);
+}
+
+static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+    return R;
+
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
+  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+    return R;
+
+  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+    return ShiftRight;
+
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Attempt to recursively combine a bitmask AND with shuffles.
+  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    SDValue Op(N, 0);
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
+  }
+
+  // Create BEXTR instructions
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
+    return SDValue();
+  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (MaskNode && ShiftNode) {
+    uint64_t Mask = MaskNode->getZExtValue();
+    uint64_t Shift = ShiftNode->getZExtValue();
+    if (isMask_64(Mask)) {
+      uint64_t MaskSize = countPopulation(Mask);
+      if (Shift + MaskSize <= VT.getSizeInBits())
+        return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+                           DAG.getConstant(Shift | (MaskSize << 8), DL,
+                                           VT));
+    }
+  }
+  return SDValue();
+}
+
+// Try to fold:
+//   (or (and (m, y), (pandn m, x)))
+// into:
+//   (vselect m, x, y)
+// As a special case, try to fold:
+//   (or (and (m, (sub 0, x)), (pandn m, x)))
+// into:
+//   (sub (xor X, M), M)
+static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == ISD::OR);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+    return SDValue();
+  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
+
+  // Canonicalize pandn to RHS
+  if (N0.getOpcode() == X86ISD::ANDNP)
+    std::swap(N0, N1);
+
+  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
+    return SDValue();
+
+  SDValue Mask = N1.getOperand(0);
+  SDValue X = N1.getOperand(1);
+  SDValue Y;
+  if (N0.getOperand(0) == Mask)
+    Y = N0.getOperand(1);
+  if (N0.getOperand(1) == Mask)
+    Y = N0.getOperand(0);
+
+  // Check to see if the mask appeared in both the AND and ANDNP.
+  if (!Y.getNode())
+    return SDValue();
+
+  // Validate that X, Y, and Mask are bitcasts, and see through them.
+  Mask = peekThroughBitcasts(Mask);
+  X = peekThroughBitcasts(X);
+  Y = peekThroughBitcasts(Y);
+
+  EVT MaskVT = Mask.getValueType();
+
+  // Validate that the Mask operand is a vector sra node.
+  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+  // there is no psrai.b
+  unsigned EltBits = MaskVT.getScalarSizeInBits();
+  unsigned SraAmt = ~0;
+  if (Mask.getOpcode() == ISD::SRA) {
+    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
+      if (auto *AmtConst = AmtBV->getConstantSplatNode())
+        SraAmt = AmtConst->getZExtValue();
+  } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+    SDValue SraC = Mask.getOperand(1);
+    SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+  }
+  if ((SraAmt + 1) != EltBits)
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Try to match:
+  //   (or (and (M, (sub 0, X)), (pandn M, X)))
+  // which is a special case of vselect:
+  //   (vselect M, (sub 0, X), X)
+  // Per:
+  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+  // We know that, if fNegate is 0 or 1:
+  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+  //
+  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+  //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
+  // This lets us transform our vselect to:
+  //   (add (xor X, M), (and M, 1))
+  // And further to:
+  //   (sub (xor X, M), M)
+  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+    auto IsNegV = [](SDNode *N, SDValue V) {
+      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+        ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+    };
+    SDValue V;
+    if (IsNegV(Y.getNode(), X))
+      V = X;
+    else if (IsNegV(X.getNode(), Y))
+      V = Y;
+
+    if (V) {
+      assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+      SDValue SubOp2 = Mask;
+
+      // If the negate was on the false side of the select, then
+      // the operands of the SUB need to be swapped. PR 27251.
+      // This is because the pattern being matched above is
+      // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
+      // but if the pattern matched was
+      // (vselect M, X, (sub (0, X))), that is really negation of the pattern
+      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+      // pattern also needs to be a negation of the replacement pattern above.
+      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+      // sub accomplishes the negation of the replacement pattern.
+      if (V == Y)
+         std::swap(SubOp1, SubOp2);
+
+      return DAG.getBitcast(VT,
+                            DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+    }
+  }
+
+  // PBLENDVB is only available on SSE 4.1.
+  if (!Subtarget.hasSSE41())
+    return SDValue();
+
+  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+  X = DAG.getBitcast(BlendVT, X);
+  Y = DAG.getBitcast(BlendVT, Y);
+  Mask = DAG.getBitcast(BlendVT, Mask);
+  Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+  return DAG.getBitcast(VT, Mask);
+}
+
+// Helper function for combineOrCmpEqZeroToCtlzSrl
+// Transforms:
+//   seteq(cmp x, 0)
+//   into:
+//   srl(ctlz x), log2(bitsize(x))
+// Input pattern is checked by caller.
+static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
+                                          SelectionDAG &DAG) {
+  SDValue Cmp = Op.getOperand(1);
+  EVT VT = Cmp.getOperand(0).getValueType();
+  unsigned Log2b = Log2_32(VT.getSizeInBits());
+  SDLoc dl(Op);
+  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
+  // The result of the shift is true or false, and on X86, the 32-bit
+  // encoding of shr and lzcnt is more desirable.
+  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
+  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
+                            DAG.getConstant(Log2b, dl, VT));
+  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
+}
+
+// Try to transform:
+//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
+//   into:
+//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
+// Will also attempt to match more generic cases, eg:
+//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
+// Only applies if the target supports the FastLZCNT feature.
+static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
+    return SDValue();
+
+  auto isORCandidate = [](SDValue N) {
+    return (N->getOpcode() == ISD::OR && N->hasOneUse());
+  };
+
+  // Check the zero extend is extending to 32-bit or more. The code generated by
+  // srl(ctlz) for 16-bit or less variants of the pattern would require extra
+  // instructions to clear the upper bits.
+  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
+      !isORCandidate(N->getOperand(0)))
+    return SDValue();
+
+  // Check the node matches: setcc(eq, cmp 0)
+  auto isSetCCCandidate = [](SDValue N) {
+    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
+           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
+           N->getOperand(1).getOpcode() == X86ISD::CMP &&
+           N->getOperand(1).getConstantOperandVal(1) == 0 &&
+           N->getOperand(1).getValueType().bitsGE(MVT::i32);
+  };
+
+  SDNode *OR = N->getOperand(0).getNode();
+  SDValue LHS = OR->getOperand(0);
+  SDValue RHS = OR->getOperand(1);
+
+  // Save nodes matching or(or, setcc(eq, cmp 0)).
+  SmallVector<SDNode *, 2> ORNodes;
+  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
+          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
+    ORNodes.push_back(OR);
+    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+  }
+
+  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
+  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
+      !isORCandidate(SDValue(OR, 0)))
+    return SDValue();
+
+  // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
+  // to
+  // or(srl(ctlz),srl(ctlz)).
+  // The dag combiner can then fold it into:
+  // srl(or(ctlz, ctlz)).
+  EVT VT = OR->getValueType(0);
+  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
+  SDValue Ret, NewRHS;
+  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
+    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+
+  if (!Ret)
+    return SDValue();
+
+  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
+  while (ORNodes.size() > 0) {
+    OR = ORNodes.pop_back_val();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
+    if (RHS->getOpcode() == ISD::OR)
+      std::swap(LHS, RHS);
+    EVT VT = OR->getValueType(0);
+    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
+    if (!NewRHS)
+      return SDValue();
+    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
+  }
+
+  if (Ret)
+    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
+
+  return Ret;
+}
+
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+                         TargetLowering::DAGCombinerInfo &DCI,
+                         const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+    return R;
+
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
+  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+    return R;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+  // SHLD/SHRD instructions have lower register pressure, but on some
+  // platforms they have higher latency than the equivalent
+  // series of shifts/or that would otherwise be generated.
+  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+  // have higher latencies and we are not optimizing for size.
+  if (!OptForSize && Subtarget.isSHLDSlow())
+    return SDValue();
+
+  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+    return SDValue();
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  SDValue ShAmt0 = N0.getOperand(1);
+  if (ShAmt0.getValueType() != MVT::i8)
+    return SDValue();
+  SDValue ShAmt1 = N1.getOperand(1);
+  if (ShAmt1.getValueType() != MVT::i8)
+    return SDValue();
+  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+    ShAmt0 = ShAmt0.getOperand(0);
+  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+    ShAmt1 = ShAmt1.getOperand(0);
+
+  SDLoc DL(N);
+  unsigned Opc = X86ISD::SHLD;
+  SDValue Op0 = N0.getOperand(0);
+  SDValue Op1 = N1.getOperand(0);
+  if (ShAmt0.getOpcode() == ISD::SUB ||
+      ShAmt0.getOpcode() == ISD::XOR) {
+    Opc = X86ISD::SHRD;
+    std::swap(Op0, Op1);
+    std::swap(ShAmt0, ShAmt1);
+  }
+
+  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
+  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
+  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
+  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
+  unsigned Bits = VT.getSizeInBits();
+  if (ShAmt1.getOpcode() == ISD::SUB) {
+    SDValue Sum = ShAmt1.getOperand(0);
+    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
+        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
+        return DAG.getNode(Opc, DL, VT,
+                           Op0, Op1,
+                           DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+    }
+  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
+      return DAG.getNode(Opc, DL, VT,
+                         N0.getOperand(0), N1.getOperand(0),
+                         DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+  } else if (ShAmt1.getOpcode() == ISD::XOR) {
+    SDValue Mask = ShAmt1.getOperand(1);
+    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
+      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
+      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
+      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
+        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
+      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
+        if (Op1.getOpcode() == InnerShift &&
+            isa<ConstantSDNode>(Op1.getOperand(1)) &&
+            Op1.getConstantOperandVal(1) == 1) {
+          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
+                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+        }
+        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
+        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
+            Op1.getOperand(0) == Op1.getOperand(1)) {
+          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
+                     DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// Generate NEG and CMOV for integer abs.
+static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // Since X86 does not have CMOV for 8-bit integer, we don't convert
+  // 8-bit integer abs to NEG and CMOV.
+  if (VT.isInteger() && VT.getSizeInBits() == 8)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CMOV.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
+    auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+      // Generate SUB & CMOV.
+      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+                                DAG.getConstant(0, DL, VT), N0.getOperand(0));
+      SDValue Ops[] = {N0.getOperand(0), Neg,
+                       DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+                       SDValue(Neg.getNode(), 1)};
+      return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
+    }
+  }
+  return SDValue();
+}
+
+/// Try to turn tests against the signbit in the form of:
+///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
+/// into:
+///   SETGT(X, -1)
+static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
+  // This is only worth doing if the output type is i8 or i1.
+  EVT ResultType = N->getValueType(0);
+  if (ResultType != MVT::i8 && ResultType != MVT::i1)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // We should be performing an xor against a truncated shift.
+  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
+    return SDValue();
+
+  // Make sure we are performing an xor against one.
+  if (!isOneConstant(N1))
+    return SDValue();
+
+  // SetCC on x86 zero extends so only act on this if it's a logical shift.
+  SDValue Shift = N0.getOperand(0);
+  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
+    return SDValue();
+
+  // Make sure we are truncating from one of i16, i32 or i64.
+  EVT ShiftTy = Shift.getValueType();
+  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
+    return SDValue();
+
+  // Make sure the shift amount extracts the sign bit.
+  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
+    return SDValue();
+
+  // Create a greater-than comparison against -1.
+  // N.B. Using SETGE against 0 works but we want a canonical looking
+  // comparison, using SETGT matches up with what TranslateX86CC.
+  SDLoc DL(N);
+  SDValue ShiftOp = Shift.getOperand(0);
+  EVT ShiftOpTy = ShiftOp.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
+                                               *DAG.getContext(), ResultType);
+  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
+                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
+  if (SetCCResultType != ResultType)
+    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
+  return Cond;
+}
+
+/// Turn vector tests of the signbit in the form of:
+///   xor (sra X, elt_size(X)-1), -1
+/// into:
+///   pcmpgt X, -1
+///
+/// This should be called before type legalization because the pattern may not
+/// persist after that.
+static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
+                                         const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isSimple())
+    return SDValue();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return SDValue();
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
+  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
+  }
+
+  // There must be a shift right algebraic before the xor, and the xor must be a
+  // 'not' operation.
+  SDValue Shift = N->getOperand(0);
+  SDValue Ones = N->getOperand(1);
+  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
+      !ISD::isBuildVectorAllOnes(Ones.getNode()))
+    return SDValue();
+
+  // The shift should be smearing the sign bit across each vector element.
+  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
+  if (!ShiftBV)
+    return SDValue();
+
+  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
+  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
+  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
+    return SDValue();
+
+  // Create a greater-than comparison against -1. We don't use the more obvious
+  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
+  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+}
+
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
+/// X86ISD::AVG instruction.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget,
+                                const SDLoc &DL) {
+  if (!VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in AVG pattern and it should be greater
+  // than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+  if (Subtarget.hasBWI()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget.hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %N, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda checking the given SDValue is a constant vector and each element
+  // is in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is left-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // element is in the range [1, 256].
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit X86ISD::AVG instruction.
+    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorInRange(Operands[i], 1, 1))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected, emit X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  SDLoc dl(Ld);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+  bool Fast;
+  unsigned AddressSpace = Ld->getAddressSpace();
+  unsigned Alignment = Ld->getAlignment();
+  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
+      Ext == ISD::NON_EXTLOAD &&
+      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+                             AddressSpace, Alignment, &Fast) && !Fast) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Ptr = Ld->getBasePtr();
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 =
+        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Alignment, Ld->getMemOperand()->getFlags());
+
+    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
+    SDValue Load2 =
+        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
+
+  return SDValue();
+}
+
+/// If V is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(SDValue V) {
+  // This needs to be a build vector of booleans.
+  // TODO: Checking for the i1 type matches the IR definition for the mask,
+  // but the mask check could be loosened to i8 or other types. That might
+  // also require checking more than 'allOnesValue'; eg, the x86 HW
+  // instructions only require that the MSB is set for each mask element.
+  // The ISD::MSTORE comments/definition do not specify how the mask operand
+  // is formatted.
+  auto *BV = dyn_cast<BuildVectorSDNode>(V);
+  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+    return -1;
+
+  int TrueIndex = -1;
+  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    const SDValue &Op = BV->getOperand(i);
+    if (Op.isUndef())
+      continue;
+    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+    if (!ConstNode)
+      return -1;
+    if (ConstNode->getAPIntValue().isAllOnesValue()) {
+      // If we already found a one, this is too many.
+      if (TrueIndex >= 0)
+        return -1;
+      TrueIndex = i;
+    }
+  }
+  return TrueIndex;
+}
+
+/// Given a masked memory load/store operation, return true if it has one mask
+/// bit set. If it has one mask bit set, then also return the memory address of
+/// the scalar element to load/store, the vector index to insert/extract that
+/// scalar element, and the alignment for the scalar memory access.
+static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
+                                         SelectionDAG &DAG, SDValue &Addr,
+                                         SDValue &Index, unsigned &Alignment) {
+  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
+  if (TrueMaskElt < 0)
+    return false;
+
+  // Get the address of the one scalar element that is specified by the mask
+  // using the appropriate offset from the base pointer.
+  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
+  Addr = MaskedOp->getBasePtr();
+  if (TrueMaskElt != 0) {
+    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
+    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
+  }
+
+  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
+  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
+  return true;
+}
+
+/// If exactly one element of the mask is set for a non-extending masked load,
+/// it is a scalar load and vector insert.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue
+reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
+  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+  // However, some target hooks may need to be added to know when the transform
+  // is profitable. Endianness would also have to be considered.
+
+  SDValue Addr, VecIndex;
+  unsigned Alignment;
+  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
+    return SDValue();
+
+  // Load the one scalar element that is specified by the mask using the
+  // appropriate offset from the base pointer.
+  SDLoc DL(ML);
+  EVT VT = ML->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  SDValue Load =
+      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
+                  Alignment, ML->getMemOperand()->getFlags());
+
+  // Insert the loaded element into the appropriate place in the vector.
+  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
+                               Load, VecIndex);
+  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
+}
+
+static SDValue
+combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
+                              TargetLowering::DAGCombinerInfo &DCI) {
+  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
+    return SDValue();
+
+  SDLoc DL(ML);
+  EVT VT = ML->getValueType(0);
+
+  // If we are loading the first and last elements of a vector, it is safe and
+  // always faster to load the whole vector. Replace the masked load with a
+  // vector load and select.
+  unsigned NumElts = VT.getVectorNumElements();
+  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
+  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
+  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
+  if (LoadFirstElt && LoadLastElt) {
+    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                ML->getMemOperand());
+    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
+    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
+  }
+
+  // Convert a masked load with a constant mask into a masked load and a select.
+  // This allows the select operation to use a faster kind of select instruction
+  // (for example, vblendvps -> vblendps).
+
+  // Don't try this if the pass-through operand is already undefined. That would
+  // cause an infinite loop because that's what we're about to create.
+  if (ML->getSrc0().isUndef())
+    return SDValue();
+
+  // The new masked load has an undef pass-through operand. The select uses the
+  // original pass-through operand.
+  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
+                                    ML->getMask(), DAG.getUNDEF(VT),
+                                    ML->getMemoryVT(), ML->getMemOperand(),
+                                    ML->getExtensionType());
+  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
+
+  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
+}
+
+static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget &Subtarget) {
+  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+
+  // TODO: Expanding load with constant mask may be optimized as well.
+  if (Mld->isExpandingLoad())
+    return SDValue();
+
+  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
+    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+      return ScalarLoad;
+    // TODO: Do some AVX512 subsets benefit from this transform?
+    if (!Subtarget.hasAVX512())
+      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
+        return Blend;
+  }
+
+  if (Mld->getExtensionType() != ISD::SEXTLOAD)
+    return SDValue();
+
+  // Resolve extending loads.
+  EVT VT = Mld->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT LdVT = Mld->getMemoryVT();
+  SDLoc dl(Mld);
+
+  assert(LdVT != VT && "Cannot extend to the same type");
+  unsigned ToSz = VT.getScalarSizeInBits();
+  unsigned FromSz = LdVT.getScalarSizeInBits();
+  // From/To sizes and ElemCount must be pow of two.
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+    "Unexpected size for extending masked load");
+
+  unsigned SizeRatio  = ToSz / FromSz;
+  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle.
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+          LdVT.getScalarType(), NumElems*SizeRatio);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  // Convert Src0 value.
+  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
+  if (!Mld->getSrc0().isUndef()) {
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+           "WideVecVT should be legal");
+    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
+  }
+  // Prepare the new mask.
+  SDValue NewMask;
+  SDValue Mask = Mld->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type.
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
+      ShuffleVec[i] = NumElems * SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, dl, WideVecVT),
+                                   ShuffleVec);
+  } else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+                                     Mld->getBasePtr(), NewMask, WideSrc0,
+                                     Mld->getMemoryVT(), Mld->getMemOperand(),
+                                     ISD::NON_EXTLOAD);
+  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+}
+
+/// If exactly one element of the mask is set for a non-truncating masked store,
+/// it is a vector extract and scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those here.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+                                              SelectionDAG &DAG) {
+  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
+  // However, some target hooks may need to be added to know when the transform
+  // is profitable. Endianness would also have to be considered.
+
+  SDValue Addr, VecIndex;
+  unsigned Alignment;
+  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
+    return SDValue();
+
+  // Extract the one scalar element that is actually being stored.
+  SDLoc DL(MS);
+  EVT VT = MS->getValue().getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                MS->getValue(), VecIndex);
+
+  // Store that element at the appropriate offset from the base pointer.
+  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
+                      Alignment, MS->getMemOperand()->getFlags());
+}
+
+static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+
+  if (Mst->isCompressingStore())
+    return SDValue();
+
+  if (!Mst->isTruncatingStore())
+    return reduceMaskedStoreToScalarStore(Mst, DAG);
+
+  // Resolve truncating stores.
+  EVT VT = Mst->getValue().getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT StVT = Mst->getMemoryVT();
+  SDLoc dl(Mst);
+
+  assert(StVT != VT && "Cannot truncate to the same type");
+  unsigned FromSz = VT.getScalarSizeInBits();
+  unsigned ToSz = StVT.getScalarSizeInBits();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // The truncating store is legal in some cases. For example
+  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+  // are designated for truncate store.
+  // In this case we don't need any further transformations.
+  if (TLI.isTruncStoreLegal(VT, StVT))
+    return SDValue();
+
+  // From/To sizes and ElemCount must be pow of two.
+  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+    "Unexpected size for truncating masked store");
+  // We are going to use the original vector elt for storing.
+  // Accumulated smaller vector elements must be a multiple of the store size.
+  assert (((NumElems * FromSz) % ToSz) == 0 &&
+          "Unexpected ratio for truncating masked store");
+
+  unsigned SizeRatio  = FromSz / ToSz;
+  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+  // Create a type on which we perform the shuffle.
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+          StVT.getScalarType(), NumElems*SizeRatio);
+
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i] = i * SizeRatio;
+
+  // Can't shuffle using an illegal type.
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+         "WideVecVT should be legal");
+
+  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                              DAG.getUNDEF(WideVecVT),
+                                              ShuffleVec);
+
+  SDValue NewMask;
+  SDValue Mask = Mst->getMask();
+  if (Mask.getValueType() == VT) {
+    // Mask and original value have the same type.
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+      ShuffleVec[i] = NumElems*SizeRatio;
+    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+                                   DAG.getConstant(0, dl, WideVecVT),
+                                   ShuffleVec);
+  } else {
+    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+    unsigned WidenNumElts = NumElems*SizeRatio;
+    unsigned MaskNumElts = VT.getVectorNumElements();
+    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
+                                     WidenNumElts);
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+  }
+
+  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
+                            Mst->getBasePtr(), NewMask, StVT,
+                            Mst->getMemOperand(), false);
+}
+
+static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
+  bool Fast;
+  unsigned AddressSpace = St->getAddressSpace();
+  unsigned Alignment = St->getAlignment();
+  if (VT.is256BitVector() && StVT == VT &&
+      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                             AddressSpace, Alignment, &Fast) &&
+      !Fast) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
+
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
+
+    SDValue Ch0 =
+        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
+                     Alignment, St->getMemOperand()->getFlags());
+    SDValue Ch1 =
+        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
+                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+                                       Subtarget, dl))
+      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+                          St->getPointerInfo(), St->getAlignment(),
+                          St->getMemOperand()->getFlags());
+
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getScalarSizeInBits();
+    unsigned ToSz = StVT.getScalarSizeInBits();
+
+    // The truncating store is legal in some cases. For example
+    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
+    // are designated for truncate store.
+    // In this case we don't need any further transformations.
+    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
+      return SDValue();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         ShuffleVec);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (MVT Tp : MVT::integer_valuetypes()) {
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+        (64 <= NumElems * ToSz))
+      StoreType = MVT::f64;
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i, dl));
+      SDValue Ch =
+          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
+                       St->getAlignment(), St->getMemOperand()->getFlags());
+      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  }
+
+  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  // A preferable solution to the general problem is to figure out the right
+  // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
+  bool F64IsLegal =
+      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
+      isa<LoadSDNode>(St->getValue()) &&
+      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+      St->getChain().hasOneUse() && !St->isVolatile()) {
+    SDNode* LdVal = St->getValue().getNode();
+    LoadSDNode *Ld = nullptr;
+    int TokenFactorIndex = -1;
+    SmallVector<SDValue, 8> Ops;
+    SDNode* ChainVal = St->getChain().getNode();
+    // Must be a store of a load.  We currently handle two cases:  the load
+    // is a direct child, and it's under an intervening TokenFactor.  It is
+    // possible to dig deeper under nested TokenFactors.
+    if (ChainVal == LdVal)
+      Ld = cast<LoadSDNode>(St->getChain());
+    else if (St->getValue().hasOneUse() &&
+             ChainVal->getOpcode() == ISD::TokenFactor) {
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+        if (ChainVal->getOperand(i).getNode() == LdVal) {
+          TokenFactorIndex = i;
+          Ld = cast<LoadSDNode>(St->getValue());
+        } else
+          Ops.push_back(ChainVal->getOperand(i));
+      }
+    }
+
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
+
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
+
+    SDLoc LdDL(Ld);
+    SDLoc StDL(N);
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget.is64Bit() || F64IsLegal) {
+      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+                                  Ld->getPointerInfo(), Ld->getAlignment(),
+                                  Ld->getMemOperand()->getFlags());
+      SDValue NewChain = NewLd.getValue(1);
+      if (TokenFactorIndex >= 0) {
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+      }
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+                          St->getPointerInfo(), St->getAlignment(),
+                          St->getMemOperand()->getFlags());
+    }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getPointerInfo(), Ld->getAlignment(),
+                               Ld->getMemOperand()->getFlags());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getPointerInfo().getWithOffset(4),
+                               MinAlign(Ld->getAlignment(), 4),
+                               Ld->getMemOperand()->getFlags());
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex >= 0) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
+
+    SDValue LoSt =
+        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
+                     St->getAlignment(), St->getMemOperand()->getFlags());
+    SDValue HiSt = DAG.getStore(
+        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
+        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+  }
+
+  // This is similar to the above case, but here we handle a scalar 64-bit
+  // integer store that is extracted from a vector on a 32-bit target.
+  // If we have SSE2, then we can treat it like a floating-point double
+  // to get past legalization. The execution dependencies fixup pass will
+  // choose the optimal machine instruction for the store if this really is
+  // an integer or v2f32 rather than an f64.
+  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
+      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue OldExtract = St->getOperand(1);
+    SDValue ExtOp0 = OldExtract.getOperand(0);
+    unsigned VecSize = ExtOp0.getValueSizeInBits();
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
+    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
+    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                                     BitCast, OldExtract.getOperand(1));
+    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  return SDValue();
+}
+
+/// Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  MVT VT = LHS.getSimpleValueType();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for horizontal add/sub");
+
+  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+  // operate independently on 128-bit lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  assert((NumLaneElts % 2 == 0) &&
+         "Vector type should have an even number of elements in each lane");
+  unsigned HalfLaneElts = NumLaneElts/2;
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 16> LMask(NumElts);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (!LHS.getOperand(0).isUndef())
+      A = LHS.getOperand(0);
+    if (!LHS.getOperand(1).isUndef())
+      B = LHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), LMask.begin());
+  } else {
+    if (!LHS.isUndef())
+      A = LHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 16> RMask(NumElts);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (!RHS.getOperand(0).isUndef())
+      C = RHS.getOperand(0);
+    if (!RHS.getOperand(1).isUndef())
+      D = RHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), RMask.begin());
+  } else {
+    if (!RHS.isUndef())
+      C = RHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    ShuffleVectorSDNode::commuteMask(RMask);
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+      // Ignore any UNDEF components.
+      if (LIdx < 0 || RIdx < 0 ||
+          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+        continue;
+
+      // Check that successive elements are being operated on.  If not, this is
+      // not a horizontal operation.
+      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      if (!(LIdx == Index && RIdx == Index + 1) &&
+          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+        return false;
+    }
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// Do target-specific dag combines on floating-point adds/subs.
+static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  bool IsFadd = N->getOpcode() == ISD::FADD;
+  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
+
+  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      isHorizontalBinOp(LHS, RHS, IsFadd)) {
+    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
+    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+  }
+  return SDValue();
+}
+
+/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+                             Regs[0].getValueType() == MVT::v2i64));
+  EVT OutVT = N->getValueType(0);
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InVT = Regs[0].getValueType();
+  EVT InSVT = InVT.getVectorElementType();
+  SDLoc DL(N);
+
+  // First, use mask to unset all bits that won't appear in the result.
+  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+         "OutSVT can only be either i8 or i16.");
+  APInt Mask =
+      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
+  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
+  for (auto &Reg : Regs)
+    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
+
+  MVT UnpackedVT, PackedVT;
+  if (OutSVT == MVT::i8) {
+    UnpackedVT = MVT::v8i16;
+    PackedVT = MVT::v16i8;
+  } else {
+    UnpackedVT = MVT::v4i32;
+    PackedVT = MVT::v8i16;
+  }
+
+  // In each iteration, truncate the type by a half size.
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
+  // then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with each
+/// element that is extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in less instructions in the cases below.
+  if (Subtarget.hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
+  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
+
+  for (unsigned i = 0; i < RegNum; i++)
+    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
+                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+  // truncate 2 x v4i32 to v8i16.
+  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+/// This function transforms vector truncation of 'all or none' bits values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
+                                               SelectionDAG &DAG,
+                                               const X86Subtarget &Subtarget) {
+  // Requires SSE2 but AVX512 has fast truncate.
+  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+    return SDValue();
+
+  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT SVT = VT.getScalarType();
+
+  MVT InVT = In.getValueType().getSimpleVT();
+  MVT InSVT = InVT.getScalarType();
+
+  // Use PACKSS if the input is a splatted sign bit.
+  // e.g. Comparison result, sext_in_reg, etc.
+  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+  if (NumSignBits != InSVT.getSizeInBits())
+    return SDValue();
+
+  // Check we have a truncation suited for PACKSS.
+  if (!VT.is128BitVector() && !VT.is256BitVector())
+    return SDValue();
+  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
+    return SDValue();
+  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
+    return SDValue();
+
+  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
+}
+
+static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Try to detect AVG pattern first.
+  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
+    return Avg;
+
+  // The bitcast source is a direct mmx result.
+  // Detect bitcasts between i32 to x86mmx
+  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
+    SDValue BCSrc = Src.getOperand(0);
+    if (BCSrc.getValueType() == MVT::x86mmx)
+      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
+  }
+
+  // Try to truncate extended sign bits with PACKSS.
+  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
+    return V;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Returns the negated value if the node \p N flips sign of FP value.
+///
+/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
+/// AVX512F does not have FXOR, so FNEG is lowered as
+/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
+/// In this case we go though all bitcasts.
+static SDValue isFNEG(SDNode *N) {
+  if (N->getOpcode() == ISD::FNEG)
+    return N->getOperand(0);
+
+  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
+  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
+    return SDValue();
+
+  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
+  if (!Op1.getValueType().isFloatingPoint())
+    return SDValue();
+
+  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
+
+  unsigned EltBits = Op1.getScalarValueSizeInBits();
+  auto isSignBitValue = [&](const ConstantFP *C) {
+    return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+  };
+
+  // There is more than one way to represent the same constant on
+  // the different X86 targets. The type of the node may also depend on size.
+  //  - load scalar value and broadcast
+  //  - BUILD_VECTOR node
+  //  - load from a constant pool.
+  // We check all variants here.
+  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
+    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
+      if (isSignBitValue(cast<ConstantFP>(C)))
+        return Op0;
+
+  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
+    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
+      if (isSignBitValue(CN->getConstantFPValue()))
+        return Op0;
+
+  } else if (auto *C = getTargetConstantFromNode(Op1)) {
+    if (C->getType()->isVectorTy()) {
+      if (auto *SplatV = C->getSplatValue())
+        if (isSignBitValue(cast<ConstantFP>(SplatV)))
+          return Op0;
+    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
+      if (isSignBitValue(FPConst))
+        return Op0;
+  }
+  return SDValue();
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  EVT OrigVT = N->getValueType(0);
+  SDValue Arg = isFNEG(N);
+  assert(Arg.getNode() && "N is expected to be an FNEG node");
+
+  EVT VT = Arg.getValueType();
+  EVT SVT = VT.getScalarType();
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once it becomes available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                                  Arg.getOperand(1), Zero);
+    return DAG.getBitcast(OrigVT, NewNode);
+  }
+
+  // If we're negating an FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  unsigned NewOpcode = 0;
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:        NewOpcode = X86ISD::FNMSUB;       break;
+    case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
+    case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
+    case X86ISD::FNMSUB:       NewOpcode = X86ISD::FMADD;        break;
+    case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
+    case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
+    case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
+    case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
+    // We can't handle scalar intrinsic node here because it would only
+    // invert one element and not the whole vector. But we could try to handle
+    // a negation of the lower element only.
+    }
+  }
+  if (NewOpcode)
+    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
+                                              Arg.getNode()->ops()));
+
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  // If we have integer vector types available, use the integer opcodes.
+  if (VT.isVector() && Subtarget.hasSSE2()) {
+    SDLoc dl(N);
+
+    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+
+    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
+    unsigned IntOpcode;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected FP logic op");
+    case X86ISD::FOR: IntOpcode = ISD::OR; break;
+    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+    case X86ISD::FAND: IntOpcode = ISD::AND; break;
+    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
+    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+    return DAG.getBitcast(VT, IntOp);
+  }
+  return SDValue();
+}
+
+static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
+                          TargetLowering::DAGCombinerInfo &DCI,
+                          const X86Subtarget &Subtarget) {
+  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
+    return Cmp;
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
+    return RV;
+
+  if (Subtarget.hasCMov())
+    if (SDValue RV = combineIntegerAbs(N, DAG))
+      return RV;
+
+  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+    return FPLogic;
+
+  if (isFNEG(N))
+    return combineFneg(N, DAG, Subtarget);
+  return SDValue();
+}
+
+
+static bool isNullFPScalarOrVectorConst(SDValue V) {
+  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
+}
+
+/// If a value is a scalar FP zero or a vector FP zero (potentially including
+/// undefined elements), return a zero constant that may be used to fold away
+/// that value. In the case of a vector, the returned constant will not contain
+/// undefined elements even if the input parameter does. This makes it suitable
+/// to be used as a replacement operand with operations (eg, bitwise-and) where
+/// an undef should not propagate.
+static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  if (!isNullFPScalarOrVectorConst(V))
+    return SDValue();
+
+  if (V.getValueType().isVector())
+    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
+
+  return V;
+}
+
+static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
+  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
+        (VT == MVT::f64 && Subtarget.hasSSE2())))
+    return SDValue();
+
+  auto isAllOnesConstantFP = [](SDValue V) {
+    auto *C = dyn_cast<ConstantFPSDNode>(V);
+    return C && C->getConstantFPValue()->isAllOnesValue();
+  };
+
+  // fand (fxor X, -1), Y --> fandn X, Y
+  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
+
+  // fand X, (fxor Y, -1) --> fandn Y, X
+  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
+    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
+/// Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
+                           const X86Subtarget &Subtarget) {
+  // FAND(0.0, x) -> 0.0
+  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
+    return V;
+
+  // FAND(x, 0.0) -> 0.0
+  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+    return V;
+
+  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
+    return V;
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FANDN nodes.
+static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  // FANDN(0.0, x) -> x
+  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+    return N->getOperand(1);
+
+  // FANDN(x, 0.0) -> 0.0
+  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
+    return V;
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
+static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
+  // F[X]OR(0.0, x) -> x
+  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
+    return N->getOperand(1);
+
+  // F[X]OR(x, 0.0) -> x
+  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
+    return N->getOperand(0);
+
+  if (isFNEG(N))
+    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
+      return NewVal;
+
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
+static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform optimizations if UnsafeMath is used.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and FMAXC, which are Commutative operations.
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  if (Subtarget.useSoftFloat())
+    return SDValue();
+
+  // TODO: Check for global or instruction-level "nnan". In that case, we
+  //       should be able to lower to FMAX/FMIN alone.
+  // TODO: If an operand is already known to be a NaN or not a NaN, this
+  //       should be an optional swap and FMAX/FMIN.
+
+  EVT VT = N->getValueType(0);
+  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
+        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
+    return SDValue();
+
+  // This takes at least 3 instructions, so favor a library call when operating
+  // on a scalar and minimizing code size.
+  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
+      DAG.getDataLayout(), *DAG.getContext(), VT);
+
+  // There are 4 possibilities involving NaN inputs, and these are the required
+  // outputs:
+  //                   Op1
+  //               Num     NaN
+  //            ----------------
+  //       Num  |  Max  |  Op0 |
+  // Op0        ----------------
+  //       NaN  |  Op1  |  NaN |
+  //            ----------------
+  //
+  // The SSE FP max/min instructions were not designed for this case, but rather
+  // to implement:
+  //   Min = Op1 < Op0 ? Op1 : Op0
+  //   Max = Op1 > Op0 ? Op1 : Op0
+  //
+  // So they always return Op0 if either input is a NaN. However, we can still
+  // use those instructions for fmaxnum by selecting away a NaN input.
+
+  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
+  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
+  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+
+  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
+  // are NaN, the NaN value of Op1 is the result.
+  auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+  return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
+static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
+                         TargetLowering::DAGCombinerInfo &DCI) {
+  // BT ignores high bits in the bit index operand.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
+
+static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+  SDLoc dl(N);
+
+  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
+  // both SSE and AVX2 since there is no sign-extended shift right
+  // operation on a vector with 64-bit elements.
+  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
+  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)) {
+    SDValue N00 = N0.getOperand(0);
+
+    // EXTLOAD has a better solution on AVX2,
+    // it may be replaced with X86ISD::VSEXT node.
+    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
+      if (!ISD::isNormalLoad(N00.getNode()))
+        return SDValue();
+
+    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+                                  N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+    }
+  }
+  return SDValue();
+}
+
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
+/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
+/// opportunities to combine math ops, use an LEA, or use a complex addressing
+/// mode. This can eliminate extend, add, and shift instructions.
+static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
+      Ext->getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  // TODO: This should be valid for other integer types.
+  EVT VT = Ext->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  SDValue Add = Ext->getOperand(0);
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
+  bool NSW = Add->getFlags()->hasNoSignedWrap();
+  bool NUW = Add->getFlags()->hasNoUnsignedWrap();
+
+  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
+  // into the 'zext'
+  if ((Sext && !NSW) || (!Sext && !NUW))
+    return SDValue();
+
+  // Having a constant operand to the 'add' ensures that we are not increasing
+  // the instruction count because the constant is extended for free below.
+  // A constant operand can also become the displacement field of an LEA.
+  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+  if (!AddOp1)
+    return SDValue();
+
+  // Don't make the 'add' bigger if there's no hope of combining it with some
+  // other 'add' or 'shl' instruction.
+  // TODO: It may be profitable to generate simpler LEA instructions in place
+  // of single 'add' instructions, but the cost model for selecting an LEA
+  // currently has a high threshold.
+  bool HasLEAPotential = false;
+  for (auto *User : Ext->uses()) {
+    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+      HasLEAPotential = true;
+      break;
+    }
+  }
+  if (!HasLEAPotential)
+    return SDValue();
+
+  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
+  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
+  SDValue AddOp0 = Add.getOperand(0);
+  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
+  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+  // The wider add is guaranteed to not wrap because both operands are
+  // sign-extended.
+  SDNodeFlags Flags;
+  Flags.setNoSignedWrap(NSW);
+  Flags.setNoUnsignedWrap(NUW);
+  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
+}
+
+/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
+/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
+/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
+/// extends from AH (which we otherwise need to do contortions to access).
+static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  auto OpcodeN = N->getOpcode();
+  auto OpcodeN0 = N0.getOpcode();
+  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
+        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  EVT InVT = N0.getValueType();
+  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
+    return SDValue();
+
+  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
+                                               : X86ISD::UDIVREM8_ZEXT_HREG;
+  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
+                          N0.getOperand(1));
+  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+  return R.getValue(1);
+}
+
+/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
+/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
+/// with UNDEFs) of the input to vectors of the same size as the target type
+/// which then extends the lowest elements.
+static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const X86Subtarget &Subtarget) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
+    return SDValue();
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  EVT InVT = N0.getValueType();
+  EVT InSVT = InVT.getScalarType();
+
+  // Input type must be a vector and we must be extending legal integer types.
+  if (!VT.isVector())
+    return SDValue();
+  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
+    return SDValue();
+  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+    return SDValue();
+
+  // On AVX2+ targets, if the input/output types are both legal then we will be
+  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
+  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
+    EVT InVT = N.getValueType();
+    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+                                 Size / InVT.getScalarSizeInBits());
+    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
+                                  DAG.getUNDEF(InVT));
+    Opnds[0] = N;
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+  };
+
+  // If target-size is less than 128-bits, extend to a type that would extend
+  // to 128 bits, extend that and extract the original target vector.
+  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
+    unsigned Scale = 128 / VT.getSizeInBits();
+    EVT ExVT =
+        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
+    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
+    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
+  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
+  // Also use this if we don't have SSE41 to allow the legalizer do its job.
+  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
+      (VT.is256BitVector() && Subtarget.hasInt256()) ||
+      (VT.is512BitVector() && Subtarget.hasAVX512())) {
+    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
+    return Opcode == ISD::SIGN_EXTEND
+               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
+               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+  }
+
+  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
+    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
+    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
+    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+    SmallVector<SDValue, 8> Opnds;
+    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
+      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+                                   DAG.getIntPtrConstant(Offset, DL));
+      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
+      SrcVec = Opcode == ISD::SIGN_EXTEND
+                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
+                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+      Opnds.push_back(SrcVec);
+    }
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+  };
+
+  // On pre-AVX2 targets, split into 128-bit nodes of
+  // ISD::*_EXTEND_VECTOR_INREG.
+  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
+    return SplitAndExtendInReg(128);
+
+  // On pre-AVX512 targets, split into 256-bit nodes of
+  // ISD::*_EXTEND_VECTOR_INREG.
+  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
+    return SplitAndExtendInReg(256);
+
+  return SDValue();
+}
+
+static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = N0.getValueType();
+  SDLoc DL(N);
+
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;
+
+  if (!DCI.isBeforeLegalizeOps()) {
+    if (InVT == MVT::i1) {
+      SDValue Zero = DAG.getConstant(0, DL, VT);
+      SDValue AllOnes =
+          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+      return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+    }
+    return SDValue();
+  }
+
+  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+    return V;
+
+  if (Subtarget.hasAVX() && VT.is256BitVector())
+    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+      return R;
+
+  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
+    return NewAdd;
+
+  return SDValue();
+}
+
+static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  auto invertIfNegative = [](SDValue &V) {
+    if (SDValue NegVal = isFNEG(V.getNode())) {
+      V = NegVal;
+      return true;
+    }
+    return false;
+  };
+
+  // Do not convert the passthru input of scalar intrinsics.
+  // FIXME: We could allow negations of the lower element only.
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegB = invertIfNegative(B);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+
+  // Negative multiplication when NegA xor NegB
+  bool NegMul = (NegA != NegB);
+
+  unsigned NewOpcode;
+  if (!NegMul)
+    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+
+  if (N->getOpcode() == X86ISD::FMADD_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
+    switch (NewOpcode) {
+    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
+    }
+  } else {
+    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
+           "Unexpected opcode!");
+    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+  }
+
+  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+}
+
+static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           const X86Subtarget &Subtarget) {
+  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
+  //           (and (i32 x86isd::setcc_carry), 1)
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() == ISD::AND &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      if (!isOneConstant(N0.getOperand(1)))
+        return SDValue();
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, dl, VT));
+    }
+  }
+
+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, dl, VT));
+    }
+  }
+
+  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
+    return V;
+
+  if (VT.is256BitVector())
+    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
+      return R;
+
+  if (SDValue DivRem8 = getDivRem8(N, DAG))
+    return DivRem8;
+
+  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
+    return NewAdd;
+
+  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
+    return R;
+
+  return SDValue();
+}
+
+/// Optimize x == -y --> x+y == 0
+///          x != -y --> x+y != 0
+static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
+                            const X86Subtarget &Subtarget) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
+                                 LHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
+      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
+                                 RHS.getOperand(1));
+      return DAG.getSetCC(DL, N->getValueType(0), addV,
+                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    }
+
+  if (VT.getScalarType() == MVT::i1 &&
+      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
+    bool IsSEXT0 =
+        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!IsSEXT0 || !IsVZero1) {
+      // Swap the operands and update the condition code.
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+
+      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+    }
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() &&
+             "Uexpected operand type");
+      if (CC == ISD::SETGT)
+        return DAG.getConstant(0, DL, VT);
+      if (CC == ISD::SETLE)
+        return DAG.getConstant(1, DL, VT);
+      if (CC == ISD::SETEQ || CC == ISD::SETGE)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+             "Unexpected condition code!");
+      return LHS.getOperand(0);
+    }
+  }
+
+  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
+  // to avoid scalarization via legalization because v4i32 is not a legal type.
+  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
+      LHS.getValueType() == MVT::v4f32)
+    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
+
+  return SDValue();
+}
+
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG my be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
+  return SDValue();
+}
+
+// Helper function of performSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
+                               SelectionDAG &DAG, MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                   EFLAGS),
+                       DAG.getConstant(1, DL, VT));
+  assert (VT == MVT::i1 && "Unexpected type for SECCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                 EFLAGS));
+}
+
+// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
+
+  // Try to simplify the EFLAGS and condition code operands.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
+    return getSETCC(CC, Flags, DL, DAG);
+
+  return SDValue();
+}
+
+/// Optimize branch condition evaluation.
+static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  // Try to simplify the EFLAGS and condition code operands.
+  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+  // RAUW them under us.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
+    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+                       N->getOperand(1), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
+                                                  SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getBitcast(VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
+static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Op0.getValueType();
+  EVT InSVT = InVT.getScalarType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
+  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
+  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
+    SDLoc dl(N);
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
+    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+
+    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
+      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
+
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+  }
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(Op0))
+    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+
+  return SDValue();
+}
+
+static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
+    return Res;
+
+  // Now move on to more general possibilities.
+  SDValue Op0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT InVT = Op0.getValueType();
+  EVT InSVT = InVT.getScalarType();
+
+  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
+  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
+  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
+  if (InVT.isVector() &&
+      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
+       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
+    SDLoc dl(N);
+    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                 InVT.getVectorNumElements());
+    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
+  }
+
+  // Without AVX512DQ we only support i64 to float scalar conversion. For both
+  // vectors and scalars, see if we know that the upper bits are all the sign
+  // bit, in which case we can truncate the input to i32 and convert from that.
+  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
+    unsigned BitWidth = InVT.getScalarSizeInBits();
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
+    if (NumSignBits >= (BitWidth - 31)) {
+      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+      if (InVT.isVector())
+        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
+                                   InVT.getVectorNumElements());
+      SDLoc dl(N);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+    }
+  }
+
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
+  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT LdVT = Ld->getValueType(0);
+
+    // This transformation is not supported if the result type is f16 or f128.
+    if (VT == MVT::f16 || VT == MVT::f128)
+      return SDValue();
+
+    if (!Ld->isVolatile() && !VT.isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !Subtarget.is64Bit() && LdVT == MVT::i64) {
+      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
+                          X86TargetLowering::DAGCombinerInfo &DCI) {
+  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+  // the result is either zero or one (depending on the input carry bit).
+  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+  if (X86::isZeroNode(N->getOperand(0)) &&
+      X86::isZeroNode(N->getOperand(1)) &&
+      // We don't have a good way to replace an EFLAGS use, so only do this when
+      // dead right now.
+      SDValue(N, 1).use_empty()) {
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
+    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+                                           DAG.getConstant(X86::COND_B, DL,
+                                                           MVT::i8),
+                                           N->getOperand(2)),
+                               DAG.getConstant(1, DL, VT));
+    return DCI.CombineTo(N, Res1, CarryOut);
+  }
+
+  return SDValue();
+}
+
+/// fold (add Y, (sete  X, 0)) -> adc  0, Y
+///      (add Y, (setne X, 0)) -> sbb -1, Y
+///      (sub (sete  X, 0), Y) -> sbb  0, Y
+///      (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // Look through ZExts.
+  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+    return SDValue();
+
+  SDValue SetCC = Ext.getOperand(0);
+  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SDValue Cmp = SetCC.getOperand(1);
+  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+      !X86::isZeroNode(Cmp.getOperand(1)) ||
+      !Cmp.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue CmpOp0 = Cmp.getOperand(0);
+  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+                               DAG.getConstant(1, DL, CmpOp0.getValueType()));
+
+  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+  if (CC == X86::COND_NE)
+    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+                       DL, OtherVal.getValueType(), OtherVal,
+                       DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
+                       NewCmp);
+  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+                     DL, OtherVal.getValueType(), OtherVal,
+                     DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+}
+
+static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  if (!VT.isVector() || !VT.isSimple() ||
+      !(VT.getVectorElementType() == MVT::i32))
+    return SDValue();
+
+  unsigned RegSize = 128;
+  if (Subtarget.hasBWI())
+    RegSize = 512;
+  else if (Subtarget.hasAVX2())
+    RegSize = 256;
+
+  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getSizeInBits() / 4 > RegSize)
+    return SDValue();
+
+  // We know N is a reduction add, which means one of its operands is a phi.
+  // To match SAD, we need the other operand to be a vector select.
+  SDValue SelectOp, Phi;
+  if (Op0.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op0;
+    Phi = Op1;
+  } else if (Op1.getOpcode() == ISD::VSELECT) {
+    SelectOp = Op1;
+    Phi = Op0;
+  } else
+    return SDValue();
+
+  // Check whether we have an abs-diff pattern feeding into the select.
+  if(!detectZextAbsDiff(SelectOp, Op0, Op1))
+    return SDValue();
+
+  // SAD pattern detected. Now build a SAD instruction and an addition for
+  // reduction. Note that the number of elements of the result of SAD is less
+  // than the number of elements of its input. Therefore, we could only update
+  // part of elements in the reduction vector.
+  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+
+  // The output of PSADBW is a vector of i64.
+  // We need to turn the vector of i64 into a vector of i32.
+  // If the reduction vector is at least as wide as the psadbw result, just
+  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
+  // anyway.
+  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
+  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
+    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+  else
+    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
+
+  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+    // Update part of elements of the reduction vector. This is done by first
+    // extracting a sub-vector from it, updating this sub-vector, and inserting
+    // it back.
+    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
+                                 DAG.getIntPtrConstant(0, DL));
+    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
+                       DAG.getIntPtrConstant(0, DL));
+  } else
+    return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+}
+
+static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+  if (Flags->hasVectorReduction()) {
+    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
+      return Sad;
+  }
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+                          const X86Subtarget &Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is a XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
+    }
+  }
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  EVT VT = N->getValueType(0);
+  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
+  SDLoc DL(N);
+  unsigned Opcode = N->getOpcode();
+  MVT VT = N->getSimpleValueType(0);
+  MVT SVT = VT.getVectorElementType();
+  SDValue Op = N->getOperand(0);
+  MVT OpVT = Op.getSimpleValueType();
+  MVT OpEltVT = OpVT.getVectorElementType();
+  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+
+  // Perform any constant folding.
+  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    unsigned NumDstElts = VT.getVectorNumElements();
+    SmallBitVector Undefs(NumDstElts, false);
+    SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
+    for (unsigned i = 0; i != NumDstElts; ++i) {
+      SDValue OpElt = Op.getOperand(i);
+      if (OpElt.getOpcode() == ISD::UNDEF) {
+        Undefs[i] = true;
+        continue;
+      }
+      APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
+      Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
+                                        : Cst.sextOrTrunc(SVT.getSizeInBits());
+    }
+    return getConstVector(Vals, Undefs, VT, DAG, DL);
+  }
+
+  // (vzext (bitcast (vzext (x)) -> (vzext x)
+  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
+  SDValue V = peekThroughBitcasts(Op);
+  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
+    MVT InnerVT = V.getSimpleValueType();
+    MVT InnerEltVT = InnerVT.getVectorElementType();
+
+    // If the element sizes match exactly, we can just do one larger vzext. This
+    // is always an exact type match as vzext operates on integer types.
+    if (OpEltVT == InnerEltVT) {
+      assert(OpVT == InnerVT && "Types must match for vzext!");
+      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+    }
+
+    // The only other way we can combine them is if only a single element of the
+    // inner vzext is used in the input to the outer vzext.
+    if (InnerEltVT.getSizeInBits() < InputBits)
+      return SDValue();
+
+    // In this case, the inner vzext is completely dead because we're going to
+    // only look at bits inside of the low element. Just do the outer vzext on
+    // a bitcast of the input to the inner.
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
+  }
+
+  // Check if we can bypass extracting and re-inserting an element of an input
+  // vector. Essentially:
+  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+  // TODO: Add X86ISD::VSEXT support
+  if (Opcode == X86ISD::VZEXT &&
+      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
+    SDValue ExtractedV = V.getOperand(0);
+    SDValue OrigV = ExtractedV.getOperand(0);
+    if (isNullConstant(ExtractedV.getOperand(1))) {
+        MVT OrigVT = OrigV.getSimpleValueType();
+        // Extract a subvector if necessary...
+        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+                                    OrigVT.getVectorNumElements() / Ratio);
+          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+                              DAG.getIntPtrConstant(0, DL));
+        }
+        Op = DAG.getBitcast(OpVT, OrigV);
+        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+      }
+  }
+
+  return SDValue();
+}
+
+/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
+static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  SDValue Chain = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  MVT VT = RHS.getSimpleValueType();
+  SDLoc DL(N);
+
+  auto *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue() != 1)
+    return SDValue();
+
+  RHS = DAG.getConstant(-1, DL, VT);
+  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
+                                 DAG.getVTList(MVT::i32, MVT::Other),
+                                 {Chain, LHS, RHS}, VT, MMO);
+}
+
+// TEST (AND a, b) ,(AND a, b) -> TEST a, b
+static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  return DAG.getNode(X86ISD::TESTM, DL, VT,
+                     Op0->getOperand(0), Op0->getOperand(1));
+}
+
+static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+
+  if (N->getOperand(0) == N->getOperand(1)) {
+    if (N->getOpcode() == X86ISD::PCMPEQ)
+      return getOnesVector(VT, Subtarget, DAG, DL);
+    if (N->getOpcode() == X86ISD::PCMPGT)
+      return getZeroVector(VT, Subtarget, DAG, DL);
+  }
+
+  return SDValue();
+}
+
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+  case ISD::VSELECT:
+  case ISD::SELECT:
+  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
+  case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
+  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
+  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
+  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
+  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
+  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
+  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
+  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
+  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
+  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
+  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
+  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
+  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::FADD:
+  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
+  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
+  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
+  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
+  case X86ISD::FXOR:
+  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
+  case X86ISD::BT:          return combineBT(N, DAG, DCI);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
+  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
+  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI:       return combineVectorShift(N, DAG, DCI, Subtarget);
+  case X86ISD::VSEXT:
+  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::INSERTPS:
+  case X86ISD::PALIGNR:
+  case X86ISD::VSHLDQ:
+  case X86ISD::VSRLDQ:
+  case X86ISD::BLENDI:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPPERM:
+  case X86ISD::VPERMI:
+  case X86ISD::VPERMV:
+  case X86ISD::VPERMV3:
+  case X86ISD::VPERMIV3:
+  case X86ISD::VPERMIL2:
+  case X86ISD::VPERMILPI:
+  case X86ISD::VPERMILPV:
+  case X86ISD::VPERM2X128:
+  case X86ISD::VZEXT_MOVL:
+  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
+  case X86ISD::FMADD:
+  case X86ISD::FMADD_RND:
+  case X86ISD::FMADDS1_RND:
+  case X86ISD::FMADDS3_RND:
+  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
+  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
+  case X86ISD::TESTM:       return combineTestM(N, DAG);
+  case X86ISD::PCMPEQ:
+  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
+  }
+
+  return SDValue();
+}
+
+/// Return true if the target has native support for the specified value type
+/// and it is 'desirable' to use the type for the given node type. e.g. On x86
+/// i16 is legal, but undesirable since i16 instruction encodings are longer and
+/// some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
+/// This method query the target whether it is beneficial for dag combiner to
+/// promote the specified node. If true, it should return the desired promotion
+/// type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Promote = true;
+    break;
+  case ISD::SHL:
+  case ISD::SRL: {
+    SDValue N0 = Op.getOperand(0);
+    // Look out for (store (shl (load), x)).
+    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+      return false;
+    Promote = true;
+    break;
+  }
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    Commute = true;
+    LLVM_FALLTHROUGH;
+  case ISD::SUB: {
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    if (!Commute && MayFoldLoad(N1))
+      return false;
+    // Avoid disabling potential load folding opportunities.
+    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+      return false;
+    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+      return false;
+    Promote = true;
+  }
+  }
+
+  PVT = MVT::i32;
+  return Promote;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Helper to match a string separated by whitespace.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
+
+  for (StringRef Piece : Pieces) {
+    if (!S.startswith(Piece)) // Check if the piece matches.
+      return false;
+
+    S = S.substr(Piece.size());
+    StringRef::size_type Pos = S.find_first_not_of(" \t");
+    if (Pos == 0) // We matched a prefix.
+      return false;
+
+    S = S.substr(Pos);
+  }
+
+  return S.empty();
+}
+
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+      if (AsmPieces.size() == 3)
+        return true;
+      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+  const std::string &AsmStr = IA->getAsmString();
+
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
+  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    // FIXME: this should verify that we are targeting a 486 or better.  If not,
+    // we will turn this bswap into something that will be lowered to logical
+    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
+    // lower so don't worry about this.
+    // bswap $0
+    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
+      // No need to check constraints, nothing other than the equivalent of
+      // "=r,0" would be valid here.
+      return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
+    if (CI->getType()->isIntegerTy(16) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
+      AsmPieces.clear();
+      StringRef ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  case 3:
+    if (CI->getType()->isIntegerTy(32) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
+      AsmPieces.clear();
+      StringRef ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    if (CI->getType()->isIntegerTy(64)) {
+      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+      if (Constraints.size() >= 2 &&
+          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
+        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
+          return IntrinsicLowering::LowerToByteSwap(CI);
+      }
+    }
+    break;
+  }
+  return false;
+}
+
+/// Given a constraint letter, return the type of constraint for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'R':
+    case 'q':
+    case 'Q':
+    case 'f':
+    case 't':
+    case 'u':
+    case 'y':
+    case 'x':
+    case 'v':
+    case 'Y':
+    case 'l':
+      return C_RegisterClass;
+    case 'k': // AVX512 masking registers.
+    case 'a':
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'S':
+    case 'D':
+    case 'A':
+      return C_Register;
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'G':
+    case 'C':
+    case 'e':
+    case 'Z':
+      return C_Other;
+    default:
+      break;
+    }
+  }
+  else if (Constraint.size() == 2) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'Y':
+      switch (Constraint[1]) {
+      default:
+        break;
+      case 'k':
+        return C_Register;
+      }
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+  X86TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && Subtarget.hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'Y':
+    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
+    if (constraint[1] == 'k') {
+      // Support for 'Yk' (similarly to the 'k' variant below).
+      weight = CW_SpecificReg;
+      break;
+    }
+  // Else fall through (handle "Y" constraint).
+    LLVM_FALLTHROUGH;
+  case 'v':
+    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
+      weight = CW_Register;
+    LLVM_FALLTHROUGH;
+  case 'x':
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
+      weight = CW_Register;
+    break;
+  case 'k':
+    // Enable conditional vector operations using %k<#> registers.
+    weight = CW_SpecificReg;
+    break;
+  case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':
+    if (isa<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
+    if (Subtarget.hasSSE2())
+      return "Y";
+    if (Subtarget.hasSSE1())
+      return "x";
+  }
+
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 31) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'J':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 63) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'K':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<8>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'L':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
+        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'M':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 3) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'N':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 255) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'O':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 127) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'e': {
+    // 32-bit signed value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getSExtValue())) {
+        // Widen to 64 bits here to get it sign extended.
+        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
+        break;
+      }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    }
+    return;
+  }
+  case 'Z': {
+    // 32-bit unsigned value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getZExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+                                       Op.getValueType());
+        break;
+      }
+    }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    return;
+  }
+  case 'i': {
+    // Literal immediates are always ok.
+    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+      // Widen to 64 bits here to get it sign extended.
+      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
+      break;
+    }
+
+    // In any sort of PIC mode addresses need to be computed at runtime by
+    // adding in a register or some sort of table lookup.  These can't
+    // be used as immediates.
+    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
+      return;
+
+    // If we are in non-pic codegen mode, we allow the address of a global (with
+    // an optional displacement) to be used with 'i'.
+    GlobalAddressSDNode *GA = nullptr;
+    int64_t Offset = 0;
+
+    // Match either (GA), (GA+C), (GA+C1+C2), etc.
+    while (1) {
+      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+        Offset += GA->getOffset();
+        break;
+      } else if (Op.getOpcode() == ISD::ADD) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      } else if (Op.getOpcode() == ISD::SUB) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += -C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      }
+
+      // Otherwise, this isn't something we can handle, reject it.
+      return;
+    }
+
+    const GlobalValue *GV = GA->getGlobal();
+    // If we require an extra load to get this address, as in PIC mode, we
+    // can't accept it.
+    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
+      return;
+
+    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
+                                        GA->getValueType(0), Offset);
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+/// Check if \p RC is a general purpose register class.
+/// I.e., GR* or one of their variant.
+static bool isGRClass(const TargetRegisterClass &RC) {
+  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
+         RC.hasSuperClassEq(&X86::GR16RegClass) ||
+         RC.hasSuperClassEq(&X86::GR32RegClass) ||
+         RC.hasSuperClassEq(&X86::GR64RegClass) ||
+         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
+}
+
+/// Check if \p RC is a vector register class.
+/// I.e., FR* / VR* or one of their variant.
+static bool isFRClass(const TargetRegisterClass &RC) {
+  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
+         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
+         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
+         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
+         RC.hasSuperClassEq(&X86::VR512RegClass);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                StringRef Constraint,
+                                                MVT VT) const {
+  // First, see if this is a constraint that directly corresponds to an LLVM
+  // register class.
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+      // TODO: Slight differences here in allocation order and leaving
+      // RIP in the class. Do they matter any more here than they do
+      // in the normal allocation?
+    case 'k':
+      if (Subtarget.hasAVX512()) {
+        //  Only supported in AVX512 or later.
+        switch (VT.SimpleTy) {
+        default: break;
+        case MVT::i32:
+          return std::make_pair(0U, &X86::VK32RegClass);
+        case MVT::i16:
+          return std::make_pair(0U, &X86::VK16RegClass);
+        case MVT::i8:
+          return std::make_pair(0U, &X86::VK8RegClass);
+        case MVT::i1:
+          return std::make_pair(0U, &X86::VK1RegClass);
+        case MVT::i64:
+          return std::make_pair(0U, &X86::VK64RegClass);
+        }
+      }
+      break;
+    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+      if (Subtarget.is64Bit()) {
+        if (VT == MVT::i32 || VT == MVT::f32)
+          return std::make_pair(0U, &X86::GR32RegClass);
+        if (VT == MVT::i16)
+          return std::make_pair(0U, &X86::GR16RegClass);
+        if (VT == MVT::i8 || VT == MVT::i1)
+          return std::make_pair(0U, &X86::GR8RegClass);
+        if (VT == MVT::i64 || VT == MVT::f64)
+          return std::make_pair(0U, &X86::GR64RegClass);
+        break;
+      }
+      // 32-bit fallthrough
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32 || VT == MVT::f32)
+        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+      break;
+    case 'r':   // GENERAL_REGS
+    case 'l':   // INDEX_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8RegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16RegClass);
+      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+        return std::make_pair(0U, &X86::GR32RegClass);
+      return std::make_pair(0U, &X86::GR64RegClass);
+    case 'R':   // LEGACY_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+      if (VT == MVT::i32 || !Subtarget.is64Bit())
+        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+    case 'f':  // FP Stack registers.
+      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+      // value to the correct fpstack register class.
+      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP32RegClass);
+      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP64RegClass);
+      return std::make_pair(0U, &X86::RFP80RegClass);
+    case 'y':   // MMX_REGS if MMX allowed.
+      if (!Subtarget.hasMMX()) break;
+      return std::make_pair(0U, &X86::VR64RegClass);
+    case 'Y':   // SSE_REGS if SSE2 allowed
+      if (!Subtarget.hasSSE2()) break;
+      LLVM_FALLTHROUGH;
+    case 'v':
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget.hasSSE1()) break;
+      bool VConstraint = (Constraint[0] == 'v');
+
+      switch (VT.SimpleTy) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
+          return std::make_pair(0U, &X86::FR32XRegClass);
+        return std::make_pair(0U, &X86::FR32RegClass);
+      case MVT::f64:
+      case MVT::i64:
+        if (VConstraint && Subtarget.hasVLX())
+          return std::make_pair(0U, &X86::FR64XRegClass);
+        return std::make_pair(0U, &X86::FR64RegClass);
+      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+      // Vector types.
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        if (VConstraint && Subtarget.hasVLX())
+          return std::make_pair(0U, &X86::VR128XRegClass);
+        return std::make_pair(0U, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        if (VConstraint && Subtarget.hasVLX())
+          return std::make_pair(0U, &X86::VR256XRegClass);
+        return std::make_pair(0U, &X86::VR256RegClass);
+      case MVT::v8f64:
+      case MVT::v16f32:
+      case MVT::v16i32:
+      case MVT::v8i64:
+        return std::make_pair(0U, &X86::VR512RegClass);
+      }
+      break;
+    }
+  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
+    switch (Constraint[1]) {
+    default:
+      break;
+    case 'k':
+      // This register class doesn't allocate k0 for masked vector operation.
+      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
+        switch (VT.SimpleTy) {
+        default: break;
+        case MVT::i32:
+          return std::make_pair(0U, &X86::VK32WMRegClass);
+        case MVT::i16:
+          return std::make_pair(0U, &X86::VK16WMRegClass);
+        case MVT::i8:
+          return std::make_pair(0U, &X86::VK8WMRegClass);
+        case MVT::i1:
+          return std::make_pair(0U, &X86::VK1WMRegClass);
+        case MVT::i64:
+          return std::make_pair(0U, &X86::VK64WMRegClass);
+        }
+      }
+      break;
+    }
+  }
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+
+  // Not found as a standard register?
+  if (!Res.second) {
+    // Map st(0) -> st(7) -> ST0
+    if (Constraint.size() == 7 && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 's' &&
+        tolower(Constraint[2]) == 't' &&
+        Constraint[3] == '(' &&
+        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+        Constraint[5] == ')' &&
+        Constraint[6] == '}') {
+
+      Res.first = X86::FP0+Constraint[4]-'0';
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // GCC allows "st(0)" to be called just plain "st".
+    if (StringRef("{st}").equals_lower(Constraint)) {
+      Res.first = X86::FP0;
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // flags -> EFLAGS
+    if (StringRef("{flags}").equals_lower(Constraint)) {
+      Res.first = X86::EFLAGS;
+      Res.second = &X86::CCRRegClass;
+      return Res;
+    }
+
+    // 'A' means EAX + EDX.
+    if (Constraint == "A") {
+      Res.first = X86::EAX;
+      Res.second = &X86::GR32_ADRegClass;
+      return Res;
+    }
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+  // turn into {ax},{dx}.
+  // MVT::Other is used to specify clobber names.
+  if (Res.second->hasType(VT) || VT == MVT::Other)
+    return Res;   // Correct type already, nothing to do.
+
+  // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
+  // return "eax". This should even work for things like getting 64bit integer
+  // registers when given an f64 type.
+  const TargetRegisterClass *Class = Res.second;
+  // The generic code will match the first register class that contains the
+  // given register. Thus, based on the ordering of the tablegened file,
+  // the "plain" GR classes might not come first.
+  // Therefore, use a helper method.
+  if (isGRClass(*Class)) {
+    unsigned Size = VT.getSizeInBits();
+    if (Size == 1) Size = 8;
+    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+    if (DestReg > 0) {
+      Res.first = DestReg;
+      Res.second = Size == 8 ? &X86::GR8RegClass
+                 : Size == 16 ? &X86::GR16RegClass
+                 : Size == 32 ? &X86::GR32RegClass
+                 : &X86::GR64RegClass;
+      assert(Res.second->contains(Res.first) && "Register in register class");
+    } else {
+      // No register found/type mismatch.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  } else if (isFRClass(*Class)) {
+    // Handle references to XMM physical registers that got mapped into the
+    // wrong class.  This can happen with constraints like {xmm0} where the
+    // target independent register mapper will just pick the first match it can
+    // find, ignoring the required type.
+
+    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
+    else {
+      // Type mismatch and not a clobber: Return an error;
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  }
+
+  return Res;
+}
+
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                            const AddrMode &AM, Type *Ty,
+                                            unsigned AS) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%drx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having less micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(DL, AM, Ty, AS))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
+
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  // Integer division on x86 is expensive. However, when aggressively optimizing
+  // for code size, we prefer to use a div instruction, as it is usually smaller
+  // than the alternative sequence.
+  // The exception to this is vector division. Since x86 doesn't have vector
+  // integer division, leaving the division as-is is a loss even in terms of
+  // size, because it will have to be scalarized, while the alternative code
+  // sequence can be performed in vector form.
+  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
+                                   Attribute::MinSize);
+  return OptSize && !VT.isVector();
+}
+
+void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  if (!Subtarget.is64Bit())
+    return;
+
+  // Update IsSplitCSR in X86MachineFunctionInfo.
+  X86MachineFunctionInfo *AFI =
+    Entry->getParent()->getInfo<X86MachineFunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}
+
+void X86TargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  MachineBasicBlock::iterator MBBI = Entry->begin();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (X86::GR64RegClass.contains(*I))
+      RC = &X86::GR64RegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions, it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(*I);
+
+    // Insert the copy-back instructions right before the terminator.
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+              TII->get(TargetOpcode::COPY), *I)
+          .addReg(NewVR);
+  }
+}
+
+bool X86TargetLowering::supportSwiftError() const {
+  return Subtarget.is64Bit();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 000000000000..37f9353042b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,1382 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+  class X86Subtarget;
+  class X86TargetMachine;
+
+  namespace X86ISD {
+    // X86 Specific DAG Nodes
+    enum NodeType : unsigned {
+      // Start the numbering where the builtin ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Bit scan forward.
+      BSF,
+      /// Bit scan reverse.
+      BSR,
+
+      /// Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      ///  Bitwise logical ANDNOT of floating point values. This
+      /// corresponds to X86::ANDNPS or X86::ANDNPD.
+      FANDN,
+
+      /// These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information.  In particular the
+      /// operands of these node are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      CALL,
+
+      /// This operation implements the lowering for readcyclecounter.
+      RDTSC_DAG,
+
+      /// X86 Read Time-Stamp Counter and Processor ID.
+      RDTSCP_DAG,
+
+      /// X86 Read Performance Monitoring Counters.
+      RDPMC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, COMI, UCOMI,
+
+      /// X86 bit-test instructions.
+      BT,
+
+      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+      /// operand, usually produced by a CMP instruction.
+      SETCC,
+
+      /// X86 Select
+      SELECT, SELECTS,
+
+      // Same as SETCC except it's materialized with a sbb and the value is all
+      // one's or all zero's.
+      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
+
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+      FSETCC,
+
+      /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+      /// with optional rounding mode.
+      FSETCCM, FSETCCM_RND,
+
+      /// X86 conditional moves. Operand 0 and operand 1 are the two values
+      /// to select from. Operand 2 is the condition code, and operand 3 is the
+      /// flag operand produced by a CMP or TEST instruction. It also writes a
+      /// flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 0 is the chain operand, operand
+      /// 1 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// Return from interrupt. Operand 0 is the number of bytes to pop.
+      IRET,
+
+      /// Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// A wrapper node for TargetConstantPool, TargetJumpTable,
+      /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+      /// MCSymbol and TargetBlockAddress.
+      Wrapper,
+
+      /// Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// Copies a 64-bit value from the low word of an XMM vector
+      /// to an MMX vector.  If you think this is too close to the previous
+      /// mnemonic, so do I; blame Intel.
+      MOVDQ2Q,
+
+      /// Copies a 32-bit value from the low word of a MMX
+      /// vector to a GPR.
+      MMX_MOVD2W,
+
+      /// Copies a GPR into the low 32-bit word of a MMX vector
+      /// and zero out the high word.
+      MMX_MOVW2D,
+
+      /// Extract an 8-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRB.
+      PEXTRB,
+
+      /// Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x floatvector.
+      INSERTPS,
+
+      /// Insert the lower 8-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRB.
+      PINSRB,
+
+      /// Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW, MMX_PINSRW,
+
+      /// Shuffle 16 8-bit values within a vector.
+      PSHUFB,
+
+      /// Compute Sum of Absolute Differences.
+      PSADBW,
+      /// Compute Double Block Packed Sum-Absolute-Differences
+      DBPSADBW,
+
+      /// Bitwise Logical AND NOT of Packed FP values.
+      ANDNP,
+
+      /// Blend where the selector is an immediate.
+      BLENDI,
+
+      /// Blend where the condition has been shrunk.
+      /// This is used to emphasize that the condition mask is
+      /// no more valid for generic VSELECT optimizations.
+      SHRUNKBLEND,
+
+      /// Combined add and sub on an FP vector.
+      ADDSUB,
+
+      //  FP vector ops with rounding mode.
+      FADD_RND,
+      FSUB_RND,
+      FMUL_RND,
+      FDIV_RND,
+      FMAX_RND,
+      FMIN_RND,
+      FSQRT_RND, FSQRTS_RND,
+
+      // FP vector get exponent.
+      FGETEXP_RND, FGETEXPS_RND,
+      // Extract Normalized Mantissas.
+      VGETMANT, VGETMANTS,
+      // FP Scale.
+      SCALEF,
+      SCALEFS,
+
+      // Integer add/sub with unsigned saturation.
+      ADDUS,
+      SUBUS,
+
+      // Integer add/sub with signed saturation.
+      ADDS,
+      SUBS,
+
+      // Unsigned Integer average.
+      AVG,
+
+      /// Integer horizontal add/sub.
+      HADD,
+      HSUB,
+
+      /// Floating point horizontal add/sub.
+      FHADD,
+      FHSUB,
+
+      // Integer absolute value
+      ABS,
+
+      // Detect Conflicts Within a Vector
+      CONFLICT,
+
+      /// Floating point max and min.
+      FMAX, FMIN,
+
+      /// Commutative FMIN and FMAX.
+      FMAXC, FMINC,
+
+      /// Floating point reciprocal-sqrt and reciprocal approximation.
+      /// Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+      FRSQRTS, FRCPS,
+
+      // Thread Local Storage.
+      TLSADDR,
+
+      // Thread Local Storage. A call to get the start address
+      // of the TLS block for the current module.
+      TLSBASEADDR,
+
+      // Thread Local Storage.  When calling to an OS provided
+      // thunk at the address from an earlier relocation.
+      TLSCALL,
+
+      // Exception Handling helpers.
+      EH_RETURN,
+
+      // SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      // SjLj exception handling dispatch.
+      EH_SJLJ_SETUP_DISPATCH,
+
+      /// Tail call return. See X86TargetLowering::LowerCall for
+      /// the list of operands.
+      TC_RETURN,
+
+      // Vector move to low scalar and zero higher vector elements.
+      VZEXT_MOVL,
+
+      // Vector integer zero-extend.
+      VZEXT,
+      // Vector integer signed-extend.
+      VSEXT,
+
+      // Vector integer truncate.
+      VTRUNC,
+      // Vector integer truncate with unsigned/signed saturation.
+      VTRUNCUS, VTRUNCS,
+
+      // Vector FP extend.
+      VFPEXT, VFPEXT_RND, VFPEXTS_RND,
+
+      // Vector FP round.
+      VFPROUND, VFPROUND_RND, VFPROUNDS_RND,
+
+      // Convert a vector to mask, set bits base on MSB.
+      CVT2MASK,
+
+      // 128-bit vector logical left / right shift
+      VSHLDQ, VSRLDQ,
+
+      // Vector shift elements
+      VSHL, VSRL, VSRA,
+
+      // Vector variable shift right arithmetic.
+      // Unlike ISD::SRA, in case shift count greater then element size
+      // use sign bit to fill destination data element.
+      VSRAV,
+
+      // Vector shift elements by immediate
+      VSHLI, VSRLI, VSRAI,
+
+      // Bit rotate by immediate
+      VROTLI, VROTRI,
+
+      // Vector packed double/float comparison.
+      CMPP,
+
+      // Vector integer comparisons.
+      PCMPEQ, PCMPGT,
+      // Vector integer comparisons, the result is in a mask vector.
+      PCMPEQM, PCMPGTM,
+
+      MULTISHIFT,
+
+      /// Vector comparison generating mask bits for fp and
+      /// integer signed and unsigned data types.
+      CMPM,
+      CMPMU,
+      // Vector comparison with rounding mode for FP values
+      CMPM_RND,
+
+      // Arithmetic operations with FLAGS results.
+      ADD, SUB, ADC, SBB, SMUL,
+      INC, DEC, OR, XOR, AND,
+
+      // Bit field extract.
+      BEXTR,
+
+      // LOW, HI, FLAGS = umul LHS, RHS.
+      UMUL,
+
+      // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
+      SMUL8, UMUL8,
+
+      // 8-bit divrem that zero-extend the high result (AH).
+      UDIVREM8_ZEXT_HREG,
+      SDIVREM8_SEXT_HREG,
+
+      // X86-specific multiply by immediate.
+      MUL_IMM,
+
+      // Vector sign bit extraction.
+      MOVMSK,
+
+      // Vector bitwise comparisons.
+      PTEST,
+
+      // Vector packed fp sign bitwise comparisons.
+      TESTP,
+
+      // Vector "test" in AVX-512, the result is in a mask vector.
+      TESTM,
+      TESTNM,
+
+      // OR/AND test for masks.
+      KORTEST,
+      KTEST,
+
+      // Several flavors of instructions with vector shuffle behaviors.
+      // Saturated signed/unnsigned packing.
+      PACKSS,
+      PACKUS,
+      // Intra-lane alignr.
+      PALIGNR,
+      // AVX512 inter-lane alignr.
+      VALIGN,
+      PSHUFD,
+      PSHUFHW,
+      PSHUFLW,
+      SHUFP,
+      //Shuffle Packed Values at 128-bit granularity.
+      SHUF128,
+      MOVDDUP,
+      MOVSHDUP,
+      MOVSLDUP,
+      MOVLHPS,
+      MOVLHPD,
+      MOVHLPS,
+      MOVLPS,
+      MOVLPD,
+      MOVSD,
+      MOVSS,
+      UNPCKL,
+      UNPCKH,
+      VPERMILPV,
+      VPERMILPI,
+      VPERMI,
+      VPERM2X128,
+
+      // Variable Permute (VPERM).
+      // Res = VPERMV MaskV, V0
+      VPERMV,
+
+      // 3-op Variable Permute (VPERMT2).
+      // Res = VPERMV3 V0, MaskV, V1
+      VPERMV3,
+
+      // 3-op Variable Permute overwriting the index (VPERMI2).
+      // Res = VPERMIV3 V0, MaskV, V1
+      VPERMIV3,
+
+      // Bitwise ternary logic.
+      VPTERNLOG,
+      // Fix Up Special Packed Float32/64 values.
+      VFIXUPIMM,
+      VFIXUPIMMS,
+      // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+      VRANGE,
+      // Reduce - Perform Reduction Transformation on scalar\packed FP.
+      VREDUCE, VREDUCES,
+      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+      VRNDSCALE, VRNDSCALES,
+      // Tests Types Of a FP Values for packed types.
+      VFPCLASS,
+      // Tests Types Of a FP Values for scalar types.
+      VFPCLASSS,
+
+      // Broadcast scalar to vector.
+      VBROADCAST,
+      // Broadcast mask to vector.
+      VBROADCASTM,
+      // Broadcast subvector to vector.
+      SUBV_BROADCAST,
+
+      // Insert/Extract vector element.
+      VINSERT,
+      VEXTRACT,
+
+      /// SSE4A Extraction and Insertion.
+      EXTRQI, INSERTQI,
+
+      // XOP variable/immediate rotations.
+      VPROT, VPROTI,
+      // XOP arithmetic/logical shifts.
+      VPSHA, VPSHL,
+      // XOP signed/unsigned integer comparisons.
+      VPCOM, VPCOMU,
+      // XOP packed permute bytes.
+      VPPERM,
+      // XOP two source permutation.
+      VPERMIL2,
+
+      // Vector multiply packed unsigned doubleword integers.
+      PMULUDQ,
+      // Vector multiply packed signed doubleword integers.
+      PMULDQ,
+      // Vector Multiply Packed UnsignedIntegers with Round and Scale.
+      MULHRS,
+
+      // Multiply and Add Packed Integers.
+      VPMADDUBSW, VPMADDWD,
+      VPMADD52L, VPMADD52H,
+
+      // FMA nodes.
+      FMADD,
+      FNMADD,
+      FMSUB,
+      FNMSUB,
+      FMADDSUB,
+      FMSUBADD,
+
+      // FMA with rounding mode.
+      FMADD_RND,
+      FNMADD_RND,
+      FMSUB_RND,
+      FNMSUB_RND,
+      FMADDSUB_RND,
+      FMSUBADD_RND,
+
+      // Scalar intrinsic FMA with rounding mode.
+      // Two versions, passthru bits on op1 or op3.
+      FMADDS1_RND, FMADDS3_RND,
+      FNMADDS1_RND, FNMADDS3_RND,
+      FMSUBS1_RND, FMSUBS3_RND,
+      FNMSUBS1_RND, FNMSUBS3_RND,
+
+      // Compress and expand.
+      COMPRESS,
+      EXPAND,
+
+      // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+      SINT_TO_FP_RND, UINT_TO_FP_RND,
+      SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
+
+      // Vector float/double to signed/unsigned integer.
+      CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
+      // Scalar float/double to signed/unsigned integer.
+      CVTS2SI_RND, CVTS2UI_RND,
+
+      // Vector float/double to signed/unsigned integer with truncation.
+      CVTTP2SI, CVTTP2UI, CVTTP2SI_RND, CVTTP2UI_RND,
+      // Scalar float/double to signed/unsigned integer with truncation.
+      CVTTS2SI_RND, CVTTS2UI_RND,
+
+      // Vector signed/unsigned integer to float/double.
+      CVTSI2P, CVTUI2P,
+
+      // Save xmm argument registers to the stack, according to %al. An operator
+      // is needed so that this can be expanded with control flow.
+      VASTART_SAVE_XMM_REGS,
+
+      // Windows's _chkstk call to do stack probing.
+      WIN_ALLOCA,
+
+      // For allocating variable amounts of stack space when using
+      // segmented stacks. Check if the current stacklet has enough space, and
+      // falls back to heap allocation if not.
+      SEG_ALLOCA,
+
+      // Memory barriers.
+      MEMBARRIER,
+      MFENCE,
+
+      // Store FP status word into i16 register.
+      FNSTSW16r,
+
+      // Store contents of %ah into %eflags.
+      SAHF,
+
+      // Get a random integer and indicate whether it is valid in CF.
+      RDRAND,
+
+      // Get a NIST SP800-90B & C compliant random integer and
+      // indicate whether it is valid in CF.
+      RDSEED,
+
+      // SSE42 string comparisons.
+      PCMPISTRI,
+      PCMPESTRI,
+
+      // Test if in transactional execution.
+      XTEST,
+
+      // ERI instructions.
+      RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
+
+      // Conversions between float and half-float.
+      CVTPS2PH, CVTPH2PS,
+
+      // Compare and swap.
+      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      LCMPXCHG8_DAG,
+      LCMPXCHG16_DAG,
+      LCMPXCHG8_SAVE_EBX_DAG,
+      LCMPXCHG16_SAVE_RBX_DAG,
+
+      /// LOCK-prefixed arithmetic read-modify-write instructions.
+      /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+      LADD, LSUB, LOR, LXOR, LAND,
+
+      // Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD,
+
+      // Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      /// This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source.  This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result.  This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag).
+      FILD,
+      FILD_FLAG,
+
+      /// This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// This instruction grabs the address of the next argument
+      /// from a va_list. (reads and modifies the va_list in memory)
+      VAARG_64,
+
+      // Vector truncating store with unsigned/signed saturation
+      VTRUNCSTOREUS, VTRUNCSTORES,
+      // Vector truncating masked store with unsigned/signed saturation
+      VMTRUNCSTOREUS, VMTRUNCSTORES
+
+      // WARNING: Do not add anything in the end unless you want the node to
+      // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+      // opcodes will be thought as target memory ops!
+    };
+  } // end namespace X86ISD
+
+  /// Define some predicates that are used for node matching.
+  namespace X86 {
+    /// Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+    bool isVEXTRACT128Index(SDNode *N);
+
+    /// Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+    bool isVINSERT128Index(SDNode *N);
+
+    /// Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+    bool isVEXTRACT256Index(SDNode *N);
+
+    /// Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+    bool isVINSERT256Index(SDNode *N);
+
+    /// Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF128, VEXTRACTI128 instructions.
+    unsigned getExtractVEXTRACT128Immediate(SDNode *N);
+
+    /// Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF128, VINSERT128 instructions.
+    unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+    /// Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+    unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+    /// Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF64x4, VINSERTI64x4 instructions.
+    unsigned getInsertVINSERT256Immediate(SDNode *N);
+
+    /// Returns true if Elt is a constant zero or floating point constant +0.0.
+    bool isZeroNode(SDValue Elt);
+
+    /// Returns true of the given offset can be
+    /// fit into displacement field of the instruction.
+    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                      bool hasSymbolicDisplacement = true);
+
+    /// Determines whether the callee is required to pop its
+    /// own arguments. Callee pop is necessary to support tail calls.
+    bool isCalleePop(CallingConv::ID CallingConv,
+                     bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
+
+  } // end namespace X86
+
+  //===--------------------------------------------------------------------===//
+  //  X86 Implementation of the TargetLowering interface
+  class X86TargetLowering final : public TargetLowering {
+  public:
+    explicit X86TargetLowering(const X86TargetMachine &TM,
+                               const X86Subtarget &STI);
+
+    unsigned getJumpTableEncoding() const override;
+    bool useSoftFloat() const override;
+
+    MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+      return MVT::i8;
+    }
+
+    const MCExpr *
+    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                              const MachineBasicBlock *MBB, unsigned uid,
+                              MCContext &Ctx) const override;
+
+    /// Returns relocation base for the given PIC jumptable.
+    SDValue getPICJumpTableRelocBase(SDValue Table,
+                                     SelectionDAG &DAG) const override;
+    const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const override;
+
+    /// Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contains are placed at 16-byte boundaries while the rest are at
+    /// 4-byte boundaries.
+    unsigned getByValTypeAlignment(Type *Ty,
+                                   const DataLayout &DL) const override;
+
+    /// Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    bool isSafeMemOpType(MVT VT) const override;
+
+    /// Returns true if the target allows unaligned memory accesses of the
+    /// specified type. Returns whether it is "fast" in the last argument.
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+                                       bool *Fast) const override;
+
+    /// Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// Places new result values for the node in Results (their number
+    /// and types must exactly match those of the original return values of
+    /// the node), or leaves Results empty, which indicates that the node is not
+    /// to be custom lowered after all.
+    void LowerOperationWrapper(SDNode *N,
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) const override;
+
+    /// Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    /// Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+    /// Return true if the target has native support for the
+    /// specified value type and it is 'desirable' to use the type. e.g. On x86
+    /// i16 is legal, but undesirable since i16 instruction encodings are longer
+    /// and some i16 instructions are slow.
+    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+    /// Return true if the MachineFunction contains a COPY which would imply
+    /// HasOpaqueSPAdjustment.
+    bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    /// This method returns the name of a target specific DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    bool isCheapToSpeculateCttz() const override;
+
+    bool isCheapToSpeculateCtlz() const override;
+
+    bool isCtlzFast() const override;
+
+    bool hasBitPreservingFPLogic(EVT VT) const override {
+      return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
+    }
+
+    bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+      // If the pair to store is a mixture of float and int values, we will
+      // save two bitwise instructions and one float-to-int instruction and
+      // increase one store instruction. There is potentially a more
+      // significant benefit because it avoids the float->int domain switch
+      // for input value. So It is more likely a win.
+      if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+          (LTy.isInteger() && HTy.isFloatingPoint()))
+        return true;
+      // If the pair only contains int values, we will save two bitwise
+      // instructions and increase one store instruction (costing one more
+      // store buffer). Since the benefit is more blurred so we leave
+      // such pair out until we get testcase to prove it is a win.
+      return false;
+    }
+
+    bool hasAndNotCompare(SDValue Y) const override;
+
+    /// Return the value type to use for ISD::SETCC.
+    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                           EVT VT) const override;
+
+    /// Determine which of the bits specified in Mask are known to be either
+    /// zero or one and return them in the KnownZero/KnownOne bitsets.
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    /// Determine the number of bits in the operation that are sign bits.
+    unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const SelectionDAG &DAG,
+                                             unsigned Depth) const override;
+
+    bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
+                        int64_t &Offset) const override;
+
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    bool ExpandInlineAsm(CallInst *CI) const override;
+
+    ConstraintType getConstraintType(StringRef Constraint) const override;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight
+      getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                     const char *constraint) const override;
+
+    const char *LowerXConstraint(EVT ConstraintVT) const override;
+
+    /// Lower the specified operand into the Ops vector. If it is invalid, don't
+    /// add anything to Ops. If hasMemory is true it means one of the asm
+    /// constraint of the inline asm instruction being processed is 'm'.
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
+
+    unsigned
+    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+      if (ConstraintCode == "i")
+        return InlineAsm::Constraint_i;
+      else if (ConstraintCode == "o")
+        return InlineAsm::Constraint_o;
+      else if (ConstraintCode == "v")
+        return InlineAsm::Constraint_v;
+      else if (ConstraintCode == "X")
+        return InlineAsm::Constraint_X;
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+    }
+
+    /// Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    /// Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    /// Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+    bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+    /// Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// Return true if any actual instruction that defines a
+    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicit zero-extend the
+    /// result out to 64 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+    /// extend node) is profitable.
+    bool isVectorLoadExtDesirable(SDValue) const override;
+
+    /// Return true if an FMA operation is faster than a pair of fmul and fadd
+    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    /// Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+    /// Given an intrinsic, checks if on the target the intrinsic will need to map
+    /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+    /// true and stores the intrinsic information into the IntrinsicInfo that was
+    /// passed to the function.
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                            unsigned Intrinsic) const override;
+
+    /// Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// Targets can use this to indicate that they only support *some*
+    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+    /// be legal.
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override;
+
+    /// Similar to isShuffleMaskLegal. This is used by Targets can use this to
+    /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
+    /// replace a VAND with a constant pool entry.
+    bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                EVT VT) const override;
+
+    /// If true, then instruction selection should
+    /// seek to shrink the FP constant of the specified type to a smaller type
+    /// in order to save space and / or reduce runtime.
+    bool ShouldShrinkFPConstant(EVT VT) const override {
+      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+      // expensive than a straight movsd. On the other hand, it's important to
+      // shrink long double fp constant since fldt is very slow.
+      return !X86ScalarSSEf64 || VT == MVT::f80;
+    }
+
+    /// Return true if we believe it is correct and profitable to reduce the
+    /// load node to a smaller type.
+    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+                               EVT NewVT) const override;
+
+    /// Return true if the specified scalar FP type is computed in an SSE
+    /// register, not on the X87 floating point stack.
+    bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+             (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+    }
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+    /// with this index.
+    bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+
+    /// Intel processors have a unified instruction and data cache
+    const char * getClearCacheBuiltinName() const override {
+      return nullptr; // nothing to do, move along.
+    }
+
+    unsigned getRegisterByName(const char* RegName, EVT VT,
+                               SelectionDAG &DAG) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+    virtual bool needsFixedCatchObjects() const override;
+
+    /// This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
+
+    /// If the target has a standard location for the stack protector cookie,
+    /// returns the address of that location. Otherwise, returns nullptr.
+    Value *getIRStackGuard(IRBuilder<> &IRB) const override;
+
+    bool useLoadStackGuardNode() const override;
+    void insertSSPDeclarations(Module &M) const override;
+    Value *getSDagStackGuard(const Module &M) const override;
+    Value *getSSPStackGuardCheck(const Module &M) const override;
+
+    /// Return true if the target stores SafeStack pointer at a fixed offset in
+    /// some non-standard address space, and populates the address space and
+    /// offset as appropriate.
+    Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
+
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
+
+    bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+    /// \brief Customize the preferred legalization strategy for certain types.
+    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+
+    bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+
+    bool supportSwiftError() const override;
+
+    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
+    /// \brief Lower interleaved load(s) into target specific
+    /// instructions/intrinsics.
+    bool lowerInterleavedLoad(LoadInst *LI,
+                              ArrayRef<ShuffleVectorInst *> Shuffles,
+                              ArrayRef<unsigned> Indices,
+                              unsigned Factor) const override;
+  protected:
+    std::pair<const TargetRegisterClass *, uint8_t>
+    findRepresentativeClass(const TargetRegisterInfo *TRI,
+                            MVT VT) const override;
+
+  private:
+    /// Keep a reference to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget &Subtarget;
+
+    /// Select between SSE or x87 floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
+
+    /// A list of legal FP immediates.
+    std::vector<APFloat> LegalFPImmediates;
+
+    /// Indicate that this x86 target can instruction
+    /// select the specified FP immediate natively.
+    void addLegalFPImmediate(const APFloat& Imm) {
+      LegalFPImmediates.push_back(Imm);
+    }
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             const CCValAssign &VA, MachineFrameInfo &MFI,
+                             unsigned i) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             const SDLoc &dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+
+    // Call lowering helpers.
+
+    /// Check whether the call is eligible for tail call optimization. Targets
+    /// that want to do tail call optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                           Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                    SDValue Chain, bool IsTailCall,
+                                    bool Is64Bit, int FPDiff,
+                                    const SDLoc &dl) const;
+
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+                                         SelectionDAG &DAG) const;
+
+    unsigned getAddressSpace(void) const;
+
+    std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                               bool isSigned,
+                                               bool isReplace) const;
+
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+    unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl,
+                               int64_t Offset, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
+                           SelectionDAG &DAG) const;
+    SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
+                      SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue LowerCall(CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    bool supportSplitCSR(MachineFunction *MF) const override {
+      return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+          MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+    }
+    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+    void insertCopiesSplitCSR(
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
+    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+
+    bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+
+    EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+                            ISD::NodeType ExtendKind) const override;
+
+    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+
+    TargetLoweringBase::AtomicExpansionKind
+    shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+    TargetLoweringBase::AtomicExpansionKind
+    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+    LoadInst *
+    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
+    bool needsCmpXchgNb(Type *MemType) const;
+
+    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+                                MachineBasicBlock *DispatchBB, int FI) const;
+
+    // Utility function to emit the low-level va_arg code for X86-64.
+    MachineBasicBlock *
+    EmitVAARG64WithCustomInserter(MachineInstr &MI,
+                                  MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *
+    EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
+                                             MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+                                         MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+                                         MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
+                                     MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+                                             MachineBasicBlock *MBB) const;
+
+    /// Emit nodes that will be selected as "test Op0,Op0", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, const SDLoc &dl,
+                     SelectionDAG &DAG) const;
+
+    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
+                    SelectionDAG &DAG) const;
+
+    /// Convert a comparison if required by the subtarget.
+    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+    /// Check if replacement of SQRT with RSQRT should be disabled.
+    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
+    /// Use rsqrt* to speed up sqrt calculations.
+    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                            int &RefinementSteps, bool &UseOneConstNR,
+                            bool Reciprocal) const override;
+
+    /// Use rcp* to speed up fdiv calculations.
+    SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                             int &RefinementSteps) const override;
+
+    /// Reassociate floating point divisions into multiply by reciprocal.
+    unsigned combineRepeatedFPDivisors() const override;
+  };
+
+  namespace X86 {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  } // end namespace X86
+
+  // Base class for all X86 non-masked store operations.
+  class X86StoreSDNode : public MemSDNode {
+  public:
+    X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
+                   SDVTList VTs, EVT MemVT,
+                   MachineMemOperand *MMO)
+      :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+    const SDValue &getValue() const { return getOperand(1); }
+    const SDValue &getBasePtr() const { return getOperand(2); }
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VTRUNCSTORES ||
+        N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+    }
+  };
+
+  // Base class for all X86 masked store operations.
+  // The class has the same order of operands as MaskedStoreSDNode for
+  // convenience.
+  class X86MaskedStoreSDNode : public MemSDNode {
+  public:
+    X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
+                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+                         MachineMemOperand *MMO)
+      : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
+
+    const SDValue &getBasePtr() const { return getOperand(1); }
+    const SDValue &getMask()    const { return getOperand(2); }
+    const SDValue &getValue()   const { return getOperand(3); }
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
+        N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+    }
+  };
+
+  // X86 Truncating Store with Signed saturation.
+  class TruncSStoreSDNode : public X86StoreSDNode {
+  public:
+    TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
+                        SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+      : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VTRUNCSTORES;
+    }
+  };
+
+  // X86 Truncating Store with Unsigned saturation.
+  class TruncUSStoreSDNode : public X86StoreSDNode {
+  public:
+    TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
+                      SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
+      : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
+    }
+  };
+
+  // X86 Truncating Masked Store with Signed saturation.
+  class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
+  public:
+    MaskedTruncSStoreSDNode(unsigned Order,
+                         const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+                         MachineMemOperand *MMO)
+      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VMTRUNCSTORES;
+    }
+  };
+
+  // X86 Truncating Masked Store with Unsigned saturation.
+  class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
+  public:
+    MaskedTruncUSStoreSDNode(unsigned Order,
+                            const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+                            MachineMemOperand *MMO)
+      : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
+
+    static bool classof(const SDNode *N) {
+      return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
+    }
+  };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 000000000000..ba1aede3c1a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+                   [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+                      "prefetch\t$addr",
+                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+                Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 000000000000..da7437ea0ccb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,9181 @@
+//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 AVX512 instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+// Group template arguments that can be derived from the vector type (EltNum x
+// EltVT).  These are things like the register class for the writemask, etc.
+// The idea is to pass one of these as the template argument rather than the
+// individual arguments.
+// The template is also used for scalar types, in this case numelts is 1.
+class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
+                      string suffix = ""> {
+  RegisterClass RC = rc;
+  ValueType EltVT = eltvt;
+  int NumElts = numelts;
+
+  // Corresponding mask register class.
+  RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+
+  // Corresponding write-mask register class.
+  RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+
+  // The mask VT.
+  ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
+                                                          "v" # NumElts # "i1"));
+
+  // The GPR register class that can hold the write mask.  Use GR8 for fewer
+  // than 8 elements.  Use shift-right and equal to work around the lack of
+  // !lt in tablegen.
+  RegisterClass MRC =
+    !cast<RegisterClass>("GR" #
+                         !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
+
+  // Suffix used in the instruction mnemonic.
+  string Suffix = suffix;
+
+  // VTName is a string name for vector VT. For vector types it will be
+  // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32
+  // It is a little bit complex for scalar types, where NumElts = 1.
+  // In this case we build v4f32 or v2f64
+  string VTName = "v" # !if (!eq (NumElts, 1),
+                        !if (!eq (EltVT.Size, 32), 4,
+                        !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
+
+  // The vector VT.
+  ValueType VT = !cast<ValueType>(VTName);
+
+  string EltTypeName = !cast<string>(EltVT);
+  // Size of the element type in bits, e.g. 32 for v16i32.
+  string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
+  int EltSize = EltVT.Size;
+
+  // "i" for integer types and "f" for floating-point types
+  string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
+
+  // Size of RC in bits, e.g. 512 for VR512.
+  int Size = VT.Size;
+
+  // The corresponding memory operand, e.g. i512mem for VR512.
+  X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
+  X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+
+  // Load patterns
+  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
+  //       due to load promotion during legalization
+  PatFrag LdFrag = !cast<PatFrag>("load" #
+                                  !if (!eq (TypeVariantName, "i"),
+                                       !if (!eq (Size, 128), "v2i64",
+                                       !if (!eq (Size, 256), "v4i64",
+                                       !if (!eq (Size, 512), "v8i64",
+                                            VTName))), VTName));
+
+  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
+                                         !if (!eq (TypeVariantName, "i"),
+                                               !if (!eq (Size, 128), "v2i64",
+                                               !if (!eq (Size, 256), "v4i64",
+                                               !if (!eq (Size, 512), "v8i64",
+                                                   VTName))), VTName));
+
+  PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+
+  // The corresponding float type, e.g. v16f32 for v16i32
+  // Note: For EltSize < 32, FloatVT is illegal and TableGen
+  //       fails to compile, so we choose FloatVT = VT
+  ValueType FloatVT = !cast<ValueType>(
+                        !if (!eq (!srl(EltSize,5),0),
+                             VTName,
+                             !if (!eq(TypeVariantName, "i"),
+                                  "v" # NumElts # "f" # EltSize,
+                                  VTName)));
+
+  ValueType IntVT = !cast<ValueType>(
+                        !if (!eq (!srl(EltSize,5),0),
+                             VTName,
+                             !if (!eq(TypeVariantName, "f"),
+                                  "v" # NumElts # "i" # EltSize,
+                                  VTName)));
+  // The string to specify embedded broadcast in assembly.
+  string BroadcastStr = "{1to" # NumElts # "}";
+
+  // 8-bit compressed displacement tuple/subvector format.  This is only
+  // defined for NumElts <= 8.
+  CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
+                               !cast<CD8VForm>("CD8VT" # NumElts), ?);
+
+  SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
+                          !if (!eq (Size, 256), sub_ymm, ?));
+
+  Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
+                     !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
+                     SSEPackedInt));
+
+  RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
+
+  // A vector tye of the same width with element type i64. This is used to
+  // create patterns for logic ops.
+  ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
+
+  // A vector type of the same width with element type i32.  This is used to
+  // create the canonical constant zero node ImmAllZerosV.
+  ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
+  dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+
+  string ZSuffix = !if (!eq (Size, 128), "Z128",
+                   !if (!eq (Size, 256), "Z256", "Z"));
+}
+
+def v64i8_info  : X86VectorVTInfo<64,  i8, VR512, "b">;
+def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
+def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
+def v8i64_info  : X86VectorVTInfo<8,  i64, VR512, "q">;
+def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
+def v8f64_info  : X86VectorVTInfo<8,  f64, VR512, "pd">;
+
+// "x" in v32i8x_info means RC = VR256X
+def v32i8x_info  : X86VectorVTInfo<32,  i8, VR256X, "b">;
+def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
+def v8i32x_info  : X86VectorVTInfo<8,  i32, VR256X, "d">;
+def v4i64x_info  : X86VectorVTInfo<4,  i64, VR256X, "q">;
+def v8f32x_info  : X86VectorVTInfo<8,  f32, VR256X, "ps">;
+def v4f64x_info  : X86VectorVTInfo<4,  f64, VR256X, "pd">;
+
+def v16i8x_info  : X86VectorVTInfo<16,  i8, VR128X, "b">;
+def v8i16x_info  : X86VectorVTInfo<8,  i16, VR128X, "w">;
+def v4i32x_info  : X86VectorVTInfo<4,  i32, VR128X, "d">;
+def v2i64x_info  : X86VectorVTInfo<2,  i64, VR128X, "q">;
+def v4f32x_info  : X86VectorVTInfo<4,  f32, VR128X, "ps">;
+def v2f64x_info  : X86VectorVTInfo<2,  f64, VR128X, "pd">;
+
+// We map scalar types to the smallest (128-bit) vector type
+// with the appropriate element type. This allows to use the same masking logic.
+def i32x_info    : X86VectorVTInfo<1,  i32, GR32, "si">;
+def i64x_info    : X86VectorVTInfo<1,  i64, GR64, "sq">;
+def f32x_info    : X86VectorVTInfo<1,  f32, VR128X, "ss">;
+def f64x_info    : X86VectorVTInfo<1,  f64, VR128X, "sd">;
+
+class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
+                           X86VectorVTInfo i128> {
+  X86VectorVTInfo info512 = i512;
+  X86VectorVTInfo info256 = i256;
+  X86VectorVTInfo info128 = i128;
+}
+
+def avx512vl_i8_info  : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
+                                             v16i8x_info>;
+def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
+                                             v8i16x_info>;
+def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
+                                             v4i32x_info>;
+def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
+                                             v2i64x_info>;
+def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
+                                             v4f32x_info>;
+def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
+                                             v2f64x_info>;
+
+// This multiclass generates the masking variants from the non-masking
+// variant.  It only provides the assembly pieces for the masking variants.
+// It assumes custom ISel patterns for masking which can be provided as
+// template arguments.
+multiclass AVX512_maskable_custom<bits<8> O, Format F,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  list<dag> Pattern,
+                                  list<dag> MaskingPattern,
+                                  list<dag> ZeroMaskingPattern,
+                                  string MaskingConstraint = "",
+                                  InstrItinClass itin = NoItinerary,
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> {
+  let isCommutable = IsCommutable in
+    def NAME: AVX512<O, F, Outs, Ins,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+                                     "$dst, "#IntelSrcAsm#"}",
+                       Pattern, itin>;
+
+  // Prefer over VMOV*rrk Pat<>
+  let AddedComplexity = 20, isCommutable = IsKCommutable in
+    def NAME#k: AVX512<O, F, Outs, MaskingIns,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+                                     "$dst {${mask}}, "#IntelSrcAsm#"}",
+                       MaskingPattern, itin>,
+              EVEX_K {
+      // In case of the 3src subclass this is overridden with a let.
+      string Constraints = MaskingConstraint;
+    }
+
+  // Zero mask does not add any restrictions to commute operands transformation.
+  // So, it is Ok to use IsCommutable instead of IsKCommutable.
+  let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+    def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
+                                     "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
+                       ZeroMaskingPattern,
+                       itin>,
+              EVEX_KZ;
+}
+
+
+// Common base class of AVX512_maskable and AVX512_maskable_3src.
+multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  dag RHS, dag MaskingRHS,
+                                  SDNode Select = vselect,
+                                  string MaskingConstraint = "",
+                                  InstrItinClass itin = NoItinerary,
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> :
+  AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+                         AttSrcAsm, IntelSrcAsm,
+                         [(set _.RC:$dst, RHS)],
+                         [(set _.RC:$dst, MaskingRHS)],
+                         [(set _.RC:$dst,
+                               (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+                         MaskingConstraint, NoItinerary, IsCommutable,
+                         IsKCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction.  In the masking case, the
+// perserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS,
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0, bit IsKCommutable = 0,
+                           SDNode Select = vselect> :
+   AVX512_maskable_common<O, F, _, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
+                          "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the scalar instruction.
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS,
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0> :
+   AVX512_maskable_common<O, F, _, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
+                          X86selects, "$src0 = $dst", itin, IsCommutable>;
+
+// Similar to AVX512_maskable but in this case one of the source operands
+// ($src1) is already tied to $dst so we just use that for the preserved
+// vector elements.  NOTE that the NonTiedIns (the ins dag) should exclude
+// $src1.
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
+                                dag Outs, dag NonTiedIns, string OpcodeStr,
+                                string AttSrcAsm, string IntelSrcAsm,
+                                dag RHS, bit IsCommutable = 0,
+                                bit IsKCommutable = 0> :
+   AVX512_maskable_common<O, F, _, Outs,
+                          !con((ins _.RC:$src1), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
+                          vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
+
+multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
+                                     dag Outs, dag NonTiedIns, string OpcodeStr,
+                                     string AttSrcAsm, string IntelSrcAsm,
+                                     dag RHS, bit IsCommutable = 0,
+                                     bit IsKCommutable = 0> :
+   AVX512_maskable_common<O, F, _, Outs,
+                          !con((ins _.RC:$src1), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
+                          X86selects, "", NoItinerary, IsCommutable,
+                          IsKCommutable>;
+
+multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs, dag Ins,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  list<dag> Pattern> :
+   AVX512_maskable_custom<O, F, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
+                          "$src0 = $dst">;
+
+
+// Instruction with mask that puts result in mask register,
+// like "compare" and "vptest"
+multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  list<dag> Pattern,
+                                  list<dag> MaskingPattern,
+                                  bit IsCommutable = 0> {
+    let isCommutable = IsCommutable in
+    def NAME: AVX512<O, F, Outs, Ins,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
+                                     "$dst, "#IntelSrcAsm#"}",
+                       Pattern, NoItinerary>;
+
+    def NAME#k: AVX512<O, F, Outs, MaskingIns,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
+                                     "$dst {${mask}}, "#IntelSrcAsm#"}",
+                       MaskingPattern, NoItinerary>, EVEX_K;
+}
+
+multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  dag RHS, dag MaskingRHS,
+                                  bit IsCommutable = 0> :
+  AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
+                         AttSrcAsm, IntelSrcAsm,
+                         [(set _.KRC:$dst, RHS)],
+                         [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
+
+multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS, bit IsCommutable = 0> :
+   AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (and _.KRCWM:$mask, RHS), IsCommutable>;
+
+multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm> :
+   AVX512_maskable_custom_cmp<O, F, Outs,
+                             Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
+                             AttSrcAsm, IntelSrcAsm, [],[]>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction.  In the masking case, the
+// perserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS, dag MaskedRHS,
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0, SDNode Select = vselect> :
+   AVX512_maskable_custom<O, F, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm,
+                          [(set _.RC:$dst, RHS)],
+                          [(set _.RC:$dst,
+                                (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
+                          [(set _.RC:$dst,
+                                (Select _.KRCWM:$mask, MaskedRHS,
+                                        _.ImmAllZerosV))],
+                          "$src0 = $dst", itin, IsCommutable>;
+
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+def : Pat<(v8f64  (bitconvert (v8i64  VR512:$src))), (v8f64  VR512:$src)>;
+def : Pat<(v8f64  (bitconvert (v16i32 VR512:$src))), (v8f64  VR512:$src)>;
+def : Pat<(v8f64  (bitconvert (v32i16 VR512:$src))), (v8f64  VR512:$src)>;
+def : Pat<(v8f64  (bitconvert (v64i8  VR512:$src))), (v8f64  VR512:$src)>;
+def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64  VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8i64  VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v64i8  VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8f64  VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v8i64  (bitconvert (v16i32 VR512:$src))), (v8i64  VR512:$src)>;
+def : Pat<(v8i64  (bitconvert (v32i16 VR512:$src))), (v8i64  VR512:$src)>;
+def : Pat<(v8i64  (bitconvert (v64i8  VR512:$src))), (v8i64  VR512:$src)>;
+def : Pat<(v8i64  (bitconvert (v8f64  VR512:$src))), (v8i64  VR512:$src)>;
+def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64  VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8i64  VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v64i8  VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8f64  VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8i64  VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v64i8  VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8f64  VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v64i8  (bitconvert (v8i64  VR512:$src))), (v64i8  VR512:$src)>;
+def : Pat<(v64i8  (bitconvert (v16i32 VR512:$src))), (v64i8  VR512:$src)>;
+def : Pat<(v64i8  (bitconvert (v32i16 VR512:$src))), (v64i8  VR512:$src)>;
+def : Pat<(v64i8  (bitconvert (v8f64  VR512:$src))), (v64i8  VR512:$src)>;
+def : Pat<(v64i8  (bitconvert (v16f32 VR512:$src))), (v64i8  VR512:$src)>;
+
+// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16i32 immAllZerosV))]>;
+def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16i32 immAllOnesV))]>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+               [(set VR128X:$dst, (v4i32 immAllZerosV))]>;
+def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
+               [(set VR256X:$dst, (v8i32 immAllZerosV))]>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+  def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
+                          [(set FR32X:$dst, fp32imm0)]>;
+  def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
+                          [(set FR64X:$dst, fpimm0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
+                                                       PatFrag vinsert_insert> {
+  let ExeDomain = To.ExeDomain in {
+    defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+                   (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+                   "vinsert" # From.EltTypeName # "x" # From.NumElts,
+                   "$src3, $src2, $src1", "$src1, $src2, $src3",
+                   (vinsert_insert:$src3 (To.VT To.RC:$src1),
+                                         (From.VT From.RC:$src2),
+                                         (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
+
+    defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+                   (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+                   "vinsert" # From.EltTypeName # "x" # From.NumElts,
+                   "$src3, $src2, $src1", "$src1, $src2, $src3",
+                   (vinsert_insert:$src3 (To.VT To.RC:$src1),
+                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+                   EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+  }
+}
+
+multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+                       X86VectorVTInfo To, PatFrag vinsert_insert,
+                       SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
+  let Predicates = p in {
+    def : Pat<(vinsert_insert:$ins
+                     (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
+              (To.VT (!cast<Instruction>(InstrStr#"rr")
+                     To.RC:$src1, From.RC:$src2,
+                     (INSERT_get_vinsert_imm To.RC:$ins)))>;
+
+    def : Pat<(vinsert_insert:$ins
+                  (To.VT To.RC:$src1),
+                  (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                  (iPTR imm)),
+              (To.VT (!cast<Instruction>(InstrStr#"rm")
+                  To.RC:$src1, addr:$src2,
+                  (INSERT_get_vinsert_imm To.RC:$ins)))>;
+  }
+}
+
+multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
+                            ValueType EltVT64, int Opcode256> {
+
+  let Predicates = [HasVLX] in
+    defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 X86VectorVTInfo< 8, EltVT32, VR256X>,
+                                 vinsert128_insert>, EVEX_V256;
+
+  defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 vinsert128_insert>, EVEX_V512;
+
+  defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
+                                 X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 vinsert256_insert>, VEX_W, EVEX_V512;
+
+  let Predicates = [HasVLX, HasDQI] in
+    defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+                                   X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                   X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                   vinsert128_insert>, VEX_W, EVEX_V256;
+
+  let Predicates = [HasDQI] in {
+    defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 vinsert128_insert>, VEX_W, EVEX_V512;
+
+    defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+                                   X86VectorVTInfo< 8, EltVT32, VR256X>,
+                                   X86VectorVTInfo<16, EltVT32, VR512>,
+                                   vinsert256_insert>, EVEX_V512;
+  }
+}
+
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
+
+// Codegen pattern with the alternative types,
+// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+
+defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+
+// Codegen pattern with the alternative types insert VEC128 into VEC256
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
+// Codegen pattern with the alternative types insert VEC128 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+               vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types insert VEC256 into VEC512
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+// vinsertps - insert f32 to XMM
+let ExeDomain = SSEPackedSingle in {
+def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+      (ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
+      "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+      EVEX_4V;
+def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+      (ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
+      "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insertps VR128X:$src1,
+                          (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                          imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+
+multiclass vextract_for_size<int Opcode,
+                             X86VectorVTInfo From, X86VectorVTInfo To,
+                             PatFrag vextract_extract,
+                             SDNodeXForm EXTRACT_get_vextract_imm> {
+
+  let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
+    // vextract_extract), we interesting only in patterns without mask,
+    // intrinsics pattern match generated bellow.
+    defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+                (ins From.RC:$src1, i32u8imm:$idx),
+                "vextract" # To.EltTypeName # "x" # To.NumElts,
+                "$idx, $src1", "$src1, $idx",
+                [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
+                                                         (iPTR imm)))]>,
+              AVX512AIi8Base, EVEX;
+    def mr  : AVX512AIi8<Opcode, MRMDestMem, (outs),
+                    (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+                    "vextract" # To.EltTypeName # "x" # To.NumElts #
+                        "\t{$idx, $src1, $dst|$dst, $src1, $idx}",
+                    [(store (To.VT (vextract_extract:$idx
+                                    (From.VT From.RC:$src1), (iPTR imm))),
+                             addr:$dst)]>, EVEX;
+
+    let mayStore = 1, hasSideEffects = 0 in
+    def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
+                    (ins To.MemOp:$dst, To.KRCWM:$mask,
+                                        From.RC:$src1, i32u8imm:$idx),
+                     "vextract" # To.EltTypeName # "x" # To.NumElts #
+                          "\t{$idx, $src1, $dst {${mask}}|"
+                          "$dst {${mask}}, $src1, $idx}",
+                    []>, EVEX_K, EVEX;
+  }
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (vextract_extract:$ext (From.VT From.RC:$src1),
+                                                   (iPTR imm)),
+                            To.RC:$src0)),
+            (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+                                From.ZSuffix # "rrk")
+                To.RC:$src0, To.KRCWM:$mask, From.RC:$src1,
+                (EXTRACT_get_vextract_imm To.RC:$ext))>;
+
+  def : Pat<(To.VT (vselect To.KRCWM:$mask,
+                            (vextract_extract:$ext (From.VT From.RC:$src1),
+                                                   (iPTR imm)),
+                            To.ImmAllZerosV)),
+            (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+                                From.ZSuffix # "rrkz")
+                To.KRCWM:$mask, From.RC:$src1,
+                (EXTRACT_get_vextract_imm To.RC:$ext))>;
+
+  // Intrinsic call with masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x" # To.NumElts # "_" # From.Size)
+                From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
+            (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+                                From.ZSuffix # "rrk")
+                To.RC:$src0,
+                (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+                From.RC:$src1, imm:$idx)>;
+
+  // Intrinsic call with zero-masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x" # To.NumElts # "_" # From.Size)
+                From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
+            (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+                                From.ZSuffix # "rrkz")
+                (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
+                From.RC:$src1, imm:$idx)>;
+
+  // Intrinsic call without masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x" # To.NumElts # "_" # From.Size)
+                From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+            (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
+                                From.ZSuffix # "rr")
+                From.RC:$src1, imm:$idx)>;
+}
+
+// Codegen pattern for the alternative types
+multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
+                X86VectorVTInfo To, PatFrag vextract_extract,
+                SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> {
+  let Predicates = p in {
+     def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
+               (To.VT (!cast<Instruction>(InstrStr#"rr")
+                          From.RC:$src1,
+                          (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+     def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1),
+                              (iPTR imm))), addr:$dst),
+               (!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1,
+                (EXTRACT_get_vextract_imm To.RC:$ext))>;
+  }
+}
+
+multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
+                             ValueType EltVT64, int Opcode256> {
+  defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 vextract128_extract,
+                                 EXTRACT_get_vextract128_imm>,
+                                     EVEX_V512, EVEX_CD8<32, CD8VT4>;
+  defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                 vextract256_extract,
+                                 EXTRACT_get_vextract256_imm>,
+                                     VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+  let Predicates = [HasVLX] in
+    defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
+                                 X86VectorVTInfo< 8, EltVT32, VR256X>,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 vextract128_extract,
+                                 EXTRACT_get_vextract128_imm>,
+                                     EVEX_V256, EVEX_CD8<32, CD8VT4>;
+  let Predicates = [HasVLX, HasDQI] in
+    defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+                                 X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 vextract128_extract,
+                                 EXTRACT_get_vextract128_imm>,
+                                     VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+  let Predicates = [HasDQI] in {
+    defm NAME # "64x2Z" : vextract_for_size<Opcode128,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 vextract128_extract,
+                                 EXTRACT_get_vextract128_imm>,
+                                     VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
+    defm NAME # "32x8Z" : vextract_for_size<Opcode256,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 X86VectorVTInfo< 8, EltVT32, VR256X>,
+                                 vextract256_extract,
+                                 EXTRACT_get_vextract256_imm>,
+                                     EVEX_V512, EVEX_CD8<32, CD8VT8>;
+  }
+}
+
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
+
+// extract_subvector codegen patterns with the alternative types.
+// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+
+// Codegen pattern with the alternative types extract VEC128 from VEC256
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
+
+// Codegen pattern with the alternative types extract VEC128 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+                 vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+                 vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
+// Codegen pattern with the alternative types extract VEC256 from VEC512
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+                 vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+                 vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+          (v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+          (v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>;
+
+// A 256-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
+          (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>;
+def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
+          (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>;
+
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
+
+// A 256-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))),
+          (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+}
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+      (ins VR128X:$src1, u8imm:$src2),
+      "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+      EVEX;
+
+def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+      (ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
+      "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+                          addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+// broadcast with a scalar argument.
+multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
+                            X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+
+  let isCodeGenOnly = 1 in {
+  def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+               (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}",
+               [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>,
+               Requires<[HasAVX512]>, T8PD, EVEX;
+
+  let Constraints = "$src0 = $dst" in
+  def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+                (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+                OpcodeStr#"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
+                [(set DestInfo.RC:$dst,
+                     (vselect DestInfo.KRCWM:$mask,
+                              (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+                              DestInfo.RC:$src0))]>,
+              Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K;
+
+  def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
+                (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
+                OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+                [(set DestInfo.RC:$dst,
+                     (vselect DestInfo.KRCWM:$mask,
+                              (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+                              DestInfo.ImmAllZerosV))]>,
+                Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ;
+  } // let isCodeGenOnly = 1 in
+}
+
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+                            X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
+  let ExeDomain = DestInfo.ExeDomain in {
+  defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+                   (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
+                   (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
+                   T8PD, EVEX;
+  defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+                   (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+                   (DestInfo.VT (X86VBroadcast
+                                   (SrcInfo.ScalarLdFrag addr:$src)))>,
+                   T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
+  }
+
+  def : Pat<(DestInfo.VT (X86VBroadcast
+                          (SrcInfo.VT (scalar_to_vector
+                                       (SrcInfo.ScalarLdFrag addr:$src))))),
+            (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
+  let AddedComplexity = 20 in
+  def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+                          (X86VBroadcast
+                           (SrcInfo.VT (scalar_to_vector
+                                        (SrcInfo.ScalarLdFrag addr:$src)))),
+                          DestInfo.RC:$src0)),
+            (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
+             DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
+  let AddedComplexity = 30 in
+  def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+                          (X86VBroadcast
+                           (SrcInfo.VT (scalar_to_vector
+                                        (SrcInfo.ScalarLdFrag addr:$src)))),
+                          DestInfo.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)
+             DestInfo.KRCWM:$mask, addr:$src)>;
+}
+
+multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
+                                                       AVX512VLVectorVTInfo _> {
+  let Predicates = [HasAVX512] in
+    defm Z  : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+              avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+                               EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256  : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+                 avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
+                             EVEX_V256;
+  }
+}
+
+multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
+                                                       AVX512VLVectorVTInfo _> {
+  let Predicates = [HasAVX512] in
+    defm Z  : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+              avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
+                               EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256  : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+                 avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
+                             EVEX_V256;
+    defm Z128  : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+                 avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
+                             EVEX_V128;
+  }
+}
+defm VBROADCASTSS  : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
+                                       avx512vl_f32_info>;
+defm VBROADCASTSD  : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
+                                       avx512vl_f64_info>, VEX_W;
+
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+          (VBROADCASTSSZm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+          (VBROADCASTSDZm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+                                    RegisterClass SrcRC> {
+  defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins SrcRC:$src),
+                         "vpbroadcast"##_.Suffix, "$src", "$src",
+                         (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
+}
+
+multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+                                       RegisterClass SrcRC, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
+    defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+  }
+}
+
+let isCodeGenOnly = 1 in {
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
+                                                 HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
+                                                 HasBWI>;
+}
+let isAsmParserOnly = 1 in {
+  defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+                                                       GR32, HasBWI>;
+  defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+                                                       GR32, HasBWI>;
+}
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
+                                                 HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
+                                                 HasAVX512>, VEX_W;
+
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+           (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+           (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+// Provide aliases for broadcast from the same register class that
+// automatically does the extract.
+multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
+                                            X86VectorVTInfo SrcInfo> {
+  def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
+            (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
+                (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
+}
+
+multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
+                                        AVX512VLVectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in {
+    defm Z :   avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+               avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
+                                  EVEX_V512;
+    // Defined separately to avoid redefinition.
+    defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+                avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
+                                 EVEX_V256;
+    defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+                                 EVEX_V128;
+  }
+}
+
+defm VPBROADCASTB  : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
+                                           avx512vl_i8_info, HasBWI>;
+defm VPBROADCASTW  : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
+                                           avx512vl_i16_info, HasBWI>;
+defm VPBROADCASTD  : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
+                                           avx512vl_i32_info, HasAVX512>;
+defm VPBROADCASTQ  : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
+                                           avx512vl_i64_info, HasAVX512>, VEX_W;
+
+multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
+                          X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                           (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (_Dst.VT (X86SubVBroadcast
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                            AVX5128IBase, EVEX;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+  // This means we'll encounter truncated i32 loads; match that here.
+  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWZ128m addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWZ256m addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast
+              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+            (VPBROADCASTWZ128m addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+            (VPBROADCASTWZ256m addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST SUBVECTORS
+//
+
+defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+                       v16i32_info, v4i32x_info>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+                       v16f32_info, v4f32x_info>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
+                       v8i64_info, v4i64x_info>, VEX_W,
+                       EVEX_V512, EVEX_CD8<64, CD8VT4>;
+defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
+                       v8f64_info, v4f64x_info>, VEX_W,
+                       EVEX_V512, EVEX_CD8<64, CD8VT4>;
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v4f64 VR256X:$src), 1)>; 
+def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v4i64 VR256X:$src), 1)>; 
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v32i8 VR256X:$src), 1)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+          (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+          (VBROADCASTI32X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF64x4Zrr
+           (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+                                                   VR128X:$src, sub_xmm),
+                                    VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI64x4Zrr
+           (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+                                                   VR128X:$src, sub_xmm),
+                                    VR128X:$src, 1)), sub_ymm), 1)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+          (VINSERTI64x4Zrr
+           (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
+                                                    VR128X:$src, sub_xmm),
+                                     VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+          (VINSERTI64x4Zrr
+           (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
+                                                   VR128X:$src, sub_xmm),
+                                    VR128X:$src, 1)), sub_ymm), 1)>;
+}
+
+let Predicates = [HasVLX] in {
+defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
+                           v8i32x_info, v4i32x_info>,
+                           EVEX_V256, EVEX_CD8<32, CD8VT4>;
+defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
+                           v8f32x_info, v4f32x_info>,
+                           EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v4f32 VR128X:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v4i32 VR128X:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v8i16 VR128X:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v16i8 VR128X:$src), 1)>;
+}
+
+let Predicates = [HasVLX, HasDQI] in {
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+                           v4i64x_info, v2i64x_info>, VEX_W,
+                           EVEX_V256, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+                           v4f64x_info, v2f64x_info>, VEX_W,
+                           EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
+}
+
+let Predicates = [HasVLX, NoDQI] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
+}
+
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4rm addr:$src)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+          (VINSERTF64x4Zrr
+           (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                                                    VR128X:$src, sub_xmm),
+                                     VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+          (VINSERTI64x4Zrr
+           (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+                                                    VR128X:$src, sub_xmm),
+                                     VR128X:$src, 1)), sub_ymm), 1)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+}
+
+let Predicates = [HasDQI] in {
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+                       v8i64_info, v2i64x_info>, VEX_W,
+                       EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+                       v16i32_info, v8i32x_info>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT8>;
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+                       v8f64_info, v2f64x_info>, VEX_W,
+                       EVEX_V512, EVEX_CD8<64, CD8VT2>;
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+                       v16f32_info, v8f32x_info>,
+                       EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+          (VINSERTF32x8Zrr
+           (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                                                    VR128X:$src, sub_xmm),
+                                     VR128X:$src, 1)), sub_ymm), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+          (VINSERTI32x8Zrr
+           (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+                                           VR128X:$src, sub_xmm),
+                            VR128X:$src, 1),
+           (EXTRACT_SUBREG
+            (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+                                                    VR128X:$src, sub_xmm),
+                                     VR128X:$src, 1)), sub_ymm), 1)>;
+}
+
+multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
+                         AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
+  let Predicates = [HasDQI] in
+    defm Z :    avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,
+                                  EVEX_V512;
+  let Predicates = [HasDQI, HasVLX] in
+    defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,
+                                  EVEX_V256;
+}
+
+multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
+                         AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
+  avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
+
+  let Predicates = [HasDQI, HasVLX] in
+    defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,
+                                      EVEX_V128;
+}
+
+defm VBROADCASTI32X2  : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
+                                          avx512vl_i32_info, avx512vl_i64_info>;
+defm VBROADCASTF32X2  : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
+                                          avx512vl_f32_info, avx512vl_f64_info>;
+
+def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
+          (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
+          (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+
+def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
+          (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
+          (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
+                                  X86VectorVTInfo _, RegisterClass KRC> {
+  def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
+}
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
+  let Predicates = [HasCDI] in
+    defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
+  let Predicates = [HasCDI, HasVLX] in {
+    defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
+    defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
+  }
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
+                                               avx512vl_i32_info, VK16>;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
+                                               avx512vl_i64_info, VK8>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// -- VPERMI2 - 3 source operands form --
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  // The index operand in the pattern should really be an integer type. However,
+  // if we do that and it happens to come from a bitcast, then it becomes
+  // difficult to find the bitcast needed to convert the index to the
+  // destination type for the passthru since it will be folded with the bitcast
+  // of the index operand.
+  defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
+         AVX5128IBase;
+
+  defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+            (ins _.RC:$src2, _.MemOp:$src3),
+            OpcodeStr, "$src3, $src2", "$src2, $src3",
+            (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
+                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+            EVEX_4V, AVX5128IBase;
+  }
+}
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+                            X86VectorVTInfo _> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+  defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+              (ins _.RC:$src2, _.ScalarMemOp:$src3),
+              OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
+              !strconcat("$src2, ${src3}", _.BroadcastStr ),
+              (_.VT (X86VPermi2X _.RC:$src1,
+               _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+              1>, AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+                               AVX512VLVectorVTInfo VTInfo> {
+  defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>,
+            avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+  let Predicates = [HasVLX] in {
+  defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>,
+                 avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,
+                 avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+  }
+}
+
+multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo,
+                                 Predicate Prd> {
+  let Predicates = [Prd] in
+  defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+  let Predicates = [Prd, HasVLX] in {
+  defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,  EVEX_V256;
+  }
+}
+
+defm VPERMI2D  : avx512_perm_i_sizes<0x76, "vpermi2d",
+                  avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q  : avx512_perm_i_sizes<0x76, "vpermi2q",
+                  avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2W  : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
+                  avx512vl_i16_info, HasBWI>,
+                  VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMI2B  : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
+                  avx512vl_i8_info, HasVBMI>,
+                  EVEX_CD8<8, CD8VF>;
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
+                  avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
+                  avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// VPERMT2
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+                         X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins IdxVT.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
+          EVEX_4V, AVX5128IBase;
+
+  defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+            (ins IdxVT.RC:$src2, _.MemOp:$src3),
+            OpcodeStr, "$src3, $src2", "$src2, $src3",
+            (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
+                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
+            EVEX_4V, AVX5128IBase;
+  }
+}
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+                            X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+  defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+              (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
+              OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
+              !strconcat("$src2, ${src3}", _.BroadcastStr ),
+              (_.VT (X86VPermt2 _.RC:$src1,
+               IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
+              1>, AVX5128IBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+                               AVX512VLVectorVTInfo VTInfo,
+                               AVX512VLVectorVTInfo ShuffleMask> {
+  defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+                              ShuffleMask.info512>,
+            avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
+                              ShuffleMask.info512>, EVEX_V512;
+  let Predicates = [HasVLX] in {
+  defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+                              ShuffleMask.info128>,
+                 avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
+                              ShuffleMask.info128>, EVEX_V128;
+  defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+                              ShuffleMask.info256>,
+                 avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
+                              ShuffleMask.info256>, EVEX_V256;
+  }
+}
+
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo,
+                                 AVX512VLVectorVTInfo Idx,
+                                 Predicate Prd> {
+  let Predicates = [Prd] in
+  defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+                           Idx.info512>, EVEX_V512;
+  let Predicates = [Prd, HasVLX] in {
+  defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+                               Idx.info128>, EVEX_V128;
+  defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+                               Idx.info256>, EVEX_V256;
+  }
+}
+
+defm VPERMT2D  : avx512_perm_t_sizes<0x7E, "vpermt2d",
+                  avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2Q  : avx512_perm_t_sizes<0x7E, "vpermt2q",
+                  avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMT2W  : avx512_perm_t_sizes_bw<0x7D, "vpermt2w",
+                  avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
+                  VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPERMT2B  : avx512_perm_t_sizes_bw<0x7D, "vpermt2b",
+                  avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
+                  EVEX_CD8<8, CD8VF>;
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
+                  avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
+                  avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  let hasSideEffects = 0 in
+  def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
+             []>, EVEX_4V;
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+                                (_.VT _.RC:$src2),
+                                (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
+  let hasSideEffects = 0 in
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_KZ;
+  let mayLoad = 1, hasSideEffects = 0 in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+             [(set _.RC:$dst, (vselect _.KRCWM:$mask,
+                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                 (_.VT _.RC:$src1)))]>,
+              EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+  let mayLoad = 1, hasSideEffects = 0 in
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+             (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr,
+             "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
+             []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
+  }
+}
+multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+  def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+      (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
+       !strconcat(OpcodeStr,
+            "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+            "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+      [(set _.RC:$dst,(vselect _.KRCWM:$mask,
+                        (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                        (_.VT _.RC:$src1)))]>,
+      EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+  let mayLoad = 1, hasSideEffects = 0 in
+  def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
+      (ins _.RC:$src1, _.ScalarMemOp:$src2),
+       !strconcat(OpcodeStr,
+            "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+            "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+      []>,  EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+
+}
+
+multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : avx512_blendmask      <opc, OpcodeStr, VTInfo.info512>,
+           avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
+                avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
+                avx512_blendmask_rmb  <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
+                         AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasBWI] in
+    defm Z : avx512_blendmask    <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasBWI, HasVLX] in {
+    defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+
+defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
+
+
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                            (v8f32 VR256X:$src2))),
+            (EXTRACT_SUBREG
+              (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+            (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+            (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                            (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG
+                (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
+
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
+
+  defm  rr_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                      (outs _.KRC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+                      "vcmp${cc}"#_.Suffix,
+                      "$src2, $src1", "$src1, $src2",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              imm:$cc)>, EVEX_4V;
+  defm  rm_Int  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+                    (outs _.KRC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+                    "vcmp${cc}"#_.Suffix,
+                    "$src2, $src1", "$src1, $src2",
+                    (OpNode (_.VT _.RC:$src1),
+                        (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                        imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+  defm  rrb_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                     (outs _.KRC:$dst),
+                     (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+                     "vcmp${cc}"#_.Suffix,
+                     "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                     (OpNodeRnd (_.VT _.RC:$src1),
+                                (_.VT _.RC:$src2),
+                                imm:$cc,
+                                (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    defm  rri_alt  : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+                        (outs VK1:$dst),
+                        (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                        "vcmp"#_.Suffix,
+                        "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+    defm  rmi_alt  : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+                        (outs _.KRC:$dst),
+                        (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+                        "vcmp"#_.Suffix,
+                        "$cc, $src2, $src1", "$src1, $src2, $cc">,
+                        EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+
+    defm  rrb_alt  : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+                       (outs _.KRC:$dst),
+                       (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                       "vcmp"#_.Suffix,
+                       "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
+                       EVEX_4V, EVEX_B;
+  }// let isAsmParserOnly = 1, hasSideEffects = 0
+
+  let isCodeGenOnly = 1 in {
+    let isCommutable = 1 in
+    def rr : AVX512Ii8<0xC2, MRMSrcReg,
+                (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
+                !strconcat("vcmp${cc}", _.Suffix,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+                                          _.FRC:$src2,
+                                          imm:$cc))],
+                IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+    def rm : AVX512Ii8<0xC2, MRMSrcMem,
+              (outs _.KRC:$dst),
+              (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+              !strconcat("vcmp${cc}", _.Suffix,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set _.KRC:$dst, (OpNode _.FRC:$src1,
+                                        (_.ScalarLdFrag addr:$src2),
+                                        imm:$cc))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+  }
+}
+
+let Predicates = [HasAVX512] in {
+  defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
+                                   AVX512XSIi8Base;
+  defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
+                                   AVX512XDIi8Base, VEX_W;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+              X86VectorVTInfo _, bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI<opc, MRMSrcReg,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                                     (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  def rrk : AVX512BI<opc, MRMSrcReg,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
+              IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+  def rmk : AVX512BI<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1),
+                                       (_.VT (bitconvert
+                                              (_.LdFrag addr:$src2))))))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+              X86VectorVTInfo _, bit IsCommutable> :
+           avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> {
+  def rmb : AVX512BI<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
+              !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
+                                    "|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                              (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+  def rmbk : AVX512BI<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+               [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                      (OpNode (_.VT _.RC:$src1),
+                                        (X86VBroadcast
+                                          (_.ScalarLdFrag addr:$src2)))))],
+               IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+}
+
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo, Predicate prd,
+                                 bit IsCommutable = 0> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512,
+                              IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256,
+                                   IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128,
+                                   IsCommutable>, EVEX_V128;
+  }
+}
+
+multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
+                                  Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
+                                  IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+                                       IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+                                       IsCommutable>, EVEX_V128;
+  }
+}
+
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
+                      avx512vl_i8_info, HasBWI, 1>,
+                EVEX_CD8<8, CD8VF>;
+
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
+                      avx512vl_i16_info, HasBWI, 1>,
+                EVEX_CD8<16, CD8VF>;
+
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
+                      avx512vl_i32_info, HasAVX512, 1>,
+                EVEX_CD8<32, CD8VF>;
+
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
+                      avx512vl_i64_info, HasAVX512, 1>,
+                T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+                      avx512vl_i8_info, HasBWI>,
+                EVEX_CD8<8, CD8VF>;
+
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+                      avx512vl_i16_info, HasBWI>,
+                EVEX_CD8<16, CD8VF>;
+
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+                      avx512vl_i32_info, HasAVX512>,
+                EVEX_CD8<32, CD8VF>;
+
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+                      avx512vl_i64_info, HasAVX512>,
+                T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPGTDZrr
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPEQDZrr
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+}
+
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
+                          X86VectorVTInfo _> {
+  let isCommutable = 1 in
+  def rri : AVX512AIi8<opc, MRMSrcReg,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                       imm:$cc))],
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rmi : AVX512AIi8<opc, MRMSrcMem,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                              (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                              imm:$cc))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  def rrik : AVX512AIi8<opc, MRMSrcReg,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+                                      AVX512ICC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                         "\t{$src2, $src1, $dst {${mask}}|",
+                         "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                  (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                          imm:$cc)))],
+              IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+  def rmik : AVX512AIi8<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+                                    AVX512ICC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                         "\t{$src2, $src1, $dst {${mask}}|",
+                         "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1),
+                                      (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                      imm:$cc)))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+               !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+                          "$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+    let mayLoad = 1 in
+    def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+               !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+                          "$dst, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+    def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+                                       u8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                          "\t{$cc, $src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+    let mayLoad = 1 in
+    def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+                                       u8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                          "\t{$cc, $src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+  }
+}
+
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
+                              X86VectorVTInfo _> :
+           avx512_icmp_cc<opc, Suffix, OpNode, _> {
+  def rmib : AVX512AIi8<opc, MRMSrcMem,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+                                     AVX512ICC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+                        "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                               (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                               imm:$cc))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+  def rmibk : AVX512AIi8<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2, AVX512ICC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                       "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                       "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                  (OpNode (_.VT _.RC:$src1),
+                                    (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                                    imm:$cc)))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
+    def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+                                       u8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                   "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+                   "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+    def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2, u8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                  "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+  }
+}
+
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
+                             AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
+                                AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+                EVEX_V128;
+  }
+}
+
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
+                                HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
+                                 HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
+                                HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
+                                 HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
+                                    HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
+                                     HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
+                                    HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
+                                     HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_vcmp_common<X86VectorVTInfo _> {
+
+  defm  rri  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                   (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
+                   "vcmp${cc}"#_.Suffix,
+                   "$src2, $src1", "$src1, $src2",
+                   (X86cmpm (_.VT _.RC:$src1),
+                         (_.VT _.RC:$src2),
+                           imm:$cc), 1>;
+
+  defm  rmi  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+                (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
+                "vcmp${cc}"#_.Suffix,
+                "$src2, $src1", "$src1, $src2",
+                (X86cmpm (_.VT _.RC:$src1),
+                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                        imm:$cc)>;
+
+  defm  rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
+                (outs _.KRC:$dst),
+                (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+                "vcmp${cc}"#_.Suffix,
+                "${src2}"##_.BroadcastStr##", $src1",
+                "$src1, ${src2}"##_.BroadcastStr,
+                (X86cmpm (_.VT _.RC:$src1),
+                        (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                        imm:$cc)>,EVEX_B;
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    defm  rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+                         (outs _.KRC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                         "vcmp"#_.Suffix,
+                         "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+    let mayLoad = 1 in {
+      defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+                             (outs _.KRC:$dst),
+                             (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
+                             "vcmp"#_.Suffix,
+                             "$cc, $src2, $src1", "$src1, $src2, $cc">;
+
+      defm  rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
+                         (outs _.KRC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
+                         "vcmp"#_.Suffix,
+                         "$cc, ${src2}"##_.BroadcastStr##", $src1",
+                         "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B;
+    }
+ }
+}
+
+multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
+  // comparison code form (VCMP[EQ/LT/LE/...]
+  defm  rrib  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
+                     (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
+                     "vcmp${cc}"#_.Suffix,
+                     "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                     (X86cmpmRnd (_.VT _.RC:$src1),
+                                    (_.VT _.RC:$src2),
+                                    imm:$cc,
+                                (i32 FROUND_NO_EXC))>, EVEX_B;
+
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    defm  rrib_alt  : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
+                         (outs _.KRC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+                         "vcmp"#_.Suffix,
+                         "$cc, {sae}, $src2, $src1",
+                         "$src1, $src2, {sae}, $cc">, EVEX_B;
+   }
+}
+
+multiclass avx512_vcmp<AVX512VLVectorVTInfo _> {
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_vcmp_common<_.info512>,
+                avx512_vcmp_sae<_.info512>, EVEX_V512;
+
+  }
+  let Predicates = [HasAVX512,HasVLX] in {
+   defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128;
+   defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256;
+  }
+}
+
+defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
+                          AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
+                          AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VCMPPSZrri
+            (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+            (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPDZrri
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPUDZrri
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+            (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+
+// ----------------------------------------------------------------
+// FPClass
+//handle fpclass instruction  mask =  op(reg_scalar,imm)
+//                                    op(mem_scalar,imm)
+multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 X86VectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in {
+      def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+                              (i32 imm:$src2)))], NoItinerary>;
+      def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+                      (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix#
+                      "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
+                                      (OpNode (_.VT _.RC:$src1),
+                                      (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+    let AddedComplexity = 20 in {
+      def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                      (ins _.MemOp:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix##
+                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set _.KRC:$dst,
+                            (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                    (i32 imm:$src2)))], NoItinerary>;
+      def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                      (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix##
+                      "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
+                          (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                              (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+    }
+  }
+}
+
+//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm)
+//                                  fpclass(reg_vec, mem_vec, imm)
+//                                  fpclass(reg_vec, broadcast(eltVt), imm)
+multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 X86VectorVTInfo _, string mem, string broadcast>{
+  def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
+                                       (i32 imm:$src2)))], NoItinerary>;
+  def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
+                      (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix#
+                      "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
+                                       (OpNode (_.VT _.RC:$src1),
+                                       (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+  def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.MemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##mem#
+                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set _.KRC:$dst,(OpNode
+                                     (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                     (i32 imm:$src2)))], NoItinerary>;
+  def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##mem#
+                    "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                    [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
+                                  (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                  (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+  def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+                                      _.BroadcastStr##", $dst|$dst, ${src1}"
+                                                  ##_.BroadcastStr##", $src2}",
+                    [(set _.KRC:$dst,(OpNode
+                                     (_.VT (X86VBroadcast
+                                           (_.ScalarLdFrag addr:$src1))),
+                                     (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+  def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
+                          _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
+                                                   _.BroadcastStr##", $src2}",
+                    [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
+                                     (_.VT (X86VBroadcast
+                                           (_.ScalarLdFrag addr:$src1))),
+                                     (i32 imm:$src2))))], NoItinerary>,
+                                                          EVEX_B, EVEX_K;
+}
+
+multiclass avx512_vector_fpclass_all<string OpcodeStr,
+            AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
+                                                              string broadcast>{
+  let Predicates = [prd] in {
+    defm Z    : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
+                                      broadcast>, EVEX_V512;
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
+                                      broadcast>, EVEX_V128;
+    defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
+                                      broadcast>, EVEX_V256;
+  }
+}
+
+multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
+             bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
+  defm PS : avx512_vector_fpclass_all<OpcodeStr,  avx512vl_f32_info, opcVec,
+                                      VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
+  defm PD : avx512_vector_fpclass_all<OpcodeStr,  avx512vl_f64_info, opcVec,
+                                      VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W;
+  defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+                                      f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
+  defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
+                                      f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
+                                      X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX;
+
+//-----------------------------------------------------------------
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+                         string OpcodeStr, RegisterClass KRC,
+                         ValueType vvt, X86MemOperand x86memop> {
+  let hasSideEffects = 0 in
+  def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+  def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+             [(set KRC:$dst, (vvt (load addr:$src)))]>;
+  def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+             [(store KRC:$src, addr:$dst)]>;
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+                             string OpcodeStr,
+                             RegisterClass KRC, RegisterClass GRC> {
+  let hasSideEffects = 0 in {
+    def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+    def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+  }
+}
+
+let Predicates = [HasDQI] in
+  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
+               avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
+               VEX, PD;
+
+let Predicates = [HasAVX512] in
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+               avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+               VEX, PS;
+
+let Predicates = [HasBWI] in {
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
+               VEX, PD, VEX_W;
+  defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+               VEX, XD;
+  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+               VEX, PS, VEX_W;
+  defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+               VEX, XD, VEX_W;
+}
+
+// GR from/to mask register
+def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+          (COPY_TO_REGCLASS GR16:$src, VK16)>;
+def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+          (COPY_TO_REGCLASS VK16:$src, GR16)>;
+
+def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+          (COPY_TO_REGCLASS GR8:$src, VK8)>;
+def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+          (COPY_TO_REGCLASS VK8:$src, GR8)>;
+
+def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+          (KMOVWrk VK16:$src)>;
+def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
+def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+
+def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
+          (COPY_TO_REGCLASS GR32:$src, VK32)>;
+def : Pat<(i32 (bitconvert (v32i1 VK32:$src))),
+          (COPY_TO_REGCLASS VK32:$src, GR32)>;
+def : Pat<(v64i1 (bitconvert (i64 GR64:$src))),
+          (COPY_TO_REGCLASS GR64:$src, VK64)>;
+def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
+          (COPY_TO_REGCLASS VK64:$src, GR64)>;
+
+// Load/store kreg
+let Predicates = [HasDQI] in {
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+            (KMOVBmk addr:$dst, VK8:$src)>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (KMOVBkm addr:$src)>;
+
+  def : Pat<(store VK4:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
+  def : Pat<(store VK2:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
+  def : Pat<(store VK1:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
+}
+let Predicates = [HasAVX512, NoDQI] in {
+  def : Pat<(store VK1:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK2:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK4:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK8:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+              sub_8bit))>;
+
+  def : Pat<(v8i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
+  def : Pat<(v2i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
+  def : Pat<(v4i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
+            (KMOVWmk addr:$dst, VK16:$src)>;
+  def : Pat<(i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
+  def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+            (KMOVWkm addr:$src)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
+            (KMOVDmk addr:$dst, VK32:$src)>;
+  def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+            (KMOVDkm addr:$src)>;
+  def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
+            (KMOVQmk addr:$dst, VK64:$src)>;
+  def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+            (KMOVQkm addr:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(i1 (trunc (i64 GR64:$src))),
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+                                        (i32 1))), VK1)>;
+
+  def : Pat<(i1 (trunc (i32 GR32:$src))),
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
+
+  def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
+            (COPY_TO_REGCLASS GR32:$src, VK1)>;
+
+  def : Pat<(i1 (trunc (i8 GR8:$src))),
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                          GR8:$src, sub_8bit), (i32 1))),
+       VK1)>;
+
+  def : Pat<(i1 (trunc (i16 GR16:$src))),
+       (COPY_TO_REGCLASS
+        (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                          GR16:$src, sub_16bit), (i32 1))),
+       VK1)>;
+
+  def : Pat<(i32 (zext VK1:$src)),
+            (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+
+  def : Pat<(i32 (anyext VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, GR32)>;
+
+  def : Pat<(i8 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk
+                        (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+
+  def : Pat<(i8 (anyext VK1:$src)),
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
+
+  def : Pat<(i64 (zext VK1:$src)),
+            (AND64ri8 (SUBREG_TO_REG (i64 0),
+             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+
+  def : Pat<(i64 (anyext VK1:$src)),
+            (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+             (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
+
+  def : Pat<(i16 (zext VK1:$src)),
+            (EXTRACT_SUBREG
+             (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
+              sub_16bit)>;
+
+  def : Pat<(i16 (anyext VK1:$src)),
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
+}
+def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK16)>;
+def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK8)>;
+def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK4)>;
+def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK2)>;
+def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK32)>;
+def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+          (COPY_TO_REGCLASS VK1:$src, VK64)>;
+
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
+def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
+def : Pat<(i1 (X86Vextract VK8:$src,  (iPTR 0))), (COPY_TO_REGCLASS VK8:$src,  VK1)>;
+def : Pat<(i1 (X86Vextract VK4:$src,  (iPTR 0))), (COPY_TO_REGCLASS VK4:$src,  VK1)>;
+def : Pat<(i1 (X86Vextract VK2:$src,  (iPTR 0))), (COPY_TO_REGCLASS VK2:$src,  VK1)>;
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+                            RegisterClass KRC, SDPatternOperator OpNode,
+                            Predicate prd> {
+  let Predicates = [prd] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
+                                SDPatternOperator OpNode> {
+  defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                            HasDQI>, VEX, PD;
+  defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                            HasAVX512>, VEX, PS;
+  defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                            HasBWI>, VEX, PD, VEX_W;
+  defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                            HasBWI>, VEX, PS, VEX_W;
+}
+
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
+
+multiclass avx512_mask_unop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+                (i16 GR16:$src)),
+              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+              (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
+}
+defm : avx512_mask_unop_int<"knot", "KNOT">;
+
+// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
+let Predicates = [HasAVX512, NoDQI] in
+def : Pat<(vnot VK8:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+def : Pat<(vnot VK4:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
+def : Pat<(vnot VK2:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
+
+// Mask binary operation
+// - KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+                           RegisterClass KRC, SDPatternOperator OpNode,
+                           Predicate prd, bit IsCommutable> {
+  let Predicates = [prd], isCommutable = IsCommutable in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator OpNode, bit IsCommutable,
+                               Predicate prdW = HasAVX512> {
+  defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                             HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+  defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                             prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+  defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                             HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+  defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                             HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+// These nodes use 'vnot' instead of 'not' to support vectors.
+def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
+def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
+
+defm KAND  : avx512_mask_binop_all<0x41, "kand",  and,   1>;
+defm KOR   : avx512_mask_binop_all<0x45, "kor",   or,    1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>;
+defm KXOR  : avx512_mask_binop_all<0x47, "kxor",  xor,   1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
+defm KADD  : avx512_mask_binop_all<0x4A, "kadd",  add,   1, HasDQI>;
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
+                (i16 GR16:$src1), (i16 GR16:$src2)),
+              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
+              (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
+              (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
+}
+
+defm : avx512_mask_binop_int<"kand",  "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor",   "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor",  "KXOR">;
+
+multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
+                            Instruction Inst> {
+  // With AVX512F, 8-bit mask is promoted to 16-bit mask,
+  // for the DQI set, this type is legal and KxxxB instruction is used
+  let Predicates = [NoDQI] in
+  def : Pat<(VOpNode VK8:$src1, VK8:$src2),
+            (COPY_TO_REGCLASS
+              (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+                    (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+
+  // All types smaller than 8 bits require conversion anyway
+  def : Pat<(OpNode VK1:$src1, VK1:$src2),
+        (COPY_TO_REGCLASS (Inst
+                           (COPY_TO_REGCLASS VK1:$src1, VK16),
+                           (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
+  def : Pat<(VOpNode VK2:$src1, VK2:$src2),
+        (COPY_TO_REGCLASS (Inst
+                           (COPY_TO_REGCLASS VK2:$src1, VK16),
+                           (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>;
+  def : Pat<(VOpNode VK4:$src1, VK4:$src2),
+        (COPY_TO_REGCLASS (Inst
+                           (COPY_TO_REGCLASS VK4:$src1, VK16),
+                           (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>;
+}
+
+defm : avx512_binop_pat<and,   and,  KANDWrr>;
+defm : avx512_binop_pat<vandn, andn, KANDNWrr>;
+defm : avx512_binop_pat<or,    or,   KORWrr>;
+defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor,   xor,  KXORWrr>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
+                             RegisterClass KRCSrc, Predicate prd> {
+  let Predicates = [prd] in {
+    let hasSideEffects = 0 in
+    def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
+               (ins KRC:$src1, KRC:$src2),
+               "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               VEX_4V, VEX_L;
+
+    def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
+              (!cast<Instruction>(NAME##rr)
+                        (COPY_TO_REGCLASS KRCSrc:$src2, KRC),
+                        (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
+  }
+}
+
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
+
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                              SDNode OpNode, Predicate prd> {
+  let Predicates = [prd], Defs = [EFLAGS] in
+    def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+               [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                Predicate prdW = HasAVX512> {
+  defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>,
+                                                                VEX, PD;
+  defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>,
+                                                                VEX, PS;
+  defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>,
+                                                                VEX, PS, VEX_W;
+  defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>,
+                                                                VEX, PD, VEX_W;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST   : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                             SDNode OpNode> {
+  let Predicates = [HasAVX512] in
+    def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
+                 !strconcat(OpcodeStr,
+                            "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+                            [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+                               SDNode OpNode> {
+  defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                               VEX, TAPD, VEX_W;
+  let Predicates = [HasDQI] in
+  defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
+                               VEX, TAPD;
+  let Predicates = [HasBWI] in {
+  defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
+                               VEX, TAPD, VEX_W;
+  defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
+                               VEX, TAPD;
+  }
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+  let Predicates = [HasAVX512] in
+    let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+      def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+                     [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+  defm B : avx512_mask_setop<VK8,   v8i1, Val>;
+  defm W : avx512_mask_setop<VK16, v16i1, Val>;
+  defm D : avx512_mask_setop<VK32,  v32i1, Val>;
+  defm Q : avx512_mask_setop<VK64, v64i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+  def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
+  def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+  def : Pat<(v8i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK8)>;
+  def : Pat<(v4i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK4)>;
+  def : Pat<(v2i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK2)>;
+  def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+  def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+  def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+}
+
+// Patterns for kmask insert_subvector/extract_subvector to/from index=0
+multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
+                                             RegisterClass RC, ValueType VT> {
+  def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+            (subVT (COPY_TO_REGCLASS RC:$src, subRC))>;
+
+  def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+            (VT (COPY_TO_REGCLASS subRC:$src, RC))>;
+}
+
+defm : operation_subvector_mask_lowering<VK2,  v2i1,  VK4,  v4i1>;
+defm : operation_subvector_mask_lowering<VK2,  v2i1,  VK8,  v8i1>;
+defm : operation_subvector_mask_lowering<VK2,  v2i1,  VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK2,  v2i1,  VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK2,  v2i1,  VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK4,  v4i1,  VK8,  v8i1>;
+defm : operation_subvector_mask_lowering<VK4,  v4i1,  VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK4,  v4i1,  VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK4,  v4i1,  VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK8,  v8i1,  VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK8,  v8i1,  VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK8,  v8i1,  VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
+
+defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
+
+def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
+          (v2i1 (COPY_TO_REGCLASS
+                  (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
+                   VK2))>;
+def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))),
+          (v4i1 (COPY_TO_REGCLASS
+                  (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
+                   VK4))>;
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
+          (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
+def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
+          (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
+
+
+// Patterns for kmask shift
+multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
+  def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))),
+            (VT (COPY_TO_REGCLASS
+                   (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
+                               (I8Imm $imm)),
+                   RC))>;
+  def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))),
+            (VT (COPY_TO_REGCLASS
+                   (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
+                               (I8Imm $imm)),
+                   RC))>;
+}
+
+defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>;
+defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>;
+defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+
+multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         PatFrag ld_frag, PatFrag mload,
+                         SDPatternOperator SelectOprr = vselect> {
+  let hasSideEffects = 0 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+                    _.ExeDomain>, EVEX;
+  def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+                      (ins _.KRCWM:$mask,  _.RC:$src),
+                      !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+                       "${dst} {${mask}} {z}, $src}"),
+                       [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+                                           (_.VT _.RC:$src),
+                                           _.ImmAllZerosV)))], _.ExeDomain>,
+                       EVEX, EVEX_KZ;
+
+  let canFoldAsLoad = 1, isReMaterializable = 1,
+      SchedRW = [WriteLoad] in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
+                    _.ExeDomain>, EVEX;
+
+  let Constraints = "$src0 = $dst" in {
+  def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+                    (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+                    !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+                    "${dst} {${mask}}, $src1}"),
+                    [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
+                                        (_.VT _.RC:$src1),
+                                        (_.VT _.RC:$src0))))], _.ExeDomain>,
+                     EVEX, EVEX_K;
+    let SchedRW = [WriteLoad] in
+    def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+                     (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
+                     !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+                      "${dst} {${mask}}, $src1}"),
+                     [(set _.RC:$dst, (_.VT
+                         (vselect _.KRCWM:$mask,
+                          (_.VT (bitconvert (ld_frag addr:$src1))),
+                           (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
+  }
+  let SchedRW = [WriteLoad] in
+  def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
+                  (ins _.KRCWM:$mask, _.MemOp:$src),
+                  OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
+                                "${dst} {${mask}} {z}, $src}",
+                  [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+                    (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
+                  _.ExeDomain>, EVEX, EVEX_KZ;
+  }
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+
+  def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0,
+             _.KRCWM:$mask, addr:$ptr)>;
+}
+
+multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
+                                  AVX512VLVectorVTInfo _,
+                                  Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
+                       masked_load_aligned512>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+  defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
+                          masked_load_aligned256>, EVEX_V256;
+  defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
+                          masked_load_aligned128>, EVEX_V128;
+  }
+}
+
+multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
+                                  AVX512VLVectorVTInfo _,
+                                  Predicate prd,
+                                  SDPatternOperator SelectOprr = vselect> {
+  let Predicates = [prd] in
+  defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
+                       masked_load_unaligned, SelectOprr>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+  defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
+                         masked_load_unaligned, SelectOprr>, EVEX_V256;
+  defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
+                         masked_load_unaligned, SelectOprr>, EVEX_V128;
+  }
+}
+
+multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                        PatFrag st_frag, PatFrag mstore> {
+
+  let hasSideEffects = 0 in {
+  def rr_REV  : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
+                         OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
+                         [], _.ExeDomain>, EVEX;
+  def rrk_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
+                         (ins _.KRCWM:$mask, _.RC:$src),
+                         OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
+                         "${dst} {${mask}}, $src}",
+                         [], _.ExeDomain>,  EVEX, EVEX_K;
+  def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),
+                          (ins _.KRCWM:$mask, _.RC:$src),
+                          OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
+                          "${dst} {${mask}} {z}, $src}",
+                          [], _.ExeDomain>, EVEX, EVEX_KZ;
+  }
+
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;
+  def mrk : AVX512PI<opc, MRMDestMem, (outs),
+                     (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
+               [], _.ExeDomain>, EVEX, EVEX_K;
+
+  def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
+           (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
+                                                    _.KRCWM:$mask, _.RC:$src)>;
+}
+
+
+multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
+                            AVX512VLVectorVTInfo _, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
+                        masked_store_unaligned>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
+                             masked_store_unaligned>, EVEX_V256;
+    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
+                             masked_store_unaligned>, EVEX_V128;
+  }
+}
+
+multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
+                                  AVX512VLVectorVTInfo _,  Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
+                        masked_store_aligned512>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
+                             masked_store_aligned256>, EVEX_V256;
+    defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
+                             masked_store_aligned128>, EVEX_V128;
+  }
+}
+
+defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
+                                     HasAVX512>,
+               avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
+                                      HasAVX512>,  PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
+                                     HasAVX512>,
+               avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
+                                     HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
+                              null_frag>,
+               avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
+                              PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
+                              null_frag>,
+               avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
+               PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
+                                       HasAVX512>,
+                 avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
+                                       HasAVX512>, PD, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
+                                       HasAVX512>,
+                 avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
+                                    HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
+                 avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
+                                 HasBWI>, XD, EVEX_CD8<8, CD8VF>;
+
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
+                 avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
+                                 HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
+                                null_frag>,
+                 avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
+                                 HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
+                                null_frag>,
+                 avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
+                                 HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// Special instructions to help with spilling when we don't have VLX. We need
+// to load or store from a ZMM register instead. These are converted in
+// expandPostRAPseudos.
+let isReMaterializable = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+                            "", []>;
+def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+                            "", []>;
+def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+                            "", []>;
+def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+                            "", []>;
+}
+
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+                            "", []>;
+def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+                            "", []>;
+def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+                            "", []>;
+def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+                            "", []>;
+}
+
+def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 VR512:$src))),
+   (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+                                              VK8), VR512:$src)>;
+
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
+                           (v16i32 VR512:$src))),
+                  (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
+
+// These patterns exist to prevent the above patterns from introducing a second
+// mask inversion when one already exists.
+def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
+                          (bc_v8i64 (v16i32 immAllZerosV)),
+                          (v8i64 VR512:$src))),
+                 (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
+def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
+                           (v16i32 immAllZerosV),
+                           (v16i32 VR512:$src))),
+                  (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+
+let Predicates = [HasVLX, NoBWI] in {
+  // 128-bit load/store without BWI.
+  def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
+            (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
+            (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
+            (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
+            (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
+
+  // 256-bit load/store without BWI.
+  def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
+            (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
+            (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
+            (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
+            (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+  // Special patterns for storing subvector extracts of lower 128-bits of 256.
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits of 512.
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
+
+  // Special patterns for storing subvector extracts of lower 256-bits of 512.
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore256 (v4f64 (extract_subvector
+                                     (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(alignedstore (v8f32 (extract_subvector
+                                  (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(alignedstore256 (v4i64 (extract_subvector
+                                     (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(alignedstore256 (v8i32 (extract_subvector
+                                     (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(alignedstore256 (v16i16 (extract_subvector
+                                      (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(alignedstore256 (v32i8 (extract_subvector
+                                     (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+
+  def : Pat<(store (v4f64 (extract_subvector
+                           (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(store (v8f32 (extract_subvector
+                           (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(store (v4i64 (extract_subvector
+                           (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(store (v8i32 (extract_subvector
+                           (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(store (v16i16 (extract_subvector
+                            (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+  def : Pat<(store (v32i8 (extract_subvector
+                           (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
+     (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+}
+
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        EVEX;
+def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                        [(set VR128X:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, EVEX, VEX_W;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}", []>,
+                      EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
+                       "vmovq\t{$src, $dst|$dst, $src}",
+                       [(set FR64X:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
+                         "vmovq\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64X:$src))],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
+                         "vmovq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                         EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr  : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, EVEX;
+
+def VMOVDI2SSZrm  : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                       "vmovd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                       EVEX;
+def VMOVPDI2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128X:$src),
+                       "vmovd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (extractelt (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                                                   (iPTR 0)))],
+                      IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+                      Requires<[HasAVX512, In64BitMode]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+                      Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
+                      (ins i64mem:$dst, VR128X:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+                              addr:$dst)], IIC_SSE_MOVDQ>,
+                      EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
+                      Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+let hasSideEffects = 0 in
+def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
+                             (ins VR128X:$src),
+                             "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+                             EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr  : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32X:$src))],
+                      IIC_SSE_MOVD_ToGP>, EVEX;
+def VMOVSS2DIZmr  : AVX512BI<0x7E, MRMDestMem, (outs),
+                      (ins i32mem:$dst, FR32X:$src),
+                      "vmovd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512  MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar<string asm, SDNode OpNode,
+                              X86VectorVTInfo _> {
+  def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.RC:$src1, _.FRC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
+                                    (scalar_to_vector _.FRC:$src2))))],
+             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+  def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+              (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+              !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
+              "$dst {${mask}} {z}, $src1, $src2}"),
+              [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+                                      (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                                      _.ImmAllZerosV)))],
+              _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
+  let Constraints = "$src0 = $dst"  in
+  def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
+             (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
+             "$dst {${mask}}, $src1, $src2}"),
+             [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
+                                     (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                                     (_.VT _.RC:$src0))))],
+             _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
+  let canFoldAsLoad = 1, isReMaterializable = 1 in
+  def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
+             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+             [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
+             _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+  let mayLoad = 1, hasSideEffects = 0 in {
+    let Constraints = "$src0 = $dst" in
+    def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+               (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
+               !strconcat(asm, "\t{$src, $dst {${mask}}|",
+               "$dst {${mask}}, $src}"),
+               [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K;
+    def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
+               (ins _.KRCWM:$mask, _.ScalarMemOp:$src),
+               !strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
+               "$dst {${mask}} {z}, $src}"),
+               [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ;
+  }
+  def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
+             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+             [(store _.FRC:$src, addr:$dst)],  _.ExeDomain, IIC_SSE_MOV_S_MR>,
+             EVEX;
+  let mayStore = 1, hasSideEffects = 0 in
+  def mrk: AVX512PI<0x11, MRMDestMem, (outs),
+              (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
+              !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+              [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
+}
+
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+                                  VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
+
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+                                  VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
+                                       PatLeaf ZeroFP, X86VectorVTInfo _> {
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+                        (_.VT (scalar_to_vector
+                                  (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+                                                       (_.EltVT _.FRC:$src1),
+                                                       (_.EltVT _.FRC:$src2))))))),
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
+                                          (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
+                                          (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+                                          (_.VT _.RC:$src0),
+                                          (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+                            _.RC)>;
+
+def : Pat<(_.VT (OpNode _.RC:$src0,
+                        (_.VT (scalar_to_vector
+                                  (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+                                                       (_.EltVT _.FRC:$src1),
+                                                       (_.EltVT ZeroFP))))))),
+          (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
+                                          (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+                                          (_.VT _.RC:$src0),
+                                          (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+                            _.RC)>;
+
+}
+
+multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+                                        dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+             (_.info512.VT (insert_subvector undef,
+                               (_.info256.VT (insert_subvector undef,
+                                                 (_.info128.VT _.info128.RC:$src),
+                                                 (i64 0))),
+                               (i64 0)))),
+          (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+                      (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+                      (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
+multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
+                                       dag Mask, RegisterClass MaskRC> {
+
+def : Pat<(_.info128.VT (extract_subvector
+                         (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                                        (_.info512.VT (bitconvert
+                                                       (v16i32 immAllZerosV))))),
+                           (i64 0))),
+          (!cast<Instruction>(InstrStr#rmkz)
+                      (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+                      addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+                (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                      (_.info512.VT (insert_subvector undef,
+                            (_.info256.VT (insert_subvector undef,
+                                  (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+                                  (i64 0))),
+                            (i64 0))))),
+                (i64 0))),
+          (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+                      (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+                      addr:$srcAddr)>;
+
+}
+
+defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
+defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
+
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
+defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
+defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+
+def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
+          (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+           VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
+          (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+           VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+
+def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
+let hasSideEffects = 0 in
+defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
+                           (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+                           "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
+                           XS, EVEX_4V, VEX_LIG;
+
+let hasSideEffects = 0 in
+defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+                           (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+                           "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
+                           XD, EVEX_4V, VEX_LIG, VEX_W;
+
+let Predicates = [HasAVX512] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+            (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+            (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+            (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+            (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+  }
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4f32 (V_SET0)),
+              (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4i32 (V_SET0)),
+              (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4f32 (V_SET0)),
+              (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4i32 (V_SET0)),
+              (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 512-bit types
+  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+                                            FR32X:$src)), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+                                     FR64X:$src)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4i32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+  def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4f32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+                                (ins VR128X:$src),
+                                "vmovq\t{$src, $dst|$dst, $src}",
+                                [(set VR128X:$dst, (v2i64 (X86vzmovl
+                                                   (v2i64 VR128X:$src))))],
+                                IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let Predicates = [HasAVX512] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIZrr GR32:$src)>;
+
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (VMOV64toPQIZrr GR64:$src)>;
+
+    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                                 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+              (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+    def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+                                 (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+              (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+  }
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzload addr:$src)),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v8i32 (X86vzload addr:$src)),
+              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+              (VMOVQI2PQIZrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+              (VMOVZPQILo2PQIZrr VR128X:$src)>;
+    def : Pat<(v2i64 (X86vzload addr:$src)),
+              (VMOVQI2PQIZrm addr:$src)>;
+    def : Pat<(v4i64 (X86vzload addr:$src)),
+              (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+  }
+
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+                                (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+
+  // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
+  def : Pat<(v16i32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Non-temporals
+//===----------------------------------------------------------------------===//
+let SchedRW = [WriteLoad] in {
+  def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+                        (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
+                        SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+                        EVEX_CD8<64, CD8VF>;
+
+  let Predicates = [HasVLX] in {
+    def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+                         (ins i256mem:$src),
+                         "vmovntdqa\t{$src, $dst|$dst, $src}",
+                         [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
+                         SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+                         EVEX_CD8<64, CD8VF>;
+
+    def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+                        (ins i128mem:$src),
+                        "vmovntdqa\t{$src, $dst|$dst, $src}",
+                        [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
+                        SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+                        EVEX_CD8<64, CD8VF>;
+  }
+}
+
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                        PatFrag st_frag = alignednontemporalstore,
+                        InstrItinClass itin = IIC_SSE_MOVNT> {
+  let SchedRW = [WriteStore], AddedComplexity = 400 in
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (_.VT _.RC:$src), addr:$dst)],
+                    _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
+                                                  AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasAVX512] in
+    defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
+
+let Predicates = [HasAVX512], AddedComplexity = 400 in {
+  def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
+            (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+  def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst),
+            (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+  def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
+            (VMOVNTDQZmr addr:$dst, VR512:$src)>;
+
+  def : Pat<(v8f64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v16f32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v8i64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZrm addr:$src)>;
+}
+
+let Predicates = [HasVLX], AddedComplexity = 400 in {
+  def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst),
+            (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst),
+            (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+  def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
+            (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
+
+  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ256rm addr:$src)>;
+
+  def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
+            (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
+            (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+  def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
+            (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
+
+  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
+            (VMOVNTDQAZ128rm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           X86VectorVTInfo _, OpndItins itins,
+                           bit IsCommutable = 0> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                    "$src2, $src1", "$src1, $src2",
+                    (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                    itins.rr, IsCommutable>,
+            AVX512BIBase, EVEX_4V;
+
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1,
+                                (bitconvert (_.LdFrag addr:$src2)))),
+                  itins.rm>,
+            AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _, OpndItins itins,
+                            bit IsCommutable = 0> :
+           avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                  "${src2}"##_.BroadcastStr##", $src1",
+                  "$src1, ${src2}"##_.BroadcastStr,
+                  (_.VT (OpNode _.RC:$src1,
+                                (X86VBroadcast
+                                    (_.ScalarLdFrag addr:$src2)))),
+                  itins.rm>,
+             AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+                              Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+                             IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+                             IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+                             IsCommutable>, EVEX_V128;
+  }
+}
+
+multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+                               Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+                             IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+                             IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+                             IsCommutable>, EVEX_V128;
+  }
+}
+
+multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+                               itins, prd, IsCommutable>,
+                               VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+                               itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+                              itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
+                              itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+                                 SDNode OpNode, OpndItins itins, Predicate prd,
+                                 bit IsCommutable = 0> {
+  defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+                                   IsCommutable>;
+
+  defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+                                   IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+                                 SDNode OpNode, OpndItins itins, Predicate prd,
+                                 bit IsCommutable = 0> {
+  defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
+                                   IsCommutable>;
+
+  defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
+                                   IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+                                  bits<8> opc_d, bits<8> opc_q,
+                                  string OpcodeStr, SDNode OpNode,
+                                  OpndItins itins, bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+                                    itins, HasAVX512, IsCommutable>,
+              avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+                                    itins, HasBWI, IsCommutable>;
+}
+
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
+                            SDNode OpNode,X86VectorVTInfo _Src,
+                            X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
+                            bit IsCommutable = 0> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+                            (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+                            "$src2, $src1","$src1, $src2",
+                            (_Dst.VT (OpNode
+                                         (_Src.VT _Src.RC:$src1),
+                                         (_Src.VT _Src.RC:$src2))),
+                            itins.rr, IsCommutable>,
+                            AVX512BIBase, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                        (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+                        "$src2, $src1", "$src1, $src2",
+                        (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+                                      (bitconvert (_Src.LdFrag addr:$src2)))),
+                        itins.rm>,
+                        AVX512BIBase, EVEX_4V;
+
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                    (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
+                    OpcodeStr,
+                    "${src2}"##_Brdct.BroadcastStr##", $src1",
+                     "$src1, ${src2}"##_Brdct.BroadcastStr,
+                    (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+                                 (_Brdct.VT (X86VBroadcast
+                                          (_Brdct.ScalarLdFrag addr:$src2)))))),
+                    itins.rm>,
+                    AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
+                                    SSE_INTALU_ITINS_P, 1>;
+defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
+                                    SSE_INTALU_ITINS_P, 0>;
+defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
+                                    SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
+                                    SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
+                                     SSE_INTALU_ITINS_P, HasBWI, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
+                                    SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
+                                    SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
+                                    SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
+defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P,
+                                    HasBWI, 1>;
+defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
+                                     HasBWI, 1>;
+defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
+                                      HasBWI, 1>, T8PD;
+defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
+                                   SSE_INTALU_ITINS_P, HasBWI, 1>;
+
+multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
+                            AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
+                            SDNode OpNode, Predicate prd,  bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+                                 _SrcVTInfo.info512, _DstVTInfo.info512,
+                                 v8i64_info, IsCommutable>,
+                                  EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
+  let Predicates = [HasVLX, prd] in {
+    defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+                                      _SrcVTInfo.info256, _DstVTInfo.info256,
+                                      v4i64x_info, IsCommutable>,
+                                      EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
+    defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
+                                      _SrcVTInfo.info128, _DstVTInfo.info128,
+                                      v2i64x_info, IsCommutable>,
+                                     EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
+  }
+}
+
+defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
+                                avx512vl_i32_info, avx512vl_i64_info,
+                                X86pmuldq, HasAVX512, 1>,T8PD;
+defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
+                                avx512vl_i32_info, avx512vl_i64_info,
+                                X86pmuludq, HasAVX512, 1>;
+defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
+                                avx512vl_i8_info, avx512vl_i8_info,
+                                X86multishift, HasVBMI, 0>, T8PD;
+
+multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                    (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
+                    OpcodeStr,
+                    "${src2}"##_Src.BroadcastStr##", $src1",
+                     "$src1, ${src2}"##_Src.BroadcastStr,
+                    (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
+                                 (_Src.VT (X86VBroadcast
+                                          (_Src.ScalarLdFrag addr:$src2))))))>,
+                    EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
+}
+
+multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,X86VectorVTInfo _Src,
+                            X86VectorVTInfo _Dst, bit IsCommutable = 0> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
+                            (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+                            "$src2, $src1","$src1, $src2",
+                            (_Dst.VT (OpNode
+                                         (_Src.VT _Src.RC:$src1),
+                                         (_Src.VT _Src.RC:$src2))),
+                            NoItinerary, IsCommutable>,
+                            EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                        (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+                        "$src2, $src1", "$src1, $src2",
+                        (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
+                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                         EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+}
+
+multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
+                                    SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
+                                 v32i16_info>,
+                avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
+                                 v32i16_info>, EVEX_V512;
+  let Predicates = [HasBWI, HasVLX] in {
+    defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
+                                     v16i16x_info>,
+                     avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
+                                     v16i16x_info>, EVEX_V256;
+    defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
+                                     v8i16x_info>,
+                     avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
+                                     v8i16x_info>, EVEX_V128;
+  }
+}
+multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
+                                v64i8_info>, EVEX_V512;
+  let Predicates = [HasBWI, HasVLX] in {
+    defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
+                                    v32i8x_info>, EVEX_V256;
+    defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
+                                    v16i8x_info>, EVEX_V128;
+  }
+}
+
+multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode, AVX512VLVectorVTInfo _Src,
+                            AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
+  let Predicates = [HasBWI] in
+  defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
+                                _Dst.info512, IsCommutable>, EVEX_V512;
+  let Predicates = [HasBWI, HasVLX] in {
+    defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
+                                     _Dst.info256, IsCommutable>, EVEX_V256;
+    defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
+                                     _Dst.info128, IsCommutable>, EVEX_V128;
+  }
+}
+
+defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase;
+defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, AVX5128IBase;
+defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase;
+defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
+
+defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
+                     avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+defm VPMADDWD   : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
+                     avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase;
+
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasDQI, NoVLX] in {
+  def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+            (EXTRACT_SUBREG
+                (VPMULLQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+             sub_ymm)>;
+
+  def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG
+                (VPMULLQZrr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+             sub_xmm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Logical Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           X86VectorVTInfo _, OpndItins itins,
+                           bit IsCommutable = 0> {
+  defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                    "$src2, $src1", "$src1, $src2",
+                    (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+                                     (bitconvert (_.VT _.RC:$src2)))),
+                    (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+                                                       _.RC:$src2)))),
+                    itins.rr, IsCommutable>,
+            AVX512BIBase, EVEX_4V;
+
+  defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+                                   (bitconvert (_.LdFrag addr:$src2)))),
+                  (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+                                     (bitconvert (_.LdFrag addr:$src2)))))),
+                  itins.rm>,
+            AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _, OpndItins itins,
+                            bit IsCommutable = 0> :
+           avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+  defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                  "${src2}"##_.BroadcastStr##", $src1",
+                  "$src1, ${src2}"##_.BroadcastStr,
+                  (_.i64VT (OpNode _.RC:$src1,
+                                   (bitconvert
+                                    (_.VT (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2)))))),
+                  (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+                                     (bitconvert
+                                      (_.VT (X86VBroadcast
+                                             (_.ScalarLdFrag addr:$src2)))))))),
+                  itins.rm>,
+             AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+                               Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+                             IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+                             IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+                             IsCommutable>, EVEX_V128;
+  }
+}
+
+multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+                               itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+                               itins, prd, IsCommutable>,
+                               VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+                                 SDNode OpNode, OpndItins itins, Predicate prd,
+                                 bit IsCommutable = 0> {
+  defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
+                                IsCommutable>;
+
+  defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
+                                IsCommutable>;
+}
+
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 0>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  FP arithmetic
+//===----------------------------------------------------------------------===//
+multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                         SDNode OpNode, SDNode VecNode, OpndItins itins,
+                         bit IsCommutable> {
+  let ExeDomain = _.ExeDomain in {
+  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                           "$src2, $src1", "$src1, $src2",
+                           (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                           (i32 FROUND_CURRENT)),
+                           itins.rr>;
+
+  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (VecNode (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                           (i32 FROUND_CURRENT)),
+                         itins.rm>;
+  let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+  def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.FRC:$src2),
+                          OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+                          itins.rr> {
+    let isCommutable = IsCommutable;
+  }
+  def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+                         (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+  }
+  }
+}
+
+multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                         SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
+  let ExeDomain = _.ExeDomain in
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                          (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+                          "$rc, $src2, $src1", "$src1, $src2, $rc",
+                          (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                          (i32 imm:$rc)), itins.rr, IsCommutable>,
+                          EVEX_B, EVEX_RC;
+}
+multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                         SDNode VecNode, OpndItins itins, bit IsCommutable> {
+  let ExeDomain = _.ExeDomain in
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                            "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                            (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SDNode VecNode,
+                                  SizeItins itins, bit IsCommutable> {
+  defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+                              itins.s, IsCommutable>,
+             avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
+                              itins.s, IsCommutable>,
+                              XS, EVEX_4V, VEX_LIG,  EVEX_CD8<32, CD8VT1>;
+  defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+                              itins.d,                  IsCommutable>,
+             avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
+                              itins.d, IsCommutable>,
+                              XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SDNode VecNode,
+                                  SizeItins itins, bit IsCommutable> {
+  defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
+                              itins.s, IsCommutable>,
+             avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
+                              itins.s, IsCommutable>,
+                              XS, EVEX_4V, VEX_LIG,  EVEX_CD8<32, CD8VT1>;
+  defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
+                              itins.d,                  IsCommutable>,
+             avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
+                              itins.d, IsCommutable>,
+                              XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+}
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae  <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae  <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+
+// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// X86fminc and X86fmaxc instead of X86fmin and X86fmax
+multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
+                          X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
+  let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+  def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.FRC:$src2),
+                          OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+                          itins.rr> {
+    let isCommutable = 1;
+  }
+  def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+                         (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+  }
+}
+defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
+                                SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+                                EVEX_CD8<32, CD8VT1>;
+
+defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
+                                SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+                                EVEX_CD8<64, CD8VT1>;
+
+defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
+                                SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
+                                EVEX_CD8<32, CD8VT1>;
+
+defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
+                                SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
+                                EVEX_CD8<64, CD8VT1>;
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+                            X86VectorVTInfo _, OpndItins itins,
+                            bit IsCommutable> {
+  let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+  defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
+                  IsCommutable>, EVEX_4V;
+  let mayLoad = 1 in {
+    defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+                    "$src2, $src1", "$src1, $src2",
+                    (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
+                    EVEX_4V;
+    defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                     (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                     "${src2}"##_.BroadcastStr##", $src1",
+                     "$src1, ${src2}"##_.BroadcastStr,
+                     (OpNode  _.RC:$src1, (_.VT (X86VBroadcast
+                                                (_.ScalarLdFrag addr:$src2)))),
+                     itins.rm>, EVEX_4V, EVEX_B;
+    }
+  }
+}
+
+multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+                                  X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
+  defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+                  "$rc, $src2, $src1", "$src1, $src2, $rc",
+                  (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
+                  EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+
+multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
+                                X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
+  defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                  "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                  (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
+                  EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+                             Predicate prd, SizeItins itins,
+                             bit IsCommutable = 0> {
+  let Predicates = [prd] in {
+  defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+                              itins.s, IsCommutable>, EVEX_V512, PS,
+                              EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+                              itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+  }
+
+    // Define only if AVX512VL feature is present.
+  let Predicates = [prd, HasVLX] in {
+    defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+                                   itins.s, IsCommutable>, EVEX_V128, PS,
+                                   EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+                                   itins.s, IsCommutable>, EVEX_V256, PS,
+                                   EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+                                   itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
+                                   EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+                                   itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
+                                   EVEX_CD8<64, CD8VF>;
+  }
+}
+
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+  defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+                              EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+                              EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
+  defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+                              EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+                              EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
+}
+
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
+                              SSE_ALU_ITINS_P, 1>,
+            avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
+                              SSE_MUL_ITINS_P, 1>,
+            avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
+            avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
+            avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
+                              SSE_ALU_ITINS_P, 0>,
+            avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
+                              SSE_ALU_ITINS_P, 0>,
+            avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
+let isCodeGenOnly = 1 in {
+  defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
+                                 SSE_ALU_ITINS_P, 1>;
+  defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
+                                 SSE_ALU_ITINS_P, 1>;
+}
+defm VAND  : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
+                               SSE_ALU_ITINS_P, 1>;
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
+                               SSE_ALU_ITINS_P, 0>;
+defm VOR   : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
+                               SSE_ALU_ITINS_P, 1>;
+defm VXOR  : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
+                               SSE_ALU_ITINS_P, 1>;
+
+// Patterns catch floating point selects with bitcasted integer logic ops.
+multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
+                                      X86VectorVTInfo _, Predicate prd> {
+let Predicates = [prd] in {
+  // Masked register-register logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, _.RC:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+             _.RC:$src2)>;
+  // Masked register-memory logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (_.i64VT (OpNode _.RC:$src1,
+                                         (load addr:$src2)))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+             addr:$src2)>;
+  // Register-broadcast logical operations.
+  def : Pat<(_.i64VT (OpNode _.RC:$src1,
+                      (bitconvert (_.VT (X86VBroadcast
+                                         (_.ScalarLdFrag addr:$src2)))))),
+            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (_.i64VT (OpNode _.RC:$src1,
+                              (bitconvert (_.VT
+                                           (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2))))))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (_.i64VT (OpNode _.RC:$src1,
+                              (bitconvert (_.VT
+                                           (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2))))))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmbkz)  _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+}
+}
+
+multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
+  defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
+  defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
+  defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
+  defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
+  defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
+  defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
+}
+
+defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
+defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
+defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
+defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
+
+let Predicates = [HasVLX,HasDQI] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
+            (COPY_TO_REGCLASS (VANDPDZ128rr
+                               (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+  def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
+            (COPY_TO_REGCLASS (VORPDZ128rr
+                               (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+  def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
+            (COPY_TO_REGCLASS (VXORPDZ128rr
+                               (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+  def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
+            (COPY_TO_REGCLASS (VANDNPDZ128rr
+                               (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
+
+  def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
+            (COPY_TO_REGCLASS (VANDPSZ128rr
+                               (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+  def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
+            (COPY_TO_REGCLASS (VORPSZ128rr
+                               (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+  def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
+            (COPY_TO_REGCLASS (VXORPSZ128rr
+                               (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+  def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
+            (COPY_TO_REGCLASS (VANDNPSZ128rr
+                               (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+                               (COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
+}
+
+multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
+  defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
+  defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                   "${src2}"##_.BroadcastStr##", $src1",
+                   "$src1, ${src2}"##_.BroadcastStr,
+                   (OpNode  _.RC:$src1, (_.VT (X86VBroadcast
+                                              (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
+                   EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
+  defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (OpNode _.RC:$src1,
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                          (i32 FROUND_CURRENT))>;
+}
+
+multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
+  defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
+             avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
+                              EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
+             avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
+                              EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>,
+                avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
+                              EVEX_4V,EVEX_CD8<32, CD8VT1>;
+  defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>,
+                avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
+                              EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
+                                   EVEX_V128, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>,
+                                   EVEX_V256, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>,
+                                   EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>,
+                                   EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+  }
+}
+defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  let isCommutable = 1 in
+  defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
+                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
+                    EVEX_4V;
+  defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (OpNode (_.VT _.RC:$src1),
+                    (_.VT (bitconvert (_.LdFrag addr:$src2))))>,
+                    EVEX_4V,
+                   EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                    "${src2}"##_.BroadcastStr##", $src1",
+                    "$src1, ${src2}"##_.BroadcastStr,
+                    (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
+                                                (_.ScalarLdFrag addr:$src2))))>,
+                    EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
+                                  X86VectorVTInfo _, string Suffix> {
+    def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+              (_.KVT (COPY_TO_REGCLASS
+                       (!cast<Instruction>(NAME # Suffix # "Zrr")
+                         (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                        _.RC:$src1, _.SubRegIdx),
+                         (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                        _.RC:$src2, _.SubRegIdx)),
+                     _.KRC))>;
+}
+
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _, string Suffix> {
+  let Predicates  = [HasAVX512] in
+  defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
+           avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+  defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>,
+              avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
+              avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+  }
+  let Predicates = [HasAVX512, NoVLX] in {
+  defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
+  defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
+  }
+}
+
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
+                                 avx512vl_i32_info, "D">;
+  defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
+                                 avx512vl_i64_info, "Q">, VEX_W;
+}
+
+multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
+                                 SDNode OpNode> {
+  let Predicates = [HasBWI] in {
+  defm WZ:    avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>,
+              EVEX_V512, VEX_W;
+  defm BZ:    avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>,
+              EVEX_V512;
+  }
+  let Predicates = [HasVLX, HasBWI] in {
+
+  defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>,
+              EVEX_V256, VEX_W;
+  defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>,
+              EVEX_V128, VEX_W;
+  defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>,
+              EVEX_V256;
+  defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
+              EVEX_V128;
+  }
+
+  let Predicates = [HasAVX512, NoVLX] in {
+  defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
+  defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
+  defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
+  defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
+  }
+
+}
+
+multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
+                                   SDNode OpNode> :
+  avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>,
+  avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>;
+
+defm VPTESTM   : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
+defm VPTESTNM  : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
+
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                         string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
+                   SSE_INTSHIFT_ITINS_P.rr>;
+  defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+                   (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                          (i8 imm:$src2))),
+                   SSE_INTSHIFT_ITINS_P.rm>;
+  }
+}
+
+multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
+                         string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
+  defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
+                   (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
+      "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
+     (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
+     SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+   // src2 is always 128-bit
+  let ExeDomain = _.ExeDomain in {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, VR128X:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
+                   SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
+                   SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
+                   EVEX_4V;
+  }
+}
+
+multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  ValueType SrcVT, PatFrag bc_frag,
+                                  AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z    : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+                            VTInfo.info512>, EVEX_V512,
+                            EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
+  let Predicates = [prd, HasVLX] in {
+  defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+                            VTInfo.info256>, EVEX_V256,
+                            EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
+  defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+                            VTInfo.info128>, EVEX_V128,
+                            EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
+  }
+}
+
+multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
+                              string OpcodeStr, SDNode OpNode> {
+  defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
+                                 avx512vl_i32_info, HasAVX512>;
+  defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
+                                 avx512vl_i64_info, HasAVX512>, VEX_W;
+  defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16,
+                                 avx512vl_i16_info, HasBWI>;
+}
+
+multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                                 string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasAVX512] in
+  defm Z:    avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>, EVEX_V512;
+  let Predicates = [HasAVX512, HasVLX] in {
+  defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>, EVEX_V256;
+  defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info128>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info128>, EVEX_V128;
+  }
+}
+
+multiclass avx512_shift_rmi_w<bits<8> opcw,
+                                 Format ImmFormR, Format ImmFormM,
+                                 string OpcodeStr, SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm WZ:    avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                               v32i16_info>, EVEX_V512;
+  let Predicates = [HasVLX, HasBWI] in {
+  defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                               v16i16x_info>, EVEX_V256;
+  defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                               v8i16x_info>, EVEX_V128;
+  }
+}
+
+multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
+                                 Format ImmFormR, Format ImmFormM,
+                                 string OpcodeStr, SDNode OpNode> {
+  defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
+                                 avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+  defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
+                                 avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
+             avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
+             avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
+             avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
+
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V;
+
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
+                   SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode _.RC:$src1,
+                   (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+                   SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
+                   EVEX_CD8<_.EltSize, CD8VF>;
+  }
+}
+
+multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                    "${src2}"##_.BroadcastStr##", $src1",
+                    "$src1, ${src2}"##_.BroadcastStr,
+                    (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+                                                (_.ScalarLdFrag addr:$src2))))),
+                    SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
+                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+}
+multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _> {
+  let Predicates  = [HasAVX512] in
+  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+           avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+  defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+              avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+              avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+  }
+}
+
+multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
+                                 SDNode OpNode> {
+  defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
+                                 avx512vl_i32_info>;
+  defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
+                                 avx512vl_i64_info>, VEX_W;
+}
+
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
+  let Predicates = [HasBWI, NoVLX] in {
+  def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
+                                  (_.info256.VT _.info256.RC:$src2))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME#"WZrr")
+                    (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+                    (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+             sub_ymm)>;
+
+  def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
+                                  (_.info128.VT _.info128.RC:$src2))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME#"WZrr")
+                    (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+                    (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+             sub_xmm)>;
+  }
+}
+
+multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
+                                 SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm WZ:    avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
+              EVEX_V512, VEX_W;
+  let Predicates = [HasVLX, HasBWI] in {
+
+  defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
+              EVEX_V256, VEX_W;
+  defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
+              EVEX_V128, VEX_W;
+  }
+}
+
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
+              avx512_var_shift_w<0x12, "vpsllvw", shl>,
+              avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
+              avx512_var_shift_w<0x11, "vpsravw", sra>,
+              avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
+              avx512_var_shift_w<0x10, "vpsrlvw", srl>,
+              avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+
+// Special handing for handling VPSRAV intrinsics.
+multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
+                                         list<Predicate> p> {
+  let Predicates = p in {
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
+               _.RC:$src2)>;
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
+               _.RC:$src1, addr:$src2)>;
+    let AddedComplexity = 20 in {
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
+               _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     _.RC:$src0)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
+               _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+    }
+    let AddedComplexity = 30 in {
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
+               _.RC:$src1, _.RC:$src2)>;
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     _.ImmAllZerosV)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
+               _.RC:$src1, addr:$src2)>;
+    }
+  }
+}
+
+multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
+                                         list<Predicate> p> :
+           avx512_var_shift_int_lowering<InstrStr, _, p> {
+  let Predicates = p in {
+    def : Pat<(_.VT (X86vsrav _.RC:$src1,
+                     (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
+               _.RC:$src1, addr:$src2)>;
+    let AddedComplexity = 20 in
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1,
+                      (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+                     _.RC:$src0)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
+               _.KRC:$mask, _.RC:$src1, addr:$src2)>;
+    let AddedComplexity = 30 in
+    def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                     (X86vsrav _.RC:$src1,
+                      (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
+                     _.ImmAllZerosV)),
+              (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
+               _.RC:$src1, addr:$src2)>;
+  }
+}
+
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
+defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
+defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
+
+//===-------------------------------------------------------------------===//
+// 1-src variable permutation VPERMW/D/Q
+//===-------------------------------------------------------------------===//
+multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _> {
+  let Predicates  = [HasAVX512] in
+  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+           avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in
+  defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+              avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+}
+
+multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                                 string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasAVX512] in
+  defm Z:    avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>, EVEX_V512;
+  let Predicates = [HasAVX512, HasVLX] in
+  defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>, EVEX_V256;
+}
+
+multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
+                              Predicate prd, SDNode OpNode,
+                              AVX512VLVectorVTInfo _> {
+  let Predicates = [prd] in
+  defm Z:    avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+              EVEX_V512 ;
+  let Predicates = [HasVLX, prd] in {
+  defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+              EVEX_V256 ;
+  defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+              EVEX_V128 ;
+  }
+}
+
+defm VPERMW  : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
+                                  avx512vl_i16_info>, VEX_W;
+defm VPERMB  : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
+                                  avx512vl_i8_info>;
+
+defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
+                                    avx512vl_i32_info>;
+defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
+                                    avx512vl_i64_info>, VEX_W;
+defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
+                                    avx512vl_f32_info>;
+defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
+                                    avx512vl_f64_info>, VEX_W;
+
+defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
+                             X86VPermi, avx512vl_i64_info>,
+                             EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
+                             X86VPermi, avx512vl_f64_info>,
+                             EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERMIL
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr,  SDNode OpNode,
+                             X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+  defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1,
+                               (Ctrl.VT Ctrl.RC:$src2)))>,
+                  T8PD, EVEX_4V;
+  defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode
+                           _.RC:$src1,
+                           (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+                  T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+  defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
+                   (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                   "${src2}"##_.BroadcastStr##", $src1",
+                   "$src1, ${src2}"##_.BroadcastStr,
+                   (_.VT (OpNode
+                            _.RC:$src1,
+                            (Ctrl.VT (X86VBroadcast
+                                       (Ctrl.ScalarLdFrag addr:$src2)))))>,
+                   T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
+                             AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
+                                  Ctrl.info512>, EVEX_V512;
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
+                                  Ctrl.info128>, EVEX_V128;
+    defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
+                                  Ctrl.info256>, EVEX_V256;
+  }
+}
+
+multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
+                         AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+
+  defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+  defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
+                                    X86VPermilpi, _>,
+                    EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
+                               avx512vl_i32_info>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
+                               avx512vl_i64_info>, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
+//===----------------------------------------------------------------------===//
+
+defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
+                             X86PShufd, avx512vl_i32_info>,
+                             EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
+defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
+                                  X86PShufhw>, EVEX, AVX512XSIi8Base;
+defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
+                                  X86PShuflw>, EVEX, AVX512XDIi8Base;
+
+multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  let Predicates = [HasBWI] in
+  defm Z:    avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
+
+  let Predicates = [HasVLX, HasBWI] in {
+  defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256;
+  defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128;
+  }
+}
+
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+           IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+          IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+            (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS Instructions
+// All patterns was taken from SSS implementation.
+//===----------------------------------------------------------------------===//
+multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  X86VectorVTInfo _> {
+  def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
+                  (ins _.RC:$src1, f64mem:$src2),
+                  !strconcat(OpcodeStr,
+                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set _.RC:$dst,
+                     (OpNode _.RC:$src1,
+                       (_.VT (bitconvert
+                         (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
+                  IIC_SSE_MOV_LH>, EVEX_4V;
+}
+
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
+                                  v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
+                                  v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
+                                  v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
+defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
+                                  v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
+
+let Predicates = [HasAVX512] in {
+  // VMOVHPS patterns
+  def : Pat<(X86Movlhps VR128X:$src1,
+               (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+          (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128X:$src1,
+               (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+          (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
+  // VMOVHPD patterns
+  def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+                    (scalar_to_vector (loadf64 addr:$src2)))),
+           (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
+                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+           (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
+  // VMOVLPS patterns
+  def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+          (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
+          (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
+  // VMOVLPD patterns
+  def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+          (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
+          (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1,
+                           (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+          (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
+}
+
+def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
+                       (ins f64mem:$dst, VR128X:$src),
+                       "vmovhps\t{$src, $dst|$dst, $src}",
+                       [(store (f64 (extractelt
+                                     (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
+                                                (bc_v2f64 (v4f32 VR128X:$src))),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+                       EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
+                       (ins f64mem:$dst, VR128X:$src),
+                       "vmovhpd\t{$src, $dst|$dst, $src}",
+                       [(store (f64 (extractelt
+                                     (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
+                       EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
+                       (ins f64mem:$dst, VR128X:$src),
+                       "vmovlps\t{$src, $dst|$dst, $src}",
+                       [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
+                                     (iPTR 0))), addr:$dst)],
+                                     IIC_SSE_MOV_LH>,
+                       EVEX, EVEX_CD8<32, CD8VT2>;
+def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
+                       (ins f64mem:$dst, VR128X:$src),
+                       "vmovlpd\t{$src, $dst|$dst, $src}",
+                       [(store (f64 (extractelt (v2f64 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)],
+                                     IIC_SSE_MOV_LH>,
+                       EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
+
+let Predicates = [HasAVX512] in {
+  // VMOVHPD patterns
+  def : Pat<(store (f64 (extractelt
+                           (v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
+                           (iPTR 0))), addr:$dst),
+           (VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
+  // VMOVLPS patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
+                   addr:$src1),
+            (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
+            (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
+  // VMOVLPD patterns
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+                   addr:$src1),
+            (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
+                   addr:$src1),
+            (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
+}
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+
+multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+         AVX512FMA3Base;
+
+  defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.MemOp:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+          AVX512FMA3Base;
+
+  defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+            (ins _.RC:$src2, _.ScalarMemOp:$src3),
+            OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
+            !strconcat("$src2, ${src3}", _.BroadcastStr ),
+            (OpNode _.RC:$src2,
+             _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+            AVX512FMA3Base, EVEX_B;
+  }
+
+  // Additional pattern for folding broadcast nodes in other orders.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src1, _.RC:$src2,
+                    (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+  defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+          OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
+          AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                   SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+                                   string Suff> {
+  let Predicates = [HasAVX512] in {
+    defm Z      : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+                  avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+                      Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+  }
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+                      EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+    defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+                      EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+  }
+}
+
+multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              SDNode OpNodeRnd > {
+    defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+                                      avx512vl_f32_info, "PS">;
+    defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+                                      avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD213    : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB213    : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213   : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213   : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+
+
+multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+         AVX512FMA3Base;
+
+  defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.MemOp:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+         AVX512FMA3Base;
+
+  defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+         (ins _.RC:$src2, _.ScalarMemOp:$src3),
+         OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+         "$src2, ${src3}"##_.BroadcastStr,
+         (_.VT (OpNode _.RC:$src2,
+                      (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+                      _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
+  }
+
+  // Additional patterns for folding broadcast nodes in other orders.
+  def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                   _.RC:$src2, _.RC:$src1)),
+            (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1,
+             _.RC:$src2, addr:$src3)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+  defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+          OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
+          AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                   SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+                                   string Suff> {
+  let Predicates = [HasAVX512] in {
+    defm Z      : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+                  avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+                      Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+  }
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+                      EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+    defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+                      EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+  }
+}
+
+multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              SDNode OpNodeRnd > {
+    defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+                                      avx512vl_f32_info, "PS">;
+    defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+                                      avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD231    : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB231    : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231   : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231   : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+
+multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
+         AVX512FMA3Base;
+
+  defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.MemOp:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>,
+         AVX512FMA3Base;
+
+  defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+         (ins _.RC:$src2, _.ScalarMemOp:$src3),
+         OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
+         "$src2, ${src3}"##_.BroadcastStr,
+         (_.VT (OpNode _.RC:$src1,
+                      (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+                      _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B;
+  }
+
+  // Additional patterns for folding broadcast nodes in other orders.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src1, _.RC:$src2),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
+}
+
+multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 X86VectorVTInfo _, string Suff> {
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+  defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+          OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+          (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>,
+          AVX512FMA3Base, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                   SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
+                                   string Suff> {
+  let Predicates = [HasAVX512] in {
+    defm Z      : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
+                  avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
+                      Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+  }
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
+                      EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+    defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
+                      EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+  }
+}
+
+multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              SDNode OpNodeRnd > {
+    defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
+                                      avx512vl_f32_info, "PS">;
+    defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
+                                      avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD132    : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
+defm VFMSUB132    : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132   : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132   : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                               dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
+                                                        dag RHS_r, dag RHS_m > {
+  defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3), OpcodeStr,
+          "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
+
+  defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+          "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
+
+  defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+         (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
+         OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>,
+                                       AVX512FMA3Base, EVEX_B, EVEX_RC;
+
+  let isCodeGenOnly = 1, isCommutable = 1 in {
+    def r     : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
+                     (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
+                     !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                     [RHS_r]>;
+    def m     : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
+                    (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
+                    !strconcat(OpcodeStr,
+                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [RHS_m]>;
+  }// isCodeGenOnly = 1
+}
+}// Constraints = "$src1 = $dst"
+
+multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+                            string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                            SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+
+  defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
+                // Operands for intrinsic are in 123 order to preserve passthu
+                // semantics.
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
+                         (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
+                         (i32 imm:$rc))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+                         _.FRC:$src3))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
+                         (_.ScalarLdFrag addr:$src3))))>;
+
+  defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
+                (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds3 _.RC:$src2,
+                       (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+                              _.RC:$src1, (i32 FROUND_CURRENT))),
+                (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
+                                  (i32 imm:$rc))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
+                                          _.FRC:$src1))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
+                            (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
+
+  defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1,
+                       (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+                              _.RC:$src2, (i32 FROUND_CURRENT))),
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
+                         (i32 imm:$rc))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
+                         _.FRC:$src2))),
+                (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
+                          (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+}
+
+multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
+                        string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+                        SDNode OpNodeRnds3> {
+  let Predicates = [HasAVX512] in {
+    defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+                                 OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+                                 EVEX_CD8<32, CD8VT1>, VEX_LIG;
+    defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
+                                 OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+                                 EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
+  }
+}
+
+defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
+                            X86FmaddRnds3>;
+defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
+                            X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
+                            X86FnmaddRnds1, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
+                            X86FnmsubRnds1, X86FnmsubRnds3>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
+//===----------------------------------------------------------------------===//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                            X86VectorVTInfo _> {
+  defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.RC:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+         AVX512FMA3Base;
+
+  defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+          (ins _.RC:$src2, _.MemOp:$src3),
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+          AVX512FMA3Base;
+
+  defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+            (ins _.RC:$src2, _.ScalarMemOp:$src3),
+            OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
+            !strconcat("$src2, ${src3}", _.BroadcastStr ),
+            (OpNode _.RC:$src1,
+             _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+            AVX512FMA3Base, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                     AVX512VLVectorVTInfo _> {
+  let Predicates = [HasIFMA] in {
+    defm Z      : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>,
+                      EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
+  }
+  let Predicates = [HasVLX, HasIFMA] in {
+    defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>,
+                      EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
+    defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>,
+                      EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
+  }
+}
+
+defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
+                                  avx512vl_i64_info>, VEX_W;
+defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
+                                  avx512vl_i64_info>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+                    X86VectorVTInfo DstVT, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm> {
+  let hasSideEffects = 0 in {
+    def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
+              (ins DstVT.FRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+    let mayLoad = 1 in
+      def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
+              (ins DstVT.FRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+  } // hasSideEffects = 0
+  let isCodeGenOnly = 1 in {
+    def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+                  (ins DstVT.RC:$src1, SrcRC:$src2),
+                  !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set DstVT.RC:$dst,
+                        (OpNode (DstVT.VT DstVT.RC:$src1),
+                                 SrcRC:$src2,
+                                 (i32 FROUND_CURRENT)))]>, EVEX_4V;
+
+    def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
+                  (ins DstVT.RC:$src1, x86memop:$src2),
+                  !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set DstVT.RC:$dst,
+                        (OpNode (DstVT.VT DstVT.RC:$src1),
+                                 (ld_frag addr:$src2),
+                                 (i32 FROUND_CURRENT)))]>, EVEX_4V;
+  }//isCodeGenOnly = 1
+}
+
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+                    X86VectorVTInfo DstVT, string asm> {
+  def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
+              (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
+              !strconcat(asm,
+                  "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"),
+              [(set DstVT.RC:$dst,
+                    (OpNode (DstVT.VT DstVT.RC:$src1),
+                             SrcRC:$src2,
+                             (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+                    X86VectorVTInfo DstVT, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm> {
+  defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>,
+              avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>,
+                        VEX_LIG;
+}
+
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+                                 v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
+                                 XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+                                 v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
+                                 XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ  : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+                                 v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
+                                 XD, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+                                 v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
+                                 XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+              (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+              (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+          (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+          (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+          (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+          (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ   : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
+                                  v4f32x_info, i32mem, loadi32,
+                                  "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+                                  v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
+                                  XS, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
+                                  i32mem, loadi32, "cvtusi2sd{l}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+                                  v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
+                                  XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+              (VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+              (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
+                                  X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
+  let Predicates = [HasAVX512] in {
+    def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
+                !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+                [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
+                EVEX, VEX_LIG;
+    def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
+                !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
+                [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
+                EVEX, VEX_LIG, EVEX_B, EVEX_RC;
+    def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+                !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+                [(set DstVT.RC:$dst, (OpNode
+                      (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+                      (i32 FROUND_CURRENT)))]>,
+                EVEX, VEX_LIG;
+  } // Predicates = [HasAVX512]
+}
+
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
+                                   X86cvts2si, "cvtss2si">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
+                                   X86cvts2si, "cvtss2si">,
+                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
+                                   X86cvts2usi, "cvtss2usi">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
+                                   X86cvts2usi, "cvtss2usi">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
+                                   X86cvts2si, "cvtsd2si">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
+                                   X86cvts2si, "cvtsd2si">,
+                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ:   avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
+                                   X86cvts2usi, "cvtsd2usi">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
+                                   X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+
+// The SSE version of these instructions are disabled for AVX512.
+// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
+let Predicates = [HasAVX512] in {
+  def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
+            (VCVTSS2SIZrr VR128X:$src)>;
+  def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
+            (VCVTSS2SIZrm addr:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
+            (VCVTSS2SI64Zrr VR128X:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
+            (VCVTSS2SI64Zrm addr:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
+            (VCVTSD2SIZrr VR128X:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
+            (VCVTSD2SIZrm addr:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
+            (VCVTSD2SI64Zrr VR128X:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
+            (VCVTSD2SI64Zrm addr:$src)>;
+} // HasAVX512
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
+            (VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
+  def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
+            (VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
+  def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
+            (VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
+  def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
+            (VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
+  def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
+            (VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+  def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
+            (VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+  def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
+            (VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
+  def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
+            (VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
+  def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
+            (VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
+  def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
+            (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
+} // Predicates = [HasAVX512]
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
+                            X86VectorVTInfo _DstRC, SDNode OpNode,
+                            SDNode OpNodeRnd, string aliasStr>{
+let Predicates = [HasAVX512] in {
+  def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
+  let hasSideEffects = 0 in
+  def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+                !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+                []>, EVEX, EVEX_B;
+  def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
+              EVEX;
+
+  def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+          (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+  def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
+          (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+  def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
+          (!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
+                                          _SrcRC.ScalarMemOp:$src), 0>;
+
+  let isCodeGenOnly = 1 in {
+    def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+             [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+                                   (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
+    def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+              !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
+              [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
+                                    (i32 FROUND_NO_EXC)))]>,
+                                    EVEX,VEX_LIG , EVEX_B;
+    let mayLoad = 1, hasSideEffects = 0 in
+      def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
+                  (ins _SrcRC.MemOp:$src),
+                  !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+                  []>, EVEX, VEX_LIG;
+
+  } // isCodeGenOnly = 1
+} //HasAVX512
+}
+
+
+defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
+                        fp_to_sint, X86cvtts2IntRnd, "{l}">,
+                        XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
+                        fp_to_sint, X86cvtts2IntRnd, "{q}">,
+                        VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
+                        fp_to_sint, X86cvtts2IntRnd, "{l}">,
+                        XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
+                        fp_to_sint, X86cvtts2IntRnd, "{q}">,
+                        VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
+                        fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+                        XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
+                        fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+                        XS,VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
+                        fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+                        XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
+                        fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+                        XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+let Predicates = [HasAVX512] in {
+  def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
+            (VCVTTSS2SIZrr_Int VR128X:$src)>;
+  def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
+            (VCVTTSS2SIZrm_Int addr:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
+            (VCVTTSS2SI64Zrr_Int VR128X:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
+            (VCVTTSS2SI64Zrm_Int addr:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
+            (VCVTTSD2SIZrr_Int VR128X:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
+            (VCVTTSD2SIZrm_Int addr:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
+            (VCVTTSD2SI64Zrr_Int VR128X:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
+            (VCVTTSD2SI64Zrm_Int addr:$src)>;
+} // HasAVX512
+//===----------------------------------------------------------------------===//
+// AVX-512  Convert form float to double and back
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNode> {
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (_.VT (OpNode (_.VT _.RC:$src1),
+                                       (_Src.VT _Src.RC:$src2),
+                                       (i32 FROUND_CURRENT)))>,
+                         EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (_.VT (OpNode (_.VT _.RC:$src1),
+                                  (_Src.VT (scalar_to_vector
+                                            (_Src.ScalarLdFrag addr:$src2))),
+                                  (i32 FROUND_CURRENT)))>,
+                         EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Scalar Coversion with SAE - suppress all exceptions
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
+                        "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                        (_.VT (OpNodeRnd (_.VT _.RC:$src1),
+                                         (_Src.VT _Src.RC:$src2),
+                                         (i32 FROUND_NO_EXC)))>,
+                        EVEX_4V, VEX_LIG, EVEX_B;
+}
+
+// Scalar Conversion with rounding control (RC)
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
+                        "$rc, $src2, $src1", "$src1, $src2, $rc",
+                        (_.VT (OpNodeRnd (_.VT _.RC:$src1),
+                                         (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
+                        EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+                        EVEX_B, EVEX_RC;
+}
+multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNodeRnd, X86VectorVTInfo _src,
+                                                        X86VectorVTInfo _dst> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+             avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
+                               OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+  }
+}
+
+multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
+                                    SDNode OpNodeRnd, X86VectorVTInfo _src,
+                                                          X86VectorVTInfo _dst> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+             avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+             EVEX_CD8<32, CD8VT1>, XS;
+  }
+}
+defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
+                                         X86froundRnd, f64x_info, f32x_info>;
+defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
+                                          X86fpextRnd,f32x_info, f64x_info >;
+
+def : Pat<(f64 (fpextend FR32X:$src)),
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
+                               (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+          Requires<[HasAVX512]>;
+def : Pat<(f64 (fpextend (loadf32 addr:$src))),
+          (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+          Requires<[HasAVX512]>;
+
+def : Pat<(f64 (extloadf32 addr:$src)),
+      (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+      Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(f64 (extloadf32 addr:$src)),
+          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
+                    (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+          Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fpround FR64X:$src)),
+          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
+                    (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+           Requires<[HasAVX512]>;
+//===----------------------------------------------------------------------===//
+// AVX-512  Vector convert from signed/unsigned integer to float/double
+//          and from float/double to signed/unsigned integer
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNode,
+                         string Broadcast = _.BroadcastStr,
+                         string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
+
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _Src.RC:$src), OpcodeStr, "$src", "$src",
+                         (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
+
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
+                         (_.VT (OpNode (_Src.VT
+                             (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
+
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _Src.ScalarMemOp:$src), OpcodeStr,
+                         "${src}"##Broadcast, "${src}"##Broadcast,
+                         (_.VT (OpNode (_Src.VT
+                                  (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+                            ))>, EVEX, EVEX_B;
+}
+// Coversion with SAE - suppress all exceptions
+multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+  defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _Src.RC:$src), OpcodeStr,
+                        "{sae}, $src", "$src, {sae}",
+                        (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
+                               (i32 FROUND_NO_EXC)))>,
+                        EVEX, EVEX_B;
+}
+
+// Conversion with rounding control (RC)
+multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+  defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
+                        "$rc, $src", "$src, $rc",
+                        (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
+                        EVEX, EVEX_B, EVEX_RC;
+}
+
+// Extend Float to Double
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
+             avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
+                                X86vfpextRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
+                               X86vfpext, "{1to2}", "", f64mem>, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
+                                     EVEX_V256;
+  }
+}
+
+// Truncate Double to Float
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
+                               X86vfproundRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
+                               X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
+                               "{1to4}", "{y}">, EVEX_V256;
+
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+  }
+}
+
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
+                                  VEX_W, PD, EVEX_CD8<64, CD8VF>;
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
+                                  PS, EVEX_CD8<32, CD8VH>;
+
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+
+let Predicates = [HasVLX] in {
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+            (VCVTPD2PSZ128rr VR128X:$src)>;
+  def : Pat<(v2f64 (extloadv2f32 addr:$src)),
+              (VCVTPS2PDZ128rm addr:$src)>;
+  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+              (VCVTPS2PDZ256rm addr:$src)>;
+}
+
+// Convert Signed/Unsigned Doubleword to Double
+multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           SDNode OpNode128> {
+  // No rounding in this op
+  let Predicates = [HasAVX512] in
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
+                                     EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
+                                     OpNode128, "{1to2}", "", i64mem>, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
+                                     EVEX_V256;
+  }
+}
+
+// Convert Signed/Unsigned Doubleword to Float
+multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           SDNode OpNodeRnd> {
+  let Predicates = [HasAVX512] in
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
+                               OpNodeRnd>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
+                                     EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
+                                     EVEX_V256;
+  }
+}
+
+// Convert Float to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+             avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
+                                OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+                                     EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+                                     EVEX_V256;
+  }
+}
+
+// Convert Float to Signed/Unsigned Doubleword
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
+                                OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
+                                     EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
+                                     EVEX_V256;
+  }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            SDNode OpNode128, SDNode OpNodeRnd> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+             avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+                                OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+    // memory forms of these instructions in Asm Parser. They have the same
+    // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+    // due to the same reason.
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+                               OpNode128, "{1to2}", "{x}">, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+                               "{1to4}", "{y}">, EVEX_V256;
+
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+  }
+}
+
+// Convert Double to Signed/Unsigned Doubleword
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasAVX512] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasVLX] in {
+    // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+    // memory forms of these instructions in Asm Parcer. They have the same
+    // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+    // due to the same reason.
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
+                               "{1to2}", "{x}">, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+                               "{1to4}", "{y}">, EVEX_V256;
+
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
+  }
+}
+
+// Convert Double to Signed/Unsigned Quardword
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+                               EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+                               EVEX_V256;
+  }
+}
+
+// Convert Double to Signed/Unsigned Quardword with truncation
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+             avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
+                               EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
+                               EVEX_V256;
+  }
+}
+
+// Convert Signed/Unsigned Quardword to Double
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
+                               EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
+                               EVEX_V256;
+  }
+}
+
+// Convert Float to Signed/Unsigned Quardword
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    // Explicitly specified broadcast string, since we take only 2 elements
+    // from v4f32x_info source
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
+                               "{1to2}", "", f64mem>, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+                               EVEX_V256;
+  }
+}
+
+// Convert Float to Signed/Unsigned Quardword with truncation
+multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            SDNode OpNode128, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+             avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    // Explicitly specified broadcast string, since we take only 2 elements
+    // from v4f32x_info source
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
+                               "{1to2}", "", f64mem>, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
+                               EVEX_V256;
+  }
+}
+
+// Convert Signed/Unsigned Quardword to Float
+multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SDNode OpNode128, SDNode OpNodeRnd> {
+  let Predicates = [HasDQI] in {
+    defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
+             avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
+                               OpNodeRnd>, EVEX_V512;
+  }
+  let Predicates = [HasDQI, HasVLX] in {
+    // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+    // memory forms of these instructions in Asm Parcer. They have the same
+    // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+    // due to the same reason.
+    defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
+                               "{1to2}", "{x}">, EVEX_V128;
+    defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
+                               "{1to4}", "{y}">, EVEX_V256;
+
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+    def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+    def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+                    (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
+  }
+}
+
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>,
+                                XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
+                                X86VSintToFpRnd>,
+                                PS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
+                                X86cvttp2siRnd>,
+                                XS, EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
+                                 X86cvttp2siRnd>,
+                                 PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
+                                 X86cvttp2uiRnd>, PS,
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
+                                 X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>,
+                                 XS, EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
+                                 X86VUintToFpRnd>, XD,
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
+                                 X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
+
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
+                                 X86cvtp2IntRnd>, XD, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
+                                 X86cvtp2UIntRnd>,
+                                 PS, EVEX_CD8<32, CD8VF>;
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
+                                 X86cvtp2UIntRnd>, VEX_W,
+                                 PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
+                                 X86cvtp2IntRnd>, VEX_W,
+                                 PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
+                                 X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
+                                 X86cvtp2UIntRnd>, VEX_W,
+                                 PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
+                                 X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
+                                 X86cvttp2siRnd>, VEX_W,
+                                 PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
+                                 X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
+                                 X86cvttp2uiRnd>, VEX_W,
+                                 PD, EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
+                                 X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>;
+
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
+                            X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
+                            X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
+                            X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
+                            X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+           (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+           (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                 VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
+          (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                 VR128X:$src, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+           (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                 VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
+           (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                 VR128X:$src1, sub_xmm)))), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512, HasVLX] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
+              (VCVTPD2DQZ128rr VR128X:$src)>;
+    def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))),
+              (VCVTPD2UDQZ128rr VR128X:$src)>;
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
+              (VCVTTPD2DQZ128rr VR128X:$src)>;
+    def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))),
+              (VCVTTPD2UDQZ128rr VR128X:$src)>;
+  }
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
+            (VCVTPD2PSZrm addr:$src)>;
+  def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+}
+
+let Predicates = [HasDQI, HasVLX] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(X86vzmovl (v2f64 (bitconvert
+                                (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
+              (VCVTQQ2PSZ128rr VR128X:$src)>;
+    def : Pat<(X86vzmovl (v2f64 (bitconvert
+                                (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
+              (VCVTUQQ2PSZ128rr VR128X:$src)>;
+  }
+}
+
+let Predicates = [HasDQI, NoVLX] in {
+def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
+           (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
+           (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_ymm)>;
+
+def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
+           (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_xmm)>;
+
+def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
+          (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
+           (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                  VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+                           X86MemOperand x86memop, PatFrag ld_frag> {
+  defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+                    "vcvtph2ps", "$src", "$src",
+                   (X86cvtph2ps (_src.VT _src.RC:$src),
+                                                (i32 FROUND_CURRENT))>, T8PD;
+  defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
+                    "vcvtph2ps", "$src", "$src",
+                    (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
+                                     (i32 FROUND_CURRENT))>, T8PD;
+}
+
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+  defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
+                    "vcvtph2ps", "{sae}, $src", "$src, {sae}",
+                   (X86cvtph2ps (_src.VT _src.RC:$src),
+                                                (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+
+}
+
+let Predicates = [HasAVX512] in {
+  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
+                    avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+                    EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+  let Predicates = [HasVLX] in {
+    defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+                         loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+    defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+                         loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+  }
+}
+
+multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+                           X86MemOperand x86memop> {
+  defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+                   (ins _src.RC:$src1, i32u8imm:$src2),
+                   "vcvtps2ph", "$src2, $src1", "$src1, $src2",
+                   (X86cvtps2ph (_src.VT _src.RC:$src1),
+                                (i32 imm:$src2)),
+                   NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
+  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+             (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
+                                     (i32 imm:$src2))),
+                                     addr:$dst)]>;
+  let hasSideEffects = 0, mayStore = 1 in
+  def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+             (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+              []>, EVEX_K;
+}
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+  let hasSideEffects = 0 in
+  defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
+                   (outs _dest.RC:$dst),
+                   (ins _src.RC:$src1, i32u8imm:$src2),
+                   "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
+                   []>, EVEX_B, AVX512AIi8Base;
+}
+let Predicates = [HasAVX512] in {
+  defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
+                    avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
+                      EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+  let Predicates = [HasVLX] in {
+    defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
+                        EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+    defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+                        EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+  }
+}
+
+// Patterns for matching conversions from float to half-float and vice versa.
+let Predicates = [HasVLX] in {
+  // Use MXCSR.RC for rounding instead of explicitly specifying the default
+  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+  // configurations we support (the default). However, falling back to MXCSR is
+  // more consistent with other instructions, which are always controlled by it.
+  // It's encoded as 0b100.
+  def : Pat<(fp_to_f16 FR32X:$src),
+            (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
+              (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
+
+  def : Pat<(f16_to_fp GR16:$src),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
+
+  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
+              (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
+}
+
+// Patterns for matching float to half-float conversion when AVX512 is supported
+// but F16C isn't. In that case we have to use 512-bit vectors.
+let Predicates = [HasAVX512, NoVLX, NoF16C] in {
+  def : Pat<(fp_to_f16 FR32X:$src),
+            (i16 (EXTRACT_SUBREG
+                  (VMOVPDI2DIZrr
+                   (v8i16 (EXTRACT_SUBREG
+                    (VCVTPS2PHZrr
+                     (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                      (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+                      sub_xmm), 4), sub_xmm))), sub_16bit))>;
+
+  def : Pat<(f16_to_fp GR16:$src),
+            (f32 (COPY_TO_REGCLASS
+                  (v4f32 (EXTRACT_SUBREG
+                   (VCVTPH2PSZrr
+                    (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
+                     (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
+                     sub_xmm)), sub_xmm)), FR32X))>;
+
+  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
+            (f32 (COPY_TO_REGCLASS
+                  (v4f32 (EXTRACT_SUBREG
+                          (VCVTPH2PSZrr
+                           (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+                            (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
+                            sub_xmm), 4)), sub_xmm)), FR32X))>;
+}
+
+//  Unordered/Ordered scalar fp compare with Sea and set EFLAGS
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
+                            string OpcodeStr> {
+  def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+                 !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+                 [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+                 Sched<[WriteFAdd]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
+                                   AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
+                                   AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+  defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
+                                   AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
+                                   AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+                                 "ucomiss">, PS, EVEX, VEX_LIG,
+                                 EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD, EVEX,
+                                  VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  let Pattern = []<dag> in {
+    defm VCOMISSZ  : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
+                                   "comiss">, PS, EVEX, VEX_LIG,
+                                   EVEX_CD8<32, CD8VT1>;
+    defm VCOMISDZ  : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
+                                   "comisd">, PD, EVEX,
+                                    VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+  let isCodeGenOnly = 1 in {
+    defm Int_VUCOMISSZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+                              load, "ucomiss">, PS, EVEX, VEX_LIG,
+                              EVEX_CD8<32, CD8VT1>;
+    defm Int_VUCOMISDZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+                              load, "ucomisd">, PD, EVEX,
+                              VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+    defm Int_VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+                              load, "comiss">, PS, EVEX, VEX_LIG,
+                              EVEX_CD8<32, CD8VT1>;
+    defm Int_VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+                              load, "comisd">, PD, EVEX,
+                              VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+
+/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> {
+  let AddedComplexity = 20 , Predicates = [HasAVX512] in {
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                           "$src2, $src1", "$src1, $src2",
+                           (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
+  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNode (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
+}
+}
+
+defm VRCP14SS   : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
+                  EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRCP14SD   : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+defm VRSQRT14SS   : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
+                  EVEX_CD8<32, CD8VT1>, T8PD;
+defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
+                  VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+
+/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
+multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         X86VectorVTInfo _> {
+  defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
+  defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.FloatVT
+                           (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+  defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                          (ins _.ScalarMemOp:$src), OpcodeStr,
+                          "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                          (OpNode (_.FloatVT
+                            (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+                          EVEX, T8PD, EVEX_B;
+}
+
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
+                          EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
+                          EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+                                OpNode, v4f32x_info>,
+                               EVEX_V128, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+                                OpNode, v8f32x_info>,
+                               EVEX_V256, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+                                OpNode, v2f64x_info>,
+                               EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+                                OpNode, v4f64x_info>,
+                               EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+  }
+}
+
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+
+/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
+multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                         SDNode OpNode> {
+
+  defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                           "$src2, $src1", "$src1, $src2",
+                           (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                           (i32 FROUND_CURRENT))>;
+
+  defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                            "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+                            (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (i32 FROUND_NO_EXC))>, EVEX_B;
+
+  defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNode (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                         (i32 FROUND_CURRENT))>;
+}
+
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
+              EVEX_CD8<32, CD8VT1>;
+  defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
+              EVEX_CD8<64, CD8VT1>, VEX_W;
+}
+
+let Predicates = [HasERI] in {
+  defm VRCP28   : avx512_eri_s<0xCB, "vrcp28",   X86rcp28s>,   T8PD, EVEX_4V;
+  defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
+}
+
+defm VGETEXP   : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
+
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         SDNode OpNode> {
+
+  defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
+
+  defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.FloatVT
+                             (bitconvert (_.LdFrag addr:$src))),
+                          (i32 FROUND_CURRENT))>;
+
+  defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.ScalarMemOp:$src), OpcodeStr,
+                         "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                         (OpNode (_.FloatVT
+                                  (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+                                 (i32 FROUND_CURRENT))>, EVEX_B;
+}
+multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         SDNode OpNode> {
+  defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src), OpcodeStr,
+                        "{sae}, $src", "$src, {sae}",
+                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass  avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+   defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+             avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+             T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+   defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+             avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+             T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode> {
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
+                                     EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
+                                     EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
+                                     EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
+                                     EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+  }
+}
+let Predicates = [HasERI] in {
+
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
+ defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX;
+ defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX;
+}
+defm VGETEXP   : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
+                 avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
+
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+                              SDNode OpNodeRnd, X86VectorVTInfo _>{
+  defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
+                         (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
+                         EVEX, EVEX_B, EVEX_RC;
+}
+
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+                              SDNode OpNode, X86VectorVTInfo _>{
+  defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (_.FloatVT (OpNode _.RC:$src))>, EVEX;
+  defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.FloatVT
+                           (bitconvert (_.LdFrag addr:$src))))>, EVEX;
+
+  defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                          (ins _.ScalarMemOp:$src), OpcodeStr,
+                          "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                          (OpNode (_.FloatVT
+                            (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+                          EVEX, EVEX_B;
+}
+
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode> {
+  defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                                v16f32_info>,
+                                EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                                v8f64_info>,
+                                EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+                                     OpNode, v4f32x_info>,
+                                     EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+                                     OpNode, v8f32x_info>,
+                                     EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+                                     OpNode, v2f64x_info>,
+                                     EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+                                     OpNode, v4f64x_info>,
+                                     EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+  }
+}
+
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+                                          SDNode OpNodeRnd> {
+  defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
+                                v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
+                                v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
+                              string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+
+  defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (OpNodeRnd (_.VT _.RC:$src1),
+                                    (_.VT _.RC:$src2),
+                                    (i32 FROUND_CURRENT))>;
+  defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                       (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                       (OpNodeRnd (_.VT _.RC:$src1),
+                                  (_.VT (scalar_to_vector
+                                            (_.ScalarLdFrag addr:$src2))),
+                                  (i32 FROUND_CURRENT))>;
+
+  defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
+                         "$rc, $src2, $src1", "$src1, $src2, $rc",
+                         (OpNodeRnd (_.VT _.RC:$src1),
+                                     (_.VT _.RC:$src2),
+                                     (i32 imm:$rc))>,
+                         EVEX_B, EVEX_RC;
+
+  let isCodeGenOnly = 1, hasSideEffects = 0 in {
+    def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+               (ins _.FRC:$src1, _.FRC:$src2),
+               OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+    let mayLoad = 1 in
+      def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+                 (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+                 OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+  }
+
+  def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+            (!cast<Instruction>(NAME#SUFF#Zr)
+                (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+  def : Pat<(_.EltVT (OpNode (load addr:$src))),
+            (!cast<Instruction>(NAME#SUFF#Zm)
+                (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
+  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+                        X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+                        X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
+defm VSQRT   : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
+               avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
+
+defm VSQRT   : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(f32 (X86frsqrt FR32X:$src)),
+            (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+            Requires<[OptForSize]>;
+  def : Pat<(f32 (X86frcp FR32X:$src)),
+            (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+            Requires<[OptForSize]>;
+}
+
+multiclass
+avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+  let ExeDomain = _.ExeDomain in {
+  defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+                           "$src3, $src2, $src1", "$src1, $src2, $src3",
+                           (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+
+  defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
+                         "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
+                         (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                         (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
+
+  defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                         OpcodeStr,
+                         "$src3, $src2, $src1", "$src1, $src2, $src3",
+                         (_.VT (X86RndScales (_.VT _.RC:$src1),
+                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                          (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+  }
+  let Predicates = [HasAVX512] in {
+  def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+  def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+  def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+  def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
+  def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
+             (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
+
+  def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x1))), _.FRC)>;
+  def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x2))), _.FRC)>;
+  def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x3))), _.FRC)>;
+  def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0x4))), _.FRC)>;
+  def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
+             (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
+             addr:$src, (i32 0xc))), _.FRC)>;
+  }
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
+                                AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
+                                AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
+                              X86MemOperand x86memop> {
+  let ExeDomain = DestInfo.ExeDomain in
+  defm rr  : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
+                      (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
+                      (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+                       EVEX, T8XS;
+
+  // for intrinsic patter match
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                           (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                           undef)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+                                      SrcInfo.RC:$src1)>;
+
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                           (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                           DestInfo.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+                                      SrcInfo.RC:$src1)>;
+
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                           (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                           DestInfo.RC:$src0)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
+                                      DestInfo.KRCWM:$mask ,
+                                      SrcInfo.RC:$src1)>;
+
+  let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
+      ExeDomain = DestInfo.ExeDomain in {
+    def mr : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, SrcInfo.RC:$src),
+               OpcodeStr # "\t{$src, $dst|$dst, $src}",
+               []>, EVEX;
+
+    def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+               OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+               []>, EVEX, EVEX_K;
+  }//mayStore = 1, mayLoad = 1, hasSideEffects = 0
+}
+
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+                                    X86VectorVTInfo DestInfo,
+                                    PatFrag truncFrag, PatFrag mtruncFrag > {
+
+  def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+                                    addr:$dst, SrcInfo.RC:$src)>;
+
+  def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
+                                               (SrcInfo.VT SrcInfo.RC:$src)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+                            addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
+         AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+         X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
+         X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
+         X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
+                                                     Predicate prd = HasAVX512>{
+
+  let Predicates = [HasVLX, prd] in {
+    defm Z128:  avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
+                             DestInfoZ128, x86memopZ128>,
+                avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
+                             truncFrag, mtruncFrag>, EVEX_V128;
+
+    defm Z256:  avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
+                             DestInfoZ256, x86memopZ256>,
+                avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
+                             truncFrag, mtruncFrag>, EVEX_V256;
+  }
+  let Predicates = [prd] in
+    defm Z:     avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
+                             DestInfoZ, x86memopZ>,
+                avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
+                             truncFrag, mtruncFrag>, EVEX_V512;
+}
+
+multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+               v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
+               StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
+}
+
+multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+               v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
+               StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
+}
+
+multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+               v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
+               StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
+}
+
+multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+               v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
+               StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
+}
+
+multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+              v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
+              StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
+}
+
+multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           PatFrag StoreNode, PatFrag MaskedStoreNode> {
+  defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+              v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
+              StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
+}
+
+defm VPMOVQB    : avx512_trunc_qb<0x32, "vpmovqb",   X86vtrunc,
+                                  truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSQB   : avx512_trunc_qb<0x22, "vpmovsqb",  X86vtruncs,
+                                  truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSQB  : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+                                  truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVQW    : avx512_trunc_qw<0x34, "vpmovqw",   X86vtrunc,
+                                  truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSQW   : avx512_trunc_qw<0x24, "vpmovsqw",  X86vtruncs,
+                                  truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSQW  : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+                                  truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVQD    : avx512_trunc_qd<0x35, "vpmovqd",   X86vtrunc,
+                                  truncstorevi32, masked_truncstorevi32>;
+defm VPMOVSQD   : avx512_trunc_qd<0x25, "vpmovsqd",  X86vtruncs,
+                                  truncstore_s_vi32, masked_truncstore_s_vi32>;
+defm VPMOVUSQD  : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+                                  truncstore_us_vi32, masked_truncstore_us_vi32>;
+
+defm VPMOVDB    : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
+                                  truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSDB   : avx512_trunc_db<0x21, "vpmovsdb",   X86vtruncs,
+                                  truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSDB  : avx512_trunc_db<0x11, "vpmovusdb",  X86vtruncus,
+                                  truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+defm VPMOVDW    : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
+                                  truncstorevi16, masked_truncstorevi16>;
+defm VPMOVSDW   : avx512_trunc_dw<0x23, "vpmovsdw",   X86vtruncs,
+                                  truncstore_s_vi16, masked_truncstore_s_vi16>;
+defm VPMOVUSDW  : avx512_trunc_dw<0x13, "vpmovusdw",  X86vtruncus,
+                                  truncstore_us_vi16, masked_truncstore_us_vi16>;
+
+defm VPMOVWB    : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
+                                  truncstorevi8, masked_truncstorevi8>;
+defm VPMOVSWB   : avx512_trunc_wb<0x20, "vpmovswb",   X86vtruncs,
+                                  truncstore_s_vi8, masked_truncstore_s_vi8>;
+defm VPMOVUSWB  : avx512_trunc_wb<0x10, "vpmovuswb",  X86vtruncus,
+                                  truncstore_us_vi8, masked_truncstore_us_vi8>;
+
+let Predicates = [HasAVX512, NoVLX] in {
+def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
+         (v8i16 (EXTRACT_SUBREG
+                 (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
+                                          VR256X:$src, sub_ymm)))), sub_xmm))>;
+def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
+         (v4i32 (EXTRACT_SUBREG
+                 (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
+                                           VR256X:$src, sub_ymm)))), sub_xmm))>;
+}
+
+let Predicates = [HasBWI, NoVLX] in {
+def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
+         (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
+                                            VR256X:$src, sub_ymm))), sub_xmm))>;
+}
+
+multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
+              X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
+              X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
+  let ExeDomain = DestInfo.ExeDomain in {
+  defm rr   : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+                    (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
+                    (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
+                  EVEX;
+
+  defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+                  (ins x86memop:$src), OpcodeStr ,"$src", "$src",
+                  (DestInfo.VT (LdFrag addr:$src))>,
+                EVEX;
+  }
+}
+
+multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
+          SDPatternOperator OpNode,
+          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+  let Predicates = [HasVLX, HasBWI] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v8i16x_info,
+                    v16i8x_info, i64mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v16i16x_info,
+                    v16i8x_info, i128mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasBWI] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v32i16_info,
+                    v32i8x_info, i256mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512;
+  }
+}
+
+multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
+          SDPatternOperator OpNode,
+          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+                   v16i8x_info, i32mem, LdFrag, OpNode>,
+                         EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+                   v16i8x_info, i64mem, LdFrag, OpNode>,
+                         EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasAVX512] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v16i32_info,
+                   v16i8x_info, i128mem, LdFrag, OpNode>,
+                         EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512;
+  }
+}
+
+multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
+          SDPatternOperator OpNode,
+          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+                   v16i8x_info, i16mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+                   v16i8x_info, i32mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasAVX512] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v8i64_info,
+                   v16i8x_info, i64mem, LdFrag, OpNode>,
+                     EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512;
+  }
+}
+
+multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
+         SDPatternOperator OpNode,
+         string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+                   v8i16x_info, i64mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+                   v8i16x_info, i128mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasAVX512] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v16i32_info,
+                   v16i16x_info, i256mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512;
+  }
+}
+
+multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
+         SDPatternOperator OpNode,
+         string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+                   v8i16x_info, i32mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+                   v8i16x_info, i64mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasAVX512] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v8i64_info,
+                   v8i16x_info, i128mem, LdFrag, OpNode>,
+                     EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512;
+  }
+}
+
+multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
+         SDPatternOperator OpNode,
+         string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+
+  let Predicates = [HasVLX, HasAVX512] in {
+    defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+                   v4i32x_info, i64mem, LdFrag, OpNode>,
+                     EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
+
+    defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+                   v4i32x_info, i128mem, LdFrag, OpNode>,
+                     EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
+  }
+  let Predicates = [HasAVX512] in {
+    defm Z   :  avx512_extend_common<opc, OpcodeStr, v8i64_info,
+                   v8i32x_info, i256mem, LdFrag, OpNode>,
+                     EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
+  }
+}
+
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+
+// EXTLOAD patterns, implemented using vpmovz
+multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
+                               X86VectorVTInfo From, PatFrag LdFrag> {
+  def : Pat<(To.VT (LdFrag addr:$src)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
+  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
+             To.KRC:$mask, addr:$src)>;
+  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
+                    To.ImmAllZerosV)),
+            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
+             addr:$src)>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  defm : avx512_ext_lowering<"BWZ128", v8i16x_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info,  extloadvi8>;
+}
+let Predicates = [HasBWI] in {
+  defm : avx512_ext_lowering<"BWZ",    v32i16_info,  v32i8x_info,  extloadvi8>;
+}
+let Predicates = [HasVLX, HasAVX512] in {
+  defm : avx512_ext_lowering<"BDZ128", v4i32x_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"BDZ256", v8i32x_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ128", v2i64x_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ256", v4i64x_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"WDZ128", v4i32x_info,  v8i16x_info,  extloadvi16>;
+  defm : avx512_ext_lowering<"WDZ256", v8i32x_info,  v8i16x_info,  extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ128", v2i64x_info,  v8i16x_info,  extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ256", v4i64x_info,  v8i16x_info,  extloadvi16>;
+  defm : avx512_ext_lowering<"DQZ128", v2i64x_info,  v4i32x_info,  extloadvi32>;
+  defm : avx512_ext_lowering<"DQZ256", v4i64x_info,  v4i32x_info,  extloadvi32>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_ext_lowering<"BDZ",    v16i32_info,  v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"BQZ",    v8i64_info,   v16i8x_info,  extloadvi8>;
+  defm : avx512_ext_lowering<"WDZ",    v16i32_info,  v16i16x_info, extloadvi16>;
+  defm : avx512_ext_lowering<"WQZ",    v8i64_info,   v8i16x_info,  extloadvi16>;
+  defm : avx512_ext_lowering<"DQZ",    v8i64_info,   v8i32x_info,  extloadvi32>;
+}
+
+multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
+                                 SDNode ExtOp, PatFrag ExtLoad16> {
+  // 128-bit patterns
+  let Predicates = [HasVLX, HasBWI] in {
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
+  }
+  let Predicates = [HasVLX] in {
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
+  }
+  // 256-bit patterns
+  let Predicates = [HasVLX, HasBWI] in {
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
+  }
+  let Predicates = [HasVLX] in {
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
+  }
+  // 512-bit patterns
+  let Predicates = [HasBWI] in {
+  def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX512] in {
+  def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+  def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+
+  def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
+  }
+}
+
+defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         X86MemOperand memop, PatFrag GatherNode> {
+  let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
+      ExeDomain = _.ExeDomain in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
+            (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
+            !strconcat(OpcodeStr#_.Suffix,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            [(set _.RC:$dst, _.KRCWM:$mask_wb,
+              (GatherNode  (_.VT _.RC:$src1), _.KRCWM:$mask,
+                     vectoraddr:$src2))]>, EVEX, EVEX_K,
+             EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
+                        AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
+                                      vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
+  defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
+                                      vz512mem,  mgatherv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+                              vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
+  defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
+                              vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
+  defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+                              vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
+  defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                              vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
+                       AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
+                                       mgatherv16i32>, EVEX_V512;
+  defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem,
+                                       mgatherv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
+                                          vy256xmem, mgatherv8i32>, EVEX_V256;
+  defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                                          vy128xmem, mgatherv4i64>, EVEX_V256;
+  defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
+                                          vx128xmem, mgatherv4i32>, EVEX_V128;
+  defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
+                                          vx64xmem, mgatherv2i64>, EVEX_V128;
+}
+}
+
+
+defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
+               avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
+
+defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
+                avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                          X86MemOperand memop, PatFrag ScatterNode> {
+
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
+
+  def mr  : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
+            (ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
+            !strconcat(OpcodeStr#_.Suffix,
+            "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+            [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
+                                     _.KRCWM:$mask,  vectoraddr:$dst))]>,
+            EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
+                        AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
+                                      vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
+  defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
+                                      vz512mem,  mscatterv8i64>, EVEX_V512, VEX_W;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+                              vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
+  defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
+                              vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
+  defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+                              vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
+  defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+                              vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+}
+}
+
+multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
+                       AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
+  defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
+                                       mscatterv16i32>, EVEX_V512;
+  defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem,
+                                       mscatterv8i64>, EVEX_V512;
+let Predicates = [HasVLX] in {
+  defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
+                                          vy256xmem, mscatterv8i32>, EVEX_V256;
+  defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+                                          vy128xmem, mscatterv4i64>, EVEX_V256;
+  defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
+                                          vx128xmem, mscatterv4i32>, EVEX_V128;
+  defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
+                                          vx64xmem, mscatterv2i64>, EVEX_V128;
+}
+}
+
+defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">,
+               avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">;
+
+defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">,
+                avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">;
+
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+                       RegisterClass KRC, X86MemOperand memop> {
+  let Predicates = [HasPFI], hasSideEffects = 1 in
+  def m  : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+            !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+                     VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+                     VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+                     VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+                     VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+                     VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+                     VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+                     VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+                     VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+                     VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+                     VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+                     VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+                     VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+                     VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+                     VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+                     VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+                     VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v64i1sextv64i8 : PatLeaf<(v64i8
+                              (X86vsext
+                               (v64i1 (X86pcmpgtm
+                                (bc_v64i8 (v16i32 immAllZerosV)),
+                                VR512:$src))))>;
+def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>;
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+def v8i1sextv8i64   : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+
+multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
+def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+                  !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+                  [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
+}
+
+multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
+                                 string OpcodeStr, Predicate prd> {
+let Predicates = [prd] in
+  defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+    defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+  }
+}
+
+multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
+  defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info,  OpcodeStr,
+                                       HasBWI>;
+  defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
+                                       HasBWI>, VEX_W;
+  defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
+                                       HasDQI>;
+  defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
+                                       HasDQI>, VEX_W;
+}
+
+defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+
+multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
+    def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
+                        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                        [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+}
+
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
+                                                            X86VectorVTInfo _> {
+
+  def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
+            (_.KVT (COPY_TO_REGCLASS
+                     (!cast<Instruction>(NAME#"Zrr")
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                      _.RC:$src, _.SubRegIdx)),
+                   _.KRC))>;
+}
+
+multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
+                                   AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
+                                            EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
+                                              EVEX_V256;
+    defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
+                                               EVEX_V128;
+  }
+  let Predicates = [prd, NoVLX] in {
+    defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
+    defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
+  }
+}
+
+defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m",
+                                              avx512vl_i8_info, HasBWI>;
+defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m",
+                                              avx512vl_i16_info, HasBWI>, VEX_W;
+defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
+                                              avx512vl_i32_info, HasDQI>;
+defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
+                                              avx512vl_i64_info, HasDQI>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - COMPRESS and EXPAND
+//
+
+multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
+                                 string OpcodeStr> {
+  defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
+              (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+              (_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
+
+  let mayStore = 1, hasSideEffects = 0 in
+  def mr : AVX5128I<opc, MRMDestMem, (outs),
+              (ins _.MemOp:$dst, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst|$dst, $src}",
+              []>, EVEX_CD8<_.EltSize, CD8VT1>;
+
+  def mrk : AVX5128I<opc, MRMDestMem, (outs),
+              (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
+              OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+              []>,
+              EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
+
+  def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
+                                               (_.VT _.RC:$src)),
+            (!cast<Instruction>(NAME#_.ZSuffix##mrk)
+                            addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
+}
+
+multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
+           compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
+                compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+    defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
+                compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+  }
+}
+
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
+                                         EVEX;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
+                                         EVEX, VEX_W;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
+                                         EVEX;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
+                                         EVEX, VEX_W;
+
+// expand
+multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
+                                 string OpcodeStr> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+              (ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
+              (_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
+
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+              (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
+              (_.VT (X86expand (_.VT (bitconvert
+                                      (_.LdFrag addr:$src1)))))>,
+            AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
+
+  def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmkz)
+                                        _.KRCWM:$mask, addr:$src)>;
+
+  def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
+                                               (_.VT _.RC:$src0))),
+            (!cast<Instruction>(NAME#_.ZSuffix##rmk)
+                            _.RC:$src0, _.KRCWM:$mask, addr:$src)>;
+}
+
+multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>,
+           expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>,
+                expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
+    defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>,
+                expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
+  }
+}
+
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
+                                         EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
+                                         EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
+                                         EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
+                                         EVEX, VEX_W;
+
+//handle instruction  reg_vec1 = op(reg_vec,imm)
+//                               op(mem_vec,imm)
+//                               op(broadcast(eltVt),imm)
+//all instruction created with FROUND_CURRENT
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in {
+  defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+                      (OpNode (_.VT _.RC:$src1),
+                              (i32 imm:$src2),
+                              (i32 FROUND_CURRENT))>;
+  defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.MemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+                    (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                            (i32 imm:$src2),
+                            (i32 FROUND_CURRENT))>;
+  defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+                    "${src1}"##_.BroadcastStr##", $src2",
+                    (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+                            (i32 imm:$src2),
+                            (i32 FROUND_CURRENT))>, EVEX_B;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+                                             SDNode OpNode, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in
+  defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
+                      "$src1, {sae}, $src2",
+                      (OpNode (_.VT _.RC:$src1),
+                              (i32 imm:$src2),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
+            AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+    defm Z    : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                                  EVEX_V512;
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+                                  EVEX_V128;
+    defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+                                  EVEX_V256;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                               op(reg_vec2,mem_vec,imm)
+//                               op(reg_vec2,broadcast(eltVt),imm)
+//all instruction created with FROUND_CURRENT
+multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in {
+  defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i32 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+  defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
+                    OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                            (i32 imm:$src3),
+                            (i32 FROUND_CURRENT))>;
+  defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                    OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+                    "$src1, ${src2}"##_.BroadcastStr##", $src3",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                            (i32 imm:$src3),
+                            (i32 FROUND_CURRENT))>, EVEX_B;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                               op(reg_vec2,mem_vec,imm)
+multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                             X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
+  let ExeDomain = DestInfo.ExeDomain in {
+  defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+                  (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
+                  OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                  (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+                               (SrcInfo.VT SrcInfo.RC:$src2),
+                               (i8 imm:$src3)))>;
+  defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+                (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
+                OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
+                             (SrcInfo.VT (bitconvert
+                                                (SrcInfo.LdFrag addr:$src2))),
+                             (i8 imm:$src3)))>;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                               op(reg_vec2,mem_vec,imm)
+//                               op(reg_vec2,broadcast(eltVt),imm)
+multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           X86VectorVTInfo _>:
+  avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+
+  let ExeDomain = _.ExeDomain in
+  defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+                    OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+                    "$src1, ${src2}"##_.BroadcastStr##", $src3",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                            (i8 imm:$src3))>, EVEX_B;
+}
+
+//handle scalar instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                                      op(reg_vec2,mem_scalar,imm)
+//all instruction created with FROUND_CURRENT
+multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i32 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+  defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+                    OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT (scalar_to_vector
+                                      (_.ScalarLdFrag addr:$src2))),
+                            (i32 imm:$src3),
+                            (i32 FROUND_CURRENT))>;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+                                             SDNode OpNode, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in
+  defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+                      OpcodeStr, "$src3, {sae}, $src2, $src1",
+                      "$src1, $src2, {sae}, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i32 imm:$src3),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+//handle scalar instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
+                                             SDNode OpNode, X86VectorVTInfo _> {
+  defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
+                      OpcodeStr, "$src3, {sae}, $src2, $src1",
+                      "$src1, $src2, {sae}, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i32 imm:$src3),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
+            AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+    defm Z    : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                                  EVEX_V512;
+
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+                                  EVEX_V128;
+    defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+                                  EVEX_V256;
+  }
+}
+
+multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
+                   AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
+  let Predicates = [HasBWI] in {
+    defm Z    : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
+                           SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+  }
+  let Predicates = [HasBWI, HasVLX] in {
+    defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
+                           SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+    defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode,  DestInfo.info256,
+                           SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+  }
+}
+
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                bits<8> opc, SDNode OpNode>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+    defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+
+multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
+                  X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+     defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
+                 avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
+  }
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
+                    bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
+  defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
+                            opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
+  defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
+                            opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+
+defm VREDUCE   : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
+                              X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
+defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
+                              X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
+defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
+                              X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
+
+
+defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
+                                                       0x50, X86VRange, HasDQI>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
+                                                       0x50, X86VRange, HasDQI>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info,
+                                                 0x51, X86VRange, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
+                                                 0x51, X86VRange, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
+                                                 0x57, X86Reduces, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
+                                                 0x57, X86Reduces, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
+                                                 0x27, X86GetMants, HasAVX512>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
+                                                 0x27, X86GetMants, HasAVX512>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                       bits<8> opc, SDNode OpNode = X86Shuf128>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+     defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (ffloor VR512:$src)),
+          (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+          (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+          (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+          (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+          (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+          (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+          (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+          (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+          (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+          (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
+  defm NAME:       avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
+                                                  EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
+                                                  EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{
+  let Predicates = p in
+    def NAME#_.VTName#rri:
+          Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
+              (!cast<Instruction>(NAME#_.ZSuffix#rri)
+                    _.RC:$src1, _.RC:$src2, imm:$imm)>;
+}
+
+multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>:
+      avx512_vpalignr_lowering<_.info512, [HasBWI]>,
+      avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>,
+      avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>;
+
+defm VPALIGNR:   avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
+                                          avx512vl_i8_info, avx512vl_i8_info>,
+                avx512_vpalignr_lowering_common<avx512vl_i16_info>,
+                avx512_vpalignr_lowering_common<avx512vl_i32_info>,
+                avx512_vpalignr_lowering_common<avx512vl_f32_info>,
+                avx512_vpalignr_lowering_common<avx512vl_i64_info>,
+                avx512_vpalignr_lowering_common<avx512vl_f64_info>,
+                EVEX_CD8<8, CD8VF>;
+
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
+                    avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+
+multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           X86VectorVTInfo _> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1), OpcodeStr,
+                    "$src1", "$src1",
+                    (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
+
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.MemOp:$src1), OpcodeStr,
+                  "$src1", "$src1",
+                  (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+            EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _> :
+           avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
+  defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.ScalarMemOp:$src1), OpcodeStr,
+                  "${src1}"##_.BroadcastStr,
+                  "${src1}"##_.BroadcastStr,
+                  (_.VT (OpNode (X86VBroadcast
+                                    (_.ScalarLdFrag addr:$src1))))>,
+             EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+}
+
+multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                              EVEX_V256;
+    defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                              EVEX_V128;
+  }
+}
+
+multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+                              EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                                 EVEX_V256;
+    defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                                 EVEX_V128;
+  }
+}
+
+multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+                                 SDNode OpNode, Predicate prd> {
+  defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
+                               prd>, VEX_W;
+  defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
+                               prd>;
+}
+
+multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+                                 SDNode OpNode, Predicate prd> {
+  defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
+  defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
+}
+
+multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+                                  bits<8> opc_d, bits<8> opc_q,
+                                  string OpcodeStr, SDNode OpNode> {
+  defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+                                    HasAVX512>,
+              avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+                                    HasBWI>;
+}
+
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
+
+def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+                                                      VR128X:$src))>;
+def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
+def avx512_v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
+def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+                                                      VR256X:$src))>;
+def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
+def avx512_v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
+
+let Predicates = [HasBWI, HasVLX] in {
+  def : Pat<(xor
+            (bc_v2i64 (avx512_v16i1sextv16i8)),
+            (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
+            (VPABSBZ128rr VR128X:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (avx512_v8i1sextv8i16)),
+            (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
+            (VPABSWZ128rr VR128X:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (avx512_v32i1sextv32i8)),
+            (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
+            (VPABSBZ256rr VR256X:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (avx512_v16i1sextv16i16)),
+            (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
+            (VPABSWZ256rr VR256X:$src)>;
+}
+let Predicates = [HasAVX512, HasVLX] in {
+  def : Pat<(xor
+            (bc_v2i64 (avx512_v4i1sextv4i32)),
+            (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
+            (VPABSDZ128rr VR128X:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (avx512_v8i1sextv8i32)),
+            (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
+            (VPABSDZ256rr VR256X:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
+def : Pat<(xor
+          (bc_v8i64 (v16i1sextv16i32)),
+          (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
+          (VPABSDZrr VR512:$src)>;
+def : Pat<(xor
+          (bc_v8i64 (v8i1sextv8i64)),
+          (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
+          (VPABSQZrr VR512:$src)>;
+}
+let Predicates = [HasBWI] in {
+def : Pat<(xor
+          (bc_v8i64 (v64i1sextv64i8)),
+          (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
+          (VPABSBZrr VR512:$src)>;
+def : Pat<(xor
+          (bc_v8i64 (v32i1sextv32i16)),
+          (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
+          (VPABSWZrr VR512:$src)>;
+}
+
+multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
+
+  defm NAME :          avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
+}
+
+defm VPLZCNT    : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+  defm NAME:       avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
+                                      HasAVX512>, XS;
+}
+
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                            X86VectorVTInfo _> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                   (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+  defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                 (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
+                 (_.VT (OpNode (_.VT (scalar_to_vector
+                                       (_.ScalarLdFrag addr:$src)))))>,
+                 EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                  AVX512VLVectorVTInfo VTInfo> {
+
+  defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                               EVEX_V256;
+    defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                               EVEX_V128;
+  }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+  defm NAME:      avx512_movddup_common<opc, OpcodeStr, OpNode,
+                                        avx512vl_f64_info>, XD, VEX_W;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+let Predicates = [HasVLX] in {
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+          (VMOVDDUPZ128rm addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+          (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
+                                 SSE_ALU_ITINS_S>;
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
+                                 SSE_ALU_ITINS_S>;
+
+defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
+                                       SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
+                                       SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
+                                       SSE_INTALU_ITINS_P, HasBWI>;
+defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
+                                       SSE_INTALU_ITINS_P, HasBWI>;
+
+defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
+                                       SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
+                                       SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
+                                       SSE_INTALU_ITINS_P, HasAVX512>;
+defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
+                                       SSE_INTALU_ITINS_P, HasAVX512>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Extract & Insert Integer Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                            X86VectorVTInfo _> {
+  def mr : AVX512Ii8<opc, MRMDestMem, (outs),
+              (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+              OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
+                                                          imm:$src2)))),
+                      addr:$dst)]>,
+              EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
+  let Predicates = [HasBWI] in {
+    def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
+                  (ins _.RC:$src1, u8imm:$src2),
+                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR32orGR64:$dst,
+                        (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
+                  EVEX, TAPD;
+
+    defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
+  }
+}
+
+multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
+  let Predicates = [HasBWI] in {
+    def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
+                  (ins _.RC:$src1, u8imm:$src2),
+                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR32orGR64:$dst,
+                        (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
+                  EVEX, PD;
+
+    let hasSideEffects = 0 in
+    def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
+                   (ins _.RC:$src1, u8imm:$src2),
+                   OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+                   EVEX, TAPD;
+
+    defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
+  }
+}
+
+multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
+                                                            RegisterClass GRC> {
+  let Predicates = [HasDQI] in {
+    def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
+                  (ins _.RC:$src1, u8imm:$src2),
+                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GRC:$dst,
+                      (extractelt (_.VT _.RC:$src1), imm:$src2))]>,
+                  EVEX, TAPD;
+
+    def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
+                (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
+                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                [(store (extractelt (_.VT _.RC:$src1),
+                                    imm:$src2),addr:$dst)]>,
+                EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+  }
+}
+
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
+defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
+defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
+
+multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                            X86VectorVTInfo _, PatFrag LdFrag> {
+  def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
+      (ins _.RC:$src1,  _.ScalarMemOp:$src2, u8imm:$src3),
+      OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set _.RC:$dst,
+          (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
+      EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+}
+
+multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                            X86VectorVTInfo _, PatFrag LdFrag> {
+  let Predicates = [HasBWI] in {
+    def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+        (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
+        OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+        [(set _.RC:$dst,
+            (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
+
+    defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
+  }
+}
+
+multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
+                                         X86VectorVTInfo _, RegisterClass GRC> {
+  let Predicates = [HasDQI] in {
+    def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
+        (ins _.RC:$src1, GRC:$src2, u8imm:$src3),
+        OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+        [(set _.RC:$dst,
+            (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
+        EVEX_4V, TAPD;
+
+    defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
+                                    _.ScalarLdFrag>, TAPD;
+  }
+}
+
+defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
+                                     extloadi8>, TAPD;
+defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
+                                     extloadi16>, PD;
+defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
+defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+//===----------------------------------------------------------------------===//
+multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+                                                AVX512VLVectorVTInfo VTInfo_FP>{
+  defm NAME:     avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
+                                   EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+                                   AVX512AIi8Base, EVEX_4V;
+}
+
+defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
+defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - Byte shift Left/Right
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
+                             Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
+  def rr : AVX512<opc, MRMr,
+             (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
+  def rm : AVX512<opc, MRMm,
+           (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set _.RC:$dst,(_.VT (OpNode
+                                 (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                 (i8 imm:$src2))))]>;
+}
+
+multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
+                                 Format MRMm, string OpcodeStr, Predicate prd>{
+  let Predicates = [prd] in
+    defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+                                    OpcodeStr, v64i8_info>, EVEX_V512;
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+                                    OpcodeStr, v32i8x_info>, EVEX_V256;
+    defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+                                    OpcodeStr, v16i8x_info>, EVEX_V128;
+  }
+}
+defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
+                                       HasBWI>, AVX512PDIi8Base, EVEX_4V;
+defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
+                                       HasBWI>, AVX512PDIi8Base, EVEX_4V;
+
+
+multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
+                                string OpcodeStr, X86VectorVTInfo _dst,
+                                X86VectorVTInfo _src>{
+  def rr : AVX512BI<opc, MRMSrcReg,
+             (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _dst.RC:$dst,(_dst.VT
+                                (OpNode (_src.VT _src.RC:$src1),
+                                        (_src.VT _src.RC:$src2))))]>;
+  def rm : AVX512BI<opc, MRMSrcMem,
+           (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set _dst.RC:$dst,(_dst.VT
+                              (OpNode (_src.VT _src.RC:$src1),
+                              (_src.VT (bitconvert
+                                        (_src.LdFrag addr:$src2))))))]>;
+}
+
+multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
+                                    string OpcodeStr, Predicate prd> {
+  let Predicates = [prd] in
+    defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
+                                    v64i8_info>, EVEX_V512;
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+                                    v32i8x_info>, EVEX_V256;
+    defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+                                    v16i8x_info>, EVEX_V128;
+  }
+}
+
+defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
+                                       HasBWI>, EVEX_4V;
+
+multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          X86VectorVTInfo _>{
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
+                      OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (_.VT _.RC:$src3),
+                              (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
+  defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
+                    OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT _.RC:$src2),
+                            (_.VT (bitconvert (_.LdFrag addr:$src3))),
+                            (i8 imm:$src4)), 1, 0>,
+                    AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+  defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
+                    OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+                    "$src2, ${src3}"##_.BroadcastStr##", $src4",
+                    (OpNode (_.VT _.RC:$src1),
+                            (_.VT _.RC:$src2),
+                            (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+                            (i8 imm:$src4)), 1, 0>, EVEX_B,
+                    AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+  }// Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
+  let Predicates = [HasAVX512] in
+    defm Z    : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
+    defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
+  }
+}
+
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - FixupImm
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  X86VectorVTInfo _>{
+  let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+    defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+                         OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                        (OpNode (_.VT _.RC:$src1),
+                                (_.VT _.RC:$src2),
+                                (_.IntVT _.RC:$src3),
+                                (i32 imm:$src4),
+                                (i32 FROUND_CURRENT))>;
+    defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
+                      OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (_.IntVT (bitconvert (_.LdFrag addr:$src3))),
+                              (i32 imm:$src4),
+                              (i32 FROUND_CURRENT))>;
+    defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+                    OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
+                    "$src2, ${src3}"##_.BroadcastStr##", $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+                              (i32 imm:$src4),
+                              (i32 FROUND_CURRENT))>, EVEX_B;
+  } // Constraints = "$src1 = $dst"
+}
+
+multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
+                                      SDNode OpNode, X86VectorVTInfo _>{
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+  defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+                      OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+                      "$src2, $src3, {sae}, $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                                (_.VT _.RC:$src2),
+                                (_.IntVT _.RC:$src3),
+                                (i32 imm:$src4),
+                                (i32 FROUND_NO_EXC))>, EVEX_B;
+  }
+}
+
+multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
+  let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
+      ExeDomain = _.ExeDomain in {
+    defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+                      OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (_src3VT.VT _src3VT.RC:$src3),
+                              (i32 imm:$src4),
+                              (i32 FROUND_CURRENT))>;
+
+    defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
+                      OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+                      "$src2, $src3, {sae}, $src4",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (_src3VT.VT _src3VT.RC:$src3),
+                              (i32 imm:$src4),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+    defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                     (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
+                     OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+                     (OpNode (_.VT _.RC:$src1),
+                             (_.VT _.RC:$src2),
+                             (_src3VT.VT (scalar_to_vector
+                                       (_src3VT.ScalarLdFrag addr:$src3))),
+                             (i32 imm:$src4),
+                             (i32 FROUND_CURRENT))>;
+  }
+}
+
+multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{
+  let Predicates = [HasAVX512] in
+    defm Z    : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+                avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
+                                  AVX512AIi8Base, EVEX_4V, EVEX_V512;
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>,
+                                  AVX512AIi8Base, EVEX_4V, EVEX_V128;
+    defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>,
+                                  AVX512AIi8Base, EVEX_4V, EVEX_V256;
+  }
+}
+
+defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f32x_info, v4i32x_info>,
+                         AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
+                                          f64x_info, v2i64x_info>,
+                         AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
+                         EVEX_CD8<32, CD8VF>;
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
+                         EVEX_CD8<64, CD8VF>, VEX_W;
+
+
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     A[0] += B[0];
+//     return A;
+//   }
+//
+// Previously we generated:
+//   addss %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// We now generate:
+//   addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     __m128 C = A + B;
+//     return (__m128) {c[0], a[1], a[2], a[3]};
+//   }
+//
+// Previously we generated:
+//   addps %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// We now generate:
+//   addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [HasAVX512] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+          FR32X:$src))))),
+      (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
+          FR32X:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
+          (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
+      (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
+          (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
+
+    // extracted masked scalar math op with insert via movss
+    def : Pat<(X86Movss (v4f32 VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
+                                FR32X:$src2),
+                            FR32X:$src0))),
+      (!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
+          VK1WM:$mask, v4f32:$src1,
+          (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
+  }
+}
+
+defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [HasAVX512] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+          FR64X:$src))))),
+      (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
+          FR64X:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
+          (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
+      (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
+          (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // extracted masked scalar math op with insert via movss
+    def : Pat<(X86Movsd (v2f64 VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
+                                FR64X:$src2),
+                            FR64X:$src0))),
+      (!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
+          VK1WM:$mask, v2f64:$src1,
+          (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
+  }
+}
+
+defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
+defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
+defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
+defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 000000000000..bfd21c062aa2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1375 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let hasSideEffects = 0 in
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (outs GR16:$dst), (ins anymem:$src),
+                 "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16;
+let isReMaterializable = 1 in
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (outs GR32:$dst), (ins anymem:$src),
+                 "lea{l}\t{$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+                 OpSize32, Requires<[Not64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (outs GR32:$dst), (ins lea64_32mem:$src),
+                  "lea{l}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+                  OpSize32, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+                  "lea{q}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instruction that loads one value and gets the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+  // Memory operand.
+  ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+  // Register reads (implicit or explicit).
+  ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src)),
+                (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in
+def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in
+def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
+               "mul{l}\t$src",
+               [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+               IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+                "mul{q}\t$src",
+                [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+                IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+               "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src))),
+                (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, hasSideEffects = 0 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+              "mul{l}\t$src",
+              [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+                "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
+
+let hasSideEffects = 0 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", [],
+              IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs),  (ins GR16:$src), "imul{w}\t$src", [],
+              IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs),  (ins GR32:$src), "imul{l}\t$src", [],
+              IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+              IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+                "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16,
+              SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+                "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32,
+              SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+                 "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // hasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+                       TB, OpSize16;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+                 TB, OpSize32;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+                                   (ins GR64:$src1, GR64:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+                 TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, (load addr:$src2)))],
+                       IIC_IMUL16_RM>,
+               TB, OpSize16;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+                 (ins GR32:$src1, i32mem:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, (load addr:$src2)))],
+                       IIC_IMUL32_RM>,
+               TB, OpSize32;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+                                   (ins GR64:$src1, i64mem:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, (load addr:$src2)))],
+                        IIC_IMUL64_RM>,
+               TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag GR16:$src1, imm:$src2))],
+                            IIC_IMUL16_RRI>, OpSize16;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+                           IIC_IMUL16_RRI>, OpSize16;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag GR32:$src1, imm:$src2))],
+                            IIC_IMUL32_RRI>, OpSize32;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+                           IIC_IMUL32_RRI>, OpSize32;
+def IMUL64rri32 : RIi32S<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                         (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                         "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set GR64:$dst, EFLAGS,
+                             (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+                             IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+                            IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL16_RMI>,
+                 OpSize16;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+                                         OpSize16;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                     // GR32 = [mem32]*I32
+                      (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL32_RMI>, OpSize32;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i32immSExt8:$src2))],
+                                         IIC_IMUL32_RMI>, OpSize32;
+def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                         (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+                         "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set GR64:$dst, EFLAGS,
+                              (X86smul_flag (load addr:$src1),
+                                            i64immSExt32:$src2))],
+                                            IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1),
+                                          i64immSExt8:$src2))],
+                                          IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l}\t$src", [], IIC_DIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+                "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_MEM>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize16,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+               "div{l}\t$src", [], IIC_DIV32>,
+             SchedLoadReg<WriteIDivLd>, OpSize32;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+                "div{q}\t$src", [], IIC_DIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+               "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+               "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+               "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+                [(set GR64:$dst, (ineg GR64:$src1)),
+                 (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+               "neg{b}\t$dst",
+               [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+               "neg{w}\t$dst",
+               [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+               "neg{l}\t$dst",
+               [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+                 (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+               "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+               "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+                [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+               "not{b}\t$dst",
+               [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+               "not{w}\t$dst",
+               [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize16;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+               "not{l}\t$dst",
+               [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize32;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "inc{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+               "inc{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+               IIC_UNARY_REG>, OpSize16;
+def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+               "inc{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+               IIC_UNARY_REG>, OpSize32;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+                   "inc{w}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+                   "inc{l}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+  def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+  def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "dec{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA.
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+               "dec{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+               IIC_UNARY_REG>, OpSize16;
+def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+               "dec{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+               IIC_UNARY_REG>, OpSize32;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+// Short forms only valid in 32-bit mode. Selected during MCInst lowering.
+let CodeSize = 1, hasSideEffects = 0 in {
+def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+                   "dec{w}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+                   "dec{l}\t$dst", [], IIC_UNARY_REG>,
+                 OpSize32, Requires<[Not64BitMode]>;
+} // CodeSize = 1, hasSideEffects = 0
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16;
+  def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+  def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types.  For example, it can tell you what the
+/// register class and preferred load to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+                  PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+                  Operand immoperand, SDPatternOperator immoperator,
+                  Operand imm8operand, SDPatternOperator imm8operator,
+                  bit hasOddOpcode, OperandSize opSize,
+                  bit hasREX_WPrefix> {
+  /// VT - This is the value type itself.
+  ValueType VT = vt;
+
+  /// InstrSuffix - This is the suffix used on instructions with this type.  For
+  /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+  string InstrSuffix = instrsuffix;
+
+  /// RegClass - This is the register class associated with this type.  For
+  /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+  RegisterClass RegClass = regclass;
+
+  /// LoadNode - This is the load node associated with this type.  For
+  /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+  PatFrag LoadNode = loadnode;
+
+  /// MemOperand - This is the memory operand associated with this type.  For
+  /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+  X86MemOperand MemOperand = memoperand;
+
+  /// ImmEncoding - This is the encoding of an immediate of this type.  For
+  /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32.  Note that i64 -> Imm32
+  /// since the immediate fields of i64 instructions is a 32-bit sign extended
+  /// value.
+  ImmType ImmEncoding = immkind;
+
+  /// ImmOperand - This is the operand kind of an immediate of this type.  For
+  /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm.  Note that i64 ->
+  /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign
+  /// extended value.
+  Operand ImmOperand = immoperand;
+
+  /// ImmOperator - This is the operator that should be used to match an
+  /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+  SDPatternOperator ImmOperator = immoperator;
+
+  /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+  /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm.  This is
+  /// only used for instructions that have a sign-extended imm8 field form.
+  Operand Imm8Operand = imm8operand;
+
+  /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8).
+  SDPatternOperator Imm8Operator = imm8operator;
+
+  /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+  /// opposed to even) opcode.  Operations on i8 are usually even, operations on
+  /// other datatypes are odd.
+  bit HasOddOpcode = hasOddOpcode;
+
+  /// OpSize - Selects whether the instruction needs a 0x66 prefix based on
+  /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this
+  /// to Opsize16. i32 sets this to OpSize32.
+  OperandSize OpSize = opSize;
+
+  /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the 0x40 REX prefix.  This is set for i64 types.
+  bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8  : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
+                       Imm8, i8imm, imm8_su, i8imm, invalid_node,
+                       0, OpSizeFixed, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+                       Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su,
+                       1, OpSize16, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+                       Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su,
+                       1, OpSize32, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+                       Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
+                       1, OpSizeFixed, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+///    suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+///    or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+          string mnemonic, string args, list<dag> pattern,
+          InstrItinClass itin = IIC_BIN_NONMEM>
+  : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+       opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+      f, outs, ins,
+      !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+      itin> {
+
+  // Infer instruction prefixes from type info.
+  let OpSize = typeinfo.OpSize;
+  let hasREX_WPrefix  = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern, InstrItinClass itin,
+              Format f = MRMDestReg>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]>;
+
+// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
+// just a EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f = MRMDestReg>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+                  IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+                          EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, MRMSrcReg, typeinfo,
+        (outs typeinfo.RegClass:$dst),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let ForceDisassemble = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let ForceDisassemble = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDPatternOperator opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+                    EFLAGS))], IIC_BIN_CARRY_MEM>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+            [(set EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+                        EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+               Format f, dag outlist, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDPatternOperator opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+             [(set EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDPatternOperator opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                   SDPatternOperator opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+                       EFLAGS))], IIC_BIN_CARRY_NONMEM>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, MRMDestMem, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+           (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                    SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+                  addr:$dst),
+           (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+            [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<opcode, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src), addr:$dst),
+             (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                     SDNode opnode, Format f>
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+             (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpMI<opcode, mnemonic, typeinfo, f,
+            [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+                                               typeinfo.ImmOperator:$src))]>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+               Format f, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_MEM>
+  : ITy<0x82, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+                   SDPatternOperator opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src), addr:$dst),
+              (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                      SDPatternOperator opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+              (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+                 SDPatternOperator opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(set EFLAGS, (opnode (load addr:$dst),
+                                   typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Register areg, string operands,
+              InstrItinClass itin = IIC_BIN_NONMEM>
+  : ITy<opcode, RawFrm, typeinfo,
+        (outs), (ins typeinfo.ImmOperand:$src),
+        mnemonic, operands, [], itin>, Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+  let Uses = [areg];
+  let Defs = [areg, EFLAGS];
+  let hasSideEffects = 0;
+}
+
+// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  Register areg, string operands>
+  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+            IIC_BIN_CARRY_NONMEM> {
+  let Uses = [areg, EFLAGS];
+}
+
+// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS.
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                Register areg, string operands>
+  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+  let Defs = [EFLAGS];
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                         string mnemonic, Format RegMRM, Format MemMRM,
+                         SDNode opnodeflag, SDNode opnode,
+                         bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR in {
+        def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+          def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+          def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+        } // isConvertibleToThreeAddress
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+      def NAME#16rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+      def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+      def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+      def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+        def NAME#16ri  : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri  : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      let Constraints = "$src1 = $dst" in
+        def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1, mayStore = 1 in
+        def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>;
+    }
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                          string mnemonic, Format RegMRM, Format MemMRM,
+                          SDNode opnode, bit CommutableRR,
+                           bit ConvertibleToThreeAddress> {
+  let Uses = [EFLAGS], Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR in {
+        def NAME#8rr  : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+          def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+          def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+          def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+        } // isConvertibleToThreeAddress
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+      def NAME#16rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+      def NAME#32rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+      def NAME#64rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+      def NAME#8ri   : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+        def NAME#16ri  : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri  : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      let Constraints = "$src1 = $dst" in
+        def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1, mayStore = 1 in
+        def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>;
+    }
+  } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL,
+                               "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX,
+                               "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX,
+                               "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX,
+                               "{$src, %rax|rax, $src}">;
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...".  It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                        string mnemonic, Format RegMRM, Format MemMRM,
+                        SDNode opnode,
+                        bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let isCommutable = CommutableRR in {
+      def NAME#8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+        def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+        def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+      }
+    } // isCommutable
+
+    def NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+    def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+    def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+    def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+    def NAME#8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+    def NAME#16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+    def NAME#32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+    def NAME#64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+    def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+
+    let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      // NOTE: These are order specific, we want the ri8 forms to be listed
+      // first so that they are slightly preferred to the ri forms.
+      def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+      def NAME#16ri  : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri  : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+    }
+
+    def NAME#8mr    : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
+
+    // These are for the disassembler since 0x82 opcode behaves like 0x80, but
+    // not in 64-bit mode.
+    let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1,
+        hasSideEffects = 0 in {
+      def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>;
+      let mayLoad = 1 in
+        def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>;
+    }
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,
+                             "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,
+                             "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,
+                             "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,
+                             "{$src, %rax|rax, $src}">;
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+                         X86and_flag, and, 1, 0>;
+defm OR  : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+                         X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+                         X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+                         X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+                         X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic.
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                          1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                          0, 0>;
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar like AND, except they don't
+// generate a result.  From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+                         (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
+
+let isCompare = 1 in {
+  let Defs = [EFLAGS] in {
+    let isCommutable = 1 in {
+      def TEST8rr  : BinOpRR_F<0x84, "test", Xi8 , X86testpat>;
+      def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>;
+      def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>;
+      def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>;
+    } // isCommutable
+
+    def TEST8rm    : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+    def TEST16rm   : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+    def TEST32rm   : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+    def TEST64rm   : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+    def TEST8ri    : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+    def TEST16ri   : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+    def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+    def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+    def TEST8mi    : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
+    def TEST16mi   : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
+    def TEST32mi   : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+    def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
+
+    // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+    // register class is constrained to GR8_NOREX. This pseudo is explicitly
+    // marked side-effect free, since it doesn't have an isel pattern like
+    // other test instructions.
+    let isPseudo = 1, hasSideEffects = 0 in
+    def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+                          "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  } // Defs = [EFLAGS]
+
+  def TEST8i8    : BinOpAI_F<0xA8, "test", Xi8 , AL,
+                             "{$src, %al|al, $src}">;
+  def TEST16i16  : BinOpAI_F<0xA8, "test", Xi16, AX,
+                             "{$src, %ax|ax, $src}">;
+  def TEST32i32  : BinOpAI_F<0xA8, "test", Xi32, EAX,
+                             "{$src, %eax|eax, $src}">;
+  def TEST64i32  : BinOpAI_F<0xA8, "test", Xi64, RAX,
+                             "{$src, %rax|rax, $src}">;
+} // isCompare
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+                    PatFrag ld_frag> {
+  def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+            IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS,
+             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+           Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
+  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in {
+  def : Pat<(and (not GR32:$src1), GR32:$src2),
+            (ANDN32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(and (not GR64:$src1), GR64:$src2),
+            (ANDN64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+            (ANDN32rm GR32:$src1, addr:$src2)>;
+  def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+            (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+  let isCommutable = 1 in
+  def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
+
+  let mayLoad = 1 in
+  def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  let Uses = [EDX] in
+    defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+  let Uses = [RDX] in
+    defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS],
+    Constraints = "$src0 = $dst", AddedComplexity = 10 in {
+  let SchedRW = [WriteALU] in {
+  def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+             (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+             [(set GR32:$dst, EFLAGS,
+                 (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
+             IIC_BIN_CARRY_NONMEM>, T8PD;
+  def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+             (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+             [(set GR64:$dst, EFLAGS,
+                 (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
+             IIC_BIN_CARRY_NONMEM>, T8PD;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+             (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+             [(set GR32:$dst, EFLAGS,
+                 (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
+             IIC_BIN_CARRY_MEM>, T8PD;
+
+  def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+             (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+             [(set GR64:$dst, EFLAGS,
+                 (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
+             IIC_BIN_CARRY_MEM>, T8PD;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS],
+    Uses = [EFLAGS] in {
+  let SchedRW = [WriteALU] in {
+  def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+
+  def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+
+  def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS;
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 000000000000..ba970bc2048e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,233 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.  X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include <cassert>
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offsetted accordingly.  The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  int Disp;
+  const GlobalValue *GV;
+  unsigned GVOpFlags;
+
+  X86AddressMode()
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+      GVOpFlags(0) {
+    Base.Reg = 0;
+  }
+
+  void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+    assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+    if (BaseType == X86AddressMode::RegBase)
+      MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, false,
+                                             false, false, false, 0, false));
+    else {
+      assert(BaseType == X86AddressMode::FrameIndexBase);
+      MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+    }
+
+    MO.push_back(MachineOperand::CreateImm(Scale));
+    MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, false, false,
+                                           false, false, 0, false));
+
+    if (GV)
+      MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+    else
+      MO.push_back(MachineOperand::CreateImm(Disp));
+
+    MO.push_back(MachineOperand::CreateReg(0, false, false, false, false, false,
+                                           false, 0, false));
+  }
+};
+
+/// Compute the addressing mode from an machine instruction starting with the
+/// given operand.
+static inline X86AddressMode getAddressFromInstr(const MachineInstr *MI,
+                                                 unsigned Operand) {
+  X86AddressMode AM;
+  const MachineOperand &Op0 = MI->getOperand(Operand);
+  if (Op0.isReg()) {
+    AM.BaseType = X86AddressMode::RegBase;
+    AM.Base.Reg = Op0.getReg();
+  } else {
+    AM.BaseType = X86AddressMode::FrameIndexBase;
+    AM.Base.FrameIndex = Op0.getIndex();
+  }
+
+  const MachineOperand &Op1 = MI->getOperand(Operand + 1);
+  AM.Scale = Op1.getImm();
+
+  const MachineOperand &Op2 = MI->getOperand(Operand + 2);
+  AM.IndexReg = Op2.getReg();
+
+  const MachineOperand &Op3 = MI->getOperand(Operand + 3);
+  if (Op3.isGlobal())
+    AM.GV = Op3.getGlobal();
+  else
+    AM.Disp = Op3.getImm();
+
+  return AM;
+}
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // Because memory references are always represented with five
+  // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+/// Replace the address used in the instruction with the direct memory
+/// reference.
+static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
+                                           unsigned Reg) {
+  // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg.
+  MI->getOperand(Operand).setReg(Reg);
+  MI->getOperand(Operand + 1).setImm(1);
+  MI->getOperand(Operand + 2).setReg(0);
+  MI->getOperand(Operand + 3).setImm(0);
+  MI->getOperand(Operand + 4).setReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
+  return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, bool isKill1,
+                                            unsigned Reg2, bool isKill2) {
+  return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+    .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+               const X86AddressMode &AM) {
+  assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else {
+    assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  }
+
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+  else
+    MIB.addImm(AM.Disp);
+
+  return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  auto Flags = MachineMemOperand::MONone;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         unsigned GlobalBaseReg, unsigned char OpFlags) {
+  //FIXME: factor this
+  return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+    .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 000000000000..c73c95019f8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,112 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// CMOV instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      isCommutable = 1, SchedRW = [WriteALU] in {
+    def NAME#16rr
+      : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst,
+                (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+                IIC_CMOV16_RR>, TB, OpSize16;
+    def NAME#32rr
+      : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst,
+                (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB, OpSize32;
+    def NAME#64rr
+      :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst,
+                (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB;
+  }
+
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      SchedRW = [WriteALULd, ReadAfterLd] in {
+    def NAME#16rm
+      : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV16_RM>,
+                                    TB, OpSize16;
+    def NAME#32rm
+      : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>,
+                                    TB, OpSize32;
+    def NAME#64rm
+      :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+  } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO  : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB  : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE  : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA  : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS  : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP  : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL  : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG  : CMOV<0x4F, "cmovg" , X86_COND_G>;
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+  let Uses = [EFLAGS] in {
+    def r    : I<opc, MRMXr,  (outs GR8:$dst), (ins),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+                     IIC_SET_R>, TB, Sched<[WriteALU]>;
+    def m    : I<opc, MRMXm,  (outs), (ins i8mem:$dst),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+                     IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+  } // Uses = [EFLAGS]
+}
+
+defm SETO  : SETCC<0x90, "seto",  X86_COND_O>;   // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>;  // is overflow bit not set
+defm SETB  : SETCC<0x92, "setb",  X86_COND_B>;   // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>;  // unsigned greater or equal
+defm SETE  : SETCC<0x94, "sete",  X86_COND_E>;   // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>;  // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>;  // unsigned less than or equal
+defm SETA  : SETCC<0x97, "seta",  X86_COND_A>;   // unsigned greater than
+defm SETS  : SETCC<0x98, "sets",  X86_COND_S>;   // is signed bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>;  // is not signed
+defm SETP  : SETCC<0x9A, "setp",  X86_COND_P>;   // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>;  // is parity bit not set
+defm SETL  : SETCC<0x9C, "setl",  X86_COND_L>;   // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>;  // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>;  // signed less than or equal
+defm SETG  : SETCC<0x9F, "setg",  X86_COND_G>;   // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 000000000000..3c27eb8077d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1932 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 32 bits.
+  return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction.  This expands to code that looks like this:
+//     call  $next_inst
+//     popl %destreg"
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
+  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+                      "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKDOWN",
+                           []>,
+                          Requires<[NotLP64]>;
+def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[NotLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKDOWN",
+                           []>,
+                          Requires<[IsLP64]>;
+def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[IsLP64]>;
+}
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1, Defs = [EFLAGS] in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset),
+                               (implicit EFLAGS)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+                 (outs GR64:$dst),
+                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+                 "#VAARG_64 $dst, $ap, $size, $mode, $align",
+                 [(set GR64:$dst,
+                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+                  (implicit EFLAGS)]>;
+
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+}
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets.  These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+                     "# dynamic stack allocation",
+                     [(X86WinAlloca GR32:$size)]>,
+                     Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+                     "# dynamic stack allocation",
+                     [(X86WinAlloca GR64:$size)]>,
+                     Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+                    "ret\t#eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+                     "ret\t#eh_return, addr: $addr",
+                     [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+    isCodeGenOnly = 1, isReturn = 1 in {
+  def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+  // CATCHRET needs a custom inserter for SEH.
+  let usesCustomInserter = 1 in
+    def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+                     "# CATCHRET",
+                     [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
+
+// This instruction is responsible for re-establishing stack pointers after an
+// exception has been caught and we are rejoining normal control flow in the
+// parent function or funclet. It generally sets ESP and EBP, and optionally
+// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
+// elsewhere.
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
+def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[Not64BitMode]>;
+  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in {
+  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[Not64BitMode]>;
+  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+  }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by unwind info.
+//
+let isPseudo = 1 in {
+  def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
+                            "#SEH_PushReg $reg", []>;
+  def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+                            "#SEH_SaveReg $reg, $dst", []>;
+  def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
+                            "#SEH_SaveXMM $reg, $dst", []>;
+  def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
+                            "#SEH_StackAlloc $size", []>;
+  def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
+                            "#SEH_SetFrame $reg, $offset", []>;
+  def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
+                            "#SEH_PushFrame $mode", []>;
+  def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
+                            "#SEH_EndPrologue", []>;
+  def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
+                            "#SEH_Epilogue", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower.  We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+                          "", []>;
+
+// This instruction is lowered to a RET followed by a MOV.  The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+                                  "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instruction mapping movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isPseudo = 1, AddedComplexity = 20 in
+def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+  let AddedComplexity = 20;
+}
+
+let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+    AddedComplexity = 15 in {
+  // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+  // which only require 3 bytes compared to MOV32ri which requires 5.
+  let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+    def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                        [(set GR32:$dst, 1)]>;
+    def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                        [(set GR32:$dst, -1)]>;
+  }
+
+  // MOV16ri is 4 bytes, so the instructions above are smaller.
+  def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+  def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
+
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
+// FIXME: Add itinerary class and Schedule.
+def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
+                       [(set GR32:$dst, i32immSExt8:$src)]>,
+                     Requires<[OptForMinSize, NotWin64WithoutFP]>;
+def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
+                       [(set GR64:$dst, i64immSExt8:$src)]>,
+                     Requires<[OptForMinSize, NotWin64WithoutFP]>;
+}
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isPseudo = 1, hasSideEffects = 0 in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
+
+// This 64-bit pseudo-move can be used for both a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type.  When
+// this happens, it is great.  However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+          (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[Not64BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+                   Requires<[Not64BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+                   Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
+                   Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
+                   Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                    [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,ECX,EDI] in
+  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[Not64BitMode]>;
+  let Uses = [AX,ECX,EDI] in
+  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+                     Requires<[Not64BitMode]>;
+  let Uses = [EAX,ECX,EDI] in
+  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+                     Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,RCX,RDI] in
+  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+  let Uses = [AX,RCX,RDI] in
+  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
+                     Requires<[In64BitMode]>;
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
+                     Requires<[In64BitMode]>;
+
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+                      [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    usesCustomInserter = 1, Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_addr32",
+                  [(X86tlsaddr tls32addr:$sym)]>,
+                  Requires<[Not64BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_base_addr32",
+                  [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+                  Requires<[Not64BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    usesCustomInserter = 1, Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_addr64",
+                  [(X86tlsaddr tls64addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_base_addr64",
+                  [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+                  Requires<[In64BitMode]>;
+}
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax.  %ecx is trashed during the function
+// call.  All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+    Uses = [ESP],
+    usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                "# TLSCall_32",
+                [(X86TLSCall addr:$sym)]>,
+                Requires<[Not64BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, but the
+// pseudo directly use the symbol, so do not add an implicit use of
+// %rdi. The lowering will do the right thing with RDI.
+// On return the address of the variable is in %rax.  All other
+// registers are preserved.
+let Defs = [RAX, EFLAGS],
+    Uses = [RSP],
+    usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                  "# TLSCall_64",
+                  [(X86TLSCall addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// CMOV* - Used to implement the SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
+  def CMOV#NAME  : I<0, Pseudo,
+                    (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
+                    "#CMOV_"#NAME#" PSEUDO!",
+                    [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+                                                EFLAGS)))]>;
+}
+
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+  // X86 doesn't have 8-bit conditional moves. Use a customInserter to
+  // emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+  // however that requires promoting the operands, and can induce additional
+  // i8 register pressure.
+  defm _GR8 : CMOVrr_PSEUDO<GR8, i8>;
+
+  let Predicates = [NoCMov] in {
+    defm _GR32 : CMOVrr_PSEUDO<GR32, i32>;
+    defm _GR16 : CMOVrr_PSEUDO<GR16, i16>;
+  } // Predicates = [NoCMov]
+
+  // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+  // SSE1/SSE2.
+  let Predicates = [FPStackf32] in
+    defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>;
+
+  let Predicates = [FPStackf64] in
+    defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>;
+
+  defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
+
+  defm _FR32   : CMOVrr_PSEUDO<FR32, f32>;
+  defm _FR64   : CMOVrr_PSEUDO<FR64, f64>;
+  defm _FR128  : CMOVrr_PSEUDO<FR128, f128>;
+  defm _V4F32  : CMOVrr_PSEUDO<VR128, v4f32>;
+  defm _V2F64  : CMOVrr_PSEUDO<VR128, v2f64>;
+  defm _V2I64  : CMOVrr_PSEUDO<VR128, v2i64>;
+  defm _V8F32  : CMOVrr_PSEUDO<VR256, v8f32>;
+  defm _V4F64  : CMOVrr_PSEUDO<VR256, v4f64>;
+  defm _V4I64  : CMOVrr_PSEUDO<VR256, v4i64>;
+  defm _V8I64  : CMOVrr_PSEUDO<VR512, v8i64>;
+  defm _V8F64  : CMOVrr_PSEUDO<VR512, v8f64>;
+  defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>;
+  defm _V8I1   : CMOVrr_PSEUDO<VK8,  v8i1>;
+  defm _V16I1  : CMOVrr_PSEUDO<VK16, v16i1>;
+  defm _V32I1  : CMOVrr_PSEUDO<VK32, v32i1>;
+  defm _V64I1  : CMOVrr_PSEUDO<VK64, v64i1>;
+} // usesCustomInserter = 1, Uses = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+                      "or{l}\t{$zero, $dst|$dst, $zero}", [],
+                      IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
+                    Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+                     "#MEMBARRIER",
+                     [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+                           Format ImmMod, SDPatternOperator Op, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                  !strconcat(mnemonic, "{b}\t",
+                             "{$src2, $dst|$dst, $src2}"),
+                  [(set EFLAGS, (Op addr:$dst, GR8:$src2))],
+                  IIC_ALU_NONMEM>, LOCK;
+
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   !strconcat(mnemonic, "{w}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [(set EFLAGS, (Op addr:$dst, GR16:$src2))],
+                   IIC_ALU_NONMEM>, OpSize16, LOCK;
+
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   !strconcat(mnemonic, "{l}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [(set EFLAGS, (Op addr:$dst, GR32:$src2))],
+                   IIC_ALU_NONMEM>, OpSize32, LOCK;
+
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    !strconcat(mnemonic, "{q}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [(set EFLAGS, (Op addr:$dst, GR64:$src2))],
+                    IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+                    !strconcat(mnemonic, "{b}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))],
+                    IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))],
+                      IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))],
+                      IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                          !strconcat(mnemonic, "{q}\t",
+                                     "{$src2, $dst|$dst, $src2}"),
+                          [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))],
+                          IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))],
+                      IIC_ALU_MEM>, OpSize16, LOCK;
+
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))],
+                      IIC_ALU_MEM>, OpSize32, LOCK;
+
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+                       !strconcat(mnemonic, "{q}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))],
+                       IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
+
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          int Increment, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in {
+def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
+                 !strconcat(mnemonic, "{b}\t$dst"),
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))],
+                  IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+                 !strconcat(mnemonic, "{w}\t$dst"),
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))],
+                 IIC_UNARY_MEM>, OpSize16, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+                 !strconcat(mnemonic, "{l}\t$dst"),
+                 [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))],
+                 IIC_UNARY_MEM>, OpSize32, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+                  !strconcat(mnemonic, "{q}\t$dst"),
+                  [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))],
+                  IIC_UNARY_MEM>, LOCK;
+}
+}
+
+defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m,  1, "inc">;
+defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+                         SDPatternOperator frag, X86MemOperand x86memop,
+                         InstrItinClass itin> {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
+  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+               !strconcat(mnemonic, "\t$ptr"),
+               [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic, SDPatternOperator frag,
+                          InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+  let Defs = [AL, EFLAGS], Uses = [AL] in
+  def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+                  !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+  let Defs = [AX, EFLAGS], Uses = [AX] in
+  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
+  let Defs = [EAX, EFLAGS], Uses = [EAX] in
+  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
+  let Defs = [RAX, EFLAGS], Uses = [RAX] in
+  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+                                X86cas8, i64mem,
+                                IIC_CMPX_LOCK_8B>;
+}
+
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register will not fix the clobbering of
+// RBX that will happen when setting the arguments for the instrucion.
+// 
+// Unlike the actual related instuction, we mark that this one
+// defines EBX (instead of using EBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding EBX is ebx_input.
+//
+// The additional argument, $ebx_save, is a temporary register used to
+// save the value of RBX accross the actual instruction.
+//
+// To make sure the register assigned to $ebx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
+    Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+def LCMPXCHG8B_SAVE_EBX :
+    I<0, Pseudo, (outs GR32:$dst),
+      (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
+      !strconcat("cmpxchg8b", "\t$ptr"),
+      [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
+                                        GR32:$ebx_save))],
+      IIC_CMPX_LOCK_8B>;
+}
+
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+                                 X86cas16, i128mem,
+                                 IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+// Same as LCMPXCHG8B_SAVE_RBX but for the 16 Bytes variant.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW],
+    isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
+    usesCustomInserter = 1 in {
+def LCMPXCHG16B_SAVE_RBX :
+    I<0, Pseudo, (outs GR64:$dst),
+      (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
+      !strconcat("cmpxchg16b", "\t$ptr"),
+      [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
+                                                    GR64:$rbx_save))],
+      IIC_CMPX_LOCK_16B>;
+}
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+                             string frag,
+                             InstrItinClass itin8, InstrItinClass itin> {
+  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+      SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                    (ins GR8:$val, i8mem:$ptr),
+                    !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                    [(set GR8:$dst,
+                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                    itin8>;
+    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+                    (ins GR16:$val, i16mem:$ptr),
+                    !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR16:$dst,
+                       (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                    itin>, OpSize16;
+    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+                    (ins GR32:$val, i32mem:$ptr),
+                    !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR32:$dst,
+                       (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                    itin>, OpSize32;
+    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                     (ins GR64:$val, i64mem:$ptr),
+                     !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                     [(set
+                        GR64:$dst,
+                        (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                     itin>;
+  }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+             TB, LOCK;
+
+/* The following multiclass tries to make sure that in code like
+ *    x.store (immediate op x.load(acquire), release)
+ * and
+ *    x.store (register op x.load(acquire), release)
+ * an operation directly on memory is generated instead of wasting a register.
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions
+ * extremely late to prevent them from being accidentally reordered in the backend
+ * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
+ */
+multiclass RELEASE_BINOP_MI<SDNode op> {
+    def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+        "#BINOP "#NAME#"8mi PSEUDO!",
+        [(atomic_store_8 addr:$dst, (op
+            (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+    def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+        "#BINOP "#NAME#"8mr PSEUDO!",
+        [(atomic_store_8 addr:$dst, (op
+            (atomic_load_8 addr:$dst), GR8:$src))]>;
+    // NAME#16 is not generated as 16-bit arithmetic instructions are considered
+    // costly and avoided as far as possible by this backend anyway
+    def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+        "#BINOP "#NAME#"32mi PSEUDO!",
+        [(atomic_store_32 addr:$dst, (op
+            (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+    def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+        "#BINOP "#NAME#"32mr PSEUDO!",
+        [(atomic_store_32 addr:$dst, (op
+            (atomic_load_32 addr:$dst), GR32:$src))]>;
+    def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+        "#BINOP "#NAME#"64mi32 PSEUDO!",
+        [(atomic_store_64 addr:$dst, (op
+            (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+    def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+        "#BINOP "#NAME#"64mr PSEUDO!",
+        [(atomic_store_64 addr:$dst, (op
+            (atomic_load_64 addr:$dst), GR64:$src))]>;
+}
+let Defs = [EFLAGS] in {
+  defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
+  defm RELEASE_AND : RELEASE_BINOP_MI<and>;
+  defm RELEASE_OR  : RELEASE_BINOP_MI<or>;
+  defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
+  // Note: we don't deal with sub, because substractions of constants are
+  //       optimized into additions before this code can run.
+}
+
+// Same as above, but for floating-point.
+// FIXME: imm version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1 in {
+multiclass RELEASE_FP_BINOP_MI<SDNode op> {
+    def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+        "#BINOP "#NAME#"32mr PSEUDO!",
+        [(atomic_store_32 addr:$dst,
+	   (i32 (bitconvert (op
+             (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+	      FR32:$src))))]>, Requires<[HasSSE1]>;
+    def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+        "#BINOP "#NAME#"64mr PSEUDO!",
+        [(atomic_store_64 addr:$dst,
+	   (i64 (bitconvert (op
+             (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+	      FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
+}
+
+multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
+    def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
+        "#UNOP "#NAME#"8m PSEUDO!",
+        [(atomic_store_8 addr:$dst, dag8)]>;
+    def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
+        "#UNOP "#NAME#"16m PSEUDO!",
+        [(atomic_store_16 addr:$dst, dag16)]>;
+    def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
+        "#UNOP "#NAME#"32m PSEUDO!",
+        [(atomic_store_32 addr:$dst, dag32)]>;
+    def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
+        "#UNOP "#NAME#"64m PSEUDO!",
+        [(atomic_store_64 addr:$dst, dag64)]>;
+}
+
+let Defs = [EFLAGS] in {
+  defm RELEASE_INC : RELEASE_UNOP<
+      (add (atomic_load_8  addr:$dst), (i8 1)),
+      (add (atomic_load_16 addr:$dst), (i16 1)),
+      (add (atomic_load_32 addr:$dst), (i32 1)),
+      (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+  defm RELEASE_DEC : RELEASE_UNOP<
+      (add (atomic_load_8  addr:$dst), (i8 -1)),
+      (add (atomic_load_16 addr:$dst), (i16 -1)),
+      (add (atomic_load_32 addr:$dst), (i32 -1)),
+      (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+}
+/*
+TODO: These don't work because the type inference of TableGen fails.
+TODO: find a way to fix it.
+let Defs = [EFLAGS] in {
+  defm RELEASE_NEG : RELEASE_UNOP<
+      (ineg (atomic_load_8  addr:$dst)),
+      (ineg (atomic_load_16 addr:$dst)),
+      (ineg (atomic_load_32 addr:$dst)),
+      (ineg (atomic_load_64 addr:$dst))>;
+}
+// NOT doesn't set flags.
+defm RELEASE_NOT : RELEASE_UNOP<
+    (not (atomic_load_8  addr:$dst)),
+    (not (atomic_load_16 addr:$dst)),
+    (not (atomic_load_32 addr:$dst)),
+    (not (atomic_load_64 addr:$dst))>;
+*/
+
+def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+			"#RELEASE_MOV8mi PSEUDO!",
+			[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
+def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
+			"#RELEASE_MOV16mi PSEUDO!",
+			[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
+def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+			"#RELEASE_MOV32mi PSEUDO!",
+			[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
+def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+			"#RELEASE_MOV64mi32 PSEUDO!",
+			[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
+
+def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+                        "#RELEASE_MOV8mr PSEUDO!",
+                        [(atomic_store_8  addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+                        "#RELEASE_MOV16mr PSEUDO!",
+                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+                        "#RELEASE_MOV32mr PSEUDO!",
+                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+                        "#RELEASE_MOV64mr PSEUDO!",
+                        [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+                      "#ACQUIRE_MOV8rm PSEUDO!",
+                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+                      "#ACQUIRE_MOV16rm PSEUDO!",
+                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+                      "#ACQUIRE_MOV32rm PSEUDO!",
+                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+                      "#ACQUIRE_MOV64rm PSEUDO!",
+                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves
+// binary size compared to a regular MOV, but it introduces an unnecessary
+// load, so is not suitable for regular or optsize functions.
+let Predicates = [OptForMinSize] in {
+def : Pat<(store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+}
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper mcsym:$dst)),
+          (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, mcsym:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+          Requires<[NearData, IsNotPIC]>;
+
+def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>;
+def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>;
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+          (MOV64ri32 tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+                             (X86tcret node:$ptr, node:$off), [{
+  // X86tcret args: (*chain, ptr, imm, regs..., glue)
+  unsigned NumRegs = 0;
+  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+      return false;
+  return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[Not64BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+          (TCRETURNmi addr:$dst, imm:$off)>,
+          Requires<[Not64BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
+          Requires<[NotLP64]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>,
+          Requires<[NotLP64]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+          (TCRETURNmi64 addr:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+          Requires<[IsLP64]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+          (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+          Requires<[IsLP64]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+          (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+                  Instruction Inst64> {
+  let Predicates = [HasCMov] in {
+    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+              (Inst16 GR16:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+              (Inst32 GR32:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+              (Inst64 GR64:$src2, addr:$src1)>;
+  }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+// i1 stored in one byte in zero-extended form.
+// Upper bits cleanup should be executed before Store.
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i32 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. Any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != ISD::AssertSext;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies.  However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read.  To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  APInt KnownZero0, KnownOne0;
+  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before the selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "", // orw/addw REG, REG
+                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                    "", // orl/addl REG, REG
+                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                    "", // orq/addq REG, REG
+                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                    "", // orw/addw REG, imm8
+                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "", // orw/addw REG, imm
+                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                    "", // orl/addl REG, imm8
+                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "", // orl/addl REG, imm
+                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                    "", // orq/addq REG, imm8
+                    [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                      "", // orq/addq REG, imm
+                      [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                  i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+          (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+          (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+          (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+          (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+          (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+          (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of a 8 bit constant, use that.
+// This can also reduce instruction size by eliminating the need for the REX
+// prefix.
+
+// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32.
+let AddedComplexity = 1 in {
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri8
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo8XForm imm:$imm))),
+            sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo32XForm imm:$imm))),
+            sub_32bit)>;
+} // AddedComplexity = 1
+
+
+// AddedComplexity is needed due to the increased complexity on the
+// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all
+// the MOVZX patterns keeps thems together in DAGIsel tables.
+let AddedComplexity = 1 in {
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[Not64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+             sub_16bit)>,
+      Requires<[Not64BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+          (SUBREG_TO_REG (i64 0),
+                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+                         sub_32bit)>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+          (SUBREG_TO_REG (i64 0),
+                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+                      sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+          (SUBREG_TO_REG (i64 0),
+                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+                         sub_32bit)>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+      Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (i8
+            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+      Requires<[In64BitMode]>;
+} // AddedComplexity = 1
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+             sub_16bit)>,
+      Requires<[Not64BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+      Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (MOVSX32rr8
+            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+      Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32rr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[Not64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                                   GR32_ABCD)),
+                                             sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                            sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{
+  return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
+def immShift64 : ImmLeaf<i8, [{
+  return countTrailingOnes<uint64_t>(Imm) >= 6;
+}]>;
+
+// Shift amount is implicitly masked.
+multiclass MaskedShiftAmountPats<SDNode frag, string name> {
+  // (shift x (and y, 31)) ==> (shift x, y)
+  def : Pat<(frag GR8:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+  def : Pat<(frag GR16:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+  def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+            (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+  def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+  def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+  def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+            (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+  // (shift x (and y, 63)) ==> (shift x, y)
+  def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+            (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+  def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+            (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+defm : MaskedShiftAmountPats<shl, "SHL">;
+defm : MaskedShiftAmountPats<srl, "SHR">;
+defm : MaskedShiftAmountPats<sra, "SAR">;
+defm : MaskedShiftAmountPats<rotl, "ROL">;
+defm : MaskedShiftAmountPats<rotr, "ROR">;
+
+// Double shift amount is implicitly masked.
+multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
+  // (shift x (and y, 31)) ==> (shift x, y)
+  def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)),
+            (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
+  def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)),
+            (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
+
+  // (shift x (and y, 63)) ==> (shift x, y)
+  def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)),
+            (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
+}
+
+defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
+defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+          (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+          (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+          (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+          (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+          (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+          (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+          (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+          (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+          (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+          (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+          (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+          (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+          (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+          (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment/Decrement reg.
+// Do not make INC/DEC if it is slow
+let Predicates = [NotSlowIncDec] in {
+  def : Pat<(add GR8:$src, 1),   (INC8r GR8:$src)>;
+  def : Pat<(add GR16:$src, 1),  (INC16r GR16:$src)>;
+  def : Pat<(add GR32:$src, 1),  (INC32r GR32:$src)>;
+  def : Pat<(add GR64:$src, 1),  (INC64r GR64:$src)>;
+  def : Pat<(add GR8:$src, -1),  (DEC8r GR8:$src)>;
+  def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
+  def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
+  def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+}
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+          (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+          (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+          (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+          (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+          (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+          (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+          (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+          (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+          (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+          (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+          (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+          (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+          (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+          (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+          (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+          (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+          (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+          (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
+
+// When HasMOVBE is enabled it is possible to get a non-legalized
+// register-register 16 bit bswap. This maps it to a ROL instruction.
+let Predicates = [HasMOVBE] in {
+ def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 000000000000..4ea223e82be9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,358 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+  def RETL   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret{l}", [], IIC_RET>, OpSize32,
+                    Requires<[Not64BitMode]>;
+  def RETQ   : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret{q}", [], IIC_RET>, OpSize32,
+                    Requires<[In64BitMode]>;
+  def RETW   : I   <0xC3, RawFrm, (outs), (ins),
+                    "ret{w}",
+                    [], IIC_RET>, OpSize16;
+  def RETIL  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret{l}\t$amt",
+                    [], IIC_RET_IMM>, OpSize32,
+               Requires<[Not64BitMode]>;
+  def RETIQ  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret{q}\t$amt",
+                    [], IIC_RET_IMM>, OpSize32,
+               Requires<[In64BitMode]>;
+  def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+                    "ret{w}\t$amt",
+                    [], IIC_RET_IMM>, OpSize16;
+  def LRETL  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{l|f}", [], IIC_RET>, OpSize32;
+  def LRETQ  : RI  <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
+  def LRETW  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{w|f}", [], IIC_RET>, OpSize16;
+  def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
+  def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
+  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+
+  // The machine return from interrupt instruction, but sometimes we need to
+  // perform a post-epilogue stack adjustment. Codegen emits the pseudo form
+  // which expands to include an SP adjustment if necessary.
+  def IRET16 : I   <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+               OpSize16;
+  def IRET32 : I   <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
+                    IIC_IRET>, OpSize32;
+  def IRET64 : RI  <0xcf, RawFrm, (outs), (ins), "iretq", [],
+                    IIC_IRET>, Requires<[In64BitMode]>;
+  let isCodeGenOnly = 1 in
+  def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
+  def RET  : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+                       "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+    def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
+                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
+    def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
+                          "jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
+  }
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
+    let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
+      def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
+                         [], IIC_Jcc>, OpSize16, TB;
+      def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
+                         [], IIC_Jcc>, TB, OpSize32;
+    }
+  }
+}
+
+defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
+defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA  : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS  : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
+  // These are the 32-bit versions of this instruction for the asmparser.  In
+  // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
+  // jecxz.
+  let Uses = [CX] in
+    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
+                        Requires<[Not64BitMode]>;
+  let Uses = [ECX] in
+    def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
+
+  let Uses = [RCX] in
+    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                         "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
+                         Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP16r     : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
+                     [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+                   OpSize16, Sched<[WriteJump]>;
+  def JMP16m     : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
+                     [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
+
+  def JMP32r     : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+                     [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
+                   OpSize32, Sched<[WriteJump]>;
+  def JMP32m     : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+                     [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
+
+  def JMP64r     : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+                     [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+                   Sched<[WriteJump]>;
+  def JMP64m     : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+                     [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+  let Predicates = [Not64BitMode] in {
+    def FARJMP16i  : Iseg16<0xEA, RawFrmImm16, (outs),
+                            (ins i16imm:$off, i16imm:$seg),
+                            "ljmp{w}\t$seg, $off", [],
+                            IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+    def FARJMP32i  : Iseg32<0xEA, RawFrmImm16, (outs),
+                            (ins i32imm:$off, i16imm:$seg),
+                            "ljmp{l}\t$seg, $off", [],
+                            IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+  }
+  def FARJMP64   : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+                      "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+                   Sched<[WriteJump]>;
+
+  def FARJMP16m  : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+                     "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
+                   Sched<[WriteJumpLd]>;
+  def FARJMP32m  : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+                     "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
+                   Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP   : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE  : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. ESP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Uses = [ESP] in {
+    def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+                           (outs), (ins i32imm_pcrel:$dst),
+                           "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
+                      Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    let hasSideEffects = 0 in
+      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                             (outs), (ins i16imm_pcrel:$dst),
+                             "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
+                        Sched<[WriteJump]>;
+    def CALL16r     : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
+                        "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
+                      OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    def CALL16m     : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
+                        "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
+                        IIC_CALL_MEM>, OpSize16,
+                      Requires<[Not64BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+    def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+                        "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+                      OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
+    def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+                        "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+                        IIC_CALL_MEM>, OpSize32,
+                      Requires<[Not64BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+
+    let Predicates = [Not64BitMode] in {
+      def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
+                               (ins i16imm:$off, i16imm:$seg),
+                               "lcall{w}\t$seg, $off", [],
+                               IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
+      def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
+                               (ins i32imm:$off, i16imm:$seg),
+                               "lcall{l}\t$seg, $off", [],
+                               IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
+    }
+
+    def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+                        "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
+                      Sched<[WriteJumpLd]>;
+    def FARCALL32m  : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+                        "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
+                      Sched<[WriteJumpLd]>;
+  }
+
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [ESP] in {
+  def TCRETURNdi : PseudoI<(outs),
+                     (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+  def TCRETURNri : PseudoI<(outs),
+                     (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi : PseudoI<(outs),
+                     (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+  // FIXME: The should be pseudo instructions that are lowered when going to
+  // mcinst.
+  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+                           (ins i32imm_pcrel:$dst),
+                           "jmp\t$dst",
+                           [], IIC_JMP_REL>;
+
+  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                   "", [], IIC_JMP_REG>;  // FIXME: Remove encoding when JIT is dead.
+  let mayLoad = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+                   "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [ESP, EFLAGS] in {
+  def TCRETURNdicc : PseudoI<(outs),
+                     (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+
+  // This gets substituted to a conditional jump instruction in MC lowering.
+  def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
+                           (ins i32imm_pcrel:$dst, i32imm:$cond),
+                           "",
+                           [], IIC_JMP_REL>;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+  // NOTE: this pattern doesn't match "X86call imm", because we do not know
+  // that the offset between an arbitrary immediate and the call will fit in
+  // the 32-bit pcrel field that we have.
+  def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+                        (outs), (ins i64i32imm_pcrel:$dst),
+                        "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
+                      Requires<[In64BitMode]>;
+  def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+                        "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+                        IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+                        "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+  def FARCALL64   : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+                       "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+    SchedRW = [WriteJump] in {
+  def TCRETURNdi64   : PseudoI<(outs),
+                        (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+                        []>;
+  def TCRETURNri64   : PseudoI<(outs),
+                        (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi64   : PseudoI<(outs),
+                        (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
+                   "jmp\t$dst", [], IIC_JMP_REL>;
+
+  def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                     "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+  let mayLoad = 1 in
+  def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+                     "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+  // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+  let hasREX_WPrefix = 1 in {
+    def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                           "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+
+    let mayLoad = 1 in
+    def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+                           "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
+  }
+}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [RSP, EFLAGS] in {
+  def TCRETURNdi64cc : PseudoI<(outs),
+                           (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+                            i32imm:$cond), []>;
+
+  // This gets substituted to a conditional jump instruction in MC lowering.
+  def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
+                           (ins i64i32imm_pcrel:$dst, i32imm:$cond),
+                           "",
+                           [], IIC_JMP_REL>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 000000000000..af43d9f53325
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,186 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+  let Defs = [AX], Uses = [AL] in
+  def CBW : I<0x98, RawFrm, (outs), (ins),
+              "{cbtw|cbw}", [], IIC_CBW>, OpSize16;  // AX = signext(AL)
+  let Defs = [EAX], Uses = [AX] in
+  def CWDE : I<0x98, RawFrm, (outs), (ins),
+              "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
+
+  let Defs = [AX,DX], Uses = [AX] in
+  def CWD : I<0x99, RawFrm, (outs), (ins),
+              "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
+  let Defs = [EAX,EDX], Uses = [EAX] in
+  def CDQ : I<0x99, RawFrm, (outs), (ins),
+              "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
+
+
+  let Defs = [RAX], Uses = [EAX] in
+  def CDQE : RI<0x98, RawFrm, (outs), (ins),
+               "{cltq|cdqe}", [], IIC_CBW>;     // RAX = signext(EAX)
+
+  let Defs = [RAX,RDX], Uses = [RAX] in
+  def CQO  : RI<0x99, RawFrm, (outs), (ins),
+                "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
+}
+
+
+
+// Sign/Zero extenders
+let hasSideEffects = 0 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+                   TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+                   TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+                   OpSize32, TB, Sched<[WriteALULd]>;
+
+let hasSideEffects = 0 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+                   TB, OpSize16, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+                   TB, OpSize16, Sched<[WriteALULd]>;
+} // hasSideEffects = 0
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+                   OpSize32, Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+                   TB, OpSize32, Sched<[WriteALULd]>;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+let hasSideEffects = 0, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>;
+
+def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+                    Sched<[WriteALU]>, Requires<[In64BitMode]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+                    Sched<[WriteALULd]>, Requires<[In64BitMode]>;
+
+// movzbq and movzwq encodings for the disassembler
+let hasSideEffects = 0 in {
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+                     "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                     TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+                     "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                     TB, Sched<[WriteALULd]>;
+def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                     "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                     TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                     "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                     TB, Sched<[WriteALULd]>;
+}
+
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 000000000000..4b19f801dae1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,443 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined
+// below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be in 3. Thus,
+// in that case, only the operands 1 and 2 can be swapped.
+// Commuting some of operands may require the opcode change.
+// FMA*213*:
+//   operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+//   operands 1 and 3 (register forms only):     *213* --> *231*;
+//   operands 2 and 3 (register forms only):     *213* --> *132*.
+// FMA*132*:
+//   operands 1 and 2 (memory & register forms): *132* --> *231*;
+//   operands 1 and 3 (register forms only):     *132* --> *132*(no changes);
+//   operands 2 and 3 (register forms only):     *132* --> *213*.
+// FMA*231*:
+//   operands 1 and 2 (memory & register forms): *231* --> *132*;
+//   operands 1 and 3 (register forms only):     *231* --> *213*;
+//   operands 2 and 3 (register forms only):     *231* --> *231*(no changes).
+
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+                    PatFrag MemFrag128, PatFrag MemFrag256,
+                    ValueType OpVT128, ValueType OpVT256,
+                    SDPatternOperator Op = null_frag> {
+  def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+                                               VR128:$src1, VR128:$src3)))]>;
+
+  let mayLoad = 1 in
+  def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+                                               (MemFrag128 addr:$src3))))]>;
+
+  def Yr    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+                                               VR256:$src3)))]>, VEX_L;
+
+  let mayLoad = 1 in
+  def Ym    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst,
+                     (OpVT256 (Op VR256:$src2, VR256:$src1,
+                               (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpcodeStr, string PackTy, string Suff,
+                       PatFrag MemFrag128, PatFrag MemFrag256,
+                       SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+  defm NAME#213#Suff : fma3p_rm<opc213,
+                                !strconcat(OpcodeStr, "213", PackTy),
+                                MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+  defm NAME#132#Suff : fma3p_rm<opc132,
+                                !strconcat(OpcodeStr, "132", PackTy),
+                                MemFrag128, MemFrag256, OpTy128, OpTy256>;
+  defm NAME#231#Suff : fma3p_rm<opc231,
+                                !strconcat(OpcodeStr, "231", PackTy),
+                                MemFrag128, MemFrag256, OpTy128, OpTy256>;
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
+                               loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32>;
+  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
+                               loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
+  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
+                               loadv4f32, loadv8f32, X86Fmaddsub,
+                               v4f32, v8f32>;
+  defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
+                               loadv4f32, loadv8f32, X86Fmsubadd,
+                               v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
+                               loadv2f64, loadv4f64, X86Fmadd, v2f64,
+                               v4f64>, VEX_W;
+  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
+                               loadv2f64, loadv4f64, X86Fmsub, v2f64,
+                               v4f64>, VEX_W;
+  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
+                               loadv2f64, loadv4f64, X86Fmaddsub,
+                               v2f64, v4f64>, VEX_W;
+  defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD",
+                               loadv2f64, loadv4f64, X86Fmsubadd,
+                               v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
+                             loadv8f32, X86Fnmadd, v4f32, v8f32>;
+  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
+                             loadv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
+                             loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
+                             loadv4f64, X86Fnmsub, v2f64, v4f64>, VEX_W;
+}
+
+// All source register operands of FMA opcodes defined in fma3s_rm multiclass
+// can be commuted. In many cases such commute transformation requres an opcode
+// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
+// would require an opcode change to FMA*231:
+//     FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+//     -->
+//     FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+                    X86MemOperand x86memop, RegisterClass RC,
+                    SDPatternOperator OpNode = null_frag> {
+  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
+
+  let mayLoad = 1 in
+  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+}
+
+// These FMA*_Int instructions are defined specially for being used when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis,
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may be not implemented yet we allow the routines doing the actual commute
+// transformation to decide if one or another instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+    hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+                        Operand memopr, RegisterClass RC> {
+  def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   []>;
+
+  let mayLoad = 1 in
+  def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, memopr:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   []>;
+}
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpStr, string PackTy, string Suff,
+                       SDNode OpNode, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+                                x86memop, RC>;
+  defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+                                x86memop, RC, OpNode>;
+  defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+                                x86memop, RC>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinscis
+// to machine instructions.
+// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands
+// of FMA 213 form.
+// The FMA 231 form can be get only by commuting the 1st operand of 213 or 132
+// forms and is possible only after special analysis of all uses of the initial
+// instruction. Such analysis do not exist yet and thus introducing the 231
+// form of FMA*_Int instructions is done using an optimistic assumption that
+// such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                           string OpStr, string PackTy, string Suff,
+                           RegisterClass RC, Operand memop> {
+  defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+                                    memop, RC>;
+  defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+                                    memop, RC>;
+  defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+                                    memop, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                 string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+                 SDNode OpNode> {
+  let ExeDomain = SSEPackedSingle in
+  defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
+                          FR32, f32mem>,
+              fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS",
+                              VR128, ssmem>;
+
+  let ExeDomain = SSEPackedDouble in
+  defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode,
+                        FR64, f64mem>,
+              fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD",
+                              VR128, sdmem>, VEX_W;
+
+  // These patterns use the 123 ordering, instead of 213, even though
+  // they match the intrinsic to the 213 version of the instruction.
+  // This is because src1 is tied to dest, and the scalar intrinsics
+  // require the pass-through values to come from the first source
+  // operand, not the second.
+  let Predicates = [HasFMA] in {
+    def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
+              (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int")
+               $src1, $src2, $src3), VR128)>;
+
+    def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
+              (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int")
+               $src1, $src2, $src3), VR128)>;
+  }
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+                    int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+                    int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+                     int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+                     int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                 X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG;
+  def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86memop:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+                           (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG;
+  def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, x86memop:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+               VEX_LIG;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+                     ComplexPattern mem_cpat, Intrinsic Int> {
+let isCodeGenOnly = 1 in {
+  def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
+  def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, memop:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3))]>, VEX_W, VEX_LIG;
+  def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, memop:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+} // isCodeGenOnly = 1
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 ValueType OpVT128, ValueType OpVT256,
+                 PatFrag ld_frag128, PatFrag ld_frag256> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+           VEX_W;
+  def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+                              (ld_frag128 addr:$src3)))]>, VEX_W;
+  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+  let isCommutable = 1 in
+  def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+           VEX_W, VEX_L;
+  def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+                              (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L;
+  def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1,
+                              (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+  def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+                (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+                VEX_L;
+} // isCodeGenOnly = 1
+}
+
+let ExeDomain = SSEPackedSingle in {
+  // Scalar Instructions
+  defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+                    fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+                              int_x86_fma_vfmadd_ss>;
+  defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+                    fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+                              int_x86_fma_vfmsub_ss>;
+  defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+                          X86Fnmadd, loadf32>,
+                    fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+                              int_x86_fma_vfnmadd_ss>;
+  defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+                          X86Fnmsub, loadf32>,
+                    fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+                              int_x86_fma_vfnmsub_ss>;
+  // Packed Instructions
+  defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+  defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+                            loadv4f32, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  // Scalar Instructions
+  defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+                    fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+                              int_x86_fma_vfmadd_sd>;
+  defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+                    fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+                              int_x86_fma_vfmsub_sd>;
+  defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+                          X86Fnmadd, loadf64>,
+                    fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+                              int_x86_fma_vfnmadd_sd>;
+  defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+                          X86Fnmsub, loadf64>,
+                    fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+                              int_x86_fma_vfnmsub_sd>;
+  // Packed Instructions
+  defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+  defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+                            loadv2f64, loadv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
new file mode 100644
index 000000000000..db83497ee69d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -0,0 +1,285 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+using namespace llvm;
+
+/// This flag is used in the method llvm::call_once() used below to make the
+/// initialization of the map 'OpcodeToGroup' thread safe.
+LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+
+static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
+X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
+  return &*X86InstrFMA3InfoObj;
+}
+
+void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
+                                   const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
+
+  // Add the references from indvidual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
+          !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
+
+  // Add the references from indvidual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
+
+  // Add the references from indvidual opcodes to the group holding them.
+  assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
+          !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+#define FMA3RM(R132, R213, R231, M132, M213, M231)                             \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132);
+
+#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs)                     \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132, (Attrs));
+
+#define FMA3R(R132, R213, R231)                                                \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132);
+
+#define FMA3RA(R132, R213, R231, Attrs)                                        \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132, (Attrs));
+
+#define FMA3M(M132, M213, M231)                                                \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132);
+
+#define FMA3MA(M132, M213, M231, Attrs)                                        \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132, (Attrs));
+
+#define FMA3_AVX2_VECTOR_GROUP(Name)                                           \
+  FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr,                             \
+         Name##132PSm, Name##213PSm, Name##231PSm);                            \
+  FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr,                             \
+         Name##132PDm, Name##213PDm, Name##231PDm);                            \
+  FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr,                          \
+         Name##132PSYm, Name##213PSYm, Name##231PSYm);                         \
+  FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr,                          \
+         Name##132PDYm, Name##213PDYm, Name##231PDYm);
+
+#define FMA3_AVX2_SCALAR_GROUP(Name)                                           \
+  FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr,                             \
+         Name##132SSm, Name##213SSm, Name##231SSm);                            \
+  FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr,                             \
+         Name##132SDm, Name##213SDm, Name##231SDm);                            \
+  FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int,                \
+          Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int,                \
+          Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);
+
+#define FMA3_AVX2_FULL_GROUP(Name)                                             \
+  FMA3_AVX2_VECTOR_GROUP(Name);                                                \
+  FMA3_AVX2_SCALAR_GROUP(Name);
+
+#define FMA3_AVX512_VECTOR_GROUP(Name)                                         \
+  FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r,                 \
+         Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m);                \
+  FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r,                 \
+         Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m);                \
+  FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r,                 \
+         Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m);                \
+  FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r,                 \
+         Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m);                \
+  FMA3RM(Name##132PSZr,    Name##213PSZr,    Name##231PSZr,                    \
+         Name##132PSZm,    Name##213PSZm,    Name##231PSZm);                   \
+  FMA3RM(Name##132PDZr,    Name##213PDZr,    Name##231PDZr,                    \
+         Name##132PDZm,    Name##213PDZm,    Name##231PDZm);                   \
+  FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk,             \
+          Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk,             \
+          Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk,             \
+          Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk,             \
+          Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZrk,    Name##213PSZrk,    Name##231PSZrk,                \
+          Name##132PSZmk,    Name##213PSZmk,    Name##231PSZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZrk,    Name##213PDZrk,    Name##231PDZrk,                \
+          Name##132PDZmk,    Name##213PDZmk,    Name##231PDZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz,          \
+          Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz,          \
+          Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz,          \
+          Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz,          \
+          Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZrkz,    Name##213PSZrkz,    Name##231PSZrkz,             \
+          Name##132PSZmkz,    Name##213PSZmkz,    Name##231PSZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZrkz,    Name##213PDZrkz,    Name##231PDZrkz,             \
+          Name##132PDZmkz,    Name##213PDZmkz,    Name##231PDZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb);                       \
+  FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb);                       \
+  FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb);              \
+  FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb);              \
+  FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb);              \
+  FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb);              \
+  FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb);                       \
+  FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb);                       \
+  FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZmbk,    Name##213PSZmbk,    Name##231PSZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZmbk,    Name##213PDZmbk,    Name##231PDZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_SCALAR_GROUP(Name)                                         \
+  FMA3RM(Name##132SSZr,      Name##213SSZr,     Name##231SSZr,                 \
+         Name##132SSZm,      Name##213SSZm,     Name##231SSZm);                \
+  FMA3RM(Name##132SDZr,      Name##213SDZr,     Name##231SDZr,                 \
+         Name##132SDZm,      Name##213SDZm,     Name##231SDZm);                \
+  FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int,             \
+          Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int,             \
+          Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk,          \
+          Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk,          \
+          Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz,       \
+          Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz,       \
+          Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);                           \
+  FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_FULL_GROUP(Name)                                           \
+  FMA3_AVX512_VECTOR_GROUP(Name);                                              \
+  FMA3_AVX512_SCALAR_GROUP(Name);
+
+void X86InstrFMA3Info::initGroupsOnceImpl() {
+  FMA3_AVX2_FULL_GROUP(VFMADD);
+  FMA3_AVX2_FULL_GROUP(VFMSUB);
+  FMA3_AVX2_FULL_GROUP(VFNMADD);
+  FMA3_AVX2_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
+
+  FMA3_AVX512_FULL_GROUP(VFMADD);
+  FMA3_AVX512_FULL_GROUP(VFMSUB);
+  FMA3_AVX512_FULL_GROUP(VFNMADD);
+  FMA3_AVX512_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+}
+
+void X86InstrFMA3Info::initGroupsOnce() {
+  llvm::call_once(InitGroupsOnceFlag,
+                  []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
new file mode 100644
index 000000000000..025cee3b2b90
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -0,0 +1,315 @@
+//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include "X86.h"
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <set>
+
+namespace llvm {
+/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
+/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
+/// or 6 register and memory opcodes. Also, each group has an attrubutes field
+/// describing it.
+class X86InstrFMA3Group {
+private:
+  /// Reference to an array holding 3 forms of register FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any register form opcodes.
+  const uint16_t *RegOpcodes;
+
+  /// Reference to an array holding 3 forms of memory FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any register form opcodes.
+  const uint16_t *MemOpcodes;
+
+  /// This bitfield specifies the attributes associated with the created
+  /// FMA groups of opcodes.
+  unsigned Attributes;
+
+  static const unsigned Form132 = 0;
+  static const unsigned Form213 = 1;
+  static const unsigned Form231 = 2;
+
+public:
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of FMA intrinsic opcodes.
+  static const unsigned X86FMA3Intrinsic = 0x1;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+  /// passing the elements from the 1st operand to the result of the operation
+  /// when the correpondings bits in the k-mask are unset.
+  static const unsigned X86FMA3KMergeMasked = 0x2;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+  static const unsigned X86FMA3KZeroMasked = 0x4;
+
+  /// Constructor. Creates a new group of FMA opcodes with three register form
+  /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
+  /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
+  /// which means that the created group of FMA opcodes does not have the
+  /// corresponding (register or memory) opcodes.
+  /// The parameter \p Attr specifies the attributes describing the created
+  /// group.
+  X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
+                    unsigned Attr)
+      : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
+    assert((RegOpcodes || MemOpcodes) &&
+           "Cannot create a group not having any opcodes.");
+  }
+
+  /// Returns a memory form opcode that is the equivalent of the given register
+  /// form opcode \p RegOpcode. 0 is returned if the group does not have
+  /// either register of memory opcodes.
+  unsigned getMemOpcode(unsigned RegOpcode) const {
+    if (!RegOpcodes || !MemOpcodes)
+      return 0;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (RegOpcodes[Form] == RegOpcode)
+        return MemOpcodes[Form];
+    return 0;
+  }
+
+  /// Returns the 132 form of FMA register opcode.
+  unsigned getReg132Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA register opcode.
+  unsigned getReg213Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA register opcode.
+  unsigned getReg231Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form231];
+  }
+
+  /// Returns the 132 form of FMA memory opcode.
+  unsigned getMem132Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA memory opcode.
+  unsigned getMem213Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA memory opcode.
+  unsigned getMem231Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form231];
+  }
+
+  /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+  bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+  bool isKMergeMasked() const {
+    return (Attributes & X86FMA3KMergeMasked) != 0;
+  }
+
+  /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+  bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+  bool isKMasked() const {
+    return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
+  }
+
+  /// Returns true iff the given \p Opcode is a register opcode from the
+  /// groups of FMA opcodes.
+  bool isRegOpcodeFromGroup(unsigned Opcode) const {
+    if (!RegOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == RegOpcodes[Form])
+        return true;
+    return false;
+  }
+
+  /// Returns true iff the given \p Opcode is a memory opcode from the
+  /// groups of FMA opcodes.
+  bool isMemOpcodeFromGroup(unsigned Opcode) const {
+    if (!MemOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == MemOpcodes[Form])
+        return true;
+    return false;
+  }
+};
+
+/// This class provides information about all existing FMA3 opcodes
+///
+class X86InstrFMA3Info {
+private:
+  /// A map that is used to find the group of FMA opcodes using any FMA opcode
+  /// from the group.
+  DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method can be called many times, but the actual initialization is
+  /// called only once.
+  static void initGroupsOnce();
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
+  /// call is not thread safe.
+  void initGroupsOnceImpl();
+
+  /// Creates one group of FMA opcodes having the register opcodes
+  /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
+  /// specifies the attributes describing the created group.
+  void initRMGroup(const uint16_t *RegOpcodes,
+                   const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the register opcodes
+  /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the memory opcodes
+  /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+public:
+  /// Returns the reference to an object of this class. It is assumed that
+  /// only one object may exist.
+  static X86InstrFMA3Info *getX86InstrFMA3Info();
+
+  /// Constructor. Just creates an object of the class.
+  X86InstrFMA3Info() {}
+
+  /// Destructor. Deallocates the memory used for FMA3 Groups.
+  ~X86InstrFMA3Info() {
+    std::set<const X86InstrFMA3Group *> DeletedGroups;
+    auto E = OpcodeToGroup.end();
+    for (auto I = OpcodeToGroup.begin(); I != E; I++) {
+      const X86InstrFMA3Group *G = I->second;
+      if (DeletedGroups.find(G) == DeletedGroups.end()) {
+        DeletedGroups.insert(G);
+        delete G;
+      }
+    }
+  }
+
+  /// Returns a reference to a group of FMA3 opcodes to where the given
+  /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
+  /// and not included into any FMA3 group, then nullptr is returned.
+  static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
+    // Ensure that the groups of opcodes are initialized.
+    initGroupsOnce();
+
+    // Find the group including the given opcode.
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.find(Opcode);
+    if (I == FMA3Info->OpcodeToGroup.end())
+      return nullptr;
+
+    return I->second;
+  }
+
+  /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
+  static bool isFMA3(unsigned Opcode) {
+    return getFMA3Group(Opcode) != nullptr;
+  }
+
+  /// Iterator that is used to walk on FMA register opcodes having memory
+  /// form equivalents.
+  class rm_iterator {
+  private:
+    /// Iterator associated with the OpcodeToGroup map. It must always be
+    /// initialized with an entry from OpcodeToGroup for which I->first
+    /// points to a register FMA opcode and I->second points to a group of
+    /// FMA opcodes having memory form equivalent of I->first.
+    DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
+
+  public:
+    /// Constructor. Creates rm_iterator. The parameter \p I must be an
+    /// iterator to OpcodeToGroup map entry having I->first pointing to
+    /// register form FMA opcode and I->second pointing to a group of FMA
+    /// opcodes holding memory form equivalent for I->fist.
+    rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
+        : I(I) {}
+
+    /// Returns the register form FMA opcode.
+    unsigned getRegOpcode() const { return I->first; };
+
+    /// Returns the memory form equivalent opcode for FMA register opcode
+    /// referenced by I->first.
+    unsigned getMemOpcode() const {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *Group = I->second;
+      return Group->getMemOpcode(Opcode);
+    }
+
+    /// Returns a reference to a group of FMA opcodes.
+    const X86InstrFMA3Group *getGroup() const { return I->second; }
+
+    bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
+    bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
+
+    /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
+    /// having I->first pointing to register form FMA and I->second pointing
+    /// to a group of FMA opcodes holding memory form equivalen for I->first.
+    rm_iterator &operator++() {
+      auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
+      for (++I; I != E; ++I) {
+        unsigned RegOpcode = I->first;
+        const X86InstrFMA3Group *Group = I->second;
+        if (Group->getMemOpcode(RegOpcode) != 0)
+          break;
+      }
+      return *this;
+    }
+  };
+
+  /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
+  /// with a register FMA opcode having memory form opcode equivalent.
+  static rm_iterator rm_begin() {
+    initGroupsOnce();
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.begin();
+    auto E = FMA3Info->OpcodeToGroup.end();
+    while (I != E) {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *G = I->second;
+      if (G->getMemOpcode(Opcode) != 0)
+        break;
+      I++;
+    }
+    return rm_iterator(I);
+  }
+
+  /// Returns the last rm_iterator.
+  static rm_iterator rm_end() {
+    initGroupsOnce();
+    return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
+  }
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 000000000000..10f3839ea8ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,729 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+                                           SDTCisVT<1, f80>]>;
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw    : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
+                             [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+                              SDNPMemOperand]>;
+def X86fild         : SDNode<"X86ISD::FILD", SDTX86Fild,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+                             [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+                              SDNPMemOperand]>;
+def X86fp_stsw      : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m",          SDTX86CwdStore,
+                             [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+                              SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in {  // Expanded after instruction selection.
+  def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+  def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+                              [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+                              [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+                              [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here.  The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values.  These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information.  These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The second instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+                [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring,
+                    bit Forward = 1> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpIf32<(outs RFP32:$dst),
+                     (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [!if(Forward,
+                       (set RFP32:$dst,
+                        (OpNode RFP32:$src1, (loadf32 addr:$src2))),
+                       (set RFP32:$dst,
+                        (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>;
+def _Fp64m  : FpIf64<(outs RFP64:$dst),
+                     (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [!if(Forward,
+                       (set RFP64:$dst,
+                        (OpNode RFP64:$src1, (loadf64 addr:$src2))),
+                       (set RFP64:$dst,
+                        (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+                     (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [!if(Forward,
+                       (set RFP64:$dst,
+                        (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))),
+                       (set RFP64:$dst,
+                        (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+                   (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+                  [!if(Forward,
+                       (set RFP80:$dst,
+                        (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))),
+                       (set RFP80:$dst,
+                        (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+                   (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+                  [!if(Forward,
+                       (set RFP80:$dst,
+                        (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))),
+                       (set RFP80:$dst,
+                        (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+                 !strconcat("f", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+                 !strconcat("f", asmstring, "{l}\t$src")>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+                       OneArgFPRW,
+                       [!if(Forward,
+                            (set RFP32:$dst,
+                             (OpNode RFP32:$src1, (X86fild addr:$src2, i16))),
+                            (set RFP32:$dst,
+                             (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+                       OneArgFPRW,
+                       [!if(Forward,
+                            (set RFP32:$dst,
+                             (OpNode RFP32:$src1, (X86fild addr:$src2, i32))),
+                            (set RFP32:$dst,
+                             (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+                       OneArgFPRW,
+                       [!if(Forward,
+                            (set RFP64:$dst,
+                             (OpNode RFP64:$src1, (X86fild addr:$src2, i16))),
+                            (set RFP64:$dst,
+                             (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+                       OneArgFPRW,
+                       [!if(Forward,
+                            (set RFP64:$dst,
+                             (OpNode RFP64:$src1, (X86fild addr:$src2, i32))),
+                            (set RFP64:$dst,
+                             (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+                     OneArgFPRW,
+                     [!if(Forward,
+                          (set RFP80:$dst,
+                           (OpNode RFP80:$src1, (X86fild addr:$src2, i16))),
+                          (set RFP80:$dst,
+                           (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+                     OneArgFPRW,
+                     [!if(Forward,
+                          (set RFP80:$dst,
+                           (OpNode RFP80:$src1, (X86fild addr:$src2, i32))),
+                          (set RFP80:$dst,
+                           (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>;
+let mayLoad = 1 in
+def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+                  !strconcat("fi", asmstring, "{s}\t$src")>;
+let mayLoad = 1 in
+def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+                  !strconcat("fi", asmstring, "{l}\t$src")>;
+}
+
+let Defs = [FPSW] in {
+// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling
+// resources.
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
+let SchedRW = [WriteFAddLd] in {
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
+}
+let SchedRW = [WriteFMulLd] in {
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+}
+let SchedRW = [WriteFDivLd] in {
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
+}
+}
+
+class FPST0rInst<Format fp, string asm>
+  : FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
+class FPrST0Inst<Format fp, string asm>
+  : FPI<0xDC, fp, (outs), (ins RST:$op), asm>;
+class FPrST0PInst<Format fp, string asm>
+  : FPI<0xDE, fp, (outs), (ins RST:$op), asm>;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+let SchedRW = [WriteFAdd] in {
+def ADD_FST0r   : FPST0rInst <MRM0r, "fadd\t$op">;
+def ADD_FrST0   : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">;
+def ADD_FPrST0  : FPrST0PInst<MRM0r, "faddp\t$op">;
+def SUBR_FST0r  : FPST0rInst <MRM5r, "fsubr\t$op">;
+def SUB_FrST0   : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
+def SUB_FPrST0  : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
+def SUB_FST0r   : FPST0rInst <MRM4r, "fsub\t$op">;
+def SUBR_FrST0  : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+} // SchedRW
+let SchedRW = [WriteFMul] in {
+def MUL_FST0r   : FPST0rInst <MRM1r, "fmul\t$op">;
+def MUL_FrST0   : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">;
+def MUL_FPrST0  : FPrST0PInst<MRM1r, "fmulp\t$op">;
+} // SchedRW
+let SchedRW = [WriteFDiv] in {
+def DIVR_FST0r  : FPST0rInst <MRM7r, "fdivr\t$op">;
+def DIV_FrST0   : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
+def DIV_FPrST0  : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">;
+def DIV_FST0r   : FPST0rInst <MRM6r, "fdiv\t$op">;
+def DIVR_FrST0  : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
+} // SchedRW
+
+def COM_FST0r   : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r  : FPST0rInst <MRM3r, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+                 [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F     : FPI<0xD9, fp, (outs), (ins), asmstring>;
+}
+
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
+let SchedRW = [WriteFSqrt] in {
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
+}
+defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
+defm COS : FPUnary<fcos, MRM_FF, "fcos">;
+
+let hasSideEffects = 0 in {
+def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F  : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand.  Added for the
+//   disassembler; remove as they are included with patterns elsewhere.
+def FCOM32m  : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm  : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm  : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m  : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm  : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
+def FSAVEm   : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
+def FNSTSWm  : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm    : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+def FBSTPm   : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+                       CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp64  : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+                       CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                     CondMovFP,
+                     [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+                                        cc, EFLAGS))]>,
+                                        Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
+                  "fcmovb\t{$op, %st(0)|st(0), $op}">;
+def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
+                  "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVE_F  : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
+                  "fcmove\t{$op, %st(0)|st(0), $op}">;
+def CMOVP_F  : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
+                  "fcmovu\t{$op, %st(0)|st(0), $op}">;
+def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
+                  "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
+                  "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
+                  "fcmovne\t{$op, %st(0)|st(0), $op}">;
+def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
+                  "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m   : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+  def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m   : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, hasSideEffects = 0 in {
+def ST_FpP32m    : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32  : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64  : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m    : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+                    [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, hasSideEffects = 0 in {
+def IST_Fp16m32  : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80  : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80  : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80  : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m   : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+                    IIC_FLD>;
+def LD_F64m   : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+                    IIC_FLD>;
+def LD_F80m   : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+                    IIC_FLD80>;
+def ILD_F16m  : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+                    IIC_FILD>;
+def ILD_F32m  : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+                    IIC_FILD>;
+def ILD_F64m  : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+                    IIC_FILD>;
+}
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m   : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+                    IIC_FST>;
+def ST_F64m   : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+                    IIC_FST>;
+def ST_FP32m  : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+                    IIC_FST>;
+def ST_FP64m  : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+                    IIC_FST>;
+def ST_FP80m  : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+                    IIC_FST80>;
+def IST_F16m  : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+                    IIC_FIST>;
+def IST_F32m  : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+                    IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+                    IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+  IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+  IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
+  "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
+def LD_Frr   : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>;
+def ST_Frr   : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>;
+def ST_FPrr  : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>;
+def XCH_F    : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
+}
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>;
+def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>;
+}
+
+// Floating point compares.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_  <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr    : FPI<0xDD, MRM4r,    // FPSW = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>;
+def UCOM_FPr   : FPI<0xDD, MRM5r,    // FPSW = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>;
+def UCOM_FPPr  : FPI<0xDA, MRM_E9,       // cmp ST(0) with ST(1), pop, pop
+                    (outs), (ins), "fucompp", IIC_FUCOM>;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr   : FPI<0xDB, MRM5r,     // CC = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>;
+def UCOM_FIPr  : FPI<0xDF, MRM5r,     // CC = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg),
+                  "fcomi\t$reg", IIC_FCOMI>;
+def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg),
+                   "fcompi\t$reg", IIC_FCOMI>;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xDF, MRM_E0,                  // AX = fp flags
+                  (outs), (ins), "fnstsw\t{%ax|ax}",
+                  [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control world
+                  (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                  [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control world = [mem16]
+                  (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+                Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
+def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
+                "ffree\t$reg", IIC_FFREE>;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
+def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
+def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
+def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>;
+def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>;
+def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+
+let Predicates = [HasFXSR] in {
+  def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+                 "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+  def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+                    "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+                    IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+  def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+  def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                     "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+                     IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+} // Predicates = [FeatureFXSR]
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+                                                          RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                                         RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+           Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack.  We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+           Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 000000000000..610756aa37da
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,957 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<7> val> {
+  bits<7> Value = val;
+}
+
+def Pseudo        : Format<0>;
+def RawFrm        : Format<1>;
+def AddRegFrm     : Format<2>;
+def RawFrmMemOffs : Format<3>;
+def RawFrmSrc     : Format<4>;
+def RawFrmDst     : Format<5>;
+def RawFrmDstSrc  : Format<6>;
+def RawFrmImm8    : Format<7>;
+def RawFrmImm16   : Format<8>;
+def MRMDestMem     : Format<32>;
+def MRMSrcMem      : Format<33>;
+def MRMSrcMem4VOp3 : Format<34>;
+def MRMSrcMemOp4   : Format<35>;
+def MRMXm  : Format<39>;
+def MRM0m  : Format<40>;  def MRM1m  : Format<41>;  def MRM2m  : Format<42>;
+def MRM3m  : Format<43>;  def MRM4m  : Format<44>;  def MRM5m  : Format<45>;
+def MRM6m  : Format<46>;  def MRM7m  : Format<47>;
+def MRMDestReg     : Format<48>;
+def MRMSrcReg      : Format<49>;
+def MRMSrcReg4VOp3 : Format<50>;
+def MRMSrcRegOp4   : Format<51>;
+def MRMXr  : Format<55>;
+def MRM0r  : Format<56>;  def MRM1r  : Format<57>;  def MRM2r  : Format<58>;
+def MRM3r  : Format<59>;  def MRM4r  : Format<60>;  def MRM5r  : Format<61>;
+def MRM6r  : Format<62>;  def MRM7r  : Format<63>;
+def MRM_C0 : Format<64>;  def MRM_C1 : Format<65>;  def MRM_C2 : Format<66>;
+def MRM_C3 : Format<67>;  def MRM_C4 : Format<68>;  def MRM_C5 : Format<69>;
+def MRM_C6 : Format<70>;  def MRM_C7 : Format<71>;  def MRM_C8 : Format<72>;
+def MRM_C9 : Format<73>;  def MRM_CA : Format<74>;  def MRM_CB : Format<75>;
+def MRM_CC : Format<76>;  def MRM_CD : Format<77>;  def MRM_CE : Format<78>;
+def MRM_CF : Format<79>;  def MRM_D0 : Format<80>;  def MRM_D1 : Format<81>;
+def MRM_D2 : Format<82>;  def MRM_D3 : Format<83>;  def MRM_D4 : Format<84>;
+def MRM_D5 : Format<85>;  def MRM_D6 : Format<86>;  def MRM_D7 : Format<87>;
+def MRM_D8 : Format<88>;  def MRM_D9 : Format<89>;  def MRM_DA : Format<90>;
+def MRM_DB : Format<91>;  def MRM_DC : Format<92>;  def MRM_DD : Format<93>;
+def MRM_DE : Format<94>;  def MRM_DF : Format<95>;  def MRM_E0 : Format<96>;
+def MRM_E1 : Format<97>;  def MRM_E2 : Format<98>;  def MRM_E3 : Format<99>;
+def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>;
+def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>;
+def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>;
+def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>;
+def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>;
+def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>;
+def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>;
+def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>;
+def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>;
+def MRM_FF : Format<127>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<4> val> {
+  bits<4> Value = val;
+}
+def NoImm      : ImmType<0>;
+def Imm8       : ImmType<1>;
+def Imm8PCRel  : ImmType<2>;
+def Imm8Reg    : ImmType<3>; // Register encoded in [7:4].
+def Imm16      : ImmType<4>;
+def Imm16PCRel : ImmType<5>;
+def Imm32      : ImmType<6>;
+def Imm32PCRel : ImmType<7>;
+def Imm32S     : ImmType<8>;
+def Imm64      : ImmType<9>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+  bits<2> Value = val;
+}
+def GenericDomain   : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt    : Domain<3>;
+
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+class CD8VForm<bits<3> val> {
+  bits<3> Value = val;
+}
+def CD8VF  : CD8VForm<0>;  // v := VL
+def CD8VH  : CD8VForm<1>;  // v := VL/2
+def CD8VQ  : CD8VForm<2>;  // v := VL/4
+def CD8VO  : CD8VForm<3>;  // v := VL/8
+// The tuple (subvector) forms.
+def CD8VT1 : CD8VForm<4>;  // v := 1
+def CD8VT2 : CD8VForm<5>;  // v := 2
+def CD8VT4 : CD8VForm<6>;  // v := 4
+def CD8VT8 : CD8VForm<7>;  // v := 8
+
+// Class specifying the prefix used an opcode extension.
+class Prefix<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PS     : Prefix<1>;
+def PD     : Prefix<2>;
+def XS     : Prefix<3>;
+def XD     : Prefix<4>;
+
+// Class specifying the opcode map.
+class Map<bits<3> val> {
+  bits<3> Value = val;
+}
+def OB   : Map<0>;
+def TB   : Map<1>;
+def T8   : Map<2>;
+def TA   : Map<3>;
+def XOP8 : Map<4>;
+def XOP9 : Map<5>;
+def XOPA : Map<6>;
+
+// Class specifying the encoding
+class Encoding<bits<2> val> {
+  bits<2> Value = val;
+}
+def EncNormal : Encoding<0>;
+def EncVEX    : Encoding<1>;
+def EncXOP    : Encoding<2>;
+def EncEVEX   : Encoding<3>;
+
+// Operand size for encodings that change based on mode.
+class OperandSize<bits<2> val> {
+  bits<2> Value = val;
+}
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16    : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32    : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+
+// Address size for encodings that change based on mode.
+class AddressSize<bits<2> val> {
+  bits<2> Value = val;
+}
+def AdSizeX  : AddressSize<0>; // Address size determined using addr operand.
+def AdSize16 : AddressSize<1>; // Encodes a 16-bit address.
+def AdSize32 : AddressSize<2>; // Encodes a 32-bit address.
+def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize16 { OperandSize OpSize = OpSize16; }
+class OpSize32 { OperandSize OpSize = OpSize32; }
+class AdSize16 { AddressSize AdSize = AdSize16; }
+class AdSize32 { AddressSize AdSize = AdSize32; }
+class AdSize64 { AddressSize AdSize = AdSize64; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class LOCK   { bit hasLockPrefix = 1; }
+class REP    { bit hasREPPrefix = 1; }
+class TB     { Map OpMap = TB; }
+class T8     { Map OpMap = T8; }
+class TA     { Map OpMap = TA; }
+class XOP8   { Map OpMap = XOP8; Prefix OpPrefix = PS; }
+class XOP9   { Map OpMap = XOP9; Prefix OpPrefix = PS; }
+class XOPA   { Map OpMap = XOPA; Prefix OpPrefix = PS; }
+class OBXS   { Prefix OpPrefix = XS; }
+class PS   : TB { Prefix OpPrefix = PS; }
+class PD   : TB { Prefix OpPrefix = PD; }
+class XD   : TB { Prefix OpPrefix = XD; }
+class XS   : TB { Prefix OpPrefix = XS; }
+class T8PS : T8 { Prefix OpPrefix = PS; }
+class T8PD : T8 { Prefix OpPrefix = PD; }
+class T8XD : T8 { Prefix OpPrefix = XD; }
+class T8XS : T8 { Prefix OpPrefix = XS; }
+class TAPS : TA { Prefix OpPrefix = PS; }
+class TAPD : TA { Prefix OpPrefix = PD; }
+class TAXD : TA { Prefix OpPrefix = XD; }
+class VEX    { Encoding OpEnc = EncVEX; }
+class VEX_W  { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4V = 1; }
+class VEX_L  { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { Encoding OpEnc = EncEVEX; }
+class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_RC { bit hasEVEX_RC = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
+class EVEX_CD8<int esize, CD8VForm form> {
+  int CD8_EltSize = !srl(esize, 3);
+  bits<3> CD8_Form = form.Value;
+}
+
+class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
+class XOP { Encoding OpEnc = EncXOP; }
+class XOP_4V : XOP { bit hasVEX_4V = 1; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+              string AsmStr,
+              InstrItinClass itin,
+              Domain d = GenericDomain>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<7> FormBits = Form.Value;
+  ImmType ImmT = i;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  string AsmString = AsmStr;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  let Itinerary = itin;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit ForceDisassemble = 0; // Force instruction to disassemble even though it's
+                            // isCodeGenonly. Needed to hide an ambiguous
+                            // AsmString from the parser, but still disassemble.
+
+  OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change
+                                    // based on operand size of the mode?
+  bits<2> OpSizeBits = OpSize.Value;
+  AddressSize AdSize = AdSizeX; // Does this instruction's encoding change
+                                // based on address size of the mode?
+  bits<2> AdSizeBits = AdSize.Value;
+
+  Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have?
+  bits<3> OpPrefixBits = OpPrefix.Value;
+  Map OpMap = OB;           // Which opcode map does this inst have?
+  bits<3> OpMapBits = OpMap.Value;
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm = NotFP;  // What flavor of FP instruction is this?
+  bit hasLockPrefix = 0;    // Does this inst have a 0xF0 prefix?
+  Domain ExeDomain = d;
+  bit hasREPPrefix = 0;     // Does this inst have a REP prefix?
+  Encoding OpEnc = EncNormal; // Encoding used by this instruction
+  bits<2> OpEncBits = OpEnc.Value;
+  bit hasVEX_WPrefix = 0;   // Does this inst set the VEX_W field?
+  bit hasVEX_4V = 0;        // Does this inst require the VEX.VVVV field?
+  bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
+  bit hasEVEX_K = 0;        // Does this inst require masking?
+  bit hasEVEX_Z = 0;        // Does this inst set the EVEX_Z field?
+  bit hasEVEX_L2 = 0;       // Does this inst set the EVEX_L2 field?
+  bit hasEVEX_B = 0;        // Does this inst set the EVEX_B field?
+  bits<3> CD8_Form = 0;     // Compressed disp8 form - vector-width.
+  // Declare it int rather than bits<4> so that all bits are defined when
+  // assigning to bits<7>.
+  int CD8_EltSize = 0;      // Compressed disp8 form - element-size in bytes.
+  bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
+  bit hasEVEX_RC = 0;       // Explicitly specified rounding control in FP instruction.
+
+  bits<2> EVEX_LL;
+  let EVEX_LL{0} = hasVEX_L;
+  let EVEX_LL{1} = hasEVEX_L2;
+  // Vector size in bytes.
+  bits<7> VectSize = !shl(16, EVEX_LL);
+
+  // The scaling factor for AVX512's compressed displacement is either
+  //   - the size of a  power-of-two number of elements or
+  //   - the size of a single element for broadcasts or
+  //   - the total vector size divided by a power-of-two number.
+  // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+  bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+                           !if (CD8_Form{2},
+                                !shl(CD8_EltSize, CD8_Form{1-0}),
+                                !if (hasEVEX_B,
+                                     CD8_EltSize,
+                                     !srl(VectSize, CD8_Form{1-0}))), 0);
+
+  // TSFlags layout should be kept in sync with X86BaseInfo.h.
+  let TSFlags{6-0}   = FormBits;
+  let TSFlags{8-7}   = OpSizeBits;
+  let TSFlags{10-9}  = AdSizeBits;
+  let TSFlags{13-11} = OpPrefixBits;
+  let TSFlags{16-14} = OpMapBits;
+  let TSFlags{17}    = hasREX_WPrefix;
+  let TSFlags{21-18} = ImmT.Value;
+  let TSFlags{24-22} = FPForm.Value;
+  let TSFlags{25}    = hasLockPrefix;
+  let TSFlags{26}    = hasREPPrefix;
+  let TSFlags{28-27} = ExeDomain.Value;
+  let TSFlags{30-29} = OpEncBits;
+  let TSFlags{38-31} = Opcode;
+  let TSFlags{39}    = hasVEX_WPrefix;
+  let TSFlags{40}    = hasVEX_4V;
+  let TSFlags{41}    = hasVEX_L;
+  let TSFlags{42}    = hasEVEX_K;
+  let TSFlags{43}    = hasEVEX_Z;
+  let TSFlags{44}    = hasEVEX_L2;
+  let TSFlags{45}    = hasEVEX_B;
+  // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+  let TSFlags{52-46} = CD8_Scale;
+  let TSFlags{53}    = has3DNow0F0FOpcode;
+  let TSFlags{54}    = hasEVEX_RC;
+}
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+  let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+        list<dag> pattern, InstrItinClass itin = NoItinerary,
+        Domain d = GenericDomain>
+  : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary,
+           Domain d = GenericDomain>
+  : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary,
+             Domain d = GenericDomain>
+  : X86Inst<o, f, Imm8Reg, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32S, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+           : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          InstrItinClass itin = NoItinerary>
+  : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+  : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+  let FPForm = fp;
+  let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+//  their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+//   Iseg16 - 16-bit segment selector, 16-bit offset
+//   Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary,
+         Domain d = GenericDomain>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
+                   !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+                   !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])))));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512
+class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary,
+         Domain d = GenericDomain>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+                   !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])))));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
+                   [UseSSE2])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+         InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+            InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2],
+                       [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin, Domain d>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// SSE1 Instruction Templates:
+//
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with PS prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+//   VSSI  - SSE1 instructions with XS prefix in AVX form.
+//   VPSI  - SSE1 instructions with PS prefix in AVX form, packed single.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS,
+        Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+//   SDI    - SSE2 instructions with XD prefix.
+//   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
+//   S2SI   - SSE2 instructions with XS prefix.
+//   SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+//   PDI    - SSE2 instructions with PD prefix, packed double domain.
+//   PDIi8  - SSE2 instructions with ImmT == Imm8 and PD prefix.
+//   VSDI   - SSE2 scalar instructions with XD prefix in AVX form.
+//   VPDI   - SSE2 vector instructions with PD prefix in AVX form,
+//                 packed double domain.
+//   VS2I   - SSE2 scalar instructions with PD prefix in AVX form.
+//   S2I    - SSE2 scalar instructions with PD prefix.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+        Requires<[UseAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>,
+        PD, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD,
+        Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+//   S3I   - SSE3 instructions with PD prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+        Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+        Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+//   SS38I - SSSE3 instructions with T8 prefix.
+//   SS3AI - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS,
+        Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+//   SS48I - SSE 4.1 instructions with T8 prefix.
+//   SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+//   SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[UseSSE42]>;
+
+//   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+//   SS42AI = SSE 4.2 instructions with TA prefix
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+//   Instructions introduced in AVX (no SSE equivalent forms)
+//
+//   AVX8I - AVX instructions with T8PD prefix.
+//   AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+//   Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+//   AVX28I - AVX2 instructions with T8PD prefix.
+//   AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX2]>;
+
+
+// AVX-512 Instruction Templates:
+//   Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+//   AVX5128I - AVX-512 instructions with T8PD prefix.
+//   AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8.
+//   AVX512PDI  - AVX-512 instructions with PD, double packed.
+//   AVX512PSI  - AVX-512 instructions with PS, single packed.
+//   AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+//   AVX512XSI  - AVX-512 instructions with XS prefix, generic domain.
+//   AVX512BI   - AVX-512 instructions with PD, int packed domain.
+//   AVX512SI   - AVX-512 scalar instructions with PD prefix.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAVX512]>;
+class AVX5128IBase : T8PD {
+  Domain ExeDomain = SSEPackedInt;
+}
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+        Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS,
+        Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+        Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+        Requires<[HasAVX512]>;
+class AVX512BIBase : PD {
+  Domain ExeDomain = SSEPackedInt;
+}
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
+        Requires<[HasAVX512]>;
+class AVX512BIi8Base : PD {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
+class AVX512XSIi8Base : XS {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
+class AVX512XDIi8Base : XD {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
+class AVX512PSIi8Base : PS {
+  Domain ExeDomain = SSEPackedSingle;
+  ImmType ImmT = Imm8;
+}
+class AVX512PDIi8Base : PD {
+  Domain ExeDomain = SSEPackedDouble;
+  ImmType ImmT = Imm8;
+}
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAVX512]>;
+class AVX512AIi8Base : TAPD {
+  ImmType ImmT = Imm8;
+}
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+        Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD,
+        Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS,
+        Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+        EVEX_4V, Requires<[HasAVX512]>;
+class AVX512FMA3Base : T8PD, EVEX_4V;
+
+class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag>pattern, InstrItinClass itin = IIC_AES>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
+        Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                  list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+        VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+        VEX_4V, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP9, Requires<[HasXOP]>;
+
+// XOP 2 and 3 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP8, Requires<[HasXOP]>;
+// XOP 4 Operand Instruction Templates with imm byte
+class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP8, Requires<[HasXOP]>;
+
+//  XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8Reg<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
+        VEX_4V, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I  - MMX / SSE2 instructions with PD prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 000000000000..c5689d7c698c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,1099 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+// GPR to low word of MMX.
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def load_mvmmx : PatFrag<(ops node:$ptr),
+                         (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<1>, SDTCisVT<3, i8>,
+                                       SDTCisVec<1>]>;
+def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, 
+                                     SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc    : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS",  SDTFPBinOp>;
+def X86frcp14s : SDNode<"X86ISD::FRCPS",    SDTFPBinOp>;
+def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
+def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
+def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
+def X86hsub    : SDNode<"X86ISD::HSUB",      SDTIntBinOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86cmps    : SDNode<"X86ISD::FSETCC",     SDTX86Cmps>;
+def X86pshufb  : SDNode<"X86ISD::PSHUFB",
+                 SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psadbw  : SDNode<"X86ISD::PSADBW",
+                 SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+                                      SDTCVecEltisVT<1, i8>,
+                                      SDTCisSameSizeAs<0,1>,
+                                      SDTCisSameAs<1,2>]>, [SDNPCommutative]>;
+def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW",
+                  SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
+                                       SDTCVecEltisVT<1, i8>,
+                                       SDTCisSameSizeAs<0,1>,
+                                       SDTCisSameAs<1,2>, SDTCisInt<3>]>>;
+def X86andnp   : SDNode<"X86ISD::ANDNP",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86multishift   : SDNode<"X86ISD::MULTISHIFT",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisSameAs<1,2>]>>;
+def X86pextrb  : SDNode<"X86ISD::PEXTRB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
+                                      SDTCisPtrTy<2>]>>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
+                                      SDTCisPtrTy<2>]>>;
+def X86pinsrb  : SDNode<"X86ISD::PINSRB",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insertps : SDNode<"X86ISD::INSERTPS",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
+def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext   : SDNode<"X86ISD::VZEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def X86vsext   : SDNode<"X86ISD::VSEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisOpSmallerThanOp<1, 0>]>>;
+
+def SDTVtrunc    : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                       SDTCisInt<0>, SDTCisInt<1>,
+                                       SDTCisOpSmallerThanOp<0, 1>]>;
+
+def X86vtrunc    : SDNode<"X86ISD::VTRUNC",   SDTVtrunc>;
+def X86vtruncs   : SDNode<"X86ISD::VTRUNCS",  SDTVtrunc>;
+def X86vtruncus  : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>;
+
+def X86vfpext  : SDNode<"X86ISD::VFPEXT",
+                        SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+                                             SDTCVecEltisVT<1, f32>,
+                                             SDTCisSameSizeAs<0, 1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+                        SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+                                             SDTCVecEltisVT<1, f64>,
+                                             SDTCisSameSizeAs<0, 1>]>>;
+
+def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND",
+                        SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCVecEltisVT<2, f64>,
+                                             SDTCisSameSizeAs<0, 2>,
+                                             SDTCisVT<3, i32>]>>;
+
+def X86fpextRnd  : SDNode<"X86ISD::VFPEXTS_RND",
+                        SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>,
+                                             SDTCisSameAs<0, 1>,
+                                             SDTCVecEltisVT<2, f32>,
+                                             SDTCisSameSizeAs<0, 2>,
+                                             SDTCisVT<3, i32>]>>;
+
+def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
+def X86vshrdq  : SDNode<"X86ISD::VSRLDQ",    SDTIntShiftOp>;
+def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
+def X86pcmpeq  : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt  : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86IntCmpMask : SDTypeProfile<1, 2,
+    [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisSameAs<1, 2>, SDTCisInt<1>,
+     SDTCisSameNumEltsAs<0, 1>]>;
+def X86pcmpeqm  : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm  : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC :
+      SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+                       SDTCisVec<1>, SDTCisSameAs<2, 1>,
+                       SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
+def X86CmpMaskCCRound :
+      SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
+                       SDTCisVec<1>, SDTCisSameAs<2, 1>,
+                       SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
+                       SDTCisVT<4, i32>]>;
+def X86CmpMaskCCScalar :
+      SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def X86CmpMaskCCScalarRound :
+      SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
+                           SDTCisVT<4, i32>]>;
+
+def X86cmpm     : SDNode<"X86ISD::CMPM",     X86CmpMaskCC>;
+def X86cmpmRnd  : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
+def X86cmpmu    : SDNode<"X86ISD::CMPMU",    X86CmpMaskCC>;
+def X86cmpms    : SDNode<"X86ISD::FSETCCM",   X86CmpMaskCCScalar>;
+def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND",   X86CmpMaskCCScalarRound>;
+
+def X86vshl    : SDNode<"X86ISD::VSHL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsrl    : SDNode<"X86ISD::VSRL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsra    : SDNode<"X86ISD::VSRA",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+
+def X86vsrav   : SDNode<"X86ISD::VSRAV" ,
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>]>>;
+
+def X86vshli   : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli   : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai   : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def X86vrotli  : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
+def X86vrotri  : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
+
+def X86vprot   : SDNode<"X86ISD::VPROT",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>]>>;
+def X86vproti  : SDNode<"X86ISD::VPROTI",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisVT<2, i8>]>>;
+
+def X86vpshl   : SDNode<"X86ISD::VPSHL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>]>>;
+def X86vpsha   : SDNode<"X86ISD::VPSHA",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>]>>;
+
+def X86vpcom   : SDNode<"X86ISD::VPCOM",
+                        SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>,
+                                             SDTCisVT<3, i8>]>>;
+def X86vpcomu  : SDNode<"X86ISD::VPCOMU",
+                        SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>,
+                                             SDTCisVT<3, i8>]>>;
+def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
+                        SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>,
+                                             SDTCisSameSizeAs<0,3>,
+                                             SDTCisSameNumEltsAs<0, 3>,
+                                             SDTCisVT<4, i8>]>>;
+def X86vpperm : SDNode<"X86ISD::VPPERM",
+                        SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>]>>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>;
+
+def SDTX86Testm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                       SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>,
+                                       SDTCisSameNumEltsAs<0, 1>]>;
+
+def X86addus   : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>;
+def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86adds    : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>;
+def X86subs    : SDNode<"X86ISD::SUBS", SDTIntBinOp>;
+def X86mulhrs  : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>;
+def X86avg     : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>;
+def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest   : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm   : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>;
+def X86testnm  : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
+
+def X86movmsk : SDNode<"X86ISD::MOVMSK",
+                        SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
+
+def X86select  : SDNode<"X86ISD::SELECT",
+                        SDTypeProfile<1, 3, [SDTCVecEltisVT<1, i1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<2, 3>,
+                                             SDTCisSameNumEltsAs<0, 1>]>>;
+
+def X86selects : SDNode<"X86ISD::SELECTS",
+                        SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<2, 3>]>>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+                        SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+                                             SDTCVecEltisVT<1, i32>,
+                                             SDTCisSameSizeAs<0,1>,
+                                             SDTCisSameAs<1,2>]>,
+                                             [SDNPCommutative]>;
+def X86pmuldq  : SDNode<"X86ISD::PMULDQ",
+                        SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>,
+                                             SDTCVecEltisVT<1, i32>,
+                                             SDTCisSameSizeAs<0,1>,
+                                             SDTCisSameAs<1,2>]>,
+                                             [SDNPCommutative]>;
+
+def X86extrqi : SDNode<"X86ISD::EXTRQI",
+                  SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+                                       SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>;
+def X86insertqi : SDNode<"X86ISD::INSERTQI",
+                    SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>,
+                                         SDTCisSameAs<1,2>, SDTCisVT<3, i8>,
+                                         SDTCisVT<4, i8>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                        SDTCisSameSizeAs<0,2>,
+                                        SDTCisSameNumEltsAs<0,2>]>;
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                 SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                 SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                             SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
+                                                 SDTCisSameAs<0,2>,
+                                                 SDTCisInt<3>,
+                                                 SDTCisSameSizeAs<0, 3>,
+                                                 SDTCisSameNumEltsAs<0, 3>,
+                                                 SDTCisVT<4, i32>,
+                                                 SDTCisVT<5, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                              SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
+                                          SDTCisInt<0>, SDTCisInt<1>]>;
+
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                             SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
+
+def SDTTernlog  : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
+                                SDTCisVT<4, i8>]>;
+
+def SDTFPBinOpRound : SDTypeProfile<1, 3, [      // fadd_round, fmul_round, etc.
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
+
+def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [      // fsqrt_round, fgetexp_round, etc.
+  SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
+                           SDTCisVT<4, i32>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+def X86VAlign  : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
+
+def X86Abs      : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
+def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
+
+def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp   : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>;
+
+def X86Movddup  : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                   SDTCisSameSizeAs<0,1>,
+                                   SDTCisSameAs<1,2>]>;
+def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
+def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86vpmaddubsw  : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
+def X86vpmaddwd    : SDNode<"X86ISD::VPMADDWD"   , SDTPack, [SDNPCommutative]>;
+
+def X86VPermilpv  : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
+def X86VPermilpi  : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
+def X86VPermv     : SDNode<"X86ISD::VPERMV",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>,
+                                                SDTCisSameNumEltsAs<0,1>,
+                                                SDTCisSameSizeAs<0,1>,
+                                                SDTCisSameAs<0,2>]>>;
+def X86VPermi     : SDNode<"X86ISD::VPERMI",    SDTShuff2OpI>;
+def X86VPermt2     : SDNode<"X86ISD::VPERMV3",
+                    SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                         SDTCisSameAs<0,1>, SDTCisInt<2>,
+                                         SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>,
+                                         SDTCisSameSizeAs<0,2>,
+                                         SDTCisSameAs<0,3>]>, []>;
+
+// Even though the index operand should be integer, we need to make it match the
+// destination type so that we can pattern match the masked version where the
+// index is also the passthru operand.
+def X86VPermi2X   : SDNode<"X86ISD::VPERMIV3",
+                    SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                         SDTCisSameAs<0,1>,
+                                         SDTCisSameAs<0,2>,
+                                         SDTCisSameAs<0,3>]>, []>;
+
+def X86vpternlog  : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VFixupimm   : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
+def X86VFixupimmScalar   : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
+def X86VRange      : SDNode<"X86ISD::VRANGE",    SDTFPBinOpImmRound>;
+def X86VReduce     : SDNode<"X86ISD::VREDUCE",   SDTFPUnaryOpImmRound>;
+def X86VRndScale   : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
+def X86VGetMant    : SDNode<"X86ISD::VGETMANT",  SDTFPUnaryOpImmRound>;
+def X86Vfpclass    : SDNode<"X86ISD::VFPCLASS",
+                       SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+                                            SDTCisVec<1>, SDTCisFP<1>,
+                                            SDTCisSameNumEltsAs<0,1>,
+                                            SDTCisVT<2, i32>]>, []>;
+def X86Vfpclasss   : SDNode<"X86ISD::VFPCLASSS",
+                       SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+                                            SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
+
+def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
+                    SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                         SDTCisSubVecOfVec<1, 0>]>, []>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
+                               SDTCisPtrTy<3>]>, []>;
+def X86Vextract   : SDNode<"X86ISD::VEXTRACT",  SDTypeProfile<1, 2,
+                              [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+                               SDTCisPtrTy<2>]>, []>;
+
+def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
+
+def X86Addsub    : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
+
+def X86faddRnd   : SDNode<"X86ISD::FADD_RND",  SDTFPBinOpRound>;
+def X86fsubRnd   : SDNode<"X86ISD::FSUB_RND",  SDTFPBinOpRound>;
+def X86fmulRnd   : SDNode<"X86ISD::FMUL_RND",  SDTFPBinOpRound>;
+def X86fdivRnd   : SDNode<"X86ISD::FDIV_RND",  SDTFPBinOpRound>;
+def X86fmaxRnd   : SDNode<"X86ISD::FMAX_RND",       SDTFPBinOpRound>;
+def X86scalef    : SDNode<"X86ISD::SCALEF",         SDTFPBinOpRound>;
+def X86scalefs   : SDNode<"X86ISD::SCALEFS",        SDTFPBinOpRound>;
+def X86fminRnd   : SDNode<"X86ISD::FMIN_RND",       SDTFPBinOpRound>;
+def X86fsqrtRnd     : SDNode<"X86ISD::FSQRT_RND",   SDTFPUnaryOpRound>;
+def X86fsqrtRnds    : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
+def X86fgetexpRnd   : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
+def X86fgetexpRnds  : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
+
+def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
+def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
+def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
+def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
+
+def X86FmaddRnd     : SDNode<"X86ISD::FMADD_RND",     SDTFmaRound>;
+def X86FnmaddRnd    : SDNode<"X86ISD::FNMADD_RND",    SDTFmaRound>;
+def X86FmsubRnd     : SDNode<"X86ISD::FMSUB_RND",     SDTFmaRound>;
+def X86FnmsubRnd    : SDNode<"X86ISD::FNMSUB_RND",    SDTFmaRound>;
+def X86FmaddsubRnd  : SDNode<"X86ISD::FMADDSUB_RND",  SDTFmaRound>;
+def X86FmsubaddRnd  : SDNode<"X86ISD::FMSUBADD_RND",  SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86FmaddRnds1   : SDNode<"X86ISD::FMADDS1_RND",     SDTFmaRound>;
+def X86FnmaddRnds1  : SDNode<"X86ISD::FNMADDS1_RND",    SDTFmaRound>;
+def X86FmsubRnds1   : SDNode<"X86ISD::FMSUBS1_RND",     SDTFmaRound>;
+def X86FnmsubRnds1  : SDNode<"X86ISD::FNMSUBS1_RND",    SDTFmaRound>;
+
+// Scalar FMA intrinsics with passthru bits in operand 3.
+def X86FmaddRnds3   : SDNode<"X86ISD::FMADDS3_RND",     SDTFmaRound>;
+def X86FnmaddRnds3  : SDNode<"X86ISD::FNMADDS3_RND",    SDTFmaRound>;
+def X86FmsubRnds3   : SDNode<"X86ISD::FMSUBS3_RND",     SDTFmaRound>;
+def X86FnmsubRnds3  : SDNode<"X86ISD::FNMSUBS3_RND",    SDTFmaRound>;
+
+def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTFma>;
+def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTFma>;
+
+def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  SDTFPUnaryOpRound>;
+def X86rcp28     : SDNode<"X86ISD::RCP28",    SDTFPUnaryOpRound>;
+def X86exp2      : SDNode<"X86ISD::EXP2",     SDTFPUnaryOpRound>;
+
+def X86rsqrt28s  : SDNode<"X86ISD::RSQRT28S",   SDTFPBinOpRound>;
+def X86rcp28s    : SDNode<"X86ISD::RCP28S",     SDTFPBinOpRound>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
+def X86Reduces   : SDNode<"X86ISD::VREDUCES",   SDTFPBinOpImmRound>;
+def X86GetMants  : SDNode<"X86ISD::VGETMANTS",  SDTFPBinOpImmRound>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+                                         SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+                                         SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+                                         SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
+                              [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+def X86expand  : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
+                              [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+
+def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
+                                          SDTCisSameAs<0,1>, SDTCisInt<2>,
+                                          SDTCisVT<3, i32>]>;
+
+def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                        SDTCisInt<0>, SDTCisFP<1>]>;
+def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                           SDTCisInt<0>, SDTCisFP<1>,
+                                           SDTCisVT<2, i32>]>;
+def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>,
+                                            SDTCisVec<1>, SDTCisVT<2, i32>]>;
+
+def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisFP<0>, SDTCisInt<1>]>;
+def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                           SDTCisFP<0>, SDTCisInt<1>,
+                                           SDTCisVT<2, i32>]>;
+
+// Scalar
+def X86SintToFpRnd  : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND",  SDTintToFPRound>;
+def X86UintToFpRnd  : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND",  SDTintToFPRound>;
+
+def X86cvtts2IntRnd  : SDNode<"X86ISD::CVTTS2SI_RND",  SDTSFloatToIntRnd>;
+def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND",  SDTSFloatToIntRnd>;
+
+def  X86cvts2si  : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>;
+def  X86cvts2usi : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>;
+
+// Vector with rounding mode
+
+// cvtt fp-to-int staff
+def X86cvttp2siRnd    : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>;
+def X86cvttp2uiRnd    : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>;
+
+def X86VSintToFpRnd   : SDNode<"X86ISD::SINT_TO_FP_RND",  SDTVintToFPRound>;
+def X86VUintToFpRnd   : SDNode<"X86ISD::UINT_TO_FP_RND",  SDTVintToFPRound>;
+
+// cvt fp-to-int staff
+def X86cvtp2IntRnd      : SDNode<"X86ISD::CVTP2SI_RND",  SDTFloatToIntRnd>;
+def X86cvtp2UIntRnd     : SDNode<"X86ISD::CVTP2UI_RND",  SDTFloatToIntRnd>;
+
+// Vector without rounding mode
+
+// cvtt fp-to-int staff
+def X86cvttp2si      : SDNode<"X86ISD::CVTTP2SI",  SDTFloatToInt>;
+def X86cvttp2ui      : SDNode<"X86ISD::CVTTP2UI",  SDTFloatToInt>;
+
+def X86VSintToFP      : SDNode<"X86ISD::CVTSI2P",  SDTVintToFP>;
+def X86VUintToFP      : SDNode<"X86ISD::CVTUI2P",  SDTVintToFP>;
+
+// cvt int-to-fp staff
+def X86cvtp2Int      : SDNode<"X86ISD::CVTP2SI",  SDTFloatToInt>;
+def X86cvtp2UInt     : SDNode<"X86ISD::CVTP2UI",  SDTFloatToInt>;
+
+def X86cvtph2ps     : SDNode<"X86ISD::CVTPH2PS",
+                              SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+                                                   SDTCVecEltisVT<1, i16>,
+                                                   SDTCisVT<2, i32>]> >;
+
+def X86cvtps2ph   : SDNode<"X86ISD::CVTPS2PH",
+                        SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+                                             SDTCVecEltisVT<1, f32>,
+                                             SDTCisVT<2, i32>]> >;
+def X86vfpextRnd  : SDNode<"X86ISD::VFPEXT_RND",
+                        SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
+                                             SDTCVecEltisVT<1, f32>,
+                                             SDTCisOpSmallerThanOp<1, 0>,
+                                             SDTCisVT<2, i32>]>>;
+def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
+                        SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
+                                             SDTCVecEltisVT<1, f64>,
+                                             SDTCisOpSmallerThanOp<0, 1>,
+                                             SDTCisVT<2, i32>]>>;
+
+def X86cvt2mask   : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86Mem32AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86Mem64AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 512-bit load pattern fragments
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                           (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+                               (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+                               (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+                               (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+                               (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+                               (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+                               (v4i64 (alignedload256 node:$ptr))>;
+
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+                                (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64  : PatFrag<(ops node:$ptr),
+                                (v8f64  (alignedload512 node:$ptr))>;
+def alignedloadv8i64  : PatFrag<(ops node:$ptr),
+                                (v8i64  (alignedload512 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.  If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasSSEUnalignedMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+
+// These are needed to match a scalar memop that is used in a vector-only
+// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
+// The memory operand is required to be a 128-bit load, so it must be converted
+// from a vector to a scalar.
+def memopfsf32_128 : PatFrag<(ops node:$ptr),
+  (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
+def memopfsf64_128 : PatFrag<(ops node:$ptr),
+  (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
+
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx  : PatFrag<(ops node:$ptr), (x86mmx  (memop64 node:$ptr))>;
+
+def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v8i32);
+  return false;
+}]>;
+
+def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v2i64);
+  return false;
+}]>;
+def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
+def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v8i64 ||
+            Mgt->getBasePtr().getValueType() == MVT::v8i64);
+  return false;
+}]>;
+def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_gather node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
+    return (Mgt->getIndex().getValueType() == MVT::v16i32 ||
+            Mgt->getBasePtr().getValueType() == MVT::v16i32);
+  return false;
+}]>;
+
+def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v2i64 ||
+            Sc->getBasePtr().getValueType() == MVT::v2i64);
+  return false;
+}]>;
+
+def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v4i32 ||
+            Sc->getBasePtr().getValueType() == MVT::v4i32);
+  return false;
+}]>;
+
+def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v4i64 ||
+            Sc->getBasePtr().getValueType() == MVT::v4i64);
+  return false;
+}]>;
+
+def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v8i32 ||
+            Sc->getBasePtr().getValueType() == MVT::v8i32);
+  return false;
+}]>;
+
+def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v8i64 ||
+            Sc->getBasePtr().getValueType() == MVT::v8i64);
+  return false;
+}]>;
+def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+  (masked_scatter node:$src1, node:$src2, node:$src3) , [{
+  if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
+    return (Sc->getIndex().getValueType() == MVT::v16i32 ||
+            Sc->getBasePtr().getValueType() == MVT::v16i32);
+  return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>;
+
+// 512-bit bitconvert pattern fragments
+def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
+def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzmovl
+                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+                           (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def I8Imm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N));
+}]>;
+
+def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
+def FROUND_CURRENT : ImmLeaf<i32, [{
+  return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getZExtValue() >> 3, SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N));
+}]>;
+
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N));
+}]>;
+
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
+
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
+
+def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+    cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mload node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mload node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mload node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+    cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_load node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
+}]>;
+
+// Masked store fragments.
+// X86mstore can't be implemented in core DAG files because some targets
+// do not support vector types (llvm-tblgen will fail).
+def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                        (masked_store node:$src1, node:$src2, node:$src3), [{
+  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+}]>;
+
+def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (X86mstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
+def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                         (masked_store node:$src1, node:$src2, node:$src3), [{
+  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
+         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+}]>;
+
+def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                             (masked_store node:$src1, node:$src2, node:$src3), [{
+    return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+}]>;
+
+// masked truncstore fragments
+// X86mtruncstore can't be implemented in core DAG files because some targets
+// doesn't support vector type ( llvm-tblgen will fail)
+def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                             (masked_store node:$src1, node:$src2, node:$src3), [{
+    return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+}]>;
+def masked_truncstorevi8 :
+  PatFrag<(ops node:$src1, node:$src2, node:$src3),
+          (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def masked_truncstorevi16 :
+  PatFrag<(ops node:$src1, node:$src2, node:$src3),
+          (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def masked_truncstorevi32 :
+  PatFrag<(ops node:$src1, node:$src2, node:$src3),
+          (X86mtruncstore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES",  SDTStore,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS",  SDTStore,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES",  SDTMaskedStore,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS",  SDTMaskedStore,
+                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncSStore node:$val, node:$ptr), [{
+  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncUSStore node:$val, node:$ptr), [{
+  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncSStore node:$val, node:$ptr), [{
+  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncUSStore node:$val, node:$ptr), [{
+  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncSStore node:$val, node:$ptr), [{
+  return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
+                               (X86TruncUSStore node:$val, node:$ptr), [{
+  return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                     (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                               (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+
+def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                               (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                               (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+
+def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                               (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                               (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
+  return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def assertzext_i1 :
+  PatFrag<(ops node:$src), (assertzext node:$src), [{
+    return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
+}]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 000000000000..579359794fbd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,9731 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "X86GenInstrInfo.inc"
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+         cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+                  cl::desc("Print instructions that the allocator wants to"
+                           " fuse, but the X86 backend currently can't"),
+                  cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+                 cl::desc("Re-materialize load from stub in PIC mode"),
+                 cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+PartialRegUpdateClearance("partial-reg-update-clearance",
+                          cl::desc("Clearance between two register writes "
+                                   "for inserting XOR to avoid partial "
+                                   "register update"),
+                          cl::init(64), cl::Hidden);
+static cl::opt<unsigned>
+UndefRegClearance("undef-reg-clearance",
+                  cl::desc("How many idle instructions we would like before "
+                           "certain undef register reads"),
+                  cl::init(128), cl::Hidden);
+
+enum {
+  // Select which memory operand is being unfolded.
+  // (stored in bits 0 - 3)
+  TB_INDEX_0    = 0,
+  TB_INDEX_1    = 1,
+  TB_INDEX_2    = 2,
+  TB_INDEX_3    = 3,
+  TB_INDEX_4    = 4,
+  TB_INDEX_MASK = 0xf,
+
+  // Do not insert the reverse map (MemOp -> RegOp) into the table.
+  // This may be needed because there is a many -> one mapping.
+  TB_NO_REVERSE   = 1 << 4,
+
+  // Do not insert the forward map (RegOp -> MemOp) into the table.
+  // This is needed for Native Client, which prohibits branch
+  // instructions from using a memory operand.
+  TB_NO_FORWARD   = 1 << 5,
+
+  TB_FOLDED_LOAD  = 1 << 6,
+  TB_FOLDED_STORE = 1 << 7,
+
+  // Minimum alignment required for load/store.
+  // Used for RegOp->MemOp conversion.
+  // (stored in bits 8 - 15)
+  TB_ALIGN_SHIFT = 8,
+  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
+  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
+  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
+  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
+};
+
+struct X86MemoryFoldTableEntry {
+  uint16_t RegOp;
+  uint16_t MemOp;
+  uint16_t Flags;
+};
+
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
+X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
+    : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+                                               : X86::ADJCALLSTACKDOWN32),
+                      (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+                                               : X86::ADJCALLSTACKUP32),
+                      X86::CATCHRET,
+                      (STI.is64Bit() ? X86::RETQ : X86::RETL)),
+      Subtarget(STI), RI(STI.getTargetTriple()) {
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+    { X86::ADC32ri,     X86::ADC32mi,    0 },
+    { X86::ADC32ri8,    X86::ADC32mi8,   0 },
+    { X86::ADC32rr,     X86::ADC32mr,    0 },
+    { X86::ADC64ri32,   X86::ADC64mi32,  0 },
+    { X86::ADC64ri8,    X86::ADC64mi8,   0 },
+    { X86::ADC64rr,     X86::ADC64mr,    0 },
+    { X86::ADD16ri,     X86::ADD16mi,    0 },
+    { X86::ADD16ri8,    X86::ADD16mi8,   0 },
+    { X86::ADD16ri_DB,  X86::ADD16mi,    TB_NO_REVERSE },
+    { X86::ADD16ri8_DB, X86::ADD16mi8,   TB_NO_REVERSE },
+    { X86::ADD16rr,     X86::ADD16mr,    0 },
+    { X86::ADD16rr_DB,  X86::ADD16mr,    TB_NO_REVERSE },
+    { X86::ADD32ri,     X86::ADD32mi,    0 },
+    { X86::ADD32ri8,    X86::ADD32mi8,   0 },
+    { X86::ADD32ri_DB,  X86::ADD32mi,    TB_NO_REVERSE },
+    { X86::ADD32ri8_DB, X86::ADD32mi8,   TB_NO_REVERSE },
+    { X86::ADD32rr,     X86::ADD32mr,    0 },
+    { X86::ADD32rr_DB,  X86::ADD32mr,    TB_NO_REVERSE },
+    { X86::ADD64ri32,   X86::ADD64mi32,  0 },
+    { X86::ADD64ri8,    X86::ADD64mi8,   0 },
+    { X86::ADD64ri32_DB,X86::ADD64mi32,  TB_NO_REVERSE },
+    { X86::ADD64ri8_DB, X86::ADD64mi8,   TB_NO_REVERSE },
+    { X86::ADD64rr,     X86::ADD64mr,    0 },
+    { X86::ADD64rr_DB,  X86::ADD64mr,    TB_NO_REVERSE },
+    { X86::ADD8ri,      X86::ADD8mi,     0 },
+    { X86::ADD8rr,      X86::ADD8mr,     0 },
+    { X86::AND16ri,     X86::AND16mi,    0 },
+    { X86::AND16ri8,    X86::AND16mi8,   0 },
+    { X86::AND16rr,     X86::AND16mr,    0 },
+    { X86::AND32ri,     X86::AND32mi,    0 },
+    { X86::AND32ri8,    X86::AND32mi8,   0 },
+    { X86::AND32rr,     X86::AND32mr,    0 },
+    { X86::AND64ri32,   X86::AND64mi32,  0 },
+    { X86::AND64ri8,    X86::AND64mi8,   0 },
+    { X86::AND64rr,     X86::AND64mr,    0 },
+    { X86::AND8ri,      X86::AND8mi,     0 },
+    { X86::AND8rr,      X86::AND8mr,     0 },
+    { X86::DEC16r,      X86::DEC16m,     0 },
+    { X86::DEC32r,      X86::DEC32m,     0 },
+    { X86::DEC64r,      X86::DEC64m,     0 },
+    { X86::DEC8r,       X86::DEC8m,      0 },
+    { X86::INC16r,      X86::INC16m,     0 },
+    { X86::INC32r,      X86::INC32m,     0 },
+    { X86::INC64r,      X86::INC64m,     0 },
+    { X86::INC8r,       X86::INC8m,      0 },
+    { X86::NEG16r,      X86::NEG16m,     0 },
+    { X86::NEG32r,      X86::NEG32m,     0 },
+    { X86::NEG64r,      X86::NEG64m,     0 },
+    { X86::NEG8r,       X86::NEG8m,      0 },
+    { X86::NOT16r,      X86::NOT16m,     0 },
+    { X86::NOT32r,      X86::NOT32m,     0 },
+    { X86::NOT64r,      X86::NOT64m,     0 },
+    { X86::NOT8r,       X86::NOT8m,      0 },
+    { X86::OR16ri,      X86::OR16mi,     0 },
+    { X86::OR16ri8,     X86::OR16mi8,    0 },
+    { X86::OR16rr,      X86::OR16mr,     0 },
+    { X86::OR32ri,      X86::OR32mi,     0 },
+    { X86::OR32ri8,     X86::OR32mi8,    0 },
+    { X86::OR32rr,      X86::OR32mr,     0 },
+    { X86::OR64ri32,    X86::OR64mi32,   0 },
+    { X86::OR64ri8,     X86::OR64mi8,    0 },
+    { X86::OR64rr,      X86::OR64mr,     0 },
+    { X86::OR8ri,       X86::OR8mi,      0 },
+    { X86::OR8rr,       X86::OR8mr,      0 },
+    { X86::ROL16r1,     X86::ROL16m1,    0 },
+    { X86::ROL16rCL,    X86::ROL16mCL,   0 },
+    { X86::ROL16ri,     X86::ROL16mi,    0 },
+    { X86::ROL32r1,     X86::ROL32m1,    0 },
+    { X86::ROL32rCL,    X86::ROL32mCL,   0 },
+    { X86::ROL32ri,     X86::ROL32mi,    0 },
+    { X86::ROL64r1,     X86::ROL64m1,    0 },
+    { X86::ROL64rCL,    X86::ROL64mCL,   0 },
+    { X86::ROL64ri,     X86::ROL64mi,    0 },
+    { X86::ROL8r1,      X86::ROL8m1,     0 },
+    { X86::ROL8rCL,     X86::ROL8mCL,    0 },
+    { X86::ROL8ri,      X86::ROL8mi,     0 },
+    { X86::ROR16r1,     X86::ROR16m1,    0 },
+    { X86::ROR16rCL,    X86::ROR16mCL,   0 },
+    { X86::ROR16ri,     X86::ROR16mi,    0 },
+    { X86::ROR32r1,     X86::ROR32m1,    0 },
+    { X86::ROR32rCL,    X86::ROR32mCL,   0 },
+    { X86::ROR32ri,     X86::ROR32mi,    0 },
+    { X86::ROR64r1,     X86::ROR64m1,    0 },
+    { X86::ROR64rCL,    X86::ROR64mCL,   0 },
+    { X86::ROR64ri,     X86::ROR64mi,    0 },
+    { X86::ROR8r1,      X86::ROR8m1,     0 },
+    { X86::ROR8rCL,     X86::ROR8mCL,    0 },
+    { X86::ROR8ri,      X86::ROR8mi,     0 },
+    { X86::SAR16r1,     X86::SAR16m1,    0 },
+    { X86::SAR16rCL,    X86::SAR16mCL,   0 },
+    { X86::SAR16ri,     X86::SAR16mi,    0 },
+    { X86::SAR32r1,     X86::SAR32m1,    0 },
+    { X86::SAR32rCL,    X86::SAR32mCL,   0 },
+    { X86::SAR32ri,     X86::SAR32mi,    0 },
+    { X86::SAR64r1,     X86::SAR64m1,    0 },
+    { X86::SAR64rCL,    X86::SAR64mCL,   0 },
+    { X86::SAR64ri,     X86::SAR64mi,    0 },
+    { X86::SAR8r1,      X86::SAR8m1,     0 },
+    { X86::SAR8rCL,     X86::SAR8mCL,    0 },
+    { X86::SAR8ri,      X86::SAR8mi,     0 },
+    { X86::SBB32ri,     X86::SBB32mi,    0 },
+    { X86::SBB32ri8,    X86::SBB32mi8,   0 },
+    { X86::SBB32rr,     X86::SBB32mr,    0 },
+    { X86::SBB64ri32,   X86::SBB64mi32,  0 },
+    { X86::SBB64ri8,    X86::SBB64mi8,   0 },
+    { X86::SBB64rr,     X86::SBB64mr,    0 },
+    { X86::SHL16r1,     X86::SHL16m1,    0 },
+    { X86::SHL16rCL,    X86::SHL16mCL,   0 },
+    { X86::SHL16ri,     X86::SHL16mi,    0 },
+    { X86::SHL32r1,     X86::SHL32m1,    0 },
+    { X86::SHL32rCL,    X86::SHL32mCL,   0 },
+    { X86::SHL32ri,     X86::SHL32mi,    0 },
+    { X86::SHL64r1,     X86::SHL64m1,    0 },
+    { X86::SHL64rCL,    X86::SHL64mCL,   0 },
+    { X86::SHL64ri,     X86::SHL64mi,    0 },
+    { X86::SHL8r1,      X86::SHL8m1,     0 },
+    { X86::SHL8rCL,     X86::SHL8mCL,    0 },
+    { X86::SHL8ri,      X86::SHL8mi,     0 },
+    { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
+    { X86::SHLD16rri8,  X86::SHLD16mri8, 0 },
+    { X86::SHLD32rrCL,  X86::SHLD32mrCL, 0 },
+    { X86::SHLD32rri8,  X86::SHLD32mri8, 0 },
+    { X86::SHLD64rrCL,  X86::SHLD64mrCL, 0 },
+    { X86::SHLD64rri8,  X86::SHLD64mri8, 0 },
+    { X86::SHR16r1,     X86::SHR16m1,    0 },
+    { X86::SHR16rCL,    X86::SHR16mCL,   0 },
+    { X86::SHR16ri,     X86::SHR16mi,    0 },
+    { X86::SHR32r1,     X86::SHR32m1,    0 },
+    { X86::SHR32rCL,    X86::SHR32mCL,   0 },
+    { X86::SHR32ri,     X86::SHR32mi,    0 },
+    { X86::SHR64r1,     X86::SHR64m1,    0 },
+    { X86::SHR64rCL,    X86::SHR64mCL,   0 },
+    { X86::SHR64ri,     X86::SHR64mi,    0 },
+    { X86::SHR8r1,      X86::SHR8m1,     0 },
+    { X86::SHR8rCL,     X86::SHR8mCL,    0 },
+    { X86::SHR8ri,      X86::SHR8mi,     0 },
+    { X86::SHRD16rrCL,  X86::SHRD16mrCL, 0 },
+    { X86::SHRD16rri8,  X86::SHRD16mri8, 0 },
+    { X86::SHRD32rrCL,  X86::SHRD32mrCL, 0 },
+    { X86::SHRD32rri8,  X86::SHRD32mri8, 0 },
+    { X86::SHRD64rrCL,  X86::SHRD64mrCL, 0 },
+    { X86::SHRD64rri8,  X86::SHRD64mri8, 0 },
+    { X86::SUB16ri,     X86::SUB16mi,    0 },
+    { X86::SUB16ri8,    X86::SUB16mi8,   0 },
+    { X86::SUB16rr,     X86::SUB16mr,    0 },
+    { X86::SUB32ri,     X86::SUB32mi,    0 },
+    { X86::SUB32ri8,    X86::SUB32mi8,   0 },
+    { X86::SUB32rr,     X86::SUB32mr,    0 },
+    { X86::SUB64ri32,   X86::SUB64mi32,  0 },
+    { X86::SUB64ri8,    X86::SUB64mi8,   0 },
+    { X86::SUB64rr,     X86::SUB64mr,    0 },
+    { X86::SUB8ri,      X86::SUB8mi,     0 },
+    { X86::SUB8rr,      X86::SUB8mr,     0 },
+    { X86::XOR16ri,     X86::XOR16mi,    0 },
+    { X86::XOR16ri8,    X86::XOR16mi8,   0 },
+    { X86::XOR16rr,     X86::XOR16mr,    0 },
+    { X86::XOR32ri,     X86::XOR32mi,    0 },
+    { X86::XOR32ri8,    X86::XOR32mi8,   0 },
+    { X86::XOR32rr,     X86::XOR32mr,    0 },
+    { X86::XOR64ri32,   X86::XOR64mi32,  0 },
+    { X86::XOR64ri8,    X86::XOR64mi8,   0 },
+    { X86::XOR64rr,     X86::XOR64mr,    0 },
+    { X86::XOR8ri,      X86::XOR8mi,     0 },
+    { X86::XOR8rr,      X86::XOR8mr,     0 }
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
+    AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp,
+                  // Index 0, folded load and store, no alignment requirement.
+                  Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+  }
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
+    { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
+    { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
+    { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
+    { X86::CALL32r,     X86::CALL32m,       TB_FOLDED_LOAD },
+    { X86::CALL64r,     X86::CALL64m,       TB_FOLDED_LOAD },
+    { X86::CMP16ri,     X86::CMP16mi,       TB_FOLDED_LOAD },
+    { X86::CMP16ri8,    X86::CMP16mi8,      TB_FOLDED_LOAD },
+    { X86::CMP16rr,     X86::CMP16mr,       TB_FOLDED_LOAD },
+    { X86::CMP32ri,     X86::CMP32mi,       TB_FOLDED_LOAD },
+    { X86::CMP32ri8,    X86::CMP32mi8,      TB_FOLDED_LOAD },
+    { X86::CMP32rr,     X86::CMP32mr,       TB_FOLDED_LOAD },
+    { X86::CMP64ri32,   X86::CMP64mi32,     TB_FOLDED_LOAD },
+    { X86::CMP64ri8,    X86::CMP64mi8,      TB_FOLDED_LOAD },
+    { X86::CMP64rr,     X86::CMP64mr,       TB_FOLDED_LOAD },
+    { X86::CMP8ri,      X86::CMP8mi,        TB_FOLDED_LOAD },
+    { X86::CMP8rr,      X86::CMP8mr,        TB_FOLDED_LOAD },
+    { X86::DIV16r,      X86::DIV16m,        TB_FOLDED_LOAD },
+    { X86::DIV32r,      X86::DIV32m,        TB_FOLDED_LOAD },
+    { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
+    { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
+    { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
+    { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
+    { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
+    { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
+    { X86::IDIV8r,      X86::IDIV8m,        TB_FOLDED_LOAD },
+    { X86::IMUL16r,     X86::IMUL16m,       TB_FOLDED_LOAD },
+    { X86::IMUL32r,     X86::IMUL32m,       TB_FOLDED_LOAD },
+    { X86::IMUL64r,     X86::IMUL64m,       TB_FOLDED_LOAD },
+    { X86::IMUL8r,      X86::IMUL8m,        TB_FOLDED_LOAD },
+    { X86::JMP32r,      X86::JMP32m,        TB_FOLDED_LOAD },
+    { X86::JMP64r,      X86::JMP64m,        TB_FOLDED_LOAD },
+    { X86::MOV16ri,     X86::MOV16mi,       TB_FOLDED_STORE },
+    { X86::MOV16rr,     X86::MOV16mr,       TB_FOLDED_STORE },
+    { X86::MOV32ri,     X86::MOV32mi,       TB_FOLDED_STORE },
+    { X86::MOV32rr,     X86::MOV32mr,       TB_FOLDED_STORE },
+    { X86::MOV64ri32,   X86::MOV64mi32,     TB_FOLDED_STORE },
+    { X86::MOV64rr,     X86::MOV64mr,       TB_FOLDED_STORE },
+    { X86::MOV8ri,      X86::MOV8mi,        TB_FOLDED_STORE },
+    { X86::MOV8rr,      X86::MOV8mr,        TB_FOLDED_STORE },
+    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+    { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVDQUrr,    X86::MOVDQUmr,      TB_FOLDED_STORE },
+    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
+    { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
+    { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
+    { X86::MOVSS2DIrr,  X86::MOVSS2DImr,    TB_FOLDED_STORE },
+    { X86::MOVUPDrr,    X86::MOVUPDmr,      TB_FOLDED_STORE },
+    { X86::MOVUPSrr,    X86::MOVUPSmr,      TB_FOLDED_STORE },
+    { X86::MUL16r,      X86::MUL16m,        TB_FOLDED_LOAD },
+    { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
+    { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
+    { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
+    { X86::PEXTRDrr,    X86::PEXTRDmr,      TB_FOLDED_STORE },
+    { X86::PEXTRQrr,    X86::PEXTRQmr,      TB_FOLDED_STORE },
+    { X86::PUSH16r,     X86::PUSH16rmm,     TB_FOLDED_LOAD },
+    { X86::PUSH32r,     X86::PUSH32rmm,     TB_FOLDED_LOAD },
+    { X86::PUSH64r,     X86::PUSH64rmm,     TB_FOLDED_LOAD },
+    { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
+    { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
+    { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
+    { X86::SETBr,       X86::SETBm,         TB_FOLDED_STORE },
+    { X86::SETEr,       X86::SETEm,         TB_FOLDED_STORE },
+    { X86::SETGEr,      X86::SETGEm,        TB_FOLDED_STORE },
+    { X86::SETGr,       X86::SETGm,         TB_FOLDED_STORE },
+    { X86::SETLEr,      X86::SETLEm,        TB_FOLDED_STORE },
+    { X86::SETLr,       X86::SETLm,         TB_FOLDED_STORE },
+    { X86::SETNEr,      X86::SETNEm,        TB_FOLDED_STORE },
+    { X86::SETNOr,      X86::SETNOm,        TB_FOLDED_STORE },
+    { X86::SETNPr,      X86::SETNPm,        TB_FOLDED_STORE },
+    { X86::SETNSr,      X86::SETNSm,        TB_FOLDED_STORE },
+    { X86::SETOr,       X86::SETOm,         TB_FOLDED_STORE },
+    { X86::SETPr,       X86::SETPm,         TB_FOLDED_STORE },
+    { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
+    { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
+    { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
+    { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+    { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
+    { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
+    { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
+    { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+
+    // AVX 128-bit versions of foldable instructions
+    { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
+    { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQUrr,   X86::VMOVDQUmr,     TB_FOLDED_STORE },
+    { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
+    { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+    { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
+    { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
+    { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
+    { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
+    { X86::VPEXTRDrr,   X86::VPEXTRDmr,     TB_FOLDED_STORE },
+    { X86::VPEXTRQrr,   X86::VPEXTRQmr,     TB_FOLDED_STORE },
+
+    // AVX 256-bit foldable instructions
+    { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQUYrr,  X86::VMOVDQUYmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+
+    // AVX-512 foldable instructions
+    { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
+    { X86::VEXTRACTPSZrr,   X86::VEXTRACTPSZmr,    TB_FOLDED_STORE },
+    { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+    { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVAPSZrr,      X86::VMOVAPSZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVUPDZrr,      X86::VMOVUPDZmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZrr,      X86::VMOVUPSZmr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zmr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
+    { X86::VPMOVDBZrr,      X86::VPMOVDBZmr,    TB_FOLDED_STORE },
+    { X86::VPMOVDWZrr,      X86::VPMOVDWZmr,    TB_FOLDED_STORE },
+    { X86::VPMOVQDZrr,      X86::VPMOVQDZmr,    TB_FOLDED_STORE },
+    { X86::VPMOVQWZrr,      X86::VPMOVQWZmr,    TB_FOLDED_STORE },
+    { X86::VPMOVWBZrr,      X86::VPMOVWBZmr,    TB_FOLDED_STORE },
+    { X86::VPMOVSDBZrr,     X86::VPMOVSDBZmr,   TB_FOLDED_STORE },
+    { X86::VPMOVSDWZrr,     X86::VPMOVSDWZmr,   TB_FOLDED_STORE },
+    { X86::VPMOVSQDZrr,     X86::VPMOVSQDZmr,   TB_FOLDED_STORE },
+    { X86::VPMOVSQWZrr,     X86::VPMOVSQWZmr,   TB_FOLDED_STORE },
+    { X86::VPMOVSWBZrr,     X86::VPMOVSWBZmr,   TB_FOLDED_STORE },
+    { X86::VPMOVUSDBZrr,    X86::VPMOVUSDBZmr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSDWZrr,    X86::VPMOVUSDWZmr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSQDZrr,    X86::VPMOVUSQDZmr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSQWZrr,    X86::VPMOVUSQWZmr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSWBZrr,    X86::VPMOVUSWBZmr,  TB_FOLDED_STORE },
+
+    // AVX-512 foldable instructions (256-bit versions)
+    { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
+    { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
+    { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
+    { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQA32Z256rr,    X86::VMOVDQA32Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQA64Z256rr,    X86::VMOVDQA64Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256mr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256mr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Z256rr,     X86::VMOVDQU8Z256mr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256mr,  TB_FOLDED_STORE },
+    { X86::VPMOVDWZ256rr,      X86::VPMOVDWZ256mr,    TB_FOLDED_STORE },
+    { X86::VPMOVQDZ256rr,      X86::VPMOVQDZ256mr,    TB_FOLDED_STORE },
+    { X86::VPMOVWBZ256rr,      X86::VPMOVWBZ256mr,    TB_FOLDED_STORE },
+    { X86::VPMOVSDWZ256rr,     X86::VPMOVSDWZ256mr,   TB_FOLDED_STORE },
+    { X86::VPMOVSQDZ256rr,     X86::VPMOVSQDZ256mr,   TB_FOLDED_STORE },
+    { X86::VPMOVSWBZ256rr,     X86::VPMOVSWBZ256mr,   TB_FOLDED_STORE },
+    { X86::VPMOVUSDWZ256rr,    X86::VPMOVUSDWZ256mr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSQDZ256rr,    X86::VPMOVUSQDZ256mr,  TB_FOLDED_STORE },
+    { X86::VPMOVUSWBZ256rr,    X86::VPMOVUSWBZ256mr,  TB_FOLDED_STORE },
+
+    // AVX-512 foldable instructions (128-bit versions)
+    { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQA32Z128rr,    X86::VMOVDQA32Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQA64Z128rr,    X86::VMOVDQA64Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128mr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128mr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128mr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE },
+
+    // F16C foldable instructions
+    { X86::VCVTPS2PHrr,        X86::VCVTPS2PHmr,      TB_FOLDED_STORE },
+    { X86::VCVTPS2PHYrr,       X86::VCVTPS2PHYmr,     TB_FOLDED_STORE }
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
+    AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
+  }
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+    { X86::BSF16rr,         X86::BSF16rm,             0 },
+    { X86::BSF32rr,         X86::BSF32rm,             0 },
+    { X86::BSF64rr,         X86::BSF64rm,             0 },
+    { X86::BSR16rr,         X86::BSR16rm,             0 },
+    { X86::BSR32rr,         X86::BSR32rm,             0 },
+    { X86::BSR64rr,         X86::BSR64rm,             0 },
+    { X86::CMP16rr,         X86::CMP16rm,             0 },
+    { X86::CMP32rr,         X86::CMP32rm,             0 },
+    { X86::CMP64rr,         X86::CMP64rm,             0 },
+    { X86::CMP8rr,          X86::CMP8rm,              0 },
+    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm,          0 },
+    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm,        0 },
+    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm,          0 },
+    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm,        0 },
+    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm,          0 },
+    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm,          0 },
+    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm,       0 },
+    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
+    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
+    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
+    { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
+    { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
+    { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
+    { X86::IMUL32rri8,      X86::IMUL32rmi8,          0 },
+    { X86::IMUL64rri32,     X86::IMUL64rmi32,         0 },
+    { X86::IMUL64rri8,      X86::IMUL64rmi8,          0 },
+    { X86::Int_COMISDrr,    X86::Int_COMISDrm,        TB_NO_REVERSE },
+    { X86::Int_COMISSrr,    X86::Int_COMISSrm,        TB_NO_REVERSE },
+    { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm,        TB_NO_REVERSE },
+    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          TB_NO_REVERSE },
+    { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        TB_NO_REVERSE },
+    { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          TB_NO_REVERSE },
+    { X86::CVTDQ2PDrr,      X86::CVTDQ2PDrm,          TB_NO_REVERSE },
+    { X86::CVTDQ2PSrr,      X86::CVTDQ2PSrm,          TB_ALIGN_16 },
+    { X86::CVTPD2DQrr,      X86::CVTPD2DQrm,          TB_ALIGN_16 },
+    { X86::CVTPD2PSrr,      X86::CVTPD2PSrm,          TB_ALIGN_16 },
+    { X86::CVTPS2DQrr,      X86::CVTPS2DQrm,          TB_ALIGN_16 },
+    { X86::CVTPS2PDrr,      X86::CVTPS2PDrm,          TB_NO_REVERSE },
+    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
+    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
+    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  TB_NO_REVERSE },
+    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm,     TB_NO_REVERSE },
+    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm,  TB_NO_REVERSE },
+    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm,     TB_NO_REVERSE },
+    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm,       TB_NO_REVERSE },
+    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm,       TB_NO_REVERSE },
+    { X86::MOV16rr,         X86::MOV16rm,             0 },
+    { X86::MOV32rr,         X86::MOV32rm,             0 },
+    { X86::MOV64rr,         X86::MOV64rm,             0 },
+    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm,         0 },
+    { X86::MOV64toSDrr,     X86::MOV64toSDrm,         0 },
+    { X86::MOV8rr,          X86::MOV8rm,              0 },
+    { X86::MOVAPDrr,        X86::MOVAPDrm,            TB_ALIGN_16 },
+    { X86::MOVAPSrr,        X86::MOVAPSrm,            TB_ALIGN_16 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm,           0 },
+    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,         0 },
+    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,          0 },
+    { X86::MOVDQArr,        X86::MOVDQArm,            TB_ALIGN_16 },
+    { X86::MOVDQUrr,        X86::MOVDQUrm,            0 },
+    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSX16rr8,      X86::MOVSX16rm8,          0 },
+    { X86::MOVSX32rr16,     X86::MOVSX32rm16,         0 },
+    { X86::MOVSX32rr8,      X86::MOVSX32rm8,          0 },
+    { X86::MOVSX64rr16,     X86::MOVSX64rm16,         0 },
+    { X86::MOVSX64rr32,     X86::MOVSX64rm32,         0 },
+    { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
+    { X86::MOVUPDrr,        X86::MOVUPDrm,            0 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
+    { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm,         TB_NO_REVERSE },
+    { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
+    { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
+    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
+    { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
+    { X86::PABSBrr,         X86::PABSBrm,             TB_ALIGN_16 },
+    { X86::PABSDrr,         X86::PABSDrm,             TB_ALIGN_16 },
+    { X86::PABSWrr,         X86::PABSWrm,             TB_ALIGN_16 },
+    { X86::PCMPESTRIrr,     X86::PCMPESTRIrm,         TB_ALIGN_16 },
+    { X86::PCMPESTRM128rr,  X86::PCMPESTRM128rm,      TB_ALIGN_16 },
+    { X86::PCMPISTRIrr,     X86::PCMPISTRIrm,         TB_ALIGN_16 },
+    { X86::PCMPISTRM128rr,  X86::PCMPISTRM128rm,      TB_ALIGN_16 },
+    { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128,     TB_ALIGN_16 },
+    { X86::PMOVSXBDrr,      X86::PMOVSXBDrm,          TB_NO_REVERSE },
+    { X86::PMOVSXBQrr,      X86::PMOVSXBQrm,          TB_NO_REVERSE },
+    { X86::PMOVSXBWrr,      X86::PMOVSXBWrm,          TB_NO_REVERSE },
+    { X86::PMOVSXDQrr,      X86::PMOVSXDQrm,          TB_NO_REVERSE },
+    { X86::PMOVSXWDrr,      X86::PMOVSXWDrm,          TB_NO_REVERSE },
+    { X86::PMOVSXWQrr,      X86::PMOVSXWQrm,          TB_NO_REVERSE },
+    { X86::PMOVZXBDrr,      X86::PMOVZXBDrm,          TB_NO_REVERSE },
+    { X86::PMOVZXBQrr,      X86::PMOVZXBQrm,          TB_NO_REVERSE },
+    { X86::PMOVZXBWrr,      X86::PMOVZXBWrm,          TB_NO_REVERSE },
+    { X86::PMOVZXDQrr,      X86::PMOVZXDQrm,          TB_NO_REVERSE },
+    { X86::PMOVZXWDrr,      X86::PMOVZXWDrm,          TB_NO_REVERSE },
+    { X86::PMOVZXWQrr,      X86::PMOVZXWQrm,          TB_NO_REVERSE },
+    { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
+    { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
+    { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
+    { X86::PTESTrr,         X86::PTESTrm,             TB_ALIGN_16 },
+    { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
+    { X86::RCPSSr,          X86::RCPSSm,              0 },
+    { X86::RCPSSr_Int,      X86::RCPSSm_Int,          TB_NO_REVERSE },
+    { X86::ROUNDPDr,        X86::ROUNDPDm,            TB_ALIGN_16 },
+    { X86::ROUNDPSr,        X86::ROUNDPSm,            TB_ALIGN_16 },
+    { X86::ROUNDSDr,        X86::ROUNDSDm,            0 },
+    { X86::ROUNDSSr,        X86::ROUNDSSm,            0 },
+    { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
+    { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
+    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        TB_NO_REVERSE },
+    { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
+    { X86::SQRTPSr,         X86::SQRTPSm,             TB_ALIGN_16 },
+    { X86::SQRTSDr,         X86::SQRTSDm,             0 },
+    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int,         TB_NO_REVERSE },
+    { X86::SQRTSSr,         X86::SQRTSSm,             0 },
+    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int,         TB_NO_REVERSE },
+    { X86::TEST16rr,        X86::TEST16rm,            0 },
+    { X86::TEST32rr,        X86::TEST32rm,            0 },
+    { X86::TEST64rr,        X86::TEST64rm,            0 },
+    { X86::TEST8rr,         X86::TEST8rm,             0 },
+    // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+    { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
+    { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPD2PIirr,   X86::MMX_CVTPD2PIirm,   0 },
+    { X86::MMX_CVTPI2PDirr,   X86::MMX_CVTPI2PDirm,   0 },
+    { X86::MMX_CVTPS2PIirr,   X86::MMX_CVTPS2PIirm,   0 },
+    { X86::MMX_CVTTPD2PIirr,  X86::MMX_CVTTPD2PIirm,  0 },
+    { X86::MMX_CVTTPS2PIirr,  X86::MMX_CVTTPS2PIirm,  0 },
+    { X86::MMX_MOVD64to64rr,  X86::MMX_MOVQ64rm,      0 },
+    { X86::MMX_PABSBrr64,     X86::MMX_PABSBrm64,     0 },
+    { X86::MMX_PABSDrr64,     X86::MMX_PABSDrm64,     0 },
+    { X86::MMX_PABSWrr64,     X86::MMX_PABSWrm64,     0 },
+    { X86::MMX_PSHUFWri,      X86::MMX_PSHUFWmi,      0 },
+
+    // 3DNow! version of foldable instructions
+    { X86::PF2IDrr,         X86::PF2IDrm,             0 },
+    { X86::PF2IWrr,         X86::PF2IWrm,             0 },
+    { X86::PFRCPrr,         X86::PFRCPrm,             0 },
+    { X86::PFRSQRTrr,       X86::PFRSQRTrm,           0 },
+    { X86::PI2FDrr,         X86::PI2FDrm,             0 },
+    { X86::PI2FWrr,         X86::PI2FWrm,             0 },
+    { X86::PSWAPDrr,        X86::PSWAPDrm,            0 },
+
+    // AVX 128-bit versions of foldable instructions
+    { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       TB_NO_REVERSE },
+    { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       TB_NO_REVERSE },
+    { X86::Int_VUCOMISDrr,  X86::Int_VUCOMISDrm,      TB_NO_REVERSE },
+    { X86::Int_VUCOMISSrr,  X86::Int_VUCOMISSrm,      TB_NO_REVERSE },
+    { X86::VCVTTSD2SI64rr,  X86::VCVTTSD2SI64rm,      0 },
+    { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
+    { X86::VCVTTSD2SIrr,    X86::VCVTTSD2SIrm,        0 },
+    { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm,    TB_NO_REVERSE },
+    { X86::VCVTTSS2SI64rr,  X86::VCVTTSS2SI64rm,      0 },
+    { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
+    { X86::VCVTTSS2SIrr,    X86::VCVTTSS2SIrm,        0 },
+    { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm,    TB_NO_REVERSE },
+    { X86::VCVTSD2SI64rr,   X86::VCVTSD2SI64rm,       TB_NO_REVERSE },
+    { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         TB_NO_REVERSE },
+    { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       TB_NO_REVERSE },
+    { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         TB_NO_REVERSE },
+    { X86::VCVTDQ2PDrr,     X86::VCVTDQ2PDrm,         TB_NO_REVERSE },
+    { X86::VCVTDQ2PSrr,     X86::VCVTDQ2PSrm,         0 },
+    { X86::VCVTPD2DQrr,     X86::VCVTPD2DQrm,         0 },
+    { X86::VCVTPD2PSrr,     X86::VCVTPD2PSrm,         0 },
+    { X86::VCVTPS2DQrr,     X86::VCVTPS2DQrm,         0 },
+    { X86::VCVTPS2PDrr,     X86::VCVTPS2PDrm,         TB_NO_REVERSE },
+    { X86::VCVTTPD2DQrr,    X86::VCVTTPD2DQrm,        0 },
+    { X86::VCVTTPS2DQrr,    X86::VCVTTPS2DQrm,        0 },
+    { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
+    { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
+    { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
+    { X86::VMOVAPSrr,       X86::VMOVAPSrm,           TB_ALIGN_16 },
+    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,          0 },
+    { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
+    { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
+    { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
+    { X86::VMOVDQUrr,       X86::VMOVDQUrm,           0 },
+    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         0 },
+    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         0 },
+    { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
+    { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
+    { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm,        TB_NO_REVERSE },
+    { X86::VPABSBrr,        X86::VPABSBrm,            0 },
+    { X86::VPABSDrr,        X86::VPABSDrm,            0 },
+    { X86::VPABSWrr,        X86::VPABSWrm,            0 },
+    { X86::VPCMPESTRIrr,    X86::VPCMPESTRIrm,        0 },
+    { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm,     0 },
+    { X86::VPCMPISTRIrr,    X86::VPCMPISTRIrm,        0 },
+    { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm,     0 },
+    { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128,   0 },
+    { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
+    { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
+    { X86::VPMOVSXBDrr,     X86::VPMOVSXBDrm,         TB_NO_REVERSE },
+    { X86::VPMOVSXBQrr,     X86::VPMOVSXBQrm,         TB_NO_REVERSE },
+    { X86::VPMOVSXBWrr,     X86::VPMOVSXBWrm,         TB_NO_REVERSE },
+    { X86::VPMOVSXDQrr,     X86::VPMOVSXDQrm,         TB_NO_REVERSE },
+    { X86::VPMOVSXWDrr,     X86::VPMOVSXWDrm,         TB_NO_REVERSE },
+    { X86::VPMOVSXWQrr,     X86::VPMOVSXWQrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXBDrr,     X86::VPMOVZXBDrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXBQrr,     X86::VPMOVZXBQrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXBWrr,     X86::VPMOVZXBWrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXDQrr,     X86::VPMOVZXDQrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXWDrr,     X86::VPMOVZXWDrm,         TB_NO_REVERSE },
+    { X86::VPMOVZXWQrr,     X86::VPMOVZXWQrm,         TB_NO_REVERSE },
+    { X86::VPSHUFDri,       X86::VPSHUFDmi,           0 },
+    { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          0 },
+    { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
+    { X86::VPTESTrr,        X86::VPTESTrm,            0 },
+    { X86::VRCPPSr,         X86::VRCPPSm,             0 },
+    { X86::VROUNDPDr,       X86::VROUNDPDm,           0 },
+    { X86::VROUNDPSr,       X86::VROUNDPSm,           0 },
+    { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
+    { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
+    { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
+    { X86::VTESTPDrr,       X86::VTESTPDrm,           0 },
+    { X86::VTESTPSrr,       X86::VTESTPSrm,           0 },
+    { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
+    { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
+
+    // AVX 256-bit foldable instructions
+    { X86::VCVTDQ2PDYrr,    X86::VCVTDQ2PDYrm,        TB_NO_REVERSE },
+    { X86::VCVTDQ2PSYrr,    X86::VCVTDQ2PSYrm,        0 },
+    { X86::VCVTPD2DQYrr,    X86::VCVTPD2DQYrm,        0 },
+    { X86::VCVTPD2PSYrr,    X86::VCVTPD2PSYrm,        0 },
+    { X86::VCVTPS2DQYrr,    X86::VCVTPS2DQYrm,        0 },
+    { X86::VCVTPS2PDYrr,    X86::VCVTPS2PDYrm,        TB_NO_REVERSE },
+    { X86::VCVTTPD2DQYrr,   X86::VCVTTPD2DQYrm,       0 },
+    { X86::VCVTTPS2DQYrr,   X86::VCVTTPS2DQYrm,       0 },
+    { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
+    { X86::VMOVDDUPYrr,     X86::VMOVDDUPYrm,         0 },
+    { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
+    { X86::VMOVDQUYrr,      X86::VMOVDQUYrm,          0 },
+    { X86::VMOVSLDUPYrr,    X86::VMOVSLDUPYrm,        0 },
+    { X86::VMOVSHDUPYrr,    X86::VMOVSHDUPYrm,        0 },
+    { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
+    { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
+    { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
+    { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VPTESTYrr,       X86::VPTESTYrm,           0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VROUNDYPDr,      X86::VROUNDYPDm,          0 },
+    { X86::VROUNDYPSr,      X86::VROUNDYPSm,          0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VTESTPDYrr,      X86::VTESTPDYrm,          0 },
+    { X86::VTESTPSYrr,      X86::VTESTPSYrm,          0 },
+
+    // AVX2 foldable instructions
+
+    // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
+    // VBROADCASTS{SD}rm memory instructions were available from AVX1.
+    // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
+    // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
+    // so they don't need an equivalent limitation.
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
+    { X86::VPABSBYrr,       X86::VPABSBYrm,           0 },
+    { X86::VPABSDYrr,       X86::VPABSDYrm,           0 },
+    { X86::VPABSWYrr,       X86::VPABSWYrm,           0 },
+    { X86::VPBROADCASTBrr,  X86::VPBROADCASTBrm,      TB_NO_REVERSE },
+    { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm,     TB_NO_REVERSE },
+    { X86::VPBROADCASTDrr,  X86::VPBROADCASTDrm,      TB_NO_REVERSE },
+    { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm,     TB_NO_REVERSE },
+    { X86::VPBROADCASTQrr,  X86::VPBROADCASTQrm,      TB_NO_REVERSE },
+    { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm,     TB_NO_REVERSE },
+    { X86::VPBROADCASTWrr,  X86::VPBROADCASTWrm,      TB_NO_REVERSE },
+    { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm,     TB_NO_REVERSE },
+    { X86::VPERMPDYri,      X86::VPERMPDYmi,          0 },
+    { X86::VPERMQYri,       X86::VPERMQYmi,           0 },
+    { X86::VPMOVSXBDYrr,    X86::VPMOVSXBDYrm,        TB_NO_REVERSE },
+    { X86::VPMOVSXBQYrr,    X86::VPMOVSXBQYrm,        TB_NO_REVERSE },
+    { X86::VPMOVSXBWYrr,    X86::VPMOVSXBWYrm,        0 },
+    { X86::VPMOVSXDQYrr,    X86::VPMOVSXDQYrm,        0 },
+    { X86::VPMOVSXWDYrr,    X86::VPMOVSXWDYrm,        0 },
+    { X86::VPMOVSXWQYrr,    X86::VPMOVSXWQYrm,        TB_NO_REVERSE },
+    { X86::VPMOVZXBDYrr,    X86::VPMOVZXBDYrm,        TB_NO_REVERSE },
+    { X86::VPMOVZXBQYrr,    X86::VPMOVZXBQYrm,        TB_NO_REVERSE },
+    { X86::VPMOVZXBWYrr,    X86::VPMOVZXBWYrm,        0 },
+    { X86::VPMOVZXDQYrr,    X86::VPMOVZXDQYrm,        0 },
+    { X86::VPMOVZXWDYrr,    X86::VPMOVZXWDYrm,        0 },
+    { X86::VPMOVZXWQYrr,    X86::VPMOVZXWQYrm,        TB_NO_REVERSE },
+    { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
+    { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
+    { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
+
+    // XOP foldable instructions
+    { X86::VFRCZPDrr,          X86::VFRCZPDrm,        0 },
+    { X86::VFRCZPDrrY,         X86::VFRCZPDrmY,       0 },
+    { X86::VFRCZPSrr,          X86::VFRCZPSrm,        0 },
+    { X86::VFRCZPSrrY,         X86::VFRCZPSrmY,       0 },
+    { X86::VFRCZSDrr,          X86::VFRCZSDrm,        0 },
+    { X86::VFRCZSSrr,          X86::VFRCZSSrm,        0 },
+    { X86::VPHADDBDrr,         X86::VPHADDBDrm,       0 },
+    { X86::VPHADDBQrr,         X86::VPHADDBQrm,       0 },
+    { X86::VPHADDBWrr,         X86::VPHADDBWrm,       0 },
+    { X86::VPHADDDQrr,         X86::VPHADDDQrm,       0 },
+    { X86::VPHADDWDrr,         X86::VPHADDWDrm,       0 },
+    { X86::VPHADDWQrr,         X86::VPHADDWQrm,       0 },
+    { X86::VPHADDUBDrr,        X86::VPHADDUBDrm,      0 },
+    { X86::VPHADDUBQrr,        X86::VPHADDUBQrm,      0 },
+    { X86::VPHADDUBWrr,        X86::VPHADDUBWrm,      0 },
+    { X86::VPHADDUDQrr,        X86::VPHADDUDQrm,      0 },
+    { X86::VPHADDUWDrr,        X86::VPHADDUWDrm,      0 },
+    { X86::VPHADDUWQrr,        X86::VPHADDUWQrm,      0 },
+    { X86::VPHSUBBWrr,         X86::VPHSUBBWrm,       0 },
+    { X86::VPHSUBDQrr,         X86::VPHSUBDQrm,       0 },
+    { X86::VPHSUBWDrr,         X86::VPHSUBWDrm,       0 },
+    { X86::VPROTBri,           X86::VPROTBmi,         0 },
+    { X86::VPROTBrr,           X86::VPROTBmr,         0 },
+    { X86::VPROTDri,           X86::VPROTDmi,         0 },
+    { X86::VPROTDrr,           X86::VPROTDmr,         0 },
+    { X86::VPROTQri,           X86::VPROTQmi,         0 },
+    { X86::VPROTQrr,           X86::VPROTQmr,         0 },
+    { X86::VPROTWri,           X86::VPROTWmi,         0 },
+    { X86::VPROTWrr,           X86::VPROTWmr,         0 },
+    { X86::VPSHABrr,           X86::VPSHABmr,         0 },
+    { X86::VPSHADrr,           X86::VPSHADmr,         0 },
+    { X86::VPSHAQrr,           X86::VPSHAQmr,         0 },
+    { X86::VPSHAWrr,           X86::VPSHAWmr,         0 },
+    { X86::VPSHLBrr,           X86::VPSHLBmr,         0 },
+    { X86::VPSHLDrr,           X86::VPSHLDmr,         0 },
+    { X86::VPSHLQrr,           X86::VPSHLQmr,         0 },
+    { X86::VPSHLWrr,           X86::VPSHLWmr,         0 },
+
+    // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
+    { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
+    { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
+    { X86::BEXTRI32ri,      X86::BEXTRI32mi,          0 },
+    { X86::BEXTRI64ri,      X86::BEXTRI64mi,          0 },
+    { X86::BLCFILL32rr,     X86::BLCFILL32rm,         0 },
+    { X86::BLCFILL64rr,     X86::BLCFILL64rm,         0 },
+    { X86::BLCI32rr,        X86::BLCI32rm,            0 },
+    { X86::BLCI64rr,        X86::BLCI64rm,            0 },
+    { X86::BLCIC32rr,       X86::BLCIC32rm,           0 },
+    { X86::BLCIC64rr,       X86::BLCIC64rm,           0 },
+    { X86::BLCMSK32rr,      X86::BLCMSK32rm,          0 },
+    { X86::BLCMSK64rr,      X86::BLCMSK64rm,          0 },
+    { X86::BLCS32rr,        X86::BLCS32rm,            0 },
+    { X86::BLCS64rr,        X86::BLCS64rm,            0 },
+    { X86::BLSFILL32rr,     X86::BLSFILL32rm,         0 },
+    { X86::BLSFILL64rr,     X86::BLSFILL64rm,         0 },
+    { X86::BLSI32rr,        X86::BLSI32rm,            0 },
+    { X86::BLSI64rr,        X86::BLSI64rm,            0 },
+    { X86::BLSIC32rr,       X86::BLSIC32rm,           0 },
+    { X86::BLSIC64rr,       X86::BLSIC64rm,           0 },
+    { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
+    { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
+    { X86::BLSR32rr,        X86::BLSR32rm,            0 },
+    { X86::BLSR64rr,        X86::BLSR64rm,            0 },
+    { X86::BZHI32rr,        X86::BZHI32rm,            0 },
+    { X86::BZHI64rr,        X86::BZHI64rm,            0 },
+    { X86::LZCNT16rr,       X86::LZCNT16rm,           0 },
+    { X86::LZCNT32rr,       X86::LZCNT32rm,           0 },
+    { X86::LZCNT64rr,       X86::LZCNT64rm,           0 },
+    { X86::POPCNT16rr,      X86::POPCNT16rm,          0 },
+    { X86::POPCNT32rr,      X86::POPCNT32rm,          0 },
+    { X86::POPCNT64rr,      X86::POPCNT64rm,          0 },
+    { X86::RORX32ri,        X86::RORX32mi,            0 },
+    { X86::RORX64ri,        X86::RORX64mi,            0 },
+    { X86::SARX32rr,        X86::SARX32rm,            0 },
+    { X86::SARX64rr,        X86::SARX64rm,            0 },
+    { X86::SHRX32rr,        X86::SHRX32rm,            0 },
+    { X86::SHRX64rr,        X86::SHRX64rm,            0 },
+    { X86::SHLX32rr,        X86::SHLX32rm,            0 },
+    { X86::SHLX64rr,        X86::SHLX64rm,            0 },
+    { X86::T1MSKC32rr,      X86::T1MSKC32rm,          0 },
+    { X86::T1MSKC64rr,      X86::T1MSKC64rm,          0 },
+    { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
+    { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
+    { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
+    { X86::TZMSK32rr,       X86::TZMSK32rm,           0 },
+    { X86::TZMSK64rr,       X86::TZMSK64rm,           0 },
+
+    // AVX-512 foldable instructions
+    { X86::VBROADCASTSSZr,   X86::VBROADCASTSSZm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDZr,   X86::VBROADCASTSDZm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm,     TB_NO_REVERSE },
+    { X86::VMOV64toPQIZrr,   X86::VMOVQI2PQIZrm,      0 },
+    { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm,      TB_NO_REVERSE },
+    { X86::VMOVDI2SSZrr,     X86::VMOVDI2SSZrm,       0 },
+    { X86::VMOVAPDZrr,       X86::VMOVAPDZrm,         TB_ALIGN_64 },
+    { X86::VMOVAPSZrr,       X86::VMOVAPSZrm,         TB_ALIGN_64 },
+    { X86::VMOVDQA32Zrr,     X86::VMOVDQA32Zrm,       TB_ALIGN_64 },
+    { X86::VMOVDQA64Zrr,     X86::VMOVDQA64Zrm,       TB_ALIGN_64 },
+    { X86::VMOVDQU8Zrr,      X86::VMOVDQU8Zrm,        0 },
+    { X86::VMOVDQU16Zrr,     X86::VMOVDQU16Zrm,       0 },
+    { X86::VMOVDQU32Zrr,     X86::VMOVDQU32Zrm,       0 },
+    { X86::VMOVDQU64Zrr,     X86::VMOVDQU64Zrm,       0 },
+    { X86::VMOVUPDZrr,       X86::VMOVUPDZrm,         0 },
+    { X86::VMOVUPSZrr,       X86::VMOVUPSZrm,         0 },
+    { X86::VPABSDZrr,        X86::VPABSDZrm,          0 },
+    { X86::VPABSQZrr,        X86::VPABSQZrm,          0 },
+    { X86::VPERMILPDZri,     X86::VPERMILPDZmi,       0 },
+    { X86::VPERMILPSZri,     X86::VPERMILPSZmi,       0 },
+    { X86::VPERMPDZri,       X86::VPERMPDZmi,         0 },
+    { X86::VPERMQZri,        X86::VPERMQZmi,          0 },
+    { X86::VPMOVSXBDZrr,     X86::VPMOVSXBDZrm,       0 },
+    { X86::VPMOVSXBQZrr,     X86::VPMOVSXBQZrm,       TB_NO_REVERSE },
+    { X86::VPMOVSXBWZrr,     X86::VPMOVSXBWZrm,       0 },
+    { X86::VPMOVSXDQZrr,     X86::VPMOVSXDQZrm,       0 },
+    { X86::VPMOVSXWDZrr,     X86::VPMOVSXWDZrm,       0 },
+    { X86::VPMOVSXWQZrr,     X86::VPMOVSXWQZrm,       0 },
+    { X86::VPMOVZXBDZrr,     X86::VPMOVZXBDZrm,       0 },
+    { X86::VPMOVZXBQZrr,     X86::VPMOVZXBQZrm,       TB_NO_REVERSE },
+    { X86::VPMOVZXBWZrr,     X86::VPMOVZXBWZrm,       0 },
+    { X86::VPMOVZXDQZrr,     X86::VPMOVZXDQZrm,       0 },
+    { X86::VPMOVZXWDZrr,     X86::VPMOVZXWDZrm,       0 },
+    { X86::VPMOVZXWQZrr,     X86::VPMOVZXWQZrm,       0 },
+    { X86::VPSHUFDZri,       X86::VPSHUFDZmi,         0 },
+    { X86::VPSHUFHWZri,      X86::VPSHUFHWZmi,        0 },
+    { X86::VPSHUFLWZri,      X86::VPSHUFLWZmi,        0 },
+
+    // AVX-512 foldable instructions (256-bit versions)
+    { X86::VBROADCASTSSZ256r,    X86::VBROADCASTSSZ256m,    TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ256r_s,  X86::VBROADCASTSSZ256m,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256r,    X86::VBROADCASTSDZ256m,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256r_s,  X86::VBROADCASTSDZ256m,    TB_NO_REVERSE },
+    { X86::VMOVAPDZ256rr,        X86::VMOVAPDZ256rm,        TB_ALIGN_32 },
+    { X86::VMOVAPSZ256rr,        X86::VMOVAPSZ256rm,        TB_ALIGN_32 },
+    { X86::VMOVDQA32Z256rr,      X86::VMOVDQA32Z256rm,      TB_ALIGN_32 },
+    { X86::VMOVDQA64Z256rr,      X86::VMOVDQA64Z256rm,      TB_ALIGN_32 },
+    { X86::VMOVDQU8Z256rr,       X86::VMOVDQU8Z256rm,       0 },
+    { X86::VMOVDQU16Z256rr,      X86::VMOVDQU16Z256rm,      0 },
+    { X86::VMOVDQU32Z256rr,      X86::VMOVDQU32Z256rm,      0 },
+    { X86::VMOVDQU64Z256rr,      X86::VMOVDQU64Z256rm,      0 },
+    { X86::VMOVUPDZ256rr,        X86::VMOVUPDZ256rm,        0 },
+    { X86::VMOVUPSZ256rr,        X86::VMOVUPSZ256rm,        0 },
+    { X86::VPERMILPDZ256ri,      X86::VPERMILPDZ256mi,      0 },
+    { X86::VPERMILPSZ256ri,      X86::VPERMILPSZ256mi,      0 },
+    { X86::VPERMPDZ256ri,        X86::VPERMPDZ256mi,        0 },
+    { X86::VPERMQZ256ri,         X86::VPERMQZ256mi,         0 },
+    { X86::VPMOVSXBDZ256rr,      X86::VPMOVSXBDZ256rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ256rr,      X86::VPMOVSXBQZ256rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ256rr,      X86::VPMOVSXBWZ256rm,      0 },
+    { X86::VPMOVSXDQZ256rr,      X86::VPMOVSXDQZ256rm,      0 },
+    { X86::VPMOVSXWDZ256rr,      X86::VPMOVSXWDZ256rm,      0 },
+    { X86::VPMOVSXWQZ256rr,      X86::VPMOVSXWQZ256rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ256rr,      X86::VPMOVZXBDZ256rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ256rr,      X86::VPMOVZXBQZ256rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ256rr,      X86::VPMOVZXBWZ256rm,      0 },
+    { X86::VPMOVZXDQZ256rr,      X86::VPMOVZXDQZ256rm,      0 },
+    { X86::VPMOVZXWDZ256rr,      X86::VPMOVZXWDZ256rm,      0 },
+    { X86::VPMOVZXWQZ256rr,      X86::VPMOVZXWQZ256rm,      TB_NO_REVERSE },
+    { X86::VPSHUFDZ256ri,        X86::VPSHUFDZ256mi,        0 },
+    { X86::VPSHUFHWZ256ri,       X86::VPSHUFHWZ256mi,       0 },
+    { X86::VPSHUFLWZ256ri,       X86::VPSHUFLWZ256mi,       0 },
+
+    // AVX-512 foldable instructions (128-bit versions)
+    { X86::VBROADCASTSSZ128r,    X86::VBROADCASTSSZ128m,    TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ128r_s,  X86::VBROADCASTSSZ128m,    TB_NO_REVERSE },
+    { X86::VMOVAPDZ128rr,        X86::VMOVAPDZ128rm,        TB_ALIGN_16 },
+    { X86::VMOVAPSZ128rr,        X86::VMOVAPSZ128rm,        TB_ALIGN_16 },
+    { X86::VMOVDQA32Z128rr,      X86::VMOVDQA32Z128rm,      TB_ALIGN_16 },
+    { X86::VMOVDQA64Z128rr,      X86::VMOVDQA64Z128rm,      TB_ALIGN_16 },
+    { X86::VMOVDQU8Z128rr,       X86::VMOVDQU8Z128rm,       0 },
+    { X86::VMOVDQU16Z128rr,      X86::VMOVDQU16Z128rm,      0 },
+    { X86::VMOVDQU32Z128rr,      X86::VMOVDQU32Z128rm,      0 },
+    { X86::VMOVDQU64Z128rr,      X86::VMOVDQU64Z128rm,      0 },
+    { X86::VMOVUPDZ128rr,        X86::VMOVUPDZ128rm,        0 },
+    { X86::VMOVUPSZ128rr,        X86::VMOVUPSZ128rm,        0 },
+    { X86::VPERMILPDZ128ri,      X86::VPERMILPDZ128mi,      0 },
+    { X86::VPERMILPSZ128ri,      X86::VPERMILPSZ128mi,      0 },
+    { X86::VPMOVSXBDZ128rr,      X86::VPMOVSXBDZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ128rr,      X86::VPMOVSXBQZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ128rr,      X86::VPMOVSXBWZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXDQZ128rr,      X86::VPMOVSXDQZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXWDZ128rr,      X86::VPMOVSXWDZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVSXWQZ128rr,      X86::VPMOVSXWQZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ128rr,      X86::VPMOVZXBDZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ128rr,      X86::VPMOVZXBQZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ128rr,      X86::VPMOVZXBWZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXDQZ128rr,      X86::VPMOVZXDQZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXWDZ128rr,      X86::VPMOVZXWDZ128rm,      TB_NO_REVERSE },
+    { X86::VPMOVZXWQZ128rr,      X86::VPMOVZXWQZ128rm,      TB_NO_REVERSE },
+    { X86::VPSHUFDZ128ri,        X86::VPSHUFDZ128mi,        0 },
+    { X86::VPSHUFHWZ128ri,       X86::VPSHUFHWZ128mi,       0 },
+    { X86::VPSHUFLWZ128ri,       X86::VPSHUFLWZ128mi,       0 },
+
+    // F16C foldable instructions
+    { X86::VCVTPH2PSrr,        X86::VCVTPH2PSrm,            0 },
+    { X86::VCVTPH2PSYrr,       X86::VCVTPH2PSYrm,           0 },
+
+    // AES foldable instructions
+    { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
+    { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             0 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
+    AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp,
+                  // Index 1, folded load
+                  Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+  }
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
+    { X86::ADC32rr,         X86::ADC32rm,       0 },
+    { X86::ADC64rr,         X86::ADC64rm,       0 },
+    { X86::ADD16rr,         X86::ADD16rm,       0 },
+    { X86::ADD16rr_DB,      X86::ADD16rm,       TB_NO_REVERSE },
+    { X86::ADD32rr,         X86::ADD32rm,       0 },
+    { X86::ADD32rr_DB,      X86::ADD32rm,       TB_NO_REVERSE },
+    { X86::ADD64rr,         X86::ADD64rm,       0 },
+    { X86::ADD64rr_DB,      X86::ADD64rm,       TB_NO_REVERSE },
+    { X86::ADD8rr,          X86::ADD8rm,        0 },
+    { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
+    { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
+    { X86::ADDSDrr,         X86::ADDSDrm,       0 },
+    { X86::ADDSDrr_Int,     X86::ADDSDrm_Int,   TB_NO_REVERSE },
+    { X86::ADDSSrr,         X86::ADDSSrm,       0 },
+    { X86::ADDSSrr_Int,     X86::ADDSSrm_Int,   TB_NO_REVERSE },
+    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
+    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
+    { X86::AND16rr,         X86::AND16rm,       0 },
+    { X86::AND32rr,         X86::AND32rm,       0 },
+    { X86::AND64rr,         X86::AND64rm,       0 },
+    { X86::AND8rr,          X86::AND8rm,        0 },
+    { X86::ANDNPDrr,        X86::ANDNPDrm,      TB_ALIGN_16 },
+    { X86::ANDNPSrr,        X86::ANDNPSrm,      TB_ALIGN_16 },
+    { X86::ANDPDrr,         X86::ANDPDrm,       TB_ALIGN_16 },
+    { X86::ANDPSrr,         X86::ANDPSrm,       TB_ALIGN_16 },
+    { X86::BLENDPDrri,      X86::BLENDPDrmi,    TB_ALIGN_16 },
+    { X86::BLENDPSrri,      X86::BLENDPSrmi,    TB_ALIGN_16 },
+    { X86::BLENDVPDrr0,     X86::BLENDVPDrm0,   TB_ALIGN_16 },
+    { X86::BLENDVPSrr0,     X86::BLENDVPSrm0,   TB_ALIGN_16 },
+    { X86::CMOVA16rr,       X86::CMOVA16rm,     0 },
+    { X86::CMOVA32rr,       X86::CMOVA32rm,     0 },
+    { X86::CMOVA64rr,       X86::CMOVA64rm,     0 },
+    { X86::CMOVAE16rr,      X86::CMOVAE16rm,    0 },
+    { X86::CMOVAE32rr,      X86::CMOVAE32rm,    0 },
+    { X86::CMOVAE64rr,      X86::CMOVAE64rm,    0 },
+    { X86::CMOVB16rr,       X86::CMOVB16rm,     0 },
+    { X86::CMOVB32rr,       X86::CMOVB32rm,     0 },
+    { X86::CMOVB64rr,       X86::CMOVB64rm,     0 },
+    { X86::CMOVBE16rr,      X86::CMOVBE16rm,    0 },
+    { X86::CMOVBE32rr,      X86::CMOVBE32rm,    0 },
+    { X86::CMOVBE64rr,      X86::CMOVBE64rm,    0 },
+    { X86::CMOVE16rr,       X86::CMOVE16rm,     0 },
+    { X86::CMOVE32rr,       X86::CMOVE32rm,     0 },
+    { X86::CMOVE64rr,       X86::CMOVE64rm,     0 },
+    { X86::CMOVG16rr,       X86::CMOVG16rm,     0 },
+    { X86::CMOVG32rr,       X86::CMOVG32rm,     0 },
+    { X86::CMOVG64rr,       X86::CMOVG64rm,     0 },
+    { X86::CMOVGE16rr,      X86::CMOVGE16rm,    0 },
+    { X86::CMOVGE32rr,      X86::CMOVGE32rm,    0 },
+    { X86::CMOVGE64rr,      X86::CMOVGE64rm,    0 },
+    { X86::CMOVL16rr,       X86::CMOVL16rm,     0 },
+    { X86::CMOVL32rr,       X86::CMOVL32rm,     0 },
+    { X86::CMOVL64rr,       X86::CMOVL64rm,     0 },
+    { X86::CMOVLE16rr,      X86::CMOVLE16rm,    0 },
+    { X86::CMOVLE32rr,      X86::CMOVLE32rm,    0 },
+    { X86::CMOVLE64rr,      X86::CMOVLE64rm,    0 },
+    { X86::CMOVNE16rr,      X86::CMOVNE16rm,    0 },
+    { X86::CMOVNE32rr,      X86::CMOVNE32rm,    0 },
+    { X86::CMOVNE64rr,      X86::CMOVNE64rm,    0 },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm,    0 },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm,    0 },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm,    0 },
+    { X86::CMOVNP16rr,      X86::CMOVNP16rm,    0 },
+    { X86::CMOVNP32rr,      X86::CMOVNP32rm,    0 },
+    { X86::CMOVNP64rr,      X86::CMOVNP64rm,    0 },
+    { X86::CMOVNS16rr,      X86::CMOVNS16rm,    0 },
+    { X86::CMOVNS32rr,      X86::CMOVNS32rm,    0 },
+    { X86::CMOVNS64rr,      X86::CMOVNS64rm,    0 },
+    { X86::CMOVO16rr,       X86::CMOVO16rm,     0 },
+    { X86::CMOVO32rr,       X86::CMOVO32rm,     0 },
+    { X86::CMOVO64rr,       X86::CMOVO64rm,     0 },
+    { X86::CMOVP16rr,       X86::CMOVP16rm,     0 },
+    { X86::CMOVP32rr,       X86::CMOVP32rm,     0 },
+    { X86::CMOVP64rr,       X86::CMOVP64rm,     0 },
+    { X86::CMOVS16rr,       X86::CMOVS16rm,     0 },
+    { X86::CMOVS32rr,       X86::CMOVS32rm,     0 },
+    { X86::CMOVS64rr,       X86::CMOVS64rm,     0 },
+    { X86::CMPPDrri,        X86::CMPPDrmi,      TB_ALIGN_16 },
+    { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
+    { X86::CMPSDrr,         X86::CMPSDrm,       0 },
+    { X86::CMPSSrr,         X86::CMPSSrm,       0 },
+    { X86::CRC32r32r32,     X86::CRC32r32m32,   0 },
+    { X86::CRC32r64r64,     X86::CRC32r64m64,   0 },
+    { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
+    { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
+    { X86::DIVSDrr,         X86::DIVSDrm,       0 },
+    { X86::DIVSDrr_Int,     X86::DIVSDrm_Int,   TB_NO_REVERSE },
+    { X86::DIVSSrr,         X86::DIVSSrm,       0 },
+    { X86::DIVSSrr_Int,     X86::DIVSSrm_Int,   TB_NO_REVERSE },
+    { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
+    { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
+    { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
+    { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
+    { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
+    { X86::HSUBPSrr,        X86::HSUBPSrm,      TB_ALIGN_16 },
+    { X86::IMUL16rr,        X86::IMUL16rm,      0 },
+    { X86::IMUL32rr,        X86::IMUL32rm,      0 },
+    { X86::IMUL64rr,        X86::IMUL64rm,      0 },
+    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   TB_NO_REVERSE },
+    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   TB_NO_REVERSE },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      TB_NO_REVERSE },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      TB_NO_REVERSE },
+    { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
+    { X86::MAXCPDrr,        X86::MAXCPDrm,      TB_ALIGN_16 },
+    { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
+    { X86::MAXCPSrr,        X86::MAXCPSrm,      TB_ALIGN_16 },
+    { X86::MAXSDrr,         X86::MAXSDrm,       0 },
+    { X86::MAXCSDrr,        X86::MAXCSDrm,      0 },
+    { X86::MAXSDrr_Int,     X86::MAXSDrm_Int,   TB_NO_REVERSE },
+    { X86::MAXSSrr,         X86::MAXSSrm,       0 },
+    { X86::MAXCSSrr,        X86::MAXCSSrm,      0 },
+    { X86::MAXSSrr_Int,     X86::MAXSSrm_Int,   TB_NO_REVERSE },
+    { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
+    { X86::MINCPDrr,        X86::MINCPDrm,      TB_ALIGN_16 },
+    { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
+    { X86::MINCPSrr,        X86::MINCPSrm,      TB_ALIGN_16 },
+    { X86::MINSDrr,         X86::MINSDrm,       0 },
+    { X86::MINCSDrr,        X86::MINCSDrm,      0 },
+    { X86::MINSDrr_Int,     X86::MINSDrm_Int,   TB_NO_REVERSE },
+    { X86::MINSSrr,         X86::MINSSrm,       0 },
+    { X86::MINCSSrr,        X86::MINCSSrm,      0 },
+    { X86::MINSSrr_Int,     X86::MINSSrm_Int,   TB_NO_REVERSE },
+    { X86::MOVLHPSrr,       X86::MOVHPSrm,      TB_NO_REVERSE },
+    { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
+    { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
+    { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
+    { X86::MULSDrr,         X86::MULSDrm,       0 },
+    { X86::MULSDrr_Int,     X86::MULSDrm_Int,   TB_NO_REVERSE },
+    { X86::MULSSrr,         X86::MULSSrm,       0 },
+    { X86::MULSSrr_Int,     X86::MULSSrm_Int,   TB_NO_REVERSE },
+    { X86::OR16rr,          X86::OR16rm,        0 },
+    { X86::OR32rr,          X86::OR32rm,        0 },
+    { X86::OR64rr,          X86::OR64rm,        0 },
+    { X86::OR8rr,           X86::OR8rm,         0 },
+    { X86::ORPDrr,          X86::ORPDrm,        TB_ALIGN_16 },
+    { X86::ORPSrr,          X86::ORPSrm,        TB_ALIGN_16 },
+    { X86::PACKSSDWrr,      X86::PACKSSDWrm,    TB_ALIGN_16 },
+    { X86::PACKSSWBrr,      X86::PACKSSWBrm,    TB_ALIGN_16 },
+    { X86::PACKUSDWrr,      X86::PACKUSDWrm,    TB_ALIGN_16 },
+    { X86::PACKUSWBrr,      X86::PACKUSWBrm,    TB_ALIGN_16 },
+    { X86::PADDBrr,         X86::PADDBrm,       TB_ALIGN_16 },
+    { X86::PADDDrr,         X86::PADDDrm,       TB_ALIGN_16 },
+    { X86::PADDQrr,         X86::PADDQrm,       TB_ALIGN_16 },
+    { X86::PADDSBrr,        X86::PADDSBrm,      TB_ALIGN_16 },
+    { X86::PADDSWrr,        X86::PADDSWrm,      TB_ALIGN_16 },
+    { X86::PADDUSBrr,       X86::PADDUSBrm,     TB_ALIGN_16 },
+    { X86::PADDUSWrr,       X86::PADDUSWrm,     TB_ALIGN_16 },
+    { X86::PADDWrr,         X86::PADDWrm,       TB_ALIGN_16 },
+    { X86::PALIGNRrri,      X86::PALIGNRrmi,    TB_ALIGN_16 },
+    { X86::PANDNrr,         X86::PANDNrm,       TB_ALIGN_16 },
+    { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
+    { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
+    { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
+    { X86::PBLENDVBrr0,     X86::PBLENDVBrm0,   TB_ALIGN_16 },
+    { X86::PBLENDWrri,      X86::PBLENDWrmi,    TB_ALIGN_16 },
+    { X86::PCLMULQDQrr,     X86::PCLMULQDQrm,   TB_ALIGN_16 },
+    { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
+    { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
+    { X86::PCMPEQQrr,       X86::PCMPEQQrm,     TB_ALIGN_16 },
+    { X86::PCMPEQWrr,       X86::PCMPEQWrm,     TB_ALIGN_16 },
+    { X86::PCMPGTBrr,       X86::PCMPGTBrm,     TB_ALIGN_16 },
+    { X86::PCMPGTDrr,       X86::PCMPGTDrm,     TB_ALIGN_16 },
+    { X86::PCMPGTQrr,       X86::PCMPGTQrm,     TB_ALIGN_16 },
+    { X86::PCMPGTWrr,       X86::PCMPGTWrm,     TB_ALIGN_16 },
+    { X86::PHADDDrr,        X86::PHADDDrm,      TB_ALIGN_16 },
+    { X86::PHADDWrr,        X86::PHADDWrm,      TB_ALIGN_16 },
+    { X86::PHADDSWrr128,    X86::PHADDSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBDrr,        X86::PHSUBDrm,      TB_ALIGN_16 },
+    { X86::PHSUBSWrr128,    X86::PHSUBSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBWrr,        X86::PHSUBWrm,      TB_ALIGN_16 },
+    { X86::PINSRBrr,        X86::PINSRBrm,      0 },
+    { X86::PINSRDrr,        X86::PINSRDrm,      0 },
+    { X86::PINSRQrr,        X86::PINSRQrm,      0 },
+    { X86::PINSRWrri,       X86::PINSRWrmi,     0 },
+    { X86::PMADDUBSWrr,     X86::PMADDUBSWrm,   TB_ALIGN_16 },
+    { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
+    { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
+    { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
+    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
+    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
+    { X86::PMINSBrr,        X86::PMINSBrm,      TB_ALIGN_16 },
+    { X86::PMINSDrr,        X86::PMINSDrm,      TB_ALIGN_16 },
+    { X86::PMINUDrr,        X86::PMINUDrm,      TB_ALIGN_16 },
+    { X86::PMINUWrr,        X86::PMINUWrm,      TB_ALIGN_16 },
+    { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
+    { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
+    { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
+    { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
+    { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
+    { X86::PMULHRSWrr,      X86::PMULHRSWrm,    TB_ALIGN_16 },
+    { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
+    { X86::PMULHWrr,        X86::PMULHWrm,      TB_ALIGN_16 },
+    { X86::PMULLDrr,        X86::PMULLDrm,      TB_ALIGN_16 },
+    { X86::PMULLWrr,        X86::PMULLWrm,      TB_ALIGN_16 },
+    { X86::PMULUDQrr,       X86::PMULUDQrm,     TB_ALIGN_16 },
+    { X86::PORrr,           X86::PORrm,         TB_ALIGN_16 },
+    { X86::PSADBWrr,        X86::PSADBWrm,      TB_ALIGN_16 },
+    { X86::PSHUFBrr,        X86::PSHUFBrm,      TB_ALIGN_16 },
+    { X86::PSIGNBrr128,     X86::PSIGNBrm128,   TB_ALIGN_16 },
+    { X86::PSIGNWrr128,     X86::PSIGNWrm128,   TB_ALIGN_16 },
+    { X86::PSIGNDrr128,     X86::PSIGNDrm128,   TB_ALIGN_16 },
+    { X86::PSLLDrr,         X86::PSLLDrm,       TB_ALIGN_16 },
+    { X86::PSLLQrr,         X86::PSLLQrm,       TB_ALIGN_16 },
+    { X86::PSLLWrr,         X86::PSLLWrm,       TB_ALIGN_16 },
+    { X86::PSRADrr,         X86::PSRADrm,       TB_ALIGN_16 },
+    { X86::PSRAWrr,         X86::PSRAWrm,       TB_ALIGN_16 },
+    { X86::PSRLDrr,         X86::PSRLDrm,       TB_ALIGN_16 },
+    { X86::PSRLQrr,         X86::PSRLQrm,       TB_ALIGN_16 },
+    { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
+    { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
+    { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
+    { X86::PSUBQrr,         X86::PSUBQrm,       TB_ALIGN_16 },
+    { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
+    { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
+    { X86::PSUBUSBrr,       X86::PSUBUSBrm,     TB_ALIGN_16 },
+    { X86::PSUBUSWrr,       X86::PSUBUSWrm,     TB_ALIGN_16 },
+    { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
+    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
+    { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
+    { X86::ROUNDSDr_Int,    X86::ROUNDSDm_Int,  TB_NO_REVERSE },
+    { X86::ROUNDSSr_Int,    X86::ROUNDSSm_Int,  TB_NO_REVERSE },
+    { X86::SBB32rr,         X86::SBB32rm,       0 },
+    { X86::SBB64rr,         X86::SBB64rm,       0 },
+    { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
+    { X86::SHUFPSrri,       X86::SHUFPSrmi,     TB_ALIGN_16 },
+    { X86::SUB16rr,         X86::SUB16rm,       0 },
+    { X86::SUB32rr,         X86::SUB32rm,       0 },
+    { X86::SUB64rr,         X86::SUB64rm,       0 },
+    { X86::SUB8rr,          X86::SUB8rm,        0 },
+    { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
+    { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
+    { X86::SUBSDrr,         X86::SUBSDrm,       0 },
+    { X86::SUBSDrr_Int,     X86::SUBSDrm_Int,   TB_NO_REVERSE },
+    { X86::SUBSSrr,         X86::SUBSSrm,       0 },
+    { X86::SUBSSrr_Int,     X86::SUBSSrm_Int,   TB_NO_REVERSE },
+    // FIXME: TEST*rr -> swapped operand of TEST*mr.
+    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm,    TB_ALIGN_16 },
+    { X86::XOR16rr,         X86::XOR16rm,       0 },
+    { X86::XOR32rr,         X86::XOR32rm,       0 },
+    { X86::XOR64rr,         X86::XOR64rm,       0 },
+    { X86::XOR8rr,          X86::XOR8rm,        0 },
+    { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
+    { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+
+    // MMX version of foldable instructions
+    { X86::MMX_CVTPI2PSirr,   X86::MMX_CVTPI2PSirm,   0 },
+    { X86::MMX_PACKSSDWirr,   X86::MMX_PACKSSDWirm,   0 },
+    { X86::MMX_PACKSSWBirr,   X86::MMX_PACKSSWBirm,   0 },
+    { X86::MMX_PACKUSWBirr,   X86::MMX_PACKUSWBirm,   0 },
+    { X86::MMX_PADDBirr,      X86::MMX_PADDBirm,      0 },
+    { X86::MMX_PADDDirr,      X86::MMX_PADDDirm,      0 },
+    { X86::MMX_PADDQirr,      X86::MMX_PADDQirm,      0 },
+    { X86::MMX_PADDSBirr,     X86::MMX_PADDSBirm,     0 },
+    { X86::MMX_PADDSWirr,     X86::MMX_PADDSWirm,     0 },
+    { X86::MMX_PADDUSBirr,    X86::MMX_PADDUSBirm,    0 },
+    { X86::MMX_PADDUSWirr,    X86::MMX_PADDUSWirm,    0 },
+    { X86::MMX_PADDWirr,      X86::MMX_PADDWirm,      0 },
+    { X86::MMX_PALIGNR64irr,  X86::MMX_PALIGNR64irm,  0 },
+    { X86::MMX_PANDNirr,      X86::MMX_PANDNirm,      0 },
+    { X86::MMX_PANDirr,       X86::MMX_PANDirm,       0 },
+    { X86::MMX_PAVGBirr,      X86::MMX_PAVGBirm,      0 },
+    { X86::MMX_PAVGWirr,      X86::MMX_PAVGWirm,      0 },
+    { X86::MMX_PCMPEQBirr,    X86::MMX_PCMPEQBirm,    0 },
+    { X86::MMX_PCMPEQDirr,    X86::MMX_PCMPEQDirm,    0 },
+    { X86::MMX_PCMPEQWirr,    X86::MMX_PCMPEQWirm,    0 },
+    { X86::MMX_PCMPGTBirr,    X86::MMX_PCMPGTBirm,    0 },
+    { X86::MMX_PCMPGTDirr,    X86::MMX_PCMPGTDirm,    0 },
+    { X86::MMX_PCMPGTWirr,    X86::MMX_PCMPGTWirm,    0 },
+    { X86::MMX_PHADDSWrr64,   X86::MMX_PHADDSWrm64,   0 },
+    { X86::MMX_PHADDWrr64,    X86::MMX_PHADDWrm64,    0 },
+    { X86::MMX_PHADDrr64,     X86::MMX_PHADDrm64,     0 },
+    { X86::MMX_PHSUBDrr64,    X86::MMX_PHSUBDrm64,    0 },
+    { X86::MMX_PHSUBSWrr64,   X86::MMX_PHSUBSWrm64,   0 },
+    { X86::MMX_PHSUBWrr64,    X86::MMX_PHSUBWrm64,    0 },
+    { X86::MMX_PINSRWirri,    X86::MMX_PINSRWirmi,    0 },
+    { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
+    { X86::MMX_PMADDWDirr,    X86::MMX_PMADDWDirm,    0 },
+    { X86::MMX_PMAXSWirr,     X86::MMX_PMAXSWirm,     0 },
+    { X86::MMX_PMAXUBirr,     X86::MMX_PMAXUBirm,     0 },
+    { X86::MMX_PMINSWirr,     X86::MMX_PMINSWirm,     0 },
+    { X86::MMX_PMINUBirr,     X86::MMX_PMINUBirm,     0 },
+    { X86::MMX_PMULHRSWrr64,  X86::MMX_PMULHRSWrm64,  0 },
+    { X86::MMX_PMULHUWirr,    X86::MMX_PMULHUWirm,    0 },
+    { X86::MMX_PMULHWirr,     X86::MMX_PMULHWirm,     0 },
+    { X86::MMX_PMULLWirr,     X86::MMX_PMULLWirm,     0 },
+    { X86::MMX_PMULUDQirr,    X86::MMX_PMULUDQirm,    0 },
+    { X86::MMX_PORirr,        X86::MMX_PORirm,        0 },
+    { X86::MMX_PSADBWirr,     X86::MMX_PSADBWirm,     0 },
+    { X86::MMX_PSHUFBrr64,    X86::MMX_PSHUFBrm64,    0 },
+    { X86::MMX_PSIGNBrr64,    X86::MMX_PSIGNBrm64,    0 },
+    { X86::MMX_PSIGNDrr64,    X86::MMX_PSIGNDrm64,    0 },
+    { X86::MMX_PSIGNWrr64,    X86::MMX_PSIGNWrm64,    0 },
+    { X86::MMX_PSLLDrr,       X86::MMX_PSLLDrm,       0 },
+    { X86::MMX_PSLLQrr,       X86::MMX_PSLLQrm,       0 },
+    { X86::MMX_PSLLWrr,       X86::MMX_PSLLWrm,       0 },
+    { X86::MMX_PSRADrr,       X86::MMX_PSRADrm,       0 },
+    { X86::MMX_PSRAWrr,       X86::MMX_PSRAWrm,       0 },
+    { X86::MMX_PSRLDrr,       X86::MMX_PSRLDrm,       0 },
+    { X86::MMX_PSRLQrr,       X86::MMX_PSRLQrm,       0 },
+    { X86::MMX_PSRLWrr,       X86::MMX_PSRLWrm,       0 },
+    { X86::MMX_PSUBBirr,      X86::MMX_PSUBBirm,      0 },
+    { X86::MMX_PSUBDirr,      X86::MMX_PSUBDirm,      0 },
+    { X86::MMX_PSUBQirr,      X86::MMX_PSUBQirm,      0 },
+    { X86::MMX_PSUBSBirr,     X86::MMX_PSUBSBirm,     0 },
+    { X86::MMX_PSUBSWirr,     X86::MMX_PSUBSWirm,     0 },
+    { X86::MMX_PSUBUSBirr,    X86::MMX_PSUBUSBirm,    0 },
+    { X86::MMX_PSUBUSWirr,    X86::MMX_PSUBUSWirm,    0 },
+    { X86::MMX_PSUBWirr,      X86::MMX_PSUBWirm,      0 },
+    { X86::MMX_PUNPCKHBWirr,  X86::MMX_PUNPCKHBWirm,  0 },
+    { X86::MMX_PUNPCKHDQirr,  X86::MMX_PUNPCKHDQirm,  0 },
+    { X86::MMX_PUNPCKHWDirr,  X86::MMX_PUNPCKHWDirm,  0 },
+    { X86::MMX_PUNPCKLBWirr,  X86::MMX_PUNPCKLBWirm,  0 },
+    { X86::MMX_PUNPCKLDQirr,  X86::MMX_PUNPCKLDQirm,  0 },
+    { X86::MMX_PUNPCKLWDirr,  X86::MMX_PUNPCKLWDirm,  0 },
+    { X86::MMX_PXORirr,       X86::MMX_PXORirm,       0 },
+
+    // 3DNow! version of foldable instructions
+    { X86::PAVGUSBrr,         X86::PAVGUSBrm,         0 },
+    { X86::PFACCrr,           X86::PFACCrm,           0 },
+    { X86::PFADDrr,           X86::PFADDrm,           0 },
+    { X86::PFCMPEQrr,         X86::PFCMPEQrm,         0 },
+    { X86::PFCMPGErr,         X86::PFCMPGErm,         0 },
+    { X86::PFCMPGTrr,         X86::PFCMPGTrm,         0 },
+    { X86::PFMAXrr,           X86::PFMAXrm,           0 },
+    { X86::PFMINrr,           X86::PFMINrm,           0 },
+    { X86::PFMULrr,           X86::PFMULrm,           0 },
+    { X86::PFNACCrr,          X86::PFNACCrm,          0 },
+    { X86::PFPNACCrr,         X86::PFPNACCrm,         0 },
+    { X86::PFRCPIT1rr,        X86::PFRCPIT1rm,        0 },
+    { X86::PFRCPIT2rr,        X86::PFRCPIT2rm,        0 },
+    { X86::PFRSQIT1rr,        X86::PFRSQIT1rm,        0 },
+    { X86::PFSUBrr,           X86::PFSUBrm,           0 },
+    { X86::PFSUBRrr,          X86::PFSUBRrm,          0 },
+    { X86::PMULHRWrr,         X86::PMULHRWrm,         0 },
+
+    // AVX 128-bit versions of foldable instructions
+    { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
+    { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    TB_NO_REVERSE },
+    { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
+    { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
+    { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
+    { X86::Int_VCVTSI2SDrr,   X86::Int_VCVTSI2SDrm,    0 },
+    { X86::VCVTSI2SS64rr,     X86::VCVTSI2SS64rm,      0 },
+    { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
+    { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
+    { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
+    { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
+    { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    TB_NO_REVERSE },
+    { X86::VADDPDrr,          X86::VADDPDrm,           0 },
+    { X86::VADDPSrr,          X86::VADDPSrm,           0 },
+    { X86::VADDSDrr,          X86::VADDSDrm,           0 },
+    { X86::VADDSDrr_Int,      X86::VADDSDrm_Int,       TB_NO_REVERSE },
+    { X86::VADDSSrr,          X86::VADDSSrm,           0 },
+    { X86::VADDSSrr_Int,      X86::VADDSSrm_Int,       TB_NO_REVERSE },
+    { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        0 },
+    { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        0 },
+    { X86::VANDNPDrr,         X86::VANDNPDrm,          0 },
+    { X86::VANDNPSrr,         X86::VANDNPSrm,          0 },
+    { X86::VANDPDrr,          X86::VANDPDrm,           0 },
+    { X86::VANDPSrr,          X86::VANDPSrm,           0 },
+    { X86::VBLENDPDrri,       X86::VBLENDPDrmi,        0 },
+    { X86::VBLENDPSrri,       X86::VBLENDPSrmi,        0 },
+    { X86::VBLENDVPDrr,       X86::VBLENDVPDrm,        0 },
+    { X86::VBLENDVPSrr,       X86::VBLENDVPSrm,        0 },
+    { X86::VCMPPDrri,         X86::VCMPPDrmi,          0 },
+    { X86::VCMPPSrri,         X86::VCMPPSrmi,          0 },
+    { X86::VCMPSDrr,          X86::VCMPSDrm,           0 },
+    { X86::VCMPSSrr,          X86::VCMPSSrm,           0 },
+    { X86::VDIVPDrr,          X86::VDIVPDrm,           0 },
+    { X86::VDIVPSrr,          X86::VDIVPSrm,           0 },
+    { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
+    { X86::VDIVSDrr_Int,      X86::VDIVSDrm_Int,       TB_NO_REVERSE },
+    { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
+    { X86::VDIVSSrr_Int,      X86::VDIVSSrm_Int,       TB_NO_REVERSE },
+    { X86::VDPPDrri,          X86::VDPPDrmi,           0 },
+    { X86::VDPPSrri,          X86::VDPPSrmi,           0 },
+    { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
+    { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
+    { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
+    { X86::VHSUBPSrr,         X86::VHSUBPSrm,          0 },
+    { X86::Int_VCMPSDrr,      X86::Int_VCMPSDrm,       TB_NO_REVERSE },
+    { X86::Int_VCMPSSrr,      X86::Int_VCMPSSrm,       TB_NO_REVERSE },
+    { X86::VMAXCPDrr,         X86::VMAXCPDrm,          0 },
+    { X86::VMAXCPSrr,         X86::VMAXCPSrm,          0 },
+    { X86::VMAXCSDrr,         X86::VMAXCSDrm,          0 },
+    { X86::VMAXCSSrr,         X86::VMAXCSSrm,          0 },
+    { X86::VMAXPDrr,          X86::VMAXPDrm,           0 },
+    { X86::VMAXPSrr,          X86::VMAXPSrm,           0 },
+    { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
+    { X86::VMAXSDrr_Int,      X86::VMAXSDrm_Int,       TB_NO_REVERSE },
+    { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
+    { X86::VMAXSSrr_Int,      X86::VMAXSSrm_Int,       TB_NO_REVERSE },
+    { X86::VMINCPDrr,         X86::VMINCPDrm,          0 },
+    { X86::VMINCPSrr,         X86::VMINCPSrm,          0 },
+    { X86::VMINCSDrr,         X86::VMINCSDrm,          0 },
+    { X86::VMINCSSrr,         X86::VMINCSSrm,          0 },
+    { X86::VMINPDrr,          X86::VMINPDrm,           0 },
+    { X86::VMINPSrr,          X86::VMINPSrm,           0 },
+    { X86::VMINSDrr,          X86::VMINSDrm,           0 },
+    { X86::VMINSDrr_Int,      X86::VMINSDrm_Int,       TB_NO_REVERSE },
+    { X86::VMINSSrr,          X86::VMINSSrm,           0 },
+    { X86::VMINSSrr_Int,      X86::VMINSSrm_Int,       TB_NO_REVERSE },
+    { X86::VMOVLHPSrr,        X86::VMOVHPSrm,          TB_NO_REVERSE },
+    { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
+    { X86::VMULPDrr,          X86::VMULPDrm,           0 },
+    { X86::VMULPSrr,          X86::VMULPSrm,           0 },
+    { X86::VMULSDrr,          X86::VMULSDrm,           0 },
+    { X86::VMULSDrr_Int,      X86::VMULSDrm_Int,       TB_NO_REVERSE },
+    { X86::VMULSSrr,          X86::VMULSSrm,           0 },
+    { X86::VMULSSrr_Int,      X86::VMULSSrm_Int,       TB_NO_REVERSE },
+    { X86::VORPDrr,           X86::VORPDrm,            0 },
+    { X86::VORPSrr,           X86::VORPSrm,            0 },
+    { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        0 },
+    { X86::VPACKSSWBrr,       X86::VPACKSSWBrm,        0 },
+    { X86::VPACKUSDWrr,       X86::VPACKUSDWrm,        0 },
+    { X86::VPACKUSWBrr,       X86::VPACKUSWBrm,        0 },
+    { X86::VPADDBrr,          X86::VPADDBrm,           0 },
+    { X86::VPADDDrr,          X86::VPADDDrm,           0 },
+    { X86::VPADDQrr,          X86::VPADDQrm,           0 },
+    { X86::VPADDSBrr,         X86::VPADDSBrm,          0 },
+    { X86::VPADDSWrr,         X86::VPADDSWrm,          0 },
+    { X86::VPADDUSBrr,        X86::VPADDUSBrm,         0 },
+    { X86::VPADDUSWrr,        X86::VPADDUSWrm,         0 },
+    { X86::VPADDWrr,          X86::VPADDWrm,           0 },
+    { X86::VPALIGNRrri,       X86::VPALIGNRrmi,        0 },
+    { X86::VPANDNrr,          X86::VPANDNrm,           0 },
+    { X86::VPANDrr,           X86::VPANDrm,            0 },
+    { X86::VPAVGBrr,          X86::VPAVGBrm,           0 },
+    { X86::VPAVGWrr,          X86::VPAVGWrm,           0 },
+    { X86::VPBLENDVBrr,       X86::VPBLENDVBrm,        0 },
+    { X86::VPBLENDWrri,       X86::VPBLENDWrmi,        0 },
+    { X86::VPCLMULQDQrr,      X86::VPCLMULQDQrm,       0 },
+    { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         0 },
+    { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         0 },
+    { X86::VPCMPEQQrr,        X86::VPCMPEQQrm,         0 },
+    { X86::VPCMPEQWrr,        X86::VPCMPEQWrm,         0 },
+    { X86::VPCMPGTBrr,        X86::VPCMPGTBrm,         0 },
+    { X86::VPCMPGTDrr,        X86::VPCMPGTDrm,         0 },
+    { X86::VPCMPGTQrr,        X86::VPCMPGTQrm,         0 },
+    { X86::VPCMPGTWrr,        X86::VPCMPGTWrm,         0 },
+    { X86::VPHADDDrr,         X86::VPHADDDrm,          0 },
+    { X86::VPHADDSWrr128,     X86::VPHADDSWrm128,      0 },
+    { X86::VPHADDWrr,         X86::VPHADDWrm,          0 },
+    { X86::VPHSUBDrr,         X86::VPHSUBDrm,          0 },
+    { X86::VPHSUBSWrr128,     X86::VPHSUBSWrm128,      0 },
+    { X86::VPHSUBWrr,         X86::VPHSUBWrm,          0 },
+    { X86::VPERMILPDrr,       X86::VPERMILPDrm,        0 },
+    { X86::VPERMILPSrr,       X86::VPERMILPSrm,        0 },
+    { X86::VPINSRBrr,         X86::VPINSRBrm,          0 },
+    { X86::VPINSRDrr,         X86::VPINSRDrm,          0 },
+    { X86::VPINSRQrr,         X86::VPINSRQrm,          0 },
+    { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
+    { X86::VPMADDUBSWrr,      X86::VPMADDUBSWrm,       0 },
+    { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
+    { X86::VPMAXSWrr,         X86::VPMAXSWrm,          0 },
+    { X86::VPMAXUBrr,         X86::VPMAXUBrm,          0 },
+    { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
+    { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
+    { X86::VPMINSBrr,         X86::VPMINSBrm,          0 },
+    { X86::VPMINSDrr,         X86::VPMINSDrm,          0 },
+    { X86::VPMINUDrr,         X86::VPMINUDrm,          0 },
+    { X86::VPMINUWrr,         X86::VPMINUWrm,          0 },
+    { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
+    { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
+    { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
+    { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
+    { X86::VPMULDQrr,         X86::VPMULDQrm,          0 },
+    { X86::VPMULHRSWrr,       X86::VPMULHRSWrm,        0 },
+    { X86::VPMULHUWrr,        X86::VPMULHUWrm,         0 },
+    { X86::VPMULHWrr,         X86::VPMULHWrm,          0 },
+    { X86::VPMULLDrr,         X86::VPMULLDrm,          0 },
+    { X86::VPMULLWrr,         X86::VPMULLWrm,          0 },
+    { X86::VPMULUDQrr,        X86::VPMULUDQrm,         0 },
+    { X86::VPORrr,            X86::VPORrm,             0 },
+    { X86::VPSADBWrr,         X86::VPSADBWrm,          0 },
+    { X86::VPSHUFBrr,         X86::VPSHUFBrm,          0 },
+    { X86::VPSIGNBrr128,      X86::VPSIGNBrm128,       0 },
+    { X86::VPSIGNWrr128,      X86::VPSIGNWrm128,       0 },
+    { X86::VPSIGNDrr128,      X86::VPSIGNDrm128,       0 },
+    { X86::VPSLLDrr,          X86::VPSLLDrm,           0 },
+    { X86::VPSLLQrr,          X86::VPSLLQrm,           0 },
+    { X86::VPSLLWrr,          X86::VPSLLWrm,           0 },
+    { X86::VPSRADrr,          X86::VPSRADrm,           0 },
+    { X86::VPSRAWrr,          X86::VPSRAWrm,           0 },
+    { X86::VPSRLDrr,          X86::VPSRLDrm,           0 },
+    { X86::VPSRLQrr,          X86::VPSRLQrm,           0 },
+    { X86::VPSRLWrr,          X86::VPSRLWrm,           0 },
+    { X86::VPSUBBrr,          X86::VPSUBBrm,           0 },
+    { X86::VPSUBDrr,          X86::VPSUBDrm,           0 },
+    { X86::VPSUBQrr,          X86::VPSUBQrm,           0 },
+    { X86::VPSUBSBrr,         X86::VPSUBSBrm,          0 },
+    { X86::VPSUBSWrr,         X86::VPSUBSWrm,          0 },
+    { X86::VPSUBUSBrr,        X86::VPSUBUSBrm,         0 },
+    { X86::VPSUBUSWrr,        X86::VPSUBUSWrm,         0 },
+    { X86::VPSUBWrr,          X86::VPSUBWrm,           0 },
+    { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       0 },
+    { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       0 },
+    { X86::VPUNPCKHQDQrr,     X86::VPUNPCKHQDQrm,      0 },
+    { X86::VPUNPCKHWDrr,      X86::VPUNPCKHWDrm,       0 },
+    { X86::VPUNPCKLBWrr,      X86::VPUNPCKLBWrm,       0 },
+    { X86::VPUNPCKLDQrr,      X86::VPUNPCKLDQrm,       0 },
+    { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      0 },
+    { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       0 },
+    { X86::VPXORrr,           X86::VPXORrm,            0 },
+    { X86::VRCPSSr,           X86::VRCPSSm,            0 },
+    { X86::VRCPSSr_Int,       X86::VRCPSSm_Int,        TB_NO_REVERSE },
+    { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
+    { X86::VRSQRTSSr_Int,     X86::VRSQRTSSm_Int,      TB_NO_REVERSE },
+    { X86::VROUNDSDr,         X86::VROUNDSDm,          0 },
+    { X86::VROUNDSDr_Int,     X86::VROUNDSDm_Int,      TB_NO_REVERSE },
+    { X86::VROUNDSSr,         X86::VROUNDSSm,          0 },
+    { X86::VROUNDSSr_Int,     X86::VROUNDSSm_Int,      TB_NO_REVERSE },
+    { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         0 },
+    { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         0 },
+    { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
+    { X86::VSQRTSDr_Int,      X86::VSQRTSDm_Int,       TB_NO_REVERSE },
+    { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
+    { X86::VSQRTSSr_Int,      X86::VSQRTSSm_Int,       TB_NO_REVERSE },
+    { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
+    { X86::VSUBPSrr,          X86::VSUBPSrm,           0 },
+    { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
+    { X86::VSUBSDrr_Int,      X86::VSUBSDrm_Int,       TB_NO_REVERSE },
+    { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
+    { X86::VSUBSSrr_Int,      X86::VSUBSSrm_Int,       TB_NO_REVERSE },
+    { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        0 },
+    { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        0 },
+    { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        0 },
+    { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
+    { X86::VXORPDrr,          X86::VXORPDrm,           0 },
+    { X86::VXORPSrr,          X86::VXORPSrm,           0 },
+
+    // AVX 256-bit foldable instructions
+    { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
+    { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
+    { X86::VADDSUBPDYrr,      X86::VADDSUBPDYrm,       0 },
+    { X86::VADDSUBPSYrr,      X86::VADDSUBPSYrm,       0 },
+    { X86::VANDNPDYrr,        X86::VANDNPDYrm,         0 },
+    { X86::VANDNPSYrr,        X86::VANDNPSYrm,         0 },
+    { X86::VANDPDYrr,         X86::VANDPDYrm,          0 },
+    { X86::VANDPSYrr,         X86::VANDPSYrm,          0 },
+    { X86::VBLENDPDYrri,      X86::VBLENDPDYrmi,       0 },
+    { X86::VBLENDPSYrri,      X86::VBLENDPSYrmi,       0 },
+    { X86::VBLENDVPDYrr,      X86::VBLENDVPDYrm,       0 },
+    { X86::VBLENDVPSYrr,      X86::VBLENDVPSYrm,       0 },
+    { X86::VCMPPDYrri,        X86::VCMPPDYrmi,         0 },
+    { X86::VCMPPSYrri,        X86::VCMPPSYrmi,         0 },
+    { X86::VDIVPDYrr,         X86::VDIVPDYrm,          0 },
+    { X86::VDIVPSYrr,         X86::VDIVPSYrm,          0 },
+    { X86::VDPPSYrri,         X86::VDPPSYrmi,          0 },
+    { X86::VHADDPDYrr,        X86::VHADDPDYrm,         0 },
+    { X86::VHADDPSYrr,        X86::VHADDPSYrm,         0 },
+    { X86::VHSUBPDYrr,        X86::VHSUBPDYrm,         0 },
+    { X86::VHSUBPSYrr,        X86::VHSUBPSYrm,         0 },
+    { X86::VINSERTF128rr,     X86::VINSERTF128rm,      0 },
+    { X86::VMAXCPDYrr,        X86::VMAXCPDYrm,         0 },
+    { X86::VMAXCPSYrr,        X86::VMAXCPSYrm,         0 },
+    { X86::VMAXPDYrr,         X86::VMAXPDYrm,          0 },
+    { X86::VMAXPSYrr,         X86::VMAXPSYrm,          0 },
+    { X86::VMINCPDYrr,        X86::VMINCPDYrm,         0 },
+    { X86::VMINCPSYrr,        X86::VMINCPSYrm,         0 },
+    { X86::VMINPDYrr,         X86::VMINPDYrm,          0 },
+    { X86::VMINPSYrr,         X86::VMINPSYrm,          0 },
+    { X86::VMULPDYrr,         X86::VMULPDYrm,          0 },
+    { X86::VMULPSYrr,         X86::VMULPSYrm,          0 },
+    { X86::VORPDYrr,          X86::VORPDYrm,           0 },
+    { X86::VORPSYrr,          X86::VORPSYrm,           0 },
+    { X86::VPERM2F128rr,      X86::VPERM2F128rm,       0 },
+    { X86::VPERMILPDYrr,      X86::VPERMILPDYrm,       0 },
+    { X86::VPERMILPSYrr,      X86::VPERMILPSYrm,       0 },
+    { X86::VSHUFPDYrri,       X86::VSHUFPDYrmi,        0 },
+    { X86::VSHUFPSYrri,       X86::VSHUFPSYrmi,        0 },
+    { X86::VSUBPDYrr,         X86::VSUBPDYrm,          0 },
+    { X86::VSUBPSYrr,         X86::VSUBPSYrm,          0 },
+    { X86::VUNPCKHPDYrr,      X86::VUNPCKHPDYrm,       0 },
+    { X86::VUNPCKHPSYrr,      X86::VUNPCKHPSYrm,       0 },
+    { X86::VUNPCKLPDYrr,      X86::VUNPCKLPDYrm,       0 },
+    { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
+    { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
+    { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
+
+    // AVX2 foldable instructions
+    { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
+    { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
+    { X86::VPACKSSWBYrr,      X86::VPACKSSWBYrm,       0 },
+    { X86::VPACKUSDWYrr,      X86::VPACKUSDWYrm,       0 },
+    { X86::VPACKUSWBYrr,      X86::VPACKUSWBYrm,       0 },
+    { X86::VPADDBYrr,         X86::VPADDBYrm,          0 },
+    { X86::VPADDDYrr,         X86::VPADDDYrm,          0 },
+    { X86::VPADDQYrr,         X86::VPADDQYrm,          0 },
+    { X86::VPADDSBYrr,        X86::VPADDSBYrm,         0 },
+    { X86::VPADDSWYrr,        X86::VPADDSWYrm,         0 },
+    { X86::VPADDUSBYrr,       X86::VPADDUSBYrm,        0 },
+    { X86::VPADDUSWYrr,       X86::VPADDUSWYrm,        0 },
+    { X86::VPADDWYrr,         X86::VPADDWYrm,          0 },
+    { X86::VPALIGNRYrri,      X86::VPALIGNRYrmi,       0 },
+    { X86::VPANDNYrr,         X86::VPANDNYrm,          0 },
+    { X86::VPANDYrr,          X86::VPANDYrm,           0 },
+    { X86::VPAVGBYrr,         X86::VPAVGBYrm,          0 },
+    { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
+    { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
+    { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
+    { X86::VPBLENDVBYrr,      X86::VPBLENDVBYrm,       0 },
+    { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
+    { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
+    { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
+    { X86::VPCMPEQQYrr,       X86::VPCMPEQQYrm,        0 },
+    { X86::VPCMPEQWYrr,       X86::VPCMPEQWYrm,        0 },
+    { X86::VPCMPGTBYrr,       X86::VPCMPGTBYrm,        0 },
+    { X86::VPCMPGTDYrr,       X86::VPCMPGTDYrm,        0 },
+    { X86::VPCMPGTQYrr,       X86::VPCMPGTQYrm,        0 },
+    { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
+    { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
+    { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
+    { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
+    { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
+    { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
+    { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
+    { X86::VPHSUBDYrr,        X86::VPHSUBDYrm,         0 },
+    { X86::VPHSUBSWrr256,     X86::VPHSUBSWrm256,      0 },
+    { X86::VPHSUBWYrr,        X86::VPHSUBWYrm,         0 },
+    { X86::VPMADDUBSWYrr,     X86::VPMADDUBSWYrm,      0 },
+    { X86::VPMADDWDYrr,       X86::VPMADDWDYrm,        0 },
+    { X86::VPMAXSWYrr,        X86::VPMAXSWYrm,         0 },
+    { X86::VPMAXUBYrr,        X86::VPMAXUBYrm,         0 },
+    { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
+    { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
+    { X86::VPMINSBYrr,        X86::VPMINSBYrm,         0 },
+    { X86::VPMINSDYrr,        X86::VPMINSDYrm,         0 },
+    { X86::VPMINUDYrr,        X86::VPMINUDYrm,         0 },
+    { X86::VPMINUWYrr,        X86::VPMINUWYrm,         0 },
+    { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
+    { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
+    { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
+    { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
+    { X86::VMPSADBWYrri,      X86::VMPSADBWYrmi,       0 },
+    { X86::VPMULDQYrr,        X86::VPMULDQYrm,         0 },
+    { X86::VPMULHRSWYrr,      X86::VPMULHRSWYrm,       0 },
+    { X86::VPMULHUWYrr,       X86::VPMULHUWYrm,        0 },
+    { X86::VPMULHWYrr,        X86::VPMULHWYrm,         0 },
+    { X86::VPMULLDYrr,        X86::VPMULLDYrm,         0 },
+    { X86::VPMULLWYrr,        X86::VPMULLWYrm,         0 },
+    { X86::VPMULUDQYrr,       X86::VPMULUDQYrm,        0 },
+    { X86::VPORYrr,           X86::VPORYrm,            0 },
+    { X86::VPSADBWYrr,        X86::VPSADBWYrm,         0 },
+    { X86::VPSHUFBYrr,        X86::VPSHUFBYrm,         0 },
+    { X86::VPSIGNBYrr256,     X86::VPSIGNBYrm256,      0 },
+    { X86::VPSIGNWYrr256,     X86::VPSIGNWYrm256,      0 },
+    { X86::VPSIGNDYrr256,     X86::VPSIGNDYrm256,      0 },
+    { X86::VPSLLDYrr,         X86::VPSLLDYrm,          0 },
+    { X86::VPSLLQYrr,         X86::VPSLLQYrm,          0 },
+    { X86::VPSLLWYrr,         X86::VPSLLWYrm,          0 },
+    { X86::VPSLLVDrr,         X86::VPSLLVDrm,          0 },
+    { X86::VPSLLVDYrr,        X86::VPSLLVDYrm,         0 },
+    { X86::VPSLLVQrr,         X86::VPSLLVQrm,          0 },
+    { X86::VPSLLVQYrr,        X86::VPSLLVQYrm,         0 },
+    { X86::VPSRADYrr,         X86::VPSRADYrm,          0 },
+    { X86::VPSRAWYrr,         X86::VPSRAWYrm,          0 },
+    { X86::VPSRAVDrr,         X86::VPSRAVDrm,          0 },
+    { X86::VPSRAVDYrr,        X86::VPSRAVDYrm,         0 },
+    { X86::VPSRLDYrr,         X86::VPSRLDYrm,          0 },
+    { X86::VPSRLQYrr,         X86::VPSRLQYrm,          0 },
+    { X86::VPSRLWYrr,         X86::VPSRLWYrm,          0 },
+    { X86::VPSRLVDrr,         X86::VPSRLVDrm,          0 },
+    { X86::VPSRLVDYrr,        X86::VPSRLVDYrm,         0 },
+    { X86::VPSRLVQrr,         X86::VPSRLVQrm,          0 },
+    { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
+    { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
+    { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
+    { X86::VPSUBQYrr,         X86::VPSUBQYrm,          0 },
+    { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
+    { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
+    { X86::VPSUBUSBYrr,       X86::VPSUBUSBYrm,        0 },
+    { X86::VPSUBUSWYrr,       X86::VPSUBUSWYrm,        0 },
+    { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
+    { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
+    { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
+    { X86::VPUNPCKHQDQYrr,    X86::VPUNPCKHQDQYrm,     0 },
+    { X86::VPUNPCKHWDYrr,     X86::VPUNPCKHWDYrm,      0 },
+    { X86::VPUNPCKLBWYrr,     X86::VPUNPCKLBWYrm,      0 },
+    { X86::VPUNPCKLDQYrr,     X86::VPUNPCKLDQYrm,      0 },
+    { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
+    { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
+    { X86::VPXORYrr,          X86::VPXORYrm,           0 },
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDSS4rr_Int,   X86::VFMADDSS4mr_Int,    TB_NO_REVERSE },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDSD4rr_Int,   X86::VFMADDSD4mr_Int,    TB_NO_REVERSE },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_NONE },
+    { X86::VFMADDPS4Yrr,      X86::VFMADDPS4Ymr,       TB_ALIGN_NONE },
+    { X86::VFMADDPD4Yrr,      X86::VFMADDPD4Ymr,       TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr_Int,  X86::VFNMADDSS4mr_Int,   TB_NO_REVERSE },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDSD4rr_Int,  X86::VFNMADDSD4mr_Int,   TB_NO_REVERSE },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMADDPS4Yrr,     X86::VFNMADDPS4Ymr,      TB_ALIGN_NONE },
+    { X86::VFNMADDPD4Yrr,     X86::VFNMADDPD4Ymr,      TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr_Int,   X86::VFMSUBSS4mr_Int,    TB_NO_REVERSE },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBSD4rr_Int,   X86::VFMSUBSD4mr_Int,    TB_NO_REVERSE },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_NONE },
+    { X86::VFMSUBPS4Yrr,      X86::VFMSUBPS4Ymr,       TB_ALIGN_NONE },
+    { X86::VFMSUBPD4Yrr,      X86::VFMSUBPD4Ymr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr_Int,  X86::VFNMSUBSS4mr_Int,   TB_NO_REVERSE },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBSD4rr_Int,  X86::VFNMSUBSD4mr_Int,   TB_NO_REVERSE },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4Yrr,     X86::VFNMSUBPS4Ymr,      TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4Yrr,     X86::VFNMSUBPD4Ymr,      TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4Yrr,   X86::VFMADDSUBPS4Ymr,    TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4Yrr,   X86::VFMADDSUBPD4Ymr,    TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4Yrr,   X86::VFMSUBADDPS4Ymr,    TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4Yrr,   X86::VFMSUBADDPD4Ymr,    TB_ALIGN_NONE },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrrr,         X86::VPCMOVrmr,           0 },
+    { X86::VPCMOVrrrY,        X86::VPCMOVrmrY,          0 },
+    { X86::VPCOMBri,          X86::VPCOMBmi,            0 },
+    { X86::VPCOMDri,          X86::VPCOMDmi,            0 },
+    { X86::VPCOMQri,          X86::VPCOMQmi,            0 },
+    { X86::VPCOMWri,          X86::VPCOMWmi,            0 },
+    { X86::VPCOMUBri,         X86::VPCOMUBmi,           0 },
+    { X86::VPCOMUDri,         X86::VPCOMUDmi,           0 },
+    { X86::VPCOMUQri,         X86::VPCOMUQmi,           0 },
+    { X86::VPCOMUWri,         X86::VPCOMUWmi,           0 },
+    { X86::VPERMIL2PDrr,      X86::VPERMIL2PDmr,        0 },
+    { X86::VPERMIL2PDrrY,     X86::VPERMIL2PDmrY,       0 },
+    { X86::VPERMIL2PSrr,      X86::VPERMIL2PSmr,        0 },
+    { X86::VPERMIL2PSrrY,     X86::VPERMIL2PSmrY,       0 },
+    { X86::VPMACSDDrr,        X86::VPMACSDDrm,          0 },
+    { X86::VPMACSDQHrr,       X86::VPMACSDQHrm,         0 },
+    { X86::VPMACSDQLrr,       X86::VPMACSDQLrm,         0 },
+    { X86::VPMACSSDDrr,       X86::VPMACSSDDrm,         0 },
+    { X86::VPMACSSDQHrr,      X86::VPMACSSDQHrm,        0 },
+    { X86::VPMACSSDQLrr,      X86::VPMACSSDQLrm,        0 },
+    { X86::VPMACSSWDrr,       X86::VPMACSSWDrm,         0 },
+    { X86::VPMACSSWWrr,       X86::VPMACSSWWrm,         0 },
+    { X86::VPMACSWDrr,        X86::VPMACSWDrm,          0 },
+    { X86::VPMACSWWrr,        X86::VPMACSWWrm,          0 },
+    { X86::VPMADCSSWDrr,      X86::VPMADCSSWDrm,        0 },
+    { X86::VPMADCSWDrr,       X86::VPMADCSWDrm,         0 },
+    { X86::VPPERMrrr,         X86::VPPERMrmr,           0 },
+    { X86::VPROTBrr,          X86::VPROTBrm,            0 },
+    { X86::VPROTDrr,          X86::VPROTDrm,            0 },
+    { X86::VPROTQrr,          X86::VPROTQrm,            0 },
+    { X86::VPROTWrr,          X86::VPROTWrm,            0 },
+    { X86::VPSHABrr,          X86::VPSHABrm,            0 },
+    { X86::VPSHADrr,          X86::VPSHADrm,            0 },
+    { X86::VPSHAQrr,          X86::VPSHAQrm,            0 },
+    { X86::VPSHAWrr,          X86::VPSHAWrm,            0 },
+    { X86::VPSHLBrr,          X86::VPSHLBrm,            0 },
+    { X86::VPSHLDrr,          X86::VPSHLDrm,            0 },
+    { X86::VPSHLQrr,          X86::VPSHLQrm,            0 },
+    { X86::VPSHLWrr,          X86::VPSHLWrm,            0 },
+
+    // BMI/BMI2 foldable instructions
+    { X86::ANDN32rr,          X86::ANDN32rm,            0 },
+    { X86::ANDN64rr,          X86::ANDN64rm,            0 },
+    { X86::MULX32rr,          X86::MULX32rm,            0 },
+    { X86::MULX64rr,          X86::MULX64rm,            0 },
+    { X86::PDEP32rr,          X86::PDEP32rm,            0 },
+    { X86::PDEP64rr,          X86::PDEP64rm,            0 },
+    { X86::PEXT32rr,          X86::PEXT32rm,            0 },
+    { X86::PEXT64rr,          X86::PEXT64rm,            0 },
+
+    // ADX foldable instructions
+    { X86::ADCX32rr,          X86::ADCX32rm,            0 },
+    { X86::ADCX64rr,          X86::ADCX64rm,            0 },
+    { X86::ADOX32rr,          X86::ADOX32rm,            0 },
+    { X86::ADOX64rr,          X86::ADOX64rm,            0 },
+
+    // AVX-512 foldable instructions
+    { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
+    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
+    { X86::VADDSDZrr,         X86::VADDSDZrm,           0 },
+    { X86::VADDSDZrr_Int,     X86::VADDSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VADDSSZrr,         X86::VADDSSZrm,           0 },
+    { X86::VADDSSZrr_Int,     X86::VADDSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VALIGNDZrri,       X86::VALIGNDZrmi,         0 },
+    { X86::VALIGNQZrri,       X86::VALIGNQZrmi,         0 },
+    { X86::VANDNPDZrr,        X86::VANDNPDZrm,          0 },
+    { X86::VANDNPSZrr,        X86::VANDNPSZrm,          0 },
+    { X86::VANDPDZrr,         X86::VANDPDZrm,           0 },
+    { X86::VANDPSZrr,         X86::VANDPSZrm,           0 },
+    { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
+    { X86::VCMPPDZrri,        X86::VCMPPDZrmi,          0 },
+    { X86::VCMPPSZrri,        X86::VCMPPSZrmi,          0 },
+    { X86::VCMPSDZrr,         X86::VCMPSDZrm,           0 },
+    { X86::VCMPSDZrr_Int,     X86::VCMPSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VCMPSSZrr,         X86::VCMPSSZrm,           0 },
+    { X86::VCMPSSZrr_Int,     X86::VCMPSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
+    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VDIVSDZrr,         X86::VDIVSDZrm,           0 },
+    { X86::VDIVSDZrr_Int,     X86::VDIVSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VDIVSSZrr,         X86::VDIVSSZrm,           0 },
+    { X86::VDIVSSZrr_Int,     X86::VDIVSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VINSERTF32x4Zrr,   X86::VINSERTF32x4Zrm,     0 },
+    { X86::VINSERTF32x8Zrr,   X86::VINSERTF32x8Zrm,     0 },
+    { X86::VINSERTF64x2Zrr,   X86::VINSERTF64x2Zrm,     0 },
+    { X86::VINSERTF64x4Zrr,   X86::VINSERTF64x4Zrm,     0 },
+    { X86::VINSERTI32x4Zrr,   X86::VINSERTI32x4Zrm,     0 },
+    { X86::VINSERTI32x8Zrr,   X86::VINSERTI32x8Zrm,     0 },
+    { X86::VINSERTI64x2Zrr,   X86::VINSERTI64x2Zrm,     0 },
+    { X86::VINSERTI64x4Zrr,   X86::VINSERTI64x4Zrm,     0 },
+    { X86::VMAXCPDZrr,        X86::VMAXCPDZrm,          0 },
+    { X86::VMAXCPSZrr,        X86::VMAXCPSZrm,          0 },
+    { X86::VMAXCSDZrr,        X86::VMAXCSDZrm,          0 },
+    { X86::VMAXCSSZrr,        X86::VMAXCSSZrm,          0 },
+    { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
+    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VMAXSDZrr,         X86::VMAXSDZrm,           0 },
+    { X86::VMAXSDZrr_Int,     X86::VMAXSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VMAXSSZrr,         X86::VMAXSSZrm,           0 },
+    { X86::VMAXSSZrr_Int,     X86::VMAXSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VMINCPDZrr,        X86::VMINCPDZrm,          0 },
+    { X86::VMINCPSZrr,        X86::VMINCPSZrm,          0 },
+    { X86::VMINCSDZrr,        X86::VMINCSDZrm,          0 },
+    { X86::VMINCSSZrr,        X86::VMINCSSZrm,          0 },
+    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
+    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
+    { X86::VMINSDZrr,         X86::VMINSDZrm,           0 },
+    { X86::VMINSDZrr_Int,     X86::VMINSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VMINSSZrr,         X86::VMINSSZrm,           0 },
+    { X86::VMINSSZrr_Int,     X86::VMINSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
+    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
+    { X86::VMULSDZrr,         X86::VMULSDZrm,           0 },
+    { X86::VMULSDZrr_Int,     X86::VMULSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VMULSSZrr,         X86::VMULSSZrm,           0 },
+    { X86::VMULSSZrr_Int,     X86::VMULSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VORPDZrr,          X86::VORPDZrm,            0 },
+    { X86::VORPSZrr,          X86::VORPSZrm,            0 },
+    { X86::VPADDBZrr,         X86::VPADDBZrm,           0 },
+    { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
+    { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
+    { X86::VPADDSBZrr,        X86::VPADDSBZrm,          0 },
+    { X86::VPADDSWZrr,        X86::VPADDSWZrm,          0 },
+    { X86::VPADDUSBZrr,       X86::VPADDUSBZrm,         0 },
+    { X86::VPADDUSWZrr,       X86::VPADDUSWZrm,         0 },
+    { X86::VPADDWZrr,         X86::VPADDWZrm,           0 },
+    { X86::VPALIGNRZrri,      X86::VPALIGNRZrmi,        0 },
+    { X86::VPANDDZrr,         X86::VPANDDZrm,           0 },
+    { X86::VPANDNDZrr,        X86::VPANDNDZrm,          0 },
+    { X86::VPANDNQZrr,        X86::VPANDNQZrm,          0 },
+    { X86::VPANDQZrr,         X86::VPANDQZrm,           0 },
+    { X86::VPCMPBZrri,        X86::VPCMPBZrmi,          0 },
+    { X86::VPCMPDZrri,        X86::VPCMPDZrmi,          0 },
+    { X86::VPCMPEQBZrr,       X86::VPCMPEQBZrm,         0 },
+    { X86::VPCMPEQDZrr,       X86::VPCMPEQDZrm,         0 },
+    { X86::VPCMPEQQZrr,       X86::VPCMPEQQZrm,         0 },
+    { X86::VPCMPEQWZrr,       X86::VPCMPEQWZrm,         0 },
+    { X86::VPCMPGTBZrr,       X86::VPCMPGTBZrm,         0 },
+    { X86::VPCMPGTDZrr,       X86::VPCMPGTDZrm,         0 },
+    { X86::VPCMPGTQZrr,       X86::VPCMPGTQZrm,         0 },
+    { X86::VPCMPGTWZrr,       X86::VPCMPGTWZrm,         0 },
+    { X86::VPCMPQZrri,        X86::VPCMPQZrmi,          0 },
+    { X86::VPCMPUBZrri,       X86::VPCMPUBZrmi,         0 },
+    { X86::VPCMPUDZrri,       X86::VPCMPUDZrmi,         0 },
+    { X86::VPCMPUQZrri,       X86::VPCMPUQZrmi,         0 },
+    { X86::VPCMPUWZrri,       X86::VPCMPUWZrmi,         0 },
+    { X86::VPCMPWZrri,        X86::VPCMPWZrmi,          0 },
+    { X86::VPERMBZrr,         X86::VPERMBZrm,           0 },
+    { X86::VPERMDZrr,         X86::VPERMDZrm,           0 },
+    { X86::VPERMILPDZrr,      X86::VPERMILPDZrm,        0 },
+    { X86::VPERMILPSZrr,      X86::VPERMILPSZrm,        0 },
+    { X86::VPERMPDZrr,        X86::VPERMPDZrm,          0 },
+    { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
+    { X86::VPERMQZrr,         X86::VPERMQZrm,           0 },
+    { X86::VPERMWZrr,         X86::VPERMWZrm,           0 },
+    { X86::VPMADDUBSWZrr,     X86::VPMADDUBSWZrm,       0 },
+    { X86::VPMADDWDZrr,       X86::VPMADDWDZrm,         0 },
+    { X86::VPMAXSDZrr,        X86::VPMAXSDZrm,          0 },
+    { X86::VPMAXSQZrr,        X86::VPMAXSQZrm,          0 },
+    { X86::VPMAXUDZrr,        X86::VPMAXUDZrm,          0 },
+    { X86::VPMAXUQZrr,        X86::VPMAXUQZrm,          0 },
+    { X86::VPMINSDZrr,        X86::VPMINSDZrm,          0 },
+    { X86::VPMINSQZrr,        X86::VPMINSQZrm,          0 },
+    { X86::VPMINUDZrr,        X86::VPMINUDZrm,          0 },
+    { X86::VPMINUQZrr,        X86::VPMINUQZrm,          0 },
+    { X86::VPMULDQZrr,        X86::VPMULDQZrm,          0 },
+    { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
+    { X86::VPORDZrr,          X86::VPORDZrm,            0 },
+    { X86::VPORQZrr,          X86::VPORQZrm,            0 },
+    { X86::VPSHUFBZrr,        X86::VPSHUFBZrm,          0 },
+    { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
+    { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
+    { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
+    { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
+    { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
+    { X86::VPSUBBZrr,         X86::VPSUBBZrm,           0 },
+    { X86::VPSUBDZrr,         X86::VPSUBDZrm,           0 },
+    { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
+    { X86::VPSUBSBZrr,        X86::VPSUBSBZrm,          0 },
+    { X86::VPSUBSWZrr,        X86::VPSUBSWZrm,          0 },
+    { X86::VPSUBUSBZrr,       X86::VPSUBUSBZrm,         0 },
+    { X86::VPSUBUSWZrr,       X86::VPSUBUSWZrm,         0 },
+    { X86::VPSUBWZrr,         X86::VPSUBWZrm,           0 },
+    { X86::VPUNPCKHBWZrr,     X86::VPUNPCKHBWZrm,       0 },
+    { X86::VPUNPCKHDQZrr,     X86::VPUNPCKHDQZrm,       0 },
+    { X86::VPUNPCKHQDQZrr,    X86::VPUNPCKHQDQZrm,      0 },
+    { X86::VPUNPCKHWDZrr,     X86::VPUNPCKHWDZrm,       0 },
+    { X86::VPUNPCKLBWZrr,     X86::VPUNPCKLBWZrm,       0 },
+    { X86::VPUNPCKLDQZrr,     X86::VPUNPCKLDQZrm,       0 },
+    { X86::VPUNPCKLQDQZrr,    X86::VPUNPCKLQDQZrm,      0 },
+    { X86::VPUNPCKLWDZrr,     X86::VPUNPCKLWDZrm,       0 },
+    { X86::VPXORDZrr,         X86::VPXORDZrm,           0 },
+    { X86::VPXORQZrr,         X86::VPXORQZrm,           0 },
+    { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
+    { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
+    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
+    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
+    { X86::VSUBSDZrr,         X86::VSUBSDZrm,           0 },
+    { X86::VSUBSDZrr_Int,     X86::VSUBSDZrm_Int,       TB_NO_REVERSE },
+    { X86::VSUBSSZrr,         X86::VSUBSSZrm,           0 },
+    { X86::VSUBSSZrr_Int,     X86::VSUBSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VUNPCKHPDZrr,      X86::VUNPCKHPDZrm,        0 },
+    { X86::VUNPCKHPSZrr,      X86::VUNPCKHPSZrm,        0 },
+    { X86::VUNPCKLPDZrr,      X86::VUNPCKLPDZrm,        0 },
+    { X86::VUNPCKLPSZrr,      X86::VUNPCKLPSZrm,        0 },
+    { X86::VXORPDZrr,         X86::VXORPDZrm,           0 },
+    { X86::VXORPSZrr,         X86::VXORPSZrm,           0 },
+
+    // AVX-512{F,VL} foldable instructions
+    { X86::VADDPDZ128rr,      X86::VADDPDZ128rm,        0 },
+    { X86::VADDPDZ256rr,      X86::VADDPDZ256rm,        0 },
+    { X86::VADDPSZ128rr,      X86::VADDPSZ128rm,        0 },
+    { X86::VADDPSZ256rr,      X86::VADDPSZ256rm,        0 },
+    { X86::VALIGNDZ128rri,    X86::VALIGNDZ128rmi,      0 },
+    { X86::VALIGNDZ256rri,    X86::VALIGNDZ256rmi,      0 },
+    { X86::VALIGNQZ128rri,    X86::VALIGNQZ128rmi,      0 },
+    { X86::VALIGNQZ256rri,    X86::VALIGNQZ256rmi,      0 },
+    { X86::VANDNPDZ128rr,     X86::VANDNPDZ128rm,       0 },
+    { X86::VANDNPDZ256rr,     X86::VANDNPDZ256rm,       0 },
+    { X86::VANDNPSZ128rr,     X86::VANDNPSZ128rm,       0 },
+    { X86::VANDNPSZ256rr,     X86::VANDNPSZ256rm,       0 },
+    { X86::VANDPDZ128rr,      X86::VANDPDZ128rm,        0 },
+    { X86::VANDPDZ256rr,      X86::VANDPDZ256rm,        0 },
+    { X86::VANDPSZ128rr,      X86::VANDPSZ128rm,        0 },
+    { X86::VANDPSZ256rr,      X86::VANDPSZ256rm,        0 },
+    { X86::VBROADCASTSSZ128rkz,  X86::VBROADCASTSSZ128mkz,      TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ256rkz,  X86::VBROADCASTSSZ256mkz,      TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256rkz,  X86::VBROADCASTSDZ256mkz,      TB_NO_REVERSE },
+    { X86::VCMPPDZ128rri,     X86::VCMPPDZ128rmi,       0 },
+    { X86::VCMPPDZ256rri,     X86::VCMPPDZ256rmi,       0 },
+    { X86::VCMPPSZ128rri,     X86::VCMPPSZ128rmi,       0 },
+    { X86::VCMPPSZ256rri,     X86::VCMPPSZ256rmi,       0 },
+    { X86::VDIVPDZ128rr,      X86::VDIVPDZ128rm,        0 },
+    { X86::VDIVPDZ256rr,      X86::VDIVPDZ256rm,        0 },
+    { X86::VDIVPSZ128rr,      X86::VDIVPSZ128rm,        0 },
+    { X86::VDIVPSZ256rr,      X86::VDIVPSZ256rm,        0 },
+    { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm,  0 },
+    { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm,  0 },
+    { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm,  0 },
+    { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm,  0 },
+    { X86::VMAXCPDZ128rr,     X86::VMAXCPDZ128rm,       0 },
+    { X86::VMAXCPDZ256rr,     X86::VMAXCPDZ256rm,       0 },
+    { X86::VMAXCPSZ128rr,     X86::VMAXCPSZ128rm,       0 },
+    { X86::VMAXCPSZ256rr,     X86::VMAXCPSZ256rm,       0 },
+    { X86::VMAXPDZ128rr,      X86::VMAXPDZ128rm,        0 },
+    { X86::VMAXPDZ256rr,      X86::VMAXPDZ256rm,        0 },
+    { X86::VMAXPSZ128rr,      X86::VMAXPSZ128rm,        0 },
+    { X86::VMAXPSZ256rr,      X86::VMAXPSZ256rm,        0 },
+    { X86::VMINCPDZ128rr,     X86::VMINCPDZ128rm,       0 },
+    { X86::VMINCPDZ256rr,     X86::VMINCPDZ256rm,       0 },
+    { X86::VMINCPSZ128rr,     X86::VMINCPSZ128rm,       0 },
+    { X86::VMINCPSZ256rr,     X86::VMINCPSZ256rm,       0 },
+    { X86::VMINPDZ128rr,      X86::VMINPDZ128rm,        0 },
+    { X86::VMINPDZ256rr,      X86::VMINPDZ256rm,        0 },
+    { X86::VMINPSZ128rr,      X86::VMINPSZ128rm,        0 },
+    { X86::VMINPSZ256rr,      X86::VMINPSZ256rm,        0 },
+    { X86::VMULPDZ128rr,      X86::VMULPDZ128rm,        0 },
+    { X86::VMULPDZ256rr,      X86::VMULPDZ256rm,        0 },
+    { X86::VMULPSZ128rr,      X86::VMULPSZ128rm,        0 },
+    { X86::VMULPSZ256rr,      X86::VMULPSZ256rm,        0 },
+    { X86::VORPDZ128rr,       X86::VORPDZ128rm,         0 },
+    { X86::VORPDZ256rr,       X86::VORPDZ256rm,         0 },
+    { X86::VORPSZ128rr,       X86::VORPSZ128rm,         0 },
+    { X86::VORPSZ256rr,       X86::VORPSZ256rm,         0 },
+    { X86::VPADDBZ128rr,      X86::VPADDBZ128rm,        0 },
+    { X86::VPADDBZ256rr,      X86::VPADDBZ256rm,        0 },
+    { X86::VPADDDZ128rr,      X86::VPADDDZ128rm,        0 },
+    { X86::VPADDDZ256rr,      X86::VPADDDZ256rm,        0 },
+    { X86::VPADDQZ128rr,      X86::VPADDQZ128rm,        0 },
+    { X86::VPADDQZ256rr,      X86::VPADDQZ256rm,        0 },
+    { X86::VPADDSBZ128rr,     X86::VPADDSBZ128rm,       0 },
+    { X86::VPADDSBZ256rr,     X86::VPADDSBZ256rm,       0 },
+    { X86::VPADDSWZ128rr,     X86::VPADDSWZ128rm,       0 },
+    { X86::VPADDSWZ256rr,     X86::VPADDSWZ256rm,       0 },
+    { X86::VPADDUSBZ128rr,    X86::VPADDUSBZ128rm,      0 },
+    { X86::VPADDUSBZ256rr,    X86::VPADDUSBZ256rm,      0 },
+    { X86::VPADDUSWZ128rr,    X86::VPADDUSWZ128rm,      0 },
+    { X86::VPADDUSWZ256rr,    X86::VPADDUSWZ256rm,      0 },
+    { X86::VPADDWZ128rr,      X86::VPADDWZ128rm,        0 },
+    { X86::VPADDWZ256rr,      X86::VPADDWZ256rm,        0 },
+    { X86::VPALIGNRZ128rri,   X86::VPALIGNRZ128rmi,     0 },
+    { X86::VPALIGNRZ256rri,   X86::VPALIGNRZ256rmi,     0 },
+    { X86::VPANDDZ128rr,      X86::VPANDDZ128rm,        0 },
+    { X86::VPANDDZ256rr,      X86::VPANDDZ256rm,        0 },
+    { X86::VPANDNDZ128rr,     X86::VPANDNDZ128rm,       0 },
+    { X86::VPANDNDZ256rr,     X86::VPANDNDZ256rm,       0 },
+    { X86::VPANDNQZ128rr,     X86::VPANDNQZ128rm,       0 },
+    { X86::VPANDNQZ256rr,     X86::VPANDNQZ256rm,       0 },
+    { X86::VPANDQZ128rr,      X86::VPANDQZ128rm,        0 },
+    { X86::VPANDQZ256rr,      X86::VPANDQZ256rm,        0 },
+    { X86::VPCMPBZ128rri,     X86::VPCMPBZ128rmi,       0 },
+    { X86::VPCMPBZ256rri,     X86::VPCMPBZ256rmi,       0 },
+    { X86::VPCMPDZ128rri,     X86::VPCMPDZ128rmi,       0 },
+    { X86::VPCMPDZ256rri,     X86::VPCMPDZ256rmi,       0 },
+    { X86::VPCMPEQBZ128rr,    X86::VPCMPEQBZ128rm,      0 },
+    { X86::VPCMPEQBZ256rr,    X86::VPCMPEQBZ256rm,      0 },
+    { X86::VPCMPEQDZ128rr,    X86::VPCMPEQDZ128rm,      0 },
+    { X86::VPCMPEQDZ256rr,    X86::VPCMPEQDZ256rm,      0 },
+    { X86::VPCMPEQQZ128rr,    X86::VPCMPEQQZ128rm,      0 },
+    { X86::VPCMPEQQZ256rr,    X86::VPCMPEQQZ256rm,      0 },
+    { X86::VPCMPEQWZ128rr,    X86::VPCMPEQWZ128rm,      0 },
+    { X86::VPCMPEQWZ256rr,    X86::VPCMPEQWZ256rm,      0 },
+    { X86::VPCMPGTBZ128rr,    X86::VPCMPGTBZ128rm,      0 },
+    { X86::VPCMPGTBZ256rr,    X86::VPCMPGTBZ256rm,      0 },
+    { X86::VPCMPGTDZ128rr,    X86::VPCMPGTDZ128rm,      0 },
+    { X86::VPCMPGTDZ256rr,    X86::VPCMPGTDZ256rm,      0 },
+    { X86::VPCMPGTQZ128rr,    X86::VPCMPGTQZ128rm,      0 },
+    { X86::VPCMPGTQZ256rr,    X86::VPCMPGTQZ256rm,      0 },
+    { X86::VPCMPGTWZ128rr,    X86::VPCMPGTWZ128rm,      0 },
+    { X86::VPCMPGTWZ256rr,    X86::VPCMPGTWZ256rm,      0 },
+    { X86::VPCMPQZ128rri,     X86::VPCMPQZ128rmi,       0 },
+    { X86::VPCMPQZ256rri,     X86::VPCMPQZ256rmi,       0 },
+    { X86::VPCMPUBZ128rri,    X86::VPCMPUBZ128rmi,      0 },
+    { X86::VPCMPUBZ256rri,    X86::VPCMPUBZ256rmi,      0 },
+    { X86::VPCMPUDZ128rri,    X86::VPCMPUDZ128rmi,      0 },
+    { X86::VPCMPUDZ256rri,    X86::VPCMPUDZ256rmi,      0 },
+    { X86::VPCMPUQZ128rri,    X86::VPCMPUQZ128rmi,      0 },
+    { X86::VPCMPUQZ256rri,    X86::VPCMPUQZ256rmi,      0 },
+    { X86::VPCMPUWZ128rri,    X86::VPCMPUWZ128rmi,      0 },
+    { X86::VPCMPUWZ256rri,    X86::VPCMPUWZ256rmi,      0 },
+    { X86::VPCMPWZ128rri,     X86::VPCMPWZ128rmi,       0 },
+    { X86::VPCMPWZ256rri,     X86::VPCMPWZ256rmi,       0 },
+    { X86::VPERMBZ128rr,      X86::VPERMBZ128rm,        0 },
+    { X86::VPERMBZ256rr,      X86::VPERMBZ256rm,        0 },
+    { X86::VPERMDZ256rr,      X86::VPERMDZ256rm,        0 },
+    { X86::VPERMILPDZ128rr,   X86::VPERMILPDZ128rm,     0 },
+    { X86::VPERMILPDZ256rr,   X86::VPERMILPDZ256rm,     0 },
+    { X86::VPERMILPSZ128rr,   X86::VPERMILPSZ128rm,     0 },
+    { X86::VPERMILPSZ256rr,   X86::VPERMILPSZ256rm,     0 },
+    { X86::VPERMPDZ256rr,     X86::VPERMPDZ256rm,       0 },
+    { X86::VPERMPSZ256rr,     X86::VPERMPSZ256rm,       0 },
+    { X86::VPERMQZ256rr,      X86::VPERMQZ256rm,        0 },
+    { X86::VPERMWZ128rr,      X86::VPERMWZ128rm,        0 },
+    { X86::VPERMWZ256rr,      X86::VPERMWZ256rm,        0 },
+    { X86::VPMADDUBSWZ128rr,  X86::VPMADDUBSWZ128rm,    0 },
+    { X86::VPMADDUBSWZ256rr,  X86::VPMADDUBSWZ256rm,    0 },
+    { X86::VPMADDWDZ128rr,    X86::VPMADDWDZ128rm,      0 },
+    { X86::VPMADDWDZ256rr,    X86::VPMADDWDZ256rm,      0 },
+    { X86::VPORDZ128rr,       X86::VPORDZ128rm,         0 },
+    { X86::VPORDZ256rr,       X86::VPORDZ256rm,         0 },
+    { X86::VPORQZ128rr,       X86::VPORQZ128rm,         0 },
+    { X86::VPORQZ256rr,       X86::VPORQZ256rm,         0 },
+    { X86::VPSHUFBZ128rr,     X86::VPSHUFBZ128rm,       0 },
+    { X86::VPSHUFBZ256rr,     X86::VPSHUFBZ256rm,       0 },
+    { X86::VPSUBBZ128rr,      X86::VPSUBBZ128rm,        0 },
+    { X86::VPSUBBZ256rr,      X86::VPSUBBZ256rm,        0 },
+    { X86::VPSUBDZ128rr,      X86::VPSUBDZ128rm,        0 },
+    { X86::VPSUBDZ256rr,      X86::VPSUBDZ256rm,        0 },
+    { X86::VPSUBQZ128rr,      X86::VPSUBQZ128rm,        0 },
+    { X86::VPSUBQZ256rr,      X86::VPSUBQZ256rm,        0 },
+    { X86::VPSUBSBZ128rr,     X86::VPSUBSBZ128rm,       0 },
+    { X86::VPSUBSBZ256rr,     X86::VPSUBSBZ256rm,       0 },
+    { X86::VPSUBSWZ128rr,     X86::VPSUBSWZ128rm,       0 },
+    { X86::VPSUBSWZ256rr,     X86::VPSUBSWZ256rm,       0 },
+    { X86::VPSUBUSBZ128rr,    X86::VPSUBUSBZ128rm,      0 },
+    { X86::VPSUBUSBZ256rr,    X86::VPSUBUSBZ256rm,      0 },
+    { X86::VPSUBUSWZ128rr,    X86::VPSUBUSWZ128rm,      0 },
+    { X86::VPSUBUSWZ256rr,    X86::VPSUBUSWZ256rm,      0 },
+    { X86::VPSUBWZ128rr,      X86::VPSUBWZ128rm,        0 },
+    { X86::VPSUBWZ256rr,      X86::VPSUBWZ256rm,        0 },
+    { X86::VPUNPCKHBWZ128rr,  X86::VPUNPCKHBWZ128rm,    0 },
+    { X86::VPUNPCKHBWZ256rr,  X86::VPUNPCKHBWZ256rm,    0 },
+    { X86::VPUNPCKHDQZ128rr,  X86::VPUNPCKHDQZ128rm,    0 },
+    { X86::VPUNPCKHDQZ256rr,  X86::VPUNPCKHDQZ256rm,    0 },
+    { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm,   0 },
+    { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm,   0 },
+    { X86::VPUNPCKHWDZ128rr,  X86::VPUNPCKHWDZ128rm,    0 },
+    { X86::VPUNPCKHWDZ256rr,  X86::VPUNPCKHWDZ256rm,    0 },
+    { X86::VPUNPCKLBWZ128rr,  X86::VPUNPCKLBWZ128rm,    0 },
+    { X86::VPUNPCKLBWZ256rr,  X86::VPUNPCKLBWZ256rm,    0 },
+    { X86::VPUNPCKLDQZ128rr,  X86::VPUNPCKLDQZ128rm,    0 },
+    { X86::VPUNPCKLDQZ256rr,  X86::VPUNPCKLDQZ256rm,    0 },
+    { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm,   0 },
+    { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm,   0 },
+    { X86::VPUNPCKLWDZ128rr,  X86::VPUNPCKLWDZ128rm,    0 },
+    { X86::VPUNPCKLWDZ256rr,  X86::VPUNPCKLWDZ256rm,    0 },
+    { X86::VPXORDZ128rr,      X86::VPXORDZ128rm,        0 },
+    { X86::VPXORDZ256rr,      X86::VPXORDZ256rm,        0 },
+    { X86::VPXORQZ128rr,      X86::VPXORQZ128rm,        0 },
+    { X86::VPXORQZ256rr,      X86::VPXORQZ256rm,        0 },
+    { X86::VSUBPDZ128rr,      X86::VSUBPDZ128rm,        0 },
+    { X86::VSUBPDZ256rr,      X86::VSUBPDZ256rm,        0 },
+    { X86::VSUBPSZ128rr,      X86::VSUBPSZ128rm,        0 },
+    { X86::VSUBPSZ256rr,      X86::VSUBPSZ256rm,        0 },
+    { X86::VUNPCKHPDZ128rr,   X86::VUNPCKHPDZ128rm,     0 },
+    { X86::VUNPCKHPDZ256rr,   X86::VUNPCKHPDZ256rm,     0 },
+    { X86::VUNPCKHPSZ128rr,   X86::VUNPCKHPSZ128rm,     0 },
+    { X86::VUNPCKHPSZ256rr,   X86::VUNPCKHPSZ256rm,     0 },
+    { X86::VUNPCKLPDZ128rr,   X86::VUNPCKLPDZ128rm,     0 },
+    { X86::VUNPCKLPDZ256rr,   X86::VUNPCKLPDZ256rm,     0 },
+    { X86::VUNPCKLPSZ128rr,   X86::VUNPCKLPSZ128rm,     0 },
+    { X86::VUNPCKLPSZ256rr,   X86::VUNPCKLPSZ256rm,     0 },
+    { X86::VXORPDZ128rr,      X86::VXORPDZ128rm,        0 },
+    { X86::VXORPDZ256rr,      X86::VXORPDZ256rm,        0 },
+    { X86::VXORPSZ128rr,      X86::VXORPSZ128rm,        0 },
+    { X86::VXORPSZ256rr,      X86::VXORPSZ256rm,        0 },
+
+    // AVX-512 masked foldable instructions
+    { X86::VPERMILPDZrikz,    X86::VPERMILPDZmikz,      0 },
+    { X86::VPERMILPSZrikz,    X86::VPERMILPSZmikz,      0 },
+    { X86::VPERMPDZrikz,      X86::VPERMPDZmikz,        0 },
+    { X86::VPERMQZrikz,       X86::VPERMQZmikz,         0 },
+    { X86::VPMOVSXBDZrrkz,    X86::VPMOVSXBDZrmkz,      0 },
+    { X86::VPMOVSXBQZrrkz,    X86::VPMOVSXBQZrmkz,      TB_NO_REVERSE },
+    { X86::VPMOVSXBWZrrkz,    X86::VPMOVSXBWZrmkz,      0 },
+    { X86::VPMOVSXDQZrrkz,    X86::VPMOVSXDQZrmkz,      0 },
+    { X86::VPMOVSXWDZrrkz,    X86::VPMOVSXWDZrmkz,      0 },
+    { X86::VPMOVSXWQZrrkz,    X86::VPMOVSXWQZrmkz,      0 },
+    { X86::VPMOVZXBDZrrkz,    X86::VPMOVZXBDZrmkz,      0 },
+    { X86::VPMOVZXBQZrrkz,    X86::VPMOVZXBQZrmkz,      TB_NO_REVERSE },
+    { X86::VPMOVZXBWZrrkz,    X86::VPMOVZXBWZrmkz,      0 },
+    { X86::VPMOVZXDQZrrkz,    X86::VPMOVZXDQZrmkz,      0 },
+    { X86::VPMOVZXWDZrrkz,    X86::VPMOVZXWDZrmkz,      0 },
+    { X86::VPMOVZXWQZrrkz,    X86::VPMOVZXWQZrmkz,      0 },
+    { X86::VPSHUFDZrikz,      X86::VPSHUFDZmikz,        0 },
+    { X86::VPSHUFHWZrikz,     X86::VPSHUFHWZmikz,       0 },
+    { X86::VPSHUFLWZrikz,     X86::VPSHUFLWZmikz,       0 },
+
+    // AVX-512VL 256-bit masked foldable instructions
+    { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz,   0 },
+    { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz,   0 },
+    { X86::VPERMPDZ256rikz,   X86::VPERMPDZ256mikz,     0 },
+    { X86::VPERMQZ256rikz,    X86::VPERMQZ256mikz,      0 },
+    { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz,   0 },
+    { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz,   0 },
+    { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz,   0 },
+    { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz,   0 },
+    { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz,   0 },
+    { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz,   0 },
+    { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz,   TB_NO_REVERSE },
+    { X86::VPSHUFDZ256rikz,   X86::VPSHUFDZ256mikz,     0 },
+    { X86::VPSHUFHWZ256rikz,  X86::VPSHUFHWZ256mikz,    0 },
+    { X86::VPSHUFLWZ256rikz,  X86::VPSHUFLWZ256mikz,    0 },
+
+    // AVX-512VL 128-bit masked foldable instructions
+    { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz,   0 },
+    { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz,   0 },
+    { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz,   TB_NO_REVERSE },
+    { X86::VPSHUFDZ128rikz,   X86::VPSHUFDZ128mikz,     0 },
+    { X86::VPSHUFHWZ128rikz,  X86::VPSHUFHWZ128mikz,    0 },
+    { X86::VPSHUFLWZ128rikz,  X86::VPSHUFLWZ128mikz,    0 },
+
+    // AES foldable instructions
+    { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
+    { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
+    { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
+    { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       0 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           0 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       0 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           0 },
+
+    // SHA foldable instructions
+    { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
+    { X86::SHA1MSG2rr,        X86::SHA1MSG2rm,          TB_ALIGN_16 },
+    { X86::SHA1NEXTErr,       X86::SHA1NEXTErm,         TB_ALIGN_16 },
+    { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
+    { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
+    { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 }
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
+    AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp,
+                  // Index 2, folded load
+                  Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+  }
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDSS4rr_Int,       X86::VFMADDSS4rm_Int,       TB_NO_REVERSE },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDSD4rr_Int,       X86::VFMADDSD4rm_Int,       TB_NO_REVERSE },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_NONE },
+    { X86::VFMADDPS4Yrr,          X86::VFMADDPS4Yrm,          TB_ALIGN_NONE },
+    { X86::VFMADDPD4Yrr,          X86::VFMADDPD4Yrm,          TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDSS4rr_Int,      X86::VFNMADDSS4rm_Int,      TB_NO_REVERSE },
+    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDSD4rr_Int,      X86::VFNMADDSD4rm_Int,      TB_NO_REVERSE },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMADDPS4Yrr,         X86::VFNMADDPS4Yrm,         TB_ALIGN_NONE },
+    { X86::VFNMADDPD4Yrr,         X86::VFNMADDPD4Yrm,         TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBSS4rr_Int,       X86::VFMSUBSS4rm_Int,       TB_NO_REVERSE },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBSD4rr_Int,       X86::VFMSUBSD4rm_Int,       TB_NO_REVERSE },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_NONE },
+    { X86::VFMSUBPS4Yrr,          X86::VFMSUBPS4Yrm,          TB_ALIGN_NONE },
+    { X86::VFMSUBPD4Yrr,          X86::VFMSUBPD4Yrm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBSS4rr_Int,      X86::VFNMSUBSS4rm_Int,      TB_NO_REVERSE },
+    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBSD4rr_Int,      X86::VFNMSUBSD4rm_Int,      TB_NO_REVERSE },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_NONE },
+    { X86::VFNMSUBPS4Yrr,         X86::VFNMSUBPS4Yrm,         TB_ALIGN_NONE },
+    { X86::VFNMSUBPD4Yrr,         X86::VFNMSUBPD4Yrm,         TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_NONE },
+    { X86::VFMADDSUBPS4Yrr,       X86::VFMADDSUBPS4Yrm,       TB_ALIGN_NONE },
+    { X86::VFMADDSUBPD4Yrr,       X86::VFMADDSUBPD4Yrm,       TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_NONE },
+    { X86::VFMSUBADDPS4Yrr,       X86::VFMSUBADDPS4Yrm,       TB_ALIGN_NONE },
+    { X86::VFMSUBADDPD4Yrr,       X86::VFMSUBADDPD4Yrm,       TB_ALIGN_NONE },
+
+    // XOP foldable instructions
+    { X86::VPCMOVrrr,             X86::VPCMOVrrm,             0 },
+    { X86::VPCMOVrrrY,            X86::VPCMOVrrmY,            0 },
+    { X86::VPERMIL2PDrr,          X86::VPERMIL2PDrm,          0 },
+    { X86::VPERMIL2PDrrY,         X86::VPERMIL2PDrmY,         0 },
+    { X86::VPERMIL2PSrr,          X86::VPERMIL2PSrm,          0 },
+    { X86::VPERMIL2PSrrY,         X86::VPERMIL2PSrmY,         0 },
+    { X86::VPPERMrrr,             X86::VPPERMrrm,             0 },
+
+    // AVX-512 instructions with 3 source operands.
+    { X86::VBLENDMPDZrr,          X86::VBLENDMPDZrm,          0 },
+    { X86::VBLENDMPSZrr,          X86::VBLENDMPSZrm,          0 },
+    { X86::VPBLENDMDZrr,          X86::VPBLENDMDZrm,          0 },
+    { X86::VPBLENDMQZrr,          X86::VPBLENDMQZrm,          0 },
+    { X86::VBROADCASTSSZrk,       X86::VBROADCASTSSZmk,       TB_NO_REVERSE },
+    { X86::VBROADCASTSDZrk,       X86::VBROADCASTSDZmk,       TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ256rk,    X86::VBROADCASTSSZ256mk,    TB_NO_REVERSE },
+    { X86::VBROADCASTSDZ256rk,    X86::VBROADCASTSDZ256mk,    TB_NO_REVERSE },
+    { X86::VBROADCASTSSZ128rk,    X86::VBROADCASTSSZ128mk,    TB_NO_REVERSE },
+    { X86::VPERMI2Brr,            X86::VPERMI2Brm,            0 },
+    { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
+    { X86::VPERMI2PSrr,           X86::VPERMI2PSrm,           0 },
+    { X86::VPERMI2PDrr,           X86::VPERMI2PDrm,           0 },
+    { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
+    { X86::VPERMI2Wrr,            X86::VPERMI2Wrm,            0 },
+    { X86::VPERMT2Brr,            X86::VPERMT2Brm,            0 },
+    { X86::VPERMT2Drr,            X86::VPERMT2Drm,            0 },
+    { X86::VPERMT2PSrr,           X86::VPERMT2PSrm,           0 },
+    { X86::VPERMT2PDrr,           X86::VPERMT2PDrm,           0 },
+    { X86::VPERMT2Qrr,            X86::VPERMT2Qrm,            0 },
+    { X86::VPERMT2Wrr,            X86::VPERMT2Wrm,            0 },
+    { X86::VPTERNLOGDZrri,        X86::VPTERNLOGDZrmi,        0 },
+    { X86::VPTERNLOGQZrri,        X86::VPTERNLOGQZrmi,        0 },
+
+    // AVX-512VL 256-bit instructions with 3 source operands.
+    { X86::VPERMI2B256rr,         X86::VPERMI2B256rm,         0 },
+    { X86::VPERMI2D256rr,         X86::VPERMI2D256rm,         0 },
+    { X86::VPERMI2PD256rr,        X86::VPERMI2PD256rm,        0 },
+    { X86::VPERMI2PS256rr,        X86::VPERMI2PS256rm,        0 },
+    { X86::VPERMI2Q256rr,         X86::VPERMI2Q256rm,         0 },
+    { X86::VPERMI2W256rr,         X86::VPERMI2W256rm,         0 },
+    { X86::VPERMT2B256rr,         X86::VPERMT2B256rm,         0 },
+    { X86::VPERMT2D256rr,         X86::VPERMT2D256rm,         0 },
+    { X86::VPERMT2PD256rr,        X86::VPERMT2PD256rm,        0 },
+    { X86::VPERMT2PS256rr,        X86::VPERMT2PS256rm,        0 },
+    { X86::VPERMT2Q256rr,         X86::VPERMT2Q256rm,         0 },
+    { X86::VPERMT2W256rr,         X86::VPERMT2W256rm,         0 },
+    { X86::VPTERNLOGDZ256rri,     X86::VPTERNLOGDZ256rmi,     0 },
+    { X86::VPTERNLOGQZ256rri,     X86::VPTERNLOGQZ256rmi,     0 },
+
+    // AVX-512VL 128-bit instructions with 3 source operands.
+    { X86::VPERMI2B128rr,         X86::VPERMI2B128rm,         0 },
+    { X86::VPERMI2D128rr,         X86::VPERMI2D128rm,         0 },
+    { X86::VPERMI2PD128rr,        X86::VPERMI2PD128rm,        0 },
+    { X86::VPERMI2PS128rr,        X86::VPERMI2PS128rm,        0 },
+    { X86::VPERMI2Q128rr,         X86::VPERMI2Q128rm,         0 },
+    { X86::VPERMI2W128rr,         X86::VPERMI2W128rm,         0 },
+    { X86::VPERMT2B128rr,         X86::VPERMT2B128rm,         0 },
+    { X86::VPERMT2D128rr,         X86::VPERMT2D128rm,         0 },
+    { X86::VPERMT2PD128rr,        X86::VPERMT2PD128rm,        0 },
+    { X86::VPERMT2PS128rr,        X86::VPERMT2PS128rm,        0 },
+    { X86::VPERMT2Q128rr,         X86::VPERMT2Q128rm,         0 },
+    { X86::VPERMT2W128rr,         X86::VPERMT2W128rm,         0 },
+    { X86::VPTERNLOGDZ128rri,     X86::VPTERNLOGDZ128rmi,     0 },
+    { X86::VPTERNLOGQZ128rri,     X86::VPTERNLOGQZ128rmi,     0 },
+
+    // AVX-512 masked instructions
+    { X86::VADDPDZrrkz,           X86::VADDPDZrmkz,           0 },
+    { X86::VADDPSZrrkz,           X86::VADDPSZrmkz,           0 },
+    { X86::VALIGNDZrrikz,         X86::VALIGNDZrmikz,         0 },
+    { X86::VALIGNQZrrikz,         X86::VALIGNQZrmikz,         0 },
+    { X86::VANDNPDZrrkz,          X86::VANDNPDZrmkz,          0 },
+    { X86::VANDNPSZrrkz,          X86::VANDNPSZrmkz,          0 },
+    { X86::VANDPDZrrkz,           X86::VANDPDZrmkz,           0 },
+    { X86::VANDPSZrrkz,           X86::VANDPSZrmkz,           0 },
+    { X86::VDIVPDZrrkz,           X86::VDIVPDZrmkz,           0 },
+    { X86::VDIVPSZrrkz,           X86::VDIVPSZrmkz,           0 },
+    { X86::VINSERTF32x4Zrrkz,     X86::VINSERTF32x4Zrmkz,     0 },
+    { X86::VINSERTF32x8Zrrkz,     X86::VINSERTF32x8Zrmkz,     0 },
+    { X86::VINSERTF64x2Zrrkz,     X86::VINSERTF64x2Zrmkz,     0 },
+    { X86::VINSERTF64x4Zrrkz,     X86::VINSERTF64x4Zrmkz,     0 },
+    { X86::VINSERTI32x4Zrrkz,     X86::VINSERTI32x4Zrmkz,     0 },
+    { X86::VINSERTI32x8Zrrkz,     X86::VINSERTI32x8Zrmkz,     0 },
+    { X86::VINSERTI64x2Zrrkz,     X86::VINSERTI64x2Zrmkz,     0 },
+    { X86::VINSERTI64x4Zrrkz,     X86::VINSERTI64x4Zrmkz,     0 },
+    { X86::VMAXCPDZrrkz,          X86::VMAXCPDZrmkz,          0 },
+    { X86::VMAXCPSZrrkz,          X86::VMAXCPSZrmkz,          0 },
+    { X86::VMAXPDZrrkz,           X86::VMAXPDZrmkz,           0 },
+    { X86::VMAXPSZrrkz,           X86::VMAXPSZrmkz,           0 },
+    { X86::VMINCPDZrrkz,          X86::VMINCPDZrmkz,          0 },
+    { X86::VMINCPSZrrkz,          X86::VMINCPSZrmkz,          0 },
+    { X86::VMINPDZrrkz,           X86::VMINPDZrmkz,           0 },
+    { X86::VMINPSZrrkz,           X86::VMINPSZrmkz,           0 },
+    { X86::VMULPDZrrkz,           X86::VMULPDZrmkz,           0 },
+    { X86::VMULPSZrrkz,           X86::VMULPSZrmkz,           0 },
+    { X86::VORPDZrrkz,            X86::VORPDZrmkz,            0 },
+    { X86::VORPSZrrkz,            X86::VORPSZrmkz,            0 },
+    { X86::VPADDBZrrkz,           X86::VPADDBZrmkz,           0 },
+    { X86::VPADDDZrrkz,           X86::VPADDDZrmkz,           0 },
+    { X86::VPADDQZrrkz,           X86::VPADDQZrmkz,           0 },
+    { X86::VPADDSBZrrkz,          X86::VPADDSBZrmkz,          0 },
+    { X86::VPADDSWZrrkz,          X86::VPADDSWZrmkz,          0 },
+    { X86::VPADDUSBZrrkz,         X86::VPADDUSBZrmkz,         0 },
+    { X86::VPADDUSWZrrkz,         X86::VPADDUSWZrmkz,         0 },
+    { X86::VPADDWZrrkz,           X86::VPADDWZrmkz,           0 },
+    { X86::VPALIGNRZrrikz,        X86::VPALIGNRZrmikz,        0 },
+    { X86::VPANDDZrrkz,           X86::VPANDDZrmkz,           0 },
+    { X86::VPANDNDZrrkz,          X86::VPANDNDZrmkz,          0 },
+    { X86::VPANDNQZrrkz,          X86::VPANDNQZrmkz,          0 },
+    { X86::VPANDQZrrkz,           X86::VPANDQZrmkz,           0 },
+    { X86::VPERMBZrrkz,           X86::VPERMBZrmkz,           0 },
+    { X86::VPERMDZrrkz,           X86::VPERMDZrmkz,           0 },
+    { X86::VPERMILPDZrrkz,        X86::VPERMILPDZrmkz,        0 },
+    { X86::VPERMILPSZrrkz,        X86::VPERMILPSZrmkz,        0 },
+    { X86::VPERMPDZrrkz,          X86::VPERMPDZrmkz,          0 },
+    { X86::VPERMPSZrrkz,          X86::VPERMPSZrmkz,          0 },
+    { X86::VPERMQZrrkz,           X86::VPERMQZrmkz,           0 },
+    { X86::VPERMWZrrkz,           X86::VPERMWZrmkz,           0 },
+    { X86::VPMADDUBSWZrrkz,       X86::VPMADDUBSWZrmkz,       0 },
+    { X86::VPMADDWDZrrkz,         X86::VPMADDWDZrmkz,         0 },
+    { X86::VPORDZrrkz,            X86::VPORDZrmkz,            0 },
+    { X86::VPORQZrrkz,            X86::VPORQZrmkz,            0 },
+    { X86::VPSHUFBZrrkz,          X86::VPSHUFBZrmkz,          0 },
+    { X86::VPSUBBZrrkz,           X86::VPSUBBZrmkz,           0 },
+    { X86::VPSUBDZrrkz,           X86::VPSUBDZrmkz,           0 },
+    { X86::VPSUBQZrrkz,           X86::VPSUBQZrmkz,           0 },
+    { X86::VPSUBSBZrrkz,          X86::VPSUBSBZrmkz,          0 },
+    { X86::VPSUBSWZrrkz,          X86::VPSUBSWZrmkz,          0 },
+    { X86::VPSUBUSBZrrkz,         X86::VPSUBUSBZrmkz,         0 },
+    { X86::VPSUBUSWZrrkz,         X86::VPSUBUSWZrmkz,         0 },
+    { X86::VPSUBWZrrkz,           X86::VPSUBWZrmkz,           0 },
+    { X86::VPUNPCKHBWZrrkz,       X86::VPUNPCKHBWZrmkz,       0 },
+    { X86::VPUNPCKHDQZrrkz,       X86::VPUNPCKHDQZrmkz,       0 },
+    { X86::VPUNPCKHQDQZrrkz,      X86::VPUNPCKHQDQZrmkz,      0 },
+    { X86::VPUNPCKHWDZrrkz,       X86::VPUNPCKHWDZrmkz,       0 },
+    { X86::VPUNPCKLBWZrrkz,       X86::VPUNPCKLBWZrmkz,       0 },
+    { X86::VPUNPCKLDQZrrkz,       X86::VPUNPCKLDQZrmkz,       0 },
+    { X86::VPUNPCKLQDQZrrkz,      X86::VPUNPCKLQDQZrmkz,      0 },
+    { X86::VPUNPCKLWDZrrkz,       X86::VPUNPCKLWDZrmkz,       0 },
+    { X86::VPXORDZrrkz,           X86::VPXORDZrmkz,           0 },
+    { X86::VPXORQZrrkz,           X86::VPXORQZrmkz,           0 },
+    { X86::VSUBPDZrrkz,           X86::VSUBPDZrmkz,           0 },
+    { X86::VSUBPSZrrkz,           X86::VSUBPSZrmkz,           0 },
+    { X86::VUNPCKHPDZrrkz,        X86::VUNPCKHPDZrmkz,        0 },
+    { X86::VUNPCKHPSZrrkz,        X86::VUNPCKHPSZrmkz,        0 },
+    { X86::VUNPCKLPDZrrkz,        X86::VUNPCKLPDZrmkz,        0 },
+    { X86::VUNPCKLPSZrrkz,        X86::VUNPCKLPSZrmkz,        0 },
+    { X86::VXORPDZrrkz,           X86::VXORPDZrmkz,           0 },
+    { X86::VXORPSZrrkz,           X86::VXORPSZrmkz,           0 },
+
+    // AVX-512{F,VL} masked arithmetic instructions 256-bit
+    { X86::VADDPDZ256rrkz,        X86::VADDPDZ256rmkz,        0 },
+    { X86::VADDPSZ256rrkz,        X86::VADDPSZ256rmkz,        0 },
+    { X86::VALIGNDZ256rrikz,      X86::VALIGNDZ256rmikz,      0 },
+    { X86::VALIGNQZ256rrikz,      X86::VALIGNQZ256rmikz,      0 },
+    { X86::VANDNPDZ256rrkz,       X86::VANDNPDZ256rmkz,       0 },
+    { X86::VANDNPSZ256rrkz,       X86::VANDNPSZ256rmkz,       0 },
+    { X86::VANDPDZ256rrkz,        X86::VANDPDZ256rmkz,        0 },
+    { X86::VANDPSZ256rrkz,        X86::VANDPSZ256rmkz,        0 },
+    { X86::VDIVPDZ256rrkz,        X86::VDIVPDZ256rmkz,        0 },
+    { X86::VDIVPSZ256rrkz,        X86::VDIVPSZ256rmkz,        0 },
+    { X86::VINSERTF32x4Z256rrkz,  X86::VINSERTF32x4Z256rmkz,  0 },
+    { X86::VINSERTF64x2Z256rrkz,  X86::VINSERTF64x2Z256rmkz,  0 },
+    { X86::VINSERTI32x4Z256rrkz,  X86::VINSERTI32x4Z256rmkz,  0 },
+    { X86::VINSERTI64x2Z256rrkz,  X86::VINSERTI64x2Z256rmkz,  0 },
+    { X86::VMAXCPDZ256rrkz,       X86::VMAXCPDZ256rmkz,       0 },
+    { X86::VMAXCPSZ256rrkz,       X86::VMAXCPSZ256rmkz,       0 },
+    { X86::VMAXPDZ256rrkz,        X86::VMAXPDZ256rmkz,        0 },
+    { X86::VMAXPSZ256rrkz,        X86::VMAXPSZ256rmkz,        0 },
+    { X86::VMINCPDZ256rrkz,       X86::VMINCPDZ256rmkz,       0 },
+    { X86::VMINCPSZ256rrkz,       X86::VMINCPSZ256rmkz,       0 },
+    { X86::VMINPDZ256rrkz,        X86::VMINPDZ256rmkz,        0 },
+    { X86::VMINPSZ256rrkz,        X86::VMINPSZ256rmkz,        0 },
+    { X86::VMULPDZ256rrkz,        X86::VMULPDZ256rmkz,        0 },
+    { X86::VMULPSZ256rrkz,        X86::VMULPSZ256rmkz,        0 },
+    { X86::VORPDZ256rrkz,         X86::VORPDZ256rmkz,         0 },
+    { X86::VORPSZ256rrkz,         X86::VORPSZ256rmkz,         0 },
+    { X86::VPADDBZ256rrkz,        X86::VPADDBZ256rmkz,        0 },
+    { X86::VPADDDZ256rrkz,        X86::VPADDDZ256rmkz,        0 },
+    { X86::VPADDQZ256rrkz,        X86::VPADDQZ256rmkz,        0 },
+    { X86::VPADDSBZ256rrkz,       X86::VPADDSBZ256rmkz,       0 },
+    { X86::VPADDSWZ256rrkz,       X86::VPADDSWZ256rmkz,       0 },
+    { X86::VPADDUSBZ256rrkz,      X86::VPADDUSBZ256rmkz,      0 },
+    { X86::VPADDUSWZ256rrkz,      X86::VPADDUSWZ256rmkz,      0 },
+    { X86::VPADDWZ256rrkz,        X86::VPADDWZ256rmkz,        0 },
+    { X86::VPALIGNRZ256rrikz,     X86::VPALIGNRZ256rmikz,     0 },
+    { X86::VPANDDZ256rrkz,        X86::VPANDDZ256rmkz,        0 },
+    { X86::VPANDNDZ256rrkz,       X86::VPANDNDZ256rmkz,       0 },
+    { X86::VPANDNQZ256rrkz,       X86::VPANDNQZ256rmkz,       0 },
+    { X86::VPANDQZ256rrkz,        X86::VPANDQZ256rmkz,        0 },
+    { X86::VPERMBZ256rrkz,        X86::VPERMBZ256rmkz,        0 },
+    { X86::VPERMDZ256rrkz,        X86::VPERMDZ256rmkz,        0 },
+    { X86::VPERMILPDZ256rrkz,     X86::VPERMILPDZ256rmkz,     0 },
+    { X86::VPERMILPSZ256rrkz,     X86::VPERMILPSZ256rmkz,     0 },
+    { X86::VPERMPDZ256rrkz,       X86::VPERMPDZ256rmkz,       0 },
+    { X86::VPERMPSZ256rrkz,       X86::VPERMPSZ256rmkz,       0 },
+    { X86::VPERMQZ256rrkz,        X86::VPERMQZ256rmkz,        0 },
+    { X86::VPERMWZ256rrkz,        X86::VPERMWZ256rmkz,        0 },
+    { X86::VPMADDUBSWZ256rrkz,    X86::VPMADDUBSWZ256rmkz,    0 },
+    { X86::VPMADDWDZ256rrkz,      X86::VPMADDWDZ256rmkz,      0 },
+    { X86::VPORDZ256rrkz,         X86::VPORDZ256rmkz,         0 },
+    { X86::VPORQZ256rrkz,         X86::VPORQZ256rmkz,         0 },
+    { X86::VPSHUFBZ256rrkz,       X86::VPSHUFBZ256rmkz,       0 },
+    { X86::VPSUBBZ256rrkz,        X86::VPSUBBZ256rmkz,        0 },
+    { X86::VPSUBDZ256rrkz,        X86::VPSUBDZ256rmkz,        0 },
+    { X86::VPSUBQZ256rrkz,        X86::VPSUBQZ256rmkz,        0 },
+    { X86::VPSUBSBZ256rrkz,       X86::VPSUBSBZ256rmkz,       0 },
+    { X86::VPSUBSWZ256rrkz,       X86::VPSUBSWZ256rmkz,       0 },
+    { X86::VPSUBUSBZ256rrkz,      X86::VPSUBUSBZ256rmkz,      0 },
+    { X86::VPSUBUSWZ256rrkz,      X86::VPSUBUSWZ256rmkz,      0 },
+    { X86::VPSUBWZ256rrkz,        X86::VPSUBWZ256rmkz,        0 },
+    { X86::VPUNPCKHBWZ256rrkz,    X86::VPUNPCKHBWZ256rmkz,    0 },
+    { X86::VPUNPCKHDQZ256rrkz,    X86::VPUNPCKHDQZ256rmkz,    0 },
+    { X86::VPUNPCKHQDQZ256rrkz,   X86::VPUNPCKHQDQZ256rmkz,   0 },
+    { X86::VPUNPCKHWDZ256rrkz,    X86::VPUNPCKHWDZ256rmkz,    0 },
+    { X86::VPUNPCKLBWZ256rrkz,    X86::VPUNPCKLBWZ256rmkz,    0 },
+    { X86::VPUNPCKLDQZ256rrkz,    X86::VPUNPCKLDQZ256rmkz,    0 },
+    { X86::VPUNPCKLQDQZ256rrkz,   X86::VPUNPCKLQDQZ256rmkz,   0 },
+    { X86::VPUNPCKLWDZ256rrkz,    X86::VPUNPCKLWDZ256rmkz,    0 },
+    { X86::VPXORDZ256rrkz,        X86::VPXORDZ256rmkz,        0 },
+    { X86::VPXORQZ256rrkz,        X86::VPXORQZ256rmkz,        0 },
+    { X86::VSUBPDZ256rrkz,        X86::VSUBPDZ256rmkz,        0 },
+    { X86::VSUBPSZ256rrkz,        X86::VSUBPSZ256rmkz,        0 },
+    { X86::VUNPCKHPDZ256rrkz,     X86::VUNPCKHPDZ256rmkz,     0 },
+    { X86::VUNPCKHPSZ256rrkz,     X86::VUNPCKHPSZ256rmkz,     0 },
+    { X86::VUNPCKLPDZ256rrkz,     X86::VUNPCKLPDZ256rmkz,     0 },
+    { X86::VUNPCKLPSZ256rrkz,     X86::VUNPCKLPSZ256rmkz,     0 },
+    { X86::VXORPDZ256rrkz,        X86::VXORPDZ256rmkz,        0 },
+    { X86::VXORPSZ256rrkz,        X86::VXORPSZ256rmkz,        0 },
+
+    // AVX-512{F,VL} masked arithmetic instructions 128-bit
+    { X86::VADDPDZ128rrkz,        X86::VADDPDZ128rmkz,        0 },
+    { X86::VADDPSZ128rrkz,        X86::VADDPSZ128rmkz,        0 },
+    { X86::VALIGNDZ128rrikz,      X86::VALIGNDZ128rmikz,      0 },
+    { X86::VALIGNQZ128rrikz,      X86::VALIGNQZ128rmikz,      0 },
+    { X86::VANDNPDZ128rrkz,       X86::VANDNPDZ128rmkz,       0 },
+    { X86::VANDNPSZ128rrkz,       X86::VANDNPSZ128rmkz,       0 },
+    { X86::VANDPDZ128rrkz,        X86::VANDPDZ128rmkz,        0 },
+    { X86::VANDPSZ128rrkz,        X86::VANDPSZ128rmkz,        0 },
+    { X86::VDIVPDZ128rrkz,        X86::VDIVPDZ128rmkz,        0 },
+    { X86::VDIVPSZ128rrkz,        X86::VDIVPSZ128rmkz,        0 },
+    { X86::VMAXCPDZ128rrkz,       X86::VMAXCPDZ128rmkz,       0 },
+    { X86::VMAXCPSZ128rrkz,       X86::VMAXCPSZ128rmkz,       0 },
+    { X86::VMAXPDZ128rrkz,        X86::VMAXPDZ128rmkz,        0 },
+    { X86::VMAXPSZ128rrkz,        X86::VMAXPSZ128rmkz,        0 },
+    { X86::VMINCPDZ128rrkz,       X86::VMINCPDZ128rmkz,       0 },
+    { X86::VMINCPSZ128rrkz,       X86::VMINCPSZ128rmkz,       0 },
+    { X86::VMINPDZ128rrkz,        X86::VMINPDZ128rmkz,        0 },
+    { X86::VMINPSZ128rrkz,        X86::VMINPSZ128rmkz,        0 },
+    { X86::VMULPDZ128rrkz,        X86::VMULPDZ128rmkz,        0 },
+    { X86::VMULPSZ128rrkz,        X86::VMULPSZ128rmkz,        0 },
+    { X86::VORPDZ128rrkz,         X86::VORPDZ128rmkz,         0 },
+    { X86::VORPSZ128rrkz,         X86::VORPSZ128rmkz,         0 },
+    { X86::VPADDBZ128rrkz,        X86::VPADDBZ128rmkz,        0 },
+    { X86::VPADDDZ128rrkz,        X86::VPADDDZ128rmkz,        0 },
+    { X86::VPADDQZ128rrkz,        X86::VPADDQZ128rmkz,        0 },
+    { X86::VPADDSBZ128rrkz,       X86::VPADDSBZ128rmkz,       0 },
+    { X86::VPADDSWZ128rrkz,       X86::VPADDSWZ128rmkz,       0 },
+    { X86::VPADDUSBZ128rrkz,      X86::VPADDUSBZ128rmkz,      0 },
+    { X86::VPADDUSWZ128rrkz,      X86::VPADDUSWZ128rmkz,      0 },
+    { X86::VPADDWZ128rrkz,        X86::VPADDWZ128rmkz,        0 },
+    { X86::VPALIGNRZ128rrikz,     X86::VPALIGNRZ128rmikz,     0 },
+    { X86::VPANDDZ128rrkz,        X86::VPANDDZ128rmkz,        0 },
+    { X86::VPANDNDZ128rrkz,       X86::VPANDNDZ128rmkz,       0 },
+    { X86::VPANDNQZ128rrkz,       X86::VPANDNQZ128rmkz,       0 },
+    { X86::VPANDQZ128rrkz,        X86::VPANDQZ128rmkz,        0 },
+    { X86::VPERMBZ128rrkz,        X86::VPERMBZ128rmkz,        0 },
+    { X86::VPERMILPDZ128rrkz,     X86::VPERMILPDZ128rmkz,     0 },
+    { X86::VPERMILPSZ128rrkz,     X86::VPERMILPSZ128rmkz,     0 },
+    { X86::VPERMWZ128rrkz,        X86::VPERMWZ128rmkz,        0 },
+    { X86::VPMADDUBSWZ128rrkz,    X86::VPMADDUBSWZ128rmkz,    0 },
+    { X86::VPMADDWDZ128rrkz,      X86::VPMADDWDZ128rmkz,      0 },
+    { X86::VPORDZ128rrkz,         X86::VPORDZ128rmkz,         0 },
+    { X86::VPORQZ128rrkz,         X86::VPORQZ128rmkz,         0 },
+    { X86::VPSHUFBZ128rrkz,       X86::VPSHUFBZ128rmkz,       0 },
+    { X86::VPSUBBZ128rrkz,        X86::VPSUBBZ128rmkz,        0 },
+    { X86::VPSUBDZ128rrkz,        X86::VPSUBDZ128rmkz,        0 },
+    { X86::VPSUBQZ128rrkz,        X86::VPSUBQZ128rmkz,        0 },
+    { X86::VPSUBSBZ128rrkz,       X86::VPSUBSBZ128rmkz,       0 },
+    { X86::VPSUBSWZ128rrkz,       X86::VPSUBSWZ128rmkz,       0 },
+    { X86::VPSUBUSBZ128rrkz,      X86::VPSUBUSBZ128rmkz,      0 },
+    { X86::VPSUBUSWZ128rrkz,      X86::VPSUBUSWZ128rmkz,      0 },
+    { X86::VPSUBWZ128rrkz,        X86::VPSUBWZ128rmkz,        0 },
+    { X86::VPUNPCKHBWZ128rrkz,    X86::VPUNPCKHBWZ128rmkz,    0 },
+    { X86::VPUNPCKHDQZ128rrkz,    X86::VPUNPCKHDQZ128rmkz,    0 },
+    { X86::VPUNPCKHQDQZ128rrkz,   X86::VPUNPCKHQDQZ128rmkz,   0 },
+    { X86::VPUNPCKHWDZ128rrkz,    X86::VPUNPCKHWDZ128rmkz,    0 },
+    { X86::VPUNPCKLBWZ128rrkz,    X86::VPUNPCKLBWZ128rmkz,    0 },
+    { X86::VPUNPCKLDQZ128rrkz,    X86::VPUNPCKLDQZ128rmkz,    0 },
+    { X86::VPUNPCKLQDQZ128rrkz,   X86::VPUNPCKLQDQZ128rmkz,   0 },
+    { X86::VPUNPCKLWDZ128rrkz,    X86::VPUNPCKLWDZ128rmkz,    0 },
+    { X86::VPXORDZ128rrkz,        X86::VPXORDZ128rmkz,        0 },
+    { X86::VPXORQZ128rrkz,        X86::VPXORQZ128rmkz,        0 },
+    { X86::VSUBPDZ128rrkz,        X86::VSUBPDZ128rmkz,        0 },
+    { X86::VSUBPSZ128rrkz,        X86::VSUBPSZ128rmkz,        0 },
+    { X86::VUNPCKHPDZ128rrkz,     X86::VUNPCKHPDZ128rmkz,     0 },
+    { X86::VUNPCKHPSZ128rrkz,     X86::VUNPCKHPSZ128rmkz,     0 },
+    { X86::VUNPCKLPDZ128rrkz,     X86::VUNPCKLPDZ128rmkz,     0 },
+    { X86::VUNPCKLPSZ128rrkz,     X86::VUNPCKLPSZ128rmkz,     0 },
+    { X86::VXORPDZ128rrkz,        X86::VXORPDZ128rmkz,        0 },
+    { X86::VXORPSZ128rrkz,        X86::VXORPSZ128rmkz,        0 },
+
+    // AVX-512 masked foldable instructions
+    { X86::VPERMILPDZrik,         X86::VPERMILPDZmik,         0 },
+    { X86::VPERMILPSZrik,         X86::VPERMILPSZmik,         0 },
+    { X86::VPERMPDZrik,           X86::VPERMPDZmik,           0 },
+    { X86::VPERMQZrik,            X86::VPERMQZmik,            0 },
+    { X86::VPMOVSXBDZrrk,         X86::VPMOVSXBDZrmk,         0 },
+    { X86::VPMOVSXBQZrrk,         X86::VPMOVSXBQZrmk,         TB_NO_REVERSE },
+    { X86::VPMOVSXBWZrrk,         X86::VPMOVSXBWZrmk,         0 },
+    { X86::VPMOVSXDQZrrk,         X86::VPMOVSXDQZrmk,         0 },
+    { X86::VPMOVSXWDZrrk,         X86::VPMOVSXWDZrmk,         0 },
+    { X86::VPMOVSXWQZrrk,         X86::VPMOVSXWQZrmk,         0 },
+    { X86::VPMOVZXBDZrrk,         X86::VPMOVZXBDZrmk,         0 },
+    { X86::VPMOVZXBQZrrk,         X86::VPMOVZXBQZrmk,         TB_NO_REVERSE },
+    { X86::VPMOVZXBWZrrk,         X86::VPMOVZXBWZrmk,         0 },
+    { X86::VPMOVZXDQZrrk,         X86::VPMOVZXDQZrmk,         0 },
+    { X86::VPMOVZXWDZrrk,         X86::VPMOVZXWDZrmk,         0 },
+    { X86::VPMOVZXWQZrrk,         X86::VPMOVZXWQZrmk,         0 },
+    { X86::VPSHUFDZrik,           X86::VPSHUFDZmik,           0 },
+    { X86::VPSHUFHWZrik,          X86::VPSHUFHWZmik,          0 },
+    { X86::VPSHUFLWZrik,          X86::VPSHUFLWZmik,          0 },
+
+    // AVX-512VL 256-bit masked foldable instructions
+    { X86::VPERMILPDZ256rik,      X86::VPERMILPDZ256mik,      0 },
+    { X86::VPERMILPSZ256rik,      X86::VPERMILPSZ256mik,      0 },
+    { X86::VPERMPDZ256rik,        X86::VPERMPDZ256mik,        0 },
+    { X86::VPERMQZ256rik,         X86::VPERMQZ256mik,         0 },
+    { X86::VPMOVSXBDZ256rrk,      X86::VPMOVSXBDZ256rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ256rrk,      X86::VPMOVSXBQZ256rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ256rrk,      X86::VPMOVSXBWZ256rmk,      0 },
+    { X86::VPMOVSXDQZ256rrk,      X86::VPMOVSXDQZ256rmk,      0 },
+    { X86::VPMOVSXWDZ256rrk,      X86::VPMOVSXWDZ256rmk,      0 },
+    { X86::VPMOVSXWQZ256rrk,      X86::VPMOVSXWQZ256rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ256rrk,      X86::VPMOVZXBDZ256rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ256rrk,      X86::VPMOVZXBQZ256rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ256rrk,      X86::VPMOVZXBWZ256rmk,      0 },
+    { X86::VPMOVZXDQZ256rrk,      X86::VPMOVZXDQZ256rmk,      0 },
+    { X86::VPMOVZXWDZ256rrk,      X86::VPMOVZXWDZ256rmk,      0 },
+    { X86::VPMOVZXWQZ256rrk,      X86::VPMOVZXWQZ256rmk,      TB_NO_REVERSE },
+    { X86::VPSHUFDZ256rik,        X86::VPSHUFDZ256mik,        0 },
+    { X86::VPSHUFHWZ256rik,       X86::VPSHUFHWZ256mik,       0 },
+    { X86::VPSHUFLWZ256rik,       X86::VPSHUFLWZ256mik,       0 },
+
+    // AVX-512VL 128-bit masked foldable instructions
+    { X86::VPERMILPDZ128rik,      X86::VPERMILPDZ128mik,      0 },
+    { X86::VPERMILPSZ128rik,      X86::VPERMILPSZ128mik,      0 },
+    { X86::VPMOVSXBDZ128rrk,      X86::VPMOVSXBDZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXBQZ128rrk,      X86::VPMOVSXBQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXBWZ128rrk,      X86::VPMOVSXBWZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXDQZ128rrk,      X86::VPMOVSXDQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXWDZ128rrk,      X86::VPMOVSXWDZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVSXWQZ128rrk,      X86::VPMOVSXWQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBDZ128rrk,      X86::VPMOVZXBDZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBQZ128rrk,      X86::VPMOVZXBQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXBWZ128rrk,      X86::VPMOVZXBWZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXDQZ128rrk,      X86::VPMOVZXDQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXWDZ128rrk,      X86::VPMOVZXWDZ128rmk,      TB_NO_REVERSE },
+    { X86::VPMOVZXWQZ128rrk,      X86::VPMOVZXWQZ128rmk,      TB_NO_REVERSE },
+    { X86::VPSHUFDZ128rik,        X86::VPSHUFDZ128mik,        0 },
+    { X86::VPSHUFHWZ128rik,       X86::VPSHUFHWZ128mik,       0 },
+    { X86::VPSHUFLWZ128rik,       X86::VPSHUFLWZ128mik,       0 },
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
+    AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp,
+                  // Index 3, folded load
+                  Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+  }
+  auto I = X86InstrFMA3Info::rm_begin();
+  auto E = X86InstrFMA3Info::rm_end();
+  for (; I != E; ++I) {
+    if (!I.getGroup()->isKMasked()) {
+      // Intrinsic forms need to pass TB_NO_REVERSE.
+      if (I.getGroup()->isIntrinsic()) {
+        AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                      I.getRegOpcode(), I.getMemOpcode(),
+                      TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+      } else {
+        AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                      I.getRegOpcode(), I.getMemOpcode(),
+                      TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
+      }
+    }
+  }
+
+  static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
+    // AVX-512 foldable masked instructions
+    { X86::VADDPDZrrk,         X86::VADDPDZrmk,           0 },
+    { X86::VADDPSZrrk,         X86::VADDPSZrmk,           0 },
+    { X86::VALIGNDZrrik,       X86::VALIGNDZrmik,         0 },
+    { X86::VALIGNQZrrik,       X86::VALIGNQZrmik,         0 },
+    { X86::VANDNPDZrrk,        X86::VANDNPDZrmk,          0 },
+    { X86::VANDNPSZrrk,        X86::VANDNPSZrmk,          0 },
+    { X86::VANDPDZrrk,         X86::VANDPDZrmk,           0 },
+    { X86::VANDPSZrrk,         X86::VANDPSZrmk,           0 },
+    { X86::VDIVPDZrrk,         X86::VDIVPDZrmk,           0 },
+    { X86::VDIVPSZrrk,         X86::VDIVPSZrmk,           0 },
+    { X86::VINSERTF32x4Zrrk,   X86::VINSERTF32x4Zrmk,     0 },
+    { X86::VINSERTF32x8Zrrk,   X86::VINSERTF32x8Zrmk,     0 },
+    { X86::VINSERTF64x2Zrrk,   X86::VINSERTF64x2Zrmk,     0 },
+    { X86::VINSERTF64x4Zrrk,   X86::VINSERTF64x4Zrmk,     0 },
+    { X86::VINSERTI32x4Zrrk,   X86::VINSERTI32x4Zrmk,     0 },
+    { X86::VINSERTI32x8Zrrk,   X86::VINSERTI32x8Zrmk,     0 },
+    { X86::VINSERTI64x2Zrrk,   X86::VINSERTI64x2Zrmk,     0 },
+    { X86::VINSERTI64x4Zrrk,   X86::VINSERTI64x4Zrmk,     0 },
+    { X86::VMAXCPDZrrk,        X86::VMAXCPDZrmk,          0 },
+    { X86::VMAXCPSZrrk,        X86::VMAXCPSZrmk,          0 },
+    { X86::VMAXPDZrrk,         X86::VMAXPDZrmk,           0 },
+    { X86::VMAXPSZrrk,         X86::VMAXPSZrmk,           0 },
+    { X86::VMINCPDZrrk,        X86::VMINCPDZrmk,          0 },
+    { X86::VMINCPSZrrk,        X86::VMINCPSZrmk,          0 },
+    { X86::VMINPDZrrk,         X86::VMINPDZrmk,           0 },
+    { X86::VMINPSZrrk,         X86::VMINPSZrmk,           0 },
+    { X86::VMULPDZrrk,         X86::VMULPDZrmk,           0 },
+    { X86::VMULPSZrrk,         X86::VMULPSZrmk,           0 },
+    { X86::VORPDZrrk,          X86::VORPDZrmk,            0 },
+    { X86::VORPSZrrk,          X86::VORPSZrmk,            0 },
+    { X86::VPADDBZrrk,         X86::VPADDBZrmk,           0 },
+    { X86::VPADDDZrrk,         X86::VPADDDZrmk,           0 },
+    { X86::VPADDQZrrk,         X86::VPADDQZrmk,           0 },
+    { X86::VPADDSBZrrk,        X86::VPADDSBZrmk,          0 },
+    { X86::VPADDSWZrrk,        X86::VPADDSWZrmk,          0 },
+    { X86::VPADDUSBZrrk,       X86::VPADDUSBZrmk,         0 },
+    { X86::VPADDUSWZrrk,       X86::VPADDUSWZrmk,         0 },
+    { X86::VPADDWZrrk,         X86::VPADDWZrmk,           0 },
+    { X86::VPALIGNRZrrik,      X86::VPALIGNRZrmik,        0 },
+    { X86::VPANDDZrrk,         X86::VPANDDZrmk,           0 },
+    { X86::VPANDNDZrrk,        X86::VPANDNDZrmk,          0 },
+    { X86::VPANDNQZrrk,        X86::VPANDNQZrmk,          0 },
+    { X86::VPANDQZrrk,         X86::VPANDQZrmk,           0 },
+    { X86::VPERMBZrrk,         X86::VPERMBZrmk,           0 },
+    { X86::VPERMDZrrk,         X86::VPERMDZrmk,           0 },
+    { X86::VPERMI2Brrk,        X86::VPERMI2Brmk,          0 },
+    { X86::VPERMI2Drrk,        X86::VPERMI2Drmk,          0 },
+    { X86::VPERMI2PSrrk,       X86::VPERMI2PSrmk,         0 },
+    { X86::VPERMI2PDrrk,       X86::VPERMI2PDrmk,         0 },
+    { X86::VPERMI2Qrrk,        X86::VPERMI2Qrmk,          0 },
+    { X86::VPERMI2Wrrk,        X86::VPERMI2Wrmk,          0 },
+    { X86::VPERMILPDZrrk,      X86::VPERMILPDZrmk,        0 },
+    { X86::VPERMILPSZrrk,      X86::VPERMILPSZrmk,        0 },
+    { X86::VPERMPDZrrk,        X86::VPERMPDZrmk,          0 },
+    { X86::VPERMPSZrrk,        X86::VPERMPSZrmk,          0 },
+    { X86::VPERMQZrrk,         X86::VPERMQZrmk,           0 },
+    { X86::VPERMT2Brrk,        X86::VPERMT2Brmk,          0 },
+    { X86::VPERMT2Drrk,        X86::VPERMT2Drmk,          0 },
+    { X86::VPERMT2PSrrk,       X86::VPERMT2PSrmk,         0 },
+    { X86::VPERMT2PDrrk,       X86::VPERMT2PDrmk,         0 },
+    { X86::VPERMT2Qrrk,        X86::VPERMT2Qrmk,          0 },
+    { X86::VPERMT2Wrrk,        X86::VPERMT2Wrmk,          0 },
+    { X86::VPERMWZrrk,         X86::VPERMWZrmk,           0 },
+    { X86::VPMADDUBSWZrrk,     X86::VPMADDUBSWZrmk,       0 },
+    { X86::VPMADDWDZrrk,       X86::VPMADDWDZrmk,         0 },
+    { X86::VPORDZrrk,          X86::VPORDZrmk,            0 },
+    { X86::VPORQZrrk,          X86::VPORQZrmk,            0 },
+    { X86::VPSHUFBZrrk,        X86::VPSHUFBZrmk,          0 },
+    { X86::VPSUBBZrrk,         X86::VPSUBBZrmk,           0 },
+    { X86::VPSUBDZrrk,         X86::VPSUBDZrmk,           0 },
+    { X86::VPSUBQZrrk,         X86::VPSUBQZrmk,           0 },
+    { X86::VPSUBSBZrrk,        X86::VPSUBSBZrmk,          0 },
+    { X86::VPSUBSWZrrk,        X86::VPSUBSWZrmk,          0 },
+    { X86::VPSUBUSBZrrk,       X86::VPSUBUSBZrmk,         0 },
+    { X86::VPSUBUSWZrrk,       X86::VPSUBUSWZrmk,         0 },
+    { X86::VPTERNLOGDZrrik,    X86::VPTERNLOGDZrmik,      0 },
+    { X86::VPTERNLOGQZrrik,    X86::VPTERNLOGQZrmik,      0 },
+    { X86::VPUNPCKHBWZrrk,     X86::VPUNPCKHBWZrmk,       0 },
+    { X86::VPUNPCKHDQZrrk,     X86::VPUNPCKHDQZrmk,       0 },
+    { X86::VPUNPCKHQDQZrrk,    X86::VPUNPCKHQDQZrmk,      0 },
+    { X86::VPUNPCKHWDZrrk,     X86::VPUNPCKHWDZrmk,       0 },
+    { X86::VPUNPCKLBWZrrk,     X86::VPUNPCKLBWZrmk,       0 },
+    { X86::VPUNPCKLDQZrrk,     X86::VPUNPCKLDQZrmk,       0 },
+    { X86::VPUNPCKLQDQZrrk,    X86::VPUNPCKLQDQZrmk,      0 },
+    { X86::VPUNPCKLWDZrrk,     X86::VPUNPCKLWDZrmk,       0 },
+    { X86::VPXORDZrrk,         X86::VPXORDZrmk,           0 },
+    { X86::VPXORQZrrk,         X86::VPXORQZrmk,           0 },
+    { X86::VSUBPDZrrk,         X86::VSUBPDZrmk,           0 },
+    { X86::VSUBPSZrrk,         X86::VSUBPSZrmk,           0 },
+    { X86::VUNPCKHPDZrrk,      X86::VUNPCKHPDZrmk,        0 },
+    { X86::VUNPCKHPSZrrk,      X86::VUNPCKHPSZrmk,        0 },
+    { X86::VUNPCKLPDZrrk,      X86::VUNPCKLPDZrmk,        0 },
+    { X86::VUNPCKLPSZrrk,      X86::VUNPCKLPSZrmk,        0 },
+    { X86::VXORPDZrrk,         X86::VXORPDZrmk,           0 },
+    { X86::VXORPSZrrk,         X86::VXORPSZrmk,           0 },
+
+    // AVX-512{F,VL} foldable masked instructions 256-bit
+    { X86::VADDPDZ256rrk,      X86::VADDPDZ256rmk,        0 },
+    { X86::VADDPSZ256rrk,      X86::VADDPSZ256rmk,        0 },
+    { X86::VALIGNDZ256rrik,    X86::VALIGNDZ256rmik,      0 },
+    { X86::VALIGNQZ256rrik,    X86::VALIGNQZ256rmik,      0 },
+    { X86::VANDNPDZ256rrk,     X86::VANDNPDZ256rmk,       0 },
+    { X86::VANDNPSZ256rrk,     X86::VANDNPSZ256rmk,       0 },
+    { X86::VANDPDZ256rrk,      X86::VANDPDZ256rmk,        0 },
+    { X86::VANDPSZ256rrk,      X86::VANDPSZ256rmk,        0 },
+    { X86::VDIVPDZ256rrk,      X86::VDIVPDZ256rmk,        0 },
+    { X86::VDIVPSZ256rrk,      X86::VDIVPSZ256rmk,        0 },
+    { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk,  0 },
+    { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk,  0 },
+    { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk,  0 },
+    { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk,  0 },
+    { X86::VMAXCPDZ256rrk,     X86::VMAXCPDZ256rmk,       0 },
+    { X86::VMAXCPSZ256rrk,     X86::VMAXCPSZ256rmk,       0 },
+    { X86::VMAXPDZ256rrk,      X86::VMAXPDZ256rmk,        0 },
+    { X86::VMAXPSZ256rrk,      X86::VMAXPSZ256rmk,        0 },
+    { X86::VMINCPDZ256rrk,     X86::VMINCPDZ256rmk,       0 },
+    { X86::VMINCPSZ256rrk,     X86::VMINCPSZ256rmk,       0 },
+    { X86::VMINPDZ256rrk,      X86::VMINPDZ256rmk,        0 },
+    { X86::VMINPSZ256rrk,      X86::VMINPSZ256rmk,        0 },
+    { X86::VMULPDZ256rrk,      X86::VMULPDZ256rmk,        0 },
+    { X86::VMULPSZ256rrk,      X86::VMULPSZ256rmk,        0 },
+    { X86::VORPDZ256rrk,       X86::VORPDZ256rmk,         0 },
+    { X86::VORPSZ256rrk,       X86::VORPSZ256rmk,         0 },
+    { X86::VPADDBZ256rrk,      X86::VPADDBZ256rmk,        0 },
+    { X86::VPADDDZ256rrk,      X86::VPADDDZ256rmk,        0 },
+    { X86::VPADDQZ256rrk,      X86::VPADDQZ256rmk,        0 },
+    { X86::VPADDSBZ256rrk,     X86::VPADDSBZ256rmk,       0 },
+    { X86::VPADDSWZ256rrk,     X86::VPADDSWZ256rmk,       0 },
+    { X86::VPADDUSBZ256rrk,    X86::VPADDUSBZ256rmk,      0 },
+    { X86::VPADDUSWZ256rrk,    X86::VPADDUSWZ256rmk,      0 },
+    { X86::VPADDWZ256rrk,      X86::VPADDWZ256rmk,        0 },
+    { X86::VPALIGNRZ256rrik,   X86::VPALIGNRZ256rmik,     0 },
+    { X86::VPANDDZ256rrk,      X86::VPANDDZ256rmk,        0 },
+    { X86::VPANDNDZ256rrk,     X86::VPANDNDZ256rmk,       0 },
+    { X86::VPANDNQZ256rrk,     X86::VPANDNQZ256rmk,       0 },
+    { X86::VPANDQZ256rrk,      X86::VPANDQZ256rmk,        0 },
+    { X86::VPERMBZ256rrk,      X86::VPERMBZ256rmk,        0 },
+    { X86::VPERMDZ256rrk,      X86::VPERMDZ256rmk,        0 },
+    { X86::VPERMI2B256rrk,     X86::VPERMI2B256rmk,       0 },
+    { X86::VPERMI2D256rrk,     X86::VPERMI2D256rmk,       0 },
+    { X86::VPERMI2PD256rrk,    X86::VPERMI2PD256rmk,      0 },
+    { X86::VPERMI2PS256rrk,    X86::VPERMI2PS256rmk,      0 },
+    { X86::VPERMI2Q256rrk,     X86::VPERMI2Q256rmk,       0 },
+    { X86::VPERMI2W256rrk,     X86::VPERMI2W256rmk,       0 },
+    { X86::VPERMILPDZ256rrk,   X86::VPERMILPDZ256rmk,     0 },
+    { X86::VPERMILPSZ256rrk,   X86::VPERMILPSZ256rmk,     0 },
+    { X86::VPERMPDZ256rrk,     X86::VPERMPDZ256rmk,       0 },
+    { X86::VPERMPSZ256rrk,     X86::VPERMPSZ256rmk,       0 },
+    { X86::VPERMQZ256rrk,      X86::VPERMQZ256rmk,        0 },
+    { X86::VPERMT2B256rrk,     X86::VPERMT2B256rmk,       0 },
+    { X86::VPERMT2D256rrk,     X86::VPERMT2D256rmk,       0 },
+    { X86::VPERMT2PD256rrk,    X86::VPERMT2PD256rmk,      0 },
+    { X86::VPERMT2PS256rrk,    X86::VPERMT2PS256rmk,      0 },
+    { X86::VPERMT2Q256rrk,     X86::VPERMT2Q256rmk,       0 },
+    { X86::VPERMT2W256rrk,     X86::VPERMT2W256rmk,       0 },
+    { X86::VPERMWZ256rrk,      X86::VPERMWZ256rmk,        0 },
+    { X86::VPMADDUBSWZ256rrk,  X86::VPMADDUBSWZ256rmk,    0 },
+    { X86::VPMADDWDZ256rrk,    X86::VPMADDWDZ256rmk,      0 },
+    { X86::VPORDZ256rrk,       X86::VPORDZ256rmk,         0 },
+    { X86::VPORQZ256rrk,       X86::VPORQZ256rmk,         0 },
+    { X86::VPSHUFBZ256rrk,     X86::VPSHUFBZ256rmk,       0 },
+    { X86::VPSUBBZ256rrk,      X86::VPSUBBZ256rmk,        0 },
+    { X86::VPSUBDZ256rrk,      X86::VPSUBDZ256rmk,        0 },
+    { X86::VPSUBQZ256rrk,      X86::VPSUBQZ256rmk,        0 },
+    { X86::VPSUBSBZ256rrk,     X86::VPSUBSBZ256rmk,       0 },
+    { X86::VPSUBSWZ256rrk,     X86::VPSUBSWZ256rmk,       0 },
+    { X86::VPSUBUSBZ256rrk,    X86::VPSUBUSBZ256rmk,      0 },
+    { X86::VPSUBUSWZ256rrk,    X86::VPSUBUSWZ256rmk,      0 },
+    { X86::VPSUBWZ256rrk,      X86::VPSUBWZ256rmk,        0 },
+    { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik,   0 },
+    { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik,   0 },
+    { X86::VPUNPCKHBWZ256rrk,  X86::VPUNPCKHBWZ256rmk,    0 },
+    { X86::VPUNPCKHDQZ256rrk,  X86::VPUNPCKHDQZ256rmk,    0 },
+    { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk,   0 },
+    { X86::VPUNPCKHWDZ256rrk,  X86::VPUNPCKHWDZ256rmk,    0 },
+    { X86::VPUNPCKLBWZ256rrk,  X86::VPUNPCKLBWZ256rmk,    0 },
+    { X86::VPUNPCKLDQZ256rrk,  X86::VPUNPCKLDQZ256rmk,    0 },
+    { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk,   0 },
+    { X86::VPUNPCKLWDZ256rrk,  X86::VPUNPCKLWDZ256rmk,    0 },
+    { X86::VPXORDZ256rrk,      X86::VPXORDZ256rmk,        0 },
+    { X86::VPXORQZ256rrk,      X86::VPXORQZ256rmk,        0 },
+    { X86::VSUBPDZ256rrk,      X86::VSUBPDZ256rmk,        0 },
+    { X86::VSUBPSZ256rrk,      X86::VSUBPSZ256rmk,        0 },
+    { X86::VUNPCKHPDZ256rrk,   X86::VUNPCKHPDZ256rmk,     0 },
+    { X86::VUNPCKHPSZ256rrk,   X86::VUNPCKHPSZ256rmk,     0 },
+    { X86::VUNPCKLPDZ256rrk,   X86::VUNPCKLPDZ256rmk,     0 },
+    { X86::VUNPCKLPSZ256rrk,   X86::VUNPCKLPSZ256rmk,     0 },
+    { X86::VXORPDZ256rrk,      X86::VXORPDZ256rmk,        0 },
+    { X86::VXORPSZ256rrk,      X86::VXORPSZ256rmk,        0 },
+
+    // AVX-512{F,VL} foldable instructions 128-bit
+    { X86::VADDPDZ128rrk,      X86::VADDPDZ128rmk,        0 },
+    { X86::VADDPSZ128rrk,      X86::VADDPSZ128rmk,        0 },
+    { X86::VALIGNDZ128rrik,    X86::VALIGNDZ128rmik,      0 },
+    { X86::VALIGNQZ128rrik,    X86::VALIGNQZ128rmik,      0 },
+    { X86::VANDNPDZ128rrk,     X86::VANDNPDZ128rmk,       0 },
+    { X86::VANDNPSZ128rrk,     X86::VANDNPSZ128rmk,       0 },
+    { X86::VANDPDZ128rrk,      X86::VANDPDZ128rmk,        0 },
+    { X86::VANDPSZ128rrk,      X86::VANDPSZ128rmk,        0 },
+    { X86::VDIVPDZ128rrk,      X86::VDIVPDZ128rmk,        0 },
+    { X86::VDIVPSZ128rrk,      X86::VDIVPSZ128rmk,        0 },
+    { X86::VMAXCPDZ128rrk,     X86::VMAXCPDZ128rmk,       0 },
+    { X86::VMAXCPSZ128rrk,     X86::VMAXCPSZ128rmk,       0 },
+    { X86::VMAXPDZ128rrk,      X86::VMAXPDZ128rmk,        0 },
+    { X86::VMAXPSZ128rrk,      X86::VMAXPSZ128rmk,        0 },
+    { X86::VMINCPDZ128rrk,     X86::VMINCPDZ128rmk,       0 },
+    { X86::VMINCPSZ128rrk,     X86::VMINCPSZ128rmk,       0 },
+    { X86::VMINPDZ128rrk,      X86::VMINPDZ128rmk,        0 },
+    { X86::VMINPSZ128rrk,      X86::VMINPSZ128rmk,        0 },
+    { X86::VMULPDZ128rrk,      X86::VMULPDZ128rmk,        0 },
+    { X86::VMULPSZ128rrk,      X86::VMULPSZ128rmk,        0 },
+    { X86::VORPDZ128rrk,       X86::VORPDZ128rmk,         0 },
+    { X86::VORPSZ128rrk,       X86::VORPSZ128rmk,         0 },
+    { X86::VPADDBZ128rrk,      X86::VPADDBZ128rmk,        0 },
+    { X86::VPADDDZ128rrk,      X86::VPADDDZ128rmk,        0 },
+    { X86::VPADDQZ128rrk,      X86::VPADDQZ128rmk,        0 },
+    { X86::VPADDSBZ128rrk,     X86::VPADDSBZ128rmk,       0 },
+    { X86::VPADDSWZ128rrk,     X86::VPADDSWZ128rmk,       0 },
+    { X86::VPADDUSBZ128rrk,    X86::VPADDUSBZ128rmk,      0 },
+    { X86::VPADDUSWZ128rrk,    X86::VPADDUSWZ128rmk,      0 },
+    { X86::VPADDWZ128rrk,      X86::VPADDWZ128rmk,        0 },
+    { X86::VPALIGNRZ128rrik,   X86::VPALIGNRZ128rmik,     0 },
+    { X86::VPANDDZ128rrk,      X86::VPANDDZ128rmk,        0 },
+    { X86::VPANDNDZ128rrk,     X86::VPANDNDZ128rmk,       0 },
+    { X86::VPANDNQZ128rrk,     X86::VPANDNQZ128rmk,       0 },
+    { X86::VPANDQZ128rrk,      X86::VPANDQZ128rmk,        0 },
+    { X86::VPERMBZ128rrk,      X86::VPERMBZ128rmk,        0 },
+    { X86::VPERMI2B128rrk,     X86::VPERMI2B128rmk,       0 },
+    { X86::VPERMI2D128rrk,     X86::VPERMI2D128rmk,       0 },
+    { X86::VPERMI2PD128rrk,    X86::VPERMI2PD128rmk,      0 },
+    { X86::VPERMI2PS128rrk,    X86::VPERMI2PS128rmk,      0 },
+    { X86::VPERMI2Q128rrk,     X86::VPERMI2Q128rmk,       0 },
+    { X86::VPERMI2W128rrk,     X86::VPERMI2W128rmk,       0 },
+    { X86::VPERMILPDZ128rrk,   X86::VPERMILPDZ128rmk,     0 },
+    { X86::VPERMILPSZ128rrk,   X86::VPERMILPSZ128rmk,     0 },
+    { X86::VPERMT2B128rrk,     X86::VPERMT2B128rmk,       0 },
+    { X86::VPERMT2D128rrk,     X86::VPERMT2D128rmk,       0 },
+    { X86::VPERMT2PD128rrk,    X86::VPERMT2PD128rmk,      0 },
+    { X86::VPERMT2PS128rrk,    X86::VPERMT2PS128rmk,      0 },
+    { X86::VPERMT2Q128rrk,     X86::VPERMT2Q128rmk,       0 },
+    { X86::VPERMT2W128rrk,     X86::VPERMT2W128rmk,       0 },
+    { X86::VPERMWZ128rrk,      X86::VPERMWZ128rmk,        0 },
+    { X86::VPMADDUBSWZ128rrk,  X86::VPMADDUBSWZ128rmk,    0 },
+    { X86::VPMADDWDZ128rrk,    X86::VPMADDWDZ128rmk,      0 },
+    { X86::VPORDZ128rrk,       X86::VPORDZ128rmk,         0 },
+    { X86::VPORQZ128rrk,       X86::VPORQZ128rmk,         0 },
+    { X86::VPSHUFBZ128rrk,     X86::VPSHUFBZ128rmk,       0 },
+    { X86::VPSUBBZ128rrk,      X86::VPSUBBZ128rmk,        0 },
+    { X86::VPSUBDZ128rrk,      X86::VPSUBDZ128rmk,        0 },
+    { X86::VPSUBQZ128rrk,      X86::VPSUBQZ128rmk,        0 },
+    { X86::VPSUBSBZ128rrk,     X86::VPSUBSBZ128rmk,       0 },
+    { X86::VPSUBSWZ128rrk,     X86::VPSUBSWZ128rmk,       0 },
+    { X86::VPSUBUSBZ128rrk,    X86::VPSUBUSBZ128rmk,      0 },
+    { X86::VPSUBUSWZ128rrk,    X86::VPSUBUSWZ128rmk,      0 },
+    { X86::VPSUBWZ128rrk,      X86::VPSUBWZ128rmk,        0 },
+    { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik,   0 },
+    { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik,   0 },
+    { X86::VPUNPCKHBWZ128rrk,  X86::VPUNPCKHBWZ128rmk,    0 },
+    { X86::VPUNPCKHDQZ128rrk,  X86::VPUNPCKHDQZ128rmk,    0 },
+    { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk,   0 },
+    { X86::VPUNPCKHWDZ128rrk,  X86::VPUNPCKHWDZ128rmk,    0 },
+    { X86::VPUNPCKLBWZ128rrk,  X86::VPUNPCKLBWZ128rmk,    0 },
+    { X86::VPUNPCKLDQZ128rrk,  X86::VPUNPCKLDQZ128rmk,    0 },
+    { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk,   0 },
+    { X86::VPUNPCKLWDZ128rrk,  X86::VPUNPCKLWDZ128rmk,    0 },
+    { X86::VPXORDZ128rrk,      X86::VPXORDZ128rmk,        0 },
+    { X86::VPXORQZ128rrk,      X86::VPXORQZ128rmk,        0 },
+    { X86::VSUBPDZ128rrk,      X86::VSUBPDZ128rmk,        0 },
+    { X86::VSUBPSZ128rrk,      X86::VSUBPSZ128rmk,        0 },
+    { X86::VUNPCKHPDZ128rrk,   X86::VUNPCKHPDZ128rmk,     0 },
+    { X86::VUNPCKHPSZ128rrk,   X86::VUNPCKHPSZ128rmk,     0 },
+    { X86::VUNPCKLPDZ128rrk,   X86::VUNPCKLPDZ128rmk,     0 },
+    { X86::VUNPCKLPSZ128rrk,   X86::VUNPCKLPSZ128rmk,     0 },
+    { X86::VXORPDZ128rrk,      X86::VXORPDZ128rmk,        0 },
+    { X86::VXORPSZ128rrk,      X86::VXORPSZ128rmk,        0 },
+
+    // 512-bit three source instructions with zero masking.
+    { X86::VPERMI2Brrkz,       X86::VPERMI2Brmkz,         0 },
+    { X86::VPERMI2Drrkz,       X86::VPERMI2Drmkz,         0 },
+    { X86::VPERMI2PSrrkz,      X86::VPERMI2PSrmkz,        0 },
+    { X86::VPERMI2PDrrkz,      X86::VPERMI2PDrmkz,        0 },
+    { X86::VPERMI2Qrrkz,       X86::VPERMI2Qrmkz,         0 },
+    { X86::VPERMI2Wrrkz,       X86::VPERMI2Wrmkz,         0 },
+    { X86::VPERMT2Brrkz,       X86::VPERMT2Brmkz,         0 },
+    { X86::VPERMT2Drrkz,       X86::VPERMT2Drmkz,         0 },
+    { X86::VPERMT2PSrrkz,      X86::VPERMT2PSrmkz,        0 },
+    { X86::VPERMT2PDrrkz,      X86::VPERMT2PDrmkz,        0 },
+    { X86::VPERMT2Qrrkz,       X86::VPERMT2Qrmkz,         0 },
+    { X86::VPERMT2Wrrkz,       X86::VPERMT2Wrmkz,         0 },
+    { X86::VPTERNLOGDZrrikz,   X86::VPTERNLOGDZrmikz,     0 },
+    { X86::VPTERNLOGQZrrikz,   X86::VPTERNLOGQZrmikz,     0 },
+
+    // 256-bit three source instructions with zero masking.
+    { X86::VPERMI2B256rrkz,    X86::VPERMI2B256rmkz,      0 },
+    { X86::VPERMI2D256rrkz,    X86::VPERMI2D256rmkz,      0 },
+    { X86::VPERMI2PD256rrkz,   X86::VPERMI2PD256rmkz,     0 },
+    { X86::VPERMI2PS256rrkz,   X86::VPERMI2PS256rmkz,     0 },
+    { X86::VPERMI2Q256rrkz,    X86::VPERMI2Q256rmkz,      0 },
+    { X86::VPERMI2W256rrkz,    X86::VPERMI2W256rmkz,      0 },
+    { X86::VPERMT2B256rrkz,    X86::VPERMT2B256rmkz,      0 },
+    { X86::VPERMT2D256rrkz,    X86::VPERMT2D256rmkz,      0 },
+    { X86::VPERMT2PD256rrkz,   X86::VPERMT2PD256rmkz,     0 },
+    { X86::VPERMT2PS256rrkz,   X86::VPERMT2PS256rmkz,     0 },
+    { X86::VPERMT2Q256rrkz,    X86::VPERMT2Q256rmkz,      0 },
+    { X86::VPERMT2W256rrkz,    X86::VPERMT2W256rmkz,      0 },
+    { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz,  0 },
+    { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz,  0 },
+
+    // 128-bit three source instructions with zero masking.
+    { X86::VPERMI2B128rrkz,    X86::VPERMI2B128rmkz,      0 },
+    { X86::VPERMI2D128rrkz,    X86::VPERMI2D128rmkz,      0 },
+    { X86::VPERMI2PD128rrkz,   X86::VPERMI2PD128rmkz,     0 },
+    { X86::VPERMI2PS128rrkz,   X86::VPERMI2PS128rmkz,     0 },
+    { X86::VPERMI2Q128rrkz,    X86::VPERMI2Q128rmkz,      0 },
+    { X86::VPERMI2W128rrkz,    X86::VPERMI2W128rmkz,      0 },
+    { X86::VPERMT2B128rrkz,    X86::VPERMT2B128rmkz,      0 },
+    { X86::VPERMT2D128rrkz,    X86::VPERMT2D128rmkz,      0 },
+    { X86::VPERMT2PD128rrkz,   X86::VPERMT2PD128rmkz,     0 },
+    { X86::VPERMT2PS128rrkz,   X86::VPERMT2PS128rmkz,     0 },
+    { X86::VPERMT2Q128rrkz,    X86::VPERMT2Q128rmkz,      0 },
+    { X86::VPERMT2W128rrkz,    X86::VPERMT2W128rmkz,      0 },
+    { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz,  0 },
+    { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz,  0 },
+  };
+
+  for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
+    AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                  Entry.RegOp, Entry.MemOp,
+                  // Index 4, folded load
+                  Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+  }
+  for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
+    if (I.getGroup()->isKMasked()) {
+      // Intrinsics need to pass TB_NO_REVERSE.
+      if (I.getGroup()->isIntrinsic()) {
+        AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                      I.getRegOpcode(), I.getMemOpcode(),
+                      TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
+      } else {
+        AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                      I.getRegOpcode(), I.getMemOpcode(),
+                      TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
+      }
+    }
+  }
+}
+
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
+  if ((Flags & TB_NO_FORWARD) == 0) {
+    assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+    R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+  }
+  if ((Flags & TB_NO_REVERSE) == 0) {
+    assert(!M2RTable.count(MemOp) &&
+         "Duplicated entries in unfolding maps?");
+    M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+  }
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg, unsigned &DstReg,
+                                    unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::MOVSX16rr8:
+  case X86::MOVZX16rr8:
+  case X86::MOVSX32rr8:
+  case X86::MOVZX32rr8:
+  case X86::MOVSX64rr8:
+    if (!Subtarget.is64Bit())
+      // It's not always legal to reference the low 8-bit of the larger
+      // register in 32-bit mode.
+      return false;
+  case X86::MOVSX32rr16:
+  case X86::MOVZX32rr16:
+  case X86::MOVSX64rr16:
+  case X86::MOVSX64rr32: {
+    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+      // Be conservative.
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSX16rr8:
+    case X86::MOVZX16rr8:
+    case X86::MOVSX32rr8:
+    case X86::MOVZX32rr8:
+    case X86::MOVSX64rr8:
+      SubIdx = X86::sub_8bit;
+      break;
+    case X86::MOVSX32rr16:
+    case X86::MOVZX32rr16:
+    case X86::MOVSX64rr16:
+      SubIdx = X86::sub_16bit;
+      break;
+    case X86::MOVSX64rr32:
+      SubIdx = X86::sub_32bit;
+      break;
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+  if (MI.getOpcode() == getCallFrameSetupOpcode() ||
+      MI.getOpcode() == getCallFrameDestroyOpcode()) {
+    unsigned StackAlign = TFI->getStackAlignment();
+    int SPAdj =
+        (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
+
+    SPAdj -= MI.getOperand(1).getImm();
+
+    if (MI.getOpcode() == getCallFrameSetupOpcode())
+      return SPAdj;
+    else
+      return -SPAdj;
+  }
+
+  // To know whether a call adjusts the stack, we need information
+  // that is bound to the following ADJCALLSTACKUP pseudo.
+  // Look for the next ADJCALLSTACKUP that follows the call.
+  if (MI.isCall()) {
+    const MachineBasicBlock *MBB = MI.getParent();
+    auto I = ++MachineBasicBlock::const_iterator(MI);
+    for (auto E = MBB->end(); I != E; ++I) {
+      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+          I->isCall())
+        break;
+    }
+
+    // If we could not find a frame destroy opcode, then it has already
+    // been simplified, so we don't care.
+    if (I->getOpcode() != getCallFrameDestroyOpcode())
+      return 0;
+
+    return -(I->getOperand(1).getImm());
+  }
+
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences
+  switch (MI.getOpcode()) {
+  default:
+    return 0;
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+    return 4;
+  case X86::PUSH64i8:
+  case X86::PUSH64r:
+  case X86::PUSH64rmm:
+  case X86::PUSH64rmr:
+  case X86::PUSH64i32:
+    return 8;
+  }
+}
+
+/// Return true and the FrameIndex if the specified
+/// operand and follow operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
+                                  int &FrameIndex) const {
+  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
+      MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
+      MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+      MI.getOperand(Op + X86::AddrDisp).isImm() &&
+      MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
+      MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
+      MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
+    FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
+    return true;
+  }
+  return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQUYrm:
+  case X86::VMOVDQAYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::VMOVSSZrm:
+  case X86::VMOVSDZrm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU64Zrm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::KMOVBkm:
+  case X86::KMOVWkm:
+  case X86::KMOVDkm:
+  case X86::KMOVQkm:
+    return true;
+  }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV32mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVUPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVUPDmr:
+  case X86::MOVDQAmr:
+  case X86::MOVDQUmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVUPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVUPDmr:
+  case X86::VMOVDQAmr:
+  case X86::VMOVDQUmr:
+  case X86::VMOVUPSYmr:
+  case X86::VMOVAPSYmr:
+  case X86::VMOVUPDYmr:
+  case X86::VMOVAPDYmr:
+  case X86::VMOVDQUYmr:
+  case X86::VMOVDQAYmr:
+  case X86::VMOVSSZmr:
+  case X86::VMOVSDZmr:
+  case X86::VMOVUPSZmr:
+  case X86::VMOVUPSZ128mr:
+  case X86::VMOVUPSZ256mr:
+  case X86::VMOVUPSZ128mr_NOVLX:
+  case X86::VMOVUPSZ256mr_NOVLX:
+  case X86::VMOVAPSZmr:
+  case X86::VMOVAPSZ128mr:
+  case X86::VMOVAPSZ256mr:
+  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVAPSZ256mr_NOVLX:
+  case X86::VMOVUPDZmr:
+  case X86::VMOVUPDZ128mr:
+  case X86::VMOVUPDZ256mr:
+  case X86::VMOVAPDZmr:
+  case X86::VMOVAPDZ128mr:
+  case X86::VMOVAPDZ256mr:
+  case X86::VMOVDQA32Zmr:
+  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQA32Z256mr:
+  case X86::VMOVDQU32Zmr:
+  case X86::VMOVDQU32Z128mr:
+  case X86::VMOVDQU32Z256mr:
+  case X86::VMOVDQA64Zmr:
+  case X86::VMOVDQA64Z128mr:
+  case X86::VMOVDQA64Z256mr:
+  case X86::VMOVDQU64Zmr:
+  case X86::VMOVDQU64Z128mr:
+  case X86::VMOVDQU64Z256mr:
+  case X86::VMOVDQU8Zmr:
+  case X86::VMOVDQU8Z128mr:
+  case X86::VMOVDQU8Z256mr:
+  case X86::VMOVDQU16Zmr:
+  case X86::VMOVDQU16Z128mr:
+  case X86::VMOVDQU16Z256mr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVBmk:
+  case X86::KMOVWmk:
+  case X86::KMOVDmk:
+  case X86::KMOVQmk:
+    return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI.getOpcode()))
+    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+      return MI.getOperand(0).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                                 int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI.getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI.getOpcode()))
+    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex))
+      return MI.getOperand(X86::AddrNumOperands).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                                int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI.getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+  // Don't waste compile time scanning use-def chains of physregs.
+  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+    return false;
+  bool isPICBase = false;
+  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
+         E = MRI.def_instr_end(); I != E; ++I) {
+    MachineInstr *DefMI = &*I;
+    if (DefMI->getOpcode() != X86::MOVPC32r)
+      return false;
+    assert(!isPICBase && "More than one PIC base?");
+    isPICBase = true;
+  }
+  return isPICBase;
+}
+
+bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+                                                     AliasAnalysis *AA) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV8rm_NOREX:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  // AVX-512
+  case X86::VMOVSSZrm:
+  case X86::VMOVSDZrm:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVDQU64Zrm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVUPSZrm: {
+    // Loads from constant pools are trivially rematerializable.
+    if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
+        MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+        MI.isDereferenceableInvariantLoad(AA)) {
+      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+      if (BaseReg == 0 || BaseReg == X86::RIP)
+        return true;
+      // Allow re-materialization of PIC load.
+      if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
+        return false;
+      const MachineFunction &MF = *MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+
+  case X86::LEA32r:
+  case X86::LEA64r: {
+    if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
+        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
+        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
+        !MI.getOperand(1 + X86::AddrDisp).isReg()) {
+      // lea fi#, lea GV, etc. are all rematerializable.
+      if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
+        return true;
+      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+      if (BaseReg == 0)
+        return true;
+      // Allow re-materialization of lea PICBase + x.
+      const MachineFunction &MF = *MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+  }
+
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I) const {
+  MachineBasicBlock::iterator E = MBB.end();
+
+  // For compile time consideration, if we are not able to determine the
+  // safety after visiting 4 instructions in each direction, we will assume
+  // it's not safe.
+  MachineBasicBlock::iterator Iter = I;
+  for (unsigned i = 0; Iter != E && i < 4; ++i) {
+    bool SeenDef = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SeenDef = true;
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == X86::EFLAGS) {
+        if (MO.isUse())
+          return false;
+        SeenDef = true;
+      }
+    }
+
+    if (SeenDef)
+      // This instruction defines EFLAGS, no need to look any further.
+      return true;
+    ++Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != E && Iter->isDebugValue())
+      ++Iter;
+  }
+
+  // It is safe to clobber EFLAGS at the end of a block of no successor has it
+  // live in.
+  if (Iter == E) {
+    for (MachineBasicBlock *S : MBB.successors())
+      if (S->isLiveIn(X86::EFLAGS))
+        return false;
+    return true;
+  }
+
+  MachineBasicBlock::iterator B = MBB.begin();
+  Iter = I;
+  for (unsigned i = 0; i < 4; ++i) {
+    // If we make it to the beginning of the block, it's safe to clobber
+    // EFLAGS iff EFLAGS is not live-in.
+    if (Iter == B)
+      return !MBB.isLiveIn(X86::EFLAGS);
+
+    --Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != B && Iter->isDebugValue())
+      --Iter;
+
+    bool SawKill = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      // A register mask may clobber EFLAGS, but we should still look for a
+      // live EFLAGS def.
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SawKill = true;
+      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+        if (MO.isDef()) return MO.isDead();
+        if (MO.isKill()) SawKill = true;
+      }
+    }
+
+    if (SawKill)
+      // This instruction kills EFLAGS and doesn't redefine it, so
+      // there's no need to look further.
+      return true;
+  }
+
+  // Conservative answer.
+  return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 const MachineInstr &Orig,
+                                 const TargetRegisterInfo &TRI) const {
+  bool ClobbersEFLAGS = false;
+  for (const MachineOperand &MO : Orig.operands()) {
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+      ClobbersEFLAGS = true;
+      break;
+    }
+  }
+
+  if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
+    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
+    // effects.
+    int Value;
+    switch (Orig.getOpcode()) {
+    case X86::MOV32r0:  Value = 0; break;
+    case X86::MOV32r1:  Value = 1; break;
+    case X86::MOV32r_1: Value = -1; break;
+    default:
+      llvm_unreachable("Unexpected instruction!");
+    }
+
+    const DebugLoc &DL = Orig.getDebugLoc();
+    BuildMI(MBB, I, DL, get(X86::MOV32ri))
+        .addOperand(Orig.getOperand(0))
+        .addImm(Value);
+  } else {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+    MBB.insert(I, MI);
+  }
+
+  MachineInstr &NewMI = *std::prev(I);
+  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Check whether the shift count for a machine operand is non-zero.
+inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
+                                              unsigned ShiftAmtOperandIdx) {
+  // The shift count is six bits with the REX.W prefix and five bits without.
+  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
+  return Imm & ShiftCountMask;
+}
+
+/// Check whether the given shift count is appropriate
+/// can be represented by a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+  // Left shift instructions can be transformed into load-effective-address
+  // instructions if we can encode them appropriately.
+  // A LEA instruction utilizes a SIB byte to encode its scale factor.
+  // The SIB.scale field is two bits wide which means that we can encode any
+  // shift amount less than 4.
+  return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+                                  unsigned Opc, bool AllowSP, unsigned &NewSrc,
+                                  bool &isKill, bool &isUndef,
+                                  MachineOperand &ImplicitOp,
+                                  LiveVariables *LV) const {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetRegisterClass *RC;
+  if (AllowSP) {
+    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+  } else {
+    RC = Opc != X86::LEA32r ?
+      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+  }
+  unsigned SrcReg = Src.getReg();
+
+  // For both LEA64 and LEA32 the register already has essentially the right
+  // type (32-bit or 64-bit) we may just need to forbid SP.
+  if (Opc != X86::LEA64_32r) {
+    NewSrc = SrcReg;
+    isKill = Src.isKill();
+    isUndef = Src.isUndef();
+
+    if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+        !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+      return false;
+
+    return true;
+  }
+
+  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+  // another we need to add 64-bit registers to the final MI.
+  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+    ImplicitOp = Src;
+    ImplicitOp.setImplicit();
+
+    NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
+    isKill = Src.isKill();
+    isUndef = Src.isUndef();
+  } else {
+    // Virtual register of the wrong class, we have to create a temporary 64-bit
+    // vreg to feed into the LEA.
+    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+    MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                                 get(TargetOpcode::COPY))
+        .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+        .addOperand(Src);
+
+    // Which is obviously going to be dead after we're done with it.
+    isKill = true;
+    isUndef = false;
+
+    if (LV)
+      LV->replaceKillInstruction(SrcReg, MI, *Copy);
+  }
+
+  // We've set all the parameters without issue.
+  return true;
+}
+
+/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
+/// LEA to form 3-address code by promoting to a 32-bit superregister and then
+/// truncating back down to a 16-bit subregister.
+MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
+    unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
+    LiveVariables *LV) const {
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  unsigned Dest = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+  bool isDead = MI.getOperand(0).isDead();
+  bool isKill = MI.getOperand(1).isKill();
+
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+  unsigned Opc, leaInReg;
+  if (Subtarget.is64Bit()) {
+    Opc = X86::LEA64_32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+  } else {
+    Opc = X86::LEA32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  }
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // well be shifting and then extracting the lower 16-bits.
+  // This has the potential to cause partial register stall. e.g.
+  //   movw    (%rbp,%rcx,2), %dx
+  //   leal    -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+          .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+          .addReg(Src, getKillRegState(isKill));
+
+  MachineInstrBuilder MIB =
+      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default: llvm_unreachable("Unreachable!");
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI.getOperand(2).getImm();
+    MIB.addReg(0).addImm(1ULL << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+    break;
+  }
+  case X86::INC16r:
+    addRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+    addRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    unsigned Src2 = MI.getOperand(2).getReg();
+    bool isKill2 = MI.getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = nullptr;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      if (Subtarget.is64Bit())
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+      else
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // well be shifting and then extracting the lower 16-bits.
+      BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+      InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
+                   .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+                   .addReg(Src2, getKillRegState(isKill2));
+      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+    }
+    if (LV && isKill2 && InsMI2)
+      LV->replaceKillInstruction(Src2, MI, *InsMI2);
+    break;
+  }
+  }
+
+  MachineInstr *NewMI = MIB;
+  MachineInstr *ExtMI =
+      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+          .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+          .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+  if (LV) {
+    // Update live variables
+    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, *InsMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, *ExtMI);
+  }
+
+  return ExtMI;
+}
+
+/// This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineInstr &MI, LiveVariables *LV) const {
+  // The following opcodes also sets the condition code register(s). Only
+  // convert them to equivalent lea if the condition code register def's
+  // are dead!
+  if (hasLiveCondCodeDef(MI))
+    return nullptr;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // All instructions input are two-addr instructions.  Get the known operands.
+  const MachineOperand &Dest = MI.getOperand(0);
+  const MachineOperand &Src = MI.getOperand(1);
+
+  MachineInstr *NewMI = nullptr;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
+  bool DisableLEA16 = true;
+  bool is64Bit = Subtarget.is64Bit();
+
+  unsigned MIOpc = MI.getOpcode();
+  switch (MIOpc) {
+  default: return nullptr;
+  case X86::SHL64ri: {
+    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    // LEA can't handle RSP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
+      return nullptr;
+
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+                .addOperand(Dest)
+                .addReg(0)
+                .addImm(1ULL << ShAmt)
+                .addOperand(Src)
+                .addImm(0)
+                .addReg(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+    // LEA can't handle ESP.
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+      return nullptr;
+
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MI.getDebugLoc(), get(Opc))
+            .addOperand(Dest)
+            .addReg(0)
+            .addImm(1ULL << ShAmt)
+            .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+            .addImm(0)
+            .addReg(0);
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+    NewMI = MIB;
+
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
+
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+                     : nullptr;
+    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+                .addOperand(Dest)
+                .addReg(0)
+                .addImm(1ULL << ShAmt)
+                .addOperand(Src)
+                .addImm(0)
+                .addReg(0);
+    break;
+  }
+  case X86::INC64r:
+  case X86::INC32r: {
+    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+      return nullptr;
+
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MI.getDebugLoc(), get(Opc))
+            .addOperand(Dest)
+            .addReg(SrcReg,
+                    getKillRegState(isKill) | getUndefRegState(isUndef));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+
+    NewMI = addOffset(MIB, 1);
+    break;
+  }
+  case X86::INC16r:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+                     : nullptr;
+    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+                          .addOperand(Dest)
+                          .addOperand(Src),
+                      1);
+    break;
+  case X86::DEC64r:
+  case X86::DEC32r: {
+    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+      return nullptr;
+
+    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                  .addOperand(Dest)
+                                  .addReg(SrcReg, getUndefRegState(isUndef) |
+                                                      getKillRegState(isKill));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+
+    NewMI = addOffset(MIB, -1);
+
+    break;
+  }
+  case X86::DEC16r:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+                     : nullptr;
+    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+                          .addOperand(Dest)
+                          .addOperand(Src),
+                      -1);
+    break;
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB: {
+    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Opc;
+    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
+      Opc = X86::LEA64r;
+    else
+      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+      return nullptr;
+
+    const MachineOperand &Src2 = MI.getOperand(2);
+    bool isKill2, isUndef2;
+    unsigned SrcReg2;
+    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+                        SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
+      return nullptr;
+
+    MachineInstrBuilder MIB =
+        BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+    if (ImplicitOp2.getReg() != 0)
+      MIB.addOperand(ImplicitOp2);
+
+    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
+
+    // Preserve undefness of the operands.
+    NewMI->getOperand(1).setIsUndef(isUndef);
+    NewMI->getOperand(3).setIsUndef(isUndef2);
+
+    if (LV && Src2.isKill())
+      LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
+    break;
+  }
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+                     : nullptr;
+    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Src2 = MI.getOperand(2).getReg();
+    bool isKill2 = MI.getOperand(2).isKill();
+    NewMI = addRegReg(
+        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
+        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+    // Preserve undefness of the operands.
+    bool isUndef = MI.getOperand(1).isUndef();
+    bool isUndef2 = MI.getOperand(2).isUndef();
+    NewMI->getOperand(1).setIsUndef(isUndef);
+    NewMI->getOperand(3).setIsUndef(isUndef2);
+
+    if (LV && isKill2)
+      LV->replaceKillInstruction(Src2, MI, *NewMI);
+    break;
+  }
+  case X86::ADD64ri32:
+  case X86::ADD64ri8:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8_DB:
+    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+                          .addOperand(Dest)
+                          .addOperand(Src),
+                      MI.getOperand(2));
+    break;
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri_DB:
+  case X86::ADD32ri8_DB: {
+    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                        SrcReg, isKill, isUndef, ImplicitOp, LV))
+      return nullptr;
+
+    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                  .addOperand(Dest)
+                                  .addReg(SrcReg, getUndefRegState(isUndef) |
+                                                      getKillRegState(isKill));
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+
+    NewMI = addOffset(MIB, MI.getOperand(2));
+    break;
+  }
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
+                     : nullptr;
+    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
+                          .addOperand(Dest)
+                          .addOperand(Src),
+                      MI.getOperand(2));
+    break;
+  }
+
+  if (!NewMI) return nullptr;
+
+  if (LV) {  // Update live variables
+    if (Src.isKill())
+      LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
+    if (Dest.isDead())
+      LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
+  }
+
+  MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
+  return NewMI;
+}
+
+/// This determines which of three possible cases of a three source commute
+/// the source indexes correspond to taking into account any mask operands.
+/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
+/// possible.
+/// Case 0 - Possible to commute the first and second operands.
+/// Case 1 - Possible to commute the first and third operands.
+/// Case 2 - Possible to commute the second and third operands.
+static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
+                                  unsigned SrcOpIdx2) {
+  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+  if (SrcOpIdx1 > SrcOpIdx2)
+    std::swap(SrcOpIdx1, SrcOpIdx2);
+
+  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
+  if (X86II::isKMasked(TSFlags)) {
+    // The k-mask operand cannot be commuted.
+    if (SrcOpIdx1 == 2)
+      return -1;
+
+    // For k-zero-masked operations it is Ok to commute the first vector
+    // operand.
+    // For regular k-masked operations a conservative choice is done as the
+    // elements of the first vector operand, for which the corresponding bit
+    // in the k-mask operand is set to 0, are copied to the result of the
+    // instruction.
+    // TODO/FIXME: The commute still may be legal if it is known that the
+    // k-mask operand is set to either all ones or all zeroes.
+    // It is also Ok to commute the 1st operand if all users of MI use only
+    // the elements enabled by the k-mask operand. For example,
+    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+    //                                                     : v1[i];
+    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+    //                                  // Ok, to commute v1 in FMADD213PSZrk.
+    if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
+      return -1;
+    Op2++;
+    Op3++;
+  }
+
+  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
+    return 0;
+  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
+    return 1;
+  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
+    return 2;
+  return -1;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
+
+  unsigned Opc = MI.getOpcode();
+
+  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+  if (SrcOpIdx1 > SrcOpIdx2)
+    std::swap(SrcOpIdx1, SrcOpIdx2);
+
+  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+  // analysis. The commute optimization is legal only if all users of FMA*_Int
+  // use only the lowest element of the FMA*_Int instruction. Such analysis are
+  // not implemented yet. So, just return 0 in that case.
+  // When such analysis are available this place will be the right place for
+  // calling it.
+  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
+    return 0;
+
+  // Determine which case this commute is or if it can't be done.
+  int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
+  if (Case < 0)
+    return 0;
+
+  // Define the FMA forms mapping array that helps to map input FMA form
+  // to output FMA form to preserve the operation semantics after
+  // commuting the operands.
+  const unsigned Form132Index = 0;
+  const unsigned Form213Index = 1;
+  const unsigned Form231Index = 2;
+  static const unsigned FormMapping[][3] = {
+    // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+    // FMA132 A, C, b; ==> FMA231 C, A, b;
+    // FMA213 B, A, c; ==> FMA213 A, B, c;
+    // FMA231 C, A, b; ==> FMA132 A, C, b;
+    { Form231Index, Form213Index, Form132Index },
+    // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+    // FMA132 A, c, B; ==> FMA132 B, c, A;
+    // FMA213 B, a, C; ==> FMA231 C, a, B;
+    // FMA231 C, a, B; ==> FMA213 B, a, C;
+    { Form132Index, Form231Index, Form213Index },
+    // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+    // FMA132 a, C, B; ==> FMA213 a, B, C;
+    // FMA213 b, A, C; ==> FMA132 b, C, A;
+    // FMA231 c, A, B; ==> FMA231 c, B, A;
+    { Form213Index, Form132Index, Form231Index }
+  };
+
+  unsigned FMAForms[3];
+  if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+    FMAForms[0] = FMA3Group.getReg132Opcode();
+    FMAForms[1] = FMA3Group.getReg213Opcode();
+    FMAForms[2] = FMA3Group.getReg231Opcode();
+  } else {
+    FMAForms[0] = FMA3Group.getMem132Opcode();
+    FMAForms[1] = FMA3Group.getMem213Opcode();
+    FMAForms[2] = FMA3Group.getMem231Opcode();
+  }
+  unsigned FormIndex;
+  for (FormIndex = 0; FormIndex < 3; FormIndex++)
+    if (Opc == FMAForms[FormIndex])
+      break;
+
+  // Everything is ready, just adjust the FMA opcode and return it.
+  FormIndex = FormMapping[Case][FormIndex];
+  return FMAForms[FormIndex];
+}
+
+static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
+                             unsigned SrcOpIdx2) {
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+
+  // Determine which case this commute is or if it can't be done.
+  int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
+  if (Case < 0)
+    return false;
+
+  // For each case we need to swap two pairs of bits in the final immediate.
+  static const uint8_t SwapMasks[3][4] = {
+    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
+    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
+    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
+  };
+
+  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
+  // Clear out the bits we are swapping.
+  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
+                           SwapMasks[Case][2] | SwapMasks[Case][3]);
+  // If the immediate had a bit of the pair set, then set the opposite bit.
+  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
+  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
+  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
+  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
+  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
+
+  return true;
+}
+
+// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be
+// commuted.
+static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
+#define VPERM_CASES(Suffix) \
+  case X86::VPERMI2##Suffix##128rr:    case X86::VPERMT2##Suffix##128rr:    \
+  case X86::VPERMI2##Suffix##256rr:    case X86::VPERMT2##Suffix##256rr:    \
+  case X86::VPERMI2##Suffix##rr:       case X86::VPERMT2##Suffix##rr:       \
+  case X86::VPERMI2##Suffix##128rm:    case X86::VPERMT2##Suffix##128rm:    \
+  case X86::VPERMI2##Suffix##256rm:    case X86::VPERMT2##Suffix##256rm:    \
+  case X86::VPERMI2##Suffix##rm:       case X86::VPERMT2##Suffix##rm:       \
+  case X86::VPERMI2##Suffix##128rrkz:  case X86::VPERMT2##Suffix##128rrkz:  \
+  case X86::VPERMI2##Suffix##256rrkz:  case X86::VPERMT2##Suffix##256rrkz:  \
+  case X86::VPERMI2##Suffix##rrkz:     case X86::VPERMT2##Suffix##rrkz:     \
+  case X86::VPERMI2##Suffix##128rmkz:  case X86::VPERMT2##Suffix##128rmkz:  \
+  case X86::VPERMI2##Suffix##256rmkz:  case X86::VPERMT2##Suffix##256rmkz:  \
+  case X86::VPERMI2##Suffix##rmkz:     case X86::VPERMT2##Suffix##rmkz:
+
+#define VPERM_CASES_BROADCAST(Suffix) \
+  VPERM_CASES(Suffix) \
+  case X86::VPERMI2##Suffix##128rmb:   case X86::VPERMT2##Suffix##128rmb:   \
+  case X86::VPERMI2##Suffix##256rmb:   case X86::VPERMT2##Suffix##256rmb:   \
+  case X86::VPERMI2##Suffix##rmb:      case X86::VPERMT2##Suffix##rmb:      \
+  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
+  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
+  case X86::VPERMI2##Suffix##rmbkz:    case X86::VPERMT2##Suffix##rmbkz:
+
+  switch (Opcode) {
+  default: return false;
+  VPERM_CASES(B)
+  VPERM_CASES_BROADCAST(D)
+  VPERM_CASES_BROADCAST(PD)
+  VPERM_CASES_BROADCAST(PS)
+  VPERM_CASES_BROADCAST(Q)
+  VPERM_CASES(W)
+    return true;
+  }
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
+// from the I opcod to the T opcode and vice versa.
+static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
+#define VPERM_CASES(Orig, New) \
+  case X86::Orig##128rr:    return X86::New##128rr;   \
+  case X86::Orig##128rrkz:  return X86::New##128rrkz; \
+  case X86::Orig##128rm:    return X86::New##128rm;   \
+  case X86::Orig##128rmkz:  return X86::New##128rmkz; \
+  case X86::Orig##256rr:    return X86::New##256rr;   \
+  case X86::Orig##256rrkz:  return X86::New##256rrkz; \
+  case X86::Orig##256rm:    return X86::New##256rm;   \
+  case X86::Orig##256rmkz:  return X86::New##256rmkz; \
+  case X86::Orig##rr:       return X86::New##rr;      \
+  case X86::Orig##rrkz:     return X86::New##rrkz;    \
+  case X86::Orig##rm:       return X86::New##rm;      \
+  case X86::Orig##rmkz:     return X86::New##rmkz;
+
+#define VPERM_CASES_BROADCAST(Orig, New) \
+  VPERM_CASES(Orig, New) \
+  case X86::Orig##128rmb:   return X86::New##128rmb;   \
+  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
+  case X86::Orig##256rmb:   return X86::New##256rmb;   \
+  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
+  case X86::Orig##rmb:      return X86::New##rmb;      \
+  case X86::Orig##rmbkz:    return X86::New##rmbkz;
+
+  switch (Opcode) {
+  VPERM_CASES(VPERMI2B, VPERMT2B)
+  VPERM_CASES_BROADCAST(VPERMI2D,  VPERMT2D)
+  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
+  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
+  VPERM_CASES_BROADCAST(VPERMI2Q,  VPERMT2Q)
+  VPERM_CASES(VPERMI2W, VPERMT2W)
+  VPERM_CASES(VPERMT2B, VPERMI2B)
+  VPERM_CASES_BROADCAST(VPERMT2D,  VPERMI2D)
+  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
+  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
+  VPERM_CASES_BROADCAST(VPERMT2Q,  VPERMI2Q)
+  VPERM_CASES(VPERMT2W, VPERMI2W)
+  }
+
+  llvm_unreachable("Unreachable!");
+#undef VPERM_CASES_BROADCAST
+#undef VPERM_CASES
+}
+
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                                   unsigned OpIdx1,
+                                                   unsigned OpIdx2) const {
+  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
+    if (NewMI)
+      return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
+    return MI;
+  };
+
+  switch (MI.getOpcode()) {
+  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+    unsigned Opc;
+    unsigned Size;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+    }
+    unsigned Amt = MI.getOperand(3).getImm();
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    WorkingMI.getOperand(3).setImm(Size - Amt);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::BLENDPDrri:
+  case X86::BLENDPSrri:
+  case X86::PBLENDWrri:
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPDYrri:
+  case X86::VBLENDPSYrri:
+  case X86::VPBLENDDrri:
+  case X86::VPBLENDWrri:
+  case X86::VPBLENDDYrri:
+  case X86::VPBLENDWYrri:{
+    unsigned Mask;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::BLENDPDrri:    Mask = 0x03; break;
+    case X86::BLENDPSrri:    Mask = 0x0F; break;
+    case X86::PBLENDWrri:    Mask = 0xFF; break;
+    case X86::VBLENDPDrri:   Mask = 0x03; break;
+    case X86::VBLENDPSrri:   Mask = 0x0F; break;
+    case X86::VBLENDPDYrri:  Mask = 0x0F; break;
+    case X86::VBLENDPSYrri:  Mask = 0xFF; break;
+    case X86::VPBLENDDrri:   Mask = 0x0F; break;
+    case X86::VPBLENDWrri:   Mask = 0xFF; break;
+    case X86::VPBLENDDYrri:  Mask = 0xFF; break;
+    case X86::VPBLENDWYrri:  Mask = 0xFF; break;
+    }
+    // Only the least significant bits of Imm are used.
+    unsigned Imm = MI.getOperand(3).getImm() & Mask;
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.getOperand(3).setImm(Mask ^ Imm);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::MOVSDrr:
+  case X86::MOVSSrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr:{
+    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+    if (!Subtarget.hasSSE41())
+      return nullptr;
+
+    unsigned Mask, Opc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSDrr:  Opc = X86::BLENDPDrri;  Mask = 0x02; break;
+    case X86::MOVSSrr:  Opc = X86::BLENDPSrri;  Mask = 0x0E; break;
+    case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+    case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+    }
+
+    // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
+    // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
+    auto &MRI = MI.getParent()->getParent()->getRegInfo();
+    auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
+    unsigned VR128 = MRI.createVirtualRegister(VR128RC);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
+            VR128)
+        .addReg(MI.getOperand(2).getReg());
+
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    WorkingMI.getOperand(2).setReg(VR128);
+    WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::PCLMULQDQrr:
+  case X86::VPCLMULQDQrr:{
+    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
+    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
+    unsigned Imm = MI.getOperand(3).getImm();
+    unsigned Src1Hi = Imm & 0x01;
+    unsigned Src2Hi = Imm & 0x10;
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::CMPSDrr:
+  case X86::CMPSSrr:
+  case X86::CMPPDrri:
+  case X86::CMPPSrri:
+  case X86::VCMPSDrr:
+  case X86::VCMPSSrr:
+  case X86::VCMPPDrri:
+  case X86::VCMPPSrri:
+  case X86::VCMPPDYrri:
+  case X86::VCMPPSYrri:
+  case X86::VCMPSDZrr:
+  case X86::VCMPSSZrr:
+  case X86::VCMPPDZrri:
+  case X86::VCMPPSZrri:
+  case X86::VCMPPDZ128rri:
+  case X86::VCMPPSZ128rri:
+  case X86::VCMPPDZ256rri:
+  case X86::VCMPPSZ256rri: {
+    // Float comparison can be safely commuted for
+    // Ordered/Unordered/Equal/NotEqual tests
+    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    case 0x00: // EQUAL
+    case 0x03: // UNORDERED
+    case 0x04: // NOT EQUAL
+    case 0x07: // ORDERED
+      return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+    default:
+      return nullptr;
+    }
+  }
+  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+  case X86::VPCMPBZrri:    case X86::VPCMPUBZrri:
+  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+  case X86::VPCMPDZrri:    case X86::VPCMPUDZrri:
+  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+  case X86::VPCMPQZrri:    case X86::VPCMPUQZrri:
+  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+  case X86::VPCMPWZrri:    case X86::VPCMPUWZrri: {
+    // Flip comparison mode immediate (if necessary).
+    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    default: llvm_unreachable("Unreachable!");
+    case 0x01: Imm = 0x06; break; // LT  -> NLE
+    case 0x02: Imm = 0x05; break; // LE  -> NLT
+    case 0x05: Imm = 0x02; break; // NLT -> LE
+    case 0x06: Imm = 0x01; break; // NLE -> LT
+    case 0x00: // EQ
+    case 0x03: // FALSE
+    case 0x04: // NE
+    case 0x07: // TRUE
+      break;
+    }
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.getOperand(3).setImm(Imm);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::VPCOMBri: case X86::VPCOMUBri:
+  case X86::VPCOMDri: case X86::VPCOMUDri:
+  case X86::VPCOMQri: case X86::VPCOMUQri:
+  case X86::VPCOMWri: case X86::VPCOMUWri: {
+    // Flip comparison mode immediate (if necessary).
+    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    default: llvm_unreachable("Unreachable!");
+    case 0x00: Imm = 0x02; break; // LT -> GT
+    case 0x01: Imm = 0x03; break; // LE -> GE
+    case 0x02: Imm = 0x00; break; // GT -> LT
+    case 0x03: Imm = 0x01; break; // GE -> LE
+    case 0x04: // EQ
+    case 0x05: // NE
+    case 0x06: // FALSE
+    case 0x07: // TRUE
+      break;
+    }
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.getOperand(3).setImm(Imm);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr: {
+    // Flip permute source immediate.
+    // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
+    // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
+    unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::MOVHLPSrr:
+  case X86::UNPCKHPDrr: {
+    if (!Subtarget.hasSSE2())
+      return nullptr;
+
+    unsigned Opc = MI.getOpcode();
+    switch (Opc) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
+      case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+    }
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
+  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
+  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
+  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
+  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
+  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
+  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
+  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
+  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+    unsigned Opc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
+    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
+    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
+    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
+    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
+    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
+    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
+    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
+    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
+    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
+    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
+    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
+    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
+    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
+    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
+    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
+    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
+    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
+    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
+    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
+    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
+    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
+    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
+    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
+    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+    }
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
+  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
+  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
+  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
+  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
+  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
+  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
+  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
+  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
+  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
+  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
+  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
+  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
+  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
+  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+    auto &WorkingMI = cloneIfNew(MI);
+    if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
+      return nullptr;
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
+  default: {
+    if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
+      unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
+      auto &WorkingMI = cloneIfNew(MI);
+      WorkingMI.setDesc(get(Opc));
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                     OpIdx1, OpIdx2);
+    }
+
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group) {
+      unsigned Opc =
+        getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
+      if (Opc == 0)
+        return nullptr;
+      auto &WorkingMI = cloneIfNew(MI);
+      WorkingMI.setDesc(get(Opc));
+      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                     OpIdx1, OpIdx2);
+    }
+
+    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+  }
+  }
+}
+
+bool X86InstrInfo::findFMA3CommutedOpIndices(
+    const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
+
+  if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
+    return false;
+
+  // Check if we can adjust the opcode to preserve the semantics when
+  // commute the register operands.
+  return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
+}
+
+bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+                                                 unsigned &SrcOpIdx1,
+                                                 unsigned &SrcOpIdx2) const {
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+
+  unsigned FirstCommutableVecOp = 1;
+  unsigned LastCommutableVecOp = 3;
+  unsigned KMaskOp = 0;
+  if (X86II::isKMasked(TSFlags)) {
+    // The k-mask operand has index = 2 for masked and zero-masked operations.
+    KMaskOp = 2;
+
+    // The operand with index = 1 is used as a source for those elements for
+    // which the corresponding bit in the k-mask is set to 0.
+    if (X86II::isKMergeMasked(TSFlags))
+      FirstCommutableVecOp = 3;
+
+    LastCommutableVecOp++;
+  }
+
+  if (isMem(MI, LastCommutableVecOp))
+    LastCommutableVecOp--;
+
+  // Only the first RegOpsNum operands are commutable.
+  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
+  // that the operand is not specified/fixed.
+  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+       SrcOpIdx1 == KMaskOp))
+    return false;
+  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
+      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+       SrcOpIdx2 == KMaskOp))
+    return false;
+
+  // Look for two different register operands assumed to be commutable
+  // regardless of the FMA opcode. The FMA opcode is adjusted later.
+  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+      SrcOpIdx2 == CommuteAnyOperandIndex) {
+    unsigned CommutableOpIdx1 = SrcOpIdx1;
+    unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+    // At least one of operands to be commuted is not specified and
+    // this method is free to choose appropriate commutable operands.
+    if (SrcOpIdx1 == SrcOpIdx2)
+      // Both of operands are not fixed. By default set one of commutable
+      // operands to the last register operand of the instruction.
+      CommutableOpIdx2 = LastCommutableVecOp;
+    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
+      // Only one of operands is not fixed.
+      CommutableOpIdx2 = SrcOpIdx1;
+
+    // CommutableOpIdx2 is well defined now. Let's choose another commutable
+    // operand and assign its index to CommutableOpIdx1.
+    unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+    for (CommutableOpIdx1 = LastCommutableVecOp;
+         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+      // Just ignore and skip the k-mask operand.
+      if (CommutableOpIdx1 == KMaskOp)
+        continue;
+
+      // The commuted operands must have different registers.
+      // Otherwise, the commute transformation does not change anything and
+      // is useless then.
+      if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
+        break;
+    }
+
+    // No appropriate commutable operands were found.
+    if (CommutableOpIdx1 < FirstCommutableVecOp)
+      return false;
+
+    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
+    // to return those values.
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                              CommutableOpIdx1, CommutableOpIdx2))
+      return false;
+  }
+
+  return true;
+}
+
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                                         unsigned &SrcOpIdx2) const {
+  const MCInstrDesc &Desc = MI.getDesc();
+  if (!Desc.isCommutable())
+    return false;
+
+  switch (MI.getOpcode()) {
+  case X86::CMPSDrr:
+  case X86::CMPSSrr:
+  case X86::CMPPDrri:
+  case X86::CMPPSrri:
+  case X86::VCMPSDrr:
+  case X86::VCMPSSrr:
+  case X86::VCMPPDrri:
+  case X86::VCMPPSrri:
+  case X86::VCMPPDYrri:
+  case X86::VCMPPSYrri:
+  case X86::VCMPSDZrr:
+  case X86::VCMPSSZrr:
+  case X86::VCMPPDZrri:
+  case X86::VCMPPSZrri:
+  case X86::VCMPPDZ128rri:
+  case X86::VCMPPSZ128rri:
+  case X86::VCMPPDZ256rri:
+  case X86::VCMPPSZ256rri: {
+    // Float comparison can be safely commuted for
+    // Ordered/Unordered/Equal/NotEqual tests
+    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+    switch (Imm) {
+    case 0x00: // EQUAL
+    case 0x03: // UNORDERED
+    case 0x04: // NOT EQUAL
+    case 0x07: // ORDERED
+      // The indices of the commutable operands are 1 and 2.
+      // Assign them to the returned operand indices here.
+      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+    }
+    return false;
+  }
+  case X86::MOVSDrr:
+  case X86::MOVSSrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (Subtarget.hasSSE41())
+      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+    return false;
+  }
+  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
+  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
+  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
+  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
+  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
+  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
+  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
+  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
+  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
+  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
+  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
+  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
+  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
+  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
+  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
+  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
+  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
+  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+    return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+  default:
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group)
+      return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
+
+    // Handled masked instructions since we need to skip over the mask input
+    // and the preserved input.
+    if (Desc.TSFlags & X86II::EVEX_K) {
+      // First assume that the first input is the mask operand and skip past it.
+      unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
+      unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
+      // Check if the first input is tied. If there isn't one then we only
+      // need to skip the mask operand which we did above.
+      if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
+                                             MCOI::TIED_TO) != -1)) {
+        // If this is zero masking instruction with a tied operand, we need to
+        // move the first index back to the first input since this must
+        // be a 3 input instruction and we want the first two non-mask inputs.
+        // Otherwise this is a 2 input instruction with a preserved input and
+        // mask, so we need to move the indices to skip one more input.
+        if (Desc.TSFlags & X86II::EVEX_Z)
+          --CommutableOpIdx1;
+        else {
+          ++CommutableOpIdx1;
+          ++CommutableOpIdx2;
+        }
+      }
+
+      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                                CommutableOpIdx1, CommutableOpIdx2))
+        return false;
+
+      if (!MI.getOperand(SrcOpIdx1).isReg() ||
+          !MI.getOperand(SrcOpIdx2).isReg())
+        // No idea.
+        return false;
+      return true;
+    }
+
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+  }
+  return false;
+}
+
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE_1:  return X86::COND_E;
+  case X86::JNE_1: return X86::COND_NE;
+  case X86::JL_1:  return X86::COND_L;
+  case X86::JLE_1: return X86::COND_LE;
+  case X86::JG_1:  return X86::COND_G;
+  case X86::JGE_1: return X86::COND_GE;
+  case X86::JB_1:  return X86::COND_B;
+  case X86::JBE_1: return X86::COND_BE;
+  case X86::JA_1:  return X86::COND_A;
+  case X86::JAE_1: return X86::COND_AE;
+  case X86::JS_1:  return X86::COND_S;
+  case X86::JNS_1: return X86::COND_NS;
+  case X86::JP_1:  return X86::COND_P;
+  case X86::JNP_1: return X86::COND_NP;
+  case X86::JO_1:  return X86::COND_O;
+  case X86::JNO_1: return X86::COND_NO;
+  }
+}
+
+/// Return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// Return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
+  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
+    return X86::COND_S;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::JE_1;
+  case X86::COND_NE: return X86::JNE_1;
+  case X86::COND_L:  return X86::JL_1;
+  case X86::COND_LE: return X86::JLE_1;
+  case X86::COND_G:  return X86::JG_1;
+  case X86::COND_GE: return X86::JGE_1;
+  case X86::COND_B:  return X86::JB_1;
+  case X86::COND_BE: return X86::JBE_1;
+  case X86::COND_A:  return X86::JA_1;
+  case X86::COND_AE: return X86::JAE_1;
+  case X86::COND_S:  return X86::JS_1;
+  case X86::COND_NS: return X86::JNS_1;
+  case X86::COND_P:  return X86::JP_1;
+  case X86::COND_NP: return X86::JNP_1;
+  case X86::COND_O:  return X86::JO_1;
+  case X86::COND_NO: return X86::JNO_1;
+  }
+}
+
+/// Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  case X86::COND_NE_OR_P:  return X86::COND_E_AND_NP;
+  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
+  }
+}
+
+/// Assuming the flags are set by MI(a,b), return the condition code if we
+/// modify the instructions such that flags are set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: return X86::COND_INVALID;
+  case X86::COND_E:  return X86::COND_E;
+  case X86::COND_NE: return X86::COND_NE;
+  case X86::COND_L:  return X86::COND_G;
+  case X86::COND_LE: return X86::COND_GE;
+  case X86::COND_G:  return X86::COND_L;
+  case X86::COND_GE: return X86::COND_LE;
+  case X86::COND_B:  return X86::COND_A;
+  case X86::COND_BE: return X86::COND_AE;
+  case X86::COND_A:  return X86::COND_B;
+  case X86::COND_AE: return X86::COND_BE;
+  }
+}
+
+/// Return a set opcode for the given condition and
+/// whether it has memory operand.
+unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
+  static const uint16_t Opc[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
+  return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
+                              bool HasMemoryOperand) {
+  static const uint16_t Opc[32][3] = {
+    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
+    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
+    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
+    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
+    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
+    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
+    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
+    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
+    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
+    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
+    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
+    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
+    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
+    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
+    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
+    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+  switch(RegBytes) {
+  default: llvm_unreachable("Illegal register size!");
+  case 2: return Opc[Idx][0];
+  case 4: return Opc[Idx][1];
+  case 8: return Opc[Idx][2];
+  }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+  if (!MI.isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI.isBranch() && !MI.isBarrier())
+    return true;
+  if (!MI.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool X86InstrInfo::canMakeTailCallConditional(
+    SmallVectorImpl<MachineOperand> &BranchCond,
+    const MachineInstr &TailCall) const {
+  if (TailCall.getOpcode() != X86::TCRETURNdi &&
+      TailCall.getOpcode() != X86::TCRETURNdi64) {
+    // Only direct calls can be done with a conditional branch.
+    return false;
+  }
+
+  if (Subtarget.isTargetWin64()) {
+    // Conditional tail calls confuse the Win64 unwinder.
+    // TODO: Allow them for "leaf" functions; PR30337.
+    return false;
+  }
+
+  assert(BranchCond.size() == 1);
+  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
+    // Can't make a conditional tail call with this condition.
+    return false;
+  }
+
+  const X86MachineFunctionInfo *X86FI =
+      TailCall.getParent()->getParent()->getInfo<X86MachineFunctionInfo>();
+  if (X86FI->getTCReturnAddrDelta() != 0 ||
+      TailCall.getOperand(1).getImm() != 0) {
+    // A conditional tail call cannot do any stack adjustment.
+    return false;
+  }
+
+  return true;
+}
+
+void X86InstrInfo::replaceBranchWithTailCall(
+    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
+    const MachineInstr &TailCall) const {
+  assert(canMakeTailCallConditional(BranchCond, TailCall));
+
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (!I->isBranch())
+      assert(0 && "Can't find the branch to replace!");
+
+    X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
+    assert(BranchCond.size() == 1);
+    if (CC != BranchCond[0].getImm())
+      continue;
+
+    break;
+  }
+
+  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
+                                                         : X86::TCRETURNdi64cc;
+
+  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
+  MIB->addOperand(TailCall.getOperand(0)); // Destination.
+  MIB.addImm(0); // Stack offset (not used).
+  MIB->addOperand(BranchCond[0]); // Condition.
+  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+
+  I->eraseFromParent();
+}
+
+// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
+// not be a fallthrough MBB now due to layout changes). Return nullptr if the
+// fallthrough MBB cannot be identified.
+static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
+                                            MachineBasicBlock *TBB) {
+  // Look for non-EHPad successors other than TBB. If we find exactly one, it
+  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
+  // and fallthrough MBB. If we find more than one, we cannot identify the
+  // fallthrough MBB and should return nullptr.
+  MachineBasicBlock *FallthroughBB = nullptr;
+  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+    if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
+      continue;
+    // Return a nullptr if we found more than one fallthrough successor.
+    if (FallthroughBB && FallthroughBB != TBB)
+      return nullptr;
+    FallthroughBB = *SI;
+  }
+  return FallthroughBB;
+}
+
+bool X86InstrInfo::AnalyzeBranchImpl(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond,
+    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
+    if (!isUnpredicatedTerminator(*I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == X86::JMP_1) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (std::next(I) != MBB.end())
+        std::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = nullptr;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = nullptr;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+    if (BranchCode == X86::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+        // If we can modify the code and it ends in something like:
+        //
+        //     jCC L1
+        //     jmp L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Then we can change this to:
+        //
+        //     jnCC L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Which is a bit more efficient.
+        // We conditionally jump to the fall-through block.
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        unsigned JNCC = GetCondBranchFromCond(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB());
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
+          .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        // Restart the analysis.
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      CondBranches.push_back(&*I);
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination and their condition
+    // opcodes fit one of the special multi-branch idioms.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // If the conditions are the same, we can leave them alone.
+    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+    auto NewTBB = I->getOperand(0).getMBB();
+    if (OldBranchCode == BranchCode && TBB == NewTBB)
+      continue;
+
+    // If they differ, see if they fit one of the known patterns. Theoretically,
+    // we could handle more patterns here, but we shouldn't expect to see them
+    // if instruction selection has done a reasonable job.
+    if (TBB == NewTBB &&
+               ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
+                (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
+      BranchCode = X86::COND_NE_OR_P;
+    } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
+               (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
+      if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
+        return true;
+
+      // X86::COND_E_AND_NP usually has two different branch destinations.
+      //
+      // JP B1
+      // JE B2
+      // JMP B1
+      // B1:
+      // B2:
+      //
+      // Here this condition branches to B2 only if NP && E. It has another
+      // equivalent form:
+      //
+      // JNE B1
+      // JNP B2
+      // JMP B1
+      // B1:
+      // B2:
+      //
+      // Similarly it branches to B2 only if E && NP. That is why this condition
+      // is named with COND_E_AND_NP.
+      BranchCode = X86::COND_E_AND_NP;
+    } else
+      return true;
+
+    // Update the MachineOperand.
+    Cond[0].setImm(BranchCode);
+    CondBranches.push_back(&*I);
+  }
+
+  return false;
+}
+
+bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  SmallVector<MachineInstr *, 4> CondBranches;
+  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
+                                          MachineBranchPredicate &MBP,
+                                          bool AllowModify) const {
+  using namespace std::placeholders;
+
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineInstr *, 4> CondBranches;
+  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+                        AllowModify))
+    return true;
+
+  if (Cond.size() != 1)
+    return true;
+
+  assert(MBP.TrueDest && "expected!");
+
+  if (!MBP.FalseDest)
+    MBP.FalseDest = MBB.getNextNode();
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  MachineInstr *ConditionDef = nullptr;
+  bool SingleUseCondition = true;
+
+  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+    if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+      ConditionDef = &*I;
+      break;
+    }
+
+    if (I->readsRegister(X86::EFLAGS, TRI))
+      SingleUseCondition = false;
+  }
+
+  if (!ConditionDef)
+    return true;
+
+  if (SingleUseCondition) {
+    for (auto *Succ : MBB.successors())
+      if (Succ->isLiveIn(X86::EFLAGS))
+        SingleUseCondition = false;
+  }
+
+  MBP.ConditionDef = ConditionDef;
+  MBP.SingleUseCondition = SingleUseCondition;
+
+  // Currently we only recognize the simple pattern:
+  //
+  //   test %reg, %reg
+  //   je %label
+  //
+  const unsigned TestOpcode =
+      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+  if (ConditionDef->getOpcode() == TestOpcode &&
+      ConditionDef->getNumOperands() == 3 &&
+      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+    MBP.LHS = ConditionDef->getOperand(0);
+    MBP.RHS = MachineOperand::CreateImm(0);
+    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+                        ? MachineBranchPredicate::PRED_NE
+                        : MachineBranchPredicate::PRED_EQ;
+    return false;
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
+                                    int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != X86::JMP_1 &&
+        getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    ArrayRef<MachineOperand> Cond,
+                                    const DebugLoc &DL,
+                                    int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
+    return 1;
+  }
+
+  // If FBB is null, it is implied to be a fall-through block.
+  bool FallThru = FBB == nullptr;
+
+  // Conditional branch.
+  unsigned Count = 0;
+  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  switch (CC) {
+  case X86::COND_NE_OR_P:
+    // Synthesize NE_OR_P with two branches.
+    BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
+    ++Count;
+    break;
+  case X86::COND_E_AND_NP:
+    // Use the next block of MBB as FBB if it is null.
+    if (FBB == nullptr) {
+      FBB = getFallThroughMBB(&MBB, TBB);
+      assert(FBB && "MBB cannot be the last block in function when the false "
+                    "body is a fall-through.");
+    }
+    // Synthesize COND_E_AND_NP with two branches.
+    BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
+    ++Count;
+    break;
+  default: {
+    unsigned Opc = GetCondBranchFromCond(CC);
+    BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+    ++Count;
+  }
+  }
+  if (!FallThru) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+                ArrayRef<MachineOperand> Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  // Not all subtargets have cmov instructions.
+  if (!Subtarget.hasCMov())
+    return false;
+  if (Cond.size() != 1)
+    return false;
+  // We cannot do the composite conditions, at least not in SSA form.
+  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
+  if (X86::GR16RegClass.hasSubClassEq(RC) ||
+      X86::GR32RegClass.hasSubClassEq(RC) ||
+      X86::GR64RegClass.hasSubClassEq(RC)) {
+    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+    // Bridge. Probably Ivy Bridge as well.
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I,
+                                const DebugLoc &DL, unsigned DstReg,
+                                ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                                unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  assert(Cond.size() == 1 && "Invalid Cond array");
+  unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+                                 MRI.getRegClass(DstReg)->getSize(),
+                                 false /*HasMemoryOperand*/);
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
+/// Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+  return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try and copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
+                                        const X86Subtarget &Subtarget) {
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+
+  // SrcReg(MaskReg) -> DestReg(GR64)
+  // SrcReg(MaskReg) -> DestReg(GR32)
+  // SrcReg(MaskReg) -> DestReg(GR16)
+  // SrcReg(MaskReg) -> DestReg(GR8)
+
+  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  if (X86::VK16RegClass.contains(SrcReg)) {
+    if (X86::GR64RegClass.contains(DestReg)) {
+      assert(Subtarget.hasBWI());
+      return X86::KMOVQrk;
+    }
+    if (X86::GR32RegClass.contains(DestReg))
+      return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
+    if (X86::GR16RegClass.contains(DestReg)) {
+      DestReg = getX86SubSuperRegister(DestReg, 32);
+      return X86::KMOVWrk;
+    }
+    if (X86::GR8RegClass.contains(DestReg)) {
+      DestReg = getX86SubSuperRegister(DestReg, 32);
+      return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
+    }
+  }
+
+  // SrcReg(GR64) -> DestReg(MaskReg)
+  // SrcReg(GR32) -> DestReg(MaskReg)
+  // SrcReg(GR16) -> DestReg(MaskReg)
+  // SrcReg(GR8)  -> DestReg(MaskReg)
+
+  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  if (X86::VK16RegClass.contains(DestReg)) {
+    if (X86::GR64RegClass.contains(SrcReg)) {
+      assert(Subtarget.hasBWI());
+      return X86::KMOVQkr;
+    }
+    if (X86::GR32RegClass.contains(SrcReg))
+      return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
+    if (X86::GR16RegClass.contains(SrcReg)) {
+      SrcReg = getX86SubSuperRegister(SrcReg, 32);
+      return X86::KMOVWkr;
+    }
+    if (X86::GR8RegClass.contains(SrcReg)) {
+      SrcReg = getX86SubSuperRegister(SrcReg, 32);
+      return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
+    }
+  }
+
+
+  // SrcReg(VR128) -> DestReg(GR64)
+  // SrcReg(VR64)  -> DestReg(GR64)
+  // SrcReg(GR64)  -> DestReg(VR128)
+  // SrcReg(GR64)  -> DestReg(VR64)
+
+  if (X86::GR64RegClass.contains(DestReg)) {
+    if (X86::VR128XRegClass.contains(SrcReg))
+      // Copy from a VR128 register to a GR64 register.
+      return HasAVX512 ? X86::VMOVPQIto64Zrr :
+             HasAVX    ? X86::VMOVPQIto64rr  :
+                         X86::MOVPQIto64rr;
+    if (X86::VR64RegClass.contains(SrcReg))
+      // Copy from a VR64 register to a GR64 register.
+      return X86::MMX_MOVD64from64rr;
+  } else if (X86::GR64RegClass.contains(SrcReg)) {
+    // Copy from a GR64 register to a VR128 register.
+    if (X86::VR128XRegClass.contains(DestReg))
+      return HasAVX512 ? X86::VMOV64toPQIZrr :
+             HasAVX    ? X86::VMOV64toPQIrr  :
+                         X86::MOV64toPQIrr;
+    // Copy from a GR64 register to a VR64 register.
+    if (X86::VR64RegClass.contains(DestReg))
+      return X86::MMX_MOVD64to64rr;
+  }
+
+  // SrcReg(FR32) -> DestReg(GR32)
+  // SrcReg(GR32) -> DestReg(FR32)
+
+  if (X86::GR32RegClass.contains(DestReg) &&
+      X86::FR32XRegClass.contains(SrcReg))
+    // Copy from a FR32 register to a GR32 register.
+    return HasAVX512 ? X86::VMOVSS2DIZrr :
+           HasAVX    ? X86::VMOVSS2DIrr  :
+                       X86::MOVSS2DIrr;
+
+  if (X86::FR32XRegClass.contains(DestReg) &&
+      X86::GR32RegClass.contains(SrcReg))
+    // Copy from a GR32 register to a FR32 register.
+    return HasAVX512 ? X86::VMOVDI2SSZrr :
+           HasAVX    ? X86::VMOVDI2SSrr  :
+                       X86::MOVDI2SSrr;
+  return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               const DebugLoc &DL, unsigned DestReg,
+                               unsigned SrcReg, bool KillSrc) const {
+  // First deal with the normal symmetric copies.
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasVLX = Subtarget.hasVLX();
+  unsigned Opc = 0;
+  if (X86::GR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV64rr;
+  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV32rr;
+  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV16rr;
+  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+        Subtarget.is64Bit()) {
+      Opc = X86::MOV8rr_NOREX;
+      // Both operands must be encodable without an REX prefix.
+      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+             "8-bit H register can not be copied outside GR8_NOREX");
+    } else
+      Opc = X86::MOV8rr;
+  }
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
+    if (HasVLX)
+      Opc = X86::VMOVAPSZ128rr;
+    else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+      Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+    else {
+      // If this an extended register and we don't have VLX we need to use a
+      // 512-bit move.
+      Opc = X86::VMOVAPSZrr;
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
+                                         &X86::VR512RegClass);
+      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
+                                        &X86::VR512RegClass);
+    }
+  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
+    if (HasVLX)
+      Opc = X86::VMOVAPSZ256rr;
+    else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+      Opc = X86::VMOVAPSYrr;
+    else {
+      // If this an extended register and we don't have VLX we need to use a
+      // 512-bit move.
+      Opc = X86::VMOVAPSZrr;
+      const TargetRegisterInfo *TRI = &getRegisterInfo();
+      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
+                                         &X86::VR512RegClass);
+      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
+                                        &X86::VR512RegClass);
+    }
+  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+    Opc = X86::VMOVAPSZrr;
+  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
+  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
+    Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
+  if (!Opc)
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
+
+  if (Opc) {
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  bool FromEFLAGS = SrcReg == X86::EFLAGS;
+  bool ToEFLAGS = DestReg == X86::EFLAGS;
+  int Reg = FromEFLAGS ? DestReg : SrcReg;
+  bool is32 = X86::GR32RegClass.contains(Reg);
+  bool is64 = X86::GR64RegClass.contains(Reg);
+
+  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+    int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+    int Pop = is64 ? X86::POP64r : X86::POP32r;
+    int PopF = is64 ? X86::POPF64 : X86::POPF32;
+    int AX = is64 ? X86::RAX : X86::EAX;
+
+    if (!Subtarget.hasLAHFSAHF()) {
+      assert(Subtarget.is64Bit() &&
+             "Not having LAHF/SAHF only happens on 64-bit.");
+      // Moving EFLAGS to / from another register requires a push and a pop.
+      // Notice that we have to adjust the stack if we don't want to clobber the
+      // first frame index. See X86FrameLowering.cpp - usesTheStack.
+      if (FromEFLAGS) {
+        BuildMI(MBB, MI, DL, get(PushF));
+        BuildMI(MBB, MI, DL, get(Pop), DestReg);
+      }
+      if (ToEFLAGS) {
+        BuildMI(MBB, MI, DL, get(Push))
+            .addReg(SrcReg, getKillRegState(KillSrc));
+        BuildMI(MBB, MI, DL, get(PopF));
+      }
+      return;
+    }
+
+    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+    // inefficient. Instead:
+    //   - Save the overflow flag OF into AL using SETO, and restore it using a
+    //     signed 8-bit addition of AL and INT8_MAX.
+    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+    //     using LAHF/SAHF.
+    //   - When RAX/EAX is live and isn't the destination register, make sure it
+    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+    //     the flags.
+    // This approach is ~2.25x faster than using PUSHF/POPF.
+    //
+    // This is still somewhat inefficient because we don't know which flags are
+    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+    //
+    // PUSHF/POPF is also potentially incorrect because it affects other flags
+    // such as TF/IF/DF, which LLVM doesn't model.
+    //
+    // Notice that we have to adjust the stack if we don't want to clobber the
+    // first frame index.
+    // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
+
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    MachineBasicBlock::LivenessQueryResult LQR =
+        MBB.computeRegisterLiveness(TRI, AX, MI);
+    // We do not want to save and restore AX if we do not have to.
+    // Moreover, if we do so whereas AX is dead, we would need to set
+    // an undef flag on the use of AX, otherwise the verifier will
+    // complain that we read an undef value.
+    // We do not want to change the behavior of the machine verifier
+    // as this is usually wrong to read an undef value.
+    if (MachineBasicBlock::LQR_Unknown == LQR) {
+      LivePhysRegs LPR(TRI);
+      LPR.addLiveOuts(MBB);
+      MachineBasicBlock::iterator I = MBB.end();
+      while (I != MI) {
+        --I;
+        LPR.stepBackward(*I);
+      }
+      // AX contains the top most register in the aliasing hierarchy.
+      // It may not be live, but one of its aliases may be.
+      for (MCRegAliasIterator AI(AX, TRI, true);
+           AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
+        LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
+                                : MachineBasicBlock::LQR_Dead;
+    }
+    bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+    if (FromEFLAGS) {
+      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+      BuildMI(MBB, MI, DL, get(X86::LAHF));
+      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
+    }
+    if (ToEFLAGS) {
+      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+          .addReg(X86::AL)
+          .addImm(INT8_MAX);
+      BuildMI(MBB, MI, DL, get(X86::SAHF));
+    }
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Pop), AX);
+    return;
+  }
+
+  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+               << " to " << RI.getName(DestReg) << '\n');
+  llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+                                      const TargetRegisterClass *RC,
+                                      bool isStackAligned,
+                                      const X86Subtarget &STI,
+                                      bool load) {
+  bool HasAVX = STI.hasAVX();
+  bool HasAVX512 = STI.hasAVX512();
+  bool HasVLX = STI.hasVLX();
+
+  switch (RC->getSize()) {
+  default:
+    llvm_unreachable("Unknown spill size");
+  case 1:
+    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+    if (STI.is64Bit())
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  case 2:
+    if (X86::VK16RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVWkm : X86::KMOVWmk;
+    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+    return load ? X86::MOV16rm : X86::MOV16mr;
+  case 4:
+    if (X86::GR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV32rm : X86::MOV32mr;
+    if (X86::FR32XRegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+        (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+    if (X86::RFP32RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+    if (X86::VK32RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVDkm : X86::KMOVDmk;
+    llvm_unreachable("Unknown 4-byte regclass");
+  case 8:
+    if (X86::GR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV64rm : X86::MOV64mr;
+    if (X86::FR64XRegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+        (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+    if (X86::VR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+    if (X86::RFP64RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+    if (X86::VK64RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVQkm : X86::KMOVQmk;
+    llvm_unreachable("Unknown 8-byte regclass");
+  case 10:
+    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+  case 16: {
+    assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ?
+        (HasVLX    ? X86::VMOVAPSZ128rm :
+         HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+         HasAVX    ? X86::VMOVAPSrm :
+                     X86::MOVAPSrm):
+        (HasVLX    ? X86::VMOVAPSZ128mr :
+         HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+         HasAVX    ? X86::VMOVAPSmr :
+                     X86::MOVAPSmr);
+    else
+      return load ?
+        (HasVLX    ? X86::VMOVUPSZ128rm :
+         HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+         HasAVX    ? X86::VMOVUPSrm :
+                     X86::MOVUPSrm):
+        (HasVLX    ? X86::VMOVUPSZ128mr :
+         HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+         HasAVX    ? X86::VMOVUPSmr :
+                     X86::MOVUPSmr);
+  }
+  case 32:
+    assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ?
+        (HasVLX    ? X86::VMOVAPSZ256rm :
+         HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+                     X86::VMOVAPSYrm) :
+        (HasVLX    ? X86::VMOVAPSZ256mr :
+         HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+                     X86::VMOVAPSYmr);
+    else
+      return load ?
+        (HasVLX    ? X86::VMOVUPSZ256rm :
+         HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+                     X86::VMOVUPSYrm) :
+        (HasVLX    ? X86::VMOVUPSZ256mr :
+         HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+                     X86::VMOVUPSYmr);
+  case 64:
+    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+    assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
+    if (isStackAligned)
+      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+    else
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+  }
+}
+
+bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
+                                         int64_t &Offset,
+                                         const TargetRegisterInfo *TRI) const {
+  const MCInstrDesc &Desc = MemOp.getDesc();
+  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
+  if (MemRefBegin < 0)
+    return false;
+
+  MemRefBegin += X86II::getOperandBias(Desc);
+
+  MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+  if (!BaseMO.isReg()) // Can be an MO_FrameIndex
+    return false;
+
+  BaseReg = BaseMO.getReg();
+  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+    return false;
+
+  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+      X86::NoRegister)
+    return false;
+
+  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
+
+  // Displacement can be symbolic
+  if (!DispMO.isImm())
+    return false;
+
+  Offset = DispMO.getImm();
+
+  return true;
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  const X86Subtarget &STI) {
+  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
+}
+
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+                                 const TargetRegisterClass *RC,
+                                 bool isStackAligned,
+                                 const X86Subtarget &STI) {
+  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, bool isKill, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
+         "Stack slot too small for store");
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+    .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+                                  bool isKill,
+                                  SmallVectorImpl<MachineOperand> &Addr,
+                                  const TargetRegisterClass *RC,
+                                  MachineInstr::mmo_iterator MMOBegin,
+                                  MachineInstr::mmo_iterator MMOEnd,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  MIB.addReg(SrcReg, getKillRegState(isKill));
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned =
+      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+      RI.canRealignStack(MF);
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 MachineInstr::mmo_iterator MMOBegin,
+                                 MachineInstr::mmo_iterator MMOEnd,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                                  unsigned &SrcReg2, int &CmpMask,
+                                  int &CmpValue) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    if (!MI.getOperand(1).isImm())
+      return false;
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI.getOperand(1).getImm();
+    return true;
+  // A SUB can be used to perform comparison.
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr:
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = MI.getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+    if (!MI.getOperand(2).isImm())
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI.getOperand(2).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI.getOperand(0).getReg();
+    SrcReg2 = MI.getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI.getOperand(0).getReg();
+    if (MI.getOperand(1).getReg() != SrcReg)
+      return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// Check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcRegs: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr &OI) {
+  if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
+       (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
+       (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
+       (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
+      ((OI.getOperand(1).getReg() == SrcReg &&
+        OI.getOperand(2).getReg() == SrcReg2) ||
+       (OI.getOperand(1).getReg() == SrcReg2 &&
+        OI.getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if (((FlagI.getOpcode() == X86::CMP64ri32 &&
+        OI.getOpcode() == X86::SUB64ri32) ||
+       (FlagI.getOpcode() == X86::CMP64ri8 &&
+        OI.getOpcode() == X86::SUB64ri8) ||
+       (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
+       (FlagI.getOpcode() == X86::CMP32ri8 &&
+        OI.getOpcode() == X86::SUB32ri8) ||
+       (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
+       (FlagI.getOpcode() == X86::CMP16ri8 &&
+        OI.getOpcode() == X86::SUB16ri8) ||
+       (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
+      OI.getOperand(1).getReg() == SrcReg &&
+      OI.getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// Check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return false;
+
+  // The shift instructions only modify ZF if their shift count is non-zero.
+  // N.B.: The processor truncates the shift count depending on the encoding.
+  case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
+  case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
+     return getTruncatedShiftCount(MI, 2) != 0;
+
+  // Some left shift instructions can be turned into LEA instructions but only
+  // if their flags aren't used. Avoid transforming such instructions.
+  case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+    return ShAmt != 0;
+  }
+
+  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+     return getTruncatedShiftCount(MI, 3) != 0;
+
+  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+  case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
+  case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
+  case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
+  case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
+  case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
+  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+  case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
+  case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
+  case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
+  case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
+  case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
+  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+  case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
+  case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
+  case X86::AND16rr:   case X86::AND8rr:   case X86::AND64rm:
+  case X86::AND32rm:   case X86::AND16rm:  case X86::AND8rm:
+  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+  case X86::XOR32ri8:  case X86::XOR16ri:  case X86::XOR16ri8:
+  case X86::XOR8ri:    case X86::XOR64rr:  case X86::XOR32rr:
+  case X86::XOR16rr:   case X86::XOR8rr:   case X86::XOR64rm:
+  case X86::XOR32rm:   case X86::XOR16rm:  case X86::XOR8rm:
+  case X86::OR64ri32:  case X86::OR64ri8:  case X86::OR32ri:
+  case X86::OR32ri8:   case X86::OR16ri:   case X86::OR16ri8:
+  case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
+  case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
+  case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
+  case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
+  case X86::SAR8r1:    case X86::SAR16r1:  case X86::SAR32r1:case X86::SAR64r1:
+  case X86::SHR8r1:    case X86::SHR16r1:  case X86::SHR32r1:case X86::SHR64r1:
+  case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
+  case X86::ADC32ri:   case X86::ADC32ri8:
+  case X86::ADC32rr:   case X86::ADC64ri32:
+  case X86::ADC64ri8:  case X86::ADC64rr:
+  case X86::SBB32ri:   case X86::SBB32ri8:
+  case X86::SBB32rr:   case X86::SBB64ri32:
+  case X86::SBB64ri8:  case X86::SBB64rr:
+  case X86::ANDN32rr:  case X86::ANDN32rm:
+  case X86::ANDN64rr:  case X86::ANDN64rm:
+  case X86::BEXTR32rr: case X86::BEXTR64rr:
+  case X86::BEXTR32rm: case X86::BEXTR64rm:
+  case X86::BLSI32rr:  case X86::BLSI32rm:
+  case X86::BLSI64rr:  case X86::BLSI64rm:
+  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+  case X86::BLSR32rr:  case X86::BLSR32rm:
+  case X86::BLSR64rr:  case X86::BLSR64rm:
+  case X86::BZHI32rr:  case X86::BZHI32rm:
+  case X86::BZHI64rr:  case X86::BZHI64rm:
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return true;
+  }
+}
+
+/// Check whether the use can be converted to remove a comparison against zero.
+static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return X86::COND_INVALID;
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+    return X86::COND_B;
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+    return X86::COND_E;
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return X86::COND_B;
+  }
+}
+
+/// Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                                        unsigned SrcReg2, int CmpMask,
+                                        int CmpValue,
+                                        const MachineRegisterInfo *MRI) const {
+  // Check whether we can replace SUB with CMP.
+  unsigned NewOpcode = 0;
+  switch (CmpInstr.getOpcode()) {
+  default: break;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr: {
+    if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+      return false;
+    // There is no use of the destination register, we can replace SUB with CMP.
+    switch (CmpInstr.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
+    case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
+    case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
+    case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
+    case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
+    case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
+    case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
+    case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
+    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+    case X86::SUB64ri8:  NewOpcode = X86::CMP64ri8;  break;
+    case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
+    case X86::SUB32ri8:  NewOpcode = X86::CMP32ri8;  break;
+    case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
+    case X86::SUB16ri8:  NewOpcode = X86::CMP16ri8;  break;
+    case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
+    }
+    CmpInstr.setDesc(get(NewOpcode));
+    CmpInstr.RemoveOperand(0);
+    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+      return false;
+  }
+  }
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // CmpInstr is the first instruction of the BB.
+  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+  // If we are comparing against zero, check whether we can use MI to update
+  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+  bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+  if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
+    return false;
+
+  // If we have a use of the source register between the def and our compare
+  // instruction we can eliminate the compare iff the use sets EFLAGS in the
+  // right way.
+  bool ShouldUpdateCC = false;
+  X86::CondCode NewCC = X86::COND_INVALID;
+  if (IsCmpZero && !isDefConvertible(*MI)) {
+    // Scan forward from the use until we hit the use we're looking for or the
+    // compare instruction.
+    for (MachineBasicBlock::iterator J = MI;; ++J) {
+      // Do we have a convertible instruction?
+      NewCC = isUseDefConvertible(*J);
+      if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+          J->getOperand(1).getReg() == SrcReg) {
+        assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+        ShouldUpdateCC = true; // Update CC later on.
+        // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+        // with the new def.
+        Def = J;
+        MI = &*Def;
+        break;
+      }
+
+      if (J == I)
+        return false;
+    }
+  }
+
+  // We are searching for an earlier instruction that can make CmpInstr
+  // redundant and that instruction will be saved in Sub.
+  MachineInstr *Sub = nullptr;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  // We iterate backward, starting from the instruction before CmpInstr and
+  // stop when reaching the definition of a source register or done with the BB.
+  // RI points to the instruction before CmpInstr.
+  // If the definition is in this basic block, RE points to the definition;
+  // otherwise, RE is the rend of the basic block.
+  MachineBasicBlock::reverse_iterator
+      RI = ++I.getReverse(),
+      RE = CmpInstr.getParent() == MI->getParent()
+               ? Def.getReverse() /* points to MI */
+               : CmpInstr.getParent()->rend();
+  MachineInstr *Movr0Inst = nullptr;
+  for (; RI != RE; ++RI) {
+    MachineInstr &Instr = *RI;
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (!IsCmpZero &&
+        isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+      Sub = &Instr;
+      break;
+    }
+
+    if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
+        Instr.readsRegister(X86::EFLAGS, TRI)) {
+      // This instruction modifies or uses EFLAGS.
+
+      // MOV32r0 etc. are implemented with xor which clobbers condition code.
+      // They are safe to move up, if the definition to EFLAGS is dead and
+      // earlier instructions do not read or write EFLAGS.
+      if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
+          Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
+        Movr0Inst = &Instr;
+        continue;
+      }
+
+      // We can't remove CmpInstr.
+      return false;
+    }
+  }
+
+  // Return false if no candidates exist.
+  if (!IsCmpZero && !Sub)
+    return false;
+
+  bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+                    Sub->getOperand(2).getReg() == SrcReg);
+
+  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+  // If we are done with the basic block, we need to check whether EFLAGS is
+  // live-out.
+  bool IsSafe = false;
+  SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+  MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
+  for (++I; I != E; ++I) {
+    const MachineInstr &Instr = *I;
+    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+    // We should check the usage if this instruction uses and updates EFLAGS.
+    if (!UseEFLAGS && ModifyEFLAGS) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again.
+      IsSafe = true;
+      break;
+    }
+    if (!UseEFLAGS && !ModifyEFLAGS)
+      continue;
+
+    // EFLAGS is used by this instruction.
+    X86::CondCode OldCC = X86::COND_INVALID;
+    bool OpcIsSET = false;
+    if (IsCmpZero || IsSwapped) {
+      // We decode the condition code from opcode.
+      if (Instr.isBranch())
+        OldCC = getCondFromBranchOpc(Instr.getOpcode());
+      else {
+        OldCC = getCondFromSETOpc(Instr.getOpcode());
+        if (OldCC != X86::COND_INVALID)
+          OpcIsSET = true;
+        else
+          OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+      }
+      if (OldCC == X86::COND_INVALID) return false;
+    }
+    if (IsCmpZero) {
+      switch (OldCC) {
+      default: break;
+      case X86::COND_A: case X86::COND_AE:
+      case X86::COND_B: case X86::COND_BE:
+      case X86::COND_G: case X86::COND_GE:
+      case X86::COND_L: case X86::COND_LE:
+      case X86::COND_O: case X86::COND_NO:
+        // CF and OF are used, we can't perform this optimization.
+        return false;
+      }
+
+      // If we're updating the condition code check if we have to reverse the
+      // condition.
+      if (ShouldUpdateCC)
+        switch (OldCC) {
+        default:
+          return false;
+        case X86::COND_E:
+          break;
+        case X86::COND_NE:
+          NewCC = GetOppositeBranchCondition(NewCC);
+          break;
+        }
+    } else if (IsSwapped) {
+      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+      // We swap the condition code and synthesize the new opcode.
+      NewCC = getSwappedCondition(OldCC);
+      if (NewCC == X86::COND_INVALID) return false;
+    }
+
+    if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
+      // Synthesize the new opcode.
+      bool HasMemoryOperand = Instr.hasOneMemOperand();
+      unsigned NewOpc;
+      if (Instr.isBranch())
+        NewOpc = GetCondBranchFromCond(NewCC);
+      else if(OpcIsSET)
+        NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+      else {
+        unsigned DstReg = Instr.getOperand(0).getReg();
+        NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+                                 HasMemoryOperand);
+      }
+
+      // Push the MachineInstr to OpsToUpdate.
+      // If it is safe to remove CmpInstr, the condition code of these
+      // instructions will be modified.
+      OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+    }
+    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+      IsSafe = true;
+      break;
+    }
+  }
+
+  // If EFLAGS is not killed nor re-defined, we should check whether it is
+  // live-out. If it is live-out, do not optimize.
+  if ((IsCmpZero || IsSwapped) && !IsSafe) {
+    MachineBasicBlock *MBB = CmpInstr.getParent();
+    for (MachineBasicBlock *Successor : MBB->successors())
+      if (Successor->isLiveIn(X86::EFLAGS))
+        return false;
+  }
+
+  // The instruction to be updated is either Sub or MI.
+  Sub = IsCmpZero ? MI : Sub;
+  // Move Movr0Inst to the appropriate place before Sub.
+  if (Movr0Inst) {
+    // Look backwards until we find a def that doesn't use the current EFLAGS.
+    Def = Sub;
+    MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
+                                        InsertE = Sub->getParent()->rend();
+    for (; InsertI != InsertE; ++InsertI) {
+      MachineInstr *Instr = &*InsertI;
+      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+        Sub->getParent()->remove(Movr0Inst);
+        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+                                   Movr0Inst);
+        break;
+      }
+    }
+    if (InsertI == InsertE)
+      return false;
+  }
+
+  // Make sure Sub instruction defines EFLAGS and mark the def live.
+  unsigned i = 0, e = Sub->getNumOperands();
+  for (; i != e; ++i) {
+    MachineOperand &MO = Sub->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+      MO.setIsDead(false);
+      break;
+    }
+  }
+  assert(i != e && "Unable to locate a def EFLAGS operand");
+
+  CmpInstr.eraseFromParent();
+
+  // Modify the condition code of instructions in OpsToUpdate.
+  for (auto &Op : OpsToUpdate)
+    Op.first->setDesc(get(Op.second));
+  return true;
+}
+
+/// Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instructions if load defines a virtual
+/// register, the virtual register is used once in the same BB, and the
+/// instructions in-between do not load or store, and have no side effects.
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                              const MachineRegisterInfo *MRI,
+                                              unsigned &FoldAsLoadDefReg,
+                                              MachineInstr *&DefMI) const {
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
+    return nullptr;
+
+  // Collect information about virtual register operands of MI.
+  SmallVector<unsigned, 1> SrcOperandIds;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg != FoldAsLoadDefReg)
+      continue;
+    // Do not fold if we have a subreg use or a def.
+    if (MO.getSubReg() || MO.isDef())
+      return nullptr;
+    SrcOperandIds.push_back(i);
+  }
+  if (SrcOperandIds.empty())
+    return nullptr;
+
+  // Check whether we can fold the def into SrcOperandId.
+  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
+    FoldAsLoadDefReg = 0;
+    return FoldMI;
+  }
+
+  return nullptr;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.
+/// This is used for mapping:
+///   %xmm4 = V_SET0
+/// to:
+///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+                             const MCInstrDesc &Desc) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  unsigned Reg = MIB->getOperand(0).getReg();
+  MIB->setDesc(Desc);
+
+  // MachineInstr::addOperand() will insert explicit operands before any
+  // implicit operands.
+  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  // But we don't trust that.
+  assert(MIB->getOperand(1).getReg() == Reg &&
+         MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+  return true;
+}
+
+/// Expand a single-def pseudo instruction to a two-addr
+/// instruction with two %k0 reads.
+/// This is used for mapping:
+///   %k4 = K_SET1
+/// to:
+///   %k4 = KXNORrr %k0, %k0
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
+                            const MCInstrDesc &Desc, unsigned Reg) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  MIB->setDesc(Desc);
+  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  return true;
+}
+
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+                          bool MinusOne) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  unsigned Reg = MIB->getOperand(0).getReg();
+
+  // Insert the XOR.
+  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
+      .addReg(Reg, RegState::Undef)
+      .addReg(Reg, RegState::Undef);
+
+  // Turn the pseudo into an INC or DEC.
+  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
+  MIB.addReg(Reg);
+
+  return true;
+}
+
+static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
+                               const TargetInstrInfo &TII,
+                               const X86Subtarget &Subtarget) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  int64_t Imm = MIB->getOperand(1).getImm();
+  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
+  MachineBasicBlock::iterator I = MIB.getInstr();
+
+  int StackAdjustment;
+
+  if (Subtarget.is64Bit()) {
+    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
+           MIB->getOpcode() == X86::MOV32ImmSExti8);
+
+    // Can't use push/pop lowering if the function might write to the red zone.
+    X86MachineFunctionInfo *X86FI =
+        MBB.getParent()->getInfo<X86MachineFunctionInfo>();
+    if (X86FI->getUsesRedZone()) {
+      MIB->setDesc(TII.get(MIB->getOpcode() ==
+                           X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
+      return true;
+    }
+
+    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
+    // widen the register if necessary.
+    StackAdjustment = 8;
+    BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
+    MIB->setDesc(TII.get(X86::POP64r));
+    MIB->getOperand(0)
+        .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
+  } else {
+    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
+    StackAdjustment = 4;
+    BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
+    MIB->setDesc(TII.get(X86::POP32r));
+  }
+
+  // Build CFI if necessary.
+  MachineFunction &MF = *MBB.getParent();
+  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
+  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+  bool NeedsDwarfCFI =
+      !IsWin64Prologue &&
+      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
+  if (EmitCFI) {
+    TFL->BuildCFI(MBB, I, DL,
+        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
+    TFL->BuildCFI(MBB, std::next(I), DL,
+        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
+  }
+
+  return true;
+}
+
+// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
+// code sequence is needed for other targets.
+static void expandLoadStackGuard(MachineInstrBuilder &MIB,
+                                 const TargetInstrInfo &TII) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  unsigned Reg = MIB->getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
+  auto Flags = MachineMemOperand::MOLoad |
+               MachineMemOperand::MODereferenceable |
+               MachineMemOperand::MOInvariant;
+  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
+  MachineBasicBlock::iterator I = MIB.getInstr();
+
+  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
+      .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
+      .addMemOperand(MMO);
+  MIB->setDebugLoc(DL);
+  MIB->setDesc(TII.get(X86::MOV64rm));
+  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register we need to use an instruction
+// that loads the lower 128/256-bit, but is available with only AVX512F.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+                            const TargetRegisterInfo *TRI,
+                            const MCInstrDesc &LoadDesc,
+                            const MCInstrDesc &BroadcastDesc,
+                            unsigned SubIdx) {
+  unsigned DestReg = MIB->getOperand(0).getReg();
+  // Check if DestReg is XMM16-31 or YMM16-31.
+  if (TRI->getEncodingValue(DestReg) < 16) {
+    // We can use a normal VEX encoded load.
+    MIB->setDesc(LoadDesc);
+  } else {
+    // Use a 128/256-bit VBROADCAST instruction.
+    MIB->setDesc(BroadcastDesc);
+    // Change the destination to a 512-bit register.
+    DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+    MIB->getOperand(0).setReg(DestReg);
+  }
+  return true;
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register we need to use an instruction
+// that stores the lower 128/256-bit, but is available with only AVX512F.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+                             const TargetRegisterInfo *TRI,
+                             const MCInstrDesc &StoreDesc,
+                             const MCInstrDesc &ExtractDesc,
+                             unsigned SubIdx) {
+  unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+  // Check if DestReg is XMM16-31 or YMM16-31.
+  if (TRI->getEncodingValue(SrcReg) < 16) {
+    // We can use a normal VEX encoded store.
+    MIB->setDesc(StoreDesc);
+  } else {
+    // Use a VEXTRACTF instruction.
+    MIB->setDesc(ExtractDesc);
+    // Change the destination to a 512-bit register.
+    SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+    MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+    MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+  }
+
+  return true;
+}
+bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  bool HasAVX = Subtarget.hasAVX();
+  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
+  switch (MI.getOpcode()) {
+  case X86::MOV32r0:
+    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV32r1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+  case X86::MOV32r_1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
+  case X86::MOV32ImmSExti8:
+  case X86::MOV64ImmSExti8:
+    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
+  case X86::SETB_C8r:
+    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+  case X86::SETB_C16r:
+    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+  case X86::SETB_C32r:
+    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+  case X86::SETB_C64r:
+    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+  case X86::V_SET0:
+  case X86::FsFLD0SS:
+  case X86::FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+  case X86::AVX_SET0:
+    assert(HasAVX && "AVX not supported");
+    return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::AVX512_128_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
+  case X86::AVX512_256_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
+  case X86::AVX512_512_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  case X86::AVX512_FsFLD0SS:
+  case X86::AVX512_FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
+  case X86::V_SETALLONES:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::AVX512_512_SETALLONES: {
+    unsigned Reg = MIB->getOperand(0).getReg();
+    MIB->setDesc(get(X86::VPTERNLOGDZrri));
+    // VPTERNLOGD needs 3 register inputs and an immediate.
+    // 0xff will return 1s for any input.
+    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
+       .addReg(Reg, RegState::Undef).addImm(0xff);
+    return true;
+  }
+  case X86::VMOVAPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVUPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVAPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+  case X86::VMOVUPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+  case X86::VMOVAPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVUPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVAPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+  case X86::VMOVUPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+  case X86::TEST8ri_NOREX:
+    MI.setDesc(get(X86::TEST8ri));
+    return true;
+  case X86::MOV32ri64:
+    MI.setDesc(get(X86::MOV32ri));
+    return true;
+
+  // KNL does not recognize dependency-breaking idioms for mask registers,
+  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
+  // Using %k0 as the undef input register is a performance heuristic based
+  // on the assumption that %k0 is used less frequently than the other mask
+  // registers, since it is not usable as a write mask.
+  // FIXME: A more advanced approach would be to choose the best input mask
+  // register based on context.
+  case X86::KSET0B:
+  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
+  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
+  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
+  case X86::KSET1B:
+  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
+  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
+  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
+  case TargetOpcode::LOAD_STACK_GUARD:
+    expandLoadStackGuard(MIB, *this);
+    return true;
+  }
+  return false;
+}
+
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
+  unsigned NumAddrOps = MOs.size();
+
+  if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether its zero or not).
+    for (unsigned i = 0; i != NumAddrOps; ++i)
+      MIB.addOperand(MOs[i]);
+    addOffset(MIB, PtrOffset);
+  } else {
+    // General Memory Addressing - we need to add any offset to an existing
+    // offset.
+    assert(MOs.size() == 5 && "Unexpected memory operand list length");
+    for (unsigned i = 0; i != NumAddrOps; ++i) {
+      const MachineOperand &MO = MOs[i];
+      if (i == 3 && PtrOffset != 0) {
+        MIB.addDisp(MO, PtrOffset);
+      } else {
+        MIB.addOperand(MO);
+      }
+    }
+  }
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+                                     ArrayRef<MachineOperand> MOs,
+                                     MachineBasicBlock::iterator InsertPt,
+                                     MachineInstr &MI,
+                                     const TargetInstrInfo &TII) {
+  // Create the base instruction with the memory operand as the first part.
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI =
+      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+  addOperands(MIB, MOs);
+
+  // Loop over the rest of the ri operands, converting them over.
+  unsigned NumOps = MI.getDesc().getNumOperands() - 2;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI.getOperand(i + 2);
+    MIB.addOperand(MO);
+  }
+  for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    MIB.addOperand(MO);
+  }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
+  return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
+                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
+                              MachineBasicBlock::iterator InsertPt,
+                              MachineInstr &MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI =
+      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI.getOperand(i);
+    if (i == OpNo) {
+      assert(MO.isReg() && "Expected to fold into reg operand!");
+      addOperands(MIB, MOs, PtrOffset);
+    } else {
+      MIB.addOperand(MO);
+    }
+  }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
+  return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+                                ArrayRef<MachineOperand> MOs,
+                                MachineBasicBlock::iterator InsertPt,
+                                MachineInstr &MI) {
+  MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+                                    MI.getDebugLoc(), TII.get(Opcode));
+  addOperands(MIB, MOs);
+  return MIB.addImm(0);
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  switch (MI.getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+  case X86::VINSERTPSZrr:
+    // Attempt to convert the load of inserted vector into a fold load
+    // of a single float.
+    if (OpNum == 2) {
+      unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
+      unsigned ZMask = Imm & 15;
+      unsigned DstIdx = (Imm >> 4) & 3;
+      unsigned SrcIdx = (Imm >> 6) & 3;
+
+      unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 4 <= Align) {
+        int PtrOffset = SrcIdx * 4;
+        unsigned NewImm = (DstIdx << 4) | ZMask;
+        unsigned NewOpCode =
+            (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
+            (MI.getOpcode() == X86::VINSERTPSrr)  ? X86::VINSERTPSrm  :
+                                                    X86::INSERTPSrm;
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+        return NewMI;
+      }
+    }
+    break;
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+  case X86::VMOVHLPSZrr:
+    // Move the upper 64-bits of the second operand to the lower 64-bits.
+    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
+    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
+    if (OpNum == 2) {
+      unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 8 <= Align) {
+        unsigned NewOpCode =
+            (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
+            (MI.getOpcode() == X86::VMOVHLPSrr)  ? X86::VMOVLPSrm     :
+                                                   X86::MOVLPSrm;
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
+        return NewMI;
+      }
+    }
+    break;
+  };
+
+  return nullptr;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align, bool AllowCommute) const {
+  const DenseMap<unsigned,
+                 std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
+  bool isCallRegIndirect = Subtarget.callRegIndirect();
+  bool isTwoAddrFold = false;
+
+  // For CPUs that favor the register form of a call or push,
+  // do not fold loads into calls or pushes, unless optimizing for size
+  // aggressively.
+  if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+      (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
+       MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
+       MI.getOpcode() == X86::PUSH64r))
+    return nullptr;
+
+  unsigned NumOps = MI.getDesc().getNumOperands();
+  bool isTwoAddr =
+      NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // FIXME: AsmPrinter doesn't know how to handle
+  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+  if (MI.getOpcode() == X86::ADD32ri &&
+      MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+    return nullptr;
+
+  MachineInstr *NewMI = nullptr;
+
+  // Attempt to fold any custom cases we have.
+  if (MachineInstr *CustomMI =
+          foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+    return CustomMI;
+
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
+      MI.getOperand(1).isReg() &&
+      MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+    isTwoAddrFold = true;
+  } else if (OpNum == 0) {
+    if (MI.getOpcode() == X86::MOV32r0) {
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
+      if (NewMI)
+        return NewMI;
+    }
+
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (OpNum == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (OpNum == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (OpNum == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  } else if (OpNum == 4) {
+    OpcodeTablePtr = &RegOp2MemOpTable4;
+  }
+
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    auto I = OpcodeTablePtr->find(MI.getOpcode());
+    if (I != OpcodeTablePtr->end()) {
+      unsigned Opcode = I->second.first;
+      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+      if (Align < MinAlign)
+        return nullptr;
+      bool NarrowToMOV32rm = false;
+      if (Size) {
+        unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+        if (Size < RCSize) {
+          // Check if it's safe to fold the load. If the size of the object is
+          // narrower than the load width, then it's not.
+          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+            return nullptr;
+          // If this is a 64-bit load, but the spill slot is 32, then we can do
+          // a 32-bit load which is implicitly zero-extended. This likely is
+          // due to live interval analysis remat'ing a load from stack slot.
+          if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+            return nullptr;
+          Opcode = X86::MOV32rm;
+          NarrowToMOV32rm = true;
+        }
+      }
+
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
+      else
+        NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
+
+      if (NarrowToMOV32rm) {
+        // If this is the special case where we use a MOV32rm to load a 32-bit
+        // value and zero-extend the top bits. Change the destination register
+        // to a 32-bit one.
+        unsigned DstReg = NewMI->getOperand(0).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
+        else
+          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+      }
+      return NewMI;
+    }
+  }
+
+  // If the instruction and target operand are commutable, commute the
+  // instruction and try again.
+  if (AllowCommute) {
+    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
+    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+      bool HasDef = MI.getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
+      bool Tied1 =
+          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied2 =
+          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands are tied to the destination
+      // then we can not commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied1) ||
+          (HasDef && Reg0 == Reg2 && Tied2))
+        return nullptr;
+
+      MachineInstr *CommutedMI =
+          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+      if (!CommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (CommutedMI != &MI) {
+        // New instruction. We can't fold from this.
+        CommutedMI->eraseFromParent();
+        return nullptr;
+      }
+
+      // Attempt to fold with the commuted version of the instruction.
+      NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+                                    Size, Align, /*AllowCommute=*/false);
+      if (NewMI)
+        return NewMI;
+
+      // Folding failed again - undo the commute before returning.
+      MachineInstr *UncommutedMI =
+          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+      if (!UncommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (UncommutedMI != &MI) {
+        // New instruction. It doesn't need to be kept.
+        UncommutedMI->eraseFromParent();
+        return nullptr;
+      }
+
+      // Return here to prevent duplicate fuse failure report.
+      return nullptr;
+    }
+  }
+
+  // No fusion
+  if (PrintFailedFusing && !MI.isCopy())
+    dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
+  return nullptr;
+}
+
+/// Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+///   movss (%rdi), %xmm0
+///   cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+///   cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::CVTSI2SSrr:
+  case X86::CVTSI2SSrm:
+  case X86::CVTSI2SS64rr:
+  case X86::CVTSI2SS64rm:
+  case X86::CVTSI2SDrr:
+  case X86::CVTSI2SDrm:
+  case X86::CVTSI2SD64rr:
+  case X86::CVTSI2SD64rm:
+  case X86::CVTSD2SSrr:
+  case X86::CVTSD2SSrm:
+  case X86::CVTSS2SDrr:
+  case X86::CVTSS2SDrm:
+  case X86::MOVHPDrm:
+  case X86::MOVHPSrm:
+  case X86::MOVLPDrm:
+  case X86::MOVLPSrm:
+  case X86::RCPSSr:
+  case X86::RCPSSm:
+  case X86::RCPSSr_Int:
+  case X86::RCPSSm_Int:
+  case X86::ROUNDSDr:
+  case X86::ROUNDSDm:
+  case X86::ROUNDSSr:
+  case X86::ROUNDSSm:
+  case X86::RSQRTSSr:
+  case X86::RSQRTSSm:
+  case X86::RSQRTSSr_Int:
+  case X86::RSQRTSSm_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSm:
+  case X86::SQRTSSr_Int:
+  case X86::SQRTSSm_Int:
+  case X86::SQRTSDr:
+  case X86::SQRTSDm:
+  case X86::SQRTSDr_Int:
+  case X86::SQRTSDm_Int:
+    return true;
+  }
+
+  return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+    const MachineInstr &MI, unsigned OpNum,
+    const TargetRegisterInfo *TRI) const {
+  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+    return 0;
+
+  // If MI is marked as reading Reg, the partial register update is wanted.
+  const MachineOperand &MO = MI.getOperand(0);
+  unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
+      return 0;
+  } else {
+    if (MI.readsRegister(Reg, TRI))
+      return 0;
+  }
+
+  // If any instructions in the clearance range are reading Reg, insert a
+  // dependency breaking instruction, which is inexpensive and is likely to
+  // be hidden in other instruction's cycles.
+  return PartialRegUpdateClearance;
+}
+
+// Return true for any instruction the copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::VCVTSI2SSrr:
+  case X86::VCVTSI2SSrm:
+  case X86::Int_VCVTSI2SSrr:
+  case X86::Int_VCVTSI2SSrm:
+  case X86::VCVTSI2SS64rr:
+  case X86::VCVTSI2SS64rm:
+  case X86::Int_VCVTSI2SS64rr:
+  case X86::Int_VCVTSI2SS64rm:
+  case X86::VCVTSI2SDrr:
+  case X86::VCVTSI2SDrm:
+  case X86::Int_VCVTSI2SDrr:
+  case X86::Int_VCVTSI2SDrm:
+  case X86::VCVTSI2SD64rr:
+  case X86::VCVTSI2SD64rm:
+  case X86::Int_VCVTSI2SD64rr:
+  case X86::Int_VCVTSI2SD64rm:
+  case X86::VCVTSD2SSrr:
+  case X86::VCVTSD2SSrm:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrm:
+  case X86::VCVTSS2SDrr:
+  case X86::VCVTSS2SDrm:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrm:
+  case X86::VRCPSSr:
+  case X86::VRCPSSr_Int:
+  case X86::VRCPSSm:
+  case X86::VRCPSSm_Int:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSDm:
+  case X86::VROUNDSDr_Int:
+  case X86::VROUNDSDm_Int:
+  case X86::VROUNDSSr:
+  case X86::VROUNDSSm:
+  case X86::VROUNDSSr_Int:
+  case X86::VROUNDSSm_Int:
+  case X86::VRSQRTSSr:
+  case X86::VRSQRTSSr_Int:
+  case X86::VRSQRTSSm:
+  case X86::VRSQRTSSm_Int:
+  case X86::VSQRTSSr:
+  case X86::VSQRTSSr_Int:
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSDr_Int:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+  // AVX-512
+  case X86::VCVTSI2SSZrr:
+  case X86::VCVTSI2SSZrm:
+  case X86::VCVTSI2SSZrr_Int:
+  case X86::VCVTSI2SSZrrb_Int:
+  case X86::VCVTSI2SSZrm_Int:
+  case X86::VCVTSI642SSZrr:
+  case X86::VCVTSI642SSZrm:
+  case X86::VCVTSI642SSZrr_Int:
+  case X86::VCVTSI642SSZrrb_Int:
+  case X86::VCVTSI642SSZrm_Int:
+  case X86::VCVTSI2SDZrr:
+  case X86::VCVTSI2SDZrm:
+  case X86::VCVTSI2SDZrr_Int:
+  case X86::VCVTSI2SDZrrb_Int:
+  case X86::VCVTSI2SDZrm_Int:
+  case X86::VCVTSI642SDZrr:
+  case X86::VCVTSI642SDZrm:
+  case X86::VCVTSI642SDZrr_Int:
+  case X86::VCVTSI642SDZrrb_Int:
+  case X86::VCVTSI642SDZrm_Int:
+  case X86::VCVTUSI2SSZrr:
+  case X86::VCVTUSI2SSZrm:
+  case X86::VCVTUSI2SSZrr_Int:
+  case X86::VCVTUSI2SSZrrb_Int:
+  case X86::VCVTUSI2SSZrm_Int:
+  case X86::VCVTUSI642SSZrr:
+  case X86::VCVTUSI642SSZrm:
+  case X86::VCVTUSI642SSZrr_Int:
+  case X86::VCVTUSI642SSZrrb_Int:
+  case X86::VCVTUSI642SSZrm_Int:
+  case X86::VCVTUSI2SDZrr:
+  case X86::VCVTUSI2SDZrm:
+  case X86::VCVTUSI2SDZrr_Int:
+  case X86::VCVTUSI2SDZrm_Int:
+  case X86::VCVTUSI642SDZrr:
+  case X86::VCVTUSI642SDZrm:
+  case X86::VCVTUSI642SDZrr_Int:
+  case X86::VCVTUSI642SDZrrb_Int:
+  case X86::VCVTUSI642SDZrm_Int:
+  case X86::VCVTSD2SSZrr:
+  case X86::VCVTSD2SSZrrb:
+  case X86::VCVTSD2SSZrm:
+  case X86::VCVTSS2SDZrr:
+  case X86::VCVTSS2SDZrrb:
+  case X86::VCVTSS2SDZrm:
+  case X86::VRNDSCALESDr:
+  case X86::VRNDSCALESDrb:
+  case X86::VRNDSCALESDm:
+  case X86::VRNDSCALESSr:
+  case X86::VRNDSCALESSrb:
+  case X86::VRNDSCALESSm:
+  case X86::VRCP14SSrr:
+  case X86::VRCP14SSrm:
+  case X86::VRSQRT14SSrr:
+  case X86::VRSQRT14SSrm:
+  case X86::VSQRTSSZr:
+  case X86::VSQRTSSZr_Int:
+  case X86::VSQRTSSZrb_Int:
+  case X86::VSQRTSSZm:
+  case X86::VSQRTSSZm_Int:
+  case X86::VSQRTSDZr:
+  case X86::VSQRTSDZr_Int:
+  case X86::VSQRTSDZrb_Int:
+  case X86::VSQRTSDZm:
+  case X86::VSQRTSDZm_Int:
+    return true;
+  }
+
+  return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should to be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+                                   const TargetRegisterInfo *TRI) const {
+  if (!hasUndefRegUpdate(MI.getOpcode()))
+    return 0;
+
+  // Set the OpNum parameter to the first source operand.
+  OpNum = 1;
+
+  const MachineOperand &MO = MI.getOperand(OpNum);
+  if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+    return UndefRegClearance;
+  }
+  return 0;
+}
+
+void X86InstrInfo::breakPartialRegDependency(
+    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+  unsigned Reg = MI.getOperand(OpNum).getReg();
+  // If MI kills this register, the false dependence is already broken.
+  if (MI.killsRegister(Reg, TRI))
+    return;
+
+  if (X86::VR128RegClass.contains(Reg)) {
+    // These instructions are all floating point domain, so xorps is the best
+    // choice.
+    unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+    MI.addRegisterKilled(Reg, TRI, true);
+  } else if (X86::VR256RegClass.contains(Reg)) {
+    // Use vxorps to clear the full ymm register.
+    // It wants to read and write the xmm sub-register.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
+        .addReg(XReg, RegState::Undef)
+        .addReg(XReg, RegState::Undef)
+        .addReg(Reg, RegState::ImplicitDefine);
+    MI.addRegisterKilled(Reg, TRI, true);
+  }
+}
+
+MachineInstr *
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                    ArrayRef<unsigned> Ops,
+                                    MachineBasicBlock::iterator InsertPt,
+                                    int FrameIndex, LiveIntervals *LIS) const {
+  // Check switch flag
+  if (NoFusing)
+    return nullptr;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+    return nullptr;
+
+  // Don't fold subreg spills, or reloads that use a high subreg.
+  for (auto Op : Ops) {
+    MachineOperand &MO = MI.getOperand(Op);
+    auto SubReg = MO.getSubReg();
+    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
+      return nullptr;
+  }
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  unsigned Size = MFI.getObjectSize(FrameIndex);
+  unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
+  // If the function stack isn't realigned we don't want to fold instructions
+  // that need increased alignment.
+  if (!RI.needsStackRealignment(MF))
+    Alignment =
+        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    unsigned RCSize = 0;
+    switch (MI.getOpcode()) {
+    default: return nullptr;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+    }
+    // Check if it's safe to fold the load. If the size of the object is
+    // narrower than the load width, then it's not.
+    if (Size < RCSize)
+      return nullptr;
+    // Change to CMPXXri r, 0 first.
+    MI.setDesc(get(NewOpc));
+    MI.getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return nullptr;
+
+  return foldMemoryOperandImpl(MF, MI, Ops[0],
+                               MachineOperand::CreateFI(FrameIndex), InsertPt,
+                               Size, Alignment, /*AllowCommute=*/true);
+}
+
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version.  For instance, this transformation isn't legal:
+///   movss (%rdi), %xmm0
+///   addps %xmm0, %xmm0
+/// ->
+///   addps (%rdi), %xmm0
+///
+/// But this one is:
+///   movss (%rdi), %xmm0
+///   addss %xmm0, %xmm0
+/// ->
+///   addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+                                             const MachineInstr &UserMI,
+                                             const MachineFunction &MF) {
+  unsigned Opc = LoadMI.getOpcode();
+  unsigned UserOpc = UserMI.getOpcode();
+  unsigned RegSize =
+      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
+      RegSize > 4) {
+    // These instructions only load 32 bits, we can't fold them if the
+    // destination register is wider than 32 bits (4 bytes), and its user
+    // instruction isn't scalar (SS).
+    switch (UserOpc) {
+    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
+    case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
+    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
+    case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
+    case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
+    case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+    case X86::VFMADDSS4rr_Int:   case X86::VFNMADDSS4rr_Int:
+    case X86::VFMSUBSS4rr_Int:   case X86::VFNMSUBSS4rr_Int:
+    case X86::VFMADD132SSr_Int:  case X86::VFNMADD132SSr_Int:
+    case X86::VFMADD213SSr_Int:  case X86::VFNMADD213SSr_Int:
+    case X86::VFMADD231SSr_Int:  case X86::VFNMADD231SSr_Int:
+    case X86::VFMSUB132SSr_Int:  case X86::VFNMSUB132SSr_Int:
+    case X86::VFMSUB213SSr_Int:  case X86::VFNMSUB213SSr_Int:
+    case X86::VFMSUB231SSr_Int:  case X86::VFNMSUB231SSr_Int:
+    case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
+    case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
+    case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
+    case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
+    case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
+    case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
+      RegSize > 8) {
+    // These instructions only load 64 bits, we can't fold them if the
+    // destination register is wider than 64 bits (8 bytes), and its user
+    // instruction isn't scalar (SD).
+    switch (UserOpc) {
+    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
+    case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
+    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
+    case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
+    case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
+    case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+    case X86::VFMADDSD4rr_Int:   case X86::VFNMADDSD4rr_Int:
+    case X86::VFMSUBSD4rr_Int:   case X86::VFNMSUBSD4rr_Int:
+    case X86::VFMADD132SDr_Int:  case X86::VFNMADD132SDr_Int:
+    case X86::VFMADD213SDr_Int:  case X86::VFNMADD213SDr_Int:
+    case X86::VFMADD231SDr_Int:  case X86::VFNMADD231SDr_Int:
+    case X86::VFMSUB132SDr_Int:  case X86::VFNMSUB132SDr_Int:
+    case X86::VFMSUB213SDr_Int:  case X86::VFNMSUB213SDr_Int:
+    case X86::VFMSUB231SDr_Int:  case X86::VFNMSUB231SDr_Int:
+    case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
+    case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
+    case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
+    case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
+    case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
+    case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  return false;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+    LiveIntervals *LIS) const {
+
+  // TODO: Support the case where LoadMI loads a wide register, but MI
+  // only uses a subreg.
+  for (auto Op : Ops) {
+    if (MI.getOperand(Op).getSubReg())
+      return nullptr;
+  }
+
+  // If loading from a FrameIndex, fold directly from the FrameIndex.
+  unsigned NumOps = LoadMI.getDesc().getNumOperands();
+  int FrameIndex;
+  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+      return nullptr;
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
+  }
+
+  // Check switch flag
+  if (NoFusing) return nullptr;
+
+  // Avoid partial register update stalls unless optimizing for size.
+  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+    return nullptr;
+
+  // Determine the alignment of the load.
+  unsigned Alignment = 0;
+  if (LoadMI.hasOneMemOperand())
+    Alignment = (*LoadMI.memoperands_begin())->getAlignment();
+  else
+    switch (LoadMI.getOpcode()) {
+    case X86::AVX512_512_SET0:
+    case X86::AVX512_512_SETALLONES:
+      Alignment = 64;
+      break;
+    case X86::AVX2_SETALLONES:
+    case X86::AVX_SET0:
+    case X86::AVX512_256_SET0:
+      Alignment = 32;
+      break;
+    case X86::V_SET0:
+    case X86::V_SETALLONES:
+    case X86::AVX512_128_SET0:
+      Alignment = 16;
+      break;
+    case X86::FsFLD0SD:
+    case X86::AVX512_FsFLD0SD:
+      Alignment = 8;
+      break;
+    case X86::FsFLD0SS:
+    case X86::AVX512_FsFLD0SS:
+      Alignment = 4;
+      break;
+    default:
+      return nullptr;
+    }
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    switch (MI.getOpcode()) {
+    default: return nullptr;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+    }
+    // Change to CMPXXri r, 0 first.
+    MI.setDesc(get(NewOpc));
+    MI.getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return nullptr;
+
+  // Make sure the subregisters match.
+  // Otherwise we risk changing the size of the load.
+  if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
+    return nullptr;
+
+  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+  switch (LoadMI.getOpcode()) {
+  case X86::V_SET0:
+  case X86::V_SETALLONES:
+  case X86::AVX2_SETALLONES:
+  case X86::AVX_SET0:
+  case X86::AVX512_128_SET0:
+  case X86::AVX512_256_SET0:
+  case X86::AVX512_512_SET0:
+  case X86::AVX512_512_SETALLONES:
+  case X86::FsFLD0SD:
+  case X86::AVX512_FsFLD0SD:
+  case X86::FsFLD0SS:
+  case X86::AVX512_FsFLD0SS: {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Create a constant-pool entry and operands to load from it.
+
+    // Medium and large mode can't fold loads this way.
+    if (MF.getTarget().getCodeModel() != CodeModel::Small &&
+        MF.getTarget().getCodeModel() != CodeModel::Kernel)
+      return nullptr;
+
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (MF.getTarget().isPositionIndependent()) {
+      if (Subtarget.is64Bit())
+        PICBase = X86::RIP;
+      else
+        // FIXME: PICBase = getGlobalBaseReg(&MF);
+        // This doesn't work for several reasons.
+        // 1. GlobalBaseReg may have been spilled.
+        // 2. It may not be live at MI.
+        return nullptr;
+    }
+
+    // Create a constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    Type *Ty;
+    unsigned Opc = LoadMI.getOpcode();
+    if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
+      Ty = Type::getFloatTy(MF.getFunction()->getContext());
+    else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
+      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+    else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
+    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
+             Opc == X86::AVX512_256_SET0)
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+    else
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
+                      Opc == X86::AVX512_512_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+    // Create operands to load from the constant pool entry.
+    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+    MOs.push_back(MachineOperand::CreateImm(1));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    break;
+  }
+  default: {
+    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
+      return nullptr;
+
+    // Folding a normal load. Just copy the load's address operands.
+    MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
+               LoadMI.operands_begin() + NumOps);
+    break;
+  }
+  }
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(
+    MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
+    bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
+  auto I = MemOp2RegOpTable.find(MI.getOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return false;
+  UnfoldLoad &= FoldedLoad;
+  if (UnfoldStore && !FoldedStore)
+    return false;
+  UnfoldStore &= FoldedStore;
+
+  const MCInstrDesc &MCID = get(Opc);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  // TODO: Check if 32-byte or greater accesses are slow too?
+  if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
+      Subtarget.isUnalignedMem16Slow())
+    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
+  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+  SmallVector<MachineOperand,2> BeforeOps;
+  SmallVector<MachineOperand,2> AfterOps;
+  SmallVector<MachineOperand,4> ImpOps;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI.getOperand(i);
+    if (i >= Index && i < Index + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (Op.isReg() && Op.isImplicit())
+      ImpOps.push_back(Op);
+    else if (i < Index)
+      BeforeOps.push_back(Op);
+    else if (i > Index)
+      AfterOps.push_back(Op);
+  }
+
+  // Emit the load instruction.
+  if (UnfoldLoad) {
+    std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+        MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+    if (UnfoldStore) {
+      // Address operands cannot be marked isKill.
+      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+        MachineOperand &MO = NewMIs[0]->getOperand(i);
+        if (MO.isReg())
+          MO.setIsKill(false);
+      }
+    }
+  }
+
+  // Emit the data processing instruction.
+  MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, DataMI);
+
+  if (FoldedStore)
+    MIB.addReg(Reg, RegState::Define);
+  for (MachineOperand &BeforeOp : BeforeOps)
+    MIB.addOperand(BeforeOp);
+  if (FoldedLoad)
+    MIB.addReg(Reg);
+  for (MachineOperand &AfterOp : AfterOps)
+    MIB.addOperand(AfterOp);
+  for (MachineOperand &ImpOp : ImpOps) {
+    MIB.addReg(ImpOp.getReg(),
+               getDefRegState(ImpOp.isDef()) |
+               RegState::Implicit |
+               getKillRegState(ImpOp.isKill()) |
+               getDeadRegState(ImpOp.isDead()) |
+               getUndefRegState(ImpOp.isUndef()));
+  }
+  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+  switch (DataMI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri: {
+    MachineOperand &MO0 = DataMI->getOperand(0);
+    MachineOperand &MO1 = DataMI->getOperand(1);
+    if (MO1.getImm() == 0) {
+      unsigned NewOpc;
+      switch (DataMI->getOpcode()) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::CMP64ri8:
+      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+      case X86::CMP32ri8:
+      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
+      case X86::CMP16ri8:
+      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
+      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
+      }
+      DataMI->setDesc(get(NewOpc));
+      MO1.ChangeToRegister(MO0.getReg(), false);
+    }
+  }
+  }
+  NewMIs.push_back(DataMI);
+
+  // Emit the store instruction.
+  if (UnfoldStore) {
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+    std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
+        MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+  }
+
+  return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+  if (!N->isMachineOpcode())
+    return false;
+
+  auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  const MCInstrDesc &MCID = get(Opc);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  unsigned NumDefs = MCID.NumDefs;
+  std::vector<SDValue> AddrOps;
+  std::vector<SDValue> BeforeOps;
+  std::vector<SDValue> AfterOps;
+  SDLoc dl(N);
+  unsigned NumOps = N->getNumOperands();
+  for (unsigned i = 0; i != NumOps-1; ++i) {
+    SDValue Op = N->getOperand(i);
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (i < Index-NumDefs)
+      BeforeOps.push_back(Op);
+    else if (i > Index-NumDefs)
+      AfterOps.push_back(Op);
+  }
+  SDValue Chain = N->getOperand(NumOps-1);
+  AddrOps.push_back(Chain);
+
+  // Emit the load instruction.
+  SDNode *Load = nullptr;
+  if (FoldedLoad) {
+    EVT VT = *RC->vt_begin();
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                            cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        Subtarget.isUnalignedMem16Slow())
+      // Do not introduce a slow unaligned load.
+      return false;
+    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+    // memory access is slow above.
+    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
+                              VT, MVT::Other, AddrOps);
+    NewNodes.push_back(Load);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  // Emit the data processing instruction.
+  std::vector<EVT> VTs;
+  const TargetRegisterClass *DstRC = nullptr;
+  if (MCID.getNumDefs() > 0) {
+    DstRC = getRegClass(MCID, 0, &RI, MF);
+    VTs.push_back(*DstRC->vt_begin());
+  }
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+      VTs.push_back(VT);
+  }
+  if (Load)
+    BeforeOps.push_back(SDValue(Load, 0));
+  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
+  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
+  NewNodes.push_back(NewNode);
+
+  // Emit the store instruction.
+  if (FoldedStore) {
+    AddrOps.pop_back();
+    AddrOps.push_back(SDValue(NewNode, 0));
+    AddrOps.push_back(Chain);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                             cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        Subtarget.isUnalignedMem16Slow())
+      // Do not introduce a slow unaligned store.
+      return false;
+    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+    // memory access is slow above.
+    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    SDNode *Store =
+        DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
+                           dl, MVT::Other, AddrOps);
+    NewNodes.push_back(Store);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex) const {
+  auto I = MemOp2RegOpTable.find(Opc);
+  if (I == MemOp2RegOpTable.end())
+    return 0;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return 0;
+  if (UnfoldStore && !FoldedStore)
+    return 0;
+  if (LoadRegIndex)
+    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+  return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                     int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  // AVX512 load instructions
+  case X86::VMOVSSZrm:
+  case X86::VMOVSDZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+  case X86::KMOVBkm:
+  case X86::KMOVWkm:
+  case X86::KMOVDkm:
+  case X86::KMOVQkm:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVUPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  // AVX512 load instructions
+  case X86::VMOVSSZrm:
+  case X86::VMOVSDZrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+  case X86::VMOVAPSZ256rm:
+  case X86::VMOVUPSZ256rm:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
+  case X86::VMOVAPDZ256rm:
+  case X86::VMOVUPDZ256rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
+  case X86::VMOVDQA32Z256rm:
+  case X86::VMOVDQU32Z256rm:
+  case X86::VMOVDQA64Z256rm:
+  case X86::VMOVDQU64Z256rm:
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
+  case X86::VMOVDQU8Zrm:
+  case X86::VMOVDQU16Zrm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+  case X86::KMOVBkm:
+  case X86::KMOVWkm:
+  case X86::KMOVDkm:
+  case X86::KMOVQkm:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  if (Opc1 != Opc2)
+    return false;  // FIXME: overly conservative?
+
+  switch (Opc1) {
+  default: break;
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  }
+
+  EVT VT = Load1->getValueType(0);
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    if (Subtarget.is64Bit()) {
+      if (NumLoads >= 3)
+        return false;
+    } else if (NumLoads) {
+      return false;
+    }
+    break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (NumLoads)
+      return false;
+    break;
+  }
+
+  return true;
+}
+
+bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
+                                          const MachineInstr &Second) const {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!Subtarget.hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  switch (Second.getOpcode()) {
+  default:
+    return false;
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
+    FuseKind = FuseTest;
+    break;
+  }
+  switch (First.getOpcode()) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::TEST8ri_NOREX:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  }
+}
+
+bool X86InstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!Subtarget.is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  X86FI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const uint16_t ReplaceableInstrs[][3] = {
+  //PackedSingle     PackedDouble    PackedInt
+  { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
+  { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
+  { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
+  { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
+  { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
+  { X86::MOVLPSmr,   X86::MOVLPDmr,  X86::MOVPQI2QImr },
+  { X86::MOVSSmr,    X86::MOVSSmr,   X86::MOVPDI2DImr },
+  { X86::MOVSDrm,    X86::MOVSDrm,   X86::MOVQI2PQIrm },
+  { X86::MOVSSrm,    X86::MOVSSrm,   X86::MOVDI2PDIrm },
+  { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
+  { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
+  { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
+  { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
+  { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
+  { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
+  { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
+  { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
+  { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
+  // AVX 128-bit support
+  { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
+  { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
+  { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
+  { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
+  { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
+  { X86::VMOVLPSmr,  X86::VMOVLPDmr,  X86::VMOVPQI2QImr },
+  { X86::VMOVSSmr,   X86::VMOVSSmr,   X86::VMOVPDI2DImr },
+  { X86::VMOVSDrm,   X86::VMOVSDrm,   X86::VMOVQI2PQIrm },
+  { X86::VMOVSSrm,   X86::VMOVSSrm,   X86::VMOVDI2PDIrm },
+  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+  { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
+  { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
+  { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
+  { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
+  { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
+  { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
+  { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
+  { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
+  // AVX 256-bit support
+  { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
+  { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
+  { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
+  { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
+  { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
+  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr },
+  // AVX512 support
+  { X86::VMOVLPSZ128mr,  X86::VMOVLPDZ128mr,  X86::VMOVPQI2QIZmr  },
+  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZmr,    X86::VMOVNTPDZmr,    X86::VMOVNTDQZmr    },
+  { X86::VMOVSDZmr,      X86::VMOVSDZmr,      X86::VMOVPQI2QIZmr  },
+  { X86::VMOVSSZmr,      X86::VMOVSSZmr,      X86::VMOVPDI2DIZmr  },
+  { X86::VMOVSDZrm,      X86::VMOVSDZrm,      X86::VMOVQI2PQIZrm  },
+  { X86::VMOVSSZrm,      X86::VMOVSSZrm,      X86::VMOVDI2PDIZrm  },
+  { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
+  { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
+  { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
+  { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
+  { X86::VBROADCASTSSZr,    X86::VBROADCASTSSZr,    X86::VPBROADCASTDZr },
+  { X86::VBROADCASTSSZm,    X86::VBROADCASTSSZm,    X86::VPBROADCASTDZm },
+  { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
+  { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
+  { X86::VBROADCASTSDZr,    X86::VBROADCASTSDZr,    X86::VPBROADCASTQZr },
+  { X86::VBROADCASTSDZm,    X86::VBROADCASTSDZm,    X86::VPBROADCASTQZm },
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+  //PackedSingle       PackedDouble       PackedInt
+  { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
+  { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
+  { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
+  { X86::VANDPSYrr,    X86::VANDPDYrr,    X86::VPANDYrr    },
+  { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
+  { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
+  { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
+  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
+  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
+  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
+  { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
+  { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
+  { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
+  { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
+  { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
+  { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
+  { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
+  { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
+  { X86::VBROADCASTF128,  X86::VBROADCASTF128,  X86::VBROADCASTI128 },
+};
+
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle        PackedDouble        PackedInt             PackedInt
+  { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr  },
+  { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm  },
+  { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr  },
+  { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr  },
+  { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm  },
+  { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr  },
+  { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm  },
+  { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr  },
+  { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr  },
+  { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm  },
+  { X86::VMOVAPSZmr,    X86::VMOVAPDZmr,    X86::VMOVDQA64Zmr,    X86::VMOVDQA32Zmr     },
+  { X86::VMOVAPSZrm,    X86::VMOVAPDZrm,    X86::VMOVDQA64Zrm,    X86::VMOVDQA32Zrm     },
+  { X86::VMOVAPSZrr,    X86::VMOVAPDZrr,    X86::VMOVDQA64Zrr,    X86::VMOVDQA32Zrr     },
+  { X86::VMOVUPSZmr,    X86::VMOVUPDZmr,    X86::VMOVDQU64Zmr,    X86::VMOVDQU32Zmr     },
+  { X86::VMOVUPSZrm,    X86::VMOVUPDZrm,    X86::VMOVDQU64Zrm,    X86::VMOVDQU32Zrm     },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle        PackedDouble        PackedInt           PackedInt
+  { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+  { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+  { X86::VANDPSZ128rm,  X86::VANDPDZ128rm,  X86::VPANDQZ128rm,  X86::VPANDDZ128rm  },
+  { X86::VANDPSZ128rr,  X86::VANDPDZ128rr,  X86::VPANDQZ128rr,  X86::VPANDDZ128rr  },
+  { X86::VORPSZ128rm,   X86::VORPDZ128rm,   X86::VPORQZ128rm,   X86::VPORDZ128rm   },
+  { X86::VORPSZ128rr,   X86::VORPDZ128rr,   X86::VPORQZ128rr,   X86::VPORDZ128rr   },
+  { X86::VXORPSZ128rm,  X86::VXORPDZ128rm,  X86::VPXORQZ128rm,  X86::VPXORDZ128rm  },
+  { X86::VXORPSZ128rr,  X86::VXORPDZ128rr,  X86::VPXORQZ128rr,  X86::VPXORDZ128rr  },
+  { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+  { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+  { X86::VANDPSZ256rm,  X86::VANDPDZ256rm,  X86::VPANDQZ256rm,  X86::VPANDDZ256rm  },
+  { X86::VANDPSZ256rr,  X86::VANDPDZ256rr,  X86::VPANDQZ256rr,  X86::VPANDDZ256rr  },
+  { X86::VORPSZ256rm,   X86::VORPDZ256rm,   X86::VPORQZ256rm,   X86::VPORDZ256rm   },
+  { X86::VORPSZ256rr,   X86::VORPDZ256rr,   X86::VPORQZ256rr,   X86::VPORDZ256rr   },
+  { X86::VXORPSZ256rm,  X86::VXORPDZ256rm,  X86::VPXORQZ256rm,  X86::VPXORDZ256rm  },
+  { X86::VXORPSZ256rr,  X86::VXORPDZ256rr,  X86::VPXORQZ256rr,  X86::VPXORDZ256rr  },
+  { X86::VANDNPSZrm,    X86::VANDNPDZrm,    X86::VPANDNQZrm,    X86::VPANDNDZrm    },
+  { X86::VANDNPSZrr,    X86::VANDNPDZrr,    X86::VPANDNQZrr,    X86::VPANDNDZrr    },
+  { X86::VANDPSZrm,     X86::VANDPDZrm,     X86::VPANDQZrm,     X86::VPANDDZrm     },
+  { X86::VANDPSZrr,     X86::VANDPDZrr,     X86::VPANDQZrr,     X86::VPANDDZrr     },
+  { X86::VORPSZrm,      X86::VORPDZrm,      X86::VPORQZrm,      X86::VPORDZrm      },
+  { X86::VORPSZrr,      X86::VORPDZrr,      X86::VPORQZrr,      X86::VPORDZrr      },
+  { X86::VXORPSZrm,     X86::VXORPDZrm,     X86::VPXORQZrm,     X86::VPXORDZrm     },
+  { X86::VXORPSZrr,     X86::VXORPDZrr,     X86::VPXORQZrr,     X86::VPXORDZrr     },
+};
+
+static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle          PackedDouble
+  //PackedInt             PackedInt
+  { X86::VANDNPSZ128rmk,  X86::VANDNPDZ128rmk,
+    X86::VPANDNQZ128rmk,  X86::VPANDNDZ128rmk  },
+  { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
+    X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
+  { X86::VANDNPSZ128rrk,  X86::VANDNPDZ128rrk,
+    X86::VPANDNQZ128rrk,  X86::VPANDNDZ128rrk  },
+  { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
+    X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
+  { X86::VANDPSZ128rmk,   X86::VANDPDZ128rmk,
+    X86::VPANDQZ128rmk,   X86::VPANDDZ128rmk   },
+  { X86::VANDPSZ128rmkz,  X86::VANDPDZ128rmkz,
+    X86::VPANDQZ128rmkz,  X86::VPANDDZ128rmkz  },
+  { X86::VANDPSZ128rrk,   X86::VANDPDZ128rrk,
+    X86::VPANDQZ128rrk,   X86::VPANDDZ128rrk   },
+  { X86::VANDPSZ128rrkz,  X86::VANDPDZ128rrkz,
+    X86::VPANDQZ128rrkz,  X86::VPANDDZ128rrkz  },
+  { X86::VORPSZ128rmk,    X86::VORPDZ128rmk,
+    X86::VPORQZ128rmk,    X86::VPORDZ128rmk    },
+  { X86::VORPSZ128rmkz,   X86::VORPDZ128rmkz,
+    X86::VPORQZ128rmkz,   X86::VPORDZ128rmkz   },
+  { X86::VORPSZ128rrk,    X86::VORPDZ128rrk,
+    X86::VPORQZ128rrk,    X86::VPORDZ128rrk    },
+  { X86::VORPSZ128rrkz,   X86::VORPDZ128rrkz,
+    X86::VPORQZ128rrkz,   X86::VPORDZ128rrkz   },
+  { X86::VXORPSZ128rmk,   X86::VXORPDZ128rmk,
+    X86::VPXORQZ128rmk,   X86::VPXORDZ128rmk   },
+  { X86::VXORPSZ128rmkz,  X86::VXORPDZ128rmkz,
+    X86::VPXORQZ128rmkz,  X86::VPXORDZ128rmkz  },
+  { X86::VXORPSZ128rrk,   X86::VXORPDZ128rrk,
+    X86::VPXORQZ128rrk,   X86::VPXORDZ128rrk   },
+  { X86::VXORPSZ128rrkz,  X86::VXORPDZ128rrkz,
+    X86::VPXORQZ128rrkz,  X86::VPXORDZ128rrkz  },
+  { X86::VANDNPSZ256rmk,  X86::VANDNPDZ256rmk,
+    X86::VPANDNQZ256rmk,  X86::VPANDNDZ256rmk  },
+  { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
+    X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
+  { X86::VANDNPSZ256rrk,  X86::VANDNPDZ256rrk,
+    X86::VPANDNQZ256rrk,  X86::VPANDNDZ256rrk  },
+  { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
+    X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
+  { X86::VANDPSZ256rmk,   X86::VANDPDZ256rmk,
+    X86::VPANDQZ256rmk,   X86::VPANDDZ256rmk   },
+  { X86::VANDPSZ256rmkz,  X86::VANDPDZ256rmkz,
+    X86::VPANDQZ256rmkz,  X86::VPANDDZ256rmkz  },
+  { X86::VANDPSZ256rrk,   X86::VANDPDZ256rrk,
+    X86::VPANDQZ256rrk,   X86::VPANDDZ256rrk   },
+  { X86::VANDPSZ256rrkz,  X86::VANDPDZ256rrkz,
+    X86::VPANDQZ256rrkz,  X86::VPANDDZ256rrkz  },
+  { X86::VORPSZ256rmk,    X86::VORPDZ256rmk,
+    X86::VPORQZ256rmk,    X86::VPORDZ256rmk    },
+  { X86::VORPSZ256rmkz,   X86::VORPDZ256rmkz,
+    X86::VPORQZ256rmkz,   X86::VPORDZ256rmkz   },
+  { X86::VORPSZ256rrk,    X86::VORPDZ256rrk,
+    X86::VPORQZ256rrk,    X86::VPORDZ256rrk    },
+  { X86::VORPSZ256rrkz,   X86::VORPDZ256rrkz,
+    X86::VPORQZ256rrkz,   X86::VPORDZ256rrkz   },
+  { X86::VXORPSZ256rmk,   X86::VXORPDZ256rmk,
+    X86::VPXORQZ256rmk,   X86::VPXORDZ256rmk   },
+  { X86::VXORPSZ256rmkz,  X86::VXORPDZ256rmkz,
+    X86::VPXORQZ256rmkz,  X86::VPXORDZ256rmkz  },
+  { X86::VXORPSZ256rrk,   X86::VXORPDZ256rrk,
+    X86::VPXORQZ256rrk,   X86::VPXORDZ256rrk   },
+  { X86::VXORPSZ256rrkz,  X86::VXORPDZ256rrkz,
+    X86::VPXORQZ256rrkz,  X86::VPXORDZ256rrkz  },
+  { X86::VANDNPSZrmk,     X86::VANDNPDZrmk,
+    X86::VPANDNQZrmk,     X86::VPANDNDZrmk     },
+  { X86::VANDNPSZrmkz,    X86::VANDNPDZrmkz,
+    X86::VPANDNQZrmkz,    X86::VPANDNDZrmkz    },
+  { X86::VANDNPSZrrk,     X86::VANDNPDZrrk,
+    X86::VPANDNQZrrk,     X86::VPANDNDZrrk     },
+  { X86::VANDNPSZrrkz,    X86::VANDNPDZrrkz,
+    X86::VPANDNQZrrkz,    X86::VPANDNDZrrkz    },
+  { X86::VANDPSZrmk,      X86::VANDPDZrmk,
+    X86::VPANDQZrmk,      X86::VPANDDZrmk      },
+  { X86::VANDPSZrmkz,     X86::VANDPDZrmkz,
+    X86::VPANDQZrmkz,     X86::VPANDDZrmkz     },
+  { X86::VANDPSZrrk,      X86::VANDPDZrrk,
+    X86::VPANDQZrrk,      X86::VPANDDZrrk      },
+  { X86::VANDPSZrrkz,     X86::VANDPDZrrkz,
+    X86::VPANDQZrrkz,     X86::VPANDDZrrkz     },
+  { X86::VORPSZrmk,       X86::VORPDZrmk,
+    X86::VPORQZrmk,       X86::VPORDZrmk       },
+  { X86::VORPSZrmkz,      X86::VORPDZrmkz,
+    X86::VPORQZrmkz,      X86::VPORDZrmkz      },
+  { X86::VORPSZrrk,       X86::VORPDZrrk,
+    X86::VPORQZrrk,       X86::VPORDZrrk       },
+  { X86::VORPSZrrkz,      X86::VORPDZrrkz,
+    X86::VPORQZrrkz,      X86::VPORDZrrkz      },
+  { X86::VXORPSZrmk,      X86::VXORPDZrmk,
+    X86::VPXORQZrmk,      X86::VPXORDZrmk      },
+  { X86::VXORPSZrmkz,     X86::VXORPDZrmkz,
+    X86::VPXORQZrmkz,     X86::VPXORDZrmkz     },
+  { X86::VXORPSZrrk,      X86::VXORPDZrrk,
+    X86::VPXORQZrrk,      X86::VPXORDZrrk      },
+  { X86::VXORPSZrrkz,     X86::VXORPDZrrkz,
+    X86::VPXORQZrrkz,     X86::VPXORDZrrkz     },
+  // Broadcast loads can be handled the same as masked operations to avoid
+  // changing element size.
+  { X86::VANDNPSZ128rmb,  X86::VANDNPDZ128rmb,
+    X86::VPANDNQZ128rmb,  X86::VPANDNDZ128rmb  },
+  { X86::VANDPSZ128rmb,   X86::VANDPDZ128rmb,
+    X86::VPANDQZ128rmb,   X86::VPANDDZ128rmb   },
+  { X86::VORPSZ128rmb,    X86::VORPDZ128rmb,
+    X86::VPORQZ128rmb,    X86::VPORDZ128rmb    },
+  { X86::VXORPSZ128rmb,   X86::VXORPDZ128rmb,
+    X86::VPXORQZ128rmb,   X86::VPXORDZ128rmb   },
+  { X86::VANDNPSZ256rmb,  X86::VANDNPDZ256rmb,
+    X86::VPANDNQZ256rmb,  X86::VPANDNDZ256rmb  },
+  { X86::VANDPSZ256rmb,   X86::VANDPDZ256rmb,
+    X86::VPANDQZ256rmb,   X86::VPANDDZ256rmb   },
+  { X86::VORPSZ256rmb,    X86::VORPDZ256rmb,
+    X86::VPORQZ256rmb,    X86::VPORDZ256rmb    },
+  { X86::VXORPSZ256rmb,   X86::VXORPDZ256rmb,
+    X86::VPXORQZ256rmb,   X86::VPXORDZ256rmb   },
+  { X86::VANDNPSZrmb,     X86::VANDNPDZrmb,
+    X86::VPANDNQZrmb,     X86::VPANDNDZrmb     },
+  { X86::VANDPSZrmb,      X86::VANDPDZrmb,
+    X86::VPANDQZrmb,      X86::VPANDDZrmb      },
+  { X86::VANDPSZrmb,      X86::VANDPDZrmb,
+    X86::VPANDQZrmb,      X86::VPANDDZrmb      },
+  { X86::VORPSZrmb,       X86::VORPDZrmb,
+    X86::VPORQZrmb,       X86::VPORDZrmb       },
+  { X86::VXORPSZrmb,      X86::VXORPDZrmb,
+    X86::VPXORQZrmb,      X86::VPXORDZrmb      },
+  { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
+    X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
+  { X86::VANDPSZ128rmbk,  X86::VANDPDZ128rmbk,
+    X86::VPANDQZ128rmbk,  X86::VPANDDZ128rmbk  },
+  { X86::VORPSZ128rmbk,   X86::VORPDZ128rmbk,
+    X86::VPORQZ128rmbk,   X86::VPORDZ128rmbk   },
+  { X86::VXORPSZ128rmbk,  X86::VXORPDZ128rmbk,
+    X86::VPXORQZ128rmbk,  X86::VPXORDZ128rmbk  },
+  { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
+    X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
+  { X86::VANDPSZ256rmbk,  X86::VANDPDZ256rmbk,
+    X86::VPANDQZ256rmbk,  X86::VPANDDZ256rmbk  },
+  { X86::VORPSZ256rmbk,   X86::VORPDZ256rmbk,
+    X86::VPORQZ256rmbk,   X86::VPORDZ256rmbk   },
+  { X86::VXORPSZ256rmbk,  X86::VXORPDZ256rmbk,
+    X86::VPXORQZ256rmbk,  X86::VPXORDZ256rmbk  },
+  { X86::VANDNPSZrmbk,    X86::VANDNPDZrmbk,
+    X86::VPANDNQZrmbk,    X86::VPANDNDZrmbk    },
+  { X86::VANDPSZrmbk,     X86::VANDPDZrmbk,
+    X86::VPANDQZrmbk,     X86::VPANDDZrmbk     },
+  { X86::VANDPSZrmbk,     X86::VANDPDZrmbk,
+    X86::VPANDQZrmbk,     X86::VPANDDZrmbk     },
+  { X86::VORPSZrmbk,      X86::VORPDZrmbk,
+    X86::VPORQZrmbk,      X86::VPORDZrmbk      },
+  { X86::VXORPSZrmbk,     X86::VXORPDZrmbk,
+    X86::VPXORQZrmbk,     X86::VPXORDZrmbk     },
+  { X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
+    X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
+  { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
+    X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
+  { X86::VORPSZ128rmbkz,  X86::VORPDZ128rmbkz,
+    X86::VPORQZ128rmbkz,  X86::VPORDZ128rmbkz  },
+  { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
+    X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
+  { X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
+    X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
+  { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
+    X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
+  { X86::VORPSZ256rmbkz,  X86::VORPDZ256rmbkz,
+    X86::VPORQZ256rmbkz,  X86::VPORDZ256rmbkz  },
+  { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
+    X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
+  { X86::VANDNPSZrmbkz,   X86::VANDNPDZrmbkz,
+    X86::VPANDNQZrmbkz,   X86::VPANDNDZrmbkz   },
+  { X86::VANDPSZrmbkz,    X86::VANDPDZrmbkz,
+    X86::VPANDQZrmbkz,    X86::VPANDDZrmbkz    },
+  { X86::VANDPSZrmbkz,    X86::VANDPDZrmbkz,
+    X86::VPANDQZrmbkz,    X86::VPANDDZrmbkz    },
+  { X86::VORPSZrmbkz,     X86::VORPDZrmbkz,
+    X86::VPORQZrmbkz,     X86::VPORDZrmbkz     },
+  { X86::VXORPSZrmbkz,    X86::VXORPDZrmbkz,
+    X86::VPXORQZrmbkz,    X86::VPXORDZrmbkz    },
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain,
+                              ArrayRef<uint16_t[3]> Table) {
+  for (const uint16_t (&Row)[3] : Table)
+    if (Row[domain-1] == opcode)
+      return Row;
+  return nullptr;
+}
+
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
+                                    ArrayRef<uint16_t[4]> Table) {
+  // If this is the integer domain make sure to check both integer columns.
+  for (const uint16_t (&Row)[4] : Table)
+    if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
+      return Row;
+  return nullptr;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  unsigned opcode = MI.getOpcode();
+  uint16_t validDomains = 0;
+  if (domain) {
+    if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
+      validDomains = 0xe;
+    } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
+      validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
+      validDomains = 0xe;
+    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
+      validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
+    } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+                                             ReplaceableInstrsAVX512DQMasked)) {
+      if (domain == 1 || (domain == 3 && table[3] == opcode))
+        validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
+      else
+        validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+    }
+  }
+  return std::make_pair(domain, validDomains);
+}
+
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  assert(dom && "Not an SSE instruction");
+  const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
+  if (!table) { // try the other table
+    assert((Subtarget.hasAVX2() || Domain < 3) &&
+           "256-bit vector operations only available in AVX2");
+    table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
+  }
+  if (!table) { // try the AVX512 table
+    assert(Subtarget.hasAVX512() && "Requires AVX-512");
+    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
+    // Don't change integer Q instructions to D instructions.
+    if (table && Domain == 3 && table[3] == MI.getOpcode())
+      Domain = 4;
+  }
+  if (!table) { // try the AVX512DQ table
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
+    // Don't change integer Q instructions to D instructions and
+    // use D intructions if we started with a PS instruction.
+    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+      Domain = 4;
+  }
+  if (!table) { // try the AVX512DQMasked table
+    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
+    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
+    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
+      Domain = 4;
+  }
+  assert(table && "Cannot change domain");
+  MI.setDesc(get(table[Domain - 1]));
+}
+
+/// Return the noop instruction to use for a noop.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(X86::NOOP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+  switch (opc) {
+  default: return false;
+  case X86::DIVPDrm:
+  case X86::DIVPDrr:
+  case X86::DIVPSrm:
+  case X86::DIVPSrr:
+  case X86::DIVSDrm:
+  case X86::DIVSDrm_Int:
+  case X86::DIVSDrr:
+  case X86::DIVSDrr_Int:
+  case X86::DIVSSrm:
+  case X86::DIVSSrm_Int:
+  case X86::DIVSSrr:
+  case X86::DIVSSrr_Int:
+  case X86::SQRTPDm:
+  case X86::SQRTPDr:
+  case X86::SQRTPSm:
+  case X86::SQRTPSr:
+  case X86::SQRTSDm:
+  case X86::SQRTSDm_Int:
+  case X86::SQRTSDr:
+  case X86::SQRTSDr_Int:
+  case X86::SQRTSSm:
+  case X86::SQRTSSm_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+  // AVX instructions with high latency
+  case X86::VDIVPDrm:
+  case X86::VDIVPDrr:
+  case X86::VDIVPDYrm:
+  case X86::VDIVPDYrr:
+  case X86::VDIVPSrm:
+  case X86::VDIVPSrr:
+  case X86::VDIVPSYrm:
+  case X86::VDIVPSYrr:
+  case X86::VDIVSDrm:
+  case X86::VDIVSDrm_Int:
+  case X86::VDIVSDrr:
+  case X86::VDIVSDrr_Int:
+  case X86::VDIVSSrm:
+  case X86::VDIVSSrm_Int:
+  case X86::VDIVSSrr:
+  case X86::VDIVSSrr_Int:
+  case X86::VSQRTPDm:
+  case X86::VSQRTPDr:
+  case X86::VSQRTPDYm:
+  case X86::VSQRTPDYr:
+  case X86::VSQRTPSm:
+  case X86::VSQRTPSr:
+  case X86::VSQRTPSYm:
+  case X86::VSQRTPSYr:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSDr_Int:
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSSr:
+  case X86::VSQRTSSr_Int:
+  // AVX512 instructions with high latency
+  case X86::VDIVPDZ128rm:
+  case X86::VDIVPDZ128rmb:
+  case X86::VDIVPDZ128rmbk:
+  case X86::VDIVPDZ128rmbkz:
+  case X86::VDIVPDZ128rmk:
+  case X86::VDIVPDZ128rmkz:
+  case X86::VDIVPDZ128rr:
+  case X86::VDIVPDZ128rrk:
+  case X86::VDIVPDZ128rrkz:
+  case X86::VDIVPDZ256rm:
+  case X86::VDIVPDZ256rmb:
+  case X86::VDIVPDZ256rmbk:
+  case X86::VDIVPDZ256rmbkz:
+  case X86::VDIVPDZ256rmk:
+  case X86::VDIVPDZ256rmkz:
+  case X86::VDIVPDZ256rr:
+  case X86::VDIVPDZ256rrk:
+  case X86::VDIVPDZ256rrkz:
+  case X86::VDIVPDZrb:
+  case X86::VDIVPDZrbk:
+  case X86::VDIVPDZrbkz:
+  case X86::VDIVPDZrm:
+  case X86::VDIVPDZrmb:
+  case X86::VDIVPDZrmbk:
+  case X86::VDIVPDZrmbkz:
+  case X86::VDIVPDZrmk:
+  case X86::VDIVPDZrmkz:
+  case X86::VDIVPDZrr:
+  case X86::VDIVPDZrrk:
+  case X86::VDIVPDZrrkz:
+  case X86::VDIVPSZ128rm:
+  case X86::VDIVPSZ128rmb:
+  case X86::VDIVPSZ128rmbk:
+  case X86::VDIVPSZ128rmbkz:
+  case X86::VDIVPSZ128rmk:
+  case X86::VDIVPSZ128rmkz:
+  case X86::VDIVPSZ128rr:
+  case X86::VDIVPSZ128rrk:
+  case X86::VDIVPSZ128rrkz:
+  case X86::VDIVPSZ256rm:
+  case X86::VDIVPSZ256rmb:
+  case X86::VDIVPSZ256rmbk:
+  case X86::VDIVPSZ256rmbkz:
+  case X86::VDIVPSZ256rmk:
+  case X86::VDIVPSZ256rmkz:
+  case X86::VDIVPSZ256rr:
+  case X86::VDIVPSZ256rrk:
+  case X86::VDIVPSZ256rrkz:
+  case X86::VDIVPSZrb:
+  case X86::VDIVPSZrbk:
+  case X86::VDIVPSZrbkz:
+  case X86::VDIVPSZrm:
+  case X86::VDIVPSZrmb:
+  case X86::VDIVPSZrmbk:
+  case X86::VDIVPSZrmbkz:
+  case X86::VDIVPSZrmk:
+  case X86::VDIVPSZrmkz:
+  case X86::VDIVPSZrr:
+  case X86::VDIVPSZrrk:
+  case X86::VDIVPSZrrkz:
+  case X86::VDIVSDZrm:
+  case X86::VDIVSDZrr:
+  case X86::VDIVSDZrm_Int:
+  case X86::VDIVSDZrm_Intk:
+  case X86::VDIVSDZrm_Intkz:
+  case X86::VDIVSDZrr_Int:
+  case X86::VDIVSDZrr_Intk:
+  case X86::VDIVSDZrr_Intkz:
+  case X86::VDIVSDZrrb:
+  case X86::VDIVSDZrrbk:
+  case X86::VDIVSDZrrbkz:
+  case X86::VDIVSSZrm:
+  case X86::VDIVSSZrr:
+  case X86::VDIVSSZrm_Int:
+  case X86::VDIVSSZrm_Intk:
+  case X86::VDIVSSZrm_Intkz:
+  case X86::VDIVSSZrr_Int:
+  case X86::VDIVSSZrr_Intk:
+  case X86::VDIVSSZrr_Intkz:
+  case X86::VDIVSSZrrb:
+  case X86::VDIVSSZrrbk:
+  case X86::VDIVSSZrrbkz:
+  case X86::VSQRTPDZ128m:
+  case X86::VSQRTPDZ128mb:
+  case X86::VSQRTPDZ128mbk:
+  case X86::VSQRTPDZ128mbkz:
+  case X86::VSQRTPDZ128mk:
+  case X86::VSQRTPDZ128mkz:
+  case X86::VSQRTPDZ128r:
+  case X86::VSQRTPDZ128rk:
+  case X86::VSQRTPDZ128rkz:
+  case X86::VSQRTPDZ256m:
+  case X86::VSQRTPDZ256mb:
+  case X86::VSQRTPDZ256mbk:
+  case X86::VSQRTPDZ256mbkz:
+  case X86::VSQRTPDZ256mk:
+  case X86::VSQRTPDZ256mkz:
+  case X86::VSQRTPDZ256r:
+  case X86::VSQRTPDZ256rk:
+  case X86::VSQRTPDZ256rkz:
+  case X86::VSQRTPDZm:
+  case X86::VSQRTPDZmb:
+  case X86::VSQRTPDZmbk:
+  case X86::VSQRTPDZmbkz:
+  case X86::VSQRTPDZmk:
+  case X86::VSQRTPDZmkz:
+  case X86::VSQRTPDZr:
+  case X86::VSQRTPDZrb:
+  case X86::VSQRTPDZrbk:
+  case X86::VSQRTPDZrbkz:
+  case X86::VSQRTPDZrk:
+  case X86::VSQRTPDZrkz:
+  case X86::VSQRTPSZ128m:
+  case X86::VSQRTPSZ128mb:
+  case X86::VSQRTPSZ128mbk:
+  case X86::VSQRTPSZ128mbkz:
+  case X86::VSQRTPSZ128mk:
+  case X86::VSQRTPSZ128mkz:
+  case X86::VSQRTPSZ128r:
+  case X86::VSQRTPSZ128rk:
+  case X86::VSQRTPSZ128rkz:
+  case X86::VSQRTPSZ256m:
+  case X86::VSQRTPSZ256mb:
+  case X86::VSQRTPSZ256mbk:
+  case X86::VSQRTPSZ256mbkz:
+  case X86::VSQRTPSZ256mk:
+  case X86::VSQRTPSZ256mkz:
+  case X86::VSQRTPSZ256r:
+  case X86::VSQRTPSZ256rk:
+  case X86::VSQRTPSZ256rkz:
+  case X86::VSQRTPSZm:
+  case X86::VSQRTPSZmb:
+  case X86::VSQRTPSZmbk:
+  case X86::VSQRTPSZmbkz:
+  case X86::VSQRTPSZmk:
+  case X86::VSQRTPSZmkz:
+  case X86::VSQRTPSZr:
+  case X86::VSQRTPSZrb:
+  case X86::VSQRTPSZrbk:
+  case X86::VSQRTPSZrbkz:
+  case X86::VSQRTPSZrk:
+  case X86::VSQRTPSZrkz:
+  case X86::VSQRTSDZm:
+  case X86::VSQRTSDZm_Int:
+  case X86::VSQRTSDZm_Intk:
+  case X86::VSQRTSDZm_Intkz:
+  case X86::VSQRTSDZr:
+  case X86::VSQRTSDZr_Int:
+  case X86::VSQRTSDZr_Intk:
+  case X86::VSQRTSDZr_Intkz:
+  case X86::VSQRTSDZrb_Int:
+  case X86::VSQRTSDZrb_Intk:
+  case X86::VSQRTSDZrb_Intkz:
+  case X86::VSQRTSSZm:
+  case X86::VSQRTSSZm_Int:
+  case X86::VSQRTSSZm_Intk:
+  case X86::VSQRTSSZm_Intkz:
+  case X86::VSQRTSSZr:
+  case X86::VSQRTSSZr_Int:
+  case X86::VSQRTSSZr_Intk:
+  case X86::VSQRTSSZr_Intkz:
+  case X86::VSQRTSSZrb_Int:
+  case X86::VSQRTSSZrb_Intk:
+  case X86::VSQRTSSZrb_Intkz:
+
+  case X86::VGATHERDPDYrm:
+  case X86::VGATHERDPDZ128rm:
+  case X86::VGATHERDPDZ256rm:
+  case X86::VGATHERDPDZrm:
+  case X86::VGATHERDPDrm:
+  case X86::VGATHERDPSYrm:
+  case X86::VGATHERDPSZ128rm:
+  case X86::VGATHERDPSZ256rm:
+  case X86::VGATHERDPSZrm:
+  case X86::VGATHERDPSrm:
+  case X86::VGATHERPF0DPDm:
+  case X86::VGATHERPF0DPSm:
+  case X86::VGATHERPF0QPDm:
+  case X86::VGATHERPF0QPSm:
+  case X86::VGATHERPF1DPDm:
+  case X86::VGATHERPF1DPSm:
+  case X86::VGATHERPF1QPDm:
+  case X86::VGATHERPF1QPSm:
+  case X86::VGATHERQPDYrm:
+  case X86::VGATHERQPDZ128rm:
+  case X86::VGATHERQPDZ256rm:
+  case X86::VGATHERQPDZrm:
+  case X86::VGATHERQPDrm:
+  case X86::VGATHERQPSYrm:
+  case X86::VGATHERQPSZ128rm:
+  case X86::VGATHERQPSZ256rm:
+  case X86::VGATHERQPSZrm:
+  case X86::VGATHERQPSrm:
+  case X86::VPGATHERDDYrm:
+  case X86::VPGATHERDDZ128rm:
+  case X86::VPGATHERDDZ256rm:
+  case X86::VPGATHERDDZrm:
+  case X86::VPGATHERDDrm:
+  case X86::VPGATHERDQYrm:
+  case X86::VPGATHERDQZ128rm:
+  case X86::VPGATHERDQZ256rm:
+  case X86::VPGATHERDQZrm:
+  case X86::VPGATHERDQrm:
+  case X86::VPGATHERQDYrm:
+  case X86::VPGATHERQDZ128rm:
+  case X86::VPGATHERQDZ256rm:
+  case X86::VPGATHERQDZrm:
+  case X86::VPGATHERQDrm:
+  case X86::VPGATHERQQYrm:
+  case X86::VPGATHERQQZ128rm:
+  case X86::VPGATHERQQZ256rm:
+  case X86::VPGATHERQQZrm:
+  case X86::VPGATHERQQrm:
+  case X86::VSCATTERDPDZ128mr:
+  case X86::VSCATTERDPDZ256mr:
+  case X86::VSCATTERDPDZmr:
+  case X86::VSCATTERDPSZ128mr:
+  case X86::VSCATTERDPSZ256mr:
+  case X86::VSCATTERDPSZmr:
+  case X86::VSCATTERPF0DPDm:
+  case X86::VSCATTERPF0DPSm:
+  case X86::VSCATTERPF0QPDm:
+  case X86::VSCATTERPF0QPSm:
+  case X86::VSCATTERPF1DPDm:
+  case X86::VSCATTERPF1DPSm:
+  case X86::VSCATTERPF1QPDm:
+  case X86::VSCATTERPF1QPSm:
+  case X86::VSCATTERQPDZ128mr:
+  case X86::VSCATTERQPDZ256mr:
+  case X86::VSCATTERQPDZmr:
+  case X86::VSCATTERQPSZ128mr:
+  case X86::VSCATTERQPSZ256mr:
+  case X86::VSCATTERQPSZmr:
+  case X86::VPSCATTERDDZ128mr:
+  case X86::VPSCATTERDDZ256mr:
+  case X86::VPSCATTERDDZmr:
+  case X86::VPSCATTERDQZ128mr:
+  case X86::VPSCATTERDQZ256mr:
+  case X86::VPSCATTERDQZmr:
+  case X86::VPSCATTERQDZ128mr:
+  case X86::VPSCATTERQDZ256mr:
+  case X86::VPSCATTERQDZmr:
+  case X86::VPSCATTERQQZ128mr:
+  case X86::VPSCATTERQQZ256mr:
+  case X86::VPSCATTERQQZmr:
+    return true;
+  }
+}
+
+bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                                         const MachineRegisterInfo *MRI,
+                                         const MachineInstr &DefMI,
+                                         unsigned DefIdx,
+                                         const MachineInstr &UseMI,
+                                         unsigned UseIdx) const {
+  return isHighLatencyDef(DefMI.getOpcode());
+}
+
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+                                           const MachineBasicBlock *MBB) const {
+  assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+         "Reassociation needs binary operators");
+
+  // Integer binary math/logic instructions have a third source operand:
+  // the EFLAGS register. That operand must be both defined here and never
+  // used; ie, it must be dead. If the EFLAGS operand is live, then we can
+  // not change anything because rearranging the operands could affect other
+  // instructions that depend on the exact status flags (zero, sign, etc.)
+  // that are set by using these particular operands with this operation.
+  if (Inst.getNumOperands() == 4) {
+    assert(Inst.getOperand(3).isReg() &&
+           Inst.getOperand(3).getReg() == X86::EFLAGS &&
+           "Unexpected operand in reassociable instruction");
+    if (!Inst.getOperand(3).isDead())
+      return false;
+  }
+
+  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+//       1. Other data types (integer, vectors)
+//       2. Other math / logic operations (xor, or)
+//       3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+  case X86::AND8rr:
+  case X86::AND16rr:
+  case X86::AND32rr:
+  case X86::AND64rr:
+  case X86::OR8rr:
+  case X86::OR16rr:
+  case X86::OR32rr:
+  case X86::OR64rr:
+  case X86::XOR8rr:
+  case X86::XOR16rr:
+  case X86::XOR32rr:
+  case X86::XOR64rr:
+  case X86::IMUL16rr:
+  case X86::IMUL32rr:
+  case X86::IMUL64rr:
+  case X86::PANDrr:
+  case X86::PORrr:
+  case X86::PXORrr:
+  case X86::ANDPDrr:
+  case X86::ANDPSrr:
+  case X86::ORPDrr:
+  case X86::ORPSrr:
+  case X86::XORPDrr:
+  case X86::XORPSrr:
+  case X86::PADDBrr:
+  case X86::PADDWrr:
+  case X86::PADDDrr:
+  case X86::PADDQrr:
+  case X86::VPANDrr:
+  case X86::VPANDYrr:
+  case X86::VPANDDZ128rr:
+  case X86::VPANDDZ256rr:
+  case X86::VPANDDZrr:
+  case X86::VPANDQZ128rr:
+  case X86::VPANDQZ256rr:
+  case X86::VPANDQZrr:
+  case X86::VPORrr:
+  case X86::VPORYrr:
+  case X86::VPORDZ128rr:
+  case X86::VPORDZ256rr:
+  case X86::VPORDZrr:
+  case X86::VPORQZ128rr:
+  case X86::VPORQZ256rr:
+  case X86::VPORQZrr:
+  case X86::VPXORrr:
+  case X86::VPXORYrr:
+  case X86::VPXORDZ128rr:
+  case X86::VPXORDZ256rr:
+  case X86::VPXORDZrr:
+  case X86::VPXORQZ128rr:
+  case X86::VPXORQZ256rr:
+  case X86::VPXORQZrr:
+  case X86::VANDPDrr:
+  case X86::VANDPSrr:
+  case X86::VANDPDYrr:
+  case X86::VANDPSYrr:
+  case X86::VANDPDZ128rr:
+  case X86::VANDPSZ128rr:
+  case X86::VANDPDZ256rr:
+  case X86::VANDPSZ256rr:
+  case X86::VANDPDZrr:
+  case X86::VANDPSZrr:
+  case X86::VORPDrr:
+  case X86::VORPSrr:
+  case X86::VORPDYrr:
+  case X86::VORPSYrr:
+  case X86::VORPDZ128rr:
+  case X86::VORPSZ128rr:
+  case X86::VORPDZ256rr:
+  case X86::VORPSZ256rr:
+  case X86::VORPDZrr:
+  case X86::VORPSZrr:
+  case X86::VXORPDrr:
+  case X86::VXORPSrr:
+  case X86::VXORPDYrr:
+  case X86::VXORPSYrr:
+  case X86::VXORPDZ128rr:
+  case X86::VXORPSZ128rr:
+  case X86::VXORPDZ256rr:
+  case X86::VXORPSZ256rr:
+  case X86::VXORPDZrr:
+  case X86::VXORPSZrr:
+  case X86::KADDBrr:
+  case X86::KADDWrr:
+  case X86::KADDDrr:
+  case X86::KADDQrr:
+  case X86::KANDBrr:
+  case X86::KANDWrr:
+  case X86::KANDDrr:
+  case X86::KANDQrr:
+  case X86::KORBrr:
+  case X86::KORWrr:
+  case X86::KORDrr:
+  case X86::KORQrr:
+  case X86::KXORBrr:
+  case X86::KXORWrr:
+  case X86::KXORDrr:
+  case X86::KXORQrr:
+  case X86::VPADDBrr:
+  case X86::VPADDWrr:
+  case X86::VPADDDrr:
+  case X86::VPADDQrr:
+  case X86::VPADDBYrr:
+  case X86::VPADDWYrr:
+  case X86::VPADDDYrr:
+  case X86::VPADDQYrr:
+  case X86::VPADDBZ128rr:
+  case X86::VPADDWZ128rr:
+  case X86::VPADDDZ128rr:
+  case X86::VPADDQZ128rr:
+  case X86::VPADDBZ256rr:
+  case X86::VPADDWZ256rr:
+  case X86::VPADDDZ256rr:
+  case X86::VPADDQZ256rr:
+  case X86::VPADDBZrr:
+  case X86::VPADDWZrr:
+  case X86::VPADDDZrr:
+  case X86::VPADDQZrr:
+  case X86::VPMULLWrr:
+  case X86::VPMULLWYrr:
+  case X86::VPMULLWZ128rr:
+  case X86::VPMULLWZ256rr:
+  case X86::VPMULLWZrr:
+  case X86::VPMULLDrr:
+  case X86::VPMULLDYrr:
+  case X86::VPMULLDZ128rr:
+  case X86::VPMULLDZ256rr:
+  case X86::VPMULLDZrr:
+  case X86::VPMULLQZ128rr:
+  case X86::VPMULLQZ256rr:
+  case X86::VPMULLQZrr:
+  // Normal min/max instructions are not commutative because of NaN and signed
+  // zero semantics, but these are. Thus, there's no need to check for global
+  // relaxed math; the instructions themselves have the properties we need.
+  case X86::MAXCPDrr:
+  case X86::MAXCPSrr:
+  case X86::MAXCSDrr:
+  case X86::MAXCSSrr:
+  case X86::MINCPDrr:
+  case X86::MINCPSrr:
+  case X86::MINCSDrr:
+  case X86::MINCSSrr:
+  case X86::VMAXCPDrr:
+  case X86::VMAXCPSrr:
+  case X86::VMAXCPDYrr:
+  case X86::VMAXCPSYrr:
+  case X86::VMAXCPDZ128rr:
+  case X86::VMAXCPSZ128rr:
+  case X86::VMAXCPDZ256rr:
+  case X86::VMAXCPSZ256rr:
+  case X86::VMAXCPDZrr:
+  case X86::VMAXCPSZrr:
+  case X86::VMAXCSDrr:
+  case X86::VMAXCSSrr:
+  case X86::VMAXCSDZrr:
+  case X86::VMAXCSSZrr:
+  case X86::VMINCPDrr:
+  case X86::VMINCPSrr:
+  case X86::VMINCPDYrr:
+  case X86::VMINCPSYrr:
+  case X86::VMINCPDZ128rr:
+  case X86::VMINCPSZ128rr:
+  case X86::VMINCPDZ256rr:
+  case X86::VMINCPSZ256rr:
+  case X86::VMINCPDZrr:
+  case X86::VMINCPSZrr:
+  case X86::VMINCSDrr:
+  case X86::VMINCSSrr:
+  case X86::VMINCSDZrr:
+  case X86::VMINCSSZrr:
+    return true;
+  case X86::ADDPDrr:
+  case X86::ADDPSrr:
+  case X86::ADDSDrr:
+  case X86::ADDSSrr:
+  case X86::MULPDrr:
+  case X86::MULPSrr:
+  case X86::MULSDrr:
+  case X86::MULSSrr:
+  case X86::VADDPDrr:
+  case X86::VADDPSrr:
+  case X86::VADDPDYrr:
+  case X86::VADDPSYrr:
+  case X86::VADDPDZ128rr:
+  case X86::VADDPSZ128rr:
+  case X86::VADDPDZ256rr:
+  case X86::VADDPSZ256rr:
+  case X86::VADDPDZrr:
+  case X86::VADDPSZrr:
+  case X86::VADDSDrr:
+  case X86::VADDSSrr:
+  case X86::VADDSDZrr:
+  case X86::VADDSSZrr:
+  case X86::VMULPDrr:
+  case X86::VMULPSrr:
+  case X86::VMULPDYrr:
+  case X86::VMULPSYrr:
+  case X86::VMULPDZ128rr:
+  case X86::VMULPSZ128rr:
+  case X86::VMULPDZ256rr:
+  case X86::VMULPSZ256rr:
+  case X86::VMULPDZrr:
+  case X86::VMULPSZrr:
+  case X86::VMULSDrr:
+  case X86::VMULSSrr:
+  case X86::VMULSDZrr:
+  case X86::VMULSSZrr:
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+  default:
+    return false;
+  }
+}
+
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+                                         MachineInstr &OldMI2,
+                                         MachineInstr &NewMI1,
+                                         MachineInstr &NewMI2) const {
+  // Integer instructions define an implicit EFLAGS source register operand as
+  // the third source (fourth total) operand.
+  if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+    return;
+
+  assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+         "Unexpected instruction type for reassociation");
+
+  MachineOperand &OldOp1 = OldMI1.getOperand(3);
+  MachineOperand &OldOp2 = OldMI2.getOperand(3);
+  MachineOperand &NewOp1 = NewMI1.getOperand(3);
+  MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+  assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+  assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+         "Must have dead EFLAGS operand in reassociable instruction");
+
+  (void)OldOp1;
+  (void)OldOp2;
+
+  assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+  assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+         "Unexpected operand in reassociable instruction");
+
+  // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+  // of this pass or other passes. The EFLAGS operands must be dead in these new
+  // instructions because the EFLAGS operands in the original instructions must
+  // be dead in order for reassociation to occur.
+  NewOp1.setIsDead();
+  NewOp2.setIsDead();
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+  return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+  using namespace X86II;
+  static const std::pair<unsigned, const char *> TargetFlags[] = {
+      {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+      {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+      {MO_GOT, "x86-got"},
+      {MO_GOTOFF, "x86-gotoff"},
+      {MO_GOTPCREL, "x86-gotpcrel"},
+      {MO_PLT, "x86-plt"},
+      {MO_TLSGD, "x86-tlsgd"},
+      {MO_TLSLD, "x86-tlsld"},
+      {MO_TLSLDM, "x86-tlsldm"},
+      {MO_GOTTPOFF, "x86-gottpoff"},
+      {MO_INDNTPOFF, "x86-indntpoff"},
+      {MO_TPOFF, "x86-tpoff"},
+      {MO_DTPOFF, "x86-dtpoff"},
+      {MO_NTPOFF, "x86-ntpoff"},
+      {MO_GOTNTPOFF, "x86-gotntpoff"},
+      {MO_DLLIMPORT, "x86-dllimport"},
+      {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+      {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+      {MO_TLVP, "x86-tlvp"},
+      {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+      {MO_SECREL, "x86-secrel"}};
+  return makeArrayRef(TargetFlags);
+}
+
+bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
+  switch (Inst.getOpcode()) {
+    case X86::TCRETURNdi:
+    case X86::TCRETURNmi:
+    case X86::TCRETURNri:
+    case X86::TCRETURNdi64:
+    case X86::TCRETURNmi64:
+    case X86::TCRETURNri64:
+    case X86::TAILJMPd:
+    case X86::TAILJMPm:
+    case X86::TAILJMPr:
+    case X86::TAILJMPd64:
+    case X86::TAILJMPm64:
+    case X86::TAILJMPr64:
+    case X86::TAILJMPm64_REX:
+    case X86::TAILJMPr64_REX:
+      return true;
+    default:
+      return false;
+  }
+}
+
+namespace {
+  /// Create Global Base Reg pass. This initializes the PIC
+  /// global base register for x86-32.
+  struct CGBR : public MachineFunctionPass {
+    static char ID;
+    CGBR() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      const X86TargetMachine *TM =
+        static_cast<const X86TargetMachine *>(&MF.getTarget());
+      const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+
+      // Don't do anything if this is 64-bit as 64-bit PIC
+      // uses RIP relative addressing.
+      if (STI.is64Bit())
+        return false;
+
+      // Only emit a global base reg in PIC mode.
+      if (!TM->isPositionIndependent())
+        return false;
+
+      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+      // If we didn't need a GlobalBaseReg, don't insert code.
+      if (GlobalBaseReg == 0)
+        return false;
+
+      // Insert the set of GlobalBaseReg into the first MBB of the function
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      const X86InstrInfo *TII = STI.getInstrInfo();
+
+      unsigned PC;
+      if (STI.isPICStyleGOT())
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      else
+        PC = GlobalBaseReg;
+
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+      // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (STI.isPICStyleGOT()) {
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
+      }
+
+      return true;
+    }
+
+    StringRef getPassName() const override {
+      return "X86 PIC Global Base Reg Initialization";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      if (skipFunction(*MF.getFunction()))
+        return false;
+
+      X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there isn't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
+            else
+              I = SetRegister(*I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I.getParent()->getParent();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy =
+          BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
+              .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I.eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtal register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I.getParent()->getParent();
+      const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
+      const bool is64Bit = STI.is64Bit();
+      const X86InstrInfo *TII = STI.getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I.getNextNode();
+      MachineInstr *Copy =
+          BuildMI(*I.getParent(), Next, I.getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
+              .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    StringRef getPassName() const override {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 000000000000..8d746172dcbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,608 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+  class MachineInstrBuilder;
+  class X86RegisterInfo;
+  class X86Subtarget;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in synch.
+enum CondCode {
+  COND_A = 0,
+  COND_AE = 1,
+  COND_B = 2,
+  COND_BE = 3,
+  COND_E = 4,
+  COND_G = 5,
+  COND_GE = 6,
+  COND_L = 7,
+  COND_LE = 8,
+  COND_NE = 9,
+  COND_NO = 10,
+  COND_NP = 11,
+  COND_NS = 12,
+  COND_O = 13,
+  COND_P = 14,
+  COND_S = 15,
+  LAST_VALID_COND = COND_S,
+
+  // Artificial condition codes. These are used by AnalyzeBranch
+  // to indicate a block terminated with two conditional branches that together
+  // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
+  // which can't be represented on x86 with a single condition. These
+  // are never used in MachineInstrs and are inverses of one another.
+  COND_NE_OR_P,
+  COND_E_AND_NP,
+
+  COND_INVALID
+};
+
+// Turn condition code into conditional branch opcode.
+unsigned GetCondBranchFromCond(CondCode CC);
+
+/// \brief Return a set opcode for the given condition and whether it has
+/// a memory operand.
+unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
+
+/// \brief Return a cmov opcode for the given condition, register size in
+/// bytes, and operand type.
+unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
+                         bool HasMemoryOperand = false);
+
+// Turn CMov opcode into condition code.
+CondCode getCondFromCMovOpc(unsigned Opc);
+
+/// GetOppositeBranchCondition - Return the inverse of the specified cond,
+/// e.g. turning COND_E to COND_NE.
+CondCode GetOppositeBranchCondition(CondCode CC);
+}  // end namespace X86;
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_DLLIMPORT: // dllimport stub.
+  case X86II::MO_GOTPCREL:  // rip-relative GOT reference.
+  case X86II::MO_GOT:       // normal GOT reference.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_NONLAZY:                 // Normal $non_lazy_ptr ref.
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg).  If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_GOTOFF:                         // isPICStyleGOT: local global.
+  case X86II::MO_GOT:                            // isPICStyleGOT: other global.
+  case X86II::MO_PIC_BASE_OFFSET:                // Darwin local global.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Darwin/32 external global.
+  case X86II::MO_TLVP:                           // ??? Pretty sure..
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImm() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
+  if (MI.getOperand(Op).isFI())
+    return true;
+  return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
+         MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
+         isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
+         MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
+         (MI.getOperand(Op + X86::AddrDisp).isImm() ||
+          MI.getOperand(Op + X86::AddrDisp).isGlobal() ||
+          MI.getOperand(Op + X86::AddrDisp).isCPI() ||
+          MI.getOperand(Op + X86::AddrDisp).isJTI());
+}
+
+inline static bool isMem(const MachineInstr &MI, unsigned Op) {
+  if (MI.getOperand(Op).isFI())
+    return true;
+  return Op + X86::AddrNumOperands <= MI.getNumOperands() &&
+         MI.getOperand(Op + X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op);
+}
+
+class X86InstrInfo final : public X86GenInstrInfo {
+  X86Subtarget &Subtarget;
+  const X86RegisterInfo RI;
+
+  /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
+  RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+  RegOp2MemOpTableType RegOp2MemOpTable0;
+  RegOp2MemOpTableType RegOp2MemOpTable1;
+  RegOp2MemOpTableType RegOp2MemOpTable2;
+  RegOp2MemOpTableType RegOp2MemOpTable3;
+  RegOp2MemOpTableType RegOp2MemOpTable4;
+
+  /// MemOp2RegOpTable - Load / store unfolding opcode map.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
+  MemOp2RegOpTableType MemOp2RegOpTable;
+
+  static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            uint16_t RegOp, uint16_t MemOp, uint16_t Flags);
+
+  virtual void anchor();
+
+  bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         SmallVectorImpl<MachineInstr *> &CondBranches,
+                         bool AllowModify) const;
+
+public:
+  explicit X86InstrInfo(X86Subtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// getSPAdjust - This returns the stack pointer adjustment made by
+  /// this instruction. For x86, we need to handle more complex call
+  /// sequences involving PUSHes.
+  int getSPAdjust(const MachineInstr &MI) const override;
+
+  /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+  /// extension instruction. That is, it's like a copy where it's legal for the
+  /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+  /// true, then it's expected the pre-extension value is available as a subreg
+  /// of the result register. This also returns the sub-register index in
+  /// SubIdx.
+  bool isCoalescableExtInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SubIdx) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+  /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+                                     int &FrameIndex) const override;
+
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+  /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
+                                    int &FrameIndex) const override;
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+                                         AliasAnalysis *AA) const override;
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr &Orig,
+                     const TargetRegisterInfo &TRI) const override;
+
+  /// Given an operand within a MachineInstr, insert preceding code to put it
+  /// into the right format for a particular kind of LEA instruction. This may
+  /// involve using an appropriate super-register instead (with an implicit use
+  /// of the original) or creating a new virtual register and inserting COPY
+  /// instructions to get the data into the right class.
+  ///
+  /// Reference parameters are set to indicate how caller should add this
+  /// operand to the LEA instruction.
+  bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
+                      unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
+                      bool &isKill, bool &isUndef,
+                      MachineOperand &ImplicitOp, LiveVariables *LV) const;
+
+  /// convertToThreeAddress - This method must be implemented by targets that
+  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+  /// may be able to convert a two-address instruction into a true
+  /// three-address instruction on demand.  This allows the X86 target (for
+  /// example) to convert ADD and SHL instructions into LEA instructions if they
+  /// would require register copies due to two-addressness.
+  ///
+  /// This method returns a null pointer if the transformation cannot be
+  /// performed, otherwise it returns the new instruction.
+  ///
+  MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineInstr &MI,
+                                      LiveVariables *LV) const override;
+
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value
+  /// 'CommuteAnyOperandIndex' assigned to it.
+  /// If both of indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+  ///     findCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with the operand#1.
+  bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
+  /// Returns true if the routine could find two commutable operands
+  /// in the given FMA instruction \p MI. Otherwise, returns false.
+  ///
+  /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
+  /// The output indices of the commuted operands are returned in these
+  /// arguments. Also, the input values of these arguments may be preset either
+  /// to indices of operands that must be commuted or be equal to a special
+  /// value 'CommuteAnyOperandIndex' which means that the corresponding
+  /// operand index is not set and this method is free to pick any of
+  /// available commutable operands.
+  /// The parameter \p FMA3Group keeps the reference to the group of relative
+  /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
+  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
+  /// can be interpreted as a query asking if the operand #1 can be swapped
+  /// with any other available operand (e.g. operand #2, operand #3, etc.).
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results into instruction with adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  bool findFMA3CommutedOpIndices(const MachineInstr &MI,
+                                 unsigned &SrcOpIdx1,
+                                 unsigned &SrcOpIdx2,
+                                 const X86InstrFMA3Group &FMA3Group) const;
+
+  /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+  /// performs the same computations as the given \p MI but which has the
+  /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  /// Note that a machine instruction (instead of its opcode) is passed as the
+  /// first parameter to make it possible to analyze the instruction's uses and
+  /// commute the first operand of FMA even when it seems unsafe when you look
+  /// at the opcode. For example, it is Ok to commute the first operand of
+  /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given \p MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results into instruction with adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2,
+                                          const X86InstrFMA3Group &FMA3Group) const;
+
+  // Branch analysis.
+  bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+  bool isUnconditionalTailCall(const MachineInstr &MI) const override;
+  bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
+                                  const MachineInstr &TailCall) const override;
+  void replaceBranchWithTailCall(MachineBasicBlock &MBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 const MachineInstr &TailCall) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+  bool analyzeBranchPredicate(MachineBasicBlock &MBB,
+                              TargetInstrInfo::MachineBranchPredicate &MBP,
+                              bool AllowModify = false) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+  bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
+                       unsigned, unsigned, int&, int&, int&) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const DebugLoc &DL, unsigned DstReg,
+                    ArrayRef<MachineOperand> Cond, unsigned TrueReg,
+                    unsigned FalseReg) const override;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                      SmallVectorImpl<MachineOperand> &Addr,
+                      const TargetRegisterClass *RC,
+                      MachineInstr::mmo_iterator MMOBegin,
+                      MachineInstr::mmo_iterator MMOEnd,
+                      SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                       SmallVectorImpl<MachineOperand> &Addr,
+                       const TargetRegisterClass *RC,
+                       MachineInstr::mmo_iterator MMOBegin,
+                       MachineInstr::mmo_iterator MMOEnd,
+                       SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  /// Check whether the target can fold a load that feeds a subreg operand
+  /// (or a subreg operand that feeds a store).
+  bool isSubregFoldable() const override { return true; }
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand(s).  If this is possible, the target should perform the
+  /// folding and return true, otherwise it should return false.  If it folds
+  /// the instruction, it is likely that the MachineInstruction the iterator
+  /// references has been changed.
+  MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                        ArrayRef<unsigned> Ops,
+                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
+                        LiveIntervals *LIS = nullptr) const override;
+
+  /// foldMemoryOperand - Same as the previous version except it allows folding
+  /// of any load and store from / to any address, not just from a specific
+  /// stack slot.
+  MachineInstr *foldMemoryOperandImpl(
+      MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+      MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
+      LiveIntervals *LIS = nullptr) const override;
+
+  /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+  /// a store or a load and a store into two or more instruction. If this is
+  /// possible, returns true as well as the new instructions by reference.
+  bool
+  unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
+                      bool UnfoldLoad, bool UnfoldStore,
+                      SmallVectorImpl<MachineInstr *> &NewMIs) const override;
+
+  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode*> &NewNodes) const override;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+  /// instruction after load / store are unfolded from an instruction of the
+  /// specified opcode. It returns zero if the specified unfolding is not
+  /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+  /// index of the operand which will hold the register holding the loaded
+  /// value.
+  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                              bool UnfoldLoad, bool UnfoldStore,
+                              unsigned *LoadRegIndex = nullptr) const override;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+  /// to determine if two loads are loading from the same base address. It
+  /// should only return true if the base pointers are the same and the
+  /// only differences between the two addresses are the offset. It also returns
+  /// the offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+  /// be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+  bool shouldScheduleAdjacent(const MachineInstr &First,
+                              const MachineInstr &Second) const override;
+
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  bool
+  reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+  /// instruction that defines the specified register class.
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
+  /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha
+  /// would clobber the EFLAGS condition register. Note the result may be
+  /// conservative. If it cannot definitely determine the safety after visiting
+  /// a few instructions in each direction it assumes it's not safe.
+  bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) const;
+
+  /// True if MI has a condition code def, e.g. EFLAGS, that is
+  /// not marked dead.
+  bool hasLiveCondCodeDef(MachineInstr &MI) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr &MI) const override;
+
+  void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+
+  unsigned
+  getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
+                               const TargetRegisterInfo *TRI) const override;
+  unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+                                const TargetRegisterInfo *TRI) const override;
+  void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      unsigned OpNum,
+                                      ArrayRef<MachineOperand> MOs,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      unsigned Size, unsigned Alignment,
+                                      bool AllowCommute) const;
+
+  bool isHighLatencyDef(int opc) const override;
+
+  bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr &DefMI, unsigned DefIdx,
+                             const MachineInstr &UseMI,
+                             unsigned UseIdx) const override;
+
+  bool useMachineCombiner() const override {
+    return true;
+  }
+
+  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+
+  bool hasReassociableOperands(const MachineInstr &Inst,
+                               const MachineBasicBlock *MBB) const override;
+
+  void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+                             MachineInstr &NewMI1,
+                             MachineInstr &NewMI2) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+
+  /// optimizeCompareInstr - Check if there exists an earlier instruction that
+  /// operates on the same source operands and sets flags in the same way as
+  /// Compare; remove Compare if possible.
+  bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+  /// optimizeLoadInstr - Try to remove the load by folding it to a register
+  /// operand at the use. We fold the load instructions if and only if the
+  /// def and use are in the same BB. We only look at one load and see
+  /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+  /// defined by the load we are trying to fold. DefMI returns the machine
+  /// instruction that defines FoldAsLoadDefReg, and the function returns
+  /// the machine instruction generated due to folding.
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  const MachineRegisterInfo *MRI,
+                                  unsigned &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
+
+  std::pair<unsigned, unsigned>
+  decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+  ArrayRef<std::pair<unsigned, const char *>>
+  getSerializableDirectMachineOperandTargetFlags() const override;
+
+  bool isTailCall(const MachineInstr &Inst) const override;
+
+protected:
+  /// Commutes the operands in the given instruction by changing the operands
+  /// order and/or changing the instruction's opcode and/or the immediate value
+  /// operand.
+  ///
+  /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+  /// to be commuted.
+  ///
+  /// Do not call this method for a non-commutable instruction or
+  /// non-commutable operands.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+                                       unsigned CommuteOpIdx1,
+                                       unsigned CommuteOpIdx2) const override;
+
+private:
+  MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
+                                             MachineFunction::iterator &MFI,
+                                             MachineInstr &MI,
+                                             LiveVariables *LV) const;
+
+  /// Handles memory folding for special case instructions, for instance those
+  /// requiring custom manipulation of the address.
+  MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
+                                        unsigned OpNum,
+                                        ArrayRef<MachineOperand> MOs,
+                                        MachineBasicBlock::iterator InsertPt,
+                                        unsigned Size, unsigned Align) const;
+
+  /// isFrameOperand - Return true and the FrameIndex if the specified
+  /// operand and follow operands form a reference to the stack frame.
+  bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
+                      int &FrameIndex) const;
+
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction with 3 vector inputs.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value
+  /// 'CommuteAnyOperandIndex' assigned to it.
+  /// If both of indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
+  ///     findThreeSrcCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with the operand#1.
+  bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
+                                     unsigned &SrcOpIdx1,
+                                     unsigned &SrcOpIdx2) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 000000000000..38036715a25a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,3119 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 4,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+                                           [SDTCisSameAs<0, 2>,
+                                            SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond  : SDTypeProfile<0, 3,
+                                  [SDTCisVT<0, OtherVT>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i8>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+                                  [SDTCisInt<0>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+                                     SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3,
+                                          [SDTCisVT<0, i32>, SDTCisPtrTy<1>,
+                                          SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
+                                           [SDTCisVT<0, i64>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
+
+def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                       SDTCisPtrTy<1>,
+                                                       SDTCisInt<2>]>;
+
+def SDTX86Ret     : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32>]>;
+
+def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+                                                         SDTCisVT<1, iPTR>,
+                                                         SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+                                            SDTCisPtrTy<1>,
+                                            SDTCisVT<2, i32>,
+                                            SDTCisVT<3, i8>,
+                                            SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void    : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+                            [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+
+
+def X86bsf     : SDNode<"X86ISD::BSF",      SDTUnaryArithWithFlags>;
+def X86bsr     : SDNode<"X86ISD::BSR",      SDTUnaryArithWithFlags>;
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest>;
+def X86bt      : SDNode<"X86ISD::BT",       SDTX86CmpTest>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf    : SDNode<"X86ISD::SAHF",     SDTX86sahf>;
+
+def X86rdrand  : SDNode<"X86ISD::RDRAND",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed  : SDNode<"X86ISD::RDSEED",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG",
+                                SDTX86caspairSaveEbx8,
+                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
+                                SDTX86caspairSaveRbx16,
+                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInGlue]>;
+
+def X86vastart_save_xmm_regs :
+                 SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+                        SDT_X86VASTART_SAVE_XMM_REGS,
+                        [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+                 SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                         SDNPMemOperand]>;
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                         SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp  : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdpmc   : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER",
+                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                       SDTCisInt<1>]>>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp  : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH",
+                                       SDTypeProfile<0, 0, []>,
+                                       [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag  : SDNode<"X86ISD::ADD",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86sub_flag  : SDNode<"X86ISD::SUB",  SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86adc_flag  : SDNode<"X86ISD::ADC",  SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag  : SDNode<"X86ISD::SBB",  SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag  : SDNode<"X86ISD::INC",  SDTUnaryArithWithFlags>;
+def X86dec_flag  : SDNode<"X86ISD::DEC",  SDTUnaryArithWithFlags>;
+def X86or_flag   : SDNode<"X86ISD::OR",   SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+
+def X86lock_add  : SDNode<"X86ISD::LADD",  SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_sub  : SDNode<"X86ISD::LSUB",  SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_or  : SDNode<"X86ISD::LOR",  SDTLockBinaryArithWithFlags,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                          SDNPMemOperand]>;
+def X86lock_xor  : SDNode<"X86ISD::LXOR",  SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+def X86lock_and  : SDNode<"X86ISD::LAND",  SDTLockBinaryArithWithFlags,
+                          [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                           SDNPMemOperand]>;
+
+def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
+                          [SDNPHasChain, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+                          [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+}
+let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
+  def X86Mem8AsmOperand   : AsmOperandClass { let Name = "Mem8"; }
+  def X86Mem16AsmOperand  : AsmOperandClass { let Name = "Mem16"; }
+  def X86Mem32AsmOperand  : AsmOperandClass { let Name = "Mem32"; }
+  def X86Mem64AsmOperand  : AsmOperandClass { let Name = "Mem64"; }
+  def X86Mem80AsmOperand  : AsmOperandClass { let Name = "Mem80"; }
+  def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; }
+  def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; }
+  def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; }
+  // Gather mem operands
+  def X86Mem64_RC128Operand  : AsmOperandClass { let Name = "Mem64_RC128"; }
+  def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; }
+  def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; }
+  def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; }
+  def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; }
+
+  def X86Mem64_RC128XOperand  : AsmOperandClass { let Name = "Mem64_RC128X"; }
+  def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; }
+  def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; }
+  def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
+  def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
+  def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+  def X86Mem512_RC512Operand  : AsmOperandClass { let Name = "Mem512_RC512"; }
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+  let Name = "AbsMem";
+  let SuperClasses = [X86MemAsmOperand];
+}
+
+class X86MemOperand<string printMethod,
+          AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG);
+  let ParserMatchClass = parserMatchClass;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Gather mem operands
+class X86VMemOperand<RegisterClass RC, string printMethod,
+                     AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
+}
+
+def anymem : X86MemOperand<"printanymem">;
+
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem   : X86MemOperand<"printi8mem",   X86Mem8AsmOperand>;
+def i16mem  : X86MemOperand<"printi16mem",  X86Mem16AsmOperand>;
+def i32mem  : X86MemOperand<"printi32mem",  X86Mem32AsmOperand>;
+def i64mem  : X86MemOperand<"printi64mem",  X86Mem64AsmOperand>;
+def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>;
+def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>;
+def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>;
+def f32mem  : X86MemOperand<"printf32mem",  X86Mem32AsmOperand>;
+def f64mem  : X86MemOperand<"printf64mem",  X86Mem64AsmOperand>;
+def f80mem  : X86MemOperand<"printf80mem",  X86Mem80AsmOperand>;
+def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>;
+def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>;
+def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>;
+
+def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>;
+
+// Gather mem operands
+def vx64mem  : X86VMemOperand<VR128,  "printi64mem",  X86Mem64_RC128Operand>;
+def vx128mem : X86VMemOperand<VR128,  "printi128mem", X86Mem128_RC128Operand>;
+def vx256mem : X86VMemOperand<VR128,  "printi256mem", X86Mem256_RC128Operand>;
+def vy128mem : X86VMemOperand<VR256,  "printi128mem", X86Mem128_RC256Operand>;
+def vy256mem : X86VMemOperand<VR256,  "printi256mem", X86Mem256_RC256Operand>;
+
+def vx64xmem  : X86VMemOperand<VR128X, "printi64mem",  X86Mem64_RC128XOperand>;
+def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
+def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
+def vy128xmem : X86VMemOperand<VR256,  "printi128mem", X86Mem128_RC256XOperand>;
+def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
+def vy512mem  : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz512mem  : X86VMemOperand<VR512,  "printi512mem", X86Mem512_RC512Operand>;
+
+// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
+// of a plain GPR, so that it doesn't potentially require a REX prefix.
+def ptr_rc_norex : PointerLikeRegClass<2>;
+def ptr_rc_norex_nosp : PointerLikeRegClass<3>;
+
+def i8mem_NOREX : Operand<iPTR> {
+  let PrintMethod = "printi8mem";
+  let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm,
+                       SEGMENT_REG);
+  let ParserMatchClass = X86Mem8AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<4>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i32mem_TC : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+                       i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86Mem32AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i64mem_TC : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+                       ptr_rc_tailcall, i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86Mem64AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+    ParserMatchClass = X86AbsMemAsmOperand,
+    PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+// Special parser to detect 16-bit mode to select 16-bit displacement.
+def X86AbsMem16AsmOperand : AsmOperandClass {
+  let Name = "AbsMem16";
+  let RenderMethod = "addAbsMemOperands";
+  let SuperClasses = [X86AbsMemAsmOperand];
+}
+
+// Branch targets have OtherVT type and print as pc-relative values.
+let OperandType = "OPERAND_PCREL",
+    PrintMethod = "printPCRelImm" in {
+let ParserMatchClass = X86AbsMem16AsmOperand in
+  def brtarget16 : Operand<OtherVT>;
+let ParserMatchClass = X86AbsMemAsmOperand in
+  def brtarget32 : Operand<OtherVT>;
+}
+
+let RenderMethod = "addSrcIdxOperands" in {
+  def X86SrcIdx8Operand : AsmOperandClass {
+    let Name = "SrcIdx8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86SrcIdx16Operand : AsmOperandClass {
+    let Name = "SrcIdx16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86SrcIdx32Operand : AsmOperandClass {
+    let Name = "SrcIdx32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86SrcIdx64Operand : AsmOperandClass {
+    let Name = "SrcIdx64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+} // RenderMethod = "addSrcIdxOperands"
+
+let RenderMethod = "addDstIdxOperands" in {
+ def X86DstIdx8Operand : AsmOperandClass {
+   let Name = "DstIdx8";
+   let SuperClasses = [X86Mem8AsmOperand];
+ }
+ def X86DstIdx16Operand : AsmOperandClass {
+   let Name = "DstIdx16";
+   let SuperClasses = [X86Mem16AsmOperand];
+ }
+ def X86DstIdx32Operand : AsmOperandClass {
+   let Name = "DstIdx32";
+   let SuperClasses = [X86Mem32AsmOperand];
+ }
+ def X86DstIdx64Operand : AsmOperandClass {
+   let Name = "DstIdx64";
+   let SuperClasses = [X86Mem64AsmOperand];
+ }
+} // RenderMethod = "addDstIdxOperands"
+
+let RenderMethod = "addMemOffsOperands" in {
+  def X86MemOffs16_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs16_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs16_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs16_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs32_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs32_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs32_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs32_64AsmOperand : AsmOperandClass {
+    let Name = "MemOffs32_64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+  def X86MemOffs64_8AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_8";
+    let SuperClasses = [X86Mem8AsmOperand];
+  }
+  def X86MemOffs64_16AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_16";
+    let SuperClasses = [X86Mem16AsmOperand];
+  }
+  def X86MemOffs64_32AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_32";
+    let SuperClasses = [X86Mem32AsmOperand];
+  }
+  def X86MemOffs64_64AsmOperand : AsmOperandClass {
+    let Name = "MemOffs64_64";
+    let SuperClasses = [X86Mem64AsmOperand];
+  }
+} // RenderMethod = "addMemOffsOperands"
+
+class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops ptr_rc, SEGMENT_REG);
+}
+
+class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops ptr_rc);
+}
+
+def srcidx8  : X86SrcIdxOperand<"printSrcIdx8",  X86SrcIdx8Operand>;
+def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>;
+def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>;
+def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>;
+def dstidx8  : X86DstIdxOperand<"printDstIdx8",  X86DstIdx8Operand>;
+def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>;
+def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>;
+def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>;
+
+class X86MemOffsOperand<Operand immOperand, string printMethod,
+                        AsmOperandClass parserMatchClass>
+    : X86MemOperand<printMethod, parserMatchClass> {
+  let MIOperandInfo = (ops immOperand, SEGMENT_REG);
+}
+
+def offset16_8  : X86MemOffsOperand<i16imm, "printMemOffs8",
+                                    X86MemOffs16_8AsmOperand>;
+def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16",
+                                    X86MemOffs16_16AsmOperand>;
+def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32",
+                                    X86MemOffs16_32AsmOperand>;
+def offset32_8  : X86MemOffsOperand<i32imm, "printMemOffs8",
+                                    X86MemOffs32_8AsmOperand>;
+def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16",
+                                    X86MemOffs32_16AsmOperand>;
+def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32",
+                                    X86MemOffs32_32AsmOperand>;
+def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64",
+                                    X86MemOffs32_64AsmOperand>;
+def offset64_8  : X86MemOffsOperand<i64imm, "printMemOffs8",
+                                    X86MemOffs64_8AsmOperand>;
+def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16",
+                                    X86MemOffs64_16AsmOperand>;
+def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32",
+                                    X86MemOffs64_32AsmOperand>;
+def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64",
+                                    X86MemOffs64_64AsmOperand>;
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSEAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt3 : ImmLeaf<i8, [{
+  return Imm >= 0 && Imm < 8;
+}]>;
+
+def AVXCC : Operand<i8> {
+  let PrintMethod = "printSSEAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def i8immZExt5 : ImmLeaf<i8, [{
+  return Imm >= 0 && Imm < 32;
+}]>;
+
+def AVX512ICC : Operand<i8> {
+  let PrintMethod = "printSSEAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def XOPCC : Operand<i8> {
+  let PrintMethod = "printXOPCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+  let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+  let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+def AVX512RCOperand : AsmOperandClass {
+  let Name = "AVX512RC";
+}
+def AVX512RC : Operand<i32> {
+  let PrintMethod = "printRoundingControl";
+  let OperandType = "OPERAND_IMMEDIATE";
+  let ParserMatchClass = AVX512RCOperand;
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFF" (-1 in 16-bits).
+
+// [0, 0x7FFFFFFF]                                            |
+//   [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti16i8";
+  let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti32i8";
+}
+
+// [0, 0x0000007F]                                            |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i8";
+  let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+                      ImmSExti64i32AsmOperand];
+}
+
+// Unsigned immediate used by SSE/AVX instructions
+// [0, 0xFF]
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmUnsignedi8AsmOperand : AsmOperandClass {
+  let Name = "ImmUnsignedi8";
+  let RenderMethod = "addImmOperands";
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm  : Operand<i16> {
+  let ParserMatchClass = ImmSExti16i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32> {
+  let ParserMatchClass = ImmSExti32i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm  : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i32AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm   : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Unsigned 8-bit immediate used by SSE/AVX instructions.
+def u8imm : Operand<i8> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 32-bit immediate but only 8-bits are significant and they are unsigned.
+// Used by some SSE/AVX instructions that use intrinsics.
+def i32u8imm : Operand<i32> {
+  let PrintMethod = "printU8Imm";
+  let ParserMatchClass = ImmUnsignedi8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+  let PrintMethod = "printPCRelImm";
+  let ParserMatchClass = X86AbsMemAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+def lea64_32mem : Operand<i32> {
+  let PrintMethod = "printanymem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+  let PrintMethod = "printanymem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86-specific addressing mode.
+def addr      : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
+                               [add, sub, mul, X86mul_imm, shl, or, frameindex],
+                               []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr",
+                                  [add, sub, mul, X86mul_imm, shl, or,
+                                   frameindex, X86WrapperRIP],
+                                  []>;
+
+def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr",
+                        [add, sub, mul, X86mul_imm, shl, or, frameindex,
+                         X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
+
+// A relocatable immediate is either an immediate operand or an operand that can
+// be relocated by the linker to an immediate, such as a regular symbol in
+// non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
+                              0>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def TruePredicate : Predicate<"true">;
+
+def HasCMov      : Predicate<"Subtarget->hasCMov()">;
+def NoCMov       : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2      : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3      : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41      : Predicate<"!Subtarget->hasSSE41()">;
+def UseSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX       : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512    : Predicate<"Subtarget->hasAVX512()">,
+                     AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
+def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512     : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI       : Predicate<"Subtarget->hasCDI()">,
+                     AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasPFI       : Predicate<"Subtarget->hasPFI()">,
+                     AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
+def HasERI       : Predicate<"Subtarget->hasERI()">,
+                     AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">;
+def HasDQI       : Predicate<"Subtarget->hasDQI()">,
+                     AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">;
+def NoDQI        : Predicate<"!Subtarget->hasDQI()">;
+def HasBWI       : Predicate<"Subtarget->hasBWI()">,
+                     AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">;
+def NoBWI        : Predicate<"!Subtarget->hasBWI()">;
+def HasVLX       : Predicate<"Subtarget->hasVLX()">,
+                     AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">;
+def NoVLX        : Predicate<"!Subtarget->hasVLX()">;
+def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
+def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
+def PKU        : Predicate<"Subtarget->hasPKU()">;
+
+def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES       : Predicate<"Subtarget->hasAES()">;
+def HasFXSR      : Predicate<"Subtarget->hasFXSR()">;
+def HasXSAVE     : Predicate<"Subtarget->hasXSAVE()">;
+def HasXSAVEOPT  : Predicate<"Subtarget->hasXSAVEOPT()">;
+def HasXSAVEC    : Predicate<"Subtarget->hasXSAVEC()">;
+def HasXSAVES    : Predicate<"Subtarget->hasXSAVES()">;
+def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA       : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX  : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
+def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP       : Predicate<"Subtarget->hasXOP()">;
+def HasTBM       : Predicate<"Subtarget->hasTBM()">;
+def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C      : Predicate<"Subtarget->hasF16C()">;
+def NoF16C       : Predicate<"!Subtarget->hasF16C()">;
+def HasFSGSBase  : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI       : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2      : Predicate<"Subtarget->hasBMI2()">;
+def HasVBMI      : Predicate<"Subtarget->hasVBMI()">,
+                     AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
+def HasIFMA      : Predicate<"Subtarget->hasIFMA()">,
+                     AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
+def HasRTM       : Predicate<"Subtarget->hasRTM()">;
+def HasHLE       : Predicate<"Subtarget->hasHLE()">;
+def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
+def HasADX       : Predicate<"Subtarget->hasADX()">;
+def HasSHA       : Predicate<"Subtarget->hasSHA()">;
+def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasLAHFSAHF  : Predicate<"Subtarget->hasLAHFSAHF()">;
+def HasMWAITX    : Predicate<"Subtarget->hasMWAITX()">;
+def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def HasMPX       : Predicate<"Subtarget->hasMPX()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
+                             AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">,
+                             AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsLP64  : Predicate<"Subtarget->isTarget64BitLP64()">;
+def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
+def In16BitMode  : Predicate<"Subtarget->is16Bit()">,
+                             AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
+                             AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+def In32BitMode  : Predicate<"Subtarget->is32Bit()">,
+                             AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
+def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
+                                  "Subtarget->getFrameLowering()->hasFP(*MF)">;
+def IsPS4        : Predicate<"Subtarget->isTargetPS4()">;
+def NotPS4       : Predicate<"!Subtarget->isTargetPS4()">;
+def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl      : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode   : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def NearData     : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+                             "TM.getCodeModel() == CodeModel::Kernel">;
+def IsNotPIC     : Predicate<"!TM.isPositionIndependent()">;
+def OptForSize   : Predicate<"OptForSize">;
+def OptForMinSize : Predicate<"OptForMinSize">;
+def OptForSpeed  : Predicate<"!OptForSize">;
+def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr  : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
+def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;
+def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A   : PatLeaf<(i8 0)>;  // alt. COND_NBE
+def X86_COND_AE  : PatLeaf<(i8 1)>;  // alt. COND_NC
+def X86_COND_B   : PatLeaf<(i8 2)>;  // alt. COND_C
+def X86_COND_BE  : PatLeaf<(i8 3)>;  // alt. COND_NA
+def X86_COND_E   : PatLeaf<(i8 4)>;  // alt. COND_Z
+def X86_COND_G   : PatLeaf<(i8 5)>;  // alt. COND_NLE
+def X86_COND_GE  : PatLeaf<(i8 6)>;  // alt. COND_NL
+def X86_COND_L   : PatLeaf<(i8 7)>;  // alt. COND_NGE
+def X86_COND_LE  : PatLeaf<(i8 8)>;  // alt. COND_NG
+def X86_COND_NE  : PatLeaf<(i8 9)>;  // alt. COND_NZ
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+def i16immSExt8  : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
+def i32immSExt8  : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
+def i64immSExt8  : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+
+// If we have multiple users of an immediate, it's much smaller to reuse
+// the register, rather than encode the immediate in every instruction.
+// This has the risk of increasing register pressure from stretched live
+// ranges, however, the immediates should be trivial to rematerialize by
+// the RA in the event of high register pressure.
+// TODO : This is currently enabled for stores and binary ops. There are more
+// cases for which this can be enabled, though this catches the bulk of the
+// issues.
+// TODO2 : This should really also be enabled under O2, but there's currently
+// an issue with RA where we don't pull the constants into their users
+// when we rematerialize them. I'll follow-up on enabling O2 after we fix that
+// issue.
+// TODO3 : This is currently limited to single basic blocks (DAG creation
+// pulls block immediates to the top and merges them if necessary).
+// Eventually, it would be nice to allow ConstantHoisting to merge constants
+// globally for potentially added savings.
+//
+def imm8_su : PatLeaf<(i8 relocImm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm16_su : PatLeaf<(i16 relocImm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def imm32_su : PatLeaf<(i32 relocImm), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+  return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm));
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi8   : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi64  : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32  : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64  : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80  : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+  return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let hasSideEffects = 0, SchedRW = [WriteZero] in {
+  def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+  def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero),
+                "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+  def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
+                "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+                 "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in
+def LEAVE    : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[Not64BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in
+def LEAVE64  : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+  def Int_eh_sjlj_setup_dispatch
+    : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;
+
+let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r  : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG16>, OpSize16;
+def POP32r  : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG>, OpSize16;
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
+                IIC_POP_MEM>, OpSize16;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
+                IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r  : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize16;
+def PUSH32r  : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize16;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>;
+
+def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm),
+                   "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+def PUSHi16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+                   "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16;
+
+def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+                   "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                   Requires<[Not64BitMode]>;
+def PUSHi32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+                   "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                   Requires<[Not64BitMode]>;
+} // mayStore, SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+                 IIC_PUSH_MEM>, OpSize16;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+                 IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+    SchedRW = [WriteRMW], Defs = [ESP] in {
+  let Uses = [ESP] in
+  def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
+                   [(set GR32:$dst, (int_x86_flags_read_u32))]>,
+                Requires<[Not64BitMode]>;
+
+  let Uses = [RSP] in
+  def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
+                   [(set GR64:$dst, (int_x86_flags_read_u64))]>,
+                Requires<[In64BitMode]>;
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+    SchedRW = [WriteRMW] in {
+  let Defs = [ESP, EFLAGS], Uses = [ESP] in
+  def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
+                   [(int_x86_flags_write_u32 GR32:$src)]>,
+                Requires<[Not64BitMode]>;
+
+  let Defs = [RSP, EFLAGS], Uses = [RSP] in
+  def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
+                   [(int_x86_flags_write_u64 GR64:$src)]>,
+                Requires<[In64BitMode]>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
+    SchedRW = [WriteLoad] in {
+def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
+                OpSize16;
+def POPF32   : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+                OpSize32, Requires<[Not64BitMode]>;
+}
+
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0,
+    SchedRW = [WriteStore] in {
+def PUSHF16  : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+                 OpSize16;
+def PUSHF32  : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+               OpSize32, Requires<[Not64BitMode]>;
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r   : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+                 IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+                IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
+                IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r  : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+                 IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+                 IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayStore, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in {
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+                 IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
+    SchedRW = [WriteStore] in {
+def PUSH64i8   : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+                    "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                    Requires<[In64BitMode]>;
+def PUSH64i32  : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+                    "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+                    Requires<[In64BitMode]>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in
+def POPF64   : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+               OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in
+def PUSHF64    : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+                 OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+    mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def POPA32   : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>,
+               OpSize32, Requires<[Not64BitMode]>;
+def POPA16   : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>,
+               OpSize16, Requires<[Not64BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+    mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHA16  : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>,
+               OpSize16, Requires<[Not64BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+                 (outs GR32:$dst), (ins GR32:$src),
+                 "bswap{l}\t$dst",
+                 [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+                  "bswap{q}\t$dst",
+                  [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+                  IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+                  IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+
+def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>;
+def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>;
+def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+                 IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>;
+def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+                 IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>;
+def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))],
+                  IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>;
+def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+                  IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+              "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+              "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16;
+def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+              "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32;
+def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+               "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst),
+              "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst),
+              "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst),
+              "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32;
+let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins dstidx64:$dst),
+               "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>;
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in
+def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst),
+              "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>;
+let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in
+def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst),
+              "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16;
+let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in
+def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst),
+              "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32;
+let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in
+def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst),
+               "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>;
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in {
+def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src),
+              "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src),
+              "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16;
+def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src),
+              "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32;
+def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src),
+               "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
+def MOV8rr  : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, relocImm:$src)], IIC_MOV>, OpSize32;
+def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+                       "mov{q}\t{$src, $dst|$dst, $src}",
+                       [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+let isReMaterializable = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+                    "movabs{q}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, relocImm:$src)], IIC_MOV>;
+}
+
+// Longer forms that use a ModR/M byte. Needed for disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOV8ri_alt  : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+                       "mov{q}\t{$src, $dst|$dst, $src}",
+                       [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+
+/// Memory offset versions of moves. The immediate is an address mode sized
+/// offset from the segment base.
+let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src),
+                    "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+                    AdSize32;
+let Defs = [AX] in
+def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src),
+                     "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize32;
+let Defs = [EAX] in
+def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src),
+                     "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize32;
+let Defs = [RAX] in
+def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src),
+                      "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+                      AdSize32;
+
+let Defs = [AL] in
+def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src),
+                    "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16;
+let Defs = [AX] in
+def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src),
+                     "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize16;
+let Defs = [EAX] in
+def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src),
+                     "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+                     AdSize16, OpSize32;
+}
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst),
+                    "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32;
+let Uses = [AX] in
+def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst),
+                     "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize32;
+let Uses = [EAX] in
+def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst),
+                     "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize32;
+let Uses = [RAX] in
+def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst),
+                      "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+                      AdSize32;
+
+let Uses = [AL] in
+def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst),
+                    "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16;
+let Uses = [AX] in
+def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst),
+                     "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+                     OpSize16, AdSize16;
+let Uses = [EAX] in
+def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
+                     "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+                     OpSize32, AdSize16;
+}
+}
+
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+let mayLoad = 1 in {
+let Defs = [AL] in
+def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
+                     "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64;
+let Defs = [AX] in
+def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
+                     "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64;
+let Defs = [EAX] in
+def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
+                     "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
+                     AdSize64;
+let Defs = [RAX] in
+def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
+                     "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64;
+}
+
+let mayStore = 1 in {
+let Uses = [AL] in
+def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
+                     "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
+let Uses = [AX] in
+def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
+                     "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
+let Uses = [EAX] in
+def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
+                     "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
+                     AdSize64;
+let Uses = [RAX] in
+def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
+                     "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
+}
+} // hasSideEffects = 0
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm  : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr  : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in {
+let hasSideEffects = 0 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+                     (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
+                   Sched<[WriteMove]>;
+let mayStore = 1, hasSideEffects = 0 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+                     (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, hasSideEffects = 0,
+    canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+                     (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF     : I<0x9E, RawFrm, (outs),  (ins), "sahf",
+                 [(set EFLAGS, (X86sahf AH))], IIC_AHF>,
+               Requires<[HasLAHFSAHF]>;
+let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
+def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", [],
+                IIC_AHF>,  // AH = flags
+               Requires<[HasLAHFSAHF]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit tests instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+               "bt{w}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+               OpSize16, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+               "bt{l}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
+               OpSize32, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+               "bt{q}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
+  def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                 "bt{w}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi16 addr:$src1), GR16:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, OpSize16, TB, Requires<[FastBTMem]>;
+  def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                 "bt{l}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi32 addr:$src1), GR32:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, OpSize32, TB, Requires<[FastBTMem]>;
+  def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bt{q}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi64 addr:$src1), GR64:$src2),
+  //                (implicit EFLAGS)]
+                  [], IIC_BT_MR
+                  >, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+                IIC_BT_RI>, OpSize16, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+                IIC_BT_RI>, OpSize32, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+                IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+                 ], IIC_BT_MI>, OpSize16, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+                 ], IIC_BT_MI>, OpSize32, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+                                     i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize32, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize16, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize32, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize32, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize16, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize32, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize16, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+              OpSize32, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+               "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+              "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+              OpSize16, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+              "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+              OpSize32, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize16, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize32, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize16, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize32, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+                       InstrItinClass itin> {
+  let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8rm  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                      (ins GR8:$val, i8mem:$ptr),
+                      !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR8:$dst,
+                         (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                      itin>;
+    def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+                      (ins GR16:$val, i16mem:$ptr),
+                      !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR16:$dst,
+                         (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                      itin>, OpSize16;
+    def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$val, i32mem:$ptr),
+                      !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR32:$dst,
+                         (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                      itin>, OpSize32;
+    def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                       (ins GR64:$val, i64mem:$ptr),
+                       !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                         GR64:$dst,
+                         (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                       itin>;
+  }
+}
+
+defm XCHG    : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in {
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+                "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+                 "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+                 OpSize16;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+                 "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>,
+                 OpSize32;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+                  "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+}
+
+// Swap between EAX and other registers.
+let Uses = [AX], Defs = [AX] in
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+                  "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16;
+let Uses = [EAX], Defs = [EAX] in
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+                  "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+                  OpSize32, Requires<[Not64BitMode]>;
+let Uses = [EAX], Defs = [EAX] in
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+                   "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
+                   OpSize32, Requires<[In64BitMode]>;
+let Uses = [RAX], Defs = [RAX] in
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+                  "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+                 OpSize16;
+def XADD32rr  : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+                 OpSize32;
+def XADD64rr  : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm   : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                 "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm  : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+                 OpSize16;
+def XADD32rm  : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+                 OpSize32;
+def XADD64rm  : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+}
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                   "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                   IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                    "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                    IIC_CMPXCHG_REG>, TB, OpSize16;
+def CMPXCHG32rr  : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_REG>, TB, OpSize32;
+def CMPXCHG64rr  : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm   : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                     "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm  : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                     "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB, OpSize16;
+def CMPXCHG32rm  : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB, OpSize32;
+def CMPXCHG64rm  : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_MEM>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+                  "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+                    "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+                    TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs),  (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs),  (ins), "rex64", []>,
+                     Requires<[In64BitMode]>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs),  (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes
+// These uses the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs),  (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs),  (ins), "repne", []>;
+}
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in
+def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src),
+              "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>;
+let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in
+def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src),
+              "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16;
+let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src),
+              "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32;
+let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in
+def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src),
+               "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>;
+}
+
+let SchedRW = [WriteSystem] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in {
+def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src),
+             "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src),
+              "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16;
+def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src),
+              "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in {
+def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst),
+             "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>;
+def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst),
+             "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>,  OpSize16;
+def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst),
+             "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32;
+}
+}
+
+// Flag instructions
+let SchedRW = [WriteALU] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
+
+// Table lookup instructions
+let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+           Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+            Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX Before Division
+let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+                 "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AX After Multiply
+let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+                 "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets
+let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+            Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Addition
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+            Requires<[Not64BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+            Requires<[Not64BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16,
+                   Requires<[Not64BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32,
+                   Requires<[Not64BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+                 Requires<[Not64BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+                 Requires<[Not64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+  let SchedRW = [WriteALULd] in {
+  def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+                    OpSize16, T8PS;
+  def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+                    OpSize32, T8PS;
+  def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+                     T8PS;
+  }
+  let SchedRW = [WriteStore] in {
+  def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+                    OpSize16, T8PS;
+  def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+                    OpSize32, T8PS;
+  def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+                     T8PS;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+  def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+                    "rdrand{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
+  def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+                    "rdrand{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
+  def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+                     "rdrand{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+  def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+                    "rdseed{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
+  def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+                    "rdseed{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
+  def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+                     "rdseed{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+  def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize16;
+  def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize16;
+
+  def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize32;
+  def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize32;
+
+  def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize16;
+  def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize16;
+
+  def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize32;
+  def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize32;
+
+  def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+                  RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+  def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             []>, T8PS, VEX_4V;
+  let mayLoad = 1 in
+  def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             []>, T8PS, VEX_4V;
+}
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
+  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
+  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
+  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
+  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
+  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate BMI instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasBMI] in {
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, -1)),
+            (BLSR32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, -1)),
+            (BLSR64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, -1)),
+            (BLSMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, -1)),
+            (BLSMSK64rr GR64:$src)>;
+
+  def : Pat<(and GR32:$src, (ineg GR32:$src)),
+            (BLSI32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (ineg GR64:$src)),
+            (BLSI64rr GR64:$src)>;
+}
+
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+                          X86MemOperand x86memop, Intrinsic Int,
+                          PatFrag ld_frag> {
+  def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+             T8PS, VEX;
+  def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+              (implicit EFLAGS)]>, T8PS, VEX;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+                                int_x86_bmi_bextr_32, loadi32>;
+  defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+                                int_x86_bmi_bextr_64, loadi64>, VEX_W;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+  defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+                               int_x86_bmi_bzhi_32, loadi32>;
+  defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+                               int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+}
+
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+  // Count the trailing ones in the immediate.
+  return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+  return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32);
+}]>;
+
+let Predicates = [HasBMI2] in {
+  def : Pat<(and GR64:$src, BZHIMask:$mask),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                             (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+  def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rr GR32:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rm addr:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rm addr:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
+
+let Predicates = [HasBMI] in {
+  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+            (BEXTR32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+            (BEXTR32rm addr:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+            (BEXTR64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+            (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+                         X86MemOperand x86memop, Intrinsic Int,
+                         PatFrag ld_frag> {
+  def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+             VEX_4V;
+  def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in {
+  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+                               int_x86_bmi_pdep_32, loadi32>, T8XD;
+  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+                               int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+                               int_x86_bmi_pext_32, loadi32>, T8XS;
+  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+                               int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+                                X86MemOperand x86memop, PatFrag ld_frag,
+                                Intrinsic Int, Operand immtype,
+                                SDPatternOperator immoperator> {
+  def ri : Ii32<opc,  MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+           XOP, XOPA;
+  def mi : Ii32<opc,  MRMSrcMem, (outs RC:$dst),
+                (ins x86memop:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+           XOP, XOPA;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+                                     int_x86_tbm_bextri_u32, i32imm, imm>;
+let ImmT = Imm32S in
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+                                     int_x86_tbm_bextri_u64, i64i32imm,
+                                     i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+                         RegisterClass RC, string OpcodeStr,
+                         X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+  def rr : I<opc,  FormReg, (outs RC:$dst), (ins RC:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP_4V, XOP9;
+  let mayLoad = 1 in
+  def rm : I<opc,  FormMem, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP_4V, XOP9;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+                           Format FormReg, Format FormMem> {
+  defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+                               loadi32>;
+  defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+                               loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI    : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC   : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK  : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS    : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC   : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC  : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK   : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// MONITORX/MWAITX Instructions
+//
+let SchedRW = [ WriteSystem ] in {
+  let usesCustomInserter = 1 in {
+    def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+                           [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>,
+                   Requires<[ HasMWAITX ]>;
+  }
+
+  let Uses = [ EAX, ECX, EDX ] in {
+    def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], IIC_SSE_MONITORX>,
+                      TB, Requires<[ HasMWAITX ]>;
+  }
+
+  let Uses = [ ECX, EAX, EBX ] in {
+    def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", 
+                    [(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
+                    TB, Requires<[ HasMWAITX ]>;
+  }
+} // SchedRW
+
+def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>,
+      Requires<[ Not64BitMode ]>;
+def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>,
+      Requires<[ In64BitMode ]>;
+
+def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>,
+      Requires<[ Not64BitMode ]>;
+def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
+      Requires<[ In64BitMode ]>;
+
+//===----------------------------------------------------------------------===//
+// CLZERO Instruction
+//
+let Uses = [EAX] in
+def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+            (BEXTRI32ri GR32:$src1, imm:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+            (BEXTRI32mi addr:$src1, imm:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+            (BLCFILL32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+            (BLCFILL64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+            (BLCI64rr GR64:$src)>;
+
+  // Extra patterns because opt can optimize the above patterns to this.
+  def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+            (BLCI64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+            (BLCIC32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+            (BLCIC64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+            (BLCMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+            (BLCMSK64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+            (BLCS32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+            (BLCS64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+            (BLSFILL32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+            (BLSFILL64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+            (BLSIC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+            (BLSIC64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+            (T1MSKC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+            (T1MSKC64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+            (TZMSK32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+            (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions
+//
+
+def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+                   "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
+def CLWB       : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
+def PCOMMIT    : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
+
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+// MPX instructions
+include "X86InstrMPX.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+include "X86InstrSGX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw",  "cbtw", "att">;
+def : MnemonicAlias<"cwde", "cwtl", "att">;
+def : MnemonicAlias<"cwd",  "cwtd", "att">;
+def : MnemonicAlias<"cdq",  "cltd", "att">;
+def : MnemonicAlias<"cdqe", "cltq", "att">;
+def : MnemonicAlias<"cqo",  "cqto", "att">;
+
+// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>;
+
+def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz",  "loope">;
+def : MnemonicAlias<"loopnz", "loopne">;
+
+def : MnemonicAlias<"pop",   "popw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pop",   "popl",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop",   "popq",  "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf",  "popfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popf",  "popfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf",  "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd", "popfl", "att">;
+
+// FIXME: This is wrong for "push reg".  "push %bx" should turn into pushw in
+// all modes.  However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode.  Similar for "pop %bx"
+def : MnemonicAlias<"push",   "pushw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"push",   "pushl",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push",   "pushq",  "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf",  "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl", "att">;
+
+def : MnemonicAlias<"popad",  "popal",  "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
+def : MnemonicAlias<"popa",   "popaw",  "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha",  "pushaw", "intel">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa",   "popal",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha",  "pushal", "intel">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"popa",   "popaw",  "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"pusha",  "pushaw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"popa",   "popal",  "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pusha",  "pushal", "att">, Requires<[In32BitMode]>;
+
+def : MnemonicAlias<"repe",  "rep">;
+def : MnemonicAlias<"repz",  "rep">;
+def : MnemonicAlias<"repnz", "repne">;
+
+def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>;
+
+// Apply 'ret' behavior to 'retn'
+def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"retn", "ret", "intel">;
+
+def : MnemonicAlias<"sal", "shl", "intel">;
+def : MnemonicAlias<"salb", "shlb", "att">;
+def : MnemonicAlias<"salw", "shlw", "att">;
+def : MnemonicAlias<"sall", "shll", "att">;
+def : MnemonicAlias<"salq", "shlq", "att">;
+
+def : MnemonicAlias<"smovb", "movsb", "att">;
+def : MnemonicAlias<"smovw", "movsw", "att">;
+def : MnemonicAlias<"smovl", "movsl", "att">;
+def : MnemonicAlias<"smovq", "movsq", "att">;
+
+def : MnemonicAlias<"ud2a",  "ud2",  "att">;
+def : MnemonicAlias<"verrw", "verr", "att">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret",    "iretw",    "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"iret",    "iretl",    "att">, Requires<[Not16BitMode]>;
+def : MnemonicAlias<"sysret",  "sysretl",  "att">;
+def : MnemonicAlias<"sysexit", "sysexitl", "att">;
+
+def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>;
+def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz",   "fcmove",   "att">;
+def : MnemonicAlias<"fcmova",   "fcmovnbe", "att">;
+def : MnemonicAlias<"fcmovnae", "fcmovb",   "att">;
+def : MnemonicAlias<"fcmovna",  "fcmovbe",  "att">;
+def : MnemonicAlias<"fcmovae",  "fcmovnb",  "att">;
+def : MnemonicAlias<"fcomip",   "fcompi">;
+def : MnemonicAlias<"fildq",    "fildll",   "att">;
+def : MnemonicAlias<"fistpq",   "fistpll",  "att">;
+def : MnemonicAlias<"fisttpq",  "fisttpll", "att">;
+def : MnemonicAlias<"fldcww",   "fldcw",    "att">;
+def : MnemonicAlias<"fnstcww",  "fnstcw",   "att">;
+def : MnemonicAlias<"fnstsww",  "fnstsw",   "att">;
+def : MnemonicAlias<"fucomip",  "fucompi">;
+def : MnemonicAlias<"fwait",    "wait">;
+
+def : MnemonicAlias<"fxsaveq",   "fxsave64",   "att">;
+def : MnemonicAlias<"fxrstorq",  "fxrstor64",  "att">;
+def : MnemonicAlias<"xsaveq",    "xsave64",    "att">;
+def : MnemonicAlias<"xrstorq",   "xrstor64",   "att">;
+def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">;
+def : MnemonicAlias<"xrstorsq",  "xrstors64",  "att">;
+def : MnemonicAlias<"xsavecq",   "xsavec64",   "att">;
+def : MnemonicAlias<"xsavesq",   "xsaves64",   "att">;
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+                    string VariantName>
+  : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+                  !strconcat(Prefix, NewCond, Suffix), VariantName>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+                                        string V = ""> {
+  def C   : CondCodeAlias<Prefix, Suffix, "c",   "b",  V>; // setc   -> setb
+  def Z   : CondCodeAlias<Prefix, Suffix, "z" ,  "e",  V>; // setz   -> sete
+  def NA  : CondCodeAlias<Prefix, Suffix, "na",  "be", V>; // setna  -> setbe
+  def NB  : CondCodeAlias<Prefix, Suffix, "nb",  "ae", V>; // setnb  -> setae
+  def NC  : CondCodeAlias<Prefix, Suffix, "nc",  "ae", V>; // setnc  -> setae
+  def NG  : CondCodeAlias<Prefix, Suffix, "ng",  "le", V>; // setng  -> setle
+  def NL  : CondCodeAlias<Prefix, Suffix, "nl",  "ge", V>; // setnl  -> setge
+  def NZ  : CondCodeAlias<Prefix, Suffix, "nz",  "ne", V>; // setnz  -> setne
+  def PE  : CondCodeAlias<Prefix, Suffix, "pe",  "p",  V>; // setpe  -> setp
+  def PO  : CondCodeAlias<Prefix, Suffix, "po",  "np", V>; // setpo  -> setnp
+
+  def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b",  V>; // setnae -> setb
+  def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a",  V>; // setnbe -> seta
+  def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l",  V>; // setnge -> setl
+  def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g",  V>; // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>;
+def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
+                (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
+                (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
+                (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
+                (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+
+// clr aliases.
+def : InstAlias<"clrb\t$reg", (XOR8rr  GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
+
+// lods aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src),  0>;
+def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src),  0>;
+def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"lods\t$src", (LODSB srcidx8:$src),  0>;
+def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0>;
+def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+
+// stos aliases. Accept the source being omitted because it's implicit in
+// the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst),  0>;
+def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst),  0>;
+def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst),  0>;
+def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0>;
+def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+
+// scas aliases. Accept the destination being omitted because it's implicit
+// in the mnemonic, or the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst),  0>;
+def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst),  0>;
+def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst),  0>;
+def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0>;
+def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>;
+
+// cmps aliases. Mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src),  0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// movs aliases. Mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src),  0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0>;
+def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0>, Requires<[In64BitMode]>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r  GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m  i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r  GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m  i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp",        (ADD_FPrST0  ST1), 0>;
+def:  InstAlias<"fadd",         (ADD_FPrST0  ST1), 0>;
+def : InstAlias<"fsub{|r}p",    (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p",    (SUB_FPrST0  ST1), 0>;
+def : InstAlias<"fmul",         (MUL_FPrST0  ST1), 0>;
+def : InstAlias<"fmulp",        (MUL_FPrST0  ST1), 0>;
+def : InstAlias<"fdiv{|r}p",    (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p",    (DIV_FPrST0  ST1), 0>;
+def : InstAlias<"fxch",         (XCH_F       ST1), 0>;
+def : InstAlias<"fcom",         (COM_FST0r   ST1), 0>;
+def : InstAlias<"fcomp",        (COMP_FST0r  ST1), 0>;
+def : InstAlias<"fcomi",        (COM_FIr     ST1), 0>;
+def : InstAlias<"fcompi",       (COM_FIPr    ST1), 0>;
+def : InstAlias<"fucom",        (UCOM_Fr     ST1), 0>;
+def : InstAlias<"fucomp",       (UCOM_FPr    ST1), 0>;
+def : InstAlias<"fucomi",       (UCOM_FIr    ST1), 0>;
+def : InstAlias<"fucompi",      (UCOM_FIPr   ST1), 0>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)".  We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
+                 (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
+                 (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd",   ADD_FST0r>;
+defm : FpUnaryAlias<"faddp",  ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub",   SUB_FST0r>;
+defm : FpUnaryAlias<"fsub{|r}p",  SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr",  SUBR_FST0r>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul",   MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp",  MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv",   DIV_FST0r>;
+defm : FpUnaryAlias<"fdiv{|r}p",  DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr",  DIVR_FST0r>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi",   COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi",  UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi",   COM_FIPr>;
+defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute.  We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax.
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
+
+// lcall and ljmp aliases.  This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does.
+def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"lcall\t{*}$dst",    (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst",     (FARJMP32m  opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t$seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall\t{*}$dst",    (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp\t{*}$dst",     (FARJMP16m  opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call\t{*}$dst",     (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp\t{*}$dst",      (JMP64m  i64mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call\t{*}$dst",     (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t{*}$dst",      (JMP32m  i32mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call\t{*}$dst",     (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t{*}$dst",      (JMP16m  i16mem:$dst), 0>, Requires<[In16BitMode]>;
+
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri  GR16:$r, GR16:$r, i16imm:$imm), 0>;
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri  GR32:$r, GR32:$r, i32imm:$imm), 0>;
+def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>;
+def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>;
+
+// ins aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the destination.
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst),  0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst),  0>;
+def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst),  0>;
+
+// outs aliases. Accept the mnemonic suffix being omitted because it's implicit
+// in the source.
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src),  0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src),  0>;
+def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src),  0>;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>;
+
+
+// jmp and call aliases for lcall and ljmp.  jmp $42,$5 -> ljmp
+def : InstAlias<"call\t$seg, $off",  (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp\t$seg, $off",   (FARJMP16i  i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
+def : InstAlias<"call\t$seg, $off",  (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp\t$seg, $off",   (FARJMP32i  i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>;
+def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpw\t$seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+def : InstAlias<"jmpl\t$seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move.  All segment/mem forms are equivalent, this has the shortest
+// encoding.
+def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+
+// Match 'movq GR64, MMX' as an alias for movd.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsx aliases
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem).  Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
+
+/*  FIXME: This is disabled because the asm matcher is currently incapable of
+ *  matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
+                (TEST8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
+                (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
+                (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
+                (TEST64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+                (XCHG8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+                (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+                (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+                (XCHG64rm GR64:$val, i64mem:$mem), 0>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
+
+// These aliases exist to get the parser to prioritize matching 8-bit
+// immediate encodings over matching the implicit ax/eax/rax encodings. By
+// explicitly mentioning the A register here, these entries will be ordered
+// first due to the more explicit immediate type.
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}",  (OR16ri8 AX,  i16i8imm:$imm), 0>;
+def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>;
+def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>;
+
+def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}",  (OR32ri8 EAX,  i32i8imm:$imm), 0>;
+def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>;
+def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>;
+
+def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}",  (OR64ri8 RAX,  i64i8imm:$imm), 0>;
+def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>;
+def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 000000000000..0bb106823983
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,675 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def MMX_INTALU_ITINS : OpndItins<
+  IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+  IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+  IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+  IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+}
+
+let Sched = WriteVecLogic in
+def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<
+  IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+let Sched = WriteVecIMul in
+def MMX_PMUL_ITINS : OpndItins<
+  IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+let Sched = WriteVecIMul in {
+def MMX_PSADBW_ITINS : OpndItins<
+  IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+  IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+}
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+  IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+let Sched = WriteShuffle in {
+def MMX_UNPCK_H_ITINS : OpndItins<
+  IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+  IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+  IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+  IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+} // Sched
+
+let Sched = WriteCvtF2I in {
+def MMX_CVT_PD_ITINS : OpndItins<
+  IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+  IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+}
+
+let Constraints = "$src1 = $dst" in {
+  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+  // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins, bit Commutable = 0> {
+    def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                 (ins VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+              Sched<[itins.Sched]> {
+      let isCommutable = Commutable;
+    }
+    def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                 (ins VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))],
+                 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId,
+                                Intrinsic IntId2, ShiftOpndItins itins> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+             Sched<[WriteVecShift]>;
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))],
+                  itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+    def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+                                   (ins VR64:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>,
+           Sched<[WriteVecShift]>;
+  }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId64, OpndItins itins> {
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+             Sched<[itins.Sched]>;
+
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64 (bitconvert (memopmmx addr:$src))))],
+                   itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId64, OpndItins itins,
+                             bit Commutable = 0> {
+  let isCommutable = Commutable in
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+       (ins VR64:$src1, VR64:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+       (ins VR64:$src1, i64mem:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst,
+         (IntId64 VR64:$src1,
+          (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+  def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+      (ins VR64:$src1, VR64:$src2, u8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+      [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
+  def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+      (ins VR64:$src1, i64mem:$src2, u8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+      [(set VR64:$dst, (IntId VR64:$src1,
+                       (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm, OpndItins itins, Domain d> {
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                  [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+            Sched<[itins.Sched]>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                  [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, Domain d> {
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+                  (ins DstRC:$src1, SrcRC:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+                  NoItinerary, d>, Sched<[WriteCvtI2F]>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+                  (ins DstRC:$src1, x86memop:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+                  NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+                     [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst,
+                         (x86mmx (scalar_to_vector GR32:$src)))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst,
+                        (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+    def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
+              (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+    def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
+              (MMX_MOVD64rm addr:$src)>;
+}
+
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+                        "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+                   Sched<[WriteStore]>;
+
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+                         "movd\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst,
+                          (MMX_X86movd2w (x86mmx VR64:$src)))],
+                          IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+let isBitcast = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst, (bitconvert GR64:$src))],
+                             IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst),
+                             (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}",
+                             [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>;
+
+// These are 64 bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as
+// movd.
+let SchedRW = [WriteMove], isBitcast = 1 in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+                               (outs GR64:$dst), (ins VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}",
+                             [(set GR64:$dst,
+                              (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+let hasSideEffects = 0 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem,
+                               (outs), (ins i64mem:$dst, VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>;
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))],
+                        IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(store (x86mmx VR64:$src), addr:$dst)],
+                        IIC_MMX_MOVQ_RM>;
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                             (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (x86mmx (bitconvert
+                               (i64 (extractelt (v2i64 VR128:$src),
+                                     (iPTR 0))))))],
+                             IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+                              (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                              [(set VR128:$dst,
+                                (v2i64
+                                  (scalar_to_vector
+                                    (i64 (bitconvert (x86mmx VR64:$src))))))],
+                              IIC_MMX_MOVQ_RR>;
+
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                              (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                              [], IIC_MMX_MOVQ_RR>;
+}
+} // SchedRW
+
+let Predicates = [HasSSE1] in
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                         "movntq\t{$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+                         IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+  // movd to MMX register zero-extends
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+            (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+            (MMX_MOVD64rm addr:$src)>;
+}
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+                                     MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+                                   MMX_INTALU_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+                                   MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS>;
+let Predicates = [HasSSE2] in
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                   MMX_INTALU_ITINS>;
+
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+                                   MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,
+                                     MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE1] in
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+                                     MMX_PMUL_ITINS, 1>;
+let Predicates = [HasSSE2] in
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmul_hr_sw,
+                                     MMX_PMUL_ITINS, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+                                     int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+let Predicates = [HasSSE1] in {
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+                                     MMX_PSADBW_ITINS, 1>;
+}
+
+defm MMX_PSIGNB :  SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW :  SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND :  SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+                                        MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+  defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+                                  MMX_INTALU_ITINS_VECLOGICSCHED>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+                                    MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+                                    MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+                                    MMX_SHIFT_ITINS>;
+
+def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
+def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
+          (MMX_PSRADrm VR64:$src1, addr:$src2)>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+                                     MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw",
+                                       int_x86_mmx_punpckhbw,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd",
+                                       int_x86_mmx_punpckhwd,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq",
+                                       int_x86_mmx_punpckhdq,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw",
+                                       int_x86_mmx_punpcklbw,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd",
+                                       int_x86_mmx_punpcklwd,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+                                       int_x86_mmx_punpckldq,
+                                       MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+                                      MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+                                       MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+                                                   imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+
+
+
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+                      f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+                      f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+                       f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PS_ITINS, SSEPackedSingle>, PS;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+                       f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+                         i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+                         MMX_CVT_PD_ITINS, SSEPackedDouble>, PD;
+let Constraints = "$src1 = $dst" in {
+  defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+                         int_x86_sse_cvtpi2ps,
+                         i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+                          SSEPackedSingle>, PS;
+}
+
+// Extract / Insert
+let Predicates = [HasSSE1] in
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
+                       "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+                                               imm:$src2))],
+                       IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+let Predicates = [HasSSE1] in {
+  def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+                      (outs VR64:$dst),
+                      (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
+                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                        GR32orGR64:$src2, imm:$src3))],
+                      IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+  def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+                     (outs VR64:$dst),
+                     (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3),
+                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                         (i32 (anyext (loadi16 addr:$src2))),
+                                       imm:$src3))],
+                     IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+
+// Mask creation
+let Predicates = [HasSSE1] in
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+                          (ins VR64:$src),
+                          "pmovmskb\t{$src, $dst|$dst, $src}",
+                          [(set GR32orGR64:$dst,
+                                (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+          (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+let SchedRW = [WriteShuffle] in {
+let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                          "maskmovq\t{$mask, $src|$src, $mask}",
+                          [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+                          IIC_MMX_MASKMOV>;
+let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                           "maskmovq\t{$mask, $src|$src, $mask}",
+                           [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+                           IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+          (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
new file mode 100644
index 000000000000..309f601d1fce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
@@ -0,0 +1,70 @@
+//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+              OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+              OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs), (ins  BNDR:$src1, i32mem:$src2),
+              OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs), (ins  BNDR:$src1, i64mem:$src2),
+              OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+  def 32rr: I<opc, MRMSrcReg, (outs), (ins  BNDR:$src1, GR32:$src2),
+              OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rr: RI<opc, MRMSrcReg, (outs), (ins  BNDR:$src1, GR64:$src2),
+              OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+
+def BNDMOVRMrr   : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr   : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
+                    "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr:      I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+                    "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
+                    Requires<[HasMPX]>;
+def BNDLDXrm:      I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                    "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
+                    Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 000000000000..84119ad5eb35
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SGX instructions
+
+// ENCLS - Execute an Enclave System Function of Specified Leaf Number
+def ENCLS : I<0x01, MRM_CF, (outs), (ins),
+             "encls", []>, TB;
+
+// ENCLU - Execute an Enclave User Function of Specified Leaf Number
+def ENCLU : I<0x01, MRM_D7, (outs), (ins),
+             "enclu", []>, TB;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 000000000000..9d6a89363044
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8711 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  // InstrSchedModel info.
+  X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+  OpndItins s = arg_s;
+  OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+  InstrItinClass arg_ri> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+  SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+  SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+  SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+  SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+  SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+  SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+let Sched = WriteVecLogic in
+def SSE_VEC_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+def SSE_DPPD_ITINS : OpndItins<
+  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+let Sched = WriteMPSAD in
+def SSE_MPSADBW_ITINS : OpndItins<
+  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+let Sched = WriteVecIMul in
+def SSE_PMULLD_ITINS : OpndItins<
+  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
+// Definitions for backward compatibility.
+// The instructions mapped on these definitions uses a different itinerary
+// than the actual scheduling model.
+let Sched = WriteShuffle in
+def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVecIMul in
+def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteShuffle in
+def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteMPSAD in
+def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteBlend in
+def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteVarBlend in
+def DEFAULT_ITINS_VARBLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+let Sched = WriteFBlend in
+def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+let Sched = WriteBlend in
+def SSE_INTALU_ITINS_BLEND_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, X86MemOperand x86memop,
+                           Domain d, OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in {
+    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
+       Sched<[itins.Sched]>;
+  }
+  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator Int, RegisterClass RC,
+                               string asm, Operand memopr,
+                               ComplexPattern mem_cpat, Domain d,
+                               OpndItins itins, bit Is2Addr = 1> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
+       Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           Domain d, OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+                                      string OpcodeStr, X86MemOperand x86memop,
+                                      list<dag> pat_rr, list<dag> pat_rm,
+                                      bit Is2Addr = 1> {
+  let isCommutable = 1, hasSideEffects = 0 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rr, NoItinerary, d>,
+       Sched<[WriteVecLogic]>;
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rm, NoItinerary, d>,
+       Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+          (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(f128  (bitconvert (i128  FR128:$src))), (f128  FR128:$src)>;
+def : Pat<(i128  (bitconvert (f128  FR128:$src))), (i128  FR128:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v4i64  (bitconvert (v8i32  VR256:$src))), (v4i64  VR256:$src)>;
+def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64  VR256:$src)>;
+def : Pat<(v4i64  (bitconvert (v32i8  VR256:$src))), (v4i64  VR256:$src)>;
+def : Pat<(v4i64  (bitconvert (v8f32  VR256:$src))), (v4i64  VR256:$src)>;
+def : Pat<(v4i64  (bitconvert (v4f64  VR256:$src))), (v4i64  VR256:$src)>;
+def : Pat<(v8i32  (bitconvert (v4i64  VR256:$src))), (v8i32  VR256:$src)>;
+def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32  VR256:$src)>;
+def : Pat<(v8i32  (bitconvert (v32i8  VR256:$src))), (v8i32  VR256:$src)>;
+def : Pat<(v8i32  (bitconvert (v4f64  VR256:$src))), (v8i32  VR256:$src)>;
+def : Pat<(v8i32  (bitconvert (v8f32  VR256:$src))), (v8i32  VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4i64  VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8i32  VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v32i8  VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4f64  VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8f32  VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v32i8  (bitconvert (v4i64  VR256:$src))), (v32i8  VR256:$src)>;
+def : Pat<(v32i8  (bitconvert (v8i32  VR256:$src))), (v32i8  VR256:$src)>;
+def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8  VR256:$src)>;
+def : Pat<(v32i8  (bitconvert (v4f64  VR256:$src))), (v32i8  VR256:$src)>;
+def : Pat<(v32i8  (bitconvert (v8f32  VR256:$src))), (v32i8  VR256:$src)>;
+def : Pat<(v8f32  (bitconvert (v4i64  VR256:$src))), (v8f32  VR256:$src)>;
+def : Pat<(v8f32  (bitconvert (v8i32  VR256:$src))), (v8f32  VR256:$src)>;
+def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32  VR256:$src)>;
+def : Pat<(v8f32  (bitconvert (v32i8  VR256:$src))), (v8f32  VR256:$src)>;
+def : Pat<(v8f32  (bitconvert (v4f64  VR256:$src))), (v8f32  VR256:$src)>;
+def : Pat<(v4f64  (bitconvert (v4i64  VR256:$src))), (v4f64  VR256:$src)>;
+def : Pat<(v4f64  (bitconvert (v8i32  VR256:$src))), (v4f64  VR256:$src)>;
+def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64  VR256:$src)>;
+def : Pat<(v4f64  (bitconvert (v32i8  VR256:$src))), (v4f64  VR256:$src)>;
+def : Pat<(v4f64  (bitconvert (v8f32  VR256:$src))), (v4f64  VR256:$src)>;
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
+  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+let Predicates = [NoVLX] in
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+  let Predicates = [HasAVX2] in
+  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; Register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and just mentioned, we
+// don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+                         X86MemOperand x86memop, string base_opc,
+                         string asm_opr, Domain d = GenericDomain> {
+  let isCommutable = 1 in
+  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src1, RC:$src2),
+              !strconcat(base_opc, asm_opr),
+              [(set VR128:$dst, (vt (OpNode VR128:$src1,
+                                 (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
+
+  // For the disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                  (ins VR128:$src1, RC:$src2),
+                  !strconcat(base_opc, asm_opr),
+                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+                      X86MemOperand x86memop, string OpcodeStr,
+                      Domain d = GenericDomain> {
+  // AVX
+  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
+                              VEX_4V, VEX_LIG;
+
+  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+                     VEX, VEX_LIG, Sched<[WriteStore]>;
+  // SSE1 & 2
+  let Constraints = "$src1 = $dst" in {
+    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $dst|$dst, $src2}", d>;
+  }
+
+  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
+                  Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+                         PatFrag mem_pat, string OpcodeStr,
+                         Domain d = GenericDomain> {
+  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
+                        SSEPackedSingle>, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
+                        SSEPackedDouble>, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
+                             SSEPackedSingle>, XS;
+
+  let AddedComplexity = 20 in
+    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
+                               SSEPackedDouble>, XD;
+}
+
+// Patterns
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm already zeros the high parts of the register.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+  // Shuffle with MOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSD to the lower bits.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSDrm already zeros the high parts of the register.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  }
+
+  // Shuffle with MOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold because
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+                            X86MemOperand x86memop, PatFrag ld_frag,
+                            string asm, Domain d,
+                            OpndItins itins> {
+let hasSideEffects = 0 in
+  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+           Sched<[WriteFShuffle]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+           Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+                              PD, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+                              PD, VEX, VEX_L;
+}
+
+let Predicates = [UseSSE1] in {
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              PS;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              PS;
+}
+let Predicates = [UseSSE2] in {
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
+                              PD;
+}
+
+let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteFShuffle] in {
+  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+                          (ins VR128:$src),
+                          "movaps\t{$src, $dst|$dst, $src}", [],
+                          IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movapd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movups\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movupd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movaps\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movapd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movups\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movupd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+                (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+                (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+                (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+                (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}",
+                (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}",
+                (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
+                (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
+                (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
+
+let SchedRW = [WriteStore] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteFShuffle] in {
+  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movaps\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movapd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movups\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movupd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+}
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX, NoVLX] in {
+  // 128-bit load/store
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+  // 256-bit load/store
+  def : Pat<(alignedloadv4i64 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv4i64 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  // 128-bit load/store
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+  // 256-bit load/store
+  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+}
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (MOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                      string base_opc, string asm_opr,
+                                      InstrItinClass itin> {
+  def PSrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "s", asm_opr),
+     [(set VR128:$dst,
+       (psnode VR128:$src1,
+              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+              itin, SSEPackedSingle>, PS,
+     Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+  def PDrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "d", asm_opr),
+     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+                              (scalar_to_vector (loadf64 addr:$src2)))))],
+              itin, SSEPackedDouble>, PD,
+     Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+}
+
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                 string base_opc, InstrItinClass itin> {
+  let Predicates = [UseAVX] in
+    defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                                    itin>, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in
+    defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $dst|$dst, $src2}",
+                                    itin>;
+}
+
+let AddedComplexity = 20 in {
+  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+let Predicates = [UseAVX] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+}// UseAVX
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [UseAVX] in {
+  // Shuffle with VMOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Shuffle with VMOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
+                                 (iPTR 0))), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with MOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                                      addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+                              addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Shuffle with MOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+let Predicates = [UseAVX] in {
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+} // UseAVX
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (extractelt
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [UseAVX] in {
+  // VMOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // VMOVHPD patterns
+
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // Also handle an i64 load because that may get selected as a faster way to
+  // load the data.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(store (f64 (extractelt
+                          (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+                          (iPTR 0))), addr:$dst),
+            (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+  def : Pat<(store (f64 (extractelt
+                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
+                          (iPTR 0))), addr:$dst),
+            (VMOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // MOVHPD patterns
+
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+  // Also handle an i64 load because that may get selected as a faster way to
+  // load the data.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(store (f64 (extractelt
+                          (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+                          (iPTR 0))), addr:$dst),
+            (MOVHPDmr addr:$dst, VR128:$src)>;
+
+  def : Pat<(store (f64 (extractelt
+                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+                          (iPTR 0))), addr:$dst),
+            (MOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20, Predicates = [UseAVX] in {
+  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteFShuffle]>;
+  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteFShuffle]>;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+  let isCommutable = 1 in
+  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+}
+
+let Predicates = [UseAVX] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins<
+  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false depenendecies (see sse_fp_unop_s for details)
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                     string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
+                        itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+                        itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
+                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
+                       string asm, Domain d, OpndItins itins> {
+let hasSideEffects = 0 in {
+  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
+             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
+             itins.rr, d>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
+             [(set RC:$dst, (DstTy (sint_to_fp
+                                    (SrcTy (bitconvert (ld_frag addr:$src))))))],
+             itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false depenendecies (see sse_fp_unop_s for details)
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2F]>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // hasSideEffects = 0
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_32>,
+                                XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_64>,
+                                XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+}
+// The assembler can recognize rr 64-bit instructions by seeing a rxx
+// register, but the same isn't true when only using memory operands,
+// provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+                                  XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+                                  XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+                                  XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+                                  XD, VEX_4V, VEX_W, VEX_LIG;
+
+let Predicates = [UseAVX] in {
+  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
+
+  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+  def : Pat<(f32 (sint_to_fp GR32:$src)),
+            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f32 (sint_to_fp GR64:$src)),
+            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+  def : Pat<(f64 (sint_to_fp GR32:$src)),
+            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f64 (sint_to_fp GR64:$src)),
+            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false depenendecies (see sse_fp_unop_s for details)
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                         string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+           Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, OpndItins itins,
+                    bit Is2Addr = 1> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+              itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [UseAVX] in {
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+let isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in {
+  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+            SSE_CVT_Scalar, 0>, XS, VEX_4V;
+  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+            SSE_CVT_Scalar, 0>, XS, VEX_4V,
+            VEX_W;
+  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+            SSE_CVT_Scalar, 0>, XD, VEX_4V;
+  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+            SSE_CVT_Scalar, 0>, XD,
+            VEX_4V, VEX_W;
+  }
+  let Constraints = "$src1 = $dst" in {
+    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
+                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
+                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+  }
+} // isCodeGenOnly = 1
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+let isCodeGenOnly = 1 in {
+let Predicates = [UseAVX] in {
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>,
+                                   XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>,
+                                  XD, VEX, VEX_W;
+}
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+} // isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+}
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                               ssmem, sse_load_f32, "cvtss2si",
+                               SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                 ssmem, sse_load_f32, "cvtss2si",
+                                 SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               PS, VEX, Requires<[HasAVX, NoVLX]>;
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
+                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                            SSEPackedSingle, SSE_CVT_PS>,
+                            PS, Requires<[UseSSE2]>;
+
+let Predicates = [UseAVX] in {
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR64:$src1, FR64:$src2),
+                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+                       (ins FR64:$src1, f64mem:$src2),
+                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [], IIC_SSE_CVT_Scalar_RM>,
+                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+          Requires<[UseAVX]>;
+
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fpround FR64:$src))],
+                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fpround (loadf64 addr:$src)))],
+                      IIC_SSE_CVT_Scalar_RM>,
+                      XD,
+                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let hasSideEffects = 0, Predicates = [UseAVX] in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+                    (ins FR32:$src1, FR32:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RR>,
+                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+                    Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+                    (ins FR32:$src1, f32mem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RM>,
+                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f64 (fpextend FR32:$src)),
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+def : Pat<(fpextend (loadf32 addr:$src)),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    Requires<[UseAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    Requires<[UseAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fpextend FR32:$src))],
+                   IIC_SSE_CVT_Scalar_RR>, XS,
+                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))],
+                   IIC_SSE_CVT_Scalar_RM>, XS,
+                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64.  This matches load+fpextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fpextend, we have to match it
+// explicitly here.
+def : Pat<(fpextend (loadf32 addr:$src)),
+          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+let isCodeGenOnly = 1 in {
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+} // isCodeGenOnly = 1
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
+                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX, NoVLX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
+                       VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
+def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
+                      Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
+                       VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
+                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+}
+
+def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
+                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
+                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
+                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst,
+                            (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst,
+                            (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
+                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+                          Sched<[WriteCvtF2ILd]>;
+}
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
+                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
+                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+                        IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
+                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
+                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+}
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
+
+let Predicates = [HasAVX, NoVLX] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+              (VCVTPD2DQrr VR128:$src)>;
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+              (VCVTTPD2DQrr VR128:$src)>;
+  }
+} // Predicates = [HasAVX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
+                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+              (CVTPD2DQrr VR128:$src)>;
+    def : Pat<(X86vzmovl (v2i64 (bitconvert
+                                 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+              (CVTTPD2DQrr VR128:$src)>;
+  }
+} // Predicates = [UseSSE2]
+
+// Convert packed single to packed double
+let Predicates = [HasAVX, NoVLX] in {
+                  // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+                    IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
+                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
+                   IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX, NoVLX] in {
+let hasSideEffects = 0, mayLoad = 1 in
+def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                        VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+                        VEX, Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                         [(set VR256:$dst,
+                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                         VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                         [(set VR256:$dst,
+                           (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
+                         VEX, VEX_L, Sched<[WriteCvtI2F]>;
+}
+
+let hasSideEffects = 0, mayLoad = 1 in
+def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
+                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
+                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+
+// AVX register conversion intrinsics
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (VCVTDQ2PDrm addr:$src)>;
+} // Predicates = [HasAVX, NoVLX]
+
+// SSE2 register conversion intrinsics
+let Predicates = [UseSSE2] in {
+  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (CVTDQ2PDrm addr:$src)>;
+} // Predicates = [UseSSE2]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
+                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in
+def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
+                       IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
+
+// YMM only
+let Predicates = [HasAVX, NoVLX] in {
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (fpround VR256:$src))],
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
+def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
+                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
+
+let Predicates = [HasAVX, NoVLX] in {
+  // Match fpround and fpextend for 128/256-bit conversions
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+            (VCVTPD2PSrr VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Match fpround and fpextend for 128 conversions
+  let AddedComplexity = 15 in
+  def : Pat<(X86vzmovl (v2f64 (bitconvert
+                               (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+            (CVTPD2PSrr VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string asm, string asm_alt,
+                            OpndItins itins, ImmLeaf immLeaf> {
+  let isCommutable = 1 in
+  def rr : SIi8<0xC2, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
+                itins.rr>, Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1),
+                                         (ld_frag addr:$src2), immLeaf:$cc))],
+                                         itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+    let mayLoad = 1 in
+    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RM>,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
+                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
+                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
+                 XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in {
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
+                  i8immZExt3>, XS;
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SSE_ALU_F64S, i8immZExt3>, XD;
+}
+
+multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
+                         Intrinsic Int, string asm, OpndItins itins,
+                         ImmLeaf immLeaf, ComplexPattern mem_cpat> {
+  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               VR128:$src, immLeaf:$cc))],
+                                               itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, memop:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               mem_cpat:$src, immLeaf:$cc))],
+                                               itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let isCodeGenOnly = 1 in {
+  // Aliases to match intrinsics which expect XMM operand(s).
+  defm Int_VCMPSS  : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
+                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                       SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
+                       XS, VEX_4V;
+  defm Int_VCMPSD  : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
+                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                       SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
+                       XD, VEX_4V;
+  let Constraints = "$src1 = $dst" in {
+    defm Int_CMPSS  : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
+                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                         SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+    defm Int_CMPSD  : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
+                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                         SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
+                         XD;
+}
+}
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+                            ValueType vt, X86MemOperand x86memop,
+                            PatFrag ld_frag, string OpcodeStr> {
+  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+                     IIC_SSE_COMIS_RR>,
+          Sched<[WriteFAdd]>;
+  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1),
+                                           (ld_frag addr:$src2)))],
+                                           IIC_SSE_COMIS_RM>,
+          Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in {
+  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss">, PS, VEX, VEX_LIG;
+  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD, VEX, VEX_LIG;
+  let Pattern = []<dag> in {
+    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+                                    "comiss">, PS, VEX, VEX_LIG;
+    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+                                    "comisd">, PD, VEX, VEX_LIG;
+  }
+
+  let isCodeGenOnly = 1 in {
+    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                              load, "ucomiss">, PS, VEX;
+    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                              load, "ucomisd">, PD, VEX;
+
+    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+                              load, "comiss">, PS, VEX;
+    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+                              load, "comisd">, PD, VEX;
+  }
+  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss">, PS;
+  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd">, PD;
+
+  let Pattern = []<dag> in {
+    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
+                                    "comiss">, PS;
+    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
+                                    "comisd">, PD;
+  }
+
+  let isCodeGenOnly = 1 in {
+    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                                load, "ucomiss">, PS;
+    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                                load, "ucomisd">, PD;
+
+    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+                                    "comiss">, PS;
+    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+                                    "comisd">, PD;
+  }
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, Intrinsic Int, string asm,
+                            string asm_alt, Domain d, ImmLeaf immLeaf,
+                            PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
+  let isCommutable = 1 in
+  def rri : PIi8<0xC2, MRMSrcReg,
+             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+             itins.rr, d>,
+            Sched<[WriteFAdd]>;
+  def rmi : PIi8<0xC2, MRMSrcMem,
+             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+             itins.rm, d>,
+            Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rri_alt : PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
+               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
+    let mayLoad = 1 in
+    def rmi_alt : PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
+               asm_alt, [], itins.rm, d>,
+               Sched<[WriteFAddLd, ReadAfterLd]>;
+  }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in {
+  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
+  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
+          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
+          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
+          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
+          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
+          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+                         ValueType vt, string asm, PatFrag mem_frag,
+                         Domain d> {
+  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteFShuffleLd, ReadAfterLd]>;
+  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
+                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                     (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteFShuffle]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+}
+let Constraints = "$src1 = $dst" in {
+  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv4f32, SSEPackedSingle>, PS;
+  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv2f64, SSEPackedDouble>, PD;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufp VR256:$src1,
+                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufp VR256:$src1,
+                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Generic SHUFPD patterns
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack FP Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d, bit IsCommutable = 0> {
+    let isCommutable = IsCommutable in
+    def rr : PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
+    def rm : PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1,
+                                       (mem_frag addr:$src2))))],
+                                       IIC_SSE_UNPCK, d>,
+             Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+}// Predicates = [HasAVX, NoVLX]
+let Constraints = "$src1 = $dst" in {
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, PS;
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble, 1>, PD;
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, PS;
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, PD;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
+multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
+                                string asm, Domain d> {
+  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
+              Sched<[WriteVecLogic]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+                                        SSEPackedSingle>, PS, VEX;
+  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+                                        SSEPackedDouble>, PD, VEX;
+  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
+                                         SSEPackedSingle>, PS, VEX, VEX_L;
+  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
+                                         SSEPackedDouble>, PD, VEX, VEX_L;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
+                                     SSEPackedSingle>, PS;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
+                                     SSEPackedDouble>, PD;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, OpndItins itins,
+                        bit IsCommutable, bit Is2Addr> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1,
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+                         ValueType OpVT128, ValueType OpVT256,
+                         OpndItins itins, bit IsCommutable = 0, Predicate prd> {
+let Predicates = [HasAVX, prd] in
+  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+                           memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2, prd] in
+  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+                               OpVT256, VR256, loadv4i64, i256mem, itins,
+                               IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions
+
+defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+                           SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+                                   SDNode OpNode> {
+  let Predicates = [HasAVX, NoVLX] in {
+  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+        !strconcat(OpcodeStr, "ps"), f256mem,
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+                                  (bc_v4i64 (v8f32 VR256:$src2))))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+
+  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+        !strconcat(OpcodeStr, "pd"), f256mem,
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (bc_v4i64 (v4f64 VR256:$src2))))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (loadv4i64 addr:$src2)))], 0>,
+                                  PD, VEX_4V, VEX_L;
+
+  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+       !strconcat(OpcodeStr, "ps"), f128mem,
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                 (bc_v2i64 (v4f32 VR128:$src2))))],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+       !strconcat(OpcodeStr, "pd"), f128mem,
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (bc_v2i64 (v2f64 VR128:$src2))))],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (loadv2i64 addr:$src2)))], 0>,
+                                                 PD, VEX_4V;
+  }
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+         !strconcat(OpcodeStr, "ps"), f128mem,
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                   (bc_v2i64 (v4f32 VR128:$src2))))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, PS;
+
+    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+         !strconcat(OpcodeStr, "pd"), f128mem,
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (bc_v2i64 (v2f64 VR128:$src2))))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, PD;
+  }
+}
+
+defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
+defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in
+  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+// If only AVX1 is supported, we need to handle integer operations with
+// floating point instructions since the integer versions aren't available.
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VXORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (VANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VXORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (VANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (XORPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
+            (COPY_TO_REGCLASS (ANDNPSrr
+                               (COPY_TO_REGCLASS FR32:$src1, VR128),
+                               (COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Use packed logical operations for scalar ops.
+  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (XORPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
+            (COPY_TO_REGCLASS (ANDNPDrr
+                               (COPY_TO_REGCLASS FR64:$src1, VR128),
+                               (COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
+}
+
+// Patterns for packed operations when we don't have integer type available.
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
+          (ANDPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
+          (ORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
+          (XORPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
+          (ANDNPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
+          (ANDPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
+          (ORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
+          (XORPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
+          (ANDNPSrm VR128:$src1, addr:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
+/// classes below
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SizeItins itins> {
+  let Predicates = [HasAVX, NoVLX] in {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                               VR128, v4f32, f128mem, loadv4f32,
+                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                               VR128, v2f64, f128mem, loadv2f64,
+                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+
+  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+                        OpNode, VR256, v8f32, f256mem, loadv8f32,
+                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+                        OpNode, VR256, v4f64, f256mem, loadv4f64,
+                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+  }
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
+                              itins.s>, PS;
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
+                              itins.d>, PD;
+  }
+}
+
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                         OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
+                         XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                         OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
+                         XD, VEX_4V, VEX_LIG;
+
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                              OpNode, FR32, f32mem, SSEPackedSingle,
+                              itins.s>, XS;
+    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                              OpNode, FR64, f64mem, SSEPackedDouble,
+                              itins.d>, XD;
+  }
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+                                      SDPatternOperator IntSS,
+                                      SDPatternOperator IntSD,
+                                      SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+                   SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+                   SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
+                   SSEPackedSingle, itins.s>, XS;
+    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
+                   SSEPackedDouble, itins.d>, XD;
+  }
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
+                                      SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
+                                      SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
+                                        SSE_ALU_ITINS_S>;
+  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
+                                        SSE_DIV_ITINS_S>;
+  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
+                                        int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
+  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
+                                        int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+}
+
+let isCodeGenOnly = 1 in {
+  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+}
+
+// Patterns used to select SSE scalar fp arithmetic instructions from
+// either:
+//
+// (1) a scalar fp operation followed by a blend
+//
+// The effect is that the backend no longer emits unnecessary vector
+// insert instructions immediately after SSE scalar fp instructions
+// like addss or mulss.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     A[0] += B[0];
+//     return A;
+//   }
+//
+// Previously we generated:
+//   addss %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// We now generate:
+//   addss %xmm1, %xmm0
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
+//
+// The effect is that the backend converts the packed fp instruction
+// followed by a vector insert into a single SSE scalar fp instruction.
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     __m128 C = A + B;
+//     return (__m128) {c[0], a[1], a[2], a[3]};
+//   }
+//
+// Previously we generated:
+//   addps %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// We now generate:
+//   addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE1] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+
+  // With SSE 4.1, blendi is preferred to movsd, so match that too.
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;
+
+  }
+
+  // Repeat everything for AVX.
+  let Predicates = [UseAVX] in {
+    // extracted scalar math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+          (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
+          FR32:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+          (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+    // vector math op with insert via movss
+    def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+          (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+  }
+}
+
+defm : scalar_math_f32_patterns<fadd, "ADD">;
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+  let Predicates = [UseSSE2] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // With SSE 4.1, blendi is preferred to movsd, so match those too.
+  let Predicates = [UseSSE41] in {
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+
+  // Repeat everything for AVX.
+  let Predicates = [UseAVX] in {
+    // extracted scalar math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // extracted scalar math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+          (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
+          FR64:$src))), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+          (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    // vector math op with insert via movsd
+    def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+    // vector math op with insert via blend
+    def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+          (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+      (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+  }
+}
+
+defm : scalar_math_f64_patterns<fadd, "ADD">;
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
+
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+let Sched = WriteFSqrt in {
+def SSE_SQRTPS : OpndItins<
+  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
+>;
+
+def SSE_SQRTPD : OpndItins<
+  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
+>;
+}
+
+let Sched = WriteFRsqrt in {
+def SSE_RSQRTPS : OpndItins<
+  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
+let Sched = WriteFRcp in {
+def SSE_RCPP : OpndItins<
+  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse_fp_unop_s - SSE1 unops in scalar form
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are 2 operand / destructive.
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          ValueType vt, ValueType ScalarVT,
+                          X86MemOperand x86memop,
+                          Intrinsic Intr,
+                          SDNode OpNode, Domain d, OpndItins itins,
+                          Predicate target, string Suffix> {
+  let hasSideEffects = 0 in {
+  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
+              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+            [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
+            Requires<[target]>;
+  let mayLoad = 1 in
+  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
+            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
+            [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
+            Sched<[itins.Sched.Folded, ReadAfterLd]>,
+            Requires<[target, OptForSize]>;
+
+  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let mayLoad = 1 in
+  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+            []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+  }
+
+  let Predicates = [target] in {
+  // These are unary operations, but they are modeled as having 2 source operands
+  // because the high elements of the destination are unchanged in SSE.
+  def : Pat<(Intr VR128:$src),
+            (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
+  }
+  // We don't want to fold scalar loads into these instructions unless
+  // optimizing for size. This is because the folded instruction will have a
+  // partial register update, while the unfolded sequence will not, e.g.
+  // movss mem, %xmm0
+  // rcpss %xmm0, %xmm0
+  // which has a clobber before the rcp, vs.
+  // rcpss mem, %xmm0
+  let Predicates = [target, OptForSize] in {
+    def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+               (!cast<Instruction>(NAME#Suffix##m_Int)
+                      (vt (IMPLICIT_DEF)), addr:$src2)>;
+  }
+}
+
+multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          ValueType vt, ValueType ScalarVT,
+                          X86MemOperand x86memop,
+                          Intrinsic Intr, SDNode OpNode, Domain d,
+                          OpndItins itins, string Suffix> {
+  let hasSideEffects = 0 in {
+  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [], itins.rr, d>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  let isCodeGenOnly = 1 in {
+  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
+                (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             []>, Sched<[itins.Sched.Folded]>;
+  let mayLoad = 1 in
+  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
+                (ins VR128:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+  }
+
+  // We don't want to fold scalar loads into these instructions unless
+  // optimizing for size. This is because the folded instruction will have a
+  // partial register update, while the unfolded sequence will not, e.g.
+  // vmovss mem, %xmm0
+  // vrcpss %xmm0, %xmm0, %xmm0
+  // which has a clobber before the rcp, vs.
+  // vrcpss mem, %xmm0, %xmm0
+  // TODO: In theory, we could fold the load, and avoid the stall caused by
+  // the partial register store, either in ExeDepFix or with smarter RA.
+  let Predicates = [UseAVX] in {
+   def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
+                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
+  }
+  let Predicates = [HasAVX] in {
+   def : Pat<(Intr VR128:$src),
+             (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
+                                 VR128:$src)>;
+  }
+  let Predicates = [HasAVX, OptForSize] in {
+    def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+              (!cast<Instruction>("V"#NAME#Suffix##m_Int)
+                    (vt (IMPLICIT_DEF)), addr:$src2)>;
+  }
+  let Predicates = [UseAVX, OptForSize] in {
+    def : Pat<(ScalarVT (OpNode (load addr:$src))),
+              (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+            addr:$src)>;
+  }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins, list<Predicate> prds> {
+let Predicates = prds in {
+  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+                      SSEPackedSingle, itins, UseSSE1, "SS">, XS;
+  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
+                      f32mem,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
+                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+}
+
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+                         OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
+  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
+                         f64mem,
+                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
+                         OpNode, SSEPackedDouble, itins, "SD">,
+                         XD, VEX_4V, VEX_LIG;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
+             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
+             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
+defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
+             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// TODO: We should add *scalar* op patterns for these just like we have for
+// the binops above. If the binop and unop patterns could all be unified
+// that would be even better.
+
+multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
+                                      SDNode Move, ValueType VT,
+                                      Predicate BasePredicate> {
+  let Predicates = [BasePredicate] in {
+    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+  }
+
+  // With SSE 4.1, blendi is preferred to movs*, so match that too.
+  let Predicates = [UseSSE41] in {
+    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+              (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+  }
+
+  // Repeat for AVX versions of the instructions.
+  let Predicates = [HasAVX] in {
+    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
+              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+
+    def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
+              (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
+  }
+}
+
+defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
+                                  v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
+                                  v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
+                                  v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
+                                  v2f64, UseSSE2>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+let Predicates = [HasAVX, NoVLX] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f32 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v2f64 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                         (ins f128mem:$dst, VR128:$src),
+                         "movntdq\t{$src, $dst|$dst, $src}",
+                         [(alignednontemporalstore (v2i64 VR128:$src),
+                                                   addr:$dst)],
+                                                   IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v8f32 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f64 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+                    (ins f256mem:$dst, VR256:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4i64 VR256:$src),
+                                              addr:$dst)],
+                                              IIC_SSE_MOVNT>, VEX, VEX_L;
+}
+
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntps\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntpd\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+// There is no AVX form for instructions below this point
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "movnti{l}\t{$src, $dst|$dst, $src}",
+                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+                 IIC_SSE_MOVNT>,
+               PS, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movnti{q}\t{$src, $dst|$dst, $src}",
+                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+                     IIC_SSE_MOVNT>,
+                  PS, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+
+  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+}
+
+// FIXME: How should flush instruction be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+               IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in {
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
+              OBXS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
+// to include any 64-bit target.
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+               PS, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+               TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+               TB, Requires<[HasMFence]>;
+} // SchedRW
+
+def : Pat<(X86MFence), (MFENCE)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+
+let Predicates = [UseSSE1] in {
+def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let hasSideEffects = 0, SchedRW = [WriteMove] in {
+def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX, VEX_L;
+def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteMove] in {
+def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>,
+                        VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>,
+                        VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX, VEX_L;
+let Predicates = [HasAVX] in {
+  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX;
+  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i128mem:$dst, VR128:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i256mem:$dst, VR256:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let hasSideEffects = 0 in {
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqa\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    hasSideEffects = 0, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVU_P_RM>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVU_P_MR>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+                (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
+                (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+                (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
+def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
+                (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+  IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1, NoVLX>;
+defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 1, NoVLX>;
+defm PADDSB  : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDSW  : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0, NoVLX>;
+defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 0, NoVLX>;
+defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+                              loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
+                               VR256, loadv4i64, i256mem, SSE_PMADD,
+                               0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
+                             memopv2i64, i128mem, SSE_PMADD>;
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
+                             loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+                             VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
+                             loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
+                             VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
+                            memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
+
+let Predicates = [HasAVX, NoVLX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
+                              VEX_4V;
+let Predicates = [HasAVX2, NoVLX] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+                               VR256, loadv4i64, i256mem,
+                               SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+                         string OpcodeStr, SDNode OpNode,
+                         SDNode OpNode2, RegisterClass RC,
+                         ValueType DstVT, ValueType SrcVT,
+                         PatFrag ld_frag, bit Is2Addr = 1> {
+  // src2 is always 128-bit
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+       SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1,
+                       (SrcVT (bitconvert (ld_frag addr:$src2))))))],
+       SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+       (ins RC:$src1, u8imm:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
+       SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
+}
+
+multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
+                             string OpcodeStr, SDNode OpNode,
+                             SDNode OpNode2, ValueType DstVT128,
+                             ValueType DstVT256, ValueType SrcVT,
+                             Predicate prd> {
+let Predicates = [HasAVX, prd] in
+  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+                              OpNode, OpNode2, VR128, DstVT128, SrcVT,
+                              loadv2i64, 0>, VEX_4V;
+let Predicates = [HasAVX2, prd] in
+  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
+                                OpNode, OpNode2, VR256, DstVT256, SrcVT,
+                                loadv2i64, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
+                           VR128, DstVT128, SrcVT, memopv2i64>;
+}
+
+multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
+                        SDNode OpNode, RegisterClass RC, ValueType VT,
+                        bit Is2Addr = 1> {
+  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
+       IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
+}
+
+multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
+                           SDNode OpNode> {
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+                             VR128, v16i8, 0>, VEX_4V;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
+                               VR256, v32i8, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+                                 v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+                                 v4i32, v8i32, v4i32, NoVLX>;
+  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+                                 v2i64, v4i64, v2i64, NoVLX>;
+
+  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+                                 v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+                                 v4i32, v8i32, v4i32, NoVLX>;
+  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+                                 v2i64, v4i64, v2i64, NoVLX>;
+
+  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+                                 v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
+  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+                                 v4i32, v8i32, v4i32, NoVLX>;
+
+  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
+  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
+  // PSRADQri doesn't exist in SSE[1-3].
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+                         SDNode OpNode, Predicate prd> {
+let Predicates = [HasAVX, prd] in {
+  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, u8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR128:$dst,
+                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+                      (ins i128mem:$src1, u8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR128:$dst,
+                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
+                  Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2, prd] in {
+  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+                       (ins VR256:$src1, u8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       [(set VR256:$dst,
+                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+                       (ins i256mem:$src1, u8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR256:$dst,
+                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
+                   Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+  def ri : Ii8<0x70, MRMSrcReg,
+               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
+  def mi : Ii8<0x70, MRMSrcMem,
+               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
+           Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+                             NoVLX_Or_NoBWI>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+                             NoVLX_Or_NoBWI>, XD;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
+            (VPSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+            (PSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Packed Integer Pack Instructions (SSE & AVX)
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+                     bit Is2Addr = 1> {
+  def rr : PDI<opc, MRMSrcReg,
+               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+               !if(Is2Addr,
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+               [(set VR128:$dst,
+                     (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+               Sched<[WriteShuffle]>;
+  def rm : PDI<opc, MRMSrcMem,
+               (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+               !if(Is2Addr,
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   !strconcat(OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+               [(set VR128:$dst,
+                     (OutVT (OpNode (ArgVT VR128:$src1),
+                                    (bitconvert (ld_frag addr:$src2)))))]>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                       ValueType ArgVT, SDNode OpNode> {
+  def Yrr : PDI<opc, MRMSrcReg,
+                (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR256:$dst,
+                      (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+                Sched<[WriteShuffle]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+                (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR256:$dst,
+                      (OutVT (OpNode (ArgVT VR256:$src1),
+                                     (bitconvert (loadv4i64 addr:$src2)))))]>,
+                Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+                     bit Is2Addr = 1> {
+  def rr : SS48I<opc, MRMSrcReg,
+                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                 !if(Is2Addr,
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     !strconcat(OpcodeStr,
+                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+                 [(set VR128:$dst,
+                       (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
+                 Sched<[WriteShuffle]>;
+  def rm : SS48I<opc, MRMSrcMem,
+                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                 !if(Is2Addr,
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     !strconcat(OpcodeStr,
+                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+                 [(set VR128:$dst,
+                       (OutVT (OpNode (ArgVT VR128:$src1),
+                                      (bitconvert (ld_frag addr:$src2)))))]>,
+                 Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
+                     ValueType ArgVT, SDNode OpNode> {
+  def Yrr : SS48I<opc, MRMSrcReg,
+                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                  !strconcat(OpcodeStr,
+                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR256:$dst,
+                        (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
+                  Sched<[WriteShuffle]>;
+  def Yrm : SS48I<opc, MRMSrcMem,
+                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                  !strconcat(OpcodeStr,
+                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR256:$dst,
+                        (OutVT (OpNode (ArgVT VR256:$src1),
+                                       (bitconvert (loadv4i64 addr:$src2)))))]>,
+                  Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
+                             loadv2i64, 0>, VEX_4V;
+  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
+                             loadv2i64, 0>, VEX_4V;
+
+  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
+                             loadv2i64, 0>, VEX_4V;
+  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
+                             loadv2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
+                               VEX_4V, VEX_L;
+  defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
+                               VEX_4V, VEX_L;
+
+  defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
+                               VEX_4V, VEX_L;
+  defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
+                               VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
+                            memopv2i64>;
+  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
+                            memopv2i64>;
+
+  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
+                            memopv2i64>;
+
+  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
+                            memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+                       SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
+  def rr : PDI<opc, MRMSrcReg,
+      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+  def rm : PDI<opc, MRMSrcMem,
+      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (vt (OpNode VR128:$src1,
+                                  (bitconvert (ld_frag addr:$src2)))))],
+                                               IIC_SSE_UNPCK>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+                         SDNode OpNode> {
+  def Yrr : PDI<opc, MRMSrcReg,
+      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+      Sched<[WriteShuffle]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1,
+                                  (bitconvert (loadv4i64 addr:$src2)))))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+                                 loadv2i64, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX] in {
+  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+                                 loadv2i64, 0>, VEX_4V;
+  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+                                 loadv2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
+                                   VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
+                                   VEX_4V, VEX_L;
+  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
+                                   VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+                                memopv2i64>;
+  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+                                memopv2i64>;
+  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+                                memopv2i64>;
+  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+                                memopv2i64>;
+
+  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+                                memopv2i64>;
+  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+                                memopv2i64>;
+  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+                                memopv2i64>;
+  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+                                memopv2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+  def rri : Ii8<0xC4, MRMSrcReg,
+       (outs VR128:$dst), (ins VR128:$src1,
+        GR32orGR64:$src2, u8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
+  def rmi : Ii8<0xC4, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        i16mem:$src2, u8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+                    imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+let Predicates = [HasAVX, NoBWI] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))]>, PD, VEX,
+                Sched<[WriteShuffle]>;
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
+                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))], IIC_SSE_PEXTRW>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+// Insert
+let Predicates = [HasAVX, NoBWI] in
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
+
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, PD;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
+           IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
+           VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
+           IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>;
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword/Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>,
+                      VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                  Sched<[WriteMove]>;
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
+def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteLoad]>;
+  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+                    Sched<[WriteMove]>;
+def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (extractelt (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                                     VEX, Sched<[WriteStore]>;
+def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                   Sched<[WriteMove]>;
+def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (extractelt (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)],
+                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
+
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let ExeDomain = SSEPackedInt in {
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+                                                        (iPTR 0)))],
+                                                           IIC_SSE_MOVD_ToGP>,
+                      VEX;
+
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
+                                                         (iPTR 0)))],
+                                                         IIC_SSE_MOVD_ToGP>;
+} //SchedRW
+
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+                          (ins i64mem:$dst, VR128:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
+def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in
+  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                          VEX, Sched<[WriteLoad]>;
+  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (bitconvert FR64:$src))],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "mov{d|q}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIrr GR32:$src)>;
+
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (VMOV64toPQIrr GR64:$src)>;
+
+    def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+              (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
+  }
+  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+  // These instructions also write zeros in the high part of a 256-bit register.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzload addr:$src)),
+              (VMOVDI2PDIrm addr:$src)>;
+    def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+              (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+    def : Pat<(v8i32 (X86vzload addr:$src)),
+              (SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
+  }
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (MOVDI2PDIrr GR32:$src)>;
+
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (MOV64toPQIrr GR64:$src)>;
+  }
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzload addr:$src)),
+              (MOVDI2PDIrm addr:$src)>;
+  }
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                    VEX, Requires<[UseAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+                      IIC_SSE_MOVDQ>, XS,
+                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // ExeDomain, SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (extractelt (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (extractelt (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>;
+} // ExeDomain, SchedRW
+
+// For disassembler only
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
+    SchedRW = [WriteVecLogic] in {
+def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
+}
+
+// Aliases to help the assembler pick two byte VEX encodings by swapping the
+// operands relative to the normal instructions to use VEX.R instead of VEX.B.
+def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}",
+                (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;
+
+let Predicates = [UseAVX], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VMOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (VMOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVQI2PQIrm addr:$src)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (MOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (MOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
+// IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, VEX, Requires<[UseAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, Requires<[UseSSE2]>;
+} // ExeDomain, SchedRW
+
+let AddedComplexity = 20 in {
+  let Predicates = [UseAVX] in {
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (VMOVZPQILo2PQIrr VR128:$src)>;
+  }
+  let Predicates = [UseSSE2] in {
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (MOVZPQILo2PQIrr VR128:$src)>;
+  }
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (VMOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (VMOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+            (VMOVSLDUPrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+            (VMOVSHDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+            (VMOVSHDUPYrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+            (VMOVSLDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+            (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (MOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (MOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
+                    IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (v2f64 (X86Movddup
+                              (scalar_to_vector (loadf64 addr:$src)))))],
+                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
+}
+
+// FIXME: Merge with above classe when there're patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+                    Sched<[WriteFShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
+                    Sched<[WriteLoad]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VMOVDDUPrm addr:$src)>;
+let Predicates = [HasAVX1Only] in
+def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VMOVDDUPrm addr:$src)>;
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+                   VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "lddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+                   IIC_SSE_LDDQU>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, OpndItins itins,
+                       PatFrag ld_frag, bit Is2Addr = 1> {
+  def rr : I<0xD0, MRMSrcReg,
+       (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : I<0xD0, MRMSrcMem,
+       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+  }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+  let ExeDomain = SSEPackedSingle in
+  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+                              f128mem, SSE_ALU_F32P, memopv4f32>, XD;
+  let ExeDomain = SSEPackedDouble in
+  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+                              f128mem, SSE_ALU_F64P, memopv2f64>, PD;
+}
+
+// Patterns used to select 'addsub' instructions.
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
+            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
+            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+
+  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
+            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
+            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
+  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
+            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
+            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
+            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
+            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
+            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+                   bit Is2Addr = 1> {
+  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
+                  bit Is2Addr = 1> {
+  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+                            X86fhadd, loadv4f32, 0>, VEX_4V;
+    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+                            X86fhsub, loadv4f32, 0>, VEX_4V;
+    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+                            X86fhadd, loadv2f64, 0>, VEX_4V;
+    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+                            X86fhsub, loadv2f64, 0>, VEX_4V;
+    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let ExeDomain = SSEPackedSingle in {
+    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
+                          memopv4f32>;
+    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
+                          memopv4f32>;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
+                         memopv2f64>;
+    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
+                         memopv2f64>;
+  }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
+                        SDNode OpNode, PatFrag ld_frag> {
+  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (vt (OpNode VR128:$src)))],
+                 IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
+
+  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                 (ins i128mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst,
+                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
+                 IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
+                          SDNode OpNode> {
+  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+                  (ins VR256:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
+                  Sched<[WriteVecALU]>;
+
+  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+                  (ins i256mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst,
+                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+                  Sched<[WriteVecALULd]>;
+}
+
+// Helper fragments to match sext vXi1 to vXiY.
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
+                                               VR128:$src))>;
+def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
+def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
+                                               VR256:$src))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
+  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
+}
+let Predicates = [HasAVX, NoVLX] in {
+  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(xor
+            (bc_v2i64 (v16i1sextv16i8)),
+            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+            (VPABSBrr VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v8i1sextv8i16)),
+            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+            (VPABSWrr VR128:$src)>;
+}
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(xor
+            (bc_v2i64 (v4i1sextv4i32)),
+            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+            (VPABSDrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
+  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  def : Pat<(xor
+            (bc_v4i64 (v32i1sextv32i8)),
+            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
+            (VPABSBYrr VR256:$src)>;
+  def : Pat<(xor
+            (bc_v4i64 (v16i1sextv16i16)),
+            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
+            (VPABSWYrr VR256:$src)>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(xor
+            (bc_v4i64 (v8i1sextv8i32)),
+            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
+            (VPABSDYrr VR256:$src)>;
+}
+
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
+
+let Predicates = [UseSSSE3] in {
+  def : Pat<(xor
+            (bc_v2i64 (v16i1sextv16i8)),
+            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
+            (PABSBrr VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v8i1sextv8i16)),
+            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
+            (PABSWrr VR128:$src)>;
+  def : Pat<(xor
+            (bc_v2i64 (v4i1sextv4i32)),
+            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
+            (PABSDrr VR128:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (DstVT (OpNode (OpVT RC:$src1),
+          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId128, OpndItins itins,
+                             PatFrag ld_frag, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       Sched<[itins.Sched]>;
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1,
+          (bitconvert (ld_frag addr:$src2))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId256,
+                               X86FoldableSchedWrite Sched> {
+  let isCommutable = 1 in
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+       Sched<[Sched]>;
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+       Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
+                                  VR128, loadv2i64, i128mem,
+                                  SSE_PSHUFB, 0>, VEX_4V;
+  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
+                                  v16i8, VR128, loadv2i64, i128mem,
+                                  SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
+                                  VR128, loadv2i64, i128mem,
+                                  SSE_PMULHRSW, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
+                                  loadv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
+                                      int_x86_ssse3_psign_b_128,
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
+                                      int_x86_ssse3_psign_w_128,
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
+                                      int_x86_ssse3_psign_d_128,
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
+                                      int_x86_ssse3_phadd_sw_128,
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
+                                      int_x86_ssse3_phsub_sw_128,
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+}
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let isCommutable = 0 in {
+  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
+                                   v32i8, VR256, loadv4i64, i256mem,
+                                   SSE_PMADD, 0>, VEX_4V, VEX_L;
+}
+defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
+                                  loadv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNBY   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPSIGNWY   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPSIGNDY   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+                                        int_x86_avx2_phadd_sw,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+                                        int_x86_avx2_phsub_sw,
+                                        WriteVecALU>, VEX_4V, VEX_L;
+}
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
+                                     SSE_PSIGN, memopv2i64>;
+  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
+                                     SSE_PSIGN, memopv2i64>;
+  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
+                                     SSE_PSIGN, memopv2i64>;
+  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
+                                 memopv2i64, i128mem, SSE_PSHUFB>;
+  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
+                                     int_x86_ssse3_phadd_sw_128,
+                                     SSE_PHADDSUBSW, memopv2i64>;
+  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
+                                     int_x86_ssse3_phsub_sw_128,
+                                     SSE_PHADDSUBSW, memopv2i64>;
+  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
+                                 v16i8, VR128, memopv2i64, i128mem,
+                                 SSE_PMADD>;
+}
+defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
+                                 VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+  let hasSideEffects = 0 in {
+  def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+  let hasSideEffects = 0 in {
+  def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+let Predicates = [HasAVX] in
+  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+  defm PALIGNR : ssse3_palignr<"palignr">;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+                Requires<[HasSSE3]>;
+}
+
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+                 TB, Requires<[HasSSE3]>;
+
+let Uses = [ECX, EAX] in
+def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+                TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
+      Requires<[Not64BitMode]>;
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
+      Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+                          RegisterClass OutRC, RegisterClass InRC,
+                          OpndItins itins> {
+  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [], itins.rr>,
+                 Sched<[itins.Sched]>;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [],
+                 itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
+                          X86MemOperand MemOp, X86MemOperand MemYOp,
+                          OpndItins SSEItins, OpndItins AVXItins,
+                          OpndItins AVX2Itins, Predicate prd> {
+  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
+  let Predicates = [HasAVX, prd] in
+    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
+                                     VR128, VR128, AVXItins>, VEX;
+  let Predicates = [HasAVX2, prd] in
+    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
+                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
+}
+
+multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
+                          X86MemOperand MemYOp, Predicate prd> {
+  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
+                                        MemOp, MemYOp,
+                                        SSE_INTALU_ITINS_SHUFF_P,
+                                        DEFAULT_ITINS_SHUFFLESCHED,
+                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
+  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
+                                        !strconcat("pmovzx", OpcodeStr),
+                                        MemOp, MemYOp,
+                                        SSE_INTALU_ITINS_SHUFF_P,
+                                        DEFAULT_ITINS_SHUFFLESCHED,
+                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
+}
+
+defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
+defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
+defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
+
+defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
+defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
+
+defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
+
+// AVX2 Patterns
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+  // Register-Register patterns
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
+  }
+
+  // Simple Register-Memory patterns
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  }
+
+  // AVX2 Register-Memory patterns
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
+
+  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
+  }
+}
+
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+
+// SSE4.1/AVX patterns.
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
+                                SDNode ExtOp, PatFrag ExtLoad16> {
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
+            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
+            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
+            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
+
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
+
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
+  }
+}
+
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+
+let Predicates = [UseSSE41] in {
+  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                 (ins VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+                                         imm:$src2))]>,
+                  Sched<[WriteShuffle]>;
+  let hasSideEffects = 0, mayStore = 1,
+      SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
+                                                 imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+
+defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                   (ins VR128:$src1, u8imm:$src2),
+                   !strconcat(OpcodeStr,
+                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, Sched<[WriteShuffle]>;
+
+  let hasSideEffects = 0, mayStore = 1,
+      SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
+                                                  imm:$src2)))), addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
+                  Sched<[WriteShuffle]>;
+  let SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+                          addr:$dst)]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+                 (ins VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR64:$dst,
+                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
+                  Sched<[WriteShuffle]>, REX_W;
+  let SchedRW = [WriteShuffleLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, REX_W;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+                            OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                 (ins VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+                    itins.rr>, Sched<[WriteFBlend]>;
+  let SchedRW = [WriteFBlendLd, WriteRMW] in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+                          addr:$dst)], itins.rm>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [UseAVX] in
+    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoBWI] in
+  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoDQI] in
+  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+      Sched<[WriteFShuffle]>;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insertps VR128:$src1,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                    imm:$src3))], itins.rm>,
+      Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [UseAVX] in
+    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+  let Constraints = "$src1 = $dst" in
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
+}
+
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                           X86MemOperand x86memop, RegisterClass RC,
+                           PatFrag mem_frag32, PatFrag mem_frag64,
+                           Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : SS4AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+  // Vector intrinsic operation, mem
+  def PSm : SS4AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : SS4AIi8<opcpd, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+
+  // Vector intrinsic operation, mem
+  def PDm : SS4AIi8<opcpd, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+                          string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+        !strconcat(OpcodeStr,
+             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+                           string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAdd]>;
+
+  let mayLoad = 1 in
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+                    !strconcat(OpcodeStr,
+                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
+  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        Sched<[WriteFAdd]>;
+
+  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+        Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        Sched<[WriteFAdd]>;
+
+  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+let Predicates = [HasAVX] in {
+  // Intrinsic form
+  defm VROUND  : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+                                 loadv4f32, loadv2f64,
+                                 int_x86_sse41_round_ps,
+                                 int_x86_sse41_round_pd>, VEX;
+  defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+                                 loadv8f32, loadv4f64,
+                                 int_x86_avx_round_ps_256,
+                                 int_x86_avx_round_pd_256>, VEX, VEX_L;
+  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+                                 int_x86_sse41_round_ss,
+                                 int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;                                 
+}
+
+let Predicates = [UseAVX] in {
+  def : Pat<(ffloor FR32:$src),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x9))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0xA))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0xB))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x9))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0xA))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0xB))>;
+
+  def : Pat<(v8f32 (ffloor VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x9))>;
+  def : Pat<(v8f32 (fnearbyint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0xC))>;
+  def : Pat<(v8f32 (fceil VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0xA))>;
+  def : Pat<(v8f32 (frint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x4))>;
+  def : Pat<(v8f32 (ftrunc VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0xB))>;
+
+  def : Pat<(v4f64 (ffloor VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x9))>;
+  def : Pat<(v4f64 (fnearbyint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0xC))>;
+  def : Pat<(v4f64 (fceil VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0xA))>;
+  def : Pat<(v4f64 (frint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x4))>;
+  def : Pat<(v4f64 (ftrunc VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0xB))>;
+}
+
+defm ROUND  : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+                              memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+                              int_x86_sse41_round_pd>;
+
+defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
+let Constraints = "$src1 = $dst" in
+defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round",
+                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(ffloor FR32:$src),
+            (ROUNDSSr FR32:$src, (i32 0x9))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (ROUNDSDr FR64:$src, (i32 0x9))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (ROUNDSSr FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (ROUNDSDr FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (ROUNDSSr FR32:$src, (i32 0xA))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (ROUNDSDr FR64:$src, (i32 0xA))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (ROUNDSSr FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (ROUNDSDr FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (ROUNDSSr FR32:$src, (i32 0xB))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (ROUNDSDr FR64:$src, (i32 0xB))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x9))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0xA))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0xB))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x9))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0xA))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0xB))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction we'll lower to this in X86ISelLowering primarily from
+// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+                Sched<[WriteVecLogic]>, VEX;
+def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+                Sched<[WriteVecLogic]>, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+              Sched<[WriteVecLogic]>;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+              Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
+            Sched<[WriteVecLogic]>, VEX;
+  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
+                            VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
+                            VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+                     OpSize16, XS;
+  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                      Sched<[WriteFAddLd]>, OpSize16, XS;
+
+  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
+                     OpSize32, XS;
+
+  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                      Sched<[WriteFAddLd]>, OpSize32, XS;
+
+  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
+  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
+                       Sched<[WriteFAddLd]>, XS;
+}
+
+
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId128, PatFrag ld_frag,
+                                 X86FoldableSchedWrite Sched> {
+  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
+                    Sched<[Sched]>;
+  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                     (ins i128mem:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set VR128:$dst,
+                       (IntId128 (bitconvert (ld_frag addr:$src))))]>,
+                    Sched<[Sched.Folded]>;
+}
+
+// PHMIN has the same profile as PSAD, thus we use the same scheduling
+// model, although the naming is misleading.
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+                                         int_x86_sse41_phminposuw, loadv2i64,
+                                         WriteVecIMul>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+                                         int_x86_sse41_phminposuw, memopv2i64,
+                                         WriteVecIMul>;
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1,
+                          OpndItins itins = SSE_INTALU_ITINS_P> {
+  let isCommutable = 1 in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+                                   VR128, loadv2i64, i128mem,
+                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+}
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
+                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+}
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+                                  VR128, memopv2i64, i128mem,
+                                  SSE_INTMUL_ITINS_P, 1>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+                                 loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
+                                 VEX_4V;
+  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                 VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+                                  loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
+                                  VEX_4V, VEX_L;
+  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
+  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop, bit Is2Addr = 1,
+                 OpndItins itins = DEFAULT_ITINS> {
+  let isCommutable = 1 in
+  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
+        Sched<[itins.Sched]>;
+  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
+        Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                           X86MemOperand x86memop, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
+  let isCommutable = 1 in
+  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+        itins.rr>, Sched<[itins.Sched]>;
+  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst,
+          (OpVT (OpNode RC:$src1,
+                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+        Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let isCommutable = 0 in {
+    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+                                        VR128, loadv2i64, i128mem, 0,
+                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+  }
+
+  let ExeDomain = SSEPackedSingle in {
+  defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+                                  VR128, loadv4f32, f128mem, 0,
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+                                   VR256, loadv8f32, f256mem, 0,
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+  defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+                                  VR128, loadv2f64, f128mem, 0,
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+                                   VR256, loadv4f64, f256mem, 0,
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+  }
+  defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+                                  VR128, loadv2i64, i128mem, 0,
+                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+                                   VR128, loadv4f32, f128mem, 0,
+                                   SSE_DPPS_ITINS>, VEX_4V;
+  let ExeDomain = SSEPackedDouble in
+  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+                                   VR128, loadv2f64, f128mem, 0,
+                                   SSE_DPPS_ITINS>, VEX_4V;
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+                                    VR256, loadv8f32, i256mem, 0,
+                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in {
+  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+                                  VR256, loadv4i64, i256mem, 0,
+                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+  }
+  defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+                                   VR256, loadv4i64, i256mem, 0,
+                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in {
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_MPSADBW_ITINS>;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
+                                 VR128, memopv4f32, f128mem,
+                                 1, SSE_INTALU_ITINS_FBLEND_P>;
+  let ExeDomain = SSEPackedDouble in
+  defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+                                 VR128, memopv2f64, f128mem,
+                                 1, SSE_INTALU_ITINS_FBLEND_P>;
+  defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+                                 VR128, memopv2i64, i128mem,
+                                 1, SSE_INTALU_ITINS_BLEND_P>;
+  let ExeDomain = SSEPackedSingle in
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+                                  VR128, memopv4f32, f128mem, 1,
+                                  SSE_DPPS_ITINS>;
+  let ExeDomain = SSEPackedDouble in
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+                                  VR128, memopv2f64, f128mem, 1,
+                                  SSE_DPPD_ITINS>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+                                    RegisterClass RC, X86MemOperand x86memop,
+                                    PatFrag mem_frag, Intrinsic IntId,
+                                    X86FoldableSchedWrite Sched> {
+  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
+                  (ins RC:$src1, RC:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
+                Sched<[Sched]>;
+
+  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
+                  (ins RC:$src1, x86memop:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst,
+                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                               RC:$src3))],
+                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
+                Sched<[Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                           loadv2f64, int_x86_sse41_blendvpd,
+                                           WriteFVarBlend>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                  loadv4f64, int_x86_avx_blendv_pd_256,
+                                  WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+                                           loadv4f32, int_x86_sse41_blendvps,
+                                           WriteFVarBlend>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+                                  loadv8f32, int_x86_avx_blendv_ps_256,
+                                  WriteFVarBlend>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                           loadv2i64, int_x86_sse41_pblendvb,
+                                           WriteVarBlend>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                      loadv4i64, int_x86_avx2_pblendvb,
+                                      WriteVarBlend>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                            (v8i32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+                            (v8f32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                            (v4i64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+                            (v4f64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+                            (v32i8 VR256:$src2))),
+            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
+
+// Patterns
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+  }
+
+  // These will incur an FP/int domain crossing penalty, but it may be the only
+  // way without AVX2. Do not add any complexity because we may be able to match
+  // more optimal patterns defined earlier in this file.
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
+let Predicates = [UseSSE41], AddedComplexity = 15 in {
+  // With SSE41 we can use blends for these patterns.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
+}
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                               X86MemOperand x86memop, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
+    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+                    itins.rr>, Sched<[itins.Sched]>;
+
+    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, x86memop:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst,
+                      (IntId VR128:$src1,
+                       (bitconvert (mem_frag addr:$src2)), XMM0))],
+                       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+                                  int_x86_sse41_blendvpd,
+                                  DEFAULT_ITINS_FBLENDSCHED>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+                                  int_x86_sse41_blendvps,
+                                  DEFAULT_ITINS_FBLENDSCHED>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+                                  int_x86_sse41_pblendvb,
+                                  DEFAULT_ITINS_VARBLENDSCHED>;
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX, NoVLX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "vmovntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                       VEX;
+let Predicates = [HasAVX2, NoVLX] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                         "vmovntdqa\t{$src, $dst|$dst, $src}",
+                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+                         VEX, VEX_L;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "movntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
+
+let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1> {
+  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
+}
+
+let Predicates = [HasAVX] in
+  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+                                 loadv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+                                memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+                                                  imm:$src3))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+                       (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
+                         Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
+                         Requires<[UseSSE42]>;
+}
+
+multiclass pcmpistrm_SS42AI<string asm> {
+  def rr : SS42AI<0x62, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrM]>;
+  let mayLoad = 1 in
+  def rm :SS42AI<0x62, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+                       (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
+                         Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
+                         Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+  def rr : SS42AI<0x60, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrM]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x60, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+                              (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
+                      Requires<[HasAVX]>;
+  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
+                      Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+  def rr : SS42AI<0x63, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrI]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x63, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
+       imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
+                      Requires<[HasAVX]>;
+  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
+                      Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+  def rr : SS42AI<0x61, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrI]>;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x61, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// crc intrinsic instruction
+// This set of instructions are only rm, the only difference is the size
+// of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+                   RegisterClass RCIn, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
+         Sched<[WriteFAdd]>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+                   X86MemOperand x86memop, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+                                 int_x86_sse42_crc32_32_16>, OpSize16;
+  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+                                 int_x86_sse42_crc32_32_16>, OpSize16;
+  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+                                 int_x86_sse42_crc32_32_32>, OpSize32;
+  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+                                 int_x86_sse42_crc32_32_32>, OpSize32;
+  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  let hasSideEffects = 0 in {
+    let mayLoad = 1 in
+    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+                                   null_frag>, REX_W;
+    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+                                   null_frag>, REX_W;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+                      bit UsesXMM0 = 0> {
+  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+                            (i8 imm:$src3)))]>, TA;
+  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1,
+                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (i8 imm:$src3)))]>, TA;
+
+  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+  let Uses=[XMM0] in
+  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
+}
+
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+                             PatFrag ld_frag, bit Is2Addr = 1> {
+  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       Sched<[WriteAESDecEnc]>;
+  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
+       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, HasAES] in {
+  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
+                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
+                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
+                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
+                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
+                         int_x86_aesni_aesenc, memopv2i64>;
+  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
+                         int_x86_aesni_aesenclast, memopv2i64>;
+  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
+                         int_x86_aesni_aesdec, memopv2i64>;
+  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
+                         int_x86_aesni_aesdeclast, memopv2i64>;
+}
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
+      VEX;
+  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+      Sched<[WriteAESIMCLd]>, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  Sched<[WriteAESIMCLd]>;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, u8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+      Sched<[WriteAESKeyGen]>, VEX;
+  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1, u8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+      Sched<[WriteAESKeyGenLd]>, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1, u8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+  Sched<[WriteAESKeyGen]>;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1, u8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+  Sched<[WriteAESKeyGenLd]>;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less Multiplication instructions
+let isCommutable = 1 in
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+           Sched<[WriteCLMul]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (loadv2i64 addr:$src2), imm:$src3))]>,
+           Sched<[WriteCLMulLd, ReadAfterLd]>;
+
+// Carry-less Multiplication instructions
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (memopv2i64 addr:$src2), imm:$src3))],
+                              IIC_SSE_PCLMULQDQ_RM>,
+           Sched<[WriteCLMulLd, ReadAfterLd]>;
+} // Constraints = "$src1 = $dst"
+
+
+multiclass pclmul_alias<string asm, int immop> {
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
+
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+                  0>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+                  0>;
+}
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let ExeDomain = SSEPackedInt in {
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
+                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
+                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
+                                    imm:$idx))]>, PD;
+def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src, VR128:$mask),
+              "extrq\t{$mask, $src|$src, $mask}",
+              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+                                 VR128:$mask))]>, PD;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
+                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
+                                      imm:$len, imm:$idx))]>, XD;
+def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src, VR128:$mask),
+                 "insertq\t{$mask, $src|$src, $mask}",
+                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+                                    VR128:$mask))]>, XD;
+}
+} // ExeDomain = SSEPackedInt
+
+// Non-temporal (unaligned) scalar stores.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let mayStore = 1, SchedRW = [WriteStore] in {
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+                "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+} // SchedRW
+
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+          (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+          (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+} // AddedComplexity
+} // HasSSE4A
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+//              destination operand
+//
+class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                           X86MemOperand x86memop, ValueType VT,
+                           PatFrag ld_frag, SchedWrite Sched> :
+  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+        Sched<[Sched]>, VEX;
+
+// AVX2 adds register forms
+class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
+  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
+         Sched<[Sched]>, VEX;
+
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
+  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
+                                             f32mem, v4f32, loadf32, WriteLoad>;
+  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
+                                             f32mem, v8f32, loadf32,
+                                             WriteFShuffleLd>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
+def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
+                                    v4f64, loadf64, WriteFShuffleLd>, VEX_L;
+
+let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
+  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
+                                          v4f32, v4f32, WriteFShuffle>;
+  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
+                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
+def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
+                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
+//                  halves of a 256-bit vector.
+//
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
+def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
+                           (ins i128mem:$src),
+                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
+                           Sched<[WriteLoad]>, VEX, VEX_L;
+
+let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
+def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
+                           (ins f128mem:$src),
+                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
+                           Sched<[WriteFShuffleLd]>, VEX, VEX_L;
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+          (VBROADCASTI128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+          (VBROADCASTI128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+          (VBROADCASTI128 addr:$src)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
+          (VBROADCASTF128 addr:$src)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+          (VBROADCASTF128 addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
+                            PatFrag memop_frag> {
+  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
+                                   (iPTR imm)),
+            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
+                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
+  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
+                                    (From (bitconvert (memop_frag addr:$src2))),
+                                    (iPTR imm)),
+            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
+  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
+}
+
+let Predicates = [HasAVX1Only] in {
+  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, u8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, Sched<[WriteStore]>, VEX, VEX_L;
+}
+
+multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
+  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
+            (To (!cast<Instruction>(InstrStr#rr)
+                                    (From VR256:$src1),
+                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
+                                                 (iPTR imm))), addr:$dst),
+            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
+             (EXTRACT_get_vextract128_imm VR128:$ext))>;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX, NoVLX] in {
+  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
+  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
+}
+
+let Predicates = [HasAVX1Only] in {
+  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
+  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
+  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+                          Intrinsic IntLd, Intrinsic IntLd256,
+                          Intrinsic IntSt, Intrinsic IntSt256> {
+  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, f128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+             VEX_4V;
+  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, f256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+                                 int_x86_avx_maskload_ps,
+                                 int_x86_avx_maskload_ps_256,
+                                 int_x86_avx_maskstore_ps,
+                                 int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+                                 int_x86_avx_maskload_pd,
+                                 int_x86_avx_maskload_pd_256,
+                                 int_x86_avx_maskstore_pd,
+                                 int_x86_avx_maskstore_pd_256>;
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+                      RegisterClass RC, X86MemOperand x86memop_f,
+                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      ValueType f_vt, ValueType i_vt> {
+  let Predicates = [HasAVX, NoVLX] in {
+    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+               Sched<[WriteFShuffle]>;
+    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+               (ins RC:$src1, x86memop_i:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
+                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+               Sched<[WriteFShuffleLd, ReadAfterLd]>;
+
+    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, u8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+             Sched<[WriteFShuffle]>;
+    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+             (ins x86memop_f:$src1, u8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+             Sched<[WriteFShuffleLd]>;
+  }// Predicates = [HasAVX, NoVLX]
+}
+
+let ExeDomain = SSEPackedSingle in {
+  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+                               loadv2i64, v4f32, v4i32>;
+  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+                               loadv4i64, v8f32, v8i32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+                               loadv2i64, v2f64, v2i64>;
+  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+                               loadv4i64, v4f64, v4i64>, VEX_L;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+          (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
+def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
+                               (i8 imm:$imm))),
+          (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+          (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+          (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
+          (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+let ExeDomain = SSEPackedSingle in {
+let isCommutable = 1 in
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+          Sched<[WriteFShuffle]>;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+          Sched<[WriteFShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+  // Zero All YMM registers
+  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+
+  // Zero Upper bits of YMM registers
+  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set RC:$dst, (Int VR128:$src))]>,
+             T8PD, VEX, Sched<[WriteCvtF2F]>;
+  let hasSideEffects = 0, mayLoad = 1 in
+  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
+             Sched<[WriteCvtF2FLd]>;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+               (ins RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+               TAPD, VEX, Sched<[WriteCvtF2F]>;
+  let hasSideEffects = 0, mayStore = 1,
+      SchedRW = [WriteCvtF2FLd, WriteRMW] in
+  def mr : Ii8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               TAPD, VEX;
+}
+
+let Predicates = [HasF16C] in {
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
+              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VCVTPH2PSrm addr:$src)>;
+
+  def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
+                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+                   addr:$dst),
+                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+  def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
+                  (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
+                   addr:$dst),
+                   (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+  def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
+                   addr:$dst),
+                   (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+}
+
+// Patterns for  matching conversions from float to half-float and vice versa.
+let Predicates = [HasF16C, NoVLX] in {
+  // Use MXCSR.RC for rounding instead of explicitly specifying the default
+  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
+  // configurations we support (the default). However, falling back to MXCSR is
+  // more consistent with other instructions, which are always controlled by it.
+  // It's encoded as 0b100.
+  def : Pat<(fp_to_f16 FR32:$src),
+            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
+              (COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;
+
+  def : Pat<(f16_to_fp GR16:$src),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
+
+  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
+            (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
+              (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
+multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop> {
+  let isCommutable = 1 in
+  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+        Sched<[WriteBlend]>, VEX_4V;
+  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst,
+          (OpVT (OpNode RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+}
+
+defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+                               VR128, loadv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+                                VR256, loadv4i64, i256mem>, VEX_L;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+//               destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
+  let Predicates = [HasAVX2, prd] in {
+    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst,
+                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+                  Sched<[WriteShuffle]>, VEX;
+    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst,
+                   (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+                  Sched<[WriteLoad]>, VEX;
+    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst,
+                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
+                   Sched<[WriteShuffle256]>, VEX, VEX_L;
+    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst,
+                    (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+                   Sched<[WriteLoad]>, VEX, VEX_L;
+
+    // Provide aliases for broadcast from the same register class that
+    // automatically does the extract.
+    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
+              (!cast<Instruction>(NAME#"Yrr")
+                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
+  }
+}
+
+defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
+defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
+defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+                                    v4i32, v8i32, NoVLX>;
+defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+                                    v2i64, v4i64, NoVLX>;
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+  // This means we'll encounter truncated i32 loads; match that here.
+  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWYrm addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast
+              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+            (VPBROADCASTWYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+  // Provide aliases for broadcast from the same register class that
+  // automatically does the extract.
+  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+                                                    sub_xmm)))>;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+        (VPBROADCASTBrr (COPY_TO_REGCLASS
+                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                             GR8:$src, sub_8bit)),
+                         VR128))>;
+  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+        (VPBROADCASTBYrr (COPY_TO_REGCLASS
+                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                              GR8:$src, sub_8bit)),
+                          VR128))>;
+
+  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+        (VPBROADCASTWrr (COPY_TO_REGCLASS
+                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                             GR16:$src, sub_16bit)),
+                         VR128))>;
+  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+        (VPBROADCASTWYrr (COPY_TO_REGCLASS
+                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                              GR16:$src, sub_16bit)),
+                          VR128))>;
+}
+let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+            (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+            (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+  // The patterns for VPBROADCASTD are not needed because they would match
+  // the exact same thing as VBROADCASTSS patterns.
+
+  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+        (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+}
+
+// AVX1 broadcast patterns
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+}
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+let Predicates = [HasAVX, NoVLX] in {
+  // 128bit broadcasts:
+  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+
+  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
+              (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                     ValueType OpVT, X86FoldableSchedWrite Sched> {
+  let Predicates = [HasAVX2, NoVLX] in {
+    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+                     (ins VR256:$src1, VR256:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+                     Sched<[Sched]>, VEX_4V, VEX_L;
+    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+                     (ins VR256:$src1, i256mem:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermv VR256:$src1,
+                              (bitconvert (mem_frag addr:$src2)))))]>,
+                     Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
+  }
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                         ValueType OpVT, X86FoldableSchedWrite Sched> {
+  let Predicates = [HasAVX2, NoVLX] in {
+    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+                       (ins VR256:$src1, u8imm:$src2),
+                       !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       [(set VR256:$dst,
+                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+                       Sched<[Sched]>, VEX, VEX_L;
+    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+                       (ins i256mem:$src1, u8imm:$src2),
+                       !strconcat(OpcodeStr,
+                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       [(set VR256:$dst,
+                         (OpVT (X86VPermi (mem_frag addr:$src1),
+                                (i8 imm:$src2))))]>,
+                       Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
+  }
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
+                            WriteShuffle256>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
+                             WriteFShuffle256>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+//
+let isCommutable = 1 in
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+          VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
+                             (i8 imm:$src3)))]>,
+          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let hasSideEffects = 0 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, u8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+          Sched<[WriteShuffle256]>, VEX, VEX_L;
+let hasSideEffects = 0, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+          Sched<[WriteStore]>, VEX, VEX_L;
+
+let Predicates = [HasAVX2, NoVLX] in {
+  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
+  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
+  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+                         Intrinsic IntLd128, Intrinsic IntLd256,
+                         Intrinsic IntSt128, Intrinsic IntSt256> {
+  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+                                int_x86_avx2_maskload_d,
+                                int_x86_avx2_maskload_d_256,
+                                int_x86_avx2_maskstore_d,
+                                int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+                                int_x86_avx2_maskload_q,
+                                int_x86_avx2_maskload_q_256,
+                                int_x86_avx2_maskstore_q,
+                                int_x86_avx2_maskstore_q_256>, VEX_W;
+
+multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
+                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+    // masked store
+    def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
+             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
+    // masked load
+    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
+             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
+                              (VT (bitconvert (ZeroVT immAllZerosV))))),
+             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
+    def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
+             (!cast<Instruction>(BlendStr#"rr")
+                 RC:$src0,
+                 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
+                 RC:$mask)>;
+}
+let Predicates = [HasAVX] in {
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+}
+let Predicates = [HasAVX1Only] in {
+  // load/store i32/i64 not supported use ps/pd version
+  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+let Predicates = [HasAVX2] in {
+  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
+  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
+  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+}
+
+//===----------------------------------------------------------------------===//
+// SubVector Broadcasts
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+
+let Predicates = [HasAVX2, NoVLX] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+          (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v16i8 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2f64 VR128:$src), 1)>;
+def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4f32 VR128:$src), 1)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v2i64 VR128:$src), 1)>;
+def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v4i32 VR128:$src), 1)>;
+def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v8i16 VR128:$src), 1)>;
+def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
+          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
+                         (v16i8 VR128:$src), 1)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType vt128, ValueType vt256> {
+  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+             VEX_4V, Sched<[WriteVarVecShift]>;
+  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1,
+                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1,
+                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX2, NoVLX] in {
+  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
+            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86vsrav VR128:$src1,
+                    (bitconvert (loadv2i64 addr:$src2)))),
+            (VPSRAVDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
+            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86vsrav VR256:$src1,
+                    (bitconvert (loadv4i64 addr:$src2)))),
+            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+                       X86MemOperand memop128, X86MemOperand memop256> {
+  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
+            (ins VR128:$src1, memop128:$src2, VR128:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX;
+  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
+            (ins RC256:$src1, memop256:$src2, RC256:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX, VEX_L;
+}
+
+let mayLoad = 1, hasSideEffects = 0, Constraints
+  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+  in {
+  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
+  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
+  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
+  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
+
+  let ExeDomain = SSEPackedDouble in {
+    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
+    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
+  }
+
+  let ExeDomain = SSEPackedSingle in {
+    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
+    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for FR128, f128, f128mem
+
+// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
+def : Pat<(store (f128 FR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+
+def : Pat<(loadf128 addr:$src),
+          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS
+           (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+           FR128)>;
+
+def : Pat<(X86fand FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(and FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS
+           (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+           FR128)>;
+
+def : Pat<(X86for FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                   (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(or FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                   (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS
+           (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
+           FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(xor FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS
+           (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 000000000000..c847be7ec099
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 000000000000..e2be73532157
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,970 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+                   "shl{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+                   "shl{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+                   OpSize16;
+def SHL32ri  : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+                   "shl{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>,
+                   OpSize32;
+def SHL64ri  : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+                    (ins GR64:$src1, u8imm:$src2),
+                    "shl{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+} // isConvertibleToThreeAddress = 1
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in {
+def SHL8r1   : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1  : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t$dst", [], IIC_SR>, OpSize16;
+def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t$dst", [], IIC_SR>, OpSize32;
+def SHL64r1  : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize16;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize32;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHL8mi   : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
+                   "shl{b}\t{$src, $dst|$dst, $src}",
+                [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHL16mi  : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src),
+                   "shl{w}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHL32mi  : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
+                   "shl{l}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
+                  "shl{q}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHL8m1   : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t$dst",
+                [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHL16m1  : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t$dst",
+               [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHL32m1  : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t$dst",
+               [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t$dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+}
+
+def SHR8ri   : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2),
+                   "shr{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+                   "shr{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def SHR32ri  : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+                   "shr{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2),
+                  "shr{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1
+def SHR8r1   : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shr{b}\t$dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1  : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t$dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16;
+def SHR32r1  : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t$dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32;
+def SHR64r1  : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shr{q}\t$dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize16;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize32;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHR8mi   : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
+                   "shr{b}\t{$src, $dst|$dst, $src}",
+                [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHR16mi  : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src),
+                   "shr{w}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHR32mi  : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
+                   "shr{l}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
+                  "shr{q}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHR8m1   : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t$dst",
+                [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHR16m1  : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t$dst",
+               [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SHR32m1  : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t$dst",
+               [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t$dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (sra GR8:$src1, CL))],
+                 IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (sra GR16:$src1, CL))],
+                 IIC_SR>, OpSize16;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (sra GR32:$src1, CL))],
+                 IIC_SR>, OpSize32;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
+                 [(set GR64:$dst, (sra GR64:$src1, CL))],
+                 IIC_SR>;
+}
+
+def SAR8ri   : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+                   "sar{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+                   "sar{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def SAR32ri  : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+                   "sar{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def SAR64ri  : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+                    (ins GR64:$src1, u8imm:$src2),
+                    "sar{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t$dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def SAR16r1  : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t$dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize16;
+def SAR32r1  : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t$dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+                 IIC_SR>, OpSize32;
+def SAR64r1  : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t$dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+                 IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
+                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+}
+def SAR8mi   : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
+                   "sar{b}\t{$src, $dst|$dst, $src}",
+                [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SAR16mi  : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src),
+                   "sar{w}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SAR32mi  : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
+                   "sar{l}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SAR64mi  : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
+                    "sar{q}\t{$src, $dst|$dst, $src}",
+                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SAR8m1   : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t$dst",
+                [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SAR16m1  : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t$dst",
+               [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize16;
+def SAR32m1  : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+                 "sar{l}\t$dst",
+               [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>, OpSize32;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+                  "sar{q}\t$dst",
+                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+                "rcl{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize16;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+                "rcr{l}\t$dst", [], IIC_SR>, OpSize32;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+}
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+                   "rol{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+                   "rol{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize16;
+def ROL32ri  : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+                   "rol{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize32;
+def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+                    (ins GR64:$src1, u8imm:$src2),
+                    "rol{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t$dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROL16r1  : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t$dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize16;
+def ROL32r1  : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t$dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+                 IIC_SR>, OpSize32;
+def ROL64r1  : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t$dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+                   "rol{q}\t{%cl, $dst|$dst, cl}",
+                   [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+                   IIC_SR>;
+}
+def ROL8mi   : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
+                   "rol{b}\t{$src1, $dst|$dst, $src1}",
+               [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+               IIC_SR>;
+def ROL16mi  : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1),
+                   "rol{w}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROL32mi  : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
+                   "rol{l}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROL64mi  : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
+                    "rol{q}\t{$src1, $dst|$dst, $src1}",
+                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROL8m1   : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t$dst",
+               [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROL16m1  : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t$dst",
+              [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROL32m1  : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t$dst",
+              [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROL64m1  : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+                 "rol{q}\t$dst",
+               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
+                 [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
+                 [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
+                 [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
+                  [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
+                   "ror{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))],
+                   IIC_SR>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
+                   "ror{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))],
+                   IIC_SR>, OpSize16;
+def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
+                   "ror{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))],
+                   IIC_SR>, OpSize32;
+def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+                    (ins GR64:$src1, u8imm:$src2),
+                    "ror{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t$dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))],
+                 IIC_SR>;
+def ROR16r1  : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t$dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))],
+                 IIC_SR>, OpSize16;
+def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t$dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))],
+                 IIC_SR>, OpSize32;
+def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t$dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize16;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
+                 [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize32;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+                  IIC_SR>;
+}
+def ROR8mi   : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
+                   "ror{b}\t{$src, $dst|$dst, $src}",
+               [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def ROR16mi  : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src),
+                   "ror{w}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROR32mi  : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
+                   "ror{l}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROR64mi  : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
+                    "ror{q}\t{$src, $dst|$dst, $src}",
+                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t$dst",
+               [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t$dst",
+              [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize16;
+def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+                 "ror{l}\t$dst",
+              [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>, OpSize32;
+def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+                 "ror{q}\t$dst",
+               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+                   (ins GR16:$src1, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize16;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+                   (ins GR16:$src1, GR16:$src2),
+                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize16;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+                   (ins GR32:$src1, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+                    IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+                   (ins GR32:$src1, GR32:$src2),
+                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+                   IIC_SHD32_REG_CL>, TB, OpSize32;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+                    (ins GR64:$src1, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>,
+                    TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+                    (ins GR64:$src1, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>,
+                    TB;
+}
+
+let isCommutable = 1 in {  // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR16:$dst),
+                     (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+                     "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize16;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR16:$dst),
+                     (ins GR16:$src1, GR16:$src2, u8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize16;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR32:$dst),
+                     (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+                     "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB, OpSize32;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR32:$dst),
+                     (ins GR32:$src1, GR32:$src2, u8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB, OpSize32;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+                      (outs GR64:$dst),
+                      (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+                      (outs GR64:$dst),
+                      (ins GR64:$src1, GR64:$src2, u8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+}
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                     addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                  "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                  [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                    addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                   [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                     addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                  "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                  [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                    addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32;
+
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
+                    [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+                    "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                    TB, OpSize16;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+                     (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                     TB, OpSize16;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+                    "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD32_MEM_IM>,
+                    TB, OpSize32;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+                     (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD32_MEM_IM>,
+                     TB, OpSize32;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+                      (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+                      (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+def ROT32L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+  return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+def ROT64L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+  return getI8Imm(64 - N->getZExtValue(), SDLoc(N));
+}]>;
+
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+  def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+               (ins x86memop:$src1, u8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let hasSideEffects = 0 in {
+  def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def rm : I<0xF7, MRMSrcMem4VOp3,
+             (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX, Sched<[WriteShiftLd,
+                         // x86memop:$src1
+                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                         ReadDefault,
+                         // RC:$src1
+                         ReadAfterLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+  defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+  defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+  defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+  defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+  defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+  defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD;
+  defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W;
+
+  // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+  // immedidate shift, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, $esi
+  //
+  // over
+  //
+  //  movb $imm %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 000000000000..9265d64b3230
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,622 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes.  These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+  def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+              TB;
+
+let Defs = [RAX, RCX, RDX] in
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 1 in {
+  def TRAP    : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+  def UD2B    : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+  def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+              [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
+              [(int_x86_int imm:$trap)], IIC_INT>;
+
+
+def SYSCALL  : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET   : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+               Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT   : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [],
+                 IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+          (INT3)>, Requires<[NotPS4]>;
+def : Pat<(debugtrap),
+          (INT (i8 0x41))>, Requires<[IsPS4]>;
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
+               "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+               "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>,  OpSize16;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+               "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32;
+
+let Defs = [AL] in
+def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port),
+                  "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+                  "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port),
+                  "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32;
+
+let Uses = [DX, AL] in
+def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
+                "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32;
+
+let Uses = [AL] in
+def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port),
+                   "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+                   "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port),
+                  "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32;
+
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB,
+                Requires<[In64BitMode]>;
+
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB,
+                Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB,
+                Requires<[In64BitMode]>;
+
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+                Requires<[Not64BitMode]>;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB,
+                Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
+def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+                OpSize16;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+                OpSize16;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
+                OpSize32;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
+                OpSize32;
+// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+                OpSize16;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+                OpSize16;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
+                OpSize32;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
+                OpSize32;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+               [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+               "str{w}\t$dst", [], IIC_STR>, TB, OpSize16;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+               "str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+                "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm   : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
+               "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+                 "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+                 "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+                 "push{w}\t{%es|es}", [], IIC_PUSH_SR>,
+                 OpSize16, Requires<[Not64BitMode]>;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+                 "push{l}\t{%es|es}", [], IIC_PUSH_SR>,
+                 OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[Not64BitMode]>;
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[In64BitMode]>;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB,
+               OpSize32, Requires<[In64BitMode]>;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+                "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+                "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+                "pop{w}\t{%es|es}", [], IIC_POP_SR>,
+              OpSize16, Requires<[Not64BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+                "pop{l}\t{%es|es}", [], IIC_POP_SR>,
+              OpSize32, Requires<[Not64BitMode]>;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[Not64BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[In64BitMode]>;
+
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[Not64BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB,
+              OpSize32, Requires<[In64BitMode]>;
+
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+                Requires<[Not64BitMode]>;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+                Requires<[Not64BitMode]>;
+
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16,
+                Requires<[Not64BitMode]>;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32,
+                Requires<[Not64BitMode]>;
+
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32;
+
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+              "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+              "verw\t$seg", [], IIC_VERW_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
+              "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SGDT32m : I<0x01, MRM0m, (outs), (ins opaque48mem:$dst),
+              "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SGDT64m : I<0x01, MRM0m, (outs), (ins opaque80mem:$dst),
+              "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>;
+def SIDT16m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
+              "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def SIDT32m : I<0x01, MRM1m, (outs), (ins opaque48mem:$dst),
+              "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>;
+def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
+              "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+                "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB;
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+//   extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src),
+              "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>;
+def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>;
+def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
+              "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+let Uses = [EAX, ECX, EDX] in
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+let Defs = [EAX, EDX], Uses = [ECX] in
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+
+let Defs = [RAX, RDX], Uses = [ECX] in
+  def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>,
+              TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+                "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+                "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+                 "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
+                "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+
+let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
+  def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Predicates = [HasXSAVE] in {
+let Defs = [EDX, EAX], Uses = [ECX] in
+  def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [EDX, EAX, ECX] in
+  def XSETBV : I<0x01, MRM_D1, (outs), (ins), 
+                "xsetbv", 
+                [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
+
+} // HasXSAVE
+
+let Uses = [EDX, EAX] in {
+let Predicates = [HasXSAVE] in {
+  def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+                "xsave\t$dst",
+                [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB;
+  def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+                   "xsave64\t$dst",
+                   [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+  def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+                 "xrstor\t$dst",
+                 [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB;
+  def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+                    "xrstor64\t$dst",
+                    [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEOPT] in {
+  def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+                   "xsaveopt\t$dst",
+                   [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS;
+  def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+                      "xsaveopt64\t$dst",
+                      [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVEC] in {
+  def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+                 "xsavec\t$dst",
+                 [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB;
+  def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+                   "xsavec64\t$dst",
+                   [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+let Predicates = [HasXSAVES] in {
+  def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+                 "xsaves\t$dst",
+                 [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB;
+  def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+                    "xsaves64\t$dst",
+                    [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+  def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+                  "xrstors\t$dst",
+                  [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB;
+  def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+                     "xrstors64\t$dst",
+                     [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+}
+} // Uses
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+  def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+  def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
+  def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
+  def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
+  def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
+  def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+  def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
+  def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+  def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+//==-----------------------------------------------------------------------===//
+// PKU  - enable protection key
+let usesCustomInserter = 1 in {
+  def WRPKRU : PseudoI<(outs), (ins GR32:$src),
+                [(int_x86_wrpkru GR32:$src)]>;
+  def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
+                [(set GR32:$dst, (int_x86_rdpkru))]>;
+}
+
+let Defs = [EAX, EDX], Uses = [ECX] in
+  def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+let Uses = [EAX, ECX, EDX] in
+  def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+  def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+                   "rdfsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+  def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+                     "rdfsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+  def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                     "rdgsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+                   "wrfsbase{l}\t$src",
+                   [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+                      "wrfsbase{q}\t$src",
+                      [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+                   "wrgsbase{l}\t$src",
+                   [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+                      "wrgsbase{q}\t$src",
+                      [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS] in {
+  def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+  def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+  def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 000000000000..7267d752653e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,50 @@
+//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+                     [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+               "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+             Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
+def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
+                         "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
+                         "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+}
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+             "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+              "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+                 "xabort\t$imm",
+                 [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+let isAsmParserOnly = 1 in {
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
new file mode 100755
index 000000000000..5d2af829028a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
@@ -0,0 +1,1148 @@
+//===-- X86AVX512Info.h - X86 Instruction Tables Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains related X86 Instruction Information Tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
+
+using namespace llvm;
+
+struct X86EvexToVexCompressTableEntry {
+  uint16_t EvexOpcode;
+  uint16_t VexOpcode;
+};
+
+
+
+// X86 EVEX encoded instructions that have a VEX 128 encoding
+// (table format: <EVEX opcode, VEX-128 opcode>).
+static const X86EvexToVexCompressTableEntry 
+   X86EvexToVex128CompressTable[] = {
+  // EVEX scalar with corresponding VEX.
+  { X86::Int_VCOMISDZrm         ,  X86::Int_VCOMISDrm            },
+  { X86::Int_VCOMISDZrr         ,  X86::Int_VCOMISDrr            },
+  { X86::Int_VCOMISSZrm         ,  X86::Int_VCOMISSrm            },
+  { X86::Int_VCOMISSZrr         ,  X86::Int_VCOMISSrr            },
+  { X86::Int_VUCOMISDZrm        ,  X86::Int_VUCOMISDrm           },
+  { X86::Int_VUCOMISDZrr        ,  X86::Int_VUCOMISDrr           },
+  { X86::Int_VUCOMISSZrm        ,  X86::Int_VUCOMISSrm           },
+  { X86::Int_VUCOMISSZrr        ,  X86::Int_VUCOMISSrr           },
+  { X86::VADDSDZrm              ,  X86::VADDSDrm                 },
+  { X86::VADDSDZrm_Int          ,  X86::VADDSDrm_Int             },
+  { X86::VADDSDZrr              ,  X86::VADDSDrr                 },
+  { X86::VADDSDZrr_Int          ,  X86::VADDSDrr_Int             },
+  { X86::VADDSSZrm              ,  X86::VADDSSrm                 },
+  { X86::VADDSSZrm_Int          ,  X86::VADDSSrm_Int             },
+  { X86::VADDSSZrr              ,  X86::VADDSSrr                 },
+  { X86::VADDSSZrr_Int          ,  X86::VADDSSrr_Int             },
+  { X86::VCOMISDZrm             ,  X86::VCOMISDrm                },
+  { X86::VCOMISDZrr             ,  X86::VCOMISDrr                },
+  { X86::VCOMISSZrm             ,  X86::VCOMISSrm                },
+  { X86::VCOMISSZrr             ,  X86::VCOMISSrr                },
+  { X86::VCVTSD2SI64Zrm         ,  X86::VCVTSD2SI64rm            },
+  { X86::VCVTSD2SI64Zrr         ,  X86::VCVTSD2SI64rr            },
+  { X86::VCVTSD2SIZrm           ,  X86::VCVTSD2SIrm              },
+  { X86::VCVTSD2SIZrr           ,  X86::VCVTSD2SIrr              },
+  { X86::VCVTSD2SSZrm           ,  X86::VCVTSD2SSrm              },
+  { X86::VCVTSD2SSZrr           ,  X86::VCVTSD2SSrr              },
+  { X86::VCVTSI2SDZrm           ,  X86::VCVTSI2SDrm              },
+  { X86::VCVTSI2SDZrm_Int       ,  X86::Int_VCVTSI2SDrm          },
+  { X86::VCVTSI2SDZrr           ,  X86::VCVTSI2SDrr              },
+  { X86::VCVTSI2SDZrr_Int       ,  X86::Int_VCVTSI2SDrr          },
+  { X86::VCVTSI2SSZrm           ,  X86::VCVTSI2SSrm              },
+  { X86::VCVTSI2SSZrm_Int       ,  X86::Int_VCVTSI2SSrm          },
+  { X86::VCVTSI2SSZrr           ,  X86::VCVTSI2SSrr              },
+  { X86::VCVTSI2SSZrr_Int       ,  X86::Int_VCVTSI2SSrr          },
+  { X86::VCVTSS2SDZrm           ,  X86::VCVTSS2SDrm              },
+  { X86::VCVTSS2SDZrr           ,  X86::VCVTSS2SDrr              },
+  { X86::VCVTSS2SI64Zrm         ,  X86::VCVTSS2SI64rm            },
+  { X86::VCVTSS2SI64Zrr         ,  X86::VCVTSS2SI64rr            },
+  { X86::VCVTSS2SIZrm           ,  X86::VCVTSS2SIrm              },
+  { X86::VCVTSS2SIZrr           ,  X86::VCVTSS2SIrr              },
+  { X86::VCVTTSD2SI64Zrm        ,  X86::VCVTTSD2SI64rm           },
+  { X86::VCVTTSD2SI64Zrm_Int    ,  X86::Int_VCVTTSD2SI64rm       },
+  { X86::VCVTTSD2SI64Zrr        ,  X86::VCVTTSD2SI64rr           },
+  { X86::VCVTTSD2SI64Zrr_Int    ,  X86::Int_VCVTTSD2SI64rr       },
+  { X86::VCVTTSD2SIZrm          ,  X86::VCVTTSD2SIrm             },
+  { X86::VCVTTSD2SIZrm_Int      ,  X86::Int_VCVTTSD2SIrm         },
+  { X86::VCVTTSD2SIZrr          ,  X86::VCVTTSD2SIrr             },
+  { X86::VCVTTSD2SIZrr_Int      ,  X86::Int_VCVTTSD2SIrr         },
+  { X86::VCVTTSS2SI64Zrm        ,  X86::VCVTTSS2SI64rm           },
+  { X86::VCVTTSS2SI64Zrm_Int    ,  X86::Int_VCVTTSS2SI64rm       },
+  { X86::VCVTTSS2SI64Zrr        ,  X86::VCVTTSS2SI64rr           },
+  { X86::VCVTTSS2SI64Zrr_Int    ,  X86::Int_VCVTTSS2SI64rr       },
+  { X86::VCVTTSS2SIZrm          ,  X86::VCVTTSS2SIrm             },
+  { X86::VCVTTSS2SIZrm_Int      ,  X86::Int_VCVTTSS2SIrm         },
+  { X86::VCVTTSS2SIZrr          ,  X86::VCVTTSS2SIrr             },
+  { X86::VCVTTSS2SIZrr_Int      ,  X86::Int_VCVTTSS2SIrr         },
+  { X86::VDIVSDZrm              ,  X86::VDIVSDrm                 },
+  { X86::VDIVSDZrm_Int          ,  X86::VDIVSDrm_Int             },
+  { X86::VDIVSDZrr              ,  X86::VDIVSDrr                 },
+  { X86::VDIVSDZrr_Int          ,  X86::VDIVSDrr_Int             },
+  { X86::VDIVSSZrm              ,  X86::VDIVSSrm                 },
+  { X86::VDIVSSZrm_Int          ,  X86::VDIVSSrm_Int             },
+  { X86::VDIVSSZrr              ,  X86::VDIVSSrr                 },
+  { X86::VDIVSSZrr_Int          ,  X86::VDIVSSrr_Int             },
+  { X86::VFMADD132SDZm          ,  X86::VFMADD132SDm             },
+  { X86::VFMADD132SDZm_Int      ,  X86::VFMADD132SDm_Int         },
+  { X86::VFMADD132SDZr          ,  X86::VFMADD132SDr             },
+  { X86::VFMADD132SDZr_Int      ,  X86::VFMADD132SDr_Int         },
+  { X86::VFMADD132SSZm          ,  X86::VFMADD132SSm             },
+  { X86::VFMADD132SSZm_Int      ,  X86::VFMADD132SSm_Int         },
+  { X86::VFMADD132SSZr          ,  X86::VFMADD132SSr             },
+  { X86::VFMADD132SSZr_Int      ,  X86::VFMADD132SSr_Int         },
+  { X86::VFMADD213SDZm          ,  X86::VFMADD213SDm             },
+  { X86::VFMADD213SDZm_Int      ,  X86::VFMADD213SDm_Int         },
+  { X86::VFMADD213SDZr          ,  X86::VFMADD213SDr             },
+  { X86::VFMADD213SDZr_Int      ,  X86::VFMADD213SDr_Int         },
+  { X86::VFMADD213SSZm          ,  X86::VFMADD213SSm             },
+  { X86::VFMADD213SSZm_Int      ,  X86::VFMADD213SSm_Int         },
+  { X86::VFMADD213SSZr          ,  X86::VFMADD213SSr             },
+  { X86::VFMADD213SSZr_Int      ,  X86::VFMADD213SSr_Int         },
+  { X86::VFMADD231SDZm          ,  X86::VFMADD231SDm             },
+  { X86::VFMADD231SDZm_Int      ,  X86::VFMADD231SDm_Int         },
+  { X86::VFMADD231SDZr          ,  X86::VFMADD231SDr             },
+  { X86::VFMADD231SDZr_Int      ,  X86::VFMADD231SDr_Int         },
+  { X86::VFMADD231SSZm          ,  X86::VFMADD231SSm             },
+  { X86::VFMADD231SSZm_Int      ,  X86::VFMADD231SSm_Int         },
+  { X86::VFMADD231SSZr          ,  X86::VFMADD231SSr             },
+  { X86::VFMADD231SSZr_Int      ,  X86::VFMADD231SSr_Int         },
+  { X86::VFMSUB132SDZm          ,  X86::VFMSUB132SDm             },
+  { X86::VFMSUB132SDZm_Int      ,  X86::VFMSUB132SDm_Int         },
+  { X86::VFMSUB132SDZr          ,  X86::VFMSUB132SDr             },
+  { X86::VFMSUB132SDZr_Int      ,  X86::VFMSUB132SDr_Int         },
+  { X86::VFMSUB132SSZm          ,  X86::VFMSUB132SSm             },
+  { X86::VFMSUB132SSZm_Int      ,  X86::VFMSUB132SSm_Int         },
+  { X86::VFMSUB132SSZr          ,  X86::VFMSUB132SSr             },
+  { X86::VFMSUB132SSZr_Int      ,  X86::VFMSUB132SSr_Int         },
+  { X86::VFMSUB213SDZm          ,  X86::VFMSUB213SDm             },
+  { X86::VFMSUB213SDZm_Int      ,  X86::VFMSUB213SDm_Int         },
+  { X86::VFMSUB213SDZr          ,  X86::VFMSUB213SDr             },
+  { X86::VFMSUB213SDZr_Int      ,  X86::VFMSUB213SDr_Int         },
+  { X86::VFMSUB213SSZm          ,  X86::VFMSUB213SSm             },
+  { X86::VFMSUB213SSZm_Int      ,  X86::VFMSUB213SSm_Int         },
+  { X86::VFMSUB213SSZr          ,  X86::VFMSUB213SSr             },
+  { X86::VFMSUB213SSZr_Int      ,  X86::VFMSUB213SSr_Int         },
+  { X86::VFMSUB231SDZm          ,  X86::VFMSUB231SDm             },
+  { X86::VFMSUB231SDZm_Int      ,  X86::VFMSUB231SDm_Int         },
+  { X86::VFMSUB231SDZr          ,  X86::VFMSUB231SDr             },
+  { X86::VFMSUB231SDZr_Int      ,  X86::VFMSUB231SDr_Int         },
+  { X86::VFMSUB231SSZm          ,  X86::VFMSUB231SSm             },
+  { X86::VFMSUB231SSZm_Int      ,  X86::VFMSUB231SSm_Int         },
+  { X86::VFMSUB231SSZr          ,  X86::VFMSUB231SSr             },
+  { X86::VFMSUB231SSZr_Int      ,  X86::VFMSUB231SSr_Int         },
+  { X86::VFNMADD132SDZm         ,  X86::VFNMADD132SDm            },
+  { X86::VFNMADD132SDZm_Int     ,  X86::VFNMADD132SDm_Int        },
+  { X86::VFNMADD132SDZr         ,  X86::VFNMADD132SDr            },
+  { X86::VFNMADD132SDZr_Int     ,  X86::VFNMADD132SDr_Int        },
+  { X86::VFNMADD132SSZm         ,  X86::VFNMADD132SSm            },
+  { X86::VFNMADD132SSZm_Int     ,  X86::VFNMADD132SSm_Int        },
+  { X86::VFNMADD132SSZr         ,  X86::VFNMADD132SSr            },
+  { X86::VFNMADD132SSZr_Int     ,  X86::VFNMADD132SSr_Int        },
+  { X86::VFNMADD213SDZm         ,  X86::VFNMADD213SDm            },
+  { X86::VFNMADD213SDZm_Int     ,  X86::VFNMADD213SDm_Int        },
+  { X86::VFNMADD213SDZr         ,  X86::VFNMADD213SDr            },
+  { X86::VFNMADD213SDZr_Int     ,  X86::VFNMADD213SDr_Int        },
+  { X86::VFNMADD213SSZm         ,  X86::VFNMADD213SSm            },
+  { X86::VFNMADD213SSZm_Int     ,  X86::VFNMADD213SSm_Int        },
+  { X86::VFNMADD213SSZr         ,  X86::VFNMADD213SSr            },
+  { X86::VFNMADD213SSZr_Int     ,  X86::VFNMADD213SSr_Int        },
+  { X86::VFNMADD231SDZm         ,  X86::VFNMADD231SDm            },
+  { X86::VFNMADD231SDZm_Int     ,  X86::VFNMADD231SDm_Int        },
+  { X86::VFNMADD231SDZr         ,  X86::VFNMADD231SDr            },
+  { X86::VFNMADD231SDZr_Int     ,  X86::VFNMADD231SDr_Int        },
+  { X86::VFNMADD231SSZm         ,  X86::VFNMADD231SSm            },
+  { X86::VFNMADD231SSZm_Int     ,  X86::VFNMADD231SSm_Int        },
+  { X86::VFNMADD231SSZr         ,  X86::VFNMADD231SSr            },
+  { X86::VFNMADD231SSZr_Int     ,  X86::VFNMADD231SSr_Int        },
+  { X86::VFNMSUB132SDZm         ,  X86::VFNMSUB132SDm            },
+  { X86::VFNMSUB132SDZm_Int     ,  X86::VFNMSUB132SDm_Int        },
+  { X86::VFNMSUB132SDZr         ,  X86::VFNMSUB132SDr            },
+  { X86::VFNMSUB132SDZr_Int     ,  X86::VFNMSUB132SDr_Int        },
+  { X86::VFNMSUB132SSZm         ,  X86::VFNMSUB132SSm            },
+  { X86::VFNMSUB132SSZm_Int     ,  X86::VFNMSUB132SSm_Int        },
+  { X86::VFNMSUB132SSZr         ,  X86::VFNMSUB132SSr            },
+  { X86::VFNMSUB132SSZr_Int     ,  X86::VFNMSUB132SSr_Int        },
+  { X86::VFNMSUB213SDZm         ,  X86::VFNMSUB213SDm            },
+  { X86::VFNMSUB213SDZm_Int     ,  X86::VFNMSUB213SDm_Int        },
+  { X86::VFNMSUB213SDZr         ,  X86::VFNMSUB213SDr            },
+  { X86::VFNMSUB213SDZr_Int     ,  X86::VFNMSUB213SDr_Int        },
+  { X86::VFNMSUB213SSZm         ,  X86::VFNMSUB213SSm            },
+  { X86::VFNMSUB213SSZm_Int     ,  X86::VFNMSUB213SSm_Int        },
+  { X86::VFNMSUB213SSZr         ,  X86::VFNMSUB213SSr            },
+  { X86::VFNMSUB213SSZr_Int     ,  X86::VFNMSUB213SSr_Int        },
+  { X86::VFNMSUB231SDZm         ,  X86::VFNMSUB231SDm            },
+  { X86::VFNMSUB231SDZm_Int     ,  X86::VFNMSUB231SDm_Int        },
+  { X86::VFNMSUB231SDZr         ,  X86::VFNMSUB231SDr            },
+  { X86::VFNMSUB231SDZr_Int     ,  X86::VFNMSUB231SDr_Int        },
+  { X86::VFNMSUB231SSZm         ,  X86::VFNMSUB231SSm            },
+  { X86::VFNMSUB231SSZm_Int     ,  X86::VFNMSUB231SSm_Int        },
+  { X86::VFNMSUB231SSZr         ,  X86::VFNMSUB231SSr            },
+  { X86::VFNMSUB231SSZr_Int     ,  X86::VFNMSUB231SSr_Int        },
+  { X86::VMAXCSDZrm             ,  X86::VMAXCSDrm                },
+  { X86::VMAXCSDZrr             ,  X86::VMAXCSDrr                },
+  { X86::VMAXCSSZrm             ,  X86::VMAXCSSrm                },
+  { X86::VMAXCSSZrr             ,  X86::VMAXCSSrr                },
+  { X86::VMAXSDZrm              ,  X86::VMAXSDrm                 },
+  { X86::VMAXSDZrm_Int          ,  X86::VMAXSDrm_Int             },
+  { X86::VMAXSDZrr              ,  X86::VMAXSDrr                 },
+  { X86::VMAXSDZrr_Int          ,  X86::VMAXSDrr_Int             },
+  { X86::VMAXSSZrm              ,  X86::VMAXSSrm                 },
+  { X86::VMAXSSZrm_Int          ,  X86::VMAXSSrm_Int             },
+  { X86::VMAXSSZrr              ,  X86::VMAXSSrr                 },
+  { X86::VMAXSSZrr_Int          ,  X86::VMAXSSrr_Int             },
+  { X86::VMINCSDZrm             ,  X86::VMINCSDrm                },
+  { X86::VMINCSDZrr             ,  X86::VMINCSDrr                },
+  { X86::VMINCSSZrm             ,  X86::VMINCSSrm                },
+  { X86::VMINCSSZrr             ,  X86::VMINCSSrr                },
+  { X86::VMINSDZrm              ,  X86::VMINSDrm                 },
+  { X86::VMINSDZrm_Int          ,  X86::VMINSDrm_Int             },
+  { X86::VMINSDZrr              ,  X86::VMINSDrr                 },
+  { X86::VMINSDZrr_Int          ,  X86::VMINSDrr_Int             },
+  { X86::VMINSSZrm              ,  X86::VMINSSrm                 },
+  { X86::VMINSSZrm_Int          ,  X86::VMINSSrm_Int             },
+  { X86::VMINSSZrr              ,  X86::VMINSSrr                 },
+  { X86::VMINSSZrr_Int          ,  X86::VMINSSrr_Int             },
+  { X86::VMOV64toSDZrr          ,  X86::VMOV64toSDrr             },
+  { X86::VMOVDI2SSZrm           ,  X86::VMOVDI2SSrm              },
+  { X86::VMOVDI2SSZrr           ,  X86::VMOVDI2SSrr              },
+  { X86::VMOVSDZmr              ,  X86::VMOVSDmr                 },
+  { X86::VMOVSDZrm              ,  X86::VMOVSDrm                 },
+  { X86::VMOVSDZrr              ,  X86::VMOVSDrr                 },
+  { X86::VMOVSSZmr              ,  X86::VMOVSSmr                 },
+  { X86::VMOVSSZrm              ,  X86::VMOVSSrm                 },
+  { X86::VMOVSSZrr              ,  X86::VMOVSSrr                 },
+  { X86::VMOVSSZrr_REV          ,  X86::VMOVSSrr_REV             },
+  { X86::VMULSDZrm              ,  X86::VMULSDrm                 },
+  { X86::VMULSDZrm_Int          ,  X86::VMULSDrm_Int             },
+  { X86::VMULSDZrr              ,  X86::VMULSDrr                 },
+  { X86::VMULSDZrr_Int          ,  X86::VMULSDrr_Int             },
+  { X86::VMULSSZrm              ,  X86::VMULSSrm                 },
+  { X86::VMULSSZrm_Int          ,  X86::VMULSSrm_Int             },
+  { X86::VMULSSZrr              ,  X86::VMULSSrr                 },
+  { X86::VMULSSZrr_Int          ,  X86::VMULSSrr_Int             },
+  { X86::VSQRTSDZm              ,  X86::VSQRTSDm                 },
+  { X86::VSQRTSDZm_Int          ,  X86::VSQRTSDm_Int             },
+  { X86::VSQRTSDZr              ,  X86::VSQRTSDr                 },
+  { X86::VSQRTSDZr_Int          ,  X86::VSQRTSDr_Int             },
+  { X86::VSQRTSSZm              ,  X86::VSQRTSSm                 },
+  { X86::VSQRTSSZm_Int          ,  X86::VSQRTSSm_Int             },
+  { X86::VSQRTSSZr              ,  X86::VSQRTSSr                 },
+  { X86::VSQRTSSZr_Int          ,  X86::VSQRTSSr_Int             },
+  { X86::VSUBSDZrm              ,  X86::VSUBSDrm                 },
+  { X86::VSUBSDZrm_Int          ,  X86::VSUBSDrm_Int             },
+  { X86::VSUBSDZrr              ,  X86::VSUBSDrr                 },
+  { X86::VSUBSDZrr_Int          ,  X86::VSUBSDrr_Int             },
+  { X86::VSUBSSZrm              ,  X86::VSUBSSrm                 },
+  { X86::VSUBSSZrm_Int          ,  X86::VSUBSSrm_Int             },
+  { X86::VSUBSSZrr              ,  X86::VSUBSSrr                 },
+  { X86::VSUBSSZrr_Int          ,  X86::VSUBSSrr_Int             },
+  { X86::VUCOMISDZrm            ,  X86::VUCOMISDrm               },
+  { X86::VUCOMISDZrr            ,  X86::VUCOMISDrr               },
+  { X86::VUCOMISSZrm            ,  X86::VUCOMISSrm               },
+  { X86::VUCOMISSZrr            ,  X86::VUCOMISSrr               },
+                                                                               
+  { X86::VMOV64toPQIZrr         ,   X86::VMOV64toPQIrr           },
+  { X86::VMOV64toSDZrr          ,   X86::VMOV64toSDrr            },
+  { X86::VMOVDI2PDIZrm          ,   X86::VMOVDI2PDIrm            },
+  { X86::VMOVDI2PDIZrr          ,   X86::VMOVDI2PDIrr            },
+  { X86::VMOVLHPSZrr            ,   X86::VMOVLHPSrr              },
+  { X86::VMOVHLPSZrr            ,   X86::VMOVHLPSrr              },  
+  { X86::VMOVPDI2DIZmr          ,   X86::VMOVPDI2DImr            },
+  { X86::VMOVPDI2DIZrr          ,   X86::VMOVPDI2DIrr            },
+  { X86::VMOVPQI2QIZmr          ,   X86::VMOVPQI2QImr            },
+  { X86::VMOVPQIto64Zrr         ,   X86::VMOVPQIto64rr           },
+  { X86::VMOVQI2PQIZrm          ,   X86::VMOVQI2PQIrm            },
+  { X86::VMOVZPQILo2PQIZrr      ,   X86::VMOVZPQILo2PQIrr        },
+                                                                               
+  { X86::VPEXTRBZmr             ,   X86::VPEXTRBmr               },
+  { X86::VPEXTRBZrr             ,   X86::VPEXTRBrr               },
+  { X86::VPEXTRDZmr             ,   X86::VPEXTRDmr               },
+  { X86::VPEXTRDZrr             ,   X86::VPEXTRDrr               },
+  { X86::VPEXTRQZmr             ,   X86::VPEXTRQmr               },
+  { X86::VPEXTRQZrr             ,   X86::VPEXTRQrr               },
+  { X86::VPEXTRWZmr             ,   X86::VPEXTRWmr               },
+  { X86::VPEXTRWZrr             ,   X86::VPEXTRWri               },
+                                                                               
+  { X86::VPINSRBZrm             ,   X86::VPINSRBrm               },
+  { X86::VPINSRBZrr             ,   X86::VPINSRBrr               },
+  { X86::VPINSRDZrm             ,   X86::VPINSRDrm               },
+  { X86::VPINSRDZrr             ,   X86::VPINSRDrr               },
+  { X86::VPINSRQZrm             ,   X86::VPINSRQrm               },
+  { X86::VPINSRQZrr             ,   X86::VPINSRQrr               },
+  { X86::VPINSRWZrm             ,   X86::VPINSRWrmi              },
+  { X86::VPINSRWZrr             ,   X86::VPINSRWrri              },
+
+  // EVEX 128 with corresponding VEX.
+  { X86::VADDPDZ128rm           ,    X86::VADDPDrm               },
+  { X86::VADDPDZ128rr           ,    X86::VADDPDrr               },
+  { X86::VADDPSZ128rm           ,    X86::VADDPSrm               },
+  { X86::VADDPSZ128rr           ,    X86::VADDPSrr               },
+  { X86::VANDNPDZ128rm          ,    X86::VANDNPDrm              },
+  { X86::VANDNPDZ128rr          ,    X86::VANDNPDrr              },
+  { X86::VANDNPSZ128rm          ,    X86::VANDNPSrm              },
+  { X86::VANDNPSZ128rr          ,    X86::VANDNPSrr              },
+  { X86::VANDPDZ128rm           ,    X86::VANDPDrm               },
+  { X86::VANDPDZ128rr           ,    X86::VANDPDrr               },
+  { X86::VANDPSZ128rm           ,    X86::VANDPSrm               },
+  { X86::VANDPSZ128rr           ,    X86::VANDPSrr               },      
+  { X86::VBROADCASTSSZ128m      ,    X86::VBROADCASTSSrm         },
+  { X86::VBROADCASTSSZ128r      ,    X86::VBROADCASTSSrr         },
+  { X86::VBROADCASTSSZ128r_s    ,    X86::VBROADCASTSSrr         },
+  { X86::VCVTDQ2PDZ128rm        ,    X86::VCVTDQ2PDrm            },
+  { X86::VCVTDQ2PDZ128rr        ,    X86::VCVTDQ2PDrr            },
+  { X86::VCVTDQ2PSZ128rm        ,    X86::VCVTDQ2PSrm            },
+  { X86::VCVTDQ2PSZ128rr        ,    X86::VCVTDQ2PSrr            },
+  { X86::VCVTPD2DQZ128rm        ,    X86::VCVTPD2DQrm            },
+  { X86::VCVTPD2DQZ128rr        ,    X86::VCVTPD2DQrr            },
+  { X86::VCVTPD2PSZ128rm        ,    X86::VCVTPD2PSrm            },
+  { X86::VCVTPD2PSZ128rr        ,    X86::VCVTPD2PSrr            },
+  { X86::VCVTPH2PSZ128rm        ,    X86::VCVTPH2PSrm            },
+  { X86::VCVTPH2PSZ128rr        ,    X86::VCVTPH2PSrr            },
+  { X86::VCVTPS2DQZ128rm        ,    X86::VCVTPS2DQrm            },
+  { X86::VCVTPS2DQZ128rr        ,    X86::VCVTPS2DQrr            },
+  { X86::VCVTPS2PDZ128rm        ,    X86::VCVTPS2PDrm            },
+  { X86::VCVTPS2PDZ128rr        ,    X86::VCVTPS2PDrr            },
+  { X86::VCVTPS2PHZ128mr        ,    X86::VCVTPS2PHmr            },
+  { X86::VCVTPS2PHZ128rr        ,    X86::VCVTPS2PHrr            },
+  { X86::VCVTTPD2DQZ128rm       ,    X86::VCVTTPD2DQrm           },
+  { X86::VCVTTPD2DQZ128rr       ,    X86::VCVTTPD2DQrr           },
+  { X86::VCVTTPS2DQZ128rm       ,    X86::VCVTTPS2DQrm           },
+  { X86::VCVTTPS2DQZ128rr       ,    X86::VCVTTPS2DQrr           },
+  { X86::VDIVPDZ128rm           ,    X86::VDIVPDrm               },
+  { X86::VDIVPDZ128rr           ,    X86::VDIVPDrr               },
+  { X86::VDIVPSZ128rm           ,    X86::VDIVPSrm               },
+  { X86::VDIVPSZ128rr           ,    X86::VDIVPSrr               },
+  { X86::VFMADD132PDZ128m       ,    X86::VFMADD132PDm           },
+  { X86::VFMADD132PDZ128r       ,    X86::VFMADD132PDr           },
+  { X86::VFMADD132PSZ128m       ,    X86::VFMADD132PSm           },
+  { X86::VFMADD132PSZ128r       ,    X86::VFMADD132PSr           },
+  { X86::VFMADD213PDZ128m       ,    X86::VFMADD213PDm           },
+  { X86::VFMADD213PDZ128r       ,    X86::VFMADD213PDr           },
+  { X86::VFMADD213PSZ128m       ,    X86::VFMADD213PSm           },
+  { X86::VFMADD213PSZ128r       ,    X86::VFMADD213PSr           },
+  { X86::VFMADD231PDZ128m       ,    X86::VFMADD231PDm           },
+  { X86::VFMADD231PDZ128r       ,    X86::VFMADD231PDr           },
+  { X86::VFMADD231PSZ128m       ,    X86::VFMADD231PSm           },
+  { X86::VFMADD231PSZ128r       ,    X86::VFMADD231PSr           },
+  { X86::VFMADDSUB132PDZ128m    ,    X86::VFMADDSUB132PDm        },
+  { X86::VFMADDSUB132PDZ128r    ,    X86::VFMADDSUB132PDr        },
+  { X86::VFMADDSUB132PSZ128m    ,    X86::VFMADDSUB132PSm        },
+  { X86::VFMADDSUB132PSZ128r    ,    X86::VFMADDSUB132PSr        },
+  { X86::VFMADDSUB213PDZ128m    ,    X86::VFMADDSUB213PDm        },
+  { X86::VFMADDSUB213PDZ128r    ,    X86::VFMADDSUB213PDr        },
+  { X86::VFMADDSUB213PSZ128m    ,    X86::VFMADDSUB213PSm        },
+  { X86::VFMADDSUB213PSZ128r    ,    X86::VFMADDSUB213PSr        },
+  { X86::VFMADDSUB231PDZ128m    ,    X86::VFMADDSUB231PDm        },
+  { X86::VFMADDSUB231PDZ128r    ,    X86::VFMADDSUB231PDr        },
+  { X86::VFMADDSUB231PSZ128m    ,    X86::VFMADDSUB231PSm        },
+  { X86::VFMADDSUB231PSZ128r    ,    X86::VFMADDSUB231PSr        },
+  { X86::VFMSUB132PDZ128m       ,    X86::VFMSUB132PDm           },
+  { X86::VFMSUB132PDZ128r       ,    X86::VFMSUB132PDr           },
+  { X86::VFMSUB132PSZ128m       ,    X86::VFMSUB132PSm           },
+  { X86::VFMSUB132PSZ128r       ,    X86::VFMSUB132PSr           },
+  { X86::VFMSUB213PDZ128m       ,    X86::VFMSUB213PDm           },
+  { X86::VFMSUB213PDZ128r       ,    X86::VFMSUB213PDr           },
+  { X86::VFMSUB213PSZ128m       ,    X86::VFMSUB213PSm           },
+  { X86::VFMSUB213PSZ128r       ,    X86::VFMSUB213PSr           },
+  { X86::VFMSUB231PDZ128m       ,    X86::VFMSUB231PDm           },
+  { X86::VFMSUB231PDZ128r       ,    X86::VFMSUB231PDr           },
+  { X86::VFMSUB231PSZ128m       ,    X86::VFMSUB231PSm           },
+  { X86::VFMSUB231PSZ128r       ,    X86::VFMSUB231PSr           },
+  { X86::VFMSUBADD132PDZ128m    ,    X86::VFMSUBADD132PDm        },
+  { X86::VFMSUBADD132PDZ128r    ,    X86::VFMSUBADD132PDr        },
+  { X86::VFMSUBADD132PSZ128m    ,    X86::VFMSUBADD132PSm        },
+  { X86::VFMSUBADD132PSZ128r    ,    X86::VFMSUBADD132PSr        },
+  { X86::VFMSUBADD213PDZ128m    ,    X86::VFMSUBADD213PDm        },
+  { X86::VFMSUBADD213PDZ128r    ,    X86::VFMSUBADD213PDr        },
+  { X86::VFMSUBADD213PSZ128m    ,    X86::VFMSUBADD213PSm        },
+  { X86::VFMSUBADD213PSZ128r    ,    X86::VFMSUBADD213PSr        },
+  { X86::VFMSUBADD231PDZ128m    ,    X86::VFMSUBADD231PDm        },
+  { X86::VFMSUBADD231PDZ128r    ,    X86::VFMSUBADD231PDr        },
+  { X86::VFMSUBADD231PSZ128m    ,    X86::VFMSUBADD231PSm        },
+  { X86::VFMSUBADD231PSZ128r    ,    X86::VFMSUBADD231PSr        },
+  { X86::VFNMADD132PDZ128m      ,    X86::VFNMADD132PDm          },
+  { X86::VFNMADD132PDZ128r      ,    X86::VFNMADD132PDr          },
+  { X86::VFNMADD132PSZ128m      ,    X86::VFNMADD132PSm          },
+  { X86::VFNMADD132PSZ128r      ,    X86::VFNMADD132PSr          },
+  { X86::VFNMADD213PDZ128m      ,    X86::VFNMADD213PDm          },
+  { X86::VFNMADD213PDZ128r      ,    X86::VFNMADD213PDr          },
+  { X86::VFNMADD213PSZ128m      ,    X86::VFNMADD213PSm          },
+  { X86::VFNMADD213PSZ128r      ,    X86::VFNMADD213PSr          },
+  { X86::VFNMADD231PDZ128m      ,    X86::VFNMADD231PDm          },
+  { X86::VFNMADD231PDZ128r      ,    X86::VFNMADD231PDr          },
+  { X86::VFNMADD231PSZ128m      ,    X86::VFNMADD231PSm          },
+  { X86::VFNMADD231PSZ128r      ,    X86::VFNMADD231PSr          },
+  { X86::VFNMSUB132PDZ128m      ,    X86::VFNMSUB132PDm          },
+  { X86::VFNMSUB132PDZ128r      ,    X86::VFNMSUB132PDr          },
+  { X86::VFNMSUB132PSZ128m      ,    X86::VFNMSUB132PSm          },
+  { X86::VFNMSUB132PSZ128r      ,    X86::VFNMSUB132PSr          },
+  { X86::VFNMSUB213PDZ128m      ,    X86::VFNMSUB213PDm          },
+  { X86::VFNMSUB213PDZ128r      ,    X86::VFNMSUB213PDr          },
+  { X86::VFNMSUB213PSZ128m      ,    X86::VFNMSUB213PSm          },
+  { X86::VFNMSUB213PSZ128r      ,    X86::VFNMSUB213PSr          },
+  { X86::VFNMSUB231PDZ128m      ,    X86::VFNMSUB231PDm          },
+  { X86::VFNMSUB231PDZ128r      ,    X86::VFNMSUB231PDr          },
+  { X86::VFNMSUB231PSZ128m      ,    X86::VFNMSUB231PSm          },
+  { X86::VFNMSUB231PSZ128r      ,    X86::VFNMSUB231PSr          },
+  { X86::VMAXCPDZ128rm          ,    X86::VMAXCPDrm              },
+  { X86::VMAXCPDZ128rr          ,    X86::VMAXCPDrr              },
+  { X86::VMAXCPSZ128rm          ,    X86::VMAXCPSrm              },
+  { X86::VMAXCPSZ128rr          ,    X86::VMAXCPSrr              },
+  { X86::VMAXPDZ128rm           ,    X86::VMAXPDrm               },
+  { X86::VMAXPDZ128rr           ,    X86::VMAXPDrr               },
+  { X86::VMAXPSZ128rm           ,    X86::VMAXPSrm               },
+  { X86::VMAXPSZ128rr           ,    X86::VMAXPSrr               },
+  { X86::VMINCPDZ128rm          ,    X86::VMINCPDrm              },
+  { X86::VMINCPDZ128rr          ,    X86::VMINCPDrr              },
+  { X86::VMINCPSZ128rm          ,    X86::VMINCPSrm              },
+  { X86::VMINCPSZ128rr          ,    X86::VMINCPSrr              },
+  { X86::VMINPDZ128rm           ,    X86::VMINPDrm               },
+  { X86::VMINPDZ128rr           ,    X86::VMINPDrr               },
+  { X86::VMINPSZ128rm           ,    X86::VMINPSrm               },
+  { X86::VMINPSZ128rr           ,    X86::VMINPSrr               },
+  { X86::VMOVAPDZ128mr          ,    X86::VMOVAPDmr              },
+  { X86::VMOVAPDZ128rm          ,    X86::VMOVAPDrm              },
+  { X86::VMOVAPDZ128rr          ,    X86::VMOVAPDrr              },
+  { X86::VMOVAPDZ128rr_REV      ,    X86::VMOVAPDrr_REV          },
+  { X86::VMOVAPSZ128mr          ,    X86::VMOVAPSmr              },    
+  { X86::VMOVAPSZ128rm          ,    X86::VMOVAPSrm              },    
+  { X86::VMOVAPSZ128rr          ,    X86::VMOVAPSrr              },
+  { X86::VMOVAPSZ128rr_REV      ,    X86::VMOVAPSrr_REV          },
+  { X86::VMOVDDUPZ128rm         ,    X86::VMOVDDUPrm             },
+  { X86::VMOVDDUPZ128rr         ,    X86::VMOVDDUPrr             },
+  { X86::VMOVDQA32Z128mr        ,    X86::VMOVDQAmr              },
+  { X86::VMOVDQA32Z128rm        ,    X86::VMOVDQArm              },
+  { X86::VMOVDQA32Z128rr        ,    X86::VMOVDQArr              },
+  { X86::VMOVDQA32Z128rr_REV    ,    X86::VMOVDQArr_REV          },
+  { X86::VMOVDQA64Z128mr        ,    X86::VMOVDQAmr              },
+  { X86::VMOVDQA64Z128rm        ,    X86::VMOVDQArm              },
+  { X86::VMOVDQA64Z128rr        ,    X86::VMOVDQArr              },
+  { X86::VMOVDQA64Z128rr_REV    ,    X86::VMOVDQArr_REV          },
+  { X86::VMOVDQU16Z128mr        ,    X86::VMOVDQUmr              },
+  { X86::VMOVDQU16Z128rm        ,    X86::VMOVDQUrm              },
+  { X86::VMOVDQU16Z128rr        ,    X86::VMOVDQUrr              },
+  { X86::VMOVDQU16Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
+  { X86::VMOVDQU32Z128mr        ,    X86::VMOVDQUmr              },
+  { X86::VMOVDQU32Z128rm        ,    X86::VMOVDQUrm              },
+  { X86::VMOVDQU32Z128rr        ,    X86::VMOVDQUrr              },
+  { X86::VMOVDQU32Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
+  { X86::VMOVDQU64Z128mr        ,    X86::VMOVDQUmr              },
+  { X86::VMOVDQU64Z128rm        ,    X86::VMOVDQUrm              },
+  { X86::VMOVDQU64Z128rr        ,    X86::VMOVDQUrr              },
+  { X86::VMOVDQU64Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
+  { X86::VMOVDQU8Z128mr         ,    X86::VMOVDQUmr              },
+  { X86::VMOVDQU8Z128rm         ,    X86::VMOVDQUrm              },
+  { X86::VMOVDQU8Z128rr         ,    X86::VMOVDQUrr              },
+  { X86::VMOVDQU8Z128rr_REV     ,    X86::VMOVDQUrr_REV          },
+  { X86::VMOVHPDZ128mr          ,    X86::VMOVHPDmr              },
+  { X86::VMOVHPDZ128rm          ,    X86::VMOVHPDrm              },
+  { X86::VMOVHPSZ128mr          ,    X86::VMOVHPSmr              },
+  { X86::VMOVHPSZ128rm          ,    X86::VMOVHPSrm              },
+  { X86::VMOVLPDZ128mr          ,    X86::VMOVLPDmr              },
+  { X86::VMOVLPDZ128rm          ,    X86::VMOVLPDrm              },
+  { X86::VMOVLPSZ128mr          ,    X86::VMOVLPSmr              },
+  { X86::VMOVLPSZ128rm          ,    X86::VMOVLPSrm              },
+  { X86::VMOVNTDQAZ128rm        ,    X86::VMOVNTDQArm            },
+  { X86::VMOVNTDQZ128mr         ,    X86::VMOVNTDQmr             },
+  { X86::VMOVNTPDZ128mr         ,    X86::VMOVNTPDmr             },
+  { X86::VMOVNTPSZ128mr         ,    X86::VMOVNTPSmr             },
+  { X86::VMOVSHDUPZ128rm        ,    X86::VMOVSHDUPrm            },
+  { X86::VMOVSHDUPZ128rr        ,    X86::VMOVSHDUPrr            },
+  { X86::VMOVSLDUPZ128rm        ,    X86::VMOVSLDUPrm            },
+  { X86::VMOVSLDUPZ128rr        ,    X86::VMOVSLDUPrr            },
+  { X86::VMOVUPDZ128mr          ,    X86::VMOVUPDmr              },
+  { X86::VMOVUPDZ128rm          ,    X86::VMOVUPDrm              },
+  { X86::VMOVUPDZ128rr          ,    X86::VMOVUPDrr              },
+  { X86::VMOVUPDZ128rr_REV      ,    X86::VMOVUPDrr_REV          },
+  { X86::VMOVUPSZ128mr          ,    X86::VMOVUPSmr              },    
+  { X86::VMOVUPSZ128rm          ,    X86::VMOVUPSrm              },    
+  { X86::VMOVUPSZ128rr          ,    X86::VMOVUPSrr              },
+  { X86::VMOVUPSZ128rr_REV      ,    X86::VMOVUPSrr_REV          },
+  { X86::VMULPDZ128rm           ,    X86::VMULPDrm               },
+  { X86::VMULPDZ128rr           ,    X86::VMULPDrr               },
+  { X86::VMULPSZ128rm           ,    X86::VMULPSrm               },
+  { X86::VMULPSZ128rr           ,    X86::VMULPSrr               },
+  { X86::VORPDZ128rm            ,    X86::VORPDrm                },
+  { X86::VORPDZ128rr            ,    X86::VORPDrr                },
+  { X86::VORPSZ128rm            ,    X86::VORPSrm                },
+  { X86::VORPSZ128rr            ,    X86::VORPSrr                },
+  { X86::VPABSBZ128rm           ,    X86::VPABSBrm               },
+  { X86::VPABSBZ128rr           ,    X86::VPABSBrr               },
+  { X86::VPABSDZ128rm           ,    X86::VPABSDrm               },
+  { X86::VPABSDZ128rr           ,    X86::VPABSDrr               },
+  { X86::VPABSWZ128rm           ,    X86::VPABSWrm               },
+  { X86::VPABSWZ128rr           ,    X86::VPABSWrr               },
+  { X86::VPACKSSDWZ128rm        ,    X86::VPACKSSDWrm            },
+  { X86::VPACKSSDWZ128rr        ,    X86::VPACKSSDWrr            },
+  { X86::VPACKSSWBZ128rm        ,    X86::VPACKSSWBrm            },
+  { X86::VPACKSSWBZ128rr        ,    X86::VPACKSSWBrr            },
+  { X86::VPACKUSDWZ128rm        ,    X86::VPACKUSDWrm            },
+  { X86::VPACKUSDWZ128rr        ,    X86::VPACKUSDWrr            },
+  { X86::VPACKUSWBZ128rm        ,    X86::VPACKUSWBrm            },
+  { X86::VPACKUSWBZ128rr        ,    X86::VPACKUSWBrr            },
+  { X86::VPADDBZ128rm           ,    X86::VPADDBrm               },
+  { X86::VPADDBZ128rr           ,    X86::VPADDBrr               },
+  { X86::VPADDDZ128rm           ,    X86::VPADDDrm               },
+  { X86::VPADDDZ128rr           ,    X86::VPADDDrr               },
+  { X86::VPADDQZ128rm           ,    X86::VPADDQrm               },
+  { X86::VPADDQZ128rr           ,    X86::VPADDQrr               },
+  { X86::VPADDSBZ128rm          ,    X86::VPADDSBrm              },
+  { X86::VPADDSBZ128rr          ,    X86::VPADDSBrr              },
+  { X86::VPADDSWZ128rm          ,    X86::VPADDSWrm              },
+  { X86::VPADDSWZ128rr          ,    X86::VPADDSWrr              },
+  { X86::VPADDUSBZ128rm         ,    X86::VPADDUSBrm             },
+  { X86::VPADDUSBZ128rr         ,    X86::VPADDUSBrr             },
+  { X86::VPADDUSWZ128rm         ,    X86::VPADDUSWrm             },
+  { X86::VPADDUSWZ128rr         ,    X86::VPADDUSWrr             },
+  { X86::VPADDWZ128rm           ,    X86::VPADDWrm               },
+  { X86::VPADDWZ128rr           ,    X86::VPADDWrr               },
+  { X86::VPALIGNRZ128rmi        ,    X86::VPALIGNRrmi            },
+  { X86::VPALIGNRZ128rri        ,    X86::VPALIGNRrri            },
+  { X86::VPANDDZ128rm           ,    X86::VPANDrm                },
+  { X86::VPANDDZ128rr           ,    X86::VPANDrr                },
+  { X86::VPANDQZ128rm           ,    X86::VPANDrm                },
+  { X86::VPANDQZ128rr           ,    X86::VPANDrr                },
+  { X86::VPAVGBZ128rm           ,    X86::VPAVGBrm               },
+  { X86::VPAVGBZ128rr           ,    X86::VPAVGBrr               },
+  { X86::VPAVGWZ128rm           ,    X86::VPAVGWrm               },
+  { X86::VPAVGWZ128rr           ,    X86::VPAVGWrr               },
+  { X86::VPBROADCASTBZ128m      ,    X86::VPBROADCASTBrm         },
+  { X86::VPBROADCASTBZ128r      ,    X86::VPBROADCASTBrr         },
+  { X86::VPBROADCASTDZ128m      ,    X86::VPBROADCASTDrm         },
+  { X86::VPBROADCASTDZ128r      ,    X86::VPBROADCASTDrr         },
+  { X86::VPBROADCASTQZ128m      ,    X86::VPBROADCASTQrm         }, 
+  { X86::VPBROADCASTQZ128r      ,    X86::VPBROADCASTQrr         }, 
+  { X86::VPBROADCASTWZ128m      ,    X86::VPBROADCASTWrm         },    
+  { X86::VPBROADCASTWZ128r      ,    X86::VPBROADCASTWrr         },
+  { X86::VPERMILPDZ128mi        ,    X86::VPERMILPDmi            },
+  { X86::VPERMILPDZ128ri        ,    X86::VPERMILPDri            },
+  { X86::VPERMILPDZ128rm        ,    X86::VPERMILPDrm            },
+  { X86::VPERMILPDZ128rr        ,    X86::VPERMILPDrr            },
+  { X86::VPERMILPSZ128mi        ,    X86::VPERMILPSmi            },
+  { X86::VPERMILPSZ128ri        ,    X86::VPERMILPSri            },
+  { X86::VPERMILPSZ128rm        ,    X86::VPERMILPSrm            },
+  { X86::VPERMILPSZ128rr        ,    X86::VPERMILPSrr            },
+  { X86::VPMADDUBSWZ128rm       ,    X86::VPMADDUBSWrm           },
+  { X86::VPMADDUBSWZ128rr       ,    X86::VPMADDUBSWrr           },
+  { X86::VPMADDWDZ128rm         ,    X86::VPMADDWDrm             },
+  { X86::VPMADDWDZ128rr         ,    X86::VPMADDWDrr             },
+  { X86::VPMAXSBZ128rm          ,    X86::VPMAXSBrm              },
+  { X86::VPMAXSBZ128rr          ,    X86::VPMAXSBrr              },
+  { X86::VPMAXSDZ128rm          ,    X86::VPMAXSDrm              },
+  { X86::VPMAXSDZ128rr          ,    X86::VPMAXSDrr              },
+  { X86::VPMAXSWZ128rm          ,    X86::VPMAXSWrm              },
+  { X86::VPMAXSWZ128rr          ,    X86::VPMAXSWrr              },
+  { X86::VPMAXUBZ128rm          ,    X86::VPMAXUBrm              },
+  { X86::VPMAXUBZ128rr          ,    X86::VPMAXUBrr              },
+  { X86::VPMAXUDZ128rm          ,    X86::VPMAXUDrm              },
+  { X86::VPMAXUDZ128rr          ,    X86::VPMAXUDrr              },
+  { X86::VPMAXUWZ128rm          ,    X86::VPMAXUWrm              },
+  { X86::VPMAXUWZ128rr          ,    X86::VPMAXUWrr              },
+  { X86::VPMINSBZ128rm          ,    X86::VPMINSBrm              },
+  { X86::VPMINSBZ128rr          ,    X86::VPMINSBrr              },
+  { X86::VPMINSDZ128rm          ,    X86::VPMINSDrm              },
+  { X86::VPMINSDZ128rr          ,    X86::VPMINSDrr              },
+  { X86::VPMINSWZ128rm          ,    X86::VPMINSWrm              },
+  { X86::VPMINSWZ128rr          ,    X86::VPMINSWrr              },
+  { X86::VPMINUBZ128rm          ,    X86::VPMINUBrm              },
+  { X86::VPMINUBZ128rr          ,    X86::VPMINUBrr              },
+  { X86::VPMINUDZ128rm          ,    X86::VPMINUDrm              },
+  { X86::VPMINUDZ128rr          ,    X86::VPMINUDrr              },
+  { X86::VPMINUWZ128rm          ,    X86::VPMINUWrm              },
+  { X86::VPMINUWZ128rr          ,    X86::VPMINUWrr              },
+  { X86::VPMOVSXBDZ128rm        ,    X86::VPMOVSXBDrm            },
+  { X86::VPMOVSXBDZ128rr        ,    X86::VPMOVSXBDrr            },
+  { X86::VPMOVSXBQZ128rm        ,    X86::VPMOVSXBQrm            },
+  { X86::VPMOVSXBQZ128rr        ,    X86::VPMOVSXBQrr            },
+  { X86::VPMOVSXBWZ128rm        ,    X86::VPMOVSXBWrm            },
+  { X86::VPMOVSXBWZ128rr        ,    X86::VPMOVSXBWrr            },
+  { X86::VPMOVSXDQZ128rm        ,    X86::VPMOVSXDQrm            },
+  { X86::VPMOVSXDQZ128rr        ,    X86::VPMOVSXDQrr            },
+  { X86::VPMOVSXWDZ128rm        ,    X86::VPMOVSXWDrm            },
+  { X86::VPMOVSXWDZ128rr        ,    X86::VPMOVSXWDrr            },
+  { X86::VPMOVSXWQZ128rm        ,    X86::VPMOVSXWQrm            },
+  { X86::VPMOVSXWQZ128rr        ,    X86::VPMOVSXWQrr            },
+  { X86::VPMOVZXBDZ128rm        ,    X86::VPMOVZXBDrm            },
+  { X86::VPMOVZXBDZ128rr        ,    X86::VPMOVZXBDrr            },
+  { X86::VPMOVZXBQZ128rm        ,    X86::VPMOVZXBQrm            },
+  { X86::VPMOVZXBQZ128rr        ,    X86::VPMOVZXBQrr            },
+  { X86::VPMOVZXBWZ128rm        ,    X86::VPMOVZXBWrm            },
+  { X86::VPMOVZXBWZ128rr        ,    X86::VPMOVZXBWrr            },
+  { X86::VPMOVZXDQZ128rm        ,    X86::VPMOVZXDQrm            },
+  { X86::VPMOVZXDQZ128rr        ,    X86::VPMOVZXDQrr            },
+  { X86::VPMOVZXWDZ128rm        ,    X86::VPMOVZXWDrm            },
+  { X86::VPMOVZXWDZ128rr        ,    X86::VPMOVZXWDrr            },
+  { X86::VPMOVZXWQZ128rm        ,    X86::VPMOVZXWQrm            },
+  { X86::VPMOVZXWQZ128rr        ,    X86::VPMOVZXWQrr            },    
+  { X86::VPMULDQZ128rm          ,    X86::VPMULDQrm              },
+  { X86::VPMULDQZ128rr          ,    X86::VPMULDQrr              },
+  { X86::VPMULHRSWZ128rm        ,    X86::VPMULHRSWrm            },
+  { X86::VPMULHRSWZ128rr        ,    X86::VPMULHRSWrr            },
+  { X86::VPMULHUWZ128rm         ,    X86::VPMULHUWrm             },
+  { X86::VPMULHUWZ128rr         ,    X86::VPMULHUWrr             },
+  { X86::VPMULHWZ128rm          ,    X86::VPMULHWrm              },
+  { X86::VPMULHWZ128rr          ,    X86::VPMULHWrr              },
+  { X86::VPMULLDZ128rm          ,    X86::VPMULLDrm              },
+  { X86::VPMULLDZ128rr          ,    X86::VPMULLDrr              },
+  { X86::VPMULLWZ128rm          ,    X86::VPMULLWrm              },
+  { X86::VPMULLWZ128rr          ,    X86::VPMULLWrr              },
+  { X86::VPMULUDQZ128rm         ,    X86::VPMULUDQrm             },
+  { X86::VPMULUDQZ128rr         ,    X86::VPMULUDQrr             },
+  { X86::VPORDZ128rm            ,    X86::VPORrm                 },
+  { X86::VPORDZ128rr            ,    X86::VPORrr                 },
+  { X86::VPORQZ128rm            ,    X86::VPORrm                 },
+  { X86::VPORQZ128rr            ,    X86::VPORrr                 },
+  { X86::VPSADBWZ128rm          ,    X86::VPSADBWrm              },
+  { X86::VPSADBWZ128rr          ,    X86::VPSADBWrr              },
+  { X86::VPSHUFBZ128rm          ,    X86::VPSHUFBrm              },
+  { X86::VPSHUFBZ128rr          ,    X86::VPSHUFBrr              },
+  { X86::VPSHUFDZ128mi          ,    X86::VPSHUFDmi              },
+  { X86::VPSHUFDZ128ri          ,    X86::VPSHUFDri              },
+  { X86::VPSHUFHWZ128mi         ,    X86::VPSHUFHWmi             },
+  { X86::VPSHUFHWZ128ri         ,    X86::VPSHUFHWri             },
+  { X86::VPSHUFLWZ128mi         ,    X86::VPSHUFLWmi             },
+  { X86::VPSHUFLWZ128ri         ,    X86::VPSHUFLWri             },
+  { X86::VPSLLDQZ128rr          ,    X86::VPSLLDQri              },    
+  { X86::VPSLLDZ128ri           ,    X86::VPSLLDri               },
+  { X86::VPSLLDZ128rm           ,    X86::VPSLLDrm               },
+  { X86::VPSLLDZ128rr           ,    X86::VPSLLDrr               },    
+  { X86::VPSLLQZ128ri           ,    X86::VPSLLQri               },
+  { X86::VPSLLQZ128rm           ,    X86::VPSLLQrm               },
+  { X86::VPSLLQZ128rr           ,    X86::VPSLLQrr               },
+  { X86::VPSLLVDZ128rm          ,    X86::VPSLLVDrm              },
+  { X86::VPSLLVDZ128rr          ,    X86::VPSLLVDrr              },
+  { X86::VPSLLVQZ128rm          ,    X86::VPSLLVQrm              },
+  { X86::VPSLLVQZ128rr          ,    X86::VPSLLVQrr              },
+  { X86::VPSLLWZ128ri           ,    X86::VPSLLWri               },
+  { X86::VPSLLWZ128rm           ,    X86::VPSLLWrm               },
+  { X86::VPSLLWZ128rr           ,    X86::VPSLLWrr               },
+  { X86::VPSRADZ128ri           ,    X86::VPSRADri               },
+  { X86::VPSRADZ128rm           ,    X86::VPSRADrm               },
+  { X86::VPSRADZ128rr           ,    X86::VPSRADrr               },
+  { X86::VPSRAVDZ128rm          ,    X86::VPSRAVDrm              },
+  { X86::VPSRAVDZ128rr          ,    X86::VPSRAVDrr              },
+  { X86::VPSRAWZ128ri           ,    X86::VPSRAWri               },
+  { X86::VPSRAWZ128rm           ,    X86::VPSRAWrm               },
+  { X86::VPSRAWZ128rr           ,    X86::VPSRAWrr               },
+  { X86::VPSRLDQZ128rr          ,    X86::VPSRLDQri              },
+  { X86::VPSRLDZ128ri           ,    X86::VPSRLDri               },
+  { X86::VPSRLDZ128rm           ,    X86::VPSRLDrm               },
+  { X86::VPSRLDZ128rr           ,    X86::VPSRLDrr               },
+  { X86::VPSRLQZ128ri           ,    X86::VPSRLQri               },
+  { X86::VPSRLQZ128rm           ,    X86::VPSRLQrm               },
+  { X86::VPSRLQZ128rr           ,    X86::VPSRLQrr               },
+  { X86::VPSRLVDZ128rm          ,    X86::VPSRLVDrm              },
+  { X86::VPSRLVDZ128rr          ,    X86::VPSRLVDrr              },
+  { X86::VPSRLVQZ128rm          ,    X86::VPSRLVQrm              },
+  { X86::VPSRLVQZ128rr          ,    X86::VPSRLVQrr              },
+  { X86::VPSRLWZ128ri           ,    X86::VPSRLWri               },
+  { X86::VPSRLWZ128rm           ,    X86::VPSRLWrm               },
+  { X86::VPSRLWZ128rr           ,    X86::VPSRLWrr               },
+  { X86::VPSUBBZ128rm           ,    X86::VPSUBBrm               },
+  { X86::VPSUBBZ128rr           ,    X86::VPSUBBrr               },
+  { X86::VPSUBDZ128rm           ,    X86::VPSUBDrm               },
+  { X86::VPSUBDZ128rr           ,    X86::VPSUBDrr               },
+  { X86::VPSUBQZ128rm           ,    X86::VPSUBQrm               },
+  { X86::VPSUBQZ128rr           ,    X86::VPSUBQrr               },
+  { X86::VPSUBSBZ128rm          ,    X86::VPSUBSBrm              },
+  { X86::VPSUBSBZ128rr          ,    X86::VPSUBSBrr              },
+  { X86::VPSUBSWZ128rm          ,    X86::VPSUBSWrm              },
+  { X86::VPSUBSWZ128rr          ,    X86::VPSUBSWrr              },
+  { X86::VPSUBUSBZ128rm         ,    X86::VPSUBUSBrm             },
+  { X86::VPSUBUSBZ128rr         ,    X86::VPSUBUSBrr             },
+  { X86::VPSUBUSWZ128rm         ,    X86::VPSUBUSWrm             },
+  { X86::VPSUBUSWZ128rr         ,    X86::VPSUBUSWrr             },
+  { X86::VPSUBWZ128rm           ,    X86::VPSUBWrm               },
+  { X86::VPSUBWZ128rr           ,    X86::VPSUBWrr               },
+  { X86::VPUNPCKHBWZ128rm       ,    X86::VPUNPCKHBWrm           },
+  { X86::VPUNPCKHBWZ128rr       ,    X86::VPUNPCKHBWrr           },
+  { X86::VPUNPCKHDQZ128rm       ,    X86::VPUNPCKHDQrm           },
+  { X86::VPUNPCKHDQZ128rr       ,    X86::VPUNPCKHDQrr           },
+  { X86::VPUNPCKHQDQZ128rm      ,    X86::VPUNPCKHQDQrm          },
+  { X86::VPUNPCKHQDQZ128rr      ,    X86::VPUNPCKHQDQrr          },
+  { X86::VPUNPCKHWDZ128rm       ,    X86::VPUNPCKHWDrm           },
+  { X86::VPUNPCKHWDZ128rr       ,    X86::VPUNPCKHWDrr           },
+  { X86::VPUNPCKLBWZ128rm       ,    X86::VPUNPCKLBWrm           },
+  { X86::VPUNPCKLBWZ128rr       ,    X86::VPUNPCKLBWrr           },
+  { X86::VPUNPCKLDQZ128rm       ,    X86::VPUNPCKLDQrm           },
+  { X86::VPUNPCKLDQZ128rr       ,    X86::VPUNPCKLDQrr           },
+  { X86::VPUNPCKLQDQZ128rm      ,    X86::VPUNPCKLQDQrm          },
+  { X86::VPUNPCKLQDQZ128rr      ,    X86::VPUNPCKLQDQrr          },
+  { X86::VPUNPCKLWDZ128rm       ,    X86::VPUNPCKLWDrm           },
+  { X86::VPUNPCKLWDZ128rr       ,    X86::VPUNPCKLWDrr           },
+  { X86::VPXORDZ128rm           ,    X86::VPXORrm                },
+  { X86::VPXORDZ128rr           ,    X86::VPXORrr                },
+  { X86::VPXORQZ128rm           ,    X86::VPXORrm                },
+  { X86::VPXORQZ128rr           ,    X86::VPXORrr                },
+  { X86::VSHUFPDZ128rmi         ,    X86::VSHUFPDrmi             },
+  { X86::VSHUFPDZ128rri         ,    X86::VSHUFPDrri             },
+  { X86::VSHUFPSZ128rmi         ,    X86::VSHUFPSrmi             },
+  { X86::VSHUFPSZ128rri         ,    X86::VSHUFPSrri             },
+  { X86::VSQRTPDZ128m           ,    X86::VSQRTPDm               },
+  { X86::VSQRTPDZ128r           ,    X86::VSQRTPDr               },
+  { X86::VSQRTPSZ128m           ,    X86::VSQRTPSm               },
+  { X86::VSQRTPSZ128r           ,    X86::VSQRTPSr               },
+  { X86::VSUBPDZ128rm           ,    X86::VSUBPDrm               },
+  { X86::VSUBPDZ128rr           ,    X86::VSUBPDrr               },
+  { X86::VSUBPSZ128rm           ,    X86::VSUBPSrm               },
+  { X86::VSUBPSZ128rr           ,    X86::VSUBPSrr               },
+  { X86::VUNPCKHPDZ128rm        ,    X86::VUNPCKHPDrm            },
+  { X86::VUNPCKHPDZ128rr        ,    X86::VUNPCKHPDrr            },
+  { X86::VUNPCKHPSZ128rm        ,    X86::VUNPCKHPSrm            },
+  { X86::VUNPCKHPSZ128rr        ,    X86::VUNPCKHPSrr            },
+  { X86::VUNPCKLPDZ128rm        ,    X86::VUNPCKLPDrm            },
+  { X86::VUNPCKLPDZ128rr        ,    X86::VUNPCKLPDrr            },
+  { X86::VUNPCKLPSZ128rm        ,    X86::VUNPCKLPSrm            },
+  { X86::VUNPCKLPSZ128rr        ,    X86::VUNPCKLPSrr            },
+  { X86::VXORPDZ128rm           ,    X86::VXORPDrm               },
+  { X86::VXORPDZ128rr           ,    X86::VXORPDrr               },
+  { X86::VXORPSZ128rm           ,    X86::VXORPSrm               },
+  { X86::VXORPSZ128rr           ,    X86::VXORPSrr               },
+};
+
+
+// X86 EVEX encoded instructions that have a VEX 256 encoding
+// (table format: <EVEX opcode, VEX-256 opcode>).
+ static const X86EvexToVexCompressTableEntry 
+   X86EvexToVex256CompressTable[] = {
+  { X86::VADDPDZ256rm           ,     X86::VADDPDYrm             },
+  { X86::VADDPDZ256rr           ,     X86::VADDPDYrr             },
+  { X86::VADDPSZ256rm           ,     X86::VADDPSYrm             },
+  { X86::VADDPSZ256rr           ,     X86::VADDPSYrr             },
+  { X86::VANDNPDZ256rm          ,     X86::VANDNPDYrm            },
+  { X86::VANDNPDZ256rr          ,     X86::VANDNPDYrr            },
+  { X86::VANDNPSZ256rm          ,     X86::VANDNPSYrm            },
+  { X86::VANDNPSZ256rr          ,     X86::VANDNPSYrr            },
+  { X86::VANDPDZ256rm           ,     X86::VANDPDYrm             },
+  { X86::VANDPDZ256rr           ,     X86::VANDPDYrr             },
+  { X86::VANDPSZ256rm           ,     X86::VANDPSYrm             },
+  { X86::VANDPSZ256rr           ,     X86::VANDPSYrr             },
+  { X86::VBROADCASTSDZ256m      ,     X86::VBROADCASTSDYrm       }, 
+  { X86::VBROADCASTSDZ256r      ,     X86::VBROADCASTSDYrr       }, 
+  { X86::VBROADCASTSDZ256r_s    ,     X86::VBROADCASTSDYrr       }, 
+  { X86::VBROADCASTSSZ256m      ,     X86::VBROADCASTSSYrm       },
+  { X86::VBROADCASTSSZ256r      ,     X86::VBROADCASTSSYrr       }, 
+  { X86::VBROADCASTSSZ256r_s    ,     X86::VBROADCASTSSYrr       },
+  { X86::VCVTDQ2PDZ256rm        ,     X86::VCVTDQ2PDYrm          },
+  { X86::VCVTDQ2PDZ256rr        ,     X86::VCVTDQ2PDYrr          },
+  { X86::VCVTDQ2PSZ256rm        ,     X86::VCVTDQ2PSYrm          },
+  { X86::VCVTDQ2PSZ256rr        ,     X86::VCVTDQ2PSYrr          },
+  { X86::VCVTPD2DQZ256rm        ,     X86::VCVTPD2DQYrm          },
+  { X86::VCVTPD2DQZ256rr        ,     X86::VCVTPD2DQYrr          },
+  { X86::VCVTPD2PSZ256rm        ,     X86::VCVTPD2PSYrm          },
+  { X86::VCVTPD2PSZ256rr        ,     X86::VCVTPD2PSYrr          },
+  { X86::VCVTPH2PSZ256rm        ,     X86::VCVTPH2PSYrm          },
+  { X86::VCVTPH2PSZ256rr        ,     X86::VCVTPH2PSYrr          },
+  { X86::VCVTPS2DQZ256rm        ,     X86::VCVTPS2DQYrm          },
+  { X86::VCVTPS2DQZ256rr        ,     X86::VCVTPS2DQYrr          },
+  { X86::VCVTPS2PDZ256rm        ,     X86::VCVTPS2PDYrm          },
+  { X86::VCVTPS2PDZ256rr        ,     X86::VCVTPS2PDYrr          },
+  { X86::VCVTPS2PHZ256mr        ,     X86::VCVTPS2PHYmr          },
+  { X86::VCVTPS2PHZ256rr        ,     X86::VCVTPS2PHYrr          },
+  { X86::VCVTTPD2DQZ256rm       ,     X86::VCVTTPD2DQYrm         },
+  { X86::VCVTTPD2DQZ256rr       ,     X86::VCVTTPD2DQYrr         },
+  { X86::VCVTTPS2DQZ256rm       ,     X86::VCVTTPS2DQYrm         },
+  { X86::VCVTTPS2DQZ256rr       ,     X86::VCVTTPS2DQYrr         },
+  { X86::VDIVPDZ256rm           ,     X86::VDIVPDYrm             },
+  { X86::VDIVPDZ256rr           ,     X86::VDIVPDYrr             },
+  { X86::VDIVPSZ256rm           ,     X86::VDIVPSYrm             },
+  { X86::VDIVPSZ256rr           ,     X86::VDIVPSYrr             },
+  { X86::VFMADD132PDZ256m       ,     X86::VFMADD132PDYm         },
+  { X86::VFMADD132PDZ256r       ,     X86::VFMADD132PDYr         },
+  { X86::VFMADD132PSZ256m       ,     X86::VFMADD132PSYm         },
+  { X86::VFMADD132PSZ256r       ,     X86::VFMADD132PSYr         },
+  { X86::VFMADD213PDZ256m       ,     X86::VFMADD213PDYm         },
+  { X86::VFMADD213PDZ256r       ,     X86::VFMADD213PDYr         },
+  { X86::VFMADD213PSZ256m       ,     X86::VFMADD213PSYm         },
+  { X86::VFMADD213PSZ256r       ,     X86::VFMADD213PSYr         },
+  { X86::VFMADD231PDZ256m       ,     X86::VFMADD231PDYm         },
+  { X86::VFMADD231PDZ256r       ,     X86::VFMADD231PDYr         },
+  { X86::VFMADD231PSZ256m       ,     X86::VFMADD231PSYm         },
+  { X86::VFMADD231PSZ256r       ,     X86::VFMADD231PSYr         },
+  { X86::VFMADDSUB132PDZ256m    ,     X86::VFMADDSUB132PDYm      },
+  { X86::VFMADDSUB132PDZ256r    ,     X86::VFMADDSUB132PDYr      },
+  { X86::VFMADDSUB132PSZ256m    ,     X86::VFMADDSUB132PSYm      },
+  { X86::VFMADDSUB132PSZ256r    ,     X86::VFMADDSUB132PSYr      },
+  { X86::VFMADDSUB213PDZ256m    ,     X86::VFMADDSUB213PDYm      },
+  { X86::VFMADDSUB213PDZ256r    ,     X86::VFMADDSUB213PDYr      },
+  { X86::VFMADDSUB213PSZ256m    ,     X86::VFMADDSUB213PSYm      },
+  { X86::VFMADDSUB213PSZ256r    ,     X86::VFMADDSUB213PSYr      },
+  { X86::VFMADDSUB231PDZ256m    ,     X86::VFMADDSUB231PDYm      },
+  { X86::VFMADDSUB231PDZ256r    ,     X86::VFMADDSUB231PDYr      },
+  { X86::VFMADDSUB231PSZ256m    ,     X86::VFMADDSUB231PSYm      },
+  { X86::VFMADDSUB231PSZ256r    ,     X86::VFMADDSUB231PSYr      },
+  { X86::VFMSUB132PDZ256m       ,     X86::VFMSUB132PDYm         },
+  { X86::VFMSUB132PDZ256r       ,     X86::VFMSUB132PDYr         },
+  { X86::VFMSUB132PSZ256m       ,     X86::VFMSUB132PSYm         },
+  { X86::VFMSUB132PSZ256r       ,     X86::VFMSUB132PSYr         },
+  { X86::VFMSUB213PDZ256m       ,     X86::VFMSUB213PDYm         },
+  { X86::VFMSUB213PDZ256r       ,     X86::VFMSUB213PDYr         },
+  { X86::VFMSUB213PSZ256m       ,     X86::VFMSUB213PSYm         },
+  { X86::VFMSUB213PSZ256r       ,     X86::VFMSUB213PSYr         },
+  { X86::VFMSUB231PDZ256m       ,     X86::VFMSUB231PDYm         },
+  { X86::VFMSUB231PDZ256r       ,     X86::VFMSUB231PDYr         },
+  { X86::VFMSUB231PSZ256m       ,     X86::VFMSUB231PSYm         },
+  { X86::VFMSUB231PSZ256r       ,     X86::VFMSUB231PSYr         },
+  { X86::VFMSUBADD132PDZ256m    ,     X86::VFMSUBADD132PDYm      },
+  { X86::VFMSUBADD132PDZ256r    ,     X86::VFMSUBADD132PDYr      },
+  { X86::VFMSUBADD132PSZ256m    ,     X86::VFMSUBADD132PSYm      },
+  { X86::VFMSUBADD132PSZ256r    ,     X86::VFMSUBADD132PSYr      },
+  { X86::VFMSUBADD213PDZ256m    ,     X86::VFMSUBADD213PDYm      },
+  { X86::VFMSUBADD213PDZ256r    ,     X86::VFMSUBADD213PDYr      },
+  { X86::VFMSUBADD213PSZ256m    ,     X86::VFMSUBADD213PSYm      },
+  { X86::VFMSUBADD213PSZ256r    ,     X86::VFMSUBADD213PSYr      },
+  { X86::VFMSUBADD231PDZ256m    ,     X86::VFMSUBADD231PDYm      },
+  { X86::VFMSUBADD231PDZ256r    ,     X86::VFMSUBADD231PDYr      },
+  { X86::VFMSUBADD231PSZ256m    ,     X86::VFMSUBADD231PSYm      },
+  { X86::VFMSUBADD231PSZ256r    ,     X86::VFMSUBADD231PSYr      },
+  { X86::VFNMADD132PDZ256m      ,     X86::VFNMADD132PDYm        },
+  { X86::VFNMADD132PDZ256r      ,     X86::VFNMADD132PDYr        },
+  { X86::VFNMADD132PSZ256m      ,     X86::VFNMADD132PSYm        },
+  { X86::VFNMADD132PSZ256r      ,     X86::VFNMADD132PSYr        },
+  { X86::VFNMADD213PDZ256m      ,     X86::VFNMADD213PDYm        },
+  { X86::VFNMADD213PDZ256r      ,     X86::VFNMADD213PDYr        },
+  { X86::VFNMADD213PSZ256m      ,     X86::VFNMADD213PSYm        },
+  { X86::VFNMADD213PSZ256r      ,     X86::VFNMADD213PSYr        },
+  { X86::VFNMADD231PDZ256m      ,     X86::VFNMADD231PDYm        },
+  { X86::VFNMADD231PDZ256r      ,     X86::VFNMADD231PDYr        },
+  { X86::VFNMADD231PSZ256m      ,     X86::VFNMADD231PSYm        },
+  { X86::VFNMADD231PSZ256r      ,     X86::VFNMADD231PSYr        },
+  { X86::VFNMSUB132PDZ256m      ,     X86::VFNMSUB132PDYm        },
+  { X86::VFNMSUB132PDZ256r      ,     X86::VFNMSUB132PDYr        },
+  { X86::VFNMSUB132PSZ256m      ,     X86::VFNMSUB132PSYm        },
+  { X86::VFNMSUB132PSZ256r      ,     X86::VFNMSUB132PSYr        },
+  { X86::VFNMSUB213PDZ256m      ,     X86::VFNMSUB213PDYm        },
+  { X86::VFNMSUB213PDZ256r      ,     X86::VFNMSUB213PDYr        },
+  { X86::VFNMSUB213PSZ256m      ,     X86::VFNMSUB213PSYm        },
+  { X86::VFNMSUB213PSZ256r      ,     X86::VFNMSUB213PSYr        },
+  { X86::VFNMSUB231PDZ256m      ,     X86::VFNMSUB231PDYm        },
+  { X86::VFNMSUB231PDZ256r      ,     X86::VFNMSUB231PDYr        },
+  { X86::VFNMSUB231PSZ256m      ,     X86::VFNMSUB231PSYm        },
+  { X86::VFNMSUB231PSZ256r      ,     X86::VFNMSUB231PSYr        },
+  { X86::VMAXCPDZ256rm          ,     X86::VMAXCPDYrm            },
+  { X86::VMAXCPDZ256rr          ,     X86::VMAXCPDYrr            },
+  { X86::VMAXCPSZ256rm          ,     X86::VMAXCPSYrm            },
+  { X86::VMAXCPSZ256rr          ,     X86::VMAXCPSYrr            },
+  { X86::VMAXPDZ256rm           ,     X86::VMAXPDYrm             },
+  { X86::VMAXPDZ256rr           ,     X86::VMAXPDYrr             },
+  { X86::VMAXPSZ256rm           ,     X86::VMAXPSYrm             },
+  { X86::VMAXPSZ256rr           ,     X86::VMAXPSYrr             },
+  { X86::VMINCPDZ256rm          ,     X86::VMINCPDYrm            },
+  { X86::VMINCPDZ256rr          ,     X86::VMINCPDYrr            },
+  { X86::VMINCPSZ256rm          ,     X86::VMINCPSYrm            },
+  { X86::VMINCPSZ256rr          ,     X86::VMINCPSYrr            },
+  { X86::VMINPDZ256rm           ,     X86::VMINPDYrm             },
+  { X86::VMINPDZ256rr           ,     X86::VMINPDYrr             },
+  { X86::VMINPSZ256rm           ,     X86::VMINPSYrm             },
+  { X86::VMINPSZ256rr           ,     X86::VMINPSYrr             },
+  { X86::VMOVAPDZ256mr          ,     X86::VMOVAPDYmr            },
+  { X86::VMOVAPDZ256rm          ,     X86::VMOVAPDYrm            },
+  { X86::VMOVAPDZ256rr          ,     X86::VMOVAPDYrr            },
+  { X86::VMOVAPDZ256rr_REV      ,     X86::VMOVAPDYrr_REV        },
+  { X86::VMOVAPSZ256mr          ,     X86::VMOVAPSYmr            },    
+  { X86::VMOVAPSZ256rm          ,     X86::VMOVAPSYrm            },    
+  { X86::VMOVAPSZ256rr          ,     X86::VMOVAPSYrr            },
+  { X86::VMOVAPSZ256rr_REV      ,     X86::VMOVAPSYrr_REV        },
+  { X86::VMOVDDUPZ256rm         ,     X86::VMOVDDUPYrm           },
+  { X86::VMOVDDUPZ256rr         ,     X86::VMOVDDUPYrr           },
+  { X86::VMOVDQA32Z256mr        ,     X86::VMOVDQAYmr            },
+  { X86::VMOVDQA32Z256rm        ,     X86::VMOVDQAYrm            },
+  { X86::VMOVDQA32Z256rr        ,     X86::VMOVDQAYrr            },
+  { X86::VMOVDQA32Z256rr_REV    ,     X86::VMOVDQAYrr_REV        },
+  { X86::VMOVDQA64Z256mr        ,     X86::VMOVDQAYmr            },
+  { X86::VMOVDQA64Z256rm        ,     X86::VMOVDQAYrm            },
+  { X86::VMOVDQA64Z256rr        ,     X86::VMOVDQAYrr            },
+  { X86::VMOVDQA64Z256rr_REV    ,     X86::VMOVDQAYrr_REV        },
+  { X86::VMOVDQU16Z256mr        ,     X86::VMOVDQUYmr            },
+  { X86::VMOVDQU16Z256rm        ,     X86::VMOVDQUYrm            },
+  { X86::VMOVDQU16Z256rr        ,     X86::VMOVDQUYrr            },
+  { X86::VMOVDQU16Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
+  { X86::VMOVDQU32Z256mr        ,     X86::VMOVDQUYmr            },
+  { X86::VMOVDQU32Z256rm        ,     X86::VMOVDQUYrm            },
+  { X86::VMOVDQU32Z256rr        ,     X86::VMOVDQUYrr            },
+  { X86::VMOVDQU32Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
+  { X86::VMOVDQU64Z256mr        ,     X86::VMOVDQUYmr            },
+  { X86::VMOVDQU64Z256rm        ,     X86::VMOVDQUYrm            },
+  { X86::VMOVDQU64Z256rr        ,     X86::VMOVDQUYrr            },
+  { X86::VMOVDQU64Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
+  { X86::VMOVDQU8Z256mr         ,     X86::VMOVDQUYmr            },
+  { X86::VMOVDQU8Z256rm         ,     X86::VMOVDQUYrm            },
+  { X86::VMOVDQU8Z256rr         ,     X86::VMOVDQUYrr            },
+  { X86::VMOVDQU8Z256rr_REV     ,     X86::VMOVDQUYrr_REV        },
+  { X86::VMOVNTDQAZ256rm        ,     X86::VMOVNTDQAYrm          },
+  { X86::VMOVNTDQZ256mr         ,     X86::VMOVNTDQYmr           },
+  { X86::VMOVNTPDZ256mr         ,     X86::VMOVNTPDYmr           },
+  { X86::VMOVNTPSZ256mr         ,     X86::VMOVNTPSYmr           },
+  { X86::VMOVSHDUPZ256rm        ,     X86::VMOVSHDUPYrm          },
+  { X86::VMOVSHDUPZ256rr        ,     X86::VMOVSHDUPYrr          },
+  { X86::VMOVSLDUPZ256rm        ,     X86::VMOVSLDUPYrm          },
+  { X86::VMOVSLDUPZ256rr        ,     X86::VMOVSLDUPYrr          },
+  { X86::VMOVUPDZ256mr          ,     X86::VMOVUPDYmr            },
+  { X86::VMOVUPDZ256rm          ,     X86::VMOVUPDYrm            },
+  { X86::VMOVUPDZ256rr          ,     X86::VMOVUPDYrr            },
+  { X86::VMOVUPDZ256rr_REV      ,     X86::VMOVUPDYrr_REV        },
+  { X86::VMOVUPSZ256mr          ,     X86::VMOVUPSYmr            },
+  { X86::VMOVUPSZ256rm          ,     X86::VMOVUPSYrm            },
+  { X86::VMOVUPSZ256rr          ,     X86::VMOVUPSYrr            },
+  { X86::VMOVUPSZ256rr_REV      ,     X86::VMOVUPSYrr_REV        },
+  { X86::VMULPDZ256rm           ,     X86::VMULPDYrm             },
+  { X86::VMULPDZ256rr           ,     X86::VMULPDYrr             },
+  { X86::VMULPSZ256rm           ,     X86::VMULPSYrm             },
+  { X86::VMULPSZ256rr           ,     X86::VMULPSYrr             },
+  { X86::VORPDZ256rm            ,     X86::VORPDYrm              },
+  { X86::VORPDZ256rr            ,     X86::VORPDYrr              },
+  { X86::VORPSZ256rm            ,     X86::VORPSYrm              },
+  { X86::VORPSZ256rr            ,     X86::VORPSYrr              },
+  { X86::VPABSBZ256rm           ,     X86::VPABSBYrm             },
+  { X86::VPABSBZ256rr           ,     X86::VPABSBYrr             },
+  { X86::VPABSDZ256rm           ,     X86::VPABSDYrm             },
+  { X86::VPABSDZ256rr           ,     X86::VPABSDYrr             },
+  { X86::VPABSWZ256rm           ,     X86::VPABSWYrm             },
+  { X86::VPABSWZ256rr           ,     X86::VPABSWYrr             },
+  { X86::VPACKSSDWZ256rm        ,     X86::VPACKSSDWYrm          },
+  { X86::VPACKSSDWZ256rr        ,     X86::VPACKSSDWYrr          },
+  { X86::VPACKSSWBZ256rm        ,     X86::VPACKSSWBYrm          },
+  { X86::VPACKSSWBZ256rr        ,     X86::VPACKSSWBYrr          },
+  { X86::VPACKUSDWZ256rm        ,     X86::VPACKUSDWYrm          },
+  { X86::VPACKUSDWZ256rr        ,     X86::VPACKUSDWYrr          },
+  { X86::VPACKUSWBZ256rm        ,     X86::VPACKUSWBYrm          },
+  { X86::VPACKUSWBZ256rr        ,     X86::VPACKUSWBYrr          },
+  { X86::VPADDBZ256rm           ,     X86::VPADDBYrm             },
+  { X86::VPADDBZ256rr           ,     X86::VPADDBYrr             },
+  { X86::VPADDDZ256rm           ,     X86::VPADDDYrm             },
+  { X86::VPADDDZ256rr           ,     X86::VPADDDYrr             },
+  { X86::VPADDQZ256rm           ,     X86::VPADDQYrm             },
+  { X86::VPADDQZ256rr           ,     X86::VPADDQYrr             },
+  { X86::VPADDSBZ256rm          ,     X86::VPADDSBYrm            },
+  { X86::VPADDSBZ256rr          ,     X86::VPADDSBYrr            },
+  { X86::VPADDSWZ256rm          ,     X86::VPADDSWYrm            },
+  { X86::VPADDSWZ256rr          ,     X86::VPADDSWYrr            },
+  { X86::VPADDUSBZ256rm         ,     X86::VPADDUSBYrm           },
+  { X86::VPADDUSBZ256rr         ,     X86::VPADDUSBYrr           },
+  { X86::VPADDUSWZ256rm         ,     X86::VPADDUSWYrm           },
+  { X86::VPADDUSWZ256rr         ,     X86::VPADDUSWYrr           },
+  { X86::VPADDWZ256rm           ,     X86::VPADDWYrm             },
+  { X86::VPADDWZ256rr           ,     X86::VPADDWYrr             },
+  { X86::VPALIGNRZ256rmi        ,     X86::VPALIGNRYrmi          },
+  { X86::VPALIGNRZ256rri        ,     X86::VPALIGNRYrri          },
+  { X86::VPANDDZ256rm           ,     X86::VPANDYrm              },
+  { X86::VPANDDZ256rr           ,     X86::VPANDYrr              },
+  { X86::VPANDQZ256rm           ,     X86::VPANDYrm              },
+  { X86::VPANDQZ256rr           ,     X86::VPANDYrr              },
+  { X86::VPAVGBZ256rm           ,     X86::VPAVGBYrm             },
+  { X86::VPAVGBZ256rr           ,     X86::VPAVGBYrr             },
+  { X86::VPAVGWZ256rm           ,     X86::VPAVGWYrm             },
+  { X86::VPAVGWZ256rr           ,     X86::VPAVGWYrr             },
+  { X86::VPBROADCASTBZ256m      ,     X86::VPBROADCASTBYrm       }, 
+  { X86::VPBROADCASTBZ256r      ,     X86::VPBROADCASTBYrr       },   
+  { X86::VPBROADCASTDZ256m      ,     X86::VPBROADCASTDYrm       }, 
+  { X86::VPBROADCASTDZ256r      ,     X86::VPBROADCASTDYrr       },   
+  { X86::VPBROADCASTQZ256m      ,     X86::VPBROADCASTQYrm       }, 
+  { X86::VPBROADCASTQZ256r      ,     X86::VPBROADCASTQYrr       }, 
+  { X86::VPBROADCASTWZ256m      ,     X86::VPBROADCASTWYrm       }, 
+  { X86::VPBROADCASTWZ256r      ,     X86::VPBROADCASTWYrr       }, 
+  { X86::VPERMDZ256rm           ,     X86::VPERMDYrm             },
+  { X86::VPERMDZ256rr           ,     X86::VPERMDYrr             },
+  { X86::VPERMILPDZ256mi        ,     X86::VPERMILPDYmi          },
+  { X86::VPERMILPDZ256ri        ,     X86::VPERMILPDYri          },
+  { X86::VPERMILPDZ256rm        ,     X86::VPERMILPDYrm          },
+  { X86::VPERMILPDZ256rr        ,     X86::VPERMILPDYrr          },
+  { X86::VPERMILPSZ256mi        ,     X86::VPERMILPSYmi          },
+  { X86::VPERMILPSZ256ri        ,     X86::VPERMILPSYri          },
+  { X86::VPERMILPSZ256rm        ,     X86::VPERMILPSYrm          },
+  { X86::VPERMILPSZ256rr        ,     X86::VPERMILPSYrr          },
+  { X86::VPERMPDZ256mi          ,     X86::VPERMPDYmi            },
+  { X86::VPERMPDZ256ri          ,     X86::VPERMPDYri            },
+  { X86::VPERMPSZ256rm          ,     X86::VPERMPSYrm            },
+  { X86::VPERMPSZ256rr          ,     X86::VPERMPSYrr            },
+  { X86::VPERMQZ256mi           ,     X86::VPERMQYmi             },
+  { X86::VPERMQZ256ri           ,     X86::VPERMQYri             },
+  { X86::VPMADDUBSWZ256rm       ,     X86::VPMADDUBSWYrm         },
+  { X86::VPMADDUBSWZ256rr       ,     X86::VPMADDUBSWYrr         },
+  { X86::VPMADDWDZ256rm         ,     X86::VPMADDWDYrm           },
+  { X86::VPMADDWDZ256rr         ,     X86::VPMADDWDYrr           },
+  { X86::VPMAXSBZ256rm          ,     X86::VPMAXSBYrm            },
+  { X86::VPMAXSBZ256rr          ,     X86::VPMAXSBYrr            },
+  { X86::VPMAXSDZ256rm          ,     X86::VPMAXSDYrm            },
+  { X86::VPMAXSDZ256rr          ,     X86::VPMAXSDYrr            },
+  { X86::VPMAXSWZ256rm          ,     X86::VPMAXSWYrm            },
+  { X86::VPMAXSWZ256rr          ,     X86::VPMAXSWYrr            },
+  { X86::VPMAXUBZ256rm          ,     X86::VPMAXUBYrm            },
+  { X86::VPMAXUBZ256rr          ,     X86::VPMAXUBYrr            },
+  { X86::VPMAXUDZ256rm          ,     X86::VPMAXUDYrm            },
+  { X86::VPMAXUDZ256rr          ,     X86::VPMAXUDYrr            },
+  { X86::VPMAXUWZ256rm          ,     X86::VPMAXUWYrm            },
+  { X86::VPMAXUWZ256rr          ,     X86::VPMAXUWYrr            },
+  { X86::VPMINSBZ256rm          ,     X86::VPMINSBYrm            },
+  { X86::VPMINSBZ256rr          ,     X86::VPMINSBYrr            },
+  { X86::VPMINSDZ256rm          ,     X86::VPMINSDYrm            },
+  { X86::VPMINSDZ256rr          ,     X86::VPMINSDYrr            },
+  { X86::VPMINSWZ256rm          ,     X86::VPMINSWYrm            },
+  { X86::VPMINSWZ256rr          ,     X86::VPMINSWYrr            },
+  { X86::VPMINUBZ256rm          ,     X86::VPMINUBYrm            },
+  { X86::VPMINUBZ256rr          ,     X86::VPMINUBYrr            },
+  { X86::VPMINUDZ256rm          ,     X86::VPMINUDYrm            },
+  { X86::VPMINUDZ256rr          ,     X86::VPMINUDYrr            },
+  { X86::VPMINUWZ256rm          ,     X86::VPMINUWYrm            },
+  { X86::VPMINUWZ256rr          ,     X86::VPMINUWYrr            },
+  { X86::VPMOVSXBDZ256rm        ,     X86::VPMOVSXBDYrm          },
+  { X86::VPMOVSXBDZ256rr        ,     X86::VPMOVSXBDYrr          },
+  { X86::VPMOVSXBQZ256rm        ,     X86::VPMOVSXBQYrm          },
+  { X86::VPMOVSXBQZ256rr        ,     X86::VPMOVSXBQYrr          },
+  { X86::VPMOVSXBWZ256rm        ,     X86::VPMOVSXBWYrm          },
+  { X86::VPMOVSXBWZ256rr        ,     X86::VPMOVSXBWYrr          },
+  { X86::VPMOVSXDQZ256rm        ,     X86::VPMOVSXDQYrm          },
+  { X86::VPMOVSXDQZ256rr        ,     X86::VPMOVSXDQYrr          },
+  { X86::VPMOVSXWDZ256rm        ,     X86::VPMOVSXWDYrm          },
+  { X86::VPMOVSXWDZ256rr        ,     X86::VPMOVSXWDYrr          },
+  { X86::VPMOVSXWQZ256rm        ,     X86::VPMOVSXWQYrm          },
+  { X86::VPMOVSXWQZ256rr        ,     X86::VPMOVSXWQYrr          },
+  { X86::VPMOVZXBDZ256rm        ,     X86::VPMOVZXBDYrm          },
+  { X86::VPMOVZXBDZ256rr        ,     X86::VPMOVZXBDYrr          },
+  { X86::VPMOVZXBQZ256rm        ,     X86::VPMOVZXBQYrm          },
+  { X86::VPMOVZXBQZ256rr        ,     X86::VPMOVZXBQYrr          },
+  { X86::VPMOVZXBWZ256rm        ,     X86::VPMOVZXBWYrm          },
+  { X86::VPMOVZXBWZ256rr        ,     X86::VPMOVZXBWYrr          },
+  { X86::VPMOVZXDQZ256rm        ,     X86::VPMOVZXDQYrm          },
+  { X86::VPMOVZXDQZ256rr        ,     X86::VPMOVZXDQYrr          },
+  { X86::VPMOVZXWDZ256rm        ,     X86::VPMOVZXWDYrm          },
+  { X86::VPMOVZXWDZ256rr        ,     X86::VPMOVZXWDYrr          },
+  { X86::VPMOVZXWQZ256rm        ,     X86::VPMOVZXWQYrm          },
+  { X86::VPMOVZXWQZ256rr        ,     X86::VPMOVZXWQYrr          },
+  { X86::VPMULDQZ256rm          ,     X86::VPMULDQYrm            },
+  { X86::VPMULDQZ256rr          ,     X86::VPMULDQYrr            },
+  { X86::VPMULHRSWZ256rm        ,     X86::VPMULHRSWYrm          },
+  { X86::VPMULHRSWZ256rr        ,     X86::VPMULHRSWYrr          },
+  { X86::VPMULHUWZ256rm         ,     X86::VPMULHUWYrm           },
+  { X86::VPMULHUWZ256rr         ,     X86::VPMULHUWYrr           },
+  { X86::VPMULHWZ256rm          ,     X86::VPMULHWYrm            },
+  { X86::VPMULHWZ256rr          ,     X86::VPMULHWYrr            },
+  { X86::VPMULLDZ256rm          ,     X86::VPMULLDYrm            },
+  { X86::VPMULLDZ256rr          ,     X86::VPMULLDYrr            },
+  { X86::VPMULLWZ256rm          ,     X86::VPMULLWYrm            },
+  { X86::VPMULLWZ256rr          ,     X86::VPMULLWYrr            },
+  { X86::VPMULUDQZ256rm         ,     X86::VPMULUDQYrm           },
+  { X86::VPMULUDQZ256rr         ,     X86::VPMULUDQYrr           },
+  { X86::VPORDZ256rm            ,     X86::VPORYrm               },
+  { X86::VPORDZ256rr            ,     X86::VPORYrr               },
+  { X86::VPORQZ256rm            ,     X86::VPORYrm               },
+  { X86::VPORQZ256rr            ,     X86::VPORYrr               },
+  { X86::VPSADBWZ256rm          ,     X86::VPSADBWYrm            },
+  { X86::VPSADBWZ256rr          ,     X86::VPSADBWYrr            },
+  { X86::VPSHUFBZ256rm          ,     X86::VPSHUFBYrm            },
+  { X86::VPSHUFBZ256rr          ,     X86::VPSHUFBYrr            },
+  { X86::VPSHUFDZ256mi          ,     X86::VPSHUFDYmi            },
+  { X86::VPSHUFDZ256ri          ,     X86::VPSHUFDYri            },
+  { X86::VPSHUFHWZ256mi         ,     X86::VPSHUFHWYmi           },
+  { X86::VPSHUFHWZ256ri         ,     X86::VPSHUFHWYri           },
+  { X86::VPSHUFLWZ256mi         ,     X86::VPSHUFLWYmi           },
+  { X86::VPSHUFLWZ256ri         ,     X86::VPSHUFLWYri           },
+  { X86::VPSLLDQZ256rr          ,     X86::VPSLLDQYri            },
+  { X86::VPSLLDZ256ri           ,     X86::VPSLLDYri             },
+  { X86::VPSLLDZ256rm           ,     X86::VPSLLDYrm             },
+  { X86::VPSLLDZ256rr           ,     X86::VPSLLDYrr             },    
+  { X86::VPSLLQZ256ri           ,     X86::VPSLLQYri             },
+  { X86::VPSLLQZ256rm           ,     X86::VPSLLQYrm             },
+  { X86::VPSLLQZ256rr           ,     X86::VPSLLQYrr             },
+  { X86::VPSLLVDZ256rm          ,     X86::VPSLLVDYrm            },
+  { X86::VPSLLVDZ256rr          ,     X86::VPSLLVDYrr            },
+  { X86::VPSLLVQZ256rm          ,     X86::VPSLLVQYrm            },
+  { X86::VPSLLVQZ256rr          ,     X86::VPSLLVQYrr            },
+  { X86::VPSLLWZ256ri           ,     X86::VPSLLWYri             },
+  { X86::VPSLLWZ256rm           ,     X86::VPSLLWYrm             },
+  { X86::VPSLLWZ256rr           ,     X86::VPSLLWYrr             },    
+  { X86::VPSRADZ256ri           ,     X86::VPSRADYri             },
+  { X86::VPSRADZ256rm           ,     X86::VPSRADYrm             },
+  { X86::VPSRADZ256rr           ,     X86::VPSRADYrr             },
+  { X86::VPSRAVDZ256rm          ,     X86::VPSRAVDYrm            },
+  { X86::VPSRAVDZ256rr          ,     X86::VPSRAVDYrr            },
+  { X86::VPSRAWZ256ri           ,     X86::VPSRAWYri             },
+  { X86::VPSRAWZ256rm           ,     X86::VPSRAWYrm             },
+  { X86::VPSRAWZ256rr           ,     X86::VPSRAWYrr             },
+  { X86::VPSRLDQZ256rr          ,     X86::VPSRLDQYri            },
+  { X86::VPSRLDZ256ri           ,     X86::VPSRLDYri             },
+  { X86::VPSRLDZ256rm           ,     X86::VPSRLDYrm             },
+  { X86::VPSRLDZ256rr           ,     X86::VPSRLDYrr             },   
+  { X86::VPSRLQZ256ri           ,     X86::VPSRLQYri             },
+  { X86::VPSRLQZ256rm           ,     X86::VPSRLQYrm             },
+  { X86::VPSRLQZ256rr           ,     X86::VPSRLQYrr             },
+  { X86::VPSRLVDZ256rm          ,     X86::VPSRLVDYrm            },
+  { X86::VPSRLVDZ256rr          ,     X86::VPSRLVDYrr            },
+  { X86::VPSRLVQZ256rm          ,     X86::VPSRLVQYrm            },
+  { X86::VPSRLVQZ256rr          ,     X86::VPSRLVQYrr            },
+  { X86::VPSRLWZ256ri           ,     X86::VPSRLWYri             },
+  { X86::VPSRLWZ256rm           ,     X86::VPSRLWYrm             },
+  { X86::VPSRLWZ256rr           ,     X86::VPSRLWYrr             },
+  { X86::VPSUBBZ256rm           ,     X86::VPSUBBYrm             },
+  { X86::VPSUBBZ256rr           ,     X86::VPSUBBYrr             },
+  { X86::VPSUBDZ256rm           ,     X86::VPSUBDYrm             },
+  { X86::VPSUBDZ256rr           ,     X86::VPSUBDYrr             },
+  { X86::VPSUBQZ256rm           ,     X86::VPSUBQYrm             },
+  { X86::VPSUBQZ256rr           ,     X86::VPSUBQYrr             },
+  { X86::VPSUBSBZ256rm          ,     X86::VPSUBSBYrm            },
+  { X86::VPSUBSBZ256rr          ,     X86::VPSUBSBYrr            },
+  { X86::VPSUBSWZ256rm          ,     X86::VPSUBSWYrm            },
+  { X86::VPSUBSWZ256rr          ,     X86::VPSUBSWYrr            },
+  { X86::VPSUBUSBZ256rm         ,     X86::VPSUBUSBYrm           },
+  { X86::VPSUBUSBZ256rr         ,     X86::VPSUBUSBYrr           },
+  { X86::VPSUBUSWZ256rm         ,     X86::VPSUBUSWYrm           },
+  { X86::VPSUBUSWZ256rr         ,     X86::VPSUBUSWYrr           },
+  { X86::VPSUBWZ256rm           ,     X86::VPSUBWYrm             },
+  { X86::VPSUBWZ256rr           ,     X86::VPSUBWYrr             },
+  { X86::VPUNPCKHBWZ256rm       ,     X86::VPUNPCKHBWYrm         },
+  { X86::VPUNPCKHBWZ256rr       ,     X86::VPUNPCKHBWYrr         },
+  { X86::VPUNPCKHDQZ256rm       ,     X86::VPUNPCKHDQYrm         },
+  { X86::VPUNPCKHDQZ256rr       ,     X86::VPUNPCKHDQYrr         },
+  { X86::VPUNPCKHQDQZ256rm      ,     X86::VPUNPCKHQDQYrm        },
+  { X86::VPUNPCKHQDQZ256rr      ,     X86::VPUNPCKHQDQYrr        },
+  { X86::VPUNPCKHWDZ256rm       ,     X86::VPUNPCKHWDYrm         },
+  { X86::VPUNPCKHWDZ256rr       ,     X86::VPUNPCKHWDYrr         },
+  { X86::VPUNPCKLBWZ256rm       ,     X86::VPUNPCKLBWYrm         },
+  { X86::VPUNPCKLBWZ256rr       ,     X86::VPUNPCKLBWYrr         },
+  { X86::VPUNPCKLDQZ256rm       ,     X86::VPUNPCKLDQYrm         },
+  { X86::VPUNPCKLDQZ256rr       ,     X86::VPUNPCKLDQYrr         },
+  { X86::VPUNPCKLQDQZ256rm      ,     X86::VPUNPCKLQDQYrm        },
+  { X86::VPUNPCKLQDQZ256rr      ,     X86::VPUNPCKLQDQYrr        },
+  { X86::VPUNPCKLWDZ256rm       ,     X86::VPUNPCKLWDYrm         },
+  { X86::VPUNPCKLWDZ256rr       ,     X86::VPUNPCKLWDYrr         },
+  { X86::VPXORDZ256rm           ,     X86::VPXORYrm              },
+  { X86::VPXORDZ256rr           ,     X86::VPXORYrr              },
+  { X86::VPXORQZ256rm           ,     X86::VPXORYrm              },
+  { X86::VPXORQZ256rr           ,     X86::VPXORYrr              },
+  { X86::VSHUFPDZ256rmi         ,     X86::VSHUFPDYrmi           },
+  { X86::VSHUFPDZ256rri         ,     X86::VSHUFPDYrri           },
+  { X86::VSHUFPSZ256rmi         ,     X86::VSHUFPSYrmi           },
+  { X86::VSHUFPSZ256rri         ,     X86::VSHUFPSYrri           },
+  { X86::VSQRTPDZ256m           ,     X86::VSQRTPDYm             },
+  { X86::VSQRTPDZ256r           ,     X86::VSQRTPDYr             },
+  { X86::VSQRTPSZ256m           ,     X86::VSQRTPSYm             },
+  { X86::VSQRTPSZ256r           ,     X86::VSQRTPSYr             },
+  { X86::VSUBPDZ256rm           ,     X86::VSUBPDYrm             },
+  { X86::VSUBPDZ256rr           ,     X86::VSUBPDYrr             },
+  { X86::VSUBPSZ256rm           ,     X86::VSUBPSYrm             },
+  { X86::VSUBPSZ256rr           ,     X86::VSUBPSYrr             },
+  { X86::VUNPCKHPDZ256rm        ,     X86::VUNPCKHPDYrm          },
+  { X86::VUNPCKHPDZ256rr        ,     X86::VUNPCKHPDYrr          },
+  { X86::VUNPCKHPSZ256rm        ,     X86::VUNPCKHPSYrm          },
+  { X86::VUNPCKHPSZ256rr        ,     X86::VUNPCKHPSYrr          },
+  { X86::VUNPCKLPDZ256rm        ,     X86::VUNPCKLPDYrm          },
+  { X86::VUNPCKLPDZ256rr        ,     X86::VUNPCKLPDYrr          },
+  { X86::VUNPCKLPSZ256rm        ,     X86::VUNPCKLPSYrm          },
+  { X86::VUNPCKLPSZ256rr        ,     X86::VUNPCKLPSYrr          },
+  { X86::VXORPDZ256rm           ,     X86::VXORPDYrm             },
+  { X86::VXORPDZ256rr           ,     X86::VXORPDYrr             },
+  { X86::VXORPSZ256rm           ,     X86::VXORPSYrm             },
+  { X86::VXORPSZ256rr           ,     X86::VXORPSYrr             },
+};
+
+#endif
+\ No newline at end of file
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 000000000000..2ea27a934b47
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+               Requires<[Not64BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+               Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[Not64BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmclear\t$vmcs", []>, PD;
+// OF 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmptrld\t$vmcs", []>, PS;
+def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
+  "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+  "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 000000000000..2b296e1e5b85
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,427 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+}
+
+// Scalar load 2 addr operand instructions
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     Operand memop, ComplexPattern mem_cpat> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
+}
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+}
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
+  def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                           ssmem, sse_load_f32>;
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>;
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                           sdmem, sse_load_f64>;
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>;
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>;
+}
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128> {
+  def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>,
+           XOP, Sched<[WriteVarVecShift]>;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1),
+                             (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+           XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+  def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
+           (ins i128mem:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+                             (vt128 VR128:$src2))))]>,
+             XOP, Sched<[WriteVarVecShift, ReadAfterLd]>;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>,
+               XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>;
+  defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>;
+  defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>;
+  defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>;
+  defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
+  defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
+  defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
+  defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>;
+  defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>;
+  defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>;
+  defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>;
+  defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>;
+}
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                     ValueType vt128> {
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, u8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins i128mem:$src1, u8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>;
+  defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>;
+  defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>;
+  defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>;
+}
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  let isCommutable = 1 in
+  def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V;
+  def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+              VR128:$src3))]>, XOP_4V;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+  defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+  defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+  defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+  defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+  defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+  defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+  defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+  defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+  defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+  defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+  defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
+
+// Instruction where second source can be memory, third must be imm8
+multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
+  let isCommutable = 1 in
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, XOPCC:$cc),
+           !strconcat("vpcom${cc}", Suffix,
+           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                             i8immZExt3:$cc)))]>,
+           XOP_4V;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
+           !strconcat("vpcom${cc}", Suffix,
+           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1),
+                             (vt128 (bitconvert (loadv2i64 addr:$src2))),
+                              i8immZExt3:$cc)))]>,
+           XOP_4V;
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+                 !strconcat("vpcom", Suffix,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                 []>, XOP_4V;
+    let mayLoad = 1 in
+    def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+                 (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+                 !strconcat("vpcom", Suffix,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                 []>, XOP_4V;
+  }
+}
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+  defm VPCOMB  : xopvpcom<0xCC, "b", X86vpcom, v16i8>;
+  defm VPCOMW  : xopvpcom<0xCD, "w", X86vpcom, v8i16>;
+  defm VPCOMD  : xopvpcom<0xCE, "d", X86vpcom, v4i32>;
+  defm VPCOMQ  : xopvpcom<0xCF, "q", X86vpcom, v2i64>;
+  defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>;
+  defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>;
+  defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>;
+  defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>;
+}
+
+multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128> {
+  def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, VR128:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                             (vt128 VR128:$src3))))]>,
+            XOP_4V;
+  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst,
+              (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                             (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+            XOP_4V, VEX_W;
+  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+            (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst,
+              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+                             (vt128 VR128:$src3))))]>,
+            XOP_4V;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
+                (ins VR128:$src1, VR128:$src2, VR128:$src3),
+                !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                []>, XOP_4V, VEX_W;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8>;
+}
+
+// Instruction where either second or third source can be memory
+multiclass xop4op_int<bits<8> opc, string OpcodeStr,
+                      Intrinsic Int128, Intrinsic Int256> {
+  // 128-bit Instruction
+  def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, VR128:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
+            XOP_4V;
+  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst,
+              (Int128 VR128:$src1, VR128:$src2,
+               (bitconvert (loadv2i64 addr:$src3))))]>,
+            XOP_4V, VEX_W;
+  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
+            (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            [(set VR128:$dst,
+              (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+               VR128:$src3))]>,
+            XOP_4V;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, VR128:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            []>, XOP_4V, VEX_W;
+
+  // 256-bit Instruction
+  def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, VR256:$src3),
+             !strconcat(OpcodeStr,
+             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+             [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
+             XOP_4V, VEX_L;
+  def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+             !strconcat(OpcodeStr,
+             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+             [(set VR256:$dst,
+               (Int256 VR256:$src1, VR256:$src2,
+               (bitconvert (loadv4i64 addr:$src3))))]>,
+             XOP_4V, VEX_W, VEX_L;
+  def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+             !strconcat(OpcodeStr,
+             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+             [(set VR256:$dst,
+               (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
+                VR256:$src3))]>,
+             XOP_4V, VEX_L;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
+            (ins VR256:$src1, VR256:$src2, VR256:$src3),
+            !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+            []>, XOP_4V, VEX_W, VEX_L;
+}
+
+let ExeDomain = SSEPackedInt in {
+  defm VPCMOV : xop4op_int<0xA2, "vpcmov",
+                           int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+}
+
+let Predicates = [HasXOP] in {
+  def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+                       (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+  def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+                       (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
+
+multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128, ValueType vt256,
+                  ValueType id128, ValueType id256,
+                  PatFrag ld_128, PatFrag ld_256> {
+  def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
+  def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                          (id128 (bitconvert (loadv2i64 addr:$src3))),
+                          (i8 imm:$src4))))]>,
+        VEX_W;
+  def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+        (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (vt128 (OpNode (vt128 VR128:$src1),
+                          (vt128 (bitconvert (ld_128 addr:$src2))),
+                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        []>, VEX_W;
+
+  def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
+  def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+                          (id256 (bitconvert (loadv4i64 addr:$src3))),
+                          (i8 imm:$src4))))]>, VEX_W, VEX_L;
+  def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+        (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+           (vt256 (OpNode (vt256 VR256:$src1),
+                          (vt256 (bitconvert (ld_256 addr:$src2))),
+                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+  def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        []>, VEX_W, VEX_L;
+}
+
+let ExeDomain = SSEPackedDouble in
+  defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
+                           v2i64, v4i64, loadv2f64, loadv4f64>;
+
+let ExeDomain = SSEPackedSingle in
+  defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
+                           v4i32, v8i32, loadv4f32, loadv8f32>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
new file mode 100644
index 000000000000..d9edf4676faf
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -0,0 +1,221 @@
+//===--------- X86InterleavedAccess.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the X86 implementation of the interleaved accesses
+/// optimization generating X86-specific instructions/intrinsics for
+/// interleaved access groups.
+///
+//===--------------------------------------------------------------------===//
+
+#include "X86ISelLowering.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+/// \brief This class holds necessary information to represent an interleaved
+/// access group and supports utilities to lower the group into
+/// X86-specific instructions/intrinsics.
+///  E.g. A group of interleaving access loads (Factor = 2; accessing every
+///       other element)
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
+///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+
+class X86InterleavedAccessGroup {
+  /// \brief Reference to the wide-load instruction of an interleaved access
+  /// group.
+  Instruction *const Inst;
+
+  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
+  ArrayRef<ShuffleVectorInst *> Shuffles;
+
+  /// \brief Reference to the starting index of each user-shuffle.
+  ArrayRef<unsigned> Indices;
+
+  /// \brief Reference to the interleaving stride in terms of elements.
+  const unsigned Factor;
+
+  /// \brief Reference to the underlying target.
+  const X86Subtarget &Subtarget;
+
+  const DataLayout &DL;
+
+  IRBuilder<> &Builder;
+
+  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
+  /// sub vectors of type \p T. Returns true and the sub-vectors in
+  /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
+  bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+                 SmallVectorImpl<Instruction *> &DecomposedVectors);
+
+  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
+  /// returns the transposed-vectors in \p TransposedVectors.
+  /// E.g.
+  /// InputVectors:
+  ///   In-V0 = p1, p2, p3, p4
+  ///   In-V1 = q1, q2, q3, q4
+  ///   In-V2 = r1, r2, r3, r4
+  ///   In-V3 = s1, s2, s3, s4
+  /// OutputVectors:
+  ///   Out-V0 = p1, q1, r1, s1
+  ///   Out-V1 = p2, q2, r2, s2
+  ///   Out-V2 = p3, q3, r3, s3
+  ///   Out-V3 = P4, q4, r4, s4
+  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
+                     SmallVectorImpl<Value *> &TrasposedVectors);
+
+public:
+  /// In order to form an interleaved access group X86InterleavedAccessGroup
+  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
+  /// \p Shuffs, reference to the first indices of each interleaved-vector
+  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
+  /// X86-specific instructions/intrinsics it also requires the underlying
+  /// target information \p STarget.
+  explicit X86InterleavedAccessGroup(Instruction *I,
+                                     ArrayRef<ShuffleVectorInst *> Shuffs,
+                                     ArrayRef<unsigned> Ind,
+                                     const unsigned F,
+                                     const X86Subtarget &STarget,
+                                     IRBuilder<> &B)
+      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
+        DL(Inst->getModule()->getDataLayout()), Builder(B) {}
+
+  /// \brief Returns true if this interleaved access group can be lowered into
+  /// x86-specific instructions/intrinsics, false otherwise.
+  bool isSupported() const;
+
+  /// \brief Lowers this interleaved access group into X86-specific
+  /// instructions/intrinsics.
+  bool lowerIntoOptimizedSequence();
+};
+
+bool X86InterleavedAccessGroup::isSupported() const {
+  VectorType *ShuffleVecTy = Shuffles[0]->getType();
+  uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
+  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+
+  if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
+    return false;
+
+  // Currently, lowering is supported for 64 bits on AVX.
+  if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+      DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
+    return false;
+
+  return true;
+}
+
+bool X86InterleavedAccessGroup::decompose(
+    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+    SmallVectorImpl<Instruction *> &DecomposedVectors) {
+  Type *VecTy = VecInst->getType();
+  (void)VecTy;
+  assert(VecTy->isVectorTy() &&
+         DL.getTypeSizeInBits(VecTy) >=
+             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
+         "Invalid Inst-size!!!");
+  assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
+         "Element type mismatched!!!");
+
+  if (!isa<LoadInst>(VecInst))
+    return false;
+
+  LoadInst *LI = cast<LoadInst>(VecInst);
+  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
+
+  Value *VecBasePtr =
+      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
+
+  // Generate N loads of T type
+  for (unsigned i = 0; i < NumSubVectors; i++) {
+    // TODO: Support inbounds GEP
+    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
+    Instruction *NewLoad =
+        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
+    DecomposedVectors.push_back(NewLoad);
+  }
+
+  return true;
+}
+
+void X86InterleavedAccessGroup::transpose_4x4(
+    ArrayRef<Instruction *> Matrix,
+    SmallVectorImpl<Value *> &TransposedMatrix) {
+  assert(Matrix.size() == 4 && "Invalid matrix size");
+  TransposedMatrix.resize(4);
+
+  // dst = src1[0,1],src2[0,1]
+  uint32_t IntMask1[] = {0, 1, 4, 5};
+  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
+  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+  // dst = src1[2,3],src2[2,3]
+  uint32_t IntMask2[] = {2, 3, 6, 7};
+  Mask = makeArrayRef(IntMask2, 4);
+  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
+  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
+
+  // dst = src1[0],src2[0],src1[2],src2[2]
+  uint32_t IntMask3[] = {0, 4, 2, 6};
+  Mask = makeArrayRef(IntMask3, 4);
+  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+
+  // dst = src1[1],src2[1],src1[3],src2[3]
+  uint32_t IntMask4[] = {1, 5, 3, 7};
+  Mask = makeArrayRef(IntMask4, 4);
+  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
+  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
+}
+
+// Lowers this interleaved access group into X86-specific
+// instructions/intrinsics.
+bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
+  SmallVector<Instruction *, 4> DecomposedVectors;
+  VectorType *VecTy = Shuffles[0]->getType();
+  // Try to generate target-sized register(/instruction).
+  if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
+    return false;
+
+  SmallVector<Value *, 4> TransposedVectors;
+  // Perform matrix-transposition in order to compute interleaved
+  // results by generating some sort of (optimized) target-specific
+  // instructions.
+  transpose_4x4(DecomposedVectors, TransposedVectors);
+
+  // Now replace the unoptimized-interleaved-vectors with the
+  // transposed-interleaved vectors.
+  for (unsigned i = 0; i < Shuffles.size(); i++)
+    Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+  return true;
+}
+
+// Lower interleaved load(s) into target specific instructions/
+// intrinsics. Lowering sequence varies depending on the vector-types, factor,
+// number of shuffles and ISA.
+// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
+bool X86TargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+  assert(!Shuffles.empty() && "Empty shufflevector input");
+  assert(Shuffles.size() == Indices.size() &&
+         "Unmatched number of shufflevectors and indices");
+
+  // Create an interleaved access group.
+  IRBuilder<> Builder(LI);
+  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
+                                Builder);
+
+  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
new file mode 100644
index 000000000000..df47b4ad583d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -0,0 +1,1794 @@
+//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the details for lowering X86 intrinsics
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+
+namespace llvm {
+
+enum IntrinsicType : uint16_t {
+  INTR_NO_TYPE,
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
+  INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
+  CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+  CVTPD2PS, CVTPD2PS_MASK,
+  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
+  INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
+  INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
+  FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
+  FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
+  VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+  INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
+  COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
+  TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
+  EXPAND_FROM_MEM, INSERT_SUBVEC,
+  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+};
+
+struct IntrinsicData {
+
+  uint16_t      Id;
+  IntrinsicType Type;
+  uint16_t      Opc0;
+  uint16_t      Opc1;
+
+  bool operator<(const IntrinsicData &RHS) const {
+    return Id < RHS.Id;
+  }
+  bool operator==(const IntrinsicData &RHS) const {
+    return RHS.Id == Id;
+  }
+};
+
+#define X86_INTRINSIC_DATA(id, type, op0, op1) \
+  { Intrinsic::x86_##id, type, op0, op1 }
+
+/*
+ * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in
+ * the alphabetical order.
+ */
+static const IntrinsicData IntrinsicsWithChain[] = {
+  X86_INTRINSIC_DATA(addcarry_u32,  ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarry_u64,  ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+
+  X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
+  X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),
+
+  X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
+                     X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
+                     X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
+                     X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
+                     X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
+                     COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
+                     EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
+                     X86ISD::VTRUNCUS, 0),
+
+  X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
+                     X86::VSCATTERPF1DPDm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
+                     X86::VSCATTERPF1DPSm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
+                     X86::VSCATTERPF1QPDm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
+                     X86::VSCATTERPF1QPSm),
+  X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
+  X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
+  X86_INTRINSIC_DATA(rdpmc,     RDPMC,  X86ISD::RDPMC_DAG, 0),
+  X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdtsc,     RDTSC,  X86ISD::RDTSC_DAG, 0),
+  X86_INTRINSIC_DATA(rdtscp,    RDTSC,  X86ISD::RDTSCP_DAG, 0),
+
+  X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
+  X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
+  X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
+  X86_INTRINSIC_DATA(xtest,     XTEST,  X86ISD::XTEST,  0),
+};
+
+/*
+ * Find Intrinsic data by intrinsic ID
+ */
+static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
+
+  IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 };
+  const IntrinsicData *Data =  std::lower_bound(std::begin(IntrinsicsWithChain),
+                                                std::end(IntrinsicsWithChain),
+                                                IntrinsicToFind);
+  if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind)
+    return Data;
+  return nullptr;
+}
+
+/*
+ * IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID - in
+ * the alphabetical order.
+ */
+static const IntrinsicData  IntrinsicsWithoutChain[] = {
+  X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
+  X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(avx_hsub_ps_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(avx_max_pd_256,    INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx_max_ps_256,    INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx_min_pd_256,    INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx_min_ps_256,    INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(avx_rcp_ps_256,    INTR_TYPE_1OP, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx_rsqrt_ps_256,  INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_pd,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b,  INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w,  INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+  X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+  X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+  X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+  X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+  X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d,     INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
+  X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
+  X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtsi2sd64,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtsi2ss32,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtsi2ss64,  INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtusi2ss,   INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+  X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
+
+  X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+  X86ISD::FADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
+  X86ISD::FADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FADD_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_128,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_256,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_512,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_128,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_256,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_512,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
+                     X86ISD::CMPM_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
+                     X86ISD::CMPM_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_128,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_256,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_512,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_sd,     CMP_MASK_SCALAR_CC,
+                     X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_ss,     CMP_MASK_SCALAR_CC,
+                     X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_128,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_256,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_512,  CMP_MASK_CC, X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_d_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_compress_q_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::COMPRESS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CONFLICT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
+                    X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
+                    X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps,     INTR_TYPE_1OP_MASK,
+                    X86ISD::VFPROUND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
+                     ISD::FP_ROUND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
+                     ISD::FP_ROUND, X86ISD::VFPROUND_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VFPEXT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_EXTEND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTSI2P, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
+                     ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::VFPROUNDS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::VFPEXTS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTTP2SI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTTP2UI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
+                     ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::CVTUI2P, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
+                     ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
+  X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::DBPSADBW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::DBPSADBW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::DBPSADBW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+                     X86ISD::FDIV_RND),
+  X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
+                     X86ISD::FDIV_RND),
+  X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FDIV_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FDIV_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_d_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_128,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_256,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_expand_q_512,  COMPRESS_EXPAND_IN_REG,
+                     X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FGETEXPS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FGETEXPS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::VGETMANT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
+                     X86ISD::VGETMANTS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
+                     X86ISD::VGETMANTS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
+                     ISD::INSERT_SUBVECTOR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
+                     ISD::CTLZ, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+                     X86ISD::FMAX_RND),
+  X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
+                     X86ISD::FMAX_RND),
+  X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMAX_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMAX_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+                     X86ISD::FMIN_RND),
+  X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
+                     X86ISD::FMIN_RND),
+  X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMIN_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMIN_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+                     X86ISD::FMUL_RND),
+  X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
+                     X86ISD::FMUL_RND),
+  X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMUL_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FMUL_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VBROADCAST, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
+                     X86ISD::VPERMV, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDUBSW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDUBSW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDUBSW, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDWD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDWD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
+                     X86ISD::VPMADDWD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::VTRUNCUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
+                     X86ISD::MULTISHIFT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
+                     X86ISD::MULTISHIFT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
+                     X86ISD::MULTISHIFT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_d_128,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_d_256,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_d_512,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_q_128,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_q_256,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prol_q_512,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_d_128,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_d_256,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_d_512,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_q_128,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_q_256,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pror_q_512,  INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
+  X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_sd,   INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::VRNDSCALES, 0),
+  X86_INTRINSIC_DATA(avx512_mask_rndscale_ss,   INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::VRNDSCALES, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
+                     X86ISD::SCALEF, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::SCALEFS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::SCALEFS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
+                     X86ISD::SHUF128, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+                     X86ISD::FSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
+                     X86ISD::FSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::CVTPH2PS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
+                     X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
+                     X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
+                     X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
+                     X86ISD::FNMADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
+                     X86ISD::FNMADD_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMIV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
+                    X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
+                     X86ISD::FMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
+                     X86ISD::FMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+                     X86ISD::FMSUBADD_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
+                     X86ISD::FMSUBADD_RND),
+
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
+                     X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
+                     X86ISD::VFIXUPIMM, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
+                     X86ISD::VFIXUPIMMS, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
+                     X86ISD::VFIXUPIMMS, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
+                     X86ISD::VPTERNLOG, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
+                     X86ISD::FMADD_RND),
+
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
+                     X86ISD::FMADDSUB_RND),
+
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
+                     X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+  X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+  X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
+  X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
+  X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+  X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+  X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+  X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
+  X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
+  X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_pd,        INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_pd_256,    INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_ps,        INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmadd_ps_256,    INTR_TYPE_3OP, X86ISD::FMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_pd,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_ps,     INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_pd,        INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_pd_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_ps,        INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsub_ps_256,    INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_pd,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_ps,     INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_pd,       INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_pd_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_ps,       INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmadd_ps_256,   INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_pd,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_pd_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_ps,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(fma_vfnmsub_ps_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse_comile_ss,     COMI, X86ISD::COMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse_comilt_ss,     COMI, X86ISD::COMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse_comineq_ss,    COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse_max_ps,        INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse_min_ps,        INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(sse_movmsk_ps,     INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(sse_rcp_ps,        INTR_TYPE_1OP, X86ISD::FRCP, 0),
+  X86_INTRINSIC_DATA(sse_rsqrt_ps,      INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
+  X86_INTRINSIC_DATA(sse_sqrt_ps,       INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(sse_ucomieq_ss,    COMI, X86ISD::UCOMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse_ucomige_ss,    COMI, X86ISD::UCOMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse_ucomigt_ss,    COMI, X86ISD::UCOMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse_ucomile_ss,    COMI, X86ISD::UCOMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse_ucomilt_ss,    COMI, X86ISD::UCOMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse_ucomineq_ss,   COMI, X86ISD::UCOMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse2_comieq_sd,    COMI, X86ISD::COMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse2_comige_sd,    COMI, X86ISD::COMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse2_comigt_sd,    COMI, X86ISD::COMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse2_comile_sd,    COMI, X86ISD::COMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse2_comilt_sd,    COMI, X86ISD::COMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse2_comineq_sd,   COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse2_cvtdq2ps,     INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
+  X86_INTRINSIC_DATA(sse2_cvtpd2dq,     INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
+  X86_INTRINSIC_DATA(sse2_cvtpd2ps,     INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
+  X86_INTRINSIC_DATA(sse2_cvttpd2dq,    INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
+  X86_INTRINSIC_DATA(sse2_cvttps2dq,    INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
+  X86_INTRINSIC_DATA(sse2_max_pd,       INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(sse2_movmsk_pd,    INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_b,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_padds_w,      INTR_TYPE_2OP, X86ISD::ADDS, 0),
+  X86_INTRINSIC_DATA(sse2_paddus_b,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(sse2_paddus_w,     INTR_TYPE_2OP, X86ISD::ADDUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b,       INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w,       INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pmadd_wd,     INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
+  X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+  X86_INTRINSIC_DATA(sse2_pmulh_w,      INTR_TYPE_2OP, ISD::MULHS, 0),
+  X86_INTRINSIC_DATA(sse2_pmulhu_w,     INTR_TYPE_2OP, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(sse2_pmulu_dq,     INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
+  X86_INTRINSIC_DATA(sse2_psad_bw,      INTR_TYPE_2OP, X86ISD::PSADBW, 0),
+  X86_INTRINSIC_DATA(sse2_psll_d,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_psll_q,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_psll_w,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_d,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_q,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_w,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_psra_d,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(sse2_psra_w,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(sse2_psrai_d,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(sse2_psrai_w,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_d,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_q,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_w,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_b,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, X86ISD::SUBS, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(sse2_sqrt_pd,      INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse2_ucomigt_sd,   COMI, X86ISD::UCOMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse2_ucomile_sd,   COMI, X86ISD::UCOMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse2_ucomilt_sd,   COMI, X86ISD::UCOMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse2_ucomineq_sd,  COMI, X86ISD::UCOMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse3_hadd_pd,      INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(sse3_hadd_ps,      INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(sse3_hsub_pd,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse3_hsub_ps,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse41_insertps,    INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+  X86_INTRINSIC_DATA(sse41_packusdw,    INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse41_pmuldq,      INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+  X86_INTRINSIC_DATA(sse4a_extrqi,      INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
+  X86_INTRINSIC_DATA(sse4a_insertqi,    INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_b_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_d_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_w_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
+  X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+  X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+  X86_INTRINSIC_DATA(xop_vpcomb,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+  X86_INTRINSIC_DATA(xop_vpcomd,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+  X86_INTRINSIC_DATA(xop_vpcomq,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+  X86_INTRINSIC_DATA(xop_vpcomub,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+  X86_INTRINSIC_DATA(xop_vpcomud,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+  X86_INTRINSIC_DATA(xop_vpcomuq,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+  X86_INTRINSIC_DATA(xop_vpcomuw,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
+  X86_INTRINSIC_DATA(xop_vpcomw,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2pd,     INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2ps,     INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpperm,        INTR_TYPE_3OP, X86ISD::VPPERM, 0),
+  X86_INTRINSIC_DATA(xop_vprotb,        INTR_TYPE_2OP, X86ISD::VPROT, 0),
+  X86_INTRINSIC_DATA(xop_vprotbi,       INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+  X86_INTRINSIC_DATA(xop_vprotd,        INTR_TYPE_2OP, X86ISD::VPROT, 0),
+  X86_INTRINSIC_DATA(xop_vprotdi,       INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+  X86_INTRINSIC_DATA(xop_vprotq,        INTR_TYPE_2OP, X86ISD::VPROT, 0),
+  X86_INTRINSIC_DATA(xop_vprotqi,       INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+  X86_INTRINSIC_DATA(xop_vprotw,        INTR_TYPE_2OP, X86ISD::VPROT, 0),
+  X86_INTRINSIC_DATA(xop_vprotwi,       INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+  X86_INTRINSIC_DATA(xop_vpshab,        INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshad,        INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaq,        INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshaw,        INTR_TYPE_2OP, X86ISD::VPSHA, 0),
+  X86_INTRINSIC_DATA(xop_vpshlb,        INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshld,        INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlq,        INTR_TYPE_2OP, X86ISD::VPSHL, 0),
+  X86_INTRINSIC_DATA(xop_vpshlw,        INTR_TYPE_2OP, X86ISD::VPSHL, 0)
+};
+
+/*
+ * Retrieve data for Intrinsic without chain.
+ * Return nullptr if intrinsic is not defined in the table.
+ */
+static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
+  IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+  const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
+                                               std::end(IntrinsicsWithoutChain),
+                                               IntrinsicToFind);
+  if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind)
+    return Data;
+  return nullptr;
+}
+
+static void verifyIntrinsicTables() {
+  assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
+                        std::end(IntrinsicsWithoutChain)) &&
+         std::is_sorted(std::begin(IntrinsicsWithChain),
+                        std::end(IntrinsicsWithChain)) &&
+         "Intrinsic data tables should be sorted by Intrinsic ID");
+  assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
+                             std::end(IntrinsicsWithoutChain)) ==
+          std::end(IntrinsicsWithoutChain)) &&
+         (std::adjacent_find(std::begin(IntrinsicsWithChain),
+                             std::end(IntrinsicsWithChain)) ==
+          std::end(IntrinsicsWithChain)) &&
+         "Intrinsic data tables should have unique entries");
+}
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 000000000000..2f69df064e7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,1795 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class X86MCInstLower {
+  MCContext &Ctx;
+  const MachineFunction &MF;
+  const TargetMachine &TM;
+  const MCAsmInfo &MAI;
+  X86AsmPrinter &AsmPrinter;
+public:
+  X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
+
+  Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
+                                          const MachineOperand &MO) const;
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
+// Emit a minimal sequence of nops spanning NumBytes bytes.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                     const MCSubtargetInfo &STI);
+
+void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCCodeEmitter *CodeEmitter) {
+  if (InShadow) {
+    SmallString<256> Code;
+    SmallVector<MCFixup, 4> Fixups;
+    raw_svector_ostream VecOS(Code);
+    CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
+    CurrentShadowSize += Code.size();
+    if (CurrentShadowSize >= RequiredShadowSize)
+      InShadow = false; // The shadow is big enough. Stop counting.
+  }
+}
+
+void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
+    MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
+  if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+    InShadow = false;
+    EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+             MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+  }
+}
+
+void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
+  OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+  SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
+}
+
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
+                               X86AsmPrinter &asmprinter)
+    : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
+      AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+  return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+  const DataLayout &DL = MF.getDataLayout();
+  assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+  MCSymbol *Sym = nullptr;
+  SmallString<128> Name;
+  StringRef Suffix;
+
+  switch (MO.getTargetFlags()) {
+  case X86II::MO_DLLIMPORT:
+    // Handle dllimport linkage.
+    Name += "__imp_";
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+    Suffix = "$non_lazy_ptr";
+    break;
+  }
+
+  if (!Suffix.empty())
+    Name += DL.getPrivateGlobalPrefix();
+
+  if (MO.isGlobal()) {
+    const GlobalValue *GV = MO.getGlobal();
+    AsmPrinter.getNameWithPrefix(Name, GV);
+  } else if (MO.isSymbol()) {
+    Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
+  } else if (MO.isMBB()) {
+    assert(Suffix.empty());
+    Sym = MO.getMBB()->getSymbol();
+  }
+
+  Name += Suffix;
+  if (!Sym)
+    Sym = Ctx.getOrCreateSymbol(Name);
+
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  switch (MO.getTargetFlags()) {
+  default: break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getGVStubEntry(Sym);
+    if (!StubSym.getPointer()) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    }
+    break;
+  }
+  }
+
+  return Sym;
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = nullptr;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+  // These affect the name of the symbol, not any suffix.
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+    break;
+
+  case X86II::MO_TLVP:      RefKind = MCSymbolRefExpr::VK_TLVP; break;
+  case X86II::MO_TLVP_PIC_BASE:
+    Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::createSub(Expr,
+                                  MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
+                                                           Ctx),
+                                   Ctx);
+    break;
+  case X86II::MO_SECREL:    RefKind = MCSymbolRefExpr::VK_SECREL; break;
+  case X86II::MO_TLSGD:     RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+  case X86II::MO_TLSLD:     RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+  case X86II::MO_TLSLDM:    RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+  case X86II::MO_GOTTPOFF:  RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+  case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+  case X86II::MO_TPOFF:     RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+  case X86II::MO_DTPOFF:    RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+  case X86II::MO_NTPOFF:    RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+  case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+  case X86II::MO_GOTPCREL:  RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+  case X86II::MO_GOT:       RefKind = MCSymbolRefExpr::VK_GOT; break;
+  case X86II::MO_GOTOFF:    RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+  case X86II::MO_PLT:       RefKind = MCSymbolRefExpr::VK_PLT; break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+    Expr = MCSymbolRefExpr::create(Sym, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::createSub(Expr,
+                            MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
+                                   Ctx);
+    if (MO.isJTI()) {
+      assert(MAI.doesSetDirectiveSuppressReloc());
+      // If .set directive is supported, use it to reduce the number of
+      // relocations the assembler will generate for differences between
+      // local labels. This is only safe when the symbols are in the same
+      // section so we are restricting it to jumptable references.
+      MCSymbol *Label = Ctx.createTempSymbol();
+      AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
+      Expr = MCSymbolRefExpr::create(Label, Ctx);
+    }
+    break;
+  }
+
+  if (!Expr)
+    Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+
+  if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::createExpr(Expr);
+}
+
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+  unsigned ImmOp = Inst.getNumOperands() - 1;
+  assert(Inst.getOperand(0).isReg() &&
+         (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+         ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+           Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+          Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(0).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(ImmOp);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+}
+
+/// \brief If a movsx instruction has a shorter encoding for the used register
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+  unsigned NewOpcode = 0;
+  unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+  switch (Inst.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction!");
+  case X86::MOVSX16rr8:  // movsbw %al, %ax   --> cbtw
+    if (Op0 == X86::AX && Op1 == X86::AL)
+      NewOpcode = X86::CBW;
+    break;
+  case X86::MOVSX32rr16: // movswl %ax, %eax  --> cwtl
+    if (Op0 == X86::EAX && Op1 == X86::AX)
+      NewOpcode = X86::CWDE;
+    break;
+  case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+    if (Op0 == X86::RAX && Op1 == X86::EAX)
+      NewOpcode = X86::CDQE;
+    break;
+  }
+
+  if (NewOpcode != 0) {
+    Inst = MCInst();
+    Inst.setOpcode(NewOpcode);
+  }
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+                                  unsigned Opcode) {
+  // Don't make these simplifications in 64-bit mode; other assemblers don't
+  // perform them because they make the code larger.
+  if (Printer.getSubtarget().is64Bit())
+    return;
+
+  bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+  unsigned AddrBase = IsStore;
+  unsigned RegOp = IsStore ? 0 : 5;
+  unsigned AddrOp = AddrBase + 3;
+  assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
+         Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
+         Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
+         (Inst.getOperand(AddrOp).isExpr() ||
+          Inst.getOperand(AddrOp).isImm()) &&
+         "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(RegOp).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // Check whether this is an absolute address.
+  // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+  // to do this here.
+  bool Absolute = true;
+  if (Inst.getOperand(AddrOp).isExpr()) {
+    const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+        Absolute = false;
+  }
+
+  if (Absolute &&
+      (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
+       Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
+       Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(AddrOp);
+  MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+  Inst.addOperand(Seg);
+}
+
+static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
+  return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
+}
+
+Optional<MCOperand>
+X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
+                                    const MachineOperand &MO) const {
+  switch (MO.getType()) {
+  default:
+    MI->dump();
+    llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit())
+      return None;
+    return MCOperand::createReg(MO.getReg());
+  case MachineOperand::MO_Immediate:
+    return MCOperand::createImm(MO.getImm());
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+  case MachineOperand::MO_MCSymbol:
+    return LowerSymbolOperand(MO, MO.getMCSymbol());
+  case MachineOperand::MO_JumpTableIndex:
+    return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+  case MachineOperand::MO_ConstantPoolIndex:
+    return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(
+        MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers.
+    return None;
+  }
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (const MachineOperand &MO : MI->operands())
+    if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
+      OutMI.addOperand(MaybeMCOp.getValue());
+
+  // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+  switch (OutMI.getOpcode()) {
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+  case X86::LEA16r:
+  case X86::LEA32r:
+    // LEA should have a segment register, but it must be empty.
+    assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+           "Unexpected # of LEA operands");
+    assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+           "LEA has segment specified!");
+    break;
+
+  // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+  // if one of the registers is extended, but other isn't.
+  case X86::VMOVZPQILo2PQIrr:
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr;   break;
+      case X86::VMOVAPDrr:        NewOpc = X86::VMOVAPDrr_REV;  break;
+      case X86::VMOVAPDYrr:       NewOpc = X86::VMOVAPDYrr_REV; break;
+      case X86::VMOVAPSrr:        NewOpc = X86::VMOVAPSrr_REV;  break;
+      case X86::VMOVAPSYrr:       NewOpc = X86::VMOVAPSYrr_REV; break;
+      case X86::VMOVDQArr:        NewOpc = X86::VMOVDQArr_REV;  break;
+      case X86::VMOVDQAYrr:       NewOpc = X86::VMOVDQAYrr_REV; break;
+      case X86::VMOVDQUrr:        NewOpc = X86::VMOVDQUrr_REV;  break;
+      case X86::VMOVDQUYrr:       NewOpc = X86::VMOVDQUYrr_REV; break;
+      case X86::VMOVUPDrr:        NewOpc = X86::VMOVUPDrr_REV;  break;
+      case X86::VMOVUPDYrr:       NewOpc = X86::VMOVUPDYrr_REV; break;
+      case X86::VMOVUPSrr:        NewOpc = X86::VMOVUPSrr_REV;  break;
+      case X86::VMOVUPSYrr:       NewOpc = X86::VMOVUPSYrr_REV; break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVSDrr:   NewOpc = X86::VMOVSDrr_REV;   break;
+      case X86::VMOVSSrr:   NewOpc = X86::VMOVSSrr_REV;   break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+
+  // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+  // inputs modeled as normal uses instead of implicit uses.  As such, truncate
+  // off all but the first operand (the callee).  FIXME: Change isel.
+  case X86::TAILJMPr64:
+  case X86::TAILJMPr64_REX:
+  case X86::CALL64r:
+  case X86::CALL64pcrel32: {
+    unsigned Opcode = OutMI.getOpcode();
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    OutMI = MCInst();
+    OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+    break;
+  }
+
+  case X86::CLEANUPRET: {
+    // Replace CATCHRET with the appropriate RET.
+    OutMI = MCInst();
+    OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
+    break;
+  }
+
+  case X86::CATCHRET: {
+    // Replace CATCHRET with the appropriate RET.
+    const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
+    unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+    OutMI = MCInst();
+    OutMI.setOpcode(getRetOpcode(Subtarget));
+    OutMI.addOperand(MCOperand::createReg(ReturnReg));
+    break;
+  }
+
+  // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
+  { unsigned Opcode;
+  case X86::TAILJMPr:   Opcode = X86::JMP32r; goto SetTailJmpOpcode;
+  case X86::TAILJMPd:
+  case X86::TAILJMPd64: Opcode = X86::JMP_1;  goto SetTailJmpOpcode;
+  case X86::TAILJMPd_CC:
+  case X86::TAILJMPd64_CC:
+    Opcode = X86::GetCondBranchFromCond(
+        static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+    goto SetTailJmpOpcode;
+
+  SetTailJmpOpcode:
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::INC16r:
+  case X86::INC32r:
+    // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
+    if (!AsmPrinter.getSubtarget().is64Bit()) {
+      unsigned Opcode;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
+      case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
+      case X86::INC16r: Opcode = X86::INC16r_alt; break;
+      case X86::INC32r: Opcode = X86::INC32r_alt; break;
+      }
+      OutMI.setOpcode(Opcode);
+    }
+    break;
+
+  // These are pseudo-ops for OR to help with the OR->ADD transformation.  We do
+  // this with an ugly goto in case the resultant OR uses EAX and needs the
+  // short form.
+  case X86::ADD16rr_DB:   OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+  case X86::ADD32rr_DB:   OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+  case X86::ADD64rr_DB:   OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+  case X86::ADD16ri_DB:   OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+  case X86::ADD32ri_DB:   OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+  case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+  case X86::ADD16ri8_DB:  OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+  case X86::ADD32ri8_DB:  OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+  case X86::ADD64ri8_DB:  OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+  // Atomic load and store require a separate pseudo-inst because Acquire
+  // implies mayStore and Release implies mayLoad; fix these to regular MOV
+  // instructions here
+  case X86::ACQUIRE_MOV8rm:    OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV16rm:   OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV32rm:   OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV64rm:   OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+  case X86::RELEASE_MOV8mr:    OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+  case X86::RELEASE_MOV16mr:   OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+  case X86::RELEASE_MOV32mr:   OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+  case X86::RELEASE_MOV64mr:   OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+  case X86::RELEASE_MOV8mi:    OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+  case X86::RELEASE_MOV16mi:   OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+  case X86::RELEASE_MOV32mi:   OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+  case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
+  case X86::RELEASE_ADD8mi:    OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+  case X86::RELEASE_ADD8mr:    OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
+  case X86::RELEASE_ADD32mi:   OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+  case X86::RELEASE_ADD32mr:   OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
+  case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+  case X86::RELEASE_ADD64mr:   OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
+  case X86::RELEASE_AND8mi:    OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+  case X86::RELEASE_AND8mr:    OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
+  case X86::RELEASE_AND32mi:   OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+  case X86::RELEASE_AND32mr:   OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
+  case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+  case X86::RELEASE_AND64mr:   OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
+  case X86::RELEASE_OR8mi:     OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+  case X86::RELEASE_OR8mr:     OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
+  case X86::RELEASE_OR32mi:    OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+  case X86::RELEASE_OR32mr:    OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
+  case X86::RELEASE_OR64mi32:  OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+  case X86::RELEASE_OR64mr:    OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
+  case X86::RELEASE_XOR8mi:    OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+  case X86::RELEASE_XOR8mr:    OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
+  case X86::RELEASE_XOR32mi:   OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+  case X86::RELEASE_XOR32mr:   OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
+  case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+  case X86::RELEASE_XOR64mr:   OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
+  case X86::RELEASE_INC8m:     OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+  case X86::RELEASE_INC16m:    OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+  case X86::RELEASE_INC32m:    OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+  case X86::RELEASE_INC64m:    OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+  case X86::RELEASE_DEC8m:     OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+  case X86::RELEASE_DEC16m:    OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+  case X86::RELEASE_DEC32m:    OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+  case X86::RELEASE_DEC64m:    OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
+
+  // We don't currently select the correct instruction form for instructions
+  // which have a short %eax, etc. form. Handle this by custom lowering, for
+  // now.
+  //
+  // Note, we are currently not handling the following instructions:
+  // MOV64ao8, MOV64o8a
+  // XCHG16ar, XCHG32ar, XCHG64ar
+  case X86::MOV8mr_NOREX:
+  case X86::MOV8mr:
+  case X86::MOV8rm_NOREX:
+  case X86::MOV8rm:
+  case X86::MOV16mr:
+  case X86::MOV16rm:
+  case X86::MOV32mr:
+  case X86::MOV32rm: {
+    unsigned NewOpc;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::MOV8mr_NOREX:
+    case X86::MOV8mr:     NewOpc = X86::MOV8o32a; break;
+    case X86::MOV8rm_NOREX:
+    case X86::MOV8rm:     NewOpc = X86::MOV8ao32; break;
+    case X86::MOV16mr:    NewOpc = X86::MOV16o32a; break;
+    case X86::MOV16rm:    NewOpc = X86::MOV16ao32; break;
+    case X86::MOV32mr:    NewOpc = X86::MOV32o32a; break;
+    case X86::MOV32rm:    NewOpc = X86::MOV32ao32; break;
+    }
+    SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
+    break;
+  }
+
+  case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
+  case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
+  case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
+  case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
+  case X86::OR8ri:  case X86::OR16ri:  case X86::OR32ri:  case X86::OR64ri32:
+  case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
+  case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
+  case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
+  case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
+    unsigned NewOpc;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::ADC8ri:     NewOpc = X86::ADC8i8;    break;
+    case X86::ADC16ri:    NewOpc = X86::ADC16i16;  break;
+    case X86::ADC32ri:    NewOpc = X86::ADC32i32;  break;
+    case X86::ADC64ri32:  NewOpc = X86::ADC64i32;  break;
+    case X86::ADD8ri:     NewOpc = X86::ADD8i8;    break;
+    case X86::ADD16ri:    NewOpc = X86::ADD16i16;  break;
+    case X86::ADD32ri:    NewOpc = X86::ADD32i32;  break;
+    case X86::ADD64ri32:  NewOpc = X86::ADD64i32;  break;
+    case X86::AND8ri:     NewOpc = X86::AND8i8;    break;
+    case X86::AND16ri:    NewOpc = X86::AND16i16;  break;
+    case X86::AND32ri:    NewOpc = X86::AND32i32;  break;
+    case X86::AND64ri32:  NewOpc = X86::AND64i32;  break;
+    case X86::CMP8ri:     NewOpc = X86::CMP8i8;    break;
+    case X86::CMP16ri:    NewOpc = X86::CMP16i16;  break;
+    case X86::CMP32ri:    NewOpc = X86::CMP32i32;  break;
+    case X86::CMP64ri32:  NewOpc = X86::CMP64i32;  break;
+    case X86::OR8ri:      NewOpc = X86::OR8i8;     break;
+    case X86::OR16ri:     NewOpc = X86::OR16i16;   break;
+    case X86::OR32ri:     NewOpc = X86::OR32i32;   break;
+    case X86::OR64ri32:   NewOpc = X86::OR64i32;   break;
+    case X86::SBB8ri:     NewOpc = X86::SBB8i8;    break;
+    case X86::SBB16ri:    NewOpc = X86::SBB16i16;  break;
+    case X86::SBB32ri:    NewOpc = X86::SBB32i32;  break;
+    case X86::SBB64ri32:  NewOpc = X86::SBB64i32;  break;
+    case X86::SUB8ri:     NewOpc = X86::SUB8i8;    break;
+    case X86::SUB16ri:    NewOpc = X86::SUB16i16;  break;
+    case X86::SUB32ri:    NewOpc = X86::SUB32i32;  break;
+    case X86::SUB64ri32:  NewOpc = X86::SUB64i32;  break;
+    case X86::TEST8ri:    NewOpc = X86::TEST8i8;   break;
+    case X86::TEST16ri:   NewOpc = X86::TEST16i16; break;
+    case X86::TEST32ri:   NewOpc = X86::TEST32i32; break;
+    case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
+    case X86::XOR8ri:     NewOpc = X86::XOR8i8;    break;
+    case X86::XOR16ri:    NewOpc = X86::XOR16i16;  break;
+    case X86::XOR32ri:    NewOpc = X86::XOR32i32;  break;
+    case X86::XOR64ri32:  NewOpc = X86::XOR64i32;  break;
+    }
+    SimplifyShortImmForm(OutMI, NewOpc);
+    break;
+  }
+
+  // Try to shrink some forms of movsx.
+  case X86::MOVSX16rr8:
+  case X86::MOVSX32rr16:
+  case X86::MOVSX64rr32:
+    SimplifyMOVSX(OutMI);
+    break;
+  }
+}
+
+void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
+                                 const MachineInstr &MI) {
+
+  bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+                  MI.getOpcode() == X86::TLS_base_addr64;
+
+  bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
+  MCContext &context = OutStreamer->getContext();
+
+  if (needsPadding)
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+
+  MCSymbolRefExpr::VariantKind SRVK;
+  switch (MI.getOpcode()) {
+    case X86::TLS_addr32:
+    case X86::TLS_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSGD;
+      break;
+    case X86::TLS_base_addr32:
+      SRVK = MCSymbolRefExpr::VK_TLSLDM;
+      break;
+    case X86::TLS_base_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSLD;
+      break;
+    default:
+      llvm_unreachable("unexpected opcode");
+  }
+
+  MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+  const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
+
+  MCInst LEA;
+  if (is64Bits) {
+    LEA.setOpcode(X86::LEA64r);
+    LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
+    LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
+    LEA.addOperand(MCOperand::createImm(1));        // scale
+    LEA.addOperand(MCOperand::createReg(0));        // index
+    LEA.addOperand(MCOperand::createExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::createReg(0));        // seg
+  } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
+    LEA.addOperand(MCOperand::createImm(1));        // scale
+    LEA.addOperand(MCOperand::createReg(0));        // index
+    LEA.addOperand(MCOperand::createExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::createReg(0));        // seg
+  } else {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::createReg(0));        // base
+    LEA.addOperand(MCOperand::createImm(1));        // scale
+    LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
+    LEA.addOperand(MCOperand::createExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::createReg(0));        // seg
+  }
+  EmitAndCountInstruction(LEA);
+
+  if (needsPadding) {
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
+  }
+
+  StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+  MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
+  const MCSymbolRefExpr *tlsRef =
+    MCSymbolRefExpr::create(tlsGetAddr,
+                            MCSymbolRefExpr::VK_PLT,
+                            context);
+
+  EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+                                                 : X86::CALLpcrel32)
+                            .addExpr(tlsRef));
+}
+
+/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
+/// bytes.  Return the size of nop emitted.
+static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                        const MCSubtargetInfo &STI) {
+  // This works only for 64bit. For 32bit we have to do additional checking if
+  // the CPU supports multi-byte nops.
+  assert(Is64Bit && "EmitNops only supports X86-64");
+
+  unsigned NopSize;
+  unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
+  Opc = IndexReg = Displacement = SegmentReg = 0;
+  BaseReg = X86::RAX;
+  ScaleVal = 1;
+  switch (NumBytes) {
+  case  0: llvm_unreachable("Zero nops?"); break;
+  case  1: NopSize = 1; Opc = X86::NOOP; break;
+  case  2: NopSize = 2; Opc = X86::XCHG16ar; break;
+  case  3: NopSize = 3; Opc = X86::NOOPL; break;
+  case  4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
+  case  5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
+           IndexReg = X86::RAX; break;
+  case  6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
+           IndexReg = X86::RAX; break;
+  case  7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
+  case  8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
+           IndexReg = X86::RAX; break;
+  case  9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
+           IndexReg = X86::RAX; break;
+  default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
+           IndexReg = X86::RAX; SegmentReg = X86::CS; break;
+  }
+
+  unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
+  NopSize += NumPrefixes;
+  for (unsigned i = 0; i != NumPrefixes; ++i)
+    OS.EmitBytes("\x66");
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+    break;
+  case X86::NOOP:
+    OS.EmitInstruction(MCInstBuilder(Opc), STI);
+    break;
+  case X86::XCHG16ar:
+    OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
+    break;
+  case X86::NOOPL:
+  case X86::NOOPW:
+    OS.EmitInstruction(MCInstBuilder(Opc)
+                           .addReg(BaseReg)
+                           .addImm(ScaleVal)
+                           .addReg(IndexReg)
+                           .addImm(Displacement)
+                           .addReg(SegmentReg),
+                       STI);
+    break;
+  }
+  assert(NopSize <= NumBytes && "We overemitted?");
+  return NopSize;
+}
+
+/// \brief Emit the optimal amount of multi-byte nops on X86.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                     const MCSubtargetInfo &STI) {
+  unsigned NopsToEmit = NumBytes;
+  (void)NopsToEmit;
+  while (NumBytes) {
+    NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
+    assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
+  }
+}
+
+void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
+                                    X86MCInstLower &MCIL) {
+  assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
+
+  StatepointOpers SOpers(&MI);
+  if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
+    EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
+             getSubtargetInfo());
+  } else {
+    // Lower call target and choose correct opcode
+    const MachineOperand &CallTarget = SOpers.getCallTarget();
+    MCOperand CallTargetMCOp;
+    unsigned CallOpcode;
+    switch (CallTarget.getType()) {
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      CallTargetMCOp = MCIL.LowerSymbolOperand(
+          CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
+      CallOpcode = X86::CALL64pcrel32;
+      // Currently, we only support relative addressing with statepoints.
+      // Otherwise, we'll need a scratch register to hold the target
+      // address.  You'll fail asserts during load & relocation if this
+      // symbol is to far away. (TODO: support non-relative addressing)
+      break;
+    case MachineOperand::MO_Immediate:
+      CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
+      CallOpcode = X86::CALL64pcrel32;
+      // Currently, we only support relative addressing with statepoints.
+      // Otherwise, we'll need a scratch register to hold the target
+      // immediate.  You'll fail asserts during load & relocation if this
+      // address is to far away. (TODO: support non-relative addressing)
+      break;
+    case MachineOperand::MO_Register:
+      CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
+      CallOpcode = X86::CALL64r;
+      break;
+    default:
+      llvm_unreachable("Unsupported operand type in statepoint call target");
+      break;
+    }
+
+    // Emit call
+    MCInst CallInst;
+    CallInst.setOpcode(CallOpcode);
+    CallInst.addOperand(CallTargetMCOp);
+    OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
+  }
+
+  // Record our statepoint node in the same section used by STACKMAP
+  // and PATCHPOINT
+  SM.recordStatepoint(MI);
+}
+
+void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
+                                       X86MCInstLower &MCIL) {
+  // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
+
+  unsigned LoadDefRegister = MI.getOperand(0).getReg();
+  MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
+  unsigned LoadOpcode = MI.getOperand(2).getImm();
+  unsigned LoadOperandsBeginIdx = 3;
+
+  FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
+
+  MCInst LoadMI;
+  LoadMI.setOpcode(LoadOpcode);
+
+  if (LoadDefRegister != X86::NoRegister)
+    LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+
+  for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
+            E = MI.operands_end();
+       I != E; ++I)
+    if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
+      LoadMI.addOperand(MaybeOperand.getValue());
+
+  OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
+}
+
+void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
+                                      X86MCInstLower &MCIL) {
+  // PATCHABLE_OP minsize, opcode, operands
+
+  unsigned MinSize = MI.getOperand(0).getImm();
+  unsigned Opcode = MI.getOperand(1).getImm();
+
+  MCInst MCI;
+  MCI.setOpcode(Opcode);
+  for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
+    if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+      MCI.addOperand(MaybeOperand.getValue());
+
+  SmallString<256> Code;
+  SmallVector<MCFixup, 4> Fixups;
+  raw_svector_ostream VecOS(Code);
+  CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
+
+  if (Code.size() < MinSize) {
+    if (MinSize == 2 && Opcode == X86::PUSH64r) {
+      // This is an optimization that lets us get away without emitting a nop in
+      // many cases.
+      //
+      // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
+      // bytes too, so the check on MinSize is important.
+      MCI.setOpcode(X86::PUSH64rmr);
+    } else {
+      unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
+                                 getSubtargetInfo());
+      assert(NopSize == MinSize && "Could not implement MinSize!");
+      (void) NopSize;
+    }
+  }
+
+  OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+  SM.recordStackMap(MI);
+  unsigned NumShadowBytes = MI.getOperand(1).getImm();
+  SMShadowTracker.reset(NumShadowBytes);
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
+                                    X86MCInstLower &MCIL) {
+  assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
+
+  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers opers(&MI);
+  unsigned ScratchIdx = opers.getNextScratchIdx();
+  unsigned EncodedBytes = 0;
+  const MachineOperand &CalleeMO = opers.getCallTarget();
+
+  // Check for null target. If target is non-null (i.e. is non-zero or is
+  // symbolic) then emit a call.
+  if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
+    MCOperand CalleeMCOp;
+    switch (CalleeMO.getType()) {
+    default:
+      /// FIXME: Add a verifier check for bad callee types.
+      llvm_unreachable("Unrecognized callee operand type.");
+    case MachineOperand::MO_Immediate:
+      if (CalleeMO.getImm())
+        CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+    case MachineOperand::MO_GlobalAddress:
+      CalleeMCOp =
+        MCIL.LowerSymbolOperand(CalleeMO,
+                                MCIL.GetSymbolFromOperand(CalleeMO));
+      break;
+    }
+
+    // Emit MOV to materialize the target address and the CALL to target.
+    // This is encoded with 12-13 bytes, depending on which register is used.
+    unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+    if (X86II::isX86_64ExtendedReg(ScratchReg))
+      EncodedBytes = 13;
+    else
+      EncodedBytes = 12;
+
+    EmitAndCountInstruction(
+        MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
+    EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
+  }
+
+  // Emit padding.
+  unsigned NumBytes = opers.getNumPatchBytes();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+
+  EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
+           getSubtargetInfo());
+}
+
+void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
+                                                  X86MCInstLower &MCIL) {
+  // We want to emit the following pattern:
+  //
+  //   .p2align 1, ...
+  // .Lxray_sled_N:
+  //   jmp .tmpN
+  //   # 9 bytes worth of noops
+  // .tmpN
+  //
+  // We need the 9 bytes because at runtime, we'd be patching over the full 11
+  // bytes with the following pattern:
+  //
+  //   mov %r10, <function id, 32-bit>   // 6 bytes
+  //   call <relative offset, 32-bits>   // 5 bytes
+  //
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitCodeAlignment(2);
+  OutStreamer->EmitLabel(CurSled);
+  auto Target = OutContext.createTempSymbol();
+
+  // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+  // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way do force the relative jump.
+  OutStreamer->EmitBytes("\xeb\x09");
+  EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+  OutStreamer->EmitLabel(Target);
+  recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
+                                       X86MCInstLower &MCIL) {
+  // Since PATCHABLE_RET takes the opcode of the return statement as an
+  // argument, we use that to emit the correct form of the RET that we want.
+  // i.e. when we see this:
+  //
+  //   PATCHABLE_RET X86::RET ...
+  //
+  // We should emit the RET followed by sleds.
+  //
+  //   .p2align 1, ...
+  // .Lxray_sled_N:
+  //   ret  # or equivalent instruction
+  //   # 10 bytes worth of noops
+  //
+  // This just makes sure that the alignment for the next instruction is 2.
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitCodeAlignment(2);
+  OutStreamer->EmitLabel(CurSled);
+  unsigned OpCode = MI.getOperand(0).getImm();
+  MCInst Ret;
+  Ret.setOpcode(OpCode);
+  for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+    if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+      Ret.addOperand(MaybeOperand.getValue());
+  OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
+  EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
+  recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
+}
+
+void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
+  // Like PATCHABLE_RET, we have the actual instruction in the operands to this
+  // instruction so we lower that particular instruction and its operands.
+  // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
+  // we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
+  // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
+  // tail call much like how we have it in PATCHABLE_RET.
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitCodeAlignment(2);
+  OutStreamer->EmitLabel(CurSled);
+  auto Target = OutContext.createTempSymbol();
+
+  // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+  // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way do force the relative jump.
+  OutStreamer->EmitBytes("\xeb\x09");
+  EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
+  OutStreamer->EmitLabel(Target);
+  recordSled(CurSled, MI, SledKind::TAIL_CALL);
+
+  unsigned OpCode = MI.getOperand(0).getImm();
+  MCInst TC;
+  TC.setOpcode(OpCode);
+
+  // Before emitting the instruction, add a comment to indicate that this is
+  // indeed a tail call.
+  OutStreamer->AddComment("TAILCALL");
+  for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
+    if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
+      TC.addOperand(MaybeOperand.getValue());
+  OutStreamer->EmitInstruction(TC, getSubtargetInfo());
+}
+
+void X86AsmPrinter::EmitXRayTable() {
+  if (Sleds.empty())
+    return;
+  
+  auto PrevSection = OutStreamer->getCurrentSectionOnly();
+  auto Fn = MF->getFunction();
+  MCSection *Section = nullptr;
+  if (Subtarget->isTargetELF()) {
+    if (Fn->hasComdat()) {
+      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                         ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+                                         Fn->getComdat()->getName());
+    } else {
+      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                         ELF::SHF_ALLOC);
+    }
+  } else if (Subtarget->isTargetMachO()) {
+    Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+                                         SectionKind::getReadOnlyWithRel());
+  } else {
+    llvm_unreachable("Unsupported target");
+  }
+
+  // Before we switch over, we force a reference to a label inside the
+  // xray_instr_map section. Since EmitXRayTable() is always called just
+  // before the function's end, we assume that this is happening after the
+  // last return instruction.
+  //
+  // We then align the reference to 16 byte boundaries, which we determined
+  // experimentally to be beneficial to avoid causing decoder stalls.
+  MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+  OutStreamer->EmitCodeAlignment(16);
+  OutStreamer->EmitSymbolValue(Tmp, 8, false);
+  OutStreamer->SwitchSection(Section);
+  OutStreamer->EmitLabel(Tmp);
+  for (const auto &Sled : Sleds) {
+    OutStreamer->EmitSymbolValue(Sled.Sled, 8);
+    OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
+    auto Kind = static_cast<uint8_t>(Sled.Kind);
+    OutStreamer->EmitBytes(
+        StringRef(reinterpret_cast<const char *>(&Kind), 1));
+    OutStreamer->EmitBytes(
+        StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+    OutStreamer->EmitZeros(14);
+  }
+  OutStreamer->SwitchSection(PrevSection);
+
+  Sleds.clear();
+}
+
+// Returns instruction preceding MBBI in MachineFunction.
+// If MBBI is the first instruction of the first basic block, returns null.
+static MachineBasicBlock::const_iterator
+PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
+  const MachineBasicBlock *MBB = MBBI->getParent();
+  while (MBBI == MBB->begin()) {
+    if (MBB == &MBB->getParent()->front())
+      return MachineBasicBlock::const_iterator();
+    MBB = MBB->getPrevNode();
+    MBBI = MBB->end();
+  }
+  return --MBBI;
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+                                           const MachineOperand &Op) {
+  if (!Op.isCPI())
+    return nullptr;
+
+  ArrayRef<MachineConstantPoolEntry> Constants =
+      MI.getParent()->getParent()->getConstantPool()->getConstants();
+  const MachineConstantPoolEntry &ConstantEntry =
+      Constants[Op.getIndex()];
+
+  // Bail if this is a machine constant pool entry, we won't be able to dig out
+  // anything useful.
+  if (ConstantEntry.isMachineConstantPoolEntry())
+    return nullptr;
+
+  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+  assert((!C || ConstantEntry.getType() == C->getType()) &&
+         "Expected a constant of the same type!");
+  return C;
+}
+
+static std::string getShuffleComment(const MachineInstr *MI,
+                                     unsigned SrcOp1Idx,
+                                     unsigned SrcOp2Idx,
+                                     ArrayRef<int> Mask) {
+  std::string Comment;
+
+  // Compute the name for a register. This is really goofy because we have
+  // multiple instruction printers that could (in theory) use different
+  // names. Fortunately most people use the ATT style (outside of Windows)
+  // and they actually agree on register naming here. Ultimately, this is
+  // a comment, and so its OK if it isn't perfect.
+  auto GetRegisterName = [](unsigned RegNum) -> StringRef {
+    return X86ATTInstPrinter::getRegisterName(RegNum);
+  };
+
+  const MachineOperand &DstOp = MI->getOperand(0);
+  const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
+  const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
+
+  StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
+  StringRef Src1Name =
+      SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
+  StringRef Src2Name =
+      SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
+
+  // One source operand, fix the mask to print all elements in one span.
+  SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
+  if (Src1Name == Src2Name)
+    for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
+      if (ShuffleMask[i] >= e)
+        ShuffleMask[i] -= e;
+
+  raw_string_ostream CS(Comment);
+  CS << DstName;
+
+  // Handle AVX512 MASK/MASXZ write mask comments.
+  // MASK: zmmX {%kY}
+  // MASKZ: zmmX {%kY} {z}
+  if (SrcOp1Idx > 1) {
+    assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
+
+    const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
+    if (WriteMaskOp.isReg()) {
+      CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
+
+      if (SrcOp1Idx == 2) {
+        CS << " {z}";
+      }
+    }
+  }
+
+  CS << " = ";
+
+  for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
+    if (i != 0)
+      CS << ",";
+    if (ShuffleMask[i] == SM_SentinelZero) {
+      CS << "zero";
+      continue;
+    }
+
+    // Otherwise, it must come from src1 or src2.  Print the span of elements
+    // that comes from this src.
+    bool isSrc1 = ShuffleMask[i] < (int)e;
+    CS << (isSrc1 ? Src1Name : Src2Name) << '[';
+
+    bool IsFirst = true;
+    while (i != e && ShuffleMask[i] != SM_SentinelZero &&
+           (ShuffleMask[i] < (int)e) == isSrc1) {
+      if (!IsFirst)
+        CS << ',';
+      else
+        IsFirst = false;
+      if (ShuffleMask[i] == SM_SentinelUndef)
+        CS << "u";
+      else
+        CS << ShuffleMask[i] % (int)e;
+      ++i;
+    }
+    CS << ']';
+    --i; // For loop increments element #.
+  }
+  CS.flush();
+
+  return Comment;
+}
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  X86MCInstLower MCInstLowering(*MF, *this);
+  const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+  // are compressed from EVEX encoding to VEX encoding.
+  if (TM.Options.MCOptions.ShowMCEncoding) {
+    if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
+      OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+  }
+
+  switch (MI->getOpcode()) {
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+
+  // Emit nothing here but a comment if we can.
+  case X86::Int_MemBarrier:
+    OutStreamer->emitRawComment("MEMBARRIER");
+    return;
+
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    // Lower these as normal, but add some comments.
+    unsigned Reg = MI->getOperand(0).getReg();
+    OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+                            X86ATTInstPrinter::getRegisterName(Reg));
+    break;
+  }
+  case X86::CLEANUPRET: {
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("CLEANUPRET");
+    break;
+  }
+
+  case X86::CATCHRET: {
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("CATCHRET");
+    break;
+  }
+
+  case X86::TAILJMPr:
+  case X86::TAILJMPm:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd_CC:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+  case X86::TAILJMPd64:
+  case X86::TAILJMPd64_CC:
+  case X86::TAILJMPr64_REX:
+  case X86::TAILJMPm64_REX:
+    // Lower these as normal, but add some comments.
+    OutStreamer->AddComment("TAILCALL");
+    break;
+
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return LowerTlsAddr(MCInstLowering, *MI);
+
+  case X86::MOVPC32r: {
+    // This is a pseudo op for a two instruction sequence with a label, which
+    // looks like:
+    //     call "L1$pb"
+    // "L1$pb":
+    //     popl %esi
+
+    // Emit the call.
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    // FIXME: We would like an efficient form for this, so we don't have to do a
+    // lot of extra uniquing.
+    EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
+      .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+    const X86FrameLowering* FrameLowering =
+        MF->getSubtarget<X86Subtarget>().getFrameLowering();
+    bool hasFP = FrameLowering->hasFP(*MF);
+    
+    // TODO: This is needed only if we require precise CFA.
+    bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+                               !OutStreamer->getDwarfFrameInfos().back().End;
+
+    int stackGrowth = -RI->getSlotSize();
+
+    if (HasActiveDwarfFrame && !hasFP) {
+      OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
+    }
+
+    // Emit the label.
+    OutStreamer->EmitLabel(PICBase);
+
+    // popl $reg
+    EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
+                            .addReg(MI->getOperand(0).getReg()));
+
+    if (HasActiveDwarfFrame && !hasFP) {
+      OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
+    }
+    return;
+  }
+
+  case X86::ADD32ri: {
+    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+      break;
+
+    // Okay, we have something like:
+    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+    // For this, we want to print something like:
+    //   MYGLOBAL + (. - PICBASE)
+    // However, we can't generate a ".", so just emit a new label here and refer
+    // to it.
+    MCSymbol *DotSym = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(DotSym);
+
+    // Now that we have emitted the label, lower the complex operand expression.
+    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+    const MCExpr *PICBase =
+      MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+    DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
+                                      DotExpr, OutContext);
+
+    EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addExpr(DotExpr));
+    return;
+  }
+  case TargetOpcode::STATEPOINT:
+    return LowerSTATEPOINT(*MI, MCInstLowering);
+
+  case TargetOpcode::FAULTING_LOAD_OP:
+    return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_OP:
+    return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(*MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+    return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_RET:
+    return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+  case TargetOpcode::PATCHABLE_TAIL_CALL:
+    return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+  case X86::MORESTACK_RET:
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+    return;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    // Return, then restore R10.
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+    EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+                            .addReg(X86::R10)
+                            .addReg(X86::RAX));
+    return;
+
+  case X86::SEH_PushReg:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
+    return;
+
+  case X86::SEH_SaveReg:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                   MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_SaveXMM:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                   MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_StackAlloc:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+    return;
+
+  case X86::SEH_SetFrame:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+                                    MI->getOperand(1).getImm());
+    return;
+
+  case X86::SEH_PushFrame:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+    return;
+
+  case X86::SEH_EndPrologue:
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    OutStreamer->EmitWinCFIEndProlog();
+    return;
+
+  case X86::SEH_Epilogue: {
+    assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+    MachineBasicBlock::const_iterator MBBI(MI);
+    // Check if preceded by a call and emit nop if so.
+    for (MBBI = PrevCrossBBInst(MBBI);
+         MBBI != MachineBasicBlock::const_iterator();
+         MBBI = PrevCrossBBInst(MBBI)) {
+      // Conservatively assume that pseudo instructions don't emit code and keep
+      // looking for a call. We may emit an unnecessary nop in some cases.
+      if (!MBBI->isPseudo()) {
+        if (MBBI->isCall())
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+        break;
+      }
+    }
+    return;
+  }
+
+  // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+  // a constant shuffle mask. We won't be able to do this at the MC layer
+  // because the mask isn't an immediate.
+  case X86::PSHUFBrm:
+  case X86::VPSHUFBrm:
+  case X86::VPSHUFBYrm:
+  case X86::VPSHUFBZ128rm:
+  case X86::VPSHUFBZ128rmk:
+  case X86::VPSHUFBZ128rmkz:
+  case X86::VPSHUFBZ256rm:
+  case X86::VPSHUFBZ256rmk:
+  case X86::VPSHUFBZ256rmkz:
+  case X86::VPSHUFBZrm:
+  case X86::VPSHUFBZrmk:
+  case X86::VPSHUFBZrmkz: {
+    if (!OutStreamer->isVerboseAsm())
+      break;
+    unsigned SrcIdx, MaskIdx;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::PSHUFBrm:
+    case X86::VPSHUFBrm:
+    case X86::VPSHUFBYrm:
+    case X86::VPSHUFBZ128rm:
+    case X86::VPSHUFBZ256rm:
+    case X86::VPSHUFBZrm:
+      SrcIdx = 1; MaskIdx = 5; break;
+    case X86::VPSHUFBZ128rmkz:
+    case X86::VPSHUFBZ256rmkz:
+    case X86::VPSHUFBZrmkz:
+      SrcIdx = 2; MaskIdx = 6; break;
+    case X86::VPSHUFBZ128rmk:
+    case X86::VPSHUFBZ256rmk:
+    case X86::VPSHUFBZrmk:
+      SrcIdx = 3; MaskIdx = 7; break;
+    }
+
+    assert(MI->getNumOperands() >= 6 &&
+           "We should always have at least 6 operands!");
+
+    const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 64> Mask;
+      DecodePSHUFBMask(C, Mask);
+      if (!Mask.empty())
+        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+    }
+    break;
+  }
+
+  case X86::VPERMILPSrm:
+  case X86::VPERMILPSYrm:
+  case X86::VPERMILPSZ128rm:
+  case X86::VPERMILPSZ128rmk:
+  case X86::VPERMILPSZ128rmkz:
+  case X86::VPERMILPSZ256rm:
+  case X86::VPERMILPSZ256rmk:
+  case X86::VPERMILPSZ256rmkz:
+  case X86::VPERMILPSZrm:
+  case X86::VPERMILPSZrmk:
+  case X86::VPERMILPSZrmkz:
+  case X86::VPERMILPDrm:
+  case X86::VPERMILPDYrm:
+  case X86::VPERMILPDZ128rm:
+  case X86::VPERMILPDZ128rmk:
+  case X86::VPERMILPDZ128rmkz:
+  case X86::VPERMILPDZ256rm:
+  case X86::VPERMILPDZ256rmk:
+  case X86::VPERMILPDZ256rmkz:
+  case X86::VPERMILPDZrm:
+  case X86::VPERMILPDZrmk:
+  case X86::VPERMILPDZrmkz: {
+    if (!OutStreamer->isVerboseAsm())
+      break;
+    unsigned SrcIdx, MaskIdx;
+    unsigned ElSize;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VPERMILPSrm:
+    case X86::VPERMILPSYrm:
+    case X86::VPERMILPSZ128rm:
+    case X86::VPERMILPSZ256rm:
+    case X86::VPERMILPSZrm:
+      SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
+    case X86::VPERMILPSZ128rmkz:
+    case X86::VPERMILPSZ256rmkz:
+    case X86::VPERMILPSZrmkz:
+      SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
+    case X86::VPERMILPSZ128rmk:
+    case X86::VPERMILPSZ256rmk:
+    case X86::VPERMILPSZrmk:
+      SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+    case X86::VPERMILPDrm:
+    case X86::VPERMILPDYrm:
+    case X86::VPERMILPDZ128rm:
+    case X86::VPERMILPDZ256rm:
+    case X86::VPERMILPDZrm:
+      SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
+    case X86::VPERMILPDZ128rmkz:
+    case X86::VPERMILPDZ256rmkz:
+    case X86::VPERMILPDZrmkz:
+      SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
+    case X86::VPERMILPDZ128rmk:
+    case X86::VPERMILPDZ256rmk:
+    case X86::VPERMILPDZrmk:
+      SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
+    }
+
+    assert(MI->getNumOperands() >= 6 &&
+           "We should always have at least 6 operands!");
+
+    const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 16> Mask;
+      DecodeVPERMILPMask(C, ElSize, Mask);
+      if (!Mask.empty())
+        OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+    }
+    break;
+  }
+
+  case X86::VPERMIL2PDrm:
+  case X86::VPERMIL2PSrm:
+  case X86::VPERMIL2PDrmY:
+  case X86::VPERMIL2PSrmY: {
+    if (!OutStreamer->isVerboseAsm())
+      break;
+    assert(MI->getNumOperands() >= 8 &&
+           "We should always have at least 8 operands!");
+
+    const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
+    if (!CtrlOp.isImm())
+      break;
+
+    unsigned ElSize;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
+    case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+    }
+
+    const MachineOperand &MaskOp = MI->getOperand(6);
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 16> Mask;
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+      if (!Mask.empty())
+        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+    }
+    break;
+  }
+
+  case X86::VPPERMrrm: {
+    if (!OutStreamer->isVerboseAsm())
+      break;
+    assert(MI->getNumOperands() >= 7 &&
+           "We should always have at least 7 operands!");
+
+    const MachineOperand &MaskOp = MI->getOperand(6);
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 16> Mask;
+      DecodeVPPERMMask(C, Mask);
+      if (!Mask.empty())
+        OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+    }
+    break;
+  }
+
+#define MOV_CASE(Prefix, Suffix)        \
+  case X86::Prefix##MOVAPD##Suffix##rm: \
+  case X86::Prefix##MOVAPS##Suffix##rm: \
+  case X86::Prefix##MOVUPD##Suffix##rm: \
+  case X86::Prefix##MOVUPS##Suffix##rm: \
+  case X86::Prefix##MOVDQA##Suffix##rm: \
+  case X86::Prefix##MOVDQU##Suffix##rm:
+
+#define MOV_AVX512_CASE(Suffix)         \
+  case X86::VMOVDQA64##Suffix##rm:      \
+  case X86::VMOVDQA32##Suffix##rm:      \
+  case X86::VMOVDQU64##Suffix##rm:      \
+  case X86::VMOVDQU32##Suffix##rm:      \
+  case X86::VMOVDQU16##Suffix##rm:      \
+  case X86::VMOVDQU8##Suffix##rm:       \
+  case X86::VMOVAPS##Suffix##rm:        \
+  case X86::VMOVAPD##Suffix##rm:        \
+  case X86::VMOVUPS##Suffix##rm:        \
+  case X86::VMOVUPD##Suffix##rm:
+
+#define CASE_ALL_MOV_RM()               \
+  MOV_CASE(, )   /* SSE */              \
+  MOV_CASE(V, )  /* AVX-128 */          \
+  MOV_CASE(V, Y) /* AVX-256 */          \
+  MOV_AVX512_CASE(Z)                    \
+  MOV_AVX512_CASE(Z256)                 \
+  MOV_AVX512_CASE(Z128)
+
+  // For loads from a constant pool to a vector register, print the constant
+  // loaded.
+  CASE_ALL_MOV_RM()
+    if (!OutStreamer->isVerboseAsm())
+      break;
+    if (MI->getNumOperands() <= 4)
+      break;
+    if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+      std::string Comment;
+      raw_string_ostream CS(Comment);
+      const MachineOperand &DstOp = MI->getOperand(0);
+      CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+      if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+        CS << "[";
+        for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+          if (i != 0)
+            CS << ",";
+          if (CDS->getElementType()->isIntegerTy())
+            CS << CDS->getElementAsInteger(i);
+          else if (CDS->getElementType()->isFloatTy())
+            CS << CDS->getElementAsFloat(i);
+          else if (CDS->getElementType()->isDoubleTy())
+            CS << CDS->getElementAsDouble(i);
+          else
+            CS << "?";
+        }
+        CS << "]";
+        OutStreamer->AddComment(CS.str());
+      } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+        CS << "<";
+        for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+          if (i != 0)
+            CS << ",";
+          Constant *COp = CV->getOperand(i);
+          if (isa<UndefValue>(COp)) {
+            CS << "u";
+          } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+            if (CI->getBitWidth() <= 64) {
+              CS << CI->getZExtValue();
+            } else {
+              // print multi-word constant as (w0,w1)
+              const auto &Val = CI->getValue();
+              CS << "(";
+              for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+                if (i > 0)
+                  CS << ",";
+                CS << Val.getRawData()[i];
+              }
+              CS << ")";
+            }
+          } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+            SmallString<32> Str;
+            CF->getValueAPF().toString(Str);
+            CS << Str;
+          } else {
+            CS << "?";
+          }
+        }
+        CS << ">";
+        OutStreamer->AddComment(CS.str());
+      }
+    }
+    break;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+
+  // Stackmap shadows cannot include branch targets, so we can count the bytes
+  // in a call towards the shadow, but must ensure that the no thread returns
+  // in to the stackmap shadow.  The only way to achieve this is if the call
+  // is at the end of the shadow.
+  if (MI->isCall()) {
+    // Count then size of the call towards the shadow
+    SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
+    // Then flush the shadow so that we fill with nops before the call, not
+    // after it.
+    SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
+    // Then emit the call
+    OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
+    return;
+  }
+
+  EmitAndCountInstruction(TmpInst);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..c9e636f1eb00
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,33 @@
+//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+  if (!RestoreBasePointerOffset) {
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
+    unsigned SlotSize = RegInfo->getSlotSize();
+    for (const MCPhysReg *CSR =
+      RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
+      unsigned Reg = *CSR;
+       ++CSR)
+    {
+      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+        RestoreBasePointerOffset -= SlotSize;
+    }
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 000000000000..d517d82537a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,185 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// ForceFramePointer - True if the function is required to use of frame
+  /// pointer for reasons other than it containing dynamic allocation or
+  /// that FP eliminatation is turned off. For example, Cygwin main function
+  /// contains stack pointer re-alignment code which requires FP.
+  bool ForceFramePointer = false;
+
+  /// RestoreBasePointerOffset - Non-zero if the function has base pointer
+  /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+  /// displacement from the frame pointer to a slot where the base pointer
+  /// is stashed.
+  signed char RestoreBasePointerOffset = 0;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize = 0;
+
+  /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+  /// to the space used by the return address).
+  /// Used on windows platform for stdcall & fastcall name decoration
+  unsigned BytesToPopOnReturn = 0;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex = 0;
+
+  /// \brief FrameIndex for return slot.
+  int FrameAddrIndex = 0;
+
+  /// TailCallReturnAddrDelta - The number of bytes by which return address
+  /// stack slot is moved as the result of tail call optimization.
+  int TailCallReturnAddrDelta = 0;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg = 0;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg = 0;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex = 0;
+  /// RegSaveFrameIndex - X86-64 vararg func register save area.
+  int RegSaveFrameIndex = 0;
+  /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+  unsigned VarArgsGPOffset = 0;
+  /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+  unsigned VarArgsFPOffset = 0;
+  /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+  /// being passed on the stack.
+  unsigned ArgumentStackSize = 0;
+  /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+  unsigned NumLocalDynamics = 0;
+  /// HasPushSequences - Keeps track of whether this function uses sequences
+  /// of pushes to pass function parameters.
+  bool HasPushSequences = false;
+
+  /// True if the function recovers from an SEH exception, and therefore needs
+  /// to spill and restore the frame pointer.
+  bool HasSEHFramePtrSave = false;
+
+  /// The frame index of a stack object containing the original frame pointer
+  /// used to address arguments in a function using a base pointer.
+  int SEHFramePtrSaveIndex = 0;
+
+  /// True if this function has a subset of CSRs that is handled explicitly via
+  /// copies.
+  bool IsSplitCSR = false;
+
+  /// True if this function uses the red zone.
+  bool UsesRedZone = false;
+
+  /// True if this function has WIN_ALLOCA instructions.
+  bool HasWinAlloca = false;
+
+private:
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
+
+public:
+  X86MachineFunctionInfo() = default;
+
+  explicit X86MachineFunctionInfo(MachineFunction &MF) {}
+
+  bool getForceFramePointer() const { return ForceFramePointer;}
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  bool getHasPushSequences() const { return HasPushSequences; }
+  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
+  bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
+  void setRestoreBasePointer(const MachineFunction *MF);
+  int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getFAIndex() const { return FrameAddrIndex; }
+  void setFAIndex(int Index) { FrameAddrIndex = Index; }
+
+  int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+  void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+  int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+  void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+  unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+  void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+  unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+  void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+  bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; }
+  void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; }
+
+  int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
+  void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+
+  SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
+
+  bool isSplitCSR() const { return IsSplitCSR; }
+  void setIsSplitCSR(bool s) { IsSplitCSR = s; }
+
+  bool getUsesRedZone() const { return UsesRedZone; }
+  void setUsesRedZone(bool V) { UsesRedZone = V; }
+
+  bool hasWinAlloca() const { return HasWinAlloca; }
+  void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
new file mode 100644
index 000000000000..e1447006cd18
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -0,0 +1,645 @@
+//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that performs some optimizations with LEA
+// instructions in order to improve performance and code size.
+// Currently, it does two things:
+// 1) If there are two LEA instructions calculating addresses which only differ
+//    by displacement inside a basic block, one of them is removed.
+// 2) Address calculations in load and store instructions are replaced by
+//    existing LEA def registers where possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-optimize-LEAs"
+
+static cl::opt<bool>
+    DisableX86LEAOpt("disable-x86-lea-opt", cl::Hidden,
+                     cl::desc("X86: Disable LEA optimizations."),
+                     cl::init(false));
+
+STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
+
+/// \brief Returns true if two machine operands are identical and they are not
+/// physical registers.
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+                                 const MachineOperand &MO2);
+
+/// \brief Returns true if two address displacement operands are of the same
+/// type and use the same symbol/index/address regardless of the offset.
+static bool isSimilarDispOp(const MachineOperand &MO1,
+                            const MachineOperand &MO2);
+
+/// \brief Returns true if the instruction is LEA.
+static inline bool isLEA(const MachineInstr &MI);
+
+namespace {
+/// A key based on instruction's memory operands.
+class MemOpKey {
+public:
+  MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
+           const MachineOperand *Index, const MachineOperand *Segment,
+           const MachineOperand *Disp)
+      : Disp(Disp) {
+    Operands[0] = Base;
+    Operands[1] = Scale;
+    Operands[2] = Index;
+    Operands[3] = Segment;
+  }
+
+  bool operator==(const MemOpKey &Other) const {
+    // Addresses' bases, scales, indices and segments must be identical.
+    for (int i = 0; i < 4; ++i)
+      if (!isIdenticalOp(*Operands[i], *Other.Operands[i]))
+        return false;
+
+    // Addresses' displacements don't have to be exactly the same. It only
+    // matters that they use the same symbol/index/address. Immediates' or
+    // offsets' differences will be taken care of during instruction
+    // substitution.
+    return isSimilarDispOp(*Disp, *Other.Disp);
+  }
+
+  // Address' base, scale, index and segment operands.
+  const MachineOperand *Operands[4];
+
+  // Address' displacement operand.
+  const MachineOperand *Disp;
+};
+} // end anonymous namespace
+
+/// Provide DenseMapInfo for MemOpKey.
+namespace llvm {
+template <> struct DenseMapInfo<MemOpKey> {
+  typedef DenseMapInfo<const MachineOperand *> PtrInfo;
+
+  static inline MemOpKey getEmptyKey() {
+    return MemOpKey(PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+                    PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
+                    PtrInfo::getEmptyKey());
+  }
+
+  static inline MemOpKey getTombstoneKey() {
+    return MemOpKey(PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+                    PtrInfo::getTombstoneKey(), PtrInfo::getTombstoneKey(),
+                    PtrInfo::getTombstoneKey());
+  }
+
+  static unsigned getHashValue(const MemOpKey &Val) {
+    // Checking any field of MemOpKey is enough to determine if the key is
+    // empty or tombstone.
+    assert(Val.Disp != PtrInfo::getEmptyKey() && "Cannot hash the empty key");
+    assert(Val.Disp != PtrInfo::getTombstoneKey() &&
+           "Cannot hash the tombstone key");
+
+    hash_code Hash = hash_combine(*Val.Operands[0], *Val.Operands[1],
+                                  *Val.Operands[2], *Val.Operands[3]);
+
+    // If the address displacement is an immediate, it should not affect the
+    // hash so that memory operands which differ only be immediate displacement
+    // would have the same hash. If the address displacement is something else,
+    // we should reflect symbol/index/address in the hash.
+    switch (Val.Disp->getType()) {
+    case MachineOperand::MO_Immediate:
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_JumpTableIndex:
+      Hash = hash_combine(Hash, Val.Disp->getIndex());
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Hash = hash_combine(Hash, Val.Disp->getSymbolName());
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Hash = hash_combine(Hash, Val.Disp->getGlobal());
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Hash = hash_combine(Hash, Val.Disp->getBlockAddress());
+      break;
+    case MachineOperand::MO_MCSymbol:
+      Hash = hash_combine(Hash, Val.Disp->getMCSymbol());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      Hash = hash_combine(Hash, Val.Disp->getMBB());
+      break;
+    default:
+      llvm_unreachable("Invalid address displacement operand");
+    }
+
+    return (unsigned)Hash;
+  }
+
+  static bool isEqual(const MemOpKey &LHS, const MemOpKey &RHS) {
+    // Checking any field of MemOpKey is enough to determine if the key is
+    // empty or tombstone.
+    if (RHS.Disp == PtrInfo::getEmptyKey())
+      return LHS.Disp == PtrInfo::getEmptyKey();
+    if (RHS.Disp == PtrInfo::getTombstoneKey())
+      return LHS.Disp == PtrInfo::getTombstoneKey();
+    return LHS == RHS;
+  }
+};
+}
+
+/// \brief Returns a hash table key based on memory operands of \p MI. The
+/// number of the first memory operand of \p MI is specified through \p N.
+static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
+  assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+         "The instruction must be a LEA, a load or a store");
+  return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg),
+                  &MI.getOperand(N + X86::AddrScaleAmt),
+                  &MI.getOperand(N + X86::AddrIndexReg),
+                  &MI.getOperand(N + X86::AddrSegmentReg),
+                  &MI.getOperand(N + X86::AddrDisp));
+}
+
+static inline bool isIdenticalOp(const MachineOperand &MO1,
+                                 const MachineOperand &MO2) {
+  return MO1.isIdenticalTo(MO2) &&
+         (!MO1.isReg() ||
+          !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+}
+
+#ifndef NDEBUG
+static bool isValidDispOp(const MachineOperand &MO) {
+  return MO.isImm() || MO.isCPI() || MO.isJTI() || MO.isSymbol() ||
+         MO.isGlobal() || MO.isBlockAddress() || MO.isMCSymbol() || MO.isMBB();
+}
+#endif
+
+static bool isSimilarDispOp(const MachineOperand &MO1,
+                            const MachineOperand &MO2) {
+  assert(isValidDispOp(MO1) && isValidDispOp(MO2) &&
+         "Address displacement operand is not valid");
+  return (MO1.isImm() && MO2.isImm()) ||
+         (MO1.isCPI() && MO2.isCPI() && MO1.getIndex() == MO2.getIndex()) ||
+         (MO1.isJTI() && MO2.isJTI() && MO1.getIndex() == MO2.getIndex()) ||
+         (MO1.isSymbol() && MO2.isSymbol() &&
+          MO1.getSymbolName() == MO2.getSymbolName()) ||
+         (MO1.isGlobal() && MO2.isGlobal() &&
+          MO1.getGlobal() == MO2.getGlobal()) ||
+         (MO1.isBlockAddress() && MO2.isBlockAddress() &&
+          MO1.getBlockAddress() == MO2.getBlockAddress()) ||
+         (MO1.isMCSymbol() && MO2.isMCSymbol() &&
+          MO1.getMCSymbol() == MO2.getMCSymbol()) ||
+         (MO1.isMBB() && MO2.isMBB() && MO1.getMBB() == MO2.getMBB());
+}
+
+static inline bool isLEA(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+         Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+namespace {
+class OptimizeLEAPass : public MachineFunctionPass {
+public:
+  OptimizeLEAPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "X86 LEA Optimize"; }
+
+  /// \brief Loop over all of the basic blocks, replacing address
+  /// calculations in load and store instructions, if it's already
+  /// been calculated by LEA. Also, remove redundant LEAs.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
+
+  /// \brief Returns a distance between two instructions inside one basic block.
+  /// Negative result means, that instructions occur in reverse order.
+  int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);
+
+  /// \brief Choose the best \p LEA instruction from the \p List to replace
+  /// address calculation in \p MI instruction. Return the address displacement
+  /// and the distance between \p MI and the chosen \p BestLEA in
+  /// \p AddrDispShift and \p Dist.
+  bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+                     const MachineInstr &MI, MachineInstr *&BestLEA,
+                     int64_t &AddrDispShift, int &Dist);
+
+  /// \brief Returns the difference between addresses' displacements of \p MI1
+  /// and \p MI2. The numbers of the first memory operands for the instructions
+  /// are specified through \p N1 and \p N2.
+  int64_t getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+                           const MachineInstr &MI2, unsigned N2) const;
+
+  /// \brief Returns true if the \p Last LEA instruction can be replaced by the
+  /// \p First. The difference between displacements of the addresses calculated
+  /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper
+  /// replacement of the \p Last LEA's uses with the \p First's def register.
+  bool isReplaceable(const MachineInstr &First, const MachineInstr &Last,
+                     int64_t &AddrDispShift) const;
+
+  /// \brief Find all LEA instructions in the basic block. Also, assign position
+  /// numbers to all instructions in the basic block to speed up calculation of
+  /// distance between them.
+  void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
+
+  /// \brief Removes redundant address calculations.
+  bool removeRedundantAddrCalc(MemOpMap &LEAs);
+
+  /// \brief Removes LEAs which calculate similar addresses.
+  bool removeRedundantLEAs(MemOpMap &LEAs);
+
+  DenseMap<const MachineInstr *, unsigned> InstrPos;
+
+  MachineRegisterInfo *MRI;
+  const X86InstrInfo *TII;
+  const X86RegisterInfo *TRI;
+
+  static char ID;
+};
+char OptimizeLEAPass::ID = 0;
+}
+
+FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+
+int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+                                   const MachineInstr &Last) {
+  // Both instructions must be in the same basic block and they must be
+  // presented in InstrPos.
+  assert(Last.getParent() == First.getParent() &&
+         "Instructions are in different basic blocks");
+  assert(InstrPos.find(&First) != InstrPos.end() &&
+         InstrPos.find(&Last) != InstrPos.end() &&
+         "Instructions' positions are undefined");
+
+  return InstrPos[&Last] - InstrPos[&First];
+}
+
+// Find the best LEA instruction in the List to replace address recalculation in
+// MI. Such LEA must meet these requirements:
+// 1) The address calculated by the LEA differs only by the displacement from
+//    the address used in MI.
+// 2) The register class of the definition of the LEA is compatible with the
+//    register class of the address base register of MI.
+// 3) Displacement of the new memory operand should fit in 1 byte if possible.
+// 4) The LEA should be as close to MI as possible, and prior to it if
+//    possible.
+bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
+                                    const MachineInstr &MI,
+                                    MachineInstr *&BestLEA,
+                                    int64_t &AddrDispShift, int &Dist) {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const MCInstrDesc &Desc = MI.getDesc();
+  int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
+                X86II::getOperandBias(Desc);
+
+  BestLEA = nullptr;
+
+  // Loop over all LEA instructions.
+  for (auto DefMI : List) {
+    // Get new address displacement.
+    int64_t AddrDispShiftTemp = getAddrDispShift(MI, MemOpNo, *DefMI, 1);
+
+    // Make sure address displacement fits 4 bytes.
+    if (!isInt<32>(AddrDispShiftTemp))
+      continue;
+
+    // Check that LEA def register can be used as MI address base. Some
+    // instructions can use a limited set of registers as address base, for
+    // example MOV8mr_NOREX. We could constrain the register class of the LEA
+    // def to suit MI, however since this case is very rare and hard to
+    // reproduce in a test it's just more reliable to skip the LEA.
+    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
+        MRI->getRegClass(DefMI->getOperand(0).getReg()))
+      continue;
+
+    // Choose the closest LEA instruction from the list, prior to MI if
+    // possible. Note that we took into account resulting address displacement
+    // as well. Also note that the list is sorted by the order in which the LEAs
+    // occur, so the break condition is pretty simple.
+    int DistTemp = calcInstrDist(*DefMI, MI);
+    assert(DistTemp != 0 &&
+           "The distance between two different instructions cannot be zero");
+    if (DistTemp > 0 || BestLEA == nullptr) {
+      // Do not update return LEA, if the current one provides a displacement
+      // which fits in 1 byte, while the new candidate does not.
+      if (BestLEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
+          isInt<8>(AddrDispShift))
+        continue;
+
+      BestLEA = DefMI;
+      AddrDispShift = AddrDispShiftTemp;
+      Dist = DistTemp;
+    }
+
+    // FIXME: Maybe we should not always stop at the first LEA after MI.
+    if (DistTemp < 0)
+      break;
+  }
+
+  return BestLEA != nullptr;
+}
+
+// Get the difference between the addresses' displacements of the two
+// instructions \p MI1 and \p MI2. The numbers of the first memory operands are
+// passed through \p N1 and \p N2.
+int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
+                                          const MachineInstr &MI2,
+                                          unsigned N2) const {
+  const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
+  const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
+
+  assert(isSimilarDispOp(Op1, Op2) &&
+         "Address displacement operands are not compatible");
+
+  // After the assert above we can be sure that both operands are of the same
+  // valid type and use the same symbol/index/address, thus displacement shift
+  // calculation is rather simple.
+  if (Op1.isJTI())
+    return 0;
+  return Op1.isImm() ? Op1.getImm() - Op2.getImm()
+                     : Op1.getOffset() - Op2.getOffset();
+}
+
+// Check that the Last LEA can be replaced by the First LEA. To be so,
+// these requirements must be met:
+// 1) Addresses calculated by LEAs differ only by displacement.
+// 2) Def registers of LEAs belong to the same class.
+// 3) All uses of the Last LEA def register are replaceable, thus the
+//    register is used only as address base.
+bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
+                                    const MachineInstr &Last,
+                                    int64_t &AddrDispShift) const {
+  assert(isLEA(First) && isLEA(Last) &&
+         "The function works only with LEA instructions");
+
+  // Get new address displacement.
+  AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
+  // Make sure that LEA def registers belong to the same class. There may be
+  // instructions (like MOV8mr_NOREX) which allow a limited set of registers to
+  // be used as their operands, so we must be sure that replacing one LEA
+  // with another won't lead to putting a wrong register in the instruction.
+  if (MRI->getRegClass(First.getOperand(0).getReg()) !=
+      MRI->getRegClass(Last.getOperand(0).getReg()))
+    return false;
+
+  // Loop over all uses of the Last LEA to check that its def register is
+  // used only as address base for memory accesses. If so, it can be
+  // replaced, otherwise - no.
+  for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) {
+    MachineInstr &MI = *MO.getParent();
+
+    // Get the number of the first memory operand.
+    const MCInstrDesc &Desc = MI.getDesc();
+    int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+
+    // If the use instruction has no memory operand - the LEA is not
+    // replaceable.
+    if (MemOpNo < 0)
+      return false;
+
+    MemOpNo += X86II::getOperandBias(Desc);
+
+    // If the address base of the use instruction is not the LEA def register -
+    // the LEA is not replaceable.
+    if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO))
+      return false;
+
+    // If the LEA def register is used as any other operand of the use
+    // instruction - the LEA is not replaceable.
+    for (unsigned i = 0; i < MI.getNumOperands(); i++)
+      if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) &&
+          isIdenticalOp(MI.getOperand(i), MO))
+        return false;
+
+    // Check that the new address displacement will fit 4 bytes.
+    if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() &&
+        !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() +
+                   AddrDispShift))
+      return false;
+  }
+
+  return true;
+}
+
+void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
+  unsigned Pos = 0;
+  for (auto &MI : MBB) {
+    // Assign the position number to the instruction. Note that we are going to
+    // move some instructions during the optimization however there will never
+    // be a need to move two instructions before any selected instruction. So to
+    // avoid multiple positions' updates during moves we just increase position
+    // counter by two leaving a free space for instructions which will be moved.
+    InstrPos[&MI] = Pos += 2;
+
+    if (isLEA(MI))
+      LEAs[getMemOpKey(MI, 1)].push_back(const_cast<MachineInstr *>(&MI));
+  }
+}
+
+// Try to find load and store instructions which recalculate addresses already
+// calculated by some LEA and replace their memory operands with its def
+// register.
+bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
+  bool Changed = false;
+
+  assert(!LEAs.empty());
+  MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent();
+
+  // Process all instructions in basic block.
+  for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr &MI = *I++;
+
+    // Instruction must be load or store.
+    if (!MI.mayLoadOrStore())
+      continue;
+
+    // Get the number of the first memory operand.
+    const MCInstrDesc &Desc = MI.getDesc();
+    int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags);
+
+    // If instruction has no memory operand - skip it.
+    if (MemOpNo < 0)
+      continue;
+
+    MemOpNo += X86II::getOperandBias(Desc);
+
+    // Get the best LEA instruction to replace address calculation.
+    MachineInstr *DefMI;
+    int64_t AddrDispShift;
+    int Dist;
+    if (!chooseBestLEA(LEAs[getMemOpKey(MI, MemOpNo)], MI, DefMI, AddrDispShift,
+                       Dist))
+      continue;
+
+    // If LEA occurs before current instruction, we can freely replace
+    // the instruction. If LEA occurs after, we can lift LEA above the
+    // instruction and this way to be able to replace it. Since LEA and the
+    // instruction have similar memory operands (thus, the same def
+    // instructions for these operands), we can always do that, without
+    // worries of using registers before their defs.
+    if (Dist < 0) {
+      DefMI->removeFromParent();
+      MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
+      InstrPos[DefMI] = InstrPos[&MI] - 1;
+
+      // Make sure the instructions' position numbers are sane.
+      assert(((InstrPos[DefMI] == 1 &&
+               MachineBasicBlock::iterator(DefMI) == MBB->begin()) ||
+              InstrPos[DefMI] >
+                  InstrPos[&*std::prev(MachineBasicBlock::iterator(DefMI))]) &&
+             "Instruction positioning is broken");
+    }
+
+    // Since we can possibly extend register lifetime, clear kill flags.
+    MRI->clearKillFlags(DefMI->getOperand(0).getReg());
+
+    ++NumSubstLEAs;
+    DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););
+
+    // Change instruction operands.
+    MI.getOperand(MemOpNo + X86::AddrBaseReg)
+        .ChangeToRegister(DefMI->getOperand(0).getReg(), false);
+    MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1);
+    MI.getOperand(MemOpNo + X86::AddrIndexReg)
+        .ChangeToRegister(X86::NoRegister, false);
+    MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
+    MI.getOperand(MemOpNo + X86::AddrSegmentReg)
+        .ChangeToRegister(X86::NoRegister, false);
+
+    DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// Try to find similar LEAs in the list and replace one with another.
+bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
+  bool Changed = false;
+
+  // Loop over all entries in the table.
+  for (auto &E : LEAs) {
+    auto &List = E.second;
+
+    // Loop over all LEA pairs.
+    auto I1 = List.begin();
+    while (I1 != List.end()) {
+      MachineInstr &First = **I1;
+      auto I2 = std::next(I1);
+      while (I2 != List.end()) {
+        MachineInstr &Last = **I2;
+        int64_t AddrDispShift;
+
+        // LEAs should be in occurrence order in the list, so we can freely
+        // replace later LEAs with earlier ones.
+        assert(calcInstrDist(First, Last) > 0 &&
+               "LEAs must be in occurrence order in the list");
+
+        // Check that the Last LEA instruction can be replaced by the First.
+        if (!isReplaceable(First, Last, AddrDispShift)) {
+          ++I2;
+          continue;
+        }
+
+        // Loop over all uses of the Last LEA and update their operands. Note
+        // that the correctness of this has already been checked in the
+        // isReplaceable function.
+        for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
+                  UE = MRI->use_end();
+             UI != UE;) {
+          MachineOperand &MO = *UI++;
+          MachineInstr &MI = *MO.getParent();
+
+          // Get the number of the first memory operand.
+          const MCInstrDesc &Desc = MI.getDesc();
+          int MemOpNo =
+              X86II::getMemoryOperandNo(Desc.TSFlags) +
+              X86II::getOperandBias(Desc);
+
+          // Update address base.
+          MO.setReg(First.getOperand(0).getReg());
+
+          // Update address disp.
+          MachineOperand &Op = MI.getOperand(MemOpNo + X86::AddrDisp);
+          if (Op.isImm())
+            Op.setImm(Op.getImm() + AddrDispShift);
+          else if (!Op.isJTI())
+            Op.setOffset(Op.getOffset() + AddrDispShift);
+        }
+
+        // Since we can possibly extend register lifetime, clear kill flags.
+        MRI->clearKillFlags(First.getOperand(0).getReg());
+
+        ++NumRedundantLEAs;
+        DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
+
+        // By this moment, all of the Last LEA's uses must be replaced. So we
+        // can freely remove it.
+        assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+               "The LEA's def register must have no uses");
+        Last.eraseFromParent();
+
+        // Erase removed LEA from the list.
+        I2 = List.erase(I2);
+
+        Changed = true;
+      }
+      ++I1;
+    }
+  }
+
+  return Changed;
+}
+
+bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+
+  if (DisableX86LEAOpt || skipFunction(*MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  // Process all basic blocks.
+  for (auto &MBB : MF) {
+    MemOpMap LEAs;
+    InstrPos.clear();
+
+    // Find all LEA instructions in basic block.
+    findLEAs(MBB, LEAs);
+
+    // If current basic block has no LEAs, move on to the next one.
+    if (LEAs.empty())
+      continue;
+
+    // Remove redundant LEA instructions.
+    Changed |= removeRedundantLEAs(LEAs);
+
+    // Remove redundant address calculations. Do it only for -Os/-Oz since only
+    // a code size gain is expected from this part of the pass.
+    if (MF.getFunction()->optForSize())
+      Changed |= removeRedundantAddrCalc(LEAs);
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 000000000000..3069d1fd3497
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,219 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+  struct VisitedBBInfo {
+    // HasReturn - Whether the BB contains a return instruction
+    bool HasReturn;
+
+    // Cycles - Number of cycles until return if HasReturn is true, otherwise
+    // number of cycles until end of the BB
+    unsigned int Cycles;
+
+    VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+    VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+      : HasReturn(HasReturn), Cycles(Cycles) {}
+  };
+
+  struct PadShortFunc : public MachineFunctionPass {
+    static char ID;
+    PadShortFunc() : MachineFunctionPass(ID)
+                   , Threshold(4), STI(nullptr), TII(nullptr) {}
+
+    bool runOnMachineFunction(MachineFunction &MF) override;
+
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "X86 Atom pad short functions";
+    }
+
+  private:
+    void findReturns(MachineBasicBlock *MBB,
+                     unsigned int Cycles = 0);
+
+    bool cyclesUntilReturn(MachineBasicBlock *MBB,
+                           unsigned int &Cycles);
+
+    void addPadding(MachineBasicBlock *MBB,
+                    MachineBasicBlock::iterator &MBBI,
+                    unsigned int NOOPsToAdd);
+
+    const unsigned int Threshold;
+
+    // ReturnBBs - Maps basic blocks that return to the minimum number of
+    // cycles until the return, starting from the entry block.
+    DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+    // VisitedBBs - Cache of previously visited BBs.
+    DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+    const X86Subtarget *STI;
+    const TargetInstrInfo *TII;
+  };
+
+  char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+  return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  if (MF.getFunction()->optForSize()) {
+    return false;
+  }
+
+  STI = &MF.getSubtarget<X86Subtarget>();
+  if (!STI->padShortFunctions())
+    return false;
+
+  TII = STI->getInstrInfo();
+
+  // Search through basic blocks and mark the ones that have early returns
+  ReturnBBs.clear();
+  VisitedBBs.clear();
+  findReturns(&MF.front());
+
+  bool MadeChange = false;
+
+  MachineBasicBlock *MBB;
+  unsigned int Cycles = 0;
+
+  // Pad the identified basic blocks with NOOPs
+  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+       I != ReturnBBs.end(); ++I) {
+    MBB = I->first;
+    Cycles = I->second;
+
+    if (Cycles < Threshold) {
+      // BB ends in a return. Skip over any DBG_VALUE instructions
+      // trailing the terminator.
+      assert(MBB->size() > 0 &&
+             "Basic block should contain at least a RET but is empty");
+      MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+      while (ReturnLoc->isDebugValue())
+        --ReturnLoc;
+      assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+             "Basic block does not end with RET");
+
+      addPadding(MBB, ReturnLoc, Threshold - Cycles);
+      NumBBsPadded++;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// findReturn - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+  // If this BB has a return, note how many cycles it takes to get there.
+  bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+  if (Cycles >= Threshold)
+    return;
+
+  if (hasReturn) {
+    ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+    return;
+  }
+
+  // Follow branches in BB and look for returns
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+       I != MBB->succ_end(); ++I) {
+    if (*I == MBB)
+      continue;
+    findReturns(*I, Cycles);
+  }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+                                     unsigned int &Cycles) {
+  // Return cached result if BB was previously visited
+  DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+    = VisitedBBs.find(MBB);
+  if (it != VisitedBBs.end()) {
+    VisitedBBInfo BBInfo = it->second;
+    Cycles += BBInfo.Cycles;
+    return BBInfo.HasReturn;
+  }
+
+  unsigned int CyclesToEnd = 0;
+
+  for (MachineInstr &MI : *MBB) {
+    // Mark basic blocks with a return instruction. Calls to other
+    // functions do not count because the called function will be padded,
+    // if necessary.
+    if (MI.isReturn() && !MI.isCall()) {
+      VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+      Cycles += CyclesToEnd;
+      return true;
+    }
+
+    CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI);
+  }
+
+  VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+  Cycles += CyclesToEnd;
+  return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned int NOOPsToAdd) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  while (NOOPsToAdd-- > 0) {
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 000000000000..65f438f94b04
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,758 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+          cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(const Triple &TT)
+    : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP),
+                         X86_MC::getDwarfRegFlavour(TT, false),
+                         X86_MC::getDwarfRegFlavour(TT, true),
+                         (TT.isArch64Bit() ? X86::RIP : X86::EIP)) {
+  X86_MC::initLLVMToSEHAndCVRegMapping(this);
+
+  // Cache some information.
+  Is64Bit = TT.isArch64Bit();
+  IsWin64 = Is64Bit && TT.isOSWindows();
+
+  // Use a callee-saved register as the base pointer.  These registers must
+  // not conflict with any ABI requirements.  For example, in 32-bit mode PIC
+  // requires GOT in the EBX register before function calls via PLT GOT pointer.
+  if (Is64Bit) {
+    SlotSize = 8;
+    // This matches the simplified 32-bit pointer code in the data layout
+    // computation.
+    // FIXME: Should use the data layout?
+    bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32;
+    StackPtr = Use64BitReg ? X86::RSP : X86::ESP;
+    FramePtr = Use64BitReg ? X86::RBP : X86::EBP;
+    BasePtr = Use64BitReg ? X86::RBX : X86::EBX;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+    BasePtr = X86::ESI;
+  }
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  // ExeDepsFixer and PostRAScheduler require liveness.
+  return true;
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+  return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+                                       unsigned Idx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  // It behaves just like the sub_8bit_hi index.
+  if (!Is64Bit && Idx == X86::sub_8bit)
+    Idx = X86::sub_8bit_hi;
+
+  // Forward to TableGen's default version.
+  return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                          const TargetRegisterClass *B,
+                                          unsigned SubIdx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  if (!Is64Bit && SubIdx == X86::sub_8bit) {
+    A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+    if (!A)
+      return nullptr;
+  }
+  return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                                           const MachineFunction &MF) const {
+  // Don't allow super-classes of GR8_NOREX.  This class is only used after
+  // extracting sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
+  // to the full GR8 register class in 64-bit mode, so we cannot allow the
+  // reigster class inflation.
+  //
+  // The GR8_NOREX class is always used in a way that won't be constrained to a
+  // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+  // full GR8 class.
+  if (RC == &X86::GR8_NOREXRegClass)
+    return RC;
+
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+      // If AVX-512 isn't supported we should only inflate to these classes.
+      if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+        return Super;
+      break;
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // If VLX isn't supported we should only inflate to these classes.
+      if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+        return Super;
+      break;
+    case X86::VR128XRegClassID:
+    case X86::VR256XRegClassID:
+      // If VLX isn't support we shouldn't inflate to these classes.
+      if (Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+        return Super;
+      break;
+    case X86::FR32XRegClassID:
+    case X86::FR64XRegClassID:
+      // If AVX-512 isn't support we shouldn't inflate to these classes.
+      if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+        return Super;
+      break;
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR512RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                    unsigned Kind) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+  switch (Kind) {
+  default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+  case 0: // Normal GPRs.
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64RegClass;
+    // If the target is 64bit but we have been told to use 32bit addresses,
+    // we can still use 64-bit register as long as we know the high bits
+    // are zeros.
+    // Reflect that in the returned register class.
+    if (Is64Bit) {
+      // When the target also allows 64-bit frame pointer and we do have a
+      // frame, this is fine to use it for the address accesses as well.
+      const X86FrameLowering *TFI = getFrameLowering(MF);
+      return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
+                 ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
+                 : &X86::LOW32_ADDR_ACCESSRegClass;
+    }
+    return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64_NOSPRegClass;
+    // NOSP does not contain RIP, so no special case here.
+    return &X86::GR32_NOSPRegClass;
+  case 2: // NOREX GPRs.
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64_NOREXRegClass;
+    return &X86::GR32_NOREXRegClass;
+  case 3: // NOREX GPRs except the stack pointer (for encoding reasons).
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64_NOREX_NOSPRegClass;
+    // NOSP does not contain RIP, so no special case here.
+    return &X86::GR32_NOREX_NOSPRegClass;
+  case 4: // Available for tailcall (not callee-saved GPRs).
+    return getGPRsForTailCall(MF);
+  }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+  if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
+    return &X86::GR64_TCW64RegClass;
+  else if (Is64Bit)
+    return &X86::GR64_TCRegClass;
+
+  bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+  if (hasHipeCC)
+    return &X86::GR32RegClass;
+  return &X86::GR32_TCRegClass;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &X86::CCRRegClass) {
+    if (Is64Bit)
+      return &X86::GR64RegClass;
+    else
+      return &X86::GR32RegClass;
+  }
+  return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  const X86FrameLowering *TFI = getFrameLowering(MF);
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case X86::GR32RegClassID:
+    return 4 - FPDiff;
+  case X86::GR64RegClassID:
+    return 12 - FPDiff;
+  case X86::VR128RegClassID:
+    return Is64Bit ? 10 : 4;
+  case X86::VR64RegClassID:
+    return 4;
+  }
+}
+
+const MCPhysReg *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  assert(MF && "MachineFunction required");
+
+  const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+  bool HasSSE = Subtarget.hasSSE1();
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+  bool CallsEHReturn = MF->callsEHReturn();
+
+  switch (MF->getFunction()->getCallingConv()) {
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return CSR_NoRegs_SaveList;
+  case CallingConv::AnyReg:
+    if (HasAVX)
+      return CSR_64_AllRegs_AVX_SaveList;
+    return CSR_64_AllRegs_SaveList;
+  case CallingConv::PreserveMost:
+    return CSR_64_RT_MostRegs_SaveList;
+  case CallingConv::PreserveAll:
+    if (HasAVX)
+      return CSR_64_RT_AllRegs_AVX_SaveList;
+    return CSR_64_RT_AllRegs_SaveList;
+  case CallingConv::CXX_FAST_TLS:
+    if (Is64Bit)
+      return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ?
+             CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList;
+    break;
+  case CallingConv::Intel_OCL_BI: {
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_SaveList;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_SaveList;
+    break;
+  }
+  case CallingConv::HHVM:
+    return CSR_64_HHVM_SaveList;
+  case CallingConv::X86_RegCall:
+    if (Is64Bit) {
+      if (IsWin64) {
+        return (HasSSE ? CSR_Win64_RegCall_SaveList : 
+                         CSR_Win64_RegCall_NoSSE_SaveList);
+      } else {
+        return (HasSSE ? CSR_SysV64_RegCall_SaveList : 
+                         CSR_SysV64_RegCall_NoSSE_SaveList);
+      }
+    } else {
+      return (HasSSE ? CSR_32_RegCall_SaveList : 
+                       CSR_32_RegCall_NoSSE_SaveList);
+    }
+  case CallingConv::Cold:
+    if (Is64Bit)
+      return CSR_64_MostRegs_SaveList;
+    break;
+  case CallingConv::X86_64_Win64:
+    if (!HasSSE)
+      return CSR_Win64_NoSSE_SaveList;
+    return CSR_Win64_SaveList;
+  case CallingConv::X86_64_SysV:
+    if (CallsEHReturn)
+      return CSR_64EHRet_SaveList;
+    return CSR_64_SaveList;
+  case CallingConv::X86_INTR:
+    if (Is64Bit) {
+      if (HasAVX512)
+        return CSR_64_AllRegs_AVX512_SaveList;
+      if (HasAVX)
+        return CSR_64_AllRegs_AVX_SaveList;
+      return CSR_64_AllRegs_SaveList;
+    } else {
+      if (HasAVX512)
+        return CSR_32_AllRegs_AVX512_SaveList;
+      if (HasAVX)
+        return CSR_32_AllRegs_AVX_SaveList;
+      if (HasSSE)
+        return CSR_32_AllRegs_SSE_SaveList;
+      return CSR_32_AllRegs_SaveList;
+    }
+  default:
+    break;
+  }
+
+  if (Is64Bit) {
+    if (IsWin64) {
+      if (!HasSSE)
+        return CSR_Win64_NoSSE_SaveList;
+      return CSR_Win64_SaveList;
+    }
+    if (CallsEHReturn)
+      return CSR_64EHRet_SaveList;
+    if (Subtarget.getTargetLowering()->supportSwiftError() &&
+        MF->getFunction()->getAttributes().hasAttrSomewhere(
+            Attribute::SwiftError))
+      return CSR_64_SwiftError_SaveList;
+    return CSR_64_SaveList;
+  }
+  if (CallsEHReturn)
+    return CSR_32EHRet_SaveList;
+  return CSR_32_SaveList;
+}
+
+const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
+    const MachineFunction *MF) const {
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+      MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR())
+    return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
+  return nullptr;
+}
+
+const uint32_t *
+X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                      CallingConv::ID CC) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+  bool HasSSE = Subtarget.hasSSE1();
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
+
+  switch (CC) {
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return CSR_NoRegs_RegMask;
+  case CallingConv::AnyReg:
+    if (HasAVX)
+      return CSR_64_AllRegs_AVX_RegMask;
+    return CSR_64_AllRegs_RegMask;
+  case CallingConv::PreserveMost:
+    return CSR_64_RT_MostRegs_RegMask;
+  case CallingConv::PreserveAll:
+    if (HasAVX)
+      return CSR_64_RT_AllRegs_AVX_RegMask;
+    return CSR_64_RT_AllRegs_RegMask;
+  case CallingConv::CXX_FAST_TLS:
+    if (Is64Bit)
+      return CSR_64_TLS_Darwin_RegMask;
+    break;
+  case CallingConv::Intel_OCL_BI: {
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_RegMask;
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_RegMask;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_RegMask;
+    break;
+  }
+  case CallingConv::HHVM:
+    return CSR_64_HHVM_RegMask;
+  case CallingConv::X86_RegCall:
+    if (Is64Bit) {
+      if (IsWin64) { 
+        return (HasSSE ? CSR_Win64_RegCall_RegMask : 
+                         CSR_Win64_RegCall_NoSSE_RegMask);
+      } else {
+        return (HasSSE ? CSR_SysV64_RegCall_RegMask : 
+                         CSR_SysV64_RegCall_NoSSE_RegMask);
+      }
+    } else {
+      return (HasSSE ? CSR_32_RegCall_RegMask : 
+                       CSR_32_RegCall_NoSSE_RegMask);
+    }
+  case CallingConv::Cold:
+    if (Is64Bit)
+      return CSR_64_MostRegs_RegMask;
+    break;
+  case CallingConv::X86_64_Win64:
+    return CSR_Win64_RegMask;
+  case CallingConv::X86_64_SysV:
+    return CSR_64_RegMask;
+  case CallingConv::X86_INTR:
+    if (Is64Bit) {
+      if (HasAVX512)
+        return CSR_64_AllRegs_AVX512_RegMask;
+      if (HasAVX)
+        return CSR_64_AllRegs_AVX_RegMask;
+      return CSR_64_AllRegs_RegMask;
+    } else {
+      if (HasAVX512)
+        return CSR_32_AllRegs_AVX512_RegMask;
+      if (HasAVX)
+        return CSR_32_AllRegs_AVX_RegMask;
+      if (HasSSE)
+        return CSR_32_AllRegs_SSE_RegMask;
+      return CSR_32_AllRegs_RegMask;
+    }
+  default:
+    break;
+  }
+
+  // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+  // callsEHReturn().
+  if (Is64Bit) {
+    if (IsWin64)
+      return CSR_Win64_RegMask;
+    if (Subtarget.getTargetLowering()->supportSwiftError() &&
+        MF.getFunction()->getAttributes().hasAttrSomewhere(
+            Attribute::SwiftError))
+      return CSR_64_SwiftError_RegMask;
+    return CSR_64_RegMask;
+  }
+  return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+  return CSR_64_TLS_Darwin_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const X86FrameLowering *TFI = getFrameLowering(MF);
+
+  // Set the stack-pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the instruction pointer register and its aliases as reserved.
+  for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
+    Reserved.set(*I);
+
+  // Set the frame-pointer register and its aliases as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+         ++I)
+      Reserved.set(*I);
+  }
+
+  // Set the base-pointer register and its aliases as reserved if needed.
+  if (hasBasePointer(MF)) {
+    CallingConv::ID CC = MF.getFunction()->getCallingConv();
+    const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+    if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with"
+        "this calling convention.");
+
+    unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+    for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
+         I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Mark the floating point stack registers as reserved.
+  for (unsigned n = 0; n != 8; ++n)
+    Reserved.set(X86::ST0 + n);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    // These 8-bit registers are part of the x86-64 extension even though their
+    // super-registers are old 32-bits.
+    Reserved.set(X86::SIL);
+    Reserved.set(X86::DIL);
+    Reserved.set(X86::BPL);
+    Reserved.set(X86::SPL);
+
+    for (unsigned n = 0; n != 8; ++n) {
+      // R8, R9, ...
+      for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+
+      // XMM8, XMM9, ...
+      for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+  if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) {
+    for (unsigned n = 16; n != 32; ++n) {
+      for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  assert(checkAllSuperRegsMarked(Reserved,
+                                 {X86::SIL, X86::DIL, X86::BPL, X86::SPL}));
+  return Reserved;
+}
+
+void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
+  // Check if the EFLAGS register is marked as live-out. This shouldn't happen,
+  // because the calling convention defines the EFLAGS register as NOT
+  // preserved.
+  //
+  // Unfortunatelly the EFLAGS show up as live-out after branch folding. Adding
+  // an assert to track this and clear the register afterwards to avoid
+  // unnecessary crashes during release builds.
+  assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) &&
+         "EFLAGS are not live-out from a patchpoint.");
+
+  // Also clean other registers that don't need preserving (IP).
+  for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP})
+    Mask[Reg / 32] &= ~(1U << (Reg % 32));
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+static bool CantUseSP(const MachineFrameInfo &MFI) {
+  return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment();
+}
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+   const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+   if (!EnableBasePointer)
+     return false;
+
+   // When we need stack realignment, we can't address the stack from the frame
+   // pointer.  When we have dynamic allocas or stack-adjusting inline asm, we
+   // can't address variables from the stack pointer.  MS inline asm can
+   // reference locals while also adjusting the stack pointer.  When we can't
+   // use both the SP and the FP, we need a separate base pointer register.
+   bool CantUseFP = needsStackRealignment(MF);
+   return CantUseFP && CantUseSP(MFI);
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (!TargetRegisterInfo::canRealignStack(MF))
+    return false;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+
+  // If a base pointer is necessary.  Check that it isn't too late to reserve
+  // it.
+  if (CantUseSP(MFI))
+    return MRI->canReserveReg(BasePtr);
+  return true;
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+                                           unsigned Reg, int &FrameIdx) const {
+  // Since X86 defines assignCalleeSavedSpillSlots which always return true
+  // this function neither used nor tested.
+  llvm_unreachable("Unused function on X86. Otherwise need a test case.");
+}
+
+// tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction
+// of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'.
+// TODO: In this case we should be really trying first to entirely eliminate
+// this instruction which is a plain copy.
+static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
+  MachineInstr &MI = *II;
+  unsigned Opc = II->getOpcode();
+  // Check if this is a LEA of the form 'lea (%esp), %ebx'
+  if ((Opc != X86::LEA32r && Opc != X86::LEA64r && Opc != X86::LEA64_32r) ||
+      MI.getOperand(2).getImm() != 1 ||
+      MI.getOperand(3).getReg() != X86::NoRegister ||
+      MI.getOperand(4).getImm() != 0 ||
+      MI.getOperand(5).getReg() != X86::NoRegister)
+    return false;
+  unsigned BasePtr = MI.getOperand(1).getReg();
+  // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will
+  // be replaced with a 32-bit operand MOV which will zero extend the upper
+  // 32-bits of the super register.
+  if (Opc == X86::LEA64_32r)
+    BasePtr = getX86SubSuperRegister(BasePtr, 32);
+  unsigned NewDestReg = MI.getOperand(0).getReg();
+  const X86InstrInfo *TII =
+      MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
+  TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr,
+                   MI.getOperand(1).isKill());
+  MI.eraseFromParent();
+  return true;
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const X86FrameLowering *TFI = getFrameLowering(MF);
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned BasePtr;
+
+  unsigned Opc = MI.getOpcode();
+  bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm ||
+                    Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64;
+
+  if (hasBasePointer(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+  else if (needsStackRealignment(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+  else if (AfterFPPop)
+    BasePtr = StackPtr;
+  else
+    BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+  // LOCAL_ESCAPE uses a single offset, with no register. It only works in the
+  // simple FP case, and doesn't work with stack realignment. On 32-bit, the
+  // offset is from the traditional base pointer location.  On 64-bit, the
+  // offset is from the SP at the end of the prologue, not the FP location. This
+  // matches the behavior of llvm.frameaddress.
+  unsigned IgnoredFrameReg;
+  if (Opc == TargetOpcode::LOCAL_ESCAPE) {
+    MachineOperand &FI = MI.getOperand(FIOperandNum);
+    int Offset;
+    Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+    FI.ChangeToImmediate(Offset);
+    return;
+  }
+
+  // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit
+  // register as source operand, semantic is the same and destination is
+  // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
+  // Don't change BasePtr since it is used later for stack adjustment.
+  unsigned MachineBasePtr = BasePtr;
+  if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
+    MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
+
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with base register.  Add an offset to the offset.
+  MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false);
+
+  // Now add the frame object offset to the offset from EBP.
+  int FIOffset;
+  if (AfterFPPop) {
+    // Tail call jmp happens after FP is popped.
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+  } else
+    FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
+
+  if (BasePtr == StackPtr)
+    FIOffset += SPAdj;
+
+  // The frame index format for stackmaps and patchpoints is different from the
+  // X86 format. It only has a FI and an offset.
+  if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+    assert(BasePtr == FramePtr && "Expected the FP as base register");
+    int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  if (MI.getOperand(FIOperandNum+3).isImm()) {
+    // Offset is a 32-bit integer.
+    int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+    int Offset = FIOffset + Imm;
+    assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+           "Requesting 64-bit offset in 32-bit immediate!");
+    if (Offset != 0 || !tryOptimizeLEAtoMOV(II))
+      MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+  } else {
+    // Offset is symbolic. This is extremely rare.
+    uint64_t Offset = FIOffset +
+      (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+    MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+  }
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const X86FrameLowering *TFI = getFrameLowering(MF);
+  return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+unsigned
+X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
+  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+  unsigned FrameReg = getFrameRegister(MF);
+  if (Subtarget.isTarget64BitILP32())
+    FrameReg = getX86SubSuperRegister(FrameReg, 32);
+  return FrameReg;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 000000000000..58fa31e94fba
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,142 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+  class Triple;
+
+class X86RegisterInfo final : public X86GenRegisterInfo {
+private:
+  /// Is64Bit - Is the target 64-bits.
+  ///
+  bool Is64Bit;
+
+  /// IsWin64 - Is the target on of win64 flavours
+  ///
+  bool IsWin64;
+
+  /// SlotSize - Stack slot size in bytes.
+  ///
+  unsigned SlotSize;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  ///
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  ///
+  unsigned FramePtr;
+
+  /// BasePtr - X86 physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+public:
+  X86RegisterInfo(const Triple &TT);
+
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const;
+
+  /// Code Generation virtual methods...
+  ///
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  /// getMatchingSuperRegClass - Return a subclass of the specified register
+  /// class A so that each register in it has a sub-register of the
+  /// specified sub-register index which is in the specified register class B.
+  const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B,
+                           unsigned Idx) const override;
+
+  const TargetRegisterClass *
+  getSubClassWithSubReg(const TargetRegisterClass *RC,
+                        unsigned Idx) const override;
+
+  const TargetRegisterClass *
+  getLargestLegalSuperClass(const TargetRegisterClass *RC,
+                            const MachineFunction &MF) const override;
+
+  /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+  /// values.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// between a two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  /// getGPRsForTailCall - Returns a register class with registers that can be
+  /// used in forming tail calls.
+  const TargetRegisterClass *
+  getGPRsForTailCall(const MachineFunction &MF) const;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction* MF) const override;
+  const MCPhysReg *
+  getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+  const uint32_t *getNoPreservedMask() const override;
+
+  // Calls involved in thread-local variable lookup save more registers than
+  // normal calls, so they need a different mask to represent this.
+  const uint32_t *getDarwinTLSCallPreservedMask() const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses and
+  /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+  /// register scavenger to determine what registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+  void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const override;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+                            int &FrameIdx) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+  unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
+  unsigned getStackRegister() const { return StackPtr; }
+  unsigned getBaseRegister() const { return BasePtr; }
+  // FIXME: Move to FrameInfok
+  unsigned getSlotSize() const { return SlotSize; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 000000000000..372a15aff15a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,530 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+  let Namespace = "X86";
+  let HWEncoding = Enc;
+  let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+  def sub_8bit    : SubRegIndex<8>;
+  def sub_8bit_hi : SubRegIndex<8, 8>;
+  def sub_16bit   : SubRegIndex<16>;
+  def sub_32bit   : SubRegIndex<32>;
+  def sub_xmm     : SubRegIndex<128>;
+  def sub_ymm     : SubRegIndex<256>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others.  We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliased AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
+// and debug information on X86-32/Darwin)
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL  : X86Reg<"sil",   6>;
+def DIL  : X86Reg<"dil",   7>;
+def BPL  : X86Reg<"bpl",   5>;
+def SPL  : X86Reg<"spl",   4>;
+def R8B  : X86Reg<"r8b",   8>;
+def R9B  : X86Reg<"r9b",   9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W  : X86Reg<"r8w",   8, [R8B]>;
+def R9W  : X86Reg<"r9w",   9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D  : X86Reg<"r8d",   8, [R8W]>;
+def R9D  : X86Reg<"r9d",   9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8  : X86Reg<"r8",   8, [R8D]>,  DwarfRegNum<[ 8, -2, -2]>;
+def R9  : X86Reg<"r9",   9, [R9D]>,  DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip",  0, [EIP]>,  DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8:  X86Reg<"xmm8",   8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9:  X86Reg<"xmm9",   9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16:  X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17:  X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18:  X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19:  X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20:  X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21:  X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22:  X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23:  X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24:  X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25:  X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26:  X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27:  X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28:  X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29:  X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30:  X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31:  X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
+} // CostPerUse
+
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_xmm] in {
+  foreach  Index = 0-31 in {
+    def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+  foreach  Index = 0-31 in {
+    def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
+}
+
+// Mask Registers, used by AVX-512 instructions.
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118,  93,  93]>;
+def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119,  94,  94]>;
+def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120,  95,  95]>;
+def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121,  96,  96]>;
+def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122,  97,  97]>;
+def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123,  98,  98]>;
+def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124,  99,  99]>;
+def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>;
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0  : X86Reg<"dr0",   0>;
+def DR1  : X86Reg<"dr1",   1>;
+def DR2  : X86Reg<"dr2",   2>;
+def DR3  : X86Reg<"dr3",   3>;
+def DR4  : X86Reg<"dr4",   4>;
+def DR5  : X86Reg<"dr5",   5>;
+def DR6  : X86Reg<"dr6",   6>;
+def DR7  : X86Reg<"dr7",   7>;
+def DR8  : X86Reg<"dr8",   8>;
+def DR9  : X86Reg<"dr9",   9>;
+def DR10 : X86Reg<"dr10", 10>;
+def DR11 : X86Reg<"dr11", 11>;
+def DR12 : X86Reg<"dr12", 12>;
+def DR13 : X86Reg<"dr13", 13>;
+def DR14 : X86Reg<"dr14", 14>;
+def DR15 : X86Reg<"dr15", 15>;
+
+// Control registers
+def CR0  : X86Reg<"cr0",   0>;
+def CR1  : X86Reg<"cr1",   1>;
+def CR2  : X86Reg<"cr2",   2>;
+def CR3  : X86Reg<"cr3",   3>;
+def CR4  : X86Reg<"cr4",   4>;
+def CR5  : X86Reg<"cr5",   5>;
+def CR6  : X86Reg<"cr6",   6>;
+def CR7  : X86Reg<"cr7",   7>;
+def CR8  : X86Reg<"cr8",   8>;
+def CR9  : X86Reg<"cr9",   9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+// Bound registers, used in MPX instructions
+def BND0 : X86Reg<"bnd0",   0>;
+def BND1 : X86Reg<"bnd1",   1>;
+def BND2 : X86Reg<"bnd2",   2>;
+def BND3 : X86Reg<"bnd3",   3>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                             R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+  let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         (add AX, CX, DX, SI, DI, BX, BP, SP,
+                              R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+                         (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                              R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra
+// tests because of the inclusion of RIP in this register class.
+def GR64 : RegisterClass<"X86", [i64], 64,
+                         (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                              RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+//   segment register as one operand.  Always contain a 16-bit segment
+//   descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+                                                     R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+                                                      R8, R9, R10, R11, RIP)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+                              (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+  let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+                               (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+                               (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+                            (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+                                    (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+                                    (and GR64_NOREX, GR64_NOSP)>;
+
+// Register classes used for ABIs that use 32-bit address accesses,
+// while using the whole x84_64 ISA.
+
+// In such cases, it is fine to use RIP as we are sure the 32 high
+// bits are not set. We do not need variants for NOSP as RIP is not
+// allowed there.
+// RIP is not spilled anywhere for now, so stick to 32-bit alignment
+// to save on memory space.
+// FIXME: We could allow all 64bit registers, but we would need
+// something to check that the 32 high bits are not set,
+// which we do not have right now.
+def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
+
+// When RBP is used as a base pointer in a 32-bit addresses environement,
+// this is also safe to use the full register to access addresses.
+// Since RBP will never be spilled, stick to a 32 alignment to save
+// on memory consumption.
+def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
+                                          (add LOW32_ADDR_ACCESS, RBP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+  let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+// Ensure that float types are declared first - only float is legal on SSE1.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+                          128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+                          256, (sequence "YMM%u", 0, 15)>;
+
+// Special classes that help the assembly parser choose some alternate
+// instructions to favor 2-byte VEX encodings.
+def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+                           128, (sequence "XMM%u", 0, 7)>;
+def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+                           128, (sequence "XMM%u", 8, 15)>;
+def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+                           256, (sequence "YMM%u", 0, 7)>;
+def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+                           256, (sequence "YMM%u", 8, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64],
+                          512, (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64],
+                           128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
+                           256, (sequence "YMM%u", 0, 31)>;
+
+// Mask registers
+def VK1     : RegisterClass<"X86", [i1],    16,  (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2     : RegisterClass<"X86", [v2i1],  16,  (add VK1)> {let Size = 16;}
+def VK4     : RegisterClass<"X86", [v4i1],  16,  (add VK2)> {let Size = 16;}
+def VK8     : RegisterClass<"X86", [v8i1],  16,  (add VK4)> {let Size = 16;}
+def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32    : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64    : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+def VK1WM   : RegisterClass<"X86", [i1],    16,  (sub VK1, K0)> {let Size = 16;}
+def VK2WM   : RegisterClass<"X86", [v2i1],  16,  (sub VK2, K0)> {let Size = 16;}
+def VK4WM   : RegisterClass<"X86", [v4i1],  16,  (sub VK4, K0)> {let Size = 16;}
+def VK8WM   : RegisterClass<"X86", [v8i1],  16,  (sub VK8, K0)> {let Size = 16;}
+def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>   {let Size = 16;}
+def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+
+// Bound registers
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 000000000000..677e82459766
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,2147 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+  // instructions per cycle.
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 192; // Based on the reorder buffer.
+  let LoadLatency = 4;
+  let MispredictPenalty = 16;
+
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort01  : ProcResGroup<[HWPort0, HWPort1]>;
+def HWPort23  : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort04  : ProcResGroup<[HWPort0, HWPort4]>;
+def HWPort05  : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort06  : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort15  : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort16  : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort56  : ProcResGroup<[HWPort5, HWPort6]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+                              HWPort5, HWPort6, HWPort7]> {
+  let BufferSize=60;
+}
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+// Store_addr on 237.
+// Store_data on 4.
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad,  [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [HWPort0156]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : HWWriteResPair<WriteALU,   HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul,  HWPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : HWWriteResPair<WriteShift, HWPort06,  1>;
+defm : HWWriteResPair<WriteJump,  HWPort06,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd,   HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv,   HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt,  HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFShuffle,  HWPort5,  1>;
+defm : HWWriteResPair<WriteFBlend,  HWPort015,  1>;
+defm : HWWriteResPair<WriteFShuffle256,  HWPort5,  3>;
+
+def : WriteRes<WriteFVarBlend, [HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort0,  1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU,   HWPort15,  1>;
+defm : HWWriteResPair<WriteVecIMul,  HWPort0,   5>;
+defm : HWWriteResPair<WriteShuffle,  HWPort5,  1>;
+defm : HWWriteResPair<WriteBlend,  HWPort15,  1>;
+defm : HWWriteResPair<WriteShuffle256,  HWPort5,  3>;
+
+def : WriteRes<WriteVarBlend, [HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 2];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [HWPort0]> {
+  let Latency = 10;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> {
+  let Latency = 10;
+  let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [6, 2, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [HWPort0]> {
+  let Latency = 11;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> {
+  let Latency = 11;
+  let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [HWPort5]> {
+  let Latency = 7;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [HWPort5]> {
+  let Latency = 14;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> {
+  let Latency = 14;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> {
+  let Latency = 7;
+  let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [2, 1, 1];
+}
+
+def : WriteRes<WriteSystem,     [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteFence,  [HWPort23, HWPort4]>;
+def : WriteRes<WriteNop, []>;
+
+//================ Exceptions ================//
+
+//-- Specific Scheduling Models --//
+
+// Starting with P0.
+def WriteP0 : SchedWriteRes<[HWPort0]>;
+
+def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP01 : SchedWriteRes<[HWPort01]>;
+
+def Write2P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 2;
+}
+def Write3P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 3;
+}
+
+def WriteP015 : SchedWriteRes<[HWPort015]>;
+
+def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
+  let NumMicroOps = 2;
+}
+def WriteP06 : SchedWriteRes<[HWPort06]>;
+
+def Write2P06 : SchedWriteRes<[HWPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+
+def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+
+def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 2;
+}
+
+def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+
+def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+def Write5P0156 : SchedWriteRes<[HWPort0156]> {
+  let NumMicroOps = 5;
+  let ResourceCycles = [5];
+}
+
+def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [1, 2, 1];
+}
+
+def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [2, 2, 1];
+}
+
+def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [3, 2, 1];
+}
+
+// Starting with P1.
+def WriteP1 : SchedWriteRes<[HWPort1]>;
+
+def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+  let NumMicroOps = 2;
+}
+def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+}
+def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 7;
+}
+
+def Write2P1 : SchedWriteRes<[HWPort1]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def WriteP15 : SchedWriteRes<[HWPort15]>;
+def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
+  let Latency = 4;
+}
+
+def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+// Starting with P2.
+def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [2, 1];
+}
+
+// Starting with P5.
+def WriteP5 : SchedWriteRes<[HWPort5]>;
+def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+// Notation:
+// - r: register.
+// - mm: 64 bit mmx register.
+// - x = 128 bit xmm register.
+// - (x)mm = mmx or xmm register.
+// - y = 256 bit ymm register.
+// - v = any vector register.
+// - m = memory.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// CMOVcc.
+// r,r.
+def : InstRW<[Write2P0156_Lat2],
+      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
+      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
+
+// XCHG.
+// r,r.
+def WriteXCHG : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
+  let ResourceCycles = [3];
+}
+
+def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def WriteXCHGrm : SchedWriteRes<[]> {
+  let Latency = 21;
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+
+// XLAT.
+def WriteXLAT : SchedWriteRes<[]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteXLAT], (instregex "XLAT")>;
+
+// PUSH.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF.
+def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def WritePushA : SchedWriteRes<[]> {
+  let NumMicroOps = 19;
+}
+def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
+
+// POP.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+
+// POPF.
+def WritePopF : SchedWriteRes<[]> {
+  let NumMicroOps = 9;
+}
+def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
+
+// POPA.
+def WritePopA : SchedWriteRes<[]> {
+  let NumMicroOps = 18;
+}
+def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+
+// LAHF SAHF.
+def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
+
+// BSWAP.
+// r32.
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
+def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
+
+// r64.
+def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
+
+// MOVBE.
+// r16,m16 / r64,m64.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
+
+// r32, m32.
+def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
+
+// m16,r16.
+def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
+
+// m32,r32.
+def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
+
+// m64,r64.
+def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+              (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+              "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// r,r/i.
+def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
+                           "(ADC|SBB)(16|32|64)ri8",
+                           "(ADC|SBB)64ri32",
+                           "(ADC|SBB)(8|16|32|64)rr_REV")>;
+
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
+
+// m,r/i.
+def : InstRW<[Write3P0156_2P237_P4],
+             (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+              "(ADC|SBB)(16|32|64)mi8",
+              "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteP0156_2P237_P4],
+             (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
+              "(INC|DEC)64(16|32)m")>;
+
+// MUL IMUL.
+// r16.
+def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
+
+// m16.
+def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
+
+// r32.
+def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
+
+// m32.
+def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
+
+// r64.
+def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
+
+// m64.
+def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
+
+// r16,r16.
+def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+
+// r16,m16.
+def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+
+// MULX.
+// r32,r32,r32.
+def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
+
+// r32,r32,m32.
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
+
+// r64,r64,r64.
+def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
+
+// r64,r64,m64.
+def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
+
+// DIV.
+// r8.
+def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+
+// r16.
+def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
+
+// r32.
+def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
+
+// r64.
+def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 32;
+  let NumMicroOps = 36;
+}
+def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
+
+// IDIV.
+// r8.
+def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+
+// r16.
+def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
+
+// r32.
+def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
+
+// r64.
+def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 39;
+  let NumMicroOps = 59;
+}
+def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+             (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+              "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// SHR SHL SAR.
+// m,i.
+def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
+  let NumMicroOps = 6;
+  let ResourceCycles = [3, 2, 1];
+}
+def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
+
+// ROR ROL.
+// r,1.
+def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
+
+// m,i.
+def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 2, 1];
+}
+def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteRotateRMWCL : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
+
+// RCR RCL.
+// r,1.
+def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
+
+// m,1.
+def WriteRCm1 : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
+
+// r,i.
+def WriteRCri : SchedWriteRes<[HWPort0156]> {
+  let Latency = 6;
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+
+// m,i.
+def WriteRCmi : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+
+// SHRD SHLD.
+// r,r,i.
+def WriteShDrr : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+}
+def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
+
+// m,r,i.
+def WriteShDmr : SchedWriteRes<[]> {
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
+
+// r,r,cl.
+def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+// m,r,cl.
+def WriteShDmrCL : SchedWriteRes<[]> {
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+// BT.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTmr : SchedWriteRes<[]> {
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTRSCmr : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
+
+// BSF BSR.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
+
+// SETcc.
+// r.
+def : InstRW<[WriteShift],
+             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
+// m.
+def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteSetCCm],
+             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+
+// CLD STD.
+def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
+
+// LZCNT TZCNT.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>;
+
+// ANDN.
+// r,r.
+def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
+
+// BEXTR.
+// r,r,r.
+def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
+// r,m,r.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
+
+// BZHI.
+// r,r,r.
+def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+
+// LOOP.
+def WriteLOOP : SchedWriteRes<[]> {
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteLOOP], (instregex "LOOP")>;
+
+// LOOP(N)E
+def WriteLOOPE : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
+
+// CALL.
+// r.
+def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
+
+// m.
+def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
+
+// RET.
+def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
+
+// i.
+def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def WriteBOUND : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def WriteINTO : SchedWriteRes<[]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteINTO], (instregex "INTO")>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// STOS.
+def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
+
+// MOVS.
+def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
+
+// CMPS.
+def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 3];
+}
+def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Synchronization instructions --//
+
+// XADD.
+def WriteXADD : SchedWriteRes<[]> {
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
+
+// CMPXCHG.
+def WriteCMPXCHG : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def WriteCMPXCHG8B : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+// CMPXCHG16B.
+def WriteCMPXCHG16B : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
+
+//-- Other --//
+
+// PAUSE.
+def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
+  let NumMicroOps = 5;
+  let ResourceCycles = [1, 3];
+}
+def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
+
+// LEAVE.
+def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
+
+// XGETBV.
+def WriteXGETBV : SchedWriteRes<[]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
+
+// RDTSC.
+def WriteRDTSC : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
+
+// RDPMC.
+def WriteRDPMC : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+
+// RDRAND.
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+  let NumMicroOps = 17;
+  let ResourceCycles = [1, 16];
+}
+def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// m80.
+def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+
+def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 2];
+}
+def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+// m80.
+def WriteFBLD : SchedWriteRes<[]> {
+  let Latency = 47;
+  let NumMicroOps = 43;
+}
+def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
+  let NumMicroOps = 7;
+  let ResourceCycles = [3, 2, 2];
+}
+def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def WriteFBSTP : SchedWriteRes<[]> {
+  let NumMicroOps = 226;
+}
+def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
+
+// FXCHG.
+def : InstRW<[WriteNop], (instregex "XCH_F")>;
+
+// FILD.
+def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
+
+// FLDZ.
+def : InstRW<[WriteP01], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[Write2P01], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
+
+// FCMOVcc.
+def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
+
+// m16.
+def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
+
+// FFREE.
+def : InstRW<[WriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def WriteFNSAVE : SchedWriteRes<[]> {
+  let NumMicroOps = 147;
+}
+def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def WriteFRSTOR : SchedWriteRes<[]> {
+  let NumMicroOps = 90;
+}
+def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+// FABS.
+def : InstRW<[WriteP0], (instregex "ABS_F")>;
+
+// FCHS.
+def : InstRW<[WriteP0], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
+                         "UCOM_FPr")>;
+// m.
+def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+
+// FCOMI(P) FUCOMI(P).
+// m.
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
+                           "UCOM_FIPr")>;
+
+// FICOM(P).
+def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+
+// FTST.
+def : InstRW<[WriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Write2P1], (instregex "FXAM")>;
+
+// FPREM.
+def WriteFPREM : SchedWriteRes<[]> {
+  let Latency = 19;
+  let NumMicroOps = 28;
+}
+def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+
+// FPREM1.
+def WriteFPREM1 : SchedWriteRes<[]> {
+  let Latency = 27;
+  let NumMicroOps = 41;
+}
+def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+
+// FRNDINT.
+def WriteFRNDINT : SchedWriteRes<[]> {
+  let Latency = 11;
+  let NumMicroOps = 17;
+}
+def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def WriteFSCALE : SchedWriteRes<[]> {
+  let Latency = 75; // 49-125
+  let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+
+// FXTRACT.
+def WriteFXTRACT : SchedWriteRes<[]> {
+  let Latency = 15;
+  let NumMicroOps = 17;
+}
+def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
+
+//-- Other instructions --//
+
+// FNOP.
+def : InstRW<[WriteP01], (instregex "FNOP")>;
+
+// WAIT.
+def : InstRW<[Write2P01], (instregex "WAIT")>;
+
+// FNCLEX.
+def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
+
+// FNINIT.
+def WriteFNINIT : SchedWriteRes<[]> {
+  let NumMicroOps = 26;
+}
+def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+
+//=== Integer MMX and XMM Instructions ===//
+//-- Move instructions --//
+
+// MOVD.
+// r32/64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
+                         "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+
+// (x)mm <- r32/64.
+def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
+                         "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+
+// MOVQ.
+// r64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
+
+// (x)mm <- r64.
+def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+
+// (x)mm <- (x)mm.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
+
+// (V)MOVDQA/U.
+// x <- x.
+def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
+                           "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
+                           "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+
+// MOVDQ2Q.
+def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
+
+// MOVQ2DQ.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
+
+
+// PACKSSWB/DW.
+// mm <- mm.
+def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
+                                  "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+
+// mm <- m64.
+def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 3];
+}
+def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
+                                  "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+
+// VPMOVSX/ZX BW BD BQ DW DQ.
+// y <- x.
+def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
+def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
+
+// x,m,i / v,v,m,i
+def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
+  let NumMicroOps = 2;
+  let Latency = 4;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
+
+// VPBLENDD.
+// v,v,v,i.
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
+def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
+
+// v,v,m,i
+def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+  let NumMicroOps = 2;
+  let Latency = 4;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
+
+// MASKMOVQ.
+def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 2];
+}
+def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 10;
+  let ResourceCycles = [4, 2, 4];
+}
+def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOV D/Q.
+// v,v,m.
+def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
+                               (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+
+// m, v,v.
+def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// PMOVMSKB.
+def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+}
+def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+
+// PEXTR B/W/D/Q.
+// r32,x,i.
+def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+
+// m8,x,i.
+def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
+                                     (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
+                                     (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHERDD.
+// x.
+def WriteVPGATHERDD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
+
+// y.
+def WriteVPGATHERDD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
+
+// VPGATHERQD.
+// x.
+def WriteVPGATHERQD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
+
+// y.
+def WriteVPGATHERQD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
+
+// VPGATHERDQ.
+// x.
+def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+
+// y.
+def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
+
+// VPGATHERQQ.
+// x.
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
+  let NumMicroOps = 14;
+}
+def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
+
+// y.
+def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+
+//-- Arithmetic instructions --//
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
+                               "MMX_PHADDSWrr64",
+                               "MMX_PHSUB(W|D)rr64",
+                               "MMX_PHSUBSWrr64",
+                               "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
+                               "(V?)PH(ADD|SUB)SWrr(256)?")>;
+
+// v <- v,m.
+def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WritePHADDSUBm, ReadAfterLd],
+                              (instregex "MMX_PHADD(W?)rm64",
+                               "MMX_PHADDSWrm64",
+                               "MMX_PHSUB(W|D)rm64",
+                               "MMX_PHSUBSWrm64",
+                               "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
+                               "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+
+// PCMPGTQ.
+// v <- v,v.
+def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// v <- v,m.
+def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+
+// PMULLD.
+// x,x / y,y,y.
+def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+
+// x,m / y,y,m.
+def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+
+//-- Logic instructions --//
+
+// PTEST.
+// v,v.
+def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+
+// v,m.
+def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+
+//-- Other --//
+
+// EMMS.
+def WriteEMMS : SchedWriteRes<[]> {
+  let Latency = 13;
+  let NumMicroOps = 31;
+}
+def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x.
+def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+}
+def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+
+// r32 <- y.
+def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+  let Latency = 2;
+}
+def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+// VBROADCASTF128.
+def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+// m32,x,i.
+def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+
+// y,y,m128,i.
+def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// v,v,m.
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+
+// m128,x,x.
+def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def WriteVGATHERDPD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+
+// y.
+def WriteVGATHERDPD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 14;
+}
+def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+
+// y.
+def WriteVGATHERQPD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+
+//-- Conversion instructions --//
+
+// CVTPD2PS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVSTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
+
+// CVTSD2SI.
+// r32/64
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MULL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19; // 18-21 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23; // 18-21 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 27; // 19-35 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 31; // 19-35 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+  let Latency = 18;
+  let NumMicroOps = 6;
+  let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+
+// x,m,i.
+def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+
+// VFMADD.
+// v,v,v.
+def WriteFMADDr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WriteFMADDr],
+    (instregex
+    // 3p forms.
+    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
+    // 3s forms.
+    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
+    // 4s/4s_int forms.
+    "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
+    // 4p forms.
+    "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+
+// v,v,m.
+def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteFMADDm],
+    (instregex
+    // 3p forms.
+    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
+    // 3s forms.
+    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
+    // 4s/4s_int forms.
+    "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
+    // 4p forms.
+    "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+
+//-- Math instructions --//
+
+// VSQRTPS.
+// y,y.
+def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+
+// y,m256.
+def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+
+// VSQRTPD.
+// y,y.
+def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 28;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+
+// y,m256.
+def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 32;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+
+// RSQRT SS/PS.
+// x,x.
+def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+
+// x,m128.
+def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+
+// RSQRTPS 256.
+// y,y.
+def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+
+// y,m256.
+def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+
+//-- Logic instructions --//
+
+// AND, ANDN, OR, XOR PS/PD.
+// x,x / v,v,v.
+def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
+// x,m / v,v,m.
+def : InstRW<[WriteP5Ld, ReadAfterLd],
+                         (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def WriteVZEROUPPER : SchedWriteRes<[]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+
+// VZEROALL.
+def WriteVZEROALL : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+
+// LDMXCSR.
+def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+
+// STMXCSR.
+def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 000000000000..eca65c2892b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,250 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+  // instructions per cycle.
+  // FIXME: Identify instructions that aren't a single fused micro-op.
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 168; // Based on the reorder buffer.
+  let LoadLatency = 4;
+  let MispredictPenalty = 16;
+
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05  : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15  : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+  let BufferSize=54;
+}
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad,  [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [SBPort015]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SBWriteResPair<WriteALU,   SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul,  SBPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
+defm : SBWriteResPair<WriteShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteJump,  SBPort5,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd,   SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv,   SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt,  SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+defm : SBWriteResPair<WriteFShuffle,  SBPort5,  1>;
+defm : SBWriteResPair<WriteFBlend,  SBPort05,  1>;
+def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU,   SBPort15,  1>;
+defm : SBWriteResPair<WriteVecIMul,  SBPort0,   5>;
+defm : SBWriteResPair<WriteShuffle,  SBPort15,  1>;
+defm : SBWriteResPair<WriteBlend,  SBPort15,  1>;
+def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
+  let Latency = 2;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1];
+}
+def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SBPort015]> {
+  let Latency = 11;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SBPort015]> {
+  let Latency = 11;
+  let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
+  let Latency = 11;
+  let ResourceCycles = [7, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SBPort015]> {
+  let Latency = 3;
+  let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
+  let Latency = 3;
+  let ResourceCycles = [3, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SBPort015]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+}
+def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
+  let Latency = 4;
+  let ResourceCycles = [7, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESIMC, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [SBPort015]> {
+  let Latency = 8;
+  let ResourceCycles = [11];
+}
+def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
+  let Latency = 8;
+  let ResourceCycles = [10, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SBPort015]> {
+  let Latency = 14;
+  let ResourceCycles = [18];
+}
+def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
+  let Latency = 14;
+  let ResourceCycles = [17, 1];
+}
+
+
+def : WriteRes<WriteSystem,     [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX2 is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+defm : SBWriteResPair<WriteFShuffle256, SBPort0,  1>;
+defm : SBWriteResPair<WriteShuffle256, SBPort0,  1>;
+defm : SBWriteResPair<WriteVarVecShift, SBPort0,  1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 000000000000..35257f89100c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,661 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU  : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def  WriteIMulH : SchedWrite;       // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def  WriteLEA  : SchedWrite;        // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad  : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove  : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd   : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul   : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv   : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt  : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp   : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA    : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle  : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend  : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend  : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU   : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul  : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle  : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend  : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend  : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody use.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful expect it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM     : InstrItinClass;
+def IIC_ALU_NONMEM  : InstrItinClass;
+def IIC_LEA         : InstrItinClass;
+def IIC_LEA_16      : InstrItinClass;
+def IIC_MUL8        : InstrItinClass;
+def IIC_MUL16_MEM   : InstrItinClass;
+def IIC_MUL16_REG   : InstrItinClass;
+def IIC_MUL32_MEM   : InstrItinClass;
+def IIC_MUL32_REG   : InstrItinClass;
+def IIC_MUL64       : InstrItinClass;
+// imul by al, ax, eax, tax
+def IIC_IMUL8       : InstrItinClass;
+def IIC_IMUL16_MEM  : InstrItinClass;
+def IIC_IMUL16_REG  : InstrItinClass;
+def IIC_IMUL32_MEM  : InstrItinClass;
+def IIC_IMUL32_REG  : InstrItinClass;
+def IIC_IMUL64      : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM   : InstrItinClass;
+def IIC_IMUL16_RR   : InstrItinClass;
+def IIC_IMUL32_RM   : InstrItinClass;
+def IIC_IMUL32_RR   : InstrItinClass;
+def IIC_IMUL64_RM   : InstrItinClass;
+def IIC_IMUL64_RR   : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI  : InstrItinClass;
+def IIC_IMUL16_RRI  : InstrItinClass;
+def IIC_IMUL32_RMI  : InstrItinClass;
+def IIC_IMUL32_RRI  : InstrItinClass;
+def IIC_IMUL64_RMI  : InstrItinClass;
+def IIC_IMUL64_RRI  : InstrItinClass;
+// div
+def IIC_DIV8_MEM    : InstrItinClass;
+def IIC_DIV8_REG    : InstrItinClass;
+def IIC_DIV16       : InstrItinClass;
+def IIC_DIV32       : InstrItinClass;
+def IIC_DIV64       : InstrItinClass;
+// idiv
+def IIC_IDIV8       : InstrItinClass;
+def IIC_IDIV16      : InstrItinClass;
+def IIC_IDIV32      : InstrItinClass;
+def IIC_IDIV64      : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG   : InstrItinClass;
+def IIC_UNARY_MEM   : InstrItinClass;
+// add/sub/and/or/xor/sbc/cmp/test
+def IIC_BIN_MEM     : InstrItinClass;
+def IIC_BIN_NONMEM  : InstrItinClass;
+// adc/sbc
+def IIC_BIN_CARRY_MEM     : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM  : InstrItinClass;
+// shift/rotate
+def IIC_SR          : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+//sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+//zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR  : InstrItinClass;
+def IIC_SSE_BIT_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR  : InstrItinClass;
+def IIC_SSE_INTALU_P_RM  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
+
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+def IIC_SSE_MWAITX : InstrItinClass;
+def IIC_SSE_MONITORX : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+def IIC_AES   : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW   : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendents, including Nehalem and SandyBridge have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are bufferred
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericX86Model contains no instruction itineraries
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 32;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let PostRAScheduler = 0;
+  let CompleteModel = 0;
+}
+
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+  let PostRAScheduler = 1;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 000000000000..a5b440182aa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,550 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// in order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+//    Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+                      // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+                      // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+  [ Port0, Port1 ],
+  [], [
+  // P0 only
+  // InstrItinData<class, [InstrStage<N, [P0]>] >,
+  // P0 or P1
+  // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+  // P0 and P1
+  // InstrItinData<class, [InstrStage<N, [P0], 0>,  InstrStage<N, [P1]>] >,
+  //
+  // Default is 1 cycle, port0 or port1
+  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+  // mul
+  InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul by al, ax, eax, rax
+  InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg by reg|mem
+  InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg = reg/mem * imm
+  InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+  // idiv
+  InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // div
+  InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // neg/not/inc/dec
+  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+  // add/sub/and/or/xor/cmp/test
+  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // adc/sbc
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
+  // shift/rotate
+  InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+  // shift double
+  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+  // cmov
+  InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // set
+  InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+  // jcc
+  InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+  // jcxz/jecxz/jrcxz
+  InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+  // jmp rel
+  InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+  // jmp indirect
+  InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  // jmp far
+  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+  // loop/loope/loopne
+  InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+  // call - all but reg/imm
+  InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+                              InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+  //ret
+  InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+  InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>,  InstrStage<1, [Port1]>] >,
+  //sign extension movs
+  InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
+  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+  //zero extension movs
+  InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+  InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+  // SSE binary operations
+  // arithmetic fp scalar
+  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // arithmetic fp parallel
+  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+  // bitwise parallel
+  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+  // arithmetic int parallel
+  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+  // multiply int parallel
+  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+  // shift parallel
+  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+  // conversions
+  // to/from PD ...
+  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // to/from PS except to/from PD and PS2PI
+  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // MMX MOVs
+  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // other MMX
+  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+  // conversions
+  // from/to PD
+  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // from/to PI
+  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+                                    InstrStage<5, [Port1]>]>,
+
+  InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_FLD,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FST,   [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FIST,  [InstrStage<6, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FLDZ,   [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FUCOM,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMI,  [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDCW,  [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+  InstrItinData<IIC_FFREE,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_WAIT,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXAM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FNOP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDL,  [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_F2XM1,  [InstrStage<99, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2X,  [InstrStage<146, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPTAN,  [InstrStage<168, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPATAN,  [InstrStage<183, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXTRACT,  [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM1,  [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPSTP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM,  [InstrStage<55, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2XP1,  [InstrStage<147, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSINCOS,  [InstrStage<174, [Port0, Port1]>] >,
+  InstrItinData<IIC_FRNDINT,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSCALE,  [InstrStage<77, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FXSAVE,  [InstrStage<140, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXRSTOR,  [InstrStage<141, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT,   [InstrStage<127, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT3,  [InstrStage<130, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVD,  [InstrStage<1003, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_IRET,  [InstrStage<109, [Port0, Port1]>] >,
+  InstrItinData<IIC_HLT,   [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_LXS,   [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_LTR,   [InstrStage<83, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+  InstrItinData<IIC_RSM,   [InstrStage<741, [Port0, Port1]>] >,
+  InstrItinData<IIC_SIDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SGDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SLDT,  [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STR,    [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_IN_RR,  [InstrStage<94, [Port0, Port1]>] >,
+  InstrItinData<IIC_IN_RI,  [InstrStage<92, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+  InstrItinData<IIC_INS,    [InstrStage<59, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+  // worst case for mov REG_CRx
+  InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+  // LAR
+  InstrItinData<IIC_LAR_RM,  [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_LAR_RR,  [InstrStage<54, [Port0, Port1]>] >,
+  // LSL
+  InstrItinData<IIC_LSL_RM,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_LSL_RR,  [InstrStage<49, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+  // push control register, segment registers
+  InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+  // pop control register, segment registers
+  InstrItinData<IIC_POP_SR,    [InstrStage<29, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+  // VERR, VERW
+  InstrItinData<IIC_VERR,     [InstrStage<41, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+  // WRMSR, RDMSR
+  InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+  // SMSW, LMSW
+  InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+                            InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+  InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+  InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+  ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+  let PostRAScheduler = 1;
+  let CompleteModel = 0;
+
+  let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
new file mode 100644
index 000000000000..ce1ece34e431
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -0,0 +1,341 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. Based off AMD Software
+// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+def BtVer2Model : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and btver2 can
+  // decode 2 instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 64; // Retire Control Unit
+  let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency)
+  let HighLatency = 25;
+  let MispredictPenalty = 14; // Minimum branch misdirection penalty
+  let PostRAScheduler = 1;
+
+  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = BtVer2Model in {
+
+// Jaguar can issue up to 6 micro-ops in one cycle
+def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
+def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
+def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
+def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
+def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
+def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
+
+// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly
+def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+
+// Integer Pipe Scheduler
+def JALU01 : ProcResGroup<[JALU0, JALU1]> {
+  let BufferSize=20;
+}
+
+// AGU Pipe Scheduler
+def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
+  let BufferSize=12;
+}
+
+// Fpu Pipe Scheduler
+def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
+  let BufferSize=18;
+}
+
+def JDiv    : ProcResource<1>; // integer division
+def JMul    : ProcResource<1>; // integer multiplication
+def JVALU0  : ProcResource<1>; // vector integer
+def JVALU1  : ProcResource<1>; // vector integer
+def JVIMUL  : ProcResource<1>; // vector integer multiplication
+def JSTC    : ProcResource<1>; // vector store/convert
+def JFPM    : ProcResource<1>; // FP multiplication
+def JFPA    : ProcResource<1>; // FP addition
+
+// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+     let Latency = !add(Lat, 3);
+  }
+}
+
+multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+     let Latency = !add(Lat, 5);
+  }
+}
+
+// A folded store needs a cycle on the SAGU for the store data.
+def : WriteRes<WriteRMW, [JSAGU]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteALU,   JALU01, 1>;
+defm : JWriteResIntPair<WriteIMul,  JALU1,  3>;
+
+def  : WriteRes<WriteIMulH, [JALU1]> {
+  let Latency = 6;
+  let ResourceCycles = [4];
+}
+
+// FIXME 8/16 bit divisions
+def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
+  let Latency = 41;
+  let ResourceCycles = [1, 1, 25];
+}
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA, [JALU01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+// FIXME: Split x86 and SSE load/store/moves
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad,  [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteMove,  [JAny]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero,  []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteJump,  JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
+// FIXME: Double precision latencies
+// FIXME: SS vs PS latencies
+// FIXME: ymm latencies
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteFAdd,        JFPU0,  3>;
+defm : JWriteResFpuPair<WriteFMul,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRcp,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRsqrt,      JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFShuffle,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFBlend,     JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 21;
+  let ResourceCycles = [1, 1, 21];
+}
+def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 26;
+  let ResourceCycles = [1, 1, 21];
+}
+
+def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 19;
+  let ResourceCycles = [1, 1, 19];
+}
+def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 24;
+  let ResourceCycles = [1, 1, 19];
+}
+
+// FIXME: integer pipes
+defm : JWriteResFpuPair<WriteCvtF2I,    JFPU1,  3>; // Float -> Integer.
+defm : JWriteResFpuPair<WriteCvtI2F,    JFPU1,  3>; // Integer -> Float.
+defm : JWriteResFpuPair<WriteCvtF2F,    JFPU1,  3>; // Float -> Float size conversion.
+
+def : WriteRes<WriteFVarBlend, [JFPU01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 2];
+}
+
+// Vector integer operations.
+defm : JWriteResFpuPair<WriteVecALU,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecShift, JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecIMul,  JFPU0,   2>;
+defm : JWriteResFpuPair<WriteShuffle,  JFPU01,  1>;
+defm : JWriteResFpuPair<WriteBlend,    JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecLogic, JFPU01,  1>;
+defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteVarBlend, [JFPU01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 2];
+}
+
+// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
+def : WriteRes<WriteVarVecShift, [JFPU01]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [JFPU0]> {
+  let Latency = 3;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 2];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+// FIXME: approximate latencies + pipe dependencies
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WritePCmpIStrM, [JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> {
+  let Latency = 12;
+  let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [JFPU01]> {
+  let Latency = 13;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> {
+  let Latency = 18;
+  let ResourceCycles = [1, 5];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [JFPU01]> {
+  let Latency = 6;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> {
+  let Latency = 11;
+  let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [JFPU01]> {
+  let Latency = 13;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> {
+  let Latency = 18;
+  let ResourceCycles = [1, 5];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
+  let Latency = 3;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteCLMul, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+// FIXME: pipe for system/microcode?
+def : WriteRes<WriteSystem,     [JAny]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteFence,  [JSAGU]>;
+def : WriteRes<WriteNop, []>;
+} // SchedModel
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 000000000000..f95d4fa04177
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,233 @@
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SLMModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+  // instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 32; // Based on the reorder buffer.
+  let LoadLatency = 3;
+  let MispredictPenalty = 10;
+  let PostRAScheduler = 1;
+
+  // For small loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
+  // FIXME: SSE4 is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV  : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01  : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01  : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider      : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider    : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+     let Latency = !add(Lat, 3);
+  }
+}
+
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad,  [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove,  [IEC_RSV01]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SMWriteResPair<WriteALU,   IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul,  IEC_RSV1,  3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0,  1>;
+defm : SMWriteResPair<WriteJump,  IEC_RSV1,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 25];
+}
+
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd,   FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp,   FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt,  FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteFBlend,  FPC_RSV0,  1>;
+
+// This is quite rough, latency depends on precision
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 1, 2];
+}
+
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+  let Latency = 37;
+  let ResourceCycles = [1, 1, 34];
+}
+
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU,   FPC_RSV01,  1>;
+defm : SMWriteResPair<WriteVecIMul,  FPC_RSV0,   4>;
+defm : SMWriteResPair<WriteShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteBlend,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteMPSAD,  FPC_RSV0,  7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+  let Latency = 13;
+  let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 13;
+  let ResourceCycles = [13, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+  let Latency = 21;
+  let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 21;
+  let ResourceCycles = [21, 1];
+}
+
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+  let Latency = 10;
+  let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 10;
+  let ResourceCycles = [10, 1];
+}
+
+
+def : WriteRes<WriteSystem,     [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+def  : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0,  1>;
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..f031a281e5dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,283 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+
+bool X86SelectionDAGInfo::isBaseRegConflictPossible(
+    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
+  // We cannot use TRI->hasBasePointer() until *after* we select all basic
+  // blocks.  Legalization may introduce new stack temporaries with large
+  // alignment requirements.  Fall back to generic code if there are any
+  // dynamic stack adjustments (hopefully rare) and the base pointer would
+  // conflict if we had to use it.
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
+    return false;
+
+  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
+  unsigned BaseReg = TRI->getBaseRegister();
+  for (unsigned R : ClobberSet)
+    if (BaseReg == R)
+      return true;
+  return false;
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
+#ifndef NDEBUG
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
+#endif
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
+  if ((Align & 3) != 0 || !ConstantSize ||
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+    // Check to see if there is a specialized entry-point for memory zeroing.
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry = V &&
+        V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+      TargetLowering::ArgListTy Args;
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
+      Entry.Ty = IntPtrTy;
+      Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+
+      TargetLowering::CallLoweringInfo CLI(DAG);
+      CLI.setDebugLoc(dl).setChain(Chain)
+        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
+        .setDiscardResult();
+
+      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+      return CallResult.second;
+    }
+
+    // Otherwise have the target-independent code call memset.
+    return SDValue();
+  }
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InFlag;
+  EVT AVT;
+  SDValue Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+  unsigned BytesLeft = 0;
+  bool TwoRepStos = false;
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+      break;
+    }
+
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+      BytesLeft = SizeVal % UBytes;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count  = DAG.getIntPtrConstant(SizeVal, dl);
+    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+                           Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+                           Dst, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+  if (TwoRepStos) {
+    InFlag = Chain.getValue(1);
+    Count  = Size;
+    EVT CVT = Count.getValueType();
+    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl,
+                                               CVT));
+    Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+                             Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+  } else if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT AddrVT = Dst.getValueType();
+    EVT SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain, dl,
+                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                      DAG.getConstant(Offset, dl, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, dl, SizeVT),
+                          Align, isVolatile, false,
+                          DstPtrInfo.getWithOffset(Offset));
+  }
+
+  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+  return Chain;
+}
+
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
+
+  /// If not DWORD aligned, it is more efficient to call the library.  However
+  /// if calling the library is not allowed (AlwaysInline), then soldier on as
+  /// the code generated here is better than the long load-store sequence we
+  /// would otherwise get.
+  if (!AlwaysInline && (Align & 3) != 0)
+    return SDValue();
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256 ||
+      SrcPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+                                  X86::ECX, X86::ESI, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  MVT AVT;
+  if (Align & 1)
+    AVT = MVT::i8;
+  else if (Align & 2)
+    AVT = MVT::i16;
+  else if (Align & 4)
+    // DWORD aligned
+    AVT = MVT::i32;
+  else
+    // QWORD aligned
+    AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+
+  unsigned UBytes = AVT.getSizeInBits() / 8;
+  unsigned CountVal = SizeVal / UBytes;
+  SDValue Count = DAG.getIntPtrConstant(CountVal, dl);
+  unsigned BytesLeft = SizeVal % UBytes;
+
+  SDValue InFlag;
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
+                           Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
+                           Dst, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI,
+                           Src, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
+
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepMovs);
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT DstVT = Dst.getValueType();
+    EVT SrcVT = Src.getValueType();
+    EVT SizeVT = Size.getValueType();
+    Results.push_back(DAG.getMemcpy(Chain, dl,
+                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+                                                DAG.getConstant(Offset, dl,
+                                                                DstVT)),
+                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+                                                DAG.getConstant(Offset, dl,
+                                                                SrcVT)),
+                                    DAG.getConstant(BytesLeft, dl, SizeVT),
+                                    Align, isVolatile, AlwaysInline, false,
+                                    DstPtrInfo.getWithOffset(Offset),
+                                    SrcPtrInfo.getWithOffset(Offset)));
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 000000000000..f4a285a5f916
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,50 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
+  /// Returns true if it is possible for the base register to conflict with the
+  /// given set of clobbers for a memory intrinsic.
+  bool isBaseRegConflictPossible(SelectionDAG &DAG,
+                                 ArrayRef<MCPhysReg> ClobberSet) const;
+
+public:
+  explicit X86SelectionDAGInfo() = default;
+
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
new file mode 100644
index 000000000000..11115524c810
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -0,0 +1,333 @@
+//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecodeConstantPool.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
+                                SmallBitVector &UndefElts,
+                                SmallVectorImpl<uint64_t> &RawMask) {
+  // It is not an error for shuffle masks to not be a vector of
+  // MaskEltSizeInBits because the constant pool uniques constants by their
+  // bit representation.
+  // e.g. the following take up the same space in the constant pool:
+  //   i128 -170141183420855150465331762880109871104
+  //
+  //   <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+  //
+  //   <4 x i32> <i32 -2147483648, i32 -2147483648,
+  //              i32 -2147483648, i32 -2147483648>
+  Type *CstTy = C->getType();
+  if (!CstTy->isVectorTy())
+    return false;
+
+  Type *CstEltTy = CstTy->getVectorElementType();
+  if (!CstEltTy->isIntegerTy())
+    return false;
+
+  unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+  unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+  unsigned NumCstElts = CstTy->getVectorNumElements();
+
+  // Extract all the undef/constant element data and pack into single bitsets.
+  APInt UndefBits(CstSizeInBits, 0);
+  APInt MaskBits(CstSizeInBits, 0);
+  for (unsigned i = 0; i != NumCstElts; ++i) {
+    Constant *COp = C->getAggregateElement(i);
+    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+      return false;
+
+    if (isa<UndefValue>(COp)) {
+      APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
+      UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+      continue;
+    }
+
+    APInt EltBits = cast<ConstantInt>(COp)->getValue();
+    EltBits = EltBits.zextOrTrunc(CstSizeInBits);
+    MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+  }
+
+  // Now extract the undef/constant bit data into the raw shuffle masks.
+  assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+         "Unaligned shuffle mask size");
+
+  unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+  UndefElts = SmallBitVector(NumMaskElts, false);
+  RawMask.resize(NumMaskElts, 0);
+
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
+    EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+
+    // Only treat the element as UNDEF if all bits are UNDEF, otherwise
+    // treat it as zero.
+    if (EltUndef.isAllOnesValue()) {
+      UndefElts[i] = true;
+      RawMask[i] = 0;
+      continue;
+    }
+
+    APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
+    EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+    RawMask[i] = EltBits.getZExtValue();
+  }
+
+  return true;
+}
+
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+
+  // The shuffle mask requires a byte vector.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 32> RawMask;
+  if (!extractConstantMask(C, 8, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    uint64_t Element = RawMask[i];
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (Element & (1 << 7))
+      ShuffleMask.push_back(SM_SentinelZero);
+    else {
+      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+      // lane of the vector we're inside.
+      unsigned Base = i & ~0xf;
+
+      // Only the least significant 4 bits of the byte are used.
+      int Index = Base + (Element & 0xf);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
+
+  // The shuffle mask requires elements the same size as the target.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 8> RawMask;
+  if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+  unsigned NumEltsPerLane = 128 / ElSize;
+  assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    int Index = i & ~(NumEltsPerLane - 1);
+    uint64_t Element = RawMask[i];
+    if (ElSize == 64)
+      Index += (Element >> 1) & 0x1;
+    else
+      Index += Element & 0x3;
+
+    ShuffleMask.push_back(Index);
+  }
+}
+
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+
+  // The shuffle mask requires elements the same size as the target.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 8> RawMask;
+  if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+  unsigned NumEltsPerLane = 128 / ElSize;
+  assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
+         "Unexpected number of vector elements.");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // VPERMIL2 Operation.
+    // Bits[3] - Match Bit.
+    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
+    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
+    uint64_t Selector = RawMask[i];
+    unsigned MatchBit = (Selector >> 3) & 0x1;
+
+    // M2Z[0:1]     MatchBit
+    //   0Xb           X        Source selected by Selector index.
+    //   10b           0        Source selected by Selector index.
+    //   10b           1        Zero.
+    //   11b           0        Zero.
+    //   11b           1        Source selected by Selector index.
+    if ((M2Z & 0x2) != 0u && MatchBit != (M2Z & 0x1)) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+
+    int Index = i & ~(NumEltsPerLane - 1);
+    if (ElSize == 64)
+      Index += (Selector >> 1) & 0x1;
+    else
+      Index += Selector & 0x3;
+
+    int Src = (Selector >> 2) & 0x1;
+    Index += Src * NumElts;
+    ShuffleMask.push_back(Index);
+  }
+}
+
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
+         "Unexpected vector size.");
+
+  // The shuffle mask requires a byte vector.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 32> RawMask;
+  if (!extractConstantMask(C, 8, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+  assert(NumElts == 16 && "Unexpected number of vector elements.");
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // VPPERM Operation
+    // Bits[4:0] - Byte Index (0 - 31)
+    // Bits[7:5] - Permute Operation
+    //
+    // Permute Operation:
+    // 0 - Source byte (no logical operation).
+    // 1 - Invert source byte.
+    // 2 - Bit reverse of source byte.
+    // 3 - Bit reverse of inverted source byte.
+    // 4 - 00h (zero - fill).
+    // 5 - FFh (ones - fill).
+    // 6 - Most significant bit of source byte replicated in all bit positions.
+    // 7 - Invert most significant bit of source byte and replicate in all bit
+    // positions.
+    uint64_t Element = RawMask[i];
+    uint64_t Index = Element & 0x1F;
+    uint64_t PermuteOp = (Element >> 5) & 0x7;
+
+    if (PermuteOp == 4) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+    if (PermuteOp != 0) {
+      ShuffleMask.clear();
+      return;
+    }
+    ShuffleMask.push_back((int)Index);
+  }
+}
+
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+         "Unexpected vector element size.");
+
+  // The shuffle mask requires elements the same size as the target.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 8> RawMask;
+  if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    int Index = RawMask[i] & (NumElts - 1);
+    ShuffleMask.push_back(Index);
+  }
+}
+
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+         "Unexpected vector size.");
+  assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
+         "Unexpected vector element size.");
+
+  // The shuffle mask requires elements the same size as the target.
+  SmallBitVector UndefElts;
+  SmallVector<uint64_t, 8> RawMask;
+  if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
+    return;
+
+  unsigned NumElts = RawMask.size();
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    int Index = RawMask[i] & (NumElts*2 - 1);
+    ShuffleMask.push_back(Index);
+  }
+}
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
new file mode 100644
index 000000000000..b703cbbd2b29
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -0,0 +1,52 @@
+//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+/// Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+                        SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMILP2 variable mask from an IR-level vector constant.
+void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+                         SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPPERM variable mask from an IR-level vector constant.
+void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+                       SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 000000000000..727ff70c3ff6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,365 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+               cl::desc("Enable early if-conversion on X86"));
+
+
+/// Classify a blockaddress reference for the current subtarget according to how
+/// we should reference it in a non-pcrel context.
+unsigned char X86Subtarget::classifyBlockAddressReference() const {
+  return classifyLocalReference(nullptr);
+}
+
+/// Classify a global variable reference for the current subtarget according to
+/// how we should reference it in a non-pcrel context.
+unsigned char
+X86Subtarget::classifyGlobalReference(const GlobalValue *GV) const {
+  return classifyGlobalReference(GV, *GV->getParent());
+}
+
+unsigned char
+X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
+  // 64 bits can use %rip addressing for anything local.
+  if (is64Bit())
+    return X86II::MO_NO_FLAG;
+
+  // If this is for a position dependent executable, the static linker can
+  // figure it out.
+  if (!isPositionIndependent())
+    return X86II::MO_NO_FLAG;
+
+  // The COFF dynamic linker just patches the executable sections.
+  if (isTargetCOFF())
+    return X86II::MO_NO_FLAG;
+
+  if (isTargetDarwin()) {
+    // 32 bit macho has no relocation for a-b if a is undefined, even if
+    // b is in the section that is being relocated.
+    // This means we have to use o load even for GVs that are known to be
+    // local to the dso.
+    if (GV && (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
+      return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+    return X86II::MO_PIC_BASE_OFFSET;
+  }
+
+  return X86II::MO_GOTOFF;
+}
+
+unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
+                                                    const Module &M) const {
+  // Large model never uses stubs.
+  if (TM.getCodeModel() == CodeModel::Large)
+    return X86II::MO_NO_FLAG;
+
+  // Absolute symbols can be referenced directly.
+  if (GV && GV->isAbsoluteSymbolRef())
+    return X86II::MO_NO_FLAG;
+
+  if (TM.shouldAssumeDSOLocal(M, GV))
+    return classifyLocalReference(GV);
+
+  if (isTargetCOFF())
+    return X86II::MO_DLLIMPORT;
+
+  if (is64Bit())
+    return X86II::MO_GOTPCREL;
+
+  if (isTargetDarwin()) {
+    if (!isPositionIndependent())
+      return X86II::MO_DARWIN_NONLAZY;
+    return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  }
+
+  return X86II::MO_GOT;
+}
+
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
+  return classifyGlobalFunctionReference(GV, *GV->getParent());
+}
+
+unsigned char
+X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
+                                              const Module &M) const {
+  if (TM.shouldAssumeDSOLocal(M, GV))
+    return X86II::MO_NO_FLAG;
+
+  assert(!isTargetCOFF());
+
+  if (isTargetELF())
+    return X86II::MO_PLT;
+
+  if (is64Bit()) {
+    auto *F = dyn_cast_or_null<Function>(GV);
+    if (F && F->hasFnAttribute(Attribute::NonLazyBind))
+      // If the function is marked as non-lazy, generate an indirect call
+      // which loads from the GOT directly. This avoids runtime overhead
+      // at the cost of eager binding (and one extra byte of encoding).
+      return X86II::MO_GOTPCREL;
+    return X86II::MO_NO_FLAG;
+  }
+
+  return X86II::MO_NO_FLAG;
+}
+
+/// This function returns the name of a function which has an interface like
+/// the non-standard bzero function, if such a function exists on the
+/// current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+  // Darwin 10 has a __bzero entry point for this purpose.
+  if (getTargetTriple().isMacOSX() &&
+      !getTargetTriple().isMacOSXVersionLT(10, 6))
+    return "__bzero";
+
+  return nullptr;
+}
+
+bool X86Subtarget::hasSinCos() const {
+  return getTargetTriple().isMacOSX() &&
+    !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+    is64Bit();
+}
+
+/// Return true if the subtarget allows calls to immediate address.
+bool X86Subtarget::isLegalToCallImmediateAddr() const {
+  // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32
+  // but WinCOFFObjectWriter::RecordRelocation cannot emit them.  Once it does,
+  // the following check for Win32 should be removed.
+  if (In64BitMode || isTargetWin32())
+    return false;
+  return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  // Make sure 64-bit features are available in 64-bit mode. (But make sure
+  // SSE2 can be turned off explicitly.)
+  std::string FullFS = FS;
+  if (In64BitMode) {
+    if (!FullFS.empty())
+      FullFS = "+64bit,+sse2," + FullFS;
+    else
+      FullFS = "+64bit,+sse2";
+  }
+
+  // LAHF/SAHF are always supported in non-64-bit mode.
+  if (!In64BitMode) {
+    if (!FullFS.empty())
+      FullFS = "+sahf," + FullFS;
+    else
+      FullFS = "+sahf";
+  }
+
+
+  // Parse features string and set the CPU.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
+  // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
+  // 16-bytes and under that are reasonably fast. These features were
+  // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
+  // micro-architectures respectively.
+  if (hasSSE42() || hasSSE4A())
+    IsUAMem16Slow = false;
+  
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // It's important to keep the MCSubtargetInfo feature bits in sync with
+  // target data structure which is shared with MC code emitter, etc.
+  if (In64BitMode)
+    ToggleFeature(X86::Mode64Bit);
+  else if (In32BitMode)
+    ToggleFeature(X86::Mode32Bit);
+  else if (In16BitMode)
+    ToggleFeature(X86::Mode16Bit);
+  else
+    llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!");
+
+  DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+               << ", 3DNowLevel " << X863DNowLevel
+               << ", 64bit " << HasX86_64 << "\n");
+  assert((!In64BitMode || HasX86_64) &&
+         "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
+  // 32 and 64 bit) and for all 64-bit targets.
+  if (StackAlignOverride)
+    stackAlignment = StackAlignOverride;
+  else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+           isTargetKFreeBSD() || In64BitMode)
+    stackAlignment = 16;
+
+  assert((!isPMULLDSlow() || hasSSE41()) &&
+         "Feature Slow PMULLD can only be set on a subtarget with SSE4.1");
+}
+
+void X86Subtarget::initializeEnvironment() {
+  X86SSELevel = NoSSE;
+  X863DNowLevel = NoThreeDNow;
+  HasX87 = false;
+  HasCMov = false;
+  HasX86_64 = false;
+  HasPOPCNT = false;
+  HasSSE4A = false;
+  HasAES = false;
+  HasFXSR = false;
+  HasXSAVE = false;
+  HasXSAVEOPT = false;
+  HasXSAVEC = false;
+  HasXSAVES = false;
+  HasPCLMUL = false;
+  HasFMA = false;
+  HasFMA4 = false;
+  HasXOP = false;
+  HasTBM = false;
+  HasMOVBE = false;
+  HasRDRAND = false;
+  HasF16C = false;
+  HasFSGSBase = false;
+  HasLZCNT = false;
+  HasBMI = false;
+  HasBMI2 = false;
+  HasVBMI = false;
+  HasIFMA = false;
+  HasRTM = false;
+  HasHLE = false;
+  HasERI = false;
+  HasCDI = false;
+  HasPFI = false;
+  HasDQI = false;
+  HasBWI = false;
+  HasVLX = false;
+  HasADX = false;
+  HasPKU = false;
+  HasSHA = false;
+  HasPRFCHW = false;
+  HasRDSEED = false;
+  HasLAHFSAHF = false;
+  HasMWAITX = false;
+  HasMPX = false;
+  IsBTMemSlow = false;
+  IsPMULLDSlow = false;
+  IsSHLDSlow = false;
+  IsUAMem16Slow = false;
+  IsUAMem32Slow = false;
+  HasSSEUnalignedMem = false;
+  HasCmpxchg16b = false;
+  UseLeaForSP = false;
+  HasFastPartialYMMWrite = false;
+  HasFastScalarFSQRT = false;
+  HasFastVectorFSQRT = false;
+  HasFastLZCNT = false;
+  HasSlowDivide32 = false;
+  HasSlowDivide64 = false;
+  PadShortFunctions = false;
+  CallRegIndirect = false;
+  LEAUsesAG = false;
+  SlowLEA = false;
+  SlowIncDec = false;
+  stackAlignment = 4;
+  // FIXME: this is a known good value for Yonah. How about others?
+  MaxInlineSizeThreshold = 128;
+  UseSoftFloat = false;
+}
+
+X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
+                                                            StringRef FS) {
+  initializeEnvironment();
+  initSubtargetFeatures(CPU, FS);
+  return *this;
+}
+
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+                           const X86TargetMachine &TM,
+                           unsigned StackAlignOverride)
+    : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+      PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
+      StackAlignOverride(StackAlignOverride),
+      In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+      In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() != Triple::CODE16),
+      In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() == Triple::CODE16),
+      TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+  // Determine the PICStyle based on the target selected.
+  if (!isPositionIndependent())
+    setPICStyle(PICStyles::None);
+  else if (is64Bit())
+    setPICStyle(PICStyles::RIPRel);
+  else if (isTargetCOFF())
+    setPICStyle(PICStyles::None);
+  else if (isTargetDarwin())
+    setPICStyle(PICStyles::StubPIC);
+  else if (isTargetELF())
+    setPICStyle(PICStyles::GOT);
+}
+
+const CallLowering *X86Subtarget::getCallLowering() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getCallLowering();
+}
+
+const InstructionSelector *X86Subtarget::getInstructionSelector() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getInstructionSelector();
+}
+
+const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getLegalizerInfo();
+}
+
+const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
+  assert(GISel && "Access to GlobalISel APIs not set");
+  return GISel->getRegBankInfo();
+}
+
+bool X86Subtarget::enableEarlyIfConversion() const {
+  return hasCMov() && X86EarlyIfConv;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 000000000000..92c16214aa4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,633 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+  StubPIC,          // Used on i386-darwin in pic mode.
+  GOT,              // Used on 32 bit elf on when in pic mode.
+  RIPRel,           // Used on X86-64 when in pic mode.
+  None              // Set when not in pic mode.
+};
+}
+
+class X86Subtarget final : public X86GenSubtargetInfo {
+
+protected:
+  enum X86SSEEnum {
+    NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
+  };
+
+  enum X86ProcFamilyEnum {
+    Others, IntelAtom, IntelSLM
+  };
+
+  /// X86 processor family: Intel Atom, and others
+  X86ProcFamilyEnum X86ProcFamily;
+
+  /// Which PIC style to use
+  PICStyles::Style PICStyle;
+
+  const TargetMachine &TM;
+
+  /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// MMX, 3DNow, 3DNow Athlon, or none supported.
+  X863DNowEnum X863DNowLevel;
+
+  /// True if the processor supports X87 instructions.
+  bool HasX87;
+
+  /// True if this processor has conditional move instructions
+  /// (generally pentium pro+).
+  bool HasCMov;
+
+  /// True if the processor supports X86-64 instructions.
+  bool HasX86_64;
+
+  /// True if the processor supports POPCNT.
+  bool HasPOPCNT;
+
+  /// True if the processor supports SSE4A instructions.
+  bool HasSSE4A;
+
+  /// Target has AES instructions
+  bool HasAES;
+
+  /// Target has FXSAVE/FXRESTOR instructions
+  bool HasFXSR;
+
+  /// Target has XSAVE instructions
+  bool HasXSAVE;
+  /// Target has XSAVEOPT instructions
+  bool HasXSAVEOPT;
+  /// Target has XSAVEC instructions
+  bool HasXSAVEC;
+  /// Target has XSAVES instructions
+  bool HasXSAVES;
+
+  /// Target has carry-less multiplication
+  bool HasPCLMUL;
+
+  /// Target has 3-operand fused multiply-add
+  bool HasFMA;
+
+  /// Target has 4-operand fused multiply-add
+  bool HasFMA4;
+
+  /// Target has XOP instructions
+  bool HasXOP;
+
+  /// Target has TBM instructions.
+  bool HasTBM;
+
+  /// True if the processor has the MOVBE instruction.
+  bool HasMOVBE;
+
+  /// True if the processor has the RDRAND instruction.
+  bool HasRDRAND;
+
+  /// Processor has 16-bit floating point conversion instructions.
+  bool HasF16C;
+
+  /// Processor has FS/GS base insturctions.
+  bool HasFSGSBase;
+
+  /// Processor has LZCNT instruction.
+  bool HasLZCNT;
+
+  /// Processor has BMI1 instructions.
+  bool HasBMI;
+
+  /// Processor has BMI2 instructions.
+  bool HasBMI2;
+
+  /// Processor has VBMI instructions.
+  bool HasVBMI;
+
+  /// Processor has Integer Fused Multiply Add
+  bool HasIFMA;
+
+  /// Processor has RTM instructions.
+  bool HasRTM;
+
+  /// Processor has HLE.
+  bool HasHLE;
+
+  /// Processor has ADX instructions.
+  bool HasADX;
+
+  /// Processor has SHA instructions.
+  bool HasSHA;
+
+  /// Processor has PRFCHW instructions.
+  bool HasPRFCHW;
+
+  /// Processor has RDSEED instructions.
+  bool HasRDSEED;
+
+  /// Processor has LAHF/SAHF instructions.
+  bool HasLAHFSAHF;
+
+  /// Processor has MONITORX/MWAITX instructions.
+  bool HasMWAITX;
+
+  /// Processor has Prefetch with intent to Write instruction
+  bool HasPFPREFETCHWT1;
+
+  /// True if BT (bit test) of memory instructions are slow.
+  bool IsBTMemSlow;
+
+  /// True if SHLD instructions are slow.
+  bool IsSHLDSlow;
+
+  /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
+  //  PMULUDQ.
+  bool IsPMULLDSlow;
+
+  /// True if unaligned memory accesses of 16-bytes are slow.
+  bool IsUAMem16Slow;
+
+  /// True if unaligned memory accesses of 32-bytes are slow.
+  bool IsUAMem32Slow;
+
+  /// True if SSE operations can have unaligned memory operands.
+  /// This may require setting a configuration bit in the processor.
+  bool HasSSEUnalignedMem;
+
+  /// True if this processor has the CMPXCHG16B instruction;
+  /// this is true for most x86-64 chips, but not the first AMD chips.
+  bool HasCmpxchg16b;
+
+  /// True if the LEA instruction should be used for adjusting
+  /// the stack pointer. This is an optimization for Intel Atom processors.
+  bool UseLeaForSP;
+
+  /// True if there is no performance penalty to writing only the lower parts
+  /// of a YMM register without clearing the upper part.
+  bool HasFastPartialYMMWrite;
+
+  /// True if hardware SQRTSS instruction is at least as fast (latency) as
+  /// RSQRTSS followed by a Newton-Raphson iteration.
+  bool HasFastScalarFSQRT;
+
+  /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+  /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+  bool HasFastVectorFSQRT;
+
+  /// True if 8-bit divisions are significantly faster than
+  /// 32-bit divisions and should be used when possible.
+  bool HasSlowDivide32;
+
+  /// True if 16-bit divides are significantly faster than
+  /// 64-bit divisions and should be used when possible.
+  bool HasSlowDivide64;
+
+  /// True if LZCNT instruction is fast.
+  bool HasFastLZCNT;
+
+  /// True if the short functions should be padded to prevent
+  /// a stall when returning too early.
+  bool PadShortFunctions;
+
+  /// True if the Calls with memory reference should be converted
+  /// to a register-based indirect call.
+  bool CallRegIndirect;
+
+  /// True if the LEA instruction inputs have to be ready at address generation
+  /// (AG) time.
+  bool LEAUsesAG;
+
+  /// True if the LEA instruction with certain arguments is slow
+  bool SlowLEA;
+
+  /// True if INC and DEC instructions are slow when writing to flags
+  bool SlowIncDec;
+
+  /// Processor has AVX-512 PreFetch Instructions
+  bool HasPFI;
+
+  /// Processor has AVX-512 Exponential and Reciprocal Instructions
+  bool HasERI;
+
+  /// Processor has AVX-512 Conflict Detection Instructions
+  bool HasCDI;
+
+  /// Processor has AVX-512 Doubleword and Quadword instructions
+  bool HasDQI;
+
+  /// Processor has AVX-512 Byte and Word instructions
+  bool HasBWI;
+
+  /// Processor has AVX-512 Vector Length eXtenstions
+  bool HasVLX;
+
+  /// Processor has PKU extenstions
+  bool HasPKU;
+
+  /// Processor supports MPX - Memory Protection Extensions
+  bool HasMPX;
+
+  /// Processor supports Invalidate Process-Context Identifier
+  bool HasInvPCId;
+
+  /// Processor has VM Functions
+  bool HasVMFUNC;
+
+  /// Processor has Supervisor Mode Access Protection
+  bool HasSMAP;
+
+  /// Processor has Software Guard Extensions
+  bool HasSGX;
+
+  /// Processor supports Flush Cache Line instruction
+  bool HasCLFLUSHOPT;
+
+  /// Processor has Persistent Commit feature
+  bool HasPCOMMIT;
+
+  /// Processor supports Cache Line Write Back instruction
+  bool HasCLWB;
+
+  /// Use software floating point for code generation.
+  bool UseSoftFloat;
+
+  /// The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  ///
+  unsigned MaxInlineSizeThreshold;
+
+  /// What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// Instruction itineraries for scheduling
+  InstrItineraryData InstrItins;
+
+  /// Gather the accessor points to GlobalISel-related APIs.
+  /// This is used to avoid ifndefs spreading around while GISel is
+  /// an optional library.
+  std::unique_ptr<GISelAccessor> GISel;
+private:
+
+  /// Override the stack alignment.
+  unsigned StackAlignOverride;
+
+  /// True if compiling for 64-bit, false for 16-bit or 32-bit.
+  bool In64BitMode;
+
+  /// True if compiling for 32-bit, false for 16-bit or 64-bit.
+  bool In32BitMode;
+
+  /// True if compiling for 16-bit, false for 32-bit or 64-bit.
+  bool In16BitMode;
+
+  X86SelectionDAGInfo TSInfo;
+  // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
+  // X86TargetLowering needs.
+  X86InstrInfo InstrInfo;
+  X86TargetLowering TLInfo;
+  X86FrameLowering FrameLowering;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
+               const X86TargetMachine &TM, unsigned StackAlignOverride);
+
+  /// This object will take onwership of \p GISelAccessor.
+  void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
+
+  const X86TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const X86FrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const X86RegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+
+  /// Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// Methods used by Global ISel
+  const CallLowering *getCallLowering() const override;
+  const InstructionSelector *getInstructionSelector() const override;
+  const LegalizerInfo *getLegalizerInfo() const override;
+  const RegisterBankInfo *getRegBankInfo() const override;
+private:
+  /// Initialize the full set of dependencies so we can use an initializer
+  /// list for X86Subtarget.
+  X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+  void initializeEnvironment();
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+  /// Is this x86_64? (disregarding specific ABI / programming model)
+  bool is64Bit() const {
+    return In64BitMode;
+  }
+
+  bool is32Bit() const {
+    return In32BitMode;
+  }
+
+  bool is16Bit() const {
+    return In16BitMode;
+  }
+
+  /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+  bool isTarget64BitILP32() const {
+    return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 ||
+                           TargetTriple.isOSNaCl());
+  }
+
+  /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+  bool isTarget64BitLP64() const {
+    return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
+                           !TargetTriple.isOSNaCl());
+  }
+
+  PICStyles::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyles::Style Style)  { PICStyle = Style; }
+
+  bool hasX87() const { return HasX87; }
+  bool hasCMov() const { return HasCMov; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool hasSSE41() const { return X86SSELevel >= SSE41; }
+  bool hasSSE42() const { return X86SSELevel >= SSE42; }
+  bool hasAVX() const { return X86SSELevel >= AVX; }
+  bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasAVX512() const { return X86SSELevel >= AVX512F; }
+  bool hasFp256() const { return hasAVX(); }
+  bool hasInt256() const { return hasAVX2(); }
+  bool hasSSE4A() const { return HasSSE4A; }
+  bool hasMMX() const { return X863DNowLevel >= MMX; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+  bool hasPOPCNT() const { return HasPOPCNT; }
+  bool hasAES() const { return HasAES; }
+  bool hasFXSR() const { return HasFXSR; }
+  bool hasXSAVE() const { return HasXSAVE; }
+  bool hasXSAVEOPT() const { return HasXSAVEOPT; }
+  bool hasXSAVEC() const { return HasXSAVEC; }
+  bool hasXSAVES() const { return HasXSAVES; }
+  bool hasPCLMUL() const { return HasPCLMUL; }
+  // Prefer FMA4 to FMA - its better for commutation/memory folding and
+  // has equal or better performance on all supported targets.
+  bool hasFMA() const { return HasFMA && !HasFMA4; }
+  bool hasFMA4() const { return HasFMA4; }
+  bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+  bool hasXOP() const { return HasXOP; }
+  bool hasTBM() const { return HasTBM; }
+  bool hasMOVBE() const { return HasMOVBE; }
+  bool hasRDRAND() const { return HasRDRAND; }
+  bool hasF16C() const { return HasF16C; }
+  bool hasFSGSBase() const { return HasFSGSBase; }
+  bool hasLZCNT() const { return HasLZCNT; }
+  bool hasBMI() const { return HasBMI; }
+  bool hasBMI2() const { return HasBMI2; }
+  bool hasVBMI() const { return HasVBMI; }
+  bool hasIFMA() const { return HasIFMA; }
+  bool hasRTM() const { return HasRTM; }
+  bool hasHLE() const { return HasHLE; }
+  bool hasADX() const { return HasADX; }
+  bool hasSHA() const { return HasSHA; }
+  bool hasPRFCHW() const { return HasPRFCHW; }
+  bool hasRDSEED() const { return HasRDSEED; }
+  bool hasLAHFSAHF() const { return HasLAHFSAHF; }
+  bool hasMWAITX() const { return HasMWAITX; }
+  bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isSHLDSlow() const { return IsSHLDSlow; }
+  bool isPMULLDSlow() const { return IsPMULLDSlow; }
+  bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
+  bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+  bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
+  bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+  bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+  bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+  bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+  bool hasFastLZCNT() const { return HasFastLZCNT; }
+  bool hasSlowDivide32() const { return HasSlowDivide32; }
+  bool hasSlowDivide64() const { return HasSlowDivide64; }
+  bool padShortFunctions() const { return PadShortFunctions; }
+  bool callRegIndirect() const { return CallRegIndirect; }
+  bool LEAusesAG() const { return LEAUsesAG; }
+  bool slowLEA() const { return SlowLEA; }
+  bool slowIncDec() const { return SlowIncDec; }
+  bool hasCDI() const { return HasCDI; }
+  bool hasPFI() const { return HasPFI; }
+  bool hasERI() const { return HasERI; }
+  bool hasDQI() const { return HasDQI; }
+  bool hasBWI() const { return HasBWI; }
+  bool hasVLX() const { return HasVLX; }
+  bool hasPKU() const { return HasPKU; }
+  bool hasMPX() const { return HasMPX; }
+
+  virtual bool isXRaySupported() const override { return is64Bit(); }
+
+  bool isAtom() const { return X86ProcFamily == IntelAtom; }
+  bool isSLM() const { return X86ProcFamily == IntelSLM; }
+  bool useSoftFloat() const { return UseSoftFloat; }
+
+  /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+  /// no-sse2). There isn't any reason to disable it if the target processor
+  /// supports it.
+  bool hasMFence() const { return hasSSE2() || is64Bit(); }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
+  bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
+  bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
+  bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
+  bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
+  bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+  bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+  bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+
+  bool isTargetWindowsMSVC() const {
+    return TargetTriple.isWindowsMSVCEnvironment();
+  }
+
+  bool isTargetKnownWindowsMSVC() const {
+    return TargetTriple.isKnownWindowsMSVCEnvironment();
+  }
+
+  bool isTargetWindowsCoreCLR() const {
+    return TargetTriple.isWindowsCoreCLREnvironment();
+  }
+
+  bool isTargetWindowsCygwin() const {
+    return TargetTriple.isWindowsCygwinEnvironment();
+  }
+
+  bool isTargetWindowsGNU() const {
+    return TargetTriple.isWindowsGNUEnvironment();
+  }
+
+  bool isTargetWindowsItanium() const {
+    return TargetTriple.isWindowsItaniumEnvironment();
+  }
+
+  bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+
+  bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
+  bool isTargetWin64() const {
+    return In64BitMode && TargetTriple.isOSWindows();
+  }
+
+  bool isTargetWin32() const {
+    return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
+  }
+
+  bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+  bool isPICStyleStubPIC() const {
+    return PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isPositionIndependent() const { return TM.isPositionIndependent(); }
+
+  bool isCallingConvWin64(CallingConv::ID CC) const {
+    switch (CC) {
+    // On Win64, all these conventions just use the default convention.
+    case CallingConv::C:
+    case CallingConv::Fast:
+    case CallingConv::X86_FastCall:
+    case CallingConv::X86_StdCall:
+    case CallingConv::X86_ThisCall:
+    case CallingConv::X86_VectorCall:
+    case CallingConv::Intel_OCL_BI:
+      return isTargetWin64();
+    // This convention allows using the Win64 convention on other targets.
+    case CallingConv::X86_64_Win64:
+      return true;
+    // This convention allows using the SysV convention on Windows targets.
+    case CallingConv::X86_64_SysV:
+      return false;
+    // Otherwise, who knows what this is.
+    default:
+      return false;
+    }
+  }
+
+  /// Classify a global variable reference for the current subtarget according
+  /// to how we should reference it in a non-pcrel context.
+  unsigned char classifyLocalReference(const GlobalValue *GV) const;
+
+  unsigned char classifyGlobalReference(const GlobalValue *GV,
+                                        const Module &M) const;
+  unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+  /// Classify a global function reference for the current subtarget.
+  unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+                                                const Module &M) const;
+  unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;
+
+  /// Classify a blockaddress reference for the current subtarget according to
+  /// how we should reference it in a non-pcrel context.
+  unsigned char classifyBlockAddressReference() const;
+
+  /// Return true if the subtarget allows calls to immediate address.
+  bool isLegalToCallImmediateAddr() const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// Enable the MachineScheduler pass for all X86 subtargets.
+  bool enableMachineScheduler() const override { return true; }
+
+  bool enableEarlyIfConversion() const override;
+
+  /// Return the instruction itineraries based on the subtarget selection.
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  AntiDepBreakMode getAntiDepBreakMode() const override {
+    return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 000000000000..aa5cfc64e9eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,405 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "X86CallLowering.h"
+#include "X86TargetObjectFile.h"
+#include "X86TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+                               cl::desc("Enable the machine combiner pass"),
+                               cl::init(true), cl::Hidden);
+
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
+extern "C" void LLVMInitializeX86Target() {
+  // Register the target.
+  RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
+  RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
+
+  PassRegistry &PR = *PassRegistry::getPassRegistry();
+  initializeGlobalISel(PR);
+  initializeWinEHStatePassPass(PR);
+  initializeFixupBWInstPassPass(PR);
+  initializeEvexToVexInstPassPass(PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getArch() == Triple::x86_64)
+      return make_unique<X86_64MachoTargetObjectFile>();
+    return make_unique<TargetLoweringObjectFileMachO>();
+  }
+
+  if (TT.isOSFreeBSD())
+    return make_unique<X86FreeBSDTargetObjectFile>();
+  if (TT.isOSLinux() || TT.isOSNaCl())
+    return make_unique<X86LinuxNaClTargetObjectFile>();
+  if (TT.isOSFuchsia())
+    return make_unique<X86FuchsiaTargetObjectFile>();
+  if (TT.isOSBinFormatELF())
+    return make_unique<X86ELFTargetObjectFile>();
+  if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
+    return make_unique<X86WindowsTargetObjectFile>();
+  if (TT.isOSBinFormatCOFF())
+    return make_unique<TargetLoweringObjectFileCOFF>();
+  llvm_unreachable("unknown subtarget type");
+}
+
+static std::string computeDataLayout(const Triple &TT) {
+  // X86 is little endian
+  std::string Ret = "e";
+
+  Ret += DataLayout::getManglingComponent(TT);
+  // X86 and x32 have 32 bit pointers.
+  if ((TT.isArch64Bit() &&
+       (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
+      !TT.isArch64Bit())
+    Ret += "-p:32:32";
+
+  // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
+    Ret += "-i64:64";
+  else if (TT.isOSIAMCU())
+    Ret += "-i64:32-f64:32";
+  else
+    Ret += "-f64:32:64";
+
+  // Some ABIs align long double to 128 bits, others to 32.
+  if (TT.isOSNaCl() || TT.isOSIAMCU())
+    ; // No f80
+  else if (TT.isArch64Bit() || TT.isOSDarwin())
+    Ret += "-f80:128";
+  else
+    Ret += "-f80:32";
+
+  if (TT.isOSIAMCU())
+    Ret += "-f128:32";
+
+  // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
+  if (TT.isArch64Bit())
+    Ret += "-n8:16:32:64";
+  else
+    Ret += "-n8:16:32";
+
+  // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
+  if ((!TT.isArch64Bit() && TT.isOSWindows()) || TT.isOSIAMCU())
+    Ret += "-a:0:32-S32";
+  else
+    Ret += "-S128";
+
+  return Ret;
+}
+
+static Reloc::Model getEffectiveRelocModel(const Triple &TT,
+                                           Optional<Reloc::Model> RM) {
+  bool is64Bit = TT.getArch() == Triple::x86_64;
+  if (!RM.hasValue()) {
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+    // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
+    // use static relocation model by default.
+    if (TT.isOSDarwin()) {
+      if (is64Bit)
+        return Reloc::PIC_;
+      return Reloc::DynamicNoPIC;
+    }
+    if (TT.isOSWindows() && is64Bit)
+      return Reloc::PIC_;
+    return Reloc::Static;
+  }
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode, in x86-64 we use PIC.
+  if (*RM == Reloc::DynamicNoPIC) {
+    if (is64Bit)
+      return Reloc::PIC_;
+    if (!TT.isOSDarwin())
+      return Reloc::Static;
+  }
+
+  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+  // the Mach-O file format doesn't support it.
+  if (*RM == Reloc::Static && TT.isOSDarwin() && is64Bit)
+    return Reloc::PIC_;
+
+  return *RM;
+}
+
+/// Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Optional<Reloc::Model> RM,
+                                   CodeModel::Model CM, CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
+                        getEffectiveRelocModel(TT, RM), CM, OL),
+      TLOF(createTLOF(getTargetTriple())) {
+  // Windows stack unwinder gets confused when execution flow "falls through"
+  // after a call to 'noreturn' function.
+  // To prevent that, we emit a trap for 'unreachable' IR instructions.
+  // (which on X86, happens to be the 'ud2' instruction)
+  // On PS4, the "return address" of a 'noreturn' call must still be within
+  // the calling function, and TrapUnreachable is an easy way to get that.
+  // The check here for 64-bit windows is a bit icky, but as we're unlikely
+  // to ever want to mix 32 and 64-bit windows code in a single module
+  // this should be fine.
+  if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4())
+    this->Options.TrapUnreachable = true;
+
+  initAsmInfo();
+}
+
+X86TargetMachine::~X86TargetMachine() {}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct X86GISelActualAccessor : public GISelAccessor {
+  std::unique_ptr<CallLowering> CL;
+  X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
+  const CallLowering *getCallLowering() const override {
+    return CL.get();
+  }
+  const InstructionSelector *getInstructionSelector() const override {
+    //TODO: Implement
+    return nullptr;
+  }
+  const LegalizerInfo *getLegalizerInfo() const override {
+    //TODO: Implement
+    return nullptr;
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    //TODO: Implement
+    return nullptr;
+  }
+};
+} // End anonymous namespace.
+#endif
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+  Attribute CPUAttr = F.getFnAttribute("target-cpu");
+  Attribute FSAttr = F.getFnAttribute("target-features");
+
+  StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
+                      ? CPUAttr.getValueAsString()
+                      : (StringRef)TargetCPU;
+  StringRef FS = !FSAttr.hasAttribute(Attribute::None)
+                     ? FSAttr.getValueAsString()
+                     : (StringRef)TargetFS;
+
+  SmallString<512> Key;
+  Key.reserve(CPU.size() + FS.size());
+  Key += CPU;
+  Key += FS;
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  bool SoftFloat =
+      F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+  // If the soft float attribute is set on the function turn on the soft float
+  // subtarget feature.
+  if (SoftFloat)
+    Key += FS.empty() ? "+soft-float" : ",+soft-float";
+
+  FS = Key.substr(CPU.size());
+
+  auto &I = SubtargetMap[Key];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
+                                        Options.StackAlignmentOverride);
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+    GISelAccessor *GISel = new GISelAccessor();
+#else
+    X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
+        new X86CallLowering(*I->getTargetLowering()));
+#endif
+    I->setGISelAccessor(*GISel);
+  }
+  return I.get();
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// X86 TTI query.
+//===----------------------------------------------------------------------===//
+
+TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(X86TTIImpl(this, F));
+  });
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  X86TargetMachine &getX86TargetMachine() const {
+    return getTM<X86TargetMachine>();
+  }
+
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    return DAG;
+  }
+
+  void addIRPasses() override;
+  bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+  bool addIRTranslator() override;
+  bool addLegalizeMachineIR() override;
+  bool addRegBankSelect() override;
+  bool addGlobalInstructionSelect() override;
+#endif
+bool addILPOpts() override;
+  bool addPreISel() override;
+  void addPreRegAlloc() override;
+  void addPostRegAlloc() override;
+  void addPreEmitPass() override;
+  void addPreSched2() override;
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new X86PassConfig(this, PM);
+}
+
+void X86PassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getX86TargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass(TM));
+}
+
+bool X86PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses.
+  if (TM->getTargetTriple().isOSBinFormatELF() &&
+      getOptLevel() != CodeGenOpt::None)
+    addPass(createCleanupLocalDynamicTLSPass());
+
+  addPass(createX86GlobalBaseRegPass());
+  return false;
+}
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool X86PassConfig::addIRTranslator() {
+  addPass(new IRTranslator());
+  return false;
+}
+
+bool X86PassConfig::addLegalizeMachineIR() {
+  //TODO: Implement
+  return false;
+}
+
+bool X86PassConfig::addRegBankSelect() {
+  //TODO: Implement
+  return false;
+}
+
+bool X86PassConfig::addGlobalInstructionSelect() {
+  //TODO: Implement
+  return false;
+}
+#endif
+
+bool X86PassConfig::addILPOpts() {
+  addPass(&EarlyIfConverterID);
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+  return true;
+}
+
+bool X86PassConfig::addPreISel() {
+  // Only add this pass for 32-bit x86 Windows.
+  const Triple &TT = TM->getTargetTriple();
+  if (TT.isOSWindows() && TT.getArch() == Triple::x86)
+    addPass(createX86WinEHStatePass());
+  return true;
+}
+
+void X86PassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(createX86FixupSetCC());
+    addPass(createX86OptimizeLEAs());
+    addPass(createX86CallFrameOptimization());
+  }
+
+  addPass(createX86WinAllocaExpander());
+}
+
+void X86PassConfig::addPostRegAlloc() {
+  addPass(createX86FloatingPointStackifierPass());
+}
+
+void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
+
+void X86PassConfig::addPreEmitPass() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
+
+  if (UseVZeroUpper)
+    addPass(createX86IssueVZeroUpperPass());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(createX86FixupBWInsts());
+    addPass(createX86PadShortFunctions());
+    addPass(createX86FixupLEAs());
+    addPass(createX86EvexToVexInsts());
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 000000000000..d756d07926dd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,48 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+class X86TargetMachine final : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+
+public:
+  X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                   StringRef FS, const TargetOptions &Options,
+                   Optional<Reloc::Model> RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+  ~X86TargetMachine() override;
+  const X86Subtarget *getSubtargetImpl(const Function &F) const override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+
+  // Set up the pass pipeline.
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 000000000000..7f70829cb6c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,184 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+
+  // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+  // is an indirect pc-relative reference.
+  if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = TM.getSymbol(GV);
+    const MCExpr *Res =
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+    const MCExpr *Four = MCConstantExpr::create(4, getContext());
+    return MCBinaryExpr::createAdd(Res, Four, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+      GV, Encoding, TM, MMI, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV);
+}
+
+const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
+    const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+    MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+  // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
+  // from a data section. In case there's an additional offset, then use
+  // foo@GOTPCREL+4+<offset>.
+  unsigned FinalOff = Offset+MV.getConstant()+4;
+  const MCExpr *Res =
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+  const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext());
+  return MCBinaryExpr::createAdd(Res, Off, getContext());
+}
+
+const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
+    const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
+
+void
+X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx,
+                                       const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+void
+X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx,
+                                       const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+void
+X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
+                                         const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
+    const GlobalValue *LHS, const GlobalValue *RHS,
+    const TargetMachine &TM) const {
+  // Our symbols should exist in address space zero, cowardly no-op if
+  // otherwise.
+  if (LHS->getType()->getPointerAddressSpace() != 0 ||
+      RHS->getType()->getPointerAddressSpace() != 0)
+    return nullptr;
+
+  // Both ptrtoint instructions must wrap global objects:
+  // - Only global variables are eligible for image relative relocations.
+  // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
+  // We expect __ImageBase to be a global variable without a section, externally
+  // defined.
+  //
+  // It should look something like this: @__ImageBase = external constant i8
+  if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
+      LHS->isThreadLocal() || RHS->isThreadLocal() ||
+      RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
+      cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
+    return nullptr;
+
+  return MCSymbolRefExpr::create(TM.getSymbol(LHS),
+                                 MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                 getContext());
+}
+
+static std::string APIntToHexString(const APInt &AI) {
+  unsigned Width = (AI.getBitWidth() / 8) * 2;
+  std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+  unsigned Size = HexString.size();
+  assert(Width >= Size && "hex string is too large!");
+  HexString.insert(HexString.begin(), Width - Size, '0');
+
+  return HexString;
+}
+
+static std::string scalarConstantToHexString(const Constant *C) {
+  Type *Ty = C->getType();
+  if (isa<UndefValue>(C)) {
+    return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+  } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
+    return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
+  } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
+    return APIntToHexString(CI->getValue());
+  } else {
+    unsigned NumElements;
+    if (isa<VectorType>(Ty))
+      NumElements = Ty->getVectorNumElements();
+    else
+      NumElements = Ty->getArrayNumElements();
+    std::string HexString;
+    for (int I = NumElements - 1, E = -1; I != E; --I)
+      HexString += scalarConstantToHexString(C->getAggregateElement(I));
+    return HexString;
+  }
+}
+
+MCSection *X86WindowsTargetObjectFile::getSectionForConstant(
+    const DataLayout &DL, SectionKind Kind, const Constant *C,
+    unsigned &Align) const {
+  if (Kind.isMergeableConst() && C) {
+    const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                     COFF::IMAGE_SCN_MEM_READ |
+                                     COFF::IMAGE_SCN_LNK_COMDAT;
+    std::string COMDATSymName;
+    if (Kind.isMergeableConst4()) {
+      if (Align <= 4) {
+        COMDATSymName = "__real@" + scalarConstantToHexString(C);
+        Align = 4;
+      }
+    } else if (Kind.isMergeableConst8()) {
+      if (Align <= 8) {
+        COMDATSymName = "__real@" + scalarConstantToHexString(C);
+        Align = 8;
+      }
+    } else if (Kind.isMergeableConst16()) {
+      if (Align <= 16) {
+        COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+        Align = 16;
+      }
+    } else if (Kind.isMergeableConst32()) {
+      if (Align <= 32) {
+        COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
+        Align = 32;
+      }
+    }
+
+    if (!COMDATSymName.empty())
+      return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+                                         COMDATSymName,
+                                         COFF::IMAGE_COMDAT_SELECT_ANY);
+  }
+
+  return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 000000000000..39d2e84e5ed7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,84 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+  /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+  /// x86-64.
+  class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+    const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                          unsigned Encoding,
+                                          const TargetMachine &TM,
+                                          MachineModuleInfo *MMI,
+                                          MCStreamer &Streamer) const override;
+
+    // getCFIPersonalitySymbol - The symbol that gets passed to
+    // .cfi_personality.
+    MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV,
+                                      const TargetMachine &TM,
+                                      MachineModuleInfo *MMI) const override;
+
+    const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+                                            const MCValue &MV, int64_t Offset,
+                                            MachineModuleInfo *MMI,
+                                            MCStreamer &Streamer) const override;
+  };
+
+  /// \brief This implemenatation is used for X86 ELF targets that don't
+  /// have a further specialization.
+  class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    X86ELFTargetObjectFile() {
+      PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
+    }
+
+    /// \brief Describe a TLS variable address within debug info.
+    const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+  /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD
+  /// on x86 and x86-64.
+  class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+  };
+
+  /// \brief This implementation is used for Fuchsia on x86-64.
+  class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+  };
+
+  /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
+  /// Native Client on x86 and x86-64.
+  class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+  };
+
+  /// \brief This implementation is used for Windows targets on x86 and x86-64.
+  class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
+    const MCExpr *
+    lowerRelativeReference(const GlobalValue *LHS, const GlobalValue *RHS,
+                           const TargetMachine &TM) const override;
+
+    /// \brief Given a mergeable constant with the specified size and relocation
+    /// information, return a section that it should be placed in.
+    MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                     const Constant *C,
+                                     unsigned &Align) const override;
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 000000000000..2b0e672d56f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,2250 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+/// About Cost Model numbers used below it's necessary to say the following:
+/// the numbers correspond to some "generic" X86 CPU instead of usage of
+/// concrete CPU model. Usually the numbers correspond to CPU where the feature
+/// apeared at the first time. For example, if we do Subtarget.hasSSE42() in
+/// the lookups below the cost is based on Nehalem as that was the first CPU
+/// to support that feature level and thus has most likely the worst case cost.
+/// Some examples of other technologies/CPUs:
+///   SSE 3   - Pentium4 / Athlon64
+///   SSE 4.1 - Penryn
+///   SSE 4.2 - Nehalem
+///   AVX     - Sandy Bridge
+///   AVX2    - Haswell
+///   AVX-512 - Xeon Phi / Skylake
+/// And some examples of instruction target dependent costs (latency)
+///                   divss     sqrtss          rsqrtss
+///   AMD K7            11-16     19              3
+///   Piledriver        9-24      13-15           5
+///   Jaguar            14        16              2
+///   Pentium II,III    18        30              2
+///   Nehalem           7-14      7-18            3
+///   Haswell           10-13     11              5
+/// TODO: Develop and implement  the target dependent cost model and
+/// specialize cost numbers for different Cost Model Targets such as throughput,
+/// code size, latency and uop count.
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+TargetTransformInfo::PopcntSupportKind
+X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  // TODO: Currently the __builtin_popcount() implementation using SSE3
+  //   instructions is inefficient. Once the problem is fixed, we should
+  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
+  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
+}
+
+unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+  if (Vector && !ST->hasSSE1())
+    return 0;
+
+  if (ST->is64Bit()) {
+    if (Vector && ST->hasAVX512())
+      return 32;
+    return 16;
+  }
+  return 8;
+}
+
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+  if (Vector) {
+    if (ST->hasAVX512()) return 512;
+    if (ST->hasAVX()) return 256;
+    if (ST->hasSSE1()) return 128;
+    return 0;
+  }
+
+  if (ST->is64Bit())
+    return 64;
+
+  return 32;
+}
+
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // If the loop will not be vectorized, don't interleave the loop.
+  // Let regular unroll to unroll the loop, which saves the overflow
+  // check and memory check cost.
+  if (VF == 1)
+    return 1;
+
+  if (ST->isAtom())
+    return 1;
+
+  // Sandybridge and Haswell have multiple execution ports and pipelined
+  // vector units.
+  if (ST->hasAVX())
+    return 4;
+
+  return 2;
+}
+
+int X86TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  if (ISD == ISD::SDIV &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On X86, vector signed division by constants power-of-two are
+    // normally expanded to the sequence SRA + SRL + ADD + SRA.
+    // The OperandValue properties many not be same as that of previous
+    // operation;conservatively assume OP_None.
+    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
+                                          Op2Info, TargetTransformInfo::OP_None,
+                                          TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+
+    return Cost;
+  }
+
+  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasBWI()) {
+    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX512()) {
+    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX2UniformConstCostTable[] = {
+    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
+
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX2()) {
+    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry SSE2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
+    { ISD::SDIV, MVT::v8i16,   6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
+    { ISD::UDIV, MVT::v8i16,   6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  38 }, // pmuludq sequence
+    { ISD::SDIV, MVT::v4i32,  19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v8i32,  30 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32,  15 }, // pmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+      return LT.first * 30;
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
+    if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512DQCostTable[] = {
+    { ISD::MUL,  MVT::v2i64, 1 },
+    { ISD::MUL,  MVT::v4i64, 1 },
+    { ISD::MUL,  MVT::v8i64, 1 }
+  };
+
+  // Look for AVX512DQ lowering tricks for custom cases.
+  if (ST->hasDQI()) {
+    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512BWCostTable[] = {
+    { ISD::MUL,   MVT::v64i8,     11 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,   MVT::v32i8,      4 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,   MVT::v16i8,      4 }, // extend/pmullw/trunc sequence.
+
+    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+    { ISD::SDIV,  MVT::v64i8,  64*20 },
+    { ISD::SDIV,  MVT::v32i16, 32*20 },
+    { ISD::SDIV,  MVT::v16i32, 16*20 },
+    { ISD::SDIV,  MVT::v8i64,   8*20 },
+    { ISD::UDIV,  MVT::v64i8,  64*20 },
+    { ISD::UDIV,  MVT::v32i16, 32*20 },
+    { ISD::UDIV,  MVT::v16i32, 16*20 },
+    { ISD::UDIV,  MVT::v8i64,   8*20 },
+  };
+
+  // Look for AVX512BW lowering tricks for custom cases.
+  if (ST->hasBWI()) {
+    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512CostTable[] = {
+    { ISD::SHL,     MVT::v16i32,    1 },
+    { ISD::SRL,     MVT::v16i32,    1 },
+    { ISD::SRA,     MVT::v16i32,    1 },
+    { ISD::SHL,     MVT::v8i64,     1 },
+    { ISD::SRL,     MVT::v8i64,     1 },
+    { ISD::SRA,     MVT::v8i64,     1 },
+
+    { ISD::MUL,     MVT::v32i8,    13 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,     MVT::v16i8,     5 }, // extend/pmullw/trunc sequence.
+  };
+
+  if (ST->hasAVX512()) {
+    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX2CostTable[] = {
+    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+    // customize them to detect the cases where shift amount is a scalar one.
+    { ISD::SHL,     MVT::v4i32,    1 },
+    { ISD::SRL,     MVT::v4i32,    1 },
+    { ISD::SRA,     MVT::v4i32,    1 },
+    { ISD::SHL,     MVT::v8i32,    1 },
+    { ISD::SRL,     MVT::v8i32,    1 },
+    { ISD::SRA,     MVT::v8i32,    1 },
+    { ISD::SHL,     MVT::v2i64,    1 },
+    { ISD::SRL,     MVT::v2i64,    1 },
+    { ISD::SHL,     MVT::v4i64,    1 },
+    { ISD::SRL,     MVT::v4i64,    1 },
+  };
+
+  // Look for AVX2 lowering tricks.
+  if (ST->hasAVX2()) {
+    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+      // On AVX2, a packed v16i16 shift left by a constant build_vector
+      // is lowered into a vector multiply (vpmullw).
+      return LT.first;
+
+    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry XOPCostTable[] = {
+    // 128bit shifts take 1cy, but right shifts require negation beforehand.
+    { ISD::SHL,     MVT::v16i8,    1 },
+    { ISD::SRL,     MVT::v16i8,    2 },
+    { ISD::SRA,     MVT::v16i8,    2 },
+    { ISD::SHL,     MVT::v8i16,    1 },
+    { ISD::SRL,     MVT::v8i16,    2 },
+    { ISD::SRA,     MVT::v8i16,    2 },
+    { ISD::SHL,     MVT::v4i32,    1 },
+    { ISD::SRL,     MVT::v4i32,    2 },
+    { ISD::SRA,     MVT::v4i32,    2 },
+    { ISD::SHL,     MVT::v2i64,    1 },
+    { ISD::SRL,     MVT::v2i64,    2 },
+    { ISD::SRA,     MVT::v2i64,    2 },
+    // 256bit shifts require splitting if AVX2 didn't catch them above.
+    { ISD::SHL,     MVT::v32i8,    2 },
+    { ISD::SRL,     MVT::v32i8,    4 },
+    { ISD::SRA,     MVT::v32i8,    4 },
+    { ISD::SHL,     MVT::v16i16,   2 },
+    { ISD::SRL,     MVT::v16i16,   4 },
+    { ISD::SRA,     MVT::v16i16,   4 },
+    { ISD::SHL,     MVT::v8i32,    2 },
+    { ISD::SRL,     MVT::v8i32,    4 },
+    { ISD::SRA,     MVT::v8i32,    4 },
+    { ISD::SHL,     MVT::v4i64,    2 },
+    { ISD::SRL,     MVT::v4i64,    4 },
+    { ISD::SRA,     MVT::v4i64,    4 },
+  };
+
+  // Look for XOP lowering tricks.
+  if (ST->hasXOP()) {
+    if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX2CustomCostTable[] = {
+    { ISD::SHL,  MVT::v32i8,      11 }, // vpblendvb sequence.
+    { ISD::SHL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRL,  MVT::v32i8,      11 }, // vpblendvb sequence.
+    { ISD::SRL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.
+
+    { ISD::SRA,  MVT::v32i8,      24 }, // vpblendvb sequence.
+    { ISD::SRA,  MVT::v16i16,     10 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA,  MVT::v2i64,       4 }, // srl/xor/sub sequence.
+    { ISD::SRA,  MVT::v4i64,       4 }, // srl/xor/sub sequence.
+
+    { ISD::MUL,   MVT::v32i8,     17 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,   MVT::v16i8,      7 }, // extend/pmullw/trunc sequence.
+
+    { ISD::FDIV,  MVT::f32,        7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV,  MVT::v4f32,      7 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV,  MVT::v8f32,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV,  MVT::f64,       14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV,  MVT::v2f64,     14 }, // Haswell from http://www.agner.org/
+    { ISD::FDIV,  MVT::v4f64,     28 }, // Haswell from http://www.agner.org/
+  };
+
+  // Look for AVX2 lowering tricks for custom cases.
+  if (ST->hasAVX2()) {
+    if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVXCustomCostTable[] = {
+    { ISD::MUL,   MVT::v32i8,  26 }, // extend/pmullw/trunc sequence.
+
+    { ISD::FDIV,  MVT::f32,    14 }, // SNB from http://www.agner.org/
+    { ISD::FDIV,  MVT::v4f32,  14 }, // SNB from http://www.agner.org/
+    { ISD::FDIV,  MVT::v8f32,  28 }, // SNB from http://www.agner.org/
+    { ISD::FDIV,  MVT::f64,    22 }, // SNB from http://www.agner.org/
+    { ISD::FDIV,  MVT::v2f64,  22 }, // SNB from http://www.agner.org/
+    { ISD::FDIV,  MVT::v4f64,  44 }, // SNB from http://www.agner.org/
+
+    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+    { ISD::SDIV,  MVT::v32i8,  32*20 },
+    { ISD::SDIV,  MVT::v16i16, 16*20 },
+    { ISD::SDIV,  MVT::v8i32,  8*20 },
+    { ISD::SDIV,  MVT::v4i64,  4*20 },
+    { ISD::UDIV,  MVT::v32i8,  32*20 },
+    { ISD::UDIV,  MVT::v16i16, 16*20 },
+    { ISD::UDIV,  MVT::v8i32,  8*20 },
+    { ISD::UDIV,  MVT::v4i64,  4*20 },
+  };
+
+  // Look for AVX2 lowering tricks for custom cases.
+  if (ST->hasAVX()) {
+    if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry SSE42FloatCostTable[] = {
+    { ISD::FDIV,  MVT::f32,   14 }, // Nehalem from http://www.agner.org/
+    { ISD::FDIV,  MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
+    { ISD::FDIV,  MVT::f64,   22 }, // Nehalem from http://www.agner.org/
+    { ISD::FDIV,  MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+  };
+
+  if (ST->hasSSE42()) {
+    if (const auto *Entry = CostTableLookup(SSE42FloatCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry
+  SSE2UniformCostTable[] = {
+    // Uniform splats are cheaper for the following instructions.
+    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
+    { ISD::SHL,  MVT::v32i8,  2 }, // psllw.
+    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
+    { ISD::SHL,  MVT::v16i16, 2 }, // psllw.
+    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
+    { ISD::SHL,  MVT::v8i32,  2 }, // pslld
+    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
+    { ISD::SHL,  MVT::v4i64,  2 }, // psllq.
+
+    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v32i8,  2 }, // psrlw.
+    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v16i16, 2 }, // psrlw.
+    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
+    { ISD::SRL,  MVT::v8i32,  2 }, // psrld.
+    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
+    { ISD::SRL,  MVT::v4i64,  2 }, // psrlq.
+
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v32i8,  8 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v16i16, 2 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+    { ISD::SRA,  MVT::v8i32,  2 }, // psrad.
+    { ISD::SRA,  MVT::v2i64,  4 }, // 2 x psrad + shuffle.
+    { ISD::SRA,  MVT::v4i64,  8 }, // 2 x psrad + shuffle.
+  };
+
+  if (ST->hasSSE2() &&
+      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+    if (const auto *Entry =
+            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  if (ISD == ISD::SHL &&
+      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+    MVT VT = LT.second;
+    // Vector shift left by non uniform constant can be lowered
+    // into vector multiply (pmullw/pmulld).
+    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+        (VT == MVT::v4i32 && ST->hasSSE41()))
+      return LT.first;
+
+    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+    // sequence of extract + two vector multiply + insert.
+    if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
+       (ST->hasAVX() && !ST->hasAVX2()))
+      ISD = ISD::MUL;
+
+    // A vector shift left by non uniform constant is converted
+    // into a vector multiply; the new multiply is eventually
+    // lowered into a sequence of shuffles and 2 x pmuludq.
+    if (VT == MVT::v4i32 && ST->hasSSE2())
+      ISD = ISD::MUL;
+  }
+
+  static const CostTblEntry SSE41CostTable[] = {
+    { ISD::SHL,  MVT::v16i8,    11 }, // pblendvb sequence.
+    { ISD::SHL,  MVT::v32i8,  2*11 }, // pblendvb sequence.
+    { ISD::SHL,  MVT::v8i16,    14 }, // pblendvb sequence.
+    { ISD::SHL,  MVT::v16i16, 2*14 }, // pblendvb sequence.
+
+    { ISD::SRL,  MVT::v16i8,    12 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v32i8,  2*12 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v8i16,    14 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v16i16, 2*14 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v4i32,    11 }, // Shift each lane + blend.
+    { ISD::SRL,  MVT::v8i32,  2*11 }, // Shift each lane + blend.
+
+    { ISD::SRA,  MVT::v16i8,    24 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v32i8,  2*24 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v8i16,    14 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v16i16, 2*14 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v4i32,    12 }, // Shift each lane + blend.
+    { ISD::SRA,  MVT::v8i32,  2*12 }, // Shift each lane + blend.
+  };
+
+  if (ST->hasSSE41()) {
+    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry SSE2CostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    { ISD::SHL,  MVT::v16i8,    26 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v32i8,  2*26 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v8i16,    32 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v8i32, 2*2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
+    { ISD::SHL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.
+
+    { ISD::SRL,  MVT::v16i8,    26 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v32i8,  2*26 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v8i16,    32 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v4i32,    16 }, // Shift each lane + blend.
+    { ISD::SRL,  MVT::v8i32,  2*16 }, // Shift each lane + blend.
+    { ISD::SRL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
+    { ISD::SRL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.
+
+    { ISD::SRA,  MVT::v16i8,    54 }, // unpacked cmpgtb sequence.
+    { ISD::SRA,  MVT::v32i8,  2*54 }, // unpacked cmpgtb sequence.
+    { ISD::SRA,  MVT::v8i16,    32 }, // cmpgtb sequence.
+    { ISD::SRA,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
+    { ISD::SRA,  MVT::v4i32,    16 }, // Shift each lane + blend.
+    { ISD::SRA,  MVT::v8i32,  2*16 }, // Shift each lane + blend.
+    { ISD::SRA,  MVT::v2i64,    12 }, // srl/xor/sub sequence.
+    { ISD::SRA,  MVT::v4i64,  2*12 }, // srl/xor/sub sequence.
+
+    { ISD::MUL,  MVT::v16i8,    12 }, // extend/pmullw/trunc sequence.
+
+    { ISD::FDIV, MVT::f32,      23 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32,    39 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::f64,      38 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::v2f64,    69 }, // Pentium IV from http://www.agner.org/
+
+    // It is not a good idea to vectorize division. We have to scalarize it and
+    // in the process we will often end up having to spilling regular
+    // registers. The overhead of division is going to dominate most kernels
+    // anyways so try hard to prevent vectorization of division - it is
+    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+    // to hide "20 cycles" for each lane.
+    { ISD::SDIV,  MVT::v16i8,  16*20 },
+    { ISD::SDIV,  MVT::v8i16,  8*20 },
+    { ISD::SDIV,  MVT::v4i32,  4*20 },
+    { ISD::SDIV,  MVT::v2i64,  2*20 },
+    { ISD::UDIV,  MVT::v16i8,  16*20 },
+    { ISD::UDIV,  MVT::v8i16,  8*20 },
+    { ISD::UDIV,  MVT::v4i32,  4*20 },
+    { ISD::UDIV,  MVT::v2i64,  2*20 },
+  };
+
+  if (ST->hasSSE2()) {
+    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,     MVT::v16i16,   4 },
+    { ISD::MUL,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v32i8,    4 },
+    { ISD::ADD,     MVT::v32i8,    4 },
+    { ISD::SUB,     MVT::v16i16,   4 },
+    { ISD::ADD,     MVT::v16i16,   4 },
+    { ISD::SUB,     MVT::v8i32,    4 },
+    { ISD::ADD,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v4i64,    4 },
+    { ISD::ADD,     MVT::v4i64,    4 },
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 16
+    // instead of 8.
+    { ISD::MUL,     MVT::v4i64,    16 },
+  };
+
+  // Look for AVX1 lowering tricks.
+  if (ST->hasAVX() && !ST->hasAVX2()) {
+    MVT VT = LT.second;
+
+    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT))
+      return LT.first * Entry->Cost;
+  }
+
+  // Custom lowering of vectors.
+  static const CostTblEntry CustomLowered[] = {
+    // A v2i64/v4i64 and multiply is custom lowered as a series of long
+    // multiplies(3), shifts(3) and adds(2).
+    { ISD::MUL,     MVT::v2i64,    8 },
+    { ISD::MUL,     MVT::v4i64,    8 },
+    { ISD::MUL,     MVT::v8i64,    8 }
+  };
+  if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second))
+    return LT.first * Entry->Cost;
+
+  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+  // 2x pmuludq, 2x shuffle.
+  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+      !ST->hasSSE41())
+    return LT.first * 6;
+
+  static const CostTblEntry SSE1FloatCostTable[] = {
+    { ISD::FDIV, MVT::f32,   17 }, // Pentium III from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+  };
+
+  if (ST->hasSSE1())
+    if (const auto *Entry = CostTableLookup(SSE1FloatCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  // Fallback to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+}
+
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                               Type *SubTp) {
+
+  if (Kind == TTI::SK_Reverse) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  1 }, // vpermb
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  1 }  // vpermb
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpermw
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  6 }  // vextracti64x4 + 2*vperm2i128
+                                               // + 2*pshufb + vinserti64x4
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vpermpd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpermq
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vpermpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vpermps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpermq
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpermd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  2 }  // vperm2i128 + pshufb
+    };
+
+    if (ST->hasAVX2())
+      if (const auto *Entry =
+              CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry AVX1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
+                                               // + vinsertf128
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  4 }  // vextractf128 + 2*pshufb
+                                               // + vinsertf128
+    };
+
+    if (ST->hasAVX())
+      if (const auto *Entry =
+              CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSSE3ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 }  // pshufb
+    };
+
+    if (ST->hasSSSE3())
+      if (const auto *Entry =
+              CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSE2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw  + pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 }  // 2*pshuflw + 2*pshufhw
+                                              // + 2*pshufd + 2*unpck + packus
+    };
+
+    if (ST->hasSSE2())
+      if (const auto *Entry =
+              CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSE1ShuffleTbl[] = {
+        { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+    };
+
+    if (ST->hasSSE1())
+      if (const auto *Entry =
+              CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+  } else if (Kind == TTI::SK_Alternate) {
+    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    // The backend knows how to generate a single VEX.256 version of
+    // instruction VPBLENDW if the target supports AVX2.
+    if (ST->hasAVX2() && LT.second == MVT::v16i16)
+      return LT.first;
+
+    static const CostTblEntry AVXAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
+      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps
+
+      // This shuffle is custom lowered into a sequence of:
+      //  2x  vextractf128 , 2x vpblendw , 1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+      // This shuffle is custom lowered into a long sequence of:
+      //  2x vextractf128 , 4x vpshufb , 2x vpor ,  1x vinsertf128
+      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+    };
+
+    if (ST->hasAVX())
+      if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSE41AltShuffleTbl[] = {
+      // These are lowered into movsd.
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+      // packed float vectors with four elements are lowered into BLENDI dag
+      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+      // This shuffle generates a single pshufw.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+      // There is no instruction that matches a v16i8 alternate shuffle.
+      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+    };
+
+    if (ST->hasSSE41())
+      if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
+                                              LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSSE3AltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+      // the sequence 'shufps + pshufd'
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}  // pshufb + pshufb + or
+    };
+
+    if (ST->hasSSSE3())
+      if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return LT.first * Entry->Cost;
+
+    static const CostTblEntry SSEAltShuffleTbl[] = {
+      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
+      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd
+
+      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+      // This is expanded into a long sequence of four extract + four insert.
+      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+    };
+
+    // Fall-back (SSE3 and SSE2).
+    if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
+                                            ISD::VECTOR_SHUFFLE, LT.second))
+      return LT.first * Entry->Cost;
+
+  } else if (Kind == TTI::SK_PermuteTwoSrc) {
+    // We assume that source and destination have the same vector type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    int NumOfDests = LT.first;
+    int NumOfShufflesPerDest = LT.first * 2 - 1;
+    int NumOfShuffles = NumOfDests * NumOfShufflesPerDest;
+
+    static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}  // vpermt2b
+    };
+
+    if (ST->hasVBMI())
+      if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermt2w
+        {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3},  // zext + vpermt2w + trunc
+        {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // zext + vpermt2w + trunc
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+        {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermt2d
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermt2pd
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermt2ps
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermt2q
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}   // vpermt2d
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return NumOfShuffles * Entry->Cost;
+
+  } else if (Kind == TTI::SK_PermuteSingleSrc) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    if (LT.first == 1) {
+
+      static const CostTblEntry AVX512VBMIShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}  // vpermb
+      };
+
+      if (ST->hasVBMI())
+        if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512BWShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},  // vpermw
+          {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8},  // extend to v32i16
+          {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}   // vpermw + zext/trunc
+      };
+
+      if (ST->hasBWI())
+        if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                                ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+      static const CostTblEntry AVX512ShuffleTbl[] = {
+          {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // vpermpd
+          {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},  // vpermps
+          {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // vpermq
+          {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},  // vpermd
+          {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}   // pshufb
+      };
+
+      if (ST->hasAVX512())
+        if (const auto *Entry =
+            CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+          return Entry->Cost;
+
+    } else {
+      // We are going to permute multiple sources and the result will be in
+      // multiple destinations. Providing an accurate cost only for splits where
+      // the element type remains the same.
+
+      MVT LegalVT = LT.second;
+      if (LegalVT.getVectorElementType().getSizeInBits() ==
+              Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
+          LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+
+        unsigned VecTySize = DL.getTypeStoreSize(Tp);
+        unsigned LegalVTSize = LegalVT.getStoreSize();
+        // Number of source vectors after legalization:
+        unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+        // Number of destination vectors after legalization:
+        unsigned NumOfDests = LT.first;
+
+        Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
+                                           LegalVT.getVectorNumElements());
+
+        unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+        return NumOfShuffles *
+               getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+      }
+    }
+  }
+
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // FIXME: Need a better design of the cost table to handle non-simple types of
+  // potential massive combinations (elem_num x src_type x dst_type).
+
+  static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
+    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
+
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  1 },
+
+    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f32,  1 },
+    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f32,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f32,  1 },
+    { ISD::FP_TO_SINT,  MVT::v2i64,  MVT::v2f64,  1 },
+    { ISD::FP_TO_SINT,  MVT::v4i64,  MVT::v4f64,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i64,  MVT::v8f64,  1 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v2i64,  MVT::v2f64,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i64,  MVT::v4f64,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i64,  MVT::v8f64,  1 },
+  };
+
+  // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
+  // 256-bit wide vectors.
+
+  static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
+    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
+
+    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
+    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
+
+    // v16i1 -> v16i32 - load + broadcast
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i32,  1 },
+
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i8,   2 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  1 },
+    { ISD::FP_TO_UINT,  MVT::v16i32, MVT::v16f32, 1 },
+  };
+
+  static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,   3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,   3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   3 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  1 },
+
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  2 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
+    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
+
+    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
+    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
+  };
+
+  static const TypeConversionCostTblEntry AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
+
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 4 },
+    { ISD::TRUNCATE,    MVT::v8i8,  MVT::v8i32,  4 },
+    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32,  5 },
+    { ISD::TRUNCATE,    MVT::v4i8,  MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64,  4 },
+    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64,  9 },
+
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
+
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    // The generic code to compute the scalar overhead is currently broken.
+    // Workaround this limitation by estimating the scalarization overhead
+    // here. We have roughly 10 instructions per scalar element.
+    // Multiply that by the vector width.
+    // FIXME: remove that when PR19268 is fixed.
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
+
+    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 7 },
+    // This node is expanded into scalarized operations but BasicTTI is overly
+    // optimistic estimating its cost.  It computes 3 per element (one
+    // vector-extract, one scalar conversion and one vector-insert).  The
+    // problem is that the inserts form a read-modify-write chain so latency
+    // should be factored in too.  Inflating the cost per element by 1.
+    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
+
+    { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
+    { ISD::FP_ROUND,    MVT::v4f32,  MVT::v4f64,  1 },
+  };
+
+  static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,    2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16,   2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32,   2 },
+
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   2 },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   1 },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  4 },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  2 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  1 },
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  1 },
+    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  1 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  3 },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
+
+  };
+
+  static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
+    // These are somewhat magic numbers justified by looking at the output of
+    // Intel's IACA, running some kernels and making sure when we take
+    // legalization into account the throughput will be overestimated.
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+
+    { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
+
+    { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
+    { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i8,   3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,   4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,   8 },
+    { ISD::ZERO_EXTEND, MVT::v8i16,  MVT::v8i8,   1 },
+    { ISD::SIGN_EXTEND, MVT::v8i16,  MVT::v8i8,   2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,   6 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8,  3 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8,  4 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  9 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  12 },
+    { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i16,  1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32,  MVT::v4i16,  2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16,  3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16,  10 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16,  3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16,  4 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32,  3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32,  5 },
+
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i16,  4 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i16,  2 },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 3 },
+    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i32,  3 },
+    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i32,  3 },
+    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 7 },
+    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 10 },
+  };
+
+  std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+  std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
+
+  if (ST->hasSSE2() && !ST->hasAVX()) {
+    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+                                                   LTDest.second, LTSrc.second))
+      return LTSrc.first * Entry->Cost;
+  }
+
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
+
+  // The function getSimpleVT only handles simple value types.
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  if (ST->hasDQI())
+    if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+
+  if (ST->hasAVX512())
+    if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+
+  if (ST->hasAVX2()) {
+    if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  if (ST->hasAVX()) {
+    if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  if (ST->hasSSE41()) {
+    if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  if (ST->hasSSE2()) {
+    if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+                                                   DstTy.getSimpleVT(),
+                                                   SrcTy.getSimpleVT()))
+      return Entry->Cost;
+  }
+
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+}
+
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::SETCC,   MVT::v2i64,   8 },
+    { ISD::SETCC,   MVT::v4i32,   1 },
+    { ISD::SETCC,   MVT::v8i16,   1 },
+    { ISD::SETCC,   MVT::v16i8,   1 },
+  };
+
+  static const CostTblEntry SSE42CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+    { ISD::SETCC,   MVT::v2i64,   1 },
+  };
+
+  static const CostTblEntry AVX1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f64,   1 },
+    { ISD::SETCC,   MVT::v8f32,   1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC,   MVT::v4i64,   4 },
+    { ISD::SETCC,   MVT::v8i32,   4 },
+    { ISD::SETCC,   MVT::v16i16,  4 },
+    { ISD::SETCC,   MVT::v32i8,   4 },
+  };
+
+  static const CostTblEntry AVX2CostTbl[] = {
+    { ISD::SETCC,   MVT::v4i64,   1 },
+    { ISD::SETCC,   MVT::v8i32,   1 },
+    { ISD::SETCC,   MVT::v16i16,  1 },
+    { ISD::SETCC,   MVT::v32i8,   1 },
+  };
+
+  static const CostTblEntry AVX512CostTbl[] = {
+    { ISD::SETCC,   MVT::v8i64,   1 },
+    { ISD::SETCC,   MVT::v16i32,  1 },
+    { ISD::SETCC,   MVT::v8f64,   1 },
+    { ISD::SETCC,   MVT::v16f32,  1 },
+  };
+
+  if (ST->hasAVX512())
+    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasAVX2())
+    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasAVX())
+    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSE42())
+    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSE2())
+    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                      ArrayRef<Type *> Tys, FastMathFlags FMF) {
+  // Costs should match the codegen from:
+  // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
+  // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
+  // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
+  // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
+  // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+  static const CostTblEntry XOPCostTbl[] = {
+    { ISD::BITREVERSE, MVT::v4i64,   4 },
+    { ISD::BITREVERSE, MVT::v8i32,   4 },
+    { ISD::BITREVERSE, MVT::v16i16,  4 },
+    { ISD::BITREVERSE, MVT::v32i8,   4 },
+    { ISD::BITREVERSE, MVT::v2i64,   1 },
+    { ISD::BITREVERSE, MVT::v4i32,   1 },
+    { ISD::BITREVERSE, MVT::v8i16,   1 },
+    { ISD::BITREVERSE, MVT::v16i8,   1 },
+    { ISD::BITREVERSE, MVT::i64,     3 },
+    { ISD::BITREVERSE, MVT::i32,     3 },
+    { ISD::BITREVERSE, MVT::i16,     3 },
+    { ISD::BITREVERSE, MVT::i8,      3 }
+  };
+  static const CostTblEntry AVX2CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v4i64,   5 },
+    { ISD::BITREVERSE, MVT::v8i32,   5 },
+    { ISD::BITREVERSE, MVT::v16i16,  5 },
+    { ISD::BITREVERSE, MVT::v32i8,   5 },
+    { ISD::BSWAP,      MVT::v4i64,   1 },
+    { ISD::BSWAP,      MVT::v8i32,   1 },
+    { ISD::BSWAP,      MVT::v16i16,  1 },
+    { ISD::CTLZ,       MVT::v4i64,  23 },
+    { ISD::CTLZ,       MVT::v8i32,  18 },
+    { ISD::CTLZ,       MVT::v16i16, 14 },
+    { ISD::CTLZ,       MVT::v32i8,   9 },
+    { ISD::CTPOP,      MVT::v4i64,   7 },
+    { ISD::CTPOP,      MVT::v8i32,  11 },
+    { ISD::CTPOP,      MVT::v16i16,  9 },
+    { ISD::CTPOP,      MVT::v32i8,   6 },
+    { ISD::CTTZ,       MVT::v4i64,  10 },
+    { ISD::CTTZ,       MVT::v8i32,  14 },
+    { ISD::CTTZ,       MVT::v16i16, 12 },
+    { ISD::CTTZ,       MVT::v32i8,   9 },
+    { ISD::FSQRT,      MVT::f32,     7 }, // Haswell from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,   7 }, // Haswell from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v8f32,  14 }, // Haswell from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f64,    14 }, // Haswell from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v2f64,  14 }, // Haswell from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
+  };
+  static const CostTblEntry AVX1CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v4i64,  10 },
+    { ISD::BITREVERSE, MVT::v8i32,  10 },
+    { ISD::BITREVERSE, MVT::v16i16, 10 },
+    { ISD::BITREVERSE, MVT::v32i8,  10 },
+    { ISD::BSWAP,      MVT::v4i64,   4 },
+    { ISD::BSWAP,      MVT::v8i32,   4 },
+    { ISD::BSWAP,      MVT::v16i16,  4 },
+    { ISD::CTLZ,       MVT::v4i64,  46 },
+    { ISD::CTLZ,       MVT::v8i32,  36 },
+    { ISD::CTLZ,       MVT::v16i16, 28 },
+    { ISD::CTLZ,       MVT::v32i8,  18 },
+    { ISD::CTPOP,      MVT::v4i64,  14 },
+    { ISD::CTPOP,      MVT::v8i32,  22 },
+    { ISD::CTPOP,      MVT::v16i16, 18 },
+    { ISD::CTPOP,      MVT::v32i8,  12 },
+    { ISD::CTTZ,       MVT::v4i64,  20 },
+    { ISD::CTTZ,       MVT::v8i32,  28 },
+    { ISD::CTTZ,       MVT::v16i16, 24 },
+    { ISD::CTTZ,       MVT::v32i8,  18 },
+    { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f64,    21 }, // SNB from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
+  };
+  static const CostTblEntry SSE42CostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+  };
+  static const CostTblEntry SSSE3CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v2i64,   5 },
+    { ISD::BITREVERSE, MVT::v4i32,   5 },
+    { ISD::BITREVERSE, MVT::v8i16,   5 },
+    { ISD::BITREVERSE, MVT::v16i8,   5 },
+    { ISD::BSWAP,      MVT::v2i64,   1 },
+    { ISD::BSWAP,      MVT::v4i32,   1 },
+    { ISD::BSWAP,      MVT::v8i16,   1 },
+    { ISD::CTLZ,       MVT::v2i64,  23 },
+    { ISD::CTLZ,       MVT::v4i32,  18 },
+    { ISD::CTLZ,       MVT::v8i16,  14 },
+    { ISD::CTLZ,       MVT::v16i8,   9 },
+    { ISD::CTPOP,      MVT::v2i64,   7 },
+    { ISD::CTPOP,      MVT::v4i32,  11 },
+    { ISD::CTPOP,      MVT::v8i16,   9 },
+    { ISD::CTPOP,      MVT::v16i8,   6 },
+    { ISD::CTTZ,       MVT::v2i64,  10 },
+    { ISD::CTTZ,       MVT::v4i32,  14 },
+    { ISD::CTTZ,       MVT::v8i16,  12 },
+    { ISD::CTTZ,       MVT::v16i8,   9 }
+  };
+  static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::BSWAP,      MVT::v2i64,   7 },
+    { ISD::BSWAP,      MVT::v4i32,   7 },
+    { ISD::BSWAP,      MVT::v8i16,   7 },
+    { ISD::CTLZ,       MVT::v2i64,  25 },
+    { ISD::CTLZ,       MVT::v4i32,  26 },
+    { ISD::CTLZ,       MVT::v8i16,  20 },
+    { ISD::CTLZ,       MVT::v16i8,  17 },
+    { ISD::CTPOP,      MVT::v2i64,  12 },
+    { ISD::CTPOP,      MVT::v4i32,  15 },
+    { ISD::CTPOP,      MVT::v8i16,  13 },
+    { ISD::CTPOP,      MVT::v16i8,  10 },
+    { ISD::CTTZ,       MVT::v2i64,  14 },
+    { ISD::CTTZ,       MVT::v4i32,  18 },
+    { ISD::CTTZ,       MVT::v8i16,  16 },
+    { ISD::CTTZ,       MVT::v16i8,  13 },
+    { ISD::FSQRT,      MVT::f64,    32 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
+  };
+  static const CostTblEntry SSE1CostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
+    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+  };
+
+  unsigned ISD = ISD::DELETED_NODE;
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::bitreverse:
+    ISD = ISD::BITREVERSE;
+    break;
+  case Intrinsic::bswap:
+    ISD = ISD::BSWAP;
+    break;
+  case Intrinsic::ctlz:
+    ISD = ISD::CTLZ;
+    break;
+  case Intrinsic::ctpop:
+    ISD = ISD::CTPOP;
+    break;
+  case Intrinsic::cttz:
+    ISD = ISD::CTTZ;
+    break;
+  case Intrinsic::sqrt:
+    ISD = ISD::FSQRT;
+    break;
+  }
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+  MVT MTy = LT.second;
+
+  // Attempt to lookup cost.
+  if (ST->hasXOP())
+    if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasAVX2())
+    if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasAVX())
+    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSE42())
+    if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSSE3())
+    if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSE2())
+    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasSSE1())
+    if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+}
+
+int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                      ArrayRef<Value *> Args, FastMathFlags FMF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+}
+
+int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  Type *ScalarType = Val->getScalarType();
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // Floating point scalars are already located in index #0.
+    if (ScalarType->isFloatingPointTy() && Index == 0)
+      return 0;
+  }
+
+  // Add to the base cost if we know that the extracted element of a vector is
+  // destined to be moved to and used in the integer register file.
+  int RegisterFileMoveCost = 0;
+  if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
+    RegisterFileMoveCost = 1;
+
+  return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
+}
+
+int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+  assert (Ty->isVectorTy() && "Can only scalarize vectors");
+  int Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                unsigned AddressSpace) {
+  // Handle non-power-of-two vectors such as <3 x float>
+  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+    unsigned NumElem = VTy->getVectorNumElements();
+
+    // Handle a few common cases:
+    // <3 x float>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+      // Cost = 64 bit store + extract + 32 bit store.
+      return 3;
+
+    // <3 x double>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+      // Cost = 128 bit store + unpack + 64 bit store.
+      return 3;
+
+    // Assume that all other non-power-of-two numbers are scalarized.
+    if (!isPowerOf2_32(NumElem)) {
+      int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
+                                        AddressSpace);
+      int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+                                               Opcode == Instruction::Store);
+      return NumElem * Cost + SplitCost;
+    }
+  }
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Each load/store unit costs 1.
+  int Cost = LT.first * 1;
+
+  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
+  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
+  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
+    Cost *= 2;
+
+  return Cost;
+}
+
+int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
+                                      unsigned Alignment,
+                                      unsigned AddressSpace) {
+  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+  if (!SrcVTy)
+    // To calculate scalar take the regular cost, without mask
+    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+
+  unsigned NumElem = SrcVTy->getVectorNumElements();
+  VectorType *MaskTy =
+    VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) ||
+      !isPowerOf2_32(NumElem)) {
+    // Scalarization
+    int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+    int ScalarCompareCost = getCmpSelInstrCost(
+        Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
+    int BranchCost = getCFInstrCost(Instruction::Br);
+    int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+
+    int ValueSplitCost = getScalarizationOverhead(
+        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
+    int MemopCost =
+        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                         Alignment, AddressSpace);
+    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
+  }
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+  auto VT = TLI->getValueType(DL, SrcVTy);
+  int Cost = 0;
+  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+      LT.second.getVectorNumElements() == NumElem)
+    // Promotion requires expand/truncate for data and a shuffle for mask.
+    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
+            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
+
+  else if (LT.second.getVectorNumElements() > NumElem) {
+    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
+                                            LT.second.getVectorNumElements());
+    // Expanding requires fill mask with zeroes
+    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+  }
+  if (!ST->hasAVX512())
+    return Cost + LT.first*4; // Each maskmov costs 4
+
+  // AVX-512 masked load/store is cheapper
+  return Cost+LT.first;
+}
+
+int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  return BaseT::getAddressComputationCost(Ty, IsComplex);
+}
+
+int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
+                                 bool IsPairwise) {
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
+  // and make it as the cost.
+
+  static const CostTblEntry SSE42CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v8i16,   5 },
+  };
+
+  static const CostTblEntry AVX1CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::FADD,  MVT::v4f64,   5 },
+    { ISD::FADD,  MVT::v8f32,   7 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
+    { ISD::ADD,   MVT::v8i16,   5 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+
+  static const CostTblEntry SSE42CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
+    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
+  };
+
+  static const CostTblEntry AVX1CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   3 },
+    { ISD::FADD,  MVT::v4f64,   3 },
+    { ISD::FADD,  MVT::v8f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
+    { ISD::ADD,   MVT::v4i64,   3 },
+    { ISD::ADD,   MVT::v8i16,   4 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+
+  if (IsPairwise) {
+    if (ST->hasAVX())
+      if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE42())
+      if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+  } else {
+    if (ST->hasAVX())
+      if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (ST->hasSSE42())
+      if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+        return LT.first * Entry->Cost;
+  }
+
+  return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+int X86TTIImpl::getIntImmCost(int64_t Val) {
+  if (Val == 0)
+    return TTI::TCC_Free;
+
+  if (isInt<32>(Val))
+    return TTI::TCC_Basic;
+
+  return 2 * TTI::TCC_Basic;
+}
+
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Never hoist constants larger than 128bit, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // Fixme: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+  if (BitSize > 128)
+    return TTI::TCC_Free;
+
+  if (Imm == 0)
+    return TTI::TCC_Free;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  int Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialize the constant.
+  return std::max(1, Cost);
+}
+
+int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TTI::TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::ICmp:
+    // This is an imperfect hack to prevent constant hoisting of
+    // compares that might be trying to check if a 64-bit value fits in
+    // 32-bits. The backend can optimize these cases using a right shift by 32.
+    // Ideally we would check the compare predicate here. There also other
+    // similar immediates the backend can use shifts for.
+    if (Idx == 1 && Imm.getBitWidth() == 64) {
+      uint64_t ImmVal = Imm.getZExtValue();
+      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
+        return TTI::TCC_Free;
+    }
+    ImmIdx = 1;
+    break;
+  case Instruction::And:
+    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
+    // by using a 32-bit operation with implicit zero extension. Detect such
+    // immediates here as the normal path expects bit 31 to be sign extended.
+    if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
+      return TTI::TCC_Free;
+    LLVM_FALLTHROUGH;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::Or:
+  case Instruction::Xor:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    int NumConstants = (BitSize + 63) / 64;
+    int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<int>(TTI::TCC_Free)
+               : Cost;
+  }
+
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                              Type *Ty) {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TTI::TCC_Free;
+
+  switch (IID) {
+  default:
+    return TTI::TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TTI::TCC_Free;
+    break;
+  }
+  return X86TTIImpl::getIntImmCost(Imm, Ty);
+}
+
+// Return an average cost of Gather / Scatter instruction, maybe improved later
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
+                                unsigned Alignment, unsigned AddressSpace) {
+
+  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  // Try to reduce index size from 64 bit (default for GEP)
+  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
+  // operation will use 16 x 64 indices which do not fit in a zmm and needs
+  // to split. Also check that the base pointer is the same for all lanes,
+  // and that there's at most one variable index.
+  auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+    unsigned IndexSize = DL.getPointerSizeInBits();
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+    if (IndexSize < 64 || !GEP)
+      return IndexSize;
+
+    unsigned NumOfVarIndices = 0;
+    Value *Ptrs = GEP->getPointerOperand();
+    if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
+      return IndexSize;
+    for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
+      if (isa<Constant>(GEP->getOperand(i)))
+        continue;
+      Type *IndxTy = GEP->getOperand(i)->getType();
+      if (IndxTy->isVectorTy())
+        IndxTy = IndxTy->getVectorElementType();
+      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
+          !isa<SExtInst>(GEP->getOperand(i))) ||
+         ++NumOfVarIndices > 1)
+        return IndexSize; // 64
+    }
+    return (unsigned)32;
+  };
+
+
+  // Trying to reduce IndexSize to 32 bits for vector 16.
+  // By default the IndexSize is equal to pointer size.
+  unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
+    DL.getPointerSizeInBits();
+
+  Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
+                                                    IndexSize), VF);
+  std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
+  std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+  int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+  if (SplitFactor > 1) {
+    // Handle splitting of vector of pointers
+    Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+    return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
+                                         AddressSpace);
+  }
+
+  // The gather / scatter cost is given by Intel architects. It is a rough
+  // number since we are looking at one instruction in a time.
+  const int GSOverhead = 2;
+  return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                           Alignment, AddressSpace);
+}
+
+/// Return the cost of full scalarization of gather / scatter operation.
+///
+/// Opcode - Load or Store instruction.
+/// SrcVTy - The type of the data vector that should be gathered or scattered.
+/// VariableMask - The mask is non-constant at compile time.
+/// Alignment - Alignment for one element.
+/// AddressSpace - pointer[s] address space.
+///
+int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+                                bool VariableMask, unsigned Alignment,
+                                unsigned AddressSpace) {
+  unsigned VF = SrcVTy->getVectorNumElements();
+
+  int MaskUnpackCost = 0;
+  if (VariableMask) {
+    VectorType *MaskTy =
+      VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+    MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+    int ScalarCompareCost =
+      getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
+                         nullptr);
+    int BranchCost = getCFInstrCost(Instruction::Br);
+    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+  }
+
+  // The cost of the scalar loads/stores.
+  int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+                                          Alignment, AddressSpace);
+
+  int InsertExtractCost = 0;
+  if (Opcode == Instruction::Load)
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of inserting each scalar load into the vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
+  else
+    for (unsigned i = 0; i < VF; ++i)
+      // Add the cost of extracting each element out of the data vector
+      InsertExtractCost +=
+        getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
+
+  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
+}
+
+/// Calculate the cost of Gather / Scatter operation
+int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
+                                       Value *Ptr, bool VariableMask,
+                                       unsigned Alignment) {
+  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+  unsigned VF = SrcVTy->getVectorNumElements();
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
+  if (!PtrTy && Ptr->getType()->isVectorTy())
+    PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+  assert(PtrTy && "Unexpected type for Ptr argument");
+  unsigned AddressSpace = PtrTy->getAddressSpace();
+
+  bool Scalarize = false;
+  if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
+      (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+    Scalarize = true;
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+  // Vector-4 of gather/scatter instruction does not exist on KNL.
+  // We can extend it to 8 elements, but zeroing upper bits of
+  // the mask vector will add more instructions. Right now we give the scalar
+  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
+  // is better in the VariableMask case.
+  if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+    Scalarize = true;
+
+  if (Scalarize)
+    return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
+                           AddressSpace);
+
+  return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
+}
+
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+  Type *ScalarTy = DataTy->getScalarType();
+  int DataWidth = isa<PointerType>(ScalarTy) ?
+    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+  return ((DataWidth == 32 || DataWidth == 64) && ST->hasAVX()) ||
+         ((DataWidth == 8 || DataWidth == 16) && ST->hasBWI());
+}
+
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
+  return isLegalMaskedLoad(DataType);
+}
+
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+  // This function is called now in two cases: from the Loop Vectorizer
+  // and from the Scalarizer.
+  // When the Loop Vectorizer asks about legality of the feature,
+  // the vectorization factor is not calculated yet. The Loop Vectorizer
+  // sends a scalar type and the decision is based on the width of the
+  // scalar element.
+  // Later on, the cost model will estimate usage this intrinsic based on
+  // the vector type.
+  // The Scalarizer asks again about legality. It sends a vector type.
+  // In this case we can reject non-power-of-2 vectors.
+  if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
+    return false;
+  Type *ScalarTy = DataTy->getScalarType();
+  int DataWidth = isa<PointerType>(ScalarTy) ?
+    DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
+
+  // AVX-512 allows gather and scatter
+  return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+}
+
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+  return isLegalMaskedGather(DataType);
+}
+
+bool X86TTIImpl::areInlineCompatible(const Function *Caller,
+                                     const Function *Callee) const {
+  const TargetMachine &TM = getTLI()->getTargetMachine();
+
+  // Work this as a subsetting of subtarget features.
+  const FeatureBitset &CallerBits =
+      TM.getSubtargetImpl(*Caller)->getFeatureBits();
+  const FeatureBitset &CalleeBits =
+      TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+  // FIXME: This is likely too limiting as it will include subtarget features
+  // that we might not care about for inlining, but it is conservatively
+  // correct.
+  return (CallerBits & CalleeBits) == CalleeBits;
+}
+
+bool X86TTIImpl::enableInterleavedAccessVectorization() {
+  // TODO: We expect this to be beneficial regardless of arch,
+  // but there are currently some unexplained performance artifacts on Atom.
+  // As a temporary solution, disable on Atom.
+  return !(ST->isAtom() || ST->isSLM());
+}
+
+// Get estimation for interleaved load/store operations and strided load.
+// \p Indices contains indices for strided load.
+// \p Factor - the factor of interleaving.
+// AVX-512 provides 3-src shuffles that significantly reduces the cost.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                                 unsigned Factor,
+                                                 ArrayRef<unsigned> Indices,
+                                                 unsigned Alignment,
+                                                 unsigned AddressSpace) {
+
+  // VecTy for interleave memop is <VF*Factor x Elt>.
+  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+  // VecTy = <12 x i32>.
+
+  // Calculate the number of memory operations (NumOfMemOps), required
+  // for load/store the VecTy.
+  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+  unsigned LegalVTSize = LegalVT.getStoreSize();
+  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+  // Get the cost of one memory operation.
+  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+                                        LegalVT.getVectorNumElements());
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+  if (Opcode == Instruction::Load) {
+    // Kind of shuffle depends on number of loaded values.
+    // If we load the entire data in one register, we can use a 1-src shuffle.
+    // Otherwise, we'll merge 2 sources in each operation.
+    TTI::ShuffleKind ShuffleKind =
+        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
+
+    unsigned ShuffleCost =
+        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+
+    unsigned NumOfLoadsInInterleaveGrp =
+        Indices.size() ? Indices.size() : Factor;
+    Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
+                                     VecTy->getVectorNumElements() / Factor);
+    unsigned NumOfResults =
+        getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
+        NumOfLoadsInInterleaveGrp;
+
+    // About a half of the loads may be folded in shuffles when we have only
+    // one result. If we have more than one result, we do not fold loads at all.
+    unsigned NumOfUnfoldedLoads =
+        NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;
+
+    // Get a number of shuffle operations per result.
+    unsigned NumOfShufflesPerResult =
+        std::max((unsigned)1, (unsigned)(NumOfMemOps - 1));
+
+    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
+    // When we have more than one destination, we need additional instructions
+    // to keep sources.
+    unsigned NumOfMoves = 0;
+    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
+      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
+
+    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+
+    return Cost;
+  }
+
+  // Store.
+  assert(Opcode == Instruction::Store &&
+         "Expected Store Instruction at this  point");
+
+  // There is no strided stores meanwhile. And store can't be folded in
+  // shuffle.
+  unsigned NumOfSources = Factor; // The number of values to be merged.
+  unsigned ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+  unsigned NumOfShufflesPerStore = NumOfSources - 1;
+
+  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
+  // We need additional instructions to keep sources.
+  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
+  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+             NumOfMoves;
+  return Cost;
+}
+
+int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
+    RequiresBW = false;
+    Type *EltTy = VecTy->getVectorElementType();
+    if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
+        EltTy->isIntegerTy(32) || EltTy->isPointerTy())
+      return true;
+    if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
+      RequiresBW = true;
+      return true;
+    }
+    return false;
+  };
+  bool RequiresBW;
+  bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
+  if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
+                                            Alignment, AddressSpace);
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
new file mode 100644
index 000000000000..f6bcb9f569e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -0,0 +1,116 @@
+//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// X86 target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
+
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+  typedef BasicTTIImplBase<X86TTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+
+  const X86Subtarget *getST() const { return ST; }
+  const X86TargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+        TLI(ST->getTargetLowering()) {}
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getMaxInterleaveFactor(unsigned VF);
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace);
+  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                            unsigned AddressSpace);
+  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                             bool VariableMask, unsigned Alignment);
+  int getAddressComputationCost(Type *PtrTy, bool IsComplex);
+
+  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> Tys, FastMathFlags FMF);
+  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Value *> Args, FastMathFlags FMF);
+
+  int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor, ArrayRef<unsigned> Indices,
+                                 unsigned Alignment, unsigned AddressSpace);
+  int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor, ArrayRef<unsigned> Indices,
+                                 unsigned Alignment, unsigned AddressSpace);
+
+  int getIntImmCost(int64_t);
+
+  int getIntImmCost(const APInt &Imm, Type *Ty);
+
+  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                    Type *Ty);
+  bool isLegalMaskedLoad(Type *DataType);
+  bool isLegalMaskedStore(Type *DataType);
+  bool isLegalMaskedGather(Type *DataType);
+  bool isLegalMaskedScatter(Type *DataType);
+  bool areInlineCompatible(const Function *Caller,
+                           const Function *Callee) const;
+
+  bool enableInterleavedAccessVectorization();
+private:
+  int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
+                      unsigned Alignment, unsigned AddressSpace);
+  int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                      unsigned Alignment, unsigned AddressSpace);
+
+  /// @}
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 000000000000..9766b84be652
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,328 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when transferring control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-vzeroupper"
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+
+  class VZeroUpperInserter : public MachineFunctionPass {
+  public:
+
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+    bool runOnMachineFunction(MachineFunction &MF) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+    StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
+
+  private:
+
+    void processBasicBlock(MachineBasicBlock &MBB);
+    void insertVZeroUpper(MachineBasicBlock::iterator I,
+                          MachineBasicBlock &MBB);
+    void addDirtySuccessor(MachineBasicBlock &MBB);
+
+    typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+    static const char* getBlockExitStateName(BlockExitState ST);
+
+    // Core algorithm state:
+    // BlockState - Each block is either:
+    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
+    //                   vzeroupper instructions in this block.
+    //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
+    //                  block that will ensure that YMM is clean on exit.
+    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
+    //                  subsequent vzeroupper in the block clears it.
+    //
+    // AddedToDirtySuccessors - This flag is raised when a block is added to the
+    //                          DirtySuccessors list to ensure that it's not
+    //                          added multiple times.
+    //
+    // FirstUnguardedCall - Records the location of the first unguarded call in
+    //                      each basic block that may need to be guarded by a
+    //                      vzeroupper. We won't know whether it actually needs
+    //                      to be guarded until we discover a predecessor that
+    //                      is DIRTY_OUT.
+    struct BlockState {
+      BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
+      BlockExitState ExitState;
+      bool AddedToDirtySuccessors;
+      MachineBasicBlock::iterator FirstUnguardedCall;
+    };
+    typedef SmallVector<BlockState, 8> BlockStateMap;
+    typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
+
+    BlockStateMap BlockStates;
+    DirtySuccessorsWorkList DirtySuccessors;
+    bool EverMadeChange;
+    bool IsX86INTR;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+  };
+
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
+  switch (ST) {
+    case PASS_THROUGH: return "Pass-through";
+    case EXITS_DIRTY: return "Exits-dirty";
+    case EXITS_CLEAN: return "Exits-clean";
+  }
+  llvm_unreachable("Invalid block exit state.");
+}
+
+static bool isYmmReg(unsigned Reg) {
+  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  return true;
+}
+
+static bool hasYmmReg(MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+      return true;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+/// Check if any YMM register will be clobbered by this instruction.
+static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+  assert(MI.isCall() && "Can only be called on call instructions.");
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isRegMask())
+      continue;
+    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+/// Insert a vzeroupper instruction before I.
+void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock &MBB) {
+  DebugLoc dl = I->getDebugLoc();
+  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
+  ++NumVZU;
+  EverMadeChange = true;
+}
+
+/// Add MBB to the DirtySuccessors list if it hasn't already been added.
+void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
+  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
+    DirtySuccessors.push_back(&MBB);
+    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
+  }
+}
+
+/// Loop over all of the instructions in the basic block, inserting vzeroupper
+/// instructions before function calls.
+void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
+
+  // Start by assuming that the block is PASS_THROUGH which implies no unguarded
+  // calls.
+  BlockExitState CurState = PASS_THROUGH;
+  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
+
+  for (MachineInstr &MI : MBB) {
+    // No need for vzeroupper before iret in interrupt handler function,
+    // epilogue will restore YMM registers if needed.
+    bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
+    bool IsControlFlow = MI.isCall() || MI.isReturn();
+
+    // An existing VZERO* instruction resets the state.
+    if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
+      CurState = EXITS_CLEAN;
+      continue;
+    }
+
+    // Shortcut: don't need to check regular instructions in dirty state.
+    if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = EXITS_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!IsControlFlow || IsReturnFromX86INTR)
+      continue;
+
+    // If the call won't clobber any YMM register, skip it as well. It usually
+    // happens on helper function calls (such as '_chkstk', '_ftol2') where
+    // standard calling convention is not used (RegMask is not used to mark
+    // register clobbered and register usage (def/imp-def/use) is well-defined
+    // and explicitly specified.
+    if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+      continue;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+    // registers. In addition, the processor changes back to Clean state, after
+    // which execution of SSE instructions or AVX instructions has no transition
+    // penalty. Add the VZEROUPPER instruction before any function call/return
+    // that might execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == EXITS_DIRTY) {
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM may appear before other subsequent calls or even before
+      // the end of the BB.
+      insertVZeroUpper(MI, MBB);
+      CurState = EXITS_CLEAN;
+    } else if (CurState == PASS_THROUGH) {
+      // If this block is currently in pass-through state and we encounter a
+      // call then whether we need a vzeroupper or not depends on whether this
+      // block has successors that exit dirty. Record the location of the call,
+      // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
+      // It will be inserted later if necessary.
+      BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
+      CurState = EXITS_CLEAN;
+    }
+  }
+
+  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
+               << getBlockExitStateName(CurState) << '\n');
+
+  if (CurState == EXITS_DIRTY)
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                          SE = MBB.succ_end();
+         SI != SE; ++SI)
+      addDirtySuccessor(**SI);
+
+  BlockStates[MBB.getNumber()].ExitState = CurState;
+}
+
+/// Loop over all of the basic blocks, inserting vzeroupper instructions before
+/// function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+    return false;
+  TII = ST.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  EverMadeChange = false;
+  IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
+
+  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = FnHasLiveInYmm;
+  if (!YMMUsed) {
+    const TargetRegisterClass *RC = &X86::VR256RegClass;
+    for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+         i++) {
+      if (!MRI.reg_nodbg_empty(*i)) {
+        YMMUsed = true;
+        break;
+      }
+    }
+  }
+  if (!YMMUsed) {
+    return false;
+  }
+
+  assert(BlockStates.empty() && DirtySuccessors.empty() &&
+         "X86VZeroUpper state should be clear");
+  BlockStates.resize(MF.getNumBlockIDs());
+
+  // Process all blocks. This will compute block exit states, record the first
+  // unguarded call in each block, and add successors of dirty blocks to the
+  // DirtySuccessors list.
+  for (MachineBasicBlock &MBB : MF)
+    processBasicBlock(MBB);
+
+  // If any YMM regs are live-in to this function, add the entry block to the
+  // DirtySuccessors list
+  if (FnHasLiveInYmm)
+    addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+  // through PASS_THROUGH blocks.
+  while (!DirtySuccessors.empty()) {
+    MachineBasicBlock &MBB = *DirtySuccessors.back();
+    DirtySuccessors.pop_back();
+    BlockState &BBState = BlockStates[MBB.getNumber()];
+
+    // MBB is a successor of a dirty block, so its first call needs to be
+    // guarded.
+    if (BBState.FirstUnguardedCall != MBB.end())
+      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+    // If this successor was a pass-through block, then it is now dirty. Its
+    // successors need to be added to the worklist (if they haven't been
+    // already).
+    if (BBState.ExitState == PASS_THROUGH) {
+      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+                   << " was Pass-through, is now Dirty-out.\n");
+      for (MachineBasicBlock *Succ : MBB.successors())
+        addDirtySuccessor(*Succ);
+    }
+  }
+
+  BlockStates.clear();
+  return EverMadeChange;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
new file mode 100644
index 000000000000..fc08f1582ad7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -0,0 +1,295 @@
+//===----- X86WinAllocaExpander.cpp - Expand WinAlloca pseudo instruction -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands WinAlloca pseudo-instructions.
+//
+// It performs a conservative analysis to determine whether each allocation
+// falls within a region of the stack that is safe to use, or whether stack
+// probes must be emitted.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86WinAllocaExpander : public MachineFunctionPass {
+public:
+  X86WinAllocaExpander() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  /// Strategies for lowering a WinAlloca.
+  enum Lowering { TouchAndSub, Sub, Probe };
+
+  /// Deterministic-order map from WinAlloca instruction to desired lowering.
+  typedef MapVector<MachineInstr*, Lowering> LoweringMap;
+
+  /// Compute which lowering to use for each WinAlloca instruction.
+  void computeLowerings(MachineFunction &MF, LoweringMap& Lowerings);
+
+  /// Get the appropriate lowering based on current offset and amount.
+  Lowering getLowering(int64_t CurrentOffset, int64_t AllocaAmount);
+
+  /// Lower a WinAlloca instruction.
+  void lower(MachineInstr* MI, Lowering L);
+
+  MachineRegisterInfo *MRI;
+  const X86Subtarget *STI;
+  const TargetInstrInfo *TII;
+  const X86RegisterInfo *TRI;
+  unsigned StackPtr;
+  unsigned SlotSize;
+  int64_t StackProbeSize;
+
+  StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
+  static char ID;
+};
+
+char X86WinAllocaExpander::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86WinAllocaExpander() {
+  return new X86WinAllocaExpander();
+}
+
+/// Return the allocation amount for a WinAlloca instruction, or -1 if unknown.
+static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
+  assert(MI->getOpcode() == X86::WIN_ALLOCA_32 ||
+         MI->getOpcode() == X86::WIN_ALLOCA_64);
+  assert(MI->getOperand(0).isReg());
+
+  unsigned AmountReg = MI->getOperand(0).getReg();
+  MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
+
+  // Look through copies.
+  while (Def && Def->isCopy() && Def->getOperand(1).isReg())
+    Def = MRI->getUniqueVRegDef(Def->getOperand(1).getReg());
+
+  if (!Def ||
+      (Def->getOpcode() != X86::MOV32ri && Def->getOpcode() != X86::MOV64ri) ||
+      !Def->getOperand(1).isImm())
+    return -1;
+
+  return Def->getOperand(1).getImm();
+}
+
+X86WinAllocaExpander::Lowering
+X86WinAllocaExpander::getLowering(int64_t CurrentOffset,
+                                  int64_t AllocaAmount) {
+  // For a non-constant amount or a large amount, we have to probe.
+  if (AllocaAmount < 0 || AllocaAmount > StackProbeSize)
+    return Probe;
+
+  // If it fits within the safe region of the stack, just subtract.
+  if (CurrentOffset + AllocaAmount <= StackProbeSize)
+    return Sub;
+
+  // Otherwise, touch the current tip of the stack, then subtract.
+  return TouchAndSub;
+}
+
+static bool isPushPop(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+  case X86::PUSH64i8:
+  case X86::PUSH64r:
+  case X86::PUSH64rmm:
+  case X86::PUSH64rmr:
+  case X86::PUSH64i32:
+  case X86::POP32r:
+  case X86::POP64r:
+    return true;
+  default:
+    return false;
+  }
+}
+
+void X86WinAllocaExpander::computeLowerings(MachineFunction &MF,
+                                            LoweringMap &Lowerings) {
+  // Do a one-pass reverse post-order walk of the CFG to conservatively estimate
+  // the offset between the stack pointer and the lowest touched part of the
+  // stack, and use that to decide how to lower each WinAlloca instruction.
+
+  // Initialize OutOffset[B], the stack offset at exit from B, to something big.
+  DenseMap<MachineBasicBlock *, int64_t> OutOffset;
+  for (MachineBasicBlock &MBB : MF)
+    OutOffset[&MBB] = INT32_MAX;
+
+  // Note: we don't know the offset at the start of the entry block since the
+  // prologue hasn't been inserted yet, and how much that will adjust the stack
+  // pointer depends on register spills, which have not been computed yet.
+
+  // Compute the reverse post-order.
+  ReversePostOrderTraversal<MachineFunction*> RPO(&MF);
+
+  for (MachineBasicBlock *MBB : RPO) {
+    int64_t Offset = -1;
+    for (MachineBasicBlock *Pred : MBB->predecessors())
+      Offset = std::max(Offset, OutOffset[Pred]);
+    if (Offset == -1) Offset = INT32_MAX;
+
+    for (MachineInstr &MI : *MBB) {
+      if (MI.getOpcode() == X86::WIN_ALLOCA_32 ||
+          MI.getOpcode() == X86::WIN_ALLOCA_64) {
+        // A WinAlloca moves StackPtr, and potentially touches it.
+        int64_t Amount = getWinAllocaAmount(&MI, MRI);
+        Lowering L = getLowering(Offset, Amount);
+        Lowerings[&MI] = L;
+        switch (L) {
+        case Sub:
+          Offset += Amount;
+          break;
+        case TouchAndSub:
+          Offset = Amount;
+          break;
+        case Probe:
+          Offset = 0;
+          break;
+        }
+      } else if (MI.isCall() || isPushPop(MI)) {
+        // Calls, pushes and pops touch the tip of the stack.
+        Offset = 0;
+      } else if (MI.getOpcode() == X86::ADJCALLSTACKUP32 ||
+                 MI.getOpcode() == X86::ADJCALLSTACKUP64) {
+        Offset -= MI.getOperand(0).getImm();
+      } else if (MI.getOpcode() == X86::ADJCALLSTACKDOWN32 ||
+                 MI.getOpcode() == X86::ADJCALLSTACKDOWN64) {
+        Offset += MI.getOperand(0).getImm();
+      } else if (MI.modifiesRegister(StackPtr, TRI)) {
+        // Any other modification of SP means we've lost track of it.
+        Offset = INT32_MAX;
+      }
+    }
+
+    OutOffset[MBB] = Offset;
+  }
+}
+
+static unsigned getSubOpcode(bool Is64Bit, int64_t Amount) {
+  if (Is64Bit)
+    return isInt<8>(Amount) ? X86::SUB64ri8 : X86::SUB64ri32;
+  return isInt<8>(Amount) ? X86::SUB32ri8 : X86::SUB32ri;
+}
+
+void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator I = *MI;
+
+  int64_t Amount = getWinAllocaAmount(MI, MRI);
+  if (Amount == 0) {
+    MI->eraseFromParent();
+    return;
+  }
+
+  bool Is64Bit = STI->is64Bit();
+  assert(SlotSize == 4 || SlotSize == 8);
+  unsigned RegA = (SlotSize == 8) ? X86::RAX : X86::EAX;
+
+  switch (L) {
+  case TouchAndSub:
+    assert(Amount >= SlotSize);
+
+    // Use a push to touch the top of the stack.
+    BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+        .addReg(RegA, RegState::Undef);
+    Amount -= SlotSize;
+    if (!Amount)
+      break;
+
+    // Fall through to make any remaining adjustment.
+    LLVM_FALLTHROUGH;
+  case Sub:
+    assert(Amount > 0);
+    if (Amount == SlotSize) {
+      // Use push to save size.
+      BuildMI(*MBB, I, DL, TII->get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+          .addReg(RegA, RegState::Undef);
+    } else {
+      // Sub.
+      BuildMI(*MBB, I, DL, TII->get(getSubOpcode(Is64Bit, Amount)), StackPtr)
+          .addReg(StackPtr)
+          .addImm(Amount);
+    }
+    break;
+  case Probe:
+    // The probe lowering expects the amount in RAX/EAX.
+    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegA)
+        .addReg(MI->getOperand(0).getReg());
+
+    // Do the probe.
+    STI->getFrameLowering()->emitStackProbe(*MBB->getParent(), *MBB, MI, DL,
+                                            /*InPrologue=*/false);
+    break;
+  }
+
+  unsigned AmountReg = MI->getOperand(0).getReg();
+  MI->eraseFromParent();
+
+  // Delete the definition of AmountReg, possibly walking a chain of copies.
+  for (;;) {
+    if (!MRI->use_empty(AmountReg))
+      break;
+    MachineInstr *AmountDef = MRI->getUniqueVRegDef(AmountReg);
+    if (!AmountDef)
+      break;
+    if (AmountDef->isCopy() && AmountDef->getOperand(1).isReg())
+      AmountReg = AmountDef->getOperand(1).isReg();
+    AmountDef->eraseFromParent();
+    break;
+  }
+}
+
+bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
+  if (!MF.getInfo<X86MachineFunctionInfo>()->hasWinAlloca())
+    return false;
+
+  MRI = &MF.getRegInfo();
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  StackPtr = TRI->getStackRegister();
+  SlotSize = TRI->getSlotSize();
+
+  StackProbeSize = 4096;
+  if (MF.getFunction()->hasFnAttribute("stack-probe-size")) {
+    MF.getFunction()
+        ->getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  }
+
+  LoweringMap Lowerings;
+  computeLowerings(MF, Lowerings);
+  for (auto &P : Lowerings)
+    lower(P.first, P.second);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
new file mode 100644
index 000000000000..bc14630584e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -0,0 +1,796 @@
+//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include <deque>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "winehstate"
+
+namespace llvm {
+void initializeWinEHStatePassPass(PassRegistry &);
+}
+
+namespace {
+const int OverdefinedState = INT_MIN;
+
+class WinEHStatePass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  WinEHStatePass() : FunctionPass(ID) {
+    initializeWinEHStatePassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  bool doInitialization(Module &M) override;
+
+  bool doFinalization(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  StringRef getPassName() const override {
+    return "Windows 32-bit x86 EH state insertion";
+  }
+
+private:
+  void emitExceptionRegistrationRecord(Function *F);
+
+  void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler);
+  void unlinkExceptionRegistration(IRBuilder<> &Builder);
+  void addStateStores(Function &F, WinEHFuncInfo &FuncInfo);
+  void insertStateNumberStore(Instruction *IP, int State);
+
+  Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
+
+  Function *generateLSDAInEAXThunk(Function *ParentFunc);
+
+  bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
+  void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
+                             Value *State);
+  int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+                        WinEHFuncInfo &FuncInfo, BasicBlock *BB);
+  int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+                          WinEHFuncInfo &FuncInfo, CallSite CS);
+
+  // Module-level type getters.
+  Type *getEHLinkRegistrationType();
+  Type *getSEHRegistrationType();
+  Type *getCXXEHRegistrationType();
+
+  // Per-module data.
+  Module *TheModule = nullptr;
+  StructType *EHLinkRegistrationTy = nullptr;
+  StructType *CXXEHRegistrationTy = nullptr;
+  StructType *SEHRegistrationTy = nullptr;
+  Constant *SetJmp3 = nullptr;
+  Constant *CxxLongjmpUnwind = nullptr;
+
+  // Per-function state
+  EHPersonality Personality = EHPersonality::Unknown;
+  Function *PersonalityFn = nullptr;
+  bool UseStackGuard = false;
+  int ParentBaseState;
+  Constant *SehLongjmpUnwind = nullptr;
+  Constant *Cookie = nullptr;
+
+  /// The stack allocation containing all EH data, including the link in the
+  /// fs:00 chain and the current state.
+  AllocaInst *RegNode = nullptr;
+
+  // The allocation containing the EH security guard.
+  AllocaInst *EHGuardNode = nullptr;
+
+  /// The index of the state field of RegNode.
+  int StateFieldIndex = ~0U;
+
+  /// The linked list node subobject inside of RegNode.
+  Value *Link = nullptr;
+};
+}
+
+FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
+
+char WinEHStatePass::ID = 0;
+
+INITIALIZE_PASS(WinEHStatePass, "x86-winehstate",
+                "Insert stores for EH state numbers", false, false)
+
+bool WinEHStatePass::doInitialization(Module &M) {
+  TheModule = &M;
+  return false;
+}
+
+bool WinEHStatePass::doFinalization(Module &M) {
+  assert(TheModule == &M);
+  TheModule = nullptr;
+  EHLinkRegistrationTy = nullptr;
+  CXXEHRegistrationTy = nullptr;
+  SEHRegistrationTy = nullptr;
+  SetJmp3 = nullptr;
+  CxxLongjmpUnwind = nullptr;
+  SehLongjmpUnwind = nullptr;
+  Cookie = nullptr;
+  return false;
+}
+
+void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const {
+  // This pass should only insert a stack allocation, memory accesses, and
+  // localrecovers.
+  AU.setPreservesCFG();
+}
+
+bool WinEHStatePass::runOnFunction(Function &F) {
+  // Check the personality. Do nothing if this personality doesn't use funclets.
+  if (!F.hasPersonalityFn())
+    return false;
+  PersonalityFn =
+      dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+  if (!PersonalityFn)
+    return false;
+  Personality = classifyEHPersonality(PersonalityFn);
+  if (!isFuncletEHPersonality(Personality))
+    return false;
+
+  // Skip this function if there are no EH pads and we aren't using IR-level
+  // outlining.
+  bool HasPads = false;
+  for (BasicBlock &BB : F) {
+    if (BB.isEHPad()) {
+      HasPads = true;
+      break;
+    }
+  }
+  if (!HasPads)
+    return false;
+
+  Type *Int8PtrType = Type::getInt8PtrTy(TheModule->getContext());
+  SetJmp3 = TheModule->getOrInsertFunction(
+      "_setjmp3", FunctionType::get(
+                      Type::getInt32Ty(TheModule->getContext()),
+                      {Int8PtrType, Type::getInt32Ty(TheModule->getContext())},
+                      /*isVarArg=*/true));
+
+  // Disable frame pointer elimination in this function.
+  // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
+  // use an arbitrary register?
+  F.addFnAttr("no-frame-pointer-elim", "true");
+
+  emitExceptionRegistrationRecord(&F);
+
+  // The state numbers calculated here in IR must agree with what we calculate
+  // later on for the MachineFunction. In particular, if an IR pass deletes an
+  // unreachable EH pad after this point before machine CFG construction, we
+  // will be in trouble. If this assumption is ever broken, we should turn the
+  // numbers into an immutable analysis pass.
+  WinEHFuncInfo FuncInfo;
+  addStateStores(F, FuncInfo);
+
+  // Reset per-function state.
+  PersonalityFn = nullptr;
+  Personality = EHPersonality::Unknown;
+  UseStackGuard = false;
+  RegNode = nullptr;
+  EHGuardNode = nullptr;
+
+  return true;
+}
+
+/// Get the common EH registration subobject:
+///   typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+///       _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+///   struct EHRegistrationNode {
+///     EHRegistrationNode *Next;
+///     PEXCEPTION_ROUTINE Handler;
+///   };
+Type *WinEHStatePass::getEHLinkRegistrationType() {
+  if (EHLinkRegistrationTy)
+    return EHLinkRegistrationTy;
+  LLVMContext &Context = TheModule->getContext();
+  EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode");
+  Type *FieldTys[] = {
+      EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
+      Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...)
+  };
+  EHLinkRegistrationTy->setBody(FieldTys, false);
+  return EHLinkRegistrationTy;
+}
+
+/// The __CxxFrameHandler3 registration node:
+///   struct CXXExceptionRegistration {
+///     void *SavedESP;
+///     EHRegistrationNode SubRecord;
+///     int32_t TryLevel;
+///   };
+Type *WinEHStatePass::getCXXEHRegistrationType() {
+  if (CXXEHRegistrationTy)
+    return CXXEHRegistrationTy;
+  LLVMContext &Context = TheModule->getContext();
+  Type *FieldTys[] = {
+      Type::getInt8PtrTy(Context), // void *SavedESP
+      getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+      Type::getInt32Ty(Context)    // int32_t TryLevel
+  };
+  CXXEHRegistrationTy =
+      StructType::create(FieldTys, "CXXExceptionRegistration");
+  return CXXEHRegistrationTy;
+}
+
+/// The _except_handler3/4 registration node:
+///   struct EH4ExceptionRegistration {
+///     void *SavedESP;
+///     _EXCEPTION_POINTERS *ExceptionPointers;
+///     EHRegistrationNode SubRecord;
+///     int32_t EncodedScopeTable;
+///     int32_t TryLevel;
+///   };
+Type *WinEHStatePass::getSEHRegistrationType() {
+  if (SEHRegistrationTy)
+    return SEHRegistrationTy;
+  LLVMContext &Context = TheModule->getContext();
+  Type *FieldTys[] = {
+      Type::getInt8PtrTy(Context), // void *SavedESP
+      Type::getInt8PtrTy(Context), // void *ExceptionPointers
+      getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
+      Type::getInt32Ty(Context),   // int32_t EncodedScopeTable
+      Type::getInt32Ty(Context)    // int32_t TryLevel
+  };
+  SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration");
+  return SEHRegistrationTy;
+}
+
+// Emit an exception registration record. These are stack allocations with the
+// common subobject of two pointers: the previous registration record (the old
+// fs:00) and the personality function for the current frame. The data before
+// and after that is personality function specific.
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
+  assert(Personality == EHPersonality::MSVC_CXX ||
+         Personality == EHPersonality::MSVC_X86SEH);
+
+  // Struct type of RegNode. Used for GEPing.
+  Type *RegNodeTy;
+
+  IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
+  Type *Int8PtrType = Builder.getInt8PtrTy();
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *VoidTy = Builder.getVoidTy();
+
+  if (Personality == EHPersonality::MSVC_CXX) {
+    RegNodeTy = getCXXEHRegistrationType();
+    RegNode = Builder.CreateAlloca(RegNodeTy);
+    // SavedESP = llvm.stacksave()
+    Value *SP = Builder.CreateCall(
+        Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+    Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+    // TryLevel = -1
+    StateFieldIndex = 2;
+    ParentBaseState = -1;
+    insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+    // Handler = __ehhandler$F
+    Function *Trampoline = generateLSDAInEAXThunk(F);
+    Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
+    linkExceptionRegistration(Builder, Trampoline);
+
+    CxxLongjmpUnwind = TheModule->getOrInsertFunction(
+        "__CxxLongjmpUnwind",
+        FunctionType::get(VoidTy, Int8PtrType, /*isVarArg=*/false));
+    cast<Function>(CxxLongjmpUnwind->stripPointerCasts())
+        ->setCallingConv(CallingConv::X86_StdCall);
+  } else if (Personality == EHPersonality::MSVC_X86SEH) {
+    // If _except_handler4 is in use, some additional guard checks and prologue
+    // stuff is required.
+    StringRef PersonalityName = PersonalityFn->getName();
+    UseStackGuard = (PersonalityName == "_except_handler4");
+
+    // Allocate local structures.
+    RegNodeTy = getSEHRegistrationType();
+    RegNode = Builder.CreateAlloca(RegNodeTy);
+    if (UseStackGuard)
+      EHGuardNode = Builder.CreateAlloca(Int32Ty);
+
+    // SavedESP = llvm.stacksave()
+    Value *SP = Builder.CreateCall(
+        Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
+    Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+    // TryLevel = -2 / -1
+    StateFieldIndex = 4;
+    ParentBaseState = UseStackGuard ? -2 : -1;
+    insertStateNumberStore(&*Builder.GetInsertPoint(), ParentBaseState);
+    // ScopeTable = llvm.x86.seh.lsda(F)
+    Value *LSDA = emitEHLSDA(Builder, F);
+    LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
+    // If using _except_handler4, xor the address of the table with
+    // __security_cookie.
+    if (UseStackGuard) {
+      Cookie = TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+      Value *Val = Builder.CreateLoad(Int32Ty, Cookie, "cookie");
+      LSDA = Builder.CreateXor(LSDA, Val);
+    }
+    Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+
+    // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie.
+    if (UseStackGuard) {
+      Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+      Value *FrameAddr = Builder.CreateCall(
+          Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+          Builder.getInt32(0), "frameaddr");
+      Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
+      FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);
+      Builder.CreateStore(FrameAddrI32, EHGuardNode);
+    }
+
+    // Register the exception handler.
+    Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
+    linkExceptionRegistration(Builder, PersonalityFn);
+
+    SehLongjmpUnwind = TheModule->getOrInsertFunction(
+        UseStackGuard ? "_seh_longjmp_unwind4" : "_seh_longjmp_unwind",
+        FunctionType::get(Type::getVoidTy(TheModule->getContext()), Int8PtrType,
+                          /*isVarArg=*/false));
+    cast<Function>(SehLongjmpUnwind->stripPointerCasts())
+        ->setCallingConv(CallingConv::X86_StdCall);
+  } else {
+    llvm_unreachable("unexpected personality function");
+  }
+
+  // Insert an unlink before all returns.
+  for (BasicBlock &BB : *F) {
+    TerminatorInst *T = BB.getTerminator();
+    if (!isa<ReturnInst>(T))
+      continue;
+    Builder.SetInsertPoint(T);
+    unlinkExceptionRegistration(Builder);
+  }
+}
+
+Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) {
+  Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext()));
+  return Builder.CreateCall(
+      Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
+}
+
+/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls
+/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE:
+///   typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)(
+///       _EXCEPTION_RECORD *, void *, _CONTEXT *, void *);
+/// We essentially want this code:
+///   movl $lsda, %eax
+///   jmpl ___CxxFrameHandler3
+Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
+  LLVMContext &Context = ParentFunc->getContext();
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  Type *Int8PtrType = Type::getInt8PtrTy(Context);
+  Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType,
+                     Int8PtrType};
+  FunctionType *TrampolineTy =
+      FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4),
+                        /*isVarArg=*/false);
+  FunctionType *TargetFuncTy =
+      FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5),
+                        /*isVarArg=*/false);
+  Function *Trampoline =
+      Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
+                       Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+                                                   ParentFunc->getName()),
+                       TheModule);
+  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
+  IRBuilder<> Builder(EntryBB);
+  Value *LSDA = emitEHLSDA(Builder, ParentFunc);
+  Value *CastPersonality =
+      Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo());
+  auto AI = Trampoline->arg_begin();
+  Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++};
+  CallInst *Call = Builder.CreateCall(CastPersonality, Args);
+  // Can't use musttail due to prototype mismatch, but we can use tail.
+  Call->setTailCall(true);
+  // Set inreg so we pass it in EAX.
+  Call->addAttribute(1, Attribute::InReg);
+  Builder.CreateRet(Call);
+  return Trampoline;
+}
+
+void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
+                                               Function *Handler) {
+  // Emit the .safeseh directive for this function.
+  Handler->addFnAttr("safeseh");
+
+  Type *LinkTy = getEHLinkRegistrationType();
+  // Handler = Handler
+  Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
+  Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1));
+  // Next = [fs:00]
+  Constant *FSZero =
+      Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+  Value *Next = Builder.CreateLoad(FSZero);
+  Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
+  // [fs:00] = Link
+  Builder.CreateStore(Link, FSZero);
+}
+
+void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
+  // Clone Link into the current BB for better address mode folding.
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
+    GEP = cast<GetElementPtrInst>(GEP->clone());
+    Builder.Insert(GEP);
+    Link = GEP;
+  }
+  Type *LinkTy = getEHLinkRegistrationType();
+  // [fs:00] = Link->Next
+  Value *Next =
+      Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
+  Constant *FSZero =
+      Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
+  Builder.CreateStore(Next, FSZero);
+}
+
+// Calls to setjmp(p) are lowered to _setjmp3(p, 0) by the frontend.
+// The idea behind _setjmp3 is that it takes an optional number of personality
+// specific parameters to indicate how to restore the personality-specific frame
+// state when longjmp is initiated.  Typically, the current TryLevel is saved.
+void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
+                                           CallSite CS, Value *State) {
+  // Don't rewrite calls with a weird number of arguments.
+  if (CS.getNumArgOperands() != 2)
+    return;
+
+  Instruction *Inst = CS.getInstruction();
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  CS.getOperandBundlesAsDefs(OpBundles);
+
+  SmallVector<Value *, 3> OptionalArgs;
+  if (Personality == EHPersonality::MSVC_CXX) {
+    OptionalArgs.push_back(CxxLongjmpUnwind);
+    OptionalArgs.push_back(State);
+    OptionalArgs.push_back(emitEHLSDA(Builder, &F));
+  } else if (Personality == EHPersonality::MSVC_X86SEH) {
+    OptionalArgs.push_back(SehLongjmpUnwind);
+    OptionalArgs.push_back(State);
+    if (UseStackGuard)
+      OptionalArgs.push_back(Cookie);
+  } else {
+    llvm_unreachable("unhandled personality!");
+  }
+
+  SmallVector<Value *, 5> Args;
+  Args.push_back(
+      Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+  Args.push_back(Builder.getInt32(OptionalArgs.size()));
+  Args.append(OptionalArgs.begin(), OptionalArgs.end());
+
+  CallSite NewCS;
+  if (CS.isCall()) {
+    auto *CI = cast<CallInst>(Inst);
+    CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
+    NewCI->setTailCallKind(CI->getTailCallKind());
+    NewCS = NewCI;
+  } else {
+    auto *II = cast<InvokeInst>(Inst);
+    NewCS = Builder.CreateInvoke(
+        SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
+  }
+  NewCS.setCallingConv(CS.getCallingConv());
+  NewCS.setAttributes(CS.getAttributes());
+  NewCS->setDebugLoc(CS->getDebugLoc());
+
+  Instruction *NewInst = NewCS.getInstruction();
+  NewInst->takeName(Inst);
+  Inst->replaceAllUsesWith(NewInst);
+  Inst->eraseFromParent();
+}
+
+// Figure out what state we should assign calls in this block.
+int WinEHStatePass::getBaseStateForBB(
+    DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+    BasicBlock *BB) {
+  int BaseState = ParentBaseState;
+  auto &BBColors = BlockColors[BB];
+
+  assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+  BasicBlock *FuncletEntryBB = BBColors.front();
+  if (auto *FuncletPad =
+          dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) {
+    auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+    if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+      BaseState = BaseStateI->second;
+  }
+
+  return BaseState;
+}
+
+// Calculate the state a call-site is in.
+int WinEHStatePass::getStateForCallSite(
+    DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
+    CallSite CS) {
+  if (auto *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+    // Look up the state number of the EH pad this unwinds to.
+    assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
+    return FuncInfo.InvokeStateMap[II];
+  }
+  // Possibly throwing call instructions have no actions to take after
+  // an unwind. Ensure they are in the -1 state.
+  return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+}
+
+// Calculate the intersection of all the FinalStates for a BasicBlock's
+// predecessors.
+static int getPredState(DenseMap<BasicBlock *, int> &FinalStates, Function &F,
+                        int ParentBaseState, BasicBlock *BB) {
+  // The entry block has no predecessors but we know that the prologue always
+  // sets us up with a fixed state.
+  if (&F.getEntryBlock() == BB)
+    return ParentBaseState;
+
+  // This is an EH Pad, conservatively report this basic block as overdefined.
+  if (BB->isEHPad())
+    return OverdefinedState;
+
+  int CommonState = OverdefinedState;
+  for (BasicBlock *PredBB : predecessors(BB)) {
+    // We didn't manage to get a state for one of these predecessors,
+    // conservatively report this basic block as overdefined.
+    auto PredEndState = FinalStates.find(PredBB);
+    if (PredEndState == FinalStates.end())
+      return OverdefinedState;
+
+    // This code is reachable via exceptional control flow,
+    // conservatively report this basic block as overdefined.
+    if (isa<CatchReturnInst>(PredBB->getTerminator()))
+      return OverdefinedState;
+
+    int PredState = PredEndState->second;
+    assert(PredState != OverdefinedState &&
+           "overdefined BBs shouldn't be in FinalStates");
+    if (CommonState == OverdefinedState)
+      CommonState = PredState;
+
+    // At least two predecessors have different FinalStates,
+    // conservatively report this basic block as overdefined.
+    if (CommonState != PredState)
+      return OverdefinedState;
+  }
+
+  return CommonState;
+}
+
+// Calculate the intersection of all the InitialStates for a BasicBlock's
+// successors.
+static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
+                        int ParentBaseState, BasicBlock *BB) {
+  // This block rejoins normal control flow,
+  // conservatively report this basic block as overdefined.
+  if (isa<CatchReturnInst>(BB->getTerminator()))
+    return OverdefinedState;
+
+  int CommonState = OverdefinedState;
+  for (BasicBlock *SuccBB : successors(BB)) {
+    // We didn't manage to get a state for one of these predecessors,
+    // conservatively report this basic block as overdefined.
+    auto SuccStartState = InitialStates.find(SuccBB);
+    if (SuccStartState == InitialStates.end())
+      return OverdefinedState;
+
+    // This is an EH Pad, conservatively report this basic block as overdefined.
+    if (SuccBB->isEHPad())
+      return OverdefinedState;
+
+    int SuccState = SuccStartState->second;
+    assert(SuccState != OverdefinedState &&
+           "overdefined BBs shouldn't be in FinalStates");
+    if (CommonState == OverdefinedState)
+      CommonState = SuccState;
+
+    // At least two successors have different InitialStates,
+    // conservatively report this basic block as overdefined.
+    if (CommonState != SuccState)
+      return OverdefinedState;
+  }
+
+  return CommonState;
+}
+
+bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
+                                        CallSite CS) {
+  if (!CS)
+    return false;
+
+  // If the function touches memory, it needs a state store.
+  if (isAsynchronousEHPersonality(Personality))
+    return !CS.doesNotAccessMemory();
+
+  // If the function throws, it needs a state store.
+  return !CS.doesNotThrow();
+}
+
+void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
+  // Mark the registration node. The backend needs to know which alloca it is so
+  // that it can recover the original frame pointer.
+  IRBuilder<> Builder(RegNode->getNextNode());
+  Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy());
+  Builder.CreateCall(
+      Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode),
+      {RegNodeI8});
+
+  if (EHGuardNode) {
+    IRBuilder<> Builder(EHGuardNode->getNextNode());
+    Value *EHGuardNodeI8 =
+        Builder.CreateBitCast(EHGuardNode, Builder.getInt8PtrTy());
+    Builder.CreateCall(
+        Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard),
+        {EHGuardNodeI8});
+  }
+
+  // Calculate state numbers.
+  if (isAsynchronousEHPersonality(Personality))
+    calculateSEHStateNumbers(&F, FuncInfo);
+  else
+    calculateWinCXXEHStateNumbers(&F, FuncInfo);
+
+  // Iterate all the instructions and emit state number stores.
+  DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F);
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+
+  // InitialStates yields the state of the first call-site for a BasicBlock.
+  DenseMap<BasicBlock *, int> InitialStates;
+  // FinalStates yields the state of the last call-site for a BasicBlock.
+  DenseMap<BasicBlock *, int> FinalStates;
+  // Worklist used to revisit BasicBlocks with indeterminate
+  // Initial/Final-States.
+  std::deque<BasicBlock *> Worklist;
+  // Fill in InitialStates and FinalStates for BasicBlocks with call-sites.
+  for (BasicBlock *BB : RPOT) {
+    int InitialState = OverdefinedState;
+    int FinalState;
+    if (&F.getEntryBlock() == BB)
+      InitialState = FinalState = ParentBaseState;
+    for (Instruction &I : *BB) {
+      CallSite CS(&I);
+      if (!isStateStoreNeeded(Personality, CS))
+        continue;
+
+      int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+      if (InitialState == OverdefinedState)
+        InitialState = State;
+      FinalState = State;
+    }
+    // No call-sites in this basic block? That's OK, we will come back to these
+    // in a later pass.
+    if (InitialState == OverdefinedState) {
+      Worklist.push_back(BB);
+      continue;
+    }
+    DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+                 << " InitialState=" << InitialState << '\n');
+    DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+                 << " FinalState=" << FinalState << '\n');
+    InitialStates.insert({BB, InitialState});
+    FinalStates.insert({BB, FinalState});
+  }
+
+  // Try to fill-in InitialStates and FinalStates which have no call-sites.
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.front();
+    Worklist.pop_front();
+    // This BasicBlock has already been figured out, nothing more we can do.
+    if (InitialStates.count(BB) != 0)
+      continue;
+
+    int PredState = getPredState(FinalStates, F, ParentBaseState, BB);
+    if (PredState == OverdefinedState)
+      continue;
+
+    // We successfully inferred this BasicBlock's state via it's predecessors;
+    // enqueue it's successors to see if we can infer their states.
+    InitialStates.insert({BB, PredState});
+    FinalStates.insert({BB, PredState});
+    for (BasicBlock *SuccBB : successors(BB))
+      Worklist.push_back(SuccBB);
+  }
+
+  // Try to hoist stores from successors.
+  for (BasicBlock *BB : RPOT) {
+    int SuccState = getSuccState(InitialStates, F, ParentBaseState, BB);
+    if (SuccState == OverdefinedState)
+      continue;
+
+    // Update our FinalState to reflect the common InitialState of our
+    // successors.
+    FinalStates.insert({BB, SuccState});
+  }
+
+  // Finally, insert state stores before call-sites which transition us to a new
+  // state.
+  for (BasicBlock *BB : RPOT) {
+    auto &BBColors = BlockColors[BB];
+    BasicBlock *FuncletEntryBB = BBColors.front();
+    if (isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI()))
+      continue;
+
+    int PrevState = getPredState(FinalStates, F, ParentBaseState, BB);
+    DEBUG(dbgs() << "X86WinEHState: " << BB->getName()
+                 << " PrevState=" << PrevState << '\n');
+
+    for (Instruction &I : *BB) {
+      CallSite CS(&I);
+      if (!isStateStoreNeeded(Personality, CS))
+        continue;
+
+      int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+      if (State != PrevState)
+        insertStateNumberStore(&I, State);
+      PrevState = State;
+    }
+
+    // We might have hoisted a state store into this block, emit it now.
+    auto EndState = FinalStates.find(BB);
+    if (EndState != FinalStates.end())
+      if (EndState->second != PrevState)
+        insertStateNumberStore(BB->getTerminator(), EndState->second);
+  }
+
+  SmallVector<CallSite, 1> SetJmp3CallSites;
+  for (BasicBlock *BB : RPOT) {
+    for (Instruction &I : *BB) {
+      CallSite CS(&I);
+      if (!CS)
+        continue;
+      if (CS.getCalledValue()->stripPointerCasts() !=
+          SetJmp3->stripPointerCasts())
+        continue;
+
+      SetJmp3CallSites.push_back(CS);
+    }
+  }
+
+  for (CallSite CS : SetJmp3CallSites) {
+    auto &BBColors = BlockColors[CS->getParent()];
+    BasicBlock *FuncletEntryBB = BBColors.front();
+    bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
+
+    IRBuilder<> Builder(CS.getInstruction());
+    Value *State;
+    if (InCleanup) {
+      Value *StateField =
+          Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+      State = Builder.CreateLoad(StateField);
+    } else {
+      State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
+    }
+    rewriteSetJmpCallSite(Builder, F, CS, State);
+  }
+}
+
+void WinEHStatePass::insertStateNumberStore(Instruction *IP, int State) {
+  IRBuilder<> Builder(IP);
+  Value *StateField =
+      Builder.CreateStructGEP(nullptr, RegNode, StateFieldIndex);
+  Builder.CreateStore(Builder.getInt32(State), StateField);
+}
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
new file mode 100644
index 000000000000..059b75ef482a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -0,0 +1,785 @@
+//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the XCore Disassembler.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// \brief A disassembler class for XCore.
+class XCoreDisassembler : public MCDisassembler {
+public:
+  XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+static bool readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint16_t &Insn) {
+  // We want to read exactly 2 Bytes of data.
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 16-bit word in the stream.
+  Insn = (Bytes[0] << 0) | (Bytes[1] << 8);
+  return true;
+}
+
+static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint32_t &Insn) {
+  // We want to read exactly 4 Bytes of data.
+  if (Bytes.size() < 4) {
+    Size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 32-bit word in the stream.
+  Insn =
+      (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | (Bytes[3] << 24);
+  return true;
+}
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const XCoreDisassembler *Dis = static_cast<const XCoreDisassembler*>(D);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+}
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+
+static DecodeStatus Decode2RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode2RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeR2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRUSInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst,
+                                             unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst,
+                                                   unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeL2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeLR2RInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode3RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL6RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL5RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst,
+                                                     unsigned Insn,
+                                                     uint64_t Address,
+                                                     const void *Decoder);
+
+#include "XCoreGenDisassemblerTables.inc"
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder)
+{
+  if (RegNo > 11)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder)
+{
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  if (Val > 11)
+    return MCDisassembler::Fail;
+  static const unsigned Values[] = {
+    32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32
+  };
+  Inst.addOperand(MCOperand::createImm(Values[Val]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeNegImmOperand(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::createImm(-(int64_t)Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined < 27)
+    return MCDisassembler::Fail;
+  if (fieldFromInstruction(Insn, 5, 1)) {
+    if (Combined == 31)
+      return MCDisassembler::Fail;
+    Combined += 5;
+  }
+  Combined -= 27;
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = Combined / 3;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
+                     unsigned &Op3) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined >= 27)
+    return MCDisassembler::Fail;
+
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = (Combined / 3) % 3;
+  unsigned Op3High = Combined / 9;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 4, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op3 = (Op3High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  // Try and decode as a 3R instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 11, 5);
+  switch (Opcode) {
+  case 0x0:
+    Inst.setOpcode(XCore::STW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x1:
+    Inst.setOpcode(XCore::LDW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x2:
+    Inst.setOpcode(XCore::ADD_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3:
+    Inst.setOpcode(XCore::SUB_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4:
+    Inst.setOpcode(XCore::SHL_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5:
+    Inst.setOpcode(XCore::SHR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6:
+    Inst.setOpcode(XCore::EQ_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7:
+    Inst.setOpcode(XCore::AND_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8:
+    Inst.setOpcode(XCore::OR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9:
+    Inst.setOpcode(XCore::LDW_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10:
+    Inst.setOpcode(XCore::LD16S_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11:
+    Inst.setOpcode(XCore::LD8U_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12:
+    Inst.setOpcode(XCore::ADD_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x13:
+    Inst.setOpcode(XCore::SUB_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14:
+    Inst.setOpcode(XCore::SHL_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x15:
+    Inst.setOpcode(XCore::SHR_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x16:
+    Inst.setOpcode(XCore::EQ_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x17:
+    Inst.setOpcode(XCore::TSETR_3r);
+    return Decode3RImmInstruction(Inst, Insn, Address, Decoder);
+  case 0x18:
+    Inst.setOpcode(XCore::LSS_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19:
+    Inst.setOpcode(XCore::LSU_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  Inst.addOperand(MCOperand::createImm(Op1));
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  Inst.addOperand(MCOperand::createImm(Op2));
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  // Try and decode as a L3R / L2RUS instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 16, 4) |
+                    fieldFromInstruction(Insn, 27, 5) << 4;
+  switch (Opcode) {
+  case 0x0c:
+    Inst.setOpcode(XCore::STW_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x1c:
+    Inst.setOpcode(XCore::XOR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x2c:
+    Inst.setOpcode(XCore::ASHR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3c:
+    Inst.setOpcode(XCore::LDAWF_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4c:
+    Inst.setOpcode(XCore::LDAWB_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5c:
+    Inst.setOpcode(XCore::LDA16F_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6c:
+    Inst.setOpcode(XCore::LDA16B_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7c:
+    Inst.setOpcode(XCore::MUL_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8c:
+    Inst.setOpcode(XCore::DIVS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9c:
+    Inst.setOpcode(XCore::DIVU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10c:
+    Inst.setOpcode(XCore::ST16_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11c:
+    Inst.setOpcode(XCore::ST8_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12c:
+    Inst.setOpcode(XCore::ASHR_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12d:
+    Inst.setOpcode(XCore::OUTPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12e:
+    Inst.setOpcode(XCore::INPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x13c:
+    Inst.setOpcode(XCore::LDAWF_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14c:
+    Inst.setOpcode(XCore::LDAWB_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x15c:
+    Inst.setOpcode(XCore::CRC_l3r);
+    return DecodeL3RSrcDstInstruction(Inst, Insn, Address, Decoder);
+  case 0x18c:
+    Inst.setOpcode(XCore::REMS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19c:
+    Inst.setOpcode(XCore::REMU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    Inst.addOperand(MCOperand::createImm(Op1));
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::createImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::createImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5, Op6;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return S;
+  S = Decode3OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5, Op6);
+  if (S != MCDisassembler::Success)
+    return S;
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op6, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  // Try and decode as a L6R instruction.
+  Inst.clear();
+  unsigned Opcode = fieldFromInstruction(Insn, 27, 5);
+  switch (Opcode) {
+  case 0x00:
+    Inst.setOpcode(XCore::LMUL_l6r);
+    return DecodeL6RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+  S = Decode2OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction(
+    MCInst &instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &vStream, raw_ostream &cStream) const {
+  uint16_t insn16;
+
+  if (!readInstruction16(Bytes, Address, Size, insn16)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus Result = decodeInstruction(DecoderTable16, instr, insn16,
+                                          Address, this, STI);
+  if (Result != Fail) {
+    Size = 2;
+    return Result;
+  }
+
+  uint32_t insn32;
+
+  if (!readInstruction32(Bytes, Address, Size, insn32)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTable32, instr, insn32, Address, this, STI);
+  if (Result != Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return Fail;
+}
+
+namespace llvm {
+  Target &getTheXCoreTarget();
+}
+
+static MCDisassembler *createXCoreDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new XCoreDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeXCoreDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(getTheXCoreTarget(),
+                                         createXCoreDisassembler);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
new file mode 100644
index 000000000000..500c84d2a418
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -0,0 +1,88 @@
+//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an XCore MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                 StringRef Annot, const MCSubtargetInfo &STI) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void XCoreInstPrinter::
+printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT");
+}
+
+void XCoreInstPrinter::
+printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT32");
+}
+
+static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI,
+                      raw_ostream &OS) {
+  int Offset = 0;
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+    assert(SRE && CE && "Binary expression must be sym+const.");
+    Offset = CE->getValue();
+  } else {
+    SRE = dyn_cast<MCSymbolRefExpr>(Expr);
+    assert(SRE && "Unexpected MCExpr type.");
+  }
+  assert(SRE->getKind() == MCSymbolRefExpr::VK_None);
+
+  SRE->getSymbol().print(OS, MAI);
+
+  if (Offset) {
+    if (Offset > 0)
+      OS << '+';
+    OS << Offset;
+  }
+}
+
+void XCoreInstPrinter::
+printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  printExpr(Op.getExpr(), &MAI, O);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
new file mode 100644
index 000000000000..dc513f7b225b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -0,0 +1,43 @@
+//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// which is used to print XCore MCInst to a .s file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class XCoreInstPrinter : public MCInstPrinter {
+public:
+  XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
+                 const MCSubtargetInfo &STI) override;
+private:
+  void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
+  void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
new file mode 100644
index 000000000000..3178a4edbb3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -0,0 +1,33 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+using namespace llvm;
+
+void XCoreMCAsmInfo::anchor() { }
+
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Triple &TT) {
+  SupportsDebugInformation = true;
+  Data16bitsDirective = "\t.short\t";
+  Data32bitsDirective = "\t.long\t";
+  Data64bitsDirective = nullptr;
+  ZeroDirective = "\t.space\t";
+  CommentString = "#";
+
+  AscizDirective = ".asciiz";
+
+  HiddenVisibilityAttr = MCSA_Invalid;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  // Debug
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  DwarfRegNumForCFI = true;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
new file mode 100644
index 000000000000..39581e424e8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfoELF.h"
+
+namespace llvm {
+class Triple;
+
+class XCoreMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit XCoreMCAsmInfo(const Triple &TT);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
new file mode 100644
index 000000000000..c5859b7786f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -0,0 +1,151 @@
+//===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCTargetDesc.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "XCoreGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+static MCInstrInfo *createXCoreMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitXCoreMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createXCoreMCRegisterInfo(const Triple &TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitXCoreMCRegisterInfo(X, XCore::LR);
+  return X;
+}
+
+static MCSubtargetInfo *
+createXCoreMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
+  return createXCoreMCSubtargetInfoImpl(TT, CPU, FS);
+}
+
+static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
+                                       const Triple &TT) {
+  MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
+
+  // Initial state of the frame pointer is SP.
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0);
+  MAI->addInitialFrameState(Inst);
+
+  return MAI;
+}
+
+static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
+                              CodeModel::Model &CM) {
+  if (CM == CodeModel::Default) {
+    CM = CodeModel::Small;
+  }
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error("Target only supports CodeModel Small or Large");
+}
+
+static MCInstPrinter *createXCoreMCInstPrinter(const Triple &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI) {
+  return new XCoreInstPrinter(MAI, MII, MRI);
+}
+
+XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+XCoreTargetStreamer::~XCoreTargetStreamer() {}
+
+namespace {
+
+class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
+  formatted_raw_ostream &OS;
+public:
+  XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+  void emitCCTopData(StringRef Name) override;
+  void emitCCTopFunction(StringRef Name) override;
+  void emitCCBottomData(StringRef Name) override;
+  void emitCCBottomFunction(StringRef Name) override;
+};
+
+XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,
+                                               formatted_raw_ostream &OS)
+    : XCoreTargetStreamer(S), OS(OS) {}
+
+void XCoreTargetAsmStreamer::emitCCTopData(StringRef Name) {
+  OS << "\t.cc_top " << Name << ".data," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCTopFunction(StringRef Name) {
+  OS << "\t.cc_top " << Name << ".function," << Name << '\n';
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
+  OS << "\t.cc_bottom " << Name << ".data\n";
+}
+
+void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
+  OS << "\t.cc_bottom " << Name << ".function\n";
+}
+}
+
+static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
+                                                 formatted_raw_ostream &OS,
+                                                 MCInstPrinter *InstPrint,
+                                                 bool isVerboseAsm) {
+  return new XCoreTargetAsmStreamer(S, OS);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(getTheXCoreTarget(), createXCoreMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::registerMCAdjustCodeGenOpts(getTheXCoreTarget(),
+                                              adjustCodeGenOpts);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(getTheXCoreTarget(),
+                                      createXCoreMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(getTheXCoreTarget(),
+                                    createXCoreMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(getTheXCoreTarget(),
+                                          createXCoreMCSubtargetInfo);
+
+  // Register the MCInstPrinter
+  TargetRegistry::RegisterMCInstPrinter(getTheXCoreTarget(),
+                                        createXCoreMCInstPrinter);
+
+  TargetRegistry::RegisterAsmTargetStreamer(getTheXCoreTarget(),
+                                            createTargetAsmStreamer);
+}
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
new file mode 100644
index 000000000000..ac0f3fefbae7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -0,0 +1,39 @@
+//===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class Target;
+Target &getTheXCoreTarget();
+
+} // End llvm namespace
+
+// Defines symbolic names for XCore registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "XCoreGenRegisterInfo.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "XCoreGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 000000000000..df5774c7e8ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,22 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target &llvm::getTheXCoreTarget() {
+  static Target TheXCoreTarget;
+  return TheXCoreTarget;
+}
+
+extern "C" void LLVMInitializeXCoreTargetInfo() {
+  RegisterTarget<Triple::xcore> X(getTheXCoreTarget(), "xcore", "XCore");
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCore.h b/contrib/llvm/lib/Target/XCore/XCore.h
new file mode 100644
index 000000000000..ba6ca843671e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.h
@@ -0,0 +1,37 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORE_H
+#define LLVM_LIB_TARGET_XCORE_XCORE_H
+
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class ModulePass;
+  class TargetMachine;
+  class XCoreTargetMachine;
+  class formatted_raw_ostream;
+
+  void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
+
+  FunctionPass *createXCoreFrameToArgsOffsetEliminationPass();
+  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+  ModulePass *createXCoreLowerThreadLocalPass();
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCore.td b/contrib/llvm/lib/Target/XCore/XCore.td
new file mode 100644
index 000000000000..04a1dd5e95be
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.td
@@ -0,0 +1,47 @@
+//===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",      []>;
+def : Proc<"xs1b-generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCore : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = XCoreInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
new file mode 100644
index 000000000000..b35aa0b95821
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -0,0 +1,300 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMCInstLower.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+  class XCoreAsmPrinter : public AsmPrinter {
+    XCoreMCInstLower MCInstLowering;
+    XCoreTargetStreamer &getTargetStreamer();
+
+  public:
+    explicit XCoreAsmPrinter(TargetMachine &TM,
+                             std::unique_ptr<MCStreamer> Streamer)
+        : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {}
+
+    StringRef getPassName() const override { return "XCore Assembly Printer"; }
+
+    void printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const std::string &directive = ".jmptable");
+    void printInlineJT32(const MachineInstr *MI, int opNum, raw_ostream &O) {
+      printInlineJT(MI, opNum, O, ".jmptable32");
+    }
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O) override;
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) override;
+
+    void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
+    void EmitGlobalVariable(const GlobalVariable *GV) override;
+
+    void EmitFunctionEntryLabel() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+    void EmitFunctionBodyStart() override;
+    void EmitFunctionBodyEnd() override;
+  };
+} // end of anonymous namespace
+
+XCoreTargetStreamer &XCoreAsmPrinter::getTargetStreamer() {
+  return static_cast<XCoreTargetStreamer&>(*OutStreamer->getTargetStreamer());
+}
+
+void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
+  assert( ( GV->hasExternalLinkage() || GV->hasWeakLinkage() ||
+            GV->hasLinkOnceLinkage() || GV->hasCommonLinkage() ) &&
+          "Unexpected linkage");
+  if (ArrayType *ATy = dyn_cast<ArrayType>(GV->getValueType())) {
+
+    MCSymbol *SymGlob = OutContext.getOrCreateSymbol(
+                          Twine(Sym->getName() + StringRef(".globound")));
+    OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Global);
+    OutStreamer->EmitAssignment(SymGlob,
+                                MCConstantExpr::create(ATy->getNumElements(),
+                                                       OutContext));
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+        GV->hasCommonLinkage()) {
+      OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Weak);
+    }
+  }
+}
+
+void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  // Check to see if this is a special global used by LLVM, if so, emit it.
+  if (!GV->hasInitializer() ||
+      EmitSpecialLLVMGlobal(GV))
+    return;
+
+  const DataLayout &DL = getDataLayout();
+  OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(GV, TM));
+
+  MCSymbol *GVSym = getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType());
+
+  // Mark the start of the global
+  getTargetStreamer().emitCCTopData(GVSym->getName());
+
+  switch (GV->getLinkage()) {
+  case GlobalValue::AppendingLinkage:
+    report_fatal_error("AppendingLinkage is not supported by this target!");
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::CommonLinkage:
+    emitArrayBound(GVSym, GV);
+    OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+        GV->hasCommonLinkage())
+      OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
+    LLVM_FALLTHROUGH;
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  default:
+    llvm_unreachable("Unknown linkage type!");
+  }
+
+  EmitAlignment(Align > 2 ? Align : 2, GV);
+  
+  if (GV->isThreadLocal()) {
+    report_fatal_error("TLS is not supported by this target!");
+  }
+  unsigned Size = DL.getTypeAllocSize(C->getType());
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+    OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
+  }
+  OutStreamer->EmitLabel(GVSym);
+
+  EmitGlobalConstant(DL, C);
+  // The ABI requires that unsigned scalar types smaller than 32 bits
+  // are padded to 32 bits.
+  if (Size < 4)
+    OutStreamer->EmitZeros(4 - Size);
+  
+  // Mark the end of the global
+  getTargetStreamer().emitCCBottomData(GVSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionBodyStart() {
+  MCInstLowering.Initialize(&MF->getContext());
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+  // Emit function end directives
+  getTargetStreamer().emitCCBottomFunction(CurrentFnSym->getName());
+}
+
+void XCoreAsmPrinter::EmitFunctionEntryLabel() {
+  // Mark the start of the function
+  getTargetStreamer().emitCCTopFunction(CurrentFnSym->getName());
+  OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+void XCoreAsmPrinter::
+printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+              const std::string &directive) {
+  unsigned JTI = MI->getOperand(opNum).getIndex();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  O << "\t" << directive << " ";
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    if (i > 0)
+      O << ",";
+    MBB->getSymbol()->print(O, MAI);
+  }
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const DataLayout &DL = getDataLayout();
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << XCoreInstPrinter::getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MO.getMBB()->getSymbol()->print(O, MAI);
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    getSymbol(MO.getGlobal())->print(O, MAI);
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+    break;
+  case MachineOperand::MO_BlockAddress:
+    GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
+    break;
+  default:
+    llvm_unreachable("not implemented");
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,const char *ExtraCode,
+                                      raw_ostream &O) {
+  // Print the operand if there is no operand modifier.
+  if (!ExtraCode || !ExtraCode[0]) {
+    printOperand(MI, OpNo, O);
+    return false;
+  }
+
+  // Otherwise fallback on the default implementation.
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+}
+
+bool XCoreAsmPrinter::
+PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                      unsigned AsmVariant, const char *ExtraCode,
+                      raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printOperand(MI, OpNum, O);
+  O << '[';
+  printOperand(MI, OpNum + 1, O);
+  O << ']';
+  return false;
+}
+
+void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  switch (MI->getOpcode()) {
+  case XCore::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
+  case XCore::ADD_2rus:
+    if (MI->getOperand(2).getImm() == 0) {
+      O << "\tmov "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg());
+      OutStreamer->EmitRawText(O.str());
+      return;
+    }
+    break;
+  case XCore::BR_JT:
+  case XCore::BR_JT32:
+    O << "\tbru "
+      << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()) << '\n';
+    if (MI->getOpcode() == XCore::BR_JT)
+      printInlineJT(MI, 0, O);
+    else
+      printInlineJT32(MI, 0, O);
+    O << '\n';
+    OutStreamer->EmitRawText(O.str());
+    return;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+
+  EmitToStreamer(*OutStreamer, TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreAsmPrinter() { 
+  RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget());
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 000000000000..e149e6d9ec20
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,40 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+  // i32 are returned in registers R0, R1, R2, R3
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R11.
+  CCIfNest<CCAssignToReg<[R11]>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
new file mode 100644
index 000000000000..e0e2e0319964
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -0,0 +1,592 @@
+//===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreFrameLowering.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>    // std::sort
+
+using namespace llvm;
+
+static const unsigned FramePtr = XCore::R10;
+static const int MaxImmU16 = (1<<16) - 1;
+
+// helper functions. FIXME: Eliminate.
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+// Helper structure with compare function for handling stack slots.
+namespace {
+struct StackSlotInfo {
+  int FI;
+  int Offset;
+  unsigned Reg;
+  StackSlotInfo(int f, int o, int r) : FI(f), Offset(o), Reg(r){};
+};
+}  // end anonymous namespace
+
+static bool CompareSSIOffset(const StackSlotInfo& a, const StackSlotInfo& b) {
+  return a.Offset < b.Offset;
+}
+
+static void EmitDefCfaRegister(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               const DebugLoc &dl, const TargetInstrInfo &TII,
+                               MachineFunction &MF, unsigned DRegNum) {
+  unsigned CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+static void EmitDefCfaOffset(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             const DebugLoc &dl, const TargetInstrInfo &TII,
+                             int Offset) {
+  MachineFunction &MF = *MBB.getParent();
+  unsigned CFIIndex =
+      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+static void EmitCfiOffset(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+                          const TargetInstrInfo &TII, unsigned DRegNum,
+                          int Offset) {
+  MachineFunction &MF = *MBB.getParent();
+  unsigned CFIIndex = MF.addFrameInst(
+      MCCFIInstruction::createOffset(nullptr, DRegNum, Offset));
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the
+/// frame. During these steps, it may be necessary to spill registers.
+/// IfNeededExtSP emits the necessary EXTSP instructions to move the SP only
+/// as far as to make 'OffsetFromBottom' reachable using an STWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] Adjusted the current SP offset from the top of the frame.
+static void IfNeededExtSP(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+                          const TargetInstrInfo &TII, int OffsetFromTop,
+                          int &Adjusted, int FrameSize, bool emitFrameMoves) {
+  while (OffsetFromTop > Adjusted) {
+    assert(Adjusted < FrameSize && "OffsetFromTop is beyond FrameSize");
+    int remaining = FrameSize - Adjusted;
+    int OpImm = (remaining > MaxImmU16) ? MaxImmU16 : remaining;
+    int Opcode = isImmU6(OpImm) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(OpImm);
+    Adjusted += OpImm;
+    if (emitFrameMoves)
+      EmitDefCfaOffset(MBB, MBBI, dl, TII, Adjusted*4);
+  }
+}
+
+/// The SP register is moved in steps of 'MaxImmU16' towards the top of the
+/// frame. During these steps, it may be necessary to re-load registers.
+/// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only
+/// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6.
+/// \param OffsetFromTop the spill offset from the top of the frame.
+/// \param [in,out] RemainingAdj the current SP offset from the top of the
+/// frame.
+static void IfNeededLDAWSP(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, const DebugLoc &dl,
+                           const TargetInstrInfo &TII, int OffsetFromTop,
+                           int &RemainingAdj) {
+  while (OffsetFromTop < RemainingAdj - MaxImmU16) {
+    assert(RemainingAdj && "OffsetFromTop is beyond FrameSize");
+    int OpImm = (RemainingAdj > MaxImmU16) ? MaxImmU16 : RemainingAdj;
+    int Opcode = isImmU6(OpImm) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(OpImm);
+    RemainingAdj -= OpImm;
+  }
+}
+
+/// Creates an ordered list of registers that are spilled
+/// during the emitPrologue/emitEpilogue.
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+                         MachineFrameInfo &MFI, XCoreFunctionInfo *XFI,
+                         bool fetchLR, bool fetchFP) {
+  if (fetchLR) {
+    int Offset = MFI.getObjectOffset(XFI->getLRSpillSlot());
+    SpillList.push_back(StackSlotInfo(XFI->getLRSpillSlot(),
+                                      Offset,
+                                      XCore::LR));
+  }
+  if (fetchFP) {
+    int Offset = MFI.getObjectOffset(XFI->getFPSpillSlot());
+    SpillList.push_back(StackSlotInfo(XFI->getFPSpillSlot(),
+                                      Offset,
+                                      FramePtr));
+  }
+  std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+/// Creates an ordered list of EH info register 'spills'.
+/// These slots are only used by the unwinder and calls to llvm.eh.return().
+/// Registers are ordered according to their frame offset.
+/// As offsets are negative, the largest offsets will be first.
+static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList,
+                           MachineFrameInfo &MFI, XCoreFunctionInfo *XFI,
+                           const Constant *PersonalityFn,
+                           const TargetLowering *TL) {
+  assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots");
+  const int *EHSlot = XFI->getEHSpillSlot();
+  SpillList.push_back(
+      StackSlotInfo(EHSlot[0], MFI.getObjectOffset(EHSlot[0]),
+                    TL->getExceptionPointerRegister(PersonalityFn)));
+  SpillList.push_back(
+      StackSlotInfo(EHSlot[0], MFI.getObjectOffset(EHSlot[1]),
+                    TL->getExceptionSelectorRegister(PersonalityFn)));
+  std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset);
+}
+
+static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
+                                           int FrameIndex,
+                                           MachineMemOperand::Flags flags) {
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags,
+      MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex));
+  return MMO;
+}
+
+
+/// Restore clobbered registers with their spill slot value.
+/// The SP will be adjusted at the same time, thus the SpillList must be ordered
+/// with the largest (negative) offsets first.
+static void RestoreSpillList(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             const DebugLoc &dl, const TargetInstrInfo &TII,
+                             int &RemainingAdj,
+                             SmallVectorImpl<StackSlotInfo> &SpillList) {
+  for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+    assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+    assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+    int OffsetFromTop = - SpillList[i].Offset/4;
+    IfNeededLDAWSP(MBB, MBBI, dl, TII, OffsetFromTop, RemainingAdj);
+    int Offset = RemainingAdj - OffsetFromTop;
+    int Opcode = isImmU6(Offset) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode), SpillList[i].Reg)
+      .addImm(Offset)
+      .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+                                      MachineMemOperand::MOLoad));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameLowering:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
+  : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
+  // Do nothing
+}
+
+bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+         MF.getFrameInfo().hasVarSizedObjects();
+}
+
+void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = &MF.getMMI();
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc dl;
+
+  if (MFI.getMaxAlignment() > getStackAlignment())
+    report_fatal_error("emitPrologue unsupported alignment: "
+                       + Twine(MFI.getMaxAlignment()));
+
+  const AttributeSet &PAL = MF.getFunction()->getAttributes();
+  if (PAL.hasAttrSomewhere(Attribute::Nest))
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
+    // FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
+
+  // Work out frame sizes.
+  // We will adjust the SP in stages towards the final FrameSize.
+  assert(MFI.getStackSize()%4 == 0 && "Misaligned frame size");
+  const int FrameSize = MFI.getStackSize() / 4;
+  int Adjusted = 0;
+
+  bool saveLR = XFI->hasLRSpillSlot();
+  bool UseENTSP = saveLR && FrameSize
+                  && (MFI.getObjectOffset(XFI->getLRSpillSlot()) == 0);
+  if (UseENTSP)
+    saveLR = false;
+  bool FP = hasFP(MF);
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
+
+  if (UseENTSP) {
+    // Allocate space on the stack at the same time as saving LR.
+    Adjusted = (FrameSize > MaxImmU16) ? MaxImmU16 : FrameSize;
+    int Opcode = isImmU6(Adjusted) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+    MBB.addLiveIn(XCore::LR);
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode));
+    MIB.addImm(Adjusted);
+    MIB->addRegisterKilled(XCore::LR, MF.getSubtarget().getRegisterInfo(),
+                           true);
+    if (emitFrameMoves) {
+      EmitDefCfaOffset(MBB, MBBI, dl, TII, Adjusted*4);
+      unsigned DRegNum = MRI->getDwarfRegNum(XCore::LR, true);
+      EmitCfiOffset(MBB, MBBI, dl, TII, DRegNum, 0);
+    }
+  }
+
+  // If necessary, save LR and FP to the stack, as we EXTSP.
+  SmallVector<StackSlotInfo,2> SpillList;
+  GetSpillList(SpillList, MFI, XFI, saveLR, FP);
+  // We want the nearest (negative) offsets first, so reverse list.
+  std::reverse(SpillList.begin(), SpillList.end());
+  for (unsigned i = 0, e = SpillList.size(); i != e; ++i) {
+    assert(SpillList[i].Offset % 4 == 0 && "Misaligned stack offset");
+    assert(SpillList[i].Offset <= 0 && "Unexpected positive stack offset");
+    int OffsetFromTop = - SpillList[i].Offset/4;
+    IfNeededExtSP(MBB, MBBI, dl, TII, OffsetFromTop, Adjusted, FrameSize,
+                  emitFrameMoves);
+    int Offset = Adjusted - OffsetFromTop;
+    int Opcode = isImmU6(Offset) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+    MBB.addLiveIn(SpillList[i].Reg);
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+      .addReg(SpillList[i].Reg, RegState::Kill)
+      .addImm(Offset)
+      .addMemOperand(getFrameIndexMMO(MBB, SpillList[i].FI,
+                                      MachineMemOperand::MOStore));
+    if (emitFrameMoves) {
+      unsigned DRegNum = MRI->getDwarfRegNum(SpillList[i].Reg, true);
+      EmitCfiOffset(MBB, MBBI, dl, TII, DRegNum, SpillList[i].Offset);
+    }
+  }
+
+  // Complete any remaining Stack adjustment.
+  IfNeededExtSP(MBB, MBBI, dl, TII, FrameSize, Adjusted, FrameSize,
+                emitFrameMoves);
+  assert(Adjusted==FrameSize && "IfNeededExtSP has not completed adjustment");
+
+  if (FP) {
+    // Set the FP from the SP.
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
+    if (emitFrameMoves)
+      EmitDefCfaRegister(MBB, MBBI, dl, TII, MF,
+                         MRI->getDwarfRegNum(FramePtr, true));
+  }
+
+  if (emitFrameMoves) {
+    // Frame moves for callee saved.
+    for (const auto &SpillLabel : XFI->getSpillLabels()) {
+      MachineBasicBlock::iterator Pos = SpillLabel.first;
+      ++Pos;
+      const CalleeSavedInfo &CSI = SpillLabel.second;
+      int Offset = MFI.getObjectOffset(CSI.getFrameIdx());
+      unsigned DRegNum = MRI->getDwarfRegNum(CSI.getReg(), true);
+      EmitCfiOffset(MBB, Pos, dl, TII, DRegNum, Offset);
+    }
+    if (XFI->hasEHSpillSlot()) {
+      // The unwinder requires stack slot & CFI offsets for the exception info.
+      // We do not save/spill these registers.
+      const Function *Fn = MF.getFunction();
+      const Constant *PersonalityFn =
+          Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+      SmallVector<StackSlotInfo, 2> SpillList;
+      GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
+                     MF.getSubtarget().getTargetLowering());
+      assert(SpillList.size()==2 && "Unexpected SpillList size");
+      EmitCfiOffset(MBB, MBBI, dl, TII,
+                    MRI->getDwarfRegNum(SpillList[0].Reg, true),
+                    SpillList[0].Offset);
+      EmitCfiOffset(MBB, MBBI, dl, TII,
+                    MRI->getDwarfRegNum(SpillList[1].Reg, true),
+                    SpillList[1].Offset);
+    }
+  }
+}
+
+void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  DebugLoc dl = MBBI->getDebugLoc();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  // Work out frame sizes.
+  // We will adjust the SP in stages towards the final FrameSize.
+  int RemainingAdj = MFI.getStackSize();
+  assert(RemainingAdj%4 == 0 && "Misaligned frame size");
+  RemainingAdj /= 4;
+
+  if (RetOpcode == XCore::EH_RETURN) {
+    // 'Restore' the exception info the unwinder has placed into the stack
+    // slots.
+    const Function *Fn = MF.getFunction();
+    const Constant *PersonalityFn =
+        Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr;
+    SmallVector<StackSlotInfo, 2> SpillList;
+    GetEHSpillList(SpillList, MFI, XFI, PersonalityFn,
+                   MF.getSubtarget().getTargetLowering());
+    RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+    // Return to the landing pad.
+    unsigned EhStackReg = MBBI->getOperand(0).getReg();
+    unsigned EhHandlerReg = MBBI->getOperand(1).getReg();
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(EhStackReg);
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::BAU_1r)).addReg(EhHandlerReg);
+    MBB.erase(MBBI);  // Erase the previous return instruction.
+    return;
+  }
+
+  bool restoreLR = XFI->hasLRSpillSlot();
+  bool UseRETSP = restoreLR && RemainingAdj
+                  && (MFI.getObjectOffset(XFI->getLRSpillSlot()) == 0);
+  if (UseRETSP)
+    restoreLR = false;
+  bool FP = hasFP(MF);
+
+  if (FP) // Restore the stack pointer.
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r)).addReg(FramePtr);
+
+  // If necessary, restore LR and FP from the stack, as we EXTSP.
+  SmallVector<StackSlotInfo,2> SpillList;
+  GetSpillList(SpillList, MFI, XFI, restoreLR, FP);
+  RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
+
+  if (RemainingAdj) {
+    // Complete all but one of the remaining Stack adjustments.
+    IfNeededLDAWSP(MBB, MBBI, dl, TII, 0, RemainingAdj);
+    if (UseRETSP) {
+      // Fold prologue into return instruction
+      assert(RetOpcode == XCore::RETSP_u6
+             || RetOpcode == XCore::RETSP_lu6);
+      int Opcode = isImmU6(RemainingAdj) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode))
+                                  .addImm(RemainingAdj);
+      for (unsigned i = 3, e = MBBI->getNumOperands(); i < e; ++i)
+        MIB->addOperand(MBBI->getOperand(i)); // copy any variadic operands
+      MBB.erase(MBBI);  // Erase the previous return instruction.
+    } else {
+      int Opcode = isImmU6(RemainingAdj) ? XCore::LDAWSP_ru6 :
+                                           XCore::LDAWSP_lru6;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(RemainingAdj);
+      // Don't erase the return instruction.
+    }
+  } // else Don't erase the return instruction.
+}
+
+bool XCoreFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+  DebugLoc DL;
+  if (MI != MBB.end() && !MI->isDebugValue())
+    DL = MI->getDebugLoc();
+
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    unsigned Reg = it->getReg();
+    assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+           "LR & FP are always handled in emitPrologue");
+
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, it->getFrameIdx(), RC, TRI);
+    if (emitFrameMoves) {
+      auto Store = MI;
+      --Store;
+      XFI->getSpillLabels().push_back(std::make_pair(Store, *it));
+    }
+  }
+  return true;
+}
+
+bool XCoreFrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const{
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+  bool AtStart = MI == MBB.begin();
+  MachineBasicBlock::iterator BeforeI = MI;
+  if (!AtStart)
+    --BeforeI;
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    unsigned Reg = it->getReg();
+    assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
+           "LR & FP are always handled in emitEpilogue");
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, it->getFrameIdx(), RC, TRI);
+    assert(MI != MBB.begin() &&
+           "loadRegFromStackSlot didn't insert any code!");
+    // Insert in reverse order.  loadRegFromStackSlot can insert multiple
+    // instructions.
+    if (AtStart)
+      MI = MBB.begin();
+    else {
+      MI = BeforeI;
+      ++MI;
+    }
+  }
+  return true;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+MachineBasicBlock::iterator XCoreFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const XCoreInstrInfo &TII = *MF.getSubtarget<XCoreSubtarget>().getInstrInfo();
+  if (!hasReservedCallFrame(MF)) {
+    // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+    // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+    MachineInstr &Old = *I;
+    uint64_t Amount = Old.getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      assert(Amount%4 == 0);
+      Amount /= 4;
+
+      bool isU6 = isImmU6(Amount);
+      if (!isU6 && !isImmU16(Amount)) {
+        // FIX could emit multiple instructions in this case.
+#ifndef NDEBUG
+        errs() << "eliminateCallFramePseudoInstr size too big: "
+               << Amount << "\n";
+#endif
+        llvm_unreachable(nullptr);
+      }
+
+      MachineInstr *New;
+      if (Old.getOpcode() == XCore::ADJCALLSTACKDOWN) {
+        int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+        New = BuildMI(MF, Old.getDebugLoc(), TII.get(Opcode)).addImm(Amount);
+      } else {
+        assert(Old.getOpcode() == XCore::ADJCALLSTACKUP);
+        int Opcode = isU6 ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+        New = BuildMI(MF, Old.getDebugLoc(), TII.get(Opcode), XCore::SP)
+                  .addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+
+  return MBB.erase(I);
+}
+
+void XCoreFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                              BitVector &SavedRegs,
+                                              RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool LRUsed = MRI.isPhysRegModified(XCore::LR);
+
+  if (!LRUsed && !MF.getFunction()->isVarArg() &&
+      MF.getFrameInfo().estimateStackSize(MF))
+    // If we need to extend the stack it is more efficient to use entsp / retsp.
+    // We force the LR to be saved so these instructions are used.
+    LRUsed = true;
+
+  if (MF.callsUnwindInit() || MF.callsEHReturn()) {
+    // The unwinder expects to find spill slots for the exception info regs R0
+    // & R1. These are used during llvm.eh.return() to 'restore' the exception
+    // info. N.B. we do not spill or restore R0, R1 during normal operation.
+    XFI->createEHSpillSlot(MF);
+    // As we will  have a stack, we force the LR to be saved.
+    LRUsed = true;
+  }
+
+  if (LRUsed) {
+    // We will handle the LR in the prologue/epilogue
+    // and allocate space on the stack ourselves.
+    SavedRegs.reset(XCore::LR);
+    XFI->createLRSpillSlot(MF);
+  }
+
+  if (hasFP(MF))
+    // A callee save register is used to hold the FP.
+    // This needs saving / restoring in the epilogue / prologue.
+    XFI->createFPSpillSlot(MF);
+}
+
+void XCoreFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                    RegScavenger *RS) const {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  // Reserve slots close to SP or frame pointer for Scavenging spills.
+  // When using SP for small frames, we don't need any scratch registers.
+  // When using SP for large frames, we may need 2 scratch registers.
+  // When using FP, for large or small frames, we may need 1 scratch register.
+  if (XFI->isLargeFrame(MF) || hasFP(MF))
+    RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+                                                      RC->getAlignment(),
+                                                      false));
+  if (XFI->isLargeFrame(MF) && !hasFP(MF))
+    RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
+                                                      RC->getAlignment(),
+                                                      false));
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
new file mode 100644
index 000000000000..8729d2208bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -0,0 +1,63 @@
+//===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class XCoreSubtarget;
+
+  class XCoreFrameLowering: public TargetFrameLowering {
+  public:
+    XCoreFrameLowering(const XCoreSubtarget &STI);
+
+    /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+    /// the function.
+    void emitPrologue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
+    void emitEpilogue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
+
+    bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+    bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+    MachineBasicBlock::iterator
+    eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
+
+    bool hasFP(const MachineFunction &MF) const override;
+
+    void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                              RegScavenger *RS = nullptr) const override;
+
+    void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                     RegScavenger *RS = nullptr) const override;
+
+    //! Stack slot size (4 bytes)
+    static int stackSlotSize() {
+      return 4;
+    }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
new file mode 100644
index 000000000000..4b10e71be03d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -0,0 +1,66 @@
+//===-- XCoreFrameToArgsOffsetElim.cpp ----------------------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo FRAME_TO_ARGS_OFFSET with the appropriate real offset.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+  struct XCoreFTAOElim : public MachineFunctionPass {
+    static char ID;
+    XCoreFTAOElim() : MachineFunctionPass(ID) {}
+
+    bool runOnMachineFunction(MachineFunction &Fn) override;
+    MachineFunctionProperties getRequiredProperties() const override {
+      return MachineFunctionProperties().set(
+          MachineFunctionProperties::Property::NoVRegs);
+    }
+
+    StringRef getPassName() const override {
+      return "XCore FRAME_TO_ARGS_OFFSET Elimination";
+    }
+  };
+  char XCoreFTAOElim::ID = 0;
+}
+
+/// createXCoreFrameToArgsOffsetEliminationPass - returns an instance of the
+/// Frame to args offset elimination pass
+FunctionPass *llvm::createXCoreFrameToArgsOffsetEliminationPass() {
+  return new XCoreFTAOElim();
+}
+
+bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
+  const XCoreInstrInfo &TII =
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  unsigned StackSize = MF.getFrameInfo().getStackSize();
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    for (MachineBasicBlock::iterator MBBI = MBB.begin(), EE = MBB.end();
+         MBBI != EE; ++MBBI) {
+      if (MBBI->getOpcode() == XCore::FRAME_TO_ARGS_OFFSET) {
+        MachineInstr &OldInst = *MBBI;
+        unsigned Reg = OldInst.getOperand(0).getReg();
+        MBBI = TII.loadImmediate(MBB, MBBI, Reg, StackSize);
+        OldInst.eraseFromParent();
+      }
+    }
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 000000000000..086d1d544f69
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,282 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - XCore specific code to select XCore machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class XCoreDAGToDAGISel : public SelectionDAGISel {
+
+  public:
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel) {}
+
+    void Select(SDNode *N) override;
+    bool tryBRIND(SDNode *N);
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
+      return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
+    }
+
+    inline bool immMskBitp(SDNode *inN) const {
+      ConstantSDNode *N = cast<ConstantSDNode>(inN);
+      uint32_t value = (uint32_t)N->getZExtValue();
+      if (!isMask_32(value)) {
+        return false;
+      }
+      int msksize = 32 - countLeadingZeros(value);
+      return (msksize >= 1 && msksize <= 8) ||
+              msksize == 16 || msksize == 24 || msksize == 32;
+    }
+
+    // Complex Pattern Selectors.
+    bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                                      std::vector<SDValue> &OutOps) override;
+
+    StringRef getPassName() const override {
+      return "XCore DAG->DAG Pattern Instruction Selection";
+    }
+
+    // Include the pieces autogenerated from the target description.
+  #include "XCoreGenDAGISel.inc"
+  };
+}  // end anonymous namespace
+
+/// createXCoreISelDag - This pass converts a legalized DAG into a
+/// XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel) {
+  return new XCoreDAGToDAGISel(TM, OptLevel);
+}
+
+bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  FrameIndexSDNode *FIN = nullptr;
+  if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = nullptr;
+    if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant positive word offset from frame index
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(Addr),
+                                         MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Reg;
+  switch (ConstraintID) {
+  default: return true;
+  case InlineAsm::Constraint_m: // Memory.
+    switch (Op.getOpcode()) {
+    default: return true;
+    case XCoreISD::CPRelativeWrapper:
+      Reg = CurDAG->getRegister(XCore::CP, MVT::i32);
+      break;
+    case XCoreISD::DPRelativeWrapper:
+      Reg = CurDAG->getRegister(XCore::DP, MVT::i32);
+      break;
+    }
+  }
+  OutOps.push_back(Reg);
+  OutOps.push_back(Op.getOperand(0));
+  return false;
+}
+
+void XCoreDAGToDAGISel::Select(SDNode *N) {
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+    if (immMskBitp(N)) {
+      // Transformation function: get the size of a mask
+      // Look for the first non-zero bit
+      SDValue MskSize = getI32Imm(32 - countLeadingZeros((uint32_t)Val), dl);
+      ReplaceNode(N, CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+                                            MVT::i32, MskSize));
+      return;
+    }
+    else if (!isUInt<16>(Val)) {
+      SDValue CPIdx = CurDAG->getTargetConstantPool(
+          ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+          getTargetLowering()->getPointerTy(CurDAG->getDataLayout()));
+      SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+                                            MVT::Other, CPIdx,
+                                            CurDAG->getEntryNode());
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] =
+          MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
+                                   MachineMemOperand::MOLoad, 4, 4);
+      cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+      ReplaceNode(N, node);
+      return;
+    }
+    break;
+  }
+  case XCoreISD::LADD: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case XCoreISD::LSUB: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case XCoreISD::MACCU: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case XCoreISD::MACCS: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case XCoreISD::LMUL: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case XCoreISD::CRC8: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+    ReplaceNode(N, CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32,
+                                          MVT::i32, Ops));
+    return;
+  }
+  case ISD::BRIND:
+    if (tryBRIND(N))
+      return;
+    break;
+  // Other cases are autogenerated.
+  }
+  SelectCode(N);
+}
+
+/// Given a chain return a new chain where any appearance of Old is replaced
+/// by New. There must be at most one instruction between Old and Chain and
+/// this instruction must be a TokenFactor. Returns an empty SDValue if
+/// these conditions don't hold.
+static SDValue
+replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
+{
+  if (Chain == Old)
+    return New;
+  if (Chain->getOpcode() != ISD::TokenFactor)
+    return SDValue();
+  SmallVector<SDValue, 8> Ops;
+  bool found = false;
+  for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) {
+    if (Chain->getOperand(i) == Old) {
+      Ops.push_back(New);
+      found = true;
+    } else {
+      Ops.push_back(Chain->getOperand(i));
+    }
+  }
+  if (!found)
+    return SDValue();
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Ops);
+}
+
+bool XCoreDAGToDAGISel::tryBRIND(SDNode *N) {
+  SDLoc dl(N);
+  // (brind (int_xcore_checkevent (addr)))
+  SDValue Chain = N->getOperand(0);
+  SDValue Addr = N->getOperand(1);
+  if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
+  if (IntNo != Intrinsic::xcore_checkevent)
+    return false;
+  SDValue nextAddr = Addr->getOperand(2);
+  SDValue CheckEventChainOut(Addr.getNode(), 1);
+  if (!CheckEventChainOut.use_empty()) {
+    // If the chain out of the checkevent intrinsic is an operand of the
+    // indirect branch or used in a TokenFactor which is the operand of the
+    // indirect branch then build a new chain which uses the chain coming into
+    // the checkevent intrinsic instead.
+    SDValue CheckEventChainIn = Addr->getOperand(0);
+    SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+                                      CheckEventChainIn);
+    if (!NewChain.getNode())
+      return false;
+    Chain = NewChain;
+  }
+  // Enable events on the thread using setsr 1 and then disable them immediately
+  // after with clrsr 1. If any resources owned by the thread are ready an event
+  // will be taken. If no resource is ready we branch to the address which was
+  // the operand to the checkevent intrinsic.
+  SDValue constOne = getI32Imm(1, dl);
+  SDValue Glue =
+    SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Chain), 0);
+  Glue =
+    SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Glue), 0);
+  if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+      nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+    CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+                         nextAddr->getOperand(0), Glue);
+    return true;
+  }
+  CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 000000000000..9244d594460f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,1948 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreISelLowering.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-lower"
+
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+  switch ((XCoreISD::NodeType)Opcode)
+  {
+    case XCoreISD::FIRST_NUMBER      : break;
+    case XCoreISD::BL                : return "XCoreISD::BL";
+    case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
+    case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
+    case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+    case XCoreISD::LDWSP             : return "XCoreISD::LDWSP";
+    case XCoreISD::STWSP             : return "XCoreISD::STWSP";
+    case XCoreISD::RETSP             : return "XCoreISD::RETSP";
+    case XCoreISD::LADD              : return "XCoreISD::LADD";
+    case XCoreISD::LSUB              : return "XCoreISD::LSUB";
+    case XCoreISD::LMUL              : return "XCoreISD::LMUL";
+    case XCoreISD::MACCU             : return "XCoreISD::MACCU";
+    case XCoreISD::MACCS             : return "XCoreISD::MACCS";
+    case XCoreISD::CRC8              : return "XCoreISD::CRC8";
+    case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
+    case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
+    case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET";
+    case XCoreISD::EH_RETURN         : return "XCoreISD::EH_RETURN";
+    case XCoreISD::MEMBARRIER        : return "XCoreISD::MEMBARRIER";
+  }
+  return nullptr;
+}
+
+XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
+                                         const XCoreSubtarget &Subtarget)
+    : TargetLowering(TM), TM(TM), Subtarget(Subtarget) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties(Subtarget.getRegisterInfo());
+
+  setStackPointerRegisterToSaveRestore(XCore::SP);
+
+  setSchedulingPreference(Sched::Source);
+
+  // Use i32 for setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // XCore does not have the NodeTypes below.
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32,   Expand);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+  // 64bit
+  setOperationAction(ISD::ADD, MVT::i64, Custom);
+  setOperationAction(ISD::SUB, MVT::i64, Custom);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // Bit Manipulation
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Jump tables.
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
+
+  // Conversion of i64 -> double produces constantpool nodes
+  setOperationAction(ISD::ConstantPool, MVT::i32,   Custom);
+
+  // Loads
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Expand);
+  }
+
+  // Custom expand misaligned loads / stores.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+  // Varargs
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+  // Dynamic stack
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  // Exception handling
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+
+  // Atomic operations
+  // We request a fence for ATOMIC_* instructions, to reduce them to Monotonic.
+  // As we are always Sequential Consistent, an ATOMIC_FENCE becomes a no OP.
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize
+    = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2;
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+    return true;
+  }
+
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode())
+  {
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::SMUL_LOHI:          return LowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:          return LowerUMUL_LOHI(Op, DAG);
+  // FIXME: Remove these when LegalizeDAGTypes lands.
+  case ISD::ADD:
+  case ISD::SUB:                return ExpandADDSUB(Op.getNode(), DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
+  case ISD::ATOMIC_LOAD:        return LowerATOMIC_LOAD(Op, DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::ADD:
+  case ISD::SUB:
+    Results.push_back(ExpandADDSUB(N, DAG));
+    return;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
+                                                     const GlobalValue *GV,
+                                                     SelectionDAG &DAG) const {
+  // FIXME there is no actual debug info here
+  SDLoc dl(GA);
+
+  if (GV->getValueType()->isFunctionTy())
+    return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+
+  const auto *GVar = dyn_cast<GlobalVariable>(GV);
+  if ((GV->hasSection() && GV->getSection().startswith(".cp.")) ||
+      (GVar && GVar->isConstant() && GV->hasLocalLinkage()))
+    return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+
+  return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
+}
+
+static bool IsSmallObject(const GlobalValue *GV, const XCoreTargetLowering &XTL) {
+  if (XTL.getTargetMachine().getCodeModel() == CodeModel::Small)
+    return true;
+
+  Type *ObjType = GV->getValueType();
+  if (!ObjType->isSized())
+    return false;
+
+  auto &DL = GV->getParent()->getDataLayout();
+  unsigned ObjSize = DL.getTypeAllocSize(ObjType);
+  return ObjSize < CodeModelLargeSize && ObjSize != 0;
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  SDLoc DL(GN);
+  int64_t Offset = GN->getOffset();
+  if (IsSmallObject(GV, *this)) {
+    // We can only fold positive offsets that are a multiple of the word size.
+    int64_t FoldedOffset = std::max(Offset & ~3, (int64_t)0);
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, FoldedOffset);
+    GA = getGlobalAddressWrapper(GA, GV, DAG);
+    // Handle the rest of the offset.
+    if (Offset != FoldedOffset) {
+      SDValue Remaining = DAG.getConstant(Offset - FoldedOffset, DL, MVT::i32);
+      GA = DAG.getNode(ISD::ADD, DL, MVT::i32, GA, Remaining);
+    }
+    return GA;
+  } else {
+    // Ideally we would not fold in offset with an index <= 11.
+    Type *Ty = Type::getInt8PtrTy(*DAG.getContext());
+    Constant *GA = ConstantExpr::getBitCast(const_cast<GlobalValue*>(GV), Ty);
+    Ty = Type::getInt32Ty(*DAG.getContext());
+    Constant *Idx = ConstantInt::get(Ty, Offset);
+    Constant *GAI = ConstantExpr::getGetElementPtr(
+        Type::getInt8Ty(*DAG.getContext()), GA, Idx);
+    SDValue CP = DAG.getConstantPool(GAI, MVT::i32);
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), DL,
+                       DAG.getEntryNode(), CP, MachinePointerInfo());
+  }
+}
+
+SDValue XCoreTargetLowering::
+LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc DL(Op);
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT);
+
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, PtrVT, Result);
+}
+
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really debug info here
+  SDLoc dl(CP);
+  EVT PtrVT = Op.getValueType();
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry()) {
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment(), CP->getOffset());
+  } else {
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment(), CP->getOffset());
+  }
+  return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
+}
+
+unsigned XCoreTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue XCoreTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  SDLoc dl(Op);
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned JTI = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  unsigned NumEntries = MJTI->getJumpTables()[JTI].MBBs.size();
+  if (NumEntries <= 32) {
+    return DAG.getNode(XCoreISD::BR_JT, dl, MVT::Other, Chain, TargetJT, Index);
+  }
+  assert((NumEntries >> 31) == 0);
+  SDValue ScaledIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                    DAG.getConstant(1, dl, MVT::i32));
+  return DAG.getNode(XCoreISD::BR_JT32, dl, MVT::Other, Chain, TargetJT,
+                     ScaledIndex);
+}
+
+SDValue XCoreTargetLowering::lowerLoadWordFromAlignedBasePlusOffset(
+    const SDLoc &DL, SDValue Chain, SDValue Base, int64_t Offset,
+    SelectionDAG &DAG) const {
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+  if ((Offset & 0x3) == 0) {
+    return DAG.getLoad(PtrVT, DL, Chain, Base, MachinePointerInfo());
+  }
+  // Lower to pair of consecutive word aligned loads plus some bit shifting.
+  int32_t HighOffset = alignTo(Offset, 4);
+  int32_t LowOffset = HighOffset - 4;
+  SDValue LowAddr, HighAddr;
+  if (GlobalAddressSDNode *GASD =
+        dyn_cast<GlobalAddressSDNode>(Base.getNode())) {
+    LowAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+                                   LowOffset);
+    HighAddr = DAG.getGlobalAddress(GASD->getGlobal(), DL, Base.getValueType(),
+                                    HighOffset);
+  } else {
+    LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+                          DAG.getConstant(LowOffset, DL, MVT::i32));
+    HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
+                           DAG.getConstant(HighOffset, DL, MVT::i32));
+  }
+  SDValue LowShift = DAG.getConstant((Offset - LowOffset) * 8, DL, MVT::i32);
+  SDValue HighShift = DAG.getConstant((HighOffset - Offset) * 8, DL, MVT::i32);
+
+  SDValue Low = DAG.getLoad(PtrVT, DL, Chain, LowAddr, MachinePointerInfo());
+  SDValue High = DAG.getLoad(PtrVT, DL, Chain, HighAddr, MachinePointerInfo());
+  SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
+  SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
+  SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
+  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                      High.getValue(1));
+  SDValue Ops[] = { Result, Chain };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
+{
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(Value, KnownZero, KnownOne);
+  return KnownZero.countTrailingOnes() >= 2;
+}
+
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Unexpected extension type");
+  assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+  if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(),
+                                     LD->getAddressSpace(),
+                                     LD->getAlignment()))
+    return SDValue();
+
+  auto &TD = DAG.getDataLayout();
+  unsigned ABIAlignment = TD.getABITypeAlignment(
+      LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned load alone.
+  if (LD->getAlignment() >= ABIAlignment)
+    return SDValue();
+
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  SDLoc DL(Op);
+
+  if (!LD->isVolatile()) {
+    const GlobalValue *GV;
+    int64_t Offset = 0;
+    if (DAG.isBaseWithConstantOffset(BasePtr) &&
+        isWordAligned(BasePtr->getOperand(0), DAG)) {
+      SDValue NewBasePtr = BasePtr->getOperand(0);
+      Offset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+      return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+                                                    Offset, DAG);
+    }
+    if (TLI.isGAPlusOffset(BasePtr.getNode(), GV, Offset) &&
+        MinAlign(GV->getAlignment(), 4) == 4) {
+      SDValue NewBasePtr = DAG.getGlobalAddress(GV, DL,
+                                                BasePtr->getValueType(0));
+      return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
+                                                    Offset, DAG);
+    }
+  }
+
+  if (LD->getAlignment() == 2) {
+    SDValue Low =
+        DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain, BasePtr,
+                       LD->getPointerInfo(), MVT::i16,
+                       /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+    SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, DL, MVT::i32));
+    SDValue High =
+        DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, HighAddr,
+                       LD->getPointerInfo().getWithOffset(2), MVT::i16,
+                       /* Alignment = */ 2, LD->getMemOperand()->getFlags());
+    SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
+                                      DAG.getConstant(16, DL, MVT::i32));
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  // Lower to a call to __misaligned_load(BasePtr).
+  Type *IntPtrTy = TD.getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+      CallingConv::C, IntPtrTy,
+      DAG.getExternalSymbol("__misaligned_load",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  SDValue Ops[] = { CallResult.first, CallResult.second };
+  return DAG.getMergeValues(Ops, DL);
+}
+
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const
+{
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+  assert(!ST->isTruncatingStore() && "Unexpected store type");
+  assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+  if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                     ST->getAddressSpace(),
+                                     ST->getAlignment())) {
+    return SDValue();
+  }
+  unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+      ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned store alone.
+  if (ST->getAlignment() >= ABIAlignment) {
+    return SDValue();
+  }
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  SDLoc dl(Op);
+
+  if (ST->getAlignment() == 2) {
+    SDValue Low = Value;
+    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+                                      DAG.getConstant(16, dl, MVT::i32));
+    SDValue StoreLow = DAG.getTruncStore(
+        Chain, dl, Low, BasePtr, ST->getPointerInfo(), MVT::i16,
+        /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, dl, MVT::i32));
+    SDValue StoreHigh = DAG.getTruncStore(
+        Chain, dl, High, HighAddr, ST->getPointerInfo().getWithOffset(2),
+        MVT::i16, /* Alignment = */ 2, ST->getMemOperand()->getFlags());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+  }
+
+  // Lower to a call to __misaligned_store(BasePtr, Value).
+  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  Entry.Node = Value;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__misaligned_store",
+                            getPointerTy(DAG.getDataLayout())),
+      std::move(Args));
+
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.second;
+}
+
+SDValue XCoreTargetLowering::
+LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), Zero, Zero,
+                           LHS, RHS);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue XCoreTargetLowering::
+LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
+         "Unexpected operand to lower!");
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), LHS, RHS,
+                           Zero, Zero);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// isADDADDMUL - Return whether Op is in a form that is equivalent to
+/// add(add(mul(x,y),a),b). If requireIntermediatesHaveOneUse is true then
+/// each intermediate result in the calculation must also have a single use.
+/// If the Op is in the correct form the constituent parts are written to Mul0,
+/// Mul1, Addend0 and Addend1.
+static bool
+isADDADDMUL(SDValue Op, SDValue &Mul0, SDValue &Mul1, SDValue &Addend0,
+            SDValue &Addend1, bool requireIntermediatesHaveOneUse)
+{
+  if (Op.getOpcode() != ISD::ADD)
+    return false;
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue AddOp;
+  SDValue OtherOp;
+  if (N0.getOpcode() == ISD::ADD) {
+    AddOp = N0;
+    OtherOp = N1;
+  } else if (N1.getOpcode() == ISD::ADD) {
+    AddOp = N1;
+    OtherOp = N0;
+  } else {
+    return false;
+  }
+  if (requireIntermediatesHaveOneUse && !AddOp.hasOneUse())
+    return false;
+  if (OtherOp.getOpcode() == ISD::MUL) {
+    // add(add(a,b),mul(x,y))
+    if (requireIntermediatesHaveOneUse && !OtherOp.hasOneUse())
+      return false;
+    Mul0 = OtherOp.getOperand(0);
+    Mul1 = OtherOp.getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = AddOp.getOperand(1);
+    return true;
+  }
+  if (AddOp.getOperand(0).getOpcode() == ISD::MUL) {
+    // add(add(mul(x,y),a),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(0).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(0).getOperand(0);
+    Mul1 = AddOp.getOperand(0).getOperand(1);
+    Addend0 = AddOp.getOperand(1);
+    Addend1 = OtherOp;
+    return true;
+  }
+  if (AddOp.getOperand(1).getOpcode() == ISD::MUL) {
+    // add(add(a,mul(x,y)),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(1).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(1).getOperand(0);
+    Mul1 = AddOp.getOperand(1).getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = OtherOp;
+    return true;
+  }
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
+{
+  SDValue Mul;
+  SDValue Other;
+  if (N->getOperand(0).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(0);
+    Other = N->getOperand(1);
+  } else if (N->getOperand(1).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(1);
+    Other = N->getOperand(0);
+  } else {
+    return SDValue();
+  }
+  SDLoc dl(N);
+  SDValue LL, RL, AddendL, AddendH;
+  LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0), DAG.getConstant(0, dl, MVT::i32));
+  RL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
+  AddendL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other, DAG.getConstant(0, dl, MVT::i32));
+  AddendH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other, DAG.getConstant(1, dl, MVT::i32));
+  APInt HighMask = APInt::getHighBitsSet(64, 32);
+  unsigned LHSSB = DAG.ComputeNumSignBits(Mul.getOperand(0));
+  unsigned RHSSB = DAG.ComputeNumSignBits(Mul.getOperand(1));
+  if (DAG.MaskedValueIsZero(Mul.getOperand(0), HighMask) &&
+      DAG.MaskedValueIsZero(Mul.getOperand(1), HighMask)) {
+    // The inputs are both zero-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  if (LHSSB > 32 && RHSSB > 32) {
+    // The inputs are both sign-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  SDValue LH, RH;
+  LH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0), DAG.getConstant(1, dl, MVT::i32));
+  RH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1), DAG.getConstant(1, dl, MVT::i32));
+  SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                           AddendL, LL, RL);
+  SDValue Lo(Hi.getNode(), 1);
+  RH = DAG.getNode(ISD::MUL, dl, MVT::i32, LL, RH);
+  LH = DAG.getNode(ISD::MUL, dl, MVT::i32, LH, RL);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, RH);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, LH);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
+{
+  assert(N->getValueType(0) == MVT::i64 &&
+         (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+        "Unknown operand to lower!");
+
+  if (N->getOpcode() == ISD::ADD)
+    if (SDValue Result = TryExpandADDWithMul(N, DAG))
+      return Result;
+
+  SDLoc dl(N);
+
+  // Extract components
+  SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(0),
+                             DAG.getConstant(0, dl, MVT::i32));
+  SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(0),
+                             DAG.getConstant(1, dl, MVT::i32));
+  SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1),
+                             DAG.getConstant(0, dl, MVT::i32));
+  SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1),
+                             DAG.getConstant(1, dl, MVT::i32));
+
+  // Expand
+  unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+                                                   XCoreISD::LSUB;
+  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
+  SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSL, RHSL, Zero);
+  SDValue Carry(Lo.getNode(), 1);
+
+  SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSH, RHSH, Carry);
+  SDValue Ignored(Hi.getNode(), 1);
+  // Merge the pieces
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG) const
+{
+  // Whist llvm does not support aggregate varargs we can ignore
+  // the possibility of the ValueType being an implicit byVal vararg.
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0); // not an aggregate
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+  SDValue VAList =
+      DAG.getLoad(PtrVT, dl, InChain, VAListPtr, MachinePointerInfo(SV));
+  // Increment the pointer, VAList, to the next vararg
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAList,
+                                DAG.getIntPtrConstant(VT.getSizeInBits() / 8,
+                                                      dl));
+  // Store the incremented VAList to the legalized pointer
+  InChain = DAG.getStore(VAList.getValue(1), dl, nextPtr, VAListPtr,
+                         MachinePointerInfo(SV));
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo());
+}
+
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG) const
+{
+  SDLoc dl(Op);
+  // vastart stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32);
+  return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1),
+                      MachinePointerInfo());
+}
+
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // This nodes represent llvm.frameaddress on the DAG.
+  // It takes one operand, the index of the frame address to return.
+  // An index of zero corresponds to the current function's frame address.
+  // An index of one to the parent's frame address, and so on.
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op),
+                            RegInfo->getFrameRegister(MF), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  // This nodes represent llvm.returnaddress on the DAG.
+  // It takes one operand, the index of the return address to return.
+  // An index of zero corresponds to the current function's return address.
+  // An index of one to the parent's return address, and so on.
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  int FI = XFI->createLRSpillSlot(MF);
+  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+  return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                     DAG.getEntryNode(), FIN,
+                     MachinePointerInfo::getFixedStack(MF, FI));
+}
+
+SDValue XCoreTargetLowering::
+LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const {
+  // This node represents offset from frame pointer to first on-stack argument.
+  // This is needed for correct stack adjustment during unwind.
+  // However, we don't know the offset until after the frame has be finalised.
+  // This is done during the XCoreFTAOElim pass.
+  return DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, SDLoc(Op), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  // OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER)
+  // This node represents 'eh_return' gcc dwarf builtin, which is used to
+  // return from exception. The general meaning is: adjust stack by OFFSET and
+  // pass execution to HANDLER.
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  SDLoc dl(Op);
+
+  // Absolute SP = (FP + FrameToArgs) + Offset
+  const TargetRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  SDValue Stack = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                            RegInfo->getFrameRegister(MF), MVT::i32);
+  SDValue FrameToArgs = DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, dl,
+                                    MVT::i32);
+  Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, FrameToArgs);
+  Stack = DAG.getNode(ISD::ADD, dl, MVT::i32, Stack, Offset);
+
+  // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
+  // which leaves 2 caller saved registers, R2 & R3 for us to use.
+  unsigned StackReg = XCore::R2;
+  unsigned HandlerReg = XCore::R3;
+
+  SDValue OutChains[] = {
+    DAG.getCopyToReg(Chain, dl, StackReg, Stack),
+    DAG.getCopyToReg(Chain, dl, HandlerReg, Handler)
+  };
+
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+
+  return DAG.getNode(XCoreISD::EH_RETURN, dl, MVT::Other, Chain,
+                     DAG.getRegister(StackReg, MVT::i32),
+                     DAG.getRegister(HandlerReg, MVT::i32));
+
+}
+
+SDValue XCoreTargetLowering::
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // .align 4
+  // LDAPF_u10 r11, nest
+  // LDW_2rus r11, r11[0]
+  // STWSP_ru6 r11, sp[0]
+  // LDAPF_u10 r11, fptr
+  // LDW_2rus r11, r11[0]
+  // BAU_1r r11
+  // nest:
+  // .word nest
+  // fptr:
+  // .word fptr
+  SDValue OutChains[5];
+
+  SDValue Addr = Trmp;
+
+  SDLoc dl(Op);
+  OutChains[0] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(4, dl, MVT::i32));
+  OutChains[1] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0xd80456c0, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 4));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(8, dl, MVT::i32));
+  OutChains[2] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0x27fb0a3c, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 8));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(12, dl, MVT::i32));
+  OutChains[3] =
+      DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(16, dl, MVT::i32));
+  OutChains[4] =
+      DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 16));
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+SDValue XCoreTargetLowering::
+LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+    case Intrinsic::xcore_crc8:
+      EVT VT = Op.getValueType();
+      SDValue Data =
+        DAG.getNode(XCoreISD::CRC8, DL, DAG.getVTList(VT, VT),
+                    Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3));
+      SDValue Crc(Data.getNode(), 1);
+      SDValue Results[] = { Crc, Data };
+      return DAG.getMergeValues(Results, DL);
+  }
+  return SDValue();
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *N = cast<AtomicSDNode>(Op);
+  assert(N->getOpcode() == ISD::ATOMIC_LOAD && "Bad Atomic OP");
+  assert((N->getOrdering() == AtomicOrdering::Unordered ||
+          N->getOrdering() == AtomicOrdering::Monotonic) &&
+         "setInsertFencesForAtomic(true) expects unordered / monotonic");
+  if (N->getMemoryVT() == MVT::i32) {
+    if (N->getAlignment() < 4)
+      report_fatal_error("atomic load must be aligned");
+    return DAG.getLoad(getPointerTy(DAG.getDataLayout()), SDLoc(Op),
+                       N->getChain(), N->getBasePtr(), N->getPointerInfo(),
+                       N->getAlignment(), N->getMemOperand()->getFlags(),
+                       N->getAAInfo(), N->getRanges());
+  }
+  if (N->getMemoryVT() == MVT::i16) {
+    if (N->getAlignment() < 2)
+      report_fatal_error("atomic load must be aligned");
+    return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+                          N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+                          N->getAlignment(), N->getMemOperand()->getFlags(),
+                          N->getAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i8)
+    return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
+                          N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+                          N->getAlignment(), N->getMemOperand()->getFlags(),
+                          N->getAAInfo());
+  return SDValue();
+}
+
+SDValue XCoreTargetLowering::
+LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *N = cast<AtomicSDNode>(Op);
+  assert(N->getOpcode() == ISD::ATOMIC_STORE && "Bad Atomic OP");
+  assert((N->getOrdering() == AtomicOrdering::Unordered ||
+          N->getOrdering() == AtomicOrdering::Monotonic) &&
+         "setInsertFencesForAtomic(true) expects unordered / monotonic");
+  if (N->getMemoryVT() == MVT::i32) {
+    if (N->getAlignment() < 4)
+      report_fatal_error("atomic store must be aligned");
+    return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(), N->getBasePtr(),
+                        N->getPointerInfo(), N->getAlignment(),
+                        N->getMemOperand()->getFlags(), N->getAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i16) {
+    if (N->getAlignment() < 2)
+      report_fatal_error("atomic store must be aligned");
+    return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+                             N->getBasePtr(), N->getPointerInfo(), MVT::i16,
+                             N->getAlignment(), N->getMemOperand()->getFlags(),
+                             N->getAAInfo());
+  }
+  if (N->getMemoryVT() == MVT::i8)
+    return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
+                             N->getBasePtr(), N->getPointerInfo(), MVT::i8,
+                             N->getAlignment(), N->getMemOperand()->getFlags(),
+                             N->getAAInfo());
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore call implementation
+SDValue
+XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // XCore target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // For now, only CallingConv::C implemented
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::Fast:
+    case CallingConv::C:
+      return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                            Outs, OutVals, Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers / memory locations.
+static SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                               const SmallVectorImpl<CCValAssign> &RVLocs,
+                               const SDLoc &dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+  SmallVector<std::pair<int, unsigned>, 4> ResultMemLocs;
+  // Copy results out of physical registers.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    const CCValAssign &VA = RVLocs[i];
+    if (VA.isRegLoc()) {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getValVT(),
+                                 InFlag).getValue(1);
+      InFlag = Chain.getValue(2);
+      InVals.push_back(Chain.getValue(0));
+    } else {
+      assert(VA.isMemLoc());
+      ResultMemLocs.push_back(std::make_pair(VA.getLocMemOffset(),
+                                             InVals.size()));
+      // Reserve space for this result.
+      InVals.push_back(SDValue());
+    }
+  }
+
+  // Copy results out of memory.
+  SmallVector<SDValue, 4> MemOpChains;
+  for (unsigned i = 0, e = ResultMemLocs.size(); i != e; ++i) {
+    int offset = ResultMemLocs[i].first;
+    unsigned index = ResultMemLocs[i].second;
+    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Ops[] = { Chain, DAG.getConstant(offset / 4, dl, MVT::i32) };
+    SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops);
+    InVals[index] = load;
+    MemOpChains.push_back(load.getValue(1));
+  }
+
+  // Transform all loads nodes into one single node because
+  // all load nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  return Chain;
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isTailCall, sret.
+SDValue XCoreTargetLowering::LowerCCCCallTo(
+    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
+    bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  // The ABI dictates there should be one stack slot available to the callee
+  // on function entry (for saving lr).
+  CCInfo.AllocateStack(4, 4);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  // Analyze return values to determine the number of bytes of stack required.
+  CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
+  RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+  RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = RetCCInfo.getNextStackOffset();
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      int Offset = VA.getLocMemOffset();
+
+      MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other,
+                                        Chain, Arg,
+                                        DAG.getConstant(Offset/4, dl,
+                                                        MVT::i32)));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, dl, PtrVT, true),
+                             DAG.getConstant(0, dl, PtrVT, true), InFlag, dl);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, RVLocs, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct ArgDataPair { SDValue SDV; ISD::ArgFlagsTy Flags; };
+}
+
+/// XCore formal arguments implementation
+SDValue XCoreTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::Fast:
+      return LowerCCCArguments(Chain, CallConv, isVarArg,
+                               Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+/// TODO: sret
+SDValue XCoreTargetLowering::LowerCCCArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
+
+  unsigned StackSlotSize = XCoreFrameLowering::stackSlotSize();
+
+  unsigned LRSaveSize = StackSlotSize;
+
+  if (!isVarArg)
+    XFI->setReturnStackOffset(CCInfo.getNextStackOffset() + LRSaveSize);
+
+  // All getCopyFromReg ops must precede any getMemcpys to prevent the
+  // scheduler clobbering a register before it has been copied.
+  // The stages are:
+  // 1. CopyFromReg (and load) arg & vararg registers.
+  // 2. Chain CopyFromReg nodes into a TokenFactor.
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // 4. Chain mem ops nodes into a TokenFactor.
+  SmallVector<SDValue, 4> CFRegNode;
+  SmallVector<ArgDataPair, 4> ArgData;
+  SmallVector<SDValue, 4> MemOps;
+
+  // 1a. CopyFromReg (and load) arg registers.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+    CCValAssign &VA = ArgLocs[i];
+    SDValue ArgIn;
+
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+                 << RegVT.getEVTString() << "\n";
+#endif
+          llvm_unreachable(nullptr);
+        }
+      case MVT::i32:
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+        CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
+      }
+    } else {
+      // sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register
+      unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+      if (ObjSize > StackSlotSize) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << EVT(VA.getLocVT()).getEVTString()
+               << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI.CreateFixedObject(ObjSize,
+                                     LRSaveSize + VA.getLocMemOffset(),
+                                     true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                          MachinePointerInfo::getFixedStack(MF, FI));
+    }
+    const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
+    ArgData.push_back(ADP);
+  }
+
+  // 1b. CopyFromReg vararg registers.
+  if (isVarArg) {
+    // Argument registers
+    static const MCPhysReg ArgRegs[] = {
+      XCore::R0, XCore::R1, XCore::R2, XCore::R3
+    };
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs);
+    if (FirstVAReg < array_lengthof(ArgRegs)) {
+      int offset = 0;
+      // Save remaining registers, storing higher register numbers at a higher
+      // address
+      for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
+        // Create a stack slot
+        int FI = MFI.CreateFixedObject(4, offset, true);
+        if (i == (int)FirstVAReg) {
+          XFI->setVarArgsFrameIndex(FI);
+        }
+        offset -= StackSlotSize;
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+        // Move argument from phys reg -> virt reg
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(ArgRegs[i], VReg);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
+        // Move argument from virt reg -> stack
+        SDValue Store =
+            DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+        MemOps.push_back(Store);
+      }
+    } else {
+      // This will point to the next argument passed via stack.
+      XFI->setVarArgsFrameIndex(
+        MFI.CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(),
+                              true));
+    }
+  }
+
+  // 2. chain CopyFromReg nodes into a TokenFactor.
+  if (!CFRegNode.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, CFRegNode);
+
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // Aggregates passed "byVal" need to be copied by the callee.
+  // The callee will use a pointer to this copy, rather than the original
+  // pointer.
+  for (SmallVectorImpl<ArgDataPair>::const_iterator ArgDI = ArgData.begin(),
+                                                    ArgDE = ArgData.end();
+       ArgDI != ArgDE; ++ArgDI) {
+    if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
+      unsigned Size = ArgDI->Flags.getByValSize();
+      unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
+      // Create a new object on the stack and copy the pointee into it.
+      int FI = MFI.CreateStackObject(Size, Align, false);
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(FIN);
+      MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
+                                     DAG.getConstant(Size, dl, MVT::i32),
+                                     Align, false, false, false,
+                                     MachinePointerInfo(),
+                                     MachinePointerInfo()));
+    } else {
+      InVals.push_back(ArgDI->SDV);
+    }
+  }
+
+  // 4, chain mem ops nodes into a TokenFactor.
+  if (!MemOps.empty()) {
+    MemOps.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool XCoreTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+               bool isVarArg,
+               const SmallVectorImpl<ISD::OutputArg> &Outs,
+               LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  if (!CCInfo.CheckReturn(Outs, RetCC_XCore))
+    return false;
+  if (CCInfo.getNextStackOffset() != 0 && isVarArg)
+    return false;
+  return true;
+}
+
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SDLoc &dl, SelectionDAG &DAG) const {
+
+  XCoreFunctionInfo *XFI =
+    DAG.getMachineFunction().getInfo<XCoreFunctionInfo>();
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze return values.
+  if (!isVarArg)
+    CCInfo.AllocateStack(XFI->getReturnStackOffset(), 4);
+
+  CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Return on XCore is always a "retsp 0"
+  RetOps.push_back(DAG.getConstant(0, dl, MVT::i32));
+
+  SmallVector<SDValue, 4> MemOpChains;
+  // Handle return values that must be copied to memory.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    if (VA.isRegLoc())
+      continue;
+    assert(VA.isMemLoc());
+    if (isVarArg) {
+      report_fatal_error("Can't return value from vararg function in memory");
+    }
+
+    int Offset = VA.getLocMemOffset();
+    unsigned ObjSize = VA.getLocVT().getSizeInBits() / 8;
+    // Create the frame index object for the memory location.
+    int FI = MFI.CreateFixedObject(ObjSize, Offset, false);
+
+    // Create a SelectionDAG node corresponding to a store
+    // to this memory location.
+    SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+    MemOpChains.push_back(DAG.getStore(
+        Chain, dl, OutVals[i], FIN,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
+  }
+
+  // Transform all store nodes into one single node because
+  // all stores are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
+  // Now handle return values copied to registers.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    if (!VA.isRegLoc())
+      continue;
+    // Copy the result values into the output registers.
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+                                                 MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc dl = MI.getDebugLoc();
+  assert((MI.getOpcode() == XCore::SELECT_CC) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = ++BB->getIterator();
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+      .addReg(MI.getOperand(1).getReg())
+      .addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(XCore::PHI), MI.getOperand(0).getReg())
+      .addReg(MI.getOperand(3).getReg())
+      .addMBB(copy0MBB)
+      .addReg(MI.getOperand(2).getReg())
+      .addMBB(thisMBB);
+
+  MI.eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INTRINSIC_VOID:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::xcore_outt:
+    case Intrinsic::xcore_outct:
+    case Intrinsic::xcore_chkct: {
+      SDValue OutVal = N->getOperand(3);
+      // These instructions ignore the high bits.
+      if (OutVal.hasOneUse()) {
+        unsigned BitWidth = OutVal.getValueSizeInBits();
+        APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
+        APInt KnownZero, KnownOne;
+        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                              !DCI.isBeforeLegalizeOps());
+        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+        if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
+            TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
+                                     TLO))
+          DCI.CommitTargetLoweringOpt(TLO);
+      }
+      break;
+    }
+    case Intrinsic::xcore_setpt: {
+      SDValue Time = N->getOperand(3);
+      // This instruction ignores the high bits.
+      if (Time.hasOneUse()) {
+        unsigned BitWidth = Time.getValueSizeInBits();
+        APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+        APInt KnownZero, KnownOne;
+        TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                              !DCI.isBeforeLegalizeOps());
+        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+        if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
+            TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
+                                     TLO))
+          DCI.CommitTargetLoweringOpt(TLO);
+      }
+      break;
+    }
+    }
+    break;
+  case XCoreISD::LADD: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // canonicalize constant to RHS
+    if (N0C && !N1C)
+      return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N1, N0, N2);
+
+    // fold (ladd 0, 0, x) -> 0, x & 1
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      SDValue Carry = DAG.getConstant(0, dl, VT);
+      SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
+                                   DAG.getConstant(1, dl, VT));
+      SDValue Ops[] = { Result, Carry };
+      return DAG.getMergeValues(Ops, dl);
+    }
+
+    // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Carry = DAG.getConstant(0, dl, VT);
+        SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Carry };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LSUB: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = N2;
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
+                                     DAG.getConstant(0, dl, VT), N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+
+    // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = DAG.getConstant(0, dl, VT);
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LMUL: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+    // Canonicalize multiplicative constant to RHS. If both multiplicative
+    // operands are constant canonicalize smallest to RHS.
+    if ((N0C && !N1C) ||
+        (N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
+      return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+                         N1, N0, N2, N3);
+
+    // lmul(x, 0, a, b)
+    if (N1C && N1C->isNullValue()) {
+      // If the high result is unused fold to add(a, b)
+      if (N->hasNUsesOfValue(0, 0)) {
+        SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
+        SDValue Ops[] = { Lo, Lo };
+        return DAG.getMergeValues(Ops, dl);
+      }
+      // Otherwise fold to ladd(a, b, 0)
+      SDValue Result =
+        DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1);
+      SDValue Carry(Result.getNode(), 1);
+      SDValue Ops[] = { Carry, Result };
+      return DAG.getMergeValues(Ops, dl);
+    }
+  }
+  break;
+  case ISD::ADD: {
+    // Fold 32 bit expressions such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b). The high result of lmul will be ignored.
+    // This is only profitable if the intermediate results are unused
+    // elsewhere.
+    SDValue Mul0, Mul1, Addend0, Addend1;
+    if (N->getValueType(0) == MVT::i32 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, true)) {
+      SDValue Ignored = DAG.getNode(XCoreISD::LMUL, dl,
+                                    DAG.getVTList(MVT::i32, MVT::i32), Mul0,
+                                    Mul1, Addend0, Addend1);
+      SDValue Result(Ignored.getNode(), 1);
+      return Result;
+    }
+    APInt HighMask = APInt::getHighBitsSet(64, 32);
+    // Fold 64 bit expression such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b) if all operands are zero-extended. We do this
+    // before type legalization as it is messy to match the operands after
+    // that.
+    if (N->getValueType(0) == MVT::i64 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, false) &&
+        DAG.MaskedValueIsZero(Mul0, HighMask) &&
+        DAG.MaskedValueIsZero(Mul1, HighMask) &&
+        DAG.MaskedValueIsZero(Addend0, HighMask) &&
+        DAG.MaskedValueIsZero(Addend1, HighMask)) {
+      SDValue Mul0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul0, DAG.getConstant(0, dl, MVT::i32));
+      SDValue Mul1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul1, DAG.getConstant(0, dl, MVT::i32));
+      SDValue Addend0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend0, DAG.getConstant(0, dl, MVT::i32));
+      SDValue Addend1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend1, DAG.getConstant(0, dl, MVT::i32));
+      SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                               DAG.getVTList(MVT::i32, MVT::i32), Mul0L, Mul1L,
+                               Addend0L, Addend1L);
+      SDValue Lo(Hi.getNode(), 1);
+      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+    }
+  }
+  break;
+  case ISD::STORE: {
+    // Replace unaligned store of unaligned load with memmove.
+    StoreSDNode *ST  = cast<StoreSDNode>(N);
+    if (!DCI.isBeforeLegalize() ||
+        allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                       ST->getAddressSpace(),
+                                       ST->getAlignment()) ||
+        ST->isVolatile() || ST->isIndexed()) {
+      break;
+    }
+    SDValue Chain = ST->getChain();
+
+    unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+    assert((StoreBits % 8) == 0 &&
+           "Store size in bits must be a multiple of 8");
+    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(
+        ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+    unsigned Alignment = ST->getAlignment();
+    if (Alignment >= ABIAlignment) {
+      break;
+    }
+
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+      if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+        LD->getAlignment() == Alignment &&
+        !LD->isVolatile() && !LD->isIndexed() &&
+        Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+        bool isTail = isInTailCallPosition(DAG, ST, Chain);
+        return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+                              LD->getBasePtr(),
+                              DAG.getConstant(StoreBits/8, dl, MVT::i32),
+                              Alignment, false, isTail, ST->getPointerInfo(),
+                              LD->getPointerInfo());
+      }
+    }
+    break;
+  }
+  }
+  return SDValue();
+}
+
+void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                        APInt &KnownZero,
+                                                        APInt &KnownOne,
+                                                        const SelectionDAG &DAG,
+                                                        unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case XCoreISD::LADD:
+  case XCoreISD::LSUB:
+    if (Op.getResNo() == 1) {
+      // Top bits of carry / borrow are clear.
+      KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                        KnownZero.getBitWidth() - 1);
+    }
+    break;
+  case ISD::INTRINSIC_W_CHAIN:
+    {
+      unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      case Intrinsic::xcore_getts:
+        // High bits are known to be zero.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 16);
+        break;
+      case Intrinsic::xcore_int:
+      case Intrinsic::xcore_inct:
+        // High bits are known to be zero.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 8);
+        break;
+      case Intrinsic::xcore_testct:
+        // Result is either 0 or 1.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 1);
+        break;
+      case Intrinsic::xcore_testwct:
+        // Result is in the range 0 - 4.
+        KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                          KnownZero.getBitWidth() - 3);
+        break;
+      }
+    }
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
+static inline bool isImmUs(int64_t val)
+{
+  return (val >= 0 && val <= 11);
+}
+
+static inline bool isImmUs2(int64_t val)
+{
+  return (val%2 == 0 && isImmUs(val/2));
+}
+
+static inline bool isImmUs4(int64_t val)
+{
+  return (val%4 == 0 && isImmUs(val/4));
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool XCoreTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                                const AddrMode &AM, Type *Ty,
+                                                unsigned AS) const {
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
+
+  unsigned Size = DL.getTypeAllocSize(Ty);
+  if (AM.BaseGV) {
+    return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
+                 AM.BaseOffs%4 == 0;
+  }
+
+  switch (Size) {
+  case 1:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs(AM.BaseOffs);
+    }
+    // reg + reg
+    return AM.Scale == 1 && AM.BaseOffs == 0;
+  case 2:
+  case 3:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs2(AM.BaseOffs);
+    }
+    // reg + reg<<1
+    return AM.Scale == 2 && AM.BaseOffs == 0;
+  default:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs4(AM.BaseOffs);
+    }
+    // reg + reg<<2
+    return AM.Scale == 4 && AM.BaseOffs == 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass *>
+XCoreTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                                  StringRef Constraint,
+                                                  MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default : break;
+    case 'r':
+      return std::make_pair(0U, &XCore::GRRegsRegClass);
+    }
+  }
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 000000000000..41813bbb8156
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,234 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
+
+#include "XCore.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+  // Forward delcarations
+  class XCoreSubtarget;
+  class XCoreTargetMachine;
+
+  namespace XCoreISD {
+    enum NodeType : unsigned {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Branch and link (call)
+      BL,
+
+      // pc relative address
+      PCRelativeWrapper,
+
+      // dp relative address
+      DPRelativeWrapper,
+
+      // cp relative address
+      CPRelativeWrapper,
+
+      // Load word from stack
+      LDWSP,
+
+      // Store word to stack
+      STWSP,
+
+      // Corresponds to retsp instruction
+      RETSP,
+
+      // Corresponds to LADD instruction
+      LADD,
+
+      // Corresponds to LSUB instruction
+      LSUB,
+
+      // Corresponds to LMUL instruction
+      LMUL,
+
+      // Corresponds to MACCU instruction
+      MACCU,
+
+      // Corresponds to MACCS instruction
+      MACCS,
+
+      // Corresponds to CRC8 instruction
+      CRC8,
+
+      // Jumptable branch.
+      BR_JT,
+
+      // Jumptable branch using long branches for each entry.
+      BR_JT32,
+
+      // Offset from frame pointer to the first (possible) on-stack argument
+      FRAME_TO_ARGS_OFFSET,
+
+      // Exception handler return. The stack is restored to the first
+      // followed by a jump to the second argument.
+      EH_RETURN,
+
+      // Memory barrier.
+      MEMBARRIER
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class XCoreTargetLowering : public TargetLowering
+  {
+  public:
+    explicit XCoreTargetLowering(const TargetMachine &TM,
+                                 const XCoreSubtarget &Subtarget);
+
+    using TargetLowering::isZExtFree;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+
+    unsigned getJumpTableEncoding() const override;
+    MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override {
+      return MVT::i32;
+    }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    const char *getTargetNodeName(unsigned Opcode) const override;
+
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr &MI,
+                                MachineBasicBlock *MBB) const override;
+
+    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
+                               Type *Ty, unsigned AS) const override;
+
+    /// If a physical register, this returns the register that receives the
+    /// exception address on entry to an EH pad.
+    unsigned
+    getExceptionPointerRegister(const Constant *PersonalityFn) const override {
+      return XCore::R0;
+    }
+
+    /// If a physical register, this returns the register that receives the
+    /// exception typeid on entry to a landing pad.
+    unsigned
+    getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
+      return XCore::R1;
+    }
+
+  private:
+    const TargetMachine &TM;
+    const XCoreSubtarget &Subtarget;
+
+    // Lower Operand helpers
+    SDValue LowerCCCArguments(SDValue Chain, CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              const SDLoc &dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           const SDLoc &dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+    SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
+                                    SelectionDAG &DAG) const;
+    SDValue lowerLoadWordFromAlignedBasePlusOffset(const SDLoc &DL,
+                                                   SDValue Chain, SDValue Base,
+                                                   int64_t Offset,
+                                                   SelectionDAG &DAG) const;
+
+    // Lower Operand specifics
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+
+    // Inline asm support
+    std::pair<unsigned, const TargetRegisterClass *>
+    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                                 StringRef Constraint, MVT VT) const override;
+
+    // Expand specifics
+    SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
+    SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    SDValue
+    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         const SDLoc &dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        const SDLoc &dl, SelectionDAG &DAG) const override;
+
+    bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                     bool isVarArg,
+                     const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+                     LLVMContext &Context) const override;
+    bool shouldInsertFencesForAtomic(const Instruction *I) const override {
+      return true;
+    }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 000000000000..379cc39aa617
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,277 @@
+//===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+class InstXCore<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "XCore";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+  let Size = sz;
+  field bits<32> SoftFail = 0;
+}
+
+// XCore pseudo instructions format
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstXCore<0, outs, ins, asmstr, pattern> {
+  let isPseudo = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+class _F3R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode3RInstruction";
+}
+
+// 3R with first operand as an immediate. Used for TSETR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F3RImm<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode3RImmInstruction";
+}
+
+class _FL3R<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL3RInstruction";
+}
+
+// L3R with first operand as both a source and a destination.
+class _FL3RSrcDst<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : _FL3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL3RSrcDstInstruction";
+}
+
+class _F2RUS<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode2RUSInstruction";
+}
+
+// 2RUS with bitp operand
+class _F2RUSBitp<bits<5> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern>
+    : _F2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RUSBitpInstruction";
+}
+
+class _FL2RUS<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL2RUSInstruction";
+}
+
+// L2RUS with bitp operand
+class _FL2RUSBitp<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL2RUSBitpInstruction";
+}
+
+class _FRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<6> b;
+
+  let Inst{15-10} = opc;
+  let Inst{9-6} = a;
+  let Inst{5-0} = b;
+}
+
+class _FLRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<16> b;
+
+  let Inst{31-26} = opc;
+  let Inst{25-22} = a;
+  let Inst{21-16} = b{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = b{15-6};
+}
+
+class _FU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<6> a;
+
+  let Inst{15-6} = opc;
+  let Inst{5-0} = a;
+}
+
+class _FLU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<16> a;
+
+  let Inst{31-22} = opc;
+  let Inst{21-16} = a{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{15-6};
+}
+
+class _FU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<10> a;
+
+  let Inst{15-10} = opc;
+  let Inst{9-0} = a;
+}
+
+class _FLU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<20> a;
+
+  let Inst{31-26} = opc;
+  let Inst{25-16} = a{9-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{19-10};
+}
+
+class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "Decode2RInstruction";
+}
+
+// 2R with first operand as an immediate. Used for TSETMR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F2RImm<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RImmInstruction";
+}
+
+// 2R with first operand as both a source and a destination.
+class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RSrcDstInstruction";
+}
+
+// Same as 2R with last two operands swapped
+class _FR2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeR2RInstruction";
+}
+
+class _FRUS<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeRUSInstruction";
+}
+
+// RUS with bitp operand
+class _FRUSBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSBitpInstruction";
+}
+
+// RUS with first operand as both a source and a destination and a bitp second
+// operand
+class _FRUSSrcDstBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                      list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSSrcDstBitpInstruction";
+}
+
+class _FL2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{9-5};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{4-1};
+
+  let Inst{15-11} = 0b11111;
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeL2RInstruction";
+}
+
+// Same as L2R with last two operands swapped
+class _FLR2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _FL2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeLR2RInstruction";
+}
+
+class _F1R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+
+  let Inst{15-11} = opc{5-1};
+  let Inst{10-5} = 0b111111;
+  let Inst{4} = opc{0};
+  let Inst{3-0} = a;
+}
+
+class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{9-5};
+  let Inst{10-5} = 0b111111;
+  let Inst{4-0} = opc{4-0};
+}
+
+class _FL4R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> d;
+
+  let Inst{31-27} = opc{5-1};
+  let Inst{26-21} = 0b111111;
+  let Inst{20} = opc{0};
+  let Inst{19-16} = d;
+  let Inst{15-11} = 0b11111;
+}
+
+// L4R with 4th operand as both a source and a destination.
+class _FL4RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstInstruction";
+}
+
+// L4R with 1st and 4th operand as both a source and a destination.
+class _FL4RSrcDstSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                        list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstSrcDstInstruction";
+}
+
+class _FL5R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{5-1};
+  let Inst{20} = opc{0};
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL5RInstruction";
+}
+
+class _FL6R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc;
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL6RInstruction";
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 000000000000..7a9c6fc93f8a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,451 @@
+//===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+namespace XCore {
+
+  // XCore Condition Codes
+  enum CondCode {
+    COND_TRUE,
+    COND_FALSE,
+    COND_INVALID
+  };
+}
+}
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {}
+
+XCoreInstrInfo::XCoreInstrInfo()
+  : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    RI() {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                             int &FrameIndex) const {
+  int Opcode = MI.getOpcode();
+  if (Opcode == XCore::LDWFI) 
+  {
+    if ((MI.getOperand(1).isFI()) &&  // is a stack slot
+        (MI.getOperand(2).isImm()) && // the imm is zero
+        (isZeroImm(MI.getOperand(2)))) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+unsigned XCoreInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                            int &FrameIndex) const {
+  int Opcode = MI.getOpcode();
+  if (Opcode == XCore::STWFI)
+  {
+    if ((MI.getOperand(1).isFI()) &&  // is a stack slot
+        (MI.getOperand(2).isImm()) && // the imm is zero
+        (isZeroImm(MI.getOperand(2)))) {
+      FrameIndex = MI.getOperand(1).getIndex();
+      return MI.getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+static inline bool IsBRU(unsigned BrOpc) {
+  return BrOpc == XCore::BRFU_u6
+      || BrOpc == XCore::BRFU_lu6
+      || BrOpc == XCore::BRBU_u6
+      || BrOpc == XCore::BRBU_lu6;
+}
+
+static inline bool IsBRT(unsigned BrOpc) {
+  return BrOpc == XCore::BRFT_ru6
+      || BrOpc == XCore::BRFT_lru6
+      || BrOpc == XCore::BRBT_ru6
+      || BrOpc == XCore::BRBT_lru6;
+}
+
+static inline bool IsBRF(unsigned BrOpc) {
+  return BrOpc == XCore::BRFF_ru6
+      || BrOpc == XCore::BRFF_lru6
+      || BrOpc == XCore::BRBF_ru6
+      || BrOpc == XCore::BRBF_lru6;
+}
+
+static inline bool IsCondBranch(unsigned BrOpc) {
+  return IsBRF(BrOpc) || IsBRT(BrOpc);
+}
+
+static inline bool IsBR_JT(unsigned BrOpc) {
+  return BrOpc == XCore::BR_JT
+      || BrOpc == XCore::BR_JT32;
+}
+
+/// GetCondFromBranchOpc - Return the XCore CC that matches 
+/// the correspondent Branch instruction opcode.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) 
+{
+  if (IsBRT(BrOpc)) {
+    return XCore::COND_TRUE;
+  } else if (IsBRF(BrOpc)) {
+    return XCore::COND_FALSE;
+  } else {
+    return XCore::COND_INVALID;
+  }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) 
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::BRFT_lru6;
+  case XCore::COND_FALSE  : return XCore::BRFF_lru6;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified 
+/// condition, e.g. turning COND_E to COND_NE.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::COND_FALSE;
+  case XCore::COND_FALSE  : return XCore::COND_TRUE;
+  }
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that removeBranch and insertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool XCoreInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return false;
+
+  if (!isUnpredicatedTerminator(*I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = &*I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+    if (IsBRU(LastInst->getOpcode())) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    
+    XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == XCore::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+    
+    // Conditional branch
+    // Block ends with fall-through condbranch.
+
+    TBB = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(LastInst->getOperand(0));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = &*I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+    return true;
+  
+  unsigned SecondLastOpc    = SecondLastInst->getOpcode();
+  XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+  
+  // If the block ends with conditional branch followed by unconditional,
+  // handle it.
+  if (BranchCode != XCore::COND_INVALID
+    && IsBRU(LastInst->getOpcode())) {
+
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(SecondLastInst->getOperand(0));
+
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (IsBRU(SecondLastInst->getOpcode()) && 
+      IsBRU(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Likewise if it ends with a branch table followed by an unconditional branch.
+  if (IsBR_JT(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned XCoreInstrInfo::insertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                                      ArrayRef<MachineOperand> Cond,
+                                      const DebugLoc &DL,
+                                      int *BytesAdded) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "insertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "Unexpected number of components!");
+  assert(!BytesAdded && "code size not handled");
+
+  if (!FBB) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch
+      BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                             .addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch.
+  assert(Cond.size() == 2 && "Unexpected number of components!");
+  unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                         .addMBB(TBB);
+  BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(FBB);
+  return 2;
+}
+
+unsigned
+XCoreInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const {
+  assert(!BytesRemoved && "code size not handled");
+
+  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+  if (I == MBB.end())
+    return 0;
+
+  if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!IsCondBranch(I->getOpcode()))
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 const DebugLoc &DL, unsigned DestReg,
+                                 unsigned SrcReg, bool KillSrc) const {
+  bool GRDest = XCore::GRRegsRegClass.contains(DestReg);
+  bool GRSrc  = XCore::GRRegsRegClass.contains(SrcReg);
+
+  if (GRDest && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addImm(0);
+    return;
+  }
+  
+  if (GRDest && SrcReg == XCore::SP) {
+    BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0);
+    return;
+  }
+
+  if (DestReg == XCore::SP && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end() && !I->isDebugValue())
+    DL = I->getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+      MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+      MFI.getObjectAlignment(FrameIndex));
+  BuildMI(MBB, I, DL, get(XCore::STWFI))
+    .addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIndex)
+    .addImm(0)
+    .addMemOperand(MMO);
+}
+
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DestReg, int FrameIndex,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end() && !I->isDebugValue())
+    DL = I->getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  MachineMemOperand *MMO = MF->getMachineMemOperand(
+      MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+      MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+      MFI.getObjectAlignment(FrameIndex));
+  BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
+    .addFrameIndex(FrameIndex)
+    .addImm(0)
+    .addMemOperand(MMO);
+}
+
+bool XCoreInstrInfo::
+reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert((Cond.size() == 2) &&
+          "Invalid XCore branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
+  return false;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+static bool isImmMskBitp(unsigned val) {
+  if (!isMask_32(val)) {
+    return false;
+  }
+  int N = Log2_32(val) + 1;
+  return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32;
+}
+
+MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
+                                              MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator MI,
+                                              unsigned Reg, uint64_t Value) const {
+  DebugLoc dl;
+  if (MI != MBB.end() && !MI->isDebugValue())
+    dl = MI->getDebugLoc();
+  if (isImmMskBitp(Value)) {
+    int N = Log2_32(Value) + 1;
+    return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg)
+        .addImm(N)
+        .getInstr();
+  }
+  if (isImmU16(Value)) {
+    int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+    return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value).getInstr();
+  }
+  MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool();
+  const Constant *C = ConstantInt::get(
+        Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Value);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+  return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
+      .addConstantPoolIndex(Idx)
+      .getInstr();
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 000000000000..a377784caf4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,94 @@
+//===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
+
+#include "XCoreRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+
+class XCoreInstrInfo : public XCoreGenInstrInfo {
+  const XCoreRegisterInfo RI;
+  virtual void anchor();
+public:
+  XCoreInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex) const override;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex) const override;
+
+  bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        const DebugLoc &DL,
+                        int *BytesAdded = nullptr) const override;
+
+  unsigned removeBranch(MachineBasicBlock &MBB,
+                        int *BytesRemoved = nullptr) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool reverseBranchCondition(
+                          SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  // Emit code before MBBI to load immediate value into physical register Reg.
+  // Returns an iterator to the new instruction.
+  MachineBasicBlock::iterator loadImmediate(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MI,
+                                            unsigned Reg, uint64_t Value) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 000000000000..f1d52d5a191f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,1312 @@
+//===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP, DP are not currently reflected in the patterns, since
+// having a physical register as an operand prevents loop hoisting and
+// since the value of these registers never changes during the life of the
+// function.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+// Call
+def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def XCoreBranchLink     : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
+def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>;
+
+def SDT_XCoreEhRet : SDTypeProfile<0, 2,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def XCoreEhRet       : SDNode<"XCoreISD::EH_RETURN", SDT_XCoreEhRet,
+                         [SDNPHasChain, SDNPOptInGlue]>;
+
+def SDT_XCoreBR_JT    : SDTypeProfile<0, 2,
+                                      [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def XCoreBR_JT : SDNode<"XCoreISD::BR_JT", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def XCoreBR_JT32 : SDNode<"XCoreISD::BR_JT32", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def SDT_XCoreAddress    : SDTypeProfile<1, 1,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def frametoargsoffset : SDNode<"XCoreISD::FRAME_TO_ARGS_OFFSET", SDTIntLeaf,
+                               []>;
+
+def SDT_XCoreStwsp    : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
+def XCoreStwsp        : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+                               [SDNPHasChain, SDNPMayStore]>;
+
+def SDT_XCoreLdwsp    : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def XCoreLdwsp        : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
+                               [SDNPHasChain, SDNPMayLoad]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+                             [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def div4_xform : SDNodeXForm<imm, [{
+  // Transformation function: imm/4
+  assert(N->getZExtValue() % 4 == 0);
+  return getI32Imm(N->getZExtValue()/4, SDLoc(N));
+}]>;
+
+def msksize_xform : SDNodeXForm<imm, [{
+  // Transformation function: get the size of a mask
+  assert(isMask_32(N->getZExtValue()));
+  // look for the first non-zero bit
+  return getI32Imm(32 - countLeadingZeros((uint32_t)N->getZExtValue()),
+                   SDLoc(N));
+}]>;
+
+def neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(-value, SDLoc(N));
+}]>;
+
+def bpwsub_xform : SDNodeXForm<imm, [{
+  // Transformation function: 32-imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(32 - value, SDLoc(N));
+}]>;
+
+def div4neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm/4
+  uint32_t value = N->getZExtValue();
+  assert(-value % 4 == 0);
+  return getI32Imm(-value/4, SDLoc(N));
+}]>;
+
+def immUs4Neg : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (-value)%4 == 0 && (-value)/4 <= 11;
+}]>;
+
+def immUs4 : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return value%4 == 0 && value/4 <= 11;
+}]>;
+
+def immUsNeg : PatLeaf<(imm), [{
+  return -((uint32_t)N->getZExtValue()) <= 11;
+}]>;
+
+def immUs : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() <= 11;
+}]>;
+
+def immU6 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 6);
+}]>;
+
+def immU16 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 16);
+}]>;
+
+def immMskBitp : PatLeaf<(imm), [{ return immMskBitp(N); }]>;
+
+def immBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 1 && value <= 8)
+          || value == 16
+          || value == 24
+          || value == 32;
+}]>;
+
+def immBpwSubBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 24 && value <= 31)
+          || value == 16
+          || value == 8
+          || value == 0;
+}]>;
+
+def lda16f : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 1))>;
+def lda16b : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 1))>;
+def ldawf : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 2))>;
+def ldawb : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 2))>;
+
+// Instruction operand types
+def pcrel_imm  : Operand<i32>;
+def pcrel_imm_neg  : Operand<i32> {
+  let DecoderMethod = "DecodeNegImmOperand";
+}
+def brtarget : Operand<OtherVT>;
+def brtarget_neg : Operand<OtherVT> {
+  let DecoderMethod = "DecodeNegImmOperand";
+}
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+// Jump tables.
+def InlineJT : Operand<i32> {
+  let PrintMethod = "printInlineJT";
+}
+
+def InlineJT32 : Operand<i32> {
+  let PrintMethod = "printInlineJT32";
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+multiclass F3R_2RUS<bits<5> opc1, bits<5> opc2, string OpcStr, SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"),
+                     [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+multiclass F3R_2RUS_np<bits<5> opc1, bits<5> opc2, string OpcStr> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"), []>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"), []>;
+}
+
+multiclass F3R_2RBITP<bits<5> opc1, bits<5> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                         !strconcat(OpcStr, " $dst, $b, $c"),
+                         [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class F3R<bits<5> opc, string OpcStr, SDNode OpNode> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"),
+       [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+class F3R_np<bits<5> opc, string OpcStr> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"), []>;
+// Three operand long
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RUS<bits<9> opc1, bits<9> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                       !strconcat(OpcStr, " $dst, $b, $c"),
+                       [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RBITP<bits<9> opc1, bits<9> opc2, string OpcStr,
+                        SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                           !strconcat(OpcStr, " $dst, $b, $c"),
+                           [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class FL3R<bits<9> opc, string OpcStr, SDNode OpNode> :
+  _FL3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+        !strconcat(OpcStr, " $dst, $b, $c"),
+        [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                  !strconcat(OpcStr, " $a, $b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                    !strconcat(OpcStr, " $a, $b"), []>;
+}
+
+multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+                  !strconcat(OpcStr, " $a, $b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget_neg:$b),
+                    !strconcat(OpcStr, " $a, $b"), []>;
+}
+
+
+// U6
+multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(OpNode immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(OpNode immU16:$a)]>;
+}
+
+multiclass FU6_LU6_int<bits<10> opc, string OpcStr, Intrinsic Int> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(Int immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(Int immU16:$a)]>;
+}
+
+multiclass FU6_LU6_np<bits<10> opc, string OpcStr> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+}
+
+// Two operand short
+
+class F2R_np<bits<6> opc, string OpcStr> :
+  _F2R<opc, (outs GRRegs:$dst), (ins GRRegs:$b),
+       !strconcat(OpcStr, " $dst, $b"), []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+                               "# ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "# ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let isReMaterializable = 1 in
+def FRAME_TO_ARGS_OFFSET : PseudoInstXCore<(outs GRRegs:$dst), (ins),
+                               "# FRAME_TO_ARGS_OFFSET $dst",
+                               [(set GRRegs:$dst, (frametoargsoffset))]>;
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+def EH_RETURN : PseudoInstXCore<(outs), (ins GRRegs:$s, GRRegs:$handler),
+                               "# EH_RETURN $s, $handler",
+                               [(XCoreEhRet GRRegs:$s, GRRegs:$handler)]>;
+
+def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDWFI $dst, $addr",
+                             [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDAWFI $dst, $addr",
+                             [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr),
+                            "# STWFI $src, $addr",
+                            [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1 in {
+  def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+                              (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+                              "# SELECT_CC PSEUDO!",
+                              [(set GRRegs:$dst,
+                                 (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+}
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+                                     [(XCoreMemBarrier)]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>;
+defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>;
+let hasSideEffects = 0 in {
+defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">;
+def LSS_3r : F3R_np<0b11000, "lss">;
+def LSU_3r : F3R_np<0b11001, "lsu">;
+}
+def AND_3r : F3R<0b00111, "and", and>;
+def OR_3r : F3R<0b01000, "or", or>;
+
+let mayLoad=1 in {
+def LDW_3r : _F3R<0b01001, (outs GRRegs:$dst),
+                  (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldw $dst, $addr[$offset]", []>;
+
+def LDW_2rus : _F2RUS<0b00001, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, i32imm:$offset),
+                      "ldw $dst, $addr[$offset]", []>;
+
+def LD16S_3r :  _F3R<0b10000, (outs GRRegs:$dst),
+                     (ins GRRegs:$addr, GRRegs:$offset),
+                     "ld16s $dst, $addr[$offset]", []>;
+
+def LD8U_3r :  _F3R<0b10001, (outs GRRegs:$dst),
+                    (ins GRRegs:$addr, GRRegs:$offset),
+                    "ld8u $dst, $addr[$offset]", []>;
+}
+
+let mayStore=1 in {
+def STW_l3r : _FL3R<0b000001100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "stw $val, $addr[$offset]", []>;
+
+def STW_2rus : _F2RUS<0b00000, (outs),
+                      (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+                      "stw $val, $addr[$offset]", []>;
+}
+
+defm SHL : F3R_2RBITP<0b00100, 0b10100, "shl", shl>;
+defm SHR : F3R_2RBITP<0b00101, 0b10101, "shr", srl>;
+
+// The first operand is treated as an immediate since it refers to a register
+// number in another thread.
+def TSETR_3r : _F3RImm<0b10111, (outs), (ins i32imm:$a, GRRegs:$b, GRRegs:$c),
+                       "set t[$c]:r$a, $b", []>;
+
+// Three operand long
+def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+let hasSideEffects = 0 in
+def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst),
+                          (ins GRRegs:$addr, i32imm:$offset),
+                          "ldaw $dst, $addr[$offset]", []>;
+
+def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[-$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+let hasSideEffects = 0 in
+def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst),
+                         (ins GRRegs:$addr, i32imm:$offset),
+                         "ldaw $dst, $addr[-$offset]", []>;
+
+def LDA16F_l3r : _FL3R<0b001011100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+def LDA16B_l3r : _FL3R<0b001101100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[-$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+def MUL_l3r : FL3R<0b001111100, "mul", mul>;
+// Instructions which may trap are marked as side effecting.
+let hasSideEffects = 1 in {
+def DIVS_l3r : FL3R<0b010001100, "divs", sdiv>;
+def DIVU_l3r : FL3R<0b010011100, "divu", udiv>;
+def REMS_l3r : FL3R<0b110001100, "rems", srem>;
+def REMU_l3r : FL3R<0b110011100, "remu", urem>;
+}
+def XOR_l3r : FL3R<0b000011100, "xor", xor>;
+defm ASHR : FL3R_L2RBITP<0b000101100, 0b100101100, "ashr", sra>;
+
+let Constraints = "$src1 = $dst" in
+def CRC_l3r : _FL3RSrcDst<0b101011100, (outs GRRegs:$dst),
+                          (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                          "crc32 $dst, $src2, $src3",
+                          [(set GRRegs:$dst,
+                             (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+                                              GRRegs:$src3))]>;
+
+let mayStore=1 in {
+def ST16_l3r : _FL3R<0b100001100, (outs),
+                     (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                     "st16 $val, $addr[$offset]", []>;
+
+def ST8_l3r : _FL3R<0b100011100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "st8 $val, $addr[$offset]", []>;
+}
+
+def INPW_l2rus : _FL2RUSBitp<0b100101110, (outs GRRegs:$a),
+                             (ins GRRegs:$b, i32imm:$c), "inpw $a, res[$b], $c",
+                             []>;
+
+def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs),
+                              (ins GRRegs:$a, GRRegs:$b, i32imm:$c),
+                              "outpw res[$b], $a, $c", []>;
+
+// Four operand long
+let Constraints = "$e = $a,$f = $b" in {
+def MACCU_l4r : _FL4RSrcDstSrcDst<
+  0b000001, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>;
+
+def MACCS_l4r : _FL4RSrcDstSrcDst<
+  0b000010, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>;
+}
+
+let Constraints = "$e = $b" in
+def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b),
+                           (ins GRRegs:$e, GRRegs:$c, GRRegs:$d),
+                           "crc8 $b, $a, $c, $d", []>;
+
+// Five operand long
+
+def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "ladd $dst2, $dst1, $src1, $src2, $src3",
+                     []>;
+
+def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "lsub $dst2, $dst1, $src1, $src2, $src3", []>;
+
+def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
+                      (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                      "ldivu $dst1, $dst2, $src3, $src1, $src2", []>;
+
+// Six operand long
+
+def LMUL_l6r : _FL6R<
+  0b00000, (outs GRRegs:$dst1, GRRegs:$dst2),
+  (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4),
+  "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>;
+
+// Register - U6
+
+//let Uses = [DP] in ...
+let hasSideEffects = 0, isReMaterializable = 1 in
+def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldaw $a, dp[$b]", []>;
+
+let isReMaterializable = 1 in                    
+def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
+                        "ldaw $a, dp[$b]",
+                        [(set RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+let mayLoad=1 in
+def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+                     "ldw $a, dp[$b]", []>;
+
+def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldw $a, dp[$b]",
+                       [(set RRegs:$a, (load (dprelwrapper tglobaladdr:$b)))]>;
+
+let mayStore=1 in
+def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+                      "stw $a, dp[$b]", []>;
+
+def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
+                        "stw $a, dp[$b]",
+                        [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
+
+//let Uses = [CP] in ..
+let mayLoad = 1, isReMaterializable = 1, hasSideEffects = 0 in {
+def LDWCP_ru6 : _FRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, cp[$b]", []>;
+def LDWCP_lru6: _FLRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldw $a, cp[$b]",
+                       [(set RRegs:$a, (load (cprelwrapper tglobaladdr:$b)))]>;
+}
+
+let Uses = [SP] in {
+let mayStore=1 in {
+def STWSP_ru6 : _FRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                      "stw $a, sp[$b]",
+                      [(XCoreStwsp RRegs:$a, immU6:$b)]>;
+
+def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                        "stw $a, sp[$b]",
+                        [(XCoreStwsp RRegs:$a, immU16:$b)]>;
+}
+
+let mayLoad=1 in {
+def LDWSP_ru6 : _FRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, sp[$b]",
+                      [(set RRegs:$a, (XCoreLdwsp immU6:$b))]>;
+
+def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                        "ldw $a, sp[$b]",
+                        [(set RRegs:$a, (XCoreLdwsp immU16:$b))]>;
+}
+
+let hasSideEffects = 0 in {
+def LDAWSP_ru6 : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldaw $a, sp[$b]", []>;
+
+def LDAWSP_lru6 : _FLRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                         "ldaw $a, sp[$b]", []>;
+}
+}
+
+let isReMaterializable = 1 in {
+def LDC_ru6 : _FRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                    "ldc $a, $b", [(set RRegs:$a, immU6:$b)]>;
+
+def LDC_lru6 : _FLRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldc $a, $b", [(set RRegs:$a, immU16:$b)]>;
+}
+
+def SETC_ru6 : _FRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                     "setc res[$a], $b",
+                     [(int_xcore_setc GRRegs:$a, immU6:$b)]>;
+
+def SETC_lru6 : _FLRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                       "setc res[$a], $b",
+                       [(int_xcore_setc GRRegs:$a, immU16:$b)]>;
+
+// Operand register - U6
+let isBranch = 1, isTerminator = 1 in {
+defm BRFT: FRU6_LRU6_branch<0b011100, "bt">;
+defm BRBT: FRU6_LRU6_backwards_branch<0b011101, "bt">;
+defm BRFF: FRU6_LRU6_branch<0b011110, "bf">;
+defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">;
+}
+
+// U6
+let Defs = [SP], Uses = [SP] in {
+let hasSideEffects = 0 in
+defm EXTSP : FU6_LU6_np<0b0111011110, "extsp">;
+
+let mayStore = 1 in
+defm ENTSP : FU6_LU6_np<0b0111011101, "entsp">;
+
+let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in {
+defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>;
+}
+}
+
+let hasSideEffects = 0 in
+defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">;
+
+let Uses = [R11], isCall=1 in
+defm BLAT : FU6_LU6_np<0b0111001101, "blat">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget_neg:$a), "bu $a", []>;
+
+def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+
+def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+}
+
+//let Uses = [CP] in ...
+let Defs = [R11], hasSideEffects = 0, isReMaterializable = 1 in
+def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+                    []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins i32imm:$a), "ldaw r11, cp[$a]",
+                      [(set R11, (cprelwrapper tglobaladdr:$a))]>;
+
+let Defs = [R11] in
+defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">;
+
+defm SETSR : FU6_LU6_int<0b0111101101, "setsr", int_xcore_setsr>;
+
+defm CLRSR : FU6_LU6_int<0b0111101100, "clrsr", int_xcore_clrsr>;
+
+// setsr may cause a branch if it is used to enable events. clrsr may
+// branch if it is executed while events are enabled.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    isCodeGenOnly = 1 in {
+defm SETSR_branch : FU6_LU6_np<0b0111101101, "setsr">;
+defm CLRSR_branch : FU6_LU6_np<0b0111101100, "clrsr">;
+}
+
+defm KCALL : FU6_LU6_np<0b0111001111, "kcall">;
+
+let Uses = [SP], Defs = [SP], mayStore = 1 in
+defm KENTSP : FU6_LU6_np<0b0111101110, "kentsp">;
+
+let Uses = [SP], Defs = [SP], mayLoad = 1 in
+defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
+
+// U10
+
+let Defs = [R11], isReMaterializable = 1 in {
+let hasSideEffects = 0 in
+def LDAPF_u10 : _FU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a", []>;
+
+def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+                        [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+let hasSideEffects = 0 in
+def LDAPB_u10 : _FU10<0b110111, (outs), (ins pcrel_imm_neg:$a), "ldap r11, $a",
+                      []>;
+
+let hasSideEffects = 0 in
+def LDAPB_lu10 : _FLU10<0b110111, (outs), (ins pcrel_imm_neg:$a),
+                        "ldap r11, $a",
+                        [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+let isCodeGenOnly = 1 in
+def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins pcrel_imm:$a), "ldap r11, $a",
+                           [(set R11, (pcrelwrapper tblockaddress:$a))]>;
+}
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLRF_u10 : _FU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+                     []>;
+
+def BLRF_lu10 : _FLU10<0b110100, (outs), (ins pcrel_imm:$a), "bl $a",
+                       [(XCoreBranchLink tglobaladdr:$a)]>;
+
+def BLRB_u10 : _FU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+
+def BLRB_lu10 : _FLU10<0b110101, (outs), (ins pcrel_imm_neg:$a), "bl $a", []>;
+}
+
+let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
+    hasSideEffects = 0 in {
+def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>;
+
+def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]",
+                        []>;
+}
+
+// Two operand short
+def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<0b100100, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+let Constraints = "$src1 = $dst" in {
+def SEXT_rus :
+  _FRUSSrcDstBitp<0b001101, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "sext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def SEXT_2r :
+  _F2RSrcDst<0b001100, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "sext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ZEXT_rus :
+  _FRUSSrcDstBitp<0b010001, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "zext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def ZEXT_2r :
+  _F2RSrcDst<0b010000, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "zext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ANDNOT_2r :
+  _F2RSrcDst<0b001010, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "andnot $dst, $src2",
+             [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+let isReMaterializable = 1, hasSideEffects = 0 in
+def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size),
+                          "mkmsk $dst, $size", []>;
+
+def MKMSK_2r : _F2R<0b101000, (outs GRRegs:$dst), (ins GRRegs:$size),
+                    "mkmsk $dst, $size",
+                    [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>;
+
+def GETR_rus : _FRUS<0b100000, (outs GRRegs:$dst), (ins i32imm:$type),
+                     "getr $dst, $type",
+                     [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>;
+
+def GETTS_2r : _F2R<0b001110, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getts $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>;
+
+def SETPT_2r : _FR2R<0b001111, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setpt res[$r], $val",
+                     [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_2r : _F2R<0b010010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outct res[$r], $val",
+                    [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_rus : _FRUS<0b010011, (outs), (ins GRRegs:$r, i32imm:$val),
+                       "outct res[$r], $val",
+                       [(int_xcore_outct GRRegs:$r, immUs:$val)]>;
+
+def OUTT_2r : _FR2R<0b000011, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outt res[$r], $val",
+                    [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>;
+
+def OUT_2r : _FR2R<0b101010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                   "out res[$r], $val",
+                   [(int_xcore_out GRRegs:$r, GRRegs:$val)]>;
+
+let Constraints = "$src = $dst" in
+def OUTSHR_2r :
+  _F2RSrcDst<0b101011, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "outshr res[$r], $src",
+             [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+
+def INCT_2r : _F2R<0b100001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                   "inct $dst, res[$r]",
+                   [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>;
+
+def INT_2r : _F2R<0b100011, (outs GRRegs:$dst), (ins GRRegs:$r),
+                  "int $dst, res[$r]",
+                  [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>;
+
+def IN_2r : _F2R<0b101100, (outs GRRegs:$dst), (ins GRRegs:$r),
+                 "in $dst, res[$r]",
+                 [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>;
+
+let Constraints = "$src = $dst" in
+def INSHR_2r :
+  _F2RSrcDst<0b101101, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "inshr $dst, res[$r]",
+             [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+
+def CHKCT_2r : _F2R<0b110010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "chkct res[$r], $val",
+                    [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>;
+
+def CHKCT_rus : _FRUSBitp<0b110011, (outs), (ins GRRegs:$r, i32imm:$val),
+                          "chkct res[$r], $val",
+                          [(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
+
+def TESTCT_2r : _F2R<0b101111, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "testct $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<0b110001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "testwct $dst, res[$src]",
+                      [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
+def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "setd res[$r], $val",
+                    [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
+
+def SETPSC_2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setpsc res[$src1], $src2",
+                      [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
+
+def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getst $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>;
+
+def INITSP_2r : _F2R<0b000100, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:sp, $src",
+                     [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>;
+
+def INITPC_2r : _F2R<0b000000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:pc, $src",
+                     [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>;
+
+def INITCP_2r : _F2R<0b000110, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:cp, $src",
+                     [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>;
+
+def INITDP_2r : _F2R<0b000010, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:dp, $src",
+                     [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
+
+def PEEK_2r : _F2R<0b101110, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "peek $dst, res[$src]",
+                    [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "endin $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
+def EEF_2r : _F2R<0b001011, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eef $a, res[$b]", []>;
+
+def EET_2r : _F2R<0b001001, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eet $a, res[$b]", []>;
+
+def TSETMR_2r : _F2RImm<0b000111, (outs), (ins i32imm:$a, GRRegs:$b),
+                        "tsetmr r$a, $b", []>;
+
+// Two operand long
+def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                       "bitrev $dst, $src",
+                       [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<0b0000011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "byterev $dst, $src",
+                        [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<0b0000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "clz $dst, $src",
+                    [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+def GETD_l2r : _FL2R<0b0001111001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getd $dst, res[$src]", []>;
+
+def GETN_l2r : _FL2R<0b0011011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getn $dst, res[$src]", []>;
+
+def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setc res[$r], $val",
+                     [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>;
+
+def SETTW_l2r : _FLR2R<0b0010011001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                       "settw res[$r], $val",
+                       [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>;
+
+def GETPS_l2r : _FL2R<0b0001011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "get $dst, ps[$src]",
+                      [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>;
+
+def SETPS_l2r : _FLR2R<0b0001111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                       "set ps[$src1], $src2",
+                       [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>;
+
+def INITLR_l2r : _FL2R<0b0001011000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                       "init t[$t]:lr, $src",
+                       [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>;
+
+def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setclk res[$src1], $src2",
+                        [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>;
+
+def SETN_l2r : _FLR2R<0b0011011000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setn res[$src1], $src2", []>;
+
+def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setrdy res[$src1], $src2",
+                        [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>;
+
+def TESTLCL_l2r : _FL2R<0b0010011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "testlcl $dst, res[$src]", []>;
+
+// One operand short
+def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a),
+                    "msync res[$a]",
+                    [(int_xcore_msync GRRegs:$a)]>;
+def MJOIN_1r : _F1R<0b000101, (outs), (ins GRRegs:$a),
+                    "mjoin res[$a]",
+                    [(int_xcore_mjoin GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BAU_1r : _F1R<0b001001, (outs), (ins GRRegs:$a),
+                 "bau $a",
+                 [(brind GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i),
+                            "bru $i\n$t",
+                            [(XCoreBR_JT tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i),
+                              "bru $i\n$t",
+                              [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BRU_1r : _F1R<0b001010, (outs), (ins GRRegs:$a), "bru $a", []>;
+
+let Defs=[SP], hasSideEffects=0 in
+def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>;
+
+let hasSideEffects=0 in
+def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
+
+let hasSideEffects=0 in
+def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
+
+let hasCtrlDep = 1 in 
+def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a),
+                 "ecallt $a",
+                 []>;
+
+let hasCtrlDep = 1 in 
+def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a),
+                 "ecallf $a",
+                 []>;
+
+let isCall=1, 
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a),
+                 "bla $a",
+                 [(XCoreBranchLink GRRegs:$a)]>;
+}
+
+def SYNCR_1r : _F1R<0b100001, (outs), (ins GRRegs:$a),
+                 "syncr res[$a]",
+                 [(int_xcore_syncr GRRegs:$a)]>;
+
+def FREER_1r : _F1R<0b000100, (outs), (ins GRRegs:$a),
+               "freer res[$a]",
+               [(int_xcore_freer GRRegs:$a)]>;
+
+let Uses=[R11] in {
+def SETV_1r : _F1R<0b010001, (outs), (ins GRRegs:$a),
+                   "setv res[$a], r11",
+                   [(int_xcore_setv GRRegs:$a, R11)]>;
+
+def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a),
+                    "setev res[$a], r11",
+                    [(int_xcore_setev GRRegs:$a, R11)]>;
+}
+
+def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>;
+
+def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]",
+                  [(int_xcore_edu GRRegs:$a)]>;
+
+def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a),
+               "eeu res[$a]",
+               [(int_xcore_eeu GRRegs:$a)]>;
+
+def KCALL_1r : _F1R<0b010000, (outs), (ins GRRegs:$a), "kcall $a", []>;
+
+def WAITEF_1R : _F1R<0b000011, (outs), (ins GRRegs:$a), "waitef $a", []>;
+
+def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>;
+
+def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>;
+
+def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]",
+                    [(int_xcore_clrpt GRRegs:$a)]>;
+
+// Zero operand short
+
+def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>;
+
+def DCALL_0R : _F0R<0b0000011100, (outs), (ins), "dcall", []>;
+
+let Defs = [SP], Uses = [SP] in
+def DENTSP_0R : _F0R<0b0001001100, (outs), (ins), "dentsp", []>;
+
+let Defs = [SP] in
+def DRESTSP_0R : _F0R<0b0001001101, (outs), (ins), "drestsp", []>;
+
+def DRET_0R : _F0R<0b0000011110, (outs), (ins), "dret", []>;
+
+def FREET_0R : _F0R<0b0000001111, (outs), (ins), "freet", []>;
+
+let Defs = [R11] in {
+def GETID_0R : _F0R<0b0001001110, (outs), (ins),
+                    "get r11, id",
+                    [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<0b0000111110, (outs), (ins),
+                    "get r11, ed",
+                    [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<0b0000111111, (outs), (ins),
+                    "get r11, et",
+                    [(set R11, (int_xcore_getet))]>;
+
+def GETKEP_0R : _F0R<0b0001001111, (outs), (ins),
+                     "get r11, kep", []>;
+
+def GETKSP_0R : _F0R<0b0001011100, (outs), (ins),
+                     "get r11, ksp", []>;
+}
+
+let Defs = [SP] in
+def KRET_0R : _F0R<0b0000011101, (outs), (ins), "kret", []>;
+
+let Uses = [SP], mayLoad = 1 in {
+def LDET_0R : _F0R<0b0001011110, (outs), (ins), "ldw et, sp[4]", []>;
+
+def LDSED_0R : _F0R<0b0001011101, (outs), (ins), "ldw sed, sp[3]", []>;
+
+def LDSPC_0R : _F0R<0b0000101100, (outs), (ins), "ldw spc, sp[1]", []>;
+
+def LDSSR_0R : _F0R<0b0000101110, (outs), (ins), "ldw ssr, sp[2]", []>;
+}
+
+let Uses=[R11] in
+def SETKEP_0R : _F0R<0b0000011111, (outs), (ins), "set kep, r11", []>;
+
+def SSYNC_0r : _F0R<0b0000001110, (outs), (ins),
+                    "ssync",
+                    [(int_xcore_ssync)]>;
+
+let Uses = [SP], mayStore = 1 in {
+def STET_0R : _F0R<0b0000111101, (outs), (ins), "stw et, sp[4]", []>;
+
+def STSED_0R : _F0R<0b0000111100, (outs), (ins), "stw sed, sp[3]", []>;
+
+def STSPC_0R : _F0R<0b0000101101, (outs), (ins), "stw spc, sp[1]", []>;
+
+def STSSR_0R : _F0R<0b0000101111, (outs), (ins), "stw ssr, sp[2]", []>;
+}
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    hasSideEffects = 1 in
+def WAITEU_0R : _F0R<0b0000001100, (outs), (ins),
+                     "waiteu",
+                     [(brind (int_xcore_waitevent))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+          (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// anyext
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+          
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (STW_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
+
+/// trap
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+          (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
+
+///
+/// setcc patterns, only matched when none of the above brcond
+/// patterns match
+///
+
+// setcc 2 register operands
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+          (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+          (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+          (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+          (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+          (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+def : Pat<(mul GRRegs:$src, 3),
+          (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+          (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+          (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+def : Pat<(sra GRRegs:$src, 31),
+          (ASHR_l2rus GRRegs:$src, 32)>;
+
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+          (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+          (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+          (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
+
+def : Pat<(load (cprelwrapper tconstpool:$b)),
+          (LDWCP_lru6 tconstpool:$b)>;
+
+def : Pat<(cprelwrapper tconstpool:$b),
+          (LDAWCP_lu6 tconstpool:$b)>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
new file mode 100644
index 000000000000..5cc51cd7a992
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -0,0 +1,234 @@
+//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains a pass that lowers thread local variables on the
+///        XCore.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "xcore-lower-thread-local"
+
+using namespace llvm;
+
+static cl::opt<unsigned> MaxThreads(
+  "xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulation thread-local storage)"),
+  cl::Hidden, cl::value_desc("number"), cl::init(8));
+
+namespace {
+  /// Lowers thread local variables on the XCore. Each thread local variable is
+  /// expanded to an array of n elements indexed by the thread ID where n is the
+  /// fixed number hardware threads supported by the device.
+  struct XCoreLowerThreadLocal : public ModulePass {
+    static char ID;
+
+    XCoreLowerThreadLocal() : ModulePass(ID) {
+      initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool lowerGlobal(GlobalVariable *GV);
+
+    bool runOnModule(Module &M) override;
+  };
+}
+
+char XCoreLowerThreadLocal::ID = 0;
+
+INITIALIZE_PASS(XCoreLowerThreadLocal, "xcore-lower-thread-local",
+                "Lower thread local variables", false, false)
+
+ModulePass *llvm::createXCoreLowerThreadLocalPass() {
+  return new XCoreLowerThreadLocal();
+}
+
+static ArrayType *createLoweredType(Type *OriginalType) {
+  return ArrayType::get(OriginalType, MaxThreads);
+}
+
+static Constant *
+createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
+  SmallVector<Constant *, 8> Elements(MaxThreads);
+  for (unsigned i = 0; i != MaxThreads; ++i) {
+    Elements[i] = OriginalInitializer;
+  }
+  return ConstantArray::get(NewType, Elements);
+}
+
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+  IRBuilder<NoFolder> Builder(Instr);
+  unsigned OpCode = CE->getOpcode();
+  switch (OpCode) {
+    case Instruction::GetElementPtr: {
+      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+      ArrayRef<Value *> CEOps(CEOpVec);
+      return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(
+          cast<GEPOperator>(CE)->getSourceElementType(), CEOps[0],
+          CEOps.slice(1)));
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return dyn_cast<Instruction>(
+                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+                                      CE->getOperand(0), CE->getOperand(1),
+                                      CE->getName()));
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return dyn_cast<Instruction>(
+                  Builder.CreateCast((Instruction::CastOps)OpCode,
+                                     CE->getOperand(0), CE->getType(),
+                                     CE->getName()));
+    default:
+      llvm_unreachable("Unhandled constant expression!\n");
+  }
+}
+
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+  do {
+    SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
+    std::sort(WUsers.begin(), WUsers.end());
+    WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+    while (!WUsers.empty())
+      if (WeakVH WU = WUsers.pop_back_val()) {
+        if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+          for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+            if (PN->getIncomingValue(I) == CE) {
+              BasicBlock *PredBB = PN->getIncomingBlock(I);
+              if (PredBB->getTerminator()->getNumSuccessors() > 1)
+                PredBB = SplitEdge(PredBB, PN->getParent());
+              Instruction *InsertPos = PredBB->getTerminator();
+              Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+              PN->setOperand(I, NewInst);
+            }
+        } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+          Instruction *NewInst = createReplacementInstr(CE, Instr);
+          Instr->replaceUsesOfWith(CE, NewInst);
+        } else {
+          ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+          if (!CExpr || !replaceConstantExprOp(CExpr, P))
+            return false;
+        }
+      }
+  } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
+  // sibling may have used 'CE' when createReplacementInstr was called.
+  CE->destroyConstant();
+  return true;
+}
+
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+  SmallVector<WeakVH,8> WUsers;
+  for (User *U : GV->users())
+    if (!isa<Instruction>(U))
+      WUsers.push_back(WeakVH(U));
+  while (!WUsers.empty())
+    if (WeakVH WU = WUsers.pop_back_val()) {
+      ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
+      if (!CE || !replaceConstantExprOp(CE, P))
+        return false;
+    }
+  return true;
+}
+
+static bool isZeroLengthArray(Type *Ty) {
+  ArrayType *AT = dyn_cast<ArrayType>(Ty);
+  return AT && (AT->getNumElements() == 0);
+}
+
+bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
+  Module *M = GV->getParent();
+  if (!GV->isThreadLocal())
+    return false;
+
+  // Skip globals that we can't lower and leave it for the backend to error.
+  if (!rewriteNonInstructionUses(GV, this) ||
+      !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
+    return false;
+
+  // Create replacement global.
+  ArrayType *NewType = createLoweredType(GV->getValueType());
+  Constant *NewInitializer = nullptr;
+  if (GV->hasInitializer())
+    NewInitializer = createLoweredInitializer(NewType,
+                                              GV->getInitializer());
+  GlobalVariable *NewGV =
+    new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
+                       NewInitializer, "", nullptr,
+                       GlobalVariable::NotThreadLocal,
+                       GV->getType()->getAddressSpace(),
+                       GV->isExternallyInitialized());
+
+  // Update uses.
+  SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
+  for (unsigned I = 0, E = Users.size(); I != E; ++I) {
+    User *U = Users[I];
+    Instruction *Inst = cast<Instruction>(U);
+    IRBuilder<> Builder(Inst);
+    Function *GetID = Intrinsic::getDeclaration(GV->getParent(),
+                                                Intrinsic::xcore_getid);
+    Value *ThreadID = Builder.CreateCall(GetID, {});
+    Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV,
+                                            {Builder.getInt64(0), ThreadID});
+    U->replaceUsesOfWith(GV, Addr);
+  }
+
+  // Remove old global.
+  NewGV->takeName(GV);
+  GV->eraseFromParent();
+  return true;
+}
+
+bool XCoreLowerThreadLocal::runOnModule(Module &M) {
+  // Find thread local globals.
+  bool MadeChange = false;
+  SmallVector<GlobalVariable *, 16> ThreadLocalGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.isThreadLocal())
+      ThreadLocalGlobals.push_back(&GV);
+  for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) {
+    MadeChange |= lowerGlobal(ThreadLocalGlobals[I]);
+  }
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
new file mode 100644
index 000000000000..7763ccc8f4af
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -0,0 +1,114 @@
+//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower XCore MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+#include "XCoreMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+XCoreMCInstLower::XCoreMCInstLower(class AsmPrinter &asmprinter)
+    : Printer(asmprinter) {}
+
+void XCoreMCInstLower::Initialize(MCContext *C) { Ctx = C; }
+
+MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MachineOperandType MOTy,
+                                               unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+  const MCSymbol *Symbol;
+
+  switch (MOTy) {
+    case MachineOperand::MO_MachineBasicBlock:
+      Symbol = MO.getMBB()->getSymbol();
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Symbol = Printer.getSymbol(MO.getGlobal());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Symbol = Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Symbol = Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Symbol = Printer.GetJTISymbol(MO.getIndex());
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Symbol = Printer.GetCPISymbol(MO.getIndex());
+      Offset += MO.getOffset();
+      break;
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
+
+  if (!Offset)
+    return MCOperand::createExpr(MCSym);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::createAdd(MCSym, OffsetExpr, *Ctx);
+  return MCOperand::createExpr(Add);
+}
+
+MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO,
+                                         unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+    default: llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) break;
+      return MCOperand::createReg(MO.getReg());
+    case MachineOperand::MO_Immediate:
+      return MCOperand::createImm(MO.getImm() + offset);
+    case MachineOperand::MO_MachineBasicBlock:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return LowerSymbolOperand(MO, MOTy, offset);
+    case MachineOperand::MO_RegisterMask:
+      break;
+  }
+
+  return MCOperand();
+}
+
+void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
new file mode 100644
index 000000000000..8fb1593cc6e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -0,0 +1,41 @@
+//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class Mangler;
+  class AsmPrinter;
+
+/// \brief This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  AsmPrinter &Printer;
+public:
+  XCoreMCInstLower(class AsmPrinter &asmprinter);
+  void Initialize(MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e91536ca1e83
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -0,0 +1,72 @@
+//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+void XCoreFunctionInfo::anchor() { }
+
+bool XCoreFunctionInfo::isLargeFrame(const MachineFunction &MF) const {
+  if (CachedEStackSize == -1) {
+    CachedEStackSize = MF.getFrameInfo().estimateStackSize(MF);
+  }
+  // isLargeFrame() is used when deciding if spill slots should be added to
+  // allow eliminateFrameIndex() to scavenge registers.
+  // This is only required when there is no FP and offsets are greater than
+  // ~256KB (~64Kwords). Thus only for code run on the emulator!
+  //
+  // The arbitrary value of 0xf000 allows frames of up to ~240KB before spill
+  // slots are added for the use of eliminateFrameIndex() register scavenging.
+  // For frames less than 240KB, it is assumed that there will be less than
+  // 16KB of function arguments.
+  return CachedEStackSize > 0xf000;
+}
+
+int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
+  if (LRSpillSlotSet) {
+    return LRSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (! MF.getFunction()->isVarArg()) {
+    // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+    LRSpillSlot = MFI.CreateFixedObject(RC->getSize(), 0, true);
+  } else {
+    LRSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  }
+  LRSpillSlotSet = true;
+  return LRSpillSlot;
+}
+
+int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
+  if (FPSpillSlotSet) {
+    return FPSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  FPSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  FPSpillSlotSet = true;
+  return FPSpillSlot;
+}
+
+const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
+  if (EHSpillSlotSet) {
+    return EHSpillSlot;
+  }
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  EHSpillSlot[0] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  EHSpillSlot[1] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+  EHSpillSlotSet = true;
+  return EHSpillSlot;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 000000000000..cdcc52fdc32d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,106 @@
+//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunction private
+/// XCore target-specific information for each MachineFunction.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  bool LRSpillSlotSet;
+  int LRSpillSlot;
+  bool FPSpillSlotSet;
+  int FPSpillSlot;
+  bool EHSpillSlotSet;
+  int EHSpillSlot[2];
+  unsigned ReturnStackOffset;
+  bool ReturnStackOffsetSet;
+  int VarArgsFrameIndex;
+  mutable int CachedEStackSize;
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
+  SpillLabels;
+
+public:
+  XCoreFunctionInfo() :
+    LRSpillSlotSet(false),
+    FPSpillSlotSet(false),
+    EHSpillSlotSet(false),
+    ReturnStackOffsetSet(false),
+    VarArgsFrameIndex(0),
+    CachedEStackSize(-1) {}
+  
+  explicit XCoreFunctionInfo(MachineFunction &MF) :
+    LRSpillSlotSet(false),
+    FPSpillSlotSet(false),
+    EHSpillSlotSet(false),
+    ReturnStackOffsetSet(false),
+    VarArgsFrameIndex(0),
+    CachedEStackSize(-1) {}
+  
+  ~XCoreFunctionInfo() {}
+  
+  void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+
+  int createLRSpillSlot(MachineFunction &MF);
+  bool hasLRSpillSlot() { return LRSpillSlotSet; }
+  int getLRSpillSlot() const {
+    assert(LRSpillSlotSet && "LR Spill slot not set");
+    return LRSpillSlot;
+  }
+
+  int createFPSpillSlot(MachineFunction &MF);
+  bool hasFPSpillSlot() { return FPSpillSlotSet; }
+  int getFPSpillSlot() const {
+    assert(FPSpillSlotSet && "FP Spill slot not set");
+    return FPSpillSlot;
+  }
+
+  const int* createEHSpillSlot(MachineFunction &MF);
+  bool hasEHSpillSlot() { return EHSpillSlotSet; }
+  const int* getEHSpillSlot() const {
+    assert(EHSpillSlotSet && "EH Spill slot not set");
+    return EHSpillSlot;
+  }
+
+  void setReturnStackOffset(unsigned value) {
+    assert(!ReturnStackOffsetSet && "Return stack offset set twice");
+    ReturnStackOffset = value;
+    ReturnStackOffsetSet = true;
+  }
+
+  unsigned getReturnStackOffset() const {
+    assert(ReturnStackOffsetSet && "Return stack offset not set");
+    return ReturnStackOffset;
+  }
+
+  bool isLargeFrame(const MachineFunction &MF) const;
+
+  std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>> &
+  getSpillLabels() {
+    return SpillLabels;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 000000000000..d34e928b14f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,330 @@
+//===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-reg-info"
+
+#define GET_REGINFO_TARGET_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+XCoreRegisterInfo::XCoreRegisterInfo()
+  : XCoreGenRegisterInfo(XCore::LR) {
+}
+
+// helper functions
+static inline bool isImmUs(unsigned val) {
+  return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+
+static void InsertFPImmInst(MachineBasicBlock::iterator II,
+                            const XCoreInstrInfo &TII,
+                            unsigned Reg, unsigned FrameReg, int Offset ) {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  switch (MI.getOpcode()) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+          .addReg(FrameReg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(FrameReg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+          .addReg(FrameReg)
+          .addImm(Offset);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertFPConstInst(MachineBasicBlock::iterator II,
+                              const XCoreInstrInfo &TII,
+                              unsigned Reg, unsigned FrameReg,
+                              int Offset, RegScavenger *RS ) {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+  RS->setRegUsed(ScratchOffset);
+  TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+  switch (MI.getOpcode()) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+          .addReg(FrameReg)
+          .addReg(ScratchOffset, RegState::Kill);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertSPImmInst(MachineBasicBlock::iterator II,
+                            const XCoreInstrInfo &TII,
+                            unsigned Reg, int Offset) {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  bool isU6 = isImmU6(Offset);
+
+  switch (MI.getOpcode()) {
+  int NewOpcode;
+  case XCore::LDWFI:
+    NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addImm(Offset)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+    BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+          .addImm(Offset);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+static void InsertSPConstInst(MachineBasicBlock::iterator II,
+                                const XCoreInstrInfo &TII,
+                                unsigned Reg, int Offset, RegScavenger *RS ) {
+  assert(RS && "requiresRegisterScavenging failed");
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned OpCode = MI.getOpcode();
+
+  unsigned ScratchBase;
+  if (OpCode==XCore::STWFI) {
+    ScratchBase = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+    RS->setRegUsed(ScratchBase);
+  } else
+    ScratchBase = Reg;
+  BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0);
+  unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+  RS->setRegUsed(ScratchOffset);
+  TII.loadImmediate(MBB, II, ScratchOffset, Offset);
+
+  switch (OpCode) {
+  case XCore::LDWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::STWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+          .addReg(Reg, getKillRegState(MI.getOperand(0).isKill()))
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill)
+          .addMemOperand(*MI.memoperands_begin());
+    break;
+  case XCore::LDAWFI:
+    BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+          .addReg(ScratchBase, RegState::Kill)
+          .addReg(ScratchOffset, RegState::Kill);
+    break;
+  default:
+    llvm_unreachable("Unexpected Opcode");
+  }
+}
+
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
+  return MF.getMMI().hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+}
+
+const MCPhysReg *
+XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  // The callee saved registers LR & FP are explicitly handled during
+  // emitPrologue & emitEpilogue and related functions.
+  static const MCPhysReg CalleeSavedRegs[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9, XCore::R10,
+    0
+  };
+  static const MCPhysReg CalleeSavedRegsFP[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9,
+    0
+  };
+  const XCoreFrameLowering *TFI = getFrameLowering(*MF);
+  if (TFI->hasFP(*MF))
+    return CalleeSavedRegsFP;
+  return CalleeSavedRegs;
+}
+
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
+
+  Reserved.set(XCore::CP);
+  Reserved.set(XCore::DP);
+  Reserved.set(XCore::SP);
+  Reserved.set(XCore::LR);
+  if (TFI->hasFP(MF)) {
+    Reserved.set(XCore::R10);
+  }
+  return Reserved;
+}
+
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  return false;
+}
+
+void
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+  MachineInstr &MI = *II;
+  MachineOperand &FrameOp = MI.getOperand(FIOperandNum);
+  int FrameIndex = FrameOp.getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const XCoreInstrInfo &TII =
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
+  int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+  int StackSize = MF.getFrameInfo().getStackSize();
+
+  #ifndef NDEBUG
+  DEBUG(errs() << "\nFunction         : " 
+        << MF.getName() << "\n");
+  DEBUG(errs() << "<--------->\n");
+  DEBUG(MI.print(errs()));
+  DEBUG(errs() << "FrameIndex         : " << FrameIndex << "\n");
+  DEBUG(errs() << "FrameOffset        : " << Offset << "\n");
+  DEBUG(errs() << "StackSize          : " << StackSize << "\n");
+  #endif
+
+  Offset += StackSize;
+
+  unsigned FrameReg = getFrameRegister(MF);
+
+  // Special handling of DBG_VALUE instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // fold constant into offset.
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+  
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  DEBUG(errs() << "Offset             : " << Offset << "\n" << "<--------->\n");
+  Offset/=4;
+  
+  unsigned Reg = MI.getOperand(0).getReg();
+  assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
+
+  if (TFI->hasFP(MF)) {
+    if (isImmUs(Offset))
+      InsertFPImmInst(II, TII, Reg, FrameReg, Offset);
+    else
+      InsertFPConstInst(II, TII, Reg, FrameReg, Offset, RS);
+  } else {
+    if (isImmU16(Offset))
+      InsertSPImmInst(II, TII, Reg, Offset);
+    else
+      InsertSPConstInst(II, TII, Reg, Offset, RS);
+  }
+  // Erase old instruction.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MBB.erase(II);
+}
+
+
+unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const XCoreFrameLowering *TFI = getFrameLowering(MF);
+
+  return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 000000000000..010fccd797a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,55 @@
+//===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "XCoreGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
+public:
+  XCoreRegisterInfo();
+
+  /// Code Generation virtual methods...
+
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
+  
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
+  //! Return whether to emit frame moves
+  static bool needsFrameMoves(const MachineFunction &MF);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 000000000000..6694b2882aca
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,59 @@
+//===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the XCore register file 
+//===----------------------------------------------------------------------===//
+
+class XCoreReg<string n> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> {
+  let Num = num;
+}
+
+// CPU registers
+def R0  : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1  : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2  : Ri< 2, "r2">, DwarfRegNum<[2]>; 
+def R3  : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4  : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5  : Ri< 5, "r5">, DwarfRegNum<[5]>; 
+def R6  : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7  : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8  : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9  : Ri< 9, "r9">, DwarfRegNum<[9]>; 
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>; 
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+//
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+  // Return values and arguments
+  (add R0, R1, R2, R3,
+  // Callee save
+  R4, R5, R6, R7, R8, R9, R10,
+  // Not preserved across procedure calls
+  R11)>;
+
+// Reserved
+def RRegs : RegisterClass<"XCore", [i32], 32,
+  (add R0, R1, R2, R3,
+   R4, R5, R6, R7, R8, R9, R10,
+   R11, CP, DP, SP, LR)> {
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..c03b0afceba3
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -0,0 +1,51 @@
+//===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-selectiondag-info"
+
+SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  unsigned SizeBitWidth = Size.getValueSizeInBits();
+  // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
+  if (!AlwaysInline && (Align & 3) == 0 &&
+      DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
+    const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst; Args.push_back(Entry);
+    Entry.Node = Src; Args.push_back(Entry);
+    Entry.Node = Size; Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl)
+        .setChain(Chain)
+        .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol("__memcpy_4",
+                                         TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args))
+        .setDiscardResult();
+
+    std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+
+  // Otherwise have the target-independent code call memcpy.
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
new file mode 100644
index 000000000000..7cd0d8216e91
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -0,0 +1,35 @@
+//===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the XCore subclass for SelectionDAGTargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+namespace llvm {
+
+class XCoreTargetMachine;
+
+class XCoreSelectionDAGInfo : public SelectionDAGTargetInfo {
+public:
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align, bool isVolatile,
+                                  bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 000000000000..99ad2c88504f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,31 @@
+//===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "XCoreGenSubtargetInfo.inc"
+
+void XCoreSubtarget::anchor() { }
+
+XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
+                               const std::string &FS, const TargetMachine &TM)
+    : XCoreGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(*this),
+      TLInfo(TM, *this), TSInfo() {}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 000000000000..f01fb6714d86
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,66 @@
+//===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
+#define LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
+
+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "XCoreGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class XCoreSubtarget : public XCoreGenSubtargetInfo {
+  virtual void anchor();
+  XCoreInstrInfo InstrInfo;
+  XCoreFrameLowering FrameLowering;
+  XCoreTargetLowering TLInfo;
+  XCoreSelectionDAGInfo TSInfo;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  XCoreSubtarget(const Triple &TT, const std::string &CPU,
+                 const std::string &FS, const TargetMachine &TM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const XCoreFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const XCoreTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const XCoreSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 000000000000..bf3138f2164a
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,99 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "XCoreTargetTransformInfo.h"
+#include "XCore.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+  if (!RM.hasValue())
+    return Reloc::Static;
+  return *RM;
+}
+
+/// Create an ILP32 architecture model
+///
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Optional<Reloc::Model> RM,
+                                       CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+    : LLVMTargetMachine(
+          T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
+          TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL),
+      TLOF(make_unique<XCoreTargetObjectFile>()),
+      Subtarget(TT, CPU, FS, *this) {
+  initAsmInfo();
+}
+
+XCoreTargetMachine::~XCoreTargetMachine() {}
+
+namespace {
+/// XCore Code Generator Pass Configuration Options.
+class XCorePassConfig : public TargetPassConfig {
+public:
+  XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  XCoreTargetMachine &getXCoreTargetMachine() const {
+    return getTM<XCoreTargetMachine>();
+  }
+
+  void addIRPasses() override;
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  void addPreEmitPass() override;
+};
+} // namespace
+
+TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new XCorePassConfig(this, PM);
+}
+
+void XCorePassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
+bool XCorePassConfig::addPreISel() {
+  addPass(createXCoreLowerThreadLocalPass());
+  return false;
+}
+
+bool XCorePassConfig::addInstSelector() {
+  addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
+  return false;
+}
+
+void XCorePassConfig::addPreEmitPass() {
+  addPass(createXCoreFrameToArgsOffsetEliminationPass(), false);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTarget() {
+  RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
+}
+
+TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](const Function &F) {
+    return TargetTransformInfo(XCoreTTIImpl(this, F));
+  });
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 000000000000..4bd25bc8776c
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,48 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
+
+#include "XCoreSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class XCoreTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  XCoreSubtarget Subtarget;
+public:
+  XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                     StringRef FS, const TargetOptions &Options,
+                     Optional<Reloc::Model> RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+  ~XCoreTargetMachine() override;
+
+  const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const XCoreSubtarget *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 000000000000..ad8693fd325e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,156 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+  BSSSection = Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+                                 ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                     ELF::XCORE_SHF_DP_SECTION);
+  BSSSectionLarge = Ctx.getELFSection(".dp.bss.large", ELF::SHT_NOBITS,
+                                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                          ELF::XCORE_SHF_DP_SECTION);
+  DataSection = Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS,
+                                  ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                      ELF::XCORE_SHF_DP_SECTION);
+  DataSectionLarge = Ctx.getELFSection(".dp.data.large", ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                           ELF::XCORE_SHF_DP_SECTION);
+  DataRelROSection = Ctx.getELFSection(".dp.rodata", ELF::SHT_PROGBITS,
+                                       ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                                           ELF::XCORE_SHF_DP_SECTION);
+  DataRelROSectionLarge = Ctx.getELFSection(
+      ".dp.rodata.large", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::XCORE_SHF_DP_SECTION);
+  ReadOnlySection =
+      Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
+  ReadOnlySectionLarge =
+      Ctx.getELFSection(".cp.rodata.large", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::XCORE_SHF_CP_SECTION);
+  MergeableConst4Section = Ctx.getELFSection(
+      ".cp.rodata.cst4", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 4, "");
+  MergeableConst8Section = Ctx.getELFSection(
+      ".cp.rodata.cst8", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 8, "");
+  MergeableConst16Section = Ctx.getELFSection(
+      ".cp.rodata.cst16", ELF::SHT_PROGBITS,
+      ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::XCORE_SHF_CP_SECTION, 16, "");
+  CStringSection =
+      Ctx.getELFSection(".cp.rodata.string", ELF::SHT_PROGBITS,
+                        ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS |
+                            ELF::XCORE_SHF_CP_SECTION);
+  // TextSection       - see MObjectFileInfo.cpp
+  // StaticCtorSection - see MObjectFileInfo.cpp
+  // StaticDtorSection - see MObjectFileInfo.cpp
+ }
+
+static unsigned getXCoreSectionType(SectionKind K) {
+  if (K.isBSS())
+    return ELF::SHT_NOBITS;
+  return ELF::SHT_PROGBITS;
+}
+
+static unsigned getXCoreSectionFlags(SectionKind K, bool IsCPRel) {
+  unsigned Flags = 0;
+
+  if (!K.isMetadata())
+    Flags |= ELF::SHF_ALLOC;
+
+  if (K.isText())
+    Flags |= ELF::SHF_EXECINSTR;
+  else if (IsCPRel)
+    Flags |= ELF::XCORE_SHF_CP_SECTION;
+  else
+    Flags |= ELF::XCORE_SHF_DP_SECTION;
+
+  if (K.isWriteable())
+    Flags |= ELF::SHF_WRITE;
+
+  if (K.isMergeableCString() || K.isMergeableConst4() ||
+      K.isMergeableConst8() || K.isMergeableConst16())
+    Flags |= ELF::SHF_MERGE;
+
+  if (K.isMergeableCString())
+    Flags |= ELF::SHF_STRINGS;
+
+  return Flags;
+}
+
+MCSection *XCoreTargetObjectFile::getExplicitSectionGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  StringRef SectionName = GO->getSection();
+  // Infer section flags from the section name if we can.
+  bool IsCPRel = SectionName.startswith(".cp.");
+  if (IsCPRel && !Kind.isReadOnly())
+    report_fatal_error("Using .cp. section for writeable object.");
+  return getContext().getELFSection(SectionName, getXCoreSectionType(Kind),
+                                    getXCoreSectionFlags(Kind, IsCPRel));
+}
+
+MCSection *XCoreTargetObjectFile::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+  bool UseCPRel = GO->hasLocalLinkage();
+
+  if (Kind.isText())                    return TextSection;
+  if (UseCPRel) {
+    if (Kind.isMergeable1ByteCString()) return CStringSection;
+    if (Kind.isMergeableConst4())       return MergeableConst4Section;
+    if (Kind.isMergeableConst8())       return MergeableConst8Section;
+    if (Kind.isMergeableConst16())      return MergeableConst16Section;
+  }
+  Type *ObjType = GO->getValueType();
+  auto &DL = GO->getParent()->getDataLayout();
+  if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() ||
+      DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+    if (Kind.isReadOnly())              return UseCPRel? ReadOnlySection
+                                                       : DataRelROSection;
+    if (Kind.isBSS() || Kind.isCommon())return BSSSection;
+    if (Kind.isData())
+      return DataSection;
+    if (Kind.isReadOnlyWithRel())       return DataRelROSection;
+  } else {
+    if (Kind.isReadOnly())              return UseCPRel? ReadOnlySectionLarge
+                                                       : DataRelROSectionLarge;
+    if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge;
+    if (Kind.isData())
+      return DataSectionLarge;
+    if (Kind.isReadOnlyWithRel())       return DataRelROSectionLarge;
+  }
+
+  assert((Kind.isThreadLocal() || Kind.isCommon()) && "Unknown section kind");
+  report_fatal_error("Target does not support TLS or Common sections");
+}
+
+MCSection *XCoreTargetObjectFile::getSectionForConstant(const DataLayout &DL,
+                                                        SectionKind Kind,
+                                                        const Constant *C,
+                                                        unsigned &Align) const {
+  if (Kind.isMergeableConst4())           return MergeableConst4Section;
+  if (Kind.isMergeableConst8())           return MergeableConst8Section;
+  if (Kind.isMergeableConst16())          return MergeableConst16Section;
+  assert((Kind.isReadOnly() || Kind.isReadOnlyWithRel()) &&
+         "Unknown section kind");
+  // We assume the size of the object is never greater than CodeModelLargeSize.
+  // To handle CodeModelLargeSize changes to AsmPrinter would be required.
+  return ReadOnlySection;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 000000000000..5eb423a7435e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+static const unsigned CodeModelLargeSize = 256;
+
+  class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+    MCSection *BSSSectionLarge;
+    MCSection *DataSectionLarge;
+    MCSection *ReadOnlySectionLarge;
+    MCSection *DataRelROSectionLarge;
+
+  public:
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+    MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                        const TargetMachine &TM) const override;
+
+    MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+    MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
+                                     const Constant *C,
+                                     unsigned &Align) const override;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
new file mode 100644
index 000000000000..3563dbc5cb7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetStreamer.h
@@ -0,0 +1,27 @@
+//===-- XCoreTargetStreamer.h - XCore Target Streamer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class XCoreTargetStreamer : public MCTargetStreamer {
+public:
+  XCoreTargetStreamer(MCStreamer &S);
+  ~XCoreTargetStreamer() override;
+  virtual void emitCCTopData(StringRef Name) = 0;
+  virtual void emitCCTopFunction(StringRef Name) = 0;
+  virtual void emitCCBottomData(StringRef Name) = 0;
+  virtual void emitCCBottomFunction(StringRef Name) = 0;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
new file mode 100644
index 000000000000..9617796f4861
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -0,0 +1,54 @@
+//===-- XCoreTargetTransformInfo.h - XCore specific TTI ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file a TargetTransformInfo::Concept conforming object specific to the
+/// XCore target machine. It uses the target's detailed information to
+/// provide more precise answers to certain TTI queries, while letting the
+/// target independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETTRANSFORMINFO_H
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
+  typedef BasicTTIImplBase<XCoreTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+
+  const XCoreSubtarget *ST;
+  const XCoreTargetLowering *TLI;
+
+  const XCoreSubtarget *getST() const { return ST; }
+  const XCoreTargetLowering *getTLI() const { return TLI; }
+
+public:
+  explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F)
+      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
+        TLI(ST->getTargetLowering()) {}
+
+  unsigned getNumberOfRegisters(bool Vector) {
+    if (Vector) {
+      return 0;
+    }
+    return 12;
+  }
+};
+
+} // end namespace llvm
+
+#endif